From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/fs/9p/Kconfig | 46 + kernel/fs/9p/Makefile | 19 + kernel/fs/9p/acl.c | 381 + kernel/fs/9p/acl.h | 55 + kernel/fs/9p/cache.c | 414 + kernel/fs/9p/cache.h | 151 + kernel/fs/9p/fid.c | 306 + kernel/fs/9p/fid.h | 30 + kernel/fs/9p/v9fs.c | 685 + kernel/fs/9p/v9fs.h | 227 + kernel/fs/9p/v9fs_vfs.h | 86 + kernel/fs/9p/vfs_addr.c | 351 + kernel/fs/9p/vfs_dentry.c | 122 + kernel/fs/9p/vfs_dir.c | 261 + kernel/fs/9p/vfs_file.c | 705 ++ kernel/fs/9p/vfs_inode.c | 1537 +++ kernel/fs/9p/vfs_inode_dotl.c | 1016 ++ kernel/fs/9p/vfs_super.c | 370 + kernel/fs/9p/xattr.c | 151 + kernel/fs/9p/xattr.h | 37 + kernel/fs/9p/xattr_security.c | 80 + kernel/fs/9p/xattr_trusted.c | 80 + kernel/fs/9p/xattr_user.c | 80 + kernel/fs/Kconfig | 281 + kernel/fs/Kconfig.binfmt | 181 + kernel/fs/Makefile | 129 + kernel/fs/adfs/Kconfig | 27 + kernel/fs/adfs/Makefile | 7 + kernel/fs/adfs/adfs.h | 208 + kernel/fs/adfs/dir.c | 288 + kernel/fs/adfs/dir_f.c | 486 + kernel/fs/adfs/dir_f.h | 65 + kernel/fs/adfs/dir_fplus.c | 265 + kernel/fs/adfs/dir_fplus.h | 45 + kernel/fs/adfs/file.c | 35 + kernel/fs/adfs/inode.c | 371 + kernel/fs/adfs/map.c | 290 + kernel/fs/adfs/super.c | 562 + kernel/fs/affs/Changes | 343 + kernel/fs/affs/Kconfig | 21 + kernel/fs/affs/Makefile | 9 + kernel/fs/affs/affs.h | 314 + kernel/fs/affs/amigaffs.c | 515 + kernel/fs/affs/bitmap.c | 364 + kernel/fs/affs/dir.c | 142 + kernel/fs/affs/file.c | 982 ++ kernel/fs/affs/inode.c | 416 + kernel/fs/affs/namei.c | 462 + kernel/fs/affs/super.c | 649 + kernel/fs/affs/symlink.c | 81 + kernel/fs/afs/Kconfig | 29 + kernel/fs/afs/Makefile | 32 + kernel/fs/afs/afs.h | 170 + kernel/fs/afs/afs_cm.h | 33 + kernel/fs/afs/afs_fs.h | 56 + kernel/fs/afs/afs_vl.h | 84 + kernel/fs/afs/cache.c | 402 + kernel/fs/afs/callback.c | 475 + kernel/fs/afs/cell.c | 460 + kernel/fs/afs/cmservice.c | 604 + kernel/fs/afs/dir.c | 1123 ++ kernel/fs/afs/file.c | 378 + kernel/fs/afs/flock.c | 585 + kernel/fs/afs/fsclient.c | 1911 +++ kernel/fs/afs/inode.c | 498 + kernel/fs/afs/internal.h | 889 ++ kernel/fs/afs/main.c | 184 + kernel/fs/afs/misc.c | 89 + kernel/fs/afs/mntpt.c | 270 + kernel/fs/afs/netdevices.c | 68 + kernel/fs/afs/proc.c | 666 + kernel/fs/afs/rxrpc.c | 862 ++ kernel/fs/afs/security.c | 363 + kernel/fs/afs/server.c | 322 + kernel/fs/afs/super.c | 557 + kernel/fs/afs/vlclient.c | 219 + kernel/fs/afs/vlocation.c | 718 ++ kernel/fs/afs/vnode.c | 1025 ++ kernel/fs/afs/volume.c | 401 + kernel/fs/afs/write.c | 761 ++ kernel/fs/aio.c | 1742 +++ kernel/fs/anon_inodes.c | 179 + kernel/fs/attr.c | 278 + kernel/fs/autofs4/Kconfig | 20 + kernel/fs/autofs4/Makefile | 7 + kernel/fs/autofs4/autofs_i.h | 284 + kernel/fs/autofs4/dev-ioctl.c | 762 ++ kernel/fs/autofs4/expire.c | 603 + kernel/fs/autofs4/init.c | 52 + kernel/fs/autofs4/inode.c | 370 + kernel/fs/autofs4/root.c | 923 ++ kernel/fs/autofs4/symlink.c | 28 + kernel/fs/autofs4/waitq.c | 565 + kernel/fs/bad_inode.c | 214 + kernel/fs/befs/ChangeLog | 417 + kernel/fs/befs/Kconfig | 26 + kernel/fs/befs/Makefile | 7 + kernel/fs/befs/TODO | 14 + kernel/fs/befs/befs.h | 157 + kernel/fs/befs/befs_fs_types.h | 255 + kernel/fs/befs/btree.c | 793 ++ kernel/fs/befs/btree.h | 13 + kernel/fs/befs/datastream.c | 529 + kernel/fs/befs/datastream.h | 19 + kernel/fs/befs/debug.c | 258 + kernel/fs/befs/endian.h | 125 + kernel/fs/befs/inode.c | 54 + kernel/fs/befs/inode.h | 8 + kernel/fs/befs/io.c | 85 + kernel/fs/befs/io.h | 9 + kernel/fs/befs/linuxvfs.c | 981 ++ kernel/fs/befs/super.c | 112 + kernel/fs/befs/super.h | 8 + kernel/fs/bfs/Kconfig | 19 + kernel/fs/bfs/Makefile | 7 + kernel/fs/bfs/bfs.h | 60 + kernel/fs/bfs/dir.c | 365 + kernel/fs/bfs/file.c | 197 + kernel/fs/bfs/inode.c | 499 + kernel/fs/binfmt_aout.c | 423 + kernel/fs/binfmt_elf.c | 2326 ++++ kernel/fs/binfmt_elf_fdpic.c | 1794 +++ kernel/fs/binfmt_em86.c | 117 + kernel/fs/binfmt_flat.c | 953 ++ kernel/fs/binfmt_misc.c | 835 ++ kernel/fs/binfmt_script.c | 129 + kernel/fs/block_dev.c | 1795 +++ kernel/fs/btrfs/Kconfig | 91 + kernel/fs/btrfs/Makefile | 19 + kernel/fs/btrfs/acl.c | 166 + kernel/fs/btrfs/async-thread.c | 365 + kernel/fs/btrfs/async-thread.h | 81 + kernel/fs/btrfs/backref.c | 1978 +++ kernel/fs/btrfs/backref.h | 77 + kernel/fs/btrfs/btrfs_inode.h | 328 + kernel/fs/btrfs/check-integrity.c | 3245 +++++ kernel/fs/btrfs/check-integrity.h | 38 + kernel/fs/btrfs/compression.c | 1091 ++ kernel/fs/btrfs/compression.h | 83 + kernel/fs/btrfs/ctree.c | 5910 +++++++++ kernel/fs/btrfs/ctree.h | 4247 +++++++ kernel/fs/btrfs/delayed-inode.c | 1997 +++ kernel/fs/btrfs/delayed-inode.h | 156 + kernel/fs/btrfs/delayed-ref.c | 962 ++ kernel/fs/btrfs/delayed-ref.h | 276 + kernel/fs/btrfs/dev-replace.c | 937 ++ kernel/fs/btrfs/dev-replace.h | 44 + kernel/fs/btrfs/dir-item.c | 482 + kernel/fs/btrfs/disk-io.c | 4338 +++++++ kernel/fs/btrfs/disk-io.h | 159 + kernel/fs/btrfs/export.c | 304 + kernel/fs/btrfs/export.h | 19 + kernel/fs/btrfs/extent-tree.c | 10174 +++++++++++++++ kernel/fs/btrfs/extent_io.c | 5632 +++++++++ kernel/fs/btrfs/extent_io.h | 382 + kernel/fs/btrfs/extent_map.c | 455 + kernel/fs/btrfs/extent_map.h | 85 + kernel/fs/btrfs/file-item.c | 953 ++ kernel/fs/btrfs/file.c | 2860 +++++ kernel/fs/btrfs/free-space-cache.c | 3653 ++++++ kernel/fs/btrfs/free-space-cache.h | 133 + kernel/fs/btrfs/hash.c | 46 + kernel/fs/btrfs/hash.h | 42 + kernel/fs/btrfs/inode-item.c | 440 + kernel/fs/btrfs/inode-map.c | 568 + kernel/fs/btrfs/inode-map.h | 13 + kernel/fs/btrfs/inode.c | 9907 +++++++++++++++ kernel/fs/btrfs/ioctl.c | 5359 ++++++++ kernel/fs/btrfs/locking.c | 300 + kernel/fs/btrfs/locking.h | 62 + kernel/fs/btrfs/lzo.c | 443 + kernel/fs/btrfs/math.h | 42 + kernel/fs/btrfs/ordered-data.c | 1051 ++ kernel/fs/btrfs/ordered-data.h | 212 + kernel/fs/btrfs/orphan.c | 71 + kernel/fs/btrfs/print-tree.c | 349 + kernel/fs/btrfs/print-tree.h | 23 + kernel/fs/btrfs/props.c | 429 + kernel/fs/btrfs/props.h | 42 + kernel/fs/btrfs/qgroup.c | 2966 +++++ kernel/fs/btrfs/qgroup.h | 107 + kernel/fs/btrfs/raid56.c | 2670 ++++ kernel/fs/btrfs/raid56.h | 62 + kernel/fs/btrfs/rcu-string.h | 56 + kernel/fs/btrfs/reada.c | 992 ++ kernel/fs/btrfs/relocation.c | 4658 +++++++ kernel/fs/btrfs/root-tree.c | 497 + kernel/fs/btrfs/scrub.c | 4228 +++++++ kernel/fs/btrfs/send.c | 5962 +++++++++ kernel/fs/btrfs/send.h | 134 + kernel/fs/btrfs/struct-funcs.c | 142 + kernel/fs/btrfs/super.c | 2220 ++++ kernel/fs/btrfs/sysfs.c | 758 ++ kernel/fs/btrfs/sysfs.h | 89 + kernel/fs/btrfs/tests/btrfs-tests.c | 171 + kernel/fs/btrfs/tests/btrfs-tests.h | 68 + kernel/fs/btrfs/tests/extent-buffer-tests.c | 229 + kernel/fs/btrfs/tests/extent-io-tests.c | 275 + kernel/fs/btrfs/tests/free-space-tests.c | 909 ++ kernel/fs/btrfs/tests/inode-tests.c | 1123 ++ kernel/fs/btrfs/tests/qgroup-tests.c | 469 + kernel/fs/btrfs/transaction.c | 2204 ++++ kernel/fs/btrfs/transaction.h | 194 + kernel/fs/btrfs/tree-defrag.c | 139 + kernel/fs/btrfs/tree-log.c | 5089 ++++++++ kernel/fs/btrfs/tree-log.h | 85 + kernel/fs/btrfs/ulist.c | 251 + kernel/fs/btrfs/ulist.h | 80 + kernel/fs/btrfs/uuid-tree.c | 354 + kernel/fs/btrfs/volumes.c | 6730 ++++++++++ kernel/fs/btrfs/volumes.h | 541 + kernel/fs/btrfs/xattr.c | 517 + kernel/fs/btrfs/xattr.h | 41 + kernel/fs/btrfs/zlib.c | 412 + kernel/fs/buffer.c | 3422 +++++ kernel/fs/cachefiles/Kconfig | 39 + kernel/fs/cachefiles/Makefile | 18 + kernel/fs/cachefiles/bind.c | 277 + kernel/fs/cachefiles/daemon.c | 751 ++ kernel/fs/cachefiles/interface.c | 557 + kernel/fs/cachefiles/internal.h | 363 + kernel/fs/cachefiles/key.c | 159 + kernel/fs/cachefiles/main.c | 105 + kernel/fs/cachefiles/namei.c | 978 ++ kernel/fs/cachefiles/proc.c | 134 + kernel/fs/cachefiles/rdwr.c | 967 ++ kernel/fs/cachefiles/security.c | 116 + kernel/fs/cachefiles/xattr.c | 324 + kernel/fs/ceph/Kconfig | 40 + kernel/fs/ceph/Makefile | 13 + kernel/fs/ceph/acl.c | 263 + kernel/fs/ceph/addr.c | 1599 +++ kernel/fs/ceph/cache.c | 402 + kernel/fs/ceph/cache.h | 182 + kernel/fs/ceph/caps.c | 3456 +++++ kernel/fs/ceph/ceph_frag.c | 22 + kernel/fs/ceph/debugfs.c | 321 + kernel/fs/ceph/dir.c | 1411 +++ kernel/fs/ceph/export.c | 250 + kernel/fs/ceph/file.c | 1344 ++ kernel/fs/ceph/inode.c | 2016 +++ kernel/fs/ceph/ioctl.c | 295 + kernel/fs/ceph/ioctl.h | 100 + kernel/fs/ceph/locks.c | 381 + kernel/fs/ceph/mds_client.c | 3871 ++++++ kernel/fs/ceph/mds_client.h | 406 + kernel/fs/ceph/mdsmap.c | 189 + kernel/fs/ceph/snap.c | 977 ++ kernel/fs/ceph/strings.c | 125 + kernel/fs/ceph/super.c | 1059 ++ kernel/fs/ceph/super.h | 946 ++ kernel/fs/ceph/xattr.c | 1117 ++ kernel/fs/char_dev.c | 569 + kernel/fs/cifs/Kconfig | 202 + kernel/fs/cifs/Makefile | 20 + kernel/fs/cifs/asn1.c | 623 + kernel/fs/cifs/cache.c | 333 + kernel/fs/cifs/cifs_debug.c | 701 ++ kernel/fs/cifs/cifs_debug.h | 77 + kernel/fs/cifs/cifs_dfs_ref.c | 379 + kernel/fs/cifs/cifs_fs_sb.h | 71 + kernel/fs/cifs/cifs_spnego.c | 179 + kernel/fs/cifs/cifs_spnego.h | 47 + kernel/fs/cifs/cifs_unicode.c | 611 + kernel/fs/cifs/cifs_unicode.h | 418 + kernel/fs/cifs/cifs_uniupr.h | 253 + kernel/fs/cifs/cifsacl.c | 1119 ++ kernel/fs/cifs/cifsacl.h | 101 + kernel/fs/cifs/cifsencrypt.c | 818 ++ kernel/fs/cifs/cifsfs.c | 1308 ++ kernel/fs/cifs/cifsfs.h | 140 + kernel/fs/cifs/cifsglob.h | 1620 +++ kernel/fs/cifs/cifspdu.h | 2739 ++++ kernel/fs/cifs/cifsproto.h | 514 + kernel/fs/cifs/cifssmb.c | 6526 ++++++++++ kernel/fs/cifs/connect.c | 4137 ++++++ kernel/fs/cifs/dir.c | 924 ++ kernel/fs/cifs/dns_resolve.c | 99 + kernel/fs/cifs/dns_resolve.h | 30 + kernel/fs/cifs/export.c | 67 + kernel/fs/cifs/file.c | 3896 ++++++ kernel/fs/cifs/fscache.c | 242 + kernel/fs/cifs/fscache.h | 149 + kernel/fs/cifs/inode.c | 2454 ++++ kernel/fs/cifs/ioctl.c | 217 + kernel/fs/cifs/link.c | 746 ++ kernel/fs/cifs/misc.c | 641 + kernel/fs/cifs/netmisc.c | 1014 ++ kernel/fs/cifs/nterr.c | 687 + kernel/fs/cifs/nterr.h | 563 + kernel/fs/cifs/ntlmssp.h | 138 + kernel/fs/cifs/readdir.c | 875 ++ kernel/fs/cifs/rfc1002pdu.h | 74 + kernel/fs/cifs/sess.c | 1442 +++ kernel/fs/cifs/smb1ops.c | 1116 ++ kernel/fs/cifs/smb2file.c | 265 + kernel/fs/cifs/smb2glob.h | 63 + kernel/fs/cifs/smb2inode.c | 273 + kernel/fs/cifs/smb2maperror.c | 2481 ++++ kernel/fs/cifs/smb2misc.c | 632 + kernel/fs/cifs/smb2ops.c | 1716 +++ kernel/fs/cifs/smb2pdu.c | 2661 ++++ kernel/fs/cifs/smb2pdu.h | 1080 ++ kernel/fs/cifs/smb2proto.h | 174 + kernel/fs/cifs/smb2status.h | 1782 +++ kernel/fs/cifs/smb2transport.c | 612 + kernel/fs/cifs/smbencrypt.c | 249 + kernel/fs/cifs/smberr.h | 184 + kernel/fs/cifs/smbfsctl.h | 121 + kernel/fs/cifs/transport.c | 1113 ++ kernel/fs/cifs/winucase.c | 663 + kernel/fs/cifs/xattr.c | 424 + kernel/fs/coda/Kconfig | 21 + kernel/fs/coda/Makefile | 12 + kernel/fs/coda/cache.c | 118 + kernel/fs/coda/cnode.c | 168 + kernel/fs/coda/coda_cache.h | 22 + kernel/fs/coda/coda_fs_i.h | 58 + kernel/fs/coda/coda_int.h | 20 + kernel/fs/coda/coda_linux.c | 186 + kernel/fs/coda/coda_linux.h | 105 + kernel/fs/coda/dir.c | 579 + kernel/fs/coda/file.c | 230 + kernel/fs/coda/inode.c | 333 + kernel/fs/coda/pioctl.c | 90 + kernel/fs/coda/psdev.c | 437 + kernel/fs/coda/symlink.c | 50 + kernel/fs/coda/sysctl.c | 73 + kernel/fs/coda/upcall.c | 882 ++ kernel/fs/compat.c | 1481 +++ kernel/fs/compat_binfmt_elf.c | 145 + kernel/fs/compat_ioctl.c | 1638 +++ kernel/fs/configfs/Kconfig | 11 + kernel/fs/configfs/Makefile | 7 + kernel/fs/configfs/configfs_internal.h | 163 + kernel/fs/configfs/dir.c | 1724 +++ kernel/fs/configfs/file.c | 336 + kernel/fs/configfs/inode.c | 272 + kernel/fs/configfs/item.c | 214 + kernel/fs/configfs/mount.c | 175 + kernel/fs/configfs/symlink.c | 314 + kernel/fs/coredump.c | 751 ++ kernel/fs/cramfs/Kconfig | 22 + kernel/fs/cramfs/Makefile | 7 + kernel/fs/cramfs/README | 168 + kernel/fs/cramfs/inode.c | 611 + kernel/fs/cramfs/internal.h | 4 + kernel/fs/cramfs/uncompress.c | 79 + kernel/fs/dax.c | 550 + kernel/fs/dcache.c | 3459 +++++ kernel/fs/dcookies.c | 350 + kernel/fs/debugfs/Makefile | 4 + kernel/fs/debugfs/file.c | 818 ++ kernel/fs/debugfs/inode.c | 736 ++ kernel/fs/devpts/Makefile | 7 + kernel/fs/devpts/inode.c | 689 + kernel/fs/direct-io.c | 1332 ++ kernel/fs/dlm/Kconfig | 16 + kernel/fs/dlm/Makefile | 21 + kernel/fs/dlm/ast.c | 314 + kernel/fs/dlm/ast.h | 32 + kernel/fs/dlm/config.c | 1040 ++ kernel/fs/dlm/config.h | 53 + kernel/fs/dlm/debug_fs.c | 791 ++ kernel/fs/dlm/dir.c | 308 + kernel/fs/dlm/dir.h | 25 + kernel/fs/dlm/dlm_internal.h | 729 ++ kernel/fs/dlm/lock.c | 6304 ++++++++++ kernel/fs/dlm/lock.h | 80 + kernel/fs/dlm/lockspace.c | 917 ++ kernel/fs/dlm/lockspace.h | 26 + kernel/fs/dlm/lowcomms.c | 1830 +++ kernel/fs/dlm/lowcomms.h | 27 + kernel/fs/dlm/lvb_table.h | 18 + kernel/fs/dlm/main.c | 97 + kernel/fs/dlm/member.c | 722 ++ kernel/fs/dlm/member.h | 33 + kernel/fs/dlm/memory.c | 96 + kernel/fs/dlm/memory.h | 27 + kernel/fs/dlm/midcomms.c | 137 + kernel/fs/dlm/midcomms.h | 21 + kernel/fs/dlm/netlink.c | 136 + kernel/fs/dlm/plock.c | 515 + kernel/fs/dlm/rcom.c | 656 + kernel/fs/dlm/rcom.h | 26 + kernel/fs/dlm/recover.c | 955 ++ kernel/fs/dlm/recover.h | 34 + kernel/fs/dlm/recoverd.c | 342 + kernel/fs/dlm/recoverd.h | 23 + kernel/fs/dlm/requestqueue.c | 171 + kernel/fs/dlm/requestqueue.h | 22 + kernel/fs/dlm/user.c | 1015 ++ kernel/fs/dlm/user.h | 19 + kernel/fs/dlm/util.c | 154 + kernel/fs/dlm/util.h | 22 + kernel/fs/drop_caches.c | 67 + kernel/fs/ecryptfs/Kconfig | 22 + kernel/fs/ecryptfs/Makefile | 10 + kernel/fs/ecryptfs/crypto.c | 2166 ++++ kernel/fs/ecryptfs/debug.c | 121 + kernel/fs/ecryptfs/dentry.c | 92 + kernel/fs/ecryptfs/ecryptfs_kernel.h | 721 ++ kernel/fs/ecryptfs/file.c | 375 + kernel/fs/ecryptfs/inode.c | 1138 ++ kernel/fs/ecryptfs/keystore.c | 2529 ++++ kernel/fs/ecryptfs/kthread.c | 172 + kernel/fs/ecryptfs/main.c | 906 ++ kernel/fs/ecryptfs/messaging.c | 468 + kernel/fs/ecryptfs/miscdev.c | 511 + kernel/fs/ecryptfs/mmap.c | 561 + kernel/fs/ecryptfs/read_write.c | 273 + kernel/fs/ecryptfs/super.c | 191 + kernel/fs/efivarfs/Kconfig | 13 + kernel/fs/efivarfs/Makefile | 7 + kernel/fs/efivarfs/file.c | 111 + kernel/fs/efivarfs/inode.c | 162 + kernel/fs/efivarfs/internal.h | 22 + kernel/fs/efivarfs/super.c | 267 + kernel/fs/efs/Kconfig | 14 + kernel/fs/efs/Makefile | 7 + kernel/fs/efs/dir.c | 103 + kernel/fs/efs/efs.h | 146 + kernel/fs/efs/file.c | 56 + kernel/fs/efs/inode.c | 311 + kernel/fs/efs/namei.c | 119 + kernel/fs/efs/super.c | 353 + kernel/fs/efs/symlink.c | 54 + kernel/fs/eventfd.c | 449 + kernel/fs/eventpoll.c | 2133 ++++ kernel/fs/exec.c | 1747 +++ kernel/fs/exofs/BUGS | 3 + kernel/fs/exofs/Kbuild | 20 + kernel/fs/exofs/Kconfig | 13 + kernel/fs/exofs/Kconfig.ore | 14 + kernel/fs/exofs/common.h | 262 + kernel/fs/exofs/dir.c | 667 + kernel/fs/exofs/exofs.h | 245 + kernel/fs/exofs/file.c | 83 + kernel/fs/exofs/inode.c | 1524 +++ kernel/fs/exofs/namei.c | 320 + kernel/fs/exofs/ore.c | 1164 ++ kernel/fs/exofs/ore_raid.c | 721 ++ kernel/fs/exofs/ore_raid.h | 62 + kernel/fs/exofs/super.c | 1049 ++ kernel/fs/exofs/symlink.c | 55 + kernel/fs/exofs/sys.c | 205 + kernel/fs/exportfs/Makefile | 6 + kernel/fs/exportfs/expfs.c | 544 + kernel/fs/ext2/Kconfig | 44 + kernel/fs/ext2/Makefile | 12 + kernel/fs/ext2/acl.c | 257 + kernel/fs/ext2/acl.h | 71 + kernel/fs/ext2/balloc.c | 1536 +++ kernel/fs/ext2/dir.c | 730 ++ kernel/fs/ext2/ext2.h | 820 ++ kernel/fs/ext2/file.c | 121 + kernel/fs/ext2/ialloc.c | 675 + kernel/fs/ext2/inode.c | 1573 +++ kernel/fs/ext2/ioctl.c | 188 + kernel/fs/ext2/namei.c | 431 + kernel/fs/ext2/super.c | 1580 +++ kernel/fs/ext2/symlink.c | 54 + kernel/fs/ext2/xattr.c | 1029 ++ kernel/fs/ext2/xattr.h | 125 + kernel/fs/ext2/xattr_security.c | 74 + kernel/fs/ext2/xattr_trusted.c | 54 + kernel/fs/ext2/xattr_user.c | 61 + kernel/fs/ext3/Kconfig | 89 + kernel/fs/ext3/Makefile | 12 + kernel/fs/ext3/acl.c | 281 + kernel/fs/ext3/acl.h | 72 + kernel/fs/ext3/balloc.c | 2158 ++++ kernel/fs/ext3/bitmap.c | 20 + kernel/fs/ext3/dir.c | 537 + kernel/fs/ext3/ext3.h | 1332 ++ kernel/fs/ext3/ext3_jbd.c | 59 + kernel/fs/ext3/file.c | 79 + kernel/fs/ext3/fsync.c | 109 + kernel/fs/ext3/hash.c | 206 + kernel/fs/ext3/ialloc.c | 706 ++ kernel/fs/ext3/inode.c | 3573 ++++++ kernel/fs/ext3/ioctl.c | 327 + kernel/fs/ext3/namei.c | 2585 ++++ kernel/fs/ext3/namei.h | 27 + kernel/fs/ext3/resize.c | 1117 ++ kernel/fs/ext3/super.c | 3165 +++++ kernel/fs/ext3/symlink.c | 54 + kernel/fs/ext3/xattr.c | 1330 ++ kernel/fs/ext3/xattr.h | 136 + kernel/fs/ext3/xattr_security.c | 78 + kernel/fs/ext3/xattr_trusted.c | 54 + kernel/fs/ext3/xattr_user.c | 58 + kernel/fs/ext4/Kconfig | 97 + kernel/fs/ext4/Makefile | 16 + kernel/fs/ext4/acl.c | 283 + kernel/fs/ext4/acl.h | 72 + kernel/fs/ext4/balloc.c | 880 ++ kernel/fs/ext4/bitmap.c | 97 + kernel/fs/ext4/block_validity.c | 242 + kernel/fs/ext4/crypto.c | 558 + kernel/fs/ext4/crypto_fname.c | 719 ++ kernel/fs/ext4/crypto_key.c | 166 + kernel/fs/ext4/crypto_policy.c | 198 + kernel/fs/ext4/dir.c | 644 + kernel/fs/ext4/ext4.h | 2998 +++++ kernel/fs/ext4/ext4_crypto.h | 156 + kernel/fs/ext4/ext4_extents.h | 274 + kernel/fs/ext4/ext4_jbd2.c | 327 + kernel/fs/ext4/ext4_jbd2.h | 450 + kernel/fs/ext4/extents.c | 5707 +++++++++ kernel/fs/ext4/extents_status.c | 1310 ++ kernel/fs/ext4/extents_status.h | 175 + kernel/fs/ext4/file.c | 651 + kernel/fs/ext4/fsync.c | 150 + kernel/fs/ext4/hash.c | 207 + kernel/fs/ext4/ialloc.c | 1350 ++ kernel/fs/ext4/indirect.c | 1558 +++ kernel/fs/ext4/inline.c | 2024 +++ kernel/fs/ext4/inode.c | 5279 ++++++++ kernel/fs/ext4/ioctl.c | 773 ++ kernel/fs/ext4/mballoc.c | 5238 ++++++++ kernel/fs/ext4/mballoc.h | 215 + kernel/fs/ext4/migrate.c | 672 + kernel/fs/ext4/mmp.c | 393 + kernel/fs/ext4/move_extent.c | 684 + kernel/fs/ext4/namei.c | 3918 ++++++ kernel/fs/ext4/page-io.c | 529 + kernel/fs/ext4/readpage.c | 328 + kernel/fs/ext4/resize.c | 2024 +++ kernel/fs/ext4/super.c | 5668 +++++++++ kernel/fs/ext4/symlink.c | 145 + kernel/fs/ext4/truncate.h | 43 + kernel/fs/ext4/xattr.c | 1727 +++ kernel/fs/ext4/xattr.h | 139 + kernel/fs/ext4/xattr_security.c | 82 + kernel/fs/ext4/xattr_trusted.c | 58 + kernel/fs/ext4/xattr_user.c | 61 + kernel/fs/f2fs/Kconfig | 83 + kernel/fs/f2fs/Makefile | 8 + kernel/fs/f2fs/acl.c | 410 + kernel/fs/f2fs/acl.h | 54 + kernel/fs/f2fs/checkpoint.c | 1135 ++ kernel/fs/f2fs/data.c | 1864 +++ kernel/fs/f2fs/debug.c | 413 + kernel/fs/f2fs/dir.c | 811 ++ kernel/fs/f2fs/f2fs.h | 1814 +++ kernel/fs/f2fs/file.c | 1172 ++ kernel/fs/f2fs/gc.c | 746 ++ kernel/fs/f2fs/gc.h | 110 + kernel/fs/f2fs/hash.c | 104 + kernel/fs/f2fs/inline.c | 532 + kernel/fs/f2fs/inode.c | 387 + kernel/fs/f2fs/namei.c | 832 ++ kernel/fs/f2fs/node.c | 2084 +++ kernel/fs/f2fs/node.h | 416 + kernel/fs/f2fs/recovery.c | 575 + kernel/fs/f2fs/segment.c | 2307 ++++ kernel/fs/f2fs/segment.h | 751 ++ kernel/fs/f2fs/super.c | 1350 ++ kernel/fs/f2fs/trace.c | 159 + kernel/fs/f2fs/trace.h | 46 + kernel/fs/f2fs/xattr.c | 618 + kernel/fs/f2fs/xattr.h | 152 + kernel/fs/fat/Kconfig | 100 + kernel/fs/fat/Makefile | 11 + kernel/fs/fat/cache.c | 351 + kernel/fs/fat/dir.c | 1402 +++ kernel/fs/fat/fat.h | 420 + kernel/fs/fat/fatent.c | 692 + kernel/fs/fat/file.c | 459 + kernel/fs/fat/inode.c | 1850 +++ kernel/fs/fat/misc.c | 278 + kernel/fs/fat/namei_msdos.c | 684 + kernel/fs/fat/namei_vfat.c | 1089 ++ kernel/fs/fat/nfs.c | 301 + kernel/fs/fcntl.c | 759 ++ kernel/fs/fhandle.c | 266 + kernel/fs/file.c | 914 ++ kernel/fs/file_table.c | 327 + kernel/fs/filesystems.c | 288 + kernel/fs/freevxfs/Kconfig | 16 + kernel/fs/freevxfs/Makefile | 8 + kernel/fs/freevxfs/vxfs.h | 263 + kernel/fs/freevxfs/vxfs_bmap.c | 281 + kernel/fs/freevxfs/vxfs_dir.h | 92 + kernel/fs/freevxfs/vxfs_extern.h | 81 + kernel/fs/freevxfs/vxfs_fshead.c | 203 + kernel/fs/freevxfs/vxfs_fshead.h | 67 + kernel/fs/freevxfs/vxfs_immed.c | 115 + kernel/fs/freevxfs/vxfs_inode.c | 360 + kernel/fs/freevxfs/vxfs_inode.h | 180 + kernel/fs/freevxfs/vxfs_lookup.c | 313 + kernel/fs/freevxfs/vxfs_olt.c | 130 + kernel/fs/freevxfs/vxfs_olt.h | 145 + kernel/fs/freevxfs/vxfs_subr.c | 183 + kernel/fs/freevxfs/vxfs_super.c | 293 + kernel/fs/fs-writeback.c | 1595 +++ kernel/fs/fs_pin.c | 102 + kernel/fs/fs_struct.c | 166 + kernel/fs/fscache/Kconfig | 61 + kernel/fs/fscache/Makefile | 20 + kernel/fs/fscache/cache.c | 421 + kernel/fs/fscache/cookie.c | 722 ++ kernel/fs/fscache/fsdef.c | 146 + kernel/fs/fscache/histogram.c | 107 + kernel/fs/fscache/internal.h | 464 + kernel/fs/fscache/main.c | 206 + kernel/fs/fscache/netfs.c | 104 + kernel/fs/fscache/object-list.c | 412 + kernel/fs/fscache/object.c | 1018 ++ kernel/fs/fscache/operation.c | 529 + kernel/fs/fscache/page.c | 1195 ++ kernel/fs/fscache/proc.c | 81 + kernel/fs/fscache/stats.c | 291 + kernel/fs/fuse/Kconfig | 27 + kernel/fs/fuse/Makefile | 8 + kernel/fs/fuse/control.c | 354 + kernel/fs/fuse/cuse.c | 636 + kernel/fs/fuse/dev.c | 2273 ++++ kernel/fs/fuse/dir.c | 1952 +++ kernel/fs/fuse/file.c | 2994 +++++ kernel/fs/fuse/fuse_i.h | 912 ++ kernel/fs/fuse/inode.c | 1320 ++ kernel/fs/gfs2/Kconfig | 34 + kernel/fs/gfs2/Makefile | 10 + kernel/fs/gfs2/acl.c | 117 + kernel/fs/gfs2/acl.h | 22 + kernel/fs/gfs2/aops.c | 1229 ++ kernel/fs/gfs2/bmap.c | 1495 +++ kernel/fs/gfs2/bmap.h | 61 + kernel/fs/gfs2/dentry.c | 134 + kernel/fs/gfs2/dir.c | 2090 +++ kernel/fs/gfs2/dir.h | 86 + kernel/fs/gfs2/export.c | 208 + kernel/fs/gfs2/file.c | 1159 ++ kernel/fs/gfs2/gfs2.h | 26 + kernel/fs/gfs2/glock.c | 2119 ++++ kernel/fs/gfs2/glock.h | 250 + kernel/fs/gfs2/glops.c | 616 + kernel/fs/gfs2/glops.h | 30 + kernel/fs/gfs2/incore.h | 843 ++ kernel/fs/gfs2/inode.c | 1973 +++ kernel/fs/gfs2/inode.h | 141 + kernel/fs/gfs2/lock_dlm.c | 1336 ++ kernel/fs/gfs2/log.c | 950 ++ kernel/fs/gfs2/log.h | 85 + kernel/fs/gfs2/lops.c | 886 ++ kernel/fs/gfs2/lops.h | 104 + kernel/fs/gfs2/main.c | 258 + kernel/fs/gfs2/meta_io.c | 403 + kernel/fs/gfs2/meta_io.h | 78 + kernel/fs/gfs2/ops_fstype.c | 1409 +++ kernel/fs/gfs2/quota.c | 1680 +++ kernel/fs/gfs2/quota.h | 64 + kernel/fs/gfs2/recovery.c | 611 + kernel/fs/gfs2/recovery.h | 36 + kernel/fs/gfs2/rgrp.c | 2623 ++++ kernel/fs/gfs2/rgrp.h | 86 + kernel/fs/gfs2/super.c | 1666 +++ kernel/fs/gfs2/super.h | 58 + kernel/fs/gfs2/sys.c | 711 ++ kernel/fs/gfs2/sys.h | 25 + kernel/fs/gfs2/trace_gfs2.h | 558 + kernel/fs/gfs2/trans.c | 279 + kernel/fs/gfs2/trans.h | 47 + kernel/fs/gfs2/util.c | 265 + kernel/fs/gfs2/util.h | 171 + kernel/fs/gfs2/xattr.c | 1508 +++ kernel/fs/gfs2/xattr.h | 67 + kernel/fs/hfs/Kconfig | 12 + kernel/fs/hfs/Makefile | 10 + kernel/fs/hfs/attr.c | 121 + kernel/fs/hfs/bfind.c | 224 + kernel/fs/hfs/bitmap.c | 243 + kernel/fs/hfs/bnode.c | 485 + kernel/fs/hfs/brec.c | 520 + kernel/fs/hfs/btree.c | 369 + kernel/fs/hfs/btree.h | 163 + kernel/fs/hfs/catalog.c | 366 + kernel/fs/hfs/dir.c | 319 + kernel/fs/hfs/extent.c | 545 + kernel/fs/hfs/hfs.h | 289 + kernel/fs/hfs/hfs_fs.h | 290 + kernel/fs/hfs/inode.c | 692 + kernel/fs/hfs/mdb.c | 366 + kernel/fs/hfs/part_tbl.c | 117 + kernel/fs/hfs/string.c | 114 + kernel/fs/hfs/super.c | 508 + kernel/fs/hfs/sysdep.c | 45 + kernel/fs/hfs/trans.c | 150 + kernel/fs/hfsplus/Kconfig | 31 + kernel/fs/hfsplus/Makefile | 11 + kernel/fs/hfsplus/acl.h | 27 + kernel/fs/hfsplus/attributes.c | 371 + kernel/fs/hfsplus/bfind.c | 293 + kernel/fs/hfsplus/bitmap.c | 245 + kernel/fs/hfsplus/bnode.c | 671 + kernel/fs/hfsplus/brec.c | 527 + kernel/fs/hfsplus/btree.c | 497 + kernel/fs/hfsplus/catalog.c | 521 + kernel/fs/hfsplus/dir.c | 576 + kernel/fs/hfsplus/extents.c | 611 + kernel/fs/hfsplus/hfsplus_fs.h | 540 + kernel/fs/hfsplus/hfsplus_raw.h | 407 + kernel/fs/hfsplus/inode.c | 617 + kernel/fs/hfsplus/ioctl.c | 150 + kernel/fs/hfsplus/options.c | 238 + kernel/fs/hfsplus/part_tbl.c | 157 + kernel/fs/hfsplus/posix_acl.c | 140 + kernel/fs/hfsplus/super.c | 700 ++ kernel/fs/hfsplus/tables.c | 3245 +++++ kernel/fs/hfsplus/unicode.c | 469 + kernel/fs/hfsplus/wrapper.c | 261 + kernel/fs/hfsplus/xattr.c | 911 ++ kernel/fs/hfsplus/xattr.h | 43 + kernel/fs/hfsplus/xattr_security.c | 98 + kernel/fs/hfsplus/xattr_trusted.c | 44 + kernel/fs/hfsplus/xattr_user.c | 44 + kernel/fs/hostfs/Makefile | 11 + kernel/fs/hostfs/hostfs.h | 98 + kernel/fs/hostfs/hostfs_kern.c | 1023 ++ kernel/fs/hostfs/hostfs_user.c | 410 + kernel/fs/hpfs/Kconfig | 14 + kernel/fs/hpfs/Makefile | 8 + kernel/fs/hpfs/alloc.c | 486 + kernel/fs/hpfs/anode.c | 496 + kernel/fs/hpfs/buffer.c | 204 + kernel/fs/hpfs/dentry.c | 61 + kernel/fs/hpfs/dir.c | 330 + kernel/fs/hpfs/dnode.c | 1093 ++ kernel/fs/hpfs/ea.c | 367 + kernel/fs/hpfs/file.c | 211 + kernel/fs/hpfs/hpfs.h | 559 + kernel/fs/hpfs/hpfs_fn.h | 363 + kernel/fs/hpfs/inode.c | 315 + kernel/fs/hpfs/map.c | 308 + kernel/fs/hpfs/name.c | 113 + kernel/fs/hpfs/namei.c | 628 + kernel/fs/hpfs/super.c | 753 ++ kernel/fs/hppfs/Makefile | 6 + kernel/fs/hppfs/hppfs.c | 766 ++ kernel/fs/hugetlbfs/Makefile | 7 + kernel/fs/hugetlbfs/inode.c | 1114 ++ kernel/fs/inode.c | 1977 +++ kernel/fs/internal.h | 153 + kernel/fs/ioctl.c | 625 + kernel/fs/isofs/Kconfig | 39 + kernel/fs/isofs/Makefile | 10 + kernel/fs/isofs/compress.c | 380 + kernel/fs/isofs/dir.c | 283 + kernel/fs/isofs/export.c | 192 + kernel/fs/isofs/inode.c | 1557 +++ kernel/fs/isofs/isofs.h | 198 + kernel/fs/isofs/joliet.c | 69 + kernel/fs/isofs/namei.c | 172 + kernel/fs/isofs/rock.c | 801 ++ kernel/fs/isofs/rock.h | 122 + kernel/fs/isofs/util.c | 70 + kernel/fs/isofs/zisofs.h | 21 + kernel/fs/jbd/Kconfig | 30 + kernel/fs/jbd/Makefile | 7 + kernel/fs/jbd/checkpoint.c | 784 ++ kernel/fs/jbd/commit.c | 1021 ++ kernel/fs/jbd/journal.c | 2145 ++++ kernel/fs/jbd/recovery.c | 594 + kernel/fs/jbd/revoke.c | 733 ++ kernel/fs/jbd/transaction.c | 2237 ++++ kernel/fs/jbd2/Kconfig | 35 + kernel/fs/jbd2/Makefile | 7 + kernel/fs/jbd2/checkpoint.c | 647 + kernel/fs/jbd2/commit.c | 1164 ++ kernel/fs/jbd2/journal.c | 2671 ++++ kernel/fs/jbd2/recovery.c | 880 ++ kernel/fs/jbd2/revoke.c | 764 ++ kernel/fs/jbd2/transaction.c | 2487 ++++ kernel/fs/jffs2/Kconfig | 188 + kernel/fs/jffs2/LICENCE | 30 + kernel/fs/jffs2/Makefile | 21 + kernel/fs/jffs2/README.Locking | 172 + kernel/fs/jffs2/TODO | 37 + kernel/fs/jffs2/acl.c | 309 + kernel/fs/jffs2/acl.h | 41 + kernel/fs/jffs2/background.c | 168 + kernel/fs/jffs2/build.c | 394 + kernel/fs/jffs2/compr.c | 418 + kernel/fs/jffs2/compr.h | 105 + kernel/fs/jffs2/compr_lzo.c | 110 + kernel/fs/jffs2/compr_rtime.c | 130 + kernel/fs/jffs2/compr_rubin.c | 452 + kernel/fs/jffs2/compr_zlib.c | 222 + kernel/fs/jffs2/debug.c | 866 ++ kernel/fs/jffs2/debug.h | 275 + kernel/fs/jffs2/dir.c | 862 ++ kernel/fs/jffs2/erase.c | 515 + kernel/fs/jffs2/file.c | 337 + kernel/fs/jffs2/fs.c | 766 ++ kernel/fs/jffs2/gc.c | 1378 ++ kernel/fs/jffs2/ioctl.c | 22 + kernel/fs/jffs2/jffs2_fs_i.h | 56 + kernel/fs/jffs2/jffs2_fs_sb.h | 162 + kernel/fs/jffs2/malloc.c | 324 + kernel/fs/jffs2/nodelist.c | 755 ++ kernel/fs/jffs2/nodelist.h | 480 + kernel/fs/jffs2/nodemgmt.c | 883 ++ kernel/fs/jffs2/os-linux.h | 199 + kernel/fs/jffs2/read.c | 228 + kernel/fs/jffs2/readinode.c | 1451 +++ kernel/fs/jffs2/scan.c | 1176 ++ kernel/fs/jffs2/security.c | 89 + kernel/fs/jffs2/summary.c | 874 ++ kernel/fs/jffs2/summary.h | 213 + kernel/fs/jffs2/super.c | 442 + kernel/fs/jffs2/symlink.c | 66 + kernel/fs/jffs2/wbuf.c | 1353 ++ kernel/fs/jffs2/write.c | 722 ++ kernel/fs/jffs2/writev.c | 51 + kernel/fs/jffs2/xattr.c | 1340 ++ kernel/fs/jffs2/xattr.h | 133 + kernel/fs/jffs2/xattr_trusted.c | 55 + kernel/fs/jffs2/xattr_user.c | 55 + kernel/fs/jfs/Kconfig | 50 + kernel/fs/jfs/Makefile | 16 + kernel/fs/jfs/acl.c | 161 + kernel/fs/jfs/file.c | 165 + kernel/fs/jfs/inode.c | 423 + kernel/fs/jfs/ioctl.c | 189 + kernel/fs/jfs/jfs_acl.h | 36 + kernel/fs/jfs/jfs_btree.h | 172 + kernel/fs/jfs/jfs_debug.c | 109 + kernel/fs/jfs/jfs_debug.h | 122 + kernel/fs/jfs/jfs_dinode.h | 176 + kernel/fs/jfs/jfs_discard.c | 121 + kernel/fs/jfs/jfs_discard.h | 26 + kernel/fs/jfs/jfs_dmap.c | 4092 ++++++ kernel/fs/jfs/jfs_dmap.h | 316 + kernel/fs/jfs/jfs_dtree.c | 4576 +++++++ kernel/fs/jfs/jfs_dtree.h | 269 + kernel/fs/jfs/jfs_extent.c | 651 + kernel/fs/jfs/jfs_extent.h | 31 + kernel/fs/jfs/jfs_filsys.h | 285 + kernel/fs/jfs/jfs_imap.c | 3178 +++++ kernel/fs/jfs/jfs_imap.h | 175 + kernel/fs/jfs/jfs_incore.h | 228 + kernel/fs/jfs/jfs_inode.c | 165 + kernel/fs/jfs/jfs_inode.h | 53 + kernel/fs/jfs/jfs_lock.h | 52 + kernel/fs/jfs/jfs_logmgr.c | 2533 ++++ kernel/fs/jfs/jfs_logmgr.h | 513 + kernel/fs/jfs/jfs_metapage.c | 837 ++ kernel/fs/jfs/jfs_metapage.h | 154 + kernel/fs/jfs/jfs_mount.c | 507 + kernel/fs/jfs/jfs_superblock.h | 122 + kernel/fs/jfs/jfs_txnmgr.c | 3093 +++++ kernel/fs/jfs/jfs_txnmgr.h | 311 + kernel/fs/jfs/jfs_types.h | 170 + kernel/fs/jfs/jfs_umount.c | 168 + kernel/fs/jfs/jfs_unicode.c | 138 + kernel/fs/jfs/jfs_unicode.h | 156 + kernel/fs/jfs/jfs_uniupr.c | 134 + kernel/fs/jfs/jfs_xattr.h | 77 + kernel/fs/jfs/jfs_xtree.c | 3903 ++++++ kernel/fs/jfs/jfs_xtree.h | 125 + kernel/fs/jfs/namei.c | 1606 +++ kernel/fs/jfs/resize.c | 543 + kernel/fs/jfs/super.c | 1012 ++ kernel/fs/jfs/symlink.c | 52 + kernel/fs/jfs/xattr.c | 1106 ++ kernel/fs/kernfs/Kconfig | 7 + kernel/fs/kernfs/Makefile | 5 + kernel/fs/kernfs/dir.c | 1464 +++ kernel/fs/kernfs/file.c | 954 ++ kernel/fs/kernfs/inode.c | 372 + kernel/fs/kernfs/kernfs-internal.h | 119 + kernel/fs/kernfs/mount.c | 249 + kernel/fs/kernfs/symlink.c | 147 + kernel/fs/libfs.c | 1191 ++ kernel/fs/lockd/Makefile | 11 + kernel/fs/lockd/clnt4xdr.c | 599 + kernel/fs/lockd/clntlock.c | 301 + kernel/fs/lockd/clntproc.c | 849 ++ kernel/fs/lockd/clntxdr.c | 621 + kernel/fs/lockd/host.c | 675 + kernel/fs/lockd/mon.c | 622 + kernel/fs/lockd/netns.h | 22 + kernel/fs/lockd/procfs.c | 92 + kernel/fs/lockd/procfs.h | 28 + kernel/fs/lockd/svc.c | 695 + kernel/fs/lockd/svc4proc.c | 505 + kernel/fs/lockd/svclock.c | 939 ++ kernel/fs/lockd/svcproc.c | 549 + kernel/fs/lockd/svcshare.c | 106 + kernel/fs/lockd/svcsubs.c | 457 + kernel/fs/lockd/xdr.c | 337 + kernel/fs/lockd/xdr4.c | 334 + kernel/fs/locks.c | 2703 ++++ kernel/fs/logfs/Kconfig | 17 + kernel/fs/logfs/Makefile | 13 + kernel/fs/logfs/compr.c | 95 + kernel/fs/logfs/dev_bdev.c | 321 + kernel/fs/logfs/dev_mtd.c | 274 + kernel/fs/logfs/dir.c | 801 ++ kernel/fs/logfs/file.c | 285 + kernel/fs/logfs/gc.c | 732 ++ kernel/fs/logfs/inode.c | 426 + kernel/fs/logfs/journal.c | 894 ++ kernel/fs/logfs/logfs.h | 736 ++ kernel/fs/logfs/logfs_abi.h | 629 + kernel/fs/logfs/readwrite.c | 2298 ++++ kernel/fs/logfs/segment.c | 961 ++ kernel/fs/logfs/super.c | 653 + kernel/fs/mbcache.c | 858 ++ kernel/fs/minix/Kconfig | 25 + kernel/fs/minix/Makefile | 7 + kernel/fs/minix/bitmap.c | 272 + kernel/fs/minix/dir.c | 473 + kernel/fs/minix/file.c | 51 + kernel/fs/minix/inode.c | 690 + kernel/fs/minix/itree_common.c | 364 + kernel/fs/minix/itree_v1.c | 67 + kernel/fs/minix/itree_v2.c | 76 + kernel/fs/minix/minix.h | 169 + kernel/fs/minix/namei.c | 270 + kernel/fs/mount.h | 140 + kernel/fs/mpage.c | 717 ++ kernel/fs/namei.c | 4552 +++++++ kernel/fs/namespace.c | 3298 +++++ kernel/fs/ncpfs/Kconfig | 108 + kernel/fs/ncpfs/Makefile | 16 + kernel/fs/ncpfs/dir.c | 1237 ++ kernel/fs/ncpfs/file.c | 262 + kernel/fs/ncpfs/getopt.c | 75 + kernel/fs/ncpfs/getopt.h | 16 + kernel/fs/ncpfs/inode.c | 1072 ++ kernel/fs/ncpfs/ioctl.c | 919 ++ kernel/fs/ncpfs/mmap.c | 125 + kernel/fs/ncpfs/ncp_fs.h | 100 + kernel/fs/ncpfs/ncp_fs_i.h | 30 + kernel/fs/ncpfs/ncp_fs_sb.h | 174 + kernel/fs/ncpfs/ncplib_kernel.c | 1321 ++ kernel/fs/ncpfs/ncplib_kernel.h | 214 + kernel/fs/ncpfs/ncpsign_kernel.c | 127 + kernel/fs/ncpfs/ncpsign_kernel.h | 26 + kernel/fs/ncpfs/sock.c | 881 ++ kernel/fs/ncpfs/symlink.c | 181 + kernel/fs/nfs/Kconfig | 202 + kernel/fs/nfs/Makefile | 36 + kernel/fs/nfs/blocklayout/Makefile | 6 + kernel/fs/nfs/blocklayout/blocklayout.c | 928 ++ kernel/fs/nfs/blocklayout/blocklayout.h | 204 + kernel/fs/nfs/blocklayout/dev.c | 363 + kernel/fs/nfs/blocklayout/extent_tree.c | 602 + kernel/fs/nfs/blocklayout/rpc_pipefs.c | 288 + kernel/fs/nfs/cache_lib.c | 158 + kernel/fs/nfs/cache_lib.h | 31 + kernel/fs/nfs/callback.c | 499 + kernel/fs/nfs/callback.h | 214 + kernel/fs/nfs/callback_proc.c | 554 + kernel/fs/nfs/callback_xdr.c | 1033 ++ kernel/fs/nfs/client.c | 1472 +++ kernel/fs/nfs/delegation.c | 902 ++ kernel/fs/nfs/delegation.h | 73 + kernel/fs/nfs/dir.c | 2520 ++++ kernel/fs/nfs/direct.c | 1066 ++ kernel/fs/nfs/dns_resolve.c | 470 + kernel/fs/nfs/dns_resolve.h | 36 + kernel/fs/nfs/file.c | 947 ++ kernel/fs/nfs/filelayout/Makefile | 5 + kernel/fs/nfs/filelayout/filelayout.c | 1162 ++ kernel/fs/nfs/filelayout/filelayout.h | 117 + kernel/fs/nfs/filelayout/filelayoutdev.c | 299 + kernel/fs/nfs/flexfilelayout/Makefile | 5 + kernel/fs/nfs/flexfilelayout/flexfilelayout.c | 1535 +++ kernel/fs/nfs/flexfilelayout/flexfilelayout.h | 155 + kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c | 552 + kernel/fs/nfs/fscache-index.c | 336 + kernel/fs/nfs/fscache.c | 439 + kernel/fs/nfs/fscache.h | 229 + kernel/fs/nfs/getroot.c | 133 + kernel/fs/nfs/inode.c | 2066 +++ kernel/fs/nfs/internal.h | 683 + kernel/fs/nfs/iostat.h | 76 + kernel/fs/nfs/mount_clnt.c | 535 + kernel/fs/nfs/namespace.c | 289 + kernel/fs/nfs/netns.h | 40 + kernel/fs/nfs/nfs.h | 29 + kernel/fs/nfs/nfs2super.c | 31 + kernel/fs/nfs/nfs2xdr.c | 1151 ++ kernel/fs/nfs/nfs3_fs.h | 36 + kernel/fs/nfs/nfs3acl.c | 296 + kernel/fs/nfs/nfs3client.c | 107 + kernel/fs/nfs/nfs3proc.c | 974 ++ kernel/fs/nfs/nfs3super.c | 35 + kernel/fs/nfs/nfs3xdr.c | 2564 ++++ kernel/fs/nfs/nfs42.h | 16 + kernel/fs/nfs/nfs42proc.c | 167 + kernel/fs/nfs/nfs42xdr.c | 249 + kernel/fs/nfs/nfs4_fs.h | 529 + kernel/fs/nfs/nfs4client.c | 1232 ++ kernel/fs/nfs/nfs4file.c | 189 + kernel/fs/nfs/nfs4getroot.c | 50 + kernel/fs/nfs/nfs4idmap.c | 792 ++ kernel/fs/nfs/nfs4idmap.h | 68 + kernel/fs/nfs/nfs4namespace.c | 523 + kernel/fs/nfs/nfs4proc.c | 8687 +++++++++++++ kernel/fs/nfs/nfs4renewd.c | 143 + kernel/fs/nfs/nfs4session.c | 565 + kernel/fs/nfs/nfs4session.h | 160 + kernel/fs/nfs/nfs4state.c | 2455 ++++ kernel/fs/nfs/nfs4super.c | 361 + kernel/fs/nfs/nfs4sysctl.c | 69 + kernel/fs/nfs/nfs4trace.c | 17 + kernel/fs/nfs/nfs4trace.h | 1148 ++ kernel/fs/nfs/nfs4xdr.c | 7443 +++++++++++ kernel/fs/nfs/nfsroot.c | 309 + kernel/fs/nfs/nfstrace.c | 12 + kernel/fs/nfs/nfstrace.h | 730 ++ kernel/fs/nfs/objlayout/Kbuild | 5 + kernel/fs/nfs/objlayout/objio_osd.c | 678 + kernel/fs/nfs/objlayout/objlayout.c | 706 ++ kernel/fs/nfs/objlayout/objlayout.h | 184 + kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 415 + kernel/fs/nfs/pagelist.c | 1309 ++ kernel/fs/nfs/pnfs.c | 2249 ++++ kernel/fs/nfs/pnfs.h | 692 + kernel/fs/nfs/pnfs_dev.c | 365 + kernel/fs/nfs/pnfs_nfs.c | 880 ++ kernel/fs/nfs/proc.c | 749 ++ kernel/fs/nfs/read.c | 445 + kernel/fs/nfs/super.c | 2877 +++++ kernel/fs/nfs/symlink.c | 78 + kernel/fs/nfs/sysctl.c | 64 + kernel/fs/nfs/unlink.c | 604 + kernel/fs/nfs/write.c | 2019 +++ kernel/fs/nfs_common/Makefile | 8 + kernel/fs/nfs_common/grace.c | 113 + kernel/fs/nfs_common/nfsacl.c | 296 + kernel/fs/nfsd/Kconfig | 117 + kernel/fs/nfsd/Makefile | 20 + kernel/fs/nfsd/acl.h | 59 + kernel/fs/nfsd/auth.c | 90 + kernel/fs/nfsd/auth.h | 16 + kernel/fs/nfsd/blocklayout.c | 200 + kernel/fs/nfsd/blocklayoutxdr.c | 157 + kernel/fs/nfsd/blocklayoutxdr.h | 62 + kernel/fs/nfsd/cache.h | 86 + kernel/fs/nfsd/current_stateid.h | 28 + kernel/fs/nfsd/export.c | 1345 ++ kernel/fs/nfsd/export.h | 113 + kernel/fs/nfsd/fault_inject.c | 150 + kernel/fs/nfsd/idmap.h | 62 + kernel/fs/nfsd/lockd.c | 77 + kernel/fs/nfsd/netns.h | 121 + kernel/fs/nfsd/nfs2acl.c | 382 + kernel/fs/nfsd/nfs3acl.c | 273 + kernel/fs/nfsd/nfs3proc.c | 891 ++ kernel/fs/nfsd/nfs3xdr.c | 1114 ++ kernel/fs/nfsd/nfs4acl.c | 896 ++ kernel/fs/nfsd/nfs4callback.c | 1099 ++ kernel/fs/nfsd/nfs4idmap.c | 666 + kernel/fs/nfsd/nfs4layouts.c | 723 ++ kernel/fs/nfsd/nfs4proc.c | 2368 ++++ kernel/fs/nfsd/nfs4recover.c | 1551 +++ kernel/fs/nfsd/nfs4state.c | 6716 ++++++++++ kernel/fs/nfsd/nfs4xdr.c | 4453 +++++++ kernel/fs/nfsd/nfscache.c | 637 + kernel/fs/nfsd/nfsctl.c | 1318 ++ kernel/fs/nfsd/nfsd.h | 422 + kernel/fs/nfsd/nfsfh.c | 692 + kernel/fs/nfsd/nfsfh.h | 292 + kernel/fs/nfsd/nfsproc.c | 759 ++ kernel/fs/nfsd/nfssvc.c | 752 ++ kernel/fs/nfsd/nfsxdr.c | 550 + kernel/fs/nfsd/pnfs.h | 86 + kernel/fs/nfsd/state.h | 662 + kernel/fs/nfsd/stats.c | 104 + kernel/fs/nfsd/stats.h | 43 + kernel/fs/nfsd/trace.c | 5 + kernel/fs/nfsd/trace.h | 54 + kernel/fs/nfsd/vfs.c | 2154 ++++ kernel/fs/nfsd/vfs.h | 131 + kernel/fs/nfsd/xdr.h | 173 + kernel/fs/nfsd/xdr3.h | 349 + kernel/fs/nfsd/xdr4.h | 724 ++ kernel/fs/nfsd/xdr4cb.h | 30 + kernel/fs/nilfs2/Kconfig | 24 + kernel/fs/nilfs2/Makefile | 5 + kernel/fs/nilfs2/alloc.c | 785 ++ kernel/fs/nilfs2/alloc.h | 110 + kernel/fs/nilfs2/bmap.c | 593 + kernel/fs/nilfs2/bmap.h | 280 + kernel/fs/nilfs2/btnode.c | 297 + kernel/fs/nilfs2/btnode.h | 59 + kernel/fs/nilfs2/btree.c | 2414 ++++ kernel/fs/nilfs2/btree.h | 77 + kernel/fs/nilfs2/cpfile.c | 1027 ++ kernel/fs/nilfs2/cpfile.h | 46 + kernel/fs/nilfs2/dat.c | 529 + kernel/fs/nilfs2/dat.h | 59 + kernel/fs/nilfs2/dir.c | 676 + kernel/fs/nilfs2/direct.c | 386 + kernel/fs/nilfs2/direct.h | 51 + kernel/fs/nilfs2/export.h | 25 + kernel/fs/nilfs2/file.c | 165 + kernel/fs/nilfs2/gcinode.c | 197 + kernel/fs/nilfs2/ifile.c | 227 + kernel/fs/nilfs2/ifile.h | 58 + kernel/fs/nilfs2/inode.c | 1135 ++ kernel/fs/nilfs2/ioctl.c | 1379 ++ kernel/fs/nilfs2/mdt.c | 652 + kernel/fs/nilfs2/mdt.h | 123 + kernel/fs/nilfs2/namei.c | 585 + kernel/fs/nilfs2/nilfs.h | 354 + kernel/fs/nilfs2/page.c | 581 + kernel/fs/nilfs2/page.h | 80 + kernel/fs/nilfs2/recovery.c | 964 ++ kernel/fs/nilfs2/segbuf.c | 536 + kernel/fs/nilfs2/segbuf.h | 184 + kernel/fs/nilfs2/segment.c | 2758 ++++ kernel/fs/nilfs2/segment.h | 251 + kernel/fs/nilfs2/sufile.c | 1222 ++ kernel/fs/nilfs2/sufile.h | 146 + kernel/fs/nilfs2/super.c | 1486 +++ kernel/fs/nilfs2/sysfs.c | 1137 ++ kernel/fs/nilfs2/sysfs.h | 176 + kernel/fs/nilfs2/the_nilfs.c | 815 ++ kernel/fs/nilfs2/the_nilfs.h | 396 + kernel/fs/nls/Kconfig | 619 + kernel/fs/nls/Makefile | 55 + kernel/fs/nls/mac-celtic.c | 601 + kernel/fs/nls/mac-centeuro.c | 531 + kernel/fs/nls/mac-croatian.c | 601 + kernel/fs/nls/mac-cyrillic.c | 496 + kernel/fs/nls/mac-gaelic.c | 566 + kernel/fs/nls/mac-greek.c | 496 + kernel/fs/nls/mac-iceland.c | 601 + kernel/fs/nls/mac-inuit.c | 531 + kernel/fs/nls/mac-roman.c | 636 + kernel/fs/nls/mac-romanian.c | 601 + kernel/fs/nls/mac-turkish.c | 601 + kernel/fs/nls/nls_ascii.c | 166 + kernel/fs/nls/nls_base.c | 548 + kernel/fs/nls/nls_cp1250.c | 346 + kernel/fs/nls/nls_cp1251.c | 301 + kernel/fs/nls/nls_cp1255.c | 384 + kernel/fs/nls/nls_cp437.c | 387 + kernel/fs/nls/nls_cp737.c | 350 + kernel/fs/nls/nls_cp775.c | 319 + kernel/fs/nls/nls_cp850.c | 315 + kernel/fs/nls/nls_cp852.c | 337 + kernel/fs/nls/nls_cp855.c | 299 + kernel/fs/nls/nls_cp857.c | 301 + kernel/fs/nls/nls_cp860.c | 364 + kernel/fs/nls/nls_cp861.c | 387 + kernel/fs/nls/nls_cp862.c | 421 + kernel/fs/nls/nls_cp863.c | 381 + kernel/fs/nls/nls_cp864.c | 407 + kernel/fs/nls/nls_cp865.c | 387 + kernel/fs/nls/nls_cp866.c | 305 + kernel/fs/nls/nls_cp869.c | 315 + kernel/fs/nls/nls_cp874.c | 275 + kernel/fs/nls/nls_cp932.c | 7933 ++++++++++++ kernel/fs/nls/nls_cp936.c | 11111 ++++++++++++++++ kernel/fs/nls/nls_cp949.c | 13946 +++++++++++++++++++++ kernel/fs/nls/nls_cp950.c | 9482 ++++++++++++++ kernel/fs/nls/nls_euc-jp.c | 580 + kernel/fs/nls/nls_iso8859-1.c | 257 + kernel/fs/nls/nls_iso8859-13.c | 285 + kernel/fs/nls/nls_iso8859-14.c | 341 + kernel/fs/nls/nls_iso8859-15.c | 307 + kernel/fs/nls/nls_iso8859-2.c | 308 + kernel/fs/nls/nls_iso8859-3.c | 308 + kernel/fs/nls/nls_iso8859-4.c | 308 + kernel/fs/nls/nls_iso8859-5.c | 272 + kernel/fs/nls/nls_iso8859-6.c | 263 + kernel/fs/nls/nls_iso8859-7.c | 317 + kernel/fs/nls/nls_iso8859-9.c | 272 + kernel/fs/nls/nls_koi8-r.c | 323 + kernel/fs/nls/nls_koi8-ru.c | 82 + kernel/fs/nls/nls_koi8-u.c | 330 + kernel/fs/nls/nls_utf8.c | 67 + kernel/fs/no-block.c | 23 + kernel/fs/notify/Kconfig | 7 + kernel/fs/notify/Makefile | 6 + kernel/fs/notify/dnotify/Kconfig | 11 + kernel/fs/notify/dnotify/Makefile | 1 + kernel/fs/notify/dnotify/dnotify.c | 388 + kernel/fs/notify/fanotify/Kconfig | 26 + kernel/fs/notify/fanotify/Makefile | 1 + kernel/fs/notify/fanotify/fanotify.c | 270 + kernel/fs/notify/fanotify/fanotify.h | 50 + kernel/fs/notify/fanotify/fanotify_user.c | 940 ++ kernel/fs/notify/fdinfo.c | 165 + kernel/fs/notify/fdinfo.h | 27 + kernel/fs/notify/fsnotify.c | 299 + kernel/fs/notify/fsnotify.h | 60 + kernel/fs/notify/group.c | 118 + kernel/fs/notify/inode_mark.c | 247 + kernel/fs/notify/inotify/Kconfig | 17 + kernel/fs/notify/inotify/Makefile | 1 + kernel/fs/notify/inotify/inotify.h | 32 + kernel/fs/notify/inotify/inotify_fsnotify.c | 184 + kernel/fs/notify/inotify/inotify_user.c | 815 ++ kernel/fs/notify/mark.c | 494 + kernel/fs/notify/notification.c | 213 + kernel/fs/notify/vfsmount_mark.c | 127 + kernel/fs/nsfs.c | 161 + kernel/fs/ntfs/Kconfig | 78 + kernel/fs/ntfs/Makefile | 14 + kernel/fs/ntfs/aops.c | 1773 +++ kernel/fs/ntfs/aops.h | 107 + kernel/fs/ntfs/attrib.c | 2614 ++++ kernel/fs/ntfs/attrib.h | 116 + kernel/fs/ntfs/bitmap.c | 193 + kernel/fs/ntfs/bitmap.h | 118 + kernel/fs/ntfs/collate.c | 124 + kernel/fs/ntfs/collate.h | 50 + kernel/fs/ntfs/compress.c | 969 ++ kernel/fs/ntfs/debug.c | 173 + kernel/fs/ntfs/debug.h | 71 + kernel/fs/ntfs/dir.c | 1553 +++ kernel/fs/ntfs/dir.h | 48 + kernel/fs/ntfs/endian.h | 93 + kernel/fs/ntfs/file.c | 2043 +++ kernel/fs/ntfs/index.c | 454 + kernel/fs/ntfs/index.h | 148 + kernel/fs/ntfs/inode.c | 3111 +++++ kernel/fs/ntfs/inode.h | 325 + kernel/fs/ntfs/layout.h | 2435 ++++ kernel/fs/ntfs/lcnalloc.c | 1014 ++ kernel/fs/ntfs/lcnalloc.h | 145 + kernel/fs/ntfs/logfile.c | 863 ++ kernel/fs/ntfs/logfile.h | 309 + kernel/fs/ntfs/malloc.h | 96 + kernel/fs/ntfs/mft.c | 2917 +++++ kernel/fs/ntfs/mft.h | 124 + kernel/fs/ntfs/mst.c | 203 + kernel/fs/ntfs/namei.c | 405 + kernel/fs/ntfs/ntfs.h | 164 + kernel/fs/ntfs/quota.c | 117 + kernel/fs/ntfs/quota.h | 35 + kernel/fs/ntfs/runlist.c | 1907 +++ kernel/fs/ntfs/runlist.h | 102 + kernel/fs/ntfs/super.c | 3220 +++++ kernel/fs/ntfs/sysctl.c | 83 + kernel/fs/ntfs/sysctl.h | 41 + kernel/fs/ntfs/time.h | 100 + kernel/fs/ntfs/types.h | 69 + kernel/fs/ntfs/unistr.c | 398 + kernel/fs/ntfs/upcase.c | 87 + kernel/fs/ntfs/usnjrnl.c | 84 + kernel/fs/ntfs/usnjrnl.h | 205 + kernel/fs/ntfs/volume.h | 178 + kernel/fs/ocfs2/Kconfig | 76 + kernel/fs/ocfs2/Makefile | 53 + kernel/fs/ocfs2/acl.c | 310 + kernel/fs/ocfs2/acl.h | 39 + kernel/fs/ocfs2/alloc.c | 7405 +++++++++++ kernel/fs/ocfs2/alloc.h | 324 + kernel/fs/ocfs2/aops.c | 2448 ++++ kernel/fs/ocfs2/aops.h | 105 + kernel/fs/ocfs2/blockcheck.c | 647 + kernel/fs/ocfs2/blockcheck.h | 107 + kernel/fs/ocfs2/buffer_head_io.c | 427 + kernel/fs/ocfs2/buffer_head_io.h | 77 + kernel/fs/ocfs2/cluster/Makefile | 4 + kernel/fs/ocfs2/cluster/heartbeat.c | 2678 ++++ kernel/fs/ocfs2/cluster/heartbeat.h | 90 + kernel/fs/ocfs2/cluster/masklog.c | 155 + kernel/fs/ocfs2/cluster/masklog.h | 221 + kernel/fs/ocfs2/cluster/netdebug.c | 539 + kernel/fs/ocfs2/cluster/nodemanager.c | 987 ++ kernel/fs/ocfs2/cluster/nodemanager.h | 88 + kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h | 38 + kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h | 45 + kernel/fs/ocfs2/cluster/quorum.c | 340 + kernel/fs/ocfs2/cluster/quorum.h | 36 + kernel/fs/ocfs2/cluster/sys.c | 82 + kernel/fs/ocfs2/cluster/sys.h | 33 + kernel/fs/ocfs2/cluster/tcp.c | 2219 ++++ kernel/fs/ocfs2/cluster/tcp.h | 155 + kernel/fs/ocfs2/cluster/tcp_internal.h | 242 + kernel/fs/ocfs2/dcache.c | 474 + kernel/fs/ocfs2/dcache.h | 59 + kernel/fs/ocfs2/dir.c | 4501 +++++++ kernel/fs/ocfs2/dir.h | 116 + kernel/fs/ocfs2/dlm/Makefile | 7 + kernel/fs/ocfs2/dlm/dlmapi.h | 220 + kernel/fs/ocfs2/dlm/dlmast.c | 504 + kernel/fs/ocfs2/dlm/dlmcommon.h | 1150 ++ kernel/fs/ocfs2/dlm/dlmconvert.c | 544 + kernel/fs/ocfs2/dlm/dlmconvert.h | 35 + kernel/fs/ocfs2/dlm/dlmdebug.c | 1000 ++ kernel/fs/ocfs2/dlm/dlmdebug.h | 81 + kernel/fs/ocfs2/dlm/dlmdomain.c | 2375 ++++ kernel/fs/ocfs2/dlm/dlmdomain.h | 35 + kernel/fs/ocfs2/dlm/dlmlock.c | 761 ++ kernel/fs/ocfs2/dlm/dlmmaster.c | 3487 ++++++ kernel/fs/ocfs2/dlm/dlmrecovery.c | 2935 +++++ kernel/fs/ocfs2/dlm/dlmthread.c | 756 ++ kernel/fs/ocfs2/dlm/dlmunlock.c | 698 ++ kernel/fs/ocfs2/dlmfs/Makefile | 5 + kernel/fs/ocfs2/dlmfs/dlmfs.c | 690 + kernel/fs/ocfs2/dlmfs/userdlm.c | 688 + kernel/fs/ocfs2/dlmfs/userdlm.h | 113 + kernel/fs/ocfs2/dlmglue.c | 4106 ++++++ kernel/fs/ocfs2/dlmglue.h | 173 + kernel/fs/ocfs2/export.c | 271 + kernel/fs/ocfs2/export.h | 33 + kernel/fs/ocfs2/extent_map.c | 994 ++ kernel/fs/ocfs2/extent_map.h | 92 + kernel/fs/ocfs2/file.c | 2702 ++++ kernel/fs/ocfs2/file.h | 85 + kernel/fs/ocfs2/heartbeat.c | 134 + kernel/fs/ocfs2/heartbeat.h | 45 + kernel/fs/ocfs2/inode.c | 1460 +++ kernel/fs/ocfs2/inode.h | 190 + kernel/fs/ocfs2/ioctl.c | 1005 ++ kernel/fs/ocfs2/ioctl.h | 16 + kernel/fs/ocfs2/journal.c | 2323 ++++ kernel/fs/ocfs2/journal.h | 645 + kernel/fs/ocfs2/localalloc.c | 1343 ++ kernel/fs/ocfs2/localalloc.h | 68 + kernel/fs/ocfs2/locks.c | 141 + kernel/fs/ocfs2/locks.h | 32 + kernel/fs/ocfs2/mmap.c | 193 + kernel/fs/ocfs2/mmap.h | 6 + kernel/fs/ocfs2/move_extents.c | 1075 ++ kernel/fs/ocfs2/move_extents.h | 22 + kernel/fs/ocfs2/namei.c | 2921 +++++ kernel/fs/ocfs2/namei.h | 51 + kernel/fs/ocfs2/ocfs1_fs_compat.h | 109 + kernel/fs/ocfs2/ocfs2.h | 919 ++ kernel/fs/ocfs2/ocfs2_fs.h | 1651 +++ kernel/fs/ocfs2/ocfs2_ioctl.h | 242 + kernel/fs/ocfs2/ocfs2_lockid.h | 128 + kernel/fs/ocfs2/ocfs2_lockingver.h | 32 + kernel/fs/ocfs2/ocfs2_trace.h | 2768 ++++ kernel/fs/ocfs2/quota.h | 123 + kernel/fs/ocfs2/quota_global.c | 971 ++ kernel/fs/ocfs2/quota_local.c | 1315 ++ kernel/fs/ocfs2/refcounttree.c | 4474 +++++++ kernel/fs/ocfs2/refcounttree.h | 118 + kernel/fs/ocfs2/reservations.c | 839 ++ kernel/fs/ocfs2/reservations.h | 159 + kernel/fs/ocfs2/resize.c | 589 + kernel/fs/ocfs2/resize.h | 32 + kernel/fs/ocfs2/slot_map.c | 538 + kernel/fs/ocfs2/slot_map.h | 44 + kernel/fs/ocfs2/stack_o2cb.c | 449 + kernel/fs/ocfs2/stack_user.c | 1134 ++ kernel/fs/ocfs2/stackglue.c | 748 ++ kernel/fs/ocfs2/stackglue.h | 301 + kernel/fs/ocfs2/suballoc.c | 2919 +++++ kernel/fs/ocfs2/suballoc.h | 237 + kernel/fs/ocfs2/super.c | 2644 ++++ kernel/fs/ocfs2/super.h | 53 + kernel/fs/ocfs2/symlink.c | 100 + kernel/fs/ocfs2/symlink.h | 42 + kernel/fs/ocfs2/sysfile.c | 182 + kernel/fs/ocfs2/sysfile.h | 33 + kernel/fs/ocfs2/uptodate.c | 638 + kernel/fs/ocfs2/uptodate.h | 84 + kernel/fs/ocfs2/xattr.c | 7425 +++++++++++ kernel/fs/ocfs2/xattr.h | 100 + kernel/fs/omfs/Kconfig | 13 + kernel/fs/omfs/Makefile | 4 + kernel/fs/omfs/bitmap.c | 193 + kernel/fs/omfs/dir.c | 457 + kernel/fs/omfs/file.c | 383 + kernel/fs/omfs/inode.c | 596 + kernel/fs/omfs/omfs.h | 68 + kernel/fs/omfs/omfs_fs.h | 82 + kernel/fs/open.c | 1143 ++ kernel/fs/openpromfs/Makefile | 7 + kernel/fs/openpromfs/inode.c | 471 + kernel/fs/overlayfs/Kconfig | 10 + kernel/fs/overlayfs/Makefile | 7 + kernel/fs/overlayfs/copy_up.c | 416 + kernel/fs/overlayfs/dir.c | 951 ++ kernel/fs/overlayfs/inode.c | 436 + kernel/fs/overlayfs/overlayfs.h | 199 + kernel/fs/overlayfs/readdir.c | 557 + kernel/fs/overlayfs/super.c | 1046 ++ kernel/fs/pipe.c | 1125 ++ kernel/fs/pnode.c | 452 + kernel/fs/pnode.h | 57 + kernel/fs/posix_acl.c | 905 ++ kernel/fs/proc/Kconfig | 73 + kernel/fs/proc/Makefile | 32 + kernel/fs/proc/array.c | 695 + kernel/fs/proc/base.c | 3214 +++++ kernel/fs/proc/cmdline.c | 29 + kernel/fs/proc/consoles.c | 112 + kernel/fs/proc/cpuinfo.c | 24 + kernel/fs/proc/devices.c | 70 + kernel/fs/proc/fd.c | 357 + kernel/fs/proc/fd.h | 19 + kernel/fs/proc/generic.c | 645 + kernel/fs/proc/inode.c | 489 + kernel/fs/proc/internal.h | 305 + kernel/fs/proc/interrupts.c | 53 + kernel/fs/proc/kcore.c | 644 + kernel/fs/proc/kmsg.c | 64 + kernel/fs/proc/loadavg.c | 45 + kernel/fs/proc/meminfo.c | 234 + kernel/fs/proc/namespaces.c | 172 + kernel/fs/proc/nommu.c | 134 + kernel/fs/proc/page.c | 234 + kernel/fs/proc/proc_net.c | 233 + kernel/fs/proc/proc_sysctl.c | 1619 +++ kernel/fs/proc/proc_tty.c | 189 + kernel/fs/proc/root.c | 270 + kernel/fs/proc/self.c | 84 + kernel/fs/proc/softirqs.c | 44 + kernel/fs/proc/stat.c | 206 + kernel/fs/proc/task_mmu.c | 1625 +++ kernel/fs/proc/task_nommu.c | 345 + kernel/fs/proc/thread_self.c | 85 + kernel/fs/proc/uptime.c | 52 + kernel/fs/proc/version.c | 34 + kernel/fs/proc/vmcore.c | 1194 ++ kernel/fs/proc_namespace.c | 335 + kernel/fs/pstore/Kconfig | 62 + kernel/fs/pstore/Makefile | 13 + kernel/fs/pstore/ftrace.c | 131 + kernel/fs/pstore/inode.c | 483 + kernel/fs/pstore/internal.h | 64 + kernel/fs/pstore/platform.c | 547 + kernel/fs/pstore/pmsg.c | 114 + kernel/fs/pstore/ram.c | 648 + kernel/fs/pstore/ram_core.c | 539 + kernel/fs/qnx4/Kconfig | 14 + kernel/fs/qnx4/Makefile | 7 + kernel/fs/qnx4/README | 9 + kernel/fs/qnx4/bitmap.c | 44 + kernel/fs/qnx4/dir.c | 81 + kernel/fs/qnx4/inode.c | 426 + kernel/fs/qnx4/namei.c | 125 + kernel/fs/qnx4/qnx4.h | 45 + kernel/fs/qnx6/Kconfig | 26 + kernel/fs/qnx6/Makefile | 8 + kernel/fs/qnx6/README | 8 + kernel/fs/qnx6/dir.c | 286 + kernel/fs/qnx6/inode.c | 685 + kernel/fs/qnx6/namei.c | 42 + kernel/fs/qnx6/qnx6.h | 135 + kernel/fs/qnx6/super_mmi.c | 148 + kernel/fs/quota/Kconfig | 76 + kernel/fs/quota/Makefile | 7 + kernel/fs/quota/compat.c | 118 + kernel/fs/quota/dquot.c | 2889 +++++ kernel/fs/quota/kqid.c | 132 + kernel/fs/quota/netlink.c | 109 + kernel/fs/quota/quota.c | 808 ++ kernel/fs/quota/quota_tree.c | 670 + kernel/fs/quota/quota_tree.h | 25 + kernel/fs/quota/quota_v1.c | 235 + kernel/fs/quota/quota_v2.c | 346 + kernel/fs/quota/quotaio_v1.h | 33 + kernel/fs/quota/quotaio_v2.h | 75 + kernel/fs/ramfs/Makefile | 9 + kernel/fs/ramfs/file-mmu.c | 46 + kernel/fs/ramfs/file-nommu.c | 271 + kernel/fs/ramfs/inode.c | 266 + kernel/fs/ramfs/internal.h | 13 + kernel/fs/read_write.c | 1329 ++ kernel/fs/readdir.c | 309 + kernel/fs/reiserfs/Kconfig | 88 + kernel/fs/reiserfs/Makefile | 38 + kernel/fs/reiserfs/README | 161 + kernel/fs/reiserfs/acl.h | 76 + kernel/fs/reiserfs/bitmap.c | 1468 +++ kernel/fs/reiserfs/dir.c | 346 + kernel/fs/reiserfs/do_balan.c | 1911 +++ kernel/fs/reiserfs/file.c | 270 + kernel/fs/reiserfs/fix_node.c | 2825 +++++ kernel/fs/reiserfs/hashes.c | 177 + kernel/fs/reiserfs/ibalance.c | 1160 ++ kernel/fs/reiserfs/inode.c | 3461 +++++ kernel/fs/reiserfs/ioctl.c | 230 + kernel/fs/reiserfs/item_ops.c | 752 ++ kernel/fs/reiserfs/journal.c | 4403 +++++++ kernel/fs/reiserfs/lbalance.c | 1427 +++ kernel/fs/reiserfs/lock.c | 100 + kernel/fs/reiserfs/namei.c | 1659 +++ kernel/fs/reiserfs/objectid.c | 217 + kernel/fs/reiserfs/prints.c | 777 ++ kernel/fs/reiserfs/procfs.c | 508 + kernel/fs/reiserfs/reiserfs.h | 3411 +++++ kernel/fs/reiserfs/resize.c | 229 + kernel/fs/reiserfs/stree.c | 2262 ++++ kernel/fs/reiserfs/super.c | 2563 ++++ kernel/fs/reiserfs/tail_conversion.c | 317 + kernel/fs/reiserfs/xattr.c | 1064 ++ kernel/fs/reiserfs/xattr.h | 122 + kernel/fs/reiserfs/xattr_acl.c | 407 + kernel/fs/reiserfs/xattr_security.c | 120 + kernel/fs/reiserfs/xattr_trusted.c | 56 + kernel/fs/reiserfs/xattr_user.c | 52 + kernel/fs/romfs/Kconfig | 62 + kernel/fs/romfs/Makefile | 12 + kernel/fs/romfs/internal.h | 47 + kernel/fs/romfs/mmap-nommu.c | 89 + kernel/fs/romfs/storage.c | 293 + kernel/fs/romfs/super.c | 659 + kernel/fs/select.c | 1040 ++ kernel/fs/seq_file.c | 968 ++ kernel/fs/signalfd.c | 342 + kernel/fs/splice.c | 2034 +++ kernel/fs/squashfs/Kconfig | 210 + kernel/fs/squashfs/Makefile | 17 + kernel/fs/squashfs/block.c | 214 + kernel/fs/squashfs/cache.c | 458 + kernel/fs/squashfs/decompressor.c | 148 + kernel/fs/squashfs/decompressor.h | 61 + kernel/fs/squashfs/decompressor_multi.c | 198 + kernel/fs/squashfs/decompressor_multi_percpu.c | 97 + kernel/fs/squashfs/decompressor_single.c | 85 + kernel/fs/squashfs/dir.c | 236 + kernel/fs/squashfs/export.c | 163 + kernel/fs/squashfs/file.c | 503 + kernel/fs/squashfs/file_cache.c | 38 + kernel/fs/squashfs/file_direct.c | 176 + kernel/fs/squashfs/fragment.c | 99 + kernel/fs/squashfs/id.c | 102 + kernel/fs/squashfs/inode.c | 429 + kernel/fs/squashfs/lz4_wrapper.c | 142 + kernel/fs/squashfs/lzo_wrapper.c | 130 + kernel/fs/squashfs/namei.c | 252 + kernel/fs/squashfs/page_actor.c | 100 + kernel/fs/squashfs/page_actor.h | 81 + kernel/fs/squashfs/squashfs.h | 113 + kernel/fs/squashfs/squashfs_fs.h | 457 + kernel/fs/squashfs/squashfs_fs_i.h | 54 + kernel/fs/squashfs/squashfs_fs_sb.h | 80 + kernel/fs/squashfs/super.c | 508 + kernel/fs/squashfs/symlink.c | 127 + kernel/fs/squashfs/xattr.c | 324 + kernel/fs/squashfs/xattr.h | 47 + kernel/fs/squashfs/xattr_id.c | 95 + kernel/fs/squashfs/xz_wrapper.c | 193 + kernel/fs/squashfs/zlib_wrapper.c | 135 + kernel/fs/stack.c | 76 + kernel/fs/stat.c | 511 + kernel/fs/statfs.c | 241 + kernel/fs/super.c | 1396 +++ kernel/fs/sync.c | 366 + kernel/fs/sysfs/Kconfig | 24 + kernel/fs/sysfs/Makefile | 5 + kernel/fs/sysfs/dir.c | 157 + kernel/fs/sysfs/file.c | 497 + kernel/fs/sysfs/group.c | 354 + kernel/fs/sysfs/mount.c | 79 + kernel/fs/sysfs/symlink.c | 197 + kernel/fs/sysfs/sysfs.h | 43 + kernel/fs/sysv/Kconfig | 36 + kernel/fs/sysv/Makefile | 8 + kernel/fs/sysv/balloc.c | 239 + kernel/fs/sysv/dir.c | 372 + kernel/fs/sysv/file.c | 57 + kernel/fs/sysv/ialloc.c | 234 + kernel/fs/sysv/inode.c | 370 + kernel/fs/sysv/itree.c | 501 + kernel/fs/sysv/namei.c | 290 + kernel/fs/sysv/super.c | 593 + kernel/fs/sysv/symlink.c | 20 + kernel/fs/sysv/sysv.h | 247 + kernel/fs/timerfd.c | 571 + kernel/fs/tracefs/Makefile | 4 + kernel/fs/tracefs/inode.c | 648 + kernel/fs/ubifs/Kconfig | 37 + kernel/fs/ubifs/Makefile | 6 + kernel/fs/ubifs/budget.c | 730 ++ kernel/fs/ubifs/commit.c | 734 ++ kernel/fs/ubifs/compress.c | 250 + kernel/fs/ubifs/debug.c | 3104 +++++ kernel/fs/ubifs/debug.h | 315 + kernel/fs/ubifs/dir.c | 1202 ++ kernel/fs/ubifs/file.c | 1594 +++ kernel/fs/ubifs/find.c | 985 ++ kernel/fs/ubifs/gc.c | 984 ++ kernel/fs/ubifs/io.c | 1150 ++ kernel/fs/ubifs/ioctl.c | 205 + kernel/fs/ubifs/journal.c | 1466 +++ kernel/fs/ubifs/key.h | 548 + kernel/fs/ubifs/log.c | 753 ++ kernel/fs/ubifs/lprops.c | 1321 ++ kernel/fs/ubifs/lpt.c | 2278 ++++ kernel/fs/ubifs/lpt_commit.c | 2037 +++ kernel/fs/ubifs/master.c | 395 + kernel/fs/ubifs/misc.h | 303 + kernel/fs/ubifs/orphan.c | 956 ++ kernel/fs/ubifs/recovery.c | 1547 +++ kernel/fs/ubifs/replay.c | 1082 ++ kernel/fs/ubifs/sb.c | 809 ++ kernel/fs/ubifs/scan.c | 379 + kernel/fs/ubifs/shrinker.c | 331 + kernel/fs/ubifs/super.c | 2300 ++++ kernel/fs/ubifs/tnc.c | 3327 +++++ kernel/fs/ubifs/tnc_commit.c | 1071 ++ kernel/fs/ubifs/tnc_misc.c | 494 + kernel/fs/ubifs/ubifs-media.h | 784 ++ kernel/fs/ubifs/ubifs.h | 1803 +++ kernel/fs/ubifs/xattr.c | 666 + kernel/fs/udf/Kconfig | 20 + kernel/fs/udf/Makefile | 9 + kernel/fs/udf/balloc.c | 821 ++ kernel/fs/udf/dir.c | 199 + kernel/fs/udf/directory.c | 240 + kernel/fs/udf/ecma_167.h | 796 ++ kernel/fs/udf/file.c | 273 + kernel/fs/udf/ialloc.c | 133 + kernel/fs/udf/inode.c | 2291 ++++ kernel/fs/udf/lowlevel.c | 67 + kernel/fs/udf/misc.c | 298 + kernel/fs/udf/namei.c | 1300 ++ kernel/fs/udf/osta_udf.h | 279 + kernel/fs/udf/partition.c | 338 + kernel/fs/udf/super.c | 2469 ++++ kernel/fs/udf/symlink.c | 159 + kernel/fs/udf/truncate.c | 286 + kernel/fs/udf/udf_i.h | 62 + kernel/fs/udf/udf_sb.h | 184 + kernel/fs/udf/udfdecl.h | 255 + kernel/fs/udf/udfend.h | 77 + kernel/fs/udf/udftime.c | 170 + kernel/fs/udf/unicode.c | 496 + kernel/fs/ufs/Kconfig | 43 + kernel/fs/ufs/Makefile | 9 + kernel/fs/ufs/balloc.c | 938 ++ kernel/fs/ufs/cylinder.c | 201 + kernel/fs/ufs/dir.c | 662 + kernel/fs/ufs/file.c | 44 + kernel/fs/ufs/ialloc.c | 353 + kernel/fs/ufs/inode.c | 910 ++ kernel/fs/ufs/namei.c | 356 + kernel/fs/ufs/super.c | 1524 +++ kernel/fs/ufs/swab.h | 115 + kernel/fs/ufs/symlink.c | 53 + kernel/fs/ufs/truncate.c | 523 + kernel/fs/ufs/ufs.h | 176 + kernel/fs/ufs/ufs_fs.h | 960 ++ kernel/fs/ufs/util.c | 282 + kernel/fs/ufs/util.h | 592 + kernel/fs/utimes.c | 235 + kernel/fs/xattr.c | 972 ++ kernel/fs/xfs/Kconfig | 97 + kernel/fs/xfs/Makefile | 124 + kernel/fs/xfs/kmem.c | 127 + kernel/fs/xfs/kmem.h | 125 + kernel/fs/xfs/libxfs/xfs_alloc.c | 2639 ++++ kernel/fs/xfs/libxfs/xfs_alloc.h | 237 + kernel/fs/xfs/libxfs/xfs_alloc_btree.c | 503 + kernel/fs/xfs/libxfs/xfs_alloc_btree.h | 65 + kernel/fs/xfs/libxfs/xfs_attr.c | 1456 +++ kernel/fs/xfs/libxfs/xfs_attr_leaf.c | 2773 ++++ kernel/fs/xfs/libxfs/xfs_attr_leaf.h | 110 + kernel/fs/xfs/libxfs/xfs_attr_remote.c | 626 + kernel/fs/xfs/libxfs/xfs_attr_remote.h | 27 + kernel/fs/xfs/libxfs/xfs_attr_sf.h | 70 + kernel/fs/xfs/libxfs/xfs_bit.h | 87 + kernel/fs/xfs/libxfs/xfs_bmap.c | 5945 +++++++++ kernel/fs/xfs/libxfs/xfs_bmap.h | 225 + kernel/fs/xfs/libxfs/xfs_bmap_btree.c | 883 ++ kernel/fs/xfs/libxfs/xfs_bmap_btree.h | 143 + kernel/fs/xfs/libxfs/xfs_btree.c | 4067 ++++++ kernel/fs/xfs/libxfs/xfs_btree.h | 468 + kernel/fs/xfs/libxfs/xfs_cksum.h | 63 + kernel/fs/xfs/libxfs/xfs_da_btree.c | 2660 ++++ kernel/fs/xfs/libxfs/xfs_da_btree.h | 221 + kernel/fs/xfs/libxfs/xfs_da_format.c | 908 ++ kernel/fs/xfs/libxfs/xfs_da_format.h | 873 ++ kernel/fs/xfs/libxfs/xfs_dir2.c | 731 ++ kernel/fs/xfs/libxfs/xfs_dir2.h | 320 + kernel/fs/xfs/libxfs/xfs_dir2_block.c | 1254 ++ kernel/fs/xfs/libxfs/xfs_dir2_data.c | 1049 ++ kernel/fs/xfs/libxfs/xfs_dir2_leaf.c | 1819 +++ kernel/fs/xfs/libxfs/xfs_dir2_node.c | 2270 ++++ kernel/fs/xfs/libxfs/xfs_dir2_priv.h | 134 + kernel/fs/xfs/libxfs/xfs_dir2_sf.c | 1142 ++ kernel/fs/xfs/libxfs/xfs_dquot_buf.c | 288 + kernel/fs/xfs/libxfs/xfs_format.h | 1461 +++ kernel/fs/xfs/libxfs/xfs_fs.h | 576 + kernel/fs/xfs/libxfs/xfs_ialloc.c | 2202 ++++ kernel/fs/xfs/libxfs/xfs_ialloc.h | 167 + kernel/fs/xfs/libxfs/xfs_ialloc_btree.c | 420 + kernel/fs/xfs/libxfs/xfs_ialloc_btree.h | 65 + kernel/fs/xfs/libxfs/xfs_inode_buf.c | 476 + kernel/fs/xfs/libxfs/xfs_inode_buf.h | 50 + kernel/fs/xfs/libxfs/xfs_inode_fork.c | 1902 +++ kernel/fs/xfs/libxfs/xfs_inode_fork.h | 171 + kernel/fs/xfs/libxfs/xfs_log_format.h | 679 + kernel/fs/xfs/libxfs/xfs_log_recover.h | 66 + kernel/fs/xfs/libxfs/xfs_log_rlimit.c | 148 + kernel/fs/xfs/libxfs/xfs_quota_defs.h | 159 + kernel/fs/xfs/libxfs/xfs_rtbitmap.c | 991 ++ kernel/fs/xfs/libxfs/xfs_sb.c | 803 ++ kernel/fs/xfs/libxfs/xfs_sb.h | 38 + kernel/fs/xfs/libxfs/xfs_shared.h | 243 + kernel/fs/xfs/libxfs/xfs_symlink_remote.c | 201 + kernel/fs/xfs/libxfs/xfs_trans_resv.c | 878 ++ kernel/fs/xfs/libxfs/xfs_trans_resv.h | 116 + kernel/fs/xfs/libxfs/xfs_trans_space.h | 92 + kernel/fs/xfs/libxfs/xfs_types.h | 137 + kernel/fs/xfs/mrlock.h | 90 + kernel/fs/xfs/uuid.c | 63 + kernel/fs/xfs/uuid.h | 35 + kernel/fs/xfs/xfs.h | 34 + kernel/fs/xfs/xfs_acl.c | 304 + kernel/fs/xfs/xfs_acl.h | 39 + kernel/fs/xfs/xfs_aops.c | 1931 +++ kernel/fs/xfs/xfs_aops.h | 60 + kernel/fs/xfs/xfs_attr.h | 154 + kernel/fs/xfs/xfs_attr_inactive.c | 465 + kernel/fs/xfs/xfs_attr_list.c | 653 + kernel/fs/xfs/xfs_bit.c | 118 + kernel/fs/xfs/xfs_bmap_util.c | 1920 +++ kernel/fs/xfs/xfs_bmap_util.h | 79 + kernel/fs/xfs/xfs_buf.c | 1901 +++ kernel/fs/xfs/xfs_buf.h | 393 + kernel/fs/xfs/xfs_buf_item.c | 1155 ++ kernel/fs/xfs/xfs_buf_item.h | 76 + kernel/fs/xfs/xfs_dir2_readdir.c | 681 + kernel/fs/xfs/xfs_discard.c | 239 + kernel/fs/xfs/xfs_discard.h | 10 + kernel/fs/xfs/xfs_dquot.c | 1104 ++ kernel/fs/xfs/xfs_dquot.h | 188 + kernel/fs/xfs/xfs_dquot_item.c | 443 + kernel/fs/xfs/xfs_dquot_item.h | 47 + kernel/fs/xfs/xfs_error.c | 180 + kernel/fs/xfs/xfs_error.h | 153 + kernel/fs/xfs/xfs_export.c | 254 + kernel/fs/xfs/xfs_export.h | 72 + kernel/fs/xfs/xfs_extent_busy.c | 604 + kernel/fs/xfs/xfs_extent_busy.h | 73 + kernel/fs/xfs/xfs_extfree_item.c | 507 + kernel/fs/xfs/xfs_extfree_item.h | 81 + kernel/fs/xfs/xfs_file.c | 1534 +++ kernel/fs/xfs/xfs_filestream.c | 431 + kernel/fs/xfs/xfs_filestream.h | 40 + kernel/fs/xfs/xfs_fsops.c | 850 ++ kernel/fs/xfs/xfs_fsops.h | 30 + kernel/fs/xfs/xfs_globals.c | 49 + kernel/fs/xfs/xfs_icache.c | 1418 +++ kernel/fs/xfs/xfs_icache.h | 115 + kernel/fs/xfs/xfs_icreate_item.c | 188 + kernel/fs/xfs/xfs_icreate_item.h | 34 + kernel/fs/xfs/xfs_inode.c | 3606 ++++++ kernel/fs/xfs/xfs_inode.h | 456 + kernel/fs/xfs/xfs_inode_item.c | 789 ++ kernel/fs/xfs/xfs_inode_item.h | 54 + kernel/fs/xfs/xfs_ioctl.c | 1806 +++ kernel/fs/xfs/xfs_ioctl.h | 95 + kernel/fs/xfs/xfs_ioctl32.c | 680 + kernel/fs/xfs/xfs_ioctl32.h | 238 + kernel/fs/xfs/xfs_iomap.c | 920 ++ kernel/fs/xfs/xfs_iomap.h | 32 + kernel/fs/xfs/xfs_iops.c | 1305 ++ kernel/fs/xfs/xfs_iops.h | 38 + kernel/fs/xfs/xfs_itable.c | 652 + kernel/fs/xfs/xfs_itable.h | 99 + kernel/fs/xfs/xfs_linux.h | 384 + kernel/fs/xfs/xfs_log.c | 4007 ++++++ kernel/fs/xfs/xfs_log.h | 193 + kernel/fs/xfs/xfs_log_cil.c | 998 ++ kernel/fs/xfs/xfs_log_priv.h | 561 + kernel/fs/xfs/xfs_log_recover.c | 4651 +++++++ kernel/fs/xfs/xfs_message.c | 113 + kernel/fs/xfs/xfs_message.h | 64 + kernel/fs/xfs/xfs_mount.c | 1283 ++ kernel/fs/xfs/xfs_mount.h | 335 + kernel/fs/xfs/xfs_mru_cache.c | 552 + kernel/fs/xfs/xfs_mru_cache.h | 46 + kernel/fs/xfs/xfs_pnfs.c | 329 + kernel/fs/xfs/xfs_pnfs.h | 19 + kernel/fs/xfs/xfs_qm.c | 1939 +++ kernel/fs/xfs/xfs_qm.h | 174 + kernel/fs/xfs/xfs_qm_bhv.c | 149 + kernel/fs/xfs/xfs_qm_syscalls.c | 770 ++ kernel/fs/xfs/xfs_quota.h | 155 + kernel/fs/xfs/xfs_quotaops.c | 271 + kernel/fs/xfs/xfs_rtalloc.c | 1302 ++ kernel/fs/xfs/xfs_rtalloc.h | 145 + kernel/fs/xfs/xfs_stats.c | 198 + kernel/fs/xfs/xfs_stats.h | 249 + kernel/fs/xfs/xfs_super.c | 1889 +++ kernel/fs/xfs/xfs_super.h | 79 + kernel/fs/xfs/xfs_symlink.c | 608 + kernel/fs/xfs/xfs_symlink.h | 27 + kernel/fs/xfs/xfs_sysctl.c | 243 + kernel/fs/xfs/xfs_sysctl.h | 108 + kernel/fs/xfs/xfs_sysfs.c | 239 + kernel/fs/xfs/xfs_sysfs.h | 60 + kernel/fs/xfs/xfs_trace.c | 54 + kernel/fs/xfs/xfs_trace.h | 2083 +++ kernel/fs/xfs/xfs_trans.c | 1105 ++ kernel/fs/xfs/xfs_trans.h | 245 + kernel/fs/xfs/xfs_trans_ail.c | 794 ++ kernel/fs/xfs/xfs_trans_buf.c | 802 ++ kernel/fs/xfs/xfs_trans_dquot.c | 887 ++ kernel/fs/xfs/xfs_trans_extfree.c | 133 + kernel/fs/xfs/xfs_trans_inode.c | 135 + kernel/fs/xfs/xfs_trans_priv.h | 161 + kernel/fs/xfs/xfs_xattr.c | 243 + 1804 files changed, 1171684 insertions(+) create mode 100644 kernel/fs/9p/Kconfig create mode 100644 kernel/fs/9p/Makefile create mode 100644 kernel/fs/9p/acl.c create mode 100644 kernel/fs/9p/acl.h create mode 100644 kernel/fs/9p/cache.c create mode 100644 kernel/fs/9p/cache.h create mode 100644 kernel/fs/9p/fid.c create mode 100644 kernel/fs/9p/fid.h create mode 100644 kernel/fs/9p/v9fs.c create mode 100644 kernel/fs/9p/v9fs.h create mode 100644 kernel/fs/9p/v9fs_vfs.h create mode 100644 kernel/fs/9p/vfs_addr.c create mode 100644 kernel/fs/9p/vfs_dentry.c create mode 100644 kernel/fs/9p/vfs_dir.c create mode 100644 kernel/fs/9p/vfs_file.c create mode 100644 kernel/fs/9p/vfs_inode.c create mode 100644 kernel/fs/9p/vfs_inode_dotl.c create mode 100644 kernel/fs/9p/vfs_super.c create mode 100644 kernel/fs/9p/xattr.c create mode 100644 kernel/fs/9p/xattr.h create mode 100644 kernel/fs/9p/xattr_security.c create mode 100644 kernel/fs/9p/xattr_trusted.c create mode 100644 kernel/fs/9p/xattr_user.c create mode 100644 kernel/fs/Kconfig create mode 100644 kernel/fs/Kconfig.binfmt create mode 100644 kernel/fs/Makefile create mode 100644 kernel/fs/adfs/Kconfig create mode 100644 kernel/fs/adfs/Makefile create mode 100644 kernel/fs/adfs/adfs.h create mode 100644 kernel/fs/adfs/dir.c create mode 100644 kernel/fs/adfs/dir_f.c create mode 100644 kernel/fs/adfs/dir_f.h create mode 100644 kernel/fs/adfs/dir_fplus.c create mode 100644 kernel/fs/adfs/dir_fplus.h create mode 100644 kernel/fs/adfs/file.c create mode 100644 kernel/fs/adfs/inode.c create mode 100644 kernel/fs/adfs/map.c create mode 100644 kernel/fs/adfs/super.c create mode 100644 kernel/fs/affs/Changes create mode 100644 kernel/fs/affs/Kconfig create mode 100644 kernel/fs/affs/Makefile create mode 100644 kernel/fs/affs/affs.h create mode 100644 kernel/fs/affs/amigaffs.c create mode 100644 kernel/fs/affs/bitmap.c create mode 100644 kernel/fs/affs/dir.c create mode 100644 kernel/fs/affs/file.c create mode 100644 kernel/fs/affs/inode.c create mode 100644 kernel/fs/affs/namei.c create mode 100644 kernel/fs/affs/super.c create mode 100644 kernel/fs/affs/symlink.c create mode 100644 kernel/fs/afs/Kconfig create mode 100644 kernel/fs/afs/Makefile create mode 100644 kernel/fs/afs/afs.h create mode 100644 kernel/fs/afs/afs_cm.h create mode 100644 kernel/fs/afs/afs_fs.h create mode 100644 kernel/fs/afs/afs_vl.h create mode 100644 kernel/fs/afs/cache.c create mode 100644 kernel/fs/afs/callback.c create mode 100644 kernel/fs/afs/cell.c create mode 100644 kernel/fs/afs/cmservice.c create mode 100644 kernel/fs/afs/dir.c create mode 100644 kernel/fs/afs/file.c create mode 100644 kernel/fs/afs/flock.c create mode 100644 kernel/fs/afs/fsclient.c create mode 100644 kernel/fs/afs/inode.c create mode 100644 kernel/fs/afs/internal.h create mode 100644 kernel/fs/afs/main.c create mode 100644 kernel/fs/afs/misc.c create mode 100644 kernel/fs/afs/mntpt.c create mode 100644 kernel/fs/afs/netdevices.c create mode 100644 kernel/fs/afs/proc.c create mode 100644 kernel/fs/afs/rxrpc.c create mode 100644 kernel/fs/afs/security.c create mode 100644 kernel/fs/afs/server.c create mode 100644 kernel/fs/afs/super.c create mode 100644 kernel/fs/afs/vlclient.c create mode 100644 kernel/fs/afs/vlocation.c create mode 100644 kernel/fs/afs/vnode.c create mode 100644 kernel/fs/afs/volume.c create mode 100644 kernel/fs/afs/write.c create mode 100644 kernel/fs/aio.c create mode 100644 kernel/fs/anon_inodes.c create mode 100644 kernel/fs/attr.c create mode 100644 kernel/fs/autofs4/Kconfig create mode 100644 kernel/fs/autofs4/Makefile create mode 100644 kernel/fs/autofs4/autofs_i.h create mode 100644 kernel/fs/autofs4/dev-ioctl.c create mode 100644 kernel/fs/autofs4/expire.c create mode 100644 kernel/fs/autofs4/init.c create mode 100644 kernel/fs/autofs4/inode.c create mode 100644 kernel/fs/autofs4/root.c create mode 100644 kernel/fs/autofs4/symlink.c create mode 100644 kernel/fs/autofs4/waitq.c create mode 100644 kernel/fs/bad_inode.c create mode 100644 kernel/fs/befs/ChangeLog create mode 100644 kernel/fs/befs/Kconfig create mode 100644 kernel/fs/befs/Makefile create mode 100644 kernel/fs/befs/TODO create mode 100644 kernel/fs/befs/befs.h create mode 100644 kernel/fs/befs/befs_fs_types.h create mode 100644 kernel/fs/befs/btree.c create mode 100644 kernel/fs/befs/btree.h create mode 100644 kernel/fs/befs/datastream.c create mode 100644 kernel/fs/befs/datastream.h create mode 100644 kernel/fs/befs/debug.c create mode 100644 kernel/fs/befs/endian.h create mode 100644 kernel/fs/befs/inode.c create mode 100644 kernel/fs/befs/inode.h create mode 100644 kernel/fs/befs/io.c create mode 100644 kernel/fs/befs/io.h create mode 100644 kernel/fs/befs/linuxvfs.c create mode 100644 kernel/fs/befs/super.c create mode 100644 kernel/fs/befs/super.h create mode 100644 kernel/fs/bfs/Kconfig create mode 100644 kernel/fs/bfs/Makefile create mode 100644 kernel/fs/bfs/bfs.h create mode 100644 kernel/fs/bfs/dir.c create mode 100644 kernel/fs/bfs/file.c create mode 100644 kernel/fs/bfs/inode.c create mode 100644 kernel/fs/binfmt_aout.c create mode 100644 kernel/fs/binfmt_elf.c create mode 100644 kernel/fs/binfmt_elf_fdpic.c create mode 100644 kernel/fs/binfmt_em86.c create mode 100644 kernel/fs/binfmt_flat.c create mode 100644 kernel/fs/binfmt_misc.c create mode 100644 kernel/fs/binfmt_script.c create mode 100644 kernel/fs/block_dev.c create mode 100644 kernel/fs/btrfs/Kconfig create mode 100644 kernel/fs/btrfs/Makefile create mode 100644 kernel/fs/btrfs/acl.c create mode 100644 kernel/fs/btrfs/async-thread.c create mode 100644 kernel/fs/btrfs/async-thread.h create mode 100644 kernel/fs/btrfs/backref.c create mode 100644 kernel/fs/btrfs/backref.h create mode 100644 kernel/fs/btrfs/btrfs_inode.h create mode 100644 kernel/fs/btrfs/check-integrity.c create mode 100644 kernel/fs/btrfs/check-integrity.h create mode 100644 kernel/fs/btrfs/compression.c create mode 100644 kernel/fs/btrfs/compression.h create mode 100644 kernel/fs/btrfs/ctree.c create mode 100644 kernel/fs/btrfs/ctree.h create mode 100644 kernel/fs/btrfs/delayed-inode.c create mode 100644 kernel/fs/btrfs/delayed-inode.h create mode 100644 kernel/fs/btrfs/delayed-ref.c create mode 100644 kernel/fs/btrfs/delayed-ref.h create mode 100644 kernel/fs/btrfs/dev-replace.c create mode 100644 kernel/fs/btrfs/dev-replace.h create mode 100644 kernel/fs/btrfs/dir-item.c create mode 100644 kernel/fs/btrfs/disk-io.c create mode 100644 kernel/fs/btrfs/disk-io.h create mode 100644 kernel/fs/btrfs/export.c create mode 100644 kernel/fs/btrfs/export.h create mode 100644 kernel/fs/btrfs/extent-tree.c create mode 100644 kernel/fs/btrfs/extent_io.c create mode 100644 kernel/fs/btrfs/extent_io.h create mode 100644 kernel/fs/btrfs/extent_map.c create mode 100644 kernel/fs/btrfs/extent_map.h create mode 100644 kernel/fs/btrfs/file-item.c create mode 100644 kernel/fs/btrfs/file.c create mode 100644 kernel/fs/btrfs/free-space-cache.c create mode 100644 kernel/fs/btrfs/free-space-cache.h create mode 100644 kernel/fs/btrfs/hash.c create mode 100644 kernel/fs/btrfs/hash.h create mode 100644 kernel/fs/btrfs/inode-item.c create mode 100644 kernel/fs/btrfs/inode-map.c create mode 100644 kernel/fs/btrfs/inode-map.h create mode 100644 kernel/fs/btrfs/inode.c create mode 100644 kernel/fs/btrfs/ioctl.c create mode 100644 kernel/fs/btrfs/locking.c create mode 100644 kernel/fs/btrfs/locking.h create mode 100644 kernel/fs/btrfs/lzo.c create mode 100644 kernel/fs/btrfs/math.h create mode 100644 kernel/fs/btrfs/ordered-data.c create mode 100644 kernel/fs/btrfs/ordered-data.h create mode 100644 kernel/fs/btrfs/orphan.c create mode 100644 kernel/fs/btrfs/print-tree.c create mode 100644 kernel/fs/btrfs/print-tree.h create mode 100644 kernel/fs/btrfs/props.c create mode 100644 kernel/fs/btrfs/props.h create mode 100644 kernel/fs/btrfs/qgroup.c create mode 100644 kernel/fs/btrfs/qgroup.h create mode 100644 kernel/fs/btrfs/raid56.c create mode 100644 kernel/fs/btrfs/raid56.h create mode 100644 kernel/fs/btrfs/rcu-string.h create mode 100644 kernel/fs/btrfs/reada.c create mode 100644 kernel/fs/btrfs/relocation.c create mode 100644 kernel/fs/btrfs/root-tree.c create mode 100644 kernel/fs/btrfs/scrub.c create mode 100644 kernel/fs/btrfs/send.c create mode 100644 kernel/fs/btrfs/send.h create mode 100644 kernel/fs/btrfs/struct-funcs.c create mode 100644 kernel/fs/btrfs/super.c create mode 100644 kernel/fs/btrfs/sysfs.c create mode 100644 kernel/fs/btrfs/sysfs.h create mode 100644 kernel/fs/btrfs/tests/btrfs-tests.c create mode 100644 kernel/fs/btrfs/tests/btrfs-tests.h create mode 100644 kernel/fs/btrfs/tests/extent-buffer-tests.c create mode 100644 kernel/fs/btrfs/tests/extent-io-tests.c create mode 100644 kernel/fs/btrfs/tests/free-space-tests.c create mode 100644 kernel/fs/btrfs/tests/inode-tests.c create mode 100644 kernel/fs/btrfs/tests/qgroup-tests.c create mode 100644 kernel/fs/btrfs/transaction.c create mode 100644 kernel/fs/btrfs/transaction.h create mode 100644 kernel/fs/btrfs/tree-defrag.c create mode 100644 kernel/fs/btrfs/tree-log.c create mode 100644 kernel/fs/btrfs/tree-log.h create mode 100644 kernel/fs/btrfs/ulist.c create mode 100644 kernel/fs/btrfs/ulist.h create mode 100644 kernel/fs/btrfs/uuid-tree.c create mode 100644 kernel/fs/btrfs/volumes.c create mode 100644 kernel/fs/btrfs/volumes.h create mode 100644 kernel/fs/btrfs/xattr.c create mode 100644 kernel/fs/btrfs/xattr.h create mode 100644 kernel/fs/btrfs/zlib.c create mode 100644 kernel/fs/buffer.c create mode 100644 kernel/fs/cachefiles/Kconfig create mode 100644 kernel/fs/cachefiles/Makefile create mode 100644 kernel/fs/cachefiles/bind.c create mode 100644 kernel/fs/cachefiles/daemon.c create mode 100644 kernel/fs/cachefiles/interface.c create mode 100644 kernel/fs/cachefiles/internal.h create mode 100644 kernel/fs/cachefiles/key.c create mode 100644 kernel/fs/cachefiles/main.c create mode 100644 kernel/fs/cachefiles/namei.c create mode 100644 kernel/fs/cachefiles/proc.c create mode 100644 kernel/fs/cachefiles/rdwr.c create mode 100644 kernel/fs/cachefiles/security.c create mode 100644 kernel/fs/cachefiles/xattr.c create mode 100644 kernel/fs/ceph/Kconfig create mode 100644 kernel/fs/ceph/Makefile create mode 100644 kernel/fs/ceph/acl.c create mode 100644 kernel/fs/ceph/addr.c create mode 100644 kernel/fs/ceph/cache.c create mode 100644 kernel/fs/ceph/cache.h create mode 100644 kernel/fs/ceph/caps.c create mode 100644 kernel/fs/ceph/ceph_frag.c create mode 100644 kernel/fs/ceph/debugfs.c create mode 100644 kernel/fs/ceph/dir.c create mode 100644 kernel/fs/ceph/export.c create mode 100644 kernel/fs/ceph/file.c create mode 100644 kernel/fs/ceph/inode.c create mode 100644 kernel/fs/ceph/ioctl.c create mode 100644 kernel/fs/ceph/ioctl.h create mode 100644 kernel/fs/ceph/locks.c create mode 100644 kernel/fs/ceph/mds_client.c create mode 100644 kernel/fs/ceph/mds_client.h create mode 100644 kernel/fs/ceph/mdsmap.c create mode 100644 kernel/fs/ceph/snap.c create mode 100644 kernel/fs/ceph/strings.c create mode 100644 kernel/fs/ceph/super.c create mode 100644 kernel/fs/ceph/super.h create mode 100644 kernel/fs/ceph/xattr.c create mode 100644 kernel/fs/char_dev.c create mode 100644 kernel/fs/cifs/Kconfig create mode 100644 kernel/fs/cifs/Makefile create mode 100644 kernel/fs/cifs/asn1.c create mode 100644 kernel/fs/cifs/cache.c create mode 100644 kernel/fs/cifs/cifs_debug.c create mode 100644 kernel/fs/cifs/cifs_debug.h create mode 100644 kernel/fs/cifs/cifs_dfs_ref.c create mode 100644 kernel/fs/cifs/cifs_fs_sb.h create mode 100644 kernel/fs/cifs/cifs_spnego.c create mode 100644 kernel/fs/cifs/cifs_spnego.h create mode 100644 kernel/fs/cifs/cifs_unicode.c create mode 100644 kernel/fs/cifs/cifs_unicode.h create mode 100644 kernel/fs/cifs/cifs_uniupr.h create mode 100644 kernel/fs/cifs/cifsacl.c create mode 100644 kernel/fs/cifs/cifsacl.h create mode 100644 kernel/fs/cifs/cifsencrypt.c create mode 100644 kernel/fs/cifs/cifsfs.c create mode 100644 kernel/fs/cifs/cifsfs.h create mode 100644 kernel/fs/cifs/cifsglob.h create mode 100644 kernel/fs/cifs/cifspdu.h create mode 100644 kernel/fs/cifs/cifsproto.h create mode 100644 kernel/fs/cifs/cifssmb.c create mode 100644 kernel/fs/cifs/connect.c create mode 100644 kernel/fs/cifs/dir.c create mode 100644 kernel/fs/cifs/dns_resolve.c create mode 100644 kernel/fs/cifs/dns_resolve.h create mode 100644 kernel/fs/cifs/export.c create mode 100644 kernel/fs/cifs/file.c create mode 100644 kernel/fs/cifs/fscache.c create mode 100644 kernel/fs/cifs/fscache.h create mode 100644 kernel/fs/cifs/inode.c create mode 100644 kernel/fs/cifs/ioctl.c create mode 100644 kernel/fs/cifs/link.c create mode 100644 kernel/fs/cifs/misc.c create mode 100644 kernel/fs/cifs/netmisc.c create mode 100644 kernel/fs/cifs/nterr.c create mode 100644 kernel/fs/cifs/nterr.h create mode 100644 kernel/fs/cifs/ntlmssp.h create mode 100644 kernel/fs/cifs/readdir.c create mode 100644 kernel/fs/cifs/rfc1002pdu.h create mode 100644 kernel/fs/cifs/sess.c create mode 100644 kernel/fs/cifs/smb1ops.c create mode 100644 kernel/fs/cifs/smb2file.c create mode 100644 kernel/fs/cifs/smb2glob.h create mode 100644 kernel/fs/cifs/smb2inode.c create mode 100644 kernel/fs/cifs/smb2maperror.c create mode 100644 kernel/fs/cifs/smb2misc.c create mode 100644 kernel/fs/cifs/smb2ops.c create mode 100644 kernel/fs/cifs/smb2pdu.c create mode 100644 kernel/fs/cifs/smb2pdu.h create mode 100644 kernel/fs/cifs/smb2proto.h create mode 100644 kernel/fs/cifs/smb2status.h create mode 100644 kernel/fs/cifs/smb2transport.c create mode 100644 kernel/fs/cifs/smbencrypt.c create mode 100644 kernel/fs/cifs/smberr.h create mode 100644 kernel/fs/cifs/smbfsctl.h create mode 100644 kernel/fs/cifs/transport.c create mode 100644 kernel/fs/cifs/winucase.c create mode 100644 kernel/fs/cifs/xattr.c create mode 100644 kernel/fs/coda/Kconfig create mode 100644 kernel/fs/coda/Makefile create mode 100644 kernel/fs/coda/cache.c create mode 100644 kernel/fs/coda/cnode.c create mode 100644 kernel/fs/coda/coda_cache.h create mode 100644 kernel/fs/coda/coda_fs_i.h create mode 100644 kernel/fs/coda/coda_int.h create mode 100644 kernel/fs/coda/coda_linux.c create mode 100644 kernel/fs/coda/coda_linux.h create mode 100644 kernel/fs/coda/dir.c create mode 100644 kernel/fs/coda/file.c create mode 100644 kernel/fs/coda/inode.c create mode 100644 kernel/fs/coda/pioctl.c create mode 100644 kernel/fs/coda/psdev.c create mode 100644 kernel/fs/coda/symlink.c create mode 100644 kernel/fs/coda/sysctl.c create mode 100644 kernel/fs/coda/upcall.c create mode 100644 kernel/fs/compat.c create mode 100644 kernel/fs/compat_binfmt_elf.c create mode 100644 kernel/fs/compat_ioctl.c create mode 100644 kernel/fs/configfs/Kconfig create mode 100644 kernel/fs/configfs/Makefile create mode 100644 kernel/fs/configfs/configfs_internal.h create mode 100644 kernel/fs/configfs/dir.c create mode 100644 kernel/fs/configfs/file.c create mode 100644 kernel/fs/configfs/inode.c create mode 100644 kernel/fs/configfs/item.c create mode 100644 kernel/fs/configfs/mount.c create mode 100644 kernel/fs/configfs/symlink.c create mode 100644 kernel/fs/coredump.c create mode 100644 kernel/fs/cramfs/Kconfig create mode 100644 kernel/fs/cramfs/Makefile create mode 100644 kernel/fs/cramfs/README create mode 100644 kernel/fs/cramfs/inode.c create mode 100644 kernel/fs/cramfs/internal.h create mode 100644 kernel/fs/cramfs/uncompress.c create mode 100644 kernel/fs/dax.c create mode 100644 kernel/fs/dcache.c create mode 100644 kernel/fs/dcookies.c create mode 100644 kernel/fs/debugfs/Makefile create mode 100644 kernel/fs/debugfs/file.c create mode 100644 kernel/fs/debugfs/inode.c create mode 100644 kernel/fs/devpts/Makefile create mode 100644 kernel/fs/devpts/inode.c create mode 100644 kernel/fs/direct-io.c create mode 100644 kernel/fs/dlm/Kconfig create mode 100644 kernel/fs/dlm/Makefile create mode 100644 kernel/fs/dlm/ast.c create mode 100644 kernel/fs/dlm/ast.h create mode 100644 kernel/fs/dlm/config.c create mode 100644 kernel/fs/dlm/config.h create mode 100644 kernel/fs/dlm/debug_fs.c create mode 100644 kernel/fs/dlm/dir.c create mode 100644 kernel/fs/dlm/dir.h create mode 100644 kernel/fs/dlm/dlm_internal.h create mode 100644 kernel/fs/dlm/lock.c create mode 100644 kernel/fs/dlm/lock.h create mode 100644 kernel/fs/dlm/lockspace.c create mode 100644 kernel/fs/dlm/lockspace.h create mode 100644 kernel/fs/dlm/lowcomms.c create mode 100644 kernel/fs/dlm/lowcomms.h create mode 100644 kernel/fs/dlm/lvb_table.h create mode 100644 kernel/fs/dlm/main.c create mode 100644 kernel/fs/dlm/member.c create mode 100644 kernel/fs/dlm/member.h create mode 100644 kernel/fs/dlm/memory.c create mode 100644 kernel/fs/dlm/memory.h create mode 100644 kernel/fs/dlm/midcomms.c create mode 100644 kernel/fs/dlm/midcomms.h create mode 100644 kernel/fs/dlm/netlink.c create mode 100644 kernel/fs/dlm/plock.c create mode 100644 kernel/fs/dlm/rcom.c create mode 100644 kernel/fs/dlm/rcom.h create mode 100644 kernel/fs/dlm/recover.c create mode 100644 kernel/fs/dlm/recover.h create mode 100644 kernel/fs/dlm/recoverd.c create mode 100644 kernel/fs/dlm/recoverd.h create mode 100644 kernel/fs/dlm/requestqueue.c create mode 100644 kernel/fs/dlm/requestqueue.h create mode 100644 kernel/fs/dlm/user.c create mode 100644 kernel/fs/dlm/user.h create mode 100644 kernel/fs/dlm/util.c create mode 100644 kernel/fs/dlm/util.h create mode 100644 kernel/fs/drop_caches.c create mode 100644 kernel/fs/ecryptfs/Kconfig create mode 100644 kernel/fs/ecryptfs/Makefile create mode 100644 kernel/fs/ecryptfs/crypto.c create mode 100644 kernel/fs/ecryptfs/debug.c create mode 100644 kernel/fs/ecryptfs/dentry.c create mode 100644 kernel/fs/ecryptfs/ecryptfs_kernel.h create mode 100644 kernel/fs/ecryptfs/file.c create mode 100644 kernel/fs/ecryptfs/inode.c create mode 100644 kernel/fs/ecryptfs/keystore.c create mode 100644 kernel/fs/ecryptfs/kthread.c create mode 100644 kernel/fs/ecryptfs/main.c create mode 100644 kernel/fs/ecryptfs/messaging.c create mode 100644 kernel/fs/ecryptfs/miscdev.c create mode 100644 kernel/fs/ecryptfs/mmap.c create mode 100644 kernel/fs/ecryptfs/read_write.c create mode 100644 kernel/fs/ecryptfs/super.c create mode 100644 kernel/fs/efivarfs/Kconfig create mode 100644 kernel/fs/efivarfs/Makefile create mode 100644 kernel/fs/efivarfs/file.c create mode 100644 kernel/fs/efivarfs/inode.c create mode 100644 kernel/fs/efivarfs/internal.h create mode 100644 kernel/fs/efivarfs/super.c create mode 100644 kernel/fs/efs/Kconfig create mode 100644 kernel/fs/efs/Makefile create mode 100644 kernel/fs/efs/dir.c create mode 100644 kernel/fs/efs/efs.h create mode 100644 kernel/fs/efs/file.c create mode 100644 kernel/fs/efs/inode.c create mode 100644 kernel/fs/efs/namei.c create mode 100644 kernel/fs/efs/super.c create mode 100644 kernel/fs/efs/symlink.c create mode 100644 kernel/fs/eventfd.c create mode 100644 kernel/fs/eventpoll.c create mode 100644 kernel/fs/exec.c create mode 100644 kernel/fs/exofs/BUGS create mode 100644 kernel/fs/exofs/Kbuild create mode 100644 kernel/fs/exofs/Kconfig create mode 100644 kernel/fs/exofs/Kconfig.ore create mode 100644 kernel/fs/exofs/common.h create mode 100644 kernel/fs/exofs/dir.c create mode 100644 kernel/fs/exofs/exofs.h create mode 100644 kernel/fs/exofs/file.c create mode 100644 kernel/fs/exofs/inode.c create mode 100644 kernel/fs/exofs/namei.c create mode 100644 kernel/fs/exofs/ore.c create mode 100644 kernel/fs/exofs/ore_raid.c create mode 100644 kernel/fs/exofs/ore_raid.h create mode 100644 kernel/fs/exofs/super.c create mode 100644 kernel/fs/exofs/symlink.c create mode 100644 kernel/fs/exofs/sys.c create mode 100644 kernel/fs/exportfs/Makefile create mode 100644 kernel/fs/exportfs/expfs.c create mode 100644 kernel/fs/ext2/Kconfig create mode 100644 kernel/fs/ext2/Makefile create mode 100644 kernel/fs/ext2/acl.c create mode 100644 kernel/fs/ext2/acl.h create mode 100644 kernel/fs/ext2/balloc.c create mode 100644 kernel/fs/ext2/dir.c create mode 100644 kernel/fs/ext2/ext2.h create mode 100644 kernel/fs/ext2/file.c create mode 100644 kernel/fs/ext2/ialloc.c create mode 100644 kernel/fs/ext2/inode.c create mode 100644 kernel/fs/ext2/ioctl.c create mode 100644 kernel/fs/ext2/namei.c create mode 100644 kernel/fs/ext2/super.c create mode 100644 kernel/fs/ext2/symlink.c create mode 100644 kernel/fs/ext2/xattr.c create mode 100644 kernel/fs/ext2/xattr.h create mode 100644 kernel/fs/ext2/xattr_security.c create mode 100644 kernel/fs/ext2/xattr_trusted.c create mode 100644 kernel/fs/ext2/xattr_user.c create mode 100644 kernel/fs/ext3/Kconfig create mode 100644 kernel/fs/ext3/Makefile create mode 100644 kernel/fs/ext3/acl.c create mode 100644 kernel/fs/ext3/acl.h create mode 100644 kernel/fs/ext3/balloc.c create mode 100644 kernel/fs/ext3/bitmap.c create mode 100644 kernel/fs/ext3/dir.c create mode 100644 kernel/fs/ext3/ext3.h create mode 100644 kernel/fs/ext3/ext3_jbd.c create mode 100644 kernel/fs/ext3/file.c create mode 100644 kernel/fs/ext3/fsync.c create mode 100644 kernel/fs/ext3/hash.c create mode 100644 kernel/fs/ext3/ialloc.c create mode 100644 kernel/fs/ext3/inode.c create mode 100644 kernel/fs/ext3/ioctl.c create mode 100644 kernel/fs/ext3/namei.c create mode 100644 kernel/fs/ext3/namei.h create mode 100644 kernel/fs/ext3/resize.c create mode 100644 kernel/fs/ext3/super.c create mode 100644 kernel/fs/ext3/symlink.c create mode 100644 kernel/fs/ext3/xattr.c create mode 100644 kernel/fs/ext3/xattr.h create mode 100644 kernel/fs/ext3/xattr_security.c create mode 100644 kernel/fs/ext3/xattr_trusted.c create mode 100644 kernel/fs/ext3/xattr_user.c create mode 100644 kernel/fs/ext4/Kconfig create mode 100644 kernel/fs/ext4/Makefile create mode 100644 kernel/fs/ext4/acl.c create mode 100644 kernel/fs/ext4/acl.h create mode 100644 kernel/fs/ext4/balloc.c create mode 100644 kernel/fs/ext4/bitmap.c create mode 100644 kernel/fs/ext4/block_validity.c create mode 100644 kernel/fs/ext4/crypto.c create mode 100644 kernel/fs/ext4/crypto_fname.c create mode 100644 kernel/fs/ext4/crypto_key.c create mode 100644 kernel/fs/ext4/crypto_policy.c create mode 100644 kernel/fs/ext4/dir.c create mode 100644 kernel/fs/ext4/ext4.h create mode 100644 kernel/fs/ext4/ext4_crypto.h create mode 100644 kernel/fs/ext4/ext4_extents.h create mode 100644 kernel/fs/ext4/ext4_jbd2.c create mode 100644 kernel/fs/ext4/ext4_jbd2.h create mode 100644 kernel/fs/ext4/extents.c create mode 100644 kernel/fs/ext4/extents_status.c create mode 100644 kernel/fs/ext4/extents_status.h create mode 100644 kernel/fs/ext4/file.c create mode 100644 kernel/fs/ext4/fsync.c create mode 100644 kernel/fs/ext4/hash.c create mode 100644 kernel/fs/ext4/ialloc.c create mode 100644 kernel/fs/ext4/indirect.c create mode 100644 kernel/fs/ext4/inline.c create mode 100644 kernel/fs/ext4/inode.c create mode 100644 kernel/fs/ext4/ioctl.c create mode 100644 kernel/fs/ext4/mballoc.c create mode 100644 kernel/fs/ext4/mballoc.h create mode 100644 kernel/fs/ext4/migrate.c create mode 100644 kernel/fs/ext4/mmp.c create mode 100644 kernel/fs/ext4/move_extent.c create mode 100644 kernel/fs/ext4/namei.c create mode 100644 kernel/fs/ext4/page-io.c create mode 100644 kernel/fs/ext4/readpage.c create mode 100644 kernel/fs/ext4/resize.c create mode 100644 kernel/fs/ext4/super.c create mode 100644 kernel/fs/ext4/symlink.c create mode 100644 kernel/fs/ext4/truncate.h create mode 100644 kernel/fs/ext4/xattr.c create mode 100644 kernel/fs/ext4/xattr.h create mode 100644 kernel/fs/ext4/xattr_security.c create mode 100644 kernel/fs/ext4/xattr_trusted.c create mode 100644 kernel/fs/ext4/xattr_user.c create mode 100644 kernel/fs/f2fs/Kconfig create mode 100644 kernel/fs/f2fs/Makefile create mode 100644 kernel/fs/f2fs/acl.c create mode 100644 kernel/fs/f2fs/acl.h create mode 100644 kernel/fs/f2fs/checkpoint.c create mode 100644 kernel/fs/f2fs/data.c create mode 100644 kernel/fs/f2fs/debug.c create mode 100644 kernel/fs/f2fs/dir.c create mode 100644 kernel/fs/f2fs/f2fs.h create mode 100644 kernel/fs/f2fs/file.c create mode 100644 kernel/fs/f2fs/gc.c create mode 100644 kernel/fs/f2fs/gc.h create mode 100644 kernel/fs/f2fs/hash.c create mode 100644 kernel/fs/f2fs/inline.c create mode 100644 kernel/fs/f2fs/inode.c create mode 100644 kernel/fs/f2fs/namei.c create mode 100644 kernel/fs/f2fs/node.c create mode 100644 kernel/fs/f2fs/node.h create mode 100644 kernel/fs/f2fs/recovery.c create mode 100644 kernel/fs/f2fs/segment.c create mode 100644 kernel/fs/f2fs/segment.h create mode 100644 kernel/fs/f2fs/super.c create mode 100644 kernel/fs/f2fs/trace.c create mode 100644 kernel/fs/f2fs/trace.h create mode 100644 kernel/fs/f2fs/xattr.c create mode 100644 kernel/fs/f2fs/xattr.h create mode 100644 kernel/fs/fat/Kconfig create mode 100644 kernel/fs/fat/Makefile create mode 100644 kernel/fs/fat/cache.c create mode 100644 kernel/fs/fat/dir.c create mode 100644 kernel/fs/fat/fat.h create mode 100644 kernel/fs/fat/fatent.c create mode 100644 kernel/fs/fat/file.c create mode 100644 kernel/fs/fat/inode.c create mode 100644 kernel/fs/fat/misc.c create mode 100644 kernel/fs/fat/namei_msdos.c create mode 100644 kernel/fs/fat/namei_vfat.c create mode 100644 kernel/fs/fat/nfs.c create mode 100644 kernel/fs/fcntl.c create mode 100644 kernel/fs/fhandle.c create mode 100644 kernel/fs/file.c create mode 100644 kernel/fs/file_table.c create mode 100644 kernel/fs/filesystems.c create mode 100644 kernel/fs/freevxfs/Kconfig create mode 100644 kernel/fs/freevxfs/Makefile create mode 100644 kernel/fs/freevxfs/vxfs.h create mode 100644 kernel/fs/freevxfs/vxfs_bmap.c create mode 100644 kernel/fs/freevxfs/vxfs_dir.h create mode 100644 kernel/fs/freevxfs/vxfs_extern.h create mode 100644 kernel/fs/freevxfs/vxfs_fshead.c create mode 100644 kernel/fs/freevxfs/vxfs_fshead.h create mode 100644 kernel/fs/freevxfs/vxfs_immed.c create mode 100644 kernel/fs/freevxfs/vxfs_inode.c create mode 100644 kernel/fs/freevxfs/vxfs_inode.h create mode 100644 kernel/fs/freevxfs/vxfs_lookup.c create mode 100644 kernel/fs/freevxfs/vxfs_olt.c create mode 100644 kernel/fs/freevxfs/vxfs_olt.h create mode 100644 kernel/fs/freevxfs/vxfs_subr.c create mode 100644 kernel/fs/freevxfs/vxfs_super.c create mode 100644 kernel/fs/fs-writeback.c create mode 100644 kernel/fs/fs_pin.c create mode 100644 kernel/fs/fs_struct.c create mode 100644 kernel/fs/fscache/Kconfig create mode 100644 kernel/fs/fscache/Makefile create mode 100644 kernel/fs/fscache/cache.c create mode 100644 kernel/fs/fscache/cookie.c create mode 100644 kernel/fs/fscache/fsdef.c create mode 100644 kernel/fs/fscache/histogram.c create mode 100644 kernel/fs/fscache/internal.h create mode 100644 kernel/fs/fscache/main.c create mode 100644 kernel/fs/fscache/netfs.c create mode 100644 kernel/fs/fscache/object-list.c create mode 100644 kernel/fs/fscache/object.c create mode 100644 kernel/fs/fscache/operation.c create mode 100644 kernel/fs/fscache/page.c create mode 100644 kernel/fs/fscache/proc.c create mode 100644 kernel/fs/fscache/stats.c create mode 100644 kernel/fs/fuse/Kconfig create mode 100644 kernel/fs/fuse/Makefile create mode 100644 kernel/fs/fuse/control.c create mode 100644 kernel/fs/fuse/cuse.c create mode 100644 kernel/fs/fuse/dev.c create mode 100644 kernel/fs/fuse/dir.c create mode 100644 kernel/fs/fuse/file.c create mode 100644 kernel/fs/fuse/fuse_i.h create mode 100644 kernel/fs/fuse/inode.c create mode 100644 kernel/fs/gfs2/Kconfig create mode 100644 kernel/fs/gfs2/Makefile create mode 100644 kernel/fs/gfs2/acl.c create mode 100644 kernel/fs/gfs2/acl.h create mode 100644 kernel/fs/gfs2/aops.c create mode 100644 kernel/fs/gfs2/bmap.c create mode 100644 kernel/fs/gfs2/bmap.h create mode 100644 kernel/fs/gfs2/dentry.c create mode 100644 kernel/fs/gfs2/dir.c create mode 100644 kernel/fs/gfs2/dir.h create mode 100644 kernel/fs/gfs2/export.c create mode 100644 kernel/fs/gfs2/file.c create mode 100644 kernel/fs/gfs2/gfs2.h create mode 100644 kernel/fs/gfs2/glock.c create mode 100644 kernel/fs/gfs2/glock.h create mode 100644 kernel/fs/gfs2/glops.c create mode 100644 kernel/fs/gfs2/glops.h create mode 100644 kernel/fs/gfs2/incore.h create mode 100644 kernel/fs/gfs2/inode.c create mode 100644 kernel/fs/gfs2/inode.h create mode 100644 kernel/fs/gfs2/lock_dlm.c create mode 100644 kernel/fs/gfs2/log.c create mode 100644 kernel/fs/gfs2/log.h create mode 100644 kernel/fs/gfs2/lops.c create mode 100644 kernel/fs/gfs2/lops.h create mode 100644 kernel/fs/gfs2/main.c create mode 100644 kernel/fs/gfs2/meta_io.c create mode 100644 kernel/fs/gfs2/meta_io.h create mode 100644 kernel/fs/gfs2/ops_fstype.c create mode 100644 kernel/fs/gfs2/quota.c create mode 100644 kernel/fs/gfs2/quota.h create mode 100644 kernel/fs/gfs2/recovery.c create mode 100644 kernel/fs/gfs2/recovery.h create mode 100644 kernel/fs/gfs2/rgrp.c create mode 100644 kernel/fs/gfs2/rgrp.h create mode 100644 kernel/fs/gfs2/super.c create mode 100644 kernel/fs/gfs2/super.h create mode 100644 kernel/fs/gfs2/sys.c create mode 100644 kernel/fs/gfs2/sys.h create mode 100644 kernel/fs/gfs2/trace_gfs2.h create mode 100644 kernel/fs/gfs2/trans.c create mode 100644 kernel/fs/gfs2/trans.h create mode 100644 kernel/fs/gfs2/util.c create mode 100644 kernel/fs/gfs2/util.h create mode 100644 kernel/fs/gfs2/xattr.c create mode 100644 kernel/fs/gfs2/xattr.h create mode 100644 kernel/fs/hfs/Kconfig create mode 100644 kernel/fs/hfs/Makefile create mode 100644 kernel/fs/hfs/attr.c create mode 100644 kernel/fs/hfs/bfind.c create mode 100644 kernel/fs/hfs/bitmap.c create mode 100644 kernel/fs/hfs/bnode.c create mode 100644 kernel/fs/hfs/brec.c create mode 100644 kernel/fs/hfs/btree.c create mode 100644 kernel/fs/hfs/btree.h create mode 100644 kernel/fs/hfs/catalog.c create mode 100644 kernel/fs/hfs/dir.c create mode 100644 kernel/fs/hfs/extent.c create mode 100644 kernel/fs/hfs/hfs.h create mode 100644 kernel/fs/hfs/hfs_fs.h create mode 100644 kernel/fs/hfs/inode.c create mode 100644 kernel/fs/hfs/mdb.c create mode 100644 kernel/fs/hfs/part_tbl.c create mode 100644 kernel/fs/hfs/string.c create mode 100644 kernel/fs/hfs/super.c create mode 100644 kernel/fs/hfs/sysdep.c create mode 100644 kernel/fs/hfs/trans.c create mode 100644 kernel/fs/hfsplus/Kconfig create mode 100644 kernel/fs/hfsplus/Makefile create mode 100644 kernel/fs/hfsplus/acl.h create mode 100644 kernel/fs/hfsplus/attributes.c create mode 100644 kernel/fs/hfsplus/bfind.c create mode 100644 kernel/fs/hfsplus/bitmap.c create mode 100644 kernel/fs/hfsplus/bnode.c create mode 100644 kernel/fs/hfsplus/brec.c create mode 100644 kernel/fs/hfsplus/btree.c create mode 100644 kernel/fs/hfsplus/catalog.c create mode 100644 kernel/fs/hfsplus/dir.c create mode 100644 kernel/fs/hfsplus/extents.c create mode 100644 kernel/fs/hfsplus/hfsplus_fs.h create mode 100644 kernel/fs/hfsplus/hfsplus_raw.h create mode 100644 kernel/fs/hfsplus/inode.c create mode 100644 kernel/fs/hfsplus/ioctl.c create mode 100644 kernel/fs/hfsplus/options.c create mode 100644 kernel/fs/hfsplus/part_tbl.c create mode 100644 kernel/fs/hfsplus/posix_acl.c create mode 100644 kernel/fs/hfsplus/super.c create mode 100644 kernel/fs/hfsplus/tables.c create mode 100644 kernel/fs/hfsplus/unicode.c create mode 100644 kernel/fs/hfsplus/wrapper.c create mode 100644 kernel/fs/hfsplus/xattr.c create mode 100644 kernel/fs/hfsplus/xattr.h create mode 100644 kernel/fs/hfsplus/xattr_security.c create mode 100644 kernel/fs/hfsplus/xattr_trusted.c create mode 100644 kernel/fs/hfsplus/xattr_user.c create mode 100644 kernel/fs/hostfs/Makefile create mode 100644 kernel/fs/hostfs/hostfs.h create mode 100644 kernel/fs/hostfs/hostfs_kern.c create mode 100644 kernel/fs/hostfs/hostfs_user.c create mode 100644 kernel/fs/hpfs/Kconfig create mode 100644 kernel/fs/hpfs/Makefile create mode 100644 kernel/fs/hpfs/alloc.c create mode 100644 kernel/fs/hpfs/anode.c create mode 100644 kernel/fs/hpfs/buffer.c create mode 100644 kernel/fs/hpfs/dentry.c create mode 100644 kernel/fs/hpfs/dir.c create mode 100644 kernel/fs/hpfs/dnode.c create mode 100644 kernel/fs/hpfs/ea.c create mode 100644 kernel/fs/hpfs/file.c create mode 100644 kernel/fs/hpfs/hpfs.h create mode 100644 kernel/fs/hpfs/hpfs_fn.h create mode 100644 kernel/fs/hpfs/inode.c create mode 100644 kernel/fs/hpfs/map.c create mode 100644 kernel/fs/hpfs/name.c create mode 100644 kernel/fs/hpfs/namei.c create mode 100644 kernel/fs/hpfs/super.c create mode 100644 kernel/fs/hppfs/Makefile create mode 100644 kernel/fs/hppfs/hppfs.c create mode 100644 kernel/fs/hugetlbfs/Makefile create mode 100644 kernel/fs/hugetlbfs/inode.c create mode 100644 kernel/fs/inode.c create mode 100644 kernel/fs/internal.h create mode 100644 kernel/fs/ioctl.c create mode 100644 kernel/fs/isofs/Kconfig create mode 100644 kernel/fs/isofs/Makefile create mode 100644 kernel/fs/isofs/compress.c create mode 100644 kernel/fs/isofs/dir.c create mode 100644 kernel/fs/isofs/export.c create mode 100644 kernel/fs/isofs/inode.c create mode 100644 kernel/fs/isofs/isofs.h create mode 100644 kernel/fs/isofs/joliet.c create mode 100644 kernel/fs/isofs/namei.c create mode 100644 kernel/fs/isofs/rock.c create mode 100644 kernel/fs/isofs/rock.h create mode 100644 kernel/fs/isofs/util.c create mode 100644 kernel/fs/isofs/zisofs.h create mode 100644 kernel/fs/jbd/Kconfig create mode 100644 kernel/fs/jbd/Makefile create mode 100644 kernel/fs/jbd/checkpoint.c create mode 100644 kernel/fs/jbd/commit.c create mode 100644 kernel/fs/jbd/journal.c create mode 100644 kernel/fs/jbd/recovery.c create mode 100644 kernel/fs/jbd/revoke.c create mode 100644 kernel/fs/jbd/transaction.c create mode 100644 kernel/fs/jbd2/Kconfig create mode 100644 kernel/fs/jbd2/Makefile create mode 100644 kernel/fs/jbd2/checkpoint.c create mode 100644 kernel/fs/jbd2/commit.c create mode 100644 kernel/fs/jbd2/journal.c create mode 100644 kernel/fs/jbd2/recovery.c create mode 100644 kernel/fs/jbd2/revoke.c create mode 100644 kernel/fs/jbd2/transaction.c create mode 100644 kernel/fs/jffs2/Kconfig create mode 100644 kernel/fs/jffs2/LICENCE create mode 100644 kernel/fs/jffs2/Makefile create mode 100644 kernel/fs/jffs2/README.Locking create mode 100644 kernel/fs/jffs2/TODO create mode 100644 kernel/fs/jffs2/acl.c create mode 100644 kernel/fs/jffs2/acl.h create mode 100644 kernel/fs/jffs2/background.c create mode 100644 kernel/fs/jffs2/build.c create mode 100644 kernel/fs/jffs2/compr.c create mode 100644 kernel/fs/jffs2/compr.h create mode 100644 kernel/fs/jffs2/compr_lzo.c create mode 100644 kernel/fs/jffs2/compr_rtime.c create mode 100644 kernel/fs/jffs2/compr_rubin.c create mode 100644 kernel/fs/jffs2/compr_zlib.c create mode 100644 kernel/fs/jffs2/debug.c create mode 100644 kernel/fs/jffs2/debug.h create mode 100644 kernel/fs/jffs2/dir.c create mode 100644 kernel/fs/jffs2/erase.c create mode 100644 kernel/fs/jffs2/file.c create mode 100644 kernel/fs/jffs2/fs.c create mode 100644 kernel/fs/jffs2/gc.c create mode 100644 kernel/fs/jffs2/ioctl.c create mode 100644 kernel/fs/jffs2/jffs2_fs_i.h create mode 100644 kernel/fs/jffs2/jffs2_fs_sb.h create mode 100644 kernel/fs/jffs2/malloc.c create mode 100644 kernel/fs/jffs2/nodelist.c create mode 100644 kernel/fs/jffs2/nodelist.h create mode 100644 kernel/fs/jffs2/nodemgmt.c create mode 100644 kernel/fs/jffs2/os-linux.h create mode 100644 kernel/fs/jffs2/read.c create mode 100644 kernel/fs/jffs2/readinode.c create mode 100644 kernel/fs/jffs2/scan.c create mode 100644 kernel/fs/jffs2/security.c create mode 100644 kernel/fs/jffs2/summary.c create mode 100644 kernel/fs/jffs2/summary.h create mode 100644 kernel/fs/jffs2/super.c create mode 100644 kernel/fs/jffs2/symlink.c create mode 100644 kernel/fs/jffs2/wbuf.c create mode 100644 kernel/fs/jffs2/write.c create mode 100644 kernel/fs/jffs2/writev.c create mode 100644 kernel/fs/jffs2/xattr.c create mode 100644 kernel/fs/jffs2/xattr.h create mode 100644 kernel/fs/jffs2/xattr_trusted.c create mode 100644 kernel/fs/jffs2/xattr_user.c create mode 100644 kernel/fs/jfs/Kconfig create mode 100644 kernel/fs/jfs/Makefile create mode 100644 kernel/fs/jfs/acl.c create mode 100644 kernel/fs/jfs/file.c create mode 100644 kernel/fs/jfs/inode.c create mode 100644 kernel/fs/jfs/ioctl.c create mode 100644 kernel/fs/jfs/jfs_acl.h create mode 100644 kernel/fs/jfs/jfs_btree.h create mode 100644 kernel/fs/jfs/jfs_debug.c create mode 100644 kernel/fs/jfs/jfs_debug.h create mode 100644 kernel/fs/jfs/jfs_dinode.h create mode 100644 kernel/fs/jfs/jfs_discard.c create mode 100644 kernel/fs/jfs/jfs_discard.h create mode 100644 kernel/fs/jfs/jfs_dmap.c create mode 100644 kernel/fs/jfs/jfs_dmap.h create mode 100644 kernel/fs/jfs/jfs_dtree.c create mode 100644 kernel/fs/jfs/jfs_dtree.h create mode 100644 kernel/fs/jfs/jfs_extent.c create mode 100644 kernel/fs/jfs/jfs_extent.h create mode 100644 kernel/fs/jfs/jfs_filsys.h create mode 100644 kernel/fs/jfs/jfs_imap.c create mode 100644 kernel/fs/jfs/jfs_imap.h create mode 100644 kernel/fs/jfs/jfs_incore.h create mode 100644 kernel/fs/jfs/jfs_inode.c create mode 100644 kernel/fs/jfs/jfs_inode.h create mode 100644 kernel/fs/jfs/jfs_lock.h create mode 100644 kernel/fs/jfs/jfs_logmgr.c create mode 100644 kernel/fs/jfs/jfs_logmgr.h create mode 100644 kernel/fs/jfs/jfs_metapage.c create mode 100644 kernel/fs/jfs/jfs_metapage.h create mode 100644 kernel/fs/jfs/jfs_mount.c create mode 100644 kernel/fs/jfs/jfs_superblock.h create mode 100644 kernel/fs/jfs/jfs_txnmgr.c create mode 100644 kernel/fs/jfs/jfs_txnmgr.h create mode 100644 kernel/fs/jfs/jfs_types.h create mode 100644 kernel/fs/jfs/jfs_umount.c create mode 100644 kernel/fs/jfs/jfs_unicode.c create mode 100644 kernel/fs/jfs/jfs_unicode.h create mode 100644 kernel/fs/jfs/jfs_uniupr.c create mode 100644 kernel/fs/jfs/jfs_xattr.h create mode 100644 kernel/fs/jfs/jfs_xtree.c create mode 100644 kernel/fs/jfs/jfs_xtree.h create mode 100644 kernel/fs/jfs/namei.c create mode 100644 kernel/fs/jfs/resize.c create mode 100644 kernel/fs/jfs/super.c create mode 100644 kernel/fs/jfs/symlink.c create mode 100644 kernel/fs/jfs/xattr.c create mode 100644 kernel/fs/kernfs/Kconfig create mode 100644 kernel/fs/kernfs/Makefile create mode 100644 kernel/fs/kernfs/dir.c create mode 100644 kernel/fs/kernfs/file.c create mode 100644 kernel/fs/kernfs/inode.c create mode 100644 kernel/fs/kernfs/kernfs-internal.h create mode 100644 kernel/fs/kernfs/mount.c create mode 100644 kernel/fs/kernfs/symlink.c create mode 100644 kernel/fs/libfs.c create mode 100644 kernel/fs/lockd/Makefile create mode 100644 kernel/fs/lockd/clnt4xdr.c create mode 100644 kernel/fs/lockd/clntlock.c create mode 100644 kernel/fs/lockd/clntproc.c create mode 100644 kernel/fs/lockd/clntxdr.c create mode 100644 kernel/fs/lockd/host.c create mode 100644 kernel/fs/lockd/mon.c create mode 100644 kernel/fs/lockd/netns.h create mode 100644 kernel/fs/lockd/procfs.c create mode 100644 kernel/fs/lockd/procfs.h create mode 100644 kernel/fs/lockd/svc.c create mode 100644 kernel/fs/lockd/svc4proc.c create mode 100644 kernel/fs/lockd/svclock.c create mode 100644 kernel/fs/lockd/svcproc.c create mode 100644 kernel/fs/lockd/svcshare.c create mode 100644 kernel/fs/lockd/svcsubs.c create mode 100644 kernel/fs/lockd/xdr.c create mode 100644 kernel/fs/lockd/xdr4.c create mode 100644 kernel/fs/locks.c create mode 100644 kernel/fs/logfs/Kconfig create mode 100644 kernel/fs/logfs/Makefile create mode 100644 kernel/fs/logfs/compr.c create mode 100644 kernel/fs/logfs/dev_bdev.c create mode 100644 kernel/fs/logfs/dev_mtd.c create mode 100644 kernel/fs/logfs/dir.c create mode 100644 kernel/fs/logfs/file.c create mode 100644 kernel/fs/logfs/gc.c create mode 100644 kernel/fs/logfs/inode.c create mode 100644 kernel/fs/logfs/journal.c create mode 100644 kernel/fs/logfs/logfs.h create mode 100644 kernel/fs/logfs/logfs_abi.h create mode 100644 kernel/fs/logfs/readwrite.c create mode 100644 kernel/fs/logfs/segment.c create mode 100644 kernel/fs/logfs/super.c create mode 100644 kernel/fs/mbcache.c create mode 100644 kernel/fs/minix/Kconfig create mode 100644 kernel/fs/minix/Makefile create mode 100644 kernel/fs/minix/bitmap.c create mode 100644 kernel/fs/minix/dir.c create mode 100644 kernel/fs/minix/file.c create mode 100644 kernel/fs/minix/inode.c create mode 100644 kernel/fs/minix/itree_common.c create mode 100644 kernel/fs/minix/itree_v1.c create mode 100644 kernel/fs/minix/itree_v2.c create mode 100644 kernel/fs/minix/minix.h create mode 100644 kernel/fs/minix/namei.c create mode 100644 kernel/fs/mount.h create mode 100644 kernel/fs/mpage.c create mode 100644 kernel/fs/namei.c create mode 100644 kernel/fs/namespace.c create mode 100644 kernel/fs/ncpfs/Kconfig create mode 100644 kernel/fs/ncpfs/Makefile create mode 100644 kernel/fs/ncpfs/dir.c create mode 100644 kernel/fs/ncpfs/file.c create mode 100644 kernel/fs/ncpfs/getopt.c create mode 100644 kernel/fs/ncpfs/getopt.h create mode 100644 kernel/fs/ncpfs/inode.c create mode 100644 kernel/fs/ncpfs/ioctl.c create mode 100644 kernel/fs/ncpfs/mmap.c create mode 100644 kernel/fs/ncpfs/ncp_fs.h create mode 100644 kernel/fs/ncpfs/ncp_fs_i.h create mode 100644 kernel/fs/ncpfs/ncp_fs_sb.h create mode 100644 kernel/fs/ncpfs/ncplib_kernel.c create mode 100644 kernel/fs/ncpfs/ncplib_kernel.h create mode 100644 kernel/fs/ncpfs/ncpsign_kernel.c create mode 100644 kernel/fs/ncpfs/ncpsign_kernel.h create mode 100644 kernel/fs/ncpfs/sock.c create mode 100644 kernel/fs/ncpfs/symlink.c create mode 100644 kernel/fs/nfs/Kconfig create mode 100644 kernel/fs/nfs/Makefile create mode 100644 kernel/fs/nfs/blocklayout/Makefile create mode 100644 kernel/fs/nfs/blocklayout/blocklayout.c create mode 100644 kernel/fs/nfs/blocklayout/blocklayout.h create mode 100644 kernel/fs/nfs/blocklayout/dev.c create mode 100644 kernel/fs/nfs/blocklayout/extent_tree.c create mode 100644 kernel/fs/nfs/blocklayout/rpc_pipefs.c create mode 100644 kernel/fs/nfs/cache_lib.c create mode 100644 kernel/fs/nfs/cache_lib.h create mode 100644 kernel/fs/nfs/callback.c create mode 100644 kernel/fs/nfs/callback.h create mode 100644 kernel/fs/nfs/callback_proc.c create mode 100644 kernel/fs/nfs/callback_xdr.c create mode 100644 kernel/fs/nfs/client.c create mode 100644 kernel/fs/nfs/delegation.c create mode 100644 kernel/fs/nfs/delegation.h create mode 100644 kernel/fs/nfs/dir.c create mode 100644 kernel/fs/nfs/direct.c create mode 100644 kernel/fs/nfs/dns_resolve.c create mode 100644 kernel/fs/nfs/dns_resolve.h create mode 100644 kernel/fs/nfs/file.c create mode 100644 kernel/fs/nfs/filelayout/Makefile create mode 100644 kernel/fs/nfs/filelayout/filelayout.c create mode 100644 kernel/fs/nfs/filelayout/filelayout.h create mode 100644 kernel/fs/nfs/filelayout/filelayoutdev.c create mode 100644 kernel/fs/nfs/flexfilelayout/Makefile create mode 100644 kernel/fs/nfs/flexfilelayout/flexfilelayout.c create mode 100644 kernel/fs/nfs/flexfilelayout/flexfilelayout.h create mode 100644 kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c create mode 100644 kernel/fs/nfs/fscache-index.c create mode 100644 kernel/fs/nfs/fscache.c create mode 100644 kernel/fs/nfs/fscache.h create mode 100644 kernel/fs/nfs/getroot.c create mode 100644 kernel/fs/nfs/inode.c create mode 100644 kernel/fs/nfs/internal.h create mode 100644 kernel/fs/nfs/iostat.h create mode 100644 kernel/fs/nfs/mount_clnt.c create mode 100644 kernel/fs/nfs/namespace.c create mode 100644 kernel/fs/nfs/netns.h create mode 100644 kernel/fs/nfs/nfs.h create mode 100644 kernel/fs/nfs/nfs2super.c create mode 100644 kernel/fs/nfs/nfs2xdr.c create mode 100644 kernel/fs/nfs/nfs3_fs.h create mode 100644 kernel/fs/nfs/nfs3acl.c create mode 100644 kernel/fs/nfs/nfs3client.c create mode 100644 kernel/fs/nfs/nfs3proc.c create mode 100644 kernel/fs/nfs/nfs3super.c create mode 100644 kernel/fs/nfs/nfs3xdr.c create mode 100644 kernel/fs/nfs/nfs42.h create mode 100644 kernel/fs/nfs/nfs42proc.c create mode 100644 kernel/fs/nfs/nfs42xdr.c create mode 100644 kernel/fs/nfs/nfs4_fs.h create mode 100644 kernel/fs/nfs/nfs4client.c create mode 100644 kernel/fs/nfs/nfs4file.c create mode 100644 kernel/fs/nfs/nfs4getroot.c create mode 100644 kernel/fs/nfs/nfs4idmap.c create mode 100644 kernel/fs/nfs/nfs4idmap.h create mode 100644 kernel/fs/nfs/nfs4namespace.c create mode 100644 kernel/fs/nfs/nfs4proc.c create mode 100644 kernel/fs/nfs/nfs4renewd.c create mode 100644 kernel/fs/nfs/nfs4session.c create mode 100644 kernel/fs/nfs/nfs4session.h create mode 100644 kernel/fs/nfs/nfs4state.c create mode 100644 kernel/fs/nfs/nfs4super.c create mode 100644 kernel/fs/nfs/nfs4sysctl.c create mode 100644 kernel/fs/nfs/nfs4trace.c create mode 100644 kernel/fs/nfs/nfs4trace.h create mode 100644 kernel/fs/nfs/nfs4xdr.c create mode 100644 kernel/fs/nfs/nfsroot.c create mode 100644 kernel/fs/nfs/nfstrace.c create mode 100644 kernel/fs/nfs/nfstrace.h create mode 100644 kernel/fs/nfs/objlayout/Kbuild create mode 100644 kernel/fs/nfs/objlayout/objio_osd.c create mode 100644 kernel/fs/nfs/objlayout/objlayout.c create mode 100644 kernel/fs/nfs/objlayout/objlayout.h create mode 100644 kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c create mode 100644 kernel/fs/nfs/pagelist.c create mode 100644 kernel/fs/nfs/pnfs.c create mode 100644 kernel/fs/nfs/pnfs.h create mode 100644 kernel/fs/nfs/pnfs_dev.c create mode 100644 kernel/fs/nfs/pnfs_nfs.c create mode 100644 kernel/fs/nfs/proc.c create mode 100644 kernel/fs/nfs/read.c create mode 100644 kernel/fs/nfs/super.c create mode 100644 kernel/fs/nfs/symlink.c create mode 100644 kernel/fs/nfs/sysctl.c create mode 100644 kernel/fs/nfs/unlink.c create mode 100644 kernel/fs/nfs/write.c create mode 100644 kernel/fs/nfs_common/Makefile create mode 100644 kernel/fs/nfs_common/grace.c create mode 100644 kernel/fs/nfs_common/nfsacl.c create mode 100644 kernel/fs/nfsd/Kconfig create mode 100644 kernel/fs/nfsd/Makefile create mode 100644 kernel/fs/nfsd/acl.h create mode 100644 kernel/fs/nfsd/auth.c create mode 100644 kernel/fs/nfsd/auth.h create mode 100644 kernel/fs/nfsd/blocklayout.c create mode 100644 kernel/fs/nfsd/blocklayoutxdr.c create mode 100644 kernel/fs/nfsd/blocklayoutxdr.h create mode 100644 kernel/fs/nfsd/cache.h create mode 100644 kernel/fs/nfsd/current_stateid.h create mode 100644 kernel/fs/nfsd/export.c create mode 100644 kernel/fs/nfsd/export.h create mode 100644 kernel/fs/nfsd/fault_inject.c create mode 100644 kernel/fs/nfsd/idmap.h create mode 100644 kernel/fs/nfsd/lockd.c create mode 100644 kernel/fs/nfsd/netns.h create mode 100644 kernel/fs/nfsd/nfs2acl.c create mode 100644 kernel/fs/nfsd/nfs3acl.c create mode 100644 kernel/fs/nfsd/nfs3proc.c create mode 100644 kernel/fs/nfsd/nfs3xdr.c create mode 100644 kernel/fs/nfsd/nfs4acl.c create mode 100644 kernel/fs/nfsd/nfs4callback.c create mode 100644 kernel/fs/nfsd/nfs4idmap.c create mode 100644 kernel/fs/nfsd/nfs4layouts.c create mode 100644 kernel/fs/nfsd/nfs4proc.c create mode 100644 kernel/fs/nfsd/nfs4recover.c create mode 100644 kernel/fs/nfsd/nfs4state.c create mode 100644 kernel/fs/nfsd/nfs4xdr.c create mode 100644 kernel/fs/nfsd/nfscache.c create mode 100644 kernel/fs/nfsd/nfsctl.c create mode 100644 kernel/fs/nfsd/nfsd.h create mode 100644 kernel/fs/nfsd/nfsfh.c create mode 100644 kernel/fs/nfsd/nfsfh.h create mode 100644 kernel/fs/nfsd/nfsproc.c create mode 100644 kernel/fs/nfsd/nfssvc.c create mode 100644 kernel/fs/nfsd/nfsxdr.c create mode 100644 kernel/fs/nfsd/pnfs.h create mode 100644 kernel/fs/nfsd/state.h create mode 100644 kernel/fs/nfsd/stats.c create mode 100644 kernel/fs/nfsd/stats.h create mode 100644 kernel/fs/nfsd/trace.c create mode 100644 kernel/fs/nfsd/trace.h create mode 100644 kernel/fs/nfsd/vfs.c create mode 100644 kernel/fs/nfsd/vfs.h create mode 100644 kernel/fs/nfsd/xdr.h create mode 100644 kernel/fs/nfsd/xdr3.h create mode 100644 kernel/fs/nfsd/xdr4.h create mode 100644 kernel/fs/nfsd/xdr4cb.h create mode 100644 kernel/fs/nilfs2/Kconfig create mode 100644 kernel/fs/nilfs2/Makefile create mode 100644 kernel/fs/nilfs2/alloc.c create mode 100644 kernel/fs/nilfs2/alloc.h create mode 100644 kernel/fs/nilfs2/bmap.c create mode 100644 kernel/fs/nilfs2/bmap.h create mode 100644 kernel/fs/nilfs2/btnode.c create mode 100644 kernel/fs/nilfs2/btnode.h create mode 100644 kernel/fs/nilfs2/btree.c create mode 100644 kernel/fs/nilfs2/btree.h create mode 100644 kernel/fs/nilfs2/cpfile.c create mode 100644 kernel/fs/nilfs2/cpfile.h create mode 100644 kernel/fs/nilfs2/dat.c create mode 100644 kernel/fs/nilfs2/dat.h create mode 100644 kernel/fs/nilfs2/dir.c create mode 100644 kernel/fs/nilfs2/direct.c create mode 100644 kernel/fs/nilfs2/direct.h create mode 100644 kernel/fs/nilfs2/export.h create mode 100644 kernel/fs/nilfs2/file.c create mode 100644 kernel/fs/nilfs2/gcinode.c create mode 100644 kernel/fs/nilfs2/ifile.c create mode 100644 kernel/fs/nilfs2/ifile.h create mode 100644 kernel/fs/nilfs2/inode.c create mode 100644 kernel/fs/nilfs2/ioctl.c create mode 100644 kernel/fs/nilfs2/mdt.c create mode 100644 kernel/fs/nilfs2/mdt.h create mode 100644 kernel/fs/nilfs2/namei.c create mode 100644 kernel/fs/nilfs2/nilfs.h create mode 100644 kernel/fs/nilfs2/page.c create mode 100644 kernel/fs/nilfs2/page.h create mode 100644 kernel/fs/nilfs2/recovery.c create mode 100644 kernel/fs/nilfs2/segbuf.c create mode 100644 kernel/fs/nilfs2/segbuf.h create mode 100644 kernel/fs/nilfs2/segment.c create mode 100644 kernel/fs/nilfs2/segment.h create mode 100644 kernel/fs/nilfs2/sufile.c create mode 100644 kernel/fs/nilfs2/sufile.h create mode 100644 kernel/fs/nilfs2/super.c create mode 100644 kernel/fs/nilfs2/sysfs.c create mode 100644 kernel/fs/nilfs2/sysfs.h create mode 100644 kernel/fs/nilfs2/the_nilfs.c create mode 100644 kernel/fs/nilfs2/the_nilfs.h create mode 100644 kernel/fs/nls/Kconfig create mode 100644 kernel/fs/nls/Makefile create mode 100644 kernel/fs/nls/mac-celtic.c create mode 100644 kernel/fs/nls/mac-centeuro.c create mode 100644 kernel/fs/nls/mac-croatian.c create mode 100644 kernel/fs/nls/mac-cyrillic.c create mode 100644 kernel/fs/nls/mac-gaelic.c create mode 100644 kernel/fs/nls/mac-greek.c create mode 100644 kernel/fs/nls/mac-iceland.c create mode 100644 kernel/fs/nls/mac-inuit.c create mode 100644 kernel/fs/nls/mac-roman.c create mode 100644 kernel/fs/nls/mac-romanian.c create mode 100644 kernel/fs/nls/mac-turkish.c create mode 100644 kernel/fs/nls/nls_ascii.c create mode 100644 kernel/fs/nls/nls_base.c create mode 100644 kernel/fs/nls/nls_cp1250.c create mode 100644 kernel/fs/nls/nls_cp1251.c create mode 100644 kernel/fs/nls/nls_cp1255.c create mode 100644 kernel/fs/nls/nls_cp437.c create mode 100644 kernel/fs/nls/nls_cp737.c create mode 100644 kernel/fs/nls/nls_cp775.c create mode 100644 kernel/fs/nls/nls_cp850.c create mode 100644 kernel/fs/nls/nls_cp852.c create mode 100644 kernel/fs/nls/nls_cp855.c create mode 100644 kernel/fs/nls/nls_cp857.c create mode 100644 kernel/fs/nls/nls_cp860.c create mode 100644 kernel/fs/nls/nls_cp861.c create mode 100644 kernel/fs/nls/nls_cp862.c create mode 100644 kernel/fs/nls/nls_cp863.c create mode 100644 kernel/fs/nls/nls_cp864.c create mode 100644 kernel/fs/nls/nls_cp865.c create mode 100644 kernel/fs/nls/nls_cp866.c create mode 100644 kernel/fs/nls/nls_cp869.c create mode 100644 kernel/fs/nls/nls_cp874.c create mode 100644 kernel/fs/nls/nls_cp932.c create mode 100644 kernel/fs/nls/nls_cp936.c create mode 100644 kernel/fs/nls/nls_cp949.c create mode 100644 kernel/fs/nls/nls_cp950.c create mode 100644 kernel/fs/nls/nls_euc-jp.c create mode 100644 kernel/fs/nls/nls_iso8859-1.c create mode 100644 kernel/fs/nls/nls_iso8859-13.c create mode 100644 kernel/fs/nls/nls_iso8859-14.c create mode 100644 kernel/fs/nls/nls_iso8859-15.c create mode 100644 kernel/fs/nls/nls_iso8859-2.c create mode 100644 kernel/fs/nls/nls_iso8859-3.c create mode 100644 kernel/fs/nls/nls_iso8859-4.c create mode 100644 kernel/fs/nls/nls_iso8859-5.c create mode 100644 kernel/fs/nls/nls_iso8859-6.c create mode 100644 kernel/fs/nls/nls_iso8859-7.c create mode 100644 kernel/fs/nls/nls_iso8859-9.c create mode 100644 kernel/fs/nls/nls_koi8-r.c create mode 100644 kernel/fs/nls/nls_koi8-ru.c create mode 100644 kernel/fs/nls/nls_koi8-u.c create mode 100644 kernel/fs/nls/nls_utf8.c create mode 100644 kernel/fs/no-block.c create mode 100644 kernel/fs/notify/Kconfig create mode 100644 kernel/fs/notify/Makefile create mode 100644 kernel/fs/notify/dnotify/Kconfig create mode 100644 kernel/fs/notify/dnotify/Makefile create mode 100644 kernel/fs/notify/dnotify/dnotify.c create mode 100644 kernel/fs/notify/fanotify/Kconfig create mode 100644 kernel/fs/notify/fanotify/Makefile create mode 100644 kernel/fs/notify/fanotify/fanotify.c create mode 100644 kernel/fs/notify/fanotify/fanotify.h create mode 100644 kernel/fs/notify/fanotify/fanotify_user.c create mode 100644 kernel/fs/notify/fdinfo.c create mode 100644 kernel/fs/notify/fdinfo.h create mode 100644 kernel/fs/notify/fsnotify.c create mode 100644 kernel/fs/notify/fsnotify.h create mode 100644 kernel/fs/notify/group.c create mode 100644 kernel/fs/notify/inode_mark.c create mode 100644 kernel/fs/notify/inotify/Kconfig create mode 100644 kernel/fs/notify/inotify/Makefile create mode 100644 kernel/fs/notify/inotify/inotify.h create mode 100644 kernel/fs/notify/inotify/inotify_fsnotify.c create mode 100644 kernel/fs/notify/inotify/inotify_user.c create mode 100644 kernel/fs/notify/mark.c create mode 100644 kernel/fs/notify/notification.c create mode 100644 kernel/fs/notify/vfsmount_mark.c create mode 100644 kernel/fs/nsfs.c create mode 100644 kernel/fs/ntfs/Kconfig create mode 100644 kernel/fs/ntfs/Makefile create mode 100644 kernel/fs/ntfs/aops.c create mode 100644 kernel/fs/ntfs/aops.h create mode 100644 kernel/fs/ntfs/attrib.c create mode 100644 kernel/fs/ntfs/attrib.h create mode 100644 kernel/fs/ntfs/bitmap.c create mode 100644 kernel/fs/ntfs/bitmap.h create mode 100644 kernel/fs/ntfs/collate.c create mode 100644 kernel/fs/ntfs/collate.h create mode 100644 kernel/fs/ntfs/compress.c create mode 100644 kernel/fs/ntfs/debug.c create mode 100644 kernel/fs/ntfs/debug.h create mode 100644 kernel/fs/ntfs/dir.c create mode 100644 kernel/fs/ntfs/dir.h create mode 100644 kernel/fs/ntfs/endian.h create mode 100644 kernel/fs/ntfs/file.c create mode 100644 kernel/fs/ntfs/index.c create mode 100644 kernel/fs/ntfs/index.h create mode 100644 kernel/fs/ntfs/inode.c create mode 100644 kernel/fs/ntfs/inode.h create mode 100644 kernel/fs/ntfs/layout.h create mode 100644 kernel/fs/ntfs/lcnalloc.c create mode 100644 kernel/fs/ntfs/lcnalloc.h create mode 100644 kernel/fs/ntfs/logfile.c create mode 100644 kernel/fs/ntfs/logfile.h create mode 100644 kernel/fs/ntfs/malloc.h create mode 100644 kernel/fs/ntfs/mft.c create mode 100644 kernel/fs/ntfs/mft.h create mode 100644 kernel/fs/ntfs/mst.c create mode 100644 kernel/fs/ntfs/namei.c create mode 100644 kernel/fs/ntfs/ntfs.h create mode 100644 kernel/fs/ntfs/quota.c create mode 100644 kernel/fs/ntfs/quota.h create mode 100644 kernel/fs/ntfs/runlist.c create mode 100644 kernel/fs/ntfs/runlist.h create mode 100644 kernel/fs/ntfs/super.c create mode 100644 kernel/fs/ntfs/sysctl.c create mode 100644 kernel/fs/ntfs/sysctl.h create mode 100644 kernel/fs/ntfs/time.h create mode 100644 kernel/fs/ntfs/types.h create mode 100644 kernel/fs/ntfs/unistr.c create mode 100644 kernel/fs/ntfs/upcase.c create mode 100644 kernel/fs/ntfs/usnjrnl.c create mode 100644 kernel/fs/ntfs/usnjrnl.h create mode 100644 kernel/fs/ntfs/volume.h create mode 100644 kernel/fs/ocfs2/Kconfig create mode 100644 kernel/fs/ocfs2/Makefile create mode 100644 kernel/fs/ocfs2/acl.c create mode 100644 kernel/fs/ocfs2/acl.h create mode 100644 kernel/fs/ocfs2/alloc.c create mode 100644 kernel/fs/ocfs2/alloc.h create mode 100644 kernel/fs/ocfs2/aops.c create mode 100644 kernel/fs/ocfs2/aops.h create mode 100644 kernel/fs/ocfs2/blockcheck.c create mode 100644 kernel/fs/ocfs2/blockcheck.h create mode 100644 kernel/fs/ocfs2/buffer_head_io.c create mode 100644 kernel/fs/ocfs2/buffer_head_io.h create mode 100644 kernel/fs/ocfs2/cluster/Makefile create mode 100644 kernel/fs/ocfs2/cluster/heartbeat.c create mode 100644 kernel/fs/ocfs2/cluster/heartbeat.h create mode 100644 kernel/fs/ocfs2/cluster/masklog.c create mode 100644 kernel/fs/ocfs2/cluster/masklog.h create mode 100644 kernel/fs/ocfs2/cluster/netdebug.c create mode 100644 kernel/fs/ocfs2/cluster/nodemanager.c create mode 100644 kernel/fs/ocfs2/cluster/nodemanager.h create mode 100644 kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h create mode 100644 kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h create mode 100644 kernel/fs/ocfs2/cluster/quorum.c create mode 100644 kernel/fs/ocfs2/cluster/quorum.h create mode 100644 kernel/fs/ocfs2/cluster/sys.c create mode 100644 kernel/fs/ocfs2/cluster/sys.h create mode 100644 kernel/fs/ocfs2/cluster/tcp.c create mode 100644 kernel/fs/ocfs2/cluster/tcp.h create mode 100644 kernel/fs/ocfs2/cluster/tcp_internal.h create mode 100644 kernel/fs/ocfs2/dcache.c create mode 100644 kernel/fs/ocfs2/dcache.h create mode 100644 kernel/fs/ocfs2/dir.c create mode 100644 kernel/fs/ocfs2/dir.h create mode 100644 kernel/fs/ocfs2/dlm/Makefile create mode 100644 kernel/fs/ocfs2/dlm/dlmapi.h create mode 100644 kernel/fs/ocfs2/dlm/dlmast.c create mode 100644 kernel/fs/ocfs2/dlm/dlmcommon.h create mode 100644 kernel/fs/ocfs2/dlm/dlmconvert.c create mode 100644 kernel/fs/ocfs2/dlm/dlmconvert.h create mode 100644 kernel/fs/ocfs2/dlm/dlmdebug.c create mode 100644 kernel/fs/ocfs2/dlm/dlmdebug.h create mode 100644 kernel/fs/ocfs2/dlm/dlmdomain.c create mode 100644 kernel/fs/ocfs2/dlm/dlmdomain.h create mode 100644 kernel/fs/ocfs2/dlm/dlmlock.c create mode 100644 kernel/fs/ocfs2/dlm/dlmmaster.c create mode 100644 kernel/fs/ocfs2/dlm/dlmrecovery.c create mode 100644 kernel/fs/ocfs2/dlm/dlmthread.c create mode 100644 kernel/fs/ocfs2/dlm/dlmunlock.c create mode 100644 kernel/fs/ocfs2/dlmfs/Makefile create mode 100644 kernel/fs/ocfs2/dlmfs/dlmfs.c create mode 100644 kernel/fs/ocfs2/dlmfs/userdlm.c create mode 100644 kernel/fs/ocfs2/dlmfs/userdlm.h create mode 100644 kernel/fs/ocfs2/dlmglue.c create mode 100644 kernel/fs/ocfs2/dlmglue.h create mode 100644 kernel/fs/ocfs2/export.c create mode 100644 kernel/fs/ocfs2/export.h create mode 100644 kernel/fs/ocfs2/extent_map.c create mode 100644 kernel/fs/ocfs2/extent_map.h create mode 100644 kernel/fs/ocfs2/file.c create mode 100644 kernel/fs/ocfs2/file.h create mode 100644 kernel/fs/ocfs2/heartbeat.c create mode 100644 kernel/fs/ocfs2/heartbeat.h create mode 100644 kernel/fs/ocfs2/inode.c create mode 100644 kernel/fs/ocfs2/inode.h create mode 100644 kernel/fs/ocfs2/ioctl.c create mode 100644 kernel/fs/ocfs2/ioctl.h create mode 100644 kernel/fs/ocfs2/journal.c create mode 100644 kernel/fs/ocfs2/journal.h create mode 100644 kernel/fs/ocfs2/localalloc.c create mode 100644 kernel/fs/ocfs2/localalloc.h create mode 100644 kernel/fs/ocfs2/locks.c create mode 100644 kernel/fs/ocfs2/locks.h create mode 100644 kernel/fs/ocfs2/mmap.c create mode 100644 kernel/fs/ocfs2/mmap.h create mode 100644 kernel/fs/ocfs2/move_extents.c create mode 100644 kernel/fs/ocfs2/move_extents.h create mode 100644 kernel/fs/ocfs2/namei.c create mode 100644 kernel/fs/ocfs2/namei.h create mode 100644 kernel/fs/ocfs2/ocfs1_fs_compat.h create mode 100644 kernel/fs/ocfs2/ocfs2.h create mode 100644 kernel/fs/ocfs2/ocfs2_fs.h create mode 100644 kernel/fs/ocfs2/ocfs2_ioctl.h create mode 100644 kernel/fs/ocfs2/ocfs2_lockid.h create mode 100644 kernel/fs/ocfs2/ocfs2_lockingver.h create mode 100644 kernel/fs/ocfs2/ocfs2_trace.h create mode 100644 kernel/fs/ocfs2/quota.h create mode 100644 kernel/fs/ocfs2/quota_global.c create mode 100644 kernel/fs/ocfs2/quota_local.c create mode 100644 kernel/fs/ocfs2/refcounttree.c create mode 100644 kernel/fs/ocfs2/refcounttree.h create mode 100644 kernel/fs/ocfs2/reservations.c create mode 100644 kernel/fs/ocfs2/reservations.h create mode 100644 kernel/fs/ocfs2/resize.c create mode 100644 kernel/fs/ocfs2/resize.h create mode 100644 kernel/fs/ocfs2/slot_map.c create mode 100644 kernel/fs/ocfs2/slot_map.h create mode 100644 kernel/fs/ocfs2/stack_o2cb.c create mode 100644 kernel/fs/ocfs2/stack_user.c create mode 100644 kernel/fs/ocfs2/stackglue.c create mode 100644 kernel/fs/ocfs2/stackglue.h create mode 100644 kernel/fs/ocfs2/suballoc.c create mode 100644 kernel/fs/ocfs2/suballoc.h create mode 100644 kernel/fs/ocfs2/super.c create mode 100644 kernel/fs/ocfs2/super.h create mode 100644 kernel/fs/ocfs2/symlink.c create mode 100644 kernel/fs/ocfs2/symlink.h create mode 100644 kernel/fs/ocfs2/sysfile.c create mode 100644 kernel/fs/ocfs2/sysfile.h create mode 100644 kernel/fs/ocfs2/uptodate.c create mode 100644 kernel/fs/ocfs2/uptodate.h create mode 100644 kernel/fs/ocfs2/xattr.c create mode 100644 kernel/fs/ocfs2/xattr.h create mode 100644 kernel/fs/omfs/Kconfig create mode 100644 kernel/fs/omfs/Makefile create mode 100644 kernel/fs/omfs/bitmap.c create mode 100644 kernel/fs/omfs/dir.c create mode 100644 kernel/fs/omfs/file.c create mode 100644 kernel/fs/omfs/inode.c create mode 100644 kernel/fs/omfs/omfs.h create mode 100644 kernel/fs/omfs/omfs_fs.h create mode 100644 kernel/fs/open.c create mode 100644 kernel/fs/openpromfs/Makefile create mode 100644 kernel/fs/openpromfs/inode.c create mode 100644 kernel/fs/overlayfs/Kconfig create mode 100644 kernel/fs/overlayfs/Makefile create mode 100644 kernel/fs/overlayfs/copy_up.c create mode 100644 kernel/fs/overlayfs/dir.c create mode 100644 kernel/fs/overlayfs/inode.c create mode 100644 kernel/fs/overlayfs/overlayfs.h create mode 100644 kernel/fs/overlayfs/readdir.c create mode 100644 kernel/fs/overlayfs/super.c create mode 100644 kernel/fs/pipe.c create mode 100644 kernel/fs/pnode.c create mode 100644 kernel/fs/pnode.h create mode 100644 kernel/fs/posix_acl.c create mode 100644 kernel/fs/proc/Kconfig create mode 100644 kernel/fs/proc/Makefile create mode 100644 kernel/fs/proc/array.c create mode 100644 kernel/fs/proc/base.c create mode 100644 kernel/fs/proc/cmdline.c create mode 100644 kernel/fs/proc/consoles.c create mode 100644 kernel/fs/proc/cpuinfo.c create mode 100644 kernel/fs/proc/devices.c create mode 100644 kernel/fs/proc/fd.c create mode 100644 kernel/fs/proc/fd.h create mode 100644 kernel/fs/proc/generic.c create mode 100644 kernel/fs/proc/inode.c create mode 100644 kernel/fs/proc/internal.h create mode 100644 kernel/fs/proc/interrupts.c create mode 100644 kernel/fs/proc/kcore.c create mode 100644 kernel/fs/proc/kmsg.c create mode 100644 kernel/fs/proc/loadavg.c create mode 100644 kernel/fs/proc/meminfo.c create mode 100644 kernel/fs/proc/namespaces.c create mode 100644 kernel/fs/proc/nommu.c create mode 100644 kernel/fs/proc/page.c create mode 100644 kernel/fs/proc/proc_net.c create mode 100644 kernel/fs/proc/proc_sysctl.c create mode 100644 kernel/fs/proc/proc_tty.c create mode 100644 kernel/fs/proc/root.c create mode 100644 kernel/fs/proc/self.c create mode 100644 kernel/fs/proc/softirqs.c create mode 100644 kernel/fs/proc/stat.c create mode 100644 kernel/fs/proc/task_mmu.c create mode 100644 kernel/fs/proc/task_nommu.c create mode 100644 kernel/fs/proc/thread_self.c create mode 100644 kernel/fs/proc/uptime.c create mode 100644 kernel/fs/proc/version.c create mode 100644 kernel/fs/proc/vmcore.c create mode 100644 kernel/fs/proc_namespace.c create mode 100644 kernel/fs/pstore/Kconfig create mode 100644 kernel/fs/pstore/Makefile create mode 100644 kernel/fs/pstore/ftrace.c create mode 100644 kernel/fs/pstore/inode.c create mode 100644 kernel/fs/pstore/internal.h create mode 100644 kernel/fs/pstore/platform.c create mode 100644 kernel/fs/pstore/pmsg.c create mode 100644 kernel/fs/pstore/ram.c create mode 100644 kernel/fs/pstore/ram_core.c create mode 100644 kernel/fs/qnx4/Kconfig create mode 100644 kernel/fs/qnx4/Makefile create mode 100644 kernel/fs/qnx4/README create mode 100644 kernel/fs/qnx4/bitmap.c create mode 100644 kernel/fs/qnx4/dir.c create mode 100644 kernel/fs/qnx4/inode.c create mode 100644 kernel/fs/qnx4/namei.c create mode 100644 kernel/fs/qnx4/qnx4.h create mode 100644 kernel/fs/qnx6/Kconfig create mode 100644 kernel/fs/qnx6/Makefile create mode 100644 kernel/fs/qnx6/README create mode 100644 kernel/fs/qnx6/dir.c create mode 100644 kernel/fs/qnx6/inode.c create mode 100644 kernel/fs/qnx6/namei.c create mode 100644 kernel/fs/qnx6/qnx6.h create mode 100644 kernel/fs/qnx6/super_mmi.c create mode 100644 kernel/fs/quota/Kconfig create mode 100644 kernel/fs/quota/Makefile create mode 100644 kernel/fs/quota/compat.c create mode 100644 kernel/fs/quota/dquot.c create mode 100644 kernel/fs/quota/kqid.c create mode 100644 kernel/fs/quota/netlink.c create mode 100644 kernel/fs/quota/quota.c create mode 100644 kernel/fs/quota/quota_tree.c create mode 100644 kernel/fs/quota/quota_tree.h create mode 100644 kernel/fs/quota/quota_v1.c create mode 100644 kernel/fs/quota/quota_v2.c create mode 100644 kernel/fs/quota/quotaio_v1.h create mode 100644 kernel/fs/quota/quotaio_v2.h create mode 100644 kernel/fs/ramfs/Makefile create mode 100644 kernel/fs/ramfs/file-mmu.c create mode 100644 kernel/fs/ramfs/file-nommu.c create mode 100644 kernel/fs/ramfs/inode.c create mode 100644 kernel/fs/ramfs/internal.h create mode 100644 kernel/fs/read_write.c create mode 100644 kernel/fs/readdir.c create mode 100644 kernel/fs/reiserfs/Kconfig create mode 100644 kernel/fs/reiserfs/Makefile create mode 100644 kernel/fs/reiserfs/README create mode 100644 kernel/fs/reiserfs/acl.h create mode 100644 kernel/fs/reiserfs/bitmap.c create mode 100644 kernel/fs/reiserfs/dir.c create mode 100644 kernel/fs/reiserfs/do_balan.c create mode 100644 kernel/fs/reiserfs/file.c create mode 100644 kernel/fs/reiserfs/fix_node.c create mode 100644 kernel/fs/reiserfs/hashes.c create mode 100644 kernel/fs/reiserfs/ibalance.c create mode 100644 kernel/fs/reiserfs/inode.c create mode 100644 kernel/fs/reiserfs/ioctl.c create mode 100644 kernel/fs/reiserfs/item_ops.c create mode 100644 kernel/fs/reiserfs/journal.c create mode 100644 kernel/fs/reiserfs/lbalance.c create mode 100644 kernel/fs/reiserfs/lock.c create mode 100644 kernel/fs/reiserfs/namei.c create mode 100644 kernel/fs/reiserfs/objectid.c create mode 100644 kernel/fs/reiserfs/prints.c create mode 100644 kernel/fs/reiserfs/procfs.c create mode 100644 kernel/fs/reiserfs/reiserfs.h create mode 100644 kernel/fs/reiserfs/resize.c create mode 100644 kernel/fs/reiserfs/stree.c create mode 100644 kernel/fs/reiserfs/super.c create mode 100644 kernel/fs/reiserfs/tail_conversion.c create mode 100644 kernel/fs/reiserfs/xattr.c create mode 100644 kernel/fs/reiserfs/xattr.h create mode 100644 kernel/fs/reiserfs/xattr_acl.c create mode 100644 kernel/fs/reiserfs/xattr_security.c create mode 100644 kernel/fs/reiserfs/xattr_trusted.c create mode 100644 kernel/fs/reiserfs/xattr_user.c create mode 100644 kernel/fs/romfs/Kconfig create mode 100644 kernel/fs/romfs/Makefile create mode 100644 kernel/fs/romfs/internal.h create mode 100644 kernel/fs/romfs/mmap-nommu.c create mode 100644 kernel/fs/romfs/storage.c create mode 100644 kernel/fs/romfs/super.c create mode 100644 kernel/fs/select.c create mode 100644 kernel/fs/seq_file.c create mode 100644 kernel/fs/signalfd.c create mode 100644 kernel/fs/splice.c create mode 100644 kernel/fs/squashfs/Kconfig create mode 100644 kernel/fs/squashfs/Makefile create mode 100644 kernel/fs/squashfs/block.c create mode 100644 kernel/fs/squashfs/cache.c create mode 100644 kernel/fs/squashfs/decompressor.c create mode 100644 kernel/fs/squashfs/decompressor.h create mode 100644 kernel/fs/squashfs/decompressor_multi.c create mode 100644 kernel/fs/squashfs/decompressor_multi_percpu.c create mode 100644 kernel/fs/squashfs/decompressor_single.c create mode 100644 kernel/fs/squashfs/dir.c create mode 100644 kernel/fs/squashfs/export.c create mode 100644 kernel/fs/squashfs/file.c create mode 100644 kernel/fs/squashfs/file_cache.c create mode 100644 kernel/fs/squashfs/file_direct.c create mode 100644 kernel/fs/squashfs/fragment.c create mode 100644 kernel/fs/squashfs/id.c create mode 100644 kernel/fs/squashfs/inode.c create mode 100644 kernel/fs/squashfs/lz4_wrapper.c create mode 100644 kernel/fs/squashfs/lzo_wrapper.c create mode 100644 kernel/fs/squashfs/namei.c create mode 100644 kernel/fs/squashfs/page_actor.c create mode 100644 kernel/fs/squashfs/page_actor.h create mode 100644 kernel/fs/squashfs/squashfs.h create mode 100644 kernel/fs/squashfs/squashfs_fs.h create mode 100644 kernel/fs/squashfs/squashfs_fs_i.h create mode 100644 kernel/fs/squashfs/squashfs_fs_sb.h create mode 100644 kernel/fs/squashfs/super.c create mode 100644 kernel/fs/squashfs/symlink.c create mode 100644 kernel/fs/squashfs/xattr.c create mode 100644 kernel/fs/squashfs/xattr.h create mode 100644 kernel/fs/squashfs/xattr_id.c create mode 100644 kernel/fs/squashfs/xz_wrapper.c create mode 100644 kernel/fs/squashfs/zlib_wrapper.c create mode 100644 kernel/fs/stack.c create mode 100644 kernel/fs/stat.c create mode 100644 kernel/fs/statfs.c create mode 100644 kernel/fs/super.c create mode 100644 kernel/fs/sync.c create mode 100644 kernel/fs/sysfs/Kconfig create mode 100644 kernel/fs/sysfs/Makefile create mode 100644 kernel/fs/sysfs/dir.c create mode 100644 kernel/fs/sysfs/file.c create mode 100644 kernel/fs/sysfs/group.c create mode 100644 kernel/fs/sysfs/mount.c create mode 100644 kernel/fs/sysfs/symlink.c create mode 100644 kernel/fs/sysfs/sysfs.h create mode 100644 kernel/fs/sysv/Kconfig create mode 100644 kernel/fs/sysv/Makefile create mode 100644 kernel/fs/sysv/balloc.c create mode 100644 kernel/fs/sysv/dir.c create mode 100644 kernel/fs/sysv/file.c create mode 100644 kernel/fs/sysv/ialloc.c create mode 100644 kernel/fs/sysv/inode.c create mode 100644 kernel/fs/sysv/itree.c create mode 100644 kernel/fs/sysv/namei.c create mode 100644 kernel/fs/sysv/super.c create mode 100644 kernel/fs/sysv/symlink.c create mode 100644 kernel/fs/sysv/sysv.h create mode 100644 kernel/fs/timerfd.c create mode 100644 kernel/fs/tracefs/Makefile create mode 100644 kernel/fs/tracefs/inode.c create mode 100644 kernel/fs/ubifs/Kconfig create mode 100644 kernel/fs/ubifs/Makefile create mode 100644 kernel/fs/ubifs/budget.c create mode 100644 kernel/fs/ubifs/commit.c create mode 100644 kernel/fs/ubifs/compress.c create mode 100644 kernel/fs/ubifs/debug.c create mode 100644 kernel/fs/ubifs/debug.h create mode 100644 kernel/fs/ubifs/dir.c create mode 100644 kernel/fs/ubifs/file.c create mode 100644 kernel/fs/ubifs/find.c create mode 100644 kernel/fs/ubifs/gc.c create mode 100644 kernel/fs/ubifs/io.c create mode 100644 kernel/fs/ubifs/ioctl.c create mode 100644 kernel/fs/ubifs/journal.c create mode 100644 kernel/fs/ubifs/key.h create mode 100644 kernel/fs/ubifs/log.c create mode 100644 kernel/fs/ubifs/lprops.c create mode 100644 kernel/fs/ubifs/lpt.c create mode 100644 kernel/fs/ubifs/lpt_commit.c create mode 100644 kernel/fs/ubifs/master.c create mode 100644 kernel/fs/ubifs/misc.h create mode 100644 kernel/fs/ubifs/orphan.c create mode 100644 kernel/fs/ubifs/recovery.c create mode 100644 kernel/fs/ubifs/replay.c create mode 100644 kernel/fs/ubifs/sb.c create mode 100644 kernel/fs/ubifs/scan.c create mode 100644 kernel/fs/ubifs/shrinker.c create mode 100644 kernel/fs/ubifs/super.c create mode 100644 kernel/fs/ubifs/tnc.c create mode 100644 kernel/fs/ubifs/tnc_commit.c create mode 100644 kernel/fs/ubifs/tnc_misc.c create mode 100644 kernel/fs/ubifs/ubifs-media.h create mode 100644 kernel/fs/ubifs/ubifs.h create mode 100644 kernel/fs/ubifs/xattr.c create mode 100644 kernel/fs/udf/Kconfig create mode 100644 kernel/fs/udf/Makefile create mode 100644 kernel/fs/udf/balloc.c create mode 100644 kernel/fs/udf/dir.c create mode 100644 kernel/fs/udf/directory.c create mode 100644 kernel/fs/udf/ecma_167.h create mode 100644 kernel/fs/udf/file.c create mode 100644 kernel/fs/udf/ialloc.c create mode 100644 kernel/fs/udf/inode.c create mode 100644 kernel/fs/udf/lowlevel.c create mode 100644 kernel/fs/udf/misc.c create mode 100644 kernel/fs/udf/namei.c create mode 100644 kernel/fs/udf/osta_udf.h create mode 100644 kernel/fs/udf/partition.c create mode 100644 kernel/fs/udf/super.c create mode 100644 kernel/fs/udf/symlink.c create mode 100644 kernel/fs/udf/truncate.c create mode 100644 kernel/fs/udf/udf_i.h create mode 100644 kernel/fs/udf/udf_sb.h create mode 100644 kernel/fs/udf/udfdecl.h create mode 100644 kernel/fs/udf/udfend.h create mode 100644 kernel/fs/udf/udftime.c create mode 100644 kernel/fs/udf/unicode.c create mode 100644 kernel/fs/ufs/Kconfig create mode 100644 kernel/fs/ufs/Makefile create mode 100644 kernel/fs/ufs/balloc.c create mode 100644 kernel/fs/ufs/cylinder.c create mode 100644 kernel/fs/ufs/dir.c create mode 100644 kernel/fs/ufs/file.c create mode 100644 kernel/fs/ufs/ialloc.c create mode 100644 kernel/fs/ufs/inode.c create mode 100644 kernel/fs/ufs/namei.c create mode 100644 kernel/fs/ufs/super.c create mode 100644 kernel/fs/ufs/swab.h create mode 100644 kernel/fs/ufs/symlink.c create mode 100644 kernel/fs/ufs/truncate.c create mode 100644 kernel/fs/ufs/ufs.h create mode 100644 kernel/fs/ufs/ufs_fs.h create mode 100644 kernel/fs/ufs/util.c create mode 100644 kernel/fs/ufs/util.h create mode 100644 kernel/fs/utimes.c create mode 100644 kernel/fs/xattr.c create mode 100644 kernel/fs/xfs/Kconfig create mode 100644 kernel/fs/xfs/Makefile create mode 100644 kernel/fs/xfs/kmem.c create mode 100644 kernel/fs/xfs/kmem.h create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc.c create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc.h create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_attr.c create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_leaf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_leaf.h create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_remote.c create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_remote.h create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_sf.h create mode 100644 kernel/fs/xfs/libxfs/xfs_bit.h create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap.c create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap.h create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_cksum.h create mode 100644 kernel/fs/xfs/libxfs/xfs_da_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_da_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_da_format.c create mode 100644 kernel/fs/xfs/libxfs/xfs_da_format.h create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2.h create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_block.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_data.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_leaf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_node.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_priv.h create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_sf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dquot_buf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_format.h create mode 100644 kernel/fs/xfs/libxfs/xfs_fs.h create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc.c create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc.h create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_buf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_buf.h create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_fork.c create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_fork.h create mode 100644 kernel/fs/xfs/libxfs/xfs_log_format.h create mode 100644 kernel/fs/xfs/libxfs/xfs_log_recover.h create mode 100644 kernel/fs/xfs/libxfs/xfs_log_rlimit.c create mode 100644 kernel/fs/xfs/libxfs/xfs_quota_defs.h create mode 100644 kernel/fs/xfs/libxfs/xfs_rtbitmap.c create mode 100644 kernel/fs/xfs/libxfs/xfs_sb.c create mode 100644 kernel/fs/xfs/libxfs/xfs_sb.h create mode 100644 kernel/fs/xfs/libxfs/xfs_shared.h create mode 100644 kernel/fs/xfs/libxfs/xfs_symlink_remote.c create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_resv.c create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_resv.h create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_space.h create mode 100644 kernel/fs/xfs/libxfs/xfs_types.h create mode 100644 kernel/fs/xfs/mrlock.h create mode 100644 kernel/fs/xfs/uuid.c create mode 100644 kernel/fs/xfs/uuid.h create mode 100644 kernel/fs/xfs/xfs.h create mode 100644 kernel/fs/xfs/xfs_acl.c create mode 100644 kernel/fs/xfs/xfs_acl.h create mode 100644 kernel/fs/xfs/xfs_aops.c create mode 100644 kernel/fs/xfs/xfs_aops.h create mode 100644 kernel/fs/xfs/xfs_attr.h create mode 100644 kernel/fs/xfs/xfs_attr_inactive.c create mode 100644 kernel/fs/xfs/xfs_attr_list.c create mode 100644 kernel/fs/xfs/xfs_bit.c create mode 100644 kernel/fs/xfs/xfs_bmap_util.c create mode 100644 kernel/fs/xfs/xfs_bmap_util.h create mode 100644 kernel/fs/xfs/xfs_buf.c create mode 100644 kernel/fs/xfs/xfs_buf.h create mode 100644 kernel/fs/xfs/xfs_buf_item.c create mode 100644 kernel/fs/xfs/xfs_buf_item.h create mode 100644 kernel/fs/xfs/xfs_dir2_readdir.c create mode 100644 kernel/fs/xfs/xfs_discard.c create mode 100644 kernel/fs/xfs/xfs_discard.h create mode 100644 kernel/fs/xfs/xfs_dquot.c create mode 100644 kernel/fs/xfs/xfs_dquot.h create mode 100644 kernel/fs/xfs/xfs_dquot_item.c create mode 100644 kernel/fs/xfs/xfs_dquot_item.h create mode 100644 kernel/fs/xfs/xfs_error.c create mode 100644 kernel/fs/xfs/xfs_error.h create mode 100644 kernel/fs/xfs/xfs_export.c create mode 100644 kernel/fs/xfs/xfs_export.h create mode 100644 kernel/fs/xfs/xfs_extent_busy.c create mode 100644 kernel/fs/xfs/xfs_extent_busy.h create mode 100644 kernel/fs/xfs/xfs_extfree_item.c create mode 100644 kernel/fs/xfs/xfs_extfree_item.h create mode 100644 kernel/fs/xfs/xfs_file.c create mode 100644 kernel/fs/xfs/xfs_filestream.c create mode 100644 kernel/fs/xfs/xfs_filestream.h create mode 100644 kernel/fs/xfs/xfs_fsops.c create mode 100644 kernel/fs/xfs/xfs_fsops.h create mode 100644 kernel/fs/xfs/xfs_globals.c create mode 100644 kernel/fs/xfs/xfs_icache.c create mode 100644 kernel/fs/xfs/xfs_icache.h create mode 100644 kernel/fs/xfs/xfs_icreate_item.c create mode 100644 kernel/fs/xfs/xfs_icreate_item.h create mode 100644 kernel/fs/xfs/xfs_inode.c create mode 100644 kernel/fs/xfs/xfs_inode.h create mode 100644 kernel/fs/xfs/xfs_inode_item.c create mode 100644 kernel/fs/xfs/xfs_inode_item.h create mode 100644 kernel/fs/xfs/xfs_ioctl.c create mode 100644 kernel/fs/xfs/xfs_ioctl.h create mode 100644 kernel/fs/xfs/xfs_ioctl32.c create mode 100644 kernel/fs/xfs/xfs_ioctl32.h create mode 100644 kernel/fs/xfs/xfs_iomap.c create mode 100644 kernel/fs/xfs/xfs_iomap.h create mode 100644 kernel/fs/xfs/xfs_iops.c create mode 100644 kernel/fs/xfs/xfs_iops.h create mode 100644 kernel/fs/xfs/xfs_itable.c create mode 100644 kernel/fs/xfs/xfs_itable.h create mode 100644 kernel/fs/xfs/xfs_linux.h create mode 100644 kernel/fs/xfs/xfs_log.c create mode 100644 kernel/fs/xfs/xfs_log.h create mode 100644 kernel/fs/xfs/xfs_log_cil.c create mode 100644 kernel/fs/xfs/xfs_log_priv.h create mode 100644 kernel/fs/xfs/xfs_log_recover.c create mode 100644 kernel/fs/xfs/xfs_message.c create mode 100644 kernel/fs/xfs/xfs_message.h create mode 100644 kernel/fs/xfs/xfs_mount.c create mode 100644 kernel/fs/xfs/xfs_mount.h create mode 100644 kernel/fs/xfs/xfs_mru_cache.c create mode 100644 kernel/fs/xfs/xfs_mru_cache.h create mode 100644 kernel/fs/xfs/xfs_pnfs.c create mode 100644 kernel/fs/xfs/xfs_pnfs.h create mode 100644 kernel/fs/xfs/xfs_qm.c create mode 100644 kernel/fs/xfs/xfs_qm.h create mode 100644 kernel/fs/xfs/xfs_qm_bhv.c create mode 100644 kernel/fs/xfs/xfs_qm_syscalls.c create mode 100644 kernel/fs/xfs/xfs_quota.h create mode 100644 kernel/fs/xfs/xfs_quotaops.c create mode 100644 kernel/fs/xfs/xfs_rtalloc.c create mode 100644 kernel/fs/xfs/xfs_rtalloc.h create mode 100644 kernel/fs/xfs/xfs_stats.c create mode 100644 kernel/fs/xfs/xfs_stats.h create mode 100644 kernel/fs/xfs/xfs_super.c create mode 100644 kernel/fs/xfs/xfs_super.h create mode 100644 kernel/fs/xfs/xfs_symlink.c create mode 100644 kernel/fs/xfs/xfs_symlink.h create mode 100644 kernel/fs/xfs/xfs_sysctl.c create mode 100644 kernel/fs/xfs/xfs_sysctl.h create mode 100644 kernel/fs/xfs/xfs_sysfs.c create mode 100644 kernel/fs/xfs/xfs_sysfs.h create mode 100644 kernel/fs/xfs/xfs_trace.c create mode 100644 kernel/fs/xfs/xfs_trace.h create mode 100644 kernel/fs/xfs/xfs_trans.c create mode 100644 kernel/fs/xfs/xfs_trans.h create mode 100644 kernel/fs/xfs/xfs_trans_ail.c create mode 100644 kernel/fs/xfs/xfs_trans_buf.c create mode 100644 kernel/fs/xfs/xfs_trans_dquot.c create mode 100644 kernel/fs/xfs/xfs_trans_extfree.c create mode 100644 kernel/fs/xfs/xfs_trans_inode.c create mode 100644 kernel/fs/xfs/xfs_trans_priv.h create mode 100644 kernel/fs/xfs/xfs_xattr.c (limited to 'kernel/fs') diff --git a/kernel/fs/9p/Kconfig b/kernel/fs/9p/Kconfig new file mode 100644 index 000000000..6489e1fc1 --- /dev/null +++ b/kernel/fs/9p/Kconfig @@ -0,0 +1,46 @@ +config 9P_FS + tristate "Plan 9 Resource Sharing Support (9P2000)" + depends on INET && NET_9P + help + If you say Y here, you will get experimental support for + Plan 9 resource sharing via the 9P2000 protocol. + + See for more information. + + If unsure, say N. + +if 9P_FS +config 9P_FSCACHE + bool "Enable 9P client caching support" + depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y + help + Choose Y here to enable persistent, read-only local + caching support for 9p clients using FS-Cache + + +config 9P_FS_POSIX_ACL + bool "9P POSIX Access Control Lists" + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +endif + + +config 9P_FS_SECURITY + bool "9P Security Labels" + depends on 9P_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the 9P filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/kernel/fs/9p/Makefile b/kernel/fs/9p/Makefile new file mode 100644 index 000000000..ff7be98f8 --- /dev/null +++ b/kernel/fs/9p/Makefile @@ -0,0 +1,19 @@ +obj-$(CONFIG_9P_FS) := 9p.o + +9p-objs := \ + vfs_super.o \ + vfs_inode.o \ + vfs_inode_dotl.o \ + vfs_addr.o \ + vfs_file.o \ + vfs_dir.o \ + vfs_dentry.o \ + v9fs.o \ + fid.o \ + xattr.o \ + xattr_user.o \ + xattr_trusted.o + +9p-$(CONFIG_9P_FSCACHE) += cache.o +9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o +9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o diff --git a/kernel/fs/9p/acl.c b/kernel/fs/9p/acl.c new file mode 100644 index 000000000..31c010372 --- /dev/null +++ b/kernel/fs/9p/acl.c @@ -0,0 +1,381 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) +{ + ssize_t size; + void *value = NULL; + struct posix_acl *acl = NULL; + + size = v9fs_fid_xattr_get(fid, name, NULL, 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = v9fs_fid_xattr_get(fid, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + goto err_out; + } + } else if (size == -ENODATA || size == 0 || + size == -ENOSYS || size == -EOPNOTSUPP) { + acl = NULL; + } else + acl = ERR_PTR(-EIO); + +err_out: + kfree(value); + return acl; +} + +int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) +{ + int retval = 0; + struct posix_acl *pacl, *dacl; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || + ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); + set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); + return 0; + } + /* get the default/access acl values and cache them */ + dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT); + pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS); + + if (!IS_ERR(dacl) && !IS_ERR(pacl)) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); + set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); + } else + retval = -EIO; + + if (!IS_ERR(dacl)) + posix_acl_release(dacl); + + if (!IS_ERR(pacl)) + posix_acl_release(pacl); + + return retval; +} + +static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + /* + * 9p Always cache the acl value when + * instantiating the inode (v9fs_inode_from_fid) + */ + acl = get_cached_acl(inode, type); + BUG_ON(acl == ACL_NOT_CACHED); + return acl; +} + +struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type) +{ + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || + ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { + /* + * On access = client and acl = on mode get the acl + * values from the server + */ + return NULL; + } + return v9fs_get_cached_acl(inode, type); + +} + +static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) +{ + int retval; + char *name; + size_t size; + void *buffer; + if (!acl) + return 0; + + /* Set a setxattr request to server */ + size = posix_acl_xattr_size(acl->a_count); + buffer = kmalloc(size, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + if (retval < 0) + goto err_free_out; + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0); +err_free_out: + kfree(buffer); + return retval; +} + +int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) +{ + int retval = 0; + struct posix_acl *acl; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (retval) + return retval; + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + retval = v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); + } + return retval; +} + +int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, + struct posix_acl *dacl, struct posix_acl *acl) +{ + set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + v9fs_set_acl(fid, ACL_TYPE_DEFAULT, dacl); + v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl); + return 0; +} + +void v9fs_put_acl(struct posix_acl *dacl, + struct posix_acl *acl) +{ + posix_acl_release(dacl); + posix_acl_release(acl); +} + +int v9fs_acl_mode(struct inode *dir, umode_t *modep, + struct posix_acl **dpacl, struct posix_acl **pacl) +{ + int retval = 0; + umode_t mode = *modep; + struct posix_acl *acl = NULL; + + if (!S_ISLNK(mode)) { + acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) + mode &= ~current_umask(); + } + if (acl) { + if (S_ISDIR(mode)) + *dpacl = posix_acl_dup(acl); + retval = __posix_acl_create(&acl, GFP_NOFS, &mode); + if (retval < 0) + return retval; + if (retval > 0) + *pacl = acl; + else + posix_acl_release(acl); + } + *modep = mode; + return 0; +} + +static int v9fs_remote_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char *full_name; + + switch (type) { + case ACL_TYPE_ACCESS: + full_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + full_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + return v9fs_xattr_get(dentry, full_name, buffer, size); +} + +static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct v9fs_session_info *v9ses; + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + + v9ses = v9fs_dentry2v9ses(dentry); + /* + * We allow set/get/list of acl when access=client is not specified + */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_remote_get_acl(dentry, name, buffer, size, type); + + acl = v9fs_get_cached_acl(d_inode(dentry), type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int v9fs_remote_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, + int flags, int type) +{ + char *full_name; + + switch (type) { + case ACL_TYPE_ACCESS: + full_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + full_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + return v9fs_xattr_set(dentry, full_name, value, size, flags); +} + + +static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, + int flags, int type) +{ + int retval; + struct posix_acl *acl; + struct v9fs_session_info *v9ses; + struct inode *inode = d_inode(dentry); + + if (strcmp(name, "") != 0) + return -EINVAL; + + v9ses = v9fs_dentry2v9ses(dentry); + /* + * set the attribute on the remote. Without even looking at the + * xattr value. We leave it to the server to validate + */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_remote_set_acl(dentry, name, + value, size, flags, type); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (value) { + /* update the cached acl value */ + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + retval = posix_acl_valid(acl); + if (retval) + goto err_out; + } + } else + acl = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + umode_t mode = inode->i_mode; + retval = posix_acl_equiv_mode(acl, &mode); + if (retval < 0) + goto err_out; + else { + struct iattr iattr; + if (retval == 0) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. + */ + acl = NULL; + value = NULL; + size = 0; + } + /* Updte the mode bits */ + iattr.ia_mode = ((mode & S_IALLUGO) | + (inode->i_mode & ~S_IALLUGO)); + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? + */ + v9fs_vfs_setattr_dotl(dentry, &iattr); + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) { + retval = acl ? -EINVAL : 0; + goto err_out; + } + break; + default: + BUG(); + } + retval = v9fs_xattr_set(dentry, name, value, size, flags); + if (!retval) + set_cached_acl(inode, type, acl); +err_out: + posix_acl_release(acl); + return retval; +} + +const struct xattr_handler v9fs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = v9fs_xattr_get_acl, + .set = v9fs_xattr_set_acl, +}; + +const struct xattr_handler v9fs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = v9fs_xattr_get_acl, + .set = v9fs_xattr_set_acl, +}; diff --git a/kernel/fs/9p/acl.h b/kernel/fs/9p/acl.h new file mode 100644 index 000000000..e4f7e8822 --- /dev/null +++ b/kernel/fs/9p/acl.h @@ -0,0 +1,55 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ +#ifndef FS_9P_ACL_H +#define FS_9P_ACL_H + +#ifdef CONFIG_9P_FS_POSIX_ACL +extern int v9fs_get_acl(struct inode *, struct p9_fid *); +extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); +extern int v9fs_acl_chmod(struct inode *, struct p9_fid *); +extern int v9fs_set_create_acl(struct inode *, struct p9_fid *, + struct posix_acl *, struct posix_acl *); +extern int v9fs_acl_mode(struct inode *dir, umode_t *modep, + struct posix_acl **dpacl, struct posix_acl **pacl); +extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); +#else +#define v9fs_iop_get_acl NULL +static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) +{ + return 0; +} +static inline int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) +{ + return 0; +} +static inline int v9fs_set_create_acl(struct inode *inode, + struct p9_fid *fid, + struct posix_acl *dacl, + struct posix_acl *acl) +{ + return 0; +} +static inline void v9fs_put_acl(struct posix_acl *dacl, + struct posix_acl *acl) +{ +} +static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep, + struct posix_acl **dpacl, + struct posix_acl **pacl) +{ + return 0; +} + +#endif +#endif /* FS_9P_XATTR_H */ diff --git a/kernel/fs/9p/cache.c b/kernel/fs/9p/cache.c new file mode 100644 index 000000000..a69260f27 --- /dev/null +++ b/kernel/fs/9p/cache.c @@ -0,0 +1,414 @@ +/* + * V9FS cache definitions. + * + * Copyright (C) 2009 by Abhishek Kulkarni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "cache.h" + +#define CACHETAG_LEN 11 + +struct fscache_netfs v9fs_cache_netfs = { + .name = "9p", + .version = 0, +}; + +/** + * v9fs_random_cachetag - Generate a random tag to be associated + * with a new cache session. + * + * The value of jiffies is used for a fairly randomly cache tag. + */ + +static +int v9fs_random_cachetag(struct v9fs_session_info *v9ses) +{ + v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL); + if (!v9ses->cachetag) + return -ENOMEM; + + return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); +} + +static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct v9fs_session_info *v9ses; + uint16_t klen = 0; + + v9ses = (struct v9fs_session_info *)cookie_netfs_data; + p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n", + v9ses, buffer, bufmax); + + if (v9ses->cachetag) + klen = strlen(v9ses->cachetag); + + if (klen > bufmax) + return 0; + + memcpy(buffer, v9ses->cachetag, klen); + p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag); + return klen; +} + +const struct fscache_cookie_def v9fs_cache_session_index_def = { + .name = "9P.session", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = v9fs_cache_session_get_key, +}; + +void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) +{ + /* If no cache session tag was specified, we generate a random one. */ + if (!v9ses->cachetag) + v9fs_random_cachetag(v9ses); + + v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, + &v9fs_cache_session_index_def, + v9ses, true); + p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", + v9ses, v9ses->fscache); +} + +void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) +{ + p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", + v9ses, v9ses->fscache); + fscache_relinquish_cookie(v9ses->fscache, 0); + v9ses->fscache = NULL; +} + + +static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); + p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n", + &v9inode->vfs_inode, v9inode->qid.path); + return sizeof(v9inode->qid.path); +} + +static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + *size = i_size_read(&v9inode->vfs_inode); + + p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n", + &v9inode->vfs_inode, *size); +} + +static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); + p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n", + &v9inode->vfs_inode, v9inode->qid.version); + return sizeof(v9inode->qid.version); +} + +static enum +fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + + if (buflen != sizeof(v9inode->qid.version)) + return FSCACHE_CHECKAUX_OBSOLETE; + + if (memcmp(buffer, &v9inode->qid.version, + sizeof(v9inode->qid.version))) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) +{ + struct v9fs_inode *v9inode = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def v9fs_cache_inode_index_def = { + .name = "9p.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = v9fs_cache_inode_get_key, + .get_attr = v9fs_cache_inode_get_attr, + .get_aux = v9fs_cache_inode_get_aux, + .check_aux = v9fs_cache_inode_check_aux, + .now_uncached = v9fs_cache_inode_now_uncached, +}; + +void v9fs_cache_inode_get_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + + if (!S_ISREG(inode->i_mode)) + return; + + v9inode = V9FS_I(inode); + if (v9inode->fscache) + return; + + v9ses = v9fs_inode2v9ses(inode); + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, + &v9fs_cache_inode_index_def, + v9inode, true); + + p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", + inode, v9inode->fscache); +} + +void v9fs_cache_inode_put_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + if (!v9inode->fscache) + return; + p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", + inode, v9inode->fscache); + + fscache_relinquish_cookie(v9inode->fscache, 0); + v9inode->fscache = NULL; +} + +void v9fs_cache_inode_flush_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + if (!v9inode->fscache) + return; + p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", + inode, v9inode->fscache); + + fscache_relinquish_cookie(v9inode->fscache, 1); + v9inode->fscache = NULL; +} + +void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + if (!v9inode->fscache) + return; + + spin_lock(&v9inode->fscache_lock); + + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + v9fs_cache_inode_flush_cookie(inode); + else + v9fs_cache_inode_get_cookie(inode); + + spin_unlock(&v9inode->fscache_lock); +} + +void v9fs_cache_inode_reset_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct v9fs_session_info *v9ses; + struct fscache_cookie *old; + + if (!v9inode->fscache) + return; + + old = v9inode->fscache; + + spin_lock(&v9inode->fscache_lock); + fscache_relinquish_cookie(v9inode->fscache, 1); + + v9ses = v9fs_inode2v9ses(inode); + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, + &v9fs_cache_inode_index_def, + v9inode, true); + p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", + inode, old, v9inode->fscache); + + spin_unlock(&v9inode->fscache_lock); +} + +int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) +{ + struct inode *inode = page->mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); + + BUG_ON(!v9inode->fscache); + + return fscache_maybe_release_page(v9inode->fscache, page, gfp); +} + +void __v9fs_fscache_invalidate_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); + + BUG_ON(!v9inode->fscache); + + if (PageFsCache(page)) { + fscache_wait_on_page_write(v9inode->fscache, page); + BUG_ON(!PageLocked(page)); + fscache_uncache_page(v9inode->fscache, page); + } +} + +static void v9fs_vfs_readpage_complete(struct page *page, void *data, + int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +/** + * __v9fs_readpage_from_fscache - read a page from cache + * + * Returns 0 if the pages are in cache and a BIO is submitted, + * 1 if the pages are not in cache and -error otherwise. + */ + +int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); + if (!v9inode->fscache) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(v9inode->fscache, + page, + v9fs_vfs_readpage_complete, + NULL, + GFP_KERNEL); + switch (ret) { + case -ENOBUFS: + case -ENODATA: + p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret); + return 1; + case 0: + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); + return ret; + default: + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); + return ret; + } +} + +/** + * __v9fs_readpages_from_fscache - read multiple pages from cache + * + * Returns 0 if the pages are in cache and a BIO is submitted, + * 1 if the pages are not in cache and -error otherwise. + */ + +int __v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages); + if (!v9inode->fscache) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(v9inode->fscache, + mapping, pages, nr_pages, + v9fs_vfs_readpage_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case -ENOBUFS: + case -ENODATA: + p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret); + return 1; + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); + return ret; + default: + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); + return ret; + } +} + +/** + * __v9fs_readpage_to_fscache - write a page to the cache + * + */ + +void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); + ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); + p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret); + if (ret != 0) + v9fs_uncache_page(inode, page); +} + +/* + * wait for a page to complete writing to the cache + */ +void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) +{ + const struct v9fs_inode *v9inode = V9FS_I(inode); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); + if (PageFsCache(page)) + fscache_wait_on_page_write(v9inode->fscache, page); +} diff --git a/kernel/fs/9p/cache.h b/kernel/fs/9p/cache.h new file mode 100644 index 000000000..2f9675491 --- /dev/null +++ b/kernel/fs/9p/cache.h @@ -0,0 +1,151 @@ +/* + * V9FS cache definitions. + * + * Copyright (C) 2009 by Abhishek Kulkarni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _9P_CACHE_H +#ifdef CONFIG_9P_FSCACHE +#include +#include + +extern struct fscache_netfs v9fs_cache_netfs; +extern const struct fscache_cookie_def v9fs_cache_session_index_def; +extern const struct fscache_cookie_def v9fs_cache_inode_index_def; + +extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses); +extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses); + +extern void v9fs_cache_inode_get_cookie(struct inode *inode); +extern void v9fs_cache_inode_put_cookie(struct inode *inode); +extern void v9fs_cache_inode_flush_cookie(struct inode *inode); +extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp); +extern void v9fs_cache_inode_reset_cookie(struct inode *inode); + +extern int __v9fs_cache_register(void); +extern void __v9fs_cache_unregister(void); + +extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp); +extern void __v9fs_fscache_invalidate_page(struct page *page); +extern int __v9fs_readpage_from_fscache(struct inode *inode, + struct page *page); +extern int __v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); +extern void __v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page); + +static inline int v9fs_fscache_release_page(struct page *page, + gfp_t gfp) +{ + return __v9fs_fscache_release_page(page, gfp); +} + +static inline void v9fs_fscache_invalidate_page(struct page *page) +{ + __v9fs_fscache_invalidate_page(page); +} + +static inline int v9fs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + return __v9fs_readpage_from_fscache(inode, page); +} + +static inline int v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return __v9fs_readpages_from_fscache(inode, mapping, pages, + nr_pages); +} + +static inline void v9fs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __v9fs_readpage_to_fscache(inode, page); +} + +static inline void v9fs_uncache_page(struct inode *inode, struct page *page) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + fscache_uncache_page(v9inode->fscache, page); + BUG_ON(PageFsCache(page)); +} + +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) +{ + return __v9fs_fscache_wait_on_page_write(inode, page); +} + +#else /* CONFIG_9P_FSCACHE */ + +static inline void v9fs_cache_inode_get_cookie(struct inode *inode) +{ +} + +static inline void v9fs_cache_inode_put_cookie(struct inode *inode) +{ +} + +static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *file) +{ +} + +static inline int v9fs_fscache_release_page(struct page *page, + gfp_t gfp) { + return 1; +} + +static inline void v9fs_fscache_invalidate_page(struct page *page) {} + +static inline int v9fs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void v9fs_readpage_to_fscache(struct inode *inode, + struct page *page) +{} + +static inline void v9fs_uncache_page(struct inode *inode, struct page *page) +{} + +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) +{ + return; +} + +#endif /* CONFIG_9P_FSCACHE */ +#endif /* _9P_CACHE_H */ diff --git a/kernel/fs/9p/fid.c b/kernel/fs/9p/fid.c new file mode 100644 index 000000000..47db55aee --- /dev/null +++ b/kernel/fs/9p/fid.c @@ -0,0 +1,306 @@ +/* + * V9FS FID Management + * + * Copyright (C) 2007 by Latchesar Ionkov + * Copyright (C) 2005, 2006 by Eric Van Hensbergen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * v9fs_fid_add - add a fid to a dentry + * @dentry: dentry that the fid is being added to + * @fid: fid to add + * + */ + +static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) +{ + hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); +} + +void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) +{ + spin_lock(&dentry->d_lock); + __add_fid(dentry, fid); + spin_unlock(&dentry->d_lock); +} + +/** + * v9fs_fid_find - retrieve a fid that belongs to the specified uid + * @dentry: dentry to look for fid in + * @uid: return fid that belongs to the specified user + * @any: if non-zero, return any fid associated with the dentry + * + */ + +static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) +{ + struct p9_fid *fid, *ret; + + p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p) uid %d any %d\n", + dentry, dentry, from_kuid(&init_user_ns, uid), + any); + ret = NULL; + /* we'll recheck under lock if there's anything to look in */ + if (dentry->d_fsdata) { + struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; + spin_lock(&dentry->d_lock); + hlist_for_each_entry(fid, h, dlist) { + if (any || uid_eq(fid->uid, uid)) { + ret = fid; + break; + } + } + spin_unlock(&dentry->d_lock); + } + + return ret; +} + +/* + * We need to hold v9ses->rename_sem as long as we hold references + * to returned path array. Array element contain pointers to + * dentry names. + */ +static int build_path_from_dentry(struct v9fs_session_info *v9ses, + struct dentry *dentry, char ***names) +{ + int n = 0, i; + char **wnames; + struct dentry *ds; + + for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) + n++; + + wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); + if (!wnames) + goto err_out; + + for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent) + wnames[i] = (char *)ds->d_name.name; + + *names = wnames; + return n; +err_out: + return -ENOMEM; +} + +static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, + kuid_t uid, int any) +{ + struct dentry *ds; + char **wnames, *uname; + int i, n, l, clone, access; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *old_fid = NULL; + + v9ses = v9fs_dentry2v9ses(dentry); + access = v9ses->flags & V9FS_ACCESS_MASK; + fid = v9fs_fid_find(dentry, uid, any); + if (fid) + return fid; + /* + * we don't have a matching fid. To do a TWALK we need + * parent fid. We need to prevent rename when we want to + * look at the parent. + */ + down_read(&v9ses->rename_sem); + ds = dentry->d_parent; + fid = v9fs_fid_find(ds, uid, any); + if (fid) { + /* Found the parent fid do a lookup with that */ + fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1); + goto fid_out; + } + up_read(&v9ses->rename_sem); + + /* start from the root and try to do a lookup */ + fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any); + if (!fid) { + /* the user is not attached to the fs yet */ + if (access == V9FS_ACCESS_SINGLE) + return ERR_PTR(-EPERM); + + if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) + uname = NULL; + else + uname = v9ses->uname; + + fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, + v9ses->aname); + if (IS_ERR(fid)) + return fid; + + v9fs_fid_add(dentry->d_sb->s_root, fid); + } + /* If we are root ourself just return that */ + if (dentry->d_sb->s_root == dentry) + return fid; + /* + * Do a multipath walk with attached root. + * When walking parent we need to make sure we + * don't have a parallel rename happening + */ + down_read(&v9ses->rename_sem); + n = build_path_from_dentry(v9ses, dentry, &wnames); + if (n < 0) { + fid = ERR_PTR(n); + goto err_out; + } + clone = 1; + i = 0; + while (i < n) { + l = min(n - i, P9_MAXWELEM); + /* + * We need to hold rename lock when doing a multipath + * walk to ensure none of the patch component change + */ + fid = p9_client_walk(fid, l, &wnames[i], clone); + if (IS_ERR(fid)) { + if (old_fid) { + /* + * If we fail, clunk fid which are mapping + * to path component and not the last component + * of the path. + */ + p9_client_clunk(old_fid); + } + kfree(wnames); + goto err_out; + } + old_fid = fid; + i += l; + clone = 0; + } + kfree(wnames); +fid_out: + if (!IS_ERR(fid)) { + spin_lock(&dentry->d_lock); + if (d_unhashed(dentry)) { + spin_unlock(&dentry->d_lock); + p9_client_clunk(fid); + fid = ERR_PTR(-ENOENT); + } else { + __add_fid(dentry, fid); + spin_unlock(&dentry->d_lock); + } + } +err_out: + up_read(&v9ses->rename_sem); + return fid; +} + +/** + * v9fs_fid_lookup - lookup for a fid, try to walk if not found + * @dentry: dentry to look for fid in + * + * Look for a fid in the specified dentry for the current user. + * If no fid is found, try to create one walking from a fid from the parent + * dentry (if it has one), or the root dentry. If the user haven't accessed + * the fs yet, attach now and walk from the root. + */ + +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) +{ + kuid_t uid; + int any, access; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_dentry2v9ses(dentry); + access = v9ses->flags & V9FS_ACCESS_MASK; + switch (access) { + case V9FS_ACCESS_SINGLE: + case V9FS_ACCESS_USER: + case V9FS_ACCESS_CLIENT: + uid = current_fsuid(); + any = 0; + break; + + case V9FS_ACCESS_ANY: + uid = v9ses->uid; + any = 1; + break; + + default: + uid = INVALID_UID; + any = 0; + break; + } + return v9fs_fid_lookup_with_uid(dentry, uid, any); +} + +struct p9_fid *v9fs_fid_clone(struct dentry *dentry) +{ + struct p9_fid *fid, *ret; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return fid; + + ret = p9_client_walk(fid, 0, NULL, 1); + return ret; +} + +static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid) +{ + struct p9_fid *fid, *ret; + + fid = v9fs_fid_lookup_with_uid(dentry, uid, 0); + if (IS_ERR(fid)) + return fid; + + ret = p9_client_walk(fid, 0, NULL, 1); + return ret; +} + +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) +{ + int err; + struct p9_fid *fid; + + fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID); + if (IS_ERR(fid)) + goto error_out; + /* + * writeback fid will only be used to write back the + * dirty pages. We always request for the open fid in read-write + * mode so that a partial page write which result in page + * read can work. + */ + err = p9_client_open(fid, O_RDWR); + if (err < 0) { + p9_client_clunk(fid); + fid = ERR_PTR(err); + goto error_out; + } +error_out: + return fid; +} diff --git a/kernel/fs/9p/fid.h b/kernel/fs/9p/fid.h new file mode 100644 index 000000000..2b6787fcb --- /dev/null +++ b/kernel/fs/9p/fid.h @@ -0,0 +1,30 @@ +/* + * V9FS FID Management + * + * Copyright (C) 2005 by Eric Van Hensbergen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_FID_H +#define FS_9P_FID_H +#include + +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); +struct p9_fid *v9fs_fid_clone(struct dentry *dentry); +void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); +#endif diff --git a/kernel/fs/9p/v9fs.c b/kernel/fs/9p/v9fs.c new file mode 100644 index 000000000..620d93489 --- /dev/null +++ b/kernel/fs/9p/v9fs.c @@ -0,0 +1,685 @@ +/* + * linux/fs/9p/v9fs.c + * + * This file contains functions assisting in mapping VFS to 9P2000 + * + * Copyright (C) 2004-2008 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "cache.h" + +static DEFINE_SPINLOCK(v9fs_sessionlist_lock); +static LIST_HEAD(v9fs_sessionlist); +struct kmem_cache *v9fs_inode_cache; + +/* + * Option Parsing (code inspired by NFS code) + * NOTE: each transport will parse its own options + */ + +enum { + /* Options that take integer arguments */ + Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, + /* String options */ + Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag, + /* Options that take no arguments */ + Opt_nodevmap, + /* Cache options */ + Opt_cache_loose, Opt_fscache, Opt_mmap, + /* Access options */ + Opt_access, Opt_posixacl, + /* Error token */ + Opt_err +}; + +static const match_table_t tokens = { + {Opt_debug, "debug=%x"}, + {Opt_dfltuid, "dfltuid=%u"}, + {Opt_dfltgid, "dfltgid=%u"}, + {Opt_afid, "afid=%u"}, + {Opt_uname, "uname=%s"}, + {Opt_remotename, "aname=%s"}, + {Opt_nodevmap, "nodevmap"}, + {Opt_cache, "cache=%s"}, + {Opt_cache_loose, "loose"}, + {Opt_fscache, "fscache"}, + {Opt_mmap, "mmap"}, + {Opt_cachetag, "cachetag=%s"}, + {Opt_access, "access=%s"}, + {Opt_posixacl, "posixacl"}, + {Opt_err, NULL} +}; + +/* Interpret mount options for cache mode */ +static int get_cache_mode(char *s) +{ + int version = -EINVAL; + + if (!strcmp(s, "loose")) { + version = CACHE_LOOSE; + p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); + } else if (!strcmp(s, "fscache")) { + version = CACHE_FSCACHE; + p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "mmap")) { + version = CACHE_MMAP; + p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); + } else if (!strcmp(s, "none")) { + version = CACHE_NONE; + p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); + } else + pr_info("Unknown Cache mode %s\n", s); + return version; +} + +/** + * v9fs_parse_options - parse mount options into session structure + * @v9ses: existing v9fs session information + * + * Return 0 upon success, -ERRNO upon failure. + */ + +static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) +{ + char *options, *tmp_options; + substring_t args[MAX_OPT_ARGS]; + char *p; + int option = 0; + char *s, *e; + int ret = 0; + + /* setup defaults */ + v9ses->afid = ~0; + v9ses->debug = 0; + v9ses->cache = CACHE_NONE; +#ifdef CONFIG_9P_FSCACHE + v9ses->cachetag = NULL; +#endif + + if (!opts) + return 0; + + tmp_options = kstrdup(opts, GFP_KERNEL); + if (!tmp_options) { + ret = -ENOMEM; + goto fail_option_alloc; + } + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token, r; + if (!*p) + continue; + token = match_token(p, tokens, args); + switch (token) { + case Opt_debug: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + v9ses->debug = option; +#ifdef CONFIG_NET_9P_DEBUG + p9_debug_level = option; +#endif + break; + + case Opt_dfltuid: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + v9ses->dfltuid = make_kuid(current_user_ns(), option); + if (!uid_valid(v9ses->dfltuid)) { + p9_debug(P9_DEBUG_ERROR, + "uid field, but not a uid?\n"); + ret = -EINVAL; + continue; + } + break; + case Opt_dfltgid: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + v9ses->dfltgid = make_kgid(current_user_ns(), option); + if (!gid_valid(v9ses->dfltgid)) { + p9_debug(P9_DEBUG_ERROR, + "gid field, but not a gid?\n"); + ret = -EINVAL; + continue; + } + break; + case Opt_afid: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + v9ses->afid = option; + break; + case Opt_uname: + kfree(v9ses->uname); + v9ses->uname = match_strdup(&args[0]); + if (!v9ses->uname) { + ret = -ENOMEM; + goto free_and_return; + } + break; + case Opt_remotename: + kfree(v9ses->aname); + v9ses->aname = match_strdup(&args[0]); + if (!v9ses->aname) { + ret = -ENOMEM; + goto free_and_return; + } + break; + case Opt_nodevmap: + v9ses->nodev = 1; + break; + case Opt_cache_loose: + v9ses->cache = CACHE_LOOSE; + break; + case Opt_fscache: + v9ses->cache = CACHE_FSCACHE; + break; + case Opt_mmap: + v9ses->cache = CACHE_MMAP; + break; + case Opt_cachetag: +#ifdef CONFIG_9P_FSCACHE + v9ses->cachetag = match_strdup(&args[0]); +#endif + break; + case Opt_cache: + s = match_strdup(&args[0]); + if (!s) { + ret = -ENOMEM; + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of cache arg\n"); + goto free_and_return; + } + ret = get_cache_mode(s); + if (ret == -EINVAL) { + kfree(s); + goto free_and_return; + } + + v9ses->cache = ret; + kfree(s); + break; + + case Opt_access: + s = match_strdup(&args[0]); + if (!s) { + ret = -ENOMEM; + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of access arg\n"); + goto free_and_return; + } + + v9ses->flags &= ~V9FS_ACCESS_MASK; + if (strcmp(s, "user") == 0) + v9ses->flags |= V9FS_ACCESS_USER; + else if (strcmp(s, "any") == 0) + v9ses->flags |= V9FS_ACCESS_ANY; + else if (strcmp(s, "client") == 0) { + v9ses->flags |= V9FS_ACCESS_CLIENT; + } else { + uid_t uid; + v9ses->flags |= V9FS_ACCESS_SINGLE; + uid = simple_strtoul(s, &e, 10); + if (*e != '\0') { + ret = -EINVAL; + pr_info("Unknown access argument %s\n", + s); + kfree(s); + goto free_and_return; + } + v9ses->uid = make_kuid(current_user_ns(), uid); + if (!uid_valid(v9ses->uid)) { + ret = -EINVAL; + pr_info("Uknown uid %s\n", s); + kfree(s); + goto free_and_return; + } + } + + kfree(s); + break; + + case Opt_posixacl: +#ifdef CONFIG_9P_FS_POSIX_ACL + v9ses->flags |= V9FS_POSIX_ACL; +#else + p9_debug(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n"); +#endif + break; + + default: + continue; + } + } + +free_and_return: + kfree(tmp_options); +fail_option_alloc: + return ret; +} + +/** + * v9fs_session_init - initialize session + * @v9ses: session information structure + * @dev_name: device being mounted + * @data: options + * + */ + +struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, + const char *dev_name, char *data) +{ + int retval = -EINVAL; + struct p9_fid *fid; + int rc; + + v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); + if (!v9ses->uname) + return ERR_PTR(-ENOMEM); + + v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); + if (!v9ses->aname) { + kfree(v9ses->uname); + return ERR_PTR(-ENOMEM); + } + init_rwsem(&v9ses->rename_sem); + + rc = bdi_setup_and_register(&v9ses->bdi, "9p"); + if (rc) { + kfree(v9ses->aname); + kfree(v9ses->uname); + return ERR_PTR(rc); + } + + spin_lock(&v9fs_sessionlist_lock); + list_add(&v9ses->slist, &v9fs_sessionlist); + spin_unlock(&v9fs_sessionlist_lock); + + v9ses->uid = INVALID_UID; + v9ses->dfltuid = V9FS_DEFUID; + v9ses->dfltgid = V9FS_DEFGID; + + v9ses->clnt = p9_client_create(dev_name, data); + if (IS_ERR(v9ses->clnt)) { + retval = PTR_ERR(v9ses->clnt); + v9ses->clnt = NULL; + p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); + goto error; + } + + v9ses->flags = V9FS_ACCESS_USER; + + if (p9_is_proto_dotl(v9ses->clnt)) { + v9ses->flags = V9FS_ACCESS_CLIENT; + v9ses->flags |= V9FS_PROTO_2000L; + } else if (p9_is_proto_dotu(v9ses->clnt)) { + v9ses->flags |= V9FS_PROTO_2000U; + } + + rc = v9fs_parse_options(v9ses, data); + if (rc < 0) { + retval = rc; + goto error; + } + + v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; + + if (!v9fs_proto_dotl(v9ses) && + ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACCESS_CLIENT only for dotl. + * Fall back to ACCESS_USER + */ + v9ses->flags &= ~V9FS_ACCESS_MASK; + v9ses->flags |= V9FS_ACCESS_USER; + } + /*FIXME !! */ + /* for legacy mode, fall back to V9FS_ACCESS_ANY */ + if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && + ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { + + v9ses->flags &= ~V9FS_ACCESS_MASK; + v9ses->flags |= V9FS_ACCESS_ANY; + v9ses->uid = INVALID_UID; + } + if (!v9fs_proto_dotl(v9ses) || + !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACL checks on clinet only if the protocol is + * 9P2000.L and access is V9FS_ACCESS_CLIENT. + */ + v9ses->flags &= ~V9FS_ACL_MASK; + } + + fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, + v9ses->aname); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); + goto error; + } + + if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) + fid->uid = v9ses->uid; + else + fid->uid = INVALID_UID; + +#ifdef CONFIG_9P_FSCACHE + /* register the session for caching */ + v9fs_cache_session_get_cookie(v9ses); +#endif + + return fid; + +error: + bdi_destroy(&v9ses->bdi); + return ERR_PTR(retval); +} + +/** + * v9fs_session_close - shutdown a session + * @v9ses: session information structure + * + */ + +void v9fs_session_close(struct v9fs_session_info *v9ses) +{ + if (v9ses->clnt) { + p9_client_destroy(v9ses->clnt); + v9ses->clnt = NULL; + } + +#ifdef CONFIG_9P_FSCACHE + if (v9ses->fscache) { + v9fs_cache_session_put_cookie(v9ses); + kfree(v9ses->cachetag); + } +#endif + kfree(v9ses->uname); + kfree(v9ses->aname); + + bdi_destroy(&v9ses->bdi); + + spin_lock(&v9fs_sessionlist_lock); + list_del(&v9ses->slist); + spin_unlock(&v9fs_sessionlist_lock); +} + +/** + * v9fs_session_cancel - terminate a session + * @v9ses: session to terminate + * + * mark transport as disconnected and cancel all pending requests. + */ + +void v9fs_session_cancel(struct v9fs_session_info *v9ses) { + p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); + p9_client_disconnect(v9ses->clnt); +} + +/** + * v9fs_session_begin_cancel - Begin terminate of a session + * @v9ses: session to terminate + * + * After this call we don't allow any request other than clunk. + */ + +void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) +{ + p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); + p9_client_begin_disconnect(v9ses->clnt); +} + +extern int v9fs_error_init(void); + +static struct kobject *v9fs_kobj; + +#ifdef CONFIG_9P_FSCACHE +/** + * caches_show - list caches associated with a session + * + * Returns the size of buffer written. + */ + +static ssize_t caches_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t n = 0, count = 0, limit = PAGE_SIZE; + struct v9fs_session_info *v9ses; + + spin_lock(&v9fs_sessionlist_lock); + list_for_each_entry(v9ses, &v9fs_sessionlist, slist) { + if (v9ses->cachetag) { + n = snprintf(buf, limit, "%s\n", v9ses->cachetag); + if (n < 0) { + count = n; + break; + } + + count += n; + limit -= n; + } + } + + spin_unlock(&v9fs_sessionlist_lock); + return count; +} + +static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches); +#endif /* CONFIG_9P_FSCACHE */ + +static struct attribute *v9fs_attrs[] = { +#ifdef CONFIG_9P_FSCACHE + &v9fs_attr_cache.attr, +#endif + NULL, +}; + +static struct attribute_group v9fs_attr_group = { + .attrs = v9fs_attrs, +}; + +/** + * v9fs_sysfs_init - Initialize the v9fs sysfs interface + * + */ + +static int __init v9fs_sysfs_init(void) +{ + v9fs_kobj = kobject_create_and_add("9p", fs_kobj); + if (!v9fs_kobj) + return -ENOMEM; + + if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) { + kobject_put(v9fs_kobj); + return -ENOMEM; + } + + return 0; +} + +/** + * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface + * + */ + +static void v9fs_sysfs_cleanup(void) +{ + sysfs_remove_group(v9fs_kobj, &v9fs_attr_group); + kobject_put(v9fs_kobj); +} + +static void v9fs_inode_init_once(void *foo) +{ + struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; +#endif + memset(&v9inode->qid, 0, sizeof(v9inode->qid)); + inode_init_once(&v9inode->vfs_inode); +} + +/** + * v9fs_init_inode_cache - initialize a cache for 9P + * Returns 0 on success. + */ +static int v9fs_init_inode_cache(void) +{ + v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", + sizeof(struct v9fs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + v9fs_inode_init_once); + if (!v9fs_inode_cache) + return -ENOMEM; + + return 0; +} + +/** + * v9fs_destroy_inode_cache - destroy the cache of 9P inode + * + */ +static void v9fs_destroy_inode_cache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(v9fs_inode_cache); +} + +static int v9fs_cache_register(void) +{ + int ret; + ret = v9fs_init_inode_cache(); + if (ret < 0) + return ret; +#ifdef CONFIG_9P_FSCACHE + ret = fscache_register_netfs(&v9fs_cache_netfs); + if (ret < 0) + v9fs_destroy_inode_cache(); +#endif + return ret; +} + +static void v9fs_cache_unregister(void) +{ + v9fs_destroy_inode_cache(); +#ifdef CONFIG_9P_FSCACHE + fscache_unregister_netfs(&v9fs_cache_netfs); +#endif +} + +/** + * init_v9fs - Initialize module + * + */ + +static int __init init_v9fs(void) +{ + int err; + pr_info("Installing v9fs 9p2000 file system support\n"); + /* TODO: Setup list of registered trasnport modules */ + + err = v9fs_cache_register(); + if (err < 0) { + pr_err("Failed to register v9fs for caching\n"); + return err; + } + + err = v9fs_sysfs_init(); + if (err < 0) { + pr_err("Failed to register with sysfs\n"); + goto out_cache; + } + err = register_filesystem(&v9fs_fs_type); + if (err < 0) { + pr_err("Failed to register filesystem\n"); + goto out_sysfs_cleanup; + } + + return 0; + +out_sysfs_cleanup: + v9fs_sysfs_cleanup(); + +out_cache: + v9fs_cache_unregister(); + + return err; +} + +/** + * exit_v9fs - shutdown module + * + */ + +static void __exit exit_v9fs(void) +{ + v9fs_sysfs_cleanup(); + v9fs_cache_unregister(); + unregister_filesystem(&v9fs_fs_type); +} + +module_init(init_v9fs) +module_exit(exit_v9fs) + +MODULE_AUTHOR("Latchesar Ionkov "); +MODULE_AUTHOR("Eric Van Hensbergen "); +MODULE_AUTHOR("Ron Minnich "); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/9p/v9fs.h b/kernel/fs/9p/v9fs.h new file mode 100644 index 000000000..fb9ffcb43 --- /dev/null +++ b/kernel/fs/9p/v9fs.h @@ -0,0 +1,227 @@ +/* + * V9FS definitions. + * + * Copyright (C) 2004-2008 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_V9FS_H +#define FS_9P_V9FS_H + +#include + +/** + * enum p9_session_flags - option flags for each 9P session + * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions + * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy + * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) + * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. + * @V9FS_ACCESS_ANY: use a single attach for all users + * @V9FS_ACCESS_MASK: bit mask of different ACCESS options + * @V9FS_POSIX_ACL: POSIX ACLs are enforced + * + * Session flags reflect options selected by users at mount time + */ +#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ + V9FS_ACCESS_USER | \ + V9FS_ACCESS_CLIENT) +#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY +#define V9FS_ACL_MASK V9FS_POSIX_ACL + +enum p9_session_flags { + V9FS_PROTO_2000U = 0x01, + V9FS_PROTO_2000L = 0x02, + V9FS_ACCESS_SINGLE = 0x04, + V9FS_ACCESS_USER = 0x08, + V9FS_ACCESS_CLIENT = 0x10, + V9FS_POSIX_ACL = 0x20 +}; + +/* possible values of ->cache */ +/** + * enum p9_cache_modes - user specified cache preferences + * @CACHE_NONE: do not cache data, dentries, or directory contents (default) + * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency + * + * eventually support loose, tight, time, session, default always none + */ + +enum p9_cache_modes { + CACHE_NONE, + CACHE_MMAP, + CACHE_LOOSE, + CACHE_FSCACHE, +}; + +/** + * struct v9fs_session_info - per-instance session information + * @flags: session options of type &p9_session_flags + * @nodev: set to 1 to disable device mapping + * @debug: debug level + * @afid: authentication handle + * @cache: cache mode of type &p9_cache_modes + * @cachetag: the tag of the cache associated with this session + * @fscache: session cookie associated with FS-Cache + * @uname: string user name to mount hierarchy as + * @aname: mount specifier for remote hierarchy + * @maxdata: maximum data to be sent/recvd per protocol message + * @dfltuid: default numeric userid to mount hierarchy as + * @dfltgid: default numeric groupid to mount hierarchy as + * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy + * @clnt: reference to 9P network client instantiated for this session + * @slist: reference to list of registered 9p sessions + * + * This structure holds state for each session instance established during + * a sys_mount() . + * + * Bugs: there seems to be a lot of state which could be condensed and/or + * removed. + */ + +struct v9fs_session_info { + /* options */ + unsigned char flags; + unsigned char nodev; + unsigned short debug; + unsigned int afid; + unsigned int cache; +#ifdef CONFIG_9P_FSCACHE + char *cachetag; + struct fscache_cookie *fscache; +#endif + + char *uname; /* user name to mount as */ + char *aname; /* name of remote hierarchy being mounted */ + unsigned int maxdata; /* max data for client interface */ + kuid_t dfltuid; /* default uid/muid for legacy support */ + kgid_t dfltgid; /* default gid for legacy support */ + kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ + struct p9_client *clnt; /* 9p client */ + struct list_head slist; /* list of sessions registered with v9fs */ + struct backing_dev_info bdi; + struct rw_semaphore rename_sem; +}; + +/* cache_validity flags */ +#define V9FS_INO_INVALID_ATTR 0x01 + +struct v9fs_inode { +#ifdef CONFIG_9P_FSCACHE + spinlock_t fscache_lock; + struct fscache_cookie *fscache; +#endif + struct p9_qid qid; + unsigned int cache_validity; + struct p9_fid *writeback_fid; + struct mutex v_mutex; + struct inode vfs_inode; +}; + +static inline struct v9fs_inode *V9FS_I(const struct inode *inode) +{ + return container_of(inode, struct v9fs_inode, vfs_inode); +} + +struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, + char *); +extern void v9fs_session_close(struct v9fs_session_info *v9ses); +extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); +extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); +extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); +extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *p); +extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); +extern const struct inode_operations v9fs_dir_inode_operations_dotl; +extern const struct inode_operations v9fs_file_inode_operations_dotl; +extern const struct inode_operations v9fs_symlink_inode_operations_dotl; +extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); + +/* other default globals */ +#define V9FS_PORT 564 +#define V9FS_DEFUSER "nobody" +#define V9FS_DEFANAME "" +#define V9FS_DEFUID KUIDT_INIT(-2) +#define V9FS_DEFGID KGIDT_INIT(-2) + +static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) +{ + return (inode->i_sb->s_fs_info); +} + +static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) +{ + return dentry->d_sb->s_fs_info; +} + +static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2000U; +} + +static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2000L; +} + +/** + * v9fs_get_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 0); +} + +/** + * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 1); +} + +#endif diff --git a/kernel/fs/9p/v9fs_vfs.h b/kernel/fs/9p/v9fs_vfs.h new file mode 100644 index 000000000..5a0db6dec --- /dev/null +++ b/kernel/fs/9p/v9fs_vfs.h @@ -0,0 +1,86 @@ +/* + * V9FS VFS extensions. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_V9FS_VFS_H +#define FS_9P_V9FS_VFS_H + +/* plan9 semantics are that created files are implicitly opened. + * But linux semantics are that you call create, then open. + * the plan9 approach is superior as it provides an atomic + * open. + * we track the create fid here. When the file is opened, if fidopen is + * non-zero, we use the fid and can skip some steps. + * there may be a better way to do this, but I don't know it. + * one BAD way is to clunk the fid on create, then open it again: + * you lose the atomicity of file open + */ + +/* special case: + * unlink calls remove, which is an implicit clunk. So we have to track + * that kind of thing so that we don't try to clunk a dead fid. + */ +#define P9_LOCK_TIMEOUT (30*HZ) + +extern struct file_system_type v9fs_fs_type; +extern const struct address_space_operations v9fs_addr_operations; +extern const struct file_operations v9fs_file_operations; +extern const struct file_operations v9fs_file_operations_dotl; +extern const struct file_operations v9fs_dir_operations; +extern const struct file_operations v9fs_dir_operations_dotl; +extern const struct dentry_operations v9fs_dentry_operations; +extern const struct dentry_operations v9fs_cached_dentry_operations; +extern const struct file_operations v9fs_cached_file_operations; +extern const struct file_operations v9fs_cached_file_operations_dotl; +extern const struct file_operations v9fs_mmap_file_operations; +extern const struct file_operations v9fs_mmap_file_operations_dotl; +extern struct kmem_cache *v9fs_inode_cache; + +struct inode *v9fs_alloc_inode(struct super_block *sb); +void v9fs_destroy_inode(struct inode *inode); +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t); +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, umode_t mode, dev_t); +void v9fs_evict_inode(struct inode *inode); +ino_t v9fs_qid2ino(struct p9_qid *qid); +void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +int v9fs_dir_release(struct inode *inode, struct file *filp); +int v9fs_file_open(struct inode *inode, struct file *file); +void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); +int v9fs_uflags2omode(int uflags, int extended); + +void v9fs_blank_wstat(struct p9_wstat *wstat); +int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); +int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, + int datasync); +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode); +static inline void v9fs_invalidate_inode_attr(struct inode *inode) +{ + struct v9fs_inode *v9inode; + v9inode = V9FS_I(inode); + v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; + return; +} + +int v9fs_open_to_dotl_flags(int flags); +#endif diff --git a/kernel/fs/9p/vfs_addr.c b/kernel/fs/9p/vfs_addr.c new file mode 100644 index 000000000..e9e04376c --- /dev/null +++ b/kernel/fs/9p/vfs_addr.c @@ -0,0 +1,351 @@ +/* + * linux/fs/9p/vfs_addr.c + * + * This file contians vfs address (mmap) ops for 9P2000. + * + * Copyright (C) 2005 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "cache.h" +#include "fid.h" + +/** + * v9fs_fid_readpage - read an entire page in from 9P + * + * @fid: fid being read + * @page: structure to page + * + */ +static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE}; + struct iov_iter to; + int retval, err; + + p9_debug(P9_DEBUG_VFS, "\n"); + + BUG_ON(!PageLocked(page)); + + retval = v9fs_readpage_from_fscache(inode, page); + if (retval == 0) + return retval; + + iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE); + + retval = p9_client_read(fid, page_offset(page), &to, &err); + if (err) { + v9fs_uncache_page(inode, page); + retval = err; + goto done; + } + + zero_user(page, retval, PAGE_SIZE - retval); + flush_dcache_page(page); + SetPageUptodate(page); + + v9fs_readpage_to_fscache(inode, page); + retval = 0; + +done: + unlock_page(page); + return retval; +} + +/** + * v9fs_vfs_readpage - read an entire page in from 9P + * + * @filp: file being read + * @page: structure to page + * + */ + +static int v9fs_vfs_readpage(struct file *filp, struct page *page) +{ + return v9fs_fid_readpage(filp->private_data, page); +} + +/** + * v9fs_vfs_readpages - read a set of pages from 9P + * + * @filp: file being read + * @mapping: the address space + * @pages: list of pages to read + * @nr_pages: count of pages to read + * + */ + +static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret = 0; + struct inode *inode; + + inode = mapping->host; + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); + + ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); + if (ret == 0) + return ret; + + ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); + p9_debug(P9_DEBUG_VFS, " = %d\n", ret); + return ret; +} + +/** + * v9fs_release_page - release the private state associated with a page + * + * Returns 1 if the page can be released, false otherwise. + */ + +static int v9fs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + return v9fs_fscache_release_page(page, gfp); +} + +/** + * v9fs_invalidate_page - Invalidate a page completely or partially + * + * @page: structure to page + * @offset: offset in the page + */ + +static void v9fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) +{ + /* + * If called with zero offset, we should release + * the private state assocated with the page + */ + if (offset == 0 && length == PAGE_CACHE_SIZE) + v9fs_fscache_invalidate_page(page); +} + +static int v9fs_vfs_writepage_locked(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); + loff_t size = i_size_read(inode); + struct iov_iter from; + struct bio_vec bvec; + int err, len; + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + bvec.bv_page = page; + bvec.bv_offset = 0; + bvec.bv_len = len; + iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len); + + /* We should have writeback_fid always set */ + BUG_ON(!v9inode->writeback_fid); + + set_page_writeback(page); + + p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err); + + end_page_writeback(page); + return err; +} + +static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int retval; + + p9_debug(P9_DEBUG_VFS, "page %p\n", page); + + retval = v9fs_vfs_writepage_locked(page); + if (retval < 0) { + if (retval == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + retval = 0; + } else { + SetPageError(page); + mapping_set_error(page->mapping, retval); + } + } else + retval = 0; + + unlock_page(page); + return retval; +} + +/** + * v9fs_launder_page - Writeback a dirty page + * Returns 0 on success. + */ + +static int v9fs_launder_page(struct page *page) +{ + int retval; + struct inode *inode = page->mapping->host; + + v9fs_fscache_wait_on_page_write(inode, page); + if (clear_page_dirty_for_io(page)) { + retval = v9fs_vfs_writepage_locked(page); + if (retval) + return retval; + } + return 0; +} + +/** + * v9fs_direct_IO - 9P address space operation for direct I/O + * @iocb: target I/O control block + * @pos: offset in file to begin the operation + * + * The presence of v9fs_direct_IO() in the address space ops vector + * allowes open() O_DIRECT flags which would have failed otherwise. + * + * In the non-cached mode, we shunt off direct read and write requests before + * the VFS gets them, so this method should never be called. + * + * Direct IO is not 'yet' supported in the cached mode. Hence when + * this routine is called through generic_file_aio_read(), the read/write fails + * with an error. + * + */ +static ssize_t +v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t n; + int err = 0; + if (iov_iter_rw(iter) == WRITE) { + n = p9_client_write(file->private_data, pos, iter, &err); + if (n) { + struct inode *inode = file_inode(file); + loff_t i_size = i_size_read(inode); + if (pos + n > i_size) + inode_add_bytes(inode, pos + n - i_size); + } + } else { + n = p9_client_read(file->private_data, pos, iter, &err); + } + return n ? n : err; +} + +static int v9fs_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int retval = 0; + struct page *page; + struct v9fs_inode *v9inode; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = mapping->host; + + + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + + v9inode = V9FS_I(inode); +start: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + retval = -ENOMEM; + goto out; + } + BUG_ON(!v9inode->writeback_fid); + if (PageUptodate(page)) + goto out; + + if (len == PAGE_CACHE_SIZE) + goto out; + + retval = v9fs_fid_readpage(v9inode->writeback_fid, page); + page_cache_release(page); + if (!retval) + goto start; +out: + *pagep = page; + return retval; +} + +static int v9fs_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + loff_t last_pos = pos + copied; + struct inode *inode = page->mapping->host; + + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + + if (unlikely(copied < len)) { + /* + * zero out the rest of the area + */ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + flush_dcache_page(page); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) { + inode_add_bytes(inode, last_pos - inode->i_size); + i_size_write(inode, last_pos); + } + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + + +const struct address_space_operations v9fs_addr_operations = { + .readpage = v9fs_vfs_readpage, + .readpages = v9fs_vfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = v9fs_vfs_writepage, + .write_begin = v9fs_write_begin, + .write_end = v9fs_write_end, + .releasepage = v9fs_release_page, + .invalidatepage = v9fs_invalidate_page, + .launder_page = v9fs_launder_page, + .direct_IO = v9fs_direct_IO, +}; diff --git a/kernel/fs/9p/vfs_dentry.c b/kernel/fs/9p/vfs_dentry.c new file mode 100644 index 000000000..bd456c668 --- /dev/null +++ b/kernel/fs/9p/vfs_dentry.c @@ -0,0 +1,122 @@ +/* + * linux/fs/9p/vfs_dentry.c + * + * This file contians vfs dentry ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * v9fs_cached_dentry_delete - called when dentry refcount equals 0 + * @dentry: dentry in question + * + */ +static int v9fs_cached_dentry_delete(const struct dentry *dentry) +{ + p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", + dentry, dentry); + + /* Don't cache negative dentries */ + if (d_really_is_negative(dentry)) + return 1; + return 0; +} + +/** + * v9fs_dentry_release - called when dentry is going to be freed + * @dentry: dentry that is being release + * + */ + +static void v9fs_dentry_release(struct dentry *dentry) +{ + struct hlist_node *p, *n; + p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", + dentry, dentry); + hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) + p9_client_clunk(hlist_entry(p, struct p9_fid, dlist)); + dentry->d_fsdata = NULL; +} + +static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct p9_fid *fid; + struct inode *inode; + struct v9fs_inode *v9inode; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = d_inode(dentry); + if (!inode) + goto out_valid; + + v9inode = V9FS_I(inode); + if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { + int retval; + struct v9fs_session_info *v9ses; + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9ses = v9fs_inode2v9ses(inode); + if (v9fs_proto_dotl(v9ses)) + retval = v9fs_refresh_inode_dotl(fid, inode); + else + retval = v9fs_refresh_inode(fid, inode); + if (retval == -ENOENT) + return 0; + if (retval < 0) + return retval; + } +out_valid: + return 1; +} + +const struct dentry_operations v9fs_cached_dentry_operations = { + .d_revalidate = v9fs_lookup_revalidate, + .d_weak_revalidate = v9fs_lookup_revalidate, + .d_delete = v9fs_cached_dentry_delete, + .d_release = v9fs_dentry_release, +}; + +const struct dentry_operations v9fs_dentry_operations = { + .d_delete = always_delete_dentry, + .d_release = v9fs_dentry_release, +}; diff --git a/kernel/fs/9p/vfs_dir.c b/kernel/fs/9p/vfs_dir.c new file mode 100644 index 000000000..5cc00e562 --- /dev/null +++ b/kernel/fs/9p/vfs_dir.c @@ -0,0 +1,261 @@ +/* + * linux/fs/9p/vfs_dir.c + * + * This file contains vfs directory ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * struct p9_rdir - readdir accounting + * @head: start offset of current dirread buffer + * @tail: end offset of current dirread buffer + * @buf: dirread buffer + * + * private structure for keeping track of readdir + * allocated on demand + */ + +struct p9_rdir { + int head; + int tail; + uint8_t buf[]; +}; + +/** + * dt_type - return file type + * @mistat: mistat structure + * + */ + +static inline int dt_type(struct p9_wstat *mistat) +{ + unsigned long perm = mistat->mode; + int rettype = DT_REG; + + if (perm & P9_DMDIR) + rettype = DT_DIR; + if (perm & P9_DMSYMLINK) + rettype = DT_LNK; + + return rettype; +} + +static void p9stat_init(struct p9_wstat *stbuf) +{ + stbuf->name = NULL; + stbuf->uid = NULL; + stbuf->gid = NULL; + stbuf->muid = NULL; + stbuf->extension = NULL; +} + +/** + * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir + * @filp: opened file structure + * @buflen: Length in bytes of buffer to allocate + * + */ + +static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) +{ + struct p9_fid *fid = filp->private_data; + if (!fid->rdir) + fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + return fid->rdir; +} + +/** + * v9fs_dir_readdir - iterate through a directory + * @file: opened file structure + * @ctx: actor we feed the entries to + * + */ + +static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) +{ + bool over; + struct p9_wstat st; + int err = 0; + struct p9_fid *fid; + int buflen; + int reclen = 0; + struct p9_rdir *rdir; + struct kvec kvec; + + p9_debug(P9_DEBUG_VFS, "name %pD\n", file); + fid = file->private_data; + + buflen = fid->clnt->msize - P9_IOHDRSZ; + + rdir = v9fs_alloc_rdir_buf(file, buflen); + if (!rdir) + return -ENOMEM; + kvec.iov_base = rdir->buf; + kvec.iov_len = buflen; + + while (1) { + if (rdir->tail == rdir->head) { + struct iov_iter to; + int n; + iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen); + n = p9_client_read(file->private_data, ctx->pos, &to, + &err); + if (err) + return err; + if (n == 0) + return 0; + + rdir->head = 0; + rdir->tail = n; + } + while (rdir->head < rdir->tail) { + p9stat_init(&st); + err = p9stat_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, &st); + if (err) { + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); + p9stat_free(&st); + return -EIO; + } + reclen = st.size+2; + + over = !dir_emit(ctx, st.name, strlen(st.name), + v9fs_qid2ino(&st.qid), dt_type(&st)); + p9stat_free(&st); + if (over) + return 0; + + rdir->head += reclen; + ctx->pos += reclen; + } + } +} + +/** + * v9fs_dir_readdir_dotl - iterate through a directory + * @file: opened file structure + * @ctx: actor we feed the entries to + * + */ +static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) +{ + int err = 0; + struct p9_fid *fid; + int buflen; + struct p9_rdir *rdir; + struct p9_dirent curdirent; + + p9_debug(P9_DEBUG_VFS, "name %pD\n", file); + fid = file->private_data; + + buflen = fid->clnt->msize - P9_READDIRHDRSZ; + + rdir = v9fs_alloc_rdir_buf(file, buflen); + if (!rdir) + return -ENOMEM; + + while (1) { + if (rdir->tail == rdir->head) { + err = p9_client_readdir(fid, rdir->buf, buflen, + ctx->pos); + if (err <= 0) + return err; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + + err = p9dirent_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent); + if (err < 0) { + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); + return -EIO; + } + + if (!dir_emit(ctx, curdirent.d_name, + strlen(curdirent.d_name), + v9fs_qid2ino(&curdirent.qid), + curdirent.d_type)) + return 0; + + ctx->pos = curdirent.d_off; + rdir->head += err; + } + } +} + + +/** + * v9fs_dir_release - close a directory + * @inode: inode of the directory + * @filp: file pointer to a directory + * + */ + +int v9fs_dir_release(struct inode *inode, struct file *filp) +{ + struct p9_fid *fid; + + fid = filp->private_data; + p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); + if (fid) + p9_client_clunk(fid); + return 0; +} + +const struct file_operations v9fs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .iterate = v9fs_dir_readdir, + .open = v9fs_file_open, + .release = v9fs_dir_release, +}; + +const struct file_operations v9fs_dir_operations_dotl = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .iterate = v9fs_dir_readdir_dotl, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/kernel/fs/9p/vfs_file.c b/kernel/fs/9p/vfs_file.c new file mode 100644 index 000000000..1ef16bd82 --- /dev/null +++ b/kernel/fs/9p/vfs_file.c @@ -0,0 +1,705 @@ +/* + * linux/fs/9p/vfs_file.c + * + * This file contians vfs file ops for 9P2000. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" + +static const struct vm_operations_struct v9fs_file_vm_ops; +static const struct vm_operations_struct v9fs_mmap_file_vm_ops; + +/** + * v9fs_file_open - open a file (or directory) + * @inode: inode to be opened + * @file: file being opened + * + */ + +int v9fs_file_open(struct inode *inode, struct file *file) +{ + int err; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + int omode; + + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + v9inode = V9FS_I(inode); + v9ses = v9fs_inode2v9ses(inode); + if (v9fs_proto_dotl(v9ses)) + omode = v9fs_open_to_dotl_flags(file->f_flags); + else + omode = v9fs_uflags2omode(file->f_flags, + v9fs_proto_dotu(v9ses)); + fid = file->private_data; + if (!fid) { + fid = v9fs_fid_clone(file->f_path.dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + err = p9_client_open(fid, omode); + if (err < 0) { + p9_client_clunk(fid); + return err; + } + if ((file->f_flags & O_APPEND) && + (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) + generic_file_llseek(file, 0, SEEK_END); + } + + file->private_data = fid; + mutex_lock(&v9inode->v_mutex); + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && + ((file->f_flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + fid = v9fs_writeback_fid(file->f_path.dentry); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + mutex_unlock(&v9inode->v_mutex); + goto out_error; + } + v9inode->writeback_fid = (void *) fid; + } + mutex_unlock(&v9inode->v_mutex); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + v9fs_cache_inode_set_cookie(inode, file); + return 0; +out_error: + p9_client_clunk(file->private_data); + file->private_data = NULL; + return err; +} + +/** + * v9fs_file_lock - lock a file (or directory) + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + * Bugs: this looks like a local only lock, we should extend into 9P + * by using open exclusive + */ + +static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + int res = 0; + struct inode *inode = file_inode(filp); + + p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + + return res; +} + +static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct p9_flock flock; + struct p9_fid *fid; + uint8_t status = P9_LOCK_ERROR; + int res = 0; + unsigned char fl_type; + + fid = filp->private_data; + BUG_ON(fid == NULL); + + if ((fl->fl_flags & FL_POSIX) != FL_POSIX) + BUG(); + + res = posix_lock_file_wait(filp, fl); + if (res < 0) + goto out; + + /* convert posix lock to p9 tlock args */ + memset(&flock, 0, sizeof(flock)); + /* map the lock type */ + switch (fl->fl_type) { + case F_RDLCK: + flock.type = P9_LOCK_TYPE_RDLCK; + break; + case F_WRLCK: + flock.type = P9_LOCK_TYPE_WRLCK; + break; + case F_UNLCK: + flock.type = P9_LOCK_TYPE_UNLCK; + break; + } + flock.start = fl->fl_start; + if (fl->fl_end == OFFSET_MAX) + flock.length = 0; + else + flock.length = fl->fl_end - fl->fl_start + 1; + flock.proc_id = fl->fl_pid; + flock.client_id = fid->clnt->name; + if (IS_SETLKW(cmd)) + flock.flags = P9_LOCK_FLAGS_BLOCK; + + /* + * if its a blocked request and we get P9_LOCK_BLOCKED as the status + * for lock request, keep on trying + */ + for (;;) { + res = p9_client_lock_dotl(fid, &flock, &status); + if (res < 0) + goto out_unlock; + + if (status != P9_LOCK_BLOCKED) + break; + if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) + break; + if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + break; + } + + /* map 9p status to VFS status */ + switch (status) { + case P9_LOCK_SUCCESS: + res = 0; + break; + case P9_LOCK_BLOCKED: + res = -EAGAIN; + break; + default: + WARN_ONCE(1, "unknown lock status code: %d\n", status); + /* fallthough */ + case P9_LOCK_ERROR: + case P9_LOCK_GRACE: + res = -ENOLCK; + break; + } + +out_unlock: + /* + * incase server returned error for lock request, revert + * it locally + */ + if (res < 0 && fl->fl_type != F_UNLCK) { + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + res = posix_lock_file_wait(filp, fl); + fl->fl_type = fl_type; + } +out: + return res; +} + +static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) +{ + struct p9_getlock glock; + struct p9_fid *fid; + int res = 0; + + fid = filp->private_data; + BUG_ON(fid == NULL); + + posix_test_lock(filp, fl); + /* + * if we have a conflicting lock locally, no need to validate + * with server + */ + if (fl->fl_type != F_UNLCK) + return res; + + /* convert posix lock to p9 tgetlock args */ + memset(&glock, 0, sizeof(glock)); + glock.type = P9_LOCK_TYPE_UNLCK; + glock.start = fl->fl_start; + if (fl->fl_end == OFFSET_MAX) + glock.length = 0; + else + glock.length = fl->fl_end - fl->fl_start + 1; + glock.proc_id = fl->fl_pid; + glock.client_id = fid->clnt->name; + + res = p9_client_getlock_dotl(fid, &glock); + if (res < 0) + return res; + /* map 9p lock type to os lock type */ + switch (glock.type) { + case P9_LOCK_TYPE_RDLCK: + fl->fl_type = F_RDLCK; + break; + case P9_LOCK_TYPE_WRLCK: + fl->fl_type = F_WRLCK; + break; + case P9_LOCK_TYPE_UNLCK: + fl->fl_type = F_UNLCK; + break; + } + if (glock.type != P9_LOCK_TYPE_UNLCK) { + fl->fl_start = glock.start; + if (glock.length == 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = glock.start + glock.length - 1; + fl->fl_pid = glock.proc_id; + } + kfree(glock.client_id); + return res; +} + +/** + * v9fs_file_lock_dotl - lock a file (or directory) + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + */ + +static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(filp); + int ret = -ENOLCK; + + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", + filp, cmd, fl, filp); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + + if (IS_SETLK(cmd) || IS_SETLKW(cmd)) + ret = v9fs_file_do_lock(filp, cmd, fl); + else if (IS_GETLK(cmd)) + ret = v9fs_file_getlock(filp, fl); + else + ret = -EINVAL; +out_err: + return ret; +} + +/** + * v9fs_file_flock_dotl - lock a file + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + */ + +static int v9fs_file_flock_dotl(struct file *filp, int cmd, + struct file_lock *fl) +{ + struct inode *inode = file_inode(filp); + int ret = -ENOLCK; + + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", + filp, cmd, fl, filp); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if (!(fl->fl_flags & FL_FLOCK)) + goto out_err; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + /* Convert flock to posix lock */ + fl->fl_flags |= FL_POSIX; + fl->fl_flags ^= FL_FLOCK; + + if (IS_SETLK(cmd) | IS_SETLKW(cmd)) + ret = v9fs_file_do_lock(filp, cmd, fl); + else + ret = -EINVAL; +out_err: + return ret; +} + +/** + * v9fs_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ + +static ssize_t +v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct p9_fid *fid = iocb->ki_filp->private_data; + int ret, err; + + p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", + iov_iter_count(to), iocb->ki_pos); + + ret = p9_client_read(fid, iocb->ki_pos, to, &err); + if (!ret) + return err; + + iocb->ki_pos += ret; + return ret; +} + +/** + * v9fs_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + ssize_t retval; + loff_t origin; + int err = 0; + + retval = generic_write_checks(iocb, from); + if (retval <= 0) + return retval; + + origin = iocb->ki_pos; + retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err); + if (retval > 0) { + struct inode *inode = file_inode(file); + loff_t i_size; + unsigned long pg_start, pg_end; + pg_start = origin >> PAGE_CACHE_SHIFT; + pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT; + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + iocb->ki_pos += retval; + i_size = i_size_read(inode); + if (iocb->ki_pos > i_size) { + inode_add_bytes(inode, iocb->ki_pos - i_size); + i_size_write(inode, iocb->ki_pos); + } + return retval; + } + return err; +} + +static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct p9_fid *fid; + struct inode *inode = filp->f_mapping->host; + struct p9_wstat wstat; + int retval; + + retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (retval) + return retval; + + mutex_lock(&inode->i_mutex); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); + + fid = filp->private_data; + v9fs_blank_wstat(&wstat); + + retval = p9_client_wstat(fid, &wstat); + mutex_unlock(&inode->i_mutex); + + return retval; +} + +int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct p9_fid *fid; + struct inode *inode = filp->f_mapping->host; + int retval; + + retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (retval) + return retval; + + mutex_lock(&inode->i_mutex); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); + + fid = filp->private_data; + + retval = p9_client_fsync(fid, datasync); + mutex_unlock(&inode->i_mutex); + + return retval; +} + +static int +v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int retval; + + + retval = generic_file_mmap(filp, vma); + if (!retval) + vma->vm_ops = &v9fs_file_vm_ops; + + return retval; +} + +static int +v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int retval; + struct inode *inode; + struct v9fs_inode *v9inode; + struct p9_fid *fid; + + inode = file_inode(filp); + v9inode = V9FS_I(inode); + mutex_lock(&v9inode->v_mutex); + if (!v9inode->writeback_fid && + (vma->vm_flags & VM_WRITE)) { + /* + * clone a fid and add it to writeback_fid + * we do it during mmap instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + fid = v9fs_writeback_fid(filp->f_path.dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + mutex_unlock(&v9inode->v_mutex); + return retval; + } + v9inode->writeback_fid = (void *) fid; + } + mutex_unlock(&v9inode->v_mutex); + + retval = generic_file_mmap(filp, vma); + if (!retval) + vma->vm_ops = &v9fs_mmap_file_vm_ops; + + return retval; +} + +static int +v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct v9fs_inode *v9inode; + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct inode *inode = file_inode(filp); + + + p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); + + /* Update file times before taking page lock */ + file_update_time(filp); + + v9inode = V9FS_I(inode); + /* make sure the cache has finished storing the page */ + v9fs_fscache_wait_on_page_write(inode, page); + BUG_ON(!v9inode->writeback_fid); + lock_page(page); + if (page->mapping != inode->i_mapping) + goto out_unlock; + wait_for_stable_page(page); + + return VM_FAULT_LOCKED; +out_unlock: + unlock_page(page); + return VM_FAULT_NOPAGE; +} + +/** + * v9fs_mmap_file_read - read from a file + * @filp: file pointer to read + * @data: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + /* TODO: Check if there are dirty pages */ + return v9fs_file_read_iter(iocb, to); +} + +/** + * v9fs_mmap_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_mmap_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + /* + * TODO: invalidate mmaps on filp's inode between + * offset and offset+count + */ + return v9fs_file_write_iter(iocb, from); +} + +static void v9fs_mmap_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode; + + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = WB_SYNC_ALL, + .range_start = vma->vm_pgoff * PAGE_SIZE, + /* absolute end, byte at end included */ + .range_end = vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1), + }; + + + p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); + + inode = file_inode(vma->vm_file); + + if (!mapping_cap_writeback_dirty(inode->i_mapping)) + wbc.nr_to_write = 0; + + might_sleep(); + sync_inode(inode, &wbc); +} + + +static const struct vm_operations_struct v9fs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = v9fs_vm_page_mkwrite, +}; + +static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { + .close = v9fs_mmap_vm_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = v9fs_vm_page_mkwrite, +}; + + +const struct file_operations v9fs_cached_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = v9fs_file_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_cached_file_operations_dotl = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = v9fs_file_mmap, + .fsync = v9fs_file_fsync_dotl, +}; + +const struct file_operations v9fs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = v9fs_file_read_iter, + .write_iter = v9fs_file_write_iter, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_file_operations_dotl = { + .llseek = generic_file_llseek, + .read_iter = v9fs_file_read_iter, + .write_iter = v9fs_file_write_iter, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync_dotl, +}; + +const struct file_operations v9fs_mmap_file_operations = { + .llseek = generic_file_llseek, + .read_iter = v9fs_mmap_file_read_iter, + .write_iter = v9fs_mmap_file_write_iter, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_mmap_file_operations_dotl = { + .llseek = generic_file_llseek, + .read_iter = v9fs_mmap_file_read_iter, + .write_iter = v9fs_mmap_file_write_iter, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/kernel/fs/9p/vfs_inode.c b/kernel/fs/9p/vfs_inode.c new file mode 100644 index 000000000..703342e30 --- /dev/null +++ b/kernel/fs/9p/vfs_inode.c @@ -0,0 +1,1537 @@ +/* + * linux/fs/9p/vfs_inode.c + * + * This file contains vfs inode ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" +#include "xattr.h" +#include "acl.h" + +static const struct inode_operations v9fs_dir_inode_operations; +static const struct inode_operations v9fs_dir_inode_operations_dotu; +static const struct inode_operations v9fs_file_inode_operations; +static const struct inode_operations v9fs_symlink_inode_operations; + +/** + * unixmode2p9mode - convert unix mode bits to plan 9 + * @v9ses: v9fs session information + * @mode: mode to convert + * + */ + +static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode) +{ + int res; + res = mode & 0777; + if (S_ISDIR(mode)) + res |= P9_DMDIR; + if (v9fs_proto_dotu(v9ses)) { + if (v9ses->nodev == 0) { + if (S_ISSOCK(mode)) + res |= P9_DMSOCKET; + if (S_ISFIFO(mode)) + res |= P9_DMNAMEDPIPE; + if (S_ISBLK(mode)) + res |= P9_DMDEVICE; + if (S_ISCHR(mode)) + res |= P9_DMDEVICE; + } + + if ((mode & S_ISUID) == S_ISUID) + res |= P9_DMSETUID; + if ((mode & S_ISGID) == S_ISGID) + res |= P9_DMSETGID; + if ((mode & S_ISVTX) == S_ISVTX) + res |= P9_DMSETVTX; + } + return res; +} + +/** + * p9mode2perm- convert plan9 mode bits to unix permission bits + * @v9ses: v9fs session information + * @stat: p9_wstat from which mode need to be derived + * + */ +static int p9mode2perm(struct v9fs_session_info *v9ses, + struct p9_wstat *stat) +{ + int res; + int mode = stat->mode; + + res = mode & S_IALLUGO; + if (v9fs_proto_dotu(v9ses)) { + if ((mode & P9_DMSETUID) == P9_DMSETUID) + res |= S_ISUID; + + if ((mode & P9_DMSETGID) == P9_DMSETGID) + res |= S_ISGID; + + if ((mode & P9_DMSETVTX) == P9_DMSETVTX) + res |= S_ISVTX; + } + return res; +} + +/** + * p9mode2unixmode- convert plan9 mode bits to unix mode bits + * @v9ses: v9fs session information + * @stat: p9_wstat from which mode need to be derived + * @rdev: major number, minor number in case of device files. + * + */ +static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) +{ + int res; + u32 mode = stat->mode; + + *rdev = 0; + res = p9mode2perm(v9ses, stat); + + if ((mode & P9_DMDIR) == P9_DMDIR) + res |= S_IFDIR; + else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses))) + res |= S_IFLNK; + else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) + res |= S_IFSOCK; + else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) + res |= S_IFIFO; + else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) { + char type = 0, ext[32]; + int major = -1, minor = -1; + + strlcpy(ext, stat->extension, sizeof(ext)); + sscanf(ext, "%c %i %i", &type, &major, &minor); + switch (type) { + case 'c': + res |= S_IFCHR; + break; + case 'b': + res |= S_IFBLK; + break; + default: + p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n", + type, stat->extension); + }; + *rdev = MKDEV(major, minor); + } else + res |= S_IFREG; + + return res; +} + +/** + * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits + * @uflags: flags to convert + * @extended: if .u extensions are active + */ + +int v9fs_uflags2omode(int uflags, int extended) +{ + int ret; + + ret = 0; + switch (uflags&3) { + default: + case O_RDONLY: + ret = P9_OREAD; + break; + + case O_WRONLY: + ret = P9_OWRITE; + break; + + case O_RDWR: + ret = P9_ORDWR; + break; + } + + if (extended) { + if (uflags & O_EXCL) + ret |= P9_OEXCL; + + if (uflags & O_APPEND) + ret |= P9_OAPPEND; + } + + return ret; +} + +/** + * v9fs_blank_wstat - helper function to setup a 9P stat structure + * @wstat: structure to initialize + * + */ + +void +v9fs_blank_wstat(struct p9_wstat *wstat) +{ + wstat->type = ~0; + wstat->dev = ~0; + wstat->qid.type = ~0; + wstat->qid.version = ~0; + *((long long *)&wstat->qid.path) = ~0; + wstat->mode = ~0; + wstat->atime = ~0; + wstat->mtime = ~0; + wstat->length = ~0; + wstat->name = NULL; + wstat->uid = NULL; + wstat->gid = NULL; + wstat->muid = NULL; + wstat->n_uid = INVALID_UID; + wstat->n_gid = INVALID_GID; + wstat->n_muid = INVALID_UID; + wstat->extension = NULL; +} + +/** + * v9fs_alloc_inode - helper function to allocate an inode + * + */ +struct inode *v9fs_alloc_inode(struct super_block *sb) +{ + struct v9fs_inode *v9inode; + v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache, + GFP_KERNEL); + if (!v9inode) + return NULL; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; + spin_lock_init(&v9inode->fscache_lock); +#endif + v9inode->writeback_fid = NULL; + v9inode->cache_validity = 0; + mutex_init(&v9inode->v_mutex); + return &v9inode->vfs_inode; +} + +/** + * v9fs_destroy_inode - destroy an inode + * + */ + +static void v9fs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); +} + +void v9fs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, v9fs_i_callback); +} + +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, umode_t mode, dev_t rdev) +{ + int err = 0; + + inode_init_owner(inode, NULL, mode); + inode->i_blocks = 0; + inode->i_rdev = rdev; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &v9fs_addr_operations; + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + } else if (v9fs_proto_dotu(v9ses)) { + inode->i_op = &v9fs_file_inode_operations; + } else { + p9_debug(P9_DEBUG_ERROR, + "special files without extended mode\n"); + err = -EINVAL; + goto error; + } + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + case S_IFREG: + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) + inode->i_fop = + &v9fs_cached_file_operations_dotl; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations_dotl; + else + inode->i_fop = &v9fs_file_operations_dotl; + } else { + inode->i_op = &v9fs_file_inode_operations; + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) + inode->i_fop = + &v9fs_cached_file_operations; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations; + else + inode->i_fop = &v9fs_file_operations; + } + + break; + case S_IFLNK: + if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { + p9_debug(P9_DEBUG_ERROR, + "extended modes used with legacy protocol\n"); + err = -EINVAL; + goto error; + } + + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_symlink_inode_operations_dotl; + else + inode->i_op = &v9fs_symlink_inode_operations; + + break; + case S_IFDIR: + inc_nlink(inode); + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotl; + else if (v9fs_proto_dotu(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotu; + else + inode->i_op = &v9fs_dir_inode_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_fop = &v9fs_dir_operations_dotl; + else + inode->i_fop = &v9fs_dir_operations; + + break; + default: + p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n", + mode, mode & S_IFMT); + err = -EINVAL; + goto error; + } +error: + return err; + +} + +/** + * v9fs_get_inode - helper function to setup an inode + * @sb: superblock + * @mode: mode to setup inode with + * + */ + +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) +{ + int err; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + + p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode); + + inode = new_inode(sb); + if (!inode) { + pr_warn("%s (%d): Problem allocating inode\n", + __func__, task_pid_nr(current)); + return ERR_PTR(-ENOMEM); + } + err = v9fs_init_inode(v9ses, inode, mode, rdev); + if (err) { + iput(inode); + return ERR_PTR(err); + } + return inode; +} + +/* +static struct v9fs_fid* +v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) +{ + int err; + int nfid; + struct v9fs_fid *ret; + struct v9fs_fcall *fcall; + + nfid = v9fs_get_idpool(&v9ses->fidpool); + if (nfid < 0) { + eprintk(KERN_WARNING, "no free fids available\n"); + return ERR_PTR(-ENOSPC); + } + + err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name, + &fcall); + + if (err < 0) { + if (fcall && fcall->id == RWALK) + goto clunk_fid; + + PRINT_FCALL_ERROR("walk error", fcall); + v9fs_put_idpool(nfid, &v9ses->fidpool); + goto error; + } + + kfree(fcall); + fcall = NULL; + ret = v9fs_fid_create(v9ses, nfid); + if (!ret) { + err = -ENOMEM; + goto clunk_fid; + } + + err = v9fs_fid_insert(ret, dentry); + if (err < 0) { + v9fs_fid_destroy(ret); + goto clunk_fid; + } + + return ret; + +clunk_fid: + v9fs_t_clunk(v9ses, nfid); + +error: + kfree(fcall); + return ERR_PTR(err); +} +*/ + + +/** + * v9fs_clear_inode - release an inode + * @inode: inode to release + * + */ +void v9fs_evict_inode(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + truncate_inode_pages_final(inode->i_mapping); + clear_inode(inode); + filemap_fdatawrite(inode->i_mapping); + + v9fs_cache_inode_put_cookie(inode); + /* clunk the fid stashed in writeback_fid */ + if (v9inode->writeback_fid) { + p9_client_clunk(v9inode->writeback_fid); + v9inode->writeback_fid = NULL; + } +} + +static int v9fs_test_inode(struct inode *inode, void *data) +{ + int umode; + dev_t rdev; + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + + umode = p9mode2unixmode(v9ses, st, &rdev); + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +static int v9fs_test_new_inode(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + return 0; +} + +static struct inode *v9fs_qid_iget(struct super_block *sb, + struct p9_qid *qid, + struct p9_wstat *st, + int new) +{ + dev_t rdev; + int retval; + umode_t umode; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode; + else + test = v9fs_test_inode; + + i_ino = v9fs_qid2ino(qid); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. + */ + inode->i_ino = i_ino; + umode = p9mode2unixmode(v9ses, st, &rdev); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); + if (retval) + goto error; + + v9fs_stat2inode(st, inode, sb); + v9fs_cache_inode_get_cookie(inode); + unlock_new_inode(inode); + return inode; +error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_wstat *st; + struct inode *inode = NULL; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget(sb, &st->qid, st, new); + p9stat_free(st); + kfree(st); + return inode; +} + +/** + * v9fs_at_to_dotl_flags- convert Linux specific AT flags to + * plan 9 AT flag. + * @flags: flags to convert + */ +static int v9fs_at_to_dotl_flags(int flags) +{ + int rflags = 0; + if (flags & AT_REMOVEDIR) + rflags |= P9_DOTL_AT_REMOVEDIR; + return rflags; +} + +/** + * v9fs_remove - helper function to remove files and directories + * @dir: directory inode that is being deleted + * @dentry: dentry that is being deleted + * @flags: removing a directory + * + */ + +static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) +{ + struct inode *inode; + int retval = -EOPNOTSUPP; + struct p9_fid *v9fid, *dfid; + struct v9fs_session_info *v9ses; + + p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", + dir, dentry, flags); + + v9ses = v9fs_inode2v9ses(dir); + inode = d_inode(dentry); + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + retval = PTR_ERR(dfid); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); + return retval; + } + if (v9fs_proto_dotl(v9ses)) + retval = p9_client_unlinkat(dfid, dentry->d_name.name, + v9fs_at_to_dotl_flags(flags)); + if (retval == -EOPNOTSUPP) { + /* Try the one based on path */ + v9fid = v9fs_fid_clone(dentry); + if (IS_ERR(v9fid)) + return PTR_ERR(v9fid); + retval = p9_client_remove(v9fid); + } + if (!retval) { + /* + * directories on unlink should have zero + * link count + */ + if (flags & AT_REMOVEDIR) { + clear_nlink(inode); + drop_nlink(dir); + } else + drop_nlink(inode); + + v9fs_invalidate_inode_attr(inode); + v9fs_invalidate_inode_attr(dir); + } + return retval; +} + +/** + * v9fs_create - Create a file + * @v9ses: session information + * @dir: directory that dentry is being created in + * @dentry: dentry that is being created + * @extension: 9p2000.u extension string to support devices, etc. + * @perm: create permissions + * @mode: open mode + * + */ +static struct p9_fid * +v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, + struct dentry *dentry, char *extension, u32 perm, u8 mode) +{ + int err; + char *name; + struct p9_fid *dfid, *ofid, *fid; + struct inode *inode; + + p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); + + err = 0; + ofid = NULL; + fid = NULL; + name = (char *) dentry->d_name.name; + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return ERR_PTR(err); + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return ERR_PTR(err); + } + + err = p9_client_fcreate(ofid, name, perm, mode, extension); + if (err < 0) { + p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); + goto error; + } + + if (!(perm & P9_DMLINK)) { + /* now walk from the parent so we can get unopened fid */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, + "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + /* + * instantiate inode and assign the unopened fid to the dentry + */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, + "inode creation failed %d\n", err); + goto error; + } + v9fs_fid_add(dentry, fid); + d_instantiate(dentry, inode); + } + return ofid; +error: + if (ofid) + p9_client_clunk(ofid); + + if (fid) + p9_client_clunk(fid); + + return ERR_PTR(err); +} + +/** + * v9fs_vfs_create - VFS hook to create a regular file + * + * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called + * for mknod(2). + * + * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * + */ + +static int +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); + u32 perm = unixmode2p9mode(v9ses, mode); + struct p9_fid *fid; + + /* P9_OEXCL? */ + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_ORDWR); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_invalidate_inode_attr(dir); + p9_client_clunk(fid); + + return 0; +} + +/** + * v9fs_vfs_mkdir - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int err; + u32 perm; + struct p9_fid *fid; + struct v9fs_session_info *v9ses; + + p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode | S_IFDIR); + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_OREAD); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + } else { + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); + } + + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode + * @dir: inode that is being walked from + * @dentry: dentry that is being walked to? + * @flags: lookup flags (unused) + * + */ + +struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct dentry *res; + struct v9fs_session_info *v9ses; + struct p9_fid *dfid, *fid; + struct inode *inode; + char *name; + + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%pd) %p flags: %x\n", + dir, dentry, dentry, flags); + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + v9ses = v9fs_inode2v9ses(dir); + /* We can walk d_parent because we hold the dir->i_mutex */ + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) + return ERR_CAST(dfid); + + name = (char *) dentry->d_name.name; + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + if (fid == ERR_PTR(-ENOENT)) { + d_add(dentry, NULL); + return NULL; + } + return ERR_CAST(fid); + } + /* + * Make sure we don't use a wrong inode due to parallel + * unlink. For cached mode create calls request for new + * inode. But with cache disabled, lookup should do this. + */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + else + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + p9_client_clunk(fid); + return ERR_CAST(inode); + } + /* + * If we had a rename on the server and a parallel lookup + * for the new name, then make sure we instantiate with + * the new name. ie look up for a/b, while on server somebody + * moved b under k and client parallely did a lookup for + * k/b. + */ + res = d_splice_alias(inode, dentry); + if (!res) + v9fs_fid_add(dentry, fid); + else if (!IS_ERR(res)) + v9fs_fid_add(res, fid); + else + p9_client_clunk(fid); + return res; +} + +static int +v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened) +{ + int err; + u32 perm; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *inode_fid; + struct dentry *res = NULL; + + if (d_unhashed(dentry)) { + res = v9fs_vfs_lookup(dir, dentry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + dentry = res; + } + + /* Only creates */ + if (!(flags & O_CREAT) || d_really_is_positive(dentry)) + return finish_no_open(file, res); + + err = 0; + + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode); + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, + v9fs_uflags2omode(flags, + v9fs_proto_dotu(v9ses))); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + goto error; + } + + v9fs_invalidate_inode_attr(dir); + v9inode = V9FS_I(d_inode(dentry)); + mutex_lock(&v9inode->v_mutex); + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && + ((flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + mutex_unlock(&v9inode->v_mutex); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } + mutex_unlock(&v9inode->v_mutex); + err = finish_open(file, dentry, generic_file_open, opened); + if (err) + goto error; + + file->private_data = fid; + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + v9fs_cache_inode_set_cookie(d_inode(dentry), file); + + *opened |= FILE_CREATED; +out: + dput(res); + return err; + +error: + if (fid) + p9_client_clunk(fid); + goto out; +} + +/** + * v9fs_vfs_unlink - VFS unlink hook to delete an inode + * @i: inode that is being unlinked + * @d: dentry that is being unlinked + * + */ + +int v9fs_vfs_unlink(struct inode *i, struct dentry *d) +{ + return v9fs_remove(i, d, 0); +} + +/** + * v9fs_vfs_rmdir - VFS unlink hook to delete a directory + * @i: inode that is being unlinked + * @d: dentry that is being unlinked + * + */ + +int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) +{ + return v9fs_remove(i, d, AT_REMOVEDIR); +} + +/** + * v9fs_vfs_rename - VFS hook to rename an inode + * @old_dir: old dir inode + * @old_dentry: old dentry + * @new_dir: new dir inode + * @new_dentry: new dentry + * + */ + +int +v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int retval; + struct inode *old_inode; + struct inode *new_inode; + struct v9fs_session_info *v9ses; + struct p9_fid *oldfid; + struct p9_fid *olddirfid; + struct p9_fid *newdirfid; + struct p9_wstat wstat; + + p9_debug(P9_DEBUG_VFS, "\n"); + retval = 0; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); + v9ses = v9fs_inode2v9ses(old_inode); + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + olddirfid = v9fs_fid_clone(old_dentry->d_parent); + if (IS_ERR(olddirfid)) { + retval = PTR_ERR(olddirfid); + goto done; + } + + newdirfid = v9fs_fid_clone(new_dentry->d_parent); + if (IS_ERR(newdirfid)) { + retval = PTR_ERR(newdirfid); + goto clunk_olddir; + } + + down_write(&v9ses->rename_sem); + if (v9fs_proto_dotl(v9ses)) { + retval = p9_client_renameat(olddirfid, old_dentry->d_name.name, + newdirfid, new_dentry->d_name.name); + if (retval == -EOPNOTSUPP) + retval = p9_client_rename(oldfid, newdirfid, + new_dentry->d_name.name); + if (retval != -EOPNOTSUPP) + goto clunk_newdir; + } + if (old_dentry->d_parent != new_dentry->d_parent) { + /* + * 9P .u can only handle file rename in the same directory + */ + + p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n"); + retval = -EXDEV; + goto clunk_newdir; + } + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->uname; + wstat.name = (char *) new_dentry->d_name.name; + retval = p9_client_wstat(oldfid, &wstat); + +clunk_newdir: + if (!retval) { + if (new_inode) { + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else + drop_nlink(new_inode); + } + if (S_ISDIR(old_inode->i_mode)) { + if (!new_inode) + inc_nlink(new_dir); + drop_nlink(old_dir); + } + v9fs_invalidate_inode_attr(old_inode); + v9fs_invalidate_inode_attr(old_dir); + v9fs_invalidate_inode_attr(new_dir); + + /* successful rename */ + d_move(old_dentry, new_dentry); + } + up_write(&v9ses->rename_sem); + p9_client_clunk(newdirfid); + +clunk_olddir: + p9_client_clunk(olddirfid); + +done: + return retval; +} + +/** + * v9fs_vfs_getattr - retrieve file metadata + * @mnt: mount information + * @dentry: file to get attributes on + * @stat: metadata structure to populate + * + */ + +static int +v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat *st; + + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); + v9ses = v9fs_dentry2v9ses(dentry); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + generic_fillattr(d_inode(dentry), stat); + return 0; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb); + generic_fillattr(d_inode(dentry), stat); + + p9stat_free(st); + kfree(st); + return 0; +} + +/** + * v9fs_vfs_setattr - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat wstat; + + p9_debug(P9_DEBUG_VFS, "\n"); + retval = inode_change_ok(d_inode(dentry), iattr); + if (retval) + return retval; + + retval = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); + if(IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_blank_wstat(&wstat); + if (iattr->ia_valid & ATTR_MODE) + wstat.mode = unixmode2p9mode(v9ses, iattr->ia_mode); + + if (iattr->ia_valid & ATTR_MTIME) + wstat.mtime = iattr->ia_mtime.tv_sec; + + if (iattr->ia_valid & ATTR_ATIME) + wstat.atime = iattr->ia_atime.tv_sec; + + if (iattr->ia_valid & ATTR_SIZE) + wstat.length = iattr->ia_size; + + if (v9fs_proto_dotu(v9ses)) { + if (iattr->ia_valid & ATTR_UID) + wstat.n_uid = iattr->ia_uid; + + if (iattr->ia_valid & ATTR_GID) + wstat.n_gid = iattr->ia_gid; + } + + /* Write all dirty data */ + if (d_is_reg(dentry)) + filemap_write_and_wait(d_inode(dentry)->i_mapping); + + retval = p9_client_wstat(fid, &wstat); + if (retval < 0) + return retval; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(d_inode(dentry))) + truncate_setsize(d_inode(dentry), iattr->ia_size); + + v9fs_invalidate_inode_attr(d_inode(dentry)); + + setattr_copy(d_inode(dentry), iattr); + mark_inode_dirty(d_inode(dentry)); + return 0; +} + +/** + * v9fs_stat2inode - populate an inode structure with mistat info + * @stat: Plan 9 metadata (mistat) structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void +v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb) +{ + umode_t mode; + char ext[32]; + char tag_name[14]; + unsigned int i_nlink; + struct v9fs_session_info *v9ses = sb->s_fs_info; + struct v9fs_inode *v9inode = V9FS_I(inode); + + set_nlink(inode, 1); + + inode->i_atime.tv_sec = stat->atime; + inode->i_mtime.tv_sec = stat->mtime; + inode->i_ctime.tv_sec = stat->mtime; + + inode->i_uid = v9ses->dfltuid; + inode->i_gid = v9ses->dfltgid; + + if (v9fs_proto_dotu(v9ses)) { + inode->i_uid = stat->n_uid; + inode->i_gid = stat->n_gid; + } + if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { + if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + /* + * Hadlink support got added later to + * to the .u extension. So there can be + * server out there that doesn't support + * this even with .u extension. So check + * for non NULL stat->extension + */ + strlcpy(ext, stat->extension, sizeof(ext)); + /* HARDLINKCOUNT %u */ + sscanf(ext, "%13s %u", tag_name, &i_nlink); + if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + set_nlink(inode, i_nlink); + } + } + mode = p9mode2perm(v9ses, stat); + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + i_size_write(inode, stat->length); + + /* not real number of blocks, but 512 byte ones ... */ + inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; +} + +/** + * v9fs_qid2ino - convert qid into inode number + * @qid: qid to hash + * + * BUG: potential for inode number collisions? + */ + +ino_t v9fs_qid2ino(struct p9_qid *qid) +{ + u64 path = qid->path + 2; + ino_t i = 0; + + if (sizeof(ino_t) == sizeof(path)) + memcpy(&i, &path, sizeof(ino_t)); + else + i = (ino_t) (path ^ (path >> 32)); + + return i; +} + +/** + * v9fs_readlink - read a symlink's location (internal version) + * @dentry: dentry for symlink + * @buffer: buffer to load symlink location into + * @buflen: length of buffer + * + */ + +static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + int retval; + + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat *st; + + p9_debug(P9_DEBUG_VFS, " %pd\n", dentry); + retval = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + if (!v9fs_proto_dotu(v9ses)) + return -EBADF; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + + if (!(st->mode & P9_DMSYMLINK)) { + retval = -EINVAL; + goto done; + } + + /* copy extension buffer into buffer */ + retval = min(strlen(st->extension)+1, (size_t)buflen); + memcpy(buffer, st->extension, retval); + + p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n", + dentry, st->extension, buflen, buffer); + +done: + p9stat_free(st); + kfree(st); + return retval; +} + +/** + * v9fs_vfs_follow_link - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int len = 0; + char *link = __getname(); + + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + + if (!link) + link = ERR_PTR(-ENOMEM); + else { + len = v9fs_readlink(dentry, link, PATH_MAX); + + if (len < 0) { + __putname(link); + link = ERR_PTR(len); + } else + link[min(len, PATH_MAX-1)] = 0; + } + nd_set_link(nd, link); + + return NULL; +} + +/** + * v9fs_vfs_put_link - release a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * @p: unused + * + */ + +void +v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + char *s = nd_get_link(nd); + + p9_debug(P9_DEBUG_VFS, " %pd %s\n", + dentry, IS_ERR(s) ? "" : s); + if (!IS_ERR(s)) + __putname(s); +} + +/** + * v9fs_vfs_mkspecial - create a special file + * @dir: inode to create special file in + * @dentry: dentry to create + * @perm: mode to create special file + * @extension: 9p2000.u format extension string representing special file + * + */ + +static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, + u32 perm, const char *extension) +{ + struct p9_fid *fid; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(dir); + if (!v9fs_proto_dotu(v9ses)) { + p9_debug(P9_DEBUG_ERROR, "not extended\n"); + return -EPERM; + } + + fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm, + P9_OREAD); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_invalidate_inode_attr(dir); + p9_client_clunk(fid); + return 0; +} + +/** + * v9fs_vfs_symlink - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See Also: 9P2000.u RFC for more information + * + */ + +static int +v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", + dir->i_ino, dentry, symname); + + return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); +} + +/** + * v9fs_vfs_link - create a hardlink + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + char *name; + struct p9_fid *oldfid; + + p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n", + dir->i_ino, dentry, old_dentry); + + oldfid = v9fs_fid_clone(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = __getname(); + if (unlikely(!name)) { + retval = -ENOMEM; + goto clunk_fid; + } + + sprintf(name, "%d\n", oldfid->fid); + retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); + __putname(name); + if (!retval) { + v9fs_refresh_inode(oldfid, d_inode(old_dentry)); + v9fs_invalidate_inode_attr(dir); + } +clunk_fid: + p9_client_clunk(oldfid); + return retval; +} + +/** + * v9fs_vfs_mknod - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ + +static int +v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); + int retval; + char *name; + u32 perm; + + p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry, mode, + MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + name = __getname(); + if (!name) + return -ENOMEM; + /* build extension */ + if (S_ISBLK(mode)) + sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); + else if (S_ISCHR(mode)) + sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); + else if (S_ISFIFO(mode)) + *name = 0; + else if (S_ISSOCK(mode)) + *name = 0; + else { + __putname(name); + return -EINVAL; + } + + perm = unixmode2p9mode(v9ses, mode); + retval = v9fs_vfs_mkspecial(dir, dentry, perm, name); + __putname(name); + + return retval; +} + +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) +{ + int umode; + dev_t rdev; + loff_t i_size; + struct p9_wstat *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + umode = p9mode2unixmode(v9ses, st, &rdev); + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + goto out; + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode(st, inode, inode->i_sb); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); +out: + p9stat_free(st); + kfree(st); + return 0; +} + +static const struct inode_operations v9fs_dir_inode_operations_dotu = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .atomic_open = v9fs_vfs_atomic_open, + .symlink = v9fs_vfs_symlink, + .link = v9fs_vfs_link, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_dir_inode_operations = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .atomic_open = v9fs_vfs_atomic_open, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_file_inode_operations = { + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + diff --git a/kernel/fs/9p/vfs_inode_dotl.c b/kernel/fs/9p/vfs_inode_dotl.c new file mode 100644 index 000000000..9861c7c95 --- /dev/null +++ b/kernel/fs/9p/vfs_inode_dotl.c @@ -0,0 +1,1016 @@ +/* + * linux/fs/9p/vfs_inode_dotl.c + * + * This file contains vfs inode ops for the 9P2000.L protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" +#include "xattr.h" +#include "acl.h" + +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, + dev_t rdev); + +/** + * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * new file system object. This checks the S_ISGID to determine the owning + * group of the new file system object. + */ + +static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +{ + BUG_ON(dir_inode == NULL); + + if (dir_inode->i_mode & S_ISGID) { + /* set_gid bit is set.*/ + return dir_inode->i_gid; + } + return current_fsgid(); +} + +static int v9fs_test_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + return 0; + + if (inode->i_generation != st->st_gen) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +/* Always get a new inode */ +static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + inode->i_generation = st->st_gen; + return 0; +} + +static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, + struct p9_qid *qid, + struct p9_fid *fid, + struct p9_stat_dotl *st, + int new) +{ + int retval; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode_dotl; + else + test = v9fs_test_inode_dotl; + + i_ino = v9fs_qid2ino(qid); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. + */ + inode->i_ino = i_ino; + retval = v9fs_init_inode(v9ses, inode, + st->st_mode, new_decode_dev(st->st_rdev)); + if (retval) + goto error; + + v9fs_stat2inode_dotl(st, inode); + v9fs_cache_inode_get_cookie(inode); + retval = v9fs_get_acl(inode, fid); + if (retval) + goto error; + + unlock_new_inode(inode); + return inode; +error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_stat_dotl *st; + struct inode *inode = NULL; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); + kfree(st); + return inode; +} + +struct dotl_openflag_map { + int open_flag; + int dotl_flag; +}; + +static int v9fs_mapped_dotl_flags(int flags) +{ + int i; + int rflags = 0; + struct dotl_openflag_map dotl_oflag_map[] = { + { O_CREAT, P9_DOTL_CREATE }, + { O_EXCL, P9_DOTL_EXCL }, + { O_NOCTTY, P9_DOTL_NOCTTY }, + { O_APPEND, P9_DOTL_APPEND }, + { O_NONBLOCK, P9_DOTL_NONBLOCK }, + { O_DSYNC, P9_DOTL_DSYNC }, + { FASYNC, P9_DOTL_FASYNC }, + { O_DIRECT, P9_DOTL_DIRECT }, + { O_LARGEFILE, P9_DOTL_LARGEFILE }, + { O_DIRECTORY, P9_DOTL_DIRECTORY }, + { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, + { O_NOATIME, P9_DOTL_NOATIME }, + { O_CLOEXEC, P9_DOTL_CLOEXEC }, + { O_SYNC, P9_DOTL_SYNC}, + }; + for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { + if (flags & dotl_oflag_map[i].open_flag) + rflags |= dotl_oflag_map[i].dotl_flag; + } + return rflags; +} + +/** + * v9fs_open_to_dotl_flags- convert Linux specific open flags to + * plan 9 open flag. + * @flags: flags to convert + */ +int v9fs_open_to_dotl_flags(int flags) +{ + int rflags = 0; + + /* + * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY + * and P9_DOTL_NOACCESS + */ + rflags |= flags & O_ACCMODE; + rflags |= v9fs_mapped_dotl_flags(flags); + + return rflags; +} + +/** + * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. + * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @omode: create permissions + * + */ + +static int +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, + bool excl) +{ + return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); +} + +static int +v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t omode, + int *opened) +{ + int err = 0; + kgid_t gid; + umode_t mode; + char *name = NULL; + struct p9_qid qid; + struct inode *inode; + struct p9_fid *fid = NULL; + struct v9fs_inode *v9inode; + struct p9_fid *dfid, *ofid, *inode_fid; + struct v9fs_session_info *v9ses; + struct posix_acl *pacl = NULL, *dacl = NULL; + struct dentry *res = NULL; + + if (d_unhashed(dentry)) { + res = v9fs_vfs_lookup(dir, dentry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + dentry = res; + } + + /* Only creates */ + if (!(flags & O_CREAT) || d_really_is_positive(dentry)) + return finish_no_open(file, res); + + v9ses = v9fs_inode2v9ses(dir); + + name = (char *) dentry->d_name.name; + p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", + name, flags, omode); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + goto out; + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + goto out; + } + + gid = v9fs_get_fsgid_for_create(dir); + + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", + err); + goto error; + } + err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), + mode, gid, &qid); + if (err < 0) { + p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", + err); + goto error; + } + v9fs_invalidate_inode_attr(dir); + + /* instantiate inode and assign the unopened fid to the dentry */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(inode, fid, dacl, pacl); + + v9fs_fid_add(dentry, fid); + d_instantiate(dentry, inode); + + v9inode = V9FS_I(inode); + mutex_lock(&v9inode->v_mutex); + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && + ((flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + mutex_unlock(&v9inode->v_mutex); + goto err_clunk_old_fid; + } + v9inode->writeback_fid = (void *) inode_fid; + } + mutex_unlock(&v9inode->v_mutex); + /* Since we are opening a file, assign the open fid to the file */ + err = finish_open(file, dentry, generic_file_open, opened); + if (err) + goto err_clunk_old_fid; + file->private_data = ofid; + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + v9fs_cache_inode_set_cookie(inode, file); + *opened |= FILE_CREATED; +out: + v9fs_put_acl(dacl, pacl); + dput(res); + return err; + +error: + if (fid) + p9_client_clunk(fid); +err_clunk_old_fid: + if (ofid) + p9_client_clunk(ofid); + goto out; +} + +/** + * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @omode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir_dotl(struct inode *dir, + struct dentry *dentry, umode_t omode) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + kgid_t gid; + char *name; + umode_t mode; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + + omode |= S_IFDIR; + if (dir->i_mode & S_ISGID) + omode |= S_ISGID; + + dir_dentry = dentry->d_parent; + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n", + err); + goto error; + } + name = (char *) dentry->d_name.name; + err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); + if (err < 0) + goto error; + + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + v9fs_fid_add(dentry, fid); + v9fs_set_create_acl(inode, fid, dacl, pacl); + d_instantiate(dentry, inode); + fid = NULL; + err = 0; + } else { + /* + * Not in cached mode. No need to populate + * inode with stat. We need to get an inode + * so that we can set the acl with dentry + */ + inode = v9fs_get_inode(dir->i_sb, mode, 0); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + v9fs_set_create_acl(inode, fid, dacl, pacl); + d_instantiate(dentry, inode); + } + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); +error: + if (fid) + p9_client_clunk(fid); + v9fs_put_acl(dacl, pacl); + return err; +} + +static int +v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat_dotl *st; + + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); + v9ses = v9fs_dentry2v9ses(dentry); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + generic_fillattr(d_inode(dentry), stat); + return 0; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Ask for all the fields in stat structure. Server will return + * whatever it supports + */ + + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, d_inode(dentry)); + generic_fillattr(d_inode(dentry), stat); + /* Change block size to what the server returned */ + stat->blksize = st->st_blksize; + + kfree(st); + return 0; +} + +/* + * Attribute flags. + */ +#define P9_ATTR_MODE (1 << 0) +#define P9_ATTR_UID (1 << 1) +#define P9_ATTR_GID (1 << 2) +#define P9_ATTR_SIZE (1 << 3) +#define P9_ATTR_ATIME (1 << 4) +#define P9_ATTR_MTIME (1 << 5) +#define P9_ATTR_CTIME (1 << 6) +#define P9_ATTR_ATIME_SET (1 << 7) +#define P9_ATTR_MTIME_SET (1 << 8) + +struct dotl_iattr_map { + int iattr_valid; + int p9_iattr_valid; +}; + +static int v9fs_mapped_iattr_valid(int iattr_valid) +{ + int i; + int p9_iattr_valid = 0; + struct dotl_iattr_map dotl_iattr_map[] = { + { ATTR_MODE, P9_ATTR_MODE }, + { ATTR_UID, P9_ATTR_UID }, + { ATTR_GID, P9_ATTR_GID }, + { ATTR_SIZE, P9_ATTR_SIZE }, + { ATTR_ATIME, P9_ATTR_ATIME }, + { ATTR_MTIME, P9_ATTR_MTIME }, + { ATTR_CTIME, P9_ATTR_CTIME }, + { ATTR_ATIME_SET, P9_ATTR_ATIME_SET }, + { ATTR_MTIME_SET, P9_ATTR_MTIME_SET }, + }; + for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) { + if (iattr_valid & dotl_iattr_map[i].iattr_valid) + p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid; + } + return p9_iattr_valid; +} + +/** + * v9fs_vfs_setattr_dotl - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + struct inode *inode = d_inode(dentry); + + p9_debug(P9_DEBUG_VFS, "\n"); + + retval = inode_change_ok(inode, iattr); + if (retval) + return retval; + + p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); + p9attr.mode = iattr->ia_mode; + p9attr.uid = iattr->ia_uid; + p9attr.gid = iattr->ia_gid; + p9attr.size = iattr->ia_size; + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Write all dirty data */ + if (S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); + + retval = p9_client_setattr(fid, &p9attr); + if (retval < 0) + return retval; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) + truncate_setsize(inode, iattr->ia_size); + + v9fs_invalidate_inode_attr(inode); + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + if (iattr->ia_valid & ATTR_MODE) { + /* We also want to update ACL when we update mode bits */ + retval = v9fs_acl_chmod(inode, fid); + if (retval < 0) + return retval; + } + return 0; +} + +/** + * v9fs_stat2inode_dotl - populate an inode structure with stat info + * @stat: stat structure + * @inode: inode to populate + * + */ + +void +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +{ + umode_t mode; + struct v9fs_inode *v9inode = V9FS_I(inode); + + if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode->i_uid = stat->st_uid; + inode->i_gid = stat->st_gid; + set_nlink(inode, stat->st_nlink); + + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + + i_size_write(inode, stat->st_size); + inode->i_blocks = stat->st_blocks; + } else { + if (stat->st_result_mask & P9_STATS_ATIME) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + } + if (stat->st_result_mask & P9_STATS_MTIME) { + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + } + if (stat->st_result_mask & P9_STATS_CTIME) { + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + } + if (stat->st_result_mask & P9_STATS_UID) + inode->i_uid = stat->st_uid; + if (stat->st_result_mask & P9_STATS_GID) + inode->i_gid = stat->st_gid; + if (stat->st_result_mask & P9_STATS_NLINK) + set_nlink(inode, stat->st_nlink); + if (stat->st_result_mask & P9_STATS_MODE) { + inode->i_mode = stat->st_mode; + if ((S_ISBLK(inode->i_mode)) || + (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } + if (stat->st_result_mask & P9_STATS_RDEV) + inode->i_rdev = new_decode_dev(stat->st_rdev); + if (stat->st_result_mask & P9_STATS_SIZE) + i_size_write(inode, stat->st_size); + if (stat->st_result_mask & P9_STATS_BLOCKS) + inode->i_blocks = stat->st_blocks; + } + if (stat->st_result_mask & P9_STATS_GEN) + inode->i_generation = stat->st_gen; + + /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION + * because the inode structure does not have fields for them. + */ + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; +} + +static int +v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + int err; + kgid_t gid; + char *name; + struct p9_qid qid; + struct inode *inode; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct v9fs_session_info *v9ses; + + name = (char *) dentry->d_name.name; + p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); + v9ses = v9fs_inode2v9ses(dir); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ + err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + + if (err < 0) { + p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + goto error; + } + + v9fs_invalidate_inode_attr(dir); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + v9fs_fid_add(dentry, fid); + d_instantiate(dentry, inode); + fid = NULL; + err = 0; + } else { + /* Not in cached mode. No need to populate inode with stat */ + inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_link_dotl - create a hardlink for dotl + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err; + struct dentry *dir_dentry; + struct p9_fid *dfid, *oldfid; + struct v9fs_session_info *v9ses; + + p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n", + dir->i_ino, old_dentry, dentry); + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = dentry->d_parent; + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) + return PTR_ERR(dfid); + + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + + if (err < 0) { + p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + return err; + } + + v9fs_invalidate_inode_attr(dir); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Get the latest stat info from server. */ + struct p9_fid *fid; + fid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_refresh_inode_dotl(fid, d_inode(old_dentry)); + } + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); + + return err; +} + +/** + * v9fs_vfs_mknod_dotl - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @omode: mode for creation + * @rdev: device associated with special file + * + */ +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, + dev_t rdev) +{ + int err; + kgid_t gid; + char *name; + umode_t mode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry, omode, + MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = dentry->d_parent; + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n", + err); + goto error; + } + name = (char *) dentry->d_name.name; + + err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); + if (err < 0) + goto error; + + v9fs_invalidate_inode_attr(dir); + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + v9fs_set_create_acl(inode, fid, dacl, pacl); + v9fs_fid_add(dentry, fid); + d_instantiate(dentry, inode); + fid = NULL; + err = 0; + } else { + /* + * Not in cached mode. No need to populate inode with stat. + * socket syscall returns a fd, so we need instantiate + */ + inode = v9fs_get_inode(dir->i_sb, mode, rdev); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + v9fs_set_create_acl(inode, fid, dacl, pacl); + d_instantiate(dentry, inode); + } +error: + if (fid) + p9_client_clunk(fid); + v9fs_put_acl(dacl, pacl); + return err; +} + +/** + * v9fs_vfs_follow_link_dotl - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +{ + int retval; + struct p9_fid *fid; + char *link = __getname(); + char *target; + + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + + if (!link) { + link = ERR_PTR(-ENOMEM); + goto ndset; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + __putname(link); + link = ERR_CAST(fid); + goto ndset; + } + retval = p9_client_readlink(fid, &target); + if (!retval) { + strcpy(link, target); + kfree(target); + goto ndset; + } + __putname(link); + link = ERR_PTR(retval); +ndset: + nd_set_link(nd, link); + return NULL; +} + +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) +{ + loff_t i_size; + struct p9_stat_dotl *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + goto out; + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode_dotl(st, inode); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); +out: + kfree(st); + return 0; +} + +const struct inode_operations v9fs_dir_inode_operations_dotl = { + .create = v9fs_vfs_create_dotl, + .atomic_open = v9fs_vfs_atomic_open_dotl, + .lookup = v9fs_vfs_lookup, + .link = v9fs_vfs_link_dotl, + .symlink = v9fs_vfs_symlink_dotl, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir_dotl, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod_dotl, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .get_acl = v9fs_iop_get_acl, +}; + +const struct inode_operations v9fs_file_inode_operations_dotl = { + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .get_acl = v9fs_iop_get_acl, +}; + +const struct inode_operations v9fs_symlink_inode_operations_dotl = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link_dotl, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, +}; diff --git a/kernel/fs/9p/vfs_super.c b/kernel/fs/9p/vfs_super.c new file mode 100644 index 000000000..e99a338a4 --- /dev/null +++ b/kernel/fs/9p/vfs_super.c @@ -0,0 +1,370 @@ +/* + * linux/fs/9p/vfs_super.c + * + * This file contians superblock ops for 9P2000. It is intended that + * you mount this file system on directories. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "xattr.h" +#include "acl.h" + +static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; + +/** + * v9fs_set_super - set the superblock + * @s: super block + * @data: file system specific data + * + */ + +static int v9fs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + return set_anon_super(s, data); +} + +/** + * v9fs_fill_super - populate superblock with info + * @sb: superblock + * @v9ses: session information + * @flags: flags propagated from v9fs_mount() + * + */ + +static void +v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, + int flags, void *data) +{ + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize_bits = fls(v9ses->maxdata - 1); + sb->s_blocksize = 1 << sb->s_blocksize_bits; + sb->s_magic = V9FS_MAGIC; + if (v9fs_proto_dotl(v9ses)) { + sb->s_op = &v9fs_super_ops_dotl; + sb->s_xattr = v9fs_xattr_handlers; + } else + sb->s_op = &v9fs_super_ops; + sb->s_bdi = &v9ses->bdi; + if (v9ses->cache) + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; + + sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + if (!v9ses->cache) + sb->s_flags |= MS_SYNCHRONOUS; + +#ifdef CONFIG_9P_FS_POSIX_ACL + if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) + sb->s_flags |= MS_POSIXACL; +#endif + + save_mount_options(sb, data); +} + +/** + * v9fs_mount - mount a superblock + * @fs_type: file system type + * @flags: mount flags + * @dev_name: device name that was mounted + * @data: mount options + * + */ + +static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct super_block *sb = NULL; + struct inode *inode = NULL; + struct dentry *root = NULL; + struct v9fs_session_info *v9ses = NULL; + umode_t mode = S_IRWXUGO | S_ISVTX; + struct p9_fid *fid; + int retval = 0; + + p9_debug(P9_DEBUG_VFS, "\n"); + + v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); + if (!v9ses) + return ERR_PTR(-ENOMEM); + + fid = v9fs_session_init(v9ses, dev_name, data); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + /* + * we need to call session_close to tear down some + * of the data structure setup by session_init + */ + goto close_session; + } + + sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); + if (IS_ERR(sb)) { + retval = PTR_ERR(sb); + goto clunk_fid; + } + v9fs_fill_super(sb, v9ses, flags, data); + + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + sb->s_d_op = &v9fs_cached_dentry_operations; + else + sb->s_d_op = &v9fs_dentry_operations; + + inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); + if (IS_ERR(inode)) { + retval = PTR_ERR(inode); + goto release_sb; + } + + root = d_make_root(inode); + if (!root) { + retval = -ENOMEM; + goto release_sb; + } + sb->s_root = root; + if (v9fs_proto_dotl(v9ses)) { + struct p9_stat_dotl *st = NULL; + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto release_sb; + } + d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode_dotl(st, d_inode(root)); + kfree(st); + } else { + struct p9_wstat *st = NULL; + st = p9_client_stat(fid); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto release_sb; + } + + d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, d_inode(root), sb); + + p9stat_free(st); + kfree(st); + } + retval = v9fs_get_acl(inode, fid); + if (retval) + goto release_sb; + v9fs_fid_add(root, fid); + + p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); + return dget(sb->s_root); + +clunk_fid: + p9_client_clunk(fid); +close_session: + v9fs_session_close(v9ses); + kfree(v9ses); + return ERR_PTR(retval); + +release_sb: + /* + * we will do the session_close and root dentry release + * in the below call. But we need to clunk fid, because we haven't + * attached the fid to dentry so it won't get clunked + * automatically. + */ + p9_client_clunk(fid); + deactivate_locked_super(sb); + return ERR_PTR(retval); +} + +/** + * v9fs_kill_super - Kill Superblock + * @s: superblock + * + */ + +static void v9fs_kill_super(struct super_block *s) +{ + struct v9fs_session_info *v9ses = s->s_fs_info; + + p9_debug(P9_DEBUG_VFS, " %p\n", s); + + kill_anon_super(s); + + v9fs_session_cancel(v9ses); + v9fs_session_close(v9ses); + kfree(v9ses); + s->s_fs_info = NULL; + p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); +} + +static void +v9fs_umount_begin(struct super_block *sb) +{ + struct v9fs_session_info *v9ses; + + v9ses = sb->s_fs_info; + v9fs_session_begin_cancel(v9ses); +} + +static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_rstatfs rs; + int res; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + res = PTR_ERR(fid); + goto done; + } + + v9ses = v9fs_dentry2v9ses(dentry); + if (v9fs_proto_dotl(v9ses)) { + res = p9_client_statfs(fid, &rs); + if (res == 0) { + buf->f_type = rs.type; + buf->f_bsize = rs.bsize; + buf->f_blocks = rs.blocks; + buf->f_bfree = rs.bfree; + buf->f_bavail = rs.bavail; + buf->f_files = rs.files; + buf->f_ffree = rs.ffree; + buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = rs.namelen; + } + if (res != -ENOSYS) + goto done; + } + res = simple_statfs(dentry, buf); +done: + return res; +} + +static int v9fs_drop_inode(struct inode *inode) +{ + struct v9fs_session_info *v9ses; + v9ses = v9fs_inode2v9ses(inode); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + return generic_drop_inode(inode); + /* + * in case of non cached mode always drop the + * the inode because we want the inode attribute + * to always match that on the server. + */ + return 1; +} + +static int v9fs_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + int ret; + struct p9_wstat wstat; + struct v9fs_inode *v9inode; + /* + * send an fsync request to server irrespective of + * wbc->sync_mode. + */ + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + v9inode = V9FS_I(inode); + if (!v9inode->writeback_fid) + return 0; + v9fs_blank_wstat(&wstat); + + ret = p9_client_wstat(v9inode->writeback_fid, &wstat); + if (ret < 0) { + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; + } + return 0; +} + +static int v9fs_write_inode_dotl(struct inode *inode, + struct writeback_control *wbc) +{ + int ret; + struct v9fs_inode *v9inode; + /* + * send an fsync request to server irrespective of + * wbc->sync_mode. + */ + v9inode = V9FS_I(inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n", + __func__, inode, v9inode->writeback_fid); + if (!v9inode->writeback_fid) + return 0; + + ret = p9_client_fsync(v9inode->writeback_fid, 0); + if (ret < 0) { + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; + } + return 0; +} + +static const struct super_operations v9fs_super_ops = { + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, + .statfs = simple_statfs, + .evict_inode = v9fs_evict_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, + .write_inode = v9fs_write_inode, +}; + +static const struct super_operations v9fs_super_ops_dotl = { + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, + .statfs = v9fs_statfs, + .drop_inode = v9fs_drop_inode, + .evict_inode = v9fs_evict_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, + .write_inode = v9fs_write_inode_dotl, +}; + +struct file_system_type v9fs_fs_type = { + .name = "9p", + .mount = v9fs_mount, + .kill_sb = v9fs_kill_super, + .owner = THIS_MODULE, + .fs_flags = FS_RENAME_DOES_D_MOVE, +}; +MODULE_ALIAS_FS("9p"); diff --git a/kernel/fs/9p/xattr.c b/kernel/fs/9p/xattr.c new file mode 100644 index 000000000..0cf44b6cc --- /dev/null +++ b/kernel/fs/9p/xattr.c @@ -0,0 +1,151 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "fid.h" +#include "xattr.h" + +ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, + void *buffer, size_t buffer_size) +{ + ssize_t retval; + u64 attr_size; + struct p9_fid *attr_fid; + struct kvec kvec = {.iov_base = buffer, .iov_len = buffer_size}; + struct iov_iter to; + int err; + + iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size); + + attr_fid = p9_client_xattrwalk(fid, name, &attr_size); + if (IS_ERR(attr_fid)) { + retval = PTR_ERR(attr_fid); + p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n", + retval); + return retval; + } + if (attr_size > buffer_size) { + if (!buffer_size) /* request to get the attr_size */ + retval = attr_size; + else + retval = -ERANGE; + } else { + iov_iter_truncate(&to, attr_size); + retval = p9_client_read(attr_fid, 0, &to, &err); + if (err) + retval = err; + } + p9_client_clunk(attr_fid); + return retval; +} + + +/* + * v9fs_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t buffer_size) +{ + struct p9_fid *fid; + + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n", + name, buffer_size); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + return v9fs_fid_xattr_get(fid, name, buffer, buffer_size); +} + +/* + * v9fs_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int v9fs_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t value_len, int flags) +{ + struct p9_fid *fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + return v9fs_fid_xattr_set(fid, name, value, value_len, flags); +} + +int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, + const void *value, size_t value_len, int flags) +{ + struct kvec kvec = {.iov_base = (void *)value, .iov_len = value_len}; + struct iov_iter from; + int retval; + + iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len); + + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", + name, value_len, flags); + + /* Clone it */ + fid = p9_client_walk(fid, 0, NULL, 1); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* + * On success fid points to xattr + */ + retval = p9_client_xattrcreate(fid, name, value_len, flags); + if (retval < 0) + p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", + retval); + else + p9_client_write(fid, 0, &from, &retval); + p9_client_clunk(fid); + return retval; +} + +ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); +} + +const struct xattr_handler *v9fs_xattr_handlers[] = { + &v9fs_xattr_user_handler, + &v9fs_xattr_trusted_handler, +#ifdef CONFIG_9P_FS_POSIX_ACL + &v9fs_xattr_acl_access_handler, + &v9fs_xattr_acl_default_handler, +#endif +#ifdef CONFIG_9P_FS_SECURITY + &v9fs_xattr_security_handler, +#endif + NULL +}; diff --git a/kernel/fs/9p/xattr.h b/kernel/fs/9p/xattr.h new file mode 100644 index 000000000..d3e2ea384 --- /dev/null +++ b/kernel/fs/9p/xattr.h @@ -0,0 +1,37 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ +#ifndef FS_9P_XATTR_H +#define FS_9P_XATTR_H + +#include +#include +#include + +extern const struct xattr_handler *v9fs_xattr_handlers[]; +extern struct xattr_handler v9fs_xattr_user_handler; +extern struct xattr_handler v9fs_xattr_trusted_handler; +extern struct xattr_handler v9fs_xattr_security_handler; +extern const struct xattr_handler v9fs_xattr_acl_access_handler; +extern const struct xattr_handler v9fs_xattr_acl_default_handler; + +extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *, + void *, size_t); +extern ssize_t v9fs_xattr_get(struct dentry *, const char *, + void *, size_t); +extern int v9fs_fid_xattr_set(struct p9_fid *, const char *, + const void *, size_t, int); +extern int v9fs_xattr_set(struct dentry *, const char *, + const void *, size_t, int); +extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); +#endif /* FS_9P_XATTR_H */ diff --git a/kernel/fs/9p/xattr_security.c b/kernel/fs/9p/xattr_security.c new file mode 100644 index 000000000..cb247a142 --- /dev/null +++ b/kernel/fs/9p/xattr_security.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + + +#include +#include +#include +#include +#include "xattr.h" + +static int v9fs_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = v9fs_xattr_security_get, + .set = v9fs_xattr_security_set, +}; diff --git a/kernel/fs/9p/xattr_trusted.c b/kernel/fs/9p/xattr_trusted.c new file mode 100644 index 000000000..e30d33b8a --- /dev/null +++ b/kernel/fs/9p/xattr_trusted.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + + +#include +#include +#include +#include +#include "xattr.h" + +static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = v9fs_xattr_trusted_get, + .set = v9fs_xattr_trusted_set, +}; diff --git a/kernel/fs/9p/xattr_user.c b/kernel/fs/9p/xattr_user.c new file mode 100644 index 000000000..d0b701b72 --- /dev/null +++ b/kernel/fs/9p/xattr_user.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + + +#include +#include +#include +#include +#include "xattr.h" + +static int v9fs_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = v9fs_xattr_user_get, + .set = v9fs_xattr_user_set, +}; diff --git a/kernel/fs/Kconfig b/kernel/fs/Kconfig new file mode 100644 index 000000000..011f43365 --- /dev/null +++ b/kernel/fs/Kconfig @@ -0,0 +1,281 @@ +# +# File system configuration +# + +menu "File systems" + +# Use unaligned word dcache accesses +config DCACHE_WORD_ACCESS + bool + +if BLOCK + +source "fs/ext2/Kconfig" +source "fs/ext3/Kconfig" +source "fs/ext4/Kconfig" +source "fs/jbd/Kconfig" +source "fs/jbd2/Kconfig" + +config FS_MBCACHE +# Meta block cache for Extended Attributes (ext2/ext3/ext4) + tristate + default y if EXT2_FS=y && EXT2_FS_XATTR + default y if EXT3_FS=y && EXT3_FS_XATTR + default y if EXT4_FS=y + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS + +source "fs/reiserfs/Kconfig" +source "fs/jfs/Kconfig" + +source "fs/xfs/Kconfig" +source "fs/gfs2/Kconfig" +source "fs/ocfs2/Kconfig" +source "fs/btrfs/Kconfig" +source "fs/nilfs2/Kconfig" +source "fs/f2fs/Kconfig" + +config FS_DAX + bool "Direct Access (DAX) support" + depends on MMU + depends on !(ARM || MIPS || SPARC) + help + Direct Access (DAX) can be used on memory-backed block devices. + If the block device supports DAX and the filesystem supports DAX, + then you can avoid using the pagecache to buffer I/Os. Turning + on this option will compile in support for DAX; you will need to + mount the filesystem using the -o dax option. + + If you do not have a block device that is capable of using this, + or if unsure, say N. Saying Y will increase the size of the kernel + by about 5kB. + +endif # BLOCK + +# Posix ACL utility routines +# +# Note: Posix ACLs can be implemented without these helpers. Never use +# this symbol for ifdefs in core code. +# +config FS_POSIX_ACL + def_bool n + +config EXPORTFS + tristate + +config FILE_LOCKING + bool "Enable POSIX file locking API" if EXPERT + default y + help + This option enables standard file locking support, required + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + +source "fs/notify/Kconfig" + +source "fs/quota/Kconfig" + +source "fs/autofs4/Kconfig" +source "fs/fuse/Kconfig" +source "fs/overlayfs/Kconfig" + +menu "Caches" + +source "fs/fscache/Kconfig" +source "fs/cachefiles/Kconfig" + +endmenu + +if BLOCK +menu "CD-ROM/DVD Filesystems" + +source "fs/isofs/Kconfig" +source "fs/udf/Kconfig" + +endmenu +endif # BLOCK + +if BLOCK +menu "DOS/FAT/NT Filesystems" + +source "fs/fat/Kconfig" +source "fs/ntfs/Kconfig" + +endmenu +endif # BLOCK + +menu "Pseudo filesystems" + +source "fs/proc/Kconfig" +source "fs/kernfs/Kconfig" +source "fs/sysfs/Kconfig" + +config TMPFS + bool "Tmpfs virtual memory file system support (former shm fs)" + depends on SHMEM + help + Tmpfs is a file system which keeps all files in virtual memory. + + Everything in tmpfs is temporary in the sense that no files will be + created on your hard drive. The files live in memory and swap + space. If you unmount a tmpfs instance, everything stored therein is + lost. + + See for details. + +config TMPFS_POSIX_ACL + bool "Tmpfs POSIX Access Control Lists" + depends on TMPFS + select TMPFS_XATTR + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support additional access rights + for users and groups beyond the standard owner/group/world scheme, + and this option selects support for ACLs specifically for tmpfs + filesystems. + + If you've selected TMPFS, it's possible that you'll also need + this option as there are a number of Linux distros that require + POSIX ACL support under /dev for certain features to work properly. + For example, some distros need this feature for ALSA-related /dev + files for sound to work properly. In short, if you're not sure, + say Y. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + +config TMPFS_XATTR + bool "Tmpfs extended attributes" + depends on TMPFS + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + Currently this enables support for the trusted.* and + security.* namespaces. + + You need this for POSIX ACL support on tmpfs. + + If unsure, say N. + +config HUGETLBFS + bool "HugeTLB file system support" + depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ + SYS_SUPPORTS_HUGETLBFS || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + for details. + + If unsure, say N. + +config HUGETLB_PAGE + def_bool HUGETLBFS + +source "fs/configfs/Kconfig" +source "fs/efivarfs/Kconfig" + +endmenu + +menuconfig MISC_FILESYSTEMS + bool "Miscellaneous filesystems" + default y + ---help--- + Say Y here to get to see options for various miscellaneous + filesystems, such as filesystems that came from other + operating systems. + + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if MISC_FILESYSTEMS + +source "fs/adfs/Kconfig" +source "fs/affs/Kconfig" +source "fs/ecryptfs/Kconfig" +source "fs/hfs/Kconfig" +source "fs/hfsplus/Kconfig" +source "fs/befs/Kconfig" +source "fs/bfs/Kconfig" +source "fs/efs/Kconfig" +source "fs/jffs2/Kconfig" +# UBIFS File system configuration +source "fs/ubifs/Kconfig" +source "fs/logfs/Kconfig" +source "fs/cramfs/Kconfig" +source "fs/squashfs/Kconfig" +source "fs/freevxfs/Kconfig" +source "fs/minix/Kconfig" +source "fs/omfs/Kconfig" +source "fs/hpfs/Kconfig" +source "fs/qnx4/Kconfig" +source "fs/qnx6/Kconfig" +source "fs/romfs/Kconfig" +source "fs/pstore/Kconfig" +source "fs/sysv/Kconfig" +source "fs/ufs/Kconfig" +source "fs/exofs/Kconfig" + +endif # MISC_FILESYSTEMS + +source "fs/exofs/Kconfig.ore" + +menuconfig NETWORK_FILESYSTEMS + bool "Network File Systems" + default y + depends on NET + ---help--- + Say Y here to get to see options for network filesystems and + filesystem-related networking code, such as NFS daemon and + RPCSEC security modules. + + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if NETWORK_FILESYSTEMS + +source "fs/nfs/Kconfig" +source "fs/nfsd/Kconfig" + +config GRACE_PERIOD + tristate + +config LOCKD + tristate + depends on FILE_LOCKING + select GRACE_PERIOD + +config LOCKD_V4 + bool + depends on NFSD_V3 || NFS_V3 + depends on FILE_LOCKING + default y + +config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL + +config NFS_COMMON + bool + depends on NFSD || NFS_FS || LOCKD + default y + +source "net/sunrpc/Kconfig" +source "fs/ceph/Kconfig" +source "fs/cifs/Kconfig" +source "fs/ncpfs/Kconfig" +source "fs/coda/Kconfig" +source "fs/afs/Kconfig" +source "fs/9p/Kconfig" + +endif # NETWORK_FILESYSTEMS + +source "fs/nls/Kconfig" +source "fs/dlm/Kconfig" + +endmenu diff --git a/kernel/fs/Kconfig.binfmt b/kernel/fs/Kconfig.binfmt new file mode 100644 index 000000000..2d0cbbd14 --- /dev/null +++ b/kernel/fs/Kconfig.binfmt @@ -0,0 +1,181 @@ +config BINFMT_ELF + bool "Kernel support for ELF binaries" + depends on MMU && (BROKEN || !FRV) + default y + ---help--- + ELF (Executable and Linkable Format) is a format for libraries and + executables used across different architectures and operating + systems. Saying Y here will enable your kernel to run ELF binaries + and enlarge it by about 13 KB. ELF support under Linux has now all + but replaced the traditional Linux a.out formats (QMAGIC and ZMAGIC) + because it is portable (this does *not* mean that you will be able + to run executables from different architectures or operating systems + however) and makes building run-time libraries very easy. Many new + executables are distributed solely in ELF format. You definitely + want to say Y here. + + Information about ELF is contained in the ELF HOWTO available from + . + + If you find that after upgrading from Linux kernel 1.2 and saying Y + here, you still can't run any ELF binaries (they just crash), then + you'll have to install the newest ELF runtime libraries, including + ld.so (check the file for location and + latest version). + +config COMPAT_BINFMT_ELF + bool + depends on COMPAT && BINFMT_ELF + +config ARCH_BINFMT_ELF_STATE + bool + +config BINFMT_ELF_FDPIC + bool "Kernel support for FDPIC ELF binaries" + default y + depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + help + ELF FDPIC binaries are based on ELF, but allow the individual load + segments of a binary to be located in memory independently of each + other. This makes this format ideal for use in environments where no + MMU is available as it still permits text segments to be shared, + even if data segments are not. + + It is also possible to run FDPIC ELF binaries on MMU linux also. + +config CORE_DUMP_DEFAULT_ELF_HEADERS + bool "Write ELF core dumps with partial segments" + default y + depends on BINFMT_ELF && ELF_CORE + help + ELF core dump files describe each memory mapping of the crashed + process, and can contain or omit the memory contents of each one. + The contents of an unmodified text mapping are omitted by default. + + For an unmodified text mapping of an ELF object, including just + the first page of the file in a core dump makes it possible to + identify the build ID bits in the file, without paying the i/o + cost and disk space to dump all the text. However, versions of + GDB before 6.7 are confused by ELF core dump files in this format. + + The core dump behavior can be controlled per process using + the /proc/PID/coredump_filter pseudo-file; this setting is + inherited. See Documentation/filesystems/proc.txt for details. + + This config option changes the default setting of coredump_filter + seen at boot time. If unsure, say Y. + +config BINFMT_SCRIPT + tristate "Kernel support for scripts starting with #!" + default y + help + Say Y here if you want to execute interpreted scripts starting with + #! followed by the path to an interpreter. + + You can build this support as a module; however, until that module + gets loaded, you cannot run scripts. Thus, if you want to load this + module from an initramfs, the portion of the initramfs before loading + this module must consist of compiled binaries only. + + Most systems will not boot if you say M or N here. If unsure, say Y. + +config BINFMT_FLAT + bool "Kernel support for flat binaries" + depends on !MMU && (!FRV || BROKEN) + help + Support uClinux FLAT format binaries. + +config BINFMT_ZFLAT + bool "Enable ZFLAT support" + depends on BINFMT_FLAT + select ZLIB_INFLATE + help + Support FLAT format compressed binaries + +config BINFMT_SHARED_FLAT + bool "Enable shared FLAT support" + depends on BINFMT_FLAT + help + Support FLAT shared libraries + +config HAVE_AOUT + def_bool n + +config BINFMT_AOUT + tristate "Kernel support for a.out and ECOFF binaries" + depends on HAVE_AOUT + ---help--- + A.out (Assembler.OUTput) is a set of formats for libraries and + executables used in the earliest versions of UNIX. Linux used + the a.out formats QMAGIC and ZMAGIC until they were replaced + with the ELF format. + + The conversion to ELF started in 1995. This option is primarily + provided for historical interest and for the benefit of those + who need to run binaries from that era. + + Most people should answer N here. If you think you may have + occasional use for this format, enable module support above + and answer M here to compile this support as a module called + binfmt_aout. + + If any crucial components of your system (such as /sbin/init + or /lib/ld.so) are still in a.out format, you will have to + say Y here. + +config OSF4_COMPAT + bool "OSF/1 v4 readv/writev compatibility" + depends on ALPHA && BINFMT_AOUT + help + Say Y if you are using OSF/1 binaries (like Netscape and Acrobat) + with v4 shared libraries freely available from Compaq. If you're + going to use shared libraries from Tru64 version 5.0 or later, say N. + +config BINFMT_EM86 + tristate "Kernel support for Linux/Intel ELF binaries" + depends on ALPHA + ---help--- + Say Y here if you want to be able to execute Linux/Intel ELF + binaries just like native Alpha binaries on your Alpha machine. For + this to work, you need to have the emulator /usr/bin/em86 in place. + + You can get the same functionality by saying N here and saying Y to + "Kernel support for MISC binaries". + + You may answer M to compile the emulation support as a module and + later load the module when you want to use a Linux/Intel binary. The + module will be called binfmt_em86. If unsure, say Y. + +config BINFMT_MISC + tristate "Kernel support for MISC binaries" + ---help--- + If you say Y here, it will be possible to plug wrapper-driven binary + formats into the kernel. You will like this especially when you use + programs that need an interpreter to run like Java, Python, .NET or + Emacs-Lisp. It's also useful if you often run DOS executables under + the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, available from + ). Once you have + registered such a binary class with the kernel, you can start one of + those programs simply by typing in its name at a shell prompt; Linux + will automatically feed it to the correct interpreter. + + You can do other nice things, too. Read the file + to learn how to use this + feature, for information about how + to include Java support. and for + information about how to include Mono-based .NET support. + + To use binfmt_misc, you will need to mount it: + mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc + + You may say M here for module support and later load the module when + you have use for it; the module is called binfmt_misc. If you + don't know what to answer at this point, say Y. + +config COREDUMP + bool "Enable core dump support" if EXPERT + default y + help + This option enables support for performing core dumps. You almost + certainly want to say Y here. Not necessary on systems that never + need debugging or only ever run flawless code. diff --git a/kernel/fs/Makefile b/kernel/fs/Makefile new file mode 100644 index 000000000..cb92fd4c3 --- /dev/null +++ b/kernel/fs/Makefile @@ -0,0 +1,129 @@ +# +# Makefile for the Linux filesystems. +# +# 14 Sep 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. +# + +obj-y := open.o read_write.o file_table.o super.o \ + char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ + ioctl.o readdir.o select.o dcache.o inode.o \ + attr.o bad_inode.o file.o filesystems.o namespace.o \ + seq_file.o xattr.o libfs.o fs-writeback.o \ + pnode.o splice.o sync.o utimes.o \ + stack.o fs_struct.o statfs.o fs_pin.o nsfs.o + +ifeq ($(CONFIG_BLOCK),y) +obj-y += buffer.o block_dev.o direct-io.o mpage.o +else +obj-y += no-block.o +endif + +obj-$(CONFIG_PROC_FS) += proc_namespace.o + +obj-y += notify/ +obj-$(CONFIG_EPOLL) += eventpoll.o +obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-$(CONFIG_SIGNALFD) += signalfd.o +obj-$(CONFIG_TIMERFD) += timerfd.o +obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FS_DAX) += dax.o +obj-$(CONFIG_FILE_LOCKING) += locks.o +obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o +obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o +obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o +obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o +obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o +obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o +obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o +obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o +obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +obj-$(CONFIG_FS_MBCACHE) += mbcache.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o +obj-$(CONFIG_NFS_COMMON) += nfs_common/ +obj-$(CONFIG_COREDUMP) += coredump.o +obj-$(CONFIG_SYSCTL) += drop_caches.o + +obj-$(CONFIG_FHANDLE) += fhandle.o + +obj-y += quota/ + +obj-$(CONFIG_PROC_FS) += proc/ +obj-$(CONFIG_KERNFS) += kernfs/ +obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_CONFIGFS_FS) += configfs/ +obj-y += devpts/ + +obj-$(CONFIG_PROFILING) += dcookies.o +obj-$(CONFIG_DLM) += dlm/ + +# Do not add any filesystems before this line +obj-$(CONFIG_FSCACHE) += fscache/ +obj-$(CONFIG_REISERFS_FS) += reiserfs/ +obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 +obj-$(CONFIG_EXT2_FS) += ext2/ +# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 +# unless explicitly requested by rootfstype +obj-$(CONFIG_EXT4_FS) += ext4/ +obj-$(CONFIG_JBD) += jbd/ +obj-$(CONFIG_JBD2) += jbd2/ +obj-$(CONFIG_CRAMFS) += cramfs/ +obj-$(CONFIG_SQUASHFS) += squashfs/ +obj-y += ramfs/ +obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ +obj-$(CONFIG_CODA_FS) += coda/ +obj-$(CONFIG_MINIX_FS) += minix/ +obj-$(CONFIG_FAT_FS) += fat/ +obj-$(CONFIG_BFS_FS) += bfs/ +obj-$(CONFIG_ISO9660_FS) += isofs/ +obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ +obj-$(CONFIG_HFS_FS) += hfs/ +obj-$(CONFIG_ECRYPT_FS) += ecryptfs/ +obj-$(CONFIG_VXFS_FS) += freevxfs/ +obj-$(CONFIG_NFS_FS) += nfs/ +obj-$(CONFIG_EXPORTFS) += exportfs/ +obj-$(CONFIG_NFSD) += nfsd/ +obj-$(CONFIG_LOCKD) += lockd/ +obj-$(CONFIG_NLS) += nls/ +obj-$(CONFIG_SYSV_FS) += sysv/ +obj-$(CONFIG_CIFS) += cifs/ +obj-$(CONFIG_NCP_FS) += ncpfs/ +obj-$(CONFIG_HPFS_FS) += hpfs/ +obj-$(CONFIG_NTFS_FS) += ntfs/ +obj-$(CONFIG_UFS_FS) += ufs/ +obj-$(CONFIG_EFS_FS) += efs/ +obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_LOGFS) += logfs/ +obj-$(CONFIG_UBIFS_FS) += ubifs/ +obj-$(CONFIG_AFFS_FS) += affs/ +obj-$(CONFIG_ROMFS_FS) += romfs/ +obj-$(CONFIG_QNX4FS_FS) += qnx4/ +obj-$(CONFIG_QNX6FS_FS) += qnx6/ +obj-$(CONFIG_AUTOFS4_FS) += autofs4/ +obj-$(CONFIG_ADFS_FS) += adfs/ +obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_OVERLAY_FS) += overlayfs/ +obj-$(CONFIG_UDF_FS) += udf/ +obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ +obj-$(CONFIG_OMFS_FS) += omfs/ +obj-$(CONFIG_JFS_FS) += jfs/ +obj-$(CONFIG_XFS_FS) += xfs/ +obj-$(CONFIG_9P_FS) += 9p/ +obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_NILFS2_FS) += nilfs2/ +obj-$(CONFIG_BEFS_FS) += befs/ +obj-$(CONFIG_HOSTFS) += hostfs/ +obj-$(CONFIG_HPPFS) += hppfs/ +obj-$(CONFIG_CACHEFILES) += cachefiles/ +obj-$(CONFIG_DEBUG_FS) += debugfs/ +obj-$(CONFIG_TRACING) += tracefs/ +obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_BTRFS_FS) += btrfs/ +obj-$(CONFIG_GFS2_FS) += gfs2/ +obj-$(CONFIG_F2FS_FS) += f2fs/ +obj-y += exofs/ # Multiple modules +obj-$(CONFIG_CEPH_FS) += ceph/ +obj-$(CONFIG_PSTORE) += pstore/ +obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/kernel/fs/adfs/Kconfig b/kernel/fs/adfs/Kconfig new file mode 100644 index 000000000..c5a7787dd --- /dev/null +++ b/kernel/fs/adfs/Kconfig @@ -0,0 +1,27 @@ +config ADFS_FS + tristate "ADFS file system support" + depends on BLOCK + help + The Acorn Disc Filing System is the standard file system of the + RiscOS operating system which runs on Acorn's ARM-based Risc PC + systems and the Acorn Archimedes range of machines. If you say Y + here, Linux will be able to read from ADFS partitions on hard drives + and from ADFS-formatted floppy discs. If you also want to be able to + write to those devices, say Y to "ADFS write support" below. + + The ADFS partition should be the first partition (i.e., + /dev/[hs]d?1) on each of your drives. Please read the file + for further details. + + To compile this code as a module, choose M here: the module will be + called adfs. + + If unsure, say N. + +config ADFS_FS_RW + bool "ADFS write support (DANGEROUS)" + depends on ADFS_FS + help + If you say Y here, you will be able to write to ADFS partitions on + hard drives and ADFS-formatted floppy disks. This is experimental + codes, so if you're unsure, say N. diff --git a/kernel/fs/adfs/Makefile b/kernel/fs/adfs/Makefile new file mode 100644 index 000000000..9b2d71a9a --- /dev/null +++ b/kernel/fs/adfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux adfs filesystem routines. +# + +obj-$(CONFIG_ADFS_FS) += adfs.o + +adfs-objs := dir.o dir_f.o dir_fplus.o file.o inode.o map.o super.o diff --git a/kernel/fs/adfs/adfs.h b/kernel/fs/adfs/adfs.h new file mode 100644 index 000000000..24575d9d8 --- /dev/null +++ b/kernel/fs/adfs/adfs.h @@ -0,0 +1,208 @@ +#include +#include + +/* Internal data structures for ADFS */ + +#define ADFS_FREE_FRAG 0 +#define ADFS_BAD_FRAG 1 +#define ADFS_ROOT_FRAG 2 + +#define ADFS_NDA_OWNER_READ (1 << 0) +#define ADFS_NDA_OWNER_WRITE (1 << 1) +#define ADFS_NDA_LOCKED (1 << 2) +#define ADFS_NDA_DIRECTORY (1 << 3) +#define ADFS_NDA_EXECUTE (1 << 4) +#define ADFS_NDA_PUBLIC_READ (1 << 5) +#define ADFS_NDA_PUBLIC_WRITE (1 << 6) + +#include "dir_f.h" + +struct buffer_head; + +/* + * adfs file system inode data in memory + */ +struct adfs_inode_info { + loff_t mmu_private; + unsigned long parent_id; /* object id of parent */ + __u32 loadaddr; /* RISC OS load address */ + __u32 execaddr; /* RISC OS exec address */ + unsigned int filetype; /* RISC OS file type */ + unsigned int attr; /* RISC OS permissions */ + unsigned int stamped:1; /* RISC OS file has date/time */ + struct inode vfs_inode; +}; + +/* + * Forward-declare this + */ +struct adfs_discmap; +struct adfs_dir_ops; + +/* + * ADFS file system superblock data in memory + */ +struct adfs_sb_info { + union { struct { + struct adfs_discmap *s_map; /* bh list containing map */ + struct adfs_dir_ops *s_dir; /* directory operations */ + }; + struct rcu_head rcu; /* used only at shutdown time */ + }; + kuid_t s_uid; /* owner uid */ + kgid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ + int s_ftsuffix; /* ,xyz hex filetype suffix option */ + + __u32 s_ids_per_zone; /* max. no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector */ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ + unsigned int s_namelen; /* maximum number of characters in name */ +}; + +static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct adfs_inode_info *ADFS_I(struct inode *inode) +{ + return container_of(inode, struct adfs_inode_info, vfs_inode); +} + +/* + * Directory handling + */ +struct adfs_dir { + struct super_block *sb; + + int nr_buffers; + struct buffer_head *bh[4]; + + /* big directories need allocated buffers */ + struct buffer_head **bh_fplus; + + unsigned int pos; + unsigned int parent_id; + + struct adfs_dirheader dirhead; + union adfs_dirtail dirtail; +}; + +/* + * This is the overall maximum name length + */ +#define ADFS_MAX_NAME_LEN (256 + 4) /* +4 for ,xyz hex filetype suffix */ +struct object_info { + __u32 parent_id; /* parent object id */ + __u32 file_id; /* object id */ + __u32 loadaddr; /* load address */ + __u32 execaddr; /* execution address */ + __u32 size; /* size */ + __u8 attr; /* RISC OS attributes */ + unsigned int name_len; /* name length */ + char name[ADFS_MAX_NAME_LEN];/* file name */ + + /* RISC OS file type (12-bit: derived from loadaddr) */ + __u16 filetype; +}; + +/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */ +static inline int append_filetype_suffix(char *buf, __u16 filetype) +{ + if (filetype == 0xffff) /* no explicit 12-bit file type was set */ + return 0; + + *buf++ = ','; + *buf++ = hex_asc_lo(filetype >> 8); + *buf++ = hex_asc_lo(filetype >> 4); + *buf++ = hex_asc_lo(filetype >> 0); + return 4; +} + +struct adfs_dir_ops { + int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir); + int (*setpos)(struct adfs_dir *dir, unsigned int fpos); + int (*getnext)(struct adfs_dir *dir, struct object_info *obj); + int (*update)(struct adfs_dir *dir, struct object_info *obj); + int (*create)(struct adfs_dir *dir, struct object_info *obj); + int (*remove)(struct adfs_dir *dir, struct object_info *obj); + int (*sync)(struct adfs_dir *dir); + void (*free)(struct adfs_dir *dir); +}; + +struct adfs_discmap { + struct buffer_head *dm_bh; + __u32 dm_startblk; + unsigned int dm_startbit; + unsigned int dm_endbit; +}; + +/* Inode stuff */ +struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); +int adfs_notify_change(struct dentry *dentry, struct iattr *attr); + +/* map.c */ +extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigned int offset); +extern unsigned int adfs_map_free(struct super_block *sb); + +/* Misc */ +__printf(3, 4) +void __adfs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) + +/* super.c */ + +/* + * Inodes and file operations + */ + +/* dir_*.c */ +extern const struct inode_operations adfs_dir_inode_operations; +extern const struct file_operations adfs_dir_operations; +extern const struct dentry_operations adfs_dentry_operations; +extern struct adfs_dir_ops adfs_f_dir_ops; +extern struct adfs_dir_ops adfs_fplus_dir_ops; + +extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, + int wait); + +/* file.c */ +extern const struct inode_operations adfs_file_inode_operations; +extern const struct file_operations adfs_file_operations; + +static inline __u32 signed_asl(__u32 val, signed int shift) +{ + if (shift >= 0) + val <<= shift; + else + val >>= -shift; + return val; +} + +/* + * Calculate the address of a block in an object given the block offset + * and the object identity. + * + * The root directory ID should always be looked up in the map [3.4] + */ +static inline int +__adfs_block_map(struct super_block *sb, unsigned int object_id, + unsigned int block) +{ + if (object_id & 255) { + unsigned int off; + + off = (object_id & 255) - 1; + block += off << ADFS_SB(sb)->s_log2sharesize; + } + + return adfs_map_lookup(sb, object_id >> 8, block); +} diff --git a/kernel/fs/adfs/dir.c b/kernel/fs/adfs/dir.c new file mode 100644 index 000000000..51c279a29 --- /dev/null +++ b/kernel/fs/adfs/dir.c @@ -0,0 +1,288 @@ +/* + * linux/fs/adfs/dir.c + * + * Copyright (C) 1999-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Common directory handling for ADFS + */ +#include "adfs.h" + +/* + * For future. This should probably be per-directory. + */ +static DEFINE_RWLOCK(adfs_dir_lock); + +static int +adfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct object_info obj; + struct adfs_dir dir; + int ret = 0; + + if (ctx->pos >> 32) + return 0; + + ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + if (ret) + return ret; + + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + goto free_out; + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR)) + goto free_out; + ctx->pos = 2; + } + + read_lock(&adfs_dir_lock); + + ret = ops->setpos(&dir, ctx->pos - 2); + if (ret) + goto unlock_out; + while (ops->getnext(&dir, &obj) == 0) { + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.file_id, DT_UNKNOWN)) + break; + ctx->pos++; + } + +unlock_out: + read_unlock(&adfs_dir_lock); + +free_out: + ops->free(&dir); + return ret; +} + +int +adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) +{ + int ret = -EINVAL; +#ifdef CONFIG_ADFS_FS_RW + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct adfs_dir dir; + + printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n", + obj->file_id, obj->parent_id); + + if (!ops->update) { + ret = -EINVAL; + goto out; + } + + ret = ops->read(sb, obj->parent_id, 0, &dir); + if (ret) + goto out; + + write_lock(&adfs_dir_lock); + ret = ops->update(&dir, obj); + write_unlock(&adfs_dir_lock); + + if (wait) { + int err = ops->sync(&dir); + if (!ret) + ret = err; + } + + ops->free(&dir); +out: +#endif + return ret; +} + +static int +adfs_match(struct qstr *name, struct object_info *obj) +{ + int i; + + if (name->len != obj->name_len) + return 0; + + for (i = 0; i < name->len; i++) { + char c1, c2; + + c1 = name->name[i]; + c2 = obj->name[i]; + + if (c1 >= 'A' && c1 <= 'Z') + c1 += 'a' - 'A'; + if (c2 >= 'A' && c2 <= 'Z') + c2 += 'a' - 'A'; + + if (c1 != c2) + return 0; + } + return 1; +} + +static int +adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj) +{ + struct super_block *sb = inode->i_sb; + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct adfs_dir dir; + int ret; + + ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + if (ret) + goto out; + + if (ADFS_I(inode)->parent_id != dir.parent_id) { + adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n", + ADFS_I(inode)->parent_id, dir.parent_id); + ret = -EIO; + goto free_out; + } + + obj->parent_id = inode->i_ino; + + /* + * '.' is handled by reserved_lookup() in fs/namei.c + */ + if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') { + /* + * Currently unable to fill in the rest of 'obj', + * but this is better than nothing. We need to + * ascend one level to find it's parent. + */ + obj->name_len = 0; + obj->file_id = obj->parent_id; + goto free_out; + } + + read_lock(&adfs_dir_lock); + + ret = ops->setpos(&dir, 0); + if (ret) + goto unlock_out; + + ret = -ENOENT; + while (ops->getnext(&dir, obj) == 0) { + if (adfs_match(name, obj)) { + ret = 0; + break; + } + } + +unlock_out: + read_unlock(&adfs_dir_lock); + +free_out: + ops->free(&dir); +out: + return ret; +} + +const struct file_operations adfs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .iterate = adfs_readdir, + .fsync = generic_file_fsync, +}; + +static int +adfs_hash(const struct dentry *parent, struct qstr *qstr) +{ + const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; + const unsigned char *name; + unsigned long hash; + int i; + + if (qstr->len < name_len) + return 0; + + /* + * Truncate the name in place, avoids + * having to define a compare function. + */ + qstr->len = i = name_len; + name = qstr->name; + hash = init_name_hash(); + while (i--) { + char c; + + c = *name++; + if (c >= 'A' && c <= 'Z') + c += 'a' - 'A'; + + hash = partial_name_hash(c, hash); + } + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare two names, taking note of the name length + * requirements of the underlying filesystem. + */ +static int +adfs_compare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + int i; + + if (len != name->len) + return 1; + + for (i = 0; i < name->len; i++) { + char a, b; + + a = str[i]; + b = name->name[i]; + + if (a >= 'A' && a <= 'Z') + a += 'a' - 'A'; + if (b >= 'A' && b <= 'Z') + b += 'a' - 'A'; + + if (a != b) + return 1; + } + return 0; +} + +const struct dentry_operations adfs_dentry_operations = { + .d_hash = adfs_hash, + .d_compare = adfs_compare, +}; + +static struct dentry * +adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct inode *inode = NULL; + struct object_info obj; + int error; + + error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); + if (error == 0) { + error = -EACCES; + /* + * This only returns NULL if get_empty_inode + * fails. + */ + inode = adfs_iget(dir->i_sb, &obj); + if (inode) + error = 0; + } + d_add(dentry, inode); + return ERR_PTR(error); +} + +/* + * directories can handle most operations... + */ +const struct inode_operations adfs_dir_inode_operations = { + .lookup = adfs_lookup, + .setattr = adfs_notify_change, +}; diff --git a/kernel/fs/adfs/dir_f.c b/kernel/fs/adfs/dir_f.c new file mode 100644 index 000000000..4bbe853ee --- /dev/null +++ b/kernel/fs/adfs/dir_f.c @@ -0,0 +1,486 @@ +/* + * linux/fs/adfs/dir_f.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * E and F format directory handling + */ +#include +#include "adfs.h" +#include "dir_f.h" + +static void adfs_f_free(struct adfs_dir *dir); + +/* + * Read an (unaligned) value of length 1..4 bytes + */ +static inline unsigned int adfs_readval(unsigned char *p, int len) +{ + unsigned int val = 0; + + switch (len) { + case 4: val |= p[3] << 24; + case 3: val |= p[2] << 16; + case 2: val |= p[1] << 8; + default: val |= p[0]; + } + return val; +} + +static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) +{ + switch (len) { + case 4: p[3] = val >> 24; + case 3: p[2] = val >> 16; + case 2: p[1] = val >> 8; + default: p[0] = val; + } +} + +static inline int adfs_readname(char *buf, char *ptr, int maxlen) +{ + char *old_buf = buf; + + while ((unsigned char)*ptr >= ' ' && maxlen--) { + if (*ptr == '/') + *buf++ = '.'; + else + *buf++ = *ptr; + ptr++; + } + + return buf - old_buf; +} + +#define ror13(v) ((v >> 13) | (v << 19)) + +#define dir_u8(idx) \ + ({ int _buf = idx >> blocksize_bits; \ + int _off = idx - (_buf << blocksize_bits);\ + *(u8 *)(bh[_buf]->b_data + _off); \ + }) + +#define dir_u32(idx) \ + ({ int _buf = idx >> blocksize_bits; \ + int _off = idx - (_buf << blocksize_bits);\ + *(__le32 *)(bh[_buf]->b_data + _off); \ + }) + +#define bufoff(_bh,_idx) \ + ({ int _buf = _idx >> blocksize_bits; \ + int _off = _idx - (_buf << blocksize_bits);\ + (u8 *)(_bh[_buf]->b_data + _off); \ + }) + +/* + * There are some algorithms that are nice in + * assembler, but a bitch in C... This is one + * of them. + */ +static u8 +adfs_dir_checkbyte(const struct adfs_dir *dir) +{ + struct buffer_head * const *bh = dir->bh; + const int blocksize_bits = dir->sb->s_blocksize_bits; + union { __le32 *ptr32; u8 *ptr8; } ptr, end; + u32 dircheck = 0; + int last = 5 - 26; + int i = 0; + + /* + * Accumulate each word up to the last whole + * word of the last directory entry. This + * can spread across several buffer heads. + */ + do { + last += 26; + do { + dircheck = le32_to_cpu(dir_u32(i)) ^ ror13(dircheck); + + i += sizeof(u32); + } while (i < (last & ~3)); + } while (dir_u8(last) != 0); + + /* + * Accumulate the last few bytes. These + * bytes will be within the same bh. + */ + if (i != last) { + ptr.ptr8 = bufoff(bh, i); + end.ptr8 = ptr.ptr8 + last - i; + + do { + dircheck = *ptr.ptr8++ ^ ror13(dircheck); + } while (ptr.ptr8 < end.ptr8); + } + + /* + * The directory tail is in the final bh + * Note that contary to the RISC OS PRMs, + * the first few bytes are NOT included + * in the check. All bytes are in the + * same bh. + */ + ptr.ptr8 = bufoff(bh, 2008); + end.ptr8 = ptr.ptr8 + 36; + + do { + __le32 v = *ptr.ptr32++; + dircheck = le32_to_cpu(v) ^ ror13(dircheck); + } while (ptr.ptr32 < end.ptr32); + + return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff; +} + +/* + * Read and check that a directory is valid + */ +static int +adfs_dir_read(struct super_block *sb, unsigned long object_id, + unsigned int size, struct adfs_dir *dir) +{ + const unsigned int blocksize_bits = sb->s_blocksize_bits; + int blk = 0; + + /* + * Directories which are not a multiple of 2048 bytes + * are considered bad v2 [3.6] + */ + if (size & 2047) + goto bad_dir; + + size >>= blocksize_bits; + + dir->nr_buffers = 0; + dir->sb = sb; + + for (blk = 0; blk < size; blk++) { + int phys; + + phys = __adfs_block_map(sb, object_id, blk); + if (!phys) { + adfs_error(sb, "dir object %lX has a hole at offset %d", + object_id, blk); + goto release_buffers; + } + + dir->bh[blk] = sb_bread(sb, phys); + if (!dir->bh[blk]) + goto release_buffers; + } + + memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); + memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + + if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || + memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) + goto bad_dir; + + if (memcmp(&dir->dirhead.startname, "Nick", 4) && + memcmp(&dir->dirhead.startname, "Hugo", 4)) + goto bad_dir; + + if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + goto bad_dir; + + dir->nr_buffers = blk; + + return 0; + +bad_dir: + adfs_error(sb, "corrupted directory fragment %lX", + object_id); +release_buffers: + for (blk -= 1; blk >= 0; blk -= 1) + brelse(dir->bh[blk]); + + dir->sb = NULL; + + return -EIO; +} + +/* + * convert a disk-based directory entry to a Linux ADFS directory entry + */ +static inline void +adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj, + struct adfs_direntry *de) +{ + obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN); + obj->file_id = adfs_readval(de->dirinddiscadd, 3); + obj->loadaddr = adfs_readval(de->dirload, 4); + obj->execaddr = adfs_readval(de->direxec, 4); + obj->size = adfs_readval(de->dirlen, 4); + obj->attr = de->newdiratts; + obj->filetype = -1; + + /* + * object is a file and is filetyped and timestamped? + * RISC OS 12-bit filetype is stored in load_address[19:8] + */ + if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) && + (0xfff00000 == (0xfff00000 & obj->loadaddr))) { + obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8); + + /* optionally append the ,xyz hex filetype suffix */ + if (ADFS_SB(dir->sb)->s_ftsuffix) + obj->name_len += + append_filetype_suffix( + &obj->name[obj->name_len], + obj->filetype); + } +} + +/* + * convert a Linux ADFS directory entry to a disk-based directory entry + */ +static inline void +adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj) +{ + adfs_writeval(de->dirinddiscadd, 3, obj->file_id); + adfs_writeval(de->dirload, 4, obj->loadaddr); + adfs_writeval(de->direxec, 4, obj->execaddr); + adfs_writeval(de->dirlen, 4, obj->size); + de->newdiratts = obj->attr; +} + +/* + * get a directory entry. Note that the caller is responsible + * for holding the relevant locks. + */ +static int +__adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + struct adfs_direntry de; + int thissize, buffer, offset; + + buffer = pos >> sb->s_blocksize_bits; + + if (buffer > dir->nr_buffers) + return -EINVAL; + + offset = pos & (sb->s_blocksize - 1); + thissize = sb->s_blocksize - offset; + if (thissize > 26) + thissize = 26; + + memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); + if (thissize != 26) + memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, + 26 - thissize); + + if (!de.dirobname[0]) + return -ENOENT; + + adfs_dir2obj(dir, obj, &de); + + return 0; +} + +static int +__adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + struct adfs_direntry de; + int thissize, buffer, offset; + + buffer = pos >> sb->s_blocksize_bits; + + if (buffer > dir->nr_buffers) + return -EINVAL; + + offset = pos & (sb->s_blocksize - 1); + thissize = sb->s_blocksize - offset; + if (thissize > 26) + thissize = 26; + + /* + * Get the entry in total + */ + memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); + if (thissize != 26) + memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, + 26 - thissize); + + /* + * update it + */ + adfs_obj2dir(&de, obj); + + /* + * Put the new entry back + */ + memcpy(dir->bh[buffer]->b_data + offset, &de, thissize); + if (thissize != 26) + memcpy(dir->bh[buffer + 1]->b_data, ((char *)&de) + thissize, + 26 - thissize); + + return 0; +} + +/* + * the caller is responsible for holding the necessary + * locks. + */ +static int +adfs_dir_find_entry(struct adfs_dir *dir, unsigned long object_id) +{ + int pos, ret; + + ret = -ENOENT; + + for (pos = 5; pos < ADFS_NUM_DIR_ENTRIES * 26 + 5; pos += 26) { + struct object_info obj; + + if (!__adfs_dir_get(dir, pos, &obj)) + break; + + if (obj.file_id == object_id) { + ret = pos; + break; + } + } + + return ret; +} + +static int +adfs_f_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +{ + int ret; + + if (sz != ADFS_NEWDIR_SIZE) + return -EIO; + + ret = adfs_dir_read(sb, id, sz, dir); + if (ret) + adfs_error(sb, "unable to read directory"); + else + dir->parent_id = adfs_readval(dir->dirtail.new.dirparent, 3); + + return ret; +} + +static int +adfs_f_setpos(struct adfs_dir *dir, unsigned int fpos) +{ + if (fpos >= ADFS_NUM_DIR_ENTRIES) + return -ENOENT; + + dir->pos = 5 + fpos * 26; + return 0; +} + +static int +adfs_f_getnext(struct adfs_dir *dir, struct object_info *obj) +{ + unsigned int ret; + + ret = __adfs_dir_get(dir, dir->pos, obj); + if (ret == 0) + dir->pos += 26; + + return ret; +} + +static int +adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + int ret, i; + + ret = adfs_dir_find_entry(dir, obj->file_id); + if (ret < 0) { + adfs_error(dir->sb, "unable to locate entry to update"); + goto out; + } + + __adfs_dir_put(dir, ret, obj); + + /* + * Increment directory sequence number + */ + dir->bh[0]->b_data[0] += 1; + dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 6] += 1; + + ret = adfs_dir_checkbyte(dir); + /* + * Update directory check byte + */ + dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 1] = ret; + +#if 1 + { + const unsigned int blocksize_bits = sb->s_blocksize_bits; + + memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); + memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + + if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || + memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) + goto bad_dir; + + if (memcmp(&dir->dirhead.startname, "Nick", 4) && + memcmp(&dir->dirhead.startname, "Hugo", 4)) + goto bad_dir; + + if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + goto bad_dir; + } +#endif + for (i = dir->nr_buffers - 1; i >= 0; i--) + mark_buffer_dirty(dir->bh[i]); + + ret = 0; +out: + return ret; +#if 1 +bad_dir: + adfs_error(dir->sb, "whoops! I broke a directory!"); + return -EIO; +#endif +} + +static int +adfs_f_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bh[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} + +static void +adfs_f_free(struct adfs_dir *dir) +{ + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + brelse(dir->bh[i]); + dir->bh[i] = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; +} + +struct adfs_dir_ops adfs_f_dir_ops = { + .read = adfs_f_read, + .setpos = adfs_f_setpos, + .getnext = adfs_f_getnext, + .update = adfs_f_update, + .sync = adfs_f_sync, + .free = adfs_f_free +}; diff --git a/kernel/fs/adfs/dir_f.h b/kernel/fs/adfs/dir_f.h new file mode 100644 index 000000000..e47134040 --- /dev/null +++ b/kernel/fs/adfs/dir_f.h @@ -0,0 +1,65 @@ +/* + * linux/fs/adfs/dir_f.h + * + * Copyright (C) 1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Structures of directories on the F format disk + */ +#ifndef ADFS_DIR_F_H +#define ADFS_DIR_F_H + +/* + * Directory header + */ +struct adfs_dirheader { + unsigned char startmasseq; + unsigned char startname[4]; +}; + +#define ADFS_NEWDIR_SIZE 2048 +#define ADFS_NUM_DIR_ENTRIES 77 + +/* + * Directory entries + */ +struct adfs_direntry { +#define ADFS_F_NAME_LEN 10 + char dirobname[ADFS_F_NAME_LEN]; + __u8 dirload[4]; + __u8 direxec[4]; + __u8 dirlen[4]; + __u8 dirinddiscadd[3]; + __u8 newdiratts; +}; + +/* + * Directory tail + */ +union adfs_dirtail { + struct { + unsigned char dirlastmask; + char dirname[10]; + unsigned char dirparent[3]; + char dirtitle[19]; + unsigned char reserved[14]; + unsigned char endmasseq; + unsigned char endname[4]; + unsigned char dircheckbyte; + } old; + struct { + unsigned char dirlastmask; + unsigned char reserved[2]; + unsigned char dirparent[3]; + char dirtitle[19]; + char dirname[10]; + unsigned char endmasseq; + unsigned char endname[4]; + unsigned char dircheckbyte; + } new; +}; + +#endif diff --git a/kernel/fs/adfs/dir_fplus.c b/kernel/fs/adfs/dir_fplus.c new file mode 100644 index 000000000..82d14cdf7 --- /dev/null +++ b/kernel/fs/adfs/dir_fplus.c @@ -0,0 +1,265 @@ +/* + * linux/fs/adfs/dir_fplus.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "adfs.h" +#include "dir_fplus.h" + +static int +adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h; + struct adfs_bigdirtail *t; + unsigned long block; + unsigned int blk, size; + int i, ret = -EIO; + + dir->nr_buffers = 0; + + /* start off using fixed bh set - only alloc for big dirs */ + dir->bh_fplus = &dir->bh[0]; + + block = __adfs_block_map(sb, id, 0); + if (!block) { + adfs_error(sb, "dir object %X has a hole at offset 0", id); + goto out; + } + + dir->bh_fplus[0] = sb_bread(sb, block); + if (!dir->bh_fplus[0]) + goto out; + dir->nr_buffers += 1; + + h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data; + size = le32_to_cpu(h->bigdirsize); + if (size != sz) { + printk(KERN_WARNING "adfs: adfs_fplus_read:" + " directory header size %X\n" + " does not match directory size %X\n", + size, sz); + } + + if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || + h->bigdirversion[2] != 0 || size & 2047 || + h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) { + printk(KERN_WARNING "adfs: dir object %X has" + " malformed dir header\n", id); + goto out; + } + + size >>= sb->s_blocksize_bits; + if (size > ARRAY_SIZE(dir->bh)) { + /* this directory is too big for fixed bh set, must allocate */ + struct buffer_head **bh_fplus = + kcalloc(size, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!bh_fplus) { + ret = -ENOMEM; + adfs_error(sb, "not enough memory for" + " dir object %X (%d blocks)", id, size); + goto out; + } + dir->bh_fplus = bh_fplus; + /* copy over the pointer to the block that we've already read */ + dir->bh_fplus[0] = dir->bh[0]; + } + + for (blk = 1; blk < size; blk++) { + block = __adfs_block_map(sb, id, blk); + if (!block) { + adfs_error(sb, "dir object %X has a hole at offset %d", id, blk); + goto out; + } + + dir->bh_fplus[blk] = sb_bread(sb, block); + if (!dir->bh_fplus[blk]) { + adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX", + id, blk, block); + goto out; + } + + dir->nr_buffers += 1; + } + + t = (struct adfs_bigdirtail *) + (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8)); + + if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || + t->bigdirendmasseq != h->startmasseq || + t->reserved[0] != 0 || t->reserved[1] != 0) { + printk(KERN_WARNING "adfs: dir object %X has " + "malformed dir end\n", id); + goto out; + } + + dir->parent_id = le32_to_cpu(h->bigdirparent); + dir->sb = sb; + return 0; + +out: + if (dir->bh_fplus) { + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bh_fplus[i]); + + if (&dir->bh[0] != dir->bh_fplus) + kfree(dir->bh_fplus); + + dir->bh_fplus = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; + return ret; +} + +static int +adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) +{ + struct adfs_bigdirheader *h = + (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; + int ret = -ENOENT; + + if (fpos <= le32_to_cpu(h->bigdirentries)) { + dir->pos = fpos; + ret = 0; + } + + return ret; +} + +static void +dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len) +{ + struct super_block *sb = dir->sb; + unsigned int buffer, partial, remainder; + + buffer = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + + partial = sb->s_blocksize - offset; + + if (partial >= len) + memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len); + else { + char *c = (char *)to; + + remainder = len - partial; + + memcpy(c, + dir->bh_fplus[buffer]->b_data + offset, + partial); + + memcpy(c + partial, + dir->bh_fplus[buffer + 1]->b_data, + remainder); + } +} + +static int +adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) +{ + struct adfs_bigdirheader *h = + (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; + struct adfs_bigdirentry bde; + unsigned int offset; + int i, ret = -ENOENT; + + if (dir->pos >= le32_to_cpu(h->bigdirentries)) + goto out; + + offset = offsetof(struct adfs_bigdirheader, bigdirname); + offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); + offset += dir->pos * sizeof(struct adfs_bigdirentry); + + dir_memcpy(dir, offset, &bde, sizeof(struct adfs_bigdirentry)); + + obj->loadaddr = le32_to_cpu(bde.bigdirload); + obj->execaddr = le32_to_cpu(bde.bigdirexec); + obj->size = le32_to_cpu(bde.bigdirlen); + obj->file_id = le32_to_cpu(bde.bigdirindaddr); + obj->attr = le32_to_cpu(bde.bigdirattr); + obj->name_len = le32_to_cpu(bde.bigdirobnamelen); + + offset = offsetof(struct adfs_bigdirheader, bigdirname); + offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); + offset += le32_to_cpu(h->bigdirentries) * sizeof(struct adfs_bigdirentry); + offset += le32_to_cpu(bde.bigdirobnameptr); + + dir_memcpy(dir, offset, obj->name, obj->name_len); + for (i = 0; i < obj->name_len; i++) + if (obj->name[i] == '/') + obj->name[i] = '.'; + + obj->filetype = -1; + + /* + * object is a file and is filetyped and timestamped? + * RISC OS 12-bit filetype is stored in load_address[19:8] + */ + if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) && + (0xfff00000 == (0xfff00000 & obj->loadaddr))) { + obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8); + + /* optionally append the ,xyz hex filetype suffix */ + if (ADFS_SB(dir->sb)->s_ftsuffix) + obj->name_len += + append_filetype_suffix( + &obj->name[obj->name_len], + obj->filetype); + } + + dir->pos += 1; + ret = 0; +out: + return ret; +} + +static int +adfs_fplus_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bh_fplus[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} + +static void +adfs_fplus_free(struct adfs_dir *dir) +{ + int i; + + if (dir->bh_fplus) { + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bh_fplus[i]); + + if (&dir->bh[0] != dir->bh_fplus) + kfree(dir->bh_fplus); + + dir->bh_fplus = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; +} + +struct adfs_dir_ops adfs_fplus_dir_ops = { + .read = adfs_fplus_read, + .setpos = adfs_fplus_setpos, + .getnext = adfs_fplus_getnext, + .sync = adfs_fplus_sync, + .free = adfs_fplus_free +}; diff --git a/kernel/fs/adfs/dir_fplus.h b/kernel/fs/adfs/dir_fplus.h new file mode 100644 index 000000000..b55aa41a6 --- /dev/null +++ b/kernel/fs/adfs/dir_fplus.h @@ -0,0 +1,45 @@ +/* + * linux/fs/adfs/dir_fplus.h + * + * Copyright (C) 1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Structures of directories on the F+ format disk + */ + +#define ADFS_FPLUS_NAME_LEN 255 + +#define BIGDIRSTARTNAME ('S' | 'B' << 8 | 'P' << 16 | 'r' << 24) +#define BIGDIRENDNAME ('o' | 'v' << 8 | 'e' << 16 | 'n' << 24) + +struct adfs_bigdirheader { + __u8 startmasseq; + __u8 bigdirversion[3]; + __le32 bigdirstartname; + __le32 bigdirnamelen; + __le32 bigdirsize; + __le32 bigdirentries; + __le32 bigdirnamesize; + __le32 bigdirparent; + char bigdirname[1]; +}; + +struct adfs_bigdirentry { + __le32 bigdirload; + __le32 bigdirexec; + __le32 bigdirlen; + __le32 bigdirindaddr; + __le32 bigdirattr; + __le32 bigdirobnamelen; + __le32 bigdirobnameptr; +}; + +struct adfs_bigdirtail { + __le32 bigdirendname; + __u8 bigdirendmasseq; + __u8 reserved[2]; + __u8 bigdircheckbyte; +}; diff --git a/kernel/fs/adfs/file.c b/kernel/fs/adfs/file.c new file mode 100644 index 000000000..46c0d5671 --- /dev/null +++ b/kernel/fs/adfs/file.c @@ -0,0 +1,35 @@ +/* + * linux/fs/adfs/file.c + * + * Copyright (C) 1997-1999 Russell King + * from: + * + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * adfs regular file handling primitives + */ +#include "adfs.h" + +const struct file_operations adfs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .write_iter = generic_file_write_iter, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations adfs_file_inode_operations = { + .setattr = adfs_notify_change, +}; diff --git a/kernel/fs/adfs/inode.c b/kernel/fs/adfs/inode.c new file mode 100644 index 000000000..335055d82 --- /dev/null +++ b/kernel/fs/adfs/inode.c @@ -0,0 +1,371 @@ +/* + * linux/fs/adfs/inode.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "adfs.h" + +/* + * Lookup/Create a block at offset 'block' into 'inode'. We currently do + * not support creation of new blocks, so we return -EIO for this case. + */ +static int +adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, + int create) +{ + if (!create) { + if (block >= inode->i_blocks) + goto abort_toobig; + + block = __adfs_block_map(inode->i_sb, inode->i_ino, block); + if (block) + map_bh(bh, inode->i_sb, block); + return 0; + } + /* don't support allocation of blocks yet */ + return -EIO; + +abort_toobig: + return 0; +} + +static int adfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, adfs_get_block, wbc); +} + +static int adfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, adfs_get_block); +} + +static void adfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) + truncate_pagecache(inode, inode->i_size); +} + +static int adfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + adfs_get_block, + &ADFS_I(mapping->host)->mmu_private); + if (unlikely(ret)) + adfs_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, adfs_get_block); +} + +static const struct address_space_operations adfs_aops = { + .readpage = adfs_readpage, + .writepage = adfs_writepage, + .write_begin = adfs_write_begin, + .write_end = generic_write_end, + .bmap = _adfs_bmap +}; + +/* + * Convert ADFS attributes and filetype to Linux permission. + */ +static umode_t +adfs_atts2mode(struct super_block *sb, struct inode *inode) +{ + unsigned int attr = ADFS_I(inode)->attr; + umode_t mode, rmask; + struct adfs_sb_info *asb = ADFS_SB(sb); + + if (attr & ADFS_NDA_DIRECTORY) { + mode = S_IRUGO & asb->s_owner_mask; + return S_IFDIR | S_IXUGO | mode; + } + + switch (ADFS_I(inode)->filetype) { + case 0xfc0: /* LinkFS */ + return S_IFLNK|S_IRWXUGO; + + case 0xfe6: /* UnixExec */ + rmask = S_IRUGO | S_IXUGO; + break; + + default: + rmask = S_IRUGO; + } + + mode = S_IFREG; + + if (attr & ADFS_NDA_OWNER_READ) + mode |= rmask & asb->s_owner_mask; + + if (attr & ADFS_NDA_OWNER_WRITE) + mode |= S_IWUGO & asb->s_owner_mask; + + if (attr & ADFS_NDA_PUBLIC_READ) + mode |= rmask & asb->s_other_mask; + + if (attr & ADFS_NDA_PUBLIC_WRITE) + mode |= S_IWUGO & asb->s_other_mask; + return mode; +} + +/* + * Convert Linux permission to ADFS attribute. We try to do the reverse + * of atts2mode, but there is not a 1:1 translation. + */ +static int +adfs_mode2atts(struct super_block *sb, struct inode *inode) +{ + umode_t mode; + int attr; + struct adfs_sb_info *asb = ADFS_SB(sb); + + /* FIXME: should we be able to alter a link? */ + if (S_ISLNK(inode->i_mode)) + return ADFS_I(inode)->attr; + + if (S_ISDIR(inode->i_mode)) + attr = ADFS_NDA_DIRECTORY; + else + attr = 0; + + mode = inode->i_mode & asb->s_owner_mask; + if (mode & S_IRUGO) + attr |= ADFS_NDA_OWNER_READ; + if (mode & S_IWUGO) + attr |= ADFS_NDA_OWNER_WRITE; + + mode = inode->i_mode & asb->s_other_mask; + mode &= ~asb->s_owner_mask; + if (mode & S_IRUGO) + attr |= ADFS_NDA_PUBLIC_READ; + if (mode & S_IWUGO) + attr |= ADFS_NDA_PUBLIC_WRITE; + + return attr; +} + +/* + * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time + * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds + * of time to convert from RISC OS epoch to Unix epoch. + */ +static void +adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) +{ + unsigned int high, low; + /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since + * 01 Jan 1900 00:00:00 (RISC OS epoch) + */ + static const s64 nsec_unix_epoch_diff_risc_os_epoch = + 2208988800000000000LL; + s64 nsec; + + if (ADFS_I(inode)->stamped == 0) + goto cur_time; + + high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */ + low = ADFS_I(inode)->execaddr; /* bottom 32 bits of timestamp */ + + /* convert 40-bit centi-seconds to 32-bit seconds + * going via nanoseconds to retain precision + */ + nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */ + + /* Files dated pre 01 Jan 1970 00:00:00. */ + if (nsec < nsec_unix_epoch_diff_risc_os_epoch) + goto too_early; + + /* convert from RISC OS to Unix epoch */ + nsec -= nsec_unix_epoch_diff_risc_os_epoch; + + *tv = ns_to_timespec(nsec); + return; + + cur_time: + *tv = CURRENT_TIME; + return; + + too_early: + tv->tv_sec = tv->tv_nsec = 0; + return; +} + +/* + * Convert an Unix time to ADFS time. We only do this if the entry has a + * time/date stamp already. + */ +static void +adfs_unix2adfs_time(struct inode *inode, unsigned int secs) +{ + unsigned int high, low; + + if (ADFS_I(inode)->stamped) { + /* convert 32-bit seconds to 40-bit centi-seconds */ + low = (secs & 255) * 100; + high = (secs / 256) * 100 + (low >> 8) + 0x336e996a; + + ADFS_I(inode)->loadaddr = (high >> 24) | + (ADFS_I(inode)->loadaddr & ~0xff); + ADFS_I(inode)->execaddr = (low & 255) | (high << 8); + } +} + +/* + * Fill in the inode information from the object information. + * + * Note that this is an inode-less filesystem, so we can't use the inode + * number to reference the metadata on the media. Instead, we use the + * inode number to hold the object ID, which in turn will tell us where + * the data is held. We also save the parent object ID, and with these + * two, we can locate the metadata. + * + * This does mean that we rely on an objects parent remaining the same at + * all times - we cannot cope with a cross-directory rename (yet). + */ +struct inode * +adfs_iget(struct super_block *sb, struct object_info *obj) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + goto out; + + inode->i_uid = ADFS_SB(sb)->s_uid; + inode->i_gid = ADFS_SB(sb)->s_gid; + inode->i_ino = obj->file_id; + inode->i_size = obj->size; + set_nlink(inode, 2); + inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + /* + * we need to save the parent directory ID so that + * write_inode can update the directory information + * for this file. This will need special handling + * for cross-directory renames. + */ + ADFS_I(inode)->parent_id = obj->parent_id; + ADFS_I(inode)->loadaddr = obj->loadaddr; + ADFS_I(inode)->execaddr = obj->execaddr; + ADFS_I(inode)->attr = obj->attr; + ADFS_I(inode)->filetype = obj->filetype; + ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); + + inode->i_mode = adfs_atts2mode(sb, inode); + adfs_adfs2unix_time(&inode->i_mtime, inode); + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + + if (S_ISDIR(inode->i_mode)) { + inode->i_op = &adfs_dir_inode_operations; + inode->i_fop = &adfs_dir_operations; + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &adfs_file_inode_operations; + inode->i_fop = &adfs_file_operations; + inode->i_mapping->a_ops = &adfs_aops; + ADFS_I(inode)->mmu_private = inode->i_size; + } + + insert_inode_hash(inode); + +out: + return inode; +} + +/* + * Validate and convert a changed access mode/time to their ADFS equivalents. + * adfs_write_inode will actually write the information back to the directory + * later. + */ +int +adfs_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct super_block *sb = inode->i_sb; + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + + /* + * we can't change the UID or GID of any file - + * we have a global UID/GID in the superblock + */ + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, ADFS_SB(sb)->s_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, ADFS_SB(sb)->s_gid))) + error = -EPERM; + + if (error) + goto out; + + /* XXX: this is missing some actual on-disk truncation.. */ + if (ia_valid & ATTR_SIZE) + truncate_setsize(inode, attr->ia_size); + + if (ia_valid & ATTR_MTIME) { + inode->i_mtime = attr->ia_mtime; + adfs_unix2adfs_time(inode, attr->ia_mtime.tv_sec); + } + /* + * FIXME: should we make these == to i_mtime since we don't + * have the ability to represent them in our filesystem? + */ + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + if (ia_valid & ATTR_MODE) { + ADFS_I(inode)->attr = adfs_mode2atts(sb, inode); + inode->i_mode = adfs_atts2mode(sb, inode); + } + + /* + * FIXME: should we be marking this inode dirty even if + * we don't have any metadata to write back? + */ + if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) + mark_inode_dirty(inode); +out: + return error; +} + +/* + * write an existing inode back to the directory, and therefore the disk. + * The adfs-specific inode data has already been updated by + * adfs_notify_change() + */ +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct super_block *sb = inode->i_sb; + struct object_info obj; + int ret; + + obj.file_id = inode->i_ino; + obj.name_len = 0; + obj.parent_id = ADFS_I(inode)->parent_id; + obj.loadaddr = ADFS_I(inode)->loadaddr; + obj.execaddr = ADFS_I(inode)->execaddr; + obj.attr = ADFS_I(inode)->attr; + obj.size = inode->i_size; + + ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); + return ret; +} diff --git a/kernel/fs/adfs/map.c b/kernel/fs/adfs/map.c new file mode 100644 index 000000000..6935f0520 --- /dev/null +++ b/kernel/fs/adfs/map.c @@ -0,0 +1,290 @@ +/* + * linux/fs/adfs/map.c + * + * Copyright (C) 1997-2002 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "adfs.h" + +/* + * The ADFS map is basically a set of sectors. Each sector is called a + * zone which contains a bitstream made up of variable sized fragments. + * Each bit refers to a set of bytes in the filesystem, defined by + * log2bpmb. This may be larger or smaller than the sector size, but + * the overall size it describes will always be a round number of + * sectors. A fragment id is always idlen bits long. + * + * < idlen > < n > <1> + * +---------+-------//---------+---+ + * | frag id | 0000....000000 | 1 | + * +---------+-------//---------+---+ + * + * The physical disk space used by a fragment is taken from the start of + * the fragment id up to and including the '1' bit - ie, idlen + n + 1 + * bits. + * + * A fragment id can be repeated multiple times in the whole map for + * large or fragmented files. The first map zone a fragment starts in + * is given by fragment id / ids_per_zone - this allows objects to start + * from any zone on the disk. + * + * Free space is described by a linked list of fragments. Each free + * fragment describes free space in the same way as the other fragments, + * however, the frag id specifies an offset (in map bits) from the end + * of this fragment to the start of the next free fragment. + * + * Objects stored on the disk are allocated object ids (we use these as + * our inode numbers.) Object ids contain a fragment id and an optional + * offset. This allows a directory fragment to contain small files + * associated with that directory. + */ + +/* + * For the future... + */ +static DEFINE_RWLOCK(adfs_map_lock); + +/* + * This is fun. We need to load up to 19 bits from the map at an + * arbitrary bit alignment. (We're limited to 19 bits by F+ version 2). + */ +#define GET_FRAG_ID(_map,_start,_idmask) \ + ({ \ + unsigned char *_m = _map + (_start >> 3); \ + u32 _frag = get_unaligned_le32(_m); \ + _frag >>= (_start & 7); \ + _frag & _idmask; \ + }) + +/* + * return the map bit offset of the fragment frag_id in the zone dm. + * Note that the loop is optimised for best asm code - look at the + * output of: + * gcc -D__KERNEL__ -O2 -I../../include -o - -S map.c + */ +static int +lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen, + const unsigned int frag_id, unsigned int *offset) +{ + const unsigned int mapsize = dm->dm_endbit; + const u32 idmask = (1 << idlen) - 1; + unsigned char *map = dm->dm_bh->b_data + 4; + unsigned int start = dm->dm_startbit; + unsigned int mapptr; + u32 frag; + + do { + frag = GET_FRAG_ID(map, start, idmask); + mapptr = start + idlen; + + /* + * find end of fragment + */ + { + __le32 *_map = (__le32 *)map; + u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); + while (v == 0) { + mapptr = (mapptr & ~31) + 32; + if (mapptr >= mapsize) + goto error; + v = le32_to_cpu(_map[mapptr >> 5]); + } + + mapptr += 1 + ffz(~v); + } + + if (frag == frag_id) + goto found; +again: + start = mapptr; + } while (mapptr < mapsize); + return -1; + +error: + printk(KERN_ERR "adfs: oversized fragment 0x%x at 0x%x-0x%x\n", + frag, start, mapptr); + return -1; + +found: + { + int length = mapptr - start; + if (*offset >= length) { + *offset -= length; + goto again; + } + } + return start + *offset; +} + +/* + * Scan the free space map, for this zone, calculating the total + * number of map bits in each free space fragment. + * + * Note: idmask is limited to 15 bits [3.2] + */ +static unsigned int +scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) +{ + const unsigned int mapsize = dm->dm_endbit + 32; + const unsigned int idlen = asb->s_idlen; + const unsigned int frag_idlen = idlen <= 15 ? idlen : 15; + const u32 idmask = (1 << frag_idlen) - 1; + unsigned char *map = dm->dm_bh->b_data; + unsigned int start = 8, mapptr; + u32 frag; + unsigned long total = 0; + + /* + * get fragment id + */ + frag = GET_FRAG_ID(map, start, idmask); + + /* + * If the freelink is null, then no free fragments + * exist in this zone. + */ + if (frag == 0) + return 0; + + do { + start += frag; + + /* + * get fragment id + */ + frag = GET_FRAG_ID(map, start, idmask); + mapptr = start + idlen; + + /* + * find end of fragment + */ + { + __le32 *_map = (__le32 *)map; + u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); + while (v == 0) { + mapptr = (mapptr & ~31) + 32; + if (mapptr >= mapsize) + goto error; + v = le32_to_cpu(_map[mapptr >> 5]); + } + + mapptr += 1 + ffz(~v); + } + + total += mapptr - start; + } while (frag >= idlen + 1); + + if (frag != 0) + printk(KERN_ERR "adfs: undersized free fragment\n"); + + return total; +error: + printk(KERN_ERR "adfs: oversized free fragment\n"); + return 0; +} + +static int +scan_map(struct adfs_sb_info *asb, unsigned int zone, + const unsigned int frag_id, unsigned int mapoff) +{ + const unsigned int idlen = asb->s_idlen; + struct adfs_discmap *dm, *dm_end; + int result; + + dm = asb->s_map + zone; + zone = asb->s_map_size; + dm_end = asb->s_map + zone; + + do { + result = lookup_zone(dm, idlen, frag_id, &mapoff); + + if (result != -1) + goto found; + + dm ++; + if (dm == dm_end) + dm = asb->s_map; + } while (--zone > 0); + + return -1; +found: + result -= dm->dm_startbit; + result += dm->dm_startblk; + + return result; +} + +/* + * calculate the amount of free blocks in the map. + * + * n=1 + * total_free = E(free_in_zone_n) + * nzones + */ +unsigned int +adfs_map_free(struct super_block *sb) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discmap *dm; + unsigned int total = 0; + unsigned int zone; + + dm = asb->s_map; + zone = asb->s_map_size; + + do { + total += scan_free_map(asb, dm++); + } while (--zone > 0); + + return signed_asl(total, asb->s_map2blk); +} + +int +adfs_map_lookup(struct super_block *sb, unsigned int frag_id, + unsigned int offset) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + unsigned int zone, mapoff; + int result; + + /* + * map & root fragment is special - it starts in the center of the + * disk. The other fragments start at zone (frag / ids_per_zone) + */ + if (frag_id == ADFS_ROOT_FRAG) + zone = asb->s_map_size >> 1; + else + zone = frag_id / asb->s_ids_per_zone; + + if (zone >= asb->s_map_size) + goto bad_fragment; + + /* Convert sector offset to map offset */ + mapoff = signed_asl(offset, -asb->s_map2blk); + + read_lock(&adfs_map_lock); + result = scan_map(asb, zone, frag_id, mapoff); + read_unlock(&adfs_map_lock); + + if (result > 0) { + unsigned int secoff; + + /* Calculate sector offset into map block */ + secoff = offset - signed_asl(mapoff, asb->s_map2blk); + return secoff + signed_asl(result, asb->s_map2blk); + } + + adfs_error(sb, "fragment 0x%04x at offset %d not found in map", + frag_id, offset); + return 0; + +bad_fragment: + adfs_error(sb, "invalid fragment 0x%04x (zone = %d, max = %d)", + frag_id, zone, asb->s_map_size); + return 0; +} diff --git a/kernel/fs/adfs/super.c b/kernel/fs/adfs/super.c new file mode 100644 index 000000000..a19c31d3f --- /dev/null +++ b/kernel/fs/adfs/super.c @@ -0,0 +1,562 @@ +/* + * linux/fs/adfs/super.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "adfs.h" +#include "dir_f.h" +#include "dir_fplus.h" + +#define ADFS_DEFAULT_OWNER_MASK S_IRWXU +#define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO) + +void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...) +{ + char error_buf[128]; + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %s\n", + sb->s_id, function ? ": " : "", + function ? function : "", error_buf); +} + +static int adfs_checkdiscrecord(struct adfs_discrecord *dr) +{ + int i; + + /* sector size must be 256, 512 or 1024 bytes */ + if (dr->log2secsize != 8 && + dr->log2secsize != 9 && + dr->log2secsize != 10) + return 1; + + /* idlen must be at least log2secsize + 3 */ + if (dr->idlen < dr->log2secsize + 3) + return 1; + + /* we cannot have such a large disc that we + * are unable to represent sector offsets in + * 32 bits. This works out at 2.0 TB. + */ + if (le32_to_cpu(dr->disc_size_high) >> dr->log2secsize) + return 1; + + /* idlen must be no greater than 19 v2 [1.0] */ + if (dr->idlen > 19) + return 1; + + /* reserved bytes should be zero */ + for (i = 0; i < sizeof(dr->unused52); i++) + if (dr->unused52[i] != 0) + return 1; + + return 0; +} + +static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) +{ + unsigned int v0, v1, v2, v3; + int i; + + v0 = v1 = v2 = v3 = 0; + for (i = sb->s_blocksize - 4; i; i -= 4) { + v0 += map[i] + (v3 >> 8); + v3 &= 0xff; + v1 += map[i + 1] + (v0 >> 8); + v0 &= 0xff; + v2 += map[i + 2] + (v1 >> 8); + v1 &= 0xff; + v3 += map[i + 3] + (v2 >> 8); + v2 &= 0xff; + } + v0 += v3 >> 8; + v1 += map[1] + (v0 >> 8); + v2 += map[2] + (v1 >> 8); + v3 += map[3] + (v2 >> 8); + + return v0 ^ v1 ^ v2 ^ v3; +} + +static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) +{ + unsigned char crosscheck = 0, zonecheck = 1; + int i; + + for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { + unsigned char *map; + + map = dm[i].dm_bh->b_data; + + if (adfs_calczonecheck(sb, map) != map[0]) { + adfs_error(sb, "zone %d fails zonecheck", i); + zonecheck = 0; + } + crosscheck ^= map[3]; + } + if (crosscheck != 0xff) + adfs_error(sb, "crosscheck != 0xff"); + return crosscheck == 0xff && zonecheck; +} + +static void adfs_put_super(struct super_block *sb) +{ + int i; + struct adfs_sb_info *asb = ADFS_SB(sb); + + for (i = 0; i < asb->s_map_size; i++) + brelse(asb->s_map[i].dm_bh); + kfree(asb->s_map); + kfree_rcu(asb, rcu); +} + +static int adfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct adfs_sb_info *asb = ADFS_SB(root->d_sb); + + if (!uid_eq(asb->s_uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, asb->s_uid)); + if (!gid_eq(asb->s_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, asb->s_gid)); + if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK) + seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); + if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) + seq_printf(seq, ",othmask=%o", asb->s_other_mask); + if (asb->s_ftsuffix != 0) + seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix); + + return 0; +} + +enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_ownmask, "ownmask=%o"}, + {Opt_othmask, "othmask=%o"}, + {Opt_ftsuffix, "ftsuffix=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options) +{ + char *p; + struct adfs_sb_info *asb = ADFS_SB(sb); + int option; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(args, &option)) + return -EINVAL; + asb->s_uid = make_kuid(current_user_ns(), option); + if (!uid_valid(asb->s_uid)) + return -EINVAL; + break; + case Opt_gid: + if (match_int(args, &option)) + return -EINVAL; + asb->s_gid = make_kgid(current_user_ns(), option); + if (!gid_valid(asb->s_gid)) + return -EINVAL; + break; + case Opt_ownmask: + if (match_octal(args, &option)) + return -EINVAL; + asb->s_owner_mask = option; + break; + case Opt_othmask: + if (match_octal(args, &option)) + return -EINVAL; + asb->s_other_mask = option; + break; + case Opt_ftsuffix: + if (match_int(args, &option)) + return -EINVAL; + asb->s_ftsuffix = option; + break; + default: + printk("ADFS-fs: unrecognised mount option \"%s\" " + "or missing value\n", p); + return -EINVAL; + } + } + return 0; +} + +static int adfs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_NODIRATIME; + return parse_options(sb, data); +} + +static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct adfs_sb_info *sbi = ADFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ADFS_SUPER_MAGIC; + buf->f_namelen = sbi->s_namelen; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_size; + buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; + buf->f_bavail = + buf->f_bfree = adfs_map_free(sb); + buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static struct kmem_cache *adfs_inode_cachep; + +static struct inode *adfs_alloc_inode(struct super_block *sb) +{ + struct adfs_inode_info *ei; + ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void adfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); +} + +static void adfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, adfs_i_callback); +} + +static void init_once(void *foo) +{ + struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", + sizeof(struct adfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (adfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(adfs_inode_cachep); +} + +static const struct super_operations adfs_sops = { + .alloc_inode = adfs_alloc_inode, + .destroy_inode = adfs_destroy_inode, + .write_inode = adfs_write_inode, + .put_super = adfs_put_super, + .statfs = adfs_statfs, + .remount_fs = adfs_remount, + .show_options = adfs_show_options, +}; + +static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +{ + struct adfs_discmap *dm; + unsigned int map_addr, zone_size, nzones; + int i, zone; + struct adfs_sb_info *asb = ADFS_SB(sb); + + nzones = asb->s_map_size; + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + map_addr = (nzones >> 1) * zone_size - + ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0); + map_addr = signed_asl(map_addr, asb->s_map2blk); + + asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + + dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL); + if (dm == NULL) { + adfs_error(sb, "not enough memory"); + return ERR_PTR(-ENOMEM); + } + + for (zone = 0; zone < nzones; zone++, map_addr++) { + dm[zone].dm_startbit = 0; + dm[zone].dm_endbit = zone_size; + dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; + dm[zone].dm_bh = sb_bread(sb, map_addr); + + if (!dm[zone].dm_bh) { + adfs_error(sb, "unable to read map"); + goto error_free; + } + } + + /* adjust the limits for the first and last map zones */ + i = zone - 1; + dm[0].dm_startblk = 0; + dm[0].dm_startbit = ADFS_DR_SIZE_BITS; + dm[i].dm_endbit = (le32_to_cpu(dr->disc_size_high) << (32 - dr->log2bpmb)) + + (le32_to_cpu(dr->disc_size) >> dr->log2bpmb) + + (ADFS_DR_SIZE_BITS - i * zone_size); + + if (adfs_checkmap(sb, dm)) + return dm; + + adfs_error(sb, "map corrupted"); + +error_free: + while (--zone >= 0) + brelse(dm[zone].dm_bh); + + kfree(dm); + return ERR_PTR(-EIO); +} + +static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits) +{ + unsigned long discsize; + + discsize = le32_to_cpu(dr->disc_size_high) << (32 - block_bits); + discsize |= le32_to_cpu(dr->disc_size) >> block_bits; + + return discsize; +} + +static int adfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct adfs_discrecord *dr; + struct buffer_head *bh; + struct object_info root_obj; + unsigned char *b_data; + struct adfs_sb_info *asb; + struct inode *root; + int ret = -EINVAL; + + sb->s_flags |= MS_NODIRATIME; + + asb = kzalloc(sizeof(*asb), GFP_KERNEL); + if (!asb) + return -ENOMEM; + sb->s_fs_info = asb; + + /* set default options */ + asb->s_uid = GLOBAL_ROOT_UID; + asb->s_gid = GLOBAL_ROOT_GID; + asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; + asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; + asb->s_ftsuffix = 0; + + if (parse_options(sb, data)) + goto error; + + sb_set_blocksize(sb, BLOCK_SIZE); + if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) { + adfs_error(sb, "unable to read superblock"); + ret = -EIO; + goto error; + } + + b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE); + + if (adfs_checkbblk(b_data)) { + if (!silent) + printk("VFS: Can't find an adfs filesystem on dev " + "%s.\n", sb->s_id); + ret = -EINVAL; + goto error_free_bh; + } + + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + + /* + * Do some sanity checks on the ADFS disc record + */ + if (adfs_checkdiscrecord(dr)) { + if (!silent) + printk("VPS: Can't find an adfs filesystem on dev " + "%s.\n", sb->s_id); + ret = -EINVAL; + goto error_free_bh; + } + + brelse(bh); + if (sb_set_blocksize(sb, 1 << dr->log2secsize)) { + bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize); + if (!bh) { + adfs_error(sb, "couldn't read superblock on " + "2nd try."); + ret = -EIO; + goto error; + } + b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); + if (adfs_checkbblk(b_data)) { + adfs_error(sb, "disc record mismatch, very weird!"); + ret = -EINVAL; + goto error_free_bh; + } + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + } else { + if (!silent) + printk(KERN_ERR "VFS: Unsupported blocksize on dev " + "%s.\n", sb->s_id); + ret = -EINVAL; + goto error; + } + + /* + * blocksize on this device should now be set to the ADFS log2secsize + */ + + sb->s_magic = ADFS_SUPER_MAGIC; + asb->s_idlen = dr->idlen; + asb->s_map_size = dr->nzones | (dr->nzones_high << 8); + asb->s_map2blk = dr->log2bpmb - dr->log2secsize; + asb->s_size = adfs_discsize(dr, sb->s_blocksize_bits); + asb->s_version = dr->format_version; + asb->s_log2sharesize = dr->log2sharesize; + + asb->s_map = adfs_read_map(sb, dr); + if (IS_ERR(asb->s_map)) { + ret = PTR_ERR(asb->s_map); + goto error_free_bh; + } + + brelse(bh); + + /* + * set up enough so that we can read an inode + */ + sb->s_op = &adfs_sops; + + dr = (struct adfs_discrecord *)(asb->s_map[0].dm_bh->b_data + 4); + + root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root); + root_obj.name_len = 0; + /* Set root object date as 01 Jan 1987 00:00:00 */ + root_obj.loadaddr = 0xfff0003f; + root_obj.execaddr = 0xec22c000; + root_obj.size = ADFS_NEWDIR_SIZE; + root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ | + ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ; + root_obj.filetype = -1; + + /* + * If this is a F+ disk with variable length directories, + * get the root_size from the disc record. + */ + if (asb->s_version) { + root_obj.size = le32_to_cpu(dr->root_size); + asb->s_dir = &adfs_fplus_dir_ops; + asb->s_namelen = ADFS_FPLUS_NAME_LEN; + } else { + asb->s_dir = &adfs_f_dir_ops; + asb->s_namelen = ADFS_F_NAME_LEN; + } + /* + * ,xyz hex filetype suffix may be added by driver + * to files that have valid RISC OS filetype + */ + if (asb->s_ftsuffix) + asb->s_namelen += 4; + + sb->s_d_op = &adfs_dentry_operations; + root = adfs_iget(sb, &root_obj); + sb->s_root = d_make_root(root); + if (!sb->s_root) { + int i; + for (i = 0; i < asb->s_map_size; i++) + brelse(asb->s_map[i].dm_bh); + kfree(asb->s_map); + adfs_error(sb, "get root inode failed\n"); + ret = -EIO; + goto error; + } + return 0; + +error_free_bh: + brelse(bh); +error: + sb->s_fs_info = NULL; + kfree(asb); + return ret; +} + +static struct dentry *adfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); +} + +static struct file_system_type adfs_fs_type = { + .owner = THIS_MODULE, + .name = "adfs", + .mount = adfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("adfs"); + +static int __init init_adfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&adfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_adfs_fs(void) +{ + unregister_filesystem(&adfs_fs_type); + destroy_inodecache(); +} + +module_init(init_adfs_fs) +module_exit(exit_adfs_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/affs/Changes b/kernel/fs/affs/Changes new file mode 100644 index 000000000..b41c2c979 --- /dev/null +++ b/kernel/fs/affs/Changes @@ -0,0 +1,343 @@ +(Note: I consider version numbers as cheap. That means +that I do not like numbers like 0.1 and the like for +things that can be used since quite some time. But +then, 3.1 doesn't mean 'perfectly stable', too.) + +Known bugs: +----------- + +- Doesn't work on the alpha. The only 64/32-bit + problem that I'm aware of (pointer/int conversion + in readdir()) gives compiler warnings but is + apparently not causing the failure, as directory + reads basically work (but all files are of size 0). + Alas, I've got no alpha to debug. :-( + +- The partition checker (drivers/block/genhd.c) + doesn't work with devices which have 256 byte + blocks (some very old SCSI drives). + +- The feature to automatically make the fs clean + might leave a trashed file system with the + bitmap flag set valid. + +- When a file is truncated to a size that is not + a multiple of the blocksize, the rest of the + last allocated block is not cleared. Well, + this fs never claimed to be Posix conformant. + +Please direct bug reports to: zippel@linux-m68k.org + +Version 3.20 +------------ +- kill kernel lock +- fix for a possible bitmap corruption + +Version 3.19 +------------ + +- sizeof changes from Kernel Janitor Project +- several bug fixes found with fsx + +Version 3.18 +------------ + +- change to global min macro + warning fixes +- add module tags + +Version 3.17 +------------ + +- locking fixes +- wrong sign in __affs_hash_dentry +- remove unnecessary check in affs_new_inode +- enable international mode for dircache fs + +Version 3.16 +------------ + +- use mark_buffer_dirty_inode instead of mark_buffer_dirty. +- introduce affs_lock_{link|dir|ext}. + +Version 3.15 +------------ + +- disable link to directories until we can properly support them. +- locking fixes for link creation/removal. + +Version 3.14 +------------ + +- correctly cut off long file names for compares +- correctly initialize s_last_bmap + +Version 3.13 +------------ + +Major cleanup for 2.4 [Roman Zippel] +- new extended block handling +- new bitmap allocation functions +- locking should be safe for the future +- cleanup of some interfaces + +Version 3.12 +------------ + +more 2.4 fixes: [Roman Zippel] +- s_lock changes +- increased getblock mess +- clear meta blocks + +Version 3.11 +------------ + +- Converted to use 2.3.x page cache [Dave Jones] +- Corruption in truncate() bugfix [Ken Tyler ] + +Version 3.10 +------------ + +- Changed partition checker to allow devices + with physical blocks != 512 bytes. + +- The partition checker now also ignores the + word at 0xd0 that Windows likes to write to. + +Version 3.9 +----------- + +- Moved cleanup from release_file() to put_inode(). + This makes the first one obsolete. + +- truncate() zeroes the unused remainder of a + partially used last block when a file is truncated. + It also marks the inode dirty now (which is not + really necessary as notify_change() will do + it anyway). + +- Added a few comments, fixed some typos (and + introduced some new ones), made the debug messages + more consistent. Changed a bad example in the + doc file (affs.txt). + +- Sets the NOEXEC flag in read_super() for old file + systems, since you can't run programs on them. + +Version 3.8 +----------- +Bill Hawes kindly reviewed the affs and sent me the +patches he did. They're marked (BH). Thanks, Bill! + +- Cleanup of error handling in read_super(). + Didn't release all resources in case of an + error. (BH) + +- put_inode() releases the ext cache only if it's + no longer needed. (BH) + +- One set of dentry callbacks is enough. (BH) + +- Cleanup of error handling in namei.c. (BH) + +- Cleanup of error handling in file.c. (BH) + +- The original blocksize of the device is + restored when the fs is unmounted. (BH) + +- getblock() did not invalidate the key cache + when it allocated a new block. + +- Removed some unnecessary locks as Bill + suggested. + +- Simplified match_name(), changed all hashing + and case insensitive name comparisons to use + uppercase. This makes the tolower() routines + obsolete. + +- Added mount option 'mufs' to force muFS + uid/gid interpretation. + +- File mode changes were not updated on disk. + This was fixed before, but somehow got lost. + +Version 3.7 +----------- + +- Added dentry callbacks to allow the dcache to + operate case insensitive and length ignorant + like the affs itself. + +- getblock() didn't update the lastblock field in the + inode if the fs was not an OFS. This bug only shows + up if a file was enlarged via truncate() and there + was not enough space. + +- Remove some more superfluous code left over from + the old link days ... + +- Fixed some oversights which were in patch 2.1.78. + +- Fixed a few typos. + +Version 3.6 +----------- + +- dentry changes. (Thanks to Jes Sorensen for his help.) + +- Fixed bug in balloc(): Superblock was not set dirty after + the bitmap was changed, so the bitmap wasn't sync'd. + +- Fixed nasty bug in find_new_zone(): If the current + zone number was zero, the loop didn't terminate, + causing a solid lock-up. + +- Removed support for old-style directory reads. + +- Fixed bug in add_entry(): When doing a sorted insert, + the pointer to the next entry in the hash chain wasn't + correctly byte-swapped. Since most of the users of the + affs use it on a 68k, they didn't notice. But why did + I not find this during my tests? + +- Fixed some oversights (version wasn't updated on some + directory changes). + +- Handling of hard links rewritten. To the VFS + they appear now as normal Unix links. They are + now resolved only once in lookup(). The backside + is that unlink(), rename() and rmdir() have to + be smart about them, but the result is worth the + effort. This also led to some code cleanup. + +- Changed name type to unsigned char; the test for + invalid filenames didn't work correctly. + (Thanks to Michael Krause for pointing at this.) + +- Changed mapping of executable flag. + +- Changed all network byte-order macros to the + recommended ones. + +- Added a remount function, so attempts to remount + a dircache filesystem or one with errors read/write + can be trapped. Previously, ro remounts didn't + flush the super block, and rw remounts didn't + create allocation zones ... + +- Call shrink_dcache_parent() in rmdir(). + (Thanks to Bill Hawes.) + +- Permission checks in unlink(). + +- Allow mounting of volumes with superfluous + bitmap pointers read only, also allows them + to be remounted read/write. + +- Owner/Group defaults now to the fs user (i.e. + the one that mounted it) instead of root. This + obsoletes the mount options uid and gid. + +- Argument to volume option could overflow the + name buffer. It is now silently truncated to + 30 characters. (Damn it! This kind of bug + is too embarrassing.) + +- Split inode.c into 2 files, the superblock + routines desperately wanted their own file. + +- truncate() didn't allocate an extension block + cache. If a file was extended by means of + truncate(), this led to an Oops. + +- fsuser is now checked last. + +- rename() will not ignore changes in filename + casing any more (though mv(1) still won't allow + you to do "mv oldname OldName"). + +Version 3.5 +----------- + +- Extension block caches are now allocated on + demand instead of when a file is opened, as + files can be read and written without opening + them (e. g. the loopback device does this). + +- Removed an unused function. + +Version 3.4 +----------- + +- Hash chains are now sorted by block numbers. + (Thanks to Kars de Jong for finding this.) +- Removed all unnecessary external symbols. + +Version 3.3 +----------- + +- Tried to make all types 'correct' and consistent. +- Errors and warnings are now reported via a + function. They are all prefixed by a severity + and have the same appearance: + "AFFS: : " + (There's one exception to this, as in that function + is no pointer to the super block available.) +- The filesystem is remounted read-only after an + error. +- The names of newly created filesystem objects are + now checked for validity. +- Minor cleanups in comments. +- Added this Changes file. At last! + +Version 3.2 +----------- + +- Extension block cache: Reading/writing of huge files + (several MB) is much faster (of course the added + overhead slows down opening, but this is hardly + noticeable). +- The same get_block()-routine can now be used for + both OFS and FFS. +- The super block is now searched in the block that + was calculated and in the one following. This + should remedy the round-off error introduced by + the 1-k blocks that Linux uses. +- Minor changes to adhere to the new VFS interface. +- The number of used blocks is now also calculated + if the filesystem is mounted read-only. +- Prefixed some constants with AFFS_ to avoid name + clashes. +- Removed 'EXPERIMENTAL' status. + +Version 3.1 +----------- + +- Fixed a nasty bug which didn't allow read-only + mounts. +- Allow dir-cache filesystems to be mounted + read only. +- OFS support. +- Several other changes I just cannot remember + any more. + +Version 3.0 +----------- + +- Almost complete rewrite for the new VFS + interface in Linux 1.3. +- Write support. +- Support for hard and symbolic links. +- Lots of things I remember even less ... + +Version 2.0 +----------- + +- Fixed a few things to get it compiled. +- Automatic root block calculation. +- Partition checker for genhd.c + +======================================== + +Let's just call Ray Burr's original affs +'Version 1.0'. diff --git a/kernel/fs/affs/Kconfig b/kernel/fs/affs/Kconfig new file mode 100644 index 000000000..a04d9e848 --- /dev/null +++ b/kernel/fs/affs/Kconfig @@ -0,0 +1,21 @@ +config AFFS_FS + tristate "Amiga FFS file system support" + depends on BLOCK + help + The Fast File System (FFS) is the common file system used on hard + disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y + if you want to be able to read and write files from and to an Amiga + FFS partition on your hard drive. Amiga floppies however cannot be + read with this driver due to an incompatibility of the floppy + controller used in an Amiga and the standard floppy controller in + PCs and workstations. Read + and . + + With this driver you can also mount disk files used by Bernd + Schmidt's Un*X Amiga Emulator + (). + If you want to do this, you will also need to say Y or M to "Loop + device support", above. + + To compile this file system support as a module, choose M here: the + module will be called affs. If unsure, say N. diff --git a/kernel/fs/affs/Makefile b/kernel/fs/affs/Makefile new file mode 100644 index 000000000..3988b4a78 --- /dev/null +++ b/kernel/fs/affs/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux affs filesystem routines. +# + +#ccflags-y := -DDEBUG=1 + +obj-$(CONFIG_AFFS_FS) += affs.o + +affs-objs := super.o namei.o inode.o file.o dir.o amigaffs.o bitmap.o symlink.o diff --git a/kernel/fs/affs/affs.h b/kernel/fs/affs/affs.h new file mode 100644 index 000000000..cffe8370f --- /dev/null +++ b/kernel/fs/affs/affs.h @@ -0,0 +1,314 @@ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +/* Ugly macros make the code more pretty. */ + +#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) +#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey]) +#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) + +#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) +#define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail))) +#define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data) +#define AFFS_ROOT_TAIL(sb, bh) ((struct affs_root_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_root_tail))) +#define AFFS_DATA_HEAD(bh) ((struct affs_data_head *)(bh)->b_data) +#define AFFS_DATA(bh) (((struct affs_data_head *)(bh)->b_data)->data) + +#define AFFS_CACHE_SIZE PAGE_SIZE + +#define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) +#define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) +#define AFFS_AC_MASK (AFFS_AC_SIZE-1) + +#define AFFSNAMEMAX 30U + +struct affs_ext_key { + u32 ext; /* idx of the extended block */ + u32 key; /* block number */ +}; + +/* + * affs fs inode data in memory + */ +struct affs_inode_info { + atomic_t i_opencnt; + struct semaphore i_link_lock; /* Protects internal inode access. */ + struct semaphore i_ext_lock; /* Protects internal inode access. */ +#define i_hash_lock i_ext_lock + u32 i_blkcnt; /* block count */ + u32 i_extcnt; /* extended block count */ + u32 *i_lc; /* linear cache of extended blocks */ + u32 i_lc_size; + u32 i_lc_shift; + u32 i_lc_mask; + struct affs_ext_key *i_ac; /* associative cache of extended blocks */ + u32 i_ext_last; /* last accessed extended block */ + struct buffer_head *i_ext_bh; /* bh of last extended block */ + loff_t mmu_private; + u32 i_protect; /* unused attribute bits */ + u32 i_lastalloc; /* last allocated block */ + int i_pa_cnt; /* number of preallocated blocks */ + struct inode vfs_inode; +}; + +/* short cut to get to the affs specific inode data */ +static inline struct affs_inode_info *AFFS_I(struct inode *inode) +{ + return list_entry(inode, struct affs_inode_info, vfs_inode); +} + +/* + * super-block data in memory + * + * Block numbers are adjusted for their actual size + * + */ + +struct affs_bm_info { + u32 bm_key; /* Disk block number */ + u32 bm_free; /* Free blocks in here */ +}; + +struct affs_sb_info { + int s_partition_size; /* Partition size in blocks. */ + int s_reserved; /* Number of reserved blocks. */ + //u32 s_blksize; /* Initial device blksize */ + u32 s_data_blksize; /* size of the data block w/o header */ + u32 s_root_block; /* FFS root block number. */ + int s_hashsize; /* Size of hash table. */ + unsigned long s_flags; /* See below. */ + kuid_t s_uid; /* uid to override */ + kgid_t s_gid; /* gid to override */ + umode_t s_mode; /* mode to override */ + struct buffer_head *s_root_bh; /* Cached root block. */ + struct mutex s_bmlock; /* Protects bitmap access. */ + struct affs_bm_info *s_bitmap; /* Bitmap infos. */ + u32 s_bmap_count; /* # of bitmap blocks. */ + u32 s_bmap_bits; /* # of bits in one bitmap blocks */ + u32 s_last_bmap; + struct buffer_head *s_bmap_bh; + char *s_prefix; /* Prefix for volumes and assigns. */ + char s_volume[32]; /* Volume prefix for absolute symlinks. */ + spinlock_t symlink_lock; /* protects the previous two */ + struct super_block *sb; /* the VFS superblock object */ + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work sb_work; /* superblock flush delayed work */ + spinlock_t work_lock; /* protects sb_work and work_queued */ +}; + +#define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */ +#define AFFS_MOUNT_SF_BM_VALID 0x0002 /* Bitmap is valid. */ +#define AFFS_MOUNT_SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */ +#define AFFS_MOUNT_SF_QUIET 0x0008 /* chmod errors will be not reported */ +#define AFFS_MOUNT_SF_SETUID 0x0010 /* Ignore Amiga uid */ +#define AFFS_MOUNT_SF_SETGID 0x0020 /* Ignore Amiga gid */ +#define AFFS_MOUNT_SF_SETMODE 0x0040 /* Ignore Amiga protection bits */ +#define AFFS_MOUNT_SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */ +#define AFFS_MOUNT_SF_OFS 0x0200 /* Old filesystem */ +#define AFFS_MOUNT_SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ +#define AFFS_MOUNT_SF_VERBOSE 0x0800 /* Talk about fs when mounting */ +#define AFFS_MOUNT_SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ + +#define affs_clear_opt(o, opt) (o &= ~AFFS_MOUNT_##opt) +#define affs_set_opt(o, opt) (o |= AFFS_MOUNT_##opt) +#define affs_test_opt(o, opt) ((o) & AFFS_MOUNT_##opt) + +/* short cut to get to the affs specific sb data */ +static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +void affs_mark_sb_dirty(struct super_block *sb); + +/* amigaffs.c */ + +extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh); +extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh); +extern int affs_remove_header(struct dentry *dentry); +extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); +extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); +extern void secs_to_datestamp(time_t secs, struct affs_date *ds); +extern umode_t prot_to_mode(u32 prot); +extern void mode_to_prot(struct inode *inode); +__printf(3, 4) +extern void affs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +__printf(3, 4) +extern void affs_warning(struct super_block *sb, const char *function, + const char *fmt, ...); +extern bool affs_nofilenametruncate(const struct dentry *dentry); +extern int affs_check_name(const unsigned char *name, int len, + bool notruncate); +extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); + +/* bitmap. c */ + +extern u32 affs_count_free_blocks(struct super_block *s); +extern void affs_free_block(struct super_block *sb, u32 block); +extern u32 affs_alloc_block(struct inode *inode, u32 goal); +extern int affs_init_bitmap(struct super_block *sb, int *flags); +extern void affs_free_bitmap(struct super_block *sb); + +/* namei.c */ + +extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); +extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); +extern int affs_unlink(struct inode *dir, struct dentry *dentry); +extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool); +extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); +extern int affs_rmdir(struct inode *dir, struct dentry *dentry); +extern int affs_link(struct dentry *olddentry, struct inode *dir, + struct dentry *dentry); +extern int affs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname); +extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); + +/* inode.c */ + +extern unsigned long affs_parent_ino(struct inode *dir); +extern struct inode *affs_new_inode(struct inode *dir); +extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); +extern void affs_evict_inode(struct inode *inode); +extern struct inode *affs_iget(struct super_block *sb, + unsigned long ino); +extern int affs_write_inode(struct inode *inode, + struct writeback_control *wbc); +extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); + +/* file.c */ + +void affs_free_prealloc(struct inode *inode); +extern void affs_truncate(struct inode *); +int affs_file_fsync(struct file *, loff_t, loff_t, int); + +/* dir.c */ + +extern void affs_dir_truncate(struct inode *); + +/* jump tables */ + +extern const struct inode_operations affs_file_inode_operations; +extern const struct inode_operations affs_dir_inode_operations; +extern const struct inode_operations affs_symlink_inode_operations; +extern const struct file_operations affs_file_operations; +extern const struct file_operations affs_file_operations_ofs; +extern const struct file_operations affs_dir_operations; +extern const struct address_space_operations affs_symlink_aops; +extern const struct address_space_operations affs_aops; +extern const struct address_space_operations affs_aops_ofs; + +extern const struct dentry_operations affs_dentry_operations; +extern const struct dentry_operations affs_intl_dentry_operations; + +static inline void +affs_set_blocksize(struct super_block *sb, int size) +{ + sb_set_blocksize(sb, size); +} +static inline struct buffer_head * +affs_bread(struct super_block *sb, int block) +{ + pr_debug("%s: %d\n", __func__, block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) + return sb_bread(sb, block); + return NULL; +} +static inline struct buffer_head * +affs_getblk(struct super_block *sb, int block) +{ + pr_debug("%s: %d\n", __func__, block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) + return sb_getblk(sb, block); + return NULL; +} +static inline struct buffer_head * +affs_getzeroblk(struct super_block *sb, int block) +{ + struct buffer_head *bh; + pr_debug("%s: %d\n", __func__, block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { + bh = sb_getblk(sb, block); + lock_buffer(bh); + memset(bh->b_data, 0 , sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + return bh; + } + return NULL; +} +static inline struct buffer_head * +affs_getemptyblk(struct super_block *sb, int block) +{ + struct buffer_head *bh; + pr_debug("%s: %d\n", __func__, block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { + bh = sb_getblk(sb, block); + wait_on_buffer(bh); + set_buffer_uptodate(bh); + return bh; + } + return NULL; +} +static inline void +affs_brelse(struct buffer_head *bh) +{ + if (bh) + pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr); + brelse(bh); +} + +static inline void +affs_adjust_checksum(struct buffer_head *bh, u32 val) +{ + u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[5]); + ((__be32 *)bh->b_data)[5] = cpu_to_be32(tmp - val); +} +static inline void +affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val) +{ + u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[0]); + ((__be32 *)bh->b_data)[0] = cpu_to_be32(tmp - val); +} + +static inline void +affs_lock_link(struct inode *inode) +{ + down(&AFFS_I(inode)->i_link_lock); +} +static inline void +affs_unlock_link(struct inode *inode) +{ + up(&AFFS_I(inode)->i_link_lock); +} +static inline void +affs_lock_dir(struct inode *inode) +{ + down(&AFFS_I(inode)->i_hash_lock); +} +static inline void +affs_unlock_dir(struct inode *inode) +{ + up(&AFFS_I(inode)->i_hash_lock); +} +static inline void +affs_lock_ext(struct inode *inode) +{ + down(&AFFS_I(inode)->i_ext_lock); +} +static inline void +affs_unlock_ext(struct inode *inode) +{ + up(&AFFS_I(inode)->i_ext_lock); +} diff --git a/kernel/fs/affs/amigaffs.c b/kernel/fs/affs/amigaffs.c new file mode 100644 index 000000000..a8f463c02 --- /dev/null +++ b/kernel/fs/affs/amigaffs.c @@ -0,0 +1,515 @@ +/* + * linux/fs/affs/amigaffs.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Amiga FFS filesystem. + * + * Please send bug reports to: hjw@zvw.de + */ + +#include "affs.h" + +/* + * Functions for accessing Amiga-FFS structures. + */ + + +/* Insert a header block bh into the directory dir + * caller must hold AFFS_DIR->i_hash_lock! + */ + +int +affs_insert_hash(struct inode *dir, struct buffer_head *bh) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *dir_bh; + u32 ino, hash_ino; + int offset; + + ino = bh->b_blocknr; + offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); + + pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino); + + dir_bh = affs_bread(sb, dir->i_ino); + if (!dir_bh) + return -EIO; + + hash_ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[offset]); + while (hash_ino) { + affs_brelse(dir_bh); + dir_bh = affs_bread(sb, hash_ino); + if (!dir_bh) + return -EIO; + hash_ino = be32_to_cpu(AFFS_TAIL(sb, dir_bh)->hash_chain); + } + AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino); + AFFS_TAIL(sb, bh)->hash_chain = 0; + affs_fix_checksum(sb, bh); + + if (dir->i_ino == dir_bh->b_blocknr) + AFFS_HEAD(dir_bh)->table[offset] = cpu_to_be32(ino); + else + AFFS_TAIL(sb, dir_bh)->hash_chain = cpu_to_be32(ino); + + affs_adjust_checksum(dir_bh, ino); + mark_buffer_dirty_inode(dir_bh, dir); + affs_brelse(dir_bh); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_version++; + mark_inode_dirty(dir); + + return 0; +} + +/* Remove a header block from its directory. + * caller must hold AFFS_DIR->i_hash_lock! + */ + +int +affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) +{ + struct super_block *sb; + struct buffer_head *bh; + u32 rem_ino, hash_ino; + __be32 ino; + int offset, retval; + + sb = dir->i_sb; + rem_ino = rem_bh->b_blocknr; + offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); + pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino, + rem_ino, offset); + + bh = affs_bread(sb, dir->i_ino); + if (!bh) + return -EIO; + + retval = -ENOENT; + hash_ino = be32_to_cpu(AFFS_HEAD(bh)->table[offset]); + while (hash_ino) { + if (hash_ino == rem_ino) { + ino = AFFS_TAIL(sb, rem_bh)->hash_chain; + if (dir->i_ino == bh->b_blocknr) + AFFS_HEAD(bh)->table[offset] = ino; + else + AFFS_TAIL(sb, bh)->hash_chain = ino; + affs_adjust_checksum(bh, be32_to_cpu(ino) - hash_ino); + mark_buffer_dirty_inode(bh, dir); + AFFS_TAIL(sb, rem_bh)->parent = 0; + retval = 0; + break; + } + affs_brelse(bh); + bh = affs_bread(sb, hash_ino); + if (!bh) + return -EIO; + hash_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); + } + + affs_brelse(bh); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_version++; + mark_inode_dirty(dir); + + return retval; +} + +static void +affs_fix_dcache(struct inode *inode, u32 entry_ino) +{ + struct dentry *dentry; + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + if (entry_ino == (u32)(long)dentry->d_fsdata) { + dentry->d_fsdata = (void *)inode->i_ino; + break; + } + } + spin_unlock(&inode->i_lock); +} + + +/* Remove header from link chain */ + +static int +affs_remove_link(struct dentry *dentry) +{ + struct inode *dir, *inode = d_inode(dentry); + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL, *link_bh = NULL; + u32 link_ino, ino; + int retval; + + pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + link_ino = (u32)(long)dentry->d_fsdata; + if (inode->i_ino == link_ino) { + /* we can't remove the head of the link, as its blocknr is still used as ino, + * so we remove the block of the first link instead. + */ + link_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain); + link_bh = affs_bread(sb, link_ino); + if (!link_bh) + goto done; + + dir = affs_iget(sb, be32_to_cpu(AFFS_TAIL(sb, link_bh)->parent)); + if (IS_ERR(dir)) { + retval = PTR_ERR(dir); + goto done; + } + + affs_lock_dir(dir); + /* + * if there's a dentry for that block, make it + * refer to inode itself. + */ + affs_fix_dcache(inode, link_ino); + retval = affs_remove_hash(dir, link_bh); + if (retval) { + affs_unlock_dir(dir); + goto done; + } + mark_buffer_dirty_inode(link_bh, inode); + + memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32); + retval = affs_insert_hash(dir, bh); + if (retval) { + affs_unlock_dir(dir); + goto done; + } + mark_buffer_dirty_inode(bh, inode); + + affs_unlock_dir(dir); + iput(dir); + } else { + link_bh = affs_bread(sb, link_ino); + if (!link_bh) + goto done; + } + + while ((ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain)) != 0) { + if (ino == link_ino) { + __be32 ino2 = AFFS_TAIL(sb, link_bh)->link_chain; + AFFS_TAIL(sb, bh)->link_chain = ino2; + affs_adjust_checksum(bh, be32_to_cpu(ino2) - link_ino); + mark_buffer_dirty_inode(bh, inode); + retval = 0; + /* Fix the link count, if bh is a normal header block without links */ + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + case ST_LINKDIR: + case ST_LINKFILE: + break; + default: + if (!AFFS_TAIL(sb, bh)->link_chain) + set_nlink(inode, 1); + } + affs_free_block(sb, link_ino); + goto done; + } + affs_brelse(bh); + bh = affs_bread(sb, ino); + if (!bh) + goto done; + } + retval = -ENOENT; +done: + affs_brelse(link_bh); + affs_brelse(bh); + return retval; +} + + +static int +affs_empty_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + int retval, size; + + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + retval = -ENOTEMPTY; + for (size = AFFS_SB(sb)->s_hashsize - 1; size >= 0; size--) + if (AFFS_HEAD(bh)->table[size]) + goto not_empty; + retval = 0; +not_empty: + affs_brelse(bh); +done: + return retval; +} + + +/* Remove a filesystem object. If the object to be removed has + * links to it, one of the links must be changed to inherit + * the file or directory. As above, any inode will do. + * The buffer will not be freed. If the header is a link, the + * block will be marked as free. + * This function returns a negative error number in case of + * an error, else 0 if the inode is to be deleted or 1 if not. + */ + +int +affs_remove_header(struct dentry *dentry) +{ + struct super_block *sb; + struct inode *inode, *dir; + struct buffer_head *bh = NULL; + int retval; + + dir = d_inode(dentry->d_parent); + sb = dir->i_sb; + + retval = -ENOENT; + inode = d_inode(dentry); + if (!inode) + goto done; + + pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); + retval = -EIO; + bh = affs_bread(sb, (u32)(long)dentry->d_fsdata); + if (!bh) + goto done; + + affs_lock_link(inode); + affs_lock_dir(dir); + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + case ST_USERDIR: + /* if we ever want to support links to dirs + * i_hash_lock of the inode must only be + * taken after some checks + */ + affs_lock_dir(inode); + retval = affs_empty_dir(inode); + affs_unlock_dir(inode); + if (retval) + goto done_unlock; + break; + default: + break; + } + + retval = affs_remove_hash(dir, bh); + if (retval) + goto done_unlock; + mark_buffer_dirty_inode(bh, inode); + + affs_unlock_dir(dir); + + if (inode->i_nlink > 1) + retval = affs_remove_link(dentry); + else + clear_nlink(inode); + affs_unlock_link(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + +done: + affs_brelse(bh); + return retval; + +done_unlock: + affs_unlock_dir(dir); + affs_unlock_link(inode); + goto done; +} + +/* Checksum a block, do various consistency checks and optionally return + the blocks type number. DATA points to the block. If their pointers + are non-null, *PTYPE and *STYPE are set to the primary and secondary + block types respectively, *HASHSIZE is set to the size of the hashtable + (which lets us calculate the block size). + Returns non-zero if the block is not consistent. */ + +u32 +affs_checksum_block(struct super_block *sb, struct buffer_head *bh) +{ + __be32 *ptr = (__be32 *)bh->b_data; + u32 sum; + int bsize; + + sum = 0; + for (bsize = sb->s_blocksize / sizeof(__be32); bsize > 0; bsize--) + sum += be32_to_cpu(*ptr++); + return sum; +} + +/* + * Calculate the checksum of a disk block and store it + * at the indicated position. + */ + +void +affs_fix_checksum(struct super_block *sb, struct buffer_head *bh) +{ + int cnt = sb->s_blocksize / sizeof(__be32); + __be32 *ptr = (__be32 *)bh->b_data; + u32 checksum; + __be32 *checksumptr; + + checksumptr = ptr + 5; + *checksumptr = 0; + for (checksum = 0; cnt > 0; ptr++, cnt--) + checksum += be32_to_cpu(*ptr); + *checksumptr = cpu_to_be32(-checksum); +} + +void +secs_to_datestamp(time_t secs, struct affs_date *ds) +{ + u32 days; + u32 minute; + + secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60); + if (secs < 0) + secs = 0; + days = secs / 86400; + secs -= days * 86400; + minute = secs / 60; + secs -= minute * 60; + + ds->days = cpu_to_be32(days); + ds->mins = cpu_to_be32(minute); + ds->ticks = cpu_to_be32(secs * 50); +} + +umode_t +prot_to_mode(u32 prot) +{ + umode_t mode = 0; + + if (!(prot & FIBF_NOWRITE)) + mode |= S_IWUSR; + if (!(prot & FIBF_NOREAD)) + mode |= S_IRUSR; + if (!(prot & FIBF_NOEXECUTE)) + mode |= S_IXUSR; + if (prot & FIBF_GRP_WRITE) + mode |= S_IWGRP; + if (prot & FIBF_GRP_READ) + mode |= S_IRGRP; + if (prot & FIBF_GRP_EXECUTE) + mode |= S_IXGRP; + if (prot & FIBF_OTR_WRITE) + mode |= S_IWOTH; + if (prot & FIBF_OTR_READ) + mode |= S_IROTH; + if (prot & FIBF_OTR_EXECUTE) + mode |= S_IXOTH; + + return mode; +} + +void +mode_to_prot(struct inode *inode) +{ + u32 prot = AFFS_I(inode)->i_protect; + umode_t mode = inode->i_mode; + + if (!(mode & S_IXUSR)) + prot |= FIBF_NOEXECUTE; + if (!(mode & S_IRUSR)) + prot |= FIBF_NOREAD; + if (!(mode & S_IWUSR)) + prot |= FIBF_NOWRITE; + if (mode & S_IXGRP) + prot |= FIBF_GRP_EXECUTE; + if (mode & S_IRGRP) + prot |= FIBF_GRP_READ; + if (mode & S_IWGRP) + prot |= FIBF_GRP_WRITE; + if (mode & S_IXOTH) + prot |= FIBF_OTR_EXECUTE; + if (mode & S_IROTH) + prot |= FIBF_OTR_READ; + if (mode & S_IWOTH) + prot |= FIBF_OTR_WRITE; + + AFFS_I(inode)->i_protect = prot; +} + +void +affs_error(struct super_block *sb, const char *function, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); + if (!(sb->s_flags & MS_RDONLY)) + pr_warn("Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + va_end(args); +} + +void +affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf); + va_end(args); +} + +bool +affs_nofilenametruncate(const struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE); +} + +/* Check if the name is valid for a affs object. */ + +int +affs_check_name(const unsigned char *name, int len, bool notruncate) +{ + int i; + + if (len > AFFSNAMEMAX) { + if (notruncate) + return -ENAMETOOLONG; + len = AFFSNAMEMAX; + } + for (i = 0; i < len; i++) { + if (name[i] < ' ' || name[i] == ':' + || (name[i] > 0x7e && name[i] < 0xa0)) + return -EINVAL; + } + + return 0; +} + +/* This function copies name to bstr, with at most 30 + * characters length. The bstr will be prepended by + * a length byte. + * NOTE: The name will must be already checked by + * affs_check_name()! + */ + +int +affs_copy_name(unsigned char *bstr, struct dentry *dentry) +{ + u32 len = min(dentry->d_name.len, AFFSNAMEMAX); + + *bstr++ = len; + memcpy(bstr, dentry->d_name.name, len); + return len; +} diff --git a/kernel/fs/affs/bitmap.c b/kernel/fs/affs/bitmap.c new file mode 100644 index 000000000..675148950 --- /dev/null +++ b/kernel/fs/affs/bitmap.c @@ -0,0 +1,364 @@ +/* + * linux/fs/affs/bitmap.c + * + * (c) 1996 Hans-Joachim Widmaier + * + * bitmap.c contains the code that handles all bitmap related stuff - + * block allocation, deallocation, calculation of free space. + */ + +#include +#include "affs.h" + +u32 +affs_count_free_blocks(struct super_block *sb) +{ + struct affs_bm_info *bm; + u32 free; + int i; + + pr_debug("%s()\n", __func__); + + if (sb->s_flags & MS_RDONLY) + return 0; + + mutex_lock(&AFFS_SB(sb)->s_bmlock); + + bm = AFFS_SB(sb)->s_bitmap; + free = 0; + for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--) + free += bm->bm_free; + + mutex_unlock(&AFFS_SB(sb)->s_bmlock); + + return free; +} + +void +affs_free_block(struct super_block *sb, u32 block) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct affs_bm_info *bm; + struct buffer_head *bh; + u32 blk, bmap, bit, mask, tmp; + __be32 *data; + + pr_debug("%s(%u)\n", __func__, block); + + if (block > sbi->s_partition_size) + goto err_range; + + blk = block - sbi->s_reserved; + bmap = blk / sbi->s_bmap_bits; + bit = blk % sbi->s_bmap_bits; + bm = &sbi->s_bitmap[bmap]; + + mutex_lock(&sbi->s_bmlock); + + bh = sbi->s_bmap_bh; + if (sbi->s_last_bmap != bmap) { + affs_brelse(bh); + bh = affs_bread(sb, bm->bm_key); + if (!bh) + goto err_bh_read; + sbi->s_bmap_bh = bh; + sbi->s_last_bmap = bmap; + } + + mask = 1 << (bit & 31); + data = (__be32 *)bh->b_data + bit / 32 + 1; + + /* mark block free */ + tmp = be32_to_cpu(*data); + if (tmp & mask) + goto err_free; + *data = cpu_to_be32(tmp | mask); + + /* fix checksum */ + tmp = be32_to_cpu(*(__be32 *)bh->b_data); + *(__be32 *)bh->b_data = cpu_to_be32(tmp - mask); + + mark_buffer_dirty(bh); + affs_mark_sb_dirty(sb); + bm->bm_free++; + + mutex_unlock(&sbi->s_bmlock); + return; + +err_free: + affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block); + mutex_unlock(&sbi->s_bmlock); + return; + +err_bh_read: + affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; + mutex_unlock(&sbi->s_bmlock); + return; + +err_range: + affs_error(sb, "affs_free_block","Block %u outside partition", block); +} + +/* + * Allocate a block in the given allocation zone. + * Since we have to byte-swap the bitmap on little-endian + * machines, this is rather expensive. Therefore we will + * preallocate up to 16 blocks from the same word, if + * possible. We are not doing preallocations in the + * header zone, though. + */ + +u32 +affs_alloc_block(struct inode *inode, u32 goal) +{ + struct super_block *sb; + struct affs_sb_info *sbi; + struct affs_bm_info *bm; + struct buffer_head *bh; + __be32 *data, *enddata; + u32 blk, bmap, bit, mask, mask2, tmp; + int i; + + sb = inode->i_sb; + sbi = AFFS_SB(sb); + + pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); + + if (AFFS_I(inode)->i_pa_cnt) { + pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1); + AFFS_I(inode)->i_pa_cnt--; + return ++AFFS_I(inode)->i_lastalloc; + } + + if (!goal || goal > sbi->s_partition_size) { + if (goal) + affs_warning(sb, "affs_balloc", "invalid goal %d", goal); + //if (!AFFS_I(inode)->i_last_block) + // affs_warning(sb, "affs_balloc", "no last alloc block"); + goal = sbi->s_reserved; + } + + blk = goal - sbi->s_reserved; + bmap = blk / sbi->s_bmap_bits; + bm = &sbi->s_bitmap[bmap]; + + mutex_lock(&sbi->s_bmlock); + + if (bm->bm_free) + goto find_bmap_bit; + +find_bmap: + /* search for the next bmap buffer with free bits */ + i = sbi->s_bmap_count; + do { + if (--i < 0) + goto err_full; + bmap++; + bm++; + if (bmap < sbi->s_bmap_count) + continue; + /* restart search at zero */ + bmap = 0; + bm = sbi->s_bitmap; + } while (!bm->bm_free); + blk = bmap * sbi->s_bmap_bits; + +find_bmap_bit: + + bh = sbi->s_bmap_bh; + if (sbi->s_last_bmap != bmap) { + affs_brelse(bh); + bh = affs_bread(sb, bm->bm_key); + if (!bh) + goto err_bh_read; + sbi->s_bmap_bh = bh; + sbi->s_last_bmap = bmap; + } + + /* find an unused block in this bitmap block */ + bit = blk % sbi->s_bmap_bits; + data = (__be32 *)bh->b_data + bit / 32 + 1; + enddata = (__be32 *)((u8 *)bh->b_data + sb->s_blocksize); + mask = ~0UL << (bit & 31); + blk &= ~31UL; + + tmp = be32_to_cpu(*data); + if (tmp & mask) + goto find_bit; + + /* scan the rest of the buffer */ + do { + blk += 32; + if (++data >= enddata) + /* didn't find something, can only happen + * if scan didn't start at 0, try next bmap + */ + goto find_bmap; + } while (!*data); + tmp = be32_to_cpu(*data); + mask = ~0; + +find_bit: + /* finally look for a free bit in the word */ + bit = ffs(tmp & mask) - 1; + blk += bit + sbi->s_reserved; + mask2 = mask = 1 << (bit & 31); + AFFS_I(inode)->i_lastalloc = blk; + + /* prealloc as much as possible within this word */ + while ((mask2 <<= 1)) { + if (!(tmp & mask2)) + break; + AFFS_I(inode)->i_pa_cnt++; + mask |= mask2; + } + bm->bm_free -= AFFS_I(inode)->i_pa_cnt + 1; + + *data = cpu_to_be32(tmp & ~mask); + + /* fix checksum */ + tmp = be32_to_cpu(*(__be32 *)bh->b_data); + *(__be32 *)bh->b_data = cpu_to_be32(tmp + mask); + + mark_buffer_dirty(bh); + affs_mark_sb_dirty(sb); + + mutex_unlock(&sbi->s_bmlock); + + pr_debug("%d\n", blk); + return blk; + +err_bh_read: + affs_error(sb,"affs_read_block","Cannot read bitmap block %u", bm->bm_key); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; +err_full: + mutex_unlock(&sbi->s_bmlock); + pr_debug("failed\n"); + return 0; +} + +int affs_init_bitmap(struct super_block *sb, int *flags) +{ + struct affs_bm_info *bm; + struct buffer_head *bmap_bh = NULL, *bh = NULL; + __be32 *bmap_blk; + u32 size, blk, end, offset, mask; + int i, res = 0; + struct affs_sb_info *sbi = AFFS_SB(sb); + + if (*flags & MS_RDONLY) + return 0; + + if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { + pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id); + *flags |= MS_RDONLY; + return 0; + } + + sbi->s_last_bmap = ~0; + sbi->s_bmap_bh = NULL; + sbi->s_bmap_bits = sb->s_blocksize * 8 - 32; + sbi->s_bmap_count = (sbi->s_partition_size - sbi->s_reserved + + sbi->s_bmap_bits - 1) / sbi->s_bmap_bits; + size = sbi->s_bmap_count * sizeof(*bm); + bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL); + if (!sbi->s_bitmap) { + pr_err("Bitmap allocation failed\n"); + return -ENOMEM; + } + + bmap_blk = (__be32 *)sbi->s_root_bh->b_data; + blk = sb->s_blocksize / 4 - 49; + end = blk + 25; + + for (i = sbi->s_bmap_count; i > 0; bm++, i--) { + affs_brelse(bh); + + bm->bm_key = be32_to_cpu(bmap_blk[blk]); + bh = affs_bread(sb, bm->bm_key); + if (!bh) { + pr_err("Cannot read bitmap\n"); + res = -EIO; + goto out; + } + if (affs_checksum_block(sb, bh)) { + pr_warn("Bitmap %u invalid - mounting %s read only.\n", + bm->bm_key, sb->s_id); + *flags |= MS_RDONLY; + goto out; + } + pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key); + bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); + + /* Don't try read the extension if this is the last block, + * but we also need the right bm pointer below + */ + if (++blk < end || i == 1) + continue; + if (bmap_bh) + affs_brelse(bmap_bh); + bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk])); + if (!bmap_bh) { + pr_err("Cannot read bitmap extension\n"); + res = -EIO; + goto out; + } + bmap_blk = (__be32 *)bmap_bh->b_data; + blk = 0; + end = sb->s_blocksize / 4 - 1; + } + + offset = (sbi->s_partition_size - sbi->s_reserved) % sbi->s_bmap_bits; + mask = ~(0xFFFFFFFFU << (offset & 31)); + pr_debug("last word: %d %d %d\n", offset, offset / 32 + 1, mask); + offset = offset / 32 + 1; + + if (mask) { + u32 old, new; + + /* Mark unused bits in the last word as allocated */ + old = be32_to_cpu(((__be32 *)bh->b_data)[offset]); + new = old & mask; + //if (old != new) { + ((__be32 *)bh->b_data)[offset] = cpu_to_be32(new); + /* fix checksum */ + //new -= old; + //old = be32_to_cpu(*(__be32 *)bh->b_data); + //*(__be32 *)bh->b_data = cpu_to_be32(old - new); + //mark_buffer_dirty(bh); + //} + /* correct offset for the bitmap count below */ + //offset++; + } + while (++offset < sb->s_blocksize / 4) + ((__be32 *)bh->b_data)[offset] = 0; + ((__be32 *)bh->b_data)[0] = 0; + ((__be32 *)bh->b_data)[0] = cpu_to_be32(-affs_checksum_block(sb, bh)); + mark_buffer_dirty(bh); + + /* recalculate bitmap count for last block */ + bm--; + bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); + +out: + affs_brelse(bh); + affs_brelse(bmap_bh); + return res; +} + +void affs_free_bitmap(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + + if (!sbi->s_bitmap) + return; + + affs_brelse(sbi->s_bmap_bh); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; + kfree(sbi->s_bitmap); + sbi->s_bitmap = NULL; +} diff --git a/kernel/fs/affs/dir.c b/kernel/fs/affs/dir.c new file mode 100644 index 000000000..ac4f318aa --- /dev/null +++ b/kernel/fs/affs/dir.c @@ -0,0 +1,142 @@ +/* + * linux/fs/affs/dir.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * affs directory handling functions + * + */ + +#include "affs.h" + +static int affs_readdir(struct file *, struct dir_context *); + +const struct file_operations affs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .iterate = affs_readdir, + .fsync = affs_file_fsync, +}; + +/* + * directories can handle most operations... + */ +const struct inode_operations affs_dir_inode_operations = { + .create = affs_create, + .lookup = affs_lookup, + .link = affs_link, + .unlink = affs_unlink, + .symlink = affs_symlink, + .mkdir = affs_mkdir, + .rmdir = affs_rmdir, + .rename = affs_rename, + .setattr = affs_notify_change, +}; + +static int +affs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct buffer_head *dir_bh = NULL; + struct buffer_head *fh_bh = NULL; + unsigned char *name; + int namelen; + u32 i; + int hash_pos; + int chain_pos; + u32 ino; + int error = 0; + + pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); + + if (ctx->pos < 2) { + file->private_data = (void *)0; + if (!dir_emit_dots(file, ctx)) + return 0; + } + + affs_lock_dir(inode); + chain_pos = (ctx->pos - 2) & 0xffff; + hash_pos = (ctx->pos - 2) >> 16; + if (chain_pos == 0xffff) { + affs_warning(sb, "readdir", "More than 65535 entries in chain"); + chain_pos = 0; + hash_pos++; + ctx->pos = ((hash_pos << 16) | chain_pos) + 2; + } + dir_bh = affs_bread(sb, inode->i_ino); + if (!dir_bh) + goto out_unlock_dir; + + /* If the directory hasn't changed since the last call to readdir(), + * we can jump directly to where we left off. + */ + ino = (u32)(long)file->private_data; + if (ino && file->f_version == inode->i_version) { + pr_debug("readdir() left off=%d\n", ino); + goto inside; + } + + ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); + for (i = 0; ino && i < chain_pos; i++) { + fh_bh = affs_bread(sb, ino); + if (!fh_bh) { + affs_error(sb, "readdir","Cannot read block %d", i); + error = -EIO; + goto out_brelse_dir; + } + ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); + affs_brelse(fh_bh); + fh_bh = NULL; + } + if (ino) + goto inside; + hash_pos++; + + for (; hash_pos < AFFS_SB(sb)->s_hashsize; hash_pos++) { + ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); + if (!ino) + continue; + ctx->pos = (hash_pos << 16) + 2; +inside: + do { + fh_bh = affs_bread(sb, ino); + if (!fh_bh) { + affs_error(sb, "readdir", + "Cannot read block %d", ino); + break; + } + + namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], + (u8)AFFSNAMEMAX); + name = AFFS_TAIL(sb, fh_bh)->name + 1; + pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n", + namelen, name, ino, hash_pos, ctx->pos); + + if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) + goto done; + ctx->pos++; + ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); + affs_brelse(fh_bh); + fh_bh = NULL; + } while (ino); + } +done: + file->f_version = inode->i_version; + file->private_data = (void *)(long)ino; + affs_brelse(fh_bh); + +out_brelse_dir: + affs_brelse(dir_bh); + +out_unlock_dir: + affs_unlock_dir(inode); + return error; +} diff --git a/kernel/fs/affs/file.c b/kernel/fs/affs/file.c new file mode 100644 index 000000000..659c579c4 --- /dev/null +++ b/kernel/fs/affs/file.c @@ -0,0 +1,982 @@ +/* + * linux/fs/affs/file.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * affs regular file handling primitives + */ + +#include +#include "affs.h" + +static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); + +static int +affs_file_open(struct inode *inode, struct file *filp) +{ + pr_debug("open(%lu,%d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + atomic_inc(&AFFS_I(inode)->i_opencnt); + return 0; +} + +static int +affs_file_release(struct inode *inode, struct file *filp) +{ + pr_debug("release(%lu, %d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + + if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { + mutex_lock(&inode->i_mutex); + if (inode->i_size != AFFS_I(inode)->mmu_private) + affs_truncate(inode); + affs_free_prealloc(inode); + mutex_unlock(&inode->i_mutex); + } + + return 0; +} + +static int +affs_grow_extcache(struct inode *inode, u32 lc_idx) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + u32 lc_max; + int i, j, key; + + if (!AFFS_I(inode)->i_lc) { + char *ptr = (char *)get_zeroed_page(GFP_NOFS); + if (!ptr) + return -ENOMEM; + AFFS_I(inode)->i_lc = (u32 *)ptr; + AFFS_I(inode)->i_ac = (struct affs_ext_key *)(ptr + AFFS_CACHE_SIZE / 2); + } + + lc_max = AFFS_LC_SIZE << AFFS_I(inode)->i_lc_shift; + + if (AFFS_I(inode)->i_extcnt > lc_max) { + u32 lc_shift, lc_mask, tmp, off; + + /* need to recalculate linear cache, start from old size */ + lc_shift = AFFS_I(inode)->i_lc_shift; + tmp = (AFFS_I(inode)->i_extcnt / AFFS_LC_SIZE) >> lc_shift; + for (; tmp; tmp >>= 1) + lc_shift++; + lc_mask = (1 << lc_shift) - 1; + + /* fix idx and old size to new shift */ + lc_idx >>= (lc_shift - AFFS_I(inode)->i_lc_shift); + AFFS_I(inode)->i_lc_size >>= (lc_shift - AFFS_I(inode)->i_lc_shift); + + /* first shrink old cache to make more space */ + off = 1 << (lc_shift - AFFS_I(inode)->i_lc_shift); + for (i = 1, j = off; j < AFFS_LC_SIZE; i++, j += off) + AFFS_I(inode)->i_ac[i] = AFFS_I(inode)->i_ac[j]; + + AFFS_I(inode)->i_lc_shift = lc_shift; + AFFS_I(inode)->i_lc_mask = lc_mask; + } + + /* fill cache to the needed index */ + i = AFFS_I(inode)->i_lc_size; + AFFS_I(inode)->i_lc_size = lc_idx + 1; + for (; i <= lc_idx; i++) { + if (!i) { + AFFS_I(inode)->i_lc[0] = inode->i_ino; + continue; + } + key = AFFS_I(inode)->i_lc[i - 1]; + j = AFFS_I(inode)->i_lc_mask + 1; + // unlock cache + for (; j > 0; j--) { + bh = affs_bread(sb, key); + if (!bh) + goto err; + key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + affs_brelse(bh); + } + // lock cache + AFFS_I(inode)->i_lc[i] = key; + } + + return 0; + +err: + // lock cache + return -EIO; +} + +static struct buffer_head * +affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh; + u32 blocknr, tmp; + + blocknr = affs_alloc_block(inode, bh->b_blocknr); + if (!blocknr) + return ERR_PTR(-ENOSPC); + + new_bh = affs_getzeroblk(sb, blocknr); + if (!new_bh) { + affs_free_block(sb, blocknr); + return ERR_PTR(-EIO); + } + + AFFS_HEAD(new_bh)->ptype = cpu_to_be32(T_LIST); + AFFS_HEAD(new_bh)->key = cpu_to_be32(blocknr); + AFFS_TAIL(sb, new_bh)->stype = cpu_to_be32(ST_FILE); + AFFS_TAIL(sb, new_bh)->parent = cpu_to_be32(inode->i_ino); + affs_fix_checksum(sb, new_bh); + + mark_buffer_dirty_inode(new_bh, inode); + + tmp = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + if (tmp) + affs_warning(sb, "alloc_ext", "previous extension set (%x)", tmp); + AFFS_TAIL(sb, bh)->extension = cpu_to_be32(blocknr); + affs_adjust_checksum(bh, blocknr - tmp); + mark_buffer_dirty_inode(bh, inode); + + AFFS_I(inode)->i_extcnt++; + mark_inode_dirty(inode); + + return new_bh; +} + +static inline struct buffer_head * +affs_get_extblock(struct inode *inode, u32 ext) +{ + /* inline the simplest case: same extended block as last time */ + struct buffer_head *bh = AFFS_I(inode)->i_ext_bh; + if (ext == AFFS_I(inode)->i_ext_last) + get_bh(bh); + else + /* we have to do more (not inlined) */ + bh = affs_get_extblock_slow(inode, ext); + + return bh; +} + +static struct buffer_head * +affs_get_extblock_slow(struct inode *inode, u32 ext) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + u32 ext_key; + u32 lc_idx, lc_off, ac_idx; + u32 tmp, idx; + + if (ext == AFFS_I(inode)->i_ext_last + 1) { + /* read the next extended block from the current one */ + bh = AFFS_I(inode)->i_ext_bh; + ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + if (ext < AFFS_I(inode)->i_extcnt) + goto read_ext; + BUG_ON(ext > AFFS_I(inode)->i_extcnt); + bh = affs_alloc_extblock(inode, bh, ext); + if (IS_ERR(bh)) + return bh; + goto store_ext; + } + + if (ext == 0) { + /* we seek back to the file header block */ + ext_key = inode->i_ino; + goto read_ext; + } + + if (ext >= AFFS_I(inode)->i_extcnt) { + struct buffer_head *prev_bh; + + /* allocate a new extended block */ + BUG_ON(ext > AFFS_I(inode)->i_extcnt); + + /* get previous extended block */ + prev_bh = affs_get_extblock(inode, ext - 1); + if (IS_ERR(prev_bh)) + return prev_bh; + bh = affs_alloc_extblock(inode, prev_bh, ext); + affs_brelse(prev_bh); + if (IS_ERR(bh)) + return bh; + goto store_ext; + } + +again: + /* check if there is an extended cache and whether it's large enough */ + lc_idx = ext >> AFFS_I(inode)->i_lc_shift; + lc_off = ext & AFFS_I(inode)->i_lc_mask; + + if (lc_idx >= AFFS_I(inode)->i_lc_size) { + int err; + + err = affs_grow_extcache(inode, lc_idx); + if (err) + return ERR_PTR(err); + goto again; + } + + /* every n'th key we find in the linear cache */ + if (!lc_off) { + ext_key = AFFS_I(inode)->i_lc[lc_idx]; + goto read_ext; + } + + /* maybe it's still in the associative cache */ + ac_idx = (ext - lc_idx - 1) & AFFS_AC_MASK; + if (AFFS_I(inode)->i_ac[ac_idx].ext == ext) { + ext_key = AFFS_I(inode)->i_ac[ac_idx].key; + goto read_ext; + } + + /* try to find one of the previous extended blocks */ + tmp = ext; + idx = ac_idx; + while (--tmp, --lc_off > 0) { + idx = (idx - 1) & AFFS_AC_MASK; + if (AFFS_I(inode)->i_ac[idx].ext == tmp) { + ext_key = AFFS_I(inode)->i_ac[idx].key; + goto find_ext; + } + } + + /* fall back to the linear cache */ + ext_key = AFFS_I(inode)->i_lc[lc_idx]; +find_ext: + /* read all extended blocks until we find the one we need */ + //unlock cache + do { + bh = affs_bread(sb, ext_key); + if (!bh) + goto err_bread; + ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + affs_brelse(bh); + tmp++; + } while (tmp < ext); + //lock cache + + /* store it in the associative cache */ + // recalculate ac_idx? + AFFS_I(inode)->i_ac[ac_idx].ext = ext; + AFFS_I(inode)->i_ac[ac_idx].key = ext_key; + +read_ext: + /* finally read the right extended block */ + //unlock cache + bh = affs_bread(sb, ext_key); + if (!bh) + goto err_bread; + //lock cache + +store_ext: + /* release old cached extended block and store the new one */ + affs_brelse(AFFS_I(inode)->i_ext_bh); + AFFS_I(inode)->i_ext_last = ext; + AFFS_I(inode)->i_ext_bh = bh; + get_bh(bh); + + return bh; + +err_bread: + affs_brelse(bh); + return ERR_PTR(-EIO); +} + +static int +affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *ext_bh; + u32 ext; + + pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino, + (unsigned long long)block); + + BUG_ON(block > (sector_t)0x7fffffffUL); + + if (block >= AFFS_I(inode)->i_blkcnt) { + if (block > AFFS_I(inode)->i_blkcnt || !create) + goto err_big; + } else + create = 0; + + //lock cache + affs_lock_ext(inode); + + ext = (u32)block / AFFS_SB(sb)->s_hashsize; + block -= ext * AFFS_SB(sb)->s_hashsize; + ext_bh = affs_get_extblock(inode, ext); + if (IS_ERR(ext_bh)) + goto err_ext; + map_bh(bh_result, sb, (sector_t)be32_to_cpu(AFFS_BLOCK(sb, ext_bh, block))); + + if (create) { + u32 blocknr = affs_alloc_block(inode, ext_bh->b_blocknr); + if (!blocknr) + goto err_alloc; + set_buffer_new(bh_result); + AFFS_I(inode)->mmu_private += AFFS_SB(sb)->s_data_blksize; + AFFS_I(inode)->i_blkcnt++; + + /* store new block */ + if (bh_result->b_blocknr) + affs_warning(sb, "get_block", + "block already set (%llx)", + (unsigned long long)bh_result->b_blocknr); + AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); + affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); + bh_result->b_blocknr = blocknr; + + if (!block) { + /* insert first block into header block */ + u32 tmp = be32_to_cpu(AFFS_HEAD(ext_bh)->first_data); + if (tmp) + affs_warning(sb, "get_block", "first block already set (%d)", tmp); + AFFS_HEAD(ext_bh)->first_data = cpu_to_be32(blocknr); + affs_adjust_checksum(ext_bh, blocknr - tmp); + } + } + + affs_brelse(ext_bh); + //unlock cache + affs_unlock_ext(inode); + return 0; + +err_big: + affs_error(inode->i_sb, "get_block", "strange block request %llu", + (unsigned long long)block); + return -EIO; +err_ext: + // unlock cache + affs_unlock_ext(inode); + return PTR_ERR(ext_bh); +err_alloc: + brelse(ext_bh); + clear_buffer_mapped(bh_result); + bh_result->b_bdev = NULL; + // unlock cache + affs_unlock_ext(inode); + return -ENOSPC; +} + +static int affs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, affs_get_block, wbc); +} + +static int affs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, affs_get_block); +} + +static void affs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + affs_truncate(inode); + } +} + +static ssize_t +affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (iov_iter_rw(iter) == WRITE) { + loff_t size = offset + count; + + if (AFFS_I(inode)->mmu_private < size) + return 0; + } + + ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block); + if (ret < 0 && iov_iter_rw(iter) == WRITE) + affs_write_failed(mapping, offset + count); + return ret; +} + +static int affs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + affs_get_block, + &AFFS_I(mapping->host)->mmu_private); + if (unlikely(ret)) + affs_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t _affs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,affs_get_block); +} + +const struct address_space_operations affs_aops = { + .readpage = affs_readpage, + .writepage = affs_writepage, + .write_begin = affs_write_begin, + .write_end = generic_write_end, + .direct_IO = affs_direct_IO, + .bmap = _affs_bmap +}; + +static inline struct buffer_head * +affs_bread_ino(struct inode *inode, int block, int create) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, create); + if (!err) { + bh = affs_bread(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static inline struct buffer_head * +affs_getzeroblk_ino(struct inode *inode, int block) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, 1); + if (!err) { + bh = affs_getzeroblk(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static inline struct buffer_head * +affs_getemptyblk_ino(struct inode *inode, int block) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, 1); + if (!err) { + bh = affs_getemptyblk(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static int +affs_do_readpage_ofs(struct page *page, unsigned to) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + char *data; + unsigned pos = 0; + u32 bidx, boff, bsize; + u32 tmp; + + pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, + page->index, to); + BUG_ON(to > PAGE_CACHE_SIZE); + kmap(page); + data = page_address(page); + bsize = AFFS_SB(sb)->s_data_blksize; + tmp = page->index << PAGE_CACHE_SHIFT; + bidx = tmp / bsize; + boff = tmp % bsize; + + while (pos < to) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + tmp = min(bsize - boff, to - pos); + BUG_ON(pos + tmp > to || tmp > bsize); + memcpy(data + pos, AFFS_DATA(bh) + boff, tmp); + affs_brelse(bh); + bidx++; + pos += tmp; + boff = 0; + } + flush_dcache_page(page); + kunmap(page); + return 0; +} + +static int +affs_extent_file_ofs(struct inode *inode, u32 newsize) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh, *prev_bh; + u32 bidx, boff; + u32 size, bsize; + u32 tmp; + + pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize); + bsize = AFFS_SB(sb)->s_data_blksize; + bh = NULL; + size = AFFS_I(inode)->mmu_private; + bidx = size / bsize; + boff = size % bsize; + if (boff) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + tmp = min(bsize - boff, newsize - size); + BUG_ON(boff + tmp > bsize || tmp > bsize); + memset(AFFS_DATA(bh) + boff, 0, tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + size += tmp; + bidx++; + } else if (bidx) { + bh = affs_bread_ino(inode, bidx - 1, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + } + + while (size < newsize) { + prev_bh = bh; + bh = affs_getzeroblk_ino(inode, bidx); + if (IS_ERR(bh)) + goto out; + tmp = min(bsize, newsize - size); + BUG_ON(tmp > bsize); + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + affs_fix_checksum(sb, bh); + bh->b_state &= ~(1UL << BH_New); + mark_buffer_dirty_inode(bh, inode); + if (prev_bh) { + u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + + if (tmp_next) + affs_warning(sb, "extent_file_ofs", + "next block already set for %d (%d)", + bidx, tmp_next); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next); + mark_buffer_dirty_inode(prev_bh, inode); + affs_brelse(prev_bh); + } + size += bsize; + bidx++; + } + affs_brelse(bh); + inode->i_size = AFFS_I(inode)->mmu_private = newsize; + return 0; + +out: + inode->i_size = AFFS_I(inode)->mmu_private = newsize; + return PTR_ERR(bh); +} + +static int +affs_readpage_ofs(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + u32 to; + int err; + + pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index); + to = PAGE_CACHE_SIZE; + if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) { + to = inode->i_size & ~PAGE_CACHE_MASK; + memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to); + } + + err = affs_do_readpage_ofs(page, to); + if (!err) + SetPageUptodate(page); + unlock_page(page); + return err; +} + +static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index; + int err = 0; + + pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pos + len); + if (pos > AFFS_I(inode)->mmu_private) { + /* XXX: this probably leaves a too-big i_size in case of + * failure. Should really be updating i_size at write_end time + */ + err = affs_extent_file_ofs(inode, pos); + if (err) + return err; + } + + index = pos >> PAGE_CACHE_SHIFT; + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if (PageUptodate(page)) + return 0; + + /* XXX: inefficient but safe in the face of short writes */ + err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE); + if (err) { + unlock_page(page); + page_cache_release(page); + } + return err; +} + +static int affs_write_end_ofs(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh, *prev_bh; + char *data; + u32 bidx, boff, bsize; + unsigned from, to; + u32 tmp; + int written; + + from = pos & (PAGE_CACHE_SIZE - 1); + to = pos + len; + /* + * XXX: not sure if this can handle short copies (len < copied), but + * we don't have to, because the page should always be uptodate here, + * due to write_begin. + */ + + pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pos + len); + bsize = AFFS_SB(sb)->s_data_blksize; + data = page_address(page); + + bh = NULL; + written = 0; + tmp = (page->index << PAGE_CACHE_SHIFT) + from; + bidx = tmp / bsize; + boff = tmp % bsize; + if (boff) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } + tmp = min(bsize - boff, to - from); + BUG_ON(boff + tmp > bsize || tmp > bsize); + memcpy(AFFS_DATA(bh) + boff, data + from, tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += tmp; + from += tmp; + bidx++; + } else if (bidx) { + bh = affs_bread_ino(inode, bidx - 1, 0); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } + } + while (from + bsize <= to) { + prev_bh = bh; + bh = affs_getemptyblk_ino(inode, bidx); + if (IS_ERR(bh)) + goto err_bh; + memcpy(AFFS_DATA(bh), data + from, bsize); + if (buffer_new(bh)) { + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize); + AFFS_DATA_HEAD(bh)->next = 0; + bh->b_state &= ~(1UL << BH_New); + if (prev_bh) { + u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + + if (tmp_next) + affs_warning(sb, "commit_write_ofs", + "next block already set for %d (%d)", + bidx, tmp_next); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next); + mark_buffer_dirty_inode(prev_bh, inode); + } + } + affs_brelse(prev_bh); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += bsize; + from += bsize; + bidx++; + } + if (from < to) { + prev_bh = bh; + bh = affs_bread_ino(inode, bidx, 1); + if (IS_ERR(bh)) + goto err_bh; + tmp = min(bsize, to - from); + BUG_ON(tmp > bsize); + memcpy(AFFS_DATA(bh), data + from, tmp); + if (buffer_new(bh)) { + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + AFFS_DATA_HEAD(bh)->next = 0; + bh->b_state &= ~(1UL << BH_New); + if (prev_bh) { + u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + + if (tmp_next) + affs_warning(sb, "commit_write_ofs", + "next block already set for %d (%d)", + bidx, tmp_next); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next); + mark_buffer_dirty_inode(prev_bh, inode); + } + } else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp) + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + affs_brelse(prev_bh); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += tmp; + from += tmp; + bidx++; + } + SetPageUptodate(page); + +done: + affs_brelse(bh); + tmp = (page->index << PAGE_CACHE_SHIFT) + from; + if (tmp > inode->i_size) + inode->i_size = AFFS_I(inode)->mmu_private = tmp; + +err_first_bh: + unlock_page(page); + page_cache_release(page); + + return written; + +err_bh: + bh = prev_bh; + if (!written) + written = PTR_ERR(bh); + goto done; +} + +const struct address_space_operations affs_aops_ofs = { + .readpage = affs_readpage_ofs, + //.writepage = affs_writepage_ofs, + .write_begin = affs_write_begin_ofs, + .write_end = affs_write_end_ofs +}; + +/* Free any preallocated blocks. */ + +void +affs_free_prealloc(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino); + + while (AFFS_I(inode)->i_pa_cnt) { + AFFS_I(inode)->i_pa_cnt--; + affs_free_block(sb, ++AFFS_I(inode)->i_lastalloc); + } +} + +/* Truncate (or enlarge) a file to the requested size. */ + +void +affs_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + u32 ext, ext_key; + u32 last_blk, blkcnt, blk; + u32 size; + struct buffer_head *ext_bh; + int i; + + pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n", + inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size); + + last_blk = 0; + ext = 0; + if (inode->i_size) { + last_blk = ((u32)inode->i_size - 1) / AFFS_SB(sb)->s_data_blksize; + ext = last_blk / AFFS_SB(sb)->s_hashsize; + } + + if (inode->i_size > AFFS_I(inode)->mmu_private) { + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + loff_t isize = inode->i_size; + int res; + + res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata); + if (!res) + res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata); + else + inode->i_size = AFFS_I(inode)->mmu_private; + mark_inode_dirty(inode); + return; + } else if (inode->i_size == AFFS_I(inode)->mmu_private) + return; + + // lock cache + ext_bh = affs_get_extblock(inode, ext); + if (IS_ERR(ext_bh)) { + affs_warning(sb, "truncate", + "unexpected read error for ext block %u (%ld)", + ext, PTR_ERR(ext_bh)); + return; + } + if (AFFS_I(inode)->i_lc) { + /* clear linear cache */ + i = (ext + 1) >> AFFS_I(inode)->i_lc_shift; + if (AFFS_I(inode)->i_lc_size > i) { + AFFS_I(inode)->i_lc_size = i; + for (; i < AFFS_LC_SIZE; i++) + AFFS_I(inode)->i_lc[i] = 0; + } + /* clear associative cache */ + for (i = 0; i < AFFS_AC_SIZE; i++) + if (AFFS_I(inode)->i_ac[i].ext >= ext) + AFFS_I(inode)->i_ac[i].ext = 0; + } + ext_key = be32_to_cpu(AFFS_TAIL(sb, ext_bh)->extension); + + blkcnt = AFFS_I(inode)->i_blkcnt; + i = 0; + blk = last_blk; + if (inode->i_size) { + i = last_blk % AFFS_SB(sb)->s_hashsize + 1; + blk++; + } else + AFFS_HEAD(ext_bh)->first_data = 0; + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(i); + size = AFFS_SB(sb)->s_hashsize; + if (size > blkcnt - blk + i) + size = blkcnt - blk + i; + for (; i < size; i++, blk++) { + affs_free_block(sb, be32_to_cpu(AFFS_BLOCK(sb, ext_bh, i))); + AFFS_BLOCK(sb, ext_bh, i) = 0; + } + AFFS_TAIL(sb, ext_bh)->extension = 0; + affs_fix_checksum(sb, ext_bh); + mark_buffer_dirty_inode(ext_bh, inode); + affs_brelse(ext_bh); + + if (inode->i_size) { + AFFS_I(inode)->i_blkcnt = last_blk + 1; + AFFS_I(inode)->i_extcnt = ext + 1; + if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS)) { + struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); + u32 tmp; + if (IS_ERR(bh)) { + affs_warning(sb, "truncate", + "unexpected read error for last block %u (%ld)", + ext, PTR_ERR(bh)); + return; + } + tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); + AFFS_DATA_HEAD(bh)->next = 0; + affs_adjust_checksum(bh, -tmp); + affs_brelse(bh); + } + } else { + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_extcnt = 1; + } + AFFS_I(inode)->mmu_private = inode->i_size; + // unlock cache + + while (ext_key) { + ext_bh = affs_bread(sb, ext_key); + size = AFFS_SB(sb)->s_hashsize; + if (size > blkcnt - blk) + size = blkcnt - blk; + for (i = 0; i < size; i++, blk++) + affs_free_block(sb, be32_to_cpu(AFFS_BLOCK(sb, ext_bh, i))); + affs_free_block(sb, ext_key); + ext_key = be32_to_cpu(AFFS_TAIL(sb, ext_bh)->extension); + affs_brelse(ext_bh); + } + affs_free_prealloc(inode); +} + +int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int ret, err; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + ret = write_inode_now(inode, 0); + err = sync_blockdev(inode->i_sb->s_bdev); + if (!ret) + ret = err; + mutex_unlock(&inode->i_mutex); + return ret; +} +const struct file_operations affs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .open = affs_file_open, + .release = affs_file_release, + .fsync = affs_file_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations affs_file_inode_operations = { + .setattr = affs_notify_change, +}; diff --git a/kernel/fs/affs/inode.c b/kernel/fs/affs/inode.c new file mode 100644 index 000000000..a022f4acc --- /dev/null +++ b/kernel/fs/affs/inode.c @@ -0,0 +1,416 @@ +/* + * linux/fs/affs/inode.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ +#include +#include +#include "affs.h" + +struct inode *affs_iget(struct super_block *sb, unsigned long ino) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct buffer_head *bh; + struct affs_tail *tail; + struct inode *inode; + u32 block; + u32 size; + u32 prot; + u16 id; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + pr_debug("affs_iget(%lu)\n", inode->i_ino); + + block = inode->i_ino; + bh = affs_bread(sb, block); + if (!bh) { + affs_warning(sb, "read_inode", "Cannot read block %d", block); + goto bad_inode; + } + if (affs_checksum_block(sb, bh) || be32_to_cpu(AFFS_HEAD(bh)->ptype) != T_SHORT) { + affs_warning(sb,"read_inode", + "Checksum or type (ptype=%d) error on inode %d", + AFFS_HEAD(bh)->ptype, block); + goto bad_inode; + } + + tail = AFFS_TAIL(sb, bh); + prot = be32_to_cpu(tail->protect); + + inode->i_size = 0; + set_nlink(inode, 1); + inode->i_mode = 0; + AFFS_I(inode)->i_extcnt = 1; + AFFS_I(inode)->i_ext_last = ~1; + AFFS_I(inode)->i_protect = prot; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_lc_size = 0; + AFFS_I(inode)->i_lc_shift = 0; + AFFS_I(inode)->i_lc_mask = 0; + AFFS_I(inode)->i_ac = NULL; + AFFS_I(inode)->i_ext_bh = NULL; + AFFS_I(inode)->mmu_private = 0; + AFFS_I(inode)->i_lastalloc = 0; + AFFS_I(inode)->i_pa_cnt = 0; + + if (affs_test_opt(sbi->s_flags, SF_SETMODE)) + inode->i_mode = sbi->s_mode; + else + inode->i_mode = prot_to_mode(prot); + + id = be16_to_cpu(tail->uid); + if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETUID)) + inode->i_uid = sbi->s_uid; + else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS)) + i_uid_write(inode, 0); + else + i_uid_write(inode, id); + + id = be16_to_cpu(tail->gid); + if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETGID)) + inode->i_gid = sbi->s_gid; + else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS)) + i_gid_write(inode, 0); + else + i_gid_write(inode, id); + + switch (be32_to_cpu(tail->stype)) { + case ST_ROOT: + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + /* fall through */ + case ST_USERDIR: + if (be32_to_cpu(tail->stype) == ST_USERDIR || + affs_test_opt(sbi->s_flags, SF_SETMODE)) { + if (inode->i_mode & S_IRUSR) + inode->i_mode |= S_IXUSR; + if (inode->i_mode & S_IRGRP) + inode->i_mode |= S_IXGRP; + if (inode->i_mode & S_IROTH) + inode->i_mode |= S_IXOTH; + inode->i_mode |= S_IFDIR; + } else + inode->i_mode = S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR; + /* Maybe it should be controlled by mount parameter? */ + //inode->i_mode |= S_ISVTX; + inode->i_op = &affs_dir_inode_operations; + inode->i_fop = &affs_dir_operations; + break; + case ST_LINKDIR: +#if 0 + affs_warning(sb, "read_inode", "inode is LINKDIR"); + goto bad_inode; +#else + inode->i_mode |= S_IFDIR; + /* ... and leave ->i_op and ->i_fop pointing to empty */ + break; +#endif + case ST_LINKFILE: + affs_warning(sb, "read_inode", "inode is LINKFILE"); + goto bad_inode; + case ST_FILE: + size = be32_to_cpu(tail->size); + inode->i_mode |= S_IFREG; + AFFS_I(inode)->mmu_private = inode->i_size = size; + if (inode->i_size) { + AFFS_I(inode)->i_blkcnt = (size - 1) / + sbi->s_data_blksize + 1; + AFFS_I(inode)->i_extcnt = (AFFS_I(inode)->i_blkcnt - 1) / + sbi->s_hashsize + 1; + } + if (tail->link_chain) + set_nlink(inode, 2); + inode->i_mapping->a_ops = affs_test_opt(sbi->s_flags, SF_OFS) ? + &affs_aops_ofs : &affs_aops; + inode->i_op = &affs_file_inode_operations; + inode->i_fop = &affs_file_operations; + break; + case ST_SOFTLINK: + inode->i_mode |= S_IFLNK; + inode->i_op = &affs_symlink_inode_operations; + inode->i_data.a_ops = &affs_symlink_aops; + break; + } + + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec + = (be32_to_cpu(tail->change.days) * (24 * 60 * 60) + + be32_to_cpu(tail->change.mins) * 60 + + be32_to_cpu(tail->change.ticks) / 50 + + ((8 * 365 + 2) * 24 * 60 * 60)) + + sys_tz.tz_minuteswest * 60; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0; + affs_brelse(bh); + unlock_new_inode(inode); + return inode; + +bad_inode: + affs_brelse(bh); + iget_failed(inode); + return ERR_PTR(-EIO); +} + +int +affs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + struct affs_tail *tail; + uid_t uid; + gid_t gid; + + pr_debug("write_inode(%lu)\n", inode->i_ino); + + if (!inode->i_nlink) + // possibly free block + return 0; + bh = affs_bread(sb, inode->i_ino); + if (!bh) { + affs_error(sb,"write_inode","Cannot read block %lu",inode->i_ino); + return -EIO; + } + tail = AFFS_TAIL(sb, bh); + if (tail->stype == cpu_to_be32(ST_ROOT)) { + secs_to_datestamp(inode->i_mtime.tv_sec,&AFFS_ROOT_TAIL(sb, bh)->root_change); + } else { + tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect); + tail->size = cpu_to_be32(inode->i_size); + secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change); + if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { + uid = i_uid_read(inode); + gid = i_gid_read(inode); + if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_MUFS)) { + if (uid == 0 || uid == 0xFFFF) + uid = uid ^ ~0; + if (gid == 0 || gid == 0xFFFF) + gid = gid ^ ~0; + } + if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETUID)) + tail->uid = cpu_to_be16(uid); + if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETGID)) + tail->gid = cpu_to_be16(gid); + } + } + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + affs_free_prealloc(inode); + return 0; +} + +int +affs_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error; + + pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); + + error = inode_change_ok(inode,attr); + if (error) + goto out; + + if (((attr->ia_valid & ATTR_UID) && + affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETUID)) || + ((attr->ia_valid & ATTR_GID) && + affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETGID)) || + ((attr->ia_valid & ATTR_MODE) && + (AFFS_SB(inode->i_sb)->s_flags & + (AFFS_MOUNT_SF_SETMODE | AFFS_MOUNT_SF_IMMUTABLE)))) { + if (!affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_QUIET)) + error = -EPERM; + goto out; + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + + truncate_setsize(inode, attr->ia_size); + affs_truncate(inode); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) + mode_to_prot(inode); +out: + return error; +} + +void +affs_evict_inode(struct inode *inode) +{ + unsigned long cache_page; + pr_debug("evict_inode(ino=%lu, nlink=%u)\n", + inode->i_ino, inode->i_nlink); + truncate_inode_pages_final(&inode->i_data); + + if (!inode->i_nlink) { + inode->i_size = 0; + affs_truncate(inode); + } + + invalidate_inode_buffers(inode); + clear_inode(inode); + affs_free_prealloc(inode); + cache_page = (unsigned long)AFFS_I(inode)->i_lc; + if (cache_page) { + pr_debug("freeing ext cache\n"); + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_ac = NULL; + free_page(cache_page); + } + affs_brelse(AFFS_I(inode)->i_ext_bh); + AFFS_I(inode)->i_ext_last = ~1; + AFFS_I(inode)->i_ext_bh = NULL; + + if (!inode->i_nlink) + affs_free_block(inode->i_sb, inode->i_ino); +} + +struct inode * +affs_new_inode(struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + u32 block; + struct buffer_head *bh; + + if (!(inode = new_inode(sb))) + goto err_inode; + + if (!(block = affs_alloc_block(dir, dir->i_ino))) + goto err_block; + + bh = affs_getzeroblk(sb, block); + if (!bh) + goto err_bh; + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_ino = block; + set_nlink(inode, 1); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_lc_size = 0; + AFFS_I(inode)->i_lc_shift = 0; + AFFS_I(inode)->i_lc_mask = 0; + AFFS_I(inode)->i_ac = NULL; + AFFS_I(inode)->i_ext_bh = NULL; + AFFS_I(inode)->mmu_private = 0; + AFFS_I(inode)->i_protect = 0; + AFFS_I(inode)->i_lastalloc = 0; + AFFS_I(inode)->i_pa_cnt = 0; + AFFS_I(inode)->i_extcnt = 1; + AFFS_I(inode)->i_ext_last = ~1; + + insert_inode_hash(inode); + + return inode; + +err_bh: + affs_free_block(sb, block); +err_block: + iput(inode); +err_inode: + return NULL; +} + +/* + * Add an entry to a directory. Create the header block + * and insert it into the hash table. + */ + +int +affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *inode_bh = NULL; + struct buffer_head *bh = NULL; + u32 block = 0; + int retval; + + pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__, + dir->i_ino, inode->i_ino, dentry, type); + + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + affs_lock_link(inode); + switch (type) { + case ST_LINKFILE: + case ST_LINKDIR: + retval = -ENOSPC; + block = affs_alloc_block(dir, dir->i_ino); + if (!block) + goto err; + retval = -EIO; + inode_bh = bh; + bh = affs_getzeroblk(sb, block); + if (!bh) + goto err; + break; + default: + break; + } + + AFFS_HEAD(bh)->ptype = cpu_to_be32(T_SHORT); + AFFS_HEAD(bh)->key = cpu_to_be32(bh->b_blocknr); + affs_copy_name(AFFS_TAIL(sb, bh)->name, dentry); + AFFS_TAIL(sb, bh)->stype = cpu_to_be32(type); + AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino); + + if (inode_bh) { + __be32 chain; + chain = AFFS_TAIL(sb, inode_bh)->link_chain; + AFFS_TAIL(sb, bh)->original = cpu_to_be32(inode->i_ino); + AFFS_TAIL(sb, bh)->link_chain = chain; + AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); + affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); + mark_buffer_dirty_inode(inode_bh, inode); + set_nlink(inode, 2); + ihold(inode); + } + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + dentry->d_fsdata = (void *)(long)bh->b_blocknr; + + affs_lock_dir(dir); + retval = affs_insert_hash(dir, bh); + mark_buffer_dirty_inode(bh, inode); + affs_unlock_dir(dir); + affs_unlock_link(inode); + + d_instantiate(dentry, inode); +done: + affs_brelse(inode_bh); + affs_brelse(bh); + return retval; +err: + if (block) + affs_free_block(sb, block); + affs_unlock_link(inode); + goto done; +} diff --git a/kernel/fs/affs/namei.c b/kernel/fs/affs/namei.c new file mode 100644 index 000000000..181e05b46 --- /dev/null +++ b/kernel/fs/affs/namei.c @@ -0,0 +1,462 @@ +/* + * linux/fs/affs/namei.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include "affs.h" + +typedef int (*toupper_t)(int); + +static int affs_toupper(int ch); +static int affs_hash_dentry(const struct dentry *, struct qstr *); +static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name); +static int affs_intl_toupper(int ch); +static int affs_intl_hash_dentry(const struct dentry *, struct qstr *); +static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name); + +const struct dentry_operations affs_dentry_operations = { + .d_hash = affs_hash_dentry, + .d_compare = affs_compare_dentry, +}; + +const struct dentry_operations affs_intl_dentry_operations = { + .d_hash = affs_intl_hash_dentry, + .d_compare = affs_intl_compare_dentry, +}; + + +/* Simple toupper() for DOS\1 */ + +static int +affs_toupper(int ch) +{ + return ch >= 'a' && ch <= 'z' ? ch -= ('a' - 'A') : ch; +} + +/* International toupper() for DOS\3 ("international") */ + +static int +affs_intl_toupper(int ch) +{ + return (ch >= 'a' && ch <= 'z') || (ch >= 0xE0 + && ch <= 0xFE && ch != 0xF7) ? + ch - ('a' - 'A') : ch; +} + +static inline toupper_t +affs_get_toupper(struct super_block *sb) +{ + return affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL) ? + affs_intl_toupper : affs_toupper; +} + +/* + * Note: the dentry argument is the parent dentry. + */ +static inline int +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) +{ + const u8 *name = qstr->name; + unsigned long hash; + int retval; + u32 len; + + retval = affs_check_name(qstr->name, qstr->len, notruncate); + if (retval) + return retval; + + hash = init_name_hash(); + len = min(qstr->len, AFFSNAMEMAX); + for (; len > 0; name++, len--) + hash = partial_name_hash(toupper(*name), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +static int +affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) +{ + return __affs_hash_dentry(qstr, affs_toupper, + affs_nofilenametruncate(dentry)); + +} + +static int +affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) +{ + return __affs_hash_dentry(qstr, affs_intl_toupper, + affs_nofilenametruncate(dentry)); + +} + +static inline int __affs_compare_dentry(unsigned int len, + const char *str, const struct qstr *name, toupper_t toupper, + bool notruncate) +{ + const u8 *aname = str; + const u8 *bname = name->name; + + /* + * 'str' is the name of an already existing dentry, so the name + * must be valid. 'name' must be validated first. + */ + + if (affs_check_name(name->name, name->len, notruncate)) + return 1; + + /* + * If the names are longer than the allowed 30 chars, + * the excess is ignored, so their length may differ. + */ + if (len >= AFFSNAMEMAX) { + if (name->len < AFFSNAMEMAX) + return 1; + len = AFFSNAMEMAX; + } else if (len != name->len) + return 1; + + for (; len > 0; len--) + if (toupper(*aname++) != toupper(*bname++)) + return 1; + + return 0; +} + +static int +affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + + return __affs_compare_dentry(len, str, name, affs_toupper, + affs_nofilenametruncate(parent)); +} + +static int +affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + return __affs_compare_dentry(len, str, name, affs_intl_toupper, + affs_nofilenametruncate(parent)); + +} + +/* + * NOTE! unlike strncmp, affs_match returns 1 for success, 0 for failure. + */ + +static inline int +affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) +{ + const u8 *name = dentry->d_name.name; + int len = dentry->d_name.len; + + if (len >= AFFSNAMEMAX) { + if (*name2 < AFFSNAMEMAX) + return 0; + len = AFFSNAMEMAX; + } else if (len != *name2) + return 0; + + for (name2++; len > 0; len--) + if (toupper(*name++) != toupper(*name2++)) + return 0; + return 1; +} + +int +affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len) +{ + toupper_t toupper = affs_get_toupper(sb); + u32 hash; + + hash = len = min(len, AFFSNAMEMAX); + for (; len > 0; len--) + hash = (hash * 13 + toupper(*name++)) & 0x7ff; + + return hash % AFFS_SB(sb)->s_hashsize; +} + +static struct buffer_head * +affs_find_entry(struct inode *dir, struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + toupper_t toupper = affs_get_toupper(sb); + u32 key; + + pr_debug("%s(\"%pd\")\n", __func__, dentry); + + bh = affs_bread(sb, dir->i_ino); + if (!bh) + return ERR_PTR(-EIO); + + key = be32_to_cpu(AFFS_HEAD(bh)->table[affs_hash_name(sb, dentry->d_name.name, dentry->d_name.len)]); + + for (;;) { + affs_brelse(bh); + if (key == 0) + return NULL; + bh = affs_bread(sb, key); + if (!bh) + return ERR_PTR(-EIO); + if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper)) + return bh; + key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); + } +} + +struct dentry * +affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct inode *inode = NULL; + + pr_debug("%s(\"%pd\")\n", __func__, dentry); + + affs_lock_dir(dir); + bh = affs_find_entry(dir, dentry); + affs_unlock_dir(dir); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (bh) { + u32 ino = bh->b_blocknr; + + /* store the real header ino in d_fsdata for faster lookups */ + dentry->d_fsdata = (void *)(long)ino; + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + //link to dirs disabled + //case ST_LINKDIR: + case ST_LINKFILE: + ino = be32_to_cpu(AFFS_TAIL(sb, bh)->original); + } + affs_brelse(bh); + inode = affs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +int +affs_unlink(struct inode *dir, struct dentry *dentry) +{ + pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + d_inode(dentry)->i_ino, dentry); + + return affs_remove_header(dentry); +} + +int +affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + int error; + + pr_debug("%s(%lu,\"%pd\",0%ho)\n", + __func__, dir->i_ino, dentry, mode); + + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_mode = mode; + mode_to_prot(inode); + mark_inode_dirty(inode); + + inode->i_op = &affs_file_inode_operations; + inode->i_fop = &affs_file_operations; + inode->i_mapping->a_ops = affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS) ? + &affs_aops_ofs : &affs_aops; + error = affs_add_entry(dir, inode, dentry, ST_FILE); + if (error) { + clear_nlink(inode); + iput(inode); + return error; + } + return 0; +} + +int +affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + int error; + + pr_debug("%s(%lu,\"%pd\",0%ho)\n", + __func__, dir->i_ino, dentry, mode); + + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_mode = S_IFDIR | mode; + mode_to_prot(inode); + + inode->i_op = &affs_dir_inode_operations; + inode->i_fop = &affs_dir_operations; + + error = affs_add_entry(dir, inode, dentry, ST_USERDIR); + if (error) { + clear_nlink(inode); + mark_inode_dirty(inode); + iput(inode); + return error; + } + return 0; +} + +int +affs_rmdir(struct inode *dir, struct dentry *dentry) +{ + pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + d_inode(dentry)->i_ino, dentry); + + return affs_remove_header(dentry); +} + +int +affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct inode *inode; + char *p; + int i, maxlen, error; + char c, lc; + + pr_debug("%s(%lu,\"%pd\" -> \"%s\")\n", + __func__, dir->i_ino, dentry, symname); + + maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1; + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_op = &affs_symlink_inode_operations; + inode->i_data.a_ops = &affs_symlink_aops; + inode->i_mode = S_IFLNK | 0777; + mode_to_prot(inode); + + error = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto err; + i = 0; + p = (char *)AFFS_HEAD(bh)->table; + lc = '/'; + if (*symname == '/') { + struct affs_sb_info *sbi = AFFS_SB(sb); + while (*symname == '/') + symname++; + spin_lock(&sbi->symlink_lock); + while (sbi->s_volume[i]) /* Cannot overflow */ + *p++ = sbi->s_volume[i++]; + spin_unlock(&sbi->symlink_lock); + } + while (i < maxlen && (c = *symname++)) { + if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') { + *p++ = '/'; + i++; + symname += 2; + lc = '/'; + } else if (c == '.' && lc == '/' && *symname == '/') { + symname++; + lc = '/'; + } else { + *p++ = c; + lc = c; + i++; + } + if (lc == '/') + while (*symname == '/') + symname++; + } + *p = 0; + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + mark_inode_dirty(inode); + + error = affs_add_entry(dir, inode, dentry, ST_SOFTLINK); + if (error) + goto err; + + return 0; + +err: + clear_nlink(inode); + mark_inode_dirty(inode); + iput(inode); + return error; +} + +int +affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + + pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, + dentry); + + return affs_add_entry(dir, inode, dentry, ST_LINKFILE); +} + +int +affs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + struct buffer_head *bh = NULL; + int retval; + + pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, + old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); + + retval = affs_check_name(new_dentry->d_name.name, + new_dentry->d_name.len, + affs_nofilenametruncate(old_dentry)); + + if (retval) + return retval; + + /* Unlink destination if it already exists */ + if (d_really_is_positive(new_dentry)) { + retval = affs_remove_header(new_dentry); + if (retval) + return retval; + } + + bh = affs_bread(sb, d_inode(old_dentry)->i_ino); + if (!bh) + return -EIO; + + /* Remove header from its parent directory. */ + affs_lock_dir(old_dir); + retval = affs_remove_hash(old_dir, bh); + affs_unlock_dir(old_dir); + if (retval) + goto done; + + /* And insert it into the new directory with the new name. */ + affs_copy_name(AFFS_TAIL(sb, bh)->name, new_dentry); + affs_fix_checksum(sb, bh); + affs_lock_dir(new_dir); + retval = affs_insert_hash(new_dir, bh); + affs_unlock_dir(new_dir); + /* TODO: move it back to old_dir, if error? */ + +done: + mark_buffer_dirty_inode(bh, retval ? old_dir : new_dir); + affs_brelse(bh); + return retval; +} diff --git a/kernel/fs/affs/super.c b/kernel/fs/affs/super.c new file mode 100644 index 000000000..3f89c9e05 --- /dev/null +++ b/kernel/fs/affs/super.c @@ -0,0 +1,649 @@ +/* + * linux/fs/affs/inode.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "affs.h" + +static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); +static int affs_remount (struct super_block *sb, int *flags, char *data); + +static void +affs_commit_super(struct super_block *sb, int wait) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct buffer_head *bh = sbi->s_root_bh; + struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); + + lock_buffer(bh); + secs_to_datestamp(get_seconds(), &tail->disk_change); + affs_fix_checksum(sb, bh); + unlock_buffer(bh); + + mark_buffer_dirty(bh); + if (wait) + sync_dirty_buffer(bh); +} + +static void +affs_put_super(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + pr_debug("%s()\n", __func__); + + cancel_delayed_work_sync(&sbi->sb_work); +} + +static int +affs_sync_fs(struct super_block *sb, int wait) +{ + affs_commit_super(sb, wait); + return 0; +} + +static void flush_superblock(struct work_struct *work) +{ + struct affs_sb_info *sbi; + struct super_block *sb; + + sbi = container_of(work, struct affs_sb_info, sb_work.work); + sb = sbi->sb; + + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + affs_commit_super(sb, 1); +} + +void affs_mark_sb_dirty(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->sb_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); +} + +static struct kmem_cache * affs_inode_cachep; + +static struct inode *affs_alloc_inode(struct super_block *sb) +{ + struct affs_inode_info *i; + + i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); + if (!i) + return NULL; + + i->vfs_inode.i_version = 1; + i->i_lc = NULL; + i->i_ext_bh = NULL; + i->i_pa_cnt = 0; + + return &i->vfs_inode; +} + +static void affs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); +} + +static void affs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, affs_i_callback); +} + +static void init_once(void *foo) +{ + struct affs_inode_info *ei = (struct affs_inode_info *) foo; + + sema_init(&ei->i_link_lock, 1); + sema_init(&ei->i_ext_lock, 1); + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + affs_inode_cachep = kmem_cache_create("affs_inode_cache", + sizeof(struct affs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (affs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(affs_inode_cachep); +} + +static const struct super_operations affs_sops = { + .alloc_inode = affs_alloc_inode, + .destroy_inode = affs_destroy_inode, + .write_inode = affs_write_inode, + .evict_inode = affs_evict_inode, + .put_super = affs_put_super, + .sync_fs = affs_sync_fs, + .statfs = affs_statfs, + .remount_fs = affs_remount, + .show_options = generic_show_options, +}; + +enum { + Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, + Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, + Opt_verbose, Opt_volume, Opt_ignore, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_bs, "bs=%u"}, + {Opt_mode, "mode=%o"}, + {Opt_mufs, "mufs"}, + {Opt_notruncate, "nofilenametruncate"}, + {Opt_prefix, "prefix=%s"}, + {Opt_protect, "protect"}, + {Opt_reserved, "reserved=%u"}, + {Opt_root, "root=%u"}, + {Opt_setgid, "setgid=%u"}, + {Opt_setuid, "setuid=%u"}, + {Opt_verbose, "verbose"}, + {Opt_volume, "volume=%s"}, + {Opt_ignore, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, + {Opt_err, NULL}, +}; + +static int +parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root, + int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + /* Fill in defaults */ + + *uid = current_uid(); + *gid = current_gid(); + *reserved = 2; + *root = -1; + *blocksize = -1; + volume[0] = ':'; + volume[1] = 0; + *mount_opts = 0; + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token, n, option; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bs: + if (match_int(&args[0], &n)) + return 0; + if (n != 512 && n != 1024 && n != 2048 + && n != 4096) { + pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); + return 0; + } + *blocksize = n; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return 0; + *mode = option & 0777; + affs_set_opt(*mount_opts, SF_SETMODE); + break; + case Opt_mufs: + affs_set_opt(*mount_opts, SF_MUFS); + break; + case Opt_notruncate: + affs_set_opt(*mount_opts, SF_NO_TRUNCATE); + break; + case Opt_prefix: + *prefix = match_strdup(&args[0]); + if (!*prefix) + return 0; + affs_set_opt(*mount_opts, SF_PREFIX); + break; + case Opt_protect: + affs_set_opt(*mount_opts, SF_IMMUTABLE); + break; + case Opt_reserved: + if (match_int(&args[0], reserved)) + return 0; + break; + case Opt_root: + if (match_int(&args[0], root)) + return 0; + break; + case Opt_setgid: + if (match_int(&args[0], &option)) + return 0; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 0; + affs_set_opt(*mount_opts, SF_SETGID); + break; + case Opt_setuid: + if (match_int(&args[0], &option)) + return 0; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 0; + affs_set_opt(*mount_opts, SF_SETUID); + break; + case Opt_verbose: + affs_set_opt(*mount_opts, SF_VERBOSE); + break; + case Opt_volume: { + char *vol = match_strdup(&args[0]); + if (!vol) + return 0; + strlcpy(volume, vol, 32); + kfree(vol); + break; + } + case Opt_ignore: + /* Silently ignore the quota options */ + break; + default: + pr_warn("Unrecognized mount option \"%s\" or missing value\n", + p); + return 0; + } + } + return 1; +} + +/* This function definitely needs to be split up. Some fine day I'll + * hopefully have the guts to do so. Until then: sorry for the mess. + */ + +static int affs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct affs_sb_info *sbi; + struct buffer_head *root_bh = NULL; + struct buffer_head *boot_bh; + struct inode *root_inode = NULL; + s32 root_block; + int size, blocksize; + u32 chksum; + int num_bm; + int i, j; + kuid_t uid; + kgid_t gid; + int reserved; + unsigned long mount_flags; + int tmp_flags; /* fix remount prototype... */ + u8 sig[4]; + int ret; + + save_mount_options(sb, data); + + pr_debug("read_super(%s)\n", data ? (const char *)data : "no options"); + + sb->s_magic = AFFS_SUPER_MAGIC; + sb->s_op = &affs_sops; + sb->s_flags |= MS_NODIRATIME; + + sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + sbi->sb = sb; + mutex_init(&sbi->s_bmlock); + spin_lock_init(&sbi->symlink_lock); + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock); + + if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, + &blocksize,&sbi->s_prefix, + sbi->s_volume, &mount_flags)) { + pr_err("Error parsing options\n"); + return -EINVAL; + } + /* N.B. after this point s_prefix must be released */ + + sbi->s_flags = mount_flags; + sbi->s_mode = i; + sbi->s_uid = uid; + sbi->s_gid = gid; + sbi->s_reserved= reserved; + + /* Get the size of the device in 512-byte blocks. + * If we later see that the partition uses bigger + * blocks, we will have to change it. + */ + + size = sb->s_bdev->bd_inode->i_size >> 9; + pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); + + affs_set_blocksize(sb, PAGE_SIZE); + /* Try to find root block. Its location depends on the block size. */ + + i = 512; + j = 4096; + if (blocksize > 0) { + i = j = blocksize; + size = size / (blocksize / 512); + } + for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) { + sbi->s_root_block = root_block; + if (root_block < 0) + sbi->s_root_block = (reserved + size - 1) / 2; + pr_debug("setting blocksize to %d\n", blocksize); + affs_set_blocksize(sb, blocksize); + sbi->s_partition_size = size; + + /* The root block location that was calculated above is not + * correct if the partition size is an odd number of 512- + * byte blocks, which will be rounded down to a number of + * 1024-byte blocks, and if there were an even number of + * reserved blocks. Ideally, all partition checkers should + * report the real number of blocks of the real blocksize, + * but since this just cannot be done, we have to try to + * find the root block anyways. In the above case, it is one + * block behind the calculated one. So we check this one, too. + */ + for (num_bm = 0; num_bm < 2; num_bm++) { + pr_debug("Dev %s, trying root=%u, bs=%d, " + "size=%d, reserved=%d\n", + sb->s_id, + sbi->s_root_block + num_bm, + blocksize, size, reserved); + root_bh = affs_bread(sb, sbi->s_root_block + num_bm); + if (!root_bh) + continue; + if (!affs_checksum_block(sb, root_bh) && + be32_to_cpu(AFFS_ROOT_HEAD(root_bh)->ptype) == T_SHORT && + be32_to_cpu(AFFS_ROOT_TAIL(sb, root_bh)->stype) == ST_ROOT) { + sbi->s_hashsize = blocksize / 4 - 56; + sbi->s_root_block += num_bm; + goto got_root; + } + affs_brelse(root_bh); + root_bh = NULL; + } + } + if (!silent) + pr_err("No valid root block on device %s\n", sb->s_id); + return -EINVAL; + + /* N.B. after this point bh must be released */ +got_root: + /* Keep super block in cache */ + sbi->s_root_bh = root_bh; + root_block = sbi->s_root_block; + + /* Find out which kind of FS we have */ + boot_bh = sb_bread(sb, 0); + if (!boot_bh) { + pr_err("Cannot read boot block\n"); + return -EINVAL; + } + memcpy(sig, boot_bh->b_data, 4); + brelse(boot_bh); + chksum = be32_to_cpu(*(__be32 *)sig); + + /* Dircache filesystems are compatible with non-dircache ones + * when reading. As long as they aren't supported, writing is + * not recommended. + */ + if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS + || chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) { + pr_notice("Dircache FS - mounting %s read only\n", sb->s_id); + sb->s_flags |= MS_RDONLY; + } + switch (chksum) { + case MUFS_FS: + case MUFS_INTLFFS: + case MUFS_DCFFS: + affs_set_opt(sbi->s_flags, SF_MUFS); + /* fall thru */ + case FS_INTLFFS: + case FS_DCFFS: + affs_set_opt(sbi->s_flags, SF_INTL); + break; + case MUFS_FFS: + affs_set_opt(sbi->s_flags, SF_MUFS); + break; + case FS_FFS: + break; + case MUFS_OFS: + affs_set_opt(sbi->s_flags, SF_MUFS); + /* fall thru */ + case FS_OFS: + affs_set_opt(sbi->s_flags, SF_OFS); + sb->s_flags |= MS_NOEXEC; + break; + case MUFS_DCOFS: + case MUFS_INTLOFS: + affs_set_opt(sbi->s_flags, SF_MUFS); + case FS_DCOFS: + case FS_INTLOFS: + affs_set_opt(sbi->s_flags, SF_INTL); + affs_set_opt(sbi->s_flags, SF_OFS); + sb->s_flags |= MS_NOEXEC; + break; + default: + pr_err("Unknown filesystem on device %s: %08X\n", + sb->s_id, chksum); + return -EINVAL; + } + + if (affs_test_opt(mount_flags, SF_VERBOSE)) { + u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; + pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", + len > 31 ? 31 : len, + AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1, + sig, sig[3] + '0', blocksize); + } + + sb->s_flags |= MS_NODEV | MS_NOSUID; + + sbi->s_data_blksize = sb->s_blocksize; + if (affs_test_opt(sbi->s_flags, SF_OFS)) + sbi->s_data_blksize -= 24; + + tmp_flags = sb->s_flags; + ret = affs_init_bitmap(sb, &tmp_flags); + if (ret) + return ret; + sb->s_flags = tmp_flags; + + /* set up enough so that it can read an inode */ + + root_inode = affs_iget(sb, root_block); + if (IS_ERR(root_inode)) + return PTR_ERR(root_inode); + + if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL)) + sb->s_d_op = &affs_intl_dentry_operations; + else + sb->s_d_op = &affs_dentry_operations; + + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) { + pr_err("AFFS: Get root inode failed\n"); + return -ENOMEM; + } + + pr_debug("s_flags=%lX\n", sb->s_flags); + return 0; +} + +static int +affs_remount(struct super_block *sb, int *flags, char *data) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + int blocksize; + kuid_t uid; + kgid_t gid; + int mode; + int reserved; + int root_block; + unsigned long mount_flags; + int res = 0; + char *new_opts; + char volume[32]; + char *prefix = NULL; + + new_opts = kstrdup(data, GFP_KERNEL); + if (!new_opts) + return -ENOMEM; + + pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); + + sync_filesystem(sb); + *flags |= MS_NODIRATIME; + + memcpy(volume, sbi->s_volume, 32); + if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, + &blocksize, &prefix, volume, + &mount_flags)) { + kfree(prefix); + kfree(new_opts); + return -EINVAL; + } + + flush_delayed_work(&sbi->sb_work); + replace_mount_options(sb, new_opts); + + sbi->s_flags = mount_flags; + sbi->s_mode = mode; + sbi->s_uid = uid; + sbi->s_gid = gid; + /* protect against readers */ + spin_lock(&sbi->symlink_lock); + if (prefix) { + kfree(sbi->s_prefix); + sbi->s_prefix = prefix; + } + memcpy(sbi->s_volume, volume, 32); + spin_unlock(&sbi->symlink_lock); + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) + affs_free_bitmap(sb); + else + res = affs_init_bitmap(sb, flags); + + return res; +} + +static int +affs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + int free; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + pr_debug("%s() partsize=%d, reserved=%d\n", + __func__, AFFS_SB(sb)->s_partition_size, + AFFS_SB(sb)->s_reserved); + + free = affs_count_free_blocks(sb); + buf->f_type = AFFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved; + buf->f_bfree = free; + buf->f_bavail = free; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = AFFSNAMEMAX; + return 0; +} + +static struct dentry *affs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); +} + +static void affs_kill_sb(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + kill_block_super(sb); + if (sbi) { + affs_free_bitmap(sb); + affs_brelse(sbi->s_root_bh); + kfree(sbi->s_prefix); + mutex_destroy(&sbi->s_bmlock); + kfree(sbi); + } +} + +static struct file_system_type affs_fs_type = { + .owner = THIS_MODULE, + .name = "affs", + .mount = affs_mount, + .kill_sb = affs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("affs"); + +static int __init init_affs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&affs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_affs_fs(void) +{ + unregister_filesystem(&affs_fs_type); + destroy_inodecache(); +} + +MODULE_DESCRIPTION("Amiga filesystem support for Linux"); +MODULE_LICENSE("GPL"); + +module_init(init_affs_fs) +module_exit(exit_affs_fs) diff --git a/kernel/fs/affs/symlink.c b/kernel/fs/affs/symlink.c new file mode 100644 index 000000000..f39b71c39 --- /dev/null +++ b/kernel/fs/affs/symlink.c @@ -0,0 +1,81 @@ +/* + * linux/fs/affs/symlink.c + * + * 1995 Hans-Joachim Widmaier - Modified for affs. + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * affs symlink handling code + */ + +#include "affs.h" + +static int affs_symlink_readpage(struct file *file, struct page *page) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + char *link = kmap(page); + struct slink_front *lf; + int err; + int i, j; + char c; + char lc; + + pr_debug("follow_link(ino=%lu)\n", inode->i_ino); + + err = -EIO; + bh = affs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto fail; + i = 0; + j = 0; + lf = (struct slink_front *)bh->b_data; + lc = 0; + + if (strchr(lf->symname,':')) { /* Handle assign or volume name */ + struct affs_sb_info *sbi = AFFS_SB(inode->i_sb); + char *pf; + spin_lock(&sbi->symlink_lock); + pf = sbi->s_prefix ? sbi->s_prefix : "/"; + while (i < 1023 && (c = pf[i])) + link[i++] = c; + spin_unlock(&sbi->symlink_lock); + while (i < 1023 && lf->symname[j] != ':') + link[i++] = lf->symname[j++]; + if (i < 1023) + link[i++] = '/'; + j++; + lc = '/'; + } + while (i < 1023 && (c = lf->symname[j])) { + if (c == '/' && lc == '/' && i < 1020) { /* parent dir */ + link[i++] = '.'; + link[i++] = '.'; + } + link[i++] = c; + lc = c; + j++; + } + link[i] = '\0'; + affs_brelse(bh); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations affs_symlink_aops = { + .readpage = affs_symlink_readpage, +}; + +const struct inode_operations affs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = affs_notify_change, +}; diff --git a/kernel/fs/afs/Kconfig b/kernel/fs/afs/Kconfig new file mode 100644 index 000000000..ebba3b18e --- /dev/null +++ b/kernel/fs/afs/Kconfig @@ -0,0 +1,29 @@ +config AFS_FS + tristate "Andrew File System support (AFS)" + depends on INET + select AF_RXRPC + select DNS_RESOLVER + help + If you say Y here, you will get an experimental Andrew File System + driver. It currently only supports unsecured read-only AFS access. + + See for more information. + + If unsure, say N. + +config AFS_DEBUG + bool "AFS dynamic debugging" + depends on AFS_FS + help + Say Y here to make runtime controllable debugging messages appear. + + See for more information. + + If unsure, say N. + +config AFS_FSCACHE + bool "Provide AFS client caching support" + depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y + help + Say Y here if you want AFS data to be cached locally on disk through + the generic filesystem cache manager diff --git a/kernel/fs/afs/Makefile b/kernel/fs/afs/Makefile new file mode 100644 index 000000000..4f64b95d5 --- /dev/null +++ b/kernel/fs/afs/Makefile @@ -0,0 +1,32 @@ +# +# Makefile for Red Hat Linux AFS client. +# + +afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o + +kafs-objs := \ + $(afs-cache-y) \ + callback.o \ + cell.o \ + cmservice.o \ + dir.o \ + file.o \ + flock.o \ + fsclient.o \ + inode.o \ + main.o \ + misc.o \ + mntpt.o \ + proc.o \ + rxrpc.o \ + security.o \ + server.o \ + super.o \ + netdevices.o \ + vlclient.o \ + vlocation.o \ + vnode.o \ + volume.o \ + write.o + +obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/kernel/fs/afs/afs.h b/kernel/fs/afs/afs.h new file mode 100644 index 000000000..3c462ff6d --- /dev/null +++ b/kernel/fs/afs/afs.h @@ -0,0 +1,170 @@ +/* AFS common types + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_H +#define AFS_H + +#include + +#define AFS_MAXCELLNAME 64 /* maximum length of a cell name */ +#define AFS_MAXVOLNAME 64 /* maximum length of a volume name */ +#define AFSNAMEMAX 256 /* maximum length of a filename plus NUL */ +#define AFSPATHMAX 1024 /* maximum length of a pathname plus NUL */ +#define AFSOPAQUEMAX 1024 /* maximum length of an opaque field */ + +typedef unsigned afs_volid_t; +typedef unsigned afs_vnodeid_t; +typedef unsigned long long afs_dataversion_t; + +typedef enum { + AFSVL_RWVOL, /* read/write volume */ + AFSVL_ROVOL, /* read-only volume */ + AFSVL_BACKVOL, /* backup volume */ +} __attribute__((packed)) afs_voltype_t; + +typedef enum { + AFS_FTYPE_INVALID = 0, + AFS_FTYPE_FILE = 1, + AFS_FTYPE_DIR = 2, + AFS_FTYPE_SYMLINK = 3, +} afs_file_type_t; + +typedef enum { + AFS_LOCK_READ = 0, /* read lock request */ + AFS_LOCK_WRITE = 1, /* write lock request */ +} afs_lock_type_t; + +#define AFS_LOCKWAIT (5 * 60) /* time until a lock times out (seconds) */ + +/* + * AFS file identifier + */ +struct afs_fid { + afs_volid_t vid; /* volume ID */ + afs_vnodeid_t vnode; /* file index within volume */ + unsigned unique; /* unique ID number (file index version) */ +}; + +/* + * AFS callback notification + */ +typedef enum { + AFSCM_CB_UNTYPED = 0, /* no type set on CB break */ + AFSCM_CB_EXCLUSIVE = 1, /* CB exclusive to CM [not implemented] */ + AFSCM_CB_SHARED = 2, /* CB shared by other CM's */ + AFSCM_CB_DROPPED = 3, /* CB promise cancelled by file server */ +} afs_callback_type_t; + +struct afs_callback { + struct afs_fid fid; /* file identifier */ + unsigned version; /* callback version */ + unsigned expiry; /* time at which expires */ + afs_callback_type_t type; /* type of callback */ +}; + +#define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ + +/* + * AFS volume information + */ +struct afs_volume_info { + afs_volid_t vid; /* volume ID */ + afs_voltype_t type; /* type of this volume */ + afs_volid_t type_vids[5]; /* volume ID's for possible types for this vol */ + + /* list of fileservers serving this volume */ + size_t nservers; /* number of entries used in servers[] */ + struct { + struct in_addr addr; /* fileserver address */ + } servers[8]; +}; + +/* + * AFS security ACE access mask + */ +typedef u32 afs_access_t; +#define AFS_ACE_READ 0x00000001U /* - permission to read a file/dir */ +#define AFS_ACE_WRITE 0x00000002U /* - permission to write/chmod a file */ +#define AFS_ACE_INSERT 0x00000004U /* - permission to create dirent in a dir */ +#define AFS_ACE_LOOKUP 0x00000008U /* - permission to lookup a file/dir in a dir */ +#define AFS_ACE_DELETE 0x00000010U /* - permission to delete a dirent from a dir */ +#define AFS_ACE_LOCK 0x00000020U /* - permission to lock a file */ +#define AFS_ACE_ADMINISTER 0x00000040U /* - permission to change ACL */ +#define AFS_ACE_USER_A 0x01000000U /* - 'A' user-defined permission */ +#define AFS_ACE_USER_B 0x02000000U /* - 'B' user-defined permission */ +#define AFS_ACE_USER_C 0x04000000U /* - 'C' user-defined permission */ +#define AFS_ACE_USER_D 0x08000000U /* - 'D' user-defined permission */ +#define AFS_ACE_USER_E 0x10000000U /* - 'E' user-defined permission */ +#define AFS_ACE_USER_F 0x20000000U /* - 'F' user-defined permission */ +#define AFS_ACE_USER_G 0x40000000U /* - 'G' user-defined permission */ +#define AFS_ACE_USER_H 0x80000000U /* - 'H' user-defined permission */ + +/* + * AFS file status information + */ +struct afs_file_status { + unsigned if_version; /* interface version */ +#define AFS_FSTATUS_VERSION 1 + + afs_file_type_t type; /* file type */ + unsigned nlink; /* link count */ + u64 size; /* file size */ + afs_dataversion_t data_version; /* current data version */ + u32 author; /* author ID */ + kuid_t owner; /* owner ID */ + kgid_t group; /* group ID */ + afs_access_t caller_access; /* access rights for authenticated caller */ + afs_access_t anon_access; /* access rights for unauthenticated caller */ + umode_t mode; /* UNIX mode */ + struct afs_fid parent; /* parent dir ID for non-dirs only */ + time_t mtime_client; /* last time client changed data */ + time_t mtime_server; /* last time server changed data */ + s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ +}; + +/* + * AFS file status change request + */ + +#define AFS_SET_MTIME 0x01 /* set the mtime */ +#define AFS_SET_OWNER 0x02 /* set the owner ID */ +#define AFS_SET_GROUP 0x04 /* set the group ID (unsupported?) */ +#define AFS_SET_MODE 0x08 /* set the UNIX mode */ +#define AFS_SET_SEG_SIZE 0x10 /* set the segment size (unsupported) */ + +/* + * AFS volume synchronisation information + */ +struct afs_volsync { + time_t creation; /* volume creation time */ +}; + +/* + * AFS volume status record + */ +struct afs_volume_status { + u32 vid; /* volume ID */ + u32 parent_id; /* parent volume ID */ + u8 online; /* true if volume currently online and available */ + u8 in_service; /* true if volume currently in service */ + u8 blessed; /* same as in_service */ + u8 needs_salvage; /* true if consistency checking required */ + u32 type; /* volume type (afs_voltype_t) */ + u32 min_quota; /* minimum space set aside (blocks) */ + u32 max_quota; /* maximum space this volume may occupy (blocks) */ + u32 blocks_in_use; /* space this volume currently occupies (blocks) */ + u32 part_blocks_avail; /* space available in volume's partition */ + u32 part_max_blocks; /* size of volume's partition */ +}; + +#define AFS_BLOCK_SIZE 1024 + +#endif /* AFS_H */ diff --git a/kernel/fs/afs/afs_cm.h b/kernel/fs/afs/afs_cm.h new file mode 100644 index 000000000..255f5dd60 --- /dev/null +++ b/kernel/fs/afs/afs_cm.h @@ -0,0 +1,33 @@ +/* AFS Cache Manager definitions + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_CM_H +#define AFS_CM_H + +#define AFS_CM_PORT 7001 /* AFS file server port */ +#define CM_SERVICE 1 /* AFS File Service ID */ + +enum AFS_CM_Operations { + CBCallBack = 204, /* break callback promises */ + CBInitCallBackState = 205, /* initialise callback state */ + CBProbe = 206, /* probe client */ + CBGetLock = 207, /* get contents of CM lock table */ + CBGetCE = 208, /* get cache file description */ + CBGetXStatsVersion = 209, /* get version of extended statistics */ + CBGetXStats = 210, /* get contents of extended statistics data */ + CBInitCallBackState3 = 213, /* initialise callback state, version 3 */ + CBProbeUuid = 214, /* check the client hasn't rebooted */ + CBTellMeAboutYourself = 65538, /* get client capabilities */ +}; + +#define AFS_CAP_ERROR_TRANSLATION 0x1 + +#endif /* AFS_FS_H */ diff --git a/kernel/fs/afs/afs_fs.h b/kernel/fs/afs/afs_fs.h new file mode 100644 index 000000000..eb647323d --- /dev/null +++ b/kernel/fs/afs/afs_fs.h @@ -0,0 +1,56 @@ +/* AFS File Service definitions + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_FS_H +#define AFS_FS_H + +#define AFS_FS_PORT 7000 /* AFS file server port */ +#define FS_SERVICE 1 /* AFS File Service ID */ + +enum AFS_FS_Operations { + FSFETCHDATA = 130, /* AFS Fetch file data */ + FSFETCHSTATUS = 132, /* AFS Fetch file status */ + FSSTOREDATA = 133, /* AFS Store file data */ + FSSTORESTATUS = 135, /* AFS Store file status */ + FSREMOVEFILE = 136, /* AFS Remove a file */ + FSCREATEFILE = 137, /* AFS Create a file */ + FSRENAME = 138, /* AFS Rename or move a file or directory */ + FSSYMLINK = 139, /* AFS Create a symbolic link */ + FSLINK = 140, /* AFS Create a hard link */ + FSMAKEDIR = 141, /* AFS Create a directory */ + FSREMOVEDIR = 142, /* AFS Remove a directory */ + FSGIVEUPCALLBACKS = 147, /* AFS Discard callback promises */ + FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ + FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ + FSGETROOTVOLUME = 151, /* AFS Get root volume name */ + FSSETLOCK = 156, /* AFS Request a file lock */ + FSEXTENDLOCK = 157, /* AFS Extend a file lock */ + FSRELEASELOCK = 158, /* AFS Release a file lock */ + FSLOOKUP = 161, /* AFS lookup file in directory */ + FSFETCHDATA64 = 65537, /* AFS Fetch file data */ + FSSTOREDATA64 = 65538, /* AFS Store file data */ +}; + +enum AFS_FS_Errors { + VSALVAGE = 101, /* volume needs salvaging */ + VNOVNODE = 102, /* no such file/dir (vnode) */ + VNOVOL = 103, /* no such volume or volume unavailable */ + VVOLEXISTS = 104, /* volume name already exists */ + VNOSERVICE = 105, /* volume not currently in service */ + VOFFLINE = 106, /* volume is currently offline (more info available [VVL-spec]) */ + VONLINE = 107, /* volume is already online */ + VDISKFULL = 108, /* disk partition is full */ + VOVERQUOTA = 109, /* volume's maximum quota exceeded */ + VBUSY = 110, /* volume is temporarily unavailable */ + VMOVED = 111, /* volume moved to new server - ask this FS where */ +}; + +#endif /* AFS_FS_H */ diff --git a/kernel/fs/afs/afs_vl.h b/kernel/fs/afs/afs_vl.h new file mode 100644 index 000000000..800f607ff --- /dev/null +++ b/kernel/fs/afs/afs_vl.h @@ -0,0 +1,84 @@ +/* AFS Volume Location Service client interface + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_VL_H +#define AFS_VL_H + +#include "afs.h" + +#define AFS_VL_PORT 7003 /* volume location service port */ +#define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ + +enum AFSVL_Operations { + VLGETENTRYBYID = 503, /* AFS Get Cache Entry By ID operation ID */ + VLGETENTRYBYNAME = 504, /* AFS Get Cache Entry By Name operation ID */ + VLPROBE = 514, /* AFS Probe Volume Location Service operation ID */ +}; + +enum AFSVL_Errors { + AFSVL_IDEXIST = 363520, /* Volume Id entry exists in vl database */ + AFSVL_IO = 363521, /* I/O related error */ + AFSVL_NAMEEXIST = 363522, /* Volume name entry exists in vl database */ + AFSVL_CREATEFAIL = 363523, /* Internal creation failure */ + AFSVL_NOENT = 363524, /* No such entry */ + AFSVL_EMPTY = 363525, /* Vl database is empty */ + AFSVL_ENTDELETED = 363526, /* Entry is deleted (soft delete) */ + AFSVL_BADNAME = 363527, /* Volume name is illegal */ + AFSVL_BADINDEX = 363528, /* Index is out of range */ + AFSVL_BADVOLTYPE = 363529, /* Bad volume type */ + AFSVL_BADSERVER = 363530, /* Illegal server number (out of range) */ + AFSVL_BADPARTITION = 363531, /* Bad partition number */ + AFSVL_REPSFULL = 363532, /* Run out of space for Replication sites */ + AFSVL_NOREPSERVER = 363533, /* No such Replication server site exists */ + AFSVL_DUPREPSERVER = 363534, /* Replication site already exists */ + AFSVL_RWNOTFOUND = 363535, /* Parent R/W entry not found */ + AFSVL_BADREFCOUNT = 363536, /* Illegal Reference Count number */ + AFSVL_SIZEEXCEEDED = 363537, /* Vl size for attributes exceeded */ + AFSVL_BADENTRY = 363538, /* Bad incoming vl entry */ + AFSVL_BADVOLIDBUMP = 363539, /* Illegal max volid increment */ + AFSVL_IDALREADYHASHED = 363540, /* RO/BACK id already hashed */ + AFSVL_ENTRYLOCKED = 363541, /* Vl entry is already locked */ + AFSVL_BADVOLOPER = 363542, /* Bad volume operation code */ + AFSVL_BADRELLOCKTYPE = 363543, /* Bad release lock type */ + AFSVL_RERELEASE = 363544, /* Status report: last release was aborted */ + AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server flag */ + AFSVL_PERM = 363546, /* No permission access */ + AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */ +}; + +/* + * maps to "struct vldbentry" in vvl-spec.pdf + */ +struct afs_vldbentry { + char name[65]; /* name of volume (with NUL char) */ + afs_voltype_t type; /* volume type */ + unsigned num_servers; /* num servers that hold instances of this vol */ + unsigned clone_id; /* cloning ID */ + + unsigned flags; +#define AFS_VLF_RWEXISTS 0x1000 /* R/W volume exists */ +#define AFS_VLF_ROEXISTS 0x2000 /* R/O volume exists */ +#define AFS_VLF_BACKEXISTS 0x4000 /* backup volume exists */ + + afs_volid_t volume_ids[3]; /* volume IDs */ + + struct { + struct in_addr addr; /* server address */ + unsigned partition; /* partition ID on this server */ + unsigned flags; /* server specific flags */ +#define AFS_VLSF_NEWREPSITE 0x0001 /* unused */ +#define AFS_VLSF_ROVOL 0x0002 /* this server holds a R/O instance of the volume */ +#define AFS_VLSF_RWVOL 0x0004 /* this server holds a R/W instance of the volume */ +#define AFS_VLSF_BACKVOL 0x0008 /* this server holds a backup instance of the volume */ + } servers[8]; +}; + +#endif /* AFS_VL_H */ diff --git a/kernel/fs/afs/cache.c b/kernel/fs/afs/cache.c new file mode 100644 index 000000000..577763c3d --- /dev/null +++ b/kernel/fs/afs/cache.c @@ -0,0 +1,402 @@ +/* AFS caching stuff + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include "internal.h" + +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); + +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vlocation_cache_check_aux( + void *cookie_netfs_data, const void *buffer, uint16_t buflen); + +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); + +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size); +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); + +struct fscache_netfs afs_cache_netfs = { + .name = "afs", + .version = 0, +}; + +struct fscache_cookie_def afs_cell_cache_index_def = { + .name = "AFS.cell", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_cell_cache_get_key, + .get_aux = afs_cell_cache_get_aux, + .check_aux = afs_cell_cache_check_aux, +}; + +struct fscache_cookie_def afs_vlocation_cache_index_def = { + .name = "AFS.vldb", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_vlocation_cache_get_key, + .get_aux = afs_vlocation_cache_get_aux, + .check_aux = afs_vlocation_cache_check_aux, +}; + +struct fscache_cookie_def afs_volume_cache_index_def = { + .name = "AFS.volume", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_volume_cache_get_key, +}; + +struct fscache_cookie_def afs_vnode_cache_index_def = { + .name = "AFS.vnode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = afs_vnode_cache_get_key, + .get_attr = afs_vnode_cache_get_attr, + .get_aux = afs_vnode_cache_get_aux, + .check_aux = afs_vnode_cache_check_aux, + .now_uncached = afs_vnode_cache_now_uncached, +}; + +/* + * set the key for the index entry + */ +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_cell *cell = cookie_netfs_data; + uint16_t klen; + + _enter("%p,%p,%u", cell, buffer, bufmax); + + klen = strlen(cell->name); + if (klen > bufmax) + return 0; + + memcpy(buffer, cell->name, klen); + return klen; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_cell *cell = cookie_netfs_data; + uint16_t dlen; + + _enter("%p,%p,%u", cell, buffer, bufmax); + + dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]); + dlen = min(dlen, bufmax); + dlen &= ~(sizeof(cell->vl_addrs[0]) - 1); + + memcpy(buffer, cell->vl_addrs, dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} + +/*****************************************************************************/ +/* + * set the key for the index entry + */ +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t klen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name)); + if (klen > bufmax) + return 0; + + memcpy(buffer, vlocation->vldb.name, klen); + + _leave(" = %u", klen); + return klen; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen > bufmax) + return 0; + + memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static +enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct afs_cache_vlocation *cvldb; + struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen != buflen) + return FSCACHE_CHECKAUX_OBSOLETE; + + cvldb = container_of(buffer, struct afs_cache_vlocation, nservers); + + /* if what's on disk is more valid than what's in memory, then use the + * VL record from the cache */ + if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) { + memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen); + vlocation->valid = 1; + _leave(" = SUCCESS [c->m]"); + return FSCACHE_CHECKAUX_OKAY; + } + + /* need to update the cache if the cached info differs */ + if (memcmp(&vlocation->vldb, buffer, dlen) != 0) { + /* delete if the volume IDs for this name differ */ + if (memcmp(&vlocation->vldb.vid, &cvldb->vid, + sizeof(cvldb->vid)) != 0 + ) { + _leave(" = OBSOLETE"); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = UPDATE"); + return FSCACHE_CHECKAUX_NEEDS_UPDATE; + } + + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} + +/*****************************************************************************/ +/* + * set the key for the volume index entry + */ +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_volume *volume = cookie_netfs_data; + uint16_t klen; + + _enter("{%u},%p,%u", volume->type, buffer, bufmax); + + klen = sizeof(volume->type); + if (klen > bufmax) + return 0; + + memcpy(buffer, &volume->type, sizeof(volume->type)); + + _leave(" = %u", klen); + return klen; + +} + +/*****************************************************************************/ +/* + * set the key for the index entry + */ +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t klen; + + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); + + klen = sizeof(vnode->fid.vnode); + if (klen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode)); + + _leave(" = %u", klen); + return klen; +} + +/* + * provide updated file attributes + */ +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + + _enter("{%x,%x,%llx},", + vnode->fid.vnode, vnode->fid.unique, + vnode->status.data_version); + + *size = vnode->status.size; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%Lx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); + + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique)); + buffer += sizeof(vnode->fid.unique); + memcpy(buffer, &vnode->status.data_version, + sizeof(vnode->status.data_version)); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen != buflen) { + _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + if (memcmp(buffer, + &vnode->fid.unique, + sizeof(vnode->fid.unique) + ) != 0) { + unsigned unique; + + memcpy(&unique, buffer, sizeof(unique)); + + _leave(" = OBSOLETE [uniq %x != %x]", + unique, vnode->fid.unique); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + if (memcmp(buffer + sizeof(vnode->fid.unique), + &vnode->status.data_version, + sizeof(vnode->status.data_version) + ) != 0) { + afs_dataversion_t version; + + memcpy(&version, buffer + sizeof(vnode->fid.unique), + sizeof(version)); + + _leave(" = OBSOLETE [vers %llx != %llx]", + version, vnode->status.data_version); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = SUCCESS"); + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * indication the cookie is no longer uncached + * - this function is called when the backing store currently caching a cookie + * is removed + * - the netfs should use this to clean up any markers indicating cached pages + * - this is mandatory for any object that may have data + */ +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) +{ + struct afs_vnode *vnode = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + _enter("{%x,%x,%Lx}", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + /* grab a bunch of pages to clean */ + nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } + + _leave(""); +} diff --git a/kernel/fs/afs/callback.c b/kernel/fs/afs/callback.c new file mode 100644 index 000000000..7ef637d7f --- /dev/null +++ b/kernel/fs/afs/callback.c @@ -0,0 +1,475 @@ +/* + * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: David Woodhouse + * David Howells + * + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +#if 0 +unsigned afs_vnode_update_timeout = 10; +#endif /* 0 */ + +#define afs_breakring_space(server) \ + CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \ + ARRAY_SIZE((server)->cb_break)) + +//static void afs_callback_updater(struct work_struct *); + +static struct workqueue_struct *afs_callback_update_worker; + +/* + * allow the fileserver to request callback state (re-)initialisation + */ +void afs_init_callback_state(struct afs_server *server) +{ + struct afs_vnode *vnode; + + _enter("{%p}", server); + + spin_lock(&server->cb_lock); + + /* kill all the promises on record from this server */ + while (!RB_EMPTY_ROOT(&server->cb_promises)) { + vnode = rb_entry(server->cb_promises.rb_node, + struct afs_vnode, cb_promise); + _debug("UNPROMISE { vid=%x:%u uq=%u}", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * handle the data invalidation side of a callback being broken + */ +void afs_broken_callback_work(struct work_struct *work) +{ + struct afs_vnode *vnode = + container_of(work, struct afs_vnode, cb_broken_work); + + _enter(""); + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + return; + + /* we're only interested in dealing with a broken callback on *this* + * vnode and only if no-one else has dealt with it yet */ + if (!mutex_trylock(&vnode->validate_lock)) + return; /* someone else is dealing with it */ + + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + if (S_ISDIR(vnode->vfs_inode.i_mode)) + afs_clear_permits(vnode); + + if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0) + goto out; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto out; + + /* if the vnode's data version number changed then its contents + * are different */ + if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + afs_zap_data(vnode); + } + +out: + mutex_unlock(&vnode->validate_lock); + + /* avoid the potential race whereby the mutex_trylock() in this + * function happens again between the clear_bit() and the + * mutex_unlock() */ + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + _debug("requeue"); + queue_work(afs_callback_update_worker, &vnode->cb_broken_work); + } + _leave(""); +} + +/* + * actually break a callback + */ +static void afs_break_callback(struct afs_server *server, + struct afs_vnode *vnode) +{ + _enter(""); + + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + + if (vnode->cb_promised) { + spin_lock(&vnode->lock); + + _debug("break callback"); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + + queue_work(afs_callback_update_worker, &vnode->cb_broken_work); + if (list_empty(&vnode->granted_locks) && + !list_empty(&vnode->pending_locks)) + afs_lock_may_be_available(vnode); + spin_unlock(&vnode->lock); + } +} + +/* + * allow the fileserver to explicitly break one callback + * - happens when + * - the backing file is changed + * - a lock is released + */ +static void afs_break_one_callback(struct afs_server *server, + struct afs_fid *fid) +{ + struct afs_vnode *vnode; + struct rb_node *p; + + _debug("find"); + spin_lock(&server->fs_lock); + p = server->fs_vnodes.rb_node; + while (p) { + vnode = rb_entry(p, struct afs_vnode, server_rb); + if (fid->vid < vnode->fid.vid) + p = p->rb_left; + else if (fid->vid > vnode->fid.vid) + p = p->rb_right; + else if (fid->vnode < vnode->fid.vnode) + p = p->rb_left; + else if (fid->vnode > vnode->fid.vnode) + p = p->rb_right; + else if (fid->unique < vnode->fid.unique) + p = p->rb_left; + else if (fid->unique > vnode->fid.unique) + p = p->rb_right; + else + goto found; + } + + /* not found so we just ignore it (it may have moved to another + * server) */ +not_available: + _debug("not avail"); + spin_unlock(&server->fs_lock); + _leave(""); + return; + +found: + _debug("found"); + ASSERTCMP(server, ==, vnode->server); + + if (!igrab(AFS_VNODE_TO_I(vnode))) + goto not_available; + spin_unlock(&server->fs_lock); + + afs_break_callback(server, vnode); + iput(&vnode->vfs_inode); + _leave(""); +} + +/* + * allow the fileserver to break callback promises + */ +void afs_break_callbacks(struct afs_server *server, size_t count, + struct afs_callback callbacks[]) +{ + _enter("%p,%zu,", server, count); + + ASSERT(server != NULL); + ASSERTCMP(count, <=, AFSCBMAX); + + for (; count > 0; callbacks++, count--) { + _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", + callbacks->fid.vid, + callbacks->fid.vnode, + callbacks->fid.unique, + callbacks->version, + callbacks->expiry, + callbacks->type + ); + afs_break_one_callback(server, &callbacks->fid); + } + + _leave(""); + return; +} + +/* + * record the callback for breaking + * - the caller must hold server->cb_lock + */ +static void afs_do_give_up_callback(struct afs_server *server, + struct afs_vnode *vnode) +{ + struct afs_callback *cb; + + _enter("%p,%p", server, vnode); + + cb = &server->cb_break[server->cb_break_head]; + cb->fid = vnode->fid; + cb->version = vnode->cb_version; + cb->expiry = vnode->cb_expiry; + cb->type = vnode->cb_type; + smp_wmb(); + server->cb_break_head = + (server->cb_break_head + 1) & + (ARRAY_SIZE(server->cb_break) - 1); + + /* defer the breaking of callbacks to try and collect as many as + * possible to ship in one operation */ + switch (atomic_inc_return(&server->cb_break_n)) { + case 1 ... AFSCBMAX - 1: + queue_delayed_work(afs_callback_update_worker, + &server->cb_break_work, HZ * 2); + break; + case AFSCBMAX: + afs_flush_callback_breaks(server); + break; + default: + break; + } + + ASSERT(server->cb_promises.rb_node != NULL); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + _leave(""); +} + +/* + * discard the callback on a deleted item + */ +void afs_discard_callback_on_delete(struct afs_vnode *vnode) +{ + struct afs_server *server = vnode->server; + + _enter("%d", vnode->cb_promised); + + if (!vnode->cb_promised) { + _leave(" [not promised]"); + return; + } + + ASSERT(server != NULL); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + ASSERT(server->cb_promises.rb_node != NULL); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * give up the callback registered for a vnode on the file server when the + * inode is being cleared + */ +void afs_give_up_callback(struct afs_vnode *vnode) +{ + struct afs_server *server = vnode->server; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%d", vnode->cb_promised); + + _debug("GIVE UP INODE %p", &vnode->vfs_inode); + + if (!vnode->cb_promised) { + _leave(" [not promised]"); + return; + } + + ASSERT(server != NULL); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised && afs_breakring_space(server) == 0) { + add_wait_queue(&server->cb_break_waitq, &myself); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!vnode->cb_promised || + afs_breakring_space(server) != 0) + break; + spin_unlock(&server->cb_lock); + schedule(); + spin_lock(&server->cb_lock); + } + remove_wait_queue(&server->cb_break_waitq, &myself); + __set_current_state(TASK_RUNNING); + } + + /* of course, it's always possible for the server to break this vnode's + * callback first... */ + if (vnode->cb_promised) + afs_do_give_up_callback(server, vnode); + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * dispatch a deferred give up callbacks operation + */ +void afs_dispatch_give_up_callbacks(struct work_struct *work) +{ + struct afs_server *server = + container_of(work, struct afs_server, cb_break_work.work); + + _enter(""); + + /* tell the fileserver to discard the callback promises it has + * - in the event of ENOMEM or some other error, we just forget that we + * had callbacks entirely, and the server will call us later to break + * them + */ + afs_fs_give_up_callbacks(server, &afs_async_call); +} + +/* + * flush the outstanding callback breaks on a server + */ +void afs_flush_callback_breaks(struct afs_server *server) +{ + mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0); +} + +#if 0 +/* + * update a bunch of callbacks + */ +static void afs_callback_updater(struct work_struct *work) +{ + struct afs_server *server; + struct afs_vnode *vnode, *xvnode; + time_t now; + long timeout; + int ret; + + server = container_of(work, struct afs_server, updater); + + _enter(""); + + now = get_seconds(); + + /* find the first vnode to update */ + spin_lock(&server->cb_lock); + for (;;) { + if (RB_EMPTY_ROOT(&server->cb_promises)) { + spin_unlock(&server->cb_lock); + _leave(" [nothing]"); + return; + } + + vnode = rb_entry(rb_first(&server->cb_promises), + struct afs_vnode, cb_promise); + if (atomic_read(&vnode->usage) > 0) + break; + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + + timeout = vnode->update_at - now; + if (timeout > 0) { + queue_delayed_work(afs_vnode_update_worker, + &afs_vnode_update, timeout * HZ); + spin_unlock(&server->cb_lock); + _leave(" [nothing]"); + return; + } + + list_del_init(&vnode->update); + atomic_inc(&vnode->usage); + spin_unlock(&server->cb_lock); + + /* we can now perform the update */ + _debug("update %s", vnode->vldb.name); + vnode->state = AFS_VL_UPDATING; + vnode->upd_rej_cnt = 0; + vnode->upd_busy_cnt = 0; + + ret = afs_vnode_update_record(vl, &vldb); + switch (ret) { + case 0: + afs_vnode_apply_update(vl, &vldb); + vnode->state = AFS_VL_UPDATING; + break; + case -ENOMEDIUM: + vnode->state = AFS_VL_VOLUME_DELETED; + break; + default: + vnode->state = AFS_VL_UNCERTAIN; + break; + } + + /* and then reschedule */ + _debug("reschedule"); + vnode->update_at = get_seconds() + afs_vnode_update_timeout; + + spin_lock(&server->cb_lock); + + if (!list_empty(&server->cb_promises)) { + /* next update in 10 minutes, but wait at least 1 second more + * than the newest record already queued so that we don't spam + * the VL server suddenly with lots of requests + */ + xvnode = list_entry(server->cb_promises.prev, + struct afs_vnode, update); + if (vnode->update_at <= xvnode->update_at) + vnode->update_at = xvnode->update_at + 1; + xvnode = list_entry(server->cb_promises.next, + struct afs_vnode, update); + timeout = xvnode->update_at - now; + if (timeout < 0) + timeout = 0; + } else { + timeout = afs_vnode_update_timeout; + } + + list_add_tail(&vnode->update, &server->cb_promises); + + _debug("timeout %ld", timeout); + queue_delayed_work(afs_vnode_update_worker, + &afs_vnode_update, timeout * HZ); + spin_unlock(&server->cb_lock); + afs_put_vnode(vl); +} +#endif + +/* + * initialise the callback update process + */ +int __init afs_callback_update_init(void) +{ + afs_callback_update_worker = + create_singlethread_workqueue("kafs_callbackd"); + return afs_callback_update_worker ? 0 : -ENOMEM; +} + +/* + * shut down the callback update process + */ +void afs_callback_update_kill(void) +{ + destroy_workqueue(afs_callback_update_worker); +} diff --git a/kernel/fs/afs/cell.c b/kernel/fs/afs/cell.c new file mode 100644 index 000000000..ca0a3cf93 --- /dev/null +++ b/kernel/fs/afs/cell.c @@ -0,0 +1,460 @@ +/* AFS cell and server record management + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +DECLARE_RWSEM(afs_proc_cells_sem); +LIST_HEAD(afs_proc_cells); + +static LIST_HEAD(afs_cells); +static DEFINE_RWLOCK(afs_cells_lock); +static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ +static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); +static struct afs_cell *afs_cell_root; + +/* + * allocate a cell record and fill in its name, VL server address list and + * allocate an anonymous key + */ +static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen, + char *vllist) +{ + struct afs_cell *cell; + struct key *key; + char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; + char *dvllist = NULL, *_vllist = NULL; + char delimiter = ':'; + int ret; + + _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist); + + BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ + + if (namelen > AFS_MAXCELLNAME) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + /* allocate and initialise a cell record */ + cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); + if (!cell) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + memcpy(cell->name, name, namelen); + cell->name[namelen] = 0; + + atomic_set(&cell->usage, 1); + INIT_LIST_HEAD(&cell->link); + rwlock_init(&cell->servers_lock); + INIT_LIST_HEAD(&cell->servers); + init_rwsem(&cell->vl_sem); + INIT_LIST_HEAD(&cell->vl_list); + spin_lock_init(&cell->vl_lock); + + /* if the ip address is invalid, try dns query */ + if (!vllist || strlen(vllist) < 7) { + ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); + if (ret < 0) { + if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY) + /* translate these errors into something + * userspace might understand */ + ret = -EDESTADDRREQ; + _leave(" = %d", ret); + return ERR_PTR(ret); + } + _vllist = dvllist; + + /* change the delimiter for user-space reply */ + delimiter = ','; + + } else { + _vllist = vllist; + } + + /* fill in the VL server list from the rest of the string */ + do { + unsigned a, b, c, d; + + next = strchr(_vllist, delimiter); + if (next) + *next++ = 0; + + if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) + goto bad_address; + + if (a > 255 || b > 255 || c > 255 || d > 255) + goto bad_address; + + cell->vl_addrs[cell->vl_naddrs++].s_addr = + htonl((a << 24) | (b << 16) | (c << 8) | d); + + } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); + + /* create a key to represent an anonymous user */ + memcpy(keyname, "afs@", 4); + dp = keyname + 4; + cp = cell->name; + do { + *dp++ = toupper(*cp); + } while (*cp++); + + key = rxrpc_get_null_key(keyname); + if (IS_ERR(key)) { + _debug("no key"); + ret = PTR_ERR(key); + goto error; + } + cell->anonymous_key = key; + + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + + _leave(" = %p", cell); + return cell; + +bad_address: + printk(KERN_ERR "kAFS: bad VL server IP address\n"); + ret = -EINVAL; +error: + key_put(cell->anonymous_key); + kfree(dvllist); + kfree(cell); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * afs_cell_crate() - create a cell record + * @name: is the name of the cell. + * @namsesz: is the strlen of the cell name. + * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format. + * @retref: is T to return the cell reference when the cell exists. + */ +struct afs_cell *afs_cell_create(const char *name, unsigned namesz, + char *vllist, bool retref) +{ + struct afs_cell *cell; + int ret; + + _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist); + + down_write(&afs_cells_sem); + read_lock(&afs_cells_lock); + list_for_each_entry(cell, &afs_cells, link) { + if (strncasecmp(cell->name, name, namesz) == 0) + goto duplicate_name; + } + read_unlock(&afs_cells_lock); + + cell = afs_cell_alloc(name, namesz, vllist); + if (IS_ERR(cell)) { + _leave(" = %ld", PTR_ERR(cell)); + up_write(&afs_cells_sem); + return cell; + } + + /* add a proc directory for this cell */ + ret = afs_proc_cell_setup(cell); + if (ret < 0) + goto error; + +#ifdef CONFIG_AFS_FSCACHE + /* put it up for caching (this never returns an error) */ + cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell, true); +#endif + + /* add to the cell lists */ + write_lock(&afs_cells_lock); + list_add_tail(&cell->link, &afs_cells); + write_unlock(&afs_cells_lock); + + down_write(&afs_proc_cells_sem); + list_add_tail(&cell->proc_link, &afs_proc_cells); + up_write(&afs_proc_cells_sem); + up_write(&afs_cells_sem); + + _leave(" = %p", cell); + return cell; + +error: + up_write(&afs_cells_sem); + key_put(cell->anonymous_key); + kfree(cell); + _leave(" = %d", ret); + return ERR_PTR(ret); + +duplicate_name: + if (retref && !IS_ERR(cell)) + afs_get_cell(cell); + + read_unlock(&afs_cells_lock); + up_write(&afs_cells_sem); + + if (retref) { + _leave(" = %p", cell); + return cell; + } + + _leave(" = -EEXIST"); + return ERR_PTR(-EEXIST); +} + +/* + * set the root cell information + * - can be called with a module parameter string + * - can be called from a write to /proc/fs/afs/rootcell + */ +int afs_cell_init(char *rootcell) +{ + struct afs_cell *old_root, *new_root; + char *cp; + + _enter(""); + + if (!rootcell) { + /* module is loaded with no parameters, or built statically. + * - in the future we might initialize cell DB here. + */ + _leave(" = 0 [no root]"); + return 0; + } + + cp = strchr(rootcell, ':'); + if (!cp) + _debug("kAFS: no VL server IP addresses specified"); + else + *cp++ = 0; + + /* allocate a cell record for the root cell */ + new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false); + if (IS_ERR(new_root)) { + _leave(" = %ld", PTR_ERR(new_root)); + return PTR_ERR(new_root); + } + + /* install the new cell */ + write_lock(&afs_cells_lock); + old_root = afs_cell_root; + afs_cell_root = new_root; + write_unlock(&afs_cells_lock); + afs_put_cell(old_root); + + _leave(" = 0"); + return 0; +} + +/* + * lookup a cell record + */ +struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz, + bool dns_cell) +{ + struct afs_cell *cell; + + _enter("\"%*.*s\",", namesz, namesz, name ?: ""); + + down_read(&afs_cells_sem); + read_lock(&afs_cells_lock); + + if (name) { + /* if the cell was named, look for it in the cell record list */ + list_for_each_entry(cell, &afs_cells, link) { + if (strncmp(cell->name, name, namesz) == 0) { + afs_get_cell(cell); + goto found; + } + } + cell = ERR_PTR(-ENOENT); + if (dns_cell) + goto create_cell; + found: + ; + } else { + cell = afs_cell_root; + if (!cell) { + /* this should not happen unless user tries to mount + * when root cell is not set. Return an impossibly + * bizarre errno to alert the user. Things like + * ENOENT might be "more appropriate" but they happen + * for other reasons. + */ + cell = ERR_PTR(-EDESTADDRREQ); + } else { + afs_get_cell(cell); + } + + } + + read_unlock(&afs_cells_lock); + up_read(&afs_cells_sem); + _leave(" = %p", cell); + return cell; + +create_cell: + read_unlock(&afs_cells_lock); + up_read(&afs_cells_sem); + + cell = afs_cell_create(name, namesz, NULL, true); + + _leave(" = %p", cell); + return cell; +} + +#if 0 +/* + * try and get a cell record + */ +struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell) +{ + write_lock(&afs_cells_lock); + + if (cell && !list_empty(&cell->link)) + afs_get_cell(cell); + else + cell = NULL; + + write_unlock(&afs_cells_lock); + return cell; +} +#endif /* 0 */ + +/* + * destroy a cell record + */ +void afs_put_cell(struct afs_cell *cell) +{ + if (!cell) + return; + + _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + + ASSERTCMP(atomic_read(&cell->usage), >, 0); + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + write_lock(&afs_cells_lock); + + if (likely(!atomic_dec_and_test(&cell->usage))) { + write_unlock(&afs_cells_lock); + _leave(""); + return; + } + + ASSERT(list_empty(&cell->servers)); + ASSERT(list_empty(&cell->vl_list)); + + write_unlock(&afs_cells_lock); + + wake_up(&afs_cells_freeable_wq); + + _leave(" [unused]"); +} + +/* + * destroy a cell record + * - must be called with the afs_cells_sem write-locked + * - cell->link should have been broken by the caller + */ +static void afs_cell_destroy(struct afs_cell *cell) +{ + _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + + ASSERTCMP(atomic_read(&cell->usage), >=, 0); + ASSERT(list_empty(&cell->link)); + + /* wait for everyone to stop using the cell */ + if (atomic_read(&cell->usage) > 0) { + DECLARE_WAITQUEUE(myself, current); + + _debug("wait for cell %s", cell->name); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&afs_cells_freeable_wq, &myself); + + while (atomic_read(&cell->usage) > 0) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&afs_cells_freeable_wq, &myself); + set_current_state(TASK_RUNNING); + } + + _debug("cell dead"); + ASSERTCMP(atomic_read(&cell->usage), ==, 0); + ASSERT(list_empty(&cell->servers)); + ASSERT(list_empty(&cell->vl_list)); + + afs_proc_cell_remove(cell); + + down_write(&afs_proc_cells_sem); + list_del_init(&cell->proc_link); + up_write(&afs_proc_cells_sem); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(cell->cache, 0); +#endif + key_put(cell->anonymous_key); + kfree(cell); + + _leave(" [destroyed]"); +} + +/* + * purge in-memory cell database on module unload or afs_init() failure + * - the timeout daemon is stopped before calling this + */ +void afs_cell_purge(void) +{ + struct afs_cell *cell; + + _enter(""); + + afs_put_cell(afs_cell_root); + + down_write(&afs_cells_sem); + + while (!list_empty(&afs_cells)) { + cell = NULL; + + /* remove the next cell from the front of the list */ + write_lock(&afs_cells_lock); + + if (!list_empty(&afs_cells)) { + cell = list_entry(afs_cells.next, + struct afs_cell, link); + list_del_init(&cell->link); + } + + write_unlock(&afs_cells_lock); + + if (cell) { + _debug("PURGING CELL %s (%d)", + cell->name, atomic_read(&cell->usage)); + + /* now the cell should be left with no references */ + afs_cell_destroy(cell); + } + } + + up_write(&afs_cells_sem); + _leave(""); +} diff --git a/kernel/fs/afs/cmservice.c b/kernel/fs/afs/cmservice.c new file mode 100644 index 000000000..4b0eff6da --- /dev/null +++ b/kernel/fs/afs/cmservice.c @@ -0,0 +1,604 @@ +/* AFS Cache Manager Service + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "internal.h" +#include "afs_cm.h" + +#if 0 +struct workqueue_struct *afs_cm_workqueue; +#endif /* 0 */ + +static int afs_deliver_cb_init_call_back_state(struct afs_call *, + struct sk_buff *, bool); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *, + struct sk_buff *, bool); +static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, + struct sk_buff *, bool); +static void afs_cm_destructor(struct afs_call *); + +/* + * CB.CallBack operation type + */ +static const struct afs_call_type afs_SRXCBCallBack = { + .name = "CB.CallBack", + .deliver = afs_deliver_cb_callback, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.InitCallBackState operation type + */ +static const struct afs_call_type afs_SRXCBInitCallBackState = { + .name = "CB.InitCallBackState", + .deliver = afs_deliver_cb_init_call_back_state, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.InitCallBackState3 operation type + */ +static const struct afs_call_type afs_SRXCBInitCallBackState3 = { + .name = "CB.InitCallBackState3", + .deliver = afs_deliver_cb_init_call_back_state3, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.Probe operation type + */ +static const struct afs_call_type afs_SRXCBProbe = { + .name = "CB.Probe", + .deliver = afs_deliver_cb_probe, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.ProbeUuid operation type + */ +static const struct afs_call_type afs_SRXCBProbeUuid = { + .name = "CB.ProbeUuid", + .deliver = afs_deliver_cb_probe_uuid, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.TellMeAboutYourself operation type + */ +static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { + .name = "CB.TellMeAboutYourself", + .deliver = afs_deliver_cb_tell_me_about_yourself, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * route an incoming cache manager call + * - return T if supported, F if not + */ +bool afs_cm_incoming_call(struct afs_call *call) +{ + u32 operation_id = ntohl(call->operation_ID); + + _enter("{CB.OP %u}", operation_id); + + switch (operation_id) { + case CBCallBack: + call->type = &afs_SRXCBCallBack; + return true; + case CBInitCallBackState: + call->type = &afs_SRXCBInitCallBackState; + return true; + case CBInitCallBackState3: + call->type = &afs_SRXCBInitCallBackState3; + return true; + case CBProbe: + call->type = &afs_SRXCBProbe; + return true; + case CBTellMeAboutYourself: + call->type = &afs_SRXCBTellMeAboutYourself; + return true; + default: + return false; + } +} + +/* + * clean up a cache manager call + */ +static void afs_cm_destructor(struct afs_call *call) +{ + _enter(""); + + /* Break the callbacks here so that we do it after the final ACK is + * received. The step number here must match the final number in + * afs_deliver_cb_callback(). + */ + if (call->unmarshall == 6) { + ASSERT(call->server && call->count && call->request); + afs_break_callbacks(call->server, call->count, call->request); + } + + afs_put_server(call->server); + call->server = NULL; + kfree(call->buffer); + call->buffer = NULL; +} + +/* + * allow the fileserver to see if the cache manager is still alive + */ +static void SRXAFSCB_CallBack(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + _enter(""); + + /* be sure to send the reply *before* attempting to spam the AFS server + * with FSFetchStatus requests on the vnodes with broken callbacks lest + * the AFS server get into a vicious cycle of trying to break further + * callbacks because it hadn't received completion of the CBCallBack op + * yet */ + afs_send_empty_reply(call); + + afs_break_callbacks(call->server, call->count, call->request); + _leave(""); +} + +/* + * deliver request data to a CB.CallBack call + */ +static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + struct afs_callback *cb; + struct afs_server *server; + struct in_addr addr; + __be32 *bp; + u32 tmp; + int ret, loop; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* extract the FID array and its count in two steps */ + case 1: + _debug("extract FID count"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("FID count: %u", call->count); + if (call->count > AFSCBMAX) + return -EBADMSG; + + call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->offset = 0; + call->unmarshall++; + + case 2: + _debug("extract FID array"); + ret = afs_extract_data(call, skb, last, call->buffer, + call->count * 3 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall FID array"); + call->request = kcalloc(call->count, + sizeof(struct afs_callback), + GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->fid.vid = ntohl(*bp++); + cb->fid.vnode = ntohl(*bp++); + cb->fid.unique = ntohl(*bp++); + cb->type = AFSCM_CB_UNTYPED; + } + + call->offset = 0; + call->unmarshall++; + + /* extract the callback array and its count in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != call->count && tmp != 0) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + if (tmp == 0) + goto empty_cb_array; + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, skb, last, call->request, + call->count * 3 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall CB array"); + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->version = ntohl(*bp++); + cb->expiry = ntohl(*bp++); + cb->type = ntohl(*bp++); + } + + empty_cb_array: + call->offset = 0; + call->unmarshall++; + + case 5: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + + /* Record that the message was unmarshalled successfully so + * that the call destructor can know do the callback breaking + * work, even if the final ACK isn't received. + * + * If the step number changes, then afs_cm_destructor() must be + * updated also. + */ + call->unmarshall++; + case 6: + break; + } + + if (!last) + return 0; + + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_CallBack); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to request callback state (re-)initialisation + */ +static void SRXAFSCB_InitCallBackState(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + _enter("{%p}", call->server); + + afs_init_callback_state(call->server); + afs_send_empty_reply(call); + _leave(""); +} + +/* + * deliver request data to a CB.InitCallBackState call + */ +static int afs_deliver_cb_init_call_back_state(struct afs_call *call, + struct sk_buff *skb, + bool last) +{ + struct afs_server *server; + struct in_addr addr; + + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * deliver request data to a CB.InitCallBackState3 call + */ +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, + struct sk_buff *skb, + bool last) +{ + struct afs_server *server; + struct in_addr addr; + + _enter(",{%u},%d", skb->len, last); + + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to see if the cache manager is still alive + */ +static void SRXAFSCB_Probe(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + _enter(""); + afs_send_empty_reply(call); + _leave(""); +} + +/* + * deliver request data to a CB.Probe call + */ +static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_Probe); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to quickly find out if the fileserver has been rebooted + */ +static void SRXAFSCB_ProbeUuid(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + struct afs_uuid *r = call->request; + + struct { + __be32 match; + } reply; + + _enter(""); + + + if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) + reply.match = htonl(0); + else + reply.match = htonl(1); + + afs_send_simple_reply(call, &reply, sizeof(reply)); + _leave(""); +} + +/* + * deliver request data to a CB.ProbeUuid call + */ +static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; + + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, skb, last, call->buffer, + 11 * sizeof(__be32)); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to ask about the cache manager's capabilities + */ +static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) +{ + struct afs_interface *ifs; + struct afs_call *call = container_of(work, struct afs_call, work); + int loop, nifs; + + struct { + struct /* InterfaceAddr */ { + __be32 nifs; + __be32 uuid[11]; + __be32 ifaddr[32]; + __be32 netmask[32]; + __be32 mtu[32]; + } ia; + struct /* Capabilities */ { + __be32 capcount; + __be32 caps[1]; + } cap; + } reply; + + _enter(""); + + nifs = 0; + ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL); + if (ifs) { + nifs = afs_get_ipv4_interfaces(ifs, 32, false); + if (nifs < 0) { + kfree(ifs); + ifs = NULL; + nifs = 0; + } + } + + memset(&reply, 0, sizeof(reply)); + reply.ia.nifs = htonl(nifs); + + reply.ia.uuid[0] = htonl(afs_uuid.time_low); + reply.ia.uuid[1] = htonl(afs_uuid.time_mid); + reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version); + reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved); + reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low); + for (loop = 0; loop < 6; loop++) + reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]); + + if (ifs) { + for (loop = 0; loop < nifs; loop++) { + reply.ia.ifaddr[loop] = ifs[loop].address.s_addr; + reply.ia.netmask[loop] = ifs[loop].netmask.s_addr; + reply.ia.mtu[loop] = htonl(ifs[loop].mtu); + } + kfree(ifs); + } + + reply.cap.capcount = htonl(1); + reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION); + afs_send_simple_reply(call, &reply, sizeof(reply)); + + _leave(""); +} + +/* + * deliver request data to a CB.TellMeAboutYourself call + */ +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); + queue_work(afs_wq, &call->work); + return 0; +} diff --git a/kernel/fs/afs/dir.c b/kernel/fs/afs/dir.c new file mode 100644 index 000000000..e10e17788 --- /dev/null +++ b/kernel/fs/afs/dir.c @@ -0,0 +1,1123 @@ +/* dir.c: AFS filesystem directory handling + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +static int afs_dir_open(struct inode *inode, struct file *file); +static int afs_readdir(struct file *file, struct dir_context *ctx); +static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); +static int afs_d_delete(const struct dentry *dentry); +static void afs_d_release(struct dentry *dentry); +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype); +static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl); +static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); +static int afs_rmdir(struct inode *dir, struct dentry *dentry); +static int afs_unlink(struct inode *dir, struct dentry *dentry); +static int afs_link(struct dentry *from, struct inode *dir, + struct dentry *dentry); +static int afs_symlink(struct inode *dir, struct dentry *dentry, + const char *content); +static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); + +const struct file_operations afs_dir_file_operations = { + .open = afs_dir_open, + .release = afs_release, + .iterate = afs_readdir, + .lock = afs_lock, + .llseek = generic_file_llseek, +}; + +const struct inode_operations afs_dir_inode_operations = { + .create = afs_create, + .lookup = afs_lookup, + .link = afs_link, + .unlink = afs_unlink, + .symlink = afs_symlink, + .mkdir = afs_mkdir, + .rmdir = afs_rmdir, + .rename = afs_rename, + .permission = afs_permission, + .getattr = afs_getattr, + .setattr = afs_setattr, +}; + +const struct dentry_operations afs_fs_dentry_operations = { + .d_revalidate = afs_d_revalidate, + .d_delete = afs_d_delete, + .d_release = afs_d_release, + .d_automount = afs_d_automount, +}; + +#define AFS_DIR_HASHTBL_SIZE 128 +#define AFS_DIR_DIRENT_SIZE 32 +#define AFS_DIRENT_PER_BLOCK 64 + +union afs_dirent { + struct { + uint8_t valid; + uint8_t unused[1]; + __be16 hash_next; + __be32 vnode; + __be32 unique; + uint8_t name[16]; + uint8_t overflow[4]; /* if any char of the name (inc + * NUL) reaches here, consume + * the next dirent too */ + } u; + uint8_t extended_name[32]; +}; + +/* AFS directory page header (one at the beginning of every 2048-byte chunk) */ +struct afs_dir_pagehdr { + __be16 npages; + __be16 magic; +#define AFS_DIR_MAGIC htons(1234) + uint8_t nentries; + uint8_t bitmap[8]; + uint8_t pad[19]; +}; + +/* directory block layout */ +union afs_dir_block { + + struct afs_dir_pagehdr pagehdr; + + struct { + struct afs_dir_pagehdr pagehdr; + uint8_t alloc_ctrs[128]; + /* dir hash table */ + uint16_t hashtable[AFS_DIR_HASHTBL_SIZE]; + } hdr; + + union afs_dirent dirents[AFS_DIRENT_PER_BLOCK]; +}; + +/* layout on a linux VM page */ +struct afs_dir_page { + union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; +}; + +struct afs_lookup_cookie { + struct dir_context ctx; + struct afs_fid fid; + struct qstr name; + int found; +}; + +/* + * check that a directory page is valid + */ +static inline void afs_dir_check_page(struct inode *dir, struct page *page) +{ + struct afs_dir_page *dbuf; + loff_t latter; + int tmp, qty; + +#if 0 + /* check the page count */ + qty = desc.size / sizeof(dbuf->blocks[0]); + if (qty == 0) + goto error; + + if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { + printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", + __func__, dir->i_ino, qty, + ntohs(dbuf->blocks[0].pagehdr.npages)); + goto error; + } +#endif + + /* determine how many magic numbers there should be in this page */ + latter = dir->i_size - page_offset(page); + if (latter >= PAGE_SIZE) + qty = PAGE_SIZE; + else + qty = latter; + qty /= sizeof(union afs_dir_block); + + /* check them */ + dbuf = page_address(page); + for (tmp = 0; tmp < qty; tmp++) { + if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { + printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n", + __func__, dir->i_ino, tmp, qty, + ntohs(dbuf->blocks[tmp].pagehdr.magic)); + goto error; + } + } + + SetPageChecked(page); + return; + +error: + SetPageChecked(page); + SetPageError(page); +} + +/* + * discard a page cached in the pagecache + */ +static inline void afs_dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* + * get a page into the pagecache + */ +static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, + struct key *key) +{ + struct page *page; + _enter("{%lu},%lu", dir->i_ino, index); + + page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + afs_dir_check_page(dir, page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + afs_dir_put_page(page); + _leave(" = -EIO"); + return ERR_PTR(-EIO); +} + +/* + * open an AFS directory file + */ +static int afs_dir_open(struct inode *inode, struct file *file) +{ + _enter("{%lu}", inode->i_ino); + + BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) + return -ENOENT; + + return afs_open(inode, file); +} + +/* + * deal with one block in an AFS directory + */ +static int afs_dir_iterate_block(struct dir_context *ctx, + union afs_dir_block *block, + unsigned blkoff) +{ + union afs_dirent *dire; + unsigned offset, next, curr; + size_t nlen; + int tmp; + + _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); + + curr = (ctx->pos - blkoff) / sizeof(union afs_dirent); + + /* walk through the block, an entry at a time */ + for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; + offset < AFS_DIRENT_PER_BLOCK; + offset = next + ) { + next = offset + 1; + + /* skip entries marked unused in the bitmap */ + if (!(block->pagehdr.bitmap[offset / 8] & + (1 << (offset % 8)))) { + _debug("ENT[%Zu.%u]: unused", + blkoff / sizeof(union afs_dir_block), offset); + if (offset >= curr) + ctx->pos = blkoff + + next * sizeof(union afs_dirent); + continue; + } + + /* got a valid entry */ + dire = &block->dirents[offset]; + nlen = strnlen(dire->u.name, + sizeof(*block) - + offset * sizeof(union afs_dirent)); + + _debug("ENT[%Zu.%u]: %s %Zu \"%s\"", + blkoff / sizeof(union afs_dir_block), offset, + (offset < curr ? "skip" : "fill"), + nlen, dire->u.name); + + /* work out where the next possible entry is */ + for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) { + if (next >= AFS_DIRENT_PER_BLOCK) { + _debug("ENT[%Zu.%u]:" + " %u travelled beyond end dir block" + " (len %u/%Zu)", + blkoff / sizeof(union afs_dir_block), + offset, next, tmp, nlen); + return -EIO; + } + if (!(block->pagehdr.bitmap[next / 8] & + (1 << (next % 8)))) { + _debug("ENT[%Zu.%u]:" + " %u unmarked extension (len %u/%Zu)", + blkoff / sizeof(union afs_dir_block), + offset, next, tmp, nlen); + return -EIO; + } + + _debug("ENT[%Zu.%u]: ext %u/%Zu", + blkoff / sizeof(union afs_dir_block), + next, tmp, nlen); + next++; + } + + /* skip if starts before the current position */ + if (offset < curr) + continue; + + /* found the next entry */ + if (!dir_emit(ctx, dire->u.name, nlen, + ntohl(dire->u.vnode), + ctx->actor == afs_lookup_filldir ? + ntohl(dire->u.unique) : DT_UNKNOWN)) { + _leave(" = 0 [full]"); + return 0; + } + + ctx->pos = blkoff + next * sizeof(union afs_dirent); + } + + _leave(" = 1 [more]"); + return 1; +} + +/* + * iterate through the data blob that lists the contents of an AFS directory + */ +static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, + struct key *key) +{ + union afs_dir_block *dblock; + struct afs_dir_page *dbuf; + struct page *page; + unsigned blkoff, limit; + int ret; + + _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { + _leave(" = -ESTALE"); + return -ESTALE; + } + + /* round the file position up to the next entry boundary */ + ctx->pos += sizeof(union afs_dirent) - 1; + ctx->pos &= ~(sizeof(union afs_dirent) - 1); + + /* walk through the blocks in sequence */ + ret = 0; + while (ctx->pos < dir->i_size) { + blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1); + + /* fetch the appropriate page from the directory */ + page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + break; + } + + limit = blkoff & ~(PAGE_SIZE - 1); + + dbuf = page_address(page); + + /* deal with the individual blocks stashed on this page */ + do { + dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / + sizeof(union afs_dir_block)]; + ret = afs_dir_iterate_block(ctx, dblock, blkoff); + if (ret != 1) { + afs_dir_put_page(page); + goto out; + } + + blkoff += sizeof(union afs_dir_block); + + } while (ctx->pos < dir->i_size && blkoff < limit); + + afs_dir_put_page(page); + ret = 0; + } + +out: + _leave(" = %d", ret); + return ret; +} + +/* + * read an AFS directory + */ +static int afs_readdir(struct file *file, struct dir_context *ctx) +{ + return afs_dir_iterate(file_inode(file), + ctx, file->private_data); +} + +/* + * search the directory for a name + * - if afs_dir_iterate_block() spots this function, it'll pass the FID + * uniquifier through dtype + */ +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) +{ + struct afs_lookup_cookie *cookie = + container_of(ctx, struct afs_lookup_cookie, ctx); + + _enter("{%s,%u},%s,%u,,%llu,%u", + cookie->name.name, cookie->name.len, name, nlen, + (unsigned long long) ino, dtype); + + /* insanity checks first */ + BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + + if (cookie->name.len != nlen || + memcmp(cookie->name.name, name, nlen) != 0) { + _leave(" = 0 [no]"); + return 0; + } + + cookie->fid.vnode = ino; + cookie->fid.unique = dtype; + cookie->found = 1; + + _leave(" = -1 [found]"); + return -1; +} + +/* + * do a lookup in a directory + * - just returns the FID the dentry name maps to if found + */ +static int afs_do_lookup(struct inode *dir, struct dentry *dentry, + struct afs_fid *fid, struct key *key) +{ + struct afs_super_info *as = dir->i_sb->s_fs_info; + struct afs_lookup_cookie cookie = { + .ctx.actor = afs_lookup_filldir, + .name = dentry->d_name, + .fid.vid = as->volume->vid + }; + int ret; + + _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + + /* search the directory */ + ret = afs_dir_iterate(dir, &cookie.ctx, key); + if (ret < 0) { + _leave(" = %d [iter]", ret); + return ret; + } + + ret = -ENOENT; + if (!cookie.found) { + _leave(" = -ENOENT [not found]"); + return -ENOENT; + } + + *fid = cookie.fid; + _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique); + return 0; +} + +/* + * Try to auto mount the mountpoint with pseudo directory, if the autocell + * operation is setted. + */ +static struct inode *afs_try_auto_mntpt( + int ret, struct dentry *dentry, struct inode *dir, struct key *key, + struct afs_fid *fid) +{ + const char *devname = dentry->d_name.name; + struct afs_vnode *vnode = AFS_FS_I(dir); + struct inode *inode; + + _enter("%d, %p{%pd}, {%x:%u}, %p", + ret, dentry, dentry, vnode->fid.vid, vnode->fid.vnode, key); + + if (ret != -ENOENT || + !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) + goto out; + + inode = afs_iget_autocell(dir, devname, strlen(devname), key); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + *fid = AFS_FS_I(inode)->fid; + _leave("= %p", inode); + return inode; + +out: + _leave("= %d", ret); + return ERR_PTR(ret); +} + +/* + * look up an entry in a directory + */ +static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct afs_vnode *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + vnode = AFS_FS_I(dir); + + _enter("{%x:%u},%p{%pd},", + vnode->fid.vid, vnode->fid.vnode, dentry, dentry); + + ASSERTCMP(d_inode(dentry), ==, NULL); + + if (dentry->d_name.len >= AFSNAMEMAX) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _leave(" = -ESTALE"); + return ERR_PTR(-ESTALE); + } + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return ERR_CAST(key); + } + + ret = afs_validate(vnode, key); + if (ret < 0) { + key_put(key); + _leave(" = %d [val]", ret); + return ERR_PTR(ret); + } + + ret = afs_do_lookup(dir, dentry, &fid, key); + if (ret < 0) { + inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid); + if (!IS_ERR(inode)) { + key_put(key); + goto success; + } + + ret = PTR_ERR(inode); + key_put(key); + if (ret == -ENOENT) { + d_add(dentry, NULL); + _leave(" = NULL [negative]"); + return NULL; + } + _leave(" = %d [do]", ret); + return ERR_PTR(ret); + } + dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; + + /* instantiate the dentry */ + inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL); + key_put(key); + if (IS_ERR(inode)) { + _leave(" = %ld", PTR_ERR(inode)); + return ERR_CAST(inode); + } + +success: + d_add(dentry, inode); + _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", + fid.vnode, + fid.unique, + d_inode(dentry)->i_ino, + d_inode(dentry)->i_generation); + + return NULL; +} + +/* + * check that a dentry lookup hit has found a valid entry + * - NOTE! the hit can be a negative hit too, so we can't assume we have an + * inode + */ +static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct afs_vnode *vnode, *dir; + struct afs_fid uninitialized_var(fid); + struct dentry *parent; + struct key *key; + void *dir_version; + int ret; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + vnode = AFS_FS_I(d_inode(dentry)); + + if (d_really_is_positive(dentry)) + _enter("{v={%x:%u} n=%pd fl=%lx},", + vnode->fid.vid, vnode->fid.vnode, dentry, + vnode->flags); + else + _enter("{neg n=%pd}", dentry); + + key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell); + if (IS_ERR(key)) + key = NULL; + + /* lock down the parent dentry so we can peer at it */ + parent = dget_parent(dentry); + dir = AFS_FS_I(d_inode(parent)); + + /* validate the parent directory */ + if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) + afs_validate(dir, key); + + if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { + _debug("%pd: parent dir deleted", dentry); + goto out_bad; + } + + dir_version = (void *) (unsigned long) dir->status.data_version; + if (dentry->d_fsdata == dir_version) + goto out_valid; /* the dir contents are unchanged */ + + _debug("dir modified"); + + /* search the directory for this vnode */ + ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key); + switch (ret) { + case 0: + /* the filename maps to something */ + if (d_really_is_negative(dentry)) + goto out_bad; + if (is_bad_inode(d_inode(dentry))) { + printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", + dentry); + goto out_bad; + } + + /* if the vnode ID has changed, then the dirent points to a + * different file */ + if (fid.vnode != vnode->fid.vnode) { + _debug("%pd: dirent changed [%u != %u]", + dentry, fid.vnode, + vnode->fid.vnode); + goto not_found; + } + + /* if the vnode ID uniqifier has changed, then the file has + * been deleted and replaced, and the original vnode ID has + * been reused */ + if (fid.unique != vnode->fid.unique) { + _debug("%pd: file deleted (uq %u -> %u I:%u)", + dentry, fid.unique, + vnode->fid.unique, + d_inode(dentry)->i_generation); + spin_lock(&vnode->lock); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + spin_unlock(&vnode->lock); + goto not_found; + } + goto out_valid; + + case -ENOENT: + /* the filename is unknown */ + _debug("%pd: dirent not found", dentry); + if (d_really_is_positive(dentry)) + goto not_found; + goto out_valid; + + default: + _debug("failed to iterate dir %pd: %d", + parent, ret); + goto out_bad; + } + +out_valid: + dentry->d_fsdata = dir_version; + dput(parent); + key_put(key); + _leave(" = 1 [valid]"); + return 1; + + /* the dirent, if it exists, now points to a different vnode */ +not_found: + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_NFSFS_RENAMED; + spin_unlock(&dentry->d_lock); + +out_bad: + _debug("dropping dentry %pd2", dentry); + dput(parent); + key_put(key); + + _leave(" = 0 [bad]"); + return 0; +} + +/* + * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't + * sleep) + * - called from dput() when d_count is going to 0. + * - return 1 to request dentry be unhashed, 0 otherwise + */ +static int afs_d_delete(const struct dentry *dentry) +{ + _enter("%pd", dentry); + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto zap; + + if (d_really_is_positive(dentry) && + (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(d_inode(dentry))->flags) || + test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(d_inode(dentry))->flags))) + goto zap; + + _leave(" = 0 [keep]"); + return 0; + +zap: + _leave(" = 1 [zap]"); + return 1; +} + +/* + * handle dentry release + */ +static void afs_d_release(struct dentry *dentry) +{ + _enter("%pd", dentry); +} + +/* + * create a directory on an AFS filesystem + */ +static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct afs_file_status status; + struct afs_callback cb; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%pd},%ho", + dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + mode |= S_IFDIR; + ret = afs_vnode_create(dvnode, key, dentry->d_name.name, + mode, &fid, &status, &cb, &server); + if (ret < 0) + goto mkdir_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +mkdir_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * remove a directory from an AFS filesystem + */ +static int afs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%pd}", + dvnode->fid.vid, dvnode->fid.vnode, dentry); + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true); + if (ret < 0) + goto rmdir_error; + + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); + clear_nlink(&vnode->vfs_inode); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_discard_callback_on_delete(vnode); + } + + key_put(key); + _leave(" = 0"); + return 0; + +rmdir_error: + key_put(key); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * remove a file from an AFS filesystem + */ +static int afs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%pd}", + dvnode->fid.vid, dvnode->fid.vnode, dentry); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); + + /* make sure we have a callback promise on the victim */ + ret = afs_validate(vnode, key); + if (ret < 0) + goto error; + } + + ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false); + if (ret < 0) + goto remove_error; + + if (d_really_is_positive(dentry)) { + /* if the file wasn't deleted due to excess hard links, the + * fileserver will break the callback promise on the file - if + * it had one - before it returns to us, and if it was deleted, + * it won't + * + * however, if we didn't have a callback promise outstanding, + * or it was outstanding on a different server, then it won't + * break it either... + */ + vnode = AFS_FS_I(d_inode(dentry)); + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + _debug("AFS_VNODE_DELETED"); + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) + _debug("AFS_VNODE_CB_BROKEN"); + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_validate(vnode, key); + _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + } + + key_put(key); + _leave(" = 0"); + return 0; + +remove_error: + key_put(key); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * create a regular file on an AFS filesystem + */ +static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct afs_file_status status; + struct afs_callback cb; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%pd},%ho,", + dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + mode |= S_IFREG; + ret = afs_vnode_create(dvnode, key, dentry->d_name.name, + mode, &fid, &status, &cb, &server); + if (ret < 0) + goto create_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +create_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * create a hard link between files in an AFS filesystem + */ +static int afs_link(struct dentry *from, struct inode *dir, + struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + vnode = AFS_FS_I(d_inode(from)); + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%x:%u},{%pd}", + vnode->fid.vid, vnode->fid.vnode, + dvnode->fid.vid, dvnode->fid.vnode, + dentry); + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name); + if (ret < 0) + goto link_error; + + ihold(&vnode->vfs_inode); + d_instantiate(dentry, &vnode->vfs_inode); + key_put(key); + _leave(" = 0"); + return 0; + +link_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * create a symlink in an AFS filesystem + */ +static int afs_symlink(struct inode *dir, struct dentry *dentry, + const char *content) +{ + struct afs_file_status status; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%pd},%s", + dvnode->fid.vid, dvnode->fid.vnode, dentry, + content); + + ret = -EINVAL; + if (strlen(content) >= AFSPATHMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content, + &fid, &status, &server); + if (ret < 0) + goto create_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, NULL); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +create_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * rename a file in an AFS filesystem and/or move it between directories + */ +static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; + struct key *key; + int ret; + + vnode = AFS_FS_I(d_inode(old_dentry)); + orig_dvnode = AFS_FS_I(old_dir); + new_dvnode = AFS_FS_I(new_dir); + + _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}", + orig_dvnode->fid.vid, orig_dvnode->fid.vnode, + vnode->fid.vid, vnode->fid.vnode, + new_dvnode->fid.vid, new_dvnode->fid.vnode, + new_dentry); + + key = afs_request_key(orig_dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_rename(orig_dvnode, new_dvnode, key, + old_dentry->d_name.name, + new_dentry->d_name.name); + if (ret < 0) + goto rename_error; + key_put(key); + _leave(" = 0"); + return 0; + +rename_error: + key_put(key); +error: + d_drop(new_dentry); + _leave(" = %d", ret); + return ret; +} diff --git a/kernel/fs/afs/file.c b/kernel/fs/afs/file.c new file mode 100644 index 000000000..999bc3cae --- /dev/null +++ b/kernel/fs/afs/file.c @@ -0,0 +1,378 @@ +/* AFS filesystem file handling + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int afs_readpage(struct file *file, struct page *page); +static void afs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); +static int afs_releasepage(struct page *page, gfp_t gfp_flags); +static int afs_launder_page(struct page *page); + +static int afs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + +const struct file_operations afs_file_operations = { + .open = afs_open, + .release = afs_release, + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = afs_file_write, + .mmap = generic_file_readonly_mmap, + .splice_read = generic_file_splice_read, + .fsync = afs_fsync, + .lock = afs_lock, + .flock = afs_flock, +}; + +const struct inode_operations afs_file_inode_operations = { + .getattr = afs_getattr, + .setattr = afs_setattr, + .permission = afs_permission, +}; + +const struct address_space_operations afs_fs_aops = { + .readpage = afs_readpage, + .readpages = afs_readpages, + .set_page_dirty = afs_set_page_dirty, + .launder_page = afs_launder_page, + .releasepage = afs_releasepage, + .invalidatepage = afs_invalidatepage, + .write_begin = afs_write_begin, + .write_end = afs_write_end, + .writepage = afs_writepage, + .writepages = afs_writepages, +}; + +/* + * open an AFS file or directory and attach a key to it + */ +int afs_open(struct inode *inode, struct file *file) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key; + int ret; + + _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return PTR_ERR(key); + } + + ret = afs_validate(vnode, key); + if (ret < 0) { + _leave(" = %d [val]", ret); + return ret; + } + + file->private_data = key; + _leave(" = 0"); + return 0; +} + +/* + * release an AFS file or directory and discard its key + */ +int afs_release(struct inode *inode, struct file *file) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + + _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + + key_put(file->private_data); + _leave(" = 0"); + return 0; +} + +#ifdef CONFIG_AFS_FSCACHE +/* + * deal with notification that a page was read from the cache + */ +static void afs_file_readpage_read_complete(struct page *page, + void *data, + int error) +{ + _enter("%p,%p,%d", page, data, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) + SetPageUptodate(page); + unlock_page(page); +} +#endif + +/* + * read page from file, directory or symlink, given a key to use + */ +int afs_page_filler(void *data, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key = data; + size_t len; + off_t offset; + int ret; + + _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); + + BUG_ON(!PageLocked(page)); + + ret = -ESTALE; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto error; + + /* is it cached? */ +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_page(vnode->cache, + page, + afs_file_readpage_read_complete, + NULL, + GFP_KERNEL); +#else + ret = -ENOBUFS; +#endif + switch (ret) { + /* read BIO submitted (page in cache) */ + case 0: + break; + + /* page not yet cached */ + case -ENODATA: + _debug("cache said ENODATA"); + goto go_on; + + /* page will not be cached */ + case -ENOBUFS: + _debug("cache said ENOBUFS"); + default: + go_on: + offset = page->index << PAGE_CACHE_SHIFT; + len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE); + + /* read the contents of the file from the server into the + * page */ + ret = afs_vnode_fetch_data(vnode, key, offset, len, page); + if (ret < 0) { + if (ret == -ENOENT) { + _debug("got NOENT from server" + " - marking file deleted and stale"); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + +#ifdef CONFIG_AFS_FSCACHE + fscache_uncache_page(vnode->cache, page); +#endif + BUG_ON(PageFsCache(page)); + goto error; + } + + SetPageUptodate(page); + + /* send the page to the cache */ +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page) && + fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_uncache_page(vnode->cache, page); + BUG_ON(PageFsCache(page)); + } +#endif + unlock_page(page); + } + + _leave(" = 0"); + return 0; + +error: + SetPageError(page); + unlock_page(page); + _leave(" = %d", ret); + return ret; +} + +/* + * read page from file, directory or symlink, given a file to nominate the key + * to be used + */ +static int afs_readpage(struct file *file, struct page *page) +{ + struct key *key; + int ret; + + if (file) { + key = file->private_data; + ASSERT(key != NULL); + ret = afs_page_filler(key, page); + } else { + struct inode *inode = page->mapping->host; + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + } else { + ret = afs_page_filler(key, page); + key_put(key); + } + } + return ret; +} + +/* + * read a set of pages + */ +static int afs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct key *key = file->private_data; + struct afs_vnode *vnode; + int ret = 0; + + _enter("{%d},{%lu},,%d", + key_serial(key), mapping->host->i_ino, nr_pages); + + ASSERT(key != NULL); + + vnode = AFS_FS_I(mapping->host); + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _leave(" = -ESTALE"); + return -ESTALE; + } + + /* attempt to read as many of the pages as possible */ +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_pages(vnode->cache, + mapping, + pages, + &nr_pages, + afs_file_readpage_read_complete, + NULL, + mapping_gfp_mask(mapping)); +#else + ret = -ENOBUFS; +#endif + + switch (ret) { + /* all pages are being read from the cache */ + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(nr_pages != 0); + _leave(" = 0 [reading all]"); + return 0; + + /* there were pages that couldn't be read from the cache */ + case -ENODATA: + case -ENOBUFS: + break; + + /* other error */ + default: + _leave(" = %d", ret); + return ret; + } + + /* load the missing pages from the network */ + ret = read_cache_pages(mapping, pages, afs_page_filler, key); + + _leave(" = %d [netting]", ret); + return ret; +} + +/* + * write back a dirty page + */ +static int afs_launder_page(struct page *page) +{ + _enter("{%lu}", page->index); + + return 0; +} + +/* + * invalidate part or all of a page + * - release a page and clean up its private data if offset is 0 (indicating + * the entire page) + */ +static void afs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + + _enter("{%lu},%u,%u", page->index, offset, length); + + BUG_ON(!PageLocked(page)); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0 && length == PAGE_CACHE_SIZE) { +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + fscache_wait_on_page_write(vnode->cache, page); + fscache_uncache_page(vnode->cache, page); + } +#endif + + if (PagePrivate(page)) { + if (wb && !PageWriteback(page)) { + set_page_private(page, 0); + afs_put_writeback(wb); + } + + if (!page_private(page)) + ClearPagePrivate(page); + } + } + + _leave(""); +} + +/* + * release a page and clean up its private state if it's not busy + * - return true if the page can now be released, false if not + */ +static int afs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + + _enter("{{%x:%u}[%lu],%lx},%x", + vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, + gfp_flags); + + /* deny if page is being written to the cache and the caller hasn't + * elected to wait */ +#ifdef CONFIG_AFS_FSCACHE + if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { + _leave(" = F [cache busy]"); + return 0; + } +#endif + + if (PagePrivate(page)) { + if (wb) { + set_page_private(page, 0); + afs_put_writeback(wb); + } + ClearPagePrivate(page); + } + + /* indicate that the page can be released */ + _leave(" = T"); + return 1; +} diff --git a/kernel/fs/afs/flock.c b/kernel/fs/afs/flock.c new file mode 100644 index 000000000..4baf1d2b3 --- /dev/null +++ b/kernel/fs/afs/flock.c @@ -0,0 +1,585 @@ +/* AFS file locking support + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "internal.h" + +#define AFS_LOCK_GRANTED 0 +#define AFS_LOCK_PENDING 1 + +static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl); +static void afs_fl_release_private(struct file_lock *fl); + +static struct workqueue_struct *afs_lock_manager; +static DEFINE_MUTEX(afs_lock_manager_mutex); + +static const struct file_lock_operations afs_lock_ops = { + .fl_copy_lock = afs_fl_copy_lock, + .fl_release_private = afs_fl_release_private, +}; + +/* + * initialise the lock manager thread if it isn't already running + */ +static int afs_init_lock_manager(void) +{ + int ret; + + ret = 0; + if (!afs_lock_manager) { + mutex_lock(&afs_lock_manager_mutex); + if (!afs_lock_manager) { + afs_lock_manager = + create_singlethread_workqueue("kafs_lockd"); + if (!afs_lock_manager) + ret = -ENOMEM; + } + mutex_unlock(&afs_lock_manager_mutex); + } + return ret; +} + +/* + * destroy the lock manager thread if it's running + */ +void __exit afs_kill_lock_manager(void) +{ + if (afs_lock_manager) + destroy_workqueue(afs_lock_manager); +} + +/* + * if the callback is broken on this vnode, then the lock may now be available + */ +void afs_lock_may_be_available(struct afs_vnode *vnode) +{ + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0); +} + +/* + * the lock will time out in 5 minutes unless we extend it, so schedule + * extension in a bit less than that time + */ +static void afs_schedule_lock_extension(struct afs_vnode *vnode) +{ + queue_delayed_work(afs_lock_manager, &vnode->lock_work, + AFS_LOCKWAIT * HZ / 2); +} + +/* + * grant one or more locks (readlocks are allowed to jump the queue if the + * first lock in the queue is itself a readlock) + * - the caller must hold the vnode lock + */ +static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl) +{ + struct file_lock *p, *_p; + + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); + if (fl->fl_type == F_RDLCK) { + list_for_each_entry_safe(p, _p, &vnode->pending_locks, + fl_u.afs.link) { + if (p->fl_type == F_RDLCK) { + p->fl_u.afs.state = AFS_LOCK_GRANTED; + list_move_tail(&p->fl_u.afs.link, + &vnode->granted_locks); + wake_up(&p->fl_wait); + } + } + } +} + +/* + * do work for a lock, including: + * - probing for a lock we're waiting on but didn't get immediately + * - extending a lock that's close to timing out + */ +void afs_lock_work(struct work_struct *work) +{ + struct afs_vnode *vnode = + container_of(work, struct afs_vnode, lock_work.work); + struct file_lock *fl; + afs_lock_type_t type; + struct key *key; + int ret; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + spin_lock(&vnode->lock); + + if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { + _debug("unlock"); + spin_unlock(&vnode->lock); + + /* attempt to release the server lock; if it fails, we just + * wait 5 minutes and it'll time out anyway */ + ret = afs_vnode_release_lock(vnode, vnode->unlock_key); + if (ret < 0) + printk(KERN_WARNING "AFS:" + " Failed to release lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + key_put(vnode->unlock_key); + vnode->unlock_key = NULL; + clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); + } + + /* if we've got a lock, then it must be time to extend that lock as AFS + * locks time out after 5 minutes */ + if (!list_empty(&vnode->granted_locks)) { + _debug("extend"); + + if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) + BUG(); + fl = list_entry(vnode->granted_locks.next, + struct file_lock, fl_u.afs.link); + key = key_get(fl->fl_file->private_data); + spin_unlock(&vnode->lock); + + ret = afs_vnode_extend_lock(vnode, key); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + key_put(key); + switch (ret) { + case 0: + afs_schedule_lock_extension(vnode); + break; + default: + /* ummm... we failed to extend the lock - retry + * extension shortly */ + printk(KERN_WARNING "AFS:" + " Failed to extend lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + queue_delayed_work(afs_lock_manager, &vnode->lock_work, + HZ * 10); + break; + } + _leave(" [extend]"); + return; + } + + /* if we don't have a granted lock, then we must've been called back by + * the server, and so if might be possible to get a lock we're + * currently waiting for */ + if (!list_empty(&vnode->pending_locks)) { + _debug("get"); + + if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) + BUG(); + fl = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + key = key_get(fl->fl_file->private_data); + type = (fl->fl_type == F_RDLCK) ? + AFS_LOCK_READ : AFS_LOCK_WRITE; + spin_unlock(&vnode->lock); + + ret = afs_vnode_set_lock(vnode, key, type); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + switch (ret) { + case -EWOULDBLOCK: + _debug("blocked"); + break; + case 0: + _debug("acquired"); + if (type == AFS_LOCK_READ) + set_bit(AFS_VNODE_READLOCKED, &vnode->flags); + else + set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); + ret = AFS_LOCK_GRANTED; + default: + spin_lock(&vnode->lock); + /* the pending lock may have been withdrawn due to a + * signal */ + if (list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link) == fl) { + fl->fl_u.afs.state = ret; + if (ret == AFS_LOCK_GRANTED) + afs_grant_locks(vnode, fl); + else + list_del_init(&fl->fl_u.afs.link); + wake_up(&fl->fl_wait); + spin_unlock(&vnode->lock); + } else { + _debug("withdrawn"); + clear_bit(AFS_VNODE_READLOCKED, &vnode->flags); + clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); + spin_unlock(&vnode->lock); + afs_vnode_release_lock(vnode, key); + if (!list_empty(&vnode->pending_locks)) + afs_lock_may_be_available(vnode); + } + break; + } + key_put(key); + _leave(" [pend]"); + return; + } + + /* looks like the lock request was withdrawn on a signal */ + spin_unlock(&vnode->lock); + _leave(" [no locks]"); +} + +/* + * pass responsibility for the unlocking of a vnode on the server to the + * manager thread, lest a pending signal in the calling thread interrupt + * AF_RXRPC + * - the caller must hold the vnode lock + */ +static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) +{ + cancel_delayed_work(&vnode->lock_work); + if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) && + !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) + BUG(); + if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) + BUG(); + vnode->unlock_key = key_get(key); + afs_lock_may_be_available(vnode); +} + +/* + * request a lock on a file on the server + */ +static int afs_do_setlk(struct file *file, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct afs_vnode *vnode = AFS_FS_I(inode); + afs_lock_type_t type; + struct key *key = file->private_data; + int ret; + + _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + + /* only whole-file locks are supported */ + if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) + return -EINVAL; + + ret = afs_init_lock_manager(); + if (ret < 0) + return ret; + + fl->fl_ops = &afs_lock_ops; + INIT_LIST_HEAD(&fl->fl_u.afs.link); + fl->fl_u.afs.state = AFS_LOCK_PENDING; + + type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + + spin_lock(&inode->i_lock); + + /* make sure we've got a callback on this file and that our view of the + * data version is up to date */ + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + + if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) { + ret = -EAGAIN; + goto error; + } + + spin_lock(&vnode->lock); + + /* if we've already got a readlock on the server then we can instantly + * grant another readlock, irrespective of whether there are any + * pending writelocks */ + if (type == AFS_LOCK_READ && + vnode->flags & (1 << AFS_VNODE_READLOCKED)) { + _debug("instant readlock"); + ASSERTCMP(vnode->flags & + ((1 << AFS_VNODE_LOCKING) | + (1 << AFS_VNODE_WRITELOCKED)), ==, 0); + ASSERT(!list_empty(&vnode->granted_locks)); + goto sharing_existing_lock; + } + + /* if there's no-one else with a lock on this vnode, then we need to + * ask the server for a lock */ + if (list_empty(&vnode->pending_locks) && + list_empty(&vnode->granted_locks)) { + _debug("not locked"); + ASSERTCMP(vnode->flags & + ((1 << AFS_VNODE_LOCKING) | + (1 << AFS_VNODE_READLOCKED) | + (1 << AFS_VNODE_WRITELOCKED)), ==, 0); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); + set_bit(AFS_VNODE_LOCKING, &vnode->flags); + spin_unlock(&vnode->lock); + + ret = afs_vnode_set_lock(vnode, key, type); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + switch (ret) { + case 0: + _debug("acquired"); + goto acquired_server_lock; + case -EWOULDBLOCK: + _debug("would block"); + spin_lock(&vnode->lock); + ASSERT(list_empty(&vnode->granted_locks)); + ASSERTCMP(vnode->pending_locks.next, ==, + &fl->fl_u.afs.link); + goto wait; + default: + spin_lock(&vnode->lock); + list_del_init(&fl->fl_u.afs.link); + spin_unlock(&vnode->lock); + goto error; + } + } + + /* otherwise, we need to wait for a local lock to become available */ + _debug("wait local"); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); +wait: + if (!(fl->fl_flags & FL_SLEEP)) { + _debug("noblock"); + ret = -EAGAIN; + goto abort_attempt; + } + spin_unlock(&vnode->lock); + + /* now we need to sleep and wait for the lock manager thread to get the + * lock from the server */ + _debug("sleep"); + ret = wait_event_interruptible(fl->fl_wait, + fl->fl_u.afs.state <= AFS_LOCK_GRANTED); + if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { + ret = fl->fl_u.afs.state; + if (ret < 0) + goto error; + spin_lock(&vnode->lock); + goto given_lock; + } + + /* we were interrupted, but someone may still be in the throes of + * giving us the lock */ + _debug("intr"); + ASSERTCMP(ret, ==, -ERESTARTSYS); + + spin_lock(&vnode->lock); + if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { + ret = fl->fl_u.afs.state; + if (ret < 0) { + spin_unlock(&vnode->lock); + goto error; + } + goto given_lock; + } + +abort_attempt: + /* we aren't going to get the lock, either because we're unwilling to + * wait, or because some signal happened */ + _debug("abort"); + if (list_empty(&vnode->granted_locks) && + vnode->pending_locks.next == &fl->fl_u.afs.link) { + if (vnode->pending_locks.prev != &fl->fl_u.afs.link) { + /* kick the next pending lock into having a go */ + list_del_init(&fl->fl_u.afs.link); + afs_lock_may_be_available(vnode); + } + } else { + list_del_init(&fl->fl_u.afs.link); + } + spin_unlock(&vnode->lock); + goto error; + +acquired_server_lock: + /* we've acquired a server lock, but it needs to be renewed after 5 + * mins */ + spin_lock(&vnode->lock); + afs_schedule_lock_extension(vnode); + if (type == AFS_LOCK_READ) + set_bit(AFS_VNODE_READLOCKED, &vnode->flags); + else + set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); +sharing_existing_lock: + /* the lock has been granted as far as we're concerned... */ + fl->fl_u.afs.state = AFS_LOCK_GRANTED; + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); +given_lock: + /* ... but we do still need to get the VFS's blessing */ + ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING))); + ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) | + (1 << AFS_VNODE_WRITELOCKED))) != 0); + ret = posix_lock_file(file, fl, NULL); + if (ret < 0) + goto vfs_rejected_lock; + spin_unlock(&vnode->lock); + + /* again, make sure we've got a callback on this file and, again, make + * sure that our view of the data version is up to date (we ignore + * errors incurred here and deal with the consequences elsewhere) */ + afs_vnode_fetch_status(vnode, NULL, key); + +error: + spin_unlock(&inode->i_lock); + _leave(" = %d", ret); + return ret; + +vfs_rejected_lock: + /* the VFS rejected the lock we just obtained, so we have to discard + * what we just got */ + _debug("vfs refused %d", ret); + list_del_init(&fl->fl_u.afs.link); + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode, key); + goto abort_attempt; +} + +/* + * unlock on a file on the server + */ +static int afs_do_unlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct key *key = file->private_data; + int ret; + + _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + + /* only whole-file unlocks are supported */ + if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) + return -EINVAL; + + fl->fl_ops = &afs_lock_ops; + INIT_LIST_HEAD(&fl->fl_u.afs.link); + fl->fl_u.afs.state = AFS_LOCK_PENDING; + + spin_lock(&vnode->lock); + ret = posix_lock_file(file, fl, NULL); + if (ret < 0) { + spin_unlock(&vnode->lock); + _leave(" = %d [vfs]", ret); + return ret; + } + + /* discard the server lock only if all granted locks are gone */ + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode, key); + spin_unlock(&vnode->lock); + _leave(" = 0"); + return 0; +} + +/* + * return information about a lock we currently hold, if indeed we hold one + */ +static int afs_do_getlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct key *key = file->private_data; + int ret, lock_count; + + _enter(""); + + fl->fl_type = F_UNLCK; + + mutex_lock(&vnode->vfs_inode.i_mutex); + + /* check local lock records first */ + ret = 0; + posix_test_lock(file, fl); + if (fl->fl_type == F_UNLCK) { + /* no local locks; consult the server */ + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + lock_count = vnode->status.lock_count; + if (lock_count) { + if (lock_count > 0) + fl->fl_type = F_RDLCK; + else + fl->fl_type = F_WRLCK; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + } + } + +error: + mutex_unlock(&vnode->vfs_inode.i_mutex); + _leave(" = %d [%hd]", ret, fl->fl_type); + return ret; +} + +/* + * manage POSIX locks on a file + */ +int afs_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + + _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", + vnode->fid.vid, vnode->fid.vnode, cmd, + fl->fl_type, fl->fl_flags, + (long long) fl->fl_start, (long long) fl->fl_end); + + /* AFS doesn't support mandatory locks */ + if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + if (IS_GETLK(cmd)) + return afs_do_getlk(file, fl); + if (fl->fl_type == F_UNLCK) + return afs_do_unlk(file, fl); + return afs_do_setlk(file, fl); +} + +/* + * manage FLOCK locks on a file + */ +int afs_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + + _enter("{%x:%u},%d,{t=%x,fl=%x}", + vnode->fid.vid, vnode->fid.vnode, cmd, + fl->fl_type, fl->fl_flags); + + /* + * No BSD flocks over NFS allowed. + * Note: we could try to fake a POSIX lock request here by + * using ((u32) filp | 0x80000000) or some such as the pid. + * Not sure whether that would be unique, though, or whether + * that would break in other places. + */ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + /* we're simulating flock() locks using posix locks on the server */ + if (fl->fl_type == F_UNLCK) + return afs_do_unlk(file, fl); + return afs_do_setlk(file, fl); +} + +/* + * the POSIX lock management core VFS code copies the lock record and adds the + * copy into its own list, so we need to add that copy to the vnode's lock + * queue in the same place as the original (which will be deleted shortly + * after) + */ +static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + _enter(""); + + list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link); +} + +/* + * need to remove this lock from the vnode queue when it's removed from the + * VFS's list + */ +static void afs_fl_release_private(struct file_lock *fl) +{ + _enter(""); + + list_del_init(&fl->fl_u.afs.link); +} diff --git a/kernel/fs/afs/fsclient.c b/kernel/fs/afs/fsclient.c new file mode 100644 index 000000000..c2e930ec2 --- /dev/null +++ b/kernel/fs/afs/fsclient.c @@ -0,0 +1,1911 @@ +/* AFS File Server client stubs + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "internal.h" +#include "afs_fs.h" + +/* + * decode an AFSFid block + */ +static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid) +{ + const __be32 *bp = *_bp; + + fid->vid = ntohl(*bp++); + fid->vnode = ntohl(*bp++); + fid->unique = ntohl(*bp++); + *_bp = bp; +} + +/* + * decode an AFSFetchStatus block + */ +static void xdr_decode_AFSFetchStatus(const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + afs_dataversion_t *store_version) +{ + afs_dataversion_t expected_version; + const __be32 *bp = *_bp; + umode_t mode; + u64 data_version, size; + u32 changed = 0; /* becomes non-zero if ctime-type changes seen */ + kuid_t owner; + kgid_t group; + +#define EXTRACT(DST) \ + do { \ + u32 x = ntohl(*bp++); \ + changed |= DST - x; \ + DST = x; \ + } while (0) + + status->if_version = ntohl(*bp++); + EXTRACT(status->type); + EXTRACT(status->nlink); + size = ntohl(*bp++); + data_version = ntohl(*bp++); + EXTRACT(status->author); + owner = make_kuid(&init_user_ns, ntohl(*bp++)); + changed |= !uid_eq(owner, status->owner); + status->owner = owner; + EXTRACT(status->caller_access); /* call ticket dependent */ + EXTRACT(status->anon_access); + EXTRACT(status->mode); + EXTRACT(status->parent.vnode); + EXTRACT(status->parent.unique); + bp++; /* seg size */ + status->mtime_client = ntohl(*bp++); + status->mtime_server = ntohl(*bp++); + group = make_kgid(&init_user_ns, ntohl(*bp++)); + changed |= !gid_eq(group, status->group); + status->group = group; + bp++; /* sync counter */ + data_version |= (u64) ntohl(*bp++) << 32; + EXTRACT(status->lock_count); + size |= (u64) ntohl(*bp++) << 32; + bp++; /* spare 4 */ + *_bp = bp; + + if (size != status->size) { + status->size = size; + changed |= true; + } + status->mode &= S_IALLUGO; + + _debug("vnode time %lx, %lx", + status->mtime_client, status->mtime_server); + + if (vnode) { + status->parent.vid = vnode->fid.vid; + if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { + _debug("vnode changed"); + i_size_write(&vnode->vfs_inode, size); + vnode->vfs_inode.i_uid = status->owner; + vnode->vfs_inode.i_gid = status->group; + vnode->vfs_inode.i_generation = vnode->fid.unique; + set_nlink(&vnode->vfs_inode, status->nlink); + + mode = vnode->vfs_inode.i_mode; + mode &= ~S_IALLUGO; + mode |= status->mode; + barrier(); + vnode->vfs_inode.i_mode = mode; + } + + vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server; + vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; + vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; + vnode->vfs_inode.i_version = data_version; + } + + expected_version = status->data_version; + if (store_version) + expected_version = *store_version; + + if (expected_version != data_version) { + status->data_version = data_version; + if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { + _debug("vnode modified %llx on {%x:%u}", + (unsigned long long) data_version, + vnode->fid.vid, vnode->fid.vnode); + set_bit(AFS_VNODE_MODIFIED, &vnode->flags); + set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + } + } else if (store_version) { + status->data_version = data_version; + } +} + +/* + * decode an AFSCallBack block + */ +static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode) +{ + const __be32 *bp = *_bp; + + vnode->cb_version = ntohl(*bp++); + vnode->cb_expiry = ntohl(*bp++); + vnode->cb_type = ntohl(*bp++); + vnode->cb_expires = vnode->cb_expiry + get_seconds(); + *_bp = bp; +} + +static void xdr_decode_AFSCallBack_raw(const __be32 **_bp, + struct afs_callback *cb) +{ + const __be32 *bp = *_bp; + + cb->version = ntohl(*bp++); + cb->expiry = ntohl(*bp++); + cb->type = ntohl(*bp++); + *_bp = bp; +} + +/* + * decode an AFSVolSync block + */ +static void xdr_decode_AFSVolSync(const __be32 **_bp, + struct afs_volsync *volsync) +{ + const __be32 *bp = *_bp; + + volsync->creation = ntohl(*bp++); + bp++; /* spare2 */ + bp++; /* spare3 */ + bp++; /* spare4 */ + bp++; /* spare5 */ + bp++; /* spare6 */ + *_bp = bp; +} + +/* + * encode the requested attributes into an AFSStoreStatus block + */ +static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr) +{ + __be32 *bp = *_bp; + u32 mask = 0, mtime = 0, owner = 0, group = 0, mode = 0; + + mask = 0; + if (attr->ia_valid & ATTR_MTIME) { + mask |= AFS_SET_MTIME; + mtime = attr->ia_mtime.tv_sec; + } + + if (attr->ia_valid & ATTR_UID) { + mask |= AFS_SET_OWNER; + owner = from_kuid(&init_user_ns, attr->ia_uid); + } + + if (attr->ia_valid & ATTR_GID) { + mask |= AFS_SET_GROUP; + group = from_kgid(&init_user_ns, attr->ia_gid); + } + + if (attr->ia_valid & ATTR_MODE) { + mask |= AFS_SET_MODE; + mode = attr->ia_mode & S_IALLUGO; + } + + *bp++ = htonl(mask); + *bp++ = htonl(mtime); + *bp++ = htonl(owner); + *bp++ = htonl(group); + *bp++ = htonl(mode); + *bp++ = 0; /* segment size */ + *_bp = bp; +} + +/* + * decode an AFSFetchVolumeStatus block + */ +static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, + struct afs_volume_status *vs) +{ + const __be32 *bp = *_bp; + + vs->vid = ntohl(*bp++); + vs->parent_id = ntohl(*bp++); + vs->online = ntohl(*bp++); + vs->in_service = ntohl(*bp++); + vs->blessed = ntohl(*bp++); + vs->needs_salvage = ntohl(*bp++); + vs->type = ntohl(*bp++); + vs->min_quota = ntohl(*bp++); + vs->max_quota = ntohl(*bp++); + vs->blocks_in_use = ntohl(*bp++); + vs->part_blocks_avail = ntohl(*bp++); + vs->part_max_blocks = ntohl(*bp++); + *_bp = bp; +} + +/* + * deliver reply data to an FS.FetchStatus + */ +static int afs_deliver_fs_fetch_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack(&bp, vnode); + if (call->reply2) + xdr_decode_AFSVolSync(&bp, call->reply2); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchStatus operation type + */ +static const struct afs_call_type afs_RXFSFetchStatus = { + .name = "FS.FetchStatus", + .deliver = afs_deliver_fs_fetch_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * fetch the status information for a file + */ +int afs_fs_fetch_file_status(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + struct afs_volsync *volsync, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = volsync; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHSTATUS); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.FetchData + */ +static int afs_deliver_fs_fetch_data(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + struct page *page; + void *buffer; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + if (call->operation_ID != FSFETCHDATA64) { + call->unmarshall++; + goto no_msw; + } + + /* extract the upper part of the returned data length of an + * FSFETCHDATA64 op (which should always be 0 using this + * client) */ + case 1: + _debug("extract data length (MSW)"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("DATA length MSW: %u", call->count); + if (call->count > 0) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + no_msw: + /* extract the returned data length */ + case 2: + _debug("extract data length"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("DATA length: %u", call->count); + if (call->count > PAGE_SIZE) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the returned data */ + case 3: + _debug("extract data"); + if (call->count > 0) { + page = call->reply3; + buffer = kmap_atomic(page); + ret = afs_extract_data(call, skb, last, buffer, + call->count); + kunmap_atomic(buffer); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + call->offset = 0; + call->unmarshall++; + + /* extract the metadata */ + case 4: + ret = afs_extract_data(call, skb, last, call->buffer, + (21 + 3 + 6) * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack(&bp, vnode); + if (call->reply2) + xdr_decode_AFSVolSync(&bp, call->reply2); + + call->offset = 0; + call->unmarshall++; + + case 5: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + if (call->count < PAGE_SIZE) { + _debug("clear"); + page = call->reply3; + buffer = kmap_atomic(page); + memset(buffer + call->count, 0, PAGE_SIZE - call->count); + kunmap_atomic(buffer); + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchData operation type + */ +static const struct afs_call_type afs_RXFSFetchData = { + .name = "FS.FetchData", + .deliver = afs_deliver_fs_fetch_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSFetchData64 = { + .name = "FS.FetchData64", + .deliver = afs_deliver_fs_fetch_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * fetch data from a very large file + */ +static int afs_fs_fetch_data64(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + off_t offset, size_t length, + struct page *buffer, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + ASSERTCMP(length, <, ULONG_MAX); + + call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = NULL; /* volsync */ + call->reply3 = buffer; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->operation_ID = FSFETCHDATA64; + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHDATA64); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + bp[4] = htonl(upper_32_bits(offset)); + bp[5] = htonl((u32) offset); + bp[6] = 0; + bp[7] = htonl((u32) length); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * fetch data from a file + */ +int afs_fs_fetch_data(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + off_t offset, size_t length, + struct page *buffer, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + if (upper_32_bits(offset) || upper_32_bits(offset + length)) + return afs_fs_fetch_data64(server, key, vnode, offset, length, + buffer, wait_mode); + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = NULL; /* volsync */ + call->reply3 = buffer; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->operation_ID = FSFETCHDATA; + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHDATA); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + bp[4] = htonl(offset); + bp[5] = htonl(length); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.GiveUpCallBacks + */ +static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; /* shouldn't be any reply data */ + return 0; +} + +/* + * FS.GiveUpCallBacks operation type + */ +static const struct afs_call_type afs_RXFSGiveUpCallBacks = { + .name = "FS.GiveUpCallBacks", + .deliver = afs_deliver_fs_give_up_callbacks, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * give up a set of callbacks + * - the callbacks are held in the server->cb_break ring + */ +int afs_fs_give_up_callbacks(struct afs_server *server, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t ncallbacks; + __be32 *bp, *tp; + int loop; + + ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail, + ARRAY_SIZE(server->cb_break)); + + _enter("{%zu},", ncallbacks); + + if (ncallbacks == 0) + return 0; + if (ncallbacks > AFSCBMAX) + ncallbacks = AFSCBMAX; + + _debug("break %zu callbacks", ncallbacks); + + call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks, + 12 + ncallbacks * 6 * 4, 0); + if (!call) + return -ENOMEM; + + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + tp = bp + 2 + ncallbacks * 3; + *bp++ = htonl(FSGIVEUPCALLBACKS); + *bp++ = htonl(ncallbacks); + *tp++ = htonl(ncallbacks); + + atomic_sub(ncallbacks, &server->cb_break_n); + for (loop = ncallbacks; loop > 0; loop--) { + struct afs_callback *cb = + &server->cb_break[server->cb_break_tail]; + + *bp++ = htonl(cb->fid.vid); + *bp++ = htonl(cb->fid.vnode); + *bp++ = htonl(cb->fid.unique); + *tp++ = htonl(cb->version); + *tp++ = htonl(cb->expiry); + *tp++ = htonl(cb->type); + smp_mb(); + server->cb_break_tail = + (server->cb_break_tail + 1) & + (ARRAY_SIZE(server->cb_break) - 1); + } + + ASSERT(ncallbacks > 0); + wake_up_nr(&server->cb_break_waitq, ncallbacks); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.CreateFile or an FS.MakeDir + */ +static int afs_deliver_fs_create_vnode(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFid(&bp, call->reply2); + xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack_raw(&bp, call->reply4); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.CreateFile and FS.MakeDir operation type + */ +static const struct afs_call_type afs_RXFSCreateXXXX = { + .name = "FS.CreateXXXX", + .deliver = afs_deliver_fs_create_vnode, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * create a file or make a directory + */ +int afs_fs_create(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const char *name, + umode_t mode, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz + (6 * 4); + + call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz, + (3 + 21 + 21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = newfid; + call->reply3 = newstatus; + call->reply4 = newcb; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + *bp++ = htonl(AFS_SET_MODE); + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ + *bp++ = 0; /* segment size */ + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.RemoveFile or FS.RemoveDir + */ +static int afs_deliver_fs_remove(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.RemoveDir/FS.RemoveFile operation type + */ +static const struct afs_call_type afs_RXFSRemoveXXXX = { + .name = "FS.RemoveXXXX", + .deliver = afs_deliver_fs_remove, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * remove a file or directory + */ +int afs_fs_remove(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const char *name, + bool isdir, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz; + + call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.Link + */ +static int afs_deliver_fs_link(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.Link operation type + */ +static const struct afs_call_type afs_RXFSLink = { + .name = "FS.Link", + .deliver = afs_deliver_fs_link, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * make a hard link + */ +int afs_fs_link(struct afs_server *server, + struct key *key, + struct afs_vnode *dvnode, + struct afs_vnode *vnode, + const char *name, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz + (3 * 4); + + call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = dvnode; + call->reply2 = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSLINK); + *bp++ = htonl(dvnode->fid.vid); + *bp++ = htonl(dvnode->fid.vnode); + *bp++ = htonl(dvnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.Symlink + */ +static int afs_deliver_fs_symlink(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFid(&bp, call->reply2); + xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.Symlink operation type + */ +static const struct afs_call_type afs_RXFSSymlink = { + .name = "FS.Symlink", + .deliver = afs_deliver_fs_symlink, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * create a symbolic link + */ +int afs_fs_symlink(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const char *name, + const char *contents, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz, c_namesz, c_padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + + c_namesz = strlen(contents); + c_padsz = (4 - (c_namesz & 3)) & 3; + + reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4); + + call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz, + (3 + 21 + 21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = newfid; + call->reply3 = newstatus; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSYMLINK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + *bp++ = htonl(c_namesz); + memcpy(bp, contents, c_namesz); + bp = (void *) bp + c_namesz; + if (c_padsz > 0) { + memset(bp, 0, c_padsz); + bp = (void *) bp + c_padsz; + } + *bp++ = htonl(AFS_SET_MODE); + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = htonl(S_IRWXUGO); /* unix mode */ + *bp++ = 0; /* segment size */ + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.Rename + */ +static int afs_deliver_fs_rename(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL); + if (new_dvnode != orig_dvnode) + xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode, + NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.Rename operation type + */ +static const struct afs_call_type afs_RXFSRename = { + .name = "FS.Rename", + .deliver = afs_deliver_fs_rename, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * create a symbolic link + */ +int afs_fs_rename(struct afs_server *server, + struct key *key, + struct afs_vnode *orig_dvnode, + const char *orig_name, + struct afs_vnode *new_dvnode, + const char *new_name, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz; + __be32 *bp; + + _enter(""); + + o_namesz = strlen(orig_name); + o_padsz = (4 - (o_namesz & 3)) & 3; + + n_namesz = strlen(new_name); + n_padsz = (4 - (n_namesz & 3)) & 3; + + reqsz = (4 * 4) + + 4 + o_namesz + o_padsz + + (3 * 4) + + 4 + n_namesz + n_padsz; + + call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = orig_dvnode; + call->reply2 = new_dvnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSRENAME); + *bp++ = htonl(orig_dvnode->fid.vid); + *bp++ = htonl(orig_dvnode->fid.vnode); + *bp++ = htonl(orig_dvnode->fid.unique); + *bp++ = htonl(o_namesz); + memcpy(bp, orig_name, o_namesz); + bp = (void *) bp + o_namesz; + if (o_padsz > 0) { + memset(bp, 0, o_padsz); + bp = (void *) bp + o_padsz; + } + + *bp++ = htonl(new_dvnode->fid.vid); + *bp++ = htonl(new_dvnode->fid.vnode); + *bp++ = htonl(new_dvnode->fid.unique); + *bp++ = htonl(n_namesz); + memcpy(bp, new_name, n_namesz); + bp = (void *) bp + n_namesz; + if (n_padsz > 0) { + memset(bp, 0, n_padsz); + bp = (void *) bp + n_padsz; + } + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.StoreData + */ +static int afs_deliver_fs_store_data(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) { + _leave(" = 0 [more]"); + return 0; + } + + if (call->reply_size != call->reply_max) { + _leave(" = -EBADMSG [%u != %u]", + call->reply_size, call->reply_max); + return -EBADMSG; + } + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, + &call->store_version); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + afs_pages_written_back(vnode, call); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.StoreData operation type + */ +static const struct afs_call_type afs_RXFSStoreData = { + .name = "FS.StoreData", + .deliver = afs_deliver_fs_store_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData64 = { + .name = "FS.StoreData64", + .deliver = afs_deliver_fs_store_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * store a set of pages to a very large file + */ +static int afs_fs_store_data64(struct afs_server *server, + struct afs_writeback *wb, + pgoff_t first, pgoff_t last, + unsigned offset, unsigned to, + loff_t size, loff_t pos, loff_t i_size, + const struct afs_wait_mode *wait_mode) +{ + struct afs_vnode *vnode = wb->vnode; + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData64, + (4 + 6 + 3 * 2) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->wb = wb; + call->key = wb->key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->mapping = vnode->vfs_inode.i_mapping; + call->first = first; + call->last = last; + call->first_offset = offset; + call->last_to = to; + call->send_pages = true; + call->store_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA64); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + *bp++ = 0; /* mask */ + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = 0; /* unix mode */ + *bp++ = 0; /* segment size */ + + *bp++ = htonl(pos >> 32); + *bp++ = htonl((u32) pos); + *bp++ = htonl(size >> 32); + *bp++ = htonl((u32) size); + *bp++ = htonl(i_size >> 32); + *bp++ = htonl((u32) i_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * store a set of pages + */ +int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, + pgoff_t first, pgoff_t last, + unsigned offset, unsigned to, + const struct afs_wait_mode *wait_mode) +{ + struct afs_vnode *vnode = wb->vnode; + struct afs_call *call; + loff_t size, pos, i_size; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + + size = to - offset; + if (first != last) + size += (loff_t)(last - first) << PAGE_SHIFT; + pos = (loff_t)first << PAGE_SHIFT; + pos += offset; + + i_size = i_size_read(&vnode->vfs_inode); + if (pos + size > i_size) + i_size = size + pos; + + _debug("size %llx, at %llx, i_size %llx", + (unsigned long long) size, (unsigned long long) pos, + (unsigned long long) i_size); + + if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32) + return afs_fs_store_data64(server, wb, first, last, offset, to, + size, pos, i_size, wait_mode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData, + (4 + 6 + 3) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->wb = wb; + call->key = wb->key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->mapping = vnode->vfs_inode.i_mapping; + call->first = first; + call->last = last; + call->first_offset = offset; + call->last_to = to; + call->send_pages = true; + call->store_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + *bp++ = 0; /* mask */ + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = 0; /* unix mode */ + *bp++ = 0; /* segment size */ + + *bp++ = htonl(pos); + *bp++ = htonl(size); + *bp++ = htonl(i_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.StoreStatus + */ +static int afs_deliver_fs_store_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + afs_dataversion_t *store_version; + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) { + _leave(" = 0 [more]"); + return 0; + } + + if (call->reply_size != call->reply_max) { + _leave(" = -EBADMSG [%u != %u]", + call->reply_size, call->reply_max); + return -EBADMSG; + } + + /* unmarshall the reply once we've received all of it */ + store_version = NULL; + if (call->operation_ID == FSSTOREDATA) + store_version = &call->store_version; + + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.StoreStatus operation type + */ +static const struct afs_call_type afs_RXFSStoreStatus = { + .name = "FS.StoreStatus", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData_as_Status = { + .name = "FS.StoreData", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData64_as_Status = { + .name = "FS.StoreData64", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * set the attributes on a very large file, using FS.StoreData rather than + * FS.StoreStatus so as to alter the file size also + */ +static int afs_fs_setattr_size64(struct afs_server *server, struct key *key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + ASSERT(attr->ia_valid & ATTR_SIZE); + + call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status, + (4 + 6 + 3 * 2) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->store_version = vnode->status.data_version + 1; + call->operation_ID = FSSTOREDATA; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA64); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + *bp++ = 0; /* position of start of write */ + *bp++ = 0; + *bp++ = 0; /* size of write */ + *bp++ = 0; + *bp++ = htonl(attr->ia_size >> 32); /* new file length */ + *bp++ = htonl((u32) attr->ia_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus + * so as to alter the file size also + */ +static int afs_fs_setattr_size(struct afs_server *server, struct key *key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + ASSERT(attr->ia_valid & ATTR_SIZE); + if (attr->ia_size >> 32) + return afs_fs_setattr_size64(server, key, vnode, attr, + wait_mode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status, + (4 + 6 + 3) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->store_version = vnode->status.data_version + 1; + call->operation_ID = FSSTOREDATA; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + *bp++ = 0; /* position of start of write */ + *bp++ = 0; /* size of write */ + *bp++ = htonl(attr->ia_size); /* new file length */ + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * set the attributes on a file, using FS.StoreData if there's a change in file + * size, and FS.StoreStatus otherwise + */ +int afs_fs_setattr(struct afs_server *server, struct key *key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + if (attr->ia_valid & ATTR_SIZE) + return afs_fs_setattr_size(server, key, vnode, attr, + wait_mode); + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSStoreStatus, + (4 + 6) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->operation_ID = FSSTORESTATUS; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTORESTATUS); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.GetVolumeStatus + */ +static int afs_deliver_fs_get_volume_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + const __be32 *bp; + char *p; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* extract the returned status record */ + case 1: + _debug("extract status"); + ret = afs_extract_data(call, skb, last, call->buffer, + 12 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + bp = call->buffer; + xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); + call->offset = 0; + call->unmarshall++; + + /* extract the volume name length */ + case 2: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("volname length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the volume name */ + case 3: + _debug("extract volname"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("volname '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the volume name padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_volname_padding; + } + call->count = 4 - (call->count & 3); + + case 4: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_volname_padding: + + /* extract the offline message length */ + case 5: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("offline msg length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the offline message */ + case 6: + _debug("extract offline"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("offline '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the offline message padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_offline_padding; + } + call->count = 4 - (call->count & 3); + + case 7: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_offline_padding: + + /* extract the message of the day length */ + case 8: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("motd length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the message of the day */ + case 9: + _debug("extract motd"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("motd '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the message of the day padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_motd_padding; + } + call->count = 4 - (call->count & 3); + + case 10: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_motd_padding: + + case 11: + _debug("trailer %d", skb->len); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * destroy an FS.GetVolumeStatus call + */ +static void afs_get_volume_status_call_destructor(struct afs_call *call) +{ + kfree(call->reply3); + call->reply3 = NULL; + afs_flat_call_destructor(call); +} + +/* + * FS.GetVolumeStatus operation type + */ +static const struct afs_call_type afs_RXFSGetVolumeStatus = { + .name = "FS.GetVolumeStatus", + .deliver = afs_deliver_fs_get_volume_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_get_volume_status_call_destructor, +}; + +/* + * fetch the status of a volume + */ +int afs_fs_get_volume_status(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + struct afs_volume_status *vs, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + void *tmpbuf; + + _enter(""); + + tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + + call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4); + if (!call) { + kfree(tmpbuf); + return -ENOMEM; + } + + call->key = key; + call->reply = vnode; + call->reply2 = vs; + call->reply3 = tmpbuf; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSGETVOLUMESTATUS); + bp[1] = htonl(vnode->fid.vid); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock + */ +static int afs_deliver_fs_xxxx_lock(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.SetLock operation type + */ +static const struct afs_call_type afs_RXFSSetLock = { + .name = "FS.SetLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * FS.ExtendLock operation type + */ +static const struct afs_call_type afs_RXFSExtendLock = { + .name = "FS.ExtendLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * FS.ReleaseLock operation type + */ +static const struct afs_call_type afs_RXFSReleaseLock = { + .name = "FS.ReleaseLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * get a lock on a file + */ +int afs_fs_set_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + afs_lock_type_t type, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSETLOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(type); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * extend a lock on a file + */ +int afs_fs_extend_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSEXTENDLOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * release a lock on a file + */ +int afs_fs_release_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSRELEASELOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} diff --git a/kernel/fs/afs/inode.c b/kernel/fs/afs/inode.c new file mode 100644 index 000000000..e06f5a233 --- /dev/null +++ b/kernel/fs/afs/inode.c @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2002 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: David Woodhouse + * David Howells + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +struct afs_iget_data { + struct afs_fid fid; + struct afs_volume *volume; /* volume on which resides */ +}; + +/* + * map the AFS file status to the inode member variables + */ +static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) +{ + struct inode *inode = AFS_VNODE_TO_I(vnode); + + _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", + vnode->status.type, + vnode->status.nlink, + (unsigned long long) vnode->status.size, + vnode->status.data_version, + vnode->status.mode); + + switch (vnode->status.type) { + case AFS_FTYPE_FILE: + inode->i_mode = S_IFREG | vnode->status.mode; + inode->i_op = &afs_file_inode_operations; + inode->i_fop = &afs_file_operations; + break; + case AFS_FTYPE_DIR: + inode->i_mode = S_IFDIR | vnode->status.mode; + inode->i_op = &afs_dir_inode_operations; + inode->i_fop = &afs_dir_file_operations; + break; + case AFS_FTYPE_SYMLINK: + inode->i_mode = S_IFLNK | vnode->status.mode; + inode->i_op = &page_symlink_inode_operations; + break; + default: + printk("kAFS: AFS vnode with undefined type\n"); + return -EBADMSG; + } + +#ifdef CONFIG_AFS_FSCACHE + if (vnode->status.size != inode->i_size) + fscache_attr_changed(vnode->cache); +#endif + + set_nlink(inode, vnode->status.nlink); + inode->i_uid = vnode->status.owner; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_size = vnode->status.size; + inode->i_ctime.tv_sec = vnode->status.mtime_server; + inode->i_ctime.tv_nsec = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_blocks = 0; + inode->i_generation = vnode->fid.unique; + inode->i_version = vnode->status.data_version; + inode->i_mapping->a_ops = &afs_fs_aops; + + /* check to see whether a symbolic link is really a mountpoint */ + if (vnode->status.type == AFS_FTYPE_SYMLINK) { + afs_mntpt_check_symlink(vnode, key); + + if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) { + inode->i_mode = S_IFDIR | vnode->status.mode; + inode->i_op = &afs_mntpt_inode_operations; + inode->i_fop = &afs_mntpt_file_operations; + } + } + + return 0; +} + +/* + * iget5() comparator + */ +static int afs_iget5_test(struct inode *inode, void *opaque) +{ + struct afs_iget_data *data = opaque; + + return inode->i_ino == data->fid.vnode && + inode->i_generation == data->fid.unique; +} + +/* + * iget5() comparator for inode created by autocell operations + * + * These pseudo inodes don't match anything. + */ +static int afs_iget5_autocell_test(struct inode *inode, void *opaque) +{ + return 0; +} + +/* + * iget5() inode initialiser + */ +static int afs_iget5_set(struct inode *inode, void *opaque) +{ + struct afs_iget_data *data = opaque; + struct afs_vnode *vnode = AFS_FS_I(inode); + + inode->i_ino = data->fid.vnode; + inode->i_generation = data->fid.unique; + vnode->fid = data->fid; + vnode->volume = data->volume; + + return 0; +} + +/* + * inode retrieval for autocell + */ +struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, + int namesz, struct key *key) +{ + struct afs_iget_data data; + struct afs_super_info *as; + struct afs_vnode *vnode; + struct super_block *sb; + struct inode *inode; + static atomic_t afs_autocell_ino; + + _enter("{%x:%u},%*.*s,", + AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode, + namesz, namesz, dev_name ?: ""); + + sb = dir->i_sb; + as = sb->s_fs_info; + data.volume = as->volume; + data.fid.vid = as->volume->vid; + data.fid.unique = 0; + data.fid.vnode = 0; + + inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino), + afs_iget5_autocell_test, afs_iget5_set, + &data); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }", + inode, inode->i_ino, data.fid.vid, data.fid.vnode, + data.fid.unique); + + vnode = AFS_FS_I(inode); + + /* there shouldn't be an existing inode */ + BUG_ON(!(inode->i_state & I_NEW)); + + inode->i_size = 0; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_op = &afs_autocell_inode_operations; + set_nlink(inode, 2); + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_ctime.tv_sec = get_seconds(); + inode->i_ctime.tv_nsec = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_blocks = 0; + inode->i_version = 0; + inode->i_generation = 0; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; + unlock_new_inode(inode); + _leave(" = %p", inode); + return inode; +} + +/* + * inode retrieval + */ +struct inode *afs_iget(struct super_block *sb, struct key *key, + struct afs_fid *fid, struct afs_file_status *status, + struct afs_callback *cb) +{ + struct afs_iget_data data = { .fid = *fid }; + struct afs_super_info *as; + struct afs_vnode *vnode; + struct inode *inode; + int ret; + + _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique); + + as = sb->s_fs_info; + data.volume = as->volume; + + inode = iget5_locked(sb, fid->vnode, afs_iget5_test, afs_iget5_set, + &data); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT INODE %p { vl=%x vn=%x, u=%x }", + inode, fid->vid, fid->vnode, fid->unique); + + vnode = AFS_FS_I(inode); + + /* deal with an existing inode */ + if (!(inode->i_state & I_NEW)) { + _leave(" = %p", inode); + return inode; + } + + if (!status) { + /* it's a remotely extant inode */ + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto bad_inode; + } else { + /* it's an inode we just created */ + memcpy(&vnode->status, status, sizeof(vnode->status)); + + if (!cb) { + /* it's a symlink we just created (the fileserver + * didn't give us a callback) */ + vnode->cb_version = 0; + vnode->cb_expiry = 0; + vnode->cb_type = 0; + vnode->cb_expires = get_seconds(); + } else { + vnode->cb_version = cb->version; + vnode->cb_expiry = cb->expiry; + vnode->cb_type = cb->type; + vnode->cb_expires = vnode->cb_expiry + get_seconds(); + } + } + + /* set up caching before mapping the status, as map-status reads the + * first page of symlinks to see if they're really mountpoints */ + inode->i_size = vnode->status.size; +#ifdef CONFIG_AFS_FSCACHE + vnode->cache = fscache_acquire_cookie(vnode->volume->cache, + &afs_vnode_cache_index_def, + vnode, true); +#endif + + ret = afs_inode_map_status(vnode, key); + if (ret < 0) + goto bad_inode; + + /* success */ + clear_bit(AFS_VNODE_UNSET, &vnode->flags); + inode->i_flags |= S_NOATIME; + unlock_new_inode(inode); + _leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type); + return inode; + + /* failure */ +bad_inode: +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); + vnode->cache = NULL; +#endif + iget_failed(inode); + _leave(" = %d [bad]", ret); + return ERR_PTR(ret); +} + +/* + * mark the data attached to an inode as obsolete due to a write on the server + * - might also want to ditch all the outstanding writes and dirty pages + */ +void afs_zap_data(struct afs_vnode *vnode) +{ + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + /* nuke all the non-dirty pages that aren't locked, mapped or being + * written back in a regular file and completely discard the pages in a + * directory or symlink */ + if (S_ISREG(vnode->vfs_inode.i_mode)) + invalidate_remote_inode(&vnode->vfs_inode); + else + invalidate_inode_pages2(vnode->vfs_inode.i_mapping); +} + +/* + * validate a vnode/inode + * - there are several things we need to check + * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, + * symlink) + * - parent dir metadata changed (security changes) + * - dentry data changed (write, truncate) + * - dentry metadata changed (security changes) + */ +int afs_validate(struct afs_vnode *vnode, struct key *key) +{ + int ret; + + _enter("{v={%x:%u} fl=%lx},%x", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, + key_serial(key)); + + if (vnode->cb_promised && + !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) && + !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { + if (vnode->cb_expires < get_seconds() + 10) { + _debug("callback expired"); + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + } else { + goto valid; + } + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto valid; + + mutex_lock(&vnode->validate_lock); + + /* if the promise has expired, we need to check the server again to get + * a new promise - note that if the (parent) directory's metadata was + * changed then the security may be different and we may no longer have + * access */ + if (!vnode->cb_promised || + test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + _debug("not promised"); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error_unlock; + _debug("new promise [fl=%lx]", vnode->flags); + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _debug("file already deleted"); + ret = -ESTALE; + goto error_unlock; + } + + /* if the vnode's data version number changed then its contents are + * different */ + if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + afs_zap_data(vnode); + + clear_bit(AFS_VNODE_MODIFIED, &vnode->flags); + mutex_unlock(&vnode->validate_lock); +valid: + _leave(" = 0"); + return 0; + +error_unlock: + mutex_unlock(&vnode->validate_lock); + _leave(" = %d", ret); + return ret; +} + +/* + * read the attributes of an inode + */ +int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + + inode = d_inode(dentry); + + _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); + + generic_fillattr(inode, stat); + return 0; +} + +/* + * discard an AFS inode + */ +int afs_drop_inode(struct inode *inode) +{ + _enter(""); + + if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) + return generic_delete_inode(inode); + else + return generic_drop_inode(inode); +} + +/* + * clear an AFS inode + */ +void afs_evict_inode(struct inode *inode) +{ + struct afs_permits *permits; + struct afs_vnode *vnode; + + vnode = AFS_FS_I(inode); + + _enter("{%x:%u.%d} v=%u x=%u t=%u }", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + vnode->cb_version, + vnode->cb_expiry, + vnode->cb_type); + + _debug("CLEAR INODE %p", inode); + + ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + afs_give_up_callback(vnode); + + if (vnode->server) { + spin_lock(&vnode->server->fs_lock); + rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes); + spin_unlock(&vnode->server->fs_lock); + afs_put_server(vnode->server); + vnode->server = NULL; + } + + ASSERT(list_empty(&vnode->writebacks)); + ASSERT(!vnode->cb_promised); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); + vnode->cache = NULL; +#endif + + mutex_lock(&vnode->permits_lock); + permits = vnode->permits; + rcu_assign_pointer(vnode->permits, NULL); + mutex_unlock(&vnode->permits_lock); + if (permits) + call_rcu(&permits->rcu, afs_zap_permits); + + _leave(""); +} + +/* + * set the attributes of an inode + */ +int afs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + struct key *key; + int ret; + + _enter("{%x:%u},{n=%pd},%x", + vnode->fid.vid, vnode->fid.vnode, dentry, + attr->ia_valid); + + if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | + ATTR_MTIME))) { + _leave(" = 0 [unsupported]"); + return 0; + } + + /* flush any dirty data outstanding on a regular file */ + if (S_ISREG(vnode->vfs_inode.i_mode)) { + filemap_write_and_wait(vnode->vfs_inode.i_mapping); + afs_writeback_all(vnode); + } + + if (attr->ia_valid & ATTR_FILE) { + key = attr->ia_file->private_data; + } else { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + } + + ret = afs_vnode_setattr(vnode, key, attr); + if (!(attr->ia_valid & ATTR_FILE)) + key_put(key); + +error: + _leave(" = %d", ret); + return ret; +} diff --git a/kernel/fs/afs/internal.h b/kernel/fs/afs/internal.h new file mode 100644 index 000000000..71d598231 --- /dev/null +++ b/kernel/fs/afs/internal.h @@ -0,0 +1,889 @@ +/* internal AFS stuff + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "afs.h" +#include "afs_vl.h" + +#define AFS_CELL_MAX_ADDRS 15 + +struct pagevec; +struct afs_call; + +typedef enum { + AFS_VL_NEW, /* new, uninitialised record */ + AFS_VL_CREATING, /* creating record */ + AFS_VL_VALID, /* record is pending */ + AFS_VL_NO_VOLUME, /* no such volume available */ + AFS_VL_UPDATING, /* update in progress */ + AFS_VL_VOLUME_DELETED, /* volume was deleted */ + AFS_VL_UNCERTAIN, /* uncertain state (update failed) */ +} __attribute__((packed)) afs_vlocation_state_t; + +struct afs_mount_params { + bool rwpath; /* T if the parent should be considered R/W */ + bool force; /* T to force cell type */ + bool autocell; /* T if set auto mount operation */ + afs_voltype_t type; /* type of volume requested */ + int volnamesz; /* size of volume name */ + const char *volname; /* name of volume to mount */ + struct afs_cell *cell; /* cell in which to find volume */ + struct afs_volume *volume; /* volume record */ + struct key *key; /* key to use for secure mounting */ +}; + +/* + * definition of how to wait for the completion of an operation + */ +struct afs_wait_mode { + /* RxRPC received message notification */ + void (*rx_wakeup)(struct afs_call *call); + + /* synchronous call waiter and call dispatched notification */ + int (*wait)(struct afs_call *call); + + /* asynchronous call completion */ + void (*async_complete)(void *reply, int error); +}; + +extern const struct afs_wait_mode afs_sync_call; +extern const struct afs_wait_mode afs_async_call; + +/* + * a record of an in-progress RxRPC call + */ +struct afs_call { + const struct afs_call_type *type; /* type of call */ + const struct afs_wait_mode *wait_mode; /* completion wait mode */ + wait_queue_head_t waitq; /* processes awaiting completion */ + void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ + struct work_struct async_work; /* asynchronous work processor */ + struct work_struct work; /* actual work processor */ + struct sk_buff_head rx_queue; /* received packets */ + struct rxrpc_call *rxcall; /* RxRPC call handle */ + struct key *key; /* security for this call */ + struct afs_server *server; /* server affected by incoming CM call */ + void *request; /* request data (first part) */ + struct address_space *mapping; /* page set */ + struct afs_writeback *wb; /* writeback being performed */ + void *buffer; /* reply receive buffer */ + void *reply; /* reply buffer (first part) */ + void *reply2; /* reply buffer (second part) */ + void *reply3; /* reply buffer (third part) */ + void *reply4; /* reply buffer (fourth part) */ + pgoff_t first; /* first page in mapping to deal with */ + pgoff_t last; /* last page in mapping to deal with */ + enum { /* call state */ + AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ + AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ + AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */ + AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ + AFS_CALL_REPLYING, /* replying to incoming call */ + AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ + AFS_CALL_COMPLETE, /* successfully completed */ + AFS_CALL_BUSY, /* server was busy */ + AFS_CALL_ABORTED, /* call was aborted */ + AFS_CALL_ERROR, /* call failed due to error */ + } state; + int error; /* error code */ + unsigned request_size; /* size of request data */ + unsigned reply_max; /* maximum size of reply */ + unsigned reply_size; /* current size of reply */ + unsigned first_offset; /* offset into mapping[first] */ + unsigned last_to; /* amount of mapping[last] */ + unsigned offset; /* offset into received data store */ + unsigned char unmarshall; /* unmarshalling phase */ + bool incoming; /* T if incoming call */ + bool send_pages; /* T if data from mapping should be sent */ + u16 service_id; /* RxRPC service ID to call */ + __be16 port; /* target UDP port */ + __be32 operation_ID; /* operation ID for an incoming call */ + u32 count; /* count for use in unmarshalling */ + __be32 tmp; /* place to extract temporary data */ + afs_dataversion_t store_version; /* updated version expected from store */ +}; + +struct afs_call_type { + const char *name; + + /* deliver request or reply data to an call + * - returning an error will cause the call to be aborted + */ + int (*deliver)(struct afs_call *call, struct sk_buff *skb, + bool last); + + /* map an abort code to an error number */ + int (*abort_to_error)(u32 abort_code); + + /* clean up a call */ + void (*destructor)(struct afs_call *call); +}; + +/* + * record of an outstanding writeback on a vnode + */ +struct afs_writeback { + struct list_head link; /* link in vnode->writebacks */ + struct work_struct writer; /* work item to perform the writeback */ + struct afs_vnode *vnode; /* vnode to which this write applies */ + struct key *key; /* owner of this write */ + wait_queue_head_t waitq; /* completion and ready wait queue */ + pgoff_t first; /* first page in batch */ + pgoff_t point; /* last page in current store op */ + pgoff_t last; /* last page in batch (inclusive) */ + unsigned offset_first; /* offset into first page of start of write */ + unsigned to_last; /* offset into last page of end of write */ + int num_conflicts; /* count of conflicting writes in list */ + int usage; + bool conflicts; /* T if has dependent conflicts */ + enum { + AFS_WBACK_SYNCING, /* synchronisation being performed */ + AFS_WBACK_PENDING, /* write pending */ + AFS_WBACK_CONFLICTING, /* conflicting writes posted */ + AFS_WBACK_WRITING, /* writing back */ + AFS_WBACK_COMPLETE /* the writeback record has been unlinked */ + } state __attribute__((packed)); +}; + +/* + * AFS superblock private data + * - there's one superblock per volume + */ +struct afs_super_info { + struct afs_volume *volume; /* volume record */ + char rwparent; /* T if parent is R/W AFS volume */ +}; + +static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) +{ + return sb->s_fs_info; +} + +extern struct file_system_type afs_fs_type; + +/* + * entry in the cached cell catalogue + */ +struct afs_cache_cell { + char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */ + struct in_addr vl_servers[15]; /* cached cell VL servers */ +}; + +/* + * AFS cell record + */ +struct afs_cell { + atomic_t usage; + struct list_head link; /* main cell list link */ + struct key *anonymous_key; /* anonymous user key for this cell */ + struct list_head proc_link; /* /proc cell list link */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + + /* server record management */ + rwlock_t servers_lock; /* active server list lock */ + struct list_head servers; /* active server list */ + + /* volume location record management */ + struct rw_semaphore vl_sem; /* volume management serialisation semaphore */ + struct list_head vl_list; /* cell's active VL record list */ + spinlock_t vl_lock; /* vl_list lock */ + unsigned short vl_naddrs; /* number of VL servers in addr list */ + unsigned short vl_curr_svix; /* current server index */ + struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */ + + char name[0]; /* cell name - must go last */ +}; + +/* + * entry in the cached volume location catalogue + */ +struct afs_cache_vlocation { + /* volume name (lowercase, padded with NULs) */ + uint8_t name[AFS_MAXVOLNAME + 1]; + + uint8_t nservers; /* number of entries used in servers[] */ + uint8_t vidmask; /* voltype mask for vid[] */ + uint8_t srvtmask[8]; /* voltype masks for servers[] */ +#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ +#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ +#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ + + afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */ + struct in_addr servers[8]; /* fileserver addresses */ + time_t rtime; /* last retrieval time */ +}; + +/* + * volume -> vnode hash table entry + */ +struct afs_cache_vhash { + afs_voltype_t vtype; /* which volume variation */ + uint8_t hash_bucket; /* which hash bucket this represents */ +} __attribute__((packed)); + +/* + * AFS volume location record + */ +struct afs_vlocation { + atomic_t usage; + time_t time_of_death; /* time at which put reduced usage to 0 */ + struct list_head link; /* link in cell volume location list */ + struct list_head grave; /* link in master graveyard list */ + struct list_head update; /* link in master update list */ + struct afs_cell *cell; /* cell to which volume belongs */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + struct afs_cache_vlocation vldb; /* volume information DB record */ + struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ + wait_queue_head_t waitq; /* status change waitqueue */ + time_t update_at; /* time at which record should be updated */ + spinlock_t lock; /* access lock */ + afs_vlocation_state_t state; /* volume location state */ + unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */ + unsigned short upd_busy_cnt; /* EBUSY count during update */ + bool valid; /* T if valid */ +}; + +/* + * AFS fileserver record + */ +struct afs_server { + atomic_t usage; + time_t time_of_death; /* time at which put reduced usage to 0 */ + struct in_addr addr; /* server address */ + struct afs_cell *cell; /* cell in which server resides */ + struct list_head link; /* link in cell's server list */ + struct list_head grave; /* link in master graveyard list */ + struct rb_node master_rb; /* link in master by-addr tree */ + struct rw_semaphore sem; /* access lock */ + + /* file service access */ + struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */ + unsigned long fs_act_jif; /* time at which last activity occurred */ + unsigned long fs_dead_jif; /* time at which no longer to be considered dead */ + spinlock_t fs_lock; /* access lock */ + int fs_state; /* 0 or reason FS currently marked dead (-errno) */ + + /* callback promise management */ + struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */ + struct delayed_work cb_updater; /* callback updater */ + struct delayed_work cb_break_work; /* collected break dispatcher */ + wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */ + spinlock_t cb_lock; /* access lock */ + struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */ + atomic_t cb_break_n; /* number of pending breaks */ + u8 cb_break_head; /* head of callback breaking ring */ + u8 cb_break_tail; /* tail of callback breaking ring */ +}; + +/* + * AFS volume access record + */ +struct afs_volume { + atomic_t usage; + struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */ + struct afs_vlocation *vlocation; /* volume location */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + afs_volid_t vid; /* volume ID */ + afs_voltype_t type; /* type of volume */ + char type_force; /* force volume type (suppress R/O -> R/W) */ + unsigned short nservers; /* number of server slots filled */ + unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ + struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ + struct rw_semaphore server_sem; /* lock for accessing current server */ + struct backing_dev_info bdi; +}; + +/* + * vnode catalogue entry + */ +struct afs_cache_vnode { + afs_vnodeid_t vnode_id; /* vnode ID */ + unsigned vnode_unique; /* vnode ID uniquifier */ + afs_dataversion_t data_version; /* data version */ +}; + +/* + * AFS inode private data + */ +struct afs_vnode { + struct inode vfs_inode; /* the VFS's inode record */ + + struct afs_volume *volume; /* volume on which vnode resides */ + struct afs_server *server; /* server currently supplying this file */ + struct afs_fid fid; /* the file identifier for this inode */ + struct afs_file_status status; /* AFS status info for this file */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + struct afs_permits *permits; /* cache of permits so far obtained */ + struct mutex permits_lock; /* lock for altering permits list */ + struct mutex validate_lock; /* lock for validating this vnode */ + wait_queue_head_t update_waitq; /* status fetch waitqueue */ + int update_cnt; /* number of outstanding ops that will update the + * status */ + spinlock_t writeback_lock; /* lock for writebacks */ + spinlock_t lock; /* waitqueue/flags lock */ + unsigned long flags; +#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */ +#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ +#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */ +#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ +#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ +#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ +#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ +#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ +#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ +#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ +#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */ +#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */ + + long acl_order; /* ACL check count (callback break count) */ + + struct list_head writebacks; /* alterations in pagecache that need writing */ + struct list_head pending_locks; /* locks waiting to be granted */ + struct list_head granted_locks; /* locks granted on this file */ + struct delayed_work lock_work; /* work to be done in locking */ + struct key *unlock_key; /* key to be used in unlocking */ + + /* outstanding callback notification on this file */ + struct rb_node server_rb; /* link in server->fs_vnodes */ + struct rb_node cb_promise; /* link in server->cb_promises */ + struct work_struct cb_broken_work; /* work to be done on callback break */ + time_t cb_expires; /* time at which callback expires */ + time_t cb_expires_at; /* time used to order cb_promise */ + unsigned cb_version; /* callback version */ + unsigned cb_expiry; /* callback expiry time */ + afs_callback_type_t cb_type; /* type of callback */ + bool cb_promised; /* true if promise still holds */ +}; + +/* + * cached security record for one user's attempt to access a vnode + */ +struct afs_permit { + struct key *key; /* RxRPC ticket holding a security context */ + afs_access_t access_mask; /* access mask for this key */ +}; + +/* + * cache of security records from attempts to access a vnode + */ +struct afs_permits { + struct rcu_head rcu; /* disposal procedure */ + int count; /* number of records */ + struct afs_permit permits[0]; /* the permits so far examined */ +}; + +/* + * record of one of a system's set of network interfaces + */ +struct afs_interface { + struct in_addr address; /* IPv4 address bound to interface */ + struct in_addr netmask; /* netmask applied to address */ + unsigned mtu; /* MTU of interface */ +}; + +/* + * UUID definition [internet draft] + * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns + * increments since midnight 15th October 1582 + * - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID + * time + * - the clock sequence is a 14-bit counter to avoid duplicate times + */ +struct afs_uuid { + u32 time_low; /* low part of timestamp */ + u16 time_mid; /* mid part of timestamp */ + u16 time_hi_and_version; /* high part of timestamp and version */ +#define AFS_UUID_TO_UNIX_TIME 0x01b21dd213814000ULL +#define AFS_UUID_TIMEHI_MASK 0x0fff +#define AFS_UUID_VERSION_TIME 0x1000 /* time-based UUID */ +#define AFS_UUID_VERSION_NAME 0x3000 /* name-based UUID */ +#define AFS_UUID_VERSION_RANDOM 0x4000 /* (pseudo-)random generated UUID */ + u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ +#define AFS_UUID_CLOCKHI_MASK 0x3f +#define AFS_UUID_VARIANT_STD 0x80 + u8 clock_seq_low; /* clock seq low */ + u8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + +/*****************************************************************************/ +/* + * cache.c + */ +#ifdef CONFIG_AFS_FSCACHE +extern struct fscache_netfs afs_cache_netfs; +extern struct fscache_cookie_def afs_cell_cache_index_def; +extern struct fscache_cookie_def afs_vlocation_cache_index_def; +extern struct fscache_cookie_def afs_volume_cache_index_def; +extern struct fscache_cookie_def afs_vnode_cache_index_def; +#else +#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) +#endif + +/* + * callback.c + */ +extern void afs_init_callback_state(struct afs_server *); +extern void afs_broken_callback_work(struct work_struct *); +extern void afs_break_callbacks(struct afs_server *, size_t, + struct afs_callback[]); +extern void afs_discard_callback_on_delete(struct afs_vnode *); +extern void afs_give_up_callback(struct afs_vnode *); +extern void afs_dispatch_give_up_callbacks(struct work_struct *); +extern void afs_flush_callback_breaks(struct afs_server *); +extern int __init afs_callback_update_init(void); +extern void afs_callback_update_kill(void); + +/* + * cell.c + */ +extern struct rw_semaphore afs_proc_cells_sem; +extern struct list_head afs_proc_cells; + +#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) +extern int afs_cell_init(char *); +extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool); +extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool); +extern struct afs_cell *afs_grab_cell(struct afs_cell *); +extern void afs_put_cell(struct afs_cell *); +extern void afs_cell_purge(void); + +/* + * cmservice.c + */ +extern bool afs_cm_incoming_call(struct afs_call *); + +/* + * dir.c + */ +extern const struct inode_operations afs_dir_inode_operations; +extern const struct dentry_operations afs_fs_dentry_operations; +extern const struct file_operations afs_dir_file_operations; + +/* + * file.c + */ +extern const struct address_space_operations afs_fs_aops; +extern const struct inode_operations afs_file_inode_operations; +extern const struct file_operations afs_file_operations; + +extern int afs_open(struct inode *, struct file *); +extern int afs_release(struct inode *, struct file *); +extern int afs_page_filler(void *, struct page *); + +/* + * flock.c + */ +extern void __exit afs_kill_lock_manager(void); +extern void afs_lock_work(struct work_struct *); +extern void afs_lock_may_be_available(struct afs_vnode *); +extern int afs_lock(struct file *, int, struct file_lock *); +extern int afs_flock(struct file *, int, struct file_lock *); + +/* + * fsclient.c + */ +extern int afs_fs_fetch_file_status(struct afs_server *, struct key *, + struct afs_vnode *, struct afs_volsync *, + const struct afs_wait_mode *); +extern int afs_fs_give_up_callbacks(struct afs_server *, + const struct afs_wait_mode *); +extern int afs_fs_fetch_data(struct afs_server *, struct key *, + struct afs_vnode *, off_t, size_t, struct page *, + const struct afs_wait_mode *); +extern int afs_fs_create(struct afs_server *, struct key *, + struct afs_vnode *, const char *, umode_t, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, + const struct afs_wait_mode *); +extern int afs_fs_remove(struct afs_server *, struct key *, + struct afs_vnode *, const char *, bool, + const struct afs_wait_mode *); +extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *, + struct afs_vnode *, const char *, + const struct afs_wait_mode *); +extern int afs_fs_symlink(struct afs_server *, struct key *, + struct afs_vnode *, const char *, const char *, + struct afs_fid *, struct afs_file_status *, + const struct afs_wait_mode *); +extern int afs_fs_rename(struct afs_server *, struct key *, + struct afs_vnode *, const char *, + struct afs_vnode *, const char *, + const struct afs_wait_mode *); +extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *, + pgoff_t, pgoff_t, unsigned, unsigned, + const struct afs_wait_mode *); +extern int afs_fs_setattr(struct afs_server *, struct key *, + struct afs_vnode *, struct iattr *, + const struct afs_wait_mode *); +extern int afs_fs_get_volume_status(struct afs_server *, struct key *, + struct afs_vnode *, + struct afs_volume_status *, + const struct afs_wait_mode *); +extern int afs_fs_set_lock(struct afs_server *, struct key *, + struct afs_vnode *, afs_lock_type_t, + const struct afs_wait_mode *); +extern int afs_fs_extend_lock(struct afs_server *, struct key *, + struct afs_vnode *, + const struct afs_wait_mode *); +extern int afs_fs_release_lock(struct afs_server *, struct key *, + struct afs_vnode *, + const struct afs_wait_mode *); + +/* + * inode.c + */ +extern struct inode *afs_iget_autocell(struct inode *, const char *, int, + struct key *); +extern struct inode *afs_iget(struct super_block *, struct key *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *); +extern void afs_zap_data(struct afs_vnode *); +extern int afs_validate(struct afs_vnode *, struct key *); +extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int afs_setattr(struct dentry *, struct iattr *); +extern void afs_evict_inode(struct inode *); +extern int afs_drop_inode(struct inode *); + +/* + * main.c + */ +extern struct workqueue_struct *afs_wq; +extern struct afs_uuid afs_uuid; + +/* + * misc.c + */ +extern int afs_abort_to_error(u32); + +/* + * mntpt.c + */ +extern const struct inode_operations afs_mntpt_inode_operations; +extern const struct inode_operations afs_autocell_inode_operations; +extern const struct file_operations afs_mntpt_file_operations; + +extern struct vfsmount *afs_d_automount(struct path *); +extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); +extern void afs_mntpt_kill_timer(void); + +/* + * proc.c + */ +extern int afs_proc_init(void); +extern void afs_proc_cleanup(void); +extern int afs_proc_cell_setup(struct afs_cell *); +extern void afs_proc_cell_remove(struct afs_cell *); + +/* + * rxrpc.c + */ +extern int afs_open_socket(void); +extern void afs_close_socket(void); +extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, + const struct afs_wait_mode *); +extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, + size_t, size_t); +extern void afs_flat_call_destructor(struct afs_call *); +extern void afs_transfer_reply(struct afs_call *, struct sk_buff *); +extern void afs_send_empty_reply(struct afs_call *); +extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); +extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, + size_t); + +/* + * security.c + */ +extern void afs_clear_permits(struct afs_vnode *); +extern void afs_cache_permit(struct afs_vnode *, struct key *, long); +extern void afs_zap_permits(struct rcu_head *); +extern struct key *afs_request_key(struct afs_cell *); +extern int afs_permission(struct inode *, int); + +/* + * server.c + */ +extern spinlock_t afs_server_peer_lock; + +#define afs_get_server(S) \ +do { \ + _debug("GET SERVER %d", atomic_read(&(S)->usage)); \ + atomic_inc(&(S)->usage); \ +} while(0) + +extern struct afs_server *afs_lookup_server(struct afs_cell *, + const struct in_addr *); +extern struct afs_server *afs_find_server(const struct in_addr *); +extern void afs_put_server(struct afs_server *); +extern void __exit afs_purge_servers(void); + +/* + * super.c + */ +extern int afs_fs_init(void); +extern void afs_fs_exit(void); + +/* + * use-rtnetlink.c + */ +extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); +extern int afs_get_MAC_address(u8 *, size_t); + +/* + * vlclient.c + */ +extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, + const char *, struct afs_cache_vlocation *, + const struct afs_wait_mode *); +extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *, + afs_volid_t, afs_voltype_t, + struct afs_cache_vlocation *, + const struct afs_wait_mode *); + +/* + * vlocation.c + */ +#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0) + +extern int __init afs_vlocation_update_init(void); +extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *, + struct key *, + const char *, size_t); +extern void afs_put_vlocation(struct afs_vlocation *); +extern void afs_vlocation_purge(void); + +/* + * vnode.c + */ +static inline struct afs_vnode *AFS_FS_I(struct inode *inode) +{ + return container_of(inode, struct afs_vnode, vfs_inode); +} + +static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) +{ + return &vnode->vfs_inode; +} + +extern void afs_vnode_finalise_status_update(struct afs_vnode *, + struct afs_server *); +extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *, + struct key *); +extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *, + off_t, size_t, struct page *); +extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *, + umode_t, struct afs_fid *, struct afs_file_status *, + struct afs_callback *, struct afs_server **); +extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *, + bool); +extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *, + const char *); +extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *, + const char *, struct afs_fid *, + struct afs_file_status *, struct afs_server **); +extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *, + struct key *, const char *, const char *); +extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t, + unsigned, unsigned); +extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *); +extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *, + struct afs_volume_status *); +extern int afs_vnode_set_lock(struct afs_vnode *, struct key *, + afs_lock_type_t); +extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *); +extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); + +/* + * volume.c + */ +#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) + +extern void afs_put_volume(struct afs_volume *); +extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *); +extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *); +extern int afs_volume_release_fileserver(struct afs_vnode *, + struct afs_server *, int); + +/* + * write.c + */ +extern int afs_set_page_dirty(struct page *); +extern void afs_put_writeback(struct afs_writeback *); +extern int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); +extern int afs_writepage(struct page *, struct writeback_control *); +extern int afs_writepages(struct address_space *, struct writeback_control *); +extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); +extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); +extern int afs_writeback_all(struct afs_vnode *); +extern int afs_fsync(struct file *, loff_t, loff_t, int); + + +/*****************************************************************************/ +/* + * debug tracing + */ +extern unsigned afs_debug; + +#define dbgprintk(FMT,...) \ + printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) + +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) +#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) +#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) + +#elif defined(CONFIG_AFS_DEBUG) +#define AFS_DEBUG_KENTER 0x01 +#define AFS_DEBUG_KLEAVE 0x02 +#define AFS_DEBUG_KDEBUG 0x04 + +#define _enter(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \ + kenter(FMT,##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \ + kleave(FMT,##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \ + kdebug(FMT,##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) +#endif + +/* + * debug assertion checking + */ +#if 1 // defined(__KDEBUGALL) + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTRANGE(L, OP1, N, OP2, H) \ +do { \ + if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \ + (unsigned long)(L), (unsigned long)(N), \ + (unsigned long)(H)); \ + printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \ + (unsigned long)(L), (unsigned long)(N), \ + (unsigned long)(H)); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#else + +#define ASSERT(X) \ +do { \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ +} while(0) + +#define ASSERTRANGE(L, OP1, N, OP2, H) \ +do { \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ +} while(0) + +#endif /* __KDEBUGALL */ diff --git a/kernel/fs/afs/main.c b/kernel/fs/afs/main.c new file mode 100644 index 000000000..35de0c047 --- /dev/null +++ b/kernel/fs/afs/main.c @@ -0,0 +1,184 @@ +/* AFS client file system + * + * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("AFS Client File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned afs_debug; +module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "AFS debugging mask"); + +static char *rootcell; + +module_param(rootcell, charp, 0); +MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); + +struct afs_uuid afs_uuid; +struct workqueue_struct *afs_wq; + +/* + * get a client UUID + */ +static int __init afs_get_client_UUID(void) +{ + struct timespec ts; + u64 uuidtime; + u16 clockseq; + int ret; + + /* read the MAC address of one of the external interfaces and construct + * a UUID from it */ + ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node)); + if (ret < 0) + return ret; + + getnstimeofday(&ts); + uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10; + uuidtime += ts.tv_nsec / 100; + uuidtime += AFS_UUID_TO_UNIX_TIME; + afs_uuid.time_low = uuidtime; + afs_uuid.time_mid = uuidtime >> 32; + afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; + afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME; + + get_random_bytes(&clockseq, 2); + afs_uuid.clock_seq_low = clockseq; + afs_uuid.clock_seq_hi_and_reserved = + (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; + afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD; + + _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", + afs_uuid.time_low, + afs_uuid.time_mid, + afs_uuid.time_hi_and_version, + afs_uuid.clock_seq_hi_and_reserved, + afs_uuid.clock_seq_low, + afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2], + afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]); + + return 0; +} + +/* + * initialise the AFS client FS module + */ +static int __init afs_init(void) +{ + int ret; + + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + + ret = afs_get_client_UUID(); + if (ret < 0) + return ret; + + /* create workqueue */ + ret = -ENOMEM; + afs_wq = alloc_workqueue("afs", 0, 0); + if (!afs_wq) + return ret; + + /* register the /proc stuff */ + ret = afs_proc_init(); + if (ret < 0) + goto error_proc; + +#ifdef CONFIG_AFS_FSCACHE + /* we want to be able to cache */ + ret = fscache_register_netfs(&afs_cache_netfs); + if (ret < 0) + goto error_cache; +#endif + + /* initialise the cell DB */ + ret = afs_cell_init(rootcell); + if (ret < 0) + goto error_cell_init; + + /* initialise the VL update process */ + ret = afs_vlocation_update_init(); + if (ret < 0) + goto error_vl_update_init; + + /* initialise the callback update process */ + ret = afs_callback_update_init(); + if (ret < 0) + goto error_callback_update_init; + + /* create the RxRPC transport */ + ret = afs_open_socket(); + if (ret < 0) + goto error_open_socket; + + /* register the filesystems */ + ret = afs_fs_init(); + if (ret < 0) + goto error_fs; + + return ret; + +error_fs: + afs_close_socket(); +error_open_socket: + afs_callback_update_kill(); +error_callback_update_init: + afs_vlocation_purge(); +error_vl_update_init: + afs_cell_purge(); +error_cell_init: +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); +error_cache: +#endif + afs_proc_cleanup(); +error_proc: + destroy_workqueue(afs_wq); + rcu_barrier(); + printk(KERN_ERR "kAFS: failed to register: %d\n", ret); + return ret; +} + +/* XXX late_initcall is kludgy, but the only alternative seems to create + * a transport upon the first mount, which is worse. Or is it? + */ +late_initcall(afs_init); /* must be called after net/ to create socket */ + +/* + * clean up on module removal + */ +static void __exit afs_exit(void) +{ + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); + + afs_fs_exit(); + afs_kill_lock_manager(); + afs_close_socket(); + afs_purge_servers(); + afs_callback_update_kill(); + afs_vlocation_purge(); + destroy_workqueue(afs_wq); + afs_cell_purge(); +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); +#endif + afs_proc_cleanup(); + rcu_barrier(); +} + +module_exit(afs_exit); diff --git a/kernel/fs/afs/misc.c b/kernel/fs/afs/misc.c new file mode 100644 index 000000000..91ea1aa0d --- /dev/null +++ b/kernel/fs/afs/misc.c @@ -0,0 +1,89 @@ +/* miscellaneous bits + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "internal.h" +#include "afs_fs.h" + +/* + * convert an AFS abort code to a Linux error number + */ +int afs_abort_to_error(u32 abort_code) +{ + switch (abort_code) { + /* low errno codes inserted into abort namespace */ + case 13: return -EACCES; + case 27: return -EFBIG; + case 30: return -EROFS; + + /* VICE "special error" codes; 101 - 111 */ + case VSALVAGE: return -EIO; + case VNOVNODE: return -ENOENT; + case VNOVOL: return -ENOMEDIUM; + case VVOLEXISTS: return -EEXIST; + case VNOSERVICE: return -EIO; + case VOFFLINE: return -ENOENT; + case VONLINE: return -EEXIST; + case VDISKFULL: return -ENOSPC; + case VOVERQUOTA: return -EDQUOT; + case VBUSY: return -EBUSY; + case VMOVED: return -ENXIO; + + /* Unified AFS error table; ET "uae" == 0x2f6df00 */ + case 0x2f6df00: return -EPERM; + case 0x2f6df01: return -ENOENT; + case 0x2f6df04: return -EIO; + case 0x2f6df0a: return -EAGAIN; + case 0x2f6df0b: return -ENOMEM; + case 0x2f6df0c: return -EACCES; + case 0x2f6df0f: return -EBUSY; + case 0x2f6df10: return -EEXIST; + case 0x2f6df11: return -EXDEV; + case 0x2f6df12: return -ENODEV; + case 0x2f6df13: return -ENOTDIR; + case 0x2f6df14: return -EISDIR; + case 0x2f6df15: return -EINVAL; + case 0x2f6df1a: return -EFBIG; + case 0x2f6df1b: return -ENOSPC; + case 0x2f6df1d: return -EROFS; + case 0x2f6df1e: return -EMLINK; + case 0x2f6df20: return -EDOM; + case 0x2f6df21: return -ERANGE; + case 0x2f6df22: return -EDEADLK; + case 0x2f6df23: return -ENAMETOOLONG; + case 0x2f6df24: return -ENOLCK; + case 0x2f6df26: return -ENOTEMPTY; + case 0x2f6df28: return -EWOULDBLOCK; + case 0x2f6df69: return -ENOTCONN; + case 0x2f6df6c: return -ETIMEDOUT; + case 0x2f6df78: return -EDQUOT; + + /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */ + case RXKADINCONSISTENCY: return -EPROTO; + case RXKADPACKETSHORT: return -EPROTO; + case RXKADLEVELFAIL: return -EKEYREJECTED; + case RXKADTICKETLEN: return -EKEYREJECTED; + case RXKADOUTOFSEQUENCE: return -EPROTO; + case RXKADNOAUTH: return -EKEYREJECTED; + case RXKADBADKEY: return -EKEYREJECTED; + case RXKADBADTICKET: return -EKEYREJECTED; + case RXKADUNKNOWNKEY: return -EKEYREJECTED; + case RXKADEXPIRED: return -EKEYEXPIRED; + case RXKADSEALEDINCON: return -EKEYREJECTED; + case RXKADDATALEN: return -EKEYREJECTED; + case RXKADILLEGALLEVEL: return -EKEYREJECTED; + + default: return -EREMOTEIO; + } +} diff --git a/kernel/fs/afs/mntpt.c b/kernel/fs/afs/mntpt.c new file mode 100644 index 000000000..ccd0b212e --- /dev/null +++ b/kernel/fs/afs/mntpt.c @@ -0,0 +1,270 @@ +/* mountpoint management + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + + +static struct dentry *afs_mntpt_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags); +static int afs_mntpt_open(struct inode *inode, struct file *file); +static void afs_mntpt_expiry_timed_out(struct work_struct *work); + +const struct file_operations afs_mntpt_file_operations = { + .open = afs_mntpt_open, + .llseek = noop_llseek, +}; + +const struct inode_operations afs_mntpt_inode_operations = { + .lookup = afs_mntpt_lookup, + .readlink = page_readlink, + .getattr = afs_getattr, +}; + +const struct inode_operations afs_autocell_inode_operations = { + .getattr = afs_getattr, +}; + +static LIST_HEAD(afs_vfsmounts); +static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); + +static unsigned long afs_mntpt_expiry_timeout = 10 * 60; + +/* + * check a symbolic link to see whether it actually encodes a mountpoint + * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately + */ +int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) +{ + struct page *page; + size_t size; + char *buf; + int ret; + + _enter("{%x:%u,%u}", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + + /* read the contents of the symlink into the pagecache */ + page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, + afs_page_filler, key); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + + ret = -EIO; + if (PageError(page)) + goto out_free; + + buf = kmap(page); + + /* examine the symlink's contents */ + size = vnode->status.size; + _debug("symlink to %*.*s", (int) size, (int) size, buf); + + if (size > 2 && + (buf[0] == '%' || buf[0] == '#') && + buf[size - 1] == '.' + ) { + _debug("symlink is a mountpoint"); + spin_lock(&vnode->lock); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + vnode->vfs_inode.i_flags |= S_AUTOMOUNT; + spin_unlock(&vnode->lock); + } + + ret = 0; + + kunmap(page); +out_free: + page_cache_release(page); +out: + _leave(" = %d", ret); + return ret; +} + +/* + * no valid lookup procedure on this sort of dir + */ +static struct dentry *afs_mntpt_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + _enter("%p,%p{%pd2}", dir, dentry, dentry); + return ERR_PTR(-EREMOTE); +} + +/* + * no valid open procedure on this sort of dir + */ +static int afs_mntpt_open(struct inode *inode, struct file *file) +{ + _enter("%p,%p{%pD2}", inode, file, file); + return -EREMOTE; +} + +/* + * create a vfsmount to be automounted + */ +static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +{ + struct afs_super_info *super; + struct vfsmount *mnt; + struct afs_vnode *vnode; + struct page *page; + char *devname, *options; + bool rwpath = false; + int ret; + + _enter("{%pd}", mntpt); + + BUG_ON(!d_inode(mntpt)); + + ret = -ENOMEM; + devname = (char *) get_zeroed_page(GFP_KERNEL); + if (!devname) + goto error_no_devname; + + options = (char *) get_zeroed_page(GFP_KERNEL); + if (!options) + goto error_no_options; + + vnode = AFS_FS_I(d_inode(mntpt)); + if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { + /* if the directory is a pseudo directory, use the d_name */ + static const char afs_root_cell[] = ":root.cell."; + unsigned size = mntpt->d_name.len; + + ret = -ENOENT; + if (size < 2 || size > AFS_MAXCELLNAME) + goto error_no_page; + + if (mntpt->d_name.name[0] == '.') { + devname[0] = '#'; + memcpy(devname + 1, mntpt->d_name.name, size - 1); + memcpy(devname + size, afs_root_cell, + sizeof(afs_root_cell)); + rwpath = true; + } else { + devname[0] = '%'; + memcpy(devname + 1, mntpt->d_name.name, size); + memcpy(devname + size + 1, afs_root_cell, + sizeof(afs_root_cell)); + } + } else { + /* read the contents of the AFS special symlink */ + loff_t size = i_size_read(d_inode(mntpt)); + char *buf; + + ret = -EINVAL; + if (size > PAGE_SIZE - 1) + goto error_no_page; + + page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto error_no_page; + } + + ret = -EIO; + if (PageError(page)) + goto error; + + buf = kmap_atomic(page); + memcpy(devname, buf, size); + kunmap_atomic(buf); + page_cache_release(page); + page = NULL; + } + + /* work out what options we want */ + super = AFS_FS_S(mntpt->d_sb); + memcpy(options, "cell=", 5); + strcpy(options + 5, super->volume->cell->name); + if (super->volume->type == AFSVL_RWVOL || rwpath) + strcat(options, ",rwpath"); + + /* try and do the mount */ + _debug("--- attempting mount %s -o %s ---", devname, options); + mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + _debug("--- mount result %p ---", mnt); + + free_page((unsigned long) devname); + free_page((unsigned long) options); + _leave(" = %p", mnt); + return mnt; + +error: + page_cache_release(page); +error_no_page: + free_page((unsigned long) options); +error_no_options: + free_page((unsigned long) devname); +error_no_devname: + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * handle an automount point + */ +struct vfsmount *afs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + _enter("{%pd}", path->dentry); + + newmnt = afs_mntpt_do_automount(path->dentry); + if (IS_ERR(newmnt)) + return newmnt; + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + _leave(" = %p", newmnt); + return newmnt; +} + +/* + * handle mountpoint expiry timer going off + */ +static void afs_mntpt_expiry_timed_out(struct work_struct *work) +{ + _enter(""); + + if (!list_empty(&afs_vfsmounts)) { + mark_mounts_for_expiry(&afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + } + + _leave(""); +} + +/* + * kill the AFS mountpoint timer if it's still running + */ +void afs_mntpt_kill_timer(void) +{ + _enter(""); + + ASSERT(list_empty(&afs_vfsmounts)); + cancel_delayed_work_sync(&afs_mntpt_expiry_timer); +} diff --git a/kernel/fs/afs/netdevices.c b/kernel/fs/afs/netdevices.c new file mode 100644 index 000000000..7ad36506c --- /dev/null +++ b/kernel/fs/afs/netdevices.c @@ -0,0 +1,68 @@ +/* AFS network device helpers + * + * Copyright (c) 2007 Patrick McHardy + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * get a MAC address from a random ethernet interface that has a real one + * - the buffer will normally be 6 bytes in size + */ +int afs_get_MAC_address(u8 *mac, size_t maclen) +{ + struct net_device *dev; + int ret = -ENODEV; + + BUG_ON(maclen != ETH_ALEN); + + rtnl_lock(); + dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); + if (dev) { + memcpy(mac, dev->dev_addr, maclen); + ret = 0; + } + rtnl_unlock(); + return ret; +} + +/* + * get a list of this system's interface IPv4 addresses, netmasks and MTUs + * - maxbufs must be at least 1 + * - returns the number of interface records in the buffer + */ +int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs, + bool wantloopback) +{ + struct net_device *dev; + struct in_device *idev; + int n = 0; + + ASSERT(maxbufs > 0); + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if (dev->type == ARPHRD_LOOPBACK && !wantloopback) + continue; + idev = __in_dev_get_rtnl(dev); + if (!idev) + continue; + for_primary_ifa(idev) { + bufs[n].address.s_addr = ifa->ifa_address; + bufs[n].netmask.s_addr = ifa->ifa_mask; + bufs[n].mtu = dev->mtu; + n++; + if (n >= maxbufs) + goto out; + } endfor_ifa(idev); + } +out: + rtnl_unlock(); + return n; +} diff --git a/kernel/fs/afs/proc.c b/kernel/fs/afs/proc.c new file mode 100644 index 000000000..24a905b07 --- /dev/null +++ b/kernel/fs/afs/proc.c @@ -0,0 +1,666 @@ +/* /proc interface for AFS + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct proc_dir_entry *proc_afs; + + +static int afs_proc_cells_open(struct inode *inode, struct file *file); +static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos); +static void afs_proc_cells_stop(struct seq_file *p, void *v); +static int afs_proc_cells_show(struct seq_file *m, void *v); +static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, + size_t size, loff_t *_pos); + +static const struct seq_operations afs_proc_cells_ops = { + .start = afs_proc_cells_start, + .next = afs_proc_cells_next, + .stop = afs_proc_cells_stop, + .show = afs_proc_cells_show, +}; + +static const struct file_operations afs_proc_cells_fops = { + .open = afs_proc_cells_open, + .read = seq_read, + .write = afs_proc_cells_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, + size_t size, loff_t *_pos); +static ssize_t afs_proc_rootcell_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos); + +static const struct file_operations afs_proc_rootcell_fops = { + .read = afs_proc_rootcell_read, + .write = afs_proc_rootcell_write, + .llseek = no_llseek, +}; + +static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); +static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); +static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_volumes_ops = { + .start = afs_proc_cell_volumes_start, + .next = afs_proc_cell_volumes_next, + .stop = afs_proc_cell_volumes_stop, + .show = afs_proc_cell_volumes_show, +}; + +static const struct file_operations afs_proc_cell_volumes_fops = { + .open = afs_proc_cell_volumes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int afs_proc_cell_vlservers_open(struct inode *inode, + struct file *file); +static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); +static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_vlservers_ops = { + .start = afs_proc_cell_vlservers_start, + .next = afs_proc_cell_vlservers_next, + .stop = afs_proc_cell_vlservers_stop, + .show = afs_proc_cell_vlservers_show, +}; + +static const struct file_operations afs_proc_cell_vlservers_fops = { + .open = afs_proc_cell_vlservers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); +static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); +static int afs_proc_cell_servers_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_servers_ops = { + .start = afs_proc_cell_servers_start, + .next = afs_proc_cell_servers_next, + .stop = afs_proc_cell_servers_stop, + .show = afs_proc_cell_servers_show, +}; + +static const struct file_operations afs_proc_cell_servers_fops = { + .open = afs_proc_cell_servers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * initialise the /proc/fs/afs/ directory + */ +int afs_proc_init(void) +{ + _enter(""); + + proc_afs = proc_mkdir("fs/afs", NULL); + if (!proc_afs) + goto error_dir; + + if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops)) + goto error_tree; + + _leave(" = 0"); + return 0; + +error_tree: + remove_proc_subtree("fs/afs", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/afs/ directory + */ +void afs_proc_cleanup(void) +{ + remove_proc_subtree("fs/afs", NULL); +} + +/* + * open "/proc/fs/afs/cells" which provides a summary of extant cells + */ +static int afs_proc_cells_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &afs_proc_cells_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE_DATA(inode); + + return 0; +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + down_read(&afs_proc_cells_sem); + return seq_list_start_head(&afs_proc_cells, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &afs_proc_cells, pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cells_stop(struct seq_file *p, void *v) +{ + up_read(&afs_proc_cells_sem); +} + +/* + * display a header line followed by a load of cell lines + */ +static int afs_proc_cells_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); + + if (v == &afs_proc_cells) { + /* display header on line 1 */ + seq_puts(m, "USE NAME\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%3d %s\n", + atomic_read(&cell->usage), cell->name); + return 0; +} + +/* + * handle writes to /proc/fs/afs/cells + * - to add cells: echo "add [:][:]" + */ +static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, + size_t size, loff_t *_pos) +{ + char *kbuf, *name, *args; + int ret; + + /* start by dragging the command into memory */ + if (size <= 1 || size >= PAGE_SIZE) + return -EINVAL; + + kbuf = kmalloc(size + 1, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(kbuf, buf, size) != 0) + goto done; + kbuf[size] = 0; + + /* trim to first NL */ + name = memchr(kbuf, '\n', size); + if (name) + *name = 0; + + /* split into command, name and argslist */ + name = strchr(kbuf, ' '); + if (!name) + goto inval; + do { + *name++ = 0; + } while(*name == ' '); + if (!*name) + goto inval; + + args = strchr(name, ' '); + if (!args) + goto inval; + do { + *args++ = 0; + } while(*args == ' '); + if (!*args) + goto inval; + + /* determine command to perform */ + _debug("cmd=%s name=%s args=%s", kbuf, name, args); + + if (strcmp(kbuf, "add") == 0) { + struct afs_cell *cell; + + cell = afs_cell_create(name, strlen(name), args, false); + if (IS_ERR(cell)) { + ret = PTR_ERR(cell); + goto done; + } + + afs_put_cell(cell); + printk("kAFS: Added new cell '%s'\n", name); + } else { + goto inval; + } + + ret = size; + +done: + kfree(kbuf); + _leave(" = %d", ret); + return ret; + +inval: + ret = -EINVAL; + printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n"); + goto done; +} + +static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, + size_t size, loff_t *_pos) +{ + return 0; +} + +/* + * handle writes to /proc/fs/afs/rootcell + * - to initialize rootcell: echo "cell.name:192.168.231.14" + */ +static ssize_t afs_proc_rootcell_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos) +{ + char *kbuf, *s; + int ret; + + /* start by dragging the command into memory */ + if (size <= 1 || size >= PAGE_SIZE) + return -EINVAL; + + ret = -ENOMEM; + kbuf = kmalloc(size + 1, GFP_KERNEL); + if (!kbuf) + goto nomem; + + ret = -EFAULT; + if (copy_from_user(kbuf, buf, size) != 0) + goto infault; + kbuf[size] = 0; + + /* trim to first NL */ + s = memchr(kbuf, '\n', size); + if (s) + *s = 0; + + /* determine command to perform */ + _debug("rootcell=%s", kbuf); + + ret = afs_cell_init(kbuf); + if (ret >= 0) + ret = size; /* consume everything, always */ + +infault: + kfree(kbuf); +nomem: + _leave(" = %d", ret); + return ret; +} + +/* + * initialise /proc/fs/afs// + */ +int afs_proc_cell_setup(struct afs_cell *cell) +{ + struct proc_dir_entry *dir; + + _enter("%p{%s}", cell, cell->name); + + dir = proc_mkdir(cell->name, proc_afs); + if (!dir) + goto error_dir; + + if (!proc_create_data("servers", 0, dir, + &afs_proc_cell_servers_fops, cell) || + !proc_create_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_fops, cell) || + !proc_create_data("volumes", 0, dir, + &afs_proc_cell_volumes_fops, cell)) + goto error_tree; + + _leave(" = 0"); + return 0; + +error_tree: + remove_proc_subtree(cell->name, proc_afs); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * remove /proc/fs/afs// + */ +void afs_proc_cell_remove(struct afs_cell *cell) +{ + _enter(""); + + remove_proc_subtree(cell->name, proc_afs); + + _leave(""); +} + +/* + * open "/proc/fs/afs//volumes" which provides a summary of extant cells + */ +static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE_DATA(inode); + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_volumes_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = cell; + + return 0; +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) +{ + struct afs_cell *cell = m->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + down_read(&cell->vl_sem); + return seq_list_start_head(&cell->vl_list, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + return seq_list_next(v, &cell->vl_list, _pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) +{ + struct afs_cell *cell = p->private; + + up_read(&cell->vl_sem); +} + +static const char afs_vlocation_states[][4] = { + [AFS_VL_NEW] = "New", + [AFS_VL_CREATING] = "Crt", + [AFS_VL_VALID] = "Val", + [AFS_VL_NO_VOLUME] = "NoV", + [AFS_VL_UPDATING] = "Upd", + [AFS_VL_VOLUME_DELETED] = "Del", + [AFS_VL_UNCERTAIN] = "Unc", +}; + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = m->private; + struct afs_vlocation *vlocation = + list_entry(v, struct afs_vlocation, link); + + /* display header on line 1 */ + if (v == &cell->vl_list) { + seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%3d %s %08x %08x %08x %s\n", + atomic_read(&vlocation->usage), + afs_vlocation_states[vlocation->state], + vlocation->vldb.vid[0], + vlocation->vldb.vid[1], + vlocation->vldb.vid[2], + vlocation->vldb.name); + + return 0; +} + +/* + * open "/proc/fs/afs//vlservers" which provides a list of volume + * location server + */ +static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE_DATA(inode); + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_vlservers_ops); + if (ret<0) + return ret; + + m = file->private_data; + m->private = cell; + + return 0; +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) +{ + struct afs_cell *cell = m->private; + loff_t pos = *_pos; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + down_read(&cell->vl_sem); + + /* allow for the header line */ + if (!pos) + return (void *) 1; + pos--; + + if (pos >= cell->vl_naddrs) + return NULL; + + return &cell->vl_addrs[pos]; +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + loff_t pos; + + _enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos); + + pos = *_pos; + (*_pos)++; + if (pos >= cell->vl_naddrs) + return NULL; + + return &cell->vl_addrs[pos]; +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) +{ + struct afs_cell *cell = p->private; + + up_read(&cell->vl_sem); +} + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) +{ + struct in_addr *addr = v; + + /* display header on line 1 */ + if (v == (struct in_addr *) 1) { + seq_puts(m, "ADDRESS\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%pI4\n", &addr->s_addr); + return 0; +} + +/* + * open "/proc/fs/afs//servers" which provides a summary of active + * servers + */ +static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE_DATA(inode); + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_servers_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = cell; + return 0; +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) + __acquires(m->private->servers_lock) +{ + struct afs_cell *cell = m->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + read_lock(&cell->servers_lock); + return seq_list_start_head(&cell->servers, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + return seq_list_next(v, &cell->servers, _pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) + __releases(p->private->servers_lock) +{ + struct afs_cell *cell = p->private; + + read_unlock(&cell->servers_lock); +} + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_servers_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = m->private; + struct afs_server *server = list_entry(v, struct afs_server, link); + char ipaddr[20]; + + /* display header on line 1 */ + if (v == &cell->servers) { + seq_puts(m, "USE ADDR STATE\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + sprintf(ipaddr, "%pI4", &server->addr); + seq_printf(m, "%3d %-15.15s %5d\n", + atomic_read(&server->usage), ipaddr, server->fs_state); + + return 0; +} diff --git a/kernel/fs/afs/rxrpc.c b/kernel/fs/afs/rxrpc.c new file mode 100644 index 000000000..3a57a1b0f --- /dev/null +++ b/kernel/fs/afs/rxrpc.c @@ -0,0 +1,862 @@ +/* Maintain an RxRPC server socket to do AFS communications through + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "internal.h" +#include "afs_cm.h" + +static struct socket *afs_socket; /* my RxRPC socket */ +static struct workqueue_struct *afs_async_calls; +static atomic_t afs_outstanding_calls; +static atomic_t afs_outstanding_skbs; + +static void afs_wake_up_call_waiter(struct afs_call *); +static int afs_wait_for_call_to_complete(struct afs_call *); +static void afs_wake_up_async_call(struct afs_call *); +static int afs_dont_wait_for_call_to_complete(struct afs_call *); +static void afs_process_async_call(struct afs_call *); +static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); +static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); + +/* synchronous call management */ +const struct afs_wait_mode afs_sync_call = { + .rx_wakeup = afs_wake_up_call_waiter, + .wait = afs_wait_for_call_to_complete, +}; + +/* asynchronous call management */ +const struct afs_wait_mode afs_async_call = { + .rx_wakeup = afs_wake_up_async_call, + .wait = afs_dont_wait_for_call_to_complete, +}; + +/* asynchronous incoming call management */ +static const struct afs_wait_mode afs_async_incoming_call = { + .rx_wakeup = afs_wake_up_async_call, +}; + +/* asynchronous incoming call initial processing */ +static const struct afs_call_type afs_RXCMxxxx = { + .name = "CB.xxxx", + .deliver = afs_deliver_cm_op_id, + .abort_to_error = afs_abort_to_error, +}; + +static void afs_collect_incoming_call(struct work_struct *); + +static struct sk_buff_head afs_incoming_calls; +static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); + +static void afs_async_workfn(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + call->async_workfn(call); +} + +/* + * open an RxRPC socket and bind it to be a server for callback notifications + * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT + */ +int afs_open_socket(void) +{ + struct sockaddr_rxrpc srx; + struct socket *socket; + int ret; + + _enter(""); + + skb_queue_head_init(&afs_incoming_calls); + + afs_async_calls = create_singlethread_workqueue("kafsd"); + if (!afs_async_calls) { + _leave(" = -ENOMEM [wq]"); + return -ENOMEM; + } + + ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + if (ret < 0) { + destroy_workqueue(afs_async_calls); + _leave(" = %d [socket]", ret); + return ret; + } + + socket->sk->sk_allocation = GFP_NOFS; + + /* bind the callback manager's address to make this a server socket */ + srx.srx_family = AF_RXRPC; + srx.srx_service = CM_SERVICE; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = htons(AFS_CM_PORT); + memset(&srx.transport.sin.sin_addr, 0, + sizeof(srx.transport.sin.sin_addr)); + + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + if (ret < 0) { + sock_release(socket); + destroy_workqueue(afs_async_calls); + _leave(" = %d [bind]", ret); + return ret; + } + + rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); + + afs_socket = socket; + _leave(" = 0"); + return 0; +} + +/* + * close the RxRPC socket AFS was using + */ +void afs_close_socket(void) +{ + _enter(""); + + sock_release(afs_socket); + + _debug("dework"); + destroy_workqueue(afs_async_calls); + + ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); + ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0); + _leave(""); +} + +/* + * note that the data in a socket buffer is now delivered and that the buffer + * should be freed + */ +static void afs_data_delivered(struct sk_buff *skb) +{ + if (!skb) { + _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); + dump_stack(); + } else { + _debug("DLVR %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + if (atomic_dec_return(&afs_outstanding_skbs) == -1) + BUG(); + rxrpc_kernel_data_delivered(skb); + } +} + +/* + * free a socket buffer + */ +static void afs_free_skb(struct sk_buff *skb) +{ + if (!skb) { + _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); + dump_stack(); + } else { + _debug("FREE %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + if (atomic_dec_return(&afs_outstanding_skbs) == -1) + BUG(); + rxrpc_kernel_free_skb(skb); + } +} + +/* + * free a call + */ +static void afs_free_call(struct afs_call *call) +{ + _debug("DONE %p{%s} [%d]", + call, call->type->name, atomic_read(&afs_outstanding_calls)); + if (atomic_dec_return(&afs_outstanding_calls) == -1) + BUG(); + + ASSERTCMP(call->rxcall, ==, NULL); + ASSERT(!work_pending(&call->async_work)); + ASSERT(skb_queue_empty(&call->rx_queue)); + ASSERT(call->type->name != NULL); + + kfree(call->request); + kfree(call); +} + +/* + * End a call but do not free it + */ +static void afs_end_call_nofree(struct afs_call *call) +{ + if (call->rxcall) { + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); +} + +/* + * End a call and free it + */ +static void afs_end_call(struct afs_call *call) +{ + afs_end_call_nofree(call); + afs_free_call(call); +} + +/* + * allocate a call with flat request and reply buffers + */ +struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, + size_t request_size, size_t reply_size) +{ + struct afs_call *call; + + call = kzalloc(sizeof(*call), GFP_NOFS); + if (!call) + goto nomem_call; + + _debug("CALL %p{%s} [%d]", + call, type->name, atomic_read(&afs_outstanding_calls)); + atomic_inc(&afs_outstanding_calls); + + call->type = type; + call->request_size = request_size; + call->reply_max = reply_size; + + if (request_size) { + call->request = kmalloc(request_size, GFP_NOFS); + if (!call->request) + goto nomem_free; + } + + if (reply_size) { + call->buffer = kmalloc(reply_size, GFP_NOFS); + if (!call->buffer) + goto nomem_free; + } + + init_waitqueue_head(&call->waitq); + skb_queue_head_init(&call->rx_queue); + return call; + +nomem_free: + afs_free_call(call); +nomem_call: + return NULL; +} + +/* + * clean up a call with flat buffer + */ +void afs_flat_call_destructor(struct afs_call *call) +{ + _enter(""); + + kfree(call->request); + call->request = NULL; + kfree(call->buffer); + call->buffer = NULL; +} + +/* + * attach the data from a bunch of pages on an inode to a call + */ +static int afs_send_pages(struct afs_call *call, struct msghdr *msg, + struct kvec *iov) +{ + struct page *pages[8]; + unsigned count, n, loop, offset, to; + pgoff_t first = call->first, last = call->last; + int ret; + + _enter(""); + + offset = call->first_offset; + call->first_offset = 0; + + do { + _debug("attach %lx-%lx", first, last); + + count = last - first + 1; + if (count > ARRAY_SIZE(pages)) + count = ARRAY_SIZE(pages); + n = find_get_pages_contig(call->mapping, first, count, pages); + ASSERTCMP(n, ==, count); + + loop = 0; + do { + msg->msg_flags = 0; + to = PAGE_SIZE; + if (first + loop >= last) + to = call->last_to; + else + msg->msg_flags = MSG_MORE; + iov->iov_base = kmap(pages[loop]) + offset; + iov->iov_len = to - offset; + offset = 0; + + _debug("- range %u-%u%s", + offset, to, msg->msg_flags ? " [more]" : ""); + iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, + iov, 1, to - offset); + + /* have to change the state *before* sending the last + * packet as RxRPC might give us the reply before it + * returns from sending the request */ + if (first + loop >= last) + call->state = AFS_CALL_AWAIT_REPLY; + ret = rxrpc_kernel_send_data(call->rxcall, msg, + to - offset); + kunmap(pages[loop]); + if (ret < 0) + break; + } while (++loop < count); + first += count; + + for (loop = 0; loop < count; loop++) + put_page(pages[loop]); + if (ret < 0) + break; + } while (first <= last); + + _leave(" = %d", ret); + return ret; +} + +/* + * initiate a call + */ +int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, + const struct afs_wait_mode *wait_mode) +{ + struct sockaddr_rxrpc srx; + struct rxrpc_call *rxcall; + struct msghdr msg; + struct kvec iov[1]; + int ret; + struct sk_buff *skb; + + _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); + + ASSERT(call->type != NULL); + ASSERT(call->type->name != NULL); + + _debug("____MAKE %p{%s,%x} [%d]____", + call, call->type->name, key_serial(call->key), + atomic_read(&afs_outstanding_calls)); + + call->wait_mode = wait_mode; + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); + + memset(&srx, 0, sizeof(srx)); + srx.srx_family = AF_RXRPC; + srx.srx_service = call->service_id; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = call->port; + memcpy(&srx.transport.sin.sin_addr, addr, 4); + + /* create a call */ + rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, + (unsigned long) call, gfp); + call->key = NULL; + if (IS_ERR(rxcall)) { + ret = PTR_ERR(rxcall); + goto error_kill_call; + } + + call->rxcall = rxcall; + + /* send the request */ + iov[0].iov_base = call->request; + iov[0].iov_len = call->request_size; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, + call->request_size); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = (call->send_pages ? MSG_MORE : 0); + + /* have to change the state *before* sending the last packet as RxRPC + * might give us the reply before it returns from sending the + * request */ + if (!call->send_pages) + call->state = AFS_CALL_AWAIT_REPLY; + ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + if (ret < 0) + goto error_do_abort; + + if (call->send_pages) { + ret = afs_send_pages(call, &msg, iov); + if (ret < 0) + goto error_do_abort; + } + + /* at this point, an async call may no longer exist as it may have + * already completed */ + return wait_mode->wait(call); + +error_do_abort: + rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); +error_kill_call: + afs_end_call(call); + _leave(" = %d", ret); + return ret; +} + +/* + * handles intercepted messages that were arriving in the socket's Rx queue + * - called with the socket receive queue lock held to ensure message ordering + * - called with softirqs disabled + */ +static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, + struct sk_buff *skb) +{ + struct afs_call *call = (struct afs_call *) user_call_ID; + + _enter("%p,,%u", call, skb->mark); + + _debug("ICPT %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + + ASSERTCMP(sk, ==, afs_socket->sk); + atomic_inc(&afs_outstanding_skbs); + + if (!call) { + /* its an incoming call for our callback service */ + skb_queue_tail(&afs_incoming_calls, skb); + queue_work(afs_wq, &afs_collect_incoming_call_work); + } else { + /* route the messages directly to the appropriate call */ + skb_queue_tail(&call->rx_queue, skb); + call->wait_mode->rx_wakeup(call); + } + + _leave(""); +} + +/* + * deliver messages to a call + */ +static void afs_deliver_to_call(struct afs_call *call) +{ + struct sk_buff *skb; + bool last; + u32 abort_code; + int ret; + + _enter(""); + + while ((call->state == AFS_CALL_AWAIT_REPLY || + call->state == AFS_CALL_AWAIT_OP_ID || + call->state == AFS_CALL_AWAIT_REQUEST || + call->state == AFS_CALL_AWAIT_ACK) && + (skb = skb_dequeue(&call->rx_queue))) { + switch (skb->mark) { + case RXRPC_SKB_MARK_DATA: + _debug("Rcv DATA"); + last = rxrpc_kernel_is_data_last(skb); + ret = call->type->deliver(call, skb, last); + switch (ret) { + case 0: + if (last && + call->state == AFS_CALL_AWAIT_REPLY) + call->state = AFS_CALL_COMPLETE; + break; + case -ENOTCONN: + abort_code = RX_CALL_DEAD; + goto do_abort; + case -ENOTSUPP: + abort_code = RX_INVALID_OPERATION; + goto do_abort; + default: + abort_code = RXGEN_CC_UNMARSHAL; + if (call->state != AFS_CALL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + do_abort: + rxrpc_kernel_abort_call(call->rxcall, + abort_code); + call->error = ret; + call->state = AFS_CALL_ERROR; + break; + } + afs_data_delivered(skb); + skb = NULL; + continue; + case RXRPC_SKB_MARK_FINAL_ACK: + _debug("Rcv ACK"); + call->state = AFS_CALL_COMPLETE; + break; + case RXRPC_SKB_MARK_BUSY: + _debug("Rcv BUSY"); + call->error = -EBUSY; + call->state = AFS_CALL_BUSY; + break; + case RXRPC_SKB_MARK_REMOTE_ABORT: + abort_code = rxrpc_kernel_get_abort_code(skb); + call->error = call->type->abort_to_error(abort_code); + call->state = AFS_CALL_ABORTED; + _debug("Rcv ABORT %u -> %d", abort_code, call->error); + break; + case RXRPC_SKB_MARK_NET_ERROR: + call->error = -rxrpc_kernel_get_error_number(skb); + call->state = AFS_CALL_ERROR; + _debug("Rcv NET ERROR %d", call->error); + break; + case RXRPC_SKB_MARK_LOCAL_ERROR: + call->error = -rxrpc_kernel_get_error_number(skb); + call->state = AFS_CALL_ERROR; + _debug("Rcv LOCAL ERROR %d", call->error); + break; + default: + BUG(); + break; + } + + afs_free_skb(skb); + } + + /* make sure the queue is empty if the call is done with (we might have + * aborted the call early because of an unmarshalling error) */ + if (call->state >= AFS_CALL_COMPLETE) { + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); + if (call->incoming) + afs_end_call(call); + } + + _leave(""); +} + +/* + * wait synchronously for a call to complete + */ +static int afs_wait_for_call_to_complete(struct afs_call *call) +{ + struct sk_buff *skb; + int ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter(""); + + add_wait_queue(&call->waitq, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + /* deliver any messages that are in the queue */ + if (!skb_queue_empty(&call->rx_queue)) { + __set_current_state(TASK_RUNNING); + afs_deliver_to_call(call); + continue; + } + + ret = call->error; + if (call->state >= AFS_CALL_COMPLETE) + break; + ret = -EINTR; + if (signal_pending(current)) + break; + schedule(); + } + + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); + + /* kill the call */ + if (call->state < AFS_CALL_COMPLETE) { + _debug("call incomplete"); + rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); + } + + _debug("call complete"); + afs_end_call(call); + _leave(" = %d", ret); + return ret; +} + +/* + * wake up a waiting call + */ +static void afs_wake_up_call_waiter(struct afs_call *call) +{ + wake_up(&call->waitq); +} + +/* + * wake up an asynchronous call + */ +static void afs_wake_up_async_call(struct afs_call *call) +{ + _enter(""); + queue_work(afs_async_calls, &call->async_work); +} + +/* + * put a call into asynchronous mode + * - mustn't touch the call descriptor as the call my have completed by the + * time we get here + */ +static int afs_dont_wait_for_call_to_complete(struct afs_call *call) +{ + _enter(""); + return -EINPROGRESS; +} + +/* + * delete an asynchronous call + */ +static void afs_delete_async_call(struct afs_call *call) +{ + _enter(""); + + afs_free_call(call); + + _leave(""); +} + +/* + * perform processing on an asynchronous call + * - on a multiple-thread workqueue this work item may try to run on several + * CPUs at the same time + */ +static void afs_process_async_call(struct afs_call *call) +{ + _enter(""); + + if (!skb_queue_empty(&call->rx_queue)) + afs_deliver_to_call(call); + + if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { + if (call->wait_mode->async_complete) + call->wait_mode->async_complete(call->reply, + call->error); + call->reply = NULL; + + /* kill the call */ + afs_end_call_nofree(call); + + /* we can't just delete the call because the work item may be + * queued */ + call->async_workfn = afs_delete_async_call; + queue_work(afs_async_calls, &call->async_work); + } + + _leave(""); +} + +/* + * empty a socket buffer into a flat reply buffer + */ +void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) +{ + size_t len = skb->len; + + if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0) + BUG(); + call->reply_size += len; +} + +/* + * accept the backlog of incoming calls + */ +static void afs_collect_incoming_call(struct work_struct *work) +{ + struct rxrpc_call *rxcall; + struct afs_call *call = NULL; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&afs_incoming_calls))) { + _debug("new call"); + + /* don't need the notification */ + afs_free_skb(skb); + + if (!call) { + call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); + if (!call) { + rxrpc_kernel_reject_call(afs_socket); + return; + } + + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); + call->wait_mode = &afs_async_incoming_call; + call->type = &afs_RXCMxxxx; + init_waitqueue_head(&call->waitq); + skb_queue_head_init(&call->rx_queue); + call->state = AFS_CALL_AWAIT_OP_ID; + + _debug("CALL %p{%s} [%d]", + call, call->type->name, + atomic_read(&afs_outstanding_calls)); + atomic_inc(&afs_outstanding_calls); + } + + rxcall = rxrpc_kernel_accept_call(afs_socket, + (unsigned long) call); + if (!IS_ERR(rxcall)) { + call->rxcall = rxcall; + call = NULL; + } + } + + if (call) + afs_free_call(call); +} + +/* + * grab the operation ID from an incoming cache manager call + */ +static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + size_t len = skb->len; + void *oibuf = (void *) &call->operation_ID; + + _enter("{%u},{%zu},%d", call->offset, len, last); + + ASSERTCMP(call->offset, <, 4); + + /* the operation ID forms the first four bytes of the request data */ + len = min_t(size_t, len, 4 - call->offset); + if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) + BUG(); + if (!pskb_pull(skb, len)) + BUG(); + call->offset += len; + + if (call->offset < 4) { + if (last) { + _leave(" = -EBADMSG [op ID short]"); + return -EBADMSG; + } + _leave(" = 0 [incomplete]"); + return 0; + } + + call->state = AFS_CALL_AWAIT_REQUEST; + + /* ask the cache manager to route the call (it'll change the call type + * if successful) */ + if (!afs_cm_incoming_call(call)) + return -ENOTSUPP; + + /* pass responsibility for the remainer of this message off to the + * cache manager op */ + return call->type->deliver(call, skb, last); +} + +/* + * send an empty reply + */ +void afs_send_empty_reply(struct afs_call *call) +{ + struct msghdr msg; + + _enter(""); + + msg.msg_name = NULL; + msg.msg_namelen = 0; + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + call->state = AFS_CALL_AWAIT_ACK; + switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + case 0: + _leave(" [replied]"); + return; + + case -ENOMEM: + _debug("oom"); + rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + default: + afs_end_call(call); + _leave(" [error]"); + return; + } +} + +/* + * send a simple reply + */ +void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) +{ + struct msghdr msg; + struct kvec iov[1]; + int n; + + _enter(""); + + iov[0].iov_base = (void *) buf; + iov[0].iov_len = len; + msg.msg_name = NULL; + msg.msg_namelen = 0; + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + call->state = AFS_CALL_AWAIT_ACK; + n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + if (n >= 0) { + /* Success */ + _leave(" [replied]"); + return; + } + + if (n == -ENOMEM) { + _debug("oom"); + rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + } + afs_end_call(call); + _leave(" [error]"); +} + +/* + * extract a piece of data from the received data socket buffers + */ +int afs_extract_data(struct afs_call *call, struct sk_buff *skb, + bool last, void *buf, size_t count) +{ + size_t len = skb->len; + + _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + + ASSERTCMP(call->offset, <, count); + + len = min_t(size_t, len, count - call->offset); + if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || + !pskb_pull(skb, len)) + BUG(); + call->offset += len; + + if (call->offset < count) { + if (last) { + _leave(" = -EBADMSG [%d < %zu]", call->offset, count); + return -EBADMSG; + } + _leave(" = -EAGAIN"); + return -EAGAIN; + } + return 0; +} diff --git a/kernel/fs/afs/security.c b/kernel/fs/afs/security.c new file mode 100644 index 000000000..8d010422d --- /dev/null +++ b/kernel/fs/afs/security.c @@ -0,0 +1,363 @@ +/* AFS security handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * get a key + */ +struct key *afs_request_key(struct afs_cell *cell) +{ + struct key *key; + + _enter("{%x}", key_serial(cell->anonymous_key)); + + _debug("key %s", cell->anonymous_key->description); + key = request_key(&key_type_rxrpc, cell->anonymous_key->description, + NULL); + if (IS_ERR(key)) { + if (PTR_ERR(key) != -ENOKEY) { + _leave(" = %ld", PTR_ERR(key)); + return key; + } + + /* act as anonymous user */ + _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); + return key_get(cell->anonymous_key); + } else { + /* act as authorised user */ + _leave(" = {%x} [auth]", key_serial(key)); + return key; + } +} + +/* + * dispose of a permits list + */ +void afs_zap_permits(struct rcu_head *rcu) +{ + struct afs_permits *permits = + container_of(rcu, struct afs_permits, rcu); + int loop; + + _enter("{%d}", permits->count); + + for (loop = permits->count - 1; loop >= 0; loop--) + key_put(permits->permits[loop].key); + kfree(permits); +} + +/* + * dispose of a permits list in which all the key pointers have been copied + */ +static void afs_dispose_of_permits(struct rcu_head *rcu) +{ + struct afs_permits *permits = + container_of(rcu, struct afs_permits, rcu); + + _enter("{%d}", permits->count); + + kfree(permits); +} + +/* + * get the authorising vnode - this is the specified inode itself if it's a + * directory or it's the parent directory if the specified inode is a file or + * symlink + * - the caller must release the ref on the inode + */ +static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode, + struct key *key) +{ + struct afs_vnode *auth_vnode; + struct inode *auth_inode; + + _enter(""); + + if (S_ISDIR(vnode->vfs_inode.i_mode)) { + auth_inode = igrab(&vnode->vfs_inode); + ASSERT(auth_inode != NULL); + } else { + auth_inode = afs_iget(vnode->vfs_inode.i_sb, key, + &vnode->status.parent, NULL, NULL); + if (IS_ERR(auth_inode)) + return ERR_CAST(auth_inode); + } + + auth_vnode = AFS_FS_I(auth_inode); + _leave(" = {%x}", auth_vnode->fid.vnode); + return auth_vnode; +} + +/* + * clear the permit cache on a directory vnode + */ +void afs_clear_permits(struct afs_vnode *vnode) +{ + struct afs_permits *permits; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + mutex_lock(&vnode->permits_lock); + permits = vnode->permits; + rcu_assign_pointer(vnode->permits, NULL); + mutex_unlock(&vnode->permits_lock); + + if (permits) + call_rcu(&permits->rcu, afs_zap_permits); + _leave(""); +} + +/* + * add the result obtained for a vnode to its or its parent directory's cache + * for the key used to access it + */ +void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order) +{ + struct afs_permits *permits, *xpermits; + struct afs_permit *permit; + struct afs_vnode *auth_vnode; + int count, loop; + + _enter("{%x:%u},%x,%lx", + vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order); + + auth_vnode = afs_get_auth_inode(vnode, key); + if (IS_ERR(auth_vnode)) { + _leave(" [get error %ld]", PTR_ERR(auth_vnode)); + return; + } + + mutex_lock(&auth_vnode->permits_lock); + + /* guard against a rename being detected whilst we waited for the + * lock */ + if (memcmp(&auth_vnode->fid, &vnode->status.parent, + sizeof(struct afs_fid)) != 0) { + _debug("renamed"); + goto out_unlock; + } + + /* have to be careful as the directory's callback may be broken between + * us receiving the status we're trying to cache and us getting the + * lock to update the cache for the status */ + if (auth_vnode->acl_order - acl_order > 0) { + _debug("ACL changed?"); + goto out_unlock; + } + + /* always update the anonymous mask */ + _debug("anon access %x", vnode->status.anon_access); + auth_vnode->status.anon_access = vnode->status.anon_access; + if (key == vnode->volume->cell->anonymous_key) + goto out_unlock; + + xpermits = auth_vnode->permits; + count = 0; + if (xpermits) { + /* see if the permit is already in the list + * - if it is then we just amend the list + */ + count = xpermits->count; + permit = xpermits->permits; + for (loop = count; loop > 0; loop--) { + if (permit->key == key) { + permit->access_mask = + vnode->status.caller_access; + goto out_unlock; + } + permit++; + } + } + + permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1), + GFP_NOFS); + if (!permits) + goto out_unlock; + + if (xpermits) + memcpy(permits->permits, xpermits->permits, + count * sizeof(struct afs_permit)); + + _debug("key %x access %x", + key_serial(key), vnode->status.caller_access); + permits->permits[count].access_mask = vnode->status.caller_access; + permits->permits[count].key = key_get(key); + permits->count = count + 1; + + rcu_assign_pointer(auth_vnode->permits, permits); + if (xpermits) + call_rcu(&xpermits->rcu, afs_dispose_of_permits); + +out_unlock: + mutex_unlock(&auth_vnode->permits_lock); + iput(&auth_vnode->vfs_inode); + _leave(""); +} + +/* + * check with the fileserver to see if the directory or parent directory is + * permitted to be accessed with this authorisation, and if so, what access it + * is granted + */ +static int afs_check_permit(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) +{ + struct afs_permits *permits; + struct afs_permit *permit; + struct afs_vnode *auth_vnode; + bool valid; + int loop, ret; + + _enter("{%x:%u},%x", + vnode->fid.vid, vnode->fid.vnode, key_serial(key)); + + auth_vnode = afs_get_auth_inode(vnode, key); + if (IS_ERR(auth_vnode)) { + *_access = 0; + _leave(" = %ld", PTR_ERR(auth_vnode)); + return PTR_ERR(auth_vnode); + } + + ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode)); + + /* check the permits to see if we've got one yet */ + if (key == auth_vnode->volume->cell->anonymous_key) { + _debug("anon"); + *_access = auth_vnode->status.anon_access; + valid = true; + } else { + valid = false; + rcu_read_lock(); + permits = rcu_dereference(auth_vnode->permits); + if (permits) { + permit = permits->permits; + for (loop = permits->count; loop > 0; loop--) { + if (permit->key == key) { + _debug("found in cache"); + *_access = permit->access_mask; + valid = true; + break; + } + permit++; + } + } + rcu_read_unlock(); + } + + if (!valid) { + /* check the status on the file we're actually interested in + * (the post-processing will cache the result on auth_vnode) */ + _debug("no valid permit"); + + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_vnode_fetch_status(vnode, auth_vnode, key); + if (ret < 0) { + iput(&auth_vnode->vfs_inode); + *_access = 0; + _leave(" = %d", ret); + return ret; + } + *_access = vnode->status.caller_access; + } + + iput(&auth_vnode->vfs_inode); + _leave(" = 0 [access %x]", *_access); + return 0; +} + +/* + * check the permissions on an AFS file + * - AFS ACLs are attached to directories only, and a file is controlled by its + * parent directory's ACL + */ +int afs_permission(struct inode *inode, int mask) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + afs_access_t uninitialized_var(access); + struct key *key; + int ret; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + _enter("{{%x:%u},%lx},%x,", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return PTR_ERR(key); + } + + /* if the promise has expired, we need to check the server again */ + if (!vnode->cb_promised) { + _debug("not promised"); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + _debug("new promise [fl=%lx]", vnode->flags); + } + + /* check the permits to see if we've got one yet */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + goto error; + + /* interpret the access mask */ + _debug("REQ %x ACC %x on %s", + mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file"); + + if (S_ISDIR(inode->i_mode)) { + if (mask & MAY_EXEC) { + if (!(access & AFS_ACE_LOOKUP)) + goto permission_denied; + } else if (mask & MAY_READ) { + if (!(access & AFS_ACE_READ)) + goto permission_denied; + } else if (mask & MAY_WRITE) { + if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ + AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */ + AFS_ACE_WRITE))) /* chmod */ + goto permission_denied; + } else { + BUG(); + } + } else { + if (!(access & AFS_ACE_LOOKUP)) + goto permission_denied; + if (mask & (MAY_EXEC | MAY_READ)) { + if (!(access & AFS_ACE_READ)) + goto permission_denied; + } else if (mask & MAY_WRITE) { + if (!(access & AFS_ACE_WRITE)) + goto permission_denied; + } + } + + key_put(key); + ret = generic_permission(inode, mask); + _leave(" = %d", ret); + return ret; + +permission_denied: + ret = -EACCES; +error: + key_put(key); + _leave(" = %d", ret); + return ret; +} diff --git a/kernel/fs/afs/server.c b/kernel/fs/afs/server.c new file mode 100644 index 000000000..f342acf35 --- /dev/null +++ b/kernel/fs/afs/server.c @@ -0,0 +1,322 @@ +/* AFS server record management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "internal.h" + +static unsigned afs_server_timeout = 10; /* server timeout in seconds */ + +static void afs_reap_server(struct work_struct *); + +/* tree of all the servers, indexed by IP address */ +static struct rb_root afs_servers = RB_ROOT; +static DEFINE_RWLOCK(afs_servers_lock); + +/* LRU list of all the servers not currently in use */ +static LIST_HEAD(afs_server_graveyard); +static DEFINE_SPINLOCK(afs_server_graveyard_lock); +static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server); + +/* + * install a server record in the master tree + */ +static int afs_install_server(struct afs_server *server) +{ + struct afs_server *xserver; + struct rb_node **pp, *p; + int ret; + + _enter("%p", server); + + write_lock(&afs_servers_lock); + + ret = -EEXIST; + pp = &afs_servers.rb_node; + p = NULL; + while (*pp) { + p = *pp; + _debug("- consider %p", p); + xserver = rb_entry(p, struct afs_server, master_rb); + if (server->addr.s_addr < xserver->addr.s_addr) + pp = &(*pp)->rb_left; + else if (server->addr.s_addr > xserver->addr.s_addr) + pp = &(*pp)->rb_right; + else + goto error; + } + + rb_link_node(&server->master_rb, p, pp); + rb_insert_color(&server->master_rb, &afs_servers); + ret = 0; + +error: + write_unlock(&afs_servers_lock); + return ret; +} + +/* + * allocate a new server record + */ +static struct afs_server *afs_alloc_server(struct afs_cell *cell, + const struct in_addr *addr) +{ + struct afs_server *server; + + _enter(""); + + server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); + if (server) { + atomic_set(&server->usage, 1); + server->cell = cell; + + INIT_LIST_HEAD(&server->link); + INIT_LIST_HEAD(&server->grave); + init_rwsem(&server->sem); + spin_lock_init(&server->fs_lock); + server->fs_vnodes = RB_ROOT; + server->cb_promises = RB_ROOT; + spin_lock_init(&server->cb_lock); + init_waitqueue_head(&server->cb_break_waitq); + INIT_DELAYED_WORK(&server->cb_break_work, + afs_dispatch_give_up_callbacks); + + memcpy(&server->addr, addr, sizeof(struct in_addr)); + server->addr.s_addr = addr->s_addr; + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + } else { + _leave(" = NULL [nomem]"); + } + return server; +} + +/* + * get an FS-server record for a cell + */ +struct afs_server *afs_lookup_server(struct afs_cell *cell, + const struct in_addr *addr) +{ + struct afs_server *server, *candidate; + + _enter("%p,%pI4", cell, &addr->s_addr); + + /* quick scan of the list to see if we already have the server */ + read_lock(&cell->servers_lock); + + list_for_each_entry(server, &cell->servers, link) { + if (server->addr.s_addr == addr->s_addr) + goto found_server_quickly; + } + read_unlock(&cell->servers_lock); + + candidate = afs_alloc_server(cell, addr); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock(&cell->servers_lock); + + /* check the cell's server list again */ + list_for_each_entry(server, &cell->servers, link) { + if (server->addr.s_addr == addr->s_addr) + goto found_server; + } + + _debug("new"); + server = candidate; + if (afs_install_server(server) < 0) + goto server_in_two_cells; + + afs_get_cell(cell); + list_add_tail(&server->link, &cell->servers); + + write_unlock(&cell->servers_lock); + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + return server; + + /* found a matching server quickly */ +found_server_quickly: + _debug("found quickly"); + afs_get_server(server); + read_unlock(&cell->servers_lock); +no_longer_unused: + if (!list_empty(&server->grave)) { + spin_lock(&afs_server_graveyard_lock); + list_del_init(&server->grave); + spin_unlock(&afs_server_graveyard_lock); + } + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + return server; + + /* found a matching server on the second pass */ +found_server: + _debug("found"); + afs_get_server(server); + write_unlock(&cell->servers_lock); + kfree(candidate); + goto no_longer_unused; + + /* found a server that seems to be in two cells */ +server_in_two_cells: + write_unlock(&cell->servers_lock); + kfree(candidate); + printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n", + addr); + _leave(" = -EEXIST"); + return ERR_PTR(-EEXIST); +} + +/* + * look up a server by its IP address + */ +struct afs_server *afs_find_server(const struct in_addr *_addr) +{ + struct afs_server *server = NULL; + struct rb_node *p; + struct in_addr addr = *_addr; + + _enter("%pI4", &addr.s_addr); + + read_lock(&afs_servers_lock); + + p = afs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, master_rb); + + _debug("- consider %p", p); + + if (addr.s_addr < server->addr.s_addr) { + p = p->rb_left; + } else if (addr.s_addr > server->addr.s_addr) { + p = p->rb_right; + } else { + afs_get_server(server); + goto found; + } + } + + server = NULL; +found: + read_unlock(&afs_servers_lock); + ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr); + _leave(" = %p", server); + return server; +} + +/* + * destroy a server record + * - removes from the cell list + */ +void afs_put_server(struct afs_server *server) +{ + if (!server) + return; + + _enter("%p{%d}", server, atomic_read(&server->usage)); + + _debug("PUT SERVER %d", atomic_read(&server->usage)); + + ASSERTCMP(atomic_read(&server->usage), >, 0); + + if (likely(!atomic_dec_and_test(&server->usage))) { + _leave(""); + return; + } + + afs_flush_callback_breaks(server); + + spin_lock(&afs_server_graveyard_lock); + if (atomic_read(&server->usage) == 0) { + list_move_tail(&server->grave, &afs_server_graveyard); + server->time_of_death = get_seconds(); + queue_delayed_work(afs_wq, &afs_server_reaper, + afs_server_timeout * HZ); + } + spin_unlock(&afs_server_graveyard_lock); + _leave(" [dead]"); +} + +/* + * destroy a dead server + */ +static void afs_destroy_server(struct afs_server *server) +{ + _enter("%p", server); + + ASSERTIF(server->cb_break_head != server->cb_break_tail, + delayed_work_pending(&server->cb_break_work)); + + ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL); + ASSERTCMP(server->cb_promises.rb_node, ==, NULL); + ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail); + ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0); + + afs_put_cell(server->cell); + kfree(server); +} + +/* + * reap dead server records + */ +static void afs_reap_server(struct work_struct *work) +{ + LIST_HEAD(corpses); + struct afs_server *server; + unsigned long delay, expiry; + time_t now; + + now = get_seconds(); + spin_lock(&afs_server_graveyard_lock); + + while (!list_empty(&afs_server_graveyard)) { + server = list_entry(afs_server_graveyard.next, + struct afs_server, grave); + + /* the queue is ordered most dead first */ + expiry = server->time_of_death + afs_server_timeout; + if (expiry > now) { + delay = (expiry - now) * HZ; + mod_delayed_work(afs_wq, &afs_server_reaper, delay); + break; + } + + write_lock(&server->cell->servers_lock); + write_lock(&afs_servers_lock); + if (atomic_read(&server->usage) > 0) { + list_del_init(&server->grave); + } else { + list_move_tail(&server->grave, &corpses); + list_del_init(&server->link); + rb_erase(&server->master_rb, &afs_servers); + } + write_unlock(&afs_servers_lock); + write_unlock(&server->cell->servers_lock); + } + + spin_unlock(&afs_server_graveyard_lock); + + /* now reap the corpses we've extracted */ + while (!list_empty(&corpses)) { + server = list_entry(corpses.next, struct afs_server, grave); + list_del(&server->grave); + afs_destroy_server(server); + } +} + +/* + * discard all the server records for rmmod + */ +void __exit afs_purge_servers(void) +{ + afs_server_timeout = 0; + mod_delayed_work(afs_wq, &afs_server_reaper, 0); +} diff --git a/kernel/fs/afs/super.c b/kernel/fs/afs/super.c new file mode 100644 index 000000000..1fb4a5129 --- /dev/null +++ b/kernel/fs/afs/super.c @@ -0,0 +1,557 @@ +/* AFS superblock handling + * + * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: David Howells + * David Woodhouse + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ + +static void afs_i_init_once(void *foo); +static struct dentry *afs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data); +static void afs_kill_super(struct super_block *sb); +static struct inode *afs_alloc_inode(struct super_block *sb); +static void afs_destroy_inode(struct inode *inode); +static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); + +struct file_system_type afs_fs_type = { + .owner = THIS_MODULE, + .name = "afs", + .mount = afs_mount, + .kill_sb = afs_kill_super, + .fs_flags = 0, +}; +MODULE_ALIAS_FS("afs"); + +static const struct super_operations afs_super_ops = { + .statfs = afs_statfs, + .alloc_inode = afs_alloc_inode, + .drop_inode = afs_drop_inode, + .destroy_inode = afs_destroy_inode, + .evict_inode = afs_evict_inode, + .show_options = generic_show_options, +}; + +static struct kmem_cache *afs_inode_cachep; +static atomic_t afs_count_active_inodes; + +enum { + afs_no_opt, + afs_opt_cell, + afs_opt_rwpath, + afs_opt_vol, + afs_opt_autocell, +}; + +static const match_table_t afs_options_list = { + { afs_opt_cell, "cell=%s" }, + { afs_opt_rwpath, "rwpath" }, + { afs_opt_vol, "vol=%s" }, + { afs_opt_autocell, "autocell" }, + { afs_no_opt, NULL }, +}; + +/* + * initialise the filesystem + */ +int __init afs_fs_init(void) +{ + int ret; + + _enter(""); + + /* create ourselves an inode cache */ + atomic_set(&afs_count_active_inodes, 0); + + ret = -ENOMEM; + afs_inode_cachep = kmem_cache_create("afs_inode_cache", + sizeof(struct afs_vnode), + 0, + SLAB_HWCACHE_ALIGN, + afs_i_init_once); + if (!afs_inode_cachep) { + printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); + return ret; + } + + /* now export our filesystem to lesser mortals */ + ret = register_filesystem(&afs_fs_type); + if (ret < 0) { + kmem_cache_destroy(afs_inode_cachep); + _leave(" = %d", ret); + return ret; + } + + _leave(" = 0"); + return 0; +} + +/* + * clean up the filesystem + */ +void __exit afs_fs_exit(void) +{ + _enter(""); + + afs_mntpt_kill_timer(); + unregister_filesystem(&afs_fs_type); + + if (atomic_read(&afs_count_active_inodes) != 0) { + printk("kAFS: %d active inode objects still present\n", + atomic_read(&afs_count_active_inodes)); + BUG(); + } + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(afs_inode_cachep); + _leave(""); +} + +/* + * parse the mount options + * - this function has been shamelessly adapted from the ext3 fs which + * shamelessly adapted it from the msdos fs + */ +static int afs_parse_options(struct afs_mount_params *params, + char *options, const char **devname) +{ + struct afs_cell *cell; + substring_t args[MAX_OPT_ARGS]; + char *p; + int token; + + _enter("%s", options); + + options[PAGE_SIZE - 1] = 0; + + while ((p = strsep(&options, ","))) { + if (!*p) + continue; + + token = match_token(p, afs_options_list, args); + switch (token) { + case afs_opt_cell: + cell = afs_cell_lookup(args[0].from, + args[0].to - args[0].from, + false); + if (IS_ERR(cell)) + return PTR_ERR(cell); + afs_put_cell(params->cell); + params->cell = cell; + break; + + case afs_opt_rwpath: + params->rwpath = 1; + break; + + case afs_opt_vol: + *devname = args[0].from; + break; + + case afs_opt_autocell: + params->autocell = 1; + break; + + default: + printk(KERN_ERR "kAFS:" + " Unknown or invalid mount option: '%s'\n", p); + return -EINVAL; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * parse a device name to get cell name, volume name, volume type and R/W + * selector + * - this can be one of the following: + * "%[cell:]volume[.]" R/W volume + * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0), + * or R/W (rwpath=1) volume + * "%[cell:]volume.readonly" R/O volume + * "#[cell:]volume.readonly" R/O volume + * "%[cell:]volume.backup" Backup volume + * "#[cell:]volume.backup" Backup volume + */ +static int afs_parse_device_name(struct afs_mount_params *params, + const char *name) +{ + struct afs_cell *cell; + const char *cellname, *suffix; + int cellnamesz; + + _enter(",%s", name); + + if (!name) { + printk(KERN_ERR "kAFS: no volume name specified\n"); + return -EINVAL; + } + + if ((name[0] != '%' && name[0] != '#') || !name[1]) { + printk(KERN_ERR "kAFS: unparsable volume name\n"); + return -EINVAL; + } + + /* determine the type of volume we're looking for */ + params->type = AFSVL_ROVOL; + params->force = false; + if (params->rwpath || name[0] == '%') { + params->type = AFSVL_RWVOL; + params->force = true; + } + name++; + + /* split the cell name out if there is one */ + params->volname = strchr(name, ':'); + if (params->volname) { + cellname = name; + cellnamesz = params->volname - name; + params->volname++; + } else { + params->volname = name; + cellname = NULL; + cellnamesz = 0; + } + + /* the volume type is further affected by a possible suffix */ + suffix = strrchr(params->volname, '.'); + if (suffix) { + if (strcmp(suffix, ".readonly") == 0) { + params->type = AFSVL_ROVOL; + params->force = true; + } else if (strcmp(suffix, ".backup") == 0) { + params->type = AFSVL_BACKVOL; + params->force = true; + } else if (suffix[1] == 0) { + } else { + suffix = NULL; + } + } + + params->volnamesz = suffix ? + suffix - params->volname : strlen(params->volname); + + _debug("cell %*.*s [%p]", + cellnamesz, cellnamesz, cellname ?: "", params->cell); + + /* lookup the cell record */ + if (cellname || !params->cell) { + cell = afs_cell_lookup(cellname, cellnamesz, true); + if (IS_ERR(cell)) { + printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n", + cellnamesz, cellnamesz, cellname ?: ""); + return PTR_ERR(cell); + } + afs_put_cell(params->cell); + params->cell = cell; + } + + _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s", + params->cell->name, params->cell, + params->volnamesz, params->volnamesz, params->volname, + suffix ?: "-", params->type, params->force ? " FORCE" : ""); + + return 0; +} + +/* + * check a superblock to see if it's the one we're looking for + */ +static int afs_test_super(struct super_block *sb, void *data) +{ + struct afs_super_info *as1 = data; + struct afs_super_info *as = sb->s_fs_info; + + return as->volume == as1->volume; +} + +static int afs_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +/* + * fill in the superblock + */ +static int afs_fill_super(struct super_block *sb, + struct afs_mount_params *params) +{ + struct afs_super_info *as = sb->s_fs_info; + struct afs_fid fid; + struct inode *inode = NULL; + int ret; + + _enter(""); + + /* fill in the superblock */ + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = AFS_FS_MAGIC; + sb->s_op = &afs_super_ops; + sb->s_bdi = &as->volume->bdi; + strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); + + /* allocate the root inode and dentry */ + fid.vid = as->volume->vid; + fid.vnode = 1; + fid.unique = 1; + inode = afs_iget(sb, params->key, &fid, NULL, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (params->autocell) + set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); + + ret = -ENOMEM; + sb->s_root = d_make_root(inode); + if (!sb->s_root) + goto error; + + sb->s_d_op = &afs_fs_dentry_operations; + + _leave(" = 0"); + return 0; + +error: + _leave(" = %d", ret); + return ret; +} + +/* + * get an AFS superblock + */ +static struct dentry *afs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *options) +{ + struct afs_mount_params params; + struct super_block *sb; + struct afs_volume *vol; + struct key *key; + char *new_opts = kstrdup(options, GFP_KERNEL); + struct afs_super_info *as; + int ret; + + _enter(",,%s,%p", dev_name, options); + + memset(¶ms, 0, sizeof(params)); + + ret = -EINVAL; + if (current->nsproxy->net_ns != &init_net) + goto error; + + /* parse the options and device name */ + if (options) { + ret = afs_parse_options(¶ms, options, &dev_name); + if (ret < 0) + goto error; + } + + ret = afs_parse_device_name(¶ms, dev_name); + if (ret < 0) + goto error; + + /* try and do the mount securely */ + key = afs_request_key(params.cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + ret = PTR_ERR(key); + goto error; + } + params.key = key; + + /* parse the device name */ + vol = afs_volume_lookup(¶ms); + if (IS_ERR(vol)) { + ret = PTR_ERR(vol); + goto error; + } + + /* allocate a superblock info record */ + as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); + if (!as) { + ret = -ENOMEM; + afs_put_volume(vol); + goto error; + } + as->volume = vol; + + /* allocate a deviceless superblock */ + sb = sget(fs_type, afs_test_super, afs_set_super, flags, as); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + afs_put_volume(vol); + kfree(as); + goto error; + } + + if (!sb->s_root) { + /* initial superblock/root creation */ + _debug("create"); + ret = afs_fill_super(sb, ¶ms); + if (ret < 0) { + deactivate_locked_super(sb); + goto error; + } + save_mount_options(sb, new_opts); + sb->s_flags |= MS_ACTIVE; + } else { + _debug("reuse"); + ASSERTCMP(sb->s_flags, &, MS_ACTIVE); + afs_put_volume(vol); + kfree(as); + } + + afs_put_cell(params.cell); + kfree(new_opts); + _leave(" = 0 [%p]", sb); + return dget(sb->s_root); + +error: + afs_put_cell(params.cell); + key_put(params.key); + kfree(new_opts); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +static void afs_kill_super(struct super_block *sb) +{ + struct afs_super_info *as = sb->s_fs_info; + kill_anon_super(sb); + afs_put_volume(as->volume); + kfree(as); +} + +/* + * initialise an inode cache slab element prior to any use + */ +static void afs_i_init_once(void *_vnode) +{ + struct afs_vnode *vnode = _vnode; + + memset(vnode, 0, sizeof(*vnode)); + inode_init_once(&vnode->vfs_inode); + init_waitqueue_head(&vnode->update_waitq); + mutex_init(&vnode->permits_lock); + mutex_init(&vnode->validate_lock); + spin_lock_init(&vnode->writeback_lock); + spin_lock_init(&vnode->lock); + INIT_LIST_HEAD(&vnode->writebacks); + INIT_LIST_HEAD(&vnode->pending_locks); + INIT_LIST_HEAD(&vnode->granted_locks); + INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work); + INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work); +} + +/* + * allocate an AFS inode struct from our slab cache + */ +static struct inode *afs_alloc_inode(struct super_block *sb) +{ + struct afs_vnode *vnode; + + vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL); + if (!vnode) + return NULL; + + atomic_inc(&afs_count_active_inodes); + + memset(&vnode->fid, 0, sizeof(vnode->fid)); + memset(&vnode->status, 0, sizeof(vnode->status)); + + vnode->volume = NULL; + vnode->update_cnt = 0; + vnode->flags = 1 << AFS_VNODE_UNSET; + vnode->cb_promised = false; + + _leave(" = %p", &vnode->vfs_inode); + return &vnode->vfs_inode; +} + +static void afs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct afs_vnode *vnode = AFS_FS_I(inode); + kmem_cache_free(afs_inode_cachep, vnode); +} + +/* + * destroy an AFS inode struct + */ +static void afs_destroy_inode(struct inode *inode) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + + _enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode); + + _debug("DESTROY INODE %p", inode); + + ASSERTCMP(vnode->server, ==, NULL); + + call_rcu(&inode->i_rcu, afs_i_callback); + atomic_dec(&afs_count_active_inodes); +} + +/* + * return information about an AFS volume + */ +static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct afs_volume_status vs; + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + struct key *key; + int ret; + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + + ret = afs_vnode_get_volume_status(vnode, key, &vs); + key_put(key); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = AFS_BLOCK_SIZE; + buf->f_namelen = AFSNAMEMAX - 1; + + if (vs.max_quota == 0) + buf->f_blocks = vs.part_max_blocks; + else + buf->f_blocks = vs.max_quota; + buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use; + return 0; +} diff --git a/kernel/fs/afs/vlclient.c b/kernel/fs/afs/vlclient.c new file mode 100644 index 000000000..340afd0cd --- /dev/null +++ b/kernel/fs/afs/vlclient.c @@ -0,0 +1,219 @@ +/* AFS Volume Location Service client + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +/* + * map volume locator abort codes to error codes + */ +static int afs_vl_abort_to_error(u32 abort_code) +{ + _enter("%u", abort_code); + + switch (abort_code) { + case AFSVL_IDEXIST: return -EEXIST; + case AFSVL_IO: return -EREMOTEIO; + case AFSVL_NAMEEXIST: return -EEXIST; + case AFSVL_CREATEFAIL: return -EREMOTEIO; + case AFSVL_NOENT: return -ENOMEDIUM; + case AFSVL_EMPTY: return -ENOMEDIUM; + case AFSVL_ENTDELETED: return -ENOMEDIUM; + case AFSVL_BADNAME: return -EINVAL; + case AFSVL_BADINDEX: return -EINVAL; + case AFSVL_BADVOLTYPE: return -EINVAL; + case AFSVL_BADSERVER: return -EINVAL; + case AFSVL_BADPARTITION: return -EINVAL; + case AFSVL_REPSFULL: return -EFBIG; + case AFSVL_NOREPSERVER: return -ENOENT; + case AFSVL_DUPREPSERVER: return -EEXIST; + case AFSVL_RWNOTFOUND: return -ENOENT; + case AFSVL_BADREFCOUNT: return -EINVAL; + case AFSVL_SIZEEXCEEDED: return -EINVAL; + case AFSVL_BADENTRY: return -EINVAL; + case AFSVL_BADVOLIDBUMP: return -EINVAL; + case AFSVL_IDALREADYHASHED: return -EINVAL; + case AFSVL_ENTRYLOCKED: return -EBUSY; + case AFSVL_BADVOLOPER: return -EBADRQC; + case AFSVL_BADRELLOCKTYPE: return -EINVAL; + case AFSVL_RERELEASE: return -EREMOTEIO; + case AFSVL_BADSERVERFLAG: return -EINVAL; + case AFSVL_PERM: return -EACCES; + case AFSVL_NOMEM: return -EREMOTEIO; + default: + return afs_abort_to_error(abort_code); + } +} + +/* + * deliver reply data to a VL.GetEntryByXXX call + */ +static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_cache_vlocation *entry; + __be32 *bp; + u32 tmp; + int loop; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + entry = call->reply; + bp = call->buffer; + + for (loop = 0; loop < 64; loop++) + entry->name[loop] = ntohl(*bp++); + entry->name[loop] = 0; + bp++; /* final NUL */ + + bp++; /* type */ + entry->nservers = ntohl(*bp++); + + for (loop = 0; loop < 8; loop++) + entry->servers[loop].s_addr = *bp++; + + bp += 8; /* partition IDs */ + + for (loop = 0; loop < 8; loop++) { + tmp = ntohl(*bp++); + entry->srvtmask[loop] = 0; + if (tmp & AFS_VLSF_RWVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_RW; + if (tmp & AFS_VLSF_ROVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_RO; + if (tmp & AFS_VLSF_BACKVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_BAK; + } + + entry->vid[0] = ntohl(*bp++); + entry->vid[1] = ntohl(*bp++); + entry->vid[2] = ntohl(*bp++); + + bp++; /* clone ID */ + + tmp = ntohl(*bp++); /* flags */ + entry->vidmask = 0; + if (tmp & AFS_VLF_RWEXISTS) + entry->vidmask |= AFS_VOL_VTM_RW; + if (tmp & AFS_VLF_ROEXISTS) + entry->vidmask |= AFS_VOL_VTM_RO; + if (tmp & AFS_VLF_BACKEXISTS) + entry->vidmask |= AFS_VOL_VTM_BAK; + if (!entry->vidmask) + return -EBADMSG; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * VL.GetEntryByName operation type + */ +static const struct afs_call_type afs_RXVLGetEntryByName = { + .name = "VL.GetEntryByName", + .deliver = afs_deliver_vl_get_entry_by_xxx, + .abort_to_error = afs_vl_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * VL.GetEntryById operation type + */ +static const struct afs_call_type afs_RXVLGetEntryById = { + .name = "VL.GetEntryById", + .deliver = afs_deliver_vl_get_entry_by_xxx, + .abort_to_error = afs_vl_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * dispatch a get volume entry by name operation + */ +int afs_vl_get_entry_by_name(struct in_addr *addr, + struct key *key, + const char *volname, + struct afs_cache_vlocation *entry, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t volnamesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + volnamesz = strlen(volname); + padsz = (4 - (volnamesz & 3)) & 3; + reqsz = 8 + volnamesz + padsz; + + call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = entry; + call->service_id = VL_SERVICE; + call->port = htons(AFS_VL_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETENTRYBYNAME); + *bp++ = htonl(volnamesz); + memcpy(bp, volname, volnamesz); + if (padsz > 0) + memset((void *) bp + volnamesz, 0, padsz); + + /* initiate the call */ + return afs_make_call(addr, call, GFP_KERNEL, wait_mode); +} + +/* + * dispatch a get volume entry by ID operation + */ +int afs_vl_get_entry_by_id(struct in_addr *addr, + struct key *key, + afs_volid_t volid, + afs_voltype_t voltype, + struct afs_cache_vlocation *entry, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = entry; + call->service_id = VL_SERVICE; + call->port = htons(AFS_VL_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETENTRYBYID); + *bp++ = htonl(volid); + *bp = htonl(voltype); + + /* initiate the call */ + return afs_make_call(addr, call, GFP_KERNEL, wait_mode); +} diff --git a/kernel/fs/afs/vlocation.c b/kernel/fs/afs/vlocation.c new file mode 100644 index 000000000..52976785a --- /dev/null +++ b/kernel/fs/afs/vlocation.c @@ -0,0 +1,718 @@ +/* AFS volume location management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +static unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */ +static unsigned afs_vlocation_update_timeout = 10 * 60; + +static void afs_vlocation_reaper(struct work_struct *); +static void afs_vlocation_updater(struct work_struct *); + +static LIST_HEAD(afs_vlocation_updates); +static LIST_HEAD(afs_vlocation_graveyard); +static DEFINE_SPINLOCK(afs_vlocation_updates_lock); +static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock); +static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper); +static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater); +static struct workqueue_struct *afs_vlocation_update_worker; + +/* + * iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + */ +static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, + struct key *key, + struct afs_cache_vlocation *vldb) +{ + struct afs_cell *cell = vl->cell; + struct in_addr addr; + int count, ret; + + _enter("%s,%s", cell->name, vl->vldb.name); + + down_write(&vl->cell->vl_sem); + ret = -ENOMEDIUM; + for (count = cell->vl_naddrs; count > 0; count--) { + addr = cell->vl_addrs[cell->vl_curr_svix]; + + _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); + + /* attempt to access the VL server */ + ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb, + &afs_sync_call); + switch (ret) { + case 0: + goto out; + case -ENOMEM: + case -ENONET: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + if (ret == -ENOMEM || ret == -ENONET) + goto out; + goto rotate; + case -ENOMEDIUM: + case -EKEYREJECTED: + case -EKEYEXPIRED: + goto out; + default: + ret = -EIO; + goto rotate; + } + + /* rotate the server records upon lookup failure */ + rotate: + cell->vl_curr_svix++; + cell->vl_curr_svix %= cell->vl_naddrs; + } + +out: + up_write(&vl->cell->vl_sem); + _leave(" = %d", ret); + return ret; +} + +/* + * iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + */ +static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, + struct key *key, + afs_volid_t volid, + afs_voltype_t voltype, + struct afs_cache_vlocation *vldb) +{ + struct afs_cell *cell = vl->cell; + struct in_addr addr; + int count, ret; + + _enter("%s,%x,%d,", cell->name, volid, voltype); + + down_write(&vl->cell->vl_sem); + ret = -ENOMEDIUM; + for (count = cell->vl_naddrs; count > 0; count--) { + addr = cell->vl_addrs[cell->vl_curr_svix]; + + _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); + + /* attempt to access the VL server */ + ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb, + &afs_sync_call); + switch (ret) { + case 0: + goto out; + case -ENOMEM: + case -ENONET: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + if (ret == -ENOMEM || ret == -ENONET) + goto out; + goto rotate; + case -EBUSY: + vl->upd_busy_cnt++; + if (vl->upd_busy_cnt <= 3) { + if (vl->upd_busy_cnt > 1) { + /* second+ BUSY - sleep a little bit */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + } + continue; + } + break; + case -ENOMEDIUM: + vl->upd_rej_cnt++; + goto rotate; + default: + ret = -EIO; + goto rotate; + } + + /* rotate the server records upon lookup failure */ + rotate: + cell->vl_curr_svix++; + cell->vl_curr_svix %= cell->vl_naddrs; + vl->upd_busy_cnt = 0; + } + +out: + if (ret < 0 && vl->upd_rej_cnt > 0) { + printk(KERN_NOTICE "kAFS:" + " Active volume no longer valid '%s'\n", + vl->vldb.name); + vl->valid = 0; + ret = -ENOMEDIUM; + } + + up_write(&vl->cell->vl_sem); + _leave(" = %d", ret); + return ret; +} + +/* + * allocate a volume location record + */ +static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell, + const char *name, + size_t namesz) +{ + struct afs_vlocation *vl; + + vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL); + if (vl) { + vl->cell = cell; + vl->state = AFS_VL_NEW; + atomic_set(&vl->usage, 1); + INIT_LIST_HEAD(&vl->link); + INIT_LIST_HEAD(&vl->grave); + INIT_LIST_HEAD(&vl->update); + init_waitqueue_head(&vl->waitq); + spin_lock_init(&vl->lock); + memcpy(vl->vldb.name, name, namesz); + } + + _leave(" = %p", vl); + return vl; +} + +/* + * update record if we found it in the cache + */ +static int afs_vlocation_update_record(struct afs_vlocation *vl, + struct key *key, + struct afs_cache_vlocation *vldb) +{ + afs_voltype_t voltype; + afs_volid_t vid; + int ret; + + /* try to look up a cached volume in the cell VL databases by ID */ + _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", + vl->vldb.name, + vl->vldb.vidmask, + ntohl(vl->vldb.servers[0].s_addr), + vl->vldb.srvtmask[0], + ntohl(vl->vldb.servers[1].s_addr), + vl->vldb.srvtmask[1], + ntohl(vl->vldb.servers[2].s_addr), + vl->vldb.srvtmask[2]); + + _debug("Vids: %08x %08x %08x", + vl->vldb.vid[0], + vl->vldb.vid[1], + vl->vldb.vid[2]); + + if (vl->vldb.vidmask & AFS_VOL_VTM_RW) { + vid = vl->vldb.vid[0]; + voltype = AFSVL_RWVOL; + } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) { + vid = vl->vldb.vid[1]; + voltype = AFSVL_ROVOL; + } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) { + vid = vl->vldb.vid[2]; + voltype = AFSVL_BACKVOL; + } else { + BUG(); + vid = 0; + voltype = 0; + } + + /* contact the server to make sure the volume is still available + * - TODO: need to handle disconnected operation here + */ + ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb); + switch (ret) { + /* net error */ + default: + printk(KERN_WARNING "kAFS:" + " failed to update volume '%s' (%x) up in '%s': %d\n", + vl->vldb.name, vid, vl->cell->name, ret); + _leave(" = %d", ret); + return ret; + + /* pulled from local cache into memory */ + case 0: + _leave(" = 0"); + return 0; + + /* uh oh... looks like the volume got deleted */ + case -ENOMEDIUM: + printk(KERN_ERR "kAFS:" + " volume '%s' (%x) does not exist '%s'\n", + vl->vldb.name, vid, vl->cell->name); + + /* TODO: make existing record unavailable */ + _leave(" = %d", ret); + return ret; + } +} + +/* + * apply the update to a VL record + */ +static void afs_vlocation_apply_update(struct afs_vlocation *vl, + struct afs_cache_vlocation *vldb) +{ + _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", + vldb->name, vldb->vidmask, + ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0], + ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1], + ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]); + + _debug("Vids: %08x %08x %08x", + vldb->vid[0], vldb->vid[1], vldb->vid[2]); + + if (strcmp(vldb->name, vl->vldb.name) != 0) + printk(KERN_NOTICE "kAFS:" + " name of volume '%s' changed to '%s' on server\n", + vl->vldb.name, vldb->name); + + vl->vldb = *vldb; + +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); +#endif +} + +/* + * fill in a volume location record, consulting the cache and the VL server + * both + */ +static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, + struct key *key) +{ + struct afs_cache_vlocation vldb; + int ret; + + _enter(""); + + ASSERTCMP(vl->valid, ==, 0); + + memset(&vldb, 0, sizeof(vldb)); + + /* see if we have an in-cache copy (will set vl->valid if there is) */ +#ifdef CONFIG_AFS_FSCACHE + vl->cache = fscache_acquire_cookie(vl->cell->cache, + &afs_vlocation_cache_index_def, vl, + true); +#endif + + if (vl->valid) { + /* try to update a known volume in the cell VL databases by + * ID as the name may have changed */ + _debug("found in cache"); + ret = afs_vlocation_update_record(vl, key, &vldb); + } else { + /* try to look up an unknown volume in the cell VL databases by + * name */ + ret = afs_vlocation_access_vl_by_name(vl, key, &vldb); + if (ret < 0) { + printk("kAFS: failed to locate '%s' in cell '%s'\n", + vl->vldb.name, vl->cell->name); + return ret; + } + } + + afs_vlocation_apply_update(vl, &vldb); + _leave(" = 0"); + return 0; +} + +/* + * queue a vlocation record for updates + */ +static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl) +{ + struct afs_vlocation *xvl; + + /* wait at least 10 minutes before updating... */ + vl->update_at = get_seconds() + afs_vlocation_update_timeout; + + spin_lock(&afs_vlocation_updates_lock); + + if (!list_empty(&afs_vlocation_updates)) { + /* ... but wait at least 1 second more than the newest record + * already queued so that we don't spam the VL server suddenly + * with lots of requests + */ + xvl = list_entry(afs_vlocation_updates.prev, + struct afs_vlocation, update); + if (vl->update_at <= xvl->update_at) + vl->update_at = xvl->update_at + 1; + } else { + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, + afs_vlocation_update_timeout * HZ); + } + + list_add_tail(&vl->update, &afs_vlocation_updates); + spin_unlock(&afs_vlocation_updates_lock); +} + +/* + * lookup volume location + * - iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + * - lookup in the local cache if not able to find on the VL server + * - insert/update in the local cache if did get a VL response + */ +struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell, + struct key *key, + const char *name, + size_t namesz) +{ + struct afs_vlocation *vl; + int ret; + + _enter("{%s},{%x},%*.*s,%zu", + cell->name, key_serial(key), + (int) namesz, (int) namesz, name, namesz); + + if (namesz >= sizeof(vl->vldb.name)) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + /* see if we have an in-memory copy first */ + down_write(&cell->vl_sem); + spin_lock(&cell->vl_lock); + list_for_each_entry(vl, &cell->vl_list, link) { + if (vl->vldb.name[namesz] != '\0') + continue; + if (memcmp(vl->vldb.name, name, namesz) == 0) + goto found_in_memory; + } + spin_unlock(&cell->vl_lock); + + /* not in the cell's in-memory lists - create a new record */ + vl = afs_vlocation_alloc(cell, name, namesz); + if (!vl) { + up_write(&cell->vl_sem); + return ERR_PTR(-ENOMEM); + } + + afs_get_cell(cell); + + list_add_tail(&vl->link, &cell->vl_list); + vl->state = AFS_VL_CREATING; + up_write(&cell->vl_sem); + +fill_in_record: + ret = afs_vlocation_fill_in_record(vl, key); + if (ret < 0) + goto error_abandon; + spin_lock(&vl->lock); + vl->state = AFS_VL_VALID; + spin_unlock(&vl->lock); + wake_up(&vl->waitq); + + /* update volume entry in local cache */ +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); +#endif + + /* schedule for regular updates */ + afs_vlocation_queue_for_updates(vl); + goto success; + +found_in_memory: + /* found in memory */ + _debug("found in memory"); + atomic_inc(&vl->usage); + spin_unlock(&cell->vl_lock); + if (!list_empty(&vl->grave)) { + spin_lock(&afs_vlocation_graveyard_lock); + list_del_init(&vl->grave); + spin_unlock(&afs_vlocation_graveyard_lock); + } + up_write(&cell->vl_sem); + + /* see if it was an abandoned record that we might try filling in */ + spin_lock(&vl->lock); + while (vl->state != AFS_VL_VALID) { + afs_vlocation_state_t state = vl->state; + + _debug("invalid [state %d]", state); + + if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) { + vl->state = AFS_VL_CREATING; + spin_unlock(&vl->lock); + goto fill_in_record; + } + + /* must now wait for creation or update by someone else to + * complete */ + _debug("wait"); + + spin_unlock(&vl->lock); + ret = wait_event_interruptible(vl->waitq, + vl->state == AFS_VL_NEW || + vl->state == AFS_VL_VALID || + vl->state == AFS_VL_NO_VOLUME); + if (ret < 0) + goto error; + spin_lock(&vl->lock); + } + spin_unlock(&vl->lock); + +success: + _leave(" = %p", vl); + return vl; + +error_abandon: + spin_lock(&vl->lock); + vl->state = AFS_VL_NEW; + spin_unlock(&vl->lock); + wake_up(&vl->waitq); +error: + ASSERT(vl != NULL); + afs_put_vlocation(vl); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * finish using a volume location record + */ +void afs_put_vlocation(struct afs_vlocation *vl) +{ + if (!vl) + return; + + _enter("%s", vl->vldb.name); + + ASSERTCMP(atomic_read(&vl->usage), >, 0); + + if (likely(!atomic_dec_and_test(&vl->usage))) { + _leave(""); + return; + } + + spin_lock(&afs_vlocation_graveyard_lock); + if (atomic_read(&vl->usage) == 0) { + _debug("buried"); + list_move_tail(&vl->grave, &afs_vlocation_graveyard); + vl->time_of_death = get_seconds(); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + afs_vlocation_timeout * HZ); + + /* suspend updates on this record */ + if (!list_empty(&vl->update)) { + spin_lock(&afs_vlocation_updates_lock); + list_del_init(&vl->update); + spin_unlock(&afs_vlocation_updates_lock); + } + } + spin_unlock(&afs_vlocation_graveyard_lock); + _leave(" [killed?]"); +} + +/* + * destroy a dead volume location record + */ +static void afs_vlocation_destroy(struct afs_vlocation *vl) +{ + _enter("%p", vl); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vl->cache, 0); +#endif + afs_put_cell(vl->cell); + kfree(vl); +} + +/* + * reap dead volume location records + */ +static void afs_vlocation_reaper(struct work_struct *work) +{ + LIST_HEAD(corpses); + struct afs_vlocation *vl; + unsigned long delay, expiry; + time_t now; + + _enter(""); + + now = get_seconds(); + spin_lock(&afs_vlocation_graveyard_lock); + + while (!list_empty(&afs_vlocation_graveyard)) { + vl = list_entry(afs_vlocation_graveyard.next, + struct afs_vlocation, grave); + + _debug("check %p", vl); + + /* the queue is ordered most dead first */ + expiry = vl->time_of_death + afs_vlocation_timeout; + if (expiry > now) { + delay = (expiry - now) * HZ; + _debug("delay %lu", delay); + mod_delayed_work(afs_wq, &afs_vlocation_reap, delay); + break; + } + + spin_lock(&vl->cell->vl_lock); + if (atomic_read(&vl->usage) > 0) { + _debug("no reap"); + list_del_init(&vl->grave); + } else { + _debug("reap"); + list_move_tail(&vl->grave, &corpses); + list_del_init(&vl->link); + } + spin_unlock(&vl->cell->vl_lock); + } + + spin_unlock(&afs_vlocation_graveyard_lock); + + /* now reap the corpses we've extracted */ + while (!list_empty(&corpses)) { + vl = list_entry(corpses.next, struct afs_vlocation, grave); + list_del(&vl->grave); + afs_vlocation_destroy(vl); + } + + _leave(""); +} + +/* + * initialise the VL update process + */ +int __init afs_vlocation_update_init(void) +{ + afs_vlocation_update_worker = + create_singlethread_workqueue("kafs_vlupdated"); + return afs_vlocation_update_worker ? 0 : -ENOMEM; +} + +/* + * discard all the volume location records for rmmod + */ +void afs_vlocation_purge(void) +{ + afs_vlocation_timeout = 0; + + spin_lock(&afs_vlocation_updates_lock); + list_del_init(&afs_vlocation_updates); + spin_unlock(&afs_vlocation_updates_lock); + mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0); + destroy_workqueue(afs_vlocation_update_worker); + + mod_delayed_work(afs_wq, &afs_vlocation_reap, 0); +} + +/* + * update a volume location + */ +static void afs_vlocation_updater(struct work_struct *work) +{ + struct afs_cache_vlocation vldb; + struct afs_vlocation *vl, *xvl; + time_t now; + long timeout; + int ret; + + _enter(""); + + now = get_seconds(); + + /* find a record to update */ + spin_lock(&afs_vlocation_updates_lock); + for (;;) { + if (list_empty(&afs_vlocation_updates)) { + spin_unlock(&afs_vlocation_updates_lock); + _leave(" [nothing]"); + return; + } + + vl = list_entry(afs_vlocation_updates.next, + struct afs_vlocation, update); + if (atomic_read(&vl->usage) > 0) + break; + list_del_init(&vl->update); + } + + timeout = vl->update_at - now; + if (timeout > 0) { + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, timeout * HZ); + spin_unlock(&afs_vlocation_updates_lock); + _leave(" [nothing]"); + return; + } + + list_del_init(&vl->update); + atomic_inc(&vl->usage); + spin_unlock(&afs_vlocation_updates_lock); + + /* we can now perform the update */ + _debug("update %s", vl->vldb.name); + vl->state = AFS_VL_UPDATING; + vl->upd_rej_cnt = 0; + vl->upd_busy_cnt = 0; + + ret = afs_vlocation_update_record(vl, NULL, &vldb); + spin_lock(&vl->lock); + switch (ret) { + case 0: + afs_vlocation_apply_update(vl, &vldb); + vl->state = AFS_VL_VALID; + break; + case -ENOMEDIUM: + vl->state = AFS_VL_VOLUME_DELETED; + break; + default: + vl->state = AFS_VL_UNCERTAIN; + break; + } + spin_unlock(&vl->lock); + wake_up(&vl->waitq); + + /* and then reschedule */ + _debug("reschedule"); + vl->update_at = get_seconds() + afs_vlocation_update_timeout; + + spin_lock(&afs_vlocation_updates_lock); + + if (!list_empty(&afs_vlocation_updates)) { + /* next update in 10 minutes, but wait at least 1 second more + * than the newest record already queued so that we don't spam + * the VL server suddenly with lots of requests + */ + xvl = list_entry(afs_vlocation_updates.prev, + struct afs_vlocation, update); + if (vl->update_at <= xvl->update_at) + vl->update_at = xvl->update_at + 1; + xvl = list_entry(afs_vlocation_updates.next, + struct afs_vlocation, update); + timeout = xvl->update_at - now; + if (timeout < 0) + timeout = 0; + } else { + timeout = afs_vlocation_update_timeout; + } + + ASSERT(list_empty(&vl->update)); + + list_add_tail(&vl->update, &afs_vlocation_updates); + + _debug("timeout %ld", timeout); + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, timeout * HZ); + spin_unlock(&afs_vlocation_updates_lock); + afs_put_vlocation(vl); +} diff --git a/kernel/fs/afs/vnode.c b/kernel/fs/afs/vnode.c new file mode 100644 index 000000000..25cf4c3f4 --- /dev/null +++ b/kernel/fs/afs/vnode.c @@ -0,0 +1,1025 @@ +/* AFS vnode management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +#if 0 +static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent, + int depth, char lr) +{ + struct afs_vnode *vnode; + bool bad = false; + + if (!node) + return false; + + if (node->rb_left) + bad = dump_tree_aux(node->rb_left, node, depth + 2, '/'); + + vnode = rb_entry(node, struct afs_vnode, cb_promise); + _debug("%c %*.*s%c%p {%d}", + rb_is_red(node) ? 'R' : 'B', + depth, depth, "", lr, + vnode, vnode->cb_expires_at); + if (rb_parent(node) != parent) { + printk("BAD: %p != %p\n", rb_parent(node), parent); + bad = true; + } + + if (node->rb_right) + bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\'); + + return bad; +} + +static noinline void dump_tree(const char *name, struct afs_server *server) +{ + _enter("%s", name); + if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-')) + BUG(); +} +#endif + +/* + * insert a vnode into the backing server's vnode tree + */ +static void afs_install_vnode(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *old_server = vnode->server; + struct afs_vnode *xvnode; + struct rb_node *parent, **p; + + _enter("%p,%p", vnode, server); + + if (old_server) { + spin_lock(&old_server->fs_lock); + rb_erase(&vnode->server_rb, &old_server->fs_vnodes); + spin_unlock(&old_server->fs_lock); + } + + afs_get_server(server); + vnode->server = server; + afs_put_server(old_server); + + /* insert into the server's vnode tree in FID order */ + spin_lock(&server->fs_lock); + + parent = NULL; + p = &server->fs_vnodes.rb_node; + while (*p) { + parent = *p; + xvnode = rb_entry(parent, struct afs_vnode, server_rb); + if (vnode->fid.vid < xvnode->fid.vid) + p = &(*p)->rb_left; + else if (vnode->fid.vid > xvnode->fid.vid) + p = &(*p)->rb_right; + else if (vnode->fid.vnode < xvnode->fid.vnode) + p = &(*p)->rb_left; + else if (vnode->fid.vnode > xvnode->fid.vnode) + p = &(*p)->rb_right; + else if (vnode->fid.unique < xvnode->fid.unique) + p = &(*p)->rb_left; + else if (vnode->fid.unique > xvnode->fid.unique) + p = &(*p)->rb_right; + else + BUG(); /* can't happen unless afs_iget() malfunctions */ + } + + rb_link_node(&vnode->server_rb, parent, p); + rb_insert_color(&vnode->server_rb, &server->fs_vnodes); + + spin_unlock(&server->fs_lock); + _leave(""); +} + +/* + * insert a vnode into the promising server's update/expiration tree + * - caller must hold vnode->lock + */ +static void afs_vnode_note_promise(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *old_server; + struct afs_vnode *xvnode; + struct rb_node *parent, **p; + + _enter("%p,%p", vnode, server); + + ASSERT(server != NULL); + + old_server = vnode->server; + if (vnode->cb_promised) { + if (server == old_server && + vnode->cb_expires == vnode->cb_expires_at) { + _leave(" [no change]"); + return; + } + + spin_lock(&old_server->cb_lock); + if (vnode->cb_promised) { + _debug("delete"); + rb_erase(&vnode->cb_promise, &old_server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&old_server->cb_lock); + } + + if (vnode->server != server) + afs_install_vnode(vnode, server); + + vnode->cb_expires_at = vnode->cb_expires; + _debug("PROMISE on %p {%lu}", + vnode, (unsigned long) vnode->cb_expires_at); + + /* abuse an RB-tree to hold the expiration order (we may have multiple + * items with the same expiration time) */ + spin_lock(&server->cb_lock); + + parent = NULL; + p = &server->cb_promises.rb_node; + while (*p) { + parent = *p; + xvnode = rb_entry(parent, struct afs_vnode, cb_promise); + if (vnode->cb_expires_at < xvnode->cb_expires_at) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&vnode->cb_promise, parent, p); + rb_insert_color(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = true; + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * handle remote file deletion by discarding the callback promise + */ +static void afs_vnode_deleted_remotely(struct afs_vnode *vnode) +{ + struct afs_server *server; + + _enter("{%p}", vnode->server); + + set_bit(AFS_VNODE_DELETED, &vnode->flags); + + server = vnode->server; + if (server) { + if (vnode->cb_promised) { + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + rb_erase(&vnode->cb_promise, + &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + } + + spin_lock(&server->fs_lock); + rb_erase(&vnode->server_rb, &server->fs_vnodes); + spin_unlock(&server->fs_lock); + + vnode->server = NULL; + afs_put_server(server); + } else { + ASSERT(!vnode->cb_promised); + } + + _leave(""); +} + +/* + * finish off updating the recorded status of a file after a successful + * operation completion + * - starts callback expiry timer + * - adds to server's callback list + */ +void afs_vnode_finalise_status_update(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *oldserver = NULL; + + _enter("%p,%p", vnode, server); + + spin_lock(&vnode->lock); + clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + afs_vnode_note_promise(vnode, server); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + + wake_up_all(&vnode->update_waitq); + afs_put_server(oldserver); + _leave(""); +} + +/* + * finish off updating the recorded status of a file after an operation failed + */ +static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret) +{ + _enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + + clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + + if (ret == -ENOENT) { + /* the file was deleted on the server */ + _debug("got NOENT from server - marking file deleted"); + afs_vnode_deleted_remotely(vnode); + } + + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + + wake_up_all(&vnode->update_waitq); + _leave(""); +} + +/* + * fetch file status from the volume + * - don't issue a fetch if: + * - the changed bit is not set and there's a valid callback + * - there are any outstanding ops that will fetch the status + * - TODO implement local caching + */ +int afs_vnode_fetch_status(struct afs_vnode *vnode, + struct afs_vnode *auth_vnode, struct key *key) +{ + struct afs_server *server; + unsigned long acl_order; + int ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%s,{%x:%u.%u}", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + vnode->cb_promised) { + _leave(" [unchanged]"); + return 0; + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _leave(" [deleted]"); + return -ENOENT; + } + + acl_order = 0; + if (auth_vnode) + acl_order = auth_vnode->acl_order; + + spin_lock(&vnode->lock); + + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + vnode->cb_promised) { + spin_unlock(&vnode->lock); + _leave(" [unchanged]"); + return 0; + } + + ASSERTCMP(vnode->update_cnt, >=, 0); + + if (vnode->update_cnt > 0) { + /* someone else started a fetch */ + _debug("wait on fetch %d", vnode->update_cnt); + + set_current_state(TASK_UNINTERRUPTIBLE); + ASSERT(myself.func != NULL); + add_wait_queue(&vnode->update_waitq, &myself); + + /* wait for the status to be updated */ + for (;;) { + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) + break; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + break; + + /* check to see if it got updated and invalidated all + * before we saw it */ + if (vnode->update_cnt == 0) { + remove_wait_queue(&vnode->update_waitq, + &myself); + set_current_state(TASK_RUNNING); + goto get_anyway; + } + + spin_unlock(&vnode->lock); + + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock(&vnode->lock); + } + + remove_wait_queue(&vnode->update_waitq, &myself); + spin_unlock(&vnode->lock); + set_current_state(TASK_RUNNING); + + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? + -ENOENT : 0; + } + +get_anyway: + /* okay... we're going to have to initiate the op */ + vnode->update_cnt++; + + spin_unlock(&vnode->lock); + + /* merge AFS status fetches and clear outstanding callback on this + * vnode */ + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %p{%08x}", + server, ntohl(server->addr.s_addr)); + + ret = afs_fs_fetch_file_status(server, key, vnode, NULL, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + _debug("adjust"); + if (auth_vnode) + afs_cache_permit(vnode, key, acl_order); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + _debug("failed [%d]", ret); + afs_vnode_status_update_failed(vnode, ret); + } + + ASSERTCMP(vnode->update_cnt, >=, 0); + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * fetch file data from the volume + * - TODO implement caching + */ +int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key, + off_t offset, size_t length, struct page *page) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + /* merge in AFS status fetches and clear outstanding callback on this + * vnode */ + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_fetch_data(server, key, vnode, offset, length, + page, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * make a file or a directory + */ +int afs_vnode_create(struct afs_vnode *vnode, struct key *key, + const char *name, umode_t mode, struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb, struct afs_server **_server) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're creating in */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_create(server, key, vnode, name, mode, newfid, + newstatus, newcb, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + *_server = server; + } else { + afs_vnode_status_update_failed(vnode, ret); + *_server = NULL; + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * remove a file or directory + */ +int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name, + bool isdir) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're removing from */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_remove(server, key, vnode, name, isdir, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * create a hard link + */ +int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, + struct key *key, const char *name) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s", + dvnode->volume->vlocation->vldb.name, + dvnode->fid.vid, + dvnode->fid.vnode, + dvnode->fid.unique, + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're removing from */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + spin_lock(&dvnode->lock); + dvnode->update_cnt++; + spin_unlock(&dvnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(dvnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_link(server, key, dvnode, vnode, name, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(dvnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_vnode_finalise_status_update(dvnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + afs_vnode_status_update_failed(dvnode, ret); + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + spin_lock(&dvnode->lock); + dvnode->update_cnt--; + ASSERTCMP(dvnode->update_cnt, >=, 0); + spin_unlock(&dvnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * create a symbolic link + */ +int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key, + const char *name, const char *content, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_server **_server) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s,%s,,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name, content); + + /* this op will fetch the status on the directory we're creating in */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_symlink(server, key, vnode, name, content, + newfid, newstatus, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + *_server = server; + } else { + afs_vnode_status_update_failed(vnode, ret); + *_server = NULL; + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * rename a file + */ +int afs_vnode_rename(struct afs_vnode *orig_dvnode, + struct afs_vnode *new_dvnode, + struct key *key, + const char *orig_name, + const char *new_name) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s", + orig_dvnode->volume->vlocation->vldb.name, + orig_dvnode->fid.vid, + orig_dvnode->fid.vnode, + orig_dvnode->fid.unique, + new_dvnode->volume->vlocation->vldb.name, + new_dvnode->fid.vid, + new_dvnode->fid.vnode, + new_dvnode->fid.unique, + key_serial(key), + orig_name, + new_name); + + /* this op will fetch the status on both the directories we're dealing + * with */ + spin_lock(&orig_dvnode->lock); + orig_dvnode->update_cnt++; + spin_unlock(&orig_dvnode->lock); + if (new_dvnode != orig_dvnode) { + spin_lock(&new_dvnode->lock); + new_dvnode->update_cnt++; + spin_unlock(&new_dvnode->lock); + } + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(orig_dvnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_rename(server, key, orig_dvnode, orig_name, + new_dvnode, new_name, &afs_sync_call); + + } while (!afs_volume_release_fileserver(orig_dvnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(orig_dvnode, server); + if (new_dvnode != orig_dvnode) + afs_vnode_finalise_status_update(new_dvnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(orig_dvnode, ret); + if (new_dvnode != orig_dvnode) + afs_vnode_status_update_failed(new_dvnode, ret); + } + + _leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt); + return ret; + +no_server: + spin_lock(&orig_dvnode->lock); + orig_dvnode->update_cnt--; + ASSERTCMP(orig_dvnode->update_cnt, >=, 0); + spin_unlock(&orig_dvnode->lock); + if (new_dvnode != orig_dvnode) { + spin_lock(&new_dvnode->lock); + new_dvnode->update_cnt--; + ASSERTCMP(new_dvnode->update_cnt, >=, 0); + spin_unlock(&new_dvnode->lock); + } + _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt); + return PTR_ERR(server); +} + +/* + * write to a file + */ +int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last, + unsigned offset, unsigned to) +{ + struct afs_server *server; + struct afs_vnode *vnode = wb->vnode; + int ret; + + _enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(wb->key), + first, last, offset, to); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_store_data(server, wb, first, last, offset, to, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * set the attributes on a file + */ +int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key, + struct iattr *attr) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * get the status of a volume + */ +int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, + struct afs_volume_status *vs) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * get a lock on a file + */ +int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%u", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), type); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * extend a lock on a file + */ +int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * release a lock on a file + */ +int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} diff --git a/kernel/fs/afs/volume.c b/kernel/fs/afs/volume.c new file mode 100644 index 000000000..d142a2449 --- /dev/null +++ b/kernel/fs/afs/volume.c @@ -0,0 +1,401 @@ +/* AFS volume management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; + +/* + * lookup a volume by name + * - this can be one of the following: + * "%[cell:]volume[.]" R/W volume + * "#[cell:]volume[.]" R/O or R/W volume (rwparent=0), + * or R/W (rwparent=1) volume + * "%[cell:]volume.readonly" R/O volume + * "#[cell:]volume.readonly" R/O volume + * "%[cell:]volume.backup" Backup volume + * "#[cell:]volume.backup" Backup volume + * + * The cell name is optional, and defaults to the current cell. + * + * See "The Rules of Mount Point Traversal" in Chapter 5 of the AFS SysAdmin + * Guide + * - Rule 1: Explicit type suffix forces access of that type or nothing + * (no suffix, then use Rule 2 & 3) + * - Rule 2: If parent volume is R/O, then mount R/O volume by preference, R/W + * if not available + * - Rule 3: If parent volume is R/W, then only mount R/W volume unless + * explicitly told otherwise + */ +struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) +{ + struct afs_vlocation *vlocation = NULL; + struct afs_volume *volume = NULL; + struct afs_server *server = NULL; + char srvtmask; + int ret, loop; + + _enter("{%*.*s,%d}", + params->volnamesz, params->volnamesz, params->volname, params->rwpath); + + /* lookup the volume location record */ + vlocation = afs_vlocation_lookup(params->cell, params->key, + params->volname, params->volnamesz); + if (IS_ERR(vlocation)) { + ret = PTR_ERR(vlocation); + vlocation = NULL; + goto error; + } + + /* make the final decision on the type we want */ + ret = -ENOMEDIUM; + if (params->force && !(vlocation->vldb.vidmask & (1 << params->type))) + goto error; + + srvtmask = 0; + for (loop = 0; loop < vlocation->vldb.nservers; loop++) + srvtmask |= vlocation->vldb.srvtmask[loop]; + + if (params->force) { + if (!(srvtmask & (1 << params->type))) + goto error; + } else if (srvtmask & AFS_VOL_VTM_RO) { + params->type = AFSVL_ROVOL; + } else if (srvtmask & AFS_VOL_VTM_RW) { + params->type = AFSVL_RWVOL; + } else { + goto error; + } + + down_write(¶ms->cell->vl_sem); + + /* is the volume already active? */ + if (vlocation->vols[params->type]) { + /* yes - re-use it */ + volume = vlocation->vols[params->type]; + afs_get_volume(volume); + goto success; + } + + /* create a new volume record */ + _debug("creating new volume record"); + + ret = -ENOMEM; + volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); + if (!volume) + goto error_up; + + atomic_set(&volume->usage, 1); + volume->type = params->type; + volume->type_force = params->force; + volume->cell = params->cell; + volume->vid = vlocation->vldb.vid[params->type]; + + ret = bdi_setup_and_register(&volume->bdi, "afs"); + if (ret) + goto error_bdi; + + init_rwsem(&volume->server_sem); + + /* look up all the applicable server records */ + for (loop = 0; loop < 8; loop++) { + if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) { + server = afs_lookup_server( + volume->cell, &vlocation->vldb.servers[loop]); + if (IS_ERR(server)) { + ret = PTR_ERR(server); + goto error_discard; + } + + volume->servers[volume->nservers] = server; + volume->nservers++; + } + } + + /* attach the cache and volume location */ +#ifdef CONFIG_AFS_FSCACHE + volume->cache = fscache_acquire_cookie(vlocation->cache, + &afs_volume_cache_index_def, + volume, true); +#endif + afs_get_vlocation(vlocation); + volume->vlocation = vlocation; + + vlocation->vols[volume->type] = volume; + +success: + _debug("kAFS selected %s volume %08x", + afs_voltypes[volume->type], volume->vid); + up_write(¶ms->cell->vl_sem); + afs_put_vlocation(vlocation); + _leave(" = %p", volume); + return volume; + + /* clean up */ +error_up: + up_write(¶ms->cell->vl_sem); +error: + afs_put_vlocation(vlocation); + _leave(" = %d", ret); + return ERR_PTR(ret); + +error_discard: + bdi_destroy(&volume->bdi); +error_bdi: + up_write(¶ms->cell->vl_sem); + + for (loop = volume->nservers - 1; loop >= 0; loop--) + afs_put_server(volume->servers[loop]); + + kfree(volume); + goto error; +} + +/* + * destroy a volume record + */ +void afs_put_volume(struct afs_volume *volume) +{ + struct afs_vlocation *vlocation; + int loop; + + if (!volume) + return; + + _enter("%p", volume); + + ASSERTCMP(atomic_read(&volume->usage), >, 0); + + vlocation = volume->vlocation; + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + down_write(&vlocation->cell->vl_sem); + + if (likely(!atomic_dec_and_test(&volume->usage))) { + up_write(&vlocation->cell->vl_sem); + _leave(""); + return; + } + + vlocation->vols[volume->type] = NULL; + + up_write(&vlocation->cell->vl_sem); + + /* finish cleaning up the volume */ +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(volume->cache, 0); +#endif + afs_put_vlocation(vlocation); + + for (loop = volume->nservers - 1; loop >= 0; loop--) + afs_put_server(volume->servers[loop]); + + bdi_destroy(&volume->bdi); + kfree(volume); + + _leave(" [destroyed]"); +} + +/* + * pick a server to use to try accessing this volume + * - returns with an elevated usage count on the server chosen + */ +struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode) +{ + struct afs_volume *volume = vnode->volume; + struct afs_server *server; + int ret, state, loop; + + _enter("%s", volume->vlocation->vldb.name); + + /* stick with the server we're already using if we can */ + if (vnode->server && vnode->server->fs_state == 0) { + afs_get_server(vnode->server); + _leave(" = %p [current]", vnode->server); + return vnode->server; + } + + down_read(&volume->server_sem); + + /* handle the no-server case */ + if (volume->nservers == 0) { + ret = volume->rjservers ? -ENOMEDIUM : -ESTALE; + up_read(&volume->server_sem); + _leave(" = %d [no servers]", ret); + return ERR_PTR(ret); + } + + /* basically, just search the list for the first live server and use + * that */ + ret = 0; + for (loop = 0; loop < volume->nservers; loop++) { + server = volume->servers[loop]; + state = server->fs_state; + + _debug("consider %d [%d]", loop, state); + + switch (state) { + /* found an apparently healthy server */ + case 0: + afs_get_server(server); + up_read(&volume->server_sem); + _leave(" = %p (picked %08x)", + server, ntohl(server->addr.s_addr)); + return server; + + case -ENETUNREACH: + if (ret == 0) + ret = state; + break; + + case -EHOSTUNREACH: + if (ret == 0 || + ret == -ENETUNREACH) + ret = state; + break; + + case -ECONNREFUSED: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH) + ret = state; + break; + + default: + case -EREMOTEIO: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) + ret = state; + break; + } + } + + /* no available servers + * - TODO: handle the no active servers case better + */ + up_read(&volume->server_sem); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * release a server after use + * - releases the ref on the server struct that was acquired by picking + * - records result of using a particular server to access a volume + * - return 0 to try again, 1 if okay or to issue error + * - the caller must release the server struct if result was 0 + */ +int afs_volume_release_fileserver(struct afs_vnode *vnode, + struct afs_server *server, + int result) +{ + struct afs_volume *volume = vnode->volume; + unsigned loop; + + _enter("%s,%08x,%d", + volume->vlocation->vldb.name, ntohl(server->addr.s_addr), + result); + + switch (result) { + /* success */ + case 0: + server->fs_act_jif = jiffies; + server->fs_state = 0; + _leave(""); + return 1; + + /* the fileserver denied all knowledge of the volume */ + case -ENOMEDIUM: + server->fs_act_jif = jiffies; + down_write(&volume->server_sem); + + /* firstly, find where the server is in the active list (if it + * is) */ + for (loop = 0; loop < volume->nservers; loop++) + if (volume->servers[loop] == server) + goto present; + + /* no longer there - may have been discarded by another op */ + goto try_next_server_upw; + + present: + volume->nservers--; + memmove(&volume->servers[loop], + &volume->servers[loop + 1], + sizeof(volume->servers[loop]) * + (volume->nservers - loop)); + volume->servers[volume->nservers] = NULL; + afs_put_server(server); + volume->rjservers++; + + if (volume->nservers > 0) + /* another server might acknowledge its existence */ + goto try_next_server_upw; + + /* handle the case where all the fileservers have rejected the + * volume + * - TODO: try asking the fileservers for volume information + * - TODO: contact the VL server again to see if the volume is + * no longer registered + */ + up_write(&volume->server_sem); + afs_put_server(server); + _leave(" [completely rejected]"); + return 1; + + /* problem reaching the server */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIME: + case -ETIMEDOUT: + case -EREMOTEIO: + /* mark the server as dead + * TODO: vary dead timeout depending on error + */ + spin_lock(&server->fs_lock); + if (!server->fs_state) { + server->fs_dead_jif = jiffies + HZ * 10; + server->fs_state = result; + printk("kAFS: SERVER DEAD state=%d\n", result); + } + spin_unlock(&server->fs_lock); + goto try_next_server; + + /* miscellaneous error */ + default: + server->fs_act_jif = jiffies; + case -ENOMEM: + case -ENONET: + /* tell the caller to accept the result */ + afs_put_server(server); + _leave(" [local failure]"); + return 1; + } + + /* tell the caller to loop around and try the next server */ +try_next_server_upw: + up_write(&volume->server_sem); +try_next_server: + afs_put_server(server); + _leave(" [try next server]"); + return 0; +} diff --git a/kernel/fs/afs/write.c b/kernel/fs/afs/write.c new file mode 100644 index 000000000..0714abcd7 --- /dev/null +++ b/kernel/fs/afs/write.c @@ -0,0 +1,761 @@ +/* handling of writes to regular files and writing back to the server + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int afs_write_back_from_locked_page(struct afs_writeback *wb, + struct page *page); + +/* + * mark a page as having been made dirty and thus needing writeback + */ +int afs_set_page_dirty(struct page *page) +{ + _enter(""); + return __set_page_dirty_nobuffers(page); +} + +/* + * unlink a writeback record because its usage has reached zero + * - must be called with the wb->vnode->writeback_lock held + */ +static void afs_unlink_writeback(struct afs_writeback *wb) +{ + struct afs_writeback *front; + struct afs_vnode *vnode = wb->vnode; + + list_del_init(&wb->link); + if (!list_empty(&vnode->writebacks)) { + /* if an fsync rises to the front of the queue then wake it + * up */ + front = list_entry(vnode->writebacks.next, + struct afs_writeback, link); + if (front->state == AFS_WBACK_SYNCING) { + _debug("wake up sync"); + front->state = AFS_WBACK_COMPLETE; + wake_up(&front->waitq); + } + } +} + +/* + * free a writeback record + */ +static void afs_free_writeback(struct afs_writeback *wb) +{ + _enter(""); + key_put(wb->key); + kfree(wb); +} + +/* + * dispose of a reference to a writeback record + */ +void afs_put_writeback(struct afs_writeback *wb) +{ + struct afs_vnode *vnode = wb->vnode; + + _enter("{%d}", wb->usage); + + spin_lock(&vnode->writeback_lock); + if (--wb->usage == 0) + afs_unlink_writeback(wb); + else + wb = NULL; + spin_unlock(&vnode->writeback_lock); + if (wb) + afs_free_writeback(wb); +} + +/* + * partly or wholly fill a page that's under preparation for writing + */ +static int afs_fill_page(struct afs_vnode *vnode, struct key *key, + loff_t pos, struct page *page) +{ + loff_t i_size; + int ret; + int len; + + _enter(",,%llu", (unsigned long long)pos); + + i_size = i_size_read(&vnode->vfs_inode); + if (pos + PAGE_CACHE_SIZE > i_size) + len = i_size - pos; + else + len = PAGE_CACHE_SIZE; + + ret = afs_vnode_fetch_data(vnode, key, pos, len, page); + if (ret < 0) { + if (ret == -ENOENT) { + _debug("got NOENT from server" + " - marking file deleted and stale"); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + } + + _leave(" = %d", ret); + return ret; +} + +/* + * prepare to perform part of a write to a page + */ +int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct afs_writeback *candidate, *wb; + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct page *page; + struct key *key = file->private_data; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int ret; + + _enter("{%x:%u},{%lx},%u,%u", + vnode->fid.vid, vnode->fid.vnode, index, from, to); + + candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); + if (!candidate) + return -ENOMEM; + candidate->vnode = vnode; + candidate->first = candidate->last = index; + candidate->offset_first = from; + candidate->to_last = to; + INIT_LIST_HEAD(&candidate->link); + candidate->usage = 1; + candidate->state = AFS_WBACK_PENDING; + init_waitqueue_head(&candidate->waitq); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + kfree(candidate); + return -ENOMEM; + } + *pagep = page; + /* page won't leak in error case: it eventually gets cleaned off LRU */ + + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { + ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); + if (ret < 0) { + kfree(candidate); + _leave(" = %d [prep]", ret); + return ret; + } + SetPageUptodate(page); + } + +try_again: + spin_lock(&vnode->writeback_lock); + + /* see if this page is already pending a writeback under a suitable key + * - if so we can just join onto that one */ + wb = (struct afs_writeback *) page_private(page); + if (wb) { + if (wb->key == key && wb->state == AFS_WBACK_PENDING) + goto subsume_in_current_wb; + goto flush_conflicting_wb; + } + + if (index > 0) { + /* see if we can find an already pending writeback that we can + * append this page to */ + list_for_each_entry(wb, &vnode->writebacks, link) { + if (wb->last == index - 1 && wb->key == key && + wb->state == AFS_WBACK_PENDING) + goto append_to_previous_wb; + } + } + + list_add_tail(&candidate->link, &vnode->writebacks); + candidate->key = key_get(key); + spin_unlock(&vnode->writeback_lock); + SetPagePrivate(page); + set_page_private(page, (unsigned long) candidate); + _leave(" = 0 [new]"); + return 0; + +subsume_in_current_wb: + _debug("subsume"); + ASSERTRANGE(wb->first, <=, index, <=, wb->last); + if (index == wb->first && from < wb->offset_first) + wb->offset_first = from; + if (index == wb->last && to > wb->to_last) + wb->to_last = to; + spin_unlock(&vnode->writeback_lock); + kfree(candidate); + _leave(" = 0 [sub]"); + return 0; + +append_to_previous_wb: + _debug("append into %lx-%lx", wb->first, wb->last); + wb->usage++; + wb->last++; + wb->to_last = to; + spin_unlock(&vnode->writeback_lock); + SetPagePrivate(page); + set_page_private(page, (unsigned long) wb); + kfree(candidate); + _leave(" = 0 [app]"); + return 0; + + /* the page is currently bound to another context, so if it's dirty we + * need to flush it before we can use the new context */ +flush_conflicting_wb: + _debug("flush conflict"); + if (wb->state == AFS_WBACK_PENDING) + wb->state = AFS_WBACK_CONFLICTING; + spin_unlock(&vnode->writeback_lock); + if (PageDirty(page)) { + ret = afs_write_back_from_locked_page(wb, page); + if (ret < 0) { + afs_put_writeback(candidate); + _leave(" = %d", ret); + return ret; + } + } + + /* the page holds a ref on the writeback record */ + afs_put_writeback(wb); + set_page_private(page, 0); + ClearPagePrivate(page); + goto try_again; +} + +/* + * finalise part of a write to a page + */ +int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + loff_t i_size, maybe_i_size; + + _enter("{%x:%u},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); + + maybe_i_size = pos + copied; + + i_size = i_size_read(&vnode->vfs_inode); + if (maybe_i_size > i_size) { + spin_lock(&vnode->writeback_lock); + i_size = i_size_read(&vnode->vfs_inode); + if (maybe_i_size > i_size) + i_size_write(&vnode->vfs_inode, maybe_i_size); + spin_unlock(&vnode->writeback_lock); + } + + set_page_dirty(page); + if (PageDirty(page)) + _debug("dirtied"); + unlock_page(page); + page_cache_release(page); + + return copied; +} + +/* + * kill all the pages in the given range + */ +static void afs_kill_pages(struct afs_vnode *vnode, bool error, + pgoff_t first, pgoff_t last) +{ + struct pagevec pv; + unsigned count, loop; + + _enter("{%x:%u},%lx-%lx", + vnode->fid.vid, vnode->fid.vnode, first, last); + + pagevec_init(&pv, 0); + + do { + _debug("kill %lx-%lx", first, last); + + count = last - first + 1; + if (count > PAGEVEC_SIZE) + count = PAGEVEC_SIZE; + pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, + first, count, pv.pages); + ASSERTCMP(pv.nr, ==, count); + + for (loop = 0; loop < count; loop++) { + ClearPageUptodate(pv.pages[loop]); + if (error) + SetPageError(pv.pages[loop]); + end_page_writeback(pv.pages[loop]); + } + + __pagevec_release(&pv); + } while (first < last); + + _leave(""); +} + +/* + * synchronously write back the locked page and any subsequent non-locked dirty + * pages also covered by the same writeback record + */ +static int afs_write_back_from_locked_page(struct afs_writeback *wb, + struct page *primary_page) +{ + struct page *pages[8], *page; + unsigned long count; + unsigned n, offset, to; + pgoff_t start, first, last; + int loop, ret; + + _enter(",%lx", primary_page->index); + + count = 1; + if (!clear_page_dirty_for_io(primary_page)) + BUG(); + if (test_set_page_writeback(primary_page)) + BUG(); + + /* find all consecutive lockable dirty pages, stopping when we find a + * page that is not immediately lockable, is not dirty or is missing, + * or we reach the end of the range */ + start = primary_page->index; + if (start >= wb->last) + goto no_more; + start++; + do { + _debug("more %lx [%lx]", start, count); + n = wb->last - start + 1; + if (n > ARRAY_SIZE(pages)) + n = ARRAY_SIZE(pages); + n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, + start, n, pages); + _debug("fgpc %u", n); + if (n == 0) + goto no_more; + if (pages[0]->index != start) { + do { + put_page(pages[--n]); + } while (n > 0); + goto no_more; + } + + for (loop = 0; loop < n; loop++) { + page = pages[loop]; + if (page->index > wb->last) + break; + if (!trylock_page(page)) + break; + if (!PageDirty(page) || + page_private(page) != (unsigned long) wb) { + unlock_page(page); + break; + } + if (!clear_page_dirty_for_io(page)) + BUG(); + if (test_set_page_writeback(page)) + BUG(); + unlock_page(page); + put_page(page); + } + count += loop; + if (loop < n) { + for (; loop < n; loop++) + put_page(pages[loop]); + goto no_more; + } + + start += loop; + } while (start <= wb->last && count < 65536); + +no_more: + /* we now have a contiguous set of dirty pages, each with writeback set + * and the dirty mark cleared; the first page is locked and must remain + * so, all the rest are unlocked */ + first = primary_page->index; + last = first + count - 1; + + offset = (first == wb->first) ? wb->offset_first : 0; + to = (last == wb->last) ? wb->to_last : PAGE_SIZE; + + _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); + + ret = afs_vnode_store_data(wb, first, last, offset, to); + if (ret < 0) { + switch (ret) { + case -EDQUOT: + case -ENOSPC: + set_bit(AS_ENOSPC, + &wb->vnode->vfs_inode.i_mapping->flags); + break; + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + afs_kill_pages(wb->vnode, true, first, last); + set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); + break; + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + afs_kill_pages(wb->vnode, false, first, last); + break; + default: + break; + } + } else { + ret = count; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * write a page back to the server + * - the caller locked the page for us + */ +int afs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct afs_writeback *wb; + int ret; + + _enter("{%lx},", page->index); + + wb = (struct afs_writeback *) page_private(page); + ASSERT(wb != NULL); + + ret = afs_write_back_from_locked_page(wb, page); + unlock_page(page); + if (ret < 0) { + _leave(" = %d", ret); + return 0; + } + + wbc->nr_to_write -= ret; + + _leave(" = 0"); + return 0; +} + +/* + * write a region of pages back to the server + */ +static int afs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t index, pgoff_t end, pgoff_t *_next) +{ + struct afs_writeback *wb; + struct page *page; + int ret, n; + + _enter(",,%lx,%lx,", index, end); + + do { + n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!n) + break; + + _debug("wback %lx", page->index); + + if (page->index > end) { + *_next = index; + page_cache_release(page); + _leave(" = 0 [%lx]", *_next); + return 0; + } + + /* at this point we hold neither mapping->tree_lock nor lock on + * the page itself: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled back from + * swapper_space to tmpfs file mapping + */ + lock_page(page); + + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || !PageDirty(page)) { + unlock_page(page); + continue; + } + + wb = (struct afs_writeback *) page_private(page); + ASSERT(wb != NULL); + + spin_lock(&wb->vnode->writeback_lock); + wb->state = AFS_WBACK_WRITING; + spin_unlock(&wb->vnode->writeback_lock); + + ret = afs_write_back_from_locked_page(wb, page); + unlock_page(page); + page_cache_release(page); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + wbc->nr_to_write -= ret; + + cond_resched(); + } while (index < end && wbc->nr_to_write > 0); + + *_next = index; + _leave(" = 0 [%lx]", *_next); + return 0; +} + +/* + * write some of the pending data back to the server + */ +int afs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t start, end, next; + int ret; + + _enter(""); + + if (wbc->range_cyclic) { + start = mapping->writeback_index; + end = -1; + ret = afs_writepages_region(mapping, wbc, start, end, &next); + if (start > 0 && wbc->nr_to_write > 0 && ret == 0) + ret = afs_writepages_region(mapping, wbc, 0, start, + &next); + mapping->writeback_index = next; + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); + ret = afs_writepages_region(mapping, wbc, 0, end, &next); + if (wbc->nr_to_write > 0) + mapping->writeback_index = next; + } else { + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + ret = afs_writepages_region(mapping, wbc, start, end, &next); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * completion of write to server + */ +void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) +{ + struct afs_writeback *wb = call->wb; + struct pagevec pv; + unsigned count, loop; + pgoff_t first = call->first, last = call->last; + bool free_wb; + + _enter("{%x:%u},{%lx-%lx}", + vnode->fid.vid, vnode->fid.vnode, first, last); + + ASSERT(wb != NULL); + + pagevec_init(&pv, 0); + + do { + _debug("done %lx-%lx", first, last); + + count = last - first + 1; + if (count > PAGEVEC_SIZE) + count = PAGEVEC_SIZE; + pv.nr = find_get_pages_contig(call->mapping, first, count, + pv.pages); + ASSERTCMP(pv.nr, ==, count); + + spin_lock(&vnode->writeback_lock); + for (loop = 0; loop < count; loop++) { + struct page *page = pv.pages[loop]; + end_page_writeback(page); + if (page_private(page) == (unsigned long) wb) { + set_page_private(page, 0); + ClearPagePrivate(page); + wb->usage--; + } + } + free_wb = false; + if (wb->usage == 0) { + afs_unlink_writeback(wb); + free_wb = true; + } + spin_unlock(&vnode->writeback_lock); + first += count; + if (free_wb) { + afs_free_writeback(wb); + wb = NULL; + } + + __pagevec_release(&pv); + } while (first <= last); + + _leave(""); +} + +/* + * write to an AFS file + */ +ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); + ssize_t result; + size_t count = iov_iter_count(from); + + _enter("{%x.%u},{%zu},", + vnode->fid.vid, vnode->fid.vnode, count); + + if (IS_SWAPFILE(&vnode->vfs_inode)) { + printk(KERN_INFO + "AFS: Attempt to write to active swap file!\n"); + return -EBUSY; + } + + if (!count) + return 0; + + result = generic_file_write_iter(iocb, from); + if (IS_ERR_VALUE(result)) { + _leave(" = %zd", result); + return result; + } + + _leave(" = %zd", result); + return result; +} + +/* + * flush the vnode to the fileserver + */ +int afs_writeback_all(struct afs_vnode *vnode) +{ + struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_cyclic = 1, + }; + int ret; + + _enter(""); + + ret = mapping->a_ops->writepages(mapping, &wbc); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + _leave(" = %d", ret); + return ret; +} + +/* + * flush any dirty pages for this process, and check for write errors. + * - the return status from this call provides a reliable indication of + * whether any write errors occurred for this process. + */ +int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file_inode(file); + struct afs_writeback *wb, *xwb; + struct afs_vnode *vnode = AFS_FS_I(inode); + int ret; + + _enter("{%x:%u},{n=%pD},%d", + vnode->fid.vid, vnode->fid.vnode, file, + datasync); + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + + /* use a writeback record as a marker in the queue - when this reaches + * the front of the queue, all the outstanding writes are either + * completed or rejected */ + wb = kzalloc(sizeof(*wb), GFP_KERNEL); + if (!wb) { + ret = -ENOMEM; + goto out; + } + wb->vnode = vnode; + wb->first = 0; + wb->last = -1; + wb->offset_first = 0; + wb->to_last = PAGE_SIZE; + wb->usage = 1; + wb->state = AFS_WBACK_SYNCING; + init_waitqueue_head(&wb->waitq); + + spin_lock(&vnode->writeback_lock); + list_for_each_entry(xwb, &vnode->writebacks, link) { + if (xwb->state == AFS_WBACK_PENDING) + xwb->state = AFS_WBACK_CONFLICTING; + } + list_add_tail(&wb->link, &vnode->writebacks); + spin_unlock(&vnode->writeback_lock); + + /* push all the outstanding writebacks to the server */ + ret = afs_writeback_all(vnode); + if (ret < 0) { + afs_put_writeback(wb); + _leave(" = %d [wb]", ret); + goto out; + } + + /* wait for the preceding writes to actually complete */ + ret = wait_event_interruptible(wb->waitq, + wb->state == AFS_WBACK_COMPLETE || + vnode->writebacks.next == &wb->link); + afs_put_writeback(wb); + _leave(" = %d", ret); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * notification that a previously read-only page is about to become writable + * - if it returns an error, the caller will deliver a bus error signal + */ +int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); + + _enter("{{%x:%u}},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); + + /* wait for the page to be written to the cache before we allow it to + * be modified */ +#ifdef CONFIG_AFS_FSCACHE + fscache_wait_on_page_write(vnode->cache, page); +#endif + + _leave(" = 0"); + return 0; +} diff --git a/kernel/fs/aio.c b/kernel/fs/aio.c new file mode 100644 index 000000000..5a2380de4 --- /dev/null +++ b/kernel/fs/aio.c @@ -0,0 +1,1742 @@ +/* + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * + * See ../COPYING for licensing terms. + */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; /* Written to by userland or under ring_lock + * mutex by aio_read_events_ring(). */ + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define AIO_RING_PAGES 8 + +struct kioctx_table { + struct rcu_head rcu; + unsigned nr; + struct kioctx *table[]; +}; + +struct kioctx_cpu { + unsigned reqs_available; +}; + +struct ctx_rq_wait { + struct completion comp; + atomic_t count; +}; + +struct kioctx { + struct percpu_ref users; + atomic_t dead; + + struct percpu_ref reqs; + + unsigned long user_id; + + struct __percpu kioctx_cpu *cpu; + + /* + * For percpu reqs_available, number of slots we move to/from global + * counter at a time: + */ + unsigned req_batch; + /* + * This is what userspace passed to io_setup(), it's not used for + * anything but counting against the global max_reqs quota. + * + * The real limit is nr_events - 1, which will be larger (see + * aio_setup_ring()) + */ + unsigned max_reqs; + + /* Size of ringbuffer, in units of struct io_event */ + unsigned nr_events; + + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + long nr_pages; + + struct swork_event free_work; + + /* + * signals when all in-flight requests are done + */ + struct ctx_rq_wait *rq_wait; + + struct { + /* + * This counts the number of available slots in the ringbuffer, + * so we avoid overflowing it: it's decremented (if positive) + * when allocating a kiocb and incremented when the resulting + * io_event is pulled off the ringbuffer. + * + * We batch accesses to it with a percpu version. + */ + atomic_t reqs_available; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t ctx_lock; + struct list_head active_reqs; /* used for cancellation */ + } ____cacheline_aligned_in_smp; + + struct { + struct mutex ring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct { + unsigned tail; + unsigned completed_events; + spinlock_t completion_lock; + } ____cacheline_aligned_in_smp; + + struct page *internal_pages[AIO_RING_PAGES]; + struct file *aio_ring_file; + + unsigned id; +}; + +/* + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either + * cancelled or completed (this makes a certain amount of sense because + * successful cancellation - io_cancel() - does deliver the completion to + * userspace). + * + * And since most things don't implement kiocb cancellation and we'd really like + * kiocb completion to be lockless when possible, we use ki_cancel to + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). + */ +#define KIOCB_CANCELLED ((void *) (~0ULL)) + +struct aio_kiocb { + struct kiocb common; + + struct kioctx *ki_ctx; + kiocb_cancel_fn *ki_cancel; + + struct iocb __user *ki_user_iocb; /* user's aiocb */ + __u64 ki_user_data; /* user's data for completion */ + + struct list_head ki_list; /* the aio core uses this + * for cancellation */ + + /* + * If the aio_resfd field of the userspace iocb is not zero, + * this is the underlying eventfd context to deliver events to. + */ + struct eventfd_ctx *ki_eventfd; +}; + +/*------ sysctl variables----*/ +static DEFINE_SPINLOCK(aio_nr_lock); +unsigned long aio_nr; /* current system wide number of aio requests */ +unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +/*----end sysctl variables---*/ + +static struct kmem_cache *kiocb_cachep; +static struct kmem_cache *kioctx_cachep; + +static struct vfsmount *aio_mnt; + +static const struct file_operations aio_ring_fops; +static const struct address_space_operations aio_ctx_aops; + +static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) +{ + struct qstr this = QSTR_INIT("[aio]", 5); + struct file *file; + struct path path; + struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + inode->i_mapping->a_ops = &aio_ctx_aops; + inode->i_mapping->private_data = ctx; + inode->i_size = PAGE_SIZE * nr_pages; + + path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); + if (!path.dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + path.mnt = mntget(aio_mnt); + + d_instantiate(path.dentry, inode); + file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops); + if (IS_ERR(file)) { + path_put(&path); + return file; + } + + file->f_flags = O_RDWR; + return file; +} + +static struct dentry *aio_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); +} + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. + */ +static int __init aio_setup(void) +{ + static struct file_system_type aio_fs = { + .name = "aio", + .mount = aio_mount, + .kill_sb = kill_anon_super, + }; + BUG_ON(swork_get()); + aio_mnt = kern_mount(&aio_fs); + if (IS_ERR(aio_mnt)) + panic("Failed to create aio fs mount."); + + kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); + + pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); + + return 0; +} +__initcall(aio_setup); + +static void put_aio_ring_file(struct kioctx *ctx) +{ + struct file *aio_ring_file = ctx->aio_ring_file; + if (aio_ring_file) { + truncate_setsize(aio_ring_file->f_inode, 0); + + /* Prevent further access to the kioctx from migratepages */ + spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock); + aio_ring_file->f_inode->i_mapping->private_data = NULL; + ctx->aio_ring_file = NULL; + spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock); + + fput(aio_ring_file); + } +} + +static void aio_free_ring(struct kioctx *ctx) +{ + int i; + + /* Disconnect the kiotx from the ring file. This prevents future + * accesses to the kioctx from page migration. + */ + put_aio_ring_file(ctx); + + for (i = 0; i < ctx->nr_pages; i++) { + struct page *page; + pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, + page_count(ctx->ring_pages[i])); + page = ctx->ring_pages[i]; + if (!page) + continue; + ctx->ring_pages[i] = NULL; + put_page(page); + } + + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { + kfree(ctx->ring_pages); + ctx->ring_pages = NULL; + } +} + +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_flags |= VM_DONTEXPAND; + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + struct kioctx_table *table; + int i, res = -EINVAL; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + for (i = 0; i < table->nr; i++) { + struct kioctx *ctx; + + ctx = table->table[i]; + if (ctx && ctx->aio_ring_file == file) { + if (!atomic_read(&ctx->dead)) { + ctx->user_id = ctx->mmap_base = vma->vm_start; + res = 0; + } + break; + } + } + + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); + return res; +} + +static const struct file_operations aio_ring_fops = { + .mmap = aio_ring_mmap, + .mremap = aio_ring_remap, +}; + +#if IS_ENABLED(CONFIG_MIGRATION) +static int aio_migratepage(struct address_space *mapping, struct page *new, + struct page *old, enum migrate_mode mode) +{ + struct kioctx *ctx; + unsigned long flags; + pgoff_t idx; + int rc; + + rc = 0; + + /* mapping->private_lock here protects against the kioctx teardown. */ + spin_lock(&mapping->private_lock); + ctx = mapping->private_data; + if (!ctx) { + rc = -EINVAL; + goto out; + } + + /* The ring_lock mutex. The prevents aio_read_events() from writing + * to the ring's head, and prevents page migration from mucking in + * a partially initialized kiotx. + */ + if (!mutex_trylock(&ctx->ring_lock)) { + rc = -EAGAIN; + goto out; + } + + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + /* Make sure the old page hasn't already been changed */ + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; + } else + rc = -EINVAL; + + if (rc != 0) + goto out_unlock; + + /* Writeback must be complete */ + BUG_ON(PageWriteback(old)); + get_page(new); + + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); + if (rc != MIGRATEPAGE_SUCCESS) { + put_page(new); + goto out_unlock; + } + + /* Take completion_lock to prevent other writes to the ring buffer + * while the old page is copied to the new. This prevents new + * events from being lost. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + BUG_ON(ctx->ring_pages[idx] != old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + /* The old page is no longer accessible. */ + put_page(old); + +out_unlock: + mutex_unlock(&ctx->ring_lock); +out: + spin_unlock(&mapping->private_lock); + return rc; +} +#endif + +static const struct address_space_operations aio_ctx_aops = { + .set_page_dirty = __set_page_dirty_no_writeback, +#if IS_ENABLED(CONFIG_MIGRATION) + .migratepage = aio_migratepage, +#endif +}; + +static int aio_setup_ring(struct kioctx *ctx) +{ + struct aio_ring *ring; + unsigned nr_events = ctx->max_reqs; + struct mm_struct *mm = current->mm; + unsigned long size, unused; + int nr_pages; + int i; + struct file *file; + + /* Compensate for the ring buffer's head/tail overlap entry */ + nr_events += 2; /* 1 is required, 2 for good luck */ + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_events; + + nr_pages = PFN_UP(size); + if (nr_pages < 0) + return -EINVAL; + + file = aio_private_file(ctx, nr_pages); + if (IS_ERR(file)) { + ctx->aio_ring_file = NULL; + return -ENOMEM; + } + + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); + + ctx->ring_pages = ctx->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) { + put_aio_ring_file(ctx); + return -ENOMEM; + } + } + + for (i = 0; i < nr_pages; i++) { + struct page *page; + page = find_or_create_page(file->f_inode->i_mapping, + i, GFP_HIGHUSER | __GFP_ZERO); + if (!page) + break; + pr_debug("pid(%d) page[%d]->count=%d\n", + current->pid, i, page_count(page)); + SetPageUptodate(page); + unlock_page(page); + + ctx->ring_pages[i] = page; + } + ctx->nr_pages = i; + + if (unlikely(i != nr_pages)) { + aio_free_ring(ctx); + return -ENOMEM; + } + + ctx->mmap_size = nr_pages * PAGE_SIZE; + pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + + down_write(&mm->mmap_sem); + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, 0, &unused); + up_write(&mm->mmap_sem); + if (IS_ERR((void *)ctx->mmap_base)) { + ctx->mmap_size = 0; + aio_free_ring(ctx); + return -ENOMEM; + } + + pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + + ctx->user_id = ctx->mmap_base; + ctx->nr_events = nr_events; /* trusted copy */ + + ring = kmap_atomic(ctx->ring_pages[0]); + ring->nr = nr_events; /* user copy */ + ring->id = ~0U; + ring->head = ring->tail = 0; + ring->magic = AIO_RING_MAGIC; + ring->compat_features = AIO_RING_COMPAT_FEATURES; + ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; + ring->header_length = sizeof(struct aio_ring); + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); + + return 0; +} + +#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) +#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) + +void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) +{ + struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (!req->ki_list.next) + list_add(&req->ki_list, &ctx->active_reqs); + + req->ki_cancel = cancel; + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} +EXPORT_SYMBOL(kiocb_set_cancel_fn); + +static int kiocb_cancel(struct aio_kiocb *kiocb) +{ + kiocb_cancel_fn *old, *cancel; + + /* + * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it + * actually has a cancel function, hence the cmpxchg() + */ + + cancel = ACCESS_ONCE(kiocb->ki_cancel); + do { + if (!cancel || cancel == KIOCB_CANCELLED) + return -EINVAL; + + old = cancel; + cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); + } while (cancel != old); + + return cancel(&kiocb->common); +} + +static void free_ioctx(struct swork_event *sev) +{ + struct kioctx *ctx = container_of(sev, struct kioctx, free_work); + + pr_debug("freeing %p\n", ctx); + + aio_free_ring(ctx); + free_percpu(ctx->cpu); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); + kmem_cache_free(kioctx_cachep, ctx); +} + +static void free_ioctx_reqs(struct percpu_ref *ref) +{ + struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + + /* At this point we know that there are no any in-flight requests */ + if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) + complete(&ctx->rq_wait->comp); + + INIT_SWORK(&ctx->free_work, free_ioctx); + swork_queue(&ctx->free_work); +} + +/* + * When this function runs, the kioctx has been removed from the "hash table" + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - + * now it's safe to cancel any that need to be. + */ +static void free_ioctx_users_work(struct swork_event *sev) +{ + struct kioctx *ctx = container_of(sev, struct kioctx, free_work); + struct aio_kiocb *req; + + spin_lock_irq(&ctx->ctx_lock); + + while (!list_empty(&ctx->active_reqs)) { + req = list_first_entry(&ctx->active_reqs, + struct aio_kiocb, ki_list); + + list_del_init(&req->ki_list); + kiocb_cancel(req); + } + + spin_unlock_irq(&ctx->ctx_lock); + + percpu_ref_kill(&ctx->reqs); + percpu_ref_put(&ctx->reqs); +} + +static void free_ioctx_users(struct percpu_ref *ref) +{ + struct kioctx *ctx = container_of(ref, struct kioctx, users); + + INIT_SWORK(&ctx->free_work, free_ioctx_users_work); + swork_queue(&ctx->free_work); +} + +static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) +{ + unsigned i, new_nr; + struct kioctx_table *table, *old; + struct aio_ring *ring; + + spin_lock(&mm->ioctx_lock); + table = rcu_dereference_raw(mm->ioctx_table); + + while (1) { + if (table) + for (i = 0; i < table->nr; i++) + if (!table->table[i]) { + ctx->id = i; + table->table[i] = ctx; + spin_unlock(&mm->ioctx_lock); + + /* While kioctx setup is in progress, + * we are protected from page migration + * changes ring_pages by ->ring_lock. + */ + ring = kmap_atomic(ctx->ring_pages[0]); + ring->id = ctx->id; + kunmap_atomic(ring); + return 0; + } + + new_nr = (table ? table->nr : 1) * 4; + spin_unlock(&mm->ioctx_lock); + + table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * + new_nr, GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->nr = new_nr; + + spin_lock(&mm->ioctx_lock); + old = rcu_dereference_raw(mm->ioctx_table); + + if (!old) { + rcu_assign_pointer(mm->ioctx_table, table); + } else if (table->nr > old->nr) { + memcpy(table->table, old->table, + old->nr * sizeof(struct kioctx *)); + + rcu_assign_pointer(mm->ioctx_table, table); + kfree_rcu(old, rcu); + } else { + kfree(table); + table = old; + } + } +} + +static void aio_nr_sub(unsigned nr) +{ + spin_lock(&aio_nr_lock); + if (WARN_ON(aio_nr - nr > aio_nr)) + aio_nr = 0; + else + aio_nr -= nr; + spin_unlock(&aio_nr_lock); +} + +/* ioctx_alloc + * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_events) +{ + struct mm_struct *mm = current->mm; + struct kioctx *ctx; + int err = -ENOMEM; + + /* + * We keep track of the number of available ringbuffer slots, to prevent + * overflow (reqs_available), and we also use percpu counters for this. + * + * So since up to half the slots might be on other cpu's percpu counters + * and unavailable, double nr_events so userspace sees what they + * expected: additionally, we move req_batch slots to/from percpu + * counters at a time, so make sure that isn't 0: + */ + nr_events = max(nr_events, num_possible_cpus() * 4); + nr_events *= 2; + + /* Prevent overflows */ + if (nr_events > (0x10000000U / sizeof(struct io_event))) { + pr_debug("ENOMEM: nr_events too high\n"); + return ERR_PTR(-EINVAL); + } + + if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) + return ERR_PTR(-EAGAIN); + + ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + ctx->max_reqs = nr_events; + + spin_lock_init(&ctx->ctx_lock); + spin_lock_init(&ctx->completion_lock); + mutex_init(&ctx->ring_lock); + /* Protect against page migration throughout kiotx setup by keeping + * the ring_lock mutex held until setup is complete. */ + mutex_lock(&ctx->ring_lock); + init_waitqueue_head(&ctx->wait); + + INIT_LIST_HEAD(&ctx->active_reqs); + + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) + goto err; + + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) + goto err; + + ctx->cpu = alloc_percpu(struct kioctx_cpu); + if (!ctx->cpu) + goto err; + + err = aio_setup_ring(ctx); + if (err < 0) + goto err; + + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); + if (ctx->req_batch < 1) + ctx->req_batch = 1; + + /* limit the number of system wide aios */ + spin_lock(&aio_nr_lock); + if (aio_nr + nr_events > (aio_max_nr * 2UL) || + aio_nr + nr_events < aio_nr) { + spin_unlock(&aio_nr_lock); + err = -EAGAIN; + goto err_ctx; + } + aio_nr += ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ + + err = ioctx_add_table(ctx, mm); + if (err) + goto err_cleanup; + + /* Release the ring_lock mutex now that all setup is complete. */ + mutex_unlock(&ctx->ring_lock); + + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, mm, ctx->nr_events); + return ctx; + +err_cleanup: + aio_nr_sub(ctx->max_reqs); +err_ctx: + atomic_set(&ctx->dead, 1); + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); + aio_free_ring(ctx); +err: + mutex_unlock(&ctx->ring_lock); + free_percpu(ctx->cpu); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); + kmem_cache_free(kioctx_cachep, ctx); + pr_debug("error allocating ioctx %d\n", err); + return ERR_PTR(err); +} + +/* kill_ioctx + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct ctx_rq_wait *wait) +{ + struct kioctx_table *table; + + spin_lock(&mm->ioctx_lock); + if (atomic_xchg(&ctx->dead, 1)) { + spin_unlock(&mm->ioctx_lock); + return -EINVAL; + } + + table = rcu_dereference_raw(mm->ioctx_table); + WARN_ON(ctx != table->table[ctx->id]); + table->table[ctx->id] = NULL; + spin_unlock(&mm->ioctx_lock); + + /* percpu_ref_kill() will do the necessary call_rcu() */ + wake_up_all(&ctx->wait); + + /* + * It'd be more correct to do this in free_ioctx(), after all + * the outstanding kiocbs have finished - but by then io_destroy + * has already returned, so io_setup() could potentially return + * -EAGAIN with no ioctxs actually in use (as far as userspace + * could tell). + */ + aio_nr_sub(ctx->max_reqs); + + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); + + ctx->rq_wait = wait; + percpu_ref_kill(&ctx->users); + return 0; +} + +/* + * exit_aio: called when the last user of mm goes away. At this point, there is + * no way for any new requests to be submited or any of the io_* syscalls to be + * called on the context. + * + * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on + * them. + */ +void exit_aio(struct mm_struct *mm) +{ + struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); + struct ctx_rq_wait wait; + int i, skipped; + + if (!table) + return; + + atomic_set(&wait.count, table->nr); + init_completion(&wait.comp); + + skipped = 0; + for (i = 0; i < table->nr; ++i) { + struct kioctx *ctx = table->table[i]; + + if (!ctx) { + skipped++; + continue; + } + + /* + * We don't need to bother with munmap() here - exit_mmap(mm) + * is coming and it'll unmap everything. And we simply can't, + * this is not necessarily our ->mm. + * Since kill_ioctx() uses non-zero ->mmap_size as indicator + * that it needs to unmap the area, just set it to 0. + */ + ctx->mmap_size = 0; + kill_ioctx(mm, ctx, &wait); + } + + if (!atomic_sub_and_test(skipped, &wait.count)) { + /* Wait until all IO for the context are done. */ + wait_for_completion(&wait.comp); + } + + RCU_INIT_POINTER(mm->ioctx_table, NULL); + kfree(table); +} + +static void put_reqs_available(struct kioctx *ctx, unsigned nr) +{ + struct kioctx_cpu *kcpu; + unsigned long flags; + + local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); + kcpu->reqs_available += nr; + + while (kcpu->reqs_available >= ctx->req_batch * 2) { + kcpu->reqs_available -= ctx->req_batch; + atomic_add(ctx->req_batch, &ctx->reqs_available); + } + + local_irq_restore(flags); +} + +static bool get_reqs_available(struct kioctx *ctx) +{ + struct kioctx_cpu *kcpu; + bool ret = false; + unsigned long flags; + + local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); + if (!kcpu->reqs_available) { + int old, avail = atomic_read(&ctx->reqs_available); + + do { + if (avail < ctx->req_batch) + goto out; + + old = avail; + avail = atomic_cmpxchg(&ctx->reqs_available, + avail, avail - ctx->req_batch); + } while (avail != old); + + kcpu->reqs_available += ctx->req_batch; + } + + ret = true; + kcpu->reqs_available--; +out: + local_irq_restore(flags); + return ret; +} + +/* refill_reqs_available + * Updates the reqs_available reference counts used for tracking the + * number of free slots in the completion ring. This can be called + * from aio_complete() (to optimistically update reqs_available) or + * from aio_get_req() (the we're out of events case). It must be + * called holding ctx->completion_lock. + */ +static void refill_reqs_available(struct kioctx *ctx, unsigned head, + unsigned tail) +{ + unsigned events_in_ring, completed; + + /* Clamp head since userland can write to it. */ + head %= ctx->nr_events; + if (head <= tail) + events_in_ring = tail - head; + else + events_in_ring = ctx->nr_events - (head - tail); + + completed = ctx->completed_events; + if (events_in_ring < completed) + completed -= events_in_ring; + else + completed = 0; + + if (!completed) + return; + + ctx->completed_events -= completed; + put_reqs_available(ctx, completed); +} + +/* user_refill_reqs_available + * Called to refill reqs_available when aio_get_req() encounters an + * out of space in the completion ring. + */ +static void user_refill_reqs_available(struct kioctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + if (ctx->completed_events) { + struct aio_ring *ring; + unsigned head; + + /* Access of ring->head may race with aio_read_events_ring() + * here, but that's okay since whether we read the old version + * or the new version, and either will be valid. The important + * part is that head cannot pass tail since we prevent + * aio_complete() from updating tail by holding + * ctx->completion_lock. Even if head is invalid, the check + * against ctx->completed_events below will make sure we do the + * safe/right thing. + */ + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + refill_reqs_available(ctx, head, ctx->tail); + } + + spin_unlock_irq(&ctx->completion_lock); +} + +/* aio_get_req + * Allocate a slot for an aio request. + * Returns NULL if no requests are free. + */ +static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) +{ + struct aio_kiocb *req; + + if (!get_reqs_available(ctx)) { + user_refill_reqs_available(ctx); + if (!get_reqs_available(ctx)) + return NULL; + } + + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + if (unlikely(!req)) + goto out_put; + + percpu_ref_get(&ctx->reqs); + + req->ki_ctx = ctx; + return req; +out_put: + put_reqs_available(ctx, 1); + return NULL; +} + +static void kiocb_free(struct aio_kiocb *req) +{ + if (req->common.ki_filp) + fput(req->common.ki_filp); + if (req->ki_eventfd != NULL) + eventfd_ctx_put(req->ki_eventfd); + kmem_cache_free(kiocb_cachep, req); +} + +static struct kioctx *lookup_ioctx(unsigned long ctx_id) +{ + struct aio_ring __user *ring = (void __user *)ctx_id; + struct mm_struct *mm = current->mm; + struct kioctx *ctx, *ret = NULL; + struct kioctx_table *table; + unsigned id; + + if (get_user(id, &ring->id)) + return NULL; + + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + if (!table || id >= table->nr) + goto out; + + ctx = table->table[id]; + if (ctx && ctx->user_id == ctx_id) { + percpu_ref_get(&ctx->users); + ret = ctx; + } +out: + rcu_read_unlock(); + return ret; +} + +/* aio_complete + * Called when the io request on the given iocb is complete. + */ +static void aio_complete(struct kiocb *kiocb, long res, long res2) +{ + struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common); + struct kioctx *ctx = iocb->ki_ctx; + struct aio_ring *ring; + struct io_event *ev_page, *event; + unsigned tail, pos, head; + unsigned long flags; + + /* + * Special case handling for sync iocbs: + * - events go directly into the iocb for fast handling + * - the sync task with the iocb in its stack holds the single iocb + * ref, no other paths have a way to get another ref + * - the sync task helpfully left a reference to itself in the iocb + */ + BUG_ON(is_sync_kiocb(kiocb)); + + if (iocb->ki_list.next) { + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + } + + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->completion_lock to prevent other code from messing with the tail + * pointer since we might be called from irq context. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); + + tail = ctx->tail; + pos = tail + AIO_EVENTS_OFFSET; + + if (++tail >= ctx->nr_events) + tail = 0; + + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; + + event->obj = (u64)(unsigned long)iocb->ki_user_iocb; + event->data = iocb->ki_user_data; + event->res = res; + event->res2 = res2; + + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + + pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, + res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + smp_wmb(); /* make event visible before updating tail */ + + ctx->tail = tail; + + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + ring->tail = tail; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); + + ctx->completed_events++; + if (ctx->completed_events > 1) + refill_reqs_available(ctx, head, tail); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + pr_debug("added to ring %p at [%u]\n", iocb, tail); + + /* + * Check if the user asked us to deliver the result through an + * eventfd. The eventfd_signal() function is safe to be called + * from IRQ context. + */ + if (iocb->ki_eventfd != NULL) + eventfd_signal(iocb->ki_eventfd, 1); + + /* everything turned out well, dispose of the aiocb. */ + kiocb_free(iocb); + + /* + * We have to order our ring_info tail store above and test + * of the wait list below outside the wait lock. This is + * like in wake_up_bit() where clearing a bit has to be + * ordered with the unlocked test. + */ + smp_mb(); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + + percpu_ref_put(&ctx->reqs); +} + +/* aio_read_events_ring + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched + */ +static long aio_read_events_ring(struct kioctx *ctx, + struct io_event __user *event, long nr) +{ + struct aio_ring *ring; + unsigned head, tail, pos; + long ret = 0; + int copy_ret; + + /* + * The mutex can block and wake us up and that will cause + * wait_event_interruptible_hrtimeout() to schedule without sleeping + * and repeat. This should be rare enough that it doesn't cause + * peformance issues. See the comment in read_events() for more detail. + */ + sched_annotate_sleep(); + mutex_lock(&ctx->ring_lock); + + /* Access to ->ring_pages here is protected by ctx->ring_lock. */ + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + tail = ring->tail; + kunmap_atomic(ring); + + /* + * Ensure that once we've read the current tail pointer, that + * we also see the events that were stored up to the tail. + */ + smp_rmb(); + + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); + + if (head == tail) + goto out; + + head %= ctx->nr_events; + tail %= ctx->nr_events; + + while (ret < nr) { + long avail; + struct io_event *ev; + struct page *page; + + avail = (head <= tail ? tail : ctx->nr_events) - head; + if (head == tail) + break; + + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - + ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); + + pos = head + AIO_EVENTS_OFFSET; + page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; + pos %= AIO_EVENTS_PER_PAGE; + + ev = kmap(page); + copy_ret = copy_to_user(event + ret, ev + pos, + sizeof(*ev) * avail); + kunmap(page); + + if (unlikely(copy_ret)) { + ret = -EFAULT; + goto out; + } + + ret += avail; + head += avail; + head %= ctx->nr_events; + } + + ring = kmap_atomic(ctx->ring_pages[0]); + ring->head = head; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); + + pr_debug("%li h%u t%u\n", ret, head, tail); +out: + mutex_unlock(&ctx->ring_lock); + + return ret; +} + +static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, long *i) +{ + long ret = aio_read_events_ring(ctx, event + *i, nr - *i); + + if (ret > 0) + *i += ret; + + if (unlikely(atomic_read(&ctx->dead))) + ret = -EINVAL; + + if (!*i) + *i = ret; + + return ret < 0 || *i >= min_nr; +} + +static long read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, + struct timespec __user *timeout) +{ + ktime_t until = { .tv64 = KTIME_MAX }; + long ret = 0; + + if (timeout) { + struct timespec ts; + + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + return -EFAULT; + + until = timespec_to_ktime(ts); + } + + /* + * Note that aio_read_events() is being called as the conditional - i.e. + * we're calling it after prepare_to_wait() has set task state to + * TASK_INTERRUPTIBLE. + * + * But aio_read_events() can block, and if it blocks it's going to flip + * the task state back to TASK_RUNNING. + * + * This should be ok, provided it doesn't flip the state back to + * TASK_RUNNING and return 0 too much - that causes us to spin. That + * will only happen if the mutex_lock() call blocks, and we then find + * the ringbuffer empty. So in practice we should be ok, but it's + * something to be aware of when touching this code. + */ + if (until.tv64 == 0) + aio_read_events(ctx, min_nr, nr, event, &ret); + else + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), + until); + + if (!ret && signal_pending(current)) + ret = -EINTR; + + return ret; +} + +/* sys_io_setup: + * Create an aio_context capable of receiving at least nr_events. + * ctxp must not point to an aio_context that already exists, and + * must be initialized to 0 prior to the call. On successful + * creation of the aio_context, *ctxp is filled in with the resulting + * handle. May fail with -EINVAL if *ctxp is not initialized, + * if the specified nr_events exceeds internal limits. May fail + * with -EAGAIN if the specified nr_events exceeds the user's limit + * of available events. May fail with -ENOMEM if insufficient kernel + * resources are available. May fail with -EFAULT if an invalid + * pointer is passed for ctxp. Will fail with -ENOSYS if not + * implemented. + */ +SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || nr_events == 0)) { + pr_debug("EINVAL: ctx %lu nr_events %u\n", + ctx, nr_events); + goto out; + } + + ioctx = ioctx_alloc(nr_events); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (ret) + kill_ioctx(current->mm, ioctx, NULL); + percpu_ref_put(&ioctx->users); + } + +out: + return ret; +} + +/* sys_io_destroy: + * Destroy the aio_context specified. May cancel any outstanding + * AIOs and block on completion. Will fail with -ENOSYS if not + * implemented. May fail with -EINVAL if the context pointed to + * is invalid. + */ +SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (likely(NULL != ioctx)) { + struct ctx_rq_wait wait; + int ret; + + init_completion(&wait.comp); + atomic_set(&wait.count, 1); + + /* Pass requests_done to kill_ioctx() where it can be set + * in a thread-safe way. If we try to set it here then we have + * a race condition if two io_destroy() called simultaneously. + */ + ret = kill_ioctx(current->mm, ioctx, &wait); + percpu_ref_put(&ioctx->users); + + /* Wait until all IO for the context are done. Otherwise kernel + * keep using user-space buffers even if user thinks the context + * is destroyed. + */ + if (!ret) + wait_for_completion(&wait.comp); + + return ret; + } + pr_debug("EINVAL: invalid context id\n"); + return -EINVAL; +} + +typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); + +static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len, + struct iovec **iovec, + bool compat, + struct iov_iter *iter) +{ +#ifdef CONFIG_COMPAT + if (compat) + return compat_import_iovec(rw, + (struct compat_iovec __user *)buf, + len, UIO_FASTIOV, iovec, iter); +#endif + return import_iovec(rw, (struct iovec __user *)buf, + len, UIO_FASTIOV, iovec, iter); +} + +/* + * aio_run_iocb: + * Performs the initial checks and io submission. + */ +static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, + char __user *buf, size_t len, bool compat) +{ + struct file *file = req->ki_filp; + ssize_t ret; + int rw; + fmode_t mode; + rw_iter_op *iter_op; + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; + + switch (opcode) { + case IOCB_CMD_PREAD: + case IOCB_CMD_PREADV: + mode = FMODE_READ; + rw = READ; + iter_op = file->f_op->read_iter; + goto rw_common; + + case IOCB_CMD_PWRITE: + case IOCB_CMD_PWRITEV: + mode = FMODE_WRITE; + rw = WRITE; + iter_op = file->f_op->write_iter; + goto rw_common; +rw_common: + if (unlikely(!(file->f_mode & mode))) + return -EBADF; + + if (!iter_op) + return -EINVAL; + + if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) + ret = aio_setup_vectored_rw(rw, buf, len, + &iovec, compat, &iter); + else { + ret = import_single_range(rw, buf, len, iovec, &iter); + iovec = NULL; + } + if (!ret) + ret = rw_verify_area(rw, file, &req->ki_pos, + iov_iter_count(&iter)); + if (ret < 0) { + kfree(iovec); + return ret; + } + + len = ret; + + if (rw == WRITE) + file_start_write(file); + + ret = iter_op(req, &iter); + + if (rw == WRITE) + file_end_write(file); + kfree(iovec); + break; + + case IOCB_CMD_FDSYNC: + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 1); + break; + + case IOCB_CMD_FSYNC: + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 0); + break; + + default: + pr_debug("EINVAL: no operation provided\n"); + return -EINVAL; + } + + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || + ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(req, ret, 0); + } + + return 0; +} + +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, + struct iocb *iocb, bool compat) +{ + struct aio_kiocb *req; + ssize_t ret; + + /* enforce forwards compatibility on users */ + if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + pr_debug("EINVAL: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) + )) { + pr_debug("EINVAL: overflow check\n"); + return -EINVAL; + } + + req = aio_get_req(ctx); + if (unlikely(!req)) + return -EAGAIN; + + req->common.ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->common.ki_filp)) { + ret = -EBADF; + goto out_put_req; + } + req->common.ki_pos = iocb->aio_offset; + req->common.ki_complete = aio_complete; + req->common.ki_flags = iocb_flags(req->common.ki_filp); + + if (iocb->aio_flags & IOCB_FLAG_RESFD) { + /* + * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an + * instance of the file* now. The file descriptor must be + * an eventfd() fd, and will be signaled for each completed + * event using the eventfd_signal() function. + */ + req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); + if (IS_ERR(req->ki_eventfd)) { + ret = PTR_ERR(req->ki_eventfd); + req->ki_eventfd = NULL; + goto out_put_req; + } + + req->common.ki_flags |= IOCB_EVENTFD; + } + + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); + if (unlikely(ret)) { + pr_debug("EFAULT: aio_key\n"); + goto out_put_req; + } + + req->ki_user_iocb = user_iocb; + req->ki_user_data = iocb->aio_data; + + ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode, + (char __user *)(unsigned long)iocb->aio_buf, + iocb->aio_nbytes, + compat); + if (ret) + goto out_put_req; + + return 0; +out_put_req: + put_reqs_available(ctx, 1); + percpu_ref_put(&ctx->reqs); + kiocb_free(req); + return ret; +} + +long do_io_submit(aio_context_t ctx_id, long nr, + struct iocb __user *__user *iocbpp, bool compat) +{ + struct kioctx *ctx; + long ret = 0; + int i = 0; + struct blk_plug plug; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) + nr = LONG_MAX/sizeof(*iocbpp); + + if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: invalid context id\n"); + return -EINVAL; + } + + blk_start_plug(&plug); + + /* + * AKPM: should this return a partial result if some of the IOs were + * successfully submitted? + */ + for (i=0; iusers); + return i ? i : ret; +} + +/* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context + * specified by ctx_id is invalid, if nr is < 0, if the iocb at + * *iocbpp[0] is not properly initialized, if the operation specified + * is invalid for the file descriptor in the iocb. May fail with + * -EFAULT if any of the data structures point to invalid data. May + * fail with -EBADF if the file descriptor specified in the first + * iocb is invalid. May fail with -EAGAIN if insufficient resources + * are available to queue any iocbs. Will return 0 if nr is 0. Will + * fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, + struct iocb __user * __user *, iocbpp) +{ + return do_io_submit(ctx_id, nr, iocbpp, 0); +} + +/* lookup_kiocb + * Finds a given iocb for cancellation. + */ +static struct aio_kiocb * +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) +{ + struct aio_kiocb *kiocb; + + assert_spin_locked(&ctx->ctx_lock); + + if (key != KIOCB_KEY) + return NULL; + + /* TODO: use a hash or array, this sucks. */ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_user_iocb == iocb) + return kiocb; + } + return NULL; +} + +/* sys_io_cancel: + * Attempts to cancel an iocb previously passed to io_submit. If + * the operation is successfully cancelled, the resulting event is + * copied into the memory pointed to by result without being placed + * into the completion queue and 0 is returned. May fail with + * -EFAULT if any of the data structures pointed to are invalid. + * May fail with -EINVAL if aio_context specified by ctx_id is + * invalid. May fail with -EAGAIN if the iocb specified was not + * cancelled. Will fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, + struct io_event __user *, result) +{ + struct kioctx *ctx; + struct aio_kiocb *kiocb; + u32 key; + int ret; + + ret = get_user(key, &iocb->aio_key); + if (unlikely(ret)) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) + return -EINVAL; + + spin_lock_irq(&ctx->ctx_lock); + + kiocb = lookup_kiocb(ctx, iocb, key); + if (kiocb) + ret = kiocb_cancel(kiocb); + else + ret = -EINVAL; + + spin_unlock_irq(&ctx->ctx_lock); + + if (!ret) { + /* + * The result argument is no longer used - the io_event is + * always delivered via the ring buffer. -EINPROGRESS indicates + * cancellation is progress: + */ + ret = -EINPROGRESS; + } + + percpu_ref_put(&ctx->users); + + return ret; +} + +/* io_getevents: + * Attempts to read at least min_nr events and up to nr events from + * the completion queue for the aio_context specified by ctx_id. If + * it succeeds, the number of read events is returned. May fail with + * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is + * out of range, if timeout is out of range. May fail with -EFAULT + * if any of the memory specified is invalid. May return 0 or + * < min_nr if the timeout specified by timeout has elapsed + * before sufficient events are available, where timeout == NULL + * specifies an infinite timeout. Note that the timeout pointed to by + * timeout is relative. Will fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, + long, min_nr, + long, nr, + struct io_event __user *, events, + struct timespec __user *, timeout) +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(ioctx)) { + if (likely(min_nr <= nr && min_nr >= 0)) + ret = read_events(ioctx, min_nr, nr, events, timeout); + percpu_ref_put(&ioctx->users); + } + return ret; +} diff --git a/kernel/fs/anon_inodes.c b/kernel/fs/anon_inodes.c new file mode 100644 index 000000000..80ef38c73 --- /dev/null +++ b/kernel/fs/anon_inodes.c @@ -0,0 +1,179 @@ +/* + * fs/anon_inodes.c + * + * Copyright (C) 2007 Davide Libenzi + * + * Thanks to Arnd Bergmann for code review and suggestions. + * More changes for Thomas Gleixner suggestions. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static struct vfsmount *anon_inode_mnt __read_mostly; +static struct inode *anon_inode_inode; + +/* + * anon_inodefs_dname() is called from d_path(). + */ +static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + dentry->d_name.name); +} + +static const struct dentry_operations anon_inodefs_dentry_operations = { + .d_dname = anon_inodefs_dname, +}; + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "anon_inode:", NULL, + &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); +} + +static struct file_system_type anon_inode_fs_type = { + .name = "anon_inodefs", + .mount = anon_inodefs_mount, + .kill_sb = kill_anon_super, +}; + +/** + * anon_inode_getfile - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * Creates a new file by hooking it on a single inode. This is useful for files + * that do not need to have a full-fledged inode in order to operate correctly. + * All the files created with anon_inode_getfile() will share a single inode, + * hence saving memory and avoiding code duplication for the file/inode/dentry + * setup. Returns the newly created file* or an error pointer. + */ +struct file *anon_inode_getfile(const char *name, + const struct file_operations *fops, + void *priv, int flags) +{ + struct qstr this; + struct path path; + struct file *file; + + if (IS_ERR(anon_inode_inode)) + return ERR_PTR(-ENODEV); + + if (fops->owner && !try_module_get(fops->owner)) + return ERR_PTR(-ENOENT); + + /* + * Link the inode to a directory entry by creating a unique name + * using the inode sequence number. + */ + file = ERR_PTR(-ENOMEM); + this.name = name; + this.len = strlen(name); + this.hash = 0; + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); + if (!path.dentry) + goto err_module; + + path.mnt = mntget(anon_inode_mnt); + /* + * We know the anon_inode inode count is always greater than zero, + * so ihold() is safe. + */ + ihold(anon_inode_inode); + + d_instantiate(path.dentry, anon_inode_inode); + + file = alloc_file(&path, OPEN_FMODE(flags), fops); + if (IS_ERR(file)) + goto err_dput; + file->f_mapping = anon_inode_inode->i_mapping; + + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); + file->private_data = priv; + + return file; + +err_dput: + path_put(&path); +err_module: + module_put(fops->owner); + return file; +} +EXPORT_SYMBOL_GPL(anon_inode_getfile); + +/** + * anon_inode_getfd - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * Creates a new file by hooking it on a single inode. This is useful for files + * that do not need to have a full-fledged inode in order to operate correctly. + * All the files created with anon_inode_getfd() will share a single inode, + * hence saving memory and avoiding code duplication for the file/inode/dentry + * setup. Returns new descriptor or an error code. + */ +int anon_inode_getfd(const char *name, const struct file_operations *fops, + void *priv, int flags) +{ + int error, fd; + struct file *file; + + error = get_unused_fd_flags(flags); + if (error < 0) + return error; + fd = error; + + file = anon_inode_getfile(name, fops, priv, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + return error; +} +EXPORT_SYMBOL_GPL(anon_inode_getfd); + +static int __init anon_inode_init(void) +{ + anon_inode_mnt = kern_mount(&anon_inode_fs_type); + if (IS_ERR(anon_inode_mnt)) + panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt)); + + anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); + if (IS_ERR(anon_inode_inode)) + panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode)); + + return 0; +} + +fs_initcall(anon_inode_init); + diff --git a/kernel/fs/attr.c b/kernel/fs/attr.c new file mode 100644 index 000000000..6530ced19 --- /dev/null +++ b/kernel/fs/attr.c @@ -0,0 +1,278 @@ +/* + * linux/fs/attr.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * changes by Thomas Schoebel-Theuer + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * inode_change_ok - check if attribute changes to an inode are allowed + * @inode: inode to check + * @attr: attributes to change + * + * Check if we are allowed to change the attributes contained in @attr + * in the given inode. This includes the normal unix access permission + * checks, as well as checks for rlimits and others. + * + * Should be called as the first thing in ->setattr implementations, + * possibly after taking additional locks. + */ +int inode_change_ok(const struct inode *inode, struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + /* + * First check size constraints. These can't be overriden using + * ATTR_FORCE. + */ + if (ia_valid & ATTR_SIZE) { + int error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + } + + /* If force is set do it anyway. */ + if (ia_valid & ATTR_FORCE) + return 0; + + /* Make sure a caller can chown. */ + if ((ia_valid & ATTR_UID) && + (!uid_eq(current_fsuid(), inode->i_uid) || + !uid_eq(attr->ia_uid, inode->i_uid)) && + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + return -EPERM; + + /* Make sure caller can chgrp. */ + if ((ia_valid & ATTR_GID) && + (!uid_eq(current_fsuid(), inode->i_uid) || + (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + return -EPERM; + + /* Make sure a caller can chmod. */ + if (ia_valid & ATTR_MODE) { + if (!inode_owner_or_capable(inode)) + return -EPERM; + /* Also check the setgid bit! */ + if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : + inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + attr->ia_mode &= ~S_ISGID; + } + + /* Check for setting the inode time. */ + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { + if (!inode_owner_or_capable(inode)) + return -EPERM; + } + + return 0; +} +EXPORT_SYMBOL(inode_change_ok); + +/** + * inode_newsize_ok - may this inode be truncated to a given size + * @inode: the inode to be truncated + * @offset: the new size to assign to the inode + * @Returns: 0 on success, -ve errno on failure + * + * inode_newsize_ok must be called with i_mutex held. + * + * inode_newsize_ok will check filesystem limits and ulimits to check that the + * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ + * when necessary. Caller must not proceed with inode size change if failure is + * returned. @inode must be a file (not directory), with appropriate + * permissions to allow truncate (inode_newsize_ok does NOT check these + * conditions). + */ +int inode_newsize_ok(const struct inode *inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = rlimit(RLIMIT_FSIZE); + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + } else { + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + } + + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(inode_newsize_ok); + +/** + * setattr_copy - copy simple metadata updates into the generic inode + * @inode: the inode to be updated + * @attr: the new attributes + * + * setattr_copy must be called with i_mutex held. + * + * setattr_copy updates the inode's metadata with that specified + * in attr. Noticeably missing is inode size update, which is more complex + * as it requires pagecache updates. + * + * The inode is not marked as dirty after this operation. The rationale is + * that for "simple" filesystems, the struct inode is the inode storage. + * The caller is free to mark the inode dirty afterwards if needed. + */ +void setattr_copy(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} +EXPORT_SYMBOL(setattr_copy); + +/** + * notify_change - modify attributes of a filesytem object + * @dentry: object affected + * @iattr: new attributes + * @delegated_inode: returns inode, if the inode is delegated + * + * The caller must hold the i_mutex on the affected object. + * + * If notify_change discovers a delegation in need of breaking, + * it will return -EWOULDBLOCK and return a reference to the inode in + * delegated_inode. The caller should then break the delegation and + * retry. Because breaking a delegation may take a long time, the + * caller should drop the i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. Also, passing NULL is fine for callers holding + * the file open for write, as there can be no conflicting delegation in + * that case. + */ +int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) +{ + struct inode *inode = dentry->d_inode; + umode_t mode = inode->i_mode; + int error; + struct timespec now; + unsigned int ia_valid = attr->ia_valid; + + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + if ((ia_valid & ATTR_MODE)) { + umode_t amode = attr->ia_mode; + /* Flag setting protected by i_mutex */ + if (is_sxid(amode)) + inode->i_flags &= ~S_NOSEC; + } + + now = current_fs_time(inode->i_sb); + + attr->ia_ctime = now; + if (!(ia_valid & ATTR_ATIME_SET)) + attr->ia_atime = now; + if (!(ia_valid & ATTR_MTIME_SET)) + attr->ia_mtime = now; + if (ia_valid & ATTR_KILL_PRIV) { + attr->ia_valid &= ~ATTR_KILL_PRIV; + ia_valid &= ~ATTR_KILL_PRIV; + error = security_inode_need_killpriv(dentry); + if (error > 0) + error = security_inode_killpriv(dentry); + if (error) + return error; + } + + /* + * We now pass ATTR_KILL_S*ID to the lower level setattr function so + * that the function has the ability to reinterpret a mode change + * that's due to these bits. This adds an implicit restriction that + * no function will ever call notify_change with both ATTR_MODE and + * ATTR_KILL_S*ID set. + */ + if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && + (ia_valid & ATTR_MODE)) + BUG(); + + if (ia_valid & ATTR_KILL_SUID) { + if (mode & S_ISUID) { + ia_valid = attr->ia_valid |= ATTR_MODE; + attr->ia_mode = (inode->i_mode & ~S_ISUID); + } + } + if (ia_valid & ATTR_KILL_SGID) { + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (!(ia_valid & ATTR_MODE)) { + ia_valid = attr->ia_valid |= ATTR_MODE; + attr->ia_mode = inode->i_mode; + } + attr->ia_mode &= ~S_ISGID; + } + } + if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) + return 0; + + error = security_inode_setattr(dentry, attr); + if (error) + return error; + error = try_break_deleg(inode, delegated_inode); + if (error) + return error; + + if (inode->i_op->setattr) + error = inode->i_op->setattr(dentry, attr); + else + error = simple_setattr(dentry, attr); + + if (!error) { + fsnotify_change(dentry, ia_valid); + ima_inode_post_setattr(dentry); + evm_inode_post_setattr(dentry, ia_valid); + } + + return error; +} +EXPORT_SYMBOL(notify_change); diff --git a/kernel/fs/autofs4/Kconfig b/kernel/fs/autofs4/Kconfig new file mode 100644 index 000000000..1204d6384 --- /dev/null +++ b/kernel/fs/autofs4/Kconfig @@ -0,0 +1,20 @@ +config AUTOFS4_FS + tristate "Kernel automounter version 4 support (also supports v3)" + help + The automounter is a tool to automatically mount remote file systems + on demand. This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from + ; you also + want to answer Y to "NFS file system support", below. + + To compile this support as a module, choose M here: the module will be + called autofs4. You will need to add "alias autofs autofs4" to your + modules configuration file. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. diff --git a/kernel/fs/autofs4/Makefile b/kernel/fs/autofs4/Makefile new file mode 100644 index 000000000..a811c1f7d --- /dev/null +++ b/kernel/fs/autofs4/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux autofs-filesystem routines. +# + +obj-$(CONFIG_AUTOFS4_FS) += autofs4.o + +autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/kernel/fs/autofs4/autofs_i.h b/kernel/fs/autofs4/autofs_i.h new file mode 100644 index 000000000..63cf85daa --- /dev/null +++ b/kernel/fs/autofs4/autofs_i.h @@ -0,0 +1,284 @@ +/* -*- c -*- ------------------------------------------------------------- * + * + * linux/fs/autofs/autofs_i.h + * + * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved + * Copyright 2005-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* Internal header file for autofs */ + +#include +#include +#include +#include +#include + +/* This is the range of ioctl() numbers we claim as ours */ +#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY +#define AUTOFS_IOC_COUNT 32 + +#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) +#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* #define DEBUG */ + +#define DPRINTK(fmt, ...) \ + pr_debug("pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +#define AUTOFS_WARN(fmt, ...) \ + printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +#define AUTOFS_ERROR(fmt, ...) \ + printk(KERN_ERR "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +/* Unified info structure. This is pointed to by both the dentry and + inode structures. Each file in the filesystem has an instance of this + structure. It holds a reference to the dentry, so dentries are never + flushed while the file exists. All name lookups are dealt with at the + dentry level, although the filesystem can interfere in the validation + process. Readdir is implemented by traversing the dentry lists. */ +struct autofs_info { + struct dentry *dentry; + struct inode *inode; + + int flags; + + struct completion expire_complete; + + struct list_head active; + int active_count; + + struct list_head expiring; + + struct autofs_sb_info *sbi; + unsigned long last_used; + atomic_t count; + + kuid_t uid; + kgid_t gid; +}; + +#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered + * for expiry, so RCU_walk is + * not permitted + */ +#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ + +struct autofs_wait_queue { + wait_queue_head_t queue; + struct autofs_wait_queue *next; + autofs_wqt_t wait_queue_token; + /* We use the following to see what we are waiting for */ + struct qstr name; + u32 dev; + u64 ino; + kuid_t uid; + kgid_t gid; + pid_t pid; + pid_t tgid; + /* This is for status reporting upon return */ + int status; + unsigned int wait_ctr; +}; + +#define AUTOFS_SBI_MAGIC 0x6d4a556d + +struct autofs_sb_info { + u32 magic; + int pipefd; + struct file *pipe; + struct pid *oz_pgrp; + int catatonic; + int version; + int sub_version; + int min_proto; + int max_proto; + unsigned long exp_timeout; + unsigned int type; + int reghost_enabled; + int needs_reghost; + struct super_block *sb; + struct mutex wq_mutex; + struct mutex pipe_mutex; + spinlock_t fs_lock; + struct autofs_wait_queue *queues; /* Wait queue pointer */ + spinlock_t lookup_lock; + struct list_head active_list; + struct list_head expiring_list; + struct rcu_head rcu; +}; + +static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) +{ + return (struct autofs_sb_info *)(sb->s_fs_info); +} + +static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) +{ + return (struct autofs_info *)(dentry->d_fsdata); +} + +/* autofs4_oz_mode(): do we see the man behind the curtain? (The + processes which do manipulations for us in user space sees the raw + filesystem without "magic".) */ + +static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; +} + +struct inode *autofs4_get_inode(struct super_block *, umode_t); +void autofs4_free_ino(struct autofs_info *); + +/* Expiration */ +int is_autofs4_dentry(struct dentry *); +int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); +int autofs4_expire_run(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, + struct autofs_packet_expire __user *); +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); +int autofs4_expire_multi(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, int __user *); +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); + +/* Device node initialization */ + +int autofs_dev_ioctl_init(void); +void autofs_dev_ioctl_exit(void); + +/* Operations structures */ + +extern const struct inode_operations autofs4_symlink_inode_operations; +extern const struct inode_operations autofs4_dir_inode_operations; +extern const struct file_operations autofs4_dir_operations; +extern const struct file_operations autofs4_root_operations; +extern const struct dentry_operations autofs4_dentry_operations; + +/* VFS automount flags management functions */ +static inline void __managed_dentry_set_managed(struct dentry *dentry) +{ + dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_set_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_managed(struct dentry *dentry) +{ + dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_clear_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +/* Initializing function */ + +int autofs4_fill_super(struct super_block *, void *, int); +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); +void autofs4_clean_ino(struct autofs_info *); + +static inline int autofs_prepare_pipe(struct file *pipe) +{ + if (!(pipe->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + if (!S_ISFIFO(file_inode(pipe)->i_mode)) + return -EINVAL; + /* We want a packet pipe */ + pipe->f_flags |= O_DIRECT; + return 0; +} + +/* Queue management functions */ + +int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); +int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); +void autofs4_catatonic_mode(struct autofs_sb_info *); + +static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +{ + return new_encode_dev(sbi->sb->s_dev); +} + +static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +{ + return d_inode(sbi->sb->s_root)->i_ino; +} + +static inline int simple_positive(struct dentry *dentry) +{ + return d_really_is_positive(dentry) && !d_unhashed(dentry); +} + +static inline void __autofs4_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + } + return; +} + +static inline void autofs4_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static inline void autofs4_del_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } + return; +} + +extern void autofs4_kill_sb(struct super_block *); diff --git a/kernel/fs/autofs4/dev-ioctl.c b/kernel/fs/autofs4/dev-ioctl.c new file mode 100644 index 000000000..ac7d921ed --- /dev/null +++ b/kernel/fs/autofs4/dev-ioctl.c @@ -0,0 +1,762 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "autofs_i.h" + +/* + * This module implements an interface for routing autofs ioctl control + * commands via a miscellaneous device file. + * + * The alternate interface is needed because we need to be able open + * an ioctl file descriptor on an autofs mount that may be covered by + * another mount. This situation arises when starting automount(8) + * or other user space daemon which uses direct mounts or offset + * mounts (used for autofs lazy mount/umount of nested mount trees), + * which have been left busy at at service shutdown. + */ + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, + struct autofs_dev_ioctl *); + +static int check_name(const char *name) +{ + if (!strchr(name, '/')) + return -EINVAL; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from user land. + */ +static int invalid_str(char *str, size_t size) +{ + if (memchr(str, 0, size)) + return 0; + return -EINVAL; +} + +/* + * Check that the user compiled against correct version of autofs + * misc device code. + * + * As well as checking the version compatibility this always copies + * the kernel interface version out. + */ +static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) +{ + int err = 0; + + if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || + (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { + AUTOFS_WARN("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(%d)", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); + err = -EINVAL; + } + + /* Fill in the kernel version. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + + return err; +} + +/* + * Copy parameter control struct, including a possible path allocated + * at the end of the struct. + */ +static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +{ + struct autofs_dev_ioctl tmp, *res; + + if (copy_from_user(&tmp, in, sizeof(tmp))) + return ERR_PTR(-EFAULT); + + if (tmp.size < sizeof(tmp)) + return ERR_PTR(-EINVAL); + + if (tmp.size > (PATH_MAX + sizeof(tmp))) + return ERR_PTR(-ENAMETOOLONG); + + res = memdup_user(in, tmp.size); + if (!IS_ERR(res)) + res->size = tmp.size; + + return res; +} + +static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) +{ + kfree(param); + return; +} + +/* + * Check sanity of parameter control fields and if a path is present + * check that it is terminated and contains at least one "/". + */ +static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) +{ + int err; + + err = check_dev_ioctl_version(cmd, param); + if (err) { + AUTOFS_WARN("invalid device control module version " + "supplied for cmd(0x%08x)", cmd); + goto out; + } + + if (param->size > sizeof(*param)) { + err = invalid_str(param->path, param->size - sizeof(*param)); + if (err) { + AUTOFS_WARN( + "path string terminator missing for cmd(0x%08x)", + cmd); + goto out; + } + + err = check_name(param->path); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + } + + err = 0; +out: + return err; +} + +/* + * Get the autofs super block info struct from the file opened on + * the autofs mount point. + */ +static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) +{ + struct autofs_sb_info *sbi = NULL; + struct inode *inode; + + if (f) { + inode = file_inode(f); + sbi = autofs4_sbi(inode->i_sb); + } + return sbi; +} + +/* Return autofs module protocol version */ +static int autofs_dev_ioctl_protover(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->protover.version = sbi->version; + return 0; +} + +/* Return autofs module protocol sub version */ +static int autofs_dev_ioctl_protosubver(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->protosubver.sub_version = sbi->sub_version; + return 0; +} + +/* Find the topmost mount satisfying test() */ +static int find_autofs_mount(const char *pathname, + struct path *res, + int test(struct path *path, void *data), + void *data) +{ + struct path path; + int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); + if (err) + return err; + err = -ENOENT; + while (path.dentry == path.mnt->mnt_root) { + if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { + if (test(&path, data)) { + path_get(&path); + *res = path; + err = 0; + break; + } + } + if (!follow_up(&path)) + break; + } + path_put(&path); + return err; +} + +static int test_by_dev(struct path *path, void *p) +{ + return path->dentry->d_sb->s_dev == *(dev_t *)p; +} + +static int test_by_type(struct path *path, void *p) +{ + struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + return ino && ino->sbi->type & *(unsigned *)p; +} + +/* + * Open a file descriptor on the autofs mount point corresponding + * to the given path and device number (aka. new_encode_dev(sb->s_dev)). + */ +static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) +{ + int err, fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (likely(fd >= 0)) { + struct file *filp; + struct path path; + + err = find_autofs_mount(name, &path, test_by_dev, &devid); + if (err) + goto out; + + /* + * Find autofs super block that has the device number + * corresponding to the autofs fs we want to open. + */ + + filp = dentry_open(&path, O_RDONLY, current_cred()); + path_put(&path); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto out; + } + + fd_install(fd, filp); + } + + return fd; + +out: + put_unused_fd(fd); + return err; +} + +/* Open a file descriptor on an autofs mount point */ +static int autofs_dev_ioctl_openmount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + const char *path; + dev_t devid; + int err, fd; + + /* param->path has already been checked */ + if (!param->openmount.devid) + return -EINVAL; + + param->ioctlfd = -1; + + path = param->path; + devid = new_decode_dev(param->openmount.devid); + + err = 0; + fd = autofs_dev_ioctl_open_mountpoint(path, devid); + if (unlikely(fd < 0)) { + err = fd; + goto out; + } + + param->ioctlfd = fd; +out: + return err; +} + +/* Close file descriptor allocated above (user can also use close(2)). */ +static int autofs_dev_ioctl_closemount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + return sys_close(param->ioctlfd); +} + +/* + * Send "ready" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_ready(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + + token = (autofs_wqt_t) param->ready.token; + return autofs4_wait_release(sbi, token, 0); +} + +/* + * Send "fail" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_fail(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + int status; + + token = (autofs_wqt_t) param->fail.token; + status = param->fail.status ? param->fail.status : -ENOENT; + return autofs4_wait_release(sbi, token, status); +} + +/* + * Set the pipe fd for kernel communication to the daemon. + * + * Normally this is set at mount using an option but if we + * are reconnecting to a busy mount then we need to use this + * to tell the autofs mount about the new kernel pipe fd. In + * order to protect mounts against incorrectly setting the + * pipefd we also require that the autofs mount be catatonic. + * + * This also sets the process group id used to identify the + * controlling process (eg. the owning automount(8) daemon). + */ +static int autofs_dev_ioctl_setpipefd(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + int pipefd; + int err = 0; + struct pid *new_pid = NULL; + + if (param->setpipefd.pipefd == -1) + return -EINVAL; + + pipefd = param->setpipefd.pipefd; + + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return -EBUSY; + } else { + struct file *pipe; + + new_pid = get_task_pid(current, PIDTYPE_PGID); + + if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { + AUTOFS_WARN("Not allowed to change PID namespace"); + err = -EINVAL; + goto out; + } + + pipe = fget(pipefd); + if (!pipe) { + err = -EBADF; + goto out; + } + if (autofs_prepare_pipe(pipe) < 0) { + err = -EPIPE; + fput(pipe); + goto out; + } + swap(sbi->oz_pgrp, new_pid); + sbi->pipefd = pipefd; + sbi->pipe = pipe; + sbi->catatonic = 0; + } +out: + put_pid(new_pid); + mutex_unlock(&sbi->wq_mutex); + return err; +} + +/* + * Make the autofs mount point catatonic, no longer responsive to + * mount requests. Also closes the kernel pipe file descriptor. + */ +static int autofs_dev_ioctl_catatonic(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs4_catatonic_mode(sbi); + return 0; +} + +/* Set the autofs mount timeout */ +static int autofs_dev_ioctl_timeout(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + unsigned long timeout; + + timeout = param->timeout.timeout; + param->timeout.timeout = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + return 0; +} + +/* + * Return the uid and gid of the last request for the mount + * + * When reconstructing an autofs mount tree with active mounts + * we need to re-connect to mounts that may have used the original + * process uid and gid (or string variations of them) for mount + * lookups within the map entry. + */ +static int autofs_dev_ioctl_requester(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct autofs_info *ino; + struct path path; + dev_t devid; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + devid = sbi->sb->s_dev; + + param->requester.uid = param->requester.gid = -1; + + err = find_autofs_mount(param->path, &path, test_by_dev, &devid); + if (err) + goto out; + + ino = autofs4_dentry_ino(path.dentry); + if (ino) { + err = 0; + autofs4_expire_wait(path.dentry, 0); + spin_lock(&sbi->fs_lock); + param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); + param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); + spin_unlock(&sbi->fs_lock); + } + path_put(&path); +out: + return err; +} + +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more that can be done. + */ +static int autofs_dev_ioctl_expire(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct vfsmount *mnt; + int how; + + how = param->expire.how; + mnt = fp->f_path.mnt; + + return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); +} + +/* Check if autofs mount point is in use */ +static int autofs_dev_ioctl_askumount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->askumount.may_umount = 0; + if (may_umount(fp->f_path.mnt)) + param->askumount.may_umount = 1; + return 0; +} + +/* + * Check if the given path is a mountpoint. + * + * If we are supplied with the file descriptor of an autofs + * mount we're looking for a specific mount. In this case + * the path is considered a mountpoint if it is itself a + * mountpoint or contains a mount, such as a multi-mount + * without a root mount. In this case we return 1 if the + * path is a mount point and the super magic of the covering + * mount if there is one or 0 if it isn't a mountpoint. + * + * If we aren't supplied with a file descriptor then we + * lookup the path and check if it is the root of a mount. + * If a type is given we are looking for a particular autofs + * mount and if we don't find a match we return fail. If the + * located path is the root of a mount we return 1 along with + * the super magic of the mount or 0 otherwise. + * + * In both cases the the device number (as returned by + * new_encode_dev()) is also returned. + */ +static int autofs_dev_ioctl_ismountpoint(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct path path; + const char *name; + unsigned int type; + unsigned int devid, magic; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + name = param->path; + type = param->ismountpoint.in.type; + + param->ismountpoint.out.devid = devid = 0; + param->ismountpoint.out.magic = magic = 0; + + if (!fp || param->ioctlfd == -1) { + if (autofs_type_any(type)) + err = kern_path_mountpoint(AT_FDCWD, + name, &path, LOOKUP_FOLLOW); + else + err = find_autofs_mount(name, &path, + test_by_type, &type); + if (err) + goto out; + devid = new_encode_dev(path.dentry->d_sb->s_dev); + err = 0; + if (path.mnt->mnt_root == path.dentry) { + err = 1; + magic = path.dentry->d_sb->s_magic; + } + } else { + dev_t dev = sbi->sb->s_dev; + + err = find_autofs_mount(name, &path, test_by_dev, &dev); + if (err) + goto out; + + devid = new_encode_dev(dev); + + err = have_submounts(path.dentry); + + if (follow_down_one(&path)) + magic = path.dentry->d_sb->s_magic; + } + + param->ismountpoint.out.devid = devid; + param->ismountpoint.out.magic = magic; + path_put(&path); +out: + return err; +} + +/* + * Our range of ioctl numbers isn't 0 based so we need to shift + * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table + * lookup. + */ +#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) + +static ioctl_fn lookup_dev_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), + autofs_dev_ioctl_protover}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), + autofs_dev_ioctl_protosubver}, + {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), + autofs_dev_ioctl_openmount}, + {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), + autofs_dev_ioctl_closemount}, + {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), + autofs_dev_ioctl_ready}, + {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), + autofs_dev_ioctl_fail}, + {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), + autofs_dev_ioctl_setpipefd}, + {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), + autofs_dev_ioctl_catatonic}, + {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), + autofs_dev_ioctl_timeout}, + {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), + autofs_dev_ioctl_requester}, + {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), + autofs_dev_ioctl_expire}, + {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), + autofs_dev_ioctl_askumount}, + {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), + autofs_dev_ioctl_ismountpoint} + }; + unsigned int idx = cmd_idx(cmd); + + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; +} + +/* ioctl dispatcher */ +static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) +{ + struct autofs_dev_ioctl *param; + struct file *fp; + struct autofs_sb_info *sbi; + unsigned int cmd_first, cmd; + ioctl_fn fn = NULL; + int err = 0; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); + cmd = _IOC_NR(command); + + if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || + cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + return -ENOTTY; + } + + /* Copy the parameters into kernel space. */ + param = copy_dev_ioctl(user); + if (IS_ERR(param)) + return PTR_ERR(param); + + err = validate_dev_ioctl(command, param); + if (err) + goto out; + + /* The validate routine above always sets the version */ + if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) + goto done; + + fn = lookup_dev_ioctl(cmd); + if (!fn) { + AUTOFS_WARN("unknown command 0x%08x", command); + return -ENOTTY; + } + + fp = NULL; + sbi = NULL; + + /* + * For obvious reasons the openmount can't have a file + * descriptor yet. We don't take a reference to the + * file during close to allow for immediate release. + */ + if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { + fp = fget(param->ioctlfd); + if (!fp) { + if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) + goto cont; + err = -EBADF; + goto out; + } + + sbi = autofs_dev_ioctl_sbi(fp); + if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { + err = -EINVAL; + fput(fp); + goto out; + } + + /* + * Admin needs to be able to set the mount catatonic in + * order to be able to perform the re-open. + */ + if (!autofs4_oz_mode(sbi) && + cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { + err = -EACCES; + fput(fp); + goto out; + } + } +cont: + err = fn(fp, sbi, param); + + if (fp) + fput(fp); +done: + if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) + err = -EFAULT; +out: + free_dev_ioctl(param); + return err; +} + +static long autofs_dev_ioctl(struct file *file, uint command, ulong u) +{ + int err; + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); + return (long) err; +} + +#ifdef CONFIG_COMPAT +static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u) +{ + return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define autofs_dev_ioctl_compat NULL +#endif + +static const struct file_operations _dev_ioctl_fops = { + .unlocked_ioctl = autofs_dev_ioctl, + .compat_ioctl = autofs_dev_ioctl_compat, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice _autofs_dev_ioctl_misc = { + .minor = AUTOFS_MINOR, + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops +}; + +MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); +MODULE_ALIAS("devname:autofs"); + +/* Register/deregister misc character device */ +int __init autofs_dev_ioctl_init(void) +{ + int r; + + r = misc_register(&_autofs_dev_ioctl_misc); + if (r) { + AUTOFS_ERROR("misc_register failed for control device"); + return r; + } + + return 0; +} + +void autofs_dev_ioctl_exit(void) +{ + misc_deregister(&_autofs_dev_ioctl_misc); + return; +} + diff --git a/kernel/fs/autofs4/expire.c b/kernel/fs/autofs4/expire.c new file mode 100644 index 000000000..d487fa27a --- /dev/null +++ b/kernel/fs/autofs4/expire.c @@ -0,0 +1,603 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/expire.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include "autofs_i.h" + +static unsigned long now; + +/* Check if a dentry can be expired */ +static inline int autofs4_can_expire(struct dentry *dentry, + unsigned long timeout, int do_now) +{ + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* dentry in the process of being deleted */ + if (ino == NULL) + return 0; + + if (!do_now) { + /* Too young to die */ + if (!timeout || time_after(ino->last_used + timeout, now)) + return 0; + } + return 1; +} + +/* Check a mount point for busyness */ +static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +{ + struct dentry *top = dentry; + struct path path = {.mnt = mnt, .dentry = dentry}; + int status = 1; + + DPRINTK("dentry %p %pd", dentry, dentry); + + path_get(&path); + + if (!follow_down_one(&path)) + goto done; + + if (is_autofs4_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + + /* This is an autofs submount, we can't expire it */ + if (autofs_type_indirect(sbi->type)) + goto done; + } + + /* Update the expiry counter if fs is busy */ + if (!may_umount_tree(path.mnt)) { + struct autofs_info *ino = autofs4_dentry_ino(top); + ino->last_used = jiffies; + goto done; + } + + status = 0; +done: + DPRINTK("returning = %d", status); + path_put(&path); + return status; +} + +/* + * Calculate and dget next entry in the subdirs list under root. + */ +static struct dentry *get_next_positive_subdir(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct list_head *next; + struct dentry *q; + + spin_lock(&sbi->lookup_lock); + spin_lock(&root->d_lock); + + if (prev) + next = prev->d_child.next; + else { + prev = dget_dlock(root); + next = prev->d_subdirs.next; + } + +cont: + if (next == &root->d_subdirs) { + spin_unlock(&root->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + q = list_entry(next, struct dentry, d_child); + + spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); + /* Already gone or negative dentry (under construction) - try next */ + if (!d_count(q) || !simple_positive(q)) { + spin_unlock(&q->d_lock); + next = q->d_child.next; + goto cont; + } + dget_dlock(q); + spin_unlock(&q->d_lock); + spin_unlock(&root->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return q; +} + +/* + * Calculate and dget next entry in top down tree traversal. + */ +static struct dentry *get_next_positive_dentry(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct list_head *next; + struct dentry *p, *ret; + + if (prev == NULL) + return dget(root); + + spin_lock(&sbi->lookup_lock); +relock: + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_subdirs.next; + if (next == &p->d_subdirs) { + while (1) { + struct dentry *parent; + + if (p == root) { + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + parent = p->d_parent; + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&p->d_lock); + cpu_chill(); + goto relock; + } + spin_unlock(&p->d_lock); + next = p->d_child.next; + p = parent; + if (next != &parent->d_subdirs) + break; + } + } + ret = list_entry(next, struct dentry, d_child); + + spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(ret)) { + spin_unlock(&p->d_lock); + lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_); + p = ret; + goto again; + } + dget_dlock(ret); + spin_unlock(&ret->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return ret; +} + +/* + * Check a direct mount point for busyness. + * Direct mounts have similar expiry semantics to tree mounts. + * The tree is not busy iff no mountpoints are busy and there are no + * autofs submounts. + */ +static int autofs4_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + DPRINTK("top %p %pd", top, top); + + /* If it's busy update the expiry counters */ + if (!may_umount_tree(mnt)) { + struct autofs_info *ino = autofs4_dentry_ino(top); + if (ino) + ino->last_used = jiffies; + return 1; + } + + /* Timeout of a direct mount is determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + +/* Check a directory tree of mount points for busyness + * The tree is not busy iff no mountpoints are busy + */ +static int autofs4_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + struct autofs_info *top_ino = autofs4_dentry_ino(top); + struct dentry *p; + + DPRINTK("top %p %pd", top, top); + + /* Negative dentry - give up */ + if (!simple_positive(top)) + return 1; + + p = NULL; + while ((p = get_next_positive_dentry(p, top))) { + DPRINTK("dentry %p %pd", p, p); + + /* + * Is someone visiting anywhere in the subtree ? + * If there's no mount we need to check the usage + * count for the autofs dentry. + * If the fs is busy update the expiry counter. + */ + if (d_mountpoint(p)) { + if (autofs4_mount_busy(mnt, p)) { + top_ino->last_used = jiffies; + dput(p); + return 1; + } + } else { + struct autofs_info *ino = autofs4_dentry_ino(p); + unsigned int ino_count = atomic_read(&ino->count); + + /* allow for dget above and top is already dgot */ + if (p == top) + ino_count += 2; + else + ino_count++; + + if (d_count(p) > ino_count) { + top_ino->last_used = jiffies; + dput(p); + return 1; + } + } + } + + /* Timeout of a tree mount is ultimately determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + +static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, + struct dentry *parent, + unsigned long timeout, + int do_now) +{ + struct dentry *p; + + DPRINTK("parent %p %pd", parent, parent); + + p = NULL; + while ((p = get_next_positive_dentry(p, parent))) { + DPRINTK("dentry %p %pd", p, p); + + if (d_mountpoint(p)) { + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, p)) + continue; + + /* Can we expire this guy */ + if (autofs4_can_expire(p, timeout, do_now)) + return p; + } + } + return NULL; +} + +/* Check if we can expire a direct mount (possibly a tree) */ +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = dget(sb->s_root); + int do_now = how & AUTOFS_EXP_IMMEDIATE; + struct autofs_info *ino; + + if (!root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + goto out; + if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + ino->flags |= AUTOFS_INF_NO_RCU; + spin_unlock(&sbi->fs_lock); + synchronize_rcu(); + spin_lock(&sbi->fs_lock); + if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + ino->flags |= AUTOFS_INF_EXPIRING; + smp_mb(); + ino->flags &= ~AUTOFS_INF_NO_RCU; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + return root; + } + ino->flags &= ~AUTOFS_INF_NO_RCU; + } +out: + spin_unlock(&sbi->fs_lock); + dput(root); + + return NULL; +} + +/* Check if 'dentry' should expire, or return a nearby + * dentry that is suitable. + * If returned dentry is different from arg dentry, + * then a dget() reference was taken, else not. + */ +static struct dentry *should_expire(struct dentry *dentry, + struct vfsmount *mnt, + unsigned long timeout, + int how) +{ + int do_now = how & AUTOFS_EXP_IMMEDIATE; + int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct autofs_info *ino = autofs4_dentry_ino(dentry); + unsigned int ino_count; + + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + return NULL; + + /* + * Case 1: (i) indirect mount or top level pseudo direct mount + * (autofs-4.1). + * (ii) indirect mount with offset mount, check the "/" + * offset (autofs-5.0+). + */ + if (d_mountpoint(dentry)) { + DPRINTK("checking mountpoint %p %pd", dentry, dentry); + + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, dentry)) + return NULL; + + /* Can we expire this guy */ + if (autofs4_can_expire(dentry, timeout, do_now)) + return dentry; + return NULL; + } + + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { + DPRINTK("checking symlink %p %pd", dentry, dentry); + /* + * A symlink can't be "busy" in the usual sense so + * just check last used for expire timeout. + */ + if (autofs4_can_expire(dentry, timeout, do_now)) + return dentry; + return NULL; + } + + if (simple_empty(dentry)) + return NULL; + + /* Case 2: tree mount, expire iff entire tree is not busy */ + if (!exp_leaves) { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + + if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) + return dentry; + /* + * Case 3: pseudo direct mount, expire individual leaves + * (autofs-4.1). + */ + } else { + /* Path walk currently on this dentry? */ + struct dentry *expired; + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + + expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); + if (expired) { + if (expired == dentry) + dput(dentry); + return expired; + } + } + return NULL; +} +/* + * Find an eligible tree to time-out + * A tree is eligible if :- + * - it is unused by any user process + * - it has been unused for exp_timeout time + */ +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = sb->s_root; + struct dentry *dentry; + struct dentry *expired; + struct autofs_info *ino; + + if (!root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + dentry = NULL; + while ((dentry = get_next_positive_subdir(dentry, root))) { + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + if (ino->flags & AUTOFS_INF_NO_RCU) + expired = NULL; + else + expired = should_expire(dentry, mnt, timeout, how); + if (!expired) { + spin_unlock(&sbi->fs_lock); + continue; + } + ino = autofs4_dentry_ino(expired); + ino->flags |= AUTOFS_INF_NO_RCU; + spin_unlock(&sbi->fs_lock); + synchronize_rcu(); + spin_lock(&sbi->fs_lock); + if (should_expire(expired, mnt, timeout, how)) { + if (expired != dentry) + dput(dentry); + goto found; + } + + ino->flags &= ~AUTOFS_INF_NO_RCU; + if (expired != dentry) + dput(expired); + spin_unlock(&sbi->fs_lock); + } + return NULL; + +found: + DPRINTK("returning %p %pd", expired, expired); + ino->flags |= AUTOFS_INF_EXPIRING; + smp_mb(); + ino->flags &= ~AUTOFS_INF_NO_RCU; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + spin_lock(&sbi->lookup_lock); + spin_lock(&expired->d_parent->d_lock); + spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&expired->d_parent->d_subdirs, &expired->d_child); + spin_unlock(&expired->d_lock); + spin_unlock(&expired->d_parent->d_lock); + spin_unlock(&sbi->lookup_lock); + return expired; +} + +int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + /* Block on any pending expire */ + if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))) + return 0; + if (rcu_walk) + return -ECHILD; + + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + + DPRINTK("waiting for expire %p name=%pd", dentry, dentry); + + status = autofs4_wait(sbi, dentry, NFY_NONE); + wait_for_completion(&ino->expire_complete); + + DPRINTK("expire done status=%d", status); + + if (d_unhashed(dentry)) + return -EAGAIN; + + return status; + } + spin_unlock(&sbi->fs_lock); + + return 0; +} + +/* Perform an expiry operation */ +int autofs4_expire_run(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) +{ + struct autofs_packet_expire pkt; + struct autofs_info *ino; + struct dentry *dentry; + int ret = 0; + + memset(&pkt,0,sizeof pkt); + + pkt.hdr.proto_version = sbi->version; + pkt.hdr.type = autofs_ptype_expire; + + if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL) + return -EAGAIN; + + pkt.len = dentry->d_name.len; + memcpy(pkt.name, dentry->d_name.name, pkt.len); + pkt.name[pkt.len] = '\0'; + dput(dentry); + + if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) + ret = -EFAULT; + + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + /* avoid rapid-fire expire attempts if expiry fails */ + ino->last_used = now; + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + + return ret; +} + +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) +{ + struct dentry *dentry; + int ret = -EAGAIN; + + if (autofs_type_trigger(sbi->type)) + dentry = autofs4_expire_direct(sb, mnt, sbi, when); + else + dentry = autofs4_expire_indirect(sb, mnt, sbi, when); + + if (dentry) { + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* This is synchronous because it makes the daemon a + little easier */ + ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + /* avoid rapid-fire expire attempts if expiry fails */ + ino->last_used = now; + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + dput(dentry); + } + + return ret; +} + +/* Call repeatedly until it returns -EAGAIN, meaning there's nothing + more to be done */ +int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int __user *arg) +{ + int do_now = 0; + + if (arg && get_user(do_now, arg)) + return -EFAULT; + + return autofs4_do_expire_multi(sb, mnt, sbi, do_now); +} + diff --git a/kernel/fs/autofs4/init.c b/kernel/fs/autofs4/init.c new file mode 100644 index 000000000..b3db517e8 --- /dev/null +++ b/kernel/fs/autofs4/init.c @@ -0,0 +1,52 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/init.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include "autofs_i.h" + +static struct dentry *autofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, autofs4_fill_super); +} + +static struct file_system_type autofs_fs_type = { + .owner = THIS_MODULE, + .name = "autofs", + .mount = autofs_mount, + .kill_sb = autofs4_kill_sb, +}; +MODULE_ALIAS_FS("autofs"); + +static int __init init_autofs4_fs(void) +{ + int err; + + autofs_dev_ioctl_init(); + + err = register_filesystem(&autofs_fs_type); + if (err) + autofs_dev_ioctl_exit(); + + return err; +} + +static void __exit exit_autofs4_fs(void) +{ + autofs_dev_ioctl_exit(); + unregister_filesystem(&autofs_fs_type); +} + +module_init(init_autofs4_fs) +module_exit(exit_autofs4_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/autofs4/inode.c b/kernel/fs/autofs4/inode.c new file mode 100644 index 000000000..a3ae0b2ae --- /dev/null +++ b/kernel/fs/autofs4/inode.c @@ -0,0 +1,370 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/inode.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2005-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "autofs_i.h" +#include + +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) +{ + struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); + if (ino) { + INIT_LIST_HEAD(&ino->active); + INIT_LIST_HEAD(&ino->expiring); + ino->last_used = jiffies; + ino->sbi = sbi; + } + return ino; +} + +void autofs4_clean_ino(struct autofs_info *ino) +{ + ino->uid = GLOBAL_ROOT_UID; + ino->gid = GLOBAL_ROOT_GID; + ino->last_used = jiffies; +} + +void autofs4_free_ino(struct autofs_info *ino) +{ + kfree(ino); +} + +void autofs4_kill_sb(struct super_block *sb) +{ + struct autofs_sb_info *sbi = autofs4_sbi(sb); + + /* + * In the event of a failure in get_sb_nodev the superblock + * info is not present so nothing else has been setup, so + * just call kill_anon_super when we are called from + * deactivate_super. + */ + if (sbi) { + /* Free wait queues, close pipe */ + autofs4_catatonic_mode(sbi); + put_pid(sbi->oz_pgrp); + } + + DPRINTK("shutting down"); + kill_litter_super(sb); + if (sbi) + kfree_rcu(sbi, rcu); +} + +static int autofs4_show_options(struct seq_file *m, struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct inode *root_inode = d_inode(root->d_sb->s_root); + + if (!sbi) + return 0; + + seq_printf(m, ",fd=%d", sbi->pipefd); + if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, root_inode->i_uid)); + if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, root_inode->i_gid)); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); + seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); + seq_printf(m, ",minproto=%d", sbi->min_proto); + seq_printf(m, ",maxproto=%d", sbi->max_proto); + + if (autofs_type_offset(sbi->type)) + seq_printf(m, ",offset"); + else if (autofs_type_direct(sbi->type)) + seq_printf(m, ",direct"); + else + seq_printf(m, ",indirect"); + + return 0; +} + +static void autofs4_evict_inode(struct inode *inode) +{ + clear_inode(inode); + kfree(inode->i_private); +} + +static const struct super_operations autofs4_sops = { + .statfs = simple_statfs, + .show_options = autofs4_show_options, + .evict_inode = autofs4_evict_inode, +}; + +enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, + Opt_indirect, Opt_direct, Opt_offset}; + +static const match_table_t tokens = { + {Opt_fd, "fd=%u"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_pgrp, "pgrp=%u"}, + {Opt_minproto, "minproto=%u"}, + {Opt_maxproto, "maxproto=%u"}, + {Opt_indirect, "indirect"}, + {Opt_direct, "direct"}, + {Opt_offset, "offset"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, + int *pgrp, bool *pgrp_set, unsigned int *type, + int *minproto, int *maxproto) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + *uid = current_uid(); + *gid = current_gid(); + + *minproto = AUTOFS_MIN_PROTO_VERSION; + *maxproto = AUTOFS_MAX_PROTO_VERSION; + + *pipefd = -1; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_fd: + if (match_int(args, pipefd)) + return 1; + break; + case Opt_uid: + if (match_int(args, &option)) + return 1; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 1; + break; + case Opt_gid: + if (match_int(args, &option)) + return 1; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 1; + break; + case Opt_pgrp: + if (match_int(args, &option)) + return 1; + *pgrp = option; + *pgrp_set = true; + break; + case Opt_minproto: + if (match_int(args, &option)) + return 1; + *minproto = option; + break; + case Opt_maxproto: + if (match_int(args, &option)) + return 1; + *maxproto = option; + break; + case Opt_indirect: + set_autofs_type_indirect(type); + break; + case Opt_direct: + set_autofs_type_direct(type); + break; + case Opt_offset: + set_autofs_type_offset(type); + break; + default: + return 1; + } + } + return (*pipefd < 0); +} + +int autofs4_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode * root_inode; + struct dentry * root; + struct file * pipe; + int pipefd; + struct autofs_sb_info *sbi; + struct autofs_info *ino; + int pgrp = 0; + bool pgrp_set = false; + int ret = -EINVAL; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + DPRINTK("starting up, sbi = %p",sbi); + + s->s_fs_info = sbi; + sbi->magic = AUTOFS_SBI_MAGIC; + sbi->pipefd = -1; + sbi->pipe = NULL; + sbi->catatonic = 1; + sbi->exp_timeout = 0; + sbi->oz_pgrp = NULL; + sbi->sb = s; + sbi->version = 0; + sbi->sub_version = 0; + set_autofs_type_indirect(&sbi->type); + sbi->min_proto = 0; + sbi->max_proto = 0; + mutex_init(&sbi->wq_mutex); + mutex_init(&sbi->pipe_mutex); + spin_lock_init(&sbi->fs_lock); + sbi->queues = NULL; + spin_lock_init(&sbi->lookup_lock); + INIT_LIST_HEAD(&sbi->active_list); + INIT_LIST_HEAD(&sbi->expiring_list); + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = AUTOFS_SUPER_MAGIC; + s->s_op = &autofs4_sops; + s->s_d_op = &autofs4_dentry_operations; + s->s_time_gran = 1; + + /* + * Get the root inode and dentry, but defer checking for errors. + */ + ino = autofs4_new_ino(sbi); + if (!ino) { + ret = -ENOMEM; + goto fail_free; + } + root_inode = autofs4_get_inode(s, S_IFDIR | 0755); + root = d_make_root(root_inode); + if (!root) + goto fail_ino; + pipe = NULL; + + root->d_fsdata = ino; + + /* Can this call block? */ + if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, + &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { + printk("autofs: called with bogus options\n"); + goto fail_dput; + } + + if (pgrp_set) { + sbi->oz_pgrp = find_get_pid(pgrp); + if (!sbi->oz_pgrp) { + pr_warn("autofs: could not find process group %d\n", + pgrp); + goto fail_dput; + } + } else { + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + } + + if (autofs_type_trigger(sbi->type)) + __managed_dentry_set_managed(root); + + root_inode->i_fop = &autofs4_root_operations; + root_inode->i_op = &autofs4_dir_inode_operations; + + /* Couldn't this be tested earlier? */ + if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || + sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { + printk("autofs: kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)\n", + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + goto fail_dput; + } + + /* Establish highest kernel protocol version */ + if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) + sbi->version = AUTOFS_MAX_PROTO_VERSION; + else + sbi->version = sbi->max_proto; + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); + pipe = fget(pipefd); + + if (!pipe) { + printk("autofs: could not open pipe file descriptor\n"); + goto fail_dput; + } + ret = autofs_prepare_pipe(pipe); + if (ret < 0) + goto fail_fput; + sbi->pipe = pipe; + sbi->pipefd = pipefd; + sbi->catatonic = 0; + + /* + * Success! Install the root dentry now to indicate completion. + */ + s->s_root = root; + return 0; + + /* + * Failure ... clean up. + */ +fail_fput: + printk("autofs: pipe file descriptor does not contain proper ops\n"); + fput(pipe); + /* fall through */ +fail_dput: + dput(root); + goto fail_free; +fail_ino: + kfree(ino); +fail_free: + put_pid(sbi->oz_pgrp); + kfree(sbi); + s->s_fs_info = NULL; + return ret; +} + +struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) +{ + struct inode *inode = new_inode(sb); + + if (inode == NULL) + return NULL; + + inode->i_mode = mode; + if (sb->s_root) { + inode->i_uid = d_inode(sb->s_root)->i_uid; + inode->i_gid = d_inode(sb->s_root)->i_gid; + } + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = get_next_ino(); + + if (S_ISDIR(mode)) { + set_nlink(inode, 2); + inode->i_op = &autofs4_dir_inode_operations; + inode->i_fop = &autofs4_dir_operations; + } else if (S_ISLNK(mode)) { + inode->i_op = &autofs4_symlink_inode_operations; + } + + return inode; +} diff --git a/kernel/fs/autofs4/root.c b/kernel/fs/autofs4/root.c new file mode 100644 index 000000000..c6d7d3dbd --- /dev/null +++ b/kernel/fs/autofs4/root.c @@ -0,0 +1,923 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/root.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "autofs_i.h" + +static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); +static int autofs4_dir_unlink(struct inode *,struct dentry *); +static int autofs4_dir_rmdir(struct inode *,struct dentry *); +static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t); +static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); +#ifdef CONFIG_COMPAT +static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); +#endif +static int autofs4_dir_open(struct inode *inode, struct file *file); +static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int); +static struct vfsmount *autofs4_d_automount(struct path *); +static int autofs4_d_manage(struct dentry *, bool); +static void autofs4_dentry_release(struct dentry *); + +const struct file_operations autofs4_root_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .read = generic_read_dir, + .iterate = dcache_readdir, + .llseek = dcache_dir_lseek, + .unlocked_ioctl = autofs4_root_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = autofs4_root_compat_ioctl, +#endif +}; + +const struct file_operations autofs4_dir_operations = { + .open = autofs4_dir_open, + .release = dcache_dir_close, + .read = generic_read_dir, + .iterate = dcache_readdir, + .llseek = dcache_dir_lseek, +}; + +const struct inode_operations autofs4_dir_inode_operations = { + .lookup = autofs4_lookup, + .unlink = autofs4_dir_unlink, + .symlink = autofs4_dir_symlink, + .mkdir = autofs4_dir_mkdir, + .rmdir = autofs4_dir_rmdir, +}; + +const struct dentry_operations autofs4_dentry_operations = { + .d_automount = autofs4_d_automount, + .d_manage = autofs4_d_manage, + .d_release = autofs4_dentry_release, +}; + +static void autofs4_add_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!ino->active_count) { + if (list_empty(&ino->active)) + list_add(&ino->active, &sbi->active_list); + } + ino->active_count++; + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static void autofs4_del_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + ino->active_count--; + if (!ino->active_count) { + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + } + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static int autofs4_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry); + + if (autofs4_oz_mode(sbi)) + goto out; + + /* + * An empty directory in an autofs file system is always a + * mount point. The daemon must have failed to mount this + * during lookup so it doesn't exist. This can happen, for + * example, if user space returns an incorrect status for a + * mount request. Otherwise we're doing a readdir on the + * autofs file system so just let the libfs routines handle + * it. + */ + spin_lock(&sbi->lookup_lock); + if (!d_mountpoint(dentry) && simple_empty(dentry)) { + spin_unlock(&sbi->lookup_lock); + return -ENOENT; + } + spin_unlock(&sbi->lookup_lock); + +out: + return dcache_dir_open(inode, file); +} + +static void autofs4_dentry_release(struct dentry *de) +{ + struct autofs_info *ino = autofs4_dentry_ino(de); + struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); + + DPRINTK("releasing %p", de); + + if (!ino) + return; + + if (sbi) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del(&ino->active); + if (!list_empty(&ino->expiring)) + list_del(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } + + autofs4_free_ino(ino); +} + +static struct dentry *autofs4_lookup_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + struct qstr *name = &dentry->d_name; + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + head = &sbi->active_list; + if (list_empty(head)) + return NULL; + spin_lock(&sbi->lookup_lock); + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *active; + struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, active); + active = ino->dentry; + + spin_lock(&active->d_lock); + + /* Already gone? */ + if ((int) d_count(active) <= 0) + goto next; + + qstr = &active->d_name; + + if (active->d_name.hash != hash) + goto next; + if (active->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(active)) { + dget_dlock(active); + spin_unlock(&active->d_lock); + spin_unlock(&sbi->lookup_lock); + return active; + } +next: + spin_unlock(&active->d_lock); + } + spin_unlock(&sbi->lookup_lock); + + return NULL; +} + +static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, + bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + struct qstr *name = &dentry->d_name; + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + head = &sbi->expiring_list; + if (list_empty(head)) + return NULL; + spin_lock(&sbi->lookup_lock); + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *expiring; + struct qstr *qstr; + + if (rcu_walk) { + spin_unlock(&sbi->lookup_lock); + return ERR_PTR(-ECHILD); + } + + ino = list_entry(p, struct autofs_info, expiring); + expiring = ino->dentry; + + spin_lock(&expiring->d_lock); + + /* We've already been dentry_iput or unlinked */ + if (d_really_is_negative(expiring)) + goto next; + + qstr = &expiring->d_name; + + if (expiring->d_name.hash != hash) + goto next; + if (expiring->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(expiring)) { + dget_dlock(expiring); + spin_unlock(&expiring->d_lock); + spin_unlock(&sbi->lookup_lock); + return expiring; + } +next: + spin_unlock(&expiring->d_lock); + } + spin_unlock(&sbi->lookup_lock); + + return NULL; +} + +static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status = 0; + + if (ino->flags & AUTOFS_INF_PENDING) { + if (rcu_walk) + return -ECHILD; + DPRINTK("waiting for mount name=%pd", dentry); + status = autofs4_wait(sbi, dentry, NFY_MOUNT); + DPRINTK("mount wait done status=%d", status); + } + ino->last_used = jiffies; + return status; +} + +static int do_expire_wait(struct dentry *dentry, bool rcu_walk) +{ + struct dentry *expiring; + + expiring = autofs4_lookup_expiring(dentry, rcu_walk); + if (IS_ERR(expiring)) + return PTR_ERR(expiring); + if (!expiring) + return autofs4_expire_wait(dentry, rcu_walk); + else { + /* + * If we are racing with expire the request might not + * be quite complete, but the directory has been removed + * so it must have been successful, just wait for it. + */ + autofs4_expire_wait(expiring, 0); + autofs4_del_expiring(expiring); + dput(expiring); + } + return 0; +} + +static struct dentry *autofs4_mountpoint_changed(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + /* + * If this is an indirect mount the dentry could have gone away + * as a result of an expire and a new one created. + */ + if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; + struct dentry *new = d_lookup(parent, &dentry->d_name); + if (!new) + return NULL; + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; + } + return path->dentry; +} + +static struct vfsmount *autofs4_d_automount(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + DPRINTK("dentry=%p %pd", dentry, dentry); + + /* The daemon never triggers a mount. */ + if (autofs4_oz_mode(sbi)) + return NULL; + + /* + * If an expire request is pending everyone must wait. + * If the expire fails we're still mounted so continue + * the follow and return. A return of -EAGAIN (which only + * happens with indirect mounts) means the expire completed + * and the directory was removed, so just go ahead and try + * the mount. + */ + status = do_expire_wait(dentry, 0); + if (status && status != -EAGAIN) + return NULL; + + /* Callback to the daemon to perform the mount or wait */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry, 0); + if (status) + return ERR_PTR(status); + goto done; + } + + /* + * If the dentry is a symlink it's equivalent to a directory + * having d_mountpoint() true, so there's no need to call back + * to the daemon. + */ + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { + spin_unlock(&sbi->fs_lock); + goto done; + } + + if (!d_mountpoint(dentry)) { + /* + * It's possible that user space hasn't removed directories + * after umounting a rootless multi-mount, although it + * should. For v5 have_submounts() is sufficient to handle + * this because the leaves of the directory tree under the + * mount never trigger mounts themselves (they have an autofs + * trigger mount mounted on them). But v4 pseudo direct mounts + * do need the leaves to trigger mounts. In this case we + * have no choice but to use the list_empty() check and + * require user space behave. + */ + if (sbi->version > 4) { + if (have_submounts(dentry)) { + spin_unlock(&sbi->fs_lock); + goto done; + } + } else { + if (!simple_empty(dentry)) { + spin_unlock(&sbi->fs_lock); + goto done; + } + } + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry, 0); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + if (status) { + spin_unlock(&sbi->fs_lock); + return ERR_PTR(status); + } + } + spin_unlock(&sbi->fs_lock); +done: + /* Mount succeeded, check if we ended up with a new dentry */ + dentry = autofs4_mountpoint_changed(path); + if (!dentry) + return ERR_PTR(-ENOENT); + + return NULL; +} + +static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + DPRINTK("dentry=%p %pd", dentry, dentry); + + /* The daemon never waits. */ + if (autofs4_oz_mode(sbi)) { + if (!d_mountpoint(dentry)) + return -EISDIR; + return 0; + } + + /* Wait for pending expires */ + if (do_expire_wait(dentry, rcu_walk) == -ECHILD) + return -ECHILD; + + /* + * This dentry may be under construction so wait on mount + * completion. + */ + status = autofs4_mount_wait(dentry, rcu_walk); + if (status) + return status; + + if (rcu_walk) { + /* We don't need fs_lock in rcu_walk mode, + * just testing 'AUTOFS_INFO_NO_RCU' is enough. + * simple_empty() takes a spinlock, so leave it + * to last. + * We only return -EISDIR when certain this isn't + * a mount-trap. + */ + struct inode *inode; + if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)) + return 0; + if (d_mountpoint(dentry)) + return 0; + inode = d_inode_rcu(dentry); + if (inode && S_ISLNK(inode->i_mode)) + return -EISDIR; + if (list_empty(&dentry->d_subdirs)) + return 0; + if (!simple_empty(dentry)) + return -EISDIR; + return 0; + } + + spin_lock(&sbi->fs_lock); + /* + * If the dentry has been selected for expire while we slept + * on the lock then it might go away. We'll deal with that in + * ->d_automount() and wait on a new mount if the expire + * succeeds or return here if it doesn't (since there's no + * mount to follow with a rootless multi-mount). + */ + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path + * updated so check if this is a rootless multi-mount so + * we can avoid needless calls ->d_automount() and avoid + * an incorrect ELOOP error return. + */ + if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || + (d_really_is_positive(dentry) && d_is_symlink(dentry))) + status = -EISDIR; + } + spin_unlock(&sbi->fs_lock); + + return status; +} + +/* Lookups in the root directory */ +static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct autofs_sb_info *sbi; + struct autofs_info *ino; + struct dentry *active; + + DPRINTK("name = %pd", dentry); + + /* File name too long to exist */ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + sbi = autofs4_sbi(dir->i_sb); + + DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs4_oz_mode(sbi)); + + active = autofs4_lookup_active(dentry); + if (active) { + return active; + } else { + /* + * A dentry that is not within the root can never trigger a + * mount operation, unless the directory already exists, so we + * can return fail immediately. The daemon however does need + * to create directories within the file system. + */ + if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + return ERR_PTR(-ENOENT); + + /* Mark entries in the root as mount triggers */ + if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) + __managed_dentry_set_managed(dentry); + + ino = autofs4_new_ino(sbi); + if (!ino) + return ERR_PTR(-ENOMEM); + + dentry->d_fsdata = ino; + ino->dentry = dentry; + + autofs4_add_active(dentry); + + d_instantiate(dentry, NULL); + } + return NULL; +} + +static int autofs4_dir_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + struct inode *inode; + size_t size = strlen(symname); + char *cp; + + DPRINTK("%s <- %pd", symname, dentry); + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + BUG_ON(!ino); + + autofs4_clean_ino(ino); + + autofs4_del_active(dentry); + + cp = kmalloc(size + 1, GFP_KERNEL); + if (!cp) + return -ENOMEM; + + strcpy(cp, symname); + + inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); + if (!inode) { + kfree(cp); + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } + inode->i_private = cp; + inode->i_size = size; + d_add(dentry, inode); + + dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && !IS_ROOT(dentry)) + atomic_inc(&p_ino->count); + + dir->i_mtime = CURRENT_TIME; + + return 0; +} + +/* + * NOTE! + * + * Normal filesystems would do a "d_delete()" to tell the VFS dcache + * that the file no longer exists. However, doing that means that the + * VFS layer can turn the dentry into a negative dentry. We don't want + * this, because the unlink is probably the result of an expire. + * We simply d_drop it and add it to a expiring list in the super block, + * which allows the dentry lookup to check for an incomplete expire. + * + * If a process is blocked on the dentry waiting for the expire to finish, + * it will invalidate the dentry and try to mount with a new one. + * + * Also see autofs4_dir_rmdir().. + */ +static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + + /* This allows root to remove symlinks */ + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && !IS_ROOT(dentry)) + atomic_dec(&p_ino->count); + } + dput(ino->dentry); + + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); + + dir->i_mtime = CURRENT_TIME; + + spin_lock(&sbi->lookup_lock); + __autofs4_add_expiring(dentry); + d_drop(dentry); + spin_unlock(&sbi->lookup_lock); + + return 0; +} + +/* + * Version 4 of autofs provides a pseudo direct mount implementation + * that relies on directories at the leaves of a directory tree under + * an indirect mount to trigger mounts. To allow for this we need to + * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves + * of the directory tree. There is no need to clear the automount flag + * following a mount or restore it after an expire because these mounts + * are always covered. However, it is necessary to ensure that these + * flags are clear on non-empty directories to avoid unnecessary calls + * during path walks. + */ +static void autofs_set_leaf_automount_flags(struct dentry *dentry) +{ + struct dentry *parent; + + /* root and dentrys in the root are already handled */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_set_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + managed_dentry_clear_managed(parent); + return; +} + +static void autofs_clear_leaf_automount_flags(struct dentry *dentry) +{ + struct list_head *d_child; + struct dentry *parent; + + /* flags for dentrys in the root are handled elsewhere */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_clear_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + d_child = &dentry->d_child; + /* Set parent managed if it's becoming empty */ + if (d_child->next == &parent->d_subdirs && + d_child->prev == &parent->d_subdirs) + managed_dentry_set_managed(parent); + return; +} + +static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + + DPRINTK("dentry %p, removing %pd", dentry, dentry); + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + spin_lock(&sbi->lookup_lock); + if (!simple_empty(dentry)) { + spin_unlock(&sbi->lookup_lock); + return -ENOTEMPTY; + } + __autofs4_add_expiring(dentry); + d_drop(dentry); + spin_unlock(&sbi->lookup_lock); + + if (sbi->version < 5) + autofs_clear_leaf_automount_flags(dentry); + + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_dec(&p_ino->count); + } + dput(ino->dentry); + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); + + if (dir->i_nlink) + drop_nlink(dir); + + return 0; +} + +static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + struct inode *inode; + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + DPRINTK("dentry %p, creating %pd", dentry, dentry); + + BUG_ON(!ino); + + autofs4_clean_ino(ino); + + autofs4_del_active(dentry); + + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + if (!inode) + return -ENOMEM; + d_add(dentry, inode); + + if (sbi->version < 5) + autofs_set_leaf_automount_flags(dentry); + + dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && !IS_ROOT(dentry)) + atomic_inc(&p_ino->count); + inc_nlink(dir); + dir->i_mtime = CURRENT_TIME; + + return 0; +} + +/* Get/set timeout ioctl() operation */ +#ifdef CONFIG_COMPAT +static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, + compat_ulong_t __user *p) +{ + int rv; + unsigned long ntimeout; + + if ((rv = get_user(ntimeout, p)) || + (rv = put_user(sbi->exp_timeout/HZ, p))) + return rv; + + if (ntimeout > UINT_MAX/HZ) + sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +} +#endif + +static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, + unsigned long __user *p) +{ + int rv; + unsigned long ntimeout; + + if ((rv = get_user(ntimeout, p)) || + (rv = put_user(sbi->exp_timeout/HZ, p))) + return rv; + + if (ntimeout > ULONG_MAX/HZ) + sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +} + +/* Return protocol version */ +static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p) +{ + return put_user(sbi->version, p); +} + +/* Return protocol sub version */ +static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p) +{ + return put_user(sbi->sub_version, p); +} + +/* +* Tells the daemon whether it can umount the autofs mount. +*/ +static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) +{ + int status = 0; + + if (may_umount(mnt)) + status = 1; + + DPRINTK("returning %d", status); + + status = put_user(status, p); + + return status; +} + +/* Identify autofs4_dentries - this is so we can tell if there's + an extra dentry refcount or not. We only hold a refcount on the + dentry if its non-negative (ie, d_inode != NULL) +*/ +int is_autofs4_dentry(struct dentry *dentry) +{ + return dentry && d_really_is_positive(dentry) && + dentry->d_op == &autofs4_dentry_operations && + dentry->d_fsdata != NULL; +} + +/* + * ioctl()'s on the root directory is the chief method for the daemon to + * generate kernel reactions + */ +static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); + void __user *p = (void __user *)arg; + + DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", + cmd,arg,sbi,task_pgrp_nr(current)); + + if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || + _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) + return -ENOTTY; + + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch(cmd) { + case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ + return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0); + case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ + return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); + case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ + autofs4_catatonic_mode(sbi); + return 0; + case AUTOFS_IOC_PROTOVER: /* Get protocol version */ + return autofs4_get_protover(sbi, p); + case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ + return autofs4_get_protosubver(sbi, p); + case AUTOFS_IOC_SETTIMEOUT: + return autofs4_get_set_timeout(sbi, p); +#ifdef CONFIG_COMPAT + case AUTOFS_IOC_SETTIMEOUT32: + return autofs4_compat_get_set_timeout(sbi, p); +#endif + + case AUTOFS_IOC_ASKUMOUNT: + return autofs4_ask_umount(filp->f_path.mnt, p); + + /* return a single thing to expire */ + case AUTOFS_IOC_EXPIRE: + return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); + /* same as above, but can send multiple expires through pipe */ + case AUTOFS_IOC_EXPIRE_MULTI: + return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); + + default: + return -ENOSYS; + } +} + +static long autofs4_root_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); +} + +#ifdef CONFIG_COMPAT +static long autofs4_root_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + else + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, + (unsigned long)compat_ptr(arg)); + + return ret; +} +#endif diff --git a/kernel/fs/autofs4/symlink.c b/kernel/fs/autofs4/symlink.c new file mode 100644 index 000000000..de58cc7b8 --- /dev/null +++ b/kernel/fs/autofs4/symlink.c @@ -0,0 +1,28 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/symlink.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include "autofs_i.h" + +static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino && !autofs4_oz_mode(sbi)) + ino->last_used = jiffies; + nd_set_link(nd, d_inode(dentry)->i_private); + return NULL; +} + +const struct inode_operations autofs4_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = autofs4_follow_link +}; diff --git a/kernel/fs/autofs4/waitq.c b/kernel/fs/autofs4/waitq.c new file mode 100644 index 000000000..35b755e79 --- /dev/null +++ b/kernel/fs/autofs4/waitq.c @@ -0,0 +1,565 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/waitq.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "autofs_i.h" + +/* We make this a static variable rather than a part of the superblock; it + is better if we don't reassign numbers easily even across filesystems */ +static autofs_wqt_t autofs4_next_wait_queue = 1; + +/* These are the signals we allow interrupting a pending mount */ +#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) + +void autofs4_catatonic_mode(struct autofs_sb_info *sbi) +{ + struct autofs_wait_queue *wq, *nwq; + + mutex_lock(&sbi->wq_mutex); + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } + + DPRINTK("entering catatonic mode"); + + sbi->catatonic = 1; + wq = sbi->queues; + sbi->queues = NULL; /* Erase all wait queues */ + while (wq) { + nwq = wq->next; + wq->status = -ENOENT; /* Magic is gone - report failure */ + kfree(wq->name.name); + wq->name.name = NULL; + wq->wait_ctr--; + wake_up_interruptible(&wq->queue); + wq = nwq; + } + fput(sbi->pipe); /* Close the pipe */ + sbi->pipe = NULL; + sbi->pipefd = -1; + mutex_unlock(&sbi->wq_mutex); +} + +static int autofs4_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) +{ + unsigned long sigpipe, flags; + mm_segment_t fs; + const char *data = (const char *)addr; + ssize_t wr = 0; + + sigpipe = sigismember(¤t->pending.signal, SIGPIPE); + + /* Save pointer to user space and point back to kernel space */ + fs = get_fs(); + set_fs(KERNEL_DS); + + mutex_lock(&sbi->pipe_mutex); + while (bytes && + (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) { + data += wr; + bytes -= wr; + } + mutex_unlock(&sbi->pipe_mutex); + + set_fs(fs); + + /* Keep the currently executing process from receiving a + SIGPIPE unless it was already supposed to get one */ + if (wr == -EPIPE && !sigpipe) { + spin_lock_irqsave(¤t->sighand->siglock, flags); + sigdelset(¤t->pending.signal, SIGPIPE); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + + return (bytes > 0); +} + +static void autofs4_notify_daemon(struct autofs_sb_info *sbi, + struct autofs_wait_queue *wq, + int type) +{ + union { + struct autofs_packet_hdr hdr; + union autofs_packet_union v4_pkt; + union autofs_v5_packet_union v5_pkt; + } pkt; + struct file *pipe = NULL; + size_t pktsz; + + DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", + (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); + + memset(&pkt,0,sizeof pkt); /* For security reasons */ + + pkt.hdr.proto_version = sbi->version; + pkt.hdr.type = type; + + switch (type) { + /* Kernel protocol v4 missing and expire packets */ + case autofs_ptype_missing: + { + struct autofs_packet_missing *mp = &pkt.v4_pkt.missing; + + pktsz = sizeof(*mp); + + mp->wait_queue_token = wq->wait_queue_token; + mp->len = wq->name.len; + memcpy(mp->name, wq->name.name, wq->name.len); + mp->name[wq->name.len] = '\0'; + break; + } + case autofs_ptype_expire_multi: + { + struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; + + pktsz = sizeof(*ep); + + ep->wait_queue_token = wq->wait_queue_token; + ep->len = wq->name.len; + memcpy(ep->name, wq->name.name, wq->name.len); + ep->name[wq->name.len] = '\0'; + break; + } + /* + * Kernel protocol v5 packet for handling indirect and direct + * mount missing and expire requests + */ + case autofs_ptype_missing_indirect: + case autofs_ptype_expire_indirect: + case autofs_ptype_missing_direct: + case autofs_ptype_expire_direct: + { + struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; + struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; + + pktsz = sizeof(*packet); + + packet->wait_queue_token = wq->wait_queue_token; + packet->len = wq->name.len; + memcpy(packet->name, wq->name.name, wq->name.len); + packet->name[wq->name.len] = '\0'; + packet->dev = wq->dev; + packet->ino = wq->ino; + packet->uid = from_kuid_munged(user_ns, wq->uid); + packet->gid = from_kgid_munged(user_ns, wq->gid); + packet->pid = wq->pid; + packet->tgid = wq->tgid; + break; + } + default: + printk("autofs4_notify_daemon: bad type %d!\n", type); + mutex_unlock(&sbi->wq_mutex); + return; + } + + pipe = get_file(sbi->pipe); + + mutex_unlock(&sbi->wq_mutex); + + if (autofs4_write(sbi, pipe, &pkt, pktsz)) + autofs4_catatonic_mode(sbi); + fput(pipe); +} + +static int autofs4_getpath(struct autofs_sb_info *sbi, + struct dentry *dentry, char **name) +{ + struct dentry *root = sbi->sb->s_root; + struct dentry *tmp; + char *buf; + char *p; + int len; + unsigned seq; + +rename_retry: + buf = *name; + len = 0; + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + spin_lock(&sbi->fs_lock); + for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) + len += tmp->d_name.len + 1; + + if (!len || --len > NAME_MAX) { + spin_unlock(&sbi->fs_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; + return 0; + } + + *(buf + len) = '\0'; + p = buf + len - dentry->d_name.len; + strncpy(p, dentry->d_name.name, dentry->d_name.len); + + for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { + *(--p) = '/'; + p -= tmp->d_name.len; + strncpy(p, tmp->d_name.name, tmp->d_name.len); + } + spin_unlock(&sbi->fs_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; + + return len; +} + +static struct autofs_wait_queue * +autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) +{ + struct autofs_wait_queue *wq; + + for (wq = sbi->queues; wq; wq = wq->next) { + if (wq->name.hash == qstr->hash && + wq->name.len == qstr->len && + wq->name.name && + !memcmp(wq->name.name, qstr->name, qstr->len)) + break; + } + return wq; +} + +/* + * Check if we have a valid request. + * Returns + * 1 if the request should continue. + * In this case we can return an autofs_wait_queue entry if one is + * found or NULL to idicate a new wait needs to be created. + * 0 or a negative errno if the request shouldn't continue. + */ +static int validate_request(struct autofs_wait_queue **wait, + struct autofs_sb_info *sbi, + struct qstr *qstr, + struct dentry*dentry, enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct autofs_info *ino; + + if (sbi->catatonic) + return -ENOENT; + + /* Wait in progress, continue; */ + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + + *wait = NULL; + + /* If we don't yet have any info this is a new request */ + ino = autofs4_dentry_ino(dentry); + if (!ino) + return 1; + + /* + * If we've been asked to wait on an existing expire (NFY_NONE) + * but there is no wait in the queue ... + */ + if (notify == NFY_NONE) { + /* + * Either we've betean the pending expire to post it's + * wait or it finished while we waited on the mutex. + * So we need to wait till either, the wait appears + * or the expire finishes. + */ + + while (ino->flags & AUTOFS_INF_EXPIRING) { + mutex_unlock(&sbi->wq_mutex); + schedule_timeout_interruptible(HZ/10); + if (mutex_lock_interruptible(&sbi->wq_mutex)) + return -EINTR; + + if (sbi->catatonic) + return -ENOENT; + + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + } + + /* + * Not ideal but the status has already gone. Of the two + * cases where we wait on NFY_NONE neither depend on the + * return status of the wait. + */ + return 0; + } + + /* + * If we've been asked to trigger a mount and the request + * completed while we waited on the mutex ... + */ + if (notify == NFY_MOUNT) { + struct dentry *new = NULL; + int valid = 1; + + /* + * If the dentry was successfully mounted while we slept + * on the wait queue mutex we can return success. If it + * isn't mounted (doesn't have submounts for the case of + * a multi-mount with no mount at it's base) we can + * continue on and create a new request. + */ + if (!IS_ROOT(dentry)) { + if (d_really_is_positive(dentry) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + new = d_lookup(parent, &dentry->d_name); + if (new) + dentry = new; + } + } + if (have_submounts(dentry)) + valid = 0; + + if (new) + dput(new); + return valid; + } + + return 1; +} + +int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, + enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct qstr qstr; + char *name; + int status, ret, type; + pid_t pid; + pid_t tgid; + + /* In catatonic mode, we don't wait for nobody */ + if (sbi->catatonic) + return -ENOENT; + + /* + * Try translating pids to the namespace of the daemon. + * + * Zero means failure: we are in an unrelated pid namespace. + */ + pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + if (pid == 0 || tgid == 0) + return -ENOENT; + + if (d_really_is_negative(dentry)) { + /* + * A wait for a negative dentry is invalid for certain + * cases. A direct or offset mount "always" has its mount + * point directory created and so the request dentry must + * be positive or the map key doesn't exist. The situation + * is very similar for indirect mounts except only dentrys + * in the root of the autofs file system may be negative. + */ + if (autofs_type_trigger(sbi->type)) + return -ENOENT; + else if (!IS_ROOT(dentry->d_parent)) + return -ENOENT; + } + + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* If this is a direct mount request create a dummy name */ + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) + qstr.len = sprintf(name, "%p", dentry); + else { + qstr.len = autofs4_getpath(sbi, dentry, &name); + if (!qstr.len) { + kfree(name); + return -ENOENT; + } + } + qstr.name = name; + qstr.hash = full_name_hash(name, qstr.len); + + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(qstr.name); + return -EINTR; + } + + ret = validate_request(&wq, sbi, &qstr, dentry, notify); + if (ret <= 0) { + if (ret != -EINTR) + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + return ret; + } + + if (!wq) { + /* Create a new wait queue */ + wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); + if (!wq) { + kfree(qstr.name); + mutex_unlock(&sbi->wq_mutex); + return -ENOMEM; + } + + wq->wait_queue_token = autofs4_next_wait_queue; + if (++autofs4_next_wait_queue == 0) + autofs4_next_wait_queue = 1; + wq->next = sbi->queues; + sbi->queues = wq; + init_waitqueue_head(&wq->queue); + memcpy(&wq->name, &qstr, sizeof(struct qstr)); + wq->dev = autofs4_get_dev(sbi); + wq->ino = autofs4_get_ino(sbi); + wq->uid = current_uid(); + wq->gid = current_gid(); + wq->pid = pid; + wq->tgid = tgid; + wq->status = -EINTR; /* Status return if interrupted */ + wq->wait_ctr = 2; + + if (sbi->version < 5) { + if (notify == NFY_MOUNT) + type = autofs_ptype_missing; + else + type = autofs_ptype_expire_multi; + } else { + if (notify == NFY_MOUNT) + type = autofs_type_trigger(sbi->type) ? + autofs_ptype_missing_direct : + autofs_ptype_missing_indirect; + else + type = autofs_type_trigger(sbi->type) ? + autofs_ptype_expire_direct : + autofs_ptype_expire_indirect; + } + + DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); + + /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */ + autofs4_notify_daemon(sbi, wq, type); + } else { + wq->wait_ctr++; + DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + } + + /* + * wq->name.name is NULL iff the lock is already released + * or the mount has been made catatonic. + */ + if (wq->name.name) { + /* Block all but "shutdown" signals while waiting */ + sigset_t oldset; + unsigned long irqflags; + + spin_lock_irqsave(¤t->sighand->siglock, irqflags); + oldset = current->blocked; + siginitsetinv(¤t->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); + + wait_event_interruptible(wq->queue, wq->name.name == NULL); + + spin_lock_irqsave(¤t->sighand->siglock, irqflags); + current->blocked = oldset; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); + } else { + DPRINTK("skipped sleeping"); + } + + status = wq->status; + + /* + * For direct and offset mounts we need to track the requester's + * uid and gid in the dentry info struct. This is so it can be + * supplied, on request, by the misc device ioctl interface. + * This is needed during daemon resatart when reconnecting + * to existing, active, autofs mounts. The uid and gid (and + * related string values) may be used for macro substitution + * in autofs mount maps. + */ + if (!status) { + struct autofs_info *ino; + struct dentry *de = NULL; + + /* direct mount or browsable map */ + ino = autofs4_dentry_ino(dentry); + if (!ino) { + /* If not lookup actual dentry used */ + de = d_lookup(dentry->d_parent, &dentry->d_name); + if (de) + ino = autofs4_dentry_ino(de); + } + + /* Set mount requester */ + if (ino) { + spin_lock(&sbi->fs_lock); + ino->uid = wq->uid; + ino->gid = wq->gid; + spin_unlock(&sbi->fs_lock); + } + + if (de) + dput(de); + } + + /* Are we the last process to need status? */ + mutex_lock(&sbi->wq_mutex); + if (!--wq->wait_ctr) + kfree(wq); + mutex_unlock(&sbi->wq_mutex); + + return status; +} + + +int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +{ + struct autofs_wait_queue *wq, **wql; + + mutex_lock(&sbi->wq_mutex); + for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { + if (wq->wait_queue_token == wait_queue_token) + break; + } + + if (!wq) { + mutex_unlock(&sbi->wq_mutex); + return -EINVAL; + } + + *wql = wq->next; /* Unlink from chain */ + kfree(wq->name.name); + wq->name.name = NULL; /* Do not wait on this queue */ + wq->status = status; + wake_up_interruptible(&wq->queue); + if (!--wq->wait_ctr) + kfree(wq); + mutex_unlock(&sbi->wq_mutex); + + return 0; +} + diff --git a/kernel/fs/bad_inode.c b/kernel/fs/bad_inode.c new file mode 100644 index 000000000..861b1e1c4 --- /dev/null +++ b/kernel/fs/bad_inode.c @@ -0,0 +1,214 @@ +/* + * linux/fs/bad_inode.c + * + * Copyright (C) 1997, Stephen Tweedie + * + * Provide stub functions for unreadable inodes + * + * Fabian Frederick : August 2003 - All file operations assigned to EIO + */ + +#include +#include +#include +#include +#include +#include + +static int bad_file_open(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static const struct file_operations bad_file_ops = +{ + .open = bad_file_open, +}; + +static int bad_inode_create (struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return -EIO; +} + +static struct dentry *bad_inode_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + return ERR_PTR(-EIO); +} + +static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, + const char *symname) +{ + return -EIO; +} + +static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + return -EIO; +} + +static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + return -EIO; +} + +static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + return -EIO; +} + +static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + return -EIO; +} + +static int bad_inode_permission(struct inode *inode, int mask) +{ + return -EIO; +} + +static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + return -EIO; +} + +static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) +{ + return -EIO; +} + +static int bad_inode_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EIO; +} + +static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return -EIO; +} + +static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size) +{ + return -EIO; +} + +static int bad_inode_removexattr(struct dentry *dentry, const char *name) +{ + return -EIO; +} + +static const struct inode_operations bad_inode_ops = +{ + .create = bad_inode_create, + .lookup = bad_inode_lookup, + .link = bad_inode_link, + .unlink = bad_inode_unlink, + .symlink = bad_inode_symlink, + .mkdir = bad_inode_mkdir, + .rmdir = bad_inode_rmdir, + .mknod = bad_inode_mknod, + .rename2 = bad_inode_rename2, + .readlink = bad_inode_readlink, + /* follow_link must be no-op, otherwise unmounting this inode + won't work */ + /* put_link returns void */ + /* truncate returns void */ + .permission = bad_inode_permission, + .getattr = bad_inode_getattr, + .setattr = bad_inode_setattr, + .setxattr = bad_inode_setxattr, + .getxattr = bad_inode_getxattr, + .listxattr = bad_inode_listxattr, + .removexattr = bad_inode_removexattr, +}; + + +/* + * When a filesystem is unable to read an inode due to an I/O error in + * its read_inode() function, it can call make_bad_inode() to return a + * set of stubs which will return EIO errors as required. + * + * We only need to do limited initialisation: all other fields are + * preinitialised to zero automatically. + */ + +/** + * make_bad_inode - mark an inode bad due to an I/O error + * @inode: Inode to mark bad + * + * When an inode cannot be read due to a media or remote network + * failure this function makes the inode "bad" and causes I/O operations + * on it to fail from this point on. + */ + +void make_bad_inode(struct inode *inode) +{ + remove_inode_hash(inode); + + inode->i_mode = S_IFREG; + inode->i_atime = inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + inode->i_op = &bad_inode_ops; + inode->i_fop = &bad_file_ops; +} +EXPORT_SYMBOL(make_bad_inode); + +/* + * This tests whether an inode has been flagged as bad. The test uses + * &bad_inode_ops to cover the case of invalidated inodes as well as + * those created by make_bad_inode() above. + */ + +/** + * is_bad_inode - is an inode errored + * @inode: inode to test + * + * Returns true if the inode in question has been marked as bad. + */ + +int is_bad_inode(struct inode *inode) +{ + return (inode->i_op == &bad_inode_ops); +} + +EXPORT_SYMBOL(is_bad_inode); + +/** + * iget_failed - Mark an under-construction inode as dead and release it + * @inode: The inode to discard + * + * Mark an under-construction inode as dead and release it. + */ +void iget_failed(struct inode *inode) +{ + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); +} +EXPORT_SYMBOL(iget_failed); diff --git a/kernel/fs/befs/ChangeLog b/kernel/fs/befs/ChangeLog new file mode 100644 index 000000000..75a461cfa --- /dev/null +++ b/kernel/fs/befs/ChangeLog @@ -0,0 +1,417 @@ +Version 0.92 (2002-03-29) +========== +* Minor cleanup. Ran Lindent on the sources. + +Version 0.92 (2002-03-27) +========== +* Fixed module makefile problem. It was not compiling all the correct + source files! +* Removed duplicated function definition +* Fixed potential null pointer dereference when reporting an error + +Version 0.91 (2002-03-26) +========== +* Oy! Fixed stupid bug that would cause an unresolved symbol error. + Thanks to Laszlo Boszormenyi for pointing this out to me. + +Version 0.9 (2002-03-14) +========== +* Added Sergey S. Kostyliov's patch to eliminate memcpy() overhead + from b+tree operations. Changes the befs_read_datastream() interface. + +* Segregated the functions that interface directly with the linux vfs + interface into their own file called linuxvfs.c. [WD] + +Version 0.64 (2002-02-07) +========== +* Did the string comparison really right this time (btree.c) [WD] + +* Fixed up some places where I assumed that a long int could hold + a pointer value. (btree.c) [WD] + +* Andrew Farnham pointed out that the module + wouldn't work on older (<2.4.10) kernels due to an unresolved symbol. + This is bad, since 2.4.9 is still the current RedHat kernel. I added + a workaround for this problem (compatibility.h) [WD] + +* Sergey S. Kostyliov made befs_find_key() use a binary search to find + keys within btree nodes, rather than the linear search we were using + before. (btree.c) [Sergey S. Kostyliov ] + +* Made a debian package of the source for use with kernel-package. [WD] + + +Version 0.63 (2002-01-31) +========== +* Fixed bug in befs_find_brun_indirect() that would result in the wrong + block being read. It was introduced when adding byteswapping in + 0.61. (datastream.c) [WD] + +* Fixed a longstanding bug in befs_find_key() that would result in it + finding the first key that is a substring of the string it is searching + for. For example, this would cause files in the same directory with + names like file1 and file2 to mysteriously be duplicates of each other + (because they have the same inode number). Many thanks to Pavel Roskin + for reporting this serious bug!!! + (btree.c) [WD] + +* Added support for long symlinks, after Axel Dorfler explained up how + they work. I had forgotten all about them. (inode.c, symlink.c) [WD] + +* Documentation improvements in source. [WD] + +* Makefile fix for independent module when CONFIG_MODVERSION is set in + kernel config [Pavel Roskin ] + +* Compile warning fix for namei.c. [Sergey S. Kostyliov ] + + +Version 0.62 +========== +* Fixed makefile for module install [WD] + + +Version 0.61 (2002-01-20) +========== +* Made functions in endian.h to do the correct byteswapping, no matter + the arch. [WD] + +* Abbandoned silly checks for a NULL superblock pointer in debug.c. [WD] + +* Misc code cleanups. Also cleanup of this changelog file. [WD] + +* Added byteswapping to all metadata reads from disk. + Uses the functions from endian.h [WD] + +* Remove the typedef of struct super_block to vfs_sb, as it offended + certain peoples' aesthetic sense. [WD] + +* Ditto with the befs_read_block() interface. [WD] + + +Version 0.6 (2001-12-15) +========== +* Cleanup of NLS functions (util.c) [WD] + +* Make directory lookup/read use the NLS if an iocharset is provided. [WD] + +* Fixed stupid bug where specifying the uid or gid mount options as '0' + would result in the filesystem using the on-disk uid and gid. [WD] + +* Added mount option to control debug printing. + The option is, simply enough, 'debug'. + (super.c, debug.c) [WD] + +* Removed notion of btree handle from btree.c. It was unnecessary, as the + linux VFS doesn't allow us to keep any state between calls. Updated + dir.c, namei.c befs_fs.h to account for it. [WD] + +* Improved handleing of overflow nodes when listing directories. + Now works for overflow nodes hanging off of nodes other than the root + node. This is the cleaner solution to Brent Miszalaski's problem. [WD] + +* Added new debug/warning/error print functions in debug.c. + More flexible. Will soon be controllable at mount time + (see TODO). [WD] + +* Rewrote datastream position lookups. + (datastream.c) [WD] + +* Moved the TODO list to its own file. + + +Version 0.50 (2001-11-13) +========== +* Added workaround for mis-understanding of the nature of the b+trees used + in directories. A cleaner solution will come after I've thought about it + for a while. Thanks to Brent Miszalaski for finding and reporting this bug. + (btree.c) [WD] + +* Minor cleanups + +* Added test for "impossible" condition of empty internal nodes in + seekleaf() in btree.c [WD] + +* Implemented the abstracted read_block() in io.c [WD] + +* Cleaned up the inode validation in inode.c [WD] + +* Anton Altaparmakov figured out (by asking Linus :) ) what was causing the + hanging disk io problem. It turns out you need to have the sync_pages + callback defined in your address_space_ops, even if it just uses the + default linux-supplied implementation. Fixed. Works now. + (file.c) [WD] + +* Anton Altaparmakov and Christoph Hellwig alerted me to the fact that + filesystem code should be using GFP_NOFS instead of GFP_KERNEL as the + priority parameter to kmalloc(). Fixed. + (datastream.c, btree.c super.c inode.c) [WD] + +* Anton also told me that the blocksize is not allowed to be larger than + the page size in linux, which is 4k i386. Oops. Added a test for + (blocksize > PAGE_SIZE), and refuse to mount in that case. What this + practically means is that 8k blocksize volumes won't work without a major + restructuring of the driver (or an alpha or other 64bit hardware). [WD] + +* Cleaned up the befs_count_blocks() function. Much smarter now. + And somewhat smaller too. [WD] + +* Made inode allocations use a slab cache + (super.c inode.c) [WD] + +* Moved the freeing of the private inode section from put_inode() to + clear_inode(). This fixes a potential free twice type bug. Put_inode() + can be called multiple times for each inode struct. [WD] + +* Converted all non vfs-callback functions to use befs_sb_info as the + superblock type, rather than struct super_block. This is for + portablity. [WD] + +* Fixed a couple of compile warnings due to use of malloc.h, when slab.h + is the new way. (inode.c, super.c) [WD] + +* Fixed erronous includes of linux/befs_fs_i.h and linux/befs_fs_sb.h + in inode.c [WD] + +Version 0.45 (2001-10-29) +========== +* Added functions to get the private superblock and inode structures from + their enclosing public structures. Switched all references to the + private portions to use them. (many files) [WD] + +* Made read_super and read_inode allocate the private portions of those + structures into the generic pointer fields of the public structures + with kmalloc(). put_super and put_inode free them. This allows us not + to have to touch the definitions of the public structures in + include/linux/fs.h. Also, befs_inode_info is huge (because of the + symlink string). (super.c, inode.c, befs_fs.h) [WD] + +* Fixed a thinko that was corrupting file reads after the first block_run + is done being read. (datastream.c) [WD] + +* Removed fsync() hooks, since a read-only filesystem doesn't need them. + [Christoph Hellwig]. + +* Fixed befs_readlink() (symlink.c) [Christoph Hellwig]. + +* Removed all the Read-Write stuff. I'll redo it when it is time to add + write support (various files) [WD]. + +* Removed prototypes for functions who's definitions have been removed + (befs_fs.h) [WD]. + + +Version 0.4 (2001-10-28) +========== +* Made it an option to use the old non-pagecache befs_file_read() for + testing purposes. (fs/Config.in) + +* Fixed unused variable warnings when compiling without debugging. + +* Fixed a bug where the inode and super_block didn't get their blockbits + fields set (inode.c and super.c). + +* Release patch version 11. AKA befs-driver version 0.4. + +* Thats right. New versioning scheme. + I've done some serious testing on it now (on my box anyhow), and it + seems stable and not outragously slow. Existing features are more-or-less + correct (see TODO list). But it isn't 1.0 yet. I think 0.4 gives me some + headroom before the big 1.0. + + +2001-10-26 +========== +* Fixed date format in this file. Was I smoking crack? + +* Removed old datastream code from file.c, since it is nolonger used. + +* Generic_read_file() is now used to read regular file data. + It doesn't chew up the buffer cache (it does page io instead), and seems + to be about as fast (even though it has to look up each file block + indivdualy). And it knows about doing readahead, which is a major plus. + So it does i/o in much larger chunks. It is the correct linux way. It + uses befs_get_block() by way of befs_readpage() to find the disk offsets + of blocks, which in turn calls befs_fpos2brun() in datastream.c to do + the hard work of finding the disk block number. + +* Changed method of checking for a dirty filesystem in befs_read_super + (super.c). Now we check to see if log_start and log_end differ. If so, + the journal needs to be replayed, and the filesystem cannot be mounted. + +* Fixed an extra instance of MOD_DEC_USE_COUNT in super.c + +* Fixed a problem with reading the superblock on devices with large sector + sizes (such as cdroms) on linux 2.4.10 and up. + +2001-10-24 +========== +* Fix nasty bug in converting block numbers to struct befs_inode_addr. + Subtle, because the old version was only sometimes wrong. + Probably responsible for lots of problems. (inode.c) + +* Fix bug with reading an empty directory. (btree.c and dir.c) + +* This one looks good. Release patch version 10 + +2001-10-23 +========== +* Added btree searching function. + +* Use befs_btree_find in befs_lookup (namei.c) + +* Additional comments in btree.c + +2001-10-22 +========== +* Added B+tree reading functions (in btree.c). + Made befs_readdir() use them them instead of the cruft in index.c. + +2001-09-11 +========== +* Converted befs_read_file() to use the new datastream code. + +* Finally updated the README file. + +* Added many comments. + +* Posted version 6 + +* Removed byte-order conversion code. + I have no intention of supporting it, and it was very ugly. + Flow control with #ifdef (ugh). Maybe I'll redo it once + native byteorder works 100%. + +2001-09-10 +========== +* Finished implementing read_datastream() + +* made befs_read_brun() more general + Supports an offset to start at and a max bytes to read + Added a wrapper function to give the old call + +2001-09-30 +========== +* Discovered that the datastream handleing code in file.c is quite deficient + in several respects. For one thing, it doesn't deal with indirect blocks + +* Rewrote datastream handleing. + +* Created io.c, for io related functions. + Previously, the befs_bread() funtions lived in file.c + Created the befs_read_brun() function. + + +2001-09-07 +========== +* Made a function to actually count the number of fs blocks used by a file. + And helper functions. + (fs/befs/inode.c) + +2001-09-05 +========== +* Fixed a misunderstanding of the inode fields. + This fixed the problmem with wrong file sizes from du and others. + The i_blocks field of the inode struct is not the number of blocks for the + inode, it is the number of blocks for the file. Also, i_blksize is not + necessarily the size of the inode, although in practice it works out. + Changed to blocksize of filesystem. + (fs/befs/inode.c) + +* Permanently removed code that had been provisionally ifdefed out of befs_fs.h + +* Since we don't support access time, make that field zero, instead of + copying m_time. + (fs/befs/inode.c) + +* Added sanity check for inode reading + Make sure inode we got was the one we asked for. + (fs/befs/inode.c) + +* Code cleanup + Local pointers to commonly used structures in inode.c. + Got rid of abominations befs_iaddr2inode() and befs_inode2ino(). + Replaced with single function iaddr2blockno(). + (fs/befs/super.c) (fs/befs/inode.c) + +2001-09-01 +========== +* Fixed the problem with statfs where it would always claim the disk was + half full, due to improper understanding of the statfs fields. + (fs/befs/super.c) + +* Posted verion 4 of the patch + +2001-09-01 +========== +* Changed the macros in befs_fs.h to inline functions. + More readable. Typesafe. Better + (include/linux/befs_fs.h) + +* Moved type definitions from befs_fs.h to a new file, befs_fs_types.h + Because befs_fs_i.h and befs_fs_sb.h were including befs_fs.h for the + typedefs, and they are inlcuded in , which has definitions + that I want the inline functions in befs_fs.h to be able to see. Nasty + circularity. + (include/linux/befs_fs.h) + +2001-08-30 +========== +* Cleaned up some wording. + +* Added additional consitency checks on mount + Check block_size agrees with block_shift + Check flags == BEFS_CLEAN + (fs/befs/super.c) + +* Tell the kernel to only mount befs read-only. + By setting the MS_RDONLY flag in befs_read_super(). + Not that it was possible to write before. But now the kernel won't even try. + (fs/befs/super.c) + +* Got rid of kernel warning on mount. + The kernel doesn't like it if you call set_blocksize() on a device when + you have some of its blocks open. Moved the second set_blocksize() to the + very end of befs_read_super(), after we are done with the disk superblock. + (fs/befs/super.c) + +* Fixed wrong number of args bug in befs_dump_inode + (fs/befs/debug.c) + +* Solved lots of type mismatches in kprint()s + (everwhere) + +2001-08-27 +========== +* Cleaned up the fs/Config.in entries a bit, now slightly more descriptive. + +* BeFS depends on NLS, so I made activating BeFS enable the NLS questions + (fs/nls/Config.in) + +* Added Configure.help entries for CONFIG_BEFS_FS and CONFIG_DEBUG_BEFS + (Documentation/Configure.help) + +2001-08-?? +========== +* Removed superblock locking calls in befs_read_super(). In 2.4, the VFS + hands us a super_block struct that is already locked. + +2001-08-13 +========== +* Will Dyson is now attempting to maintain this module + Makoto Kato is original author.Daniel Berlin + also did some work on it (fixing it up for the later 2.3.x kernels, IIRC). + +* Fixed compile errors on 2.4.1 kernel (WD) + Resolve rejected patches + Accommodate changed NLS interface (util.h) + Needed to include in most files + Makefile changes + fs/Config.in changes + +* Tried to niceify the code using the ext2 fs as a guide + Declare befs_fs_type using the DECLARE_FSTYPE_DEV() macro + +* Made it a configure option to turn on debugging (fs/Config.in) + +* Compiles on 2.4.7 diff --git a/kernel/fs/befs/Kconfig b/kernel/fs/befs/Kconfig new file mode 100644 index 000000000..edc5cc2ae --- /dev/null +++ b/kernel/fs/befs/Kconfig @@ -0,0 +1,26 @@ +config BEFS_FS + tristate "BeOS file system (BeFS) support (read only)" + depends on BLOCK + select NLS + help + The BeOS File System (BeFS) is the native file system of Be, Inc's + BeOS. Notable features include support for arbitrary attributes + on files and directories, and database-like indices on selected + attributes. (Also note that this driver doesn't make those features + available at this time). It is a 64 bit filesystem, so it supports + extremely large volumes and files. + + If you use this filesystem, you should also say Y to at least one + of the NLS (native language support) options below. + + If you don't know what this is about, say N. + + To compile this as a module, choose M here: the module will be + called befs. + +config BEFS_DEBUG + bool "Debug BeFS" + depends on BEFS_FS + help + If you say Y here, you can use the 'debug' mount option to enable + debugging output from the driver. diff --git a/kernel/fs/befs/Makefile b/kernel/fs/befs/Makefile new file mode 100644 index 000000000..8b9f66642 --- /dev/null +++ b/kernel/fs/befs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux BeOS filesystem routines. +# + +obj-$(CONFIG_BEFS_FS) += befs.o +ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG +befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o diff --git a/kernel/fs/befs/TODO b/kernel/fs/befs/TODO new file mode 100644 index 000000000..3250921aa --- /dev/null +++ b/kernel/fs/befs/TODO @@ -0,0 +1,14 @@ +TODO +========== + +* Convert comments to the Kernel-Doc format. + +* Befs_fs.h has gotten big and messy. No reason not to break it up into + smaller peices. + +* See if Alexander Viro's option parser made it into the kernel tree. + Use that if we can. (include/linux/parser.h) + +* See if we really need separate types for on-disk and in-memory + representations of the superblock and inode. + diff --git a/kernel/fs/befs/befs.h b/kernel/fs/befs/befs.h new file mode 100644 index 000000000..1fead8d56 --- /dev/null +++ b/kernel/fs/befs/befs.h @@ -0,0 +1,157 @@ +/* + * befs.h + * + * Copyright (C) 2001-2002 Will Dyson + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + */ + +#ifndef _LINUX_BEFS_H +#define _LINUX_BEFS_H + +#include "befs_fs_types.h" + +/* used in debug.c */ +#define BEFS_VERSION "0.9.3" + + +typedef u64 befs_blocknr_t; +/* + * BeFS in memory structures + */ + +struct befs_mount_options { + kgid_t gid; + kuid_t uid; + int use_gid; + int use_uid; + int debug; + char *iocharset; +}; + +struct befs_sb_info { + u32 magic1; + u32 block_size; + u32 block_shift; + int byte_order; + befs_off_t num_blocks; + befs_off_t used_blocks; + u32 inode_size; + u32 magic2; + + /* Allocation group information */ + u32 blocks_per_ag; + u32 ag_shift; + u32 num_ags; + + /* jornal log entry */ + befs_block_run log_blocks; + befs_off_t log_start; + befs_off_t log_end; + + befs_inode_addr root_dir; + befs_inode_addr indices; + u32 magic3; + + struct befs_mount_options mount_opts; + struct nls_table *nls; +}; + +struct befs_inode_info { + u32 i_flags; + u32 i_type; + + befs_inode_addr i_inode_num; + befs_inode_addr i_parent; + befs_inode_addr i_attribute; + + union { + befs_data_stream ds; + char symlink[BEFS_SYMLINK_LEN]; + } i_data; + + struct inode vfs_inode; +}; + +enum befs_err { + BEFS_OK, + BEFS_ERR, + BEFS_BAD_INODE, + BEFS_BT_END, + BEFS_BT_EMPTY, + BEFS_BT_MATCH, + BEFS_BT_PARMATCH, + BEFS_BT_NOT_FOUND +}; + + +/****************************/ +/* debug.c */ +__printf(2, 3) +void befs_error(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) +void befs_warning(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) +void befs_debug(const struct super_block *sb, const char *fmt, ...); + +void befs_dump_super_block(const struct super_block *sb, befs_super_block *); +void befs_dump_inode(const struct super_block *sb, befs_inode *); +void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *); +void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *); +/****************************/ + + +/* Gets a pointer to the private portion of the super_block + * structure from the public part + */ +static inline struct befs_sb_info * +BEFS_SB(const struct super_block *super) +{ + return (struct befs_sb_info *) super->s_fs_info; +} + +static inline struct befs_inode_info * +BEFS_I(const struct inode *inode) +{ + return list_entry(inode, struct befs_inode_info, vfs_inode); +} + +static inline befs_blocknr_t +iaddr2blockno(struct super_block *sb, befs_inode_addr * iaddr) +{ + return ((iaddr->allocation_group << BEFS_SB(sb)->ag_shift) + + iaddr->start); +} + +static inline befs_inode_addr +blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno) +{ + befs_inode_addr iaddr; + iaddr.allocation_group = blockno >> BEFS_SB(sb)->ag_shift; + iaddr.start = + blockno - (iaddr.allocation_group << BEFS_SB(sb)->ag_shift); + iaddr.len = 1; + + return iaddr; +} + +static inline unsigned int +befs_iaddrs_per_block(struct super_block *sb) +{ + return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr); +} + +static inline int +befs_iaddr_is_empty(befs_inode_addr * iaddr) +{ + return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len); +} + +static inline size_t +befs_brun_size(struct super_block *sb, befs_block_run run) +{ + return BEFS_SB(sb)->block_size * run.len; +} + +#include "endian.h" + +#endif /* _LINUX_BEFS_H */ diff --git a/kernel/fs/befs/befs_fs_types.h b/kernel/fs/befs/befs_fs_types.h new file mode 100644 index 000000000..eb557d9dc --- /dev/null +++ b/kernel/fs/befs/befs_fs_types.h @@ -0,0 +1,255 @@ +/* + * fs/befs/befs_fs_types.h + * + * Copyright (C) 2001 Will Dyson (will@cs.earlham.edu) + * + * + * + * from linux/include/linux/befs_fs.h + * + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + * + */ + +#ifndef _LINUX_BEFS_FS_TYPES +#define _LINUX_BEFS_FS_TYPES + +#ifdef __KERNEL__ +#include +#endif /*__KERNEL__*/ + +#define PACKED __attribute__ ((__packed__)) + +/* + * Max name lengths of BFS + */ + +#define BEFS_NAME_LEN 255 + +#define BEFS_SYMLINK_LEN 144 +#define BEFS_NUM_DIRECT_BLOCKS 12 +#define B_OS_NAME_LENGTH 32 + +/* The datastream blocks mapped by the double-indirect + * block are always 4 fs blocks long. + * This eliminates the need for linear searches among + * the potentially huge number of indirect blocks + * + * Err. Should that be 4 fs blocks or 4k??? + * It matters on large blocksize volumes + */ +#define BEFS_DBLINDIR_BRUN_LEN 4 + +/* + * Flags of superblock + */ + +enum super_flags { + BEFS_BYTESEX_BE, + BEFS_BYTESEX_LE, + BEFS_CLEAN = 0x434c454e, + BEFS_DIRTY = 0x44495254, + BEFS_SUPER_MAGIC1 = 0x42465331, /* BFS1 */ + BEFS_SUPER_MAGIC2 = 0xdd121031, + BEFS_SUPER_MAGIC3 = 0x15b6830e, +}; + +#define BEFS_BYTEORDER_NATIVE 0x42494745 +#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE) +#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE) + +#define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1 +#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1) +#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1) + +/* + * Flags of inode + */ + +#define BEFS_INODE_MAGIC1 0x3bbe0ad9 + +enum inode_flags { + BEFS_INODE_IN_USE = 0x00000001, + BEFS_ATTR_INODE = 0x00000004, + BEFS_INODE_LOGGED = 0x00000008, + BEFS_INODE_DELETED = 0x00000010, + BEFS_LONG_SYMLINK = 0x00000040, + BEFS_PERMANENT_FLAG = 0x0000ffff, + BEFS_INODE_NO_CREATE = 0x00010000, + BEFS_INODE_WAS_WRITTEN = 0x00020000, + BEFS_NO_TRANSACTION = 0x00040000, +}; +/* + * On-Disk datastructures of BeFS + */ + +typedef u64 __bitwise fs64; +typedef u32 __bitwise fs32; +typedef u16 __bitwise fs16; + +typedef u64 befs_off_t; +typedef fs64 befs_time_t; + +/* Block runs */ +typedef struct { + fs32 allocation_group; + fs16 start; + fs16 len; +} PACKED befs_disk_block_run; + +typedef struct { + u32 allocation_group; + u16 start; + u16 len; +} PACKED befs_block_run; + +typedef befs_disk_block_run befs_disk_inode_addr; +typedef befs_block_run befs_inode_addr; + +/* + * The Superblock Structure + */ +typedef struct { + char name[B_OS_NAME_LENGTH]; + fs32 magic1; + fs32 fs_byte_order; + + fs32 block_size; + fs32 block_shift; + + fs64 num_blocks; + fs64 used_blocks; + + fs32 inode_size; + + fs32 magic2; + fs32 blocks_per_ag; + fs32 ag_shift; + fs32 num_ags; + + fs32 flags; + + befs_disk_block_run log_blocks; + fs64 log_start; + fs64 log_end; + + fs32 magic3; + befs_disk_inode_addr root_dir; + befs_disk_inode_addr indices; + +} PACKED befs_super_block; + +/* + * Note: the indirect and dbl_indir block_runs may + * be longer than one block! + */ +typedef struct { + befs_disk_block_run direct[BEFS_NUM_DIRECT_BLOCKS]; + fs64 max_direct_range; + befs_disk_block_run indirect; + fs64 max_indirect_range; + befs_disk_block_run double_indirect; + fs64 max_double_indirect_range; + fs64 size; +} PACKED befs_disk_data_stream; + +typedef struct { + befs_block_run direct[BEFS_NUM_DIRECT_BLOCKS]; + befs_off_t max_direct_range; + befs_block_run indirect; + befs_off_t max_indirect_range; + befs_block_run double_indirect; + befs_off_t max_double_indirect_range; + befs_off_t size; +} PACKED befs_data_stream; + +/* Attribute */ +typedef struct { + fs32 type; + fs16 name_size; + fs16 data_size; + char name[1]; +} PACKED befs_small_data; + +/* Inode structure */ +typedef struct { + fs32 magic1; + befs_disk_inode_addr inode_num; + fs32 uid; + fs32 gid; + fs32 mode; + fs32 flags; + befs_time_t create_time; + befs_time_t last_modified_time; + befs_disk_inode_addr parent; + befs_disk_inode_addr attributes; + fs32 type; + + fs32 inode_size; + fs32 etc; /* not use */ + + union { + befs_disk_data_stream datastream; + char symlink[BEFS_SYMLINK_LEN]; + } data; + + fs32 pad[4]; /* not use */ + befs_small_data small_data[1]; +} PACKED befs_inode; + +/* + * B+tree superblock + */ + +#define BEFS_BTREE_MAGIC 0x69f6c2e8 + +enum btree_types { + BTREE_STRING_TYPE = 0, + BTREE_INT32_TYPE = 1, + BTREE_UINT32_TYPE = 2, + BTREE_INT64_TYPE = 3, + BTREE_UINT64_TYPE = 4, + BTREE_FLOAT_TYPE = 5, + BTREE_DOUBLE_TYPE = 6 +}; + +typedef struct { + fs32 magic; + fs32 node_size; + fs32 max_depth; + fs32 data_type; + fs64 root_node_ptr; + fs64 free_node_ptr; + fs64 max_size; +} PACKED befs_disk_btree_super; + +typedef struct { + u32 magic; + u32 node_size; + u32 max_depth; + u32 data_type; + befs_off_t root_node_ptr; + befs_off_t free_node_ptr; + befs_off_t max_size; +} PACKED befs_btree_super; + +/* + * Header structure of each btree node + */ +typedef struct { + fs64 left; + fs64 right; + fs64 overflow; + fs16 all_key_count; + fs16 all_key_length; +} PACKED befs_btree_nodehead; + +typedef struct { + befs_off_t left; + befs_off_t right; + befs_off_t overflow; + u16 all_key_count; + u16 all_key_length; +} PACKED befs_host_btree_nodehead; + +#endif /* _LINUX_BEFS_FS_TYPES */ diff --git a/kernel/fs/befs/btree.c b/kernel/fs/befs/btree.c new file mode 100644 index 000000000..0826e91da --- /dev/null +++ b/kernel/fs/befs/btree.c @@ -0,0 +1,793 @@ +/* + * linux/fs/befs/btree.c + * + * Copyright (C) 2001-2002 Will Dyson + * + * Licensed under the GNU GPL. See the file COPYING for details. + * + * 2002-02-05: Sergey S. Kostyliov added binary search within + * btree nodes. + * + * Many thanks to: + * + * Dominic Giampaolo, author of "Practical File System + * Design with the Be File System", for such a helpful book. + * + * Marcus J. Ranum, author of the b+tree package in + * comp.sources.misc volume 10. This code is not copied from that + * work, but it is partially based on it. + * + * Makoto Kato, author of the original BeFS for linux filesystem + * driver. + */ + +#include +#include +#include +#include +#include + +#include "befs.h" +#include "btree.h" +#include "datastream.h" + +/* + * The btree functions in this file are built on top of the + * datastream.c interface, which is in turn built on top of the + * io.c interface. + */ + +/* Befs B+tree structure: + * + * The first thing in the tree is the tree superblock. It tells you + * all kinds of useful things about the tree, like where the rootnode + * is located, and the size of the nodes (always 1024 with current version + * of BeOS). + * + * The rest of the tree consists of a series of nodes. Nodes contain a header + * (struct befs_btree_nodehead), the packed key data, an array of shorts + * containing the ending offsets for each of the keys, and an array of + * befs_off_t values. In interior nodes, the keys are the ending keys for + * the childnode they point to, and the values are offsets into the + * datastream containing the tree. + */ + +/* Note: + * + * The book states 2 confusing things about befs b+trees. First, + * it states that the overflow field of node headers is used by internal nodes + * to point to another node that "effectively continues this one". Here is what + * I believe that means. Each key in internal nodes points to another node that + * contains key values less than itself. Inspection reveals that the last key + * in the internal node is not the last key in the index. Keys that are + * greater than the last key in the internal node go into the overflow node. + * I imagine there is a performance reason for this. + * + * Second, it states that the header of a btree node is sufficient to + * distinguish internal nodes from leaf nodes. Without saying exactly how. + * After figuring out the first, it becomes obvious that internal nodes have + * overflow nodes and leafnodes do not. + */ + +/* + * Currently, this code is only good for directory B+trees. + * In order to be used for other BFS indexes, it needs to be extended to handle + * duplicate keys and non-string keytypes (int32, int64, float, double). + */ + +/* + * In memory structure of each btree node + */ +struct befs_btree_node { + befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */ + struct buffer_head *bh; + befs_btree_nodehead *od_node; /* on disk node */ +}; + +/* local constants */ +static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; + +/* local functions */ +static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * bt_super, + struct befs_btree_node *this_node, + befs_off_t * node_off); + +static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * sup); + +static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, + struct befs_btree_node *node, + befs_off_t node_off); + +static int befs_leafnode(struct befs_btree_node *node); + +static fs16 *befs_bt_keylen_index(struct befs_btree_node *node); + +static fs64 *befs_bt_valarray(struct befs_btree_node *node); + +static char *befs_bt_keydata(struct befs_btree_node *node); + +static int befs_find_key(struct super_block *sb, + struct befs_btree_node *node, + const char *findkey, befs_off_t * value); + +static char *befs_bt_get_key(struct super_block *sb, + struct befs_btree_node *node, + int index, u16 * keylen); + +static int befs_compare_strings(const void *key1, int keylen1, + const void *key2, int keylen2); + +/** + * befs_bt_read_super - read in btree superblock convert to cpu byteorder + * @sb: Filesystem superblock + * @ds: Datastream to read from + * @sup: Buffer in which to place the btree superblock + * + * Calls befs_read_datastream to read in the btree superblock and + * makes sure it is in cpu byteorder, byteswapping if necessary. + * + * On success, returns BEFS_OK and *@sup contains the btree superblock, + * in cpu byte order. + * + * On failure, BEFS_ERR is returned. + */ +static int +befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * sup) +{ + struct buffer_head *bh = NULL; + befs_disk_btree_super *od_sup = NULL; + + befs_debug(sb, "---> %s", __func__); + + bh = befs_read_datastream(sb, ds, 0, NULL); + + if (!bh) { + befs_error(sb, "Couldn't read index header."); + goto error; + } + od_sup = (befs_disk_btree_super *) bh->b_data; + befs_dump_index_entry(sb, od_sup); + + sup->magic = fs32_to_cpu(sb, od_sup->magic); + sup->node_size = fs32_to_cpu(sb, od_sup->node_size); + sup->max_depth = fs32_to_cpu(sb, od_sup->max_depth); + sup->data_type = fs32_to_cpu(sb, od_sup->data_type); + sup->root_node_ptr = fs64_to_cpu(sb, od_sup->root_node_ptr); + sup->free_node_ptr = fs64_to_cpu(sb, od_sup->free_node_ptr); + sup->max_size = fs64_to_cpu(sb, od_sup->max_size); + + brelse(bh); + if (sup->magic != BEFS_BTREE_MAGIC) { + befs_error(sb, "Index header has bad magic."); + goto error; + } + + befs_debug(sb, "<--- %s", __func__); + return BEFS_OK; + + error: + befs_debug(sb, "<--- %s ERROR", __func__); + return BEFS_ERR; +} + +/** + * befs_bt_read_node - read in btree node and convert to cpu byteorder + * @sb: Filesystem superblock + * @ds: Datastream to read from + * @node: Buffer in which to place the btree node + * @node_off: Starting offset (in bytes) of the node in @ds + * + * Calls befs_read_datastream to read in the indicated btree node and + * makes sure its header fields are in cpu byteorder, byteswapping if + * necessary. + * Note: node->bh must be NULL when this function called first + * time. Don't forget brelse(node->bh) after last call. + * + * On success, returns BEFS_OK and *@node contains the btree node that + * starts at @node_off, with the node->head fields in cpu byte order. + * + * On failure, BEFS_ERR is returned. + */ + +static int +befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, + struct befs_btree_node *node, befs_off_t node_off) +{ + uint off = 0; + + befs_debug(sb, "---> %s", __func__); + + if (node->bh) + brelse(node->bh); + + node->bh = befs_read_datastream(sb, ds, node_off, &off); + if (!node->bh) { + befs_error(sb, "%s failed to read " + "node at %llu", __func__, node_off); + befs_debug(sb, "<--- %s ERROR", __func__); + + return BEFS_ERR; + } + node->od_node = + (befs_btree_nodehead *) ((void *) node->bh->b_data + off); + + befs_dump_index_node(sb, node->od_node); + + node->head.left = fs64_to_cpu(sb, node->od_node->left); + node->head.right = fs64_to_cpu(sb, node->od_node->right); + node->head.overflow = fs64_to_cpu(sb, node->od_node->overflow); + node->head.all_key_count = + fs16_to_cpu(sb, node->od_node->all_key_count); + node->head.all_key_length = + fs16_to_cpu(sb, node->od_node->all_key_length); + + befs_debug(sb, "<--- %s", __func__); + return BEFS_OK; +} + +/** + * befs_btree_find - Find a key in a befs B+tree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @key: Key string to lookup in btree + * @value: Value stored with @key + * + * On success, returns BEFS_OK and sets *@value to the value stored + * with @key (usually the disk block number of an inode). + * + * On failure, returns BEFS_ERR or BEFS_BT_NOT_FOUND. + * + * Algorithm: + * Read the superblock and rootnode of the b+tree. + * Drill down through the interior nodes using befs_find_key(). + * Once at the correct leaf node, use befs_find_key() again to get the + * actuall value stored with the key. + */ +int +befs_btree_find(struct super_block *sb, befs_data_stream * ds, + const char *key, befs_off_t * value) +{ + struct befs_btree_node *this_node = NULL; + befs_btree_super bt_super; + befs_off_t node_off; + int res; + + befs_debug(sb, "---> %s Key: %s", __func__, key); + + if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { + befs_error(sb, + "befs_btree_find() failed to read index superblock"); + goto error; + } + + this_node = kmalloc(sizeof(struct befs_btree_node), + GFP_NOFS); + if (!this_node) { + befs_error(sb, "befs_btree_find() failed to allocate %zu " + "bytes of memory", sizeof(struct befs_btree_node)); + goto error; + } + + this_node->bh = NULL; + + /* read in root node */ + node_off = bt_super.root_node_ptr; + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_find() failed to read " + "node at %llu", node_off); + goto error_alloc; + } + + while (!befs_leafnode(this_node)) { + res = befs_find_key(sb, this_node, key, &node_off); + if (res == BEFS_BT_NOT_FOUND) + node_off = this_node->head.overflow; + /* if no match, go to overflow node */ + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_find() failed to read " + "node at %llu", node_off); + goto error_alloc; + } + } + + /* at the correct leaf node now */ + + res = befs_find_key(sb, this_node, key, value); + + brelse(this_node->bh); + kfree(this_node); + + if (res != BEFS_BT_MATCH) { + befs_debug(sb, "<--- %s Key %s not found", __func__, key); + *value = 0; + return BEFS_BT_NOT_FOUND; + } + befs_debug(sb, "<--- %s Found key %s, value %llu", __func__, + key, *value); + return BEFS_OK; + + error_alloc: + kfree(this_node); + error: + *value = 0; + befs_debug(sb, "<--- %s ERROR", __func__); + return BEFS_ERR; +} + +/** + * befs_find_key - Search for a key within a node + * @sb: Filesystem superblock + * @node: Node to find the key within + * @findkey: Keystring to search for + * @value: If key is found, the value stored with the key is put here + * + * finds exact match if one exists, and returns BEFS_BT_MATCH + * If no exact match, finds first key in node that is greater + * (alphabetically) than the search key and returns BEFS_BT_PARMATCH + * (for partial match, I guess). Can you think of something better to + * call it? + * + * If no key was a match or greater than the search key, return + * BEFS_BT_NOT_FOUND. + * + * Use binary search instead of a linear. + */ +static int +befs_find_key(struct super_block *sb, struct befs_btree_node *node, + const char *findkey, befs_off_t * value) +{ + int first, last, mid; + int eq; + u16 keylen; + int findkey_len; + char *thiskey; + fs64 *valarray; + + befs_debug(sb, "---> %s %s", __func__, findkey); + + *value = 0; + + findkey_len = strlen(findkey); + + /* if node can not contain key, just skeep this node */ + last = node->head.all_key_count - 1; + thiskey = befs_bt_get_key(sb, node, last, &keylen); + + eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); + if (eq < 0) { + befs_debug(sb, "<--- %s %s not found", __func__, findkey); + return BEFS_BT_NOT_FOUND; + } + + valarray = befs_bt_valarray(node); + + /* simple binary search */ + first = 0; + mid = 0; + while (last >= first) { + mid = (last + first) / 2; + befs_debug(sb, "first: %d, last: %d, mid: %d", first, last, + mid); + thiskey = befs_bt_get_key(sb, node, mid, &keylen); + eq = befs_compare_strings(thiskey, keylen, findkey, + findkey_len); + + if (eq == 0) { + befs_debug(sb, "<--- %s found %s at %d", + __func__, thiskey, mid); + + *value = fs64_to_cpu(sb, valarray[mid]); + return BEFS_BT_MATCH; + } + if (eq > 0) + last = mid - 1; + else + first = mid + 1; + } + if (eq < 0) + *value = fs64_to_cpu(sb, valarray[mid + 1]); + else + *value = fs64_to_cpu(sb, valarray[mid]); + befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid); + return BEFS_BT_PARMATCH; +} + +/** + * befs_btree_read - Traverse leafnodes of a btree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @key_no: Key number (alphabetical order) of key to read + * @bufsize: Size of the buffer to return key in + * @keybuf: Pointer to a buffer to put the key in + * @keysize: Length of the returned key + * @value: Value stored with the returned key + * + * Heres how it works: Key_no is the index of the key/value pair to + * return in keybuf/value. + * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is + * the number of characters in the key (just a convenience). + * + * Algorithm: + * Get the first leafnode of the tree. See if the requested key is in that + * node. If not, follow the node->right link to the next leafnode. Repeat + * until the (key_no)th key is found or the tree is out of keys. + */ +int +befs_btree_read(struct super_block *sb, befs_data_stream * ds, + loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize, + befs_off_t * value) +{ + struct befs_btree_node *this_node; + befs_btree_super bt_super; + befs_off_t node_off = 0; + int cur_key; + fs64 *valarray; + char *keystart; + u16 keylen; + int res; + + uint key_sum = 0; + + befs_debug(sb, "---> %s", __func__); + + if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { + befs_error(sb, + "befs_btree_read() failed to read index superblock"); + goto error; + } + + this_node = kmalloc(sizeof(struct befs_btree_node), GFP_NOFS); + if (this_node == NULL) { + befs_error(sb, "befs_btree_read() failed to allocate %zu " + "bytes of memory", sizeof(struct befs_btree_node)); + goto error; + } + + node_off = bt_super.root_node_ptr; + this_node->bh = NULL; + + /* seeks down to first leafnode, reads it into this_node */ + res = befs_btree_seekleaf(sb, ds, &bt_super, this_node, &node_off); + if (res == BEFS_BT_EMPTY) { + brelse(this_node->bh); + kfree(this_node); + *value = 0; + *keysize = 0; + befs_debug(sb, "<--- %s Tree is EMPTY", __func__); + return BEFS_BT_EMPTY; + } else if (res == BEFS_ERR) { + goto error_alloc; + } + + /* find the leaf node containing the key_no key */ + + while (key_sum + this_node->head.all_key_count <= key_no) { + + /* no more nodes to look in: key_no is too large */ + if (this_node->head.right == befs_bt_inval) { + *keysize = 0; + *value = 0; + befs_debug(sb, + "<--- %s END of keys at %llu", __func__, + (unsigned long long) + key_sum + this_node->head.all_key_count); + brelse(this_node->bh); + kfree(this_node); + return BEFS_BT_END; + } + + key_sum += this_node->head.all_key_count; + node_off = this_node->head.right; + + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "%s failed to read node at %llu", + __func__, (unsigned long long)node_off); + goto error_alloc; + } + } + + /* how many keys into this_node is key_no */ + cur_key = key_no - key_sum; + + /* get pointers to datastructures within the node body */ + valarray = befs_bt_valarray(this_node); + + keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen); + + befs_debug(sb, "Read [%llu,%d]: keysize %d", + (long long unsigned int)node_off, (int)cur_key, + (int)keylen); + + if (bufsize < keylen + 1) { + befs_error(sb, "%s keybuf too small (%zu) " + "for key of size %d", __func__, bufsize, keylen); + brelse(this_node->bh); + goto error_alloc; + } + + strlcpy(keybuf, keystart, keylen + 1); + *value = fs64_to_cpu(sb, valarray[cur_key]); + *keysize = keylen; + + befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off, + cur_key, keylen, keybuf, *value); + + brelse(this_node->bh); + kfree(this_node); + + befs_debug(sb, "<--- %s", __func__); + + return BEFS_OK; + + error_alloc: + kfree(this_node); + + error: + *keysize = 0; + *value = 0; + befs_debug(sb, "<--- %s ERROR", __func__); + return BEFS_ERR; +} + +/** + * befs_btree_seekleaf - Find the first leafnode in the btree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @bt_super: Pointer to the superblock of the btree + * @this_node: Buffer to return the leafnode in + * @node_off: Pointer to offset of current node within datastream. Modified + * by the function. + * + * + * Helper function for btree traverse. Moves the current position to the + * start of the first leaf node. + * + * Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY. + */ +static int +befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, + befs_btree_super *bt_super, + struct befs_btree_node *this_node, + befs_off_t * node_off) +{ + + befs_debug(sb, "---> %s", __func__); + + if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { + befs_error(sb, "%s failed to read " + "node at %llu", __func__, *node_off); + goto error; + } + befs_debug(sb, "Seekleaf to root node %llu", *node_off); + + if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) { + befs_debug(sb, "<--- %s Tree is EMPTY", __func__); + return BEFS_BT_EMPTY; + } + + while (!befs_leafnode(this_node)) { + + if (this_node->head.all_key_count == 0) { + befs_debug(sb, "%s encountered " + "an empty interior node: %llu. Using Overflow " + "node: %llu", __func__, *node_off, + this_node->head.overflow); + *node_off = this_node->head.overflow; + } else { + fs64 *valarray = befs_bt_valarray(this_node); + *node_off = fs64_to_cpu(sb, valarray[0]); + } + if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { + befs_error(sb, "%s failed to read " + "node at %llu", __func__, *node_off); + goto error; + } + + befs_debug(sb, "Seekleaf to child node %llu", *node_off); + } + befs_debug(sb, "Node %llu is a leaf node", *node_off); + + return BEFS_OK; + + error: + befs_debug(sb, "<--- %s ERROR", __func__); + return BEFS_ERR; +} + +/** + * befs_leafnode - Determine if the btree node is a leaf node or an + * interior node + * @node: Pointer to node structure to test + * + * Return 1 if leaf, 0 if interior + */ +static int +befs_leafnode(struct befs_btree_node *node) +{ + /* all interior nodes (and only interior nodes) have an overflow node */ + if (node->head.overflow == befs_bt_inval) + return 1; + else + return 0; +} + +/** + * befs_bt_keylen_index - Finds start of keylen index in a node + * @node: Pointer to the node structure to find the keylen index within + * + * Returns a pointer to the start of the key length index array + * of the B+tree node *@node + * + * "The length of all the keys in the node is added to the size of the + * header and then rounded up to a multiple of four to get the beginning + * of the key length index" (p.88, practical filesystem design). + * + * Except that rounding up to 8 works, and rounding up to 4 doesn't. + */ +static fs16 * +befs_bt_keylen_index(struct befs_btree_node *node) +{ + const int keylen_align = 8; + unsigned long int off = + (sizeof (befs_btree_nodehead) + node->head.all_key_length); + ulong tmp = off % keylen_align; + + if (tmp) + off += keylen_align - tmp; + + return (fs16 *) ((void *) node->od_node + off); +} + +/** + * befs_bt_valarray - Finds the start of value array in a node + * @node: Pointer to the node structure to find the value array within + * + * Returns a pointer to the start of the value array + * of the node pointed to by the node header + */ +static fs64 * +befs_bt_valarray(struct befs_btree_node *node) +{ + void *keylen_index_start = (void *) befs_bt_keylen_index(node); + size_t keylen_index_size = node->head.all_key_count * sizeof (fs16); + + return (fs64 *) (keylen_index_start + keylen_index_size); +} + +/** + * befs_bt_keydata - Finds start of keydata array in a node + * @node: Pointer to the node structure to find the keydata array within + * + * Returns a pointer to the start of the keydata array + * of the node pointed to by the node header + */ +static char * +befs_bt_keydata(struct befs_btree_node *node) +{ + return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead)); +} + +/** + * befs_bt_get_key - returns a pointer to the start of a key + * @sb: filesystem superblock + * @node: node in which to look for the key + * @index: the index of the key to get + * @keylen: modified to be the length of the key at @index + * + * Returns a valid pointer into @node on success. + * Returns NULL on failure (bad input) and sets *@keylen = 0 + */ +static char * +befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node, + int index, u16 * keylen) +{ + int prev_key_end; + char *keystart; + fs16 *keylen_index; + + if (index < 0 || index > node->head.all_key_count) { + *keylen = 0; + return NULL; + } + + keystart = befs_bt_keydata(node); + keylen_index = befs_bt_keylen_index(node); + + if (index == 0) + prev_key_end = 0; + else + prev_key_end = fs16_to_cpu(sb, keylen_index[index - 1]); + + *keylen = fs16_to_cpu(sb, keylen_index[index]) - prev_key_end; + + return keystart + prev_key_end; +} + +/** + * befs_compare_strings - compare two strings + * @key1: pointer to the first key to be compared + * @keylen1: length in bytes of key1 + * @key2: pointer to the second key to be compared + * @keylen2: length in bytes of key2 + * + * Returns 0 if @key1 and @key2 are equal. + * Returns >0 if @key1 is greater. + * Returns <0 if @key2 is greater.. + */ +static int +befs_compare_strings(const void *key1, int keylen1, + const void *key2, int keylen2) +{ + int len = min_t(int, keylen1, keylen2); + int result = strncmp(key1, key2, len); + if (result == 0) + result = keylen1 - keylen2; + return result; +} + +/* These will be used for non-string keyed btrees */ +#if 0 +static int +btree_compare_int32(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + return *(int32_t *) key1 - *(int32_t *) key2; +} + +static int +btree_compare_uint32(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + if (*(u_int32_t *) key1 == *(u_int32_t *) key2) + return 0; + else if (*(u_int32_t *) key1 > *(u_int32_t *) key2) + return 1; + + return -1; +} +static int +btree_compare_int64(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + if (*(int64_t *) key1 == *(int64_t *) key2) + return 0; + else if (*(int64_t *) key1 > *(int64_t *) key2) + return 1; + + return -1; +} + +static int +btree_compare_uint64(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + if (*(u_int64_t *) key1 == *(u_int64_t *) key2) + return 0; + else if (*(u_int64_t *) key1 > *(u_int64_t *) key2) + return 1; + + return -1; +} + +static int +btree_compare_float(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + float result = *(float *) key1 - *(float *) key2; + if (result == 0.0f) + return 0; + + return (result < 0.0f) ? -1 : 1; +} + +static int +btree_compare_double(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + double result = *(double *) key1 - *(double *) key2; + if (result == 0.0) + return 0; + + return (result < 0.0) ? -1 : 1; +} +#endif //0 diff --git a/kernel/fs/befs/btree.h b/kernel/fs/befs/btree.h new file mode 100644 index 000000000..92e781e5f --- /dev/null +++ b/kernel/fs/befs/btree.h @@ -0,0 +1,13 @@ +/* + * btree.h + * + */ + + +int befs_btree_find(struct super_block *sb, befs_data_stream * ds, + const char *key, befs_off_t * value); + +int befs_btree_read(struct super_block *sb, befs_data_stream * ds, + loff_t key_no, size_t bufsize, char *keybuf, + size_t * keysize, befs_off_t * value); + diff --git a/kernel/fs/befs/datastream.c b/kernel/fs/befs/datastream.c new file mode 100644 index 000000000..ebd507186 --- /dev/null +++ b/kernel/fs/befs/datastream.c @@ -0,0 +1,529 @@ +/* + * linux/fs/befs/datastream.c + * + * Copyright (C) 2001 Will Dyson + * + * Based on portions of file.c by Makoto Kato + * + * Many thanks to Dominic Giampaolo, author of "Practical File System + * Design with the Be File System", for such a helpful book. + * + */ + +#include +#include +#include + +#include "befs.h" +#include "datastream.h" +#include "io.h" + +const befs_inode_addr BAD_IADDR = { 0, 0, 0 }; + +static int befs_find_brun_direct(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, befs_block_run * run); + +static int befs_find_brun_indirect(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, + befs_block_run * run); + +static int befs_find_brun_dblindirect(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, + befs_block_run * run); + +/** + * befs_read_datastream - get buffer_head containing data, starting from pos. + * @sb: Filesystem superblock + * @ds: datastrem to find data with + * @pos: start of data + * @off: offset of data in buffer_head->b_data + * + * Returns pointer to buffer_head containing data starting with offset @off, + * if you don't need to know offset just set @off = NULL. + */ +struct buffer_head * +befs_read_datastream(struct super_block *sb, befs_data_stream * ds, + befs_off_t pos, uint * off) +{ + struct buffer_head *bh = NULL; + befs_block_run run; + befs_blocknr_t block; /* block coresponding to pos */ + + befs_debug(sb, "---> %s %llu", __func__, pos); + block = pos >> BEFS_SB(sb)->block_shift; + if (off) + *off = pos - (block << BEFS_SB(sb)->block_shift); + + if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) { + befs_error(sb, "BeFS: Error finding disk addr of block %lu", + (unsigned long)block); + befs_debug(sb, "<--- %s ERROR", __func__); + return NULL; + } + bh = befs_bread_iaddr(sb, run); + if (!bh) { + befs_error(sb, "BeFS: Error reading block %lu from datastream", + (unsigned long)block); + return NULL; + } + + befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos); + + return bh; +} + +/* + * Takes a file position and gives back a brun who's starting block + * is block number fblock of the file. + * + * Returns BEFS_OK or BEFS_ERR. + * + * Calls specialized functions for each of the three possible + * datastream regions. + * + * 2001-11-15 Will Dyson + */ +int +befs_fblock2brun(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t fblock, befs_block_run * run) +{ + int err; + befs_off_t pos = fblock << BEFS_SB(sb)->block_shift; + + if (pos < data->max_direct_range) { + err = befs_find_brun_direct(sb, data, fblock, run); + + } else if (pos < data->max_indirect_range) { + err = befs_find_brun_indirect(sb, data, fblock, run); + + } else if (pos < data->max_double_indirect_range) { + err = befs_find_brun_dblindirect(sb, data, fblock, run); + + } else { + befs_error(sb, + "befs_fblock2brun() was asked to find block %lu, " + "which is not mapped by the datastream\n", + (unsigned long)fblock); + err = BEFS_ERR; + } + return err; +} + +/** + * befs_read_lsmylink - read long symlink from datastream. + * @sb: Filesystem superblock + * @ds: Datastrem to read from + * @buff: Buffer in which to place long symlink data + * @len: Length of the long symlink in bytes + * + * Returns the number of bytes read + */ +size_t +befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, + befs_off_t len) +{ + befs_off_t bytes_read = 0; /* bytes readed */ + u16 plen; + struct buffer_head *bh = NULL; + befs_debug(sb, "---> %s length: %llu", __func__, len); + + while (bytes_read < len) { + bh = befs_read_datastream(sb, ds, bytes_read, NULL); + if (!bh) { + befs_error(sb, "BeFS: Error reading datastream block " + "starting from %llu", bytes_read); + befs_debug(sb, "<--- %s ERROR", __func__); + return bytes_read; + + } + plen = ((bytes_read + BEFS_SB(sb)->block_size) < len) ? + BEFS_SB(sb)->block_size : len - bytes_read; + memcpy(buff + bytes_read, bh->b_data, plen); + brelse(bh); + bytes_read += plen; + } + + befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int) + bytes_read); + return bytes_read; +} + +/** + * befs_count_blocks - blocks used by a file + * @sb: Filesystem superblock + * @ds: Datastream of the file + * + * Counts the number of fs blocks that the file represented by + * inode occupies on the filesystem, counting both regular file + * data and filesystem metadata (and eventually attribute data + * when we support attributes) +*/ + +befs_blocknr_t +befs_count_blocks(struct super_block * sb, befs_data_stream * ds) +{ + befs_blocknr_t blocks; + befs_blocknr_t datablocks; /* File data blocks */ + befs_blocknr_t metablocks; /* FS metadata blocks */ + struct befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_debug(sb, "---> %s", __func__); + + datablocks = ds->size >> befs_sb->block_shift; + if (ds->size & (befs_sb->block_size - 1)) + datablocks += 1; + + metablocks = 1; /* Start with 1 block for inode */ + + /* Size of indirect block */ + if (ds->size > ds->max_direct_range) + metablocks += ds->indirect.len; + + /* + Double indir block, plus all the indirect blocks it mapps + In the double-indirect range, all block runs of data are + BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know + how many data block runs are in the double-indirect region, + and from that we know how many indirect blocks it takes to + map them. We assume that the indirect blocks are also + BEFS_DBLINDIR_BRUN_LEN blocks long. + */ + if (ds->size > ds->max_indirect_range && ds->max_indirect_range != 0) { + uint dbl_bytes; + uint dbl_bruns; + uint indirblocks; + + dbl_bytes = + ds->max_double_indirect_range - ds->max_indirect_range; + dbl_bruns = + dbl_bytes / (befs_sb->block_size * BEFS_DBLINDIR_BRUN_LEN); + indirblocks = dbl_bruns / befs_iaddrs_per_block(sb); + + metablocks += ds->double_indirect.len; + metablocks += indirblocks; + } + + blocks = datablocks + metablocks; + befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks); + + return blocks; +} + +/* + Finds the block run that starts at file block number blockno + in the file represented by the datastream data, if that + blockno is in the direct region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + Linear search. Checks each element of array[] to see if it + contains the blockno-th filesystem block. This is necessary + because the block runs map variable amounts of data. Simply + keeps a count of the number of blocks searched so far (sum), + incrementing this by the length of each block run as we come + across it. Adds sum to *count before returning (this is so + you can search multiple arrays that are logicaly one array, + as in the indirect region code). + + When/if blockno is found, if blockno is inside of a block + run as stored on disk, we offset the start and length members + of the block run, so that blockno is the start and len is + still valid (the run ends in the same place). + + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t blockno, befs_block_run * run) +{ + int i; + befs_block_run *array = data->direct; + befs_blocknr_t sum; + befs_blocknr_t max_block = + data->max_direct_range >> BEFS_SB(sb)->block_shift; + + befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); + + if (blockno > max_block) { + befs_error(sb, "%s passed block outside of direct region", + __func__); + return BEFS_ERR; + } + + for (i = 0, sum = 0; i < BEFS_NUM_DIRECT_BLOCKS; + sum += array[i].len, i++) { + if (blockno >= sum && blockno < sum + (array[i].len)) { + int offset = blockno - sum; + run->allocation_group = array[i].allocation_group; + run->start = array[i].start + offset; + run->len = array[i].len - offset; + + befs_debug(sb, "---> %s, " + "found %lu at direct[%d]", __func__, + (unsigned long)blockno, i); + return BEFS_OK; + } + } + + befs_debug(sb, "---> %s ERROR", __func__); + return BEFS_ERR; +} + +/* + Finds the block run that starts at file block number blockno + in the file represented by the datastream data, if that + blockno is in the indirect region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + For each block in the indirect run of the datastream, read + it in and search through it for search_blk. + + XXX: + Really should check to make sure blockno is inside indirect + region. + + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_indirect(struct super_block *sb, + befs_data_stream * data, befs_blocknr_t blockno, + befs_block_run * run) +{ + int i, j; + befs_blocknr_t sum = 0; + befs_blocknr_t indir_start_blk; + befs_blocknr_t search_blk; + struct buffer_head *indirblock; + befs_disk_block_run *array; + + befs_block_run indirect = data->indirect; + befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect); + int arraylen = befs_iaddrs_per_block(sb); + + befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); + + indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift; + search_blk = blockno - indir_start_blk; + + /* Examine blocks of the indirect run one at a time */ + for (i = 0; i < indirect.len; i++) { + indirblock = befs_bread(sb, indirblockno + i); + if (indirblock == NULL) { + befs_debug(sb, "---> %s failed to read " + "disk block %lu from the indirect brun", + __func__, (unsigned long)indirblockno + i); + return BEFS_ERR; + } + + array = (befs_disk_block_run *) indirblock->b_data; + + for (j = 0; j < arraylen; ++j) { + int len = fs16_to_cpu(sb, array[j].len); + + if (search_blk >= sum && search_blk < sum + len) { + int offset = search_blk - sum; + run->allocation_group = + fs32_to_cpu(sb, array[j].allocation_group); + run->start = + fs16_to_cpu(sb, array[j].start) + offset; + run->len = + fs16_to_cpu(sb, array[j].len) - offset; + + brelse(indirblock); + befs_debug(sb, + "<--- %s found file block " + "%lu at indirect[%d]", __func__, + (unsigned long)blockno, + j + (i * arraylen)); + return BEFS_OK; + } + sum += len; + } + + brelse(indirblock); + } + + /* Only fallthrough is an error */ + befs_error(sb, "BeFS: %s failed to find " + "file block %lu", __func__, (unsigned long)blockno); + + befs_debug(sb, "<--- %s ERROR", __func__); + return BEFS_ERR; +} + +/* + Finds the block run that starts at file block number blockno + in the file represented by the datastream data, if that + blockno is in the double-indirect region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + The block runs in the double-indirect region are different. + They are always allocated 4 fs blocks at a time, so each + block run maps a constant amount of file data. This means + that we can directly calculate how many block runs into the + double-indirect region we need to go to get to the one that + maps a particular filesystem block. + + We do this in two stages. First we calculate which of the + inode addresses in the double-indirect block will point us + to the indirect block that contains the mapping for the data, + then we calculate which of the inode addresses in that + indirect block maps the data block we are after. + + Oh, and once we've done that, we actually read in the blocks + that contain the inode addresses we calculated above. Even + though the double-indirect run may be several blocks long, + we can calculate which of those blocks will contain the index + we are after and only read that one. We then follow it to + the indirect block and perform a similar process to find + the actual block run that maps the data block we are interested + in. + + Then we offset the run as in befs_find_brun_array() and we are + done. + + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_dblindirect(struct super_block *sb, + befs_data_stream * data, befs_blocknr_t blockno, + befs_block_run * run) +{ + int dblindir_indx; + int indir_indx; + int offset; + int dbl_which_block; + int which_block; + int dbl_block_indx; + int block_indx; + off_t dblindir_leftover; + befs_blocknr_t blockno_at_run_start; + struct buffer_head *dbl_indir_block; + struct buffer_head *indir_block; + befs_block_run indir_run; + befs_disk_inode_addr *iaddr_array = NULL; + struct befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_blocknr_t indir_start_blk = + data->max_indirect_range >> befs_sb->block_shift; + + off_t dbl_indir_off = blockno - indir_start_blk; + + /* number of data blocks mapped by each of the iaddrs in + * the indirect block pointed to by the double indirect block + */ + size_t iblklen = BEFS_DBLINDIR_BRUN_LEN; + + /* number of data blocks mapped by each of the iaddrs in + * the double indirect block + */ + size_t diblklen = iblklen * befs_iaddrs_per_block(sb) + * BEFS_DBLINDIR_BRUN_LEN; + + befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno); + + /* First, discover which of the double_indir->indir blocks + * contains pos. Then figure out how much of pos that + * accounted for. Then discover which of the iaddrs in + * the indirect block contains pos. + */ + + dblindir_indx = dbl_indir_off / diblklen; + dblindir_leftover = dbl_indir_off % diblklen; + indir_indx = dblindir_leftover / diblklen; + + /* Read double indirect block */ + dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb); + if (dbl_which_block > data->double_indirect.len) { + befs_error(sb, "The double-indirect index calculated by " + "%s, %d, is outside the range " + "of the double-indirect block", __func__, + dblindir_indx); + return BEFS_ERR; + } + + dbl_indir_block = + befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + + dbl_which_block); + if (dbl_indir_block == NULL) { + befs_error(sb, "%s couldn't read the " + "double-indirect block at blockno %lu", __func__, + (unsigned long) + iaddr2blockno(sb, &data->double_indirect) + + dbl_which_block); + brelse(dbl_indir_block); + return BEFS_ERR; + } + + dbl_block_indx = + dblindir_indx - (dbl_which_block * befs_iaddrs_per_block(sb)); + iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data; + indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]); + brelse(dbl_indir_block); + iaddr_array = NULL; + + /* Read indirect block */ + which_block = indir_indx / befs_iaddrs_per_block(sb); + if (which_block > indir_run.len) { + befs_error(sb, "The indirect index calculated by " + "%s, %d, is outside the range " + "of the indirect block", __func__, indir_indx); + return BEFS_ERR; + } + + indir_block = + befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); + if (indir_block == NULL) { + befs_error(sb, "%s couldn't read the indirect block " + "at blockno %lu", __func__, (unsigned long) + iaddr2blockno(sb, &indir_run) + which_block); + brelse(indir_block); + return BEFS_ERR; + } + + block_indx = indir_indx - (which_block * befs_iaddrs_per_block(sb)); + iaddr_array = (befs_disk_inode_addr *) indir_block->b_data; + *run = fsrun_to_cpu(sb, iaddr_array[block_indx]); + brelse(indir_block); + iaddr_array = NULL; + + blockno_at_run_start = indir_start_blk; + blockno_at_run_start += diblklen * dblindir_indx; + blockno_at_run_start += iblklen * indir_indx; + offset = blockno - blockno_at_run_start; + + run->start += offset; + run->len -= offset; + + befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," + " double_indirect_leftover = %lu", (unsigned long) + blockno, dblindir_indx, indir_indx, dblindir_leftover); + + return BEFS_OK; +} diff --git a/kernel/fs/befs/datastream.h b/kernel/fs/befs/datastream.h new file mode 100644 index 000000000..45e8a3c98 --- /dev/null +++ b/kernel/fs/befs/datastream.h @@ -0,0 +1,19 @@ +/* + * datastream.h + * + */ + +struct buffer_head *befs_read_datastream(struct super_block *sb, + befs_data_stream * ds, befs_off_t pos, + uint * off); + +int befs_fblock2brun(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t fblock, befs_block_run * run); + +size_t befs_read_lsymlink(struct super_block *sb, befs_data_stream * data, + void *buff, befs_off_t len); + +befs_blocknr_t befs_count_blocks(struct super_block *sb, befs_data_stream * ds); + +extern const befs_inode_addr BAD_IADDR; + diff --git a/kernel/fs/befs/debug.c b/kernel/fs/befs/debug.c new file mode 100644 index 000000000..4de7cffcd --- /dev/null +++ b/kernel/fs/befs/debug.c @@ -0,0 +1,258 @@ +/* + * linux/fs/befs/debug.c + * + * Copyright (C) 2001 Will Dyson (will_dyson at pobox.com) + * + * With help from the ntfs-tng driver by Anton Altparmakov + * + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + * + * debug functions + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#endif /* __KERNEL__ */ + +#include "befs.h" + +void +befs_error(const struct super_block *sb, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("(%s): %pV\n", sb->s_id, &vaf); + va_end(args); +} + +void +befs_warning(const struct super_block *sb, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(%s): %pV\n", sb->s_id, &vaf); + va_end(args); +} + +void +befs_debug(const struct super_block *sb, const char *fmt, ...) +{ +#ifdef CONFIG_BEFS_DEBUG + + struct va_format vaf; + va_list args; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s): %pV\n", sb->s_id, &vaf); + va_end(args); + +#endif //CONFIG_BEFS_DEBUG +} + +void +befs_dump_inode(const struct super_block *sb, befs_inode * inode) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run tmp_run; + + befs_debug(sb, "befs_inode information"); + + befs_debug(sb, " magic1 %08x", fs32_to_cpu(sb, inode->magic1)); + + tmp_run = fsrun_to_cpu(sb, inode->inode_num); + befs_debug(sb, " inode_num %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " uid %u", fs32_to_cpu(sb, inode->uid)); + befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); + befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); + befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); + befs_debug(sb, " create_time %llu", + fs64_to_cpu(sb, inode->create_time)); + befs_debug(sb, " last_modified_time %llu", + fs64_to_cpu(sb, inode->last_modified_time)); + + tmp_run = fsrun_to_cpu(sb, inode->parent); + befs_debug(sb, " parent [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + tmp_run = fsrun_to_cpu(sb, inode->attributes); + befs_debug(sb, " attributes [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " type %08x", fs32_to_cpu(sb, inode->type)); + befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, inode->inode_size)); + + if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) { + befs_debug(sb, " Symbolic link [%s]", inode->data.symlink); + } else { + int i; + + for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; i++) { + tmp_run = + fsrun_to_cpu(sb, inode->data.datastream.direct[i]); + befs_debug(sb, " direct %d [%u, %hu, %hu]", i, + tmp_run.allocation_group, tmp_run.start, + tmp_run.len); + } + befs_debug(sb, " max_direct_range %llu", + fs64_to_cpu(sb, + inode->data.datastream. + max_direct_range)); + + tmp_run = fsrun_to_cpu(sb, inode->data.datastream.indirect); + befs_debug(sb, " indirect [%u, %hu, %hu]", + tmp_run.allocation_group, + tmp_run.start, tmp_run.len); + + befs_debug(sb, " max_indirect_range %llu", + fs64_to_cpu(sb, + inode->data.datastream. + max_indirect_range)); + + tmp_run = + fsrun_to_cpu(sb, inode->data.datastream.double_indirect); + befs_debug(sb, " double indirect [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, + tmp_run.len); + + befs_debug(sb, " max_double_indirect_range %llu", + fs64_to_cpu(sb, + inode->data.datastream. + max_double_indirect_range)); + + befs_debug(sb, " size %llu", + fs64_to_cpu(sb, inode->data.datastream.size)); + } + +#endif //CONFIG_BEFS_DEBUG +} + +/* + * Display super block structure for debug. + */ + +void +befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run tmp_run; + + befs_debug(sb, "befs_super_block information"); + + befs_debug(sb, " name %s", sup->name); + befs_debug(sb, " magic1 %08x", fs32_to_cpu(sb, sup->magic1)); + befs_debug(sb, " fs_byte_order %08x", + fs32_to_cpu(sb, sup->fs_byte_order)); + + befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); + befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); + + befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks)); + befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks)); + + befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); + befs_debug(sb, " blocks_per_ag %u", + fs32_to_cpu(sb, sup->blocks_per_ag)); + befs_debug(sb, " ag_shift %u", fs32_to_cpu(sb, sup->ag_shift)); + befs_debug(sb, " num_ags %u", fs32_to_cpu(sb, sup->num_ags)); + + befs_debug(sb, " flags %08x", fs32_to_cpu(sb, sup->flags)); + + tmp_run = fsrun_to_cpu(sb, sup->log_blocks); + befs_debug(sb, " log_blocks %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start)); + befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end)); + + befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); + + tmp_run = fsrun_to_cpu(sb, sup->root_dir); + befs_debug(sb, " root_dir %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + tmp_run = fsrun_to_cpu(sb, sup->indices); + befs_debug(sb, " indices %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + +#endif //CONFIG_BEFS_DEBUG +} + +#if 0 +/* unused */ +void +befs_dump_small_data(const struct super_block *sb, befs_small_data * sd) +{ +} + +/* unused */ +void +befs_dump_run(const struct super_block *sb, befs_disk_block_run run) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run n = fsrun_to_cpu(sb, run); + + befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len); + +#endif //CONFIG_BEFS_DEBUG +} +#endif /* 0 */ + +void +befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_debug(sb, "Btree super structure"); + befs_debug(sb, " magic %08x", fs32_to_cpu(sb, super->magic)); + befs_debug(sb, " node_size %u", fs32_to_cpu(sb, super->node_size)); + befs_debug(sb, " max_depth %08x", fs32_to_cpu(sb, super->max_depth)); + + befs_debug(sb, " data_type %08x", fs32_to_cpu(sb, super->data_type)); + befs_debug(sb, " root_node_pointer %016LX", + fs64_to_cpu(sb, super->root_node_ptr)); + befs_debug(sb, " free_node_pointer %016LX", + fs64_to_cpu(sb, super->free_node_ptr)); + befs_debug(sb, " maximum size %016LX", + fs64_to_cpu(sb, super->max_size)); + +#endif //CONFIG_BEFS_DEBUG +} + +void +befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead * node) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_debug(sb, "Btree node structure"); + befs_debug(sb, " left %016LX", fs64_to_cpu(sb, node->left)); + befs_debug(sb, " right %016LX", fs64_to_cpu(sb, node->right)); + befs_debug(sb, " overflow %016LX", fs64_to_cpu(sb, node->overflow)); + befs_debug(sb, " all_key_count %hu", + fs16_to_cpu(sb, node->all_key_count)); + befs_debug(sb, " all_key_length %hu", + fs16_to_cpu(sb, node->all_key_length)); + +#endif //CONFIG_BEFS_DEBUG +} diff --git a/kernel/fs/befs/endian.h b/kernel/fs/befs/endian.h new file mode 100644 index 000000000..27223878b --- /dev/null +++ b/kernel/fs/befs/endian.h @@ -0,0 +1,125 @@ +/* + * linux/fs/befs/endian.h + * + * Copyright (C) 2001 Will Dyson + * + * Partially based on similar funtions in the sysv driver. + */ + +#ifndef LINUX_BEFS_ENDIAN +#define LINUX_BEFS_ENDIAN + +#include + +static inline u64 +fs64_to_cpu(const struct super_block *sb, fs64 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le64_to_cpu((__force __le64)n); + else + return be64_to_cpu((__force __be64)n); +} + +static inline fs64 +cpu_to_fs64(const struct super_block *sb, u64 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs64)cpu_to_le64(n); + else + return (__force fs64)cpu_to_be64(n); +} + +static inline u32 +fs32_to_cpu(const struct super_block *sb, fs32 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline fs32 +cpu_to_fs32(const struct super_block *sb, u32 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs32)cpu_to_le32(n); + else + return (__force fs32)cpu_to_be32(n); +} + +static inline u16 +fs16_to_cpu(const struct super_block *sb, fs16 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline fs16 +cpu_to_fs16(const struct super_block *sb, u16 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs16)cpu_to_le16(n); + else + return (__force fs16)cpu_to_be16(n); +} + +/* Composite types below here */ + +static inline befs_block_run +fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n) +{ + befs_block_run run; + + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) { + run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group); + run.start = le16_to_cpu((__force __le16)n.start); + run.len = le16_to_cpu((__force __le16)n.len); + } else { + run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group); + run.start = be16_to_cpu((__force __be16)n.start); + run.len = be16_to_cpu((__force __be16)n.len); + } + return run; +} + +static inline befs_disk_block_run +cpu_to_fsrun(const struct super_block *sb, befs_block_run n) +{ + befs_disk_block_run run; + + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) { + run.allocation_group = cpu_to_le32(n.allocation_group); + run.start = cpu_to_le16(n.start); + run.len = cpu_to_le16(n.len); + } else { + run.allocation_group = cpu_to_be32(n.allocation_group); + run.start = cpu_to_be16(n.start); + run.len = cpu_to_be16(n.len); + } + return run; +} + +static inline befs_data_stream +fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n) +{ + befs_data_stream data; + int i; + + for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) + data.direct[i] = fsrun_to_cpu(sb, n->direct[i]); + + data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range); + data.indirect = fsrun_to_cpu(sb, n->indirect); + data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range); + data.double_indirect = fsrun_to_cpu(sb, n->double_indirect); + data.max_double_indirect_range = fs64_to_cpu(sb, + n-> + max_double_indirect_range); + data.size = fs64_to_cpu(sb, n->size); + + return data; +} + +#endif //LINUX_BEFS_ENDIAN diff --git a/kernel/fs/befs/inode.c b/kernel/fs/befs/inode.c new file mode 100644 index 000000000..fa4b718de --- /dev/null +++ b/kernel/fs/befs/inode.c @@ -0,0 +1,54 @@ +/* + * inode.c + * + * Copyright (C) 2001 Will Dyson + */ + +#include + +#include "befs.h" +#include "inode.h" + +/* + Validates the correctness of the befs inode + Returns BEFS_OK if the inode should be used, otherwise + returns BEFS_BAD_INODE +*/ +int +befs_check_inode(struct super_block *sb, befs_inode * raw_inode, + befs_blocknr_t inode) +{ + u32 magic1 = fs32_to_cpu(sb, raw_inode->magic1); + befs_inode_addr ino_num = fsrun_to_cpu(sb, raw_inode->inode_num); + u32 flags = fs32_to_cpu(sb, raw_inode->flags); + + /* check magic header. */ + if (magic1 != BEFS_INODE_MAGIC1) { + befs_error(sb, + "Inode has a bad magic header - inode = %lu", + (unsigned long)inode); + return BEFS_BAD_INODE; + } + + /* + * Sanity check2: inodes store their own block address. Check it. + */ + if (inode != iaddr2blockno(sb, &ino_num)) { + befs_error(sb, "inode blocknr field disagrees with vfs " + "VFS: %lu, Inode %lu", (unsigned long) + inode, (unsigned long)iaddr2blockno(sb, &ino_num)); + return BEFS_BAD_INODE; + } + + /* + * check flag + */ + + if (!(flags & BEFS_INODE_IN_USE)) { + befs_error(sb, "inode is not used - inode = %lu", + (unsigned long)inode); + return BEFS_BAD_INODE; + } + + return BEFS_OK; +} diff --git a/kernel/fs/befs/inode.h b/kernel/fs/befs/inode.h new file mode 100644 index 000000000..9dc7fd9b7 --- /dev/null +++ b/kernel/fs/befs/inode.h @@ -0,0 +1,8 @@ +/* + * inode.h + * + */ + +int befs_check_inode(struct super_block *sb, befs_inode * raw_inode, + befs_blocknr_t inode); + diff --git a/kernel/fs/befs/io.c b/kernel/fs/befs/io.c new file mode 100644 index 000000000..7a5b4ec21 --- /dev/null +++ b/kernel/fs/befs/io.c @@ -0,0 +1,85 @@ +/* + * linux/fs/befs/io.c + * + * Copyright (C) 2001 Will Dyson + +#include "befs.h" +#include "io.h" + +/* + * Converts befs notion of disk addr to a disk offset and uses + * linux kernel function sb_bread() to get the buffer containing + * the offset. -Will Dyson + * + */ + +struct buffer_head * +befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) +{ + struct buffer_head *bh = NULL; + befs_blocknr_t block = 0; + struct befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_debug(sb, "---> Enter %s " + "[%u, %hu, %hu]", __func__, iaddr.allocation_group, + iaddr.start, iaddr.len); + + if (iaddr.allocation_group > befs_sb->num_ags) { + befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", + iaddr.allocation_group, befs_sb->num_ags); + goto error; + } + + block = iaddr2blockno(sb, &iaddr); + + befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block); + + bh = sb_bread(sb, block); + + if (bh == NULL) { + befs_error(sb, "Failed to read block %lu", + (unsigned long)block); + goto error; + } + + befs_debug(sb, "<--- %s", __func__); + return bh; + + error: + befs_debug(sb, "<--- %s ERROR", __func__); + return NULL; +} + +struct buffer_head * +befs_bread(struct super_block *sb, befs_blocknr_t block) +{ + struct buffer_head *bh = NULL; + + befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); + + bh = sb_bread(sb, block); + + if (bh == NULL) { + befs_error(sb, "Failed to read block %lu", + (unsigned long)block); + goto error; + } + + befs_debug(sb, "<--- %s", __func__); + + return bh; + + error: + befs_debug(sb, "<--- %s ERROR", __func__); + return NULL; +} diff --git a/kernel/fs/befs/io.h b/kernel/fs/befs/io.h new file mode 100644 index 000000000..9b78266b6 --- /dev/null +++ b/kernel/fs/befs/io.h @@ -0,0 +1,9 @@ +/* + * io.h + */ + +struct buffer_head *befs_bread_iaddr(struct super_block *sb, + befs_inode_addr iaddr); + +struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block); + diff --git a/kernel/fs/befs/linuxvfs.c b/kernel/fs/befs/linuxvfs.c new file mode 100644 index 000000000..7943533c3 --- /dev/null +++ b/kernel/fs/befs/linuxvfs.c @@ -0,0 +1,981 @@ +/* + * linux/fs/befs/linuxvfs.c + * + * Copyright (C) 2001 Will Dyson +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "befs.h" +#include "btree.h" +#include "inode.h" +#include "datastream.h" +#include "super.h" +#include "io.h" + +MODULE_DESCRIPTION("BeOS File System (BeFS) driver"); +MODULE_AUTHOR("Will Dyson"); +MODULE_LICENSE("GPL"); + +/* The units the vfs expects inode->i_blocks to be in */ +#define VFS_BLOCK_SIZE 512 + +static int befs_readdir(struct file *, struct dir_context *); +static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); +static int befs_readpage(struct file *file, struct page *page); +static sector_t befs_bmap(struct address_space *mapping, sector_t block); +static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int); +static struct inode *befs_iget(struct super_block *, unsigned long); +static struct inode *befs_alloc_inode(struct super_block *sb); +static void befs_destroy_inode(struct inode *inode); +static void befs_destroy_inodecache(void); +static void *befs_follow_link(struct dentry *, struct nameidata *); +static void *befs_fast_follow_link(struct dentry *, struct nameidata *); +static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, + char **out, int *out_len); +static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, + char **out, int *out_len); +static void befs_put_super(struct super_block *); +static int befs_remount(struct super_block *, int *, char *); +static int befs_statfs(struct dentry *, struct kstatfs *); +static int parse_options(char *, struct befs_mount_options *); + +static const struct super_operations befs_sops = { + .alloc_inode = befs_alloc_inode, /* allocate a new inode */ + .destroy_inode = befs_destroy_inode, /* deallocate an inode */ + .put_super = befs_put_super, /* uninit super */ + .statfs = befs_statfs, /* statfs */ + .remount_fs = befs_remount, + .show_options = generic_show_options, +}; + +/* slab cache for befs_inode_info objects */ +static struct kmem_cache *befs_inode_cachep; + +static const struct file_operations befs_dir_operations = { + .read = generic_read_dir, + .iterate = befs_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations befs_dir_inode_operations = { + .lookup = befs_lookup, +}; + +static const struct address_space_operations befs_aops = { + .readpage = befs_readpage, + .bmap = befs_bmap, +}; + +static const struct inode_operations befs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = befs_fast_follow_link, +}; + +static const struct inode_operations befs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = befs_follow_link, + .put_link = kfree_put_link, +}; + +/* + * Called by generic_file_read() to read a page of data + * + * In turn, simply calls a generic block read function and + * passes it the address of befs_get_block, for mapping file + * positions to disk blocks. + */ +static int +befs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, befs_get_block); +} + +static sector_t +befs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, befs_get_block); +} + +/* + * Generic function to map a file position (block) to a + * disk offset (passed back in bh_result). + * + * Used by many higher level functions. + * + * Calls befs_fblock2brun() in datastream.c to do the real work. + * + * -WD 10-26-01 + */ + +static int +befs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; + befs_block_run run = BAD_IADDR; + int res = 0; + ulong disk_off; + + befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", + (unsigned long)inode->i_ino, (long)block); + if (create) { + befs_error(sb, "befs_get_block() was asked to write to " + "block %ld in inode %lu", (long)block, + (unsigned long)inode->i_ino); + return -EPERM; + } + + res = befs_fblock2brun(sb, ds, block, &run); + if (res != BEFS_OK) { + befs_error(sb, + "<--- %s for inode %lu, block %ld ERROR", + __func__, (unsigned long)inode->i_ino, + (long)block); + return -EFBIG; + } + + disk_off = (ulong) iaddr2blockno(sb, &run); + + map_bh(bh_result, inode->i_sb, disk_off); + + befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu", + __func__, (unsigned long)inode->i_ino, (long)block, + (unsigned long)disk_off); + + return 0; +} + +static struct dentry * +befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct inode *inode = NULL; + struct super_block *sb = dir->i_sb; + befs_data_stream *ds = &BEFS_I(dir)->i_data.ds; + befs_off_t offset; + int ret; + int utfnamelen; + char *utfname; + const char *name = dentry->d_name.name; + + befs_debug(sb, "---> %s name %pd inode %ld", __func__, + dentry, dir->i_ino); + + /* Convert to UTF-8 */ + if (BEFS_SB(sb)->nls) { + ret = + befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); + if (ret < 0) { + befs_debug(sb, "<--- %s ERROR", __func__); + return ERR_PTR(ret); + } + ret = befs_btree_find(sb, ds, utfname, &offset); + kfree(utfname); + + } else { + ret = befs_btree_find(sb, ds, dentry->d_name.name, &offset); + } + + if (ret == BEFS_BT_NOT_FOUND) { + befs_debug(sb, "<--- %s %pd not found", __func__, dentry); + return ERR_PTR(-ENOENT); + + } else if (ret != BEFS_OK || offset == 0) { + befs_warning(sb, "<--- %s Error", __func__); + return ERR_PTR(-ENODATA); + } + + inode = befs_iget(dir->i_sb, (ino_t) offset); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + d_add(dentry, inode); + + befs_debug(sb, "<--- %s", __func__); + + return NULL; +} + +static int +befs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; + befs_off_t value; + int result; + size_t keysize; + unsigned char d_type; + char keybuf[BEFS_NAME_LEN + 1]; + + befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld", + __func__, file, inode->i_ino, ctx->pos); + +more: + result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, + keybuf, &keysize, &value); + + if (result == BEFS_ERR) { + befs_debug(sb, "<--- %s ERROR", __func__); + befs_error(sb, "IO error reading %pD (inode %lu)", + file, inode->i_ino); + return -EIO; + + } else if (result == BEFS_BT_END) { + befs_debug(sb, "<--- %s END", __func__); + return 0; + + } else if (result == BEFS_BT_EMPTY) { + befs_debug(sb, "<--- %s Empty directory", __func__); + return 0; + } + + d_type = DT_UNKNOWN; + + /* Convert to NLS */ + if (BEFS_SB(sb)->nls) { + char *nlsname; + int nlsnamelen; + result = + befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); + if (result < 0) { + befs_debug(sb, "<--- %s ERROR", __func__); + return result; + } + if (!dir_emit(ctx, nlsname, nlsnamelen, + (ino_t) value, d_type)) { + kfree(nlsname); + return 0; + } + kfree(nlsname); + } else { + if (!dir_emit(ctx, keybuf, keysize, + (ino_t) value, d_type)) + return 0; + } + ctx->pos++; + goto more; +} + +static struct inode * +befs_alloc_inode(struct super_block *sb) +{ + struct befs_inode_info *bi; + + bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL); + if (!bi) + return NULL; + return &bi->vfs_inode; +} + +static void befs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); +} + +static void befs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, befs_i_callback); +} + +static void init_once(void *foo) +{ + struct befs_inode_info *bi = (struct befs_inode_info *) foo; + + inode_init_once(&bi->vfs_inode); +} + +static struct inode *befs_iget(struct super_block *sb, unsigned long ino) +{ + struct buffer_head *bh = NULL; + befs_inode *raw_inode = NULL; + struct befs_sb_info *befs_sb = BEFS_SB(sb); + struct befs_inode_info *befs_ino = NULL; + struct inode *inode; + long ret = -EIO; + + befs_debug(sb, "---> %s inode = %lu", __func__, ino); + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + befs_ino = BEFS_I(inode); + + /* convert from vfs's inode number to befs's inode number */ + befs_ino->i_inode_num = blockno2iaddr(sb, inode->i_ino); + + befs_debug(sb, " real inode number [%u, %hu, %hu]", + befs_ino->i_inode_num.allocation_group, + befs_ino->i_inode_num.start, befs_ino->i_inode_num.len); + + bh = befs_bread(sb, inode->i_ino); + if (!bh) { + befs_error(sb, "unable to read inode block - " + "inode = %lu", inode->i_ino); + goto unacquire_none; + } + + raw_inode = (befs_inode *) bh->b_data; + + befs_dump_inode(sb, raw_inode); + + if (befs_check_inode(sb, raw_inode, inode->i_ino) != BEFS_OK) { + befs_error(sb, "Bad inode: %lu", inode->i_ino); + goto unacquire_bh; + } + + inode->i_mode = (umode_t) fs32_to_cpu(sb, raw_inode->mode); + + /* + * set uid and gid. But since current BeOS is single user OS, so + * you can change by "uid" or "gid" options. + */ + + inode->i_uid = befs_sb->mount_opts.use_uid ? + befs_sb->mount_opts.uid : + make_kuid(&init_user_ns, fs32_to_cpu(sb, raw_inode->uid)); + inode->i_gid = befs_sb->mount_opts.use_gid ? + befs_sb->mount_opts.gid : + make_kgid(&init_user_ns, fs32_to_cpu(sb, raw_inode->gid)); + + set_nlink(inode, 1); + + /* + * BEFS's time is 64 bits, but current VFS is 32 bits... + * BEFS don't have access time. Nor inode change time. VFS + * doesn't have creation time. + * Also, the lower 16 bits of the last_modified_time and + * create_time are just a counter to help ensure uniqueness + * for indexing purposes. (PFD, page 54) + */ + + inode->i_mtime.tv_sec = + fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16; + inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ + inode->i_ctime = inode->i_mtime; + inode->i_atime = inode->i_mtime; + + befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); + befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent); + befs_ino->i_attribute = fsrun_to_cpu(sb, raw_inode->attributes); + befs_ino->i_flags = fs32_to_cpu(sb, raw_inode->flags); + + if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){ + inode->i_size = 0; + inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; + strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink, + BEFS_SYMLINK_LEN); + } else { + int num_blks; + + befs_ino->i_data.ds = + fsds_to_cpu(sb, &raw_inode->data.datastream); + + num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); + inode->i_blocks = + num_blks * (befs_sb->block_size / VFS_BLOCK_SIZE); + inode->i_size = befs_ino->i_data.ds.size; + } + + inode->i_mapping->a_ops = &befs_aops; + + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &befs_dir_inode_operations; + inode->i_fop = &befs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (befs_ino->i_flags & BEFS_LONG_SYMLINK) + inode->i_op = &befs_symlink_inode_operations; + else + inode->i_op = &befs_fast_symlink_inode_operations; + } else { + befs_error(sb, "Inode %lu is not a regular file, " + "directory or symlink. THAT IS WRONG! BeFS has no " + "on disk special files", inode->i_ino); + goto unacquire_bh; + } + + brelse(bh); + befs_debug(sb, "<--- %s", __func__); + unlock_new_inode(inode); + return inode; + + unacquire_bh: + brelse(bh); + + unacquire_none: + iget_failed(inode); + befs_debug(sb, "<--- %s - Bad inode", __func__); + return ERR_PTR(ret); +} + +/* Initialize the inode cache. Called at fs setup. + * + * Taken from NFS implementation by Al Viro. + */ +static int __init +befs_init_inodecache(void) +{ + befs_inode_cachep = kmem_cache_create("befs_inode_cache", + sizeof (struct befs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (befs_inode_cachep == NULL) { + pr_err("%s: Couldn't initialize inode slabcache\n", __func__); + return -ENOMEM; + } + return 0; +} + +/* Called at fs teardown. + * + * Taken from NFS implementation by Al Viro. + */ +static void +befs_destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(befs_inode_cachep); +} + +/* + * The inode of symbolic link is different to data stream. + * The data stream become link name. Unless the LONG_SYMLINK + * flag is set. + */ +static void * +befs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct super_block *sb = dentry->d_sb; + struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); + befs_data_stream *data = &befs_ino->i_data.ds; + befs_off_t len = data->size; + char *link; + + if (len == 0) { + befs_error(sb, "Long symlink with illegal length"); + link = ERR_PTR(-EIO); + } else { + befs_debug(sb, "Follow long symlink"); + + link = kmalloc(len, GFP_NOFS); + if (!link) { + link = ERR_PTR(-ENOMEM); + } else if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; + } + } + nd_set_link(nd, link); + return NULL; +} + + +static void * +befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); + + nd_set_link(nd, befs_ino->i_data.symlink); + return NULL; +} + +/* + * UTF-8 to NLS charset convert routine + * + * + * Changed 8/10/01 by Will Dyson. Now use uni2char() / char2uni() rather than + * the nls tables directly + */ + +static int +befs_utf2nls(struct super_block *sb, const char *in, + int in_len, char **out, int *out_len) +{ + struct nls_table *nls = BEFS_SB(sb)->nls; + int i, o; + unicode_t uni; + int unilen, utflen; + char *result; + /* The utf8->nls conversion won't make the final nls string bigger + * than the utf one, but if the string is pure ascii they'll have the + * same width and an extra char is needed to save the additional \0 + */ + int maxlen = in_len + 1; + + befs_debug(sb, "---> %s", __func__); + + if (!nls) { + befs_error(sb, "%s called with no NLS table loaded", __func__); + return -EINVAL; + } + + *out = result = kmalloc(maxlen, GFP_NOFS); + if (!*out) { + befs_error(sb, "%s cannot allocate memory", __func__); + *out_len = 0; + return -ENOMEM; + } + + for (i = o = 0; i < in_len; i += utflen, o += unilen) { + + /* convert from UTF-8 to Unicode */ + utflen = utf8_to_utf32(&in[i], in_len - i, &uni); + if (utflen < 0) + goto conv_err; + + /* convert from Unicode to nls */ + if (uni > MAX_WCHAR_T) + goto conv_err; + unilen = nls->uni2char(uni, &result[o], in_len - o); + if (unilen < 0) + goto conv_err; + } + result[o] = '\0'; + *out_len = o; + + befs_debug(sb, "<--- %s", __func__); + + return o; + + conv_err: + befs_error(sb, "Name using character set %s contains a character that " + "cannot be converted to unicode.", nls->charset); + befs_debug(sb, "<--- %s", __func__); + kfree(result); + return -EILSEQ; +} + +/** + * befs_nls2utf - Convert NLS string to utf8 encodeing + * @sb: Superblock + * @in: Input string buffer in NLS format + * @in_len: Length of input string in bytes + * @out: The output string in UTF-8 format + * @out_len: Length of the output buffer + * + * Converts input string @in, which is in the format of the loaded NLS map, + * into a utf8 string. + * + * The destination string @out is allocated by this function and the caller is + * responsible for freeing it with kfree() + * + * On return, *@out_len is the length of @out in bytes. + * + * On success, the return value is the number of utf8 characters written to + * the output buffer @out. + * + * On Failure, a negative number coresponding to the error code is returned. + */ + +static int +befs_nls2utf(struct super_block *sb, const char *in, + int in_len, char **out, int *out_len) +{ + struct nls_table *nls = BEFS_SB(sb)->nls; + int i, o; + wchar_t uni; + int unilen, utflen; + char *result; + /* There're nls characters that will translate to 3-chars-wide UTF-8 + * characters, a additional byte is needed to save the final \0 + * in special cases */ + int maxlen = (3 * in_len) + 1; + + befs_debug(sb, "---> %s\n", __func__); + + if (!nls) { + befs_error(sb, "%s called with no NLS table loaded.", + __func__); + return -EINVAL; + } + + *out = result = kmalloc(maxlen, GFP_NOFS); + if (!*out) { + befs_error(sb, "%s cannot allocate memory", __func__); + *out_len = 0; + return -ENOMEM; + } + + for (i = o = 0; i < in_len; i += unilen, o += utflen) { + + /* convert from nls to unicode */ + unilen = nls->char2uni(&in[i], in_len - i, &uni); + if (unilen < 0) + goto conv_err; + + /* convert from unicode to UTF-8 */ + utflen = utf32_to_utf8(uni, &result[o], 3); + if (utflen <= 0) + goto conv_err; + } + + result[o] = '\0'; + *out_len = o; + + befs_debug(sb, "<--- %s", __func__); + + return i; + + conv_err: + befs_error(sb, "Name using charecter set %s contains a charecter that " + "cannot be converted to unicode.", nls->charset); + befs_debug(sb, "<--- %s", __func__); + kfree(result); + return -EILSEQ; +} + +/** + * Use the + * + */ +enum { + Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, +}; + +static const match_table_t befs_tokens = { + {Opt_uid, "uid=%d"}, + {Opt_gid, "gid=%d"}, + {Opt_charset, "iocharset=%s"}, + {Opt_debug, "debug"}, + {Opt_err, NULL} +}; + +static int +parse_options(char *options, struct befs_mount_options *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + kuid_t uid; + kgid_t gid; + + /* Initialize options */ + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; + opts->use_uid = 0; + opts->use_gid = 0; + opts->iocharset = NULL; + opts->debug = 0; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, befs_tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + uid = INVALID_UID; + if (option >= 0) + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + pr_err("Invalid uid %d, " + "using default\n", option); + break; + } + opts->uid = uid; + opts->use_uid = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + gid = INVALID_GID; + if (option >= 0) + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + pr_err("Invalid gid %d, " + "using default\n", option); + break; + } + opts->gid = gid; + opts->use_gid = 1; + break; + case Opt_charset: + kfree(opts->iocharset); + opts->iocharset = match_strdup(&args[0]); + if (!opts->iocharset) { + pr_err("allocation failure for " + "iocharset string\n"); + return 0; + } + break; + case Opt_debug: + opts->debug = 1; + break; + default: + pr_err("Unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +/* This function has the responsibiltiy of getting the + * filesystem ready for unmounting. + * Basically, we free everything that we allocated in + * befs_read_inode + */ +static void +befs_put_super(struct super_block *sb) +{ + kfree(BEFS_SB(sb)->mount_opts.iocharset); + BEFS_SB(sb)->mount_opts.iocharset = NULL; + unload_nls(BEFS_SB(sb)->nls); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +/* Allocate private field of the superblock, fill it. + * + * Finish filling the public superblock fields + * Make the root directory + * Load a set of NLS translations if needed. + */ +static int +befs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh; + struct befs_sb_info *befs_sb; + befs_super_block *disk_sb; + struct inode *root; + long ret = -EINVAL; + const unsigned long sb_block = 0; + const off_t x86_sb_off = 512; + + save_mount_options(sb, data); + + sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); + if (sb->s_fs_info == NULL) { + pr_err("(%s): Unable to allocate memory for private " + "portion of superblock. Bailing.\n", sb->s_id); + goto unacquire_none; + } + befs_sb = BEFS_SB(sb); + + if (!parse_options((char *) data, &befs_sb->mount_opts)) { + befs_error(sb, "cannot parse mount options"); + goto unacquire_priv_sbp; + } + + befs_debug(sb, "---> %s", __func__); + + if (!(sb->s_flags & MS_RDONLY)) { + befs_warning(sb, + "No write support. Marking filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } + + /* + * Set dummy blocksize to read super block. + * Will be set to real fs blocksize later. + * + * Linux 2.4.10 and later refuse to read blocks smaller than + * the hardsect size for the device. But we also need to read at + * least 1k to get the second 512 bytes of the volume. + * -WD 10-26-01 + */ + sb_min_blocksize(sb, 1024); + + if (!(bh = sb_bread(sb, sb_block))) { + befs_error(sb, "unable to read superblock"); + goto unacquire_priv_sbp; + } + + /* account for offset of super block on x86 */ + disk_sb = (befs_super_block *) bh->b_data; + if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) || + (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) { + befs_debug(sb, "Using PPC superblock location"); + } else { + befs_debug(sb, "Using x86 superblock location"); + disk_sb = + (befs_super_block *) ((void *) bh->b_data + x86_sb_off); + } + + if ((befs_load_sb(sb, disk_sb) != BEFS_OK) || + (befs_check_sb(sb) != BEFS_OK)) + goto unacquire_bh; + + befs_dump_super_block(sb, disk_sb); + + brelse(bh); + + if( befs_sb->num_blocks > ~((sector_t)0) ) { + befs_error(sb, "blocks count: %llu " + "is larger than the host can use", + befs_sb->num_blocks); + goto unacquire_priv_sbp; + } + + /* + * set up enough so that it can read an inode + * Fill in kernel superblock fields from private sb + */ + sb->s_magic = BEFS_SUPER_MAGIC; + /* Set real blocksize of fs */ + sb_set_blocksize(sb, (ulong) befs_sb->block_size); + sb->s_op = &befs_sops; + root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto unacquire_priv_sbp; + } + sb->s_root = d_make_root(root); + if (!sb->s_root) { + befs_error(sb, "get root inode failed"); + goto unacquire_priv_sbp; + } + + /* load nls library */ + if (befs_sb->mount_opts.iocharset) { + befs_debug(sb, "Loading nls: %s", + befs_sb->mount_opts.iocharset); + befs_sb->nls = load_nls(befs_sb->mount_opts.iocharset); + if (!befs_sb->nls) { + befs_warning(sb, "Cannot load nls %s" + " loading default nls", + befs_sb->mount_opts.iocharset); + befs_sb->nls = load_nls_default(); + } + /* load default nls if none is specified in mount options */ + } else { + befs_debug(sb, "Loading default nls"); + befs_sb->nls = load_nls_default(); + } + + return 0; +/*****************/ + unacquire_bh: + brelse(bh); + + unacquire_priv_sbp: + kfree(befs_sb->mount_opts.iocharset); + kfree(sb->s_fs_info); + + unacquire_none: + sb->s_fs_info = NULL; + return ret; +} + +static int +befs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + if (!(*flags & MS_RDONLY)) + return -EINVAL; + return 0; +} + +static int +befs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + befs_debug(sb, "---> %s", __func__); + + buf->f_type = BEFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = BEFS_SB(sb)->num_blocks; + buf->f_bfree = BEFS_SB(sb)->num_blocks - BEFS_SB(sb)->used_blocks; + buf->f_bavail = buf->f_bfree; + buf->f_files = 0; /* UNKNOWN */ + buf->f_ffree = 0; /* UNKNOWN */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = BEFS_NAME_LEN; + + befs_debug(sb, "<--- %s", __func__); + + return 0; +} + +static struct dentry * +befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super); +} + +static struct file_system_type befs_fs_type = { + .owner = THIS_MODULE, + .name = "befs", + .mount = befs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("befs"); + +static int __init +init_befs_fs(void) +{ + int err; + + pr_info("version: %s\n", BEFS_VERSION); + + err = befs_init_inodecache(); + if (err) + goto unacquire_none; + + err = register_filesystem(&befs_fs_type); + if (err) + goto unacquire_inodecache; + + return 0; + +unacquire_inodecache: + befs_destroy_inodecache(); + +unacquire_none: + return err; +} + +static void __exit +exit_befs_fs(void) +{ + befs_destroy_inodecache(); + + unregister_filesystem(&befs_fs_type); +} + +/* +Macros that typecheck the init and exit functions, +ensures that they are called at init and cleanup, +and eliminates warnings about unused functions. +*/ +module_init(init_befs_fs) +module_exit(exit_befs_fs) diff --git a/kernel/fs/befs/super.c b/kernel/fs/befs/super.c new file mode 100644 index 000000000..aeafc4d84 --- /dev/null +++ b/kernel/fs/befs/super.c @@ -0,0 +1,112 @@ +/* + * super.c + * + * Copyright (C) 2001-2002 Will Dyson + * + * Licensed under the GNU GPL. See the file COPYING for details. + * + */ + +#include +#include /* for PAGE_SIZE */ + +#include "befs.h" +#include "super.h" + +/** + * load_befs_sb -- Read from disk and properly byteswap all the fields + * of the befs superblock + * + * + * + * + */ +int +befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) +{ + struct befs_sb_info *befs_sb = BEFS_SB(sb); + + /* Check the byte order of the filesystem */ + if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) + befs_sb->byte_order = BEFS_BYTESEX_LE; + else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) + befs_sb->byte_order = BEFS_BYTESEX_BE; + + befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); + befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); + befs_sb->magic3 = fs32_to_cpu(sb, disk_sb->magic3); + befs_sb->block_size = fs32_to_cpu(sb, disk_sb->block_size); + befs_sb->block_shift = fs32_to_cpu(sb, disk_sb->block_shift); + befs_sb->num_blocks = fs64_to_cpu(sb, disk_sb->num_blocks); + befs_sb->used_blocks = fs64_to_cpu(sb, disk_sb->used_blocks); + befs_sb->inode_size = fs32_to_cpu(sb, disk_sb->inode_size); + + befs_sb->blocks_per_ag = fs32_to_cpu(sb, disk_sb->blocks_per_ag); + befs_sb->ag_shift = fs32_to_cpu(sb, disk_sb->ag_shift); + befs_sb->num_ags = fs32_to_cpu(sb, disk_sb->num_ags); + + befs_sb->log_blocks = fsrun_to_cpu(sb, disk_sb->log_blocks); + befs_sb->log_start = fs64_to_cpu(sb, disk_sb->log_start); + befs_sb->log_end = fs64_to_cpu(sb, disk_sb->log_end); + + befs_sb->root_dir = fsrun_to_cpu(sb, disk_sb->root_dir); + befs_sb->indices = fsrun_to_cpu(sb, disk_sb->indices); + befs_sb->nls = NULL; + + return BEFS_OK; +} + +int +befs_check_sb(struct super_block *sb) +{ + struct befs_sb_info *befs_sb = BEFS_SB(sb); + + /* Check magic headers of super block */ + if ((befs_sb->magic1 != BEFS_SUPER_MAGIC1) + || (befs_sb->magic2 != BEFS_SUPER_MAGIC2) + || (befs_sb->magic3 != BEFS_SUPER_MAGIC3)) { + befs_error(sb, "invalid magic header"); + return BEFS_ERR; + } + + /* + * Check blocksize of BEFS. + * + * Blocksize of BEFS is 1024, 2048, 4096 or 8192. + */ + + if ((befs_sb->block_size != 1024) + && (befs_sb->block_size != 2048) + && (befs_sb->block_size != 4096) + && (befs_sb->block_size != 8192)) { + befs_error(sb, "invalid blocksize: %u", befs_sb->block_size); + return BEFS_ERR; + } + + if (befs_sb->block_size > PAGE_SIZE) { + befs_error(sb, "blocksize(%u) cannot be larger" + "than system pagesize(%lu)", befs_sb->block_size, + PAGE_SIZE); + return BEFS_ERR; + } + + /* + * block_shift and block_size encode the same information + * in different ways as a consistency check. + */ + + if ((1 << befs_sb->block_shift) != befs_sb->block_size) { + befs_error(sb, "block_shift disagrees with block_size. " + "Corruption likely."); + return BEFS_ERR; + } + + if (befs_sb->log_start != befs_sb->log_end) { + befs_error(sb, "Filesystem not clean! There are blocks in the " + "journal. You must boot into BeOS and mount this volume " + "to make it clean."); + return BEFS_ERR; + } + + return BEFS_OK; +} diff --git a/kernel/fs/befs/super.h b/kernel/fs/befs/super.h new file mode 100644 index 000000000..dc4556376 --- /dev/null +++ b/kernel/fs/befs/super.h @@ -0,0 +1,8 @@ +/* + * super.h + */ + +int befs_load_sb(struct super_block *sb, befs_super_block * disk_sb); + +int befs_check_sb(struct super_block *sb); + diff --git a/kernel/fs/bfs/Kconfig b/kernel/fs/bfs/Kconfig new file mode 100644 index 000000000..3728a6479 --- /dev/null +++ b/kernel/fs/bfs/Kconfig @@ -0,0 +1,19 @@ +config BFS_FS + tristate "BFS file system support" + depends on BLOCK + help + Boot File System (BFS) is a file system used under SCO UnixWare to + allow the bootloader access to the kernel image and other important + files during the boot process. It is usually mounted under /stand + and corresponds to the slice marked as "STAND" in the UnixWare + partition. You should say Y if you want to read or write the files + on your /stand slice from within Linux. You then also need to say Y + to "UnixWare slices support", below. More information about the BFS + file system is contained in the file + . + + If you don't know what this is about, say N. + + To compile this as a module, choose M here: the module will be called + bfs. Note that the file system of your root partition (the one + containing the directory /) cannot be compiled as a module. diff --git a/kernel/fs/bfs/Makefile b/kernel/fs/bfs/Makefile new file mode 100644 index 000000000..c787b36d9 --- /dev/null +++ b/kernel/fs/bfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for BFS filesystem. +# + +obj-$(CONFIG_BFS_FS) += bfs.o + +bfs-objs := inode.o file.o dir.o diff --git a/kernel/fs/bfs/bfs.h b/kernel/fs/bfs/bfs.h new file mode 100644 index 000000000..f40006db3 --- /dev/null +++ b/kernel/fs/bfs/bfs.h @@ -0,0 +1,60 @@ +/* + * fs/bfs/bfs.h + * Copyright (C) 1999 Tigran Aivazian + */ +#ifndef _FS_BFS_BFS_H +#define _FS_BFS_BFS_H + +#include + +/* + * BFS file system in-core superblock info + */ +struct bfs_sb_info { + unsigned long si_blocks; + unsigned long si_freeb; + unsigned long si_freei; + unsigned long si_lf_eblk; + unsigned long si_lasti; + unsigned long *si_imap; + struct mutex bfs_lock; +}; + +/* + * BFS file system in-core inode info + */ +struct bfs_inode_info { + unsigned long i_dsk_ino; /* inode number from the disk, can be 0 */ + unsigned long i_sblock; + unsigned long i_eblock; + struct inode vfs_inode; +}; + +static inline struct bfs_sb_info *BFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct bfs_inode_info *BFS_I(struct inode *inode) +{ + return container_of(inode, struct bfs_inode_info, vfs_inode); +} + + +#define printf(format, args...) \ + printk(KERN_ERR "BFS-fs: %s(): " format, __func__, ## args) + +/* inode.c */ +extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); +extern void bfs_dump_imap(const char *, struct super_block *); + +/* file.c */ +extern const struct inode_operations bfs_file_inops; +extern const struct file_operations bfs_file_operations; +extern const struct address_space_operations bfs_aops; + +/* dir.c */ +extern const struct inode_operations bfs_dir_inops; +extern const struct file_operations bfs_dir_operations; + +#endif /* _FS_BFS_BFS_H */ diff --git a/kernel/fs/bfs/dir.c b/kernel/fs/bfs/dir.c new file mode 100644 index 000000000..3ec611314 --- /dev/null +++ b/kernel/fs/bfs/dir.c @@ -0,0 +1,365 @@ +/* + * fs/bfs/dir.c + * BFS directory operations. + * Copyright (C) 1999,2000 Tigran Aivazian + * Made endianness-clean by Andrew Stribblehill 2005 + */ + +#include +#include +#include +#include +#include +#include "bfs.h" + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) +#endif + +static int bfs_add_entry(struct inode *dir, const unsigned char *name, + int namelen, int ino); +static struct buffer_head *bfs_find_entry(struct inode *dir, + const unsigned char *name, int namelen, + struct bfs_dirent **res_dir); + +static int bfs_readdir(struct file *f, struct dir_context *ctx) +{ + struct inode *dir = file_inode(f); + struct buffer_head *bh; + struct bfs_dirent *de; + unsigned int offset; + int block; + + if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { + printf("Bad f_pos=%08lx for %s:%08lx\n", + (unsigned long)ctx->pos, + dir->i_sb->s_id, dir->i_ino); + return -EINVAL; + } + + while (ctx->pos < dir->i_size) { + offset = ctx->pos & (BFS_BSIZE - 1); + block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS); + bh = sb_bread(dir->i_sb, block); + if (!bh) { + ctx->pos += BFS_BSIZE - offset; + continue; + } + do { + de = (struct bfs_dirent *)(bh->b_data + offset); + if (de->ino) { + int size = strnlen(de->name, BFS_NAMELEN); + if (!dir_emit(ctx, de->name, size, + le16_to_cpu(de->ino), + DT_UNKNOWN)) { + brelse(bh); + return 0; + } + } + offset += BFS_DIRENT_SIZE; + ctx->pos += BFS_DIRENT_SIZE; + } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size)); + brelse(bh); + } + return 0; +} + +const struct file_operations bfs_dir_operations = { + .read = generic_read_dir, + .iterate = bfs_readdir, + .fsync = generic_file_fsync, + .llseek = generic_file_llseek, +}; + +static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + int err; + struct inode *inode; + struct super_block *s = dir->i_sb; + struct bfs_sb_info *info = BFS_SB(s); + unsigned long ino; + + inode = new_inode(s); + if (!inode) + return -ENOMEM; + mutex_lock(&info->bfs_lock); + ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1); + if (ino > info->si_lasti) { + mutex_unlock(&info->bfs_lock); + iput(inode); + return -ENOSPC; + } + set_bit(ino, info->si_imap); + info->si_freei--; + inode_init_owner(inode, dir, mode); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + inode->i_op = &bfs_file_inops; + inode->i_fop = &bfs_file_operations; + inode->i_mapping->a_ops = &bfs_aops; + inode->i_ino = ino; + BFS_I(inode)->i_dsk_ino = ino; + BFS_I(inode)->i_sblock = 0; + BFS_I(inode)->i_eblock = 0; + insert_inode_hash(inode); + mark_inode_dirty(inode); + bfs_dump_imap("create", s); + + err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, + inode->i_ino); + if (err) { + inode_dec_link_count(inode); + mutex_unlock(&info->bfs_lock); + iput(inode); + return err; + } + mutex_unlock(&info->bfs_lock); + d_instantiate(dentry, inode); + return 0; +} + +static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + struct buffer_head *bh; + struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); + + if (dentry->d_name.len > BFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + mutex_lock(&info->bfs_lock); + bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + if (bh) { + unsigned long ino = (unsigned long)le16_to_cpu(de->ino); + brelse(bh); + inode = bfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + mutex_unlock(&info->bfs_lock); + return ERR_CAST(inode); + } + } + mutex_unlock(&info->bfs_lock); + d_add(dentry, inode); + return NULL; +} + +static int bfs_link(struct dentry *old, struct inode *dir, + struct dentry *new) +{ + struct inode *inode = d_inode(old); + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + int err; + + mutex_lock(&info->bfs_lock); + err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, + inode->i_ino); + if (err) { + mutex_unlock(&info->bfs_lock); + return err; + } + inc_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + ihold(inode); + d_instantiate(new, inode); + mutex_unlock(&info->bfs_lock); + return 0; +} + +static int bfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error = -ENOENT; + struct inode *inode = d_inode(dentry); + struct buffer_head *bh; + struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + + mutex_lock(&info->bfs_lock); + bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) + goto out_brelse; + + if (!inode->i_nlink) { + printf("unlinking non-existent file %s:%lu (nlink=%d)\n", + inode->i_sb->s_id, inode->i_ino, + inode->i_nlink); + set_nlink(inode, 1); + } + de->ino = 0; + mark_buffer_dirty_inode(bh, dir); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + error = 0; + +out_brelse: + brelse(bh); + mutex_unlock(&info->bfs_lock); + return error; +} + +static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode, *new_inode; + struct buffer_head *old_bh, *new_bh; + struct bfs_dirent *old_de, *new_de; + struct bfs_sb_info *info; + int error = -ENOENT; + + old_bh = new_bh = NULL; + old_inode = d_inode(old_dentry); + if (S_ISDIR(old_inode->i_mode)) + return -EINVAL; + + info = BFS_SB(old_inode->i_sb); + + mutex_lock(&info->bfs_lock); + old_bh = bfs_find_entry(old_dir, + old_dentry->d_name.name, + old_dentry->d_name.len, &old_de); + + if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino)) + goto end_rename; + + error = -EPERM; + new_inode = d_inode(new_dentry); + new_bh = bfs_find_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len, &new_de); + + if (new_bh && !new_inode) { + brelse(new_bh); + new_bh = NULL; + } + if (!new_bh) { + error = bfs_add_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_inode->i_ino); + if (error) + goto end_rename; + } + old_de->ino = 0; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(old_dir); + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME_SEC; + inode_dec_link_count(new_inode); + } + mark_buffer_dirty_inode(old_bh, old_dir); + error = 0; + +end_rename: + mutex_unlock(&info->bfs_lock); + brelse(old_bh); + brelse(new_bh); + return error; +} + +const struct inode_operations bfs_dir_inops = { + .create = bfs_create, + .lookup = bfs_lookup, + .link = bfs_link, + .unlink = bfs_unlink, + .rename = bfs_rename, +}; + +static int bfs_add_entry(struct inode *dir, const unsigned char *name, + int namelen, int ino) +{ + struct buffer_head *bh; + struct bfs_dirent *de; + int block, sblock, eblock, off, pos; + int i; + + dprintf("name=%s, namelen=%d\n", name, namelen); + + if (!namelen) + return -ENOENT; + if (namelen > BFS_NAMELEN) + return -ENAMETOOLONG; + + sblock = BFS_I(dir)->i_sblock; + eblock = BFS_I(dir)->i_eblock; + for (block = sblock; block <= eblock; block++) { + bh = sb_bread(dir->i_sb, block); + if (!bh) + return -EIO; + for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) { + de = (struct bfs_dirent *)(bh->b_data + off); + if (!de->ino) { + pos = (block - sblock) * BFS_BSIZE + off; + if (pos >= dir->i_size) { + dir->i_size += BFS_DIRENT_SIZE; + dir->i_ctime = CURRENT_TIME_SEC; + } + dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + de->ino = cpu_to_le16((u16)ino); + for (i = 0; i < BFS_NAMELEN; i++) + de->name[i] = + (i < namelen) ? name[i] : 0; + mark_buffer_dirty_inode(bh, dir); + brelse(bh); + return 0; + } + } + brelse(bh); + } + return -ENOSPC; +} + +static inline int bfs_namecmp(int len, const unsigned char *name, + const char *buffer) +{ + if ((len < BFS_NAMELEN) && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +static struct buffer_head *bfs_find_entry(struct inode *dir, + const unsigned char *name, int namelen, + struct bfs_dirent **res_dir) +{ + unsigned long block = 0, offset = 0; + struct buffer_head *bh = NULL; + struct bfs_dirent *de; + + *res_dir = NULL; + if (namelen > BFS_NAMELEN) + return NULL; + + while (block * BFS_BSIZE + offset < dir->i_size) { + if (!bh) { + bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block); + if (!bh) { + block++; + continue; + } + } + de = (struct bfs_dirent *)(bh->b_data + offset); + offset += BFS_DIRENT_SIZE; + if (le16_to_cpu(de->ino) && + bfs_namecmp(namelen, name, de->name)) { + *res_dir = de; + return bh; + } + if (offset < bh->b_size) + continue; + brelse(bh); + bh = NULL; + offset = 0; + block++; + } + brelse(bh); + return NULL; +} diff --git a/kernel/fs/bfs/file.c b/kernel/fs/bfs/file.c new file mode 100644 index 000000000..97f1b5160 --- /dev/null +++ b/kernel/fs/bfs/file.c @@ -0,0 +1,197 @@ +/* + * fs/bfs/file.c + * BFS file operations. + * Copyright (C) 1999,2000 Tigran Aivazian + * + * Make the file block allocation algorithm understand the size + * of the underlying block device. + * Copyright (C) 2007 Dmitri Vorobiev + * + */ + +#include +#include +#include "bfs.h" + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) +#endif + +const struct file_operations bfs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, +}; + +static int bfs_move_block(unsigned long from, unsigned long to, + struct super_block *sb) +{ + struct buffer_head *bh, *new; + + bh = sb_bread(sb, from); + if (!bh) + return -EIO; + new = sb_getblk(sb, to); + memcpy(new->b_data, bh->b_data, bh->b_size); + mark_buffer_dirty(new); + bforget(bh); + brelse(new); + return 0; +} + +static int bfs_move_blocks(struct super_block *sb, unsigned long start, + unsigned long end, unsigned long where) +{ + unsigned long i; + + dprintf("%08lx-%08lx->%08lx\n", start, end, where); + for (i = start; i <= end; i++) + if(bfs_move_block(i, where + i, sb)) { + dprintf("failed to move block %08lx -> %08lx\n", i, + where + i); + return -EIO; + } + return 0; +} + +static int bfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + unsigned long phys; + int err; + struct super_block *sb = inode->i_sb; + struct bfs_sb_info *info = BFS_SB(sb); + struct bfs_inode_info *bi = BFS_I(inode); + + phys = bi->i_sblock + block; + if (!create) { + if (phys <= bi->i_eblock) { + dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + } + return 0; + } + + /* + * If the file is not empty and the requested block is within the + * range of blocks allocated for this file, we can grant it. + */ + if (bi->i_sblock && (phys <= bi->i_eblock)) { + dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + return 0; + } + + /* The file will be extended, so let's see if there is enough space. */ + if (phys >= info->si_blocks) + return -ENOSPC; + + /* The rest has to be protected against itself. */ + mutex_lock(&info->bfs_lock); + + /* + * If the last data block for this file is the last allocated + * block, we can extend the file trivially, without moving it + * anywhere. + */ + if (bi->i_eblock == info->si_lf_eblk) { + dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + info->si_freeb -= phys - bi->i_eblock; + info->si_lf_eblk = bi->i_eblock = phys; + mark_inode_dirty(inode); + err = 0; + goto out; + } + + /* Ok, we have to move this entire file to the next free block. */ + phys = info->si_lf_eblk + 1; + if (phys + block >= info->si_blocks) { + err = -ENOSPC; + goto out; + } + + if (bi->i_sblock) { + err = bfs_move_blocks(inode->i_sb, bi->i_sblock, + bi->i_eblock, phys); + if (err) { + dprintf("failed to move ino=%08lx -> fs corruption\n", + inode->i_ino); + goto out; + } + } else + err = 0; + + dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n", + create, (unsigned long)block, phys); + bi->i_sblock = phys; + phys += block; + info->si_lf_eblk = bi->i_eblock = phys; + + /* + * This assumes nothing can write the inode back while we are here + * and thus update inode->i_blocks! (XXX) + */ + info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks; + mark_inode_dirty(inode); + map_bh(bh_result, sb, phys); +out: + mutex_unlock(&info->bfs_lock); + return err; +} + +static int bfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, bfs_get_block, wbc); +} + +static int bfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, bfs_get_block); +} + +static void bfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) + truncate_pagecache(inode, inode->i_size); +} + +static int bfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + bfs_get_block); + if (unlikely(ret)) + bfs_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t bfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, bfs_get_block); +} + +const struct address_space_operations bfs_aops = { + .readpage = bfs_readpage, + .writepage = bfs_writepage, + .write_begin = bfs_write_begin, + .write_end = generic_write_end, + .bmap = bfs_bmap, +}; + +const struct inode_operations bfs_file_inops; diff --git a/kernel/fs/bfs/inode.c b/kernel/fs/bfs/inode.c new file mode 100644 index 000000000..fdcb4d69f --- /dev/null +++ b/kernel/fs/bfs/inode.c @@ -0,0 +1,499 @@ +/* + * fs/bfs/inode.c + * BFS superblock and inode operations. + * Copyright (C) 1999-2006 Tigran Aivazian + * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. + * + * Made endianness-clean by Andrew Stribblehill , 2005. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bfs.h" + +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); +MODULE_LICENSE("GPL"); + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) +#endif + +struct inode *bfs_iget(struct super_block *sb, unsigned long ino) +{ + struct bfs_inode *di; + struct inode *inode; + struct buffer_head *bh; + int block, off; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { + printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino); + goto error; + } + + block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; + bh = sb_bread(inode->i_sb, block); + if (!bh) { + printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, + ino); + goto error; + } + + off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; + di = (struct bfs_inode *)bh->b_data + off; + + inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); + if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { + inode->i_mode |= S_IFDIR; + inode->i_op = &bfs_dir_inops; + inode->i_fop = &bfs_dir_operations; + } else if (le32_to_cpu(di->i_vtype) == BFS_VREG) { + inode->i_mode |= S_IFREG; + inode->i_op = &bfs_file_inops; + inode->i_fop = &bfs_file_operations; + inode->i_mapping->a_ops = &bfs_aops; + } + + BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); + BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); + BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); + i_uid_write(inode, le32_to_cpu(di->i_uid)); + i_gid_write(inode, le32_to_cpu(di->i_gid)); + set_nlink(inode, le32_to_cpu(di->i_nlink)); + inode->i_size = BFS_FILESIZE(di); + inode->i_blocks = BFS_FILEBLOCKS(di); + inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); + inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); + inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + + brelse(bh); + unlock_new_inode(inode); + return inode; + +error: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p) +{ + if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) { + printf("Bad inode number %s:%08x\n", sb->s_id, ino); + return ERR_PTR(-EIO); + } + + ino -= BFS_ROOT_INO; + + *p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK); + if (!*p) { + printf("Unable to read inode %s:%08x\n", sb->s_id, ino); + return ERR_PTR(-EIO); + } + + return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK; +} + +static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + unsigned int ino = (u16)inode->i_ino; + unsigned long i_sblock; + struct bfs_inode *di; + struct buffer_head *bh; + int err = 0; + + dprintf("ino=%08x\n", ino); + + di = find_inode(inode->i_sb, ino, &bh); + if (IS_ERR(di)) + return PTR_ERR(di); + + mutex_lock(&info->bfs_lock); + + if (ino == BFS_ROOT_INO) + di->i_vtype = cpu_to_le32(BFS_VDIR); + else + di->i_vtype = cpu_to_le32(BFS_VREG); + + di->i_ino = cpu_to_le16(ino); + di->i_mode = cpu_to_le32(inode->i_mode); + di->i_uid = cpu_to_le32(i_uid_read(inode)); + di->i_gid = cpu_to_le32(i_gid_read(inode)); + di->i_nlink = cpu_to_le32(inode->i_nlink); + di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + i_sblock = BFS_I(inode)->i_sblock; + di->i_sblock = cpu_to_le32(i_sblock); + di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); + di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); + + mark_buffer_dirty(bh); + if (wbc->sync_mode == WB_SYNC_ALL) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + brelse(bh); + mutex_unlock(&info->bfs_lock); + return err; +} + +static void bfs_evict_inode(struct inode *inode) +{ + unsigned long ino = inode->i_ino; + struct bfs_inode *di; + struct buffer_head *bh; + struct super_block *s = inode->i_sb; + struct bfs_sb_info *info = BFS_SB(s); + struct bfs_inode_info *bi = BFS_I(inode); + + dprintf("ino=%08lx\n", ino); + + truncate_inode_pages_final(&inode->i_data); + invalidate_inode_buffers(inode); + clear_inode(inode); + + if (inode->i_nlink) + return; + + di = find_inode(s, inode->i_ino, &bh); + if (IS_ERR(di)) + return; + + mutex_lock(&info->bfs_lock); + /* clear on-disk inode */ + memset(di, 0, sizeof(struct bfs_inode)); + mark_buffer_dirty(bh); + brelse(bh); + + if (bi->i_dsk_ino) { + if (bi->i_sblock) + info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; + info->si_freei++; + clear_bit(ino, info->si_imap); + bfs_dump_imap("delete_inode", s); + } + + /* + * If this was the last file, make the previous block + * "last block of the last file" even if there is no + * real file there, saves us 1 gap. + */ + if (info->si_lf_eblk == bi->i_eblock) + info->si_lf_eblk = bi->i_sblock - 1; + mutex_unlock(&info->bfs_lock); +} + +static void bfs_put_super(struct super_block *s) +{ + struct bfs_sb_info *info = BFS_SB(s); + + if (!info) + return; + + mutex_destroy(&info->bfs_lock); + kfree(info->si_imap); + kfree(info); + s->s_fs_info = NULL; +} + +static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct bfs_sb_info *info = BFS_SB(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + buf->f_type = BFS_MAGIC; + buf->f_bsize = s->s_blocksize; + buf->f_blocks = info->si_blocks; + buf->f_bfree = buf->f_bavail = info->si_freeb; + buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO; + buf->f_ffree = info->si_freei; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = BFS_NAMELEN; + return 0; +} + +static struct kmem_cache *bfs_inode_cachep; + +static struct inode *bfs_alloc_inode(struct super_block *sb) +{ + struct bfs_inode_info *bi; + bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); + if (!bi) + return NULL; + return &bi->vfs_inode; +} + +static void bfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); +} + +static void bfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bfs_i_callback); +} + +static void init_once(void *foo) +{ + struct bfs_inode_info *bi = foo; + + inode_init_once(&bi->vfs_inode); +} + +static int __init init_inodecache(void) +{ + bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", + sizeof(struct bfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (bfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(bfs_inode_cachep); +} + +static const struct super_operations bfs_sops = { + .alloc_inode = bfs_alloc_inode, + .destroy_inode = bfs_destroy_inode, + .write_inode = bfs_write_inode, + .evict_inode = bfs_evict_inode, + .put_super = bfs_put_super, + .statfs = bfs_statfs, +}; + +void bfs_dump_imap(const char *prefix, struct super_block *s) +{ +#ifdef DEBUG + int i; + char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!tmpbuf) + return; + for (i = BFS_SB(s)->si_lasti; i >= 0; i--) { + if (i > PAGE_SIZE - 100) break; + if (test_bit(i, BFS_SB(s)->si_imap)) + strcat(tmpbuf, "1"); + else + strcat(tmpbuf, "0"); + } + printf("BFS-fs: %s: lasti=%08lx <%s>\n", + prefix, BFS_SB(s)->si_lasti, tmpbuf); + free_page((unsigned long)tmpbuf); +#endif +} + +static int bfs_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh, *sbh; + struct bfs_super_block *bfs_sb; + struct inode *inode; + unsigned i, imap_len; + struct bfs_sb_info *info; + int ret = -EINVAL; + unsigned long i_sblock, i_eblock, i_eoff, s_size; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + mutex_init(&info->bfs_lock); + s->s_fs_info = info; + + sb_set_blocksize(s, BFS_BSIZE); + + sbh = sb_bread(s, 0); + if (!sbh) + goto out; + bfs_sb = (struct bfs_super_block *)sbh->b_data; + if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { + if (!silent) + printf("No BFS filesystem on %s (magic=%08x)\n", + s->s_id, le32_to_cpu(bfs_sb->s_magic)); + goto out1; + } + if (BFS_UNCLEAN(bfs_sb, s) && !silent) + printf("%s is unclean, continuing\n", s->s_id); + + s->s_magic = BFS_MAGIC; + + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { + printf("Superblock is corrupted\n"); + goto out1; + } + + info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / + sizeof(struct bfs_inode) + + BFS_ROOT_INO - 1; + imap_len = (info->si_lasti / 8) + 1; + info->si_imap = kzalloc(imap_len, GFP_KERNEL); + if (!info->si_imap) + goto out1; + for (i = 0; i < BFS_ROOT_INO; i++) + set_bit(i, info->si_imap); + + s->s_op = &bfs_sops; + inode = bfs_iget(s, BFS_ROOT_INO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out2; + } + s->s_root = d_make_root(inode); + if (!s->s_root) { + ret = -ENOMEM; + goto out2; + } + + info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; + info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 + - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; + info->si_freei = 0; + info->si_lf_eblk = 0; + + /* can we read the last block? */ + bh = sb_bread(s, info->si_blocks - 1); + if (!bh) { + printf("Last block not available: %lu\n", info->si_blocks - 1); + ret = -EIO; + goto out3; + } + brelse(bh); + + bh = NULL; + for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { + struct bfs_inode *di; + int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; + int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; + unsigned long eblock; + + if (!off) { + brelse(bh); + bh = sb_bread(s, block); + } + + if (!bh) + continue; + + di = (struct bfs_inode *)bh->b_data + off; + + /* test if filesystem is not corrupted */ + + i_eoff = le32_to_cpu(di->i_eoffset); + i_sblock = le32_to_cpu(di->i_sblock); + i_eblock = le32_to_cpu(di->i_eblock); + s_size = le32_to_cpu(bfs_sb->s_end); + + if (i_sblock > info->si_blocks || + i_eblock > info->si_blocks || + i_sblock > i_eblock || + i_eoff > s_size || + i_sblock * BFS_BSIZE > i_eoff) { + + printf("Inode 0x%08x corrupted\n", i); + + brelse(bh); + ret = -EIO; + goto out3; + } + + if (!di->i_ino) { + info->si_freei++; + continue; + } + set_bit(i, info->si_imap); + info->si_freeb -= BFS_FILEBLOCKS(di); + + eblock = le32_to_cpu(di->i_eblock); + if (eblock > info->si_lf_eblk) + info->si_lf_eblk = eblock; + } + brelse(bh); + brelse(sbh); + bfs_dump_imap("read_super", s); + return 0; + +out3: + dput(s->s_root); + s->s_root = NULL; +out2: + kfree(info->si_imap); +out1: + brelse(sbh); +out: + mutex_destroy(&info->bfs_lock); + kfree(info); + s->s_fs_info = NULL; + return ret; +} + +static struct dentry *bfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); +} + +static struct file_system_type bfs_fs_type = { + .owner = THIS_MODULE, + .name = "bfs", + .mount = bfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("bfs"); + +static int __init init_bfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&bfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_bfs_fs(void) +{ + unregister_filesystem(&bfs_fs_type); + destroy_inodecache(); +} + +module_init(init_bfs_fs) +module_exit(exit_bfs_fs) diff --git a/kernel/fs/binfmt_aout.c b/kernel/fs/binfmt_aout.c new file mode 100644 index 000000000..4c556680f --- /dev/null +++ b/kernel/fs/binfmt_aout.c @@ -0,0 +1,423 @@ +/* + * linux/fs/binfmt_aout.c + * + * Copyright (C) 1991, 1992, 1996 Linus Torvalds + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int load_aout_binary(struct linux_binprm *); +static int load_aout_library(struct file*); + +#ifdef CONFIG_COREDUMP +/* + * Routine writes a core dump image in the current directory. + * Currently only a stub-function. + * + * Note that setuid/setgid files won't make a core-dump if the uid/gid + * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" + * field, which also makes sure the core-dumps won't be recursive if the + * dumping of the process results in another error.. + */ +static int aout_core_dump(struct coredump_params *cprm) +{ + mm_segment_t fs; + int has_dumped = 0; + void __user *dump_start; + int dump_size; + struct user dump; +#ifdef __alpha__ +# define START_DATA(u) ((void __user *)u.start_data) +#else +# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \ + u.start_code)) +#endif +# define START_STACK(u) ((void __user *)u.start_stack) + + fs = get_fs(); + set_fs(KERNEL_DS); + has_dumped = 1; + strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); + dump.u_ar0 = offsetof(struct user, regs); + dump.signal = cprm->siginfo->si_signo; + aout_dump_thread(cprm->regs, &dump); + +/* If the size of the dump file exceeds the rlimit, then see what would happen + if we wrote the stack, but not the data area. */ + if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) + dump.u_dsize = 0; + +/* Make sure we have enough room to write the stack and data areas. */ + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) + dump.u_ssize = 0; + +/* make sure we actually have a data and stack area to dump */ + set_fs(USER_DS); + if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) + dump.u_dsize = 0; + if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) + dump.u_ssize = 0; + + set_fs(KERNEL_DS); +/* struct user */ + if (!dump_emit(cprm, &dump, sizeof(dump))) + goto end_coredump; +/* Now dump all of the user data. Include malloced stuff as well */ + if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump))) + goto end_coredump; +/* now we start writing out the user space info */ + set_fs(USER_DS); +/* Dump the data area */ + if (dump.u_dsize != 0) { + dump_start = START_DATA(dump); + dump_size = dump.u_dsize << PAGE_SHIFT; + if (!dump_emit(cprm, dump_start, dump_size)) + goto end_coredump; + } +/* Now prepare to dump the stack area */ + if (dump.u_ssize != 0) { + dump_start = START_STACK(dump); + dump_size = dump.u_ssize << PAGE_SHIFT; + if (!dump_emit(cprm, dump_start, dump_size)) + goto end_coredump; + } +end_coredump: + set_fs(fs); + return has_dumped; +} +#else +#define aout_core_dump NULL +#endif + +static struct linux_binfmt aout_format = { + .module = THIS_MODULE, + .load_binary = load_aout_binary, + .load_shlib = load_aout_library, + .core_dump = aout_core_dump, + .min_coredump = PAGE_SIZE +}; + +#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) + +static int set_brk(unsigned long start, unsigned long end) +{ + start = PAGE_ALIGN(start); + end = PAGE_ALIGN(end); + if (end > start) { + unsigned long addr; + addr = vm_brk(start, end - start); + if (BAD_ADDR(addr)) + return addr; + } + return 0; +} + +/* + * create_aout_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ +static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm) +{ + char __user * __user *argv; + char __user * __user *envp; + unsigned long __user *sp; + int argc = bprm->argc; + int envc = bprm->envc; + + sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); +#ifdef __alpha__ +/* whee.. test-programs are so much fun. */ + put_user(0, --sp); + put_user(0, --sp); + if (bprm->loader) { + put_user(0, --sp); + put_user(1003, --sp); + put_user(bprm->loader, --sp); + put_user(1002, --sp); + } + put_user(bprm->exec, --sp); + put_user(1001, --sp); +#endif + sp -= envc+1; + envp = (char __user * __user *) sp; + sp -= argc+1; + argv = (char __user * __user *) sp; +#ifndef __alpha__ + put_user((unsigned long) envp,--sp); + put_user((unsigned long) argv,--sp); +#endif + put_user(argc,--sp); + current->mm->arg_start = (unsigned long) p; + while (argc-->0) { + char c; + put_user(p,argv++); + do { + get_user(c,p++); + } while (c); + } + put_user(NULL,argv); + current->mm->arg_end = current->mm->env_start = (unsigned long) p; + while (envc-->0) { + char c; + put_user(p,envp++); + do { + get_user(c,p++); + } while (c); + } + put_user(NULL,envp); + current->mm->env_end = (unsigned long) p; + return sp; +} + +/* + * These are the functions used to load a.out style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int load_aout_binary(struct linux_binprm * bprm) +{ + struct pt_regs *regs = current_pt_regs(); + struct exec ex; + unsigned long error; + unsigned long fd_offset; + unsigned long rlim; + int retval; + + ex = *((struct exec *) bprm->buf); /* exec-header */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && + N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || + N_TRSIZE(ex) || N_DRSIZE(ex) || + i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + return -ENOEXEC; + } + + /* + * Requires a mmap handler. This prevents people from using a.out + * as part of an exploit attack against /proc-related vulnerabilities. + */ + if (!bprm->file->f_op->mmap) + return -ENOEXEC; + + fd_offset = N_TXTOFF(ex); + + /* Check initial limits. This avoids letting people circumvent + * size limits imposed on them by creating programs with large + * arrays in the data or bss. + */ + rlim = rlimit(RLIMIT_DATA); + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (ex.a_data + ex.a_bss > rlim) + return -ENOMEM; + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + return retval; + + /* OK, This is the point of no return */ +#ifdef __alpha__ + SET_AOUT_PERSONALITY(bprm, ex); +#else + set_personality(PER_LINUX); +#endif + setup_new_exec(bprm); + + current->mm->end_code = ex.a_text + + (current->mm->start_code = N_TXTADDR(ex)); + current->mm->end_data = ex.a_data + + (current->mm->start_data = N_DATADDR(ex)); + current->mm->brk = ex.a_bss + + (current->mm->start_brk = N_BSSADDR(ex)); + + retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) + return retval; + + install_exec_creds(bprm); + + if (N_MAGIC(ex) == OMAGIC) { + unsigned long text_addr, map_size; + loff_t pos; + + text_addr = N_TXTADDR(ex); + +#ifdef __alpha__ + pos = fd_offset; + map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; +#else + pos = 32; + map_size = ex.a_text+ex.a_data; +#endif + error = vm_brk(text_addr & PAGE_MASK, map_size); + if (error != (text_addr & PAGE_MASK)) + return error; + + error = read_code(bprm->file, text_addr, pos, + ex.a_text+ex.a_data); + if ((signed long)error < 0) + return error; + } else { + if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && + (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) + { + printk(KERN_NOTICE "executable not page aligned\n"); + } + + if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) + { + printk(KERN_WARNING + "fd_offset is not page aligned. Please convert program: %pD\n", + bprm->file); + } + + if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { + vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + read_code(bprm->file, N_TXTADDR(ex), fd_offset, + ex.a_text + ex.a_data); + goto beyond_if; + } + + error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + fd_offset); + + if (error != N_TXTADDR(ex)) + return error; + + error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + fd_offset + ex.a_text); + if (error != N_DATADDR(ex)) + return error; + } +beyond_if: + set_binfmt(&aout_format); + + retval = set_brk(current->mm->start_brk, current->mm->brk); + if (retval < 0) + return retval; + + current->mm->start_stack = + (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); +#ifdef __alpha__ + regs->gp = ex.a_gpvalue; +#endif + start_thread(regs, ex.a_entry, current->mm->start_stack); + return 0; +} + +static int load_aout_library(struct file *file) +{ + struct inode * inode; + unsigned long bss, start_addr, len; + unsigned long error; + int retval; + struct exec ex; + + inode = file_inode(file); + + retval = -ENOEXEC; + error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + if (error != sizeof(ex)) + goto out; + + /* We come in here for the regular a.out style of shared libraries */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || + N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || + i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + goto out; + } + + /* + * Requires a mmap handler. This prevents people from using a.out + * as part of an exploit attack against /proc-related vulnerabilities. + */ + if (!file->f_op->mmap) + goto out; + + if (N_FLAGS(ex)) + goto out; + + /* For QMAGIC, the starting address is 0x20 into the page. We mask + this off to get the starting address for the page */ + + start_addr = ex.a_entry & 0xfffff000; + + if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { + if (printk_ratelimit()) + { + printk(KERN_WARNING + "N_TXTOFF is not page aligned. Please convert library: %pD\n", + file); + } + vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + + read_code(file, start_addr, N_TXTOFF(ex), + ex.a_text + ex.a_data); + retval = 0; + goto out; + } + /* Now use mmap to map the library into memory. */ + error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + N_TXTOFF(ex)); + retval = error; + if (error != start_addr) + goto out; + + len = PAGE_ALIGN(ex.a_text + ex.a_data); + bss = ex.a_text + ex.a_data + ex.a_bss; + if (bss > len) { + error = vm_brk(start_addr + len, bss - len); + retval = error; + if (error != start_addr + len) + goto out; + } + retval = 0; +out: + return retval; +} + +static int __init init_aout_binfmt(void) +{ + register_binfmt(&aout_format); + return 0; +} + +static void __exit exit_aout_binfmt(void) +{ + unregister_binfmt(&aout_format); +} + +core_initcall(init_aout_binfmt); +module_exit(exit_aout_binfmt); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/binfmt_elf.c b/kernel/fs/binfmt_elf.c new file mode 100644 index 000000000..cd46e4158 --- /dev/null +++ b/kernel/fs/binfmt_elf.c @@ -0,0 +1,2326 @@ +/* + * linux/fs/binfmt_elf.c + * + * These are the functions used to load ELF format executables as used + * on SVr4 machines. Information on the format may be found in the book + * "UNIX SYSTEM V RELEASE 4 Programmers Guide: Ansi C and Programming Support + * Tools". + * + * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef user_long_t +#define user_long_t long +#endif +#ifndef user_siginfo_t +#define user_siginfo_t siginfo_t +#endif + +static int load_elf_binary(struct linux_binprm *bprm); +static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, + int, int, unsigned long); + +#ifdef CONFIG_USELIB +static int load_elf_library(struct file *); +#else +#define load_elf_library NULL +#endif + +/* + * If we don't support core dumping, then supply a NULL so we + * don't even try. + */ +#ifdef CONFIG_ELF_CORE +static int elf_core_dump(struct coredump_params *cprm); +#else +#define elf_core_dump NULL +#endif + +#if ELF_EXEC_PAGESIZE > PAGE_SIZE +#define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE +#else +#define ELF_MIN_ALIGN PAGE_SIZE +#endif + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) +#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) + +static struct linux_binfmt elf_format = { + .module = THIS_MODULE, + .load_binary = load_elf_binary, + .load_shlib = load_elf_library, + .core_dump = elf_core_dump, + .min_coredump = ELF_EXEC_PAGESIZE, +}; + +#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) + +static int set_brk(unsigned long start, unsigned long end) +{ + start = ELF_PAGEALIGN(start); + end = ELF_PAGEALIGN(end); + if (end > start) { + unsigned long addr; + addr = vm_brk(start, end - start); + if (BAD_ADDR(addr)) + return addr; + } + current->mm->start_brk = current->mm->brk = end; + return 0; +} + +/* We need to explicitly zero any fractional pages + after the data section (i.e. bss). This would + contain the junk from the file that should not + be in memory + */ +static int padzero(unsigned long elf_bss) +{ + unsigned long nbyte; + + nbyte = ELF_PAGEOFFSET(elf_bss); + if (nbyte) { + nbyte = ELF_MIN_ALIGN - nbyte; + if (clear_user((void __user *) elf_bss, nbyte)) + return -EFAULT; + } + return 0; +} + +/* Let's use some macros to make this stack manipulation a little clearer */ +#ifdef CONFIG_STACK_GROWSUP +#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items)) +#define STACK_ROUND(sp, items) \ + ((15 + (unsigned long) ((sp) + (items))) &~ 15UL) +#define STACK_ALLOC(sp, len) ({ \ + elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \ + old_sp; }) +#else +#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items)) +#define STACK_ROUND(sp, items) \ + (((unsigned long) (sp - items)) &~ 15UL) +#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; }) +#endif + +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + +static int +create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, + unsigned long load_addr, unsigned long interp_load_addr) +{ + unsigned long p = bprm->p; + int argc = bprm->argc; + int envc = bprm->envc; + elf_addr_t __user *argv; + elf_addr_t __user *envp; + elf_addr_t __user *sp; + elf_addr_t __user *u_platform; + elf_addr_t __user *u_base_platform; + elf_addr_t __user *u_rand_bytes; + const char *k_platform = ELF_PLATFORM; + const char *k_base_platform = ELF_BASE_PLATFORM; + unsigned char k_rand_bytes[16]; + int items; + elf_addr_t *elf_info; + int ei_index = 0; + const struct cred *cred = current_cred(); + struct vm_area_struct *vma; + + /* + * In some cases (e.g. Hyper-Threading), we want to avoid L1 + * evictions by the processes running on the same package. One + * thing we can do is to shuffle the initial stack for them. + */ + + p = arch_align_stack(p); + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ + u_platform = NULL; + if (k_platform) { + size_t len = strlen(k_platform) + 1; + + u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_platform, k_platform, len)) + return -EFAULT; + } + + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. + */ + u_base_platform = NULL; + if (k_base_platform) { + size_t len = strlen(k_base_platform) + 1; + + u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_base_platform, k_base_platform, len)) + return -EFAULT; + } + + /* + * Generate 16 random bytes for userspace PRNG seeding. + */ + get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + u_rand_bytes = (elf_addr_t __user *) + STACK_ALLOC(p, sizeof(k_rand_bytes)); + if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) + return -EFAULT; + + /* Create the ELF interpreter info */ + elf_info = (elf_addr_t *)current->mm->saved_auxv; + /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ +#define NEW_AUX_ENT(id, val) \ + do { \ + elf_info[ei_index++] = id; \ + elf_info[ei_index++] = val; \ + } while (0) + +#ifdef ARCH_DLINFO + /* + * ARCH_DLINFO must come first so PPC can do its special alignment of + * AUXV. + * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in + * ARCH_DLINFO changes + */ + ARCH_DLINFO; +#endif + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); + NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); + NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); + NEW_AUX_ENT(AT_BASE, interp_load_addr); + NEW_AUX_ENT(AT_FLAGS, 0); + NEW_AUX_ENT(AT_ENTRY, exec->e_entry); + NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); + NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); + NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); + NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); +#ifdef ELF_HWCAP2 + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif + NEW_AUX_ENT(AT_EXECFN, bprm->exec); + if (k_platform) { + NEW_AUX_ENT(AT_PLATFORM, + (elf_addr_t)(unsigned long)u_platform); + } + if (k_base_platform) { + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + } + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } +#undef NEW_AUX_ENT + /* AT_NULL is zero; clear the rest too */ + memset(&elf_info[ei_index], 0, + sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]); + + /* And advance past the AT_NULL entry. */ + ei_index += 2; + + sp = STACK_ADD(p, ei_index); + + items = (argc + 1) + (envc + 1) + 1; + bprm->p = STACK_ROUND(sp, items); + + /* Point sp at the lowest address on the stack */ +#ifdef CONFIG_STACK_GROWSUP + sp = (elf_addr_t __user *)bprm->p - items - ei_index; + bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */ +#else + sp = (elf_addr_t __user *)bprm->p; +#endif + + + /* + * Grow the stack manually; some architectures have a limit on how + * far ahead a user-space access may be in order to grow the stack. + */ + vma = find_extend_vma(current->mm, bprm->p); + if (!vma) + return -EFAULT; + + /* Now, let's put argc (and argv, envp if appropriate) on the stack */ + if (__put_user(argc, sp++)) + return -EFAULT; + argv = sp; + envp = argv + argc + 1; + + /* Populate argv and envp */ + p = current->mm->arg_end = current->mm->arg_start; + while (argc-- > 0) { + size_t len; + if (__put_user((elf_addr_t)p, argv++)) + return -EFAULT; + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + if (__put_user(0, argv)) + return -EFAULT; + current->mm->arg_end = current->mm->env_start = p; + while (envc-- > 0) { + size_t len; + if (__put_user((elf_addr_t)p, envp++)) + return -EFAULT; + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + if (__put_user(0, envp)) + return -EFAULT; + current->mm->env_end = p; + + /* Put the elf_info on the stack in the right place. */ + sp = (elf_addr_t __user *)envp + 1; + if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t))) + return -EFAULT; + return 0; +} + +#ifndef elf_map + +static unsigned long elf_map(struct file *filep, unsigned long addr, + struct elf_phdr *eppnt, int prot, int type, + unsigned long total_size) +{ + unsigned long map_addr; + unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr); + unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr); + addr = ELF_PAGESTART(addr); + size = ELF_PAGEALIGN(size); + + /* mmap() will return -EINVAL if given a zero size, but a + * segment with zero filesize is perfectly valid */ + if (!size) + return addr; + + /* + * total_size is the size of the ELF (interpreter) image. + * The _first_ mmap needs to know the full size, otherwise + * randomization might put this image into an overlapping + * position with the ELF binary image. (since size < total_size) + * So we first map the 'big' image - and unmap the remainder at + * the end. (which unmap is needed for ELF images with holes.) + */ + if (total_size) { + total_size = ELF_PAGEALIGN(total_size); + map_addr = vm_mmap(filep, addr, total_size, prot, type, off); + if (!BAD_ADDR(map_addr)) + vm_munmap(map_addr+size, total_size-size); + } else + map_addr = vm_mmap(filep, addr, size, prot, type, off); + + return(map_addr); +} + +#endif /* !elf_map */ + +static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) +{ + int i, first_idx = -1, last_idx = -1; + + for (i = 0; i < nr; i++) { + if (cmds[i].p_type == PT_LOAD) { + last_idx = i; + if (first_idx == -1) + first_idx = i; + } + } + if (first_idx == -1) + return 0; + + return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz - + ELF_PAGESTART(cmds[first_idx].p_vaddr); +} + +/** + * load_elf_phdrs() - load ELF program headers + * @elf_ex: ELF header of the binary whose program headers should be loaded + * @elf_file: the opened ELF binary file + * + * Loads ELF program headers from the binary file elf_file, which has the ELF + * header pointed to by elf_ex, into a newly allocated array. The caller is + * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. + */ +static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, + struct file *elf_file) +{ + struct elf_phdr *elf_phdata = NULL; + int retval, size, err = -1; + + /* + * If the size of this structure has changed, then punt, since + * we will be doing the wrong thing. + */ + if (elf_ex->e_phentsize != sizeof(struct elf_phdr)) + goto out; + + /* Sanity check the number of program headers... */ + if (elf_ex->e_phnum < 1 || + elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr)) + goto out; + + /* ...and their total size. */ + size = sizeof(struct elf_phdr) * elf_ex->e_phnum; + if (size > ELF_MIN_ALIGN) + goto out; + + elf_phdata = kmalloc(size, GFP_KERNEL); + if (!elf_phdata) + goto out; + + /* Read in the program headers */ + retval = kernel_read(elf_file, elf_ex->e_phoff, + (char *)elf_phdata, size); + if (retval != size) { + err = (retval < 0) ? retval : -EIO; + goto out; + } + + /* Success! */ + err = 0; +out: + if (err) { + kfree(elf_phdata); + elf_phdata = NULL; + } + return elf_phdata; +} + +#ifndef CONFIG_ARCH_BINFMT_ELF_STATE + +/** + * struct arch_elf_state - arch-specific ELF loading state + * + * This structure is used to preserve architecture specific data during + * the loading of an ELF file, throughout the checking of architecture + * specific ELF headers & through to the point where the ELF load is + * known to be proceeding (ie. SET_PERSONALITY). + * + * This implementation is a dummy for architectures which require no + * specific state. + */ +struct arch_elf_state { +}; + +#define INIT_ARCH_ELF_STATE {} + +/** + * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header + * @ehdr: The main ELF header + * @phdr: The program header to check + * @elf: The open ELF file + * @is_interp: True if the phdr is from the interpreter of the ELF being + * loaded, else false. + * @state: Architecture-specific state preserved throughout the process + * of loading the ELF. + * + * Inspects the program header phdr to validate its correctness and/or + * suitability for the system. Called once per ELF program header in the + * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its + * interpreter. + * + * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load + * with that return code. + */ +static inline int arch_elf_pt_proc(struct elfhdr *ehdr, + struct elf_phdr *phdr, + struct file *elf, bool is_interp, + struct arch_elf_state *state) +{ + /* Dummy implementation, always proceed */ + return 0; +} + +/** + * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header + * @ehdr: The main ELF header + * @has_interp: True if the ELF has an interpreter, else false. + * @state: Architecture-specific state preserved throughout the process + * of loading the ELF. + * + * Provides a final opportunity for architecture code to reject the loading + * of the ELF & cause an exec syscall to return an error. This is called after + * all program headers to be checked by arch_elf_pt_proc have been. + * + * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load + * with that return code. + */ +static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, + struct arch_elf_state *state) +{ + /* Dummy implementation, always proceed */ + return 0; +} + +#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ + +/* This is much more generalized than the library routine read function, + so we keep this separate. Technically the library read function + is only provided so that we can read a.out libraries that have + an ELF header */ + +static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, + struct file *interpreter, unsigned long *interp_map_addr, + unsigned long no_base, struct elf_phdr *interp_elf_phdata) +{ + struct elf_phdr *eppnt; + unsigned long load_addr = 0; + int load_addr_set = 0; + unsigned long last_bss = 0, elf_bss = 0; + unsigned long error = ~0UL; + unsigned long total_size; + int i; + + /* First of all, some simple consistency checks */ + if (interp_elf_ex->e_type != ET_EXEC && + interp_elf_ex->e_type != ET_DYN) + goto out; + if (!elf_check_arch(interp_elf_ex)) + goto out; + if (!interpreter->f_op->mmap) + goto out; + + total_size = total_mapping_size(interp_elf_phdata, + interp_elf_ex->e_phnum); + if (!total_size) { + error = -EINVAL; + goto out; + } + + eppnt = interp_elf_phdata; + for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { + if (eppnt->p_type == PT_LOAD) { + int elf_type = MAP_PRIVATE | MAP_DENYWRITE; + int elf_prot = 0; + unsigned long vaddr = 0; + unsigned long k, map_addr; + + if (eppnt->p_flags & PF_R) + elf_prot = PROT_READ; + if (eppnt->p_flags & PF_W) + elf_prot |= PROT_WRITE; + if (eppnt->p_flags & PF_X) + elf_prot |= PROT_EXEC; + vaddr = eppnt->p_vaddr; + if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) + elf_type |= MAP_FIXED; + else if (no_base && interp_elf_ex->e_type == ET_DYN) + load_addr = -vaddr; + + map_addr = elf_map(interpreter, load_addr + vaddr, + eppnt, elf_prot, elf_type, total_size); + total_size = 0; + if (!*interp_map_addr) + *interp_map_addr = map_addr; + error = map_addr; + if (BAD_ADDR(map_addr)) + goto out; + + if (!load_addr_set && + interp_elf_ex->e_type == ET_DYN) { + load_addr = map_addr - ELF_PAGESTART(vaddr); + load_addr_set = 1; + } + + /* + * Check to see if the section's size will overflow the + * allowed task size. Note that p_filesz must always be + * <= p_memsize so it's only necessary to check p_memsz. + */ + k = load_addr + eppnt->p_vaddr; + if (BAD_ADDR(k) || + eppnt->p_filesz > eppnt->p_memsz || + eppnt->p_memsz > TASK_SIZE || + TASK_SIZE - eppnt->p_memsz < k) { + error = -ENOMEM; + goto out; + } + + /* + * Find the end of the file mapping for this phdr, and + * keep track of the largest address we see for this. + */ + k = load_addr + eppnt->p_vaddr + eppnt->p_filesz; + if (k > elf_bss) + elf_bss = k; + + /* + * Do the same thing for the memory mapping - between + * elf_bss and last_bss is the bss section. + */ + k = load_addr + eppnt->p_memsz + eppnt->p_vaddr; + if (k > last_bss) + last_bss = k; + } + } + + if (last_bss > elf_bss) { + /* + * Now fill out the bss section. First pad the last page up + * to the page boundary, and then perform a mmap to make sure + * that there are zero-mapped pages up to and including the + * last bss page. + */ + if (padzero(elf_bss)) { + error = -EFAULT; + goto out; + } + + /* What we have mapped so far */ + elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); + + /* Map the last of the bss segment */ + error = vm_brk(elf_bss, last_bss - elf_bss); + if (BAD_ADDR(error)) + goto out; + } + + error = load_addr; +out: + return error; +} + +/* + * These are the functions used to load ELF style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +#ifndef STACK_RND_MASK +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ +#endif + +static unsigned long randomize_stack_top(unsigned long stack_top) +{ + unsigned long random_variable = 0; + + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { + random_variable = (unsigned long) get_random_int(); + random_variable &= STACK_RND_MASK; + random_variable <<= PAGE_SHIFT; + } +#ifdef CONFIG_STACK_GROWSUP + return PAGE_ALIGN(stack_top) + random_variable; +#else + return PAGE_ALIGN(stack_top) - random_variable; +#endif +} + +static int load_elf_binary(struct linux_binprm *bprm) +{ + struct file *interpreter = NULL; /* to shut gcc up */ + unsigned long load_addr = 0, load_bias = 0; + int load_addr_set = 0; + char * elf_interpreter = NULL; + unsigned long error; + struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; + unsigned long elf_bss, elf_brk; + int retval, i; + unsigned long elf_entry; + unsigned long interp_load_addr = 0; + unsigned long start_code, end_code, start_data, end_data; + unsigned long reloc_func_desc __maybe_unused = 0; + int executable_stack = EXSTACK_DEFAULT; + struct pt_regs *regs = current_pt_regs(); + struct { + struct elfhdr elf_ex; + struct elfhdr interp_elf_ex; + } *loc; + struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; + + loc = kmalloc(sizeof(*loc), GFP_KERNEL); + if (!loc) { + retval = -ENOMEM; + goto out_ret; + } + + /* Get the exec-header */ + loc->elf_ex = *((struct elfhdr *)bprm->buf); + + retval = -ENOEXEC; + /* First of all, some simple consistency checks */ + if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) + goto out; + if (!elf_check_arch(&loc->elf_ex)) + goto out; + if (!bprm->file->f_op->mmap) + goto out; + + elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file); + if (!elf_phdata) + goto out; + + elf_ppnt = elf_phdata; + elf_bss = 0; + elf_brk = 0; + + start_code = ~0UL; + end_code = 0; + start_data = 0; + end_data = 0; + + for (i = 0; i < loc->elf_ex.e_phnum; i++) { + if (elf_ppnt->p_type == PT_INTERP) { + /* This is the program interpreter used for + * shared libraries - for now assume that this + * is an a.out format binary + */ + retval = -ENOEXEC; + if (elf_ppnt->p_filesz > PATH_MAX || + elf_ppnt->p_filesz < 2) + goto out_free_ph; + + retval = -ENOMEM; + elf_interpreter = kmalloc(elf_ppnt->p_filesz, + GFP_KERNEL); + if (!elf_interpreter) + goto out_free_ph; + + retval = kernel_read(bprm->file, elf_ppnt->p_offset, + elf_interpreter, + elf_ppnt->p_filesz); + if (retval != elf_ppnt->p_filesz) { + if (retval >= 0) + retval = -EIO; + goto out_free_interp; + } + /* make sure path is NULL terminated */ + retval = -ENOEXEC; + if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') + goto out_free_interp; + + interpreter = open_exec(elf_interpreter); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) + goto out_free_interp; + + /* + * If the binary is not readable then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. + */ + would_dump(bprm, interpreter); + + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); + if (retval != BINPRM_BUF_SIZE) { + if (retval >= 0) + retval = -EIO; + goto out_free_dentry; + } + + /* Get the exec headers */ + loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); + break; + } + elf_ppnt++; + } + + elf_ppnt = elf_phdata; + for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) + switch (elf_ppnt->p_type) { + case PT_GNU_STACK: + if (elf_ppnt->p_flags & PF_X) + executable_stack = EXSTACK_ENABLE_X; + else + executable_stack = EXSTACK_DISABLE_X; + break; + + case PT_LOPROC ... PT_HIPROC: + retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt, + bprm->file, false, + &arch_state); + if (retval) + goto out_free_dentry; + break; + } + + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + retval = -ELIBBAD; + /* Not an ELF interpreter */ + if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + goto out_free_dentry; + /* Verify the interpreter has a valid arch */ + if (!elf_check_arch(&loc->interp_elf_ex)) + goto out_free_dentry; + + /* Load the interpreter program headers */ + interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex, + interpreter); + if (!interp_elf_phdata) + goto out_free_dentry; + + /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ + elf_ppnt = interp_elf_phdata; + for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) + switch (elf_ppnt->p_type) { + case PT_LOPROC ... PT_HIPROC: + retval = arch_elf_pt_proc(&loc->interp_elf_ex, + elf_ppnt, interpreter, + true, &arch_state); + if (retval) + goto out_free_dentry; + break; + } + } + + /* + * Allow arch code to reject the ELF at this point, whilst it's + * still possible to return an error to the code that invoked + * the exec syscall. + */ + retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state); + if (retval) + goto out_free_dentry; + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + goto out_free_dentry; + + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. */ + SET_PERSONALITY2(loc->elf_ex, &arch_state); + if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + current->flags |= PF_RANDOMIZE; + + setup_new_exec(bprm); + + /* Do this so that we can load the interpreter, if need be. We will + change some of these later */ + retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), + executable_stack); + if (retval < 0) + goto out_free_dentry; + + current->mm->start_stack = bprm->p; + + /* Now we do a little grungy work by mmapping the ELF image into + the correct location in memory. */ + for(i = 0, elf_ppnt = elf_phdata; + i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + int elf_prot = 0, elf_flags; + unsigned long k, vaddr; + unsigned long total_size = 0; + + if (elf_ppnt->p_type != PT_LOAD) + continue; + + if (unlikely (elf_brk > elf_bss)) { + unsigned long nbyte; + + /* There was a PT_LOAD segment with p_memsz > p_filesz + before this one. Map anonymous pages, if needed, + and clear the area. */ + retval = set_brk(elf_bss + load_bias, + elf_brk + load_bias); + if (retval) + goto out_free_dentry; + nbyte = ELF_PAGEOFFSET(elf_bss); + if (nbyte) { + nbyte = ELF_MIN_ALIGN - nbyte; + if (nbyte > elf_brk - elf_bss) + nbyte = elf_brk - elf_bss; + if (clear_user((void __user *)elf_bss + + load_bias, nbyte)) { + /* + * This bss-zeroing can fail if the ELF + * file specifies odd protections. So + * we don't check the return value + */ + } + } + } + + if (elf_ppnt->p_flags & PF_R) + elf_prot |= PROT_READ; + if (elf_ppnt->p_flags & PF_W) + elf_prot |= PROT_WRITE; + if (elf_ppnt->p_flags & PF_X) + elf_prot |= PROT_EXEC; + + elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + + vaddr = elf_ppnt->p_vaddr; + if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { + elf_flags |= MAP_FIXED; + } else if (loc->elf_ex.e_type == ET_DYN) { + /* Try and get dynamic programs out of the way of the + * default mmap base, as well as whatever program they + * might try to exec. This is because the brk will + * follow the loader, and is not movable. */ + load_bias = ELF_ET_DYN_BASE - vaddr; + if (current->flags & PF_RANDOMIZE) + load_bias += arch_mmap_rnd(); + load_bias = ELF_PAGESTART(load_bias); + total_size = total_mapping_size(elf_phdata, + loc->elf_ex.e_phnum); + if (!total_size) { + retval = -EINVAL; + goto out_free_dentry; + } + } + + error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, + elf_prot, elf_flags, total_size); + if (BAD_ADDR(error)) { + retval = IS_ERR((void *)error) ? + PTR_ERR((void*)error) : -EINVAL; + goto out_free_dentry; + } + + if (!load_addr_set) { + load_addr_set = 1; + load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (loc->elf_ex.e_type == ET_DYN) { + load_bias += error - + ELF_PAGESTART(load_bias + vaddr); + load_addr += load_bias; + reloc_func_desc = load_bias; + } + } + k = elf_ppnt->p_vaddr; + if (k < start_code) + start_code = k; + if (start_data < k) + start_data = k; + + /* + * Check to see if the section's size will overflow the + * allowed task size. Note that p_filesz must always be + * <= p_memsz so it is only necessary to check p_memsz. + */ + if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz || + elf_ppnt->p_memsz > TASK_SIZE || + TASK_SIZE - elf_ppnt->p_memsz < k) { + /* set_brk can never work. Avoid overflows. */ + retval = -EINVAL; + goto out_free_dentry; + } + + k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; + + if (k > elf_bss) + elf_bss = k; + if ((elf_ppnt->p_flags & PF_X) && end_code < k) + end_code = k; + if (end_data < k) + end_data = k; + k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; + if (k > elf_brk) + elf_brk = k; + } + + loc->elf_ex.e_entry += load_bias; + elf_bss += load_bias; + elf_brk += load_bias; + start_code += load_bias; + end_code += load_bias; + start_data += load_bias; + end_data += load_bias; + + /* Calling set_brk effectively mmaps the pages that we need + * for the bss and break sections. We must do this before + * mapping in the interpreter, to make sure it doesn't wind + * up getting placed where the bss needs to go. + */ + retval = set_brk(elf_bss, elf_brk); + if (retval) + goto out_free_dentry; + if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { + retval = -EFAULT; /* Nobody gets to see this, but.. */ + goto out_free_dentry; + } + + if (elf_interpreter) { + unsigned long interp_map_addr = 0; + + elf_entry = load_elf_interp(&loc->interp_elf_ex, + interpreter, + &interp_map_addr, + load_bias, interp_elf_phdata); + if (!IS_ERR((void *)elf_entry)) { + /* + * load_elf_interp() returns relocation + * adjustment + */ + interp_load_addr = elf_entry; + elf_entry += loc->interp_elf_ex.e_entry; + } + if (BAD_ADDR(elf_entry)) { + retval = IS_ERR((void *)elf_entry) ? + (int)elf_entry : -EINVAL; + goto out_free_dentry; + } + reloc_func_desc = interp_load_addr; + + allow_write_access(interpreter); + fput(interpreter); + kfree(elf_interpreter); + } else { + elf_entry = loc->elf_ex.e_entry; + if (BAD_ADDR(elf_entry)) { + retval = -EINVAL; + goto out_free_dentry; + } + } + + kfree(interp_elf_phdata); + kfree(elf_phdata); + + set_binfmt(&elf_format); + +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES + retval = arch_setup_additional_pages(bprm, !!elf_interpreter); + if (retval < 0) + goto out; +#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ + + install_exec_creds(bprm); + retval = create_elf_tables(bprm, &loc->elf_ex, + load_addr, interp_load_addr); + if (retval < 0) + goto out; + /* N.B. passed_fileno might not be initialized? */ + current->mm->end_code = end_code; + current->mm->start_code = start_code; + current->mm->start_data = start_data; + current->mm->end_data = end_data; + current->mm->start_stack = bprm->p; + + if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { + current->mm->brk = current->mm->start_brk = + arch_randomize_brk(current->mm); +#ifdef compat_brk_randomized + current->brk_randomized = 1; +#endif + } + + if (current->personality & MMAP_PAGE_ZERO) { + /* Why this, you ask??? Well SVr4 maps page 0 as read-only, + and some applications "depend" upon this behavior. + Since we do not have the power to recompile these, we + emulate the SVr4 behavior. Sigh. */ + error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE, 0); + } + +#ifdef ELF_PLAT_INIT + /* + * The ABI may specify that certain registers be set up in special + * ways (on i386 %edx is the address of a DT_FINI function, for + * example. In addition, it may also specify (eg, PowerPC64 ELF) + * that the e_entry field is the address of the function descriptor + * for the startup routine, rather than the address of the startup + * routine itself. This macro performs whatever initialization to + * the regs structure is required as well as any relocations to the + * function descriptor entries when executing dynamically links apps. + */ + ELF_PLAT_INIT(regs, reloc_func_desc); +#endif + + start_thread(regs, elf_entry, bprm->p); + retval = 0; +out: + kfree(loc); +out_ret: + return retval; + + /* error cleanup */ +out_free_dentry: + kfree(interp_elf_phdata); + allow_write_access(interpreter); + if (interpreter) + fput(interpreter); +out_free_interp: + kfree(elf_interpreter); +out_free_ph: + kfree(elf_phdata); + goto out; +} + +#ifdef CONFIG_USELIB +/* This is really simpleminded and specialized - we are loading an + a.out library that is given an ELF header. */ +static int load_elf_library(struct file *file) +{ + struct elf_phdr *elf_phdata; + struct elf_phdr *eppnt; + unsigned long elf_bss, bss, len; + int retval, error, i, j; + struct elfhdr elf_ex; + + error = -ENOEXEC; + retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex)); + if (retval != sizeof(elf_ex)) + goto out; + + if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* First of all, some simple consistency checks */ + if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 || + !elf_check_arch(&elf_ex) || !file->f_op->mmap) + goto out; + + /* Now read in all of the header information */ + + j = sizeof(struct elf_phdr) * elf_ex.e_phnum; + /* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */ + + error = -ENOMEM; + elf_phdata = kmalloc(j, GFP_KERNEL); + if (!elf_phdata) + goto out; + + eppnt = elf_phdata; + error = -ENOEXEC; + retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j); + if (retval != j) + goto out_free_ph; + + for (j = 0, i = 0; ip_type == PT_LOAD) + j++; + if (j != 1) + goto out_free_ph; + + while (eppnt->p_type != PT_LOAD) + eppnt++; + + /* Now use mmap to map the library into memory. */ + error = vm_mmap(file, + ELF_PAGESTART(eppnt->p_vaddr), + (eppnt->p_filesz + + ELF_PAGEOFFSET(eppnt->p_vaddr)), + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + (eppnt->p_offset - + ELF_PAGEOFFSET(eppnt->p_vaddr))); + if (error != ELF_PAGESTART(eppnt->p_vaddr)) + goto out_free_ph; + + elf_bss = eppnt->p_vaddr + eppnt->p_filesz; + if (padzero(elf_bss)) { + error = -EFAULT; + goto out_free_ph; + } + + len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + + ELF_MIN_ALIGN - 1); + bss = eppnt->p_memsz + eppnt->p_vaddr; + if (bss > len) + vm_brk(len, bss - len); + error = 0; + +out_free_ph: + kfree(elf_phdata); +out: + return error; +} +#endif /* #ifdef CONFIG_USELIB */ + +#ifdef CONFIG_ELF_CORE +/* + * ELF core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge + */ + +/* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + /* + * Assume that all vmas with a .name op should always be dumped. + * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* + * Decide what to dump of a segment, part, all or none. + */ +static unsigned long vma_dump_size(struct vm_area_struct *vma, + unsigned long mm_flags) +{ +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) + goto whole; + + if (vma->vm_flags & VM_DONTDUMP) + return 0; + + /* Hugetlb memory check */ + if (vma->vm_flags & VM_HUGETLB) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + return 0; + } + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & VM_IO) + return 0; + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (file_inode(vma->vm_file)->i_nlink == 0 ? + FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) + goto whole; + return 0; + } + + /* Dump segments that have been written to. */ + if (vma->anon_vma && FILTER(ANON_PRIVATE)) + goto whole; + if (vma->vm_file == NULL) + return 0; + + if (FILTER(MAPPED_PRIVATE)) + goto whole; + + /* + * If this looks like the beginning of a DSO or executable mapping, + * check for an ELF header. If we find one, dump the first page to + * aid in determining what was mapped here. + */ + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + u32 __user *header = (u32 __user *) vma->vm_start; + u32 word; + mm_segment_t fs = get_fs(); + /* + * Doing it this way gets the constant folded by GCC. + */ + union { + u32 cmp; + char elfmag[SELFMAG]; + } magic; + BUILD_BUG_ON(SELFMAG != sizeof word); + magic.elfmag[EI_MAG0] = ELFMAG0; + magic.elfmag[EI_MAG1] = ELFMAG1; + magic.elfmag[EI_MAG2] = ELFMAG2; + magic.elfmag[EI_MAG3] = ELFMAG3; + /* + * Switch to the user "segment" for get_user(), + * then put back what elf_core_dump() had in place. + */ + set_fs(USER_DS); + if (unlikely(get_user(word, header))) + word = 0; + set_fs(fs); + if (word == magic.cmp) + return PAGE_SIZE; + } + +#undef FILTER + + return 0; + +whole: + return vma->vm_end - vma->vm_start; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +static int writenote(struct memelfnote *men, struct coredump_params *cprm) +{ + struct elf_note en; + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + return dump_emit(cprm, &en, sizeof(en)) && + dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) && + dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4); +} + +static void fill_elf_header(struct elfhdr *elf, int segs, + u16 machine, u32 flags) +{ + memset(elf, 0, sizeof(*elf)); + + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + + elf->e_type = ET_CORE; + elf->e_machine = machine; + elf->e_version = EV_CURRENT; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_flags = flags; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + + return; +} + +static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up separately. + */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + rcu_read_lock(); + prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + prstatus->pr_pid = task_pid_vnr(p); + prstatus->pr_pgrp = task_pgrp_vnr(p); + prstatus->pr_sid = task_session_vnr(p); + if (thread_group_leader(p)) { + struct task_cputime cputime; + + /* + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. + */ + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); + } else { + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + const struct cred *cred; + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ-1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *)mm->arg_start, len)) + return -EFAULT; + for(i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + rcu_read_lock(); + psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + psinfo->pr_pid = task_pid_vnr(p); + psinfo->pr_pgrp = task_pgrp_vnr(p); + psinfo->pr_sid = task_session_vnr(p); + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); + SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); + rcu_read_unlock(); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) +{ + elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv; + int i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); +} + +static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, + const siginfo_t *siginfo) +{ + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo); + set_fs(old_fs); + fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); +} + +#define MAX_FILE_NOTE_SIZE (4*1024*1024) +/* + * Format of NT_FILE note: + * + * long count -- how many files are mapped + * long page_size -- units for file_ofs + * array of [COUNT] elements of + * long start + * long end + * long file_ofs + * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... + */ +static int fill_files_note(struct memelfnote *note) +{ + struct vm_area_struct *vma; + unsigned count, size, names_ofs, remaining, n; + user_long_t *data; + user_long_t *start_end_ofs; + char *name_base, *name_curpos; + + /* *Estimated* file count and total data size needed */ + count = current->mm->map_count; + size = count * 64; + + names_ofs = (2 + 3 * count) * sizeof(data[0]); + alloc: + if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ + return -EINVAL; + size = round_up(size, PAGE_SIZE); + data = vmalloc(size); + if (!data) + return -ENOMEM; + + start_end_ofs = data + 2; + name_base = name_curpos = ((char *)data) + names_ofs; + remaining = size - names_ofs; + count = 0; + for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + struct file *file; + const char *filename; + + file = vma->vm_file; + if (!file) + continue; + filename = d_path(&file->f_path, name_curpos, remaining); + if (IS_ERR(filename)) { + if (PTR_ERR(filename) == -ENAMETOOLONG) { + vfree(data); + size = size * 5 / 4; + goto alloc; + } + continue; + } + + /* d_path() fills at the end, move name down */ + /* n = strlen(filename) + 1: */ + n = (name_curpos + remaining) - filename; + remaining = filename - name_curpos; + memmove(name_curpos, filename, n); + name_curpos += n; + + *start_end_ofs++ = vma->vm_start; + *start_end_ofs++ = vma->vm_end; + *start_end_ofs++ = vma->vm_pgoff; + count++; + } + + /* Now we know exact count of files, can store it */ + data[0] = count; + data[1] = PAGE_SIZE; + /* + * Count usually is less than current->mm->map_count, + * we need to move filenames down. + */ + n = current->mm->map_count - count; + if (n != 0) { + unsigned shift_bytes = n * 3 * sizeof(data[0]); + memmove(name_base - shift_bytes, name_base, + name_curpos - name_base); + name_curpos -= shift_bytes; + } + + size = name_curpos - (char *)data; + fill_note(note, "CORE", NT_FILE, size, data); + return 0; +} + +#ifdef CORE_DUMP_USE_REGSET +#include + +struct elf_thread_core_info { + struct elf_thread_core_info *next; + struct task_struct *task; + struct elf_prstatus prstatus; + struct memelfnote notes[0]; +}; + +struct elf_note_info { + struct elf_thread_core_info *thread; + struct memelfnote psinfo; + struct memelfnote signote; + struct memelfnote auxv; + struct memelfnote files; + user_siginfo_t csigdata; + size_t size; + int thread_notes; +}; + +/* + * When a regset has a writeback hook, we call it on each thread before + * dumping user memory. On register window machines, this makes sure the + * user memory backing the register data is up to date before we read it. + */ +static void do_thread_regset_writeback(struct task_struct *task, + const struct user_regset *regset) +{ + if (regset->writeback) + regset->writeback(task, regset, 1); +} + +#ifndef PR_REG_SIZE +#define PR_REG_SIZE(S) sizeof(S) +#endif + +#ifndef PRSTATUS_SIZE +#define PRSTATUS_SIZE(S) sizeof(S) +#endif + +#ifndef PR_REG_PTR +#define PR_REG_PTR(S) (&((S)->pr_reg)) +#endif + +#ifndef SET_PR_FPVALID +#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#endif + +static int fill_thread_core_info(struct elf_thread_core_info *t, + const struct user_regset_view *view, + long signr, size_t *total) +{ + unsigned int i; + + /* + * NT_PRSTATUS is the one special case, because the regset data + * goes into the pr_reg field inside the note contents, rather + * than being the whole note contents. We fill the reset in here. + * We assume that regset 0 is NT_PRSTATUS. + */ + fill_prstatus(&t->prstatus, t->task, signr); + (void) view->regsets[0].get(t->task, &view->regsets[0], + 0, PR_REG_SIZE(t->prstatus.pr_reg), + PR_REG_PTR(&t->prstatus), NULL); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, + PRSTATUS_SIZE(t->prstatus), &t->prstatus); + *total += notesize(&t->notes[0]); + + do_thread_regset_writeback(t->task, &view->regsets[0]); + + /* + * Each other regset might generate a note too. For each regset + * that has no core_note_type or is inactive, we leave t->notes[i] + * all zero and we'll know to skip writing it later. + */ + for (i = 1; i < view->n; ++i) { + const struct user_regset *regset = &view->regsets[i]; + do_thread_regset_writeback(t->task, regset); + if (regset->core_note_type && regset->get && + (!regset->active || regset->active(t->task, regset))) { + int ret; + size_t size = regset->n * regset->size; + void *data = kmalloc(size, GFP_KERNEL); + if (unlikely(!data)) + return 0; + ret = regset->get(t->task, regset, + 0, size, data, NULL); + if (unlikely(ret)) + kfree(data); + else { + if (regset->core_note_type != NT_PRFPREG) + fill_note(&t->notes[i], "LINUX", + regset->core_note_type, + size, data); + else { + SET_PR_FPVALID(&t->prstatus, 1); + fill_note(&t->notes[i], "CORE", + NT_PRFPREG, size, data); + } + *total += notesize(&t->notes[i]); + } + } + } + + return 1; +} + +static int fill_note_info(struct elfhdr *elf, int phdrs, + struct elf_note_info *info, + const siginfo_t *siginfo, struct pt_regs *regs) +{ + struct task_struct *dump_task = current; + const struct user_regset_view *view = task_user_regset_view(dump_task); + struct elf_thread_core_info *t; + struct elf_prpsinfo *psinfo; + struct core_thread *ct; + unsigned int i; + + info->size = 0; + info->thread = NULL; + + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (psinfo == NULL) { + info->psinfo.data = NULL; /* So we don't free this wrongly */ + return 0; + } + + fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + /* + * Figure out how many notes we're going to need for each thread. + */ + info->thread_notes = 0; + for (i = 0; i < view->n; ++i) + if (view->regsets[i].core_note_type != 0) + ++info->thread_notes; + + /* + * Sanity check. We rely on regset 0 being in NT_PRSTATUS, + * since it is our one special case. + */ + if (unlikely(info->thread_notes == 0) || + unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) { + WARN_ON(1); + return 0; + } + + /* + * Initialize the ELF file header. + */ + fill_elf_header(elf, phdrs, + view->e_machine, view->e_flags); + + /* + * Allocate a structure for each thread. + */ + for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) { + t = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!t)) + return 0; + + t->task = ct->task; + if (ct->task == dump_task || !info->thread) { + t->next = info->thread; + info->thread = t; + } else { + /* + * Make sure to keep the original task at + * the head of the list. + */ + t->next = info->thread->next; + info->thread->next = t; + } + } + + /* + * Now fill in each thread's information. + */ + for (t = info->thread; t != NULL; t = t->next) + if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size)) + return 0; + + /* + * Fill in the two process-wide notes. + */ + fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); + info->size += notesize(&info->psinfo); + + fill_siginfo_note(&info->signote, &info->csigdata, siginfo); + info->size += notesize(&info->signote); + + fill_auxv_note(&info->auxv, current->mm); + info->size += notesize(&info->auxv); + + if (fill_files_note(&info->files) == 0) + info->size += notesize(&info->files); + + return 1; +} + +static size_t get_note_info_size(struct elf_note_info *info) +{ + return info->size; +} + +/* + * Write all the notes for each thread. When writing the first thread, the + * process-wide notes are interleaved after the first thread-specific note. + */ +static int write_note_info(struct elf_note_info *info, + struct coredump_params *cprm) +{ + bool first = true; + struct elf_thread_core_info *t = info->thread; + + do { + int i; + + if (!writenote(&t->notes[0], cprm)) + return 0; + + if (first && !writenote(&info->psinfo, cprm)) + return 0; + if (first && !writenote(&info->signote, cprm)) + return 0; + if (first && !writenote(&info->auxv, cprm)) + return 0; + if (first && info->files.data && + !writenote(&info->files, cprm)) + return 0; + + for (i = 1; i < info->thread_notes; ++i) + if (t->notes[i].data && + !writenote(&t->notes[i], cprm)) + return 0; + + first = false; + t = t->next; + } while (t); + + return 1; +} + +static void free_note_info(struct elf_note_info *info) +{ + struct elf_thread_core_info *threads = info->thread; + while (threads) { + unsigned int i; + struct elf_thread_core_info *t = threads; + threads = t->next; + WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus); + for (i = 1; i < info->thread_notes; ++i) + kfree(t->notes[i].data); + kfree(t); + } + kfree(info->psinfo.data); + vfree(info->files.data); +} + +#else + +/* Here is the structure in which status of each thread is captured. */ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* ELF_CORE_XFPREG_TYPE */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every threads pr_status and then create + * a single section for them in the final core file. + */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + int sz = 0; + struct task_struct *p = t->thread; + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &(t->prstatus)); + t->num_notes++; + sz += notesize(&t->notes[0]); + + if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, + &t->fpu))) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &(t->fpu)); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(t->xfpu), &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +struct elf_note_info { + struct memelfnote *notes; + struct memelfnote *notes_files; + struct elf_prstatus *prstatus; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */ + struct list_head thread_list; + elf_fpregset_t *fpu; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu; +#endif + user_siginfo_t csigdata; + int thread_status_size; + int numnote; +}; + +static int elf_note_info_init(struct elf_note_info *info) +{ + memset(info, 0, sizeof(*info)); + INIT_LIST_HEAD(&info->thread_list); + + /* Allocate space for ELF notes */ + info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL); + if (!info->notes) + return 0; + info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); + if (!info->psinfo) + return 0; + info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); + if (!info->prstatus) + return 0; + info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); + if (!info->fpu) + return 0; +#ifdef ELF_CORE_COPY_XFPREGS + info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); + if (!info->xfpu) + return 0; +#endif + return 1; +} + +static int fill_note_info(struct elfhdr *elf, int phdrs, + struct elf_note_info *info, + const siginfo_t *siginfo, struct pt_regs *regs) +{ + struct list_head *t; + struct core_thread *ct; + struct elf_thread_status *ets; + + if (!elf_note_info_init(info)) + return 0; + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + ets = kzalloc(sizeof(*ets), GFP_KERNEL); + if (!ets) + return 0; + + ets->thread = ct->task; + list_add(&ets->list, &info->thread_list); + } + + list_for_each(t, &info->thread_list) { + int sz; + + ets = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(siginfo->si_signo, ets); + info->thread_status_size += sz; + } + /* now collect the dump for the current */ + memset(info->prstatus, 0, sizeof(*info->prstatus)); + fill_prstatus(info->prstatus, current, siginfo->si_signo); + elf_core_copy_regs(&info->prstatus->pr_reg, regs); + + /* Set up header */ + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + + fill_note(info->notes + 0, "CORE", NT_PRSTATUS, + sizeof(*info->prstatus), info->prstatus); + fill_psinfo(info->psinfo, current->group_leader, current->mm); + fill_note(info->notes + 1, "CORE", NT_PRPSINFO, + sizeof(*info->psinfo), info->psinfo); + + fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo); + fill_auxv_note(info->notes + 3, current->mm); + info->numnote = 4; + + if (fill_files_note(info->notes + info->numnote) == 0) { + info->notes_files = info->notes + info->numnote; + info->numnote++; + } + + /* Try to dump the FPU. */ + info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, + info->fpu); + if (info->prstatus->pr_fpvalid) + fill_note(info->notes + info->numnote++, + "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, info->xfpu)) + fill_note(info->notes + info->numnote++, + "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(*info->xfpu), info->xfpu); +#endif + + return 1; +} + +static size_t get_note_info_size(struct elf_note_info *info) +{ + int sz = 0; + int i; + + for (i = 0; i < info->numnote; i++) + sz += notesize(info->notes + i); + + sz += info->thread_status_size; + + return sz; +} + +static int write_note_info(struct elf_note_info *info, + struct coredump_params *cprm) +{ + int i; + struct list_head *t; + + for (i = 0; i < info->numnote; i++) + if (!writenote(info->notes + i, cprm)) + return 0; + + /* write out the thread status notes section */ + list_for_each(t, &info->thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], cprm)) + return 0; + } + + return 1; +} + +static void free_note_info(struct elf_note_info *info) +{ + while (!list_empty(&info->thread_list)) { + struct list_head *tmp = info->thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + + /* Free data possibly allocated by fill_files_note(): */ + if (info->notes_files) + vfree(info->notes_files->data); + + kfree(info->prstatus); + kfree(info->psinfo); + kfree(info->notes); + kfree(info->fpu); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(info->xfpu); +#endif +} + +#endif + +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. + */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. + */ +static int elf_core_dump(struct coredump_params *cprm) +{ + int has_dumped = 0; + mm_segment_t fs; + int segs, i; + size_t vma_data_size = 0; + struct vm_area_struct *vma, *gate_vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff; + struct elf_note_info info = { }; + struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; + elf_addr_t *vma_filesz = NULL; + + /* + * We no longer stop all VM operations. + * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. + */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto out; + /* + * The number of segs are recored into ELF header as 16bit value. + * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. + */ + segs = current->mm->map_count; + segs += elf_core_extra_phdrs(); + + gate_vma = get_gate_vma(current->mm); + if (gate_vma != NULL) + segs++; + + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + + /* + * Collect all the non-memory information about the process for the + * notes. This also sets up the file header. + */ + if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + goto cleanup; + + has_dumped = 1; + + fs = get_fs(); + set_fs(KERNEL_DS); + + offset += sizeof(*elf); /* Elf header */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ + + /* Write notes phdr entry */ + { + size_t sz = get_note_info_size(&info); + + sz += elf_coredump_extra_notes_size(); + + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) + goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); + offset += sz; + } + + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); + if (!vma_filesz) + goto end_coredump; + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + unsigned long dump_size; + + dump_size = vma_dump_size(vma, cprm->mm_flags); + vma_filesz[i++] = dump_size; + vma_data_size += dump_size; + } + + offset += vma_data_size; + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + + if (!dump_emit(cprm, elf, sizeof(*elf))) + goto end_coredump; + + if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) + goto end_coredump; + + /* Write program headers for segments dump */ + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + struct elf_phdr phdr; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = vma_filesz[i++]; + phdr.p_memsz = vma->vm_end - vma->vm_start; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + if (!dump_emit(cprm, &phdr, sizeof(phdr))) + goto end_coredump; + } + + if (!elf_core_write_extra_phdrs(cprm, offset)) + goto end_coredump; + + /* write out the notes section */ + if (!write_note_info(&info, cprm)) + goto end_coredump; + + if (elf_coredump_extra_notes_write(cprm)) + goto end_coredump; + + /* Align to page */ + if (!dump_skip(cprm, dataoff - cprm->written)) + goto end_coredump; + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + unsigned long addr; + unsigned long end; + + end = vma->vm_start + vma_filesz[i++]; + + for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { + struct page *page; + int stop; + + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + stop = !dump_emit(cprm, kaddr, PAGE_SIZE); + kunmap(page); + page_cache_release(page); + } else + stop = !dump_skip(cprm, PAGE_SIZE); + if (stop) + goto end_coredump; + } + } + + if (!elf_core_write_extra_data(cprm)) + goto end_coredump; + + if (e_phnum == PN_XNUM) { + if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) + goto end_coredump; + } + +end_coredump: + set_fs(fs); + +cleanup: + free_note_info(&info); + kfree(shdr4extnum); + kfree(vma_filesz); + kfree(phdr4note); + kfree(elf); +out: + return has_dumped; +} + +#endif /* CONFIG_ELF_CORE */ + +static int __init init_elf_binfmt(void) +{ + register_binfmt(&elf_format); + return 0; +} + +static void __exit exit_elf_binfmt(void) +{ + /* Remove the COFF and ELF loaders. */ + unregister_binfmt(&elf_format); +} + +core_initcall(init_elf_binfmt); +module_exit(exit_elf_binfmt); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/binfmt_elf_fdpic.c b/kernel/fs/binfmt_elf_fdpic.c new file mode 100644 index 000000000..d3634bfb7 --- /dev/null +++ b/kernel/fs/binfmt_elf_fdpic.c @@ -0,0 +1,1794 @@ +/* binfmt_elf_fdpic.c: FDPIC ELF binary format + * + * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * Derived from binfmt_elf.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +typedef char *elf_caddr_t; + +#if 0 +#define kdebug(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdebug(fmt, ...) do {} while(0) +#endif + +#if 0 +#define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdcore(fmt, ...) do {} while(0) +#endif + +MODULE_LICENSE("GPL"); + +static int load_elf_fdpic_binary(struct linux_binprm *); +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); +static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, + struct mm_struct *, const char *); + +static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, + struct elf_fdpic_params *, + struct elf_fdpic_params *); + +#ifndef CONFIG_MMU +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, + unsigned long *); +static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, + struct file *, + struct mm_struct *); +#endif + +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, + struct file *, struct mm_struct *); + +#ifdef CONFIG_ELF_CORE +static int elf_fdpic_core_dump(struct coredump_params *cprm); +#endif + +static struct linux_binfmt elf_fdpic_format = { + .module = THIS_MODULE, + .load_binary = load_elf_fdpic_binary, +#ifdef CONFIG_ELF_CORE + .core_dump = elf_fdpic_core_dump, +#endif + .min_coredump = ELF_EXEC_PAGESIZE, +}; + +static int __init init_elf_fdpic_binfmt(void) +{ + register_binfmt(&elf_fdpic_format); + return 0; +} + +static void __exit exit_elf_fdpic_binfmt(void) +{ + unregister_binfmt(&elf_fdpic_format); +} + +core_initcall(init_elf_fdpic_binfmt); +module_exit(exit_elf_fdpic_binfmt); + +static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) +{ + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0) + return 0; + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) + return 0; + if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr)) + return 0; + if (!file->f_op->mmap) + return 0; + return 1; +} + +/*****************************************************************************/ +/* + * read the program headers table into memory + */ +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, + struct file *file) +{ + struct elf32_phdr *phdr; + unsigned long size; + int retval, loop; + + if (params->hdr.e_phentsize != sizeof(struct elf_phdr)) + return -ENOMEM; + if (params->hdr.e_phnum > 65536U / sizeof(struct elf_phdr)) + return -ENOMEM; + + size = params->hdr.e_phnum * sizeof(struct elf_phdr); + params->phdrs = kmalloc(size, GFP_KERNEL); + if (!params->phdrs) + return -ENOMEM; + + retval = kernel_read(file, params->hdr.e_phoff, + (char *) params->phdrs, size); + if (unlikely(retval != size)) + return retval < 0 ? retval : -ENOEXEC; + + /* determine stack size for this binary */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_GNU_STACK) + continue; + + if (phdr->p_flags & PF_X) + params->flags |= ELF_FDPIC_FLAG_EXEC_STACK; + else + params->flags |= ELF_FDPIC_FLAG_NOEXEC_STACK; + + params->stack_size = phdr->p_memsz; + break; + } + + return 0; +} + +/*****************************************************************************/ +/* + * load an fdpic binary into various bits of memory + */ +static int load_elf_fdpic_binary(struct linux_binprm *bprm) +{ + struct elf_fdpic_params exec_params, interp_params; + struct pt_regs *regs = current_pt_regs(); + struct elf_phdr *phdr; + unsigned long stack_size, entryaddr; +#ifdef ELF_FDPIC_PLAT_INIT + unsigned long dynaddr; +#endif +#ifndef CONFIG_MMU + unsigned long stack_prot; +#endif + struct file *interpreter = NULL; /* to shut gcc up */ + char *interpreter_name = NULL; + int executable_stack; + int retval, i; + + kdebug("____ LOAD %d ____", current->pid); + + memset(&exec_params, 0, sizeof(exec_params)); + memset(&interp_params, 0, sizeof(interp_params)); + + exec_params.hdr = *(struct elfhdr *) bprm->buf; + exec_params.flags = ELF_FDPIC_FLAG_PRESENT | ELF_FDPIC_FLAG_EXECUTABLE; + + /* check that this is a binary we know how to deal with */ + retval = -ENOEXEC; + if (!is_elf_fdpic(&exec_params.hdr, bprm->file)) + goto error; + + /* read the program header table */ + retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file); + if (retval < 0) + goto error; + + /* scan for a program header that specifies an interpreter */ + phdr = exec_params.phdrs; + + for (i = 0; i < exec_params.hdr.e_phnum; i++, phdr++) { + switch (phdr->p_type) { + case PT_INTERP: + retval = -ENOMEM; + if (phdr->p_filesz > PATH_MAX) + goto error; + retval = -ENOENT; + if (phdr->p_filesz < 2) + goto error; + + /* read the name of the interpreter into memory */ + interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL); + if (!interpreter_name) + goto error; + + retval = kernel_read(bprm->file, + phdr->p_offset, + interpreter_name, + phdr->p_filesz); + if (unlikely(retval != phdr->p_filesz)) { + if (retval >= 0) + retval = -ENOEXEC; + goto error; + } + + retval = -ENOENT; + if (interpreter_name[phdr->p_filesz - 1] != '\0') + goto error; + + kdebug("Using ELF interpreter %s", interpreter_name); + + /* replace the program with the interpreter */ + interpreter = open_exec(interpreter_name); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) { + interpreter = NULL; + goto error; + } + + /* + * If the binary is not readable then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. + */ + would_dump(bprm, interpreter); + + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); + if (unlikely(retval != BINPRM_BUF_SIZE)) { + if (retval >= 0) + retval = -ENOEXEC; + goto error; + } + + interp_params.hdr = *((struct elfhdr *) bprm->buf); + break; + + case PT_LOAD: +#ifdef CONFIG_MMU + if (exec_params.load_addr == 0) + exec_params.load_addr = phdr->p_vaddr; +#endif + break; + } + + } + + if (elf_check_const_displacement(&exec_params.hdr)) + exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; + + /* perform insanity checks on the interpreter */ + if (interpreter_name) { + retval = -ELIBBAD; + if (!is_elf_fdpic(&interp_params.hdr, interpreter)) + goto error; + + interp_params.flags = ELF_FDPIC_FLAG_PRESENT; + + /* read the interpreter's program header table */ + retval = elf_fdpic_fetch_phdrs(&interp_params, interpreter); + if (retval < 0) + goto error; + } + + stack_size = exec_params.stack_size; + if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) + executable_stack = EXSTACK_ENABLE_X; + else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) + executable_stack = EXSTACK_DISABLE_X; + else + executable_stack = EXSTACK_DEFAULT; + + if (stack_size == 0) { + stack_size = interp_params.stack_size; + if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) + executable_stack = EXSTACK_ENABLE_X; + else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) + executable_stack = EXSTACK_DISABLE_X; + else + executable_stack = EXSTACK_DEFAULT; + } + + retval = -ENOEXEC; + if (stack_size == 0) + goto error; + + if (elf_check_const_displacement(&interp_params.hdr)) + interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; + + /* flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + goto error; + + /* there's now no turning back... the old userspace image is dead, + * defunct, deceased, etc. + */ + set_personality(PER_LINUX_FDPIC); + if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + setup_new_exec(bprm); + + set_binfmt(&elf_fdpic_format); + + current->mm->start_code = 0; + current->mm->end_code = 0; + current->mm->start_stack = 0; + current->mm->start_data = 0; + current->mm->end_data = 0; + current->mm->context.exec_fdpic_loadmap = 0; + current->mm->context.interp_fdpic_loadmap = 0; + +#ifdef CONFIG_MMU + elf_fdpic_arch_lay_out_mm(&exec_params, + &interp_params, + ¤t->mm->start_stack, + ¤t->mm->start_brk); + + retval = setup_arg_pages(bprm, current->mm->start_stack, + executable_stack); + if (retval < 0) + goto error; +#endif + + /* load the executable and interpreter into memory */ + retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, + "executable"); + if (retval < 0) + goto error; + + if (interpreter_name) { + retval = elf_fdpic_map_file(&interp_params, interpreter, + current->mm, "interpreter"); + if (retval < 0) { + printk(KERN_ERR "Unable to load interpreter\n"); + goto error; + } + + allow_write_access(interpreter); + fput(interpreter); + interpreter = NULL; + } + +#ifdef CONFIG_MMU + if (!current->mm->start_brk) + current->mm->start_brk = current->mm->end_data; + + current->mm->brk = current->mm->start_brk = + PAGE_ALIGN(current->mm->start_brk); + +#else + /* create a stack and brk area big enough for everyone + * - the brk heap starts at the bottom and works up + * - the stack starts at the top and works down + */ + stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK; + if (stack_size < PAGE_SIZE * 2) + stack_size = PAGE_SIZE * 2; + + stack_prot = PROT_READ | PROT_WRITE; + if (executable_stack == EXSTACK_ENABLE_X || + (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC)) + stack_prot |= PROT_EXEC; + + current->mm->start_brk = vm_mmap(NULL, 0, stack_size, stack_prot, + MAP_PRIVATE | MAP_ANONYMOUS | + MAP_UNINITIALIZED | MAP_GROWSDOWN, + 0); + + if (IS_ERR_VALUE(current->mm->start_brk)) { + retval = current->mm->start_brk; + current->mm->start_brk = 0; + goto error; + } + + current->mm->brk = current->mm->start_brk; + current->mm->context.end_brk = current->mm->start_brk; + current->mm->context.end_brk += + (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; + current->mm->start_stack = current->mm->start_brk + stack_size; +#endif + + install_exec_creds(bprm); + if (create_elf_fdpic_tables(bprm, current->mm, + &exec_params, &interp_params) < 0) + goto error; + + kdebug("- start_code %lx", current->mm->start_code); + kdebug("- end_code %lx", current->mm->end_code); + kdebug("- start_data %lx", current->mm->start_data); + kdebug("- end_data %lx", current->mm->end_data); + kdebug("- start_brk %lx", current->mm->start_brk); + kdebug("- brk %lx", current->mm->brk); + kdebug("- start_stack %lx", current->mm->start_stack); + +#ifdef ELF_FDPIC_PLAT_INIT + /* + * The ABI may specify that certain registers be set up in special + * ways (on i386 %edx is the address of a DT_FINI function, for + * example. This macro performs whatever initialization to + * the regs structure is required. + */ + dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr; + ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr, + dynaddr); +#endif + + /* everything is now ready... get the userspace context ready to roll */ + entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; + start_thread(regs, entryaddr, current->mm->start_stack); + + retval = 0; + +error: + if (interpreter) { + allow_write_access(interpreter); + fput(interpreter); + } + kfree(interpreter_name); + kfree(exec_params.phdrs); + kfree(exec_params.loadmap); + kfree(interp_params.phdrs); + kfree(interp_params.loadmap); + return retval; +} + +/*****************************************************************************/ + +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + +/* + * present useful information to the program by shovelling it onto the new + * process's stack + */ +static int create_elf_fdpic_tables(struct linux_binprm *bprm, + struct mm_struct *mm, + struct elf_fdpic_params *exec_params, + struct elf_fdpic_params *interp_params) +{ + const struct cred *cred = current_cred(); + unsigned long sp, csp, nitems; + elf_caddr_t __user *argv, *envp; + size_t platform_len = 0, len; + char *k_platform, *k_base_platform; + char __user *u_platform, *u_base_platform, *p; + int loop; + int nr; /* reset for each csp adjustment */ + +#ifdef CONFIG_MMU + /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them, so we give the architecture + * an opportunity to do so here. + */ + sp = arch_align_stack(bprm->p); +#else + sp = mm->start_stack; + + /* stack the program arguments and environment */ + if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0) + return -EFAULT; +#endif + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ + k_platform = ELF_PLATFORM; + u_platform = NULL; + + if (k_platform) { + platform_len = strlen(k_platform) + 1; + sp -= platform_len; + u_platform = (char __user *) sp; + if (__copy_to_user(u_platform, k_platform, platform_len) != 0) + return -EFAULT; + } + + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. + */ + k_base_platform = ELF_BASE_PLATFORM; + u_base_platform = NULL; + + if (k_base_platform) { + platform_len = strlen(k_base_platform) + 1; + sp -= platform_len; + u_base_platform = (char __user *) sp; + if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0) + return -EFAULT; + } + + sp &= ~7UL; + + /* stack the load map(s) */ + len = sizeof(struct elf32_fdpic_loadmap); + len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs; + sp = (sp - len) & ~7UL; + exec_params->map_addr = sp; + + if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0) + return -EFAULT; + + current->mm->context.exec_fdpic_loadmap = (unsigned long) sp; + + if (interp_params->loadmap) { + len = sizeof(struct elf32_fdpic_loadmap); + len += sizeof(struct elf32_fdpic_loadseg) * + interp_params->loadmap->nsegs; + sp = (sp - len) & ~7UL; + interp_params->map_addr = sp; + + if (copy_to_user((void __user *) sp, interp_params->loadmap, + len) != 0) + return -EFAULT; + + current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; + } + + /* force 16 byte _final_ alignment here for generality */ +#define DLINFO_ITEMS 15 + + nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + + (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) + nitems++; + + csp = sp; + sp -= nitems * 2 * sizeof(unsigned long); + sp -= (bprm->envc + 1) * sizeof(char *); /* envv[] */ + sp -= (bprm->argc + 1) * sizeof(char *); /* argv[] */ + sp -= 1 * sizeof(unsigned long); /* argc */ + + csp -= sp & 15UL; + sp -= sp & 15UL; + + /* put the ELF interpreter info on the stack */ +#define NEW_AUX_ENT(id, val) \ + do { \ + struct { unsigned long _id, _val; } __user *ent; \ + \ + ent = (void __user *) csp; \ + __put_user((id), &ent[nr]._id); \ + __put_user((val), &ent[nr]._val); \ + nr++; \ + } while (0) + + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_NULL, 0); + if (k_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_PLATFORM, + (elf_addr_t) (unsigned long) u_platform); + } + + if (k_base_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t) (unsigned long) u_base_platform); + } + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } + + nr = 0; + csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); +#ifdef ELF_HWCAP2 + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif + NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); + NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); + NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum); + NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); + NEW_AUX_ENT(AT_FLAGS, 0); + NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); + NEW_AUX_ENT(AT_UID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid)); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid)); + NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid)); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); + +#ifdef ARCH_DLINFO + nr = 0; + csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); + + /* ARCH_DLINFO must come last so platform specific code can enforce + * special alignment requirements on the AUXV if necessary (eg. PPC). + */ + ARCH_DLINFO; +#endif +#undef NEW_AUX_ENT + + /* allocate room for argv[] and envv[] */ + csp -= (bprm->envc + 1) * sizeof(elf_caddr_t); + envp = (elf_caddr_t __user *) csp; + csp -= (bprm->argc + 1) * sizeof(elf_caddr_t); + argv = (elf_caddr_t __user *) csp; + + /* stack argc */ + csp -= sizeof(unsigned long); + __put_user(bprm->argc, (unsigned long __user *) csp); + + BUG_ON(csp != sp); + + /* fill in the argv[] array */ +#ifdef CONFIG_MMU + current->mm->arg_start = bprm->p; +#else + current->mm->arg_start = current->mm->start_stack - + (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); +#endif + + p = (char __user *) current->mm->arg_start; + for (loop = bprm->argc; loop > 0; loop--) { + __put_user((elf_caddr_t) p, argv++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(NULL, argv); + current->mm->arg_end = (unsigned long) p; + + /* fill in the envv[] array */ + current->mm->env_start = (unsigned long) p; + for (loop = bprm->envc; loop > 0; loop--) { + __put_user((elf_caddr_t)(unsigned long) p, envp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(NULL, envp); + current->mm->env_end = (unsigned long) p; + + mm->start_stack = (unsigned long) sp; + return 0; +} + +/*****************************************************************************/ +/* + * transfer the program arguments and environment from the holding pages onto + * the stack + */ +#ifndef CONFIG_MMU +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *_sp) +{ + unsigned long index, stop, sp; + char *src; + int ret = 0; + + stop = bprm->p >> PAGE_SHIFT; + sp = *_sp; + + for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { + src = kmap(bprm->page[index]); + sp -= PAGE_SIZE; + if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0) + ret = -EFAULT; + kunmap(bprm->page[index]); + if (ret < 0) + goto out; + } + + *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; + +out: + return ret; +} +#endif + +/*****************************************************************************/ +/* + * load the appropriate binary image (executable or interpreter) into memory + * - we assume no MMU is available + * - if no other PIC bits are set in params->hdr->e_flags + * - we assume that the LOADable segments in the binary are independently relocatable + * - we assume R/O executable segments are shareable + * - else + * - we assume the loadable parts of the image to require fixed displacement + * - the image is not shareable + */ +static int elf_fdpic_map_file(struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm, + const char *what) +{ + struct elf32_fdpic_loadmap *loadmap; +#ifdef CONFIG_MMU + struct elf32_fdpic_loadseg *mseg; +#endif + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, stop; + unsigned nloads, tmp; + size_t size; + int loop, ret; + + /* allocate a load map table */ + nloads = 0; + for (loop = 0; loop < params->hdr.e_phnum; loop++) + if (params->phdrs[loop].p_type == PT_LOAD) + nloads++; + + if (nloads == 0) + return -ELIBBAD; + + size = sizeof(*loadmap) + nloads * sizeof(*seg); + loadmap = kzalloc(size, GFP_KERNEL); + if (!loadmap) + return -ENOMEM; + + params->loadmap = loadmap; + + loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; + loadmap->nsegs = nloads; + + load_addr = params->load_addr; + seg = loadmap->segs; + + /* map the requested LOADs into the memory space */ + switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { + case ELF_FDPIC_FLAG_CONSTDISP: + case ELF_FDPIC_FLAG_CONTIGUOUS: +#ifndef CONFIG_MMU + ret = elf_fdpic_map_file_constdisp_on_uclinux(params, file, mm); + if (ret < 0) + return ret; + break; +#endif + default: + ret = elf_fdpic_map_file_by_direct_mmap(params, file, mm); + if (ret < 0) + return ret; + break; + } + + /* map the entry point */ + if (params->hdr.e_entry) { + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (params->hdr.e_entry >= seg->p_vaddr && + params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) { + params->entry_addr = + (params->hdr.e_entry - seg->p_vaddr) + + seg->addr; + break; + } + } + } + + /* determine where the program header table has wound up if mapped */ + stop = params->hdr.e_phoff; + stop += params->hdr.e_phnum * sizeof (struct elf_phdr); + phdr = params->phdrs; + + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_LOAD) + continue; + + if (phdr->p_offset > params->hdr.e_phoff || + phdr->p_offset + phdr->p_filesz < stop) + continue; + + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (phdr->p_vaddr >= seg->p_vaddr && + phdr->p_vaddr + phdr->p_filesz <= + seg->p_vaddr + seg->p_memsz) { + params->ph_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr + + params->hdr.e_phoff - phdr->p_offset; + break; + } + } + break; + } + + /* determine where the dynamic section has wound up if there is one */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_DYNAMIC) + continue; + + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (phdr->p_vaddr >= seg->p_vaddr && + phdr->p_vaddr + phdr->p_memsz <= + seg->p_vaddr + seg->p_memsz) { + params->dynamic_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr; + + /* check the dynamic section contains at least + * one item, and that the last item is a NULL + * entry */ + if (phdr->p_memsz == 0 || + phdr->p_memsz % sizeof(Elf32_Dyn) != 0) + goto dynamic_error; + + tmp = phdr->p_memsz / sizeof(Elf32_Dyn); + if (((Elf32_Dyn *) + params->dynamic_addr)[tmp - 1].d_tag != 0) + goto dynamic_error; + break; + } + } + break; + } + + /* now elide adjacent segments in the load map on MMU linux + * - on uClinux the holes between may actually be filled with system + * stuff or stuff from other processes + */ +#ifdef CONFIG_MMU + nloads = loadmap->nsegs; + mseg = loadmap->segs; + seg = mseg + 1; + for (loop = 1; loop < nloads; loop++) { + /* see if we have a candidate for merging */ + if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { + load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); + if (load_addr == (seg->addr & PAGE_MASK)) { + mseg->p_memsz += + load_addr - + (mseg->addr + mseg->p_memsz); + mseg->p_memsz += seg->addr & ~PAGE_MASK; + mseg->p_memsz += seg->p_memsz; + loadmap->nsegs--; + continue; + } + } + + mseg++; + if (mseg != seg) + *mseg = *seg; + } +#endif + + kdebug("Mapped Object [%s]:", what); + kdebug("- elfhdr : %lx", params->elfhdr_addr); + kdebug("- entry : %lx", params->entry_addr); + kdebug("- PHDR[] : %lx", params->ph_addr); + kdebug("- DYNAMIC[]: %lx", params->dynamic_addr); + seg = loadmap->segs; + for (loop = 0; loop < loadmap->nsegs; loop++, seg++) + kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]", + loop, + seg->addr, seg->addr + seg->p_memsz - 1, + seg->p_vaddr, seg->p_memsz); + + return 0; + +dynamic_error: + printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", + what, file_inode(file)->i_ino); + return -ELIBBAD; +} + +/*****************************************************************************/ +/* + * map a file with constant displacement under uClinux + */ +#ifndef CONFIG_MMU +static int elf_fdpic_map_file_constdisp_on_uclinux( + struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) +{ + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; + int loop, ret; + + load_addr = params->load_addr; + seg = params->loadmap->segs; + + /* determine the bounds of the contiguous overall allocation we must + * make */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (params->phdrs[loop].p_type != PT_LOAD) + continue; + + if (base > phdr->p_vaddr) + base = phdr->p_vaddr; + if (top < phdr->p_vaddr + phdr->p_memsz) + top = phdr->p_vaddr + phdr->p_memsz; + } + + /* allocate one big anon block for everything */ + mflags = MAP_PRIVATE; + if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) + mflags |= MAP_EXECUTABLE; + + maddr = vm_mmap(NULL, load_addr, top - base, + PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); + if (IS_ERR_VALUE(maddr)) + return (int) maddr; + + if (load_addr != 0) + load_addr += PAGE_ALIGN(top - base); + + /* and then load the file segments into it */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (params->phdrs[loop].p_type != PT_LOAD) + continue; + + seg->addr = maddr + (phdr->p_vaddr - base); + seg->p_vaddr = phdr->p_vaddr; + seg->p_memsz = phdr->p_memsz; + + ret = read_code(file, seg->addr, phdr->p_offset, + phdr->p_filesz); + if (ret < 0) + return ret; + + /* map the ELF header address if in this segment */ + if (phdr->p_offset == 0) + params->elfhdr_addr = seg->addr; + + /* clear any space allocated but not loaded */ + if (phdr->p_filesz < phdr->p_memsz) { + if (clear_user((void *) (seg->addr + phdr->p_filesz), + phdr->p_memsz - phdr->p_filesz)) + return -EFAULT; + } + + if (mm) { + if (phdr->p_flags & PF_X) { + if (!mm->start_code) { + mm->start_code = seg->addr; + mm->end_code = seg->addr + + phdr->p_memsz; + } + } else if (!mm->start_data) { + mm->start_data = seg->addr; + mm->end_data = seg->addr + phdr->p_memsz; + } + } + + seg++; + } + + return 0; +} +#endif + +/*****************************************************************************/ +/* + * map a binary by direct mmap() of the individual PT_LOAD segments + */ +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) +{ + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, delta_vaddr; + int loop, dvset; + + load_addr = params->load_addr; + delta_vaddr = 0; + dvset = 0; + + seg = params->loadmap->segs; + + /* deal with each load segment separately */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + unsigned long maddr, disp, excess, excess1; + int prot = 0, flags; + + if (phdr->p_type != PT_LOAD) + continue; + + kdebug("[LOAD] va=%lx of=%lx fs=%lx ms=%lx", + (unsigned long) phdr->p_vaddr, + (unsigned long) phdr->p_offset, + (unsigned long) phdr->p_filesz, + (unsigned long) phdr->p_memsz); + + /* determine the mapping parameters */ + if (phdr->p_flags & PF_R) prot |= PROT_READ; + if (phdr->p_flags & PF_W) prot |= PROT_WRITE; + if (phdr->p_flags & PF_X) prot |= PROT_EXEC; + + flags = MAP_PRIVATE | MAP_DENYWRITE; + if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) + flags |= MAP_EXECUTABLE; + + maddr = 0; + + switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { + case ELF_FDPIC_FLAG_INDEPENDENT: + /* PT_LOADs are independently locatable */ + break; + + case ELF_FDPIC_FLAG_HONOURVADDR: + /* the specified virtual address must be honoured */ + maddr = phdr->p_vaddr; + flags |= MAP_FIXED; + break; + + case ELF_FDPIC_FLAG_CONSTDISP: + /* constant displacement + * - can be mapped anywhere, but must be mapped as a + * unit + */ + if (!dvset) { + maddr = load_addr; + delta_vaddr = phdr->p_vaddr; + dvset = 1; + } else { + maddr = load_addr + phdr->p_vaddr - delta_vaddr; + flags |= MAP_FIXED; + } + break; + + case ELF_FDPIC_FLAG_CONTIGUOUS: + /* contiguity handled later */ + break; + + default: + BUG(); + } + + maddr &= PAGE_MASK; + + /* create the mapping */ + disp = phdr->p_vaddr & ~PAGE_MASK; + maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp); + + kdebug("mmap[%d] sz=%lx pr=%x fl=%x of=%lx --> %08lx", + loop, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp, maddr); + + if (IS_ERR_VALUE(maddr)) + return (int) maddr; + + if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == + ELF_FDPIC_FLAG_CONTIGUOUS) + load_addr += PAGE_ALIGN(phdr->p_memsz + disp); + + seg->addr = maddr + disp; + seg->p_vaddr = phdr->p_vaddr; + seg->p_memsz = phdr->p_memsz; + + /* map the ELF header address if in this segment */ + if (phdr->p_offset == 0) + params->elfhdr_addr = seg->addr; + + /* clear the bit between beginning of mapping and beginning of + * PT_LOAD */ + if (prot & PROT_WRITE && disp > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); + if (clear_user((void __user *) maddr, disp)) + return -EFAULT; + maddr += disp; + } + + /* clear any space allocated but not loaded + * - on uClinux we can just clear the lot + * - on MMU linux we'll get a SIGBUS beyond the last page + * extant in the file + */ + excess = phdr->p_memsz - phdr->p_filesz; + excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); + +#ifdef CONFIG_MMU + if (excess > excess1) { + unsigned long xaddr = maddr + phdr->p_filesz + excess1; + unsigned long xmaddr; + + flags |= MAP_FIXED | MAP_ANONYMOUS; + xmaddr = vm_mmap(NULL, xaddr, excess - excess1, + prot, flags, 0); + + kdebug("mmap[%d] " + " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", + loop, xaddr, excess - excess1, prot, flags, + xmaddr); + + if (xmaddr != xaddr) + return -ENOMEM; + } + + if (prot & PROT_WRITE && excess1 > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", + loop, maddr + phdr->p_filesz, excess1); + if (clear_user((void __user *) maddr + phdr->p_filesz, + excess1)) + return -EFAULT; + } + +#else + if (excess > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", + loop, maddr + phdr->p_filesz, excess); + if (clear_user((void *) maddr + phdr->p_filesz, excess)) + return -EFAULT; + } +#endif + + if (mm) { + if (phdr->p_flags & PF_X) { + if (!mm->start_code) { + mm->start_code = maddr; + mm->end_code = maddr + phdr->p_memsz; + } + } else if (!mm->start_data) { + mm->start_data = maddr; + mm->end_data = maddr + phdr->p_memsz; + } + } + + seg++; + } + + return 0; +} + +/*****************************************************************************/ +/* + * ELF-FDPIC core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge + * + * Modelled on fs/binfmt_elf.c core dumper + */ +#ifdef CONFIG_ELF_CORE + +/* + * Decide whether a segment is worth dumping; default is yes to be + * sure (missing info is worse than too much; etc). + * Personally I'd include everything, and use the coredump limit... + * + * I think we should skip something. But I am not sure how. H.J. + */ +static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) +{ + int dump_ok; + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & VM_IO) { + kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* If we may not read the contents, don't allow us to dump + * them either. "dump_write()" can't handle it anyway. + */ + if (!(vma->vm_flags & VM_READ)) { + kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (file_inode(vma->vm_file)->i_nlink == 0) { + dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); + kdcore("%08lx: %08lx: %s (share)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } + + dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); + kdcore("%08lx: %08lx: %s (share)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } + +#ifdef CONFIG_MMU + /* By default, if it hasn't been written to, don't write it out */ + if (!vma->anon_vma) { + dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); + kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } +#endif + + dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); + kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, + dump_ok ? "yes" : "no"); + return dump_ok; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +/* #define DEBUG */ + +static int writenote(struct memelfnote *men, struct coredump_params *cprm) +{ + struct elf_note en; + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + return dump_emit(cprm, &en, sizeof(en)) && + dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) && + dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4); +} + +static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) +{ + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_FDPIC_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + return; +} + +static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static inline void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up separately. + */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + rcu_read_lock(); + prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + prstatus->pr_pid = task_pid_vnr(p); + prstatus->pr_pgrp = task_pgrp_vnr(p); + prstatus->pr_sid = task_session_vnr(p); + if (thread_group_leader(p)) { + struct task_cputime cputime; + + /* + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. + */ + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); + } else { + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + + prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + const struct cred *cred; + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ - 1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *) mm->arg_start, len)) + return -EFAULT; + for (i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + rcu_read_lock(); + psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + psinfo->pr_pid = task_pid_vnr(p); + psinfo->pr_pgrp = task_pgrp_vnr(p); + psinfo->pr_sid = task_session_vnr(p); + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); + SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); + rcu_read_unlock(); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +/* Here is the structure in which status of each thread is captured. */ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* ELF_CORE_XFPREG_TYPE */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every thread's pr_status and then create + * a single section for them in the final core file. + */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + struct task_struct *p = t->thread; + int sz = 0; + + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &t->prstatus); + t->num_notes++; + sz += notesize(&t->notes[0]); + + t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu); + if (t->prstatus.pr_fpvalid) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &t->fpu); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(t->xfpu), &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + +/* + * dump the segments for an MMU process + */ +static bool elf_fdpic_dump_segments(struct coredump_params *cprm) +{ + struct vm_area_struct *vma; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + if (!maydump(vma, cprm->mm_flags)) + continue; + +#ifdef CONFIG_MMU + for (addr = vma->vm_start; addr < vma->vm_end; + addr += PAGE_SIZE) { + bool res; + struct page *page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + res = dump_emit(cprm, kaddr, PAGE_SIZE); + kunmap(page); + page_cache_release(page); + } else { + res = dump_skip(cprm, PAGE_SIZE); + } + if (!res) + return false; + } +#else + if (!dump_emit(cprm, (void *) vma->vm_start, + vma->vm_end - vma->vm_start)) + return false; +#endif + } + return true; +} + +static size_t elf_core_vma_data_size(unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if (maydump(vma, mm_flags)) + size += vma->vm_end - vma->vm_start; + return size; +} + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. + */ +static int elf_fdpic_core_dump(struct coredump_params *cprm) +{ +#define NUM_NOTES 6 + int has_dumped = 0; + mm_segment_t fs; + int segs; + int i; + struct vm_area_struct *vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff; + int numnote; + struct memelfnote *notes = NULL; + struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ + LIST_HEAD(thread_list); + struct list_head *t; + elf_fpregset_t *fpu = NULL; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu = NULL; +#endif + int thread_status_size = 0; + elf_addr_t *auxv; + struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; + struct core_thread *ct; + struct elf_thread_status *tmp; + + /* + * We no longer stop all VM operations. + * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. + */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto cleanup; + prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); + if (!prstatus) + goto cleanup; + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (!psinfo) + goto cleanup; + notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); + if (!notes) + goto cleanup; + fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); + if (!fpu) + goto cleanup; +#ifdef ELF_CORE_COPY_XFPREGS + xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); + if (!xfpu) + goto cleanup; +#endif + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + goto cleanup; + + tmp->thread = ct->task; + list_add(&tmp->list, &thread_list); + } + + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; + int sz; + + tmp = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp); + thread_status_size += sz; + } + + /* now collect the dump for the current */ + fill_prstatus(prstatus, current, cprm->siginfo->si_signo); + elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); + + segs = current->mm->map_count; + segs += elf_core_extra_phdrs(); + + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + + /* Set up header */ + fill_elf_fdpic_header(elf, e_phnum); + + has_dumped = 1; + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + + fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); + fill_psinfo(psinfo, current->group_leader, current->mm); + fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + numnote = 2; + + auxv = (elf_addr_t *) current->mm->saved_auxv; + + i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(¬es[numnote++], "CORE", NT_AUXV, + i * sizeof(elf_addr_t), auxv); + + /* Try to dump the FPU. */ + if ((prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, cprm->regs, fpu))) + fill_note(notes + numnote++, + "CORE", NT_PRFPREG, sizeof(*fpu), fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, xfpu)) + fill_note(notes + numnote++, + "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu); +#endif + + fs = get_fs(); + set_fs(KERNEL_DS); + + offset += sizeof(*elf); /* Elf header */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ + + /* Write notes phdr entry */ + { + int sz = 0; + + for (i = 0; i < numnote; i++) + sz += notesize(notes + i); + + sz += thread_status_size; + + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) + goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); + offset += sz; + } + + /* Page-align dumped data */ + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + offset += elf_core_vma_data_size(cprm->mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + + if (!dump_emit(cprm, elf, sizeof(*elf))) + goto end_coredump; + + if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) + goto end_coredump; + + /* write program headers for segments dump */ + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + struct elf_phdr phdr; + size_t sz; + + sz = vma->vm_end - vma->vm_start; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; + phdr.p_memsz = sz; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + if (!dump_emit(cprm, &phdr, sizeof(phdr))) + goto end_coredump; + } + + if (!elf_core_write_extra_phdrs(cprm, offset)) + goto end_coredump; + + /* write out the notes section */ + for (i = 0; i < numnote; i++) + if (!writenote(notes + i, cprm)) + goto end_coredump; + + /* write out the thread status notes section */ + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], cprm)) + goto end_coredump; + } + + if (!dump_skip(cprm, dataoff - cprm->written)) + goto end_coredump; + + if (!elf_fdpic_dump_segments(cprm)) + goto end_coredump; + + if (!elf_core_write_extra_data(cprm)) + goto end_coredump; + + if (e_phnum == PN_XNUM) { + if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) + goto end_coredump; + } + + if (cprm->file->f_pos != offset) { + /* Sanity check */ + printk(KERN_WARNING + "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", + cprm->file->f_pos, offset); + } + +end_coredump: + set_fs(fs); + +cleanup: + while (!list_empty(&thread_list)) { + struct list_head *tmp = thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + kfree(phdr4note); + kfree(elf); + kfree(prstatus); + kfree(psinfo); + kfree(notes); + kfree(fpu); + kfree(shdr4extnum); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(xfpu); +#endif + return has_dumped; +#undef NUM_NOTES +} + +#endif /* CONFIG_ELF_CORE */ diff --git a/kernel/fs/binfmt_em86.c b/kernel/fs/binfmt_em86.c new file mode 100644 index 000000000..490538536 --- /dev/null +++ b/kernel/fs/binfmt_em86.c @@ -0,0 +1,117 @@ +/* + * linux/fs/binfmt_em86.c + * + * Based on linux/fs/binfmt_script.c + * Copyright (C) 1996 Martin von Löwis + * original #!-checking implemented by tytso. + * + * em86 changes Copyright (C) 1997 Jim Paradis + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define EM86_INTERP "/usr/bin/em86" +#define EM86_I_NAME "em86" + +static int load_em86(struct linux_binprm *bprm) +{ + char *interp, *i_name, *i_arg; + struct file * file; + int retval; + struct elfhdr elf_ex; + + /* Make sure this is a Linux/Intel ELF executable... */ + elf_ex = *((struct elfhdr *)bprm->buf); + + if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + return -ENOEXEC; + + /* First of all, some simple consistency checks */ + if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) || + (!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) || + !bprm->file->f_op->mmap) { + return -ENOEXEC; + } + + /* Need to be able to load the file after exec */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + + /* Unlike in the script case, we don't have to do any hairy + * parsing to find our interpreter... it's hardcoded! + */ + interp = EM86_INTERP; + i_name = EM86_I_NAME; + i_arg = NULL; /* We reserve the right to add an arg later */ + + /* + * Splice in (1) the interpreter's name for argv[0] + * (2) (optional) argument to interpreter + * (3) filename of emulated file (replace argv[0]) + * + * This is done in reverse order, because of how the + * user environment and arguments are stored. + */ + remove_arg_zero(bprm); + retval = copy_strings_kernel(1, &bprm->filename, bprm); + if (retval < 0) return retval; + bprm->argc++; + if (i_arg) { + retval = copy_strings_kernel(1, &i_arg, bprm); + if (retval < 0) return retval; + bprm->argc++; + } + retval = copy_strings_kernel(1, &i_name, bprm); + if (retval < 0) return retval; + bprm->argc++; + + /* + * OK, now restart the process with the interpreter's inode. + * Note that we use open_exec() as the name is now in kernel + * space, and we don't need to copy it. + */ + file = open_exec(interp); + if (IS_ERR(file)) + return PTR_ERR(file); + + bprm->file = file; + + retval = prepare_binprm(bprm); + if (retval < 0) + return retval; + + return search_binary_handler(bprm); +} + +static struct linux_binfmt em86_format = { + .module = THIS_MODULE, + .load_binary = load_em86, +}; + +static int __init init_em86_binfmt(void) +{ + register_binfmt(&em86_format); + return 0; +} + +static void __exit exit_em86_binfmt(void) +{ + unregister_binfmt(&em86_format); +} + +core_initcall(init_em86_binfmt); +module_exit(exit_em86_binfmt); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/binfmt_flat.c b/kernel/fs/binfmt_flat.c new file mode 100644 index 000000000..f723cd3a4 --- /dev/null +++ b/kernel/fs/binfmt_flat.c @@ -0,0 +1,953 @@ +/****************************************************************************/ +/* + * linux/fs/binfmt_flat.c + * + * Copyright (C) 2000-2003 David McCullough + * Copyright (C) 2002 Greg Ungerer + * Copyright (C) 2002 SnapGear, by Paul Dale + * Copyright (C) 2000, 2001 Lineo, by David McCullough + * based heavily on: + * + * linux/fs/binfmt_aout.c: + * Copyright (C) 1991, 1992, 1996 Linus Torvalds + * linux/fs/binfmt_flat.c for 2.0 kernel + * Copyright (C) 1998 Kenneth Albanowski + * JAN/99 -- coded full program relocation (gerg@snapgear.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/****************************************************************************/ + +#if 0 +#define DEBUG 1 +#endif + +#ifdef DEBUG +#define DBG_FLT(a...) printk(a) +#else +#define DBG_FLT(a...) +#endif + +/* + * User data (data section and bss) needs to be aligned. + * We pick 0x20 here because it is the max value elf2flt has always + * used in producing FLAT files, and because it seems to be large + * enough to make all the gcc alignment related tests happy. + */ +#define FLAT_DATA_ALIGN (0x20) + +/* + * User data (stack) also needs to be aligned. + * Here we can be a bit looser than the data sections since this + * needs to only meet arch ABI requirements. + */ +#define FLAT_STACK_ALIGN max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN) + +#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ +#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ + +struct lib_info { + struct { + unsigned long start_code; /* Start of text segment */ + unsigned long start_data; /* Start of data segment */ + unsigned long start_brk; /* End of data segment */ + unsigned long text_len; /* Length of text segment */ + unsigned long entry; /* Start address for this module */ + unsigned long build_date; /* When this one was compiled */ + short loaded; /* Has this library been loaded? */ + } lib_list[MAX_SHARED_LIBS]; +}; + +#ifdef CONFIG_BINFMT_SHARED_FLAT +static int load_flat_shared_library(int id, struct lib_info *p); +#endif + +static int load_flat_binary(struct linux_binprm *); +static int flat_core_dump(struct coredump_params *cprm); + +static struct linux_binfmt flat_format = { + .module = THIS_MODULE, + .load_binary = load_flat_binary, + .core_dump = flat_core_dump, + .min_coredump = PAGE_SIZE +}; + +/****************************************************************************/ +/* + * Routine writes a core dump image in the current directory. + * Currently only a stub-function. + */ + +static int flat_core_dump(struct coredump_params *cprm) +{ + printk("Process %s:%d received signr %d and should have core dumped\n", + current->comm, current->pid, (int) cprm->siginfo->si_signo); + return(1); +} + +/****************************************************************************/ +/* + * create_flat_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ + +static unsigned long create_flat_tables( + unsigned long pp, + struct linux_binprm * bprm) +{ + unsigned long *argv,*envp; + unsigned long * sp; + char * p = (char*)pp; + int argc = bprm->argc; + int envc = bprm->envc; + char uninitialized_var(dummy); + + sp = (unsigned long *)p; + sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); + sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN); + argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); + envp = argv + (argc + 1); + + if (flat_argvp_envp_on_stack()) { + put_user((unsigned long) envp, sp + 2); + put_user((unsigned long) argv, sp + 1); + } + + put_user(argc, sp); + current->mm->arg_start = (unsigned long) p; + while (argc-->0) { + put_user((unsigned long) p, argv++); + do { + get_user(dummy, p); p++; + } while (dummy); + } + put_user((unsigned long) NULL, argv); + current->mm->arg_end = current->mm->env_start = (unsigned long) p; + while (envc-->0) { + put_user((unsigned long)p, envp); envp++; + do { + get_user(dummy, p); p++; + } while (dummy); + } + put_user((unsigned long) NULL, envp); + current->mm->env_end = (unsigned long) p; + return (unsigned long)sp; +} + +/****************************************************************************/ + +#ifdef CONFIG_BINFMT_ZFLAT + +#include + +#define LBUFSIZE 4000 + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ +#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ +#define RESERVED 0xC0 /* bit 6,7: reserved */ + +static int decompress_exec( + struct linux_binprm *bprm, + unsigned long offset, + char *dst, + long len, + int fd) +{ + unsigned char *buf; + z_stream strm; + loff_t fpos; + int ret, retval; + + DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len); + + memset(&strm, 0, sizeof(strm)); + strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); + if (strm.workspace == NULL) { + DBG_FLT("binfmt_flat: no memory for decompress workspace\n"); + return -ENOMEM; + } + buf = kmalloc(LBUFSIZE, GFP_KERNEL); + if (buf == NULL) { + DBG_FLT("binfmt_flat: no memory for read buffer\n"); + retval = -ENOMEM; + goto out_free; + } + + /* Read in first chunk of data and parse gzip header. */ + fpos = offset; + ret = kernel_read(bprm->file, offset, buf, LBUFSIZE); + + strm.next_in = buf; + strm.avail_in = ret; + strm.total_in = 0; + fpos += ret; + + retval = -ENOEXEC; + + /* Check minimum size -- gzip header */ + if (ret < 10) { + DBG_FLT("binfmt_flat: file too small?\n"); + goto out_free_buf; + } + + /* Check gzip magic number */ + if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) { + DBG_FLT("binfmt_flat: unknown compression magic?\n"); + goto out_free_buf; + } + + /* Check gzip method */ + if (buf[2] != 8) { + DBG_FLT("binfmt_flat: unknown compression method?\n"); + goto out_free_buf; + } + /* Check gzip flags */ + if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) || + (buf[3] & RESERVED)) { + DBG_FLT("binfmt_flat: unknown flags?\n"); + goto out_free_buf; + } + + ret = 10; + if (buf[3] & EXTRA_FIELD) { + ret += 2 + buf[10] + (buf[11] << 8); + if (unlikely(LBUFSIZE <= ret)) { + DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); + goto out_free_buf; + } + } + if (buf[3] & ORIG_NAME) { + while (ret < LBUFSIZE && buf[ret++] != 0) + ; + if (unlikely(LBUFSIZE == ret)) { + DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); + goto out_free_buf; + } + } + if (buf[3] & COMMENT) { + while (ret < LBUFSIZE && buf[ret++] != 0) + ; + if (unlikely(LBUFSIZE == ret)) { + DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); + goto out_free_buf; + } + } + + strm.next_in += ret; + strm.avail_in -= ret; + + strm.next_out = dst; + strm.avail_out = len; + strm.total_out = 0; + + if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) { + DBG_FLT("binfmt_flat: zlib init failed?\n"); + goto out_free_buf; + } + + while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { + ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE); + if (ret <= 0) + break; + len -= ret; + + strm.next_in = buf; + strm.avail_in = ret; + strm.total_in = 0; + fpos += ret; + } + + if (ret < 0) { + DBG_FLT("binfmt_flat: decompression failed (%d), %s\n", + ret, strm.msg); + goto out_zlib; + } + + retval = 0; +out_zlib: + zlib_inflateEnd(&strm); +out_free_buf: + kfree(buf); +out_free: + kfree(strm.workspace); + return retval; +} + +#endif /* CONFIG_BINFMT_ZFLAT */ + +/****************************************************************************/ + +static unsigned long +calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) +{ + unsigned long addr; + int id; + unsigned long start_brk; + unsigned long start_data; + unsigned long text_len; + unsigned long start_code; + +#ifdef CONFIG_BINFMT_SHARED_FLAT + if (r == 0) + id = curid; /* Relocs of 0 are always self referring */ + else { + id = (r >> 24) & 0xff; /* Find ID for this reloc */ + r &= 0x00ffffff; /* Trim ID off here */ + } + if (id >= MAX_SHARED_LIBS) { + printk("BINFMT_FLAT: reference 0x%x to shared library %d", + (unsigned) r, id); + goto failed; + } + if (curid != id) { + if (internalp) { + printk("BINFMT_FLAT: reloc address 0x%x not in same module " + "(%d != %d)", (unsigned) r, curid, id); + goto failed; + } else if ( ! p->lib_list[id].loaded && + IS_ERR_VALUE(load_flat_shared_library(id, p))) { + printk("BINFMT_FLAT: failed to load library %d", id); + goto failed; + } + /* Check versioning information (i.e. time stamps) */ + if (p->lib_list[id].build_date && p->lib_list[curid].build_date && + p->lib_list[curid].build_date < p->lib_list[id].build_date) { + printk("BINFMT_FLAT: library %d is younger than %d", id, curid); + goto failed; + } + } +#else + id = 0; +#endif + + start_brk = p->lib_list[id].start_brk; + start_data = p->lib_list[id].start_data; + start_code = p->lib_list[id].start_code; + text_len = p->lib_list[id].text_len; + + if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { + printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", + (int) r,(int)(start_brk-start_data+text_len),(int)text_len); + goto failed; + } + + if (r < text_len) /* In text segment */ + addr = r + start_code; + else /* In data segment */ + addr = r - text_len + start_data; + + /* Range checked already above so doing the range tests is redundant...*/ + return(addr); + +failed: + printk(", killing %s!\n", current->comm); + send_sig(SIGSEGV, current, 0); + + return RELOC_FAILED; +} + +/****************************************************************************/ + +static void old_reloc(unsigned long rl) +{ +#ifdef DEBUG + char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; +#endif + flat_v2_reloc_t r; + unsigned long *ptr; + + r.value = rl; +#if defined(CONFIG_COLDFIRE) + ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset); +#else + ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset); +#endif + +#ifdef DEBUG + printk("Relocation of variable at DATASEG+%x " + "(address %p, currently %x) into segment %s\n", + r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]); +#endif + + switch (r.reloc.type) { + case OLD_FLAT_RELOC_TYPE_TEXT: + *ptr += current->mm->start_code; + break; + case OLD_FLAT_RELOC_TYPE_DATA: + *ptr += current->mm->start_data; + break; + case OLD_FLAT_RELOC_TYPE_BSS: + *ptr += current->mm->end_data; + break; + default: + printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type); + break; + } + +#ifdef DEBUG + printk("Relocation became %x\n", (int)*ptr); +#endif +} + +/****************************************************************************/ + +static int load_flat_file(struct linux_binprm * bprm, + struct lib_info *libinfo, int id, unsigned long *extra_stack) +{ + struct flat_hdr * hdr; + unsigned long textpos = 0, datapos = 0, result; + unsigned long realdatastart = 0; + unsigned long text_len, data_len, bss_len, stack_len, flags; + unsigned long full_data; + unsigned long len, memp = 0; + unsigned long memp_size, extra, rlim; + unsigned long *reloc = 0, *rp; + struct inode *inode; + int i, rev, relocs = 0; + loff_t fpos; + unsigned long start_code, end_code; + int ret; + + hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ + inode = file_inode(bprm->file); + + text_len = ntohl(hdr->data_start); + data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start); + bss_len = ntohl(hdr->bss_end) - ntohl(hdr->data_end); + stack_len = ntohl(hdr->stack_size); + if (extra_stack) { + stack_len += *extra_stack; + *extra_stack = stack_len; + } + relocs = ntohl(hdr->reloc_count); + flags = ntohl(hdr->flags); + rev = ntohl(hdr->rev); + full_data = data_len + relocs * sizeof(unsigned long); + + if (strncmp(hdr->magic, "bFLT", 4)) { + /* + * Previously, here was a printk to tell people + * "BINFMT_FLAT: bad header magic". + * But for the kernel which also use ELF FD-PIC format, this + * error message is confusing. + * because a lot of people do not manage to produce good + */ + ret = -ENOEXEC; + goto err; + } + + if (flags & FLAT_FLAG_KTRACE) + printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); + + if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) { + printk("BINFMT_FLAT: bad flat file version 0x%x (supported " + "0x%lx and 0x%lx)\n", + rev, FLAT_VERSION, OLD_FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* Don't allow old format executables to use shared libraries */ + if (rev == OLD_FLAT_VERSION && id != 0) { + printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n", + (int) FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* + * fix up the flags for the older format, there were all kinds + * of endian hacks, this only works for the simple cases + */ + if (rev == OLD_FLAT_VERSION && flat_old_ram_flag(flags)) + flags = FLAT_FLAG_RAM; + +#ifndef CONFIG_BINFMT_ZFLAT + if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) { + printk("Support for ZFLAT executables is not enabled.\n"); + ret = -ENOEXEC; + goto err; + } +#endif + + /* + * Check initial limits. This avoids letting people circumvent + * size limits imposed on them by creating programs with large + * arrays in the data or bss. + */ + rlim = rlimit(RLIMIT_DATA); + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (data_len + bss_len > rlim) { + ret = -ENOMEM; + goto err; + } + + /* Flush all traces of the currently running executable */ + if (id == 0) { + result = flush_old_exec(bprm); + if (result) { + ret = result; + goto err; + } + + /* OK, This is the point of no return */ + set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); + } + + /* + * calculate the extra space we need to map in + */ + extra = max_t(unsigned long, bss_len + stack_len, + relocs * sizeof(unsigned long)); + + /* + * there are a couple of cases here, the separate code/data + * case, and then the fully copied to RAM case which lumps + * it all together. + */ + if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) { + /* + * this should give us a ROM ptr, but if it doesn't we don't + * really care + */ + DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); + + textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, + MAP_PRIVATE|MAP_EXECUTABLE, 0); + if (!textpos || IS_ERR_VALUE(textpos)) { + if (!textpos) + textpos = (unsigned long) -ENOMEM; + printk("Unable to mmap process text, errno %d\n", (int)-textpos); + ret = textpos; + goto err; + } + + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); + realdatastart = vm_mmap(0, 0, len, + PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); + + if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { + if (!realdatastart) + realdatastart = (unsigned long) -ENOMEM; + printk("Unable to allocate RAM for process data, errno %d\n", + (int)-realdatastart); + vm_munmap(textpos, text_len); + ret = realdatastart; + goto err; + } + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); + + DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", + (int)(data_len + bss_len + stack_len), (int)datapos); + + fpos = ntohl(hdr->data_start); +#ifdef CONFIG_BINFMT_ZFLAT + if (flags & FLAT_FLAG_GZDATA) { + result = decompress_exec(bprm, fpos, (char *) datapos, + full_data, 0); + } else +#endif + { + result = read_code(bprm->file, datapos, fpos, + full_data); + } + if (IS_ERR_VALUE(result)) { + printk("Unable to read data+bss, errno %d\n", (int)-result); + vm_munmap(textpos, text_len); + vm_munmap(realdatastart, len); + ret = result; + goto err; + } + + reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); + memp = realdatastart; + memp_size = len; + } else { + + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); + textpos = vm_mmap(0, 0, len, + PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); + + if (!textpos || IS_ERR_VALUE(textpos)) { + if (!textpos) + textpos = (unsigned long) -ENOMEM; + printk("Unable to allocate RAM for process text/data, errno %d\n", + (int)-textpos); + ret = textpos; + goto err; + } + + realdatastart = textpos + ntohl(hdr->data_start); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); + + reloc = (unsigned long *) + (datapos + (ntohl(hdr->reloc_start) - text_len)); + memp = textpos; + memp_size = len; +#ifdef CONFIG_BINFMT_ZFLAT + /* + * load it all in and treat it like a RAM load from now on + */ + if (flags & FLAT_FLAG_GZIP) { + result = decompress_exec(bprm, sizeof (struct flat_hdr), + (((char *) textpos) + sizeof (struct flat_hdr)), + (text_len + full_data + - sizeof (struct flat_hdr)), + 0); + memmove((void *) datapos, (void *) realdatastart, + full_data); + } else if (flags & FLAT_FLAG_GZDATA) { + result = read_code(bprm->file, textpos, 0, text_len); + if (!IS_ERR_VALUE(result)) + result = decompress_exec(bprm, text_len, (char *) datapos, + full_data, 0); + } + else +#endif + { + result = read_code(bprm->file, textpos, 0, text_len); + if (!IS_ERR_VALUE(result)) + result = read_code(bprm->file, datapos, + ntohl(hdr->data_start), + full_data); + } + if (IS_ERR_VALUE(result)) { + printk("Unable to read code+data+bss, errno %d\n",(int)-result); + vm_munmap(textpos, text_len + data_len + extra + + MAX_SHARED_LIBS * sizeof(unsigned long)); + ret = result; + goto err; + } + } + + if (flags & FLAT_FLAG_KTRACE) + printk("Mapping is %x, Entry point is %x, data_start is %x\n", + (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + + /* The main program needs a little extra setup in the task structure */ + start_code = textpos + sizeof (struct flat_hdr); + end_code = textpos + text_len; + if (id == 0) { + current->mm->start_code = start_code; + current->mm->end_code = end_code; + current->mm->start_data = datapos; + current->mm->end_data = datapos + data_len; + /* + * set up the brk stuff, uses any slack left in data/bss/stack + * allocation. We put the brk after the bss (between the bss + * and stack) like other platforms. + * Userspace code relies on the stack pointer starting out at + * an address right at the end of a page. + */ + current->mm->start_brk = datapos + data_len + bss_len; + current->mm->brk = (current->mm->start_brk + 3) & ~3; + current->mm->context.end_brk = memp + memp_size - stack_len; + } + + if (flags & FLAT_FLAG_KTRACE) + printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n", + id ? "Lib" : "Load", bprm->filename, + (int) start_code, (int) end_code, + (int) datapos, + (int) (datapos + data_len), + (int) (datapos + data_len), + (int) (((datapos + data_len + bss_len) + 3) & ~3)); + + text_len -= sizeof(struct flat_hdr); /* the real code len */ + + /* Store the current module values into the global library structure */ + libinfo->lib_list[id].start_code = start_code; + libinfo->lib_list[id].start_data = datapos; + libinfo->lib_list[id].start_brk = datapos + data_len + bss_len; + libinfo->lib_list[id].text_len = text_len; + libinfo->lib_list[id].loaded = 1; + libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; + libinfo->lib_list[id].build_date = ntohl(hdr->build_date); + + /* + * We just load the allocations into some temporary memory to + * help simplify all this mumbo jumbo + * + * We've got two different sections of relocation entries. + * The first is the GOT which resides at the beginning of the data segment + * and is terminated with a -1. This one can be relocated in place. + * The second is the extra relocation entries tacked after the image's + * data segment. These require a little more processing as the entry is + * really an offset into the image which contains an offset into the + * image. + */ + if (flags & FLAT_FLAG_GOTPIC) { + for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) { + unsigned long addr; + if (*rp) { + addr = calc_reloc(*rp, libinfo, id, 0); + if (addr == RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + *rp = addr; + } + } + } + + /* + * Now run through the relocation entries. + * We've got to be careful here as C++ produces relocatable zero + * entries in the constructor and destructor tables which are then + * tested for being not zero (which will always occur unless we're + * based from address zero). This causes an endless loop as __start + * is at zero. The solution used is to not relocate zero addresses. + * This has the negative side effect of not allowing a global data + * reference to be statically initialised to _stext (I've moved + * __start to address 4 so that is okay). + */ + if (rev > OLD_FLAT_VERSION) { + unsigned long persistent = 0; + for (i=0; i < relocs; i++) { + unsigned long addr, relval; + + /* Get the address of the pointer to be + relocated (of course, the address has to be + relocated first). */ + relval = ntohl(reloc[i]); + if (flat_set_persistent (relval, &persistent)) + continue; + addr = flat_get_relocate_addr(relval); + rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); + if (rp == (unsigned long *)RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + + /* Get the pointer's value. */ + addr = flat_get_addr_from_rp(rp, relval, flags, + &persistent); + if (addr != 0) { + /* + * Do the relocation. PIC relocs in the data section are + * already in target order + */ + if ((flags & FLAT_FLAG_GOTPIC) == 0) + addr = ntohl(addr); + addr = calc_reloc(addr, libinfo, id, 0); + if (addr == RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + + /* Write back the relocated pointer. */ + flat_put_addr_at_rp(rp, addr, relval); + } + } + } else { + for (i=0; i < relocs; i++) + old_reloc(ntohl(reloc[i])); + } + + flush_icache_range(start_code, end_code); + + /* zero the BSS, BRK and stack areas */ + memset((void*)(datapos + data_len), 0, bss_len + + (memp + memp_size - stack_len - /* end brk */ + libinfo->lib_list[id].start_brk) + /* start brk */ + stack_len); + + return 0; +err: + return ret; +} + + +/****************************************************************************/ +#ifdef CONFIG_BINFMT_SHARED_FLAT + +/* + * Load a shared library into memory. The library gets its own data + * segment (including bss) but not argv/argc/environ. + */ + +static int load_flat_shared_library(int id, struct lib_info *libs) +{ + struct linux_binprm bprm; + int res; + char buf[16]; + + memset(&bprm, 0, sizeof(bprm)); + + /* Create the file name */ + sprintf(buf, "/lib/lib%d.so", id); + + /* Open the file up */ + bprm.filename = buf; + bprm.file = open_exec(bprm.filename); + res = PTR_ERR(bprm.file); + if (IS_ERR(bprm.file)) + return res; + + bprm.cred = prepare_exec_creds(); + res = -ENOMEM; + if (!bprm.cred) + goto out; + + /* We don't really care about recalculating credentials at this point + * as we're past the point of no return and are dealing with shared + * libraries. + */ + bprm.cred_prepared = 1; + + res = prepare_binprm(&bprm); + + if (!IS_ERR_VALUE(res)) + res = load_flat_file(&bprm, libs, id, NULL); + + abort_creds(bprm.cred); + +out: + allow_write_access(bprm.file); + fput(bprm.file); + + return(res); +} + +#endif /* CONFIG_BINFMT_SHARED_FLAT */ +/****************************************************************************/ + +/* + * These are the functions used to load flat style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int load_flat_binary(struct linux_binprm * bprm) +{ + struct lib_info libinfo; + struct pt_regs *regs = current_pt_regs(); + unsigned long p = bprm->p; + unsigned long stack_len; + unsigned long start_addr; + unsigned long *sp; + int res; + int i, j; + + memset(&libinfo, 0, sizeof(libinfo)); + /* + * We have to add the size of our arguments to our stack size + * otherwise it's too easy for users to create stack overflows + * by passing in a huge argument list. And yes, we have to be + * pedantic and include space for the argv/envp array as it may have + * a lot of entries. + */ +#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *)) + stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ + stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ + stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ + stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ + + res = load_flat_file(bprm, &libinfo, 0, &stack_len); + if (IS_ERR_VALUE(res)) + return res; + + /* Update data segment pointers for all libraries */ + for (i=0; imm->context.end_brk + stack_len + 3) & ~3) - 4; + DBG_FLT("p=%x\n", (int)p); + + /* copy the arg pages onto the stack, this could be more efficient :-) */ + for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--) + * (char *) --p = + ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE]; + + sp = (unsigned long *) create_flat_tables(p, bprm); + + /* Fake some return addresses to ensure the call chain will + * initialise library in order for us. We are required to call + * lib 1 first, then 2, ... and finally the main program (id 0). + */ + start_addr = libinfo.lib_list[0].entry; + +#ifdef CONFIG_BINFMT_SHARED_FLAT + for (i = MAX_SHARED_LIBS-1; i>0; i--) { + if (libinfo.lib_list[i].loaded) { + /* Push previos first to call address */ + --sp; put_user(start_addr, sp); + start_addr = libinfo.lib_list[i].entry; + } + } +#endif + + /* Stash our initial stack pointer into the mm structure */ + current->mm->start_stack = (unsigned long )sp; + +#ifdef FLAT_PLAT_INIT + FLAT_PLAT_INIT(regs); +#endif + DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", + (int)regs, (int)start_addr, (int)current->mm->start_stack); + + start_thread(regs, start_addr, current->mm->start_stack); + + return 0; +} + +/****************************************************************************/ + +static int __init init_flat_binfmt(void) +{ + register_binfmt(&flat_format); + return 0; +} + +/****************************************************************************/ + +core_initcall(init_flat_binfmt); + +/****************************************************************************/ diff --git a/kernel/fs/binfmt_misc.c b/kernel/fs/binfmt_misc.c new file mode 100644 index 000000000..78f005f37 --- /dev/null +++ b/kernel/fs/binfmt_misc.c @@ -0,0 +1,835 @@ +/* + * binfmt_misc.c + * + * Copyright (C) 1997 Richard Günther + * + * binfmt_misc detects binaries via a magic or filename extension and invokes + * a specified wrapper. See Documentation/binfmt_misc.txt for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +# define USE_DEBUG 1 +#else +# define USE_DEBUG 0 +#endif + +enum { + VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ +}; + +static LIST_HEAD(entries); +static int enabled = 1; + +enum {Enabled, Magic}; +#define MISC_FMT_PRESERVE_ARGV0 (1 << 31) +#define MISC_FMT_OPEN_BINARY (1 << 30) +#define MISC_FMT_CREDENTIALS (1 << 29) + +typedef struct { + struct list_head list; + unsigned long flags; /* type, status, etc. */ + int offset; /* offset of magic */ + int size; /* size of magic/mask */ + char *magic; /* magic or filename extension */ + char *mask; /* mask, NULL for exact match */ + char *interpreter; /* filename of interpreter */ + char *name; + struct dentry *dentry; +} Node; + +static DEFINE_RWLOCK(entries_lock); +static struct file_system_type bm_fs_type; +static struct vfsmount *bm_mnt; +static int entry_count; + +/* + * Max length of the register string. Determined by: + * - 7 delimiters + * - name: ~50 bytes + * - type: 1 byte + * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE) + * - magic: 128 bytes (512 in escaped form) + * - mask: 128 bytes (512 in escaped form) + * - interp: ~50 bytes + * - flags: 5 bytes + * Round that up a bit, and then back off to hold the internal data + * (like struct Node). + */ +#define MAX_REGISTER_LENGTH 1920 + +/* + * Check if we support the binfmt + * if we do, return the node, else NULL + * locking is done in load_misc_binary + */ +static Node *check_file(struct linux_binprm *bprm) +{ + char *p = strrchr(bprm->interp, '.'); + struct list_head *l; + + /* Walk all the registered handlers. */ + list_for_each(l, &entries) { + Node *e = list_entry(l, Node, list); + char *s; + int j; + + /* Make sure this one is currently enabled. */ + if (!test_bit(Enabled, &e->flags)) + continue; + + /* Do matching based on extension if applicable. */ + if (!test_bit(Magic, &e->flags)) { + if (p && !strcmp(e->magic, p + 1)) + return e; + continue; + } + + /* Do matching based on magic & mask. */ + s = bprm->buf + e->offset; + if (e->mask) { + for (j = 0; j < e->size; j++) + if ((*s++ ^ e->magic[j]) & e->mask[j]) + break; + } else { + for (j = 0; j < e->size; j++) + if ((*s++ ^ e->magic[j])) + break; + } + if (j == e->size) + return e; + } + return NULL; +} + +/* + * the loader itself + */ +static int load_misc_binary(struct linux_binprm *bprm) +{ + Node *fmt; + struct file *interp_file = NULL; + char iname[BINPRM_BUF_SIZE]; + const char *iname_addr = iname; + int retval; + int fd_binary = -1; + + retval = -ENOEXEC; + if (!enabled) + goto ret; + + /* to keep locking time low, we copy the interpreter string */ + read_lock(&entries_lock); + fmt = check_file(bprm); + if (fmt) + strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); + read_unlock(&entries_lock); + if (!fmt) + goto ret; + + /* Need to be able to load the file after exec */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + + if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { + retval = remove_arg_zero(bprm); + if (retval) + goto ret; + } + + if (fmt->flags & MISC_FMT_OPEN_BINARY) { + + /* if the binary should be opened on behalf of the + * interpreter than keep it open and assign descriptor + * to it + */ + fd_binary = get_unused_fd_flags(0); + if (fd_binary < 0) { + retval = fd_binary; + goto ret; + } + fd_install(fd_binary, bprm->file); + + /* if the binary is not readable than enforce mm->dumpable=0 + regardless of the interpreter's permissions */ + would_dump(bprm, bprm->file); + + allow_write_access(bprm->file); + bprm->file = NULL; + + /* mark the bprm that fd should be passed to interp */ + bprm->interp_flags |= BINPRM_FLAGS_EXECFD; + bprm->interp_data = fd_binary; + + } else { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + } + /* make argv[1] be the path to the binary */ + retval = copy_strings_kernel(1, &bprm->interp, bprm); + if (retval < 0) + goto error; + bprm->argc++; + + /* add the interp as argv[0] */ + retval = copy_strings_kernel(1, &iname_addr, bprm); + if (retval < 0) + goto error; + bprm->argc++; + + /* Update interp in case binfmt_script needs it. */ + retval = bprm_change_interp(iname, bprm); + if (retval < 0) + goto error; + + interp_file = open_exec(iname); + retval = PTR_ERR(interp_file); + if (IS_ERR(interp_file)) + goto error; + + bprm->file = interp_file; + if (fmt->flags & MISC_FMT_CREDENTIALS) { + /* + * No need to call prepare_binprm(), it's already been + * done. bprm->buf is stale, update from interp_file. + */ + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); + } else + retval = prepare_binprm(bprm); + + if (retval < 0) + goto error; + + retval = search_binary_handler(bprm); + if (retval < 0) + goto error; + +ret: + return retval; +error: + if (fd_binary > 0) + sys_close(fd_binary); + bprm->interp_flags = 0; + bprm->interp_data = 0; + goto ret; +} + +/* Command parsers */ + +/* + * parses and copies one argument enclosed in del from *sp to *dp, + * recognising the \x special. + * returns pointer to the copied argument or NULL in case of an + * error (and sets err) or null argument length. + */ +static char *scanarg(char *s, char del) +{ + char c; + + while ((c = *s++) != del) { + if (c == '\\' && *s == 'x') { + s++; + if (!isxdigit(*s++)) + return NULL; + if (!isxdigit(*s++)) + return NULL; + } + } + s[-1] ='\0'; + return s; +} + +static char *check_special_flags(char *sfs, Node *e) +{ + char *p = sfs; + int cont = 1; + + /* special flags */ + while (cont) { + switch (*p) { + case 'P': + pr_debug("register: flag: P (preserve argv0)\n"); + p++; + e->flags |= MISC_FMT_PRESERVE_ARGV0; + break; + case 'O': + pr_debug("register: flag: O (open binary)\n"); + p++; + e->flags |= MISC_FMT_OPEN_BINARY; + break; + case 'C': + pr_debug("register: flag: C (preserve creds)\n"); + p++; + /* this flags also implies the + open-binary flag */ + e->flags |= (MISC_FMT_CREDENTIALS | + MISC_FMT_OPEN_BINARY); + break; + default: + cont = 0; + } + } + + return p; +} + +/* + * This registers a new binary format, it recognises the syntax + * ':name:type:offset:magic:mask:interpreter:flags' + * where the ':' is the IFS, that can be chosen with the first char + */ +static Node *create_entry(const char __user *buffer, size_t count) +{ + Node *e; + int memsize, err; + char *buf, *p; + char del; + + pr_debug("register: received %zu bytes\n", count); + + /* some sanity checks */ + err = -EINVAL; + if ((count < 11) || (count > MAX_REGISTER_LENGTH)) + goto out; + + err = -ENOMEM; + memsize = sizeof(Node) + count + 8; + e = kmalloc(memsize, GFP_KERNEL); + if (!e) + goto out; + + p = buf = (char *)e + sizeof(Node); + + memset(e, 0, sizeof(Node)); + if (copy_from_user(buf, buffer, count)) + goto efault; + + del = *p++; /* delimeter */ + + pr_debug("register: delim: %#x {%c}\n", del, del); + + /* Pad the buffer with the delim to simplify parsing below. */ + memset(buf + count, del, 8); + + /* Parse the 'name' field. */ + e->name = p; + p = strchr(p, del); + if (!p) + goto einval; + *p++ = '\0'; + if (!e->name[0] || + !strcmp(e->name, ".") || + !strcmp(e->name, "..") || + strchr(e->name, '/')) + goto einval; + + pr_debug("register: name: {%s}\n", e->name); + + /* Parse the 'type' field. */ + switch (*p++) { + case 'E': + pr_debug("register: type: E (extension)\n"); + e->flags = 1 << Enabled; + break; + case 'M': + pr_debug("register: type: M (magic)\n"); + e->flags = (1 << Enabled) | (1 << Magic); + break; + default: + goto einval; + } + if (*p++ != del) + goto einval; + + if (test_bit(Magic, &e->flags)) { + /* Handle the 'M' (magic) format. */ + char *s; + + /* Parse the 'offset' field. */ + s = strchr(p, del); + if (!s) + goto einval; + *s++ = '\0'; + e->offset = simple_strtoul(p, &p, 10); + if (*p++) + goto einval; + pr_debug("register: offset: %#x\n", e->offset); + + /* Parse the 'magic' field. */ + e->magic = p; + p = scanarg(p, del); + if (!p) + goto einval; + if (!e->magic[0]) + goto einval; + if (USE_DEBUG) + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[raw]: ", + DUMP_PREFIX_NONE, e->magic, p - e->magic); + + /* Parse the 'mask' field. */ + e->mask = p; + p = scanarg(p, del); + if (!p) + goto einval; + if (!e->mask[0]) { + e->mask = NULL; + pr_debug("register: mask[raw]: none\n"); + } else if (USE_DEBUG) + print_hex_dump_bytes( + KBUILD_MODNAME ": register: mask[raw]: ", + DUMP_PREFIX_NONE, e->mask, p - e->mask); + + /* + * Decode the magic & mask fields. + * Note: while we might have accepted embedded NUL bytes from + * above, the unescape helpers here will stop at the first one + * it encounters. + */ + e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); + if (e->mask && + string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) + goto einval; + if (e->size + e->offset > BINPRM_BUF_SIZE) + goto einval; + pr_debug("register: magic/mask length: %i\n", e->size); + if (USE_DEBUG) { + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[decoded]: ", + DUMP_PREFIX_NONE, e->magic, e->size); + + if (e->mask) { + int i; + char *masked = kmalloc(e->size, GFP_KERNEL); + + print_hex_dump_bytes( + KBUILD_MODNAME ": register: mask[decoded]: ", + DUMP_PREFIX_NONE, e->mask, e->size); + + if (masked) { + for (i = 0; i < e->size; ++i) + masked[i] = e->magic[i] & e->mask[i]; + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[masked]: ", + DUMP_PREFIX_NONE, masked, e->size); + + kfree(masked); + } + } + } + } else { + /* Handle the 'E' (extension) format. */ + + /* Skip the 'offset' field. */ + p = strchr(p, del); + if (!p) + goto einval; + *p++ = '\0'; + + /* Parse the 'magic' field. */ + e->magic = p; + p = strchr(p, del); + if (!p) + goto einval; + *p++ = '\0'; + if (!e->magic[0] || strchr(e->magic, '/')) + goto einval; + pr_debug("register: extension: {%s}\n", e->magic); + + /* Skip the 'mask' field. */ + p = strchr(p, del); + if (!p) + goto einval; + *p++ = '\0'; + } + + /* Parse the 'interpreter' field. */ + e->interpreter = p; + p = strchr(p, del); + if (!p) + goto einval; + *p++ = '\0'; + if (!e->interpreter[0]) + goto einval; + pr_debug("register: interpreter: {%s}\n", e->interpreter); + + /* Parse the 'flags' field. */ + p = check_special_flags(p, e); + if (*p == '\n') + p++; + if (p != buf + count) + goto einval; + + return e; + +out: + return ERR_PTR(err); + +efault: + kfree(e); + return ERR_PTR(-EFAULT); +einval: + kfree(e); + return ERR_PTR(-EINVAL); +} + +/* + * Set status of entry/binfmt_misc: + * '1' enables, '0' disables and '-1' clears entry/binfmt_misc + */ +static int parse_command(const char __user *buffer, size_t count) +{ + char s[4]; + + if (count > 3) + return -EINVAL; + if (copy_from_user(s, buffer, count)) + return -EFAULT; + if (!count) + return 0; + if (s[count - 1] == '\n') + count--; + if (count == 1 && s[0] == '0') + return 1; + if (count == 1 && s[0] == '1') + return 2; + if (count == 2 && s[0] == '-' && s[1] == '1') + return 3; + return -EINVAL; +} + +/* generic stuff */ + +static void entry_status(Node *e, char *page) +{ + char *dp = page; + const char *status = "disabled"; + + if (test_bit(Enabled, &e->flags)) + status = "enabled"; + + if (!VERBOSE_STATUS) { + sprintf(page, "%s\n", status); + return; + } + + dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter); + + /* print the special flags */ + dp += sprintf(dp, "flags: "); + if (e->flags & MISC_FMT_PRESERVE_ARGV0) + *dp++ = 'P'; + if (e->flags & MISC_FMT_OPEN_BINARY) + *dp++ = 'O'; + if (e->flags & MISC_FMT_CREDENTIALS) + *dp++ = 'C'; + *dp++ = '\n'; + + if (!test_bit(Magic, &e->flags)) { + sprintf(dp, "extension .%s\n", e->magic); + } else { + dp += sprintf(dp, "offset %i\nmagic ", e->offset); + dp = bin2hex(dp, e->magic, e->size); + if (e->mask) { + dp += sprintf(dp, "\nmask "); + dp = bin2hex(dp, e->mask, e->size); + } + *dp++ = '\n'; + *dp = '\0'; + } +} + +static struct inode *bm_get_inode(struct super_block *sb, int mode) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + } + return inode; +} + +static void bm_evict_inode(struct inode *inode) +{ + clear_inode(inode); + kfree(inode->i_private); +} + +static void kill_node(Node *e) +{ + struct dentry *dentry; + + write_lock(&entries_lock); + dentry = e->dentry; + if (dentry) { + list_del_init(&e->list); + e->dentry = NULL; + } + write_unlock(&entries_lock); + + if (dentry) { + drop_nlink(d_inode(dentry)); + d_drop(dentry); + dput(dentry); + simple_release_fs(&bm_mnt, &entry_count); + } +} + +/* / */ + +static ssize_t +bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + Node *e = file_inode(file)->i_private; + ssize_t res; + char *page; + + page = (char *) __get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + entry_status(e, page); + + res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); + + free_page((unsigned long) page); + return res; +} + +static ssize_t bm_entry_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct dentry *root; + Node *e = file_inode(file)->i_private; + int res = parse_command(buffer, count); + + switch (res) { + case 1: + /* Disable this handler. */ + clear_bit(Enabled, &e->flags); + break; + case 2: + /* Enable this handler. */ + set_bit(Enabled, &e->flags); + break; + case 3: + /* Delete this handler. */ + root = dget(file->f_path.dentry->d_sb->s_root); + mutex_lock(&d_inode(root)->i_mutex); + + kill_node(e); + + mutex_unlock(&d_inode(root)->i_mutex); + dput(root); + break; + default: + return res; + } + + return count; +} + +static const struct file_operations bm_entry_operations = { + .read = bm_entry_read, + .write = bm_entry_write, + .llseek = default_llseek, +}; + +/* /register */ + +static ssize_t bm_register_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + Node *e; + struct inode *inode; + struct dentry *root, *dentry; + struct super_block *sb = file->f_path.dentry->d_sb; + int err = 0; + + e = create_entry(buffer, count); + + if (IS_ERR(e)) + return PTR_ERR(e); + + root = dget(sb->s_root); + mutex_lock(&d_inode(root)->i_mutex); + dentry = lookup_one_len(e->name, root, strlen(e->name)); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out; + + err = -EEXIST; + if (d_really_is_positive(dentry)) + goto out2; + + inode = bm_get_inode(sb, S_IFREG | 0644); + + err = -ENOMEM; + if (!inode) + goto out2; + + err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count); + if (err) { + iput(inode); + inode = NULL; + goto out2; + } + + e->dentry = dget(dentry); + inode->i_private = e; + inode->i_fop = &bm_entry_operations; + + d_instantiate(dentry, inode); + write_lock(&entries_lock); + list_add(&e->list, &entries); + write_unlock(&entries_lock); + + err = 0; +out2: + dput(dentry); +out: + mutex_unlock(&d_inode(root)->i_mutex); + dput(root); + + if (err) { + kfree(e); + return -EINVAL; + } + return count; +} + +static const struct file_operations bm_register_operations = { + .write = bm_register_write, + .llseek = noop_llseek, +}; + +/* /status */ + +static ssize_t +bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + char *s = enabled ? "enabled\n" : "disabled\n"; + + return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); +} + +static ssize_t bm_status_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + int res = parse_command(buffer, count); + struct dentry *root; + + switch (res) { + case 1: + /* Disable all handlers. */ + enabled = 0; + break; + case 2: + /* Enable all handlers. */ + enabled = 1; + break; + case 3: + /* Delete all handlers. */ + root = dget(file->f_path.dentry->d_sb->s_root); + mutex_lock(&d_inode(root)->i_mutex); + + while (!list_empty(&entries)) + kill_node(list_entry(entries.next, Node, list)); + + mutex_unlock(&d_inode(root)->i_mutex); + dput(root); + break; + default: + return res; + } + + return count; +} + +static const struct file_operations bm_status_operations = { + .read = bm_status_read, + .write = bm_status_write, + .llseek = default_llseek, +}; + +/* Superblock handling */ + +static const struct super_operations s_ops = { + .statfs = simple_statfs, + .evict_inode = bm_evict_inode, +}; + +static int bm_fill_super(struct super_block *sb, void *data, int silent) +{ + int err; + static struct tree_descr bm_files[] = { + [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, + [3] = {"register", &bm_register_operations, S_IWUSR}, + /* last one */ {""} + }; + + err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); + if (!err) + sb->s_op = &s_ops; + return err; +} + +static struct dentry *bm_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, bm_fill_super); +} + +static struct linux_binfmt misc_format = { + .module = THIS_MODULE, + .load_binary = load_misc_binary, +}; + +static struct file_system_type bm_fs_type = { + .owner = THIS_MODULE, + .name = "binfmt_misc", + .mount = bm_mount, + .kill_sb = kill_litter_super, +}; +MODULE_ALIAS_FS("binfmt_misc"); + +static int __init init_misc_binfmt(void) +{ + int err = register_filesystem(&bm_fs_type); + if (!err) + insert_binfmt(&misc_format); + return err; +} + +static void __exit exit_misc_binfmt(void) +{ + unregister_binfmt(&misc_format); + unregister_filesystem(&bm_fs_type); +} + +core_initcall(init_misc_binfmt); +module_exit(exit_misc_binfmt); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/binfmt_script.c b/kernel/fs/binfmt_script.c new file mode 100644 index 000000000..afdf4e3ca --- /dev/null +++ b/kernel/fs/binfmt_script.c @@ -0,0 +1,129 @@ +/* + * linux/fs/binfmt_script.c + * + * Copyright (C) 1996 Martin von Löwis + * original #!-checking implemented by tytso. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static int load_script(struct linux_binprm *bprm) +{ + const char *i_arg, *i_name; + char *cp; + struct file *file; + char interp[BINPRM_BUF_SIZE]; + int retval; + + if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) + return -ENOEXEC; + + /* + * If the script filename will be inaccessible after exec, typically + * because it is a "/dev/fd//.." path against an O_CLOEXEC fd, give + * up now (on the assumption that the interpreter will want to load + * this file). + */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + + /* + * This section does the #! interpretation. + * Sorta complicated, but hopefully it will work. -TYT + */ + + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + + bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; + if ((cp = strchr(bprm->buf, '\n')) == NULL) + cp = bprm->buf+BINPRM_BUF_SIZE-1; + *cp = '\0'; + while (cp > bprm->buf) { + cp--; + if ((*cp == ' ') || (*cp == '\t')) + *cp = '\0'; + else + break; + } + for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++); + if (*cp == '\0') + return -ENOEXEC; /* No interpreter name found */ + i_name = cp; + i_arg = NULL; + for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) + /* nothing */ ; + while ((*cp == ' ') || (*cp == '\t')) + *cp++ = '\0'; + if (*cp) + i_arg = cp; + strcpy (interp, i_name); + /* + * OK, we've parsed out the interpreter name and + * (optional) argument. + * Splice in (1) the interpreter's name for argv[0] + * (2) (optional) argument to interpreter + * (3) filename of shell script (replace argv[0]) + * + * This is done in reverse order, because of how the + * user environment and arguments are stored. + */ + retval = remove_arg_zero(bprm); + if (retval) + return retval; + retval = copy_strings_kernel(1, &bprm->interp, bprm); + if (retval < 0) return retval; + bprm->argc++; + if (i_arg) { + retval = copy_strings_kernel(1, &i_arg, bprm); + if (retval < 0) return retval; + bprm->argc++; + } + retval = copy_strings_kernel(1, &i_name, bprm); + if (retval) return retval; + bprm->argc++; + retval = bprm_change_interp(interp, bprm); + if (retval < 0) + return retval; + + /* + * OK, now restart the process with the interpreter's dentry. + */ + file = open_exec(interp); + if (IS_ERR(file)) + return PTR_ERR(file); + + bprm->file = file; + retval = prepare_binprm(bprm); + if (retval < 0) + return retval; + return search_binary_handler(bprm); +} + +static struct linux_binfmt script_format = { + .module = THIS_MODULE, + .load_binary = load_script, +}; + +static int __init init_script_binfmt(void) +{ + register_binfmt(&script_format); + return 0; +} + +static void __exit exit_script_binfmt(void) +{ + unregister_binfmt(&script_format); +} + +core_initcall(init_script_binfmt); +module_exit(exit_script_binfmt); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/block_dev.c b/kernel/fs/block_dev.c new file mode 100644 index 000000000..c7e4163ed --- /dev/null +++ b/kernel/fs/block_dev.c @@ -0,0 +1,1795 @@ +/* + * linux/fs/block_dev.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli SuSE + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +struct bdev_inode { + struct block_device bdev; + struct inode vfs_inode; +}; + +static const struct address_space_operations def_blk_aops; + +static inline struct bdev_inode *BDEV_I(struct inode *inode) +{ + return container_of(inode, struct bdev_inode, vfs_inode); +} + +inline struct block_device *I_BDEV(struct inode *inode) +{ + return &BDEV_I(inode)->bdev; +} +EXPORT_SYMBOL(I_BDEV); + +static void bdev_write_inode(struct inode *inode) +{ + spin_lock(&inode->i_lock); + while (inode->i_state & I_DIRTY) { + spin_unlock(&inode->i_lock); + WARN_ON_ONCE(write_inode_now(inode, true)); + spin_lock(&inode->i_lock); + } + spin_unlock(&inode->i_lock); +} + +/* Kill _all_ buffers and pagecache , dirty or not.. */ +void kill_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0 && mapping->nrshadows == 0) + return; + + invalidate_bh_lrus(); + truncate_inode_pages(mapping, 0); +} +EXPORT_SYMBOL(kill_bdev); + +/* Invalidate clean unused buffers and pagecache. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_invalidate_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev); + +int set_blocksize(struct block_device *bdev, int size) +{ + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + return -EINVAL; + + /* Size cannot be smaller than the size supported by the device */ + if (size < bdev_logical_block_size(bdev)) + return -EINVAL; + + /* Don't change the size if it is same as current */ + if (bdev->bd_block_size != size) { + sync_blockdev(bdev); + bdev->bd_block_size = size; + bdev->bd_inode->i_blkbits = blksize_bits(size); + kill_bdev(bdev); + } + return 0; +} + +EXPORT_SYMBOL(set_blocksize); + +int sb_set_blocksize(struct super_block *sb, int size) +{ + if (set_blocksize(sb->s_bdev, size)) + return 0; + /* If we get here, we know size is power of two + * and it's value is between 512 and PAGE_SIZE */ + sb->s_blocksize = size; + sb->s_blocksize_bits = blksize_bits(size); + return sb->s_blocksize; +} + +EXPORT_SYMBOL(sb_set_blocksize); + +int sb_min_blocksize(struct super_block *sb, int size) +{ + int minsize = bdev_logical_block_size(sb->s_bdev); + if (size < minsize) + size = minsize; + return sb_set_blocksize(sb, size); +} + +EXPORT_SYMBOL(sb_min_blocksize); + +static int +blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + +static ssize_t +blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, + blkdev_get_block, NULL, NULL, + DIO_SKIP_DIO_COUNT); +} + +int __sync_blockdev(struct block_device *bdev, int wait) +{ + if (!bdev) + return 0; + if (!wait) + return filemap_flush(bdev->bd_inode->i_mapping); + return filemap_write_and_wait(bdev->bd_inode->i_mapping); +} + +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + return __sync_blockdev(bdev, 1); +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = sync_filesystem(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} +EXPORT_SYMBOL(fsync_bdev); + +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. + */ +struct super_block *freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (++bdev->bd_fsfreeze_count > 1) { + /* + * We don't even need to grab a reference - the first call + * to freeze_bdev grab an active reference and only the last + * thaw_bdev drops it. + */ + sb = get_super(bdev); + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + + sb = get_active_super(bdev); + if (!sb) + goto out; + if (sb->s_op->freeze_super) + error = sb->s_op->freeze_super(sb); + else + error = freeze_super(sb); + if (error) { + deactivate_super(sb); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + deactivate_super(sb); + out: + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; /* thaw_bdev releases s->s_umount */ +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * @sb: associated superblock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + */ +int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + int error = -EINVAL; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) + goto out; + + error = 0; + if (--bdev->bd_fsfreeze_count > 0) + goto out; + + if (!sb) + goto out; + + if (sb->s_op->thaw_super) + error = sb->s_op->thaw_super(sb); + else + error = thaw_super(sb); + if (error) { + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } +out: + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; +} +EXPORT_SYMBOL(thaw_bdev); + +static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, blkdev_get_block, wbc); +} + +static int blkdev_readpage(struct file * file, struct page * page) +{ + return block_read_full_page(page, blkdev_get_block); +} + +static int blkdev_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); +} + +static int blkdev_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + return block_write_begin(mapping, pos, len, flags, pagep, + blkdev_get_block); +} + +static int blkdev_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + unlock_page(page); + page_cache_release(page); + + return ret; +} + +/* + * private llseek: + * for a block special file file_inode(file)->i_size is zero + * so we compute the size by hand (just as in block_read/write above) + */ +static loff_t block_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *bd_inode = file->f_mapping->host; + loff_t retval; + + mutex_lock(&bd_inode->i_mutex); + retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); + mutex_unlock(&bd_inode->i_mutex); + return retval; +} + +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +{ + struct inode *bd_inode = filp->f_mapping->host; + struct block_device *bdev = I_BDEV(bd_inode); + int error; + + error = filemap_write_and_wait_range(filp->f_mapping, start, end); + if (error) + return error; + + /* + * There is no need to serialise calls to blkdev_issue_flush with + * i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. + */ + error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); + if (error == -EOPNOTSUPP) + error = 0; + + return error; +} +EXPORT_SYMBOL(blkdev_fsync); + +/** + * bdev_read_page() - Start reading a page from a block device + * @bdev: The device to read the page from + * @sector: The offset on the device to read the page to (need not be aligned) + * @page: The page to read + * + * On entry, the page should be locked. It will be unlocked when the page + * has been read. If the block driver implements rw_page synchronously, + * that will be true on exit from this function, but it need not be. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to read this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_read_page(struct block_device *bdev, sector_t sector, + struct page *page) +{ + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); +} +EXPORT_SYMBOL_GPL(bdev_read_page); + +/** + * bdev_write_page() - Start writing a page to a block device + * @bdev: The device to write the page to + * @sector: The offset on the device to write the page to (need not be aligned) + * @page: The page to write + * @wbc: The writeback_control for the write + * + * On entry, the page should be locked and not currently under writeback. + * On exit, if the write started successfully, the page will be unlocked and + * under writeback. If the write failed already (eg the driver failed to + * queue the page to the device), the page will still be locked. If the + * caller is a ->writepage implementation, it will need to unlock the page. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to write this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_write_page(struct block_device *bdev, sector_t sector, + struct page *page, struct writeback_control *wbc) +{ + int result; + int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + set_page_writeback(page); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + if (result) + end_page_writeback(page); + else + unlock_page(page); + return result; +} +EXPORT_SYMBOL_GPL(bdev_write_page); + +/** + * bdev_direct_access() - Get the address for directly-accessibly memory + * @bdev: The device containing the memory + * @sector: The offset within the device + * @addr: Where to put the address of the memory + * @pfn: The Page Frame Number for the memory + * @size: The number of bytes requested + * + * If a block device is made up of directly addressable memory, this function + * will tell the caller the PFN and the address of the memory. The address + * may be directly dereferenced within the kernel without the need to call + * ioremap(), kmap() or similar. The PFN is suitable for inserting into + * page tables. + * + * Return: negative errno if an error occurs, otherwise the number of bytes + * accessible at this address. + */ +long bdev_direct_access(struct block_device *bdev, sector_t sector, + void **addr, unsigned long *pfn, long size) +{ + long avail; + const struct block_device_operations *ops = bdev->bd_disk->fops; + + if (size < 0) + return size; + if (!ops->direct_access) + return -EOPNOTSUPP; + if ((sector + DIV_ROUND_UP(size, 512)) > + part_nr_sects_read(bdev->bd_part)) + return -ERANGE; + sector += get_start_sect(bdev); + if (sector % (PAGE_SIZE / 512)) + return -EINVAL; + avail = ops->direct_access(bdev, sector, addr, pfn, size); + if (!avail) + return -ERANGE; + return min(avail, size); +} +EXPORT_SYMBOL_GPL(bdev_direct_access); + +/* + * pseudo-fs + */ + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); +static struct kmem_cache * bdev_cachep __read_mostly; + +static struct inode *bdev_alloc_inode(struct super_block *sb) +{ + struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void bdev_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct bdev_inode *bdi = BDEV_I(inode); + + kmem_cache_free(bdev_cachep, bdi); +} + +static void bdev_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bdev_i_callback); +} + +static void init_once(void *foo) +{ + struct bdev_inode *ei = (struct bdev_inode *) foo; + struct block_device *bdev = &ei->bdev; + + memset(bdev, 0, sizeof(*bdev)); + mutex_init(&bdev->bd_mutex); + INIT_LIST_HEAD(&bdev->bd_inodes); + INIT_LIST_HEAD(&bdev->bd_list); +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&bdev->bd_holder_disks); +#endif + inode_init_once(&ei->vfs_inode); + /* Initialize mutex for freeze. */ + mutex_init(&bdev->bd_fsfreeze_mutex); +} + +static inline void __bd_forget(struct inode *inode) +{ + list_del_init(&inode->i_devices); + inode->i_bdev = NULL; + inode->i_mapping = &inode->i_data; +} + +static void bdev_evict_inode(struct inode *inode) +{ + struct block_device *bdev = &BDEV_I(inode)->bdev; + struct list_head *p; + truncate_inode_pages_final(&inode->i_data); + invalidate_inode_buffers(inode); /* is it needed here? */ + clear_inode(inode); + spin_lock(&bdev_lock); + while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { + __bd_forget(list_entry(p, struct inode, i_devices)); + } + list_del_init(&bdev->bd_list); + spin_unlock(&bdev_lock); +} + +static const struct super_operations bdev_sops = { + .statfs = simple_statfs, + .alloc_inode = bdev_alloc_inode, + .destroy_inode = bdev_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = bdev_evict_inode, +}; + +static struct dentry *bd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); +} + +static struct file_system_type bd_type = { + .name = "bdev", + .mount = bd_mount, + .kill_sb = kill_anon_super, +}; + +static struct super_block *blockdev_superblock __read_mostly; + +void __init bdev_cache_init(void) +{ + int err; + static struct vfsmount *bd_mnt; + + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + init_once); + err = register_filesystem(&bd_type); + if (err) + panic("Cannot register bdev pseudo-fs"); + bd_mnt = kern_mount(&bd_type); + if (IS_ERR(bd_mnt)) + panic("Cannot create bdev pseudo-fs"); + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ +} + +/* + * Most likely _very_ bad one - but then it's hardly critical for small + * /dev and can be fixed when somebody will need really large one. + * Keep in mind that it will be fed through icache hash function too. + */ +static inline unsigned long hash(dev_t dev) +{ + return MAJOR(dev)+MINOR(dev); +} + +static int bdev_test(struct inode *inode, void *data) +{ + return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; +} + +static int bdev_set(struct inode *inode, void *data) +{ + BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; + return 0; +} + +static LIST_HEAD(all_bdevs); + +struct block_device *bdget(dev_t dev) +{ + struct block_device *bdev; + struct inode *inode; + + inode = iget5_locked(blockdev_superblock, hash(dev), + bdev_test, bdev_set, &dev); + + if (!inode) + return NULL; + + bdev = &BDEV_I(inode)->bdev; + + if (inode->i_state & I_NEW) { + bdev->bd_contains = NULL; + bdev->bd_super = NULL; + bdev->bd_inode = inode; + bdev->bd_block_size = (1 << inode->i_blkbits); + bdev->bd_part_count = 0; + bdev->bd_invalidated = 0; + inode->i_mode = S_IFBLK; + inode->i_rdev = dev; + inode->i_bdev = bdev; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + spin_lock(&bdev_lock); + list_add(&bdev->bd_list, &all_bdevs); + spin_unlock(&bdev_lock); + unlock_new_inode(inode); + } + return bdev; +} + +EXPORT_SYMBOL(bdget); + +/** + * bdgrab -- Grab a reference to an already referenced block device + * @bdev: Block device to grab a reference to. + */ +struct block_device *bdgrab(struct block_device *bdev) +{ + ihold(bdev->bd_inode); + return bdev; +} +EXPORT_SYMBOL(bdgrab); + +long nr_blockdev_pages(void) +{ + struct block_device *bdev; + long ret = 0; + spin_lock(&bdev_lock); + list_for_each_entry(bdev, &all_bdevs, bd_list) { + ret += bdev->bd_inode->i_mapping->nrpages; + } + spin_unlock(&bdev_lock); + return ret; +} + +void bdput(struct block_device *bdev) +{ + iput(bdev->bd_inode); +} + +EXPORT_SYMBOL(bdput); + +static struct block_device *bd_acquire(struct inode *inode) +{ + struct block_device *bdev; + + spin_lock(&bdev_lock); + bdev = inode->i_bdev; + if (bdev) { + ihold(bdev->bd_inode); + spin_unlock(&bdev_lock); + return bdev; + } + spin_unlock(&bdev_lock); + + bdev = bdget(inode->i_rdev); + if (bdev) { + spin_lock(&bdev_lock); + if (!inode->i_bdev) { + /* + * We take an additional reference to bd_inode, + * and it's released in clear_inode() of inode. + * So, we can access it via ->i_mapping always + * without igrab(). + */ + ihold(bdev->bd_inode); + inode->i_bdev = bdev; + inode->i_mapping = bdev->bd_inode->i_mapping; + list_add(&inode->i_devices, &bdev->bd_inodes); + } + spin_unlock(&bdev_lock); + } + return bdev; +} + +int sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} + +/* Call when you free inode */ + +void bd_forget(struct inode *inode) +{ + struct block_device *bdev = NULL; + + spin_lock(&bdev_lock); + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); + spin_unlock(&bdev_lock); + + if (bdev) + iput(bdev->bd_inode); +} + +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @whole: whole block device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Test whether @bdev can be claimed by @holder. + * + * CONTEXT: + * spin_lock(&bdev_lock). + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. + */ +static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + if (bdev->bd_holder == holder) + return true; /* already a holder */ + else if (bdev->bd_holder != NULL) + return false; /* held by someone else */ + else if (bdev->bd_contains == bdev) + return true; /* is a whole device which isn't held */ + + else if (whole->bd_holder == bd_may_claim) + return true; /* is a partition of a device that is being partitioned */ + else if (whole->bd_holder != NULL) + return false; /* is a partition of a held device */ + else + return true; /* is a partition of an un-held device */ +} + +/** + * bd_prepare_to_claim - prepare to claim a block device + * @bdev: block device of interest + * @whole: the whole device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Prepare to claim @bdev. This function fails if @bdev is already + * claimed by another holder and waits if another claiming is in + * progress. This function doesn't actually claim. On successful + * return, the caller has ownership of bd_claiming and bd_holder[s]. + * + * CONTEXT: + * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab + * it multiple times. + * + * RETURNS: + * 0 if @bdev can be claimed, -EBUSY otherwise. + */ +static int bd_prepare_to_claim(struct block_device *bdev, + struct block_device *whole, void *holder) +{ +retry: + /* if someone else claimed, fail */ + if (!bd_may_claim(bdev, whole, holder)) + return -EBUSY; + + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { + wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + DEFINE_WAIT(wait); + + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&bdev_lock); + schedule(); + finish_wait(wq, &wait); + spin_lock(&bdev_lock); + goto retry; + } + + /* yay, all mine */ + return 0; +} + +/** + * bd_start_claiming - start claiming a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * + * @bdev is about to be opened exclusively. Check @bdev can be opened + * exclusively and mark that an exclusive open is in progress. Each + * successful call to this function must be matched with a call to + * either bd_finish_claiming() or bd_abort_claiming() (which do not + * fail). + * + * This function is used to gain exclusive access to the block device + * without actually causing other exclusive open attempts to fail. It + * should be used when the open sequence itself requires exclusive + * access but may subsequently fail. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to the block device containing @bdev on success, ERR_PTR() + * value on failure. + */ +static struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder) +{ + struct gendisk *disk; + struct block_device *whole; + int partno, err; + + might_sleep(); + + /* + * @bdev might not have been initialized properly yet, look up + * and grab the outer block device the hard way. + */ + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + return ERR_PTR(-ENXIO); + + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. + */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + + module_put(disk->fops->owner); + put_disk(disk); + if (!whole) + return ERR_PTR(-ENOMEM); + + /* prepare to claim, if successful, mark claiming in progress */ + spin_lock(&bdev_lock); + + err = bd_prepare_to_claim(bdev, whole, holder); + if (err == 0) { + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); + return whole; + } else { + spin_unlock(&bdev_lock); + bdput(whole); + return ERR_PTR(err); + } +} + +#ifdef CONFIG_SYSFS +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + sysfs_remove_link(from, kobject_name(to)); +} + +/** + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * This functions creates the following sysfs symlinks. + * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret = 0; + + mutex_lock(&bdev->bd_mutex); + + WARN_ON_ONCE(!bdev->bd_holder); + + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + goto out_unlock; + + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; + } + + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it. + */ + kobject_get(bdev->bd_part->holder_dir); + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +out_free: + kfree(holder); +out_unlock: + mutex_unlock(&bdev->bd_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(bd_link_disk_holder); + +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. + */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + mutex_lock(&bdev->bd_mutex); + + holder = bd_find_holder_disk(bdev, disk); + + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, + &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_part->holder_dir); + list_del_init(&holder->list); + kfree(holder); + } + + mutex_unlock(&bdev->bd_mutex); +} +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); +#endif + +/** + * flush_disk - invalidates all buffer-cache entries on a disk + * + * @bdev: struct block device to be flushed + * @kill_dirty: flag to guide handling of dirty inodes + * + * Invalidates all buffer-cache entries on a disk. It should be called + * when a disk has been changed -- either by a media change or online + * resize. + */ +static void flush_disk(struct block_device *bdev, bool kill_dirty) +{ + if (__invalidate_device(bdev, kill_dirty)) { + char name[BDEVNAME_SIZE] = ""; + + if (bdev->bd_disk) + disk_name(bdev->bd_disk, 0, name); + printk(KERN_WARNING "VFS: busy inodes on changed media or " + "resized disk %s\n", name); + } + + if (!bdev->bd_disk) + return; + if (disk_part_scan_enabled(bdev->bd_disk)) + bdev->bd_invalidated = 1; +} + +/** + * check_disk_size_change - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @bdev: struct bdev to adjust. + * + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. + */ +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +{ + loff_t disk_size, bdev_size; + + disk_size = (loff_t)get_capacity(disk) << 9; + bdev_size = i_size_read(bdev->bd_inode); + if (disk_size != bdev_size) { + char name[BDEVNAME_SIZE]; + + disk_name(disk, 0, name); + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + name, bdev_size, disk_size); + i_size_write(bdev->bd_inode, disk_size); + flush_disk(bdev, false); + } +} +EXPORT_SYMBOL(check_disk_size_change); + +/** + * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back + * @disk: struct gendisk to be revalidated + * + * This routine is a wrapper for lower-level driver's revalidate_disk + * call-backs. It is used to do common pre and post operations needed + * for all revalidate_disk operations. + */ +int revalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev; + int ret = 0; + + if (disk->fops->revalidate_disk) + ret = disk->fops->revalidate_disk(disk); + + bdev = bdget_disk(disk, 0); + if (!bdev) + return ret; + + mutex_lock(&bdev->bd_mutex); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} +EXPORT_SYMBOL(revalidate_disk); + +/* + * This routine checks whether a removable media has been changed, + * and invalidates all buffer-cache-entries in that case. This + * is a relatively slow routine, so we have to try to minimize using + * it. Thus it is called only upon a 'mount' or 'open'. This + * is the best way of combining speed and utility, I think. + * People changing diskettes in the middle of an operation deserve + * to lose :-) + */ +int check_disk_change(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + const struct block_device_operations *bdops = disk->fops; + unsigned int events; + + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return 0; + + flush_disk(bdev, true); + if (bdops->revalidate_disk) + bdops->revalidate_disk(bdev->bd_disk); + return 1; +} + +EXPORT_SYMBOL(check_disk_change); + +void bd_set_size(struct block_device *bdev, loff_t size) +{ + unsigned bsize = bdev_logical_block_size(bdev); + + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, size); + mutex_unlock(&bdev->bd_inode->i_mutex); + while (bsize < PAGE_CACHE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} +EXPORT_SYMBOL(bd_set_size); + +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); + +/* + * bd_mutex locking: + * + * mutex_lock(part->bd_mutex) + * mutex_lock_nested(whole->bd_mutex, 1) + */ + +static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) +{ + struct gendisk *disk; + struct module *owner; + int ret; + int partno; + int perm = 0; + + if (mode & FMODE_READ) + perm |= MAY_READ; + if (mode & FMODE_WRITE) + perm |= MAY_WRITE; + /* + * hooks: /n/, see "layering violations". + */ + if (!for_part) { + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) { + bdput(bdev); + return ret; + } + } + + restart: + + ret = -ENXIO; + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + goto out; + owner = disk->fops->owner; + + disk_block_events(disk); + mutex_lock_nested(&bdev->bd_mutex, for_part); + if (!bdev->bd_openers) { + bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; + bdev->bd_contains = bdev; + if (!partno) { + ret = -ENXIO; + bdev->bd_part = disk_get_part(disk, partno); + if (!bdev->bd_part) + goto out_clear; + + ret = 0; + if (disk->fops->open) { + ret = disk->fops->open(bdev, mode); + if (ret == -ERESTARTSYS) { + /* Lost a race with 'disk' being + * deleted, try again. + * See md.c + */ + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + bdev->bd_disk = NULL; + bdev->bd_queue = NULL; + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + put_disk(disk); + module_put(owner); + goto restart; + } + } + + if (!ret) + bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + + /* + * If the device is invalidated, rescan partition + * if open succeeded or failed with -ENOMEDIUM. + * The latter is necessary to prevent ghost + * partitions on a removed medium. + */ + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } + if (ret) + goto out_clear; + } else { + struct block_device *whole; + whole = bdget_disk(disk, 0); + ret = -ENOMEM; + if (!whole) + goto out_clear; + BUG_ON(for_part); + ret = __blkdev_get(whole, mode, 1); + if (ret) + goto out_clear; + bdev->bd_contains = whole; + bdev->bd_part = disk_get_part(disk, partno); + if (!(disk->flags & GENHD_FL_UP) || + !bdev->bd_part || !bdev->bd_part->nr_sects) { + ret = -ENXIO; + goto out_clear; + } + bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + } + } else { + if (bdev->bd_contains == bdev) { + ret = 0; + if (bdev->bd_disk->fops->open) + ret = bdev->bd_disk->fops->open(bdev, mode); + /* the same as first opener case, read comment there */ + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } + if (ret) + goto out_unlock_bdev; + } + /* only one opener holds refs to the module and disk */ + put_disk(disk); + module_put(owner); + } + bdev->bd_openers++; + if (for_part) + bdev->bd_part_count++; + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + return 0; + + out_clear: + disk_put_part(bdev->bd_part); + bdev->bd_disk = NULL; + bdev->bd_part = NULL; + bdev->bd_queue = NULL; + if (bdev != bdev->bd_contains) + __blkdev_put(bdev->bd_contains, mode, 1); + bdev->bd_contains = NULL; + out_unlock_bdev: + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + put_disk(disk); + module_put(owner); + out: + bdput(bdev); + + return ret; +} + +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access. Specifying %FMODE_EXCL with %NULL + * @holder is invalid. Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged. On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +{ + struct block_device *whole = NULL; + int res; + + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + + if ((mode & FMODE_EXCL) && holder) { + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + + res = __blkdev_get(bdev, mode, 0); + + if (whole) { + struct gendisk *disk = whole->bd_disk; + + /* finish claiming */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + if (!res) { + BUG_ON(!bd_may_claim(bdev, whole, holder)); + /* + * Note that for a whole device bd_holders + * will be incremented twice, and bd_holder + * will be set to bd_may_claim before being + * set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + } + + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + + /* + * Block event polling for write claims if requested. Any + * write holder makes the write_holder state stick until + * all are released. This is good enough and tracking + * individual writeable reference is too fragile given the + * way @mode is used in blkdev_get/put(). + */ + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + bdev->bd_write_holder = true; + disk_block_events(disk); + } + + mutex_unlock(&bdev->bd_mutex); + bdput(whole); + } + + return res; +} +EXPORT_SYMBOL(blkdev_get); + +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path. @mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + int err; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, mode); + return ERR_PTR(-EACCES); + } + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev. @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number. _Never_ to be used for internal purposes. If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int err; + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); + +static int blkdev_open(struct inode * inode, struct file * filp) +{ + struct block_device *bdev; + + /* + * Preserve backwards compatibility and allow large file access + * even if userspace doesn't ask for it explicitly. Some mkfs + * binary needs it. We might want to drop this workaround + * during an unstable branch. + */ + filp->f_flags |= O_LARGEFILE; + + if (filp->f_flags & O_NDELAY) + filp->f_mode |= FMODE_NDELAY; + if (filp->f_flags & O_EXCL) + filp->f_mode |= FMODE_EXCL; + if ((filp->f_flags & O_ACCMODE) == 3) + filp->f_mode |= FMODE_WRITE_IOCTL; + + bdev = bd_acquire(inode); + if (bdev == NULL) + return -ENOMEM; + + filp->f_mapping = bdev->bd_inode->i_mapping; + + return blkdev_get(bdev, filp->f_mode, filp); +} + +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +{ + struct gendisk *disk = bdev->bd_disk; + struct block_device *victim = NULL; + + mutex_lock_nested(&bdev->bd_mutex, for_part); + if (for_part) + bdev->bd_part_count--; + + if (!--bdev->bd_openers) { + WARN_ON_ONCE(bdev->bd_holders); + sync_blockdev(bdev); + kill_bdev(bdev); + /* + * ->release can cause the queue to disappear, so flush all + * dirty data before. + */ + bdev_write_inode(bdev->bd_inode); + } + if (bdev->bd_contains == bdev) { + if (disk->fops->release) + disk->fops->release(disk, mode); + } + if (!bdev->bd_openers) { + struct module *owner = disk->fops->owner; + + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + bdev->bd_disk = NULL; + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); + } + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + if (victim) + __blkdev_put(victim, mode, 1); +} + +void blkdev_put(struct block_device *bdev, fmode_t mode) +{ + mutex_lock(&bdev->bd_mutex); + + if (mode & FMODE_EXCL) { + bool bdev_free; + + /* + * Release a claim on the device. The holder fields + * are protected with bdev_lock. bd_mutex is to + * synchronize disk_holder unlinking. + */ + spin_lock(&bdev_lock); + + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + + /* bd_contains might point to self, check in a separate step */ + if ((bdev_free = !bdev->bd_holders)) + bdev->bd_holder = NULL; + if (!bdev->bd_contains->bd_holders) + bdev->bd_contains->bd_holder = NULL; + + spin_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. + */ + if (bdev_free && bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } + } + + /* + * Trigger event checking and tell drivers to flush MEDIA_CHANGE + * event. This is to ensure detection of media removal commanded + * from userland - e.g. eject(1). + */ + disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); + + mutex_unlock(&bdev->bd_mutex); + + __blkdev_put(bdev, mode, 0); +} +EXPORT_SYMBOL(blkdev_put); + +static int blkdev_close(struct inode * inode, struct file * filp) +{ + struct block_device *bdev = I_BDEV(filp->f_mapping->host); + blkdev_put(bdev, filp->f_mode); + return 0; +} + +static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct block_device *bdev = I_BDEV(file->f_mapping->host); + fmode_t mode = file->f_mode; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; + + return blkdev_ioctl(bdev, mode, cmd, arg); +} + +/* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. + */ +ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = file->f_mapping->host; + loff_t size = i_size_read(bd_inode); + struct blk_plug plug; + ssize_t ret; + + if (bdev_read_only(I_BDEV(bd_inode))) + return -EPERM; + + if (!iov_iter_count(from)) + return 0; + + if (iocb->ki_pos >= size) + return -ENOSPC; + + iov_iter_truncate(from, size - iocb->ki_pos); + + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + if (ret > 0) { + ssize_t err; + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + blk_finish_plug(&plug); + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_write_iter); + +ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = file->f_mapping->host; + loff_t size = i_size_read(bd_inode); + loff_t pos = iocb->ki_pos; + + if (pos >= size) + return 0; + + size -= pos; + iov_iter_truncate(to, size); + return generic_file_read_iter(iocb, to); +} +EXPORT_SYMBOL_GPL(blkdev_read_iter); + +/* + * Try to release a page associated with block device when the system + * is under memory pressure. + */ +static int blkdev_releasepage(struct page *page, gfp_t wait) +{ + struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; + + if (super && super->s_op->bdev_try_to_free_page) + return super->s_op->bdev_try_to_free_page(super, page, wait); + + return try_to_free_buffers(page); +} + +static const struct address_space_operations def_blk_aops = { + .readpage = blkdev_readpage, + .readpages = blkdev_readpages, + .writepage = blkdev_writepage, + .write_begin = blkdev_write_begin, + .write_end = blkdev_write_end, + .writepages = generic_writepages, + .releasepage = blkdev_releasepage, + .direct_IO = blkdev_direct_IO, + .is_dirty_writeback = buffer_check_dirty_writeback, +}; + +const struct file_operations def_blk_fops = { + .open = blkdev_open, + .release = blkdev_close, + .llseek = block_llseek, + .read_iter = blkdev_read_iter, + .write_iter = blkdev_write_iter, + .mmap = generic_file_mmap, + .fsync = blkdev_fsync, + .unlocked_ioctl = block_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_blkdev_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +}; + +int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) +{ + int res; + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + res = blkdev_ioctl(bdev, 0, cmd, arg); + set_fs(old_fs); + return res; +} + +EXPORT_SYMBOL(ioctl_by_bdev); + +/** + * lookup_bdev - lookup a struct block_device by name + * @pathname: special file representing the block device + * + * Get a reference to the blockdevice at @pathname in the current + * namespace if possible and return it. Return ERR_PTR(error) + * otherwise. + */ +struct block_device *lookup_bdev(const char *pathname) +{ + struct block_device *bdev; + struct inode *inode; + struct path path; + int error; + + if (!pathname || !*pathname) + return ERR_PTR(-EINVAL); + + error = kern_path(pathname, LOOKUP_FOLLOW, &path); + if (error) + return ERR_PTR(error); + + inode = d_backing_inode(path.dentry); + error = -ENOTBLK; + if (!S_ISBLK(inode->i_mode)) + goto fail; + error = -EACCES; + if (path.mnt->mnt_flags & MNT_NODEV) + goto fail; + error = -ENOMEM; + bdev = bd_acquire(inode); + if (!bdev) + goto fail; +out: + path_put(&path); + return bdev; +fail: + bdev = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL(lookup_bdev); + +int __invalidate_device(struct block_device *bdev, bool kill_dirty) +{ + struct super_block *sb = get_super(bdev); + int res = 0; + + if (sb) { + /* + * no need to lock the super, get_super holds the + * read mutex so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb, kill_dirty); + drop_super(sb); + } + invalidate_bdev(bdev); + return res; +} +EXPORT_SYMBOL(__invalidate_device); + +void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +{ + struct inode *inode, *old_inode = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { + struct address_space *mapping = inode->i_mapping; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || + mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ + iput(old_inode); + old_inode = inode; + + func(I_BDEV(inode), arg); + + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(old_inode); +} diff --git a/kernel/fs/btrfs/Kconfig b/kernel/fs/btrfs/Kconfig new file mode 100644 index 000000000..80e9c18ea --- /dev/null +++ b/kernel/fs/btrfs/Kconfig @@ -0,0 +1,91 @@ +config BTRFS_FS + tristate "Btrfs filesystem support" + select CRYPTO + select CRYPTO_CRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS + select RAID6_PQ + select XOR_BLOCKS + select SRCU + + help + Btrfs is a general purpose copy-on-write filesystem with extents, + writable snapshotting, support for multiple devices and many more + features focused on fault tolerance, repair and easy administration. + + The filesystem disk format is no longer unstable, and it's not + expected to change unless there are strong reasons to do so. If there + is a format change, file systems with a unchanged format will + continue to be mountable and usable by newer kernels. + + For more information, please see the web pages at + http://btrfs.wiki.kernel.org. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + +config BTRFS_FS_POSIX_ACL + bool "Btrfs POSIX Access Control Lists" + depends on BTRFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config BTRFS_FS_CHECK_INTEGRITY + bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + depends on BTRFS_FS + help + Adds code that examines all block write requests (including + writes of the super block). The goal is to verify that the + state of the filesystem on disk is always consistent, i.e., + after a power-loss or kernel panic event the filesystem is + in a consistent state. + + If the integrity check tool is included and activated in + the mount options, plenty of kernel memory is used, and + plenty of additional CPU cycles are spent. Enabling this + functionality is not intended for normal use. + + In most cases, unless you are a btrfs developer who needs + to verify the integrity of (super)-block write requests + during the run of a regression test, say N + +config BTRFS_FS_RUN_SANITY_TESTS + bool "Btrfs will run sanity tests upon loading" + depends on BTRFS_FS + help + This will run some basic sanity tests on the free space cache + code to make sure it is acting as it should. These are mostly + regression tests and are only really interesting to btrfs + developers. + + If unsure, say N. + +config BTRFS_DEBUG + bool "Btrfs debugging support" + depends on BTRFS_FS + help + Enable run-time debugging support for the btrfs filesystem. This may + enable additional and expensive checks with negative impact on + performance, or export extra information via sysfs. + + If unsure, say N. + +config BTRFS_ASSERT + bool "Btrfs assert support" + depends on BTRFS_FS + help + Enable run-time assertion checking. This will result in panics if + any of the assertions trip. This is meant for btrfs developers only. + + If unsure, say N. diff --git a/kernel/fs/btrfs/Makefile b/kernel/fs/btrfs/Makefile new file mode 100644 index 000000000..6d1d0b93b --- /dev/null +++ b/kernel/fs/btrfs/Makefile @@ -0,0 +1,19 @@ + +obj-$(CONFIG_BTRFS_FS) := btrfs.o + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + export.o tree-log.o free-space-cache.o zlib.o lzo.o \ + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + uuid-tree.o props.o hash.o + +btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o +btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o + +btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ + tests/extent-buffer-tests.o tests/btrfs-tests.o \ + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o diff --git a/kernel/fs/btrfs/acl.c b/kernel/fs/btrfs/acl.c new file mode 100644 index 000000000..9a0124a95 --- /dev/null +++ b/kernel/fs/btrfs/acl.c @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + } + if (size > 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ + acl = NULL; + } else { + acl = ERR_PTR(-EIO); + } + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * Needs to be called with fs_mutex held + */ +static int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + char *value = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &inode->i_mode); + if (ret < 0) + return ret; + if (ret == 0) + acl = NULL; + } + ret = 0; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(trans, inode, name, value, size, 0); +out: + kfree(value); + + if (!ret) + set_cached_acl(inode, type, acl); + + return ret; +} + +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + return __btrfs_set_acl(NULL, inode, acl, type); +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. + */ +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (ret) + return ret; + + if (default_acl) { + ret = __btrfs_set_acl(trans, inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + + if (acl) { + if (!ret) + ret = __btrfs_set_acl(trans, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + + if (!default_acl && !acl) + cache_no_acl(inode); + return ret; +} diff --git a/kernel/fs/btrfs/async-thread.c b/kernel/fs/btrfs/async-thread.c new file mode 100644 index 000000000..df9932b00 --- /dev/null +++ b/kernel/fs/btrfs/async-thread.c @@ -0,0 +1,365 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "async-thread.h" +#include "ctree.h" + +#define WORK_DONE_BIT 0 +#define WORK_ORDER_DONE_BIT 1 +#define WORK_HIGH_PRIO_BIT 2 + +#define NO_THRESHOLD (-1) +#define DFT_THRESHOLD (32) + +struct __btrfs_workqueue { + struct workqueue_struct *normal_wq; + /* List head pointing to ordered work list */ + struct list_head ordered_list; + + /* Spinlock for ordered_list */ + spinlock_t list_lock; + + /* Thresholding related variants */ + atomic_t pending; + int max_active; + int current_max; + int thresh; + unsigned int count; + spinlock_t thres_lock; +}; + +struct btrfs_workqueue { + struct __btrfs_workqueue *normal; + struct __btrfs_workqueue *high; +}; + +static void normal_work_helper(struct btrfs_work *work); + +#define BTRFS_WORK_HELPER(name) \ +void btrfs_##name(struct work_struct *arg) \ +{ \ + struct btrfs_work *work = container_of(arg, struct btrfs_work, \ + normal_work); \ + normal_work_helper(work); \ +} + +BTRFS_WORK_HELPER(worker_helper); +BTRFS_WORK_HELPER(delalloc_helper); +BTRFS_WORK_HELPER(flush_delalloc_helper); +BTRFS_WORK_HELPER(cache_helper); +BTRFS_WORK_HELPER(submit_helper); +BTRFS_WORK_HELPER(fixup_helper); +BTRFS_WORK_HELPER(endio_helper); +BTRFS_WORK_HELPER(endio_meta_helper); +BTRFS_WORK_HELPER(endio_meta_write_helper); +BTRFS_WORK_HELPER(endio_raid56_helper); +BTRFS_WORK_HELPER(endio_repair_helper); +BTRFS_WORK_HELPER(rmw_helper); +BTRFS_WORK_HELPER(endio_write_helper); +BTRFS_WORK_HELPER(freespace_write_helper); +BTRFS_WORK_HELPER(delayed_meta_helper); +BTRFS_WORK_HELPER(readahead_helper); +BTRFS_WORK_HELPER(qgroup_rescan_helper); +BTRFS_WORK_HELPER(extent_refs_helper); +BTRFS_WORK_HELPER(scrub_helper); +BTRFS_WORK_HELPER(scrubwrc_helper); +BTRFS_WORK_HELPER(scrubnc_helper); + +static struct __btrfs_workqueue * +__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, + int thresh) +{ + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + + if (!ret) + return NULL; + + ret->max_active = max_active; + atomic_set(&ret->pending, 0); + if (thresh == 0) + thresh = DFT_THRESHOLD; + /* For low threshold, disabling threshold is a better choice */ + if (thresh < DFT_THRESHOLD) { + ret->current_max = max_active; + ret->thresh = NO_THRESHOLD; + } else { + ret->current_max = 1; + ret->thresh = thresh; + } + + if (flags & WQ_HIGHPRI) + ret->normal_wq = alloc_workqueue("%s-%s-high", flags, + ret->max_active, + "btrfs", name); + else + ret->normal_wq = alloc_workqueue("%s-%s", flags, + ret->max_active, "btrfs", + name); + if (!ret->normal_wq) { + kfree(ret); + return NULL; + } + + INIT_LIST_HEAD(&ret->ordered_list); + spin_lock_init(&ret->list_lock); + spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); + return ret; +} + +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); + +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + unsigned int flags, + int max_active, + int thresh) +{ + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + + if (!ret) + return NULL; + + ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + max_active, thresh); + if (!ret->normal) { + kfree(ret); + return NULL; + } + + if (flags & WQ_HIGHPRI) { + ret->high = __btrfs_alloc_workqueue(name, flags, max_active, + thresh); + if (!ret->high) { + __btrfs_destroy_workqueue(ret->normal); + kfree(ret); + return NULL; + } + } + return ret; +} + +/* + * Hook for threshold which will be called in btrfs_queue_work. + * This hook WILL be called in IRQ handler context, + * so workqueue_set_max_active MUST NOT be called in this hook + */ +static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) +{ + if (wq->thresh == NO_THRESHOLD) + return; + atomic_inc(&wq->pending); +} + +/* + * Hook for threshold which will be called before executing the work, + * This hook is called in kthread content. + * So workqueue_set_max_active is called here. + */ +static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) +{ + int new_max_active; + long pending; + int need_change = 0; + + if (wq->thresh == NO_THRESHOLD) + return; + + atomic_dec(&wq->pending); + spin_lock(&wq->thres_lock); + /* + * Use wq->count to limit the calling frequency of + * workqueue_set_max_active. + */ + wq->count++; + wq->count %= (wq->thresh / 4); + if (!wq->count) + goto out; + new_max_active = wq->current_max; + + /* + * pending may be changed later, but it's OK since we really + * don't need it so accurate to calculate new_max_active. + */ + pending = atomic_read(&wq->pending); + if (pending > wq->thresh) + new_max_active++; + if (pending < wq->thresh / 2) + new_max_active--; + new_max_active = clamp_val(new_max_active, 1, wq->max_active); + if (new_max_active != wq->current_max) { + need_change = 1; + wq->current_max = new_max_active; + } +out: + spin_unlock(&wq->thres_lock); + + if (need_change) { + workqueue_set_max_active(wq->normal_wq, wq->current_max); + } +} + +static void run_ordered_work(struct __btrfs_workqueue *wq) +{ + struct list_head *list = &wq->ordered_list; + struct btrfs_work *work; + spinlock_t *lock = &wq->list_lock; + unsigned long flags; + + while (1) { + spin_lock_irqsave(lock, flags); + if (list_empty(list)) + break; + work = list_entry(list->next, struct btrfs_work, + ordered_list); + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* + * we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + trace_btrfs_ordered_sched(work); + spin_unlock_irqrestore(lock, flags); + work->ordered_func(work); + + /* now take the lock again and drop our item from the list */ + spin_lock_irqsave(lock, flags); + list_del(&work->ordered_list); + spin_unlock_irqrestore(lock, flags); + + /* + * we don't want to call the ordered free functions + * with the lock held though + */ + work->ordered_free(work); + trace_btrfs_all_work_done(work); + } + spin_unlock_irqrestore(lock, flags); +} + +static void normal_work_helper(struct btrfs_work *work) +{ + struct __btrfs_workqueue *wq; + int need_order = 0; + + /* + * We should not touch things inside work in the following cases: + * 1) after work->func() if it has no ordered_free + * Since the struct is freed in work->func(). + * 2) after setting WORK_DONE_BIT + * The work may be freed in other threads almost instantly. + * So we save the needed things here. + */ + if (work->ordered_func) + need_order = 1; + wq = work->wq; + + trace_btrfs_work_sched(work); + thresh_exec_hook(wq); + work->func(work); + if (need_order) { + set_bit(WORK_DONE_BIT, &work->flags); + run_ordered_work(wq); + } + if (!need_order) + trace_btrfs_all_work_done(work); +} + +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free) +{ + work->func = func; + work->ordered_func = ordered_func; + work->ordered_free = ordered_free; + INIT_WORK(&work->normal_work, uniq_func); + INIT_LIST_HEAD(&work->ordered_list); + work->flags = 0; +} + +static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, + struct btrfs_work *work) +{ + unsigned long flags; + + work->wq = wq; + thresh_queue_hook(wq); + if (work->ordered_func) { + spin_lock_irqsave(&wq->list_lock, flags); + list_add_tail(&work->ordered_list, &wq->ordered_list); + spin_unlock_irqrestore(&wq->list_lock, flags); + } + queue_work(wq->normal_wq, &work->normal_work); + trace_btrfs_work_queued(work); +} + +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work) +{ + struct __btrfs_workqueue *dest_wq; + + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) + dest_wq = wq->high; + else + dest_wq = wq->normal; + __btrfs_queue_work(dest_wq, work); +} + +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) +{ + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); + kfree(wq); +} + +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) +{ + if (!wq) + return; + if (wq->high) + __btrfs_destroy_workqueue(wq->high); + __btrfs_destroy_workqueue(wq->normal); + kfree(wq); +} + +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) +{ + if (!wq) + return; + wq->normal->max_active = max; + if (wq->high) + wq->high->max_active = max; +} + +void btrfs_set_work_high_priority(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} diff --git a/kernel/fs/btrfs/async-thread.h b/kernel/fs/btrfs/async-thread.h new file mode 100644 index 000000000..ec2ee477f --- /dev/null +++ b/kernel/fs/btrfs/async-thread.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ +#include + +struct btrfs_workqueue; +/* Internal use only */ +struct __btrfs_workqueue; +struct btrfs_work; +typedef void (*btrfs_func_t)(struct btrfs_work *arg); +typedef void (*btrfs_work_func_t)(struct work_struct *arg); + +struct btrfs_work { + btrfs_func_t func; + btrfs_func_t ordered_func; + btrfs_func_t ordered_free; + + /* Don't touch things below */ + struct work_struct normal_work; + struct list_head ordered_list; + struct __btrfs_workqueue *wq; + unsigned long flags; +}; + +#define BTRFS_WORK_HELPER_PROTO(name) \ +void btrfs_##name(struct work_struct *arg) + +BTRFS_WORK_HELPER_PROTO(worker_helper); +BTRFS_WORK_HELPER_PROTO(delalloc_helper); +BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper); +BTRFS_WORK_HELPER_PROTO(cache_helper); +BTRFS_WORK_HELPER_PROTO(submit_helper); +BTRFS_WORK_HELPER_PROTO(fixup_helper); +BTRFS_WORK_HELPER_PROTO(endio_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); +BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); +BTRFS_WORK_HELPER_PROTO(endio_repair_helper); +BTRFS_WORK_HELPER_PROTO(rmw_helper); +BTRFS_WORK_HELPER_PROTO(endio_write_helper); +BTRFS_WORK_HELPER_PROTO(freespace_write_helper); +BTRFS_WORK_HELPER_PROTO(delayed_meta_helper); +BTRFS_WORK_HELPER_PROTO(readahead_helper); +BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper); +BTRFS_WORK_HELPER_PROTO(extent_refs_helper); +BTRFS_WORK_HELPER_PROTO(scrub_helper); +BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); +BTRFS_WORK_HELPER_PROTO(scrubnc_helper); + +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + unsigned int flags, + int max_active, + int thresh); +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free); +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work); +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); +void btrfs_set_work_high_priority(struct btrfs_work *work); +#endif diff --git a/kernel/fs/btrfs/backref.c b/kernel/fs/btrfs/backref.c new file mode 100644 index 000000000..614aaa196 --- /dev/null +++ b/kernel/fs/btrfs/backref.c @@ -0,0 +1,1978 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" +#include "ulist.h" +#include "transaction.h" +#include "delayed-ref.h" +#include "locking.h" + +/* Just an arbitrary number so we can be sure this happened */ +#define BACKREF_FOUND_SHARED 6 + +struct extent_inode_elem { + u64 inum; + u64 offset; + struct extent_inode_elem *next; +}; + +static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, + struct btrfs_file_extent_item *fi, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 offset = 0; + struct extent_inode_elem *e; + + if (!btrfs_file_extent_compression(eb, fi) && + !btrfs_file_extent_encryption(eb, fi) && + !btrfs_file_extent_other_encoding(eb, fi)) { + u64 data_offset; + u64 data_len; + + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + offset = extent_item_pos - data_offset; + } + + e = kmalloc(sizeof(*e), GFP_NOFS); + if (!e) + return -ENOMEM; + + e->next = *eie; + e->inum = key->objectid; + e->offset = key->offset + offset; + *eie = e; + + return 0; +} + +static void free_inode_elem_list(struct extent_inode_elem *eie) +{ + struct extent_inode_elem *eie_next; + + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } +} + +static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int slot; + int nritems; + int extent_type; + int ret; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != wanted_disk_byte) + continue; + + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * this structure records all encountered refs on the way up to the root + */ +struct __prelim_ref { + struct list_head list; + u64 root_id; + struct btrfs_key key_for_search; + int level; + int count; + struct extent_inode_elem *inode_list; + u64 parent; + u64 wanted_disk_byte; +}; + +static struct kmem_cache *btrfs_prelim_ref_cache; + +int __init btrfs_prelim_ref_init(void) +{ + btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", + sizeof(struct __prelim_ref), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_prelim_ref_cache) + return -ENOMEM; + return 0; +} + +void btrfs_prelim_ref_exit(void) +{ + if (btrfs_prelim_ref_cache) + kmem_cache_destroy(btrfs_prelim_ref_cache); +} + +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + * block later to set a correct key + * + * delayed refs + * ============ + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | - | - + * key to resolve | - | y | y | y + * tree block logical | - | - | - | - + * root for resolving | y | y | y | y + * + * - column 1: we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | y | - + * key to resolve | - | - | - | y + * tree block logical | y | y | y | y + * root for resolving | - | y | y | y + * + * - column 1, 3: we've the parent -> done + * - column 2: we take the first key from the block to find the parent + * (see __add_missing_keys) + * - column 4: we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. + */ + +static int __add_prelim_ref(struct list_head *head, u64 root_id, + struct btrfs_key *key, int level, + u64 parent, u64 wanted_disk_byte, int count, + gfp_t gfp_mask) +{ + struct __prelim_ref *ref; + + if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID) + return 0; + + ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask); + if (!ref) + return -ENOMEM; + + ref->root_id = root_id; + if (key) + ref->key_for_search = *key; + else + memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + + ref->inode_list = NULL; + ref->level = level; + ref->count = count; + ref->parent = parent; + ref->wanted_disk_byte = wanted_disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, + struct ulist *parents, struct __prelim_ref *ref, + int level, u64 time_seq, const u64 *extent_item_pos, + u64 total_refs) +{ + int ret = 0; + int slot; + struct extent_buffer *eb; + struct btrfs_key key; + struct btrfs_key *key_for_search = &ref->key_for_search; + struct btrfs_file_extent_item *fi; + struct extent_inode_elem *eie = NULL, *old = NULL; + u64 disk_byte; + u64 wanted_disk_byte = ref->wanted_disk_byte; + u64 count = 0; + + if (level != 0) { + eb = path->nodes[level]; + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (ret < 0) + return ret; + return 0; + } + + /* + * We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot==nritems. In that case, go to the next leaf before we continue. + */ + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + ret = btrfs_next_old_leaf(root, path, time_seq); + + while (!ret && count < total_refs) { + eb = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(eb, &key, slot); + + if (key.objectid != key_for_search->objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + + if (disk_byte == wanted_disk_byte) { + eie = NULL; + old = NULL; + count++; + if (extent_item_pos) { + ret = check_extent_in_eb(&key, eb, fi, + *extent_item_pos, + &eie); + if (ret < 0) + break; + } + if (ret > 0) + goto next; + ret = ulist_add_merge_ptr(parents, eb->start, + eie, (void **)&old, GFP_NOFS); + if (ret < 0) + break; + if (!ret && extent_item_pos) { + while (old->next) + old = old->next; + old->next = eie; + } + eie = NULL; + } +next: + ret = btrfs_next_old_item(root, path, time_seq); + } + + if (ret > 0) + ret = 0; + else if (ret < 0) + free_inode_elem_list(eie); + return ret; +} + +/* + * resolve an indirect backref in the form (root_id, key, level) + * to a logical address + */ +static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 time_seq, + struct __prelim_ref *ref, + struct ulist *parents, + const u64 *extent_item_pos, u64 total_refs) +{ + struct btrfs_root *root; + struct btrfs_key root_key; + struct extent_buffer *eb; + int ret = 0; + int root_level; + int level = ref->level; + int index; + + root_key.objectid = ref->root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = PTR_ERR(root); + goto out; + } + + if (path->search_commit_root) + root_level = btrfs_header_level(root->commit_root); + else + root_level = btrfs_old_root_level(root, time_seq); + + if (root_level + 1 == level) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + goto out; + } + + path->lowest_level = level; + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + + /* root node has been locked, we can release @subvol_srcu safely here */ + srcu_read_unlock(&fs_info->subvol_srcu, index); + + pr_debug("search slot in root %llu (level %d, ref count %d) returned " + "%d for key (%llu %u %llu)\n", + ref->root_id, level, ref->count, ret, + ref->key_for_search.objectid, ref->key_for_search.type, + ref->key_for_search.offset); + if (ret < 0) + goto out; + + eb = path->nodes[level]; + while (!eb) { + if (WARN_ON(!level)) { + ret = 1; + goto out; + } + level--; + eb = path->nodes[level]; + } + + ret = add_all_parents(root, path, parents, ref, level, time_seq, + extent_item_pos, total_refs); +out: + path->lowest_level = 0; + btrfs_release_path(path); + return ret; +} + +/* + * resolve all indirect backrefs from the list + */ +static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 time_seq, + struct list_head *head, + const u64 *extent_item_pos, u64 total_refs, + u64 root_objectid) +{ + int err; + int ret = 0; + struct __prelim_ref *ref; + struct __prelim_ref *ref_safe; + struct __prelim_ref *new_ref; + struct ulist *parents; + struct ulist_node *node; + struct ulist_iterator uiter; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + /* + * _safe allows us to insert directly after the current item without + * iterating over the newly inserted items. + * we're also allowed to re-assign ref during iteration. + */ + list_for_each_entry_safe(ref, ref_safe, head, list) { + if (ref->parent) /* already direct */ + continue; + if (ref->count == 0) + continue; + if (root_objectid && ref->root_id != root_objectid) { + ret = BACKREF_FOUND_SHARED; + goto out; + } + err = __resolve_indirect_ref(fs_info, path, time_seq, ref, + parents, extent_item_pos, + total_refs); + /* + * we can only tolerate ENOENT,otherwise,we should catch error + * and return directly. + */ + if (err == -ENOENT) { + continue; + } else if (err) { + ret = err; + goto out; + } + + /* we put the first parent into the ref at hand */ + ULIST_ITER_INIT(&uiter); + node = ulist_next(parents, &uiter); + ref->parent = node ? node->val : 0; + ref->inode_list = node ? + (struct extent_inode_elem *)(uintptr_t)node->aux : NULL; + + /* additional parents require new refs being added here */ + while ((node = ulist_next(parents, &uiter))) { + new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache, + GFP_NOFS); + if (!new_ref) { + ret = -ENOMEM; + goto out; + } + memcpy(new_ref, ref, sizeof(*ref)); + new_ref->parent = node->val; + new_ref->inode_list = (struct extent_inode_elem *) + (uintptr_t)node->aux; + list_add(&new_ref->list, &ref->list); + } + ulist_reinit(parents); + } +out: + ulist_free(parents); + return ret; +} + +static inline int ref_for_same_block(struct __prelim_ref *ref1, + struct __prelim_ref *ref2) +{ + if (ref1->level != ref2->level) + return 0; + if (ref1->root_id != ref2->root_id) + return 0; + if (ref1->key_for_search.type != ref2->key_for_search.type) + return 0; + if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) + return 0; + if (ref1->key_for_search.offset != ref2->key_for_search.offset) + return 0; + if (ref1->parent != ref2->parent) + return 0; + + return 1; +} + +/* + * read tree blocks and add keys where required. + */ +static int __add_missing_keys(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + struct list_head *pos; + struct extent_buffer *eb; + + list_for_each(pos, head) { + struct __prelim_ref *ref; + ref = list_entry(pos, struct __prelim_ref, list); + + if (ref->parent) + continue; + if (ref->key_for_search.type) + continue; + BUG_ON(!ref->wanted_disk_byte); + eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, + 0); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + btrfs_tree_read_lock(eb); + if (btrfs_header_level(eb) == 0) + btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); + else + btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return 0; +} + +/* + * merge two lists of backrefs and adjust counts accordingly + * + * mode = 1: merge identical keys, if key is set + * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. + * additionally, we could even add a key range for the blocks we + * looked into to merge even more (-> replace unresolved refs by those + * having a parent). + * mode = 2: merge identical parents + */ +static void __merge_refs(struct list_head *head, int mode) +{ + struct list_head *pos1; + + list_for_each(pos1, head) { + struct list_head *n2; + struct list_head *pos2; + struct __prelim_ref *ref1; + + ref1 = list_entry(pos1, struct __prelim_ref, list); + + for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; + pos2 = n2, n2 = pos2->next) { + struct __prelim_ref *ref2; + struct __prelim_ref *xchg; + struct extent_inode_elem *eie; + + ref2 = list_entry(pos2, struct __prelim_ref, list); + + if (mode == 1) { + if (!ref_for_same_block(ref1, ref2)) + continue; + if (!ref1->parent && ref2->parent) { + xchg = ref1; + ref1 = ref2; + ref2 = xchg; + } + } else { + if (ref1->parent != ref2->parent) + continue; + } + + eie = ref1->inode_list; + while (eie && eie->next) + eie = eie->next; + if (eie) + eie->next = ref2->inode_list; + else + ref1->inode_list = ref2->inode_list; + ref1->count += ref2->count; + + list_del(&ref2->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref2); + } + + } +} + +/* + * add all currently queued delayed refs from this head whose seq nr is + * smaller or equal that seq to the list + */ +static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, + struct list_head *prefs, u64 *total_refs, + u64 inum) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + struct rb_node *n = &head->node.rb_node; + struct btrfs_key key; + struct btrfs_key op_key = {0}; + int sgn; + int ret = 0; + + if (extent_op && extent_op->update_key) + btrfs_disk_key_to_cpu(&op_key, &extent_op->key); + + spin_lock(&head->lock); + n = rb_first(&head->ref_root); + while (n) { + struct btrfs_delayed_ref_node *node; + node = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + n = rb_next(n); + if (node->seq > seq) + continue; + + switch (node->action) { + case BTRFS_ADD_DELAYED_EXTENT: + case BTRFS_UPDATE_DELAYED_HEAD: + WARN_ON(1); + continue; + case BTRFS_ADD_DELAYED_REF: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + default: + BUG_ON(1); + } + *total_refs += (node->ref_mod * sgn); + switch (node->type) { + case BTRFS_TREE_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, &op_key, + ref->level + 1, 0, node->bytenr, + node->ref_mod * sgn, GFP_ATOMIC); + break; + } + case BTRFS_SHARED_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, NULL, + ref->level + 1, ref->parent, + node->bytenr, + node->ref_mod * sgn, GFP_ATOMIC); + break; + } + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + + /* + * Found a inum that doesn't match our known inum, we + * know it's shared. + */ + if (inum && ref->objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, + node->bytenr, + node->ref_mod * sgn, GFP_ATOMIC); + break; + } + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ref->parent, node->bytenr, + node->ref_mod * sgn, GFP_ATOMIC); + break; + } + default: + WARN_ON(1); + } + if (ret) + break; + } + spin_unlock(&head->lock); + return ret; +} + +/* + * add all inline backrefs for bytenr to the list + */ +static int __add_inline_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + int *info_level, struct list_head *prefs, + u64 *total_refs, u64 inum) +{ + int ret = 0; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + unsigned long ptr; + unsigned long end; + struct btrfs_extent_item *ei; + u64 flags; + u64 item_size; + + /* + * enumerate all inline refs + */ + leaf = path->nodes[0]; + slot = path->slots[0]; + + item_size = btrfs_item_size_nr(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + *total_refs += btrfs_extent_refs(leaf, ei); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)ptr; + *info_level = btrfs_tree_block_level(leaf, info); + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) { + *info_level = found_key.offset; + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + u64 offset; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + + switch (type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, NULL, + *info_level + 1, offset, + bytenr, 1, GFP_NOFS); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, + bytenr, count, GFP_NOFS); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, offset, NULL, + *info_level + 1, 0, + bytenr, 1, GFP_NOFS); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + + if (inum && key.objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count, GFP_NOFS); + break; + } + default: + WARN_ON(1); + } + if (ret) + return ret; + ptr += btrfs_extent_inline_ref_size(type); + } + + return 0; +} + +/* + * add all non-inline backrefs for bytenr to the list + */ +static int __add_keyed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + int info_level, struct list_head *prefs, u64 inum) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + + while (1) { + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + break; + if (ret) { + ret = 0; + break; + } + + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid != bytenr) + break; + if (key.type < BTRFS_TREE_BLOCK_REF_KEY) + continue; + if (key.type > BTRFS_SHARED_DATA_REF_KEY) + break; + + switch (key.type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, NULL, + info_level + 1, key.offset, + bytenr, 1, GFP_NOFS); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = btrfs_item_ptr(leaf, slot, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, + bytenr, count, GFP_NOFS); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, key.offset, NULL, + info_level + 1, 0, + bytenr, 1, GFP_NOFS); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_data_ref); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + + if (inum && key.objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count, GFP_NOFS); + break; + } + default: + WARN_ON(1); + } + if (ret) + return ret; + + } + + return ret; +} + +/* + * this adds all existing backrefs (inline backrefs, backrefs and delayed + * refs) for the given bytenr to the refs list, merges duplicates and resolves + * indirect refs to their parent bytenr. + * When roots are found, they're added to the roots list + * + * NOTE: This can return values > 0 + * + * FIXME some caching might speed things up + */ +static int find_parent_nodes(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist *refs, + struct ulist *roots, const u64 *extent_item_pos, + u64 root_objectid, u64 inum) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + struct btrfs_delayed_ref_head *head; + int info_level = 0; + int ret; + struct list_head prefs_delayed; + struct list_head prefs; + struct __prelim_ref *ref; + struct extent_inode_elem *eie = NULL; + u64 total_refs = 0; + + INIT_LIST_HEAD(&prefs); + INIT_LIST_HEAD(&prefs_delayed); + + key.objectid = bytenr; + key.offset = (u64)-1; + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (!trans) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + + /* + * grab both a lock on the path and a lock on the delayed ref head. + * We need both to get a consistent picture of how the refs look + * at a specified point in time + */ +again: + head = NULL; + + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (trans && likely(trans->type != __TRANS_DUMMY)) { +#else + if (trans) { +#endif + /* + * look if there are updates for this ref queued and lock the + * head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + spin_unlock(&delayed_refs->lock); + ret = __add_delayed_refs(head, time_seq, + &prefs_delayed, &total_refs, + inum); + mutex_unlock(&head->mutex); + if (ret) + goto out; + } else { + spin_unlock(&delayed_refs->lock); + } + } + + if (path->slots[0]) { + struct extent_buffer *leaf; + int slot; + + path->slots[0]--; + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == bytenr && + (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY)) { + ret = __add_inline_refs(fs_info, path, bytenr, + &info_level, &prefs, + &total_refs, inum); + if (ret) + goto out; + ret = __add_keyed_refs(fs_info, path, bytenr, + info_level, &prefs, inum); + if (ret) + goto out; + } + } + btrfs_release_path(path); + + list_splice_init(&prefs_delayed, &prefs); + + ret = __add_missing_keys(fs_info, &prefs); + if (ret) + goto out; + + __merge_refs(&prefs, 1); + + ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, + extent_item_pos, total_refs, + root_objectid); + if (ret) + goto out; + + __merge_refs(&prefs, 2); + + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + WARN_ON(ref->count < 0); + if (roots && ref->count && ref->root_id && ref->parent == 0) { + if (root_objectid && ref->root_id != root_objectid) { + ret = BACKREF_FOUND_SHARED; + goto out; + } + + /* no parent == root of tree */ + ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + if (ret < 0) + goto out; + } + if (ref->count && ref->parent) { + if (extent_item_pos && !ref->inode_list && + ref->level == 0) { + struct extent_buffer *eb; + + eb = read_tree_block(fs_info->extent_root, + ref->parent, 0); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + ret = find_extent_in_eb(eb, bytenr, + *extent_item_pos, &eie); + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + if (ret < 0) + goto out; + ref->inode_list = eie; + } + ret = ulist_add_merge_ptr(refs, ref->parent, + ref->inode_list, + (void **)&eie, GFP_NOFS); + if (ret < 0) + goto out; + if (!ret && extent_item_pos) { + /* + * we've recorded that parent, so we must extend + * its inode list here + */ + BUG_ON(!eie); + while (eie->next) + eie = eie->next; + eie->next = ref->inode_list; + } + eie = NULL; + } + list_del(&ref->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref); + } + +out: + btrfs_free_path(path); + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref); + } + while (!list_empty(&prefs_delayed)) { + ref = list_first_entry(&prefs_delayed, struct __prelim_ref, + list); + list_del(&ref->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref); + } + if (ret < 0) + free_inode_elem_list(eie); + return ret; +} + +static void free_leaf_list(struct ulist *blocks) +{ + struct ulist_node *node = NULL; + struct extent_inode_elem *eie; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(blocks, &uiter))) { + if (!node->aux) + continue; + eie = (struct extent_inode_elem *)(uintptr_t)node->aux; + free_inode_elem_list(eie); + node->aux = 0; + } + + ulist_free(blocks); +} + +/* + * Finds all leafs with a reference to the specified combination of bytenr and + * offset. key_list_head will point to a list of corresponding keys (caller must + * free each list element). The leafs will be stored in the leafs ulist, which + * must be freed with ulist_free. + * + * returns 0 on success, <0 on error + */ +static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **leafs, + const u64 *extent_item_pos) +{ + int ret; + + *leafs = ulist_alloc(GFP_NOFS); + if (!*leafs) + return -ENOMEM; + + ret = find_parent_nodes(trans, fs_info, bytenr, + time_seq, *leafs, NULL, extent_item_pos, 0, 0); + if (ret < 0 && ret != -ENOENT) { + free_leaf_list(*leafs); + return ret; + } + + return 0; +} + +/* + * walk all backrefs for a given extent to find all roots that reference this + * extent. Walking a backref means finding all extents that reference this + * extent and in turn walk the backrefs of those, too. Naturally this is a + * recursive process, but here it is implemented in an iterative fashion: We + * find all referencing extents for the extent in question and put them on a + * list. In turn, we find all referencing extents for those, further appending + * to the list. The way we iterate the list allows adding more elements after + * the current while iterating. The process stops when we reach the end of the + * list. Found roots are added to the roots list. + * + * returns 0 on success, < 0 on error. + */ +static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) +{ + struct ulist *tmp; + struct ulist_node *node = NULL; + struct ulist_iterator uiter; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *roots = ulist_alloc(GFP_NOFS); + if (!*roots) { + ulist_free(tmp); + return -ENOMEM; + } + + ULIST_ITER_INIT(&uiter); + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, + time_seq, tmp, *roots, NULL, 0, 0); + if (ret < 0 && ret != -ENOENT) { + ulist_free(tmp); + ulist_free(*roots); + return ret; + } + node = ulist_next(tmp, &uiter); + if (!node) + break; + bytenr = node->val; + cond_resched(); + } + + ulist_free(tmp); + return 0; +} + +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) +{ + int ret; + + if (!trans) + down_read(&fs_info->commit_root_sem); + ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots); + if (!trans) + up_read(&fs_info->commit_root_sem); + return ret; +} + +/** + * btrfs_check_shared - tell us whether an extent is shared + * + * @trans: optional trans handle + * + * btrfs_check_shared uses the backref walking code but will short + * circuit as soon as it finds a root or inode that doesn't match the + * one passed in. This provides a significant performance benefit for + * callers (such as fiemap) which want to know whether the extent is + * shared but do not need a ref count. + * + * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. + */ +int btrfs_check_shared(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 root_objectid, + u64 inum, u64 bytenr) +{ + struct ulist *tmp = NULL; + struct ulist *roots = NULL; + struct ulist_iterator uiter; + struct ulist_node *node; + struct seq_list elem = SEQ_LIST_INIT(elem); + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + roots = ulist_alloc(GFP_NOFS); + if (!tmp || !roots) { + ulist_free(tmp); + ulist_free(roots); + return -ENOMEM; + } + + if (trans) + btrfs_get_tree_mod_seq(fs_info, &elem); + else + down_read(&fs_info->commit_root_sem); + ULIST_ITER_INIT(&uiter); + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, + roots, NULL, root_objectid, inum); + if (ret == BACKREF_FOUND_SHARED) { + /* this is the only condition under which we return 1 */ + ret = 1; + break; + } + if (ret < 0 && ret != -ENOENT) + break; + ret = 0; + node = ulist_next(tmp, &uiter); + if (!node) + break; + bytenr = node->val; + cond_resched(); + } + if (trans) + btrfs_put_tree_mod_seq(fs_info, &elem); + else + up_read(&fs_info->commit_root_sem); + ulist_free(tmp); + ulist_free(roots); + return ret; +} + +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off) +{ + int ret, slot; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + unsigned long ptr; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = start_off; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If the item at offset is not found, + * btrfs_search_slot will point us to the slot + * where it should be inserted. In our case + * that will be the slot directly before the + * next INODE_REF_KEY_V2 item. In the case + * that we're pointing to the last slot in a + * leaf, we must move one leaf over. + */ + ret = btrfs_next_leaf(root, path); + if (ret) { + if (ret >= 1) + ret = -ENOENT; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* + * Check that we're still looking at an extended ref key for + * this particular objectid. If we have different + * objectid or type then there are no more to be found + * in the tree and we can exit. + */ + ret = -ENOENT; + if (found_key.objectid != inode_objectid) + break; + if (found_key.type != BTRFS_INODE_EXTREF_KEY) + break; + + ret = 0; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + extref = (struct btrfs_inode_extref *)ptr; + *ret_extref = extref; + if (found_off) + *found_off = found_key.offset; + break; + } + + return ret; +} + +/* + * this iterates to turn a name (from iref/extref) into a full filesystem path. + * Elements of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + int slot; + u64 next_inum; + int ret; + s64 bytes_left = ((s64)size) - 1; + struct extent_buffer *eb = eb_in; + struct btrfs_key found_key; + int leave_spinning = path->leave_spinning; + struct btrfs_inode_ref *iref; + + if (bytes_left >= 0) + dest[bytes_left] = '\0'; + + path->leave_spinning = 1; + while (1) { + bytes_left -= name_len; + if (bytes_left >= 0) + read_extent_buffer(eb, dest + bytes_left, + name_off, name_len); + if (eb != eb_in) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + } + ret = btrfs_find_item(fs_root, path, parent, 0, + BTRFS_INODE_REF_KEY, &found_key); + if (ret > 0) + ret = -ENOENT; + if (ret) + break; + + next_inum = found_key.offset; + + /* regular exit ahead */ + if (parent == next_inum) + break; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + if (eb != eb_in) { + atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } + btrfs_release_path(path); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + + parent = next_inum; + --bytes_left; + if (bytes_left >= 0) + dest[bytes_left] = '/'; + } + + btrfs_release_path(path); + path->leave_spinning = leave_spinning; + + if (ret) + return ERR_PTR(ret); + + return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. + */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags_ret) +{ + int ret; + u64 flags; + u64 size = 0; + u32 item_size; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; + } + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + if (found_key->type == BTRFS_METADATA_ITEM_KEY) + size = fs_info->extent_root->nodesize; + else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) + size = found_key->offset; + + if (found_key->objectid > logical || + found_key->objectid + size <= logical) { + pr_debug("logical %llu is not within any extent\n", logical); + return -ENOENT; + } + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + pr_debug("logical %llu is at position %llu within the extent (%llu " + "EXTENT_ITEM %llu) flags %#llx size %u\n", + logical, logical - found_key->objectid, found_key->objectid, + found_key->offset, flags, item_size); + + WARN_ON(!flags_ret); + if (flags_ret) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else if (flags & BTRFS_EXTENT_FLAG_DATA) + *flags_ret = BTRFS_EXTENT_FLAG_DATA; + else + BUG_ON(1); + return 0; + } + + return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * __get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. + * returns <0 on error + */ +static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_key *key, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) +{ + unsigned long end; + u64 flags; + struct btrfs_tree_block_info *info; + + if (!*ptr) { + /* first call */ + flags = btrfs_extent_flags(eb, ei); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (key->type == BTRFS_METADATA_ITEM_KEY) { + /* a skinny metadata extent */ + *out_eiref = + (struct btrfs_extent_inline_ref *)(ei + 1); + } else { + WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } + } else { + *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + *ptr = (unsigned long)*out_eiref; + if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size) + return -ENOENT; + } + + end = (unsigned long)ei + item_size; + *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); + *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + + *ptr += btrfs_extent_inline_ref_size(*out_type); + WARN_ON(*ptr > end); + if (*ptr == end) + return 1; /* last */ + + return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see __get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level) +{ + int ret; + int type; + struct btrfs_extent_inline_ref *eiref; + + if (*ptr == (unsigned long)-1) + return 1; + + while (1) { + ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size, + &eiref, &type); + if (ret < 0) + return ret; + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + break; + + if (ret == 1) + return 1; + } + + /* we can treat both ref types equally here */ + *out_root = btrfs_extent_inline_ref_offset(eb, eiref); + + if (key->type == BTRFS_EXTENT_ITEM_KEY) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_level = btrfs_tree_block_level(eb, info); + } else { + ASSERT(key->type == BTRFS_METADATA_ITEM_KEY); + *out_level = (u8)key->offset; + } + + if (ret == 1) + *ptr = (unsigned long)-1; + + return 0; +} + +static int iterate_leaf_refs(struct extent_inode_elem *inode_list, + u64 root, u64 extent_item_objectid, + iterate_extent_inodes_t *iterate, void *ctx) +{ + struct extent_inode_elem *eie; + int ret = 0; + + for (eie = inode_list; eie; eie = eie->next) { + pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " + "root %llu\n", extent_item_objectid, + eie->inum, eie->offset, root); + ret = iterate(eie->inum, eie->offset, root, ctx); + if (ret) { + pr_debug("stopping iteration for %llu due to ret=%d\n", + extent_item_objectid, ret); + break; + } + } + + return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. + * when the iterator function returns a non-zero value, iteration stops. + */ +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + u64 extent_item_objectid, u64 extent_item_pos, + int search_commit_root, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + struct btrfs_trans_handle *trans = NULL; + struct ulist *refs = NULL; + struct ulist *roots = NULL; + struct ulist_node *ref_node = NULL; + struct ulist_node *root_node = NULL; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); + struct ulist_iterator ref_uiter; + struct ulist_iterator root_uiter; + + pr_debug("resolving all inodes for extent %llu\n", + extent_item_objectid); + + if (!search_commit_root) { + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + } else { + down_read(&fs_info->commit_root_sem); + } + + ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, + tree_mod_seq_elem.seq, &refs, + &extent_item_pos); + if (ret) + goto out; + + ULIST_ITER_INIT(&ref_uiter); + while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val, + tree_mod_seq_elem.seq, &roots); + if (ret) + break; + ULIST_ITER_INIT(&root_uiter); + while (!ret && (root_node = ulist_next(roots, &root_uiter))) { + pr_debug("root %llu references leaf %llu, data list " + "%#llx\n", root_node->val, ref_node->val, + ref_node->aux); + ret = iterate_leaf_refs((struct extent_inode_elem *) + (uintptr_t)ref_node->aux, + root_node->val, + extent_item_objectid, + iterate, ctx); + } + ulist_free(roots); + } + + free_leaf_list(refs); +out: + if (!search_commit_root) { + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); + } else { + up_read(&fs_info->commit_root_sem); + } + + return ret; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + u64 extent_item_pos; + u64 flags = 0; + struct btrfs_key found_key; + int search_commit_root = path->search_commit_root; + + ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); + btrfs_release_path(path); + if (ret < 0) + return ret; + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return -EINVAL; + + extent_item_pos = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, search_commit_root, + iterate, ctx); + + return ret; +} + +typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx); + +static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret = 0; + int slot; + u32 cur; + u32 len; + u32 name_len; + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + + while (!ret) { + ret = btrfs_find_item(fs_root, path, inum, + parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY, + &found_key); + + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + parent = found_key.offset; + slot = path->slots[0]; + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_release_path(path); + + item = btrfs_item_nr(slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! */ + pr_debug("following ref at offset %u for inode %llu in " + "tree %llu\n", cur, found_key.objectid, + fs_root->objectid); + ret = iterate(parent, name_len, + (unsigned long)(iref + 1), eb, ctx); + if (ret) + break; + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + } + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + } + + btrfs_release_path(path); + + return ret; +} + +static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u64 offset = 0; + u64 parent; + int found = 0; + struct extent_buffer *eb; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u32 item_size; + u32 cur_offset; + unsigned long ptr; + + while (1) { + ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref, + &offset); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + slot = path->slots[0]; + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_release_path(path); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + cur_offset = 0; + + while (cur_offset < item_size) { + u32 name_len; + + extref = (struct btrfs_inode_extref *)(ptr + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + name_len = btrfs_inode_extref_name_len(eb, extref); + ret = iterate(parent, name_len, + (unsigned long)&extref->name, eb, ctx); + if (ret) + break; + + cur_offset += btrfs_inode_extref_name_len(leaf, extref); + cur_offset += sizeof(*extref); + } + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + + offset++; + } + + btrfs_release_path(path); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, iterate_irefs_t *iterate, + void *ctx) +{ + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx) +{ + struct inode_fs_paths *ipath = ctx; + char *fspath; + char *fspath_min; + int i = ipath->fspath->elem_cnt; + const int s_ptr = sizeof(char *); + u32 bytes_left; + + bytes_left = ipath->fspath->bytes_left > s_ptr ? + ipath->fspath->bytes_left - s_ptr : 0; + + fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; + fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, + name_off, eb, inum, fspath_min, bytes_left); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); + + if (fspath > fspath_min) { + ipath->fspath->val[i] = (u64)(unsigned long)fspath; + ++ipath->fspath->elem_cnt; + ipath->fspath->bytes_left = fspath - fspath_min; + } else { + ++ipath->fspath->elem_missed; + ipath->fspath->bytes_missing += fspath_min - fspath; + ipath->fspath->bytes_left = 0; + } + + return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. each path is zero-terminated and accessed + * from ipath->fspath->val[i]. + * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the + * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ + return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, + inode_to_path, ipath); +} + +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ + struct btrfs_data_container *data; + size_t alloc_bytes; + + alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); + data = vmalloc(alloc_bytes); + if (!data) + return ERR_PTR(-ENOMEM); + + if (total_bytes >= sizeof(*data)) { + data->bytes_left = total_bytes - sizeof(*data); + data->bytes_missing = 0; + } else { + data->bytes_missing = sizeof(*data) - total_bytes; + data->bytes_left = 0; + } + + data->elem_cnt = 0; + data->elem_missed = 0; + + return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct inode_fs_paths *ifp; + struct btrfs_data_container *fspath; + + fspath = init_data_container(total_bytes); + if (IS_ERR(fspath)) + return (void *)fspath; + + ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + if (!ifp) { + kfree(fspath); + return ERR_PTR(-ENOMEM); + } + + ifp->btrfs_path = path; + ifp->fspath = fspath; + ifp->fs_root = fs_root; + + return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ + if (!ipath) + return; + vfree(ipath->fspath); + kfree(ipath); +} diff --git a/kernel/fs/btrfs/backref.h b/kernel/fs/btrfs/backref.h new file mode 100644 index 000000000..9c41fbac3 --- /dev/null +++ b/kernel/fs/btrfs/backref.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_BACKREF__ +#define __BTRFS_BACKREF__ + +#include +#include "ulist.h" +#include "extent_io.h" + +struct inode_fs_paths { + struct btrfs_path *btrfs_path; + struct btrfs_root *fs_root; + struct btrfs_data_container *fspath; +}; + +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, + void *ctx); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + u64 extent_item_objectid, + u64 extent_offset, int search_commit_root, + iterate_extent_inodes_t *iterate, void *ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots); +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off); +int btrfs_check_shared(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 root_objectid, + u64 inum, u64 bytenr); + +int __init btrfs_prelim_ref_init(void); +void btrfs_prelim_ref_exit(void); +#endif diff --git a/kernel/fs/btrfs/btrfs_inode.h b/kernel/fs/btrfs/btrfs_inode.h new file mode 100644 index 000000000..0ef5cc13f --- /dev/null +++ b/kernel/fs/btrfs/btrfs_inode.h @@ -0,0 +1,328 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" +#include "delayed-inode.h" + +/* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + */ +#define BTRFS_INODE_ORDERED_DATA_CLOSE 0 +#define BTRFS_INODE_ORPHAN_META_RESERVED 1 +#define BTRFS_INODE_DUMMY 2 +#define BTRFS_INODE_IN_DEFRAG 3 +#define BTRFS_INODE_DELALLOC_META_RESERVED 4 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 +#define BTRFS_INODE_NEEDS_FULL_SYNC 7 +#define BTRFS_INODE_COPY_EVERYTHING 8 +#define BTRFS_INODE_IN_DELALLOC_LIST 9 +#define BTRFS_INODE_READDIO_NEED_LOCK 10 +#define BTRFS_INODE_HAS_PROPS 11 +/* + * The following 3 bits are meant only for the btree inode. + * When any of them is set, it means an error happened while writing an + * extent buffer belonging to: + * 1) a non-log btree + * 2) a log btree and first log sub-transaction + * 3) a log btree and second log sub-transaction + */ +#define BTRFS_INODE_BTREE_ERR 12 +#define BTRFS_INODE_BTREE_LOG1_ERR 13 +#define BTRFS_INODE_BTREE_LOG2_ERR 14 + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* + * Lock for counters and all fields used to determine if the inode is in + * the log or not (last_trans, last_sub_trans, last_log_commit, + * logged_trans). + */ + spinlock_t lock; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* held while doing delalloc reservations */ + struct mutex delalloc_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ + struct list_head delalloc_inodes; + + /* node for the red-black tree that links inodes in subvolume root */ + struct rb_node rb_node; + + unsigned long runtime_flags; + + /* Keep track of who's O_SYNC/fsyncing currently */ + atomic_t sync_writers; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* + * log transid when this inode was last modified + */ + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* + * total number of bytes pending defrag, used by stat to check whether + * it needs COW. + */ + u64 defrag_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* Cache the directory index number to speed the dir/file remove */ + u64 dir_index; + + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + + /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. + */ + u64 csum_bytes; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. + */ + unsigned outstanding_extents; + unsigned reserved_extents; + + /* + * always compress this one file + */ + unsigned force_compress; + + struct btrfs_delayed_node *delayed_node; + + /* File creation time. */ + struct timespec i_otime; + + struct inode vfs_inode; +}; + +extern unsigned char btrfs_filetype_table[]; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline unsigned long btrfs_inode_hash(u64 objectid, + const struct btrfs_root *root) +{ + u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME); + +#if BITS_PER_LONG == 32 + h = (h >> 32) ^ (h & 0xffffffff); +#endif + + return (unsigned long)h; +} + +static inline void btrfs_insert_inode_hash(struct inode *inode) +{ + unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root); + + __insert_inode_hash(inode, h); +} + +static inline u64 btrfs_ino(struct inode *inode) +{ + u64 ino = BTRFS_I(inode)->location.objectid; + + /* + * !ino: btree_inode + * type == BTRFS_ROOT_ITEM_KEY: subvol dir + */ + if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY) + ino = inode->i_ino; + return ino; +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + i_size_write(inode, size); + BTRFS_I(inode)->disk_i_size = size; +} + +static inline bool btrfs_is_free_space_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (root == root->fs_info->tree_root && + btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) + return true; + if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + return true; + return false; +} + +static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) +{ + int ret = 0; + + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->logged_trans == generation && + BTRFS_I(inode)->last_sub_trans <= + BTRFS_I(inode)->last_log_commit && + BTRFS_I(inode)->last_sub_trans <= + BTRFS_I(inode)->root->last_log_commit) { + /* + * After a ranged fsync we might have left some extent maps + * (that fall outside the fsync's range). So return false + * here if the list isn't empty, to make sure btrfs_log_inode() + * will be called and process those extent maps. + */ + smp_mb(); + if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) + ret = 1; + } + spin_unlock(&BTRFS_I(inode)->lock); + return ret; +} + +#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 + +struct btrfs_dio_private { + struct inode *inode; + unsigned long flags; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + /* orig_bio is our btrfs_io_bio */ + struct bio *orig_bio; + + /* dio_bio came from fs/direct-io.c */ + struct bio *dio_bio; + + /* + * The original bio may be splited to several sub-bios, this is + * done during endio of sub-bios + */ + int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); +}; + +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. + */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_atomic(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags); +} + +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); + +#endif diff --git a/kernel/fs/btrfs/check-integrity.c b/kernel/fs/btrfs/check-integrity.c new file mode 100644 index 000000000..ce7dec88f --- /dev/null +++ b/kernel/fs/btrfs/check-integrity.c @@ -0,0 +1,3245 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* + * This module can be used to catch cases when the btrfs kernel + * code executes write requests to the disk that bring the file + * system in an inconsistent state. In such a state, a power-loss + * or kernel panic event would cause that the data on disk is + * lost or at least damaged. + * + * Code is added that examines all block write requests during + * runtime (including writes of the super block). Three rules + * are verified and an error is printed on violation of the + * rules: + * 1. It is not allowed to write a disk block which is + * currently referenced by the super block (either directly + * or indirectly). + * 2. When a super block is written, it is verified that all + * referenced (directly or indirectly) blocks fulfill the + * following requirements: + * 2a. All referenced blocks have either been present when + * the file system was mounted, (i.e., they have been + * referenced by the super block) or they have been + * written since then and the write completion callback + * was called and no write error was indicated and a + * FLUSH request to the device where these blocks are + * located was received and completed. + * 2b. All referenced blocks need to have a generation + * number which is equal to the parent's number. + * + * One issue that was found using this module was that the log + * tree on disk became temporarily corrupted because disk blocks + * that had been in use for the log tree had been freed and + * reused too early, while being referenced by the written super + * block. + * + * The search term in the kernel log that can be used to filter + * on the existence of detected integrity issues is + * "btrfs: attempt". + * + * The integrity check is enabled via mount options. These + * mount options are only supported if the integrity check + * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. + * + * Example #1, apply integrity checks to all metadata: + * mount /dev/sdb1 /mnt -o check_int + * + * Example #2, apply integrity checks to all metadata and + * to data extents: + * mount /dev/sdb1 /mnt -o check_int_data + * + * Example #3, apply integrity checks to all metadata and dump + * the tree that the super block references to kernel messages + * each time after a super block was written: + * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 + * + * If the integrity check tool is included and activated in + * the mount options, plenty of kernel memory is used, and + * plenty of additional CPU cycles are spent. Enabling this + * functionality is not intended for normal use. In most + * cases, unless you are a btrfs developer who needs to verify + * the integrity of (super)-block write requests, do not + * enable the config option BTRFS_FS_CHECK_INTEGRITY to + * include and compile the integrity check tool. + * + * Expect millions of lines of information in the kernel log with an + * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the + * kernel config to at least 26 (which is 64MB). Usually the value is + * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be + * changed like this before LOG_BUF_SHIFT can be set to a high value: + * config LOG_BUF_SHIFT + * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + * range 12 30 + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" +#include "extent_io.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "check-integrity.h" +#include "rcu-string.h" + +#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 +#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 +#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 +#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 +#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 +#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, + * excluding " [...]" */ +#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) + +/* + * The definition of the bitmask fields for the print_mask. + * They are specified with the mount option check_integrity_print_mask. + */ +#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 +#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 +#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 +#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 +#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 +#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 +#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 +#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 +#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 +#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 +#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 +#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 + +struct btrfsic_dev_state; +struct btrfsic_state; + +struct btrfsic_block { + u32 magic_num; /* only used for debug purposes */ + unsigned int is_metadata:1; /* if it is meta-data, not data-data */ + unsigned int is_superblock:1; /* if it is one of the superblocks */ + unsigned int is_iodone:1; /* if is done by lower subsystem */ + unsigned int iodone_w_error:1; /* error was indicated to endio */ + unsigned int never_written:1; /* block was added because it was + * referenced, not because it was + * written */ + unsigned int mirror_num; /* large enough to hold + * BTRFS_SUPER_MIRROR_MAX */ + struct btrfsic_dev_state *dev_state; + u64 dev_bytenr; /* key, physical byte num on disk */ + u64 logical_bytenr; /* logical byte num on disk */ + u64 generation; + struct btrfs_disk_key disk_key; /* extra info to print in case of + * issues, will not always be correct */ + struct list_head collision_resolving_node; /* list node */ + struct list_head all_blocks_node; /* list node */ + + /* the following two lists contain block_link items */ + struct list_head ref_to_list; /* list */ + struct list_head ref_from_list; /* list */ + struct btrfsic_block *next_in_same_bio; + void *orig_bio_bh_private; + union { + bio_end_io_t *bio; + bh_end_io_t *bh; + } orig_bio_bh_end_io; + int submit_bio_bh_rw; + u64 flush_gen; /* only valid if !never_written */ +}; + +/* + * Elements of this type are allocated dynamically and required because + * each block object can refer to and can be ref from multiple blocks. + * The key to lookup them in the hashtable is the dev_bytenr of + * the block ref to plus the one from the block refered from. + * The fact that they are searchable via a hashtable and that a + * ref_cnt is maintained is not required for the btrfs integrity + * check algorithm itself, it is only used to make the output more + * beautiful in case that an error is detected (an error is defined + * as a write operation to a block while that block is still referenced). + */ +struct btrfsic_block_link { + u32 magic_num; /* only used for debug purposes */ + u32 ref_cnt; + struct list_head node_ref_to; /* list node */ + struct list_head node_ref_from; /* list node */ + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block *block_ref_to; + struct btrfsic_block *block_ref_from; + u64 parent_generation; +}; + +struct btrfsic_dev_state { + u32 magic_num; /* only used for debug purposes */ + struct block_device *bdev; + struct btrfsic_state *state; + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block dummy_block_for_bio_bh_flush; + u64 last_flush_gen; + char name[BDEVNAME_SIZE]; +}; + +struct btrfsic_block_hashtable { + struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_link_hashtable { + struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; +}; + +struct btrfsic_dev_state_hashtable { + struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_data_ctx { + u64 start; /* virtual bytenr */ + u64 dev_bytenr; /* physical bytenr on device */ + u32 len; + struct btrfsic_dev_state *dev; + char **datav; + struct page **pagev; + void *mem_to_free; +}; + +/* This structure is used to implement recursion without occupying + * any stack space, refer to btrfsic_process_metablock() */ +struct btrfsic_stack_frame { + u32 magic; + u32 nr; + int error; + int i; + int limit_nesting; + int num_copies; + int mirror_num; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx *block_ctx; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfs_header *hdr; + struct btrfsic_stack_frame *prev; +}; + +/* Some state per mounted filesystem */ +struct btrfsic_state { + u32 print_mask; + int include_extent_data; + int csum_size; + struct list_head all_blocks_list; + struct btrfsic_block_hashtable block_hashtable; + struct btrfsic_block_link_hashtable block_link_hashtable; + struct btrfs_root *root; + u64 max_superblock_generation; + struct btrfsic_block *latest_superblock; + u32 metablock_size; + u32 datablock_size; +}; + +static void btrfsic_block_init(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_alloc(void); +static void btrfsic_block_free(struct btrfsic_block *b); +static void btrfsic_block_link_init(struct btrfsic_block_link *n); +static struct btrfsic_block_link *btrfsic_block_link_alloc(void); +static void btrfsic_block_link_free(struct btrfsic_block_link *n); +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h); +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices); +static int btrfsic_process_metablock(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + int limit_nesting, int force_iodone_flag); +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dst, u32 offset, size_t len); +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx + *block_ctx, u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation); +static int btrfsic_handle_extent_data(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag); +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num); +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx); +static void btrfsic_dump_database(struct btrfsic_state *state); +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + char **datav, unsigned int num_pages); +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw); +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const block, + struct btrfs_super_block *const super_hdr); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); +static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level); +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level); +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block); +static void btrfsic_dump_tree(const struct btrfsic_state *state); +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level); +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation); +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created); +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev); +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr); + +static struct mutex btrfsic_mutex; +static int btrfsic_is_initialized; +static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; + + +static void btrfsic_block_init(struct btrfsic_block *b) +{ + b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; + b->dev_state = NULL; + b->dev_bytenr = 0; + b->logical_bytenr = 0; + b->generation = BTRFSIC_GENERATION_UNKNOWN; + b->disk_key.objectid = 0; + b->disk_key.type = 0; + b->disk_key.offset = 0; + b->is_metadata = 0; + b->is_superblock = 0; + b->is_iodone = 0; + b->iodone_w_error = 0; + b->never_written = 0; + b->mirror_num = 0; + b->next_in_same_bio = NULL; + b->orig_bio_bh_private = NULL; + b->orig_bio_bh_end_io.bio = NULL; + INIT_LIST_HEAD(&b->collision_resolving_node); + INIT_LIST_HEAD(&b->all_blocks_node); + INIT_LIST_HEAD(&b->ref_to_list); + INIT_LIST_HEAD(&b->ref_from_list); + b->submit_bio_bh_rw = 0; + b->flush_gen = 0; +} + +static struct btrfsic_block *btrfsic_block_alloc(void) +{ + struct btrfsic_block *b; + + b = kzalloc(sizeof(*b), GFP_NOFS); + if (NULL != b) + btrfsic_block_init(b); + + return b; +} + +static void btrfsic_block_free(struct btrfsic_block *b) +{ + BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); + kfree(b); +} + +static void btrfsic_block_link_init(struct btrfsic_block_link *l) +{ + l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; + l->ref_cnt = 1; + INIT_LIST_HEAD(&l->node_ref_to); + INIT_LIST_HEAD(&l->node_ref_from); + INIT_LIST_HEAD(&l->collision_resolving_node); + l->block_ref_to = NULL; + l->block_ref_from = NULL; +} + +static struct btrfsic_block_link *btrfsic_block_link_alloc(void) +{ + struct btrfsic_block_link *l; + + l = kzalloc(sizeof(*l), GFP_NOFS); + if (NULL != l) + btrfsic_block_link_init(l); + + return l; +} + +static void btrfsic_block_link_free(struct btrfsic_block_link *l) +{ + BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); + kfree(l); +} + +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) +{ + ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; + ds->bdev = NULL; + ds->state = NULL; + ds->name[0] = '\0'; + INIT_LIST_HEAD(&ds->collision_resolving_node); + ds->last_flush_gen = 0; + btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); + ds->dummy_block_for_bio_bh_flush.is_iodone = 1; + ds->dummy_block_for_bio_bh_flush.dev_state = ds; +} + +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) +{ + struct btrfsic_dev_state *ds; + + ds = kzalloc(sizeof(*ds), GFP_NOFS); + if (NULL != ds) + btrfsic_dev_state_init(ds); + + return ds; +} + +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) +{ + BUG_ON(!(NULL == ds || + BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); + kfree(ds); +} + +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(b->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)b->dev_state->bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + + list_add(&b->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) +{ + list_del(&b->collision_resolving_node); +} + +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block *const b = + list_entry(elem, struct btrfsic_block, + collision_resolving_node); + + if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) + return b; + } + + return NULL; +} + +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ + ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ + ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) + & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + list_add(&l->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) +{ + list_del(&l->collision_resolving_node); +} + +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ + ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ + ((unsigned int)((uintptr_t)bdev_ref_to)) ^ + ((unsigned int)((uintptr_t)bdev_ref_from))) & + (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block_link *const l = + list_entry(elem, struct btrfsic_block_link, + collision_resolving_node); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + if (l->block_ref_to->dev_state->bdev == bdev_ref_to && + l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && + l->block_ref_from->dev_state->bdev == bdev_ref_from && + l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) + return l; + } + + return NULL; +} + +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)ds->bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + + list_add(&ds->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) +{ + list_del(&ds->collision_resolving_node); +} + +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_dev_state *const ds = + list_entry(elem, struct btrfsic_dev_state, + collision_resolving_node); + + if (ds->bdev == bdev) + return ds; + } + + return NULL; +} + +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices) +{ + int ret = 0; + struct btrfs_super_block *selected_super; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfsic_dev_state *selected_dev_state = NULL; + int pass; + + BUG_ON(NULL == state); + selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); + if (NULL == selected_super) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return -1; + } + + list_for_each_entry(device, dev_head, dev_list) { + int i; + struct btrfsic_dev_state *dev_state; + + if (!device->bdev || !device->name) + continue; + + dev_state = btrfsic_dev_state_lookup(device->bdev); + BUG_ON(NULL == dev_state); + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + ret = btrfsic_process_superblock_dev_mirror( + state, dev_state, device, i, + &selected_dev_state, selected_super); + if (0 != ret && 0 == i) { + kfree(selected_super); + return ret; + } + } + } + + if (NULL == state->latest_superblock) { + printk(KERN_INFO "btrfsic: no superblock found!\n"); + kfree(selected_super); + return -1; + } + + state->csum_size = btrfs_super_csum_size(selected_super); + + for (pass = 0; pass < 3; pass++) { + int num_copies; + int mirror_num; + u64 next_bytenr; + + switch (pass) { + case 0: + next_bytenr = btrfs_super_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", next_bytenr); + break; + case 1: + next_bytenr = btrfs_super_chunk_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", next_bytenr); + break; + case 2: + next_bytenr = btrfs_super_log_root(selected_super); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(state->root->fs_info, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(root @%llu," + " mirror %d) failed!\n", + next_bytenr, mirror_num); + kfree(selected_super); + return -1; + } + + next_block = btrfsic_block_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + &state->block_hashtable); + BUG_ON(NULL == next_block); + + l = btrfsic_block_link_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + state->latest_superblock->dev_state-> + bdev, + state->latest_superblock->dev_bytenr, + &state->block_link_hashtable); + BUG_ON(NULL == l); + + ret = btrfsic_read_block(state, &tmp_next_block_ctx); + if (ret < (int)PAGE_CACHE_SIZE) { + printk(KERN_INFO + "btrfsic: read @logical %llu failed!\n", + tmp_next_block_ctx.start); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + kfree(selected_super); + return -1; + } + + ret = btrfsic_process_metablock(state, + next_block, + &tmp_next_block_ctx, + BTRFS_MAX_LEVEL + 3, 1); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + } + } + + kfree(selected_super); + return ret; +} + +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super) +{ + struct btrfs_super_block *super_tmp; + u64 dev_bytenr; + struct buffer_head *bh; + struct btrfsic_block *superblock_tmp; + int pass; + struct block_device *const superblock_bdev = device->bdev; + + /* super block bytenr is always the unmapped device bytenr */ + dev_bytenr = btrfs_sb_offset(superblock_mirror_num); + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) + return -1; + bh = __bread(superblock_bdev, dev_bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (NULL == bh) + return -1; + super_tmp = (struct btrfs_super_block *) + (bh->b_data + (dev_bytenr & 4095)); + + if (btrfs_super_bytenr(super_tmp) != dev_bytenr || + btrfs_super_magic(super_tmp) != BTRFS_MAGIC || + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || + btrfs_super_nodesize(super_tmp) != state->metablock_size || + btrfs_super_sectorsize(super_tmp) != state->datablock_size) { + brelse(bh); + return 0; + } + + superblock_tmp = + btrfsic_block_hashtable_lookup(superblock_bdev, + dev_bytenr, + &state->block_hashtable); + if (NULL == superblock_tmp) { + superblock_tmp = btrfsic_block_alloc(); + if (NULL == superblock_tmp) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + brelse(bh); + return -1; + } + /* for superblock, only the dev_bytenr makes sense */ + superblock_tmp->dev_bytenr = dev_bytenr; + superblock_tmp->dev_state = dev_state; + superblock_tmp->logical_bytenr = dev_bytenr; + superblock_tmp->generation = btrfs_super_generation(super_tmp); + superblock_tmp->is_metadata = 1; + superblock_tmp->is_superblock = 1; + superblock_tmp->is_iodone = 1; + superblock_tmp->never_written = 0; + superblock_tmp->mirror_num = 1 + superblock_mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, + rcu_str_deref(device->name), dev_bytenr, + dev_state->name, dev_bytenr, + superblock_mirror_num); + list_add(&superblock_tmp->all_blocks_node, + &state->all_blocks_list); + btrfsic_block_hashtable_add(superblock_tmp, + &state->block_hashtable); + } + + /* select the one with the highest generation field */ + if (btrfs_super_generation(super_tmp) > + state->max_superblock_generation || + 0 == state->max_superblock_generation) { + memcpy(selected_super, super_tmp, sizeof(*selected_super)); + *selected_dev_state = dev_state; + state->max_superblock_generation = + btrfs_super_generation(super_tmp); + state->latest_superblock = superblock_tmp; + } + + for (pass = 0; pass < 3; pass++) { + u64 next_bytenr; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + switch (pass) { + case 0: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); + additional_string = "initial root "; + next_bytenr = btrfs_super_root(super_tmp); + break; + case 1: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "initial chunk "; + next_bytenr = btrfs_super_chunk_root(super_tmp); + break; + case 2: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); + additional_string = "initial log "; + next_bytenr = btrfs_super_log_root(super_tmp); + if (0 == next_bytenr) + continue; + break; + } + + num_copies = + btrfs_num_copies(state->root->fs_info, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + if (btrfsic_map_block(state, next_bytenr, + state->metablock_size, + &tmp_next_block_ctx, + mirror_num)) { + printk(KERN_INFO "btrfsic: btrfsic_map_block(" + "bytenr @%llu, mirror %d) failed!\n", + next_bytenr, mirror_num); + brelse(bh); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, &tmp_next_block_ctx, + additional_string, 1, 1, 0, + mirror_num, NULL); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + brelse(bh); + return -1; + } + + next_block->disk_key = tmp_disk_key; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, &tmp_next_block_ctx, + next_block, superblock_tmp, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) { + brelse(bh); + return -1; + } + } + } + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) + btrfsic_dump_tree_sub(state, superblock_tmp, 0); + + brelse(bh); + return 0; +} + +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) +{ + struct btrfsic_stack_frame *sf; + + sf = kzalloc(sizeof(*sf), GFP_NOFS); + if (NULL == sf) + printk(KERN_INFO "btrfsic: alloc memory failed!\n"); + else + sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; + return sf; +} + +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) +{ + BUG_ON(!(NULL == sf || + BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); + kfree(sf); +} + +static int btrfsic_process_metablock( + struct btrfsic_state *state, + struct btrfsic_block *const first_block, + struct btrfsic_block_data_ctx *const first_block_ctx, + int first_limit_nesting, int force_iodone_flag) +{ + struct btrfsic_stack_frame initial_stack_frame = { 0 }; + struct btrfsic_stack_frame *sf; + struct btrfsic_stack_frame *next_stack; + struct btrfs_header *const first_hdr = + (struct btrfs_header *)first_block_ctx->datav[0]; + + BUG_ON(!first_hdr); + sf = &initial_stack_frame; + sf->error = 0; + sf->i = -1; + sf->limit_nesting = first_limit_nesting; + sf->block = first_block; + sf->block_ctx = first_block_ctx; + sf->next_block = NULL; + sf->hdr = first_hdr; + sf->prev = NULL; + +continue_with_new_stack_frame: + sf->block->generation = le64_to_cpu(sf->hdr->generation); + if (0 == sf->hdr->level) { + struct btrfs_leaf *const leafhdr = + (struct btrfs_leaf *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = btrfs_stack_header_nritems(&leafhdr->header); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "leaf %llu items %d generation %llu" + " owner %llu\n", + sf->block_ctx->start, sf->nr, + btrfs_stack_header_generation( + &leafhdr->header), + btrfs_stack_header_owner( + &leafhdr->header)); + } + +continue_with_current_leaf_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_item disk_item; + u32 disk_item_offset = + (uintptr_t)(leafhdr->items + sf->i) - + (uintptr_t)leafhdr; + struct btrfs_disk_key *disk_key; + u8 type; + u32 item_offset; + u32 item_size; + + if (disk_item_offset + sizeof(struct btrfs_item) > + sf->block_ctx->len) { +leaf_item_out_of_bounce_error: + printk(KERN_INFO + "btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data(sf->block_ctx, + &disk_item, + disk_item_offset, + sizeof(struct btrfs_item)); + item_offset = btrfs_stack_item_offset(&disk_item); + item_size = btrfs_stack_item_size(&disk_item); + disk_key = &disk_item.key; + type = btrfs_disk_key_type(disk_key); + + if (BTRFS_ROOT_ITEM_KEY == type) { + struct btrfs_root_item root_item; + u32 root_item_offset; + u64 next_bytenr; + + root_item_offset = item_offset + + offsetof(struct btrfs_leaf, items); + if (root_item_offset + item_size > + sf->block_ctx->len) + goto leaf_item_out_of_bounce_error; + btrfsic_read_from_block_data( + sf->block_ctx, &root_item, + root_item_offset, + item_size); + next_bytenr = btrfs_root_bytenr(&root_item); + + sf->error = + btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + disk_key, + btrfs_root_generation( + &root_item)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.datav[0]; + + next_stack = + btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + sf->error = -1; + btrfsic_release_block_ctx( + &sf-> + next_block_ctx); + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = + &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + } else if (BTRFS_EXTENT_DATA_KEY == type && + state->include_extent_data) { + sf->error = btrfsic_handle_extent_data( + state, + sf->block, + sf->block_ctx, + item_offset, + force_iodone_flag); + if (sf->error) + goto one_stack_frame_backwards; + } + + goto continue_with_current_leaf_stack_frame; + } + } else { + struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = btrfs_stack_header_nritems(&nodehdr->header); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "node %llu level %d items %d" + " generation %llu owner %llu\n", + sf->block_ctx->start, + nodehdr->header.level, sf->nr, + btrfs_stack_header_generation( + &nodehdr->header), + btrfs_stack_header_owner( + &nodehdr->header)); + } + +continue_with_current_node_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_key_ptr key_ptr; + u32 key_ptr_offset; + u64 next_bytenr; + + key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - + (uintptr_t)nodehdr; + if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > + sf->block_ctx->len) { + printk(KERN_INFO + "btrfsic: node item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data( + sf->block_ctx, &key_ptr, key_ptr_offset, + sizeof(struct btrfs_key_ptr)); + next_bytenr = btrfs_stack_key_blockptr(&key_ptr); + + sf->error = btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + &key_ptr.key, + btrfs_stack_key_generation(&key_ptr)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.datav[0]; + + next_stack = btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + sf->error = -1; + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + + goto continue_with_current_node_stack_frame; + } + } + +one_stack_frame_backwards: + if (NULL != sf->prev) { + struct btrfsic_stack_frame *const prev = sf->prev; + + /* the one for the initial block is freed in the caller */ + btrfsic_release_block_ctx(sf->block_ctx); + + if (sf->error) { + prev->error = sf->error; + btrfsic_stack_frame_free(sf); + sf = prev; + goto one_stack_frame_backwards; + } + + btrfsic_stack_frame_free(sf); + sf = prev; + goto continue_with_new_stack_frame; + } else { + BUG_ON(&initial_stack_frame != sf); + } + + return sf->error; +} + +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dstv, u32 offset, size_t len) +{ + size_t cur; + size_t offset_in_page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(offset + len > block_ctx->len); + offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); + BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE)); + kaddr = block_ctx->datav[i]; + memcpy(dst, kaddr + offset_in_page, cur); + + dst += cur; + len -= cur; + offset_in_page = 0; + i++; + } +} + +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation) +{ + struct btrfsic_block *next_block = NULL; + int ret; + struct btrfsic_block_link *l; + int did_alloc_block_link; + int block_was_created; + + *next_blockp = NULL; + if (0 == *num_copiesp) { + *num_copiesp = + btrfs_num_copies(state->root->fs_info, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, *num_copiesp); + *mirror_nump = 1; + } + + if (*mirror_nump > *num_copiesp) + return 0; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_create_link_to_next_block(mirror_num=%d)\n", + *mirror_nump); + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, + next_block_ctx, *mirror_nump); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + next_bytenr, *mirror_nump); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + next_block = btrfsic_block_lookup_or_add(state, + next_block_ctx, "referenced ", + 1, force_iodone_flag, + !force_iodone_flag, + *mirror_nump, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + if (block_was_created) { + l = NULL; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block), + next_block->logical_bytenr); + else + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block)); + } + next_block->logical_bytenr = next_bytenr; + + next_block->mirror_num = *mirror_nump; + l = btrfsic_block_link_hashtable_lookup( + next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_link_hashtable); + } + + next_block->disk_key = *disk_key; + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + did_alloc_block_link = 1; + l->block_ref_to = next_block; + l->block_ref_from = block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + did_alloc_block_link = 0; + if (0 == limit_nesting) { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + } + + if (limit_nesting > 0 && did_alloc_block_link) { + ret = btrfsic_read_block(state, next_block_ctx); + if (ret < (int)next_block_ctx->len) { + printk(KERN_INFO + "btrfsic: read block @logical %llu failed!\n", + next_bytenr); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + *next_blockp = next_block; + } else { + *next_blockp = NULL; + } + (*mirror_nump)++; + + return 0; +} + +static int btrfsic_handle_extent_data( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag) +{ + int ret; + struct btrfs_file_extent_item file_extent_item; + u64 file_extent_item_offset; + u64 next_bytenr; + u64 num_bytes; + u64 generation; + struct btrfsic_block_link *l; + + file_extent_item_offset = offsetof(struct btrfs_leaf, items) + + item_offset; + if (file_extent_item_offset + + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + offsetof(struct btrfs_file_extent_item, disk_num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", + file_extent_item.type, + btrfs_stack_file_extent_disk_bytenr( + &file_extent_item)); + return 0; + } + + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + sizeof(struct btrfs_file_extent_item)); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); + if (btrfs_stack_file_extent_compression(&file_extent_item) == + BTRFS_COMPRESS_NONE) { + next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + } else { + num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); + } + generation = btrfs_stack_file_extent_generation(&file_extent_item); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," + " offset = %llu, num_bytes = %llu\n", + file_extent_item.type, + btrfs_stack_file_extent_disk_bytenr(&file_extent_item), + btrfs_stack_file_extent_offset(&file_extent_item), + num_bytes); + while (num_bytes > 0) { + u32 chunk_len; + int num_copies; + int mirror_num; + + if (num_bytes > state->datablock_size) + chunk_len = state->datablock_size; + else + chunk_len = num_bytes; + + num_copies = + btrfs_num_copies(state->root->fs_info, + next_bytenr, state->datablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfsic_block *next_block; + int block_was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "btrfsic_handle_extent_data(" + "mirror_num=%d)\n", mirror_num); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO + "\tdisk_bytenr = %llu, num_bytes %u\n", + next_bytenr, chunk_len); + ret = btrfsic_map_block(state, next_bytenr, + chunk_len, &next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + next_bytenr, mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &next_block_ctx, + "referenced ", + 0, + force_iodone_flag, + !force_iodone_flag, + mirror_num, + &block_was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&next_block_ctx); + return -1; + } + if (!block_was_created) { + if ((state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) && + next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block" + " @%llu (%s/%llu/%d)" + " found in hash table, D," + " bytenr mismatch" + " (!= stored %llu).\n", + next_bytenr, + next_block_ctx.dev->name, + next_block_ctx.dev_bytenr, + mirror_num, + next_block->logical_bytenr); + } + next_block->logical_bytenr = next_bytenr; + next_block->mirror_num = mirror_num; + } + + l = btrfsic_block_link_lookup_or_add(state, + &next_block_ctx, + next_block, block, + generation); + btrfsic_release_block_ctx(&next_block_ctx); + if (NULL == l) + return -1; + } + + next_bytenr += chunk_len; + num_bytes -= chunk_len; + } + + return 0; +} + +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num) +{ + int ret; + u64 length; + struct btrfs_bio *multi = NULL; + struct btrfs_device *device; + + length = len; + ret = btrfs_map_block(state->root->fs_info, READ, + bytenr, &length, &multi, mirror_num); + + if (ret) { + block_ctx_out->start = 0; + block_ctx_out->dev_bytenr = 0; + block_ctx_out->len = 0; + block_ctx_out->dev = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + return ret; + } + + device = multi->stripes[0].dev; + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + kfree(multi); + if (NULL == block_ctx_out->dev) { + ret = -ENXIO; + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); + } + + return ret; +} + +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) +{ + if (block_ctx->mem_to_free) { + unsigned int num_pages; + + BUG_ON(!block_ctx->datav); + BUG_ON(!block_ctx->pagev); + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + while (num_pages > 0) { + num_pages--; + if (block_ctx->datav[num_pages]) { + kunmap(block_ctx->pagev[num_pages]); + block_ctx->datav[num_pages] = NULL; + } + if (block_ctx->pagev[num_pages]) { + __free_page(block_ctx->pagev[num_pages]); + block_ctx->pagev[num_pages] = NULL; + } + } + + kfree(block_ctx->mem_to_free); + block_ctx->mem_to_free = NULL; + block_ctx->pagev = NULL; + block_ctx->datav = NULL; + } +} + +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx) +{ + unsigned int num_pages; + unsigned int i; + u64 dev_bytenr; + int ret; + + BUG_ON(block_ctx->datav); + BUG_ON(block_ctx->pagev); + BUG_ON(block_ctx->mem_to_free); + if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: read_block() with unaligned bytenr %llu\n", + block_ctx->dev_bytenr); + return -1; + } + + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + + sizeof(*block_ctx->pagev)) * + num_pages, GFP_NOFS); + if (!block_ctx->mem_to_free) + return -1; + block_ctx->datav = block_ctx->mem_to_free; + block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); + for (i = 0; i < num_pages; i++) { + block_ctx->pagev[i] = alloc_page(GFP_NOFS); + if (!block_ctx->pagev[i]) + return -1; + } + + dev_bytenr = block_ctx->dev_bytenr; + for (i = 0; i < num_pages;) { + struct bio *bio; + unsigned int j; + + bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); + if (!bio) { + printk(KERN_INFO + "btrfsic: bio_alloc() for %u pages failed!\n", + num_pages - i); + return -1; + } + bio->bi_bdev = block_ctx->dev->bdev; + bio->bi_iter.bi_sector = dev_bytenr >> 9; + + for (j = i; j < num_pages; j++) { + ret = bio_add_page(bio, block_ctx->pagev[j], + PAGE_CACHE_SIZE, 0); + if (PAGE_CACHE_SIZE != ret) + break; + } + if (j == i) { + printk(KERN_INFO + "btrfsic: error, failed to add a single page!\n"); + return -1; + } + if (submit_bio_wait(READ, bio)) { + printk(KERN_INFO + "btrfsic: read error at logical %llu dev %s!\n", + block_ctx->start, block_ctx->dev->name); + bio_put(bio); + return -1; + } + bio_put(bio); + dev_bytenr += (j - i) * PAGE_CACHE_SIZE; + i = j; + } + for (i = 0; i < num_pages; i++) { + block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + if (!block_ctx->datav[i]) { + printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n", + block_ctx->dev->name); + return -1; + } + } + + return block_ctx->len; +} + +static void btrfsic_dump_database(struct btrfsic_state *state) +{ + struct list_head *elem_all; + + BUG_ON(NULL == state); + + printk(KERN_INFO "all_blocks_list:\n"); + list_for_each(elem_all, &state->all_blocks_list) { + const struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *elem_ref_from; + + printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num); + + list_for_each(elem_ref_to, &b_all->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " refers %u* to" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + } + + list_for_each(elem_ref_from, &b_all->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, + struct btrfsic_block_link, + node_ref_from); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " is ref %u* from" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + } + + printk(KERN_INFO "\n"); + } +} + +/* + * Test whether the disk block contains a tree block (leaf or node) + * (note that this test fails for the super block) + */ +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + char **datav, unsigned int num_pages) +{ + struct btrfs_header *h; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + unsigned int i; + + if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) + return 1; /* not metadata */ + num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; + h = (struct btrfs_header *)datav[0]; + + if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) + return 1; + + for (i = 0; i < num_pages; i++) { + u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_CACHE_SIZE : + (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); + + crc = btrfs_crc32c(crc, data, sublen); + } + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, state->csum_size)) + return 1; + + return 0; /* is metadata */ +} + +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw) +{ + int is_metadata; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx block_ctx; + int ret; + struct btrfsic_state *state = dev_state->state; + struct block_device *bdev = dev_state->bdev; + unsigned int processed_len; + + if (NULL != bio_is_patched) + *bio_is_patched = 0; + +again: + if (num_pages == 0) + return; + + processed_len = 0; + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, + num_pages)); + + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, + &state->block_hashtable); + if (NULL != block) { + u64 bytenr = 0; + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + if (block->is_superblock) { + bytenr = btrfs_super_bytenr((struct btrfs_super_block *) + mapped_datav[0]); + if (num_pages * PAGE_CACHE_SIZE < + BTRFS_SUPER_INFO_SIZE) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + is_metadata = 1; + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); + processed_len = BTRFS_SUPER_INFO_SIZE; + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { + printk(KERN_INFO + "[before new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } + if (is_metadata) { + if (!block->is_superblock) { + if (num_pages * PAGE_CACHE_SIZE < + state->metablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->metablock_size; + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, + dev_state, + dev_bytenr); + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (block->logical_bytenr != bytenr && + !(!block->is_metadata && + block->logical_bytenr == 0)) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", + bytenr, dev_state->name, + dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, + block), + block->logical_bytenr); + else + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", + bytenr, dev_state->name, + dev_bytenr, block->mirror_num, + btrfsic_get_block_type(state, + block)); + } + block->logical_bytenr = bytenr; + } else { + if (num_pages * PAGE_CACHE_SIZE < + state->datablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->datablock_size; + bytenr = block->logical_bytenr; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + bytenr, dev_state->name, dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "ref_to_list: %cE, ref_from_list: %cE\n", + list_empty(&block->ref_to_list) ? ' ' : '!', + list_empty(&block->ref_from_list) ? ' ' : '!'); + if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), old(gen=%llu," + " objectid=%llu, type=%d, offset=%llu)," + " new(gen=%llu)," + " which is referenced by most recent superblock" + " (superblockgen=%llu)!\n", + btrfsic_get_block_type(state, block), bytenr, + dev_state->name, dev_bytenr, block->mirror_num, + block->generation, + btrfs_disk_key_objectid(&block->disk_key), + block->disk_key.type, + btrfs_disk_key_offset(&block->disk_key), + btrfs_stack_header_generation( + (struct btrfs_header *) mapped_datav[0]), + state->max_superblock_generation); + btrfsic_dump_tree(state); + } + + if (!block->is_iodone && !block->never_written) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," + " which is not yet iodone!\n", + btrfsic_get_block_type(state, block), bytenr, + dev_state->name, dev_bytenr, block->mirror_num, + block->generation, + btrfs_stack_header_generation( + (struct btrfs_header *) + mapped_datav[0])); + /* it would not be safe to go on */ + btrfsic_dump_tree(state); + goto continue_loop; + } + + /* + * Clear all references of this block. Do not free + * the block itself even if is not referenced anymore + * because it still carries valueable information + * like whether it was ever written and IO completed. + */ + list_for_each_safe(elem_ref_to, tmp_ref_to, + &block->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + l->ref_cnt--; + if (0 == l->ref_cnt) { + list_del(&l->node_ref_to); + list_del(&l->node_ref_from); + btrfsic_block_link_hashtable_remove(l); + btrfsic_block_link_free(l); + } + } + + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; + + if (is_metadata || state->include_extent_data) { + block->never_written = 0; + block->iodone_w_error = 0; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = + bio->bi_private; + block->orig_bio_bh_end_io.bio = + bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io. + bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + } + + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (is_metadata) { + block->logical_bytenr = bytenr; + block->is_metadata = 1; + if (block->is_superblock) { + BUG_ON(PAGE_CACHE_SIZE != + BTRFS_SUPER_INFO_SIZE); + ret = btrfsic_process_written_superblock( + state, + block, + (struct btrfs_super_block *) + mapped_datav[0]); + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { + printk(KERN_INFO + "[after new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } else { + block->mirror_num = 0; /* unknown */ + ret = btrfsic_process_metablock( + state, + block, + &block_ctx, + 0, 0); + } + if (ret) + printk(KERN_INFO + "btrfsic: btrfsic_process_metablock" + "(root @%llu) failed!\n", + dev_bytenr); + } else { + block->is_metadata = 0; + block->mirror_num = 0; /* unknown */ + block->generation = BTRFSIC_GENERATION_UNKNOWN; + if (!state->include_extent_data + && list_empty(&block->ref_from_list)) { + /* + * disk block is overwritten with extent + * data (not meta data) and we are configured + * to not include extent data: take the + * chance and free the block's memory + */ + btrfsic_block_hashtable_remove(block); + list_del(&block->all_blocks_node); + btrfsic_block_free(block); + } + } + btrfsic_release_block_ctx(&block_ctx); + } else { + /* block has not been found in hash table */ + u64 bytenr; + + if (!is_metadata) { + processed_len = state->datablock_size; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "Written block (%s/%llu/?)" + " !found in hash table, D.\n", + dev_state->name, dev_bytenr); + if (!state->include_extent_data) { + /* ignore that written D block */ + goto continue_loop; + } + + /* this is getting ugly for the + * include_extent_data case... */ + bytenr = 0; /* unknown */ + } else { + processed_len = state->metablock_size; + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, + dev_bytenr); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/?)" + " !found in hash table, M.\n", + bytenr, dev_state->name, dev_bytenr); + } + + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&block_ctx); + goto continue_loop; + } + block->dev_state = dev_state; + block->dev_bytenr = dev_bytenr; + block->logical_bytenr = bytenr; + block->is_metadata = is_metadata; + block->never_written = 0; + block->iodone_w_error = 0; + block->mirror_num = 0; /* unknown */ + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io.bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New written %c-block @%llu (%s/%llu/%d)\n", + is_metadata ? 'M' : 'D', + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + + if (is_metadata) { + ret = btrfsic_process_metablock(state, block, + &block_ctx, 0, 0); + if (ret) + printk(KERN_INFO + "btrfsic: process_metablock(root @%llu)" + " failed!\n", + dev_bytenr); + } + btrfsic_release_block_ctx(&block_ctx); + } + +continue_loop: + BUG_ON(!processed_len); + dev_bytenr += processed_len; + mapped_datav += processed_len >> PAGE_CACHE_SHIFT; + num_pages -= processed_len >> PAGE_CACHE_SHIFT; + goto again; +} + +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + int iodone_w_error; + + /* mutex is not held! This is not save if IO is not yet completed + * on umount */ + iodone_w_error = 0; + if (bio_error_status) + iodone_w_error = 1; + + BUG_ON(NULL == block); + bp->bi_private = block->orig_bio_bh_private; + bp->bi_end_io = block->orig_bio_bh_end_io.bio; + + do { + struct btrfsic_block *next_block; + struct btrfsic_dev_state *const dev_state = block->dev_state; + + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + bio_error_status, + btrfsic_get_block_type(dev_state->state, block), + block->logical_bytenr, dev_state->name, + block->dev_bytenr, block->mirror_num); + next_block = block->next_in_same_bio; + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io() new %s flush_gen=%llu\n", + dev_state->name, + dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is + * on disk */ + block->is_iodone = 1; /* for FLUSH, this releases the block */ + block = next_block; + } while (NULL != block); + + bp->bi_end_io(bp, bio_error_status); +} + +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; + int iodone_w_error = !uptodate; + struct btrfsic_dev_state *dev_state; + + BUG_ON(NULL == block); + dev_state = block->dev_state; + if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", + iodone_w_error, + btrfsic_get_block_type(dev_state->state, block), + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); + + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io() new %s flush_gen=%llu\n", + dev_state->name, dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is on disk */ + + bh->b_private = block->orig_bio_bh_private; + bh->b_end_io = block->orig_bio_bh_end_io.bh; + block->is_iodone = 1; /* for FLUSH, this releases the block */ + bh->b_end_io(bh, uptodate); +} + +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const superblock, + struct btrfs_super_block *const super_hdr) +{ + int pass; + + superblock->generation = btrfs_super_generation(super_hdr); + if (!(superblock->generation > state->max_superblock_generation || + 0 == state->max_superblock_generation)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: superblock @%llu (%s/%llu/%d)" + " with old gen %llu <= %llu\n", + superblock->logical_bytenr, + superblock->dev_state->name, + superblock->dev_bytenr, superblock->mirror_num, + btrfs_super_generation(super_hdr), + state->max_superblock_generation); + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: got new superblock @%llu (%s/%llu/%d)" + " with new gen %llu > %llu\n", + superblock->logical_bytenr, + superblock->dev_state->name, + superblock->dev_bytenr, superblock->mirror_num, + btrfs_super_generation(super_hdr), + state->max_superblock_generation); + + state->max_superblock_generation = + btrfs_super_generation(super_hdr); + state->latest_superblock = superblock; + } + + for (pass = 0; pass < 3; pass++) { + int ret; + u64 next_bytenr; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key = {0}; + + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_ITEM_KEY); + btrfs_set_disk_key_objectid(&tmp_disk_key, 0); + + switch (pass) { + case 0: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); + additional_string = "root "; + next_bytenr = btrfs_super_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", next_bytenr); + break; + case 1: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "chunk "; + next_bytenr = btrfs_super_chunk_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", next_bytenr); + break; + case 2: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); + additional_string = "log "; + next_bytenr = btrfs_super_log_root(super_hdr); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(state->root->fs_info, + next_bytenr, BTRFS_SUPER_INFO_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + int was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_process_written_superblock(" + "mirror_num=%d)\n", mirror_num); + ret = btrfsic_map_block(state, next_bytenr, + BTRFS_SUPER_INFO_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + next_bytenr, mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &tmp_next_block_ctx, + additional_string, + 1, 0, 1, + mirror_num, + &was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + return -1; + } + + next_block->disk_key = tmp_disk_key; + if (was_created) + next_block->generation = + BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, + &tmp_next_block_ctx, + next_block, + superblock, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) + return -1; + } + } + + if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) + btrfsic_dump_tree(state); + + return 0; +} + +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level) +{ + struct list_head *elem_ref_to; + int ret = 0; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* + * Note that this situation can happen and does not + * indicate an error in regular cases. It happens + * when disk blocks are freed and later reused. + * The check-integrity module is not aware of any + * block free operations, it just recognizes block + * write operations. Therefore it keeps the linkage + * information for a block until a block is + * rewritten. This can temporarily cause incorrect + * and even circular linkage informations. This + * causes no harm unless such blocks are referenced + * by the most recent super block. + */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 1).\n"); + + return ret; + } + + /* + * This algorithm is recursive because the amount of used stack + * space is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " %u* refers to %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + if (l->block_ref_to->never_written) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is never written!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (!l->block_ref_to->is_iodone) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not yet iodone!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->block_ref_to->iodone_w_error) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which has write error!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->parent_generation != + l->block_ref_to->generation && + BTRFSIC_GENERATION_UNKNOWN != + l->parent_generation && + BTRFSIC_GENERATION_UNKNOWN != + l->block_ref_to->generation) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " with generation %llu !=" + " parent generation %llu!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + l->block_ref_to->generation, + l->parent_generation); + ret = -1; + } else if (l->block_ref_to->flush_gen > + l->block_ref_to->dev_state->last_flush_gen) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not flushed out of disk's write cache" + " (block flush_gen=%llu," + " dev->flush_gen=%llu)!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, block->flush_gen, + l->block_ref_to->dev_state->last_flush_gen); + ret = -1; + } else if (-1 == btrfsic_check_all_ref_blocks(state, + l->block_ref_to, + recursion_level + + 1)) { + ret = -1; + } + } + + return ret; +} + +static int btrfsic_is_block_ref_by_superblock( + const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level) +{ + struct list_head *elem_ref_from; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* refer to comment at "abort cyclic linkage (case 1)" */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 2).\n"); + + return 0; + } + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_from, &block->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, struct btrfsic_block_link, + node_ref_from); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " is ref %u* from %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + if (l->block_ref_from->is_superblock && + state->latest_superblock->dev_bytenr == + l->block_ref_from->dev_bytenr && + state->latest_superblock->dev_state->bdev == + l->block_ref_from->dev_state->bdev) + return 1; + else if (btrfsic_is_block_ref_by_superblock(state, + l->block_ref_from, + recursion_level + + 1)) + return 1; + } + + return 0; +} + +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Add %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Rem %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block) +{ + if (block->is_superblock && + state->latest_superblock->dev_bytenr == block->dev_bytenr && + state->latest_superblock->dev_state->bdev == block->dev_state->bdev) + return 'S'; + else if (block->is_superblock) + return 's'; + else if (block->is_metadata) + return 'M'; + else + return 'D'; +} + +static void btrfsic_dump_tree(const struct btrfsic_state *state) +{ + btrfsic_dump_tree_sub(state, state->latest_superblock, 0); +} + +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level) +{ + struct list_head *elem_ref_to; + int indent_add; + static char buf[80]; + int cursor_position; + + /* + * Should better fill an on-stack buffer with a complete line and + * dump it at once when it is time to print a newline character. + */ + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); + if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + return; + } + printk(buf); + indent_level += indent_add; + if (list_empty(&block->ref_to_list)) { + printk("\n"); + return; + } + if (block->mirror_num > 1 && + !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { + printk(" [...]\n"); + return; + } + + cursor_position = indent_level; + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + while (cursor_position < indent_level) { + printk(" "); + cursor_position++; + } + if (l->ref_cnt > 1) + indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); + else + indent_add = sprintf(buf, " --> "); + if (indent_level + indent_add > + BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + cursor_position = 0; + continue; + } + + printk(buf); + + btrfsic_dump_tree_sub(state, l->block_ref_to, + indent_level + indent_add); + cursor_position = 0; + } +} + +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation) +{ + struct btrfsic_block_link *l; + + l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + from_block->dev_state->bdev, + from_block->dev_bytenr, + &state->block_link_hashtable); + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO + "btrfsic: error, kmalloc" " failed!\n"); + return NULL; + } + + l->block_ref_to = next_block; + l->block_ref_from = from_block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &from_block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + + return l; +} + +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created) +{ + struct btrfsic_block *block; + + block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_hashtable); + if (NULL == block) { + struct btrfsic_dev_state *dev_state; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return NULL; + } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + if (NULL == dev_state) { + printk(KERN_INFO + "btrfsic: error, lookup dev_state failed!\n"); + btrfsic_block_free(block); + return NULL; + } + block->dev_state = dev_state; + block->dev_bytenr = block_ctx->dev_bytenr; + block->logical_bytenr = block_ctx->start; + block->is_metadata = is_metadata; + block->is_iodone = is_iodone; + block->never_written = never_written; + block->mirror_num = mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New %s%c-block @%llu (%s/%llu/%d)\n", + additional_string, + btrfsic_get_block_type(state, block), + block->logical_bytenr, dev_state->name, + block->dev_bytenr, mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + if (NULL != was_created) + *was_created = 1; + } else { + if (NULL != was_created) + *was_created = 0; + } + + return block; +} + +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr) +{ + int num_copies; + int mirror_num; + int ret; + struct btrfsic_block_data_ctx block_ctx; + int match = 0; + + num_copies = btrfs_num_copies(state->root->fs_info, + bytenr, state->metablock_size); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, state->metablock_size, + &block_ctx, mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(logical @%llu," + " mirror %d) failed!\n", + bytenr, mirror_num); + continue; + } + + if (dev_state->bdev == block_ctx.dev->bdev && + dev_bytenr == block_ctx.dev_bytenr) { + match++; + btrfsic_release_block_ctx(&block_ctx); + break; + } + btrfsic_release_block_ctx(&block_ctx); + } + + if (WARN_ON(!match)) { + printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," + " buffer->log_bytenr=%llu, submit_bio(bdev=%s," + " phys_bytenr=%llu)!\n", + bytenr, dev_state->name, dev_bytenr); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, + state->metablock_size, + &block_ctx, mirror_num); + if (ret) + continue; + + printk(KERN_INFO "Read logical bytenr @%llu maps to" + " (%s/%llu/%d)\n", + bytenr, block_ctx.dev->name, + block_ctx.dev_bytenr, mirror_num); + } + } +} + +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev) +{ + struct btrfsic_dev_state *ds; + + ds = btrfsic_dev_state_hashtable_lookup(bdev, + &btrfsic_dev_state_hashtable); + return ds; +} + +int btrfsic_submit_bh(int rw, struct buffer_head *bh) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return submit_bh(rw, bh); + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bh() might also be called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + + /* Only called to write the superblock (incl. FLUSH/FUA) */ + if (NULL != dev_state && + (rw & WRITE) && bh->b_size > 0) { + u64 dev_bytenr; + + dev_bytenr = 4096 * bh->b_blocknr; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu)," + " size=%zu, data=%p, bdev=%p)\n", + rw, (unsigned long long)bh->b_blocknr, + dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); + btrfsic_process_written_block(dev_state, dev_bytenr, + &bh->b_data, 1, NULL, + NULL, bh, rw); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", + rw, bh->b_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bh(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + return submit_bh(rw, bh); +} + +static void __btrfsic_submit_bio(int rw, struct bio *bio) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bio() is also called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + if (NULL != dev_state && + (rw & WRITE) && NULL != bio->bi_io_vec) { + unsigned int i; + u64 dev_bytenr; + u64 cur_bytenr; + int bio_is_patched; + char **mapped_datav; + + dev_bytenr = 512 * bio->bi_iter.bi_sector; + bio_is_patched = 0; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x, bi_vcnt=%u," + " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, + (unsigned long long)bio->bi_iter.bi_sector, + dev_bytenr, bio->bi_bdev); + + mapped_datav = kmalloc_array(bio->bi_vcnt, + sizeof(*mapped_datav), GFP_NOFS); + if (!mapped_datav) + goto leave; + cur_bytenr = dev_bytenr; + for (i = 0; i < bio->bi_vcnt; i++) { + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); + mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); + if (!mapped_datav[i]) { + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + goto leave; + } + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) + printk(KERN_INFO + "#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bio->bi_io_vec[i].bv_len, + bio->bi_io_vec[i].bv_offset); + cur_bytenr += bio->bi_io_vec[i].bv_len; + } + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_datav, bio->bi_vcnt, + bio, &bio_is_patched, + NULL, rw); + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", + rw, bio->bi_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bio(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } + } +leave: + mutex_unlock(&btrfsic_mutex); +} + +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); +} + +int btrfsic_submit_bio_wait(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); + return submit_bio_wait(rw, bio); +} + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask) +{ + int ret; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->nodesize, PAGE_CACHE_SIZE); + return -1; + } + if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->sectorsize, PAGE_CACHE_SIZE); + return -1; + } + state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!state) { + state = vzalloc(sizeof(*state)); + if (!state) { + printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n"); + return -1; + } + } + + if (!btrfsic_is_initialized) { + mutex_init(&btrfsic_mutex); + btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); + btrfsic_is_initialized = 1; + } + mutex_lock(&btrfsic_mutex); + state->root = root; + state->print_mask = print_mask; + state->include_extent_data = including_extent_data; + state->csum_size = 0; + state->metablock_size = root->nodesize; + state->datablock_size = root->sectorsize; + INIT_LIST_HEAD(&state->all_blocks_list); + btrfsic_block_hashtable_init(&state->block_hashtable); + btrfsic_block_link_hashtable_init(&state->block_link_hashtable); + state->max_superblock_generation = 0; + state->latest_superblock = NULL; + + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + char *p; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_alloc(); + if (NULL == ds) { + printk(KERN_INFO + "btrfs check-integrity: kmalloc() failed!\n"); + mutex_unlock(&btrfsic_mutex); + return -1; + } + ds->bdev = device->bdev; + ds->state = state; + bdevname(ds->bdev, ds->name); + ds->name[BDEVNAME_SIZE - 1] = '\0'; + for (p = ds->name; *p != '\0'; p++); + while (p > ds->name && *p != '/') + p--; + if (*p == '/') + p++; + strlcpy(ds->name, p, sizeof(ds->name)); + btrfsic_dev_state_hashtable_add(ds, + &btrfsic_dev_state_hashtable); + } + + ret = btrfsic_process_superblock(state, fs_devices); + if (0 != ret) { + mutex_unlock(&btrfsic_mutex); + btrfsic_unmount(root, fs_devices); + return ret; + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) + btrfsic_dump_database(state); + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) + btrfsic_dump_tree(state); + + mutex_unlock(&btrfsic_mutex); + return 0; +} + +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices) +{ + struct list_head *elem_all; + struct list_head *tmp_all; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + + state = NULL; + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_hashtable_lookup( + device->bdev, + &btrfsic_dev_state_hashtable); + if (NULL != ds) { + state = ds->state; + btrfsic_dev_state_hashtable_remove(ds); + btrfsic_dev_state_free(ds); + } + } + + if (NULL == state) { + printk(KERN_INFO + "btrfsic: error, cannot find state information" + " on umount!\n"); + mutex_unlock(&btrfsic_mutex); + return; + } + + /* + * Don't care about keeping the lists' state up to date, + * just free all memory that was allocated dynamically. + * Free the blocks and the block_links. + */ + list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { + struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + list_for_each_safe(elem_ref_to, tmp_ref_to, + &b_all->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + + l->ref_cnt--; + if (0 == l->ref_cnt) + btrfsic_block_link_free(l); + } + + if (b_all->is_iodone || b_all->never_written) + btrfsic_block_free(b_all); + else + printk(KERN_INFO "btrfs: attempt to free %c-block" + " @%llu (%s/%llu/%d) on umount which is" + " not yet iodone!\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num); + } + + mutex_unlock(&btrfsic_mutex); + + kvfree(state); +} diff --git a/kernel/fs/btrfs/check-integrity.h b/kernel/fs/btrfs/check-integrity.h new file mode 100644 index 000000000..13b8566c9 --- /dev/null +++ b/kernel/fs/btrfs/check-integrity.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_CHECK_INTEGRITY__) +#define __BTRFS_CHECK_INTEGRITY__ + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +int btrfsic_submit_bh(int rw, struct buffer_head *bh); +void btrfsic_submit_bio(int rw, struct bio *bio); +int btrfsic_submit_bio_wait(int rw, struct bio *bio); +#else +#define btrfsic_submit_bh submit_bh +#define btrfsic_submit_bio submit_bio +#define btrfsic_submit_bio_wait submit_bio_wait +#endif + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask); +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices); + +#endif diff --git a/kernel/fs/btrfs/compression.c b/kernel/fs/btrfs/compression.c new file mode 100644 index 000000000..ce62324c7 --- /dev/null +++ b/kernel/fs/btrfs/compression.c @@ -0,0 +1,1091 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* the compression algorithm for this bio */ + int compress_type; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static int btrfs_decompress_biovec(int type, struct page **pages_in, + u64 disk_start, struct bio_vec *bvec, + int vcnt, size_t srclen); + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + + return sizeof(struct compressed_bio) + + (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page); + csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr); + + if (csum != *cb_sum) { + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu extent %llu csum %u wanted %u mirror %d", + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). + * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, + (u64)bio->bi_iter.bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int i; + struct bio_vec *bvec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + bio_for_each_segment_all(bvec, cb->orig_bio, i) + SetPageChecked(bvec->bv_page); + + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline void end_compressed_writeback(struct inode *inode, + const struct compressed_bio *cb) +{ + unsigned long index = cb->start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + if (cb->errors) + mapping_set_error(inode->i_mapping, -EIO); + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + if (cb->errors) + SetPageError(pages[i]); + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, + err ? 0 : 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int pg_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + return -ENOMEM; + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + if (!bio) { + kfree(cb); + return -ENOMEM; + } + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + page = compressed_pages[pg_index]; + page->mapping = inode->i_mapping; + if (bio->bi_iter.bi_size) + ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); + BUG_ON(ret); /* -ENOMEM */ + + if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, + start, 1); + BUG_ON(ret); /* -ENOMEM */ + } + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); /* -ENOMEM */ + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + BUG_ON(!bio); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + btrfs_info(BTRFS_I(inode)->root->fs_info, + "bytes left %lu compress len %lu nr %lu", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA); + BUG_ON(ret); /* -ENOMEM */ + + if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); /* -ENOMEM */ + } + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); /* -ENOMEM */ + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long pg_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + while (last_offset < compressed_end) { + pg_index = last_offset >> PAGE_CACHE_SHIFT; + + if (pg_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, pg_index); + rcu_read_unlock(); + if (page && !radix_tree_exceptional_entry(page)) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = __page_cache_alloc(mapping_gfp_mask(mapping) & + ~__GFP_FS); + if (!page) + break; + + if (add_to_page_cache_lru(page, mapping, pg_index, + GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end); + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_iter.bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long pg_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret = -ENOMEM; + int faili = 0; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + if (!em) + return -EIO; + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + goto out; + + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); + cb->orig_bio = bio; + + nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); + cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_NOFS); + if (!cb->compressed_pages) + goto fail1; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (pg_index = 0; pg_index < nr_pages; pg_index++) { + cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if (!cb->compressed_pages[pg_index]) { + faili = pg_index - 1; + ret = -ENOMEM; + goto fail2; + } + } + faili = nr_pages - 1; + cb->nr_pages = nr_pages; + + /* In the parent-locked case, we only locked the range we are + * interested in. In all other cases, we can opportunistically + * cache decompressed data that goes beyond the requested range. */ + if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) + add_ra_bio_pages(inode, em_start + em_len, cb); + + /* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + if (!comp_bio) + goto fail2; + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (pg_index = 0; pg_index < nr_pages; pg_index++) { + page = cb->compressed_pages[pg_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_iter.bi_size) + ret = tree->ops->merge_bio_hook(READ, page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); + BUG_ON(ret); /* -ENOMEM */ + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, + comp_bio, sums); + BUG_ON(ret); /* -ENOMEM */ + } + sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, + root->sectorsize); + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + if (ret) + bio_endio(comp_bio, ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + BUG_ON(!comp_bio); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); + BUG_ON(ret); /* -ENOMEM */ + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + BUG_ON(ret); /* -ENOMEM */ + } + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + if (ret) + bio_endio(comp_bio, ret); + + bio_put(comp_bio); + return 0; + +fail2: + while (faili >= 0) { + __free_page(cb->compressed_pages[faili]); + faili--; + } + + kfree(cb->compressed_pages); +fail1: + kfree(cb); +out: + free_extent_map(em); + return ret; +} + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +static const struct btrfs_compress_op * const btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +void __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. + */ +static struct list_head *find_workspace(int type) +{ + struct list_head *workspace; + int cpus = num_online_cpus(); + int idx = type - 1; + + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; +again: + spin_lock(workspace_lock); + if (!list_empty(idle_workspace)) { + workspace = idle_workspace->next; + list_del(workspace); + (*num_workspace)--; + spin_unlock(workspace_lock); + return workspace; + + } + if (atomic_read(alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(workspace_lock); + prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + schedule(); + finish_wait(workspace_wait, &wait); + goto again; + } + atomic_inc(alloc_workspace); + spin_unlock(workspace_lock); + + workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (IS_ERR(workspace)) { + atomic_dec(alloc_workspace); + wake_up(workspace_wait); + } + return workspace; +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static void free_workspace(int type, struct list_head *workspace) +{ + int idx = type - 1; + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; + + spin_lock(workspace_lock); + if (*num_workspace < num_online_cpus()) { + list_add(workspace, idle_workspace); + (*num_workspace)++; + spin_unlock(workspace_lock); + goto wake; + } + spin_unlock(workspace_lock); + + btrfs_compress_op[idx]->free_workspace(workspace); + atomic_dec(alloc_workspace); +wake: + smp_mb(); + if (waitqueue_active(workspace_wait)) + wake_up(workspace_wait); +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct list_head *workspace; + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + while (!list_empty(&comp_idle_workspace[i])) { + workspace = comp_idle_workspace[i].next; + list_del(workspace); + btrfs_compress_op[i]->free_workspace(workspace); + atomic_dec(&comp_alloc_workspace[i]); + } + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return PTR_ERR(workspace); + + ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + start, len, pages, + nr_dest_pages, out_pages, + total_in, total_out, + max_out); + free_workspace(type, workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +static int btrfs_decompress_biovec(int type, struct page **pages_in, + u64 disk_start, struct bio_vec *bvec, + int vcnt, size_t srclen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return PTR_ERR(workspace); + + ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, + disk_start, + bvec, vcnt, srclen); + free_workspace(type, workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return PTR_ERR(workspace); + + ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + dest_page, start_byte, + srclen, destlen); + + free_workspace(type, workspace); + return ret; +} + +void btrfs_exit_compress(void) +{ + free_workspaces(); +} + +/* + * Copy uncompressed data from working buffer to pages. + * + * buf_start is the byte offset we're of the start of our workspace buffer. + * + * total_out is the last byte of the buffer + */ +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *pg_index, + unsigned long *pg_offset) +{ + unsigned long buf_offset; + unsigned long current_buf_start; + unsigned long start_byte; + unsigned long working_bytes = total_out - buf_start; + unsigned long bytes; + char *kaddr; + struct page *page_out = bvec[*pg_index].bv_page; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*pg_index)++; + if (*pg_index >= vcnt) + return 0; + + page_out = bvec[*pg_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; +} + +/* + * When uncompressing data, we need to make sure and zero any parts of + * the biovec that were not filled in by the decompression code. pg_index + * and pg_offset indicate the last page and the last offset of that page + * that have been filled in. This will zero everything remaining in the + * biovec. + */ +void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, + unsigned long pg_index, + unsigned long pg_offset) +{ + while (pg_index < vcnt) { + struct page *page = bvec[pg_index].bv_page; + unsigned long off = bvec[pg_index].bv_offset; + unsigned long len = bvec[pg_index].bv_len; + + if (pg_offset < off) + pg_offset = off; + if (pg_offset < off + len) { + unsigned long bytes = off + len - pg_offset; + char *kaddr; + + kaddr = kmap_atomic(page); + memset(kaddr + pg_offset, 0, bytes); + kunmap_atomic(kaddr); + } + pg_index++; + pg_offset = 0; + } +} diff --git a/kernel/fs/btrfs/compression.h b/kernel/fs/btrfs/compression.h new file mode 100644 index 000000000..13a4dc043 --- /dev/null +++ b/kernel/fs/btrfs/compression.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +void btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *pg_index, + unsigned long *pg_offset); + +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, + unsigned long pg_index, + unsigned long pg_offset); +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern const struct btrfs_compress_op btrfs_zlib_compress; +extern const struct btrfs_compress_op btrfs_lzo_compress; + +#endif diff --git a/kernel/fs/btrfs/ctree.c b/kernel/fs/btrfs/ctree.c new file mode 100644 index 000000000..0f11ebc92 --- /dev/null +++ b/kernel/fs/btrfs/ctree.c @@ -0,0 +1,5910 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, + int level, int slot); +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb); + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + return path; +} + +/* + * set all locked nodes in the path to blocking locks. This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (!p->nodes[i] || !p->locks[i]) + continue; + btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_READ_LOCK) + p->locks[i] = BTRFS_READ_LOCK_BLOCKING; + else if (p->locks[i] == BTRFS_WRITE_LOCK) + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; + } +} + +/* + * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held, int held_rw) +{ + int i; + + if (held) { + btrfs_set_lock_blocking_rw(held, held_rw); + if (held_rw == BTRFS_WRITE_LOCK) + held_rw = BTRFS_WRITE_LOCK_BLOCKING; + else if (held_rw == BTRFS_READ_LOCK) + held_rw = BTRFS_READ_LOCK_BLOCKING; + } + btrfs_set_path_blocking(p); + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { + if (p->nodes[i] && p->locks[i]) { + btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) + p->locks[i] = BTRFS_WRITE_LOCK; + else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) + p->locks[i] = BTRFS_READ_LOCK; + } + } + + if (held) + btrfs_clear_lock_blocking_rw(held, held_rw); +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + if (!p) + return; + btrfs_release_path(p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was cow'ed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + if (eb == root->node) + break; + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || + !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) + return; + + spin_lock(&root->fs_info->trans_lock); + if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { + /* Want the extent tree to be the last on the list */ + if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + list_move_tail(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + else + list_move(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } + spin_unlock(&root->fs_info->trans_lock); +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + int ret = 0; + int level; + struct btrfs_disk_key disk_key; + + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, + &disk_key, level, buf->start, 0); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, new_root_objectid); + + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +enum mod_log_op { + MOD_LOG_KEY_REPLACE, + MOD_LOG_KEY_ADD, + MOD_LOG_KEY_REMOVE, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, + MOD_LOG_MOVE_KEYS, + MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { + int dst_slot; + int nr_items; +}; + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 index; /* shifted logical */ + u64 seq; + enum mod_log_op op; + + /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ + int slot; + + /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ + u64 generation; + + /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ + struct btrfs_disk_key key; + u64 blockptr; + + /* this is used for op == MOD_LOG_MOVE_KEYS */ + struct tree_mod_move move; + + /* this is used for op == MOD_LOG_ROOT_REPLACE */ + struct tree_mod_root old_root; +}; + +static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) +{ + read_lock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) +{ + read_unlock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) +{ + write_lock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) +{ + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * Pull a new tree mod seq number for our operation. + */ +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic64_inc_return(&fs_info->tree_mod_seq); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + tree_mod_log_write_lock(fs_info); + spin_lock(&fs_info->tree_mod_seq_lock); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + } + spin_unlock(&fs_info->tree_mod_seq_lock); + tree_mod_log_write_unlock(fs_info); + + return elem->seq; +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct seq_list *cur_elem; + struct tree_mod_elem *tm; + u64 min_seq = (u64)-1; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + spin_lock(&fs_info->tree_mod_seq_lock); + list_del(&elem->list); + elem->seq = 0; + + list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { + if (cur_elem->seq < min_seq) { + if (seq_putting > cur_elem->seq) { + /* + * blocker with lower sequence number exists, we + * cannot remove anything from the log + */ + spin_unlock(&fs_info->tree_mod_seq_lock); + return; + } + min_seq = cur_elem->seq; + } + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + /* + * anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + tree_mod_log_write_lock(fs_info); + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = container_of(node, struct tree_mod_elem, node); + if (tm->seq > min_seq) + continue; + rb_erase(node, tm_root); + kfree(tm); + } + tree_mod_log_write_unlock(fs_info); +} + +/* + * key order of the log: + * index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock). + */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + + BUG_ON(!tm); + + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = container_of(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->index < tm->index) + new = &((*new)->rb_left); + else if (cur->index > tm->index) + new = &((*new)->rb_right); + else if (cur->seq < tm->seq) + new = &((*new)->rb_left); + else if (cur->seq > tm->seq) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); + return 0; +} + +/* + * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it + * returns zero with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * call tree_mod_log_write_unlock() to release. + */ +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) { + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 1; + if (eb && btrfs_header_level(eb) == 0) + return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + return 1; + } + + return 0; +} + +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 0; + if (eb && btrfs_header_level(eb) == 0) + return 0; + + return 1; +} + +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return NULL; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + if (op != MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); + + return tm; +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(fs_info, eb)) + return 0; + + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(fs_info, eb)) { + kfree(tm); + return 0; + } + + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + kfree(tm); + + return ret; +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int dst_slot, int src_slot, + int nr_items, gfp_t flags) +{ + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; + int i; + int locked = 0; + + if (!tree_mod_need_log(fs_info, eb)) + return 0; + + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + locked = 1; + + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. + */ + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) + goto free_tms; + } + + ret = __tree_mod_log_insert(fs_info, tm); + if (ret) + goto free_tms; + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + kfree(tm); + + return ret; +} + +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) +{ + int i, j; + int ret; + + for (i = nritems - 1; i >= 0; i--) { + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } + } + + return 0; +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *old_root, + struct extent_buffer *new_root, gfp_t flags, + int log_removal) +{ + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), + flags); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = new_root->start >> PAGE_CACHE_SHIFT; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = MOD_LOG_ROOT_REPLACE; + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = __tree_mod_log_insert(fs_info, tm); + + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, + int smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + u64 index = start >> PAGE_CACHE_SHIFT; + + tree_mod_log_read_lock(fs_info); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = container_of(node, struct tree_mod_elem, node); + if (cur->index < index) { + node = node->rb_left; + } else if (cur->index > index) { + node = node->rb_right; + } else if (cur->seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* we want the node with the highest seq */ + if (found) + BUG_ON(found->seq > cur->seq); + found = cur; + node = node->rb_left; + } else if (cur->seq > min_seq) { + /* we want the node with the smallest seq */ + if (found) + BUG_ON(found->seq < cur->seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + tree_mod_log_read_unlock(fs_info); + + return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, + u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static noinline int +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + struct extent_buffer *src, unsigned long dst_offset, + unsigned long src_offset, int nr_items) +{ + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; + int i; + int locked = 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return 0; + + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; + for (i = 0; i < nr_items; i++) { + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = 1; + + for (i = 0; i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return ret; +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + int dst_offset, int src_offset, int nr_items) +{ + int ret; + ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, + nr_items, GFP_NOFS); + BUG_ON(ret < 0); +} + +static noinline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, int atomic) +{ + int ret; + + ret = tree_mod_log_insert_key(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); + BUG_ON(ret < 0); +} + +static noinline int +tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +{ + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (btrfs_header_level(eb) == 0) + return 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; +} + +static noinline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, + struct extent_buffer *new_root_node, + int log_removal) +{ + int ret; + ret = tree_mod_log_insert_root(root->fs_info, root->node, + new_root_node, GFP_NOFS, log_removal); + BUG_ON(ret < 0); +} + +/* + * check if the tree block can be shared by multiple trees + */ +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf) +{ + /* + * Tree blocks not in refernece counted trees and tree roots + * are never shared. If a block was allocated after the last + * snapshot and the block was not allocated by tree relocation, + * we know the block is not shared. + */ + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + buf != root->node && buf != root->commit_root && + (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item) || + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 1; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + return 1; +#endif + return 0; +} + +static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *cow, + int *last_ref) +{ + u64 refs; + u64 owner; + u64 flags; + u64 new_flags = 0; + int ret; + + /* + * Backrefs update rules: + * + * Always use full backrefs for extent pointers in tree block + * allocated by tree relocation. + * + * If a shared tree block is no longer referenced by its owner + * tree (btrfs_header_owner(buf) == root->root_key.objectid), + * use full backrefs for extent pointers in tree block. + * + * If a tree block is been relocating + * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), + * use full backrefs for extent pointers in tree block. + * The reason for this is some operations (such as drop tree) + * are only allowed for blocks use full backrefs. + */ + + if (btrfs_block_can_be_shared(root, buf)) { + ret = btrfs_lookup_extent_info(trans, root, buf->start, + btrfs_header_level(buf), 1, + &refs, &flags); + if (ret) + return ret; + if (refs == 0) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + return ret; + } + } else { + refs = 1; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + else + flags = 0; + } + + owner = btrfs_header_owner(buf); + BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + + if (refs > 1) { + if ((owner == root->root_key.objectid || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + ret = btrfs_inc_ref(trans, root, buf, 1); + BUG_ON(ret); /* -ENOMEM */ + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_inc_ref(trans, root, cow, 1); + BUG_ON(ret); /* -ENOMEM */ + } + new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else { + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); /* -ENOMEM */ + } + if (new_flags != 0) { + int level = btrfs_header_level(buf); + + ret = btrfs_set_disk_extent_flags(trans, root, + buf->start, + buf->len, + new_flags, level, 0); + if (ret) + return ret; + } + } else { + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_dec_ref(trans, root, buf, 1); + BUG_ON(ret); /* -ENOMEM */ + } + clean_tree_block(trans, root->fs_info, buf); + *last_ref = 1; + } + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. + */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *cow; + int level, ret; + int last_ref = 0; + int unlock_orig = 0; + u64 parent_start; + + if (*cow_ret == buf) + unlock_orig = 1; + + btrfs_assert_tree_locked(buf); + + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + else + parent_start = 0; + } else + parent_start = 0; + + cow = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, &disk_key, level, + search_start, empty_size); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + /* cow is set to blocking by btrfs_init_new_buffer */ + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, root->root_key.objectid); + + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), + BTRFS_FSID_SIZE); + + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) + return ret; + } + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + parent_start = buf->start; + else + parent_start = 0; + + extent_buffer_get(cow); + tree_mod_log_set_root_pointer(root, cow, 1); + rcu_assign_pointer(root->node, cow); + + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(trans->transid != btrfs_header_generation(parent)); + tree_mod_log_insert_key(root->fs_info, parent, parent_slot, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer_stale(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb_root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = eb_root->start; + int looped = 0; + + if (!time_seq) + return NULL; + + /* + * the very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). this has the index of the *new* + * root, making it the very first operation that's logged for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(fs_info, root_logical, + time_seq); + if (!looped && !tm) + return NULL; + /* + * if there are no tree operation for the oldest root, we simply + * return it. this should only happen if that (old) root is at + * level 0. + */ + if (!tm) + break; + + /* + * if there's an operation that's not a root replacement, we + * found the oldest version of our root. normally, we'll find a + * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. + */ + if (tm->op != MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + looped = 1; + } + + /* if there's no old root to return, return what we found instead */ + if (!found) + found = tm; + + return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq, struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + tree_mod_log_read_lock(fs_info); + while (tm && tm->seq >= time_seq) { + /* + * all the operations are recorded with the operator used for + * the modification. as we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + /* Fallthrough */ + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case MOD_LOG_KEY_ADD: + /* if a move operation is needed it's in the log */ + n--; + break; + case MOD_LOG_MOVE_KEYS: + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + break; + case MOD_LOG_ROOT_REPLACE: + /* + * this operation is special. for roots, this must be + * handled explicitly before rewinding. + * for non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. in the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. we simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = container_of(next, struct tree_mod_elem, node); + if (tm->index != first_tm->index) + break; + } + tree_mod_log_read_unlock(fs_info); + btrfs_set_header_nritems(eb, n); +} + +/* + * Called with eb read locked. If the buffer cannot be rewinded, the same buffer + * is returned. If rewind operations happen, a fresh buffer is returned. The + * returned buffer is always read-locked. If the returned buffer is not the + * input buffer, the lock on the input buffer is released and the input buffer + * is freed (its refcount is decremented). + */ +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct extent_buffer *eb, u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + btrfs_set_path_blocking(path); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } + } + + btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK); + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + + extent_buffer_get(eb_rewin); + btrfs_tree_read_lock(eb_rewin); + __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); + + return eb_rewin; +} + +/* + * get_old_root() rewinds the state of @root's root node to the given @time_seq + * value. If there are no changes, the current root->root_node is returned. If + * anything changed in between, there's a fresh buffer allocated on which the + * rewind operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). + */ +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; + struct extent_buffer *old; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; + u64 logical; + + eb_root = btrfs_read_lock_root_node(root); + tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); + if (!tm) + return eb_root; + + if (tm->op == MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + } else { + logical = eb_root->start; + } + + tm = tree_mod_log_search(root->fs_info, logical, time_seq); + if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + old = read_tree_block(root, logical, 0); + if (WARN_ON(!old || !extent_buffer_uptodate(old))) { + free_extent_buffer(old); + btrfs_warn(root->fs_info, + "failed to read tree block %llu from get_old_root", logical); + } else { + eb = btrfs_clone_extent_buffer(old); + free_extent_buffer(old); + } + } else if (old_root) { + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + eb = alloc_dummy_extent_buffer(root->fs_info, logical); + } else { + btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); + eb = btrfs_clone_extent_buffer(eb_root); + btrfs_tree_read_unlock_blocking(eb_root); + free_extent_buffer(eb_root); + } + + if (!eb) + return NULL; + extent_buffer_get(eb); + btrfs_tree_read_lock(eb); + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } + if (tm) + __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm); + else + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); + + return eb; +} + +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + struct extent_buffer *eb_root = btrfs_root_node(root); + + tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); + if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { + level = tm->old_root.level; + } else { + level = btrfs_header_level(eb_root); + } + free_extent_buffer(eb_root); + + return level; +} + +static inline int should_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + if (btrfs_test_is_dummy_root(root)) + return 0; + + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ + if (btrfs_header_generation(buf) == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && + !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) + return 0; + return 1; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) + WARN(1, KERN_CRIT "trans %llu running %llu\n", + trans->transid, + root->fs_info->running_transaction->transid); + + if (trans->transid != root->fs_info->generation) + WARN(1, KERN_CRIT "trans %llu running %llu\n", + trans->transid, root->fs_info->generation); + + if (!should_cow_block(trans, root, buf)) { + *cow_ret = buf; + return 0; + } + + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + + if (parent) + btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking(buf); + + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0); + + trace_btrfs_cow_block(root, buf, *cow_ret); + + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} + +/* + * same as comp_keys only with two btrfs_key's + */ +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + + WARN_ON(trans->transaction != root->fs_info->running_transaction); + WARN_ON(trans->transid != root->fs_info->generation); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = root->nodesize; + end_slot = parent_nritems - 1; + + if (parent_nritems <= 1) + return 0; + + btrfs_set_lock_blocking(parent); + + for (i = start_slot; i <= end_slot; i++) { + int close = 1; + + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + + cur = btrfs_find_tree_block(root->fs_info, blocknr); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen, 0); + else + uptodate = 0; + if (!cur || !uptodate) { + if (!cur) { + cur = read_tree_block(root, blocknr, gen); + if (!cur || !extent_buffer_uptodate(cur)) { + free_extent_buffer(cur); + return -EIO; + } + } else if (!uptodate) { + err = btrfs_read_buffer(cur, gen); + if (err) { + free_extent_buffer(cur); + return err; + } + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + btrfs_set_lock_blocking(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize)); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. + * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!kaddr || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &kaddr, &map_start, &map_len); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + return 0; + } + } + *slot = low; + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + else + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); +} + +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + return bin_search(eb, key, level, slot); +} + +static void root_add_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) + size); + spin_unlock(&root->accounting_lock); +} + +static void root_sub_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) - size); + spin_unlock(&root->accounting_lock); +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + struct extent_buffer *eb; + + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_node_ptr_generation(parent, slot)); + if (eb && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = NULL; + } + + return eb; +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. + */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && + path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) { + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + } + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + if (!child) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } + + btrfs_tree_lock(child); + btrfs_set_lock_blocking(child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + goto enospc; + } + + tree_mod_log_set_root_pointer(root, child, 1); + rcu_assign_pointer(root->node, child); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root->fs_info, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + /* once for the root ptr */ + free_extent_buffer_stale(mid); + return 0; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + clean_tree_block(trans, root->fs_info, right); + btrfs_tree_unlock(right); + del_ptr(root, path, level + 1, pslot + 1); + root_sub_used(root, right->len); + btrfs_free_tree_block(trans, root, right, 0, 1); + free_extent_buffer_stale(right); + right = NULL; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + pslot + 1, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. + * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + if (!left) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + clean_tree_block(trans, root->fs_info, mid); + btrfs_tree_unlock(mid); + del_ptr(root, path, level + 1, pslot); + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + free_extent_buffer_stale(mid); + mid = NULL; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + pslot, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. + */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + if (level < BTRFS_MAX_LEVEL - 1) { + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + } + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + pslot, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + pslot + 1, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. + */ +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 target; + u64 nread = 0; + u64 gen; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = root->nodesize; + eb = btrfs_find_tree_block(root->fs_info, search); + if (eb) { + free_extent_buffer(eb); + return; + } + + target = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + + while (1) { + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search <= target && target - search <= 65536) || + (search > target && search - target <= 65536)) { + gen = btrfs_node_ptr_generation(node, nr); + readahead_tree_block(root, search); + nread += blocksize; + } + nscan++; + if ((nread > 65536 || nscan > 32)) + break; + } +} + +static noinline void reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + int slot; + int nritems; + struct extent_buffer *parent; + struct extent_buffer *eb; + u64 gen; + u64 block1 = 0; + u64 block2 = 0; + + parent = path->nodes[level + 1]; + if (!parent) + return; + + nritems = btrfs_header_nritems(parent); + slot = path->slots[level + 1]; + + if (slot > 0) { + block1 = btrfs_node_blockptr(parent, slot - 1); + gen = btrfs_node_ptr_generation(parent, slot - 1); + eb = btrfs_find_tree_block(root->fs_info, block1); + /* + * if we get -eagain from btrfs_buffer_uptodate, we + * don't want to return eagain here. That will loop + * forever + */ + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) + block1 = 0; + free_extent_buffer(eb); + } + if (slot + 1 < nritems) { + block2 = btrfs_node_blockptr(parent, slot + 1); + gen = btrfs_node_ptr_generation(parent, slot + 1); + eb = btrfs_find_tree_block(root->fs_info, block2); + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) + block2 = 0; + free_extent_buffer(eb); + } + + if (block1) + readahead_tree_block(root, block1); + if (block2) + readahead_tree_block(root, block2); +} + + +/* + * when we walk down the tree, it is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock, int min_write_lock_level, + int *write_lock_level) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock_rw(t, path->locks[i]); + path->locks[i] = 0; + if (write_lock_level && + i > min_write_lock_level && + i <= *write_lock_level) { + *write_lock_level = i - 1; + } + } + } +} + +/* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node. This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + } +} + +/* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key, u64 time_seq) +{ + u64 blocknr; + u64 gen; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + int ret; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + + tmp = btrfs_find_tree_block(root->fs_info, blocknr); + if (tmp) { + /* first we do an atomic uptodate check */ + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + *eb_ret = tmp; + return 0; + } + + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. + * We must do this without dropping locks so + * we can trust our generation number + */ + btrfs_set_path_blocking(p); + + /* now we're allowed to do a blocking uptodate check */ + ret = btrfs_read_buffer(tmp, gen); + if (!ret) { + *eb_ret = tmp; + return 0; + } + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. Don't release the lock on the current + * level because we need to walk this node to figure + * out which blocks to read. + */ + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + btrfs_release_path(p); + + ret = -EAGAIN; + tmp = read_tree_block(root, blocknr, 0); + if (tmp) { + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!btrfs_buffer_uptodate(tmp, 0, 0)) + ret = -EIO; + free_extent_buffer(tmp); + } + return ret; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len, + int *write_lock_level) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL, 0); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { + int sret; + + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL, 0); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + +static void key_search_validate(struct extent_buffer *b, + struct btrfs_key *key, + int level) +{ +#ifdef CONFIG_BTRFS_ASSERT + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, key); + + if (level == 0) + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_leaf, items[0].key), + sizeof(disk_key))); + else + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_node, ptrs[0].key), + sizeof(disk_key))); +#endif +} + +static int key_search(struct extent_buffer *b, struct btrfs_key *key, + int level, int *prev_cmp, int *slot) +{ + if (*prev_cmp != 0) { + *prev_cmp = bin_search(b, key, level, slot); + return *prev_cmp; + } + + key_search_validate(b, key, level); + *slot = 0; + + return 0; +} + +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + ASSERT(path); + ASSERT(found_key); + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + + return 0; +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + int root_lock; + /* everything at write_lock_level or lower must be write locked */ + int write_lock_level = 0; + u8 lowest_level = 0; + int min_write_lock_level; + int prev_cmp; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); + + if (ins_len < 0) { + lowest_unlock = 2; + + /* when we are removing items, we might have to go up to level + * two as we update tree pointers Make sure we keep write + * for those levels as well + */ + write_lock_level = 2; + } else if (ins_len > 0) { + /* + * for inserting items, make sure we have a write lock on + * level 1 so we can update keys + */ + write_lock_level = 1; + } + + if (!cow) + write_lock_level = -1; + + if (cow && (p->keep_locks || p->lowest_level)) + write_lock_level = BTRFS_MAX_LEVEL; + + min_write_lock_level = write_lock_level; + +again: + prev_cmp = -1; + /* + * we try very hard to do read locks on the root + */ + root_lock = BTRFS_READ_LOCK; + level = 0; + if (p->search_commit_root) { + /* + * the commit roots are read only + * so we always do read locks + */ + if (p->need_commit_sem) + down_read(&root->fs_info->commit_root_sem); + b = root->commit_root; + extent_buffer_get(b); + level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&root->fs_info->commit_root_sem); + if (!p->skip_locking) + btrfs_tree_read_lock(b); + } else { + if (p->skip_locking) { + b = btrfs_root_node(root); + level = btrfs_header_level(b); + } else { + /* we don't know the level of the root node + * until we actually have it read locked + */ + b = btrfs_read_lock_root_node(root); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + /* whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* the level might have changed, check again */ + level = btrfs_header_level(b); + } + } + } + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + if (cow) { + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ + if (!should_cow_block(trans, root, b)) + goto cow_done; + + /* + * must have write locks on this node and the + * parent + */ + if (level > write_lock_level || + (level + 1 > write_lock_level && + level + 1 < BTRFS_MAX_LEVEL && + p->nodes[level + 1])) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + btrfs_set_path_blocking(p); + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); + if (err) { + ret = err; + goto done; + } + } +cow_done: + p->nodes[level] = b; + btrfs_clear_path_blocking(p, NULL, 0); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. + */ + if (!ins_len && !p->keep_locks) { + int u = level + 1; + + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; + } + } + + ret = key_search(b, key, level, &prev_cmp, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + err = setup_nodes_for_search(trans, root, p, b, level, + ins_len, &write_lock_level); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + + /* + * slot 0 is special, if we change the key + * we have to update the parent pointer + * which means we must have a write lock + * on the parent + */ + if (slot == 0 && ins_len && + write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(trans, root, p, + &b, level, slot, key, 0); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + if (!p->skip_locking) { + level = btrfs_header_level(b); + if (level <= write_lock_level) { + err = btrfs_try_tree_write_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_WRITE_LOCK); + } + p->locks[level] = BTRFS_WRITE_LOCK; + } else { + err = btrfs_tree_read_lock_atomic(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; + } + p->nodes[level] = b; + } + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + if (write_lock_level < 1) { + write_lock_level = 1; + btrfs_release_path(p); + goto again; + } + + btrfs_set_path_blocking(p); + err = split_leaf(trans, root, key, + p, ins_len, ret == 0); + btrfs_clear_path_blocking(p, NULL, 0); + + BUG_ON(err > 0); + if (err) { + ret = err; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); + goto done; + } + } + ret = 1; +done: + /* + * we don't really know what they plan on doing with the path + * from here on, so for now just mark it as blocking + */ + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0 && !p->skip_release_on_error) + btrfs_release_path(p); + return ret; +} + +/* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + int prev_cmp = -1; + + lowest_level = p->lowest_level; + WARN_ON(p->nodes[0] != NULL); + + if (p->search_commit_root) { + BUG_ON(time_seq); + return btrfs_search_slot(NULL, root, key, p, 0, 0); + } + +again: + b = get_old_root(root, time_seq); + level = btrfs_header_level(b); + p->locks[level] = BTRFS_READ_LOCK; + + while (b) { + level = btrfs_header_level(b); + p->nodes[level] = b; + btrfs_clear_path_blocking(p, NULL, 0); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + */ + btrfs_unlock_up_safe(p, level + 1); + + /* + * Since we can unwind eb's we want to do a real search every + * time. + */ + prev_cmp = -1; + ret = key_search(b, key, level, &prev_cmp, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(NULL, root, p, &b, level, + slot, key, time_seq); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + err = btrfs_tree_read_lock_atomic(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + b = tree_mod_log_rewind(root->fs_info, p, b, time_seq); + if (!b) { + ret = -ENOMEM; + goto done; + } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; + } else { + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } + } + ret = 1; +done: + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(p); + + return ret; +} + +/* + * helper to use instead of search slot if no exact match is needed but + * instead the next or previous item should be returned. + * When find_higher is true, the next higher item is returned, the next lower + * otherwise. + * When return_any and find_higher are both true, and no higher item is found, + * return the next lower instead. + * When return_any is true and find_higher is false, and no lower item is found, + * return the next higher instead. + * It returns 0 if any item is found, 1 if none is found (tree empty), and + * < 0 on error + */ +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any) +{ + int ret; + struct extent_buffer *leaf; + +again: + ret = btrfs_search_slot(NULL, root, key, p, 0, 0); + if (ret <= 0) + return ret; + /* + * a return value of 1 means the path is at the position where the + * item should be inserted. Normally this is the next bigger item, + * but in case the previous item is the last in a leaf, path points + * to the first free slot in the previous leaf, i.e. at an invalid + * item. + */ + leaf = p->nodes[0]; + + if (find_higher) { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no higher item found, return the next + * lower instead + */ + return_any = 0; + find_higher = 0; + btrfs_release_path(p); + goto again; + } + } else { + if (p->slots[0] == 0) { + ret = btrfs_prev_leaf(root, p); + if (ret < 0) + return ret; + if (!ret) { + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; + return 0; + } + if (!return_any) + return 1; + /* + * no lower item found, return the next + * higher instead + */ + return_any = 0; + find_higher = 1; + btrfs_release_path(p); + goto again; + } else { + --p->slots[0]; + } + } + return 0; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + */ +static void fixup_low_keys(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + tree_mod_log_set_node_key(fs_info, t, tslot, 1); + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } +} + +/* + * update item key. + * + * This function isn't completely safe. It's the caller's responsibility + * that the new key won't break the order + */ +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + BUG_ON(comp_keys(&disk_key, new_key) >= 0); + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + BUG_ON(comp_keys(&disk_key, new_key) <= 0); + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(eb); + if (slot == 0) + fixup_low_keys(fs_info, path, &disk_key, 1); +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. + */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + /* + * don't call tree_mod_log_eb_move here, key removal was already + * fully logged by tree_mod_log_eb_copy above. + */ + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. + * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. + */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &lower_key, level, root->node->start, 0); + if (IS_ERR(c)) + return PTR_ERR(c); + + root_add_used(root, root->nodesize); + + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_bytenr(c, c->start); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(c, root->root_key.objectid); + + write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(), + BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE); + + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(c); + + old = root->node; + tree_mod_log_set_root_pointer(root, c, 0); + rcu_assign_pointer(root->node, c); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + extent_buffer_get(c); + path->nodes[level] = c; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. + */ +static void insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + int ret; + + BUG_ON(!path->nodes[level]); + btrfs_assert_tree_locked(path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + BUG_ON(slot > nritems); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); + if (slot != nritems) { + if (level) + tree_mod_log_eb_move(root->fs_info, lower, slot + 1, + slot, nritems - slot); + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + if (level) { + ret = tree_mod_log_insert_key(root->fs_info, lower, slot, + MOD_LOG_KEY_ADD, GFP_NOFS); + BUG_ON(ret < 0); + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. + * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* + * trying to split the root, lets make a new one + * + * tree mod log: We don't log_removal old root in + * insert_new_root, because that root buffer will be kept as a + * normal node. We are going to log removal of half of the + * elements below with tree_mod_log_eb_copy. We're holding a + * tree lock on the buffer, which is why we cannot race with + * other tree_mod_log users. + */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); + + split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, level, c->start, 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + root_add_used(root, root->nodesize); + + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_bytenr(split, split->start); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(split, root->root_key.objectid); + write_extent_buffer(split, root->fs_info->fsid, + btrfs_header_fsid(), BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); + + ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, + mid, c_nritems - mid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + ret = 0; + + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return ret; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(struct extent_buffer *l, int start, int nr) +{ + struct btrfs_item *start_item; + struct btrfs_item *end_item; + struct btrfs_map_token token; + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + btrfs_init_map_token(&token); + start_item = btrfs_item_nr(start); + end_item = btrfs_item_nr(end); + data_len = btrfs_token_item_offset(l, start_item, &token) + + btrfs_token_item_size(l, start_item, &token); + data_len = data_len - btrfs_token_item_offset(l, end_item, &token); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. IOW, how much room + * the leaf has left for both items and data + */ +noinline int btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + btrfs_crit(root->fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", + ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +/* + * min slot controls the lowest index we're willing to push to the + * right. We'll push up to and including min_slot, but no lower + */ +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems, + u32 min_slot) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *upper = path->nodes[1]; + struct btrfs_map_token token; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + + btrfs_init_map_token(&token); + + if (empty) + nr = 0; + else + nr = max_t(u32, 1, min_slot); + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + slot = path->slots[1]; + i = left_nritems - 1; + while (i >= nr) { + item = btrfs_item_nr(i); + + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, left); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(left, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + if (i == 0) + break; + i--; + } + + if (push_items == 0) + goto out_unlock; + + WARN_ON(!empty && push_items == left_nritems); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space -= leaf_data_end(root, left); + + /* make room in the right data area */ + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + + /* copy from the left data area */ + copy_extent_buffer(right, left, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(left) + leaf_data_end(root, left), + push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + + /* copy the items from left to right */ + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); + + /* update the item pointers */ + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(i); + push_space -= btrfs_token_item_size(right, item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); + } + + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(left); + else + clean_tree_block(trans, root->fs_info, left); + + btrfs_mark_buffer_dirty(right); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root->fs_info, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaft. */ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + + return __push_leaf_right(trans, root, path, min_data_size, empty, + right, free_space, left_nritems, min_slot); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the + * items + */ +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, u32 right_nritems, + u32 max_slot) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + int i; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 old_left_nritems; + u32 nr; + int ret = 0; + u32 this_item_size; + u32 old_left_item_size; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + if (empty) + nr = min(right_nritems, max_slot); + else + nr = min(right_nritems - 1, max_slot); + + for (i = 0; i < nr; i++) { + item = btrfs_item_nr(i); + + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, right); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + } + + if (push_items == 0) { + ret = 1; + goto out; + } + WARN_ON(!empty && push_items == btrfs_header_nritems(right)); + + /* push data from right to left */ + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(right, push_items - 1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + + leaf_data_end(root, left) - push_space, + btrfs_leaf_data(right) + + btrfs_item_offset_nr(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + item = btrfs_item_nr(i); + + ioff = btrfs_token_item_offset(left, item, &token); + btrfs_set_token_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), + &token); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + + /* fixup right node */ + if (push_items > right_nritems) + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + + if (push_items < right_nritems) { + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + } + right_nritems -= push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(i); + + push_space = push_space - btrfs_token_item_size(right, + item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); + } + + btrfs_mark_buffer_dirty(left); + if (right_nritems) + btrfs_mark_buffer_dirty(right); + else + clean_tree_block(trans, root->fs_info, right); + + btrfs_item_key(right, &disk_key, 0); + fixup_low_keys(root->fs_info, path, &disk_key, 1); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) +{ + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int free_space; + u32 right_nritems; + int ret = 0; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + if (ret == -ENOSPC) + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + */ +static noinline void copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) +{ + int data_copy_size; + int rt_data_off; + int i; + struct btrfs_disk_key disk_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(i); + u32 ioff; + + ioff = btrfs_token_item_offset(right, item, &token); + btrfs_set_token_item_offset(right, item, + ioff + rt_data_off, &token); + } + + btrfs_set_header_nritems(l, mid); + btrfs_item_key(right, &disk_key, 0); + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); +} + +/* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + int space_needed = data_size; + + slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(root, path->nodes[0]); + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + int wret; + int split; + int num_doubles = 0; + int tried_avoid_double = 0; + + l = path->nodes[0]; + slot = path->slots[0]; + if (extend && data_size + btrfs_item_size_nr(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) + return -EOVERFLOW; + + /* first try to make some room by pushing left and right */ + if (data_size && path->nodes[1]) { + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(root, l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + split = 1; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + split = 0; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2; + } + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + split = 0; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2; + } + } + } + } + + if (split == 0) + btrfs_cpu_key_to_disk(&disk_key, ins_key); + else + btrfs_item_key(l, &disk_key, mid); + + right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, 0, l->start, 0); + if (IS_ERR(right)) + return PTR_ERR(right); + + root_add_used(root, root->nodesize); + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, fs_info->fsid, + btrfs_header_fsid(), BTRFS_FSID_SIZE); + + write_extent_buffer(right, fs_info->chunk_tree_uuid, + btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + + if (split == 0) { + if (mid <= slot) { + btrfs_set_header_nritems(right, 0); + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + } else { + btrfs_set_header_nritems(right, 0); + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1], 1); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) + fixup_low_keys(fs_info, path, &disk_key, 1); + } + btrfs_mark_buffer_dirty(right); + return ret; + } + + copy_for_split(trans, root, path, l, right, slot, mid, nritems); + + if (split == 2) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + + return 0; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; +} + +static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int ins_len) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_len = 0; + u32 item_size; + int ret; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_EXTENT_CSUM_KEY); + + if (btrfs_leaf_free_space(root, leaf) >= ins_len) + return 0; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + } + btrfs_release_path(path); + + path->keep_locks = 1; + path->search_for_split = 1; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + path->search_for_split = 0; + if (ret > 0) + ret = -EAGAIN; + if (ret < 0) + goto err; + + ret = -EAGAIN; + leaf = path->nodes[0]; + /* if our item isn't there, return now */ + if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) + goto err; + + /* the leaf has changed, it now has room. return now */ + if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) + goto err; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) + goto err; + } + + btrfs_set_path_blocking(path); + ret = split_leaf(trans, root, &key, path, ins_len, 1); + if (ret) + goto err; + + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + return 0; +err: + path->keep_locks = 0; + return ret; +} + +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_item *new_item; + int slot; + char *buf; + u32 nritems; + u32 item_size; + u32 orig_offset; + struct btrfs_disk_key disk_key; + + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + + btrfs_set_path_blocking(path); + + item = btrfs_item_nr(path->slots[0]); + orig_offset = btrfs_item_offset(leaf, item); + item_size = btrfs_item_size(leaf, item); + + buf = kmalloc(item_size, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + + slot = path->slots[0] + 1; + nritems = btrfs_header_nritems(leaf); + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + new_item = btrfs_item_nr(slot); + + btrfs_set_item_offset(leaf, new_item, orig_offset); + btrfs_set_item_size(leaf, new_item, item_size - split_offset); + + btrfs_set_item_offset(leaf, item, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, item, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + + BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); + kfree(buf); + return 0; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + int ret; + ret = setup_leaf_for_split(trans, root, path, + sizeof(struct btrfs_item)); + if (ret) + return ret; + + ret = split_item(trans, root, path, new_key, split_offset); + return ret; +} + +/* + * This function duplicate a item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item + * is contiguous with the original item. + * + * This allows us to split file extent in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + setup_items_for_insert(root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ +void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, + u32 new_size, int from_end) +{ + int slot; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size_nr(leaf, slot); + if (old_size == new_size) + return; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + old_data_start = btrfs_item_offset_nr(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(i); + + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + size_diff, &token); + } + + /* shift the data */ + if (from_end) { + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + BTRFS_FILE_EXTENT_INLINE_DATA_START); + } + } + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(root->fs_info, path, &disk_key, 1); + } + + item = btrfs_item_nr(slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +} + +/* + * make the item pointed to by the path bigger, data_size is the added size. + */ +void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) +{ + int slot; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_end_nr(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(root, leaf); + btrfs_crit(root->fs_info, "slot %d too large, nritems %d", + slot, nritems); + BUG_ON(1); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(i); + + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - data_size, &token); + } + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - data_size, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +} + +/* + * this is a helper for btrfs_insert_empty_items, the main goal here is + * to save stack depth by doing the bulk of the work in a function + * that doesn't call btrfs_search_slot + */ +void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) +{ + struct btrfs_item *item; + int i; + u32 nritems; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + int slot; + struct btrfs_map_token token; + + if (path->slots[0] == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + fixup_low_keys(root->fs_info, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + + btrfs_init_map_token(&token); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + btrfs_print_leaf(root, leaf); + btrfs_crit(root->fs_info, "not enough freespace need %u have %d", + total_size, btrfs_leaf_free_space(root, leaf)); + BUG(); + } + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr( i); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); + } + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(slot + i); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); + data_end -= data_size[i]; + btrfs_set_token_item_size(leaf, item, data_size[i], &token); + } + + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + int ret = 0; + int slot; + int i; + u32 total_size = 0; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + return ret; + + slot = path->slots[0]; + BUG_ON(slot < 0); + + setup_items_for_insert(root, path, cpu_key, data_size, + total_data, total_size, nr); + return 0; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. + */ +static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, + int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + if (level) + tree_mod_log_eb_move(root->fs_info, parent, slot, + slot + 1, nritems - slot - 1); + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } else if (level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + BUG_ON(ret < 0); + } + + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + fixup_low_keys(root->fs_info, path, &disk_key, level + 1); + } + btrfs_mark_buffer_dirty(parent); +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) +{ + WARN_ON(btrfs_header_generation(leaf) != trans->transid); + del_ptr(root, path, 1, path->slots[1]); + + /* + * btrfs_free_extent is expensive, we want to make sure we + * aren't holding any locks when we call it + */ + btrfs_unlock_up_safe(path, 0); + + root_sub_used(root, leaf->len); + + extent_buffer_get(leaf); + btrfs_free_tree_block(trans, root, leaf, 0, 1); + free_extent_buffer_stale(leaf); +} +/* + * delete the item at the leaf level in path. If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int last_off; + int dsize = 0; + int ret = 0; + int wret; + int i; + u32 nritems; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + leaf = path->nodes[0]; + last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size_nr(leaf, slot + i); + + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + int data_end = leaf_data_end(root, leaf); + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + dsize, + btrfs_leaf_data(leaf) + data_end, + last_off - data_end); + + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(i); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + dsize, &token); + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + nr), + sizeof(struct btrfs_item) * + (nritems - slot - nr)); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + btrfs_set_path_blocking(path); + clean_tree_block(trans, root->fs_info, leaf); + btrfs_del_leaf(trans, root, path, leaf); + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + fixup_low_keys(root->fs_info, path, &disk_key, 1); + } + + /* delete the leaf if it is mostly empty */ + if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + extent_buffer_get(leaf); + + btrfs_set_path_blocking(path); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + btrfs_del_leaf(trans, root, path, leaf); + free_extent_buffer(leaf); + ret = 0; + } else { + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(leaf); + } + } + return ret; +} + +/* + * search the tree again to find a leaf with lesser keys + * returns 0 if it found something or 1 if there are no lesser leaves. + * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + + if (key.offset > 0) { + key.offset--; + } else if (key.type > 0) { + key.type--; + key.offset = (u64)-1; + } else if (key.objectid > 0) { + key.objectid--; + key.type = (u8)-1; + key.offset = (u64)-1; + } else { + return 1; + } + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) + return 0; + return 1; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are have a minimum transaction id. + * This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. + */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_path *path, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + int keep_locks = path->keep_locks; + + path->keep_locks = 1; +again: + cur = btrfs_read_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = BTRFS_READ_LOCK; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = bin_search(cur, min_key, level, &slot); + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the min_trans parameters. + * If it is too old, old, skip to the next one. + */ + while (slot < nritems) { + u64 gen; + + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + break; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + btrfs_set_path_blocking(path); + sret = btrfs_find_next_key(root, path, min_key, level, + min_trans); + if (sret == 0) { + btrfs_release_path(path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + goto out; + } + btrfs_set_path_blocking(path); + cur = read_node_slot(root, cur, slot); + BUG_ON(!cur); /* -ENOMEM */ + + btrfs_tree_read_lock(cur); + + path->locks[level - 1] = BTRFS_READ_LOCK; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1, 0, NULL); + btrfs_clear_path_blocking(path, NULL, 0); + } +out: + path->keep_locks = keep_locks; + if (ret == 0) { + btrfs_unlock_up_safe(path, path->lowest_level + 1); + btrfs_set_path_blocking(path); + memcpy(min_key, &found_key, sizeof(found_key)); + } + return ret; +} + +static void tree_move_down(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level) +{ + BUG_ON(*level == 0); + path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], + path->slots[*level]); + path->slots[*level - 1] = 0; + (*level)--; +} + +static int tree_move_next_or_upnext(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] >= nritems) { + if (*level == root_level) + return -1; + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. + */ +static int tree_advance(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(root, path, level, root_level); + } else { + tree_move_down(root, path, level, root_level); + ret = 0; + } + if (ret >= 0) { + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + } + return ret; +} + +static int tree_compare_item(struct btrfs_root *left_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. + */ +int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t changed_cb, void *ctx) +{ + int ret; + int cmp; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached; + int right_end_reached; + int advance_left; + int advance_right; + u64 left_blockptr; + u64 right_blockptr; + u64 left_gen; + u64 right_gen; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + /* + * Strategy: Go to the first items of both trees. Then do + * + * If both trees are at level 0 + * Compare keys of current items + * If left < right treat left item as new, advance left tree + * and repeat + * If left > right treat right item as deleted, advance right tree + * and repeat + * If left == right do deep compare of items, treat as changed if + * needed, advance both trees and repeat + * If both trees are at the same level but not at level 0 + * Compare keys of current nodes/leafs + * If left < right advance left tree and repeat + * If left > right advance right tree and repeat + * If left == right compare blockptrs of the next nodes/leafs + * If they match advance both trees but stay at the same level + * and repeat + * If they don't match advance both trees while allowing to go + * deeper and repeat + * If tree levels are different + * Advance the tree that needs it and repeat + * + * Advancing a tree means: + * If we are at level 0, try to go to the next slot. If that's not + * possible, go one level up and repeat. Stop when we found a level + * where we could go to the next slot. We may at this point be on a + * node or a leaf. + * + * If we are not at level 0 and not on shared tree blocks, go one + * level deeper. + * + * If we are not at level 0 and on shared tree blocks, go one slot to + * the right if possible or go up and right. + */ + + down_read(&left_root->fs_info->commit_root_sem); + left_level = btrfs_header_level(left_root->commit_root); + left_root_level = left_level; + left_path->nodes[left_level] = left_root->commit_root; + extent_buffer_get(left_path->nodes[left_level]); + + right_level = btrfs_header_level(right_root->commit_root); + right_root_level = right_level; + right_path->nodes[right_level] = right_root->commit_root; + extent_buffer_get(right_path->nodes[right_level]); + up_read(&left_root->fs_info->commit_root_sem); + + if (left_level == 0) + btrfs_item_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + else + btrfs_node_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + if (right_level == 0) + btrfs_item_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + else + btrfs_node_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + + left_end_reached = right_end_reached = 0; + advance_left = advance_right = 0; + + while (1) { + if (advance_left && !left_end_reached) { + ret = tree_advance(left_root, left_path, &left_level, + left_root_level, + advance_left != ADVANCE_ONLY_NEXT, + &left_key); + if (ret < 0) + left_end_reached = ADVANCE; + advance_left = 0; + } + if (advance_right && !right_end_reached) { + ret = tree_advance(right_root, right_path, &right_level, + right_root_level, + advance_right != ADVANCE_ONLY_NEXT, + &right_key); + if (ret < 0) + right_end_reached = ADVANCE; + advance_right = 0; + } + + if (left_end_reached && right_end_reached) { + ret = 0; + goto out; + } else if (left_end_reached) { + if (right_level == 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + } + advance_right = ADVANCE; + continue; + } else if (right_end_reached) { + if (left_level == 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + advance_right = ADVANCE; + } else { + enum btrfs_compare_tree_result result; + + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = tree_compare_item(left_root, left_path, + right_path, tmp_buf); + if (ret) + result = BTRFS_COMPARE_TREE_CHANGED; + else + result = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, result, ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + right_path->slots[right_level]); + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kfree(tmp_buf); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the min_trans parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. + */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int level, u64 min_trans) +{ + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) + return 1; + + if (path->locks[level + 1]) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; + } + + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +/* + * search the tree again to find a leaf with greater keys + * returns 0 if it found something or 1 if there are no greater leaves. + * returns < 0 on io errors. + */ +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + return btrfs_next_old_leaf(root, path, 0); +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq) +{ + int slot; + int level; + struct extent_buffer *c; + struct extent_buffer *next; + struct btrfs_key key; + u32 nritems; + int ret; + int old_spinning = path->leave_spinning; + int next_rw_lock = 0; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; + next_rw_lock = 0; + btrfs_release_path(path); + + path->keep_locks = 1; + path->leave_spinning = 1; + + if (time_seq) + ret = btrfs_search_old_slot(root, &key, path, time_seq); + else + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. + */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + if (ret == 0) + path->slots[0]++; + ret = 0; + goto done; + } + /* + * So the above check misses one case: + * - after releasing the path above, someone has removed the item that + * used to be at the very end of the block, and balance between leafs + * gets another one with bigger key.offset to replace it. + * + * This one should be returned as well, or we can get leaf corruption + * later(esp. in __btrfs_drop_extents()). + * + * And a bit more explanation about this check, + * with ret > 0, the key isn't found, the path points to the slot + * where it should be inserted, so the path->slots[0] item must be the + * bigger one. + */ + if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { + ret = 0; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) { + ret = 1; + goto done; + } + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } + continue; + } + + if (next) { + btrfs_tree_unlock_rw(next, next_rw_lock); + free_extent_buffer(next); + } + + next = c; + next_rw_lock = path->locks[level]; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key, 0); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(path); + goto done; + } + + if (!path->skip_locking) { + ret = btrfs_try_tree_read_lock(next); + if (!ret && time_seq) { + /* + * If we don't get the lock, we may be racing + * with push_leaf_left, holding that lock while + * itself waiting for the leaf we've currently + * locked. To solve this situation, we give up + * on our lock and cycle. + */ + free_extent_buffer(next); + btrfs_release_path(path); + cond_resched(); + goto again; + } + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); + } + next_rw_lock = BTRFS_READ_LOCK; + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock_rw(c, path->locks[level]); + + free_extent_buffer(c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = next_rw_lock; + if (!level) + break; + + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key, 0); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(path); + goto done; + } + + if (!path->skip_locking) { + ret = btrfs_try_tree_read_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); + } + next_rw_lock = BTRFS_READ_LOCK; + } + } + ret = 0; +done: + unlock_up(path, 0, 1, 0, NULL); + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == type) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; +} diff --git a/kernel/fs/btrfs/ctree.h b/kernel/fs/btrfs/ctree.h new file mode 100644 index 000000000..6f364e1d8 --- /dev/null +++ b/kernel/fs/btrfs/ctree.h @@ -0,0 +1,4247 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +struct btrfs_pending_snapshot; +extern struct kmem_cache *btrfs_trans_handle_cachep; +extern struct kmem_cache *btrfs_transaction_cachep; +extern struct kmem_cache *btrfs_bit_radix_cachep; +extern struct kmem_cache *btrfs_path_cachep; +extern struct kmem_cache *btrfs_free_space_cachep; +struct btrfs_ordered_sum; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define STATIC noinline +#else +#define STATIC static noinline +#endif + +#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ + +#define BTRFS_MAX_MIRRORS 3 + +#define BTRFS_MAX_LEVEL 8 + +#define BTRFS_COMPAT_EXTENT_TREE_V0 + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* holds quota configuration and tracking */ +#define BTRFS_QUOTA_TREE_OBJECTID 8ULL + +/* for storing items that use the BTRFS_UUID_KEY* types */ +#define BTRFS_UUID_TREE_OBJECTID 9ULL + +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* For storing free space cache */ +#define BTRFS_FREE_SPACE_OBJECTID -11ULL + +/* + * The inode number assigned to the special inode for storing + * free ino cache + */ +#define BTRFS_FREE_INO_OBJECTID -12ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +#define BTRFS_BTREE_INODE_OBJECTID 1 + +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 + +#define BTRFS_DEV_REPLACE_DEVID 0ULL + +/* + * the max metadata block size. This limit is somewhat artificial, + * but the memmove costs go through the roof for larger blocks. + */ +#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 + +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + +/* + * Theoretical limit is larger, but we keep this down to a sane + * value. That should limit greatly the possibility of collisions on + * inode ref items. + */ +#define BTRFS_LINK_MAX 65535U + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +static int btrfs_csum_sizes[] = { 4, 0 }; + +/* four bytes for CRC32 */ +#define BTRFS_EMPTY_DIR_SIZE 0 + +/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +#define REQ_GET_READ_MIRRORS (1 << 30) + +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* ioprio of readahead is set to idle */ +#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) + +#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) + +#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) + +/* + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allow for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_EXTENT 1 +#define BTRFS_FREE_SPACE_BITMAP 2 + +struct btrfs_free_space_entry { + __le64 offset; + __le64 bytes; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_header { + struct btrfs_disk_key location; + __le64 generation; + __le64 num_entries; + __le64 num_bitmaps; +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ +#define BTRFS_FS_STATE_ERROR 0 +#define BTRFS_FS_STATE_REMOUNTING 1 +#define BTRFS_FS_STATE_TRANS_ABORTED 2 +#define BTRFS_FS_STATE_DEV_REPLACING 3 + +/* Super block flags */ +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) + +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 + +/* + * every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* these first four must match the super block */ + u8 csum[BTRFS_CSUM_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* which block this node is supposed to live in */ + __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + u8 level; +} __attribute__ ((__packed__)); + +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ + sizeof(struct btrfs_header)) / \ + sizeof(struct btrfs_key_ptr)) +#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) +#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + BTRFS_FILE_EXTENT_INLINE_DATA_START) +#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) -\ + sizeof(struct btrfs_dir_item)) + + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 +#define BTRFS_LABEL_SIZE 256 + +/* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unused_64[4]; + + u8 tree_root_level; + u8 chunk_root_level; + u8 extent_root_level; + u8 fs_root_level; + u8 dev_root_level; + u8 csum_root_level; + /* future and to align */ + u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ +struct btrfs_super_block { + u8 csum[BTRFS_CSUM_SIZE]; + /* the first 4 fields must match struct btrfs_header */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* this block number */ + __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* this will help find the new super based on the log root */ + __le64 log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 __unused_leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + u8 root_level; + u8 chunk_root_level; + u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + __le64 cache_generation; + __le64 uuid_tree_generation; + + /* future expansion */ + __le64 reserved[30]; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; +} __attribute__ ((__packed__)); + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) +#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) +/* + * some patches floated around with a second compression method + * lets save that incompat here for when they do get in + * Note we don't actually support it, we're just reserving the + * number + */ +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) + +/* + * older kernels tried to do bigger metadata blocks, but the + * code was pretty buggy. Lets not let them try anymore. + */ +#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) + +#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) +#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) + +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES) + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL + +/* + * A leaf is full of items. offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. + */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; + int reada; + /* keep some upper locks as we walk down */ + int lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + unsigned int search_for_split:1; + unsigned int keep_locks:1; + unsigned int skip_locking:1; + unsigned int leave_spinning:1; + unsigned int search_commit_root:1; + unsigned int need_commit_sem:1; + unsigned int skip_release_on_error:1; +}; + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ + +struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_extent_item_v0 { + __le32 refs; +} __attribute__ ((__packed__)); + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ + sizeof(struct btrfs_item)) + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) + +/* + * this flag is only used internally by scrub and may be changed at any time + * it is only declared here to avoid collisions + */ +#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +/* old style backrefs item */ +struct btrfs_extent_ref_v0 { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 count; +} __attribute__ ((__packed__)); + + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_inode_extref { + __le64 parent_objectid; + __le64 index; + __le16 name_len; + __u8 name[0]; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, +}; + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 last_snapshot; + __le64 flags; + __le32 refs; + struct btrfs_disk_key drop_progress; + u8 drop_level; + u8 level; + + /* + * The following fields appear after subvol_uuids+subvol_times + * were introduced. + */ + + /* + * This generation number is used to test if the new fields are valid + * and up to date while reading the root item. Everytime the root item + * is written out, the "generation" field is copied into this field. If + * anyone ever mounted the fs with an older kernel, we will have + * mismatching generation values here and thus must invalidate the + * new fields. See btrfs_update_root and btrfs_find_last_root for + * details. + * the offset of generation_v2 is also used as the start for the memset + * when invalidating the fields. + */ + __le64 generation_v2; + u8 uuid[BTRFS_UUID_SIZE]; + u8 parent_uuid[BTRFS_UUID_SIZE]; + u8 received_uuid[BTRFS_UUID_SIZE]; + __le64 ctransid; /* updated when an inode changes */ + __le64 otransid; /* trans when created */ + __le64 stransid; /* trans when sent. non-zero for received subvol */ + __le64 rtransid; /* trans when received. non-zero for received subvol */ + struct btrfs_timespec ctime; + struct btrfs_timespec otime; + struct btrfs_timespec stime; + struct btrfs_timespec rtime; + __le64 reserved[8]; /* for future */ +} __attribute__ ((__packed__)); + +/* + * this is used for both forward and backward root refs + */ +struct btrfs_root_ref { + __le64 dirid; + __le64 sequence; + __le16 name_len; +} __attribute__ ((__packed__)); + +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* usage filter */ + __le64 usage; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + /* BTRFS_BALANCE_ARGS_LIMIT value */ + __le64 limit; + + __le64 unused[7]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + +#define BTRFS_FILE_EXTENT_INLINE 0 +#define BTRFS_FILE_EXTENT_REG 1 +#define BTRFS_FILE_EXTENT_PREALLOC 2 + +struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ + __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ + u8 type; + + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + * + * At this offset in the structure, the inline extent data start. + */ + __le64 disk_bytenr; + __le64 disk_num_bytes; + /* + * the logical offset in file blocks (no csums) + * this extent record is for. This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included). This + * always reflects the size uncompressed and without encoding. + */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +struct btrfs_dev_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; +} __attribute__ ((__packed__)); + +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 +#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 + +struct btrfs_dev_replace { + u64 replace_state; /* see #define above */ + u64 time_started; /* seconds since 1-Jan-1970 */ + u64 time_stopped; /* seconds since 1-Jan-1970 */ + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + u64 cont_reading_from_srcdev_mode; /* see #define above */ + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + pid_t lock_owner; + atomic_t nesting_level; + struct mutex lock_finishing_cancel_unmount; + struct mutex lock_management_lock; + struct mutex lock; + + struct btrfs_scrub_progress scrub_progress; +}; + +struct btrfs_dev_replace_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 src_devid; + __le64 cursor_left; + __le64 cursor_right; + __le64 cont_reading_from_srcdev_mode; + + __le64 replace_state; + __le64 time_started; + __le64 time_stopped; + __le64 num_write_errors; + __le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) +#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ + BTRFS_SPACE_INFO_GLOBAL_RSV) + +enum btrfs_raid_types { + BTRFS_RAID_RAID10, + BTRFS_RAID_RAID1, + BTRFS_RAID_DUP, + BTRFS_RAID_RAID0, + BTRFS_RAID_SINGLE, + BTRFS_RAID_RAID5, + BTRFS_RAID_RAID6, + BTRFS_NR_RAID_TYPES +}; + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) +#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6) + +/* + * We need a bit for restriper to be able to tell when chunks of type + * SINGLE are available. This "extended" profile format is used in + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields + * (on-disk). The corresponding on-disk bit in chunk.type is reserved + * to avoid remappings between two formats in future. + */ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) + +/* + * A fake block group type that is used to communicate global block reserve + * size to userspace via the SPACE_INFO ioctl. + */ +#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) + +#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_AVAIL_ALLOC_BIT_SINGLE) + +static inline u64 chunk_to_extended(u64 flags) +{ + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) + flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + return flags; +} +static inline u64 extended_to_chunk(u64 flags) +{ + return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; +} + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +#define BTRFS_QGROUP_LEVEL_SHIFT 48 +static inline u64 btrfs_qgroup_level(u64 qgroupid) +{ + return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; +} + +/* + * is subvolume quota turned on? + */ +#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) +/* + * RESCAN is set during the initialization phase + */ +#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) +/* + * Some qgroup entries are known to be out of date, + * either because the configuration has changed in a way that + * makes a rescan necessary, or because the fs has been mounted + * with a non-qgroup-aware version. + * Turning qouta off and on again makes it inconsistent, too. + */ +#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) + +#define BTRFS_QGROUP_STATUS_VERSION 1 + +struct btrfs_qgroup_status_item { + __le64 version; + /* + * the generation is updated during every commit. As older + * versions of btrfs are not aware of qgroups, it will be + * possible to detect inconsistencies by checking the + * generation on mount time + */ + __le64 generation; + + /* flag definitions see above */ + __le64 flags; + + /* + * only used during scanning to record the progress + * of the scan. It contains a logical address + */ + __le64 rescan; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_info_item { + __le64 generation; + __le64 rfer; + __le64 rfer_cmpr; + __le64 excl; + __le64 excl_cmpr; +} __attribute__ ((__packed__)); + +/* flags definition for qgroup limits */ +#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) +#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) +#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) +#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) + +struct btrfs_qgroup_limit_item { + /* + * only updated when any of the other values change + */ + __le64 flags; + __le64 max_rfer; + __le64 max_excl; + __le64 rsv_rfer; + __le64 rsv_excl; +} __attribute__ ((__packed__)); + +/* For raid type sysfs entries */ +struct raid_kobject { + int raid_type; + struct kobject kobj; +}; + +struct btrfs_space_info { + spinlock_t lock; + + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't take mirrors into account */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + + u64 flags; + + /* + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed everytime we call + * btrfs_free_extent so it is a realtime count of what will be freed + * once the transaction is committed. It will be zero'ed everytime the + * transaction commits. + */ + struct percpu_counter total_bytes_pinned; + + struct list_head list; + /* Protected by the spinlock 'lock'. */ + struct list_head ro_bgs; + + struct rw_semaphore groups_sem; + /* for block groups in our same type */ + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + wait_queue_head_t wait; + + struct kobject kobj; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; +}; + +#define BTRFS_BLOCK_RSV_GLOBAL 1 +#define BTRFS_BLOCK_RSV_DELALLOC 2 +#define BTRFS_BLOCK_RSV_TRANS 3 +#define BTRFS_BLOCK_RSV_CHUNK 4 +#define BTRFS_BLOCK_RSV_DELOPS 5 +#define BTRFS_BLOCK_RSV_EMPTY 6 +#define BTRFS_BLOCK_RSV_TEMP 7 + +struct btrfs_block_rsv { + u64 size; + u64 reserved; + struct btrfs_space_info *space_info; + spinlock_t lock; + unsigned short full; + unsigned short type; + unsigned short failfast; +}; + +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. + */ + struct list_head block_group_list; +}; + +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, + BTRFS_CACHE_ERROR = 4, +}; + +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN = 0, + BTRFS_DC_ERROR = 1, + BTRFS_DC_CLEAR = 2, + BTRFS_DC_SETUP = 3, +}; + +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_work work; + struct btrfs_block_group_cache *block_group; + u64 progress; + atomic_t count; +}; + +struct btrfs_io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; + unsigned check_crcs:1; +}; + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; + struct inode *inode; + spinlock_t lock; + u64 pinned; + u64 reserved; + u64 delalloc_bytes; + u64 bytes_super; + u64 flags; + u64 sectorsize; + u64 cache_generation; + + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + + /* for raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + + unsigned int ro:1; + unsigned int iref:1; + unsigned int has_caching_ctl:1; + unsigned int removed:1; + + int disk_cache_state; + + /* cache tracking stuff */ + int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + struct btrfs_free_space_ctl *free_space_ctl; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; + + /* usage count */ + atomic_t count; + + /* List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; + + /* For delayed block group creation or deletion of empty block groups */ + struct list_head bg_list; + + /* For read-only block groups */ + struct list_head ro_list; + + atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; +}; + +/* delayed seq elem */ +struct seq_list { + struct list_head list; + u64 seq; +}; + +#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } + +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + wait_queue_head_t wait; + spinlock_t lock; +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + +void btrfs_init_async_reclaim_work(struct work_struct *work); + +/* fs_info */ +struct reloc_control; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_balance_control; +struct btrfs_delayed_root; +struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *csum_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + u64 first_logical_byte; + struct rb_root block_group_cache_tree; + + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + + struct extent_io_tree freed_extents[2]; + struct extent_io_tree *pinned_extents; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + + /* + * block reservation for extent, checksum, root tree and + * delayed dir index item + */ + struct btrfs_block_rsv global_block_rsv; + /* block reservation for delay allocation */ + struct btrfs_block_rsv delalloc_block_rsv; + /* block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + u64 generation; + u64 last_trans_committed; + u64 avg_delayed_ref_runtime; + + /* + * this is updated to the current trans every time a full commit + * is required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt; + /* + * Track requests for actions that need to be done during transaction + * commit (like for some mount options). + */ + unsigned long pending_changes; + unsigned long compress_type:4; + int commit_interval; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ + u64 max_inline; + /* + * Protected by ->chunk_mutex and sb->s_umount. + * + * The reason that we use two lock to protect it is because only + * remount and mount operations can change it and these two operations + * are under sb->s_umount, but the read side (chunk allocation) can not + * acquire sb->s_umount or the deadlock would happen. So we use two + * locks to protect it. On the write side, we must acquire two locks, + * and on the read side, we just need acquire one of them. + */ + u64 alloc_start; + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. + */ + spinlock_t super_lock; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; + struct block_device *__bdev; + struct super_block *sb; + struct inode *btree_inode; + struct backing_dev_info bdi; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + struct mutex volume_mutex; + + /* + * this is taken to make sure we don't set block groups ro after + * the free space cache has been allocated on them + */ + struct mutex ro_block_group_mutex; + + /* this is used during read/modify/write to make sure + * no two ios are trying to mod the same stripe at the same + * time + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make + * sure the commit code doesn't find the list temporarily empty + * because another function happens to be doing non-waiting preflush + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; + + /* + * Same as ordered_operations_mutex except this is for ordered extents + * and not the operations. + */ + struct mutex ordered_extent_flush_mutex; + + struct rw_semaphore commit_root_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + struct srcu_struct subvol_srcu; + + spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + struct rw_semaphore delayed_iput_sem; + + /* this protects tree_mod_seq_list */ + spinlock_t tree_mod_seq_lock; + atomic64_t tree_mod_seq; + struct list_head tree_mod_seq_list; + + /* this protects tree_mod_log */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; + atomic_t async_delalloc_pages; + atomic_t open_ioctl_trans; + + /* + * this is used to protect the following list -- ordered_roots. + */ + spinlock_t ordered_root_lock; + + /* + * all fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_roots; + + struct mutex delalloc_root_mutex; + spinlock_t delalloc_root_lock; + /* all fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; + + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ + struct btrfs_workqueue *workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct btrfs_workqueue *endio_workers; + struct btrfs_workqueue *endio_meta_workers; + struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *endio_repair_workers; + struct btrfs_workqueue *rmw_workers; + struct btrfs_workqueue *endio_meta_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *submit_workers; + struct btrfs_workqueue *caching_workers; + struct btrfs_workqueue *readahead_workers; + + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. It happens + * for the sys_munmap function call path + */ + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + /* the extent workers do delayed refs on the extent allocation tree */ + struct btrfs_workqueue *extent_workers; + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + int thread_pool_size; + + struct kobject super_kobj; + struct kobject *space_info_kobj; + struct kobject *device_dir_kobj; + struct completion kobj_unregister; + int do_barriers; + int closing; + int log_root_recovering; + int open; + + u64 total_pinned; + + /* used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * the space_info list is almost entirely read only. It only changes + * when we add a new raid type to the FS, and that happens + * very rarely. RCU is used to protect it. + */ + struct list_head space_info; + + struct btrfs_space_info *data_sinfo; + + struct reloc_control *reloc_ctl; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* auto defrag inodes go here */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; + /* + * these three are in extended format (availability of single + * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other + * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) + */ + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + + /* restriper state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_running; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; + + unsigned data_chunk_allocations; + unsigned metadata_ratio; + + void *bdev_holder; + + /* private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + int scrub_workers_refcnt; + struct btrfs_workqueue *scrub_workers; + struct btrfs_workqueue *scrub_wr_completion_workers; + struct btrfs_workqueue *scrub_nocow_workers; + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* + * quota information + */ + unsigned int quota_enabled:1; + + /* + * quota_enabled only changes state after a commit. This holds the + * next state. + */ + unsigned int pending_quota_state:1; + + /* is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* holds configuration and tracking. Protected by qgroup_lock */ + struct rb_root qgroup_tree; + struct rb_root qgroup_op_tree; + spinlock_t qgroup_lock; + spinlock_t qgroup_op_lock; + atomic_t qgroup_op_seq; + + /* + * used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* protect user change for quota operations */ + struct mutex qgroup_ioctl_lock; + + /* list of dirty qgroups to be written at next commit */ + struct list_head dirty_qgroups; + + /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + u64 qgroup_seq; + + /* qgroup rescan items */ + struct mutex qgroup_rescan_lock; /* protects the progress item */ + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workqueue *qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + + /* filesystem state */ + unsigned long fs_state; + + struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; + + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + struct radix_tree_root buffer_radix; + + /* next backup root to be overwritten */ + int backup_root_index; + + int num_tolerated_disk_barrier_failures; + + /* device replace state */ + struct btrfs_dev_replace dev_replace; + + atomic_t mutually_exclusive_operation_running; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; + + struct semaphore uuid_tree_rescan_sem; + unsigned int update_uuid_tree_gen:1; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; + + /* For btrfs to record security options */ + struct security_mnt_opts security_opts; + + /* + * Chunks that can't be freed yet (under a trim/discard operation) + * and will be latter freed. Protected by fs_info->chunk_mutex. + */ + struct list_head pinned_chunks; +}; + +struct btrfs_subvolume_writers { + struct percpu_counter counter; + wait_queue_head_t wait; +}; + +/* + * The state of btrfs root + */ +/* + * btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ +#define BTRFS_ROOT_IN_TRANS_SETUP 0 +#define BTRFS_ROOT_REF_COWS 1 +#define BTRFS_ROOT_TRACK_DIRTY 2 +#define BTRFS_ROOT_IN_RADIX 3 +#define BTRFS_ROOT_DUMMY_ROOT 4 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 +#define BTRFS_ROOT_DEFRAG_RUNNING 6 +#define BTRFS_ROOT_FORCE_COW 7 +#define BTRFS_ROOT_MULTI_LOG_TASKS 8 +#define BTRFS_ROOT_DIRTY 9 + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. + */ +struct btrfs_root { + struct extent_buffer *node; + + struct extent_buffer *commit_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + unsigned long state; + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct mutex objectid_mutex; + + spinlock_t accounting_lock; + struct btrfs_block_rsv *block_rsv; + + /* free ino cache stuff */ + struct btrfs_free_space_ctl *free_ino_ctl; + enum btrfs_caching_type ino_cache_state; + spinlock_t ino_cache_lock; + wait_queue_head_t ino_cache_wait; + struct btrfs_free_space_ctl *free_ino_pinned; + u64 ino_cache_progress; + struct inode *ino_cache_inode; + + struct mutex log_mutex; + wait_queue_head_t log_writer_wait; + wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; + atomic_t log_writers; + atomic_t log_commit[2]; + atomic_t log_batch; + int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ + int last_log_commit; + pid_t log_start_pid; + + u64 objectid; + u64 last_trans; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + u32 stripesize; + + u32 type; + + u64 highest_objectid; + + /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ + u64 alloc_bytenr; + + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + char *name; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; + + struct list_head root_list; + + spinlock_t log_extents_lock[2]; + struct list_head logged_list[2]; + + spinlock_t orphan_lock; + atomic_t orphan_inodes; + struct btrfs_block_rsv *orphan_block_rsv; + int orphan_cleanup_state; + + spinlock_t inode_lock; + /* red-black tree that keeps track of in-memory inodes */ + struct rb_root inode_tree; + + /* + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock + */ + struct radix_tree_root delayed_nodes_tree; + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + dev_t anon_dev; + + spinlock_t root_item_lock; + atomic_t refs; + + struct mutex delalloc_mutex; + spinlock_t delalloc_lock; + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + struct list_head delalloc_root; + u64 nr_delalloc_inodes; + + struct mutex ordered_extent_mutex; + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; + + /* + * Number of currently running SEND ioctls to prevent + * manipulation with the read-only status via SUBVOL_SETFLAGS + */ + int send_in_progress; + struct btrfs_subvolume_writers *subv_writers; + atomic_t will_be_snapshoted; +}; + +struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. Use 0 to take the kernel default + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + + /* spare for later */ + __u32 unused[4]; +}; + + +/* + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_INODE_EXTREF_KEY 13 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. They are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 + +/* + * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know + * the length, so we save the level in key->offset instead of the length. + */ +#define BTRFS_METADATA_ITEM_KEY 169 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * Records the overall state of the qgroups. + * There's only one instance of this key present, + * (0, BTRFS_QGROUP_STATUS_KEY, 0) + */ +#define BTRFS_QGROUP_STATUS_KEY 240 +/* + * Records the currently used space of the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). + */ +#define BTRFS_QGROUP_INFO_KEY 242 +/* + * Contains the user configured limits for the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). + */ +#define BTRFS_QGROUP_LIMIT_KEY 244 +/* + * Records the child-parent relationship of qgroups. For + * each relation, 2 keys are present: + * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) + * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) + */ +#define BTRFS_QGROUP_RELATION_KEY 246 + +#define BTRFS_BALANCE_ITEM_KEY 248 + +/* + * Persistantly stores the io stats in the device tree. + * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). + */ +#define BTRFS_DEV_STATS_KEY 249 + +/* + * Persistantly stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY 250 + +/* + * Stores items that allow to quickly map UUIDs to something else. + * These items are part of the filesystem UUID tree. + * The key is built like this: + * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). + */ +#if BTRFS_UUID_SIZE != 16 +#error "UUID items require BTRFS_UUID_SIZE == 16!" +#endif +#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ +#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to + * received subvols */ + +/* + * string items are for debugging. They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +/* + * Flags for mount options. + * + * Note: don't forget to add new options to btrfs_show_options() + */ +#define BTRFS_MOUNT_NODATASUM (1 << 0) +#define BTRFS_MOUNT_NODATACOW (1 << 1) +#define BTRFS_MOUNT_NOBARRIER (1 << 2) +#define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) +#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) +#define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) +#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) +#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) +#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) +#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) +#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +#define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) +#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) +#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) +#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) +#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (8192) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) +#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ + BTRFS_MOUNT_##opt) + +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + +/* + * Requests for changes that need to be done during transaction commit. + * + * Internal mount options that are used for special handling of the real + * mount options (eg. cannot be set during remount and have to be set during + * transaction commit) + */ + +#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) +#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) +#define BTRFS_PENDING_COMMIT (2) + +#define btrfs_test_pending(info, opt) \ + test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_set_pending(info, opt) \ + set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_clear_pending(info, opt) \ + clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) + +/* + * Helpers for setting pending mount option changes. + * + * Expects corresponding macros + * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name + */ +#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), SET_##opt); \ + btrfs_clear_pending((info), CLEAR_##opt); \ + } \ +} while(0) + +#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), CLEAR_##opt); \ + btrfs_clear_pending((info), SET_##opt); \ + } \ +} while(0) + +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1 << 0) +#define BTRFS_INODE_NODATACOW (1 << 1) +#define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) +#define BTRFS_INODE_PREALLOC (1 << 4) +#define BTRFS_INODE_SYNC (1 << 5) +#define BTRFS_INODE_IMMUTABLE (1 << 6) +#define BTRFS_INODE_APPEND (1 << 7) +#define BTRFS_INODE_NODUMP (1 << 8) +#define BTRFS_INODE_NOATIME (1 << 9) +#define BTRFS_INODE_DIRSYNC (1 << 10) +#define BTRFS_INODE_COMPRESS (1 << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) + +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +static inline void btrfs_init_map_token (struct btrfs_map_token *token) +{ + token->kaddr = NULL; +} + +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, \ + struct btrfs_map_token *token); \ +void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val, \ + struct btrfs_map_token *token); \ +static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off) \ +{ \ + return btrfs_get_token_##bits(eb, ptr, off, NULL); \ +} \ +static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ +} + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \ + struct btrfs_map_token *token) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \ +} \ +static inline void btrfs_set_token_##name(struct extent_buffer *eb, \ + type *s, u##bits val, \ + struct btrfs_map_token *token) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \ +} + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + type *p = page_address(eb->pages[0]); \ + u##bits res = le##bits##_to_cpu(p->member); \ + return res; \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = page_address(eb->pages[0]); \ + p->member = cpu_to_le##bits(val); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, + start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, + nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + +static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (unsigned long)dev + ptr; +} + +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); + + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, + root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + BUG(); + return 0; +} + +BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); +BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, + generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); +BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, + blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); + +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +static inline u32 btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) +{ + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +} + +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_end(eb, btrfs_item_nr(nr)); +} + +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, btrfs_item_nr(nr)); +} + +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_size(eb, btrfs_item_nr(nr)); +} + +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* + * struct btrfs_root_ref + */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, + data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, + name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, + transid, 64); + +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + + +static inline u8 btrfs_key_type(struct btrfs_key *key) +{ + return key->type; +} + +static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) +{ + key->type = val; +} + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, + nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); + +static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags | flag); + return (flags & flag) == flag; +} + +static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags & ~flag); + return (flags & flag) == flag; +} + +static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, + int rev) +{ + u64 flags = btrfs_header_flags(eb); + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline unsigned long btrfs_header_fsid(void) +{ + return offsetof(struct btrfs_header, fsid); +} + +static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + return offsetof(struct btrfs_header, chunk_tree_uuid); +} + +static inline int btrfs_is_leaf(struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, + ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, + otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, + stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, + rtransid, 64); + +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); +} + +static inline void +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); +} + +/* struct btrfs_super_block */ +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, + log_root_transid, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); + +static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + /* + * csum type is validated at mount time + */ + return btrfs_csum_sizes[t]; +} + +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) +{ + return offsetof(struct btrfs_leaf, items); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); + +static inline unsigned long +btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + int slot, + struct btrfs_file_extent_item *fi) +{ + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + /* + * return the space used on disk if this item isn't + * compressed or encoded + */ + if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 && + btrfs_token_file_extent_encryption(eb, fi, &token) == 0 && + btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) { + return btrfs_file_extent_inline_item_len(eb, + btrfs_item_nr(slot)); + } + + /* otherwise use the ram bytes field */ + return btrfs_token_file_extent_ram_bytes(eb, fi, &token); +} + + +/* btrfs_dev_stats_item */ +static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index) +{ + u64 val; + + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); +} + +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* helper function to cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_mask(mapping) & ~__GFP_FS; +} + +/* extent-tree.c */ + +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes); + +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + 2 * num_items; +} + +/* + * Doing a truncate won't result in new nodes or leaves, just what we need for + * COW. + */ +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return root->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait); +int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, + u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct btrfs_root *root, + struct extent_buffer *eb); +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int get_block_group_index(struct btrfs_block_group_cache *cache); +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int level, int is_data); +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int no_quota); + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, + int delalloc); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len); +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset, int no_quota); + +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start, + struct extent_map *em); +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + /* + * Flushing delalloc may cause deadlock somewhere, in this + * case, use FLUSH LIMIT + */ + BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_ALL, +}; + +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode); +void btrfs_orphan_release_metadata(struct inode *inode); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, + u64 *qgroup_reserved, bool use_global_rsv); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved); +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type); +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor); +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +void btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int __get_raid_index(u64 flags); +int btrfs_start_write_no_snapshoting(struct btrfs_root *root); +void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +/* ctree.c */ +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid); +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_path *path, + u64 min_trans); +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, +}; +typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, + struct btrfs_root *right_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx); +int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t cb, void *ctx); +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf); +void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, + u32 data_size); +void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, + u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key); +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq); +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held, int held_rw); +void btrfs_unlock_up_safe(struct btrfs_path *p, int level); + +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, int nr); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u32 data_size) +{ + return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); +} + +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq); +static inline int btrfs_next_old_item(struct btrfs_root *root, + struct btrfs_path *p, u64 time_seq) +{ + ++p->slots[0]; + if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) + return btrfs_next_old_leaf(root, p, time_seq); + return 0; +} +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + return btrfs_next_old_item(root, p, 0); +} +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + int update_ref, int for_reloc); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* + * Get synced with close_ctree() + */ + smp_mb(); + return fs_info->closing; +} + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) +{ + return (root->fs_info->sb->s_flags & MS_RDONLY || + btrfs_fs_closing(root->fs_info)); +} + +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->balance_ctl); + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->quota_root); + kfree(fs_info->uuid_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + security_free_mnt_opts(&fs_info->security_opts); + kfree(fs_info); +} + +/* tree mod log functions from ctree.c */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); + +/* root-item.c */ +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); +int btrfs_find_orphan_roots(struct btrfs_root *tree_root); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +/* uuid-tree.c */ +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, + int (*check_func)(struct btrfs_fs_info *, u8 *, u8, + u64)); + +/* dir-item.c */ +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, + int name_len); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); + +/* inode-item.c */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, + u64 ref_objectid, const char *name, + int name_len, + struct btrfs_inode_extref **extref_ret); + +/* file-item.c */ +struct btrfs_dio_private; +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 logical_offset); +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); + +/* inode.c */ +struct btrfs_delalloc_work { + struct inode *inode; + int wait; + int delay_iput; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput); +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); + +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create); +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes); + +/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ +#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) +#define ClearPageChecked ClearPageFsMisc +#define SetPageChecked SetPageFsMisc +#define PageChecked PageFsMisc +#endif + +/* This forces readahead on a given range of bytes in an inode */ +static inline void btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, unsigned long req_size) +{ + page_cache_sync_readahead(mapping, ra, file, offset, req_size); +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len); +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + struct extent_state **cached_state); +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid); +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +int btrfs_readpage(struct file *file, struct page *page); +void btrfs_evict_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *was_new); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); +void btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_root *root); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_inode_check_errors(struct inode *inode); +extern const struct dentry_operations btrfs_dentry_operations; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); +#endif + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void btrfs_update_iflags(struct inode *inode); +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); +int btrfs_is_empty_uuid(u8 *uuid); +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_pages); +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space); +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs); + + +/* file.c */ +int btrfs_auto_defrag_init(void); +void btrfs_auto_defrag_exit(void); +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct inode *inode); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +extern const struct file_operations btrfs_file_operations; +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, int drop_cache); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end); +int btrfs_release_file(struct inode *inode, struct file *file); +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); + +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); + +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* super.c */ +int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_sync_fs(struct super_block *sb, int wait); + +#ifdef CONFIG_PRINTK +__printf(2, 3) +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); +#else +static inline __printf(2, 3) +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} +#endif + +#define btrfs_emerg(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +#ifdef DEBUG +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#endif + +#ifdef CONFIG_BTRFS_ASSERT + +static inline void assfail(char *expr, char *file, int line) +{ + pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) ((void)0) +#endif + +#define btrfs_assert() +__printf(5, 6) +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + + +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno); + +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, "setting %llu feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_incompat_flags(disk_super) & flag); +} + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ + +#define btrfs_abort_transaction(trans, root, errno) \ +do { \ + __btrfs_abort_transaction(trans, root, __func__, \ + __LINE__, errno); \ +} while (0) + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, \ + __LINE__, (errno), NULL); \ +} while (0) + +#define btrfs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_std_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +__printf(5, 6) +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + +/* acl.c */ +#ifdef CONFIG_BTRFS_FS_POSIX_ACL +struct posix_acl *btrfs_get_acl(struct inode *inode, int type); +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); +#else +#define btrfs_get_acl NULL +#define btrfs_set_acl NULL +static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif + +/* relocation.c */ +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); + +/* scrub.c */ +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); +void btrfs_scrub_pause(struct btrfs_root *root); +void btrfs_scrub_continue(struct btrfs_root *root); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, + struct btrfs_device *dev); +int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, + struct btrfs_scrub_progress *progress); + +/* dev-replace.c */ +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} + +/* reada.c */ +struct reada_control { + struct btrfs_root *root; /* tree to prefetch */ + struct btrfs_key key_start; + struct btrfs_key key_end; /* exclusive */ + atomic_t elems; + struct kref refcnt; + wait_queue_head_t wait; +}; +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *start, struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err); + +static inline int is_fstree(u64 rootid) +{ + if (rootid == BTRFS_FS_TREE_OBJECTID || + ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && + !btrfs_qgroup_level(rootid))) + return 1; + return 0; +} + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + +/* Sanity test specific functions */ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode); +#endif + +static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 1; +#endif + return 0; +} + +#endif diff --git a/kernel/fs/btrfs/delayed-inode.c b/kernel/fs/btrfs/delayed-inode.c new file mode 100644 index 000000000..a2ae42720 --- /dev/null +++ b/kernel/fs/btrfs/delayed-inode.c @@ -0,0 +1,1997 @@ +/* + * Copyright (C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "delayed-inode.h" +#include "disk-io.h" +#include "transaction.h" +#include "ctree.h" + +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 +#define BTRFS_DELAYED_BATCH 16 + +static struct kmem_cache *delayed_node_cache; + +int __init btrfs_delayed_inode_init(void) +{ + delayed_node_cache = kmem_cache_create("btrfs_delayed_node", + sizeof(struct btrfs_delayed_node), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!delayed_node_cache) + return -ENOMEM; + return 0; +} + +void btrfs_delayed_inode_exit(void) +{ + if (delayed_node_cache) + kmem_cache_destroy(delayed_node_cache); +} + +static inline void btrfs_init_delayed_node( + struct btrfs_delayed_node *delayed_node, + struct btrfs_root *root, u64 inode_id) +{ + delayed_node->root = root; + delayed_node->inode_id = inode_id; + atomic_set(&delayed_node->refs, 0); + delayed_node->count = 0; + delayed_node->flags = 0; + delayed_node->ins_root = RB_ROOT; + delayed_node->del_root = RB_ROOT; + mutex_init(&delayed_node->mutex); + delayed_node->index_cnt = 0; + INIT_LIST_HEAD(&delayed_node->n_list); + INIT_LIST_HEAD(&delayed_node->p_list); + delayed_node->bytes_reserved = 0; + memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item)); +} + +static inline int btrfs_is_continuous_delayed_item( + struct btrfs_delayed_item *item1, + struct btrfs_delayed_item *item2) +{ + if (item1->key.type == BTRFS_DIR_INDEX_KEY && + item1->key.objectid == item2->key.objectid && + item1->key.type == item2->key.type && + item1->key.offset + 1 == item2->key.offset) + return 1; + return 0; +} + +static inline struct btrfs_delayed_root *btrfs_get_delayed_root( + struct btrfs_root *root) +{ + return root->fs_info->delayed_root; +} + +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) +{ + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + struct btrfs_delayed_node *node; + + node = ACCESS_ONCE(btrfs_inode->delayed_node); + if (node) { + atomic_inc(&node->refs); + return node; + } + + spin_lock(&root->inode_lock); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { + if (btrfs_inode->delayed_node) { + atomic_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); + spin_unlock(&root->inode_lock); + return node; + } + btrfs_inode->delayed_node = node; + /* can be accessed and cached in the inode */ + atomic_add(2, &node->refs); + spin_unlock(&root->inode_lock); + return node; + } + spin_unlock(&root->inode_lock); + + return NULL; +} + +/* Will return either the node or PTR_ERR(-ENOMEM) */ +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct inode *inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + int ret; + +again: + node = btrfs_get_delayed_node(inode); + if (node) + return node; + + node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); + + /* cached in the btrfs inode and can be accessed */ + atomic_add(2, &node->refs); + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + radix_tree_preload_end(); + goto again; + } + btrfs_inode->delayed_node = node; + spin_unlock(&root->inode_lock); + radix_tree_preload_end(); + + return node; +} + +/* + * Call it when holding delayed_node->mutex + * + * If mod = 1, add this node into the prepared list. + */ +static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node, + int mod) +{ + spin_lock(&root->lock); + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + if (!list_empty(&node->p_list)) + list_move_tail(&node->p_list, &root->prepare_list); + else if (mod) + list_add_tail(&node->p_list, &root->prepare_list); + } else { + list_add_tail(&node->n_list, &root->node_list); + list_add_tail(&node->p_list, &root->prepare_list); + atomic_inc(&node->refs); /* inserted into list */ + root->nodes++; + set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); + } + spin_unlock(&root->lock); +} + +/* Call it when holding delayed_node->mutex */ +static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node) +{ + spin_lock(&root->lock); + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + root->nodes--; + atomic_dec(&node->refs); /* not in the list */ + list_del_init(&node->n_list); + if (!list_empty(&node->p_list)) + list_del_init(&node->p_list); + clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); + } + spin_unlock(&root->lock); +} + +static struct btrfs_delayed_node *btrfs_first_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->node_list)) + goto out; + + p = delayed_root->node_list.next; + node = list_entry(p, struct btrfs_delayed_node, n_list); + atomic_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +static struct btrfs_delayed_node *btrfs_next_delayed_node( + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_root *delayed_root; + struct list_head *p; + struct btrfs_delayed_node *next = NULL; + + delayed_root = node->root->fs_info->delayed_root; + spin_lock(&delayed_root->lock); + if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + /* not in the list */ + if (list_empty(&delayed_root->node_list)) + goto out; + p = delayed_root->node_list.next; + } else if (list_is_last(&node->n_list, &delayed_root->node_list)) + goto out; + else + p = node->n_list.next; + + next = list_entry(p, struct btrfs_delayed_node, n_list); + atomic_inc(&next->refs); +out: + spin_unlock(&delayed_root->lock); + + return next; +} + +static void __btrfs_release_delayed_node( + struct btrfs_delayed_node *delayed_node, + int mod) +{ + struct btrfs_delayed_root *delayed_root; + + if (!delayed_node) + return; + + delayed_root = delayed_node->root->fs_info->delayed_root; + + mutex_lock(&delayed_node->mutex); + if (delayed_node->count) + btrfs_queue_delayed_node(delayed_root, delayed_node, mod); + else + btrfs_dequeue_delayed_node(delayed_root, delayed_node); + mutex_unlock(&delayed_node->mutex); + + if (atomic_dec_and_test(&delayed_node->refs)) { + bool free = false; + struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); + if (atomic_read(&delayed_node->refs) == 0) { + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); + free = true; + } + spin_unlock(&root->inode_lock); + if (free) + kmem_cache_free(delayed_node_cache, delayed_node); + } +} + +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 0); +} + +static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->prepare_list)) + goto out; + + p = delayed_root->prepare_list.next; + list_del_init(p); + node = list_entry(p, struct btrfs_delayed_node, p_list); + atomic_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +static inline void btrfs_release_prepared_delayed_node( + struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 1); +} + +static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +{ + struct btrfs_delayed_item *item; + item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); + if (item) { + item->data_len = data_len; + item->ins_or_del = 0; + item->bytes_reserved = 0; + item->delayed_node = NULL; + atomic_set(&item->refs, 1); + } + return item; +} + +/* + * __btrfs_lookup_delayed_item - look up the delayed item by key + * @delayed_node: pointer to the delayed node + * @key: the key to look up + * @prev: used to store the prev item if the right item isn't found + * @next: used to store the next item if the right item isn't found + * + * Note: if we don't find the right item, we will return the prev item and + * the next item. + */ +static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( + struct rb_root *root, + struct btrfs_key *key, + struct btrfs_delayed_item **prev, + struct btrfs_delayed_item **next) +{ + struct rb_node *node, *prev_node = NULL; + struct btrfs_delayed_item *delayed_item = NULL; + int ret = 0; + + node = root->rb_node; + + while (node) { + delayed_item = rb_entry(node, struct btrfs_delayed_item, + rb_node); + prev_node = node; + ret = btrfs_comp_cpu_keys(&delayed_item->key, key); + if (ret < 0) + node = node->rb_right; + else if (ret > 0) + node = node->rb_left; + else + return delayed_item; + } + + if (prev) { + if (!prev_node) + *prev = NULL; + else if (ret < 0) + *prev = delayed_item; + else if ((node = rb_prev(prev_node)) != NULL) { + *prev = rb_entry(node, struct btrfs_delayed_item, + rb_node); + } else + *prev = NULL; + } + + if (next) { + if (!prev_node) + *next = NULL; + else if (ret > 0) + *next = delayed_item; + else if ((node = rb_next(prev_node)) != NULL) { + *next = rb_entry(node, struct btrfs_delayed_item, + rb_node); + } else + *next = NULL; + } + return NULL; +} + +static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + NULL, NULL); + return item; +} + +static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, + struct btrfs_delayed_item *ins, + int action) +{ + struct rb_node **p, *node; + struct rb_node *parent_node = NULL; + struct rb_root *root; + struct btrfs_delayed_item *item; + int cmp; + + if (action == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_node->ins_root; + else if (action == BTRFS_DELAYED_DELETION_ITEM) + root = &delayed_node->del_root; + else + BUG(); + p = &root->rb_node; + node = &ins->rb_node; + + while (*p) { + parent_node = *p; + item = rb_entry(parent_node, struct btrfs_delayed_item, + rb_node); + + cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); + if (cmp < 0) + p = &(*p)->rb_right; + else if (cmp > 0) + p = &(*p)->rb_left; + else + return -EEXIST; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + ins->delayed_node = delayed_node; + ins->ins_or_del = action; + + if (ins->key.type == BTRFS_DIR_INDEX_KEY && + action == BTRFS_DELAYED_INSERTION_ITEM && + ins->key.offset >= delayed_node->index_cnt) + delayed_node->index_cnt = ins->key.offset + 1; + + delayed_node->count++; + atomic_inc(&delayed_node->root->fs_info->delayed_root->items); + return 0; +} + +static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node, + struct btrfs_delayed_item *item) +{ + return __btrfs_add_delayed_item(node, item, + BTRFS_DELAYED_INSERTION_ITEM); +} + +static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, + struct btrfs_delayed_item *item) +{ + return __btrfs_add_delayed_item(node, item, + BTRFS_DELAYED_DELETION_ITEM); +} + +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(&delayed_root->items_seq); + if ((atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); +} + +static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) +{ + struct rb_root *root; + struct btrfs_delayed_root *delayed_root; + + delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + + BUG_ON(!delayed_root); + BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM && + delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM); + + if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_item->delayed_node->ins_root; + else + root = &delayed_item->delayed_node->del_root; + + rb_erase(&delayed_item->rb_node, root); + delayed_item->delayed_node->count--; + + finish_one_item(delayed_root); +} + +static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) +{ + if (item) { + __btrfs_remove_delayed_item(item); + if (atomic_dec_and_test(&item->refs)) + kfree(item); + } +} + +static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first(&delayed_node->ins_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first(&delayed_node->del_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +static struct btrfs_delayed_item *__btrfs_next_delayed_item( + struct btrfs_delayed_item *item) +{ + struct rb_node *p; + struct btrfs_delayed_item *next = NULL; + + p = rb_next(&item->rb_node); + if (p) + next = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return next; +} + +static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + + if (!trans->bytes_reserved) + return 0; + + src_rsv = trans->block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, + num_bytes, 1); + item->bytes_reserved = num_bytes; + } + + return ret; +} + +static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *rsv; + + if (!item->bytes_reserved) + return; + + rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, item->bytes_reserved, + 0); + btrfs_block_rsv_release(root, rsv, + item->bytes_reserved); +} + +static int btrfs_delayed_inode_reserve_metadata( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_delayed_node *node) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + bool release = false; + + src_rsv = trans->block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. + * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we're accounted for. + */ + if (!src_rsv || (!trans->bytes_reserved && + src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); + /* + * Since we're under a transaction reserve_metadata_bytes could + * try to commit the transaction which will make it return + * EAGAIN to make us stop the transaction we have, so return + * ENOSPC instead so that btrfs_dirty_inode knows what to do. + */ + if (ret == -EAGAIN) + ret = -ENOSPC; + if (!ret) { + node->bytes_reserved = num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "delayed_inode", + btrfs_ino(inode), + num_bytes, 1); + } + return ret; + } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { + spin_lock(&BTRFS_I(inode)->lock); + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!WARN_ON(ret)) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; + } + +migrate: + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + btrfs_ino(inode), num_bytes, 1); + node->bytes_reserved = num_bytes; + } + + if (release) { + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 0); + btrfs_block_rsv_release(root, src_rsv, num_bytes); + } + + return ret; +} + +static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_block_rsv *rsv; + + if (!node->bytes_reserved) + return; + + rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + node->inode_id, node->bytes_reserved, 0); + btrfs_block_rsv_release(root, rsv, + node->bytes_reserved); + node->bytes_reserved = 0; +} + +/* + * This helper will insert some continuous items into the same leaf according + * to the free space of the leaf. + */ +static int btrfs_batch_insert_items(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + struct btrfs_delayed_item *curr, *next; + int free_space; + int total_data_size = 0, total_size = 0; + struct extent_buffer *leaf; + char *data_ptr; + struct btrfs_key *keys; + u32 *data_size; + struct list_head head; + int slot; + int nitems; + int i; + int ret = 0; + + BUG_ON(!path->nodes[0]); + + leaf = path->nodes[0]; + free_space = btrfs_leaf_free_space(root, leaf); + INIT_LIST_HEAD(&head); + + next = item; + nitems = 0; + + /* + * count the number of the continuous items that we can insert in batch + */ + while (total_size + next->data_len + sizeof(struct btrfs_item) <= + free_space) { + total_data_size += next->data_len; + total_size += next->data_len + sizeof(struct btrfs_item); + list_add_tail(&next->tree_list, &head); + nitems++; + + curr = next; + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + if (!btrfs_is_continuous_delayed_item(curr, next)) + break; + } + + if (!nitems) { + ret = 0; + goto out; + } + + /* + * we need allocate some memory space, but it might cause the task + * to sleep, so we set all locked nodes in the path to blocking locks + * first. + */ + btrfs_set_path_blocking(path); + + keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); + if (!keys) { + ret = -ENOMEM; + goto out; + } + + data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS); + if (!data_size) { + ret = -ENOMEM; + goto error; + } + + /* get keys of all the delayed items */ + i = 0; + list_for_each_entry(next, &head, tree_list) { + keys[i] = next->key; + data_size[i] = next->data_len; + i++; + } + + /* reset all the locked nodes in the patch to spinning locks. */ + btrfs_clear_path_blocking(path, NULL, 0); + + /* insert the keys of the items */ + setup_items_for_insert(root, path, keys, data_size, + total_data_size, total_size, nitems); + + /* insert the dir index items */ + slot = path->slots[0]; + list_for_each_entry_safe(curr, next, &head, tree_list) { + data_ptr = btrfs_item_ptr(leaf, slot, char); + write_extent_buffer(leaf, &curr->data, + (unsigned long)data_ptr, + curr->data_len); + slot++; + + btrfs_delayed_item_release_metadata(root, curr); + + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + +error: + kfree(data_size); + kfree(keys); +out: + return ret; +} + +/* + * This helper can just do simple insertion that needn't extend item for new + * data, such as directory name index insertion, inode insertion. + */ +static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *delayed_item) +{ + struct extent_buffer *leaf; + char *ptr; + int ret; + + ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, + delayed_item->data_len); + if (ret < 0 && ret != -EEXIST) + return ret; + + leaf = path->nodes[0]; + + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + + write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, + delayed_item->data_len); + btrfs_mark_buffer_dirty(leaf); + + btrfs_delayed_item_release_metadata(root, delayed_item); + return 0; +} + +/* + * we insert an item first, then if there are some continuous items, we try + * to insert those items into the same leaf. + */ +static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_item *curr, *prev; + int ret = 0; + +do_again: + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_insertion_item(node); + if (!curr) + goto insert_end; + + ret = btrfs_insert_delayed_item(trans, root, path, curr); + if (ret < 0) { + btrfs_release_path(path); + goto insert_end; + } + + prev = curr; + curr = __btrfs_next_delayed_item(prev); + if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { + /* insert the continuous items into the same leaf */ + path->slots[0]++; + btrfs_batch_insert_items(root, path, curr); + } + btrfs_release_delayed_item(prev); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_release_path(path); + mutex_unlock(&node->mutex); + goto do_again; + +insert_end: + mutex_unlock(&node->mutex); + return ret; +} + +static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + struct btrfs_delayed_item *curr, *next; + struct extent_buffer *leaf; + struct btrfs_key key; + struct list_head head; + int nitems, i, last_item; + int ret = 0; + + BUG_ON(!path->nodes[0]); + + leaf = path->nodes[0]; + + i = path->slots[0]; + last_item = btrfs_header_nritems(leaf) - 1; + if (i > last_item) + return -ENOENT; /* FIXME: Is errno suitable? */ + + next = item; + INIT_LIST_HEAD(&head); + btrfs_item_key_to_cpu(leaf, &key, i); + nitems = 0; + /* + * count the number of the dir index items that we can delete in batch + */ + while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { + list_add_tail(&next->tree_list, &head); + nitems++; + + curr = next; + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + if (!btrfs_is_continuous_delayed_item(curr, next)) + break; + + i++; + if (i > last_item) + break; + btrfs_item_key_to_cpu(leaf, &key, i); + } + + if (!nitems) + return 0; + + ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); + if (ret) + goto out; + + list_for_each_entry_safe(curr, next, &head, tree_list) { + btrfs_delayed_item_release_metadata(root, curr); + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + +out: + return ret; +} + +static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_item *curr, *prev; + int ret = 0; + +do_again: + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_deletion_item(node); + if (!curr) + goto delete_fail; + + ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); + if (ret < 0) + goto delete_fail; + else if (ret > 0) { + /* + * can't find the item which the node points to, so this node + * is invalid, just drop it. + */ + prev = curr; + curr = __btrfs_next_delayed_item(prev); + btrfs_release_delayed_item(prev); + ret = 0; + btrfs_release_path(path); + if (curr) { + mutex_unlock(&node->mutex); + goto do_again; + } else + goto delete_fail; + } + + btrfs_batch_delete_items(trans, root, path, curr); + btrfs_release_path(path); + mutex_unlock(&node->mutex); + goto do_again; + +delete_fail: + btrfs_release_path(path); + mutex_unlock(&node->mutex); + return ret; +} + +static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + if (delayed_node && + test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + BUG_ON(!delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); + } +} + +static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + ASSERT(delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); +} + +static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + struct btrfs_key key; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + int mod; + int ret; + + key.objectid = node->inode_id; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + mod = -1; + else + mod = 1; + + ret = btrfs_lookup_inode(trans, root, path, &key, mod); + if (ret > 0) { + btrfs_release_path(path); + return -ENOENT; + } else if (ret < 0) { + return ret; + } + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item)); + btrfs_mark_buffer_dirty(leaf); + + if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + goto no_iref; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) + goto search; +again: + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != node->inode_id) + goto out; + + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + goto out; + + /* + * Delayed iref deletion is for the inode who has only one link, + * so there is only one iref. The case that several irefs are + * in the same item doesn't exist. + */ + btrfs_del_item(trans, root, path); +out: + btrfs_release_delayed_iref(node); +no_iref: + btrfs_release_path(path); +err_out: + btrfs_delayed_inode_release_metadata(root, node); + btrfs_release_delayed_inode(node); + + return ret; + +search: + btrfs_release_path(path); + + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = -1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret); + + ret = 0; + leaf = path->nodes[0]; + path->slots[0]--; + goto again; +} + +static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + mutex_lock(&node->mutex); + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { + mutex_unlock(&node->mutex); + return 0; + } + + ret = __btrfs_update_delayed_inode(trans, root, path, node); + mutex_unlock(&node->mutex); + return ret; +} + +static inline int +__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + ret = btrfs_insert_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_delete_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_update_delayed_inode(trans, node->root, path, node); + return ret; +} + +/* + * Called when committing the transaction. + * Returns 0 on success. + * Returns < 0 on error and returns with an aborted transaction with any + * outstanding delayed items cleaned up. + */ +static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret = 0; + bool count = (nr > 0); + + if (trans->aborted) + return -EIO; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node && (!count || (count && nr--))) { + ret = __btrfs_commit_inode_delayed_items(trans, path, + curr_node); + if (ret) { + btrfs_release_delayed_node(curr_node); + curr_node = NULL; + btrfs_abort_transaction(trans, root, ret); + break; + } + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } + + if (curr_node) + btrfs_release_delayed_node(curr_node); + btrfs_free_path(path); + trans->block_rsv = block_rsv; + + return ret; +} + +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_run_delayed_items(trans, root, -1); +} + +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr) +{ + return __btrfs_run_delayed_items(trans, root, nr); +} + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->count) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + path = btrfs_alloc_path(); + if (!path) { + btrfs_release_delayed_node(delayed_node); + return -ENOMEM; + } + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + + btrfs_release_delayed_node(delayed_node); + btrfs_free_path(path); + trans->block_rsv = block_rsv; + + return ret; +} + +int btrfs_commit_inode_delayed_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + trans = btrfs_join_transaction(delayed_node->root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto trans_out; + } + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) + ret = __btrfs_update_delayed_inode(trans, delayed_node->root, + path, delayed_node); + else + ret = 0; + mutex_unlock(&delayed_node->mutex); + + btrfs_free_path(path); + trans->block_rsv = block_rsv; +trans_out: + btrfs_end_transaction(trans, delayed_node->root); + btrfs_btree_balance_dirty(delayed_node->root); +out: + btrfs_release_delayed_node(delayed_node); + + return ret; +} + +void btrfs_remove_delayed_node(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node); + if (!delayed_node) + return; + + BTRFS_I(inode)->delayed_node = NULL; + btrfs_release_delayed_node(delayed_node); +} + +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; + struct btrfs_work work; +}; + +static void btrfs_async_run_delayed_root(struct btrfs_work *work) +{ + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_delayed_node *delayed_node = NULL; + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + int total_done = 0; + + async_work = container_of(work, struct btrfs_async_delayed_work, work); + delayed_root = async_work->delayed_root; + + path = btrfs_alloc_path(); + if (!path) + goto out; + +again: + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) + goto free_path; + + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + goto free_path; + + path->leave_spinning = 1; + root = delayed_node->root; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + goto release_path; + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; + + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + + trans->block_rsv = block_rsv; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty_nodelay(root); + +release_path: + btrfs_release_path(path); + total_done++; + + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work->nr == 0 || total_done < async_work->nr) + goto again; + +free_path: + btrfs_free_path(path); +out: + wake_up(&delayed_root->wait); + kfree(async_work); +} + + +static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, + struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_async_delayed_work *async_work; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return 0; + + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); + if (!async_work) + return -ENOMEM; + + async_work->delayed_root = delayed_root; + btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper, + btrfs_async_run_delayed_root, NULL, NULL); + async_work->nr = nr; + + btrfs_queue_work(fs_info->delayed_workers, &async_work->work); + return 0; +} + +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + +static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) +{ + int val = atomic_read(&delayed_root->items_seq); + + if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) + return 1; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return 1; + + return 0; +} + +void btrfs_balance_delayed_items(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_fs_info *fs_info = root->fs_info; + + delayed_root = btrfs_get_delayed_root(root); + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return; + + if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int seq; + int ret; + + seq = atomic_read(&delayed_root->items_seq); + + ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0); + if (ret) + return; + + wait_event_interruptible(delayed_root->wait, + could_end_wait(delayed_root, seq)); + return; + } + + btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); +} + +/* Will return 0 or -ENOMEM */ +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_disk_key *disk_key, u8 type, + u64 index) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *delayed_item; + struct btrfs_dir_item *dir_item; + int ret; + + delayed_node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len); + if (!delayed_item) { + ret = -ENOMEM; + goto release_node; + } + + delayed_item->key.objectid = btrfs_ino(dir); + delayed_item->key.type = BTRFS_DIR_INDEX_KEY; + delayed_item->key.offset = index; + + dir_item = (struct btrfs_dir_item *)delayed_item->data; + dir_item->location = *disk_key; + btrfs_set_stack_dir_transid(dir_item, trans->transid); + btrfs_set_stack_dir_data_len(dir_item, 0); + btrfs_set_stack_dir_name_len(dir_item, name_len); + btrfs_set_stack_dir_type(dir_item, type); + memcpy((char *)(dir_item + 1), name, name_len); + + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + + mutex_lock(&delayed_node->mutex); + ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + if (unlikely(ret)) { + btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) " + "into the insertion tree of the delayed node" + "(root id: %llu, inode id: %llu, errno: %d)", + name_len, name, delayed_node->root->objectid, + delayed_node->inode_id, ret); + BUG(); + } + mutex_unlock(&delayed_node->mutex); + +release_node: + btrfs_release_delayed_node(delayed_node); + return ret; +} + +static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root, + struct btrfs_delayed_node *node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_lookup_delayed_insertion_item(node, key); + if (!item) { + mutex_unlock(&node->mutex); + return 1; + } + + btrfs_delayed_item_release_metadata(root, item); + btrfs_release_delayed_item(item); + mutex_unlock(&node->mutex); + return 0; +} + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *dir, + u64 index) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + struct btrfs_key item_key; + int ret; + + node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(node)) + return PTR_ERR(node); + + item_key.objectid = btrfs_ino(dir); + item_key.type = BTRFS_DIR_INDEX_KEY; + item_key.offset = index; + + ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); + if (!ret) + goto end; + + item = btrfs_alloc_delayed_item(0); + if (!item) { + ret = -ENOMEM; + goto end; + } + + item->key = item_key; + + ret = btrfs_delayed_item_reserve_metadata(trans, root, item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible. + */ + BUG_ON(ret); + + mutex_lock(&node->mutex); + ret = __btrfs_add_delayed_deletion_item(node, item); + if (unlikely(ret)) { + btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) " + "into the deletion tree of the delayed node" + "(root id: %llu, inode id: %llu, errno: %d)", + index, node->root->objectid, node->inode_id, + ret); + BUG(); + } + mutex_unlock(&node->mutex); +end: + btrfs_release_delayed_node(node); + return ret; +} + +int btrfs_inode_delayed_dir_index_count(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + + if (!delayed_node) + return -ENOENT; + + /* + * Since we have held i_mutex of this directory, it is impossible that + * a new directory index is added into the delayed node and index_cnt + * is updated now. So we needn't lock the delayed node. + */ + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); + return -EINVAL; + } + + BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; + btrfs_release_delayed_node(delayed_node); + return 0; +} + +void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *item; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + mutex_lock(&delayed_node->mutex); + item = __btrfs_first_delayed_insertion_item(delayed_node); + while (item) { + atomic_inc(&item->refs); + list_add_tail(&item->readdir_list, ins_list); + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(delayed_node); + while (item) { + atomic_inc(&item->refs); + list_add_tail(&item->readdir_list, del_list); + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&delayed_node->mutex); + /* + * This delayed node is still cached in the btrfs inode, so refs + * must be > 1 now, and we needn't check it is going to be freed + * or not. + * + * Besides that, this function is used to read dir, we do not + * insert/delete delayed items in this period. So we also needn't + * requeue or dequeue this delayed node. + */ + atomic_dec(&delayed_node->refs); +} + +void btrfs_put_delayed_items(struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_item *curr, *next; + + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + } + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_del(&curr->readdir_list); + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + } +} + +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index) +{ + struct btrfs_delayed_item *curr, *next; + int ret; + + if (list_empty(del_list)) + return 0; + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + if (curr->key.offset > index) + break; + + list_del(&curr->readdir_list); + ret = (curr->key.offset == index); + + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + + if (ret) + return 1; + else + continue; + } + return 0; +} + +/* + * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree + * + */ +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + struct list_head *ins_list) +{ + struct btrfs_dir_item *di; + struct btrfs_delayed_item *curr, *next; + struct btrfs_key location; + char *name; + int name_len; + int over = 0; + unsigned char d_type; + + if (list_empty(ins_list)) + return 0; + + /* + * Changing the data of the delayed item is impossible. So + * we needn't lock them. And we have held i_mutex of the + * directory, nobody can delete any directory indexes now. + */ + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + + if (curr->key.offset < ctx->pos) { + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + continue; + } + + ctx->pos = curr->key.offset; + + di = (struct btrfs_dir_item *)curr->data; + name = (char *)(di + 1); + name_len = btrfs_stack_dir_name_len(di); + + d_type = btrfs_filetype_table[di->type]; + btrfs_disk_key_to_cpu(&location, &di->location); + + over = !dir_emit(ctx, name, name_len, + location.objectid, d_type); + + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + + if (over) + return 1; + } + return 0; +} + +static void fill_stack_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_inode_item *inode_item, + struct inode *inode) +{ + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); + btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); + btrfs_set_stack_inode_generation(inode_item, + BTRFS_I(inode)->generation); + btrfs_set_stack_inode_sequence(inode_item, inode->i_version); + btrfs_set_stack_inode_transid(inode_item, trans->transid); + btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); + btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); + btrfs_set_stack_inode_block_group(inode_item, 0); + + btrfs_set_stack_timespec_sec(&inode_item->atime, + inode->i_atime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->atime, + inode->i_atime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->mtime, + inode->i_mtime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->mtime, + inode->i_mtime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->ctime, + inode->i_ctime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->ctime, + inode->i_ctime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); +} + +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return -ENOENT; + + mutex_lock(&delayed_node->mutex); + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); + + inode->i_version = btrfs_stack_inode_sequence(inode_item); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); + + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); + + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_stack_timespec_nsec(&inode_item->otime); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + int ret = 0; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + goto release_node; + } + + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); + if (ret) + goto release_node; + + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return ret; +} + +int btrfs_delayed_delete_inode_ref(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + /* + * we don't do delayed inode updates during log recovery because it + * leads to enospc problems. This means we also can't do + * delayed inode refs + */ + if (BTRFS_I(inode)->root->fs_info->log_root_recovering) + return -EAGAIN; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + /* + * We don't reserve space for inode ref deletion is because: + * - We ONLY do async inode ref deletion for the inode who has only + * one link(i_nlink == 1), it means there is only one inode ref. + * And in most case, the inode ref and the inode item are in the + * same leaf, and we will deal with them at the same time. + * Since we are sure we will reserve the space for the inode item, + * it is unnecessary to reserve space for inode ref deletion. + * - If the inode ref and the inode item are not in the same leaf, + * We also needn't worry about enospc problem, because we reserve + * much more space for the inode update than it needs. + * - At the worst, we can steal some space from the global reservation. + * It is very rare. + */ + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + goto release_node; + + set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + +static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_root *root = delayed_node->root; + struct btrfs_delayed_item *curr_item, *prev_item; + + mutex_lock(&delayed_node->mutex); + curr_item = __btrfs_first_delayed_insertion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + btrfs_release_delayed_iref(delayed_node); + + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + btrfs_delayed_inode_release_metadata(root, delayed_node); + btrfs_release_delayed_inode(delayed_node); + } + mutex_unlock(&delayed_node->mutex); +} + +void btrfs_kill_delayed_inode_items(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + __btrfs_kill_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node); +} + +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) +{ + u64 inode_id = 0; + struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; + + while (1) { + spin_lock(&root->inode_lock); + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { + spin_unlock(&root->inode_lock); + break; + } + + inode_id = delayed_nodes[n - 1]->inode_id + 1; + + for (i = 0; i < n; i++) + atomic_inc(&delayed_nodes[i]->refs); + spin_unlock(&root->inode_lock); + + for (i = 0; i < n; i++) { + __btrfs_kill_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i]); + } + } +} + +void btrfs_destroy_delayed_inodes(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node) { + __btrfs_kill_delayed_node(curr_node); + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } +} + diff --git a/kernel/fs/btrfs/delayed-inode.h b/kernel/fs/btrfs/delayed-inode.h new file mode 100644 index 000000000..f70119f25 --- /dev/null +++ b/kernel/fs/btrfs/delayed-inode.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DELAYED_TREE_OPERATION_H +#define __DELAYED_TREE_OPERATION_H + +#include +#include +#include +#include +#include +#include + +#include "ctree.h" + +/* types of the delayed item */ +#define BTRFS_DELAYED_INSERTION_ITEM 1 +#define BTRFS_DELAYED_DELETION_ITEM 2 + +struct btrfs_delayed_root { + spinlock_t lock; + struct list_head node_list; + /* + * Used for delayed nodes which is waiting to be dealt with by the + * worker. If the delayed node is inserted into the work queue, we + * drop it from this list. + */ + struct list_head prepare_list; + atomic_t items; /* for delayed items */ + atomic_t items_seq; /* for delayed items */ + int nodes; /* for delayed nodes */ + wait_queue_head_t wait; +}; + +#define BTRFS_DELAYED_NODE_IN_LIST 0 +#define BTRFS_DELAYED_NODE_INODE_DIRTY 1 +#define BTRFS_DELAYED_NODE_DEL_IREF 2 + +struct btrfs_delayed_node { + u64 inode_id; + u64 bytes_reserved; + struct btrfs_root *root; + /* Used to add the node into the delayed root's node list. */ + struct list_head n_list; + /* + * Used to add the node into the prepare list, the nodes in this list + * is waiting to be dealt with by the async worker. + */ + struct list_head p_list; + struct rb_root ins_root; + struct rb_root del_root; + struct mutex mutex; + struct btrfs_inode_item inode_item; + atomic_t refs; + u64 index_cnt; + unsigned long flags; + int count; +}; + +struct btrfs_delayed_item { + struct rb_node rb_node; + struct btrfs_key key; + struct list_head tree_list; /* used for batch insert/delete items */ + struct list_head readdir_list; /* used for readdir items */ + u64 bytes_reserved; + struct btrfs_delayed_node *delayed_node; + atomic_t refs; + int ins_or_del; + u32 data_len; + char data[0]; +}; + +static inline void btrfs_init_delayed_root( + struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_disk_key *disk_key, u8 type, + u64 index); + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *dir, + u64 index); + +int btrfs_inode_delayed_dir_index_count(struct inode *inode); + +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr); + +void btrfs_balance_delayed_items(struct btrfs_root *root); + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode); +/* Used for evicting the inode. */ +void btrfs_remove_delayed_node(struct inode *inode); +void btrfs_kill_delayed_inode_items(struct inode *inode); +int btrfs_commit_inode_delayed_inode(struct inode *inode); + + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); +int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_delayed_delete_inode_ref(struct inode *inode); + +/* Used for drop dead root */ +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); + +/* Used for clean the transaction */ +void btrfs_destroy_delayed_inodes(struct btrfs_root *root); + +/* Used for readdir() */ +void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, + struct list_head *del_list); +void btrfs_put_delayed_items(struct list_head *ins_list, + struct list_head *del_list); +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index); +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + struct list_head *ins_list); + +/* for init */ +int __init btrfs_delayed_inode_init(void); +void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + +#endif diff --git a/kernel/fs/btrfs/delayed-ref.c b/kernel/fs/btrfs/delayed-ref.c new file mode 100644 index 000000000..8f8ed7d20 --- /dev/null +++ b/kernel/fs/btrfs/delayed-ref.c @@ -0,0 +1,962 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include "ctree.h" +#include "delayed-ref.h" +#include "transaction.h" + +struct kmem_cache *btrfs_delayed_ref_head_cachep; +struct kmem_cache *btrfs_delayed_tree_ref_cachep; +struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_extent_op_cachep; +/* + * delayed back reference update tracking. For subvolume trees + * we queue up extent allocations and backref maintenance for + * delayed processing. This avoids deep call chains where we + * add extents in the middle of btrfs_search_slot, and it allows + * us to buffer up frequently modified backrefs in an rb tree instead + * of hammering updates on the extent allocation tree. + */ + +/* + * compare two delayed tree backrefs with same bytenr and type + */ +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, + struct btrfs_delayed_tree_ref *ref1, int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * compare two delayed data backrefs with same bytenr and type + */ +static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, + struct btrfs_delayed_data_ref *ref1) +{ + if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->objectid < ref2->objectid) + return -1; + if (ref1->objectid > ref2->objectid) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * entries in the rb tree are ordered by the byte number of the extent, + * type of the delayed backrefs and content of delayed backrefs. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref2, + struct btrfs_delayed_ref_node *ref1, + bool compare_seq) +{ + if (ref1->bytenr < ref2->bytenr) + return -1; + if (ref1->bytenr > ref2->bytenr) + return 1; + if (ref1->is_head && ref2->is_head) + return 0; + if (ref2->is_head) + return -1; + if (ref1->is_head) + return 1; + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->no_quota > ref2->no_quota) + return 1; + if (ref1->no_quota < ref2->no_quota) + return -1; + /* merging of sequenced refs is not allowed */ + if (compare_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { + return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), + btrfs_delayed_node_to_tree_ref(ref1), + ref1->type); + } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), + btrfs_delayed_node_to_data_ref(ref1)); + } + BUG(); + return 0; +} + +/* + * insert a new ref into the rbtree. This returns any existing refs + * for the same (bytenr,parent) tuple, or NULL if the new node was properly + * inserted. + */ +static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_node *ins; + int cmp; + + ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + rb_node); + + cmp = comp_entry(entry, ins, 1); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +/* insert a new ref to head ref rbtree */ +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_head *entry; + struct btrfs_delayed_ref_head *ins; + u64 bytenr; + + ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + bytenr = ins->node.bytenr; + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, + href_node); + + if (bytenr < entry->node.bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->node.bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * find an head entry based on bytenr. This returns the delayed ref + * head if it was able to find one, or NULL if nothing was in that spot. + * If return_bigger is given, the next bigger entry is returned if no exact + * match is found. + */ +static struct btrfs_delayed_ref_head * +find_ref_head(struct rb_root *root, u64 bytenr, + int return_bigger) +{ + struct rb_node *n; + struct btrfs_delayed_ref_head *entry; + + n = root->rb_node; + entry = NULL; + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + + if (bytenr < entry->node.bytenr) + n = n->rb_left; + else if (bytenr > entry->node.bytenr) + n = n->rb_right; + else + return entry; + } + if (entry && return_bigger) { + if (bytenr > entry->node.bytenr) { + n = rb_next(&entry->href_node); + if (!n) + n = rb_first(root); + entry = rb_entry(n, struct btrfs_delayed_ref_head, + href_node); + return entry; + } + return entry; + } + return NULL; +} + +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + assert_spin_locked(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (!head->node.in_tree) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + btrfs_put_delayed_ref(&head->node); + return 0; +} + +static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref) +{ + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + rb_erase(&head->href_node, &delayed_refs->href_root); + } else { + assert_spin_locked(&head->lock); + rb_erase(&ref->rb_node, &head->ref_root); + } + ref->in_tree = 0; + btrfs_put_delayed_ref(ref); + atomic_dec(&delayed_refs->num_entries); + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; +} + +static int merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, u64 seq) +{ + struct rb_node *node; + int mod = 0; + int done = 0; + + node = rb_next(&ref->rb_node); + while (!done && node) { + struct btrfs_delayed_ref_node *next; + + next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + if (seq && next->seq >= seq) + break; + if (comp_entry(ref, next, 0)) + continue; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + struct btrfs_delayed_ref_node *tmp; + + tmp = ref; + ref = next; + next = tmp; + done = 1; + } + mod = -next->ref_mod; + } + + drop_delayed_ref(trans, delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, head, ref); + done = 1; + } else { + /* + * You can't have multiples of the same ref on a tree + * block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } + } + return done; +} + +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct rb_node *node; + u64 seq = 0; + + assert_spin_locked(&head->lock); + /* + * We don't have too much refs to merge in the case of delayed data + * refs. + */ + if (head->is_data) + return; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + node = rb_first(&head->ref_root); + while (node) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + /* We can't merge refs that are outside of our seq count */ + if (seq && ref->seq >= seq) + break; + if (merge_ref(trans, delayed_refs, head, ref, seq)) + node = rb_first(&head->ref_root); + else + node = rb_next(&ref->rb_node); + } +} + +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 seq) +{ + struct seq_list *elem; + int ret = 0; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n", + (u32)(seq >> 32), (u32)seq, + (u32)(elem->seq >> 32), (u32)elem->seq, + delayed_refs); + ret = 1; + } + } + + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; +} + +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; + u64 start; + bool loop = false; + + delayed_refs = &trans->transaction->delayed_refs; + +again: + start = delayed_refs->run_delayed_start; + head = find_ref_head(&delayed_refs->href_root, start, 1); + if (!head && !loop) { + delayed_refs->run_delayed_start = 0; + start = 0; + loop = true; + head = find_ref_head(&delayed_refs->href_root, start, 1); + if (!head) + return NULL; + } else if (!head && loop) { + return NULL; + } + + while (head->processing) { + struct rb_node *node; + + node = rb_next(&head->href_node); + if (!node) { + if (loop) + return NULL; + delayed_refs->run_delayed_start = 0; + start = 0; + loop = true; + goto again; + } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + } + + head->processing = 1; + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + delayed_refs->run_delayed_start = head->node.bytenr + + head->node.num_bytes; + return head; +} + +/* + * helper function to update an extent delayed ref in the + * rbtree. existing and update must both have the same + * bytenr and parent + * + * This may free existing if the update cancels out whatever + * operation it was doing. + */ +static noinline void +update_existing_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + if (update->action != existing->action) { + /* + * this is effectively undoing either an add or a + * drop. We decrement the ref_mod, and if it goes + * down to zero we just delete the entry without + * every changing the extent allocation tree. + */ + existing->ref_mod--; + if (existing->ref_mod == 0) + drop_delayed_ref(trans, delayed_refs, head, existing); + else + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + /* + * the action on the existing ref matches + * the action on the ref we're trying to add. + * Bump the ref_mod by one so the backref that + * is eventually added/removed has the correct + * reference count + */ + existing->ref_mod += update->ref_mod; + } +} + +/* + * helper function to update the accounting in the head ref + * existing and update must have the same bytenr + */ +static noinline void +update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref_head *existing_ref; + struct btrfs_delayed_ref_head *ref; + int old_ref_mod; + + existing_ref = btrfs_delayed_node_to_head(existing); + ref = btrfs_delayed_node_to_head(update); + BUG_ON(existing_ref->is_data != ref->is_data); + + spin_lock(&existing_ref->lock); + if (ref->must_insert_reserved) { + /* if the extent was freed and then + * reallocated before the delayed ref + * entries were processed, we can end up + * with an existing head ref without + * the must_insert_reserved flag set. + * Set it again here + */ + existing_ref->must_insert_reserved = ref->must_insert_reserved; + + /* + * update the num_bytes so we make sure the accounting + * is done correctly + */ + existing->num_bytes = update->num_bytes; + + } + + if (ref->extent_op) { + if (!existing_ref->extent_op) { + existing_ref->extent_op = ref->extent_op; + } else { + if (ref->extent_op->update_key) { + memcpy(&existing_ref->extent_op->key, + &ref->extent_op->key, + sizeof(ref->extent_op->key)); + existing_ref->extent_op->update_key = 1; + } + if (ref->extent_op->update_flags) { + existing_ref->extent_op->flags_to_set |= + ref->extent_op->flags_to_set; + existing_ref->extent_op->update_flags = 1; + } + btrfs_free_delayed_extent_op(ref->extent_op); + } + } + /* + * update the reference mod on the head to reflect this new operation, + * only need the lock for this case cause we could be processing it + * currently, for refs we just added we know we're a-ok. + */ + old_ref_mod = existing_ref->total_ref_mod; + existing->ref_mod += update->ref_mod; + existing_ref->total_ref_mod += update->ref_mod; + + /* + * If we are going to from a positive ref mod to a negative or vice + * versa we need to make sure to adjust pending_csums accordingly. + */ + if (existing_ref->is_data) { + if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + delayed_refs->pending_csums -= existing->num_bytes; + if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + delayed_refs->pending_csums += existing->num_bytes; + } + spin_unlock(&existing_ref->lock); +} + +/* + * helper function to actually insert a head node into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count. + */ +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, int action, int is_data) +{ + struct btrfs_delayed_ref_head *existing; + struct btrfs_delayed_ref_head *head_ref = NULL; + struct btrfs_delayed_ref_root *delayed_refs; + int count_mod = 1; + int must_insert_reserved = 0; + + /* + * the head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ + if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + else if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; + + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update + * the reserved accounting when the extent is finally added, or + * if a later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record + * that accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not required. + */ + if (action == BTRFS_ADD_DELAYED_EXTENT) + must_insert_reserved = 1; + else + must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = count_mod; + ref->type = 0; + ref->action = 0; + ref->is_head = 1; + ref->in_tree = 1; + ref->seq = 0; + + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + head_ref->is_data = is_data; + head_ref->ref_root = RB_ROOT; + head_ref->processing = 0; + head_ref->total_ref_mod = count_mod; + + spin_lock_init(&head_ref->lock); + mutex_init(&head_ref->mutex); + + trace_add_delayed_ref_head(ref, head_ref, action); + + existing = htree_insert(&delayed_refs->href_root, + &head_ref->href_node); + if (existing) { + update_existing_head_ref(delayed_refs, &existing->node, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + head_ref = existing; + } else { + if (is_data && count_mod < 0) + delayed_refs->pending_csums += num_bytes; + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + atomic_inc(&delayed_refs->num_entries); + trans->delayed_ref_updates++; + } + return head_ref; +} + +/* + * helper to insert a delayed tree ref into the rbtree. + */ +static noinline void +add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, int level, + int action, int no_quota) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_tree_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + ref->no_quota = no_quota; + ref->seq = seq; + + full_ref = btrfs_delayed_node_to_tree_ref(ref); + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) + ref->type = BTRFS_SHARED_BLOCK_REF_KEY; + else + ref->type = BTRFS_TREE_BLOCK_REF_KEY; + full_ref->level = level; + + trace_add_delayed_tree_ref(ref, full_ref, action); + + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); + if (existing) { + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); + } else { + atomic_inc(&delayed_refs->num_entries); + trans->delayed_ref_updates++; + } + spin_unlock(&head_ref->lock); +} + +/* + * helper to insert a delayed data ref into the rbtree. + */ +static noinline void +add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, u64 owner, + u64 offset, int action, int no_quota) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_data_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + ref->no_quota = no_quota; + ref->seq = seq; + + full_ref = btrfs_delayed_node_to_data_ref(ref); + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) + ref->type = BTRFS_SHARED_DATA_REF_KEY; + else + ref->type = BTRFS_EXTENT_DATA_REF_KEY; + + full_ref->objectid = owner; + full_ref->offset = offset; + + trace_add_delayed_data_ref(ref, full_ref, action); + + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); + if (existing) { + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); + } else { + atomic_inc(&delayed_refs->num_entries); + trans->delayed_ref_updates++; + } + spin_unlock(&head_ref->lock); +} + +/* + * add a delayed tree ref. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + */ +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op, + int no_quota) +{ + struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + + BUG_ON(extent_op && extent_op->is_data); + ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!head_ref) { + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 0); + + add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, + num_bytes, parent, ref_root, level, action, + no_quota); + spin_unlock(&delayed_refs->lock); + + return 0; +} + +/* + * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. + */ +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op, + int no_quota) +{ + struct btrfs_delayed_data_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + + BUG_ON(extent_op && !extent_op->is_data); + ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!head_ref) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 1); + + add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, + num_bytes, parent, ref_root, owner, offset, + action, no_quota); + spin_unlock(&delayed_refs->lock); + + return 0; +} + +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!head_ref) + return -ENOMEM; + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); + + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * this does a simple search for the head node for a given extent. + * It must be called with the delayed ref spinlock held, and it returns + * the head node if any where found, or NULL if not. + */ +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + return find_ref_head(&delayed_refs->href_root, bytenr, 0); +} + +void btrfs_delayed_ref_exit(void) +{ + if (btrfs_delayed_ref_head_cachep) + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); + if (btrfs_delayed_tree_ref_cachep) + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); + if (btrfs_delayed_data_ref_cachep) + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + if (btrfs_delayed_extent_op_cachep) + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); +} + +int btrfs_delayed_ref_init(void) +{ + btrfs_delayed_ref_head_cachep = kmem_cache_create( + "btrfs_delayed_ref_head", + sizeof(struct btrfs_delayed_ref_head), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_ref_head_cachep) + goto fail; + + btrfs_delayed_tree_ref_cachep = kmem_cache_create( + "btrfs_delayed_tree_ref", + sizeof(struct btrfs_delayed_tree_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_tree_ref_cachep) + goto fail; + + btrfs_delayed_data_ref_cachep = kmem_cache_create( + "btrfs_delayed_data_ref", + sizeof(struct btrfs_delayed_data_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_data_ref_cachep) + goto fail; + + btrfs_delayed_extent_op_cachep = kmem_cache_create( + "btrfs_delayed_extent_op", + sizeof(struct btrfs_delayed_extent_op), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_extent_op_cachep) + goto fail; + + return 0; +fail: + btrfs_delayed_ref_exit(); + return -ENOMEM; +} diff --git a/kernel/fs/btrfs/delayed-ref.h b/kernel/fs/btrfs/delayed-ref.h new file mode 100644 index 000000000..5eb089239 --- /dev/null +++ b/kernel/fs/btrfs/delayed-ref.h @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __DELAYED_REF__ +#define __DELAYED_REF__ + +/* these are the possible values of struct btrfs_delayed_ref_node->action */ +#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ +#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ +#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ +#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ + +struct btrfs_delayed_ref_node { + struct rb_node rb_node; + + /* the starting bytenr of the extent */ + u64 bytenr; + + /* the size of the extent */ + u64 num_bytes; + + /* seq number to keep track of insertion order */ + u64 seq; + + /* ref count on this data structure */ + atomic_t refs; + + /* + * how many refs is this entry adding or deleting. For + * head refs, this may be a negative number because it is keeping + * track of the total mods done to the reference count. + * For individual refs, this will always be a positive number + * + * It may be more than one, since it is possible for a single + * parent to have more than one ref on an extent + */ + int ref_mod; + + unsigned int action:8; + unsigned int type:8; + unsigned int no_quota:1; + /* is this node still in the rbtree? */ + unsigned int is_head:1; + unsigned int in_tree:1; +}; + +struct btrfs_delayed_extent_op { + struct btrfs_disk_key key; + u64 flags_to_set; + int level; + unsigned int update_key:1; + unsigned int update_flags:1; + unsigned int is_data:1; +}; + +/* + * the head refs are used to hold a lock on a given extent, which allows us + * to make sure that only one process is running the delayed refs + * at a time for a single extent. They also store the sum of all the + * reference count modifications we've queued up. + */ +struct btrfs_delayed_ref_head { + struct btrfs_delayed_ref_node node; + + /* + * the mutex is held while running the refs, and it is also + * held when checking the sum of reference modifications. + */ + struct mutex mutex; + + spinlock_t lock; + struct rb_root ref_root; + + struct rb_node href_node; + + struct btrfs_delayed_extent_op *extent_op; + + /* + * This is used to track the final ref_mod from all the refs associated + * with this head ref, this is not adjusted as delayed refs are run, + * this is meant to track if we need to do the csum accounting or not. + */ + int total_ref_mod; + + /* + * when a new extent is allocated, it is just reserved in memory + * The actual extent isn't inserted into the extent allocation tree + * until the delayed ref is processed. must_insert_reserved is + * used to flag a delayed ref so the accounting can be updated + * when a full insert is done. + * + * It is possible the extent will be freed before it is ever + * inserted into the extent allocation tree. In this case + * we need to update the in ram accounting to properly reflect + * the free has happened. + */ + unsigned int must_insert_reserved:1; + unsigned int is_data:1; + unsigned int processing:1; +}; + +struct btrfs_delayed_tree_ref { + struct btrfs_delayed_ref_node node; + u64 root; + u64 parent; + int level; +}; + +struct btrfs_delayed_data_ref { + struct btrfs_delayed_ref_node node; + u64 root; + u64 parent; + u64 objectid; + u64 offset; +}; + +struct btrfs_delayed_ref_root { + /* head ref rbtree */ + struct rb_root href_root; + + /* this spin lock protects the rbtree and the entries inside */ + spinlock_t lock; + + /* how many delayed ref updates we've queued, used by the + * throttling code + */ + atomic_t num_entries; + + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + + u64 pending_csums; + + /* + * set when the tree is flushing before a transaction commit, + * used by the throttling code to decide if new updates need + * to be run right away + */ + int flushing; + + u64 run_delayed_start; +}; + +extern struct kmem_cache *btrfs_delayed_ref_head_cachep; +extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; +extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + +int btrfs_delayed_ref_init(void); +void btrfs_delayed_ref_exit(void); + +static inline struct btrfs_delayed_extent_op * +btrfs_alloc_delayed_extent_op(void) +{ + return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); +} + +static inline void +btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) +{ + if (op) + kmem_cache_free(btrfs_delayed_extent_op_cachep, op); +} + +static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + WARN_ON(atomic_read(&ref->refs) == 0); + if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(ref->in_tree); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + case 0: + kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); + break; + default: + BUG(); + } + } +} + +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op, + int no_quota); +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op, + int no_quota); +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); + +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head); +static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) +{ + mutex_unlock(&head->mutex); +} + + +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans); + +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 seq); + +/* + * a node might live in a head or a regular ref, this lets you + * test for the proper type to use. + */ +static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) +{ + return node->is_head; +} + +/* + * helper functions to cast a node into its container + */ +static inline struct btrfs_delayed_tree_ref * +btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_tree_ref, node); +} + +static inline struct btrfs_delayed_data_ref * +btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_data_ref, node); +} + +static inline struct btrfs_delayed_ref_head * +btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(!btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref_head, node); +} +#endif diff --git a/kernel/fs/btrfs/dev-replace.c b/kernel/fs/btrfs/dev-replace.c new file mode 100644 index 000000000..0573848c7 --- /dev/null +++ b/kernel/fs/btrfs/dev-replace.c @@ -0,0 +1,937 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "sysfs.h" + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret); +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev); +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device); +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +static int btrfs_dev_replace_kthread(void *data); +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); + + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_path *path = NULL; + int item_size; + struct btrfs_dev_replace_item *ptr; + u64 src_devid; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { +no_valid_dev_replace_entry_found: + ret = 0; + dev_replace->replace_state = + BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + dev_replace->cont_reading_from_srcdev_mode = + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; + dev_replace->replace_state = 0; + dev_replace->time_started = 0; + dev_replace->time_stopped = 0; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + dev_replace->is_valid = 0; + dev_replace->item_needs_writeback = 0; + goto out; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { + btrfs_warn(fs_info, + "dev_replace entry found has unexpected size, ignore entry"); + goto no_valid_dev_replace_entry_found; + } + + src_devid = btrfs_dev_replace_src_devid(eb, ptr); + dev_replace->cont_reading_from_srcdev_mode = + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); + dev_replace->time_stopped = + btrfs_dev_replace_time_stopped(eb, ptr); + atomic64_set(&dev_replace->num_write_errors, + btrfs_dev_replace_num_write_errors(eb, ptr)); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); + dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); + dev_replace->committed_cursor_left = dev_replace->cursor_left; + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); + dev_replace->is_valid = 1; + + dev_replace->item_needs_writeback = 0; + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, + NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info, + BTRFS_DEV_REPLACE_DEVID, + NULL, NULL); + /* + * allow 'btrfs dev replace_cancel' if src/tgt device is + * missing + */ + if (!dev_replace->srcdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + src_devid); + } + if (!dev_replace->tgtdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + BTRFS_DEV_REPLACE_DEVID); + } + if (dev_replace->tgtdev) { + if (dev_replace->srcdev) { + dev_replace->tgtdev->total_bytes = + dev_replace->srcdev->total_bytes; + dev_replace->tgtdev->disk_total_bytes = + dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->commit_total_bytes = + dev_replace->srcdev->commit_total_bytes; + dev_replace->tgtdev->bytes_used = + dev_replace->srcdev->bytes_used; + dev_replace->tgtdev->commit_bytes_used = + dev_replace->srcdev->commit_bytes_used; + } + dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; + btrfs_init_dev_replace_tgtdev_for_resume(fs_info, + dev_replace->tgtdev); + } + break; + } + +out: + if (path) + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. + */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_replace_item *ptr; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + if (!dev_replace->is_valid || + !dev_replace->item_needs_writeback) { + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + btrfs_warn(fs_info, "error %d while searching for dev_replace item!", + ret); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. + * Since no attempt is made to recover any old state, if the + * dev_replace state is 'running', the data on the target + * drive is lost. + * It would be possible to recover the state: just make sure + * that the beginning of the item is never changed and always + * contains all the essential information. Then read this + * minimal set of information and use it as a base for the + * new state. + */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", + ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + btrfs_warn(fs_info, "insert dev_replace item failed %d!", + ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_dev_replace_item); + + btrfs_dev_replace_lock(dev_replace); + if (dev_replace->srcdev) + btrfs_set_dev_replace_src_devid(eb, ptr, + dev_replace->srcdev->devid); + else + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, + dev_replace->cont_reading_from_srcdev_mode); + btrfs_set_dev_replace_replace_state(eb, ptr, + dev_replace->replace_state); + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); + btrfs_set_dev_replace_num_write_errors(eb, ptr, + atomic64_read(&dev_replace->num_write_errors)); + btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); + dev_replace->cursor_left_last_write_of_item = + dev_replace->cursor_left; + btrfs_set_dev_replace_cursor_left(eb, ptr, + dev_replace->cursor_left_last_write_of_item); + btrfs_set_dev_replace_cursor_right(eb, ptr, + dev_replace->cursor_right); + dev_replace->item_needs_writeback = 0; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + + return ret; +} + +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + dev_replace->committed_cursor_left = + dev_replace->cursor_left_last_write_of_item; +} + +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + + /* the disk copy procedure reuses the scrub code */ + mutex_lock(&fs_info->volume_mutex); + ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, + args->start.srcdev_name, + &src_device); + if (ret) { + mutex_unlock(&fs_info->volume_mutex); + return ret; + } + + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + src_device, &tgt_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) + return ret; + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + goto leave; + } + + dev_replace->cont_reading_from_srcdev_mode = + args->start.cont_reading_from_srcdev_mode; + WARN_ON(!src_device); + dev_replace->srcdev = src_device; + WARN_ON(!tgt_device); + dev_replace->tgtdev = tgt_device; + + printk_in_rcu(KERN_INFO + "BTRFS: dev_replace from %s (devid %llu) to %s started\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + + /* + * from now on, the writes to the srcdev are all duplicated to + * go to the tgtdev as well (refer to btrfs_map_block()). + */ + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + dev_replace->time_started = get_seconds(); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->is_valid = 1; + dev_replace->item_needs_writeback = 1; + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_wait_ordered_roots(root->fs_info, -1); + + /* force writing the updated state information to disk */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_dev_replace_lock(dev_replace); + goto leave; + } + + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* the disk copy procedure reuses the scrub code */ + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, + btrfs_device_get_total_bytes(src_device), + &dev_replace->scrub_progress, 0, 1); + + ret = btrfs_dev_replace_finishing(root->fs_info, ret); + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == -EINPROGRESS) { + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; + ret = 0; + } else { + WARN_ON(ret); + } + + return ret; + +leave: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + btrfs_dev_replace_unlock(dev_replace); + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + return ret; +} + +/* + * blocked until all flighting bios are finished. + */ +static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) +{ + set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + wait_event(fs_info->replace_wait, !percpu_counter_sum( + &fs_info->bio_counter)); +} + +/* + * we have removed target device, it is safe to allow new bios request. + */ +static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device; + struct btrfs_device *src_device; + struct btrfs_root *root = fs_info->tree_root; + u8 uuid_tmp[BTRFS_UUID_SIZE]; + struct btrfs_trans_handle *trans; + int ret = 0; + + /* don't allow cancel or unmount to disturb the finishing procedure */ + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + + btrfs_dev_replace_lock(dev_replace); + /* was the operation canceled, or is it finished? */ + if (dev_replace->replace_state != + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return 0; + } + + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_unlock(dev_replace); + + /* + * flush all outstanding I/O and inode extent mappings before the + * copy operation is declared as being finished + */ + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); + if (ret) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return ret; + } + btrfs_wait_ordered_roots(root->fs_info, -1); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + mutex_lock(&uuid_mutex); + /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&root->fs_info->chunk_mutex); + btrfs_dev_replace_lock(dev_replace); + dev_replace->replace_state = + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + dev_replace->time_stopped = get_seconds(); + dev_replace->item_needs_writeback = 1; + + /* replace old device with new one in mapping tree */ + if (!scrub_ret) { + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + } else { + printk_in_rcu(KERN_ERR + "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return scrub_ret; + } + + printk_in_rcu(KERN_INFO + "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + tgt_device->is_tgtdev_for_dev_replace = 0; + tgt_device->devid = src_device->devid; + src_device->devid = BTRFS_DEV_REPLACE_DEVID; + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); + btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes); + btrfs_device_set_disk_total_bytes(tgt_device, + src_device->disk_total_bytes); + btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); + ASSERT(list_empty(&src_device->resized_list)); + tgt_device->commit_total_bytes = src_device->commit_total_bytes; + tgt_device->commit_bytes_used = src_device->bytes_used; + if (fs_info->sb->s_bdev == src_device->bdev) + fs_info->sb->s_bdev = tgt_device->bdev; + if (fs_info->fs_devices->latest_bdev == src_device->bdev) + fs_info->fs_devices->latest_bdev = tgt_device->bdev; + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + fs_info->fs_devices->rw_devices++; + + btrfs_dev_replace_unlock(dev_replace); + + btrfs_rm_dev_replace_blocked(fs_info); + + btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device); + + btrfs_rm_dev_replace_unblocked(fs_info); + + /* + * this is again a consistent state where no dev_replace procedure + * is running, the target device is part of the filesystem, the + * source device is not part of the filesystem anymore and its 1st + * superblock is scratched out so that it is no longer marked to + * belong to this filesystem. + */ + mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); + + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); + + /* write back the superblocks */ + trans = btrfs_start_transaction(root, 0); + if (!IS_ERR(trans)) + btrfs_commit_transaction(trans, root); + + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device) +{ + int ret; + + if (srcdevid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, + device); + } + return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *srcdev; + + btrfs_dev_replace_lock(dev_replace); + /* even if !dev_replace_is_valid, the values are good enough for + * the replace_status ioctl */ + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + args->status.replace_state = dev_replace->replace_state; + args->status.time_started = dev_replace->time_started; + args->status.time_stopped = dev_replace->time_stopped; + args->status.num_write_errors = + atomic64_read(&dev_replace->num_write_errors); + args->status.num_uncorrectable_read_errors = + atomic64_read(&dev_replace->num_uncorrectable_read_errors); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + args->status.progress_1000 = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + args->status.progress_1000 = 1000; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + srcdev = dev_replace->srcdev; + args->status.progress_1000 = div_u64(dev_replace->cursor_left, + div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); + break; + } + btrfs_dev_replace_unlock(dev_replace); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + args->result = __btrfs_dev_replace_cancel(fs_info); + return 0; +} + +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->tree_root; + u64 result; + int ret; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + btrfs_dev_replace_unlock(dev_replace); + goto leave; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + break; + } + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = get_seconds(); + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + +leave: + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->time_stopped = get_seconds(); + dev_replace->item_needs_writeback = 1; + btrfs_info(fs_info, "suspending dev_replace for unmount"); + break; + } + + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + btrfs_dev_replace_unlock(dev_replace); + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + break; + } + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { + btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); + btrfs_info(fs_info, + "you may cancel the operation after 'mount -o degraded'"); + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + WARN_ON(atomic_xchg( + &fs_info->mutually_exclusive_operation_running, 1)); + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); + return PTR_ERR_OR_ZERO(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_ioctl_dev_replace_args *status_args; + u64 progress; + + status_args = kzalloc(sizeof(*status_args), GFP_NOFS); + if (status_args) { + btrfs_dev_replace_status(fs_info, status_args); + progress = status_args->status.progress_1000; + kfree(status_args); + progress = div_u64(progress, 10); + printk_in_rcu(KERN_INFO + "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? "" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "", + (unsigned int)progress); + } + btrfs_dev_replace_continue_on_mount(fs_info); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + + return 0; +} + +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, + dev_replace->committed_cursor_left, + btrfs_device_get_total_bytes(dev_replace->srcdev), + &dev_replace->scrub_progress, 0, 1); + ret = btrfs_dev_replace_finishing(fs_info, ret); + WARN_ON(ret); + return 0; +} + +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ + if (!dev_replace->is_valid) + return 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * return true even if tgtdev is missing (this is + * something that can happen if the dev_replace + * procedure is suspended by an umount and then + * the tgtdev is missing (or "btrfs dev scan") was + * not called and the the filesystem is remounted + * in degraded state. This does not stop the + * dev_replace procedure. It needs to be canceled + * manually if the cancelation is wanted. + */ + break; + } + return 1; +} + +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +{ + /* the beginning is just an optimization for the typical case */ + if (atomic_read(&dev_replace->nesting_level) == 0) { +acquire_lock: + /* this is not a nested case where the same thread + * is trying to acqurire the same lock twice */ + mutex_lock(&dev_replace->lock); + mutex_lock(&dev_replace->lock_management_lock); + dev_replace->lock_owner = current->pid; + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_lock(&dev_replace->lock_management_lock); + if (atomic_read(&dev_replace->nesting_level) > 0 && + dev_replace->lock_owner == current->pid) { + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_unlock(&dev_replace->lock_management_lock); + goto acquire_lock; +} + +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +{ + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + mutex_lock(&dev_replace->lock_management_lock); + WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); + WARN_ON(dev_replace->lock_owner != current->pid); + atomic_dec(&dev_replace->nesting_level); + if (atomic_read(&dev_replace->nesting_level) == 0) { + dev_replace->lock_owner = 0; + mutex_unlock(&dev_replace->lock_management_lock); + mutex_unlock(&dev_replace->lock); + } else { + mutex_unlock(&dev_replace->lock_management_lock); + } +} + +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) +{ + percpu_counter_inc(&fs_info->bio_counter); +} + +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) +{ + percpu_counter_sub(&fs_info->bio_counter, amount); + + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) +{ + while (1) { + percpu_counter_inc(&fs_info->bio_counter); + if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state))) + break; + + btrfs_bio_counter_dec(fs_info); + wait_event(fs_info->replace_wait, + !test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state)); + } +} diff --git a/kernel/fs/btrfs/dev-replace.h b/kernel/fs/btrfs/dev-replace.h new file mode 100644 index 000000000..20035cbbf --- /dev/null +++ b/kernel/fs/btrfs/dev-replace.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_DEV_REPLACE__) +#define __BTRFS_DEV_REPLACE__ + +struct btrfs_ioctl_dev_replace_args; + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); + +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) +{ + atomic64_inc(stat_value); +} +#endif diff --git a/kernel/fs/btrfs/dir-item.c b/kernel/fs/btrfs/dir-item.c new file mode 100644 index 000000000..1752625fb --- /dev/null +++ b/kernel/fs/btrfs/dir-item.c @@ -0,0 +1,482 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + int ret; + char *ptr; + struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + btrfs_extend_item(root, path, data_size); + } else if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + item = btrfs_item_nr(path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len) +{ + int ret = 0; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); + + key.objectid = objectid; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = btrfs_name_hash(name, name_len); + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) + return PTR_ERR(dir_item); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + * Will return 0 or -ENOMEM + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, const char *name, int name_len, + struct inode *dir, struct btrfs_key *location, + u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = btrfs_ino(dir); + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + btrfs_cpu_key_to_disk(&disk_key, location); + + data_size = sizeof(*dir_item) + name_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out_free; + } + + leaf = path->nodes[0]; + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out_free; + } + btrfs_release_path(path); + + ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir, + &disk_key, type, index); +out_free: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + key.type = BTRFS_DIR_ITEM_KEY; + + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_item *di; + int data_size; + struct extent_buffer *leaf; + int slot; + struct btrfs_path *path; + + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + /* return back any errors */ + if (ret < 0) + goto out; + + /* nothing found, we're safe */ + if (ret > 0) { + ret = 0; + goto out; + } + + /* we found an item, look for our name in the item */ + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) { + /* our exact name was found */ + ret = -EEXIST; + goto out; + } + + /* + * see if there is room in the item to insert this + * name + */ + data_size = sizeof(*di) + name_len; + leaf = path->nodes[0]; + slot = path->slots[0]; + if (data_size + btrfs_item_size_nr(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { + ret = -EOVERFLOW; + } else { + /* plenty of insertion room */ + ret = 0; + } +out: + btrfs_free_path(path); + return ret; +} + +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len) +{ + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u32 nritems; + int ret; + + key.objectid = dirid; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) + break; + + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return di; + + path->slots[0]++; + } + return NULL; +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = btrfs_name_hash(name, name_len); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + if (verify_dir_item(root, leaf, dir_item)) + return NULL; + + total_len = btrfs_item_size_nr(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. + */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + btrfs_truncate_item(root, path, item_len - sub_item_len, 1); + } + return ret; +} + +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item) +{ + u16 namelen = BTRFS_NAME_LEN; + u8 type = btrfs_dir_type(leaf, dir_item); + + if (type >= BTRFS_FT_MAX) { + btrfs_crit(root->fs_info, "invalid dir item type: %d", + (int)type); + return 1; + } + + if (type == BTRFS_FT_XATTR) + namelen = XATTR_NAME_MAX; + + if (btrfs_dir_name_len(leaf, dir_item) > namelen) { + btrfs_crit(root->fs_info, "invalid dir item name len: %u", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ + if ((btrfs_dir_data_len(leaf, dir_item) + + btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { + btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u", + (unsigned)btrfs_dir_name_len(leaf, dir_item), + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + return 0; +} diff --git a/kernel/fs/btrfs/disk-io.c b/kernel/fs/btrfs/disk-io.c new file mode 100644 index 000000000..2ef9a4b72 --- /dev/null +++ b/kernel/fs/btrfs/disk-io.c @@ -0,0 +1,4338 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "tree-log.h" +#include "free-space-cache.h" +#include "inode-map.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "raid56.h" +#include "sysfs.h" +#include "qgroup.h" + +#ifdef CONFIG_X86 +#include +#endif + +static const struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); +static void free_fs_root(struct btrfs_root *root); +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static void btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); +static void btrfs_error_commit_super(struct btrfs_root *root); + +/* + * btrfs_end_io_wq structs are used to do processing in task context when an IO + * is complete. This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ +struct btrfs_end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + enum btrfs_wq_endio_type metadata; + struct list_head list; + struct btrfs_work work; +}; + +static struct kmem_cache *btrfs_end_io_wq_cache; + +int __init btrfs_end_io_wq_init(void) +{ + btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", + sizeof(struct btrfs_end_io_wq), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_end_io_wq_cache) + return -ENOMEM; + return 0; +} + +void btrfs_end_io_wq_exit(void) +{ + if (btrfs_end_io_wq_cache) + kmem_cache_destroy(btrfs_end_io_wq_cache); +} + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_start; + extent_submit_bio_hook_t *submit_bio_done; + int rw; + int mirror_num; + unsigned long bio_flags; + /* + * bio_offset is optional, can be used if the pages in the bio + * can't tell us where in the file the bio should go + */ + u64 bio_offset; + struct btrfs_work work; + int error; +}; + +/* + * Lockdep class keys for extent_buffer->lock's in this root. For a given + * eb, the lockdep key is determined by the btrfs_root it belongs to and + * the level the eb occupies in the tree. + * + * Different roots are used for different purposes and may nest inside each + * other and they require separate keysets. As lockdep keys should be + * static, assign keysets according to the purpose of the root as indicated + * by btrfs_root->objectid. This ensures that all special purpose roots + * have separate keysets. + * + * Lock-nesting across peer nodes is always done with the immediate parent + * node locked thus preventing deadlock. As lockdep doesn't know this, use + * subclass to avoid triggering lockdep warning in such cases. + * + * The key is set by the readpage_end_io_hook after the buffer has passed + * csum validation but before the pages are unlocked. It is also set by + * btrfs_init_new_buffer on freshly allocated blocks. + * + * We also add a check to make sure the highest level of the tree is the + * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code + * needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif + +static struct btrfs_lockdep_keyset { + u64 id; /* root objectid */ + const char *name_stem; /* lock name stem */ + char names[BTRFS_MAX_LEVEL + 1][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; +} btrfs_lockdep_keysets[] = { + { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, + { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, + { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, + { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, + { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, + { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, + { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, + { .id = 0, .name_stem = "tree" }, +}; + +void __init btrfs_init_lockdep(void) +{ + int i, j; + + /* initialize lockdep class names */ + for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { + struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; + + for (j = 0; j < ARRAY_SIZE(ks->names); j++) + snprintf(ks->names[j], sizeof(ks->names[j]), + "btrfs-%s-%02d", ks->name_stem, j); + } +} + +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, + int level) +{ + struct btrfs_lockdep_keyset *ks; + + BUG_ON(level >= ARRAY_SIZE(ks->keys)); + + /* find the matching keyset, id 0 is the default entry */ + for (ks = btrfs_lockdep_keysets; ks->id; ks++) + if (ks->id == objectid) + break; + + lockdep_set_class_and_name(&eb->lock, + &ks->keys[level], ks->names[level]); +} + +#endif + +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ +static struct extent_map *btree_get_extent(struct inode *inode, + struct page *page, size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + read_unlock(&em_tree->lock); + goto out; + } + read_unlock(&em_tree->lock); + + em = alloc_extent_map(); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->len = (u64)-1; + em->block_len = (u64)-1; + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + if (ret == -EEXIST) { + free_extent_map(em); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) + em = ERR_PTR(-EIO); + } else if (ret) { + free_extent_map(em); + em = ERR_PTR(ret); + } + write_unlock(&em_tree->lock); + +out: + return em; +} + +u32 btrfs_csum_data(char *data, u32 seed, size_t len) +{ + return btrfs_crc32c(seed, data, len); +} + +void btrfs_csum_final(u32 crc, char *result) +{ + put_unaligned_le32(~crc, result); +} + +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. + */ +static int csum_tree_block(struct btrfs_fs_info *fs_info, + struct extent_buffer *buf, + int verify) +{ + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + char *result = NULL; + unsigned long len; + unsigned long cur_len; + unsigned long offset = BTRFS_CSUM_SIZE; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + int err; + u32 crc = ~(u32)0; + unsigned long inline_result; + + len = buf->len - offset; + while (len > 0) { + err = map_private_extent_buffer(buf, offset, 32, + &kaddr, &map_start, &map_len); + if (err) + return 1; + cur_len = min(len, map_len - (offset - map_start)); + crc = btrfs_csum_data(kaddr + offset - map_start, + crc, cur_len); + len -= cur_len; + offset += cur_len; + } + if (csum_size > sizeof(inline_result)) { + result = kzalloc(csum_size, GFP_NOFS); + if (!result) + return 1; + } else { + result = (char *)&inline_result; + } + + btrfs_csum_final(crc, result); + + if (verify) { + if (memcmp_extent_buffer(buf, result, 0, csum_size)) { + u32 val; + u32 found = 0; + memcpy(&found, result, csum_size); + + read_extent_buffer(buf, &val, 0, csum_size); + printk_ratelimited(KERN_WARNING + "BTRFS: %s checksum verify failed on %llu wanted %X found %X " + "level %d\n", + fs_info->sb->s_id, buf->start, + val, found, btrfs_header_level(buf)); + if (result != (char *)&inline_result) + kfree(result); + return 1; + } + } else { + write_extent_buffer(buf, result, 0, csum_size); + } + if (result != (char *)&inline_result) + kfree(result); + return 0; +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid, + int atomic) +{ + struct extent_state *cached_state = NULL; + int ret; + bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + if (atomic) + return -EAGAIN; + + if (need_lock) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } + + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, + 0, &cached_state); + if (extent_buffer_uptodate(eb) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk_ratelimited(KERN_ERR + "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", + eb->fs_info->sb->s_id, eb->start, + parent_transid, btrfs_header_generation(eb)); + ret = 1; + + /* + * Things reading via commit roots that don't have normal protection, + * like send, can have a really old block in cache that may point at a + * block that has been free'd and re-allocated. So don't clear uptodate + * if we find an eb that is under IO (dirty/writeback) because we could + * end up reading in the stale data and then writing it back out and + * making everybody very sad. + */ + if (!extent_buffer_under_io(eb)) + clear_extent_buffer_uptodate(eb); +out: + unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state, GFP_NOFS); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); + return ret; +} + +/* + * Return 0 if the superblock checksum type matches the checksum value of that + * algorithm. Pass the raw disk superblock data. + */ +static int btrfs_check_super_csum(char *raw_disk_sb) +{ + struct btrfs_super_block *disk_sb = + (struct btrfs_super_block *)raw_disk_sb; + u16 csum_type = btrfs_super_csum_type(disk_sb); + int ret = 0; + + if (csum_type == BTRFS_CSUM_TYPE_CRC32) { + u32 crc = ~(u32)0; + const int csum_size = sizeof(crc); + char result[csum_size]; + + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space + * is filled with zeros and is included in the checkum. + */ + crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, + crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, result); + + if (memcmp(raw_disk_sb, result, csum_size)) + ret = 1; + } + + if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { + printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n", + csum_type); + ret = 1; + } + + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */ +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int failed = 0; + int ret; + int num_copies = 0; + int mirror_num = 0; + int failed_mirror = 0; + + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, + WAIT_COMPLETE, + btree_get_extent, mirror_num); + if (!ret) { + if (!verify_parent_transid(io_tree, eb, + parent_transid, 0)) + break; + else + ret = -EIO; + } + + /* + * This buffer's crc is fine, but its contents are corrupted, so + * there is no reason to read the other copies, they won't be + * any less wrong. + */ + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + break; + + num_copies = btrfs_num_copies(root->fs_info, + eb->start, eb->len); + if (num_copies == 1) + break; + + if (!failed_mirror) { + failed = 1; + failed_mirror = eb->read_mirror; + } + + mirror_num++; + if (mirror_num == failed_mirror) + mirror_num++; + + if (mirror_num > num_copies) + break; + } + + if (failed && !ret && failed_mirror) + repair_eb_io_failure(root, eb, failed_mirror); + + return ret; +} + +/* + * checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block + */ + +static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) +{ + u64 start = page_offset(page); + u64 found_start; + struct extent_buffer *eb; + + eb = (struct extent_buffer *)page->private; + if (page != eb->pages[0]) + return 0; + found_start = btrfs_header_bytenr(eb); + if (WARN_ON(found_start != start || !PageUptodate(page))) + return 0; + csum_tree_block(fs_info, eb, 0); + return 0; +} + +static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u8 fsid[BTRFS_UUID_SIZE]; + int ret = 1; + + read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); + while (fs_devices) { + if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + ret = 0; + break; + } + fs_devices = fs_devices->seed; + } + return ret; +} + +#define CORRUPT(reason, eb, root, slot) \ + btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d", reason, \ + btrfs_header_bytenr(eb), root->objectid, slot) + +static noinline int check_leaf(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_key leaf_key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (nritems == 0) + return 0; + + /* Check the 0 item */ + if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("invalid item offset size pair", leaf, root, 0); + return -EIO; + } + + /* + * Check to make sure each items keys are in the correct order and their + * offsets make sense. We only have to loop through nritems-1 because + * we check the current slot against the next slot, which verifies the + * next slot's offset+size makes sense and that the current's slot + * offset is correct. + */ + for (slot = 0; slot < nritems - 1; slot++) { + btrfs_item_key_to_cpu(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &key, slot + 1); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EIO; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + CORRUPT("slot offset bad", leaf, root, slot); + return -EIO; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just incase all the items are consistent to eachother, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("slot end outside of leaf", leaf, root, slot); + return -EIO; + } + } + + return 0; +} + +static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) +{ + u64 found_start; + int found_level; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + int reads_done; + + if (!page->private) + goto out; + + eb = (struct extent_buffer *)page->private; + + /* the pending IO might have been the only thing that kept this buffer + * in memory. Make sure we have a ref for all this other checks + */ + extent_buffer_get(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + if (!reads_done) + goto err; + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + + found_start = btrfs_header_bytenr(eb); + if (found_start != eb->start) { + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " + "%llu %llu\n", + eb->fs_info->sb->s_id, found_start, eb->start); + ret = -EIO; + goto err; + } + if (check_tree_block_fsid(root->fs_info, eb)) { + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", + eb->fs_info->sb->s_id, eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + if (found_level >= BTRFS_MAX_LEVEL) { + btrfs_err(root->fs_info, "bad tree block level %d", + (int)btrfs_header_level(eb)); + ret = -EIO; + goto err; + } + + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), + eb, found_level); + + ret = csum_tree_block(root->fs_info, eb, 1); + if (ret) { + ret = -EIO; + goto err; + } + + /* + * If this is a leaf block and it is corrupt, set the corrupt bit so + * that we don't try and read the other copies of this block, just + * return -EIO. + */ + if (found_level == 0 && check_leaf(root, eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = -EIO; + } + + if (!ret) + set_extent_buffer_uptodate(eb); +err: + if (reads_done && + test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(root, eb, eb->start, ret); + + if (ret) { + /* + * our io error hook is going to dec the io pages + * again, we have to make sure it has something + * to decrement + */ + atomic_inc(&eb->io_pages); + clear_extent_buffer_uptodate(eb); + } + free_extent_buffer(eb); +out: + return ret; +} + +static int btree_io_failed_hook(struct page *page, int failed_mirror) +{ + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = failed_mirror; + atomic_dec(&eb->io_pages); + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(root, eb, eb->start, -EIO); + return -EIO; /* we fixed nothing */ +} + +static void end_workqueue_bio(struct bio *bio, int err) +{ + struct btrfs_end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; + + fs_info = end_io_wq->info; + end_io_wq->error = err; + + if (bio->bi_rw & REQ_WRITE) { + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { + wq = fs_info->endio_meta_write_workers; + func = btrfs_endio_meta_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + } else { + if (unlikely(end_io_wq->metadata == + BTRFS_WQ_ENDIO_DIO_REPAIR)) { + wq = fs_info->endio_repair_workers; + func = btrfs_endio_repair_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else if (end_io_wq->metadata) { + wq = fs_info->endio_meta_workers; + func = btrfs_endio_meta_helper; + } else { + wq = fs_info->endio_workers; + func = btrfs_endio_helper; + } + } + + btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); + btrfs_queue_work(wq, &end_io_wq->work); +} + +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + enum btrfs_wq_endio_type metadata) +{ + struct btrfs_end_io_wq *end_io_wq; + + end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) +{ + unsigned long limit = min_t(unsigned long, + info->thread_pool_size, + info->fs_devices->open_devices); + return 256 * limit; +} + +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async; + int ret; + + async = container_of(work, struct async_submit_bio, work); + ret = async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); + if (ret) + async->error = ret; +} + +static void run_one_async_done(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + int limit; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + if (atomic_dec_return(&fs_info->nr_async_submits) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + /* If an error occured we just want to clean up the bio and move on */ + if (async->error) { + bio_endio(async->bio, async->error); + return; + } + + async->submit_bio_done(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_start = submit_bio_start; + async->submit_bio_done = submit_bio_done; + + btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, + run_one_async_done, run_one_async_free); + + async->bio_flags = bio_flags; + async->bio_offset = bio_offset; + + async->error = 0; + + atomic_inc(&fs_info->nr_async_submits); + + if (rw & REQ_SYNC) + btrfs_set_work_high_priority(&async->work); + + btrfs_queue_work(fs_info->workers, &async->work); + + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->nr_async_submits)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0)); + } + + return 0; +} + +static int btree_csum_one_bio(struct bio *bio) +{ + struct bio_vec *bvec; + struct btrfs_root *root; + int i, ret = 0; + + bio_for_each_segment_all(bvec, bio, i) { + root = BTRFS_I(bvec->bv_page->mapping->host)->root; + ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); + if (ret) + break; + } + + return ret; +} + +static int __btree_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + return btree_csum_one_bio(bio); +} + +static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + int ret; + + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; +} + +static int check_async_write(struct inode *inode, unsigned long bio_flags) +{ + if (bio_flags & EXTENT_BIO_TREE_LOG) + return 0; +#ifdef CONFIG_X86 + if (cpu_has_xmm4_2) + return 0; +#endif + return 1; +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + int async = check_async_write(inode, bio_flags); + int ret; + + if (!(rw & REQ_WRITE)) { + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, BTRFS_WQ_ENDIO_METADATA); + if (ret) + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else if (!async) { + ret = btree_csum_one_bio(bio); + if (ret) + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else { + /* + * kthread helpers are used to submit writes so that + * checksumming can happen in parallel across all CPUs + */ + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + bio_offset, + __btree_submit_bio_start, + __btree_submit_bio_done); + } + + if (ret) { +out_w_error: + bio_endio(bio, ret); + } + return ret; +} + +#ifdef CONFIG_MIGRATION +static int btree_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (PageDirty(page)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + return migrate_page(mapping, newpage, page, mode); +} +#endif + + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct btrfs_fs_info *fs_info; + int ret; + + if (wbc->sync_mode == WB_SYNC_NONE) { + + if (wbc->for_kupdate) + return 0; + + fs_info = BTRFS_I(mapping->host)->root->fs_info; + /* this is a bit racy, but that's ok */ + ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret < 0) + return 0; + } + return btree_write_cache_pages(mapping, wbc); +} + +static int btree_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btree_get_extent, 0); +} + +static int btree_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + + return try_release_extent_buffer(page); +} + +static void btree_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, + "page private not zero on page %llu", + (unsigned long long)page_offset(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +static int btree_set_page_dirty(struct page *page) +{ +#ifdef DEBUG + struct extent_buffer *eb; + + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); +#endif + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations btree_aops = { + .readpage = btree_readpage, + .writepages = btree_writepages, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, +#ifdef CONFIG_MIGRATION + .migratepage = btree_migratepage, +#endif + .set_page_dirty = btree_set_page_dirty, +}; + +void readahead_tree_block(struct btrfs_root *root, u64 bytenr) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + + buf = btrfs_find_create_tree_block(root, bytenr); + if (!buf) + return; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, WAIT_NONE, btree_get_extent, 0); + free_extent_buffer(buf); +} + +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + btree_get_extent, mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer(buf); + return -EIO; + } else if (extent_buffer_uptodate(buf)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr) +{ + return find_extent_buffer(fs_info, bytenr); +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr) +{ + if (btrfs_test_is_dummy_root(root)) + return alloc_test_extent_buffer(root->fs_info, bytenr); + return alloc_extent_buffer(root->fs_info, bytenr); +} + + +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, + buf->start + buf->len - 1); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); +} + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr); + if (!buf) + return NULL; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret) { + free_extent_buffer(buf); + return NULL; + } + return buf; + +} + +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct extent_buffer *buf) +{ + if (btrfs_header_generation(buf) == + fs_info->running_transaction->transid) { + btrfs_assert_tree_locked(buf); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(buf); + } + } +} + +static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) +{ + struct btrfs_subvolume_writers *writers; + int ret; + + writers = kmalloc(sizeof(*writers), GFP_NOFS); + if (!writers) + return ERR_PTR(-ENOMEM); + + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); + if (ret < 0) { + kfree(writers); + return ERR_PTR(ret); + } + + init_waitqueue_head(&writers->wait); + return writers; +} + +static void +btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) +{ + percpu_counter_destroy(&writers->counter); + kfree(writers); +} + +static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, + struct btrfs_root *root, struct btrfs_fs_info *fs_info, + u64 objectid) +{ + root->node = NULL; + root->commit_root = NULL; + root->sectorsize = sectorsize; + root->nodesize = nodesize; + root->stripesize = stripesize; + root->state = 0; + root->orphan_cleanup_state = 0; + + root->objectid = objectid; + root->last_trans = 0; + root->highest_objectid = 0; + root->nr_delalloc_inodes = 0; + root->nr_ordered_extents = 0; + root->name = NULL; + root->inode_tree = RB_ROOT; + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + root->block_rsv = NULL; + root->orphan_block_rsv = NULL; + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->delalloc_inodes); + INIT_LIST_HEAD(&root->delalloc_root); + INIT_LIST_HEAD(&root->ordered_extents); + INIT_LIST_HEAD(&root->ordered_root); + INIT_LIST_HEAD(&root->logged_list[0]); + INIT_LIST_HEAD(&root->logged_list[1]); + spin_lock_init(&root->orphan_lock); + spin_lock_init(&root->inode_lock); + spin_lock_init(&root->delalloc_lock); + spin_lock_init(&root->ordered_extent_lock); + spin_lock_init(&root->accounting_lock); + spin_lock_init(&root->log_extents_lock[0]); + spin_lock_init(&root->log_extents_lock[1]); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + mutex_init(&root->ordered_extent_mutex); + mutex_init(&root->delalloc_mutex); + init_waitqueue_head(&root->log_writer_wait); + init_waitqueue_head(&root->log_commit_wait[0]); + init_waitqueue_head(&root->log_commit_wait[1]); + INIT_LIST_HEAD(&root->log_ctxs[0]); + INIT_LIST_HEAD(&root->log_ctxs[1]); + atomic_set(&root->log_commit[0], 0); + atomic_set(&root->log_commit[1], 0); + atomic_set(&root->log_writers, 0); + atomic_set(&root->log_batch, 0); + atomic_set(&root->orphan_inodes, 0); + atomic_set(&root->refs, 1); + atomic_set(&root->will_be_snapshoted, 0); + root->log_transid = 0; + root->log_transid_committed = -1; + root->last_log_commit = 0; + if (fs_info) + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping); + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + if (fs_info) + root->defrag_trans_start = fs_info->generation; + else + root->defrag_trans_start = 0; + root->root_key.objectid = objectid; + root->anon_dev = 0; + + spin_lock_init(&root->root_item_lock); +} + +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); + if (root) + root->fs_info = fs_info; + return root; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +/* Should only be used by the testing infrastructure */ +struct btrfs_root *btrfs_alloc_dummy_root(void) +{ + struct btrfs_root *root; + + root = btrfs_alloc_root(NULL); + if (!root) + return ERR_PTR(-ENOMEM); + __setup_root(4096, 4096, 4096, root, NULL, 1); + set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + root->alloc_bytenr = 0; + + return root; +} +#endif + +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + struct extent_buffer *leaf; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root; + struct btrfs_key key; + int ret = 0; + uuid_le uuid; + + root = btrfs_alloc_root(fs_info); + if (!root) + return ERR_PTR(-ENOMEM); + + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, objectid); + root->root_key.objectid = objectid; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + leaf = NULL; + goto fail; + } + + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + root->node = leaf; + + write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, fs_info->chunk_tree_uuid, + btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + root->commit_root = btrfs_root_node(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + + root->root_item.flags = 0; + root->root_item.byte_limit = 0; + btrfs_set_root_bytenr(&root->root_item, leaf->start); + btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_set_root_level(&root->root_item, 0); + btrfs_set_root_refs(&root->root_item, 1); + btrfs_set_root_used(&root->root_item, leaf->len); + btrfs_set_root_last_snapshot(&root->root_item, 0); + btrfs_set_root_dirid(&root->root_item, 0); + uuid_le_gen(&uuid); + memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); + root->root_item.drop_level = 0; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); + if (ret) + goto fail; + + btrfs_tree_unlock(leaf); + + return root; + +fail: + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(root->commit_root); + free_extent_buffer(leaf); + } + kfree(root); + + return ERR_PTR(ret); +} + +static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct btrfs_root *tree_root = fs_info->tree_root; + struct extent_buffer *leaf; + + root = btrfs_alloc_root(fs_info); + if (!root) + return ERR_PTR(-ENOMEM); + + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, + BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + + /* + * DON'T set REF_COWS for log trees + * + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ + + leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, + NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + kfree(root); + return ERR_CAST(leaf); + } + + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + root->node = leaf; + + write_extent_buffer(root->node, root->fs_info->fsid, + btrfs_header_fsid(), BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + return root; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *log_root; + + log_root = alloc_log_tree(trans, fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + WARN_ON(fs_info->log_root_tree); + fs_info->log_root_tree = log_root; + return 0; +} + +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *log_root; + struct btrfs_inode_item *inode_item; + + log_root = alloc_log_tree(trans, root->fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + + log_root->last_trans = trans->transid; + log_root->root_key.offset = root->root_key.objectid; + + inode_item = &log_root->root_item.inode; + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); + + btrfs_set_root_node(&log_root->root_item, log_root->node); + + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; + root->log_transid_committed = -1; + root->last_log_commit = 0; + return 0; +} + +static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + u64 generation; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + root = btrfs_alloc_root(fs_info); + if (!root) { + ret = -ENOMEM; + goto alloc_fail; + } + + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, key->objectid); + + ret = btrfs_find_root(tree_root, key, path, + &root->root_item, &root->root_key); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto find_fail; + } + + generation = btrfs_root_generation(&root->root_item); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + generation); + if (!root->node) { + ret = -ENOMEM; + goto find_fail; + } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + ret = -EIO; + goto read_fail; + } + root->commit_root = btrfs_root_node(root); +out: + btrfs_free_path(path); + return root; + +read_fail: + free_extent_buffer(root->node); +find_fail: + kfree(root); +alloc_fail: + root = ERR_PTR(ret); + goto out; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + + root = btrfs_read_tree_root(tree_root, location); + if (IS_ERR(root)) + return root; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + set_bit(BTRFS_ROOT_REF_COWS, &root->state); + btrfs_check_and_init_root_item(&root->root_item); + } + + return root; +} + +int btrfs_init_fs_root(struct btrfs_root *root) +{ + int ret; + struct btrfs_subvolume_writers *writers; + + root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); + root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), + GFP_NOFS); + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; + goto fail; + } + + writers = btrfs_alloc_subvolume_writers(); + if (IS_ERR(writers)) { + ret = PTR_ERR(writers); + goto fail; + } + root->subv_writers = writers; + + btrfs_init_free_ino_ctl(root); + spin_lock_init(&root->ino_cache_lock); + init_waitqueue_head(&root->ino_cache_wait); + + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto free_writers; + return 0; + +free_writers: + btrfs_free_subvolume_writers(root->subv_writers); +fail: + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); + return ret; +} + +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) +{ + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); + spin_unlock(&fs_info->fs_roots_radix_lock); + return root; +} + +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + int ret; + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + + return ret; +} + +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + bool check_ref) +{ + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; + if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) + return fs_info->quota_root ? fs_info->quota_root : + ERR_PTR(-ENOENT); + if (location->objectid == BTRFS_UUID_TREE_OBJECTID) + return fs_info->uuid_root ? fs_info->uuid_root : + ERR_PTR(-ENOENT); +again: + root = btrfs_lookup_fs_root(fs_info, location->objectid); + if (root) { + if (check_ref && btrfs_root_refs(&root->root_item) == 0) + return ERR_PTR(-ENOENT); + return root; + } + + root = btrfs_read_fs_root(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; + goto fail; + } + + ret = btrfs_init_fs_root(root); + if (ret) + goto fail; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto fail; + } + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + btrfs_free_path(path); + if (ret < 0) + goto fail; + if (ret == 0) + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); + + ret = btrfs_insert_fs_root(fs_info, root); + if (ret) { + if (ret == -EEXIST) { + free_fs_root(root); + goto again; + } + goto fail; + } + return root; +fail: + free_fs_root(root); + return ERR_PTR(ret); +} + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + struct btrfs_device *device; + struct backing_dev_info *bdi; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + return ret; +} + +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ + int err; + + err = bdi_setup_and_register(bdi, "btrfs"); + if (err) + return err; + + bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct btrfs_end_io_wq *end_io_wq; + int error; + + end_io_wq = container_of(work, struct btrfs_end_io_wq, work); + bio = end_io_wq->bio; + + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); + bio_endio_nodec(bio, error); +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + int again; + + do { + again = 0; + + /* Make the cleaner go to sleep early. */ + if (btrfs_need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) + goto sleep; + + /* + * Avoid the problem that we change the status of the fs + * during the above check and trylock. + */ + if (btrfs_need_cleaner_sleep(root)) { + mutex_unlock(&root->fs_info->cleaner_mutex); + goto sleep; + } + + btrfs_run_delayed_iputs(root); + btrfs_delete_unused_bgs(root->fs_info); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + /* + * The defragger has dealt with the R/O remount and umount, + * needn't do anything special here. + */ + btrfs_run_defrag_inodes(root->fs_info); +sleep: + if (!try_to_freeze() && !again) { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + u64 transid; + unsigned long now; + unsigned long delay; + bool cannot_commit; + + do { + cannot_commit = false; + delay = HZ * root->fs_info->commit_interval; + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + spin_lock(&root->fs_info->trans_lock); + cur = root->fs_info->running_transaction; + if (!cur) { + spin_unlock(&root->fs_info->trans_lock); + goto sleep; + } + + now = get_seconds(); + if (cur->state < TRANS_STATE_BLOCKED && + (now < cur->start_time || + now - cur->start_time < root->fs_info->commit_interval)) { + spin_unlock(&root->fs_info->trans_lock); + delay = HZ * 5; + goto sleep; + } + transid = cur->transid; + spin_unlock(&root->fs_info->trans_lock); + + /* If the file system is aborted, this will always fail. */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + cannot_commit = true; + goto sleep; + } + if (transid == trans->transid) { + btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); + } +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, + &root->fs_info->fs_state))) + btrfs_cleanup_transaction(root); + if (!try_to_freeze()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop() && + (!btrfs_transaction_blocked(root->fs_info) || + cannot_commit)) + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will find the highest generation in the array of + * root backups. The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. + */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ + u64 cur; + int newest_index = -1; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = i; + } + + /* check to see if we actually wrapped around */ + if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { + root_backup = info->super_copy->super_roots; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = 0; + } + return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array. This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, + u64 newest_gen) +{ + int newest_index = -1; + + newest_index = find_newest_super_backup(info, newest_gen); + /* if there was garbage in there, just move along */ + if (newest_index == -1) { + info->backup_root_index = 0; + } else { + info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; + } +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + int next_backup; + struct btrfs_root_backup *root_backup; + int last_backup; + + next_backup = info->backup_root_index; + last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + + /* + * just overwrite the last backup if we're at the same generation + * this happens only at umount + */ + root_backup = info->super_for_commit->super_roots + last_backup; + if (btrfs_backup_tree_root_gen(root_backup) == + btrfs_header_generation(info->tree_root->node)) + next_backup = last_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(info->extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(info->extent_root->node)); + + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. + */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + } + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(info->csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(info->csum_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block. It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, + struct btrfs_super_block *super, + int *num_backups_tried, int *backup_index) +{ + struct btrfs_root_backup *root_backup; + int newest = *backup_index; + + if (*num_backups_tried == 0) { + u64 gen = btrfs_super_generation(super); + + newest = find_newest_super_backup(info, gen); + if (newest == -1) + return -1; + + *backup_index = newest; + *num_backups_tried = 1; + } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { + /* we've tried all the backups, all done */ + return -1; + } else { + /* jump to the next oldest backup */ + newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + *backup_index = newest; + *num_backups_tried += 1; + } + root_backup = super->super_roots + newest; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + return 0; +} + +/* helper to cleanup workers */ +static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) +{ + btrfs_destroy_workqueue(fs_info->fixup_workers); + btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->workers); + btrfs_destroy_workqueue(fs_info->endio_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_workers); + btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->endio_repair_workers); + btrfs_destroy_workqueue(fs_info->rmw_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + btrfs_destroy_workqueue(fs_info->endio_write_workers); + btrfs_destroy_workqueue(fs_info->endio_freespace_worker); + btrfs_destroy_workqueue(fs_info->submit_workers); + btrfs_destroy_workqueue(fs_info->delayed_workers); + btrfs_destroy_workqueue(fs_info->caching_workers); + btrfs_destroy_workqueue(fs_info->readahead_workers); + btrfs_destroy_workqueue(fs_info->flush_workers); + btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->extent_workers); +} + +static void free_root_extent_buffers(struct btrfs_root *root) +{ + if (root) { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + root->node = NULL; + root->commit_root = NULL; + } +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ + free_root_extent_buffers(info->tree_root); + + free_root_extent_buffers(info->dev_root); + free_root_extent_buffers(info->extent_root); + free_root_extent_buffers(info->csum_root); + free_root_extent_buffers(info->quota_root); + free_root_extent_buffers(info->uuid_root); + if (chunk_root) + free_root_extent_buffers(info->chunk_root); +} + +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + btrfs_put_fs_root(gang[0]); + } + } + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); + } + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + btrfs_free_log_root_tree(NULL, fs_info); + btrfs_destroy_pinned_extent(fs_info->tree_root, + fs_info->pinned_extents); + } +} + +static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) +{ + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + fs_info->scrub_workers_refcnt = 0; +} + +static void btrfs_init_balance(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); +} + +static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + set_nlink(fs_info->btree_inode, 1); + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); + btrfs_insert_inode_hash(fs_info->btree_inode); +} + +static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) +{ + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); + init_waitqueue_head(&fs_info->replace_wait); +} + +static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; + mutex_init(&fs_info->qgroup_rescan_lock); +} + +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int max_active = fs_info->thread_pool_size; + unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); + + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); + + /* + * a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->extent_workers && + fs_info->qgroup_rescan_workers)) { + return -ENOMEM; + } + + return 0; +} + +static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *log_tree_root; + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "BTRFS: log replay required " + "on RO media\n"); + return -EIO; + } + + log_tree_root = btrfs_alloc_root(fs_info); + if (!log_tree_root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, log_tree_root, fs_info, + BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + fs_info->generation + 1); + if (!log_tree_root->node || + !extent_buffer_uptodate(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return -EIO; + } + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return ret; + } + + if (fs_info->sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + if (ret) + return ret; + } + + return 0; +} + +static int btrfs_read_roots(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + struct btrfs_root *root; + struct btrfs_key location; + int ret; + + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; + + location.objectid = BTRFS_DEV_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); + + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; + + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(root)) { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->quota_enabled = 1; + fs_info->pending_quota_state = 1; + fs_info->quota_root = root; + } + + location.objectid = BTRFS_UUID_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + return ret; + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->uuid_root = root; + } + + return 0; +} + +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 stripesize; + u64 generation; + u64 features; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + int ret; + int err = -EINVAL; + int num_backups_tried = 0; + int backup_index = 0; + int max_active; + + tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); + chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); + if (!tree_root || !chunk_root) { + err = -ENOMEM; + goto fail; + } + + ret = init_srcu_struct(&fs_info->subvol_srcu); + if (ret) { + err = ret; + goto fail; + } + + ret = setup_bdi(fs_info, &fs_info->bdi); + if (ret) { + err = ret; + goto fail_srcu; + } + + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + if (ret) { + err = ret; + goto fail_bdi; + } + fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); + if (ret) { + err = ret; + goto fail_dirty_metadata_bytes; + } + + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); + if (ret) { + err = ret; + goto fail_delalloc_bytes; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bio_counter; + } + + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->delayed_iputs); + INIT_LIST_HEAD(&fs_info->delalloc_roots); + INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_root_lock); + spin_lock_init(&fs_info->trans_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); + spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->unused_bgs_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); + mutex_init(&fs_info->reloc_mutex); + mutex_init(&fs_info->delalloc_root_mutex); + seqlock_init(&fs_info->profiles_lock); + init_rwsem(&fs_info->delayed_iput_sem); + + init_completion(&fs_info->kobj_unregister); + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_LIST_HEAD(&fs_info->unused_bgs); + btrfs_mapping_init(&fs_info->mapping_tree); + btrfs_init_block_rsv(&fs_info->global_block_rsv, + BTRFS_BLOCK_RSV_GLOBAL); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); + btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv, + BTRFS_BLOCK_RSV_DELOPS); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->async_delalloc_pages, 0); + atomic_set(&fs_info->async_submit_draining, 0); + atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->qgroup_op_seq, 0); + atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->sb = sb; + fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; + fs_info->metadata_ratio = 0; + fs_info->defrag_inodes = RB_ROOT; + fs_info->free_chunk_space = 0; + fs_info->tree_mod_log = RB_ROOT; + fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ + /* readahead state */ + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + spin_lock_init(&fs_info->reada_lock); + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_roots); + spin_lock_init(&fs_info->ordered_root_lock); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_NOFS); + if (!fs_info->delayed_root) { + err = -ENOMEM; + goto fail_iput; + } + btrfs_init_delayed_root(fs_info->delayed_root); + + btrfs_init_scrub(fs_info); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + fs_info->check_integrity_print_mask = 0; +#endif + btrfs_init_balance(fs_info); + btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + sb->s_bdi = &fs_info->bdi; + + btrfs_init_btree_inode(fs_info, tree_root); + + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT; + fs_info->first_logical_byte = (u64)-1; + + extent_io_tree_init(&fs_info->freed_extents[0], + fs_info->btree_inode->i_mapping); + extent_io_tree_init(&fs_info->freed_extents[1], + fs_info->btree_inode->i_mapping); + fs_info->pinned_extents = &fs_info->freed_extents[0]; + fs_info->do_barriers = 1; + + + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->ordered_extent_flush_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->ro_block_group_mutex); + init_rwsem(&fs_info->commit_root_sem); + init_rwsem(&fs_info->cleanup_work_sem); + init_rwsem(&fs_info->subvol_sem); + sema_init(&fs_info->uuid_tree_rescan_sem, 1); + + btrfs_init_dev_replace_locks(fs_info); + btrfs_init_qgroup(fs_info); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->transaction_blocked_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + + INIT_LIST_HEAD(&fs_info->pinned_chunks); + + ret = btrfs_alloc_stripe_hash_table(fs_info); + if (ret) { + err = ret; + goto fail_alloc; + } + + __setup_root(4096, 4096, 4096, tree_root, + fs_info, BTRFS_ROOT_TREE_OBJECTID); + + invalidate_bdev(fs_devices->latest_bdev); + + /* + * Read super block and check the signature bytes only + */ + bh = btrfs_read_dev_super(fs_devices->latest_bdev); + if (!bh) { + err = -EINVAL; + goto fail_alloc; + } + + /* + * We want to check superblock checksum, the type is stored inside. + * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). + */ + if (btrfs_check_super_csum(bh->b_data)) { + printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); + err = -EINVAL; + goto fail_alloc; + } + + /* + * super_copy is zeroed at allocation time and we never touch the + * following bytes up to INFO_SIZE, the checksum is calculated from + * the whole block of INFO_SIZE + */ + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); + brelse(bh); + + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); + + ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + if (ret) { + printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); + err = -EINVAL; + goto fail_alloc; + } + + disk_super = fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + goto fail_alloc; + + /* check FS state, whether FS is broken. */ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + + /* + * run through our array of backup supers and setup + * our ring pointer to the oldest one + */ + generation = btrfs_super_generation(disk_super); + find_oldest_super_backup(fs_info, generation); + + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { + err = ret; + goto fail_alloc; + } + + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "BTRFS: couldn't mount because of " + "unsupported optional features (%Lx).\n", + features); + err = -EINVAL; + goto fail_alloc; + } + + /* + * Leafsize and nodesize were always equal, this is only a sanity check. + */ + if (le32_to_cpu(disk_super->__unused_leafsize) != + btrfs_super_nodesize(disk_super)) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksizes don't match. node %d leaf %d\n", + btrfs_super_nodesize(disk_super), + le32_to_cpu(disk_super->__unused_leafsize)); + err = -EINVAL; + goto fail_alloc; + } + if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksize (%d) was too large\n", + btrfs_super_nodesize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + + features = btrfs_super_incompat_flags(disk_super); + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + + if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + printk(KERN_INFO "BTRFS: has skinny extents\n"); + + /* + * flag our filesystem as having big metadata blocks if + * they are bigger than the page size + */ + if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + + nodesize = btrfs_super_nodesize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); + + /* + * mixed block groups end up with duplicate but slightly offset + * extent buffers for the same range. It leads to corruptions + */ + if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (sectorsize != nodesize)) { + printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " + "are not allowed for mixed block groups on %s\n", + sb->s_id); + goto fail_alloc; + } + + /* + * Needn't use the lock because there is no other task which will + * update the flag. + */ + btrfs_set_super_incompat_flags(disk_super, features); + + features = btrfs_super_compat_ro_flags(disk_super) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " + "unsupported option features (%Lx).\n", + features); + err = -EINVAL; + goto fail_alloc; + } + + max_active = fs_info->thread_pool_size; + + ret = btrfs_init_workqueues(fs_info, fs_devices); + if (ret) { + err = ret; + goto fail_sb_buffer; + } + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + + tree_root->nodesize = nodesize; + tree_root->sectorsize = sectorsize; + tree_root->stripesize = stripesize; + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + + if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + if (sectorsize != PAGE_SIZE) { + printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " + "found on %s\n", (unsigned long)sectorsize, sb->s_id); + goto fail_sb_buffer; + } + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_ERR "BTRFS: failed to read the system " + "array on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + generation = btrfs_super_chunk_root_generation(disk_super); + + __setup_root(nodesize, sectorsize, stripesize, chunk_root, + fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + generation); + if (!chunk_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", + sb->s_id); + goto fail_tree_roots; + } + btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); + chunk_root->commit_root = btrfs_root_node(chunk_root); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); + + ret = btrfs_read_chunk_tree(chunk_root); + if (ret) { + printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", + sb->s_id); + goto fail_tree_roots; + } + + /* + * keep the device that is marked to be the target device for the + * dev_replace procedure + */ + btrfs_close_extra_devices(fs_devices, 0); + + if (!fs_devices->latest_bdev) { + printk(KERN_ERR "BTRFS: failed to read devices on %s\n", + sb->s_id); + goto fail_tree_roots; + } + +retry_root_backup: + generation = btrfs_super_generation(disk_super); + + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super), + generation); + if (!tree_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", + sb->s_id); + + goto recovery_tree_root; + } + + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); + btrfs_set_root_refs(&tree_root->root_item, 1); + + ret = btrfs_read_roots(fs_info, tree_root); + if (ret) + goto recovery_tree_root; + + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + + ret = btrfs_recover_balance(fs_info); + if (ret) { + printk(KERN_ERR "BTRFS: failed to recover balance\n"); + goto fail_block_groups; + } + + ret = btrfs_init_dev_stats(fs_info); + if (ret) { + printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", + ret); + goto fail_block_groups; + } + + ret = btrfs_init_dev_replace(fs_info); + if (ret) { + pr_err("BTRFS: failed to init dev_replace: %d\n", ret); + goto fail_block_groups; + } + + btrfs_close_extra_devices(fs_devices, 1); + + ret = btrfs_sysfs_add_one(fs_info); + if (ret) { + pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + goto fail_sysfs; + } + + ret = btrfs_read_block_groups(fs_info->extent_root); + if (ret) { + printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + goto fail_sysfs; + } + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING "BTRFS: " + "too many missing devices, writeable mount is not allowed\n"); + goto fail_sysfs; + } + + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (IS_ERR(fs_info->cleaner_kthread)) + goto fail_sysfs; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (IS_ERR(fs_info->transaction_kthread)) + goto fail_cleaner; + + if (!btrfs_test_opt(tree_root, SSD) && + !btrfs_test_opt(tree_root, NOSSD) && + !fs_info->fs_devices->rotating) { + printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " + "mode\n"); + btrfs_set_opt(fs_info->mount_opt, SSD); + } + + /* + * Mount does not set all options immediatelly, we can do it now and do + * not have to wait for transaction commit + */ + btrfs_apply_pending_changes(fs_info); + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + ret = btrfsic_mount(tree_root, fs_devices, + btrfs_test_opt(tree_root, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? + 1 : 0, + fs_info->check_integrity_print_mask); + if (ret) + printk(KERN_WARNING "BTRFS: failed to initialize" + " integrity check module %s\n", sb->s_id); + } +#endif + ret = btrfs_read_qgroup_config(fs_info); + if (ret) + goto fail_trans_kthread; + + /* do not make disk changes in broken FS */ + if (btrfs_super_log_root(disk_super) != 0) { + ret = btrfs_replay_log(fs_info, fs_devices); + if (ret) { + err = ret; + goto fail_qgroup; + } + } + + ret = btrfs_find_orphan_roots(tree_root); + if (ret) + goto fail_qgroup; + + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_fs_roots(fs_info); + if (ret) + goto fail_qgroup; + + mutex_lock(&fs_info->cleaner_mutex); + ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + printk(KERN_WARNING + "BTRFS: failed to recover relocation\n"); + err = -EINVAL; + goto fail_qgroup; + } + } + + location.objectid = BTRFS_FS_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (IS_ERR(fs_info->fs_root)) { + err = PTR_ERR(fs_info->fs_root); + goto fail_qgroup; + } + + if (sb->s_flags & MS_RDONLY) + return 0; + + down_read(&fs_info->cleanup_work_sem); + if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || + (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { + up_read(&fs_info->cleanup_work_sem); + close_ctree(tree_root); + return ret; + } + up_read(&fs_info->cleanup_work_sem); + + ret = btrfs_resume_balance_async(fs_info); + if (ret) { + printk(KERN_WARNING "BTRFS: failed to resume balance\n"); + close_ctree(tree_root); + return ret; + } + + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("BTRFS: failed to resume dev_replace\n"); + close_ctree(tree_root); + return ret; + } + + btrfs_qgroup_rescan_resume(fs_info); + + if (!fs_info->uuid_root) { + pr_info("BTRFS: creating UUID tree\n"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to create the UUID tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + fs_info->generation != + btrfs_super_uuid_tree_generation(disk_super)) { + pr_info("BTRFS: checking UUID tree\n"); + ret = btrfs_check_uuid_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to check the UUID tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } else { + fs_info->update_uuid_tree_gen = 1; + } + + fs_info->open = 1; + + return 0; + +fail_qgroup: + btrfs_free_qgroup_config(fs_info); +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); + btrfs_cleanup_transaction(fs_info->tree_root); + btrfs_free_fs_roots(fs_info); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + +fail_sysfs: + btrfs_sysfs_remove_one(fs_info); + +fail_block_groups: + btrfs_put_block_group_cache(fs_info); + btrfs_free_block_groups(fs_info); + +fail_tree_roots: + free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_sb_buffer: + btrfs_stop_all_workers(fs_info); +fail_alloc: +fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + iput(fs_info->btree_inode); +fail_bio_counter: + percpu_counter_destroy(&fs_info->bio_counter); +fail_delalloc_bytes: + percpu_counter_destroy(&fs_info->delalloc_bytes); +fail_dirty_metadata_bytes: + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); +fail_bdi: + bdi_destroy(&fs_info->bdi); +fail_srcu: + cleanup_srcu_struct(&fs_info->subvol_srcu); +fail: + btrfs_free_stripe_hash_table(fs_info); + btrfs_close_devices(fs_info->fs_devices); + return err; + +recovery_tree_root: + if (!btrfs_test_opt(tree_root, RECOVERY)) + goto fail_tree_roots; + + free_root_pointers(fs_info, 0); + + /* don't use the log in recovery mode, it won't be valid */ + btrfs_set_super_log_root(disk_super, 0); + + /* we can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = next_root_backup(fs_info, fs_info->super_copy, + &num_backups_tried, &backup_index); + if (ret == -1) + goto fail_block_groups; + goto retry_root_backup; +} + +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + struct btrfs_device *device = (struct btrfs_device *) + bh->b_private; + + printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " + "I/O error on %s\n", + rcu_str_deref(device->name)); + /* note, we dont' set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); + } + unlock_buffer(bh); + put_bh(bh); +} + +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +{ + struct buffer_head *bh; + struct buffer_head *latest = NULL; + struct btrfs_super_block *super; + int i; + u64 transid = 0; + u64 bytenr; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) + break; + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) + continue; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + btrfs_super_magic(super) != BTRFS_MAGIC) { + brelse(bh); + continue; + } + + if (!latest || btrfs_super_generation(super) > transid) { + brelse(latest); + latest = bh; + transid = btrfs_super_generation(super); + } else { + brelse(bh); + } + } + return latest; +} + +/* + * this should be called twice, once with wait == 0 and + * once with wait == 1. When wait == 0 is done, all the buffer heads + * we write are pinned. + * + * They are released when wait == 1 is done. + * max_mirrors must be the same for both runs, and it indicates how + * many supers on this one device should be written. + * + * max_mirrors == 0 means to write them all. + */ +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, + int do_barriers, int wait, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int ret; + int errors = 0; + u32 crc; + u64 bytenr; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) + break; + + if (wait) { + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) { + errors++; + continue; + } + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the wait == 0 run */ + brelse(bh); + continue; + } else { + btrfs_set_super_bytenr(sb, bytenr); + + crc = ~(u32)0; + crc = btrfs_csum_data((char *)sb + + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - + BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + /* + * one reference for us, and we leave it for the + * caller + */ + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) { + printk(KERN_ERR "BTRFS: couldn't get super " + "buffer head for bytenr %Lu\n", bytenr); + errors++; + continue; + } + + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + + /* one reference for submit_bh */ + get_bh(bh); + + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; + } + + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + if (i == 0) + ret = btrfsic_submit_bh(WRITE_FUA, bh); + else + ret = btrfsic_submit_bh(WRITE_SYNC, bh); + if (ret) + errors++; + } + return errors < i ? 0 : -1; +} + +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk_in_rcu("BTRFS: disabling barriers on dev %s\n", + rcu_str_deref(device->name)); + device->nobarriers = 1; + } else if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL; + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + btrfsic_submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors_send = 0; + int errors_wait = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; + if (!dev->bdev) { + errors_send++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors_send++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; + if (!dev->bdev) { + errors_wait++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors_wait++; + } + if (errors_send > info->num_tolerated_disk_barrier_failures || + errors_wait > info->num_tolerated_disk_barrier_failures) + return -EIO; + return 0; +} + +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ioctl_space_info space; + struct btrfs_space_info *sinfo; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int i; + int c; + int num_tolerated_disk_barrier_failures = + (int)fs_info->fs_devices->num_devices; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + sinfo = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { + if (tmp->flags == types[i]) { + sinfo = tmp; + break; + } + } + rcu_read_unlock(); + + if (!sinfo) + continue; + + down_read(&sinfo->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&sinfo->block_groups[c])) { + u64 flags; + + btrfs_get_block_group_info( + &sinfo->block_groups[c], &space); + if (space.total_bytes == 0 || + space.used_bytes == 0) + continue; + flags = space.flags; + /* + * return + * 0: if dup, single or RAID0 is configured for + * any of metadata, system or data, else + * 1: if RAID5 is configured, or if RAID1 or + * RAID10 is configured and only two mirrors + * are used, else + * 2: if RAID6 is configured, else + * num_mirrors - 1: if RAID1 or RAID10 is + * configured and more than + * 2 mirrors are used. + */ + if (num_tolerated_disk_barrier_failures > 0 && + ((flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID0)) || + ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) + == 0))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1) { + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID10)) { + num_tolerated_disk_barrier_failures = 1; + } else if (flags & + BTRFS_BLOCK_GROUP_RAID6) { + num_tolerated_disk_barrier_failures = 2; + } + } + } + } + up_read(&sinfo->groups_sem); + } + + return num_tolerated_disk_barrier_failures; +} + +static int write_all_supers(struct btrfs_root *root, int max_mirrors) +{ + struct list_head *head; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + do_barriers = !btrfs_test_opt(root, NOBARRIER); + backup_super_roots(root->fs_info); + + sb = root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + + if (do_barriers) { + ret = barrier_all_devices(root->fs_info); + if (ret) { + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); + btrfs_error(root->fs_info, ret, + "errors while submitting device barriers."); + return ret; + } + } + + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, + dev->commit_total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, + dev->commit_bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + btrfs_err(root->fs_info, "%d errors while writing supers", + total_errors); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + /* FUA is masked off if unsupported and can't be the reason */ + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; + } + + total_errors = 0; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + if (ret) + total_errors++; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + if (total_errors > max_errors) { + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; + } + return 0; +} + +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors) +{ + return write_all_supers(root, max_mirrors); +} + +/* Drop a fs root from the radix tree and free it. */ +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (btrfs_root_refs(&root->root_item) == 0) + synchronize_srcu(&fs_info->subvol_srcu); + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + btrfs_free_log(NULL, root); + + if (root->free_ino_pinned) + __btrfs_remove_free_space_cache(root->free_ino_pinned); + if (root->free_ino_ctl) + __btrfs_remove_free_space_cache(root->free_ino_ctl); + free_fs_root(root); +} + +static void free_fs_root(struct btrfs_root *root) +{ + iput(root->ino_cache_inode); + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + if (root->anon_dev) + free_anon_bdev(root->anon_dev); + if (root->subv_writers) + btrfs_free_subvolume_writers(root->subv_writers); + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); + kfree(root->name); + btrfs_put_fs_root(root); +} + +void btrfs_free_fs_root(struct btrfs_root *root) +{ + free_fs_root(root); +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; + int err = 0; + unsigned int ret = 0; + int index; + + while (1) { + index = srcu_read_lock(&fs_info->subvol_srcu); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + break; + } + root_objectid = gang[ret - 1]->root_key.objectid + 1; + + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_fs_root(gang[i]); + } + srcu_read_unlock(&fs_info->subvol_srcu, index); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + break; + btrfs_put_fs_root(gang[i]); + } + root_objectid++; + } + + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_fs_root(gang[i]); + } + return err; +} + +int btrfs_commit_super(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + wake_up_process(root->fs_info->cleaner_kthread); + + /* wait until ongoing cleanup work done */ + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + return btrfs_commit_transaction(trans, root); +} + +void close_ctree(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + fs_info->closing = 1; + smp_mb(); + + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al., set sem back to initial state */ + up(&fs_info->uuid_tree_rescan_sem); + + /* pause restriper - we want to resume on mount */ + btrfs_pause_balance(fs_info); + + btrfs_dev_replace_suspend_for_unmount(fs_info); + + btrfs_scrub_cancel(fs_info); + + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + + /* clear out the rbtree of defraggable inodes */ + btrfs_cleanup_defrag_inodes(fs_info); + + cancel_work_sync(&fs_info->async_reclaim_work); + + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_commit_super(root); + if (ret) + btrfs_err(fs_info, "commit super ret %d", ret); + } + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + btrfs_error_commit_super(root); + + kthread_stop(fs_info->transaction_kthread); + kthread_stop(fs_info->cleaner_kthread); + + fs_info->closing = 2; + smp_mb(); + + btrfs_free_qgroup_config(fs_info); + + if (percpu_counter_sum(&fs_info->delalloc_bytes)) { + btrfs_info(fs_info, "at unmount delalloc count %lld", + percpu_counter_sum(&fs_info->delalloc_bytes)); + } + + btrfs_sysfs_remove_one(fs_info); + + btrfs_free_fs_roots(fs_info); + + btrfs_put_block_group_cache(fs_info); + + btrfs_free_block_groups(fs_info); + + /* + * we must make sure there is not any read request to + * submit after we stopping all workers. + */ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + btrfs_stop_all_workers(fs_info); + + fs_info->open = 0; + free_root_pointers(fs_info, 1); + + iput(fs_info->btree_inode); + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY)) + btrfsic_unmount(root, fs_info->fs_devices); +#endif + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->bio_counter); + bdi_destroy(&fs_info->bdi); + cleanup_srcu_struct(&fs_info->subvol_srcu); + + btrfs_free_stripe_hash_table(fs_info); + + __btrfs_free_block_rsv(root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + + lock_chunks(root); + while (!list_empty(&fs_info->pinned_chunks)) { + struct extent_map *em; + + em = list_first_entry(&fs_info->pinned_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } + unlock_chunks(root); +} + +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic) +{ + int ret; + struct inode *btree_inode = buf->pages[0]->mapping->host; + + ret = extent_buffer_uptodate(buf); + if (!ret) + return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid, atomic); + if (ret == -EAGAIN) + return ret; + return !ret; +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) +{ + return set_extent_buffer_uptodate(buf); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root; + u64 transid = btrfs_header_generation(buf); + int was_dirty; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + /* + * This is a fast path so only do this check if we have sanity tests + * enabled. Normal people shouldn't be marking dummy buffers as dirty + * outside of the sanity tests. + */ + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags))) + return; +#endif + root = BTRFS_I(buf->pages[0]->mapping->host)->root; + btrfs_assert_tree_locked(buf); + if (transid != root->fs_info->generation) + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " + "found %llu running %llu\n", + buf->start, transid, root->fs_info->generation); + was_dirty = set_extent_buffer_dirty(buf); + if (!was_dirty) + __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, + buf->len, + root->fs_info->dirty_metadata_batch); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + btrfs_print_leaf(root, buf); + ASSERT(0); + } +#endif +} + +static void __btrfs_btree_balance_dirty(struct btrfs_root *root, + int flush_delayed) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + int ret; + + if (current->flags & PF_MEMALLOC) + return; + + if (flush_delayed) + btrfs_balance_delayed_items(root); + + ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret > 0) { + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); + } + return; +} + +void btrfs_btree_balance_dirty(struct btrfs_root *root) +{ + __btrfs_btree_balance_dirty(root, 1); +} + +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) +{ + __btrfs_btree_balance_dirty(root, 0); +} + +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +{ + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); +} + +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only) +{ + struct btrfs_super_block *sb = fs_info->super_copy; + int ret = 0; + + if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", + btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n", + btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n", + btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + + /* + * The common minimum, we don't know if we can trust the nodesize/sectorsize + * items yet, they'll be verified later. Issue just a warning. + */ + if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) + printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + btrfs_super_root(sb)); + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) + printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", + btrfs_super_chunk_root(sb)); + if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", + btrfs_super_log_root(sb)); + + /* + * Check the lower bound, the alignment and other constraints are + * checked later. + */ + if (btrfs_super_nodesize(sb) < 4096) { + printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", + btrfs_super_nodesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_sectorsize(sb) < 4096) { + printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", + btrfs_super_sectorsize(sb)); + ret = -EINVAL; + } + + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { + printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", + fs_info->fsid, sb->dev_item.fsid); + ret = -EINVAL; + } + + /* + * Hint to catch really bogus numbers, bitflips or so, more exact checks are + * done later + */ + if (btrfs_super_num_devices(sb) > (1UL << 31)) + printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", + btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + printk(KERN_ERR "BTRFS: number of devices is 0\n"); + ret = -EINVAL; + } + + if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { + printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", + btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + ret = -EINVAL; + } + + /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* + * The generation is a global counter, we'll trust it more than the others + * but it's still possible that it's the one that's wrong. + */ + if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) + printk(KERN_WARNING + "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", + btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); + if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) + && btrfs_super_cache_generation(sb) != (u64)-1) + printk(KERN_WARNING + "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", + btrfs_super_generation(sb), btrfs_super_cache_generation(sb)); + + return ret; +} + +static void btrfs_error_commit_super(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(root); +} + +static void btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&root->ordered_extent_lock); + /* + * This will just short circuit the ordered completion stuff which will + * make sure the ordered extent gets properly cleaned up. + */ + list_for_each_entry(ordered, &root->ordered_extents, + root_extent_list) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + spin_unlock(&root->ordered_extent_lock); +} + +static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + + spin_unlock(&fs_info->ordered_root_lock); + btrfs_destroy_ordered_extents(root); + + cond_resched(); + spin_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (atomic_read(&delayed_refs->num_entries) == 0) { + spin_unlock(&delayed_refs->lock); + btrfs_info(root->fs_info, "delayed_refs has NO entry"); + return ret; + } + + while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; + bool pin_bytes = false; + + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + spin_lock(&delayed_refs->lock); + continue; + } + spin_lock(&head->lock); + while ((node = rb_first(&head->ref_root)) != NULL) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + ref->in_tree = 0; + rb_erase(&ref->rb_node, &head->ref_root); + atomic_dec(&delayed_refs->num_entries); + btrfs_put_delayed_ref(ref); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; + atomic_dec(&delayed_refs->num_entries); + head->node.in_tree = 0; + rb_erase(&head->href_node, &delayed_refs->href_root); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + + if (pin_bytes) + btrfs_pin_extent(root, head->node.bytenr, + head->node.num_bytes, 1); + btrfs_put_delayed_ref(&head->node); + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + + return ret; +} + +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_first_entry(&splice, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &btrfs_inode->runtime_flags); + spin_unlock(&root->delalloc_lock); + + btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->delalloc_lock); + } + + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + list_del_init(&root->delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + spin_unlock(&fs_info->delalloc_root_lock); + + btrfs_destroy_delalloc_inodes(root); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) +{ + int ret; + struct extent_buffer *eb; + u64 start = 0; + u64 end; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + eb = btrfs_find_tree_block(root->fs_info, start); + start += root->nodesize; + if (!eb) + continue; + wait_on_extent_buffer_writeback(eb); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags)) + clear_extent_buffer_dirty(eb); + free_extent_buffer_stale(eb); + } + } + + return ret; +} + +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + bool loop = true; + + unpin = pinned_extents; +again: + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, NULL); + if (ret) + break; + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + if (loop) { + if (unpin == &root->fs_info->freed_extents[0]) + unpin = &root->fs_info->freed_extents[1]; + else + unpin = &root->fs_info->freed_extents[0]; + loop = false; + goto again; + } + + return 0; +} + +static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, + struct btrfs_root *root) +{ + btrfs_destroy_delayed_refs(cur_trans, root); + + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&root->fs_info->transaction_blocked_wait); + + cur_trans->state = TRANS_STATE_UNBLOCKED; + wake_up(&root->fs_info->transaction_wait); + + btrfs_free_pending_ordered(cur_trans, root->fs_info); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + + btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, + EXTENT_DIRTY); + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + cur_trans->state =TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); + + /* + memset(cur_trans, 0, sizeof(*cur_trans)); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + */ +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + spin_lock(&root->fs_info->trans_lock); + while (!list_empty(&root->fs_info->trans_list)) { + t = list_first_entry(&root->fs_info->trans_list, + struct btrfs_transaction, list); + if (t->state >= TRANS_STATE_COMMIT_START) { + atomic_inc(&t->use_count); + spin_unlock(&root->fs_info->trans_lock); + btrfs_wait_for_commit(root, t->transid); + btrfs_put_transaction(t); + spin_lock(&root->fs_info->trans_lock); + continue; + } + if (t == root->fs_info->running_transaction) { + t->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&root->fs_info->trans_lock); + /* + * We wait for 0 num_writers since we don't hold a trans + * handle open currently for this transaction. + */ + wait_event(t->writer_wait, + atomic_read(&t->num_writers) == 0); + } else { + spin_unlock(&root->fs_info->trans_lock); + } + btrfs_cleanup_one_transaction(t, root); + + spin_lock(&root->fs_info->trans_lock); + if (t == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; + list_del_init(&t->list); + spin_unlock(&root->fs_info->trans_lock); + + btrfs_put_transaction(t); + trace_btrfs_transaction_commit(root); + spin_lock(&root->fs_info->trans_lock); + } + spin_unlock(&root->fs_info->trans_lock); + btrfs_destroy_all_ordered_extents(root->fs_info); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + btrfs_destroy_all_delalloc_inodes(root->fs_info); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + return 0; +} + +static const struct extent_io_ops btree_extent_io_ops = { + .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, +}; diff --git a/kernel/fs/btrfs/disk-io.h b/kernel/fs/btrfs/disk-io.h new file mode 100644 index 000000000..d4cbfeeee --- /dev/null +++ b/kernel/fs/btrfs/disk-io.h @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DISKIO__ +#define __DISKIO__ + +#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_SIZE 4096 + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +enum btrfs_wq_endio_type { + BTRFS_WQ_ENDIO_DATA = 0, + BTRFS_WQ_ENDIO_METADATA = 1, + BTRFS_WQ_ENDIO_FREE_SPACE = 2, + BTRFS_WQ_ENDIO_RAID56 = 3, + BTRFS_WQ_ENDIO_DIO_REPAIR = 4, +}; + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = 16 * 1024; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u64 parent_transid); +void readahead_tree_block(struct btrfs_root *root, u64 bytenr); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, + int mirror_num, struct extent_buffer **eb); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr); +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, struct extent_buffer *buf); +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +void close_ctree(struct btrfs_root *root); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors); +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_commit_super(struct btrfs_root *root); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location); +int btrfs_init_fs_root(struct btrfs_root *root); +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); + +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key, + bool check_ref); +static inline struct btrfs_root * +btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + return btrfs_get_fs_root(fs_info, location, true); +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +void btrfs_btree_balance_dirty(struct btrfs_root *root); +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +void btrfs_free_fs_root(struct btrfs_root *root); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_root *btrfs_alloc_dummy_root(void); +#endif + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info->subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(&root->refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->refs)) + kfree(root); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +u32 btrfs_csum_data(char *data, u32 seed, size_t len); +void btrfs_csum_final(u32 crc, char *result); +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + enum btrfs_wq_endio_type metadata); +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, + struct btrfs_root *root); +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid); +int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)); +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info); +int __init btrfs_end_io_wq_init(void); +void btrfs_end_io_wq_exit(void); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_init_lockdep(void); +void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level); +#else +static inline void btrfs_init_lockdep(void) +{ } +static inline void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level) +{ +} +#endif +#endif diff --git a/kernel/fs/btrfs/export.c b/kernel/fs/btrfs/export.c new file mode 100644 index 000000000..8d052209f --- /dev/null +++ b/kernel/fs/btrfs/export.c @@ -0,0 +1,304 @@ +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + int len = *max_len; + int type; + + if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; + return FILEID_INVALID; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return FILEID_INVALID; + } + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = btrfs_ino(inode); + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (parent) { + u64 parent_root_id; + + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root; + struct inode *inode; + struct btrfs_key key; + int index; + int err = 0; + + if (objectid < BTRFS_FIRST_FREE_OBJECTID) + return ERR_PTR(-ESTALE); + + key.objectid = root_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + if (check_generation && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return d_obtain_alias(inode); +fail: + srcu_read_unlock(&fs_info->subvol_srcu, index); + return ERR_PTR(err); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = d_inode(child); + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_root_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = btrfs_ino(dir); + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + + BUG_ON(ret == 0); /* Key with offset of -1 found */ + if (path->slots[0] == 0) { + ret = -ENOENT; + goto fail; + } + + path->slots[0]--; + leaf = path->nodes[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != key.objectid || found_key.type != key.type) { + ret = -ENOENT; + goto fail; + } + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + key.objectid = btrfs_root_ref_dirid(leaf, ref); + } else { + key.objectid = found_key.offset; + } + btrfs_free_path(path); + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + return btrfs_get_dentry(root->fs_info->sb, key.objectid, + found_key.offset, 0, 0); + } + + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); +fail: + btrfs_free_path(path); + return ERR_PTR(ret); +} + +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + u64 ino; + + if (!dir || !inode) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = ino; + key.offset = btrfs_ino(dir); + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, +}; diff --git a/kernel/fs/btrfs/export.h b/kernel/fs/btrfs/export.h new file mode 100644 index 000000000..074348a95 --- /dev/null +++ b/kernel/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/kernel/fs/btrfs/extent-tree.c b/kernel/fs/btrfs/extent-tree.c new file mode 100644 index 000000000..0ec3acd14 --- /dev/null +++ b/kernel/fs/btrfs/extent-tree.c @@ -0,0 +1,10174 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hash.h" +#include "tree-log.h" +#include "disk-io.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "locking.h" +#include "free-space-cache.h" +#include "math.h" +#include "sysfs.h" +#include "qgroup.h" + +#undef SCRAMBLE_DELAYED_REFS + +/* + * control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + */ +enum { + CHUNK_ALLOC_NO_FORCE = 0, + CHUNK_ALLOC_LIMITED = 1, + CHUNK_ALLOC_FORCE = 2, +}; + +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc); +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op, + int no_quota); +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei); +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod); +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins, + int no_quota); +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 flags, + int force); +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, + int delalloc); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved); + +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; +} + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + kfree(cache->free_space_ctl); + kfree(cache); + } +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) { + btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +static int add_excluded_extent(struct btrfs_root *root, + u64 start, u64 num_bytes) +{ + u64 end = start + num_bytes - 1; + set_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + set_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); + return 0; +} + +static void free_excluded_extents(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 start, end; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + + clear_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + clear_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); +} + +static int exclude_super_stripes(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, cache->key.objectid, + stripe_len); + if (ret) + return ret; + } + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + if (ret) + return ret; + + while (nr--) { + u64 start, len; + + if (logical[nr] > cache->key.objectid + + cache->key.offset) + continue; + + if (logical[nr] + stripe_len <= cache->key.objectid) + continue; + + start = logical[nr]; + if (start < cache->key.objectid) { + start = cache->key.objectid; + len = (logical[nr] + stripe_len) - start; + } else { + len = min_t(u64, stripe_len, + cache->key.objectid + + cache->key.offset - start); + } + + cache->bytes_super += len; + ret = add_excluded_extent(root, start, len); + if (ret) { + kfree(logical); + return ret; + } + } + + kfree(logical); + } + return 0; +} + +static struct btrfs_caching_control * +get_caching_control(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +static void put_caching_control(struct btrfs_caching_control *ctl) +{ + if (atomic_dec_and_test(&ctl->count)) + kfree(ctl); +} + +/* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. + */ +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size, total_added = 0; + int ret; + + while (start < end) { + ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); + if (ret) + break; + + if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); /* -ENOMEM or logic error */ + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); /* -ENOMEM or logic error */ + } + + return total_added; +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 total_found = 0; + u64 last = 0; + u32 nritems; + int ret = -ENOMEM; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + + path = btrfs_alloc_path(); + if (!path) + goto out; + + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 1; + + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; +again: + mutex_lock(&caching_ctl->mutex); + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->commit_root_sem); + +next: + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto err; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (btrfs_fs_closing(fs_info) > 1) { + last = (u64)-1; + break; + } + + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = find_next_key(path, 0, &key); + if (ret) + break; + + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + caching_ctl->progress = last; + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + goto again; + } + + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto err; + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + + caching_ctl->progress = last; + btrfs_release_path(path); + goto next; + } + + if (key.objectid < block_group->key.objectid) { + path->slots[0]++; + continue; + } + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); + if (key.type == BTRFS_METADATA_ITEM_KEY) + last = key.objectid + + fs_info->tree_root->nodesize; + else + last = key.objectid + key.offset; + + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + } + path->slots[0]++; + } + ret = 0; + + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); + caching_ctl->progress = (u64)-1; + + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + +err: + btrfs_free_path(path); + up_read(&fs_info->commit_root_sem); + + free_excluded_extents(extent_root, block_group); + + mutex_unlock(&caching_ctl->mutex); +out: + if (ret) { + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_ERROR; + spin_unlock(&block_group->lock); + } + wake_up(&caching_ctl->wait); + + put_caching_control(caching_ctl); + btrfs_put_block_group(block_group); +} + +static int cache_block_group(struct btrfs_block_group_cache *cache, + int load_cache_only) +{ + DEFINE_WAIT(wait); + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; + int ret = 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + if (!caching_ctl) + return -ENOMEM; + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, + caching_thread, NULL, NULL); + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); + return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); + + if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + mutex_lock(&caching_ctl->mutex); + ret = load_free_space_cache(fs_info, cache); + + spin_lock(&cache->lock); + if (ret == 1) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_FINISHED; + cache->last_byte_to_unpin = (u64)-1; + caching_ctl->progress = (u64)-1; + } else { + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; + } + } + spin_unlock(&cache->lock); + mutex_unlock(&caching_ctl->mutex); + + wake_up(&caching_ctl->wait); + if (ret == 1) { + put_caching_control(caching_ctl); + free_excluded_extents(fs_info->extent_root, cache); + return 0; + } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); + } + + if (load_cache_only) { + put_caching_control(caching_ctl); + return 0; + } + + down_write(&fs_info->commit_root_sem); + atomic_inc(&caching_ctl->count); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->commit_root_sem); + + btrfs_get_block_group(cache); + + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); + + return ret; +} + +/* + * return the block group that starts at or after bytenr + */ +static struct btrfs_block_group_cache * +btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 0); + + return cache; +} + +/* + * return the block group that contains the given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 1); + + return cache; +} + +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & flags) { + rcu_read_unlock(); + return found; + } + } + rcu_read_unlock(); + return NULL; +} + +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. + */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + +/* simple helper to search for an existing data extent at a given offset */ +int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = start; + key.offset = len; + key.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * helper function to lookup reference count and flags of a tree block. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + /* + * If we don't have skinny metadata, don't bother doing anything + * different + */ + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { + offset = root->nodesize; + metadata = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == root->nodesize) + ret = 0; + } + } + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and try + * again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto search_again; + } + spin_lock(&head->lock); + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->node.ref_mod; + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); +out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * There are two kinds of back refs. The implicit back refs is optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. These information allow us to find the block by + * b-tree searching. The full back refs is for pointers in tree blocks not + * referenced by their owner trees. The location of tree block is recorded + * in the back refs. Actually the full back refs is generic, and can be + * used in all cases the implicit back refs is used. The major shortcoming + * of the full back refs is its overhead. Every time a tree block gets + * COWed, we have to update back refs entry for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in old transaction, the + * only way to drop a reference to it is COW it. So we can detect the + * event that tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COW'd through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs is used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointers in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs is used for + * pointers in the block. Add full back refs for every pointers in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are entailed to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * The key type is used to differentiate between types of back refs. + * There are different meanings of the key offset for different types + * of back refs. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure for the implicit back refs has fields for: + * + * - Objectid of the subvolume root + * - objectid of the file holding the reference + * - original offset in the file + * - how many bookend extents + * + * The key offset for the implicit back refs is hash of the first + * three fields. + * + * The extent ref structure for the full back refs has field for: + * + * - number of pointers in the tree leaf + * + * The key offset for the implicit back refs is the first byte of + * the tree leaf + * + * When a file extent is allocated, The implicit back refs is used. + * the fields are filled in: + * + * (root_key.objectid, inode objectid, offset in file, 1) + * + * When a file extent is removed file truncation, we find the + * corresponding implicit back refs and check the following fields: + * + * (btrfs_header_owner(leaf), inode objectid, offset in file) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * + * Both the implicit back refs and the full back refs for tree blocks + * only consist of key. The key offset for the implicit back refs is + * objectid of block's owner tree. The key offset for the full back refs + * is the first byte of parent block. + * + * When implicit back refs is used, information about the lowest key and + * level of the tree block are required. These information are stored in + * tree block info structure. + */ + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int convert_extent_item_v0(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 owner, u32 extra_size) +{ + struct btrfs_extent_item *item; + struct btrfs_extent_item_v0 *ei0; + struct btrfs_extent_ref_v0 *ref0; + struct btrfs_tree_block_info *bi; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + u32 new_size = sizeof(*item); + u64 refs; + int ret; + + leaf = path->nodes[0]; + BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + refs = btrfs_extent_refs_v0(leaf, ei0); + + if (owner == (u64)-1) { + while (1) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); /* Corruption */ + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0]); + BUG_ON(key.objectid != found_key.objectid); + if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { + path->slots[0]++; + continue; + } + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + owner = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + } + btrfs_release_path(path); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) + new_size += sizeof(*bi); + + new_size -= sizeof(*ei0); + ret = btrfs_search_slot(trans, root, &key, path, + new_size + extra_size, 1); + if (ret < 0) + return ret; + BUG_ON(ret); /* Corruption */ + + btrfs_extend_item(root, path, new_size); + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, refs); + /* FIXME: get real generation */ + btrfs_set_extent_generation(leaf, item, 0); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_set_extent_flags(leaf, item, + BTRFS_EXTENT_FLAG_TREE_BLOCK | + BTRFS_BLOCK_FLAG_FULL_BACKREF); + bi = (struct btrfs_tree_block_info *)(item + 1); + /* FIXME: get first key of the block */ + memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); + btrfs_set_tree_block_level(leaf, bi, (int)owner); + } else { + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} +#endif + +static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +{ + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; + + lenum = cpu_to_le64(root_objectid); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + + return ((u64)high_crc << 31) ^ (u64)low_crc; +} + +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} + +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + return 0; + return 1; +} + +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; + struct extent_buffer *leaf; + u32 nritems; + int ret; + int recow; + int err = -ENOENT; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + + if (parent) { + if (!ret) + return 0; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + key.type = BTRFS_EXTENT_REF_V0_KEY; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (!ret) + return 0; +#endif + goto fail; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; + + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; + } +fail: + return err; +} + +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u32 size; + u32 num_refs; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + } + } + btrfs_mark_buffer_dirty(leaf); + ret = 0; +fail: + btrfs_release_path(path); + return ret; +} + +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop, int *last_ref) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; + struct extent_buffer *leaf; + u32 num_refs = 0; + int ret = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + BUG(); + } + + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; + + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + *last_ref = 1; + } else { + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + else { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + btrfs_set_ref_count_v0(leaf, ref0, num_refs); + } +#endif + btrfs_mark_buffer_dirty(leaf); + } + return ret; +} + +static noinline u32 extent_data_ref_count(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (iref) { + if (btrfs_extent_inline_ref_type(leaf, iref) == + BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + WARN_ON(1); + } + return num_refs; +} + +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ret == -ENOENT && parent) { + btrfs_release_path(path); + key.type = BTRFS_EXTENT_REF_V0_KEY; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + } +#endif + return ret; +} + +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + btrfs_release_path(path); + return ret; +} + +static inline int extent_ref_type(u64 parent, u64 owner) +{ + int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + } + return type; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; +} + +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. + */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + path->keep_locks = 1; + } else + extra_size = -1; + + /* + * Owner is our parent level, so we can just add one to get the level + * for the block we are interested in. + */ + if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner; + } + +again: + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + if (ret < 0) { + err = ret; + goto out; + } + + /* + * We may be a newly converted file system which still has the old fat + * extent entries for metadata, so try and see if we have one of those. + */ + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + goto again; + } + } + + if (ret && !insert) { + err = -ENOENT; + goto out; + } else if (WARN_ON(ret)) { + err = -EIO; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + if (!insert) { + err = -ENOENT; + goto out; + } + ret = convert_extent_item_v0(trans, root, path, owner, + extra_size); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } + + err = -ENOENT; + while (1) { + if (ptr >= end) { + WARN_ON(ptr > end); + break; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + if (want < type) + break; + if (want > type) { + ptr += btrfs_extent_inline_ref_size(type); + continue; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (match_extent_data_ref(leaf, dref, root_objectid, + owner, offset)) { + err = 0; + break; + } + if (hash_extent_data_ref_item(leaf, dref) < + hash_extent_data_ref(root_objectid, owner, offset)) + break; + } else { + u64 ref_offset; + ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (parent > 0) { + if (parent == ref_offset) { + err = 0; + break; + } + if (ref_offset < parent) + break; + } else { + if (root_objectid == ref_offset) { + err = 0; + break; + } + if (ref_offset < root_objectid) + break; + } + } + ptr += btrfs_extent_inline_ref_size(type); + } + if (err == -ENOENT && insert) { + if (item_size + extra_size >= + BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } + /* + * To add new inline back ref, we have to make sure + * there is no corresponding back ref item. + * For simplicity, we just do not add new inline back + * ref if there is any kind of item for this block + */ + if (find_next_key(path, 0, &key) == 0 && + key.objectid == bytenr && + key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + err = -EAGAIN; + goto out; + } + } + *ref_ret = (struct btrfs_extent_inline_ref *)ptr; +out: + if (insert) { + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + } + return err; +} + +/* + * helper to add new inline back ref + */ +static noinline_for_stack +void setup_inline_extent_backref(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + unsigned long ptr; + unsigned long end; + unsigned long item_offset; + u64 refs; + int size; + int type; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + item_offset = (unsigned long)iref - (unsigned long)ei; + + type = extent_ref_type(parent, owner); + size = btrfs_extent_inline_ref_size(type); + + btrfs_extend_item(root, path, size); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + refs += refs_to_add; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; + end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); + + iref = (struct btrfs_extent_inline_ref *)ptr; + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, dref, owner); + btrfs_set_extent_data_ref_offset(leaf, dref, offset); + btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + sref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_mark_buffer_dirty(leaf); +} + +static int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, ref_ret, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 0); + if (ret != -ENOENT) + return ret; + + btrfs_release_path(path); + *ref_ret = NULL; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, + root_objectid, owner, offset); + } + return ret; +} + +/* + * helper to update/remove inline back ref + */ +static noinline_for_stack +void update_inline_extent_backref(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + u64 refs; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + type = btrfs_extent_inline_ref_type(leaf, iref); + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); + } else { + refs = 1; + BUG_ON(refs_to_mod != -1); + } + + BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + refs += refs_to_mod; + + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + *last_ref = 1; + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + btrfs_truncate_item(root, path, item_size, 1); + } + btrfs_mark_buffer_dirty(leaf); +} + +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 1); + if (ret == 0) { + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + update_inline_extent_backref(root, path, iref, + refs_to_add, extent_op, NULL); + } else if (ret == -ENOENT) { + setup_inline_extent_backref(root, path, iref, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + ret = 0; + } + return ret; +} + +static int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add) +{ + int ret; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, root, path, bytenr, + parent, root_objectid); + } else { + ret = insert_extent_data_ref(trans, root, path, bytenr, + parent, root_objectid, + owner, offset, refs_to_add); + } + return ret; +} + +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data, int *last_ref) +{ + int ret = 0; + + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) { + update_inline_extent_backref(root, path, iref, + -refs_to_drop, NULL, last_ref); + } else if (is_data) { + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + last_ref); + } else { + *last_ref = 1; + ret = btrfs_del_item(trans, root, path); + } + return ret; +} + +static int btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); +} + +int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) +{ + int ret; + u64 discarded_bytes = 0; + struct btrfs_bio *bbio = NULL; + + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, + bytenr, &num_bytes, &bbio, 0); + /* Error condition is -ENOMEM */ + if (!ret) { + struct btrfs_bio_stripe *stripe = bbio->stripes; + int i; + + + for (i = 0; i < bbio->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + + ret = btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + stripe->length); + if (!ret) + discarded_bytes += stripe->length; + else if (ret != -EOPNOTSUPP) + break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. + */ + ret = 0; + } + btrfs_put_bbio(bbio); + } + + if (actual_bytes) + *actual_bytes = discarded_bytes; + + + if (ret == -EOPNOTSUPP) + ret = 0; + return ret; +} + +/* Can return -ENOMEM */ +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset, + int no_quota) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && + root_objectid == BTRFS_TREE_LOG_OBJECTID); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL, no_quota); + } else { + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_REF, NULL, no_quota); + } + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + int no_quota, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_item *item; + struct btrfs_key key; + u64 refs; + int ret; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) + no_quota = 1; + + path->reada = 1; + path->leave_spinning = 1; + /* this will setup the path even if it fails to insert the back ref */ + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, + bytenr, num_bytes, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) + goto out; + /* + * Ok we were able to insert an inline extent and it appears to be a new + * reference, deal with the qgroup accounting. + */ + if (!ret && !no_quota) { + ASSERT(root->fs_info->quota_enabled); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) + type = BTRFS_QGROUP_OPER_ADD_SHARED; + btrfs_release_path(path); + + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + goto out; + } + + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. + */ + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, item); + if (refs) + type = BTRFS_QGROUP_OPER_ADD_SHARED; + btrfs_set_extent_refs(leaf, item, refs + refs_to_add); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, item); + + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + if (ret) + goto out; + } + + path->reada = 1; + path->leave_spinning = 1; + /* now insert the actual backref */ + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, root_objectid, + owner, offset, refs_to_add); + if (ret) + btrfs_abort_transaction(trans, root, ret); +out: + btrfs_free_path(path); + return ret; +} + +static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_data_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + u64 flags = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_data_ref(node); + trace_run_delayed_data_ref(node, ref, node->action); + + if (node->type == BTRFS_SHARED_DATA_REF_KEY) + parent = ref->parent; + ref_root = ref->root; + + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) + flags |= extent_op->flags_to_set; + ret = alloc_reserved_file_extent(trans, root, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + node->no_quota, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op, node->no_quota); + } else { + BUG(); + } + return ret; +} + +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } + + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); + } +} + +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; + int ret; + int err = 0; + int metadata = !extent_op->is_data; + + if (trans->aborted) + return 0; + + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + metadata = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = node->bytenr; + + if (metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = extent_op->level; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + } + +again: + path->reada = 1; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (metadata) { + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == node->bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == node->num_bytes) + ret = 0; + } + if (ret > 0) { + btrfs_release_path(path); + metadata = 0; + + key.objectid = node->bytenr; + key.offset = node->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } + } else { + err = -EIO; + goto out; + } + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + ret = convert_extent_item_v0(trans, root->fs_info->extent_root, + path, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_tree_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + ref = btrfs_delayed_node_to_tree_ref(node); + trace_run_delayed_tree_ref(node, ref, node->action); + + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + ref_root = ref->root; + + ins.objectid = node->bytenr; + if (skinny_metadata) { + ins.offset = ref->level; + ins.type = BTRFS_METADATA_ITEM_KEY; + } else { + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + } + + BUG_ON(node->ref_mod != 1); + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags); + ret = alloc_reserved_tree_block(trans, root, + parent, ref_root, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins, + node->no_quota); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, node->no_quota, + extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op, + node->no_quota); + } else { + BUG(); + } + return ret; +} + +/* helper function to actually process a single delayed ref entry */ +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + + if (trans->aborted) { + if (insert_reserved) + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); + return 0; + } + + if (btrfs_delayed_ref_is_head(node)) { + struct btrfs_delayed_ref_head *head; + /* + * we've hit the end of the chain and we were supposed + * to insert this extent into the tree. But, it got + * deleted before we ever needed to insert it, so all + * we have to do is clean up the accounting + */ + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + trace_run_delayed_ref_head(node, head, node->action); + + if (insert_reserved) { + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + } + } + return ret; + } + + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, root, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, root, node, extent_op, + insert_reserved); + else + BUG(); + return ret; +} + +static noinline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct rb_node *node; + struct btrfs_delayed_ref_node *ref, *last = NULL;; + + /* + * select delayed ref of type BTRFS_ADD_DELAYED_REF first. + * this prevents ref count from going down to zero when + * there still are pending delayed ref. + */ + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->action == BTRFS_ADD_DELAYED_REF) + return ref; + else if (last == NULL) + last = ref; + node = rb_next(node); + } + return last; +} + +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *locked_ref = NULL; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); + int ret; + unsigned long count = 0; + unsigned long actual_count = 0; + int must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + while (1) { + if (!locked_ref) { + if (count >= nr) + break; + + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } + + /* grab the lock that says we are going to process + * all the refs for this head */ + ret = btrfs_delayed_ref_lock(trans, locked_ref); + spin_unlock(&delayed_refs->lock); + /* + * we may have dropped the spin lock to get the head + * mutex lock, and that might have given someone else + * time to free the head. If that's true, it has been + * removed from our list and we can move on. + */ + if (ret == -EAGAIN) { + locked_ref = NULL; + count++; + continue; + } + } + + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + */ + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); + + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + + if (ref && ref->seq && + btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { + spin_unlock(&locked_ref->lock); + btrfs_delayed_ref_unlock(locked_ref); + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + locked_ref = NULL; + cond_resched(); + count++; + continue; + } + + /* + * record the must insert reserved flag before we + * drop the spin lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + + if (!ref) { + + + /* All delayed refs have been processed, Go ahead + * and send the head node to run_one_delayed_ref, + * so that any accounting fixes can happen + */ + ref = &locked_ref->node; + + if (extent_op && must_insert_reserved) { + btrfs_free_delayed_extent_op(extent_op); + extent_op = NULL; + } + + if (extent_op) { + spin_unlock(&locked_ref->lock); + ret = run_delayed_extent_op(trans, root, + ref, extent_op); + btrfs_free_delayed_extent_op(extent_op); + + if (ret) { + /* + * Need to reset must_insert_reserved if + * there was an error so the abort stuff + * can cleanup the reserved space + * properly. + */ + if (must_insert_reserved) + locked_ref->must_insert_reserved = 1; + locked_ref->processing = 0; + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + btrfs_delayed_ref_unlock(locked_ref); + return ret; + } + continue; + } + + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. + */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (rb_first(&locked_ref->ref_root) || + locked_ref->extent_op) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; + } + ref->in_tree = 0; + delayed_refs->num_heads--; + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->rb_node, &locked_ref->ref_root); + } + atomic_dec(&delayed_refs->num_entries); + + if (!btrfs_delayed_ref_is_head(ref)) { + /* + * when we play the delayed ref, also correct the + * ref_mod on head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->node.ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->node.ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); + } + } + spin_unlock(&locked_ref->lock); + + ret = run_one_delayed_ref(trans, root, ref, extent_op, + must_insert_reserved); + + btrfs_free_delayed_extent_op(extent_op); + if (ret) { + locked_ref->processing = 0; + btrfs_delayed_ref_unlock(locked_ref); + btrfs_put_delayed_ref(ref); + btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); + return ret; + } + + /* + * If this node is a head, that means all the refs in this head + * have been dealt with, and we will pick the next head to deal + * with, so we must unlock the head and drop it from the cluster + * list before we release it. + */ + if (btrfs_delayed_ref_is_head(ref)) { + if (locked_ref->is_data && + locked_ref->total_ref_mod < 0) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= ref->num_bytes; + spin_unlock(&delayed_refs->lock); + } + btrfs_delayed_ref_unlock(locked_ref); + locked_ref = NULL; + } + btrfs_put_delayed_ref(ref); + count++; + cond_resched(); + } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. + */ + spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ + spin_unlock(&delayed_refs->lock); + } + return 0; +} + +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; + + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; + } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + middle = entry->bytenr; + + if (alt) + n = n->rb_left; + else + n = n->rb_right; + + alt = 1 - alt; + } + return middle; +} +#endif + +static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) +{ + u64 num_bytes; + + num_bytes = heads * (sizeof(struct btrfs_extent_item) + + sizeof(struct btrfs_extent_inline_ref)); + if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + num_bytes += heads * sizeof(struct btrfs_tree_block_info); + + /* + * We don't ever fill up leaves all the way so multiply by 2 just to be + * closer to what we're really going to want to ouse. + */ + return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +/* + * Takes the number of bytes to be csumm'ed and figures out how many leaves it + * would require to store the csums for that many bytes. + */ +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) +{ + u64 csum_size; + u64 num_csums_per_leaf; + u64 num_csums; + + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = div64_u64(csum_size, + (u64)btrfs_super_csum_size(root->fs_info->super_copy)); + num_csums = div64_u64(csum_bytes, root->sectorsize); + num_csums += num_csums_per_leaf - 1; + num_csums = div64_u64(num_csums, num_csums_per_leaf); + return num_csums; +} + +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *global_rsv; + u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; + u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; + u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + u64 num_bytes, num_dirty_bgs_bytes; + int ret = 0; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_heads = heads_to_leaves(root, num_heads); + if (num_heads > 1) + num_bytes += (num_heads - 1) * root->nodesize; + num_bytes <<= 1; + num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize; + num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root, + num_dirty_bgs); + global_rsv = &root->fs_info->global_block_rsv; + + /* + * If we can't allocate any more chunks lets make sure we have _lots_ of + * wiggle room since running delayed refs can create more delayed refs. + */ + if (global_rsv->space_info->full) { + num_dirty_bgs_bytes <<= 1; + num_bytes <<= 1; + } + + spin_lock(&global_rsv->lock); + if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) + ret = 1; + spin_unlock(&global_rsv->lock); + return ret; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + +struct async_delayed_refs { + struct btrfs_root *root; + int count; + int error; + int sync; + struct completion wait; + struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ + struct async_delayed_refs *async; + struct btrfs_trans_handle *trans; + int ret; + + async = container_of(work, struct async_delayed_refs, work); + + trans = btrfs_join_transaction(async->root); + if (IS_ERR(trans)) { + async->error = PTR_ERR(trans); + goto done; + } + + /* + * trans->sync means that when we call end_transaciton, we won't + * wait on delayed refs + */ + trans->sync = true; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); + if (ret) + async->error = ret; + + ret = btrfs_end_transaction(trans, async->root); + if (ret && !async->error) + async->error = ret; +done: + if (async->sync) + complete(&async->wait); + else + kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait) +{ + struct async_delayed_refs *async; + int ret; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->root = root->fs_info->tree_root; + async->count = count; + async->error = 0; + if (wait) + async->sync = 1; + else + async->sync = 0; + init_completion(&async->wait); + + btrfs_init_work(&async->work, btrfs_extent_refs_helper, + delayed_ref_async_start, NULL, NULL); + + btrfs_queue_work(root->fs_info->extent_workers, &async->work); + + if (wait) { + wait_for_completion(&async->wait); + ret = async->error; + kfree(async); + return ret; + } + return 0; +} + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. + * + * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction + */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; + int ret; + int run_all = count == (unsigned long)-1; + + /* We'll clean this up in btrfs_cleanup_transaction */ + if (trans->aborted) + return 0; + + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; + + delayed_refs = &trans->transaction->delayed_refs; + if (count == 0) + count = atomic_read(&delayed_refs->num_entries) * 2; + +again: +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + + if (run_all) { + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + spin_lock(&delayed_refs->lock); + node = rb_first(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); + goto out; + } + count = (unsigned long)-1; + + while (node) { + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; + + ref = &head->node; + atomic_inc(&ref->refs); + + spin_unlock(&delayed_refs->lock); + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + + btrfs_put_delayed_ref(ref); + cond_resched(); + goto again; + } else { + WARN_ON(1); + } + node = rb_next(node); + } + spin_unlock(&delayed_refs->lock); + cond_resched(); + goto again; + } +out: + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); + if (ret) + return ret; + assert_qgroups_uptodate(trans); + return 0; +} + +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int level, int is_data) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = 1; + extent_op->update_key = 0; + extent_op->is_data = is_data ? 1 : 0; + extent_op->level = level; + + ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, + num_bytes, extent_op); + if (ret) + btrfs_free_delayed_extent_op(extent_op); + return ret; +} + +static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } + + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and let + * caller try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + spin_unlock(&delayed_refs->lock); + + spin_lock(&head->lock); + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } + + data_ref = btrfs_delayed_node_to_data_ref(ref); + + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } + } + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + return ret; +} + +static noinline int check_committed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u32 item_size; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); /* Corruption */ + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + ret = 1; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + goto out; + } +#endif + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; + + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + if (btrfs_extent_inline_ref_type(leaf, iref) != + BTRFS_EXTENT_DATA_REF_KEY) + goto out; + + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + + ret = 0; +out: + return ret; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_path *path; + int ret; + int ret2; + + path = btrfs_alloc_path(); + if (!path) + return -ENOENT; + + do { + ret = check_committed_ref(trans, root, path, objectid, + offset, bytenr); + if (ret && ret != -ENOENT) + goto out; + + ret2 = check_delayed_ref(trans, root, path, objectid, + offset, bytenr); + } while (ret2 == -EAGAIN); + + if (ret2 && ret2 != -ENOENT) { + ret = ret2; + goto out; + } + + if (ret != -ENOENT || ret2 != -ENOENT) + ret = 0; +out: + btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); + return ret; +} + +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + int full_backref, int inc) +{ + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 ref_root; + u32 nritems; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int level; + int ret = 0; + int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, + u64, u64, u64, u64, u64, u64, int); + + + if (btrfs_test_is_dummy_root(root)) + return 0; + + ref_root = btrfs_header_owner(buf); + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) + return 0; + + if (inc) + process_func = btrfs_inc_extent_ref; + else + process_func = btrfs_free_extent; + + if (full_backref) + parent = buf->start; + else + parent = 0; + + for (i = 0; i < nritems; i++) { + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, key.objectid, + key.offset, 1); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = root->nodesize; + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, level - 1, 0, + 1); + if (ret) + goto fail; + } + } + return 0; +fail: + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} + +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + int ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto fail; + } + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); +fail: + btrfs_release_path(path); + return ret; + +} + +static struct btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct rb_node *node; + + spin_lock(&root->fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->key.objectid + cache->key.offset; + + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + cache = btrfs_lookup_first_block_group(root->fs_info, + next_bytenr); + return cache; + } + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + btrfs_get_block_group(cache); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); + return cache; +} + +static int cache_save_setup(struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_root *root = block_group->fs_info->tree_root; + struct inode *inode = NULL; + u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; + u64 num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->key.offset < (100 * 1024 * 1024)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + + if (trans->aborted) + return 0; +again: + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(root, trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(root, + &root->fs_info->global_block_rsv); + if (ret) + goto out_put; + + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(root, SPACE_CACHE)) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option. + */ + dcs = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); + if (!num_pages) + num_pages = 1; + + num_pages *= 16; + num_pages *= PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, num_pages, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + if (!ret) + dcs = BTRFS_DC_SETUP; + btrfs_free_reserved_data_space(inode, num_pages); + +out_put: + iput(inode); +out_free: + btrfs_release_path(path); +out: + spin_lock(&block_group->lock); + if (!ret && dcs == BTRFS_DC_SETUP) + block_group->cache_generation = trans->transid; + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); + + return ret; +} + +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_path *path; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(root, SPACE_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + +/* + * transaction commit does final block group cache writeback during a + * critical section where nothing is allowed to change the FS. This is + * required in order for the cache to actually match the block group, + * but can introduce a lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group + * cache IO. There's a chance we'll have to redo some of it if the + * block group changes again during the commit, but it greatly reduces + * the commit latency by getting rid of the easy block groups while + * we're still allowing others to join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + +again: + /* + * make sure all the block groups on our dirty list actually + * exist + */ + btrfs_create_pending_block_groups(trans, root); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + cache = list_first_entry(&dirty, + struct btrfs_block_group_cache, + dirty_list); + /* + * this can happen if something re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide + * if it should update the cache_state. Don't delete + * until after we wait. + * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * the cache_write_mutex is protecting + * the io_list + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, root, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, root, ret); + } + } + + /* if its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + + if (ret) + break; + + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); + } + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* + * go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * We don't need the lock here since we are protected by the transaction + * commit. We want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + + /* + * this can happen if cache_save_setup re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + /* + * don't remove from the dirty list until after we've waited + * on any pending IO + */ + list_del_init(&cache->dirty_list); + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, root, path, cache); + if (ret) + btrfs_abort_transaction(trans, root, ret); + } + + /* if its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + } + + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group_cache, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, cache->key.objectid); + btrfs_put_block_group(cache); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + int readonly = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} + +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int i; + int factor; + int ret; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + found = __find_space_info(info, flags); + if (found) { + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->full = 0; + spin_unlock(&found->lock); + *space_info = found; + return 0; + } + found = kzalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); + if (ret) { + kfree(found); + return ret; + } + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&found->block_groups[i]); + init_rwsem(&found->groups_sem); + spin_lock_init(&found->lock); + found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + found->total_bytes = total_bytes; + found->disk_total = total_bytes * factor; + found->bytes_used = bytes_used; + found->disk_used = bytes_used * factor; + found->bytes_pinned = 0; + found->bytes_reserved = 0; + found->bytes_readonly = 0; + found->bytes_may_use = 0; + found->full = 0; + found->force_alloc = CHUNK_ALLOC_NO_FORCE; + found->chunk_alloc = 0; + found->flush = 0; + init_waitqueue_head(&found->wait); + INIT_LIST_HEAD(&found->ro_bgs); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + + *space_info = found; + list_add_rcu(&found->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = found; + + return ret; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +/* + * returns target flags in extended format or 0 if restripe for this + * chunk_type is not in progress + * + * should be called with either volume_mutex or balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; +} + +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Returns reduced profile in chunk format. If profile changing is in + * progress (either running or paused) picks the target profile (if it's + * already available), otherwise falls back to plain reducing. + */ +static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +{ + u64 num_devices = root->fs_info->fs_devices->rw_devices; + u64 target; + u64 tmp; + + /* + * see if restripe for this chunk_type is in progress, if so + * try to reduce to the target profile + */ + spin_lock(&root->fs_info->balance_lock); + target = get_restripe_target(root->fs_info, flags); + if (target) { + /* pick target profile only if it's already available */ + if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { + spin_unlock(&root->fs_info->balance_lock); + return extended_to_chunk(target); + } + } + spin_unlock(&root->fs_info->balance_lock); + + /* First, mask out the RAID levels which aren't possible */ + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5); + if (num_devices < 3) + flags &= ~BTRFS_BLOCK_GROUP_RAID6; + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + flags &= ~tmp; + + if (tmp & BTRFS_BLOCK_GROUP_RAID6) + tmp = BTRFS_BLOCK_GROUP_RAID6; + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) + tmp = BTRFS_BLOCK_GROUP_RAID5; + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) + tmp = BTRFS_BLOCK_GROUP_RAID10; + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) + tmp = BTRFS_BLOCK_GROUP_RAID1; + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) + tmp = BTRFS_BLOCK_GROUP_RAID0; + + return extended_to_chunk(flags | tmp); +} + +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) +{ + unsigned seq; + u64 flags; + + do { + flags = orig_flags; + seq = read_seqbegin(&root->fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); + + return btrfs_reduce_alloc_profile(root, flags); +} + +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +{ + u64 flags; + u64 ret; + + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + + ret = get_alloc_profile(root, flags); + return ret; +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) +{ + struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 used; + int ret = 0; + int need_commit = 2; + int have_pinned_space; + + /* make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, root->sectorsize); + + if (btrfs_is_free_space_inode(inode)) { + need_commit = 0; + ASSERT(current->journal_info); + } + + data_sinfo = fs_info->data_sinfo; + if (!data_sinfo) + goto alloc; + +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + + data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + + data_sinfo->bytes_may_use; + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; + + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&data_sinfo->lock); +alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + alloc_target, + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else { + have_pinned_space = 1; + goto commit_trans; + } + } + + if (!data_sinfo) + data_sinfo = fs_info->data_sinfo; + + goto again; + } + + /* + * If we don't have enough pinned space to deal with this + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction. + */ + have_pinned_space = percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes); + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ +commit_trans: + if (need_commit && + !atomic_read(&root->fs_info->open_ioctl_trans)) { + need_commit--; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + if (have_pinned_space >= 0 || + trans->transaction->have_free_bgs || + need_commit > 0) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + /* + * make sure that all running delayed iput are + * done + */ + down_write(&root->fs_info->delayed_iput_sem); + up_write(&root->fs_info->delayed_iput_sem); + goto again; + } else { + btrfs_end_transaction(trans, root); + } + } + + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); + return -ENOSPC; + } + ret = btrfs_qgroup_reserve(root, write_bytes); + if (ret) + goto out; + data_sinfo->bytes_may_use += bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + data_sinfo->flags, bytes, 1); +out: + spin_unlock(&data_sinfo->lock); + + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode. + */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_space_info *data_sinfo; + + /* make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, root->sectorsize); + + data_sinfo = root->fs_info->data_sinfo; + spin_lock(&data_sinfo->lock); + WARN_ON(data_sinfo->bytes_may_use < bytes); + data_sinfo->bytes_may_use -= bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + data_sinfo->flags, bytes, 0); + spin_unlock(&data_sinfo->lock); +} + +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = CHUNK_ALLOC_FORCE; + } + rcu_read_unlock(); +} + +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global->size << 1); +} + +static int should_alloc_chunk(struct btrfs_root *root, + struct btrfs_space_info *sinfo, int force) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; + u64 thresh; + + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. + */ + if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) + num_allocated += calc_global_rsv_need_space(global_rsv); + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. + */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); + thresh = max_t(u64, 64 * 1024 * 1024, + div_factor_fine(thresh, 1)); + + if (num_bytes - num_allocated < thresh) + return 1; + } + + if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) + return 0; + return 1; +} + +static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +{ + u64 num_dev; + + if (type & (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) + num_dev = root->fs_info->fs_devices->rw_devices; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + num_dev = 2; + else + num_dev = 1; /* DUP or single */ + + /* metadata for updaing devices and chunk tree */ + return btrfs_calc_trans_metadata_size(root, num_dev + 1); +} + +static void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + struct btrfs_space_info *info; + u64 left; + u64 thresh; + + info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly; + spin_unlock(&info->lock); + + thresh = get_system_chunk_thresh(root, type); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", + left, thresh, type); + dump_space_info(info, 0, 0); + } + + if (left < thresh) { + u64 flags; + + flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); + btrfs_alloc_chunk(trans, root, flags); + } +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 flags, int force) +{ + struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; + int wait_for_alloc = 0; + int ret = 0; + + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); /* -ENOMEM */ + } + BUG_ON(!space_info); /* Logic error */ + +again: + spin_lock(&space_info->lock); + if (force < space_info->force_alloc) + force = space_info->force_alloc; + if (space_info->full) { + if (should_alloc_chunk(extent_root, space_info, force)) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&space_info->lock); + return ret; + } + + if (!should_alloc_chunk(extent_root, space_info, force)) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + wait_for_alloc = 1; + } else { + space_info->chunk_alloc = 1; + } + + spin_unlock(&space_info->lock); + + mutex_lock(&fs_info->chunk_mutex); + + /* + * The chunk_mutex is held throughout the entirety of a chunk + * allocation, so once we've acquired the chunk_mutex we know that the + * other guy is done and we need to recheck and see if we should + * allocate. + */ + if (wait_for_alloc) { + mutex_unlock(&fs_info->chunk_mutex); + wait_for_alloc = 0; + goto again; + } + + trans->allocating_chunk = true; + + /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, extent_root, flags); + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + trans->allocating_chunk = false; + + spin_lock(&space_info->lock); + if (ret < 0 && ret != -ENOSPC) + goto out; + if (ret) + space_info->full = 1; + else + ret = 1; + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&fs_info->chunk_mutex); + return ret; +} + +static int can_overcommit(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 space_size; + u64 avail; + u64 used; + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly; + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + spin_lock(&global_rsv->lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(&global_rsv->lock); + if (used + space_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + spin_unlock(&root->fs_info->free_chunk_lock); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + +static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages, int nr_items) +{ + struct super_block *sb = root->fs_info->sb; + + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); + if (!current->journal_info) + btrfs_wait_ordered_roots(root->fs_info, nr_items); + } +} + +static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) +{ + u64 bytes; + int nr; + + bytes = btrfs_calc_trans_metadata_size(root, 1); + nr = (int)div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM (256 * 1024) + +/* + * shrink metadata reservation for delalloc + */ +static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, + bool wait_ordered) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + u64 delalloc_bytes; + u64 max_reclaim; + long time_left; + unsigned long nr_pages; + int loops; + int items; + enum btrfs_reserve_flush_enum flush; + + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(root, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + + trans = (struct btrfs_trans_handle *)current->journal_info; + block_rsv = &root->fs_info->delalloc_block_rsv; + space_info = block_rsv->space_info; + + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); + if (delalloc_bytes == 0) { + if (trans) + return; + if (wait_ordered) + btrfs_wait_ordered_roots(root->fs_info, items); + return; + } + + loops = 0; + while (delalloc_bytes && loops < 3) { + max_reclaim = min(delalloc_bytes, to_reclaim); + nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; + btrfs_writeback_inodes_sb_nr(root, nr_pages, items); + /* + * We need to wait for the async pages to actually start before + * we do anything. + */ + max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); + if (!max_reclaim) + goto skip_async; + + if (max_reclaim <= nr_pages) + max_reclaim = 0; + else + max_reclaim -= nr_pages; + + wait_event(root->fs_info->async_submit_wait, + atomic_read(&root->fs_info->async_delalloc_pages) <= + (int)max_reclaim); +skip_async: + if (!trans) + flush = BTRFS_RESERVE_FLUSH_ALL; + else + flush = BTRFS_RESERVE_NO_FLUSH; + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, orig, flush)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(root->fs_info, items); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); + } +} + +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) +{ + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes) >= 0) + goto commit; + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes - delayed_rsv->size) >= 0) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + +enum flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, +}; + +static int flush_space(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 num_bytes, + u64 orig_bytes, int state) +{ + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(root, num_bytes) * 2; + else + nr = -1; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, root, nr); + btrfs_end_transaction(trans, root); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(root, num_bytes * 2, orig_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case ALLOC_CHUNK: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret == -ENOSPC) + ret = 0; + break; + case COMMIT_TRANS: + ret = may_commit_transaction(root, space_info, orig_bytes, 0); + break; + default: + ret = -ENOSPC; + break; + } + + return ret; +} + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, + struct btrfs_space_info *space_info) +{ + u64 used; + u64 expected; + u64 to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, + 16 * 1024 * 1024); + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) { + to_reclaim = 0; + goto out; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (can_overcommit(root, space_info, 1024 * 1024, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); +out: + spin_unlock(&space_info->lock); + + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, u64 used) +{ + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. */ + if (space_info->bytes_used >= thresh) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, + int flush_state) +{ + u64 used; + + spin_lock(&space_info->lock); + /* + * We run out of space and have not got any free space via flush_space, + * so don't bother doing async reclaim. + */ + if (flush_state > COMMIT_TRANS && space_info->full) { + spin_unlock(&space_info->lock); + return 0; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (need_do_async_reclaim(space_info, fs_info, used)) { + spin_unlock(&space_info->lock); + return 1; + } + spin_unlock(&space_info->lock); + + return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) + return; + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + flush_state++; + if (!btrfs_need_do_async_reclaim(space_info, fs_info, + flush_state)) + return; + } while (flush_state < COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 used; + u64 num_bytes = orig_bytes; + int flush_state = FLUSH_DELAYED_ITEMS_NR; + int ret = 0; + bool flushing = false; + +again: + ret = 0; + spin_lock(&space_info->lock); + /* + * We only want to wait if somebody other than us is flushing and we + * are actually allowed to flush all things. + */ + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && + space_info->flush) { + spin_unlock(&space_info->lock); + /* + * If we have a trans handle we can't wait because the flusher + * may have to commit the transaction, which would mean we would + * deadlock since we are waiting for the flusher to finish, but + * hold the current transaction open. + */ + if (current->journal_info) + return -EAGAIN; + ret = wait_event_killable(space_info->wait, !space_info->flush); + /* Must have been killed, return */ + if (ret) + return -EINTR; + + spin_lock(&space_info->lock); + } + + ret = -ENOSPC; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + + /* + * The idea here is that we've not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we've already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", space_info->flags, orig_bytes, 1); + ret = 0; + } else { + /* + * Ok set num_bytes to orig_bytes since we aren't + * overocmmitted, this way we only try and reclaim what + * we need. + */ + num_bytes = orig_bytes; + } + } else { + /* + * Ok we're over committed, set num_bytes to the overcommitted + * amount plus the amount of bytes that we need for this + * reservation. + */ + num_bytes = used - space_info->total_bytes + + (orig_bytes * 2); + } + + if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; + } + + /* + * Couldn't make our reservation, save our place so while we're trying + * to reclaim space we can actually use it instead of somebody else + * stealing it from us. + * + * We make the other tasks wait for the flush only when we can flush + * all things. + */ + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + flushing = true; + space_info->flush = 1; + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. + */ + if (!root->fs_info->log_root_recovering && + need_do_async_reclaim(space_info, root->fs_info, used) && + !work_busy(&root->fs_info->async_reclaim_work)) + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); + } + spin_unlock(&space_info->lock); + + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + goto out; + + ret = flush_space(root, space_info, num_bytes, orig_bytes, + flush_state); + flush_state++; + + /* + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock + * would happen. So skip delalloc flush. + */ + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT)) + flush_state = ALLOC_CHUNK; + + if (!ret) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + flush_state < COMMIT_TRANS) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_ALL && + flush_state <= COMMIT_TRANS) + goto again; + +out: + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + struct btrfs_block_rsv *global_rsv = + &root->fs_info->global_block_rsv; + + if (block_rsv != global_rsv && + !block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } + if (ret == -ENOSPC) + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + space_info->flags, orig_bytes, 1); + if (flushing) { + spin_lock(&space_info->lock); + space_info->flush = 0; + wake_up_all(&space_info->wait); + spin_unlock(&space_info->lock); + } + return ret; +} + +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv = NULL; + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + block_rsv = trans->block_rsv; + + if (root == root->fs_info->csum_root && trans->adding_csums) + block_rsv = trans->block_rsv; + + if (root == root->fs_info->uuid_root) + block_rsv = trans->block_rsv; + + if (!block_rsv) + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &root->fs_info->empty_block_rsv; + + return block_rsv; +} + +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret = -ENOSPC; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + block_rsv_add_bytes(dest, num_bytes, 1); + return 0; +} + +static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) + num_bytes = block_rsv->size; + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (num_bytes > 0) { + if (dest) { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) { + spin_lock(&space_info->lock); + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); + } + } +} + +static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes) +{ + int ret; + + ret = block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + block_rsv_add_bytes(dst, num_bytes, 1); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + rsv->type = type; +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_block_rsv(block_rsv, type); + block_rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + if (!rsv) + return; + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); +} + +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) +{ + kfree(rsv); +} + +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) + ret = 0; + else + num_bytes -= block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (!ret) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; + } + + return ret; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes) +{ + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + if (global_rsv == block_rsv || + block_rsv->space_info != global_rsv->space_info) + global_rsv = NULL; + block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, + num_bytes); +} + +/* + * helper to calculate size of global block reservation. + * the desired value is sum of space used by extent tree, + * checksum tree and root tree + */ +static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *sinfo; + u64 num_bytes; + u64 meta_used; + u64 data_used; + int csum_size = btrfs_super_csum_size(fs_info->super_copy); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + spin_lock(&sinfo->lock); + data_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&sinfo->lock); + if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) + data_used = 0; + meta_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * + csum_size * 2; + num_bytes += div_u64(data_used + meta_used, 50); + + if (num_bytes * 3 > meta_used) + num_bytes = div_u64(meta_used, 3); + + return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); +} + +static void update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + + num_bytes = calc_global_metadata_size(fs_info); + + spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); + + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); + + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; + + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + block_rsv->reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, 1); + } + + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + sinfo->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, 0); + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } + + spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); +} + +static void init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->delalloc_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; + + fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; + fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + update_global_block_rsv(fs_info); +} + +static void release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, + (u64)-1); + WARN_ON(fs_info->delalloc_block_rsv.size > 0); + WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); +} + +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans->block_rsv) + return; + + if (!trans->bytes_reserved) + return; + + trace_btrfs_space_reservation(root->fs_info, "transaction", + trans->transid, trans->bytes_reserved, 0); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); + trans->bytes_reserved = 0; +} + +/* Can only return 0 or -ENOSPC */ +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; + + /* + * We need to hold space in order to delete our orphan item once we've + * added it, so this takes the reservation so we can release it later + * when we are truly done with the orphan item. + */ + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 1); + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_orphan_release_metadata(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 0); + btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); +} + +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int items, + u64 *qgroup_reserved, + bool use_global_rsv) +{ + u64 num_bytes; + int ret; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + + if (root->fs_info->quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root->nodesize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; + } else { + num_bytes = 0; + } + + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) +{ + btrfs_block_rsv_release(root, rsv, (u64)-1); +} + +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * @num_bytes: the number of bytes we're relaseing. + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ +static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) +{ + unsigned drop_inode_space = 0; + unsigned dropped_extents = 0; + unsigned num_extents = 0; + + num_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + ASSERT(num_extents); + ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); + BTRFS_I(inode)->outstanding_extents -= num_extents; + + if (BTRFS_I(inode)->outstanding_extents == 0 && + test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + drop_inode_space = 1; + + /* + * If we have more or the same amount of outsanding extents than we have + * reserved then we need to leave the reserved extents count alone. + */ + if (BTRFS_I(inode)->outstanding_extents >= + BTRFS_I(inode)->reserved_extents) + return drop_inode_space; + + dropped_extents = BTRFS_I(inode)->reserved_extents - + BTRFS_I(inode)->outstanding_extents; + BTRFS_I(inode)->reserved_extents -= dropped_extents; + return dropped_extents + drop_inode_space; +} + +/** + * calc_csum_metadata_size - return the amount of metada space that must be + * reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. + */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 old_csums, num_csums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); +} + +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve = 0; + u64 csum_bytes; + unsigned nr_extents = 0; + int extra_reserve = 0; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; + int ret = 0; + bool delalloc_lock = true; + u64 to_free = 0; + unsigned dropped; + + /* If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + */ + if (btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } + + if (flush != BTRFS_RESERVE_NO_FLUSH && + btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); + + if (delalloc_lock) + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + + num_bytes = ALIGN(num_bytes, root->sectorsize); + + spin_lock(&BTRFS_I(inode)->lock); + nr_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + BTRFS_I(inode)->outstanding_extents += nr_extents; + nr_extents = 0; + + if (BTRFS_I(inode)->outstanding_extents > + BTRFS_I(inode)->reserved_extents) + nr_extents = BTRFS_I(inode)->outstanding_extents - + BTRFS_I(inode)->reserved_extents; + + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { + nr_extents++; + extra_reserve = 1; + } + + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); + csum_bytes = BTRFS_I(inode)->csum_bytes; + spin_unlock(&BTRFS_I(inode)->lock); + + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); + if (ret) + goto out_fail; + } + + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (unlikely(ret)) { + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, nr_extents * root->nodesize); + goto out_fail; + } + + spin_lock(&BTRFS_I(inode)->lock); + if (extra_reserve) { + set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + nr_extents--; + } + BTRFS_I(inode)->reserved_extents += nr_extents; + spin_unlock(&BTRFS_I(inode)->lock); + + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + + if (to_reserve) + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_reserve, 1); + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + return 0; + +out_fail: + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode, num_bytes); + /* + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. + */ + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { + calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (to_free) { + btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + } + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + return ret; +} + +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. + */ +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free = 0; + unsigned dropped; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode, num_bytes); + + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped > 0) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (btrfs_test_is_dummy_root(root)) + return; + + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); +} + +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); + if (ret) + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; +} + +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */ +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +{ + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); +} + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc) +{ + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_fs_info *info = root->fs_info; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + int factor; + + /* block accounting for super block */ + spin_lock(&info->delalloc_root_lock); + old_val = btrfs_super_bytes_used(info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(info->super_copy, old_val); + spin_unlock(&info->delalloc_root_lock); + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -ENOENT; + if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. + */ + if (!alloc && cache->cached == BTRFS_CACHE_NO) + cache_block_group(cache, 1); + + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + + if (btrfs_test_opt(root, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); + /* + * No longer have used bytes in this block group, queue + * it for deletion. + */ + if (old_val == 0) { + spin_lock(&info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->transaction->num_dirty_bgs++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + btrfs_put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + return 0; +} + +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; + + spin_lock(&root->fs_info->block_group_cache_lock); + bytenr = root->fs_info->first_logical_byte; + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (bytenr < (u64)-1) + return bytenr; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) + return 0; + + bytenr = cache->key.objectid; + btrfs_put_block_group(cache); + + return bytenr; +} + +static int pin_down_extent(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + set_extent_dirty(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + if (reserved) + trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); /* Logic error */ + + pin_down_extent(root, cache, bytenr, num_bytes, reserved); + + btrfs_put_block_group(cache); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + int ret; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!cache) + return -EINVAL; + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. + */ + cache_block_group(cache, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + ret = btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return ret; +} + +static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + + block_group = btrfs_lookup_block_group(root->fs_info, start); + if (!block_group) + return -EINVAL; + + cache_block_group(block_group, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + /* Logic error */ + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + if (ret) + goto out_lock; + + num_bytes = (start + num_bytes) - + caching_ctl->progress; + start = caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + } +out_lock: + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + btrfs_put_block_group(block_group); + return ret; +} + +int btrfs_exclude_logged_extents(struct btrfs_root *log, + struct extent_buffer *eb) +{ + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + + if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) + return 0; + + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + __exclude_logged_extent(log, key.objectid, key.offset); + } + + return 0; +} + +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. + * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. + */ +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve != RESERVE_FREE) { + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + trace_btrfs_space_reservation(cache->fs_info, + "space_info", space_info->flags, + num_bytes, 0); + space_info->bytes_may_use -= num_bytes; + } + + if (delalloc) + cache->delalloc_bytes += num_bytes; + } + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} + +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_caching_control *next; + struct btrfs_caching_control *caching_ctl; + struct btrfs_block_group_cache *cache; + + down_write(&fs_info->commit_root_sem); + + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + cache = caching_ctl->block_group; + if (block_group_cache_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + put_caching_control(caching_ctl); + } else { + cache->last_byte_to_unpin = caching_ctl->progress; + } + } + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + fs_info->pinned_extents = &fs_info->freed_extents[1]; + else + fs_info->pinned_extents = &fs_info->freed_extents[0]; + + up_write(&fs_info->commit_root_sem); + + update_global_block_rsv(fs_info); +} + +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, + const bool return_free_space) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 len; + bool readonly; + + while (start <= end) { + readonly = false; + if (!cache || + start >= cache->key.objectid + cache->key.offset) { + if (cache) + btrfs_put_block_group(cache); + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); /* Logic error */ + } + + len = cache->key.objectid + cache->key.offset - start; + len = min(len, end + 1 - start); + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); + if (return_free_space) + btrfs_add_free_space(cache, start, len); + } + + start += len; + space_info = cache->space_info; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } + spin_unlock(&cache->lock); + if (!readonly && global_rsv->space_info == space_info) { + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + len = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += len; + space_info->bytes_may_use += len; + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + } + spin_unlock(&global_rsv->lock); + } + spin_unlock(&space_info->lock); + } + + if (cache) + btrfs_put_block_group(cache); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + if (trans->aborted) + return 0; + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; + else + unpin = &fs_info->freed_extents[0]; + + while (1) { + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, NULL); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + break; + } + + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, + end + 1 - start, NULL); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + unpin_extent_range(root, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + cond_resched(); + } + + return 0; +} + +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + BUG_ON(!space_info); /* Logic bug */ + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + + +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op, + int no_quota) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + int ret; + int is_data; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + u32 item_size; + u64 refs; + int last_ref = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (!info->quota_enabled || !is_fstree(root_objectid)) + no_quota = 1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(!is_data && refs_to_drop != 1); + + if (is_data) + skinny_metadata = 0; + + ret = lookup_extent_backref(trans, extent_root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner_objectid, + owner_offset); + if (ret == 0) { + extent_slot = path->slots[0]; + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + extent_slot); + if (key.objectid != bytenr) + break; + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { + found_extent = 1; + break; + } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.offset == owner_objectid) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + extent_slot--; + } +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); + if (found_extent && item_size < sizeof(*ei)) + found_extent = 0; +#endif + if (!found_extent) { + BUG_ON(iref); + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, + is_data, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + btrfs_release_path(path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + if (!is_data && skinny_metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner_objectid; + } + + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret > 0 && skinny_metadata && path->slots[0]) { + /* + * Couldn't find our skinny metadata item, + * see if we have ye olde extent item. + */ + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + } + + if (ret) { + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, bytenr); + if (ret > 0) + btrfs_print_leaf(extent_root, + path->nodes[0]); + } + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + extent_slot = path->slots[0]; + } + } else if (WARN_ON(ret == -ENOENT)) { + btrfs_print_leaf(extent_root, path->nodes[0]); + btrfs_err(info, + "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", + bytenr, parent, root_objectid, owner_objectid, + owner_offset); + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } else { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + BUG_ON(found_extent || extent_slot != path->slots[0]); + ret = convert_extent_item_v0(trans, extent_root, path, + owner_objectid, 0); + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + + btrfs_release_path(path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + if (ret) { + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + + extent_slot = path->slots[0]; + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && + key.type == BTRFS_EXTENT_ITEM_KEY) { + struct btrfs_tree_block_info *bi; + BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } + + refs = btrfs_extent_refs(leaf, ei); + if (refs < refs_to_drop) { + btrfs_err(info, "trying to drop %d refs but we only have %Lu " + "for bytenr %Lu", refs_to_drop, refs, bytenr); + ret = -EINVAL; + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + refs -= refs_to_drop; + + if (refs > 0) { + type = BTRFS_QGROUP_OPER_SUB_SHARED; + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by remove_extent_backref + */ + if (iref) { + BUG_ON(!found_extent); + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, + is_data, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + } + add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, + root_objectid); + } else { + if (found_extent) { + BUG_ON(is_data && refs_to_drop != + extent_data_ref_count(root, path, iref)); + if (iref) { + BUG_ON(path->slots[0] != extent_slot); + } else { + BUG_ON(path->slots[0] != extent_slot + 1); + path->slots[0] = extent_slot; + num_to_del = 2; + } + } + + last_ref = 1; + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + btrfs_release_path(path); + + if (is_data) { + ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + } + + ret = update_block_group(trans, root, bytenr, num_bytes, 0); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + } + btrfs_release_path(path); + + /* Deal with the quota accounting */ + if (!ret && last_ref && !no_quota) { + int mod_seq = 0; + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && + type == BTRFS_QGROUP_OPER_SUB_SHARED) + mod_seq = 1; + + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, + bytenr, num_bytes, type, + mod_seq); + } +out: + btrfs_free_path(path); + return ret; +} + +/* + * when we free an block, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree. + */ +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out_delayed_unlock; + + spin_lock(&head->lock); + if (rb_first(&head->ref_root)) + goto out; + + if (head->extent_op) { + if (!head->must_insert_reserved) + goto out; + btrfs_free_delayed_extent_op(head->extent_op); + head->extent_op = NULL; + } + + /* + * waiting for the lock here would deadlock. If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; + + /* + * at this point we have a head with no other entries. Go + * ahead and process it. + */ + head->node.in_tree = 0; + rb_erase(&head->href_node, &delayed_refs->href_root); + + atomic_dec(&delayed_refs->num_entries); + + /* + * we don't take a ref on the node because we're removing it from the + * tree, so we just steal the ref the tree was holding. + */ + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; + head->processing = 0; + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return ret; +out: + spin_unlock(&head->lock); + +out_delayed_unlock: + spin_unlock(&delayed_refs->lock); + return 0; +} + +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) +{ + int pin = 1; + int ret; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL, 0); + BUG_ON(ret); /* -ENOMEM */ + } + + if (!last_ref) + return; + + if (btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group_cache *cache; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, root, buf->start); + if (!ret) + goto out; + } + + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(root, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_put_block_group(cache); + trace_btrfs_reserved_extent_free(root, buf->start, buf->len); + pin = 0; + } +out: + if (pin) + add_pinned_bytes(root->fs_info, buf->len, + btrfs_header_level(buf), + root->root_key.objectid); + + /* + * Deleting the buffer, clear the corrupt flag since it doesn't matter + * anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); +} + +/* Can return -ENOMEM */ +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int no_quota) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + + if (btrfs_test_is_dummy_root(root)) + return 0; + + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); + + /* + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. + */ + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ + btrfs_pin_extent(root, bytenr, num_bytes, 1); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL, no_quota); + } else { + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, + NULL, no_quota); + } + return ret; +} + +/* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once. So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes. Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. + */ +static noinline void +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + struct btrfs_caching_control *caching_ctl; + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return; + + wait_event(caching_ctl->wait, block_group_cache_done(cache) || + (cache->free_space_ctl->free_space >= num_bytes)); + + put_caching_control(caching_ctl); +} + +static noinline int +wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *caching_ctl; + int ret = 0; + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache)); + if (cache->cached == BTRFS_CACHE_ERROR) + ret = -EIO; + put_caching_control(caching_ctl); + return ret; +} + +int __get_raid_index(u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID10) + return BTRFS_RAID_RAID10; + else if (flags & BTRFS_BLOCK_GROUP_RAID1) + return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_DUP) + return BTRFS_RAID_DUP; + else if (flags & BTRFS_BLOCK_GROUP_RAID0) + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; + + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ +} + +int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + return __get_raid_index(cache->flags); +} + +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; +} + +enum btrfs_loop_type { + LOOP_CACHING_NOWAIT = 0, + LOOP_CACHING_WAIT = 1, + LOOP_ALLOC_CHUNK = 2, + LOOP_NO_EMPTY_SIZE = 3, +}; + +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) +{ + struct btrfs_block_group_cache *used_bg; + bool locked = false; +again: + spin_lock(&cluster->refill_lock); + if (locked) { + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } + + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + down_read(&used_bg->data_rwsem); + locked = true; + goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); +} + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == start position + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == the size of the hole. + * Any available blocks before search_start are skipped. + * + * If there is no suitable free space, we will record the max size of + * the free space extent currently. + */ +static noinline int find_free_extent(struct btrfs_root *orig_root, + u64 num_bytes, u64 empty_size, + u64 hint_byte, struct btrfs_key *ins, + u64 flags, int delalloc) +{ + int ret = 0; + struct btrfs_root *root = orig_root->fs_info->extent_root; + struct btrfs_free_cluster *last_ptr = NULL; + struct btrfs_block_group_cache *block_group = NULL; + u64 search_start = 0; + u64 max_extent_size = 0; + int empty_cluster = 2 * 1024 * 1024; + struct btrfs_space_info *space_info; + int loop = 0; + int index = __get_raid_index(flags); + int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? + RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; + bool failed_cluster_refill = false; + bool failed_alloc = false; + bool use_cluster = true; + bool have_caching_bg = false; + + WARN_ON(num_bytes < root->sectorsize); + ins->type = BTRFS_EXTENT_ITEM_KEY; + ins->objectid = 0; + ins->offset = 0; + + trace_find_free_extent(orig_root, num_bytes, empty_size, flags); + + space_info = __find_space_info(root->fs_info, flags); + if (!space_info) { + btrfs_err(root->fs_info, "No space info for %llu", flags); + return -ENOSPC; + } + + /* + * If the space info is for both data and metadata it means we have a + * small filesystem and we can't use the clustering stuff. + */ + if (btrfs_mixed_space_info(space_info)) + use_cluster = false; + + if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { + last_ptr = &root->fs_info->meta_alloc_cluster; + if (!btrfs_test_opt(root, SSD)) + empty_cluster = 64 * 1024; + } + + if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } + + if (last_ptr) { + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); + } + + search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, hint_byte); + + if (!last_ptr) + empty_cluster = 0; + + if (search_start == hint_byte) { + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if its not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. + */ + if (block_group && block_group_bits(block_group, flags) && + block_group->cached != BTRFS_CACHE_NO) { + down_read(&space_info->groups_sem); + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else { + index = get_block_group_index(block_group); + btrfs_lock_block_group(block_group, delalloc); + goto have_block_group; + } + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: + have_caching_bg = false; + down_read(&space_info->groups_sem); + list_for_each_entry(block_group, &space_info->block_groups[index], + list) { + u64 offset; + int cached; + + btrfs_grab_block_group(block_group, delalloc); + search_start = block_group->key.objectid; + + /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type. + */ + if (!block_group_bits(block_group, flags)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID10; + + /* + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. + */ + if ((flags & extra) && !(block_group->flags & extra)) + goto loop; + } + +have_block_group: + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { + ret = cache_block_group(block_group, 0); + BUG_ON(ret < 0); + ret = 0; + } + + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; + if (unlikely(block_group->ro)) + goto loop; + + /* + * Ok we want to try and use the cluster allocator, so + * lets look there + */ + if (last_ptr) { + struct btrfs_block_group_cache *used_block_group; + unsigned long aligned_cluster; + /* + * the refill lock keeps out other + * people trying to start a new cluster + */ + used_block_group = btrfs_lock_cluster(block_group, + last_ptr, + delalloc); + if (!used_block_group) + goto refill_cluster; + + if (used_block_group != block_group && + (used_block_group->ro || + !block_group_bits(used_block_group, flags))) + goto release_cluster; + + offset = btrfs_alloc_from_cluster(used_block_group, + last_ptr, + num_bytes, + used_block_group->key.objectid, + &max_extent_size); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + used_block_group, + search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = used_block_group; + } + goto checks; + } + + WARN_ON(last_ptr->block_group != used_block_group); +release_cluster: + /* If we are on LOOP_NO_EMPTY_SIZE, we can't + * set up a new clusters, so lets just skip it + * and let the allocator find whatever block + * it can find. If we reach this point, we + * will have tried the cluster allocator + * plenty of times and not have found + * anything, so we are likely way too + * fragmented for the clustering stuff to find + * anything. + * + * However, if the cluster is taken from the + * current block group, release the cluster + * first, so that we stand a better chance of + * succeeding in the unclustered + * allocation. */ + if (loop >= LOOP_NO_EMPTY_SIZE && + used_block_group != block_group) { + spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(used_block_group, + delalloc); + goto unclustered_alloc; + } + + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + if (used_block_group != block_group) + btrfs_release_block_group(used_block_group, + delalloc); +refill_cluster: + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + + aligned_cluster = max_t(unsigned long, + empty_cluster + empty_size, + block_group->full_stripe_len); + + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(root, block_group, + last_ptr, search_start, + num_bytes, + aligned_cluster); + if (ret == 0) { + /* + * now pull our allocation out of this + * cluster + */ + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, + num_bytes, + search_start, + &max_extent_size); + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, + num_bytes); + goto checks; + } + } else if (!cached && loop > LOOP_CACHING_NOWAIT + && !failed_cluster_refill) { + spin_unlock(&last_ptr->refill_lock); + + failed_cluster_refill = true; + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; + } + + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; + } + +unclustered_alloc: + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + if (block_group->free_space_ctl->free_space > + max_extent_size) + max_extent_size = + block_group->free_space_ctl->free_space; + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size, + &max_extent_size); + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(block_group, + num_bytes + empty_size); + failed_alloc = true; + goto have_block_group; + } else if (!offset) { + if (!cached) + have_caching_bg = true; + goto loop; + } +checks: + search_start = ALIGN(offset, root->stripesize); + + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type, delalloc); + if (ret == -EAGAIN) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + /* we are all good, lets return */ + ins->objectid = search_start; + ins->offset = num_bytes; + + trace_btrfs_reserve_extent(orig_root, block_group, + search_start, num_bytes); + btrfs_release_block_group(block_group, delalloc); + break; +loop: + failed_cluster_refill = false; + failed_alloc = false; + BUG_ON(index != get_block_group_index(block_group)); + btrfs_release_block_group(block_group, delalloc); + } + up_read(&space_info->groups_sem); + + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + goto search; + + /* + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { + index = 0; + loop++; + if (loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = do_chunk_alloc(trans, root, flags, + CHUNK_ALLOC_FORCE); + /* + * Do not bail out on ENOSPC since we + * can do more things. + */ + if (ret < 0 && ret != -ENOSPC) + btrfs_abort_transaction(trans, + root, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans, root); + if (ret) + goto out; + } + + if (loop == LOOP_NO_EMPTY_SIZE) { + empty_size = 0; + empty_cluster = 0; + } + + goto search; + } else if (!ins->objectid) { + ret = -ENOSPC; + } else if (ins->objectid) { + ret = 0; + } +out: + if (ret == -ENOSPC) + ins->offset = max_extent_size; + return ret; +} + +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group_cache *cache; + int index = 0; + + spin_lock(&info->lock); + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", + info->flags, + info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly, + (info->full) ? "" : "not "); + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " + "reserved=%llu, may_use=%llu, readonly=%llu\n", + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); +again: + list_for_each_entry(cache, &info->block_groups[index], list) { + spin_lock(&cache->lock); + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? "[readonly]" : ""); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; + up_read(&info->groups_sem); +} + +int btrfs_reserve_extent(struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc) +{ + bool final_tried = false; + u64 flags; + int ret; + + flags = btrfs_get_alloc_profile(root, is_data); +again: + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, + flags, delalloc); + + if (ret == -ENOSPC) { + if (!final_tried && ins->offset) { + num_bytes = min(num_bytes >> 1, ins->offset); + num_bytes = round_down(num_bytes, root->sectorsize); + num_bytes = max(num_bytes, min_alloc_size); + if (num_bytes == min_alloc_size) + final_tried = true; + goto again; + } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, flags); + btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", + flags, num_bytes); + if (sinfo) + dump_space_info(sinfo, num_bytes, 1); + } + } + + return ret; +} + +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, + int pin, int delalloc) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + btrfs_err(root->fs_info, "Unable to find block group for %llu", + start); + return -ENOSPC; + } + + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + } + + btrfs_put_block_group(cache); + + trace_btrfs_reserved_extent_free(root, start, len); + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int delalloc) +{ + return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1, 0); +} + +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + int type; + u32 size; + + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); + if (ret) { + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); + + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + /* Always set parent to 0 here since its exclusive anyway. */ + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, ins->offset, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + if (ret) { /* -ENOENT, logic error */ + btrfs_err(fs_info, "update block group failed for %llu %llu", + ins->objectid, ins->offset); + BUG(); + } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); + return ret; +} + +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins, + int no_quota) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes = ins->offset; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (!skinny_metadata) + size += sizeof(*block_info); + + path = btrfs_alloc_path(); + if (!path) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->nodesize); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); + if (ret) { + btrfs_free_path(path); + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->nodesize); + return ret; + } + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + + if (skinny_metadata) { + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + num_bytes = root->nodesize; + } else { + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + } + + if (parent > 0) { + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, num_bytes, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + } + + ret = update_block_group(trans, root, ins->objectid, root->nodesize, + 1); + if (ret) { /* -ENOENT, logic error */ + btrfs_err(fs_info, "update block group failed for %llu %llu", + ins->objectid, ins->offset); + BUG(); + } + + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); + return ret; +} + +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins) +{ + int ret; + + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + + ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, + ins->offset, 0, + root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + return ret; +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exlude dance if this fs isn't mixed. + */ + if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(root, ins->objectid, ins->offset); + if (ret) + return ret; + } + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + if (!block_group) + return -EINVAL; + + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT, 0); + BUG_ON(ret); /* logic error */ + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); + btrfs_put_block_group(block_group); + return ret; +} + +static struct extent_buffer * +btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, int level) +{ + struct extent_buffer *buf; + + buf = btrfs_find_create_tree_block(root, bytenr); + if (!buf) + return ERR_PTR(-ENOMEM); + btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); + btrfs_tree_lock(buf); + clean_tree_block(trans, root->fs_info, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + + btrfs_set_lock_blocking(buf); + btrfs_set_buffer_uptodate(buf); + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + buf->log_index = root->log_transid % 2; + /* + * we allow two log transactions at a time, use different + * EXENT bit to differentiate dirty pages. + */ + if (buf->log_index == 0) + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + else + set_extent_new(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } else { + buf->log_index = -1; + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } + trans->blocks_used++; + /* this returns a buffer locked for blocking */ + return buf; +} + +static struct btrfs_block_rsv * +use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + int ret; + bool global_updated = false; + + block_rsv = get_block_rsv(trans, root); + + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; + } + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "BTRFS: block rsv returned %d\n", ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); +} + +static void unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u32 blocksize) +{ + block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns the tree buffer or an ERR_PTR on error. + */ +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size) +{ + struct btrfs_key ins; + struct btrfs_block_rsv *block_rsv; + struct extent_buffer *buf; + struct btrfs_delayed_extent_op *extent_op; + u64 flags = 0; + int ret; + u32 blocksize = root->nodesize; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (btrfs_test_is_dummy_root(root)) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + level); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } + + block_rsv = use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); + + ret = btrfs_reserve_extent(root, blocksize, blocksize, + empty_size, hint, &ins, 0, 0); + if (ret) + goto out_unuse; + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_free_reserved; + } + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + if (skinny_metadata) + extent_op->update_key = 0; + else + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + extent_op->level = level; + + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + ins.objectid, ins.offset, + parent, root_objectid, level, + BTRFS_ADD_DELAYED_EXTENT, + extent_op, 0); + if (ret) + goto out_free_delayed; + } + return buf; + +out_free_delayed: + btrfs_free_delayed_extent_op(extent_op); +out_free_buf: + free_extent_buffer(buf); +out_free_reserved: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); +out_unuse: + unuse_block_rsv(root->fs_info, block_rsv, blocksize); + return ERR_PTR(ret); +} + +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; + int for_reloc; +}; + +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 + +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) +{ + u64 bytenr; + u64 generation; + u64 refs; + u64 flags; + u32 nritems; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *eb; + int ret; + int slot; + int nread = 0; + + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(root)); + } + + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + blocksize = root->nodesize; + + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; + + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); + + if (slot == path->slots[wc->level]) + goto reada; + + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) + continue; + + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, + wc->level - 1, 1, &refs, + &flags); + /* We don't care about errors in readahead. */ + if (ret < 0) + continue; + BUG_ON(refs == 0); + + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; + + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + } +reada: + readahead_tree_block(root, bytenr); + nread++; + } + wc->reada_slot = slot; +} + +static int account_leaf_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *eb) +{ + int nr = btrfs_header_nritems(eb); + int i, extent_type, ret; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 bytenr, num_bytes; + + for (i = 0; i < nr; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + /* filter out non qgroup-accountable extents */ + extent_type = btrfs_file_extent_type(eb, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + + bytenr = btrfs_file_extent_disk_bytenr(eb, fi); + if (!bytenr) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + bytenr, num_bytes, + BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); + if (ret) + return ret; + } + return 0; +} + +/* + * Walk up the tree from the bottom, freeing leaves and any interior + * nodes which have had all slots visited. If a node (leaf or + * interior) is freed, the node above it will have it's slot + * incremented. The root node will never be freed. + * + * At the end of this function, we should have a path which has all + * slots incremented to the next position for a search. If we need to + * read a new node it will be NULL and the node above it will have the + * correct slot selected for a later read. + * + * If we increment the root nodes slot counter past the number of + * elements, 1 is returned to signal completion of the search. + */ +static int adjust_slots_upwards(struct btrfs_root *root, + struct btrfs_path *path, int root_level) +{ + int level = 0; + int nr, slot; + struct extent_buffer *eb; + + if (root_level == 0) + return 1; + + while (level <= root_level) { + eb = path->nodes[level]; + nr = btrfs_header_nritems(eb); + path->slots[level]++; + slot = path->slots[level]; + if (slot >= nr || level == 0) { + /* + * Don't free the root - we will detect this + * condition after our loop and return a + * positive value for caller to stop walking the tree. + */ + if (level != root_level) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + + free_extent_buffer(eb); + path->nodes[level] = NULL; + path->slots[level] = 0; + } + } else { + /* + * We have a valid slot to walk back down + * from. Stop here so caller can process these + * new nodes. + */ + break; + } + + level++; + } + + eb = path->nodes[root_level]; + if (path->slots[root_level] >= btrfs_header_nritems(eb)) + return 1; + + return 0; +} + +/* + * root_eb is the subtree root and is locked before this function is called. + */ +static int account_shared_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *root_eb, + u64 root_gen, + int root_level) +{ + int ret = 0; + int level; + struct extent_buffer *eb = root_eb; + struct btrfs_path *path = NULL; + + BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); + BUG_ON(root_eb == NULL); + + if (!root->fs_info->quota_enabled) + return 0; + + if (!extent_buffer_uptodate(root_eb)) { + ret = btrfs_read_buffer(root_eb, root_gen); + if (ret) + goto out; + } + + if (root_level == 0) { + ret = account_leaf_items(trans, root, root_eb); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Walk down the tree. Missing extent blocks are filled in as + * we go. Metadata is accounted every time we read a new + * extent block. + * + * When we reach a leaf, we account for file extent items in it, + * walk back up the tree (adjusting slot pointers as we go) + * and restart the search process. + */ + extent_buffer_get(root_eb); /* For path */ + path->nodes[root_level] = root_eb; + path->slots[root_level] = 0; + path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ +walk_down: + level = root_level; + while (level >= 0) { + if (path->nodes[level] == NULL) { + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* We need to get child blockptr/gen from + * parent before we can read it. */ + eb = path->nodes[level + 1]; + parent_slot = path->slots[level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + + eb = read_tree_block(root, child_bytenr, child_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + ret = -EIO; + goto out; + } + + path->nodes[level] = eb; + path->slots[level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + child_bytenr, + root->nodesize, + BTRFS_QGROUP_OPER_SUB_SUBTREE, + 0); + if (ret) + goto out; + + } + + if (level == 0) { + ret = account_leaf_items(trans, root, path->nodes[level]); + if (ret) + goto out; + + /* Nonzero return here means we completed our search */ + ret = adjust_slots_upwards(root, path, root_level); + if (ret) + break; + + /* Restart search with new slots */ + goto walk_down; + } + + level--; + } + + ret = 0; +out: + btrfs_free_path(path); + + return ret; +} + +/* + * helper to process tree block while walking down the tree. + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int lookup_info) +{ + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; + + /* + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, level, 1, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret == -ENOMEM); + if (ret) + return ret; + BUG_ON(wc->refs[level] == 0); + } + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; + + if (path->locks[level] && !wc->keep_locks) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + } + return 0; + } + + /* wc->stage == UPDATE_BACKREF */ + if (!(wc->flags[level] & flag)) { + BUG_ON(!path->locks[level]); + ret = btrfs_inc_ref(trans, root, eb, 1); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_set_disk_extent_flags(trans, root, eb->start, + eb->len, flag, + btrfs_header_level(eb), 0); + BUG_ON(ret); /* -ENOMEM */ + wc->flags[level] |= flag; + } + + /* + * the block is shared by multiple trees, so it's not good to + * keep the tree lock + */ + if (path->locks[level] && level > 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + } + return 0; +} + +/* + * helper to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) +{ + u64 bytenr; + u64 generation; + u64 parent; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + bool need_account = false; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; + } + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + blocksize = root->nodesize; + + next = btrfs_find_tree_block(root->fs_info, bytenr); + if (!next) { + next = btrfs_find_create_tree_block(root, bytenr); + if (!next) + return -ENOMEM; + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, + level - 1); + reada = 1; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, + &wc->refs[level - 1], + &wc->flags[level - 1]); + if (ret < 0) { + btrfs_tree_unlock(next); + return ret; + } + + if (unlikely(wc->refs[level - 1] == 0)) { + btrfs_err(root->fs_info, "Missing references."); + BUG(); + } + *lookup_info = 0; + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + need_account = true; + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + } + + if (!btrfs_buffer_uptodate(next, generation, 0)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(root, bytenr, generation); + if (!next || !extent_buffer_uptodate(next)) { + free_extent_buffer(next); + return -EIO; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + } + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } + + if (need_account) { + ret = account_shared_subtree(trans, root, next, + generation, level - 1); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting shared subtree. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } + } + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0, 0); + BUG_ON(ret); /* -ENOMEM */ + } + btrfs_tree_unlock(next); + free_extent_buffer(next); + *lookup_info = 1; + return 1; +} + +/* + * helper to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. + * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. + */ +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int ret; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; + + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; + + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; + + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; + + /* + * check reference count again if the block isn't locked. + * we should start walking down the tree again if reference + * count is one. + */ + if (!path->locks[level]) { + BUG_ON(level == 0); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + + ret = btrfs_lookup_extent_info(trans, root, + eb->start, level, 1, + &wc->refs[level], + &wc->flags[level]); + if (ret < 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + return ret; + } + BUG_ON(wc->refs[level] == 0); + if (wc->refs[level] == 1) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + return 1; + } + } + } + + /* wc->stage == DROP_REFERENCE */ + BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + + if (wc->refs[level] == 1) { + if (level == 0) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = btrfs_dec_ref(trans, root, eb, 1); + else + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); /* -ENOMEM */ + ret = account_leaf_items(trans, root, eb); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting leaf items. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } + } + /* make block locked assertion in clean_tree_block happy */ + if (!path->locks[level] && + btrfs_header_generation(eb) == trans->transid) { + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + } + clean_tree_block(trans, root->fs_info, eb); + } + + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(eb)); + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])); + } + + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); +out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return 0; +} + +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int level = wc->level; + int lookup_info = 1; + int ret; + + while (level >= 0) { + ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; + + if (level == 0) + break; + + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; + + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } else if (ret < 0) + return ret; + level = wc->level; + } + return 0; +} + +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int max_level) +{ + int level = wc->level; + int ret; + + path->slots[level] = btrfs_header_nritems(path->nodes[level]); + while (level < max_level && path->nodes[level]) { + wc->level = level; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + path->slots[level]++; + return 0; + } else { + ret = walk_up_proc(trans, root, path, wc); + if (ret > 0) + return 0; + + if (path->locks[level]) { + btrfs_tree_unlock_rw(path->nodes[level], + path->locks[level]); + path->locks[level] = 0; + } + free_extent_buffer(path->nodes[level]); + path->nodes[level] = NULL; + level++; + } + } + return 1; +} + +/* + * drop a subvolume tree. + * + * this function traverses the tree freeing any blocks that only + * referenced by the tree. + * + * when a shared tree block is found. this function decreases its + * reference count by one. if update_ref is true, this function + * also make sure backrefs for the shared block and all lower level + * blocks are properly updated. + * + * If called with for_reloc == 0, may exit early with -EAGAIN + */ +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc) +{ + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_root_item *root_item = &root->root_item; + struct walk_control *wc; + struct btrfs_key key; + int err = 0; + int ret; + int level; + bool root_dropped = false; + + btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + if (!wc) { + btrfs_free_path(path); + err = -ENOMEM; + goto out; + } + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + + if (block_rsv) + trans->block_rsv = block_rsv; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_header_level(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(path->nodes[level]); + path->slots[level] = 0; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + memset(&wc->update_progress, 0, + sizeof(wc->update_progress)); + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + memcpy(&wc->update_progress, &key, + sizeof(wc->update_progress)); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out_end_trans; + } + WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + btrfs_unlock_up_safe(path, 0); + + level = btrfs_header_level(root->node); + while (1) { + btrfs_tree_lock(path->nodes[level]); + btrfs_set_lock_blocking(path->nodes[level]); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + + ret = btrfs_lookup_extent_info(trans, root, + path->nodes[level]->start, + level, 1, &wc->refs[level], + &wc->flags[level]); + if (ret < 0) { + err = ret; + goto out_end_trans; + } + BUG_ON(wc->refs[level] == 0); + + if (level == root_item->drop_level) + break; + + btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; + WARN_ON(wc->refs[level] != 1); + level--; + } + } + + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; + wc->for_reloc = for_reloc; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { + err = ret; + break; + } + + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { + err = ret; + break; + } + + if (ret > 0) { + BUG_ON(wc->stage != DROP_REFERENCE); + break; + } + + if (wc->stage == DROP_REFERENCE) { + level = wc->level; + btrfs_node_key(path->nodes[level], + &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + } + + BUG_ON(wc->level == 0); + if (btrfs_should_end_transaction(trans, tree_root) || + (!for_reloc && btrfs_need_cleaner_sleep(root))) { + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + root_item); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } + + /* + * Qgroup update accounting is run from + * delayed ref handling. This usually works + * out because delayed refs are normally the + * only way qgroup updates are added. However, + * we may have added updates during our tree + * walk so run qgroups here to make sure we + * don't lose any updates. + */ + ret = btrfs_delayed_qgroup_accounting(trans, + root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + + btrfs_end_transaction_throttle(trans, tree_root); + if (!for_reloc && btrfs_need_cleaner_sleep(root)) { + pr_debug("BTRFS: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_free; + } + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + if (block_rsv) + trans->block_rsv = block_rsv; + } + } + btrfs_release_path(path); + if (err) + goto out_end_trans; + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + goto out_end_trans; + } + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); + if (ret < 0) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } else if (ret > 0) { + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + } + } + + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + btrfs_drop_and_free_fs_root(tree_root->fs_info, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + btrfs_put_fs_root(root); + } + root_dropped = true; +out_end_trans: + ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + + btrfs_end_transaction_throttle(trans, tree_root); +out_free: + kfree(wc); + btrfs_free_path(path); +out: + /* + * So if we need to stop dropping the snapshot for whatever reason we + * need to make sure to add it back to the dead root list so that we + * keep trying to do the work later. This also cleans up roots if we + * don't have it in the radix (like when we recover after a power fail + * or unmount) so we don't leak memory. + */ + if (!for_reloc && root_dropped == false) + btrfs_add_dead_root(root); + if (err && err != -EAGAIN) + btrfs_std_error(root->fs_info, err); + return err; +} + +/* + * drop subtree rooted at tree block 'node'. + * + * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code + */ +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_path *path; + struct walk_control *wc; + int level; + int parent_level; + int ret = 0; + int wret; + + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; + } + + btrfs_assert_tree_locked(parent); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + btrfs_assert_tree_locked(node); + level = btrfs_header_level(node); + path->nodes[level] = node; + path->slots[level] = 0; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + + wc->refs[parent_level] = 1; + wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; + wc->for_reloc = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); + if (wret < 0) { + ret = wret; + break; + } + + wret = walk_up_tree(trans, root, path, wc, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + kfree(wc); + btrfs_free_path(path); + return ret; +} + +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped; + + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(root->fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); + + num_devices = root->fs_info->fs_devices->rw_devices; + + stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* this is drive concat, leave it alone */ + } + + return flags; +} + +static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + u64 min_allocable_bytes; + int ret = -ENOSPC; + + + /* + * We need some metadata space and system metadata space for + * allocating chunks in some corner cases until we force to set + * it to be readonly. + */ + if ((sinfo->flags & + (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && + !force) + min_allocable_bytes = 1 * 1024 * 1024; + else + min_allocable_bytes = 0; + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + + if (cache->ro) { + ret = 0; + goto out; + } + + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + + if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + cache->ro = 1; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); + ret = 0; + } +out: + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return ret; +} + +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) + +{ + struct btrfs_trans_handle *trans; + u64 alloc_flags; + int ret; + + BUG_ON(cache->ro); + +again: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. If it already started, + * back off and let this transaction commit + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (trans->transaction->dirty_bg_run) { + u64 transid = trans->transid; + + mutex_unlock(&root->fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans, root); + + ret = btrfs_wait_for_commit(root, transid); + if (ret) + return ret; + goto again; + } + + /* + * if we are changing raid levels, try to allocate a corresponding + * block group with the new raid level. + */ + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to + * carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } + + ret = set_block_group_ro(cache, 0); + if (!ret) + goto out; + alloc_flags = get_alloc_profile(root, cache->space_info->flags); + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + ret = set_block_group_ro(cache, 0); +out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + lock_chunks(root->fs_info->chunk_root); + check_system_chunk(trans, root, alloc_flags); + unlock_chunks(root->fs_info->chunk_root); + } + mutex_unlock(&root->fs_info->ro_block_group_mutex); + + btrfs_end_transaction(trans, root); + return ret; +} + +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; + + /* It's df, we don't care if it's racey */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; + + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; + + spin_unlock(&block_group->lock); + } + spin_unlock(&sinfo->lock); + + return free_bytes; +} + +void btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + cache->ro = 0; + list_del_init(&cache->ro_list); + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); +} + +/* + * checks to see if its even possible to relocate this block group. + * + * @return - -1 if it's not a good idea to relocate this block group, 0 if its + * ok to go ahead and try. + */ +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + struct btrfs_trans_handle *trans; + u64 min_free; + u64 dev_min = 1; + u64 dev_nr = 0; + u64 target; + int index; + int full = 0; + int ret = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + + /* odd, couldn't find the block group, leave it alone */ + if (!block_group) + return -1; + + min_free = btrfs_block_group_used(&block_group->item); + + /* no bytes used, we're good */ + if (!min_free) + goto out; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + + full = space_info->full; + + /* + * if this is the last block group we have in this space, we can't + * relocate it unless we're able to allocate a new chunk below. + * + * Otherwise, we need to make sure we have room in the space to handle + * all of the extents from this block group. If we can, we're good + */ + if ((space_info->total_bytes != block_group->key.offset) && + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + min_free < space_info->total_bytes)) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + /* + * ok we don't have enough space, but maybe we have free space on our + * devices to allocate new chunks for relocation, so loop through our + * alloc devices and guess if we have enough space. if this block + * group is going to be restriped, run checks against the target + * profile instead of the current one. + */ + ret = -1; + + /* + * index: + * 0: raid10 + * 1: raid1 + * 2: dup + * 3: raid0 + * 4: single + */ + target = get_restripe_target(root->fs_info, block_group->flags); + if (target) { + index = __get_raid_index(extended_to_chunk(target)); + } else { + /* + * this is just a balance, so if we were marked as full + * we know there is no space for a new chunk + */ + if (full) + goto out; + + index = get_block_group_index(block_group); + } + + if (index == BTRFS_RAID_RAID10) { + dev_min = 4; + /* Divide by 2 */ + min_free >>= 1; + } else if (index == BTRFS_RAID_RAID1) { + dev_min = 2; + } else if (index == BTRFS_RAID_DUP) { + /* Multiply by 2 */ + min_free <<= 1; + } else if (index == BTRFS_RAID_RAID0) { + dev_min = fs_devices->rw_devices; + min_free = div64_u64(min_free, dev_min); + } + + /* We need to do this so that we can look at pending chunks */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + mutex_lock(&root->fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 dev_offset; + + /* + * check to make sure we can actually find a chunk with enough + * space to fit our block group in. + */ + if (device->total_bytes > device->bytes_used + min_free && + !device->is_tgtdev_for_dev_replace) { + ret = find_free_dev_extent(trans, device, min_free, + &dev_offset, NULL); + if (!ret) + dev_nr++; + + if (dev_nr >= dev_min) + break; + + ret = -1; + } + } + mutex_unlock(&root->fs_info->chunk_mutex); + btrfs_end_transaction(trans, root); +out: + btrfs_put_block_group(block_group); + return ret; +} + +static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +{ + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } + path->slots[0]++; + } +out: + return ret; +} + +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = next_block_group(info->tree_root, + block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + last = block_group->key.objectid + block_group->key.offset; + btrfs_put_block_group(block_group); + } +} + +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + + down_write(&info->commit_root_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + put_caching_control(caching_ctl); + } + up_write(&info->commit_root_sem); + + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + spin_unlock(&info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_block_group_cache_done(block_group); + + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) + free_excluded_extents(info->extent_root, block_group); + + btrfs_remove_free_space_cache(block_group); + btrfs_put_block_group(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + + /* now that all the block groups are freed, go through and + * free all the space_info structs. This is only called during + * the final stages of unmount, and so we know nobody is + * using them. We call synchronize_rcu() once before we start, + * just to be on the safe side. + */ + synchronize_rcu(); + + release_global_block_rsv(info); + + while (!list_empty(&info->space_info)) { + int i; + + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { + if (WARN_ON(space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0)) { + dump_space_info(space_info, 0, 0); + } + } + list_del(&space_info->list); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); + } + return 0; +} + +static void __link_block_group(struct btrfs_space_info *space_info, + struct btrfs_block_group_cache *cache) +{ + int index = get_block_group_index(cache); + bool first = false; + + down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) + first = true; + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + + if (first) { + struct raid_kobject *rkobj; + int ret; + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) + goto out_err; + rkobj->raid_type = index; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); + if (ret) { + kobject_put(&rkobj->kobj); + goto out_err; + } + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + + return; +out_err: + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); +} + +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); + btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); + + return cache; +} + +int btrfs_read_block_groups(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int need_clear = 0; + u64 cache_gen; + + root = info->extent_root; + key.objectid = 0; + key.offset = 0; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (btrfs_test_opt(root, SPACE_CACHE) && + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(root, CLEAR_CACHE)) + need_clear = 1; + + while (1) { + ret = find_first_block_group(root, path, &key); + if (ret > 0) + break; + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); + if (!cache) { + ret = -ENOMEM; + goto error; + } + + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ + if (btrfs_test_opt(root, SPACE_CACHE)) + cache->disk_cache_state = BTRFS_DC_CLEAR; + } + + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + cache->flags = btrfs_block_group_flags(&cache->item); + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(path); + + /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. + */ + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + btrfs_put_block_group(cache); + goto error; + } + + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _alot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + free_excluded_extents(root, cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); + free_excluded_extents(root, cache); + } + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + goto error; + } + + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); + spin_unlock(&info->block_group_cache_lock); + btrfs_put_block_group(cache); + goto error; + } + + cache->space_info = space_info; + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_readonly += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + + __link_block_group(space_info, cache); + + set_avail_alloc_bits(root->fs_info, cache->flags); + if (btrfs_chunk_readonly(root, cache->key.objectid)) { + set_block_group_ro(cache, 1); + } else if (btrfs_block_group_used(&cache->item) == 0) { + spin_lock(&info->unused_bgs_lock); + /* Should always be true but just in case. */ + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } + } + + list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { + if (!(get_alloc_profile(root, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) + set_block_group_ro(cache, 1); + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) + set_block_group_ro(cache, 1); + } + + init_global_block_rsv(info); + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + if (ret) + goto next; + + spin_lock(&block_group->lock); + memcpy(&item, &block_group->item, sizeof(item)); + memcpy(&key, &block_group->key, sizeof(key)); + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); + ret = btrfs_finish_chunk_alloc(trans, extent_root, + key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); +next: + list_del_init(&block_group->bg_list); + } +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size) +{ + int ret; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + + extent_root = root->fs_info->extent_root; + + btrfs_set_log_full_commit(root->fs_info, trans); + + cache = btrfs_create_block_group_cache(root, chunk_offset, size); + if (!cache) + return -ENOMEM; + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + btrfs_set_block_group_flags(&cache->item, type); + + cache->flags = type; + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + btrfs_put_block_group(cache); + return ret; + } + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + + free_excluded_extents(root, cache); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + return ret; + } + update_global_block_rsv(root->fs_info); + + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_readonly += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + + __link_block_group(cache->space_info, cache); + + list_add_tail(&cache->bg_list, &trans->new_bgs); + + set_avail_alloc_bits(extent_root->fs_info, type); + + return 0; +} + +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start, + struct extent_map *em) +{ + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; + struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_key key; + struct inode *inode; + struct kobject *kobj = NULL; + int ret; + int index; + int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(root->fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + /* + * Free the reserved super bytes from this block group before + * remove it. + */ + free_excluded_extents(root, block_group); + + memcpy(&key, &block_group->key, sizeof(key)); + index = get_block_group_index(block_group); + if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ + inode = lookup_free_space_inode(tree_root, block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * make sure our free spache cache IO is done before remove the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + + if (!IS_ERR(inode)) { + ret = btrfs_orphan_add(trans, inode); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + btrfs_add_delayed_iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(path); + } + + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&block_group->cache_node, + &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + + if (root->fs_info->first_logical_byte == block_group->key.objectid) + root->fs_info->first_logical_byte = (u64)-1; + spin_unlock(&root->fs_info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; + clear_avail_alloc_bits(root->fs_info, block_group->flags); + } + up_write(&block_group->space_info->groups_sem); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + + if (block_group->has_caching_ctl) + caching_ctl = get_caching_control(block_group); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_block_group_cache_done(block_group); + if (block_group->has_caching_ctl) { + down_write(&root->fs_info->commit_root_sem); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, + &root->fs_info->caching_block_groups, list) + if (ctl->block_group == block_group) { + caching_ctl = ctl; + atomic_inc(&caching_ctl->count); + break; + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + up_write(&root->fs_info->commit_root_sem); + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + put_caching_control(caching_ctl); + put_caching_control(caching_ctl); + } + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->dirty_list)) { + WARN_ON(1); + } + if (!list_empty(&block_group->io_list)) { + WARN_ON(1); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_remove_free_space_cache(block_group); + + spin_lock(&block_group->space_info->lock); + list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->key.offset); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->key.offset); + WARN_ON(block_group->space_info->disk_total + < block_group->key.offset * factor); + } + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + block_group->space_info->disk_total -= block_group->key.offset * factor; + + spin_unlock(&block_group->space_info->lock); + + memcpy(&key, &block_group->key, sizeof(key)); + + lock_chunks(root); + if (!list_empty(&em->list)) { + /* We're in the transaction->pending_chunks list. */ + free_extent_map(em); + } + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + /* + * Make sure a trimmer task always sees the em in the pinned_chunks list + * if it sees block_group->removed == 1 (needs to lock block_group->lock + * before checking block_group->removed). + */ + if (!remove_em) { + /* + * Our em might be in trans->transaction->pending_chunks which + * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), + * and so is the fs_info->pinned_chunks list. + * + * So at this point we must be holding the chunk_mutex to avoid + * any races with chunk allocation (more specifically at + * volumes.c:contains_pending_extent()), to ensure it always + * sees the em, either in the pending_chunks list or in the + * pinned_chunks list. + */ + list_move_tail(&em->list, &root->fs_info->pinned_chunks); + } + spin_unlock(&block_group->lock); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + /* + * The em might be in the pending_chunks list, so make sure the + * chunk mutex is locked, since remove_extent_mapping() will + * delete us from that list. + */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + + unlock_chunks(root); + + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. + */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!fs_info->open) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + u64 start, end; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + space_info = block_group->space_info; + list_del_init(&block_group->bg_list); + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + spin_lock(&block_group->lock); + if (block_group->reserved || + btrfs_block_group_used(&block_group->item) || + block_group->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = set_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + /* 1 for btrfs_orphan_reserve_metadata() */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_set_block_group_rw(root, block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. + */ + start = block_group->key.objectid; + end = start + block_group->key.offset - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY, GFP_NOFS); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY, GFP_NOFS); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + space_info->bytes_pinned -= block_group->pinned; + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add(&space_info->total_bytes_pinned, + -block_group->pinned); + block_group->pinned = 0; + + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, root, + block_group->key.objectid); +end_trans: + btrfs_end_transaction(trans, root); +next: + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return 1; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } +out: + return ret; +} + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end, false); +} + +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 group_trimmed; + u64 start; + u64 end; + u64 trimmed = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + int ret = 0; + + /* + * try to trim all FS space, our block group may start from non-zero. + */ + if (range->len == total_bytes) + cache = btrfs_lookup_first_block_group(fs_info, range->start); + else + cache = btrfs_lookup_block_group(fs_info, range->start); + + while (cache) { + if (cache->key.objectid >= (range->start + range->len)) { + btrfs_put_block_group(cache); + break; + } + + start = max(range->start, cache->key.objectid); + end = min(range->start + range->len, + cache->key.objectid + cache->key.offset); + + if (end - start >= range->minlen) { + if (!block_group_cache_done(cache)) { + ret = cache_block_group(cache, 0); + if (ret) { + btrfs_put_block_group(cache); + break; + } + ret = wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } + } + ret = btrfs_trim_block_group(cache, + &group_trimmed, + start, + end, + range->minlen); + + trimmed += group_trimmed; + if (ret) { + btrfs_put_block_group(cache); + break; + } + } + + cache = next_block_group(fs_info->tree_root, cache); + } + + range->len = trimmed; + return ret; +} + +/* + * btrfs_{start,end}_write_no_snapshoting() are similar to + * mnt_{want,drop}_write(), they are used to prevent some tasks from writing + * data into the page cache through nocow before the subvolume is snapshoted, + * but flush the data into disk after the snapshot creation, or to prevent + * operations while snapshoting is ongoing and that cause the snapshot to be + * inconsistent (writes followed by expanding truncates for example). + */ +void btrfs_end_write_no_snapshoting(struct btrfs_root *root) +{ + percpu_counter_dec(&root->subv_writers->counter); + /* + * Make sure counter is updated before we wake up + * waiters. + */ + smp_mb(); + if (waitqueue_active(&root->subv_writers->wait)) + wake_up(&root->subv_writers->wait); +} + +int btrfs_start_write_no_snapshoting(struct btrfs_root *root) +{ + if (atomic_read(&root->will_be_snapshoted)) + return 0; + + percpu_counter_inc(&root->subv_writers->counter); + /* + * Make sure counter is updated before we check for snapshot creation. + */ + smp_mb(); + if (atomic_read(&root->will_be_snapshoted)) { + btrfs_end_write_no_snapshoting(root); + return 0; + } + return 1; +} diff --git a/kernel/fs/btrfs/extent_io.c b/kernel/fs/btrfs/extent_io.c new file mode 100644 index 000000000..c32d226bf --- /dev/null +++ b/kernel/fs/btrfs/extent_io.c @@ -0,0 +1,5632 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_io.h" +#include "extent_map.h" +#include "ctree.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "check-integrity.h" +#include "locking.h" +#include "rcu-string.h" +#include "backref.h" + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; +static struct bio_set *btrfs_bioset; + +static inline bool extent_state_in_tree(const struct extent_state *state) +{ + return !RB_EMPTY_NODE(&state->rb_node); +} + +#ifdef CONFIG_BTRFS_DEBUG +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +static DEFINE_SPINLOCK(leak_lock); + +static inline +void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(new, head); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_del(struct list_head *entry) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(entry); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_check(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", + state->start, state->end, state->state, + extent_state_in_tree(state), + atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu " + "refs %d\n", + eb->start, eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } +} + +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct extent_io_tree *tree, u64 start, u64 end) +{ + struct inode *inode; + u64 isize; + + if (!tree->mapping) + return; + + inode = tree->mapping->host; + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + printk_ratelimited(KERN_DEBUG + "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + caller, btrfs_ino(inode), isize, start, end); + } +} +#else +#define btrfs_leak_debug_add(new, head) do {} while (0) +#define btrfs_leak_debug_del(entry) do {} while (0) +#define btrfs_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#endif + +#define BUFFER_LRU_MAX 64 + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; + unsigned long bio_flags; + + /* tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; +}; + +static noinline void flush_write_bio(void *data); +static inline struct btrfs_fs_info * +tree_fs_info(struct extent_io_tree *tree) +{ + if (!tree->mapping) + return NULL; + return btrfs_sb(tree->mapping->host->i_sb); +} + +int __init extent_io_init(void) +{ + extent_state_cache = kmem_cache_create("btrfs_extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_buffer_cache) + goto free_state_cache; + + btrfs_bioset = bioset_create(BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio)); + if (!btrfs_bioset) + goto free_buffer_cache; + + if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + goto free_bioset; + + return 0; + +free_bioset: + bioset_free(btrfs_bioset); + btrfs_bioset = NULL; + +free_buffer_cache: + kmem_cache_destroy(extent_buffer_cache); + extent_buffer_cache = NULL; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); + extent_state_cache = NULL; + return -ENOMEM; +} + +void extent_io_exit(void) +{ + btrfs_leak_debug_check(); + + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); + if (btrfs_bioset) + bioset_free(btrfs_bioset); +} + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping) +{ + tree->state = RB_ROOT; + tree->ops = NULL; + tree->dirty_bytes = 0; + spin_lock_init(&tree->lock); + tree->mapping = mapping; +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + state->private = 0; + RB_CLEAR_NODE(&state->rb_node); + btrfs_leak_debug_add(&state->leak_list, &states); + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); + return state; +} + +void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { + WARN_ON(extent_state_in_tree(state)); + btrfs_leak_debug_del(&state->leak_list); + trace_free_extent_state(state, _RET_IP_); + kmem_cache_free(extent_state_cache, state); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, + struct rb_node *search_start, + u64 offset, + struct rb_node *node, + struct rb_node ***p_in, + struct rb_node **parent_in) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + if (p_in && parent_in) { + p = *p_in; + parent = *parent_in; + goto do_insert; + } + + p = search_start ? &search_start : &root->rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + +do_insert: + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret, + struct rb_node ***p_ret, + struct rb_node **parent_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **n = &root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while (*n) { + prev = *n; + entry = rb_entry(prev, struct tree_entry, rb_node); + prev_entry = entry; + + if (offset < entry->start) + n = &(*n)->rb_left; + else if (offset > entry->end) + n = &(*n)->rb_right; + else + return *n; + } + + if (p_ret) + *p_ret = n; + if (parent_ret) + *parent_ret = prev; + + if (prev_ret) { + orig_prev = prev; + while (prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node * +tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***p_ret, + struct rb_node **parent_ret) +{ + struct rb_node *prev = NULL; + struct rb_node *ret; + + ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); + if (!ret) + return prev; + return ret; +} + +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static void merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + return; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + merge_cb(tree, state, other); + state->start = other->start; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + merge_cb(tree, state, other); + state->end = other->end; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } + } +} + +static void set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, unsigned *bits) +{ + if (tree->ops && tree->ops->set_bit_hook) + tree->ops->set_bit_hook(tree->mapping->host, state, bits); +} + +static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, unsigned *bits) +{ + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, unsigned *bits); + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + struct rb_node ***p, + struct rb_node **parent, + unsigned *bits) +{ + struct rb_node *node; + + if (end < start) + WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", + end, start); + state->start = start; + state->end = end; + + set_state_bits(tree, state, bits); + + node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk(KERN_ERR "BTRFS: found node %llu %llu on insert of " + "%llu %llu\n", + found->start, found->end, start, end); + return -EEXIST; + } + merge_state(tree, state); + return 0; +} + +static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + tree->ops->split_extent_hook(tree->mapping->host, orig, split); +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + + split_cb(tree, orig, split); + + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, + &prealloc->rb_node, NULL, NULL); + if (node) { + free_extent_state(prealloc); + return -EEXIST; + } + return 0; +} + +static struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + unsigned *bits, int wake) +{ + struct extent_state *next; + unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; + + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); + state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (state->state == 0) { + next = next_state(state); + if (extent_state_in_tree(state)) { + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + next = next_state(state); + } + return next; +} + +static struct extent_state * +alloc_extent_state_atomic(struct extent_state *prealloc) +{ + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + + return prealloc; +} + +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree_fs_info(tree), err, "Locking error: " + "Extent tree was modified by another " + "thread while locked."); +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns 0 on success and < 0 on error. + */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *cached; + struct extent_state *prealloc = NULL; + struct rb_node *node; + u64 last_end; + int err; + int clear = 0; + + btrfs_debug_check_extent_io_range(tree, start, end); + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + clear = 1; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + + if (cached && extent_state_in_tree(cached) && + cached->start <= start && cached->end > start) { + if (clear) + atomic_dec(&cached->refs); + state = cached; + goto hit_next; + } + if (clear) + free_extent_state(cached); + } + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); + last_end = state->end; + + /* the state doesn't have the wanted bits, go ahead */ + if (!(state->state & bits)) { + state = next_state(state); + goto next; + } + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. + */ + + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + state = clear_state_bit(tree, state, &bits, wake); + goto next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + if (wake) + wake_up(&state->wq); + + clear_state_bit(tree, prealloc, &bits, wake); + + prealloc = NULL; + goto out; + } + + state = clear_state_bit(tree, state, &bits, wake); +next: + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && state && !need_resched()) + goto hit_next; + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits) +{ + struct extent_state *state; + struct rb_node *node; + + btrfs_debug_check_extent_io_range(tree, start, end); + + spin_lock(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(tree, start); +process_node: + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (!cond_resched_lock(&tree->lock)) { + node = rb_next(node); + goto process_node; + } + } +out: + spin_unlock(&tree->lock); +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + unsigned *bits) +{ + unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; + + set_state_cb(tree, state, bits); + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + state->state |= bits_to_set; +} + +static void cache_state_if_flags(struct extent_state *state, + struct extent_state **cached_ptr, + unsigned flags) +{ + if (cached_ptr && !(*cached_ptr)) { + if (!flags || (state->state & flags)) { + *cached_ptr = state; + atomic_inc(&state->refs); + } + } +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_IOBITS | EXTENT_BOUNDARY); +} + +/* + * set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. + * + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. + * + * [start, end] is inclusive This takes the tree lock. + */ + +static int __must_check +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, unsigned exclusive_bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + + btrfs_debug_check_extent_io_range(tree, start, end); + + bits |= EXTENT_FIRST_DELALLOC; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + BUG_ON(!prealloc); + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) { + node = &state->rb_node; + goto hit_next; + } + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search_for_insert(tree, start, &p, &parent); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); + if (err) + extent_io_tree_panic(tree, err); + + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + + set_state_bits(tree, state, &bits); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ + err = insert_state(tree, prealloc, start, this_end, + NULL, NULL, &bits); + if (err) + extent_io_tree_panic(tree, err); + + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, u64 * failed_start, + struct extent_state **cached_state, gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, 0, failed_start, + cached_state, mask); +} + + +/** + * convert_extent_bit - convert all bits in a given range from one bit to + * another + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache + * @mask: the allocation mask + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, unsigned clear_bits, + struct extent_state **cached_state, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + bool first_iteration = true; + + btrfs_debug_check_extent_io_range(tree, start, end); + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + /* + * Best effort, don't worry if extent state allocation fails + * here for the first iteration. We might have a cached state + * that matches exactly the target range, in which case no + * extent state allocations are needed. We'll only know this + * after locking the tree. + */ + prealloc = alloc_extent_state(mask); + if (!prealloc && !first_iteration) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) { + node = &state->rb_node; + goto hit_next; + } + } + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search_for_insert(tree, start, &p, &parent); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + set_state_bits(tree, state, &bits); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, &clear_bits, 0); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, &clear_bits, 0); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ + err = insert_state(tree, prealloc, start, this_end, + NULL, NULL, &bits); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); + clear_state_bit(tree, prealloc, &clear_bits, 0); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + first_iteration = false; + goto again; +} + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, + NULL, mask); +} + +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, NULL, + NULL, mask); +} + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); +} + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE, + NULL, cached_state, mask); +} + +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); +} + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); +} + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, + NULL, mask); +} + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); +} + +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, mask); +} + +/* + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. + */ +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, struct extent_state **cached_state) +{ + int err; + u64 failed_start; + + while (1) { + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS); + if (err == -EEXIST) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else + break; + WARN_ON(start > end); + } + return err; +} + +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return lock_extent_bits(tree, start, end, 0, NULL); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); + return 0; + } + return 1; +} + +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + GFP_NOFS); +} + +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + __set_page_dirty_nobuffers(page); + account_page_redirty(page); + page_cache_release(page); + index++; + } + return 0; +} + +/* + * helper function to set both pages and extents in the tree writeback + */ +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + set_page_writeback(page); + page_cache_release(page); + index++; + } + return 0; +} + +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ +static struct extent_state * +find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, unsigned bits) +{ + struct rb_node *node; + struct extent_state *state; + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) + return state; + + node = rb_next(node); + if (!node) + break; + } +out: + return NULL; +} + +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned. If found something, return 0. + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + struct rb_node *n; + int ret = 1; + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && extent_state_in_tree(state)) { + n = rb_next(&state->rb_node); + while (n) { + state = rb_entry(n, struct extent_state, + rb_node); + if (state->state & bits) + goto got_it; + n = rb_next(n); + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + + state = find_first_extent_bit_state(tree, start, bits); +got_it: + if (state) { + cache_state_if_flags(state, cached_state, 0); + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes, + struct extent_state **cached_state) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) { + if (!found) + *end = (u64)-1; + goto out; + } + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) { + *start = state->start; + *cached_state = state; + atomic_inc(&state->refs); + } + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return found; +} + +static noinline void __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while (nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } + page_cache_release(pages[i]); + pages_locked++; + } + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; +done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +STATIC u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes, &cached_state); + if (!found || delalloc_end <= *start) { + *start = delalloc_start; + *end = delalloc_end; + free_extent_state(cached_state); + return 0; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + */ + if (delalloc_end + 1 - delalloc_start > max_bytes) + delalloc_end = delalloc_start + max_bytes - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + free_extent_state(cached_state); + cached_state = NULL; + if (!loops) { + max_bytes = PAGE_CACHE_SIZE; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ + + /* step three, lock the state bits for the whole range */ + lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1, cached_state); + if (!ret) { + unlock_extent_cached(tree, delalloc_start, delalloc_end, + &cached_state, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + free_extent_state(cached_state); + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned clear_bits, + unsigned long page_ops) +{ + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); + if (page_ops == 0) + return 0; + + if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + mapping_set_error(inode->i_mapping, -EIO); + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + + if (page_ops & PAGE_SET_PRIVATE2) + SetPagePrivate2(pages[i]); + + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (page_ops & PAGE_CLEAR_DIRTY) + clear_page_dirty_for_io(pages[i]); + if (page_ops & PAGE_SET_WRITEBACK) + set_page_writeback(pages[i]); + if (page_ops & PAGE_SET_ERROR) + SetPageError(pages[i]); + if (page_ops & PAGE_END_WRITEBACK) + end_page_writeback(pages[i]); + if (page_ops & PAGE_UNLOCK) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned bits, int contig) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + u64 last = 0; + int found = 0; + + if (WARN_ON(search_end <= cur_start)) + return 0; + + spin_lock(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = max(cur_start, state->start); + found = 1; + } + last = state->end; + } else if (contig && found) { + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return total_bytes; +} + +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ +static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + spin_unlock(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if every extent in the tree + * has the bits set. Otherwise, 1 is returned if any bit in the + * range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int filled, struct extent_state *cached) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); + if (cached && extent_state_in_tree(cached) && cached->start <= start && + cached->end > start) + node = &cached->rb_node; + else + node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + + if (state->end == (u64)-1) + break; + + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) + SetPageUptodate(page); +} + +int free_io_failure(struct inode *inode, struct io_failure_record *rec) +{ + int ret; + int err = 0; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + + set_state_private(failure_tree, rec->start, 0); + ret = clear_extent_bits(failure_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret) + err = ret; + + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; + + kfree(rec); + return err; +} + +/* + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchronization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. + */ +int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, + struct page *page, unsigned int pg_offset, int mirror_num) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct bio *bio; + struct btrfs_device *dev; + u64 map_length = 0; + u64 sector; + struct btrfs_bio *bbio = NULL; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + int ret; + + ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); + BUG_ON(!mirror_num); + + /* we can't repair anything in raid56 yet */ + if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) + return 0; + + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_iter.bi_size = 0; + map_length = length; + + ret = btrfs_map_block(fs_info, WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); + sector = bbio->stripes[mirror_num-1].physical >> 9; + bio->bi_iter.bi_sector = sector; + dev = bbio->stripes[mirror_num-1].dev; + btrfs_put_bbio(bbio); + if (!dev || !dev->bdev || !dev->writeable) { + bio_put(bio); + return -EIO; + } + bio->bi_bdev = dev->bdev; + bio_add_page(bio, page, length, pg_offset); + + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { + /* try to remap that extent elsewhere? */ + bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + return -EIO; + } + + printk_ratelimited_in_rcu(KERN_INFO + "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", + btrfs_ino(inode), start, + rcu_str_deref(dev->name), sector); + bio_put(bio); + return 0; +} + +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num) +{ + u64 start = eb->start; + unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + ret = repair_io_failure(root->fs_info->btree_inode, start, + PAGE_CACHE_SIZE, start, p, + start - page_offset(p), mirror_num); + if (ret) + break; + start += PAGE_CACHE_SIZE; + } + + return ret; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failrec; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_state *state; + int num_copies; + int ret; + + private = 0; + ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0); + if (!ret) + return 0; + + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, + &private_failure); + if (ret) + return 0; + + failrec = (struct io_failure_record *)(unsigned long) private_failure; + BUG_ON(!failrec->this_mirror); + + if (failrec->in_validation) { + /* there was no real error, just free the record */ + pr_debug("clean_io_failure: freeing dummy error at %llu\n", + failrec->start); + goto out; + } + if (fs_info->sb->s_flags & MS_RDONLY) + goto out; + + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + + if (state && state->start <= failrec->start && + state->end >= failrec->start + failrec->len - 1) { + num_copies = btrfs_num_copies(fs_info, failrec->logical, + failrec->len); + if (num_copies > 1) { + repair_io_failure(inode, start, failrec->len, + failrec->logical, page, + pg_offset, failrec->failed_mirror); + } + } + +out: + free_io_failure(inode, failrec); + + return 0; +} + +/* + * Can be called when + * - hold extent lock + * - under ordered extent + * - the inode is freeing + */ +void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) +{ + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct io_failure_record *failrec; + struct extent_state *state, *next; + + if (RB_EMPTY_ROOT(&failure_tree->state)) + return; + + spin_lock(&failure_tree->lock); + state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); + while (state) { + if (state->start > end) + break; + + ASSERT(state->end <= end); + + next = next_state(state); + + failrec = (struct io_failure_record *)(unsigned long)state->private; + free_extent_state(state); + kfree(failrec); + + state = next; + } + spin_unlock(&failure_tree->lock); +} + +int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, + struct io_failure_record **failrec_ret) +{ + struct io_failure_record *failrec; + u64 private; + struct extent_map *em; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret; + u64 logical; + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return -EIO; + } + + if (em->start > start || em->start + em->len <= start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + if (!em) { + kfree(failrec); + return -EIO; + } + + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + + pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n", + logical, start, failrec->len); + + failrec->logical = logical; + free_extent_map(em); + + /* set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret >= 0) + ret = set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + /* set the bits in the inode's tree */ + if (ret >= 0) + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, + GFP_NOFS); + if (ret < 0) { + kfree(failrec); + return ret; + } + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n", + failrec->logical, failrec->start, failrec->len, + failrec->in_validation); + /* + * when data can be on disk more than twice, add to failrec here + * (e.g. with a list for failed_mirror) to make + * clean_io_failure() clean all those errors at once. + */ + } + + *failrec_ret = failrec; + + return 0; +} + +int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, int failed_mirror) +{ + int num_copies; + + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + /* + * there are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + */ + if (failed_bio->bi_vcnt > 1) { + /* + * to fulfill b), we need to know the exact failing sectors, as + * we don't want to rewrite any more than the failed ones. thus, + * we need separate read requests for the failed bio + * + * if the following BUG_ON triggers, our validation request got + * merged. we need separate requests for our algorithm to work. + */ + BUG_ON(failrec->in_validation); + failrec->in_validation = 1; + failrec->this_mirror = failed_mirror; + } else { + /* + * we're ready to fulfill a) and b) alongside. get a good copy + * of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + if (failrec->in_validation) { + BUG_ON(failrec->this_mirror != failed_mirror); + failrec->in_validation = 0; + failrec->this_mirror = 0; + } + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + } + + if (failrec->this_mirror > num_copies) { + pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + return 1; +} + + +struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, + struct page *page, int pg_offset, int icsum, + bio_end_io_t *endio_func, void *data) +{ + struct bio *bio; + struct btrfs_io_bio *btrfs_failed_bio; + struct btrfs_io_bio *btrfs_bio; + + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) + return NULL; + + bio->bi_end_io = endio_func; + bio->bi_iter.bi_sector = failrec->logical >> 9; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_iter.bi_size = 0; + bio->bi_private = data; + + btrfs_failed_bio = btrfs_io_bio(failed_bio); + if (btrfs_failed_bio->csum) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = btrfs_bio->csum_inline; + icsum *= csum_size; + memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, + csum_size); + } + + bio_add_page(bio, page, failrec->len, pg_offset); + + return bio; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). if other copies exist, read those and write back + * good data to the failed position. does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int failed_mirror) +{ + struct io_failure_record *failrec; + struct inode *inode = page->mapping->host; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct bio *bio; + int read_mode; + int ret; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + if (ret) + return ret; + + ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); + if (!ret) { + free_io_failure(inode, failrec); + return -EIO; + } + + if (failed_bio->bi_vcnt > 1) + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + else + read_mode = READ_SYNC; + + phy_offset >>= inode->i_sb->s_blocksize_bits; + bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, + start - page_offset(page), + (int)phy_offset, failed_bio->bi_end_io, + NULL); + if (!bio) { + free_io_failure(inode, failrec); + return -EIO; + } + + pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n", + read_mode, failrec->this_mirror, failrec->in_validation); + + ret = tree->ops->submit_bio_hook(inode, read_mode, bio, + failrec->this_mirror, + failrec->bio_flags, 0); + if (ret) { + free_io_failure(inode, failrec); + bio_put(bio); + } + + return ret; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ + int uptodate = (err == 0); + struct extent_io_tree *tree; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate) { + ClearPageUptodate(page); + SetPageError(page); + ret = ret < 0 ? ret : -EIO; + mapping_set_error(page->mapping, ret); + } + return 0; +} + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_writepage(struct bio *bio, int err) +{ + struct bio_vec *bvec; + u64 start; + u64 end; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page write in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } + + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; + + if (end_extent_writepage(page, err, start, end)) + continue; + + end_page_writeback(page); + } + + bio_put(bio); +} + +static void +endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, + int uptodate) +{ + struct extent_state *cached = NULL; + u64 end = start + len - 1; + + if (uptodate && tree->track_uptodate) + set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct bio *bio, int err) +{ + struct bio_vec *bvec; + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct extent_io_tree *tree; + u64 offset = 0; + u64 start; + u64 end; + u64 len; + u64 extent_start = 0; + u64 extent_len = 0; + int mirror; + int ret; + int i; + + if (err) + uptodate = 0; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + struct inode *inode = page->mapping->host; + + pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " + "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, + io_bio->mirror_num); + tree = &BTRFS_I(inode)->io_tree; + + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page read in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } + + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; + len = bvec->bv_len; + + mirror = io_bio->mirror_num; + if (likely(uptodate && tree->ops && + tree->ops->readpage_end_io_hook)) { + ret = tree->ops->readpage_end_io_hook(io_bio, offset, + page, start, end, + mirror); + if (ret) + uptodate = 0; + else + clean_io_failure(inode, start, page, 0); + } + + if (likely(uptodate)) + goto readpage_ok; + + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(page, mirror); + if (!ret && !err && + test_bit(BIO_UPTODATE, &bio->bi_flags)) + uptodate = 1; + } else { + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. + */ + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); + if (ret == 0) { + uptodate = + test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; + offset += len; + continue; + } + } +readpage_ok: + if (likely(uptodate)) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned off; + + /* Zero out the end if this page straddles i_size */ + off = i_size & (PAGE_CACHE_SIZE-1); + if (page->index == end_index && off) + zero_user_segment(page, off, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + offset += len; + + if (unlikely(!uptodate)) { + if (extent_len) { + endio_readpage_release_extent(tree, + extent_start, + extent_len, 1); + extent_start = 0; + extent_len = 0; + } + endio_readpage_release_extent(tree, start, + end - start + 1, 0); + } else if (!extent_len) { + extent_start = start; + extent_len = end + 1 - start; + } else if (extent_start + extent_len == start) { + extent_len += end + 1 - start; + } else { + endio_readpage_release_extent(tree, extent_start, + extent_len, uptodate); + extent_start = start; + extent_len = end + 1 - start; + } + } + + if (extent_len) + endio_readpage_release_extent(tree, extent_start, extent_len, + uptodate); + if (io_bio->end_io) + io_bio->end_io(io_bio, err); + bio_put(bio); +} + +/* + * this allocates from the btrfs_bioset. We're returning a bio right now + * but you can call btrfs_io_bio for the appropriate container_of magic + */ +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct btrfs_io_bio *btrfs_bio; + struct bio *bio; + + bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) { + bio = bio_alloc_bioset(gfp_flags, + nr_vecs, btrfs_bioset); + } + } + + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = first_sector; + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return bio; +} + +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + struct btrfs_io_bio *btrfs_bio; + struct bio *new; + + new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); + if (new) { + btrfs_bio = btrfs_io_bio(new); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return new; +} + +/* this also allocates from the btrfs_bioset */ +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + struct btrfs_io_bio *btrfs_bio; + struct bio *bio; + + bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); + if (bio) { + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return bio; +} + + +static int __must_check submit_one_bio(int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + int ret = 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + + start = page_offset(page) + bvec->bv_offset; + + bio->bi_private = NULL; + + bio_get(bio); + + if (tree->ops && tree->ops->submit_bio_hook) + ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags, start); + else + btrfsic_submit_bio(rw, bio); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, + unsigned long offset, size_t size, struct bio *bio, + unsigned long bio_flags) +{ + int ret = 0; + if (tree->ops && tree->ops->merge_bio_hook) + ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, + bio_flags); + BUG_ON(ret < 0); + return ret; + +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func, + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + int ret = 0; + struct bio *bio; + int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (old_compressed) + contig = bio->bi_iter.bi_sector == sector; + else + contig = bio_end_sector(bio) == sector; + + if (prev_bio_flags != bio_flags || !contig || + merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); + if (ret < 0) { + *bio_ret = NULL; + return ret; + } + bio = NULL; + } else { + return 0; + } + } + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + + bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) + return -ENOMEM; + + bio_add_page(bio, page, page_size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + if (bio_ret) + *bio_ret = bio; + else + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + + return ret; +} + +static void attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long)eb); + } else { + WARN_ON(page->private != (unsigned long)eb); + } +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } +} + +static struct extent_map * +__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, + u64 start, u64 len, get_extent_t *get_extent, + struct extent_map **em_cached) +{ + struct extent_map *em; + + if (em_cached && *em_cached) { + em = *em_cached; + if (extent_map_in_tree(em) && start >= em->start && + start < extent_map_end(em)) { + atomic_inc(&em->refs); + return em; + } + + free_extent_map(em); + *em_cached = NULL; + } + + em = get_extent(inode, page, pg_offset, start, len, 0); + if (em_cached && !IS_ERR_OR_NULL(em)) { + BUG_ON(*em_cached); + atomic_inc(&em->refs); + *em_cached = em; + } + return em; +} +/* + * basic readpage implementation. Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + * XXX JDM: This needs looking at to ensure proper page locking + */ +static int __do_readpage(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode = page->mapping->host; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; + size_t pg_offset = 0; + size_t iosize; + size_t disk_io_size; + size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; + + set_page_extent_mapped(page); + + end = page_end; + if (!PageUptodate(page)) { + if (cleancache_get_page(page) == 0) { + BUG_ON(blocksize != PAGE_SIZE); + unlock_extent(tree, start, end); + goto out; + } + } + + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage); + } + } + while (cur <= end) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + + if (cur >= last_byte) { + char *userpage; + struct extent_state *cached = NULL; + + iosize = PAGE_CACHE_SIZE - pg_offset; + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage); + set_extent_uptodate(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + if (!parent_locked) + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); + break; + } + em = __get_extent_map(inode, page, pg_offset, cur, + end - cur + 1, get_extent, em_cached); + if (IS_ERR_OR_NULL(em)) { + SetPageError(page); + if (!parent_locked) + unlock_extent(tree, cur, end); + break; + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + this_bio_flag |= EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = ALIGN(iosize, blocksize); + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } + bdev = em->bdev; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + struct extent_state *cached = NULL; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, + EXTENT_UPTODATE, 1, NULL)) { + check_page_uptodate(tree, page); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + /* we have an inline extent but it didn't get marked up + * to date. Error out + */ + if (block_start == EXTENT_MAP_INLINE) { + SetPageError(page); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + pnr -= page->index; + ret = submit_extent_page(rw, tree, page, + sector, disk_io_size, pg_offset, + bdev, bio, pnr, + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); + if (!ret) { + nr++; + *bio_flags = this_bio_flag; + } else { + SetPageError(page); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); + } + cur = cur + iosize; + pg_offset += iosize; + } +out: + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +static inline void __do_contiguous_readpages(struct extent_io_tree *tree, + struct page *pages[], int nr_pages, + u64 start, u64 end, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode; + struct btrfs_ordered_extent *ordered; + int index; + + inode = pages[0]->mapping->host; + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + for (index = 0; index < nr_pages; index++) { + __do_readpage(tree, pages[index], get_extent, em_cached, bio, + mirror_num, bio_flags, rw); + page_cache_release(pages[index]); + } +} + +static void __extent_readpages(struct extent_io_tree *tree, + struct page *pages[], + int nr_pages, get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + u64 start = 0; + u64 end = 0; + u64 page_start; + int index; + int first_index = 0; + + for (index = 0; index < nr_pages; index++) { + page_start = page_offset(pages[index]); + if (!end) { + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } else if (end + 1 == page_start) { + end += PAGE_CACHE_SIZE; + } else { + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, + bio, mirror_num, bio_flags, + rw); + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } + } + + if (end) + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, bio, + mirror_num, bio_flags, rw); +} + +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode = page->mapping->host; + struct btrfs_ordered_extent *ordered; + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret; + + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, + bio_flags, rw); + return ret; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num) +{ + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, + &bio_flags, READ); + if (bio) + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + return ret; +} + +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num) +{ + struct bio *bio = NULL; + unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; + int ret; + + ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, + &bio_flags, READ); + if (bio) + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + return ret; +} + +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + +/* + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if our fill_delalloc function did all the work required + * to write the page (copy into inline extent). In this case the IO has + * been started and the page is already unlocked. + * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked) + */ +static noinline_for_stack int writepage_delalloc(struct inode *inode, + struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + u64 delalloc_start, + unsigned long *nr_written) +{ + struct extent_io_tree *tree = epd->tree; + u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 nr_delalloc; + u64 delalloc_to_write = 0; + u64 delalloc_end = 0; + int ret; + int page_started = 0; + + if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) + return 0; + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + BTRFS_MAX_EXTENT_SIZE); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + nr_written); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + /* fill_delalloc should be return < 0 for error + * but just in case, we use > 0 here meaning the + * IO is started, so we don't want to return > 0 + * unless things are going well. + */ + ret = ret < 0 ? ret : -EIO; + goto done; + } + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= *nr_written; + return 1; + } + + ret = 0; + +done: + return ret; +} + +/* + * helper for __extent_writepage. This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. + * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct inode *inode, + struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd, + loff_t i_size, + unsigned long nr_written, + int write_flags, int *nr_ret) +{ + struct extent_io_tree *tree = epd->tree; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 block_start; + u64 iosize; + sector_t sector; + struct extent_state *cached_state = NULL; + struct extent_map *em; + struct block_device *bdev; + size_t pg_offset = 0; + size_t blocksize; + int ret = 0; + int nr = 0; + bool compressed; + + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); + + update_nr_written(page, wbc, nr_written); + unlock_page(page); + ret = 1; + goto done_unlocked; + } + } + + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); + + end = page_end; + if (i_size <= start) { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); + goto done; + } + + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + u64 em_end; + if (cur >= i_size) { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); + break; + } + em = epd->get_extent(inode, page, pg_offset, cur, + end - cur + 1, 1); + if (IS_ERR_OR_NULL(em)) { + SetPageError(page); + ret = PTR_ERR_OR_ZERO(em); + break; + } + + extent_offset = cur - em->start; + em_end = extent_map_end(em); + BUG_ON(em_end <= cur); + BUG_ON(end < cur); + iosize = min(em_end - cur, end - cur + 1); + iosize = ALIGN(iosize, blocksize); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + free_extent_map(em); + em = NULL; + + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + cur + iosize - 1, + NULL, 1); + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. this happens + * elsewhere + */ + nr++; + } + + cur += iosize; + pg_offset += iosize; + continue; + } + + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } + if (ret) { + SetPageError(page); + } else { + unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); + } + + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0); + if (ret) + SetPageError(page); + } + cur = cur + iosize; + pg_offset += iosize; + nr++; + } +done: + *nr_ret = nr; + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + int ret; + int nr = 0; + size_t pg_offset = 0; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + + ClearPageError(page); + + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage); + flush_dcache_page(page); + } + + pg_offset = 0; + + set_page_extent_mapped(page); + + ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + + ret = __extent_writepage_io(inode, page, wbc, epd, + i_size, nr_written, write_flags, &nr); + if (ret == 1) + goto done_unlocked; + +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + if (PageError(page)) { + ret = ret < 0 ? ret : -EIO; + end_extent_writepage(page, ret, start, page_end); + } + unlock_page(page); + return ret; + +done_unlocked: + return 0; +} + +void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); +} + +static noinline_for_stack int +lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) +{ + unsigned long i, num_pages; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; + flush_write_bio(epd); + btrfs_tree_lock(eb); + } + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (!epd->sync_io) + return 0; + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + while (1) { + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) + break; + btrfs_tree_unlock(eb); + } + } + + /* + * We need to do this to prevent races in people who check if the eb is + * under IO since we can end up having no IO bits set for a short period + * of time. + */ + spin_lock(&eb->refs_lock); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + spin_unlock(&eb->refs_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); + ret = 1; + } else { + spin_unlock(&eb->refs_lock); + } + + btrfs_tree_unlock(eb); + + if (!ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + if (!trylock_page(p)) { + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + lock_page(p); + } + } + + return ret; +} + +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +static void set_btree_ioerr(struct page *page) +{ + struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode); + + SetPageError(page); + if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; + + /* + * If writeback for a btree extent that doesn't belong to a log tree + * failed, increment the counter transaction->eb_write_errors. + * We do this because while the transaction is running and before it's + * committing (when we call filemap_fdata[write|wait]_range against + * the btree inode), we might have + * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it + * returns an error or an error happens during writeback, when we're + * committing the transaction we wouldn't know about it, since the pages + * can be no longer dirty nor marked anymore for writeback (if a + * subsequent modification to the extent buffer didn't happen before the + * transaction commit), which makes filemap_fdata[write|wait]_range not + * able to find the pages tagged with SetPageError at transaction + * commit time. So if this happens we must abort the transaction, + * otherwise we commit a super block with btree roots that point to + * btree nodes/leafs whose content on disk is invalid - either garbage + * or the content of some node/leaf from a past generation that got + * cowed or deleted and is no longer valid. + * + * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would + * not be enough - we need to distinguish between log tree extents vs + * non-log tree extents, and the next filemap_fdatawait_range() call + * will catch and clear such errors in the mapping - and that call might + * be from a log sync and not from a transaction commit. Also, checking + * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is + * not done and would not be reliable - the eb might have been released + * from memory and reading it back again means that flag would not be + * set (since it's a runtime flag, not persisted on disk). + * + * Using the flags below in the btree inode also makes us achieve the + * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started + * writeback for all dirty pages and before filemap_fdatawait_range() + * is called, the writeback for all dirty pages had already finished + * with errors - because we were not using AS_EIO/AS_ENOSPC, + * filemap_fdatawait_range() would return success, as it could not know + * that writeback errors happened (the pages were no longer tagged for + * writeback). + */ + switch (eb->log_index) { + case -1: + set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags); + break; + case 0: + set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); + break; + case 1: + set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + break; + default: + BUG(); /* unexpected, logic error */ + } +} + +static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +{ + struct bio_vec *bvec; + struct extent_buffer *eb; + int i, done; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + done = atomic_dec_and_test(&eb->io_pages); + + if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { + ClearPageUptodate(page); + set_btree_ioerr(page); + } + + end_page_writeback(page); + + if (!done) + continue; + + end_extent_buffer_writeback(eb); + } + + bio_put(bio); +} + +static noinline_for_stack int write_one_eb(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct block_device *bdev = fs_info->fs_devices->latest_bdev; + struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + u64 offset = eb->start; + unsigned long i, num_pages; + unsigned long bio_flags = 0; + int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; + int ret = 0; + + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + atomic_set(&eb->io_pages, num_pages); + if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) + bio_flags = EXTENT_BIO_TREE_LOG; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + clear_page_dirty_for_io(p); + set_page_writeback(p); + ret = submit_extent_page(rw, tree, p, offset >> 9, + PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + -1, end_bio_extent_buffer_writepage, + 0, epd->bio_flags, bio_flags); + epd->bio_flags = bio_flags; + if (ret) { + set_btree_ioerr(p); + end_page_writeback(p); + if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) + end_extent_buffer_writeback(eb); + ret = -EIO; + break; + } + offset += PAGE_CACHE_SIZE; + update_nr_written(p, wbc, 1); + unlock_page(p); + } + + if (unlikely(ret)) { + for (; i < num_pages; i++) { + struct page *p = eb->pages[i]; + clear_page_dirty_for_io(p); + unlock_page(p); + } + } + + return ret; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct extent_buffer *eb, *prev_eb = NULL; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, + }; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!PagePrivate(page)) + continue; + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + break; + } + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + continue; + } + + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON + * but no sense in crashing the users box for something + * we can survive anyway. + */ + if (WARN_ON(!eb)) { + spin_unlock(&mapping->private_lock); + continue; + } + + if (eb == prev_eb) { + spin_unlock(&mapping->private_lock); + continue; + } + + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) + continue; + + prev_eb = eb; + ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + if (!ret) { + free_extent_buffer(eb); + continue; + } + + ret = write_one_eb(eb, fs_info, wbc, &epd); + if (ret) { + done = 1; + free_extent_buffer(eb); + break; + } + free_extent_buffer(eb); + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + flush_write_bio(&epd); + return ret; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +static int extent_write_cache_pages(struct extent_io_tree *tree, + struct address_space *mapping, + struct writeback_control *wbc, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) +{ + struct inode *inode = mapping->host; + int ret = 0; + int done = 0; + int err = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + /* + * We have to hold onto the inode so that ordered extents can do their + * work when the IO finishes. The alternative to this is failing to add + * an ordered extent if the igrab() fails there and that is a huge pain + * to deal with, so instead just hold onto the inode throughout the + * writepages operation. If it fails here we are freeing up the inode + * anyway and we'd rather not waste our time writing out stuff that is + * going to be truncated anyway. + */ + if (!igrab(inode)) + return 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); + wait_on_page_writeback(page); + } + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (!err && ret < 0) + err = ret; + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done && !err) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + btrfs_add_delayed_iput(inode); + return err; +} + +static void flush_epd_write_bio(struct extent_page_data *epd) +{ + if (epd->bio) { + int rw = WRITE; + int ret; + + if (epd->sync_io) + rw = WRITE_SYNC; + + ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); + BUG_ON(ret < 0); /* -ENOMEM */ + epd->bio = NULL; + } +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, + }; + + ret = __extent_writepage(page, wbc, &epd); + + flush_epd_write_bio(&epd); + return ret; +} + +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, + .bio_flags = 0, + }; + struct writeback_control wbc_writepages = { + .sync_mode = mode, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + flush_epd_write_bio(&epd); + return ret; +} + +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret = 0; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, + }; + + ret = extent_write_cache_pages(tree, mapping, wbc, + __extent_writepage, &epd, + flush_write_bio); + flush_epd_write_bio(&epd); + return ret; +} + +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + unsigned long bio_flags = 0; + struct page *pagepool[16]; + struct page *page; + struct extent_map *em_cached = NULL; + int nr = 0; + + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_NOFS)) { + page_cache_release(page); + continue; + } + + pagepool[nr++] = page; + if (nr < ARRAY_SIZE(pagepool)) + continue; + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); + nr = 0; + } + if (nr) + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); + + if (em_cached) + free_extent_map(em_cached); + + BUG_ON(!list_empty(pages)); + if (bio) + return submit_one_bio(READ, bio, 0, bio_flags); + return 0; +} + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset) +{ + struct extent_state *cached_state = NULL; + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += ALIGN(offset, blocksize); + if (start > end) + return 0; + + lock_extent_bits(tree, start, end, 0, &cached_state); + wait_on_page_writeback(page); + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, + 1, 1, &cached_state, GFP_NOFS); + return 0; +} + +/* + * a helper for releasepage, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. + */ +static int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, + struct page *page, gfp_t mask) +{ + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, + EXTENT_IOBITS, 0, NULL)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + /* + * at this point we can safely clear everything except the + * locked bit and the nodatasum bit + */ + ret = clear_extent_bit(tree, start, end, + ~(EXTENT_LOCKED | EXTENT_NODATASUM), + 0, 0, NULL, mask); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; + } + return ret; +} + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + struct extent_map *em; + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + + if ((mask & __GFP_WAIT) && + page->mapping->host->i_size > 16 * 1024 * 1024) { + u64 len; + while (start <= end) { + len = end - start + 1; + write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em) { + write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED | EXTENT_WRITEBACK, + 0, NULL)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); + write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + } + } + return try_release_extent_state(map, tree, page, mask); +} + +/* + * helper function for fiemap, which doesn't want to see any holes. + * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while (1) { + len = last - offset; + if (len == 0) + break; + len = ALIGN(len, sectorsize); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (IS_ERR_OR_NULL(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent) +{ + int ret = 0; + u64 off = start; + u64 max = start + len; + u32 flags = 0; + u32 found_type; + u64 last; + u64 last_for_get_extent = 0; + u64 disko = 0; + u64 isize = i_size_read(inode); + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(inode)->root; + int end = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; + + if (len == 0) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + start = round_down(start, BTRFS_I(inode)->root->sectorsize); + len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start; + + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + WARN_ON(!ret); + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + found_type = found_key.type; + + /* No extents, but there might be delalloc bits */ + if (found_key.objectid != btrfs_ino(inode) || + found_type != BTRFS_EXTENT_DATA_KEY) { + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; + } + btrfs_release_path(path); + + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, + &cached_state); + + em = get_extent_skip_holes(inode, start, last_for_get_extent, + get_extent); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + while (!end) { + u64 offset_in_extent = 0; + + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; + + /* + * get_extent may return an extent that starts before our + * requested range. We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); + + /* + * record the offset from the start of the extent + * for adjusting the disk offset below. Only do this if the + * extent isn't compressed since our in ram offset may be past + * what we have actually allocated on disk. + */ + if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + offset_in_extent = em_start - em->start; + em_end = extent_map_end(em); + em_len = em_end - em_start; + disko = 0; + flags = 0; + + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + + if (em->block_start == EXTENT_MAP_LAST_BYTE) { + end = 1; + flags |= FIEMAP_EXTENT_LAST; + } else if (em->block_start == EXTENT_MAP_INLINE) { + flags |= (FIEMAP_EXTENT_DATA_INLINE | + FIEMAP_EXTENT_NOT_ALIGNED); + } else if (em->block_start == EXTENT_MAP_DELALLOC) { + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + } else if (fieinfo->fi_extents_max) { + u64 bytenr = em->block_start - + (em->start - em->orig_start); + + disko = em->block_start + offset_in_extent; + + /* + * As btrfs supports shared space, this information + * can be exported to userspace tools via + * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 + * then we're just getting a count and we can skip the + * lookup stuff. + */ + ret = btrfs_check_shared(NULL, root->fs_info, + root->objectid, + btrfs_ino(inode), bytenr); + if (ret < 0) + goto out_free; + if (ret) + flags |= FIEMAP_EXTENT_SHARED; + ret = 0; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + flags |= FIEMAP_EXTENT_ENCODED; + + free_extent_map(em); + em = NULL; + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) { + if (ret == 1) + ret = 0; + goto out_free; + } + } +out_free: + free_extent_map(em); +out: + btrfs_free_path(path); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, + &cached_state, GFP_NOFS); + return ret; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ + btrfs_leak_debug_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); +} + +int extent_buffer_under_io(struct extent_buffer *eb) +{ + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); +} + +/* + * Helper for releasing extent buffer page. + */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) +{ + unsigned long index; + struct page *page; + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + BUG_ON(extent_buffer_under_io(eb)); + + index = num_extent_pages(eb->start, eb->len); + if (index == 0) + return; + + do { + index--; + page = eb->pages[index]; + if (!page) + continue; + if (mapped) + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + + if (mapped) + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ + page_cache_release(page); + } while (index != 0); +} + +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +{ + btrfs_release_extent_buffer_page(eb); + __free_extent_buffer(eb); +} + +static struct extent_buffer * +__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, + unsigned long len) +{ + struct extent_buffer *eb = NULL; + + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS); + if (eb == NULL) + return NULL; + eb->start = start; + eb->len = len; + eb->fs_info = fs_info; + eb->bflags = 0; + rwlock_init(&eb->lock); + atomic_set(&eb->write_locks, 0); + atomic_set(&eb->read_locks, 0); + atomic_set(&eb->blocking_readers, 0); + atomic_set(&eb->blocking_writers, 0); + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->spinning_writers, 0); + eb->lock_nested = 0; + init_waitqueue_head(&eb->write_lock_wq); + init_waitqueue_head(&eb->read_lock_wq); + + btrfs_leak_debug_add(&eb->leak_list, &buffers); + + spin_lock_init(&eb->refs_lock); + atomic_set(&eb->refs, 1); + atomic_set(&eb->io_pages, 0); + + /* + * Sanity checks, currently the maximum is 64k covered by 16x 4k pages + */ + BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE + > MAX_INLINE_EXTENT_BUFFER_SIZE); + BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); + + return eb; +} + +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) +{ + unsigned long i; + struct page *p; + struct extent_buffer *new; + unsigned long num_pages = num_extent_pages(src->start, src->len); + + new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + if (new == NULL) + return NULL; + + for (i = 0; i < num_pages; i++) { + p = alloc_page(GFP_NOFS); + if (!p) { + btrfs_release_extent_buffer(new); + return NULL; + } + attach_extent_buffer_page(new, p); + WARN_ON(PageDirty(p)); + SetPageUptodate(p); + new->pages[i] = p; + } + + copy_extent_buffer(new, src, 0, 0, src->len); + set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); + set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + + return new; +} + +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + struct extent_buffer *eb; + unsigned long len; + unsigned long num_pages; + unsigned long i; + + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + num_pages = num_extent_pages(0, len); + + eb = __alloc_extent_buffer(fs_info, start, len); + if (!eb) + return NULL; + + for (i = 0; i < num_pages; i++) { + eb->pages[i] = alloc_page(GFP_NOFS); + if (!eb->pages[i]) + goto err; + } + set_extent_buffer_uptodate(eb); + btrfs_set_header_nritems(eb, 0); + set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + return eb; +err: + for (; i > 0; i--) + __free_page(eb->pages[i - 1]); + __free_extent_buffer(eb); + return NULL; +} + +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + int refs; + /* the ref bit is tricky. We have to make sure it is set + * if we have the buffer dirty. Otherwise the + * code to free a buffer can end up dropping a dirty + * page + * + * Once the ref bit is set, it won't go away while the + * buffer is dirty or in writeback, and it also won't + * go away while we have the reference count on the + * eb bumped. + * + * We can't just set the ref bit without bumping the + * ref on the eb because free_extent_buffer might + * see the ref bit and try to clear it. If this happens + * free_extent_buffer might end up dropping our original + * ref by mistake and freeing the page before we are able + * to add one more ref. + * + * So bump the ref count first, then set the bit. If someone + * beat us to it, drop the ref we added. + */ + refs = atomic_read(&eb->refs); + if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + return; + + spin_lock(&eb->refs_lock); + if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_inc(&eb->refs); + spin_unlock(&eb->refs_lock); +} + +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) +{ + unsigned long num_pages, i; + + check_buffer_tree_ref(eb); + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + if (p != accessed) + mark_page_accessed(p); + } +} + +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + /* + * Lock our eb's refs_lock to avoid races with + * free_extent_buffer. When we get our eb it might be flagged + * with EXTENT_BUFFER_STALE and another task running + * free_extent_buffer might have seen that flag set, + * eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process + * of decrementing the extent buffer's reference count twice. + * So here we could race and increment the eb's reference count, + * clear its stale flag, mark it as dirty and drop our reference + * before the other task finishes executing free_extent_buffer, + * which would later result in an attempt to free an extent + * buffer that is dirty. + */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); + } + mark_extent_buffer_accessed(eb, NULL); + return eb; + } + rcu_read_unlock(); + + return NULL; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + struct extent_buffer *eb, *exists = NULL; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + eb = alloc_dummy_extent_buffer(fs_info, start); + if (!eb) + return NULL; + eb->fs_info = fs_info; +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * We will free dummy extent buffer's if they come into + * free_extent_buffer with a ref count of 2, but if we are using this we + * want the buffers to stay in memory until we're done with them, so + * bump the ref count again. + */ + atomic_inc(&eb->refs); + return eb; +free_eb: + btrfs_release_extent_buffer(eb); + return exists; +} +#endif + +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + unsigned long len = fs_info->tree_root->nodesize; + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + int uptodate = 1; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + + eb = __alloc_extent_buffer(fs_info, start, len); + if (!eb) + return NULL; + + for (i = 0; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, GFP_NOFS); + if (!p) + goto free_eb; + + spin_lock(&mapping->private_lock); + if (PagePrivate(p)) { + /* + * We could have already allocated an eb for this page + * and attached one so lets see if we can get a ref on + * the existing eb, and if we can we know it's good and + * we can just return that one, else we know we can just + * overwrite page->private. + */ + exists = (struct extent_buffer *)p->private; + if (atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + page_cache_release(p); + mark_extent_buffer_accessed(exists, p); + goto free_eb; + } + exists = NULL; + + /* + * Do this so attach doesn't complain and we need to + * drop the ref the old guy had. + */ + ClearPagePrivate(p); + WARN_ON(PageDirty(p)); + page_cache_release(p); + } + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); + eb->pages[i] = p; + if (!PageUptodate(p)) + uptodate = 0; + + /* + * see below about how we avoid a nasty race with release page + * and why we unlock later + */ + } + if (uptodate) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + /* add one reference for the tree */ + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * there is a race where release page may have + * tried to find this extent buffer in the radix + * but failed. It will tell the VM it is safe to + * reclaim the, and it will clear the page private bit. + * We must make sure to set the page private bit properly + * after the extent buffer is in the radix tree so + * it doesn't get lost + */ + SetPageChecked(eb->pages[0]); + for (i = 1; i < num_pages; i++) { + p = eb->pages[i]; + ClearPageChecked(p); + unlock_page(p); + } + unlock_page(eb->pages[0]); + return eb; + +free_eb: + WARN_ON(!atomic_dec_and_test(&eb->refs)); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } + + btrfs_release_extent_buffer(eb); + return exists; +} + +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + __free_extent_buffer(eb); +} + +/* Expects to have eb->eb_lock already held */ +static int release_extent_buffer(struct extent_buffer *eb) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { + struct btrfs_fs_info *fs_info = eb->fs_info; + + spin_unlock(&eb->refs_lock); + + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&fs_info->buffer_lock); + } else { + spin_unlock(&eb->refs_lock); + } + + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { + __free_extent_buffer(eb); + return 1; + } +#endif + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return 1; + } + spin_unlock(&eb->refs_lock); + + return 0; +} + +void free_extent_buffer(struct extent_buffer *eb) +{ + int refs; + int old; + if (!eb) + return; + + while (1) { + refs = atomic_read(&eb->refs); + if (refs <= 3) + break; + old = atomic_cmpxchg(&eb->refs, refs, refs - 1); + if (old == refs) + return; + } + + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) + atomic_dec(&eb->refs); + + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) + return; + + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb); +} + +void clear_extent_buffer_dirty(struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + struct page *page; + + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + if (!PageDirty(page)) + continue; + + lock_page(page); + WARN_ON(!PagePrivate(page)); + + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&page->mapping->tree_lock); + ClearPageError(page); + unlock_page(page); + } + WARN_ON(atomic_read(&eb->refs) == 0); +} + +int set_extent_buffer_dirty(struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + int was_dirty = 0; + + check_buffer_tree_ref(eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + + num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + + for (i = 0; i < num_pages; i++) + set_page_dirty(eb->pages[i]); + return was_dirty; +} + +int clear_extent_buffer_uptodate(struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + if (page) + ClearPageUptodate(page); + } + return 0; +} + +int set_extent_buffer_uptodate(struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + SetPageUptodate(page); + } + return 0; +} + +int extent_buffer_uptodate(struct extent_buffer *eb) +{ + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); +} + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num) +{ + unsigned long i; + unsigned long start_i; + struct page *page; + int err; + int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + unsigned long num_pages; + unsigned long num_reads = 0; + struct bio *bio = NULL; + unsigned long bio_flags = 0; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 0; + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = eb->pages[i]; + if (wait == WAIT_NONE) { + if (!trylock_page(page)) + goto unlock_exit; + } else { + lock_page(page); + } + locked_pages++; + if (!PageUptodate(page)) { + num_reads++; + all_uptodate = 0; + } + } + if (all_uptodate) { + if (start_i == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + goto unlock_exit; + } + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, num_reads); + for (i = start_i; i < num_pages; i++) { + page = eb->pages[i]; + if (!PageUptodate(page)) { + ClearPageError(page); + err = __extent_read_full_page(tree, page, + get_extent, &bio, + mirror_num, &bio_flags, + READ | REQ_META); + if (err) + ret = err; + } else { + unlock_page(page); + } + } + + if (bio) { + err = submit_one_bio(READ | REQ_META, bio, mirror_num, + bio_flags); + if (err) + return err; + } + + if (ret || wait != WAIT_COMPLETE) + return ret; + + for (i = start_i; i < num_pages; i++) { + page = eb->pages[i]; + wait_on_page_locked(page); + if (!PageUptodate(page)) + ret = -EIO; + } + + return ret; + +unlock_exit: + i = start_i; + while (locked_pages > 0) { + page = eb->pages[i]; + i++; + unlock_page(page); + locked_pages--; + } + return ret; +} + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = eb->pages[i]; + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = page_address(page); + memcpy(dst, kaddr + offset, cur); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char __user *dst = (char __user *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = eb->pages[i]; + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = page_address(page); + if (copy_to_user(dst, kaddr + offset, cur)) { + ret = -EFAULT; + break; + } + + dst += cur; + len -= cur; + offset = 0; + i++; + } + + return ret; +} + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **map, + unsigned long *map_start, + unsigned long *map_len) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + + if (start + min_len > eb->len) { + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", + eb->start, eb->len, start, min_len); + return -EINVAL; + } + + p = eb->pages[i]; + kaddr = page_address(p); + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = eb->pages[i]; + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = page_address(page); + ret = memcmp(ptr, kaddr + offset, cur); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = page_address(page); + memcpy(kaddr + offset, src, cur); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = page_address(page); + memset(kaddr + offset, c, cur); + + len -= cur; + offset = 0; + i++; + } +} + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = dst->pages[i]; + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = page_address(page); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) +{ + unsigned long distance = (src > dst) ? src - dst : dst - src; + return distance < len; +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = page_address(dst_page); + char *src_kaddr; + int must_memmove = 0; + + if (dst_page != src_page) { + src_kaddr = page_address(src_page); + } else { + src_kaddr = dst_kaddr; + if (areas_overlap(src_off, dst_off, len)) + must_memmove = 1; + } + + if (must_memmove) + memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); + else + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + + while (len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + (PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + (PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(dst->pages[dst_i], dst->pages[src_i], + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while (len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + (PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + (PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + copy_pages(dst->pages[dst_i], dst->pages[src_i], + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +int try_release_extent_buffer(struct page *page) +{ + struct extent_buffer *eb; + + /* + * We need to make sure noboody is attaching this page to an eb right + * now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; + } + + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + + /* + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; + } + spin_unlock(&page->mapping->private_lock); + + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; + } + + return release_extent_buffer(eb); +} diff --git a/kernel/fs/btrfs/extent_io.h b/kernel/fs/btrfs/extent_io.h new file mode 100644 index 000000000..c668f3689 --- /dev/null +++ b/kernel/fs/btrfs/extent_io.h @@ -0,0 +1,382 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include + +/* bits for the extent state */ +#define EXTENT_DIRTY (1U << 0) +#define EXTENT_WRITEBACK (1U << 1) +#define EXTENT_UPTODATE (1U << 2) +#define EXTENT_LOCKED (1U << 3) +#define EXTENT_NEW (1U << 4) +#define EXTENT_DELALLOC (1U << 5) +#define EXTENT_DEFRAG (1U << 6) +#define EXTENT_BOUNDARY (1U << 9) +#define EXTENT_NODATASUM (1U << 10) +#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_FIRST_DELALLOC (1U << 12) +#define EXTENT_NEED_WAIT (1U << 13) +#define EXTENT_DAMAGED (1U << 14) +#define EXTENT_NORESERVE (1U << 15) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) + +/* + * flags for bio submission. The high bits indicate the compression + * type for this bio + */ +#define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_TREE_LOG 2 +#define EXTENT_BIO_PARENT_LOCKED 4 +#define EXTENT_BIO_FLAG_SHIFT 16 + +/* these are bit numbers for test/set bit */ +#define EXTENT_BUFFER_UPTODATE 0 +#define EXTENT_BUFFER_DIRTY 2 +#define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ +#define EXTENT_BUFFER_TREE_REF 5 +#define EXTENT_BUFFER_STALE 6 +#define EXTENT_BUFFER_WRITEBACK 7 +#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ +#define EXTENT_BUFFER_DUMMY 9 +#define EXTENT_BUFFER_IN_TREE 10 +#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ + +/* these are flags for extent_clear_unlock_delalloc */ +#define PAGE_UNLOCK (1 << 0) +#define PAGE_CLEAR_DIRTY (1 << 1) +#define PAGE_SET_WRITEBACK (1 << 2) +#define PAGE_END_WRITEBACK (1 << 3) +#define PAGE_SET_PRIVATE2 (1 << 4) +#define PAGE_SET_ERROR (1 << 5) + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. + */ +#define EXTENT_PAGE_PRIVATE 1 + +struct extent_state; +struct btrfs_root; +struct btrfs_io_bio; + +typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset); +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; + int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); + int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); + int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); + int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); + void (*set_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned *bits); + void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned *bits); + void (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + void (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); +}; + +struct extent_io_tree { + struct rb_root state; + struct address_space *mapping; + u64 dirty_bytes; + int track_uptodate; + spinlock_t lock; + const struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ + wait_queue_head_t wq; + atomic_t refs; + unsigned state; + + /* for use by the FS */ + u64 private; + +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif +}; + +#define INLINE_EXTENT_BUFFER_PAGES 16 +#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) +struct extent_buffer { + u64 start; + unsigned long len; + unsigned long bflags; + struct btrfs_fs_info *fs_info; + spinlock_t refs_lock; + atomic_t refs; + atomic_t io_pages; + int read_mirror; + struct rcu_head rcu_head; + pid_t lock_owner; + + /* count of read lock holders on the extent buffer */ + atomic_t write_locks; + atomic_t read_locks; + atomic_t blocking_writers; + atomic_t blocking_readers; + atomic_t spinning_readers; + atomic_t spinning_writers; + short lock_nested; + /* >= 0 if eb belongs to a log tree, -1 otherwise */ + short log_index; + + /* protects write locks */ + rwlock_t lock; + + /* readers use lock_wq while they wait for the write + * lock holders to unlock + */ + wait_queue_head_t write_lock_wq; + + /* writers use read_lock_wq while they wait for readers + * to unlock + */ + wait_queue_head_t read_lock_wq; + struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif +}; + +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} + +static inline int extent_compress_type(unsigned long bio_flags) +{ + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; +} + +struct extent_map_tree; + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t pg_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int try_release_extent_buffer(struct page *page); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, struct extent_state **cached); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num); +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned bits, int contig); + +void free_extent_state(struct extent_state *state); +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int filled, + struct extent_state *cached_state); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, unsigned clear_bits, + struct extent_state **cached_state, gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits, + struct extent_state **cached_state); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent); +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); +void free_extent_buffer(struct extent_buffer *eb); +void free_extent_buffer_stale(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num); +void wait_on_extent_buffer_writeback(struct extent_buffer *eb); + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +void clear_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_under_io(struct extent_buffer *eb); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **map, + unsigned long *map_start, + unsigned long *map_len); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned bits_to_clear, + unsigned long page_ops); +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags); +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); + +struct btrfs_fs_info; + +int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, + struct page *page, unsigned int pg_offset, + int mirror_num); +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset); +int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num); + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end); +int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, + struct io_failure_record **failrec_ret); +int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, int fail_mirror); +struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, + struct page *page, int pg_offset, int icsum, + bio_end_io_t *endio_func, void *data); +int free_io_failure(struct inode *inode, struct io_failure_record *rec); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes); +#endif +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); +#endif diff --git a/kernel/fs/btrfs/extent_map.c b/kernel/fs/btrfs/extent_map.c new file mode 100644 index 000000000..6a98bddd8 --- /dev/null +++ b/kernel/fs/btrfs/extent_map.c @@ -0,0 +1,455 @@ +#include +#include +#include +#include +#include "ctree.h" +#include "extent_map.h" + + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = kmem_cache_create("btrfs_extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. + */ +void extent_map_tree_init(struct extent_map_tree *tree) +{ + tree->map = RB_ROOT; + INIT_LIST_HEAD(&tree->modified_extents); + rwlock_init(&tree->lock); +} + +/** + * alloc_extent_map - allocate new extent map structure + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(void) +{ + struct extent_map *em; + em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); + if (!em) + return NULL; + RB_CLEAR_NODE(&em->rb_node); + em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = 0; + atomic_set(&em->refs, 1); + INIT_LIST_HEAD(&em->list); + return em; +} + +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + WARN_ON(atomic_read(&em->refs) == 0); + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(extent_map_in_tree(em)); + WARN_ON(!list_empty(&em->list)); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + kfree(em->bdev); + kmem_cache_free(extent_map_cache, em); + } +} + +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +static int tree_insert(struct rb_root *root, struct extent_map *em) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry = NULL; + struct rb_node *orig_parent = NULL; + u64 end = range_end(em->start, em->len); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + if (em->start < entry->start) + p = &(*p)->rb_left; + else if (em->start >= extent_map_end(entry)) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + orig_parent = parent; + while (parent && em->start >= extent_map_end(entry)) { + parent = rb_next(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + parent = orig_parent; + entry = rb_entry(parent, struct extent_map, rb_node); + while (parent && em->start < entry->start) { + parent = rb_prev(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + rb_link_node(&em->rb_node, orig_parent, p); + rb_insert_color(&em->rb_node, root); + return 0; +} + +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +/* check to see if two extent_map structs are adjacent and safe to merge */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + + /* + * We don't want to merge stuff that hasn't been written to the log yet + * since it may not reflect exactly what is on disk, and that would be + * bad. + */ + if (!list_empty(&prev->list) || !list_empty(&next->list)) + return 0; + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) +{ + struct extent_map *merge = NULL; + struct rb_node *rb; + + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->orig_start = merge->orig_start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; + em->mod_start = merge->mod_start; + em->generation = max(em->generation, merge->generation); + + rb_erase(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); + free_extent_map(merge); + } + } + + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->block_len; + rb_erase(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; + em->generation = max(em->generation, merge->generation); + free_extent_map(merge); + } +} + +/** + * unpin_extent_cache - unpin an extent from the cache + * @tree: tree to unpin the extent in + * @start: logical offset in the file + * @len: length of the extent + * @gen: generation that this extent has been modified in + * + * Called after an extent has been written to disk properly. Set the generation + * to the generation that actually added the file item to the inode so we know + * we need to sync this extent when we call fsync(). + */ +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, + u64 gen) +{ + int ret = 0; + struct extent_map *em; + bool prealloc = false; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(!em || em->start != start); + + if (!em) + goto out; + + em->generation = gen; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->mod_start = em->start; + em->mod_len = em->len; + + if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { + prealloc = true; + clear_bit(EXTENT_FLAG_FILLING, &em->flags); + } + + try_merge_map(tree, em); + + if (prealloc) { + em->mod_start = em->start; + em->mod_len = em->len; + } + + free_extent_map(em); +out: + write_unlock(&tree->lock); + return ret; + +} + +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + if (extent_map_in_tree(em)) + try_merge_map(tree, em); +} + +static inline void setup_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, + int modified) +{ + atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + + if (modified) + list_move(&em->list, &tree->modified_extents); + else + try_merge_map(tree, em); +} + +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was successful. + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, int modified) +{ + int ret = 0; + + ret = tree_insert(&tree->map, em); + if (ret) + goto out; + + setup_extent_mapping(tree, em, modified); +out: + return ret; +} + +static struct extent_map * +__lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node) { + if (prev) + rb_node = prev; + else if (next) + rb_node = next; + else + return NULL; + } + + em = rb_entry(rb_node, struct extent_map, rb_node); + + if (strict && !(end > em->start && start < extent_map_end(em))) + return NULL; + + atomic_inc(&em->refs); + return em; +} + +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + return __lookup_extent_mapping(tree, start, len, 1); +} + +/** + * search_extent_mapping - find a nearby extent map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. + * + * If one can't be found, any nearby extent may be returned + */ +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + return __lookup_extent_mapping(tree, start, len, 0); +} + +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + rb_erase(&em->rb_node, &tree->map); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_del_init(&em->list); + RB_CLEAR_NODE(&em->rb_node); + return ret; +} + +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) +{ + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + ASSERT(extent_map_in_tree(cur)); + if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + list_del_init(&cur->list); + rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map); + RB_CLEAR_NODE(&cur->rb_node); + + setup_extent_mapping(tree, new, modified); +} diff --git a/kernel/fs/btrfs/extent_map.h b/kernel/fs/btrfs/extent_map.h new file mode 100644 index 000000000..b2991fd85 --- /dev/null +++ b/kernel/fs/btrfs/extent_map.h @@ -0,0 +1,85 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include + +#define EXTENT_MAP_LAST_BYTE ((u64)-4) +#define EXTENT_MAP_HOLE ((u64)-3) +#define EXTENT_MAP_INLINE ((u64)-2) +#define EXTENT_MAP_DELALLOC ((u64)-1) + +/* bits for the flags field */ +#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 +#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ +#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ +#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ +#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ +#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 mod_start; + u64 mod_len; + u64 orig_start; + u64 orig_block_len; + u64 ram_bytes; + u64 block_start; + u64 block_len; + u64 generation; + unsigned long flags; + struct block_device *bdev; + atomic_t refs; + unsigned int compress_type; + struct list_head list; +}; + +struct extent_map_tree { + struct rb_root map; + struct list_head modified_extents; + rwlock_t lock; +}; + +static inline int extent_map_in_tree(const struct extent_map *em) +{ + return !RB_EMPTY_NODE(&em->rb_node); +} + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, int modified); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified); + +struct extent_map *alloc_extent_map(void); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +#endif diff --git a/kernel/fs/btrfs/file-item.c b/kernel/fs/btrfs/file-item.c new file mode 100644 index 000000000..58ece6558 --- /dev/null +++ b/kernel/fs/btrfs/file-item.c @@ -0,0 +1,953 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "print-tree.h" + +#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ + PAGE_CACHE_SIZE)) + +#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ + sizeof(struct btrfs_ordered_sum)) / \ + sizeof(u32) * (r)->sectorsize) + +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + file_key.objectid = objectid; + file_key.offset = pos; + file_key.type = BTRFS_EXTENT_DATA_KEY; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); /* Can't happen */ + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +static struct btrfs_csum_item * +btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + file_key.type = BTRFS_EXTENT_CSUM_KEY; + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.type != BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset == csums_in_item) { + ret = -EFBIG; + goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + int ret; + struct btrfs_key file_key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + file_key.type = BTRFS_EXTENT_DATA_KEY; + ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); + return ret; +} + +static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) +{ + kfree(bio->csum_allocated); +} + +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_path *path; + u8 *csum; + u64 offset = 0; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u64 disk_bytenr; + u32 diff; + int nblocks; + int bio_index = 0; + int count; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; + if (!dst) { + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + btrfs_bio->csum_allocated = kmalloc_array(nblocks, + csum_size, GFP_NOFS); + if (!btrfs_bio->csum_allocated) { + btrfs_free_path(path); + return -ENOMEM; + } + btrfs_bio->csum = btrfs_bio->csum_allocated; + btrfs_bio->end_io = btrfs_io_bio_endio_readpage; + } else { + btrfs_bio->csum = btrfs_bio->csum_inline; + } + csum = btrfs_bio->csum; + } else { + csum = (u8 *)dst; + } + + if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + /* + * the free space stuff is only read when it hasn't been + * updated in the current transaction. So, we can safely + * read from the commit root and sidestep a nasty deadlock + * between reading the free space cache and updating the csum tree. + */ + if (btrfs_is_free_space_inode(inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + + disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; + if (dio) + offset = logical_offset; + while (bio_index < bio->bi_vcnt) { + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, + (u32 *)csum, nblocks); + if (count) + goto found; + + if (!item || disk_bytenr < item_start_offset || + disk_bytenr >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(path); + item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + path, disk_bytenr, 0); + if (IS_ERR(item)) { + count = 1; + memset(csum, 0, csum_size); + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + btrfs_info(BTRFS_I(inode)->root->fs_info, + "no csum found for inode %llu start %llu", + btrfs_ino(inode), offset); + } + item = NULL; + btrfs_release_path(path); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / csum_size) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = disk_bytenr - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * csum_size; + count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> + inode->i_sb->s_blocksize_bits); + read_extent_buffer(path->nodes[0], csum, + ((unsigned long)item) + diff, + csum_size * count); +found: + csum += count * csum_size; + nblocks -= count; + bio_index += count; + while (count--) { + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + } + btrfs_free_path(path); + return 0; +} + +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 offset) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1); +} + +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_csum_item *item; + LIST_HEAD(tmplist); + unsigned long offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + + ASSERT(IS_ALIGNED(start, root->sectorsize) && + IS_ALIGNED(end + 1, root->sectorsize)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (search_commit) { + path->skip_locking = 1; + path->reada = 2; + path->search_commit_root = 1; + } + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + size = min_t(size_t, csum_end - start, + MAX_ORDERED_SUM_BYTES(root)); + sums = kzalloc(btrfs_ordered_sum_size(root, size), + GFP_NOFS); + if (!sums) { + ret = -ENOMEM; + goto fail; + } + + sums->bytenr = start; + sums->len = (int)size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + size >>= root->fs_info->sb->s_blocksize_bits; + + read_extent_buffer(path->nodes[0], + sums->sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root->sectorsize * size; + list_add_tail(&sums->list, &tmplist); + } + path->slots[0]++; + } + ret = 0; +fail: + while (ret < 0 && !list_empty(&tmplist)) { + sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + list_splice_tail(&tmplist, list); + + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_extent *ordered; + char *data; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + int index; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + u64 offset; + + WARN_ON(bio->bi_vcnt <= 0); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size), + GFP_NOFS); + if (!sums) + return -ENOMEM; + + sums->len = bio->bi_iter.bi_size; + INIT_LIST_HEAD(&sums->list); + + if (contig) + offset = file_start; + else + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); /* Logic error */ + sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; + index = 0; + + while (bio_index < bio->bi_vcnt) { + if (!contig) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { + unsigned long bytes_left; + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_iter.bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); /* -ENOMEM */ + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); /* Logic error */ + sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + + total_bytes; + index = 0; + } + + data = kmap_atomic(bvec->bv_page); + sums->sums[index] = ~(u32)0; + sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset, + sums->sums[index], + bvec->bv_len); + kunmap_atomic(data); + btrfs_csum_final(sums->sums[index], + (char *)(sums->sums + index)); + + bio_index++; + index++; + total_bytes += bvec->bv_len; + this_sum_bytes += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * helper function for csum removal, this expects the + * key to describe the csum pointed to by the path, and it expects + * the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the + * range should not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the + * overlap, and fixes up the key as required. + */ +static noinline void truncate_one_csum(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct extent_buffer *leaf; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + btrfs_truncate_item(root, path, new_size, 1); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + btrfs_truncate_item(root, path, new_size, 0); + + key->offset = end_byte; + btrfs_set_item_key_safe(root->fs_info, path, key); + } else { + BUG(); + } +} + +/* + * deletes the csum items from the csum tree for a given + * range of bytes. + */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + int blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + root = root->fs_info->csum_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + break; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. + */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memset_extent_buffer(leaf, 0, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + if (ret && ret != -EAGAIN) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + key.offset = end_byte - 1; + } else { + truncate_one_csum(root, path, &key, bytenr, len); + if (key.offset < bytenr) + break; + } + btrfs_release_path(path); + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 next_offset; + u64 total_bytes = 0; + u64 csum_offset; + u64 bytenr; + u32 nritems; + u32 ins_size; + int index = 0; + int found_next; + int ret; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: + next_offset = (u64)-1; + found_next = 0; + bytenr = sums->bytenr + total_bytes; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + file_key.type = BTRFS_EXTENT_CSUM_KEY; + + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); + if (!IS_ERR(item)) { + ret = 0; + leaf = path->nodes[0]; + item_end = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); + goto found; + } + ret = PTR_ERR(item); + if (ret != -EFBIG && ret != -ENOENT) + goto fail_unlock; + + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(root, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + int slot = path->slots[0] + 1; + /* we didn't find a csum item, insert one */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (!nritems || (path->slots[0] >= nritems - 1)) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + found_next = 1; + if (ret != 0) + goto insert; + slot = path->slots[0]; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) { + found_next = 1; + goto insert; + } + next_offset = found_key.offset; + found_next = 1; + goto insert; + } + + /* + * at this point, we know the tree has an item, but it isn't big + * enough yet to put our csum in. Grow it + */ + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + if (ret < 0) + goto fail_unlock; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + + if (found_key.type != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + goto insert; + } + + if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / + csum_size) { + int extend_nr; + u64 tmp; + u32 diff; + u32 free_space; + + if (btrfs_leaf_free_space(root, leaf) < + sizeof(struct btrfs_item) + csum_size * 2) + goto insert; + + free_space = btrfs_leaf_free_space(root, leaf) - + sizeof(struct btrfs_item) - csum_size; + tmp = sums->len - total_bytes; + tmp >>= root->fs_info->sb->s_blocksize_bits; + WARN_ON(tmp < 1); + + extend_nr = max_t(int, 1, (int)tmp); + diff = (csum_offset + extend_nr) * csum_size; + diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); + + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + diff = min(free_space, diff); + diff /= csum_size; + diff *= csum_size; + + btrfs_extend_item(root, path, diff); + ret = 0; + goto csum; + } + +insert: + btrfs_release_path(path); + csum_offset = 0; + if (found_next) { + u64 tmp; + + tmp = sums->len - total_bytes; + tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = min(tmp, (next_offset - file_key.offset) >> + root->fs_info->sb->s_blocksize_bits); + + tmp = max((u64)1, tmp); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + path->leave_spinning = 0; + if (ret < 0) + goto fail_unlock; + if (WARN_ON(ret != 0)) + goto fail_unlock; + leaf = path->nodes[0]; +csum: + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item + + btrfs_item_size_nr(leaf, path->slots[0])); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + ins_size = (u32)(sums->len - total_bytes) >> + root->fs_info->sb->s_blocksize_bits; + ins_size *= csum_size; + ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, + ins_size); + write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, + ins_size); + + ins_size /= csum_size; + total_bytes += ins_size * root->sectorsize; + index += ins_size; + + btrfs_mark_buffer_dirty(path->nodes[0]); + if (total_bytes < sums->len) { + btrfs_release_path(path); + cond_resched(); + goto again; + } +out: + btrfs_free_path(path); + return ret; + +fail_unlock: + goto out; +} + +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + int compress_type = btrfs_file_extent_compression(leaf, fi); + + em->bdev = root->fs_info->fs_devices->latest_bdev; + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root->sectorsize); + } + + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + return; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + em->block_len = em->orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em->block_start = bytenr; + em->block_len = em->len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em->block_start = EXTENT_MAP_INLINE; + em->start = extent_start; + em->len = extent_end - extent_start; + /* + * Initialize orig_start and block_len with the same values + * as in inode.c:btrfs_get_extent(). + */ + em->orig_start = EXTENT_MAP_HOLE; + em->block_len = (u64)-1; + if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + } else { + btrfs_err(root->fs_info, + "unknown file extent item type %d, inode %llu, offset %llu, root %llu", + type, btrfs_ino(inode), extent_start, + root->root_key.objectid); + } +} diff --git a/kernel/fs/btrfs/file.c b/kernel/fs/btrfs/file.c new file mode 100644 index 000000000..b072e1747 --- /dev/null +++ b/kernel/fs/btrfs/file.c @@ -0,0 +1,2860 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "volumes.h" +#include "qgroup.h" + +static struct kmem_cache *btrfs_inode_defrag_cachep; +/* + * when auto defrag is enabled we + * queue up these defrag structs to remember which + * inodes need defragging passes + */ +struct inode_defrag { + struct rb_node rb_node; + /* objectid */ + u64 ino; + /* + * transid where the defrag was added, we search for + * extents newer than this + */ + u64 transid; + + /* root objectid */ + u64 root; + + /* last offset we were able to defrag */ + u64 last_offset; + + /* if we've wrapped around back to zero once already */ + int cycled; +}; + +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + +/* pop a record for an inode into the defrag tree. The lock + * must be held already + * + * If you're inserting a record for an older transid than an + * existing record, the transid already in the tree is lowered + * + * If an existing record is found the defrag item you + * pass in is freed + */ +static int __btrfs_add_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode_defrag *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + int ret; + + p = &root->fs_info->defrag_inodes.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) + p = &parent->rb_left; + else if (ret > 0) + p = &parent->rb_right; + else { + /* if we're reinserting an entry for + * an old defrag run, make sure to + * lower the transid of our existing record + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + if (defrag->last_offset > entry->last_offset) + entry->last_offset = defrag->last_offset; + return -EEXIST; + } + } + set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + rb_link_node(&defrag->rb_node, parent, p); + rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); + return 0; +} + +static inline int __need_auto_defrag(struct btrfs_root *root) +{ + if (!btrfs_test_opt(root, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(root->fs_info)) + return 0; + + return 1; +} + +/* + * insert a defrag record for this inode if auto defrag is + * enabled + */ +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode_defrag *defrag; + u64 transid; + int ret; + + if (!__need_auto_defrag(root)) + return 0; + + if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) + return 0; + + if (trans) + transid = trans->transid; + else + transid = BTRFS_I(inode)->root->last_trans; + + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); + if (!defrag) + return -ENOMEM; + + defrag->ino = btrfs_ino(inode); + defrag->transid = transid; + defrag->root = root->root_key.objectid; + + spin_lock(&root->fs_info->defrag_inodes_lock); + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { + /* + * If we set IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, this new inode doesn't have + * IN_DEFRAG flag. At the case, we may find the existed defrag. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + spin_unlock(&root->fs_info->defrag_inodes_lock); + return 0; +} + +/* + * Requeue the defrag object. If there is a defrag object that points to + * the same inode in the tree, we will merge them together (by + * __btrfs_add_inode_defrag()) and free the one that we want to requeue. + */ +static void btrfs_requeue_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!__need_auto_defrag(root)) + goto out; + + /* + * Here we don't check the IN_DEFRAG flag, because we need merge + * them together. + */ + spin_lock(&root->fs_info->defrag_inodes_lock); + ret = __btrfs_add_inode_defrag(inode, defrag); + spin_unlock(&root->fs_info->defrag_inodes_lock); + if (ret) + goto out; + return; +out: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +} + +/* + * pick the defragable inode that we want, if it doesn't exist, we will get + * the next one. + */ +static struct inode_defrag * +btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) +{ + struct inode_defrag *entry = NULL; + struct inode_defrag tmp; + struct rb_node *p; + struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; + + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) + p = parent->rb_left; + else if (ret > 0) + p = parent->rb_right; + else + goto out; + } + + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) + entry = rb_entry(parent, struct inode_defrag, rb_node); + else + entry = NULL; + } +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; +} + +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + cond_resched_lock(&fs_info->defrag_inodes_lock); + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ + struct btrfs_root *inode_root; + struct inode *inode; + struct btrfs_key key; + struct btrfs_ioctl_defrag_range_args range; + int num_defrag; + int index; + int ret; + + /* get the inode */ + key.objectid = defrag->root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(inode_root)) { + ret = PTR_ERR(inode_root); + goto cleanup; + } + + key.objectid = defrag->ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto cleanup; + } + srcu_read_unlock(&fs_info->subvol_srcu, index); + + /* do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + range.start = defrag->last_offset; + + sb_start_write(fs_info->sb); + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ + if (num_defrag == BTRFS_DEFRAG_BATCH) { + defrag->last_offset = range.start; + btrfs_requeue_inode_defrag(inode, defrag); + } else if (defrag->last_offset && !defrag->cycled) { + /* + * we didn't fill our defrag batch, but + * we didn't start at zero. Make sure we loop + * around to the start of the file. + */ + defrag->last_offset = 0; + defrag->cycled = 1; + btrfs_requeue_inode_defrag(inode, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + + iput(inode); + return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; + + atomic_inc(&fs_info->defrag_running); + while (1) { + /* Pause the auto defragger. */ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, + &fs_info->fs_state)) + break; + + if (!__need_auto_defrag(fs_info->tree_root)) + break; + + /* find an inode to defrag */ + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, + first_ino); + if (!defrag) { + if (root_objectid || first_ino) { + root_objectid = 0; + first_ino = 0; + continue; + } else { + break; + } + } + + first_ino = defrag->ino + 1; + root_objectid = defrag->root; + + __btrfs_run_defrag_inode(fs_info, defrag); + } + atomic_dec(&fs_info->defrag_running); + + /* + * during unmount, we use the transaction_wait queue to + * wait for the defragger to stop + */ + wake_up(&fs_info->transaction_wait); + return 0; +} + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ +static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + size_t write_bytes, + struct page **prepared_pages, + struct iov_iter *i) +{ + size_t copied = 0; + size_t total_copied = 0; + int pg = 0; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + while (write_bytes > 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[pg]; + /* + * Copy data from userspace to the current page + */ + copied = iov_iter_copy_from_user_atomic(page, i, offset, count); + + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (!PageUptodate(page) && copied < count) + copied = 0; + + iov_iter_advance(i, copied); + write_bytes -= copied; + total_copied += copied; + + /* Return to btrfs_file_write_iter to fault page */ + if (unlikely(copied == 0)) + break; + + if (copied < PAGE_CACHE_SIZE - offset) { + offset += copied; + } else { + pg++; + offset = 0; + } + } + return total_copied; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +static void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); + page_cache_release(pages[i]); + } +} + +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. + */ +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached) +{ + int err = 0; + int i; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); + + start_pos = pos & ~((u64)root->sectorsize - 1); + num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); + + end_of_last_block = start_pos + num_bytes - 1; + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + cached); + if (err) + return err; + + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); + } + + /* + * we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ + if (end_pos > isize) + i_size_write(inode, end_pos); + return 0; +} + +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) +{ + struct extent_map *em; + struct extent_map *split = NULL; + struct extent_map *split2 = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 len = end - start + 1; + u64 gen; + int ret; + int testend = 1; + unsigned long flags; + int compressed = 0; + bool modified; + + WARN_ON(end < start); + if (end == (u64)-1) { + len = (u64)-1; + testend = 0; + } + while (1) { + int no_splits = 0; + + modified = false; + if (!split) + split = alloc_extent_map(); + if (!split2) + split2 = alloc_extent_map(); + if (!split || !split2) + no_splits = 1; + + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + write_unlock(&em_tree->lock); + break; + } + flags = em->flags; + gen = em->generation; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + if (testend && em->start + em->len >= start + len) { + free_extent_map(em); + write_unlock(&em_tree->lock); + break; + } + start = em->start + em->len; + if (testend) + len = start + len - (em->start + em->len); + free_extent_map(em); + write_unlock(&em_tree->lock); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &flags); + modified = !list_empty(&em->list); + if (no_splits) + goto next; + + if (em->start < start) { + split->start = em->start; + split->len = start - em->start; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + + split->generation = gen; + split->bdev = em->bdev; + split->flags = flags; + split->compress_type = em->compress_type; + replace_extent_mapping(em_tree, em, split, modified); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (testend && em->start + em->len > start + len) { + u64 diff = start + len - em->start; + + split->start = start + len; + split->len = em->start + em->len - (start + len); + split->bdev = em->bdev; + split->flags = flags; + split->compress_type = em->compress_type; + split->generation = gen; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, + em->orig_block_len); + + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + + diff; + split->orig_start = em->orig_start; + } + } else { + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + } + + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + ret = add_extent_mapping(em_tree, split, + modified); + ASSERT(ret == 0); /* Logic error */ + } + free_extent_map(split); + split = NULL; + } +next: + if (extent_map_in_tree(em)) + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + if (split) + free_extent_map(split); + if (split2) + free_extent_map(split2); +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + */ +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted) +{ + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key new_key; + u64 ino = btrfs_ino(inode); + u64 search_start = start; + u64 disk_bytenr = 0; + u64 num_bytes = 0; + u64 extent_offset = 0; + u64 extent_end = 0; + int del_nr = 0; + int del_slot = 0; + int extent_type; + int recow; + int ret; + int modify_tree = -1; + int update_refs; + int found = 0; + int leafs_visited = 0; + + if (drop_cache) + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) + modify_tree = 0; + + update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root); + while (1) { + recow = 0; + ret = btrfs_lookup_file_extent(trans, root, path, ino, + search_start, modify_tree); + if (ret < 0) + break; + if (ret > 0 && path->slots[0] > 0 && search_start == start) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + ret = 0; + leafs_visited++; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + BUG_ON(del_nr > 0); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + leafs_visited++; + leaf = path->nodes[0]; + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid > ino || + key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); + } else { + WARN_ON(1); + extent_end = search_start; + } + + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. + */ + if (extent_end == key.offset && extent_end >= search_start) + goto delete_extent_item; + + if (extent_end <= search_start) { + path->slots[0]++; + goto next_slot; + } + + found = 1; + search_start = max(key.offset, start); + if (recow || !modify_tree) { + modify_tree = -1; + btrfs_release_path(path); + continue; + } + + /* + * | - range to drop - | + * | -------- extent -------- | + */ + if (start > key.offset && end < extent_end) { + BUG_ON(del_nr > 0); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = start; + ret = btrfs_duplicate_item(trans, root, path, + &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + continue; + } + if (ret < 0) + break; + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + extent_offset += start - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - start); + btrfs_mark_buffer_dirty(leaf); + + if (update_refs && disk_bytenr > 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + new_key.objectid, + start - extent_offset, 1); + BUG_ON(ret); /* -ENOMEM */ + } + key.offset = start; + } + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start <= key.offset && end < extent_end) { + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = end; + btrfs_set_item_key_safe(root->fs_info, path, &new_key); + + extent_offset += end - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_mark_buffer_dirty(leaf); + if (update_refs && disk_bytenr > 0) + inode_sub_bytes(inode, end - key.offset); + break; + } + + search_start = extent_end; + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start > key.offset && end >= extent_end) { + BUG_ON(del_nr > 0); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } + + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_mark_buffer_dirty(leaf); + if (update_refs && disk_bytenr > 0) + inode_sub_bytes(inode, extent_end - start); + if (end == extent_end) + break; + + path->slots[0]++; + goto next_slot; + } + + /* + * | ---- range to drop ----- | + * | ------ extent ------ | + */ + if (start <= key.offset && end >= extent_end) { +delete_extent_item: + if (del_nr == 0) { + del_slot = path->slots[0]; + del_nr = 1; + } else { + BUG_ON(del_slot + del_nr != path->slots[0]); + del_nr++; + } + + if (update_refs && + extent_type == BTRFS_FILE_EXTENT_INLINE) { + inode_sub_bytes(inode, + extent_end - key.offset); + extent_end = ALIGN(extent_end, + root->sectorsize); + } else if (update_refs && disk_bytenr > 0) { + ret = btrfs_free_extent(trans, root, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + key.objectid, key.offset - + extent_offset, 0); + BUG_ON(ret); /* -ENOMEM */ + inode_sub_bytes(inode, + extent_end - key.offset); + } + + if (end == extent_end) + break; + + if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { + path->slots[0]++; + goto next_slot; + } + + ret = btrfs_del_items(trans, root, path, del_slot, + del_nr); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + break; + } + + del_nr = 0; + del_slot = 0; + + btrfs_release_path(path); + continue; + } + + BUG_ON(1); + } + + if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are move off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion (if replace_extent != 0). + */ + path->slots[0] = del_slot; + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + if (ret) + btrfs_abort_transaction(trans, root, ret); + } + + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. + */ + if (!ret && replace_extent && leafs_visited == 1 && + (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || + path->locks[0] == BTRFS_WRITE_LOCK) && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; + } + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; + } + + if (!replace_extent || !(*key_inserted)) + btrfs_release_path(path); + if (drop_end) + *drop_end = found ? min(end, extent_end) : end; + return ret; +} + +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, int drop_cache) +{ + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, + drop_cache, 0, 0, NULL); + btrfs_free_path(path); + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 orig_offset, + u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. + */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key new_key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 orig_offset; + u64 other_start; + u64 other_end; + u64 split; + int del_nr = 0; + int del_slot = 0; + int recow; + int ret; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: + recow = 0; + split = start; + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = split; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + BUG_ON(btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + memcpy(&new_key, &key, sizeof(new_key)); + + if (start == key.offset && end < extent_end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + new_key.offset = end; + btrfs_set_item_key_safe(root->fs_info, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_set_file_extent_offset(leaf, fi, + end - orig_offset); + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + end - other_start); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + } + + if (start > key.offset && end == extent_end) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + path->slots[0]++; + new_key.offset = start; + btrfs_set_item_key_safe(root->fs_info, path, &new_key); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - start); + btrfs_set_file_extent_offset(leaf, fi, + start - orig_offset); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + } + + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; + + new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + goto again; + } + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + split - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - split); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + ino, orig_offset, 1); + BUG_ON(ret); /* -ENOMEM */ + + if (split == start) { + key.offset = start; + } else { + BUG_ON(start != key.offset); + path->slots[0]--; + extent_end = end; + } + recow = 1; + } + + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + ino, orig_offset, 0); + BUG_ON(ret); /* -ENOMEM */ + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + ino, orig_offset, 0); + BUG_ON(ret); /* -ENOMEM */ + } + if (del_nr == 0) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_mark_buffer_dirty(leaf); + } else { + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + } +out: + btrfs_free_path(path); + return 0; +} + +/* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos, + bool force_uptodate) +{ + int ret = 0; + + if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && + !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* + * this just gets pages into the page cache and locks them down. + */ +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate) +{ + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int err = 0; + int faili; + + for (i = 0; i < num_pages; i++) { + pages[i] = find_or_create_page(inode->i_mapping, index + i, + mask | __GFP_WRITE); + if (!pages[i]) { + faili = i - 1; + err = -ENOMEM; + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos, + force_uptodate); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes, false); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; + } + wait_on_page_writeback(pages[i]); + } + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. + * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + u64 *lockstart, u64 *lockend, + struct extent_state **cached_state) +{ + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + + if (start_pos < inode->i_size) { + struct btrfs_ordered_extent *ordered; + lock_extent_bits(&BTRFS_I(inode)->io_tree, + start_pos, last_pos, 0, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); + if (ordered && + ordered->file_offset + ordered->len > start_pos && + ordered->file_offset <= last_pos) { + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + start_pos, last_pos, + cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, + last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached_state, GFP_NOFS); + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; + } + + for (i = 0; i < num_pages; i++) { + if (clear_page_dirty_for_io(pages[i])) + account_page_redirty(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } + + return ret; +} + +static noinline int check_can_nocow(struct inode *inode, loff_t pos, + size_t *write_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + u64 num_bytes; + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (!ret) + return -ENOSPC; + + lockstart = round_down(pos, root->sectorsize); + lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; + + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) { + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + num_bytes = lockend - lockstart + 1; + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); + if (ret <= 0) { + ret = 0; + btrfs_end_write_no_snapshoting(root); + } else { + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); + } + + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + + return ret; +} + +static noinline ssize_t __btrfs_buffered_write(struct file *file, + struct iov_iter *i, + loff_t pos) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + struct extent_state *cached_state = NULL; + u64 release_bytes = 0; + u64 lockstart; + u64 lockend; + unsigned long first_index; + size_t num_written = 0; + int nrptrs; + int ret = 0; + bool only_release_metadata = false; + bool force_page_uptodate = false; + bool need_unlock; + + nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE / (sizeof(struct page *))); + nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); + nrptrs = max(nrptrs, 8); + pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + first_index = pos >> PAGE_CACHE_SHIFT; + + while (iov_iter_count(i) > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(iov_iter_count(i), + nrptrs * (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); + size_t reserve_bytes; + size_t dirty_pages; + size_t copied; + + WARN_ON(num_pages > nrptrs); + + /* + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ + if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { + ret = -EFAULT; + break; + } + + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); + if (ret == -ENOSPC && + (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC))) { + ret = check_can_nocow(inode, pos, &write_bytes); + if (ret > 0) { + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = 0; + } else { + ret = -ENOSPC; + } + } + + if (ret) + break; + + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + if (ret) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, + reserve_bytes); + else + btrfs_end_write_no_snapshoting(root); + break; + } + + release_bytes = reserve_bytes; + need_unlock = false; +again: + /* + * This is going to setup the pages array with the number of + * pages we want, so we don't really need to worry about the + * contents of pages from loop to loop + */ + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, + force_page_uptodate); + if (ret) + break; + + ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, + pos, &lockstart, &lockend, + &cached_state); + if (ret < 0) { + if (ret == -EAGAIN) + goto again; + break; + } else if (ret > 0) { + need_unlock = true; + ret = 0; + } + + copied = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, i); + + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) { + force_page_uptodate = true; + dirty_pages = 0; + } else { + force_page_uptodate = false; + dirty_pages = DIV_ROUND_UP(copied + offset, + PAGE_CACHE_SIZE); + } + + /* + * If we had a short copy we need to release the excess delaloc + * bytes we reserved. We need to increment outstanding_extents + * because btrfs_delalloc_release_space will decrement it, but + * we still have an outstanding extent for the chunk we actually + * managed to copy. + */ + if (num_pages > dirty_pages) { + release_bytes = (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT; + if (copied > 0) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, + release_bytes); + else + btrfs_delalloc_release_space(inode, + release_bytes); + } + + release_bytes = dirty_pages << PAGE_CACHE_SHIFT; + + if (copied > 0) + ret = btrfs_dirty_pages(root, inode, pages, + dirty_pages, pos, copied, + NULL); + if (need_unlock) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state, + GFP_NOFS); + if (ret) { + btrfs_drop_pages(pages, num_pages); + break; + } + + release_bytes = 0; + if (only_release_metadata) + btrfs_end_write_no_snapshoting(root); + + if (only_release_metadata && copied > 0) { + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + + (dirty_pages << PAGE_CACHE_SHIFT) - 1; + + set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_NORESERVE, NULL, + NULL, GFP_NOFS); + only_release_metadata = false; + } + + btrfs_drop_pages(pages, num_pages); + + cond_resched(); + + balance_dirty_pages_ratelimited(inode->i_mapping); + if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root); + + pos += copied; + num_written += copied; + } + + kfree(pages); + + if (release_bytes) { + if (only_release_metadata) { + btrfs_end_write_no_snapshoting(root); + btrfs_delalloc_release_metadata(inode, release_bytes); + } else { + btrfs_delalloc_release_space(inode, release_bytes); + } + } + + return num_written ? num_written : ret; +} + +static ssize_t __btrfs_direct_write(struct kiocb *iocb, + struct iov_iter *from, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t written; + ssize_t written_buffered; + loff_t endbyte; + int err; + + written = generic_file_direct_write(iocb, from, pos); + + if (written < 0 || !iov_iter_count(from)) + return written; + + pos += written; + written_buffered = __btrfs_buffered_write(file, from, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + /* + * Ensure all data is persisted. We want the next direct IO read to be + * able to read what was just written. + */ + endbyte = pos + written_buffered - 1; + err = btrfs_fdatawrite_range(inode, pos, endbyte); + if (err) + goto out; + err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); + if (err) + goto out; + written += written_buffered; + iocb->ki_pos = pos + written_buffered; + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); +out: + return written ? written : err; +} + +static void update_time_for_write(struct inode *inode) +{ + struct timespec now; + + if (IS_NOCMTIME(inode)) + return; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start_pos; + u64 end_pos; + ssize_t num_written = 0; + bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + ssize_t err; + loff_t pos; + size_t count; + + mutex_lock(&inode->i_mutex); + err = generic_write_checks(iocb, from); + if (err <= 0) { + mutex_unlock(&inode->i_mutex); + return err; + } + + current->backing_dev_info = inode_to_bdi(inode); + err = file_remove_suid(file); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. + */ + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + mutex_unlock(&inode->i_mutex); + err = -EROFS; + goto out; + } + + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. + */ + update_time_for_write(inode); + + pos = iocb->ki_pos; + count = iov_iter_count(from); + start_pos = round_down(pos, root->sectorsize); + if (start_pos > i_size_read(inode)) { + /* Expand hole size to cover write data, preventing empty gap */ + end_pos = round_up(pos + count, root->sectorsize); + err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + if (sync) + atomic_inc(&BTRFS_I(inode)->sync_writers); + + if (iocb->ki_flags & IOCB_DIRECT) { + num_written = __btrfs_direct_write(iocb, from, pos); + } else { + num_written = __btrfs_buffered_write(file, from, pos); + if (num_written > 0) + iocb->ki_pos = pos + num_written; + } + + mutex_unlock(&inode->i_mutex); + + /* + * We also have to set last_sub_trans to the current log transid, + * otherwise subsequent syncs to a file that's been synced in this + * transaction will appear to have already occured. + */ + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->last_sub_trans = root->log_transid; + spin_unlock(&BTRFS_I(inode)->lock); + if (num_written > 0) { + err = generic_write_sync(file, pos, num_written); + if (err < 0) + num_written = err; + } + + if (sync) + atomic_dec(&BTRFS_I(inode)->sync_writers); +out: + current->backing_dev_info = NULL; + return num_written ? num_written : err; +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + btrfs_ioctl_trans_end(filp); + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + return 0; +} + +static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + atomic_inc(&BTRFS_I(inode)->sync_writers); + ret = btrfs_fdatawrite_range(inode, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); + + return ret; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; + bool full_sync = 0; + + trace_btrfs_sync_file(file, datasync); + + /* + * We write the dirty pages in the range and wait until they complete + * out of the ->i_mutex. If so, we can flush the dirty pages by + * multi-task, and make the performance up. See + * btrfs_wait_ordered_range for an explanation of the ASYNC check. + */ + ret = start_ordered_ops(inode, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + atomic_inc(&root->log_batch); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + /* + * We might have have had more pages made dirty after calling + * start_ordered_ops and before acquiring the inode's i_mutex. + */ + if (full_sync) { + /* + * For a full sync, we need to make sure any ordered operations + * start and finish before we start logging the inode, so that + * all extents are persisted and the respective file extent + * items are in the fs/subvol btree. + */ + ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + } else { + /* + * Start any new ordered operations before starting to log the + * inode. We will wait for them to finish in btrfs_sync_log(). + * + * Right before acquiring the inode's mutex, we might have new + * writes dirtying pages, which won't immediately start the + * respective ordered operations - that is done through the + * fill_delalloc callbacks invoked from the writepage and + * writepages address space operations. So make sure we start + * all ordered operations before starting to log our inode. Not + * doing this means that while logging the inode, writeback + * could start and invoke writepage/writepages, which would call + * the fill_delalloc callbacks (cow_file_range, + * submit_compressed_extents). These callbacks add first an + * extent map to the modified list of extents and then create + * the respective ordered operation, which means in + * tree-log.c:btrfs_log_inode() we might capture all existing + * ordered operations (with btrfs_get_logged_extents()) before + * the fill_delalloc callback adds its ordered operation, and by + * the time we visit the modified list of extent maps (with + * btrfs_log_changed_extents()), we see and process the extent + * map they created. We then use the extent map to construct a + * file extent item for logging without waiting for the + * respective ordered operation to finish - this file extent + * item points to a disk location that might not have yet been + * written to, containing random data - so after a crash a log + * replay will make our inode have file extent items that point + * to disk locations containing invalid data, as we returned + * success to userspace without waiting for the respective + * ordered operation to finish, because it wasn't captured by + * btrfs_get_logged_extents(). + */ + ret = start_ordered_ops(inode, start, end); + } + if (ret) { + mutex_unlock(&inode->i_mutex); + goto out; + } + atomic_inc(&root->log_batch); + + /* + * If the last transaction that changed this file was before the current + * transaction and we have the full sync flag set in our inode, we can + * bail out now without any syncing. + * + * Note that we can't bail out if the full sync flag isn't set. This is + * because when the full sync flag is set we start all ordered extents + * and wait for them to fully complete - when they complete they update + * the inode's last_trans field through: + * + * btrfs_finish_ordered_io() -> + * btrfs_update_inode_fallback() -> + * btrfs_update_inode() -> + * btrfs_set_inode_last_trans() + * + * So we are sure that last_trans is up to date and can do this check to + * bail out safely. For the fast path, when the full sync flag is not + * set in our inode, we can not do it because we start only our ordered + * extents and don't wait for them to complete (that is when + * btrfs_finish_ordered_io runs), so here at this point their last_trans + * value might be less than or equals to fs_info->last_trans_committed, + * and setting a speculative last_trans for an inode when a buffered + * write is made (such as fs_info->generation + 1 for example) would not + * be reliable since after setting the value and before fsync is called + * any number of transactions can start and commit (transaction kthread + * commits the current transaction periodically), and a transaction + * commit does not start nor waits for ordered extents to complete. + */ + smp_mb(); + if (btrfs_inode_in_log(inode, root->fs_info->generation) || + (full_sync && BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed)) { + /* + * We'v had everything committed since the last time we were + * modified so clear this flag in case it was set for whatever + * reason, it's no longer relevant. + */ + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + mutex_unlock(&inode->i_mutex); + goto out; + } + + /* + * ok we haven't committed the transaction yet, lets do a commit + */ + if (file->private_data) + btrfs_ioctl_trans_end(file); + + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking join'ers. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. + */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + mutex_unlock(&inode->i_mutex); + goto out; + } + trans->sync = true; + + btrfs_init_log_ctx(&ctx); + + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); + if (ret < 0) { + /* Fallthrough and commit/free transaction. */ + ret = 1; + } + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + mutex_unlock(&inode->i_mutex); + + /* + * If any of the ordered extents had an error, just return it to user + * space, so that the application knows some writes didn't succeed and + * can take proper action (retry for e.g.). Blindly committing the + * transaction in this case, would fool userspace that everything was + * successful. And we also want to make sure our log doesn't contain + * file extent items pointing to extents that weren't fully written to - + * just like in the non fast fsync path, where we check for the ordered + * operation's error flag before writing to the log tree and return -EIO + * if any of them had this flag set (btrfs_wait_ordered_range) - + * therefore we need to check for errors in the ordered operations, + * which are indicated by ctx.io_err. + */ + if (ctx.io_err) { + btrfs_end_transaction(trans, root); + ret = ctx.io_err; + goto out; + } + + if (ret != BTRFS_NO_LOG_SYNC) { + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); + if (!ret) { + ret = btrfs_end_transaction(trans, root); + goto out; + } + } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, + end - start + 1); + if (ret) { + btrfs_end_transaction(trans, root); + goto out; + } + } + ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_end_transaction(trans, root); + } +out: + return ret > 0 ? -EIO : ret; +} + +static const struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct address_space *mapping = filp->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + + file_accessed(filp); + vma->vm_ops = &btrfs_file_vm_ops; + + return 0; +} + +static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_disk_bytenr(leaf, fi)) + return 0; + + if (key.offset == end) + return 1; + if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) + return 1; + return 0; +} + +static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, + struct btrfs_path *path, u64 offset, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct extent_map *hole_em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key key; + int ret; + + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + goto out; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + return ret; + BUG_ON(!ret); + + leaf = path->nodes[0]; + if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { + u64 num_bytes; + + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + + end - offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + + if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { + u64 num_bytes; + + key.offset = offset; + btrfs_set_item_key_safe(root->fs_info, path, &key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - + offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + btrfs_release_path(path); + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, end - offset, 0, end - offset, + 0, 0, 0); + if (ret) + return ret; + +out: + btrfs_release_path(path); + + hole_em = alloc_extent_map(); + if (!hole_em) { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + hole_em->start = offset; + hole_em->len = end - offset; + hole_em->ram_bytes = hole_em->len; + hole_em->orig_start = offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->orig_block_len = 0; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = trans->transid; + + do { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, hole_em, 1); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + free_extent_map(hole_em); + if (ret) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } + + return 0; +} + +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + return ret; + } + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? + 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_block_rsv *rsv; + struct btrfs_trans_handle *trans; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + u64 cur_offset; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 drop_end; + int ret = 0; + int err = 0; + int rsv_count; + bool same_page; + bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + u64 ino_size; + bool truncated_page = false; + bool updated_inode = false; + + ret = btrfs_wait_ordered_range(inode, offset, len); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); + lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; + same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + + /* + * We needn't truncate any page which is beyond the end of the file + * because we are sure there is no data there. + */ + /* + * Only do this if we are in the same page and we aren't doing the + * entire page. + */ + if (same_page && len < PAGE_CACHE_SIZE) { + if (offset < ino_size) { + truncated_page = true; + ret = btrfs_truncate_page(inode, offset, len, 0); + } else { + ret = 0; + } + goto out_only_mutex; + } + + /* zero back part of the first page */ + if (offset < ino_size) { + truncated_page = true; + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including serveral following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(inode, &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + truncated_page = true; + ret = btrfs_truncate_page(inode, + tail_start + tail_len, 0, 1); + if (ret) + goto out_only_mutex; + } + } + } + + if (lockend < lockstart) { + ret = 0; + goto out_only_mutex; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len <= lockstart || + ordered->file_offset > lockend)) && + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state, GFP_NOFS); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + ret = -ENOMEM; + goto out_free; + } + rsv->size = btrfs_calc_trunc_metadata_size(root, 1); + rsv->failfast = 1; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent if no_holes isn't set + */ + rsv_count = no_holes ? 2 : 3; + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); + BUG_ON(ret); + trans->block_rsv = rsv; + + cur_offset = lockstart; + len = lockend - cur_offset; + while (cur_offset < lockend) { + ret = __btrfs_drop_extents(trans, root, inode, path, + cur_offset, lockend + 1, + &drop_end, 1, 0, 0, NULL); + if (ret != -ENOSPC) + break; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_end); + if (ret) { + err = ret; + break; + } + } + + cur_offset = drop_end; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + err = ret; + break; + } + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; + + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } + } + + if (ret) { + err = ret; + goto out_trans; + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (cur_offset < ino_size && cur_offset < drop_end) { + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } + } + +out_trans: + if (!trans) + goto out_free; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + updated_inode = true; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); +out_free: + btrfs_free_path(path); + btrfs_free_block_rsv(root, rsv); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); +out_only_mutex: + if (!updated_inode && truncated_page && !ret && !err) { + /* + * If we only end up zeroing part of a page, we still need to + * update the inode item, so that all the time fields are + * updated as well as the necessary btrfs inode in memory fields + * for detecting, at fsync time, if the inode isn't yet in the + * log tree or it's there but not up to date. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + } else { + err = btrfs_update_inode(trans, root, inode); + ret = btrfs_end_transaction(trans, root); + } + } + mutex_unlock(&inode->i_mutex); + if (ret && !err) + err = ret; + return err; +} + +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + struct extent_map *em; + int blocksize = BTRFS_I(inode)->root->sectorsize; + int ret; + + alloc_start = round_down(offset, blocksize); + alloc_end = round_up(offset + len, blocksize); + + /* Make sure we aren't being give some crap mode */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return btrfs_punch_hole(inode, offset, len); + + /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, i_size_read(inode), + alloc_start); + if (ret) + goto out; + } else { + /* + * If we are fallocating from the end of the file onward we + * need to zero out the end of the page if i_size lands in the + * middle of a page. + */ + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); + if (ret) + goto out; + } + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + u64 actual_end; + + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + break; + } + last_byte = min(extent_map_end(em), alloc_end); + actual_end = min_t(u64, extent_map_end(em), offset + len); + last_byte = ALIGN(last_byte, blocksize); + + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + } else if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size and the inode item. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, + NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, + root); + } + } + free_extent_map(em); + if (ret < 0) + break; + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); + return ret; +} + +static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + u64 lockstart; + u64 lockend; + u64 start; + u64 len; + int ret = 0; + + if (inode->i_size == 0) + return -ENXIO; + + /* + * *offset can be negative, in this case we start finding DATA/HOLE from + * the very start of the file. + */ + start = max_t(loff_t, 0, *offset); + + lockstart = round_down(start, root->sectorsize); + lockend = round_up(i_size_read(inode), root->sectorsize); + if (lockend <= lockstart) + lockend = lockstart + root->sectorsize; + lockend--; + len = lockend - lockstart + 1; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, + &cached_state); + + while (start < inode->i_size) { + em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + em = NULL; + break; + } + + if (whence == SEEK_HOLE && + (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; + else if (whence == SEEK_DATA && + (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; + + start = em->start + em->len; + free_extent_map(em); + em = NULL; + cond_resched(); + } + free_extent_map(em); + if (!ret) { + if (whence == SEEK_DATA && start >= inode->i_size) + ret = -ENXIO; + else + *offset = min_t(loff_t, start, inode->i_size); + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + return ret; +} + +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + mutex_lock(&inode->i_mutex); + switch (whence) { + case SEEK_END: + case SEEK_CUR: + offset = generic_file_llseek(file, offset, whence); + goto out; + case SEEK_DATA: + case SEEK_HOLE: + if (offset >= i_size_read(inode)) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + ret = find_desired_extent(inode, &offset, whence); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +const struct file_operations btrfs_file_operations = { + .llseek = btrfs_file_llseek, + .read_iter = generic_file_read_iter, + .splice_read = generic_file_splice_read, + .write_iter = btrfs_file_write_iter, + .mmap = btrfs_file_mmap, + .open = generic_file_open, + .release = btrfs_release_file, + .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif +}; + +void btrfs_auto_defrag_exit(void) +{ + if (btrfs_inode_defrag_cachep) + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. + */ + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + + return ret; +} diff --git a/kernel/fs/btrfs/free-space-cache.c b/kernel/fs/btrfs/free-space-cache.c new file mode 100644 index 000000000..9dbe5b548 --- /dev/null +++ b/kernel/fs/btrfs/free-space-cache.c @@ -0,0 +1,3653 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" +#include "disk-io.h" +#include "extent_io.h" +#include "inode-map.h" +#include "volumes.h" + +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) + +struct btrfs_trim_range { + u64 start; + u64 bytes; + struct list_head list; +}; + +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); + +static struct inode *__lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_path *path, + u64 offset) +{ + struct btrfs_key key; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode = NULL; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + btrfs_release_path(path); + return ERR_PTR(-ENOENT); + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_free_space_key(leaf, header, &disk_key); + btrfs_disk_key_to_cpu(&location, &disk_key); + btrfs_release_path(path); + + inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); + if (!inode) + return ERR_PTR(-ENOENT); + if (IS_ERR(inode)) + return inode; + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & + ~(__GFP_FS | __GFP_HIGHMEM)); + + return inode; +} + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path) +{ + struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + + spin_lock(&block_group->lock); + if (block_group->inode) + inode = igrab(block_group->inode); + spin_unlock(&block_group->lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(root, path, + block_group->key.objectid); + if (IS_ERR(inode)) + return inode; + + spin_lock(&block_group->lock); + if (!((BTRFS_I(inode)->flags & flags) == flags)) { + btrfs_info(root->fs_info, + "Old style space inode found, converting."); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; + block_group->disk_cache_state = BTRFS_DC_CLEAR; + } + + if (!block_group->iref) { + block_group->inode = igrab(inode); + block_group->iref = 1; + } + spin_unlock(&block_group->lock); + + return inode; +} + +static int __create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 ino, u64 offset) +{ + struct btrfs_key key; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; + int ret; + + ret = btrfs_insert_empty_inode(trans, root, path, ino); + if (ret) + return ret; + + /* We inline crc's for the free disk space cache */ + if (ino != BTRFS_FREE_INO_OBJECTID) + flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + btrfs_item_key(leaf, &disk_key, path->slots[0]); + memset_extent_buffer(leaf, 0, (unsigned long)inode_item, + sizeof(*inode_item)); + btrfs_set_inode_generation(leaf, inode_item, trans->transid); + btrfs_set_inode_size(leaf, inode_item, 0); + btrfs_set_inode_nbytes(leaf, inode_item, 0); + btrfs_set_inode_uid(leaf, inode_item, 0); + btrfs_set_inode_gid(leaf, inode_item, 0); + btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, inode_item, flags); + btrfs_set_inode_nlink(leaf, inode_item, 1); + btrfs_set_inode_transid(leaf, inode_item, trans->transid); + btrfs_set_inode_block_group(leaf, inode_item, offset); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_free_space_header)); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); + btrfs_set_free_space_key(leaf, header, &disk_key); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; +} + +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + int ret; + u64 ino; + + ret = btrfs_find_free_objectid(root, &ino); + if (ret < 0) + return ret; + + return __create_free_space_inode(root, trans, path, ino, + block_group->key.objectid); +} + +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + u64 needed_bytes; + int ret; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + + btrfs_calc_trans_metadata_size(root, 1); + + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct inode *inode) +{ + int ret = 0; + struct btrfs_path *path = btrfs_alloc_path(); + + if (!path) { + ret = -ENOMEM; + goto fail; + } + + if (block_group) { + mutex_lock(&trans->transaction->cache_write_mutex); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + } + + /* + * now that we've truncated the cache away, its no longer + * setup or written + */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + } + btrfs_free_path(path); + + btrfs_i_size_write(inode, 0); + truncate_pagecache(inode, 0); + + /* + * We don't need an orphan item because truncating the free space cache + * will never be split across transactions. + * We don't need to check for -EAGAIN because we're a free space + * cache inode + */ + ret = btrfs_truncate_inode_items(trans, root, inode, + 0, BTRFS_EXTENT_DATA_KEY); + if (ret) { + mutex_unlock(&trans->transaction->cache_write_mutex); + btrfs_abort_transaction(trans, root, ret); + return ret; + } + + ret = btrfs_update_inode(trans, root, inode); + + if (block_group) + mutex_unlock(&trans->transaction->cache_write_mutex); + +fail: + if (ret) + btrfs_abort_transaction(trans, root, ret); + + return ret; +} + +static int readahead_cache(struct inode *inode) +{ + struct file_ra_state *ra; + unsigned long last_index; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + file_ra_state_init(ra, inode->i_mapping); + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); + + kfree(ra); + + return 0; +} + +static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, + struct btrfs_root *root, int write) +{ + int num_pages; + int check_crcs = 0; + + num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + check_crcs = 1; + + /* Make sure we can fit our crcs into the first page */ + if (write && check_crcs && + (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) + return -ENOSPC; + + memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); + + io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); + if (!io_ctl->pages) + return -ENOMEM; + + io_ctl->num_pages = num_pages; + io_ctl->root = root; + io_ctl->check_crcs = check_crcs; + io_ctl->inode = inode; + + return 0; +} + +static void io_ctl_free(struct btrfs_io_ctl *io_ctl) +{ + kfree(io_ctl->pages); + io_ctl->pages = NULL; +} + +static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) +{ + if (io_ctl->cur) { + io_ctl->cur = NULL; + io_ctl->orig = NULL; + } +} + +static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) +{ + ASSERT(io_ctl->index < io_ctl->num_pages); + io_ctl->page = io_ctl->pages[io_ctl->index++]; + io_ctl->cur = page_address(io_ctl->page); + io_ctl->orig = io_ctl->cur; + io_ctl->size = PAGE_CACHE_SIZE; + if (clear) + memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) +{ + int i; + + io_ctl_unmap_page(io_ctl); + + for (i = 0; i < io_ctl->num_pages; i++) { + if (io_ctl->pages[i]) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } + } +} + +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, + int uptodate) +{ + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int i; + + for (i = 0; i < io_ctl->num_pages; i++) { + page = find_or_create_page(inode->i_mapping, i, mask); + if (!page) { + io_ctl_drop_pages(io_ctl); + return -ENOMEM; + } + io_ctl->pages[i] = page; + if (uptodate && !PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "error reading free space cache"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + } + } + + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + + return 0; +} + +static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) +{ + __le64 *val; + + io_ctl_map_page(io_ctl, 1); + + /* + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + val = io_ctl->cur; + *val = cpu_to_le64(generation); + io_ctl->cur += sizeof(u64); +} + +static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) +{ + __le64 *gen; + + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + gen = io_ctl->cur; + if (le64_to_cpu(*gen) != generation) { + printk_ratelimited(KERN_ERR "BTRFS: space cache generation " + "(%Lu) does not match inode (%Lu)\n", *gen, + generation); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + io_ctl->cur += sizeof(u64); + return 0; +} + +static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_unmap_page(io_ctl); + return; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + crc = btrfs_csum_data(io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = page_address(io_ctl->pages[0]); + tmp += index; + *tmp = crc; +} + +static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_map_page(io_ctl, 0); + return 0; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = page_address(io_ctl->pages[0]); + tmp += index; + val = *tmp; + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_csum_data(io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + if (val != crc) { + printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " + "space cache\n"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + + return 0; +} + +static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, + void *bitmap) +{ + struct btrfs_free_space_entry *entry; + + if (!io_ctl->cur) + return -ENOSPC; + + entry = io_ctl->cur; + entry->offset = cpu_to_le64(offset); + entry->bytes = cpu_to_le64(bytes); + entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : + BTRFS_FREE_SPACE_EXTENT; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + + /* No more pages to map */ + if (io_ctl->index >= io_ctl->num_pages) + return 0; + + /* map the next page */ + io_ctl_map_page(io_ctl, 1); + return 0; +} + +static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) +{ + if (!io_ctl->cur) + return -ENOSPC; + + /* + * If we aren't at the start of the current page, unmap this one and + * map the next one if there is any left. + */ + if (io_ctl->cur != io_ctl->orig) { + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index >= io_ctl->num_pages) + return -ENOSPC; + io_ctl_map_page(io_ctl, 0); + } + + memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); + return 0; +} + +static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) +{ + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. + */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); + + while (io_ctl->index < io_ctl->num_pages) { + io_ctl_map_page(io_ctl, 1); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + } +} + +static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) +{ + struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } + + e = io_ctl->cur; + entry->offset = le64_to_cpu(e->offset); + entry->bytes = le64_to_cpu(e->bytes); + *type = e->type; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_unmap_page(io_ctl); + + return 0; +} + +static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + int ret; + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + + return 0; +} + +/* + * Since we attach pinned extents after the fact we can have contiguous sections + * of free space that are split up in entries. This poses a problem with the + * tree logging stuff since it could have allocated across what appears to be 2 + * entries since we would have merged the entries when adding the pinned extents + * back to the free space cache. So run through the space cache that we just + * loaded and merge contiguous entries. This will make the log replay stuff not + * blow up and it will make for nicer allocator behavior. + */ +static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *e, *prev = NULL; + struct rb_node *n; + +again: + spin_lock(&ctl->tree_lock); + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + e = rb_entry(n, struct btrfs_free_space, offset_index); + if (!prev) + goto next; + if (e->bitmap || prev->bitmap) + goto next; + if (prev->offset + prev->bytes == e->offset) { + unlink_free_space(ctl, prev); + unlink_free_space(ctl, e); + prev->bytes += e->bytes; + kmem_cache_free(btrfs_free_space_cachep, e); + link_free_space(ctl, prev); + prev = NULL; + spin_unlock(&ctl->tree_lock); + goto again; + } +next: + prev = e; + } + spin_unlock(&ctl->tree_lock); +} + +static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_path *path, u64 offset) +{ + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct btrfs_io_ctl io_ctl; + struct btrfs_key key; + struct btrfs_free_space *e, *n; + LIST_HEAD(bitmaps); + u64 num_entries; + u64 num_bitmaps; + u64 generation; + u8 type; + int ret = 0; + + /* Nothing in the space cache, goodbye */ + if (!i_size_read(inode)) + return 0; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return 0; + else if (ret > 0) { + btrfs_release_path(path); + return 0; + } + + ret = -1; + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + num_entries = btrfs_free_space_entries(leaf, header); + num_bitmaps = btrfs_free_space_bitmaps(leaf, header); + generation = btrfs_free_space_generation(leaf, header); + btrfs_release_path(path); + + if (!BTRFS_I(inode)->generation) { + btrfs_info(root->fs_info, + "The free space cache file (%llu) is invalid. skip it\n", + offset); + return 0; + } + + if (BTRFS_I(inode)->generation != generation) { + btrfs_err(root->fs_info, + "free space inode generation (%llu) " + "did not match free space cache generation (%llu)", + BTRFS_I(inode)->generation, generation); + return 0; + } + + if (!num_entries) + return 0; + + ret = io_ctl_init(&io_ctl, inode, root, 0); + if (ret) + return ret; + + ret = readahead_cache(inode); + if (ret) + goto out; + + ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + if (ret) + goto out; + + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; + + ret = io_ctl_check_generation(&io_ctl, generation); + if (ret) + goto free_cache; + + while (num_entries) { + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) + goto free_cache; + + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + + if (!e->bytes) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + + if (type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + btrfs_err(root->fs_info, + "Duplicate entries in free space cache, dumping"); + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + } else { + ASSERT(num_bitmaps); + num_bitmaps--; + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); + goto free_cache; + } + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + btrfs_err(root->fs_info, + "Duplicate entries in free space cache, dumping"); + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + list_add_tail(&e->list, &bitmaps); + } + + num_entries--; + } + + io_ctl_unmap_page(&io_ctl); + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + list_for_each_entry_safe(e, n, &bitmaps, list) { + list_del_init(&e->list); + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; + } + + io_ctl_drop_pages(&io_ctl); + merge_space_tree(ctl); + ret = 1; +out: + io_ctl_free(&io_ctl); + return ret; +free_cache: + io_ctl_drop_pages(&io_ctl); + __btrfs_remove_free_space_cache(ctl); + goto out; +} + +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode; + struct btrfs_path *path; + int ret = 0; + bool matched; + u64 used = btrfs_block_group_used(&block_group->item); + + /* + * If this block group has been marked to be cleared for one reason or + * another then we can't trust the on disk cache, so just return. + */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + path = btrfs_alloc_path(); + if (!path) + return 0; + path->search_commit_root = 1; + path->skip_locking = 1; + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) { + btrfs_free_path(path); + return 0; + } + + /* We may have converted the inode and made the cache invalid. */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + btrfs_free_path(path); + goto out; + } + spin_unlock(&block_group->lock); + + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, + path, block_group->key.objectid); + btrfs_free_path(path); + if (ret <= 0) + goto out; + + spin_lock(&ctl->tree_lock); + matched = (ctl->free_space == (block_group->key.offset - used - + block_group->bytes_super)); + spin_unlock(&ctl->tree_lock); + + if (!matched) { + __btrfs_remove_free_space_cache(ctl); + btrfs_warn(fs_info, "block group %llu has wrong amount of free space", + block_group->key.objectid); + ret = -1; + } +out: + if (ret < 0) { + /* This cache is bogus, make sure it gets cleared */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + ret = 0; + + btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", + block_group->key.objectid); + } + + iput(inode); + return ret; +} + +static noinline_for_stack +int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + int *entries, int *bitmaps, + struct list_head *bitmap_list) +{ + int ret; + struct btrfs_free_cluster *cluster = NULL; + struct btrfs_free_cluster *cluster_locked = NULL; + struct rb_node *node = rb_first(&ctl->free_space_offset); + struct btrfs_trim_range *trim_entry; + + /* Get the cluster for this block_group if it exists */ + if (block_group && !list_empty(&block_group->cluster_list)) { + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + } + + if (!node && cluster) { + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); + node = rb_first(&cluster->root); + cluster = NULL; + } + + /* Write out the extent entries */ + while (node) { + struct btrfs_free_space *e; + + e = rb_entry(node, struct btrfs_free_space, offset_index); + *entries += 1; + + ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, + e->bitmap); + if (ret) + goto fail; + + if (e->bitmap) { + list_add_tail(&e->list, bitmap_list); + *bitmaps += 1; + } + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); + cluster = NULL; + } + } + if (cluster_locked) { + spin_unlock(&cluster_locked->lock); + cluster_locked = NULL; + } + + /* + * Make sure we don't miss any range that was removed from our rbtree + * because trimming is running. Otherwise after a umount+mount (or crash + * after committing the transaction) we would leak free space and get + * an inconsistent free space cache report from fsck. + */ + list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { + ret = io_ctl_add_entry(io_ctl, trim_entry->start, + trim_entry->bytes, NULL); + if (ret) + goto fail; + *entries += 1; + } + + return 0; +fail: + if (cluster_locked) + spin_unlock(&cluster_locked->lock); + return -ENOSPC; +} + +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, u64 offset, + int entries, int bitmaps) +{ + struct btrfs_key key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + goto fail; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + ASSERT(path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); + btrfs_release_path(path); + goto fail; + } + } + + BTRFS_I(inode)->generation = trans->transid; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; + +fail: + return -1; +} + +static noinline_for_stack int +write_pinned_extent_entries(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + int *entries) +{ + u64 start, extent_start, extent_end, len; + struct extent_io_tree *unpin = NULL; + int ret; + + if (!block_group) + return 0; + + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + * + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + start = block_group->key.objectid; + + while (start < block_group->key.objectid + block_group->key.offset) { + ret = find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL); + if (ret) + return 0; + + /* This pinned extent is out of our range */ + if (extent_start >= block_group->key.objectid + + block_group->key.offset) + return 0; + + extent_start = max(extent_start, start); + extent_end = min(block_group->key.objectid + + block_group->key.offset, extent_end + 1); + len = extent_end - extent_start; + + *entries += 1; + ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); + if (ret) + return -ENOSPC; + + start = extent_end; + } + + return 0; +} + +static noinline_for_stack int +write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + int ret; + + /* Write out the bitmaps */ + list_for_each_safe(pos, n, bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + + ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); + if (ret) + return -ENOSPC; + list_del_init(&entry->list); + } + + return 0; +} + +static int flush_dirty_cache(struct inode *inode) +{ + int ret; + + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + + return ret; +} + +static void noinline_for_stack +cleanup_bitmap_list(struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + + list_for_each_safe(pos, n, bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); + } +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct btrfs_io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ + io_ctl_drop_pages(io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, cached_state, + GFP_NOFS); +} + +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset) +{ + int ret; + struct inode *inode = io_ctl->inode; + + if (!inode) + return 0; + + if (block_group) + root = root->fs_info->tree_root; + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; + + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + io_ctl->entries, io_ctl->bitmaps); +out: + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + if (block_group) { +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); +#endif + } + } + btrfs_update_inode(trans, root, inode); + + if (block_group) { + /* the dirty list is protected by the dirty_bgs_lock */ + spin_lock(&trans->transaction->dirty_bgs_lock); + + /* the disk_cache_state is protected by the block group lock */ + spin_lock(&block_group->lock); + + /* + * only mark this as written if we didn't get put back on + * the dirty list while waiting for IO. Otherwise our + * cache state won't be right, and we won't get written again + */ + if (!ret && list_empty(&block_group->dirty_list)) + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + else if (ret) + block_group->disk_cache_state = BTRFS_DC_ERROR; + + spin_unlock(&block_group->lock); + spin_unlock(&trans->transaction->dirty_bgs_lock); + io_ctl->inode = NULL; + iput(inode); + } + + return ret; + +} + +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * or an errno if it was not. + */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) +{ + struct extent_state *cached_state = NULL; + LIST_HEAD(bitmap_list); + int entries = 0; + int bitmaps = 0; + int ret; + int must_iput = 0; + + if (!i_size_read(inode)) + return -EIO; + + WARN_ON(io_ctl->pages); + ret = io_ctl_init(io_ctl, inode, root, 1); + if (ret) + return ret; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { + down_write(&block_group->data_rwsem); + spin_lock(&block_group->lock); + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + up_write(&block_group->data_rwsem); + BTRFS_I(inode)->generation = 0; + ret = 0; + must_iput = 1; + goto out; + } + spin_unlock(&block_group->lock); + } + + /* Lock all pages first so we can lock the extent safely. */ + ret = io_ctl_prepare_pages(io_ctl, inode, 0); + if (ret) + goto out; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state); + + io_ctl_set_generation(io_ctl, trans->transid); + + mutex_lock(&ctl->cache_writeout_mutex); + /* Write out the extent entries in the free space cache */ + spin_lock(&ctl->tree_lock); + ret = write_cache_extent_entries(io_ctl, ctl, + block_group, &entries, &bitmaps, + &bitmap_list); + if (ret) + goto out_nospc_locked; + + /* + * Some spaces that are freed in the current transaction are pinned, + * they will be added into free space cache after the transaction is + * committed, we shouldn't lose them. + * + * If this changes while we are working we'll get added back to + * the dirty list and redo it. No locking needed + */ + ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries); + if (ret) + goto out_nospc_locked; + + /* + * At last, we write out all the bitmaps and keep cache_writeout_mutex + * locked while doing it because a concurrent trim can be manipulating + * or freeing the bitmap. + */ + ret = write_bitmap_entries(io_ctl, &bitmap_list); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + if (ret) + goto out_nospc; + + /* Zero out the rest of the pages just to make sure */ + io_ctl_zero_remaining_pages(io_ctl); + + /* Everything is written out, now we dirty the pages in the file. */ + ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages, + 0, i_size_read(inode), &cached_state); + if (ret) + goto out_nospc; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + /* + * Release the pages and unlock the extent, we will flush + * them out later + */ + io_ctl_drop_pages(io_ctl); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + /* + * at this point the pages are under IO and we're happy, + * The caller is responsible for waiting on them and updating the + * the cache and the inode + */ + io_ctl->entries = entries; + io_ctl->bitmaps = bitmaps; + + ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); + if (ret) + goto out; + + return 0; + +out: + io_ctl->inode = NULL; + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); + return ret; + +out_nospc_locked: + cleanup_bitmap_list(&bitmap_list); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + +out_nospc: + cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list); + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + + goto out; +} + +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct inode *inode; + int ret = 0; + + root = root->fs_info->tree_root; + + spin_lock(&block_group->lock); + if (block_group->disk_cache_state < BTRFS_DC_SETUP) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) + return 0; + + ret = __btrfs_write_out_cache(root, inode, ctl, block_group, + &block_group->io_ctl, trans, + path, block_group->key.objectid); + if (ret) { +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); +#endif + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + + block_group->io_ctl.inode = NULL; + iput(inode); + } + + /* + * if ret == 0 the caller is expected to call btrfs_wait_cache_io + * to wait for IO and put the inode + */ + + return ret; +} + +static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, + u64 offset) +{ + ASSERT(offset >= bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div_u64(offset, unit)); +} + +static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) +{ + return (unsigned long)(div_u64(bytes, unit)); +} + +static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, + u64 offset) +{ + u64 bitmap_start; + u32 bytes_per_bitmap; + + bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; + bitmap_start = offset - ctl->start; + bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += ctl->start; + + return bitmap_start; +} + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node, int bitmap) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) { + p = &(*p)->rb_left; + } else if (offset > info->offset) { + p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (bitmap) { + if (info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } + p = &(*p)->rb_right; + } else { + if (!info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } + p = &(*p)->rb_left; + } + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. + * + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. + */ +static struct btrfs_free_space * +tree_search_offset(struct btrfs_free_space_ctl *ctl, + u64 offset, int bitmap_only, int fuzzy) +{ + struct rb_node *n = ctl->free_space_offset.rb_node; + struct btrfs_free_space *entry, *prev = NULL; + + /* find entry that is closest to the 'offset' */ + while (1) { + if (!n) { + entry = NULL; + break; + } + + entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; + + if (offset < entry->offset) + n = n->rb_left; + else if (offset > entry->offset) + n = n->rb_right; + else + break; + } + + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; + + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. + */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; + + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { + /* + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry + */ + n = rb_prev(&entry->offset_index); + if (n) { + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + entry = prev; + } + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + ASSERT(entry->offset <= offset); + } else { + if (fuzzy) + return entry; + else + return NULL; + } + } + + if (entry->bitmap) { + n = rb_prev(&entry->offset_index); + if (n) { + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + return prev; + } + if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + ctl->unit > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; +} + +static inline void +__unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &ctl->free_space_offset); + ctl->free_extents--; +} + +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + __unlink_free_space(ctl, info); + ctl->free_space -= info->bytes; +} + +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + int ret = 0; + + ASSERT(info->bytes || info->bitmap); + ret = tree_insert_offset(&ctl->free_space_offset, info->offset, + &info->offset_index, (info->bitmap != NULL)); + if (ret) + return ret; + + ctl->free_space += info->bytes; + ctl->free_extents++; + return ret; +} + +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_block_group_cache *block_group = ctl->private; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; + u64 size = block_group->key.offset; + u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); + + max_bitmaps = max_t(u32, max_bitmaps, 1); + + ASSERT(ctl->total_bitmaps <= max_bitmaps); + + /* + * The goal is to keep the total amount of memory used per 1gb of space + * at or below 32k, so we need to adjust how much memory we allow to be + * used by extent based free space tracking + */ + if (size < 1024 * 1024 * 1024) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * + div_u64(size, 1024 * 1024 * 1024); + + /* + * we want to account for 1 more bitmap than what we have so we can make + * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as + * we add more bitmaps. + */ + bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE; + + if (bitmap_bytes >= max_bytes) { + ctl->extents_thresh = 0; + return; + } + + /* + * we want the extent entry threshold to always be at most 1/2 the max + * bytes we can have, or whatever is less than that. + */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); + + ctl->extents_thresh = + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); +} + +static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes) +{ + unsigned long start, count; + + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + ASSERT(start + count <= BITS_PER_BITMAP); + + bitmap_clear(info->bitmap, start, count); + + info->bytes -= bytes; +} + +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + __bitmap_clear_bits(ctl, info, offset, bytes); + ctl->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, count; + + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + ASSERT(start + count <= BITS_PER_BITMAP); + + bitmap_set(info->bitmap, start, count); + + info->bytes += bytes; + ctl->free_space += bytes; +} + +/* + * If we can not find suitable extent, we will use bytes to record + * the size of the max extent. + */ +static int search_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes) +{ + unsigned long found_bits = 0; + unsigned long max_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + unsigned long extent_bits; + + i = offset_to_bit(bitmap_info->offset, ctl->unit, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, ctl->unit); + + for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + extent_bits = next_zero - i; + if (extent_bits >= bits) { + found_bits = extent_bits; + break; + } else if (extent_bits > max_bits) { + max_bits = extent_bits; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * ctl->unit) + bitmap_info->offset; + *bytes = (u64)(found_bits) * ctl->unit; + return 0; + } + + *bytes = (u64)(max_bits) * ctl->unit; + return -1; +} + +/* Cache the size of the max extent in bytes */ +static struct btrfs_free_space * +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + unsigned long align, u64 *max_extent_size) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + u64 tmp; + u64 align_off; + int ret; + + if (!ctl->free_space_offset.rb_node) + goto out; + + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); + if (!entry) + goto out; + + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) { + if (entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; + continue; + } + + /* make sure the space returned is big enough + * to match our requested alignment + */ + if (*bytes >= align) { + tmp = entry->offset - ctl->start + align - 1; + tmp = div64_u64(tmp, align); + tmp = tmp * align + ctl->start; + align_off = tmp - entry->offset; + } else { + align_off = 0; + tmp = entry->offset; + } + + if (entry->bytes < *bytes + align_off) { + if (entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; + continue; + } + + if (entry->bitmap) { + u64 size = *bytes; + + ret = search_bitmap(ctl, entry, &tmp, &size); + if (!ret) { + *offset = tmp; + *bytes = size; + return entry; + } else if (size > *max_extent_size) { + *max_extent_size = size; + } + continue; + } + + *offset = tmp; + *bytes = entry->bytes - align_off; + return entry; + } +out: + return NULL; +} + +static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset) +{ + info->offset = offset_to_bitmap(ctl, offset); + info->bytes = 0; + INIT_LIST_HEAD(&info->list); + link_free_space(ctl, info); + ctl->total_bitmaps++; + + ctl->op->recalc_thresholds(ctl); +} + +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info) +{ + unlink_free_space(ctl, bitmap_info); + kfree(bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); +} + +static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, + u64 *offset, u64 *bytes) +{ + u64 end; + u64 search_start, search_bytes; + int ret; + +again: + end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; + + /* + * We need to search for bits in this bitmap. We could only cover some + * of the extent in this bitmap thanks to how we add space, so we need + * to search for as much as it as we can and clear that amount, and then + * go searching for the next bit. + */ + search_start = *offset; + search_bytes = ctl->unit; + search_bytes = min(search_bytes, end - search_start + 1); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + if (ret < 0 || search_start != *offset) + return -EINVAL; + + /* We may have found more bits than what we need */ + search_bytes = min(search_bytes, *bytes); + + /* Cannot clear past the end of the bitmap */ + search_bytes = min(search_bytes, end - search_start + 1); + + bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); + *offset += search_bytes; + *bytes -= search_bytes; + + if (*bytes) { + struct rb_node *next = rb_next(&bitmap_info->offset_index); + if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); + + /* + * no entry after this bitmap, but we still have bytes to + * remove, so something has gone wrong. + */ + if (!next) + return -EINVAL; + + bitmap_info = rb_entry(next, struct btrfs_free_space, + offset_index); + + /* + * if the next entry isn't a bitmap we need to return to let the + * extent stuff do its work. + */ + if (!bitmap_info->bitmap) + return -EAGAIN; + + /* + * Ok the next item is a bitmap, but it may not actually hold + * the information for the rest of this free space stuff, so + * look for it, and if we don't find it return so we can try + * everything over again. + */ + search_start = *offset; + search_bytes = ctl->unit; + ret = search_bitmap(ctl, bitmap_info, &search_start, + &search_bytes); + if (ret < 0 || search_start != *offset) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); + + return 0; +} + +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + u64 bytes_to_set = 0; + u64 end; + + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); + + bytes_to_set = min(end - offset, bytes); + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + + return bytes_to_set; + +} + +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_block_group_cache *block_group = ctl->private; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (ctl->free_extents < ctl->extents_thresh) { + /* + * If this block group has some small extents we don't want to + * use up all of our free slots in the cache with them, we want + * to reserve them to larger extents, however if we have plent + * of cache left then go ahead an dadd them, no sense in adding + * the overhead of a bitmap if we don't have to. + */ + if (info->bytes <= block_group->sectorsize * 4) { + if (ctl->free_extents * 2 <= ctl->extents_thresh) + return false; + } else { + return false; + } + } + + /* + * The original block groups from mkfs can be really small, like 8 + * megabytes, so don't bother with a bitmap for those entries. However + * some block groups can be smaller than what a bitmap would cover but + * are still large enough that they could overflow the 32k memory limit, + * so allow those block groups to still be allowed to have a bitmap + * entry. + */ + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) + return false; + + return true; +} + +static struct btrfs_free_space_op free_space_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + struct btrfs_block_group_cache *block_group = NULL; + int added = 0; + u64 bytes, offset, bytes_added; + int ret; + + bytes = info->bytes; + offset = info->offset; + + if (!ctl->op->use_bitmap(ctl, info)) + return 0; + + if (ctl->op == &free_space_op) + block_group = ctl->private; +again: + /* + * Since we link bitmaps right into the cluster we need to see if we + * have a cluster here, and if so and it has our bitmap we need to add + * the free space to that bitmap. + */ + if (block_group && !list_empty(&block_group->cluster_list)) { + struct btrfs_free_cluster *cluster; + struct rb_node *node; + struct btrfs_free_space *entry; + + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + spin_lock(&cluster->lock); + node = rb_first(&cluster->root); + if (!node) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (!entry->bitmap) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + if (entry->offset == offset_to_bitmap(ctl, offset)) { + bytes_added = add_bytes_to_bitmap(ctl, entry, + offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + } + spin_unlock(&cluster->lock); + if (!bytes) { + ret = 1; + goto out; + } + } + +no_cluster_bitmap: + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + ASSERT(added == 0); + goto new_bitmap; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + added = 0; + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(ctl, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&ctl->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!info) { + spin_lock(&ctl->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + spin_lock(&ctl->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kfree(info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, info); + } + + return ret; +} + +static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, bool update_stat) +{ + struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(ctl, offset + bytes, 0, 0); + if (right_info && rb_prev(&right_info->offset_index)) + left_info = rb_entry(rb_prev(&right_info->offset_index), + struct btrfs_free_space, offset_index); + else + left_info = tree_search_offset(ctl, offset - 1, 0, 0); + + if (right_info && !right_info->bitmap) { + if (update_stat) + unlink_free_space(ctl, right_info); + else + __unlink_free_space(ctl, right_info); + info->bytes += right_info->bytes; + kmem_cache_free(btrfs_free_space_cachep, right_info); + merged = true; + } + + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset) { + if (update_stat) + unlink_free_space(ctl, left_info); + else + __unlink_free_space(ctl, left_info); + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kmem_cache_free(btrfs_free_space_cachep, left_info); + merged = true; + } + + return merged; +} + +static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + struct btrfs_free_space *bitmap; + unsigned long i; + unsigned long j; + const u64 end = info->offset + info->bytes; + const u64 bitmap_offset = offset_to_bitmap(ctl, end); + u64 bytes; + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap->offset, ctl->unit, end); + j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); + if (j == i) + return false; + bytes = (j - i) * ctl->unit; + info->bytes += bytes; + + if (update_stat) + bitmap_clear_bits(ctl, bitmap, end, bytes); + else + __bitmap_clear_bits(ctl, bitmap, end, bytes); + + if (!bitmap->bytes) + free_bitmap(ctl, bitmap); + + return true; +} + +static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + struct btrfs_free_space *bitmap; + u64 bitmap_offset; + unsigned long i; + unsigned long j; + unsigned long prev_j; + u64 bytes; + + bitmap_offset = offset_to_bitmap(ctl, info->offset); + /* If we're on a boundary, try the previous logical bitmap. */ + if (bitmap_offset == info->offset) { + if (info->offset == 0) + return false; + bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); + } + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; + j = 0; + prev_j = (unsigned long)-1; + for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { + if (j > i) + break; + prev_j = j; + } + if (prev_j == i) + return false; + + if (prev_j == (unsigned long)-1) + bytes = (i + 1) * ctl->unit; + else + bytes = (i - prev_j) * ctl->unit; + + info->offset -= bytes; + info->bytes += bytes; + + if (update_stat) + bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + else + __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + + if (!bitmap->bytes) + free_bitmap(ctl, bitmap); + + return true; +} + +/* + * We prefer always to allocate from extent entries, both for clustered and + * non-clustered allocation requests. So when attempting to add a new extent + * entry, try to see if there's adjacent free space in bitmap entries, and if + * there is, migrate that space from the bitmaps to the extent. + * Like this we get better chances of satisfying space allocation requests + * because we attempt to satisfy them based on a single cache entry, and never + * on 2 or more entries - even if the entries represent a contiguous free space + * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry + * ends). + */ +static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + /* + * Only work with disconnected entries, as we can change their offset, + * and must be extent entries. + */ + ASSERT(!info->bitmap); + ASSERT(RB_EMPTY_NODE(&info->offset_index)); + + if (ctl->total_bitmaps > 0) { + bool stole_end; + bool stole_front = false; + + stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); + if (ctl->total_bitmaps > 0) + stole_front = steal_from_bitmap_to_front(ctl, info, + update_stat); + + if (stole_end || stole_front) + try_merge_free_space(ctl, info, update_stat); + } +} + +int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + RB_CLEAR_NODE(&info->offset_index); + + spin_lock(&ctl->tree_lock); + + if (try_merge_free_space(ctl, info, true)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(ctl, info); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } +link: + /* + * Only steal free space from adjacent bitmaps if we're sure we're not + * going to add the new free space to existing bitmap entries - because + * that would mean unnecessary work that would be reverted. Therefore + * attempt to steal space from bitmaps if we're adding an extent entry. + */ + steal_from_bitmap(ctl, info, true); + + ret = link_free_space(ctl, info); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); +out: + spin_unlock(&ctl->tree_lock); + + if (ret) { + printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); + ASSERT(ret != -EEXIST); + } + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + int ret; + bool re_search = false; + + spin_lock(&ctl->tree_lock); + +again: + ret = 0; + if (!bytes) + goto out_lock; + + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + /* + * oops didn't find an extent that matched the space we wanted + * to remove, look for a bitmap instead + */ + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) { + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. + */ + WARN_ON(re_search); + goto out_lock; + } + } + + re_search = false; + if (!info->bitmap) { + unlink_free_space(ctl, info); + if (offset == info->offset) { + u64 to_free = min(bytes, info->bytes); + + info->bytes -= to_free; + info->offset += to_free; + if (info->bytes) { + ret = link_free_space(ctl, info); + WARN_ON(ret); + } else { + kmem_cache_free(btrfs_free_space_cachep, info); + } + + offset += to_free; + bytes -= to_free; + goto again; + } else { + u64 old_end = info->bytes + info->offset; + + info->bytes = offset - info->offset; + ret = link_free_space(ctl, info); + WARN_ON(ret); + if (ret) + goto out_lock; + + /* Not enough bytes in this entry to satisfy us */ + if (old_end < offset + bytes) { + bytes -= old_end - offset; + offset = old_end; + goto again; + } else if (old_end == offset + bytes) { + /* all done */ + goto out_lock; + } + spin_unlock(&ctl->tree_lock); + + ret = btrfs_add_free_space(block_group, offset + bytes, + old_end - (offset + bytes)); + WARN_ON(ret); + goto out; + } + } + + ret = remove_from_bitmap(ctl, info, &offset, &bytes); + if (ret == -EAGAIN) { + re_search = true; + goto again; + } +out_lock: + spin_unlock(&ctl->tree_lock); +out: + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes && !block_group->ro) + count++; + btrfs_crit(block_group->fs_info, + "entry offset %llu, bytes %llu, bitmap %s", + info->offset, info->bytes, + (info->bitmap) ? "yes" : "no"); + } + btrfs_info(block_group->fs_info, "block group has cluster?: %s", + list_empty(&block_group->cluster_list) ? "no" : "yes"); + btrfs_info(block_group->fs_info, + "%d blocks of free space at or bigger than bytes is", count); +} + +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = block_group->sectorsize; + ctl->start = block_group->key.objectid; + ctl->private = block_group; + ctl->op = &free_space_op; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + ctl->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); +} + +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + cluster->block_group = NULL; + cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + + node = rb_first(&cluster->root); + while (node) { + bool bitmap; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + RB_CLEAR_NODE(&entry->offset_index); + + bitmap = (entry->bitmap != NULL); + if (!bitmap) { + try_merge_free_space(ctl, entry, false); + steal_from_bitmap(ctl, entry, false); + } + tree_insert_offset(&ctl->free_space_offset, + entry->offset, &entry->offset_index, bitmap); + } + cluster->root = RB_ROOT; + +out: + spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); + return 0; +} + +static void __btrfs_remove_free_space_cache_locked( + struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + + cond_resched_lock(&ctl->tree_lock); + } +} + +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + spin_lock(&ctl->tree_lock); + __btrfs_remove_free_space_cache_locked(ctl); + spin_unlock(&ctl->tree_lock); +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_cluster *cluster; + struct list_head *head; + + spin_lock(&ctl->tree_lock); + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + + cond_resched_lock(&ctl->tree_lock); + } + __btrfs_remove_free_space_cache_locked(ctl); + spin_unlock(&ctl->tree_lock); + +} + +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size, + u64 *max_extent_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; + u64 ret = 0; + u64 align_gap = 0; + u64 align_gap_len = 0; + + spin_lock(&ctl->tree_lock); + entry = find_free_space(ctl, &offset, &bytes_search, + block_group->full_stripe_len, max_extent_size); + if (!entry) + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(ctl, entry, offset, bytes); + if (!entry->bytes) + free_bitmap(ctl, entry); + } else { + unlink_free_space(ctl, entry); + align_gap_len = offset - entry->offset; + align_gap = entry->offset; + + entry->offset = offset + bytes; + WARN_ON(entry->bytes < bytes + align_gap_len); + + entry->bytes -= bytes + align_gap_len; + if (!entry->bytes) + kmem_cache_free(btrfs_free_space_cachep, entry); + else + link_free_space(ctl, entry); + } +out: + spin_unlock(&ctl->tree_lock); + + if (align_gap_len) + __btrfs_add_free_space(ctl, align_gap, align_gap_len); + return ret; +} + +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. + * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. + */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space_ctl *ctl; + int ret; + + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); + + ctl = block_group->free_space_ctl; + + /* now return any extents the cluster had on it */ + spin_lock(&ctl->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&ctl->tree_lock); + + /* finally drop our ref */ + btrfs_put_block_group(block_group); + return ret; +} + +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct btrfs_free_space *entry, + u64 bytes, u64 min_start, + u64 *max_extent_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(ctl, entry, &search_start, &search_bytes); + if (err) { + if (search_bytes > *max_extent_size) + *max_extent_size = search_bytes; + return 0; + } + + ret = search_start; + __bitmap_clear_bits(ctl, entry, ret, bytes); + + return ret; +} + +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start, u64 *max_extent_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; + + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + while (1) { + if (entry->bytes < bytes && entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; + + if (entry->bytes < bytes || + (!entry->bitmap && entry->offset < min_start)) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + + if (entry->bitmap) { + ret = btrfs_alloc_from_bitmap(block_group, + cluster, entry, bytes, + cluster->window_start, + max_extent_size); + if (ret == 0) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + cluster->window_start += bytes; + } else { + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + } + + if (entry->bytes == 0) + rb_erase(&entry->offset_index, &cluster->root); + break; + } +out: + spin_unlock(&cluster->lock); + + if (!ret) + return 0; + + spin_lock(&ctl->tree_lock); + + ctl->free_space -= bytes; + if (entry->bytes == 0) { + ctl->free_extents--; + if (entry->bitmap) { + kfree(entry->bitmap); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); + } + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&ctl->tree_lock); + + return ret; +} + +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + unsigned long next_zero; + unsigned long i; + unsigned long want_bits; + unsigned long min_bits; + unsigned long found_bits; + unsigned long start = 0; + unsigned long total_found = 0; + int ret; + + i = offset_to_bit(entry->offset, ctl->unit, + max_t(u64, offset, entry->offset)); + want_bits = bytes_to_bits(bytes, ctl->unit); + min_bits = bytes_to_bits(min_bytes, ctl->unit); + +again: + found_bits = 0; + for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= min_bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (!found_bits) + return -ENOSPC; + + if (!total_found) { + start = i; + cluster->max_size = 0; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * ctl->unit) + cluster->max_size = found_bits * ctl->unit; + + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; + goto again; + } + + cluster->window_start = start * ctl->unit + entry->offset; + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 1); + ASSERT(!ret); /* -EEXIST; Logic error */ + + trace_btrfs_setup_cluster(block_group, cluster, + total_found * ctl->unit, 1); + return 0; +} + +/* + * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. + */ +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *first = NULL; + struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *last; + struct rb_node *node; + u64 window_free; + u64 max_extent; + u64 total_size = 0; + + entry = tree_search_offset(ctl, offset, 0, 1); + if (!entry) + return -ENOSPC; + + /* + * We don't want bitmaps, so just move along until we find a normal + * extent entry. + */ + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + window_free = entry->bytes; + max_extent = entry->bytes; + first = entry; + last = entry; + + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + if (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + continue; + } + + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + + cluster->window_start = first->offset; + + node = &first->offset_index; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + */ + do { + int ret; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (entry->bitmap || entry->bytes < min_bytes) + continue; + + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 0); + total_size += entry->bytes; + ASSERT(!ret); /* -EEXIST; Logic error */ + } while (node && entry != last); + + cluster->max_size = max_extent; + trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); + return 0; +} + +/* + * This specifically looks for bitmaps that may work in the cluster, we assume + * that we have already failed to find extents that will work. + */ +static noinline int +setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); + + if (ctl->total_bitmaps == 0) + return -ENOSPC; + + /* + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. + */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + + list_for_each_entry(entry, bitmaps, list) { + if (entry->bytes < bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, cont1_bytes, min_bytes); + if (!ret) + return 0; + } + + /* + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. + */ + return -ENOSPC; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes+empty_size. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); + u64 min_bytes; + u64 cont1_bytes; + int ret; + + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. + */ + if (btrfs_test_opt(root, SSD_SPREAD)) { + cont1_bytes = min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + cont1_bytes = bytes; + min_bytes = block_group->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = block_group->sectorsize; + } + + spin_lock(&ctl->tree_lock); + + /* + * If we know we don't have enough space to make a cluster don't even + * bother doing all the work to try and find one. + */ + if (ctl->free_space < bytes) { + spin_unlock(&ctl->tree_lock); + return -ENOSPC; + } + + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } + + trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, + min_bytes); + + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes + empty_size, + cont1_bytes, min_bytes); + if (ret) + ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, + offset, bytes + empty_size, + cont1_bytes, min_bytes); + + /* Clear our temporary list */ + list_for_each_entry_safe(entry, tmp, &bitmaps, list) + list_del_init(&entry->list); + + if (!ret) { + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, + &block_group->cluster_list); + cluster->block_group = block_group; + } else { + trace_btrfs_failed_cluster_setup(block_group); + } +out: + spin_unlock(&cluster->lock); + spin_unlock(&ctl->tree_lock); + + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root = RB_ROOT; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + +static int do_trimming(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 bytes, + u64 reserved_start, u64 reserved_bytes, + struct btrfs_trim_range *trim_entry) +{ + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + int ret; + int update = 0; + u64 trimmed = 0; + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += reserved_bytes; + space_info->bytes_reserved += reserved_bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); + if (!ret) + *total_trimmed += trimmed; + + mutex_lock(&ctl->cache_writeout_mutex); + btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + list_del(&trim_entry->list); + mutex_unlock(&ctl->cache_writeout_mutex); + + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += reserved_bytes; + block_group->reserved -= reserved_bytes; + space_info->bytes_reserved -= reserved_bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } + + return ret; +} + +static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = 0; + u64 extent_start; + u64 extent_bytes; + u64 bytes; + + while (start < end) { + struct btrfs_trim_range trim_entry; + + mutex_lock(&ctl->cache_writeout_mutex); + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + break; + } + + entry = tree_search_offset(ctl, start, 0, 1); + if (!entry) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + break; + } + + /* skip bitmaps */ + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto out; + } + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + } + + if (entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + break; + } + + extent_start = entry->offset; + extent_bytes = entry->bytes; + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } + + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + + spin_unlock(&ctl->tree_lock); + trim_entry.start = extent_start; + trim_entry.bytes = extent_bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + extent_start, extent_bytes, &trim_entry); + if (ret) + break; +next: + start += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } +out: + return ret; +} + +static int trim_bitmaps(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = 0; + int ret2; + u64 bytes; + u64 offset = offset_to_bitmap(ctl, start); + + while (offset < end) { + bool next_bitmap = false; + struct btrfs_trim_range trim_entry; + + mutex_lock(&ctl->cache_writeout_mutex); + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + break; + } + + entry = tree_search_offset(ctl, offset, 1, 0); + if (!entry) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + next_bitmap = true; + goto next; + } + + bytes = minlen; + ret2 = search_bitmap(ctl, entry, &start, &bytes); + if (ret2 || start >= end) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + next_bitmap = true; + goto next; + } + + bytes = min(bytes, end - start); + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } + + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + + spin_unlock(&ctl->tree_lock); + trim_entry.start = start; + trim_entry.bytes = bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + start, bytes, &trim_entry); + if (ret) + break; +next: + if (next_bitmap) { + offset += BITS_PER_BITMAP * ctl->unit; + } else { + start += bytes; + if (start >= offset + BITS_PER_BITMAP * ctl->unit) + offset += BITS_PER_BITMAP * ctl->unit; + } + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + atomic_inc(&block_group->trimming); + spin_unlock(&block_group->lock); + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + goto out; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + spin_lock(&block_group->lock); + if (atomic_dec_and_test(&block_group->trimming) && + block_group->removed) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + spin_unlock(&block_group->lock); + + lock_chunks(block_group->fs_info->chunk_root); + em_tree = &block_group->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->key.objectid, + 1); + BUG_ON(!em); /* logic error, can't happen */ + /* + * remove_extent_mapping() will delete us from the pinned_chunks + * list, which is protected by the chunk mutex. + */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + unlock_chunks(block_group->fs_info->chunk_root); + + /* once for us and once for the tree */ + free_extent_map(em); + free_extent_map(em); + + /* + * We've left one free space entry and other tasks trimming + * this block group have left 1 entry each one. Free them. + */ + __btrfs_remove_free_space_cache(block_group->free_space_ctl); + } else { + spin_unlock(&block_group->lock); + } + + return ret; +} + +/* + * Find the left-most item in the cache tree, and then return the + * smallest inode number in the item. + * + * Note: the returned inode number may not be the smallest one in + * the tree, if the left-most item is a bitmap. + */ +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) +{ + struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; + struct btrfs_free_space *entry = NULL; + u64 ino = 0; + + spin_lock(&ctl->tree_lock); + + if (RB_EMPTY_ROOT(&ctl->free_space_offset)) + goto out; + + entry = rb_entry(rb_first(&ctl->free_space_offset), + struct btrfs_free_space, offset_index); + + if (!entry->bitmap) { + ino = entry->offset; + + unlink_free_space(ctl, entry); + entry->offset++; + entry->bytes--; + if (!entry->bytes) + kmem_cache_free(btrfs_free_space_cachep, entry); + else + link_free_space(ctl, entry); + } else { + u64 offset = 0; + u64 count = 1; + int ret; + + ret = search_bitmap(ctl, entry, &offset, &count); + /* Logic error; Should be empty if it can't find anything */ + ASSERT(!ret); + + ino = offset; + bitmap_clear_bits(ctl, entry, offset, 1); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + } +out: + spin_unlock(&ctl->tree_lock); + + return ino; +} + +struct inode *lookup_free_ino_inode(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct inode *inode = NULL; + + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_inode) + inode = igrab(root->ino_cache_inode); + spin_unlock(&root->ino_cache_lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(root, path, 0); + if (IS_ERR(inode)) + return inode; + + spin_lock(&root->ino_cache_lock); + if (!btrfs_fs_closing(root->fs_info)) + root->ino_cache_inode = igrab(inode); + spin_unlock(&root->ino_cache_lock); + + return inode; +} + +int create_free_ino_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + return __create_free_space_inode(root, trans, path, + BTRFS_FREE_INO_OBJECTID, 0); +} + +int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + int ret = 0; + u64 root_gen = btrfs_root_generation(&root->root_item); + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + /* + * If we're unmounting then just return, since this does a search on the + * normal root and not the commit root and we could deadlock. + */ + if (btrfs_fs_closing(fs_info)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return 0; + + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode)) + goto out; + + if (root_gen != BTRFS_I(inode)->generation) + goto out_put; + + ret = __load_free_space_cache(root, inode, ctl, path, 0); + + if (ret < 0) + btrfs_err(fs_info, + "failed to load free ino cache for root %llu", + root->root_key.objectid); +out_put: + iput(inode); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_write_out_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + int ret; + struct btrfs_io_ctl io_ctl; + bool release_metadata = true; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + memset(&io_ctl, 0, sizeof(io_ctl)); + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, + trans, path, 0); + if (!ret) { + /* + * At this point writepages() didn't error out, so our metadata + * reservation is released when the writeback finishes, at + * inode.c:btrfs_finish_ordered_io(), regardless of it finishing + * with or without an error. + */ + release_metadata = false; + ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); + } + + if (ret) { + if (release_metadata) + btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free ino cache for root %llu", + root->root_key.objectid); +#endif + } + + return ret; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +/* + * Use this if you need to make a bitmap or extent entry specifically, it + * doesn't do any of the merging that add_free_space does, this acts a lot like + * how the free space cache loading stuff works, so you can get really weird + * configurations. + */ +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info = NULL, *bitmap_info; + void *map = NULL; + u64 bytes_added; + int ret; + +again: + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + } + + if (!bitmap) { + spin_lock(&ctl->tree_lock); + info->offset = offset; + info->bytes = bytes; + ret = link_free_space(ctl, info); + spin_unlock(&ctl->tree_lock); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); + return ret; + } + + if (!map) { + map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; + } + } + + spin_lock(&ctl->tree_lock); + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + info->bitmap = map; + map = NULL; + add_new_bitmap(ctl, info, offset); + bitmap_info = info; + info = NULL; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + spin_unlock(&ctl->tree_lock); + + if (bytes) + goto again; + + if (info) + kmem_cache_free(btrfs_free_space_cachep, info); + if (map) + kfree(map); + return 0; +} + +/* + * Checks to see if the given range is in the free space cache. This is really + * just used to check the absence of space, so if there is free space in the + * range at all we will return 1. + */ +int test_check_exists(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info; + int ret = 0; + + spin_lock(&ctl->tree_lock); + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) + goto out; + } + +have_info: + if (info->bitmap) { + u64 bit_off, bit_bytes; + struct rb_node *n; + struct btrfs_free_space *tmp; + + bit_off = offset; + bit_bytes = ctl->unit; + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + if (!ret) { + if (bit_off == offset) { + ret = 1; + goto out; + } else if (bit_off > offset && + offset + bytes > bit_off) { + ret = 1; + goto out; + } + } + + n = rb_prev(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (tmp->offset + tmp->bytes < offset) + break; + if (offset + bytes < tmp->offset) { + n = rb_prev(&info->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + n = rb_next(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (offset + bytes < tmp->offset) + break; + if (tmp->offset + tmp->bytes < offset) { + n = rb_next(&info->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + ret = 0; + goto out; + } + + if (info->offset == offset) { + ret = 1; + goto out; + } + + if (offset > info->offset && offset < info->offset + info->bytes) + ret = 1; +out: + spin_unlock(&ctl->tree_lock); + return ret; +} +#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ diff --git a/kernel/fs/btrfs/free-space-cache.h b/kernel/fs/btrfs/free-space-cache.h new file mode 100644 index 000000000..a16a029ad --- /dev/null +++ b/kernel/fs/btrfs/free-space-cache.h @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +struct btrfs_free_space { + struct rb_node offset_index; + u64 offset; + u64 bytes; + unsigned long *bitmap; + struct list_head list; +}; + +struct btrfs_free_space_ctl { + spinlock_t tree_lock; + struct rb_root free_space_offset; + u64 free_space; + int extents_thresh; + int free_extents; + int total_bitmaps; + int unit; + u64 start; + struct btrfs_free_space_op *op; + void *private; + struct mutex cache_writeout_mutex; + struct list_head trimming_ranges; +}; + +struct btrfs_free_space_op { + void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl); + bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); +}; + +struct btrfs_io_ctl; + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path); +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct inode *inode); +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset); +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +struct inode *lookup_free_ino_inode(struct btrfs_root *root, + struct btrfs_path *path); +int create_free_ino_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path); +int load_free_ino_cache(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +int btrfs_write_out_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode); + +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); +int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, + u64 bytenr, u64 size); +static inline int +btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size) +{ + return __btrfs_add_free_space(block_group->free_space_ctl, + bytenr, size); +} +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size, + u64 *max_extent_size); +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +int btrfs_find_space_cluster(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start, u64 *max_extent_size); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen); + +/* Support functions for runnint our sanity tests */ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap); +int test_check_exists(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes); +#endif + +#endif diff --git a/kernel/fs/btrfs/hash.c b/kernel/fs/btrfs/hash.c new file mode 100644 index 000000000..aae520b2a --- /dev/null +++ b/kernel/fs/btrfs/hash.c @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include "hash.h" + +static struct crypto_shash *tfm; + +int __init btrfs_hash_init(void) +{ + tfm = crypto_alloc_shash("crc32c", 0, 0); + + return PTR_ERR_OR_ZERO(tfm); +} + +void btrfs_hash_exit(void) +{ + crypto_free_shash(tfm); +} + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) +{ + SHASH_DESC_ON_STACK(shash, tfm); + u32 *ctx = (u32 *)shash_desc_ctx(shash); + int err; + + shash->tfm = tfm; + shash->flags = 0; + *ctx = crc; + + err = crypto_shash_update(shash, address, length); + BUG_ON(err); + + return *ctx; +} diff --git a/kernel/fs/btrfs/hash.h b/kernel/fs/btrfs/hash.h new file mode 100644 index 000000000..118a2316e --- /dev/null +++ b/kernel/fs/btrfs/hash.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __HASH__ +#define __HASH__ + +int __init btrfs_hash_init(void); + +void btrfs_hash_exit(void); + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); + +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return btrfs_crc32c((u32)~1, name, len); +} + +/* + * Figure the key offset of an extended inode ref + */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, + int len) +{ + return (u64) btrfs_crc32c(parent_objectid, name, len); +} + +#endif diff --git a/kernel/fs/btrfs/inode-item.c b/kernel/fs/btrfs/inode-item.c new file mode 100644 index 000000000..265e03c73 --- /dev/null +++ b/kernel/fs/btrfs/inode-item.c @@ -0,0 +1,440 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" +#include "print-tree.h" + +static int find_name_in_backref(struct btrfs_path *path, const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name_len) + continue; + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { + *ref_ret = ref; + return 1; + } + } + return 0; +} + +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, + const char *name, int name_len, + struct btrfs_inode_extref **extref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_extref *extref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + + /* + * Search all extended backrefs in this item. We're only + * looking through any collisions so most of the time this is + * just going to compare against one buffer. If all is well, + * we'll return success and the inode ref object. + */ + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_ptr = (unsigned long)(&extref->name); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (ref_name_len == name_len && + btrfs_inode_extref_parent(leaf, extref) == ref_objectid && + (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) { + if (extref_ret) + *extref_ret = extref; + return 1; + } + + cur_offset += ref_name_len + sizeof(*extref); + } + return 0; +} + +/* Returns NULL if no extref found */ +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow) +{ + int ret; + struct btrfs_key key; + struct btrfs_inode_extref *extref; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) + return NULL; + return extref; +} + +static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, + u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + int ret; + int del_len = name_len + sizeof(*extref); + unsigned long ptr; + unsigned long item_start; + u32 item_size; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + /* + * Sanity check - did we find the right item for this name? + * This should always succeed so error here will make the FS + * readonly. + */ + if (!btrfs_find_name_in_ext_backref(path, ref_objectid, + name, name_len, &extref)) { + btrfs_std_error(root->fs_info, -ENOENT); + ret = -EROFS; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (index) + *index = btrfs_inode_extref_index(leaf, extref); + + if (del_len == item_size) { + /* + * Common case only one ref in the item, remove the + * whole item. + */ + ret = btrfs_del_item(trans, root, path); + goto out; + } + + ptr = (unsigned long)extref; + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + + memmove_extent_buffer(leaf, ptr, ptr + del_len, + item_size - (ptr + del_len - item_start)); + + btrfs_truncate_item(root, path, item_size - del_len, 1); + +out: + btrfs_free_path(path); + + return ret; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int search_ext_refs = 0; + int del_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + key.type = BTRFS_INODE_REF_KEY; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + search_ext_refs = 1; + goto out; + } else if (ret < 0) { + goto out; + } + if (!find_name_in_backref(path, name, name_len, &ref)) { + ret = -ENOENT; + search_ext_refs = 1; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name_len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + btrfs_truncate_item(root, path, item_size - sub_item_len, 1); +out: + btrfs_free_path(path); + + if (search_ext_refs) { + /* + * No refs were found, or we could not find the + * name in our ref array. Find and remove the extended + * inode ref then. + */ + return btrfs_del_inode_extref(trans, root, name, name_len, + inode_objectid, ref_objectid, index); + } + + return ret; +} + +/* + * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * + * The caller must have checked against BTRFS_LINK_MAX already. + */ +static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_inode_extref *extref; + int ret; + int ins_len = name_len + sizeof(*extref); + unsigned long ptr; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_item *item; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, name_len, NULL)) + goto out; + + btrfs_extend_item(root, path, ins_len); + ret = 0; + } + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_nr(path->slots[0]); + ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); + ptr += btrfs_item_size(leaf, item) - ins_len; + extref = (struct btrfs_inode_extref *)ptr; + + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_index(path->nodes[0], extref, index); + btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); + + ptr = (unsigned long)&extref->name; + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + key.type = BTRFS_INODE_REF_KEY; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + path->skip_release_on_error = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + + if (find_name_in_backref(path, name, name_len, &ref)) + goto out; + + old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + btrfs_extend_item(root, path, ins_len); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + if (ret == -EOVERFLOW) { + if (find_name_in_backref(path, name, name_len, &ref)) + ret = -EEXIST; + else + ret = -EMLINK; + } + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + + if (ret == -EMLINK) { + struct btrfs_super_block *disk_super = root->fs_info->super_copy; + /* We ran out of space in the ref array. Need to + * add an extended ref. */ + if (btrfs_super_incompat_flags(disk_super) + & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + ret = btrfs_insert_inode_extref(trans, root, name, + name_len, + inode_objectid, + ref_objectid, index); + } + + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + found_key.type == location->type) { + path->slots[0]--; + return 0; + } + } + return ret; +} diff --git a/kernel/fs/btrfs/inode-map.c b/kernel/fs/btrfs/inode-map.c new file mode 100644 index 000000000..f6a596d5a --- /dev/null +++ b/kernel/fs/btrfs/inode-map.c @@ -0,0 +1,568 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include "ctree.h" +#include "disk-io.h" +#include "free-space-cache.h" +#include "inode-map.h" +#include "transaction.h" + +static int caching_kthread(void *data) +{ + struct btrfs_root *root = data; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u64 last = (u64)-1; + int slot; + int ret; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Since the commit root is read-only, we can safely skip locking. */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 2; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; +again: + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->commit_root_sem); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + if (btrfs_fs_closing(fs_info)) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + if (need_resched() || + btrfs_transaction_in_commit(fs_info)) { + leaf = path->nodes[0]; + + if (WARN_ON(btrfs_header_nritems(leaf) == 0)) + break; + + /* + * Save the key so we can advances forward + * in the next search. + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); + schedule_timeout(1); + goto again; + } else + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_INODE_ITEM_KEY) + goto next; + + if (key.objectid >= root->highest_objectid) + break; + + if (last != (u64)-1 && last + 1 != key.objectid) { + __btrfs_add_free_space(ctl, last + 1, + key.objectid - last - 1); + wake_up(&root->ino_cache_wait); + } + + last = key.objectid; +next: + path->slots[0]++; + } + + if (last < root->highest_objectid - 1) { + __btrfs_add_free_space(ctl, last + 1, + root->highest_objectid - last - 1); + } + + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_FINISHED; + spin_unlock(&root->ino_cache_lock); + + root->ino_cache_progress = (u64)-1; + btrfs_unpin_free_ino(root); +out: + wake_up(&root->ino_cache_wait); + up_read(&fs_info->commit_root_sem); + + btrfs_free_path(path); + + return ret; +} + +static void start_caching(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct task_struct *tsk; + int ret; + u64 objectid; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state != BTRFS_CACHE_NO) { + spin_unlock(&root->ino_cache_lock); + return; + } + + root->ino_cache_state = BTRFS_CACHE_STARTED; + spin_unlock(&root->ino_cache_lock); + + ret = load_free_ino_cache(root->fs_info, root); + if (ret == 1) { + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_FINISHED; + spin_unlock(&root->ino_cache_lock); + return; + } + + /* + * It can be quite time-consuming to fill the cache by searching + * through the extent tree, and this can keep ino allocation path + * waiting. Therefore at start we quickly find out the highest + * inode number and we know we can use inode numbers which fall in + * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. + */ + ret = btrfs_find_free_objectid(root, &objectid); + if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { + __btrfs_add_free_space(ctl, objectid, + BTRFS_LAST_FREE_OBJECTID - objectid + 1); + } + + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", + root->root_key.objectid); + if (IS_ERR(tsk)) { + btrfs_warn(root->fs_info, "failed to start inode caching task"); + btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE, + "disabling inode map caching"); + } +} + +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) +{ + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return btrfs_find_free_objectid(root, objectid); + +again: + *objectid = btrfs_find_ino_for_alloc(root); + + if (*objectid != 0) + return 0; + + start_caching(root); + + wait_event(root->ino_cache_wait, + root->ino_cache_state == BTRFS_CACHE_FINISHED || + root->free_ino_ctl->free_space > 0); + + if (root->ino_cache_state == BTRFS_CACHE_FINISHED && + root->free_ino_ctl->free_space == 0) + return -ENOSPC; + else + goto again; +} + +void btrfs_return_ino(struct btrfs_root *root, u64 objectid) +{ + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; +again: + if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { + __btrfs_add_free_space(pinned, objectid, 1); + } else { + down_write(&root->fs_info->commit_root_sem); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { + spin_unlock(&root->ino_cache_lock); + up_write(&root->fs_info->commit_root_sem); + goto again; + } + spin_unlock(&root->ino_cache_lock); + + start_caching(root); + + __btrfs_add_free_space(pinned, objectid, 1); + + up_write(&root->fs_info->commit_root_sem); + } +} + +/* + * When a transaction is committed, we'll move those inode numbers which are + * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and + * others will just be dropped, because the commit root we were searching has + * changed. + * + * Must be called with root->fs_info->commit_root_sem held + */ +void btrfs_unpin_free_ino(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + struct btrfs_free_space *info; + struct rb_node *n; + u64 count; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + + while (1) { + n = rb_first(rbroot); + if (!n) + break; + + info = rb_entry(n, struct btrfs_free_space, offset_index); + BUG_ON(info->bitmap); /* Logic error */ + + if (info->offset > root->ino_cache_progress) + goto free; + else if (info->offset + info->bytes > root->ino_cache_progress) + count = root->ino_cache_progress - info->offset + 1; + else + count = info->bytes; + + __btrfs_add_free_space(ctl, info->offset, count); +free: + rb_erase(&info->offset_index, rbroot); + kfree(info); + } +} + +#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) + +/* + * The goal is to keep the memory used by the free_ino tree won't + * exceed the memory if we use bitmaps only. + */ +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int max_ino; + int max_bitmaps; + + n = rb_last(&ctl->free_space_offset); + if (!n) { + ctl->extents_thresh = INIT_THRESHOLD; + return; + } + info = rb_entry(n, struct btrfs_free_space, offset_index); + + /* + * Find the maximum inode number in the filesystem. Note we + * ignore the fact that this can be a bitmap, because we are + * not doing precise calculation. + */ + max_ino = info->bytes - 1; + + max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; + if (max_bitmaps <= ctl->total_bitmaps) { + ctl->extents_thresh = 0; + return; + } + + ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * + PAGE_CACHE_SIZE / sizeof(*info); +} + +/* + * We don't fall back to bitmap, if we are below the extents threshold + * or this chunk of inode numbers is a big one. + */ +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + if (ctl->free_extents < ctl->extents_thresh || + info->bytes > INODES_PER_BITMAP / 10) + return false; + + return true; +} + +static struct btrfs_free_space_op free_ino_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) +{ +} + +static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + /* + * We always use extents for two reasons: + * + * - The pinned tree is only used during the process of caching + * work. + * - Make code simpler. See btrfs_unpin_free_ino(). + */ + return false; +} + +static struct btrfs_free_space_op pinned_free_ino_op = { + .recalc_thresholds = pinned_recalc_thresholds, + .use_bitmap = pinned_use_bitmap, +}; + +void btrfs_init_free_ino_ctl(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = 1; + ctl->start = 0; + ctl->private = NULL; + ctl->op = &free_ino_op; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); + + /* + * Initially we allow to use 16K of ram to cache chunks of + * inode numbers before we resort to bitmaps. This is somewhat + * arbitrary, but it will be adjusted in runtime. + */ + ctl->extents_thresh = INIT_THRESHOLD; + + spin_lock_init(&pinned->tree_lock); + pinned->unit = 1; + pinned->start = 0; + pinned->private = NULL; + pinned->extents_thresh = 0; + pinned->op = &pinned_free_ino_op; +} + +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; + u64 alloc_hint = 0; + int ret; + int prealloc; + bool retry = false; + + /* only fs tree and subvol/snap needs ino cache */ + if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && + (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || + root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) + return 0; + + /* Don't save inode cache if we are deleting this root */ + if (btrfs_root_refs(&root->root_item) == 0) + return 0; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 4 items for inode item update (in the worst case) + * 1 items for slack space if we need do truncation + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + trans->bytes_reserved, + BTRFS_RESERVE_NO_FLUSH); + if (ret) + goto out; + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + trans->transid, trans->bytes_reserved, 1); +again: + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { + ret = PTR_ERR(inode); + goto out_release; + } + + if (IS_ERR(inode)) { + BUG_ON(retry); /* Logic error */ + retry = true; + + ret = create_free_ino_inode(root, trans, path); + if (ret) + goto out_release; + goto again; + } + + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); + if (ret) { + if (ret != -ENOSPC) + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } + } + + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state != BTRFS_CACHE_FINISHED) { + ret = -1; + spin_unlock(&root->ino_cache_lock); + goto out_put; + } + spin_unlock(&root->ino_cache_lock); + + spin_lock(&ctl->tree_lock); + prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; + prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE); + prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE; + spin_unlock(&ctl->tree_lock); + + /* Just to make sure we have enough space */ + prealloc += 8 * PAGE_CACHE_SIZE; + + ret = btrfs_delalloc_reserve_space(inode, prealloc); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, + prealloc, prealloc, &alloc_hint); + if (ret) { + btrfs_delalloc_release_space(inode, prealloc); + goto out_put; + } + btrfs_free_reserved_data_space(inode, prealloc); + + ret = btrfs_write_out_ino_cache(root, trans, path, inode); +out_put: + iput(inode); +out_release: + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + trans->transid, trans->bytes_reserved, 0); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); +out: + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; + + btrfs_free_path(path); + return ret; +} + +static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); /* Corruption */ + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = max_t(u64, found_key.objectid, + BTRFS_FIRST_FREE_OBJECTID - 1); + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + mutex_lock(&root->objectid_mutex); + + if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + if (ret) + goto out; + } + + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + ret = -ENOSPC; + goto out; + } + + *objectid = ++root->highest_objectid; + ret = 0; +out: + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/kernel/fs/btrfs/inode-map.h b/kernel/fs/btrfs/inode-map.h new file mode 100644 index 000000000..ddb347bfe --- /dev/null +++ b/kernel/fs/btrfs/inode-map.h @@ -0,0 +1,13 @@ +#ifndef __BTRFS_INODE_MAP +#define __BTRFS_INODE_MAP + +void btrfs_init_free_ino_ctl(struct btrfs_root *root); +void btrfs_unpin_free_ino(struct btrfs_root *root); +void btrfs_return_ino(struct btrfs_root *root, u64 objectid); +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans); + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); + +#endif diff --git a/kernel/fs/btrfs/inode.c b/kernel/fs/btrfs/inode.c new file mode 100644 index 000000000..8bb013672 --- /dev/null +++ b/kernel/fs/btrfs/inode.c @@ -0,0 +1,9907 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "volumes.h" +#include "compression.h" +#include "locking.h" +#include "free-space-cache.h" +#include "inode-map.h" +#include "backref.h" +#include "hash.h" +#include "props.h" +#include "qgroup.h" + +struct btrfs_iget_args { + struct btrfs_key *location; + struct btrfs_root *root; +}; + +static const struct inode_operations btrfs_dir_inode_operations; +static const struct inode_operations btrfs_symlink_inode_operations; +static const struct inode_operations btrfs_dir_ro_inode_operations; +static const struct inode_operations btrfs_special_inode_operations; +static const struct inode_operations btrfs_file_inode_operations; +static const struct address_space_operations btrfs_aops; +static const struct address_space_operations btrfs_symlink_aops; +static const struct file_operations btrfs_dir_file_operations; +static struct extent_io_ops btrfs_extent_io_ops; + +static struct kmem_cache *btrfs_inode_cachep; +static struct kmem_cache *btrfs_delalloc_work_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_path_cachep; +struct kmem_cache *btrfs_free_space_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +static int btrfs_setsize(struct inode *inode, struct iattr *attr); +static int btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + u64 orig_block_len, u64 ram_bytes, + int type); + +static int btrfs_dirty_inode(struct inode *inode); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode) +{ + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; +} +#endif + +static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = btrfs_init_acl(trans, inode, dir); + if (!err) + err = btrfs_xattr_security_init(trans, inode, dir, qstr); + return err; +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int extent_inserted, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + int compress_type, + struct page **compressed_pages) +{ + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + unsigned long offset; + + if (compressed_size && compressed_pages) + cur_size = compressed_size; + + inode_add_bytes(inode, size); + + if (!extent_inserted) { + struct btrfs_key key; + size_t datasize; + + key.objectid = btrfs_ino(inode); + key.offset = start; + key.type = BTRFS_EXTENT_DATA_KEY; + + datasize = btrfs_file_extent_calc_inline_size(cur_size); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (ret) { + err = ret; + goto fail; + } + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (compress_type != BTRFS_COMPRESS_NONE) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap_atomic(cpage); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + compress_type); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + /* + * we're an inline extent, so nobody can + * extend the file past i_size without locking + * a page we already have locked. + * + * We must do any isize and inode updates + * before we unlock the pages. Otherwise we + * could end up racing with unlink. + */ + BTRFS_I(inode)->disk_i_size = inode->i_size; + ret = btrfs_update_inode(trans, root, inode); + + return ret; +fail: + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. + */ +static noinline int cow_file_range_inline(struct btrfs_root *root, + struct inode *inode, u64 start, + u64 end, size_t compressed_size, + int compress_type, + struct page **compressed_pages) +{ + struct btrfs_trans_handle *trans; + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = ALIGN(end, root->sectorsize); + u64 data_len = inline_len; + int ret; + struct btrfs_path *path; + int extent_inserted = 0; + u32 extent_item_size; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + actual_end > PAGE_CACHE_SIZE || + data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (compressed_size && compressed_pages) + extent_item_size = btrfs_file_extent_calc_inline_size( + compressed_size); + else + extent_item_size = btrfs_file_extent_calc_inline_size( + inline_len); + + ret = __btrfs_drop_extents(trans, root, inode, path, + start, aligned_end, NULL, + 1, 1, extent_item_size, &extent_inserted); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, path, extent_inserted, + root, inode, start, + inline_len, compressed_size, + compress_type, compressed_pages); + if (ret && ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } else if (ret == -ENOSPC) { + ret = 1; + goto out; + } + + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_delalloc_release_metadata(inode, end + 1 - start); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); +out: + btrfs_free_path(path); + btrfs_end_transaction(trans, root); + return ret; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + int compress_type; + struct list_head list; +}; + +struct async_cow { + struct inode *inode; + struct btrfs_root *root; + struct page *locked_page; + u64 start; + u64 end; + struct list_head extents; + struct btrfs_work work; +}; + +static noinline int add_async_extent(struct async_cow *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages, + int compress_type) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + BUG_ON(!async_extent); /* -ENOMEM */ + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +static inline int inode_need_compress(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* force compress */ + if (btrfs_test_opt(root, FORCE_COMPRESS)) + return 1; + /* bad compression ratios */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + return 0; + if (btrfs_test_opt(root, COMPRESS) || + BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || + BTRFS_I(inode)->force_compress) + return 1; + return 0; +} + +/* + * we create compressed extents in two phases. The first + * phase compresses a range of pages that have already been + * locked (both pages and state bits are locked). + * + * This is done inside an ordered work queue, and the compression + * is spread across many cpus. The actual IO submission is step + * two, and the ordered work queue takes care of making sure that + * happens in the same order things were put onto the queue by + * writepages and friends. + * + * If this code finds it can't get good compression, it puts an + * entry onto the work queue to write the uncompressed bytes. This + * makes sure that both compressed inodes and uncompressed inodes + * are written in the same order that the flusher thread sent them + * down. + */ +static noinline void compress_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, + struct async_cow *async_cow, + int *num_added) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 128 * 1024; + int i; + int will_compress; + int compress_type = root->fs_info->compress_type; + int redirty = 0; + + /* if this is a small write inside eof, kick off a defrag */ + if ((end - start + 1) < 16 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + btrfs_add_inode_defrag(NULL, inode); + + actual_end = min_t(u64, isize, end + 1); +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + + /* + * we don't want to send crud past the end of i_size through + * compression, that's just a waste of CPU time. So, if the + * end of the file is before the start of our current + * requested range of bytes, we bail out to the uncompressed + * cleanup code that can deal with all of this. + * + * It isn't really the fastest way to fix things, but this is a + * very uncommon corner. + */ + if (actual_end <= start) + goto cleanup_and_bail_uncompressed; + + total_compressed = actual_end - start; + + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if (total_compressed <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 128k. This is a crucial number + * because it also controls how easily we can spread reads across + * cpus for decompression. + * + * We also want to make sure the amount of IO required to do + * a random read is reasonably small, so we limit the size of + * a compressed extent to 128k. + */ + total_compressed = min(total_compressed, max_uncompressed); + num_bytes = ALIGN(end - start + 1, blocksize); + num_bytes = max(blocksize, num_bytes); + total_in = 0; + ret = 0; + + /* + * we do compression for mount -o compress and when the + * inode has not been flagged as nocompress. This flag can + * change at any time if we discover bad compression ratios. + */ + if (inode_need_compress(inode)) { + WARN_ON(pages); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) { + /* just bail out to the uncompressed code */ + goto cont; + } + + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; + + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. + */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr); + } + will_compress = 1; + } + } +cont: + if (start == 0) { + /* lets try to make an inline extent */ + if (ret || total_in < (actual_end - start)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. + */ + ret = cow_file_range_inline(root, inode, start, end, + 0, 0, NULL); + } else { + /* try making a compressed inline extent */ + ret = cow_file_range_inline(root, inode, start, end, + total_compressed, + compress_type, pages); + } + if (ret <= 0) { + unsigned long clear_flags = EXTENT_DELALLOC | + EXTENT_DEFRAG; + unsigned long page_error_op; + + clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; + page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; + + /* + * inline extent creation worked or returned error, + * we don't need to create any more async work items. + * Unlock and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, start, end, NULL, + clear_flags, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + page_error_op | + PAGE_END_WRITEBACK); + goto free_pages_out; + } + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = ALIGN(total_compressed, blocksize); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = ALIGN(total_in, PAGE_CACHE_SIZE); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + if (!btrfs_test_opt(root, FORCE_COMPRESS) && + !(BTRFS_I(inode)->force_compress)) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } + } + if (will_compress) { + *num_added += 1; + + /* the async work queues will take care of doing actual + * allocation on disk for these compressed pages, + * and will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret, + compress_type); + + if (start + num_bytes < end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + } else { +cleanup_and_bail_uncompressed: + /* + * No compression, but we still need to write the pages in + * the file we've been given so far. redirty the locked + * page if it corresponds to our extent and set things up + * for the async work queue to run cow_file_range to do + * the normal delalloc dance + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) { + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + } + if (redirty) + extent_range_redirty_for_io(inode, start, end); + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); + *num_added += 1; + } + + return; + +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); +} + +static void free_async_extent_pages(struct async_extent *async_extent) +{ + int i; + + if (!async_extent->pages) + return; + + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; +} + +/* + * phase two of compressed writeback. This is the ordered portion + * of the code, which only gets called in the order the work was + * queued. We walk all the async extents created by compress_file_range + * and send them down to the disk. + */ +static noinline void submit_compressed_extents(struct inode *inode, + struct async_cow *async_cow) +{ + struct async_extent *async_extent; + u64 alloc_hint = 0; + struct btrfs_key ins; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree; + int ret = 0; + +again: + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + + io_tree = &BTRFS_I(inode)->io_tree; + +retry: + /* did the compression code fall back to uncompressed IO? */ + if (!async_extent->pages) { + int page_started = 0; + unsigned long nr_written = 0; + + lock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1); + + /* allocate blocks */ + ret = cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); + + /* JDM XXX */ + + /* + * if page_started, cow_file_range inserted an + * inline extent and took care of all the unlocking + * and IO for us. Otherwise, we need to submit + * all those pages down to the drive. + */ + if (!page_started && !ret) + extent_write_locked_range(io_tree, + inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + btrfs_get_extent, + WB_SYNC_ALL); + else if (ret) + unlock_page(async_cow->locked_page); + kfree(async_extent); + cond_resched(); + continue; + } + + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1); + + ret = btrfs_reserve_extent(root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, &ins, 1, 1); + if (ret) { + free_async_extent_pages(async_extent); + + if (ret == -ENOSPC) { + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1); + + /* + * we need to redirty the pages if we decide to + * fallback to uncompressed IO, otherwise we + * will not submit these pages down to lower + * layers. + */ + extent_range_redirty_for_io(inode, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1); + + goto retry; + } + goto out_free; + } + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto out_free_reserve; + } + em->start = async_extent->start; + em->len = async_extent->ram_size; + em->orig_start = em->start; + em->mod_start = em->start; + em->mod_len = em->len; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->orig_block_len = ins.offset; + em->ram_bytes = async_extent->ram_size; + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->generation = -1; + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + } + + if (ret) + goto out_free_reserve; + + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); + if (ret) { + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + goto out_free_reserve; + } + + /* + * clear dirty, set writeback and unlock the pages. + */ + extent_clear_unlock_delalloc(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK); + ret = btrfs_submit_compressed_write(inode, + async_extent->start, + async_extent->ram_size, + ins.objectid, + ins.offset, async_extent->pages, + async_extent->nr_pages); + if (ret) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct page *p = async_extent->pages[0]; + const u64 start = async_extent->start; + const u64 end = start + async_extent->ram_size - 1; + + p->mapping = inode->i_mapping; + tree->ops->writepage_end_io_hook(p, start, end, + NULL, 0); + p->mapping = NULL; + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, + PAGE_END_WRITEBACK | + PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + } + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + return; +out_free_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); +out_free: + extent_clear_unlock_delalloc(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | + PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + kfree(async_extent); + goto again; +} + +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. + */ +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; + u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + struct btrfs_key ins; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 0; + + if (btrfs_is_free_space_inode(inode)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out_unlock; + } + + num_bytes = ALIGN(end - start + 1, blocksize); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + + /* if this is a small write inside eof, kick off defrag */ + if (num_bytes < 64 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + btrfs_add_inode_defrag(NULL, inode); + + if (start == 0) { + /* lets try to make an inline extent */ + ret = cow_file_range_inline(root, inode, start, end, 0, 0, + NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, start, end, NULL, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); + + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; + goto out; + } else if (ret < 0) { + goto out_unlock; + } + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(root->fs_info->super_copy)); + + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { + unsigned long op; + + cur_alloc_size = disk_num_bytes; + ret = btrfs_reserve_extent(root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + &ins, 1, 1); + if (ret < 0) + goto out_unlock; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto out_reserve; + } + em->start = start; + em->orig_start = em->start; + ram_size = ins.offset; + em->len = ins.offset; + em->mod_start = em->start; + em->mod_len = em->len; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->orig_block_len = ins.offset; + em->ram_bytes = ram_size; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->generation = -1; + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, + start + ram_size - 1, 0); + } + if (ret) + goto out_reserve; + + cur_alloc_size = ins.offset; + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); + if (ret) + goto out_drop_extent_cache; + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + if (ret) + goto out_drop_extent_cache; + } + + if (disk_num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + * + * Do set the Private2 bit so we know this page was properly + * setup for writepage + */ + op = unlock ? PAGE_UNLOCK : 0; + op |= PAGE_SET_PRIVATE2; + + extent_clear_unlock_delalloc(inode, start, + start + ram_size - 1, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC, + op); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +out: + return ret; + +out_drop_extent_cache: + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); +out_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); +out_unlock: + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DELALLOC | EXTENT_DEFRAG, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + goto out; +} + +/* + * work queue call back to started compression on a file and pages + */ +static noinline void async_cow_start(struct btrfs_work *work) +{ + struct async_cow *async_cow; + int num_added = 0; + async_cow = container_of(work, struct async_cow, work); + + compress_file_range(async_cow->inode, async_cow->locked_page, + async_cow->start, async_cow->end, async_cow, + &num_added); + if (num_added == 0) { + btrfs_add_delayed_iput(async_cow->inode); + async_cow->inode = NULL; + } +} + +/* + * work queue call back to submit previously compressed pages + */ +static noinline void async_cow_submit(struct btrfs_work *work) +{ + struct async_cow *async_cow; + struct btrfs_root *root; + unsigned long nr_pages; + + async_cow = container_of(work, struct async_cow, work); + + root = async_cow->root; + nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < + 5 * 1024 * 1024 && + waitqueue_active(&root->fs_info->async_submit_wait)) + wake_up(&root->fs_info->async_submit_wait); + + if (async_cow->inode) + submit_compressed_extents(async_cow->inode, async_cow); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_cow *async_cow; + async_cow = container_of(work, struct async_cow, work); + if (async_cow->inode) + btrfs_add_delayed_iput(async_cow->inode); + kfree(async_cow); +} + +static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + struct async_cow *async_cow; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr_pages; + u64 cur_end; + int limit = 10 * 1024 * 1024; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + BUG_ON(!async_cow); /* -ENOMEM */ + async_cow->inode = igrab(inode); + async_cow->root = root; + async_cow->locked_page = locked_page; + async_cow->start = start; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && + !btrfs_test_opt(root, FORCE_COMPRESS)) + cur_end = end; + else + cur_end = min(end, start + 512 * 1024 - 1); + + async_cow->end = cur_end; + INIT_LIST_HEAD(&async_cow->extents); + + btrfs_init_work(&async_cow->work, + btrfs_delalloc_helper, + async_cow_start, async_cow_submit, + async_cow_free); + + nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + + btrfs_queue_work(root->fs_info->delalloc_workers, + &async_cow->work); + + if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) < + limit)); + } + + while (atomic_read(&root->fs_info->async_submit_draining) && + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) == + 0)); + } + + *nr_written += nr_pages; + start = cur_end + 1; + } + *page_started = 1; + return 0; +} + +static noinline int csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, + bytenr + num_bytes - 1, &list, 0); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static noinline int run_delalloc_nocow(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, int force, + unsigned long *nr_written) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key found_key; + u64 cow_start; + u64 cur_offset; + u64 extent_end; + u64 extent_offset; + u64 disk_bytenr; + u64 num_bytes; + u64 disk_num_bytes; + u64 ram_bytes; + int extent_type; + int ret, err; + int type; + int nocow; + int check_prev = 1; + bool nolock; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) { + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); + return -ENOMEM; + } + + nolock = btrfs_is_free_space_inode(inode); + + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_free_path(path); + return PTR_ERR(trans); + } + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + cow_start = (u64)-1; + cur_offset = start; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, ino, + cur_offset, 0); + if (ret < 0) + goto error; + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto error; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + nocow = 0; + disk_bytenr = 0; + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid > ino || + found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + extent_type = 0; + goto out_check; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (disk_bytenr == 0) + goto out_check; + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out_check; + if (extent_type == BTRFS_FILE_EXTENT_REG && !force) + goto out_check; + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out_check; + if (btrfs_cross_ref_exist(trans, root, ino, + found_key.offset - + extent_offset, disk_bytenr)) + goto out_check; + disk_bytenr += extent_offset; + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * if there are pending snapshots for this root, + * we fall into common COW way. + */ + if (!nolock) { + err = btrfs_start_write_no_snapshoting(root); + if (!err) + goto out_check; + } + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. + */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; + nocow = 1; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); + extent_end = ALIGN(extent_end, root->sectorsize); + } else { + BUG_ON(1); + } +out_check: + if (extent_end <= start) { + path->slots[0]++; + if (!nolock && nocow) + btrfs_end_write_no_snapshoting(root); + goto next_slot; + } + if (!nocow) { + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + path->slots[0]++; + goto next_slot; + } + + btrfs_release_path(path); + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, + cow_start, found_key.offset - 1, + page_started, nr_written, 1); + if (ret) { + if (!nolock && nocow) + btrfs_end_write_no_snapshoting(root); + goto error; + } + cow_start = (u64)-1; + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + struct extent_map *em; + struct extent_map_tree *em_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(); + BUG_ON(!em); /* -ENOMEM */ + em->start = cur_offset; + em->orig_start = found_key.offset - extent_offset; + em->len = num_bytes; + em->block_len = num_bytes; + em->block_start = disk_bytenr; + em->orig_block_len = disk_num_bytes; + em->ram_bytes = ram_bytes; + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->mod_start = em->start; + em->mod_len = em->len; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_FILLING, &em->flags); + em->generation = -1; + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + type = BTRFS_ORDERED_PREALLOC; + } else { + type = BTRFS_ORDERED_NOCOW; + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, + num_bytes, num_bytes, type); + BUG_ON(ret); /* -ENOMEM */ + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, cur_offset, + num_bytes); + if (ret) { + if (!nolock && nocow) + btrfs_end_write_no_snapshoting(root); + goto error; + } + } + + extent_clear_unlock_delalloc(inode, cur_offset, + cur_offset + num_bytes - 1, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC, PAGE_UNLOCK | + PAGE_SET_PRIVATE2); + if (!nolock && nocow) + btrfs_end_write_no_snapshoting(root); + cur_offset = extent_end; + if (cur_offset > end) + break; + } + btrfs_release_path(path); + + if (cur_offset <= end && cow_start == (u64)-1) { + cow_start = cur_offset; + cur_offset = end; + } + + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); + if (ret) + goto error; + } + +error: + err = btrfs_end_transaction(trans, root); + if (!ret) + ret = err; + + if (ret && cur_offset < end) + extent_clear_unlock_delalloc(inode, cur_offset, end, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_free_path(path); + return ret; +} + +static inline int need_force_cow(struct inode *inode, u64 start, u64 end) +{ + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) + return 0; + + /* + * @defrag_bytes is a hint value, no spinlock held here, + * if is not zero, it means the file is defragging. + * Force cow if given extent needs to be defragged. + */ + if (BTRFS_I(inode)->defrag_bytes && + test_range_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_DEFRAG, 0, NULL)) + return 1; + + return 0; +} + +/* + * extent_io.c call back to do delayed allocation processing + */ +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + int force_cow = need_force_cow(inode, start, end); + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 1, nr_written); + } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 0, nr_written); + } else if (!inode_need_compress(inode)) { + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + } else { + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags); + ret = cow_file_range_async(inode, locked_page, start, end, + page_started, nr_written); + } + return ret; +} + +static void btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) +{ + u64 size; + + /* not delalloc, ignore it */ + if (!(orig->state & EXTENT_DELALLOC)) + return; + + size = orig->end - orig->start + 1; + if (size > BTRFS_MAX_EXTENT_SIZE) { + u64 num_extents; + u64 new_size; + + /* + * See the explanation in btrfs_merge_extent_hook, the same + * applies here, just in reverse. + */ + new_size = orig->end - split + 1; + num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + new_size = split - orig->start; + num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) >= num_extents) + return; + } + + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); +} + +/* + * extent_io.c merge_extent_hook, used to track merged delayed allocation + * extents so we can keep track of new extents that are just merged onto old + * extents, such as when we are doing sequential writes, so we can properly + * account for the metadata space we'll need. + */ +static void btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) +{ + u64 new_size, old_size; + u64 num_extents; + + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return; + + if (new->start > other->start) + new_size = new->end - other->start + 1; + else + new_size = other->end - new->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); + return; + } + + /* + * We have to add up either side to figure out how many extents were + * accounted for before we merged into one big extent. If the number of + * extents we accounted for is <= the amount we need for the new range + * then we can return, otherwise drop. Think of it like this + * + * [ 4k][MAX_SIZE] + * + * So we've grown the extent by a MAX_SIZE extent, this would mean we + * need 2 outstanding extents, on one side we have 1 and the other side + * we have 1 so they are == and we can return. But in this case + * + * [MAX_SIZE+4k][MAX_SIZE+4k] + * + * Each range on their own accounts for 2 extents, but merged together + * they are only 3 extents worth of accounting, so we need to drop in + * this case. + */ + old_size = other->end - other->start + 1; + num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + old_size = new->end - new->start + 1; + num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) >= num_extents) + return; + + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); +} + +static void btrfs_add_delalloc_inodes(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(!list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, + &root->fs_info->delalloc_roots); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes--; + if (!root->nr_delalloc_inodes) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(list_empty(&root->delalloc_root)); + list_del_init(&root->delalloc_root); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. + */ +static void btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, unsigned *bits) +{ + + if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + WARN_ON(1); + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; + bool do_list = !btrfs_is_free_space_inode(inode); + + if (*bits & EXTENT_FIRST_DELALLOC) { + *bits &= ~EXTENT_FIRST_DELALLOC; + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } + + /* For sanity tests */ + if (btrfs_test_is_dummy_root(root)) + return; + + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->delalloc_bytes += len; + if (*bits & EXTENT_DEFRAG) + BTRFS_I(inode)->defrag_bytes += len; + if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); + spin_unlock(&BTRFS_I(inode)->lock); + } +} + +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +static void btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, + unsigned *bits) +{ + u64 len = state->end + 1 - state->start; + u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1, + BTRFS_MAX_EXTENT_SIZE); + + spin_lock(&BTRFS_I(inode)->lock); + if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) + BTRFS_I(inode)->defrag_bytes -= len; + spin_unlock(&BTRFS_I(inode)->lock); + + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + bool do_list = !btrfs_is_free_space_inode(inode); + + if (*bits & EXTENT_FIRST_DELALLOC) { + *bits &= ~EXTENT_FIRST_DELALLOC; + } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents -= num_extents; + spin_unlock(&BTRFS_I(inode)->lock); + } + + /* + * We don't reserve metadata space for space cache inodes so we + * don't need to call dellalloc_release_metadata if there is an + * error. + */ + if (*bits & EXTENT_DO_ACCOUNTING && + root != root->fs_info->tree_root) + btrfs_delalloc_release_metadata(inode, len); + + /* For sanity tests. */ + if (btrfs_test_is_dummy_root(root)) + return; + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && do_list && !(state->state & EXTENT_NORESERVE)) + btrfs_free_reserved_data_space(inode, len); + + __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->delalloc_bytes -= len; + if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && + test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) + btrfs_del_delalloc_inode(root, inode); + spin_unlock(&BTRFS_I(inode)->lock); + } +} + +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + + if (bio_flags & EXTENT_BIO_COMPRESSED) + return 0; + + length = bio->bi_iter.bi_size; + map_length = length; + ret = btrfs_map_block(root->fs_info, rw, logical, + &map_length, NULL, 0); + /* Will always return 0 with map_multi == NULL */ + BUG_ON(ret < 0); + if (map_length < length + size) + return 1; + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + BUG_ON(ret); /* -ENOMEM */ + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; +} + +/* + * extent_io.c submission hook. This does the right thing for csum calculation + * on write, or reading the csums from the tree before a read + */ +static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + int skip_sum; + int metadata = 0; + int async = !atomic_read(&BTRFS_I(inode)->sync_writers); + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + if (btrfs_is_free_space_inode(inode)) + metadata = 2; + + if (!(rw & REQ_WRITE)) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); + if (ret) + goto out; + + if (bio_flags & EXTENT_BIO_COMPRESSED) { + ret = btrfs_submit_compressed_read(inode, bio, + mirror_num, + bio_flags); + goto out; + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); + if (ret) + goto out; + } + goto mapit; + } else if (async && !skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; + /* we're doing a write, do the async checksumming */ + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + bio_flags, bio_offset, + __btrfs_submit_bio_start, + __btrfs_submit_bio_done); + goto out; + } else if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + if (ret) + goto out; + } + +mapit: + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + +out: + if (ret < 0) + bio_endio(bio, ret); + return ret; +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. + */ +static noinline int add_pending_csums(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_offset, + struct list_head *list) +{ + struct btrfs_ordered_sum *sum; + + list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; + btrfs_csum_file_blocks(trans, + BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; + } + return 0; +} + +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); + return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, + cached_state, GFP_NOFS); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct page *page; + struct inode *inode; + u64 page_start; + u64 page_end; + int ret; + + fixup = container_of(work, struct btrfs_writepage_fixup, work); + page = fixup->page; +again: + lock_page(page); + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + ClearPageChecked(page); + goto out_page; + } + + inode = page->mapping->host; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + &cached_state); + + /* already ordered? We're done */ + if (PagePrivate2(page)) + goto out; + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, + page_end, &cached_state, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) { + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + ClearPageChecked(page); + goto out; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); + ClearPageChecked(page); + set_page_dirty(page); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, + &cached_state, GFP_NOFS); +out_page: + unlock_page(page); + page_cache_release(page); + kfree(fixup); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. + */ +static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +{ + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* this page is properly in the ordered list */ + if (TestClearPagePrivate2(page)) + return 0; + + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + SetPageChecked(page); + page_cache_get(page); + btrfs_init_work(&fixup->work, btrfs_fixup_helper, + btrfs_writepage_fixup_worker, NULL, NULL); + fixup->page = page; + btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); + return -EBUSY; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + int extent_inserted = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ + ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, + file_pos + num_bytes, NULL, 0, + 1, sizeof(*fi), &extent_inserted); + if (ret) + goto out; + + if (!extent_inserted) { + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*fi)); + if (ret) + goto out; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + inode_add_bytes(inode, num_bytes); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_alloc_reserved_file_extent(trans, root, + root->root_key.objectid, + btrfs_ino(inode), file_pos, &ins); +out: + btrfs_free_path(path); + + return ret; +} + +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { + struct rb_node node; + struct old_sa_defrag_extent *old; + u64 root_id; + u64 inum; + u64 file_pos; + u64 extent_offset; + u64 num_bytes; + u64 generation; +}; + +struct old_sa_defrag_extent { + struct list_head list; + struct new_sa_defrag_extent *new; + + u64 extent_offset; + u64 bytenr; + u64 offset; + u64 len; + int count; +}; + +struct new_sa_defrag_extent { + struct rb_root root; + struct list_head head; + struct btrfs_path *path; + struct inode *inode; + u64 file_pos; + u64 len; + u64 bytenr; + u64 disk_len; + u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, + struct sa_defrag_extent_backref *b2) +{ + if (b1->root_id < b2->root_id) + return -1; + else if (b1->root_id > b2->root_id) + return 1; + + if (b1->inum < b2->inum) + return -1; + else if (b1->inum > b2->inum) + return 1; + + if (b1->file_pos < b2->file_pos) + return -1; + else if (b1->file_pos > b2->file_pos) + return 1; + + /* + * [------------------------------] ===> (a range of space) + * |<--->| |<---->| =============> (fs/file tree A) + * |<---------------------------->| ===> (fs/file tree B) + * + * A range of space can refer to two file extents in one tree while + * refer to only one file extent in another tree. + * + * So we may process a disk offset more than one time(two extents in A) + * and locate at the same extent(one extent in B), then insert two same + * backrefs(both refer to the extent in B). + */ + return 0; +} + +static void backref_insert(struct rb_root *root, + struct sa_defrag_extent_backref *backref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sa_defrag_extent_backref *entry; + int ret; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct sa_defrag_extent_backref, node); + + ret = backref_comp(backref, entry); + if (ret < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&backref->node, parent, p); + rb_insert_color(&backref->node, root); +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, + void *ctx) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_fs_info *fs_info; + struct old_sa_defrag_extent *old = ctx; + struct new_sa_defrag_extent *new = old->new; + struct btrfs_path *path = new->path; + struct btrfs_key key; + struct btrfs_root *root; + struct sa_defrag_extent_backref *backref; + struct extent_buffer *leaf; + struct inode *inode = new->inode; + int slot; + int ret; + u64 extent_offset; + u64 num_bytes; + + if (BTRFS_I(inode)->root->root_key.objectid == root_id && + inum == btrfs_ino(inode)) + return 0; + + key.objectid = root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(inode)->root->fs_info; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + if (PTR_ERR(root) == -ENOENT) + return 0; + WARN_ON(1); + pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", + inum, offset, root_id); + return PTR_ERR(root); + } + + key.objectid = inum; + key.type = BTRFS_EXTENT_DATA_KEY; + if (offset > (u64)-1 << 32) + key.offset = 0; + else + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (WARN_ON(ret < 0)) + return ret; + ret = 0; + + while (1) { + cond_resched(); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + continue; + } + + path->slots[0]++; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid > inum) + goto out; + + if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) + continue; + + /* + * 'offset' refers to the exact key.offset, + * NOT the 'offset' field in btrfs_extent_data_ref, ie. + * (key.offset - extent_offset). + */ + if (key.offset != offset) + continue; + + extent_offset = btrfs_file_extent_offset(leaf, extent); + num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + + if (extent_offset >= old->extent_offset + old->offset + + old->len || extent_offset + num_bytes <= + old->extent_offset + old->offset) + continue; + break; + } + + backref = kmalloc(sizeof(*backref), GFP_NOFS); + if (!backref) { + ret = -ENOENT; + goto out; + } + + backref->root_id = root_id; + backref->inum = inum; + backref->file_pos = offset; + backref->num_bytes = num_bytes; + backref->extent_offset = extent_offset; + backref->generation = btrfs_file_extent_generation(leaf, extent); + backref->old = old; + backref_insert(&new->root, backref); + old->count++; +out: + btrfs_release_path(path); + WARN_ON(ret); + return ret; +} + +static noinline bool record_extent_backrefs(struct btrfs_path *path, + struct new_sa_defrag_extent *new) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; + struct old_sa_defrag_extent *old, *tmp; + int ret; + + new->path = path; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + ret = iterate_inodes_from_logical(old->bytenr + + old->extent_offset, fs_info, + path, record_one_backref, + old); + if (ret < 0 && ret != -ENOENT) + return false; + + /* no backref to be processed for this extent */ + if (!old->count) { + list_del(&old->list); + kfree(old); + } + } + + if (list_empty(&new->head)) + return false; + + return true; +} + +static int relink_is_mergable(struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + struct new_sa_defrag_extent *new) +{ + if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) + return 0; + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) + return 0; + + if (btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + return 1; +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int relink_extent_backref(struct btrfs_path *path, + struct sa_defrag_extent_backref *prev, + struct sa_defrag_extent_backref *backref) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *item; + struct btrfs_ordered_extent *ordered; + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct old_sa_defrag_extent *old = backref->old; + struct new_sa_defrag_extent *new = old->new; + struct inode *src_inode = new->inode; + struct inode *inode; + struct extent_state *cached = NULL; + int ret = 0; + u64 start; + u64 len; + u64 lock_start; + u64 lock_end; + bool merge = false; + int index; + + if (prev && prev->root_id == backref->root_id && + prev->inum == backref->inum && + prev->file_pos + prev->num_bytes == backref->file_pos) + merge = true; + + /* step 1: get root */ + key.objectid = backref->root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(src_inode)->root->fs_info; + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + if (PTR_ERR(root) == -ENOENT) + return 0; + return PTR_ERR(root); + } + + if (btrfs_root_readonly(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + + /* step 2: get inode */ + key.objectid = backref->inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + /* step 3: relink backref */ + lock_start = backref->file_pos; + lock_end = backref->file_pos + backref->num_bytes - 1; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + 0, &cached); + + ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + key.objectid = backref->inum; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = backref->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out_free_path; + } else if (ret > 0) { + ret = 0; + goto out_free_path; + } + + extent = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(path->nodes[0], extent) != + backref->generation) + goto out_free_path; + + btrfs_release_path(path); + + start = backref->file_pos; + if (backref->extent_offset < old->extent_offset + old->offset) + start += old->extent_offset + old->offset - + backref->extent_offset; + + len = min(backref->extent_offset + backref->num_bytes, + old->extent_offset + old->offset + old->len); + len -= max(backref->extent_offset, old->extent_offset + old->offset); + + ret = btrfs_drop_extents(trans, root, inode, start, + start + len, 1); + if (ret) + goto out_free_path; +again: + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + path->leave_spinning = 1; + if (merge) { + struct btrfs_file_extent_item *fi; + u64 extent_len; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out_free_path; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + + if (extent_len + found_key.offset == start && + relink_is_mergable(leaf, fi, new)) { + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len + len); + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = 1; + goto out_free_path; + } else { + merge = false; + btrfs_release_path(path); + goto again; + } + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); + btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); + btrfs_set_file_extent_num_bytes(leaf, item, len); + btrfs_set_file_extent_ram_bytes(leaf, item, new->len); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, new->compress_type); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); + + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + btrfs_release_path(path); + + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, + new->disk_len, 0, + backref->root_id, backref->inum, + new->file_pos, 0); /* start - extent_offset */ + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + ret = 1; +out_free_path: + btrfs_release_path(path); + path->leave_spinning = 0; + btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + &cached, GFP_NOFS); + iput(inode); + return ret; +} + +static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) +{ + struct old_sa_defrag_extent *old, *tmp; + + if (!new) + return; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } + kfree(new); +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ + struct btrfs_path *path; + struct sa_defrag_extent_backref *backref; + struct sa_defrag_extent_backref *prev = NULL; + struct inode *inode; + struct btrfs_root *root; + struct rb_node *node; + int ret; + + inode = new->inode; + root = BTRFS_I(inode)->root; + + path = btrfs_alloc_path(); + if (!path) + return; + + if (!record_extent_backrefs(path, new)) { + btrfs_free_path(path); + goto out; + } + btrfs_release_path(path); + + while (1) { + node = rb_first(&new->root); + if (!node) + break; + rb_erase(node, &new->root); + + backref = rb_entry(node, struct sa_defrag_extent_backref, node); + + ret = relink_extent_backref(path, prev, backref); + WARN_ON(ret < 0); + + kfree(prev); + + if (ret == 1) + prev = backref; + else + prev = NULL; + cond_resched(); + } + kfree(prev); + + btrfs_free_path(path); +out: + free_sa_defrag_extent(new); + + atomic_dec(&root->fs_info->defrag_running); + wake_up(&root->fs_info->transaction_wait); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct old_sa_defrag_extent *old; + struct new_sa_defrag_extent *new; + int ret; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return NULL; + + new->inode = inode; + new->file_pos = ordered->file_offset; + new->len = ordered->len; + new->bytenr = ordered->start; + new->disk_len = ordered->disk_len; + new->compress_type = ordered->compress_type; + new->root = RB_ROOT; + INIT_LIST_HEAD(&new->head); + + path = btrfs_alloc_path(); + if (!path) + goto out_kfree; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = new->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_free_path; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + /* find out all the old extents for the file range */ + while (1) { + struct btrfs_file_extent_item *extent; + struct extent_buffer *l; + int slot; + u64 num_bytes; + u64 offset; + u64 end; + u64 disk_bytenr; + u64 extent_offset; + + l = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out_free_path; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid != btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + break; + if (key.offset >= new->file_pos + new->len) + break; + + extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + + num_bytes = btrfs_file_extent_num_bytes(l, extent); + if (key.offset + num_bytes < new->file_pos) + goto next; + + disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); + if (!disk_bytenr) + goto next; + + extent_offset = btrfs_file_extent_offset(l, extent); + + old = kmalloc(sizeof(*old), GFP_NOFS); + if (!old) + goto out_free_path; + + offset = max(new->file_pos, key.offset); + end = min(new->file_pos + new->len, key.offset + num_bytes); + + old->bytenr = disk_bytenr; + old->extent_offset = extent_offset; + old->offset = offset - key.offset; + old->len = end - offset; + old->new = new; + old->count = 0; + list_add_tail(&old->list, &new->head); +next: + path->slots[0]++; + cond_resched(); + } + + btrfs_free_path(path); + atomic_inc(&root->fs_info->defrag_running); + + return new; + +out_free_path: + btrfs_free_path(path); +out_kfree: + free_sa_defrag_extent(new); + return NULL; +} + +static void btrfs_release_delalloc_bytes(struct btrfs_root *root, + u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, start); + ASSERT(cache); + + spin_lock(&cache->lock); + cache->delalloc_bytes -= len; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); +} + +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. + */ +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +{ + struct inode *inode = ordered_extent->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + struct new_sa_defrag_extent *new = NULL; + int compress_type = 0; + int ret = 0; + u64 logical_len = ordered_extent->len; + bool nolock; + bool truncated = false; + + nolock = btrfs_is_free_space_inode(inode); + + if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + ret = -EIO; + goto out; + } + + btrfs_free_io_failure_record(inode, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1); + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; + logical_len = ordered_extent->truncated_len; + /* Truncated the entire extent, don't bother adding */ + if (!logical_len) + goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + lock_extent_bits(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + 0, &cached_state); + + ret = test_range_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 1, cached_state); + if (ret) { + u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); + if (0 && last_snapshot >= BTRFS_I(inode)->generation) + /* the inode is shared */ + new = record_old_file_extents(inode, ordered_extent); + + clear_extent_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + } + + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_unlock; + } + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compress_type = ordered_extent->compress_type; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compress_type); + ret = btrfs_mark_extent_written(trans, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + logical_len); + } else { + BUG_ON(root == root->fs_info->tree_root); + ret = insert_reserved_file_extent(trans, inode, + ordered_extent->file_offset, + ordered_extent->start, + ordered_extent->disk_len, + logical_len, logical_len, + compress_type, 0, 0, + BTRFS_FILE_EXTENT_REG); + if (!ret) + btrfs_release_delalloc_bytes(root, + ordered_extent->start, + ordered_extent->disk_len); + } + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, ordered_extent->len, + trans->transid); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } + + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } + ret = 0; +out_unlock: + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); +out: + if (root != root->fs_info->tree_root) + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); + + if (ret || truncated) { + u64 start, end; + + if (truncated) + start = ordered_extent->file_offset + logical_len; + else + start = ordered_extent->file_offset; + end = ordered_extent->file_offset + ordered_extent->len - 1; + clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); + + /* Drop the cache for the part of the extent we didn't write. */ + btrfs_drop_extent_cache(inode, start, end, 0); + + /* + * If the ordered extent had an IOERR or something else went + * wrong we need to return the space for this ordered extent + * back to the allocator. We only free the extent in the + * truncated case if we didn't write out the extent at all. + */ + if ((ret || !logical_len) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + btrfs_free_reserved_extent(root, ordered_extent->start, + ordered_extent->disk_len, 1); + } + + + /* + * This needs to be done to make sure anybody waiting knows we are done + * updating everything for this ordered extent. + */ + btrfs_remove_ordered_extent(inode, ordered_extent); + + /* for snapshot-aware defrag */ + if (new) { + if (ret) { + free_sa_defrag_extent(new); + atomic_dec(&root->fs_info->defrag_running); + } else { + relink_file_extents(new); + } + } + + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + return ret; +} + +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); +} + +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) +{ + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; + + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); + + ClearPagePrivate2(page); + if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1, uptodate)) + return 0; + + if (btrfs_is_free_space_inode(inode)) { + wq = root->fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = root->fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + + btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, + NULL); + btrfs_queue_work(wq, &ordered_extent->work); + + return 0; +} + +static int __readpage_endio_check(struct inode *inode, + struct btrfs_io_bio *io_bio, + int icsum, struct page *page, + int pgoff, u64 start, size_t len) +{ + char *kaddr; + u32 csum_expected; + u32 csum = ~(u32)0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + csum_expected = *(((u32 *)io_bio->csum) + icsum); + + kaddr = kmap_atomic(page); + csum = btrfs_csum_data(kaddr + pgoff, csum, len); + btrfs_csum_final(csum, (char *)&csum); + if (csum != csum_expected) + goto zeroit; + + kunmap_atomic(kaddr); + return 0; +zeroit: + if (__ratelimit(&_rs)) + btrfs_warn(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(inode), start, csum, csum_expected); + memset(kaddr + pgoff, 1, len); + flush_dcache_page(page); + kunmap_atomic(kaddr); + if (csum_expected == 0) + return 0; + return -EIO; +} + +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, the code in + * extent_io.c will try to find good copies for us. + */ +static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) +{ + size_t offset = start - page_offset(page); + struct inode *inode = page->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (PageChecked(page)) { + ClearPageChecked(page); + return 0; + } + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + + phy_offset >>= inode->i_sb->s_blocksize_bits; + return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, + start, (size_t)(end - start + 1)); +} + +struct delayed_iput { + struct list_head list; + struct inode *inode; +}; + +/* JDM: If this is fs-wide, why can't we add a pointer to + * btrfs_inode instead and avoid the allocation? */ +void btrfs_add_delayed_iput(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct delayed_iput *delayed; + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed->inode = inode; + + spin_lock(&fs_info->delayed_iput_lock); + list_add_tail(&delayed->list, &fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); +} + +void btrfs_run_delayed_iputs(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + struct delayed_iput *delayed; + int empty; + + spin_lock(&fs_info->delayed_iput_lock); + empty = list_empty(&fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); + if (empty) + return; + + down_read(&fs_info->delayed_iput_sem); + + spin_lock(&fs_info->delayed_iput_lock); + list_splice_init(&fs_info->delayed_iputs, &list); + spin_unlock(&fs_info->delayed_iput_lock); + + while (!list_empty(&list)) { + delayed = list_entry(list.next, struct delayed_iput, list); + list_del(&delayed->list); + iput(delayed->inode); + kfree(delayed); + } + + up_read(&root->fs_info->delayed_iput_sem); +} + +/* + * This is called in transaction commit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure. + */ +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + int ret; + + if (atomic_read(&root->orphan_inodes) || + root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) + return; + + spin_lock(&root->orphan_lock); + if (atomic_read(&root->orphan_inodes)) { + spin_unlock(&root->orphan_lock); + return; + } + + if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { + spin_unlock(&root->orphan_lock); + return; + } + + block_rsv = root->orphan_block_rsv; + root->orphan_block_rsv = NULL; + spin_unlock(&root->orphan_lock); + + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && + btrfs_root_refs(&root->root_item) > 0) { + ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state); + } + + if (block_rsv) { + WARN_ON(block_rsv->size > 0); + btrfs_free_block_rsv(root, block_rsv); + } +} + +/* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. + * + * NOTE: caller of this function should reserve 5 units of metadata for + * this function. + */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = NULL; + int reserve = 0; + int insert = 0; + int ret; + + if (!root->orphan_block_rsv) { + block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!block_rsv) + return -ENOMEM; + } + + spin_lock(&root->orphan_lock); + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(root, block_rsv); + block_rsv = NULL; + } + + if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { +#if 0 + /* + * For proper ENOSPC handling, we should do orphan + * cleanup when mounting. But this introduces backward + * compatibility issue. + */ + if (!xchg(&root->orphan_item_inserted, 1)) + insert = 2; + else + insert = 1; +#endif + insert = 1; + atomic_inc(&root->orphan_inodes); + } + + if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + reserve = 1; + spin_unlock(&root->orphan_lock); + + /* grab metadata reservation from transaction handle */ + if (reserve) { + ret = btrfs_orphan_reserve_metadata(trans, inode); + BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ + } + + /* insert an orphan item to track this unlinked/truncated file */ + if (insert >= 1) { + ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); + if (ret) { + atomic_dec(&root->orphan_inodes); + if (reserve) { + clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + btrfs_orphan_release_metadata(inode); + } + if (ret != -EEXIST) { + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } + ret = 0; + } + + /* insert an orphan item to track subvolume contains orphan files */ + if (insert >= 2) { + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } + return 0; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +static int btrfs_orphan_del(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int delete_item = 0; + int release_rsv = 0; + int ret = 0; + + spin_lock(&root->orphan_lock); + if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) + delete_item = 1; + + if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + release_rsv = 1; + spin_unlock(&root->orphan_lock); + + if (delete_item) { + atomic_dec(&root->orphan_inodes); + if (trans) + ret = btrfs_del_orphan_item(trans, root, + btrfs_ino(inode)); + } + + if (release_rsv) + btrfs_orphan_release_metadata(inode); + + return ret; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. + */ +int btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + u64 last_objectid = 0; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didn't + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + + if (found_key.offset == last_objectid) { + btrfs_err(root->fs_info, + "Error removing orphan entry, stopping orphan cleanup"); + ret = -EINVAL; + goto out; + } + + last_objectid = found_key.offset; + + found_key.objectid = found_key.offset; + found_key.type = BTRFS_INODE_ITEM_KEY; + found_key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); + ret = PTR_ERR_OR_ZERO(inode); + if (ret && ret != -ESTALE) + goto out; + + if (ret == -ESTALE && root == root->fs_info->tree_root) { + struct btrfs_root *dead_root; + struct btrfs_fs_info *fs_info = root->fs_info; + int is_dead_root = 0; + + /* + * this is an orphan in the tree root. Currently these + * could come from 2 sources: + * a) a snapshot deletion in progress + * b) a free space cache inode + * We need to distinguish those two, as the snapshot + * orphan must not get deleted. + * find_dead_roots already ran before us, so if this + * is a snapshot deletion, we should find the root + * in the dead_roots list + */ + spin_lock(&fs_info->trans_lock); + list_for_each_entry(dead_root, &fs_info->dead_roots, + root_list) { + if (dead_root->root_key.objectid == + found_key.objectid) { + is_dead_root = 1; + break; + } + } + spin_unlock(&fs_info->trans_lock); + if (is_dead_root) { + /* prevent this orphan from being found again */ + key.offset = found_key.objectid - 1; + continue; + } + } + /* + * Inode is already gone but the orphan item is still there, + * kill the orphan item. + */ + if (ret == -ESTALE) { + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_debug(root->fs_info, "auto deleting %Lu", + found_key.objectid); + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + continue; + } + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + atomic_inc(&root->orphan_inodes); + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + if (WARN_ON(!S_ISREG(inode->i_mode))) { + iput(inode); + continue; + } + nr_truncate++; + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + iput(inode); + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) { + iput(inode); + goto out; + } + + ret = btrfs_truncate(inode); + if (ret) + btrfs_orphan_del(NULL, inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + if (ret) + goto out; + } + /* release the path since we're done with it */ + btrfs_release_path(path); + + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; + + if (root->orphan_block_rsv) + btrfs_block_rsv_release(root, root->orphan_block_rsv, + (u64)-1); + + if (root->orphan_block_rsv || + test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { + trans = btrfs_join_transaction(root); + if (!IS_ERR(trans)) + btrfs_end_transaction(trans, root); + } + + if (nr_unlink) + btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink); + if (nr_truncate) + btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate); + +out: + if (ret) + btrfs_err(root->fs_info, + "could not do orphan cleanup %d", ret); + btrfs_free_path(path); + return ret; +} + +/* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid, + int *first_xattr_slot) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + static u64 xattr_access = 0; + static u64 xattr_default = 0; + int scanned = 0; + + if (!xattr_access) { + xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)); + } + + slot++; + *first_xattr_slot = -1; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; + if (found_key.offset == xattr_access || + found_key.offset == xattr_default) + return 1; + } + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; + return 1; +} + +/* + * read an inode from the btree into the in-memory inode + */ +static void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + unsigned long ptr; + int maybe_acls; + u32 rdev; + int ret; + bool filled = false; + int first_xattr_slot; + + ret = btrfs_fill_inode(inode, &rdev); + if (!ret) + filled = true; + + path = btrfs_alloc_path(); + if (!path) + goto make_bad; + + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) + goto make_bad; + + leaf = path->nodes[0]; + + if (filled) + goto cache_index; + + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); + + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); + + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_timespec_nsec(leaf, &inode_item->otime); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + + inode->i_version = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: + /* + * If we were modified in the current generation and evicted from memory + * and then re-read we need to do a full sync since we don't have any + * idea about which extents were modified before we were evicted from + * cache. + * + * This is required for both inode re-read from disk and delayed inode + * in delayed_nodes_tree. + */ + if (BTRFS_I(inode)->last_trans == root->fs_info->generation) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + path->slots[0]++; + if (inode->i_nlink != 1 || + path->slots[0] >= btrfs_header_nritems(leaf)) + goto cache_acl; + + btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); + if (location.objectid != btrfs_ino(inode)) + goto cache_acl; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + if (location.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + } else if (location.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, + extref); + } +cache_acl: + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], + btrfs_ino(inode), &first_xattr_slot); + if (first_xattr_slot != -1) { + path->slots[0] = first_xattr_slot; + ret = btrfs_load_inode_props(inode, path); + if (ret) + btrfs_err(root->fs_info, + "error loading props for ino %llu (root %llu): %d", + btrfs_ino(inode), + root->root_key.objectid, ret); + } + btrfs_free_path(path); + + if (!maybe_acls) + cache_no_acl(inode); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = &btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + break; + default: + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + + btrfs_update_iflags(inode); + return; + +make_bad: + btrfs_free_path(path); + make_bad_inode(inode); +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, + &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, &item->atime, + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->atime, + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, &item->mtime, + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->mtime, + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, &item->ctime, + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->ctime, + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); +} + +/* + * copy everything in the in-memory inode into the btree. + */ +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, + 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !root->fs_info->log_root_recovering) { + btrfs_update_root_times(trans, root); + + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + u64 ino = btrfs_ino(inode); + u64 dir_ino = btrfs_ino(dir); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + path->leave_spinning = 1; + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(path); + + /* + * If we don't have dir index, we have to get it by looking up + * the inode ref, since we get the inode ref, remove it directly, + * it is unnecessary to do delayed deletion. + * + * But if we have dir index, needn't search inode ref to get it. + * Since the inode ref is close to the inode item, it is better + * that we delay to delete it, and just do this deletion when + * we update the inode item. + */ + if (BTRFS_I(inode)->dir_index) { + ret = btrfs_delayed_delete_inode_ref(inode); + if (!ret) { + index = BTRFS_I(inode)->dir_index; + goto skip_backref; + } + } + + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, + dir_ino, &index); + if (ret) { + btrfs_info(root->fs_info, + "failed to delete reference to %.*s, inode %llu parent %llu", + name_len, name, ino, dir_ino); + btrfs_abort_transaction(trans, root, ret); + goto err; + } +skip_backref: + ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto err; + } + + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir_ino); + if (ret != 0 && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto err; + } + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + if (ret == -ENOENT) + ret = 0; + else if (ret) + btrfs_abort_transaction(trans, root, ret); +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(inode); + inode_inc_iversion(dir); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, dir); +out: + return ret; +} + +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + int ret; + ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + if (!ret) { + drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + } + return ret; +} + +/* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space, so + * if we cannot make our reservations the normal way try and see if there is + * plenty of slack room in the global reserve to migrate, otherwise we cannot + * allow the unlink to occur. + */ +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret; + + /* + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode + */ + trans = btrfs_start_transaction(root, 5); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; + + if (PTR_ERR(trans) == -ENOSPC) { + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return trans; + ret = btrfs_cond_migrate_bytes(root->fs_info, + &root->fs_info->trans_block_rsv, + num_bytes, 5); + if (ret) { + btrfs_end_transaction(trans, root); + return ERR_PTR(ret); + } + trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->bytes_reserved = num_bytes; + } + return trans; +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + struct inode *inode = d_inode(dentry); + int ret; + + trans = __unlink_start_trans(dir); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0); + + ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), + dentry->d_name.name, dentry->d_name.len); + if (ret) + goto out; + + if (inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + } + +out: + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); + return ret; +} + +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + int ret; + u64 dir_ino = btrfs_ino(dir); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + name, name_len, -1); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + btrfs_release_path(path); + + ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + dir_ino, &index, name, name_len); + if (ret < 0) { + if (ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + di = btrfs_search_dir_index_item(root, path, dir_ino, + name, name_len); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + index = key.offset; + } + btrfs_release_path(path); + + ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode_fallback(trans, root, dir); + if (ret) + btrfs_abort_transaction(trans, root, ret); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int err = 0; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + return -EPERM; + + trans = __unlink_start_trans(dir); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + err = btrfs_unlink_subvol(trans, root, dir, + BTRFS_I(inode)->location.objectid, + dentry->d_name.name, + dentry->d_name.len); + goto out; + } + + err = btrfs_orphan_add(trans, inode); + if (err) + goto out; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); +out: + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); + + return err; +} + +static int truncate_space_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytes_deleted) +{ + int ret; + + bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, + bytes_deleted, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->bytes_reserved += bytes_deleted; + return ret; + +} + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than new_size + * + * csum items that cross the new i_size are truncated to the new size + * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. + */ +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; + u64 last_size = (u64)-1; + u32 found_type = (u8)-1; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int ret; + int err = 0; + u64 ino = btrfs_ino(inode); + u64 bytes_deleted = 0; + bool be_nice = 0; + bool should_throttle = 0; + bool should_end = 0; + + BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + + /* + * for non-free space inodes and ref cows, we want to back off from + * time to time + */ + if (!btrfs_is_free_space_inode(inode) && + test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + be_nice = 1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = -1; + + /* + * We want to drop from the next block forward in case this new size is + * not block aligned since we will be keeping the last block of the + * extent just the way it is. + */ + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root) + btrfs_drop_extent_cache(inode, ALIGN(new_size, + root->sectorsize), (u64)-1, 0); + + /* + * This function is also used to drop the items in the log tree before + * we relog the inode, so if root != BTRFS_I(inode)->root, it means + * it is used to drop the loged items. So we shouldn't kill the delayed + * items. + */ + if (min_type == 0 && root == BTRFS_I(inode)->root) + btrfs_kill_delayed_inode_items(inode); + + key.objectid = ino; + key.offset = (u64)-1; + key.type = (u8)-1; + +search_again: + /* + * with a 16K leaf size and 128MB extents, you can actually queue + * up a huge file in a single leaf. Most of the time that + * bytes_deleted is > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (btrfs_should_end_transaction(trans, root)) { + err = -EAGAIN; + goto error; + } + } + + + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (1) { + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = found_key.type; + + if (found_key.objectid != ino) + break; + + if (found_type < min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); + } + item_end--; + } + if (found_type > min_type) { + del_item = 1; + } else { + if (item_end < new_size) + break; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + } + found_extent = 0; + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + if (del_item) + last_size = found_key.offset; + else + last_size = new_size; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + root->sectorsize); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - + extent_num_bytes); + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state) && + extent_start != 0) + inode_sub_bytes(inode, num_dec); + btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, + fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) { + found_extent = 1; + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) + inode_sub_bytes(inode, num_dec); + } + } + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { + u32 size = new_size - found_key.offset; + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + inode_sub_bytes(inode, item_end + 1 - + new_size); + + /* + * update the ram bytes to properly reflect + * the new size of our item + */ + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = + btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root, path, size, 1); + } else if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) { + inode_sub_bytes(inode, item_end + 1 - + found_key.offset); + } + } +delete: + if (del_item) { + if (!pending_del_nr) { + /* no pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } else { + BUG(); + } + } else { + break; + } + should_throttle = 0; + + if (found_extent && + (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root)) { + btrfs_set_path_blocking(path); + bytes_deleted += extent_num_bytes; + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_bytes, 0, + btrfs_header_owner(leaf), + ino, extent_offset, 0); + BUG_ON(ret); + if (btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_async_run_delayed_refs(root, + trans->delayed_ref_updates * 2, 0); + if (be_nice) { + if (truncate_space_check(trans, root, + extent_num_bytes)) { + should_end = 1; + } + if (btrfs_should_throttle_delayed_refs(trans, + root)) { + should_throttle = 1; + } + } + } + + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot || + should_throttle || should_end) { + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (ret) { + btrfs_abort_transaction(trans, + root, ret); + goto error; + } + pending_del_nr = 0; + } + btrfs_release_path(path); + if (should_throttle) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } + /* + * if we failed to refill our space rsv, bail out + * and let the transaction restart + */ + if (should_end) { + err = -EAGAIN; + goto error; + } + goto search_again; + } else { + path->slots[0]--; + } + } +out: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + if (ret) + btrfs_abort_transaction(trans, root, ret); + } +error: + if (last_size != (u64)-1 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + btrfs_ordered_update_i_size(inode, last_size, NULL); + + btrfs_free_path(path); + + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } + return err; +} + +/* + * btrfs_truncate_page - read, zero a chunk and write a page + * @inode - inode that we're zeroing + * @from - the offset to start zeroing + * @len - the length to zero, 0 to zero the entire range respective to the + * offset + * @front - zero up to the offset instead of from the offset on + * + * This will find the page for the "from" offset and cow the page and zero the + * part we want to zero. This is used with truncate and hole punching. + */ +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front) +{ + struct address_space *mapping = inode->i_mapping; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + char *kaddr; + u32 blocksize = root->sectorsize; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(mapping); + int ret = 0; + u64 page_start; + u64 page_end; + + if ((offset & (blocksize - 1)) == 0 && + (!len || ((len & (blocksize - 1)) == 0))) + goto out; + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) { + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + ret = -ENOMEM; + goto out; + } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if (!PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + set_page_extent_mapped(page); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state, GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + &cached_state); + if (ret) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + goto out_unlock; + } + + if (offset != PAGE_CACHE_SIZE) { + if (!len) + len = PAGE_CACHE_SIZE - offset; + kaddr = kmap(page); + if (front) + memset(kaddr, 0, offset); + else + memset(kaddr + offset, 0, len); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state, + GFP_NOFS); + +out_unlock: + if (ret) + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, + u64 offset, u64 len) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * Still need to make sure the inode looks like it's been updated so + * that any holes get logged if we fsync. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) { + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = root->log_transid; + BTRFS_I(inode)->last_log_commit = root->last_log_commit; + return 0; + } + + /* + * 1 - for the one we're dropping + * 1 - for the one we're adding + * 1 - for updating the inode. + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + return ret; + } + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, len, 0, len, 0, 0, 0); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + return ret; +} + +/* + * This function puts in dummy file extents for the area we're creating a hole + * for. So if we are truncating this file to a larger size we need to insert + * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for + * the range between oldsize and size + */ +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 hole_start = ALIGN(oldsize, root->sectorsize); + u64 block_end = ALIGN(size, root->sectorsize); + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err = 0; + + /* + * If our size started in the middle of a page we need to zero out the + * rest of the page before we expand the i_size, otherwise we could + * expose stale data. + */ + err = btrfs_truncate_page(inode, oldsize, 0, 0); + if (err) + return err; + + if (size <= hole_start) + return 0; + + while (1) { + struct btrfs_ordered_extent *ordered; + + lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + &cached_state); + ordered = btrfs_lookup_ordered_range(inode, hole_start, + block_end - hole_start); + if (!ordered) + break; + unlock_extent_cached(io_tree, hole_start, block_end - 1, + &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset, 0); + if (IS_ERR(em)) { + err = PTR_ERR(em); + em = NULL; + break; + } + last_byte = min(extent_map_end(em), block_end); + last_byte = ALIGN(last_byte , root->sectorsize); + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + struct extent_map *hole_em; + hole_size = last_byte - cur_offset; + + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); + if (err) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + hole_size - 1, 0); + hole_em = alloc_extent_map(); + if (!hole_em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + goto next; + } + hole_em->start = cur_offset; + hole_em->len = hole_size; + hole_em->orig_start = cur_offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->orig_block_len = 0; + hole_em->ram_bytes = hole_size; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = root->fs_info->generation; + + while (1) { + write_lock(&em_tree->lock); + err = add_extent_mapping(em_tree, hole_em, 1); + write_unlock(&em_tree->lock); + if (err != -EEXIST) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + + hole_size - 1, 0); + } + free_extent_map(hole_em); + } +next: + free_extent_map(em); + em = NULL; + cur_offset = last_byte; + if (cur_offset >= block_end) + break; + } + free_extent_map(em); + unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, + GFP_NOFS); + return err; +} + +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +static void wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} + +static int btrfs_setsize(struct inode *inode, struct iattr *attr) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; + int ret; + + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + } + + if (newsize > oldsize) { + truncate_pagecache(inode, newsize); + /* + * Don't do an expanding truncate while snapshoting is ongoing. + * This is to ensure the snapshot captures a fully consistent + * state of this file - if the snapshot captures this expanding + * truncation, it must capture all writes that happened before + * this truncation. + */ + wait_for_snapshot_creation(root); + ret = btrfs_cont_expand(inode, oldsize, newsize); + if (ret) { + btrfs_end_write_no_snapshoting(root); + return ret; + } + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_end_write_no_snapshoting(root); + return PTR_ERR(trans); + } + + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + ret = btrfs_update_inode(trans, root, inode); + btrfs_end_write_no_snapshoting(root); + btrfs_end_transaction(trans, root); + } else { + + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. + */ + if (newsize == 0) + set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags); + + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + truncate_setsize(inode, newsize); + + /* Disable nonlocked read DIO to avoid the end less truncate */ + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + + ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) { + int err; + + /* + * failed to truncate, disk_i_size is only adjusted down + * as we remove extents, so it should represent the true + * size of the inode, so reset the in memory size and + * delete our orphan entry. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + return ret; + } + i_size_write(inode, BTRFS_I(inode)->disk_i_size); + err = btrfs_orphan_del(trans, inode); + if (err) + btrfs_abort_transaction(trans, root, err); + btrfs_end_transaction(trans, root); + } + } + + return ret; +} + +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct btrfs_root *root = BTRFS_I(inode)->root; + int err; + + if (btrfs_root_readonly(root)) + return -EROFS; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + err = btrfs_setsize(inode, attr); + if (err) + return err; + } + + if (attr->ia_valid) { + setattr_copy(inode, attr); + inode_inc_iversion(inode); + err = btrfs_dirty_inode(inode); + + if (!err && attr->ia_valid & ATTR_MODE) + err = posix_acl_chmod(inode, inode->i_mode); + } + + return err; +} + +/* + * While truncating the inode pages during eviction, we get the VFS calling + * btrfs_invalidatepage() against each page of the inode. This is slow because + * the calls to btrfs_invalidatepage() result in a huge amount of calls to + * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting + * extent_state structures over and over, wasting lots of time. + * + * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all + * those expensive operations on a per page basis and do only the ordered io + * finishing, while we release here the extent_map and extent_state structures, + * without the excessive merging and splitting. + */ +static void evict_inode_truncate_pages(struct inode *inode) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; + struct rb_node *node; + + ASSERT(inode->i_state & I_FREEING); + truncate_inode_pages_final(&inode->i_data); + + write_lock(&map_tree->lock); + while (!RB_EMPTY_ROOT(&map_tree->map)) { + struct extent_map *em; + + node = rb_first(&map_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(map_tree, em); + free_extent_map(em); + if (need_resched()) { + write_unlock(&map_tree->lock); + cond_resched(); + write_lock(&map_tree->lock); + } + } + write_unlock(&map_tree->lock); + + spin_lock(&io_tree->lock); + while (!RB_EMPTY_ROOT(&io_tree->state)) { + struct extent_state *state; + struct extent_state *cached_state = NULL; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + atomic_inc(&state->refs); + spin_unlock(&io_tree->lock); + + lock_extent_bits(io_tree, state->start, state->end, + 0, &cached_state); + clear_extent_bit(io_tree, state->start, state->end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + free_extent_state(state); + + cond_resched(); + spin_lock(&io_tree->lock); + } + spin_unlock(&io_tree->lock); +} + +void btrfs_evict_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv, *global_rsv; + int steal_from_global = 0; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + int ret; + + trace_btrfs_inode_evict(inode); + + evict_inode_truncate_pages(inode); + + if (inode->i_nlink && + ((btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_is_free_space_inode(inode))) + goto no_delete; + + if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + + btrfs_free_io_failure_record(inode, 0, (u64)-1); + + if (root->fs_info->log_root_recovering) { + BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)); + goto no_delete; + } + + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); + goto no_delete; + } + + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv->size = min_size; + rsv->failfast = 1; + global_rsv = &root->fs_info->global_block_rsv; + + btrfs_i_size_write(inode, 0); + + /* + * This is a bit simpler than btrfs_truncate since we've already + * reserved our space for our orphan item in the unlink, so we just + * need to reserve some slack space in case we add bytes and update + * inode item when doing the truncate. + */ + while (1) { + ret = btrfs_block_rsv_refill(root, rsv, min_size, + BTRFS_RESERVE_FLUSH_LIMIT); + + /* + * Try and steal from the global reserve since we will + * likely not use this space anyway, we want to try as + * hard as possible to get this to work. + */ + if (ret) + steal_from_global++; + else + steal_from_global = 0; + ret = 0; + + /* + * steal_from_global == 0: we reserved stuff, hooray! + * steal_from_global == 1: we didn't reserve stuff, boo! + * steal_from_global == 2: we've committed, still not a lot of + * room but maybe we'll have room in the global reserve this + * time. + * steal_from_global == 3: abandon all hope! + */ + if (steal_from_global > 2) { + btrfs_warn(root->fs_info, + "Could not get space for a delete, will truncate on mount %d", + ret); + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + /* + * We can't just steal from the global reserve, we need tomake + * sure there is room to do it, if not we need to commit and try + * again. + */ + if (steal_from_global) { + if (!btrfs_check_space_for_delayed_refs(trans, root)) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, + min_size); + else + ret = -ENOSPC; + } + + /* + * Couldn't steal from the global reserve, we have too much + * pending stuff built up, commit the transaction and try it + * again. + */ + if (ret) { + ret = btrfs_commit_transaction(trans, root); + if (ret) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + continue; + } else { + steal_from_global = 0; + } + + trans->block_rsv = rsv; + + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + if (ret != -ENOSPC && ret != -EAGAIN) + break; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root); + } + + btrfs_free_block_rsv(root, rsv); + + /* + * Errors here aren't a big deal, it just means we leave orphan items + * in the tree. They will be cleaned up on the next mount. + */ + if (ret == 0) { + trans->block_rsv = root->orphan_block_rsv; + btrfs_orphan_del(trans, inode); + } else { + btrfs_orphan_del(NULL, inode); + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + if (!(root == root->fs_info->tree_root || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) + btrfs_return_ino(root, btrfs_ino(inode)); + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); +no_delete: + btrfs_remove_delayed_node(inode); + clear_inode(inode); + return; +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. + */ +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + struct btrfs_key *location) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, + namelen, 0); + if (IS_ERR(di)) + ret = PTR_ERR(di); + + if (IS_ERR_OR_NULL(di)) + goto out_err; + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); +out: + btrfs_free_path(path); + return ret; +out_err: + location->objectid = 0; + goto out; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. + */ +static int fixup_tree_root_location(struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry, + struct btrfs_key *location, + struct btrfs_root **sub_root) +{ + struct btrfs_path *path; + struct btrfs_root *new_root; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + err = -ENOENT; + key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path, + 0, 0); + if (ret) { + if (ret < 0) + err = ret; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || + btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + goto out; + + ret = memcmp_extent_buffer(leaf, dentry->d_name.name, + (unsigned long)(ref + 1), + dentry->d_name.len); + if (ret) + goto out; + + btrfs_release_path(path); + + new_root = btrfs_read_fs_root_no_name(root->fs_info, location); + if (IS_ERR(new_root)) { + err = PTR_ERR(new_root); + goto out; + } + + *sub_root = new_root; + location->objectid = btrfs_root_dirid(&new_root->root_item); + location->type = BTRFS_INODE_ITEM_KEY; + location->offset = 0; + err = 0; +out: + btrfs_free_path(path); + return err; +} + +static void inode_tree_add(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; + struct rb_node *new = &BTRFS_I(inode)->rb_node; + u64 ino = btrfs_ino(inode); + + if (inode_unhashed(inode)) + return; + parent = NULL; + spin_lock(&root->inode_lock); + p = &root->inode_tree.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_inode, rb_node); + + if (ino < btrfs_ino(&entry->vfs_inode)) + p = &parent->rb_left; + else if (ino > btrfs_ino(&entry->vfs_inode)) + p = &parent->rb_right; + else { + WARN_ON(!(entry->vfs_inode.i_state & + (I_WILL_FREE | I_FREEING))); + rb_replace_node(parent, new, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + return; + } + } + rb_link_node(new, parent, p); + rb_insert_color(new, &root->inode_tree); + spin_unlock(&root->inode_lock); +} + +static void inode_tree_del(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + empty = RB_EMPTY_ROOT(&root->inode_tree); + } + spin_unlock(&root->inode_lock); + + if (empty && btrfs_root_refs(&root->root_item) == 0) { + synchronize_srcu(&root->fs_info->subvol_srcu); + spin_lock(&root->inode_lock); + empty = RB_EMPTY_ROOT(&root->inode_tree); + spin_unlock(&root->inode_lock); + if (empty) + btrfs_add_dead_root(root); + } +} + +void btrfs_invalidate_inodes(struct btrfs_root *root) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(&entry->vfs_inode)) + node = node->rb_left; + else if (objectid > btrfs_ino(&entry->vfs_inode)) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(&entry->vfs_inode)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = btrfs_ino(&entry->vfs_inode) + 1; + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will have it removed from + * the inode cache when its usage count + * hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->location->objectid; + memcpy(&BTRFS_I(inode)->location, args->location, + sizeof(*args->location)); + BTRFS_I(inode)->root = args->root; + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + return args->location->objectid == BTRFS_I(inode)->location.objectid && + args->root == BTRFS_I(inode)->root; +} + +static struct inode *btrfs_iget_locked(struct super_block *s, + struct btrfs_key *location, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + unsigned long hashval = btrfs_inode_hash(location->objectid, root); + + args.location = location; + args.root = root; + + inode = iget5_locked(s, hashval, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* Get an inode object given its location and corresponding root. + * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + if (!is_bad_inode(inode)) { + inode_tree_add(inode); + unlock_new_inode(inode); + if (new) + *new = 1; + } else { + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(-ESTALE); + } + } + + return inode; +} + +static struct inode *new_simple_dir(struct super_block *s, + struct btrfs_key *key, + struct btrfs_root *root) +{ + struct inode *inode = new_inode(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + + inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + inode->i_op = &btrfs_dir_ro_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; + + return inode; +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + int index; + int ret = 0; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &location); + if (ret < 0) + return ERR_PTR(ret); + + if (location.objectid == 0) + return ERR_PTR(-ENOENT); + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, &location, root, NULL); + return inode; + } + + BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); + + index = srcu_read_lock(&root->fs_info->subvol_srcu); + ret = fixup_tree_root_location(root, dir, dentry, + &location, &sub_root); + if (ret < 0) { + if (ret != -ENOENT) + inode = ERR_PTR(ret); + else + inode = new_simple_dir(dir->i_sb, &location, sub_root); + } else { + inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); + } + srcu_read_unlock(&root->fs_info->subvol_srcu, index); + + if (!IS_ERR(inode) && root != sub_root) { + down_read(&root->fs_info->cleanup_work_sem); + if (!(inode->i_sb->s_flags & MS_RDONLY)) + ret = btrfs_orphan_cleanup(sub_root); + up_read(&root->fs_info->cleanup_work_sem); + if (ret) { + iput(inode); + inode = ERR_PTR(ret); + } + } + + return inode; +} + +static int btrfs_dentry_delete(const struct dentry *dentry) +{ + struct btrfs_root *root; + struct inode *inode = d_inode(dentry); + + if (!inode && !IS_ROOT(dentry)) + inode = d_inode(dentry->d_parent); + + if (inode) { + root = BTRFS_I(inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return 1; + } + return 0; +} + +static void btrfs_dentry_release(struct dentry *dentry) +{ + kfree(dentry->d_fsdata); +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode; + + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOENT) + inode = NULL; + else + return ERR_CAST(inode); + } + + return d_splice_alias(inode, dentry); +} + +unsigned char btrfs_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct list_head ins_list; + struct list_head del_list; + int ret; + struct extent_buffer *leaf; + int slot; + unsigned char d_type; + int over = 0; + u32 di_cur; + u32 di_total; + u32 di_len; + int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; + int is_curr = 0; /* ctx->pos points to the current index? */ + + /* FIXME, use a real flag for deciding about the key type */ + if (root->fs_info->tree_root == root) + key_type = BTRFS_DIR_ITEM_KEY; + + if (!dir_emit_dots(file, ctx)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + + if (key_type == BTRFS_DIR_INDEX_KEY) { + INIT_LIST_HEAD(&ins_list); + INIT_LIST_HEAD(&del_list); + btrfs_get_delayed_items(inode, &ins_list, &del_list); + } + + key.type = key_type; + key.offset = ctx->pos; + key.objectid = btrfs_ino(inode); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; + } + + item = btrfs_item_nr(slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + if (found_key.type != key_type) + break; + if (found_key.offset < ctx->pos) + goto next; + if (key_type == BTRFS_DIR_INDEX_KEY && + btrfs_should_delete_dir_index(&del_list, + found_key.offset)) + goto next; + + ctx->pos = found_key.offset; + is_curr = 1; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + di_cur = 0; + di_total = btrfs_item_size(leaf, item); + + while (di_cur < di_total) { + struct btrfs_key location; + + if (verify_dir_item(root, leaf, di)) + break; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + + /* is this a reference to our own snapshot? If so + * skip it. + * + * In contrast to old kernels, we insert the snapshot's + * dir item and dir index after it has been created, so + * we won't find a reference to our own snapshot. We + * still keep the following code for backward + * compatibility. + */ + if (location.type == BTRFS_ROOT_ITEM_KEY && + location.objectid == root->root_key.objectid) { + over = 0; + goto skip; + } + over = !dir_emit(ctx, name_ptr, name_len, + location.objectid, d_type); + +skip: + if (name_ptr != tmp_name) + kfree(name_ptr); + + if (over) + goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) + sizeof(*di); + di_cur += di_len; + di = (struct btrfs_dir_item *)((char *)di + di_len); + } +next: + path->slots[0]++; + } + + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (is_curr) + ctx->pos++; + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); + if (ret) + goto nopos; + } + + /* Reached end of directory/root. Bump pos past the last item. */ + ctx->pos++; + + /* + * Stop new entries from being returned after we return the last + * entry. + * + * New directory entries are assigned a strictly increasing + * offset. This means that new entries created during readdir + * are *guaranteed* to be seen in the future by that readdir. + * This has broken buggy programs which operate on names as + * they're returned by readdir. Until we re-use freed offsets + * we have this hack to stop new entries from being returned + * under the assumption that they'll never reach this huge + * offset. + * + * This is being careful not to overflow 32bit loff_t unless the + * last entry requires it because doing so has broken 32bit apps + * in the past. + */ + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (ctx->pos >= INT_MAX) + ctx->pos = LLONG_MAX; + else + ctx->pos = INT_MAX; + } +nopos: + ret = 0; +err: + if (key_type == BTRFS_DIR_INDEX_KEY) + btrfs_put_delayed_items(&ins_list, &del_list); + btrfs_free_path(path); + return ret; +} + +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + bool nolock = false; + + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) + return 0; + + if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) + nolock = true; + + if (wbc->sync_mode == WB_SYNC_ALL) { + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + } + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. + */ +static int btrfs_dirty_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret; + + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) + return 0; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret == -ENOSPC) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_update_inode(trans, root, inode); + } + btrfs_end_transaction(trans, root); + if (BTRFS_I(inode)->delayed_node) + btrfs_balance_delayed_items(root); + + return ret; +} + +/* + * This is a copy of file_update_time. We need this so we can return error on + * ENOSPC for updating the inode in the case of file write and mmap writes. + */ +static int btrfs_update_time(struct inode *inode, struct timespec *now, + int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root)) + return -EROFS; + + if (flags & S_VERSION) + inode_inc_iversion(inode); + if (flags & S_CTIME) + inode->i_ctime = *now; + if (flags & S_MTIME) + inode->i_mtime = *now; + if (flags & S_ATIME) + inode->i_atime = *now; + return btrfs_dirty_inode(inode); +} + +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ +static int btrfs_set_inode_index_count(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + /* + * MAGIC NUMBER EXPLANATION: + * since we search a directory based on f_pos we have to start at 2 + * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody + * else has to start at 2 + */ + if (path->slots[0] == 0) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != btrfs_ino(inode) || + found_key.type != BTRFS_DIR_INDEX_KEY) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + BTRFS_I(inode)->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct inode *dir, u64 *index) +{ + int ret = 0; + + if (BTRFS_I(dir)->index_cnt == (u64)-1) { + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + } + + *index = BTRFS_I(dir)->index_cnt; + BTRFS_I(dir)->index_cnt++; + + return ret; +} + +static int btrfs_insert_inode_locked(struct inode *inode) +{ + struct btrfs_iget_args args; + args.location = &BTRFS_I(inode)->location; + args.root = BTRFS_I(inode)->root; + + return insert_inode_locked4(inode, + btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), + btrfs_find_actor, &args); +} + +static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + const char *name, int name_len, + u64 ref_objectid, u64 objectid, + umode_t mode, u64 *index) +{ + struct inode *inode; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + int nitems = name ? 2 : 1; + unsigned long ptr; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + inode = new_inode(root->fs_info->sb); + if (!inode) { + btrfs_free_path(path); + return ERR_PTR(-ENOMEM); + } + + /* + * O_TMPFILE, set link count to 0, so that after this point, + * we fill in an inode item with the correct link count. + */ + if (!name) + set_nlink(inode, 0); + + /* + * we have to initialize this early, so we can reclaim the inode + * number if we fail afterwards in this function. + */ + inode->i_ino = objectid; + + if (dir && name) { + trace_btrfs_inode_request(dir); + + ret = btrfs_set_inode_index(dir, index); + if (ret) { + btrfs_free_path(path); + iput(inode); + return ERR_PTR(ret); + } + } else if (dir) { + *index = 0; + } + /* + * index_cnt is ignored for everything but a dir, + * btrfs_get_inode_index_count has an explanation for the magic + * number + */ + BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->dir_index = *index; + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; + inode->i_generation = BTRFS_I(inode)->generation; + + /* + * We could have gotten an inode number from somebody who was fsynced + * and then removed in this same transaction, so let's just set full + * sync since it will be a full sync anyway and this will blow away the + * old info in the log. + */ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + + key[0].objectid = objectid; + key[0].type = BTRFS_INODE_ITEM_KEY; + key[0].offset = 0; + + sizes[0] = sizeof(struct btrfs_inode_item); + + if (name) { + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. + */ + key[1].objectid = objectid; + key[1].type = BTRFS_INODE_REF_KEY; + key[1].offset = ref_objectid; + + sizes[1] = name_len + sizeof(*ref); + } + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + location->type = BTRFS_INODE_ITEM_KEY; + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) + goto fail; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); + if (ret != 0) + goto fail_unlock; + + inode_init_owner(inode, dir, mode); + inode_set_bytes(inode, 0); + + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; + + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, + sizeof(*inode_item)); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + if (name) { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + btrfs_inherit_iflags(inode, dir); + + if (S_ISREG(mode)) { + if (btrfs_test_opt(root, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(root, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; + } + + inode_tree_add(inode); + + trace_btrfs_inode_new(inode); + btrfs_set_inode_last_trans(trans, inode); + + btrfs_update_root_times(trans, root); + + ret = btrfs_inode_inherit_props(trans, inode, dir); + if (ret) + btrfs_err(root->fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(inode), root->root_key.objectid, ret); + + return inode; + +fail_unlock: + unlock_new_inode(inode); +fail: + if (dir && name) + BTRFS_I(dir)->index_cnt--; + btrfs_free_path(path); + iput(inode); + return ERR_PTR(ret); +} + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + u64 ino = btrfs_ino(inode); + u64 parent_ino = btrfs_ino(parent_inode); + + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); + } else { + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + } + + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, index, name, name_len); + } else if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, + parent_ino, index); + } + + /* Nothing to clean up yet */ + if (ret) + return ret; + + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode, &key, + btrfs_inode_type(inode), index); + if (ret == -EEXIST || ret == -EOVERFLOW) + goto fail_dir_item; + else if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + inode_inc_iversion(parent_inode); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; + +fail_dir_item: + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + u64 local_index; + int err; + err = btrfs_del_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, &local_index, name, name_len); + + } else if (add_backref) { + u64 local_index; + int err; + + err = btrfs_del_inode_ref(trans, root, name, name_len, + ino, parent_ino, &local_index); + } + return ret; +} + +static int btrfs_add_nondir(struct btrfs_trans_handle *trans, + struct inode *dir, struct dentry *dentry, + struct inode *inode, int backref, u64 index) +{ + int err = btrfs_add_link(trans, dir, inode, + dentry->d_name.name, dentry->d_name.len, + backref, index); + if (err > 0) + err = -EEXIST; + return err; +} + +static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) { + goto out_unlock_inode; + } else { + btrfs_update_inode(trans, root, inode); + unlock_new_inode(inode); + d_instantiate(dentry, inode); + } + +out_unlock: + btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; + +} + +static int btrfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int drop_inode_on_err = 0; + int err; + u64 objectid; + u64 index = 0; + + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + drop_inode_on_err = 1; + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; + + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock_inode; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) + goto out_unlock_inode; + + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + unlock_new_inode(inode); + d_instantiate(dentry, inode); + +out_unlock: + btrfs_end_transaction(trans, root); + if (err && drop_inode_on_err) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); + return err; + +out_unlock_inode: + unlock_new_inode(inode); + goto out_unlock; + +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = d_inode(old_dentry); + u64 index; + int err; + int drop_inode = 0; + + /* do not allow sys_link's with other subvols of the same device */ + if (root->objectid != BTRFS_I(inode)->root->objectid) + return -EXDEV; + + if (inode->i_nlink >= BTRFS_LINK_MAX) + return -EMLINK; + + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; + + /* + * 2 items for inode and inode ref + * 2 items for dir items + * 1 item for parent inode + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto fail; + } + + /* There are several dir indexes for this inode, clear the cache. */ + BTRFS_I(inode)->dir_index = 0ULL; + inc_nlink(inode); + inode_inc_iversion(inode); + inode->i_ctime = CURRENT_TIME; + ihold(inode); + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); + + err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); + + if (err) { + drop_inode = 1; + } else { + struct dentry *parent = dentry->d_parent; + err = btrfs_update_inode(trans, root, inode); + if (err) + goto fail; + if (inode->i_nlink == 1) { + /* + * If new hard link count is 1, it's a file created + * with open(2) O_TMPFILE flag. + */ + err = btrfs_orphan_del(trans, inode); + if (err) + goto fail; + } + d_instantiate(dentry, inode); + btrfs_log_new_name(trans, inode, NULL, parent); + } + + btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root); + return err; +} + +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int err = 0; + int drop_on_err = 0; + u64 objectid = 0; + u64 index = 0; + + /* + * 2 items for inode and ref + * 2 items for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_fail; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + S_IFDIR | mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_fail; + } + + drop_on_err = 1; + /* these must be set before we unlock the inode */ + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_fail_inode; + + btrfs_i_size_write(inode, 0); + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_fail_inode; + + err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, + dentry->d_name.len, 0, index); + if (err) + goto out_fail_inode; + + d_instantiate(dentry, inode); + /* + * mkdir is special. We're unlocking after we call d_instantiate + * to avoid a race with nfsd calling d_instantiate. + */ + unlock_new_inode(inode); + drop_on_err = 0; + +out_fail: + btrfs_end_transaction(trans, root); + if (drop_on_err) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); + return err; + +out_fail_inode: + unlock_new_inode(inode); + goto out_fail; +} + +/* Find next extent map of a given extent map, caller needs to ensure locks */ +static struct extent_map *next_extent_map(struct extent_map *em) +{ + struct rb_node *next; + + next = rb_next(&em->rb_node); + if (!next) + return NULL; + return container_of(next, struct extent_map, rb_node); +} + +static struct extent_map *prev_extent_map(struct extent_map *em) +{ + struct rb_node *prev; + + prev = rb_prev(&em->rb_node); + if (!prev) + return NULL; + return container_of(prev, struct extent_map, rb_node); +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * the existing extent is the nearest extent to map_start, + * and an extent that you want to insert, deal with overlap and insert + * the best fitted new extent into the tree. + */ +static int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start) +{ + struct extent_map *prev; + struct extent_map *next; + u64 start; + u64 end; + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + + if (existing->start > map_start) { + next = existing; + prev = prev_extent_map(next); + } else { + prev = existing; + next = next_extent_map(prev); + } + + start = prev ? extent_map_end(prev) : em->start; + start = max_t(u64, start, em->start); + end = next ? next->start : extent_map_end(em); + end = min_t(u64, end, extent_map_end(em)); + start_diff = start - em->start; + em->start = start; + em->len = end - start; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len -= start_diff; + } + return add_extent_mapping(em_tree, em, 0); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + int compress_type; + + WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + if (!tmp) + return -ENOMEM; + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); + kfree(tmp); + return ret; +} + +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the in-ram + * representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. + */ + +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + int ret; + int err = 0; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = btrfs_ino(inode); + u32 found_type; + struct btrfs_path *path = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_trans_handle *trans = NULL; + const bool new_inline = !page || create; + +again: + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; + read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(); + if (!em) { + err = -ENOMEM; + goto out; + } + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + /* + * Chances are we'll be called again, so go ahead and do + * readahead + */ + path->reada = 1; + } + + ret = btrfs_lookup_file_extent(trans, root, path, + objectid, start, trans != NULL); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + /* are we inside the extent that was found? */ + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = found_key.type; + if (found_key.objectid != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + /* + * If we backup past the first extent we want to move forward + * and see if there is an extent in front of us, otherwise we'll + * say there is a hole for our whole search range which can + * cause problems. + */ + extent_end = start; + goto next; + } + + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, item); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); + extent_end = ALIGN(extent_start + size, root->sectorsize); + } +next: + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + goto not_found; + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + if (start > found_key.offset) + goto next; + em->start = start; + em->orig_start = start; + em->len = found_key.offset - start; + goto not_found_em; + } + + btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + unsigned long ptr; + char *map; + size_t size; + size_t extent_offset; + size_t copy_size; + + if (new_inline) + goto out; + + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); + extent_offset = page_offset(page) + pg_offset - extent_start; + copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, + size - extent_offset); + em->start = extent_start + extent_offset; + em->len = ALIGN(copy_size, root->sectorsize); + em->orig_block_len = em->len; + em->orig_start = em->start; + ptr = btrfs_file_extent_inline_start(item) + extent_offset; + if (create == 0 && !PageUptodate(page)) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + if (ret) { + err = ret; + goto out; + } + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + memset(map + pg_offset + copy_size, 0, + PAGE_CACHE_SIZE - pg_offset - + copy_size); + } + kunmap(page); + } + flush_dcache_page(page); + } else if (create && PageUptodate(page)) { + BUG(); + if (!trans) { + kunmap(page); + free_extent_map(em); + em = NULL; + + btrfs_release_path(path); + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) + return ERR_CAST(trans); + goto again; + } + map = kmap(page); + write_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + btrfs_mark_buffer_dirty(leaf); + } + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, NULL, GFP_NOFS); + goto insert; + } +not_found: + em->start = start; + em->orig_start = start; + em->len = len; +not_found_em: + em->block_start = EXTENT_MAP_HOLE; + set_bit(EXTENT_FLAG_VACANCY, &em->flags); +insert: + btrfs_release_path(path); + if (em->start > start || extent_map_end(em) <= start) { + btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", + em->start, em->len, start, len); + err = -EIO; + goto out; + } + + err = 0; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = search_extent_mapping(em_tree, start, len); + /* + * existing will always be non-NULL, since there must be + * extent causing the -EEXIST. + */ + if (start >= extent_map_end(existing) || + start <= existing->start) { + /* + * The existing extent map is the one nearest to + * the [start, start + len) range which overlaps + */ + err = merge_extent_mapping(em_tree, existing, + em, start); + free_extent_map(existing); + if (err) { + free_extent_map(em); + em = NULL; + } + } else { + free_extent_map(em); + em = existing; + err = 0; + } + } + write_unlock(&em_tree->lock); +out: + + trace_btrfs_get_extent(root, em); + + if (path) + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + BUG_ON(!em); /* Error is always set */ + return em; +} + +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. + */ + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start, range_start); + found = found_end - range_start; + + if (found > 0) { + u64 hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map *em; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, + alloc_hint, &ins, 1, 1); + if (ret) + return ERR_PTR(ret); + + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, ins.offset, 0); + if (IS_ERR(em)) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + return em; + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + free_extent_map(em); + return ERR_PTR(ret); + } + + return em; +} + +/* + * returns 1 when the nocow is safe, < 1 on error, 0 if the + * block must be cow'd + */ +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_bytenr; + u64 backref_offset; + u64 extent_end; + u64 num_bytes; + int slot; + int found_type; + bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); + if (ret < 0) + goto out; + + slot = path->slots[0]; + if (ret == 1) { + if (slot == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + slot--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (found_type != BTRFS_FILE_EXTENT_REG && + found_type != BTRFS_FILE_EXTENT_PREALLOC) { + /* not a regular extent, must cow */ + goto out; + } + + if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + goto out; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= offset) + goto out; + + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + goto out; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + + backref_offset = btrfs_file_extent_offset(leaf, fi); + + if (orig_start) { + *orig_start = key.offset - backref_offset; + *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + } + + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out; + + num_bytes = min(offset + *len, extent_end) - offset; + if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 range_end; + + range_end = round_up(offset + num_bytes, root->sectorsize) - 1; + ret = test_range_bit(io_tree, offset, range_end, + EXTENT_DELALLOC, 0, NULL); + if (ret) { + ret = -EAGAIN; + goto out; + } + } + + btrfs_release_path(path); + + /* + * look for other files referencing this extent, if we + * find any we must cow + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = 0; + goto out; + } + + ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), + key.offset - backref_offset, disk_bytenr); + btrfs_end_transaction(trans, root); + if (ret) { + ret = 0; + goto out; + } + + /* + * adjust disk_bytenr and num_bytes to cover just the bytes + * in this extent we are about to write. If there + * are any csums in that range we have to cow in order + * to keep the csums correct + */ + disk_bytenr += backref_offset; + disk_bytenr += offset - key.offset; + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out; + /* + * all of the above have passed, it is safe to overwrite this extent + * without cow + */ + *len = num_bytes; + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) +{ + struct radix_tree_root *root = &inode->i_mapping->page_tree; + int found = false; + void **pagep = NULL; + struct page *page = NULL; + int start_idx; + int end_idx; + + start_idx = start >> PAGE_CACHE_SHIFT; + + /* + * end is the last byte in the last page. end == start is legal + */ + end_idx = end >> PAGE_CACHE_SHIFT; + + rcu_read_lock(); + + /* Most of the code in this while loop is lifted from + * find_get_page. It's been modified to begin searching from a + * page and return just the first page found in that range. If the + * found idx is less than or equal to the end idx then we know that + * a page exists. If no pages are found or if those pages are + * outside of the range then we're fine (yay!) */ + while (page == NULL && + radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page)) + break; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + page = NULL; + continue; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + page = NULL; + break; /* TODO: Is this relevant for this use case? */ + } + + if (!page_cache_get_speculative(page)) { + page = NULL; + continue; + } + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + page = NULL; + } + } + + if (page) { + if (page->index <= end_idx) + found = true; + page_cache_release(page); + } + + rcu_read_unlock(); + return found; +} + +static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, + struct extent_state **cached_state, int writing) +{ + struct btrfs_ordered_extent *ordered; + int ret = 0; + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, cached_state); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + + /* + * We need to make sure there are no buffered pages in this + * range either, we could have raced between the invalidate in + * generic_file_direct_write and locking the extent. The + * invalidate needs to happen so that reads after a write do not + * get stale data. + */ + if (!ordered && + (!writing || + !btrfs_page_exists_in_range(inode, lockstart, lockend))) + break; + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state, GFP_NOFS); + + if (ordered) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } else { + /* Screw you mmap */ + ret = btrfs_fdatawrite_range(inode, lockstart, lockend); + if (ret) + break; + ret = filemap_fdatawait_range(inode->i_mapping, + lockstart, + lockend); + if (ret) + break; + + /* + * If we found a page that couldn't be invalidated just + * fall back to buffered. + */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + lockstart >> PAGE_CACHE_SHIFT, + lockend >> PAGE_CACHE_SHIFT); + if (ret) + break; + } + + cond_resched(); + } + + return ret; +} + +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + u64 orig_block_len, u64 ram_bytes, + int type) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(); + if (!em) + return ERR_PTR(-ENOMEM); + + em->start = start; + em->orig_start = orig_start; + em->mod_start = start; + em->mod_len = len; + em->len = len; + em->block_len = block_len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->orig_block_len = orig_block_len; + em->ram_bytes = ram_bytes; + em->generation = -1; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + if (type == BTRFS_ORDERED_PREALLOC) + set_bit(EXTENT_FLAG_FILLING, &em->flags); + + do { + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + + if (ret) { + free_extent_map(em); + return ERR_PTR(ret); + } + + return em; +} + + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; + u64 start = iblock << inode->i_blkbits; + u64 lockstart, lockend; + u64 len = bh_result->b_size; + u64 *outstanding_extents = NULL; + int unlock_bits = EXTENT_LOCKED; + int ret = 0; + + if (create) + unlock_bits |= EXTENT_DIRTY; + else + len = min_t(u64, len, root->sectorsize); + + lockstart = start; + lockend = start + len - 1; + + if (current->journal_info) { + /* + * Need to pull our outstanding extents and set journal_info to NULL so + * that anything that needs to check if there's a transction doesn't get + * confused. + */ + outstanding_extents = current->journal_info; + current->journal_info = NULL; + } + + /* + * If this errors out it's because we couldn't invalidate pagecache for + * this range and we need to fallback to buffered. + */ + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) + return -ENOTBLK; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + ret = -ENOTBLK; + goto unlock_err; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + goto unlock_err; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) { + len = min(len, em->len - (start - em->start)); + lockstart = start + len; + goto unlock; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + u64 block_start, orig_start, orig_block_len, ram_bytes; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1) { + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + em = create_pinned_em(inode, start, len, + orig_start, + block_start, len, + orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } + } + + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + if (ret) { + free_extent_map(em); + goto unlock_err; + } + goto unlock; + } + } + + /* + * this will cow the extent, reset the len in case we changed + * it above + */ + len = bh_result->b_size; + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } + len = min(len, em->len - (start - em->start)); +unlock: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create) { + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. + */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); + + /* + * If we have an outstanding_extents count still set then we're + * within our reservation, otherwise we need to adjust our inode + * counter appropriately. + */ + if (*outstanding_extents) { + (*outstanding_extents)--; + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } + + current->journal_info = outstanding_extents; + btrfs_free_reserved_data_space(inode, len); + } + + /* + * In the case of write we need to clear and unlock the entire range, + * in the case of read we need to unlock only the end area that we + * aren't using if there is any left over space. + */ + if (lockstart < lockend) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); + } else { + free_extent_state(cached_state); + } + + free_extent_map(em); + + return 0; + +unlock_err: + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, 1, 0, &cached_state, GFP_NOFS); + if (outstanding_extents) + current->journal_info = outstanding_extents; + return ret; +} + +static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int rw, int mirror_num) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + BUG_ON(rw & REQ_WRITE); + + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DIO_REPAIR); + if (ret) + goto err; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); +err: + bio_put(bio); + return ret; +} + +static int btrfs_check_dio_repairable(struct inode *inode, + struct bio *failed_bio, + struct io_failure_record *failrec, + int failed_mirror) +{ + int num_copies; + + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + + if (failrec->this_mirror > num_copies) { + pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + return 1; +} + +static int dio_read_error(struct inode *inode, struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int failed_mirror, bio_end_io_t *repair_endio, + void *repair_arg) +{ + struct io_failure_record *failrec; + struct bio *bio; + int isector; + int read_mode; + int ret; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + if (ret) + return ret; + + ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, + failed_mirror); + if (!ret) { + free_io_failure(inode, failrec); + return -EIO; + } + + if (failed_bio->bi_vcnt > 1) + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + else + read_mode = READ_SYNC; + + isector = start - btrfs_io_bio(failed_bio)->logical; + isector >>= inode->i_sb->s_blocksize_bits; + bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, + 0, isector, repair_endio, repair_arg); + if (!bio) { + free_io_failure(inode, failrec); + return -EIO; + } + + btrfs_debug(BTRFS_I(inode)->root->fs_info, + "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", + read_mode, failrec->this_mirror, failrec->in_validation); + + ret = submit_dio_repair_bio(inode, bio, read_mode, + failrec->this_mirror); + if (ret) { + free_io_failure(inode, failrec); + bio_put(bio); + } + + return ret; +} + +struct btrfs_retry_complete { + struct completion done; + struct inode *inode; + u64 start; + int uptodate; +}; + +static void btrfs_retry_endio_nocsum(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct bio_vec *bvec; + int i; + + if (err) + goto end; + + done->uptodate = 1; + bio_for_each_segment_all(bvec, bio, i) + clean_io_failure(done->inode, done->start, bvec->bv_page, 0); +end: + complete(&done->done); + bio_put(bio); +} + +static int __btrfs_correct_data_nocsum(struct inode *inode, + struct btrfs_io_bio *io_bio) +{ + struct bio_vec *bvec; + struct btrfs_retry_complete done; + u64 start; + int i; + int ret; + + start = io_bio->logical; + done.inode = inode; + + bio_for_each_segment_all(bvec, &io_bio->bio, i) { +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio_nocsum, &done); + if (ret) + return ret; + + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } + + start += bvec->bv_len; + } + + return 0; +} + +static void btrfs_retry_endio(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct bio_vec *bvec; + int uptodate; + int ret; + int i; + + if (err) + goto end; + + uptodate = 1; + bio_for_each_segment_all(bvec, bio, i) { + ret = __readpage_endio_check(done->inode, io_bio, i, + bvec->bv_page, 0, + done->start, bvec->bv_len); + if (!ret) + clean_io_failure(done->inode, done->start, + bvec->bv_page, 0); + else + uptodate = 0; + } + + done->uptodate = uptodate; +end: + complete(&done->done); + bio_put(bio); +} + +static int __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + struct bio_vec *bvec; + struct btrfs_retry_complete done; + u64 start; + u64 offset = 0; + int i; + int ret; + + err = 0; + start = io_bio->logical; + done.inode = inode; + + bio_for_each_segment_all(bvec, &io_bio->bio, i) { + ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, + 0, start, bvec->bv_len); + if (likely(!ret)) + goto next; +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio, &done); + if (ret) { + err = ret; + goto next; + } + + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } +next: + offset += bvec->bv_len; + start += bvec->bv_len; + } + + return err; +} + +static int btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + if (skip_csum) { + if (unlikely(err)) + return __btrfs_correct_data_nocsum(inode, io_bio); + else + return 0; + } else { + return __btrfs_subio_endio_read(inode, io_bio, err); + } +} + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct bio *dio_bio; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + err = btrfs_subio_endio_read(inode, io_bio, err); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1); + dio_bio = dip->dio_bio; + + kfree(dip); + + /* If we had a csum failure make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + + if (io_bio->end_io) + io_bio->end_io(io_bio, err); + bio_put(bio); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered = NULL; + u64 ordered_offset = dip->logical_offset; + u64 ordered_bytes = dip->bytes; + struct bio *dio_bio; + int ret; + + if (err) + goto out_done; +again: + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes, !err); + if (!ret) + goto out_test; + + btrfs_init_work(&ordered->work, btrfs_endio_write_helper, + finish_ordered_fn, NULL, NULL); + btrfs_queue_work(root->fs_info->endio_write_workers, + &ordered->work); +out_test: + /* + * our bio might span multiple ordered extents. If we haven't + * completed the accounting for the whole dio, go back and try again + */ + if (ordered_offset < dip->logical_offset + dip->bytes) { + ordered_bytes = dip->logical_offset + dip->bytes - + ordered_offset; + ordered = NULL; + goto again; + } +out_done: + dio_bio = dip->dio_bio; + + kfree(dip); + + /* If we had an error make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); +} + +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); /* -ENOMEM */ + return 0; +} + +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + + if (err) + btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio->bi_rw, + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); + + if (dip->subio_endio) + err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); + + if (err) { + dip->errors = 1; + + /* + * before atomic variable goto zero, we must make sure + * dip->errors is perceived to be set. + */ + smp_mb__before_atomic(); + } + + /* if there are more bios still pending for this dio, just exit */ + if (!atomic_dec_and_test(&dip->pending_bios)) + goto out; + + if (dip->errors) { + bio_io_error(dip->orig_bio); + } else { + set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); + bio_endio(dip->orig_bio, 0); + } +out: + bio_put(bio); +} + +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, + u64 first_sector, gfp_t gfp_flags) +{ + int nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); +} + +static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, + struct inode *inode, + struct btrfs_dio_private *dip, + struct bio *bio, + u64 file_offset) +{ + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); + int ret; + + /* + * We load all the csum data we need when we submit + * the first bio to reduce the csum tree search and + * contention. + */ + if (dip->logical_offset == file_offset) { + ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, + file_offset); + if (ret) + return ret; + } + + if (bio == dip->orig_bio) + return 0; + + file_offset -= dip->logical_offset; + file_offset >>= inode->i_sb->s_blocksize_bits; + io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); + + return 0; +} + +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + int rw, u64 file_offset, int skip_sum, + int async_submit) +{ + struct btrfs_dio_private *dip = bio->bi_private; + int write = rw & REQ_WRITE; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (async_submit) + async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); + + bio_get(bio); + + if (!write) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); + if (ret) + goto err; + } + + if (skip_sum) + goto map; + + if (write && async_submit) { + ret = btrfs_wq_submit_bio(root->fs_info, + inode, rw, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + goto err; + } else if (write) { + /* + * If we aren't doing async submit, calculate the csum of the + * bio now. + */ + ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); + if (ret) + goto err; + } else { + ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio, + file_offset); + if (ret) + goto err; + } +map: + ret = btrfs_map_bio(root, rw, bio, 0, async_submit); +err: + bio_put(bio); + return ret; +} + +static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, + int skip_sum) +{ + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct bio *bio; + struct bio *orig_bio = dip->orig_bio; + struct bio_vec *bvec = orig_bio->bi_io_vec; + u64 start_sector = orig_bio->bi_iter.bi_sector; + u64 file_offset = dip->logical_offset; + u64 submit_len = 0; + u64 map_length; + int nr_pages = 0; + int ret; + int async_submit = 0; + + map_length = orig_bio->bi_iter.bi_size; + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, + &map_length, NULL, 0); + if (ret) + return -EIO; + + if (map_length >= orig_bio->bi_iter.bi_size) { + bio = orig_bio; + dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; + goto submit; + } + + /* async crcs make it difficult to collect full stripe writes. */ + if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) + async_submit = 0; + else + async_submit = 1; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; + atomic_inc(&dip->pending_bios); + + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + if (map_length < submit_len + bvec->bv_len || + bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len) { + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the dip might get freed + * before we're done setting it up + */ + atomic_inc(&dip->pending_bios); + ret = __btrfs_submit_dio_bio(bio, inode, rw, + file_offset, skip_sum, + async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } + + start_sector += submit_len >> 9; + file_offset += submit_len; + + submit_len = 0; + nr_pages = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, + start_sector, GFP_NOFS); + if (!bio) + goto out_err; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; + + map_length = orig_bio->bi_iter.bi_size; + ret = btrfs_map_block(root->fs_info, rw, + start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + goto out_err; + } + } else { + submit_len += bvec->bv_len; + nr_pages++; + bvec++; + } + } + +submit: + ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + async_submit); + if (!ret) + return 0; + + bio_put(bio); +out_err: + dip->errors = 1; + /* + * before atomic variable goto zero, we must + * make sure dip->errors is perceived to be set. + */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&dip->pending_bios)) + bio_io_error(dip->orig_bio); + + /* bio_end_io() will handle error, so we needn't return it */ + return 0; +} + +static void btrfs_submit_direct(int rw, struct bio *dio_bio, + struct inode *inode, loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio *io_bio; + struct btrfs_io_bio *btrfs_bio; + int skip_sum; + int write = rw & REQ_WRITE; + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); + if (!io_bio) { + ret = -ENOMEM; + goto free_ordered; + } + + dip = kzalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_io_bio; + } + + dip->private = dio_bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; + io_bio->bi_private = dip; + dip->orig_bio = io_bio; + dip->dio_bio = dio_bio; + atomic_set(&dip->pending_bios, 0); + btrfs_bio = btrfs_io_bio(io_bio); + btrfs_bio->logical = file_offset; + + if (write) { + io_bio->bi_end_io = btrfs_endio_direct_write; + } else { + io_bio->bi_end_io = btrfs_endio_direct_read; + dip->subio_endio = btrfs_subio_endio_read; + } + + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + if (!ret) + return; + + if (btrfs_bio->end_io) + btrfs_bio->end_io(btrfs_bio, ret); +free_io_bio: + bio_put(io_bio); + +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len, 1); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(dio_bio, ret); +} + +static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, + const struct iov_iter *iter, loff_t offset) +{ + int seg; + int i; + unsigned blocksize_mask = root->sectorsize - 1; + ssize_t retval = -EINVAL; + + if (offset & blocksize_mask) + goto out; + + if (iov_iter_alignment(iter) & blocksize_mask) + goto out; + + /* If this is a write we don't need to check anymore */ + if (iov_iter_rw(iter) == WRITE) + return 0; + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. + */ + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + goto out; + } + } + retval = 0; +out: + return retval; +} + +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + u64 outstanding_extents = 0; + size_t count = 0; + int flags = 0; + bool wakeup = true; + bool relock = false; + ssize_t ret; + + if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset)) + return 0; + + inode_dio_begin(inode); + smp_mb__after_atomic(); + + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. + */ + count = iov_iter_count(iter); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, + offset + count - 1); + + if (iov_iter_rw(iter) == WRITE) { + /* + * If the write DIO is beyond the EOF, we need update + * the isize, but it is protected by i_mutex. So we can + * not unlock the i_mutex at this case. + */ + if (offset + count <= inode->i_size) { + mutex_unlock(&inode->i_mutex); + relock = true; + } + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + outstanding_extents = div64_u64(count + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. Abuse current->journal_info for this. + */ + current->journal_info = &outstanding_extents; + } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags)) { + inode_dio_end(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; + } + + ret = __blockdev_direct_IO(iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iter, offset, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (iov_iter_rw(iter) == WRITE) { + current->journal_info = NULL; + if (ret < 0 && ret != -EIOCBQUEUED) + btrfs_delalloc_release_space(inode, count); + else if (ret >= 0 && (size_t)ret < count) + btrfs_delalloc_release_space(inode, + count - (size_t)ret); + } +out: + if (wakeup) + inode_dio_end(inode); + if (relock) + mutex_lock(&inode->i_mutex); + + return ret; +} + +#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + int ret; + + ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + if (ret) + return ret; + + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); +} + +int btrfs_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btrfs_get_extent, 0); +} + +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); +} + +static int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_writepages(tree, mapping, btrfs_get_extent, wbc); +} + +static int +btrfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_readpages(tree, mapping, pages, nr_pages, + btrfs_get_extent); +} +static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page, gfp_flags); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + return ret; +} + +static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); +} + +static void btrfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode = page->mapping->host; + struct extent_io_tree *tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + int inode_evicting = inode->i_state & I_FREEING; + + /* + * we have the page locked, so new writeback can't start, + * and the dirty bit won't be cleared while we are here. + * + * Wait for IO on this page so that we can safely clear + * the PagePrivate2 bit and do ordered accounting + */ + wait_on_page_writeback(page); + + tree = &BTRFS_I(inode)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } + + if (!inode_evicting) + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + /* + * IO on this page will never be started, so we need + * to account for any ordered extents now + */ + if (!inode_evicting) + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, + GFP_NOFS); + /* + * whoever cleared the private bit is responsible + * for the finish_ordered_io + */ + if (TestClearPagePrivate2(page)) { + struct btrfs_ordered_inode_tree *tree; + u64 new_len; + + tree = &BTRFS_I(inode)->ordered_tree; + + spin_lock_irq(&tree->lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + new_len = page_start - ordered->file_offset; + if (new_len < ordered->truncated_len) + ordered->truncated_len = new_len; + spin_unlock_irq(&tree->lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + page_start, + PAGE_CACHE_SIZE, 1)) + btrfs_finish_ordered_io(ordered); + } + btrfs_put_ordered_extent(ordered); + if (!inode_evicting) { + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, + &cached_state); + } + } + + if (!inode_evicting) { + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + + __btrfs_releasepage(page, GFP_NOFS); + } + + ClearPageChecked(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + char *kaddr; + unsigned long zero_start; + loff_t size; + int ret; + int reserved = 0; + u64 page_start; + u64 page_end; + + sb_start_pagefault(inode->i_sb); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (!ret) { + ret = file_update_time(vma->vm_file); + reserved = 1; + } + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + if (reserved) + goto out; + goto out_noreserve; + } + + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ +again: + lock_page(page); + size = i_size_read(inode); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + set_page_extent_mapped(page); + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + /* + * XXX - page_mkwrite gets called every time the page is dirtied, even + * if it was already dirty, so for space accounting reasons we need to + * clear any delalloc bits for the range we are fixing to save. There + * is probably a better way to do this, but for now keep consistent with + * prepare_pages in the normal write path. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state, GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + &cached_state); + if (ret) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } + ret = 0; + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_CACHE_SIZE > size) + zero_start = size & ~PAGE_CACHE_MASK; + else + zero_start = PAGE_CACHE_SIZE; + + if (zero_start != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + SetPageUptodate(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + + unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); + +out_unlock: + if (!ret) { + sb_end_pagefault(inode->i_sb); + return VM_FAULT_LOCKED; + } + unlock_page(page); +out: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out_noreserve: + sb_end_pagefault(inode->i_sb); + return ret; +} + +static int btrfs_truncate(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv; + int ret = 0; + int err = 0; + struct btrfs_trans_handle *trans; + u64 mask = root->sectorsize - 1; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + + ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + (u64)-1); + if (ret) + return ret; + + /* + * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * 3 things going on here + * + * 1) We need to reserve space for our orphan item and the space to + * delete our orphan item. Lord knows we don't want to have a dangling + * orphan item because we didn't reserve space to remove it. + * + * 2) We need to reserve space to update our inode. + * + * 3) We need to have something to cache all the space that is going to + * be free'd up by the truncate operation, but also have some slack + * space reserved in case it uses space during the truncate (thank you + * very much snapshotting). + * + * And we need these to all be seperate. The fact is we can use alot of + * space doing the truncate, and we have no earthly idea how much space + * we will use, so we need the truncate reservation to be seperate so it + * doesn't end up using space reserved for updating the inode or + * removing the orphan item. We also need to be able to stop the + * transaction and start a new one, which means we need to be able to + * update the inode several times, and we have no idea of knowing how + * many times that will be, so we can't just reserve 1 item for the + * entirety of the opration, so that has to be done seperately as well. + * Then there is the orphan item, which does indeed need to be held on + * to for the whole operation, and we need nobody to touch this reserved + * space except the orphan code. + * + * So that leaves us with + * + * 1) root->orphan_block_rsv - for the orphan deletion. + * 2) rsv - for the truncate reservation, which we will steal from the + * transaction reservation. + * 3) fs_info->trans_block_rsv - this will have 1 items worth left for + * updating the inode. + */ + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) + return -ENOMEM; + rsv->size = min_size; + rsv->failfast = 1; + + /* + * 1 for the truncate slack space + * 1 for updating the inode. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + + /* Migrate the slack space for the truncate to our reserve */ + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); + BUG_ON(ret); + + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + */ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + trans->block_rsv = rsv; + + while (1) { + ret = btrfs_truncate_inode_items(trans, root, inode, + inode->i_size, + BTRFS_EXTENT_DATA_KEY); + if (ret != -ENOSPC && ret != -EAGAIN) { + err = ret; + break; + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + err = ret; + break; + } + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = err = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; + } + + if (ret == 0 && inode->i_nlink > 0) { + trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_orphan_del(trans, inode); + if (ret) + err = ret; + } + + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; + + ret = btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); + } + +out: + btrfs_free_block_rsv(root, rsv); + + if (ret && !err) + err = ret; + + return err; +} + +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid) +{ + struct inode *inode; + int err; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, + new_dirid, new_dirid, + S_IFDIR | (~current_umask() & S_IRWXUGO), + &index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + set_nlink(inode, 1); + btrfs_i_size_write(inode, 0); + unlock_new_inode(inode); + + err = btrfs_subvol_inherit_props(trans, new_root, parent_root); + if (err) + btrfs_err(new_root->fs_info, + "error inheriting subvolume %llu properties: %d", + new_root->root_key.objectid, err); + + err = btrfs_update_inode(trans, new_root, inode); + + iput(inode); + return err; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_inode *ei; + struct inode *inode; + + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + ei->root = NULL; + ei->generation = 0; + ei->last_trans = 0; + ei->last_sub_trans = 0; + ei->logged_trans = 0; + ei->delalloc_bytes = 0; + ei->defrag_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->csum_bytes = 0; + ei->index_cnt = (u64)-1; + ei->dir_index = 0; + ei->last_unlink_trans = 0; + ei->last_log_commit = 0; + + spin_lock_init(&ei->lock); + ei->outstanding_extents = 0; + ei->reserved_extents = 0; + + ei->runtime_flags = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; + + ei->delayed_node = NULL; + + ei->i_otime.tv_sec = 0; + ei->i_otime.tv_nsec = 0; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree); + extent_io_tree_init(&ei->io_tree, &inode->i_data); + extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + ei->io_tree.track_uptodate = 1; + ei->io_failure_tree.track_uptodate = 1; + atomic_set(&ei->sync_writers, 0); + mutex_init(&ei->log_mutex); + mutex_init(&ei->delalloc_mutex); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->delalloc_inodes); + RB_CLEAR_NODE(&ei->rb_node); + + return inode; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode) +{ + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} +#endif + +static void btrfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +void btrfs_destroy_inode(struct inode *inode) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + + WARN_ON(!hlist_empty(&inode->i_dentry)); + WARN_ON(inode->i_data.nrpages); + WARN_ON(BTRFS_I(inode)->outstanding_extents); + WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->csum_bytes); + WARN_ON(BTRFS_I(inode)->defrag_bytes); + + /* + * This can happen where we create an inode, but somebody else also + * created the same inode and we need to destroy the one we already + * created. + */ + if (!root) + goto free; + + if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { + btrfs_info(root->fs_info, "inode %llu still on the orphan list", + btrfs_ino(inode)); + atomic_dec(&root->orphan_inodes); + } + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", + ordered->file_offset, ordered->len); + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); +free: + call_rcu(&inode->i_rcu, btrfs_i_callback); +} + +int btrfs_drop_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (root == NULL) + return 1; + + /* the snap/subvol tree is on deleting */ + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + else + return generic_drop_inode(inode); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = (struct btrfs_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void btrfs_destroy_cachep(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + if (btrfs_inode_cachep) + kmem_cache_destroy(btrfs_inode_cachep); + if (btrfs_trans_handle_cachep) + kmem_cache_destroy(btrfs_trans_handle_cachep); + if (btrfs_transaction_cachep) + kmem_cache_destroy(btrfs_transaction_cachep); + if (btrfs_path_cachep) + kmem_cache_destroy(btrfs_path_cachep); + if (btrfs_free_space_cachep) + kmem_cache_destroy(btrfs_free_space_cachep); + if (btrfs_delalloc_work_cachep) + kmem_cache_destroy(btrfs_delalloc_work_cachep); +} + +int btrfs_init_cachep(void) +{ + btrfs_inode_cachep = kmem_cache_create("btrfs_inode", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + if (!btrfs_inode_cachep) + goto fail; + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + goto fail; + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_transaction_cachep) + goto fail; + + btrfs_path_cachep = kmem_cache_create("btrfs_path", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + goto fail; + + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", + sizeof(struct btrfs_free_space), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + goto fail; + + btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", + sizeof(struct btrfs_delalloc_work), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_delalloc_work_cachep) + goto fail; + + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + u64 delalloc_bytes; + struct inode *inode = d_inode(dentry); + u32 blocksize = inode->i_sb->s_blocksize; + + generic_fillattr(inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_dev; + stat->blksize = PAGE_CACHE_SIZE; + + spin_lock(&BTRFS_I(inode)->lock); + delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + spin_unlock(&BTRFS_I(inode)->lock); + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(delalloc_bytes, blocksize)) >> 9; + return 0; +} + +static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); + struct timespec ctime = CURRENT_TIME; + u64 index = 0; + u64 root_objectid; + int ret; + u64 old_ino = btrfs_ino(old_inode); + + if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return -EPERM; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || + (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) + return -ENOTEMPTY; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; + + + /* check for collisions, even if the name isn't there */ + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, + new_dentry->d_name.name, + new_dentry->d_name.len); + + if (ret) { + if (ret == -EEXIST) { + /* we shouldn't get + * eexist without a new_inode */ + if (WARN_ON(!new_inode)) { + return ret; + } + } else { + /* maybe -EOVERFLOW */ + return ret; + } + } + ret = 0; + + /* + * we're using rename to replace one file with another. Start IO on it + * now so we don't add too much work to the end of the transaction + */ + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) + filemap_flush(old_inode->i_mapping); + + /* close the racy window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 11); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(root->fs_info, trans); + } else { + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), index); + if (ret) + goto out_fail; + /* + * this is an ugly little race, but the rename is required + * to make sure that if we crash, the inode is either at the + * old name or the new one. pinning the log transaction lets + * us make sure we don't allow a log commit to come in after + * we unlink the name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + } + + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { + ret = __btrfs_unlink_inode(trans, root, old_dir, + d_inode(old_dentry), + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + if (new_inode) { + inode_inc_iversion(new_inode); + new_inode->i_ctime = CURRENT_TIME; + if (unlikely(btrfs_ino(new_inode) == + BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + root_objectid = BTRFS_I(new_inode)->location.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + BUG_ON(new_inode->i_nlink == 0); + } else { + ret = btrfs_unlink_inode(trans, dest, new_dir, + d_inode(new_dentry), + new_dentry->d_name.name, + new_dentry->d_name.len); + } + if (!ret && new_inode->i_nlink == 0) + ret = btrfs_orphan_add(trans, d_inode(new_dentry)); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + } + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, index); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = index; + + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + struct dentry *parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_end_log_trans(root); + } +out_fail: + btrfs_end_transaction(trans, root); +out_notrans: + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + +static void btrfs_run_delalloc_work(struct btrfs_work *work) +{ + struct btrfs_delalloc_work *delalloc_work; + struct inode *inode; + + delalloc_work = container_of(work, struct btrfs_delalloc_work, + work); + inode = delalloc_work->inode; + if (delalloc_work->wait) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + + if (delalloc_work->delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + complete(&delalloc_work->completion); +} + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput) +{ + struct btrfs_delalloc_work *work; + + work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + if (!work) + return NULL; + + init_completion(&work->completion); + INIT_LIST_HEAD(&work->list); + work->inode = inode; + work->wait = wait; + work->delay_iput = delay_iput; + WARN_ON_ONCE(!inode); + btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, + btrfs_run_delalloc_work, NULL, NULL); + + return work; +} + +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) +{ + wait_for_completion(&work->completion); + kmem_cache_free(btrfs_delalloc_work_cachep, work); +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. + */ +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, + int nr) +{ + struct btrfs_inode *binode; + struct inode *inode; + struct btrfs_delalloc_work *work, *next; + struct list_head works; + struct list_head splice; + int ret = 0; + + INIT_LIST_HEAD(&works); + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->delalloc_mutex); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_move_tail(&binode->delalloc_inodes, + &root->delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (!inode) { + cond_resched_lock(&root->delalloc_lock); + continue; + } + spin_unlock(&root->delalloc_lock); + + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (!work) { + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + ret = -ENOMEM; + goto out; + } + list_add_tail(&work->list, &works); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); + ret++; + if (nr != -1 && ret >= nr) + goto out; + cond_resched(); + spin_lock(&root->delalloc_lock); + } + spin_unlock(&root->delalloc_lock); + +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->delalloc_lock); + list_splice_tail(&splice, &root->delalloc_inodes); + spin_unlock(&root->delalloc_lock); + } + mutex_unlock(&root->delalloc_mutex); + return ret; +} + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + int ret; + + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + return -EROFS; + + ret = __start_delalloc_inodes(root, delay_iput, -1); + if (ret > 0) + ret = 0; + /* + * the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + return ret; +} + +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr) +{ + struct btrfs_root *root; + struct list_head splice; + int ret; + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + return -EROFS; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&fs_info->delalloc_root_mutex); + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice) && nr) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->delalloc_root, + &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + + ret = __start_delalloc_inodes(root, delay_iput, nr); + btrfs_put_fs_root(root); + if (ret < 0) + goto out; + + if (nr != -1) { + nr -= ret; + WARN_ON(nr < 0); + } + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); + + ret = 0; + atomic_inc(&fs_info->async_submit_draining); + while (atomic_read(&fs_info->nr_async_submits) || + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0 && + atomic_read(&fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&fs_info->async_submit_draining); +out: + if (!list_empty_careful(&splice)) { + spin_lock(&fs_info->delalloc_root_lock); + list_splice_tail(&splice, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + } + mutex_unlock(&fs_info->delalloc_root_mutex); + return ret; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + + name_len = strlen(symname); + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + + /* + * 2 items for inode item and ref + * 2 items for dir items + * 1 item for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + S_IFLNK|S_IRWXUGO, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) + goto out_unlock_inode; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out_unlock_inode; + } + key.objectid = btrfs_ino(inode); + key.offset = 0; + key.type = BTRFS_EXTENT_DATA_KEY; + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + btrfs_free_path(path); + goto out_unlock_inode; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, + BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode_set_bytes(inode, name_len); + btrfs_i_size_write(inode, name_len); + err = btrfs_update_inode(trans, root, inode); + if (err) { + drop_inode = 1; + goto out_unlock_inode; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + +out_unlock: + btrfs_end_transaction(trans, root); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root); + return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; +} + +static int __btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint, + struct btrfs_trans_handle *trans) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 cur_offset = start; + u64 i_size; + u64 cur_bytes; + int ret = 0; + bool own_trans = true; + + if (trans) + own_trans = false; + while (num_bytes > 0) { + if (own_trans) { + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + } + + cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = max(cur_bytes, min_size); + ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, + *alloc_hint, &ins, 1, 0); + if (ret) { + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } + + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, + ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset, 0); + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } + + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); + + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + goto next; + } + + em->start = cur_offset; + em->orig_start = cur_offset; + em->len = ins.offset; + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->orig_block_len = ins.offset; + em->ram_bytes = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->generation = trans->transid; + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset - 1, + 0); + } + free_extent_map(em); +next: + num_bytes -= ins.offset; + cur_offset += ins.offset; + *alloc_hint = ins.objectid + ins.offset; + + inode_inc_iversion(inode); + inode->i_ctime = CURRENT_TIME; + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { + if (cur_offset > actual_len) + i_size = actual_len; + else + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); + } + + ret = btrfs_update_inode(trans, root, inode); + + if (ret) { + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } + + if (own_trans) + btrfs_end_transaction(trans, root); + } + return ret; +} + +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, + NULL); +} + +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, trans); +} + +static int btrfs_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + umode_t mode = inode->i_mode; + + if (mask & MAY_WRITE && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + if (btrfs_root_readonly(root)) + return -EROFS; + if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) + return -EACCES; + } + return generic_permission(inode, mask); +} + +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + u64 objectid; + u64 index; + int ret = 0; + + /* + * 5 units required for adding orphan entry + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + goto out; + + inode = btrfs_new_inode(trans, root, dir, NULL, 0, + btrfs_ino(dir), objectid, mode, &index); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + goto out; + } + + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + + inode->i_mapping->a_ops = &btrfs_aops; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out_inode; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out_inode; + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out_inode; + + /* + * We set number of links to 0 in btrfs_new_inode(), and here we set + * it to 1 because d_tmpfile() will issue a warning if the count is 0, + * through: + * + * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() + */ + set_nlink(inode, 1); + unlock_new_inode(inode); + d_tmpfile(dentry, inode); + mark_inode_dirty(inode); + +out: + btrfs_end_transaction(trans, root); + if (ret) + iput(inode); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); + return ret; + +out_inode: + unlock_new_inode(inode); + goto out; + +} + +/* Inspired by filemap_check_errors() */ +int btrfs_inode_check_errors(struct inode *inode) +{ + int ret = 0; + + if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) + ret = -ENOSPC; + if (test_bit(AS_EIO, &inode->i_mapping->flags) && + test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) + ret = -EIO; + + return ret; +} + +static const struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename2 = btrfs_rename2, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, + .tmpfile = btrfs_tmpfile, +}; +static const struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, + .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, +}; + +static const struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = btrfs_real_readdir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +static struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .writepage_end_io_hook = btrfs_writepage_end_io_hook, + .writepage_start_hook = btrfs_writepage_start_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, + .merge_extent_hook = btrfs_merge_extent_hook, + .split_extent_hook = btrfs_split_extent_hook, +}; + +/* + * btrfs doesn't support the bmap operation because swapfiles + * use bmap to make a mapping of extents in the file. They assume + * these extents won't change over the life of the file and they + * use the bmap result to do IO directly to the drive. + * + * the btrfs bmap call would return logical addresses that aren't + * suitable for IO and they also will change frequently as COW + * operations happen. So, swapfile + btrfs == corruption. + * + * For now we're avoiding this by dropping bmap. + */ +static const struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .writepages = btrfs_writepages, + .readpages = btrfs_readpages, + .direct_IO = btrfs_direct_IO, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = btrfs_set_page_dirty, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, +}; + +static const struct inode_operations btrfs_file_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .fiemap = btrfs_fiemap, + .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, +}; +static const struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, +}; +static const struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .update_time = btrfs_update_time, +}; + +const struct dentry_operations btrfs_dentry_operations = { + .d_delete = btrfs_dentry_delete, + .d_release = btrfs_dentry_release, +}; diff --git a/kernel/fs/btrfs/ioctl.c b/kernel/fs/btrfs/ioctl.c new file mode 100644 index 000000000..1c22c6518 --- /dev/null +++ b/kernel/fs/btrfs/ioctl.c @@ -0,0 +1,5359 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" +#include "inode-map.h" +#include "backref.h" +#include "rcu-string.h" +#include "send.h" +#include "dev-replace.h" +#include "props.h" +#include "sysfs.h" +#include "qgroup.h" + +#ifdef CONFIG_64BIT +/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI + * structures are incorrect, as the timespec structure from userspace + * is 4 bytes too small. We define these alternatives here to teach + * the kernel about the 32-bit struct packing. + */ +struct btrfs_ioctl_timespec_32 { + __u64 sec; + __u32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_ioctl_received_subvol_args_32 { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec_32 stime; /* in */ + struct btrfs_ioctl_timespec_32 rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args_32) +#endif + + +static int btrfs_clone(struct inode *src, struct inode *inode, + u64 off, u64 olen, u64 olen_aligned, u64 destoff); + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. + */ +static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + if (flags & BTRFS_INODE_NODATACOW) + iflags |= FS_NOCOW_FL; + + if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) + iflags |= FS_COMPR_FL; + else if (flags & BTRFS_INODE_NOCOMPRESS) + iflags |= FS_NOCOMP_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. + */ +void btrfs_update_iflags(struct inode *inode) +{ + struct btrfs_inode *ip = BTRFS_I(inode); + unsigned int new_fl = 0; + + if (ip->flags & BTRFS_INODE_SYNC) + new_fl |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + new_fl |= S_IMMUTABLE; + if (ip->flags & BTRFS_INODE_APPEND) + new_fl |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + new_fl |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); +} + +/* + * Inherit flags from the parent inode. + * + * Currently only the compression flags and the cow flags are inherited. + */ +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; + + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + } + + btrfs_update_iflags(inode); +} + +static int btrfs_ioctl_getflags(struct file *file, void __user *arg) +{ + struct btrfs_inode *ip = BTRFS_I(file_inode(file)); + unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int check_flags(unsigned int flags) +{ + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL | \ + FS_NOCOMP_FL | FS_COMPR_FL | + FS_NOCOW_FL)) + return -EOPNOTSUPP; + + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + + return 0; +} + +static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_root *root = ip->root; + struct btrfs_trans_handle *trans; + unsigned int flags, oldflags; + int ret; + u64 ip_oldflags; + unsigned int i_oldflags; + umode_t mode; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (btrfs_root_readonly(root)) + return -EROFS; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + ret = check_flags(flags); + if (ret) + return ret; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + ip_oldflags = ip->flags; + i_oldflags = inode->i_flags; + mode = inode->i_mode; + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + } + + if (flags & FS_SYNC_FL) + ip->flags |= BTRFS_INODE_SYNC; + else + ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; + if (flags & FS_APPEND_FL) + ip->flags |= BTRFS_INODE_APPEND; + else + ip->flags &= ~BTRFS_INODE_APPEND; + if (flags & FS_NODUMP_FL) + ip->flags |= BTRFS_INODE_NODUMP; + else + ip->flags &= ~BTRFS_INODE_NODUMP; + if (flags & FS_NOATIME_FL) + ip->flags |= BTRFS_INODE_NOATIME; + else + ip->flags &= ~BTRFS_INODE_NOATIME; + if (flags & FS_DIRSYNC_FL) + ip->flags |= BTRFS_INODE_DIRSYNC; + else + ip->flags &= ~BTRFS_INODE_DIRSYNC; + if (flags & FS_NOCOW_FL) { + if (S_ISREG(mode)) { + /* + * It's safe to turn csums off here, no extents exist. + * Otherwise we want the flag to reflect the real COW + * status of the file and will not set it. + */ + if (inode->i_size == 0) + ip->flags |= BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM; + } else { + ip->flags |= BTRFS_INODE_NODATACOW; + } + } else { + /* + * Revert back under same assuptions as above + */ + if (S_ISREG(mode)) { + if (inode->i_size == 0) + ip->flags &= ~(BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM); + } else { + ip->flags &= ~BTRFS_INODE_NODATACOW; + } + } + + /* + * The COMPRESS flag can only be changed by users, while the NOCOMPRESS + * flag may be changed automatically if compression code won't make + * things smaller. + */ + if (flags & FS_NOCOMP_FL) { + ip->flags &= ~BTRFS_INODE_COMPRESS; + ip->flags |= BTRFS_INODE_NOCOMPRESS; + + ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; + } else if (flags & FS_COMPR_FL) { + const char *comp; + + ip->flags |= BTRFS_INODE_COMPRESS; + ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + + if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + comp = "lzo"; + else + comp = "zlib"; + ret = btrfs_set_prop(inode, "btrfs.compression", + comp, strlen(comp), 0); + if (ret) + goto out_drop; + + } else { + ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; + ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + } + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop; + } + + btrfs_update_iflags(inode); + inode_inc_iversion(inode); + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + + btrfs_end_transaction(trans, root); + out_drop: + if (ret) { + ip->flags = ip_oldflags; + inode->i_flags = i_oldflags; + } + + out_unlock: + mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); + return ret; +} + +static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +{ + struct inode *inode = file_inode(file); + + return put_user(inode->i_generation, arg); +} + +static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_device *device; + struct request_queue *q; + struct fstrim_range range; + u64 minlen = ULLONG_MAX; + u64 num_devices = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, + dev_list) { + if (!device->bdev) + continue; + q = bdev_get_queue(device->bdev); + if (blk_queue_discard(q)) { + num_devices++; + minlen = min((u64)q->limits.discard_granularity, + minlen); + } + } + rcu_read_unlock(); + + if (!num_devices) + return -EOPNOTSUPP; + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + if (range.start > total_bytes || + range.len < fs_info->sb->s_blocksize) + return -EINVAL; + + range.len = min(range.len, total_bytes - range.start); + range.minlen = max(range.minlen, minlen); + ret = btrfs_trim_fs(fs_info->tree_root, &range); + if (ret < 0) + return ret; + + if (copy_to_user(arg, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +int btrfs_is_empty_uuid(u8 *uuid) +{ + int i; + + for (i = 0; i < BTRFS_UUID_SIZE; i++) { + if (uuid[i]) + return 0; + } + return 1; +} + +static noinline int create_subvol(struct inode *dir, + struct dentry *dentry, + char *name, int namelen, + u64 *async_transid, + struct btrfs_qgroup_inherit *inherit) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *new_root; + struct btrfs_block_rsv block_rsv; + struct timespec cur_time = CURRENT_TIME; + struct inode *inode; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; + u64 qgroup_reserved; + uuid_le new_uuid; + + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); + if (ret) + return ret; + + /* + * Don't create subvolume whose level is not zero. Or qgroup will be + * screwed up since it assume subvolme qgroup's level to be 0. + */ + if (btrfs_qgroup_level(objectid)) + return -ENOSPC; + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * The same as the snapshot creation, please see the comment + * of create_snapshot(). + */ + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 8, &qgroup_reserved, false); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_subvolume_release_metadata(root, &block_rsv, + qgroup_reserved); + return ret; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + + ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); + if (ret) + goto fail; + + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + memset(&root_item, 0, sizeof(root_item)); + + inode_item = &root_item.inode; + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); + + btrfs_set_root_flags(&root_item, 0); + btrfs_set_root_limit(&root_item, 0); + btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, leaf->len); + btrfs_set_root_last_snapshot(&root_item, 0); + + btrfs_set_root_generation_v2(&root_item, + btrfs_root_generation(&root_item)); + uuid_le_gen(&new_uuid); + memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); + btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); + root_item.ctime = root_item.otime; + btrfs_set_root_ctransid(&root_item, trans->transid); + btrfs_set_root_otransid(&root_item, trans->transid); + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + key.offset = (u64)-1; + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + if (IS_ERR(new_root)) { + btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); + ret = PTR_ERR(new_root); + goto fail; + } + + btrfs_record_root_in_trans(trans, new_root); + + ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + if (ret) { + /* We potentially lose an unused inode item here */ + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * insert the directory item + */ + ret = btrfs_set_inode_index(dir, &index); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_insert_dir_item(trans, root, + name, namelen, dir, &key, + BTRFS_FT_DIR, index); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + btrfs_ino(dir), index, name, namelen); + BUG_ON(ret); + + ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, + root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + objectid); + if (ret) + btrfs_abort_transaction(trans, root, ret); + +fail: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + + if (async_transid) { + *async_transid = trans->transid; + err = btrfs_commit_transaction_async(trans, root, 1); + if (err) + err = btrfs_commit_transaction(trans, root); + } else { + err = btrfs_commit_transaction(trans, root); + } + if (err && !ret) + ret = err; + + if (!ret) { + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + } + return ret; +} + +static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) +{ + s64 writers; + DEFINE_WAIT(wait); + + do { + prepare_to_wait(&root->subv_writers->wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&root->subv_writers->counter); + if (writers) + schedule(); + + finish_wait(&root->subv_writers->wait, &wait); + } while (writers); +} + +static int create_snapshot(struct btrfs_root *root, struct inode *dir, + struct dentry *dentry, char *name, int namelen, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + struct inode *inode; + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret; + + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + return -EINVAL; + + atomic_inc(&root->will_be_snapshoted); + smp_mb__after_atomic(); + btrfs_wait_for_no_snapshoting_writes(root); + + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + goto out; + + btrfs_wait_ordered_extents(root, -1); + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + goto out; + } + + btrfs_init_block_rsv(&pending_snapshot->block_rsv, + BTRFS_BLOCK_RSV_TEMP); + /* + * 1 - parent dir inode + * 2 - dir entries + * 1 - root item + * 2 - root ref/backref + * 1 - root of snapshot + * 1 - UUID item + */ + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, 8, + &pending_snapshot->qgroup_reserved, + false); + if (ret) + goto free; + + pending_snapshot->dentry = dentry; + pending_snapshot->root = root; + pending_snapshot->readonly = readonly; + pending_snapshot->dir = dir; + pending_snapshot->inherit = inherit; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fail; + } + + spin_lock(&root->fs_info->trans_lock); + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + spin_unlock(&root->fs_info->trans_lock); + if (async_transid) { + *async_transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, + root->fs_info->extent_root, 1); + if (ret) + ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_commit_transaction(trans, + root->fs_info->extent_root); + } + if (ret) + goto fail; + + ret = pending_snapshot->error; + if (ret) + goto fail; + + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; + + inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto fail; + } + + d_instantiate(dentry, inode); + ret = 0; +fail: + btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, + pending_snapshot->qgroup_reserved); +free: + kfree(pending_snapshot); +out: + if (atomic_dec_and_test(&root->will_be_snapshoted)) + wake_up_atomic_t(&root->will_be_snapshoted); + return ret; +} + +/* copy of may_delete in fs/namei.c() + * Check whether we can remove a link victim from directory dir, check + * whether the type of victim is right. + * 1. We can't do it if dir is read-only (done in permission()) + * 2. We should have write and exec permissions on dir + * 3. We can't remove anything from append-only dir + * 4. We can't do anything with immutable dir (done in permission()) + * 5. If the sticky bit on dir is set we should either + * a. be owner of dir, or + * b. be owner of victim, or + * c. have CAP_FOWNER capability + * 6. If the victim is append-only or immutable we can't do antyhing with + * links pointing to it. + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 9. We can't remove a root or mountpoint. + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * nfs_async_unlink(). + */ + +static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) +{ + int error; + + if (d_really_is_negative(victim)) + return -ENOENT; + + BUG_ON(d_inode(victim->d_parent) != dir); + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); + + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + return error; + if (IS_APPEND(dir)) + return -EPERM; + if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || + IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) + return -EPERM; + if (isdir) { + if (!d_is_dir(victim)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (d_is_dir(victim)) + return -EISDIR; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (d_really_is_positive(child)) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. + */ +static noinline int btrfs_mksubvol(struct path *parent, + char *name, int namelen, + struct btrfs_root *snap_src, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + struct inode *dir = d_inode(parent->dentry); + struct dentry *dentry; + int error; + + error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (error == -EINTR) + return error; + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (d_really_is_positive(dentry)) + goto out_dput; + + error = btrfs_may_create(dir, dentry); + if (error) + goto out_dput; + + /* + * even if this name doesn't exist, we may get hash collisions. + * check for them now when we can safely fail + */ + error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, + dir->i_ino, name, + namelen); + if (error) + goto out_dput; + + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) + goto out_up_read; + + if (snap_src) { + error = create_snapshot(snap_src, dir, dentry, name, namelen, + async_transid, readonly, inherit); + } else { + error = create_subvol(dir, dentry, name, namelen, + async_transid, inherit); + } + if (!error) + fsnotify_mkdir(dir, dentry); +out_up_read: + up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&dir->i_mutex); + return error; +} + +/* + * When we're defragging a range, we don't want to kick it off again + * if it is really just waiting for delalloc to send it down. + * If we find a nice big extent or delalloc range for the bytes in the + * file you want to defrag, we return 0 to let you know to skip this + * part of the file + */ +static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 end; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + if (em) { + end = extent_map_end(em); + free_extent_map(em); + if (end - offset > thresh) + return 0; + } + /* if we already have a nice delalloc here, just stop */ + thresh /= 2; + end = count_range_bits(io_tree, &offset, offset + thresh, + thresh, EXTENT_DELALLOC, 1); + if (end >= thresh) + return 0; + return 1; +} + +/* + * helper function to walk through a file and find extents + * newer than a specific transid, and smaller than thresh. + * + * This is used by the defragging code to find new and small + * extents + */ +static int find_new_extents(struct btrfs_root *root, + struct inode *inode, u64 newer_than, + u64 *off, u32 thresh) +{ + struct btrfs_path *path; + struct btrfs_key min_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + int type; + int ret; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + min_key.objectid = ino; + min_key.type = BTRFS_EXTENT_DATA_KEY; + min_key.offset = *off; + + while (1) { + ret = btrfs_search_forward(root, &min_key, path, newer_than); + if (ret != 0) + goto none; +process_slot: + if (min_key.objectid != ino) + goto none; + if (min_key.type != BTRFS_EXTENT_DATA_KEY) + goto none; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_num_bytes(leaf, extent) < thresh && + check_defrag_in_cache(inode, min_key.offset, thresh)) { + *off = min_key.offset; + btrfs_free_path(path); + return 0; + } + + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); + goto process_slot; + } + + if (min_key.offset == (u64)-1) + goto none; + + min_key.offset++; + btrfs_release_path(path); + } +none: + btrfs_free_path(path); + return -ENOENT; +} + +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 len = PAGE_CACHE_SIZE; + + /* + * hopefully we have this extent in the tree already, try without + * the full extent lock + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + read_unlock(&em_tree->lock); + + if (!em) { + struct extent_state *cached = NULL; + u64 end = start + len - 1; + + /* get the big lock and read metadata off disk */ + lock_extent_bits(io_tree, start, end, 0, &cached); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); + + if (IS_ERR(em)) + return NULL; + } + + return em; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +{ + struct extent_map *next; + bool ret = true; + + /* this is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + next = defrag_lookup_extent(inode, em->start + em->len); + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + ret = false; + else if ((em->block_start + em->block_len == next->block_start) && + (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) + ret = false; + + free_extent_map(next); + return ret; +} + +static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, + u64 *last_len, u64 *skip, u64 *defrag_end, + int compress) +{ + struct extent_map *em; + int ret = 1; + bool next_mergeable = true; + + /* + * make sure that once we start defragging an extent, we keep on + * defragging it + */ + if (start < *defrag_end) + return 1; + + *skip = 0; + + em = defrag_lookup_extent(inode, start); + if (!em) + return 0; + + /* this will cover holes, and inline extents */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + ret = 0; + goto out; + } + + next_mergeable = defrag_check_next_extent(inode, em); + /* + * we hit a real extent, if it is big or the next extent is not a + * real extent, don't bother defragging it + */ + if (!compress && (*last_len == 0 || *last_len >= thresh) && + (em->len >= thresh || !next_mergeable)) + ret = 0; +out: + /* + * last_len ends up being a counter of how many bytes we've defragged. + * every time we choose not to defrag an extent, we reset *last_len + * so that the next tiny extent will force a defrag. + * + * The end result of this is that tiny extents before a single big + * extent will force at least part of that big extent to be defragged. + */ + if (ret) { + *defrag_end = extent_map_end(em); + } else { + *last_len = 0; + *skip = extent_map_end(em); + *defrag_end = 0; + } + + free_extent_map(em); + return ret; +} + +/* + * it doesn't do much good to defrag one or two pages + * at a time. This pulls in a nice chunk of pages + * to COW and defrag. + * + * It also makes sure the delalloc code has enough + * dirty data to avoid making new small extents as part + * of the defrag + * + * It's a good idea to start RA on this range + * before calling this. + */ +static int cluster_pages_for_defrag(struct inode *inode, + struct page **pages, + unsigned long start_index, + unsigned long num_pages) +{ + unsigned long file_end; + u64 isize = i_size_read(inode); + u64 page_start; + u64 page_end; + u64 page_cnt; + int ret; + int i; + int i_done; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_io_tree *tree; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + if (!isize || start_index > file_end) + return 0; + + page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); + + ret = btrfs_delalloc_reserve_space(inode, + page_cnt << PAGE_CACHE_SHIFT); + if (ret) + return ret; + i_done = 0; + tree = &BTRFS_I(inode)->io_tree; + + /* step one, lock all the pages */ + for (i = 0; i < page_cnt; i++) { + struct page *page; +again: + page = find_or_create_page(inode->i_mapping, + start_index + i, mask); + if (!page) + break; + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + while (1) { + lock_extent_bits(tree, page_start, page_end, + 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, + page_start); + unlock_extent_cached(tree, page_start, page_end, + &cached_state, GFP_NOFS); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * we unlocked the page above, so we need check if + * it was released or not. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + + pages[i] = page; + i_done++; + } + if (!i_done || ret) + goto out; + + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + goto out; + + /* + * so now we have a nice long stream of locked + * and up to date pages, lets wait on them + */ + for (i = 0; i < i_done; i++) + wait_on_page_writeback(pages[i]); + + page_start = page_offset(pages[0]); + page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, 0, &cached_state); + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, + page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, + &cached_state, GFP_NOFS); + + if (i_done != page_cnt) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_delalloc_release_space(inode, + (page_cnt - i_done) << PAGE_CACHE_SHIFT); + } + + + set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, + &cached_state, GFP_NOFS); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, &cached_state, + GFP_NOFS); + + for (i = 0; i < i_done; i++) { + clear_page_dirty_for_io(pages[i]); + ClearPageChecked(pages[i]); + set_page_extent_mapped(pages[i]); + set_page_dirty(pages[i]); + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + return i_done; +out: + for (i = 0; i < i_done; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); + return ret; + +} + +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file_ra_state *ra = NULL; + unsigned long last_index; + u64 isize = i_size_read(inode); + u64 last_len = 0; + u64 skip = 0; + u64 defrag_end = 0; + u64 newer_off = range->start; + unsigned long i; + unsigned long ra_index = 0; + int ret; + int defrag_count = 0; + int compress_type = BTRFS_COMPRESS_ZLIB; + u32 extent_thresh = range->extent_thresh; + unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long cluster = max_cluster; + u64 new_align = ~((u64)128 * 1024 - 1); + struct page **pages = NULL; + + if (isize == 0) + return 0; + + if (range->start >= isize) + return -EINVAL; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } + + if (extent_thresh == 0) + extent_thresh = 256 * 1024; + + /* + * if we were not given a file, allocate a readahead + * context + */ + if (!file) { + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + file_ra_state_init(ra, inode->i_mapping); + } else { + ra = &file->f_ra; + } + + pages = kmalloc_array(max_cluster, sizeof(struct page *), + GFP_NOFS); + if (!pages) { + ret = -ENOMEM; + goto out_ra; + } + + /* find the last page to defrag */ + if (range->start + range->len > range->start) { + last_index = min_t(u64, isize - 1, + range->start + range->len - 1) >> PAGE_CACHE_SHIFT; + } else { + last_index = (isize - 1) >> PAGE_CACHE_SHIFT; + } + + if (newer_than) { + ret = find_new_extents(root, inode, newer_than, + &newer_off, 64 * 1024); + if (!ret) { + range->start = newer_off; + /* + * we always align our defrag to help keep + * the extents in the file evenly spaced + */ + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + } else + goto out_ra; + } else { + i = range->start >> PAGE_CACHE_SHIFT; + } + if (!max_to_defrag) + max_to_defrag = last_index + 1; + + /* + * make writeback starts from i, so the defrag range can be + * written sequentially. + */ + if (i < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = i; + + while (i <= last_index && defrag_count < max_to_defrag && + (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) { + /* + * make sure we stop running if someone unmounts + * the FS + */ + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + break; + + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); + ret = -EAGAIN; + break; + } + + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + extent_thresh, &last_len, &skip, + &defrag_end, range->flags & + BTRFS_DEFRAG_RANGE_COMPRESS)) { + unsigned long next; + /* + * the should_defrag function tells us how much to skip + * bump our counter by the suggested amount + */ + next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE); + i = max(i + 1, next); + continue; + } + + if (!newer_than) { + cluster = (PAGE_CACHE_ALIGN(defrag_end) >> + PAGE_CACHE_SHIFT) - i; + cluster = min(cluster, max_cluster); + } else { + cluster = max_cluster; + } + + if (i + cluster > ra_index) { + ra_index = max(i, ra_index); + btrfs_force_ra(inode->i_mapping, ra, file, ra_index, + cluster); + ra_index += max_cluster; + } + + mutex_lock(&inode->i_mutex); + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) + BTRFS_I(inode)->force_compress = compress_type; + ret = cluster_pages_for_defrag(inode, pages, i, cluster); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + goto out_ra; + } + + defrag_count += ret; + balance_dirty_pages_ratelimited(inode->i_mapping); + mutex_unlock(&inode->i_mutex); + + if (newer_than) { + if (newer_off == (u64)-1) + break; + + if (ret > 0) + i += ret; + + newer_off = max(newer_off + 1, + (u64)i << PAGE_CACHE_SHIFT); + + ret = find_new_extents(root, inode, + newer_than, &newer_off, + 64 * 1024); + if (!ret) { + range->start = newer_off; + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + } else { + break; + } + } else { + if (ret > 0) { + i += ret; + last_len += ret << PAGE_CACHE_SHIFT; + } else { + i++; + last_len = 0; + } + } + } + + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + + if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + } + + if (range->compress_type == BTRFS_COMPRESS_LZO) { + btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO); + } + + ret = defrag_count; + +out_ra: + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + mutex_unlock(&inode->i_mutex); + } + if (!file) + kfree(ra); + kfree(pages); + return ret; +} + +static noinline int btrfs_ioctl_resize(struct file *file, + void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *retptr; + char *devstr = NULL; + int ret = 0; + int mod = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + mnt_drop_write_file(file); + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + } + + mutex_lock(&root->fs_info->volume_mutex); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + ret = kstrtoull(devstr, 10, &devid); + if (ret) + goto out_free; + if (!devid) { + ret = -EINVAL; + goto out_free; + } + btrfs_info(root->fs_info, "resizing devid %llu", devid); + } + + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); + if (!device) { + btrfs_info(root->fs_info, "resizer unable to find device %llu", + devid); + ret = -ENODEV; + goto out_free; + } + + if (!device->writeable) { + btrfs_info(root->fs_info, + "resizer unable to apply on readonly device %llu", + devid); + ret = -EPERM; + goto out_free; + } + + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = memparse(sizestr, &retptr); + if (*retptr != '\0' || new_size == 0) { + ret = -EINVAL; + goto out_free; + } + } + + if (device->is_tgtdev_for_dev_replace) { + ret = -EPERM; + goto out_free; + } + + old_size = btrfs_device_get_total_bytes(device); + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_free; + } + new_size = old_size - new_size; + } else if (mod > 0) { + if (new_size > ULLONG_MAX - old_size) { + ret = -ERANGE; + goto out_free; + } + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_free; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_free; + } + + new_size = div_u64(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", + rcu_str_deref(device->name), new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_free; + } + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else if (new_size < old_size) { + ret = btrfs_shrink_device(device, new_size); + } /* equal, nothing need to do */ + +out_free: + kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_transid(struct file *file, + char *name, unsigned long fd, int subvol, + u64 *transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + int namelen; + int ret = 0; + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + namelen = strlen(name); + if (strchr(name, '/')) { + ret = -EINVAL; + goto out_drop_write; + } + + if (name[0] == '.' && + (namelen == 1 || (name[1] == '.' && namelen == 2))) { + ret = -EEXIST; + goto out_drop_write; + } + + if (subvol) { + ret = btrfs_mksubvol(&file->f_path, name, namelen, + NULL, transid, readonly, inherit); + } else { + struct fd src = fdget(fd); + struct inode *src_inode; + if (!src.file) { + ret = -EINVAL; + goto out_drop_write; + } + + src_inode = file_inode(src.file); + if (src_inode->i_sb != file_inode(file)->i_sb) { + btrfs_info(BTRFS_I(src_inode)->root->fs_info, + "Snapshot src from another FS"); + ret = -EXDEV; + } else if (!inode_owner_or_capable(src_inode)) { + /* + * Subvolume creation is not restricted, but snapshots + * are limited to own subvolumes only + */ + ret = -EPERM; + } else { + ret = btrfs_mksubvol(&file->f_path, name, namelen, + BTRFS_I(src_inode)->root, + transid, readonly, inherit); + } + fdput(src); + } +out_drop_write: + mnt_drop_write_file(file); +out: + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false, NULL); + + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; + struct btrfs_qgroup_inherit *inherit = NULL; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | + BTRFS_SUBVOL_QGROUP_INHERIT)) { + ret = -EOPNOTSUPP; + goto free_args; + } + + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { + if (vol_args->size > PAGE_CACHE_SIZE) { + ret = -EINVAL; + goto free_args; + } + inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); + if (IS_ERR(inherit)) { + ret = PTR_ERR(inherit); + goto free_args; + } + } + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, ptr, + readonly, inherit); + if (ret) + goto free_inherit; + + if (ptr && copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), + ptr, sizeof(*ptr))) + ret = -EFAULT; + +free_inherit: + kfree(inherit); +free_args: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + ret = -EINVAL; + goto out_drop_write; + } + + if (copy_from_user(&flags, arg, sizeof(flags))) { + ret = -EFAULT; + goto out_drop_write; + } + + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { + ret = -EINVAL; + goto out_drop_write; + } + + if (flags & ~BTRFS_SUBVOL_RDONLY) { + ret = -EOPNOTSUPP; + goto out_drop_write; + } + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out_drop_sem; + + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) { + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + } else { + /* + * Block RO -> RW transition if this subvolume is involved in + * send + */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress == 0) { + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + spin_unlock(&root->root_item_lock); + } else { + spin_unlock(&root->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to set subvolume %llu read-write during send", + root->root_key.objectid); + ret = -EPERM; + goto out_drop_sem; + } + } + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out_drop_sem: + up_write(&root->fs_info->subvol_sem); +out_drop_write: + mnt_drop_write_file(file); +out: + return ret; +} + +/* + * helper to check if the subvolume references other subvolumes + */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 dir_id; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Make sure this root isn't set as the default subvol */ + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path, + dir_id, "default", 7, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.objectid == root->root_key.objectid) { + ret = -EPERM; + btrfs_err(root->fs_info, "deleting default subvolume " + "%llu is not allowed", key.objectid); + goto out; + } + btrfs_release_path(path); + } + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +static noinline int key_in_sk(struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk) +{ + struct btrfs_key test; + int ret; + + test.objectid = sk->min_objectid; + test.type = sk->min_type; + test.offset = sk->min_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret < 0) + return 0; + + test.objectid = sk->max_objectid; + test.type = sk->max_type; + test.offset = sk->max_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret > 0) + return 0; + return 1; +} + +static noinline int copy_to_sk(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk, + size_t *buf_size, + char __user *ubuf, + unsigned long *sk_offset, + int *num_found) +{ + u64 found_transid; + struct extent_buffer *leaf; + struct btrfs_ioctl_search_header sh; + unsigned long item_off; + unsigned long item_len; + int nritems; + int i; + int slot; + int ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + + if (btrfs_header_generation(leaf) > sk->max_transid) { + i = nritems; + goto advance_key; + } + found_transid = btrfs_header_generation(leaf); + + for (i = slot; i < nritems; i++) { + item_off = btrfs_item_ptr_offset(leaf, i); + item_len = btrfs_item_size_nr(leaf, i); + + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + if (sizeof(sh) + item_len > *buf_size) { + if (*num_found) { + ret = 1; + goto out; + } + + /* + * return one empty item back for v1, which does not + * handle -EOVERFLOW + */ + + *buf_size = sizeof(sh) + item_len; + item_len = 0; + ret = -EOVERFLOW; + } + + if (sizeof(sh) + item_len + *sk_offset > *buf_size) { + ret = 1; + goto out; + } + + sh.objectid = key->objectid; + sh.offset = key->offset; + sh.type = key->type; + sh.len = item_len; + sh.transid = found_transid; + + /* copy search result header */ + if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = -EFAULT; + goto out; + } + + *sk_offset += sizeof(sh); + + if (item_len) { + char __user *up = ubuf + *sk_offset; + /* copy the item */ + if (read_extent_buffer_to_user(leaf, up, + item_off, item_len)) { + ret = -EFAULT; + goto out; + } + + *sk_offset += item_len; + } + (*num_found)++; + + if (ret) /* -EOVERFLOW from above */ + goto out; + + if (*num_found >= sk->nr_items) { + ret = 1; + goto out; + } + } +advance_key: + ret = 0; + if (key->offset < (u64)-1 && key->offset < sk->max_offset) + key->offset++; + else if (key->type < (u8)-1 && key->type < sk->max_type) { + key->offset = 0; + key->type++; + } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) { + key->offset = 0; + key->type = 0; + key->objectid++; + } else + ret = 1; +out: + /* + * 0: all items from this leaf copied, continue with next + * 1: * more items can be copied, but unused buffer is too small + * * all items were found + * Either way, it will stops the loop which iterates to the next + * leaf + * -EOVERFLOW: item was to large for buffer + * -EFAULT: could not copy extent buffer back to userspace + */ + return ret; +} + +static noinline int search_ioctl(struct inode *inode, + struct btrfs_ioctl_search_key *sk, + size_t *buf_size, + char __user *ubuf) +{ + struct btrfs_root *root; + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; + int ret; + int num_found = 0; + unsigned long sk_offset = 0; + + if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { + *buf_size = sizeof(struct btrfs_ioctl_search_header); + return -EOVERFLOW; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (sk->tree_id == 0) { + /* search the root of the inode that was passed */ + root = BTRFS_I(inode)->root; + } else { + key.objectid = sk->tree_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(info, &key); + if (IS_ERR(root)) { + printk(KERN_ERR "BTRFS: could not find root %llu\n", + sk->tree_id); + btrfs_free_path(path); + return -ENOENT; + } + } + + key.objectid = sk->min_objectid; + key.type = sk->min_type; + key.offset = sk->min_offset; + + while (1) { + ret = btrfs_search_forward(root, &key, path, sk->min_transid); + if (ret != 0) { + if (ret > 0) + ret = 0; + goto err; + } + ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, + &sk_offset, &num_found); + btrfs_release_path(path); + if (ret) + break; + + } + if (ret > 0) + ret = 0; +err: + sk->nr_items = num_found; + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_tree_search(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_key sk; + struct inode *inode; + int ret; + size_t buf_size; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + uargs = (struct btrfs_ioctl_search_args __user *)argp; + + if (copy_from_user(&sk, &uargs->key, sizeof(sk))) + return -EFAULT; + + buf_size = sizeof(uargs->buf); + + inode = file_inode(file); + ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + + /* + * In the origin implementation an overflow is handled by returning a + * search header with a len of zero, so reset ret. + */ + if (ret == -EOVERFLOW) + ret = 0; + + if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) + ret = -EFAULT; + return ret; +} + +static noinline int btrfs_ioctl_tree_search_v2(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 args; + struct inode *inode; + int ret; + size_t buf_size; + const size_t buf_limit = 16 * 1024 * 1024; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* copy search header and buffer size */ + uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; + if (copy_from_user(&args, uarg, sizeof(args))) + return -EFAULT; + + buf_size = args.buf_size; + + if (buf_size < sizeof(struct btrfs_ioctl_search_header)) + return -EOVERFLOW; + + /* limit result size to 16MB */ + if (buf_size > buf_limit) + buf_size = buf_limit; + + inode = file_inode(file); + ret = search_ioctl(inode, &args.key, &buf_size, + (char *)(&uarg->buf[0])); + if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) + ret = -EFAULT; + else if (ret == -EOVERFLOW && + copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) + ret = -EFAULT; + + return ret; +} + +/* + * Search INODE_REFs to identify path name of 'dirid' directory + * in a 'tree_id' tree. and sets path name to 'name'. + */ +static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, + u64 tree_id, u64 dirid, char *name) +{ + struct btrfs_root *root; + struct btrfs_key key; + char *ptr; + int ret = -1; + int slot; + int len; + int total_len = 0; + struct btrfs_inode_ref *iref; + struct extent_buffer *l; + struct btrfs_path *path; + + if (dirid == BTRFS_FIRST_FREE_OBJECTID) { + name[0]='\0'; + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; + + key.objectid = tree_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(info, &key); + if (IS_ERR(root)) { + printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); + ret = -ENOENT; + goto out; + } + + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + else if (ret > 0) { + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) + goto out; + else if (ret > 0) { + ret = -ENOENT; + goto out; + } + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, slot); + + iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(l, iref); + ptr -= len + 1; + total_len += len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto out; + } + + *(ptr + len) = '/'; + read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); + + if (key.offset == BTRFS_FIRST_FREE_OBJECTID) + break; + + btrfs_release_path(path); + key.objectid = key.offset; + key.offset = (u64)-1; + dirid = key.objectid; + } + memmove(name, ptr, total_len); + name[total_len] = '\0'; + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_ino_lookup(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_ino_lookup_args *args; + struct inode *inode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + inode = file_inode(file); + + if (args->treeid == 0) + args->treeid = BTRFS_I(inode)->root->root_key.objectid; + + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + args->treeid, args->objectid, + args->name); + + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + + kfree(args); + return ret; +} + +static noinline int btrfs_ioctl_snap_destroy(struct file *file, + void __user *arg) +{ + struct dentry *parent = file->f_path.dentry; + struct dentry *dentry; + struct inode *dir = d_inode(parent); + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *dest = NULL; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 root_flags; + u64 qgroup_reserved; + int namelen; + int ret; + int err = 0; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/') || + strncmp(vol_args->name, "..", namelen) == 0) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write_file(file); + if (err) + goto out; + + + err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (err == -EINTR) + goto out_drop_write; + dentry = lookup_one_len(vol_args->name, parent, namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock_dir; + } + + if (d_really_is_negative(dentry)) { + err = -ENOENT; + goto out_dput; + } + + inode = d_inode(dentry); + dest = BTRFS_I(inode)->root; + if (!capable(CAP_SYS_ADMIN)) { + /* + * Regular user. Only allow this with a special mount + * option, when the user has write+exec access to the + * subvol root, and when rmdir(2) would have been + * allowed. + * + * Note that this is _not_ check that the subvol is + * empty or doesn't contain data that we wouldn't + * otherwise be able to delete. + * + * Users who want to delete empty subvols should try + * rmdir(2). + */ + err = -EPERM; + if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + goto out_dput; + + /* + * Do not allow deletion if the parent dir is the same + * as the dir to be deleted. That means the ioctl + * must be called on the dentry referencing the root + * of the subvol, not a random directory contained + * within it. + */ + err = -EINVAL; + if (root == dest) + goto out_dput; + + err = inode_permission(inode, MAY_WRITE | MAY_EXEC); + if (err) + goto out_dput; + } + + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out_dput; + } + + mutex_lock(&inode->i_mutex); + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the i_mutex so the error handling that has to drop the bit + * again is not run concurrently. + */ + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + if (dest->send_in_progress == 0) { + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + spin_unlock(&dest->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to delete subvolume %llu during send", + dest->root_key.objectid); + err = -EPERM; + goto out_unlock_inode; + } + + d_invalidate(dentry); + + down_write(&root->fs_info->subvol_sem); + + err = may_destroy_subvol(dest); + if (err) + goto out_up_write; + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * One for dir inode, two for dir entries, two for root + * ref/backref. + */ + err = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 5, &qgroup_reserved, true); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_release; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + + ret = btrfs_unlink_subvol(trans, root, dir, + dest->root_key.objectid, + dentry->d_name.name, + dentry->d_name.len); + if (ret) { + err = ret; + btrfs_abort_transaction(trans, root, ret); + goto out_end_trans; + } + + btrfs_record_root_in_trans(trans, dest); + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + dest->root_item.drop_level = 0; + btrfs_set_root_refs(&dest->root_item, 0); + + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + } + + ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { + ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + dest->root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + } + +out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + ret = btrfs_end_transaction(trans, root); + if (ret && !err) + err = ret; + inode->i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); +out_up_write: + up_write(&root->fs_info->subvol_sem); + if (err) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } +out_unlock_inode: + mutex_unlock(&inode->i_mutex); + if (!err) { + shrink_dcache_sb(root->fs_info->sb); + btrfs_invalidate_inodes(dest); + d_delete(dentry); + ASSERT(dest->send_in_progress == 0); + + /* the last ref */ + if (dest->ino_cache_inode) { + iput(dest->ino_cache_inode); + dest->ino_cache_inode = NULL; + } + } +out_dput: + dput(dentry); +out_unlock_dir: + mutex_unlock(&dir->i_mutex); +out_drop_write: + mnt_drop_write_file(file); +out: + kfree(vol_args); + return err; +} + +static int btrfs_ioctl_defrag(struct file *file, void __user *argp) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_defrag_range_args *range; + int ret; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; + } + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_defrag_root(root); + if (ret) + goto out; + ret = btrfs_defrag_root(root->fs_info->extent_root); + break; + case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } + + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + ret = -ENOMEM; + goto out; + } + + if (argp) { + if (copy_from_user(range, argp, + sizeof(*range))) { + ret = -EFAULT; + kfree(range); + goto out; + } + /* compression requires us to start the IO */ + if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + range->flags |= BTRFS_DEFRAG_RANGE_START_IO; + range->extent_thresh = (u32)-1; + } + } else { + /* the rest are all set to zero by kzalloc */ + range->len = (u64)-1; + } + ret = btrfs_defrag_file(file_inode(file), file, + range, 0, 0); + if (ret > 0) + ret = 0; + kfree(range); + break; + default: + ret = -EINVAL; + } +out: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + } + + mutex_lock(&root->fs_info->volume_mutex); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(root, vol_args->name); + + if (!ret) + btrfs_info(root->fs_info, "disk added %s",vol_args->name); + + kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto err_drop; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name); + mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + + if (!ret) + btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); + +out: + kfree(vol_args); +err_drop: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_fs_info_args *fi_args; + struct btrfs_device *device; + struct btrfs_device *next; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + + fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); + if (!fi_args) + return -ENOMEM; + + mutex_lock(&fs_devices->device_list_mutex); + fi_args->num_devices = fs_devices->num_devices; + memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); + + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->devid > fi_args->max_id) + fi_args->max_id = device->devid; + } + mutex_unlock(&fs_devices->device_list_mutex); + + fi_args->nodesize = root->fs_info->super_copy->nodesize; + fi_args->sectorsize = root->fs_info->super_copy->sectorsize; + fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; + + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; + + kfree(fi_args); + return ret; +} + +static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_info_args *di_args; + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + char *s_uuid = NULL; + + di_args = memdup_user(arg, sizeof(*di_args)); + if (IS_ERR(di_args)) + return PTR_ERR(di_args); + + if (!btrfs_is_empty_uuid(di_args->uuid)) + s_uuid = di_args->uuid; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); + + if (!dev) { + ret = -ENODEV; + goto out; + } + + di_args->devid = dev->devid; + di_args->bytes_used = btrfs_device_get_bytes_used(dev); + di_args->total_bytes = btrfs_device_get_total_bytes(dev); + memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); + if (dev->name) { + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + strncpy(di_args->path, name->str, sizeof(di_args->path)); + rcu_read_unlock(); + di_args->path[sizeof(di_args->path) - 1] = 0; + } else { + di_args->path[0] = '\0'; + } + +out: + mutex_unlock(&fs_devices->device_list_mutex); + if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) + ret = -EFAULT; + + kfree(di_args); + return ret; +} + +static struct page *extent_same_get_page(struct inode *inode, u64 off) +{ + struct page *page; + pgoff_t index; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + + index = off >> PAGE_CACHE_SHIFT; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return NULL; + + if (!PageUptodate(page)) { + if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, + 0)) + return NULL; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + } + unlock_page(page); + + return page; +} + +static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) +{ + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); + ordered = btrfs_lookup_first_ordered_extent(inode, + off + len - 1); + if ((!ordered || + ordered->file_offset + ordered->len <= off || + ordered->file_offset >= off + len) && + !test_range_bit(&BTRFS_I(inode)->io_tree, off, + off + len - 1, EXTENT_DELALLOC, 0, NULL)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(inode, off, len); + } +} + +static void btrfs_double_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + mutex_unlock(&inode1->i_mutex); + mutex_unlock(&inode2->i_mutex); +} + +static void btrfs_double_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + lock_extent_range(inode1, loff1, len); + if (inode1 != inode2) { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + lock_extent_range(inode2, loff2, len); + } +} + +static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, + u64 dst_loff, u64 len) +{ + int ret = 0; + struct page *src_page, *dst_page; + unsigned int cmp_len = PAGE_CACHE_SIZE; + void *addr, *dst_addr; + + while (len) { + if (len < PAGE_CACHE_SIZE) + cmp_len = len; + + src_page = extent_same_get_page(src, loff); + if (!src_page) + return -EINVAL; + dst_page = extent_same_get_page(dst, dst_loff); + if (!dst_page) { + page_cache_release(src_page); + return -EINVAL; + } + addr = kmap_atomic(src_page); + dst_addr = kmap_atomic(dst_page); + + flush_dcache_page(src_page); + flush_dcache_page(dst_page); + + if (memcmp(addr, dst_addr, cmp_len)) + ret = BTRFS_SAME_DATA_DIFFERS; + + kunmap_atomic(addr); + kunmap_atomic(dst_addr); + page_cache_release(src_page); + page_cache_release(dst_page); + + if (ret) + break; + + loff += cmp_len; + dst_loff += cmp_len; + len -= cmp_len; + } + + return ret; +} + +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +{ + u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; + + if (off + len > inode->i_size || off + len < off) + return -EINVAL; + /* Check that we are block aligned - btrfs_clone() requires this */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) + return -EINVAL; + + return 0; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) +{ + int ret; + + /* + * btrfs_clone() can't handle extents in the same file + * yet. Once that works, we can drop this check and replace it + * with a check for the same inode, but overlapping extents. + */ + if (src == dst) + return -EINVAL; + + if (len == 0) + return 0; + + btrfs_double_lock(src, loff, dst, dst_loff, len); + + ret = extent_same_check_offsets(src, loff, len); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, len); + if (ret) + goto out_unlock; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + + ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + if (ret == 0) + ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + +out_unlock: + btrfs_double_unlock(src, loff, dst, dst_loff, len); + + return ret; +} + +#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) + +static long btrfs_ioctl_file_extent_same(struct file *file, + struct btrfs_ioctl_same_args __user *argp) +{ + struct btrfs_ioctl_same_args *same; + struct btrfs_ioctl_same_extent_info *info; + struct inode *src = file_inode(file); + u64 off; + u64 len; + int i; + int ret; + unsigned long size; + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + bool is_admin = capable(CAP_SYS_ADMIN); + u16 count; + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (get_user(count, &argp->dest_count)) { + ret = -EFAULT; + goto out; + } + + size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); + + same = memdup_user(argp, size); + + if (IS_ERR(same)) { + ret = PTR_ERR(same); + goto out; + } + + off = same->logical_offset; + len = same->length; + + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > BTRFS_MAX_DEDUPE_LEN) + len = BTRFS_MAX_DEDUPE_LEN; + + if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { + /* + * Btrfs does not support blocksize < page_size. As a + * result, btrfs_cmp_data() won't correctly handle + * this situation without an update. + */ + ret = -EINVAL; + goto out; + } + + ret = -EISDIR; + if (S_ISDIR(src->i_mode)) + goto out; + + ret = -EACCES; + if (!S_ISREG(src->i_mode)) + goto out; + + /* pre-format output fields to sane values */ + for (i = 0; i < count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = 0; + } + + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_file = fdget(info->fd); + if (!dst_file.file) { + info->status = -EBADF; + continue; + } + dst = file_inode(dst_file.file); + + if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { + info->status = -EINVAL; + } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { + info->status = -EISDIR; + } else if (!S_ISREG(dst->i_mode)) { + info->status = -EACCES; + } else { + info->status = btrfs_extent_same(src, off, len, dst, + info->logical_offset); + if (info->status == 0) + info->bytes_deduped += len; + } + fdput(dst_file); + } + + ret = copy_to_user(argp, same, size); + if (ret) + ret = -EFAULT; + +out: + mnt_drop_write_file(file); + return ret; +} + +/* Helper to check and see if this root currently has a ref on the given disk + * bytenr. If it does then we need to update the quota for this root. This + * doesn't do anything if quotas aren't enabled. + */ +static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 disko) +{ + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); + struct ulist *roots; + struct ulist_iterator uiter; + struct ulist_node *root_node = NULL; + int ret; + + if (!root->fs_info->quota_enabled) + return 1; + + btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + ret = btrfs_find_all_roots(trans, root->fs_info, disko, + tree_mod_seq_elem.seq, &roots); + if (ret < 0) + goto out; + ret = 0; + ULIST_ITER_INIT(&uiter); + while ((root_node = ulist_next(roots, &uiter))) { + if (root_node->val == root->objectid) { + ret = 1; + break; + } + } + ulist_free(roots); +out: + btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + return ret; +} + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); +out: + return ret; +} + +static void clone_update_extent_map(struct inode *inode, + const struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const u64 hole_offset, + const u64 hole_len) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + return; + } + + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_extent_item_to_extent_map(inode, path, fi, false, em); + em->generation = -1; + if (btrfs_file_extent_type(path->nodes[0], fi) == + BTRFS_FILE_EXTENT_INLINE) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + em->start = hole_offset; + em->len = hole_len; + em->ram_bytes = em->len; + em->orig_start = hole_offset; + em->block_start = EXTENT_MAP_HOLE; + em->block_len = 0; + em->orig_block_len = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = trans->transid; + } + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + + if (ret) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); +} + +/** + * btrfs_clone() - clone a range from inode file to another + * + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen, extent_same uses + * identical values here + * @destoff: Offset within @inode to start clone + */ +static int btrfs_clone(struct inode *src, struct inode *inode, + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path = NULL; + struct extent_buffer *leaf; + struct btrfs_trans_handle *trans; + char *buf = NULL; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + int no_quota; + const u64 len = olen_aligned; + u64 last_disko = 0; + u64 last_dest_end = destoff; + + ret = -ENOMEM; + buf = vmalloc(root->nodesize); + if (!buf) + return ret; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + return ret; + } + + path->reada = 2; + /* clone data */ + key.objectid = btrfs_ino(src); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = off; + + while (1) { + u64 next_key_min_offset = key.offset + 1; + + /* + * note the key will change type as we walk through the + * tree. + */ + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, + 0, 0); + if (ret < 0) + goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. + */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: + no_quota = 1; + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(BTRFS_I(src)->root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.type > BTRFS_EXTENT_DATA_KEY || + key.objectid != btrfs_ino(src)) + break; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 drop_start; + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } + + /* + * The first search might have left us at an extent + * item that ends before our target range's start, can + * happen if we have holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + next_key_min_offset = key.offset + datal; + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(inode); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + /* + * Deal with a hole that doesn't have an extent item + * that represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning + * range or at the beginning (fully overlaps it or + * partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* + * 1 - adjusting old extent (we may have to split it) + * 1 - add new extent + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + /* + * a | --- range to clone ---| b + * | ------------- extent ------------- | + */ + + /* subtract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* subtract range a */ + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + ret = btrfs_drop_extents(trans, root, inode, + drop_start, + new_key.offset + datal, + 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, + root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + + /* + * We need to look up the roots that point at + * this bytenr and see if the new root does. If + * it does not we need to make sure we update + * quotas appropriately. + */ + if (disko && root != BTRFS_I(src)->root && + disko != last_disko) { + no_quota = check_ref(trans, root, + disko); + if (no_quota < 0) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + ret = no_quota; + goto out; + } + } + + if (disko) { + inode_add_bytes(inode, datal); + ret = btrfs_inc_extent_ref(trans, root, + disko, diskl, 0, + root->root_key.objectid, + btrfs_ino(inode), + new_key.offset - datao, + no_quota); + if (ret) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + goto out; + + } + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + u64 aligned_end = 0; + + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + + if (key.offset + datal > off + len) + trim = key.offset + datal - (off + len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + btrfs_end_transaction(trans, root); + goto out; + } + size -= skip + trim; + datal -= skip + trim; + + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); + ret = btrfs_drop_extents(trans, root, inode, + drop_start, + aligned_end, + 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, + root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } + + if (skip) { + u32 start = + btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); + } + + /* If we have an implicit hole (NO_HOLES feature). */ + if (drop_start < new_key.offset) + clone_update_extent_map(inode, trans, + NULL, drop_start, + new_key.offset - drop_start); + + clone_update_extent_map(inode, trans, path, 0, 0); + + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + last_dest_end = ALIGN(new_key.offset + datal, + root->sectorsize); + ret = clone_finish_inode_update(trans, inode, + last_dest_end, + destoff, olen); + if (ret) + goto out; + if (new_key.offset + datal >= destoff + len) + break; + } + btrfs_release_path(path); + key.offset = next_key_min_offset; + } + ret = 0; + + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole (NO_HOLES feature is enabled) that + * fully or partially overlaps our cloning range at its end. + */ + btrfs_release_path(path); + + /* + * 1 - remove extent(s) + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_drop_extents(trans, root, inode, + last_dest_end, destoff + len, 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen); + } + +out: + btrfs_free_path(path); + vfree(buf); + return ret; +} + +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct fd src_file; + struct inode *src; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + int same_inode = 0; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * + * - split destination inode's inline extents. The inline extents can + * be either compressed or non-compressed. + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) + return -EINVAL; + + if (btrfs_root_readonly(root)) + return -EROFS; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + src_file = fdget(srcfd); + if (!src_file.file) { + ret = -EBADF; + goto out_drop_write; + } + + ret = -EXDEV; + if (src_file.file->f_path.mnt != file->f_path.mnt) + goto out_fput; + + src = file_inode(src_file.file); + + ret = -EINVAL; + if (src == inode) + same_inode = 1; + + /* the src must be open for reading */ + if (!(src_file.file->f_mode & FMODE_READ)) + goto out_fput; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (!same_inode) { + if (inode < src) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } + } else { + mutex_lock(&src->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off + len > src->i_size || off + len < off) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + if (len == 0) { + ret = 0; + goto out_unlock; + } + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) + goto out_unlock; + + /* verify if ranges are overlapped within the same file */ + if (same_inode) { + if (destoff + len > off && destoff < off + len) + goto out_unlock; + } + + if (destoff > inode->i_size) { + ret = btrfs_cont_expand(inode, inode->i_size, destoff); + if (ret) + goto out_unlock; + } + + /* + * Lock the target range too. Right after we replace the file extent + * items in the fs tree (which now point to the cloned data), we might + * have a worker replace them with extent items relative to a write + * operation that was issued before this clone operation (i.e. confront + * with inode.c:btrfs_finish_ordered_io). + */ + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_len = max_t(u64, off, destoff) + len - lock_start; + + lock_extent_range(src, lock_start, lock_len); + } else { + lock_extent_range(src, off, len); + lock_extent_range(inode, destoff, len); + } + + ret = btrfs_clone(src, inode, off, olen, len, destoff); + + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_end = max_t(u64, off, destoff) + len - 1; + + unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); + } else { + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, + destoff + len - 1); + } + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. + */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); +out_unlock: + if (!same_inode) { + if (inode < src) { + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + } else { + mutex_unlock(&inode->i_mutex); + mutex_unlock(&src->i_mutex); + } + } else { + mutex_unlock(&src->i_mutex); + } +out_fput: + fdput(src_file); +out_drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +static long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret; + + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EINPROGRESS; + if (file->private_data) + goto out; + + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + atomic_inc(&root->fs_info->open_ioctl_trans); + + ret = -ENOMEM; + trans = btrfs_start_ioctl_transaction(root); + if (IS_ERR(trans)) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + atomic_dec(&root->fs_info->open_ioctl_trans); + mnt_drop_write_file(file); +out: + return ret; +} + +static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + u64 objectid = 0; + u64 dir_id; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) { + ret = -EFAULT; + goto out; + } + + if (!objectid) + objectid = BTRFS_FS_TREE_OBJECTID; + + location.objectid = objectid; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + path->leave_spinning = 1; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_free_path(path); + ret = PTR_ERR(trans); + goto out; + } + + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, + dir_id, "default", 7, 1); + if (IS_ERR_OR_NULL(di)) { + btrfs_free_path(path); + btrfs_end_transaction(trans, root); + btrfs_err(new_root->fs_info, "Umm, you don't have the default dir" + "item, this isn't going to work"); + ret = -ENOENT; + goto out; + } + + btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); + btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); + btrfs_end_transaction(trans, root); +out: + mnt_drop_write_file(file); + return ret; +} + +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) +{ + struct btrfs_block_group_cache *block_group; + + space->total_bytes = 0; + space->used_bytes = 0; + space->flags = 0; + list_for_each_entry(block_group, groups_list, list) { + space->flags = block_group->flags; + space->total_bytes += block_group->key.offset; + space->used_bytes += + btrfs_block_group_used(&block_group->item); + } +} + +static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_info space; + struct btrfs_ioctl_space_info *dest; + struct btrfs_ioctl_space_info *dest_orig; + struct btrfs_ioctl_space_info __user *user_dest; + struct btrfs_space_info *info; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int alloc_size; + int ret = 0; + u64 slot_count = 0; + int i, c; + + if (copy_from_user(&space_args, + (struct btrfs_ioctl_space_args __user *)arg, + sizeof(space_args))) + return -EFAULT; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) + slot_count++; + } + up_read(&info->groups_sem); + } + + /* + * Global block reserve, exported as a space_info + */ + slot_count++; + + /* space_slots == 0 means they are asking for a count */ + if (space_args.space_slots == 0) { + space_args.total_spaces = slot_count; + goto out; + } + + slot_count = min_t(u64, space_args.space_slots, slot_count); + + alloc_size = sizeof(*dest) * slot_count; + + /* we generally have at most 6 or so space infos, one for each raid + * level. So, a whole page should be more than enough for everyone + */ + if (alloc_size > PAGE_CACHE_SIZE) + return -ENOMEM; + + space_args.total_spaces = 0; + dest = kmalloc(alloc_size, GFP_NOFS); + if (!dest) + return -ENOMEM; + dest_orig = dest; + + /* now we have a buffer to copy into */ + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + if (!slot_count) + break; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) { + btrfs_get_block_group_info( + &info->block_groups[c], &space); + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + slot_count--; + } + if (!slot_count) + break; + } + up_read(&info->groups_sem); + } + + /* + * Add global block reserve + */ + if (slot_count) { + struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + + spin_lock(&block_rsv->lock); + space.total_bytes = block_rsv->size; + space.used_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; + memcpy(dest, &space, sizeof(space)); + space_args.total_spaces++; + } + + user_dest = (struct btrfs_ioctl_space_info __user *) + (arg + sizeof(struct btrfs_ioctl_space_args)); + + if (copy_to_user(user_dest, dest_orig, alloc_size)) + ret = -EFAULT; + + kfree(dest_orig); +out: + if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) + ret = -EFAULT; + + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = file->private_data; + if (!trans) + return -EINVAL; + file->private_data = NULL; + + btrfs_end_transaction(trans, root); + + atomic_dec(&root->fs_info->open_ioctl_trans); + + mnt_drop_write_file(file); + return 0; +} + +static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + void __user *argp) +{ + struct btrfs_trans_handle *trans; + u64 transid; + int ret; + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + + /* No running transaction, don't bother */ + transid = root->fs_info->last_trans_committed; + goto out; + } + transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, root, 0); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } +out: + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) + return -EFAULT; + return 0; +} + +static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, + void __user *argp) +{ + u64 transid; + + if (argp) { + if (copy_from_user(&transid, argp, sizeof(transid))) + return -EFAULT; + } else { + transid = 0; /* current trans */ + } + return btrfs_wait_for_commit(root, transid); +} + +static long btrfs_ioctl_scrub(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { + ret = mnt_want_write_file(file); + if (ret) + goto out; + } + + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, + 0); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + if (!(sa->flags & BTRFS_SCRUB_READONLY)) + mnt_drop_write_file(file); +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_scrub_cancel(root->fs_info); +} + +static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_get_dev_stats *sa; + int ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { + kfree(sa); + return -EPERM; + } + + ret = btrfs_get_dev_stats(root, sa); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_replace_args *p; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = memdup_user(arg, sizeof(*p)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (p->cmd) { + case BTRFS_IOCTL_DEV_REPLACE_CMD_START: + if (root->fs_info->sb->s_flags & MS_RDONLY) { + ret = -EROFS; + goto out; + } + if (atomic_xchg( + &root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + } else { + ret = btrfs_dev_replace_start(root, p); + atomic_set( + &root->fs_info->mutually_exclusive_operation_running, + 0); + } + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: + btrfs_dev_replace_status(root->fs_info, p); + ret = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: + ret = btrfs_dev_replace_cancel(root->fs_info, p); + break; + default: + ret = -EINVAL; + break; + } + + if (copy_to_user(arg, p, sizeof(*p))) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ + int ret = 0; + int i; + u64 rel_ptr; + int size; + struct btrfs_ioctl_ino_path_args *ipa = NULL; + struct inode_fs_paths *ipath = NULL; + struct btrfs_path *path; + + if (!capable(CAP_DAC_READ_SEARCH)) + return -EPERM; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ipa = memdup_user(arg, sizeof(*ipa)); + if (IS_ERR(ipa)) { + ret = PTR_ERR(ipa); + ipa = NULL; + goto out; + } + + size = min_t(u32, ipa->size, 4096); + ipath = init_ipath(size, root, path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto out; + } + + ret = paths_from_inode(ipa->inum, ipath); + if (ret < 0) + goto out; + + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; + ipath->fspath->val[i] = rel_ptr; + } + + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); + if (ret) { + ret = -EFAULT; + goto out; + } + +out: + btrfs_free_path(path); + free_ipath(ipath); + kfree(ipa); + + return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, + void __user *arg) +{ + int ret = 0; + int size; + struct btrfs_ioctl_logical_ino_args *loi; + struct btrfs_data_container *inodes = NULL; + struct btrfs_path *path = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + loi = memdup_user(arg, sizeof(*loi)); + if (IS_ERR(loi)) { + ret = PTR_ERR(loi); + loi = NULL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + size = min_t(u32, loi->size, 64 * 1024); + inodes = init_data_container(size); + if (IS_ERR(inodes)) { + ret = PTR_ERR(inodes); + inodes = NULL; + goto out; + } + + ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path, + build_ino_list, inodes); + if (ret == -EINVAL) + ret = -ENOENT; + if (ret < 0) + goto out; + + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); + if (ret) + ret = -EFAULT; + +out: + btrfs_free_path(path); + vfree(inodes); + kfree(loi); + + return ret; +} + +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + bargs->flags = bctl->flags; + + if (atomic_read(&fs_info->balance_running)) + bargs->state |= BTRFS_BALANCE_STATE_RUNNING; + if (atomic_read(&fs_info->balance_pause_req)) + bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + if (atomic_read(&fs_info->balance_cancel_req)) + bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; + + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); + memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); + memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); + + if (lock) { + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); + } else { + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + } +} + +static long btrfs_ioctl_balance(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); + + if (arg) { + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + goto out_unlock; + } + + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out_bargs; + } + + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + + goto do_balance; + } + } else { + bargs = NULL; + } + + if (fs_info->balance_ctl) { + ret = -EINPROGRESS; + goto out_bargs; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out_bargs; + } + + bctl->fs_info = fs_info; + if (arg) { + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + + bctl->flags = bargs->flags; + } else { + /* balance everything - no filters */ + bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + } + +do_balance: + /* + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. + */ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + + if (arg) { + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + } + +out_bargs: + kfree(bargs); +out_unlock: + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case BTRFS_BALANCE_CTL_PAUSE: + return btrfs_pause_balance(root->fs_info); + case BTRFS_BALANCE_CTL_CANCEL: + return btrfs_cancel_balance(root->fs_info); + } + + return -EINVAL; +} + +static long btrfs_ioctl_balance_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out; + } + + bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + if (!bargs) { + ret = -ENOMEM; + goto out; + } + + update_ioctl_balance_args(fs_info, 1, bargs); + + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_quota_ctl_args *sa; + struct btrfs_trans_handle *trans = NULL; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + down_write(&root->fs_info->subvol_sem); + trans = btrfs_start_transaction(root->fs_info->tree_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + switch (sa->cmd) { + case BTRFS_QUOTA_CTL_ENABLE: + ret = btrfs_quota_enable(trans, root->fs_info); + break; + case BTRFS_QUOTA_CTL_DISABLE: + ret = btrfs_quota_disable(trans, root->fs_info); + break; + default: + ret = -EINVAL; + break; + } + + err = btrfs_commit_transaction(trans, root->fs_info->tree_root); + if (err && !ret) + ret = err; +out: + kfree(sa); + up_write(&root->fs_info->subvol_sem); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_qgroup_assign_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* FIXME: check if the IDs really exist */ + if (sa->assign) { + ret = btrfs_add_qgroup_relation(trans, root->fs_info, + sa->src, sa->dst); + } else { + ret = btrfs_del_qgroup_relation(trans, root->fs_info, + sa->src, sa->dst); + } + + /* update qgroup status and info */ + err = btrfs_run_qgroups(trans, root->fs_info); + if (err < 0) + btrfs_error(root->fs_info, ret, + "failed to update qgroup status and info\n"); + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_qgroup_create_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* FIXME: check if the IDs really exist */ + if (sa->create) { + ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid); + } else { + ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); + } + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_qgroup_limit_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + u64 qgroupid; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + qgroupid = sa->qgroupid; + if (!qgroupid) { + /* take the current subvol as qgroup */ + qgroupid = root->root_key.objectid; + } + + /* FIXME: check if the IDs really exist */ + ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim); + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + qsa = memdup_user(arg, sizeof(*qsa)); + if (IS_ERR(qsa)) { + ret = PTR_ERR(qsa); + goto drop_write; + } + + if (qsa->flags) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_qgroup_rescan(root->fs_info); + +out: + kfree(qsa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + qsa = kzalloc(sizeof(*qsa), GFP_NOFS); + if (!qsa) + return -ENOMEM; + + if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + qsa->flags = 1; + qsa->progress = root->fs_info->qgroup_rescan_progress.objectid; + } + + if (copy_to_user(arg, qsa, sizeof(*qsa))) + ret = -EFAULT; + + kfree(qsa); + return ret; +} + +static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_qgroup_wait_for_completion(root->fs_info); +} + +static long _btrfs_ioctl_set_received_subvol(struct file *file, + struct btrfs_ioctl_received_subvol_args *sa) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root_item *root_item = &root->root_item; + struct btrfs_trans_handle *trans; + struct timespec ct = CURRENT_TIME; + int ret = 0; + int received_uuid_changed; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret < 0) + return ret; + + down_write(&root->fs_info->subvol_sem); + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + ret = -EINVAL; + goto out; + } + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; + } + + /* + * 1 - root item + * 2 - uuid items (received uuid + subvol uuid) + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + sa->rtransid = trans->transid; + sa->rtime.sec = ct.tv_sec; + sa->rtime.nsec = ct.tv_nsec; + + received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, + BTRFS_UUID_SIZE); + if (received_uuid_changed && + !btrfs_is_empty_uuid(root_item->received_uuid)) + btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); + btrfs_set_root_stransid(root_item, sa->stransid); + btrfs_set_root_rtransid(root_item, sa->rtransid); + btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); + btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); + btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); + btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans, root); + goto out; + } + if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { + ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, + sa->uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + if (ret < 0 && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + } + ret = btrfs_commit_transaction(trans, root); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + +out: + up_write(&root->fs_info->subvol_sem); + mnt_drop_write_file(file); + return ret; +} + +#ifdef CONFIG_64BIT +static long btrfs_ioctl_set_received_subvol_32(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; + struct btrfs_ioctl_received_subvol_args *args64 = NULL; + int ret = 0; + + args32 = memdup_user(arg, sizeof(*args32)); + if (IS_ERR(args32)) { + ret = PTR_ERR(args32); + args32 = NULL; + goto out; + } + + args64 = kmalloc(sizeof(*args64), GFP_NOFS); + if (!args64) { + ret = -ENOMEM; + goto out; + } + + memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); + args64->stransid = args32->stransid; + args64->rtransid = args32->rtransid; + args64->stime.sec = args32->stime.sec; + args64->stime.nsec = args32->stime.nsec; + args64->rtime.sec = args32->rtime.sec; + args64->rtime.nsec = args32->rtime.nsec; + args64->flags = args32->flags; + + ret = _btrfs_ioctl_set_received_subvol(file, args64); + if (ret) + goto out; + + memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); + args32->stransid = args64->stransid; + args32->rtransid = args64->rtransid; + args32->stime.sec = args64->stime.sec; + args32->stime.nsec = args64->stime.nsec; + args32->rtime.sec = args64->rtime.sec; + args32->rtime.nsec = args64->rtime.nsec; + args32->flags = args64->flags; + + ret = copy_to_user(arg, args32, sizeof(*args32)); + if (ret) + ret = -EFAULT; + +out: + kfree(args32); + kfree(args64); + return ret; +} +#endif + +static long btrfs_ioctl_set_received_subvol(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args *sa = NULL; + int ret = 0; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + sa = NULL; + goto out; + } + + ret = _btrfs_ioctl_set_received_subvol(file, sa); + + if (ret) + goto out; + + ret = copy_to_user(arg, sa, sizeof(*sa)); + if (ret) + ret = -EFAULT; + +out: + kfree(sa); + return ret; +} + +static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + size_t len; + int ret; + char label[BTRFS_LABEL_SIZE]; + + spin_lock(&root->fs_info->super_lock); + memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE); + spin_unlock(&root->fs_info->super_lock); + + len = strnlen(label, BTRFS_LABEL_SIZE); + + if (len == BTRFS_LABEL_SIZE) { + btrfs_warn(root->fs_info, + "label is too long, return the first %zu bytes", --len); + } + + ret = copy_to_user(arg, label, len); + + return ret ? -EFAULT : 0; +} + +static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + char label[BTRFS_LABEL_SIZE]; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, arg, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { + btrfs_err(root->fs_info, "unable to set label with more than %d bytes", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + spin_lock(&root->fs_info->super_lock); + strcpy(super_block->label, label); + spin_unlock(&root->fs_info->super_lock); + ret = btrfs_commit_transaction(trans, root); + +out_unlock: + mnt_drop_write_file(file); + return ret; +} + +#define INIT_FEATURE_FLAGS(suffix) \ + { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ + .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ + .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } + +static int btrfs_ioctl_get_supported_features(struct file *file, + void __user *arg) +{ + static struct btrfs_ioctl_feature_flags features[3] = { + INIT_FEATURE_FLAGS(SUPP), + INIT_FEATURE_FLAGS(SAFE_SET), + INIT_FEATURE_FLAGS(SAFE_CLEAR) + }; + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_get_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags features; + + features.compat_flags = btrfs_super_compat_flags(super_block); + features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); + features.incompat_flags = btrfs_super_incompat_flags(super_block); + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int check_feature_bits(struct btrfs_root *root, + enum btrfs_feature_set set, + u64 change_mask, u64 flags, u64 supported_flags, + u64 safe_set, u64 safe_clear) +{ + const char *type = btrfs_feature_set_names[set]; + char *names; + u64 disallowed, unsupported; + u64 set_mask = flags & change_mask; + u64 clear_mask = ~flags & change_mask; + + unsupported = set_mask & ~supported_flags; + if (unsupported) { + names = btrfs_printable_features(set, unsupported); + if (names) { + btrfs_warn(root->fs_info, + "this kernel does not support the %s feature bit%s", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "this kernel does not support %s bits 0x%llx", + type, unsupported); + return -EOPNOTSUPP; + } + + disallowed = set_mask & ~safe_set; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't set the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't set %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + disallowed = clear_mask & ~safe_clear; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't clear the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't clear %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + return 0; +} + +#define check_feature(root, change_mask, flags, mask_base) \ +check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \ + BTRFS_FEATURE_ ## mask_base ## _SUPP, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) + +static int btrfs_ioctl_set_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags flags[2]; + struct btrfs_trans_handle *trans; + u64 newflags; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(flags, arg, sizeof(flags))) + return -EFAULT; + + /* Nothing to do */ + if (!flags[0].compat_flags && !flags[0].compat_ro_flags && + !flags[0].incompat_flags) + return 0; + + ret = check_feature(root, flags[0].compat_flags, + flags[1].compat_flags, COMPAT); + if (ret) + return ret; + + ret = check_feature(root, flags[0].compat_ro_flags, + flags[1].compat_ro_flags, COMPAT_RO); + if (ret) + return ret; + + ret = check_feature(root, flags[0].incompat_flags, + flags[1].incompat_flags, INCOMPAT); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + newflags = btrfs_super_compat_flags(super_block); + newflags |= flags[0].compat_flags & flags[1].compat_flags; + newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); + btrfs_set_super_compat_flags(super_block, newflags); + + newflags = btrfs_super_compat_ro_flags(super_block); + newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; + newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); + btrfs_set_super_compat_ro_flags(super_block, newflags); + + newflags = btrfs_super_incompat_flags(super_block); + newflags |= flags[0].incompat_flags & flags[1].incompat_flags; + newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); + btrfs_set_super_incompat_flags(super_block, newflags); + spin_unlock(&root->fs_info->super_lock); + + return btrfs_commit_transaction(trans, root); +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return btrfs_ioctl_getflags(file, argp); + case FS_IOC_SETFLAGS: + return btrfs_ioctl_setflags(file, argp); + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(file, argp); + case FITRIM: + return btrfs_ioctl_fitrim(file, argp); + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SNAP_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SUBVOL_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 1); + case BTRFS_IOC_SNAP_DESTROY: + return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); + case BTRFS_IOC_DEFAULT_SUBVOL: + return btrfs_ioctl_default_subvol(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file, NULL); + case BTRFS_IOC_DEFRAG_RANGE: + return btrfs_ioctl_defrag(file, argp); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(file, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(file, argp); + case BTRFS_IOC_FS_INFO: + return btrfs_ioctl_fs_info(root, argp); + case BTRFS_IOC_DEV_INFO: + return btrfs_ioctl_dev_info(root, argp); + case BTRFS_IOC_BALANCE: + return btrfs_ioctl_balance(file, NULL); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_TREE_SEARCH: + return btrfs_ioctl_tree_search(file, argp); + case BTRFS_IOC_TREE_SEARCH_V2: + return btrfs_ioctl_tree_search_v2(file, argp); + case BTRFS_IOC_INO_LOOKUP: + return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_INO_PATHS: + return btrfs_ioctl_ino_to_path(root, argp); + case BTRFS_IOC_LOGICAL_INO: + return btrfs_ioctl_logical_to_ino(root, argp); + case BTRFS_IOC_SPACE_INFO: + return btrfs_ioctl_space_info(root, argp); + case BTRFS_IOC_SYNC: { + int ret; + + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); + if (ret) + return ret; + ret = btrfs_sync_fs(file_inode(file)->i_sb, 1); + /* + * The transaction thread may want to do more work, + * namely it pokes the cleaner ktread that will start + * processing uncleaned subvols. + */ + wake_up_process(root->fs_info->transaction_kthread); + return ret; + } + case BTRFS_IOC_START_SYNC: + return btrfs_ioctl_start_sync(root, argp); + case BTRFS_IOC_WAIT_SYNC: + return btrfs_ioctl_wait_sync(root, argp); + case BTRFS_IOC_SCRUB: + return btrfs_ioctl_scrub(file, argp); + case BTRFS_IOC_SCRUB_CANCEL: + return btrfs_ioctl_scrub_cancel(root, argp); + case BTRFS_IOC_SCRUB_PROGRESS: + return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_BALANCE_V2: + return btrfs_ioctl_balance(file, argp); + case BTRFS_IOC_BALANCE_CTL: + return btrfs_ioctl_balance_ctl(root, arg); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(root, argp); + case BTRFS_IOC_SET_RECEIVED_SUBVOL: + return btrfs_ioctl_set_received_subvol(file, argp); +#ifdef CONFIG_64BIT + case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: + return btrfs_ioctl_set_received_subvol_32(file, argp); +#endif + case BTRFS_IOC_SEND: + return btrfs_ioctl_send(file, argp); + case BTRFS_IOC_GET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(root, argp); + case BTRFS_IOC_QUOTA_CTL: + return btrfs_ioctl_quota_ctl(file, argp); + case BTRFS_IOC_QGROUP_ASSIGN: + return btrfs_ioctl_qgroup_assign(file, argp); + case BTRFS_IOC_QGROUP_CREATE: + return btrfs_ioctl_qgroup_create(file, argp); + case BTRFS_IOC_QGROUP_LIMIT: + return btrfs_ioctl_qgroup_limit(file, argp); + case BTRFS_IOC_QUOTA_RESCAN: + return btrfs_ioctl_quota_rescan(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_STATUS: + return btrfs_ioctl_quota_rescan_status(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_WAIT: + return btrfs_ioctl_quota_rescan_wait(file, argp); + case BTRFS_IOC_DEV_REPLACE: + return btrfs_ioctl_dev_replace(root, argp); + case BTRFS_IOC_GET_FSLABEL: + return btrfs_ioctl_get_fslabel(file, argp); + case BTRFS_IOC_SET_FSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); + case BTRFS_IOC_FILE_EXTENT_SAME: + return btrfs_ioctl_file_extent_same(file, argp); + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + return btrfs_ioctl_get_supported_features(file, argp); + case BTRFS_IOC_GET_FEATURES: + return btrfs_ioctl_get_features(file, argp); + case BTRFS_IOC_SET_FEATURES: + return btrfs_ioctl_set_features(file, argp); + } + + return -ENOTTY; +} diff --git a/kernel/fs/btrfs/locking.c b/kernel/fs/btrfs/locking.c new file mode 100644 index 000000000..f8229ef1b --- /dev/null +++ b/kernel/fs/btrfs/locking.c @@ -0,0 +1,300 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); + +/* + * if we currently have a spinning reader or writer lock + * (indicated by the rw flag) this will bump the count + * of blocking holders and drop the spinlock. + */ +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) +{ + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + if (rw == BTRFS_WRITE_LOCK) { + if (atomic_read(&eb->blocking_writers) == 0) { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + btrfs_assert_tree_locked(eb); + atomic_inc(&eb->blocking_writers); + write_unlock(&eb->lock); + } + } else if (rw == BTRFS_READ_LOCK) { + btrfs_assert_tree_read_locked(eb); + atomic_inc(&eb->blocking_readers); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + read_unlock(&eb->lock); + } + return; +} + +/* + * if we currently have a blocking lock, take the spinlock + * and drop our blocking count + */ +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) +{ + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_writers) != 1); + write_lock(&eb->lock); + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + if (atomic_dec_and_test(&eb->blocking_writers) && + waitqueue_active(&eb->write_lock_wq)) + wake_up(&eb->write_lock_wq); + } else if (rw == BTRFS_READ_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_readers) == 0); + read_lock(&eb->lock); + atomic_inc(&eb->spinning_readers); + if (atomic_dec_and_test(&eb->blocking_readers) && + waitqueue_active(&eb->read_lock_wq)) + wake_up(&eb->read_lock_wq); + } + return; +} + +/* + * take a spinning read lock. This will wait for any blocking + * writers + */ +void btrfs_tree_read_lock(struct extent_buffer *eb) +{ +again: + BUG_ON(!atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner); + + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner) { + /* + * This extent is already write-locked by our thread. We allow + * an additional read lock to be added because it's for the same + * thread. btrfs_find_all_roots() depends on this as it may be + * called on a partly (write-)locked tree. + */ + BUG_ON(eb->lock_nested); + eb->lock_nested = 1; + read_unlock(&eb->lock); + return; + } + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; + } + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); +} + +/* + * take a spinning read lock. + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers + */ +int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) +{ + if (atomic_read(&eb->blocking_writers)) + return 0; + + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); + return 1; +} + +/* + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers + */ +int btrfs_try_tree_read_lock(struct extent_buffer *eb) +{ + if (atomic_read(&eb->blocking_writers)) + return 0; + + if (!read_trylock(&eb->lock)) + return 0; + + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); + return 1; +} + +/* + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers or readers + */ +int btrfs_try_tree_write_lock(struct extent_buffer *eb) +{ + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) + return 0; + + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->write_locks); + atomic_inc(&eb->spinning_writers); + eb->lock_owner = current->pid; + return 1; +} + +/* + * drop a spinning read lock + */ +void btrfs_tree_read_unlock(struct extent_buffer *eb) +{ + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; + } + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + atomic_dec(&eb->read_locks); + read_unlock(&eb->lock); +} + +/* + * drop a blocking read lock + */ +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) +{ + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; + } + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->blocking_readers) == 0); + if (atomic_dec_and_test(&eb->blocking_readers) && + waitqueue_active(&eb->read_lock_wq)) + wake_up(&eb->read_lock_wq); + atomic_dec(&eb->read_locks); +} + +/* + * take a spinning write lock. This will wait for both + * blocking readers or writers + */ +void btrfs_tree_lock(struct extent_buffer *eb) +{ +again: + wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + wait_event(eb->read_lock_wq, + atomic_read(&eb->blocking_readers) == 0); + goto again; + } + if (atomic_read(&eb->blocking_writers)) { + write_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; + } + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + atomic_inc(&eb->write_locks); + eb->lock_owner = current->pid; +} + +/* + * drop a spinning or a blocking write lock. + */ +void btrfs_tree_unlock(struct extent_buffer *eb) +{ + int blockers = atomic_read(&eb->blocking_writers); + + BUG_ON(blockers > 1); + + btrfs_assert_tree_locked(eb); + eb->lock_owner = 0; + atomic_dec(&eb->write_locks); + + if (blockers) { + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_dec(&eb->blocking_writers); + smp_mb(); + if (waitqueue_active(&eb->write_lock_wq)) + wake_up(&eb->write_lock_wq); + } else { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + write_unlock(&eb->lock); + } +} + +void btrfs_assert_tree_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->write_locks)); +} + +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->read_locks)); +} diff --git a/kernel/fs/btrfs/locking.h b/kernel/fs/btrfs/locking.h new file mode 100644 index 000000000..c44a9d5f5 --- /dev/null +++ b/kernel/fs/btrfs/locking.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +#define BTRFS_WRITE_LOCK 1 +#define BTRFS_READ_LOCK 2 +#define BTRFS_WRITE_LOCK_BLOCKING 3 +#define BTRFS_READ_LOCK_BLOCKING 4 + +void btrfs_tree_lock(struct extent_buffer *eb); +void btrfs_tree_unlock(struct extent_buffer *eb); + +void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_unlock(struct extent_buffer *eb); +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_assert_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_read_lock(struct extent_buffer *eb); +int btrfs_try_tree_write_lock(struct extent_buffer *eb); +int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); + + +static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) +{ + if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) + btrfs_tree_unlock(eb); + else if (rw == BTRFS_READ_LOCK_BLOCKING) + btrfs_tree_read_unlock_blocking(eb); + else if (rw == BTRFS_READ_LOCK) + btrfs_tree_read_unlock(eb); + else + BUG(); +} + +static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); +} + +static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); +} +#endif diff --git a/kernel/fs/btrfs/lzo.c b/kernel/fs/btrfs/lzo.c new file mode 100644 index 000000000..a2f051347 --- /dev/null +++ b/kernel/fs/btrfs/lzo.c @@ -0,0 +1,443 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where decompressed data goes */ + void *cbuf; /* where compressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", + ret); + ret = -EIO; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. + */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) { + ret = -E2BIG; + goto out; + } + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + bool may_late_unmap, need_unmap; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + may_late_unmap = need_unmap = false; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + may_late_unmap = true; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + if (page_in_index + 1 >= total_pages_in) { + ret = -EIO; + goto done; + } + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "BTRFS: decompress failed\n"); + ret = -EIO; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + kunmap(pages_in[page_in_index]); + if (!ret) + btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "BTRFS: decompress failed!\n"); + ret = -EIO; + goto out; + } + + if (out_len < start_byte) { + ret = -EIO; + goto out; + } + + /* + * the caller is already checking against PAGE_SIZE, but lets + * move this check closer to the memcpy/memset + */ + destlen = min_t(unsigned long, destlen, PAGE_SIZE); + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page); + memcpy(kaddr, workspace->buf + start_byte, bytes); + + /* + * btrfs_getblock is doing a zero on the tail of the page too, + * but this will cover anything missing from the decompressed + * data. + */ + if (bytes < destlen) + memset(kaddr+bytes, 0, destlen-bytes); + kunmap_atomic(kaddr); +out: + return ret; +} + +const struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/kernel/fs/btrfs/math.h b/kernel/fs/btrfs/math.h new file mode 100644 index 000000000..1b10a3cd1 --- /dev/null +++ b/kernel/fs/btrfs/math.h @@ -0,0 +1,42 @@ + +/* + * Copyright (C) 2012 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_MATH_H +#define __BTRFS_MATH_H + +#include + +static inline u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + return div_u64(num, 10); +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + return div_u64(num, 100); +} + +#endif diff --git a/kernel/fs/btrfs/ordered-data.c b/kernel/fs/btrfs/ordered-data.c new file mode 100644 index 000000000..760c4a5e0 --- /dev/null +++ b/kernel/fs/btrfs/ordered-data.c @@ -0,0 +1,1051 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" +#include "disk-io.h" + +static struct kmem_cache *btrfs_ordered_extent_cache; + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->len < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->len; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static void ordered_data_tree_panic(struct inode *inode, int errno, + u64 offset) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " + "%llu", offset); +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +/* + * helper to check if a given offset is inside a given entry + */ +static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) +{ + if (file_offset < entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev = NULL; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (offset_in_entry(entry, file_offset)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +/* allocate and add a new ordered_extent into the per-inode tree. + * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int dio, int compress_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + + tree = &BTRFS_I(inode)->ordered_tree; + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); + if (!entry) + return -ENOMEM; + + entry->file_offset = file_offset; + entry->start = start; + entry->len = len; + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && + !(type == BTRFS_ORDERED_NOCOW)) + entry->csum_bytes_left = disk_len; + entry->disk_len = disk_len; + entry->bytes_left = len; + entry->inode = igrab(inode); + entry->compress_type = compress_type; + entry->truncated_len = (u64)-1; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + + if (dio) + set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + + /* one ref for the tree */ + atomic_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); + INIT_LIST_HEAD(&entry->work_list); + init_completion(&entry->completion); + INIT_LIST_HEAD(&entry->log_list); + INIT_LIST_HEAD(&entry->trans_list); + + trace_btrfs_ordered_extent_add(inode, entry); + + spin_lock_irq(&tree->lock); + node = tree_insert(&tree->tree, file_offset, + &entry->rb_node); + if (node) + ordered_data_tree_panic(inode, -EEXIST, file_offset); + spin_unlock_irq(&tree->lock); + + spin_lock(&root->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &root->ordered_extents); + root->nr_ordered_extents++; + if (root->nr_ordered_extents == 1) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(!list_empty(&root->ordered_root)); + list_add_tail(&root->ordered_root, + &root->fs_info->ordered_roots); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); + + return 0; +} + +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + list_add_tail(&sum->list, &entry->list); + WARN_ON(entry->csum_bytes_left < sum->len); + entry->csum_bytes_left -= sum->len; + if (entry->csum_bytes_left == 0) + wake_up(&entry->wait); + spin_unlock_irq(&tree->lock); +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO may span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete. This allows you to walk forward in the file. + */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size, int uptodate) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + unsigned long flags; + u64 dec_end; + u64 dec_start; + u64 to_dec; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irqsave(&tree->lock, flags); + node = tree_search(tree, *file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, *file_offset)) { + ret = 1; + goto out; + } + + dec_start = max(*file_offset, entry->file_offset); + dec_end = min(*file_offset + io_size, entry->file_offset + + entry->len); + *file_offset = dec_end; + if (dec_start > dec_end) { + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordering dec_start %llu end %llu", dec_start, dec_end); + } + to_dec = dec_end - dec_start; + if (to_dec > entry->bytes_left) { + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, to_dec); + } + entry->bytes_left -= to_dec; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + + if (entry->bytes_left == 0) { + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { + ret = 1; + } +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock_irqrestore(&tree->lock, flags); + return ret == 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ +int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irqsave(&tree->lock, flags); + if (cached && *cached) { + entry = *cached; + goto have_entry; + } + + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry: + if (!offset_in_entry(entry, file_offset)) { + ret = 1; + goto out; + } + + if (io_size > entry->bytes_left) { + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, io_size); + } + entry->bytes_left -= io_size; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + + if (entry->bytes_left == 0) { + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { + ret = 1; + } +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock_irqrestore(&tree->lock, flags); + return ret == 0; +} + +/* Needs to either be called under a log transaction or the log_mutex */ +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list, + const loff_t start, + const loff_t end) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_extent *ordered; + struct rb_node *n; + struct rb_node *prev; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + n = __tree_search(&tree->tree, end, &prev); + if (!n) + n = prev; + for (; n; n = rb_prev(n)) { + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + if (ordered->file_offset > end) + continue; + if (entry_end(ordered) <= start) + break; + if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + list_add(&ordered->log_list, logged_list); + atomic_inc(&ordered->refs); + } + spin_unlock_irq(&tree->lock); +} + +void btrfs_put_logged_extents(struct list_head *logged_list) +{ + struct btrfs_ordered_extent *ordered; + + while (!list_empty(logged_list)) { + ordered = list_first_entry(logged_list, + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log) +{ + int index = log->log_transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + list_splice_tail(logged_list, &log->logged_list[index]); + spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + struct inode *inode = ordered->inode; + u64 start = ordered->file_offset; + u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(!inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, + &ordered->flags)); + + list_add_tail(&ordered->trans_list, &trans->ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + trace_btrfs_ordered_extent_put(entry->inode, entry); + + if (atomic_dec_and_test(&entry->refs)) { + if (entry->inode) + btrfs_add_delayed_iput(entry->inode); + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } + kmem_cache_free(btrfs_ordered_extent_cache, entry); + } +} + +/* + * remove an ordered extent from the tree. No references are dropped + * and waiters are woken up. + */ +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + node = &entry->rb_node; + rb_erase(node, &tree->tree); + if (tree->last == node) + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_unlock_irq(&tree->lock); + + spin_lock(&root->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + root->nr_ordered_extents--; + + trace_btrfs_ordered_extent_remove(inode, entry); + + if (!root->nr_ordered_extents) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(list_empty(&root->ordered_root)); + list_del_init(&root->ordered_root); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); + wake_up(&entry->wait); +} + +static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered; + + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); + btrfs_start_ordered_extent(ordered->inode, ordered, 1); + complete(&ordered->completion); +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +{ + struct list_head splice, works; + struct btrfs_ordered_extent *ordered, *next; + int count = 0; + + INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); + + mutex_lock(&root->ordered_extent_mutex); + spin_lock(&root->ordered_extent_lock); + list_splice_init(&root->ordered_extents, &splice); + while (!list_empty(&splice) && nr) { + ordered = list_first_entry(&splice, struct btrfs_ordered_extent, + root_extent_list); + list_move_tail(&ordered->root_extent_list, + &root->ordered_extents); + atomic_inc(&ordered->refs); + spin_unlock(&root->ordered_extent_lock); + + btrfs_init_work(&ordered->flush_work, + btrfs_flush_delalloc_helper, + btrfs_run_ordered_extent_work, NULL, NULL); + list_add_tail(&ordered->work_list, &works); + btrfs_queue_work(root->fs_info->flush_workers, + &ordered->flush_work); + + cond_resched(); + spin_lock(&root->ordered_extent_lock); + if (nr != -1) + nr--; + count++; + } + list_splice_tail(&splice, &root->ordered_extents); + spin_unlock(&root->ordered_extent_lock); + + list_for_each_entry_safe(ordered, next, &works, work_list) { + list_del_init(&ordered->work_list); + wait_for_completion(&ordered->completion); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + mutex_unlock(&root->ordered_extent_mutex); + + return count; +} + +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_root *root; + struct list_head splice; + int done; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&fs_info->ordered_operations_mutex); + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice) && nr) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + + done = btrfs_wait_ordered_extents(root, nr); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->ordered_root_lock); + if (nr != -1) { + nr -= done; + WARN_ON(nr < 0); + } + } + list_splice_tail(&splice, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + mutex_unlock(&fs_info->ordered_operations_mutex); +} + +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) +{ + u64 start = entry->file_offset; + u64 end = start + entry->len - 1; + + trace_btrfs_ordered_extent_start(inode, entry); + + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for the flusher thread to find them + */ + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->i_mapping, start, end); + if (wait) { + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &entry->flags)); + } +} + +/* + * Used to wait on ordered extents across a large range of bytes. + */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + int ret = 0; + int ret_wb = 0; + u64 end; + u64 orig_end; + struct btrfs_ordered_extent *ordered; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } + + /* start IO across the range first to instantiate any delalloc + * extents + */ + ret = btrfs_fdatawrite_range(inode, start, orig_end); + if (ret) + return ret; + + /* + * If we have a writeback error don't return immediately. Wait first + * for any ordered extents that haven't completed yet. This is to make + * sure no one can dirty the same page ranges and call writepages() + * before the ordered extents complete - to avoid failures (-EEXIST) + * when adding the new ordered extents to the ordered tree. + */ + ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); + + end = orig_end; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->len <= start) { + btrfs_put_ordered_extent(ordered); + break; + } + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + ret = -EIO; + btrfs_put_ordered_extent(ordered); + if (ret || end == 0 || end == start) + break; + end--; + } + return ret_wb ? ret_wb : ret; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) + entry = NULL; + if (entry) + atomic_inc(&entry->refs); +out: + spin_unlock_irq(&tree->lock); + return entry; +} + +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) + atomic_inc(&entry->refs); + spin_unlock_irq(&tree->lock); + return entry; +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + atomic_inc(&entry->refs); +out: + spin_unlock_irq(&tree->lock); + return entry; +} + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. + */ +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size = i_size_read(inode); + struct rb_node *node; + struct rb_node *prev = NULL; + struct btrfs_ordered_extent *test; + int ret = 1; + + spin_lock_irq(&tree->lock); + if (ordered) { + offset = entry_end(ordered); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) + offset = min(offset, + ordered->file_offset + + ordered->truncated_len); + } else { + offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); + } + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* truncate file */ + if (disk_i_size > i_size) { + BTRFS_I(inode)->disk_i_size = i_size; + ret = 0; + goto out; + } + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size == i_size) + goto out; + + /* + * We still need to update disk_i_size if outstanding_isize is greater + * than disk_i_size. + */ + if (offset <= disk_i_size && + (!ordered || ordered->outstanding_isize <= disk_i_size)) + goto out; + + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + if (ordered) { + node = rb_prev(&ordered->rb_node); + } else { + prev = tree_search(tree, offset); + /* + * we insert file extents without involving ordered struct, + * so there should be no ordered struct cover this offset + */ + if (prev) { + test = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + BUG_ON(offset_in_entry(test, offset)); + } + node = prev; + } + for (; node; node = rb_prev(node)) { + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + /* We treat this entry as if it doesnt exist */ + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= i_size) + break; + if (entry_end(test) > disk_i_size) { + /* + * we don't update disk_i_size now, so record this + * undealt i_size. Or we will not know the real + * i_size. + */ + if (test->outstanding_isize < offset) + test->outstanding_isize = offset; + if (ordered && + ordered->outstanding_isize > + test->outstanding_isize) + test->outstanding_isize = + ordered->outstanding_isize; + goto out; + } + } + new_i_size = min_t(u64, offset, i_size); + + /* + * Some ordered extents may completed before the current one, and + * we hold the real i_size in ->outstanding_isize. + */ + if (ordered && ordered->outstanding_isize > new_i_size) + new_i_size = min_t(u64, ordered->outstanding_isize, i_size); + BTRFS_I(inode)->disk_i_size = new_i_size; + ret = 0; +out: + /* + * We need to do this because we can't remove ordered extents until + * after the i_disk_size has been updated and then the inode has been + * updated to reflect the change, so we need to tell anybody who finds + * this ordered extent that we've already done all the real work, we + * just haven't completed all the other work. + */ + if (ordered) + set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); + spin_unlock_irq(&tree->lock); + return ret; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum, int len) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int index = 0; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 0; + + spin_lock_irq(&tree->lock); + list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { + if (disk_bytenr >= ordered_sum->bytenr && + disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { + i = (disk_bytenr - ordered_sum->bytenr) >> + inode->i_sb->s_blocksize_bits; + num_sectors = ordered_sum->len >> + inode->i_sb->s_blocksize_bits; + num_sectors = min_t(int, len - index, num_sectors - i); + memcpy(sum + index, ordered_sum->sums + i, + num_sectors); + + index += (int)num_sectors; + if (index == len) + goto out; + disk_bytenr += num_sectors * sectorsize; + } + } +out: + spin_unlock_irq(&tree->lock); + btrfs_put_ordered_extent(ordered); + return index; +} + +int __init ordered_data_init(void) +{ + btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", + sizeof(struct btrfs_ordered_extent), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_ordered_extent_cache) + return -ENOMEM; + + return 0; +} + +void ordered_data_exit(void) +{ + if (btrfs_ordered_extent_cache) + kmem_cache_destroy(btrfs_ordered_extent_cache); +} diff --git a/kernel/fs/btrfs/ordered-data.h b/kernel/fs/btrfs/ordered-data.h new file mode 100644 index 000000000..e96cd4ccd --- /dev/null +++ b/kernel/fs/btrfs/ordered-data.h @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + spinlock_t lock; + struct rb_root tree; + struct rb_node *last; +}; + +struct btrfs_ordered_sum { + /* bytenr is the start of this extent on disk */ + u64 bytenr; + + /* + * this is the length in bytes covered by the sums array below. + */ + int len; + struct list_head list; + /* last field is a variable length array of csums */ + u32 sums[]; +}; + +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. + * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. + */ +#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + +#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ + +#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ + +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ + +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ + +#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ + +#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ + +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent + * has done its due diligence in updating + * the isize. */ +#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered + ordered extent */ +#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ + +#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent + * in the logging code. */ +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* disk byte number */ + u64 start; + + /* ram length of the extent in bytes */ + u64 len; + + /* extent length on disk */ + u64 disk_len; + + /* number of bytes that still need writing */ + u64 bytes_left; + + /* number of bytes that still need csumming */ + u64 csum_bytes_left; + + /* + * the end of the ordered extent which is behind it but + * didn't update disk_i_size. Please see the comment of + * btrfs_ordered_update_i_size(); + */ + u64 outstanding_isize; + + /* + * If we get truncated we need to adjust the file extent we enter for + * this ordered extent so that we do not expose stale data. + */ + u64 truncated_len; + + /* flags (described above) */ + unsigned long flags; + + /* compression algorithm */ + int compress_type; + + /* reference count */ + atomic_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* If we need to wait on this to be done */ + struct list_head log_list; + + /* If the transaction needs to wait on this ordered extent */ + struct list_head trans_list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; + + struct btrfs_work work; + + struct completion completion; + struct btrfs_work flush_work; + struct list_head work_list; +}; + +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ +static inline int btrfs_ordered_sum_size(struct btrfs_root *root, + unsigned long bytes) +{ + int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize); + return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32); +} + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + spin_lock_init(&t->lock); + t->tree = RB_ROOT; + t->last = NULL; +} + +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry); +int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate); +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size, + int uptodate); +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len); +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum, int len); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list, + const loff_t start, + const loff_t end); +void btrfs_put_logged_extents(struct list_head *logged_list); +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log); +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid); +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); +int __init ordered_data_init(void); +void ordered_data_exit(void); +#endif diff --git a/kernel/fs/btrfs/orphan.c b/kernel/fs/btrfs/orphan.c new file mode 100644 index 000000000..47767d5b8 --- /dev/null +++ b/kernel/fs/btrfs/orphan.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret) { /* JDM: Really? */ + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/kernel/fs/btrfs/print-tree.c b/kernel/fs/btrfs/print-tree.c new file mode 100644 index 000000000..647ab12fd --- /dev/null +++ b/kernel/fs/btrfs/print-tree.c @@ -0,0 +1,349 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " + "num_stripes %d\n", + btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk), + btrfs_chunk_type(eb, chunk), num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, + btrfs_stripe_devid_nr(eb, chunk, i), + btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + printk(KERN_INFO "\t\tdev item devid %llu " + "total_bytes %llu bytes used %llu\n", + btrfs_device_id(eb, dev_item), + btrfs_device_total_bytes(eb, dev_item), + btrfs_device_bytes_used(eb, dev_item)); +} +static void print_extent_data_ref(struct extent_buffer *eb, + struct btrfs_extent_data_ref *ref) +{ + printk(KERN_INFO "\t\textent data backref root %llu " + "objectid %llu offset %llu count %u\n", + btrfs_extent_data_ref_root(eb, ref), + btrfs_extent_data_ref_objectid(eb, ref), + btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_count(eb, ref)); +} + +static void print_extent_item(struct extent_buffer *eb, int slot, int type) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; + u32 item_size = btrfs_item_size_nr(eb, slot); + u64 flags; + u64 offset; + + if (item_size < sizeof(*ei)) { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); + printk(KERN_INFO "\t\textent refs %u\n", + btrfs_extent_refs_v0(eb, ei0)); + return; +#else + BUG(); +#endif + } + + ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", + btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), + flags); + + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + info = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_tree_block_key(eb, info, &key); + printk(KERN_INFO "\t\ttree block key (%llu %u %llu) " + "level %d\n", + btrfs_disk_key_objectid(&key), key.type, + btrfs_disk_key_offset(&key), + btrfs_tree_block_level(eb, info)); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(eb, iref); + offset = btrfs_extent_inline_ref_offset(eb, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref " + "root %llu\n", offset); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref " + "parent %llu\n", offset); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + print_extent_data_ref(eb, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + printk(KERN_INFO "\t\tshared data backref " + "parent %llu count %u\n", + offset, btrfs_shared_data_ref_count(eb, sref)); + break; + default: + BUG(); + } + ptr += btrfs_extent_inline_ref_size(type); + } + WARN_ON(ptr > end); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static void print_extent_ref_v0(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_ref_v0 *ref0; + + ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); + printk("\t\textent back ref root %llu gen %llu " + "owner %llu num_refs %lu\n", + btrfs_ref_root_v0(eb, ref0), + btrfs_ref_generation_v0(eb, ref0), + btrfs_ref_objectid_v0(eb, ref0), + (unsigned long)btrfs_ref_count_v0(eb, ref0)); +} +#endif + +static void print_uuid_item(struct extent_buffer *l, unsigned long offset, + u32 item_size) +{ + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("BTRFS: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + return; + } + while (item_size) { + __le64 subvol_id; + + read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id)); + printk(KERN_INFO "\t\tsubvol_id %llu\n", + (unsigned long long)le64_to_cpu(subvol_id)); + item_size -= sizeof(u64); + offset += sizeof(u64); + } +} + +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +{ + int i; + u32 type, nr; + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_dev_extent *dev_extent; + struct btrfs_key key; + struct btrfs_key found_key; + + if (!l) + return; + + nr = btrfs_header_nritems(l); + + btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", + btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); + for (i = 0 ; i < nr ; i++) { + item = btrfs_item_nr(i); + btrfs_item_key_to_cpu(l, &key, i); + type = key.type; + printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " + "itemsize %d\n", + i, key.objectid, type, key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + printk(KERN_INFO "\t\tinode generation %llu size %llu " + "mode %o\n", + btrfs_inode_generation(l, ii), + btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + printk(KERN_INFO "\t\tdir oid %llu type %u\n", + found_key.objectid, + btrfs_dir_type(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref\n"); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref\n"); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(l, i, + struct btrfs_extent_data_ref); + print_extent_data_ref(l, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(l, i, + struct btrfs_shared_data_ref); + printk(KERN_INFO "\t\tshared data backref count %u\n", + btrfs_shared_data_ref_count(l, sref)); + break; + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + BTRFS_FILE_EXTENT_INLINE) { + printk(KERN_INFO "\t\tinline extent data " + "size %u\n", + btrfs_file_extent_inline_len(l, i, fi)); + break; + } + printk(KERN_INFO "\t\textent data disk bytenr %llu " + "nr %llu\n", + btrfs_file_extent_disk_bytenr(l, fi), + btrfs_file_extent_disk_num_bytes(l, fi)); + printk(KERN_INFO "\t\textent data offset %llu " + "nr %llu ram %llu\n", + btrfs_file_extent_offset(l, fi), + btrfs_file_extent_num_bytes(l, fi), + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_EXTENT_REF_V0_KEY: +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + print_extent_ref_v0(l, i); +#else + BUG(); +#endif + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + printk(KERN_INFO "\t\tblock group used %llu\n", + btrfs_disk_block_group_used(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + btrfs_dev_extent_chunk_tree(l, dev_extent), + btrfs_dev_extent_chunk_objectid(l, dev_extent), + btrfs_dev_extent_chunk_offset(l, dev_extent), + btrfs_dev_extent_length(l, dev_extent)); + break; + case BTRFS_DEV_STATS_KEY: + printk(KERN_INFO "\t\tdevice stats\n"); + break; + case BTRFS_DEV_REPLACE_KEY: + printk(KERN_INFO "\t\tdev replace\n"); + break; + case BTRFS_UUID_KEY_SUBVOL: + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + print_uuid_item(l, btrfs_item_ptr_offset(l, i), + btrfs_item_size_nr(l, i)); + break; + }; + } +} + +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +{ + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(root, c); + return; + } + btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u", + btrfs_header_bytenr(c), level, nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", + i, key.objectid, key.type, key.offset, + btrfs_node_blockptr(c, i)); + } + for (i = 0; i < nr; i++) { + struct extent_buffer *next = read_tree_block(root, + btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i)); + if (btrfs_is_leaf(next) && + level != 1) + BUG(); + if (btrfs_header_level(next) != + level - 1) + BUG(); + btrfs_print_tree(root, next); + free_extent_buffer(next); + } +} diff --git a/kernel/fs/btrfs/print-tree.h b/kernel/fs/btrfs/print-tree.h new file mode 100644 index 000000000..7faddfacc --- /dev/null +++ b/kernel/fs/btrfs/print-tree.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c); +#endif diff --git a/kernel/fs/btrfs/props.c b/kernel/fs/btrfs/props.c new file mode 100644 index 000000000..dca137b04 --- /dev/null +++ b/kernel/fs/btrfs/props.c @@ -0,0 +1,429 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "props.h" +#include "btrfs_inode.h" +#include "hash.h" +#include "transaction.h" +#include "xattr.h" + +#define BTRFS_PROP_HANDLERS_HT_BITS 8 +static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); + +struct prop_handler { + struct hlist_node node; + const char *xattr_name; + int (*validate)(const char *value, size_t len); + int (*apply)(struct inode *inode, const char *value, size_t len); + const char *(*extract)(struct inode *inode); + int inheritable; +}; + +static int prop_compression_validate(const char *value, size_t len); +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len); +static const char *prop_compression_extract(struct inode *inode); + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .inheritable = 1 + }, + { + .xattr_name = NULL + } +}; + +void __init btrfs_props_init(void) +{ + struct prop_handler *p; + + hash_init(prop_handlers_ht); + + for (p = &prop_handlers[0]; p->xattr_name; p++) { + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); + + hash_add(prop_handlers_ht, &p->node, h); + } +} + +static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) +{ + struct hlist_head *h; + + h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)]; + if (hlist_empty(h)) + return NULL; + + return h; +} + +static const struct prop_handler * +find_prop_handler(const char *name, + const struct hlist_head *handlers) +{ + struct prop_handler *h; + + if (!handlers) { + u64 hash = btrfs_name_hash(name, strlen(name)); + + handlers = find_prop_handlers_by_hash(hash); + if (!handlers) + return NULL; + } + + hlist_for_each_entry(h, handlers, node) + if (!strcmp(h->xattr_name, name)) + return h; + + return NULL; +} + +static int __btrfs_set_prop(struct btrfs_trans_handle *trans, + struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) + return -EINVAL; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + + if (value_len == 0) { + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + if (ret) + return ret; + + ret = handler->apply(inode, NULL, 0); + ASSERT(ret == 0); + + return ret; + } + + ret = handler->validate(value, value_len); + if (ret) + return ret; + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + value, value_len, flags); + if (ret) + return ret; + ret = handler->apply(inode, value, value_len); + if (ret) { + __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + return ret; + } + + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + + return 0; +} + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + return __btrfs_set_prop(NULL, inode, name, value, value_len, flags); +} + +static int iterate_object_props(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, + void (*iterator)(void *, + const struct prop_handler *, + const char *, + size_t), + void *ctx) +{ + int ret; + char *name_buf = NULL; + char *value_buf = NULL; + int name_buf_len = 0; + int value_buf_len = 0; + + while (1) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + u32 total_len, cur, this_len; + int slot; + const struct hlist_head *handlers; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid) + break; + if (key.type != BTRFS_XATTR_ITEM_KEY) + break; + + handlers = find_prop_handlers_by_hash(key.offset); + if (!handlers) + goto next_slot; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; + total_len = btrfs_item_size_nr(leaf, slot); + + while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); + u32 data_len = btrfs_dir_data_len(leaf, di); + unsigned long name_ptr, data_ptr; + const struct prop_handler *handler; + + this_len = sizeof(*di) + name_len + data_len; + name_ptr = (unsigned long)(di + 1); + data_ptr = name_ptr + name_len; + + if (name_len <= XATTR_BTRFS_PREFIX_LEN || + memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, + name_ptr, + XATTR_BTRFS_PREFIX_LEN)) + goto next_dir_item; + + if (name_len >= name_buf_len) { + kfree(name_buf); + name_buf_len = name_len + 1; + name_buf = kmalloc(name_buf_len, GFP_NOFS); + if (!name_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, name_buf, name_ptr, name_len); + name_buf[name_len] = '\0'; + + handler = find_prop_handler(name_buf, handlers); + if (!handler) + goto next_dir_item; + + if (data_len > value_buf_len) { + kfree(value_buf); + value_buf_len = data_len; + value_buf = kmalloc(data_len, GFP_NOFS); + if (!value_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, value_buf, data_ptr, data_len); + + iterator(ctx, handler, value_buf, data_len); +next_dir_item: + cur += this_len; + di = (struct btrfs_dir_item *)((char *) di + this_len); + } + +next_slot: + path->slots[0]++; + } + + ret = 0; +out: + btrfs_release_path(path); + kfree(name_buf); + kfree(value_buf); + + return ret; +} + +static void inode_prop_iterator(void *ctx, + const struct prop_handler *handler, + const char *value, + size_t len) +{ + struct inode *inode = ctx; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = handler->apply(inode, value, len); + if (unlikely(ret)) + btrfs_warn(root->fs_info, + "error applying prop %s to ino %llu (root %llu): %d", + handler->xattr_name, btrfs_ino(inode), + root->root_key.objectid, ret); + else + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); +} + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino = btrfs_ino(inode); + int ret; + + ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); + + return ret; +} + +static int inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *parent) +{ + const struct prop_handler *h; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!test_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(parent)->runtime_flags)) + return 0; + + for (h = &prop_handlers[0]; h->xattr_name; h++) { + const char *value; + u64 num_bytes; + + if (!h->inheritable) + continue; + + value = h->extract(parent); + if (!value) + continue; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + num_bytes, BTRFS_RESERVE_NO_FLUSH); + if (ret) + goto out; + ret = __btrfs_set_prop(trans, inode, h->xattr_name, + value, strlen(value), 0); + btrfs_block_rsv_release(root, trans->block_rsv, num_bytes); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir) +{ + if (!dir) + return 0; + + return inherit_props(trans, inode, dir); +} + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root) +{ + struct btrfs_key key; + struct inode *parent_inode, *child_inode; + int ret; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, + parent_root, NULL); + if (IS_ERR(parent_inode)) + return PTR_ERR(parent_inode); + + child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(child_inode)) { + iput(parent_inode); + return PTR_ERR(child_inode); + } + + ret = inherit_props(trans, child_inode, parent_inode); + iput(child_inode); + iput(parent_inode); + + return ret; +} + +static int prop_compression_validate(const char *value, size_t len) +{ + if (!strncmp("lzo", value, len)) + return 0; + else if (!strncmp("zlib", value, len)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len) +{ + int type; + + if (len == 0) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, len)) + type = BTRFS_COMPRESS_LZO; + else if (!strncmp("zlib", value, len)) + type = BTRFS_COMPRESS_ZLIB; + else + return -EINVAL; + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = type; + + return 0; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->force_compress) { + case BTRFS_COMPRESS_ZLIB: + return "zlib"; + case BTRFS_COMPRESS_LZO: + return "lzo"; + } + + return NULL; +} + + diff --git a/kernel/fs/btrfs/props.h b/kernel/fs/btrfs/props.h new file mode 100644 index 000000000..100f18829 --- /dev/null +++ b/kernel/fs/btrfs/props.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_PROPS_H +#define __BTRFS_PROPS_H + +#include "ctree.h" + +void __init btrfs_props_init(void); + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags); + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir); + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root); + +#endif diff --git a/kernel/fs/btrfs/qgroup.c b/kernel/fs/btrfs/qgroup.c new file mode 100644 index 000000000..3d6546581 --- /dev/null +++ b/kernel/fs/btrfs/qgroup.c @@ -0,0 +1,2966 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "ulist.h" +#include "backref.h" +#include "extent_io.h" +#include "qgroup.h" + +/* TODO XXX FIXME + * - subvol delete -> delete when ref goes to 0? delete limits also? + * - reorganize keys + * - compressed + * - sync + * - copy also limits on subvol creation + * - limit + * - caches fuer ulists + * - performance benchmarks + * - check all ioctl parameters + */ + +/* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + */ + u64 old_refcnt; + u64 new_refcnt; +}; + +/* + * glue structure to represent the relations between qgroups. + */ +struct btrfs_qgroup_list { + struct list_head next_group; + struct list_head next_member; + struct btrfs_qgroup *group; + struct btrfs_qgroup *member; +}; + +#define ptr_to_u64(x) ((u64)(uintptr_t)x) +#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) + +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags); +static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); + +/* must be called with qgroup_ioctl_lock held */ +static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node *n = fs_info->qgroup_tree.rb_node; + struct btrfs_qgroup *qgroup; + + while (n) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + if (qgroup->qgroupid < qgroupid) + n = n->rb_left; + else if (qgroup->qgroupid > qgroupid) + n = n->rb_right; + else + return qgroup; + } + return NULL; +} + +/* must be called with qgroup_lock held */ +static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node **p = &fs_info->qgroup_tree.rb_node; + struct rb_node *parent = NULL; + struct btrfs_qgroup *qgroup; + + while (*p) { + parent = *p; + qgroup = rb_entry(parent, struct btrfs_qgroup, node); + + if (qgroup->qgroupid < qgroupid) + p = &(*p)->rb_left; + else if (qgroup->qgroupid > qgroupid) + p = &(*p)->rb_right; + else + return qgroup; + } + + qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); + if (!qgroup) + return ERR_PTR(-ENOMEM); + + qgroup->qgroupid = qgroupid; + INIT_LIST_HEAD(&qgroup->groups); + INIT_LIST_HEAD(&qgroup->members); + INIT_LIST_HEAD(&qgroup->dirty); + + rb_link_node(&qgroup->node, parent, p); + rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); + + return qgroup; +} + +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) +{ + struct btrfs_qgroup_list *list; + + list_del(&qgroup->dirty); + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + + while (!list_empty(&qgroup->members)) { + list = list_first_entry(&qgroup->members, + struct btrfs_qgroup_list, next_member); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + kfree(qgroup); +} + +/* must be called with qgroup_lock held */ +static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); + + if (!qgroup) + return -ENOENT; + + rb_erase(&qgroup->node, &fs_info->qgroup_tree); + __del_qgroup_rb(qgroup); + return 0; +} + +/* must be called with qgroup_lock held */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list = kzalloc(sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + list->group = parent; + list->member = member; + list_add_tail(&list->next_group, &member->groups); + list_add_tail(&list->next_member, &parent->members); + + return 0; +} + +/* must be called with qgroup_lock held */ +static int del_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + return 0; + } + } + return -ENOENT; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl) +{ + struct btrfs_qgroup *qgroup; + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) + return -EINVAL; + if (qgroup->rfer != rfer || qgroup->excl != excl) + return -EINVAL; + return 0; +} +#endif + +/* + * The full config is read in one go, only called from open_ctree() + * It doesn't use any locking, as at this point we're still single-threaded + */ +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_path *path = NULL; + struct extent_buffer *l; + int slot; + int ret = 0; + u64 flags = 0; + u64 rescan_progress = 0; + + if (!fs_info->quota_enabled) + return 0; + + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* default this to quota off, in case no status key is found */ + fs_info->qgroup_flags = 0; + + /* + * pass 1: read status, all qgroup infos and limits + */ + key.objectid = 0; + key.type = 0; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); + if (ret) + goto out; + + while (1) { + struct btrfs_qgroup *qgroup; + + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { + struct btrfs_qgroup_status_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_status_item); + + if (btrfs_qgroup_status_version(l, ptr) != + BTRFS_QGROUP_STATUS_VERSION) { + btrfs_err(fs_info, + "old qgroup version, quota disabled"); + goto out; + } + if (btrfs_qgroup_status_generation(l, ptr) != + fs_info->generation) { + flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_err(fs_info, + "qgroup generation mismatch, " + "marked as inconsistent"); + } + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, + ptr); + rescan_progress = btrfs_qgroup_status_rescan(l, ptr); + goto next1; + } + + if (found_key.type != BTRFS_QGROUP_INFO_KEY && + found_key.type != BTRFS_QGROUP_LIMIT_KEY) + goto next1; + + qgroup = find_qgroup_rb(fs_info, found_key.offset); + if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || + (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { + btrfs_err(fs_info, "inconsitent qgroup config"); + flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + if (!qgroup) { + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out; + } + } + switch (found_key.type) { + case BTRFS_QGROUP_INFO_KEY: { + struct btrfs_qgroup_info_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_info_item); + qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); + qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); + qgroup->excl = btrfs_qgroup_info_excl(l, ptr); + qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); + /* generation currently unused */ + break; + } + case BTRFS_QGROUP_LIMIT_KEY: { + struct btrfs_qgroup_limit_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_limit_item); + qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); + qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); + qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); + qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); + qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); + break; + } + } +next1: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } + btrfs_release_path(path); + + /* + * pass 2: read all qgroup relations + */ + key.objectid = 0; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); + if (ret) + goto out; + while (1) { + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type != BTRFS_QGROUP_RELATION_KEY) + goto next2; + + if (found_key.objectid > found_key.offset) { + /* parent <- member, not needed to build config */ + /* FIXME should we omit the key completely? */ + goto next2; + } + + ret = add_relation_rb(fs_info, found_key.objectid, + found_key.offset); + if (ret == -ENOENT) { + btrfs_warn(fs_info, + "orphan qgroup relation 0x%llx->0x%llx", + found_key.objectid, found_key.offset); + ret = 0; /* ignore the error */ + } + if (ret) + goto out; +next2: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } +out: + fs_info->qgroup_flags |= flags; + if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && + ret >= 0) { + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); + } + btrfs_free_path(path); + + if (ret < 0) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + + return ret < 0 ? ret : 0; +} + +/* + * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), + * first two are in single-threaded paths.And for the third one, we have set + * quota_root to be null with qgroup_lock held before, so it is safe to clean + * up the in-memory structures without qgroup_lock held. + */ +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + + while ((n = rb_first(&fs_info->qgroup_tree))) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + rb_erase(n, &fs_info->qgroup_tree); + __del_qgroup_rb(qgroup); + } + /* + * we call btrfs_free_qgroup_config() when umounting + * filesystem and disabling quota, so we set qgroup_ulit + * to be null here to avoid double free. + */ + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; +} + +static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, + u64 src, u64 dst) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, + u64 src, u64 dst) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); +out: + btrfs_free_path(path); + return ret; +} + +static int add_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_qgroup_info_item *qgroup_info; + struct btrfs_qgroup_limit_item *qgroup_limit; + struct extent_buffer *leaf; + struct btrfs_key key; + + if (btrfs_test_is_dummy_root(quota_root)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + + /* + * Avoid a transaction abort by catching -EEXIST here. In that + * case, we proceed by re-initializing the existing structure + * on disk. + */ + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_info)); + if (ret && ret != -EEXIST) + goto out; + + leaf = path->nodes[0]; + qgroup_info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); + + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_limit)); + if (ret && ret != -EEXIST) + goto out; + + leaf = path->nodes[0]; + qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); + + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_limit_item *qgroup_limit; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_LIMIT_KEY; + key.offset = qgroup->qgroupid; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_info_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_info_item *qgroup_info; + int ret; + int slot; + + if (btrfs_test_is_dummy_root(root)) + return 0; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroup->qgroupid; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); + btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); + btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); + btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_status_item(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_status_item *ptr; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_generation(l, ptr, trans->transid); + btrfs_set_qgroup_status_rescan(l, ptr, + fs_info->qgroup_rescan_progress.objectid); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called with qgroup_lock held + */ +static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf = NULL; + int ret; + int nr = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + key.objectid = 0; + key.offset = 0; + key.type = 0; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + leaf = path->nodes[0]; + nr = btrfs_header_nritems(leaf); + if (!nr) + break; + /* + * delete the leaf one by one + * since the whole tree is going + * to be deleted. + */ + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); + if (ret) + goto out; + + btrfs_release_path(path); + } + ret = 0; +out: + root->fs_info->pending_quota_state = 0; + btrfs_free_path(path); + return ret; +} + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_path *path = NULL; + struct btrfs_qgroup_status_item *ptr; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_qgroup *qgroup = NULL; + int ret = 0; + int slot; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (fs_info->quota_root) { + fs_info->pending_quota_state = 1; + goto out; + } + + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + + /* + * initially create the quota tree + */ + quota_root = btrfs_create_tree(trans, fs_info, + BTRFS_QUOTA_TREE_OBJECTID); + if (IS_ERR(quota_root)) { + ret = PTR_ERR(quota_root); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out_free_root; + } + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*ptr)); + if (ret) + goto out_free_path; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); + btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_rescan(leaf, ptr, 0); + + btrfs_mark_buffer_dirty(leaf); + + key.objectid = 0; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = 0; + + btrfs_release_path(path); + ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); + if (ret > 0) + goto out_add_root; + if (ret < 0) + goto out_free_path; + + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.type == BTRFS_ROOT_REF_KEY) { + ret = add_qgroup_item(trans, quota_root, + found_key.offset); + if (ret) + goto out_free_path; + + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out_free_path; + } + } + ret = btrfs_next_item(tree_root, path); + if (ret < 0) + goto out_free_path; + if (ret) + break; + } + +out_add_root: + btrfs_release_path(path); + ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); + if (ret) + goto out_free_path; + + qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out_free_path; + } + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + fs_info->pending_quota_state = 1; + spin_unlock(&fs_info->qgroup_lock); +out_free_path: + btrfs_free_path(path); +out_free_root: + if (ret) { + free_extent_buffer(quota_root->node); + free_extent_buffer(quota_root->commit_root); + kfree(quota_root); + } +out: + if (ret) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *quota_root; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) + goto out; + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + quota_root = fs_info->quota_root; + fs_info->quota_root = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + btrfs_free_qgroup_config(fs_info); + + ret = btrfs_clean_quota_tree(trans, quota_root); + if (ret) + goto out; + + ret = btrfs_del_root(trans, tree_root, "a_root->root_key); + if (ret) + goto out; + + list_del("a_root->dirty_list); + + btrfs_tree_lock(quota_root->node); + clean_tree_block(trans, tree_root->fs_info, quota_root->node); + btrfs_tree_unlock(quota_root->node); + btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); + + free_extent_buffer(quota_root->node); + free_extent_buffer(quota_root->commit_root); + kfree(quota_root); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +static void qgroup_dirty(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (list_empty(&qgroup->dirty)) + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); +} + +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. + * + * Caller should hold fs_info->qgroup_lock. + */ +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 ref_root, + u64 num_bytes, int sign) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + return ret; +} + + +/* + * Quick path for updating qgroup with only excl refs. + * + * In that case, just update all parent will be enough. + * Or we needs to do a full rescan. + * Caller should also hold fs_info->qgroup_lock. + * + * Return 0 for quick update, return >0 for need to full rescan + * and mark INCONSISTENT flag. + * Return < 0 for other error. + */ +static int quick_update_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 src, u64 dst, + int sign) +{ + struct btrfs_qgroup *qgroup; + int ret = 1; + int err = 0; + + qgroup = find_qgroup_rb(fs_info, src); + if (!qgroup) + goto out; + if (qgroup->excl == qgroup->rfer) { + ret = 0; + err = __qgroup_excl_accounting(fs_info, tmp, dst, + qgroup->excl, sign); + if (err < 0) { + ret = err; + goto out; + } + } +out: + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; + struct ulist *tmp; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + /* Check the level of src and dst first */ + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + return -EINVAL; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + ret = -EEXIST; + goto out; + } + } + + ret = add_qgroup_relation_item(trans, quota_root, src, dst); + if (ret) + goto out; + + ret = add_qgroup_relation_item(trans, quota_root, dst, src); + if (ret) { + del_qgroup_relation_item(trans, quota_root, src, dst); + goto out; + } + + spin_lock(&fs_info->qgroup_lock); + ret = add_relation_rb(quota_root->fs_info, src, dst); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + goto out; + } + ret = quick_update_accounting(fs_info, tmp, src, dst, 1); + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + ulist_free(tmp); + return ret; +} + +int __del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; + struct ulist *tmp; + int ret = 0; + int err; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) + goto exist; + } + ret = -ENOENT; + goto out; +exist: + ret = del_qgroup_relation_item(trans, quota_root, src, dst); + err = del_qgroup_relation_item(trans, quota_root, dst, src); + if (err && !ret) + ret = err; + + spin_lock(&fs_info->qgroup_lock); + del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); + spin_unlock(&fs_info->qgroup_lock); +out: + ulist_free(tmp); + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + ret = __del_qgroup_relation(trans, fs_info, src, dst); + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + return ret; +} + +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + ret = -EEXIST; + goto out; + } + + ret = add_qgroup_item(trans, quota_root, qgroupid); + if (ret) + goto out; + + spin_lock(&fs_info->qgroup_lock); + qgroup = add_qgroup_rb(fs_info, qgroupid); + spin_unlock(&fs_info->qgroup_lock); + + if (IS_ERR(qgroup)) + ret = PTR_ERR(qgroup); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *list; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto out; + } else { + /* check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; + } + } + ret = del_qgroup_item(trans, quota_root, qgroupid); + + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + ret = __del_qgroup_relation(trans, fs_info, + qgroupid, + list->group->qgroupid); + if (ret) + goto out; + } + + spin_lock(&fs_info->qgroup_lock); + del_qgroup_rb(quota_root->fs_info, qgroupid); + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto out; + } + + spin_lock(&fs_info->qgroup_lock); + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) + qgroup->max_rfer = limit->max_rfer; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) + qgroup->max_excl = limit->max_excl; + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) + qgroup->rsv_rfer = limit->rsv_rfer; + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) + qgroup->rsv_excl = limit->rsv_excl; + qgroup->lim_flags |= limit->flags; + + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_limit_item(trans, quota_root, qgroup); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + qgroupid); + } + +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + /* + * Ignore seq and type here, we're looking for any operation + * at all related to this extent on that root. + */ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + return 0; +} + +static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node *n; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + n = fs_info->qgroup_op_tree.rb_node; + while (n) { + cur = rb_entry(n, struct btrfs_qgroup_operation, n); + cmp = comp_oper_exist(cur, oper); + if (cmp < 0) { + n = n->rb_right; + } else if (cmp) { + n = n->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} + +static int comp_oper(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return 1; + if (oper1->type < oper2->type) + return -1; + if (oper1->type > oper2->type) + return 1; + return 0; +} + +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + p = &fs_info->qgroup_op_tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct btrfs_qgroup_operation, n); + cmp = comp_oper(cur, oper); + if (cmp < 0) { + p = &(*p)->rb_right; + } else if (cmp) { + p = &(*p)->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + rb_link_node(&oper->n, parent, p); + rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} + +/* + * Record a quota operation for processing later on. + * @trans: the transaction we are adding the delayed op to. + * @fs_info: the fs_info for this fs. + * @ref_root: the root of the reference we are acting on, + * @bytenr: the bytenr we are acting on. + * @num_bytes: the number of bytes in the reference. + * @type: the type of operation this is. + * @mod_seq: do we need to get a sequence number for looking up roots. + * + * We just add it to our trans qgroup_ref_list and carry on and process these + * operations in order at some later point. If the reference root isn't a fs + * root then we don't bother with doing anything. + * + * MUST BE HOLDING THE REF LOCK. + */ +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, int mod_seq) +{ + struct btrfs_qgroup_operation *oper; + int ret; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + return 0; + + oper = kmalloc(sizeof(*oper), GFP_NOFS); + if (!oper) + return -ENOMEM; + + oper->ref_root = ref_root; + oper->bytenr = bytenr; + oper->num_bytes = num_bytes; + oper->type = type; + oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); + INIT_LIST_HEAD(&oper->elem.list); + oper->elem.seq = 0; + + trace_btrfs_qgroup_record_ref(oper); + + if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { + /* + * If any operation for this bytenr/ref_root combo + * exists, then we know it's not exclusively owned and + * shouldn't be queued up. + * + * This also catches the case where we have a cloned + * extent that gets queued up multiple times during + * drop snapshot. + */ + if (qgroup_oper_exists(fs_info, oper)) { + kfree(oper); + return 0; + } + } + + ret = insert_qgroup_oper(fs_info, oper); + if (ret) { + /* Shouldn't happen so have an assert for developers */ + ASSERT(0); + kfree(oper); + return ret; + } + list_add_tail(&oper->list, &trans->qgroup_ref_list); + + if (mod_seq) + btrfs_get_tree_mod_seq(fs_info, &oper->elem); + + return 0; +} + +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *tmp; + int sign = 0; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) + goto out; + + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + sign = 1; + break; + case BTRFS_QGROUP_OPER_SUB_EXCL: + sign = -1; + break; + default: + ASSERT(0); + } + ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, + oper->num_bytes, sign); +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(tmp); + return ret; +} + +/* + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as + * properly. + */ +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, + u64 root_to_skip, struct ulist *tmp, + struct ulist *roots, struct ulist *qgroups, + u64 seq, int *old_roots, int rescan) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + struct btrfs_qgroup *qg; + int ret; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + /* We don't count our current root here */ + if (unode->val == root_to_skip) + continue; + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + /* + * We could have a pending removal of this same ref so we may + * not have actually found our ref root when doing + * btrfs_find_all_roots, so we need to keep track of how many + * old roots we find in case we removed ours and added a + * different one at the same time. I don't think this could + * happen in practice but that sort of thinking leads to pain + * and suffering and to the dark side. + */ + (*old_roots)++; + + ulist_reinit(tmp); + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); + if (ret < 0) + return ret; + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(tmp_unode->aux); + /* + * We use this sequence number to keep from having to + * run the whole list and 0 out the refcnt every time. + * We basically use sequnce as the known 0 count and + * then add 1 everytime we see a qgroup. This is how we + * get how many of the roots actually point up to the + * upper level qgroups in order to determine exclusive + * counts. + * + * For rescan we want to set old_refcnt to seq so our + * exclusive calculations end up correct. + */ + if (rescan) + qg->old_refcnt = seq; + else if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + } + return 0; +} + +/* + * We need to walk forward in our operation tree and account for any roots that + * were deleted after we made this operation. + */ +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct ulist *tmp, + struct ulist *qgroups, u64 seq, + int *old_roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_operation *tmp_oper; + struct rb_node *n; + int ret; + + ulist_reinit(tmp); + + /* + * We only walk forward in the tree since we're only interested in + * removals that happened _after_ our operation. + */ + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + return 0; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + while (tmp_oper->bytenr == oper->bytenr) { + /* + * If it's not a removal we don't care, additions work out + * properly with our refcnt tracking. + */ + if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && + tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) + goto next; + qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); + if (!qg) + goto next; + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret) { + if (ret < 0) + return ret; + /* + * We only want to increase old_roots if this qgroup is + * not already in the list of qgroups. If it is already + * there then that means it must have been re-added or + * the delete will be discarded because we had an + * existing ref that we haven't looked up yet. In this + * case we don't want to increase old_roots. So if ret + * == 1 then we know that this is the first time we've + * seen this qgroup and we can bump the old_roots. + */ + (*old_roots)++; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + } +next: + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&tmp_oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + break; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + } + + /* Ok now process the qgroups we found */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* Add refcnt for the newly added reference. */ +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct btrfs_qgroup *qgroup, + struct ulist *tmp, struct ulist *qgroups, + u64 seq) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + int ret; + + ulist_reinit(tmp); + ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + } else { + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + } + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * This adjusts the counters for all referenced qgroups if need be. + */ +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, + u64 root_to_skip, u64 num_bytes, + struct ulist *qgroups, u64 seq, + int old_roots, int new_roots, int rescan) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + u64 cur_new_count, cur_old_count; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(qgroups, &uiter))) { + bool dirty = false; + + qg = u64_to_ptr(unode->aux); + /* + * Wasn't referenced before but is now, add to the reference + * counters. + */ + if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; + dirty = true; + } + + /* + * Was referenced before but isn't now, subtract from the + * reference counters. + */ + if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + qg->rfer -= num_bytes; + qg->rfer_cmpr -= num_bytes; + dirty = true; + } + + if (qg->old_refcnt < seq) + cur_old_count = 0; + else + cur_old_count = qg->old_refcnt - seq; + if (qg->new_refcnt < seq) + cur_new_count = 0; + else + cur_new_count = qg->new_refcnt - seq; + + /* + * If our refcount was the same as the roots previously but our + * new count isn't the same as the number of roots now then we + * went from having a exclusive reference on this range to not. + */ + if (old_roots && cur_old_count == old_roots && + (cur_new_count != new_roots || new_roots == 0)) { + WARN_ON(cur_new_count != new_roots && new_roots == 0); + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + + /* + * If we didn't reference all the roots before but now we do we + * have an exclusive reference to this range. + */ + if ((!old_roots || (old_roots && cur_old_count != old_roots)) + && cur_new_count == new_roots) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + + if (dirty) + qgroup_dirty(fs_info, qg); + } + return 0; +} + +/* + * If we removed a data extent and there were other references for that bytenr + * then we need to lookup all referenced roots to make sure we still don't + * reference this bytenr. If we do then we can just discard this operation. + */ +static int check_existing_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + oper->elem.seq, &roots); + if (ret < 0) + return ret; + ret = 0; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + if (unode->val == oper->ref_root) { + ret = 1; + break; + } + } + ulist_free(roots); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + + return ret; +} + +/* + * If we share a reference across multiple roots then we may need to adjust + * various qgroups referenced and exclusive counters. The basic premise is this + * + * 1) We have seq to represent a 0 count. Instead of looping through all of the + * qgroups and resetting their refcount to 0 we just constantly bump this + * sequence number to act as the base reference count. This means that if + * anybody is equal to or below this sequence they were never referenced. We + * jack this sequence up by the number of roots we found each time in order to + * make sure we don't have any overlap. + * + * 2) We first search all the roots that reference the area _except_ the root + * we're acting on currently. This makes up the old_refcnt of all the qgroups + * before. + * + * 3) We walk all of the qgroups referenced by the root we are currently acting + * on, and will either adjust old_refcnt in the case of a removal or the + * new_refcnt in the case of an addition. + * + * 4) Finally we walk all the qgroups that are referenced by this range + * including the root we are acting on currently. We will adjust the counters + * based on the number of roots we had and will have after this operation. + * + * Take this example as an illustration + * + * [qgroup 1/0] + * / | \ + * [qg 0/0] [qg 0/1] [qg 0/2] + * \ | / + * [ extent ] + * + * Say we are adding a reference that is covered by qg 0/0. The first step + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with + * old_roots being 2. Because it is adding new_roots will be 1. We then go + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's + * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a + * reference and thus must add the size to the referenced bytes. Everything + * else is the same so nothing else changes. + */ +static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist *qgroups, *tmp; + struct btrfs_qgroup *qgroup; + struct seq_list elem = SEQ_LIST_INIT(elem); + u64 seq; + int old_roots = 0; + int new_roots = 0; + int ret = 0; + + if (oper->elem.seq) { + ret = check_existing_refs(trans, fs_info, oper); + if (ret < 0) + return ret; + if (ret) + return 0; + } + + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + return -ENOMEM; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) { + ulist_free(qgroups); + return -ENOMEM; + } + + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, + &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) { + ulist_free(qgroups); + ulist_free(tmp); + return ret; + } + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, oper->ref_root); + if (!qgroup) + goto out; + seq = fs_info->qgroup_seq; + + /* + * So roots is the list of all the roots currently pointing at the + * bytenr, including the ref we are adding if we are adding, or not if + * we are removing a ref. So we pass in the ref_root to skip that root + * in our calculations. We set old_refnct and new_refcnt cause who the + * hell knows what everything looked like before, and it doesn't matter + * except... + */ + ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, + seq, &old_roots, 0); + if (ret < 0) + goto out; + + /* + * Now adjust the refcounts of the qgroups that care about this + * reference, either the old_count in the case of removal or new_count + * in the case of an addition. + */ + ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, + seq); + if (ret < 0) + goto out; + + /* + * ...in the case of removals. If we had a removal before we got around + * to processing this operation then we need to find that guy and count + * his references as if they really existed so we don't end up screwing + * up the exclusive counts. Then whenever we go to process the delete + * everything will be grand and we can account for whatever exclusive + * changes need to be made there. We also have to pass in old_roots so + * we have an accurate count of the roots as it pertains to this + * operations view of the world. + */ + ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, + &old_roots); + if (ret < 0) + goto out; + + /* + * We are adding our root, need to adjust up the number of roots, + * otherwise old_roots is the number of roots we want. + */ + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + new_roots = old_roots + 1; + } else { + new_roots = old_roots; + old_roots++; + } + fs_info->qgroup_seq += old_roots + 1; + + + /* + * And now the magic happens, bless Arne for having a pretty elegant + * solution for this. + */ + qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, + qgroups, seq, old_roots, new_roots, 0); +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(qgroups); + ulist_free(roots); + ulist_free(tmp); + return ret; +} + +/* + * Process a reference to a shared subtree. This type of operation is + * queued during snapshot removal when we encounter extents which are + * shared between more than one root. + */ +static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup_list *glist; + struct ulist *parents; + int ret = 0; + int err; + struct btrfs_qgroup *qg; + u64 root_obj = 0; + struct seq_list elem = SEQ_LIST_INIT(elem); + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + elem.seq, &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) + goto out; + + if (roots->nnodes != 1) + goto out; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ + /* + * If we find our ref root then that means all refs + * this extent has to the root have not yet been + * deleted. In that case, we do nothing and let the + * last ref for this bytenr drive our update. + * + * This can happen for example if an extent is + * referenced multiple times in a snapshot (clone, + * etc). If we are in the middle of snapshot removal, + * queued updates for such an extent will find the + * root if we have not yet finished removing the + * snapshot. + */ + if (unode->val == oper->ref_root) + goto out; + + root_obj = unode->val; + BUG_ON(!root_obj); + + spin_lock(&fs_info->qgroup_lock); + qg = find_qgroup_rb(fs_info, root_obj); + if (!qg) + goto out_unlock; + + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* + * Adjust counts for parent groups. First we find all + * parents, then in the 2nd loop we do the adjustment + * while adding parents of the parents to our ulist. + */ + list_for_each_entry(glist, &qg->groups, next_group) { + err = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (err < 0) { + ret = err; + goto out_unlock; + } + } + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(parents, &uiter))) { + qg = u64_to_ptr(unode->aux); + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qg->groups, next_group) { + err = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (err < 0) { + ret = err; + goto out_unlock; + } + } + } + +out_unlock: + spin_unlock(&fs_info->qgroup_lock); + +out: + ulist_free(roots); + ulist_free(parents); + return ret; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + ASSERT(is_fstree(oper->ref_root)); + + trace_btrfs_qgroup_account(oper); + + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + case BTRFS_QGROUP_OPER_SUB_EXCL: + ret = qgroup_excl_accounting(fs_info, oper); + break; + case BTRFS_QGROUP_OPER_ADD_SHARED: + case BTRFS_QGROUP_OPER_SUB_SHARED: + ret = qgroup_shared_accounting(trans, fs_info, oper); + break; + case BTRFS_QGROUP_OPER_SUB_SUBTREE: + ret = qgroup_subtree_accounting(trans, fs_info, oper); + break; + default: + ASSERT(0); + } + return ret; +} + +/* + * Needs to be called everytime we run delayed refs, even if there is an error + * in order to cleanup outstanding operations. + */ +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup_operation *oper; + int ret = 0; + + while (!list_empty(&trans->qgroup_ref_list)) { + oper = list_first_entry(&trans->qgroup_ref_list, + struct btrfs_qgroup_operation, list); + list_del_init(&oper->list); + if (!ret || !trans->aborted) + ret = btrfs_qgroup_account(trans, fs_info, oper); + spin_lock(&fs_info->qgroup_op_lock); + rb_erase(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + kfree(oper); + } + return ret; +} + +/* + * called from commit_transaction. Writes all changed qgroups to disk. + */ +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root = fs_info->quota_root; + int ret = 0; + int start_rescan_worker = 0; + + if (!quota_root) + goto out; + + if (!fs_info->quota_enabled && fs_info->pending_quota_state) + start_rescan_worker = 1; + + fs_info->quota_enabled = fs_info->pending_quota_state; + + spin_lock(&fs_info->qgroup_lock); + while (!list_empty(&fs_info->dirty_qgroups)) { + struct btrfs_qgroup *qgroup; + qgroup = list_first_entry(&fs_info->dirty_qgroups, + struct btrfs_qgroup, dirty); + list_del_init(&qgroup->dirty); + spin_unlock(&fs_info->qgroup_lock); + ret = update_qgroup_info_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + ret = update_qgroup_limit_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + spin_lock(&fs_info->qgroup_lock); + } + if (fs_info->quota_enabled) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; + else + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_status_item(trans, fs_info, quota_root); + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + + if (!ret && start_rescan_worker) { + ret = qgroup_rescan_init(fs_info, 0, 1); + if (!ret) { + qgroup_rescan_zero_tracking(fs_info); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } + ret = 0; + } + +out: + + return ret; +} + +/* + * copy the acounting information between qgroups. This is necessary when a + * snapshot or a subvolume is created + */ +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit) +{ + int ret = 0; + int i; + u64 *i_qgroups; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_qgroup *srcgroup; + struct btrfs_qgroup *dstgroup; + u32 level_size = 0; + u64 nums; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_enabled) + goto out; + + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + + 2 * inherit->num_excl_copies; + for (i = 0; i < nums; ++i) { + srcgroup = find_qgroup_rb(fs_info, *i_qgroups); + if (!srcgroup) { + ret = -EINVAL; + goto out; + } + + if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { + ret = -EINVAL; + goto out; + } + ++i_qgroups; + } + } + + /* + * create a tracking group for the subvol itself + */ + ret = add_qgroup_item(trans, quota_root, objectid); + if (ret) + goto out; + + if (srcid) { + struct btrfs_root *srcroot; + struct btrfs_key srckey; + + srckey.objectid = srcid; + srckey.type = BTRFS_ROOT_ITEM_KEY; + srckey.offset = (u64)-1; + srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey); + if (IS_ERR(srcroot)) { + ret = PTR_ERR(srcroot); + goto out; + } + + rcu_read_lock(); + level_size = srcroot->nodesize; + rcu_read_unlock(); + } + + /* + * add qgroup to all inherited groups + */ + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + ret = add_qgroup_relation_item(trans, quota_root, + objectid, *i_qgroups); + if (ret) + goto out; + ret = add_qgroup_relation_item(trans, quota_root, + *i_qgroups, objectid); + if (ret) + goto out; + ++i_qgroups; + } + } + + + spin_lock(&fs_info->qgroup_lock); + + dstgroup = add_qgroup_rb(fs_info, objectid); + if (IS_ERR(dstgroup)) { + ret = PTR_ERR(dstgroup); + goto unlock; + } + + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + dstgroup->lim_flags = inherit->lim.flags; + dstgroup->max_rfer = inherit->lim.max_rfer; + dstgroup->max_excl = inherit->lim.max_excl; + dstgroup->rsv_rfer = inherit->lim.rsv_rfer; + dstgroup->rsv_excl = inherit->lim.rsv_excl; + + ret = update_qgroup_limit_item(trans, quota_root, dstgroup); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + dstgroup->qgroupid); + goto unlock; + } + } + + if (srcid) { + srcgroup = find_qgroup_rb(fs_info, srcid); + if (!srcgroup) + goto unlock; + + /* + * We call inherit after we clone the root in order to make sure + * our counts don't go crazy, so at this point the only + * difference between the two roots should be the root node. + */ + dstgroup->rfer = srcgroup->rfer; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; + dstgroup->excl = level_size; + dstgroup->excl_cmpr = level_size; + srcgroup->excl = level_size; + srcgroup->excl_cmpr = level_size; + + /* inherit the limit info */ + dstgroup->lim_flags = srcgroup->lim_flags; + dstgroup->max_rfer = srcgroup->max_rfer; + dstgroup->max_excl = srcgroup->max_excl; + dstgroup->rsv_rfer = srcgroup->rsv_rfer; + dstgroup->rsv_excl = srcgroup->rsv_excl; + + qgroup_dirty(fs_info, dstgroup); + qgroup_dirty(fs_info, srcgroup); + } + + if (!inherit) + goto unlock; + + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + ret = add_relation_rb(quota_root->fs_info, objectid, + *i_qgroups); + if (ret) + goto unlock; + ++i_qgroups; + } + + for (i = 0; i < inherit->num_ref_copies; ++i) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->rfer = src->rfer - level_size; + dst->rfer_cmpr = src->rfer_cmpr - level_size; + i_qgroups += 2; + } + for (i = 0; i < inherit->num_excl_copies; ++i) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->excl = src->excl + level_size; + dst->excl_cmpr = src->excl_cmpr + level_size; + i_qgroups += 2; + } + +unlock: + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 ref_root = root->root_key.objectid; + int ret = 0; + struct ulist_node *unode; + struct ulist_iterator uiter; + + if (!is_fstree(ref_root)) + return 0; + + if (num_bytes == 0) + return 0; + + spin_lock(&fs_info->qgroup_lock); + quota_root = fs_info->quota_root; + if (!quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + /* + * in a first step, we check all affected qgroups if any limits would + * be exceeded + */ + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + goto out; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && + qg->reserved + (s64)qg->rfer + num_bytes > + qg->max_rfer) { + ret = -EDQUOT; + goto out; + } + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && + qg->reserved + (s64)qg->excl + num_bytes > + qg->max_excl) { + ret = -EDQUOT; + goto out; + } + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; + /* + * no limits exceeded, now record the reservation into all qgroups + */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + + qg = u64_to_ptr(unode->aux); + + qg->reserved += num_bytes; + } + +out: + spin_unlock(&fs_info->qgroup_lock); + return ret; +} + +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist_node *unode; + struct ulist_iterator uiter; + u64 ref_root = root->root_key.objectid; + int ret = 0; + + if (!is_fstree(ref_root)) + return; + + if (num_bytes == 0) + return; + + spin_lock(&fs_info->qgroup_lock); + + quota_root = fs_info->quota_root; + if (!quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + goto out; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + + qg->reserved -= num_bytes; + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + +out: + spin_unlock(&fs_info->qgroup_lock); +} + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) +{ + if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) + return; + btrfs_err(trans->root->fs_info, + "qgroups not uptodate in trans handle %p: list is%s empty, " + "seq is %#x.%x", + trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", + (u32)(trans->delayed_ref_elem.seq >> 32), + (u32)trans->delayed_ref_elem.seq); + BUG(); +} + +/* + * returns < 0 on error, 0 when more leafs are to be scanned. + * returns 1 when done. + */ +static int +qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct btrfs_trans_handle *trans, struct ulist *qgroups, + struct ulist *tmp, struct extent_buffer *scratch_leaf) +{ + struct btrfs_key found; + struct ulist *roots = NULL; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); + u64 num_bytes; + u64 seq; + int new_roots; + int slot; + int ret; + + path->leave_spinning = 1; + mutex_lock(&fs_info->qgroup_rescan_lock); + ret = btrfs_search_slot_for_read(fs_info->extent_root, + &fs_info->qgroup_rescan_progress, + path, 1, 0); + + pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n", + fs_info->qgroup_rescan_progress.objectid, + fs_info->qgroup_rescan_progress.type, + fs_info->qgroup_rescan_progress.offset, ret); + + if (ret) { + /* + * The rescan is about to end, we will not be scanning any + * further blocks. We cannot unset the RESCAN flag here, because + * we want to commit the transaction if everything went well. + * To make the live accounting work in this phase, we set our + * scan progress pointer such that every real extent objectid + * will be smaller. + */ + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + return ret; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found, + btrfs_header_nritems(path->nodes[0]) - 1); + fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; + + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + slot = path->slots[0]; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); + if (found.type != BTRFS_EXTENT_ITEM_KEY && + found.type != BTRFS_METADATA_ITEM_KEY) + continue; + if (found.type == BTRFS_METADATA_ITEM_KEY) + num_bytes = fs_info->extent_root->nodesize; + else + num_bytes = found.offset; + + ulist_reinit(qgroups); + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, + &roots); + if (ret < 0) + goto out; + spin_lock(&fs_info->qgroup_lock); + seq = fs_info->qgroup_seq; + fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + + new_roots = 0; + ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, + seq, &new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + + ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, + seq, 0, new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + } +out: + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + + return ret; +} + +static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, + qgroup_rescan_work); + struct btrfs_path *path; + struct btrfs_trans_handle *trans = NULL; + struct ulist *tmp = NULL, *qgroups = NULL; + struct extent_buffer *scratch_leaf = NULL; + int err = -ENOMEM; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + goto out; + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + goto out; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + goto out; + scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); + if (!scratch_leaf) + goto out; + + err = 0; + while (!err) { + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + break; + } + if (!fs_info->quota_enabled) { + err = -EINTR; + } else { + err = qgroup_rescan_leaf(fs_info, path, trans, + qgroups, tmp, scratch_leaf); + } + if (err > 0) + btrfs_commit_transaction(trans, fs_info->fs_root); + else + btrfs_end_transaction(trans, fs_info->fs_root); + } + +out: + kfree(scratch_leaf); + ulist_free(qgroups); + ulist_free(tmp); + btrfs_free_path(path); + + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + + if (err > 0 && + fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } else if (err < 0) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + /* + * only update status, since the previous part has alreay updated the + * qgroup info. + */ + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_err(fs_info, + "fail to start transaction for status update: %d\n", + err); + goto done; + } + ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d\n", err); + } + btrfs_end_transaction(trans, fs_info->quota_root); + + if (err >= 0) { + btrfs_info(fs_info, "qgroup scan completed%s", + err > 0 ? " (inconsistency flag cleared)" : ""); + } else { + btrfs_err(fs_info, "qgroup scan failed with %d", err); + } + +done: + complete_all(&fs_info->qgroup_rescan_completion); +} + +/* + * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all + * memory required for the rescan context. + */ +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags) +{ + int ret = 0; + + if (!init_flags && + (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) || + !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) { + ret = -EINVAL; + goto err; + } + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + + if (init_flags) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = -EINPROGRESS; + else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + ret = -EINVAL; + + if (ret) { + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto err; + } + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + + memset(&fs_info->qgroup_rescan_progress, 0, + sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_rescan_progress.objectid = progress_objectid; + + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + init_completion(&fs_info->qgroup_rescan_completion); + + memset(&fs_info->qgroup_rescan_work, 0, + sizeof(fs_info->qgroup_rescan_work)); + btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_helper, + btrfs_qgroup_rescan_worker, NULL, NULL); + + if (ret) { +err: + btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret); + return ret; + } + + return 0; +} + +static void +qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + + spin_lock(&fs_info->qgroup_lock); + /* clear all current qgroup tracking information */ + for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + qgroup->rfer = 0; + qgroup->rfer_cmpr = 0; + qgroup->excl = 0; + qgroup->excl_cmpr = 0; + } + spin_unlock(&fs_info->qgroup_lock); +} + +int +btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + struct btrfs_trans_handle *trans; + + ret = qgroup_rescan_init(fs_info, 0, 1); + if (ret) + return ret; + + /* + * We have set the rescan_progress to 0, which means no more + * delayed refs will be accounted by btrfs_qgroup_account_ref. + * However, btrfs_qgroup_account_ref may be right after its call + * to btrfs_find_all_roots, in which case it would still do the + * accounting. + * To solve this, we're committing the transaction, which will + * ensure we run all delayed refs and only after that, we are + * going to clear all tracking information for a clean start. + */ + + trans = btrfs_join_transaction(fs_info->fs_root); + if (IS_ERR(trans)) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } + + qgroup_rescan_zero_tracking(fs_info); + + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + + return 0; +} + +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +{ + int running; + int ret = 0; + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (running) + ret = wait_for_completion_interruptible( + &fs_info->qgroup_rescan_completion); + + return ret; +} + +/* + * this is only called from open_ctree where we're still single threaded, thus + * locking is omitted here. + */ +void +btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) +{ + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); +} diff --git a/kernel/fs/btrfs/qgroup.h b/kernel/fs/btrfs/qgroup.h new file mode 100644 index 000000000..c5242aa9a --- /dev/null +++ b/kernel/fs/btrfs/qgroup.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_QGROUP__ +#define __BTRFS_QGROUP__ + +/* + * A description of the operations, all of these operations only happen when we + * are adding the 1st reference for that subvolume in the case of adding space + * or on the last reference delete in the case of subtraction. The only + * exception is the last one, which is added for confusion. + * + * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only + * one pointing at the bytes we are adding. This is called on the first + * allocation. + * + * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be + * shared between subvols. This is called on the creation of a ref that already + * has refs from a different subvolume, so basically reflink. + * + * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only + * one referencing the range. + * + * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with + * refs with other subvolumes. + */ +enum btrfs_qgroup_operation_type { + BTRFS_QGROUP_OPER_ADD_EXCL, + BTRFS_QGROUP_OPER_ADD_SHARED, + BTRFS_QGROUP_OPER_SUB_EXCL, + BTRFS_QGROUP_OPER_SUB_SHARED, + BTRFS_QGROUP_OPER_SUB_SUBTREE, +}; + +struct btrfs_qgroup_operation { + u64 ref_root; + u64 bytenr; + u64 num_bytes; + u64 seq; + enum btrfs_qgroup_operation_type type; + struct seq_list elem; + struct rb_node n; + struct list_head list; +}; + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, + int mod_seq); +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif + +#endif /* __BTRFS_QGROUP__ */ diff --git a/kernel/fs/btrfs/raid56.c b/kernel/fs/btrfs/raid56.c new file mode 100644 index 000000000..fa72068bd --- /dev/null +++ b/kernel/fs/btrfs/raid56.c @@ -0,0 +1,2670 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" + +/* set when additional merges to this rbio are not allowed */ +#define RBIO_RMW_LOCKED_BIT 1 + +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT 2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT 3 + +#define RBIO_CACHE_SIZE 1024 + +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE = 0, + BTRFS_RBIO_READ_REBUILD = 1, + BTRFS_RBIO_PARITY_SCRUB = 2, +}; + +struct btrfs_raid_bio { + struct btrfs_fs_info *fs_info; + struct btrfs_bio *bbio; + + /* while we're doing rmw on a stripe + * we put it into a hash table so we can + * lock the stripe and merge more rbios + * into it. + */ + struct list_head hash_list; + + /* + * LRU list for the stripe cache + */ + struct list_head stripe_cache; + + /* + * for scheduling work in the helper threads + */ + struct btrfs_work work; + + /* + * bio list and bio_list_lock are used + * to add more bios into the stripe + * in hopes of avoiding the full rmw + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* also protected by the bio_list_lock, the + * plug list is used by the plugging code + * to collect partial bios while plugged. The + * stripe locking code also uses it to hand off + * the stripe lock to the next pending IO + */ + struct list_head plug_list; + + /* + * flags that tell us if it is safe to + * merge with this bio + */ + unsigned long flags; + + /* size of each individual stripe on disk */ + int stripe_len; + + /* number of data stripes (no p/q) */ + int nr_data; + + int real_stripes; + + int stripe_npages; + /* + * set if we're doing a parity rebuild + * for a read from higher up, which is handled + * differently from a parity rebuild as part of + * rmw + */ + enum btrfs_rbio_ops operation; + + /* first bad stripe */ + int faila; + + /* second bad stripe (for raid6 use) */ + int failb; + + int scrubp; + /* + * number of pages needed to represent the full + * stripe + */ + int nr_pages; + + /* + * size of all the bios in the bio_list. This + * helps us decide if the rbio maps to a full + * stripe or not + */ + int bio_list_bytes; + + int generic_bio_cnt; + + atomic_t refs; + + atomic_t stripes_pending; + + atomic_t error; + /* + * these are two arrays of pointers. We allocate the + * rbio big enough to hold them both and setup their + * locations when the rbio is allocated + */ + + /* pointers to pages that we allocated for + * reading/writing stripes directly from the disk (including P/Q) + */ + struct page **stripe_pages; + + /* + * pointers to the pages in the bio_list. Stored + * here for faster lookup + */ + struct page **bio_pages; + + /* + * bitmap to record which horizontal stripe has data + */ + unsigned long *dbitmap; +}; + +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); +static noinline void finish_rmw(struct btrfs_raid_bio *rbio); +static void rmw_work(struct btrfs_work *work); +static void read_rebuild_work(struct btrfs_work *work); +static void async_rmw_stripe(struct btrfs_raid_bio *rbio); +static void async_read_rebuild(struct btrfs_raid_bio *rbio); +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); +static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + +static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, + int need_check); +static void async_scrub_parity(struct btrfs_raid_bio *rbio); + +/* + * the stripe hash table is used for locking, and to collect + * bios in hopes of making a full stripe + */ +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash_table *x; + struct btrfs_stripe_hash *cur; + struct btrfs_stripe_hash *h; + int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; + int i; + int table_size; + + if (info->stripe_hash_table) + return 0; + + /* + * The table is large, starting with order 4 and can go as high as + * order 7 in case lock debugging is turned on. + * + * Try harder to allocate and fallback to vmalloc to lower the chance + * of a failing mount. + */ + table_size = sizeof(*table) + sizeof(*h) * num_entries; + table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!table) { + table = vzalloc(table_size); + if (!table) + return -ENOMEM; + } + + spin_lock_init(&table->cache_lock); + INIT_LIST_HEAD(&table->stripe_cache); + + h = table->table; + + for (i = 0; i < num_entries; i++) { + cur = h + i; + INIT_LIST_HEAD(&cur->hash_list); + spin_lock_init(&cur->lock); + init_waitqueue_head(&cur->wait); + } + + x = cmpxchg(&info->stripe_hash_table, NULL, table); + if (x) + kvfree(x); + return 0; +} + +/* + * caching an rbio means to copy anything from the + * bio_pages array into the stripe_pages array. We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. + */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + char *s; + char *d; + int ret; + + ret = alloc_rbio_pages(rbio); + if (ret) + return; + + for (i = 0; i < rbio->nr_pages; i++) { + if (!rbio->bio_pages[i]) + continue; + + s = kmap(rbio->bio_pages[i]); + d = kmap(rbio->stripe_pages[i]); + + memcpy(d, s, PAGE_CACHE_SIZE); + + kunmap(rbio->bio_pages[i]); + kunmap(rbio->stripe_pages[i]); + SetPageUptodate(rbio->stripe_pages[i]); + } + set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + +/* + * we hash on the first logical address of the stripe + */ +static int rbio_bucket(struct btrfs_raid_bio *rbio) +{ + u64 num = rbio->bbio->raid_map[0]; + + /* + * we shift down quite a bit. We're using byte + * addressing, and most of the lower bits are zeros. + * This tends to upset hash_64, and it consistently + * returns just one or two different values. + * + * shifting off the lower bits fixes things. + */ + return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); +} + +/* + * stealing an rbio means taking all the uptodate pages from the stripe + * array in the source rbio and putting them into the destination rbio + */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ + int i; + struct page *s; + struct page *d; + + if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) + return; + + for (i = 0; i < dest->nr_pages; i++) { + s = src->stripe_pages[i]; + if (!s || !PageUptodate(s)) { + continue; + } + + d = dest->stripe_pages[i]; + if (d) + __free_page(d); + + dest->stripe_pages[i] = s; + src->stripe_pages[i] = NULL; + } +} + +/* + * merging means we take the bio_list from the victim and + * splice it into the destination. The victim should + * be discarded afterwards. + * + * must be called with dest->rbio_list_lock held + */ +static void merge_rbio(struct btrfs_raid_bio *dest, + struct btrfs_raid_bio *victim) +{ + bio_list_merge(&dest->bio_list, &victim->bio_list); + dest->bio_list_bytes += victim->bio_list_bytes; + dest->generic_bio_cnt += victim->generic_bio_cnt; + bio_list_init(&victim->bio_list); +} + +/* + * used to prune items that are in the cache. The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash *h; + int freeit = 0; + + /* + * check the bit again under the hash table lock. + */ + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + h = table->table + bucket; + + /* hold the lock for the bucket because we may be + * removing it from the hash table + */ + spin_lock(&h->lock); + + /* + * hold the lock for the bio list because we need + * to make sure the bio list is empty + */ + spin_lock(&rbio->bio_list_lock); + + if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { + list_del_init(&rbio->stripe_cache); + table->cache_size -= 1; + freeit = 1; + + /* if the bio list isn't empty, this rbio is + * still involved in an IO. We take it out + * of the cache list, and drop the ref that + * was held for the list. + * + * If the bio_list was empty, we also remove + * the rbio from the hash_table, and drop + * the corresponding ref + */ + if (bio_list_empty(&rbio->bio_list)) { + if (!list_empty(&rbio->hash_list)) { + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + BUG_ON(!list_empty(&rbio->plug_list)); + } + } + } + + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + + if (freeit) + __free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + __remove_rbio_from_cache(rbio); + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove everything in the cache + */ +static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove all cached entries and free the hash table + * used by unmount + */ +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) +{ + if (!info->stripe_hash_table) + return; + btrfs_clear_rbio_cache(info); + kvfree(info->stripe_hash_table); + info->stripe_hash_table = NULL; +} + +/* + * insert an rbio into the stripe cache. It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ + if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) + atomic_inc(&rbio->refs); + + if (!list_empty(&rbio->stripe_cache)){ + list_move(&rbio->stripe_cache, &table->stripe_cache); + } else { + list_add(&rbio->stripe_cache, &table->stripe_cache); + table->cache_size += 1; + } + + spin_unlock(&rbio->bio_list_lock); + + if (table->cache_size > RBIO_CACHE_SIZE) { + struct btrfs_raid_bio *found; + + found = list_entry(table->stripe_cache.prev, + struct btrfs_raid_bio, + stripe_cache); + + if (found != rbio) + __remove_rbio_from_cache(found); + } + + spin_unlock_irqrestore(&table->cache_lock, flags); + return; +} + +/* + * helper function to run the xor_blocks api. It is only + * able to do MAX_XOR_BLOCKS at a time, so we need to + * loop through. + */ +static void run_xor(void **pages, int src_cnt, ssize_t len) +{ + int src_off = 0; + int xor_src_cnt = 0; + void *dest = pages[src_cnt]; + + while(src_cnt > 0) { + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); + xor_blocks(xor_src_cnt, len, dest, pages + src_off); + + src_cnt -= xor_src_cnt; + src_off += xor_src_cnt; + } +} + +/* + * returns true if the bio list inside this rbio + * covers an entire stripe (no rmw required). + * Must be called with the bio list lock held, or + * at a time when you know it is impossible to add + * new bios into the list + */ +static int __rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long size = rbio->bio_list_bytes; + int ret = 1; + + if (size != rbio->nr_data * rbio->stripe_len) + ret = 0; + + BUG_ON(size > rbio->nr_data * rbio->stripe_len); + return ret; +} + +static int rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + ret = __rbio_is_full(rbio); + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + return ret; +} + +/* + * returns 1 if it is safe to merge two rbios together. + * The merging is safe if the two rbios correspond to + * the same stripe and if they are both going in the same + * direction (read vs write), and if neither one is + * locked for final IO + * + * The caller is responsible for locking such that + * rmw_locked is safe to test + */ +static int rbio_can_merge(struct btrfs_raid_bio *last, + struct btrfs_raid_bio *cur) +{ + if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || + test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) + return 0; + + /* + * we can't merge with cached rbios, since the + * idea is that when we merge the destination + * rbio is going to run our IO for us. We can + * steal from cached rbio's though, other functions + * handle that. + */ + if (test_bit(RBIO_CACHE_BIT, &last->flags) || + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + + if (last->bbio->raid_map[0] != + cur->bbio->raid_map[0]) + return 0; + + /* we can't merge with different operations */ + if (last->operation != cur->operation) + return 0; + /* + * We've need read the full stripe from the drive. + * check and repair the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + if (last->operation == BTRFS_RBIO_PARITY_SCRUB || + cur->operation == BTRFS_RBIO_PARITY_SCRUB) + return 0; + + return 1; +} + +/* + * helper to index into the pstripe + */ +static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * helper to index into the qstripe, returns null + * if there is no qstripe + */ +static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + if (rbio->nr_data + 1 == rbio->real_stripes) + return NULL; + + index += ((rbio->nr_data + 1) * rbio->stripe_len) >> + PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * The first stripe in the table for a logical address + * has the lock. rbios are added in one of three ways: + * + * 1) Nobody has the stripe locked yet. The rbio is given + * the lock and 0 is returned. The caller must start the IO + * themselves. + * + * 2) Someone has the stripe locked, but we're able to merge + * with the lock owner. The rbio is freed and the IO will + * start automatically along with the existing rbio. 1 is returned. + * + * 3) Someone has the stripe locked, but we're not able to merge. + * The rbio is added to the lock owner's plug list, or merged into + * an rbio already on the plug list. When the lock owner unlocks, + * the next rbio on the list is run and the IO is started automatically. + * 1 is returned + * + * If we return 0, the caller still owns the rbio and must continue with + * IO submission. If we return 1, the caller must assume the rbio has + * already been freed. + */ +static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *pending; + unsigned long flags; + DEFINE_WAIT(wait); + struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; + int ret = 0; + int walk = 0; + + spin_lock_irqsave(&h->lock, flags); + list_for_each_entry(cur, &h->hash_list, hash_list) { + walk++; + if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { + spin_lock(&cur->bio_list_lock); + + /* can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + atomic_dec(&cur->refs); + + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); + + goto lockit; + } + + /* can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + + + /* + * we couldn't merge with the running + * rbio, see if we can merge with the + * pending ones. We don't have to + * check for rmw_locked because there + * is no way they are inside finish_rmw + * right now + */ + list_for_each_entry(pending, &cur->plug_list, + plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + } + + /* no merging, put us on the tail of the plug list, + * our rbio will be started with the currently + * running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; + } + } +lockit: + atomic_inc(&rbio->refs); + list_add(&rbio->hash_list, &h->hash_list); +out: + spin_unlock_irqrestore(&h->lock, flags); + if (cache_drop) + remove_rbio_from_cache(cache_drop); + if (freeit) + __free_raid_bio(freeit); + return ret; +} + +/* + * called as rmw or parity rebuild is completed. If the plug list has more + * rbios waiting for this stripe, the next one on the list will be started + */ +static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) +{ + int bucket; + struct btrfs_stripe_hash *h; + unsigned long flags; + int keep_cache = 0; + + bucket = rbio_bucket(rbio); + h = rbio->fs_info->stripe_hash_table->table + bucket; + + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + + spin_lock_irqsave(&h->lock, flags); + spin_lock(&rbio->bio_list_lock); + + if (!list_empty(&rbio->hash_list)) { + /* + * if we're still cached and there is no other IO + * to perform, just leave this rbio here for others + * to steal from later + */ + if (list_empty(&rbio->plug_list) && + test_bit(RBIO_CACHE_BIT, &rbio->flags)) { + keep_cache = 1; + clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + BUG_ON(!bio_list_empty(&rbio->bio_list)); + goto done; + } + + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + + /* + * we use the plug list to hold all the rbios + * waiting for the chance to lock this stripe. + * hand the lock over to one of them. + */ + if (!list_empty(&rbio->plug_list)) { + struct btrfs_raid_bio *next; + struct list_head *head = rbio->plug_list.next; + + next = list_entry(head, struct btrfs_raid_bio, + plug_list); + + list_del_init(&rbio->plug_list); + + list_add(&next->hash_list, &h->hash_list); + atomic_inc(&next->refs); + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + + if (next->operation == BTRFS_RBIO_READ_REBUILD) + async_read_rebuild(next); + else if (next->operation == BTRFS_RBIO_WRITE) { + steal_rbio(rbio, next); + async_rmw_stripe(next); + } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { + steal_rbio(rbio, next); + async_scrub_parity(next); + } + + goto done_nolock; + } else if (waitqueue_active(&h->wait)) { + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + wake_up(&h->wait); + goto done_nolock; + } + } +done: + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + +done_nolock: + if (!keep_cache) + remove_rbio_from_cache(rbio); +} + +static void __free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + WARN_ON(atomic_read(&rbio->refs) < 0); + if (!atomic_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + + btrfs_put_bbio(rbio->bbio); + kfree(rbio); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + unlock_stripe(rbio); + __free_raid_bio(rbio); +} + +/* + * this frees the rbio and runs through all the bios in the + * bio_list and calls end_io on them + */ +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +{ + struct bio *cur = bio_list_get(&rbio->bio_list); + struct bio *next; + + if (rbio->generic_bio_cnt) + btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + + free_raid_bio(rbio); + + while (cur) { + next = cur->bi_next; + cur->bi_next = NULL; + if (uptodate) + set_bit(BIO_UPTODATE, &cur->bi_flags); + bio_endio(cur, err); + cur = next; + } +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + err = 0; + + /* OK, we have read all the stripes we need to. */ + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); + return; +} + +/* + * the read/modify/write code wants to use the original bio for + * any pages it included, and then use the rbio for everything + * else. This function decides if a given index (stripe number) + * and page number in that stripe fall inside the original bio + * or the rbio. + * + * if you set bio_list_only, you'll get a NULL back for any ranges + * that are outside the bio_list + * + * This doesn't take any refs on anything, you get a bare page pointer + * and the caller must bump refs as required. + * + * You must call index_rbio_pages once before you can trust + * the answers from this function. + */ +static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, + int index, int pagenr, int bio_list_only) +{ + int chunk_page; + struct page *p = NULL; + + chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + + spin_lock_irq(&rbio->bio_list_lock); + p = rbio->bio_pages[chunk_page]; + spin_unlock_irq(&rbio->bio_list_lock); + + if (p || bio_list_only) + return p; + + return rbio->stripe_pages[chunk_page]; +} + +/* + * number of pages we need for the entire stripe across all the + * drives + */ +static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) +{ + unsigned long nr = stripe_len * nr_stripes; + return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); +} + +/* + * allocation and initial setup for the btrfs_raid_bio. Not + * this does not allocate any pages for rbio->pages. + */ +static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, + struct btrfs_bio *bbio, u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + int nr_data = 0; + int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + int num_pages = rbio_nr_pages(stripe_len, real_stripes); + int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); + void *p; + + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), + GFP_NOFS); + if (!rbio) + return ERR_PTR(-ENOMEM); + + bio_list_init(&rbio->bio_list); + INIT_LIST_HEAD(&rbio->plug_list); + spin_lock_init(&rbio->bio_list_lock); + INIT_LIST_HEAD(&rbio->stripe_cache); + INIT_LIST_HEAD(&rbio->hash_list); + rbio->bbio = bbio; + rbio->fs_info = root->fs_info; + rbio->stripe_len = stripe_len; + rbio->nr_pages = num_pages; + rbio->real_stripes = real_stripes; + rbio->stripe_npages = stripe_npages; + rbio->faila = -1; + rbio->failb = -1; + atomic_set(&rbio->refs, 1); + atomic_set(&rbio->error, 0); + atomic_set(&rbio->stripes_pending, 0); + + /* + * the stripe_pages and bio_pages array point to the extra + * memory we allocated past the end of the rbio + */ + p = rbio + 1; + rbio->stripe_pages = p; + rbio->bio_pages = p + sizeof(struct page *) * num_pages; + rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; + + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + nr_data = real_stripes - 1; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + nr_data = real_stripes - 2; + else + BUG(); + + rbio->nr_data = nr_data; + return rbio; +} + +/* allocate pages for all the stripes in the bio, including parity */ +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + ClearPageUptodate(page); + } + return 0; +} + +/* allocate pages for just the p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + + for (; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + } + return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. + */ +static int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) +{ + struct bio *last = bio_list->tail; + u64 last_end = 0; + int ret; + struct bio *bio; + struct btrfs_bio_stripe *stripe; + u64 disk_start; + + stripe = &rbio->bbio->stripes[stripe_nr]; + disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + + /* if the device is missing, just fail this stripe */ + if (!stripe->dev->bdev) + return fail_rbio_index(rbio, stripe_nr); + + /* see if we can add this page onto our existing bio */ + if (last) { + last_end = (u64)last->bi_iter.bi_sector << 9; + last_end += last->bi_iter.bi_size; + + /* + * we can't merge these if they are from different + * devices or if they are not contiguous + */ + if (last_end == disk_start && stripe->dev->bdev && + test_bit(BIO_UPTODATE, &last->bi_flags) && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); + if (ret == PAGE_CACHE_SIZE) + return 0; + } + } + + /* put a new bio on the list */ + bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + if (!bio) + return -ENOMEM; + + bio->bi_iter.bi_size = 0; + bio->bi_bdev = stripe->dev->bdev; + bio->bi_iter.bi_sector = disk_start >> 9; + set_bit(BIO_UPTODATE, &bio->bi_flags); + + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_list_add(bio_list, bio); + return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk. This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction. The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ + if (rbio->faila >= 0 || rbio->failb >= 0) { + BUG_ON(rbio->faila == rbio->real_stripes - 1); + __raid56_parity_recover(rbio); + } else { + finish_rmw(rbio); + } +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) +{ + int index; + index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); + index += page; + return rbio->stripe_pages[index]; +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result. This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. + * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ + struct bio *bio; + u64 start; + unsigned long stripe_offset; + unsigned long page_index; + struct page *p; + int i; + + spin_lock_irq(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) { + start = (u64)bio->bi_iter.bi_sector << 9; + stripe_offset = start - rbio->bbio->raid_map[0]; + page_index = stripe_offset >> PAGE_CACHE_SHIFT; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + rbio->bio_pages[page_index + i] = p; + } + } + spin_unlock_irq(&rbio->bio_list_lock); +} + +/* + * this is called from one of two situations. We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. + */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[rbio->real_stripes]; + int stripe_len = rbio->stripe_len; + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct bio_list bio_list; + struct bio *bio; + int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; + int ret; + + bio_list_init(&bio_list); + + if (rbio->real_stripes - rbio->nr_data == 1) { + p_stripe = rbio->real_stripes - 1; + } else if (rbio->real_stripes - rbio->nr_data == 2) { + p_stripe = rbio->real_stripes - 2; + q_stripe = rbio->real_stripes - 1; + } else { + BUG(); + } + + /* at this point we either have a full stripe, + * or we've read the full stripe from the drive. + * recalculate the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->error, 0); + + /* + * now that we've set rmw_locked, run through the + * bio list one last time and map the page pointers + * + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + index_rbio_pages(rbio); + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *p; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + p = rbio_pstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + p = rbio_qstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, + page, stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + if (likely(!bbio->num_tgtdevs)) + goto write_data; + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + if (!bbio->tgtdev_map[stripe]) + continue; + + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, page, + rbio->bbio->tgtdev_map[stripe], + pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + +write_data: + atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); + BUG_ON(atomic_read(&rbio->stripes_pending) == 0); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * helper to find the stripe number for a given bio. Used to figure out which + * stripe has failed. This expects the bio to correspond to a physical disk, + * so it looks up based on physical sector numbers. + */ +static int find_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 physical = bio->bi_iter.bi_sector; + u64 stripe_start; + int i; + struct btrfs_bio_stripe *stripe; + + physical <<= 9; + + for (i = 0; i < rbio->bbio->num_stripes; i++) { + stripe = &rbio->bbio->stripes[i]; + stripe_start = stripe->physical; + if (physical >= stripe_start && + physical < stripe_start + rbio->stripe_len && + bio->bi_bdev == stripe->dev->bdev) { + return i; + } + } + return -1; +} + +/* + * helper to find the stripe number for a given + * bio (before mapping). Used to figure out which stripe has + * failed. This looks up based on logical block numbers. + */ +static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 logical = bio->bi_iter.bi_sector; + u64 stripe_start; + int i; + + logical <<= 9; + + for (i = 0; i < rbio->nr_data; i++) { + stripe_start = rbio->bbio->raid_map[i]; + if (logical >= stripe_start && + logical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * returns -EIO if we had too many failures + */ +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + + /* we already know this stripe is bad, move on */ + if (rbio->faila == failed || rbio->failb == failed) + goto out; + + if (rbio->faila == -1) { + /* first failure on this rbio */ + rbio->faila = failed; + atomic_inc(&rbio->error); + } else if (rbio->failb == -1) { + /* second failure on this rbio */ + rbio->failb = failed; + atomic_inc(&rbio->error); + } else { + ret = -EIO; + } +out: + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + + return ret; +} + +/* + * helper to fail a stripe based on a physical disk + * bio. + */ +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + int failed = find_bio_stripe(rbio, bio); + + if (failed < 0) + return -EIO; + + return fail_rbio_index(rbio, failed); +} + +/* + * this sets each page in the bio uptodate. It should only be used on private + * rbio pages, nothing that comes in from the higher layers + */ +static void set_bio_pages_uptodate(struct bio *bio) +{ + int i; + struct page *p; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + SetPageUptodate(p); + } +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid_rmw_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + err = 0; + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + goto cleanup; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_rmw(rbio); + return; + +cleanup: + + rbio_orig_end_io(rbio, -EIO, 0); +} + +static void async_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + rmw_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); +} + +static void async_read_rebuild(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + read_rebuild_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); +} + +/* + * the stripe must be locked by the caller. It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + index_rbio_pages(rbio); + + atomic_set(&rbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&rbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_rmw_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return 0; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; + +finish: + validate_rbio_for_rmw(rbio); + return 0; +} + +/* + * if the upper layers pass in a full stripe, we thank them by only allocating + * enough pages to hold the parity, and sending it all down quickly. + */ +static int full_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = alloc_rbio_parity_pages(rbio); + if (ret) { + __free_raid_bio(rbio); + return ret; + } + + ret = lock_stripe_add(rbio); + if (ret == 0) + finish_rmw(rbio); + return 0; +} + +/* + * partial stripe writes get handed over to async helpers. + * We're really hoping to merge a few more writes into this + * rbio before calculating new parity + */ +static int partial_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = lock_stripe_add(rbio); + if (ret == 0) + async_rmw_stripe(rbio); + return 0; +} + +/* + * sometimes while we were reading from the drive to + * recalculate parity, enough new bios come into create + * a full stripe. So we do a check here to see if we can + * go directly to finish_rmw + */ +static int __raid56_parity_write(struct btrfs_raid_bio *rbio) +{ + /* head off into rmw land if we don't have a full stripe */ + if (!rbio_is_full(rbio)) + return partial_stripe_write(rbio); + return full_stripe_write(rbio); +} + +/* + * We use plugging call backs to collect full stripes. + * Any time we get a partial stripe write while plugged + * we collect it into a list. When the unplug comes down, + * we sort the list by logical block number and merge + * everything we can into the same rbios + */ +struct btrfs_plug_cb { + struct blk_plug_cb cb; + struct btrfs_fs_info *info; + struct list_head rbio_list; + struct btrfs_work work; +}; + +/* + * rbios on the plug list are sorted for easier merging. + */ +static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); + u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; + u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; + + if (a_sector < b_sector) + return -1; + if (a_sector > b_sector) + return 1; + return 0; +} + +static void run_plug(struct btrfs_plug_cb *plug) +{ + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *last = NULL; + + /* + * sort our plug list then try to merge + * everything we can in hopes of creating full + * stripes. + */ + list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { + cur = list_entry(plug->rbio_list.next, + struct btrfs_raid_bio, plug_list); + list_del_init(&cur->plug_list); + + if (rbio_is_full(cur)) { + /* we have a full stripe, send it down */ + full_stripe_write(cur); + continue; + } + if (last) { + if (rbio_can_merge(last, cur)) { + merge_rbio(last, cur); + __free_raid_bio(cur); + continue; + + } + __raid56_parity_write(last); + } + last = cur; + } + if (last) { + __raid56_parity_write(last); + } + kfree(plug); +} + +/* + * if the unplug comes from schedule, we have to push the + * work off to a helper thread + */ +static void unplug_work(struct btrfs_work *work) +{ + struct btrfs_plug_cb *plug; + plug = container_of(work, struct btrfs_plug_cb, work); + run_plug(plug); +} + +static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct btrfs_plug_cb *plug; + plug = container_of(cb, struct btrfs_plug_cb, cb); + + if (from_schedule) { + btrfs_init_work(&plug->work, btrfs_rmw_helper, + unplug_work, NULL, NULL); + btrfs_queue_work(plug->info->rmw_workers, + &plug->work); + return; + } + run_plug(plug); +} + +/* + * our main entry point for writes from the rest of the FS. + */ +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + struct btrfs_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + int ret; + + rbio = alloc_rbio(root, bbio, stripe_len); + if (IS_ERR(rbio)) { + btrfs_put_bbio(bbio); + return PTR_ERR(rbio); + } + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio->operation = BTRFS_RBIO_WRITE; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + rbio->generic_bio_cnt = 1; + + /* + * don't plug on full rbios, just get them out the door + * as quickly as we can + */ + if (rbio_is_full(rbio)) { + ret = full_stripe_write(rbio); + if (ret) + btrfs_bio_counter_dec(root->fs_info); + return ret; + } + + cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, + sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = root->fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + ret = 0; + } else { + ret = __raid56_parity_write(rbio); + if (ret) + btrfs_bio_counter_dec(root->fs_info); + } + return ret; +} + +/* + * all parity reconstruction happens here. We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. + */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ + int pagenr, stripe; + void **pointers; + int faila = -1, failb = -1; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); + struct page *page; + int err; + int i; + + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers) { + err = -ENOMEM; + goto cleanup_io; + } + + faila = rbio->faila; + failb = rbio->failb; + + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + } + + index_rbio_pages(rbio); + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(pagenr, rbio->dbitmap)) + continue; + + /* setup our array of pointers with pages + * from each stripe + */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->operation == BTRFS_RBIO_READ_REBUILD && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + pointers[stripe] = kmap(page); + } + + /* all raid6 handling here */ + if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { + /* + * single failure, rebuild from parity raid5 + * style + */ + if (failb < 0) { + if (faila == rbio->nr_data) { + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * TODO, we should redo the xor here. + */ + err = -EIO; + goto cleanup; + } + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below + */ + goto pstripe; + } + + /* make sure our ps and qs are in order */ + if (faila > failb) { + int tmp = failb; + failb = faila; + faila = tmp; + } + + /* if the q stripe is failed, do a pstripe reconstruction + * from the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want + */ + if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bbio->raid_map[faila] == + RAID5_P_STRIPE) { + err = -EIO; + goto cleanup; + } + /* + * otherwise we have one bad data stripe and + * a good P stripe. raid5! + */ + goto pstripe; + } + + if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->real_stripes, + PAGE_SIZE, faila, pointers); + } else { + raid6_2data_recov(rbio->real_stripes, + PAGE_SIZE, faila, failb, + pointers); + } + } else { + void *p; + + /* rebuild from P stripe here (raid5 or raid6) */ + BUG_ON(failb != -1); +pstripe: + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], + pointers[rbio->nr_data], + PAGE_CACHE_SIZE); + + /* rearrange the pointer array */ + p = pointers[faila]; + for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) + pointers[stripe] = pointers[stripe + 1]; + pointers[rbio->nr_data - 1] = p; + + /* xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); + } + /* if we're doing this rebuild as part of an rmw, go through + * and set all of our private rbio pages in the + * failed stripes as uptodate. This way finish_rmw will + * know they can be trusted. If this was a read reconstruction, + * other endio functions will fiddle the uptodate bits + */ + if (rbio->operation == BTRFS_RBIO_WRITE) { + for (i = 0; i < nr_pages; i++) { + if (faila != -1) { + page = rbio_stripe_page(rbio, faila, i); + SetPageUptodate(page); + } + if (failb != -1) { + page = rbio_stripe_page(rbio, failb, i); + SetPageUptodate(page); + } + } + } + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->operation == BTRFS_RBIO_READ_REBUILD && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + kunmap(page); + } + } + + err = 0; +cleanup: + kfree(pointers); + +cleanup_io: + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + if (err == 0) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + rbio_orig_end_io(rbio, err, err == 0); + } else if (err == 0) { + rbio->faila = -1; + rbio->failb = -1; + + if (rbio->operation == BTRFS_RBIO_WRITE) + finish_rmw(rbio); + else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) + finish_parity_scrub(rbio, 0); + else + BUG(); + } else { + rbio_orig_end_io(rbio, err, 0); + } +} + +/* + * This is called only for stripes we've read from disk to + * reconstruct the parity. + */ +static void raid_recover_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + /* + * we only read stripe pages off the disk, set them + * up to date if there were no errors + */ + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + rbio_orig_end_io(rbio, -EIO, 0); + else + __raid_recover_end_io(rbio); +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->error, 0); + + /* + * read everything that hasn't failed. Thanks to the + * stripe cache, it is possible that some or all of these + * pages are going to be uptodate. + */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->error); + continue; + } + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *p; + + /* + * the rmw code may have already read this + * page in + */ + p = rbio_stripe_page(rbio, stripe, pagenr); + if (PageUptodate(p)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, + rbio_stripe_page(rbio, stripe, pagenr), + stripe, pagenr, rbio->stripe_len); + if (ret < 0) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * we might have no bios to read just because the pages + * were up to date, or we might have no bios to read because + * the devices were gone. + */ + if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { + __raid_recover_end_io(rbio); + goto out; + } else { + goto cleanup; + } + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&rbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_recover_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } +out: + return 0; + +cleanup: + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; +} + +/* + * the main entry point for reads from the higher layers. This + * is really only called when the normal read path had a failure, + * so we assume the bio they send down corresponds to a failed part + * of the drive. + */ +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = alloc_rbio(root, bbio, stripe_len); + if (IS_ERR(rbio)) { + if (generic_io) + btrfs_put_bbio(bbio); + return PTR_ERR(rbio); + } + + rbio->operation = BTRFS_RBIO_READ_REBUILD; + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_iter.bi_size; + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { + BUG(); + if (generic_io) + btrfs_put_bbio(bbio); + kfree(rbio); + return -EIO; + } + + if (generic_io) { + btrfs_bio_counter_inc_noblocked(root->fs_info); + rbio->generic_bio_cnt = 1; + } else { + btrfs_get_bbio(bbio); + } + + /* + * reconstruct from the q stripe if they are + * asking for mirror 3 + */ + if (mirror_num == 3) + rbio->failb = rbio->real_stripes - 2; + + ret = lock_stripe_add(rbio); + + /* + * __raid56_parity_recover will end the bio with + * any errors it hits. We don't want to return + * its error value up the stack because our caller + * will end up calling bio_endio with any nonzero + * return + */ + if (ret == 0) + __raid56_parity_recover(rbio); + /* + * our rbio has been added to the list of + * rbios that will be handled after the + * currently lock owner is done + */ + return 0; + +} + +static void rmw_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_rmw_stripe(rbio); +} + +static void read_rebuild_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + __raid56_parity_recover(rbio); +} + +/* + * The following code is used to scrub/replace the parity stripe + * + * Note: We need make sure all the pages that add into the scrub/replace + * raid bio are correct and not be changed during the scrub/replace. That + * is those pages just hold metadata or file data with checksum. + */ + +struct btrfs_raid_bio * +raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) +{ + struct btrfs_raid_bio *rbio; + int i; + + rbio = alloc_rbio(root, bbio, stripe_len); + if (IS_ERR(rbio)) + return NULL; + bio_list_add(&rbio->bio_list, bio); + /* + * This is a special bio which is used to hold the completion handler + * and make the scrub rbio is similar to the other types + */ + ASSERT(!bio->bi_iter.bi_size); + rbio->operation = BTRFS_RBIO_PARITY_SCRUB; + + for (i = 0; i < rbio->real_stripes; i++) { + if (bbio->stripes[i].dev == scrub_dev) { + rbio->scrubp = i; + break; + } + } + + /* Now we just support the sectorsize equals to page size */ + ASSERT(root->sectorsize == PAGE_SIZE); + ASSERT(rbio->stripe_npages == stripe_nsectors); + bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + + return rbio; +} + +void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, + struct page *page, u64 logical) +{ + int stripe_offset; + int index; + + ASSERT(logical >= rbio->bbio->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + + rbio->stripe_len * rbio->nr_data); + stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); + index = stripe_offset >> PAGE_CACHE_SHIFT; + rbio->bio_pages[index] = page; +} + +/* + * We just scrub the parity that we have correct data on the same horizontal, + * so we needn't allocate all pages for all the stripes. + */ +static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) +{ + int i; + int bit; + int index; + struct page *page; + + for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { + for (i = 0; i < rbio->real_stripes; i++) { + index = i * rbio->stripe_npages + bit; + if (rbio->stripe_pages[index]) + continue; + + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; + ClearPageUptodate(page); + } + } + return 0; +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_parity_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + err = 0; + + if (atomic_read(&rbio->error)) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); +} + +static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, + int need_check) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[rbio->real_stripes]; + DECLARE_BITMAP(pbitmap, rbio->stripe_npages); + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct page *p_page = NULL; + struct page *q_page = NULL; + struct bio_list bio_list; + struct bio *bio; + int is_replace = 0; + int ret; + + bio_list_init(&bio_list); + + if (rbio->real_stripes - rbio->nr_data == 1) { + p_stripe = rbio->real_stripes - 1; + } else if (rbio->real_stripes - rbio->nr_data == 2) { + p_stripe = rbio->real_stripes - 2; + q_stripe = rbio->real_stripes - 1; + } else { + BUG(); + } + + if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { + is_replace = 1; + bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); + } + + /* + * Because the higher layers(scrubber) are unlikely to + * use this area of the disk again soon, so don't cache + * it. + */ + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + if (!need_check) + goto writeback; + + p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!p_page) + goto cleanup; + SetPageUptodate(p_page); + + if (q_stripe != -1) { + q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!q_page) { + __free_page(p_page); + goto cleanup; + } + SetPageUptodate(q_page); + } + + atomic_set(&rbio->error, 0); + + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *p; + void *parity; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + pointers[stripe++] = kmap(p_page); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + pointers[stripe++] = kmap(q_page); + + raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + /* Check scrubbing pairty and repair it */ + p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + parity = kmap(p); + if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) + memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); + else + /* Parity is right, needn't writeback */ + bitmap_clear(rbio->dbitmap, pagenr, 1); + kunmap(p); + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + __free_page(p_page); + if (q_page) + __free_page(q_page); + +writeback: + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *page; + + page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + ret = rbio_add_io_page(rbio, &bio_list, + page, rbio->scrubp, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + + if (!is_replace) + goto submit_write; + + for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { + struct page *page; + + page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + ret = rbio_add_io_page(rbio, &bio_list, page, + bbio->tgtdev_map[rbio->scrubp], + pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + +submit_write: + nr_data = bio_list_size(&bio_list); + if (!nr_data) { + /* Every parity is right */ + rbio_orig_end_io(rbio, 0, 0); + return; + } + + atomic_set(&rbio->stripes_pending, nr_data); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_parity_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) +{ + if (stripe >= 0 && stripe < rbio->nr_data) + return 1; + return 0; +} + +/* + * While we're doing the parity check and repair, we could have errors + * in reading pages off the disk. This checks for errors and if we're + * not able to read the page it'll trigger parity reconstruction. The + * parity scrub will be finished after we've reconstructed the failed + * stripes + */ +static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +{ + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + goto cleanup; + + if (rbio->faila >= 0 || rbio->failb >= 0) { + int dfail = 0, failp = -1; + + if (is_data_stripe(rbio, rbio->faila)) + dfail++; + else if (is_parity_stripe(rbio->faila)) + failp = rbio->faila; + + if (is_data_stripe(rbio, rbio->failb)) + dfail++; + else if (is_parity_stripe(rbio->failb)) + failp = rbio->failb; + + /* + * Because we can not use a scrubbing parity to repair + * the data, so the capability of the repair is declined. + * (In the case of RAID5, we can not repair anything) + */ + if (dfail > rbio->bbio->max_errors - 1) + goto cleanup; + + /* + * If all data is good, only parity is correctly, just + * repair the parity. + */ + if (dfail == 0) { + finish_parity_scrub(rbio, 0); + return; + } + + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity + * is scrubbing parity, luckly, use the other one to repair + * the data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) + goto cleanup; + + __raid_recover_end_io(rbio); + } else { + finish_parity_scrub(rbio, 1); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_parity_scrub_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_parity_scrub(rbio); +} + +static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + int pagenr; + int stripe; + struct bio *bio; + + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; + + bio_list_init(&bio_list); + + atomic_set(&rbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&rbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid56_parity_scrub_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return; + +finish: + validate_rbio_for_parity_scrub(rbio); +} + +static void scrub_parity_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_parity_scrub_stripe(rbio); +} + +static void async_scrub_parity(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + scrub_parity_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); +} + +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) +{ + if (!lock_stripe_add(rbio)) + async_scrub_parity(rbio); +} diff --git a/kernel/fs/btrfs/raid56.h b/kernel/fs/btrfs/raid56.h new file mode 100644 index 000000000..2b5d7977d --- /dev/null +++ b/kernel/fs/btrfs/raid56.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_RAID56__ +#define __BTRFS_RAID56__ +static inline int nr_parity_stripes(struct map_lookup *map) +{ + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 1; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return 2; + else + return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ + return map->num_stripes - nr_parity_stripes(map); +} +#define RAID5_P_STRIPE ((u64)-2) +#define RAID6_Q_STRIPE ((u64)-1) + +#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ + ((x) == RAID6_Q_STRIPE)) + +struct btrfs_raid_bio; +struct btrfs_device; + +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io); +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 stripe_len); + +struct btrfs_raid_bio * +raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); +void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, + struct page *page, u64 logical); +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); + +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +#endif diff --git a/kernel/fs/btrfs/rcu-string.h b/kernel/fs/btrfs/rcu-string.h new file mode 100644 index 000000000..9e111e457 --- /dev/null +++ b/kernel/fs/btrfs/rcu-string.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +struct rcu_string { + struct rcu_head rcu; + char str[0]; +}; + +static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) +{ + size_t len = strlen(src) + 1; + struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + + (len * sizeof(char)), mask); + if (!ret) + return ret; + strncpy(ret->str, src, len); + return ret; +} + +static inline void rcu_string_free(struct rcu_string *str) +{ + if (str) + kfree_rcu(str, rcu); +} + +#define printk_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define printk_ratelimited_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk_ratelimited(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define rcu_str_deref(rcu_str) ({ \ + struct rcu_string *__str = rcu_dereference(rcu_str); \ + __str->str; \ +}) diff --git a/kernel/fs/btrfs/reada.c b/kernel/fs/btrfs/reada.c new file mode 100644 index 000000000..0e7beea92 --- /dev/null +++ b/kernel/fs/btrfs/reada.c @@ -0,0 +1,992 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "transaction.h" +#include "dev-replace.h" + +#undef DEBUG + +/* + * This is the implementation for the generic read ahead framework. + * + * To trigger a readahead, btrfs_reada_add must be called. It will start + * a read ahead for the given range [start, end) on tree root. The returned + * handle can either be used to wait on the readahead to finish + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). + * + * The read ahead works as follows: + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. + * reada_start_machine will then search for extents to prefetch and trigger + * some reads. When a read finishes for a node, all contained node/leaf + * pointers that lie in the given range will also be enqueued. The reads will + * be triggered in sequential order, thus giving a big win over a naive + * enumeration. It will also make use of multi-device layouts. Each disk + * will have its on read pointer and all disks will by utilized in parallel. + * Also will no two disks read both sides of a mirror simultaneously, as this + * would waste seeking capacity. Instead both disks will read different parts + * of the filesystem. + * Any number of readaheads can be started in parallel. The read order will be + * determined globally, i.e. 2 parallel readaheads will normally finish faster + * than the 2 started one after another. + */ + +#define MAX_IN_FLIGHT 6 + +struct reada_extctl { + struct list_head list; + struct reada_control *rc; + u64 generation; +}; + +struct reada_extent { + u64 logical; + struct btrfs_key top; + int err; + struct list_head extctl; + int refcnt; + spinlock_t lock; + struct reada_zone *zones[BTRFS_MAX_MIRRORS]; + int nzones; + struct btrfs_device *scheduled_for; +}; + +struct reada_zone { + u64 start; + u64 end; + u64 elems; + struct list_head list; + spinlock_t lock; + int locked; + struct btrfs_device *device; + struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl + * self */ + int ndevs; + struct kref refcnt; +}; + +struct reada_machine_work { + struct btrfs_work work; + struct btrfs_fs_info *fs_info; +}; + +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); +static void reada_control_release(struct kref *kref); +static void reada_zone_release(struct kref *kref); +static void reada_start_machine(struct btrfs_fs_info *fs_info); +static void __reada_start_machine(struct btrfs_fs_info *fs_info); + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation); + +/* recurses */ +/* in case of err, eb might be NULL */ +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int level = 0; + int nritems; + int i; + u64 bytenr; + u64 generation; + struct reada_extent *re; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head list; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct btrfs_device *for_dev; + + if (eb) + level = btrfs_header_level(eb); + + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + re->refcnt++; + spin_unlock(&fs_info->reada_lock); + + if (!re) + return -1; + + spin_lock(&re->lock); + /* + * just take the full list from the extent. afterwards we + * don't need the lock anymore + */ + list_replace_init(&re->extctl, &list); + for_dev = re->scheduled_for; + re->scheduled_for = NULL; + spin_unlock(&re->lock); + + if (err == 0) { + nritems = level ? btrfs_header_nritems(eb) : 0; + generation = btrfs_header_generation(eb); + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + } else { + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + nritems = 0; + generation = 0; + } + + for (i = 0; i < nritems; i++) { + struct reada_extctl *rec; + u64 n_gen; + struct btrfs_key key; + struct btrfs_key next_key; + + btrfs_node_key_to_cpu(eb, &key, i); + if (i + 1 < nritems) + btrfs_node_key_to_cpu(eb, &next_key, i + 1); + else + next_key = re->top; + bytenr = btrfs_node_blockptr(eb, i); + n_gen = btrfs_node_ptr_generation(eb, i); + + list_for_each_entry(rec, &list, list) { + struct reada_control *rc = rec->rc; + + /* + * if the generation doesn't match, just ignore this + * extctl. This will probably cut off a branch from + * prefetch. Alternatively one could start a new (sub-) + * prefetch for this branch, starting again from root. + * FIXME: move the generation check out of this loop + */ +#ifdef DEBUG + if (rec->generation != generation) { + btrfs_debug(root->fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", + key.objectid, key.type, key.offset, + rec->generation, generation); + } +#endif + if (rec->generation == generation && + btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && + btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) + reada_add_block(rc, bytenr, &next_key, + level - 1, n_gen); + } + } + /* + * free extctl records + */ + while (!list_empty(&list)) { + struct reada_control *rc; + struct reada_extctl *rec; + + rec = list_first_entry(&list, struct reada_extctl, list); + list_del(&rec->list); + rc = rec->rc; + kfree(rec); + + kref_get(&rc->refcnt); + if (atomic_dec_and_test(&rc->elems)) { + kref_put(&rc->refcnt, reada_control_release); + wake_up(&rc->wait); + } + kref_put(&rc->refcnt, reada_control_release); + + reada_extent_put(fs_info, re); /* one ref for each entry */ + } + reada_extent_put(fs_info, re); /* our ref */ + if (for_dev) + atomic_dec(&for_dev->reada_in_flight); + + return 0; +} + +/* + * start is passed separately in case eb in NULL, which may be the case with + * failed I/O + */ +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int ret; + + ret = __readahead_hook(root, eb, start, err); + + reada_start_machine(root->fs_info); + + return ret; +} + +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, u64 logical, + struct btrfs_bio *bbio) +{ + int ret; + struct reada_zone *zone; + struct btrfs_block_group_cache *cache = NULL; + u64 start; + u64 end; + int i; + + zone = NULL; + spin_lock(&fs_info->reada_lock); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (ret == 1) { + if (logical >= zone->start && logical < zone->end) + return zone; + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return NULL; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + btrfs_put_block_group(cache); + + zone = kzalloc(sizeof(*zone), GFP_NOFS); + if (!zone) + return NULL; + + zone->start = start; + zone->end = end; + INIT_LIST_HEAD(&zone->list); + spin_lock_init(&zone->lock); + zone->locked = 0; + kref_init(&zone->refcnt); + zone->elems = 0; + zone->device = dev; /* our device always sits at index 0 */ + for (i = 0; i < bbio->num_stripes; ++i) { + /* bounds have already been checked */ + zone->devs[i] = bbio->stripes[i].dev; + } + zone->ndevs = bbio->num_stripes; + + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&dev->reada_zones, + (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), + zone); + + if (ret == -EEXIST) { + kfree(zone); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + } + spin_unlock(&fs_info->reada_lock); + + return zone; +} + +static struct reada_extent *reada_find_extent(struct btrfs_root *root, + u64 logical, + struct btrfs_key *top, int level) +{ + int ret; + struct reada_extent *re = NULL; + struct reada_extent *re_exist = NULL; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *dev; + struct btrfs_device *prev_dev; + u32 blocksize; + u64 length; + int nzones = 0; + int i; + unsigned long index = logical >> PAGE_CACHE_SHIFT; + int dev_replace_is_ongoing; + + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + re->refcnt++; + spin_unlock(&fs_info->reada_lock); + + if (re) + return re; + + re = kzalloc(sizeof(*re), GFP_NOFS); + if (!re) + return NULL; + + blocksize = root->nodesize; + re->logical = logical; + re->top = *top; + INIT_LIST_HEAD(&re->extctl); + spin_lock_init(&re->lock); + re->refcnt = 1; + + /* + * map block + */ + length = blocksize; + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, + &bbio, 0); + if (ret || !bbio || length < blocksize) + goto error; + + if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { + btrfs_err(root->fs_info, + "readahead: more than %d copies not supported", + BTRFS_MAX_MIRRORS); + goto error; + } + + for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { + struct reada_zone *zone; + + dev = bbio->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, bbio); + if (!zone) + break; + + re->zones[nzones] = zone; + spin_lock(&zone->lock); + if (!zone->elems) + kref_get(&zone->refcnt); + ++zone->elems; + spin_unlock(&zone->lock); + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + re->nzones = nzones; + if (nzones == 0) { + /* not a single zone found, error and out */ + goto error; + } + + /* insert extent in reada_tree + all per-device trees, all or nothing */ + btrfs_dev_replace_lock(&fs_info->dev_replace); + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret == -EEXIST) { + re_exist = radix_tree_lookup(&fs_info->reada_tree, index); + BUG_ON(!re_exist); + re_exist->refcnt++; + spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); + goto error; + } + if (ret) { + spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); + goto error; + } + prev_dev = NULL; + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( + &fs_info->dev_replace); + for (i = 0; i < nzones; ++i) { + dev = bbio->stripes[i].dev; + if (dev == prev_dev) { + /* + * in case of DUP, just add the first zone. As both + * are on the same device, there's nothing to gain + * from adding both. + * Also, it wouldn't work, as the tree is per device + * and adding would fail with EEXIST + */ + continue; + } + if (!dev->bdev) { + /* + * cannot read ahead on missing device, but for RAID5/6, + * REQ_GET_READ_MIRRORS return 1. So don't skip missing + * device for such case. + */ + if (nzones > 1) + continue; + } + if (dev_replace_is_ongoing && + dev == fs_info->dev_replace.tgtdev) { + /* + * as this device is selected for reading only as + * a last resort, skip it for read ahead. + */ + continue; + } + prev_dev = dev; + ret = radix_tree_insert(&dev->reada_extents, index, re); + if (ret) { + while (--i >= 0) { + dev = bbio->stripes[i].dev; + BUG_ON(dev == NULL); + /* ignore whether the entry was inserted */ + radix_tree_delete(&dev->reada_extents, index); + } + BUG_ON(fs_info == NULL); + radix_tree_delete(&fs_info->reada_tree, index); + spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); + goto error; + } + } + spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); + + btrfs_put_bbio(bbio); + return re; + +error: + while (nzones) { + struct reada_zone *zone; + + --nzones; + zone = re->zones[nzones]; + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* + * no fs_info->reada_lock needed, as this can't be + * the last ref + */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + btrfs_put_bbio(bbio); + kfree(re); + return re_exist; +} + +static void reada_extent_put(struct btrfs_fs_info *fs_info, + struct reada_extent *re) +{ + int i; + unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + + spin_lock(&fs_info->reada_lock); + if (--re->refcnt) { + spin_unlock(&fs_info->reada_lock); + return; + } + + radix_tree_delete(&fs_info->reada_tree, index); + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + radix_tree_delete(&zone->device->reada_extents, index); + } + + spin_unlock(&fs_info->reada_lock); + + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* no fs_info->reada_lock needed, as this can't be + * the last ref */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + if (re->scheduled_for) + atomic_dec(&re->scheduled_for->reada_in_flight); + + kfree(re); +} + +static void reada_zone_release(struct kref *kref) +{ + struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + + radix_tree_delete(&zone->device->reada_zones, + zone->end >> PAGE_CACHE_SHIFT); + + kfree(zone); +} + +static void reada_control_release(struct kref *kref) +{ + struct reada_control *rc = container_of(kref, struct reada_control, + refcnt); + + kfree(rc); +} + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation) +{ + struct btrfs_root *root = rc->root; + struct reada_extent *re; + struct reada_extctl *rec; + + re = reada_find_extent(root, logical, top, level); /* takes one ref */ + if (!re) + return -1; + + rec = kzalloc(sizeof(*rec), GFP_NOFS); + if (!rec) { + reada_extent_put(root->fs_info, re); + return -1; + } + + rec->rc = rc; + rec->generation = generation; + atomic_inc(&rc->elems); + + spin_lock(&re->lock); + list_add_tail(&rec->list, &re->extctl); + spin_unlock(&re->lock); + + /* leave the ref on the extent */ + + return 0; +} + +/* + * called with fs_info->reada_lock held + */ +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) +{ + int i; + unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + + for (i = 0; i < zone->ndevs; ++i) { + struct reada_zone *peer; + peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); + if (peer && peer->device != zone->device) + peer->locked = lock; + } +} + +/* + * called with fs_info->reada_lock held + */ +static int reada_pick_zone(struct btrfs_device *dev) +{ + struct reada_zone *top_zone = NULL; + struct reada_zone *top_locked_zone = NULL; + u64 top_elems = 0; + u64 top_locked_elems = 0; + unsigned long index = 0; + int ret; + + if (dev->reada_curr_zone) { + reada_peer_zones_set_lock(dev->reada_curr_zone, 0); + kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); + dev->reada_curr_zone = NULL; + } + /* pick the zone with the most elements */ + while (1) { + struct reada_zone *zone; + + ret = radix_tree_gang_lookup(&dev->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + if (zone->locked) { + if (zone->elems > top_locked_elems) { + top_locked_elems = zone->elems; + top_locked_zone = zone; + } + } else { + if (zone->elems > top_elems) { + top_elems = zone->elems; + top_zone = zone; + } + } + } + if (top_zone) + dev->reada_curr_zone = top_zone; + else if (top_locked_zone) + dev->reada_curr_zone = top_locked_zone; + else + return 0; + + dev->reada_next = dev->reada_curr_zone->start; + kref_get(&dev->reada_curr_zone->refcnt); + reada_peer_zones_set_lock(dev->reada_curr_zone, 1); + + return 1; +} + +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct reada_extent *re = NULL; + int mirror_num = 0; + struct extent_buffer *eb = NULL; + u64 logical; + int ret; + int i; + int need_kick = 0; + + spin_lock(&fs_info->reada_lock); + if (dev->reada_curr_zone == NULL) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + } + /* + * FIXME currently we issue the reads one extent at a time. If we have + * a contiguous block of extents, we could also coagulate them or use + * plugging to speed things up + */ + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + re = NULL; + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + } + if (ret == 0) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + dev->reada_next = re->logical + fs_info->tree_root->nodesize; + re->refcnt++; + + spin_unlock(&fs_info->reada_lock); + + /* + * find mirror num + */ + for (i = 0; i < re->nzones; ++i) { + if (re->zones[i]->device == dev) { + mirror_num = i + 1; + break; + } + } + logical = re->logical; + + spin_lock(&re->lock); + if (re->scheduled_for == NULL) { + re->scheduled_for = dev; + need_kick = 1; + } + spin_unlock(&re->lock); + + reada_extent_put(fs_info, re); + + if (!need_kick) + return 0; + + atomic_inc(&dev->reada_in_flight); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, + mirror_num, &eb); + if (ret) + __readahead_hook(fs_info->extent_root, NULL, logical, ret); + else if (eb) + __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + + if (eb) + free_extent_buffer(eb); + + return 1; + +} + +static void reada_start_machine_worker(struct btrfs_work *work) +{ + struct reada_machine_work *rmw; + struct btrfs_fs_info *fs_info; + int old_ioprio; + + rmw = container_of(work, struct reada_machine_work, work); + fs_info = rmw->fs_info; + + kfree(rmw); + + old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), + task_nice_ioprio(current)); + set_task_ioprio(current, BTRFS_IOPRIO_READA); + __reada_start_machine(fs_info); + set_task_ioprio(current, old_ioprio); +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 enqueued; + u64 total = 0; + int i; + + do { + enqueued = 0; + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (atomic_read(&device->reada_in_flight) < + MAX_IN_FLIGHT) + enqueued += reada_start_machine_dev(fs_info, + device); + } + total += enqueued; + } while (enqueued && total < 10000); + + if (enqueued == 0) + return; + + /* + * If everything is already in the cache, this is effectively single + * threaded. To a) not hold the caller for too long and b) to utilize + * more cores, we broke the loop above after 10000 iterations and now + * enqueue to workers to finish it. This will distribute the load to + * the cores. + */ + for (i = 0; i < 2; ++i) + reada_start_machine(fs_info); +} + +static void reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct reada_machine_work *rmw; + + rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + if (!rmw) { + /* FIXME we cannot handle this properly right now */ + BUG(); + } + btrfs_init_work(&rmw->work, btrfs_readahead_helper, + reada_start_machine_worker, NULL, NULL); + rmw->fs_info = fs_info; + + btrfs_queue_work(fs_info->readahead_workers, &rmw->work); +} + +#ifdef DEBUG +static void dump_devs(struct btrfs_fs_info *fs_info, int all) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + unsigned long index; + int ret; + int i; + int j; + int cnt; + + spin_lock(&fs_info->reada_lock); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, + atomic_read(&device->reada_in_flight)); + index = 0; + while (1) { + struct reada_zone *zone; + ret = radix_tree_gang_lookup(&device->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " + "%d devs", zone->start, zone->end, zone->elems, + zone->locked); + for (j = 0; j < zone->ndevs; ++j) { + printk(KERN_CONT " %lld", + zone->devs[j]->devid); + } + if (device->reada_curr_zone == zone) + printk(KERN_CONT " curr off %llu", + device->reada_next - zone->start); + printk(KERN_CONT "\n"); + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + } + cnt = 0; + index = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&device->reada_extents, + (void **)&re, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG + " re: logical %llu size %u empty %d for %lld", + re->logical, fs_info->tree_root->nodesize, + list_empty(&re->extctl), re->scheduled_for ? + re->scheduled_for->devid : -1); + + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + if (++cnt > 15) + break; + } + } + + index = 0; + cnt = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, + index, 1); + if (ret == 0) + break; + if (!re->scheduled_for) { + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + continue; + } + printk(KERN_DEBUG + "re: logical %llu size %u list empty %d for %lld", + re->logical, fs_info->tree_root->nodesize, + list_empty(&re->extctl), + re->scheduled_for ? re->scheduled_for->devid : -1); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + } + spin_unlock(&fs_info->reada_lock); +} +#endif + +/* + * interface + */ +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *key_start, struct btrfs_key *key_end) +{ + struct reada_control *rc; + u64 start; + u64 generation; + int level; + struct extent_buffer *node; + static struct btrfs_key max_key = { + .objectid = (u64)-1, + .type = (u8)-1, + .offset = (u64)-1 + }; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return ERR_PTR(-ENOMEM); + + rc->root = root; + rc->key_start = *key_start; + rc->key_end = *key_end; + atomic_set(&rc->elems, 0); + init_waitqueue_head(&rc->wait); + kref_init(&rc->refcnt); + kref_get(&rc->refcnt); /* one ref for having elements */ + + node = btrfs_root_node(root); + start = node->start; + level = btrfs_header_level(node); + generation = btrfs_header_generation(node); + free_extent_buffer(node); + + if (reada_add_block(rc, start, &max_key, level, generation)) { + kfree(rc); + return ERR_PTR(-ENOMEM); + } + + reada_start_machine(root->fs_info); + + return rc; +} + +#ifdef DEBUG +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + 5 * HZ); + dump_devs(rc->root->fs_info, + atomic_read(&rc->elems) < 10 ? 1 : 0); + } + + dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#else +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event(rc->wait, atomic_read(&rc->elems) == 0); + } + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#endif + +void btrfs_reada_detach(void *handle) +{ + struct reada_control *rc = handle; + + kref_put(&rc->refcnt, reada_control_release); +} diff --git a/kernel/fs/btrfs/relocation.c b/kernel/fs/btrfs/relocation.c new file mode 100644 index 000000000..74b24b01d --- /dev/null +++ b/kernel/fs/btrfs/relocation.c @@ -0,0 +1,4658 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "btrfs_inode.h" +#include "async-thread.h" +#include "free-space-cache.h" +#include "inode-map.h" + +/* + * backref_node, mapping_node and tree_block start with this + */ +struct tree_entry { + struct rb_node rb_node; + u64 bytenr; +}; + +/* + * present a tree block in the backref cache + */ +struct backref_node { + struct rb_node rb_node; + u64 bytenr; + + u64 new_bytenr; + /* objectid of tree block owner, can be not uptodate */ + u64 owner; + /* link to pending, changed or detached list */ + struct list_head list; + /* list of upper level blocks reference this block */ + struct list_head upper; + /* list of child blocks in the cache */ + struct list_head lower; + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* extent buffer got by COW the block */ + struct extent_buffer *eb; + /* level of tree block */ + unsigned int level:8; + /* is the block in non-reference counted tree */ + unsigned int cowonly:1; + /* 1 if no child node in the cache */ + unsigned int lowest:1; + /* is the extent buffer locked */ + unsigned int locked:1; + /* has the block been processed */ + unsigned int processed:1; + /* have backrefs of this block been checked */ + unsigned int checked:1; + /* + * 1 if corresponding block has been cowed but some upper + * level block pointers may not point to the new location + */ + unsigned int pending:1; + /* + * 1 if the backref node isn't connected to any other + * backref node. + */ + unsigned int detached:1; +}; + +/* + * present a block pointer in the backref cache + */ +struct backref_edge { + struct list_head list[2]; + struct backref_node *node[2]; +}; + +#define LOWER 0 +#define UPPER 1 +#define RELOCATION_RESERVED_NODES 256 + +struct backref_cache { + /* red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* for passing backref nodes to btrfs_reloc_cow_block */ + struct backref_node *path[BTRFS_MAX_LEVEL]; + /* + * list of blocks that have been cowed but some block + * pointers in upper level blocks may not reflect the + * new location + */ + struct list_head pending[BTRFS_MAX_LEVEL]; + /* list of backref nodes with no child node */ + struct list_head leaves; + /* list of blocks that have been cowed in current transaction */ + struct list_head changed; + /* list of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; +}; + +/* + * map address of tree root to tree + */ +struct mapping_node { + struct rb_node rb_node; + u64 bytenr; + void *data; +}; + +struct mapping_tree { + struct rb_root rb_root; + spinlock_t lock; +}; + +/* + * present a tree block to process + */ +struct tree_block { + struct rb_node rb_node; + u64 bytenr; + struct btrfs_key key; + unsigned int level:8; + unsigned int key_ready:1; +}; + +#define MAX_EXTENTS 128 + +struct file_extent_cluster { + u64 start; + u64 end; + u64 boundary[MAX_EXTENTS]; + unsigned int nr; +}; + +struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; + /* extent tree */ + struct btrfs_root *extent_root; + /* inode for moving data */ + struct inode *data_inode; + + struct btrfs_block_rsv *block_rsv; + + struct backref_cache backref_cache; + + struct file_extent_cluster cluster; + /* tree blocks have been processed */ + struct extent_io_tree processed_blocks; + /* map start of tree root to corresponding reloc tree */ + struct mapping_tree reloc_root_tree; + /* list of reloc trees */ + struct list_head reloc_roots; + /* size of metadata reservation for merging reloc trees */ + u64 merging_rsv_size; + /* size of relocated tree nodes */ + u64 nodes_relocated; + /* reserved size for block group relocation*/ + u64 reserved_bytes; + + u64 search_start; + u64 extents_found; + + unsigned int stage:8; + unsigned int create_reloc_tree:1; + unsigned int merge_reloc_tree:1; + unsigned int found_file_extent:1; +}; + +/* stages of data relocation */ +#define MOVE_DATA_EXTENTS 0 +#define UPDATE_DATA_PTRS 1 + +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node); +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node); + +static void mapping_tree_init(struct mapping_tree *tree) +{ + tree->rb_root = RB_ROOT; + spin_lock_init(&tree->lock); +} + +static void backref_cache_init(struct backref_cache *cache) +{ + int i; + cache->rb_root = RB_ROOT; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); +} + +static void backref_cache_cleanup(struct backref_cache *cache) +{ + struct backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct backref_node, lower); + remove_backref_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + BUG_ON(!list_empty(&cache->pending[i])); + BUG_ON(!list_empty(&cache->changed)); + BUG_ON(!list_empty(&cache->detached)); + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + BUG_ON(cache->nr_nodes); + BUG_ON(cache->nr_edges); +} + +static struct backref_node *alloc_backref_node(struct backref_cache *cache) +{ + struct backref_node *node; + + node = kzalloc(sizeof(*node), GFP_NOFS); + if (node) { + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + } + return node; +} + +static void free_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + if (node) { + cache->nr_nodes--; + kfree(node); + } +} + +static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) +{ + struct backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; +} + +static void free_backref_edge(struct backref_cache *cache, + struct backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct tree_entry *entry; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) +{ + + struct btrfs_fs_info *fs_info = NULL; + struct backref_node *bnode = rb_entry(rb_node, struct backref_node, + rb_node); + if (bnode->root) + fs_info = bnode->root->fs_info; + btrfs_panic(fs_info, errno, "Inconsistency in backref cache " + "found at offset %llu", bytenr); +} + +/* + * walk up backref nodes until reach node presents tree root + */ +static struct backref_node *walk_up_backref(struct backref_node *node, + struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + int idx = *index; + + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, + struct backref_edge, list[LOWER]); + edges[idx++] = edge; + node = edge->node[UPPER]; + } + BUG_ON(node->detached); + *index = idx; + return node; +} + +/* + * walk down backref nodes to find start of next reference path + */ +static struct backref_node *walk_down_backref(struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + struct backref_node *lower; + int idx = *index; + + while (idx > 0) { + edge = edges[idx - 1]; + lower = edge->node[LOWER]; + if (list_is_last(&edge->list[LOWER], &lower->upper)) { + idx--; + continue; + } + edge = list_entry(edge->list[LOWER].next, + struct backref_edge, list[LOWER]); + edges[idx - 1] = edge; + *index = idx; + return edge->node[UPPER]; + } + *index = 0; + return NULL; +} + +static void unlock_node_buffer(struct backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +static void drop_node_buffer(struct backref_node *node) +{ + if (node->eb) { + unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +static void drop_backref_node(struct backref_cache *tree, + struct backref_node *node) +{ + BUG_ON(!list_empty(&node->upper)); + + drop_node_buffer(node); + list_del(&node->list); + list_del(&node->lower); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + free_backref_node(tree, node); +} + +/* + * remove a backref node from the backref cache + */ +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + struct backref_node *upper; + struct backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest && !node->detached); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + free_backref_edge(cache, edge); + + if (RB_EMPTY_NODE(&upper->rb_node)) { + BUG_ON(!list_empty(&node->upper)); + drop_backref_node(cache, node); + node = upper; + node->lowest = 1; + continue; + } + /* + * add the node to leaf node list if no other + * child block cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, &cache->leaves); + upper->lowest = 1; + } + } + + drop_backref_node(cache, node); +} + +static void update_backref_node(struct backref_cache *cache, + struct backref_node *node, u64 bytenr) +{ + struct rb_node *rb_node; + rb_erase(&node->rb_node, &cache->rb_root); + node->bytenr = bytenr; + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, bytenr); +} + +/* + * update backref cache after a transaction commit + */ +static int update_backref_cache(struct btrfs_trans_handle *trans, + struct backref_cache *cache) +{ + struct backref_node *node; + int level = 0; + + if (cache->last_trans == 0) { + cache->last_trans = trans->transid; + return 0; + } + + if (cache->last_trans == trans->transid) + return 0; + + /* + * detached nodes are used to avoid unnecessary backref + * lookup. transaction commit changes the extent tree. + * so the detached nodes are no longer useful. + */ + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->changed)) { + node = list_entry(cache->changed.next, + struct backref_node, list); + list_del_init(&node->list); + BUG_ON(node->pending); + update_backref_node(cache, node, node->new_bytenr); + } + + /* + * some nodes can be left in the pending list if there were + * errors during processing the pending nodes. + */ + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + list_for_each_entry(node, &cache->pending[level], list) { + BUG_ON(!node->pending); + if (node->bytenr == node->new_bytenr) + continue; + update_backref_node(cache, node, node->new_bytenr); + } + } + + cache->last_trans = 0; + return 1; +} + + +static int should_ignore_root(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + return 0; + + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; + + if (btrfs_root_last_snapshot(&reloc_root->root_item) == + root->fs_info->running_transaction->transid - 1) + return 0; + /* + * if there is reloc tree and it was created in previous + * transaction backref lookup can find the reloc tree, + * so backref node for the fs tree root is useless for + * relocation. + */ + return 1; +} +/* + * find reloc tree by address of tree root + */ +static struct btrfs_root *find_reloc_root(struct reloc_control *rc, + u64 bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct btrfs_root *root = NULL; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + root = (struct btrfs_root *)node->data; + } + spin_unlock(&rc->reloc_root_tree.lock); + return root; +} + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID || + root_objectid == BTRFS_UUID_TREE_OBJECTID || + root_objectid == BTRFS_QUOTA_TREE_OBJECTID) + return 1; + return 0; +} + +static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_key key; + + key.objectid = root_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(root_objectid)) + key.offset = 0; + else + key.offset = (u64)-1; + + return btrfs_get_fs_root(fs_info, &key, false); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static noinline_for_stack +struct btrfs_root *find_tree_root(struct reloc_control *rc, + struct extent_buffer *leaf, + struct btrfs_extent_ref_v0 *ref0) +{ + struct btrfs_root *root; + u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); + u64 generation = btrfs_ref_generation_v0(leaf, ref0); + + BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); + + root = read_fs_root(rc->extent_root->fs_info, root_objectid); + BUG_ON(IS_ERR(root)); + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + generation != btrfs_root_generation(&root->root_item)) + return NULL; + + return root; +} +#endif + +static noinline_for_stack +int find_inline_backref(struct extent_buffer *leaf, int slot, + unsigned long *ptr, unsigned long *end) +{ + struct btrfs_key key; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + u32 item_size; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + item_size = btrfs_item_size_nr(leaf, slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + return 1; + } +#endif + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + WARN_ON(!(btrfs_extent_flags(leaf, ei) & + BTRFS_EXTENT_FLAG_TREE_BLOCK)); + + if (key.type == BTRFS_EXTENT_ITEM_KEY && + item_size <= sizeof(*ei) + sizeof(*bi)) { + WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); + return 1; + } + if (key.type == BTRFS_METADATA_ITEM_KEY && + item_size <= sizeof(*ei)) { + WARN_ON(item_size < sizeof(*ei)); + return 1; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + } else { + *ptr = (unsigned long)(ei + 1); + } + *end = (unsigned long)ei + item_size; + return 0; +} + +/* + * build backref tree for a given tree block. root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond + * roots of b-trees that reference the tree block. + * + * the basic idea of this function is check backrefs of a given block + * to find upper level blocks that refernece the block, and then check + * bakcrefs of these upper level blocks recursively. the recursion stop + * when tree root is reached or backrefs for the block is cached. + * + * NOTE: if we find backrefs for a block are cached, we know backrefs + * for all upper level blocks that directly/indirectly reference the + * block are also cached. + */ +static noinline_for_stack +struct backref_node *build_backref_tree(struct reloc_control *rc, + struct btrfs_key *node_key, + int level, u64 bytenr) +{ + struct backref_cache *cache = &rc->backref_cache; + struct btrfs_path *path1; + struct btrfs_path *path2; + struct extent_buffer *eb; + struct btrfs_root *root; + struct backref_node *cur; + struct backref_node *upper; + struct backref_node *lower; + struct backref_node *node = NULL; + struct backref_node *exist = NULL; + struct backref_edge *edge; + struct rb_node *rb_node; + struct btrfs_key key; + unsigned long end; + unsigned long ptr; + LIST_HEAD(list); + LIST_HEAD(useless); + int cowonly; + int ret; + int err = 0; + bool need_check = true; + + path1 = btrfs_alloc_path(); + path2 = btrfs_alloc_path(); + if (!path1 || !path2) { + err = -ENOMEM; + goto out; + } + path1->reada = 1; + path2->reada = 2; + + node = alloc_backref_node(cache); + if (!node) { + err = -ENOMEM; + goto out; + } + + node->bytenr = bytenr; + node->level = level; + node->lowest = 1; + cur = node; +again: + end = 0; + ptr = 0; + key.objectid = cur->bytenr; + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + + path1->search_commit_root = 1; + path1->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, + 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + ASSERT(ret); + ASSERT(path1->slots[0]); + + path1->slots[0]--; + + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * the backref was added previously when processing + * backref of type BTRFS_TREE_BLOCK_REF_KEY + */ + ASSERT(list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct backref_edge, + list[LOWER]); + ASSERT(list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * add the upper level block to pending list if we need + * check its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &list); + } else { + exist = NULL; + } + + while (1) { + cond_resched(); + eb = path1->nodes[0]; + + if (ptr >= end) { + if (path1->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + eb = path1->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); + if (key.objectid != cur->bytenr) { + WARN_ON(exist); + break; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + ret = find_inline_backref(eb, path1->slots[0], + &ptr, &end); + if (ret) + goto next; + } + } + + if (ptr < end) { + /* update key for inline back ref */ + struct btrfs_extent_inline_ref *iref; + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && + key.type != BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + goto next; + } + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(eb, path1->slots[0], + struct btrfs_extent_ref_v0); + if (key.objectid == key.offset) { + root = find_tree_root(rc, eb, ref0); + if (root && !should_ignore_root(root)) + cur->root = root; + else + list_add(&cur->list, &useless); + break; + } + if (is_cowonly_root(btrfs_ref_root_v0(eb, + ref0))) + cur->cowonly = 1; + } +#else + ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { +#endif + if (key.objectid == key.offset) { + /* + * only root blocks of reloc trees use + * backref of this type. + */ + root = find_reloc_root(rc, cur->bytenr); + ASSERT(root); + cur->root = root; + break; + } + + edge = alloc_backref_edge(cache); + if (!edge) { + err = -ENOMEM; + goto out; + } + rb_node = tree_search(&cache->rb_root, key.offset); + if (!rb_node) { + upper = alloc_backref_node(cache); + if (!upper) { + free_backref_edge(cache, edge); + err = -ENOMEM; + goto out; + } + upper->bytenr = key.offset; + upper->level = cur->level + 1; + /* + * backrefs for the upper level block isn't + * cached, add the block to pending list + */ + list_add_tail(&edge->list[UPPER], &list); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + ASSERT(upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add_tail(&edge->list[LOWER], &cur->upper); + edge->node[LOWER] = cur; + edge->node[UPPER] = upper; + + goto next; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + goto next; + } + + /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + root = read_fs_root(rc->extent_root->fs_info, key.offset); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + cur->cowonly = 1; + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* tree root */ + ASSERT(btrfs_root_bytenr(&root->root_item) == + cur->bytenr); + if (should_ignore_root(root)) + list_add(&cur->list, &useless); + else + cur->root = root; + break; + } + + level = cur->level + 1; + + /* + * searching the tree to find upper level blocks + * reference the block. + */ + path2->search_commit_root = 1; + path2->skip_locking = 1; + path2->lowest_level = level; + ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); + path2->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0 && path2->slots[level] > 0) + path2->slots[level]--; + + eb = path2->nodes[level]; + WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != + cur->bytenr); + + lower = cur; + need_check = true; + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path2->nodes[level]) { + ASSERT(btrfs_root_bytenr(&root->root_item) == + lower->bytenr); + if (should_ignore_root(root)) + list_add(&lower->list, &useless); + else + lower->root = root; + break; + } + + edge = alloc_backref_edge(cache); + if (!edge) { + err = -ENOMEM; + goto out; + } + + eb = path2->nodes[level]; + rb_node = tree_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = alloc_backref_node(cache); + if (!upper) { + free_backref_edge(cache, edge); + err = -ENOMEM; + goto out; + } + upper->bytenr = eb->start; + upper->owner = btrfs_header_owner(eb); + upper->level = lower->level + 1; + if (!test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) + upper->cowonly = 1; + + /* + * if we know the block isn't shared + * we can void checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * add the block to pending list if we + * need check its backrefs, we only do this once + * while walking up a tree as we will catch + * anything else later on. + */ + if (!upper->checked && need_check) { + need_check = false; + list_add_tail(&edge->list[UPPER], + &list); + } else { + if (upper->checked) + need_check = true; + INIT_LIST_HEAD(&edge->list[UPPER]); + } + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + ASSERT(upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); + } + list_add_tail(&edge->list[LOWER], &lower->upper); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + + if (rb_node) + break; + lower = upper; + upper = NULL; + } + btrfs_release_path(path2); +next: + if (ptr < end) { + ptr += btrfs_extent_inline_ref_size(key.type); + if (ptr >= end) { + WARN_ON(ptr > end); + ptr = 0; + end = 0; + } + } + if (ptr >= end) + path1->slots[0]++; + } + btrfs_release_path(path1); + + cur->checked = 1; + WARN_ON(exist); + + /* the pending list isn't empty, take the first block to process */ + if (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; + goto again; + } + + /* + * everything goes well, connect backref nodes and insert backref nodes + * into the cache. + */ + ASSERT(node->checked); + cowonly = node->cowonly; + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, node->bytenr, + &node->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); + list_add_tail(&node->lower, &cache->leaves); + } + + list_for_each_entry(edge, &node->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + + while (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + if (upper->detached) { + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + continue; + } + + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + if (!upper->checked) { + /* + * Still want to blow up for developers since this is a + * logic bug. + */ + ASSERT(0); + err = -EINVAL; + goto out; + } + if (cowonly != upper->cowonly) { + ASSERT(0); + err = -EINVAL; + goto out; + } + + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + upper->bytenr); + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + } + /* + * process useless backref nodes. backref nodes for tree leaves + * are deleted from the cache. backref nodes for upper level + * tree blocks are left in the cache to avoid unnecessary backref + * lookup. + */ + while (!list_empty(&useless)) { + upper = list_entry(useless.next, struct backref_node, list); + list_del_init(&upper->list); + ASSERT(list_empty(&upper->upper)); + if (upper == node) + node = NULL; + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + while (!list_empty(&upper->lower)) { + edge = list_entry(upper->lower.next, + struct backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + } + __mark_block_processed(rc, upper); + if (upper->level > 0) { + list_add(&upper->list, &cache->detached); + upper->detached = 1; + } else { + rb_erase(&upper->rb_node, &cache->rb_root); + free_backref_node(cache, upper); + } + } +out: + btrfs_free_path(path1); + btrfs_free_path(path2); + if (err) { + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, list); + list_del_init(&lower->list); + } + while (!list_empty(&list)) { + edge = list_first_entry(&list, struct backref_edge, + list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + upper = edge->node[UPPER]; + free_backref_edge(cache, edge); + + /* + * Lower is no longer linked to any upper backref nodes + * and isn't in the cache, we can free it ourselves. + */ + if (list_empty(&lower->upper) && + RB_EMPTY_NODE(&lower->rb_node)) + list_add(&lower->list, &useless); + + if (!RB_EMPTY_NODE(&upper->rb_node)) + continue; + + /* Add this guy's upper edges to the list to proces */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + if (list_empty(&upper->upper)) + list_add(&upper->list, &useless); + } + + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, list); + list_del_init(&lower->list); + free_backref_node(cache, lower); + } + return ERR_PTR(err); + } + ASSERT(!node || !node->detached); + return node; +} + +/* + * helper to add backref node for the newly created snapshot. + * the backref node is created by cloning backref node that + * corresponds to root of source tree + */ +static int clone_backref_node(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *src, + struct btrfs_root *dest) +{ + struct btrfs_root *reloc_root = src->reloc_root; + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node = NULL; + struct backref_node *new_node; + struct backref_edge *edge; + struct backref_edge *new_edge; + struct rb_node *rb_node; + + if (cache->last_trans > 0) + update_backref_cache(trans, cache); + + rb_node = tree_search(&cache->rb_root, src->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, rb_node); + if (node->detached) + node = NULL; + else + BUG_ON(node->new_bytenr != reloc_root->node->start); + } + + if (!node) { + rb_node = tree_search(&cache->rb_root, + reloc_root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(node->detached); + } + } + + if (!node) + return 0; + + new_node = alloc_backref_node(cache); + if (!new_node) + return -ENOMEM; + + new_node->bytenr = dest->node->start; + new_node->level = node->level; + new_node->lowest = node->lowest; + new_node->checked = 1; + new_node->root = dest; + + if (!node->lowest) { + list_for_each_entry(edge, &node->lower, list[UPPER]) { + new_edge = alloc_backref_edge(cache); + if (!new_edge) + goto fail; + + new_edge->node[UPPER] = new_node; + new_edge->node[LOWER] = edge->node[LOWER]; + list_add_tail(&new_edge->list[UPPER], + &new_node->lower); + } + } else { + list_add_tail(&new_node->lower, &cache->leaves); + } + + rb_node = tree_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); + + if (!new_node->lowest) { + list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { + list_add_tail(&new_edge->list[LOWER], + &new_edge->node[LOWER]->upper); + } + } + return 0; +fail: + while (!list_empty(&new_node->lower)) { + new_edge = list_entry(new_node->lower.next, + struct backref_edge, list[UPPER]); + list_del(&new_edge->list[UPPER]); + free_backref_edge(cache, new_edge); + } + free_backref_node(cache, new_node); + return -ENOMEM; +} + +/* + * helper to add 'address of tree root -> reloc tree' mapping + */ +static int __must_check __add_reloc_root(struct btrfs_root *root) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + node = kmalloc(sizeof(*node), GFP_NOFS); + if (!node) + return -ENOMEM; + + node->bytenr = root->node->start; + node->data = root; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) { + btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " + "for start=%llu while inserting into relocation " + "tree", node->bytenr); + kfree(node); + return -EEXIST; + } + + list_add_tail(&root->root_list, &rc->reloc_roots); + return 0; +} + +/* + * helper to delete the 'address of tree root -> reloc tree' + * mapping + */ +static void __del_reloc_root(struct btrfs_root *root) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&root->fs_info->trans_lock); + list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); + kfree(node); +} + +/* + * helper to update the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return 0; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = new_bytenr; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); + return 0; +} + +static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + u64 last_snap = 0; + int ret; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = objectid; + + if (root->root_key.objectid == objectid) { + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + last_snap = btrfs_root_last_snapshot(&root->root_item); + btrfs_set_root_last_snapshot(&root->root_item, + trans->transid - 1); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. + * the source tree is a reloc tree, all tree blocks + * modified after it was created have RELOC flag + * set in their headers. so it's OK to not update + * the 'last_snapshot'. + */ + ret = btrfs_copy_root(trans, root, root->node, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + } + + memcpy(root_item, &root->root_item, sizeof(*root_item)); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + if (root->root_key.objectid == objectid) { + btrfs_set_root_refs(root_item, 0); + memset(&root_item->drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + /* + * abuse rtransid, it is safe because it is impossible to + * receive data into a relocation tree. + */ + btrfs_set_root_rtransid(root_item, last_snap); + btrfs_set_root_otransid(root_item, trans->transid); + } + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key); + BUG_ON(IS_ERR(reloc_root)); + reloc_root->last_trans = trans->transid; + return reloc_root; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + struct btrfs_block_rsv *rsv; + int clear_rsv = 0; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!trans->reloc_reserved) { + rsv = trans->block_rsv; + trans->block_rsv = rc->block_rsv; + clear_rsv = 1; + } + reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + if (clear_rsv) + trans->block_rsv = rsv; + + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); + root->reloc_root = reloc_root; + return 0; +} + +/* + * update root item of reloc tree + */ +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + int ret; + + if (!root->reloc_root) + goto out; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (root->fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { + root->reloc_root = NULL; + __del_reloc_root(reloc_root); + } + + if (reloc_root->commit_root != reloc_root->node) { + btrfs_set_root_node(root_item, reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->commit_root = btrfs_root_node(reloc_root); + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, root_item); + BUG_ON(ret); + +out: + return 0; +} + +/* + * helper to find first cached inode with inode number >= objectid + * in a subvolume + */ +static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(&entry->vfs_inode)) + node = node->rb_left; + else if (objectid > btrfs_ino(&entry->vfs_inode)) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(&entry->vfs_inode)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + return inode; + } + + objectid = btrfs_ino(&entry->vfs_inode) + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return NULL; +} + +static int in_block_group(u64 bytenr, + struct btrfs_block_group_cache *block_group) +{ + if (bytenr >= block_group->key.objectid && + bytenr < block_group->key.objectid + block_group->key.offset) + return 1; + return 0; +} + +/* + * get new location of data + */ +static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bytenr -= BTRFS_I(reloc_inode)->index_cnt; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode), + bytenr, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + BUG_ON(btrfs_file_extent_offset(leaf, fi) || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)); + + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { + ret = -EINVAL; + goto out; + } + + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * update file extent items in the tree leaf to point to + * the new locations. + */ +static noinline_for_stack +int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + u64 parent; + u64 bytenr; + u64 new_bytenr = 0; + u64 num_bytes; + u64 end; + u32 nritems; + u32 i; + int ret = 0; + int first = 1; + int dirty = 0; + + if (rc->stage != UPDATE_DATA_PTRS) + return 0; + + /* reloc trees always use full backref */ + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent = leaf->start; + else + parent = 0; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + if (!in_block_group(bytenr, rc->block_group)) + continue; + + /* + * if we are modifying block in fs tree, wait for readpage + * to complete and drop the extent cache + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (first) { + inode = find_next_inode(root, key.objectid); + first = 0; + } else if (inode && btrfs_ino(inode) < key.objectid) { + btrfs_add_delayed_iput(inode); + inode = find_next_inode(root, key.objectid); + } + if (inode && btrfs_ino(inode) == key.objectid) { + end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + WARN_ON(!IS_ALIGNED(key.offset, + root->sectorsize)); + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end); + if (!ret) + continue; + + btrfs_drop_extent_cache(inode, key.offset, end, + 1); + unlock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end); + } + } + + ret = get_new_location(rc->data_inode, &new_bytenr, + bytenr, num_bytes); + if (ret) { + /* + * Don't have to abort since we've not changed anything + * in the file extent yet. + */ + break; + } + + btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); + dirty = 1; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, + num_bytes, parent, + btrfs_header_owner(leaf), + key.objectid, key.offset, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + break; + } + + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + parent, btrfs_header_owner(leaf), + key.objectid, key.offset, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + break; + } + } + if (dirty) + btrfs_mark_buffer_dirty(leaf); + if (inode) + btrfs_add_delayed_iput(inode); + return ret; +} + +static noinline_for_stack +int memcmp_node_keys(struct extent_buffer *eb, int slot, + struct btrfs_path *path, int level) +{ + struct btrfs_disk_key key1; + struct btrfs_disk_key key2; + btrfs_node_key(eb, &key1, slot); + btrfs_node_key(path->nodes[level], &key2, path->slots[level]); + return memcmp(&key1, &key2, sizeof(key1)); +} + +/* + * try to replace tree blocks in fs tree with the new blocks + * in reloc tree. tree blocks haven't been modified since the + * reloc tree was create can be replaced. + * + * if a block was replaced, level of the block + 1 is returned. + * if no block got replaced, 0 is returned. if there are other + * errors, a negative error number is returned. + */ +static noinline_for_stack +int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + int lowest_level, int max_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 old_bytenr; + u64 new_bytenr; + u64 old_ptr_gen; + u64 new_ptr_gen; + u64 last_snapshot; + u32 blocksize; + int cow = 0; + int level; + int ret; + int slot; + + BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + + last_snapshot = btrfs_root_last_snapshot(&src->root_item); +again: + slot = path->slots[lowest_level]; + btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); + + eb = btrfs_lock_root_node(dest); + btrfs_set_lock_blocking(eb); + level = btrfs_header_level(eb); + + if (level < lowest_level) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return 0; + } + + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + } + btrfs_set_lock_blocking(eb); + + if (next_key) { + next_key->objectid = (u64)-1; + next_key->type = (u8)-1; + next_key->offset = (u64)-1; + } + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + BUG_ON(level < lowest_level); + + ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret && slot > 0) + slot--; + + if (next_key && slot + 1 < btrfs_header_nritems(parent)) + btrfs_node_key_to_cpu(parent, next_key, slot + 1); + + old_bytenr = btrfs_node_blockptr(parent, slot); + blocksize = dest->nodesize; + old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + + if (level <= max_level) { + eb = path->nodes[level]; + new_bytenr = btrfs_node_blockptr(eb, + path->slots[level]); + new_ptr_gen = btrfs_node_ptr_generation(eb, + path->slots[level]); + } else { + new_bytenr = 0; + new_ptr_gen = 0; + } + + if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { + ret = level; + break; + } + + if (new_bytenr == 0 || old_ptr_gen > last_snapshot || + memcmp_node_keys(parent, slot, path, level)) { + if (level <= lowest_level) { + ret = 0; + break; + } + + eb = read_tree_block(dest, old_bytenr, old_ptr_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + ret = (!eb) ? -ENOMEM : -EIO; + free_extent_buffer(eb); + break; + } + btrfs_tree_lock(eb); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); + } + btrfs_set_lock_blocking(eb); + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + + parent = eb; + continue; + } + + if (!cow) { + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + cow = 1; + goto again; + } + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + btrfs_release_path(path); + + path->lowest_level = level; + ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + path->lowest_level = 0; + BUG_ON(ret); + + /* + * swap blocks in fs tree and reloc tree. + */ + btrfs_set_node_blockptr(parent, slot, new_bytenr); + btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); + btrfs_mark_buffer_dirty(parent); + + btrfs_set_node_blockptr(path->nodes[level], + path->slots[level], old_bytenr); + btrfs_set_node_ptr_generation(path->nodes[level], + path->slots[level], old_ptr_gen); + btrfs_mark_buffer_dirty(path->nodes[level]); + + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0, + 1); + BUG_ON(ret); + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0, 1); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0, + 1); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0, 1); + BUG_ON(ret); + + btrfs_unlock_up_safe(path, 0); + + ret = level; + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return ret; +} + +/* + * helper to find next relocated block in reloc tree + */ +static noinline_for_stack +int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb; + int i; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = 0; i < *level; i++) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] + 1 < nritems) { + path->slots[i]++; + if (btrfs_node_ptr_generation(eb, path->slots[i]) <= + last_snapshot) + continue; + + *level = i; + return 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + return 1; +} + +/* + * walk down reloc tree to find relocated block of lowest level + */ +static noinline_for_stack +int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb = NULL; + int i; + u64 bytenr; + u64 ptr_gen = 0; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = *level; i > 0; i--) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] < nritems) { + ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); + if (ptr_gen > last_snapshot) + break; + path->slots[i]++; + } + if (path->slots[i] >= nritems) { + if (i == *level) + break; + *level = i + 1; + return 0; + } + if (i == 1) { + *level = i; + return 0; + } + + bytenr = btrfs_node_blockptr(eb, path->slots[i]); + eb = read_tree_block(root, bytenr, ptr_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + BUG_ON(btrfs_header_level(eb) != i - 1); + path->nodes[i - 1] = eb; + path->slots[i - 1] = 0; + } + return 1; +} + +/* + * invalidate extent cache for file extents whose key in range of + * [min_key, max_key) + */ +static int invalidate_extent_cache(struct btrfs_root *root, + struct btrfs_key *min_key, + struct btrfs_key *max_key) +{ + struct inode *inode = NULL; + u64 objectid; + u64 start, end; + u64 ino; + + objectid = min_key->objectid; + while (1) { + cond_resched(); + iput(inode); + + if (objectid > max_key->objectid) + break; + + inode = find_next_inode(root, objectid); + if (!inode) + break; + ino = btrfs_ino(inode); + + if (ino > max_key->objectid) { + iput(inode); + break; + } + + objectid = ino + 1; + if (!S_ISREG(inode->i_mode)) + continue; + + if (unlikely(min_key->objectid == ino)) { + if (min_key->type > BTRFS_EXTENT_DATA_KEY) + continue; + if (min_key->type < BTRFS_EXTENT_DATA_KEY) + start = 0; + else { + start = min_key->offset; + WARN_ON(!IS_ALIGNED(start, root->sectorsize)); + } + } else { + start = 0; + } + + if (unlikely(max_key->objectid == ino)) { + if (max_key->type < BTRFS_EXTENT_DATA_KEY) + continue; + if (max_key->type > BTRFS_EXTENT_DATA_KEY) { + end = (u64)-1; + } else { + if (max_key->offset == 0) + continue; + end = max_key->offset; + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + } + } else { + end = (u64)-1; + } + + /* the lock_extent waits for readpage to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end); + btrfs_drop_extent_cache(inode, start, end, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + } + return 0; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + level++; + } + return 1; +} + +/* + * merge the relocated tree blocks in reloc tree with corresponding + * fs tree. + */ +static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, + struct btrfs_root *root) +{ + LIST_HEAD(inode_list); + struct btrfs_key key; + struct btrfs_key next_key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int level; + int max_level; + int replaced = 0; + int ret; + int err = 0; + u32 min_reserved; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_root_level(root_item); + extent_buffer_get(reloc_root->node); + path->nodes[level] = reloc_root->node; + path->slots[level] = 0; + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + + btrfs_node_key_to_cpu(path->nodes[level], &next_key, + path->slots[level]); + WARN_ON(memcmp(&key, &next_key, sizeof(key))); + + btrfs_unlock_up_safe(path, 0); + } + + min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + err = ret; + goto out; + } + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + goto out; + } + trans->block_rsv = rc->block_rsv; + + replaced = 0; + max_level = level; + + ret = walk_down_reloc_tree(reloc_root, path, &level); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + + if (!find_next_key(path, level, &key) && + btrfs_comp_cpu_keys(&next_key, &key) >= 0) { + ret = 0; + } else { + ret = replace_path(trans, root, reloc_root, path, + &next_key, level, max_level); + } + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + level = ret; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + replaced = 1; + } + + ret = walk_up_reloc_tree(reloc_root, path, &level); + if (ret > 0) + break; + + BUG_ON(level == 0); + /* + * save the merging progress in the drop_progress. + * this is OK since root refs == 1 in this case. + */ + btrfs_node_key(path->nodes[level], &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + + btrfs_end_transaction_throttle(trans, root); + trans = NULL; + + btrfs_btree_balance_dirty(root); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } + + /* + * handle the case only one block in the fs tree need to be + * relocated and the block is tree root. + */ + leaf = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + if (ret < 0) + err = ret; +out: + btrfs_free_path(path); + + if (err == 0) { + memset(&root_item->drop_progress, 0, + sizeof(root_item->drop_progress)); + root_item->drop_level = 0; + btrfs_set_root_refs(root_item, 0); + btrfs_update_reloc_root(trans, root); + } + + if (trans) + btrfs_end_transaction_throttle(trans, root); + + btrfs_btree_balance_dirty(root); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + + return err; +} + +static noinline_for_stack +int prepare_to_merge(struct reloc_control *rc, int err) +{ + struct btrfs_root *root = rc->extent_root; + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + LIST_HEAD(reloc_roots); + u64 num_bytes = 0; + int ret; + + mutex_lock(&root->fs_info->reloc_mutex); + rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + rc->merging_rsv_size += rc->nodes_relocated * 2; + mutex_unlock(&root->fs_info->reloc_mutex); + +again: + if (!err) { + num_bytes = rc->merging_rsv_size; + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) + err = ret; + } + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + return PTR_ERR(trans); + } + + if (!err) { + if (num_bytes != rc->merging_rsv_size) { + btrfs_end_transaction(trans, rc->extent_root); + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + goto again; + } + } + + rc->merge_reloc_tree = 1; + + while (!list_empty(&rc->reloc_roots)) { + reloc_root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&reloc_root->root_list); + + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + /* + * set reference count to 1, so btrfs_recover_relocation + * knows it should resumes merging + */ + if (!err) + btrfs_set_root_refs(&reloc_root->root_item, 1); + btrfs_update_reloc_root(trans, root); + + list_add(&reloc_root->root_list, &reloc_roots); + } + + list_splice(&reloc_roots, &rc->reloc_roots); + + if (!err) + btrfs_commit_transaction(trans, rc->extent_root); + else + btrfs_end_transaction(trans, rc->extent_root); + return err; +} + +static noinline_for_stack +void free_reloc_roots(struct list_head *list) +{ + struct btrfs_root *reloc_root; + + while (!list_empty(list)) { + reloc_root = list_entry(list->next, struct btrfs_root, + root_list); + __del_reloc_root(reloc_root); + } +} + +static noinline_for_stack +void merge_reloc_roots(struct reloc_control *rc) +{ + struct btrfs_root *root; + struct btrfs_root *reloc_root; + u64 last_snap; + u64 otransid; + u64 objectid; + LIST_HEAD(reloc_roots); + int found = 0; + int ret = 0; +again: + root = rc->extent_root; + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + + while (!list_empty(&reloc_roots)) { + found = 1; + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + ret = merge_reloc_root(rc, root); + if (ret) { + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); + goto out; + } + } else { + list_del_init(&reloc_root->root_list); + } + + /* + * we keep the old last snapshod transid in rtranid when we + * created the relocation tree. + */ + last_snap = btrfs_root_rtransid(&reloc_root->root_item); + otransid = btrfs_root_otransid(&reloc_root->root_item); + objectid = reloc_root->root_key.offset; + + ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + if (ret < 0) { + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); + goto out; + } + } + + if (found) { + found = 0; + goto again; + } +out: + if (ret) { + btrfs_std_error(root->fs_info, ret); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + + /* new reloc root may be added */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + } + + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); +} + +static void free_block_list(struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + while ((rb_node = rb_first(blocks))) { + block = rb_entry(rb_node, struct tree_block, rb_node); + rb_erase(rb_node, blocks); + kfree(block); + } +} + +static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *reloc_root) +{ + struct btrfs_root *root; + + if (reloc_root->last_trans == trans->transid) + return 0; + + root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + return btrfs_record_root_in_trans(trans, root); +} + +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct backref_edge *edges[]) +{ + struct backref_node *next; + struct btrfs_root *root; + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + record_reloc_root_in_trans(trans, root); + break; + } + + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + + if (next->new_bytenr != root->node->start) { + BUG_ON(next->new_bytenr); + BUG_ON(!list_empty(&next->list)); + next->new_bytenr = root->node->start; + next->root = root; + list_add_tail(&next->list, + &rc->backref_cache.changed); + __mark_block_processed(rc, next); + break; + } + + WARN_ON(1); + root = NULL; + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + if (!root) + return NULL; + + next = node; + /* setup backref node path for btrfs_reloc_cow_block */ + while (1) { + rc->backref_cache.path[next->level] = next; + if (--index < 0) + break; + next = edges[index]->node[UPPER]; + } + return root; +} + +/* + * select a tree root for relocation. return NULL if the block + * is reference counted. we should use do_relocation() in this + * case. return a tree root pointer if the block isn't reference + * counted. return -ENOENT if the block is root of reloc tree. + */ +static noinline_for_stack +struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node) +{ + struct backref_node *next; + struct btrfs_root *root; + struct btrfs_root *fs_root = NULL; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + + /* no other choice for non-references counted tree */ + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + return root; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + fs_root = root; + + if (next != node) + return NULL; + + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!fs_root) + return ERR_PTR(-ENOENT); + return fs_root; +} + +static noinline_for_stack +u64 calcu_metadata_size(struct reloc_control *rc, + struct backref_node *node, int reserve) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + u64 num_bytes = 0; + int index = 0; + + BUG_ON(reserve && node->processed); + + while (next) { + cond_resched(); + while (1) { + if (next->processed && (reserve || next != node)) + break; + + num_bytes += rc->extent_root->nodesize; + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } + return num_bytes; +} + +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node) +{ + struct btrfs_root *root = rc->extent_root; + u64 num_bytes; + int ret; + u64 tmp; + + num_bytes = calcu_metadata_size(rc, node, 1) * 2; + + trans->block_rsv = rc->block_rsv; + rc->reserved_bytes += num_bytes; + ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + if (ret == -EAGAIN) { + tmp = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + } + return ret; + } + + return 0; +} + +/* + * relocate a block tree, and then update pointers in upper level + * blocks that reference the block to point to the new location. + * + * if called by link_to_upper, the block has already been relocated. + * in that case this function just updates pointers. + */ +static int do_relocation(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path, int lowest) +{ + struct backref_node *upper; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_root *root; + struct extent_buffer *eb; + u32 blocksize; + u64 bytenr; + u64 generation; + int slot; + int ret; + int err = 0; + + BUG_ON(lowest && node->eb); + + path->lowest_level = node->level + 1; + rc->backref_cache.path[node->level] = node; + list_for_each_entry(edge, &node->upper, list[LOWER]) { + cond_resched(); + + upper = edge->node[UPPER]; + root = select_reloc_root(trans, rc, upper, edges); + BUG_ON(!root); + + if (upper->eb && !upper->locked) { + if (!lowest) { + ret = btrfs_bin_search(upper->eb, key, + upper->level, &slot); + BUG_ON(ret); + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (node->eb->start == bytenr) + goto next; + } + drop_node_buffer(upper); + } + + if (!upper->eb) { + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + + if (!upper->eb) { + upper->eb = path->nodes[upper->level]; + path->nodes[upper->level] = NULL; + } else { + BUG_ON(upper->eb != path->nodes[upper->level]); + } + + upper->locked = 1; + path->locks[upper->level] = 0; + + slot = path->slots[upper->level]; + btrfs_release_path(path); + } else { + ret = btrfs_bin_search(upper->eb, key, upper->level, + &slot); + BUG_ON(ret); + } + + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (lowest) { + BUG_ON(bytenr != node->bytenr); + } else { + if (node->eb->start == bytenr) + goto next; + } + + blocksize = root->nodesize; + generation = btrfs_node_ptr_generation(upper->eb, slot); + eb = read_tree_block(root, bytenr, generation); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + err = -EIO; + goto next; + } + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + + if (!node->eb) { + ret = btrfs_cow_block(trans, root, eb, upper->eb, + slot, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + if (ret < 0) { + err = ret; + goto next; + } + BUG_ON(node->eb != eb); + } else { + btrfs_set_node_blockptr(upper->eb, slot, + node->eb->start); + btrfs_set_node_ptr_generation(upper->eb, slot, + trans->transid); + btrfs_mark_buffer_dirty(upper->eb); + + ret = btrfs_inc_extent_ref(trans, root, + node->eb->start, blocksize, + upper->eb->start, + btrfs_header_owner(upper->eb), + node->level, 0, 1); + BUG_ON(ret); + + ret = btrfs_drop_subtree(trans, root, eb, upper->eb); + BUG_ON(ret); + } +next: + if (!upper->pending) + drop_node_buffer(upper); + else + unlock_node_buffer(upper); + if (err) + break; + } + + if (!err && node->pending) { + drop_node_buffer(node); + list_move_tail(&node->list, &rc->backref_cache.changed); + node->pending = 0; + } + + path->lowest_level = 0; + BUG_ON(err == -ENOSPC); + return err; +} + +static int link_to_upper(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_path *path) +{ + struct btrfs_key key; + + btrfs_node_key_to_cpu(node->eb, &key, 0); + return do_relocation(trans, rc, node, &key, path, 0); +} + +static int finish_pending_nodes(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_path *path, int err) +{ + LIST_HEAD(list); + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node; + int level; + int ret; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + while (!list_empty(&cache->pending[level])) { + node = list_entry(cache->pending[level].next, + struct backref_node, list); + list_move_tail(&node->list, &list); + BUG_ON(!node->pending); + + if (!err) { + ret = link_to_upper(trans, rc, node, path); + if (ret < 0) + err = ret; + } + } + list_splice_init(&list, &cache->pending[level]); + } + return err; +} + +static void mark_block_processed(struct reloc_control *rc, + u64 bytenr, u32 blocksize) +{ + set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, + EXTENT_DIRTY, GFP_NOFS); +} + +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node) +{ + u32 blocksize; + if (node->level == 0 || + in_block_group(node->bytenr, rc->block_group)) { + blocksize = rc->extent_root->nodesize; + mark_block_processed(rc, node->bytenr, blocksize); + } + node->processed = 1; +} + +/* + * mark a block and all blocks directly/indirectly reference the block + * as processed. + */ +static void update_processed_blocks(struct reloc_control *rc, + struct backref_node *node) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + while (next) { + cond_resched(); + while (1) { + if (next->processed) + break; + + __mark_block_processed(rc, next); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } +} + +static int tree_block_processed(u64 bytenr, struct reloc_control *rc) +{ + u32 blocksize = rc->extent_root->nodesize; + + if (test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; +} + +static int get_tree_block_key(struct reloc_control *rc, + struct tree_block *block) +{ + struct extent_buffer *eb; + + BUG_ON(block->key_ready); + eb = read_tree_block(rc->extent_root, block->bytenr, + block->key.offset); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + WARN_ON(btrfs_header_level(eb) != block->level); + if (block->level == 0) + btrfs_item_key_to_cpu(eb, &block->key, 0); + else + btrfs_node_key_to_cpu(eb, &block->key, 0); + free_extent_buffer(eb); + block->key_ready = 1; + return 0; +} + +/* + * helper function to relocate a tree block + */ +static int relocate_tree_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path) +{ + struct btrfs_root *root; + int ret = 0; + + if (!node) + return 0; + + BUG_ON(node->processed); + root = select_one_root(trans, node); + if (root == ERR_PTR(-ENOENT)) { + update_processed_blocks(rc, node); + goto out; + } + + if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + ret = reserve_metadata_space(trans, rc, node); + if (ret) + goto out; + } + + if (root) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + BUG_ON(node->new_bytenr); + BUG_ON(!list_empty(&node->list)); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + node->new_bytenr = root->node->start; + node->root = root; + list_add_tail(&node->list, &rc->backref_cache.changed); + } else { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(path); + if (ret > 0) + ret = 0; + } + if (!ret) + update_processed_blocks(rc, node); + } else { + ret = do_relocation(trans, rc, node, key, path, 1); + } +out: + if (ret || node->level == 0 || node->cowonly) + remove_backref_node(&rc->backref_cache, node); + return ret; +} + +/* + * relocate a list of blocks + */ +static noinline_for_stack +int relocate_tree_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct rb_root *blocks) +{ + struct backref_node *node; + struct btrfs_path *path; + struct tree_block *block; + struct rb_node *rb_node; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out_free_blocks; + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + readahead_tree_block(rc->extent_root, block->bytenr); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) { + err = get_tree_block_key(rc, block); + if (err) + goto out_free_path; + } + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + + node = build_backref_tree(rc, &block->key, + block->level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, &block->key, + path); + if (ret < 0) { + if (ret != -EAGAIN || rb_node == rb_first(blocks)) + err = ret; + goto out; + } + rb_node = rb_next(rb_node); + } +out: + err = finish_pending_nodes(trans, rc, path, err); + +out_free_path: + btrfs_free_path(path); +out_free_blocks: + free_block_list(blocks); + return err; +} + +static noinline_for_stack +int prealloc_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 alloc_hint = 0; + u64 start; + u64 end; + u64 offset = BTRFS_I(inode)->index_cnt; + u64 num_bytes; + int nr = 0; + int ret = 0; + + BUG_ON(cluster->start != cluster->boundary[0]); + mutex_lock(&inode->i_mutex); + + ret = btrfs_check_data_free_space(inode, cluster->end + + 1 - cluster->start, 0); + if (ret) + goto out; + + while (nr < cluster->nr) { + start = cluster->boundary[nr] - offset; + if (nr + 1 < cluster->nr) + end = cluster->boundary[nr + 1] - 1 - offset; + else + end = cluster->end - offset; + + lock_extent(&BTRFS_I(inode)->io_tree, start, end); + num_bytes = end + 1 - start; + ret = btrfs_prealloc_file_range(inode, 0, start, + num_bytes, num_bytes, + end + 1, &alloc_hint); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + if (ret) + break; + nr++; + } + btrfs_free_reserved_data_space(inode, cluster->end + + 1 - cluster->start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static noinline_for_stack +int setup_extent_mapping(struct inode *inode, u64 start, u64 end, + u64 block_start) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret = 0; + + em = alloc_extent_map(); + if (!em) + return -ENOMEM; + + em->start = start; + em->len = end + 1 - start; + em->block_len = em->len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + lock_extent(&BTRFS_I(inode)->io_tree, start, end); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 page_start; + u64 page_end; + u64 offset = BTRFS_I(inode)->index_cnt; + unsigned long index; + unsigned long last_index; + struct page *page; + struct file_ra_state *ra; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int nr = 0; + int ret = 0; + + if (!cluster->nr) + return 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + ret = prealloc_file_extent_cluster(inode, cluster); + if (ret) + goto out; + + file_ra_state_init(ra, inode->i_mapping); + + ret = setup_extent_mapping(inode, cluster->start - offset, + cluster->end - offset, cluster->start); + if (ret) + goto out; + + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + while (index <= last_index) { + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + page = find_lock_page(inode->i_mapping, index); + if (!page) { + page_cache_sync_readahead(inode->i_mapping, + ra, NULL, index, + last_index + 1 - index); + page = find_or_create_page(inode->i_mapping, index, + mask); + if (!page) { + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); + ret = -ENOMEM; + goto out; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, + ra, NULL, page, index, + last_index + 1 - index); + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); + ret = -EIO; + goto out; + } + } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); + + set_page_extent_mapped(page); + + if (nr < cluster->nr && + page_start + offset == cluster->boundary[nr]) { + set_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + nr++; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); + set_page_dirty(page); + + unlock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end); + unlock_page(page); + page_cache_release(page); + + index++; + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(BTRFS_I(inode)->root); + } + WARN_ON(nr != cluster->nr); +out: + kfree(ra); + return ret; +} + +static noinline_for_stack +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) +{ + int ret; + + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + + if (!cluster->nr) + cluster->start = extent_key->objectid; + else + BUG_ON(cluster->nr >= MAX_EXTENTS); + cluster->end = extent_key->objectid + extent_key->offset - 1; + cluster->boundary[cluster->nr] = extent_key->objectid; + cluster->nr++; + + if (cluster->nr >= MAX_EXTENTS) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + return 0; +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int get_ref_objectid_v0(struct reloc_control *rc, + struct btrfs_path *path, + struct btrfs_key *extent_key, + u64 *ref_objectid, int *path_change) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref_v0 *ref0; + int ret; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + while (1) { + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (path_change) + *path_change = 1; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != extent_key->objectid) + return -ENOENT; + + if (key.type != BTRFS_EXTENT_REF_V0_KEY) { + slot++; + continue; + } + ref0 = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_ref_v0); + *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + return 0; +} +#endif + +/* + * helper to add a tree block to the list. + * the major work is getting the generation and level of the block + */ +static int add_tree_block(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + struct tree_block *block; + struct rb_node *rb_node; + u32 item_size; + int level = -1; + u64 generation; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (extent_key->type == BTRFS_METADATA_ITEM_KEY || + item_size >= sizeof(*ei) + sizeof(*bi)) { + ei = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_item); + if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + level = btrfs_tree_block_level(eb, bi); + } else { + level = (int)extent_key->offset; + } + generation = btrfs_extent_generation(eb, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int ret; + + BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, extent_key, + &ref_owner, NULL); + if (ret < 0) + return ret; + BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); + level = (int)ref_owner; + /* FIXME: get real generation */ + generation = 0; +#else + BUG(); +#endif + } + + btrfs_release_path(path); + + BUG_ON(level == -1); + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) + return -ENOMEM; + + block->bytenr = extent_key->objectid; + block->key.objectid = rc->extent_root->nodesize; + block->key.offset = generation; + block->level = level; + block->key_ready = 0; + + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, block->bytenr); + + return 0; +} + +/* + * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY + */ +static int __add_tree_block(struct reloc_control *rc, + u64 bytenr, u32 blocksize, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, + SKINNY_METADATA); + + if (tree_block_processed(bytenr, rc)) + return 0; + + if (tree_search(blocks, bytenr)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: + key.objectid = bytenr; + if (skinny) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + } + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret > 0 && skinny) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + (key.type == BTRFS_METADATA_ITEM_KEY || + (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == blocksize))) + ret = 0; + } + + if (ret) { + skinny = false; + btrfs_release_path(path); + goto again; + } + } + BUG_ON(ret); + + ret = add_tree_block(rc, &key, path, blocks); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to check if the block use full backrefs for pointers in it + */ +static int block_use_full_backref(struct reloc_control *rc, + struct extent_buffer *eb) +{ + u64 flags; + int ret; + + if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || + btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) + return 1; + + ret = btrfs_lookup_extent_info(NULL, rc->extent_root, + eb->start, btrfs_header_level(eb), 1, + NULL, &flags); + BUG_ON(ret); + + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = 1; + else + ret = 0; + return ret; +} + +static int delete_block_group_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct inode *inode, + u64 ino) +{ + struct btrfs_key key; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (inode) + goto truncate; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode) || is_bad_inode(inode)) { + if (!IS_ERR(inode)) + iput(inode); + return -ENOENT; + } + +truncate: + ret = btrfs_check_trunc_cache_free_space(root, + &fs_info->global_block_rsv); + if (ret) + goto out; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); +out: + iput(inode); + return ret; +} + +/* + * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY + * this function scans fs tree to find blocks reference the data extent + */ +static int find_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct tree_block *block; + struct btrfs_root *root; + struct btrfs_file_extent_item *fi; + struct rb_node *rb_node; + struct btrfs_key key; + u64 ref_root; + u64 ref_objectid; + u64 ref_offset; + u32 ref_count; + u32 nritems; + int err = 0; + int added = 0; + int counted; + int ret; + + ref_root = btrfs_extent_data_ref_root(leaf, ref); + ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); + ref_offset = btrfs_extent_data_ref_offset(leaf, ref); + ref_count = btrfs_extent_data_ref_count(leaf, ref); + + /* + * This is an extent belonging to the free space cache, lets just delete + * it and redo the search. + */ + if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { + ret = delete_block_group_cache(rc->extent_root->fs_info, + rc->block_group, + NULL, ref_objectid); + if (ret != -ENOENT) + return ret; + ret = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + root = read_fs_root(rc->extent_root->fs_info, ref_root); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + key.objectid = ref_objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + if (ref_offset > ((u64)-1 << 32)) + key.offset = 0; + else + key.offset = ref_offset; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + /* + * the references in tree blocks that use full backrefs + * are not counted in + */ + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + + while (ref_count > 0) { + while (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (WARN_ON(ret > 0)) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + added = 0; + + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (WARN_ON(key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY)) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + goto next; + + if (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid) + goto next; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + if (key.offset != ref_offset) + goto next; + + if (counted) + ref_count--; + if (added) + goto next; + + if (!tree_block_processed(leaf->start, rc)) { + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = leaf->start; + btrfs_item_key_to_cpu(leaf, &block->key, 0); + block->level = 0; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, + &block->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + block->bytenr); + } + if (counted) + added = 1; + else + path->slots[0] = nritems; +next: + path->slots[0]++; + + } +out: + btrfs_free_path(path); + return err; +} + +/* + * helper to find all tree blocks that reference a given data extent + */ +static noinline_for_stack +int add_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_data_ref *dref; + struct btrfs_extent_inline_ref *iref; + unsigned long ptr; + unsigned long end; + u32 blocksize = rc->extent_root->nodesize; + int ret = 0; + int err = 0; + + eb = path->nodes[0]; + ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + end = ptr + btrfs_item_size_nr(eb, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ptr + sizeof(struct btrfs_extent_item_v0) == end) + ptr = end; + else +#endif + ptr += sizeof(struct btrfs_extent_item); + + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + BUG(); + } + if (ret) { + err = ret; + goto out; + } + ptr += btrfs_extent_inline_ref_size(key.type); + } + WARN_ON(ptr > end); + + while (1) { + cond_resched(); + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) { + err = ret; + break; + } + if (ret > 0) + break; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_key->objectid) + break; + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_DATA_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { +#endif + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + ret = 0; + } + if (ret) { + err = ret; + break; + } + path->slots[0]++; + } +out: + btrfs_release_path(path); + if (err) + free_block_list(blocks); + return err; +} + +/* + * helper to find next unprocessed extent + */ +static noinline_for_stack +int find_next_extent(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u64 start, end, last; + int ret; + + last = rc->block_group->key.objectid + rc->block_group->key.offset; + while (1) { + cond_resched(); + if (rc->search_start >= last) { + ret = 1; + break; + } + + key.objectid = rc->search_start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, + 0, 0); + if (ret < 0) + break; +next: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid >= last) { + ret = 1; + break; + } + + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) { + path->slots[0]++; + goto next; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.objectid + key.offset <= rc->search_start) { + path->slots[0]++; + goto next; + } + + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.objectid + rc->extent_root->nodesize <= + rc->search_start) { + path->slots[0]++; + goto next; + } + + ret = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); + + if (ret == 0 && start <= key.objectid) { + btrfs_release_path(path); + rc->search_start = end + 1; + } else { + if (key.type == BTRFS_EXTENT_ITEM_KEY) + rc->search_start = key.objectid + key.offset; + else + rc->search_start = key.objectid + + rc->extent_root->nodesize; + memcpy(extent_key, &key, sizeof(key)); + return 0; + } + } + btrfs_release_path(path); + return ret; +} + +static void set_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + + mutex_lock(&fs_info->reloc_mutex); + fs_info->reloc_ctl = rc; + mutex_unlock(&fs_info->reloc_mutex); +} + +static void unset_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + + mutex_lock(&fs_info->reloc_mutex); + fs_info->reloc_ctl = NULL; + mutex_unlock(&fs_info->reloc_mutex); +} + +static int check_extent_flags(u64 flags) +{ + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if (!(flags & BTRFS_EXTENT_FLAG_DATA) && + !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + return 1; + return 0; +} + +static noinline_for_stack +int prepare_to_relocate(struct reloc_control *rc) +{ + struct btrfs_trans_handle *trans; + + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, + BTRFS_BLOCK_RSV_TEMP); + if (!rc->block_rsv) + return -ENOMEM; + + memset(&rc->cluster, 0, sizeof(rc->cluster)); + rc->search_start = rc->block_group->key.objectid; + rc->extents_found = 0; + rc->nodes_relocated = 0; + rc->merging_rsv_size = 0; + rc->reserved_bytes = 0; + rc->block_rsv->size = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + + rc->create_reloc_tree = 1; + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + /* + * extent tree is not a ref_cow tree and has no reloc_root to + * cleanup. And callers are responsible to free the above + * block rsv. + */ + return PTR_ERR(trans); + } + btrfs_commit_transaction(trans, rc->extent_root); + return 0; +} + +static noinline_for_stack int relocate_block_group(struct reloc_control *rc) +{ + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + u64 flags; + u32 item_size; + int ret; + int err = 0; + int progress = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + ret = prepare_to_relocate(rc); + if (ret) { + err = ret; + goto out_free; + } + + while (1) { + rc->reserved_bytes = 0; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + err = ret; + break; + } + progress++; + trans = btrfs_start_transaction(rc->extent_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + break; + } +restart: + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans, rc->extent_root); + continue; + } + + ret = find_next_extent(trans, rc, path, &key); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rc->extents_found++; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + if (item_size >= sizeof(*ei)) { + flags = btrfs_extent_flags(path->nodes[0], ei); + ret = check_extent_flags(flags); + BUG_ON(ret); + + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int path_change = 0; + + BUG_ON(item_size != + sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, + &path_change); + if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) + flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else + flags = BTRFS_EXTENT_FLAG_DATA; + + if (path_change) { + btrfs_release_path(path); + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + } +#else + BUG(); +#endif + } + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = add_tree_block(rc, &key, path, &blocks); + } else if (rc->stage == UPDATE_DATA_PTRS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + ret = add_data_references(rc, &key, path, &blocks); + } else { + btrfs_release_path(path); + ret = 0; + } + if (ret < 0) { + err = ret; + break; + } + + if (!RB_EMPTY_ROOT(&blocks)) { + ret = relocate_tree_blocks(trans, rc, &blocks); + if (ret < 0) { + /* + * if we fail to relocate tree blocks, force to update + * backref cache when committing transaction. + */ + rc->backref_cache.last_trans = trans->transid - 1; + + if (ret != -EAGAIN) { + err = ret; + break; + } + rc->extents_found--; + rc->search_start = key.objectid; + } + } + + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root); + trans = NULL; + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; + ret = relocate_data_extent(rc->data_inode, + &key, &rc->cluster); + if (ret < 0) { + err = ret; + break; + } + } + } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } + + btrfs_release_path(path); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); + + if (trans) { + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root); + } + + if (!err) { + ret = relocate_file_extent_cluster(rc->data_inode, + &rc->cluster); + if (ret < 0) + err = ret; + } + + rc->create_reloc_tree = 0; + set_reloc_control(rc); + + backref_cache_cleanup(&rc->backref_cache); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); + + err = prepare_to_merge(rc, err); + + merge_reloc_roots(rc); + + rc->merge_reloc_tree = 0; + unset_reloc_control(rc); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); + + /* get rid of pinned extents */ + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: + btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); + btrfs_free_path(path); + return err; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to create inode for data relocation. + * the inode is in data relocation tree and its link count is 0 + */ +static noinline_for_stack +struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key key; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return ERR_CAST(trans); + + err = btrfs_find_free_objectid(root, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid); + BUG_ON(err); + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) +{ + struct reloc_control *rc; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return NULL; + + INIT_LIST_HEAD(&rc->reloc_roots); + backref_cache_init(&rc->backref_cache); + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, + fs_info->btree_inode->i_mapping); + return rc; +} + +/* + * function to relocate all extents in a block group. + */ +int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct reloc_control *rc; + struct inode *inode; + struct btrfs_path *path; + int ret; + int rw = 0; + int err = 0; + + rc = alloc_reloc_control(fs_info); + if (!rc) + return -ENOMEM; + + rc->extent_root = extent_root; + + rc->block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!rc->block_group); + + if (!rc->block_group->ro) { + ret = btrfs_set_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; + } + rw = 1; + } + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, + path); + btrfs_free_path(path); + + if (!IS_ERR(inode)) + ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); + else + ret = PTR_ERR(inode); + + if (ret && ret != -ENOENT) { + err = ret; + goto out; + } + + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } + + btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", + rc->block_group->key.objectid, rc->block_group->flags); + + ret = btrfs_start_delalloc_roots(fs_info, 0, -1); + if (ret < 0) { + err = ret; + goto out; + } + btrfs_wait_ordered_roots(fs_info, -1); + + while (1) { + mutex_lock(&fs_info->cleaner_mutex); + ret = relocate_block_group(rc); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + err = ret; + goto out; + } + + if (rc->extents_found == 0) + break; + + btrfs_info(extent_root->fs_info, "found %llu extents", + rc->extents_found); + + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + ret = btrfs_wait_ordered_range(rc->data_inode, 0, + (u64)-1); + if (ret) { + err = ret; + goto out; + } + invalidate_mapping_pages(rc->data_inode->i_mapping, + 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } + } + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); +out: + if (err && rw) + btrfs_set_block_group_rw(extent_root, rc->block_group); + iput(rc->data_inode); + btrfs_put_block_group(rc->block_group); + kfree(rc); + return err; +} + +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret, err; + + trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + memset(&root->root_item.drop_progress, 0, + sizeof(root->root_item.drop_progress)); + root->root_item.drop_level = 0; + btrfs_set_root_refs(&root->root_item, 0); + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + + err = btrfs_end_transaction(trans, root->fs_info->tree_root); + if (err) + return err; + return ret; +} + +/* + * recover relocation interrupted by system crash. + * + * this function resumes merging reloc trees with corresponding fs trees. + * this is important for keeping the sharing of tree blocks + */ +int btrfs_recover_relocation(struct btrfs_root *root) +{ + LIST_HEAD(reloc_roots); + struct btrfs_key key; + struct btrfs_root *fs_root; + struct btrfs_root *reloc_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct reloc_control *rc = NULL; + struct btrfs_trans_handle *trans; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = -1; + + key.objectid = BTRFS_TREE_RELOC_OBJECTID; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, + path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + + if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || + key.type != BTRFS_ROOT_ITEM_KEY) + break; + + reloc_root = btrfs_read_fs_root(root, &key); + if (IS_ERR(reloc_root)) { + err = PTR_ERR(reloc_root); + goto out; + } + + list_add(&reloc_root->root_list, &reloc_roots); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + if (ret != -ENOENT) { + err = ret; + goto out; + } + ret = mark_garbage_root(reloc_root); + if (ret < 0) { + err = ret; + goto out; + } + } + } + + if (key.offset == 0) + break; + + key.offset--; + } + btrfs_release_path(path); + + if (list_empty(&reloc_roots)) + goto out; + + rc = alloc_reloc_control(root->fs_info); + if (!rc) { + err = -ENOMEM; + goto out; + } + + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + err = PTR_ERR(trans); + goto out_free; + } + + rc->merge_reloc_tree = 1; + + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + + if (btrfs_root_refs(&reloc_root->root_item) == 0) { + list_add_tail(&reloc_root->root_list, + &rc->reloc_roots); + continue; + } + + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + goto out_free; + } + + err = __add_reloc_root(reloc_root); + BUG_ON(err < 0); /* -ENOMEM or logic error */ + fs_root->reloc_root = reloc_root; + } + + err = btrfs_commit_transaction(trans, rc->extent_root); + if (err) + goto out_free; + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + err = btrfs_commit_transaction(trans, rc->extent_root); +out_free: + kfree(rc); +out: + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + + btrfs_free_path(path); + + if (err == 0) { + /* cleanup orphan inode in data relocation tree */ + fs_root = read_fs_root(root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(fs_root)) + err = PTR_ERR(fs_root); + else + err = btrfs_orphan_cleanup(fs_root); + } + return err; +} + +/* + * helper to add ordered checksum for data relocation. + * + * cloning checksum properly handles the nodatasum extents. + * it also saves CPU time to re-calculate the checksum. + */ +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + u64 disk_bytenr; + u64 new_bytenr; + LIST_HEAD(list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list, 0); + if (ret) + goto out; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + /* + * We need to offset the new_bytenr based on where the csum is. + * We need to do this because we will read in entire prealloc + * extents but we may have written to say the middle of the + * prealloc extent, so we need to make sure the csum goes with + * the right disk offset. + * + * We can do this because the data reloc inode refers strictly + * to the on disk bytes, so we don't have to worry about + * disk_len vs real len like with real inodes since it's all + * disk length. + */ + new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); + sums->bytenr = new_bytenr; + + btrfs_add_ordered_sum(inode, ordered, sums); + } +out: + btrfs_put_ordered_extent(ordered); + return ret; +} + +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) +{ + struct reloc_control *rc; + struct backref_node *node; + int first_cow = 0; + int level; + int ret = 0; + + rc = root->fs_info->reloc_ctl; + if (!rc) + return 0; + + BUG_ON(rc->stage == UPDATE_DATA_PTRS && + root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (buf == root->node) + __update_reloc_root(root, cow->start); + } + + level = btrfs_header_level(buf); + if (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item)) + first_cow = 1; + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && + rc->create_reloc_tree) { + WARN_ON(!first_cow && level == 0); + + node = rc->backref_cache.path[level]; + BUG_ON(node->bytenr != buf->start && + node->new_bytenr != buf->start); + + drop_node_buffer(node); + extent_buffer_get(cow); + node->eb = cow; + node->new_bytenr = cow->start; + + if (!node->pending) { + list_move_tail(&node->list, + &rc->backref_cache.pending[level]); + node->pending = 1; + } + + if (first_cow) + __mark_block_processed(rc, node); + + if (first_cow && level > 0) + rc->nodes_relocated += buf->len; + } + + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) + ret = replace_file_extents(trans, rc, root, cow); + return ret; +} + +/* + * called before creating snapshot. it calculates metadata reservation + * requried for relocating tree blocks in the snapshot + */ +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct reloc_control *rc; + + root = pending->root; + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + if (!rc->merge_reloc_tree) + return; + + root = root->reloc_root; + BUG_ON(btrfs_root_refs(&root->root_item) == 0); + /* + * relocation is in the stage of merging trees. the space + * used by merging a reloc tree is twice the size of + * relocated tree nodes in the worst case. half for cowing + * the reloc tree, half for cowing the fs tree. the space + * used by cowing the reloc tree will be freed after the + * tree is dropped. if we create snapshot, cowing the fs + * tree may use more space than it frees. so we need + * reserve extra space. + */ + *bytes_to_reserve += rc->nodes_relocated; +} + +/* + * called after snapshot is created. migrate block reservation + * and create reloc root for the newly created snapshot + */ +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *reloc_root; + struct btrfs_root *new_root; + struct reloc_control *rc; + int ret; + + if (!root->reloc_root) + return 0; + + rc = root->fs_info->reloc_ctl; + rc->merging_rsv_size += rc->nodes_relocated; + + if (rc->merge_reloc_tree) { + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + rc->block_rsv, + rc->nodes_relocated); + if (ret) + return ret; + } + + new_root = pending->snap; + reloc_root = create_reloc_root(trans, root->reloc_root, + new_root->root_key.objectid); + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); + + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); + new_root->reloc_root = reloc_root; + + if (rc->create_reloc_tree) + ret = clone_backref_node(trans, rc, root, reloc_root); + return ret; +} diff --git a/kernel/fs/btrfs/root-tree.c b/kernel/fs/btrfs/root-tree.c new file mode 100644 index 000000000..360a728a6 --- /dev/null +++ b/kernel/fs/btrfs/root-tree.c @@ -0,0 +1,497 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * Read a root item from the tree. In case we detect a root item smaller then + * sizeof(root_item), we know it's an old version of the root structure and + * initialize all new fields to zero. The same happens if we detect mismatching + * generation numbers as then we know the root was once mounted with an older + * kernel that was not aware of the root item structure change. + */ +static void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item) +{ + uuid_le uuid; + int len; + int need_reset = 0; + + len = btrfs_item_size_nr(eb, slot); + read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), + min_t(int, len, (int)sizeof(*item))); + if (len < sizeof(*item)) + need_reset = 1; + if (!need_reset && btrfs_root_generation(item) + != btrfs_root_generation_v2(item)) { + if (btrfs_root_generation_v2(item) != 0) { + printk(KERN_WARNING "BTRFS: mismatching " + "generation and generation_v2 " + "found in root item. This root " + "was probably mounted with an " + "older kernel. Resetting all " + "new fields.\n"); + } + need_reset = 1; + } + if (need_reset) { + memset(&item->generation_v2, 0, + sizeof(*item) - offsetof(struct btrfs_root_item, + generation_v2)); + + uuid_le_gen(&uuid); + memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE); + } +} + +/* + * btrfs_find_root - lookup the root by the key. + * root: the root of the root tree + * search_key: the key to search + * path: the path we search + * root_item: the root item of the tree we look for + * root_key: the reak key of the tree we look for + * + * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * of the search key, just lookup the root with the highest offset for a + * given objectid. + * + * If we find something return 0, otherwise > 0, < 0 on error. + */ +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key) +{ + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0); + if (ret < 0) + return ret; + + if (search_key->offset != -1ULL) { /* the search key is exact */ + if (ret > 0) + goto out; + } else { + BUG_ON(ret == 0); /* Logical error */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + ret = 0; + } + + l = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != search_key->objectid || + found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } + + if (root_item) + btrfs_read_root_item(l, slot, root_item); + if (root_key) + memcpy(root_key, &found_key, sizeof(found_key)); +out: + btrfs_release_path(path); + return ret; +} + +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) +{ + btrfs_set_root_bytenr(item, node->start); + btrfs_set_root_level(item, btrfs_header_level(node)); + btrfs_set_root_generation(item, btrfs_header_generation(node)); +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + int old_len; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu", + key->objectid, key->type, key->offset); + BUG_ON(1); + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + old_len = btrfs_item_size_nr(l, slot); + + /* + * If this is the first time we update the root item which originated + * from an older kernel, we need to enlarge the item size to make room + * for the added fields. + */ + if (old_len < sizeof(*item)) { + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, key, path, + -1, 1); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, + key, sizeof(*item)); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + } + + /* + * Update generation_v2 so at the next mount we know the new root + * fields are valid. + */ + btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); + + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item) +{ + /* + * Make sure generation v1 and v2 match. See update_root for details. + */ + btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); + return btrfs_insert_item(trans, root, key, item, sizeof(*item)); +} + +int btrfs_find_orphan_roots(struct btrfs_root *tree_root) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key root_key; + struct btrfs_root *root; + int err = 0; + int ret; + bool can_recover = true; + + if (tree_root->fs_info->sb->s_flags & MS_RDONLY) + can_recover = false; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = 0; + + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(tree_root, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + + if (key.objectid != BTRFS_ORPHAN_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + root_key.objectid = key.offset; + key.offset++; + + root = btrfs_read_fs_root(tree_root, &root_key); + err = PTR_ERR_OR_ZERO(root); + if (err && err != -ENOENT) { + break; + } else if (err == -ENOENT) { + struct btrfs_trans_handle *trans; + + btrfs_release_path(path); + + trans = btrfs_join_transaction(tree_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_error(tree_root->fs_info, err, + "Failed to start trans to delete " + "orphan item"); + break; + } + err = btrfs_del_orphan_item(trans, tree_root, + root_key.objectid); + btrfs_end_transaction(trans, tree_root); + if (err) { + btrfs_error(tree_root->fs_info, err, + "Failed to delete root orphan " + "item"); + break; + } + continue; + } + + err = btrfs_init_fs_root(root); + if (err) { + btrfs_free_fs_root(root); + break; + } + + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); + + err = btrfs_insert_fs_root(root->fs_info, root); + if (err) { + BUG_ON(err == -EEXIST); + btrfs_free_fs_root(root); + break; + } + + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); + } + + btrfs_free_path(path); + return err; +} + +/* drop the root item for 'key' from 'root' */ +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +{ + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len) + +{ + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + BUG_ON(ret < 0); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + + WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); + WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); + ptr = (unsigned long)(ref + 1); + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + *sequence = btrfs_root_ref_sequence(leaf, ref); + + ret = btrfs_del_item(trans, tree_root, path); + if (ret) { + err = ret; + goto out; + } + } else + err = -ENOENT; + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + +out: + btrfs_free_path(path); + return err; +} + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. + * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. + * + * Will return 0, -ENOMEM, or anything from the CoW path + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name_len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + + btrfs_free_path(path); + return 0; +} + +/* + * Old btrfs forgets to init root_item->flags and root_item->byte_limit + * for subvolumes. To work around this problem, we steal a bit from + * root_item->inode_item->flags, and use it to indicate if those fields + * have been properly initialized. + */ +void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) +{ + u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode); + + if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { + inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; + btrfs_set_stack_inode_flags(&root_item->inode, inode_flags); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); + } +} + +void btrfs_update_root_times(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root_item *item = &root->root_item; + struct timespec ct = CURRENT_TIME; + + spin_lock(&root->root_item_lock); + btrfs_set_root_ctransid(item, trans->transid); + btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); + btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); + spin_unlock(&root->root_item_lock); +} diff --git a/kernel/fs/btrfs/scrub.c b/kernel/fs/btrfs/scrub.c new file mode 100644 index 000000000..ab5811545 --- /dev/null +++ b/kernel/fs/btrfs/scrub.c @@ -0,0 +1,4228 @@ +/* + * Copyright (C) 2011, 2012 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "ordered-data.h" +#include "transaction.h" +#include "backref.h" +#include "extent_io.h" +#include "dev-replace.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "raid56.h" + +/* + * This is only the first step towards a full-features scrub. It reads all + * extent and super block and verifies the checksums. In case a bad checksum + * is found or the extent cannot be read, good data will be written back if + * any can be found. + * + * Future enhancements: + * - In case an unrepairable extent is encountered, track which files are + * affected and report them + * - track and record media errors, throw out bad devices + * - add a mode to also read unallocated space + */ + +struct scrub_block; +struct scrub_ctx; + +/* + * the following three values only influence the performance. + * The last one configures the number of parallel and outstanding I/O + * operations. The first two values configure an upper limit for the number + * of (dynamically allocated) pages that are added to a bio. + */ +#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ +#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ + +/* + * the following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + * Values larger than BTRFS_STRIPE_LEN are not supported. + */ +#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ + +struct scrub_recover { + atomic_t refs; + struct btrfs_bio *bbio; + u64 map_length; +}; + +struct scrub_page { + struct scrub_block *sblock; + struct page *page; + struct btrfs_device *dev; + struct list_head list; + u64 flags; /* extent flags */ + u64 generation; + u64 logical; + u64 physical; + u64 physical_for_dev_replace; + atomic_t refs; + struct { + unsigned int mirror_num:8; + unsigned int have_csum:1; + unsigned int io_error:1; + }; + u8 csum[BTRFS_CSUM_SIZE]; + + struct scrub_recover *recover; +}; + +struct scrub_bio { + int index; + struct scrub_ctx *sctx; + struct btrfs_device *dev; + struct bio *bio; + int err; + u64 logical; + u64 physical; +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; +#else + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; +#endif + int page_count; + int next_free; + struct btrfs_work work; +}; + +struct scrub_block { + struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + int page_count; + atomic_t outstanding_pages; + atomic_t refs; /* free mem on transition to zero */ + struct scrub_ctx *sctx; + struct scrub_parity *sparity; + struct { + unsigned int header_error:1; + unsigned int checksum_error:1; + unsigned int no_io_error_seen:1; + unsigned int generation_error:1; /* also sets header_error */ + + /* The following is for the data used to check parity */ + /* It is for the data with checksum */ + unsigned int data_corrected:1; + }; +}; + +/* Used for the chunks with parity stripe such RAID5/6 */ +struct scrub_parity { + struct scrub_ctx *sctx; + + struct btrfs_device *scrub_dev; + + u64 logic_start; + + u64 logic_end; + + int nsectors; + + int stripe_len; + + atomic_t refs; + + struct list_head spages; + + /* Work of parity check and repair */ + struct btrfs_work work; + + /* Mark the parity blocks which have data */ + unsigned long *dbitmap; + + /* + * Mark the parity blocks which have data, but errors happen when + * read data or check data + */ + unsigned long *ebitmap; + + unsigned long bitmap[0]; +}; + +struct scrub_wr_ctx { + struct scrub_bio *wr_curr_bio; + struct btrfs_device *tgtdev; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct mutex wr_lock; +}; + +struct scrub_ctx { + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; + struct btrfs_root *dev_root; + int first_free; + int curr; + atomic_t bios_in_flight; + atomic_t workers_pending; + spinlock_t list_lock; + wait_queue_head_t list_wait; + u16 csum_size; + struct list_head csum_list; + atomic_t cancel_req; + int readonly; + int pages_per_rd_bio; + u32 sectorsize; + u32 nodesize; + + int is_dev_replace; + struct scrub_wr_ctx wr_ctx; + + /* + * statistics + */ + struct btrfs_scrub_progress stat; + spinlock_t stat_lock; + + /* + * Use a ref counter to avoid use-after-free issues. Scrub workers + * decrement bios_in_flight and workers_pending and then do a wakeup + * on the list_wait wait queue. We must ensure the main scrub task + * doesn't free the scrub context before or while the workers are + * doing the wakeup() call. + */ + atomic_t refs; +}; + +struct scrub_fixup_nodatasum { + struct scrub_ctx *sctx; + struct btrfs_device *dev; + u64 logical; + struct btrfs_root *root; + struct btrfs_work work; + int mirror_num; +}; + +struct scrub_nocow_inode { + u64 inum; + u64 offset; + u64 root; + struct list_head list; +}; + +struct scrub_copy_nocow_ctx { + struct scrub_ctx *sctx; + u64 logical; + u64 len; + int mirror_num; + u64 physical_for_dev_replace; + struct list_head inodes; + struct btrfs_work work; +}; + +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + const char *errstr; + sector_t sector; + u64 logical; + struct btrfs_device *dev; +}; + +static void scrub_pending_bio_inc(struct scrub_ctx *sctx); +static void scrub_pending_bio_dec(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, + struct scrub_block *sblocks_for_recheck); +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size, int retry_failed_mirror); +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size); +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good); +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write); +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num); +static int scrub_checksum_data(struct scrub_block *sblock); +static int scrub_checksum_tree_block(struct scrub_block *sblock); +static int scrub_checksum_super(struct scrub_block *sblock); +static void scrub_block_get(struct scrub_block *sblock); +static void scrub_block_put(struct scrub_block *sblock); +static void scrub_page_get(struct scrub_page *spage); +static void scrub_page_put(struct scrub_page *spage); +static void scrub_parity_get(struct scrub_parity *sparity); +static void scrub_parity_put(struct scrub_parity *sparity); +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace); +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace); +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static void scrub_wr_submit(struct scrub_ctx *sctx); +static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page); +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + struct scrub_copy_nocow_ctx *ctx); +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace); +static void copy_nocow_pages_worker(struct btrfs_work *work); +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_put_ctx(struct scrub_ctx *sctx); + + +static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +{ + atomic_inc(&sctx->refs); + atomic_inc(&sctx->bios_in_flight); +} + +static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +{ + atomic_dec(&sctx->bios_in_flight); + wake_up(&sctx->list_wait); + scrub_put_ctx(sctx); +} + +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } +} + +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + wake_up(&fs_info->scrub_pause_wait); +} + +/* + * used for workers that require transaction commits (i.e., for the + * NOCOW case) + */ +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + atomic_inc(&sctx->refs); + /* + * increment scrubs_running to prevent cancel requests from + * completing as long as a worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. + */ + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + /* + * check if @scrubs_running=@scrubs_paused condition + * inside wait_event() is not an atomic operation. + * which means we may inc/dec @scrub_running/paused + * at any time. Let's wake up @scrub_pause_wait as + * much as we can to let commit transaction blocked less. + */ + wake_up(&fs_info->scrub_pause_wait); + + atomic_inc(&sctx->workers_pending); +} + +/* used for workers that require transaction commits */ +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * see scrub_pending_trans_workers_inc() why we're pretending + * to be paused in the scrub counters + */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sctx->workers_pending); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sctx->list_wait); + scrub_put_ctx(sctx); +} + +static void scrub_free_csums(struct scrub_ctx *sctx) +{ + while (!list_empty(&sctx->csum_list)) { + struct btrfs_ordered_sum *sum; + sum = list_first_entry(&sctx->csum_list, + struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } +} + +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) +{ + int i; + + if (!sctx) + return; + + scrub_free_wr_ctx(&sctx->wr_ctx); + + /* this can happen when scrub is cancelled */ + if (sctx->curr != -1) { + struct scrub_bio *sbio = sctx->bios[sctx->curr]; + + for (i = 0; i < sbio->page_count; i++) { + WARN_ON(!sbio->pagev[i]->page); + scrub_block_put(sbio->pagev[i]->sblock); + } + bio_put(sbio->bio); + } + + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { + struct scrub_bio *sbio = sctx->bios[i]; + + if (!sbio) + break; + kfree(sbio); + } + + scrub_free_csums(sctx); + kfree(sctx); +} + +static void scrub_put_ctx(struct scrub_ctx *sctx) +{ + if (atomic_dec_and_test(&sctx->refs)) + scrub_free_ctx(sctx); +} + +static noinline_for_stack +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) +{ + struct scrub_ctx *sctx; + int i; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + int pages_per_rd_bio; + int ret; + + /* + * the setting of pages_per_rd_bio is correct for scrub but might + * be wrong for the dev_replace code where we might read from + * different devices in the initial huge bios. However, that + * code is able to correctly handle the case when adding a page + * to a bio fails. + */ + if (dev->bdev) + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, + bio_get_nr_vecs(dev->bdev)); + else + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; + sctx = kzalloc(sizeof(*sctx), GFP_NOFS); + if (!sctx) + goto nomem; + atomic_set(&sctx->refs, 1); + sctx->is_dev_replace = is_dev_replace; + sctx->pages_per_rd_bio = pages_per_rd_bio; + sctx->curr = -1; + sctx->dev_root = dev->dev_root; + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { + struct scrub_bio *sbio; + + sbio = kzalloc(sizeof(*sbio), GFP_NOFS); + if (!sbio) + goto nomem; + sctx->bios[i] = sbio; + + sbio->index = i; + sbio->sctx = sctx; + sbio->page_count = 0; + btrfs_init_work(&sbio->work, btrfs_scrub_helper, + scrub_bio_end_io_worker, NULL, NULL); + + if (i != SCRUB_BIOS_PER_SCTX - 1) + sctx->bios[i]->next_free = i + 1; + else + sctx->bios[i]->next_free = -1; + } + sctx->first_free = 0; + sctx->nodesize = dev->dev_root->nodesize; + sctx->sectorsize = dev->dev_root->sectorsize; + atomic_set(&sctx->bios_in_flight, 0); + atomic_set(&sctx->workers_pending, 0); + atomic_set(&sctx->cancel_req, 0); + sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); + INIT_LIST_HEAD(&sctx->csum_list); + + spin_lock_init(&sctx->list_lock); + spin_lock_init(&sctx->stat_lock); + init_waitqueue_head(&sctx->list_wait); + + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, + fs_info->dev_replace.tgtdev, is_dev_replace); + if (ret) { + scrub_free_ctx(sctx); + return ERR_PTR(ret); + } + return sctx; + +nomem: + scrub_free_ctx(sctx); + return ERR_PTR(-ENOMEM); +} + +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, + void *warn_ctx) +{ + u64 isize; + u32 nlink; + int ret; + int i; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = warn_ctx; + struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key root_key; + struct btrfs_key key; + + root_key.objectid = root; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + /* + * this makes the path point to (inum INODE_ITEM ioff) + */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); + if (ret) { + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + isize = btrfs_inode_size(eb, inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + ipath = init_ipath(4096, local_root, swarn->path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu, " + "length %llu, links %u (path: %s)\n", swarn->errstr, + swarn->logical, rcu_str_deref(swarn->dev->name), + (unsigned long long)swarn->sector, root, inum, offset, + min(isize - offset, (u64)PAGE_SIZE), nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + + free_ipath(ipath); + return 0; + +err: + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu: path " + "resolving failed with ret=%d\n", swarn->errstr, + swarn->logical, rcu_str_deref(swarn->dev->name), + (unsigned long long)swarn->sector, root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) +{ + struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + unsigned long ptr = 0; + u64 extent_item_pos; + u64 flags = 0; + u64 ref_root; + u32 item_size; + u8 ref_level; + int ret; + + WARN_ON(sblock->page_count < 1); + dev = sblock->pagev[0]->dev; + fs_info = sblock->sctx->dev_root->fs_info; + + path = btrfs_alloc_path(); + if (!path) + return; + + swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.logical = sblock->pagev[0]->logical; + swarn.errstr = errstr; + swarn.dev = NULL; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, + &flags); + if (ret < 0) + goto out; + + extent_item_pos = swarn.logical - found_key.objectid; + swarn.extent_item_size = found_key.offset; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); + printk_in_rcu(KERN_WARNING + "BTRFS: %s at logical %llu on dev %s, " + "sector %llu: metadata %s (level %d) in tree " + "%llu\n", errstr, swarn.logical, + rcu_str_deref(dev->name), + (unsigned long long)swarn.sector, + ref_level ? "node" : "leaf", + ret < 0 ? -1 : ref_level, + ret < 0 ? -1 : ref_root); + } while (ret != 1); + btrfs_release_path(path); + } else { + btrfs_release_path(path); + swarn.path = path; + swarn.dev = dev; + iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, 1, + scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); +} + +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) +{ + struct page *page = NULL; + unsigned long index; + struct scrub_fixup_nodatasum *fixup = fixup_ctx; + int ret; + int corrected = 0; + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; + u64 end = offset + PAGE_SIZE - 1; + struct btrfs_root *local_root; + int srcu_index; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + return PTR_ERR(local_root); + } + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + if (PageDirty(page)) { + /* + * we need to write the data to the defect sector. the + * data that was in that sector is not in memory, + * because the page was modified. we must not write the + * modified page to that sector. + * + * TODO: what could be done here: wait for the delalloc + * runner to write out that page (might involve + * COW) and see whether the sector is still + * referenced afterwards. + * + * For the meantime, we'll treat this error + * incorrectable, although there is a chance that a + * later scrub will find the bad sector again and that + * there's no dirty page in memory, then. + */ + ret = -EIO; + goto out; + } + ret = repair_io_failure(inode, offset, PAGE_SIZE, + fixup->logical, page, + offset - page_offset(page), + fixup->mirror_num); + unlock_page(page); + corrected = !ret; + } else { + /* + * we need to get good data first. the general readpage path + * will call repair_io_failure for us, we just have to make + * sure we read the bad mirror. + */ + ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + if (ret) { + /* set_extent_bits should give proper error */ + WARN_ON(ret > 0); + if (ret > 0) + ret = -EFAULT; + goto out; + } + + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, + fixup->mirror_num); + wait_on_page_locked(page); + + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, + end, EXTENT_DAMAGED, 0, NULL); + if (!corrected) + clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + } + +out: + if (page) + put_page(page); + + iput(inode); + + if (ret < 0) + return ret; + + if (ret == 0 && corrected) { + /* + * we only need to call readpage for one of the inodes belonging + * to this extent. so make iterate_extent_inodes stop + */ + return 1; + } + + return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ + int ret; + struct scrub_fixup_nodatasum *fixup; + struct scrub_ctx *sctx; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + int uncorrectable = 0; + + fixup = container_of(work, struct scrub_fixup_nodatasum, work); + sctx = fixup->sctx; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + ++sctx->stat.malloc_errors; + spin_unlock(&sctx->stat_lock); + uncorrectable = 1; + goto out; + } + + trans = btrfs_join_transaction(fixup->root); + if (IS_ERR(trans)) { + uncorrectable = 1; + goto out; + } + + /* + * the idea is to trigger a regular read through the standard path. we + * read a page from the (failed) logical address by specifying the + * corresponding copynum of the failed sector. thus, that readpage is + * expected to fail. + * that is the point where on-the-fly error correction will kick in + * (once it's finished) and rewrite the failed sector if a good copy + * can be found. + */ + ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, + path, scrub_fixup_readpage, + fixup); + if (ret < 0) { + uncorrectable = 1; + goto out; + } + WARN_ON(ret != 1); + + spin_lock(&sctx->stat_lock); + ++sctx->stat.corrected_errors; + spin_unlock(&sctx->stat_lock); + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fixup->root); + if (uncorrectable) { + spin_lock(&sctx->stat_lock); + ++sctx->stat.uncorrectable_errors; + spin_unlock(&sctx->stat_lock); + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info->dev_replace. + num_uncorrectable_read_errors); + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "unable to fixup (nodatasum) error at logical %llu on dev %s\n", + fixup->logical, rcu_str_deref(fixup->dev->name)); + } + + btrfs_free_path(path); + kfree(fixup); + + scrub_pending_trans_workers_dec(sctx); +} + +static inline void scrub_get_recover(struct scrub_recover *recover) +{ + atomic_inc(&recover->refs); +} + +static inline void scrub_put_recover(struct scrub_recover *recover) +{ + if (atomic_dec_and_test(&recover->refs)) { + btrfs_put_bbio(recover->bbio); + kfree(recover); + } +} + +/* + * scrub_handle_errored_block gets called when either verification of the + * pages failed or the bio failed to read, e.g. with EIO. In the latter + * case, this function handles all pages in the bio, even though only one + * may be bad. + * The goal of this function is to repair the errored block by using the + * contents of one of the mirrors. + */ +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) +{ + struct scrub_ctx *sctx = sblock_to_check->sctx; + struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; + u64 length; + u64 logical; + u64 generation; + unsigned int failed_mirror_index; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + struct scrub_block *sblock_bad; + int ret; + int mirror_index; + int page_num; + int success; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + BUG_ON(sblock_to_check->page_count < 1); + fs_info = sctx->dev_root->fs_info; + if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + return 0; + } + length = sblock_to_check->page_count * PAGE_SIZE; + logical = sblock_to_check->pagev[0]->logical; + generation = sblock_to_check->pagev[0]->generation; + BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0]->flags & + BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock_to_check->pagev[0]->have_csum; + csum = sblock_to_check->pagev[0]->csum; + dev = sblock_to_check->pagev[0]->dev; + + if (sctx->is_dev_replace && !is_metadata && !have_csum) { + sblocks_for_recheck = NULL; + goto nodatasum_case; + } + + /* + * read all mirrors one after the other. This includes to + * re-read the extent or metadata block that failed (that was + * the cause that this fixup code is called) another time, + * page by page this time in order to know which pages + * caused I/O errors and which ones are good (for all mirrors). + * It is the goal to handle the situation when more than one + * mirror contains I/O errors, but the errors do not + * overlap, i.e. the data can be repaired by selecting the + * pages from those mirrors without I/O error on the + * particular pages. One example (with blocks >= 2 * PAGE_SIZE) + * would be that mirror #1 has an I/O error on the first page, + * the second page is good, and mirror #2 has an I/O error on + * the second page, but the first page is good. + * Then the first page of the first mirror can be repaired by + * taking the first page of the second mirror, and the + * second page of the second mirror can be repaired by + * copying the contents of the 2nd page of the 1st mirror. + * One more note: if the pages of one mirror contain I/O + * errors, the checksum cannot be verified. In order to get + * the best data for repairing, the first attempt is to find + * a mirror without I/O errors and with a validated checksum. + * Only if this is not possible, the pages are picked from + * mirrors with I/O errors without considering the checksum. + * If the latter is the case, at the end, the checksum of the + * repaired area is verified in order to correctly maintain + * the statistics. + */ + + sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, + sizeof(*sblocks_for_recheck), GFP_NOFS); + if (!sblocks_for_recheck) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + goto out; + } + + /* setup the context, map the logical blocks and alloc the pages */ + ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + goto out; + } + BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); + sblock_bad = sblocks_for_recheck + failed_mirror_index; + + /* build and submit the bios for the failed mirror, check checksums */ + scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sctx->csum_size, 1); + + if (!sblock_bad->header_error && !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) { + /* + * the error disappeared after reading page by page, or + * the area was part of a huge bio and other parts of the + * bio caused I/O errors, or the block layer merged several + * read requests into one and the error is caused by a + * different bio (usually one of the two latter cases is + * the cause) + */ + spin_lock(&sctx->stat_lock); + sctx->stat.unverified_errors++; + sblock_to_check->data_corrected = 1; + spin_unlock(&sctx->stat_lock); + + if (sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock_bad); + goto out; + } + + if (!sblock_bad->no_io_error_seen) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sblock_to_check); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + } else if (sblock_bad->checksum_error) { + spin_lock(&sctx->stat_lock); + sctx->stat.csum_errors++; + spin_unlock(&sctx->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sblock_to_check); + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + } else if (sblock_bad->header_error) { + spin_lock(&sctx->stat_lock); + sctx->stat.verify_errors++; + spin_unlock(&sctx->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum/header error", + sblock_to_check); + if (sblock_bad->generation_error) + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_GENERATION_ERRS); + else + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + } + + if (sctx->readonly) { + ASSERT(!sctx->is_dev_replace); + goto out; + } + + if (!is_metadata && !have_csum) { + struct scrub_fixup_nodatasum *fixup_nodatasum; + + WARN_ON(sctx->is_dev_replace); + +nodatasum_case: + + /* + * !is_metadata and !have_csum, this means that the data + * might not be COW'ed, that it might be modified + * concurrently. The general strategy to work on the + * commit root does not help in the case when COW is not + * used. + */ + fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); + if (!fixup_nodatasum) + goto did_not_correct_error; + fixup_nodatasum->sctx = sctx; + fixup_nodatasum->dev = dev; + fixup_nodatasum->logical = logical; + fixup_nodatasum->root = fs_info->extent_root; + fixup_nodatasum->mirror_num = failed_mirror_index + 1; + scrub_pending_trans_workers_inc(sctx); + btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper, + scrub_fixup_nodatasum, NULL, NULL); + btrfs_queue_work(fs_info->scrub_workers, + &fixup_nodatasum->work); + goto out; + } + + /* + * now build and submit the bios for the other mirrors, check + * checksums. + * First try to pick the mirror which is completely without I/O + * errors and also does not have a checksum error. + * If one is found, and if a checksum is present, the full block + * that is known to contain an error is rewritten. Afterwards + * the block is known to be corrected. + * If a mirror is found which is completely correct, and no + * checksum is present, only those pages are rewritten that had + * an I/O error in the block to be repaired, since it cannot be + * determined, which copy of the other pages is better (and it + * could happen otherwise that a correct page would be + * overwritten by a bad one). + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other; + + if (mirror_index == failed_mirror_index) + continue; + sblock_other = sblocks_for_recheck + mirror_index; + + /* build and submit the bios, check checksums */ + scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size, 0); + + if (!sblock_other->header_error && + !sblock_other->checksum_error && + sblock_other->no_io_error_seen) { + if (sctx->is_dev_replace) { + scrub_write_block_to_dev_replace(sblock_other); + goto corrected_error; + } else { + ret = scrub_repair_block_from_good_copy( + sblock_bad, sblock_other); + if (!ret) + goto corrected_error; + } + } + } + + if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) + goto did_not_correct_error; + + /* + * In case of I/O errors in the area that is supposed to be + * repaired, continue by picking good copies of those pages. + * Select the good pages from mirrors to rewrite bad pages from + * the area to fix. Afterwards verify the checksum of the block + * that is supposed to be repaired. This verification step is + * only done for the purpose of statistic counting and for the + * final scrub report, whether errors remain. + * A perfect algorithm could make use of the checksum and try + * all possible combinations of pages from the different mirrors + * until the checksum verification succeeds. For example, when + * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * of mirror #2 is readable but the final checksum test fails, + * then the 2nd page of mirror #3 could be tried, whether now + * the final checksum succeedes. But this would be a rare + * exception and is therefore not implemented. At least it is + * avoided that the good copy is overwritten. + * A more useful improvement would be to pick the sectors + * without I/O error based on sector sizes (512 bytes on legacy + * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * mirror could be repaired by taking 512 byte of a different + * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * area are unreadable. + */ + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_block *sblock_other = NULL; + + /* skip no-io-error page in scrub */ + if (!page_bad->io_error && !sctx->is_dev_replace) + continue; + + /* try to find no-io-error page in mirrors */ + if (page_bad->io_error) { + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (!sblocks_for_recheck[mirror_index]. + pagev[page_num]->io_error) { + sblock_other = sblocks_for_recheck + + mirror_index; + break; + } + } + if (!sblock_other) + success = 0; + } + + if (sctx->is_dev_replace) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + if (!sblock_other) + sblock_other = sblock_bad; + + if (scrub_write_page_to_dev_replace(sblock_other, + page_num) != 0) { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + success = 0; + } + } else if (sblock_other) { + ret = scrub_repair_page_from_good_copy(sblock_bad, + sblock_other, + page_num, 0); + if (0 == ret) + page_bad->io_error = 0; + else + success = 0; + } + } + + if (success && !sctx->is_dev_replace) { + if (is_metadata || have_csum) { + /* + * need to verify the checksum now that all + * sectors on disk are repaired (the write + * request for data to be repaired is on its way). + * Just be lazy and use scrub_recheck_block() + * which re-reads the data before the checksum + * is verified, but most likely the data comes out + * of the page cache. + */ + scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sctx->csum_size, 1); + if (!sblock_bad->header_error && + !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) + goto corrected_error; + else + goto did_not_correct_error; + } else { +corrected_error: + spin_lock(&sctx->stat_lock); + sctx->stat.corrected_errors++; + sblock_to_check->data_corrected = 1; + spin_unlock(&sctx->stat_lock); + printk_ratelimited_in_rcu(KERN_ERR + "BTRFS: fixed up error at logical %llu on dev %s\n", + logical, rcu_str_deref(dev->name)); + } + } else { +did_not_correct_error: + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + printk_ratelimited_in_rcu(KERN_ERR + "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", + logical, rcu_str_deref(dev->name)); + } + +out: + if (sblocks_for_recheck) { + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; + mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck + + mirror_index; + struct scrub_recover *recover; + int page_index; + + for (page_index = 0; page_index < sblock->page_count; + page_index++) { + sblock->pagev[page_index]->sblock = NULL; + recover = sblock->pagev[page_index]->recover; + if (recover) { + scrub_put_recover(recover); + sblock->pagev[page_index]->recover = + NULL; + } + scrub_page_put(sblock->pagev[page_index]); + } + } + kfree(sblocks_for_recheck); + } + + return 0; +} + +static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) +{ + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + return 3; + else + return (int)bbio->num_stripes; +} + +static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, + u64 *raid_map, + u64 mapped_length, + int nstripes, int mirror, + int *stripe_index, + u64 *stripe_offset) +{ + int i; + + if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* RAID5/6 */ + for (i = 0; i < nstripes; i++) { + if (raid_map[i] == RAID6_Q_STRIPE || + raid_map[i] == RAID5_P_STRIPE) + continue; + + if (logical >= raid_map[i] && + logical < raid_map[i] + mapped_length) + break; + } + + *stripe_index = i; + *stripe_offset = logical - raid_map[i]; + } else { + /* The other RAID type */ + *stripe_index = mirror; + *stripe_offset = 0; + } +} + +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, + struct scrub_block *sblocks_for_recheck) +{ + struct scrub_ctx *sctx = original_sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + u64 length = original_sblock->page_count * PAGE_SIZE; + u64 logical = original_sblock->pagev[0]->logical; + struct scrub_recover *recover; + struct btrfs_bio *bbio; + u64 sublen; + u64 mapped_length; + u64 stripe_offset; + int stripe_index; + int page_index = 0; + int mirror_index; + int nmirrors; + int ret; + + /* + * note: the two members refs and outstanding_pages + * are not used (and not set) in the blocks that are used for + * the recheck procedure + */ + + while (length > 0) { + sublen = min_t(u64, length, PAGE_SIZE); + mapped_length = sublen; + bbio = NULL; + + /* + * with a length of PAGE_SIZE, each returned stripe + * represents one mirror + */ + ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, + &mapped_length, &bbio, 0, 1); + if (ret || !bbio || mapped_length < sublen) { + btrfs_put_bbio(bbio); + return -EIO; + } + + recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); + if (!recover) { + btrfs_put_bbio(bbio); + return -ENOMEM; + } + + atomic_set(&recover->refs, 1); + recover->bbio = bbio; + recover->map_length = mapped_length; + + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); + + nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); + + for (mirror_index = 0; mirror_index < nmirrors; + mirror_index++) { + struct scrub_block *sblock; + struct scrub_page *page; + + sblock = sblocks_for_recheck + mirror_index; + sblock->sctx = sctx; + page = kzalloc(sizeof(*page), GFP_NOFS); + if (!page) { +leave_nomem: + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + scrub_put_recover(recover); + return -ENOMEM; + } + scrub_page_get(page); + sblock->pagev[page_index] = page; + page->logical = logical; + + scrub_stripe_index_and_offset(logical, + bbio->map_type, + bbio->raid_map, + mapped_length, + bbio->num_stripes - + bbio->num_tgtdevs, + mirror_index, + &stripe_index, + &stripe_offset); + page->physical = bbio->stripes[stripe_index].physical + + stripe_offset; + page->dev = bbio->stripes[stripe_index].dev; + + BUG_ON(page_index >= original_sblock->page_count); + page->physical_for_dev_replace = + original_sblock->pagev[page_index]-> + physical_for_dev_replace; + /* for missing devices, dev->bdev is NULL */ + page->mirror_num = mirror_index + 1; + sblock->page_count++; + page->page = alloc_page(GFP_NOFS); + if (!page->page) + goto leave_nomem; + + scrub_get_recover(recover); + page->recover = recover; + } + scrub_put_recover(recover); + length -= sublen; + logical += sublen; + page_index++; + } + + return 0; +} + +struct scrub_bio_ret { + struct completion event; + int error; +}; + +static void scrub_bio_wait_endio(struct bio *bio, int error) +{ + struct scrub_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +static inline int scrub_is_page_on_raid56(struct scrub_page *page) +{ + return page->recover && + (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); +} + +static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, + struct bio *bio, + struct scrub_page *page) +{ + struct scrub_bio_ret done; + int ret; + + init_completion(&done.event); + done.error = 0; + bio->bi_iter.bi_sector = page->logical >> 9; + bio->bi_private = &done; + bio->bi_end_io = scrub_bio_wait_endio; + + ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, + page->recover->map_length, + page->mirror_num, 0); + if (ret) + return ret; + + wait_for_completion(&done.event); + if (done.error) + return -EIO; + + return 0; +} + +/* + * this function will check the on disk data for checksum errors, header + * errors and read I/O errors. If any I/O errors happen, the exact pages + * which are errored are marked as being bad. The goal is to enable scrub + * to take those pages that are not errored from all the mirrors so that + * the pages that are errored in the just handled mirror can be repaired. + */ +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size, int retry_failed_mirror) +{ + int page_num; + + sblock->no_io_error_seen = 1; + sblock->header_error = 0; + sblock->checksum_error = 0; + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + struct bio *bio; + struct scrub_page *page = sblock->pagev[page_num]; + + if (page->dev->bdev == NULL) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } + + WARN_ON(!page->page); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } + bio->bi_bdev = page->dev->bdev; + + bio_add_page(bio, page->page, PAGE_SIZE, 0); + if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { + if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) + sblock->no_io_error_seen = 0; + } else { + bio->bi_iter.bi_sector = page->physical >> 9; + + if (btrfsic_submit_bio_wait(READ, bio)) + sblock->no_io_error_seen = 0; + } + + bio_put(bio); + } + + if (sblock->no_io_error_seen) + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + csum_size); + + return; +} + +static inline int scrub_check_fsid(u8 fsid[], + struct scrub_page *spage) +{ + struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; + int ret; + + ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE); + return !ret; +} + +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size) +{ + int page_num; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + void *mapped_buffer; + + WARN_ON(!sblock->pagev[0]->page); + if (is_metadata) { + struct btrfs_header *h; + + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); + h = (struct btrfs_header *)mapped_buffer; + + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || + !scrub_check_fsid(h->fsid, sblock->pagev[0]) || + memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) { + sblock->header_error = 1; + } else if (generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + } + csum = h->csum; + } else { + if (!have_csum) + return; + + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); + } + + for (page_num = 0;;) { + if (page_num == 0 && is_metadata) + crc = btrfs_csum_data( + ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, + crc, PAGE_SIZE - BTRFS_CSUM_SIZE); + else + crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE); + + kunmap_atomic(mapped_buffer); + page_num++; + if (page_num >= sblock->page_count) + break; + WARN_ON(!sblock->pagev[page_num]->page); + + mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, csum, csum_size)) + sblock->checksum_error = 1; +} + +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good) +{ + int page_num; + int ret = 0; + + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + int ret_sub; + + ret_sub = scrub_repair_page_from_good_copy(sblock_bad, + sblock_good, + page_num, 1); + if (ret_sub) + ret = ret_sub; + } + + return ret; +} + +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write) +{ + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_page *page_good = sblock_good->pagev[page_num]; + + BUG_ON(page_bad->page == NULL); + BUG_ON(page_good->page == NULL); + if (force_write || sblock_bad->header_error || + sblock_bad->checksum_error || page_bad->io_error) { + struct bio *bio; + int ret; + + if (!page_bad->dev->bdev) { + printk_ratelimited(KERN_WARNING "BTRFS: " + "scrub_repair_page_from_good_copy(bdev == NULL) " + "is unexpected!\n"); + return -EIO; + } + + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_bdev = page_bad->dev->bdev; + bio->bi_iter.bi_sector = page_bad->physical >> 9; + + ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; + } + + if (btrfsic_submit_bio_wait(WRITE, bio)) { + btrfs_dev_stat_inc_and_print(page_bad->dev, + BTRFS_DEV_STAT_WRITE_ERRS); + btrfs_dev_replace_stats_inc( + &sblock_bad->sctx->dev_root->fs_info-> + dev_replace.num_write_errors); + bio_put(bio); + return -EIO; + } + bio_put(bio); + } + + return 0; +} + +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +{ + int page_num; + + /* + * This block is used for the check of the parity on the source device, + * so the data needn't be written into the destination device. + */ + if (sblock->sparity) + return; + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + int ret; + + ret = scrub_write_page_to_dev_replace(sblock, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sblock->sctx->dev_root->fs_info->dev_replace. + num_write_errors); + } +} + +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num) +{ + struct scrub_page *spage = sblock->pagev[page_num]; + + BUG_ON(spage->page == NULL); + if (spage->io_error) { + void *mapped_buffer = kmap_atomic(spage->page); + + memset(mapped_buffer, 0, PAGE_CACHE_SIZE); + flush_dcache_page(spage->page); + kunmap_atomic(mapped_buffer); + } + return scrub_add_page_to_wr_bio(sblock->sctx, spage); +} + +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + int ret; + + mutex_lock(&wr_ctx->wr_lock); +again: + if (!wr_ctx->wr_curr_bio) { + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + GFP_NOFS); + if (!wr_ctx->wr_curr_bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + wr_ctx->wr_curr_bio->sctx = sctx; + wr_ctx->wr_curr_bio->page_count = 0; + } + sbio = wr_ctx->wr_curr_bio; + if (sbio->page_count == 0) { + struct bio *bio; + + sbio->physical = spage->physical_for_dev_replace; + sbio->logical = spage->logical; + sbio->dev = wr_ctx->tgtdev; + bio = sbio->bio; + if (!bio) { + bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + if (!bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + sbio->bio = bio; + } + + bio->bi_private = sbio; + bio->bi_end_io = scrub_wr_bio_end_io; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_iter.bi_sector = sbio->physical >> 9; + sbio->err = 0; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical_for_dev_replace || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { + scrub_wr_submit(sctx); + goto again; + } + + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); + return -EIO; + } + scrub_wr_submit(sctx); + goto again; + } + + sbio->pagev[sbio->page_count] = spage; + scrub_page_get(spage); + sbio->page_count++; + if (sbio->page_count == wr_ctx->pages_per_wr_bio) + scrub_wr_submit(sctx); + mutex_unlock(&wr_ctx->wr_lock); + + return 0; +} + +static void scrub_wr_submit(struct scrub_ctx *sctx) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + + if (!wr_ctx->wr_curr_bio) + return; + + sbio = wr_ctx->wr_curr_bio; + wr_ctx->wr_curr_bio = NULL; + WARN_ON(!sbio->bio->bi_bdev); + scrub_pending_bio_inc(sctx); + /* process all writes in a single worker thread. Then the block layer + * orders the requests before sending them to the driver which + * doubled the write performance on spinning disks when measured + * with Linux 3.5 */ + btrfsic_submit_bio(WRITE, sbio->bio); +} + +static void scrub_wr_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, + scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); +} + +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_ctx *sctx = sbio->sctx; + int i; + + WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); + if (sbio->err) { + struct btrfs_dev_replace *dev_replace = + &sbio->sctx->dev_root->fs_info->dev_replace; + + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + btrfs_dev_replace_stats_inc(&dev_replace-> + num_write_errors); + } + } + + for (i = 0; i < sbio->page_count; i++) + scrub_page_put(sbio->pagev[i]); + + bio_put(sbio->bio); + kfree(sbio); + scrub_pending_bio_dec(sctx); +} + +static int scrub_checksum(struct scrub_block *sblock) +{ + u64 flags; + int ret; + + WARN_ON(sblock->page_count < 1); + flags = sblock->pagev[0]->flags; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) + ret = scrub_checksum_data(sblock); + else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = scrub_checksum_tree_block(sblock); + else if (flags & BTRFS_EXTENT_FLAG_SUPER) + (void)scrub_checksum_super(sblock); + else + WARN_ON(1); + if (ret) + scrub_handle_errored_block(sblock); + + return ret; +} + +static int scrub_checksum_data(struct scrub_block *sblock) +{ + struct scrub_ctx *sctx = sblock->sctx; + u8 csum[BTRFS_CSUM_SIZE]; + u8 *on_disk_csum; + struct page *page; + void *buffer; + u32 crc = ~(u32)0; + int fail = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + if (!sblock->pagev[0]->have_csum) + return 0; + + on_disk_csum = sblock->pagev[0]->csum; + page = sblock->pagev[0]->page; + buffer = kmap_atomic(page); + + len = sctx->sectorsize; + index = 0; + for (;;) { + u64 l = min_t(u64, len, PAGE_SIZE); + + crc = btrfs_csum_data(buffer, crc, l); + kunmap_atomic(buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; + buffer = kmap_atomic(page); + } + + btrfs_csum_final(crc, csum); + if (memcmp(csum, on_disk_csum, sctx->csum_size)) + fail = 1; + + return fail; +} + +static int scrub_checksum_tree_block(struct scrub_block *sblock) +{ + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_header *h; + struct btrfs_root *root = sctx->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; + u32 crc = ~(u32)0; + int fail = 0; + int crc_fail = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0]->page; + mapped_buffer = kmap_atomic(page); + h = (struct btrfs_header *)mapped_buffer; + memcpy(on_disk_csum, h->csum, sctx->csum_size); + + /* + * we don't use the getter functions here, as we + * a) don't have an extent buffer and + * b) the page is already kmapped + */ + + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) + ++fail; + + if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) + ++fail; + + if (!scrub_check_fsid(h->fsid, sblock->pagev[0])) + ++fail; + + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) + ++fail; + + len = sctx->nodesize - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(p, crc, l); + kunmap_atomic(mapped_buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; + mapped_buffer = kmap_atomic(page); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) + ++crc_fail; + + return fail || crc_fail; +} + +static int scrub_checksum_super(struct scrub_block *sblock) +{ + struct btrfs_super_block *s; + struct scrub_ctx *sctx = sblock->sctx; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; + u32 crc = ~(u32)0; + int fail_gen = 0; + int fail_cor = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0]->page; + mapped_buffer = kmap_atomic(page); + s = (struct btrfs_super_block *)mapped_buffer; + memcpy(on_disk_csum, s->csum, sctx->csum_size); + + if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) + ++fail_cor; + + if (sblock->pagev[0]->generation != btrfs_super_generation(s)) + ++fail_gen; + + if (!scrub_check_fsid(s->fsid, sblock->pagev[0])) + ++fail_cor; + + len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(p, crc, l); + kunmap_atomic(mapped_buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; + mapped_buffer = kmap_atomic(page); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) + ++fail_cor; + + if (fail_cor + fail_gen) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + if (fail_cor) + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + else + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, + BTRFS_DEV_STAT_GENERATION_ERRS); + } + + return fail_cor + fail_gen; +} + +static void scrub_block_get(struct scrub_block *sblock) +{ + atomic_inc(&sblock->refs); +} + +static void scrub_block_put(struct scrub_block *sblock) +{ + if (atomic_dec_and_test(&sblock->refs)) { + int i; + + if (sblock->sparity) + scrub_parity_put(sblock->sparity); + + for (i = 0; i < sblock->page_count; i++) + scrub_page_put(sblock->pagev[i]); + kfree(sblock); + } +} + +static void scrub_page_get(struct scrub_page *spage) +{ + atomic_inc(&spage->refs); +} + +static void scrub_page_put(struct scrub_page *spage) +{ + if (atomic_dec_and_test(&spage->refs)) { + if (spage->page) + __free_page(spage->page); + kfree(spage); + } +} + +static void scrub_submit(struct scrub_ctx *sctx) +{ + struct scrub_bio *sbio; + + if (sctx->curr == -1) + return; + + sbio = sctx->bios[sctx->curr]; + sctx->curr = -1; + scrub_pending_bio_inc(sctx); + + if (!sbio->bio->bi_bdev) { + /* + * this case should not happen. If btrfs_map_block() is + * wrong, it could happen for dev-replace operations on + * missing devices when no mirrors are available, but in + * this case it should already fail the mount. + * This case is handled correctly (but _very_ slowly). + */ + printk_ratelimited(KERN_WARNING + "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); + bio_endio(sbio->bio, -EIO); + } else { + btrfsic_submit_bio(READ, sbio->bio); + } +} + +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) +{ + struct scrub_block *sblock = spage->sblock; + struct scrub_bio *sbio; + int ret; + +again: + /* + * grab a fresh bio or wait for one to become available + */ + while (sctx->curr == -1) { + spin_lock(&sctx->list_lock); + sctx->curr = sctx->first_free; + if (sctx->curr != -1) { + sctx->first_free = sctx->bios[sctx->curr]->next_free; + sctx->bios[sctx->curr]->next_free = -1; + sctx->bios[sctx->curr]->page_count = 0; + spin_unlock(&sctx->list_lock); + } else { + spin_unlock(&sctx->list_lock); + wait_event(sctx->list_wait, sctx->first_free != -1); + } + } + sbio = sctx->bios[sctx->curr]; + if (sbio->page_count == 0) { + struct bio *bio; + + sbio->physical = spage->physical; + sbio->logical = spage->logical; + sbio->dev = spage->dev; + bio = sbio->bio; + if (!bio) { + bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); + if (!bio) + return -ENOMEM; + sbio->bio = bio; + } + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_iter.bi_sector = sbio->physical >> 9; + sbio->err = 0; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical || + sbio->dev != spage->dev) { + scrub_submit(sctx); + goto again; + } + + sbio->pagev[sbio->page_count] = spage; + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + return -EIO; + } + scrub_submit(sctx); + goto again; + } + + scrub_block_get(sblock); /* one for the page added to the bio */ + atomic_inc(&sblock->outstanding_pages); + sbio->page_count++; + if (sbio->page_count == sctx->pages_per_rd_bio) + scrub_submit(sctx); + + return 0; +} + +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace) +{ + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + /* one ref inside this function, plus one for each page added to + * a bio later on */ + atomic_set(&sblock->refs, 1); + sblock->sctx = sctx; + sblock->no_io_error_seen = 1; + + for (index = 0; len > 0; index++) { + struct scrub_page *spage; + u64 l = min_t(u64, len, PAGE_SIZE); + + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + scrub_block_put(sblock); + return -ENOMEM; + } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + scrub_page_get(spage); + sblock->pagev[index] = spage; + spage->sblock = sblock; + spage->dev = dev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->physical_for_dev_replace = physical_for_dev_replace; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sctx->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; + len -= l; + logical += l; + physical += l; + physical_for_dev_replace += l; + } + + WARN_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; + + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } + } + + if (force) + scrub_submit(sctx); + + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); + return 0; +} + +static void scrub_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_queue_work(fs_info->scrub_workers, &sbio->work); +} + +static void scrub_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_ctx *sctx = sbio->sctx; + int i; + + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); + if (sbio->err) { + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + spage->sblock->no_io_error_seen = 0; + } + } + + /* now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + struct scrub_block *sblock = spage->sblock; + + if (atomic_dec_and_test(&sblock->outstanding_pages)) + scrub_block_complete(sblock); + scrub_block_put(sblock); + } + + bio_put(sbio->bio); + sbio->bio = NULL; + spin_lock(&sctx->list_lock); + sbio->next_free = sctx->first_free; + sctx->first_free = sbio->index; + spin_unlock(&sctx->list_lock); + + if (sctx->is_dev_replace && + atomic_read(&sctx->wr_ctx.flush_all_writes)) { + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + } + + scrub_pending_bio_dec(sctx); +} + +static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, + unsigned long *bitmap, + u64 start, u64 len) +{ + u32 offset; + int nsectors; + int sectorsize = sparity->sctx->dev_root->sectorsize; + + if (len >= sparity->stripe_len) { + bitmap_set(bitmap, 0, sparity->nsectors); + return; + } + + start -= sparity->logic_start; + start = div_u64_rem(start, sparity->stripe_len, &offset); + offset /= sectorsize; + nsectors = (int)len / sectorsize; + + if (offset + nsectors <= sparity->nsectors) { + bitmap_set(bitmap, offset, nsectors); + return; + } + + bitmap_set(bitmap, offset, sparity->nsectors - offset); + bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); +} + +static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, + u64 start, u64 len) +{ + __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); +} + +static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, + u64 start, u64 len) +{ + __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); +} + +static void scrub_block_complete(struct scrub_block *sblock) +{ + int corrupted = 0; + + if (!sblock->no_io_error_seen) { + corrupted = 1; + scrub_handle_errored_block(sblock); + } else { + /* + * if has checksum error, write via repair mechanism in + * dev replace case, otherwise write here in dev replace + * case. + */ + corrupted = scrub_checksum(sblock); + if (!corrupted && sblock->sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock); + } + + if (sblock->sparity && corrupted && !sblock->data_corrected) { + u64 start = sblock->pagev[0]->logical; + u64 end = sblock->pagev[sblock->page_count - 1]->logical + + PAGE_SIZE; + + scrub_parity_mark_sectors_error(sblock->sparity, + start, end - start); + } +} + +static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, + u8 *csum) +{ + struct btrfs_ordered_sum *sum = NULL; + unsigned long index; + unsigned long num_sectors; + + while (!list_empty(&sctx->csum_list)) { + sum = list_first_entry(&sctx->csum_list, + struct btrfs_ordered_sum, list); + if (sum->bytenr > logical) + return 0; + if (sum->bytenr + sum->len > logical) + break; + + ++sctx->stat.csum_discards; + list_del(&sum->list); + kfree(sum); + sum = NULL; + } + if (!sum) + return 0; + + index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; + num_sectors = sum->len / sctx->sectorsize; + memcpy(csum, sum->sums + index, sctx->csum_size); + if (index == num_sectors - 1) { + list_del(&sum->list); + kfree(sum); + } + return 1; +} + +/* scrub extent tries to collect up to 64 kB for each bio */ +static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u64 physical_for_dev_replace) +{ + int ret; + u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sctx->sectorsize; + spin_lock(&sctx->stat_lock); + sctx->stat.data_extents_scrubbed++; + sctx->stat.data_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + blocksize = sctx->nodesize; + spin_lock(&sctx->stat_lock); + sctx->stat.tree_extents_scrubbed++; + sctx->stat.tree_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); + } else { + blocksize = sctx->sectorsize; + WARN_ON(1); + } + + while (len) { + u64 l = min_t(u64, len, blocksize); + int have_csum = 0; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + /* push csums to sbio */ + have_csum = scrub_find_csum(sctx, logical, l, csum); + if (have_csum == 0) + ++sctx->stat.no_csum; + if (sctx->is_dev_replace && !have_csum) { + ret = copy_nocow_pages(sctx, logical, l, + mirror_num, + physical_for_dev_replace); + goto behind_scrub_pages; + } + } + ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, + mirror_num, have_csum ? csum : NULL, 0, + physical_for_dev_replace); +behind_scrub_pages: + if (ret) + return ret; + len -= l; + logical += l; + physical += l; + physical_for_dev_replace += l; + } + return 0; +} + +static int scrub_pages_for_parity(struct scrub_parity *sparity, + u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, + u64 flags, u64 gen, int mirror_num, u8 *csum) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + /* one ref inside this function, plus one for each page added to + * a bio later on */ + atomic_set(&sblock->refs, 1); + sblock->sctx = sctx; + sblock->no_io_error_seen = 1; + sblock->sparity = sparity; + scrub_parity_get(sparity); + + for (index = 0; len > 0; index++) { + struct scrub_page *spage; + u64 l = min_t(u64, len, PAGE_SIZE); + + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + scrub_block_put(sblock); + return -ENOMEM; + } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + /* For scrub block */ + scrub_page_get(spage); + sblock->pagev[index] = spage; + /* For scrub parity */ + scrub_page_get(spage); + list_add_tail(&spage->list, &sparity->spages); + spage->sblock = sblock; + spage->dev = dev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sctx->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; + len -= l; + logical += l; + physical += l; + } + + WARN_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; + + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } + } + + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); + return 0; +} + +static int scrub_extent_for_parity(struct scrub_parity *sparity, + u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, + u64 flags, u64 gen, int mirror_num) +{ + struct scrub_ctx *sctx = sparity->sctx; + int ret; + u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sctx->sectorsize; + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + blocksize = sctx->nodesize; + } else { + blocksize = sctx->sectorsize; + WARN_ON(1); + } + + while (len) { + u64 l = min_t(u64, len, blocksize); + int have_csum = 0; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + /* push csums to sbio */ + have_csum = scrub_find_csum(sctx, logical, l, csum); + if (have_csum == 0) + goto skip; + } + ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, + flags, gen, mirror_num, + have_csum ? csum : NULL); + if (ret) + return ret; +skip: + len -= l; + logical += l; + physical += l; + } + return 0; +} + +/* + * Given a physical address, this will calculate it's + * logical offset. if this is a parity stripe, it will return + * the most left data stripe's logical offset. + * + * return 0 if it is a data stripe, 1 means parity stripe. + */ +static int get_raid56_logic_offset(u64 physical, int num, + struct map_lookup *map, u64 *offset, + u64 *stripe_start) +{ + int i; + int j = 0; + u64 stripe_nr; + u64 last_offset; + u32 stripe_index; + u32 rot; + + last_offset = (physical - map->stripes[num].physical) * + nr_data_stripes(map); + if (stripe_start) + *stripe_start = last_offset; + + *offset = last_offset; + for (i = 0; i < nr_data_stripes(map); i++) { + *offset = last_offset + i * map->stripe_len; + + stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); + + /* Work out the disk rotation on this stripe-set */ + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; + if (stripe_index == num) + return 0; + if (stripe_index < num) + j++; + } + *offset = last_offset + j * map->stripe_len; + return 1; +} + +static void scrub_free_parity(struct scrub_parity *sparity) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct scrub_page *curr, *next; + int nbits; + + nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + if (nbits) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors += nbits; + sctx->stat.uncorrectable_errors += nbits; + spin_unlock(&sctx->stat_lock); + } + + list_for_each_entry_safe(curr, next, &sparity->spages, list) { + list_del_init(&curr->list); + scrub_page_put(curr); + } + + kfree(sparity); +} + +static void scrub_parity_bio_endio(struct bio *bio, int error) +{ + struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_ctx *sctx = sparity->sctx; + + if (error) + bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + sparity->nsectors); + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); + bio_put(bio); +} + +static void scrub_parity_check_and_repair(struct scrub_parity *sparity) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct bio *bio; + struct btrfs_raid_bio *rbio; + struct scrub_page *spage; + struct btrfs_bio *bbio = NULL; + u64 length; + int ret; + + if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, + sparity->nsectors)) + goto out; + + length = sparity->logic_end - sparity->logic_start + 1; + ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, + sparity->logic_start, + &length, &bbio, 0, 1); + if (ret || !bbio || !bbio->raid_map) + goto bbio_out; + + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + goto bbio_out; + + bio->bi_iter.bi_sector = sparity->logic_start >> 9; + bio->bi_private = sparity; + bio->bi_end_io = scrub_parity_bio_endio; + + rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, + length, sparity->scrub_dev, + sparity->dbitmap, + sparity->nsectors); + if (!rbio) + goto rbio_out; + + list_for_each_entry(spage, &sparity->spages, list) + raid56_parity_add_scrub_pages(rbio, spage->page, + spage->logical); + + scrub_pending_bio_inc(sctx); + raid56_parity_submit_scrub_rbio(rbio); + return; + +rbio_out: + bio_put(bio); +bbio_out: + btrfs_put_bbio(bbio); + bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + sparity->nsectors); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); +out: + scrub_free_parity(sparity); +} + +static inline int scrub_calc_parity_bitmap_len(int nsectors) +{ + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); +} + +static void scrub_parity_get(struct scrub_parity *sparity) +{ + atomic_inc(&sparity->refs); +} + +static void scrub_parity_put(struct scrub_parity *sparity) +{ + if (!atomic_dec_and_test(&sparity->refs)) + return; + + scrub_parity_check_and_repair(sparity); +} + +static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, + struct map_lookup *map, + struct btrfs_device *sdev, + struct btrfs_path *path, + u64 logic_start, + u64 logic_end) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_extent_item *extent; + u64 flags; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + u64 generation; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + struct btrfs_device *extent_dev; + struct scrub_parity *sparity; + int nsectors; + int bitmap_len; + int extent_mirror_num; + int stop_loop = 0; + + nsectors = map->stripe_len / root->sectorsize; + bitmap_len = scrub_calc_parity_bitmap_len(nsectors); + sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, + GFP_NOFS); + if (!sparity) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + sparity->stripe_len = map->stripe_len; + sparity->nsectors = nsectors; + sparity->sctx = sctx; + sparity->scrub_dev = sdev; + sparity->logic_start = logic_start; + sparity->logic_end = logic_end; + atomic_set(&sparity->refs, 1); + INIT_LIST_HEAD(&sparity->spages); + sparity->dbitmap = sparity->bitmap; + sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; + + ret = 0; + while (logic_start < logic_end) { + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logic_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = btrfs_previous_extent_item(root, path, 0); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } + } + + stop_loop = 0; + while (1) { + u64 bytes; + + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + stop_loop = 1; + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->nodesize; + else + bytes = key.offset; + + if (key.objectid + bytes <= logic_start) + goto next; + + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + + if (key.objectid > logic_end) { + stop_loop = 1; + break; + } + + while (key.objectid >= logic_start + map->stripe_len) + logic_start += map->stripe_len; + + extent = btrfs_item_ptr(l, slot, + struct btrfs_extent_item); + flags = btrfs_extent_flags(l, extent); + generation = btrfs_extent_generation(l, extent); + + if (key.objectid < logic_start && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + key.objectid, logic_start); + goto next; + } +again: + extent_logical = key.objectid; + extent_len = bytes; + + if (extent_logical < logic_start) { + extent_len -= logic_start - extent_logical; + extent_logical = logic_start; + } + + if (extent_logical + extent_len > + logic_start + map->stripe_len) + extent_len = logic_start + map->stripe_len - + extent_logical; + + scrub_parity_mark_sectors_data(sparity, extent_logical, + extent_len); + + scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, + extent_logical, + extent_logical + extent_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + + ret = scrub_extent_for_parity(sparity, extent_logical, + extent_len, + extent_physical, + extent_dev, flags, + generation, + extent_mirror_num); + if (ret) + goto out; + + scrub_free_csums(sctx); + if (extent_logical + extent_len < + key.objectid + bytes) { + logic_start += map->stripe_len; + + if (logic_start >= logic_end) { + stop_loop = 1; + break; + } + + if (logic_start < key.objectid + bytes) { + cond_resched(); + goto again; + } + } +next: + path->slots[0]++; + } + + btrfs_release_path(path); + + if (stop_loop) + break; + + logic_start += map->stripe_len; + } +out: + if (ret < 0) + scrub_parity_mark_sectors_error(sparity, logic_start, + logic_end - logic_start + 1); + scrub_parity_put(sparity); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + btrfs_release_path(path); + return ret < 0 ? ret : 0; +} + +static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + struct map_lookup *map, + struct btrfs_device *scrub_dev, + int num, u64 base, u64 length, + int is_dev_replace) +{ + struct btrfs_path *path, *ppath; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_extent_item *extent; + struct blk_plug plug; + u64 flags; + int ret; + int slot; + u64 nstripes; + struct extent_buffer *l; + struct btrfs_key key; + u64 physical; + u64 logical; + u64 logic_end; + u64 physical_end; + u64 generation; + int mirror_num; + struct reada_control *reada1; + struct reada_control *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; + u64 increment = map->stripe_len; + u64 offset; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + u64 stripe_logical; + u64 stripe_end; + struct btrfs_device *extent_dev; + int extent_mirror_num; + int stop_loop = 0; + + physical = map->stripes[num].physical; + offset = 0; + nstripes = div_u64(length, map->stripe_len); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + offset = map->stripe_len * num; + increment = map->stripe_len * map->num_stripes; + mirror_num = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + offset = map->stripe_len * (num / map->sub_stripes); + increment = map->stripe_len * factor; + mirror_num = num % map->sub_stripes + 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + increment = map->stripe_len; + mirror_num = num % map->num_stripes + 1; + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + increment = map->stripe_len; + mirror_num = num % map->num_stripes + 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + get_raid56_logic_offset(physical, num, map, &offset, NULL); + increment = map->stripe_len * nr_data_stripes(map); + mirror_num = 1; + } else { + increment = map->stripe_len; + mirror_num = 1; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ppath = btrfs_alloc_path(); + if (!ppath) { + btrfs_free_path(path); + return -ENOMEM; + } + + /* + * work on commit root. The related disk blocks are static as + * long as COW is applied. This means, it is save to rewrite + * them to repair disk errors without any race conditions + */ + path->search_commit_root = 1; + path->skip_locking = 1; + + ppath->search_commit_root = 1; + ppath->skip_locking = 1; + /* + * trigger the readahead for extent tree csum tree and wait for + * completion. During readahead, the scrub is officially paused + * to not hold off transaction commits + */ + logical = base + offset; + physical_end = physical + nstripes * map->stripe_len; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + get_raid56_logic_offset(physical_end, num, + map, &logic_end, NULL); + logic_end += base; + } else { + logic_end = logical + increment * nstripes; + } + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + scrub_blocked_if_needed(fs_info); + + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = logic_end; + key_end.type = BTRFS_METADATA_ITEM_KEY; + key_end.offset = (u64)-1; + reada1 = btrfs_reada_add(root, &key_start, &key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = logic_end; + reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); + + + /* + * collect all data csums for the stripe to avoid seeking during + * the scrub. This might currently (crc32) end up to be about 1MB + */ + blk_start_plug(&plug); + + /* + * now find all extents for each stripe and scrub them + */ + ret = 0; + while (physical < physical_end) { + /* for raid56, we skip parity stripe */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = get_raid56_logic_offset(physical, num, + map, &logical, &stripe_logical); + logical += base; + if (ret) { + stripe_logical += base; + stripe_end = stripe_logical + increment - 1; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + ppath, stripe_logical, + stripe_end); + if (ret) + goto out; + goto skip; + } + } + /* + * canceled? + */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) { + ret = -ECANCELED; + goto out; + } + /* + * check to see if we have to pause + */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* push queued extents */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + scrub_blocked_if_needed(fs_info); + } + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = btrfs_previous_extent_item(root, path, 0); + if (ret < 0) + goto out; + if (ret > 0) { + /* there's no smaller item, so stick with the + * larger one */ + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } + } + + stop_loop = 0; + while (1) { + u64 bytes; + + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + stop_loop = 1; + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->nodesize; + else + bytes = key.offset; + + if (key.objectid + bytes <= logical) + goto next; + + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + + if (key.objectid >= logical + map->stripe_len) { + /* out of this device extent */ + if (key.objectid >= logic_end) + stop_loop = 1; + break; + } + + extent = btrfs_item_ptr(l, slot, + struct btrfs_extent_item); + flags = btrfs_extent_flags(l, extent); + generation = btrfs_extent_generation(l, extent); + + if (key.objectid < logical && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning " + "stripes, ignored. logical=%llu", + key.objectid, logical); + goto next; + } + +again: + extent_logical = key.objectid; + extent_len = bytes; + + /* + * trim extent to this stripe + */ + if (extent_logical < logical) { + extent_len -= logical - extent_logical; + extent_logical = logical; + } + if (extent_logical + extent_len > + logical + map->stripe_len) { + extent_len = logical + map->stripe_len - + extent_logical; + } + + extent_physical = extent_logical - logical + physical; + extent_dev = scrub_dev; + extent_mirror_num = mirror_num; + if (is_dev_replace) + scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + + ret = scrub_extent(sctx, extent_logical, extent_len, + extent_physical, extent_dev, flags, + generation, extent_mirror_num, + extent_logical - logical + physical); + if (ret) + goto out; + + scrub_free_csums(sctx); + if (extent_logical + extent_len < + key.objectid + bytes) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* + * loop until we find next data stripe + * or we have finished all stripes. + */ +loop: + physical += map->stripe_len; + ret = get_raid56_logic_offset(physical, + num, map, &logical, + &stripe_logical); + logical += base; + + if (ret && physical < physical_end) { + stripe_logical += base; + stripe_end = stripe_logical + + increment - 1; + ret = scrub_raid56_parity(sctx, + map, scrub_dev, ppath, + stripe_logical, + stripe_end); + if (ret) + goto out; + goto loop; + } + } else { + physical += map->stripe_len; + logical += increment; + } + if (logical < key.objectid + bytes) { + cond_resched(); + goto again; + } + + if (physical >= physical_end) { + stop_loop = 1; + break; + } + } +next: + path->slots[0]++; + } + btrfs_release_path(path); +skip: + logical += increment; + physical += map->stripe_len; + spin_lock(&sctx->stat_lock); + if (stop_loop) + sctx->stat.last_physical = map->stripes[num].physical + + length; + else + sctx->stat.last_physical = physical; + spin_unlock(&sctx->stat_lock); + if (stop_loop) + break; + } +out: + /* push queued extents */ + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + blk_finish_plug(&plug); + btrfs_free_path(path); + btrfs_free_path(ppath); + return ret < 0 ? ret : 0; +} + +static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 length, + u64 dev_offset, int is_dev_replace) +{ + struct btrfs_mapping_tree *map_tree = + &sctx->dev_root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + int i; + int ret = 0; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + read_unlock(&map_tree->map_tree.lock); + + if (!em) + return -EINVAL; + + map = (struct map_lookup *)em->bdev; + if (em->start != chunk_offset) + goto out; + + if (em->len < length) + goto out; + + for (i = 0; i < map->num_stripes; ++i) { + if (map->stripes[i].dev->bdev == scrub_dev->bdev && + map->stripes[i].physical == dev_offset) { + ret = scrub_stripe(sctx, map, scrub_dev, i, + chunk_offset, length, + is_dev_replace); + if (ret) + goto out; + } + } +out: + free_extent_map(em); + + return ret; +} + +static noinline_for_stack +int scrub_enumerate_chunks(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, u64 start, u64 end, + int is_dev_replace) +{ + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + struct btrfs_root *root = sctx->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_block_group_cache *cache; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = scrub_dev->devid; + key.offset = 0ull; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + } + + l = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.objectid != scrub_dev->devid) + break; + + if (found_key.type != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset >= end) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (found_key.offset + length <= start) + goto skip; + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + + /* + * get a reference on the corresponding block group to prevent + * the chunk from going away while we scrub it + */ + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + + /* some chunks are removed but not committed to disk yet, + * continue scrubbing */ + if (!cache) + goto skip; + + dev_replace->cursor_right = found_key.offset + length; + dev_replace->cursor_left = found_key.offset; + dev_replace->item_needs_writeback = 1; + ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, + chunk_offset, length, found_key.offset, + is_dev_replace); + + /* + * flush, submit all pending read and write bios, afterwards + * wait for them. + * Note that in the dev replace case, a read request causes + * write requests that are submitted in the read completion + * worker. Therefore in the current situation, it is required + * that all write requests are flushed, so that all read and + * write requests are really completed when bios_in_flight + * changes to 0. + */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + /* + * must be called before we decrease @scrub_paused. + * make sure we don't block transaction commit while + * we are waiting pending workers finished. + */ + wait_event(sctx->list_wait, + atomic_read(&sctx->workers_pending) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); + + btrfs_put_block_group(cache); + if (ret) + break; + if (is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0) { + ret = -EIO; + break; + } + if (sctx->stat.malloc_errors > 0) { + ret = -ENOMEM; + break; + } + + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; +skip: + key.offset = found_key.offset + length; + btrfs_release_path(path); + } + + btrfs_free_path(path); + + /* + * ret can still be 1 from search_slot or next_leaf, + * that's not an error + */ + return ret < 0 ? ret : 0; +} + +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev) +{ + int i; + u64 bytenr; + u64 gen; + int ret; + struct btrfs_root *root = sctx->dev_root; + + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + return -EIO; + + /* Seed devices of a new filesystem has their own generation. */ + if (scrub_dev->fs_devices != root->fs_info->fs_devices) + gen = scrub_dev->generation; + else + gen = root->fs_info->last_trans_committed; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE > + scrub_dev->commit_total_bytes) + break; + + ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, 1, bytenr); + if (ret) + return ret; + } + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + + return 0; +} + +/* + * get a reference count on fs_info->scrub_workers. start worker if necessary + */ +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + int is_dev_replace) +{ + int ret = 0; + unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; + + if (fs_info->scrub_workers_refcnt == 0) { + if (is_dev_replace) + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + 1, 4); + else + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + max_active, 4); + if (!fs_info->scrub_workers) { + ret = -ENOMEM; + goto out; + } + fs_info->scrub_wr_completion_workers = + btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + max_active, 2); + if (!fs_info->scrub_wr_completion_workers) { + ret = -ENOMEM; + goto out; + } + fs_info->scrub_nocow_workers = + btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + if (!fs_info->scrub_nocow_workers) { + ret = -ENOMEM; + goto out; + } + } + ++fs_info->scrub_workers_refcnt; +out: + return ret; +} + +static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) +{ + if (--fs_info->scrub_workers_refcnt == 0) { + btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); + } + WARN_ON(fs_info->scrub_workers_refcnt < 0); +} + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace) +{ + struct scrub_ctx *sctx; + int ret; + struct btrfs_device *dev; + struct rcu_string *name; + + if (btrfs_fs_closing(fs_info)) + return -EINVAL; + + if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { + /* + * in this case scrub is unable to calculate the checksum + * the way scrub is implemented. Do not handle this + * situation at all because it won't ever happen. + */ + btrfs_err(fs_info, + "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", + fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); + return -EINVAL; + } + + if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { + /* not supported for data w/o checksums */ + btrfs_err(fs_info, + "scrub: size assumption sectorsize != PAGE_SIZE " + "(%d != %lu) fails", + fs_info->chunk_root->sectorsize, PAGE_SIZE); + return -EINVAL; + } + + if (fs_info->chunk_root->nodesize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || + fs_info->chunk_root->sectorsize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + /* + * would exhaust the array bounds of pagev member in + * struct scrub_block + */ + btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize " + "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", + fs_info->chunk_root->nodesize, + SCRUB_MAX_PAGES_PER_BLOCK, + fs_info->chunk_root->sectorsize, + SCRUB_MAX_PAGES_PER_BLOCK); + return -EINVAL; + } + + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!dev || (dev->missing && !is_dev_replace)) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return -ENODEV; + } + + if (!is_dev_replace && !readonly && !dev->writeable) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + rcu_read_lock(); + name = rcu_dereference(dev->name); + btrfs_err(fs_info, "scrub: device %s is not writable", + name->str); + rcu_read_unlock(); + return -EROFS; + } + + mutex_lock(&fs_info->scrub_lock); + if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return -EIO; + } + + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (dev->scrub_device || + (!is_dev_replace && + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { + btrfs_dev_replace_unlock(&fs_info->dev_replace); + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return -EINPROGRESS; + } + btrfs_dev_replace_unlock(&fs_info->dev_replace); + + ret = scrub_workers_get(fs_info, is_dev_replace); + if (ret) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return ret; + } + + sctx = scrub_setup_ctx(dev, is_dev_replace); + if (IS_ERR(sctx)) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); + return PTR_ERR(sctx); + } + sctx->readonly = readonly; + dev->scrub_device = sctx; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + /* + * checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. + */ + __scrub_blocked_if_needed(fs_info); + atomic_inc(&fs_info->scrubs_running); + mutex_unlock(&fs_info->scrub_lock); + + if (!is_dev_replace) { + /* + * by holding device list mutex, we can + * kick off writing super in log tree sync. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + ret = scrub_supers(sctx, dev); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + } + + if (!ret) + ret = scrub_enumerate_chunks(sctx, dev, start, end, + is_dev_replace); + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + atomic_dec(&fs_info->scrubs_running); + wake_up(&fs_info->scrub_pause_wait); + + wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); + + if (progress) + memcpy(progress, &sctx->stat, sizeof(*progress)); + + mutex_lock(&fs_info->scrub_lock); + dev->scrub_device = NULL; + scrub_workers_put(fs_info); + mutex_unlock(&fs_info->scrub_lock); + + scrub_put_ctx(sctx); + + return ret; +} + +void btrfs_scrub_pause(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrub_pause_req); + while (atomic_read(&fs_info->scrubs_paused) != + atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_paused) == + atomic_read(&fs_info->scrubs_running)); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); +} + +void btrfs_scrub_continue(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + atomic_dec(&fs_info->scrub_pause_req); + wake_up(&fs_info->scrub_pause_wait); +} + +int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->scrub_lock); + if (!atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + + atomic_inc(&fs_info->scrub_cancel_req); + while (atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_running) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrub_cancel_req); + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct scrub_ctx *sctx; + + mutex_lock(&fs_info->scrub_lock); + sctx = dev->scrub_device; + if (!sctx) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + atomic_inc(&sctx->cancel_req); + while (dev->scrub_device) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + dev->scrub_device == NULL); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, + struct btrfs_scrub_progress *progress) +{ + struct btrfs_device *dev; + struct scrub_ctx *sctx = NULL; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); + if (dev) + sctx = dev->scrub_device; + if (sctx) + memcpy(progress, &sctx->stat, sizeof(*progress)); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; +} + +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) +{ + u64 mapped_length; + struct btrfs_bio *bbio = NULL; + int ret; + + mapped_length = extent_len; + ret = btrfs_map_block(fs_info, READ, extent_logical, + &mapped_length, &bbio, 0); + if (ret || !bbio || mapped_length < extent_len || + !bbio->stripes[0].dev->bdev) { + btrfs_put_bbio(bbio); + return; + } + + *extent_physical = bbio->stripes[0].physical; + *extent_mirror_num = bbio->mirror_num; + *extent_dev = bbio->stripes[0].dev; + btrfs_put_bbio(bbio); +} + +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace) +{ + WARN_ON(wr_ctx->wr_curr_bio != NULL); + + mutex_init(&wr_ctx->wr_lock); + wr_ctx->wr_curr_bio = NULL; + if (!is_dev_replace) + return 0; + + WARN_ON(!dev->bdev); + wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, + bio_get_nr_vecs(dev->bdev)); + wr_ctx->tgtdev = dev; + atomic_set(&wr_ctx->flush_all_writes, 0); + return 0; +} + +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) +{ + mutex_lock(&wr_ctx->wr_lock); + kfree(wr_ctx->wr_curr_bio); + wr_ctx->wr_curr_bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); +} + +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace) +{ + struct scrub_copy_nocow_ctx *nocow_ctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); + if (!nocow_ctx) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + scrub_pending_trans_workers_inc(sctx); + + nocow_ctx->sctx = sctx; + nocow_ctx->logical = logical; + nocow_ctx->len = len; + nocow_ctx->mirror_num = mirror_num; + nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; + btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, + copy_nocow_pages_worker, NULL, NULL); + INIT_LIST_HEAD(&nocow_ctx->inodes); + btrfs_queue_work(fs_info->scrub_nocow_workers, + &nocow_ctx->work); + + return 0; +} + +static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; + struct scrub_nocow_inode *nocow_inode; + + nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS); + if (!nocow_inode) + return -ENOMEM; + nocow_inode->inum = inum; + nocow_inode->offset = offset; + nocow_inode->root = root; + list_add_tail(&nocow_inode->list, &nocow_ctx->inodes); + return 0; +} + +#define COPY_COMPLETE 1 + +static void copy_nocow_pages_worker(struct btrfs_work *work) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = + container_of(work, struct scrub_copy_nocow_ctx, work); + struct scrub_ctx *sctx = nocow_ctx->sctx; + u64 logical = nocow_ctx->logical; + u64 len = nocow_ctx->len; + int mirror_num = nocow_ctx->mirror_num; + u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + int ret; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + struct btrfs_root *root; + int not_written = 0; + + fs_info = sctx->dev_root->fs_info; + root = fs_info->extent_root; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + not_written = 1; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + not_written = 1; + goto out; + } + + ret = iterate_inodes_from_logical(logical, fs_info, path, + record_inode_for_nocow, nocow_ctx); + if (ret != 0 && ret != -ENOENT) { + btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, " + "phys %llu, len %llu, mir %u, ret %d", + logical, physical_for_dev_replace, len, mirror_num, + ret); + not_written = 1; + goto out; + } + + btrfs_end_transaction(trans, root); + trans = NULL; + while (!list_empty(&nocow_ctx->inodes)) { + struct scrub_nocow_inode *entry; + entry = list_first_entry(&nocow_ctx->inodes, + struct scrub_nocow_inode, + list); + list_del_init(&entry->list); + ret = copy_nocow_pages_for_inode(entry->inum, entry->offset, + entry->root, nocow_ctx); + kfree(entry); + if (ret == COPY_COMPLETE) { + ret = 0; + break; + } else if (ret) { + break; + } + } +out: + while (!list_empty(&nocow_ctx->inodes)) { + struct scrub_nocow_inode *entry; + entry = list_first_entry(&nocow_ctx->inodes, + struct scrub_nocow_inode, + list); + list_del_init(&entry->list); + kfree(entry); + } + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, root); + if (not_written) + btrfs_dev_replace_stats_inc(&fs_info->dev_replace. + num_uncorrectable_read_errors); + + btrfs_free_path(path); + kfree(nocow_ctx); + + scrub_pending_trans_workers_dec(sctx); +} + +static int check_extent_to_block(struct inode *inode, u64 start, u64 len, + u64 logical) +{ + struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; + struct extent_io_tree *io_tree; + struct extent_map *em; + u64 lockstart = start, lockend = start + len - 1; + int ret = 0; + + io_tree = &BTRFS_I(inode)->io_tree; + + lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lockstart, len); + if (ordered) { + btrfs_put_ordered_extent(ordered); + ret = 1; + goto out_unlock; + } + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock; + } + + /* + * This extent does not actually cover the logical extent anymore, + * move on to the next inode. + */ + if (em->block_start > logical || + em->block_start + em->block_len < logical + len) { + free_extent_map(em); + ret = 1; + goto out_unlock; + } + free_extent_map(em); + +out_unlock: + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, + GFP_NOFS); + return ret; +} + +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + struct scrub_copy_nocow_ctx *nocow_ctx) +{ + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + struct btrfs_key key; + struct inode *inode; + struct page *page; + struct btrfs_root *local_root; + struct extent_io_tree *io_tree; + u64 physical_for_dev_replace; + u64 nocow_ctx_logical; + u64 len = nocow_ctx->len; + unsigned long index; + int srcu_index; + int ret = 0; + int err = 0; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + return PTR_ERR(local_root); + } + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + /* Avoid truncate/dio/punch hole.. */ + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + io_tree = &BTRFS_I(inode)->io_tree; + nocow_ctx_logical = nocow_ctx->logical; + + ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto out; + } + + while (len >= PAGE_CACHE_SIZE) { + index = offset >> PAGE_CACHE_SHIFT; +again: + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + btrfs_err(fs_info, "find_or_create_page() failed"); + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + if (PageDirty(page)) + goto next_page; + } else { + ClearPageError(page); + err = extent_read_full_page(io_tree, page, + btrfs_get_extent, + nocow_ctx->mirror_num); + if (err) { + ret = err; + goto next_page; + } + + lock_page(page); + /* + * If the page has been remove from the page cache, + * the data on it is meaningless, because it may be + * old one, the new data may be written into the new + * page in the page cache. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto next_page; + } + } + + ret = check_extent_to_block(inode, offset, len, + nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto next_page; + } + + err = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (err) + ret = err; +next_page: + unlock_page(page); + page_cache_release(page); + + if (ret) + break; + + offset += PAGE_CACHE_SIZE; + physical_for_dev_replace += PAGE_CACHE_SIZE; + nocow_ctx_logical += PAGE_CACHE_SIZE; + len -= PAGE_CACHE_SIZE; + } + ret = COPY_COMPLETE; +out: + mutex_unlock(&inode->i_mutex); + iput(inode); + return ret; +} + +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page) +{ + struct bio *bio; + struct btrfs_device *dev; + int ret; + + dev = sctx->wr_ctx.tgtdev; + if (!dev) + return -EIO; + if (!dev->bdev) { + printk_ratelimited(KERN_WARNING + "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + return -EIO; + } + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; + bio->bi_bdev = dev->bdev; + ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + if (ret != PAGE_CACHE_SIZE) { +leave_with_eio: + bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + return -EIO; + } + + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) + goto leave_with_eio; + + bio_put(bio); + return 0; +} diff --git a/kernel/fs/btrfs/send.c b/kernel/fs/btrfs/send.c new file mode 100644 index 000000000..a1216f9b4 --- /dev/null +++ b/kernel/fs/btrfs/send.c @@ -0,0 +1,5962 @@ +/* + * Copyright (C) 2012 Alexander Block. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "send.h" +#include "backref.h" +#include "hash.h" +#include "locking.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "transaction.h" + +static int g_verbose = 0; + +#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__) + +/* + * A fs_path is a helper to dynamically build path names with unknown size. + * It reallocates the internal buffer on demand. + * It allows fast adding of path elements on the right side (normal path) and + * fast adding to the left side (reversed path). A reversed path can also be + * unreversed if needed. + */ +struct fs_path { + union { + struct { + char *start; + char *end; + + char *buf; + unsigned short buf_len:15; + unsigned short reversed:1; + char inline_buf[]; + }; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * a allocation later during send. + */ + char pad[256]; + }; +}; +#define FS_PATH_INLINE_SIZE \ + (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf)) + + +/* reused for each extent */ +struct clone_root { + struct btrfs_root *root; + u64 ino; + u64 offset; + + u64 found_refs; +}; + +#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 +#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) + +struct send_ctx { + struct file *send_filp; + loff_t send_off; + char *send_buf; + u32 send_size; + u32 send_max_size; + u64 total_send_size; + u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ + + struct btrfs_root *send_root; + struct btrfs_root *parent_root; + struct clone_root *clone_roots; + int clone_roots_cnt; + + /* current state of the compare_tree call */ + struct btrfs_path *left_path; + struct btrfs_path *right_path; + struct btrfs_key *cmp_key; + + /* + * infos of the currently processed inode. In case of deleted inodes, + * these are the values from the deleted inode. + */ + u64 cur_ino; + u64 cur_inode_gen; + int cur_inode_new; + int cur_inode_new_gen; + int cur_inode_deleted; + u64 cur_inode_size; + u64 cur_inode_mode; + u64 cur_inode_rdev; + u64 cur_inode_last_extent; + + u64 send_progress; + + struct list_head new_refs; + struct list_head deleted_refs; + + struct radix_tree_root name_cache; + struct list_head name_cache_list; + int name_cache_size; + + struct file_ra_state ra; + + char *read_buf; + + /* + * We process inodes by their increasing order, so if before an + * incremental send we reverse the parent/child relationship of + * directories such that a directory with a lower inode number was + * the parent of a directory with a higher inode number, and the one + * becoming the new parent got renamed too, we can't rename/move the + * directory with lower inode number when we finish processing it - we + * must process the directory with higher inode number first, then + * rename/move it and then rename/move the directory with lower inode + * number. Example follows. + * + * Tree state when the first send was performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * | + * | + * |-- c (ino 259) + * | |-- d (ino 260) + * | + * |-- c2 (ino 261) + * + * Tree state when the second (incremental) send is performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * |-- c2 (ino 261) + * |-- d2 (ino 260) + * |-- cc (ino 259) + * + * The sequence of steps that lead to the second state was: + * + * mv /a/b/c/d /a/b/c2/d2 + * mv /a/b/c /a/b/c2/d2/cc + * + * "c" has lower inode number, but we can't move it (2nd mv operation) + * before we move "d", which has higher inode number. + * + * So we just memorize which move/rename operations must be performed + * later when their respective parent is processed and moved/renamed. + */ + + /* Indexed by parent directory inode number. */ + struct rb_root pending_dir_moves; + + /* + * Reverse index, indexed by the inode number of a directory that + * is waiting for the move/rename of its immediate parent before its + * own move/rename can be performed. + */ + struct rb_root waiting_dir_moves; + + /* + * A directory that is going to be rm'ed might have a child directory + * which is in the pending directory moves index above. In this case, + * the directory can only be removed after the move/rename of its child + * is performed. Example: + * + * Parent snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- c/ (ino 259) + * | |-- x/ (ino 260) + * | + * |-- y/ (ino 261) + * + * Send snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- YY/ (ino 261) + * |-- x/ (ino 260) + * + * Sequence of steps that lead to the send snapshot: + * rm -f /a/b/c/foo.txt + * mv /a/b/y /a/b/YY + * mv /a/b/c/x /a/b/YY + * rmdir /a/b/c + * + * When the child is processed, its move/rename is delayed until its + * parent is processed (as explained above), but all other operations + * like update utimes, chown, chgrp, etc, are performed and the paths + * that it uses for those operations must use the orphanized name of + * its parent (the directory we're going to rm later), so we need to + * memorize that name. + * + * Indexed by the inode number of the directory to be deleted. + */ + struct rb_root orphan_dirs; +}; + +struct pending_dir_move { + struct rb_node node; + struct list_head list; + u64 parent_ino; + u64 ino; + u64 gen; + bool is_orphan; + struct list_head update_refs; +}; + +struct waiting_dir_move { + struct rb_node node; + u64 ino; + /* + * There might be some directory that could not be removed because it + * was waiting for this directory inode to be moved first. Therefore + * after this directory is moved, we can try to rmdir the ino rmdir_ino. + */ + u64 rmdir_ino; +}; + +struct orphan_dir_info { + struct rb_node node; + u64 ino; + u64 gen; +}; + +struct name_cache_entry { + struct list_head list; + /* + * radix_tree has only 32bit entries but we need to handle 64bit inums. + * We use the lower 32bit of the 64bit inum to store it in the tree. If + * more then one inum would fall into the same entry, we use radix_list + * to store the additional entries. radix_list is also used to store + * entries where two entries have the same inum but different + * generations. + */ + struct list_head radix_list; + u64 ino; + u64 gen; + u64 parent_ino; + u64 parent_gen; + int ret; + int need_later_update; + int name_len; + char name[]; +}; + +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); + +static int need_send_hole(struct send_ctx *sctx) +{ + return (sctx->parent_root && !sctx->cur_inode_new && + !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && + S_ISREG(sctx->cur_inode_mode)); +} + +static void fs_path_reset(struct fs_path *p) +{ + if (p->reversed) { + p->start = p->buf + p->buf_len - 1; + p->end = p->start; + *p->start = 0; + } else { + p->start = p->buf; + p->end = p->start; + *p->start = 0; + } +} + +static struct fs_path *fs_path_alloc(void) +{ + struct fs_path *p; + + p = kmalloc(sizeof(*p), GFP_NOFS); + if (!p) + return NULL; + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); + return p; +} + +static struct fs_path *fs_path_alloc_reversed(void) +{ + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return NULL; + p->reversed = 1; + fs_path_reset(p); + return p; +} + +static void fs_path_free(struct fs_path *p) +{ + if (!p) + return; + if (p->buf != p->inline_buf) + kfree(p->buf); + kfree(p); +} + +static int fs_path_len(struct fs_path *p) +{ + return p->end - p->start; +} + +static int fs_path_ensure_buf(struct fs_path *p, int len) +{ + char *tmp_buf; + int path_len; + int old_buf_len; + + len++; + + if (p->buf_len >= len) + return 0; + + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + + path_len = p->end - p->start; + old_buf_len = p->buf_len; + + /* + * First time the inline_buf does not suffice + */ + if (p->buf == p->inline_buf) { + tmp_buf = kmalloc(len, GFP_NOFS); + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); + } else { + tmp_buf = krealloc(p->buf, len, GFP_NOFS); + } + if (!tmp_buf) + return -ENOMEM; + p->buf = tmp_buf; + /* + * The real size of the buffer is bigger, this will let the fast path + * happen most of the time + */ + p->buf_len = ksize(p->buf); + + if (p->reversed) { + tmp_buf = p->buf + old_buf_len - path_len - 1; + p->end = p->buf + p->buf_len - 1; + p->start = p->end - path_len; + memmove(p->start, tmp_buf, path_len + 1); + } else { + p->start = p->buf; + p->end = p->start + path_len; + } + return 0; +} + +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, + char **prepared) +{ + int ret; + int new_len; + + new_len = p->end - p->start + name_len; + if (p->start != p->end) + new_len++; + ret = fs_path_ensure_buf(p, new_len); + if (ret < 0) + goto out; + + if (p->reversed) { + if (p->start != p->end) + *--p->start = '/'; + p->start -= name_len; + *prepared = p->start; + } else { + if (p->start != p->end) + *p->end++ = '/'; + *prepared = p->end; + p->end += name_len; + *p->end = 0; + } + +out: + return ret; +} + +static int fs_path_add(struct fs_path *p, const char *name, int name_len) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, name_len, &prepared); + if (ret < 0) + goto out; + memcpy(prepared, name, name_len); + +out: + return ret; +} + +static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); + if (ret < 0) + goto out; + memcpy(prepared, p2->start, p2->end - p2->start); + +out: + return ret; +} + +static int fs_path_add_from_extent_buffer(struct fs_path *p, + struct extent_buffer *eb, + unsigned long off, int len) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, len, &prepared); + if (ret < 0) + goto out; + + read_extent_buffer(eb, prepared, off, len); + +out: + return ret; +} + +static int fs_path_copy(struct fs_path *p, struct fs_path *from) +{ + int ret; + + p->reversed = from->reversed; + fs_path_reset(p); + + ret = fs_path_add_path(p, from); + + return ret; +} + + +static void fs_path_unreverse(struct fs_path *p) +{ + char *tmp; + int len; + + if (!p->reversed) + return; + + tmp = p->start; + len = p->end - p->start; + p->start = p->buf; + p->end = p->start + len; + memmove(p->start, tmp, len + 1); + p->reversed = 0; +} + +static struct btrfs_path *alloc_path_for_send(void) +{ + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return NULL; + path->search_commit_root = 1; + path->skip_locking = 1; + path->need_commit_sem = 1; + return path; +} + +static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) +{ + int ret; + mm_segment_t old_fs; + u32 pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + + while (pos < len) { + ret = vfs_write(filp, (__force const char __user *)buf + pos, + len - pos, off); + /* TODO handle that correctly */ + /*if (ret == -ERESTARTSYS) { + continue; + }*/ + if (ret < 0) + goto out; + if (ret == 0) { + ret = -EIO; + goto out; + } + pos += ret; + } + + ret = 0; + +out: + set_fs(old_fs); + return ret; +} + +static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) +{ + struct btrfs_tlv_header *hdr; + int total_len = sizeof(*hdr) + len; + int left = sctx->send_max_size - sctx->send_size; + + if (unlikely(left < total_len)) + return -EOVERFLOW; + + hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); + hdr->tlv_type = cpu_to_le16(attr); + hdr->tlv_len = cpu_to_le16(len); + memcpy(hdr + 1, data, len); + sctx->send_size += total_len; + + return 0; +} + +#define TLV_PUT_DEFINE_INT(bits) \ + static int tlv_put_u##bits(struct send_ctx *sctx, \ + u##bits attr, u##bits value) \ + { \ + __le##bits __tmp = cpu_to_le##bits(value); \ + return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ + } + +TLV_PUT_DEFINE_INT(64) + +static int tlv_put_string(struct send_ctx *sctx, u16 attr, + const char *str, int len) +{ + if (len == -1) + len = strlen(str); + return tlv_put(sctx, attr, str, len); +} + +static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, + const u8 *uuid) +{ + return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); +} + +static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, + struct extent_buffer *eb, + struct btrfs_timespec *ts) +{ + struct btrfs_timespec bts; + read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts)); + return tlv_put(sctx, attr, &bts, sizeof(bts)); +} + + +#define TLV_PUT(sctx, attrtype, attrlen, data) \ + do { \ + ret = tlv_put(sctx, attrtype, attrlen, data); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +#define TLV_PUT_INT(sctx, attrtype, bits, value) \ + do { \ + ret = tlv_put_u##bits(sctx, attrtype, value); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data) +#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data) +#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data) +#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data) +#define TLV_PUT_STRING(sctx, attrtype, str, len) \ + do { \ + ret = tlv_put_string(sctx, attrtype, str, len); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_PATH(sctx, attrtype, p) \ + do { \ + ret = tlv_put_string(sctx, attrtype, p->start, \ + p->end - p->start); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while(0) +#define TLV_PUT_UUID(sctx, attrtype, uuid) \ + do { \ + ret = tlv_put_uuid(sctx, attrtype, uuid); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ + do { \ + ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +static int send_header(struct send_ctx *sctx) +{ + struct btrfs_stream_header hdr; + + strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); + + return write_buf(sctx->send_filp, &hdr, sizeof(hdr), + &sctx->send_off); +} + +/* + * For each command/item we want to send to userspace, we call this function. + */ +static int begin_cmd(struct send_ctx *sctx, int cmd) +{ + struct btrfs_cmd_header *hdr; + + if (WARN_ON(!sctx->send_buf)) + return -EINVAL; + + BUG_ON(sctx->send_size); + + sctx->send_size += sizeof(*hdr); + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->cmd = cpu_to_le16(cmd); + + return 0; +} + +static int send_cmd(struct send_ctx *sctx) +{ + int ret; + struct btrfs_cmd_header *hdr; + u32 crc; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); + hdr->crc = 0; + + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + + sctx->total_send_size += sctx->send_size; + sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->send_size = 0; + + return ret; +} + +/* + * Sends a move instruction to user space + */ +static int send_rename(struct send_ctx *sctx, + struct fs_path *from, struct fs_path *to) +{ + int ret; + +verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends a link instruction to user space + */ +static int send_link(struct send_ctx *sctx, + struct fs_path *path, struct fs_path *lnk) +{ + int ret; + +verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends an unlink instruction to user space + */ +static int send_unlink(struct send_ctx *sctx, struct fs_path *path) +{ + int ret; + +verbose_printk("btrfs: send_unlink %s\n", path->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends a rmdir instruction to user space + */ +static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) +{ + int ret; + +verbose_printk("btrfs: send_rmdir %s\n", path->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Helper function to retrieve some fields from an inode item. + */ +static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, + u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, + u64 *gid, u64 *rdev) +{ + int ret; + struct btrfs_inode_item *ii; + struct btrfs_key key; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; + } + + ii = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + if (size) + *size = btrfs_inode_size(path->nodes[0], ii); + if (gen) + *gen = btrfs_inode_generation(path->nodes[0], ii); + if (mode) + *mode = btrfs_inode_mode(path->nodes[0], ii); + if (uid) + *uid = btrfs_inode_uid(path->nodes[0], ii); + if (gid) + *gid = btrfs_inode_gid(path->nodes[0], ii); + if (rdev) + *rdev = btrfs_inode_rdev(path->nodes[0], ii); + + return ret; +} + +static int get_inode_info(struct btrfs_root *root, + u64 ino, u64 *size, u64 *gen, + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) +{ + struct btrfs_path *path; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, + rdev); + btrfs_free_path(path); + return ret; +} + +typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, + struct fs_path *p, + void *ctx); + +/* + * Helper function to iterate the entries in ONE btrfs_inode_ref or + * btrfs_inode_extref. + * The iterate callback may return a non zero value to stop iteration. This can + * be a negative value for error codes or 1 to simply stop it. + * + * path must point to the INODE_REF or INODE_EXTREF when called. + */ +static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *found_key, int resolve, + iterate_inode_ref_t iterate, void *ctx) +{ + struct extent_buffer *eb = path->nodes[0]; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_inode_extref *extref; + struct btrfs_path *tmp_path; + struct fs_path *p; + u32 cur = 0; + u32 total; + int slot = path->slots[0]; + u32 name_len; + char *start; + int ret = 0; + int num = 0; + int index; + u64 dir; + unsigned long name_off; + unsigned long elem_size; + unsigned long ptr; + + p = fs_path_alloc_reversed(); + if (!p) + return -ENOMEM; + + tmp_path = alloc_path_for_send(); + if (!tmp_path) { + fs_path_free(p); + return -ENOMEM; + } + + + if (found_key->type == BTRFS_INODE_REF_KEY) { + ptr = (unsigned long)btrfs_item_ptr(eb, slot, + struct btrfs_inode_ref); + item = btrfs_item_nr(slot); + total = btrfs_item_size(eb, item); + elem_size = sizeof(*iref); + } else { + ptr = btrfs_item_ptr_offset(eb, slot); + total = btrfs_item_size_nr(eb, slot); + elem_size = sizeof(*extref); + } + + while (cur < total) { + fs_path_reset(p); + + if (found_key->type == BTRFS_INODE_REF_KEY) { + iref = (struct btrfs_inode_ref *)(ptr + cur); + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + index = btrfs_inode_ref_index(eb, iref); + dir = found_key->offset; + } else { + extref = (struct btrfs_inode_extref *)(ptr + cur); + name_len = btrfs_inode_extref_name_len(eb, extref); + name_off = (unsigned long)&extref->name; + index = btrfs_inode_extref_index(eb, extref); + dir = btrfs_inode_extref_parent(eb, extref); + } + + if (resolve) { + start = btrfs_ref_to_path(root, tmp_path, name_len, + name_off, eb, dir, + p->buf, p->buf_len); + if (IS_ERR(start)) { + ret = PTR_ERR(start); + goto out; + } + if (start < p->buf) { + /* overflow , try again with larger buffer */ + ret = fs_path_ensure_buf(p, + p->buf_len + p->buf - start); + if (ret < 0) + goto out; + start = btrfs_ref_to_path(root, tmp_path, + name_len, name_off, + eb, dir, + p->buf, p->buf_len); + if (IS_ERR(start)) { + ret = PTR_ERR(start); + goto out; + } + BUG_ON(start < p->buf); + } + p->start = start; + } else { + ret = fs_path_add_from_extent_buffer(p, eb, name_off, + name_len); + if (ret < 0) + goto out; + } + + cur += elem_size + name_len; + ret = iterate(num, dir, index, p, ctx); + if (ret) + goto out; + num++; + } + +out: + btrfs_free_path(tmp_path); + fs_path_free(p); + return ret; +} + +typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx); + +/* + * Helper function to iterate the entries in ONE btrfs_dir_item. + * The iterate callback may return a non zero value to stop iteration. This can + * be a negative value for error codes or 1 to simply stop it. + * + * path must point to the dir item when called. + */ +static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *found_key, + iterate_dir_item_t iterate, void *ctx) +{ + int ret = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key di_key; + char *buf = NULL; + int buf_len; + u32 name_len; + u32 data_len; + u32 cur; + u32 len; + u32 total; + int slot; + int num; + u8 type; + + /* + * Start with a small buffer (1 page). If later we end up needing more + * space, which can happen for xattrs on a fs with a leaf size greater + * then the page size, attempt to increase the buffer. Typically xattr + * values are small. + */ + buf_len = PATH_MAX; + buf = kmalloc(buf_len, GFP_NOFS); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item = btrfs_item_nr(slot); + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + cur = 0; + len = 0; + total = btrfs_item_size(eb, item); + + num = 0; + while (cur < total) { + name_len = btrfs_dir_name_len(eb, di); + data_len = btrfs_dir_data_len(eb, di); + type = btrfs_dir_type(eb, di); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (type == BTRFS_FT_XATTR) { + if (name_len > XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) { + ret = -E2BIG; + goto out; + } + } else { + /* + * Path too long + */ + if (name_len + data_len > PATH_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + } + + if (name_len + data_len > buf_len) { + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { + vfree(buf); + buf = NULL; + } else { + char *tmp = krealloc(buf, buf_len, + GFP_NOFS | __GFP_NOWARN); + + if (!tmp) + kfree(buf); + buf = tmp; + } + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; + } + } + } + + read_extent_buffer(eb, buf, (unsigned long)(di + 1), + name_len + data_len); + + len = sizeof(*di) + name_len + data_len; + di = (struct btrfs_dir_item *)((char *)di + len); + cur += len; + + ret = iterate(num, &di_key, buf, name_len, buf + name_len, + data_len, type, ctx); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + num++; + } + +out: + kvfree(buf); + return ret; +} + +static int __copy_first_ref(int num, u64 dir, int index, + struct fs_path *p, void *ctx) +{ + int ret; + struct fs_path *pt = ctx; + + ret = fs_path_copy(pt, p); + if (ret < 0) + return ret; + + /* we want the first only */ + return 1; +} + +/* + * Retrieve the first path of an inode. If an inode has more then one + * ref/hardlink, this is ignored. + */ +static int get_inode_path(struct btrfs_root *root, + u64 ino, struct fs_path *path) +{ + int ret; + struct btrfs_key key, found_key; + struct btrfs_path *p; + + p = alloc_path_for_send(); + if (!p) + return -ENOMEM; + + fs_path_reset(path); + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 1; + goto out; + } + btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); + if (found_key.objectid != ino || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = -ENOENT; + goto out; + } + + ret = iterate_inode_ref(root, p, &found_key, 1, + __copy_first_ref, path); + if (ret < 0) + goto out; + ret = 0; + +out: + btrfs_free_path(p); + return ret; +} + +struct backref_ctx { + struct send_ctx *sctx; + + struct btrfs_path *path; + /* number of total found references */ + u64 found; + + /* + * used for clones found in send_root. clones found behind cur_objectid + * and cur_offset are not considered as allowed clones. + */ + u64 cur_objectid; + u64 cur_offset; + + /* may be truncated in case it's the last extent in a file */ + u64 extent_len; + + /* Just to check for bugs in backref resolving */ + int found_itself; +}; + +static int __clone_root_cmp_bsearch(const void *key, const void *elt) +{ + u64 root = (u64)(uintptr_t)key; + struct clone_root *cr = (struct clone_root *)elt; + + if (root < cr->root->objectid) + return -1; + if (root > cr->root->objectid) + return 1; + return 0; +} + +static int __clone_root_cmp_sort(const void *e1, const void *e2) +{ + struct clone_root *cr1 = (struct clone_root *)e1; + struct clone_root *cr2 = (struct clone_root *)e2; + + if (cr1->root->objectid < cr2->root->objectid) + return -1; + if (cr1->root->objectid > cr2->root->objectid) + return 1; + return 0; +} + +/* + * Called for every backref that is found for the current extent. + * Results are collected in sctx->clone_roots->ino/offset/found_refs + */ +static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) +{ + struct backref_ctx *bctx = ctx_; + struct clone_root *found; + int ret; + u64 i_size; + + /* First check if the root is in the list of accepted clone sources */ + found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!found) + return 0; + + if (found->root == bctx->sctx->send_root && + ino == bctx->cur_objectid && + offset == bctx->cur_offset) { + bctx->found_itself = 1; + } + + /* + * There are inodes that have extents that lie behind its i_size. Don't + * accept clones from these extents. + */ + ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, + NULL, NULL, NULL); + btrfs_release_path(bctx->path); + if (ret < 0) + return ret; + + if (offset + bctx->extent_len > i_size) + return 0; + + /* + * Make sure we don't consider clones from send_root that are + * behind the current inode/offset. + */ + if (found->root == bctx->sctx->send_root) { + /* + * TODO for the moment we don't accept clones from the inode + * that is currently send. We may change this when + * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same + * file. + */ + if (ino >= bctx->cur_objectid) + return 0; +#if 0 + if (ino > bctx->cur_objectid) + return 0; + if (offset + bctx->extent_len > bctx->cur_offset) + return 0; +#endif + } + + bctx->found++; + found->found_refs++; + if (ino < found->ino) { + found->ino = ino; + found->offset = offset; + } else if (found->ino == ino) { + /* + * same extent found more then once in the same file. + */ + if (found->offset > offset + bctx->extent_len) + found->offset = offset; + } + + return 0; +} + +/* + * Given an inode, offset and extent item, it finds a good clone for a clone + * instruction. Returns -ENOENT when none could be found. The function makes + * sure that the returned clone is usable at the point where sending is at the + * moment. This means, that no clones are accepted which lie behind the current + * inode+offset. + * + * path must point to the extent item when called. + */ +static int find_extent_clone(struct send_ctx *sctx, + struct btrfs_path *path, + u64 ino, u64 data_offset, + u64 ino_size, + struct clone_root **found) +{ + int ret; + int extent_type; + u64 logical; + u64 disk_byte; + u64 num_bytes; + u64 extent_item_pos; + u64 flags = 0; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb = path->nodes[0]; + struct backref_ctx *backref_ctx = NULL; + struct clone_root *cur_clone_root; + struct btrfs_key found_key; + struct btrfs_path *tmp_path; + int compressed; + u32 i; + + tmp_path = alloc_path_for_send(); + if (!tmp_path) + return -ENOMEM; + + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); + if (!backref_ctx) { + ret = -ENOMEM; + goto out; + } + + backref_ctx->path = tmp_path; + + if (data_offset >= ino_size) { + /* + * There may be extents that lie behind the file's size. + * I at least had this in combination with snapshotting while + * writing large files. + */ + ret = 0; + goto out; + } + + fi = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -ENOENT; + goto out; + } + compressed = btrfs_file_extent_compression(eb, fi); + + num_bytes = btrfs_file_extent_num_bytes(eb, fi); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == 0) { + ret = -ENOENT; + goto out; + } + logical = disk_byte + btrfs_file_extent_offset(eb, fi); + + down_read(&sctx->send_root->fs_info->commit_root_sem); + ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, + &found_key, &flags); + up_read(&sctx->send_root->fs_info->commit_root_sem); + btrfs_release_path(tmp_path); + + if (ret < 0) + goto out; + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = -EIO; + goto out; + } + + /* + * Setup the clone roots. + */ + for (i = 0; i < sctx->clone_roots_cnt; i++) { + cur_clone_root = sctx->clone_roots + i; + cur_clone_root->ino = (u64)-1; + cur_clone_root->offset = 0; + cur_clone_root->found_refs = 0; + } + + backref_ctx->sctx = sctx; + backref_ctx->found = 0; + backref_ctx->cur_objectid = ino; + backref_ctx->cur_offset = data_offset; + backref_ctx->found_itself = 0; + backref_ctx->extent_len = num_bytes; + + /* + * The last extent of a file may be too large due to page alignment. + * We need to adjust extent_len in this case so that the checks in + * __iterate_backrefs work. + */ + if (data_offset + num_bytes >= ino_size) + backref_ctx->extent_len = ino_size - data_offset; + + /* + * Now collect all backrefs. + */ + if (compressed == BTRFS_COMPRESS_NONE) + extent_item_pos = logical - found_key.objectid; + else + extent_item_pos = 0; + ret = iterate_extent_inodes(sctx->send_root->fs_info, + found_key.objectid, extent_item_pos, 1, + __iterate_backrefs, backref_ctx); + + if (ret < 0) + goto out; + + if (!backref_ctx->found_itself) { + /* found a bug in backref code? */ + ret = -EIO; + btrfs_err(sctx->send_root->fs_info, "did not find backref in " + "send_root. inode=%llu, offset=%llu, " + "disk_byte=%llu found extent=%llu", + ino, data_offset, disk_byte, found_key.objectid); + goto out; + } + +verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " + "ino=%llu, " + "num_bytes=%llu, logical=%llu\n", + data_offset, ino, num_bytes, logical); + + if (!backref_ctx->found) + verbose_printk("btrfs: no clones found\n"); + + cur_clone_root = NULL; + for (i = 0; i < sctx->clone_roots_cnt; i++) { + if (sctx->clone_roots[i].found_refs) { + if (!cur_clone_root) + cur_clone_root = sctx->clone_roots + i; + else if (sctx->clone_roots[i].root == sctx->send_root) + /* prefer clones from send_root over others */ + cur_clone_root = sctx->clone_roots + i; + } + + } + + if (cur_clone_root) { + if (compressed != BTRFS_COMPRESS_NONE) { + /* + * Offsets given by iterate_extent_inodes() are relative + * to the start of the extent, we need to add logical + * offset from the file extent item. + * (See why at backref.c:check_extent_in_eb()) + */ + cur_clone_root->offset += btrfs_file_extent_offset(eb, + fi); + } + *found = cur_clone_root; + ret = 0; + } else { + ret = -ENOENT; + } + +out: + btrfs_free_path(tmp_path); + kfree(backref_ctx); + return ret; +} + +static int read_symlink(struct btrfs_root *root, + u64 ino, + struct fs_path *dest) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u8 type; + u8 compression; + unsigned long off; + int len; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + compression = btrfs_file_extent_compression(path->nodes[0], ei); + BUG_ON(type != BTRFS_FILE_EXTENT_INLINE); + BUG_ON(compression); + + off = btrfs_file_extent_inline_start(ei); + len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei); + + ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Helper function to generate a file name that is unique in the root of + * send_root and parent_root. This is used to generate names for orphan inodes. + */ +static int gen_unique_name(struct send_ctx *sctx, + u64 ino, u64 gen, + struct fs_path *dest) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *di; + char tmp[64]; + int len; + u64 idx = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + while (1) { + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", + ino, gen, idx); + ASSERT(len < sizeof(tmp)); + + di = btrfs_lookup_dir_item(NULL, sctx->send_root, + path, BTRFS_FIRST_FREE_OBJECTID, + tmp, strlen(tmp), 0); + btrfs_release_path(path); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (di) { + /* not unique, try again */ + idx++; + continue; + } + + if (!sctx->parent_root) { + /* unique */ + ret = 0; + break; + } + + di = btrfs_lookup_dir_item(NULL, sctx->parent_root, + path, BTRFS_FIRST_FREE_OBJECTID, + tmp, strlen(tmp), 0); + btrfs_release_path(path); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (di) { + /* not unique, try again */ + idx++; + continue; + } + /* unique */ + break; + } + + ret = fs_path_add(dest, tmp, strlen(tmp)); + +out: + btrfs_free_path(path); + return ret; +} + +enum inode_state { + inode_state_no_change, + inode_state_will_create, + inode_state_did_create, + inode_state_will_delete, + inode_state_did_delete, +}; + +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret; + int left_ret; + int right_ret; + u64 left_gen; + u64 right_gen; + + ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, + NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + left_ret = ret; + + if (!sctx->parent_root) { + right_ret = -ENOENT; + } else { + ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, + NULL, NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + right_ret = ret; + } + + if (!left_ret && !right_ret) { + if (left_gen == gen && right_gen == gen) { + ret = inode_state_no_change; + } else if (left_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_create; + else + ret = inode_state_will_create; + } else if (right_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_delete; + else + ret = inode_state_will_delete; + } else { + ret = -ENOENT; + } + } else if (!left_ret) { + if (left_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_create; + else + ret = inode_state_will_create; + } else { + ret = -ENOENT; + } + } else if (!right_ret) { + if (right_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_delete; + else + ret = inode_state_will_delete; + } else { + ret = -ENOENT; + } + } else { + ret = -ENOENT; + } + +out: + return ret; +} + +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret; + + ret = get_cur_inode_state(sctx, ino, gen); + if (ret < 0) + goto out; + + if (ret == inode_state_no_change || + ret == inode_state_did_create || + ret == inode_state_will_delete) + ret = 1; + else + ret = 0; + +out: + return ret; +} + +/* + * Helper function to lookup a dir item in a dir. + */ +static int lookup_dir_item_inode(struct btrfs_root *root, + u64 dir, const char *name, int name_len, + u64 *found_inode, + u8 *found_type) +{ + int ret = 0; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_path *path; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(NULL, root, path, + dir, name, name_len, 0); + if (!di) { + ret = -ENOENT; + goto out; + } + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.type == BTRFS_ROOT_ITEM_KEY) { + ret = -ENOENT; + goto out; + } + *found_inode = key.objectid; + *found_type = btrfs_dir_type(path->nodes[0], di); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, + * generation of the parent dir and the name of the dir entry. + */ +static int get_first_ref(struct btrfs_root *root, u64 ino, + u64 *dir, u64 *dir_gen, struct fs_path *name) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int len; + u64 parent_dir; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (!ret) + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (ret || found_key.objectid != ino || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = -ENOENT; + goto out; + } + + if (found_key.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + iref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], iref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)(iref + 1), + len); + parent_dir = found_key.offset; + } else { + struct btrfs_inode_extref *extref; + extref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_extref); + len = btrfs_inode_extref_name_len(path->nodes[0], extref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)&extref->name, len); + parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); + } + if (ret < 0) + goto out; + btrfs_release_path(path); + + if (dir_gen) { + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, + NULL, NULL, NULL); + if (ret < 0) + goto out; + } + + *dir = parent_dir; + +out: + btrfs_free_path(path); + return ret; +} + +static int is_first_ref(struct btrfs_root *root, + u64 ino, u64 dir, + const char *name, int name_len) +{ + int ret; + struct fs_path *tmp_name; + u64 tmp_dir; + + tmp_name = fs_path_alloc(); + if (!tmp_name) + return -ENOMEM; + + ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); + if (ret < 0) + goto out; + + if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { + ret = 0; + goto out; + } + + ret = !memcmp(tmp_name->start, name, name_len); + +out: + fs_path_free(tmp_name); + return ret; +} + +/* + * Used by process_recorded_refs to determine if a new ref would overwrite an + * already existing ref. In case it detects an overwrite, it returns the + * inode/gen in who_ino/who_gen. + * When an overwrite is detected, process_recorded_refs does proper orphanizing + * to make sure later references to the overwritten inode are possible. + * Orphanizing is however only required for the first ref of an inode. + * process_recorded_refs does an additional is_first_ref check to see if + * orphanizing is really required. + */ +static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, + const char *name, int name_len, + u64 *who_ino, u64 *who_gen) +{ + int ret = 0; + u64 gen; + u64 other_inode = 0; + u8 other_type = 0; + + if (!sctx->parent_root) + goto out; + + ret = is_inode_existent(sctx, dir, dir_gen); + if (ret <= 0) + goto out; + + /* + * If we have a parent root we need to verify that the parent dir was + * not delted and then re-created, if it was then we have no overwrite + * and we can just unlink this entry. + */ + if (sctx->parent_root) { + ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, + NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + if (gen != dir_gen) + goto out; + } + + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, + &other_inode, &other_type); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + + /* + * Check if the overwritten ref was already processed. If yes, the ref + * was already unlinked/moved, so we can safely assume that we will not + * overwrite anything at this point in time. + */ + if (other_inode > sctx->send_progress) { + ret = get_inode_info(sctx->parent_root, other_inode, NULL, + who_gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + + ret = 1; + *who_ino = other_inode; + } else { + ret = 0; + } + +out: + return ret; +} + +/* + * Checks if the ref was overwritten by an already processed inode. This is + * used by __get_cur_name_and_parent to find out if the ref was orphanized and + * thus the orphan name needs be used. + * process_recorded_refs also uses it to avoid unlinking of refs that were + * overwritten. + */ +static int did_overwrite_ref(struct send_ctx *sctx, + u64 dir, u64 dir_gen, + u64 ino, u64 ino_gen, + const char *name, int name_len) +{ + int ret = 0; + u64 gen; + u64 ow_inode; + u8 other_type; + + if (!sctx->parent_root) + goto out; + + ret = is_inode_existent(sctx, dir, dir_gen); + if (ret <= 0) + goto out; + + /* check if the ref was overwritten by another ref */ + ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, + &ow_inode, &other_type); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + /* was never and will never be overwritten */ + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, + NULL, NULL); + if (ret < 0) + goto out; + + if (ow_inode == ino && gen == ino_gen) { + ret = 0; + goto out; + } + + /* we know that it is or will be overwritten. check this now */ + if (ow_inode < sctx->send_progress) + ret = 1; + else + ret = 0; + +out: + return ret; +} + +/* + * Same as did_overwrite_ref, but also checks if it is the first ref of an inode + * that got overwritten. This is used by process_recorded_refs to determine + * if it has to use the path as returned by get_cur_path or the orphan name. + */ +static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret = 0; + struct fs_path *name = NULL; + u64 dir; + u64 dir_gen; + + if (!sctx->parent_root) + goto out; + + name = fs_path_alloc(); + if (!name) + return -ENOMEM; + + ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); + if (ret < 0) + goto out; + + ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, + name->start, fs_path_len(name)); + +out: + fs_path_free(name); + return ret; +} + +/* + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, + * so we need to do some special handling in case we have clashes. This function + * takes care of this with the help of name_cache_entry::radix_list. + * In case of error, nce is kfreed. + */ +static int name_cache_insert(struct send_ctx *sctx, + struct name_cache_entry *nce) +{ + int ret = 0; + struct list_head *nce_head; + + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); + if (!nce_head) { + nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); + if (!nce_head) { + kfree(nce); + return -ENOMEM; + } + INIT_LIST_HEAD(nce_head); + + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); + if (ret < 0) { + kfree(nce_head); + kfree(nce); + return ret; + } + } + list_add_tail(&nce->radix_list, nce_head); + list_add_tail(&nce->list, &sctx->name_cache_list); + sctx->name_cache_size++; + + return ret; +} + +static void name_cache_delete(struct send_ctx *sctx, + struct name_cache_entry *nce) +{ + struct list_head *nce_head; + + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); + if (!nce_head) { + btrfs_err(sctx->send_root->fs_info, + "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", + nce->ino, sctx->name_cache_size); + } + + list_del(&nce->radix_list); + list_del(&nce->list); + sctx->name_cache_size--; + + /* + * We may not get to the final release of nce_head if the lookup fails + */ + if (nce_head && list_empty(nce_head)) { + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); + kfree(nce_head); + } +} + +static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) +{ + struct list_head *nce_head; + struct name_cache_entry *cur; + + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); + if (!nce_head) + return NULL; + + list_for_each_entry(cur, nce_head, radix_list) { + if (cur->ino == ino && cur->gen == gen) + return cur; + } + return NULL; +} + +/* + * Removes the entry from the list and adds it back to the end. This marks the + * entry as recently used so that name_cache_clean_unused does not remove it. + */ +static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) +{ + list_del(&nce->list); + list_add_tail(&nce->list, &sctx->name_cache_list); +} + +/* + * Remove some entries from the beginning of name_cache_list. + */ +static void name_cache_clean_unused(struct send_ctx *sctx) +{ + struct name_cache_entry *nce; + + if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) + return; + + while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { + nce = list_entry(sctx->name_cache_list.next, + struct name_cache_entry, list); + name_cache_delete(sctx, nce); + kfree(nce); + } +} + +static void name_cache_free(struct send_ctx *sctx) +{ + struct name_cache_entry *nce; + + while (!list_empty(&sctx->name_cache_list)) { + nce = list_entry(sctx->name_cache_list.next, + struct name_cache_entry, list); + name_cache_delete(sctx, nce); + kfree(nce); + } +} + +/* + * Used by get_cur_path for each ref up to the root. + * Returns 0 if it succeeded. + * Returns 1 if the inode is not existent or got overwritten. In that case, the + * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 + * is returned, parent_ino/parent_gen are not guaranteed to be valid. + * Returns <0 in case of error. + */ +static int __get_cur_name_and_parent(struct send_ctx *sctx, + u64 ino, u64 gen, + u64 *parent_ino, + u64 *parent_gen, + struct fs_path *dest) +{ + int ret; + int nce_ret; + struct name_cache_entry *nce = NULL; + + /* + * First check if we already did a call to this function with the same + * ino/gen. If yes, check if the cache entry is still up-to-date. If yes + * return the cached result. + */ + nce = name_cache_search(sctx, ino, gen); + if (nce) { + if (ino < sctx->send_progress && nce->need_later_update) { + name_cache_delete(sctx, nce); + kfree(nce); + nce = NULL; + } else { + name_cache_used(sctx, nce); + *parent_ino = nce->parent_ino; + *parent_gen = nce->parent_gen; + ret = fs_path_add(dest, nce->name, nce->name_len); + if (ret < 0) + goto out; + ret = nce->ret; + goto out; + } + } + + /* + * If the inode is not existent yet, add the orphan name and return 1. + * This should only happen for the parent dir that we determine in + * __record_new_ref + */ + ret = is_inode_existent(sctx, ino, gen); + if (ret < 0) + goto out; + + if (!ret) { + ret = gen_unique_name(sctx, ino, gen, dest); + if (ret < 0) + goto out; + ret = 1; + goto out_cache; + } + + /* + * Depending on whether the inode was already processed or not, use + * send_root or parent_root for ref lookup. + */ + if (ino < sctx->send_progress) + ret = get_first_ref(sctx->send_root, ino, + parent_ino, parent_gen, dest); + else + ret = get_first_ref(sctx->parent_root, ino, + parent_ino, parent_gen, dest); + if (ret < 0) + goto out; + + /* + * Check if the ref was overwritten by an inode's ref that was processed + * earlier. If yes, treat as orphan and return 1. + */ + ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, + dest->start, dest->end - dest->start); + if (ret < 0) + goto out; + if (ret) { + fs_path_reset(dest); + ret = gen_unique_name(sctx, ino, gen, dest); + if (ret < 0) + goto out; + ret = 1; + } + +out_cache: + /* + * Store the result of the lookup in the name cache. + */ + nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); + if (!nce) { + ret = -ENOMEM; + goto out; + } + + nce->ino = ino; + nce->gen = gen; + nce->parent_ino = *parent_ino; + nce->parent_gen = *parent_gen; + nce->name_len = fs_path_len(dest); + nce->ret = ret; + strcpy(nce->name, dest->start); + + if (ino < sctx->send_progress) + nce->need_later_update = 0; + else + nce->need_later_update = 1; + + nce_ret = name_cache_insert(sctx, nce); + if (nce_ret < 0) + ret = nce_ret; + name_cache_clean_unused(sctx); + +out: + return ret; +} + +/* + * Magic happens here. This function returns the first ref to an inode as it + * would look like while receiving the stream at this point in time. + * We walk the path up to the root. For every inode in between, we check if it + * was already processed/sent. If yes, we continue with the parent as found + * in send_root. If not, we continue with the parent as found in parent_root. + * If we encounter an inode that was deleted at this point in time, we use the + * inodes "orphan" name instead of the real name and stop. Same with new inodes + * that were not created yet and overwritten inodes/refs. + * + * When do we have have orphan inodes: + * 1. When an inode is freshly created and thus no valid refs are available yet + * 2. When a directory lost all it's refs (deleted) but still has dir items + * inside which were not processed yet (pending for move/delete). If anyone + * tried to get the path to the dir items, it would get a path inside that + * orphan directory. + * 3. When an inode is moved around or gets new links, it may overwrite the ref + * of an unprocessed inode. If in that case the first ref would be + * overwritten, the overwritten inode gets "orphanized". Later when we + * process this overwritten inode, it is restored at a new place by moving + * the orphan inode. + * + * sctx->send_progress tells this function at which point in time receiving + * would be. + */ +static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, + struct fs_path *dest) +{ + int ret = 0; + struct fs_path *name = NULL; + u64 parent_inode = 0; + u64 parent_gen = 0; + int stop = 0; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + + dest->reversed = 1; + fs_path_reset(dest); + + while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) { + ret = gen_unique_name(sctx, ino, gen, name); + if (ret < 0) + goto out; + ret = fs_path_add_path(dest, name); + break; + } + + if (is_waiting_for_move(sctx, ino)) { + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret) + stop = 1; + } + + if (ret < 0) + goto out; + + ret = fs_path_add_path(dest, name); + if (ret < 0) + goto out; + + ino = parent_inode; + gen = parent_gen; + } + +out: + fs_path_free(name); + if (!ret) + fs_path_unreverse(dest); + return ret; +} + +/* + * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace + */ +static int send_subvol_begin(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *send_root = sctx->send_root; + struct btrfs_root *parent_root = sctx->parent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + char *name = NULL; + int namelen; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS); + if (!name) { + btrfs_free_path(path); + return -ENOMEM; + } + + key.objectid = send_root->objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, + &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type != BTRFS_ROOT_BACKREF_KEY || + key.objectid != send_root->objectid) { + ret = -ENOENT; + goto out; + } + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + namelen = btrfs_root_ref_name_len(leaf, ref); + read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); + btrfs_release_path(path); + + if (parent_root) { + ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); + if (ret < 0) + goto out; + } else { + ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); + if (ret < 0) + goto out; + } + + TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, + le64_to_cpu(sctx->send_root->root_item.ctransid)); + if (parent_root) { + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + sctx->parent_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, + le64_to_cpu(sctx->parent_root->root_item.ctransid)); + } + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + btrfs_free_path(path); + kfree(name); + return ret; +} + +static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret = 0; + struct fs_path *p = NULL; + struct btrfs_inode_item *ii; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + struct btrfs_key key; + int slot; + +verbose_printk("btrfs: send_utimes %llu\n", ino); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + eb = path->nodes[0]; + slot = path->slots[0]; + ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + + ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); + /* TODO Add otime support when the otime patches get into upstream */ + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + btrfs_free_path(path); + return ret; +} + +/* + * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have + * a valid path yet because we did not process the refs yet. So, the inode + * is created as orphan. + */ +static int send_create_inode(struct send_ctx *sctx, u64 ino) +{ + int ret = 0; + struct fs_path *p; + int cmd; + u64 gen; + u64 mode; + u64 rdev; + +verbose_printk("btrfs: send_create_inode %llu\n", ino); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + if (ino != sctx->cur_ino) { + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, + NULL, NULL, &rdev); + if (ret < 0) + goto out; + } else { + gen = sctx->cur_inode_gen; + mode = sctx->cur_inode_mode; + rdev = sctx->cur_inode_rdev; + } + + if (S_ISREG(mode)) { + cmd = BTRFS_SEND_C_MKFILE; + } else if (S_ISDIR(mode)) { + cmd = BTRFS_SEND_C_MKDIR; + } else if (S_ISLNK(mode)) { + cmd = BTRFS_SEND_C_SYMLINK; + } else if (S_ISCHR(mode) || S_ISBLK(mode)) { + cmd = BTRFS_SEND_C_MKNOD; + } else if (S_ISFIFO(mode)) { + cmd = BTRFS_SEND_C_MKFIFO; + } else if (S_ISSOCK(mode)) { + cmd = BTRFS_SEND_C_MKSOCK; + } else { + printk(KERN_WARNING "btrfs: unexpected inode type %o", + (int)(mode & S_IFMT)); + ret = -ENOTSUPP; + goto out; + } + + ret = begin_cmd(sctx, cmd); + if (ret < 0) + goto out; + + ret = gen_unique_name(sctx, ino, gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); + + if (S_ISLNK(mode)) { + fs_path_reset(p); + ret = read_symlink(sctx->send_root, ino, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); + } else if (S_ISCHR(mode) || S_ISBLK(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)) { + TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); + } + + ret = send_cmd(sctx); + if (ret < 0) + goto out; + + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +/* + * We need some special handling for inodes that get processed before the parent + * directory got created. See process_recorded_refs for details. + * This function does the check if we already created the dir out of order. + */ +static int did_create_dir(struct send_ctx *sctx, u64 dir) +{ + int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key di_key; + struct extent_buffer *eb; + struct btrfs_dir_item *di; + int slot; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(sctx->send_root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (di_key.type != BTRFS_ROOT_ITEM_KEY && + di_key.objectid < sctx->send_progress) { + ret = 1; + goto out; + } + + path->slots[0]++; + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Only creates the inode if it is: + * 1. Not a directory + * 2. Or a directory which was not created already due to out of order + * directories. See did_create_dir and process_recorded_refs for details. + */ +static int send_create_inode_if_needed(struct send_ctx *sctx) +{ + int ret; + + if (S_ISDIR(sctx->cur_inode_mode)) { + ret = did_create_dir(sctx, sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + } + + ret = send_create_inode(sctx, sctx->cur_ino); + if (ret < 0) + goto out; + +out: + return ret; +} + +struct recorded_ref { + struct list_head list; + char *dir_path; + char *name; + struct fs_path *full_path; + u64 dir; + u64 dir_gen; + int dir_path_len; + int name_len; +}; + +/* + * We need to process new refs before deleted refs, but compare_tree gives us + * everything mixed. So we first record all refs and later process them. + * This function is a helper to record one ref. + */ +static int __record_ref(struct list_head *head, u64 dir, + u64 dir_gen, struct fs_path *path) +{ + struct recorded_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->dir = dir; + ref->dir_gen = dir_gen; + ref->full_path = path; + + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; + ref->dir_path = ref->full_path->start; + if (ref->name == ref->full_path->start) + ref->dir_path_len = 0; + else + ref->dir_path_len = ref->full_path->end - + ref->full_path->start - 1 - ref->name_len; + + list_add_tail(&ref->list, head); + return 0; +} + +static int dup_ref(struct recorded_ref *ref, struct list_head *list) +{ + struct recorded_ref *new; + + new = kmalloc(sizeof(*ref), GFP_NOFS); + if (!new) + return -ENOMEM; + + new->dir = ref->dir; + new->dir_gen = ref->dir_gen; + new->full_path = NULL; + INIT_LIST_HEAD(&new->list); + list_add_tail(&new->list, list); + return 0; +} + +static void __free_recorded_refs(struct list_head *head) +{ + struct recorded_ref *cur; + + while (!list_empty(head)) { + cur = list_entry(head->next, struct recorded_ref, list); + fs_path_free(cur->full_path); + list_del(&cur->list); + kfree(cur); + } +} + +static void free_recorded_refs(struct send_ctx *sctx) +{ + __free_recorded_refs(&sctx->new_refs); + __free_recorded_refs(&sctx->deleted_refs); +} + +/* + * Renames/moves a file/dir to its orphan name. Used when the first + * ref of an unprocessed inode gets overwritten and for all non empty + * directories. + */ +static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, + struct fs_path *path) +{ + int ret; + struct fs_path *orphan; + + orphan = fs_path_alloc(); + if (!orphan) + return -ENOMEM; + + ret = gen_unique_name(sctx, ino, gen, orphan); + if (ret < 0) + goto out; + + ret = send_rename(sctx, path, orphan); + +out: + fs_path_free(orphan); + return ret; +} + +static struct orphan_dir_info * +add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node **p = &sctx->orphan_dirs.rb_node; + struct rb_node *parent = NULL; + struct orphan_dir_info *entry, *odi; + + odi = kmalloc(sizeof(*odi), GFP_NOFS); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct orphan_dir_info, node); + if (dir_ino < entry->ino) { + p = &(*p)->rb_left; + } else if (dir_ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(odi); + return entry; + } + } + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); + return odi; +} + +static struct orphan_dir_info * +get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node *n = sctx->orphan_dirs.rb_node; + struct orphan_dir_info *entry; + + while (n) { + entry = rb_entry(n, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + n = n->rb_left; + else if (dir_ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +{ + struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino); + + return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, + struct orphan_dir_info *odi) +{ + if (!odi) + return; + rb_erase(&odi->node, &sctx->orphan_dirs); + kfree(odi); +} + +/* + * Returns 1 if a directory can be removed at this point in time. + * We check this by iterating all dir items and checking if the inode behind + * the dir item was already processed. + */ +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + u64 send_progress) +{ + int ret = 0; + struct btrfs_root *root = sctx->parent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key loc; + struct btrfs_dir_item *di; + + /* + * Don't try to rmdir the top/root subvolume dir. + */ + if (dir == BTRFS_FIRST_FREE_OBJECTID) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + struct waiting_dir_move *dm; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid || + found_key.type != key.type) + break; + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { + struct orphan_dir_info *odi; + + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + dm->rmdir_ino = dir; + ret = 0; + goto out; + } + + if (loc.objectid > send_progress) { + ret = 0; + goto out; + } + + path->slots[0]++; + } + + ret = 1; + +out: + btrfs_free_path(path); + return ret; +} + +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ + struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); + + return entry != NULL; +} + +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node **p = &sctx->waiting_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct waiting_dir_move *entry, *dm; + + dm = kmalloc(sizeof(*dm), GFP_NOFS); + if (!dm) + return -ENOMEM; + dm->ino = ino; + dm->rmdir_ino = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct waiting_dir_move, node); + if (ino < entry->ino) { + p = &(*p)->rb_left; + } else if (ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(dm); + return -EEXIST; + } + } + + rb_link_node(&dm->node, parent, p); + rb_insert_color(&dm->node, &sctx->waiting_dir_moves); + return 0; +} + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) + n = n->rb_left; + else if (ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, + struct waiting_dir_move *dm) +{ + if (!dm) + return; + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); +} + +static int add_pending_dir_move(struct send_ctx *sctx, + u64 ino, + u64 ino_gen, + u64 parent_ino, + struct list_head *new_refs, + struct list_head *deleted_refs, + const bool is_orphan) +{ + struct rb_node **p = &sctx->pending_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct pending_dir_move *entry = NULL, *pm; + struct recorded_ref *cur; + int exists = 0; + int ret; + + pm = kmalloc(sizeof(*pm), GFP_NOFS); + if (!pm) + return -ENOMEM; + pm->parent_ino = parent_ino; + pm->ino = ino; + pm->gen = ino_gen; + pm->is_orphan = is_orphan; + INIT_LIST_HEAD(&pm->list); + INIT_LIST_HEAD(&pm->update_refs); + RB_CLEAR_NODE(&pm->node); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) { + p = &(*p)->rb_left; + } else if (parent_ino > entry->parent_ino) { + p = &(*p)->rb_right; + } else { + exists = 1; + break; + } + } + + list_for_each_entry(cur, deleted_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + list_for_each_entry(cur, new_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + + ret = add_waiting_dir_move(sctx, pm->ino); + if (ret) + goto out; + + if (exists) { + list_add_tail(&pm->list, &entry->list); + } else { + rb_link_node(&pm->node, parent, p); + rb_insert_color(&pm->node, &sctx->pending_dir_moves); + } + ret = 0; +out: + if (ret) { + __free_recorded_refs(&pm->update_refs); + kfree(pm); + } + return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, + u64 parent_ino) +{ + struct rb_node *n = sctx->pending_dir_moves.rb_node; + struct pending_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) + n = n->rb_left; + else if (parent_ino > entry->parent_ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ + struct fs_path *from_path = NULL; + struct fs_path *to_path = NULL; + struct fs_path *name = NULL; + u64 orig_progress = sctx->send_progress; + struct recorded_ref *cur; + u64 parent_ino, parent_gen; + struct waiting_dir_move *dm = NULL; + u64 rmdir_ino = 0; + int ret; + + name = fs_path_alloc(); + from_path = fs_path_alloc(); + if (!name || !from_path) { + ret = -ENOMEM; + goto out; + } + + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + rmdir_ino = dm->rmdir_ino; + free_waiting_dir_move(sctx, dm); + + if (pm->is_orphan) { + ret = gen_unique_name(sctx, pm->ino, + pm->gen, from_path); + } else { + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + } + if (ret < 0) + goto out; + + sctx->send_progress = sctx->cur_ino + 1; + fs_path_reset(name); + to_path = name; + name = NULL; + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); + if (ret < 0) + goto out; + + ret = send_rename(sctx, from_path, to_path); + if (ret < 0) + goto out; + + if (rmdir_ino) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, rmdir_ino); + if (!odi) { + /* already deleted */ + goto finish; + } + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + if (ret < 0) + goto out; + if (!ret) + goto finish; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, name); + if (ret < 0) + goto out; + free_orphan_dir_info(sctx, odi); + } + +finish: + ret = send_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + + /* + * After rename/move, need to update the utimes of both new parent(s) + * and old parent(s). + */ + list_for_each_entry(cur, &pm->update_refs, list) { + if (cur->dir == rmdir_ino) + continue; + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } + +out: + fs_path_free(name); + fs_path_free(from_path); + fs_path_free(to_path); + sctx->send_progress = orig_progress; + + return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ + if (!list_empty(&m->list)) + list_del(&m->list); + if (!RB_EMPTY_NODE(&m->node)) + rb_erase(&m->node, &sctx->pending_dir_moves); + __free_recorded_refs(&m->update_refs); + kfree(m); +} + +static void tail_append_pending_moves(struct pending_dir_move *moves, + struct list_head *stack) +{ + if (list_empty(&moves->list)) { + list_add_tail(&moves->list, stack); + } else { + LIST_HEAD(list); + list_splice_init(&moves->list, &list); + list_add_tail(&moves->list, stack); + list_splice_tail(&list, stack); + } +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ + struct pending_dir_move *pm; + struct list_head stack; + u64 parent_ino = sctx->cur_ino; + int ret = 0; + + pm = get_pending_dir_moves(sctx, parent_ino); + if (!pm) + return 0; + + INIT_LIST_HEAD(&stack); + tail_append_pending_moves(pm, &stack); + + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + parent_ino = pm->ino; + ret = apply_dir_move(sctx, pm); + free_pending_move(sctx, pm); + if (ret) + goto out; + pm = get_pending_dir_moves(sctx, parent_ino); + if (pm) + tail_append_pending_moves(pm, &stack); + } + return 0; + +out: + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + free_pending_move(sctx, pm); + } + return ret; +} + +/* + * We might need to delay a directory rename even when no ancestor directory + * (in the send root) with a higher inode number than ours (sctx->cur_ino) was + * renamed. This happens when we rename a directory to the old name (the name + * in the parent root) of some other unrelated directory that got its rename + * delayed due to some ancestor with higher number that got renamed. + * + * Example: + * + * Parent snapshot: + * . (ino 256) + * |---- a/ (ino 257) + * | |---- file (ino 260) + * | + * |---- b/ (ino 258) + * |---- c/ (ino 259) + * + * Send snapshot: + * . (ino 256) + * |---- a/ (ino 258) + * |---- x/ (ino 259) + * |---- y/ (ino 257) + * |----- file (ino 260) + * + * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 + * from 'a' to 'x/y' happening first, which in turn depends on the rename of + * inode 259 from 'c' to 'x'. So the order of rename commands the send stream + * must issue is: + * + * 1 - rename 259 from 'c' to 'x' + * 2 - rename 257 from 'a' to 'x/y' + * 3 - rename 258 from 'b' to 'a' + * + * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can + * be done right away and < 0 on error. + */ +static int wait_for_dest_dir_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key di_key; + struct btrfs_dir_item *di; + u64 left_gen; + u64 right_gen; + int ret = 0; + + if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = parent_ref->dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); + + ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + + di = btrfs_match_dir_item_name(sctx->parent_root, path, + parent_ref->name, parent_ref->name_len); + if (!di) { + ret = 0; + goto out; + } + /* + * di_key.objectid has the number of the inode that has a dentry in the + * parent directory with the same name that sctx->cur_ino is being + * renamed to. We need to check if that inode is in the send root as + * well and if it is currently marked as an inode with a pending rename, + * if it is, we need to delay the rename of sctx->cur_ino as well, so + * that it happens after that other inode is renamed. + */ + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); + if (di_key.type != BTRFS_INODE_ITEM_KEY) { + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, + &left_gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, + &right_gen, NULL, NULL, NULL, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + /* Different inode, no need to delay the rename of sctx->cur_ino */ + if (right_gen != left_gen) { + ret = 0; + goto out; + } + + if (is_waiting_for_move(sctx, di_key.objectid)) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + di_key.objectid, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } +out: + btrfs_free_path(path); + return ret; +} + +static int wait_for_parent_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref) +{ + int ret = 0; + u64 ino = parent_ref->dir; + u64 parent_ino_before, parent_ino_after; + struct fs_path *path_before = NULL; + struct fs_path *path_after = NULL; + int len1, len2; + + path_after = fs_path_alloc(); + path_before = fs_path_alloc(); + if (!path_after || !path_before) { + ret = -ENOMEM; + goto out; + } + + /* + * Our current directory inode may not yet be renamed/moved because some + * ancestor (immediate or not) has to be renamed/moved first. So find if + * such ancestor exists and make sure our own rename/move happens after + * that ancestor is processed. + */ + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + if (is_waiting_for_move(sctx, ino)) { + ret = 1; + break; + } + + fs_path_reset(path_before); + fs_path_reset(path_after); + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + NULL, path_after); + if (ret < 0) + goto out; + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret < 0 && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + break; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if (ino > sctx->cur_ino && + (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { + ret = 1; + break; + } + ino = parent_ino_after; + } + +out: + fs_path_free(path_before); + fs_path_free(path_after); + + if (ret == 1) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + ino, + &sctx->new_refs, + &sctx->deleted_refs, + false); + if (!ret) + ret = 1; + } + + return ret; +} + +/* + * This does all the move/link/unlink/rmdir magic. + */ +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) +{ + int ret = 0; + struct recorded_ref *cur; + struct recorded_ref *cur2; + struct list_head check_dirs; + struct fs_path *valid_path = NULL; + u64 ow_inode = 0; + u64 ow_gen; + int did_overwrite = 0; + int is_orphan = 0; + u64 last_dir_ino_rm = 0; + bool can_rename = true; + +verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); + + /* + * This should never happen as the root dir always has the same ref + * which is always '..' + */ + BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + INIT_LIST_HEAD(&check_dirs); + + valid_path = fs_path_alloc(); + if (!valid_path) { + ret = -ENOMEM; + goto out; + } + + /* + * First, check if the first ref of the current inode was overwritten + * before. If yes, we know that the current inode was already orphanized + * and thus use the orphan name. If not, we can use get_cur_path to + * get the path of the first ref as it would like while receiving at + * this point in time. + * New inodes are always orphan at the beginning, so force to use the + * orphan name in this case. + * The first ref is stored in valid_path and will be updated if it + * gets moved around. + */ + if (!sctx->cur_inode_new) { + ret = did_overwrite_first_ref(sctx, sctx->cur_ino, + sctx->cur_inode_gen); + if (ret < 0) + goto out; + if (ret) + did_overwrite = 1; + } + if (sctx->cur_inode_new || did_overwrite) { + ret = gen_unique_name(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + } else { + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + valid_path); + if (ret < 0) + goto out; + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * the the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + + /* + * Check if this new ref would overwrite the first ref of + * another unprocessed inode. If yes, orphanize the + * overwritten inode. If we find an overwritten ref that is + * not the first ref, simply unlink it. + */ + ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, + cur->name, cur->name_len, + &ow_inode, &ow_gen); + if (ret < 0) + goto out; + if (ret) { + ret = is_first_ref(sctx->parent_root, + ow_inode, cur->dir, cur->name, + cur->name_len); + if (ret < 0) + goto out; + if (ret) { + struct name_cache_entry *nce; + + ret = orphanize_inode(sctx, ow_inode, ow_gen, + cur->full_path); + if (ret < 0) + goto out; + /* + * Make sure we clear our orphanized inode's + * name from the name cache. This is because the + * inode ow_inode might be an ancestor of some + * other inode that will be orphanized as well + * later and has an inode number greater than + * sctx->send_progress. We need to prevent + * future name lookups from using the old name + * and get instead the orphan name. + */ + nce = name_cache_search(sctx, ow_inode, ow_gen); + if (nce) { + name_cache_delete(sctx, nce); + kfree(nce); + } + } else { + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { + ret = wait_for_dest_dir_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + + /* + * link/move the ref to the new place. If we have an orphan + * inode, move it and update valid_path. If not, link or move + * it depending on the inode mode. + */ + if (is_orphan && can_rename) { + ret = send_rename(sctx, valid_path, cur->full_path); + if (ret < 0) + goto out; + is_orphan = 0; + ret = fs_path_copy(valid_path, cur->full_path); + if (ret < 0) + goto out; + } else if (can_rename) { + if (S_ISDIR(sctx->cur_inode_mode)) { + /* + * Dirs can't be linked, so move it. For moved + * dirs, we always have one new and one deleted + * ref. The deleted ref is ignored later. + */ + ret = wait_for_parent_move(sctx, cur); + if (ret < 0) + goto out; + if (ret) { + *pending_move = 1; + } else { + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); + } + if (ret < 0) + goto out; + } else { + ret = send_link(sctx, cur->full_path, + valid_path); + if (ret < 0) + goto out; + } + } + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) { + /* + * Check if we can already rmdir the directory. If not, + * orphanize it. For every dir item inside that gets deleted + * later, we do this check again and rmdir it then if possible. + * See the use of check_dirs for more details. + */ + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = send_rmdir(sctx, valid_path); + if (ret < 0) + goto out; + } else if (!is_orphan) { + ret = orphanize_inode(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + } + + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + } else if (S_ISDIR(sctx->cur_inode_mode) && + !list_empty(&sctx->deleted_refs)) { + /* + * We have a moved dir. Add the old parent to check_dirs + */ + cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, + list); + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } else if (!S_ISDIR(sctx->cur_inode_mode)) { + /* + * We have a non dir inode. Go through all deleted refs and + * unlink them if they were not already overwritten by other + * inodes. + */ + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino, sctx->cur_inode_gen, + cur->name, cur->name_len); + if (ret < 0) + goto out; + if (!ret) { + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + /* + * If the inode is still orphan, unlink the orphan. This may + * happen when a previous inode did overwrite the first ref + * of this inode and no new refs were added for the current + * inode. Unlinking does not mean that the inode is deleted in + * all cases. There may still be links to this inode in other + * places. + */ + if (is_orphan) { + ret = send_unlink(sctx, valid_path); + if (ret < 0) + goto out; + } + } + + /* + * We did collect all parent dirs where cur_inode was once located. We + * now go through all these dirs and check if they are pending for + * deletion and if it's finally possible to perform the rmdir now. + * We also update the inode stats of the parent dirs here. + */ + list_for_each_entry(cur, &check_dirs, list) { + /* + * In case we had refs into dirs that were not processed yet, + * we don't need to do the utime and rmdir logic for these dirs. + * The dir will be processed later. + */ + if (cur->dir > sctx->cur_ino) + continue; + + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + + if (ret == inode_state_did_create || + ret == inode_state_no_change) { + /* TODO delayed utimes */ + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { + ret = can_rmdir(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, valid_path); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, valid_path); + if (ret < 0) + goto out; + last_dir_ino_rm = cur->dir; + } + } + } + + ret = 0; + +out: + __free_recorded_refs(&check_dirs); + free_recorded_refs(sctx); + fs_path_free(valid_path); + return ret; +} + +static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, + struct fs_path *name, void *ctx, struct list_head *refs) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + struct fs_path *p; + u64 gen; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, + NULL, NULL); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, dir, gen, p); + if (ret < 0) + goto out; + ret = fs_path_add_path(p, name); + if (ret < 0) + goto out; + + ret = __record_ref(refs, dir, gen, p); + +out: + if (ret) + fs_path_free(p); + return ret; +} + +static int __record_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->send_root, num, dir, index, name, + ctx, &sctx->new_refs); +} + + +static int __record_deleted_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->parent_root, num, dir, index, name, + ctx, &sctx->deleted_refs); +} + +static int record_new_ref(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_new_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +static int record_deleted_ref(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_deleted_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +struct find_ref_ctx { + u64 dir; + u64 dir_gen; + struct btrfs_root *root; + struct fs_path *name; + int found_idx; +}; + +static int __find_iref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx_) +{ + struct find_ref_ctx *ctx = ctx_; + u64 dir_gen; + int ret; + + if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && + strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { + /* + * To avoid doing extra lookups we'll only do this if everything + * else matches. + */ + ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + if (dir_gen != ctx->dir_gen) + return 0; + ctx->found_idx = num; + return 1; + } + return 0; +} + +static int find_iref(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 dir, u64 dir_gen, struct fs_path *name) +{ + int ret; + struct find_ref_ctx ctx; + + ctx.dir = dir; + ctx.name = name; + ctx.dir_gen = dir_gen; + ctx.found_idx = -1; + ctx.root = root; + + ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); + if (ret < 0) + return ret; + + if (ctx.found_idx == -1) + return -ENOENT; + + return ctx.found_idx; +} + +static int __record_changed_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + u64 dir_gen; + int ret; + struct send_ctx *sctx = ctx; + + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + ret = find_iref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, dir, dir_gen, name); + if (ret == -ENOENT) + ret = __record_new_ref(num, dir, index, name, sctx); + else if (ret > 0) + ret = 0; + + return ret; +} + +static int __record_changed_deleted_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + u64 dir_gen; + int ret; + struct send_ctx *sctx = ctx; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, + dir, dir_gen, name); + if (ret == -ENOENT) + ret = __record_deleted_ref(num, dir, index, name, sctx); + else if (ret > 0) + ret = 0; + + return ret; +} + +static int record_changed_ref(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_changed_new_ref, sctx); + if (ret < 0) + goto out; + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +/* + * Record and process all refs at once. Needed when an inode changes the + * generation number, which means that it was deleted and recreated. + */ +static int process_all_refs(struct send_ctx *sctx, + enum btrfs_compare_tree_result cmd) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + iterate_inode_ref_t cb; + int pending_move = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + if (cmd == BTRFS_COMPARE_TREE_NEW) { + root = sctx->send_root; + cb = __record_new_ref; + } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { + root = sctx->parent_root; + cb = __record_deleted_ref; + } else { + btrfs_err(sctx->send_root->fs_info, + "Wrong command %d in process_all_refs", cmd); + ret = -EINVAL; + goto out; + } + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(eb, &found_key, slot); + + if (found_key.objectid != key.objectid || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) + break; + + ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); + if (ret < 0) + goto out; + + path->slots[0]++; + } + btrfs_release_path(path); + + ret = process_recorded_refs(sctx, &pending_move); + /* Only applicable to an incremental send. */ + ASSERT(pending_move == 0); + +out: + btrfs_free_path(path); + return ret; +} + +static int send_set_xattr(struct send_ctx *sctx, + struct fs_path *path, + const char *name, int name_len, + const char *data, int data_len) +{ + int ret = 0; + + ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); + TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int send_remove_xattr(struct send_ctx *sctx, + struct fs_path *path, + const char *name, int name_len) +{ + int ret = 0; + + ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int __process_new_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + struct fs_path *p; + posix_acl_xattr_header dummy_acl; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + /* + * This hack is needed because empty acl's are stored as zero byte + * data in xattrs. Problem with that is, that receiving these zero byte + * acl's will fail later. To fix this, we send a dummy acl list that + * only contains the version number and no entries. + */ + if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || + !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) { + if (data_len == 0) { + dummy_acl.a_version = + cpu_to_le32(POSIX_ACL_XATTR_VERSION); + data = (char *)&dummy_acl; + data_len = sizeof(dummy_acl); + } + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + ret = send_set_xattr(sctx, p, name, name_len, data, data_len); + +out: + fs_path_free(p); + return ret; +} + +static int __process_deleted_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + ret = send_remove_xattr(sctx, p, name, name_len); + +out: + fs_path_free(p); + return ret; +} + +static int process_new_xattr(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_new_xattr, sctx); + + return ret; +} + +static int process_deleted_xattr(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_deleted_xattr, sctx); + + return ret; +} + +struct find_xattr_ctx { + const char *name; + int name_len; + int found_idx; + char *found_data; + int found_data_len; +}; + +static int __find_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *vctx) +{ + struct find_xattr_ctx *ctx = vctx; + + if (name_len == ctx->name_len && + strncmp(name, ctx->name, name_len) == 0) { + ctx->found_idx = num; + ctx->found_data_len = data_len; + ctx->found_data = kmemdup(data, data_len, GFP_NOFS); + if (!ctx->found_data) + return -ENOMEM; + return 1; + } + return 0; +} + +static int find_xattr(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + const char *name, int name_len, + char **data, int *data_len) +{ + int ret; + struct find_xattr_ctx ctx; + + ctx.name = name; + ctx.name_len = name_len; + ctx.found_idx = -1; + ctx.found_data = NULL; + ctx.found_data_len = 0; + + ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); + if (ret < 0) + return ret; + + if (ctx.found_idx == -1) + return -ENOENT; + if (data) { + *data = ctx.found_data; + *data_len = ctx.found_data_len; + } else { + kfree(ctx.found_data); + } + return ctx.found_idx; +} + + +static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + char *found_data = NULL; + int found_data_len = 0; + + ret = find_xattr(sctx->parent_root, sctx->right_path, + sctx->cmp_key, name, name_len, &found_data, + &found_data_len); + if (ret == -ENOENT) { + ret = __process_new_xattr(num, di_key, name, name_len, data, + data_len, type, ctx); + } else if (ret >= 0) { + if (data_len != found_data_len || + memcmp(data, found_data, data_len)) { + ret = __process_new_xattr(num, di_key, name, name_len, + data, data_len, type, ctx); + } else { + ret = 0; + } + } + + kfree(found_data); + return ret; +} + +static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + + ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, + name, name_len, NULL, NULL); + if (ret == -ENOENT) + ret = __process_deleted_xattr(num, di_key, name, name_len, data, + data_len, type, ctx); + else if (ret >= 0) + ret = 0; + + return ret; +} + +static int process_changed_xattr(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_changed_new_xattr, sctx); + if (ret < 0) + goto out; + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_changed_deleted_xattr, sctx); + +out: + return ret; +} + +static int process_all_new_xattrs(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + root = sctx->send_root; + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + ret = iterate_dir_item(root, path, &found_key, + __process_new_xattr, sctx); + if (ret < 0) + goto out; + + path->slots[0]++; + } + +out: + btrfs_free_path(path); + return ret; +} + +static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct page *page; + char *addr; + struct btrfs_key key; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + pgoff_t last_index; + unsigned pg_offset = offset & ~PAGE_CACHE_MASK; + ssize_t ret = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (offset + len > i_size_read(inode)) { + if (offset > i_size_read(inode)) + len = 0; + else + len = offset - i_size_read(inode); + } + if (len == 0) + goto out; + + last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + + /* initial readahead */ + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, inode->i_mapping); + btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, + last_index - index + 1); + + while (index <= last_index) { + unsigned cur_len = min_t(unsigned, len, + PAGE_CACHE_SIZE - pg_offset); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + break; + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + + addr = kmap(page); + memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + kunmap(page); + unlock_page(page); + page_cache_release(page); + index++; + pg_offset = 0; + len -= cur_len; + ret += cur_len; + } +out: + iput(inode); + return ret; +} + +/* + * Read some bytes from the current inode/file and send a write command to + * user space. + */ +static int send_write(struct send_ctx *sctx, u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + ssize_t num_read = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + +verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); + + num_read = fill_read_buf(sctx, offset, len); + if (num_read <= 0) { + if (num_read < 0) + ret = num_read; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + if (ret < 0) + return ret; + return num_read; +} + +/* + * Send a clone command to user space. + */ +static int send_clone(struct send_ctx *sctx, + u64 offset, u32 len, + struct clone_root *clone_root) +{ + int ret = 0; + struct fs_path *p; + u64 gen; + +verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " + "clone_inode=%llu, clone_offset=%llu\n", offset, len, + clone_root->root->objectid, clone_root->ino, + clone_root->offset); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + + if (clone_root->root == sctx->send_root) { + ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, + &gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, clone_root->ino, gen, p); + } else { + ret = get_inode_path(clone_root->root, clone_root->ino, p); + } + if (ret < 0) + goto out; + + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, + le64_to_cpu(clone_root->root->root_item.ctransid)); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, + clone_root->offset); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +/* + * Send an update extent command to user space. + */ +static int send_update_extent(struct send_ctx *sctx, + u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 offset = sctx->cur_inode_last_extent; + u64 len; + int ret = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto tlv_put_failure; + memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); + while (offset < end) { + len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + break; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = send_cmd(sctx); + if (ret < 0) + break; + offset += len; + } +tlv_put_failure: + fs_path_free(p); + return ret; +} + +static int send_write_or_clone(struct send_ctx *sctx, + struct btrfs_path *path, + struct btrfs_key *key, + struct clone_root *clone_root) +{ + int ret = 0; + struct btrfs_file_extent_item *ei; + u64 offset = key->offset; + u64 pos = 0; + u64 len; + u32 l; + u8 type; + u64 bs = sctx->send_root->fs_info->sb->s_blocksize; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], ei); + /* + * it is possible the inline item won't cover the whole page, + * but there may be items after this page. Make + * sure to send the whole thing + */ + len = PAGE_CACHE_ALIGN(len); + } else { + len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + } + + if (offset + len > sctx->cur_inode_size) + len = sctx->cur_inode_size - offset; + if (len == 0) { + ret = 0; + goto out; + } + + if (clone_root && IS_ALIGNED(offset + len, bs)) { + ret = send_clone(sctx, offset, len, clone_root); + } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { + ret = send_update_extent(sctx, offset, len); + } else { + while (pos < len) { + l = len - pos; + if (l > BTRFS_SEND_READ_SIZE) + l = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, pos + offset, l); + if (ret < 0) + goto out; + if (!ret) + break; + pos += ret; + } + ret = 0; + } +out: + return ret; +} + +static int is_extent_unchanged(struct send_ctx *sctx, + struct btrfs_path *left_path, + struct btrfs_key *ekey) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + struct btrfs_key found_key; + struct btrfs_file_extent_item *ei; + u64 left_disknr; + u64 right_disknr; + u64 left_offset; + u64 right_offset; + u64 left_offset_fixed; + u64 left_len; + u64 right_len; + u64 left_gen; + u64 right_gen; + u8 left_type; + u8 right_type; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + eb = left_path->nodes[0]; + slot = left_path->slots[0]; + ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + left_type = btrfs_file_extent_type(eb, ei); + + if (left_type != BTRFS_FILE_EXTENT_REG) { + ret = 0; + goto out; + } + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + left_len = btrfs_file_extent_num_bytes(eb, ei); + left_offset = btrfs_file_extent_offset(eb, ei); + left_gen = btrfs_file_extent_generation(eb, ei); + + /* + * Following comments will refer to these graphics. L is the left + * extents which we are checking at the moment. 1-8 are the right + * extents that we iterate. + * + * |-----L-----| + * |-1-|-2a-|-3-|-4-|-5-|-6-| + * + * |-----L-----| + * |--1--|-2b-|...(same as above) + * + * Alternative situation. Happens on files where extents got split. + * |-----L-----| + * |-----------7-----------|-6-| + * + * Alternative situation. Happens on files which got larger. + * |-----L-----| + * |-8-| + * Nothing follows after 8. + */ + + key.objectid = ekey->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ekey->offset; + ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + /* + * Handle special case where the right side has no extents at all. + */ + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + /* If we're a hole then just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; + goto out; + } + + /* + * We're now on 2a, 2b or 7. + */ + key = found_key; + while (key.offset < ekey->offset + left_len) { + ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + right_type = btrfs_file_extent_type(eb, ei); + if (right_type != BTRFS_FILE_EXTENT_REG) { + ret = 0; + goto out; + } + + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_len = btrfs_file_extent_num_bytes(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); + + /* + * Are we at extent 8? If yes, we know the extent is changed. + * This may only happen on the first iteration. + */ + if (found_key.offset + right_len <= ekey->offset) { + /* If we're a hole just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; + goto out; + } + + left_offset_fixed = left_offset; + if (key.offset < ekey->offset) { + /* Fix the right offset for 2a and 7. */ + right_offset += ekey->offset - key.offset; + } else { + /* Fix the left offset for all behind 2a and 2b */ + left_offset_fixed += key.offset - ekey->offset; + } + + /* + * Check if we have the same extent. + */ + if (left_disknr != right_disknr || + left_offset_fixed != right_offset || + left_gen != right_gen) { + ret = 0; + goto out; + } + + /* + * Go to the next extent. + */ + ret = btrfs_next_item(sctx->parent_root, path); + if (ret < 0) + goto out; + if (!ret) { + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + } + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + key.offset += right_len; + break; + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; + } + key = found_key; + } + + /* + * We're now behind the left extent (treat as unchanged) or at the end + * of the right side (treat as changed). + */ + if (key.offset >= ekey->offset + left_len) + ret = 1; + else + ret = 0; + + +out: + btrfs_free_path(path); + return ret; +} + +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_root *root = sctx->send_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + u8 type; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + sctx->cur_inode_last_extent = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); + if (ret < 0) + goto out; + ret = 0; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key.offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + sctx->cur_inode_last_extent = extent_end; +out: + btrfs_free_path(path); + return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 type; + int ret = 0; + + if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) + return 0; + + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key->offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key->offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + + if (path->slots[0] == 0 && + sctx->cur_inode_last_extent < key->offset) { + /* + * We might have skipped entire leafs that contained only + * file extent items for our current inode. These leafs have + * a generation number smaller (older) than the one in the + * current leaf and the leaf our last extent came from, and + * are located between these 2 leafs. + */ + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + if (sctx->cur_inode_last_extent < key->offset) + ret = send_hole(sctx, key->offset); + sctx->cur_inode_last_extent = extent_end; + return ret; +} + +static int process_extent(struct send_ctx *sctx, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct clone_root *found_clone = NULL; + int ret = 0; + + if (S_ISLNK(sctx->cur_inode_mode)) + return 0; + + if (sctx->parent_root && !sctx->cur_inode_new) { + ret = is_extent_unchanged(sctx, path, key); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out_hole; + } + } else { + struct btrfs_file_extent_item *ei; + u8 type; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_PREALLOC || + type == BTRFS_FILE_EXTENT_REG) { + /* + * The send spec does not have a prealloc command yet, + * so just leave a hole for prealloc'ed extents until + * we have enough commands queued up to justify rev'ing + * the send spec. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = 0; + goto out; + } + + /* Have a hole, just skip it. */ + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { + ret = 0; + goto out; + } + } + } + + ret = find_extent_clone(sctx, path, key->objectid, key->offset, + sctx->cur_inode_size, &found_clone); + if (ret != -ENOENT && ret < 0) + goto out; + + ret = send_write_or_clone(sctx, path, key, found_clone); + if (ret) + goto out; +out_hole: + ret = maybe_send_hole(sctx, path, key); +out: + return ret; +} + +static int process_all_extents(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + + root = sctx->send_root; + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(eb, &found_key, slot); + + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + ret = process_extent(sctx, path, &found_key); + if (ret < 0) + goto out; + + path->slots[0]++; + } + +out: + btrfs_free_path(path); + return ret; +} + +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, + int *pending_move, + int *refs_processed) +{ + int ret = 0; + + if (sctx->cur_ino == 0) + goto out; + if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && + sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) + goto out; + if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) + goto out; + + ret = process_recorded_refs(sctx, pending_move); + if (ret < 0) + goto out; + + *refs_processed = 1; +out: + return ret; +} + +static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) +{ + int ret = 0; + u64 left_mode; + u64 left_uid; + u64 left_gid; + u64 right_mode; + u64 right_uid; + u64 right_gid; + int need_chmod = 0; + int need_chown = 0; + int pending_move = 0; + int refs_processed = 0; + + ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, + &refs_processed); + if (ret < 0) + goto out; + + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + * + * On the other hand, if our current inode is a directory and couldn't + * be moved/renamed because its parent was renamed/moved too and it has + * a higher inode number, we can only move/rename our current inode + * after we moved/renamed its parent. Therefore in this case operate on + * the old path (pre move/rename) of our current inode, and the + * move/rename will be performed later. + */ + if (refs_processed && !pending_move) + sctx->send_progress = sctx->cur_ino + 1; + + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) + goto out; + if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) + goto out; + + ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, + &left_mode, &left_uid, &left_gid, NULL); + if (ret < 0) + goto out; + + if (!sctx->parent_root || sctx->cur_inode_new) { + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode)) + need_chmod = 1; + } else { + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, + NULL, NULL, &right_mode, &right_uid, + &right_gid, NULL); + if (ret < 0) + goto out; + + if (left_uid != right_uid || left_gid != right_gid) + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) + need_chmod = 1; + } + + if (S_ISREG(sctx->cur_inode_mode)) { + if (need_send_hole(sctx)) { + if (sctx->cur_inode_last_extent == (u64)-1 || + sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = get_last_extent(sctx, (u64)-1); + if (ret) + goto out; + } + if (sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret) + goto out; + } + } + ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_inode_size); + if (ret < 0) + goto out; + } + + if (need_chown) { + ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_uid, left_gid); + if (ret < 0) + goto out; + } + if (need_chmod) { + ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_mode); + if (ret < 0) + goto out; + } + + /* + * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. + */ + if (!is_waiting_for_move(sctx, sctx->cur_ino)) { + ret = apply_children_dir_moves(sctx); + if (ret) + goto out; + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. If our inode is a directory and it's + * waiting to be moved/renamed, we will send its utimes when + * it's moved/renamed, therefore we don't need to do it here. + */ + sctx->send_progress = sctx->cur_ino + 1; + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; + } + +out: + return ret; +} + +static int changed_inode(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + struct btrfs_key *key = sctx->cmp_key; + struct btrfs_inode_item *left_ii = NULL; + struct btrfs_inode_item *right_ii = NULL; + u64 left_gen = 0; + u64 right_gen = 0; + + sctx->cur_ino = key->objectid; + sctx->cur_inode_new_gen = 0; + sctx->cur_inode_last_extent = (u64)-1; + + /* + * Set send_progress to current inode. This will tell all get_cur_xxx + * functions that the current inode's refs are not updated yet. Later, + * when process_recorded_refs is finished, it is set to cur_ino + 1. + */ + sctx->send_progress = sctx->cur_ino; + + if (result == BTRFS_COMPARE_TREE_NEW || + result == BTRFS_COMPARE_TREE_CHANGED) { + left_ii = btrfs_item_ptr(sctx->left_path->nodes[0], + sctx->left_path->slots[0], + struct btrfs_inode_item); + left_gen = btrfs_inode_generation(sctx->left_path->nodes[0], + left_ii); + } else { + right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], + sctx->right_path->slots[0], + struct btrfs_inode_item); + right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], + right_ii); + } + if (result == BTRFS_COMPARE_TREE_CHANGED) { + right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], + sctx->right_path->slots[0], + struct btrfs_inode_item); + + right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], + right_ii); + + /* + * The cur_ino = root dir case is special here. We can't treat + * the inode as deleted+reused because it would generate a + * stream that tries to delete/mkdir the root dir. + */ + if (left_gen != right_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) + sctx->cur_inode_new_gen = 1; + } + + if (result == BTRFS_COMPARE_TREE_NEW) { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 1; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); + if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) + ret = send_create_inode_if_needed(sctx); + } else if (result == BTRFS_COMPARE_TREE_DELETED) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_deleted = 1; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + /* + * We need to do some special handling in case the inode was + * reported as changed with a changed generation number. This + * means that the original inode was deleted and new inode + * reused the same inum. So we have to treat the old inode as + * deleted and the new one as new. + */ + if (sctx->cur_inode_new_gen) { + /* + * First, process the inode as if it was deleted. + */ + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_deleted = 1; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + + /* + * Now process the inode as if it was new. + */ + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 1; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); + ret = send_create_inode_if_needed(sctx); + if (ret < 0) + goto out; + + ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); + if (ret < 0) + goto out; + /* + * Advance send_progress now as we did not get into + * process_recorded_refs_if_needed in the new_gen case. + */ + sctx->send_progress = sctx->cur_ino + 1; + + /* + * Now process all extents and xattrs of the inode as if + * they were all new. + */ + ret = process_all_extents(sctx); + if (ret < 0) + goto out; + ret = process_all_new_xattrs(sctx); + if (ret < 0) + goto out; + } else { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_new_gen = 0; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + } + } + +out: + return ret; +} + +/* + * We have to process new refs before deleted refs, but compare_trees gives us + * the new and deleted refs mixed. To fix this, we record the new/deleted refs + * first and later process them in process_recorded_refs. + * For the cur_inode_new_gen case, we skip recording completely because + * changed_inode did already initiate processing of refs. The reason for this is + * that in this case, compare_tree actually compares the refs of 2 different + * inodes. To fix this, process_all_refs is used in changed_inode to handle all + * refs of the right tree as deleted and all refs of the left tree as new. + */ +static int changed_ref(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (result == BTRFS_COMPARE_TREE_NEW) + ret = record_new_ref(sctx); + else if (result == BTRFS_COMPARE_TREE_DELETED) + ret = record_deleted_ref(sctx); + else if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = record_changed_ref(sctx); + } + + return ret; +} + +/* + * Process new/deleted/changed xattrs. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of xattrs. The reason is the same as in changed_ref + */ +static int changed_xattr(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result == BTRFS_COMPARE_TREE_NEW) + ret = process_new_xattr(sctx); + else if (result == BTRFS_COMPARE_TREE_DELETED) + ret = process_deleted_xattr(sctx); + else if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = process_changed_xattr(sctx); + } + + return ret; +} + +/* + * Process new/deleted/changed extents. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of extents. The reason is the same as in changed_ref + */ +static int changed_extent(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result != BTRFS_COMPARE_TREE_DELETED) + ret = process_extent(sctx, sctx->left_path, + sctx->cmp_key); + } + + return ret; +} + +static int dir_changed(struct send_ctx *sctx, u64 dir) +{ + u64 orig_gen, new_gen; + int ret; + + ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, + NULL, NULL); + if (ret) + return ret; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + return (orig_gen != new_gen) ? 1 : 0; +} + +static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u64 dirid = 0, last_dirid = 0; + unsigned long ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + int ret = 0; + + /* Easy case, just check this one dirid */ + if (key->type == BTRFS_INODE_REF_KEY) { + dirid = key->offset; + + ret = dir_changed(sctx, dirid); + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + dirid = btrfs_inode_extref_parent(leaf, extref); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + cur_offset += ref_name_len + sizeof(*extref); + if (dirid == last_dirid) + continue; + ret = dir_changed(sctx, dirid); + if (ret) + break; + last_dirid = dirid; + } +out: + return ret; +} + +/* + * Updates compare related fields in sctx and simply forwards to the actual + * changed_xxx functions. + */ +static int changed_cb(struct btrfs_root *left_root, + struct btrfs_root *right_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + + if (result == BTRFS_COMPARE_TREE_SAME) { + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) { + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + } else if (key->type == BTRFS_EXTENT_DATA_KEY) { + return maybe_send_hole(sctx, left_path, key); + } else { + return 0; + } + result = BTRFS_COMPARE_TREE_CHANGED; + ret = 0; + } + + sctx->left_path = left_path; + sctx->right_path = right_path; + sctx->cmp_key = key; + + ret = finish_inode_if_needed(sctx, 0); + if (ret < 0) + goto out; + + /* Ignore non-FS objects */ + if (key->objectid == BTRFS_FREE_INO_OBJECTID || + key->objectid == BTRFS_FREE_SPACE_OBJECTID) + goto out; + + if (key->type == BTRFS_INODE_ITEM_KEY) + ret = changed_inode(sctx, result); + else if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) + ret = changed_ref(sctx, result); + else if (key->type == BTRFS_XATTR_ITEM_KEY) + ret = changed_xattr(sctx, result); + else if (key->type == BTRFS_EXTENT_DATA_KEY) + ret = changed_extent(sctx, result); + +out: + return ret; +} + +static int full_send_tree(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *send_root = sctx->send_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct extent_buffer *eb; + int slot; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) + goto out_finish; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + + ret = changed_cb(send_root, NULL, path, NULL, + &found_key, BTRFS_COMPARE_TREE_NEW, sctx); + if (ret < 0) + goto out; + + key.objectid = found_key.objectid; + key.type = found_key.type; + key.offset = found_key.offset + 1; + + ret = btrfs_next_item(send_root, path); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + break; + } + } + +out_finish: + ret = finish_inode_if_needed(sctx, 1); + +out: + btrfs_free_path(path); + return ret; +} + +static int send_subvol(struct send_ctx *sctx) +{ + int ret; + + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) { + ret = send_header(sctx); + if (ret < 0) + goto out; + } + + ret = send_subvol_begin(sctx); + if (ret < 0) + goto out; + + if (sctx->parent_root) { + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, + changed_cb, sctx); + if (ret < 0) + goto out; + ret = finish_inode_if_needed(sctx, 1); + if (ret < 0) + goto out; + } else { + ret = full_send_tree(sctx); + if (ret < 0) + goto out; + } + +out: + free_recorded_refs(sctx); + return ret; +} + +/* + * If orphan cleanup did remove any orphans from a root, it means the tree + * was modified and therefore the commit root is not the same as the current + * root anymore. This is a problem, because send uses the commit root and + * therefore can see inode items that don't exist in the current root anymore, + * and for example make calls to btrfs_iget, which will do tree lookups based + * on the current root and not on the commit root. Those lookups will fail, + * returning a -ESTALE error, and making send fail with that error. So make + * sure a send does not see any orphans we have just removed, and that it will + * see the same inodes regardless of whether a transaction commit happened + * before it started (meaning that the commit root will be the same as the + * current root) or not. + */ +static int ensure_commit_roots_uptodate(struct send_ctx *sctx) +{ + int i; + struct btrfs_trans_handle *trans = NULL; + +again: + if (sctx->parent_root && + sctx->parent_root->node != sctx->parent_root->commit_root) + goto commit_trans; + + for (i = 0; i < sctx->clone_roots_cnt; i++) + if (sctx->clone_roots[i].root->node != + sctx->clone_roots[i].root->commit_root) + goto commit_trans; + + if (trans) + return btrfs_end_transaction(trans, sctx->send_root); + + return 0; + +commit_trans: + /* Use any root, all fs roots will get their commit roots updated. */ + if (!trans) { + trans = btrfs_join_transaction(sctx->send_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto again; + } + + return btrfs_commit_transaction(trans, sctx->send_root); +} + +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ + spin_lock(&root->root_item_lock); + root->send_in_progress--; + /* + * Not much left to do, we don't know why it's unbalanced and + * can't blindly reset it to 0. + */ + if (root->send_in_progress < 0) + btrfs_err(root->fs_info, + "send_in_progres unbalanced %d root %llu", + root->send_in_progress, root->root_key.objectid); + spin_unlock(&root->root_item_lock); +} + +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) +{ + int ret = 0; + struct btrfs_root *send_root; + struct btrfs_root *clone_root; + struct btrfs_fs_info *fs_info; + struct btrfs_ioctl_send_args *arg = NULL; + struct btrfs_key key; + struct send_ctx *sctx = NULL; + u32 i; + u64 *clone_sources_tmp = NULL; + int clone_sources_to_rollback = 0; + int sort_clone_roots = 0; + int index; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + send_root = BTRFS_I(file_inode(mnt_file))->root; + fs_info = send_root->fs_info; + + /* + * The subvolume must remain read-only during send, protect against + * making it RW. This also protects against deletion. + */ + spin_lock(&send_root->root_item_lock); + send_root->send_in_progress++; + spin_unlock(&send_root->root_item_lock); + + /* + * This is done when we lookup the root, it should already be complete + * by the time we get here. + */ + WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); + + /* + * Userspace tools do the checks and warn the user if it's + * not RO. + */ + if (!btrfs_root_readonly(send_root)) { + ret = -EPERM; + goto out; + } + + arg = memdup_user(arg_, sizeof(*arg)); + if (IS_ERR(arg)) { + ret = PTR_ERR(arg); + arg = NULL; + goto out; + } + + if (!access_ok(VERIFY_READ, arg->clone_sources, + sizeof(*arg->clone_sources) * + arg->clone_sources_count)) { + ret = -EFAULT; + goto out; + } + + if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { + ret = -EINVAL; + goto out; + } + + sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); + if (!sctx) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&sctx->new_refs); + INIT_LIST_HEAD(&sctx->deleted_refs); + INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); + INIT_LIST_HEAD(&sctx->name_cache_list); + + sctx->flags = arg->flags; + + sctx->send_filp = fget(arg->send_fd); + if (!sctx->send_filp) { + ret = -EBADF; + goto out; + } + + sctx->send_root = send_root; + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started + */ + if (btrfs_root_dead(sctx->send_root)) { + ret = -EPERM; + goto out; + } + + sctx->clone_roots_cnt = arg->clone_sources_count; + + sctx->send_max_size = BTRFS_SEND_BUF_SIZE; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + + sctx->clone_roots = vzalloc(sizeof(struct clone_root) * + (arg->clone_sources_count + 1)); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } + + if (arg->clone_sources_count) { + clone_sources_tmp = vmalloc(arg->clone_sources_count * + sizeof(*arg->clone_sources)); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(clone_sources_tmp, arg->clone_sources, + arg->clone_sources_count * + sizeof(*arg->clone_sources)); + if (ret) { + ret = -EFAULT; + goto out; + } + + for (i = 0; i < arg->clone_sources_count; i++) { + key.objectid = clone_sources_tmp[i]; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + clone_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(clone_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = PTR_ERR(clone_root); + goto out; + } + spin_lock(&clone_root->root_item_lock); + if (!btrfs_root_readonly(clone_root) || + btrfs_root_dead(clone_root)) { + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + clone_root->send_in_progress++; + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + + sctx->clone_roots[i].root = clone_root; + clone_sources_to_rollback = i + 1; + } + vfree(clone_sources_tmp); + clone_sources_tmp = NULL; + } + + if (arg->parent_root) { + key.objectid = arg->parent_root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(sctx->parent_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = PTR_ERR(sctx->parent_root); + goto out; + } + + spin_lock(&sctx->parent_root->root_item_lock); + sctx->parent_root->send_in_progress++; + if (!btrfs_root_readonly(sctx->parent_root) || + btrfs_root_dead(sctx->parent_root)) { + spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&sctx->parent_root->root_item_lock); + + srcu_read_unlock(&fs_info->subvol_srcu, index); + } + + /* + * Clones from send_root are allowed, but only if the clone source + * is behind the current send position. This is checked while searching + * for possible clone sources. + */ + sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root; + + /* We do a bsearch later */ + sort(sctx->clone_roots, sctx->clone_roots_cnt, + sizeof(*sctx->clone_roots), __clone_root_cmp_sort, + NULL); + sort_clone_roots = 1; + + ret = ensure_commit_roots_uptodate(sctx); + if (ret) + goto out; + + current->journal_info = BTRFS_SEND_TRANS_STUB; + ret = send_subvol(sctx); + current->journal_info = NULL; + if (ret < 0) + goto out; + + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { + ret = begin_cmd(sctx, BTRFS_SEND_C_END); + if (ret < 0) + goto out; + ret = send_cmd(sctx); + if (ret < 0) + goto out; + } + +out: + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { + struct rb_node *n; + struct pending_dir_move *pm; + + n = rb_first(&sctx->pending_dir_moves); + pm = rb_entry(n, struct pending_dir_move, node); + while (!list_empty(&pm->list)) { + struct pending_dir_move *pm2; + + pm2 = list_first_entry(&pm->list, + struct pending_dir_move, list); + free_pending_move(sctx, pm2); + } + free_pending_move(sctx, pm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { + struct rb_node *n; + struct waiting_dir_move *dm; + + n = rb_first(&sctx->waiting_dir_moves); + dm = rb_entry(n, struct waiting_dir_move, node); + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); + while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { + struct rb_node *n; + struct orphan_dir_info *odi; + + n = rb_first(&sctx->orphan_dirs); + odi = rb_entry(n, struct orphan_dir_info, node); + free_orphan_dir_info(sctx, odi); + } + + if (sort_clone_roots) { + for (i = 0; i < sctx->clone_roots_cnt; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + } else { + for (i = 0; sctx && i < clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + + btrfs_root_dec_send_in_progress(send_root); + } + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + btrfs_root_dec_send_in_progress(sctx->parent_root); + + kfree(arg); + vfree(clone_sources_tmp); + + if (sctx) { + if (sctx->send_filp) + fput(sctx->send_filp); + + vfree(sctx->clone_roots); + vfree(sctx->send_buf); + vfree(sctx->read_buf); + + name_cache_free(sctx); + + kfree(sctx); + } + + return ret; +} diff --git a/kernel/fs/btrfs/send.h b/kernel/fs/btrfs/send.h new file mode 100644 index 000000000..48d425aef --- /dev/null +++ b/kernel/fs/btrfs/send.h @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2012 Alexander Block. All rights reserved. + * Copyright (C) 2012 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" + +#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" +#define BTRFS_SEND_STREAM_VERSION 1 + +#define BTRFS_SEND_BUF_SIZE (1024 * 64) +#define BTRFS_SEND_READ_SIZE (1024 * 48) + +enum btrfs_tlv_type { + BTRFS_TLV_U8, + BTRFS_TLV_U16, + BTRFS_TLV_U32, + BTRFS_TLV_U64, + BTRFS_TLV_BINARY, + BTRFS_TLV_STRING, + BTRFS_TLV_UUID, + BTRFS_TLV_TIMESPEC, +}; + +struct btrfs_stream_header { + char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)]; + __le32 version; +} __attribute__ ((__packed__)); + +struct btrfs_cmd_header { + /* len excluding the header */ + __le32 len; + __le16 cmd; + /* crc including the header with zero crc field */ + __le32 crc; +} __attribute__ ((__packed__)); + +struct btrfs_tlv_header { + __le16 tlv_type; + /* len excluding the header */ + __le16 tlv_len; +} __attribute__ ((__packed__)); + +/* commands */ +enum btrfs_send_cmd { + BTRFS_SEND_C_UNSPEC, + + BTRFS_SEND_C_SUBVOL, + BTRFS_SEND_C_SNAPSHOT, + + BTRFS_SEND_C_MKFILE, + BTRFS_SEND_C_MKDIR, + BTRFS_SEND_C_MKNOD, + BTRFS_SEND_C_MKFIFO, + BTRFS_SEND_C_MKSOCK, + BTRFS_SEND_C_SYMLINK, + + BTRFS_SEND_C_RENAME, + BTRFS_SEND_C_LINK, + BTRFS_SEND_C_UNLINK, + BTRFS_SEND_C_RMDIR, + + BTRFS_SEND_C_SET_XATTR, + BTRFS_SEND_C_REMOVE_XATTR, + + BTRFS_SEND_C_WRITE, + BTRFS_SEND_C_CLONE, + + BTRFS_SEND_C_TRUNCATE, + BTRFS_SEND_C_CHMOD, + BTRFS_SEND_C_CHOWN, + BTRFS_SEND_C_UTIMES, + + BTRFS_SEND_C_END, + BTRFS_SEND_C_UPDATE_EXTENT, + __BTRFS_SEND_C_MAX, +}; +#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) + +/* attributes in send stream */ +enum { + BTRFS_SEND_A_UNSPEC, + + BTRFS_SEND_A_UUID, + BTRFS_SEND_A_CTRANSID, + + BTRFS_SEND_A_INO, + BTRFS_SEND_A_SIZE, + BTRFS_SEND_A_MODE, + BTRFS_SEND_A_UID, + BTRFS_SEND_A_GID, + BTRFS_SEND_A_RDEV, + BTRFS_SEND_A_CTIME, + BTRFS_SEND_A_MTIME, + BTRFS_SEND_A_ATIME, + BTRFS_SEND_A_OTIME, + + BTRFS_SEND_A_XATTR_NAME, + BTRFS_SEND_A_XATTR_DATA, + + BTRFS_SEND_A_PATH, + BTRFS_SEND_A_PATH_TO, + BTRFS_SEND_A_PATH_LINK, + + BTRFS_SEND_A_FILE_OFFSET, + BTRFS_SEND_A_DATA, + + BTRFS_SEND_A_CLONE_UUID, + BTRFS_SEND_A_CLONE_CTRANSID, + BTRFS_SEND_A_CLONE_PATH, + BTRFS_SEND_A_CLONE_OFFSET, + BTRFS_SEND_A_CLONE_LEN, + + __BTRFS_SEND_A_MAX, +}; +#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) + +#ifdef __KERNEL__ +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +#endif diff --git a/kernel/fs/btrfs/struct-funcs.c b/kernel/fs/btrfs/struct-funcs.c new file mode 100644 index 000000000..b976597b0 --- /dev/null +++ b/kernel/fs/btrfs/struct-funcs.c @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include "ctree.h" + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +/* + * this is some deeply nasty code. + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions, + * which are wappers of btrfs_set_token_#bits functions and + * btrfs_get_token_#bits functions, which are defined in this file. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do the page spanning work required to + * have a metadata blocksize different from the page size. + */ + +#define DEFINE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, \ + struct btrfs_map_token *token) \ +{ \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + u##bits res; \ + \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + kaddr = token->kaddr; \ + p = kaddr + part_offset - token->offset; \ + res = get_unaligned_le##bits(p + off); \ + return res; \ + } \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + \ + read_extent_buffer(eb, &leres, offset, size); \ + return le##bits##_to_cpu(leres); \ + } \ + p = kaddr + part_offset - map_start; \ + res = get_unaligned_le##bits(p + off); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ + return res; \ +} \ +void btrfs_set_token_##bits(struct extent_buffer *eb, \ + void *ptr, unsigned long off, u##bits val, \ + struct btrfs_map_token *token) \ +{ \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + kaddr = token->kaddr; \ + p = kaddr + part_offset - token->offset; \ + put_unaligned_le##bits(val, p + off); \ + return; \ + } \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + \ + val2 = cpu_to_le##bits(val); \ + write_extent_buffer(eb, &val2, offset, size); \ + return; \ + } \ + p = kaddr + part_offset - map_start; \ + put_unaligned_le##bits(val, p + off); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ +} + +DEFINE_BTRFS_SETGET_BITS(8) +DEFINE_BTRFS_SETGET_BITS(16) +DEFINE_BTRFS_SETGET_BITS(32) +DEFINE_BTRFS_SETGET_BITS(64) + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/kernel/fs/btrfs/super.c b/kernel/fs/btrfs/super.c new file mode 100644 index 000000000..9e66f5e72 --- /dev/null +++ b/kernel/fs/btrfs/super.c @@ -0,0 +1,2220 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "delayed-inode.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "hash.h" +#include "props.h" +#include "xattr.h" +#include "volumes.h" +#include "export.h" +#include "compression.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "free-space-cache.h" +#include "backref.h" +#include "tests/btrfs-tests.h" + +#include "qgroup.h" +#define CREATE_TRACE_POINTS +#include + +static const struct super_operations btrfs_super_ops; +static struct file_system_type btrfs_fs_type; + +static int btrfs_remount(struct super_block *sb, int *flags, char *data); + +static const char *btrfs_decode_error(int errno) +{ + char *errstr = "unknown"; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + case -EEXIST: + errstr = "Object already exists"; + break; + case -ENOSPC: + errstr = "No space left"; + break; + case -ENOENT: + errstr = "No such entry"; + break; + } + + return errstr; +} + +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. Long term we'll + * also send it down to the disk + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); +} + +/* btrfs handle error by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + sb->s_flags |= MS_RDONLY; + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not + * canceled here although there is no way to update + * the progress. It would add the risk of a deadlock, + * therefore the canceling is ommited. The only penalty + * is that some I/O remains active until the procedure + * completes. The next time when the filesystem is + * mounted writeable again, the device replace + * operation continues. + */ + } +} + +#ifdef CONFIG_PRINTK +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the approciate error response. + */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(errno); + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT + "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, function, line, errno, errstr, &vaf); + va_end(args); + } else { + printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", + sb->s_id, function, line, errno, errstr); + } + + /* Don't go through full error handling during mount */ + save_error_info(fs_info); + if (sb->s_flags & MS_BORN) + btrfs_handle_error(fs_info); +} + +static const char * const logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + char lvl[4]; + struct va_format vaf; + va_list args; + const char *type = logtypes[4]; + int kern_level; + + va_start(args, fmt); + + kern_level = printk_get_level(fmt); + if (kern_level) { + size_t size = printk_skip_level(fmt) - fmt; + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + fmt += size; + type = logtypes[kern_level - '0']; + } else + *lvl = '\0'; + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + + va_end(args); +} + +#else + +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + /* Don't go through full error handling during mount */ + if (sb->s_flags & MS_BORN) { + save_error_info(fs_info); + btrfs_handle_error(fs_info); + } +} +#endif + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno) +{ + /* + * Report first abort since mount + */ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, + &root->fs_info->fs_state)) { + WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", + errno); + } + trans->aborted = errno; + /* Nothing used. The other threads that have joined this + * transaction may be able to continue. */ + if (!trans->blocks_used && list_empty(&trans->new_bgs)) { + const char *errstr; + + errstr = btrfs_decode_error(errno); + btrfs_warn(root->fs_info, + "%s:%d: Aborting unused transaction(%s).", + function, line, errstr); + return; + } + ACCESS_ONCE(trans->transaction->aborted) = errno; + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&root->fs_info->transaction_wait); + wake_up(&root->fs_info->transaction_blocked_wait); + __btrfs_std_error(root->fs_info, function, line, errno, NULL); +} +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, + * issues an alert, and either panics or BUGs, depending on mount options. + */ +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char *s_id = ""; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(errno); + if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); + + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); + va_end(args); + /* Caller calls BUG() */ +} + +static void btrfs_put_super(struct super_block *sb) +{ + close_ctree(btrfs_sb(sb)->tree_root); +} + +enum { + Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, + Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, + Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, + Opt_compress_type, Opt_compress_force, Opt_compress_force_type, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, + Opt_no_space_cache, Opt_recovery, Opt_skip_balance, + Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, + Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, + Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, + Opt_datasum, Opt_treelog, Opt_noinode_cache, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_degraded, "degraded"}, + {Opt_subvol, "subvol=%s"}, + {Opt_subvolid, "subvolid=%s"}, + {Opt_device, "device=%s"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_datasum, "datasum"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_datacow, "datacow"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_barrier, "barrier"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, + {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, + {Opt_ssd, "ssd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd, "nossd"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, + {Opt_treelog, "treelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_noflushoncommit, "noflushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_space_cache, "space_cache"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, + {Opt_subvolrootid, "subvolrootid=%d"}, + {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, + {Opt_inode_cache, "inode_cache"}, + {Opt_noinode_cache, "noinode_cache"}, + {Opt_no_space_cache, "nospace_cache"}, + {Opt_recovery, "recovery"}, + {Opt_skip_balance, "skip_balance"}, + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, + {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, + {Opt_fatal_errors, "fatal_errors=%s"}, + {Opt_commit_interval, "commit=%d"}, + {Opt_err, NULL}, +}; + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. + * XXX JDM: This needs to be cleaned up for remount. + */ +int btrfs_parse_options(struct btrfs_root *root, char *options) +{ + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; + char *p, *num, *orig = NULL; + u64 cache_gen; + int intarg; + int ret = 0; + char *compress_type; + bool compress_force = false; + + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (cache_gen) + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + + if (!options) + goto out; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + options = kstrdup(options, GFP_NOFS); + if (!options) + return -ENOMEM; + + orig = options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + btrfs_info(root->fs_info, "allowing degraded mounts"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_subvolid: + case Opt_subvolrootid: + case Opt_device: + /* + * These are parsed by btrfs_parse_early_options + * and can be happily ignored here. + */ + break; + case Opt_nodatasum: + btrfs_set_and_info(root, NODATASUM, + "setting nodatasum"); + break; + case Opt_datasum: + if (btrfs_test_opt(root, NODATASUM)) { + if (btrfs_test_opt(root, NODATACOW)) + btrfs_info(root->fs_info, "setting datasum, datacow enabled"); + else + btrfs_info(root->fs_info, "setting datasum"); + } + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + if (!btrfs_test_opt(root, NODATACOW)) { + if (!btrfs_test_opt(root, COMPRESS) || + !btrfs_test_opt(root, FORCE_COMPRESS)) { + btrfs_info(root->fs_info, + "setting nodatacow, compression disabled"); + } else { + btrfs_info(root->fs_info, "setting nodatacow"); + } + } + btrfs_clear_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_datacow: + btrfs_clear_and_info(root, NODATACOW, + "setting datacow"); + break; + case Opt_compress_force: + case Opt_compress_force_type: + compress_force = true; + /* Fallthrough */ + case Opt_compress: + case Opt_compress_type: + if (token == Opt_compress || + token == Opt_compress_force || + strcmp(args[0].from, "zlib") == 0) { + compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + } else if (strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; + btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + btrfs_set_fs_incompat(info, COMPRESS_LZO); + } else if (strncmp(args[0].from, "no", 2) == 0) { + compress_type = "no"; + btrfs_clear_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); + compress_force = false; + } else { + ret = -EINVAL; + goto out; + } + + if (compress_force) { + btrfs_set_and_info(root, FORCE_COMPRESS, + "force %s compression", + compress_type); + } else { + if (!btrfs_test_opt(root, COMPRESS)) + btrfs_info(root->fs_info, + "btrfs: use %s compression", + compress_type); + /* + * If we remount from compress-force=xxx to + * compress=xxx, we need clear FORCE_COMPRESS + * flag, otherwise, there is no way for users + * to disable forcible compression separately. + */ + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); + } + break; + case Opt_ssd: + btrfs_set_and_info(root, SSD, + "use ssd allocation scheme"); + break; + case Opt_ssd_spread: + btrfs_set_and_info(root, SSD_SPREAD, + "use spread ssd allocation scheme"); + btrfs_set_opt(info->mount_opt, SSD); + break; + case Opt_nossd: + btrfs_set_and_info(root, NOSSD, + "not using ssd allocation scheme"); + btrfs_clear_opt(info->mount_opt, SSD); + break; + case Opt_barrier: + btrfs_clear_and_info(root, NOBARRIER, + "turning on barriers"); + break; + case Opt_nobarrier: + btrfs_set_and_info(root, NOBARRIER, + "turning off barriers"); + break; + case Opt_thread_pool: + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg > 0) { + info->thread_pool_size = intarg; + } else { + ret = -EINVAL; + goto out; + } + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = memparse(num, NULL); + kfree(num); + + if (info->max_inline) { + info->max_inline = min_t(u64, + info->max_inline, + root->sectorsize); + } + btrfs_info(root->fs_info, "max_inline at %llu", + info->max_inline); + } else { + ret = -ENOMEM; + goto out; + } + break; + case Opt_alloc_start: + num = match_strdup(&args[0]); + if (num) { + mutex_lock(&info->chunk_mutex); + info->alloc_start = memparse(num, NULL); + mutex_unlock(&info->chunk_mutex); + kfree(num); + btrfs_info(root->fs_info, "allocations start at %llu", + info->alloc_start); + } else { + ret = -ENOMEM; + goto out; + } + break; + case Opt_acl: +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + root->fs_info->sb->s_flags |= MS_POSIXACL; + break; +#else + btrfs_err(root->fs_info, + "support for ACL not compiled in!"); + ret = -EINVAL; + goto out; +#endif + case Opt_noacl: + root->fs_info->sb->s_flags &= ~MS_POSIXACL; + break; + case Opt_notreelog: + btrfs_set_and_info(root, NOTREELOG, + "disabling tree log"); + break; + case Opt_treelog: + btrfs_clear_and_info(root, NOTREELOG, + "enabling tree log"); + break; + case Opt_flushoncommit: + btrfs_set_and_info(root, FLUSHONCOMMIT, + "turning on flush-on-commit"); + break; + case Opt_noflushoncommit: + btrfs_clear_and_info(root, FLUSHONCOMMIT, + "turning off flush-on-commit"); + break; + case Opt_ratio: + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg >= 0) { + info->metadata_ratio = intarg; + btrfs_info(root->fs_info, "metadata ratio %d", + info->metadata_ratio); + } else { + ret = -EINVAL; + goto out; + } + break; + case Opt_discard: + btrfs_set_and_info(root, DISCARD, + "turning on discard"); + break; + case Opt_nodiscard: + btrfs_clear_and_info(root, DISCARD, + "turning off discard"); + break; + case Opt_space_cache: + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); + break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); + break; + case Opt_no_space_cache: + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); + break; + case Opt_inode_cache: + btrfs_set_pending_and_info(info, INODE_MAP_CACHE, + "enabling inode map caching"); + break; + case Opt_noinode_cache: + btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, + "disabling inode map caching"); + break; + case Opt_clear_cache: + btrfs_set_and_info(root, CLEAR_CACHE, + "force clearing of disk cache"); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; + case Opt_noenospc_debug: + btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + btrfs_set_and_info(root, AUTO_DEFRAG, + "enabling auto defrag"); + break; + case Opt_nodefrag: + btrfs_clear_and_info(root, AUTO_DEFRAG, + "disabling auto defrag"); + break; + case Opt_recovery: + btrfs_info(root->fs_info, "enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; + case Opt_skip_balance: + btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + break; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + case Opt_check_integrity_including_extent_data: + btrfs_info(root->fs_info, + "enabling check integrity including extent data"); + btrfs_set_opt(info->mount_opt, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity: + btrfs_info(root->fs_info, "enabling check integrity"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity_print_mask: + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg >= 0) { + info->check_integrity_print_mask = intarg; + btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x", + info->check_integrity_print_mask); + } else { + ret = -EINVAL; + goto out; + } + break; +#else + case Opt_check_integrity_including_extent_data: + case Opt_check_integrity: + case Opt_check_integrity_print_mask: + btrfs_err(root->fs_info, + "support for check_integrity* not compiled in!"); + ret = -EINVAL; + goto out; +#endif + case Opt_fatal_errors: + if (strcmp(args[0].from, "panic") == 0) + btrfs_set_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else if (strcmp(args[0].from, "bug") == 0) + btrfs_clear_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else { + ret = -EINVAL; + goto out; + } + break; + case Opt_commit_interval: + intarg = 0; + ret = match_int(&args[0], &intarg); + if (ret < 0) { + btrfs_err(root->fs_info, "invalid commit interval"); + ret = -EINVAL; + goto out; + } + if (intarg > 0) { + if (intarg > 300) { + btrfs_warn(root->fs_info, "excessive commit interval %d", + intarg); + } + info->commit_interval = intarg; + } else { + btrfs_info(root->fs_info, "using default commit interval %ds", + BTRFS_DEFAULT_COMMIT_INTERVAL); + info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } + break; + case Opt_err: + btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); + ret = -EINVAL; + goto out; + default: + break; + } + } +out: + if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + btrfs_info(root->fs_info, "disk space caching is enabled"); + kfree(orig); + return ret; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. + */ +static int btrfs_parse_early_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, u64 *subvol_objectid, + struct btrfs_fs_devices **fs_devices) +{ + substring_t args[MAX_OPT_ARGS]; + char *device_name, *opts, *orig, *p; + char *num = NULL; + int error = 0; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + kfree(*subvol_name); + *subvol_name = match_strdup(&args[0]); + if (!*subvol_name) { + error = -ENOMEM; + goto out; + } + break; + case Opt_subvolid: + num = match_strdup(&args[0]); + if (num) { + *subvol_objectid = memparse(num, NULL); + kfree(num); + /* we want the original fs_tree */ + if (!*subvol_objectid) + *subvol_objectid = + BTRFS_FS_TREE_OBJECTID; + } else { + error = -EINVAL; + goto out; + } + break; + case Opt_subvolrootid: + printk(KERN_WARNING + "BTRFS: 'subvolrootid' mount option is deprecated and has " + "no effect\n"); + break; + case Opt_device: + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, + flags, holder, fs_devices); + kfree(device_name); + if (error) + goto out; + break; + default: + break; + } + } + +out: + kfree(orig); + return error; +} + +static struct dentry *get_default_root(struct super_block *sb, + u64 subvol_objectid) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + struct inode *inode; + u64 dir_id; + int new = 0; + + /* + * We have a specific subvol we want to mount, just setup location and + * go look up the root. + */ + if (subvol_objectid) { + location.objectid = subvol_objectid; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + goto find_root; + } + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + path->leave_spinning = 1; + + /* + * Find the "default" dir item which points to the root item that we + * will mount by default if we haven't been given a specific subvolume + * to mount. + */ + dir_id = btrfs_super_root_dir(fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + if (IS_ERR(di)) { + btrfs_free_path(path); + return ERR_CAST(di); + } + if (!di) { + /* + * Ok the default dir item isn't there. This is weird since + * it's always been there, but don't freak out, just try and + * mount to root most subvolume. + */ + btrfs_free_path(path); + dir_id = BTRFS_FIRST_FREE_OBJECTID; + new_root = fs_info->fs_root; + goto setup_root; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + btrfs_free_path(path); + +find_root: + new_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (IS_ERR(new_root)) + return ERR_CAST(new_root); + + if (!(sb->s_flags & MS_RDONLY)) { + int ret; + down_read(&fs_info->cleanup_work_sem); + ret = btrfs_orphan_cleanup(new_root); + up_read(&fs_info->cleanup_work_sem); + if (ret) + return ERR_PTR(ret); + } + + dir_id = btrfs_root_dirid(&new_root->root_item); +setup_root: + location.objectid = dir_id; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + inode = btrfs_iget(sb, &location, new_root, &new); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + /* + * If we're just mounting the root most subvol put the inode and return + * a reference to the dentry. We will have already gotten a reference + * to the inode in btrfs_fill_super so we're good to go. + */ + if (!new && d_inode(sb->s_root) == inode) { + iput(inode); + return dget(sb->s_root); + } + + return d_obtain_root(inode); +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data, int silent) +{ + struct inode *inode; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_key key; + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_d_op = &btrfs_dentry_operations; + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + sb->s_flags |= MS_I_VERSION; + err = open_ctree(sb, fs_devices, (char *)data); + if (err) { + printk(KERN_ERR "BTRFS: open_ctree failed\n"); + return err; + } + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail_close; + } + + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + err = -ENOMEM; + goto fail_close; + } + + save_mount_options(sb, data); + cleancache_init_fs(sb); + sb->s_flags |= MS_ACTIVE; + return 0; + +fail_close: + close_ctree(fs_info->tree_root); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; + + trace_btrfs_sync_fs(wait); + + if (!wait) { + filemap_flush(fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_wait_ordered_roots(fs_info, -1); + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) { + /* + * Exit unless we have some pending changes + * that need to go through commit + */ + if (fs_info->pending_changes == 0) + return 0; + /* + * A non-blocking test if the fs is frozen. We must not + * start a new transaction here otherwise a deadlock + * happens. The pending operations are delayed to the + * next commit after thawing. + */ + if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) + __sb_end_write(sb, SB_FREEZE_WRITE); + else + return 0; + trans = btrfs_start_transaction(root, 0); + } + if (IS_ERR(trans)) + return PTR_ERR(trans); + } + return btrfs_commit_transaction(trans, root); +} + +static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = info->tree_root; + char *compress_type; + + if (btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) + seq_printf(seq, ",max_inline=%llu", info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) { + if (info->compress_type == BTRFS_COMPRESS_ZLIB) + compress_type = "zlib"; + else + compress_type = "lzo"; + if (btrfs_test_opt(root, FORCE_COMPRESS)) + seq_printf(seq, ",compress-force=%s", compress_type); + else + seq_printf(seq, ",compress=%s", compress_type); + } + if (btrfs_test_opt(root, NOSSD)) + seq_puts(seq, ",nossd"); + if (btrfs_test_opt(root, SSD_SPREAD)) + seq_puts(seq, ",ssd_spread"); + else if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",notreelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flushoncommit"); + if (btrfs_test_opt(root, DISCARD)) + seq_puts(seq, ",discard"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + if (btrfs_test_opt(root, SPACE_CACHE)) + seq_puts(seq, ",space_cache"); + else + seq_puts(seq, ",nospace_cache"); + if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + seq_puts(seq, ",rescan_uuid_tree"); + if (btrfs_test_opt(root, CLEAR_CACHE)) + seq_puts(seq, ",clear_cache"); + if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + seq_puts(seq, ",user_subvol_rm_allowed"); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) + seq_puts(seq, ",enospc_debug"); + if (btrfs_test_opt(root, AUTO_DEFRAG)) + seq_puts(seq, ",autodefrag"); + if (btrfs_test_opt(root, INODE_MAP_CACHE)) + seq_puts(seq, ",inode_cache"); + if (btrfs_test_opt(root, SKIP_BALANCE)) + seq_puts(seq, ",skip_balance"); + if (btrfs_test_opt(root, RECOVERY)) + seq_puts(seq, ",recovery"); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + seq_puts(seq, ",check_int_data"); + else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + seq_puts(seq, ",check_int"); + if (info->check_integrity_print_mask) + seq_printf(seq, ",check_int_print_mask=%d", + info->check_integrity_print_mask); +#endif + if (info->metadata_ratio) + seq_printf(seq, ",metadata_ratio=%d", + info->metadata_ratio); + if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + seq_puts(seq, ",fatal_errors=panic"); + if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) + seq_printf(seq, ",commit=%d", info->commit_interval); + return 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_fs_info *p = data; + struct btrfs_fs_info *fs_info = btrfs_sb(s); + + return fs_info->fs_devices == p->fs_devices; +} + +static int btrfs_set_super(struct super_block *s, void *data) +{ + int err = set_anon_super(s, data); + if (!err) + s->s_fs_info = data; + return err; +} + +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + +/* + * This will strip out the subvol=%s argument for an argument string and add + * subvolid=0 to make sure we get the actual tree root for path walking to the + * subvol we want. + */ +static char *setup_root_args(char *args) +{ + unsigned len = strlen(args) + 2 + 1; + char *src, *dst, *buf; + + /* + * We need the same args as before, but with this substitution: + * s!subvol=[^,]+!subvolid=0! + * + * Since the replacement string is up to 2 bytes longer than the + * original, allocate strlen(args) + 2 + 1 bytes. + */ + + src = strstr(args, "subvol="); + /* This shouldn't happen, but just in case.. */ + if (!src) + return NULL; + + buf = dst = kmalloc(len, GFP_NOFS); + if (!buf) + return NULL; + + /* + * If the subvol= arg is not at the start of the string, + * copy whatever precedes it into buf. + */ + if (src != args) { + *src++ = '\0'; + strcpy(buf, args); + dst += strlen(args); + } + + strcpy(dst, "subvolid=0"); + dst += strlen("subvolid=0"); + + /* + * If there is a "," after the original subvol=... string, + * copy that suffix into our buffer. Otherwise, we're done. + */ + src = strchr(src, ','); + if (src) + strcpy(dst, src); + + return buf; +} + +static struct dentry *mount_subvol(const char *subvol_name, int flags, + const char *device_name, char *data) +{ + struct dentry *root; + struct vfsmount *mnt; + char *newargs; + + newargs = setup_root_args(data); + if (!newargs) + return ERR_PTR(-ENOMEM); + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, + newargs); + + if (PTR_RET(mnt) == -EBUSY) { + if (flags & MS_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, + newargs); + } else { + int r; + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, + newargs); + if (IS_ERR(mnt)) { + kfree(newargs); + return ERR_CAST(mnt); + } + + r = btrfs_remount(mnt->mnt_sb, &flags, NULL); + if (r < 0) { + /* FIXME: release vfsmount mnt ??*/ + kfree(newargs); + return ERR_PTR(r); + } + } + } + + kfree(newargs); + + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + root = mount_subtree(mnt, subvol_name); + + if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); + printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + } + + return root; +} + +static int parse_security_options(char *orig_opts, + struct security_mnt_opts *sec_opts) +{ + char *secdata = NULL; + int ret = 0; + + secdata = alloc_secdata(); + if (!secdata) + return -ENOMEM; + ret = security_sb_copy_data(orig_opts, secdata); + if (ret) { + free_secdata(secdata); + return ret; + } + ret = security_sb_parse_opts_str(secdata, sec_opts); + free_secdata(secdata); + return ret; +} + +static int setup_security_options(struct btrfs_fs_info *fs_info, + struct super_block *sb, + struct security_mnt_opts *sec_opts) +{ + int ret = 0; + + /* + * Call security_sb_set_mnt_opts() to check whether new sec_opts + * is valid. + */ + ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL); + if (ret) + return ret; + +#ifdef CONFIG_SECURITY + if (!fs_info->security_opts.num_mnt_opts) { + /* first time security setup, copy sec_opts to fs_info */ + memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); + } else { + /* + * Since SELinux(the only one supports security_mnt_opts) does + * NOT support changing context during remount/mount same sb, + * This must be the same or part of the same security options, + * just free it. + */ + security_free_mnt_opts(sec_opts); + } +#endif + return ret; +} + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. + */ +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) +{ + struct block_device *bdev = NULL; + struct super_block *s; + struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_fs_info *fs_info = NULL; + struct security_mnt_opts new_sec_opts; + fmode_t mode = FMODE_READ; + char *subvol_name = NULL; + u64 subvol_objectid = 0; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + &subvol_name, &subvol_objectid, + &fs_devices); + if (error) { + kfree(subvol_name); + return ERR_PTR(error); + } + + if (subvol_name) { + root = mount_subvol(subvol_name, flags, device_name, data); + kfree(subvol_name); + return root; + } + + security_init_mnt_opts(&new_sec_opts); + if (data) { + error = parse_security_options(data, &new_sec_opts); + if (error) + return ERR_PTR(error); + } + + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); + if (error) + goto error_sec_opts; + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * it for searching for existing supers, so this lets us do that and + * then open_ctree will properly initialize everything later. + */ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + if (!fs_info) { + error = -ENOMEM; + goto error_sec_opts; + } + + fs_info->fs_devices = fs_devices; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + security_init_mnt_opts(&fs_info->security_opts); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + fs_info); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } + + if (s->s_root) { + btrfs_close_devices(fs_devices); + free_fs_info(fs_info); + if ((flags ^ s->s_flags) & MS_RDONLY) + error = -EBUSY; + } else { + char b[BDEVNAME_SIZE]; + + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + btrfs_sb(s)->bdev_holder = fs_type; + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 1 : 0); + } + + root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); + if (IS_ERR(root)) { + deactivate_locked_super(s); + error = PTR_ERR(root); + goto error_sec_opts; + } + + fs_info = btrfs_sb(s); + error = setup_security_options(fs_info, s, &new_sec_opts); + if (error) { + dput(root); + deactivate_locked_super(s); + goto error_sec_opts; + } + + return root; + +error_close_devices: + btrfs_close_devices(fs_devices); +error_fs_info: + free_fs_info(fs_info); +error_sec_opts: + security_free_mnt_opts(&new_sec_opts); + return ERR_PTR(error); +} + +static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, + int new_pool_size, int old_pool_size) +{ + if (new_pool_size == old_pool_size) + return; + + fs_info->thread_pool_size = new_pool_size; + + btrfs_info(fs_info, "resize thread pool %d -> %d", + old_pool_size, new_pool_size); + + btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, + new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); + btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, + new_pool_size); +} + +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) +{ + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} + +static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (flags & MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + if (flags & MS_RDONLY) + sync_filesystem(fs_info->sb); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long old_opts) +{ + /* + * We need cleanup all defragable inodes if the autodefragment is + * close or the fs is R/O. + */ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (fs_info->sb->s_flags & MS_RDONLY))) { + btrfs_cleanup_defrag_inodes(fs_info); + } + + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; + unsigned old_flags = sb->s_flags; + unsigned long old_opts = fs_info->mount_opt; + unsigned long old_compress_type = fs_info->compress_type; + u64 old_max_inline = fs_info->max_inline; + u64 old_alloc_start = fs_info->alloc_start; + int old_thread_pool_size = fs_info->thread_pool_size; + unsigned int old_metadata_ratio = fs_info->metadata_ratio; + int ret; + + sync_filesystem(sb); + btrfs_remount_prepare(fs_info); + + if (data) { + struct security_mnt_opts new_sec_opts; + + security_init_mnt_opts(&new_sec_opts); + ret = parse_security_options(data, &new_sec_opts); + if (ret) + goto restore; + ret = setup_security_options(fs_info, sb, + &new_sec_opts); + if (ret) { + security_free_mnt_opts(&new_sec_opts); + goto restore; + } + } + + ret = btrfs_parse_options(root, data); + if (ret) { + ret = -EINVAL; + goto restore; + } + + btrfs_remount_begin(fs_info, old_opts, *flags); + btrfs_resize_thread_pool(fs_info, + fs_info->thread_pool_size, old_thread_pool_size); + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out; + + if (*flags & MS_RDONLY) { + /* + * this also happens on 'umount -rf' or on shutdown, when + * the filesystem is busy. + */ + cancel_work_sync(&fs_info->async_reclaim_work); + + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); + + sb->s_flags |= MS_RDONLY; + + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); + + ret = btrfs_commit_super(root); + if (ret) + goto restore; + } else { + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + btrfs_err(fs_info, + "Remounting read-write after error is not allowed"); + ret = -EINVAL; + goto restore; + } + if (fs_info->fs_devices->rw_devices == 0) { + ret = -EACCES; + goto restore; + } + + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(*flags & MS_RDONLY)) { + btrfs_warn(fs_info, + "too many missing devices, writeable remount is not allowed"); + ret = -EACCES; + goto restore; + } + + if (btrfs_super_log_root(fs_info->super_copy) != 0) { + ret = -EINVAL; + goto restore; + } + + ret = btrfs_cleanup_fs_roots(fs_info); + if (ret) + goto restore; + + /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); + ret = btrfs_recover_relocation(root); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret) + goto restore; + + ret = btrfs_resume_balance_async(fs_info); + if (ret) + goto restore; + + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + btrfs_warn(fs_info, "failed to resume dev_replace"); + goto restore; + } + + if (!fs_info->uuid_root) { + btrfs_info(fs_info, "creating UUID tree"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); + goto restore; + } + } + sb->s_flags &= ~MS_RDONLY; + } +out: + wake_up_process(fs_info->transaction_kthread); + btrfs_remount_cleanup(fs_info, old_opts); + return 0; + +restore: + /* We've hit an error - don't reset MS_RDONLY */ + if (sb->s_flags & MS_RDONLY) + old_flags |= MS_RDONLY; + sb->s_flags = old_flags; + fs_info->mount_opt = old_opts; + fs_info->compress_type = old_compress_type; + fs_info->max_inline = old_max_inline; + mutex_lock(&fs_info->chunk_mutex); + fs_info->alloc_start = old_alloc_start; + mutex_unlock(&fs_info->chunk_mutex); + btrfs_resize_thread_pool(fs_info, + old_thread_pool_size, fs_info->thread_pool_size); + fs_info->metadata_ratio = old_metadata_ratio; + btrfs_remount_cleanup(fs_info, old_opts); + return ret; +} + +/* Used to sort the devices by max_avail(descending sort) */ +static int btrfs_cmp_device_free_bytes(const void *dev_info1, + const void *dev_info2) +{ + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +/* + * The helper to calc the free space on the devices that can be used to store + * file data. + */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1, num_stripes = 1; + int i = 0, nr_devices; + int ret; + + /* + * We aren't under the device list lock, so this is racey-ish, but good + * enough for our purposes. + */ + nr_devices = fs_info->fs_devices->open_devices; + if (!nr_devices) { + smp_mb(); + nr_devices = fs_info->fs_devices->open_devices; + ASSERT(nr_devices); + if (!nr_devices) { + *free_bytes = 0; + return 0; + } + } + + devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space alloction */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) { + min_stripes = 2; + num_stripes = nr_devices; + } else if (type & BTRFS_BLOCK_GROUP_RAID1) { + min_stripes = 2; + num_stripes = 2; + } else if (type & BTRFS_BLOCK_GROUP_RAID10) { + min_stripes = 4; + num_stripes = 4; + } + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + if (fs_info->alloc_start) + mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { + if (!device->in_fs_metadata || !device->bdev || + device->is_tgtdev_for_dev_replace) + continue; + + if (i >= nr_devices) + break; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwritting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start && + fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) { + rcu_read_unlock(); + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs can not use the free space in + * [0, skip_space - 1], we must subtract it from the + * total. In order to implement it, we account the used + * space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, + skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + mutex_unlock(&fs_devices->device_list_mutex); + return ret; + } + + rcu_read_lock(); + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + } + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total. + */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + rcu_read_unlock(); + if (fs_info->alloc_start) + mutex_unlock(&fs_devices->device_list_mutex); + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (num_stripes > nr_devices) + num_stripes = nr_devices; + + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * num_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - num_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + +/* + * Calculate numbers for 'df', pessimistic in case of mixed raid profiles. + * + * If there's a redundant raid level at DATA block groups, use the respective + * multiplier to scale the sizes. + * + * Unused device space usage is based on simulating the chunk allocator + * algorithm that respects the device sizes, order of allocations and the + * 'alloc_start' value, this is a close approximation of the actual use but + * there are other factors that may change the result (like a new metadata + * chunk). + * + * FIXME: not accurate for mixed block groups, total and free/used are ok, + * available appears slightly larger. + */ +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct list_head *head = &fs_info->space_info; + struct btrfs_space_info *found; + u64 total_used = 0; + u64 total_free_data = 0; + int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)fs_info->fsid; + unsigned factor = 1; + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + int ret; + + /* + * holding chunk_muext to avoid allocating new chunks, holding + * device_list_mutex to avoid the device being removed + */ + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + int i; + + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!list_empty(&found->block_groups[i])) { + switch (i) { + case BTRFS_RAID_DUP: + case BTRFS_RAID_RAID1: + case BTRFS_RAID_RAID10: + factor = 2; + } + } + } + } + + total_used += found->disk_used; + } + + rcu_read_unlock(); + + buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); + buf->f_blocks >>= bits; + buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); + + /* Account global block reserve as used, it's in logical size already */ + spin_lock(&block_rsv->lock); + buf->f_bfree -= block_rsv->size >> bits; + spin_unlock(&block_rsv->lock); + + buf->f_bavail = div_u64(total_free_data, factor); + ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); + if (ret) + return ret; + buf->f_bavail += div_u64(total_free_data, factor); + buf->f_bavail = buf->f_bavail >> bits; + + buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_namelen = BTRFS_NAME_LEN; + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid; + + return 0; +} + +static void btrfs_kill_super(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + kill_anon_super(sb); + free_fs_info(fs_info); +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = btrfs_mount, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, +}; +MODULE_ALIAS_FS("btrfs"); + +static int btrfs_control_open(struct inode *inode, struct file *file) +{ + /* + * The control file's private_data is used to hold the + * transaction when it is started and is used to keep + * track of whether a transaction is already in progress. + */ + file->private_data = NULL; + return 0; +} + +/* + * used by btrfsctl to scan devices when no FS is mounted + */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); + + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + break; + case BTRFS_IOC_DEVICES_READY: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + if (ret) + break; + ret = !(fs_devices->num_devices == fs_devices->total_devices); + break; + } + + kfree(vol); + return ret; +} + +static int btrfs_freeze(struct super_block *sb) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_sb(sb)->tree_root; + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + return btrfs_commit_transaction(trans, root); +} + +static int btrfs_show_devname(struct seq_file *m, struct dentry *root) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); + struct btrfs_fs_devices *cur_devices; + struct btrfs_device *dev, *first_dev = NULL; + struct list_head *head; + struct rcu_string *name; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + cur_devices = fs_info->fs_devices; + while (cur_devices) { + head = &cur_devices->devices; + list_for_each_entry(dev, head, dev_list) { + if (dev->missing) + continue; + if (!dev->name) + continue; + if (!first_dev || dev->devid < first_dev->devid) + first_dev = dev; + } + cur_devices = cur_devices->seed; + } + + if (first_dev) { + rcu_read_lock(); + name = rcu_dereference(first_dev->name); + seq_escape(m, name->str, " \t\n\\"); + rcu_read_unlock(); + } else { + WARN_ON(1); + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return 0; +} + +static const struct super_operations btrfs_super_ops = { + .drop_inode = btrfs_drop_inode, + .evict_inode = btrfs_evict_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, + .show_options = btrfs_show_options, + .show_devname = btrfs_show_devname, + .write_inode = btrfs_write_inode, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .freeze_fs = btrfs_freeze, +}; + +static const struct file_operations btrfs_ctl_fops = { + .open = btrfs_control_open, + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = btrfs_control_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice btrfs_misc = { + .minor = BTRFS_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + +static int btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static void btrfs_interface_exit(void) +{ + if (misc_deregister(&btrfs_misc) < 0) + printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n"); +} + +static void btrfs_print_info(void) +{ + printk(KERN_INFO "Btrfs loaded" +#ifdef CONFIG_BTRFS_DEBUG + ", debug=on" +#endif +#ifdef CONFIG_BTRFS_ASSERT + ", assert=on" +#endif +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + ", integrity-checker=on" +#endif + "\n"); +} + +static int btrfs_run_sanity_tests(void) +{ + int ret; + + ret = btrfs_init_test_fs(); + if (ret) + return ret; + + ret = btrfs_test_free_space_cache(); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(); + if (ret) + goto out; + ret = btrfs_test_extent_io(); + if (ret) + goto out; + ret = btrfs_test_inodes(); + if (ret) + goto out; + ret = btrfs_test_qgroups(); +out: + btrfs_destroy_test_fs(); + return ret; +} + +static int __init init_btrfs_fs(void) +{ + int err; + + err = btrfs_hash_init(); + if (err) + return err; + + btrfs_props_init(); + + err = btrfs_init_sysfs(); + if (err) + goto free_hash; + + btrfs_init_compress(); + + err = btrfs_init_cachep(); + if (err) + goto free_compress; + + err = extent_io_init(); + if (err) + goto free_cachep; + + err = extent_map_init(); + if (err) + goto free_extent_io; + + err = ordered_data_init(); + if (err) + goto free_extent_map; + + err = btrfs_delayed_inode_init(); + if (err) + goto free_ordered_data; + + err = btrfs_auto_defrag_init(); + if (err) + goto free_delayed_inode; + + err = btrfs_delayed_ref_init(); + if (err) + goto free_auto_defrag; + + err = btrfs_prelim_ref_init(); + if (err) + goto free_delayed_ref; + + err = btrfs_end_io_wq_init(); + if (err) + goto free_prelim_ref; + + err = btrfs_interface_init(); + if (err) + goto free_end_io_wq; + + btrfs_init_lockdep(); + + btrfs_print_info(); + + err = btrfs_run_sanity_tests(); + if (err) + goto unregister_ioctl; + + err = register_filesystem(&btrfs_fs_type); + if (err) + goto unregister_ioctl; + + return 0; + +unregister_ioctl: + btrfs_interface_exit(); +free_end_io_wq: + btrfs_end_io_wq_exit(); +free_prelim_ref: + btrfs_prelim_ref_exit(); +free_delayed_ref: + btrfs_delayed_ref_exit(); +free_auto_defrag: + btrfs_auto_defrag_exit(); +free_delayed_inode: + btrfs_delayed_inode_exit(); +free_ordered_data: + ordered_data_exit(); +free_extent_map: + extent_map_exit(); +free_extent_io: + extent_io_exit(); +free_cachep: + btrfs_destroy_cachep(); +free_compress: + btrfs_exit_compress(); + btrfs_exit_sysfs(); +free_hash: + btrfs_hash_exit(); + return err; +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_destroy_cachep(); + btrfs_delayed_ref_exit(); + btrfs_auto_defrag_exit(); + btrfs_delayed_inode_exit(); + btrfs_prelim_ref_exit(); + ordered_data_exit(); + extent_map_exit(); + extent_io_exit(); + btrfs_interface_exit(); + btrfs_end_io_wq_exit(); + unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); + btrfs_exit_compress(); + btrfs_hash_exit(); +} + +late_initcall(init_btrfs_fs); +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/btrfs/sysfs.c b/kernel/fs/btrfs/sysfs.c new file mode 100644 index 000000000..e8a4c86d2 --- /dev/null +++ b/kernel/fs/btrfs/sysfs.c @@ -0,0 +1,758 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "sysfs.h" +#include "volumes.h" + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); + +static u64 get_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + return btrfs_super_compat_flags(disk_super); + else if (set == FEAT_COMPAT_RO) + return btrfs_super_compat_ro_flags(disk_super); + else + return btrfs_super_incompat_flags(disk_super); +} + +static void set_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set, u64 features) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + btrfs_set_super_compat_flags(disk_super, features); + else if (set == FEAT_COMPAT_RO) + btrfs_set_super_compat_ro_flags(disk_super, features); + else + btrfs_set_super_incompat_flags(disk_super, features); +} + +static int can_modify_feature(struct btrfs_feature_attr *fa) +{ + int val = 0; + u64 set, clear; + switch (fa->feature_set) { + case FEAT_COMPAT: + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + break; + case FEAT_COMPAT_RO: + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + break; + case FEAT_INCOMPAT: + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + break; + default: + printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n", + fa->feature_set); + return 0; + } + + if (set & fa->feature_bit) + val |= 1; + if (clear & fa->feature_bit) + val |= 2; + + return val; +} + +static ssize_t btrfs_feature_attr_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val = 0; + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + if (fs_info) { + u64 features = get_features(fs_info, fa->feature_set); + if (features & fa->feature_bit) + val = 1; + } else + val = can_modify_feature(fa); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} + +static ssize_t btrfs_feature_attr_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t count) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + u64 features, set, clear; + unsigned long val; + int ret; + + fs_info = to_fs_info(kobj); + if (!fs_info) + return -EPERM; + + ret = kstrtoul(skip_spaces(buf), 0, &val); + if (ret) + return ret; + + if (fa->feature_set == FEAT_COMPAT) { + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + } else if (fa->feature_set == FEAT_COMPAT_RO) { + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + } else { + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + } + + features = get_features(fs_info, fa->feature_set); + + /* Nothing to do */ + if ((val && (features & fa->feature_bit)) || + (!val && !(features & fa->feature_bit))) + return count; + + if ((val && !(set & fa->feature_bit)) || + (!val && !(clear & fa->feature_bit))) { + btrfs_info(fs_info, + "%sabling feature %s on mounted fs is not supported.", + val ? "En" : "Dis", fa->kobj_attr.attr.name); + return -EPERM; + } + + btrfs_info(fs_info, "%s %s feature flag", + val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); + + spin_lock(&fs_info->super_lock); + features = get_features(fs_info, fa->feature_set); + if (val) + features |= fa->feature_bit; + else + features &= ~fa->feature_bit; + set_features(fs_info, fa->feature_set, features); + spin_unlock(&fs_info->super_lock); + + /* + * We don't want to do full transaction commit from inside sysfs + */ + btrfs_set_pending(fs_info, COMMIT); + wake_up_process(fs_info->transaction_kthread); + + return count; +} + +static umode_t btrfs_feature_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + umode_t mode = attr->mode; + + if (fs_info) { + struct btrfs_feature_attr *fa; + u64 features; + + fa = attr_to_btrfs_feature_attr(attr); + features = get_features(fs_info, fa->feature_set); + + if (can_modify_feature(fa)) + mode |= S_IWUSR; + else if (!(features & fa->feature_bit)) + mode = 0; + } + + return mode; +} + +BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); +BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); +BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); +BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); +BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); +BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); + +static struct attribute *btrfs_supported_feature_attrs[] = { + BTRFS_FEAT_ATTR_PTR(mixed_backref), + BTRFS_FEAT_ATTR_PTR(default_subvol), + BTRFS_FEAT_ATTR_PTR(mixed_groups), + BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(big_metadata), + BTRFS_FEAT_ATTR_PTR(extended_iref), + BTRFS_FEAT_ATTR_PTR(raid56), + BTRFS_FEAT_ATTR_PTR(skinny_metadata), + BTRFS_FEAT_ATTR_PTR(no_holes), + NULL +}; + +static const struct attribute_group btrfs_feature_attr_group = { + .name = "features", + .is_visible = btrfs_feature_visible, + .attrs = btrfs_supported_feature_attrs, +}; + +static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) +{ + u64 val; + if (lock) + spin_lock(lock); + val = *value_ptr; + if (lock) + spin_unlock(lock); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static ssize_t global_rsv_size_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_size, global_rsv_size_show); + +static ssize_t global_rsv_reserved_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); + +#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) +#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); +BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + +{ + struct btrfs_space_info *sinfo = to_space_info(kobj->parent); + struct btrfs_block_group_cache *block_group; + int index = to_raid_kobj(kobj)->raid_type; + u64 val = 0; + + down_read(&sinfo->groups_sem); + list_for_each_entry(block_group, &sinfo->block_groups[index], list) { + if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + val += block_group->key.offset; + else + val += btrfs_block_group_used(&block_group->item); + } + up_read(&sinfo->groups_sem); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static struct attribute *raid_attributes[] = { + BTRFS_RAID_ATTR_PTR(total_bytes), + BTRFS_RAID_ATTR_PTR(used_bytes), + NULL +}; + +static void release_raid_kobj(struct kobject *kobj) +{ + kfree(to_raid_kobj(kobj)); +} + +struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_attrs = raid_attributes, +}; + +#define SPACE_INFO_ATTR(field) \ +static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_space_info *sinfo = to_space_info(kobj); \ + return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ +} \ +BTRFS_ATTR(field, btrfs_space_info_show_##field) + +static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); + return snprintf(buf, PAGE_SIZE, "%lld\n", val); +} + +SPACE_INFO_ATTR(flags); +SPACE_INFO_ATTR(total_bytes); +SPACE_INFO_ATTR(bytes_used); +SPACE_INFO_ATTR(bytes_pinned); +SPACE_INFO_ATTR(bytes_reserved); +SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(disk_used); +SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); + +static struct attribute *space_info_attrs[] = { + BTRFS_ATTR_PTR(flags), + BTRFS_ATTR_PTR(total_bytes), + BTRFS_ATTR_PTR(bytes_used), + BTRFS_ATTR_PTR(bytes_pinned), + BTRFS_ATTR_PTR(bytes_reserved), + BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(disk_used), + BTRFS_ATTR_PTR(disk_total), + BTRFS_ATTR_PTR(total_bytes_pinned), + NULL, +}; + +static void space_info_release(struct kobject *kobj) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + percpu_counter_destroy(&sinfo->total_bytes_pinned); + kfree(sinfo); +} + +struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_attrs = space_info_attrs, +}; + +static const struct attribute *allocation_attrs[] = { + BTRFS_ATTR_PTR(global_rsv_reserved), + BTRFS_ATTR_PTR(global_rsv_size), + NULL, +}; + +static ssize_t btrfs_label_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + char *label = fs_info->super_copy->label; + return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); +} + +static ssize_t btrfs_label_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + size_t p_len; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + /* + * p_len is the len until the first occurrence of either + * '\n' or '\0' + */ + p_len = strcspn(buf, "\n"); + + if (p_len >= BTRFS_LABEL_SIZE) + return -EINVAL; + + spin_lock(&fs_info->super_lock); + memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); + memcpy(fs_info->super_copy->label, buf, p_len); + spin_unlock(&fs_info->super_lock); + + /* + * We don't want to do full transaction commit from inside sysfs + */ + btrfs_set_pending(fs_info, COMMIT); + wake_up_process(fs_info->transaction_kthread); + + return len; +} +BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); + +static ssize_t btrfs_nodesize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); +} + +BTRFS_ATTR(nodesize, btrfs_nodesize_show); + +static ssize_t btrfs_sectorsize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); + +static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); + +static struct attribute *btrfs_attrs[] = { + BTRFS_ATTR_PTR(label), + BTRFS_ATTR_PTR(nodesize), + BTRFS_ATTR_PTR(sectorsize), + BTRFS_ATTR_PTR(clone_alignment), + NULL, +}; + +static void btrfs_release_super_kobj(struct kobject *kobj) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + complete(&fs_info->kobj_unregister); +} + +static struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_super_kobj, + .default_attrs = btrfs_attrs, +}; + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_info, super_kobj); +} + +#define NUM_FEATURE_BITS 64 +static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; +static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; + +static const u64 supported_feature_masks[3] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, + [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, +}; + +static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) +{ + int set; + + for (set = 0; set < FEAT_MAX; set++) { + int i; + struct attribute *attrs[2]; + struct attribute_group agroup = { + .name = "features", + .attrs = attrs, + }; + u64 features = get_features(fs_info, set); + features &= ~supported_feature_masks[set]; + + if (!features) + continue; + + attrs[1] = NULL; + for (i = 0; i < NUM_FEATURE_BITS; i++) { + struct btrfs_feature_attr *fa; + + if (!(features & (1ULL << i))) + continue; + + fa = &btrfs_feature_attrs[set][i]; + attrs[0] = &fa->kobj_attr.attr; + if (add) { + int ret; + ret = sysfs_merge_group(&fs_info->super_kobj, + &agroup); + if (ret) + return ret; + } else + sysfs_unmerge_group(&fs_info->super_kobj, + &agroup); + } + + } + return 0; +} + +static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + kobject_del(&fs_info->super_kobj); + kobject_put(&fs_info->super_kobj); + wait_for_completion(&fs_info->kobj_unregister); +} + +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + if (fs_info->space_info_kobj) { + sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->space_info_kobj); + kobject_put(fs_info->space_info_kobj); + } + kobject_del(fs_info->device_dir_kobj); + kobject_put(fs_info->device_dir_kobj); + addrm_unknown_feature_attrs(fs_info, false); + sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); + __btrfs_sysfs_remove_one(fs_info); +} + +const char * const btrfs_feature_set_names[3] = { + [FEAT_COMPAT] = "compat", + [FEAT_COMPAT_RO] = "compat_ro", + [FEAT_INCOMPAT] = "incompat", +}; + +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) +{ + size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ + int len = 0; + int i; + char *str; + + str = kmalloc(bufsize, GFP_KERNEL); + if (!str) + return str; + + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + const char *name; + + if (!(flags & (1ULL << i))) + continue; + + name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; + len += snprintf(str + len, bufsize - len, "%s%s", + len ? "," : "", name); + } + + return str; +} + +static void init_feature_attrs(void) +{ + struct btrfs_feature_attr *fa; + int set, i; + + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != + ARRAY_SIZE(btrfs_feature_attrs)); + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != + ARRAY_SIZE(btrfs_feature_attrs[0])); + + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); + + for (i = 0; btrfs_supported_feature_attrs[i]; i++) { + struct btrfs_feature_attr *sfa; + struct attribute *a = btrfs_supported_feature_attrs[i]; + int bit; + sfa = attr_to_btrfs_feature_attr(a); + bit = ilog2(sfa->feature_bit); + fa = &btrfs_feature_attrs[sfa->feature_set][bit]; + + fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; + } + + for (set = 0; set < FEAT_MAX; set++) { + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + char *name = btrfs_unknown_feature_names[set][i]; + fa = &btrfs_feature_attrs[set][i]; + + if (fa->kobj_attr.attr.name) + continue; + + snprintf(name, 13, "%s:%u", + btrfs_feature_set_names[set], i); + + fa->kobj_attr.attr.name = name; + fa->kobj_attr.attr.mode = S_IRUGO; + fa->feature_set = set; + fa->feature_bit = 1ULL << i; + } + } +} + +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device && one_device->bdev) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } + + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *dev; + + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", + &fs_info->super_kobj); + + if (!fs_info->device_dir_kobj) + return -ENOMEM; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!dev->bdev) + continue; + + if (one_device && one_device != dev) + continue; + + disk = dev->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + error = sysfs_create_link(fs_info->device_dir_kobj, + disk_kobj, disk_kobj->name); + if (error) + break; + } + + return error; +} + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +/* /sys/kernel/debug/btrfs */ +static struct dentry *btrfs_debugfs_root_dentry; + +/* Debugging tunables and exported data */ +u64 btrfs_debugfs_test; + +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +{ + int error; + + init_completion(&fs_info->kobj_unregister); + fs_info->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, + "%pU", fs_info->fsid); + if (error) + return error; + + error = sysfs_create_group(&fs_info->super_kobj, + &btrfs_feature_attr_group); + if (error) { + __btrfs_sysfs_remove_one(fs_info); + return error; + } + + error = addrm_unknown_feature_attrs(fs_info, true); + if (error) + goto failure; + + error = btrfs_kobj_add_device(fs_info, NULL); + if (error) + goto failure; + + fs_info->space_info_kobj = kobject_create_and_add("allocation", + &fs_info->super_kobj); + if (!fs_info->space_info_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (error) + goto failure; + + return 0; +failure: + btrfs_sysfs_remove_one(fs_info); + return error; +} + +static int btrfs_init_debugfs(void) +{ +#ifdef CONFIG_DEBUG_FS + btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); + if (!btrfs_debugfs_root_dentry) + return -ENOMEM; + + debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry, + &btrfs_debugfs_test); +#endif + return 0; +} + +int btrfs_init_sysfs(void) +{ + int ret; + + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + + ret = btrfs_init_debugfs(); + if (ret) + goto out1; + + init_feature_attrs(); + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) + goto out2; + + return 0; +out2: + debugfs_remove_recursive(btrfs_debugfs_root_dentry); +out1: + kset_unregister(btrfs_kset); + + return ret; +} + +void btrfs_exit_sysfs(void) +{ + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + kset_unregister(btrfs_kset); + debugfs_remove_recursive(btrfs_debugfs_root_dentry); +} + diff --git a/kernel/fs/btrfs/sysfs.h b/kernel/fs/btrfs/sysfs.h new file mode 100644 index 000000000..3a4bbed72 --- /dev/null +++ b/kernel/fs/btrfs/sysfs.h @@ -0,0 +1,89 @@ +#ifndef _BTRFS_SYSFS_H_ +#define _BTRFS_SYSFS_H_ + +/* + * Data exported through sysfs + */ +extern u64 btrfs_debugfs_test; + +enum btrfs_feature_set { + FEAT_COMPAT, + FEAT_COMPAT_RO, + FEAT_INCOMPAT, + FEAT_MAX +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_RW(_name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define BTRFS_ATTR(_name, _show) \ + static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) + +#define BTRFS_RAID_ATTR(_name, _show) \ + static struct kobj_attribute btrfs_raid_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) + + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) + +/* convert from attribute */ +static inline struct btrfs_feature_attr * +to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static inline struct btrfs_feature_attr * +attr_to_btrfs_feature_attr(struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); +extern const char * const btrfs_feature_set_names[3]; +extern struct kobj_type space_info_ktype; +extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +#endif /* _BTRFS_SYSFS_H_ */ diff --git a/kernel/fs/btrfs/tests/btrfs-tests.c b/kernel/fs/btrfs/tests/btrfs-tests.c new file mode 100644 index 000000000..9626252ee --- /dev/null +++ b/kernel/fs/btrfs/tests/btrfs-tests.c @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static struct vfsmount *test_mnt = NULL; + +static const struct super_operations btrfs_test_super_ops = { + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_test_destroy_inode, +}; + +static struct dentry *btrfs_test_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops, + NULL, BTRFS_TEST_MAGIC); +} + +static struct file_system_type test_type = { + .name = "btrfs_test_fs", + .mount = btrfs_test_mount, + .kill_sb = kill_anon_super, +}; + +struct inode *btrfs_new_test_inode(void) +{ + return new_inode(test_mnt->mnt_sb); +} + +int btrfs_init_test_fs(void) +{ + int ret; + + ret = register_filesystem(&test_type); + if (ret) { + printk(KERN_ERR "btrfs: cannot register test file system\n"); + return ret; + } + + test_mnt = kern_mount(&test_type); + if (IS_ERR(test_mnt)) { + printk(KERN_ERR "btrfs: cannot mount test file system\n"); + unregister_filesystem(&test_type); + return ret; + } + return 0; +} + +void btrfs_destroy_test_fs(void) +{ + kern_unmount(test_mnt); + unregister_filesystem(&test_type); +} + +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) +{ + struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), + GFP_NOFS); + + if (!fs_info) + return fs_info; + fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), + GFP_NOFS); + if (!fs_info->fs_devices) { + kfree(fs_info); + return NULL; + } + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), + GFP_NOFS); + if (!fs_info->super_copy) { + kfree(fs_info->fs_devices); + kfree(fs_info); + return NULL; + } + + if (init_srcu_struct(&fs_info->subvol_srcu)) { + kfree(fs_info->fs_devices); + kfree(fs_info->super_copy); + kfree(fs_info); + return NULL; + } + + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->qgroup_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + mutex_init(&fs_info->qgroup_rescan_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + fs_info->running_transaction = NULL; + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_ulist = NULL; + atomic64_set(&fs_info->tree_mod_seq, 0); + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + return fs_info; +} + +static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +{ + struct radix_tree_iter iter; + void **slot; + + spin_lock(&fs_info->buffer_lock); +restart: + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + goto restart; + continue; + } + spin_unlock(&fs_info->buffer_lock); + free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); + } + spin_unlock(&fs_info->buffer_lock); + + btrfs_free_qgroup_config(fs_info); + btrfs_free_fs_roots(fs_info); + cleanup_srcu_struct(&fs_info->subvol_srcu); + kfree(fs_info->super_copy); + kfree(fs_info->fs_devices); + kfree(fs_info); +} + +void btrfs_free_dummy_root(struct btrfs_root *root) +{ + if (!root) + return; + if (root->node) + free_extent_buffer(root->node); + if (root->fs_info) + btrfs_free_dummy_fs_info(root->fs_info); + kfree(root); +} + diff --git a/kernel/fs/btrfs/tests/btrfs-tests.h b/kernel/fs/btrfs/tests/btrfs-tests.h new file mode 100644 index 000000000..fd3954224 --- /dev/null +++ b/kernel/fs/btrfs/tests/btrfs-tests.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TESTS +#define __BTRFS_TESTS + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) + +struct btrfs_root; + +int btrfs_test_free_space_cache(void); +int btrfs_test_extent_buffer_operations(void); +int btrfs_test_extent_io(void); +int btrfs_test_inodes(void); +int btrfs_test_qgroups(void); +int btrfs_init_test_fs(void); +void btrfs_destroy_test_fs(void); +struct inode *btrfs_new_test_inode(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_root(struct btrfs_root *root); +#else +static inline int btrfs_test_free_space_cache(void) +{ + return 0; +} +static inline int btrfs_test_extent_buffer_operations(void) +{ + return 0; +} +static inline int btrfs_init_test_fs(void) +{ + return 0; +} +static inline void btrfs_destroy_test_fs(void) +{ +} +static inline int btrfs_test_extent_io(void) +{ + return 0; +} +static inline int btrfs_test_inodes(void) +{ + return 0; +} +static inline int btrfs_test_qgroups(void) +{ + return 0; +} +#endif + +#endif diff --git a/kernel/fs/btrfs/tests/extent-buffer-tests.c b/kernel/fs/btrfs/tests/extent-buffer-tests.c new file mode 100644 index 000000000..f51963a8f --- /dev/null +++ b/kernel/fs/btrfs/tests/extent-buffer-tests.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../extent_io.h" +#include "../disk-io.h" + +static int test_btrfs_split_item(void) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct extent_buffer *eb; + struct btrfs_item *item; + char *value = "mary had a little lamb"; + char *split1 = "mary had a little"; + char *split2 = " lamb"; + char *split3 = "mary"; + char *split4 = " had a little"; + char buf[32]; + struct btrfs_key key; + u32 value_len = strlen(value); + int ret = 0; + + test_msg("Running btrfs_split_item tests\n"); + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Could not allocate root\n"); + return PTR_ERR(root); + } + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Could not allocate path\n"); + kfree(root); + return -ENOMEM; + } + + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); + if (!eb) { + test_msg("Could not allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + path->slots[0] = 0; + + key.objectid = 0; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = 0; + + setup_items_for_insert(root, path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); + item = btrfs_item_nr(0); + write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), + value_len); + + key.offset = 3; + + /* + * Passing NULL trans here should be safe because we have plenty of + * space in this leaf to split the item without having to split the + * leaf. + */ + ret = btrfs_split_item(NULL, root, path, &key, 17); + if (ret) { + test_msg("Split item failed %d\n", ret); + goto out; + } + + /* + * Read the first slot, it should have the original key and contain only + * 'mary had a little' + */ + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_msg("Invalid key at slot 0\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(0); + if (btrfs_item_size(eb, item) != strlen(split1)) { + test_msg("Invalid len in the first split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split1)); + if (memcmp(buf, split1, strlen(split1))) { + test_msg("Data in the buffer doesn't match what it should " + "in the first split have='%.*s' want '%s'\n", + (int)strlen(split1), buf, split1); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_msg("Invalid key at slot 1\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(1); + if (btrfs_item_size(eb, item) != strlen(split2)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_msg("Data in the buffer doesn't match what it should " + "in the second split\n"); + ret = -EINVAL; + goto out; + } + + key.offset = 1; + /* Do it again so we test memmoving the other items in the leaf */ + ret = btrfs_split_item(NULL, root, path, &key, 4); + if (ret) { + test_msg("Second split item failed %d\n", ret); + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_msg("Invalid key at slot 0\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(0); + if (btrfs_item_size(eb, item) != strlen(split3)) { + test_msg("Invalid len in the first split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split3)); + if (memcmp(buf, split3, strlen(split3))) { + test_msg("Data in the buffer doesn't match what it should " + "in the third split"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 1) { + test_msg("Invalid key at slot 1\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(1); + if (btrfs_item_size(eb, item) != strlen(split4)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split4)); + if (memcmp(buf, split4, strlen(split4))) { + test_msg("Data in the buffer doesn't match what it should " + "in the fourth split\n"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 2); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_msg("Invalid key at slot 2\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(2); + if (btrfs_item_size(eb, item) != strlen(split2)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_msg("Data in the buffer doesn't match what it should " + "in the last chunk\n"); + ret = -EINVAL; + goto out; + } +out: + btrfs_free_path(path); + kfree(root); + return ret; +} + +int btrfs_test_extent_buffer_operations(void) +{ + test_msg("Running extent buffer operation tests"); + return test_btrfs_split_item(); +} diff --git a/kernel/fs/btrfs/tests/extent-io-tests.c b/kernel/fs/btrfs/tests/extent-io-tests.c new file mode 100644 index 000000000..9e9f23681 --- /dev/null +++ b/kernel/fs/btrfs/tests/extent-io-tests.c @@ -0,0 +1,275 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include "btrfs-tests.h" +#include "../extent_io.h" + +#define PROCESS_UNLOCK (1 << 0) +#define PROCESS_RELEASE (1 << 1) +#define PROCESS_TEST_LOCKED (1 << 2) + +static noinline int process_page_range(struct inode *inode, u64 start, u64 end, + unsigned long flags) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int count = 0; + int loops = 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (flags & PROCESS_TEST_LOCKED && + !PageLocked(pages[i])) + count++; + if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) + unlock_page(pages[i]); + page_cache_release(pages[i]); + if (flags & PROCESS_RELEASE) + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + loops++; + if (loops > 100000) { + printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret); + break; + } + } + return count; +} + +static int test_find_delalloc(void) +{ + struct inode *inode; + struct extent_io_tree tmp; + struct page *page; + struct page *locked_page = NULL; + unsigned long index = 0; + u64 total_dirty = 256 * 1024 * 1024; + u64 max_bytes = 128 * 1024 * 1024; + u64 start, end, test_start; + u64 found; + int ret = -EINVAL; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Failed to allocate test inode\n"); + return -ENOMEM; + } + + extent_io_tree_init(&tmp, &inode->i_data); + + /* + * First go through and create and mark all of our pages dirty, we pin + * everything to make sure our pages don't get evicted and screw up our + * test. + */ + for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + test_msg("Failed to allocate test page\n"); + ret = -ENOMEM; + goto out; + } + SetPageDirty(page); + if (index) { + unlock_page(page); + } else { + page_cache_get(page); + locked_page = page; + } + } + + /* Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS); + start = 0; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Should have found at least one delalloc\n"); + goto out_bits; + } + if (start != 0 || end != 4095) { + test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", + start, end); + goto out_bits; + } + unlock_extent(&tmp, start, end); + unlock_page(locked_page); + page_cache_release(locked_page); + + /* + * Test this scenario + * + * |--- delalloc ---| + * |--- search ---| + */ + test_start = 64 * 1024 * 1024; + locked_page = find_lock_page(inode->i_mapping, + test_start >> PAGE_CACHE_SHIFT); + if (!locked_page) { + test_msg("Couldn't find the locked page\n"); + goto out_bits; + } + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS); + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Couldn't find delalloc in our range\n"); + goto out_bits; + } + if (start != test_start || end != max_bytes - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu, end " + "%Lu\n", test_start, max_bytes - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_msg("There were unlocked pages in the range\n"); + goto out_bits; + } + unlock_extent(&tmp, start, end); + /* locked_page was unlocked above */ + page_cache_release(locked_page); + + /* + * Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + test_start = max_bytes + 4096; + locked_page = find_lock_page(inode->i_mapping, test_start >> + PAGE_CACHE_SHIFT); + if (!locked_page) { + test_msg("Could'nt find the locked page\n"); + goto out_bits; + } + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (found) { + test_msg("Found range when we shouldn't have\n"); + goto out_bits; + } + if (end != (u64)-1) { + test_msg("Did not return the proper end offset\n"); + goto out_bits; + } + + /* + * Test this scenario + * [------- delalloc -------| + * [max_bytes]|-- search--| + * + * We are re-using our test_start from above since it works out well. + */ + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS); + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Didn't find our range\n"); + goto out_bits; + } + if (start != test_start || end != total_dirty - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_start, total_dirty - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_msg("Pages in range were not all locked\n"); + goto out_bits; + } + unlock_extent(&tmp, start, end); + + /* + * Now to test where we run into a page that is no longer dirty in the + * range we want to find. + */ + page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024)) + >> PAGE_CACHE_SHIFT); + if (!page) { + test_msg("Couldn't find our page\n"); + goto out_bits; + } + ClearPageDirty(page); + page_cache_release(page); + + /* We unlocked it in the previous test */ + lock_page(locked_page); + start = test_start; + end = 0; + /* + * Currently if we fail to find dirty pages in the delalloc range we + * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If + * this changes at any point in the future we will need to fix this + * tests expected behavior. + */ + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Didn't find our range\n"); + goto out_bits; + } + if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_start, test_start + PAGE_CACHE_SIZE - 1, start, + end); + goto out_bits; + } + if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | + PROCESS_UNLOCK)) { + test_msg("Pages in range were not all locked\n"); + goto out_bits; + } + ret = 0; +out_bits: + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS); +out: + if (locked_page) + page_cache_release(locked_page); + process_page_range(inode, 0, total_dirty - 1, + PROCESS_UNLOCK | PROCESS_RELEASE); + iput(inode); + return ret; +} + +int btrfs_test_extent_io(void) +{ + test_msg("Running find delalloc tests\n"); + return test_find_delalloc(); +} diff --git a/kernel/fs/btrfs/tests/free-space-tests.c b/kernel/fs/btrfs/tests/free-space-tests.c new file mode 100644 index 000000000..2299bfde3 --- /dev/null +++ b/kernel/fs/btrfs/tests/free-space-tests.c @@ -0,0 +1,909 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../free-space-cache.h" + +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +static struct btrfs_block_group_cache *init_test_block_group(void) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = 1024 * 1024 * 1024; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + cache->full_stripe_len = 4096; + + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + + btrfs_init_free_space_ctl(cache); + + return cache; +} + +/* + * This test just does basic sanity checking, making sure we can add an exten + * entry and remove space from either end and the middle, and make sure we can + * remove space that covers adjacent extent entries. + */ +static int test_extents(struct btrfs_block_group_cache *cache) +{ + int ret = 0; + + test_msg("Running extent only tests\n"); + + /* First just make sure we can remove an entire entry */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding initial extents %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing extent %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Full remove left some lingering space\n"); + return -1; + } + + /* Ok edge and middle cases now */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding half extent %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing tail end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing front end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + if (ret) { + test_msg("Error removing middle piece %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Still have space at the front\n"); + return -1; + } + + if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) { + test_msg("Still have space in the middle\n"); + return -1; + } + + if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Still have space at the end\n"); + return -1; + } + + /* Cleanup */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +static int test_bitmaps(struct btrfs_block_group_cache *cache) +{ + u64 next_bitmap_offset; + int ret; + + test_msg("Running bitmap only tests\n"); + + ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create a bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap full range %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Left some space in bitmap\n"); + return -1; + } + + ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to our bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove middle chunk %d\n", ret); + return ret; + } + + /* + * The first bitmap we have starts at offset 0 so the next one is just + * at the end of the first bitmap. + */ + next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + + /* Test a bit straddling two bitmaps */ + ret = test_add_free_space_entry(cache, next_bitmap_offset - + (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space that straddles two bitmaps %d\n", + ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, next_bitmap_offset - + (1 * 1024 * 1024), 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), + 2 * 1024 * 1024)) { + test_msg("Left some space when removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +/* This is the high grade jackassery */ +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +{ + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + int ret; + + test_msg("Running bitmap and extent tests\n"); + + /* + * First let's do something simple, an extent at the same offset as the + * bitmap, but the free space completely in the extent and then + * completely in the bitmap. + */ + ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create bitmap entry %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove extent entry %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Left remnants after our remove\n"); + return -1; + } + + /* Now to add back the extent entry and remove from the bitmap */ + ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't re-add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove from bitmap %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Left remnants in the bitmap\n"); + return -1; + } + + /* + * Ok so a little more evil, extent entry and bitmap at the same offset, + * removing an overlapping chunk. + */ + ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to a bitmap %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + test_msg("Left over pieces after removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* Now with the extent entry offset into the bitmap */ + ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space to the bitmap %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent to the cache %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + if (ret) { + test_msg("Problem removing overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + test_msg("Left something behind when removing space"); + return -1; + } + + /* + * This has blown up in the past, the extent entry starts before the + * bitmap entry, but we're trying to remove an offset that falls + * completely within the bitmap range and is in both the extent entry + * and the bitmap entry, looks like this + * + * [ extent ] + * [ bitmap ] + * [ del ] + */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, + 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, + 5 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024); + if (ret) { + test_msg("Failed to free our space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024)) { + test_msg("Left stuff over\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * This blew up before, we have part of the free space in a bitmap and + * then the entirety of the rest of the space in an extent. This used + * to return -EAGAIN back from btrfs_remove_extent, make sure this + * doesn't happen. + */ + ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap and extent overlapping %d\n", ret); + return ret; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return ctl->free_extents > 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static int +check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, + const int num_extents, + const int num_bitmaps) +{ + if (cache->free_space_ctl->free_extents != num_extents) { + test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + cache->free_space_ctl->free_extents, num_extents); + return -EINVAL; + } + if (cache->free_space_ctl->total_bitmaps != num_bitmaps) { + test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + cache->free_space_ctl->total_bitmaps, num_bitmaps); + return -EINVAL; + } + return 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static int check_cache_empty(struct btrfs_block_group_cache *cache) +{ + u64 offset; + u64 max_extent_size; + + /* + * Now lets confirm that there's absolutely no free space left to + * allocate. + */ + if (cache->free_space_ctl->free_space != 0) { + test_msg("Cache free space is not 0\n"); + return -EINVAL; + } + + /* And any allocation request, no matter how small, should fail now. */ + offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, + &max_extent_size); + if (offset != 0) { + test_msg("Space allocation did not fail, returned offset: %llu", + offset); + return -EINVAL; + } + + /* And no extent nor bitmap entries in the cache anymore. */ + return check_num_extents_and_bitmaps(cache, 0, 0); +} + +/* + * Before we were able to steal free space from a bitmap entry to an extent + * entry, we could end up with 2 entries representing a contiguous free space. + * One would be an extent entry and the other a bitmap entry. Since in order + * to allocate space to a caller we use only 1 entry, we couldn't return that + * whole range to the caller if it was requested. This forced the caller to + * either assume ENOSPC or perform several smaller space allocations, which + * wasn't optimal as they could be spread all over the block group while under + * concurrency (extra overhead and fragmentation). + * + * This stealing approach is benefical, since we always prefer to allocate from + * extent entries, both for clustered and non-clustered allocation requests. + */ +static int +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) +{ + int ret; + u64 offset; + u64 max_extent_size; + + bool (*use_bitmap_op)(struct btrfs_free_space_ctl *, + struct btrfs_free_space *); + + test_msg("Running space stealing from bitmap to extent\n"); + + /* + * For this test, we want to ensure we end up with an extent entry + * immediately adjacent to a bitmap entry, where the bitmap starts + * at an offset where the extent entry ends. We keep adding and + * removing free space to reach into this state, but to get there + * we need to reach a point where marking new free space doesn't + * result in adding new extent entries or merging the new space + * with existing extent entries - the space ends up being marked + * in an existing bitmap that covers the new free space range. + * + * To get there, we need to reach the threshold defined set at + * cache->free_space_ctl->extents_thresh, which currently is + * 256 extents on a x86_64 system at least, and a few other + * conditions (check free_space_cache.c). Instead of making the + * test much longer and complicated, use a "use_bitmap" operation + * that forces use of bitmaps as soon as we have at least 1 + * extent entry. + */ + use_bitmap_op = cache->free_space_ctl->op->use_bitmap; + cache->free_space_ctl->op->use_bitmap = test_use_bitmap; + + /* + * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ + */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024, + 128 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024, + 128 * 1024 * 1024 - 512 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now make only the first 256Kb of the bitmap marked as free, so that + * we end up with only the following ranges marked as free space: + * + * [128Mb - 256Kb, 128Mb - 128Kb[ + * [128Mb + 512Kb, 128Mb + 768Kb[ + */ + ret = btrfs_remove_free_space(cache, + 128 * 1024 * 1024 + 768 * 1024, + 128 * 1024 * 1024 - 768 * 1024); + if (ret) { + test_msg("Failed to free part of bitmap space %d\n", ret); + return ret; + } + + /* Confirm that only those 2 ranges are marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, + 128 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024, + 256 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + + /* + * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked + * as free anymore. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024, + 128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Bitmap region not removed from space cache\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is + * covered by the bitmap, isn't marked as free. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024, + 256 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered + * by the bitmap too, isn't marked as free either. + */ + if (test_check_exists(cache, 128 * 1024 * 1024, + 256 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But, + * lets make sure the free space cache marks it as free in the bitmap, + * and doesn't insert a new extent entry to represent this region. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) { + test_msg("Bitmap region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now lets add a small free space region to the right of the previous + * one, which is not contiguous with it and is part of the bitmap too. + * The goal is to test that the bitmap entry space stealing doesn't + * steal this space region. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024, + 4096); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will + * expand the range covered by the existing extent entry that represents + * the free space [128Mb - 256Kb, 128Mb - 128Kb[. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024, + 128 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024, + 128 * 1024)) { + test_msg("Extent region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that our extent entry didn't stole all free space from the + * bitmap, because of the small 4Kb free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free + * space. Without stealing bitmap free space into extent entry space, + * we would have all this free space represented by 2 entries in the + * cache: + * + * extent entry covering range: [128Mb - 256Kb, 128Mb[ + * bitmap entry covering range: [128Mb, 128Mb + 768Kb[ + * + * Attempting to allocate the whole free space (1Mb) would fail, because + * we can't allocate from multiple entries. + * With the bitmap free space stealing, we get a single extent entry + * that represents the 1Mb free space, and therefore we're able to + * allocate the whole free space at once. + */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, + 1 * 1024 * 1024)) { + test_msg("Expected region not marked as free\n"); + return -ENOENT; + } + + if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) { + test_msg("Cache free space is not 1Mb + 4Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 1 * 1024 * 1024, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 - 256 * 1024)) { + test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + /* All that remains is a 4Kb free space region in a bitmap. Confirm. */ + ret = check_num_extents_and_bitmaps(cache, 1, 1); + if (ret) + return ret; + + if (cache->free_space_ctl->free_space != 4096) { + test_msg("Cache free space is not 4Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 4096, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) { + test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + ret = check_cache_empty(cache); + if (ret) + return ret; + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * Now test a similar scenario, but where our extent entry is located + * to the right of the bitmap entry, so that we can check that stealing + * space from a bitmap to the front of an extent entry works. + */ + + /* + * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[ + */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024, + 128 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ + ret = test_add_free_space_entry(cache, 0, + 128 * 1024 * 1024 - 512 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now make only the last 256Kb of the bitmap marked as free, so that + * we end up with only the following ranges marked as free space: + * + * [128Mb + 128b, 128Mb + 256Kb[ + * [128Mb - 768Kb, 128Mb - 512Kb[ + */ + ret = btrfs_remove_free_space(cache, + 0, + 128 * 1024 * 1024 - 768 * 1024); + if (ret) { + test_msg("Failed to free part of bitmap space %d\n", ret); + return ret; + } + + /* Confirm that only those 2 ranges are marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024, + 128 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, + 256 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + + /* + * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked + * as free anymore. + */ + if (test_check_exists(cache, 0, + 128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Bitmap region not removed from space cache\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb - 512Kb, 128Mb[, which is + * covered by the bitmap, isn't marked as free. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But, + * lets make sure the free space cache marks it as free in the bitmap, + * and doesn't insert a new extent entry to represent this region. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024)) { + test_msg("Bitmap region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now lets add a small free space region to the left of the previous + * one, which is not contiguous with it and is part of the bitmap too. + * The goal is to test that the bitmap entry space stealing doesn't + * steal this space region. + */ + ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + + /* + * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will + * expand the range covered by the existing extent entry that represents + * the free space [128Mb + 128Kb, 128Mb + 256Kb[. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) { + test_msg("Extent region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that our extent entry didn't stole all free space from the + * bitmap, because of the small 8Kb free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free + * space. Without stealing bitmap free space into extent entry space, + * we would have all this free space represented by 2 entries in the + * cache: + * + * extent entry covering range: [128Mb, 128Mb + 256Kb[ + * bitmap entry covering range: [128Mb - 768Kb, 128Mb[ + * + * Attempting to allocate the whole free space (1Mb) would fail, because + * we can't allocate from multiple entries. + * With the bitmap free space stealing, we get a single extent entry + * that represents the 1Mb free space, and therefore we're able to + * allocate the whole free space at once. + */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, + 1 * 1024 * 1024)) { + test_msg("Expected region not marked as free\n"); + return -ENOENT; + } + + if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) { + test_msg("Cache free space is not 1Mb + 8Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 1 * 1024 * 1024, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + /* All that remains is a 8Kb free space region in a bitmap. Confirm. */ + ret = check_num_extents_and_bitmaps(cache, 1, 1); + if (ret) + return ret; + + if (cache->free_space_ctl->free_space != 8192) { + test_msg("Cache free space is not 8Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 8192, 0, + &max_extent_size); + if (offset != (32 * 1024 * 1024)) { + test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + ret = check_cache_empty(cache); + if (ret) + return ret; + + cache->free_space_ctl->op->use_bitmap = use_bitmap_op; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +int btrfs_test_free_space_cache(void) +{ + struct btrfs_block_group_cache *cache; + int ret; + + test_msg("Running btrfs free space cache tests\n"); + + cache = init_test_block_group(); + if (!cache) { + test_msg("Couldn't run the tests\n"); + return 0; + } + + ret = test_extents(cache); + if (ret) + goto out; + ret = test_bitmaps(cache); + if (ret) + goto out; + ret = test_bitmaps_and_extents(cache); + if (ret) + goto out; + + ret = test_steal_space_from_bitmap_to_extent(cache); +out: + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); + test_msg("Free space cache tests finished\n"); + return ret; +} diff --git a/kernel/fs/btrfs/tests/inode-tests.c b/kernel/fs/btrfs/tests/inode-tests.c new file mode 100644 index 000000000..054fc0d97 --- /dev/null +++ b/kernel/fs/btrfs/tests/inode-tests.c @@ -0,0 +1,1123 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../btrfs_inode.h" +#include "../disk-io.h" +#include "../extent_io.h" +#include "../volumes.h" + +static void insert_extent(struct btrfs_root *root, u64 start, u64 len, + u64 ram_bytes, u64 offset, u64 disk_bytenr, + u64 disk_len, u32 type, u8 compression, int slot) +{ + struct btrfs_path path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = sizeof(struct btrfs_file_extent_item); + + if (type == BTRFS_FILE_EXTENT_INLINE) + value_len += len; + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = slot; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + setup_items_for_insert(root, &path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, 1); + btrfs_set_file_extent_type(leaf, fi, type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len); + btrfs_set_file_extent_offset(leaf, fi, offset); + btrfs_set_file_extent_num_bytes(leaf, fi, len); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); +} + +static void insert_inode_item_key(struct btrfs_root *root) +{ + struct btrfs_path path; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = 0; + + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = 0; + + key.objectid = BTRFS_INODE_ITEM_KEY; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + setup_items_for_insert(root, &path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); +} + +/* + * Build the most complicated map of extents the earth has ever seen. We want + * this so we can test all of the corner cases of btrfs_get_extent. Here is a + * diagram of how the extents will look though this may not be possible we still + * want to make sure everything acts normally (the last number is not inclusive) + * + * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] + * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * + * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] + * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * + * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] + * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * + * [81920-86016] + * [ regular ] + */ +static void setup_file_extents(struct btrfs_root *root) +{ + int slot = 0; + u64 disk_bytenr = 1 * 1024 * 1024; + u64 offset = 0; + + /* First we want a hole */ + insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 5; + + /* + * Now we want an inline extent, I don't think this is possible but hey + * why not? Also keep in mind if we have an inline extent it counts as + * the whole first page. If we were to expand it we would have to cow + * and we wouldn't have an inline extent anymore. + */ + insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + slot); + slot++; + offset = 4096; + + /* Now another hole */ + insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 4; + + /* Now for a regular extent */ + insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + disk_bytenr += 4096; + offset += 4095; + + /* + * Now for 3 extents that were split from a hole punch so we test + * offsets properly. + */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, + 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 8192; + disk_bytenr += 16384; + + /* Now for a unwritten prealloc extent */ + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 4096; + + /* + * We want to jack up disk_bytenr a little more so the em stuff doesn't + * merge our records. + */ + disk_bytenr += 8192; + + /* + * Now for a partially written prealloc extent, basically the same as + * the hole punch example above. Ram_bytes never changes when you mark + * extents written btw. + */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 8192; + disk_bytenr += 16384; + + /* Now a normal compressed extent */ + insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 8192; + /* No merges */ + disk_bytenr += 8192; + + /* Now a split compressed extent */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 8192; + disk_bytenr += 8192; + + /* Now extents that have a hole but no hole extent */ + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 16384; + disk_bytenr += 4096; + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); +} + +static unsigned long prealloc_only = 0; +static unsigned long compressed_only = 0; +static unsigned long vacancy_only = 0; + +static noinline int test_btrfs_get_extent(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + u64 orig_start; + u64 disk_bytenr; + u64 offset; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + /* + * We do this since btrfs_get_extent wants to assign em->bdev to + * root->fs_info->fs_devices->latest_bdev. + */ + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + root->node = alloc_dummy_extent_buffer(NULL, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + goto out; + } + + /* + * We will just free a dummy node if it's ref count is 2 so we need an + * extra ref so our searches don't accidently release our page. + */ + extent_buffer_get(root->node); + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + ret = -EINVAL; + + /* First with no extents */ + BTRFS_I(inode)->root = root; + em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + if (IS_ERR(em)) { + em = NULL; + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + test_msg("Vacancy flag wasn't set properly\n"); + goto out; + } + free_extent_map(em); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + + /* + * All of the magic numbers are based on the mapping setup in + * setup_file_extents, so if you change anything there you need to + * update the comment and update the expected values below. + */ + setup_file_extents(root); + + em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != 0 || em->len != 5) { + test_msg("Unexpected extent wanted start 0 len 5, got start " + "%llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_INLINE) { + test_msg("Expected an inline, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4091) { + test_msg("Unexpected extent wanted start %llu len 1, got start " + "%llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + /* + * We don't test anything else for inline since it doesn't get set + * unless we have a page for it to write into. Maybe we should change + * this? + */ + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4) { + test_msg("Unexpected extent wanted start %llu len 4, got start " + "%llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Regular extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4095) { + test_msg("Unexpected extent wanted start %llu len 4095, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are split extents */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + orig_start, em->orig_start); + goto out; + } + disk_bytenr += (em->start - orig_start); + if (em->block_start != disk_bytenr) { + test_msg("Wrong block start, want %llu, have %llu\n", + disk_bytenr, em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Prealloc extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are a half written prealloc extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_HOLE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Unexpected orig offset, wanted %llu, have %llu\n", + orig_start, em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_msg("Unexpected block start, wanted %llu, have %llu\n", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start, + em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_msg("Unexpected block start, wanted %llu, have %llu\n", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Now for the compressed extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Split compressed extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != disk_bytenr) { + test_msg("Block start does not match, want %llu got %llu\n", + disk_bytenr, em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* A hole between regular extents but no hole extent */ + em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole extent, got %llu\n", em->block_start); + goto out; + } + /* + * Currently we just return a length that we requested rather than the + * length of the actual hole, if this changes we'll have to change this + * test. + */ + if (em->start != offset || em->len != 12288) { + test_msg("Unexpected extent wanted start %llu len 12288, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + vacancy_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + +static int test_hole_first(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + root->node = alloc_dummy_extent_buffer(NULL, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + goto out; + } + + extent_buffer_get(root->node); + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + BTRFS_I(inode)->root = root; + ret = -EINVAL; + + /* + * Need a blank inode item here just so we don't confuse + * btrfs_get_extent. + */ + insert_inode_item_key(root); + insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, + BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != 0 || em->len != 4096) { + test_msg("Unexpected extent wanted start 0 len 4096, got start " + "%llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only, + em->flags); + goto out; + } + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != 4096) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != 4096 || em->len != 4096) { + test_msg("Unexpected extent wanted start 4096 len 4096, got " + "start %llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, wanted 0 got %lu\n", + em->flags); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + +static int test_extent_accounting(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + BTRFS_I(inode)->root = root; + btrfs_test_inode_set_ops(inode); + + /* [BTRFS_MAX_EXTENT_SIZE] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 1) { + ret = -EINVAL; + test_msg("Miscount, wanted 1, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, + BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + EXTENT_DELALLOC | EXTENT_DIRTY | + EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * + * I'm artificially adding 2 to outstanding_extents because in the + * buffered IO case we'd add things up as we go, but I don't feel like + * doing that here, this isn't the interesting case we want to test. + */ + BTRFS_I(inode)->outstanding_extents += 2; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, + (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * Refill the hole again just for good measure, because I thought it + * might fail and I'd rather satisfy my paranoia at this point. + */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* Empty */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents) { + ret = -EINVAL; + test_msg("Miscount, wanted 0, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + ret = 0; +out: + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + +int btrfs_test_inodes(void) +{ + int ret; + + set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); + set_bit(EXTENT_FLAG_VACANCY, &vacancy_only); + set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + + test_msg("Running btrfs_get_extent tests\n"); + ret = test_btrfs_get_extent(); + if (ret) + return ret; + test_msg("Running hole first btrfs_get_extent test\n"); + ret = test_hole_first(); + if (ret) + return ret; + test_msg("Running outstanding_extents tests\n"); + return test_extent_accounting(); +} diff --git a/kernel/fs/btrfs/tests/qgroup-tests.c b/kernel/fs/btrfs/tests/qgroup-tests.c new file mode 100644 index 000000000..c32a7ba76 --- /dev/null +++ b/kernel/fs/btrfs/tests/qgroup-tests.c @@ -0,0 +1,469 @@ +/* + * Copyright (C) 2013 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../transaction.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static void init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} + +static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_tree_block_info *block_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); + int ret; + + init_dummy_trans(&trans); + + ins.objectid = bytenr; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); + if (ret) { + test_msg("Couldn't insert ref %d\n", ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, 1); + btrfs_set_extent_generation(leaf, item, 1); + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(item + 1); + btrfs_set_tree_block_level(leaf, block_info, 1); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_free_path(path); + return 0; +} + +static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs + 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); + if (ret) + test_msg("Failed to insert backref\n"); + btrfs_free_path(path); + return ret; +} + +static int remove_extent_item(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_trans_handle trans; + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + path->leave_spinning = 1; + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Didn't find our key %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return 0; +} + +static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs - 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Couldn't find backref %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return ret; +} + +static int test_no_shared_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup basic add\n"); + ret = btrfs_create_qgroup(NULL, fs_info, 5); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_item(root, 4096, 4096); + if (ret) + return -EINVAL; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_EXCL, 0); + if (ret) { + test_msg("Couldn't remove space from the qgroup %d\n", ret); + return -EINVAL; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Add a ref for two different roots to make sure the shared value comes out + * right, also remove one of the roots and make sure the exclusive count is + * adjusted properly. + */ +static int test_multiple_refs(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup multiple refs test\n"); + + /* We have 5 created already from the previous test */ + ret = btrfs_create_qgroup(NULL, fs_info, 256); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = add_tree_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +int btrfs_test_qgroups(void) +{ + struct btrfs_root *root; + struct btrfs_root *tmp_root; + int ret = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + return PTR_ERR(root); + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to add the root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; + + /* + * Can't use bytenr 0, some things freak out + * *cough*backref walking code*cough* + */ + root->node = alloc_test_extent_buffer(root->fs_info, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 5; + root->fs_info->fs_root = tmp_root; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 256; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + test_msg("Running qgroup tests\n"); + ret = test_no_shared_qgroup(root); + if (ret) + goto out; + ret = test_multiple_refs(root); +out: + btrfs_free_dummy_root(root); + return ret; +} diff --git a/kernel/fs/btrfs/transaction.c b/kernel/fs/btrfs/transaction.c new file mode 100644 index 000000000..5628e2525 --- /dev/null +++ b/kernel/fs/btrfs/transaction.c @@ -0,0 +1,2204 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "tree-log.h" +#include "inode-map.h" +#include "volumes.h" +#include "dev-replace.h" +#include "qgroup.h" + +#define BTRFS_ROOT_TRANS_TAG 0 + +static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { + [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | + __TRANS_START), + [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH), + [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN), + [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), + [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), +}; + +void btrfs_put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(atomic_read(&transaction->use_count) == 0); + if (atomic_dec_and_test(&transaction->use_count)) { + BUG_ON(!list_empty(&transaction->list)); + WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + if (transaction->delayed_refs.pending_csums) + printk(KERN_ERR "pending csums is %llu\n", + transaction->delayed_refs.pending_csums); + while (!list_empty(&transaction->pending_chunks)) { + struct extent_map *em; + + em = list_first_entry(&transaction->pending_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } + kmem_cache_free(btrfs_transaction_cachep, transaction); + } +} + +static void clear_btree_io_tree(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + + cond_resched_lock(&tree->lock); + } + spin_unlock(&tree->lock); +} + +static noinline void switch_commit_roots(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root, *tmp; + + down_write(&fs_info->commit_root_sem); + list_for_each_entry_safe(root, tmp, &trans->switch_commits, + dirty_list) { + list_del_init(&root->dirty_list); + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + if (is_fstree(root->objectid)) + btrfs_unpin_free_ino(root); + clear_btree_io_tree(&root->dirty_log_pages); + } + up_write(&fs_info->commit_root_sem); +} + +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_inc(&trans->num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_dec(&trans->num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) +{ + return atomic_read(&trans->num_extwriters); +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_root *root, unsigned int type) +{ + struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *fs_info = root->fs_info; + + spin_lock(&fs_info->trans_lock); +loop: + /* The file system has been taken offline. No new transactions. */ + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + spin_unlock(&fs_info->trans_lock); + return -EROFS; + } + + cur_trans = fs_info->running_transaction; + if (cur_trans) { + if (cur_trans->aborted) { + spin_unlock(&fs_info->trans_lock); + return cur_trans->aborted; + } + if (btrfs_blocked_trans_types[cur_trans->state] & type) { + spin_unlock(&fs_info->trans_lock); + return -EBUSY; + } + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + extwriter_counter_inc(cur_trans, type); + spin_unlock(&fs_info->trans_lock); + return 0; + } + spin_unlock(&fs_info->trans_lock); + + /* + * If we are ATTACH, we just want to catch the current transaction, + * and commit it. If there is no transaction, just return ENOENT. + */ + if (type == TRANS_ATTACH) + return -ENOENT; + + /* + * JOIN_NOLOCK only happens during the transaction commit, so + * it is impossible that ->running_transaction is NULL + */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + if (!cur_trans) + return -ENOMEM; + + spin_lock(&fs_info->trans_lock); + if (fs_info->running_transaction) { + /* + * someone started a transaction after we unlocked. Make sure + * to redo the checks above + */ + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + goto loop; + } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + spin_unlock(&fs_info->trans_lock); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + return -EROFS; + } + + atomic_set(&cur_trans->num_writers, 1); + extwriter_counter_init(cur_trans, type); + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->state = TRANS_STATE_RUNNING; + /* + * One for this trans handle, one so it will live on until we + * commit the transaction. + */ + atomic_set(&cur_trans->use_count, 2); + cur_trans->have_free_bgs = 0; + cur_trans->start_time = get_seconds(); + cur_trans->dirty_bg_run = 0; + + cur_trans->delayed_refs.href_root = RB_ROOT; + atomic_set(&cur_trans->delayed_refs.num_entries, 0); + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.pending_csums = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + + /* + * although the tree mod log is per file system and not per transaction, + * the log must never go across transaction boundaries. + */ + smp_mb(); + if (!list_empty(&fs_info->tree_mod_seq_list)) + WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when " + "creating a fresh transaction\n"); + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) + WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when " + "creating a fresh transaction\n"); + atomic64_set(&fs_info->tree_mod_seq, 0); + + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + INIT_LIST_HEAD(&cur_trans->pending_chunks); + INIT_LIST_HEAD(&cur_trans->switch_commits); + INIT_LIST_HEAD(&cur_trans->pending_ordered); + INIT_LIST_HEAD(&cur_trans->dirty_bgs); + INIT_LIST_HEAD(&cur_trans->io_bgs); + mutex_init(&cur_trans->cache_write_mutex); + cur_trans->num_dirty_bgs = 0; + spin_lock_init(&cur_trans->dirty_bgs_lock); + list_add_tail(&cur_trans->list, &fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + fs_info->btree_inode->i_mapping); + fs_info->generation++; + cur_trans->transid = fs_info->generation; + fs_info->running_transaction = cur_trans; + cur_trans->aborted = 0; + spin_unlock(&fs_info->trans_lock); + + return 0; +} + +/* + * this does all the record keeping required to make sure that a reference + * counted root is properly recorded in a given transaction. This is required + * to make sure the old root from before we joined the transaction is deleted + * when the transaction commits + */ +static int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) { + WARN_ON(root == root->fs_info->extent_root); + WARN_ON(root->commit_root != root->node); + + /* + * see below for IN_TRANS_SETUP usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); + + /* make sure readers find IN_TRANS_SETUP before + * they find our root->last_trans update + */ + smp_wmb(); + + spin_lock(&root->fs_info->fs_roots_radix_lock); + if (root->last_trans == trans->transid) { + spin_unlock(&root->fs_info->fs_roots_radix_lock); + return 0; + } + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&root->fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root IN_TRANS_SETUP. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ + btrfs_init_reloc_root(trans, root); + smp_mb__before_atomic(); + clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); + } + return 0; +} + + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + return 0; + + /* + * see record_root_in_trans for comments about IN_TRANS_SETUP usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) + return 0; + + mutex_lock(&root->fs_info->reloc_mutex); + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->reloc_mutex); + + return 0; +} + +static inline int is_transaction_blocked(struct btrfs_transaction *trans) +{ + return (trans->state >= TRANS_STATE_BLOCKED && + trans->state < TRANS_STATE_UNBLOCKED && + !trans->aborted); +} + +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + + spin_lock(&root->fs_info->trans_lock); + cur_trans = root->fs_info->running_transaction; + if (cur_trans && is_transaction_blocked(cur_trans)) { + atomic_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + wait_event(root->fs_info->transaction_wait, + cur_trans->state >= TRANS_STATE_UNBLOCKED || + cur_trans->aborted); + btrfs_put_transaction(cur_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); + } +} + +static int may_wait_transaction(struct btrfs_root *root, int type) +{ + if (root->fs_info->log_root_recovering) + return 0; + + if (type == TRANS_USERSPACE) + return 1; + + if (type == TRANS_START && + !atomic_read(&root->fs_info->open_ioctl_trans)) + return 1; + + return 0; +} + +static inline bool need_reserve_reloc_root(struct btrfs_root *root) +{ + if (!root->fs_info->reloc_ctl || + !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + root->reloc_root) + return false; + + return true; +} + +static struct btrfs_trans_handle * +start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_trans_handle *h; + struct btrfs_transaction *cur_trans; + u64 num_bytes = 0; + u64 qgroup_reserved = 0; + bool reloc_reserved = false; + int ret; + + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); + + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + return ERR_PTR(-EROFS); + + if (current->journal_info) { + WARN_ON(type & TRANS_EXTWRITERS); + h = current->journal_info; + h->use_count++; + WARN_ON(h->use_count > 2); + h->orig_rsv = h->block_rsv; + h->block_rsv = NULL; + goto got_it; + } + + /* + * Do the reservation before we join the transaction so we can do all + * the appropriate flushing if need be. + */ + if (num_items > 0 && root != root->fs_info->chunk_root) { + if (root->fs_info->quota_enabled && + is_fstree(root->root_key.objectid)) { + qgroup_reserved = num_items * root->nodesize; + ret = btrfs_qgroup_reserve(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); + } + + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + /* + * Do the reservation for the relocation root creation + */ + if (need_reserve_reloc_root(root)) { + num_bytes += root->nodesize; + reloc_reserved = true; + } + + ret = btrfs_block_rsv_add(root, + &root->fs_info->trans_block_rsv, + num_bytes, flush); + if (ret) + goto reserve_fail; + } +again: + h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } + + /* + * If we are JOIN_NOLOCK we're already committing a transaction and + * waiting on this guy, so we don't need to do the sb_start_intwrite + * because we're already holding a ref. We need this because we could + * have raced in and did an fsync() on a file which can kick a commit + * and then we deadlock with somebody doing a freeze. + * + * If we are ATTACH, it means we just want to catch the current + * transaction and commit it, so we needn't do sb_start_intwrite(). + */ + if (type & __TRANS_FREEZABLE) + sb_start_intwrite(root->fs_info->sb); + + if (may_wait_transaction(root, type)) + wait_current_trans(root); + + do { + ret = join_transaction(root, type); + if (ret == -EBUSY) { + wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } + } while (ret == -EBUSY); + + if (ret < 0) { + /* We must get the transaction if we are JOIN_NOLOCK. */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + goto join_fail; + } + + cur_trans = root->fs_info->running_transaction; + + h->transid = cur_trans->transid; + h->transaction = cur_trans; + h->blocks_used = 0; + h->bytes_reserved = 0; + h->root = root; + h->delayed_ref_updates = 0; + h->use_count = 1; + h->adding_csums = 0; + h->block_rsv = NULL; + h->orig_rsv = NULL; + h->aborted = 0; + h->qgroup_reserved = 0; + h->delayed_ref_elem.seq = 0; + h->type = type; + h->allocating_chunk = false; + h->reloc_reserved = false; + h->sync = false; + INIT_LIST_HEAD(&h->qgroup_ref_list); + INIT_LIST_HEAD(&h->new_bgs); + INIT_LIST_HEAD(&h->ordered); + + smp_mb(); + if (cur_trans->state >= TRANS_STATE_BLOCKED && + may_wait_transaction(root, type)) { + current->journal_info = h; + btrfs_commit_transaction(h, root); + goto again; + } + + if (num_bytes) { + trace_btrfs_space_reservation(root->fs_info, "transaction", + h->transid, num_bytes, 1); + h->block_rsv = &root->fs_info->trans_block_rsv; + h->bytes_reserved = num_bytes; + h->reloc_reserved = reloc_reserved; + } + h->qgroup_reserved = qgroup_reserved; + +got_it: + btrfs_record_root_in_trans(h, root); + + if (!current->journal_info && type != TRANS_USERSPACE) + current->journal_info = h; + return h; + +join_fail: + if (type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + num_bytes); +reserve_fail: + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); + return ERR_PTR(ret); +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_items) +{ + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL); +} + +struct btrfs_trans_handle *btrfs_start_transaction_lflush( + struct btrfs_root *root, int num_items) +{ + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_LIMIT); +} + +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN, 0); +} + +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_USERSPACE, 0); +} + +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() + */ +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_ATTACH, 0); +} + +/* + * btrfs_attach_transaction_barrier() - catch the running transaction + * + * It is similar to the above function, the differentia is this one + * will wait for all the inactive transactions until they fully + * complete. + */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = start_transaction(root, 0, TRANS_ATTACH, 0); + if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) + btrfs_wait_for_commit(root, 0); + + return trans; +} + +/* wait for a transaction commit to be fully complete */ +static noinline void wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); +} + +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) +{ + struct btrfs_transaction *cur_trans = NULL, *t; + int ret = 0; + + if (transid) { + if (transid <= root->fs_info->last_trans_committed) + goto out; + + /* find specified transaction */ + spin_lock(&root->fs_info->trans_lock); + list_for_each_entry(t, &root->fs_info->trans_list, list) { + if (t->transid == transid) { + cur_trans = t; + atomic_inc(&cur_trans->use_count); + ret = 0; + break; + } + if (t->transid > transid) { + ret = 0; + break; + } + } + spin_unlock(&root->fs_info->trans_lock); + + /* + * The specified transaction doesn't exist, or we + * raced with btrfs_commit_transaction + */ + if (!cur_trans) { + if (transid > root->fs_info->last_trans_committed) + ret = -EINVAL; + goto out; + } + } else { + /* find newest transaction that is committing | committed */ + spin_lock(&root->fs_info->trans_lock); + list_for_each_entry_reverse(t, &root->fs_info->trans_list, + list) { + if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state == TRANS_STATE_COMPLETED) + break; + cur_trans = t; + atomic_inc(&cur_trans->use_count); + break; + } + } + spin_unlock(&root->fs_info->trans_lock); + if (!cur_trans) + goto out; /* nothing committing|committed */ + } + + wait_for_commit(root, cur_trans); + btrfs_put_transaction(cur_trans); +out: + return ret; +} + +void btrfs_throttle(struct btrfs_root *root) +{ + if (!atomic_read(&root->fs_info->open_ioctl_trans)) + wait_current_trans(root); +} + +static int should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (root->fs_info->global_block_rsv.space_info->full && + btrfs_check_space_for_delayed_refs(trans, root)) + return 1; + + return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); +} + +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + int updates; + int err; + + smp_mb(); + if (cur_trans->state >= TRANS_STATE_BLOCKED || + cur_trans->delayed_refs.flushing) + return 1; + + updates = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (updates) { + err = btrfs_run_delayed_refs(trans, root, updates * 2); + if (err) /* Error code will also eval true */ + return err; + } + + return should_end_transaction(trans, root); +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int throttle) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_fs_info *info = root->fs_info; + unsigned long cur = trans->delayed_ref_updates; + int lock = (trans->type != TRANS_JOIN_NOLOCK); + int err = 0; + int must_run_delayed_refs = 0; + + if (trans->use_count > 1) { + trans->use_count--; + trans->block_rsv = trans->orig_rsv; + return 0; + } + + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + if (!list_empty(&trans->ordered)) { + spin_lock(&info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); + spin_unlock(&info->trans_lock); + } + + trans->delayed_ref_updates = 0; + if (!trans->sync) { + must_run_delayed_refs = + btrfs_should_throttle_delayed_refs(trans, root); + cur = max_t(unsigned long, cur, 32); + + /* + * don't make the caller wait if they are from a NOLOCK + * or ATTACH transaction, it will deadlock with commit + */ + if (must_run_delayed_refs == 1 && + (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) + must_run_delayed_refs = 2; + } + + if (trans->qgroup_reserved) { + /* + * the same root has to be passed here between start_transaction + * and end_transaction. Subvolume quota depends on this. + */ + btrfs_qgroup_free(trans->root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } + + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && + should_end_transaction(trans, root) && + ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { + spin_lock(&info->trans_lock); + if (cur_trans->state == TRANS_STATE_RUNNING) + cur_trans->state = TRANS_STATE_BLOCKED; + spin_unlock(&info->trans_lock); + } + + if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { + if (throttle) + return btrfs_commit_transaction(trans, root); + else + wake_up_process(info->transaction_kthread); + } + + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + + WARN_ON(cur_trans != info->running_transaction); + WARN_ON(atomic_read(&cur_trans->num_writers) < 1); + atomic_dec(&cur_trans->num_writers); + extwriter_counter_dec(cur_trans, trans->type); + + smp_mb(); + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + btrfs_put_transaction(cur_trans); + + if (current->journal_info == trans) + current->journal_info = NULL; + + if (throttle) + btrfs_run_delayed_iputs(root); + + if (trans->aborted || + test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + wake_up_process(info->transaction_kthread); + err = -EIO; + } + assert_qgroups_uptodate(trans); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (must_run_delayed_refs) { + btrfs_async_run_delayed_refs(root, cur, + must_run_delayed_refs == 1); + } + return err; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 0); +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are sent to disk but does not wait on them + */ +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int err = 0; + int werr = 0; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 end; + + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { + bool wait_writeback = false; + + err = convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + /* + * convert_extent_bit can return -ENOMEM, which is most of the + * time a temporary error. So when it happens, ignore the error + * and wait for writeback of this range to finish - because we + * failed to set the bit EXTENT_NEED_WAIT for the range, a call + * to btrfs_wait_marked_extents() would not know that writeback + * for this range started and therefore wouldn't wait for it to + * finish - we don't want to commit a superblock that points to + * btree nodes/leafs for which writeback hasn't finished yet + * (and without errors). + * We cleanup any entries left in the io tree when committing + * the transaction (through clear_btree_io_tree()). + */ + if (err == -ENOMEM) { + err = 0; + wait_writeback = true; + } + if (!err) + err = filemap_fdatawrite_range(mapping, start, end); + if (err) + werr = err; + else if (wait_writeback) + werr = filemap_fdatawait_range(mapping, start, end); + free_extent_state(cached_state); + cached_state = NULL; + cond_resched(); + start = end + 1; + } + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int err = 0; + int werr = 0; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 end; + struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); + bool errors = false; + + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { + /* + * Ignore -ENOMEM errors returned by clear_extent_bit(). + * When committing the transaction, we'll remove any entries + * left in the io tree. For a log commit, we don't remove them + * after committing the log because the tree can be accessed + * concurrently - we do it only at transaction commit time when + * it's safe to do it (through clear_btree_io_tree()). + */ + err = clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); + if (err == -ENOMEM) + err = 0; + if (!err) + err = filemap_fdatawait_range(mapping, start, end); + if (err) + werr = err; + free_extent_state(cached_state); + cached_state = NULL; + cond_resched(); + start = end + 1; + } + if (err) + werr = err; + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if ((mark & EXTENT_DIRTY) && + test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, + &btree_ino->runtime_flags)) + errors = true; + + if ((mark & EXTENT_NEW) && + test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, + &btree_ino->runtime_flags)) + errors = true; + } else { + if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR, + &btree_ino->runtime_flags)) + errors = true; + } + + if (errors && !werr) + werr = -EIO; + + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int ret2; + struct blk_plug plug; + + blk_start_plug(&plug); + ret = btrfs_write_marked_extents(root, dirty_pages, mark); + blk_finish_plug(&plug); + ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); + + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + ret = btrfs_write_and_wait_marked_extents(root, + &trans->transaction->dirty_pages, + EXTENT_DIRTY); + clear_btree_io_tree(&trans->transaction->dirty_pages); + + return ret; +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. + */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + u64 old_root_used; + struct btrfs_root *tree_root = root->fs_info->tree_root; + + old_root_used = btrfs_root_used(&root->root_item); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start && + old_root_used == btrfs_root_used(&root->root_item)) + break; + + btrfs_set_root_node(&root->root_item, root->node); + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + if (ret) + return ret; + + old_root_used = btrfs_root_used(&root->root_item); + } + + return 0; +} + +/* + * update all the cowonly tree roots on disk + * + * The error handling in this function may not be obvious. Any of the + * failures will cause the file system to go offline. We still need + * to clean up the delayed refs. + */ +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; + struct list_head *io_bgs = &trans->transaction->io_bgs; + struct list_head *next; + struct extent_buffer *eb; + int ret; + + eb = btrfs_lock_root_node(fs_info->tree_root); + ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, + 0, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + if (ret) + return ret; + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + + ret = btrfs_run_dev_stats(trans, root->fs_info); + if (ret) + return ret; + ret = btrfs_run_dev_replace(trans, root->fs_info); + if (ret) + return ret; + ret = btrfs_run_qgroups(trans, root->fs_info); + if (ret) + return ret; + + ret = btrfs_setup_space_cache(trans, root); + if (ret) + return ret; + + /* run_qgroups might have added some more refs */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; +again: + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + clear_bit(BTRFS_ROOT_DIRTY, &root->state); + + if (root != fs_info->extent_root) + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + ret = update_cowonly_root(trans, root); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + } + + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + } + + if (!list_empty(&fs_info->dirty_cowonly_roots)) + goto again; + + list_add_tail(&fs_info->extent_root->dirty_list, + &trans->transaction->switch_commits); + btrfs_after_dev_replace_commit(fs_info); + + return 0; +} + +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +void btrfs_add_dead_root(struct btrfs_root *root) +{ + spin_lock(&root->fs_info->trans_lock); + if (list_empty(&root->root_list)) + list_add_tail(&root->root_list, &root->fs_info->dead_roots); + spin_unlock(&root->fs_info->trans_lock); +} + +/* + * update all the cowonly tree roots on disk + */ +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *gang[8]; + struct btrfs_fs_info *fs_info = root->fs_info; + int i; + int ret; + int err = 0; + + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + + btrfs_free_log(trans, root); + btrfs_update_reloc_root(trans, root); + btrfs_orphan_commit_root(trans, root); + + btrfs_save_ino_cache(root, trans); + + /* see comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); + + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, + root->node); + } + + err = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + spin_lock(&fs_info->fs_roots_radix_lock); + if (err) + break; + } + } + spin_unlock(&fs_info->fs_roots_radix_lock); + return err; +} + +/* + * defrag a given btree. + * Every leaf in the btree is read and defragged. + */ +int btrfs_defrag_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_trans_handle *trans; + int ret; + + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) + return 0; + + while (1) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_defrag_leaves(trans, root); + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root); + cond_resched(); + + if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) + break; + + if (btrfs_defrag_cancelled(root->fs_info)) { + pr_debug("BTRFS: defrag_root cancelled\n"); + ret = -EAGAIN; + break; + } + } + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); + return ret; +} + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. This does the actual creation. + * + * Note: + * If the error which may affect the commitment of the current transaction + * happens, we should return the error number. If the error which just affect + * the creation of the pending snapshots, just return 0. + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct btrfs_root *parent_root; + struct btrfs_block_rsv *rsv; + struct inode *parent_inode; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct dentry *dentry; + struct extent_buffer *tmp; + struct extent_buffer *old; + struct timespec cur_time = CURRENT_TIME; + int ret = 0; + u64 to_reserve = 0; + u64 index = 0; + u64 objectid; + u64 root_flags; + uuid_le new_uuid; + + path = btrfs_alloc_path(); + if (!path) { + pending->error = -ENOMEM; + return 0; + } + + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); + if (!new_root_item) { + pending->error = -ENOMEM; + goto root_item_alloc_fail; + } + + pending->error = btrfs_find_free_objectid(tree_root, &objectid); + if (pending->error) + goto no_free_objectid; + + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + + if (to_reserve > 0) { + pending->error = btrfs_block_rsv_add(root, + &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); + if (pending->error) + goto no_free_objectid; + } + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; + + rsv = trans->block_rsv; + trans->block_rsv = &pending->block_rsv; + trans->bytes_reserved = trans->block_rsv->reserved; + + dentry = pending->dentry; + parent_inode = pending->dir; + parent_root = BTRFS_I(parent_inode)->root; + record_root_in_trans(trans, parent_root); + + /* + * insert the directory item + */ + ret = btrfs_set_inode_index(parent_inode, &index); + BUG_ON(ret); /* -ENOMEM */ + + /* check if there is a file/dir which has the same name. */ + dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, + btrfs_ino(parent_inode), + dentry->d_name.name, + dentry->d_name.len, 0); + if (dir_item != NULL && !IS_ERR(dir_item)) { + pending->error = -EEXIST; + goto dir_item_existed; + } else if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + btrfs_release_path(path); + + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + if (ret) { /* Transaction aborted */ + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + record_root_in_trans(trans, root); + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + btrfs_check_and_init_root_item(new_root_item); + + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + + btrfs_set_root_generation_v2(new_root_item, + trans->transid); + uuid_le_gen(&new_uuid); + memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + memcpy(new_root_item->parent_uuid, root->root_item.uuid, + BTRFS_UUID_SIZE); + if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { + memset(new_root_item->received_uuid, 0, + sizeof(new_root_item->received_uuid)); + memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); + memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); + btrfs_set_root_stransid(new_root_item, 0); + btrfs_set_root_rtransid(new_root_item, 0); + } + btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); + btrfs_set_root_otransid(new_root_item, trans->transid); + + old = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + if (ret) { + btrfs_tree_unlock(old); + free_extent_buffer(old); + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + btrfs_set_lock_blocking(old); + + ret = btrfs_copy_root(trans, root, old, &tmp, objectid); + /* clean up in any case */ + btrfs_tree_unlock(old); + free_extent_buffer(old); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * We need to flush delayed refs in order to make sure all of our quota + * operations have been done before we call btrfs_qgroup_inherit. + */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_wmb(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * insert root back/forward references + */ + ret = btrfs_add_root_ref(trans, tree_root, objectid, + parent_root->root_key.objectid, + btrfs_ino(parent_inode), index, + dentry->d_name.name, dentry->d_name.len); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + key.offset = (u64)-1; + pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); + if (IS_ERR(pending->snap)) { + ret = PTR_ERR(pending->snap); + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_reloc_post_snapshot(trans, pending); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_insert_dir_item(trans, parent_root, + dentry->d_name.name, dentry->d_name.len, + parent_inode, &key, + BTRFS_FT_DIR, index); + /* We have check then name at the beginning, so it is impossible. */ + BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, + BTRFS_UUID_KEY_SUBVOL, objectid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + new_root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + } +fail: + pending->error = ret; +dir_item_existed: + trans->block_rsv = rsv; + trans->bytes_reserved = 0; +no_free_objectid: + kfree(new_root_item); +root_item_alloc_fail: + btrfs_free_path(path); + return ret; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending, *next; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret = 0; + + list_for_each_entry_safe(pending, next, head, list) { + list_del(&pending->list); + ret = create_pending_snapshot(trans, fs_info, pending); + if (ret) + break; + } + return ret; +} + +static void update_super_roots(struct btrfs_root *root) +{ + struct btrfs_root_item *root_item; + struct btrfs_super_block *super; + + super = root->fs_info->super_copy; + + root_item = &root->fs_info->chunk_root->root_item; + super->chunk_root = root_item->bytenr; + super->chunk_root_generation = root_item->generation; + super->chunk_root_level = root_item->level; + + root_item = &root->fs_info->tree_root->root_item; + super->root = root_item->bytenr; + super->generation = root_item->generation; + super->root_level = root_item->level; + if (btrfs_test_opt(root, SPACE_CACHE)) + super->cache_generation = root_item->generation; + if (root->fs_info->update_uuid_tree_gen) + super->uuid_tree_generation = root_item->generation; +} + +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ + struct btrfs_transaction *trans; + int ret = 0; + + spin_lock(&info->trans_lock); + trans = info->running_transaction; + if (trans) + ret = (trans->state >= TRANS_STATE_COMMIT_START); + spin_unlock(&info->trans_lock); + return ret; +} + +int btrfs_transaction_blocked(struct btrfs_fs_info *info) +{ + struct btrfs_transaction *trans; + int ret = 0; + + spin_lock(&info->trans_lock); + trans = info->running_transaction; + if (trans) + ret = is_transaction_blocked(trans); + spin_unlock(&info->trans_lock); + return ret; +} + +/* + * wait for the current transaction commit to start and block subsequent + * transaction joins + */ +static void wait_current_trans_commit_start(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + wait_event(root->fs_info->transaction_blocked_wait, + trans->state >= TRANS_STATE_COMMIT_START || + trans->aborted); +} + +/* + * wait for the current transaction to start and then become unblocked. + * caller holds ref. + */ +static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + wait_event(root->fs_info->transaction_wait, + trans->state >= TRANS_STATE_UNBLOCKED || + trans->aborted); +} + +/* + * commit transactions asynchronously. once btrfs_commit_transaction_async + * returns, any subsequent transaction will not be allowed to join. + */ +struct btrfs_async_commit { + struct btrfs_trans_handle *newtrans; + struct btrfs_root *root; + struct work_struct work; +}; + +static void do_async_commit(struct work_struct *work) +{ + struct btrfs_async_commit *ac = + container_of(work, struct btrfs_async_commit, work); + + /* + * We've got freeze protection passed with the transaction. + * Tell lockdep about it. + */ + if (ac->newtrans->type & __TRANS_FREEZABLE) + rwsem_acquire_read( + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + + current->journal_info = ac->newtrans; + + btrfs_commit_transaction(ac->newtrans, ac->root); + kfree(ac); +} + +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock) +{ + struct btrfs_async_commit *ac; + struct btrfs_transaction *cur_trans; + + ac = kmalloc(sizeof(*ac), GFP_NOFS); + if (!ac) + return -ENOMEM; + + INIT_WORK(&ac->work, do_async_commit); + ac->root = root; + ac->newtrans = btrfs_join_transaction(root); + if (IS_ERR(ac->newtrans)) { + int err = PTR_ERR(ac->newtrans); + kfree(ac); + return err; + } + + /* take transaction reference */ + cur_trans = trans->transaction; + atomic_inc(&cur_trans->use_count); + + btrfs_end_transaction(trans, root); + + /* + * Tell lockdep we've released the freeze rwsem, since the + * async commit thread will be the one to unlock it. + */ + if (ac->newtrans->type & __TRANS_FREEZABLE) + rwsem_release( + &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + + schedule_work(&ac->work); + + /* wait for transaction to start and unblock */ + if (wait_for_unblock) + wait_current_trans_commit_start_and_unblock(root, cur_trans); + else + wait_current_trans_commit_start(root, cur_trans); + + if (current->journal_info == trans) + current->journal_info = NULL; + + btrfs_put_transaction(cur_trans); + return 0; +} + + +static void cleanup_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int err) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + DEFINE_WAIT(wait); + + WARN_ON(trans->use_count > 1); + + btrfs_abort_transaction(trans, root, err); + + spin_lock(&root->fs_info->trans_lock); + + /* + * If the transaction is removed from the list, it means this + * transaction has been committed successfully, so it is impossible + * to call the cleanup function. + */ + BUG_ON(list_empty(&cur_trans->list)); + + list_del_init(&cur_trans->list); + if (cur_trans == root->fs_info->running_transaction) { + cur_trans->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + spin_lock(&root->fs_info->trans_lock); + } + spin_unlock(&root->fs_info->trans_lock); + + btrfs_cleanup_one_transaction(trans->transaction, root); + + spin_lock(&root->fs_info->trans_lock); + if (cur_trans == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); + + trace_btrfs_transaction_commit(root); + + if (current->journal_info == trans) + current->journal_info = NULL; + btrfs_scrub_cancel(root->fs_info); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); +} + +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + return btrfs_start_delalloc_roots(fs_info, 1, -1); + return 0; +} + +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + btrfs_wait_ordered_roots(fs_info, -1); +} + +static inline void +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_transaction *prev_trans = NULL; + struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); + int ret; + + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + btrfs_end_transaction(trans, root); + return ret; + } + + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } + + cur_trans = trans->transaction; + + /* + * set the flushing flag so procs in this transaction have to + * start sending their work down. + */ + cur_trans->delayed_refs.flushing = 1; + smp_wmb(); + + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, 0); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + if (!cur_trans->dirty_bg_run) { + int run_it = 0; + + /* this mutex is also taken before trying to set + * block groups readonly. We need to make sure + * that nobody has set a block group readonly + * after a extents from that block group have been + * allocated for cache files. btrfs_set_block_group_ro + * will wait for the transaction to commit if it + * finds dirty_bg_run = 1 + * + * The dirty_bg_run flag is also used to make sure only + * one process starts all the block group IO. It wouldn't + * hurt to have more than one go through, but there's no + * real advantage to it either. + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (!cur_trans->dirty_bg_run) { + run_it = 1; + cur_trans->dirty_bg_run = 1; + } + mutex_unlock(&root->fs_info->ro_block_group_mutex); + + if (run_it) + ret = btrfs_start_dirty_block_groups(trans, root); + } + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + spin_lock(&root->fs_info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); + if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + spin_unlock(&root->fs_info->trans_lock); + atomic_inc(&cur_trans->use_count); + ret = btrfs_end_transaction(trans, root); + + wait_for_commit(root, cur_trans); + + if (unlikely(cur_trans->aborted)) + ret = cur_trans->aborted; + + btrfs_put_transaction(cur_trans); + + return ret; + } + + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&root->fs_info->transaction_blocked_wait); + + if (cur_trans->list.prev != &root->fs_info->trans_list) { + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (prev_trans->state != TRANS_STATE_COMPLETED) { + atomic_inc(&prev_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + wait_for_commit(root, prev_trans); + + btrfs_put_transaction(prev_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); + } + } else { + spin_unlock(&root->fs_info->trans_lock); + } + + extwriter_counter_dec(cur_trans, trans->type); + + ret = btrfs_start_delalloc_flush(root->fs_info); + if (ret) + goto cleanup_transaction; + + ret = btrfs_run_delayed_items(trans, root); + if (ret) + goto cleanup_transaction; + + wait_event(cur_trans->writer_wait, + extwriter_counter_read(cur_trans) == 0); + + /* some pending stuffs might be added after the previous flush. */ + ret = btrfs_run_delayed_items(trans, root); + if (ret) + goto cleanup_transaction; + + btrfs_wait_delalloc_flush(root->fs_info); + + btrfs_wait_pending_ordered(cur_trans, root->fs_info); + + btrfs_scrub_pause(root); + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * COMMIT_DOING so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&root->fs_info->trans_lock); + cur_trans->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto scrub_continue; + } + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); + + /* + * We needn't worry about the delayed items because we will + * deal with them in create_pending_snapshot(), which is the + * core function of the snapshot creation. + */ + ret = create_pending_snapshots(trans, root->fs_info); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + /* + * We insert the dir indexes of the snapshots and update the inode + * of the snapshots' parents after the snapshot creation, so there + * are some delayed items which are not dealt with. Now deal with + * them. + * + * We needn't worry that this operation will corrupt the snapshots, + * because all the tree which are snapshoted will be forced to COW + * the nodes and leaves. + */ + ret = btrfs_run_delayed_items(trans, root); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); + + WARN_ON(cur_trans != trans->transaction); + + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree. So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, root); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + /* + * Since the transaction is done, we can apply the pending changes + * before the next transaction. + */ + btrfs_apply_pending_changes(root->fs_info); + + /* commit_fs_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + + ret = commit_cowonly_roots(trans, root); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + btrfs_prepare_extent_commit(trans, root); + + cur_trans = root->fs_info->running_transaction; + + btrfs_set_root_node(&root->fs_info->tree_root->root_item, + root->fs_info->tree_root->node); + list_add_tail(&root->fs_info->tree_root->dirty_list, + &cur_trans->switch_commits); + + btrfs_set_root_node(&root->fs_info->chunk_root->root_item, + root->fs_info->chunk_root->node); + list_add_tail(&root->fs_info->chunk_root->dirty_list, + &cur_trans->switch_commits); + + switch_commit_roots(cur_trans, root->fs_info); + + assert_qgroups_uptodate(trans); + ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); + update_super_roots(root); + + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); + + btrfs_update_commit_device_size(root->fs_info); + btrfs_update_commit_device_bytes_used(root, cur_trans); + + clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); + clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + + spin_lock(&root->fs_info->trans_lock); + cur_trans->state = TRANS_STATE_UNBLOCKED; + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); + + wake_up(&root->fs_info->transaction_wait); + + ret = btrfs_write_and_wait_transaction(trans, root); + if (ret) { + btrfs_error(root->fs_info, ret, + "Error while writing out transaction"); + mutex_unlock(&root->fs_info->tree_log_mutex); + goto scrub_continue; + } + + ret = write_ctree_super(trans, root, 0); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + goto scrub_continue; + } + + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + + btrfs_finish_extent_commit(trans, root); + + if (cur_trans->have_free_bgs) + btrfs_clear_space_info_full(root->fs_info); + + root->fs_info->last_trans_committed = cur_trans->transid; + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); + + spin_lock(&root->fs_info->trans_lock); + list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); + + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); + + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + + trace_btrfs_transaction_commit(root); + + btrfs_scrub_continue(root); + + if (current->journal_info == trans) + current->journal_info = NULL; + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (current != root->fs_info->transaction_kthread) + btrfs_run_delayed_iputs(root); + + return ret; + +scrub_continue: + btrfs_scrub_continue(root); +cleanup_transaction: + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } + btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); + if (current->journal_info == trans) + current->journal_info = NULL; + cleanup_transaction(trans, root, ret); + + return ret; +} + +/* + * return < 0 if error + * 0 if there are no more dead_roots at the time of call + * 1 there are more to be processed, call me again + * + * The return value indicates there are certainly more snapshots to delete, but + * if there comes a new one during processing, it may return 0. We don't mind, + * because btrfs_commit_super will poke cleaner thread and it will process it a + * few seconds later. + */ +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + + spin_lock(&fs_info->trans_lock); + if (list_empty(&fs_info->dead_roots)) { + spin_unlock(&fs_info->trans_lock); + return 0; + } + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + spin_unlock(&fs_info->trans_lock); + + pr_debug("BTRFS: cleaner removing %llu\n", root->objectid); + + btrfs_kill_all_delayed_nodes(root); + + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + ret = btrfs_drop_snapshot(root, NULL, 0, 0); + else + ret = btrfs_drop_snapshot(root, NULL, 1, 0); + + return (ret < 0) ? 0 : 1; +} + +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) +{ + unsigned long prev; + unsigned long bit; + + prev = xchg(&fs_info->pending_changes, 0); + if (!prev) + return; + + bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; + if (prev & bit) + btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; + if (prev & bit) + btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_COMMIT; + if (prev & bit) + btrfs_debug(fs_info, "pending commit done"); + prev &= ~bit; + + if (prev) + btrfs_warn(fs_info, + "unknown pending changes left 0x%lx, ignoring", prev); +} diff --git a/kernel/fs/btrfs/transaction.h b/kernel/fs/btrfs/transaction.h new file mode 100644 index 000000000..0b2475559 --- /dev/null +++ b/kernel/fs/btrfs/transaction.h @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ +#include "btrfs_inode.h" +#include "delayed-ref.h" +#include "ctree.h" + +enum btrfs_trans_state { + TRANS_STATE_RUNNING = 0, + TRANS_STATE_BLOCKED = 1, + TRANS_STATE_COMMIT_START = 2, + TRANS_STATE_COMMIT_DOING = 3, + TRANS_STATE_UNBLOCKED = 4, + TRANS_STATE_COMPLETED = 5, + TRANS_STATE_MAX = 6, +}; + +struct btrfs_transaction { + u64 transid; + /* + * total external writers(USERSPACE/START/ATTACH) in this + * transaction, it must be zero before the transaction is + * being committed + */ + atomic_t num_extwriters; + /* + * total writers in this transaction, it must be zero before the + * transaction can end + */ + atomic_t num_writers; + atomic_t use_count; + + /* + * true if there is free bgs operations in this transaction + */ + int have_free_bgs; + + /* Be protected by fs_info->trans_lock when we want to change it. */ + enum btrfs_trans_state state; + struct list_head list; + struct extent_io_tree dirty_pages; + unsigned long start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; + struct list_head pending_chunks; + struct list_head pending_ordered; + struct list_head switch_commits; + struct list_head dirty_bgs; + struct list_head io_bgs; + u64 num_dirty_bgs; + + /* + * we need to make sure block group deletion doesn't race with + * free space cache writeout. This mutex keeps them from stomping + * on each other + */ + struct mutex cache_write_mutex; + spinlock_t dirty_bgs_lock; + struct btrfs_delayed_ref_root delayed_refs; + int aborted; + int dirty_bg_run; +}; + +#define __TRANS_FREEZABLE (1U << 0) + +#define __TRANS_USERSPACE (1U << 8) +#define __TRANS_START (1U << 9) +#define __TRANS_ATTACH (1U << 10) +#define __TRANS_JOIN (1U << 11) +#define __TRANS_JOIN_NOLOCK (1U << 12) +#define __TRANS_DUMMY (1U << 13) + +#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) +#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) +#define TRANS_ATTACH (__TRANS_ATTACH) +#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) +#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) + +#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ + __TRANS_ATTACH) + +#define BTRFS_SEND_TRANS_STUB ((void *)1) + +struct btrfs_trans_handle { + u64 transid; + u64 bytes_reserved; + u64 qgroup_reserved; + unsigned long use_count; + unsigned long blocks_reserved; + unsigned long blocks_used; + unsigned long delayed_ref_updates; + struct btrfs_transaction *transaction; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *orig_rsv; + short aborted; + short adding_csums; + bool allocating_chunk; + bool reloc_reserved; + bool sync; + unsigned int type; + /* + * this root is only needed to validate that the root passed to + * start_transaction is the same as the one passed to end_transaction. + * Subvolume quota depends on this + */ + struct btrfs_root *root; + struct seq_list delayed_ref_elem; + struct list_head ordered; + struct list_head qgroup_ref_list; + struct list_head new_bgs; +}; + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct inode *dir; + struct btrfs_root *root; + struct btrfs_root *snap; + struct btrfs_qgroup_inherit *inherit; + /* block reservation for the operation */ + struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; + /* extra metadata reseration for relocation */ + int error; + bool readonly; + struct list_head list; +}; + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + spin_unlock(&BTRFS_I(inode)->lock); +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_items); +struct btrfs_trans_handle *btrfs_start_transaction_lflush( + struct btrfs_root *root, int num_items); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); + +void btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root); +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_transaction_blocked(struct btrfs_fs_info *info); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +void btrfs_put_transaction(struct btrfs_transaction *transaction); +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); + +#endif diff --git a/kernel/fs/btrfs/tree-defrag.c b/kernel/fs/btrfs/tree-defrag.c new file mode 100644 index 000000000..a63719cc9 --- /dev/null +++ b/kernel/fs/btrfs/tree-defrag.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" + +/* + * Defrag all the leaves in a given btree. + * Read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int next_key_ret = 0; + u64 last_ret = 0; + u64 min_trans = 0; + + if (root->fs_info->extent_root == root) { + /* + * there's recursion here right now in the tree locking, + * we can't defrag the extent root without deadlock + */ + goto out; + } + + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + goto out; + + if (btrfs_test_opt(root, SSD)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(root_node); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &key, path, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(path); + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + min_trans); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + &last_ret, + &root->defrag_progress); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } +out: + if (path) + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; + } + return ret; +} diff --git a/kernel/fs/btrfs/tree-log.c b/kernel/fs/btrfs/tree-log.c new file mode 100644 index 000000000..d04968374 --- /dev/null +++ b/kernel/fs/btrfs/tree-log.c @@ -0,0 +1,5089 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include "tree-log.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "backref.h" +#include "hash.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant. With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_DIR_INDEX 2 +#define LOG_WALK_REPLAY_ALL 3 + +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end, + struct btrfs_log_ctx *ctx); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree an 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + int index; + int ret; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { + ret = -EAGAIN; + goto out; + } + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + } else if (root->log_start_pid != current->pid) { + set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + } + + atomic_inc(&root->log_batch); + atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } + mutex_unlock(&root->log_mutex); + return 0; + } + + ret = 0; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) + ret = btrfs_init_log_root_tree(trans, root->fs_info); + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; + + if (!root->log_root) { + ret = btrfs_add_log_tree(trans, root); + if (ret) + goto out; + } + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; + atomic_inc(&root->log_batch); + atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } +out: + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->log_writers); + } + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +int btrfs_pin_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + mutex_lock(&root->log_mutex); + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +void btrfs_end_log_trans(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->log_writers)) { + smp_mb(); + if (waitqueue_active(&root->log_writer_wait)) + wake_up(&root->log_writer_wait); + } +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int ret = 0; + + /* + * If this fs is mixed then we need to be able to process the leaves to + * pin down any logged extents, so we have to read the block. + */ + if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; + } + + if (wc->pin) + ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, + eb->start, eb->len); + + if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { + if (wc->pin && btrfs_header_level(eb) == 0) + ret = btrfs_exclude_logged_extents(log, eb); + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return ret; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(path); + return 0; + } + + /* + * We need to load the old nbytes into the inode so when we + * replay the extents we've logged we get the right nbytes. + */ + if (inode_item) { + struct btrfs_inode_item *item; + u64 nbytes; + u32 mode; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + nbytes = btrfs_inode_nbytes(path->nodes[0], item); + item = btrfs_item_ptr(eb, slot, + struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, nbytes); + + /* + * If this is a directory we need to reset the i_size to + * 0 so that we can set it up properly when replaying + * the rest of the items in this log. + */ + mode = btrfs_inode_mode(eb, item); + if (S_ISDIR(mode)) + btrfs_set_inode_size(eb, item, 0); + } + } else if (inode_item) { + struct btrfs_inode_item *item; + u32 mode; + + /* + * New inode, set nbytes to 0 so that the nbytes comes out + * properly when we replay the extents. + */ + item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, 0); + + /* + * If this is a directory we need to reset the i_size to 0 so + * that we can set it up properly when replaying the rest of + * the items in this log. + */ + mode = btrfs_inode_mode(eb, item); + if (S_ISDIR(mode)) + btrfs_set_inode_size(eb, item, 0); + } +insert: + btrfs_release_path(path); + /* try to insert the key into the destination tree */ + path->skip_release_on_error = 1; + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + path->skip_release_on_error = 0; + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST || ret == -EOVERFLOW) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) + btrfs_truncate_item(root, path, item_size, 1); + else if (found_size < item_size) + btrfs_extend_item(root, path, + item_size - found_size); + } else if (ret) { + return ret; + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't overwrite an existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) { + struct extent_buffer *dst_eb = path->nodes[0]; + const u64 ino_size = btrfs_inode_size(eb, src_item); + + /* + * For regular files an ino_size == 0 is used only when + * logging that an inode exists, as part of a directory + * fsync, and the inode wasn't fsynced before. In this + * case don't set the size of the inode in the fs/subvol + * tree, otherwise we would be throwing valid data away. + */ + if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && + ino_size != 0) { + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + btrfs_set_token_inode_size(dst_eb, dst_item, + ino_size, &token); + } + goto no_copy; + } + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct btrfs_key key; + struct inode *inode; + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + inode = NULL; + } else if (is_bad_inode(inode)) { + iput(inode); + inode = NULL; + } + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. + */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 extent_end; + u64 start = key->offset; + u64 nbytes = 0; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + nbytes = btrfs_file_extent_num_bytes(eb, item); + extent_end = start + nbytes; + + /* + * We don't add to the inodes nbytes if we are prealloc or a + * hole. + */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + nbytes = 0; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, slot, item); + nbytes = btrfs_file_extent_ram_bytes(eb, item); + extent_end = ALIGN(start + size, root->sectorsize); + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), + start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(path); + goto out; + } + } + btrfs_release_path(path); + + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); + if (ret) + goto out; + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + if (ret) + goto out; + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); + + if (ins.objectid > 0) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + /* + * is this extent already allocated in the extent + * allocation tree? If so, just add a reference + */ + ret = btrfs_lookup_data_extent(root, ins.objectid, + ins.offset); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + 0, root->root_key.objectid, + key->objectid, offset, 0); + if (ret) + goto out; + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_file_extent(trans, + root, root->root_key.objectid, + key->objectid, offset, &ins); + if (ret) + goto out; + } + btrfs_release_path(path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_range(root->log_root, + csum_start, csum_end - 1, + &ordered_sums, 0); + if (ret) + goto out; + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, + root->fs_info->csum_root, + sums); + list_del(&sums->list); + kfree(sums); + } + if (ret) + goto out; + } else { + btrfs_release_path(path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + if (ret) + goto out; + } + + inode_add_bytes(inode, nbytes); + ret = btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. + * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(path); + + inode = read_one_inode(root, location.objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + if (ret) + goto out; + + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + if (ret) + goto out; + else + ret = btrfs_run_delayed_items(trans, root); +out: + kfree(name); + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + u64 ref_objectid, + const char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, namelen, NULL)) + match = 1; + + goto out; + } + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + +static inline int __add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_root *log_root, + struct inode *dir, struct inode *inode, + struct extent_buffer *eb, + u64 inode_objectid, u64 parent_objectid, + u64 ref_index, char *name, int namelen, + int *search_done) +{ + int ret; + char *victim_name; + int victim_name_len; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key search_key; + struct btrfs_inode_extref *extref; + +again: + /* Search old style refs */ + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = parent_objectid; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret == 0) { + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + + leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (search_key.objectid == search_key.offset) + return 1; + + /* check all the names in this back reference to see + * if they are in the log. if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + if (!victim_name) + return -ENOMEM; + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log_root, &search_key, + parent_objectid, + victim_name, + victim_name_len)) { + inc_nlink(inode); + btrfs_release_path(path); + + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + kfree(victim_name); + if (ret) + return ret; + ret = btrfs_run_delayed_items(trans, root); + if (ret) + return ret; + *search_done = 1; + goto again; + } + kfree(victim_name); + + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + + /* + * NOTE: we have searched root tree and checked the + * coresponding ref, it does not need to check again. + */ + *search_done = 1; + } + btrfs_release_path(path); + + /* Same search but for extended refs */ + extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + inode_objectid, parent_objectid, 0, + 0); + if (!IS_ERR_OR_NULL(extref)) { + u32 item_size; + u32 cur_offset = 0; + unsigned long base; + struct inode *victim_parent; + + leaf = path->nodes[0]; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + base = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(base + cur_offset); + + victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + victim_name = kmalloc(victim_name_len, GFP_NOFS); + if (!victim_name) + return -ENOMEM; + read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, + victim_name_len); + + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(parent_objectid, + victim_name, + victim_name_len); + ret = 0; + if (!backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len)) { + ret = -ENOENT; + victim_parent = read_one_inode(root, + parent_objectid); + if (victim_parent) { + inc_nlink(inode); + btrfs_release_path(path); + + ret = btrfs_unlink_inode(trans, root, + victim_parent, + inode, + victim_name, + victim_name_len); + if (!ret) + ret = btrfs_run_delayed_items( + trans, root); + } + iput(victim_parent); + kfree(victim_name); + if (ret) + return ret; + *search_done = 1; + goto again; + } + kfree(victim_name); + if (ret) + return ret; +next: + cur_offset += victim_name_len + sizeof(*extref); + } + *search_done = 1; + } + btrfs_release_path(path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + ref_index, name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + if (ret) + return ret; + } + btrfs_release_path(path); + + /* look for a conflicing name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + if (ret) + return ret; + } + btrfs_release_path(path); + + return 0; +} + +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index, + u64 *parent_objectid) +{ + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ref_ptr; + + *namelen = btrfs_inode_extref_name_len(eb, extref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)&extref->name, + *namelen); + + *index = btrfs_inode_extref_index(eb, extref); + if (parent_objectid) + *parent_objectid = btrfs_inode_extref_parent(eb, extref); + + return 0; +} + +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index) +{ + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ref_ptr; + + *namelen = btrfs_inode_ref_name_len(eb, ref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + + *index = btrfs_inode_ref_index(eb, ref); + + return 0; +} + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir = NULL; + struct inode *inode = NULL; + unsigned long ref_ptr; + unsigned long ref_end; + char *name = NULL; + int namelen; + int ret; + int search_done = 0; + int log_ref_ver = 0; + u64 parent_objectid; + u64 inode_objectid; + u64 ref_index = 0; + int ref_struct_size; + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *r; + + ref_struct_size = sizeof(struct btrfs_inode_extref); + log_ref_ver = 1; + r = (struct btrfs_inode_extref *)ref_ptr; + parent_objectid = btrfs_inode_extref_parent(eb, r); + } else { + ref_struct_size = sizeof(struct btrfs_inode_ref); + parent_objectid = key->offset; + } + inode_objectid = key->objectid; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, parent_objectid); + if (!dir) { + ret = -ENOENT; + goto out; + } + + inode = read_one_inode(root, inode_objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + while (ref_ptr < ref_end) { + if (log_ref_ver) { + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index, &parent_objectid); + /* + * parent object can change from one array + * item to another. + */ + if (!dir) + dir = read_one_inode(root, parent_objectid); + if (!dir) { + ret = -ENOENT; + goto out; + } + } else { + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index); + } + if (ret) + goto out; + + /* if we already have a perfect match, we're done */ + if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, name, namelen)) { + /* + * look for a conflicting back reference in the + * metadata. if we find one we have to unlink that name + * of the file before we add our new link. Later on, we + * overwrite any existing back reference, and we don't + * want to create dangling pointers in the directory. + */ + + if (!search_done) { + ret = __add_inode_ref(trans, root, path, log, + dir, inode, eb, + inode_objectid, + parent_objectid, + ref_index, name, namelen, + &search_done); + if (ret) { + if (ret == 1) + ret = 0; + goto out; + } + } + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, + 0, ref_index); + if (ret) + goto out; + + btrfs_update_inode(trans, root, inode); + } + + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; + kfree(name); + name = NULL; + if (log_ref_ver) { + iput(dir); + dir = NULL; + } + } + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); +out: + btrfs_release_path(path); + kfree(name); + iput(dir); + iput(inode); + return ret; +} + +static int insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 ino) +{ + int ret; + + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; + + return ret; +} + +static int count_inode_extrefs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) +{ + int ret = 0; + int name_len; + unsigned int nlink = 0; + u32 item_size; + u32 cur_offset = 0; + u64 inode_objectid = btrfs_ino(inode); + u64 offset = 0; + unsigned long ptr; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + + while (1) { + ret = btrfs_find_one_extref(root, inode_objectid, offset, path, + &extref, &offset); + if (ret) + break; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_len = btrfs_inode_extref_name_len(leaf, extref); + + nlink++; + + cur_offset += name_len + sizeof(*extref); + } + + offset++; + btrfs_release_path(path); + } + btrfs_release_path(path); + + if (ret < 0 && ret != -ENOENT) + return ret; + return nlink; +} + +static int count_inode_refs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + unsigned int nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + u64 ino = btrfs_ino(inode); + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } +process_slot: + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + if (path->slots[0] > 0) { + path->slots[0]--; + goto process_slot; + } + key.offset--; + btrfs_release_path(path); + } + btrfs_release_path(path); + + return nlink; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + u64 nlink = 0; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = count_inode_refs(root, inode, path); + if (ret < 0) + goto out; + + nlink = ret; + + ret = count_inode_extrefs(root, inode, path); + if (ret < 0) + goto out; + + nlink += ret; + + ret = 0; + + if (nlink != inode->i_nlink) { + set_nlink(inode, nlink); + btrfs_update_inode(trans, root, inode); + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + if (inode->i_nlink == 0) { + if (S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + ino, 1); + if (ret) + goto out; + } + ret = insert_orphan_item(trans, root, ino); + } + +out: + btrfs_free_path(path); + return ret; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + btrfs_release_path(path); + inode = read_one_inode(root, key.offset); + if (!inode) + return -EIO; + + ret = fixup_inode_link_count(trans, root, inode); + iput(inode); + if (ret) + goto out; + + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; + } + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + if (!inode) + return -EIO; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(path); + if (ret == 0) { + if (!inode->i_nlink) + set_nlink(inode, 1); + else + inc_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); /* Logic Error */ + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 index, + char *name, int name_len, u8 type, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +/* + * Return true if an inode reference exists in the log for the given name, + * inode and parent inode. + */ +static bool name_in_log_ref(struct btrfs_root *log_root, + const char *name, const int name_len, + const u64 dirid, const u64 ino) +{ + struct btrfs_key search_key; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = dirid; + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(dirid, name, name_len); + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + return false; +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. + */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + char *name; + int name_len; + struct btrfs_dir_item *dst_di; + struct btrfs_key found_key; + struct btrfs_key log_key; + struct inode *dir; + u8 log_type; + int exists; + int ret = 0; + bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + + dir = read_one_inode(root, key->objectid); + if (!dir) + return -EIO; + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + + log_type = btrfs_dir_type(eb, di); + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); + if (exists == 0) + exists = 1; + else + exists = 0; + btrfs_release_path(path); + + if (key->type == BTRFS_DIR_ITEM_KEY) { + dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + } else if (key->type == BTRFS_DIR_INDEX_KEY) { + dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, + key->offset, name, + name_len, 1); + } else { + /* Corruption */ + ret = -EINVAL; + goto out; + } + if (IS_ERR_OR_NULL(dst_di)) { + /* we need a sequence number to insert, so we only + * do inserts for the BTRFS_DIR_INDEX_KEY types + */ + if (key->type != BTRFS_DIR_INDEX_KEY) + goto out; + goto insert; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* the existing item matches the logged item */ + if (found_key.objectid == log_key.objectid && + found_key.type == log_key.type && + found_key.offset == log_key.offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + update_size = false; + goto out; + } + + /* + * don't drop the conflicting directory entry if the inode + * for the new entry doesn't exist + */ + if (!exists) + goto out; + + ret = drop_one_dir_item(trans, root, path, dir, dst_di); + if (ret) + goto out; + + if (key->type == BTRFS_DIR_INDEX_KEY) + goto insert; +out: + btrfs_release_path(path); + if (!ret && update_size) { + btrfs_i_size_write(dir, dir->i_size + name_len * 2); + ret = btrfs_update_inode(trans, root, dir); + } + kfree(name); + iput(dir); + return ret; + +insert: + if (name_in_log_ref(root->log_root, name, name_len, + key->objectid, log_key.objectid)) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } + btrfs_release_path(path); + ret = insert_one_name(trans, root, path, key->objectid, key->offset, + name, name_len, log_type, &log_key); + if (ret && ret != -ENOENT && ret != -EEXIST) + goto out; + update_size = false; + ret = 0; + goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + struct btrfs_dir_item *di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) + return -EIO; + name_len = btrfs_dir_name_len(eb, di); + ret = replay_one_name(trans, root, path, eb, di, key); + if (ret) + return ret; + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + return 0; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. + */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, int key_type, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = key_type; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } else { + path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +/* + * this looks for a given directory item in the log. If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + int ret; + struct extent_buffer *eb; + int slot; + u32 item_size; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + char *name; + struct inode *inode; + struct btrfs_key location; + +again: + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) { + ret = -EIO; + goto out; + } + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + log_di = NULL; + if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { + log_di = btrfs_lookup_dir_item(trans, log, log_path, + dir_key->objectid, + name, name_len, 0); + } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { + log_di = btrfs_lookup_dir_index_item(trans, log, + log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + } + if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(path); + btrfs_release_path(log_path); + inode = read_one_inode(root, location.objectid); + if (!inode) { + kfree(name); + return -EIO; + } + + ret = link_to_fixup_dir(trans, root, + path, location.objectid); + if (ret) { + kfree(name); + iput(inode); + goto out; + } + + inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + if (!ret) + ret = btrfs_run_delayed_items(trans, root); + kfree(name); + iput(inode); + if (ret) + goto out; + + /* there might still be more names under this key + * check and repeat if required + */ + ret = btrfs_search_slot(NULL, root, dir_key, path, + 0, 0); + if (ret == 0) + goto again; + ret = 0; + goto out; + } else if (IS_ERR(log_di)) { + kfree(name); + return PTR_ERR(log_di); + } + btrfs_release_path(log_path); + kfree(name); + + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + ret = 0; +out: + btrfs_release_path(path); + btrfs_release_path(log_path); + return ret; +} + +static int replay_xattr_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + const u64 ino) +{ + struct btrfs_key search_key; + struct btrfs_path *log_path; + int i; + int nritems; + int ret; + + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_XATTR_ITEM_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; +process_leaf: + nritems = btrfs_header_nritems(path->nodes[0]); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + u32 total_size; + u32 cur; + + btrfs_item_key_to_cpu(path->nodes[0], &key, i); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size_nr(path->nodes[0], i); + cur = 0; + while (cur < total_size) { + u16 name_len = btrfs_dir_name_len(path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u32 this_len = sizeof(*di) + name_len + data_len; + char *name; + + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(path->nodes[0], name, + (unsigned long)(di + 1), name_len); + + log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, + name, name_len, 0); + btrfs_release_path(log_path); + if (!log_di) { + /* Doesn't exist in log tree, so delete it. */ + btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, ino, + name, name_len, -1); + kfree(name); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + ASSERT(di); + ret = btrfs_delete_one_dir_name(trans, root, + path, di); + if (ret) + goto out; + btrfs_release_path(path); + search_key = key; + goto again; + } + kfree(name); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } + cur += this_len; + di = (struct btrfs_dir_item *)((char *)di + this_len); + } + } + ret = btrfs_next_leaf(root, path); + if (ret > 0) + ret = 0; + else if (ret == 0) + goto process_leaf; +out: + btrfs_free_path(log_path); + btrfs_release_path(path); + return ret; +} + + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. + */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all) +{ + u64 range_start; + u64 range_end; + int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_ITEM_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } +again: + range_start = 0; + range_end = 0; + while (1) { + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + } + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) + goto next_type; + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, root, log, path, + log_path, dir, + &found_key); + if (ret) + goto out; + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + +next_type: + ret = 0; + if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { + key_type = BTRFS_DIR_LOG_INDEX_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; + btrfs_release_path(path); + goto again; + } +out: + btrfs_release_path(path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). + */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int nritems; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + int level; + int i; + int ret; + + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + ret = replay_xattr_deletes(wc->trans, root, log, + path, key.objectid); + if (ret) + break; + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid, 0); + if (ret) + break; + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + + /* for regular files, make sure corresponding + * orhpan item exist. extents past the new EOF + * will be truncated later by orphan cleanup. + */ + if (S_ISREG(mode)) { + ret = insert_orphan_item(wc->trans, root, + key.objectid); + if (ret) + break; + } + + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + if (ret) + break; + } + + if (key.type == BTRFS_DIR_INDEX_KEY && + wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } + + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } else if (key.type == BTRFS_INODE_REF_KEY || + key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + if (ret && ret != -ENOENT) + break; + ret = 0; + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } else if (key.type == BTRFS_DIR_ITEM_KEY) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } + } + btrfs_free_path(path); + return ret; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u32 blocksize; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while (*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + WARN_ON(btrfs_header_level(cur) != *level); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = root->nodesize; + + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + + next = btrfs_find_create_tree_block(root, bytenr); + if (!next) + return -ENOMEM; + + if (*level == 1) { + ret = wc->process_func(root, next, wc, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } + + path->slots[*level]++; + if (wc->free) { + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } + + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, root->fs_info, + next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } + + WARN_ON(root_owner != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_and_pin_reserved_extent(root, + bytenr, blocksize); + if (ret) { + free_extent_buffer(next); + return ret; + } + } + free_extent_buffer(next); + continue; + } + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } + + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + ret = wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + if (ret) + return ret; + + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[*level]; + + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, root->fs_info, + next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_and_pin_reserved_extent(root, + path->nodes[*level]->start, + path->nodes[*level]->len); + if (ret) + return ret; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int orig_level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + extent_buffer_get(log->node); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) { + ret = wret; + goto out; + } + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) { + ret = wret; + goto out; + } + } + + /* was the root node processed? if not, catch it here */ + if (path->nodes[orig_level]) { + ret = wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level])); + if (ret) + goto out; + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[orig_level]; + + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, log->fs_info, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } + + WARN_ON(log->root_key.objectid != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_and_pin_reserved_extent(log, next->start, + next->len); + if (ret) + goto out; + } + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + + if (log->log_transid == 1) { + /* insert root item on the first sync */ + ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } else { + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } + return ret; +} + +static void wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int transid) +{ + DEFINE_WAIT(wait); + int index = transid % 2; + + /* + * we only allow two pending log transactions at a time, + * so we know that if ours is more than 2 older than the + * current transaction, we're done + */ + do { + prepare_to_wait(&root->log_commit_wait[index], + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + + if (root->log_transid_committed < transid && + atomic_read(&root->log_commit[index])) + schedule(); + + finish_wait(&root->log_commit_wait[index], &wait); + mutex_lock(&root->log_mutex); + } while (root->log_transid_committed < transid && + atomic_read(&root->log_commit[index])); +} + +static void wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + DEFINE_WAIT(wait); + + while (atomic_read(&root->log_writers)) { + prepare_to_wait(&root->log_writer_wait, + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (atomic_read(&root->log_writers)) + schedule(); + finish_wait(&root->log_writer_wait, &wait); + mutex_lock(&root->log_mutex); + } +} + +static inline void btrfs_remove_log_ctx(struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + if (!ctx) + return; + + mutex_lock(&root->log_mutex); + list_del_init(&ctx->list); + mutex_unlock(&root->log_mutex); +} + +/* + * Invoked in log mutex context, or be sure there is no other task which + * can access the list. + */ +static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, + int index, int error) +{ + struct btrfs_log_ctx *ctx; + + if (!error) { + INIT_LIST_HEAD(&root->log_ctxs[index]); + return; + } + + list_for_each_entry(ctx, &root->log_ctxs[index], list) + ctx->log_ret = error; + + INIT_LIST_HEAD(&root->log_ctxs[index]); +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. When this call is done, + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_log_ctx *ctx) +{ + int index1; + int index2; + int mark; + int ret; + struct btrfs_root *log = root->log_root; + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + int log_transid = 0; + struct btrfs_log_ctx root_log_ctx; + struct blk_plug plug; + + mutex_lock(&root->log_mutex); + log_transid = ctx->log_transid; + if (root->log_transid_committed >= log_transid) { + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + + index1 = log_transid % 2; + if (atomic_read(&root->log_commit[index1])) { + wait_log_commit(trans, root, log_transid); + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + ASSERT(log_transid == root->log_transid); + atomic_set(&root->log_commit[index1], 1); + + /* wait for previous tree log sync to complete */ + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) + wait_log_commit(trans, root, log_transid - 1); + + while (1) { + int batch = atomic_read(&root->log_batch); + /* when we're on an ssd, just kick the log commit out */ + if (!btrfs_test_opt(root, SSD) && + test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } + wait_for_writer(trans, root); + if (batch == atomic_read(&root->log_batch)) + break; + } + + /* bail out if we need to do a full commit */ + if (btrfs_need_log_full_commit(root->fs_info, trans)) { + ret = -EAGAIN; + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&root->log_mutex); + goto out; + } + + if (log_transid % 2 == 0) + mark = EXTENT_DIRTY; + else + mark = EXTENT_NEW; + + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + blk_start_plug(&plug); + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); + if (ret) { + blk_finish_plug(&plug); + btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); + btrfs_set_log_full_commit(root->fs_info, trans); + mutex_unlock(&root->log_mutex); + goto out; + } + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_transid++; + log->log_transid = root->log_transid; + root->log_start_pid = 0; + /* + * IO has been started, blocks of the log tree have WRITTEN flag set + * in their headers. new modifications of the log will be written to + * new positions. so it's safe to allow log writers to go in. + */ + mutex_unlock(&root->log_mutex); + + btrfs_init_log_ctx(&root_log_ctx); + + mutex_lock(&log_root_tree->log_mutex); + atomic_inc(&log_root_tree->log_batch); + atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + + mutex_unlock(&log_root_tree->log_mutex); + + ret = update_log_root(trans, log); + + mutex_lock(&log_root_tree->log_mutex); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { + smp_mb(); + if (waitqueue_active(&log_root_tree->log_writer_wait)) + wake_up(&log_root_tree->log_writer_wait); + } + + if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); + + blk_finish_plug(&plug); + btrfs_set_log_full_commit(root->fs_info, trans); + + if (ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out; + } + + if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + blk_finish_plug(&plug); + mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; + goto out; + } + + index2 = root_log_ctx.log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { + blk_finish_plug(&plug); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, + mark); + btrfs_wait_logged_extents(trans, log, log_transid); + wait_log_commit(trans, log_root_tree, + root_log_ctx.log_transid); + mutex_unlock(&log_root_tree->log_mutex); + if (!ret) + ret = root_log_ctx.log_ret; + goto out; + } + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); + atomic_set(&log_root_tree->log_commit[index2], 1); + + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(trans, log_root_tree, + root_log_ctx.log_transid - 1); + } + + wait_for_writer(trans, log_root_tree); + + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (btrfs_need_log_full_commit(root->fs_info, trans)) { + blk_finish_plug(&plug); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } + + ret = btrfs_write_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + blk_finish_plug(&plug); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + if (!ret) + ret = btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } + btrfs_wait_logged_extents(trans, log, log_transid); + + btrfs_set_super_log_root(root->fs_info->super_for_commit, + log_root_tree->node->start); + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, + btrfs_header_level(log_root_tree->node)); + + log_root_tree->log_transid++; + mutex_unlock(&log_root_tree->log_mutex); + + /* + * nobody else is going to jump in and write the the ctree + * super here because the log_commit atomic below is protecting + * us. We must be called with a transaction handle pinning + * the running transaction open, so a full commit can't hop + * in and cause problems either. + */ + ret = write_ctree_super(trans, root->fs_info->tree_root, 1); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_abort_transaction(trans, root, ret); + goto out_wake_log_root; + } + + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + +out_wake_log_root: + /* + * We needn't get log_mutex here because we are sure all + * the other tasks are blocked. + */ + btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_transid_committed++; + atomic_set(&log_root_tree->log_commit[index2], 0); + mutex_unlock(&log_root_tree->log_mutex); + + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) + wake_up(&log_root_tree->log_commit_wait[index2]); +out: + /* See above. */ + btrfs_remove_all_log_ctxs(root, index1, ret); + + mutex_lock(&root->log_mutex); + root->log_transid_committed++; + atomic_set(&root->log_commit[index1], 0); + mutex_unlock(&root->log_mutex); + + if (waitqueue_active(&root->log_commit_wait[index1])) + wake_up(&root->log_commit_wait[index1]); + return ret; +} + +static void free_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + u64 start; + u64 end; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; + + ret = walk_log_tree(trans, log, &wc); + /* I don't think this can happen but just in case */ + if (ret) + btrfs_abort_transaction(trans, log, ret); + + while (1) { + ret = find_first_extent_bit(&log->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, + NULL); + if (ret) + break; + + clear_extent_bits(&log->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + } + + /* + * We may have short-circuited the log tree with the full commit logic + * and left ordered extents on our list, so clear these out to keep us + * from leaking inodes and memory. + */ + btrfs_free_logged_extents(log, 0); + btrfs_free_logged_extents(log, 1); + + free_extent_buffer(log->node); + kfree(log); +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; + } + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (fs_info->log_root_tree) { + free_log_tree(trans, fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + } + return 0; +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist. But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked. Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. + * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. + */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int err = 0; + int bytes_del = 0; + u64 dir_ino = btrfs_ino(dir); + + if (BTRFS_I(dir)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out_unlock; + } + + di = btrfs_lookup_dir_item(trans, log, path, dir_ino, + name, name_len, -1); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + if (ret) { + err = ret; + goto fail; + } + } + btrfs_release_path(path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + if (ret) { + err = ret; + goto fail; + } + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(path); + } +fail: + btrfs_free_path(path); +out_unlock: + mutex_unlock(&BTRFS_I(dir)->log_mutex); + if (ret == -ENOSPC) { + btrfs_set_log_full_commit(root->fs_info, trans); + ret = 0; + } else if (ret < 0) + btrfs_abort_transaction(trans, root, ret); + + btrfs_end_log_trans(root); + + return err; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + if (BTRFS_I(inode)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + if (ret == -ENOSPC) { + btrfs_set_log_full_commit(root->fs_info, trans); + ret = 0; + } else if (ret < 0 && ret != -ENOENT) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_log_trans(root); + + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. + */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + if (ret) + return ret; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, int key_type, + struct btrfs_log_ctx *ctx, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src; + int err = 0; + int ret; + int i; + int nritems; + u64 first_offset = min_offset; + u64 last_offset = (u64)-1; + u64 ino = btrfs_ino(inode); + + log = root->log_root; + + min_key.objectid = ino; + min_key.type = key_type; + min_key.offset = min_offset; + + ret = btrfs_search_forward(root, &min_key, path, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { + min_key.objectid = ino; + min_key.type = key_type; + min_key.offset = (u64)-1; + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + ret = btrfs_previous_item(root, path, ino, key_type); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. + */ + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (key_type == tmp.type) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, ino, key_type); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (key_type == tmp.type) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + if (ret) { + err = ret; + goto done; + } + } + } + btrfs_release_path(path); + + /* find the first key from this transaction again */ + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (WARN_ON(ret != 0)) + goto done; + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + struct btrfs_key tmp; + src = path->nodes[0]; + nritems = btrfs_header_nritems(src); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + + btrfs_item_key_to_cpu(src, &min_key, i); + + if (min_key.objectid != ino || min_key.type != key_type) + goto done; + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + if (ret) { + err = ret; + goto done; + } + + /* + * We must make sure that when we log a directory entry, + * the corresponding inode, after log replay, has a + * matching link count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * + * + * + * Would result in a fsync log that when replayed, our + * file inode would have a link count of 1, but we get + * two directory entries pointing to the same inode. + * After removing one of the names, it would not be + * possible to remove the other name, which resulted + * always in stale file handle errors, and would not + * be possible to rmdir the parent directory, since + * its i_size could never decrement to the value + * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. + */ + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &tmp); + if (ctx && + (btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + tmp.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; + } + path->slots[0] = nritems; + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret == 1) { + last_offset = (u64)-1; + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (tmp.objectid != ino || tmp.type != key_type) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + if (ret) + err = ret; + else + last_offset = tmp.offset; + goto done; + } + } +done: + btrfs_release_path(path); + btrfs_release_path(dst_path); + + if (err == 0) { + *last_offset_ret = last_offset; + /* + * insert the log range keys to indicate where the log + * is valid + */ + ret = insert_dir_log_key(trans, log, path, key_type, + ino, first_offset, last_offset); + if (ret) + err = ret; + } + return err; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) +{ + u64 min_key; + u64 max_key; + int ret; + int key_type = BTRFS_DIR_ITEM_KEY; + +again: + min_key = 0; + max_key = 0; + while (1) { + ret = log_dir_items(trans, root, inode, path, + dst_path, key_type, ctx, min_key, + &max_key); + if (ret) + return ret; + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + if (key_type == BTRFS_DIR_ITEM_KEY) { + key_type = BTRFS_DIR_INDEX_KEY; + goto again; + } + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 objectid, int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + int start_slot; + + key.objectid = objectid; + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + BUG_ON(ret == 0); /* Logic error */ + if (ret < 0) + break; + + if (path->slots[0] == 0) + break; + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != objectid) + break; + + found_key.offset = 0; + found_key.type = 0; + ret = btrfs_bin_search(path->nodes[0], &found_key, 0, + &start_slot); + + ret = btrfs_del_items(trans, log, path, start_slot, + path->slots[0] - start_slot + 1); + /* + * If start slot isn't 0 then we don't need to re-search, we've + * found the last guy with the objectid in this tree. + */ + if (ret || start_slot != 0) + break; + btrfs_release_path(path); + } + btrfs_release_path(path); + if (ret > 0) + ret = 0; + return ret; +} + +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode, int log_inode_only, + u64 logged_isize) +{ + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + if (log_inode_only) { + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_token_inode_generation(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, logged_isize, &token); + } else { + btrfs_set_token_inode_generation(leaf, item, + BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); + } + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, &item->atime, + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->atime, + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, &item->mtime, + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->mtime, + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, &item->ctime, + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->ctime, + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); +} + +static int log_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct btrfs_path *path, + struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + int ret; + + ret = btrfs_insert_empty_item(trans, log, path, + &BTRFS_I(inode)->location, + sizeof(*inode_item)); + if (ret && ret != -EEXIST) + return ret; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); + btrfs_release_path(path); + return 0; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_path *dst_path, + struct btrfs_path *src_path, u64 *last_extent, + int start_slot, int nr, int inode_only, + u64 logged_isize) +{ + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_root *log = BTRFS_I(inode)->root->log_root; + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + struct extent_buffer *src = src_path->nodes[0]; + struct btrfs_key first_key, last_key, key; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + char *ins_data; + int i; + struct list_head ordered_sums; + int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + bool has_extents = false; + bool need_find_last_extent = true; + bool done = false; + + INIT_LIST_HEAD(&ordered_sums); + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + first_key.objectid = (u64)-1; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + + for (i = 0; i < nr; i++) { + ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } + ret = btrfs_insert_empty_items(trans, log, dst_path, + ins_keys, ins_sizes, nr); + if (ret) { + kfree(ins_data); + return ret; + } + + for (i = 0; i < nr; i++, dst_path->slots[0]++) { + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], + dst_path->slots[0]); + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + + if ((i == (nr - 1))) + last_key = ins_keys[i]; + + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, dst_path->nodes[0], inode_item, + inode, inode_only == LOG_INODE_EXISTS, + logged_isize); + } else { + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); + } + + /* + * We set need_find_last_extent here in case we know we were + * processing other items and then walk into the first extent in + * the inode. If we don't hit an extent then nothing changes, + * we'll do the last search the next time around. + */ + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { + has_extents = true; + if (first_key.objectid == (u64)-1) + first_key = ins_keys[i]; + } else { + need_find_last_extent = false; + } + + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again + */ + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && + !skip_csum) { + int found_type; + extent = btrfs_item_ptr(src, start_slot + i, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(src, extent) < trans->transid) + continue; + + found_type = btrfs_file_extent_type(src, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + u64 ds, dl, cs, cl; + ds = btrfs_file_extent_disk_bytenr(src, + extent); + /* ds == 0 is a hole */ + if (ds == 0) + continue; + + dl = btrfs_file_extent_disk_num_bytes(src, + extent); + cs = btrfs_file_extent_offset(src, extent); + cl = btrfs_file_extent_num_bytes(src, + extent); + if (btrfs_file_extent_compression(src, + extent)) { + cs = 0; + cl = dl; + } + + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums, 0); + if (ret) { + btrfs_release_path(dst_path); + kfree(ins_data); + return ret; + } + } + } + } + + btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_release_path(dst_path); + kfree(ins_data); + + /* + * we have to do this after the loop above to avoid changing the + * log tree while trying to change the log tree. + */ + ret = 0; + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); + } + + if (!has_extents) + return ret; + + if (need_find_last_extent && *last_extent == first_key.offset) { + /* + * We don't have any leafs between our current one and the one + * we processed before that can have file extent items for our + * inode (and have a generation number smaller than our current + * transaction id). + */ + need_find_last_extent = false; + } + + /* + * Because we use btrfs_search_forward we could skip leaves that were + * not modified and then assume *last_extent is valid when it really + * isn't. So back up to the previous leaf and read the end of the last + * extent before we go and fill in holes. + */ + if (need_find_last_extent) { + u64 len; + + ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + if (ret) + goto fill_holes; + if (src_path->slots[0]) + src_path->slots[0]--; + src = src_path->nodes[0]; + btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + goto fill_holes; + extent = btrfs_item_ptr(src, src_path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, + src_path->slots[0], + extent); + *last_extent = ALIGN(key.offset + len, + log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + *last_extent = key.offset + len; + } + } +fill_holes: + /* So we did prev_leaf, now we need to move to the next leaf, but a few + * things could have happened + * + * 1) A merge could have happened, so we could currently be on a leaf + * that holds what we were copying in the first place. + * 2) A split could have happened, and now not all of the items we want + * are on the same leaf. + * + * So we need to adjust how we search for holes, we need to drop the + * path and re-search for the first extent key we found, and then walk + * forward until we hit the last one we copied. + */ + if (need_find_last_extent) { + /* btrfs_prev_leaf could return 1 without releasing the path */ + btrfs_release_path(src_path); + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, + src_path, 0, 0); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = src_path->slots[0]; + } else { + i = start_slot; + } + + /* + * Ok so here we need to go through and fill in any holes we may have + * to make sure that holes are punched for those areas in case they had + * extents previously. + */ + while (!done) { + u64 offset, len; + u64 extent_end; + + if (i >= btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = 0; + } + + btrfs_item_key_to_cpu(src, &key, i); + if (!btrfs_comp_cpu_keys(&key, &last_key)) + done = true; + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + i++; + continue; + } + extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, i, extent); + extent_end = ALIGN(key.offset + len, log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + extent_end = key.offset + len; + } + i++; + + if (*last_extent == key.offset) { + *last_extent = extent_end; + continue; + } + offset = *last_extent; + len = key.offset - *last_extent; + ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), + offset, 0, 0, len, 0, len, 0, + 0, 0); + if (ret) + break; + *last_extent = extent_end; + } + /* + * Need to let the callers know we dropped the path so they should + * re-search. + */ + if (!ret && need_find_last_extent) + ret = 1; + return ret; +} + +static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct extent_map *em1, *em2; + + em1 = list_entry(a, struct extent_map, list); + em2 = list_entry(b, struct extent_map, list); + + if (em1->start < em2->start) + return -1; + else if (em1->start > em2->start) + return 1; + return 0; +} + +static int wait_ordered_extents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_root *root, + const struct extent_map *em, + const struct list_head *logged_list, + bool *ordered_io_error) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_root *log = root->log_root; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; + const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + u64 csum_offset; + u64 csum_len; + LIST_HEAD(ordered_sums); + int ret = 0; + + *ordered_io_error = false; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + em->block_start == EXTENT_MAP_HOLE) + return 0; + + /* + * Wait far any ordered extent that covers our extent map. If it + * finishes without an error, first check and see if our csums are on + * our outstanding ordered extents. + */ + list_for_each_entry(ordered, logged_list, log_list) { + struct btrfs_ordered_sum *sum; + + if (!mod_len) + break; + + if (ordered->file_offset + ordered->len <= mod_start || + mod_start + mod_len <= ordered->file_offset) + continue; + + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + const u64 start = ordered->file_offset; + const u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(ordered->inode != inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } + + wait_event(ordered->wait, + (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || + test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); + + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { + /* + * Clear the AS_EIO/AS_ENOSPC flags from the inode's + * i_mapping flags, so that the next fsync won't get + * an outdated io error too. + */ + btrfs_inode_check_errors(inode); + *ordered_io_error = true; + break; + } + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this + * ordered extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered->file_offset + ordered->len >= + mod_start + mod_len) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered->file_offset + ordered->len < + mod_start + mod_len) { + mod_len = (mod_start + mod_len) - + (ordered->file_offset + ordered->len); + mod_start = ordered->file_offset + + ordered->len; + } else { + mod_len = 0; + } + } + + if (skip_csum) + continue; + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, + &ordered->flags)) + continue; + + if (ordered->csum_bytes_left) { + btrfs_start_ordered_extent(inode, ordered, 0); + wait_event(ordered->wait, + ordered->csum_bytes_left == 0); + } + + list_for_each_entry(sum, &ordered->list, list) { + ret = btrfs_csum_file_blocks(trans, log, sum); + if (ret) + break; + } + } + + if (*ordered_io_error || !mod_len || ret || skip_csum) + return ret; + + if (em->compress_type) { + csum_offset = 0; + csum_len = max(em->block_len, em->orig_block_len); + } else { + csum_offset = mod_start - em->start; + csum_len = mod_len; + } + + /* block start is already adjusted for the file extent offset. */ + ret = btrfs_lookup_csums_range(log->fs_info->csum_root, + em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0); + if (ret) + return ret; + + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); + } + + return ret; +} + +static int log_one_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_root *root, + const struct extent_map *em, + struct btrfs_path *path, + const struct list_head *logged_list, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_map_token token; + struct btrfs_key key; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; + int ret; + int extent_inserted = 0; + bool ordered_io_err = false; + + ret = wait_ordered_extents(trans, inode, root, em, logged_list, + &ordered_io_err); + if (ret) + return ret; + + if (ordered_io_err) { + ctx->io_err = -EIO; + return 0; + } + + btrfs_init_map_token(&token); + + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); + if (ret) + return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, + &token); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC, + &token); + else + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + + return ret; +} + +static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct list_head *logged_list, + struct btrfs_log_ctx *ctx) +{ + struct extent_map *em, *n; + struct list_head extents; + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + u64 test_gen; + int ret = 0; + int num = 0; + + INIT_LIST_HEAD(&extents); + + write_lock(&tree->lock); + test_gen = root->fs_info->last_trans_committed; + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + list_del_init(&em->list); + + /* + * Just an arbitrary number, this can be really CPU intensive + * once we start getting a lot of extents, and really once we + * have a bunch of extents we just want to commit since it will + * be faster. + */ + if (++num > 32768) { + list_del_init(&tree->modified_extents); + ret = -EFBIG; + goto process; + } + + if (em->generation <= test_gen) + continue; + /* Need a ref to keep it from getting evicted from cache */ + atomic_inc(&em->refs); + set_bit(EXTENT_FLAG_LOGGING, &em->flags); + list_add_tail(&em->list, &extents); + num++; + } + + list_sort(NULL, &extents, extent_cmp); + +process: + while (!list_empty(&extents)) { + em = list_entry(extents.next, struct extent_map, list); + + list_del_init(&em->list); + + /* + * If we had an error we just need to delete everybody from our + * private list. + */ + if (ret) { + clear_em_logging(tree, em); + free_extent_map(em); + continue; + } + + write_unlock(&tree->lock); + + ret = log_one_extent(trans, inode, root, em, path, logged_list, + ctx); + write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); + } + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); + + btrfs_release_path(path); + return ret; +} + +static int logged_inode_size(struct btrfs_root *log, struct inode *inode, + struct btrfs_path *path, u64 *size_ret) +{ + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *size_ret = 0; + } else { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + } + + btrfs_release_path(path); + return 0; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. + */ +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src = NULL; + LIST_HEAD(logged_list); + u64 last_extent = 0; + int err = 0; + int ret; + int nritems; + int ins_start_slot = 0; + int ins_nr; + bool fast_search = false; + u64 ino = btrfs_ino(inode); + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 logged_isize = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } + + min_key.objectid = ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = ino; + + + /* today the code can only do partial logging of directories */ + if (S_ISDIR(inode->i_mode) || + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags) && + inode_only == LOG_INODE_EXISTS)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + /* + * Only run delayed items if we are a dir or a new file. + * Otherwise commit the delayed inode only, which is needed in + * order for the log replay code to mark inodes for link count + * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + */ + if (S_ISDIR(inode->i_mode) || + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) + ret = btrfs_commit_inode_delayed_items(trans, inode); + else + ret = btrfs_commit_inode_delayed_inode(inode); + + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; + } + + mutex_lock(&BTRFS_I(inode)->log_mutex); + + btrfs_get_logged_extents(inode, &logged_list, start, end); + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. + */ + if (S_ISDIR(inode->i_mode)) { + int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, max_key_type); + } else { + if (inode_only == LOG_INODE_EXISTS) { + /* + * Make sure the new inode item we write to the log has + * the same isize as the current one (if it exists). + * This is necessary to prevent data loss after log + * replay, and also to prevent doing a wrong expanding + * truncate - for e.g. create file, write 4K into offset + * 0, fsync, write 4K into offset 4096, add hard link, + * fsync some other file (to sync log), power fail - if + * we use the inode's current i_size, after log replay + * we get a 8Kb file, with the last 4Kb extent as a hole + * (zeroes), as if an expanding truncate happened, + * instead of getting a file of 4Kb only. + */ + err = logged_inode_size(log, inode, path, + &logged_isize); + if (err) + goto out_unlock; + } + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + if (inode_only == LOG_INODE_EXISTS) { + max_key.type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, + max_key.type); + } else { + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + while(1) { + ret = btrfs_truncate_inode_items(trans, + log, inode, 0, 0); + if (ret != -EAGAIN) + break; + } + } + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || + inode_only == LOG_INODE_EXISTS) { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + max_key.type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, + max_key.type); + } else { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + ret = log_inode_item(trans, log, dst_path, inode); + if (ret) { + err = ret; + goto out_unlock; + } + goto log_extents; + } + + } + if (ret) { + err = ret; + goto out_unlock; + } + + while (1) { + ins_nr = 0; + ret = btrfs_search_forward(root, &min_key, + path, trans->transid); + if (ret != 0) + break; +again: + /* note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key.objectid != ino) + break; + if (min_key.type > max_key.type) + break; + + src = path->nodes[0]; + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + if (ret) { + ins_nr = 0; + btrfs_release_path(path); + continue; + } + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ret = 0; + ins_nr = 0; + } + btrfs_release_path(path); + + if (min_key.offset < (u64)-1) { + min_key.offset++; + } else if (min_key.type < max_key.type) { + min_key.type++; + min_key.offset = 0; + } else { + break; + } + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ret = 0; + ins_nr = 0; + } + +log_extents: + btrfs_release_path(path); + btrfs_release_path(dst_path); + if (fast_search) { + /* + * Some ordered extents started by fsync might have completed + * before we collected the ordered extents in logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. + */ + err = btrfs_inode_check_errors(inode); + if (err) { + ctx->io_err = err; + goto out_unlock; + } + ret = btrfs_log_changed_extents(trans, root, inode, dst_path, + &logged_list, ctx); + if (ret) { + err = ret; + goto out_unlock; + } + } else if (inode_only == LOG_INODE_ALL) { + struct extent_map *em, *n; + + write_lock(&em_tree->lock); + /* + * We can't just remove every em if we're called for a ranged + * fsync - that is, one that doesn't cover the whole possible + * file range (0 to LLONG_MAX). This is because we can have + * em's that fall outside the range we're logging and therefore + * their ordered operations haven't completed yet + * (btrfs_finish_ordered_io() not invoked yet). This means we + * didn't get their respective file extent item in the fs/subvol + * tree yet, and need to let the next fast fsync (one which + * consults the list of modified extent maps) find the em so + * that it logs a matching file extent item and waits for the + * respective ordered operation to complete (if it's still + * running). + * + * Removing every em outside the range we're logging would make + * the next fast fsync not log their matching file extent items, + * therefore making us lose data after a log replay. + */ + list_for_each_entry_safe(em, n, &em_tree->modified_extents, + list) { + const u64 mod_end = em->mod_start + em->mod_len - 1; + + if (em->mod_start >= start && mod_end <= end) + list_del_init(&em->list); + } + write_unlock(&em_tree->lock); + } + + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + ret = log_directory_changes(trans, root, inode, path, dst_path, + ctx); + if (ret) { + err = ret; + goto out_unlock; + } + } + + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; + spin_unlock(&BTRFS_I(inode)->lock); +out_unlock: + if (unlikely(err)) + btrfs_put_logged_extents(&logged_list); + else + btrfs_submit_logged_extents(&logged_list, log); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + btrfs_free_path(path); + btrfs_free_path(dst_path); + return err; +} + +/* + * follow the dentry parent pointers up the chain and see if any + * of the directories in it require a full commit before they can + * be logged. Returns zero if nothing special needs to be done or 1 if + * a full commit is required. + */ +static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + struct inode *inode, + struct dentry *parent, + struct super_block *sb, + u64 last_committed) +{ + int ret = 0; + struct btrfs_root *root; + struct dentry *old_parent = NULL; + struct inode *orig_inode = inode; + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto out; + + if (!S_ISDIR(inode->i_mode)) { + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + goto out; + inode = d_inode(parent); + } + + while (1) { + /* + * If we are logging a directory then we start with our inode, + * not our parents inode, so we need to skipp setting the + * logged_trans so that further down in the log code we don't + * think this inode has already been logged. + */ + if (inode != orig_inode) + BTRFS_I(inode)->logged_trans = trans->transid; + smp_mb(); + + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + root = BTRFS_I(inode)->root; + + /* + * make sure any commits to the log are forced + * to be full commits + */ + btrfs_set_log_full_commit(root->fs_info, trans); + ret = 1; + break; + } + + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + break; + + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + inode = d_inode(parent); + + } + dput(old_parent); +out: + return ret; +} + +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. See log_dir_items() for + * details about the why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not lock their i_mutex, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not locking i_mutex of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and + * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names. This does not result in a problem because if a dir_item key is + * logged but its matching dir_index key is not logged, at log replay time we + * don't use it to replay the respective name (see replay_one_name()). On the + * other hand if only the dir_index key ends up being logged, the respective + * name is added to the fs/subvol tree with both the dir_item and dir_index + * keys created (see replay_one_name()). + * The directory's inode item with a wrong i_size is not a problem as well, + * since we don't use it at log replay time to set the i_size in the inode + * item of the fs/subvol tree (see overwrite_item()). + */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + btrfs_free_path(path); + return -ENOMEM; + } + dir_elem->ino = btrfs_ino(start_inode); + list_add_tail(&dir_elem->list, &dir_list); + + while (!list_empty(&dir_list)) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + int nritems; + int i; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, + list); + if (ret) + goto next_dir_inode; + + min_key.objectid = dir_elem->ino; + min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(log, &min_key, path, trans->transid); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + +process_leaf: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + struct btrfs_dir_list *new_dir_elem; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != dir_elem->ino || + min_key.type != BTRFS_DIR_ITEM_KEY) + goto next_dir_inode; + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_type(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid && + type != BTRFS_FT_DIR) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(root->fs_info->sb, &di_key, + root, NULL); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto next_dir_inode; + } + + if (btrfs_inode_in_log(di_inode, trans->transid)) { + iput(di_inode); + continue; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + btrfs_release_path(path); + ret = btrfs_log_inode(trans, root, di_inode, + log_mode, 0, LLONG_MAX, ctx); + iput(di_inode); + if (ret) + goto next_dir_inode; + if (ctx->log_new_dentries) { + new_dir_elem = kmalloc(sizeof(*new_dir_elem), + GFP_NOFS); + if (!new_dir_elem) { + ret = -ENOMEM; + goto next_dir_inode; + } + new_dir_elem->ino = di_key.objectid; + list_add_tail(&new_dir_elem->list, &dir_list); + } + break; + } + if (i == nritems) { + ret = btrfs_next_leaf(log, path); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + goto process_leaf; + } + if (min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } +next_dir_inode: + list_del(&dir_elem->list); + kfree(dir_elem); + } + + btrfs_free_path(path); + return ret; +} + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, + const loff_t start, + const loff_t end, + int exists_only, + struct btrfs_log_ctx *ctx) +{ + int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; + struct super_block *sb; + struct dentry *old_parent = NULL; + int ret = 0; + u64 last_committed = root->fs_info->last_trans_committed; + const struct dentry * const first_parent = parent; + const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > + last_committed); + bool log_dentries = false; + struct inode *orig_inode = inode; + + sb = inode->i_sb; + + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + + /* + * The prev transaction commit doesn't complete, we need do + * full commit by ourselves. + */ + if (root->fs_info->last_trans_log_full_commit > + root->fs_info->last_trans_committed) { + ret = 1; + goto end_no_trans; + } + + if (root != BTRFS_I(inode)->root || + btrfs_root_refs(&root->root_item) == 0) { + ret = 1; + goto end_no_trans; + } + + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; + + if (btrfs_inode_in_log(inode, trans->transid)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + + ret = start_log_trans(trans, root, ctx); + if (ret) + goto end_no_trans; + + ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); + if (ret) + goto end_trans; + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) { + ret = 0; + goto end_trans; + } + + if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) + log_dentries = true; + + while (1) { + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + break; + + inode = d_inode(parent); + if (root != BTRFS_I(inode)->root) + break; + + /* + * On unlink we must make sure our immediate parent directory + * inode is fully logged. This is to prevent leaving dangling + * directory index entries and a wrong directory inode's i_size. + * Not doing so can result in a directory being impossible to + * delete after log replay (rmdir will always fail with error + * -ENOTEMPTY). + */ + if (did_unlink && parent == first_parent) + inode_only = LOG_INODE_ALL; + else + inode_only = LOG_INODE_EXISTS; + + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed || + inode_only == LOG_INODE_ALL) { + ret = btrfs_log_inode(trans, root, inode, inode_only, + 0, LLONG_MAX, ctx); + if (ret) + goto end_trans; + } + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + } + if (log_dentries) + ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + else + ret = 0; +end_trans: + dput(old_parent); + if (ret < 0) { + btrfs_set_log_full_commit(root->fs_info, trans); + ret = 1; + } + + if (ret) + btrfs_remove_log_ctx(root, ctx); + btrfs_end_log_trans(root); +end_no_trans: + return ret; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, + struct btrfs_log_ctx *ctx) +{ + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, + start, end, 0, ctx); + dput(parent); + + return ret; +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + fs_info->log_root_recovering = 1; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto error; + } + + wc.trans = trans; + wc.pin = 1; + + ret = walk_log_tree(trans, log_root_tree, &wc); + if (ret) { + btrfs_error(fs_info, ret, "Failed to pin buffers while " + "recovering log root tree."); + goto error; + } + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + + if (ret < 0) { + btrfs_error(fs_info, ret, + "Couldn't find tree log root."); + goto error; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root(log_root_tree, &found_key); + if (IS_ERR(log)) { + ret = PTR_ERR(log); + btrfs_error(fs_info, ret, + "Couldn't read tree log root."); + goto error; + } + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + if (IS_ERR(wc.replay_dest)) { + ret = PTR_ERR(wc.replay_dest); + free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); + kfree(log); + btrfs_error(fs_info, ret, "Couldn't read target root " + "for tree log recovery."); + goto error; + } + + wc.replay_dest->log_root = log; + btrfs_record_root_in_trans(trans, wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; + free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); + kfree(log); + + if (ret) + goto error; + + if (found_key.offset == 0) + break; + } + btrfs_release_path(path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + /* step 4: commit the transaction, which also unpins the blocks */ + ret = btrfs_commit_transaction(trans, fs_info->tree_root); + if (ret) + return ret; + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + kfree(log_root_tree); + + return 0; +error: + if (wc.trans) + btrfs_end_transaction(wc.trans, fs_info->tree_root); + btrfs_free_path(path); + return ret; +} + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files there were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename) +{ + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this directory was already logged any new + * names for this file/dir will get recorded + */ + smp_mb(); + if (BTRFS_I(dir)->logged_trans == trans->transid) + return; + + /* + * if the inode we're about to unlink was logged, + * the log will be properly updated for any new names + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) + return; + + /* + * when renaming files across directories, if the directory + * there we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + if (for_rename) + goto record; + + /* we can safely do the unlink without any special recording */ + return; + +record: + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. + * + * It will return zero if all goes well, and it will return 1 if a + * full transaction commit is required. + */ +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent) +{ + struct btrfs_root * root = BTRFS_I(inode)->root; + + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + if (BTRFS_I(inode)->logged_trans <= + root->fs_info->last_trans_committed && + (!old_dir || BTRFS_I(old_dir)->logged_trans <= + root->fs_info->last_trans_committed)) + return 0; + + return btrfs_log_inode_parent(trans, root, inode, parent, 0, + LLONG_MAX, 1, NULL); +} + diff --git a/kernel/fs/btrfs/tree-log.h b/kernel/fs/btrfs/tree-log.h new file mode 100644 index 000000000..6916a781e --- /dev/null +++ b/kernel/fs/btrfs/tree-log.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +#include "ctree.h" +#include "transaction.h" + +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + +struct btrfs_log_ctx { + int log_ret; + int log_transid; + int io_err; + bool log_new_dentries; + struct list_head list; +}; + +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + ctx->io_err = 0; + ctx->log_new_dentries = false; + INIT_LIST_HEAD(&ctx->list); +} + +static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid; +} + +static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + return ACCESS_ONCE(fs_info->last_trans_log_full_commit) == + trans->transid; +} + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_log_ctx *ctx); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, + struct btrfs_log_ctx *ctx); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +void btrfs_end_log_trans(struct btrfs_root *root); +int btrfs_pin_log_trans(struct btrfs_root *root); +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename); +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent); +#endif diff --git a/kernel/fs/btrfs/ulist.c b/kernel/fs/btrfs/ulist.c new file mode 100644 index 000000000..840a38b27 --- /dev/null +++ b/kernel/fs/btrfs/ulist.c @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen + * Distributed under the GNU GPL license version 2. + */ + +#include +#include "ulist.h" +#include "ctree.h" + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * A sample usage for ulists is the enumeration of directed graphs without + * visiting a node twice. The pseudo-code could look like this: + * + * ulist = ulist_alloc(); + * ulist_add(ulist, root); + * ULIST_ITER_INIT(&uiter); + * + * while ((elem = ulist_next(ulist, &uiter)) { + * for (all child nodes n in elem) + * ulist_add(ulist, n); + * do something useful with the node; + * } + * ulist_free(ulist); + * + * This assumes the graph nodes are adressable by u64. This stems from the + * usage for tree enumeration in btrfs, where the logical addresses are + * 64 bit. + * + * It is also useful for tree enumeration which could be done elegantly + * recursively, but is not possible due to kernel stack limitations. The + * loop would be similar to the above. + */ + +/** + * ulist_init - freshly initialize a ulist + * @ulist: the ulist to initialize + * + * Note: don't use this function to init an already used ulist, use + * ulist_reinit instead. + */ +void ulist_init(struct ulist *ulist) +{ + INIT_LIST_HEAD(&ulist->nodes); + ulist->root = RB_ROOT; + ulist->nnodes = 0; +} + +/** + * ulist_fini - free up additionally allocated memory for the ulist + * @ulist: the ulist from which to free the additional memory + * + * This is useful in cases where the base 'struct ulist' has been statically + * allocated. + */ +static void ulist_fini(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_node *next; + + list_for_each_entry_safe(node, next, &ulist->nodes, list) { + kfree(node); + } + ulist->root = RB_ROOT; + INIT_LIST_HEAD(&ulist->nodes); +} + +/** + * ulist_reinit - prepare a ulist for reuse + * @ulist: ulist to be reused + * + * Free up all additional memory allocated for the list elements and reinit + * the ulist. + */ +void ulist_reinit(struct ulist *ulist) +{ + ulist_fini(ulist); + ulist_init(ulist); +} + +/** + * ulist_alloc - dynamically allocate a ulist + * @gfp_mask: allocation flags to for base allocation + * + * The allocated ulist will be returned in an initialized state. + */ +struct ulist *ulist_alloc(gfp_t gfp_mask) +{ + struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); + + if (!ulist) + return NULL; + + ulist_init(ulist); + + return ulist; +} + +/** + * ulist_free - free dynamically allocated ulist + * @ulist: ulist to free + * + * It is not necessary to call ulist_fini before. + */ +void ulist_free(struct ulist *ulist) +{ + if (!ulist) + return; + ulist_fini(ulist); + kfree(ulist); +} + +static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) +{ + struct rb_node *n = ulist->root.rb_node; + struct ulist_node *u = NULL; + + while (n) { + u = rb_entry(n, struct ulist_node, rb_node); + if (u->val < val) + n = n->rb_right; + else if (u->val > val) + n = n->rb_left; + else + return u; + } + return NULL; +} + +static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) +{ + struct rb_node **p = &ulist->root.rb_node; + struct rb_node *parent = NULL; + struct ulist_node *cur = NULL; + + while (*p) { + parent = *p; + cur = rb_entry(parent, struct ulist_node, rb_node); + + if (cur->val < ins->val) + p = &(*p)->rb_right; + else if (cur->val > ins->val) + p = &(*p)->rb_left; + else + return -EEXIST; + } + rb_link_node(&ins->rb_node, parent, p); + rb_insert_color(&ins->rb_node, &ulist->root); + return 0; +} + +/** + * ulist_add - add an element to the ulist + * @ulist: ulist to add the element to + * @val: value to add to ulist + * @aux: auxiliary value to store along with val + * @gfp_mask: flags to use for allocation + * + * Note: locking must be provided by the caller. In case of rwlocks write + * locking is needed + * + * Add an element to a ulist. The @val will only be added if it doesn't + * already exist. If it is added, the auxiliary value @aux is stored along with + * it. In case @val already exists in the ulist, @aux is ignored, even if + * it differs from the already stored value. + * + * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been + * inserted. + * In case of allocation failure -ENOMEM is returned and the ulist stays + * unaltered. + */ +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) +{ + return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); +} + +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask) +{ + int ret; + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + if (node) { + if (old_aux) + *old_aux = node->aux; + return 0; + } + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; + + node->val = val; + node->aux = aux; +#ifdef CONFIG_BTRFS_DEBUG + node->seqnum = ulist->nnodes; +#endif + + ret = ulist_rbtree_insert(ulist, node); + ASSERT(!ret); + list_add_tail(&node->list, &ulist->nodes); + ulist->nnodes++; + + return 1; +} + +/** + * ulist_next - iterate ulist + * @ulist: ulist to iterate + * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) + * + * Note: locking must be provided by the caller. In case of rwlocks only read + * locking is needed + * + * This function is used to iterate an ulist. + * It returns the next element from the ulist or %NULL when the + * end is reached. No guarantee is made with respect to the order in which + * the elements are returned. They might neither be returned in order of + * addition nor in ascending order. + * It is allowed to call ulist_add during an enumeration. Newly added items + * are guaranteed to show up in the running enumeration. + */ +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) +{ + struct ulist_node *node; + + if (list_empty(&ulist->nodes)) + return NULL; + if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes) + return NULL; + if (uiter->cur_list) { + uiter->cur_list = uiter->cur_list->next; + } else { + uiter->cur_list = ulist->nodes.next; +#ifdef CONFIG_BTRFS_DEBUG + uiter->i = 0; +#endif + } + node = list_entry(uiter->cur_list, struct ulist_node, list); +#ifdef CONFIG_BTRFS_DEBUG + ASSERT(node->seqnum == uiter->i); + ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); + uiter->i++; +#endif + return node; +} diff --git a/kernel/fs/btrfs/ulist.h b/kernel/fs/btrfs/ulist.h new file mode 100644 index 000000000..4c29db604 --- /dev/null +++ b/kernel/fs/btrfs/ulist.h @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen + * Distributed under the GNU GPL license version 2. + * + */ + +#ifndef __ULIST__ +#define __ULIST__ + +#include +#include + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + */ +struct ulist_iterator { +#ifdef CONFIG_BTRFS_DEBUG + int i; +#endif + struct list_head *cur_list; /* hint to start search */ +}; + +/* + * element of the list + */ +struct ulist_node { + u64 val; /* value to store */ + u64 aux; /* auxiliary value saved along with the val */ + +#ifdef CONFIG_BTRFS_DEBUG + int seqnum; /* sequence number this node is added */ +#endif + + struct list_head list; /* used to link node */ + struct rb_node rb_node; /* used to speed up search */ +}; + +struct ulist { + /* + * number of elements stored in list + */ + unsigned long nnodes; + + struct list_head nodes; + struct rb_root root; +}; + +void ulist_init(struct ulist *ulist); +void ulist_reinit(struct ulist *ulist); +struct ulist *ulist_alloc(gfp_t gfp_mask); +void ulist_free(struct ulist *ulist); +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask); + +/* just like ulist_add_merge() but take a pointer for the aux data */ +static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, + void **old_aux, gfp_t gfp_mask) +{ +#if BITS_PER_LONG == 32 + u64 old64 = (uintptr_t)*old_aux; + int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask); + *old_aux = (void *)((uintptr_t)old64); + return ret; +#else + return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask); +#endif +} + +struct ulist_node *ulist_next(struct ulist *ulist, + struct ulist_iterator *uiter); + +#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) + +#endif diff --git a/kernel/fs/btrfs/uuid-tree.c b/kernel/fs/btrfs/uuid-tree.c new file mode 100644 index 000000000..778282944 --- /dev/null +++ b/kernel/fs/btrfs/uuid-tree.c @@ -0,0 +1,354 @@ +/* + * Copyright (C) STRATO AG 2013. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + + +static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) +{ + key->type = type; + key->objectid = get_unaligned_le64(uuid); + key->offset = get_unaligned_le64(uuid + sizeof(u64)); +} + +/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */ +static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, + u8 type, u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + u32 item_size; + unsigned long offset; + struct btrfs_key key; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -ENOENT; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + offset = btrfs_item_ptr_offset(eb, slot); + ret = -ENOENT; + + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); + goto out; + } + while (item_size) { + __le64 data; + + read_extent_buffer(eb, &data, offset, sizeof(data)); + if (le64_to_cpu(data) == subid) { + ret = 0; + break; + } + offset += sizeof(data); + item_size -= sizeof(data); + } + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid_cpu) +{ + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + __le64 subid_le; + + ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu); + if (ret != -ENOENT) + return ret; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, + sizeof(subid_le)); + if (ret >= 0) { + /* Add an item for the type for the first time */ + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + } else if (ret == -EEXIST) { + /* + * An item with that type already exists. + * Extend the item and store the new subid at the end. + */ + btrfs_extend_item(uuid_root, path, sizeof(subid_le)); + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); + } else if (ret < 0) { + btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d " + "(0x%016llx, 0x%016llx) type %u!", + ret, (unsigned long long)key.objectid, + (unsigned long long)key.offset, type); + goto out; + } + + ret = 0; + subid_le = cpu_to_le64(subid_cpu); + write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + u32 item_size; + unsigned long move_dst; + unsigned long move_src; + unsigned long move_len; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); + if (ret < 0) { + btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!", + ret); + goto out; + } + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size_nr(eb, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); + ret = -ENOENT; + goto out; + } + while (item_size) { + __le64 read_subid; + + read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid)); + if (le64_to_cpu(read_subid) == subid) + break; + offset += sizeof(read_subid); + item_size -= sizeof(read_subid); + } + + if (!item_size) { + ret = -ENOENT; + goto out; + } + + item_size = btrfs_item_size_nr(eb, slot); + if (item_size == sizeof(subid)) { + ret = btrfs_del_item(trans, uuid_root, path); + goto out; + } + + move_dst = offset; + move_src = offset + sizeof(subid); + move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); + memmove_extent_buffer(eb, move_dst, move_src, move_len); + btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1); + +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* 1 - for the uuid item */ + trans = btrfs_start_transaction(uuid_root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid); + btrfs_end_transaction(trans, uuid_root); + +out: + return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, + int (*check_func)(struct btrfs_fs_info *, u8 *, u8, + u64)) +{ + struct btrfs_root *root = fs_info->uuid_root; + struct btrfs_key key; + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + int slot; + u32 item_size; + unsigned long offset; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = 0; + key.offset = 0; + +again_search_slot: + ret = btrfs_search_forward(root, &key, path, 0); + if (ret) { + if (ret > 0) + ret = 0; + goto out; + } + + while (1) { + cond_resched(); + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_UUID_KEY_SUBVOL && + key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto skip; + + offset = btrfs_item_ptr_offset(leaf, slot); + item_size = btrfs_item_size_nr(leaf, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); + goto skip; + } + while (item_size) { + u8 uuid[BTRFS_UUID_SIZE]; + __le64 subid_le; + u64 subid_cpu; + + put_unaligned_le64(key.objectid, uuid); + put_unaligned_le64(key.offset, uuid + sizeof(u64)); + read_extent_buffer(leaf, &subid_le, offset, + sizeof(subid_le)); + subid_cpu = le64_to_cpu(subid_le); + ret = check_func(fs_info, uuid, key.type, subid_cpu); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_uuid_iter_rem(root, uuid, key.type, + subid_cpu); + if (ret == 0) { + /* + * this might look inefficient, but the + * justification is that it is an + * exception that check_func returns 1, + * and that in the regular case only one + * entry per UUID exists. + */ + goto again_search_slot; + } + if (ret < 0 && ret != -ENOENT) + goto out; + } + item_size -= sizeof(subid_le); + offset += sizeof(subid_le); + } + +skip: + ret = btrfs_next_item(root, path); + if (ret == 0) + continue; + else if (ret > 0) + ret = 0; + break; + } + +out: + btrfs_free_path(path); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); + return 0; +} diff --git a/kernel/fs/btrfs/volumes.c b/kernel/fs/btrfs/volumes.c new file mode 100644 index 000000000..174f5e1e0 --- /dev/null +++ b/kernel/fs/btrfs/volumes.c @@ -0,0 +1,6730 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "math.h" +#include "dev-replace.h" +#include "sysfs.h" + +static int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); +static void __btrfs_reset_dev_stats(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); + +DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +static struct btrfs_fs_devices *__alloc_fs_devices(void) +{ + struct btrfs_fs_devices *fs_devs; + + fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); + if (!fs_devs) + return ERR_PTR(-ENOMEM); + + mutex_init(&fs_devs->device_list_mutex); + + INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->resized_devices); + INIT_LIST_HEAD(&fs_devs->alloc_list); + INIT_LIST_HEAD(&fs_devs->list); + + return fs_devs; +} + +/** + * alloc_fs_devices - allocate struct btrfs_fs_devices + * @fsid: a pointer to UUID for this FS. If NULL a new UUID is + * generated. + * + * Return: a pointer to a new &struct btrfs_fs_devices on success; + * ERR_PTR() on error. Returned struct is not linked onto any lists and + * can be destroyed with kfree() right away. + */ +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +{ + struct btrfs_fs_devices *fs_devs; + + fs_devs = __alloc_fs_devices(); + if (IS_ERR(fs_devs)) + return fs_devs; + + if (fsid) + memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + else + generate_random_uuid(fs_devs->fsid); + + return fs_devs; +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + rcu_string_free(device->name); + kfree(device); + } + kfree(fs_devices); +} + +static void btrfs_kobject_uevent(struct block_device *bdev, + enum kobject_action action) +{ + int ret; + + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + action, + kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} + +void btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, list); + list_del(&fs_devices->list); + free_fs_devices(fs_devices); + } +} + +static struct btrfs_device *__alloc_device(void) +{ + struct btrfs_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_NOFS); + if (!dev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dev->dev_list); + INIT_LIST_HEAD(&dev->dev_alloc_list); + INIT_LIST_HEAD(&dev->resized_list); + + spin_lock_init(&dev->io_lock); + + spin_lock_init(&dev->reada_lock); + atomic_set(&dev->reada_in_flight, 0); + atomic_set(&dev->dev_stats_ccnt, 0); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); + + return dev; +} + +static noinline struct btrfs_device *__find_device(struct list_head *head, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, head, dev_list) { + if (dev->devid == devid && + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { + return dev; + } + } + return NULL; +} + +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, list) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +static int +btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, + int flush, struct block_device **bdev, + struct buffer_head **bh) +{ + int ret; + + *bdev = blkdev_get_by_path(device_path, flags, holder); + + if (IS_ERR(*bdev)) { + ret = PTR_ERR(*bdev); + printk(KERN_INFO "BTRFS: open %s failed\n", device_path); + goto error; + } + + if (flush) + filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + ret = set_blocksize(*bdev, 4096); + if (ret) { + blkdev_put(*bdev, flags); + goto error; + } + invalidate_bdev(*bdev); + *bh = btrfs_read_dev_super(*bdev); + if (!*bh) { + ret = -EINVAL; + blkdev_put(*bdev, flags); + goto error; + } + + return 0; + +error: + *bdev = NULL; + *bh = NULL; + return ret; +} + +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the schedulers ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. + */ +static noinline void run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run; + unsigned long batch_run = 0; + unsigned long limit; + unsigned long last_waited = 0; + int force_reg = 0; + int sync_pending = 0; + struct blk_plug plug; + + /* + * this function runs all the bios we've collected for + * a particular device. We don't want to wander off to + * another device without first sending all of these down. + * So, setup a plug here and finish it off before we return + */ + blk_start_plug(&plug); + + bdi = blk_get_backing_dev_info(device->bdev); + fs_info = device->dev_root->fs_info; + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + +loop: + spin_lock(&device->io_lock); + +loop_lock: + num_run = 0; + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + if (!force_reg && device->pending_sync_bios.head) { + pending_bios = &device->pending_sync_bios; + force_reg = 1; + } else { + pending_bios = &device->pending_bios; + force_reg = 0; + } + + pending = pending_bios->head; + tail = pending_bios->tail; + WARN_ON(pending && !tail); + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code. + */ + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { + again = 0; + device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; + } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + + spin_unlock(&device->io_lock); + + while (pending) { + + rmb(); + /* we want to work on both lists, but do more bios on the + * sync list than the regular list + */ + if ((num_run > 32 && + pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head) || + (num_run > 64 && pending_bios == &device->pending_sync_bios && + device->pending_bios.head)) { + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + + if (atomic_dec_return(&fs_info->nr_async_bios) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + BUG_ON(atomic_read(&cur->bi_cnt) == 0); + + /* + * if we're doing the sync list, record that our + * plug has some sync requests on it + * + * If we're doing the regular list and there are + * sync requests sitting around, unplug before + * we add more + */ + if (pending_bios == &device->pending_sync_bios) { + sync_pending = 1; + } else if (sync_pending) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } + + btrfsic_submit_bio(cur->bi_rw, cur); + num_run++; + batch_run++; + + cond_resched(); + + /* + * we made progress, there is more work to do and the bdi + * is now congested. Back off and let other work structs + * run instead + */ + if (pending && bdi_write_congested(bdi) && batch_run > 8 && + fs_info->fs_devices->open_devices > 1) { + struct io_context *ioc; + + ioc = current->io_context; + + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + cond_resched(); + continue; + } + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + device->running_pending = 1; + + spin_unlock(&device->io_lock); + btrfs_queue_work(fs_info->submit_workers, + &device->work); + goto done; + } + /* unplug every 64 requests just for good measure */ + if (batch_run % 64 == 0) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } + } + + cond_resched(); + if (again) + goto loop; + + spin_lock(&device->io_lock); + if (device->pending_bios.head || device->pending_sync_bios.head) + goto loop_lock; + spin_unlock(&device->io_lock); + +done: + blk_finish_plug(&plug); +} + +static void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + +/* + * Add new device to list of registered devices + * + * Returns: + * 1 - first time device is seen + * 0 - device already known + * < 0 - error + */ +static noinline int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + struct rcu_string *name; + int ret = 0; + u64 found_transid = btrfs_super_generation(disk_super); + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = alloc_fs_devices(disk_super->fsid); + if (IS_ERR(fs_devices)) + return PTR_ERR(fs_devices); + + list_add(&fs_devices->list, &fs_uuids); + + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid, + disk_super->dev_item.uuid); + } + + if (!device) { + if (fs_devices->opened) + return -EBUSY; + + device = btrfs_alloc_device(NULL, &devid, + disk_super->dev_item.uuid); + if (IS_ERR(device)) { + /* we can safely leave the fs_devices entry around */ + return PTR_ERR(device); + } + + name = rcu_string_strdup(path, GFP_NOFS); + if (!name) { + kfree(device); + return -ENOMEM; + } + rcu_assign_pointer(device->name, name); + + mutex_lock(&fs_devices->device_list_mutex); + list_add_rcu(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + mutex_unlock(&fs_devices->device_list_mutex); + + ret = 1; + device->fs_devices = fs_devices; + } else if (!device->name || strcmp(device->name->str, path)) { + /* + * When FS is already mounted. + * 1. If you are here and if the device->name is NULL that + * means this device was missing at time of FS mount. + * 2. If you are here and if the device->name is different + * from 'path' that means either + * a. The same device disappeared and reappeared with + * different name. or + * b. The missing-disk-which-was-replaced, has + * reappeared now. + * + * We must allow 1 and 2a above. But 2b would be a spurious + * and unintentional. + * + * Further in case of 1 and 2a above, the disk at 'path' + * would have missed some transaction when it was away and + * in case of 2a the stale bdev has to be updated as well. + * 2b must not be allowed at all time. + */ + + /* + * For now, we do allow update to btrfs_fs_device through the + * btrfs dev scan cli after FS has been mounted. We're still + * tracking a problem where systems fail mount by subvolume id + * when we reject replacement on a mounted FS. + */ + if (!fs_devices->opened && found_transid < device->generation) { + /* + * That is if the FS is _not_ mounted and if you + * are here, that means there is more than one + * disk with same uuid and devid.We keep the one + * with larger generation number or the last-in if + * generation are equal. + */ + return -EEXIST; + } + + name = rcu_string_strdup(path, GFP_NOFS); + if (!name) + return -ENOMEM; + rcu_string_free(device->name); + rcu_assign_pointer(device->name, name); + if (device->missing) { + fs_devices->missing_devices--; + device->missing = 0; + } + } + + /* + * Unmount does not free the btrfs_device struct but would zero + * generation along with most of the other members. So just update + * it back. We need it to pick the disk with largest generation + * (as above). + */ + if (!fs_devices->opened) + device->generation = found_transid; + + *fs_devices_ret = fs_devices; + + return ret; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + + fs_devices = alloc_fs_devices(orig->fsid); + if (IS_ERR(fs_devices)) + return fs_devices; + + mutex_lock(&orig->device_list_mutex); + fs_devices->total_devices = orig->total_devices; + + /* We have held the volume lock, it is safe to get the devices. */ + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + struct rcu_string *name; + + device = btrfs_alloc_device(NULL, &orig_dev->devid, + orig_dev->uuid); + if (IS_ERR(device)) + goto error; + + /* + * This is ok to do without rcu read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); + } + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + mutex_unlock(&orig->device_list_mutex); + return fs_devices; +error: + mutex_unlock(&orig->device_list_mutex); + free_fs_devices(fs_devices); + return ERR_PTR(-ENOMEM); +} + +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) +{ + struct btrfs_device *device, *next; + struct btrfs_device *latest_dev = NULL; + + mutex_lock(&uuid_mutex); +again: + /* This is the initialized path, it is safe to release the devices. */ + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->in_fs_metadata) { + if (!device->is_tgtdev_for_dev_replace && + (!latest_dev || + device->generation > latest_dev->generation)) { + latest_dev = device; + } + continue; + } + + if (device->devid == BTRFS_DEV_REPLACE_DEVID) { + /* + * In the first step, keep the device which has + * the correct fsid and the devid that is used + * for the dev_replace procedure. + * In the second step, the dev_replace state is + * read from the device tree and it is known + * whether the procedure is really active or + * not, which means whether this device is + * used or whether it should be removed. + */ + if (step == 0 || device->is_tgtdev_for_dev_replace) { + continue; + } + } + if (device->bdev) { + blkdev_put(device->bdev, device->mode); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + device->writeable = 0; + if (!device->is_tgtdev_for_dev_replace) + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + rcu_string_free(device->name); + kfree(device); + } + + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } + + fs_devices->latest_bdev = latest_dev->bdev; + + mutex_unlock(&uuid_mutex); +} + +static void __free_device(struct work_struct *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, rcu_work); + + if (device->bdev) + blkdev_put(device->bdev, device->mode); + + rcu_string_free(device->name); + kfree(device); +} + +static void free_device(struct rcu_head *head) +{ + struct btrfs_device *device; + + device = container_of(head, struct btrfs_device, rcu); + + INIT_WORK(&device->rcu_work, __free_device); + schedule_work(&device->rcu_work); +} + +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + + if (--fs_devices->opened > 0) + return 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); + } + mutex_unlock(&fs_devices->device_list_mutex); + + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = 0; + + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_fs_devices *seed_devices = NULL; + int ret; + + mutex_lock(&uuid_mutex); + ret = __btrfs_close_devices(fs_devices); + if (!fs_devices->opened) { + seed_devices = fs_devices->seed; + fs_devices->seed = NULL; + } + mutex_unlock(&uuid_mutex); + + while (seed_devices) { + fs_devices = seed_devices; + seed_devices = fs_devices->seed; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } + /* + * Wait for rcu kworkers under __btrfs_close_devices + * to finish all blkdev_puts so device is really + * free when umount is done. + */ + rcu_barrier(); + return ret; +} + +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + struct request_queue *q; + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfs_device *latest_dev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 devid; + int seeding = 1; + int ret = 0; + + flags |= FMODE_EXCL; + + list_for_each_entry(device, head, dev_list) { + if (device->bdev) + continue; + if (!device->name) + continue; + + /* Just open everything we can; ignore failures here */ + if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh)) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + if (!latest_dev || + device->generation > latest_dev->generation) + latest_dev = device; + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + } else { + device->writeable = !bdev_read_only(bdev); + seeding = 0; + } + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + fs_devices->rotating = 1; + + fs_devices->open_devices++; + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + } + brelse(bh); + continue; + +error_brelse: + brelse(bh); + blkdev_put(bdev, flags); + continue; + } + if (fs_devices->open_devices == 0) { + ret = -EINVAL; + goto out; + } + fs_devices->seeding = seeding; + fs_devices->opened = 1; + fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->total_rw_bytes = 0; +out: + return ret; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + int ret; + + mutex_lock(&uuid_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + ret = __btrfs_open_devices(fs_devices, flags, holder); + } + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * Look for a btrfs signature on a device. This may be called out of the mount path + * and we are not allowed to call set_blocksize during the scan. The superblock + * is read via pagecache + */ +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct page *page; + void *p; + int ret = -EINVAL; + u64 devid; + u64 transid; + u64 total_devices; + u64 bytenr; + pgoff_t index; + + /* + * we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + bytenr = btrfs_sb_offset(0); + flags |= FMODE_EXCL; + mutex_lock(&uuid_mutex); + + bdev = blkdev_get_by_path(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto error; + } + + /* make sure our super fits in the device */ + if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) + goto error_bdev_put; + + /* make sure our super fits in the page */ + if (sizeof(*disk_super) > PAGE_CACHE_SIZE) + goto error_bdev_put; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_CACHE_SHIFT; + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) + goto error_bdev_put; + + /* pull in the page with our super */ + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_NOFS); + + if (IS_ERR_OR_NULL(page)) + goto error_bdev_put; + + p = kmap(page); + + /* align our pointer to the offset of the super block */ + disk_super = p + (bytenr & ~PAGE_CACHE_MASK); + + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) + goto error_unmap; + + devid = btrfs_stack_device_id(&disk_super->dev_item); + transid = btrfs_super_generation(disk_super); + total_devices = btrfs_super_num_devices(disk_super); + + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + if (ret > 0) { + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); + } else { + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); + } + + printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); + ret = 0; + } + if (!ret && fs_devices_ret) + (*fs_devices_ret)->total_devices = total_devices; + +error_unmap: + kunmap(page); + page_cache_release(page); + +error_bdev_put: + blkdev_put(bdev, flags); +error: + mutex_unlock(&uuid_mutex); + return ret; +} + +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (key.type != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int contains_pending_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 *start, u64 len) +{ + struct extent_map *em; + struct list_head *search_list = &trans->transaction->pending_chunks; + int ret = 0; + u64 physical_start = *start; + +again: + list_for_each_entry(em, search_list, list) { + struct map_lookup *map; + int i; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev != device) + continue; + if (map->stripes[i].physical >= physical_start + len || + map->stripes[i].physical + em->orig_block_len <= + physical_start) + continue; + *start = map->stripes[i].physical + + em->orig_block_len; + ret = 1; + } + } + if (search_list == &trans->transaction->pending_chunks) { + search_list = &trans->root->fs_info->pinned_chunks; + goto again; + } + + return ret; +} + + +/* + * find_free_dev_extent - find free space in the specified device + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size of the max + * free space if we don't find suitable free space + * + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. + */ +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; + u64 search_end = device->total_bytes; + int ret; + int slot; + struct extent_buffer *l; + + /* FIXME use last free of some kind */ + + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + max_hole_start = search_start; + max_hole_size = 0; + +again: + if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { + ret = -ENOSPC; + goto out; + } + + path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = device->devid; + key.offset = search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (key.type != BTRFS_DEV_EXTENT_KEY) + goto next; + + if (key.offset > search_start) { + hole_size = key.offset - search_start; + + /* + * Have to check before we set max_hole_start, otherwise + * we could end up sending back this offset anyway. + */ + if (contains_pending_extent(trans, device, + &search_start, + hole_size)) { + if (key.offset >= search_start) { + hole_size = key.offset - search_start; + } else { + WARN_ON_ONCE(1); + hole_size = 0; + } + } + + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; + } + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; +next: + path->slots[0]++; + cond_resched(); + } + + /* + * At this point, search_start should be the end of + * allocated dev extents, and when shrinking the device, + * search_end may be smaller than search_start. + */ + if (search_end > search_start) { + hole_size = search_end - search_start; + + if (contains_pending_extent(trans, device, &search_start, + hole_size)) { + btrfs_release_path(path); + goto again; + } + + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + } + + /* See above. */ + if (max_hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: + btrfs_free_path(path); + *start = max_hole_start; + if (len) + *len = max_hole_size; + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start, u64 *dev_extent_len) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; +again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + if (ret) + goto out; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } else { + btrfs_error(root->fs_info, ret, "Slot search failed"); + goto out; + } + + *dev_extent_len = btrfs_dev_extent_length(leaf, extent); + + ret = btrfs_del_item(trans, root, path); + if (ret) { + btrfs_error(root->fs_info, ret, + "Failed to remove dev extent item"); + } else { + trans->transaction->have_free_bgs = 1; + } +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + WARN_ON(!device->in_fs_metadata); + WARN_ON(device->is_tgtdev_for_dev_replace); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + if (ret) + goto out; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +static u64 find_next_chunk(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct rb_node *n; + u64 ret = 0; + + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + n = rb_last(&em_tree->map); + if (n) { + em = rb_entry(n, struct extent_map, rb_node); + ret = em->start + em->len; + } + read_unlock(&em_tree->lock); + + return ret; +} + +static noinline int find_next_devid(struct btrfs_fs_info *fs_info, + u64 *devid_ret) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); /* Corruption */ + + ret = btrfs_previous_item(fs_info->chunk_root, path, + BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *devid_ret = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *devid_ret = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +static int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * Function to update ctime/mtime for a given device path. + * Mainly used for ctime/mtime based probe like libblkid. + */ +static void update_dev_time(char *path_name) +{ + struct file *filp; + + filp = filp_open(path_name, O_RDWR, 0); + if (IS_ERR(filp)) + return; + file_update_time(filp); + filp_close(filp, NULL); + return; +} + +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; +out: + btrfs_free_path(path); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + struct btrfs_fs_devices *cur_devices; + u64 all_avail; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + unsigned seq; + int ret = 0; + bool clear_super = false; + + mutex_lock(&uuid_mutex); + + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); + + num_devices = root->fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&root->fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + WARN_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { + ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { + ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && + root->fs_info->fs_devices->rw_devices <= 2) { + ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; + goto out; + } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && + root->fs_info->fs_devices->rw_devices <= 3) { + ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; + goto out; + } + + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + device = NULL; + devices = &root->fs_info->fs_devices->devices; + /* + * It is safe to read the devices since the volume_mutex + * is held. + */ + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && + !tmp->is_tgtdev_for_dev_replace && + !tmp->bdev) { + device = tmp; + break; + } + } + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; + goto out; + } + } else { + ret = btrfs_get_bdev_and_sb(device_path, + FMODE_WRITE | FMODE_EXCL, + root->fs_info->bdev_holder, 0, + &bdev, &bh); + if (ret) + goto out; + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_uuid = disk_super->dev_item.uuid; + device = btrfs_find_device(root->fs_info, devid, dev_uuid, + disk_super->fsid); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } + + if (device->is_tgtdev_for_dev_replace) { + ret = BTRFS_ERROR_DEV_TGT_REPLACE; + goto error_brelse; + } + + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; + goto error_brelse; + } + + if (device->writeable) { + lock_chunks(root); + list_del_init(&device->dev_alloc_list); + device->fs_devices->rw_devices--; + unlock_chunks(root); + clear_super = true; + } + + mutex_unlock(&uuid_mutex); + ret = btrfs_shrink_device(device, 0); + mutex_lock(&uuid_mutex); + if (ret) + goto error_undo; + + /* + * TODO: the superblock still includes this device in its num_devices + * counter although write_all_supers() is not locked out. This + * could give a filesystem state which requires a degraded mount. + */ + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_undo; + + device->in_fs_metadata = 0; + btrfs_scrub_cancel_dev(root->fs_info, device); + + /* + * the device list mutex makes sure that we don't change + * the device list while someone else is writing out all + * the device supers. Whoever is writing all supers, should + * lock the device list mutex before getting the number of + * devices in the super block (super_copy). Conversely, + * whoever updates the number of devices in the super block + * (super_copy) should hold the device list mutex. + */ + + cur_devices = device->fs_devices; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_del_rcu(&device->dev_list); + + device->fs_devices->num_devices--; + device->fs_devices->total_devices--; + + if (device->missing) + device->fs_devices->missing_devices--; + + next_device = list_entry(root->fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (device->bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_device->bdev; + if (device->bdev == root->fs_info->fs_devices->latest_bdev) + root->fs_info->fs_devices->latest_bdev = next_device->bdev; + + if (device->bdev) { + device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + } + + call_rcu(&device->rcu, free_device); + + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + if (cur_devices->open_devices == 0) { + struct btrfs_fs_devices *fs_devices; + fs_devices = root->fs_info->fs_devices; + while (fs_devices) { + if (fs_devices->seed == cur_devices) { + fs_devices->seed = cur_devices->seed; + break; + } + fs_devices = fs_devices->seed; + } + cur_devices->seed = NULL; + __btrfs_close_devices(cur_devices); + free_fs_devices(cur_devices); + } + + root->fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); + + /* + * at this point, the device is zero sized. We want to + * remove it from the devices list and zero out the old super + */ + if (clear_super && disk_super) { + u64 bytenr; + int i; + + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + + /* clear the mirror copies of super block on the disk + * being removed, 0th copy is been taken care above and + * the below would take of the rest + */ + for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) + break; + + brelse(bh); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + continue; + } + memset(&disk_super->magic, 0, + sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } + } + + ret = 0; + + if (bdev) { + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); + } + +error_brelse: + brelse(bh); + if (bdev) + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); +out: + mutex_unlock(&uuid_mutex); + return ret; +error_undo: + if (device->writeable) { + lock_chunks(root); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + device->fs_devices->rw_devices++; + unlock_chunks(root); + } + goto error_brelse; +} + +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + struct btrfs_fs_devices *fs_devices; + + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + + /* + * in case of fs with no seed, srcdev->fs_devices will point + * to fs_devices of fs_info. However when the dev being replaced is + * a seed dev it will point to the seed's local fs_devices. In short + * srcdev will have its correct fs_devices in both the cases. + */ + fs_devices = srcdev->fs_devices; + + list_del_rcu(&srcdev->dev_list); + list_del_rcu(&srcdev->dev_alloc_list); + fs_devices->num_devices--; + if (srcdev->missing) + fs_devices->missing_devices--; + + if (srcdev->writeable) { + fs_devices->rw_devices--; + /* zero out the old super if it is writable */ + btrfs_scratch_superblock(srcdev); + } + + if (srcdev->bdev) + fs_devices->open_devices--; +} + +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + + call_rcu(&srcdev->rcu, free_device); + + /* + * unless fs_devices is seed fs, num_devices shouldn't go + * zero + */ + BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); + + /* if this is no devs we rather delete the fs_devices */ + if (!fs_devices->num_devices) { + struct btrfs_fs_devices *tmp_fs_devices; + + tmp_fs_devices = fs_info->fs_devices; + while (tmp_fs_devices) { + if (tmp_fs_devices->seed == fs_devices) { + tmp_fs_devices->seed = fs_devices->seed; + break; + } + tmp_fs_devices = tmp_fs_devices->seed; + } + fs_devices->seed = NULL; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } +} + +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + struct btrfs_device *next_device; + + mutex_lock(&uuid_mutex); + WARN_ON(!tgtdev); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + if (tgtdev->bdev) { + btrfs_scratch_superblock(tgtdev); + fs_info->fs_devices->open_devices--; + } + fs_info->fs_devices->num_devices--; + + next_device = list_entry(fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (tgtdev->bdev == fs_info->sb->s_bdev) + fs_info->sb->s_bdev = next_device->bdev; + if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; + list_del_rcu(&tgtdev->dev_list); + + call_rcu(&tgtdev->rcu, free_device); + + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); +} + +static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device) +{ + int ret = 0; + struct btrfs_super_block *disk_super; + u64 devid; + u8 *dev_uuid; + struct block_device *bdev; + struct buffer_head *bh; + + *device = NULL; + ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, + root->fs_info->bdev_holder, 0, &bdev, &bh); + if (ret) + return ret; + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_uuid = disk_super->dev_item.uuid; + *device = btrfs_find_device(root->fs_info, devid, dev_uuid, + disk_super->fsid); + brelse(bh); + if (!*device) + ret = -ENOENT; + blkdev_put(bdev, FMODE_READ); + return ret; +} + +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device) +{ + *device = NULL; + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + devices = &root->fs_info->fs_devices->devices; + /* + * It is safe to read the devices since the volume_mutex + * is held by the caller. + */ + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && !tmp->bdev) { + *device = tmp; + break; + } + } + + if (!*device) { + btrfs_err(root->fs_info, "no missing device found"); + return -ENOENT; + } + + return 0; + } else { + return btrfs_find_device_by_path(root, device_path, device); + } +} + +/* + * does all the dirty work required for changing file system's UUID. + */ +static int btrfs_prepare_sprout(struct btrfs_root *root) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + if (!fs_devices->seeding) + return -EINVAL; + + seed_devices = __alloc_fs_devices(); + if (IS_ERR(seed_devices)) + return PTR_ERR(seed_devices); + + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return PTR_ERR(old_devices); + } + + list_add(&old_devices->list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + mutex_init(&seed_devices->device_list_mutex); + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, + synchronize_rcu); + list_for_each_entry(device, &seed_devices->devices, dev_list) + device->fs_devices = seed_devices; + + lock_chunks(root); + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); + unlock_chunks(root); + + fs_devices->seeding = 0; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->missing_devices = 0; + fs_devices->rotating = 0; + fs_devices->seed = seed_devices; + + generate_random_uuid(fs_devices->fsid); + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); + + return 0; +} + +/* + * strore the expected generation for seed devices in device items. + */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + u64 devid; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root = root->fs_info->chunk_root; + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, + fs_uuid); + BUG_ON(!device); /* Logic error */ + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct request_queue *q; + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *devices; + struct super_block *sb = root->fs_info->sb; + struct rcu_string *name; + u64 tmp; + int seeding_dev = 0; + int ret = 0; + + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + return -EROFS; + + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + + devices = &root->fs_info->fs_devices->devices; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); + goto error; + } + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + device = btrfs_alloc_device(root->fs_info, NULL, NULL); + if (IS_ERR(device)) { + /* we can safely leave the fs_devices entry around */ + ret = PTR_ERR(device); + goto error; + } + + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + rcu_assign_pointer(device->name, name); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + rcu_string_free(device->name); + kfree(device); + ret = PTR_ERR(trans); + goto error; + } + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + device->writeable = 1; + device->generation = trans->transid; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; + device->commit_total_bytes = device->total_bytes; + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 0; + device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; + set_blocksize(device->bdev, 4096); + + if (seeding_dev) { + sb->s_flags &= ~MS_RDONLY; + ret = btrfs_prepare_sprout(root); + BUG_ON(ret); /* -ENOMEM */ + } + + device->fs_devices = root->fs_info->fs_devices; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + lock_chunks(root); + list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; + root->fs_info->fs_devices->rw_devices++; + root->fs_info->fs_devices->total_devices++; + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + root->fs_info->fs_devices->rotating = 1; + + tmp = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, + tmp + device->total_bytes); + + tmp = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, + tmp + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + + unlock_chunks(root); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + if (seeding_dev) { + lock_chunks(root); + ret = init_first_rw_device(trans, root, device); + unlock_chunks(root); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } + } + + ret = btrfs_add_device(trans, root, device); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } + + if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + + ret = btrfs_finish_sprout(trans, root); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; + } + + root->fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); + ret = btrfs_commit_transaction(trans, root); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + + if (ret) /* transaction commit */ + return ret; + + ret = btrfs_relocate_sys_chunks(root); + if (ret < 0) + btrfs_error(root->fs_info, ret, + "Failed to relocate sys chunks after " + "device initialization. This can be fixed " + "using the \"btrfs balance\" command."); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + } + + /* Update ctime/mtime for libblkid */ + update_dev_time(device_path); + return ret; + +error_trans: + btrfs_end_transaction(trans, root); + rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); + kfree(device); +error: + blkdev_put(bdev, FMODE_EXCL); + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + return ret; +} + +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device *srcdev, + struct btrfs_device **device_out) +{ + struct request_queue *q; + struct btrfs_device *device; + struct block_device *bdev; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *devices; + struct rcu_string *name; + u64 devid = BTRFS_DEV_REPLACE_DEVID; + int ret = 0; + + *device_out = NULL; + if (fs_info->fs_devices->seeding) { + btrfs_err(fs_info, "the filesystem is a seed filesystem!"); + return -EINVAL; + } + + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + fs_info->bdev_holder); + if (IS_ERR(bdev)) { + btrfs_err(fs_info, "target device %s is invalid!", device_path); + return PTR_ERR(bdev); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + btrfs_err(fs_info, "target device is in the filesystem!"); + ret = -EEXIST; + goto error; + } + } + + + if (i_size_read(bdev->bd_inode) < + btrfs_device_get_total_bytes(srcdev)) { + btrfs_err(fs_info, "target device is smaller than source device!"); + ret = -EINVAL; + goto error; + } + + + device = btrfs_alloc_device(NULL, &devid, NULL); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + rcu_assign_pointer(device->name, name); + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + device->writeable = 1; + device->generation = 0; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = btrfs_device_get_total_bytes(srcdev); + device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); + device->bytes_used = btrfs_device_get_bytes_used(srcdev); + ASSERT(list_empty(&srcdev->resized_list)); + device->commit_total_bytes = srcdev->commit_total_bytes; + device->commit_bytes_used = device->bytes_used; + device->dev_root = fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 1; + device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; + set_blocksize(device->bdev, 4096); + device->fs_devices = fs_info->fs_devices; + list_add(&device->dev_list, &fs_info->fs_devices->devices); + fs_info->fs_devices->num_devices++; + fs_info->fs_devices->open_devices++; + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + *device_out = device; + return ret; + +error: + blkdev_put(bdev, FMODE_EXCL); + return ret; +} + +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + WARN_ON(fs_info->fs_devices->rw_devices == 0); + tgtdev->io_width = fs_info->dev_root->sectorsize; + tgtdev->io_align = fs_info->dev_root->sectorsize; + tgtdev->sector_size = fs_info->dev_root->sectorsize; + tgtdev->dev_root = fs_info->dev_root; + tgtdev->in_fs_metadata = 1; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + + root = device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + device->dev_root->fs_info->super_copy; + struct btrfs_fs_devices *fs_devices; + u64 old_total; + u64 diff; + + if (!device->writeable) + return -EACCES; + + lock_chunks(device->dev_root); + old_total = btrfs_super_total_bytes(super_copy); + diff = new_size - device->total_bytes; + + if (new_size <= device->total_bytes || + device->is_tgtdev_for_dev_replace) { + unlock_chunks(device->dev_root); + return -EINVAL; + } + + fs_devices = device->dev_root->fs_info->fs_devices; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + device->fs_devices->total_rw_bytes += diff; + + btrfs_device_set_total_bytes(device, new_size); + btrfs_device_set_disk_total_bytes(device, new_size); + btrfs_clear_space_info_full(device->dev_root->fs_info); + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &fs_devices->resized_devices); + unlock_chunks(device->dev_root); + + return btrfs_update_device(trans, device); +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + else if (ret > 0) { /* Logic error or corruption */ + btrfs_error(root->fs_info, -ENOENT, + "Failed lookup while freeing chunk."); + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret < 0) + btrfs_error(root->fs_info, ret, + "Failed to delete chunk item."); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + lock_chunks(root); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + unlock_chunks(root); + return ret; +} + +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct map_lookup *map; + u64 dev_extent_len = 0; + u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + int i, ret = 0; + + /* Just in case */ + root = root->fs_info->chunk_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + + if (!em || em->start > chunk_offset || + em->start + em->len < chunk_offset) { + /* + * This is a logic error, but we don't want to just rely on the + * user having built with ASSERT enabled, so if ASSERT doens't + * do anything we still error out. + */ + ASSERT(0); + if (em) + free_extent_map(em); + return -EINVAL; + } + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + ret = btrfs_free_dev_extent(trans, device, + map->stripes[i].physical, + &dev_extent_len); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + if (device->bytes_used > 0) { + lock_chunks(root); + btrfs_device_set_bytes_used(device, + device->bytes_used - dev_extent_len); + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += dev_extent_len; + spin_unlock(&root->fs_info->free_chunk_lock); + btrfs_clear_space_info_full(root->fs_info); + unlock_chunks(root); + } + + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + } + } + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + trace_btrfs_chunk_free(root, map, chunk_offset, em->len); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + } + + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + +out: + /* once for us */ + free_extent_map(em); + return ret; +} + +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_objectid, + u64 chunk_offset) +{ + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + int ret; + + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_std_error(root->fs_info, ret); + return ret; + } + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + ret = btrfs_remove_chunk(trans, root, chunk_offset); + btrfs_end_transaction(trans, root); + return ret; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_root *root) +{ + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_type; + bool retried = false; + int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); /* Corruption */ + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(chunk_root, + found_key.objectid, + found_key.offset); + if (ret == -ENOSPC) + failed++; + else + BUG_ON(ret); + } + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (WARN_ON(failed && retried)) { + ret = -ENOSPC; + } +error: + btrfs_free_path(path); + return ret; +} + +static int insert_balance_item(struct btrfs_root *root, + struct btrfs_balance_control *bctl) +{ + struct btrfs_trans_handle *trans; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); + btrfs_set_balance_data(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); + btrfs_set_balance_meta(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); + btrfs_set_balance_sys(leaf, item, &disk_bargs); + + btrfs_set_balance_flags(leaf, item, bctl->flags); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +static int del_balance_item(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +/* + * This is a heuristic used to reduce the number of chunks balanced on + * resume after balance was interrupted. + */ +static void update_balance_args(struct btrfs_balance_control *bctl) +{ + /* + * Turn on soft mode for chunk types that were being converted. + */ + if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; + + /* + * Turn on usage filter if is not already used. The idea is + * that chunks that we have already balanced should be + * reasonably full. Don't do it for chunks that are being + * converted - that will keep us from relocating unconverted + * (albeit full) chunks. + */ + if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->data.usage = 90; + } + if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->sys.usage = 90; + } + if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->meta.usage = 90; + } +} + +/* + * Should be called with both balance and volume mutexes held to + * serialize other volume operations (add_dev/rm_dev/resize) with + * restriper. Same goes for unset_balance_control. + */ +static void set_balance_control(struct btrfs_balance_control *bctl) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + + BUG_ON(fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); +} + +static void unset_balance_control(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + BUG_ON(!fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = NULL; + spin_unlock(&fs_info->balance_lock); + + kfree(bctl); +} + +/* + * Balance filters. Return 1 if chunk should be filtered out + * (should not be balanced). + */ +static int chunk_profiles_filter(u64 chunk_type, + struct btrfs_balance_args *bargs) +{ + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; + + if (bargs->profiles & chunk_type) + return 0; + + return 1; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; + u64 chunk_used, user_thresh; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + if (bargs->usage == 0) + user_thresh = 1; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + + if (chunk_used < user_thresh) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_devid_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + int i; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) + return 0; + } + + return 1; +} + +/* [pstart, pend) */ +static int chunk_drange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + u64 stripe_offset; + u64 stripe_length; + int factor; + int i; + + if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) + return 0; + + if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { + factor = num_stripes / 2; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { + factor = num_stripes - 1; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { + factor = num_stripes - 2; + } else { + factor = num_stripes; + } + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) + continue; + + stripe_offset = btrfs_stripe_offset(leaf, stripe); + stripe_length = btrfs_chunk_length(leaf, chunk); + stripe_length = div_u64(stripe_length, factor); + + if (stripe_offset < bargs->pend && + stripe_offset + stripe_length > bargs->pstart) + return 0; + } + + return 1; +} + +/* [vstart, vend) */ +static int chunk_vrange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + if (chunk_offset < bargs->vend && + chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) + /* at least part of the chunk is inside this vrange */ + return 0; + + return 1; +} + +static int chunk_soft_convert_filter(u64 chunk_type, + struct btrfs_balance_args *bargs) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return 0; + + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; + + if (bargs->target == chunk_type) + return 1; + + return 0; +} + +static int should_balance_chunk(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 chunk_offset) +{ + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + struct btrfs_balance_args *bargs = NULL; + u64 chunk_type = btrfs_chunk_type(leaf, chunk); + + /* type filter */ + if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & + (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { + return 0; + } + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + bargs = &bctl->data; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + bargs = &bctl->sys; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + bargs = &bctl->meta; + + /* profiles filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && + chunk_profiles_filter(chunk_type, bargs)) { + return 0; + } + + /* usage filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && + chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; + } + + /* devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && + chunk_devid_filter(leaf, chunk, bargs)) { + return 0; + } + + /* drange filter, makes sense only with devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && + chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* vrange filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && + chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* soft profile changing mode */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && + chunk_soft_convert_filter(chunk_type, bargs)) { + return 0; + } + + /* + * limited by count, must be the last filter + */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { + if (bargs->limit == 0) + return 0; + else + bargs->limit--; + } + + return 1; +} + +static int __btrfs_balance(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_root *dev_root = fs_info->dev_root; + struct list_head *devices; + struct btrfs_device *device; + u64 old_size; + u64 size_to_free; + struct btrfs_chunk *chunk; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + int slot; + int ret; + int enospc_errors = 0; + bool counting = true; + u64 limit_data = bctl->data.limit; + u64 limit_meta = bctl->meta.limit; + u64 limit_sys = bctl->sys.limit; + + /* step one make some room on all the devices */ + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { + old_size = btrfs_device_get_total_bytes(device); + size_to_free = div_factor(old_size, 1); + size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + if (!device->writeable || + btrfs_device_get_total_bytes(device) - + btrfs_device_get_bytes_used(device) > size_to_free || + device->is_tgtdev_for_dev_replace) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + if (ret == -ENOSPC) + break; + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 0); + BUG_ON(IS_ERR(trans)); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + + /* zero out stat counters */ + spin_lock(&fs_info->balance_lock); + memset(&bctl->stat, 0, sizeof(bctl->stat)); + spin_unlock(&fs_info->balance_lock); +again: + if (!counting) { + bctl->data.limit = limit_data; + bctl->meta.limit = limit_meta; + bctl->sys.limit = limit_sys; + } + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + if ((!counting && atomic_read(&fs_info->balance_pause_req)) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -ECANCELED; + goto error; + } + + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + BUG(); /* FIXME break ? */ + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) { + ret = 0; + break; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + + if (!counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.considered++; + spin_unlock(&fs_info->balance_lock); + } + + ret = should_balance_chunk(chunk_root, leaf, chunk, + found_key.offset); + btrfs_release_path(path); + if (!ret) + goto loop; + + if (counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.expected++; + spin_unlock(&fs_info->balance_lock); + goto loop; + } + + ret = btrfs_relocate_chunk(chunk_root, + found_key.objectid, + found_key.offset); + if (ret && ret != -ENOSPC) + goto error; + if (ret == -ENOSPC) { + enospc_errors++; + } else { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed++; + spin_unlock(&fs_info->balance_lock); + } +loop: + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + + if (counting) { + btrfs_release_path(path); + counting = false; + goto again; + } +error: + btrfs_free_path(path); + if (enospc_errors) { + btrfs_info(fs_info, "%d enospc errors during balance", + enospc_errors); + if (!ret) + ret = -ENOSPC; + } + + return ret; +} + +/** + * alloc_profile_is_valid - see if a given profile is valid and reduced + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static int alloc_profile_is_valid(u64 flags, int extended) +{ + u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : + BTRFS_BLOCK_GROUP_PROFILE_MASK); + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + + /* 1) check that all other bits are zeroed */ + if (flags & ~mask) + return 0; + + /* 2) see if profile is reduced */ + if (flags == 0) + return !extended; /* "0" is valid for usual profiles */ + + /* true if exactly one bit set */ + return (flags & (flags - 1)) == 0; +} + +static inline int balance_need_close(struct btrfs_fs_info *fs_info) +{ + /* cancel requested || normal exit path */ + return atomic_read(&fs_info->balance_cancel_req) || + (atomic_read(&fs_info->balance_pause_req) == 0 && + atomic_read(&fs_info->balance_cancel_req) == 0); +} + +static void __cancel_balance(struct btrfs_fs_info *fs_info) +{ + int ret; + + unset_balance_control(fs_info); + ret = del_balance_item(fs_info->tree_root); + if (ret) + btrfs_std_error(fs_info, ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +} + +/* + * Should be called with both balance and volume mutexes held + */ +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 allowed; + int mixed = 0; + int ret; + u64 num_devices; + unsigned seq; + + if (btrfs_fs_closing(fs_info) || + atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -EINVAL; + goto out; + } + + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + /* + * In case of mixed groups both data and meta should be picked, + * and identical options should be given for both of them. + */ + allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; + if (mixed && (bctl->flags & allowed)) { + if (!(bctl->flags & BTRFS_BALANCE_DATA) || + !(bctl->flags & BTRFS_BALANCE_METADATA) || + memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { + btrfs_err(fs_info, "with mixed groups data and " + "metadata balance options must be the same"); + ret = -EINVAL; + goto out; + } + } + + num_devices = fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + BUG_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&fs_info->dev_replace); + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + if (num_devices == 1) + allowed |= BTRFS_BLOCK_GROUP_DUP; + else if (num_devices > 1) + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); + if (num_devices > 2) + allowed |= BTRFS_BLOCK_GROUP_RAID5; + if (num_devices > 3) + allowed |= (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID6); + if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->data.target, 1) || + (bctl->data.target & ~allowed))) { + btrfs_err(fs_info, "unable to start balance with target " + "data profile %llu", + bctl->data.target); + ret = -EINVAL; + goto out; + } + if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->meta.target, 1) || + (bctl->meta.target & ~allowed))) { + btrfs_err(fs_info, + "unable to start balance with target metadata profile %llu", + bctl->meta.target); + ret = -EINVAL; + goto out; + } + if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->sys.target, 1) || + (bctl->sys.target & ~allowed))) { + btrfs_err(fs_info, + "unable to start balance with target system profile %llu", + bctl->sys.target); + ret = -EINVAL; + goto out; + } + + /* allow dup'ed data chunks only in mixed mode */ + if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { + btrfs_err(fs_info, "dup for data is not allowed"); + ret = -EINVAL; + goto out; + } + + /* allow to reduce meta or sys integrity only if force set */ + allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6; + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, "force reducing metadata integrity"); + } else { + btrfs_err(fs_info, "balance will reduce metadata " + "integrity, use force if you want this"); + ret = -EINVAL; + goto out; + } + } + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + int num_tolerated_disk_barrier_failures; + u64 target = bctl->sys.target; + + num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (num_tolerated_disk_barrier_failures > 0 && + (target & + (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_AVAIL_ALLOC_BIT_SINGLE))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1 && + (target & + (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) + num_tolerated_disk_barrier_failures = 1; + + fs_info->num_tolerated_disk_barrier_failures = + num_tolerated_disk_barrier_failures; + } + + ret = insert_balance_item(fs_info->tree_root, bctl); + if (ret && ret != -EEXIST) + goto out; + + if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { + BUG_ON(ret == -EEXIST); + set_balance_control(bctl); + } else { + BUG_ON(ret != -EEXIST); + spin_lock(&fs_info->balance_lock); + update_balance_args(bctl); + spin_unlock(&fs_info->balance_lock); + } + + atomic_inc(&fs_info->balance_running); + mutex_unlock(&fs_info->balance_mutex); + + ret = __btrfs_balance(fs_info); + + mutex_lock(&fs_info->balance_mutex); + atomic_dec(&fs_info->balance_running); + + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + } + + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, 0, bargs); + } + + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + + wake_up(&fs_info->balance_wait_q); + + return ret; +out: + if (bctl->flags & BTRFS_BALANCE_RESUME) + __cancel_balance(fs_info); + else { + kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } + return ret; +} + +static int balance_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + int ret = 0; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) { + btrfs_info(fs_info, "continuing balance"); + ret = btrfs_balance(fs_info->balance_ctl, NULL); + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + + return ret; +} + +int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *tsk; + + spin_lock(&fs_info->balance_lock); + if (!fs_info->balance_ctl) { + spin_unlock(&fs_info->balance_lock); + return 0; + } + spin_unlock(&fs_info->balance_lock); + + if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + btrfs_info(fs_info, "force skipping balance"); + return 0; + } + + tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); + return PTR_ERR_OR_ZERO(tsk); +} + +int btrfs_recover_balance(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { /* ret = -ENOENT; */ + ret = 0; + goto out; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + bctl->fs_info = fs_info; + bctl->flags = btrfs_balance_flags(leaf, item); + bctl->flags |= BTRFS_BALANCE_RESUME; + + btrfs_balance_data(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); + btrfs_balance_meta(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); + btrfs_balance_sys(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + set_balance_control(bctl); + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_pause_balance(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + if (atomic_read(&fs_info->balance_running)) { + atomic_inc(&fs_info->balance_pause_req); + mutex_unlock(&fs_info->balance_mutex); + + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + + mutex_lock(&fs_info->balance_mutex); + /* we are good with balance_ctl ripped off from under us */ + BUG_ON(atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_pause_req); + } else { + ret = -ENOTCONN; + } + + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) +{ + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + atomic_inc(&fs_info->balance_cancel_req); + /* + * if we are running just wait and return, balance item is + * deleted in btrfs_balance in this case + */ + if (atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + mutex_lock(&fs_info->balance_mutex); + } else { + /* __cancel_balance needs volume_mutex */ + mutex_unlock(&fs_info->balance_mutex); + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) + __cancel_balance(fs_info); + + mutex_unlock(&fs_info->volume_mutex); + } + + BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_cancel_req); + mutex_unlock(&fs_info->balance_mutex); + return 0; +} + +static int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + max_key.objectid = (u64)-1; + max_key.type = BTRFS_ROOT_ITEM_KEY; + max_key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_forward(root, &key, path, 0); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + if (trans) { + ret = btrfs_end_transaction(trans, fs_info->uuid_root); + trans = NULL; + if (ret) + break; + } + + btrfs_release_path(path); + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fs_info->uuid_root); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else + fs_info->update_uuid_tree_gen = 1; + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +/* + * Callback for btrfs_uuid_tree_iterate(). + * returns: + * 0 check succeeded, the entry is not outdated. + * < 0 if an error occured. + * > 0 if the check failed, which means the caller shall remove the entry. + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, + u8 *uuid, u8 type, u64 subid) +{ + struct btrfs_key key; + int ret = 0; + struct btrfs_root *subvol_root; + + if (type != BTRFS_UUID_KEY_SUBVOL && + type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto out; + + key.objectid = subid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(subvol_root)) { + ret = PTR_ERR(subvol_root); + if (ret == -ENOENT) + ret = 1; + goto out; + } + + switch (type) { + case BTRFS_UUID_KEY_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) + ret = 1; + break; + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.received_uuid, + BTRFS_UUID_SIZE)) + ret = 1; + break; + } + +out: + return ret; +} + +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + int ret; + + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree. + */ + ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); + if (ret < 0) { + btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, fs_info, + BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + btrfs_abort_transaction(trans, tree_root, + PTR_ERR(uuid_root)); + return PTR_ERR(uuid_root); + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_rescan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. + * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + int failed = 0; + bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 old_size = btrfs_device_get_total_bytes(device); + u64 diff = old_size - new_size; + + if (device->is_tgtdev_for_dev_replace) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + + lock_chunks(root); + + btrfs_device_set_total_bytes(device, new_size); + if (device->writeable) { + device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } + unlock_chunks(root); + +again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + do { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + btrfs_release_path(path); + break; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) { + btrfs_release_path(path); + break; + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) { + btrfs_release_path(path); + break; + } + + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(path); + + ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); + if (ret && ret != -ENOSPC) + goto done; + if (ret == -ENOSPC) + failed++; + } while (key.offset-- > 0); + + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + ret = -ENOSPC; + lock_chunks(root); + + btrfs_device_set_total_bytes(device, old_size); + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); + unlock_chunks(root); + goto done; + } + + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto done; + } + + lock_chunks(root); + btrfs_device_set_disk_total_bytes(device, new_size); + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &root->fs_info->fs_devices->resized_devices); + + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); + btrfs_end_transaction(trans, root); +done: + btrfs_free_path(path); + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + lock_chunks(root); + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size + sizeof(disk_key) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + unlock_chunks(root); + return -EFBIG; + } + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + unlock_chunks(root); + + return 0; +} + +/* + * sort the devices in descending order by max_avail, total_avail + */ +static int btrfs_cmp_device_info(const void *a, const void *b) +{ + const struct btrfs_device_info *di_a = a; + const struct btrfs_device_info *di_b = b; + + if (di_a->max_avail > di_b->max_avail) + return -1; + if (di_a->max_avail < di_b->max_avail) + return 1; + if (di_a->total_avail > di_b->total_avail) + return -1; + if (di_a->total_avail < di_b->total_avail) + return 1; + return 0; +} + +static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .devs_increment = 1, + .ncopies = 3, + }, +}; + +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ + /* TODO allow them to set a preferred stripe size */ + return 64 * 1024; +} + +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) + return; + + btrfs_set_fs_incompat(info, RAID56); +} + +#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ + - sizeof(struct btrfs_item) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 start, + u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info = NULL; + u64 total_avail; + int num_stripes; /* total number of stripes to allocate */ + int data_stripes; /* number of stripes that count for + block group size */ + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ + int ret; + u64 max_stripe_size; + u64 max_chunk_size; + u64 stripe_size; + u64 num_bytes; + u64 raid_stripe_len = BTRFS_STRIPE_LEN; + int ndevs; + int i; + int j; + int index; + + BUG_ON(!alloc_profile_is_valid(type, 0)); + + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + index = __get_raid_index(type); + + sub_stripes = btrfs_raid_array[index].sub_stripes; + dev_stripes = btrfs_raid_array[index].dev_stripes; + devs_max = btrfs_raid_array[index].devs_max; + devs_min = btrfs_raid_array[index].devs_min; + devs_increment = btrfs_raid_array[index].devs_increment; + ncopies = btrfs_raid_array[index].ncopies; + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_stripe_size = 1024 * 1024 * 1024; + max_chunk_size = 10 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; + max_chunk_size = max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + max_stripe_size = 32 * 1024 * 1024; + max_chunk_size = 2 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; + } else { + btrfs_err(info, "invalid chunk type 0x%llx requested", + type); + BUG_ON(1); + } + + /* we don't want a chunk larger than 10% of writeable space */ + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + max_chunk_size); + + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + cur = fs_devices->alloc_list.next; + + /* + * in the first pass through the devices list, we gather information + * about the available holes on each device. + */ + ndevs = 0; + while (cur != &fs_devices->alloc_list) { + struct btrfs_device *device; + u64 max_avail; + u64 dev_offset; + + device = list_entry(cur, struct btrfs_device, dev_alloc_list); + + cur = cur->next; + + if (!device->writeable) { + WARN(1, KERN_ERR + "BTRFS: read-only device in alloc_list\n"); + continue; + } + + if (!device->in_fs_metadata || + device->is_tgtdev_for_dev_replace) + continue; + + if (device->total_bytes > device->bytes_used) + total_avail = device->total_bytes - device->bytes_used; + else + total_avail = 0; + + /* If there is no space on this device, skip it. */ + if (total_avail == 0) + continue; + + ret = find_free_dev_extent(trans, device, + max_stripe_size * dev_stripes, + &dev_offset, &max_avail); + if (ret && ret != -ENOSPC) + goto error; + + if (ret == 0) + max_avail = max_stripe_size * dev_stripes; + + if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) + continue; + + if (ndevs == fs_devices->rw_devices) { + WARN(1, "%s: found more than %llu devices\n", + __func__, fs_devices->rw_devices); + break; + } + devices_info[ndevs].dev_offset = dev_offset; + devices_info[ndevs].max_avail = max_avail; + devices_info[ndevs].total_avail = total_avail; + devices_info[ndevs].dev = device; + ++ndevs; + } + + /* + * now sort the devices by hole size / available space + */ + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + + /* round down to number of usable stripes */ + ndevs -= ndevs % devs_increment; + + if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { + ret = -ENOSPC; + goto error; + } + + if (devs_max && ndevs > devs_max) + ndevs = devs_max; + /* + * the primary goal is to maximize the number of stripes, so use as many + * devices as possible, even if the stripes are not maximum sized. + */ + stripe_size = devices_info[ndevs-1].max_avail; + num_stripes = ndevs * dev_stripes; + + /* + * this will have to be fixed for RAID1 and RAID10 over + * more drives + */ + data_stripes = num_stripes / ncopies; + + if (type & BTRFS_BLOCK_GROUP_RAID5) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 1, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 1; + } + if (type & BTRFS_BLOCK_GROUP_RAID6) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 2, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 2; + } + + /* + * Use the number of data stripes to figure out how big this chunk + * is really going to be in terms of logical address space, + * and compare that answer with the max chunk size + */ + if (stripe_size * data_stripes > max_chunk_size) { + u64 mask = (1ULL << 24) - 1; + + stripe_size = div_u64(max_chunk_size, data_stripes); + + /* bump the answer up to a 16MB boundary */ + stripe_size = (stripe_size + mask) & ~mask; + + /* but don't go higher than the limits we found + * while searching for free extents + */ + if (stripe_size > devices_info[ndevs-1].max_avail) + stripe_size = devices_info[ndevs-1].max_avail; + } + + stripe_size = div_u64(stripe_size, dev_stripes); + + /* align to BTRFS_STRIPE_LEN */ + stripe_size = div_u64(stripe_size, raid_stripe_len); + stripe_size *= raid_stripe_len; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; + + for (i = 0; i < ndevs; ++i) { + for (j = 0; j < dev_stripes; ++j) { + int s = i * dev_stripes + j; + map->stripes[s].dev = devices_info[i].dev; + map->stripes[s].physical = devices_info[i].dev_offset + + j * stripe_size; + } + } + map->sector_size = extent_root->sectorsize; + map->stripe_len = raid_stripe_len; + map->io_align = raid_stripe_len; + map->io_width = raid_stripe_len; + map->type = type; + map->sub_stripes = sub_stripes; + + num_bytes = stripe_size * data_stripes; + + trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); + + em = alloc_extent_map(); + if (!em) { + kfree(map); + ret = -ENOMEM; + goto error; + } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + em->bdev = (struct block_device *)map; + em->start = start; + em->len = num_bytes; + em->block_start = 0; + em->block_len = em->len; + em->orig_block_len = stripe_size; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + if (!ret) { + list_add_tail(&em->list, &trans->transaction->pending_chunks); + atomic_inc(&em->refs); + } + write_unlock(&em_tree->lock); + if (ret) { + free_extent_map(em); + goto error; + } + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, num_bytes); + if (ret) + goto error_del_extent; + + for (i = 0; i < map->num_stripes; i++) { + num_bytes = map->stripes[i].dev->bytes_used + stripe_size; + btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); + } + + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + + free_extent_map(em); + check_raid56_incompat_flag(extent_root->fs_info, type); + + kfree(devices_info); + return 0; + +error_del_extent: + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* One for our allocation */ + free_extent_map(em); + /* One for the tree reference */ + free_extent_map(em); + /* One for the pending_chunks list reference */ + free_extent_map(em); +error: + kfree(devices_info); + return ret; +} + +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + u64 chunk_offset, u64 chunk_size) +{ + struct btrfs_key key; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_device *device; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct map_lookup *map; + size_t item_size; + u64 dev_offset; + u64 stripe_size; + int i = 0; + int ret; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(extent_root->fs_info, "unable to find logical " + "%Lu len %Lu", chunk_offset, chunk_size); + return -EINVAL; + } + + if (em->start != chunk_offset || em->len != chunk_size) { + btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" + " %Lu-%Lu, found %Lu-%Lu", chunk_offset, + chunk_size, em->start, em->len); + free_extent_map(em); + return -EINVAL; + } + + map = (struct map_lookup *)em->bdev; + item_size = btrfs_chunk_item_size(map->num_stripes); + stripe_size = em->orig_block_len; + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + + ret = btrfs_update_device(trans, device); + if (ret) + goto out; + ret = btrfs_alloc_dev_extent(trans, device, + chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + chunk_offset, dev_offset, + stripe_size); + if (ret) + goto out; + } + + stripe = &chunk->stripe; + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + } + + btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + /* + * TODO: Cleanup of inserted chunk root in case of + * failure. + */ + ret = btrfs_add_system_chunk(chunk_root, &key, chunk, + item_size); + } + +out: + kfree(chunk); + free_extent_map(em); + return ret; +} + +/* + * Chunk allocation falls into two parts. The first part does works + * that make the new allocated chunk useable, but not do any operation + * that modifies the chunk tree. The second part does the works that + * require modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type) +{ + u64 chunk_offset; + + ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex)); + chunk_offset = find_next_chunk(extent_root->fs_info); + return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + u64 chunk_offset; + u64 sys_chunk_offset; + u64 alloc_profile; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + + chunk_offset = find_next_chunk(fs_info); + alloc_profile = btrfs_get_alloc_profile(extent_root, 0); + ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, + alloc_profile); + if (ret) + return ret; + + sys_chunk_offset = find_next_chunk(root->fs_info); + alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); + ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, + alloc_profile); + return ret; +} + +static inline int btrfs_chunk_max_errors(struct map_lookup *map) +{ + int max_errors; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { + max_errors = 2; + } else { + max_errors = 0; + } + + return max_errors; +} + +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int readonly = 0; + int miss_ndevs = 0; + int i; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + read_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev->missing) { + miss_ndevs++; + continue; + } + + if (!map->stripes[i].dev->writeable) { + readonly = 1; + goto end; + } + } + + /* + * If the number of missing devices is larger than max errors, + * we can not write the data into that chunk successfully, so + * set it readonly. + */ + if (miss_ndevs > btrfs_chunk_max_errors(map)) + readonly = 1; +end: + free_extent_map(em); + return readonly; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while (1) { + write_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + write_unlock(&tree->map_tree.lock); + if (!em) + break; + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) +{ + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + + /* + * We could return errors for these cases, but that could get ugly and + * we'd probably do the same thing which is just not do anything else + * and exit, so return 1 so the callers don't try to use other copies. + */ + if (!em) { + btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, + logical+len); + return 1; + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " + "%Lu-%Lu", logical, logical+len, em->start, + em->start + em->len); + free_extent_map(em); + return 1; + } + + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID10) + ret = map->sub_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + ret = 3; + else + ret = 1; + free_extent_map(em); + + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + ret++; + btrfs_dev_replace_unlock(&fs_info->dev_replace); + + return ret; +} + +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + unsigned long len = root->sectorsize; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + len = map->stripe_len * nr_data_stripes(map); + free_extent_map(em); + return len; +} + +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct btrfs_fs_info *fs_info, + struct map_lookup *map, int first, int num, + int optimal, int dev_replace_is_ongoing) +{ + int i; + int tolerance; + struct btrfs_device *srcdev; + + if (dev_replace_is_ongoing && + fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) + srcdev = fs_info->dev_replace.srcdev; + else + srcdev = NULL; + + /* + * try to avoid the drive that is the source drive for a + * dev-replace procedure, only choose it if no other non-missing + * mirror is available + */ + for (tolerance = 0; tolerance < 2; tolerance++) { + if (map->stripes[optimal].dev->bdev && + (tolerance || map->stripes[optimal].dev != srcdev)) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev && + (tolerance || map->stripes[i].dev != srcdev)) + return i; + } + } + + /* we couldn't find one that doesn't fail. Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + +static inline int parity_smaller(u64 a, u64 b) +{ + return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) +{ + struct btrfs_bio_stripe s; + int i; + u64 l; + int again = 1; + + while (again) { + again = 0; + for (i = 0; i < num_stripes - 1; i++) { + if (parity_smaller(bbio->raid_map[i], + bbio->raid_map[i+1])) { + s = bbio->stripes[i]; + l = bbio->raid_map[i]; + bbio->stripes[i] = bbio->stripes[i+1]; + bbio->raid_map[i] = bbio->raid_map[i+1]; + bbio->stripes[i+1] = s; + bbio->raid_map[i+1] = l; + + again = 1; + } + } + } +} + +static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) +{ + struct btrfs_bio *bbio = kzalloc( + /* the size of the btrfs_bio */ + sizeof(struct btrfs_bio) + + /* plus the variable array for the stripes */ + sizeof(struct btrfs_bio_stripe) * (total_stripes) + + /* plus the variable array for the tgt dev */ + sizeof(int) * (real_stripes) + + /* + * plus the raid_map, which includes both the tgt dev + * and the stripes + */ + sizeof(u64) * (total_stripes), + GFP_NOFS); + if (!bbio) + return NULL; + + atomic_set(&bbio->error, 0); + atomic_set(&bbio->refs, 1); + + return bbio; +} + +void btrfs_get_bbio(struct btrfs_bio *bbio) +{ + WARN_ON(!atomic_read(&bbio->refs)); + atomic_inc(&bbio->refs); +} + +void btrfs_put_bbio(struct btrfs_bio *bbio) +{ + if (!bbio) + return; + if (atomic_dec_and_test(&bbio->refs)) + kfree(bbio); +} + +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, + int mirror_num, int need_raid_map) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + u64 stripe_offset; + u64 stripe_end_offset; + u64 stripe_nr; + u64 stripe_nr_orig; + u64 stripe_nr_end; + u64 stripe_len; + u32 stripe_index; + int i; + int ret = 0; + int num_stripes; + int max_errors = 0; + int tgtdev_indexes = 0; + struct btrfs_bio *bbio = NULL; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + int num_alloc_stripes; + int patch_the_first_stripe_for_dev_replace = 0; + u64 physical_to_patch_in_first_stripe = 0; + u64 raid56_full_stripe_start = (u64)-1; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(fs_info, "unable to find logical %llu len %llu", + logical, *length); + return -EINVAL; + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " + "found %Lu-%Lu", logical, em->start, + em->start + em->len); + free_extent_map(em); + return -EINVAL; + } + + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + + stripe_len = map->stripe_len; + stripe_nr = offset; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + stripe_nr = div64_u64(stripe_nr, stripe_len); + + stripe_offset = stripe_nr * stripe_len; + BUG_ON(offset < stripe_offset); + + /* stripe_offset is the offset of this block in its stripe*/ + stripe_offset = offset - stripe_offset; + + /* if we're here for raid56, we need to know the stripe aligned start */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + raid56_full_stripe_start = offset; + + /* allow a write of a full stripe, but make sure we don't + * allow straddling of stripes + */ + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + } + + if (rw & REQ_DISCARD) { + /* we don't discard raid56 yet */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out; + } + *length = min_t(u64, em->len - offset, *length); + } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len; + /* For writes to RAID[56], allow a full stripeset across all disks. + For other RAID types and for RAID[56] reads, just allow a single + stripe (on a single disk). */ + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + (rw & REQ_WRITE)) { + max_len = stripe_len * nr_data_stripes(map) - + (offset - raid56_full_stripe_start); + } else { + /* we limit the length of each bio to what fits in a stripe */ + max_len = stripe_len - stripe_offset; + } + *length = min_t(u64, em->len - offset, max_len); + } else { + *length = em->len - offset; + } + + /* This is for when we're called from btrfs_merge_bio_hook() and all + it cares about is the length */ + if (!bbio_ret) + goto out; + + btrfs_dev_replace_lock(dev_replace); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (!dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); + + if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && + !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && + dev_replace->tgtdev != NULL) { + /* + * in dev-replace case, for repair case (that's the only + * case where the mirror is selected explicitly when + * calling btrfs_map_block), blocks left of the left cursor + * can also be read from the target drive. + * For REQ_GET_READ_MIRRORS, the target drive is added as + * the last one to the array of stripes. For READ, it also + * needs to be supported using the same mirror number. + * If the requested block is not left of the left cursor, + * EIO is returned. This can happen because btrfs_num_copies() + * returns one more in the dev-replace case. + */ + u64 tmp_length = *length; + struct btrfs_bio *tmp_bbio = NULL; + int tmp_num_stripes; + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, + logical, &tmp_length, &tmp_bbio, 0, 0); + if (ret) { + WARN_ON(tmp_bbio != NULL); + goto out; + } + + tmp_num_stripes = tmp_bbio->num_stripes; + if (mirror_num > tmp_num_stripes) { + /* + * REQ_GET_READ_MIRRORS does not contain this + * mirror, that means that the requested area + * is not left of the left cursor + */ + ret = -EIO; + btrfs_put_bbio(tmp_bbio); + goto out; + } + + /* + * process the rest of the function using the mirror_num + * of the source drive. Therefore look it up first. + * At the end, patch the device pointer to the one of the + * target drive. + */ + for (i = 0; i < tmp_num_stripes; i++) { + if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + tmp_bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = + tmp_bbio->stripes[i].physical; + } + } + + if (found) { + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; + } else { + WARN_ON(1); + ret = -EIO; + btrfs_put_bbio(tmp_bbio); + goto out; + } + + btrfs_put_bbio(tmp_bbio); + } else if (mirror_num > map->num_stripes) { + mirror_num = 0; + } + + num_stripes = 1; + stripe_index = 0; + stripe_nr_orig = stripe_nr; + stripe_nr_end = ALIGN(offset + *length, map->stripe_len); + stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + *length); + + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->num_stripes, + stripe_nr_end - stripe_nr_orig); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); + if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) + mirror_num = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + else { + stripe_index = find_live_mirror(fs_info, map, 0, + map->num_stripes, + current->pid % map->num_stripes, + dev_replace_is_ongoing); + mirror_num = stripe_index + 1; + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { + num_stripes = map->num_stripes; + } else if (mirror_num) { + stripe_index = mirror_num - 1; + } else { + mirror_num = 1; + } + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u32 factor = map->num_stripes / map->sub_stripes; + + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index *= map->sub_stripes; + + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + num_stripes = map->sub_stripes; + else if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->sub_stripes * + (stripe_nr_end - stripe_nr_orig), + map->num_stripes); + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + int old_stripe_index = stripe_index; + stripe_index = find_live_mirror(fs_info, map, + stripe_index, + map->sub_stripes, stripe_index + + current->pid % map->sub_stripes, + dev_replace_is_ongoing); + mirror_num = stripe_index - old_stripe_index + 1; + } + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + if (need_raid_map && + ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + mirror_num > 1)) { + /* push stripe_nr back to the start of the full stripe */ + stripe_nr = div_u64(raid56_full_stripe_start, + stripe_len * nr_data_stripes(map)); + + /* RAID[56] write or recovery. Return all stripes */ + num_stripes = map->num_stripes; + max_errors = nr_parity_stripes(map); + + *length = map->stripe_len; + stripe_index = 0; + stripe_offset = 0; + } else { + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ + stripe_nr = div_u64_rem(stripe_nr, + nr_data_stripes(map), &stripe_index); + if (mirror_num > 1) + stripe_index = nr_data_stripes(map) + + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + div_u64_rem(stripe_nr + stripe_index, map->num_stripes, + &stripe_index); + if (!(rw & (REQ_WRITE | REQ_DISCARD | + REQ_GET_READ_MIRRORS)) && mirror_num <= 1) + mirror_num = 1; + } + } else { + /* + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); + mirror_num = stripe_index + 1; + } + BUG_ON(stripe_index >= map->num_stripes); + + num_alloc_stripes = num_stripes; + if (dev_replace_is_ongoing) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_alloc_stripes <<= 1; + if (rw & REQ_GET_READ_MIRRORS) + num_alloc_stripes++; + tgtdev_indexes = num_stripes; + } + + bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + if (dev_replace_is_ongoing) + bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); + + /* build raid_map */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && + need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + mirror_num > 1)) { + u64 tmp; + unsigned rot; + + bbio->raid_map = (u64 *)((void *)bbio->stripes + + sizeof(struct btrfs_bio_stripe) * + num_alloc_stripes + + sizeof(int) * tgtdev_indexes); + + /* Work out the disk rotation on this stripe-set */ + div_u64_rem(stripe_nr, num_stripes, &rot); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + bbio->raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + bbio->raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + } + + if (rw & REQ_DISCARD) { + u32 factor = 0; + u32 sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + + if (map->type & + (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + stripes_per_dev = div_u64_rem(stripe_nr_end - + stripe_nr_orig, + factor, + &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; + } + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ + if (i < sub_stripes) + bbio->stripes[i].length -= + stripe_offset; + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) + bbio->stripes[i].length -= + stripe_end_offset; + + if (i == sub_stripes - 1) + stripe_offset = 0; + } else + bbio->stripes[i].length = *length; + + stripe_index++; + if (stripe_index == map->num_stripes) { + /* This could only happen for RAID0/10 */ + stripe_index = 0; + stripe_nr++; + } + } + } else { + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; + } + } + + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + max_errors = btrfs_chunk_max_errors(map); + + if (bbio->raid_map) + sort_parity_stripes(bbio, num_stripes); + + tgtdev_indexes = 0; + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + dev_replace->tgtdev != NULL) { + int index_where_to_add; + u64 srcdev_devid = dev_replace->srcdev->devid; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk + * to the new disk takes place at run time while the + * filesystem is mounted writable, the regular write + * operations to the old disk have to be duplicated to go + * to the new disk as well. + * Note that device->missing is handled by the caller, and + * that the write to the old disk is already set up in the + * stripes array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + bbio->tgtdev_map[i] = index_where_to_add; + index_where_to_add++; + max_errors++; + tgtdev_indexes++; + } + } + num_stripes = index_where_to_add; + } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + dev_replace->tgtdev != NULL) { + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can + * also be used to read data in case it is needed to repair + * a corrupt block elsewhere. This is possible if the + * requested area is left of the left cursor. In this area, + * the target drive is a full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + if (physical_of_found + map->stripe_len <= + dev_replace->cursor_left) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; + + tgtdev_indexes++; + num_stripes++; + } + } + } + + *bbio_ret = bbio; + bbio->map_type = map->type; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; + bbio->num_tgtdevs = tgtdev_indexes; + + /* + * this is the case that REQ_READ && dev_replace_is_ongoing && + * mirror_num == num_stripes + 1 && dev_replace target drive is + * available as a mirror + */ + if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { + WARN_ON(num_stripes > 1); + bbio->stripes[0].dev = dev_replace->tgtdev; + bbio->stripes[0].physical = physical_to_patch_in_first_stripe; + bbio->mirror_num = map->num_stripes + 1; + } +out: + if (dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); + free_extent_map(em); + return ret; +} + +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, int mirror_num) +{ + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + mirror_num, 0); +} + +/* For Scrub/replace */ +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, int mirror_num, + int need_raid_map) +{ + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + mirror_num, need_raid_map); +} + +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map_tree *em_tree = &map_tree->map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 length; + u64 stripe_nr; + u64 rmap_len; + int i, j, nr = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); + read_unlock(&em_tree->lock); + + if (!em) { + printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } + map = (struct map_lookup *)em->bdev; + + length = em->len; + rmap_len = map->stripe_len; + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + length = div_u64(length, map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + length = div_u64(length, map->num_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + length = div_u64(length, nr_data_stripes(map)); + rmap_len = map->stripe_len * nr_data_stripes(map); + } + + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); + BUG_ON(!buf); /* -ENOMEM */ + + for (i = 0; i < map->num_stripes; i++) { + if (devid && map->stripes[i].dev->devid != devid) + continue; + if (map->stripes[i].physical > physical || + map->stripes[i].physical + length <= physical) + continue; + + stripe_nr = physical - map->stripes[i].physical; + stripe_nr = div_u64(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + stripe_nr = div_u64(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } /* else if RAID[56], multiply by nr_data_stripes(). + * Alternatively, just use rmap_len below instead of + * map->stripe_len */ + + bytenr = chunk_start + stripe_nr * rmap_len; + WARN_ON(nr >= map->num_stripes); + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) + break; + } + if (j == nr) { + WARN_ON(nr >= map->num_stripes); + buf[nr++] = bytenr; + } + } + + *logical = buf; + *naddrs = nr; + *stripe_len = rmap_len; + + free_extent_map(em); + return 0; +} + +static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) +{ + if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) + bio_endio_nodec(bio, err); + else + bio_endio(bio, err); + btrfs_put_bbio(bbio); +} + +static void btrfs_end_bio(struct bio *bio, int err) +{ + struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_device *dev = bbio->stripes[0].dev; + int is_orig_bio = 0; + + if (err) { + atomic_inc(&bbio->error); + if (err == -EIO || err == -EREMOTEIO) { + unsigned int stripe_index = + btrfs_io_bio(bio)->stripe_index; + + BUG_ON(stripe_index >= bbio->num_stripes); + dev = bbio->stripes[stripe_index].dev; + if (dev->bdev) { + if (bio->bi_rw & WRITE) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_WRITE_ERRS); + else + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_READ_ERRS); + if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_print_on_error(dev); + } + } + } + + if (bio == bbio->orig_bio) + is_orig_bio = 1; + + btrfs_bio_counter_dec(bbio->fs_info); + + if (atomic_dec_and_test(&bbio->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = bbio->orig_bio; + } + + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + /* only send an error to the higher layers if it is + * beyond the tolerance of the btrfs bio + */ + if (atomic_read(&bbio->error) > bbio->max_errors) { + err = -EIO; + } else { + /* + * this bio is actually up to date, we didn't + * go over the max number of errors + */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + err = 0; + } + + btrfs_end_bbio(bbio, bio, err); + } else if (!is_orig_bio) { + bio_put(bio); + } +} + +/* + * see run_scheduled_bios for a description of why bios are collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. + */ +static noinline void btrfs_schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + struct btrfs_pending_bios *pending_bios; + + if (device->missing || !device->bdev) { + bio_endio(bio, -EIO); + return; + } + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & REQ_WRITE)) { + bio_get(bio); + btrfsic_submit_bio(rw, bio); + bio_put(bio); + return; + } + + /* + * nr_async_bios allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_bios); + WARN_ON(bio->bi_next); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + if (bio->bi_rw & REQ_SYNC) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; + + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; + + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_work(root->fs_info->submit_workers, + &device->work); +} + +static int bio_size_ok(struct block_device *bdev, struct bio *bio, + sector_t sector) +{ + struct bio_vec *prev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_sectors = queue_max_sectors(q); + struct bvec_merge_data bvm = { + .bi_bdev = bdev, + .bi_sector = sector, + .bi_rw = bio->bi_rw, + }; + + if (WARN_ON(bio->bi_vcnt == 0)) + return 1; + + prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (bio_sectors(bio) > max_sectors) + return 0; + + if (!q->merge_bvec_fn) + return 1; + + bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) + return 0; + return 1; +} + +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *bio, u64 physical, int dev_nr, + int rw, int async) +{ + struct btrfs_device *dev = bbio->stripes[dev_nr].dev; + + bio->bi_private = bbio; + btrfs_io_bio(bio)->stripe_index = dev_nr; + bio->bi_end_io = btrfs_end_bio; + bio->bi_iter.bi_sector = physical >> 9; +#ifdef DEBUG + { + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, + name->str, dev->devid, bio->bi_iter.bi_size); + rcu_read_unlock(); + } +#endif + bio->bi_bdev = dev->bdev; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + + if (async) + btrfs_schedule_bio(root, dev, rw, bio); + else + btrfsic_submit_bio(rw, bio); +} + +static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *first_bio, struct btrfs_device *dev, + int dev_nr, int rw, int async) +{ + struct bio_vec *bvec = first_bio->bi_io_vec; + struct bio *bio; + int nr_vecs = bio_get_nr_vecs(dev->bdev); + u64 physical = bbio->stripes[dev_nr].physical; + +again: + bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); + if (!bio) + return -ENOMEM; + + while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { + if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len) { + u64 len = bio->bi_iter.bi_size; + + atomic_inc(&bbio->stripes_pending); + submit_stripe_bio(root, bbio, bio, physical, dev_nr, + rw, async); + physical += len; + goto again; + } + bvec++; + } + + submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); + return 0; +} + +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +{ + atomic_inc(&bbio->error); + if (atomic_dec_and_test(&bbio->stripes_pending)) { + /* Shoud be the original bio. */ + WARN_ON(bio != bbio->orig_bio); + + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + bio->bi_iter.bi_sector = logical >> 9; + + btrfs_end_bbio(bbio, bio, -EIO); + } +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit) +{ + struct btrfs_device *dev; + struct bio *first_bio = bio; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + int dev_nr; + int total_devs; + struct btrfs_bio *bbio = NULL; + + length = bio->bi_iter.bi_size; + map_length = length; + + btrfs_bio_counter_inc_blocked(root->fs_info); + ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, + mirror_num, 1); + if (ret) { + btrfs_bio_counter_dec(root->fs_info); + return ret; + } + + total_devs = bbio->num_stripes; + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + bbio->fs_info = root->fs_info; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); + + if (bbio->raid_map) { + /* In this case, map_length has been set to the length of + a single stripe; not the whole write */ + if (rw & WRITE) { + ret = raid56_parity_write(root, bio, bbio, map_length); + } else { + ret = raid56_parity_recover(root, bio, bbio, map_length, + mirror_num, 1); + } + + btrfs_bio_counter_dec(root->fs_info); + return ret; + } + + if (map_length < length) { + btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", + logical, length, map_length); + BUG(); + } + + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { + dev = bbio->stripes[dev_nr].dev; + if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + bbio_error(bbio, first_bio, logical); + continue; + } + + /* + * Check and see if we're ok with this bio based on it's size + * and offset with the given device. + */ + if (!bio_size_ok(dev->bdev, first_bio, + bbio->stripes[dev_nr].physical >> 9)) { + ret = breakup_stripe_bio(root, bbio, first_bio, dev, + dev_nr, rw, async_submit); + BUG_ON(ret); + continue; + } + + if (dev_nr < total_devs - 1) { + bio = btrfs_bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); /* -ENOMEM */ + } else { + bio = first_bio; + bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; + } + + submit_stripe_bio(root, bbio, bio, + bbio->stripes[dev_nr].physical, dev_nr, rw, + async_submit); + } + btrfs_bio_counter_dec(root->fs_info); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, + u8 *uuid, u8 *fsid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + + cur_devices = fs_info->fs_devices; + while (cur_devices) { + if (!fsid || + !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + device = __find_device(&cur_devices->devices, + devid, uuid); + if (device) + return device; + } + cur_devices = cur_devices->seed; + } + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + + device = btrfs_alloc_device(NULL, &devid, dev_uuid); + if (IS_ERR(device)) + return NULL; + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + + device->missing = 1; + fs_devices->missing_devices++; + + return device; +} + +/** + * btrfs_alloc_device - allocate struct btrfs_device + * @fs_info: used only for generating a new devid, can be NULL if + * devid is provided (i.e. @devid != NULL). + * @devid: a pointer to devid for this device. If NULL a new devid + * is generated. + * @uuid: a pointer to UUID for this device. If NULL a new UUID + * is generated. + * + * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() + * on error. Returned struct is not linked onto any lists and can be + * destroyed with kfree() right away. + */ +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, + const u8 *uuid) +{ + struct btrfs_device *dev; + u64 tmp; + + if (WARN_ON(!devid && !fs_info)) + return ERR_PTR(-EINVAL); + + dev = __alloc_device(); + if (IS_ERR(dev)) + return dev; + + if (devid) + tmp = *devid; + else { + int ret; + + ret = find_next_devid(fs_info, &tmp); + if (ret) { + kfree(dev); + return ERR_PTR(ret); + } + } + dev->devid = tmp; + + if (uuid) + memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); + else + generate_random_uuid(dev->uuid); + + btrfs_init_work(&dev->work, btrfs_submit_helper, + pending_bios_fn, NULL, NULL); + + return dev; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + read_unlock(&map_tree->map_tree.lock); + + /* already mapped? */ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + em = alloc_extent_map(); + if (!em) + return -ENOMEM; + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->orig_start = 0; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->sector_size = btrfs_chunk_sector_size(leaf, chunk); + map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = btrfs_chunk_type(leaf, chunk); + map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, + uuid, NULL); + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + free_extent_map(em); + return -EIO; + } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, root->fs_info->fs_devices, + devid, uuid); + if (!map->stripes[i].dev) { + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; + } + + write_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em, 0); + write_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); /* Tree corruption */ + free_extent_map(em); + + return 0; +} + +static void fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; + device->commit_total_bytes = device->disk_total_bytes; + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->commit_bytes_used = device->bytes_used; + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); + device->is_tgtdev_for_dev_replace = 0; + + ptr = btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); +} + +static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, + u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) + return fs_devices; + + fs_devices = fs_devices->seed; + } + + fs_devices = find_fsid(fsid); + if (!fs_devices) { + if (!btrfs_test_opt(root, DEGRADED)) + return ERR_PTR(-ENOENT); + + fs_devices = alloc_fs_devices(fsid); + if (IS_ERR(fs_devices)) + return fs_devices; + + fs_devices->seeding = 1; + fs_devices->opened = 1; + return fs_devices; + } + + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) + return fs_devices; + + ret = __btrfs_open_devices(fs_devices, FMODE_READ, + root->fs_info->bdev_holder); + if (ret) { + free_fs_devices(fs_devices); + fs_devices = ERR_PTR(ret); + goto out; + } + + if (!fs_devices->seeding) { + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + fs_devices = ERR_PTR(-EINVAL); + goto out; + } + + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; +out: + return fs_devices; +} + +static int read_one_dev(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + + if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { + fs_devices = open_seed_devices(root, fs_uuid); + if (IS_ERR(fs_devices)) + return PTR_ERR(fs_devices); + } + + device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); + if (!device) { + if (!btrfs_test_opt(root, DEGRADED)) + return -EIO; + + btrfs_warn(root->fs_info, "devid %llu missing", devid); + device = add_missing_dev(root, fs_devices, devid, dev_uuid); + if (!device) + return -ENOMEM; + } else { + if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if(!device->bdev && !device->missing) { + /* + * this happens when a device that was properly setup + * in the device info lists suddenly goes bad. + * device->bdev is NULL, and so we have to set + * device->missing to one here + */ + device->fs_devices->missing_devices++; + device->missing = 1; + } + + /* Move the device to its own fs_devices */ + if (device->fs_devices != fs_devices) { + ASSERT(device->missing); + + list_move(&device->dev_list, &fs_devices->devices); + device->fs_devices->num_devices--; + fs_devices->num_devices++; + + device->fs_devices->missing_devices--; + fs_devices->missing_devices++; + + device->fs_devices = fs_devices; + } + } + + if (device->fs_devices != root->fs_info->fs_devices) { + BUG_ON(device->writeable); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + device->in_fs_metadata = 1; + if (device->writeable && !device->is_tgtdev_for_dev_replace) { + device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } + ret = 0; + return ret; +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = root->fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *array_ptr; + unsigned long sb_array_offset; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur_offset; + struct btrfs_key key; + + ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); + /* + * This will create extent buffer of nodesize, superblock size is + * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will + * overallocate but we can keep it as-is, only the first page is used. + */ + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); + /* + * The sb extent buffer is artifical and just used to read the system array. + * btrfs_set_buffer_uptodate() call does not properly mark all it's + * pages up-to-date when the page is larger: extent does not cover the + * whole page and consequently check_page_uptodate does not find all + * the page's extents up-to-date (the hole beyond sb), + * write_extent_buffer then triggers a WARN_ON. + * + * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, + * but sb spans only this function. Add an explicit SetPageUptodate call + * to silence the warning eg. on PowerPC 64. + */ + if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + SetPageUptodate(sb->pages[0]); + + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + array_ptr = super_copy->sys_chunk_array; + sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); + cur_offset = 0; + + while (cur_offset < array_size) { + disk_key = (struct btrfs_disk_key *)array_ptr; + len = sizeof(*disk_key); + if (cur_offset + len > array_size) + goto out_short_read; + + btrfs_disk_key_to_cpu(&key, disk_key); + + array_ptr += len; + sb_array_offset += len; + cur_offset += len; + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be + * present, exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; + + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + + ret = read_one_chunk(root, &key, sb, chunk); + if (ret) + break; + } else { + ret = -EIO; + break; + } + array_ptr += len; + sb_array_offset += len; + cur_offset += len; + } + free_extent_buffer(sb); + return ret; + +out_short_read: + printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", + len, cur_offset); + free_extent_buffer(sb); + return -EIO; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + mutex_lock(&uuid_mutex); + lock_chunks(root); + + /* + * Read all device items, and then all the chunk items. All + * device items are found before any chunk item (their object id + * is smaller than the lowest possible object id for a chunk + * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + if (ret) + goto error; + } + path->slots[0]++; + } + ret = 0; +error: + unlock_chunks(root); + mutex_unlock(&uuid_mutex); + + btrfs_free_path(path); + return ret; +} + +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + while (fs_devices) { + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); + + fs_devices = fs_devices->seed; + } +} + +static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +{ + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_reset(dev, i); +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_device *device; + struct btrfs_path *path = NULL; + int i; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + int item_size; + struct btrfs_dev_stats_item *ptr; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { + __btrfs_reset_dev_stats(device); + device->dev_stats_valid = 1; + btrfs_release_path(path); + continue; + } + slot = path->slots[0]; + eb = path->nodes[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, + struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_reset(device, i); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + } + mutex_unlock(&fs_devices->device_list_mutex); + +out: + btrfs_free_path(path); + return ret < 0 ? ret : 0; +} + +static int update_dev_stat_item(struct btrfs_trans_handle *trans, + struct btrfs_root *dev_root, + struct btrfs_device *device) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_stats_item *ptr; + int ret; + int i; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + printk_in_rcu(KERN_WARNING "BTRFS: " + "error %d while searching for dev_stats item for device %s!\n", + ret, rcu_str_deref(device->name)); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + printk_in_rcu(KERN_WARNING "BTRFS: " + "delete too small dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + printk_in_rcu(KERN_WARNING "BTRFS: " + "insert dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_set_dev_stats_value(eb, ptr, i, + btrfs_dev_stat_read(device, i)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes all changed device stats to disk. + */ +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int stats_cnt; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) + continue; + + stats_cnt = atomic_read(&device->dev_stats_ccnt); + ret = update_dev_stat_item(trans, dev_root, device); + if (!ret) + atomic_sub(stats_cnt, &device->dev_stats_ccnt); + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) +{ + btrfs_dev_stat_inc(dev, index); + btrfs_dev_stat_print_on_error(dev); +} + +static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) +{ + if (!dev->dev_stats_valid) + return; + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + rcu_str_deref(dev->name), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + +static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) +{ + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (btrfs_dev_stat_read(dev, i) != 0) + break; + if (i == BTRFS_DEV_STAT_VALUES_MAX) + return; /* all values == 0, suppress message */ + + printk_in_rcu(KERN_INFO "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + rcu_str_deref(dev->name), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats) +{ + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int i; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + btrfs_warn(root->fs_info, "get dev_stats failed, device not found"); + return -ENODEV; + } else if (!dev->dev_stats_valid) { + btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid"); + return -ENODEV; + } else if (stats->flags & BTRFS_DEV_STATS_RESET) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (stats->nr_items > i) + stats->values[i] = + btrfs_dev_stat_read_and_reset(dev, i); + else + btrfs_dev_stat_reset(dev, i); + } + } else { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (stats->nr_items > i) + stats->values[i] = btrfs_dev_stat_read(dev, i); + } + if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) + stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; + return 0; +} + +int btrfs_scratch_superblock(struct btrfs_device *device) +{ + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + + bh = btrfs_read_dev_super(device->bdev); + if (!bh) + return -EINVAL; + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} + +/* + * Update the size of all devices, which is used for writing out the + * super blocks. + */ +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *curr, *next; + + if (list_empty(&fs_devices->resized_devices)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + lock_chunks(fs_info->dev_root); + list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, + resized_list) { + list_del_init(&curr->resized_list); + curr->commit_total_bytes = curr->disk_total_bytes; + } + unlock_chunks(fs_info->dev_root); + mutex_unlock(&fs_devices->device_list_mutex); +} + +/* Must be invoked during the transaction commit */ +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *dev; + int i; + + if (list_empty(&transaction->pending_chunks)) + return; + + /* In order to kick the device replace finish process */ + lock_chunks(root); + list_for_each_entry(em, &transaction->pending_chunks, list) { + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + dev = map->stripes[i].dev; + dev->commit_bytes_used = dev->bytes_used; + } + } + unlock_chunks(root); +} diff --git a/kernel/fs/btrfs/volumes.h b/kernel/fs/btrfs/volumes.h new file mode 100644 index 000000000..ebc31331a --- /dev/null +++ b/kernel/fs/btrfs/volumes.h @@ -0,0 +1,541 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ + +#include +#include +#include +#include "async-thread.h" + +extern struct mutex uuid_mutex; + +#define BTRFS_STRIPE_LEN (64 * 1024) + +struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + +/* + * Use sequence counter to get consistent device stat data on + * 32-bit processors. + */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#include +#define __BTRFS_NEED_DEVICE_DATA_ORDERED +#define btrfs_device_data_ordered_init(device) \ + seqcount_init(&device->data_seqcount) +#else +#define btrfs_device_data_ordered_init(device) do { } while (0) +#endif + +struct btrfs_device { + struct list_head dev_list; + struct list_head dev_alloc_list; + struct btrfs_fs_devices *fs_devices; + + struct btrfs_root *dev_root; + + struct rcu_string *name; + + u64 generation; + + spinlock_t io_lock ____cacheline_aligned; + int running_pending; + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + + struct block_device *bdev; + + /* the mode sent to blkdev_get */ + fmode_t mode; + + int writeable; + int in_fs_metadata; + int missing; + int can_discard; + int is_tgtdev_for_dev_replace; + +#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED + seqcount_t data_seqcount; +#endif + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device in memory */ + u64 total_bytes; + + /* size of the device on disk */ + u64 disk_total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + /* type and info about this device */ + u64 type; + + /* minimal io size for this device */ + u32 sector_size; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* + * size of the device on the current transaction + * + * This variant is update when committing the transaction, + * and protected by device_list_mutex + */ + u64 commit_total_bytes; + + /* bytes used on the current transaction */ + u64 commit_bytes_used; + /* + * used to manage the device which is resized + * + * It is protected by chunk_lock. + */ + struct list_head resized_list; + + /* for sending down flush barriers */ + int nobarriers; + struct bio *flush_bio; + struct completion flush_wait; + + /* per-device scrub information */ + struct scrub_ctx *scrub_device; + + struct btrfs_work work; + struct rcu_head rcu; + struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone *reada_curr_zone; + struct radix_tree_root reada_zones; + struct radix_tree_root reada_extents; + + /* disk I/O failure stats. For detailed description refer to + * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_valid; + + /* Counter to record the change of device stats */ + atomic_t dev_stats_ccnt; + atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; +}; + +/* + * If we read those variants at the context of their own lock, we needn't + * use the following helpers, reading them directly is safe. + */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + unsigned int seq; \ + \ + do { \ + seq = read_seqcount_begin(&dev->data_seqcount); \ + size = dev->name; \ + } while (read_seqcount_retry(&dev->data_seqcount, seq)); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + write_seqcount_begin(&dev->data_seqcount); \ + dev->name = size; \ + write_seqcount_end(&dev->data_seqcount); \ + preempt_enable(); \ +} +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + \ + preempt_disable(); \ + size = dev->name; \ + preempt_enable(); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + dev->name = size; \ + preempt_enable(); \ +} +#else +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + return dev->name; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + dev->name = size; \ +} +#endif + +BTRFS_DEVICE_GETSET_FUNCS(total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(bytes_used); + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + u64 num_devices; + u64 open_devices; + u64 rw_devices; + u64 missing_devices; + u64 total_rw_bytes; + u64 total_devices; + struct block_device *latest_bdev; + + /* all of the devices in the FS, protected by a mutex + * so we can safely walk it to write out the supers without + * worrying about add/remove by the multi-device code. + * Scrubbing super can kick off supers writing by holding + * this mutex lock. + */ + struct mutex device_list_mutex; + struct list_head devices; + + struct list_head resized_devices; + /* devices not currently being allocated */ + struct list_head alloc_list; + struct list_head list; + + struct btrfs_fs_devices *seed; + int seeding; + + int opened; + + /* set when we find or add a device that doesn't have the + * nonrot flag set + */ + int rotating; +}; + +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + +/* + * we need the mirror number and stripe index to be passed around + * the call chain while we are processing end_io (especially errors). + * Really, what we need is a btrfs_bio structure that has this info + * and is properly sized with its stripe array, but we're not there + * quite yet. We have our own btrfs bioset, and all of the bios + * we allocate are actually btrfs_io_bios. We'll cram as much of + * struct btrfs_bio as we can into this over time. + */ +typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); +struct btrfs_io_bio { + unsigned int mirror_num; + unsigned int stripe_index; + u64 logical; + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + u8 *csum_allocated; + btrfs_io_bio_end_io_t *end_io; + struct bio bio; +}; + +static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_io_bio, bio); +} + +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; + u64 length; /* only used for discard mappings */ +}; + +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) + +struct btrfs_bio { + atomic_t refs; + atomic_t stripes_pending; + struct btrfs_fs_info *fs_info; + u64 map_type; /* get from map_lookup->type */ + bio_end_io_t *end_io; + struct bio *orig_bio; + unsigned long flags; + void *private; + atomic_t error; + int max_errors; + int num_stripes; + int mirror_num; + int num_tgtdevs; + int *tgtdev_map; + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; + struct btrfs_bio_stripe stripes[]; +}; + +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; + u64 total_avail; +}; + +struct btrfs_raid_attr { + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ +}; + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +/* + * Restriper's general type filter + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) + +/* + * Balance filters + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) + +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) + +struct btrfs_balance_args; +struct btrfs_balance_progress; +struct btrfs_balance_control { + struct btrfs_fs_info *fs_info; + + struct btrfs_balance_args data; + struct btrfs_balance_args meta; + struct btrfs_balance_args sys; + + u64 flags; + + struct btrfs_balance_progress stat; +}; + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length); +void btrfs_get_bbio(struct btrfs_bio *bbio); +void btrfs_put_bbio(struct btrfs_bio *bbio); +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, int mirror_num); +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, int mirror_num, + int need_raid_map); +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder); +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device); +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, + const u8 *uuid); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); +void btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, + u8 *uuid, u8 *fsid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device *srcdev, + struct btrfs_device **device_out); +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs); +int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); +int btrfs_recover_balance(struct btrfs_fs_info *fs_info); +int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail); +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats); +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); +int btrfs_scratch_superblock(struct btrfs_device *device); +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num); +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical); +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + u64 chunk_offset, u64 chunk_size); +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_offset); + +static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) +{ + return atomic_read(&dev->dev_stats_ccnt); +} + +static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, + int index) +{ + atomic_inc(dev->dev_stat_values + index); + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); +} + +static inline int btrfs_dev_stat_read(struct btrfs_device *dev, + int index) +{ + return atomic_read(dev->dev_stat_values + index); +} + +static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, + int index) +{ + int ret; + + ret = atomic_xchg(dev->dev_stat_values + index, 0); + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); + return ret; +} + +static inline void btrfs_dev_stat_set(struct btrfs_device *dev, + int index, unsigned long val) +{ + atomic_set(dev->dev_stat_values + index, val); + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); +} + +static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, + int index) +{ + btrfs_dev_stat_set(dev, index, 0); +} + +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction); + +static inline void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static inline void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + + +#endif diff --git a/kernel/fs/btrfs/xattr.c b/kernel/fs/btrfs/xattr.c new file mode 100644 index 000000000..6f518c90e --- /dev/null +++ b/kernel/fs/btrfs/xattr.c @@ -0,0 +1,517 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" +#include "props.h" +#include "locking.h" + + +ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, + strlen(name), 0); + if (!di) { + ret = -ENODATA; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + + /* + * The way things are packed into the leaf is like this + * |struct btrfs_dir_item|name|data| + * where name is the xattr name, so security.foo, and data is the + * content of the xattr. data_ptr points to the location in memory + * where the data starts in the in memory leaf + */ + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +static int do_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + size_t name_len = strlen(name); + int ret = 0; + + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) + return -ENOSPC; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->skip_release_on_error = 1; + + if (!value) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (!di && (flags & XATTR_REPLACE)) + ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); + else if (di) + ret = btrfs_delete_one_dir_name(trans, root, path, di); + goto out; + } + + /* + * For a replace we can't just do the insert blindly. + * Do a lookup first (read-only btrfs_search_slot), and return if xattr + * doesn't exist. If it exists, fall down below to the insert/replace + * path - we can't race with a concurrent xattr delete, because the VFS + * locks the inode's i_mutex before calling setxattr or removexattr. + */ + if (flags & XATTR_REPLACE) { + ASSERT(mutex_is_locked(&inode->i_mutex)); + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), + name, name_len, 0); + if (!di) + ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); + if (ret) + goto out; + btrfs_release_path(path); + di = NULL; + } + + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + if (ret == -EOVERFLOW) { + /* + * We have an existing item in a leaf, split_leaf couldn't + * expand it. That item might have or not a dir_item that + * matches our target xattr, so lets check. + */ + ret = 0; + btrfs_assert_tree_locked(path->nodes[0]); + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (!di && !(flags & XATTR_REPLACE)) { + ret = -ENOSPC; + goto out; + } + } else if (ret == -EEXIST) { + ret = 0; + di = btrfs_match_dir_item_name(root, path, name, name_len); + ASSERT(di); /* logic error */ + } else if (ret) { + goto out; + } + + if (di && (flags & XATTR_CREATE)) { + ret = -EEXIST; + goto out; + } + + if (di) { + /* + * We're doing a replace, and it must be atomic, that is, at + * any point in time we have either the old or the new xattr + * value in the tree. We don't want readers (getxattr and + * listxattrs) to miss a value, this is specially important + * for ACLs. + */ + const int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + const u16 old_data_len = btrfs_dir_data_len(leaf, di); + const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 data_size = sizeof(*di) + name_len + size; + struct btrfs_item *item; + unsigned long data_ptr; + char *ptr; + + if (size > old_data_len) { + if (btrfs_leaf_free_space(root, leaf) < + (size - old_data_len)) { + ret = -ENOSPC; + goto out; + } + } + + if (old_data_len + name_len + sizeof(*di) == item_size) { + /* No other xattrs packed in the same leaf item. */ + if (size > old_data_len) + btrfs_extend_item(root, path, + size - old_data_len); + else if (size < old_data_len) + btrfs_truncate_item(root, path, data_size, 1); + } else { + /* There are other xattrs packed in the same item. */ + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_extend_item(root, path, data_size); + } + + item = btrfs_item_nr(slot); + ptr = btrfs_item_ptr(leaf, slot, char); + ptr += btrfs_item_size(leaf, item) - data_size; + di = (struct btrfs_dir_item *)ptr; + btrfs_set_dir_data_len(leaf, di, size); + data_ptr = ((unsigned long)(di + 1)) + name_len; + write_extent_buffer(leaf, value, data_ptr, size); + btrfs_mark_buffer_dirty(leaf); + } else { + /* + * Insert, and we had space for the xattr, so path->slots[0] is + * where our xattr dir_item is and btrfs_insert_xattr_item() + * filled it. + */ + } +out: + btrfs_free_path(path); + return ret; +} + +/* + * @value: "" makes the attribute to empty, NULL removes it + */ +int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (trans) + return do_setxattr(trans, inode, name, value, size, flags); + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = do_setxattr(trans, inode, name, value, size, flags); + if (ret) + goto out; + + inode_inc_iversion(inode); + inode->i_ctime = CURRENT_TIME; + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); +out: + btrfs_end_transaction(trans, root); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key key, found_key; + struct inode *inode = d_inode(dentry); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + int ret = 0, slot; + size_t total_size = 0, size_left = size; + unsigned long name_ptr; + size_t name_len; + + /* + * ok we want all objects associated with this id. + * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = btrfs_ino(inode); + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + /* search for our xattrs */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + /* this is where we start walking through the path */ + if (slot >= btrfs_header_nritems(leaf)) { + /* + * if we've reached the last slot in this leaf we need + * to go to the next leaf and reset everything + */ + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (found_key.type != BTRFS_XATTR_ITEM_KEY) + break; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) + goto next; + + name_len = btrfs_dir_name_len(leaf, di); + total_size += name_len + 1; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + goto next; + + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } + + name_ptr = (unsigned long)(di + 1); + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; +next: + path->slots[0]++; + } + ret = total_size; + +err: + btrfs_free_path(path); + + return ret; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +const struct xattr_handler *btrfs_xattr_handlers[] = { +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + +/* + * Check if the attribute is in a supported namespace. + * + * This is applied after the check for the synthetic attributes in the system + * namespace. + */ +static int btrfs_is_valid_xattr(const char *name) +{ + int len = strlen(name); + int prefixlen = 0; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) + prefixlen = XATTR_SECURITY_PREFIX_LEN; + else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + prefixlen = XATTR_SYSTEM_PREFIX_LEN; + else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + prefixlen = XATTR_TRUSTED_PREFIX_LEN; + else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + prefixlen = XATTR_USER_PREFIX_LEN; + else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + prefixlen = XATTR_BTRFS_PREFIX_LEN; + else + return -EOPNOTSUPP; + + /* + * The name cannot consist of just prefix + */ + if (len <= prefixlen) + return -EINVAL; + + return 0; +} + +ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + int ret; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; + return __btrfs_getxattr(d_inode(dentry), name, buffer, size); +} + +int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; + + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(d_inode(dentry), name, + value, size, flags); + + if (size == 0) + value = ""; /* empty EA, do not remove */ + + return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, + flags); +} + +int btrfs_removexattr(struct dentry *dentry, const char *name) +{ + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; + + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(d_inode(dentry), name, + NULL, 0, XATTR_REPLACE); + + return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, + XATTR_REPLACE); +} + +static int btrfs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) +{ + const struct xattr *xattr; + struct btrfs_trans_handle *trans = fs_info; + char *name; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = __btrfs_setxattr(trans, inode, name, + xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; + } + return err; +} + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &btrfs_initxattrs, trans); +} diff --git a/kernel/fs/btrfs/xattr.h b/kernel/fs/btrfs/xattr.h new file mode 100644 index 000000000..5049608d1 --- /dev/null +++ b/kernel/fs/btrfs/xattr.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include + +extern const struct xattr_handler *btrfs_xattr_handlers[]; + +extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags); +extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr); + +#endif /* __XATTR__ */ diff --git a/kernel/fs/btrfs/zlib.c b/kernel/fs/btrfs/zlib.c new file mode 100644 index 000000000..82990b8f8 --- /dev/null +++ b/kernel/fs/btrfs/zlib.c @@ -0,0 +1,412 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +struct workspace { + z_stream strm; + char *buf; + struct list_head list; +}; + +static void zlib_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->strm.workspace); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zlib_alloc_workspace(void) +{ + struct workspace *workspace; + int workspacesize; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + workspace->strm.workspace = vmalloc(workspacesize); + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->strm.workspace || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { + printk(KERN_WARNING "BTRFS: deflateInit failed\n"); + ret = -EIO; + goto out; + } + + workspace->strm.total_in = 0; + workspace->strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->strm.next_in = data_in; + workspace->strm.next_out = cpage_out; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE); + + while (workspace->strm.total_in < len) { + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->strm.total_in > 8192 && + workspace->strm.total_in < + workspace->strm.total_out) { + ret = -E2BIG; + goto out; + } + /* we need another page for writing out. Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->strm.avail_in == 0) { + if (workspace->strm.total_out > max_out) + break; + + bytes_left = len - workspace->strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->strm.next_in = data_in; + } + } + workspace->strm.avail_in = 0; + ret = zlib_deflate(&workspace->strm, Z_FINISH); + zlib_deflateEnd(&workspace->strm); + + if (ret != Z_STREAM_END) { + ret = -EIO; + goto out; + } + + if (workspace->strm.total_out >= workspace->strm.total_in) { + ret = -E2BIG; + goto out; + } + + ret = 0; + *total_out = workspace->strm.total_out; + *total_in = workspace->strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + return ret; +} + +static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + int wbits = MAX_WBITS; + char *data_in; + size_t total_out = 0; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); + unsigned long buf_start; + unsigned long pg_offset; + + data_in = kmap(pages_in[page_in_index]); + workspace->strm.next_in = data_in; + workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->strm.total_in = 0; + + workspace->strm.total_out = 0; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->strm.next_in += 2; + workspace->strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); + return -EIO; + } + while (workspace->strm.total_in < srclen) { + ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->strm.total_out; + + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) + break; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + total_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) { + ret = 0; + goto done; + } + + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->strm.next_in = data_in; + tmp = srclen - workspace->strm.total_in; + workspace->strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -EIO; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->strm); + if (data_in) + kunmap(pages_in[page_in_index]); + if (!ret) + btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); + return ret; +} + +static int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + int wbits = MAX_WBITS; + unsigned long bytes_left; + unsigned long total_out = 0; + unsigned long pg_offset = 0; + char *kaddr; + + destlen = min_t(unsigned long, destlen, PAGE_SIZE); + bytes_left = destlen; + + workspace->strm.next_in = data_in; + workspace->strm.avail_in = srclen; + workspace->strm.total_in = 0; + + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->strm.next_in += 2; + workspace->strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); + return -EIO; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->strm.total_out; + + if (total_out == buf_start) { + ret = -EIO; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -EIO; + else + ret = 0; + + zlib_inflateEnd(&workspace->strm); + + /* + * this should only happen if zlib returned fewer bytes than we + * expected. btrfs_get_block is responsible for zeroing from the + * end of the inline extent (destlen) to the end of the page + */ + if (pg_offset < destlen) { + kaddr = kmap_atomic(dest_page); + memset(kaddr + pg_offset, 0, destlen - pg_offset); + kunmap_atomic(kaddr); + } + return ret; +} + +const struct btrfs_compress_op btrfs_zlib_compress = { + .alloc_workspace = zlib_alloc_workspace, + .free_workspace = zlib_free_workspace, + .compress_pages = zlib_compress_pages, + .decompress_biovec = zlib_decompress_biovec, + .decompress = zlib_decompress, +}; diff --git a/kernel/fs/buffer.c b/kernel/fs/buffer.c new file mode 100644 index 000000000..2907544c3 --- /dev/null +++ b/kernel/fs/buffer.c @@ -0,0 +1,3422 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992, 2002 Linus Torvalds + */ + +/* + * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 + * + * Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + * + * Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM + * + * Added 32k buffer block sizes - these are required older ARM systems. - RMK + * + * async buffer flushing, 1999 Andrea Arcangeli + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) + +void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_end_io = handler; + bh->b_private = private; +} +EXPORT_SYMBOL(init_buffer); + +inline void touch_buffer(struct buffer_head *bh) +{ + trace_block_touch_buffer(bh); + mark_page_accessed(bh->b_page); +} +EXPORT_SYMBOL(touch_buffer); + +void __lock_buffer(struct buffer_head *bh) +{ + wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__lock_buffer); + +void unlock_buffer(struct buffer_head *bh) +{ + clear_bit_unlock(BH_Lock, &bh->b_state); + smp_mb__after_atomic(); + wake_up_bit(&bh->b_state, BH_Lock); +} +EXPORT_SYMBOL(unlock_buffer); + +/* + * Returns if the page has dirty or writeback buffers. If all the buffers + * are unlocked and clean then the PageDirty information is stale. If + * any of the pages are locked, it is assumed they are locked for IO. + */ +void buffer_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct buffer_head *head, *bh; + *dirty = false; + *writeback = false; + + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + return; + + if (PageWriteback(page)) + *writeback = true; + + head = page_buffers(page); + bh = head; + do { + if (buffer_locked(bh)) + *writeback = true; + + if (buffer_dirty(bh)) + *dirty = true; + + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(buffer_check_dirty_writeback); + +/* + * Block until a buffer comes unlocked. This doesn't stop it + * from becoming locked again - you have to lock it yourself + * if you want to preserve its state. + */ +void __wait_on_buffer(struct buffer_head * bh) +{ + wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__wait_on_buffer); + +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +} + +static void buffer_io_error(struct buffer_head *bh, char *msg) +{ + char b[BDEVNAME_SIZE]; + + if (!test_bit(BH_Quiet, &bh->b_state)) + printk_ratelimited(KERN_ERR + "Buffer I/O error on dev %s, logical block %llu%s\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr, msg); +} + +/* + * End-of-IO handler helper function which does not touch the bh after + * unlocking it. + * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but + * a race there is benign: unlock_buffer() only use the bh's address for + * hashing after unlocking the buffer, so it doesn't actually touch the bh + * itself. + */ +static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + /* This happens, due to failed READA attempts. */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); +} + +/* + * Default synchronous end-of-IO handler.. Just mark it up-to-date and + * unlock the buffer. This is what ll_rw_block uses too. + */ +void end_buffer_read_sync(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); + put_bh(bh); +} +EXPORT_SYMBOL(end_buffer_read_sync); + +void end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + buffer_io_error(bh, ", lost sync page write"); + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} +EXPORT_SYMBOL(end_buffer_write_sync); + +/* + * Various filesystems appear to want __find_get_block to be non-blocking. + * But it's the page lock which protects the buffers. To get around this, + * we get exclusion from try_to_free_buffers with the blockdev mapping's + * private_lock. + * + * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * may be quite high. This code could TryLock the page, and if that + * succeeds, there is no need to take private_lock. (But if + * private_lock is contended then so is mapping->tree_lock). + */ +static struct buffer_head * +__find_get_block_slow(struct block_device *bdev, sector_t block) +{ + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct buffer_head *ret = NULL; + pgoff_t index; + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int all_mapped = 1; + + index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); + page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); + if (!page) + goto out; + + spin_lock(&bd_mapping->private_lock); + if (!page_has_buffers(page)) + goto out_unlock; + head = page_buffers(page); + bh = head; + do { + if (!buffer_mapped(bh)) + all_mapped = 0; + else if (bh->b_blocknr == block) { + ret = bh; + get_bh(bh); + goto out_unlock; + } + bh = bh->b_this_page; + } while (bh != head); + + /* we might be here because some of the buffers on this page are + * not mapped. This is due to various races between + * file io on the block device and getblk. It gets dealt with + * elsewhere, don't buffer_error if we had some unmapped buffers + */ + if (all_mapped) { + char b[BDEVNAME_SIZE]; + + printk("__find_get_block_slow() failed. " + "block=%llu, b_blocknr=%llu\n", + (unsigned long long)block, + (unsigned long long)bh->b_blocknr); + printk("b_state=0x%08lx, b_size=%zu\n", + bh->b_state, bh->b_size); + printk("device %s blocksize: %d\n", bdevname(bdev, b), + 1 << bd_inode->i_blkbits); + } +out_unlock: + spin_unlock(&bd_mapping->private_lock); + page_cache_release(page); +out: + return ret; +} + +/* + * Kick the writeback threads then try to free up some ZONE_NORMAL memory. + */ +static void free_more_memory(void) +{ + struct zone *zone; + int nid; + + wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); + yield(); + + for_each_online_node(nid) { + (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL, + &zone); + if (zone) + try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, + GFP_NOFS, NULL); + } +} + +/* + * I/O completion handler for block_read_full_page() - pages + * which come unlocked at the end of I/O. + */ +static void end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first; + struct buffer_head *tmp; + struct page *page; + int page_uptodate = 1; + + BUG_ON(!buffer_async_read(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + clear_buffer_uptodate(bh); + buffer_io_error(bh, ", async page read"); + SetPageError(page); + } + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + */ + first = page_buffers(page); + flags = bh_uptodate_lock_irqsave(first); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } while (tmp != bh); + bh_uptodate_unlock_irqrestore(first, flags); + + /* + * If none of the buffers had errors and they are all + * uptodate then we can set the page uptodate. + */ + if (page_uptodate && !PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return; + +still_busy: + bh_uptodate_unlock_irqrestore(first, flags); +} + +/* + * Completion handler for block_write_full_page() - pages which are unlocked + * during I/O, and which have PageWriteback cleared upon I/O completion. + */ +void end_buffer_async_write(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first; + struct buffer_head *tmp; + struct page *page; + + BUG_ON(!buffer_async_write(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + buffer_io_error(bh, ", lost async page write"); + set_bit(AS_EIO, &page->mapping->flags); + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(page); + } + + first = page_buffers(page); + flags = bh_uptodate_lock_irqsave(first); + + clear_buffer_async_write(bh); + unlock_buffer(bh); + tmp = bh->b_this_page; + while (tmp != bh) { + if (buffer_async_write(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } + bh_uptodate_unlock_irqrestore(first, flags); + end_page_writeback(page); + return; + +still_busy: + bh_uptodate_unlock_irqrestore(first, flags); +} +EXPORT_SYMBOL(end_buffer_async_write); + +/* + * If a page's buffers are under async readin (end_buffer_async_read + * completion) then there is a possibility that another thread of + * control could lock one of the buffers after it has completed + * but while some of the other buffers have not completed. This + * locked buffer would confuse end_buffer_async_read() into not unlocking + * the page. So the absence of BH_Async_Read tells end_buffer_async_read() + * that this buffer is not under async I/O. + * + * The page comes unlocked when it has no locked buffer_async buffers + * left. + * + * PageLocked prevents anyone starting new async I/O reads any of + * the buffers. + * + * PageWriteback is used to prevent simultaneous writeout of the same + * page. + * + * PageLocked prevents anyone from starting writeback of a page which is + * under read I/O (PageWriteback is only ever set against a locked page). + */ +static void mark_buffer_async_read(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_read; + set_buffer_async_read(bh); +} + +static void mark_buffer_async_write_endio(struct buffer_head *bh, + bh_end_io_t *handler) +{ + bh->b_end_io = handler; + set_buffer_async_write(bh); +} + +void mark_buffer_async_write(struct buffer_head *bh) +{ + mark_buffer_async_write_endio(bh, end_buffer_async_write); +} +EXPORT_SYMBOL(mark_buffer_async_write); + + +/* + * fs/buffer.c contains helper functions for buffer-backed address space's + * fsync functions. A common requirement for buffer-based filesystems is + * that certain data from the backing blockdev needs to be written out for + * a successful fsync(). For example, ext2 indirect blocks need to be + * written back and waited upon before fsync() returns. + * + * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * inode_has_buffers() and invalidate_inode_buffers() are provided for the + * management of a list of dependent buffers at ->i_mapping->private_list. + * + * Locking is a little subtle: try_to_free_buffers() will remove buffers + * from their controlling inode's queue when they are being freed. But + * try_to_free_buffers() will be operating against the *blockdev* mapping + * at the time, not against the S_ISREG file which depends on those buffers. + * So the locking for private_list is via the private_lock in the address_space + * which backs the buffers. Which is different from the address_space + * against which the buffers are listed. So for a particular address_space, + * mapping->private_lock does *not* protect mapping->private_list! In fact, + * mapping->private_list will always be protected by the backing blockdev's + * ->private_lock. + * + * Which introduces a requirement: all buffers on an address_space's + * ->private_list must be from the same address_space: the blockdev's. + * + * address_spaces which do not place buffers at ->private_list via these + * utility functions are free to use private_lock and private_list for + * whatever they want. The only requirement is that list_empty(private_list) + * be true at clear_inode() time. + * + * FIXME: clear_inode should not call invalidate_inode_buffers(). The + * filesystems should do that. invalidate_inode_buffers() should just go + * BUG_ON(!list_empty). + * + * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should + * take an address_space, not an inode. And it should be called + * mark_buffer_dirty_fsync() to clearly define why those buffers are being + * queued up. + * + * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the + * list if it is already on a list. Because if the buffer is on a list, + * it *must* already be on the right one. If not, the filesystem is being + * silly. This will save a ton of locking. But first we have to ensure + * that buffers are taken *off* the old inode's list when they are freed + * (presumably in truncate). That requires careful auditing of all + * filesystems (do it inside bforget()). It could also be done by bringing + * b_inode back. + */ + +/* + * The buffer's backing address_space's private_lock must be held + */ +static void __remove_assoc_queue(struct buffer_head *bh) +{ + list_del_init(&bh->b_assoc_buffers); + WARN_ON(!bh->b_assoc_map); + if (buffer_write_io_error(bh)) + set_bit(AS_EIO, &bh->b_assoc_map->flags); + bh->b_assoc_map = NULL; +} + +int inode_has_buffers(struct inode *inode) +{ + return !list_empty(&inode->i_data.private_list); +} + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. + */ +static int osync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head *p; + int err = 0; + + spin_lock(lock); +repeat: + list_for_each_prev(p, list) { + bh = BH_ENTRY(p); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + goto repeat; + } + } + spin_unlock(lock); + return err; +} + +static void do_thaw_one(struct super_block *sb, void *unused) +{ + char b[BDEVNAME_SIZE]; + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); +} + +static void do_thaw_all(struct work_struct *work) +{ + iterate_supers(do_thaw_one, NULL); + kfree(work); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_thaw_all); + schedule_work(work); + } +} + +/** + * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers + * @mapping: the mapping which wants those buffers written + * + * Starts I/O against the buffers at mapping->private_list, and waits upon + * that I/O. + * + * Basically, this is a convenience function for fsync(). + * @mapping is a file or directory which needs those buffers to be written for + * a successful fsync(). + */ +int sync_mapping_buffers(struct address_space *mapping) +{ + struct address_space *buffer_mapping = mapping->private_data; + + if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + return 0; + + return fsync_buffers_list(&buffer_mapping->private_lock, + &mapping->private_list); +} +EXPORT_SYMBOL(sync_mapping_buffers); + +/* + * Called when we've recently written block `bblock', and it is known that + * `bblock' was for a buffer_boundary() buffer. This means that the block at + * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's + * dirty, schedule it for IO. So that indirects merge nicely with their data. + */ +void write_boundary_block(struct block_device *bdev, + sector_t bblock, unsigned blocksize) +{ + struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + if (bh) { + if (buffer_dirty(bh)) + ll_rw_block(WRITE, 1, &bh); + put_bh(bh); + } +} + +void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct address_space *buffer_mapping = bh->b_page->mapping; + + mark_buffer_dirty(bh); + if (!mapping->private_data) { + mapping->private_data = buffer_mapping; + } else { + BUG_ON(mapping->private_data != buffer_mapping); + } + if (!bh->b_assoc_map) { + spin_lock(&buffer_mapping->private_lock); + list_move_tail(&bh->b_assoc_buffers, + &mapping->private_list); + bh->b_assoc_map = mapping; + spin_unlock(&buffer_mapping->private_lock); + } +} +EXPORT_SYMBOL(mark_buffer_dirty_inode); + +/* + * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * dirty. + * + * If warn is true, then emit a warning if the page is not uptodate and has + * not been truncated. + */ +static void __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) +{ + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); +} + +/* + * Add a page to the dirty page list. + * + * It is a sad fact of life that this function is called from several places + * deeply under spinlocking. It may not sleep. + * + * If the page has buffers, the uptodate buffers are set dirty, to preserve + * dirty-state coherency between the page and the buffers. It the page does + * not have buffers then when they are later attached they will all be set + * dirty. + * + * The buffers are dirtied before the page is dirtied. There's a small race + * window in which a writepage caller may see the page cleanness but not the + * buffer dirtiness. That's fine. If this code were to set the page dirty + * before the buffers, a concurrent writepage caller could clear the page dirty + * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean + * page on the dirty page list. + * + * We use private_lock to lock against try_to_free_buffers while using the + * page's buffer list. Also use this to protect against clean buffers being + * added to the page after it was set dirty. + * + * FIXME: may need to call ->reservepage here as well. That's rather up to the + * address_space though. + */ +int __set_page_dirty_buffers(struct page *page) +{ + int newly_dirty; + struct address_space *mapping = page_mapping(page); + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + set_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + } + newly_dirty = !TestSetPageDirty(page); + spin_unlock(&mapping->private_lock); + + if (newly_dirty) + __set_page_dirty(page, mapping, 1); + return newly_dirty; +} +EXPORT_SYMBOL(__set_page_dirty_buffers); + +/* + * Write out and wait upon a list of buffers. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. + * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. + */ +static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head tmp; + struct address_space *mapping; + int err = 0, err2; + struct blk_plug plug; + + INIT_LIST_HEAD(&tmp); + blk_start_plug(&plug); + + spin_lock(lock); + while (!list_empty(list)) { + bh = BH_ENTRY(list->next); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh) || buffer_locked(bh)) { + list_add(&bh->b_assoc_buffers, &tmp); + bh->b_assoc_map = mapping; + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(lock); + /* + * Ensure any pending I/O completes so that + * write_dirty_buffer() actually writes the + * current contents - it is a noop if I/O is + * still in flight on potentially older + * contents. + */ + write_dirty_buffer(bh, WRITE_SYNC); + + /* + * Kick off IO for the previous mapping. Note + * that we will not run the very last mapping, + * wait_on_buffer() will do that for us + * through sync_buffer(). + */ + brelse(bh); + spin_lock(lock); + } + } + } + + spin_unlock(lock); + blk_finish_plug(&plug); + spin_lock(lock); + + while (!list_empty(&tmp)) { + bh = BH_ENTRY(tmp.prev); + get_bh(bh); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh)) { + list_add(&bh->b_assoc_buffers, + &mapping->private_list); + bh->b_assoc_map = mapping; + } + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + } + + spin_unlock(lock); + err2 = osync_buffers_list(lock, list); + if (err) + return err; + else + return err2; +} + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + * + * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * assumes that all the buffers are against the blockdev. Not true + * for reiserfs. + */ +void invalidate_inode_buffers(struct inode *inode) +{ + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->private_data; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) + __remove_assoc_queue(BH_ENTRY(list->next)); + spin_unlock(&buffer_mapping->private_lock); + } +} +EXPORT_SYMBOL(invalidate_inode_buffers); + +/* + * Remove any clean buffers from the inode's buffer list. This is called + * when we're trying to free the inode itself. Those buffers can pin it. + * + * Returns true if all buffers were removed. + */ +int remove_inode_buffers(struct inode *inode) +{ + int ret = 1; + + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->private_data; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) { + struct buffer_head *bh = BH_ENTRY(list->next); + if (buffer_dirty(bh)) { + ret = 0; + break; + } + __remove_assoc_queue(bh); + } + spin_unlock(&buffer_mapping->private_lock); + } + return ret; +} + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * + * The retry flag is used to differentiate async IO (paging, swapping) + * which may not fail from ordinary buffer allocations. + */ +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, + int retry) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = alloc_buffer_head(GFP_NOFS); + if (!bh) + goto no_grow; + + bh->b_this_page = head; + bh->b_blocknr = -1; + head = bh; + + bh->b_size = size; + + /* Link the buffer to its page */ + set_bh_page(bh, page, offset); + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + do { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } while (head); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!retry) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + free_more_memory(); + goto try_again; +} +EXPORT_SYMBOL_GPL(alloc_page_buffers); + +static inline void +link_dev_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh, *tail; + + bh = head; + do { + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); +} + +static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) +{ + sector_t retval = ~((sector_t)0); + loff_t sz = i_size_read(bdev->bd_inode); + + if (sz) { + unsigned int sizebits = blksize_bits(size); + retval = (sz >> sizebits); + } + return retval; +} + +/* + * Initialise the state of a blockdev page's buffers. + */ +static sector_t +init_page_buffers(struct page *page, struct block_device *bdev, + sector_t block, int size) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + int uptodate = PageUptodate(page); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size); + + do { + if (!buffer_mapped(bh)) { + init_buffer(bh, NULL, NULL); + bh->b_bdev = bdev; + bh->b_blocknr = block; + if (uptodate) + set_buffer_uptodate(bh); + if (block < end_block) + set_buffer_mapped(bh); + } + block++; + bh = bh->b_this_page; + } while (bh != head); + + /* + * Caller needs to validate requested block against end of device. + */ + return end_block; +} + +/* + * Create the page-cache page that contains the requested block. + * + * This is used purely for blockdev mappings. + */ +static int +grow_dev_page(struct block_device *bdev, sector_t block, + pgoff_t index, int size, int sizebits, gfp_t gfp) +{ + struct inode *inode = bdev->bd_inode; + struct page *page; + struct buffer_head *bh; + sector_t end_block; + int ret = 0; /* Will call free_more_memory() */ + gfp_t gfp_mask; + + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + + /* + * XXX: __getblk_slow() can not really deal with failure and + * will endlessly loop on improvised global reclaim. Prefer + * looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp_mask |= __GFP_NOFAIL; + + page = find_or_create_page(inode->i_mapping, index, gfp_mask); + if (!page) + return ret; + + BUG_ON(!PageLocked(page)); + + if (page_has_buffers(page)) { + bh = page_buffers(page); + if (bh->b_size == size) { + end_block = init_page_buffers(page, bdev, + (sector_t)index << sizebits, + size); + goto done; + } + if (!try_to_free_buffers(page)) + goto failed; + } + + /* + * Allocate some buffers for this page + */ + bh = alloc_page_buffers(page, size, 0); + if (!bh) + goto failed; + + /* + * Link the page to the buffers and initialise them. Take the + * lock to be atomic wrt __find_get_block(), which does not + * run under the page lock. + */ + spin_lock(&inode->i_mapping->private_lock); + link_dev_buffers(page, bh); + end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, + size); + spin_unlock(&inode->i_mapping->private_lock); +done: + ret = (block < end_block) ? 1 : -ENXIO; +failed: + unlock_page(page); + page_cache_release(page); + return ret; +} + +/* + * Create buffers for the specified block device block's page. If + * that page was dirty, the buffers are set dirty also. + */ +static int +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) +{ + pgoff_t index; + int sizebits; + + sizebits = -1; + do { + sizebits++; + } while ((size << sizebits) < PAGE_SIZE); + + index = block >> sizebits; + + /* + * Check for a block which wants to lie outside our maximum possible + * pagecache index. (this comparison is done using sector_t types). + */ + if (unlikely(index != block >> sizebits)) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR "%s: requested out-of-range block %llu for " + "device %s\n", + __func__, (unsigned long long)block, + bdevname(bdev, b)); + return -EIO; + } + + /* Create a page with the proper size buffers.. */ + return grow_dev_page(bdev, block, index, size, sizebits, gfp); +} + +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) +{ + /* Size must be multiple of hard sectorsize */ + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || + (size < 512 || size > PAGE_SIZE))) { + printk(KERN_ERR "getblk(): invalid block size %d requested\n", + size); + printk(KERN_ERR "logical block size: %d\n", + bdev_logical_block_size(bdev)); + + dump_stack(); + return NULL; + } + + for (;;) { + struct buffer_head *bh; + int ret; + + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; + + ret = grow_buffers(bdev, block, size, gfp); + if (ret < 0) + return NULL; + if (ret == 0) + free_more_memory(); + } +} +EXPORT_SYMBOL(__getblk_slow); + +/* + * The relationship between dirty buffers and dirty pages: + * + * Whenever a page has any dirty buffers, the page's dirty bit is set, and + * the page is tagged dirty in its radix tree. + * + * At all times, the dirtiness of the buffers represents the dirtiness of + * subsections of the page. If the page has buffers, the page dirty bit is + * merely a hint about the true dirty state. + * + * When a page is set dirty in its entirety, all its buffers are marked dirty + * (if the page has buffers). + * + * When a buffer is marked dirty, its page is dirtied, but the page's other + * buffers are not. + * + * Also. When blockdev buffers are explicitly read with bread(), they + * individually become uptodate. But their backing page remains not + * uptodate - even if all of its buffers are uptodate. A subsequent + * block_read_full_page() against that page will discover all the uptodate + * buffers, will set the page uptodate and will perform no I/O. + */ + +/** + * mark_buffer_dirty - mark a buffer_head as needing writeout + * @bh: the buffer_head to mark dirty + * + * mark_buffer_dirty() will set the dirty bit against the buffer, then set its + * backing page dirty, then tag the page as dirty in its address_space's radix + * tree and then attach the address_space's inode to its superblock's dirty + * inode list. + * + * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mapping->tree_lock and mapping->host->i_lock. + */ +void mark_buffer_dirty(struct buffer_head *bh) +{ + WARN_ON_ONCE(!buffer_uptodate(bh)); + + trace_block_dirty_buffer(bh); + + /* + * Very *carefully* optimize the it-is-already-dirty case. + * + * Don't let the final "is it dirty" escape to before we + * perhaps modified the buffer. + */ + if (buffer_dirty(bh)) { + smp_mb(); + if (buffer_dirty(bh)) + return; + } + + if (!test_set_buffer_dirty(bh)) { + struct page *page = bh->b_page; + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) + __set_page_dirty(page, mapping, 0); + } + } +} +EXPORT_SYMBOL(mark_buffer_dirty); + +/* + * Decrement a buffer_head's reference count. If all buffers against a page + * have zero reference count, are clean and unlocked, and if the page is clean + * and unlocked then try_to_free_buffers() may strip the buffers from the page + * in preparation for freeing it (sometimes, rarely, buffers are removed from + * a page but it ends up not being freed, and buffers may later be reattached). + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + put_bh(buf); + return; + } + WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); +} +EXPORT_SYMBOL(__brelse); + +/* + * bforget() is like brelse(), except it discards any + * potentially dirty data. + */ +void __bforget(struct buffer_head *bh) +{ + clear_buffer_dirty(bh); + if (bh->b_assoc_map) { + struct address_space *buffer_mapping = bh->b_page->mapping; + + spin_lock(&buffer_mapping->private_lock); + list_del_init(&bh->b_assoc_buffers); + bh->b_assoc_map = NULL; + spin_unlock(&buffer_mapping->private_lock); + } + __brelse(bh); +} +EXPORT_SYMBOL(__bforget); + +static struct buffer_head *__bread_slow(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } else { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + } + brelse(bh); + return NULL; +} + +/* + * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). + * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their + * refcount elevated by one when they're in an LRU. A buffer can only appear + * once in a particular CPU's LRU. A single buffer can be present in multiple + * CPU's LRUs at the same time. + * + * This is a transparent caching front-end to sb_bread(), sb_getblk() and + * sb_find_get_block(). + * + * The LRUs themselves only need locking against invalidate_bh_lrus. We use + * a local interrupt disable for that. + */ + +#define BH_LRU_SIZE 16 + +struct bh_lru { + struct buffer_head *bhs[BH_LRU_SIZE]; +}; + +static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; + +#ifdef CONFIG_SMP +#define bh_lru_lock() local_irq_disable() +#define bh_lru_unlock() local_irq_enable() +#else +#define bh_lru_lock() preempt_disable() +#define bh_lru_unlock() preempt_enable() +#endif + +static inline void check_irqs_on(void) +{ +#ifdef irqs_disabled + BUG_ON(irqs_disabled()); +#endif +} + +/* + * The LRU management algorithm is dopey-but-simple. Sorry. + */ +static void bh_lru_install(struct buffer_head *bh) +{ + struct buffer_head *evictee = NULL; + + check_irqs_on(); + bh_lru_lock(); + if (__this_cpu_read(bh_lrus.bhs[0]) != bh) { + struct buffer_head *bhs[BH_LRU_SIZE]; + int in; + int out = 0; + + get_bh(bh); + bhs[out++] = bh; + for (in = 0; in < BH_LRU_SIZE; in++) { + struct buffer_head *bh2 = + __this_cpu_read(bh_lrus.bhs[in]); + + if (bh2 == bh) { + __brelse(bh2); + } else { + if (out >= BH_LRU_SIZE) { + BUG_ON(evictee != NULL); + evictee = bh2; + } else { + bhs[out++] = bh2; + } + } + } + while (out < BH_LRU_SIZE) + bhs[out++] = NULL; + memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); + } + bh_lru_unlock(); + + if (evictee) + __brelse(evictee); +} + +/* + * Look up the bh in this cpu's LRU. If it's there, move it to the head. + */ +static struct buffer_head * +lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *ret = NULL; + unsigned int i; + + check_irqs_on(); + bh_lru_lock(); + for (i = 0; i < BH_LRU_SIZE; i++) { + struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); + + if (bh && bh->b_blocknr == block && bh->b_bdev == bdev && + bh->b_size == size) { + if (i) { + while (i) { + __this_cpu_write(bh_lrus.bhs[i], + __this_cpu_read(bh_lrus.bhs[i - 1])); + i--; + } + __this_cpu_write(bh_lrus.bhs[0], bh); + } + get_bh(bh); + ret = bh; + break; + } + } + bh_lru_unlock(); + return ret; +} + +/* + * Perform a pagecache lookup for the matching buffer. If it's there, refresh + * it in the LRU and mark it as accessed. If it is not present then return + * NULL + */ +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = lookup_bh_lru(bdev, block, size); + + if (bh == NULL) { + /* __find_get_block_slow will mark the page accessed */ + bh = __find_get_block_slow(bdev, block); + if (bh) + bh_lru_install(bh); + } else + touch_buffer(bh); + + return bh; +} +EXPORT_SYMBOL(__find_get_block); + +/* + * __getblk_gfp() will locate (and, if necessary, create) the buffer_head + * which corresponds to the passed block_device, block and size. The + * returned buffer has its reference count incremented. + * + * __getblk_gfp() will lock up the machine if grow_dev_page's + * try_to_free_buffers() attempt is failing. FIXME, perhaps? + */ +struct buffer_head * +__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + might_sleep(); + if (bh == NULL) + bh = __getblk_slow(bdev, block, size, gfp); + return bh; +} +EXPORT_SYMBOL(__getblk_gfp); + +/* + * Do async read-ahead on a buffer.. + */ +void __breadahead(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + if (likely(bh)) { + ll_rw_block(READA, 1, &bh); + brelse(bh); + } +} +EXPORT_SYMBOL(__breadahead); + +/** + * __bread_gfp() - reads a specified block and returns the bh + * @bdev: the block_device to read from + * @block: number of block + * @size: size (in bytes) to read + * @gfp: page allocation flag + * + * Reads a specified block, and returns buffer head that contains it. + * The page cache can be allocated from non-movable area + * not to prevent page migration if you set gfp to zero. + * It returns NULL if the block was unreadable. + */ +struct buffer_head * +__bread_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) +{ + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); + + if (likely(bh) && !buffer_uptodate(bh)) + bh = __bread_slow(bh); + return bh; +} +EXPORT_SYMBOL(__bread_gfp); + +/* + * invalidate_bh_lrus() is called rarely - but not only at unmount. + * This doesn't race because it runs in each cpu either in irq + * or with preempt disabled. + */ +static void invalidate_bh_lru(void *arg) +{ + struct bh_lru *b = &get_cpu_var(bh_lrus); + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + put_cpu_var(bh_lrus); +} + +static bool has_bh_in_lru(int cpu, void *dummy) +{ + struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + if (b->bhs[i]) + return 1; + } + + return 0; +} + +void invalidate_bh_lrus(void) +{ + on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(invalidate_bh_lrus); + +void set_bh_page(struct buffer_head *bh, + struct page *page, unsigned long offset) +{ + bh->b_page = page; + BUG_ON(offset >= PAGE_SIZE); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} +EXPORT_SYMBOL(set_bh_page); + +/* + * Called when truncating a buffer on a page completely. + */ + +/* Bits that are cleared during an invalidate */ +#define BUFFER_FLAGS_DISCARD \ + (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ + 1 << BH_Delay | 1 << BH_Unwritten) + +static void discard_buffer(struct buffer_head * bh) +{ + unsigned long b_state, b_state_old; + + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + b_state = bh->b_state; + for (;;) { + b_state_old = cmpxchg(&bh->b_state, b_state, + (b_state & ~BUFFER_FLAGS_DISCARD)); + if (b_state_old == b_state) + break; + b_state = b_state_old; + } + unlock_buffer(bh); +} + +/** + * block_invalidatepage - invalidate part or all of a buffer-backed page + * + * @page: the page which is affected + * @offset: start of the range to invalidate + * @length: length of the range to invalidate + * + * block_invalidatepage() is called when all or part of the page has become + * invalidated by a truncate operation. + * + * block_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void block_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + unsigned int stop = length + offset; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + /* + * Check for overflow + */ + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * Are we still fully in range ? + */ + if (next_off > stop) + goto out; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) + discard_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (offset == 0) + try_to_release_page(page, 0); +out: + return; +} +EXPORT_SYMBOL(block_invalidatepage); + + +/* + * We attach and possibly dirty the buffers atomically wrt + * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * is already excluded via the page lock. + */ +void create_empty_buffers(struct page *page, + unsigned long blocksize, unsigned long b_state) +{ + struct buffer_head *bh, *head, *tail; + + head = alloc_page_buffers(page, blocksize, 1); + bh = head; + do { + bh->b_state |= b_state; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + + spin_lock(&page->mapping->private_lock); + if (PageUptodate(page) || PageDirty(page)) { + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (PageUptodate(page)) + set_buffer_uptodate(bh); + bh = bh->b_this_page; + } while (bh != head); + } + attach_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} +EXPORT_SYMBOL(create_empty_buffers); + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can + * be writeout I/O going on against recently-freed buffers. We don't + * wait on that I/O in bforget() - it's more efficient to wait on the I/O + * only if we really need to. That happens here. + */ +void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +{ + struct buffer_head *old_bh; + + might_sleep(); + + old_bh = __find_get_block_slow(bdev, block); + if (old_bh) { + clear_buffer_dirty(old_bh); + wait_on_buffer(old_bh); + clear_buffer_req(old_bh); + __brelse(old_bh); + } +} +EXPORT_SYMBOL(unmap_underlying_metadata); + +/* + * Size is a power-of-two in the range 512..PAGE_SIZE, + * and the case we care about most is PAGE_SIZE. + * + * So this *could* possibly be written with those + * constraints in mind (relevant mostly if some + * architecture has a slow bit-scan instruction) + */ +static inline int block_size_bits(unsigned int blocksize) +{ + return ilog2(blocksize); +} + +static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state) +{ + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + return page_buffers(page); +} + +/* + * NOTE! All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * While block_write_full_page is writing back the dirty buffers under + * the page lock, whoever dirtied the buffers may decide to clean them + * again at any time. We handle that by only looking at the buffer + * state inside lock_buffer(). + * + * If block_write_full_page() is called for regular writeback + * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a + * locked buffer. This only can happen if someone has written the buffer + * directly, with submit_bh(). At the address_space level PageWriteback + * prevents this contention from occurring. + * + * If block_write_full_page() is called with wbc->sync_mode == + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * causes the writes to be flagged as synchronous writes. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, + get_block_t *get_block, struct writeback_control *wbc, + bh_end_io_t *handler) +{ + int err; + sector_t block; + sector_t last_block; + struct buffer_head *bh, *head; + unsigned int blocksize, bbits; + int nr_underway = 0; + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + + head = create_page_buffers(page, inode, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. + */ + + bh = head; + blocksize = bh->b_size; + bbits = block_size_bits(blocksize); + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + last_block = (i_size_read(inode) - 1) >> bbits; + + /* + * Get all the dirty buffers mapped to disk addresses and + * handle any aliases from the underlying blockdev's mapping. + */ + do { + if (block > last_block) { + /* + * mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. + */ + /* + * The buffer was zeroed by block_write_full_page() + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + goto recover; + clear_buffer_delay(bh); + if (buffer_new(bh)) { + /* blockdev mappings never come here */ + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from writeback threads + * and kswapd activity, but those code paths have their own + * higher-level throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write_endio(bh, handler); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + err = 0; +done: + if (nr_underway == 0) { + /* + * The page was marked dirty, but the buffers were + * clean. Someone wrote them back by hand with + * ll_rw_block/submit_bh. A rare case. + */ + end_page_writeback(page); + + /* + * The page and buffer_heads can be released at any time from + * here on. + */ + } + return err; + +recover: + /* + * ENOSPC, or some other error. We may already have added some + * blocks to the file, so we need to write these out to avoid + * exposing stale data. + * The page is currently locked and not marked for writeback + */ + bh = head; + /* Recovery: lock and submit the mapped buffers */ + do { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { + lock_buffer(bh); + mark_buffer_async_write_endio(bh, handler); + } else { + /* + * The buffer may have been set dirty during + * attachment to a dirty page. + */ + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + mapping_set_error(page->mapping, err); + set_page_writeback(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + goto done; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. + */ +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, size; + + start = max(from, block_start); + size = min(to, block_end) - start; + + zero_user(page, start, size); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(page_zero_new_buffers); + +int __block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct inode *inode = page->mapping->host; + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + continue; + } + if (block_end > to || block_start < from) + zero_user_segments(page, + to, block_end, + block_start, from); + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + err = -EIO; + } + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + return err; +} +EXPORT_SYMBOL(__block_write_begin); + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + blocksize = bh->b_size; + + block_start = 0; + do { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + clear_buffer_new(bh); + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * The filesystem needs to handle block truncation upon failure. + */ +int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, + unsigned flags, struct page **pagep, get_block_t *get_block) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, get_block); + if (unlikely(status)) { + unlock_page(page); + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; +} +EXPORT_SYMBOL(block_write_begin); + +int block_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start; + + start = pos & (PAGE_CACHE_SIZE - 1); + + if (unlikely(copied < len)) { + /* + * The buffers that were written will now be uptodate, so we + * don't have to worry about a readpage reading them and + * overwriting a partial write. However if we have encountered + * a short write and only partially written into a buffer, it + * will not be marked uptodate, so a readpage might come in and + * destroy our partial write. + * + * Do the simplest thing, and just treat any short write to a + * non uptodate page as a zero-length write, and force the + * caller to redo the whole thing. + */ + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start+copied, start+len); + } + flush_dcache_page(page); + + /* This could be a short (even 0-length) commit */ + __block_commit_write(inode, page, start, start+copied); + + return copied; +} +EXPORT_SYMBOL(block_write_end); + +int generic_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + int i_size_changed = 0; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + + unlock_page(page); + page_cache_release(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} +EXPORT_SYMBOL(generic_write_end); + +/* + * block_is_partially_uptodate checks whether buffers within a page are + * uptodate or not. + * + * Returns true if all buffers which correspond to a file portion + * we want to read are uptodate. + */ +int block_is_partially_uptodate(struct page *page, unsigned long from, + unsigned long count) +{ + unsigned block_start, block_end, blocksize; + unsigned to; + struct buffer_head *bh, *head; + int ret = 1; + + if (!page_has_buffers(page)) + return 0; + + head = page_buffers(page); + blocksize = head->b_size; + to = min_t(unsigned, PAGE_CACHE_SIZE - from, count); + to = from + to; + if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) + return 0; + + bh = head; + block_start = 0; + do { + block_end = block_start + blocksize; + if (block_end > from && block_start < to) { + if (!buffer_uptodate(bh)) { + ret = 0; + break; + } + if (block_end >= to) + break; + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} +EXPORT_SYMBOL(block_is_partially_uptodate); + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * set/clear_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + sector_t iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, bbits; + int nr, i; + int fully_mapped = 1; + + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); + + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + lblock = (i_size_read(inode)+blocksize-1) >> bbits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + int err = 0; + + fully_mapped = 0; + if (iblock < lblock) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, iblock, bh, 0); + if (err) + SetPageError(page); + } + if (!buffer_mapped(bh)) { + zero_user(page, i * blocksize, blocksize); + if (!err) + set_buffer_uptodate(bh); + continue; + } + /* + * get_block() might have updated the buffer + * synchronously + */ + if (buffer_uptodate(bh)) + continue; + } + arr[nr++] = bh; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + SetPageMappedToDisk(page); + + if (!nr) { + /* + * All buffers are uptodate - we can set the page uptodate + * as well. But not if get_block() returned an error. + */ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + lock_buffer(bh); + mark_buffer_async_read(bh); + } + + /* + * Stage 3: start the IO. Check for uptodateness + * inside the buffer lock in case another process reading + * the underlying blockdev brought it uptodate (the sct fix). + */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_async_read(bh, 1); + else + submit_bh(READ, bh); + } + return 0; +} +EXPORT_SYMBOL(block_read_full_page); + +/* utility function for filesystems that need to do work on expanding + * truncates. Uses filesystem pagecache writes to allow the filesystem to + * deal with the hole. + */ +int generic_cont_expand_simple(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + int err; + + err = inode_newsize_ok(inode, size); + if (err) + goto out; + + err = pagecache_write_begin(NULL, mapping, size, 0, + AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, + &page, &fsdata); + if (err) + goto out; + + err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); + BUG_ON(err > 0); + +out: + return err; +} +EXPORT_SYMBOL(generic_cont_expand_simple); + +static int cont_expand_zero(struct file *file, struct address_space *mapping, + loff_t pos, loff_t *bytes) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + struct page *page; + void *fsdata; + pgoff_t index, curidx; + loff_t curpos; + unsigned zerofrom, offset, len; + int err = 0; + + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + + while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { + zerofrom = curpos & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + len = PAGE_CACHE_SIZE - zerofrom; + + err = pagecache_write_begin(file, mapping, curpos, len, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + goto out; + zero_user(page, zerofrom, len); + err = pagecache_write_end(file, mapping, curpos, len, len, + page, fsdata); + if (err < 0) + goto out; + BUG_ON(err != len); + err = 0; + + balance_dirty_pages_ratelimited(mapping); + + if (unlikely(fatal_signal_pending(current))) { + err = -EINTR; + goto out; + } + } + + /* page covers the boundary, find the boundary offset */ + if (index == curidx) { + zerofrom = curpos & ~PAGE_CACHE_MASK; + /* if we will expand the thing last block will be filled */ + if (offset <= zerofrom) { + goto out; + } + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + len = offset - zerofrom; + + err = pagecache_write_begin(file, mapping, curpos, len, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + goto out; + zero_user(page, zerofrom, len); + err = pagecache_write_end(file, mapping, curpos, len, len, + page, fsdata); + if (err < 0) + goto out; + BUG_ON(err != len); + err = 0; + } +out: + return err; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. + */ +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + unsigned zerofrom; + int err; + + err = cont_expand_zero(file, mapping, pos, bytes); + if (err) + return err; + + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (pos+len > *bytes && zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + return block_write_begin(mapping, pos, len, flags, pagep, get_block); +} +EXPORT_SYMBOL(cont_write_begin); + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + return 0; +} +EXPORT_SYMBOL(block_commit_write); + +/* + * block_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * truncate writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + * + * Direct callers of this function should protect against filesystem freezing + * using sb_start_write() - sb_end_write() functions. + */ +int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + unsigned long end; + loff_t size; + int ret; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) + end = size & ~PAGE_CACHE_MASK; + else + end = PAGE_CACHE_SIZE; + + ret = __block_write_begin(page, 0, end, get_block); + if (!ret) + ret = block_commit_write(page, 0, end); + + if (unlikely(ret < 0)) + goto out_unlock; + set_page_dirty(page); + wait_for_stable_page(page); + return 0; +out_unlock: + unlock_page(page); + return ret; +} +EXPORT_SYMBOL(__block_page_mkwrite); + +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int ret; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + sb_start_pagefault(sb); + + /* + * Update file times before taking page lock. We may end up failing the + * fault so this update may be superfluous but who really cares... + */ + file_update_time(vma->vm_file); + + ret = __block_page_mkwrite(vma, vmf, get_block); + sb_end_pagefault(sb); + return block_page_mkwrite_return(ret); +} +EXPORT_SYMBOL(block_page_mkwrite); + +/* + * nobh_write_begin()'s prereads are special: the buffer_heads are freed + * immediately, while under the page lock. So it needs a special end_io + * handler which does not touch the bh after unlocking it. + */ +static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); +} + +/* + * Attach the singly-linked list of buffers created by nobh_write_begin, to + * the page (converting it to circular linked list and taking care of page + * dirty races). + */ +static void attach_nobh_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh; + + BUG_ON(!PageLocked(page)); + + spin_lock(&page->mapping->private_lock); + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (!bh->b_this_page) + bh->b_this_page = head; + bh = bh->b_this_page; + } while (bh != head); + attach_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + * The filesystem needs to handle block truncation upon failure. + */ +int nobh_write_begin(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct buffer_head *head, *bh; + struct page *page; + pgoff_t index; + unsigned from, to; + unsigned block_in_page; + unsigned block_start, block_end; + sector_t block_in_file; + int nr_reads = 0; + int ret = 0; + int is_mapped_to_disk = 1; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + *fsdata = NULL; + + if (page_has_buffers(page)) { + ret = __block_write_begin(page, pos, len, get_block); + if (unlikely(ret)) + goto out_release; + return ret; + } + + if (PageMappedToDisk(page)) + return 0; + + /* + * Allocate buffers so that we can keep track of state, and potentially + * attach them to the page if an error occurs. In the common case of + * no error, they will just be freed again without ever being attached + * to the page (which is all OK, because we're under the page lock). + * + * Be careful: the buffer linked list is a NULL terminated one, rather + * than the circular one we're used to. + */ + head = alloc_page_buffers(page, blocksize, 0); + if (!head) { + ret = -ENOMEM; + goto out_release; + } + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + + /* + * We loop across all blocks in the page, whether or not they are + * part of the affected region. This is so we can discover if the + * page is fully mapped-to-disk. + */ + for (block_start = 0, block_in_page = 0, bh = head; + block_start < PAGE_CACHE_SIZE; + block_in_page++, block_start += blocksize, bh = bh->b_this_page) { + int create; + + block_end = block_start + blocksize; + bh->b_state = 0; + create = 1; + if (block_start >= to) + create = 0; + ret = get_block(inode, block_in_file + block_in_page, + bh, create); + if (ret) + goto failed; + if (!buffer_mapped(bh)) + is_mapped_to_disk = 0; + if (buffer_new(bh)) + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; + } + if (buffer_new(bh) || !buffer_mapped(bh)) { + zero_user_segments(page, block_start, from, + to, block_end); + continue; + } + if (buffer_uptodate(bh)) + continue; /* reiserfs does this */ + if (block_start < from || block_end > to) { + lock_buffer(bh); + bh->b_end_io = end_buffer_read_nobh; + submit_bh(READ, bh); + nr_reads++; + } + } + + if (nr_reads) { + /* + * The page is locked, so these buffers are protected from + * any VM or truncate activity. Hence we don't need to care + * for the buffer_head refcounts. + */ + for (bh = head; bh; bh = bh->b_this_page) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } + if (ret) + goto failed; + } + + if (is_mapped_to_disk) + SetPageMappedToDisk(page); + + *fsdata = head; /* to be released by nobh_write_end */ + + return 0; + +failed: + BUG_ON(!ret); + /* + * Error recovery is a bit difficult. We need to zero out blocks that + * were newly allocated, and dirty them to ensure they get written out. + * Buffers need to be attached to the page at this point, otherwise + * the handling of potential IO errors during writeout would be hard + * (could try doing synchronous writeout, but what if that fails too?) + */ + attach_nobh_buffers(page, head); + page_zero_new_buffers(page, from, to); + +out_release: + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + + return ret; +} +EXPORT_SYMBOL(nobh_write_begin); + +int nobh_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *head = fsdata; + struct buffer_head *bh; + BUG_ON(fsdata != NULL && page_has_buffers(page)); + + if (unlikely(copied < len) && head) + attach_nobh_buffers(page, head); + if (page_has_buffers(page)) + return generic_write_end(file, mapping, pos, len, + copied, page, fsdata); + + SetPageUptodate(page); + set_page_dirty(page); + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + mark_inode_dirty(inode); + } + + unlock_page(page); + page_cache_release(page); + + while (head) { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } + + return copied; +} +EXPORT_SYMBOL(nobh_write_end); + +/* + * nobh_writepage() - based on block_full_write_page() except + * that it tries to operate without attaching bufferheads to + * the page. + */ +int nobh_writepage(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int ret; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto out; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ +#if 0 + /* Not really sure about this - do we need this ? */ + if (page->mapping->a_ops->invalidatepage) + page->mapping->a_ops->invalidatepage(page, offset); +#endif + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); +out: + ret = mpage_writepage(page, get_block, wbc); + if (ret == -EAGAIN) + ret = __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); + return ret; +} +EXPORT_SYMBOL(nobh_writepage); + +int nobh_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + sector_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head map_bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (page_has_buffers(page)) { +has_buffers: + unlock_page(page); + page_cache_release(page); + return block_truncate_page(mapping, from, get_block); + } + + /* Find the buffer that contains "offset" */ + pos = blocksize; + while (offset >= pos) { + iblock++; + pos += blocksize; + } + + map_bh.b_size = blocksize; + map_bh.b_state = 0; + err = get_block(inode, iblock, &map_bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(&map_bh)) + goto unlock; + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (!PageUptodate(page)) { + err = mapping->a_ops->readpage(NULL, page); + if (err) { + page_cache_release(page); + goto out; + } + lock_page(page); + if (!PageUptodate(page)) { + err = -EIO; + goto unlock; + } + if (page_has_buffers(page)) + goto has_buffers; + } + zero_user(page, offset, length); + set_page_dirty(page); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} +EXPORT_SYMBOL(nobh_truncate_page); + +int block_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + sector_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, iblock, bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + zero_user(page, offset, length); + mark_buffer_dirty(bh); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} +EXPORT_SYMBOL(block_truncate_page); + +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); +} +EXPORT_SYMBOL(block_write_full_page); + +sector_t generic_block_bmap(struct address_space *mapping, sector_t block, + get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + tmp.b_size = 1 << inode->i_blkbits; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} +EXPORT_SYMBOL(generic_block_bmap); + +static void end_bio_bh_io_sync(struct bio *bio, int err) +{ + struct buffer_head *bh = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + } + + if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) + set_bit(BH_Quiet, &bh->b_state); + + bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); + bio_put(bio); +} + +/* + * This allows us to do IO even on the odd last sectors + * of a device, even if the block size is some multiple + * of the physical sector size. + * + * We'll just truncate the bio to the size of the device, + * and clear the end of the buffer head manually. + * + * Truly out-of-range accesses will turn into actual IO + * errors, this only handles the "we need to be able to + * do IO at the final sector" case. + */ +void guard_bio_eod(int rw, struct bio *bio) +{ + sector_t maxsector; + struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + unsigned truncated_bytes; + + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_iter.bi_sector; + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) + return; + + /* Uhhuh. We've got a bio that straddles the device size! */ + truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); + + /* Truncate the bio.. */ + bio->bi_iter.bi_size -= truncated_bytes; + bvec->bv_len -= truncated_bytes; + + /* ..and clear the end of the buffer for reads */ + if ((rw & RW_MASK) == READ) { + zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, + truncated_bytes); + } +} + +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +{ + struct bio *bio; + int ret = 0; + + BUG_ON(!buffer_locked(bh)); + BUG_ON(!buffer_mapped(bh)); + BUG_ON(!bh->b_end_io); + BUG_ON(buffer_delay(bh)); + BUG_ON(buffer_unwritten(bh)); + + /* + * Only clear out a write error when rewriting + */ + if (test_set_buffer_req(bh) && (rw & WRITE)) + clear_buffer_write_io_error(bh); + + /* + * from here on down, it's all bio -- do the initial mapping, + * submit_bio -> generic_make_request may further map this bio around + */ + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_io_vec[0].bv_page = bh->b_page; + bio->bi_io_vec[0].bv_len = bh->b_size; + bio->bi_io_vec[0].bv_offset = bh_offset(bh); + + bio->bi_vcnt = 1; + bio->bi_iter.bi_size = bh->b_size; + + bio->bi_end_io = end_bio_bh_io_sync; + bio->bi_private = bh; + bio->bi_flags |= bio_flags; + + /* Take care of bh's that straddle the end of the device */ + guard_bio_eod(rw, bio); + + if (buffer_meta(bh)) + rw |= REQ_META; + if (buffer_prio(bh)) + rw |= REQ_PRIO; + + bio_get(bio); + submit_bio(rw, bio); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL_GPL(_submit_bh); + +int submit_bh(int rw, struct buffer_head *bh) +{ + return _submit_bh(rw, bh, 0); +} +EXPORT_SYMBOL(submit_bh); + +/** + * ll_rw_block: low-level access to block devices (DEPRECATED) + * @rw: whether to %READ or %WRITE or maybe %READA (readahead) + * @nr: number of &struct buffer_heads in the array + * @bhs: array of pointers to &struct buffer_head + * + * ll_rw_block() takes an array of pointers to &struct buffer_heads, and + * requests an I/O operation on them, either a %READ or a %WRITE. The third + * %READA option is described in the documentation for generic_make_request() + * which ll_rw_block() calls. + * + * This function drops any buffer that it cannot get a lock on (with the + * BH_Lock state bit), any buffer that appears to be clean when doing a write + * request, and any buffer that appears to be up-to-date when doing read + * request. Further it marks as clean buffers that are processed for + * writing (the buffer cache won't assume that they are actually clean + * until the buffer gets unlocked). + * + * ll_rw_block sets b_end_io to simple completion handler that marks + * the buffer up-to-date (if appropriate), unlocks the buffer and wakes + * any waiters. + * + * All of the buffers must be for the same device, and must also be a + * multiple of the current approved size for the device. + */ +void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (!trylock_buffer(bh)) + continue; + if (rw == WRITE) { + if (test_clear_buffer_dirty(bh)) { + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE, bh); + continue; + } + } else { + if (!buffer_uptodate(bh)) { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(rw, bh); + continue; + } + } + unlock_buffer(bh); + } +} +EXPORT_SYMBOL(ll_rw_block); + +void write_dirty_buffer(struct buffer_head *bh, int rw) +{ + lock_buffer(bh); + if (!test_clear_buffer_dirty(bh)) { + unlock_buffer(bh); + return; + } + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(rw, bh); +} +EXPORT_SYMBOL(write_dirty_buffer); + +/* + * For a data-integrity writeout, we need to wait upon any in-progress I/O + * and then start new I/O and then wait upon it. The caller must have a ref on + * the buffer_head. + */ +int __sync_dirty_buffer(struct buffer_head *bh, int rw) +{ + int ret = 0; + + WARN_ON(atomic_read(&bh->b_count) < 1); + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(rw, bh); + wait_on_buffer(bh); + if (!ret && !buffer_uptodate(bh)) + ret = -EIO; + } else { + unlock_buffer(bh); + } + return ret; +} +EXPORT_SYMBOL(__sync_dirty_buffer); + +int sync_dirty_buffer(struct buffer_head *bh) +{ + return __sync_dirty_buffer(bh, WRITE_SYNC); +} +EXPORT_SYMBOL(sync_dirty_buffer); + +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and releases them if so. + * + * Exclusion against try_to_free_buffers may be obtained by either + * locking the page or by holding its mapping's private_lock. + * + * If the page is dirty but all the buffers are clean then we need to + * be sure to mark the page clean as well. This is because the page + * may be against a block device, and a later reattachment of buffers + * to a dirty page will set *all* buffers dirty. Which would corrupt + * filesystem data on the same device. + * + * The same applies to regular filesystem pages: if all the buffers are + * clean then we set the page clean and proceed. To do that, we require + * total exclusion from __set_page_dirty_buffers(). That is obtained with + * private_lock. + * + * try_to_free_buffers() is non-blocking. + */ +static inline int buffer_busy(struct buffer_head *bh) +{ + return atomic_read(&bh->b_count) | + (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); +} + +static int +drop_buffers(struct page *page, struct buffer_head **buffers_to_free) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh; + + bh = head; + do { + if (buffer_write_io_error(bh) && page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + if (buffer_busy(bh)) + goto failed; + bh = bh->b_this_page; + } while (bh != head); + + do { + struct buffer_head *next = bh->b_this_page; + + if (bh->b_assoc_map) + __remove_assoc_queue(bh); + bh = next; + } while (bh != head); + *buffers_to_free = head; + __clear_page_buffers(page); + return 1; +failed: + return 0; +} + +int try_to_free_buffers(struct page *page) +{ + struct address_space * const mapping = page->mapping; + struct buffer_head *buffers_to_free = NULL; + int ret = 0; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping == NULL) { /* can this still happen? */ + ret = drop_buffers(page, &buffers_to_free); + goto out; + } + + spin_lock(&mapping->private_lock); + ret = drop_buffers(page, &buffers_to_free); + + /* + * If the filesystem writes its buffers by hand (eg ext3) + * then we can have clean buffers against a dirty page. We + * clean the page here; otherwise the VM will never notice + * that the filesystem did any IO at all. + * + * Also, during truncate, discard_buffer will have marked all + * the page's buffers clean. We discover that here and clean + * the page also. + * + * private_lock must be held over this entire operation in order + * to synchronise against __set_page_dirty_buffers and prevent the + * dirty bit from being lost. + */ + if (ret && TestClearPageDirty(page)) + account_page_cleaned(page, mapping); + spin_unlock(&mapping->private_lock); +out: + if (buffers_to_free) { + struct buffer_head *bh = buffers_to_free; + + do { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } while (bh != buffers_to_free); + } + return ret; +} +EXPORT_SYMBOL(try_to_free_buffers); + +/* + * There are no bdflush tunables left. But distributions are + * still running obsolete flush daemons, so we terminate them here. + * + * Use of bdflush() is deprecated and will be removed in a future kernel. + * The `flush-X' kernel threads fully replace bdflush daemons and this call. + */ +SYSCALL_DEFINE2(bdflush, int, func, long, data) +{ + static int msg_count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the obsolete bdflush" + " system call\n", current->comm); + printk(KERN_INFO "Fix your initscripts?\n"); + } + + if (func == 1) + do_exit(0); + return 0; +} + +/* + * Buffer-head allocation + */ +static struct kmem_cache *bh_cachep __read_mostly; + +/* + * Once the number of bh's in the machine exceeds this level, we start + * stripping them in writeback. + */ +static unsigned long max_buffer_heads; + +int buffer_heads_over_limit; + +struct bh_accounting { + int nr; /* Number of live bh's */ + int ratelimit; /* Limit cacheline bouncing */ +}; + +static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; + +static void recalc_bh_state(void) +{ + int i; + int tot = 0; + + if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096) + return; + __this_cpu_write(bh_accounting.ratelimit, 0); + for_each_online_cpu(i) + tot += per_cpu(bh_accounting, i).nr; + buffer_heads_over_limit = (tot > max_buffer_heads); +} + +struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) +{ + struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); + if (ret) { + INIT_LIST_HEAD(&ret->b_assoc_buffers); + buffer_head_init_locks(ret); + preempt_disable(); + __this_cpu_inc(bh_accounting.nr); + recalc_bh_state(); + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(alloc_buffer_head); + +void free_buffer_head(struct buffer_head *bh) +{ + BUG_ON(!list_empty(&bh->b_assoc_buffers)); + kmem_cache_free(bh_cachep, bh); + preempt_disable(); + __this_cpu_dec(bh_accounting.nr); + recalc_bh_state(); + preempt_enable(); +} +EXPORT_SYMBOL(free_buffer_head); + +static void buffer_exit_cpu(int cpu) +{ + int i; + struct bh_lru *b = &per_cpu(bh_lrus, cpu); + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); + per_cpu(bh_accounting, cpu).nr = 0; +} + +static int buffer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + buffer_exit_cpu((unsigned long)hcpu); + return NOTIFY_OK; +} + +/** + * bh_uptodate_or_lock - Test whether the buffer is uptodate + * @bh: struct buffer_head + * + * Return true if the buffer is up-to-date and false, + * with the buffer locked, if not. + */ +int bh_uptodate_or_lock(struct buffer_head *bh) +{ + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + if (!buffer_uptodate(bh)) + return 0; + unlock_buffer(bh); + } + return 1; +} +EXPORT_SYMBOL(bh_uptodate_or_lock); + +/** + * bh_submit_read - Submit a locked buffer for reading + * @bh: struct buffer_head + * + * Returns zero on success and -EIO on error. + */ +int bh_submit_read(struct buffer_head *bh) +{ + BUG_ON(!buffer_locked(bh)); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return 0; + return -EIO; +} +EXPORT_SYMBOL(bh_submit_read); + +void __init buffer_init(void) +{ + unsigned long nrpages; + + bh_cachep = kmem_cache_create("buffer_head", + sizeof(struct buffer_head), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + NULL); + + /* + * Limit the bh occupancy to 10% of ZONE_NORMAL + */ + nrpages = (nr_free_buffer_pages() * 10) / 100; + max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); + hotcpu_notifier(buffer_cpu_notify, 0); +} diff --git a/kernel/fs/cachefiles/Kconfig b/kernel/fs/cachefiles/Kconfig new file mode 100644 index 000000000..80e9c6167 --- /dev/null +++ b/kernel/fs/cachefiles/Kconfig @@ -0,0 +1,39 @@ + +config CACHEFILES + tristate "Filesystem caching on files" + depends on FSCACHE && BLOCK + help + This permits use of a mounted filesystem as a cache for other + filesystems - primarily networking filesystems - thus allowing fast + local disk to enhance the speed of slower devices. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. + +config CACHEFILES_DEBUG + bool "Debug CacheFiles" + depends on CACHEFILES + help + This permits debugging to be dynamically enabled in the filesystem + caching on files module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/cachefiles/parameter/debug or + by including a debugging specifier in /etc/cachefilesd.conf. + +config CACHEFILES_HISTOGRAM + bool "Gather latency information on CacheFiles" + depends on CACHEFILES && PROC_FS + help + + This option causes latency information to be gathered on CacheFiles + operation and exported through file: + + /proc/fs/cachefiles/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. diff --git a/kernel/fs/cachefiles/Makefile b/kernel/fs/cachefiles/Makefile new file mode 100644 index 000000000..32cbab0ff --- /dev/null +++ b/kernel/fs/cachefiles/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for caching in a mounted filesystem +# + +cachefiles-y := \ + bind.o \ + daemon.o \ + interface.o \ + key.o \ + main.o \ + namei.o \ + rdwr.o \ + security.o \ + xattr.o + +cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o + +obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/kernel/fs/cachefiles/bind.c b/kernel/fs/cachefiles/bind.c new file mode 100644 index 000000000..6af790fc3 --- /dev/null +++ b/kernel/fs/cachefiles/bind.c @@ -0,0 +1,277 @@ +/* Bind and unbind a cache from the filesystem backing it + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); + +/* + * bind a directory as a cache + */ +int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) +{ + _enter("{%u,%u,%u,%u,%u,%u},%s", + cache->frun_percent, + cache->fcull_percent, + cache->fstop_percent, + cache->brun_percent, + cache->bcull_percent, + cache->bstop_percent, + args); + + /* start by checking things over */ + ASSERT(cache->fstop_percent >= 0 && + cache->fstop_percent < cache->fcull_percent && + cache->fcull_percent < cache->frun_percent && + cache->frun_percent < 100); + + ASSERT(cache->bstop_percent >= 0 && + cache->bstop_percent < cache->bcull_percent && + cache->bcull_percent < cache->brun_percent && + cache->brun_percent < 100); + + if (*args) { + pr_err("'bind' command doesn't take an argument\n"); + return -EINVAL; + } + + if (!cache->rootdirname) { + pr_err("No cache directory specified\n"); + return -EINVAL; + } + + /* don't permit already bound caches to be re-bound */ + if (test_bit(CACHEFILES_READY, &cache->flags)) { + pr_err("Cache already bound\n"); + return -EBUSY; + } + + /* make sure we have copies of the tag and dirname strings */ + if (!cache->tag) { + /* the tag string is released by the fops->release() + * function, so we don't release it on error here */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; + } + + /* add the cache */ + return cachefiles_daemon_add_cache(cache); +} + +/* + * add a cache + */ +static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) +{ + struct cachefiles_object *fsdef; + struct path path; + struct kstatfs stats; + struct dentry *graveyard, *cachedir, *root; + const struct cred *saved_cred; + int ret; + + _enter(""); + + /* we want to work under the module's security ID */ + ret = cachefiles_get_security_ID(cache); + if (ret < 0) + return ret; + + cachefiles_begin_secure(cache, &saved_cred); + + /* allocate the root index object */ + ret = -ENOMEM; + + fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!fsdef) + goto error_root_object; + + ASSERTCMP(fsdef->backer, ==, NULL); + + atomic_set(&fsdef->usage, 1); + fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; + + _debug("- fsdef %p", fsdef); + + /* look up the directory at the root of the cache */ + ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); + if (ret < 0) + goto error_open_root; + + cache->mnt = path.mnt; + root = path.dentry; + + /* check parameters */ + ret = -EOPNOTSUPP; + if (d_is_negative(root) || + !d_backing_inode(root)->i_op->lookup || + !d_backing_inode(root)->i_op->mkdir || + !d_backing_inode(root)->i_op->setxattr || + !d_backing_inode(root)->i_op->getxattr || + !root->d_sb->s_op->statfs || + !root->d_sb->s_op->sync_fs) + goto error_unsupported; + + ret = -EROFS; + if (root->d_sb->s_flags & MS_RDONLY) + goto error_unsupported; + + /* determine the security of the on-disk cache as this governs + * security ID of files we create */ + ret = cachefiles_determine_cache_security(cache, root, &saved_cred); + if (ret < 0) + goto error_unsupported; + + /* get the cache size and blocksize */ + ret = vfs_statfs(&path, &stats); + if (ret < 0) + goto error_unsupported; + + ret = -ERANGE; + if (stats.f_bsize <= 0) + goto error_unsupported; + + ret = -EOPNOTSUPP; + if (stats.f_bsize > PAGE_SIZE) + goto error_unsupported; + + cache->bsize = stats.f_bsize; + cache->bshift = 0; + if (stats.f_bsize < PAGE_SIZE) + cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); + + _debug("blksize %u (shift %u)", + cache->bsize, cache->bshift); + + _debug("size %llu, avail %llu", + (unsigned long long) stats.f_blocks, + (unsigned long long) stats.f_bavail); + + /* set up caching limits */ + do_div(stats.f_files, 100); + cache->fstop = stats.f_files * cache->fstop_percent; + cache->fcull = stats.f_files * cache->fcull_percent; + cache->frun = stats.f_files * cache->frun_percent; + + _debug("limits {%llu,%llu,%llu} files", + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop); + + stats.f_blocks >>= cache->bshift; + do_div(stats.f_blocks, 100); + cache->bstop = stats.f_blocks * cache->bstop_percent; + cache->bcull = stats.f_blocks * cache->bcull_percent; + cache->brun = stats.f_blocks * cache->brun_percent; + + _debug("limits {%llu,%llu,%llu} blocks", + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop); + + /* get the cache directory and check its type */ + cachedir = cachefiles_get_directory(cache, root, "cache"); + if (IS_ERR(cachedir)) { + ret = PTR_ERR(cachedir); + goto error_unsupported; + } + + fsdef->dentry = cachedir; + fsdef->fscache.cookie = NULL; + + ret = cachefiles_check_object_type(fsdef); + if (ret < 0) + goto error_unsupported; + + /* get the graveyard directory */ + graveyard = cachefiles_get_directory(cache, root, "graveyard"); + if (IS_ERR(graveyard)) { + ret = PTR_ERR(graveyard); + goto error_unsupported; + } + + cache->graveyard = graveyard; + + /* publish the cache */ + fscache_init_cache(&cache->cache, + &cachefiles_cache_ops, + "%s", + fsdef->dentry->d_sb->s_id); + + fscache_object_init(&fsdef->fscache, NULL, &cache->cache); + + ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); + if (ret < 0) + goto error_add_cache; + + /* done */ + set_bit(CACHEFILES_READY, &cache->flags); + dput(root); + + pr_info("File cache on %s registered\n", cache->cache.identifier); + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + cachefiles_end_secure(cache, saved_cred); + return 0; + +error_add_cache: + dput(cache->graveyard); + cache->graveyard = NULL; +error_unsupported: + mntput(cache->mnt); + cache->mnt = NULL; + dput(fsdef->dentry); + fsdef->dentry = NULL; + dput(root); +error_open_root: + kmem_cache_free(cachefiles_object_jar, fsdef); +error_root_object: + cachefiles_end_secure(cache, saved_cred); + pr_err("Failed to register: %d\n", ret); + return ret; +} + +/* + * unbind a cache on fd release + */ +void cachefiles_daemon_unbind(struct cachefiles_cache *cache) +{ + _enter(""); + + if (test_bit(CACHEFILES_READY, &cache->flags)) { + pr_info("File cache on %s unregistering\n", + cache->cache.identifier); + + fscache_withdraw_cache(&cache->cache); + } + + dput(cache->graveyard); + mntput(cache->mnt); + + kfree(cache->rootdirname); + kfree(cache->secctx); + kfree(cache->tag); + + _leave(""); +} diff --git a/kernel/fs/cachefiles/daemon.c b/kernel/fs/cachefiles/daemon.c new file mode 100644 index 000000000..f601def05 --- /dev/null +++ b/kernel/fs/cachefiles/daemon.c @@ -0,0 +1,751 @@ +/* Daemon interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int cachefiles_daemon_open(struct inode *, struct file *); +static int cachefiles_daemon_release(struct inode *, struct file *); +static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t, + loff_t *); +static ssize_t cachefiles_daemon_write(struct file *, const char __user *, + size_t, loff_t *); +static unsigned int cachefiles_daemon_poll(struct file *, + struct poll_table_struct *); +static int cachefiles_daemon_frun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_brun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_cull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_debug(struct cachefiles_cache *, char *); +static int cachefiles_daemon_dir(struct cachefiles_cache *, char *); +static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *); +static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *); +static int cachefiles_daemon_tag(struct cachefiles_cache *, char *); + +static unsigned long cachefiles_open; + +const struct file_operations cachefiles_daemon_fops = { + .owner = THIS_MODULE, + .open = cachefiles_daemon_open, + .release = cachefiles_daemon_release, + .read = cachefiles_daemon_read, + .write = cachefiles_daemon_write, + .poll = cachefiles_daemon_poll, + .llseek = noop_llseek, +}; + +struct cachefiles_daemon_cmd { + char name[8]; + int (*handler)(struct cachefiles_cache *cache, char *args); +}; + +static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { + { "bind", cachefiles_daemon_bind }, + { "brun", cachefiles_daemon_brun }, + { "bcull", cachefiles_daemon_bcull }, + { "bstop", cachefiles_daemon_bstop }, + { "cull", cachefiles_daemon_cull }, + { "debug", cachefiles_daemon_debug }, + { "dir", cachefiles_daemon_dir }, + { "frun", cachefiles_daemon_frun }, + { "fcull", cachefiles_daemon_fcull }, + { "fstop", cachefiles_daemon_fstop }, + { "inuse", cachefiles_daemon_inuse }, + { "secctx", cachefiles_daemon_secctx }, + { "tag", cachefiles_daemon_tag }, + { "", NULL } +}; + + +/* + * do various checks + */ +static int cachefiles_daemon_open(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache; + + _enter(""); + + /* only the superuser may do this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* the cachefiles device may only be open once at a time */ + if (xchg(&cachefiles_open, 1) == 1) + return -EBUSY; + + /* allocate a cache record */ + cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL); + if (!cache) { + cachefiles_open = 0; + return -ENOMEM; + } + + mutex_init(&cache->daemon_mutex); + cache->active_nodes = RB_ROOT; + rwlock_init(&cache->active_lock); + init_waitqueue_head(&cache->daemon_pollwq); + + /* set default caching limits + * - limit at 1% free space and/or free files + * - cull below 5% free space and/or free files + * - cease culling above 7% free space and/or free files + */ + cache->frun_percent = 7; + cache->fcull_percent = 5; + cache->fstop_percent = 1; + cache->brun_percent = 7; + cache->bcull_percent = 5; + cache->bstop_percent = 1; + + file->private_data = cache; + cache->cachefilesd = file; + return 0; +} + +/* + * release a cache + */ +static int cachefiles_daemon_release(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache = file->private_data; + + _enter(""); + + ASSERT(cache); + + set_bit(CACHEFILES_DEAD, &cache->flags); + + cachefiles_daemon_unbind(cache); + + ASSERT(!cache->active_nodes.rb_node); + + /* clean up the control file interface */ + cache->cachefilesd = NULL; + file->private_data = NULL; + cachefiles_open = 0; + + kfree(cache); + + _leave(""); + return 0; +} + +/* + * read the cache state + */ +static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, + size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + char buffer[256]; + int n; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + + /* summarise */ + clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + + n = snprintf(buffer, sizeof(buffer), + "cull=%c" + " frun=%llx" + " fcull=%llx" + " fstop=%llx" + " brun=%llx" + " bcull=%llx" + " bstop=%llx", + test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop, + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop + ); + + if (n > buflen) + return -EMSGSIZE; + + if (copy_to_user(_buffer, buffer, n) != 0) + return -EFAULT; + + return n; +} + +/* + * command the cache + */ +static ssize_t cachefiles_daemon_write(struct file *file, + const char __user *_data, + size_t datalen, + loff_t *pos) +{ + const struct cachefiles_daemon_cmd *cmd; + struct cachefiles_cache *cache = file->private_data; + ssize_t ret; + char *data, *args, *cp; + + //_enter(",,%zu,", datalen); + + ASSERT(cache); + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) + return -EIO; + + if (datalen < 0 || datalen > PAGE_SIZE - 1) + return -EOPNOTSUPP; + + /* drag the command string into the kernel so we can parse it */ + data = kmalloc(datalen + 1, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, _data, datalen) != 0) + goto error; + + data[datalen] = '\0'; + + ret = -EINVAL; + if (memchr(data, '\0', datalen)) + goto error; + + /* strip any newline */ + cp = memchr(data, '\n', datalen); + if (cp) { + if (cp == data) + goto error; + + *cp = '\0'; + } + + /* parse the command */ + ret = -EOPNOTSUPP; + + for (args = data; *args; args++) + if (isspace(*args)) + break; + if (*args) { + if (args == data) + goto error; + *args = '\0'; + args = skip_spaces(++args); + } + + /* run the appropriate command handler */ + for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++) + if (strcmp(cmd->name, data) == 0) + goto found_command; + +error: + kfree(data); + //_leave(" = %zd", ret); + return ret; + +found_command: + mutex_lock(&cache->daemon_mutex); + + ret = -EIO; + if (!test_bit(CACHEFILES_DEAD, &cache->flags)) + ret = cmd->handler(cache, args); + + mutex_unlock(&cache->daemon_mutex); + + if (ret == 0) + ret = datalen; + goto error; +} + +/* + * poll for culling state + * - use POLLOUT to indicate culling state + */ +static unsigned int cachefiles_daemon_poll(struct file *file, + struct poll_table_struct *poll) +{ + struct cachefiles_cache *cache = file->private_data; + unsigned int mask; + + poll_wait(file, &cache->daemon_pollwq, poll); + mask = 0; + + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= POLLIN; + + if (test_bit(CACHEFILES_CULLING, &cache->flags)) + mask |= POLLOUT; + + return mask; +} + +/* + * give a range error for cache space constraints + * - can be tail-called + */ +static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, + char *args) +{ + pr_err("Free space limits must be in range 0%%<=stop%" + */ +static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) +{ + unsigned long frun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + frun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (frun <= cache->fcull_percent || frun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->frun_percent = frun; + return 0; +} + +/* + * set the percentage of files at which to start culling + * - command: "fcull %" + */ +static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long fcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fcull_percent = fcull; + return 0; +} + +/* + * set the percentage of files at which to stop allocating + * - command: "fstop %" + */ +static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long fstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fstop = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fstop < 0 || fstop >= cache->fcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fstop_percent = fstop; + return 0; +} + +/* + * set the percentage of blocks at which to stop culling + * - command: "brun %" + */ +static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) +{ + unsigned long brun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + brun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (brun <= cache->bcull_percent || brun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->brun_percent = brun; + return 0; +} + +/* + * set the percentage of blocks at which to start culling + * - command: "bcull %" + */ +static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long bcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bcull_percent = bcull; + return 0; +} + +/* + * set the percentage of blocks at which to stop allocating + * - command: "bstop %" + */ +static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long bstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bstop = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (bstop < 0 || bstop >= cache->bcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bstop_percent = bstop; + return 0; +} + +/* + * set the cache directory + * - command: "dir " + */ +static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) +{ + char *dir; + + _enter(",%s", args); + + if (!*args) { + pr_err("Empty directory specified\n"); + return -EINVAL; + } + + if (cache->rootdirname) { + pr_err("Second cache directory specified\n"); + return -EEXIST; + } + + dir = kstrdup(args, GFP_KERNEL); + if (!dir) + return -ENOMEM; + + cache->rootdirname = dir; + return 0; +} + +/* + * set the cache security context + * - command: "secctx " + */ +static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) +{ + char *secctx; + + _enter(",%s", args); + + if (!*args) { + pr_err("Empty security context specified\n"); + return -EINVAL; + } + + if (cache->secctx) { + pr_err("Second security context specified\n"); + return -EINVAL; + } + + secctx = kstrdup(args, GFP_KERNEL); + if (!secctx) + return -ENOMEM; + + cache->secctx = secctx; + return 0; +} + +/* + * set the cache tag + * - command: "tag " + */ +static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) +{ + char *tag; + + _enter(",%s", args); + + if (!*args) { + pr_err("Empty tag specified\n"); + return -EINVAL; + } + + if (cache->tag) + return -EEXIST; + + tag = kstrdup(args, GFP_KERNEL); + if (!tag) + return -ENOMEM; + + cache->tag = tag; + return 0; +} + +/* + * request a node in the cache be culled from the current working directory + * - command: "cull " + */ +static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) +{ + struct path path; + const struct cred *saved_cred; + int ret; + + _enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + pr_err("cull applied to unready cache\n"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + pr_err("cull applied to dead cache\n"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + get_fs_pwd(current->fs, &path); + + if (!d_can_lookup(path.dentry)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_cull(cache, path.dentry, args); + cachefiles_end_secure(cache, saved_cred); + + path_put(&path); + _leave(" = %d", ret); + return ret; + +notdir: + path_put(&path); + pr_err("cull command requires dirfd to be a directory\n"); + return -ENOTDIR; + +inval: + pr_err("cull command requires dirfd and filename\n"); + return -EINVAL; +} + +/* + * set debugging mode + * - command: "debug " + */ +static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) +{ + unsigned long mask; + + _enter(",%s", args); + + mask = simple_strtoul(args, &args, 0); + if (args[0] != '\0') + goto inval; + + cachefiles_debug = mask; + _leave(" = 0"); + return 0; + +inval: + pr_err("debug command requires mask\n"); + return -EINVAL; +} + +/* + * find out whether an object in the current working directory is in use or not + * - command: "inuse " + */ +static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) +{ + struct path path; + const struct cred *saved_cred; + int ret; + + //_enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + pr_err("inuse applied to unready cache\n"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + pr_err("inuse applied to dead cache\n"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + get_fs_pwd(current->fs, &path); + + if (!d_can_lookup(path.dentry)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_in_use(cache, path.dentry, args); + cachefiles_end_secure(cache, saved_cred); + + path_put(&path); + //_leave(" = %d", ret); + return ret; + +notdir: + path_put(&path); + pr_err("inuse command requires dirfd to be a directory\n"); + return -ENOTDIR; + +inval: + pr_err("inuse command requires dirfd and filename\n"); + return -EINVAL; +} + +/* + * see if we have space for a number of pages and/or a number of files in the + * cache + */ +int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr) +{ + struct kstatfs stats; + struct path path = { + .mnt = cache->mnt, + .dentry = cache->mnt->mnt_root, + }; + int ret; + + //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", + // (unsigned long long) cache->frun, + // (unsigned long long) cache->fcull, + // (unsigned long long) cache->fstop, + // (unsigned long long) cache->brun, + // (unsigned long long) cache->bcull, + // (unsigned long long) cache->bstop, + // fnr, bnr); + + /* find out how many pages of blockdev are available */ + memset(&stats, 0, sizeof(stats)); + + ret = vfs_statfs(&path, &stats); + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error(cache, "statfs failed"); + _leave(" = %d", ret); + return ret; + } + + stats.f_bavail >>= cache->bshift; + + //_debug("avail %llu,%llu", + // (unsigned long long) stats.f_ffree, + // (unsigned long long) stats.f_bavail); + + /* see if there is sufficient space */ + if (stats.f_ffree > fnr) + stats.f_ffree -= fnr; + else + stats.f_ffree = 0; + + if (stats.f_bavail > bnr) + stats.f_bavail -= bnr; + else + stats.f_bavail = 0; + + ret = -ENOBUFS; + if (stats.f_ffree < cache->fstop || + stats.f_bavail < cache->bstop) + goto begin_cull; + + ret = 0; + if (stats.f_ffree < cache->fcull || + stats.f_bavail < cache->bcull) + goto begin_cull; + + if (test_bit(CACHEFILES_CULLING, &cache->flags) && + stats.f_ffree >= cache->frun && + stats.f_bavail >= cache->brun && + test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) + ) { + _debug("cease culling"); + cachefiles_state_changed(cache); + } + + //_leave(" = 0"); + return 0; + +begin_cull: + if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { + _debug("### CULL CACHE ###"); + cachefiles_state_changed(cache); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/kernel/fs/cachefiles/interface.c b/kernel/fs/cachefiles/interface.c new file mode 100644 index 000000000..afa023dde --- /dev/null +++ b/kernel/fs/cachefiles/interface.c @@ -0,0 +1,557 @@ +/* FS-Cache interface to CacheFiles + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include "internal.h" + +struct cachefiles_lookup_data { + struct cachefiles_xattr *auxdata; /* auxiliary data */ + char *key; /* key path */ +}; + +static int cachefiles_attr_changed(struct fscache_object *_object); + +/* + * allocate an object record for a cookie lookup and prepare the lookup data + */ +static struct fscache_object *cachefiles_alloc_object( + struct fscache_cache *_cache, + struct fscache_cookie *cookie) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct cachefiles_xattr *auxdata; + unsigned keylen, auxlen; + void *buffer; + char *key; + + cache = container_of(_cache, struct cachefiles_cache, cache); + + _enter("{%s},%p,", cache->cache.identifier, cookie); + + lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp); + if (!lookup_data) + goto nomem_lookup_data; + + /* create a new object record and a temporary leaf image */ + object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp); + if (!object) + goto nomem_object; + + ASSERTCMP(object->backer, ==, NULL); + + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + atomic_set(&object->usage, 1); + + fscache_object_init(&object->fscache, cookie, &cache->cache); + + object->type = cookie->def->type; + + /* get hold of the raw key + * - stick the length on the front and leave space on the back for the + * encoder + */ + buffer = kmalloc((2 + 512) + 3, cachefiles_gfp); + if (!buffer) + goto nomem_buffer; + + keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512); + ASSERTCMP(keylen, <, 512); + + *(uint16_t *)buffer = keylen; + ((char *)buffer)[keylen + 2] = 0; + ((char *)buffer)[keylen + 3] = 0; + ((char *)buffer)[keylen + 4] = 0; + + /* turn the raw key into something that can work with as a filename */ + key = cachefiles_cook_key(buffer, keylen + 2, object->type); + if (!key) + goto nomem_key; + + /* get hold of the auxiliary data and prepend the object type */ + auxdata = buffer; + auxlen = 0; + if (cookie->def->get_aux) { + auxlen = cookie->def->get_aux(cookie->netfs_data, + auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + } + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + lookup_data->auxdata = auxdata; + lookup_data->key = key; + object->lookup_data = lookup_data; + + _leave(" = %p [%p]", &object->fscache, lookup_data); + return &object->fscache; + +nomem_key: + kfree(buffer); +nomem_buffer: + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(&cache->cache); +nomem_object: + kfree(lookup_data); +nomem_lookup_data: + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * attempt to look up the nominated node in this cache + * - return -ETIMEDOUT to be scheduled again + */ +static int cachefiles_lookup_object(struct fscache_object *_object) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *parent, *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", _object->debug_id); + + cache = container_of(_object->cache, struct cachefiles_cache, cache); + parent = container_of(_object->parent, + struct cachefiles_object, fscache); + object = container_of(_object, struct cachefiles_object, fscache); + lookup_data = object->lookup_data; + + ASSERTCMP(lookup_data, !=, NULL); + + /* look up the key, creating any missing bits */ + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_walk_to_object(parent, object, + lookup_data->key, + lookup_data->auxdata); + cachefiles_end_secure(cache, saved_cred); + + /* polish off by setting the attributes of non-index files */ + if (ret == 0 && + object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + cachefiles_attr_changed(&object->fscache); + + if (ret < 0 && ret != -ETIMEDOUT) { + if (ret != -ENOBUFS) + pr_warn("Lookup failed error %d\n", ret); + fscache_object_lookup_error(&object->fscache); + } + + _leave(" [%d]", ret); + return ret; +} + +/* + * indication of lookup completion + */ +static void cachefiles_lookup_complete(struct fscache_object *_object) +{ + struct cachefiles_object *object; + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } +} + +/* + * increment the usage count on an inode object (may fail if unmounting) + */ +static +struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) +{ + struct cachefiles_object *object = + container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + atomic_inc(&object->usage); + return &object->fscache; +} + +/* + * update the auxiliary data for an object object on disk + */ +static void cachefiles_update_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_xattr *auxdata; + struct cachefiles_cache *cache; + struct fscache_cookie *cookie; + const struct cred *saved_cred; + unsigned auxlen; + + _enter("{OBJ%x}", _object->debug_id); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, struct cachefiles_cache, + cache); + + if (!fscache_use_cookie(_object)) { + _leave(" [relinq]"); + return; + } + + cookie = object->fscache.cookie; + + if (!cookie->def->get_aux) { + fscache_unuse_cookie(_object); + _leave(" [no aux]"); + return; + } + + auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); + if (!auxdata) { + fscache_unuse_cookie(_object); + _leave(" [nomem]"); + return; + } + + auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + fscache_unuse_cookie(_object); + ASSERTCMP(auxlen, <, 511); + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_update_object_xattr(object, auxdata); + cachefiles_end_secure(cache, saved_cred); + kfree(auxdata); + _leave(""); +} + +/* + * discard the resources pinned by an object and effect retirement if + * requested + */ +static void cachefiles_drop_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + /* We need to tidy the object up if we did in fact manage to open it. + * It's possible for us to get here before the object is fully + * initialised if the parent goes away or the object gets retired + * before we set it up. + */ + if (object->dentry) { + /* delete retired objects */ + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && + _object != cache->cache.fsdef + ) { + _debug("- retire object OBJ%x", object->fscache.debug_id); + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_delete_object(cache, object); + cachefiles_end_secure(cache, saved_cred); + } + + /* close the filesystem stuff attached to the object */ + if (object->backer != object->dentry) + dput(object->backer); + object->backer = NULL; + } + + /* note that the object is now inactive */ + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + write_lock(&cache->active_lock); + if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE, + &object->flags)) + BUG(); + rb_erase(&object->active_node, &cache->active_nodes); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); + } + + dput(object->dentry); + object->dentry = NULL; + + _leave(""); +} + +/* + * dispose of a reference to an object + */ +static void cachefiles_put_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct fscache_cache *cache; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + ASSERTIFCMP(object->fscache.parent, + object->fscache.parent->n_children, >, 0); + + if (atomic_dec_and_test(&object->usage)) { + _debug("- kill object OBJ%x", object->fscache.debug_id); + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + ASSERTCMP(object->fscache.parent, ==, NULL); + ASSERTCMP(object->backer, ==, NULL); + ASSERTCMP(object->dentry, ==, NULL); + ASSERTCMP(object->fscache.n_ops, ==, 0); + ASSERTCMP(object->fscache.n_children, ==, 0); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } + + cache = object->fscache.cache; + fscache_object_destroy(&object->fscache); + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(cache); + } + + _leave(""); +} + +/* + * sync a cache + */ +static void cachefiles_sync_cache(struct fscache_cache *_cache) +{ + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("%p", _cache); + + cache = container_of(_cache, struct cachefiles_cache, cache); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefiles_begin_secure(cache, &saved_cred); + down_read(&cache->mnt->mnt_sb->s_umount); + ret = sync_filesystem(cache->mnt->mnt_sb); + up_read(&cache->mnt->mnt_sb->s_umount); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) + cachefiles_io_error(cache, + "Attempt to sync backing fs superblock" + " returned error %d", + ret); +} + +/* + * check if the backing cache is updated to FS-Cache + * - called by FS-Cache when evaluates if need to invalidate the cache + */ +static bool cachefiles_check_consistency(struct fscache_operation *op) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", op->object->debug_id); + + object = container_of(op->object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_auxdata(object); + cachefiles_end_secure(cache, saved_cred); + + _leave(" = %d", ret); + return ret; +} + +/* + * notification the attributes on an object have changed + * - called with reads/writes excluded by FS-Cache + */ +static int cachefiles_attr_changed(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct iattr newattrs; + uint64_t ni_size; + loff_t oi_size; + int ret; + + _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size); + + _enter("{OBJ%x},[%llu]", + _object->debug_id, (unsigned long long) ni_size); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (ni_size == object->i_size) + return 0; + + if (!object->backer) + return -ENOBUFS; + + ASSERT(d_is_reg(object->backer)); + + fscache_set_store_limit(&object->fscache, ni_size); + + oi_size = i_size_read(d_backing_inode(object->backer)); + if (oi_size == ni_size) + return 0; + + cachefiles_begin_secure(cache, &saved_cred); + mutex_lock(&d_inode(object->backer)->i_mutex); + + /* if there's an extension to a partial page at the end of the backing + * file, we need to discard the partial page so that we pick up new + * data after it */ + if (oi_size & ~PAGE_MASK && ni_size > oi_size) { + _debug("discard tail %llx", oi_size); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = oi_size & PAGE_MASK; + ret = notify_change(object->backer, &newattrs, NULL); + if (ret < 0) + goto truncate_failed; + } + + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = ni_size; + ret = notify_change(object->backer, &newattrs, NULL); + +truncate_failed: + mutex_unlock(&d_inode(object->backer)->i_mutex); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) { + fscache_set_store_limit(&object->fscache, 0); + cachefiles_io_error_obj(object, "Size set failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * Invalidate an object + */ +static void cachefiles_invalidate_object(struct fscache_operation *op) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct path path; + uint64_t ni_size; + int ret; + + object = container_of(op->object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + op->object->cookie->def->get_attr(op->object->cookie->netfs_data, + &ni_size); + + _enter("{OBJ%x},[%llu]", + op->object->debug_id, (unsigned long long)ni_size); + + if (object->backer) { + ASSERT(d_is_reg(object->backer)); + + fscache_set_store_limit(&object->fscache, ni_size); + + path.dentry = object->backer; + path.mnt = cache->mnt; + + cachefiles_begin_secure(cache, &saved_cred); + ret = vfs_truncate(&path, 0); + if (ret == 0) + ret = vfs_truncate(&path, ni_size); + cachefiles_end_secure(cache, saved_cred); + + if (ret != 0) { + fscache_set_store_limit(&object->fscache, 0); + if (ret == -EIO) + cachefiles_io_error_obj(object, + "Invalidate failed"); + } + } + + fscache_op_complete(op, true); + _leave(""); +} + +/* + * dissociate a cache from all the pages it was backing + */ +static void cachefiles_dissociate_pages(struct fscache_cache *cache) +{ + _enter(""); +} + +const struct fscache_cache_ops cachefiles_cache_ops = { + .name = "cachefiles", + .alloc_object = cachefiles_alloc_object, + .lookup_object = cachefiles_lookup_object, + .lookup_complete = cachefiles_lookup_complete, + .grab_object = cachefiles_grab_object, + .update_object = cachefiles_update_object, + .invalidate_object = cachefiles_invalidate_object, + .drop_object = cachefiles_drop_object, + .put_object = cachefiles_put_object, + .sync_cache = cachefiles_sync_cache, + .attr_changed = cachefiles_attr_changed, + .read_or_alloc_page = cachefiles_read_or_alloc_page, + .read_or_alloc_pages = cachefiles_read_or_alloc_pages, + .allocate_page = cachefiles_allocate_page, + .allocate_pages = cachefiles_allocate_pages, + .write_page = cachefiles_write_page, + .uncache_page = cachefiles_uncache_page, + .dissociate_pages = cachefiles_dissociate_pages, + .check_consistency = cachefiles_check_consistency, +}; diff --git a/kernel/fs/cachefiles/internal.h b/kernel/fs/cachefiles/internal.h new file mode 100644 index 000000000..8c52472d2 --- /dev/null +++ b/kernel/fs/cachefiles/internal.h @@ -0,0 +1,363 @@ +/* General netfs cache on cache files internal defs + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "CacheFiles: " fmt + + +#include +#include +#include +#include +#include + +struct cachefiles_cache; +struct cachefiles_object; + +extern unsigned cachefiles_debug; +#define CACHEFILES_DEBUG_KENTER 1 +#define CACHEFILES_DEBUG_KLEAVE 2 +#define CACHEFILES_DEBUG_KDEBUG 4 + +#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) + +/* + * node records + */ +struct cachefiles_object { + struct fscache_object fscache; /* fscache handle */ + struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ + struct dentry *dentry; /* the file/dir representing this object */ + struct dentry *backer; /* backing file */ + loff_t i_size; /* object size */ + unsigned long flags; +#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ +#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ + atomic_t usage; /* object usage count */ + uint8_t type; /* object type */ + uint8_t new; /* T if object new */ + spinlock_t work_lock; + struct rb_node active_node; /* link in active tree (dentry is key) */ +}; + +extern struct kmem_cache *cachefiles_object_jar; + +/* + * Cache files cache definition + */ +struct cachefiles_cache { + struct fscache_cache cache; /* FS-Cache record */ + struct vfsmount *mnt; /* mountpoint holding the cache */ + struct dentry *graveyard; /* directory into which dead objects go */ + struct file *cachefilesd; /* manager daemon handle */ + const struct cred *cache_cred; /* security override for accessing cache */ + struct mutex daemon_mutex; /* command serialisation mutex */ + wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ + struct rb_root active_nodes; /* active nodes (can't be culled) */ + rwlock_t active_lock; /* lock for active_nodes */ + atomic_t gravecounter; /* graveyard uniquifier */ + unsigned frun_percent; /* when to stop culling (% files) */ + unsigned fcull_percent; /* when to start culling (% files) */ + unsigned fstop_percent; /* when to stop allocating (% files) */ + unsigned brun_percent; /* when to stop culling (% blocks) */ + unsigned bcull_percent; /* when to start culling (% blocks) */ + unsigned bstop_percent; /* when to stop allocating (% blocks) */ + unsigned bsize; /* cache's block size */ + unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + uint64_t frun; /* when to stop culling */ + uint64_t fcull; /* when to start culling */ + uint64_t fstop; /* when to stop allocating */ + sector_t brun; /* when to stop culling */ + sector_t bcull; /* when to start culling */ + sector_t bstop; /* when to stop allocating */ + unsigned long flags; +#define CACHEFILES_READY 0 /* T if cache prepared */ +#define CACHEFILES_DEAD 1 /* T if cache dead */ +#define CACHEFILES_CULLING 2 /* T if cull engaged */ +#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ + char *rootdirname; /* name of cache root directory */ + char *secctx; /* LSM security context */ + char *tag; /* cache binding tag */ +}; + +/* + * backing file read tracking + */ +struct cachefiles_one_read { + wait_queue_t monitor; /* link into monitored waitqueue */ + struct page *back_page; /* backing file page we're waiting for */ + struct page *netfs_page; /* netfs page we're going to fill */ + struct fscache_retrieval *op; /* retrieval op covering this */ + struct list_head op_link; /* link in op's todo list */ +}; + +/* + * backing file write tracking + */ +struct cachefiles_one_write { + struct page *netfs_page; /* netfs page to copy */ + struct cachefiles_object *object; + struct list_head obj_link; /* link in object's lists */ + fscache_rw_complete_t end_io_func; + void *context; +}; + +/* + * auxiliary data xattr buffer + */ +struct cachefiles_xattr { + uint16_t len; + uint8_t type; + uint8_t data[]; +}; + +/* + * note change of state for daemon + */ +static inline void cachefiles_state_changed(struct cachefiles_cache *cache) +{ + set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + wake_up_all(&cache->daemon_pollwq); +} + +/* + * bind.c + */ +extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); +extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); + +/* + * daemon.c + */ +extern const struct file_operations cachefiles_daemon_fops; + +extern int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr); + +/* + * interface.c + */ +extern const struct fscache_cache_ops cachefiles_cache_ops; + +/* + * key.c + */ +extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); + +/* + * namei.c + */ +extern int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object); +extern int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata); +extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *name); + +extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename); + +extern int cachefiles_check_in_use(struct cachefiles_cache *cache, + struct dentry *dir, char *filename); + +/* + * proc.c + */ +#ifdef CONFIG_CACHEFILES_HISTOGRAM +extern atomic_t cachefiles_lookup_histogram[HZ]; +extern atomic_t cachefiles_mkdir_histogram[HZ]; +extern atomic_t cachefiles_create_histogram[HZ]; + +extern int __init cachefiles_proc_init(void); +extern void cachefiles_proc_cleanup(void); +static inline +void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +#else +#define cachefiles_proc_init() (0) +#define cachefiles_proc_cleanup() do {} while (0) +#define cachefiles_hist(hist, start_jif) do {} while (0) +#endif + +/* + * rdwr.c + */ +extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, + struct page *, gfp_t); +extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, + gfp_t); +extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, + gfp_t); +extern int cachefiles_allocate_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, gfp_t); +extern int cachefiles_write_page(struct fscache_storage *, struct page *); +extern void cachefiles_uncache_page(struct fscache_object *, struct page *); + +/* + * security.c + */ +extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); +extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred); + +static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, + const struct cred **_saved_cred) +{ + *_saved_cred = override_creds(cache->cache_cred); +} + +static inline void cachefiles_end_secure(struct cachefiles_cache *cache, + const struct cred *saved_cred) +{ + revert_creds(saved_cred); +} + +/* + * xattr.c + */ +extern int cachefiles_check_object_type(struct cachefiles_object *object); +extern int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_update_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_check_auxdata(struct cachefiles_object *object); +extern int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry); + + +/* + * error handling + */ + +#define cachefiles_io_error(___cache, FMT, ...) \ +do { \ + pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ + fscache_io_error(&(___cache)->cache); \ + set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ +} while (0) + +#define cachefiles_io_error_obj(object, FMT, ...) \ +do { \ + struct cachefiles_cache *___cache; \ + \ + ___cache = container_of((object)->fscache.cache, \ + struct cachefiles_cache, cache); \ + cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ +} while (0) + + +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_CACHEFILES_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) +#endif + +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif diff --git a/kernel/fs/cachefiles/key.c b/kernel/fs/cachefiles/key.c new file mode 100644 index 000000000..33b58c60f --- /dev/null +++ b/kernel/fs/cachefiles/key.c @@ -0,0 +1,159 @@ +/* Key to pathname encoder + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include "internal.h" + +static const char cachefiles_charmap[64] = + "0123456789" /* 0 - 9 */ + "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ + "_-" /* 62 - 63 */ + ; + +static const char cachefiles_filecharmap[256] = { + /* we skip space and tab and control chars */ + [33 ... 46] = 1, /* '!' -> '.' */ + /* we skip '/' as it's significant to pathwalk */ + [48 ... 127] = 1, /* '0' -> '~' */ +}; + +/* + * turn the raw key into something cooked + * - the raw key should include the length in the two bytes at the front + * - the key may be up to 514 bytes in length (including the length word) + * - "base64" encode the strange keys, mapping 3 bytes of raw to four of + * cooked + * - need to cut the cooked key into 252 char lengths (189 raw bytes) + */ +char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +{ + unsigned char csum, ch; + unsigned int acc; + char *key; + int loop, len, max, seg, mark, print; + + _enter(",%d", keylen); + + BUG_ON(keylen < 2 || keylen > 514); + + csum = raw[0] + raw[1]; + print = 1; + for (loop = 2; loop < keylen; loop++) { + ch = raw[loop]; + csum += ch; + print &= cachefiles_filecharmap[ch]; + } + + if (print) { + /* if the path is usable ASCII, then we render it directly */ + max = keylen - 2; + max += 2; /* two base64'd length chars on the front */ + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 251) / 252) = 3 + */ + max += 1; /* NUL on end */ + } else { + /* calculate the maximum length of the cooked key */ + keylen = (keylen + 2) / 3; + + max = keylen * 4; + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 188) / 189) = 3 + */ + max += 1; /* NUL on end */ + } + + max += 1; /* 2nd NUL on end */ + + _debug("max: %d", max); + + key = kmalloc(max, cachefiles_gfp); + if (!key) + return NULL; + + len = 0; + + /* build the cooked key */ + sprintf(key, "@%02x%c+", (unsigned) csum, 0); + len = 5; + mark = len - 1; + + if (print) { + acc = *(uint16_t *) raw; + raw += 2; + + key[len + 1] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len] = cachefiles_charmap[acc & 63]; + len += 2; + + seg = 250; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + key[len++] = *raw++; + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; + default: type = 'S'; break; + } + } else { + seg = 252; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + acc = *raw++; + acc |= *raw++ << 8; + acc |= *raw++ << 16; + + _debug("acc: %06x", acc); + + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break; + default: type = 'T'; break; + } + } + + key[mark] = type; + key[len++] = 0; + key[len] = 0; + + _leave(" = %p %d", key, len); + return key; +} diff --git a/kernel/fs/cachefiles/main.c b/kernel/fs/cachefiles/main.c new file mode 100644 index 000000000..711f13d8c --- /dev/null +++ b/kernel/fs/cachefiles/main.c @@ -0,0 +1,105 @@ +/* Network filesystem caching backend to use cache files on a premounted + * filesystem + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +unsigned cachefiles_debug; +module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); + +MODULE_DESCRIPTION("Mounted-filesystem based cache"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +struct kmem_cache *cachefiles_object_jar; + +static struct miscdevice cachefiles_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cachefiles", + .fops = &cachefiles_daemon_fops, +}; + +static void cachefiles_object_init_once(void *_object) +{ + struct cachefiles_object *object = _object; + + memset(object, 0, sizeof(*object)); + spin_lock_init(&object->work_lock); +} + +/* + * initialise the fs caching module + */ +static int __init cachefiles_init(void) +{ + int ret; + + ret = misc_register(&cachefiles_dev); + if (ret < 0) + goto error_dev; + + /* create an object jar */ + ret = -ENOMEM; + cachefiles_object_jar = + kmem_cache_create("cachefiles_object_jar", + sizeof(struct cachefiles_object), + 0, + SLAB_HWCACHE_ALIGN, + cachefiles_object_init_once); + if (!cachefiles_object_jar) { + pr_notice("Failed to allocate an object jar\n"); + goto error_object_jar; + } + + ret = cachefiles_proc_init(); + if (ret < 0) + goto error_proc; + + pr_info("Loaded\n"); + return 0; + +error_proc: + kmem_cache_destroy(cachefiles_object_jar); +error_object_jar: + misc_deregister(&cachefiles_dev); +error_dev: + pr_err("failed to register: %d\n", ret); + return ret; +} + +fs_initcall(cachefiles_init); + +/* + * clean up on module removal + */ +static void __exit cachefiles_exit(void) +{ + pr_info("Unloading\n"); + + cachefiles_proc_cleanup(); + kmem_cache_destroy(cachefiles_object_jar); + misc_deregister(&cachefiles_dev); +} + +module_exit(cachefiles_exit); diff --git a/kernel/fs/cachefiles/namei.c b/kernel/fs/cachefiles/namei.c new file mode 100644 index 000000000..ab857ab9f --- /dev/null +++ b/kernel/fs/cachefiles/namei.c @@ -0,0 +1,978 @@ +/* CacheFiles path walking and related routines + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define CACHEFILES_KEYBUF_SIZE 512 + +/* + * dump debugging info about an object + */ +static noinline +void __cachefiles_printk_object(struct cachefiles_object *object, + const char *prefix, + u8 *keybuf) +{ + struct fscache_cookie *cookie; + unsigned keylen, loop; + + pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); + pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", + prefix, object->fscache.state->name, + object->fscache.flags, work_busy(&object->fscache.work), + object->fscache.events, object->fscache.event_mask); + pr_err("%sops=%u inp=%u exc=%u\n", + prefix, object->fscache.n_ops, object->fscache.n_in_progress, + object->fscache.n_exclusive); + pr_err("%sparent=%p\n", + prefix, object->fscache.parent); + + spin_lock(&object->fscache.lock); + cookie = object->fscache.cookie; + if (cookie) { + pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", + prefix, + object->fscache.cookie, + object->fscache.cookie->parent, + object->fscache.cookie->netfs_data, + object->fscache.cookie->flags); + if (keybuf && cookie->def) + keylen = cookie->def->get_key(cookie->netfs_data, keybuf, + CACHEFILES_KEYBUF_SIZE); + else + keylen = 0; + } else { + pr_err("%scookie=NULL\n", prefix); + keylen = 0; + } + spin_unlock(&object->fscache.lock); + + if (keylen) { + pr_err("%skey=[%u] '", prefix, keylen); + for (loop = 0; loop < keylen; loop++) + pr_cont("%02x", keybuf[loop]); + pr_cont("'\n"); + } +} + +/* + * dump debugging info about a pair of objects + */ +static noinline void cachefiles_printk_object(struct cachefiles_object *object, + struct cachefiles_object *xobject) +{ + u8 *keybuf; + + keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); + if (object) + __cachefiles_printk_object(object, "", keybuf); + if (xobject) + __cachefiles_printk_object(xobject, "x", keybuf); + kfree(keybuf); +} + +/* + * mark the owner of a dentry, if there is one, to indicate that that dentry + * has been preemptively deleted + * - the caller must hold the i_mutex on the dentry's parent as required to + * call vfs_unlink(), vfs_rmdir() or vfs_rename() + */ +static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + struct cachefiles_object *object; + struct rb_node *p; + + _enter(",'%pd'", dentry); + + write_lock(&cache->active_lock); + + p = cache->active_nodes.rb_node; + while (p) { + object = rb_entry(p, struct cachefiles_object, active_node); + if (object->dentry > dentry) + p = p->rb_left; + else if (object->dentry < dentry) + p = p->rb_right; + else + goto found_dentry; + } + + write_unlock(&cache->active_lock); + _leave(" [no owner]"); + return; + + /* found the dentry for */ +found_dentry: + kdebug("preemptive burial: OBJ%x [%s] %p", + object->fscache.debug_id, + object->fscache.state->name, + dentry); + + if (fscache_object_is_live(&object->fscache)) { + pr_err("\n"); + pr_err("Error: Can't preemptively bury live object\n"); + cachefiles_printk_object(object, NULL); + } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + pr_err("Error: Object already preemptively buried\n"); + } + + write_unlock(&cache->active_lock); + _leave(" [owner marked]"); +} + +/* + * record the fact that an object is now active + */ +static int cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct cachefiles_object *xobject; + struct rb_node **_p, *_parent = NULL; + struct dentry *dentry; + + _enter(",%p", object); + +try_again: + write_lock(&cache->active_lock); + + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + pr_err("Error: Object already active\n"); + cachefiles_printk_object(object, NULL); + BUG(); + } + + dentry = object->dentry; + _p = &cache->active_nodes.rb_node; + while (*_p) { + _parent = *_p; + xobject = rb_entry(_parent, + struct cachefiles_object, active_node); + + ASSERT(xobject != object); + + if (xobject->dentry > dentry) + _p = &(*_p)->rb_left; + else if (xobject->dentry < dentry) + _p = &(*_p)->rb_right; + else + goto wait_for_old_object; + } + + rb_link_node(&object->active_node, _parent, _p); + rb_insert_color(&object->active_node, &cache->active_nodes); + + write_unlock(&cache->active_lock); + _leave(" = 0"); + return 0; + + /* an old object from a previous incarnation is hogging the slot - we + * need to wait for it to be destroyed */ +wait_for_old_object: + if (fscache_object_is_live(&xobject->fscache)) { + pr_err("\n"); + pr_err("Error: Unexpected object collision\n"); + cachefiles_printk_object(object, xobject); + BUG(); + } + atomic_inc(&xobject->usage); + write_unlock(&cache->active_lock); + + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + wait_queue_head_t *wq; + + signed long timeout = 60 * HZ; + wait_queue_t wait; + bool requeue; + + /* if the object we're waiting for is queued for processing, + * then just put ourselves on the queue behind it */ + if (work_pending(&xobject->fscache.work)) { + _debug("queue OBJ%x behind OBJ%x immediately", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + /* otherwise we sleep until either the object we're waiting for + * is done, or the fscache_object is congested */ + wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); + init_wait(&wait); + requeue = false; + do { + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) + break; + + requeue = fscache_object_sleep_till_congested(&timeout); + } while (timeout > 0 && !requeue); + finish_wait(wq, &wait); + + if (requeue && + test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + _debug("queue OBJ%x behind OBJ%x after wait", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + if (timeout <= 0) { + pr_err("\n"); + pr_err("Error: Overlong wait for old active object to go away\n"); + cachefiles_printk_object(object, xobject); + goto requeue; + } + } + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); + + cache->cache.ops->put_object(&xobject->fscache); + goto try_again; + +requeue: + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + cache->cache.ops->put_object(&xobject->fscache); + _leave(" = -ETIMEDOUT"); + return -ETIMEDOUT; +} + +/* + * delete an object representation from the cache + * - file backed objects are unlinked + * - directory backed objects are stuffed into the graveyard for userspace to + * delete + * - unlocks the directory mutex + */ +static int cachefiles_bury_object(struct cachefiles_cache *cache, + struct dentry *dir, + struct dentry *rep, + bool preemptive) +{ + struct dentry *grave, *trap; + struct path path, path_to_graveyard; + char nbuffer[8 + 8 + 1]; + int ret; + + _enter(",'%pd','%pd'", dir, rep); + + _debug("remove %p from %p", rep, dir); + + /* non-directories can just be unlinked */ + if (!d_is_dir(rep)) { + _debug("unlink stale object"); + + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_unlink(&path, rep); + if (ret < 0) { + cachefiles_io_error(cache, "Unlink security error"); + } else { + ret = vfs_unlink(d_inode(dir), rep, NULL); + + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } + + mutex_unlock(&d_inode(dir)->i_mutex); + + if (ret == -EIO) + cachefiles_io_error(cache, "Unlink failed"); + + _leave(" = %d", ret); + return ret; + } + + /* directories have to be moved to the graveyard */ + _debug("move stale object to graveyard"); + mutex_unlock(&d_inode(dir)->i_mutex); + +try_again: + /* first step is to make up a grave dentry in the graveyard */ + sprintf(nbuffer, "%08x%08x", + (uint32_t) get_seconds(), + (uint32_t) atomic_inc_return(&cache->gravecounter)); + + /* do the multiway lock magic */ + trap = lock_rename(cache->graveyard, dir); + + /* do some checks before getting the grave dentry */ + if (rep->d_parent != dir) { + /* the entry was probably culled when we dropped the parent dir + * lock */ + unlock_rename(cache->graveyard, dir); + _leave(" = 0 [culled?]"); + return 0; + } + + if (!d_can_lookup(cache->graveyard)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Graveyard no longer a directory"); + return -EIO; + } + + if (trap == rep) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + if (d_mountpoint(rep)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Mountpoint in cache"); + return -EIO; + } + + grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); + if (IS_ERR(grave)) { + unlock_rename(cache->graveyard, dir); + + if (PTR_ERR(grave) == -ENOMEM) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + cachefiles_io_error(cache, "Lookup error %ld", + PTR_ERR(grave)); + return -EIO; + } + + if (d_is_positive(grave)) { + unlock_rename(cache->graveyard, dir); + dput(grave); + grave = NULL; + cond_resched(); + goto try_again; + } + + if (d_mountpoint(grave)) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "Mountpoint in graveyard"); + return -EIO; + } + + /* target should not be an ancestor of source */ + if (trap == grave) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + /* attempt the rename */ + path.mnt = cache->mnt; + path.dentry = dir; + path_to_graveyard.mnt = cache->mnt; + path_to_graveyard.dentry = cache->graveyard; + ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); + if (ret < 0) { + cachefiles_io_error(cache, "Rename security error %d", ret); + } else { + ret = vfs_rename(d_inode(dir), rep, + d_inode(cache->graveyard), grave, NULL, 0); + if (ret != 0 && ret != -ENOMEM) + cachefiles_io_error(cache, + "Rename failed with error %d", ret); + + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } + + unlock_rename(cache->graveyard, dir); + dput(grave); + _leave(" = 0"); + return 0; +} + +/* + * delete an object representation from the cache + */ +int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct dentry *dir; + int ret; + + _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); + + ASSERT(object->dentry); + ASSERT(d_backing_inode(object->dentry)); + ASSERT(object->dentry->d_parent); + + dir = dget_parent(object->dentry); + + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + + if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + /* object allocation for the same key preemptively deleted this + * object's file so that it could create its own file */ + _debug("object preemptively buried"); + mutex_unlock(&d_inode(dir)->i_mutex); + ret = 0; + } else { + /* we need to check that our parent is _still_ our parent - it + * may have been renamed */ + if (dir == object->dentry->d_parent) { + ret = cachefiles_bury_object(cache, dir, + object->dentry, false); + } else { + /* it got moved, presumably by cachefilesd culling it, + * so it's no longer in the key path and we can ignore + * it */ + mutex_unlock(&d_inode(dir)->i_mutex); + ret = 0; + } + } + + dput(dir); + _leave(" = %d", ret); + return ret; +} + +/* + * walk from the parent object to the child object through the backing + * filesystem, creating directories as we go + */ +int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata) +{ + struct cachefiles_cache *cache; + struct dentry *dir, *next = NULL; + struct path path; + unsigned long start; + const char *name; + int ret, nlen; + + _enter("OBJ%x{%p},OBJ%x,%s,", + parent->fscache.debug_id, parent->dentry, + object->fscache.debug_id, key); + + cache = container_of(parent->fscache.cache, + struct cachefiles_cache, cache); + path.mnt = cache->mnt; + + ASSERT(parent->dentry); + ASSERT(d_backing_inode(parent->dentry)); + + if (!(d_is_dir(parent->dentry))) { + // TODO: convert file to dir + _leave("looking up in none directory"); + return -ENOBUFS; + } + + dir = dget(parent->dentry); + +advance: + /* attempt to transit the first directory component */ + name = key; + nlen = strlen(key); + + /* key ends in a double NUL */ + key = key + nlen + 1; + if (!*key) + key = NULL; + +lookup_again: + /* search the current directory for the element name */ + _debug("lookup '%s'", name); + + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + + start = jiffies; + next = lookup_one_len(name, dir, nlen); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(next)) + goto lookup_error; + + _debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative"); + + if (!key) + object->new = !d_backing_inode(next); + + /* if this element of the path doesn't exist, then the lookup phase + * failed, and we can release any readers in the certain knowledge that + * there's nothing for them to actually read */ + if (d_is_negative(next)) + fscache_object_lookup_negative(&object->fscache); + + /* we need to create the object if it's negative */ + if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { + /* index objects and intervening tree levels must be subdirs */ + if (d_is_negative(next)) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + path.dentry = dir; + ret = security_path_mkdir(&path, next, 0); + if (ret < 0) + goto create_error; + start = jiffies; + ret = vfs_mkdir(d_inode(dir), next, 0); + cachefiles_hist(cachefiles_mkdir_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(d_backing_inode(next)); + + _debug("mkdir -> %p{%p{ino=%lu}}", + next, d_backing_inode(next), d_backing_inode(next)->i_ino); + + } else if (!d_can_lookup(next)) { + pr_err("inode %lu is not a directory\n", + d_backing_inode(next)->i_ino); + ret = -ENOBUFS; + goto error; + } + + } else { + /* non-index objects start out life as files */ + if (d_is_negative(next)) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + path.dentry = dir; + ret = security_path_mknod(&path, next, S_IFREG, 0); + if (ret < 0) + goto create_error; + start = jiffies; + ret = vfs_create(d_inode(dir), next, S_IFREG, true); + cachefiles_hist(cachefiles_create_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(d_backing_inode(next)); + + _debug("create -> %p{%p{ino=%lu}}", + next, d_backing_inode(next), d_backing_inode(next)->i_ino); + + } else if (!d_can_lookup(next) && + !d_is_reg(next) + ) { + pr_err("inode %lu is not a file or directory\n", + d_backing_inode(next)->i_ino); + ret = -ENOBUFS; + goto error; + } + } + + /* process the next component */ + if (key) { + _debug("advance"); + mutex_unlock(&d_inode(dir)->i_mutex); + dput(dir); + dir = next; + next = NULL; + goto advance; + } + + /* we've found the object we were looking for */ + object->dentry = next; + + /* if we've found that the terminal object exists, then we need to + * check its attributes and delete it if it's out of date */ + if (!object->new) { + _debug("validate '%pd'", next); + + ret = cachefiles_check_object_xattr(object, auxdata); + if (ret == -ESTALE) { + /* delete the object (the deleter drops the directory + * mutex) */ + object->dentry = NULL; + + ret = cachefiles_bury_object(cache, dir, next, true); + dput(next); + next = NULL; + + if (ret < 0) + goto delete_error; + + _debug("redo lookup"); + goto lookup_again; + } + } + + /* note that we're now using this object */ + ret = cachefiles_mark_object_active(cache, object); + + mutex_unlock(&d_inode(dir)->i_mutex); + dput(dir); + dir = NULL; + + if (ret == -ETIMEDOUT) + goto mark_active_timed_out; + + _debug("=== OBTAINED_OBJECT ==="); + + if (object->new) { + /* attach data to a newly constructed terminal object */ + ret = cachefiles_set_object_xattr(object, auxdata); + if (ret < 0) + goto check_error; + } else { + /* always update the atime on an object we've just looked up + * (this is used to keep track of culling, and atimes are only + * updated by read, write and readdir but not lookup or + * open) */ + path.dentry = next; + touch_atime(&path); + } + + /* open a file interface onto a data file */ + if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (d_is_reg(object->dentry)) { + const struct address_space_operations *aops; + + ret = -EPERM; + aops = d_backing_inode(object->dentry)->i_mapping->a_ops; + if (!aops->bmap) + goto check_error; + + object->backer = object->dentry; + } else { + BUG(); // TODO: open file in data-class subdir + } + } + + object->new = 0; + fscache_obtained_object(&object->fscache); + + _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); + return 0; + +create_error: + _debug("create error %d", ret); + if (ret == -EIO) + cachefiles_io_error(cache, "Create/mkdir failed"); + goto error; + +mark_active_timed_out: + _debug("mark active timed out"); + goto release_dentry; + +check_error: + _debug("check error %d", ret); + write_lock(&cache->active_lock); + rb_erase(&object->active_node, &cache->active_nodes); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); +release_dentry: + dput(object->dentry); + object->dentry = NULL; + goto error_out; + +delete_error: + _debug("delete error %d", ret); + goto error_out2; + +lookup_error: + _debug("lookup error %ld", PTR_ERR(next)); + ret = PTR_ERR(next); + if (ret == -EIO) + cachefiles_io_error(cache, "Lookup failed"); + next = NULL; +error: + mutex_unlock(&d_inode(dir)->i_mutex); + dput(next); +error_out2: + dput(dir); +error_out: + _leave(" = error %d", -ret); + return ret; +} + +/* + * get a subdirectory + */ +struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *dirname) +{ + struct dentry *subdir; + unsigned long start; + struct path path; + int ret; + + _enter(",,%s", dirname); + + /* search the current directory for the element name */ + mutex_lock(&d_inode(dir)->i_mutex); + + start = jiffies; + subdir = lookup_one_len(dirname, dir, strlen(dirname)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(subdir)) { + if (PTR_ERR(subdir) == -ENOMEM) + goto nomem_d_alloc; + goto lookup_error; + } + + _debug("subdir -> %p %s", + subdir, d_backing_inode(subdir) ? "positive" : "negative"); + + /* we need to create the subdir if it doesn't exist yet */ + if (d_is_negative(subdir)) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto mkdir_error; + + _debug("attempt mkdir"); + + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_mkdir(&path, subdir, 0700); + if (ret < 0) + goto mkdir_error; + ret = vfs_mkdir(d_inode(dir), subdir, 0700); + if (ret < 0) + goto mkdir_error; + + ASSERT(d_backing_inode(subdir)); + + _debug("mkdir -> %p{%p{ino=%lu}}", + subdir, + d_backing_inode(subdir), + d_backing_inode(subdir)->i_ino); + } + + mutex_unlock(&d_inode(dir)->i_mutex); + + /* we need to make sure the subdir is a directory */ + ASSERT(d_backing_inode(subdir)); + + if (!d_can_lookup(subdir)) { + pr_err("%s is not a directory\n", dirname); + ret = -EIO; + goto check_error; + } + + ret = -EPERM; + if (!d_backing_inode(subdir)->i_op->setxattr || + !d_backing_inode(subdir)->i_op->getxattr || + !d_backing_inode(subdir)->i_op->lookup || + !d_backing_inode(subdir)->i_op->mkdir || + !d_backing_inode(subdir)->i_op->create || + (!d_backing_inode(subdir)->i_op->rename && + !d_backing_inode(subdir)->i_op->rename2) || + !d_backing_inode(subdir)->i_op->rmdir || + !d_backing_inode(subdir)->i_op->unlink) + goto check_error; + + _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); + return subdir; + +check_error: + dput(subdir); + _leave(" = %d [check]", ret); + return ERR_PTR(ret); + +mkdir_error: + mutex_unlock(&d_inode(dir)->i_mutex); + dput(subdir); + pr_err("mkdir %s failed with error %d\n", dirname, ret); + return ERR_PTR(ret); + +lookup_error: + mutex_unlock(&d_inode(dir)->i_mutex); + ret = PTR_ERR(subdir); + pr_err("Lookup %s failed with error %d\n", dirname, ret); + return ERR_PTR(ret); + +nomem_d_alloc: + mutex_unlock(&d_inode(dir)->i_mutex); + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * find out if an object is in use or not + * - if finds object and it's not in use: + * - returns a pointer to the object and a reference on it + * - returns with the directory locked + */ +static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, + struct dentry *dir, + char *filename) +{ + struct cachefiles_object *object; + struct rb_node *_n; + struct dentry *victim; + unsigned long start; + int ret; + + //_enter(",%pd/,%s", + // dir, filename); + + /* look up the victim */ + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + + start = jiffies; + victim = lookup_one_len(filename, dir, strlen(filename)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(victim)) + goto lookup_error; + + //_debug("victim -> %p %s", + // victim, d_backing_inode(victim) ? "positive" : "negative"); + + /* if the object is no longer there then we probably retired the object + * at the netfs's request whilst the cull was in progress + */ + if (d_is_negative(victim)) { + mutex_unlock(&d_inode(dir)->i_mutex); + dput(victim); + _leave(" = -ENOENT [absent]"); + return ERR_PTR(-ENOENT); + } + + /* check to see if we're using this object */ + read_lock(&cache->active_lock); + + _n = cache->active_nodes.rb_node; + + while (_n) { + object = rb_entry(_n, struct cachefiles_object, active_node); + + if (object->dentry > victim) + _n = _n->rb_left; + else if (object->dentry < victim) + _n = _n->rb_right; + else + goto object_in_use; + } + + read_unlock(&cache->active_lock); + + //_leave(" = %p", victim); + return victim; + +object_in_use: + read_unlock(&cache->active_lock); + mutex_unlock(&d_inode(dir)->i_mutex); + dput(victim); + //_leave(" = -EBUSY [in use]"); + return ERR_PTR(-EBUSY); + +lookup_error: + mutex_unlock(&d_inode(dir)->i_mutex); + ret = PTR_ERR(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return ERR_PTR(-ESTALE); + } + + if (ret == -EIO) { + cachefiles_io_error(cache, "Lookup failed"); + } else if (ret != -ENOMEM) { + pr_err("Internal error: %d\n", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * cull an object if it's not in use + * - called only by cache manager daemon + */ +int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + int ret; + + _enter(",%pd/,%s", dir, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + _debug("victim -> %p %s", + victim, d_backing_inode(victim) ? "positive" : "negative"); + + /* okay... the victim is not being used so we can cull it + * - start by marking it as stale + */ + _debug("victim is cullable"); + + ret = cachefiles_remove_object_xattr(cache, victim); + if (ret < 0) + goto error_unlock; + + /* actually remove the victim (drops the dir mutex) */ + _debug("bury"); + + ret = cachefiles_bury_object(cache, dir, victim, false); + if (ret < 0) + goto error; + + dput(victim); + _leave(" = 0"); + return 0; + +error_unlock: + mutex_unlock(&d_inode(dir)->i_mutex); +error: + dput(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return -ESTALE; + } + + if (ret != -ENOMEM) { + pr_err("Internal error: %d\n", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * find out if an object is in use or not + * - called only by cache manager daemon + * - returns -EBUSY or 0 to indicate whether an object is in use or not + */ +int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + + //_enter(",%pd/,%s", + // dir, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + mutex_unlock(&d_inode(dir)->i_mutex); + dput(victim); + //_leave(" = 0"); + return 0; +} diff --git a/kernel/fs/cachefiles/proc.c b/kernel/fs/cachefiles/proc.c new file mode 100644 index 000000000..eccd33941 --- /dev/null +++ b/kernel/fs/cachefiles/proc.c @@ -0,0 +1,134 @@ +/* CacheFiles statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +atomic_t cachefiles_lookup_histogram[HZ]; +atomic_t cachefiles_mkdir_histogram[HZ]; +atomic_t cachefiles_create_histogram[HZ]; + +/* + * display the latency histogram + */ +static int cachefiles_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned x, y, z, t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + x = atomic_read(&cachefiles_lookup_histogram[index]); + y = atomic_read(&cachefiles_mkdir_histogram[index]); + z = atomic_read(&cachefiles_create_histogram[index]); + if (x == 0 && y == 0 && z == 0) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void cachefiles_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations cachefiles_histogram_ops = { + .start = cachefiles_histogram_start, + .stop = cachefiles_histogram_stop, + .next = cachefiles_histogram_next, + .show = cachefiles_histogram_show, +}; + +/* + * open "/proc/fs/cachefiles/XXX" which provide statistics summaries + */ +static int cachefiles_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cachefiles_histogram_ops); +} + +static const struct file_operations cachefiles_histogram_fops = { + .owner = THIS_MODULE, + .open = cachefiles_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * initialise the /proc/fs/cachefiles/ directory + */ +int __init cachefiles_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/cachefiles", NULL)) + goto error_dir; + + if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, + &cachefiles_histogram_fops)) + goto error_histogram; + + _leave(" = 0"); + return 0; + +error_histogram: + remove_proc_entry("fs/cachefiles", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/cachefiles/ directory + */ +void cachefiles_proc_cleanup(void) +{ + remove_proc_entry("fs/cachefiles/histogram", NULL); + remove_proc_entry("fs/cachefiles", NULL); +} diff --git a/kernel/fs/cachefiles/rdwr.c b/kernel/fs/cachefiles/rdwr.c new file mode 100644 index 000000000..3cbb0e834 --- /dev/null +++ b/kernel/fs/cachefiles/rdwr.c @@ -0,0 +1,967 @@ +/* Storage object read/write + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "internal.h" + +/* + * detect wake up events generated by the unlocking of pages in which we're + * interested + * - we use this to detect read completion of backing pages + * - the caller holds the waitqueue lock + */ +static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, + int sync, void *_key) +{ + struct cachefiles_one_read *monitor = + container_of(wait, struct cachefiles_one_read, monitor); + struct cachefiles_object *object; + struct wait_bit_key *key = _key; + struct page *page = wait->private; + + ASSERT(key); + + _enter("{%lu},%u,%d,{%p,%u}", + monitor->netfs_page->index, mode, sync, + key->flags, key->bit_nr); + + if (key->flags != &page->flags || + key->bit_nr != PG_locked) + return 0; + + _debug("--- monitor %p %lx ---", page, page->flags); + + if (!PageUptodate(page) && !PageError(page)) { + /* unlocked, not uptodate and not erronous? */ + _debug("page probably truncated"); + } + + /* remove from the waitqueue */ + list_del(&wait->task_list); + + /* move onto the action list and queue for FS-Cache thread pool */ + ASSERT(monitor->op); + + object = container_of(monitor->op->op.object, + struct cachefiles_object, fscache); + + spin_lock(&object->work_lock); + list_add_tail(&monitor->op_link, &monitor->op->to_do); + spin_unlock(&object->work_lock); + + fscache_enqueue_retrieval(monitor->op); + return 0; +} + +/* + * handle a probably truncated page + * - check to see if the page is still relevant and reissue the read if + * possible + * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we + * must wait again and 0 if successful + */ +static int cachefiles_read_reissue(struct cachefiles_object *object, + struct cachefiles_one_read *monitor) +{ + struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; + struct page *backpage = monitor->back_page, *backpage2; + int ret; + + _enter("{ino=%lx},{%lx,%lx}", + d_backing_inode(object->backer)->i_ino, + backpage->index, backpage->flags); + + /* skip if the page was truncated away completely */ + if (backpage->mapping != bmapping) { + _leave(" = -ENODATA [mapping]"); + return -ENODATA; + } + + backpage2 = find_get_page(bmapping, backpage->index); + if (!backpage2) { + _leave(" = -ENODATA [gone]"); + return -ENODATA; + } + + if (backpage != backpage2) { + put_page(backpage2); + _leave(" = -ENODATA [different]"); + return -ENODATA; + } + + /* the page is still there and we already have a ref on it, so we don't + * need a second */ + put_page(backpage2); + + INIT_LIST_HEAD(&monitor->op_link); + add_page_wait_queue(backpage, &monitor->monitor); + + if (trylock_page(backpage)) { + ret = -EIO; + if (PageError(backpage)) + goto unlock_discard; + ret = 0; + if (PageUptodate(backpage)) + goto unlock_discard; + + _debug("reissue read"); + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto unlock_discard; + } + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + /* it'll reappear on the todo list */ + _leave(" = -EINPROGRESS"); + return -EINPROGRESS; + +unlock_discard: + unlock_page(backpage); + spin_lock_irq(&object->work_lock); + list_del(&monitor->op_link); + spin_unlock_irq(&object->work_lock); + _leave(" = %d", ret); + return ret; +} + +/* + * copy data from backing pages to netfs pages to complete a read operation + * - driven by FS-Cache's thread pool + */ +static void cachefiles_read_copier(struct fscache_operation *_op) +{ + struct cachefiles_one_read *monitor; + struct cachefiles_object *object; + struct fscache_retrieval *op; + int error, max; + + op = container_of(_op, struct fscache_retrieval, op); + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino); + + max = 8; + spin_lock_irq(&object->work_lock); + + while (!list_empty(&op->to_do)) { + monitor = list_entry(op->to_do.next, + struct cachefiles_one_read, op_link); + list_del(&monitor->op_link); + + spin_unlock_irq(&object->work_lock); + + _debug("- copy {%lu}", monitor->back_page->index); + + recheck: + if (test_bit(FSCACHE_COOKIE_INVALIDATING, + &object->fscache.cookie->flags)) { + error = -ESTALE; + } else if (PageUptodate(monitor->back_page)) { + copy_highpage(monitor->netfs_page, monitor->back_page); + fscache_mark_page_cached(monitor->op, + monitor->netfs_page); + error = 0; + } else if (!PageError(monitor->back_page)) { + /* the page has probably been truncated */ + error = cachefiles_read_reissue(object, monitor); + if (error == -EINPROGRESS) + goto next; + goto recheck; + } else { + cachefiles_io_error_obj( + object, + "Readpage failed on backing file %lx", + (unsigned long) monitor->back_page->flags); + error = -EIO; + } + + page_cache_release(monitor->back_page); + + fscache_end_io(op, monitor->netfs_page, error); + page_cache_release(monitor->netfs_page); + fscache_retrieval_complete(op, 1); + fscache_put_retrieval(op); + kfree(monitor); + + next: + /* let the thread pool have some air occasionally */ + max--; + if (max < 0 || need_resched()) { + if (!list_empty(&op->to_do)) + fscache_enqueue_retrieval(op); + _leave(" [maxed out]"); + return; + } + + spin_lock_irq(&object->work_lock); + } + + spin_unlock_irq(&object->work_lock); + _leave(""); +} + +/* + * read the corresponding page to the given set from the backing file + * - an uncertain page is simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file_one(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct page *netpage) +{ + struct cachefiles_one_read *monitor; + struct address_space *bmapping; + struct page *newpage, *backpage; + int ret; + + _enter(""); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); + if (!monitor) + goto nomem; + + monitor->netfs_page = netpage; + monitor->op = fscache_get_retrieval(op); + + init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); + + /* attempt to get hold of the backing page */ + bmapping = d_backing_inode(object->backer)->i_mapping; + newpage = NULL; + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = __page_cache_alloc(cachefiles_gfp | + __GFP_COLD); + if (!newpage) + goto nomem_monitor; + } + + ret = add_to_page_cache_lru(newpage, bmapping, + netpage->index, cachefiles_gfp); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem_page; + } + + /* we've installed a new backing page, so now we need to start + * it reading */ +installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + +read_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* set the monitor to transfer the data across */ +monitor_backing_page: + _debug("- monitor add"); + + /* install the monitor */ + page_cache_get(monitor->netfs_page); + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + goto success; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ +backing_page_already_present: + _debug("- present"); + + if (newpage) { + page_cache_release(newpage); + newpage = NULL; + } + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + if (!trylock_page(backpage)) + goto monitor_backing_page; + _debug("read %p {%lx}", backpage, backpage->flags); + goto read_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ +backing_page_already_uptodate: + _debug("- uptodate"); + + fscache_mark_page_cached(op, netpage); + + copy_highpage(netpage, backpage); + fscache_end_io(op, netpage, 0); + fscache_retrieval_complete(op, 1); + +success: + _debug("success"); + ret = 0; + +out: + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(monitor->op); + kfree(monitor); + } + _leave(" = %d", ret); + return ret; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) { + fscache_retrieval_complete(op, 1); + goto out; + } +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + fscache_retrieval_complete(op, 1); + ret = -ENOBUFS; + goto out; + +nomem_page: + page_cache_release(newpage); +nomem_monitor: + fscache_put_retrieval(monitor->op); + kfree(monitor); +nomem: + fscache_retrieval_complete(op, 1); + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - if the page is backed by a block in the cache: + * - a read will be started which will call the callback on completion + * - 0 will be returned + * - else if the page is unbacked: + * - the metadata will be retained + * - -ENODATA will be returned + */ +int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct inode *inode; + sector_t block0, block; + unsigned shift; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{%p},{%lx},,,", object, page->index); + + if (!object->backer) + goto enobufs; + + inode = d_backing_inode(object->backer); + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + goto enobufs; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_ASYNC; + op->op.processor = cachefiles_read_copier; + + /* we assume the absence or presence of the first block is a good + * enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually good + * enough for this as it doesn't indicate errors, but it's all we've + * got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* submit the apparently valid page to the backing fs to be + * read from disk */ + ret = cachefiles_read_backing_file_one(object, op, page); + } else if (cachefiles_has_space(cache, 0, 1) == 0) { + /* there's space in the cache we can use */ + fscache_mark_page_cached(op, page); + fscache_retrieval_complete(op, 1); + ret = -ENODATA; + } else { + goto enobufs; + } + + _leave(" = %d", ret); + return ret; + +enobufs: + fscache_retrieval_complete(op, 1); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} + +/* + * read the corresponding pages to the given set from the backing file + * - any uncertain pages are simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct list_head *list) +{ + struct cachefiles_one_read *monitor = NULL; + struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; + struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; + int ret = 0; + + _enter(""); + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + if (!monitor) { + monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); + if (!monitor) + goto nomem; + + monitor->op = fscache_get_retrieval(op); + init_waitqueue_func_entry(&monitor->monitor, + cachefiles_read_waiter); + } + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = __page_cache_alloc(cachefiles_gfp | + __GFP_COLD); + if (!newpage) + goto nomem; + } + + ret = add_to_page_cache_lru(newpage, bmapping, + netpage->index, + cachefiles_gfp); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem; + } + + /* we've installed a new backing page, so now we need + * to start it reading */ + installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + reread_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* add the netfs page to the pagecache and LRU, and set the + * monitor to transfer the data across */ + monitor_backing_page: + _debug("- monitor add"); + + ret = add_to_page_cache_lru(netpage, op->mapping, + netpage->index, cachefiles_gfp); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + fscache_retrieval_complete(op, 1); + continue; + } + goto nomem; + } + + /* install a monitor */ + page_cache_get(netpage); + monitor->netfs_page = netpage; + + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was + * installed, so the monitor may miss the event - so we have to + * ensure that we do get one in such a case */ + if (trylock_page(backpage)) { + _debug("2unlock %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + page_cache_release(backpage); + backpage = NULL; + + page_cache_release(netpage); + netpage = NULL; + continue; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ + backing_page_already_present: + _debug("- present %p", backpage); + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + _debug("- not ready %p{%lx}", backpage, backpage->flags); + + if (!trylock_page(backpage)) + goto monitor_backing_page; + + if (PageError(backpage)) { + _debug("error %lx", backpage->flags); + unlock_page(backpage); + goto io_error; + } + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate_unlock; + + /* we've locked a page that's neither up to date nor erroneous, + * so we need to attempt to read it again */ + goto reread_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ + backing_page_already_uptodate_unlock: + _debug("uptodate %lx", backpage->flags); + unlock_page(backpage); + backing_page_already_uptodate: + _debug("- uptodate"); + + ret = add_to_page_cache_lru(netpage, op->mapping, + netpage->index, cachefiles_gfp); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + fscache_retrieval_complete(op, 1); + continue; + } + goto nomem; + } + + copy_highpage(netpage, backpage); + + page_cache_release(backpage); + backpage = NULL; + + fscache_mark_page_cached(op, netpage); + + /* the netpage is unlocked and marked up to date here */ + fscache_end_io(op, netpage, 0); + page_cache_release(netpage); + netpage = NULL; + fscache_retrieval_complete(op, 1); + continue; + } + + netpage = NULL; + + _debug("out"); + +out: + /* tidy up */ + if (newpage) + page_cache_release(newpage); + if (netpage) + page_cache_release(netpage); + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(op); + kfree(monitor); + } + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + page_cache_release(netpage); + fscache_retrieval_complete(op, 1); + } + + _leave(" = %d", ret); + return ret; + +nomem: + _debug("nomem"); + ret = -ENOMEM; + goto record_page_complete; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto record_page_complete; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; +record_page_complete: + fscache_retrieval_complete(op, 1); + goto out; +} + +/* + * read a list of pages from the cache or allocate blocks in which to store + * them + */ +int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct list_head backpages; + struct pagevec pagevec; + struct inode *inode; + struct page *page, *_n; + unsigned shift, nrbackpages; + int ret, ret2, space; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{OBJ%x,%d},,%d,,", + object->fscache.debug_id, atomic_read(&op->op.usage), + *nr_pages); + + if (!object->backer) + goto all_enobufs; + + space = 1; + if (cachefiles_has_space(cache, 0, *nr_pages) < 0) + space = 0; + + inode = d_backing_inode(object->backer); + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + goto all_enobufs; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + pagevec_init(&pagevec, 0); + + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_ASYNC; + op->op.processor = cachefiles_read_copier; + + INIT_LIST_HEAD(&backpages); + nrbackpages = 0; + + ret = space ? -ENODATA : -ENOBUFS; + list_for_each_entry_safe(page, _n, pages, lru) { + sector_t block0, block; + + /* we assume the absence or presence of the first block is a + * good enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually + * good enough for this as it doesn't indicate errors, but + * it's all we've got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, + block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* we have data - add it to the list to give to the + * backing fs */ + list_move(&page->lru, &backpages); + (*nr_pages)--; + nrbackpages++; + } else if (space && pagevec_add(&pagevec, page) == 0) { + fscache_mark_pages_cached(op, &pagevec); + fscache_retrieval_complete(op, 1); + ret = -ENODATA; + } else { + fscache_retrieval_complete(op, 1); + } + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + if (list_empty(pages)) + ret = 0; + + /* submit the apparently valid pages to the backing fs to be read from + * disk */ + if (nrbackpages > 0) { + ret2 = cachefiles_read_backing_file(object, op, &backpages); + if (ret2 == -ENOMEM || ret2 == -EINTR) + ret = ret2; + } + + _leave(" = %d [nr=%u%s]", + ret, *nr_pages, list_empty(pages) ? " empty" : ""); + return ret; + +all_enobufs: + fscache_retrieval_complete(op, *nr_pages); + return -ENOBUFS; +} + +/* + * allocate a block in the cache in which to store a page + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - otherwise: + * - the metadata will be retained + * - 0 will be returned + */ +int cachefiles_allocate_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lx},", object, page->index); + + ret = cachefiles_has_space(cache, 0, 1); + if (ret == 0) + fscache_mark_page_cached(op, page); + else + ret = -ENOBUFS; + + fscache_retrieval_complete(op, 1); + _leave(" = %d", ret); + return ret; +} + +/* + * allocate blocks in the cache in which to store a set of pages + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if some buffers couldn't be made available + * - returns -ENOBUFS if some pages are beyond EOF + * - otherwise: + * - -ENODATA will be returned + * - metadata will be retained for any page marked + */ +int cachefiles_allocate_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct page *page; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,,,%d,", object, *nr_pages); + + ret = cachefiles_has_space(cache, 0, *nr_pages); + if (ret == 0) { + pagevec_init(&pagevec, 0); + + list_for_each_entry(page, pages, lru) { + if (pagevec_add(&pagevec, page) == 0) + fscache_mark_pages_cached(op, &pagevec); + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + fscache_retrieval_complete(op, *nr_pages); + _leave(" = %d", ret); + return ret; +} + +/* + * request a page be stored in the cache + * - cache withdrawal is prevented by the caller + * - this request may be ignored if there's no cache block available, in which + * case -ENOBUFS will be returned + * - if the op is in progress, 0 will be returned + */ +int cachefiles_write_page(struct fscache_storage *op, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct file *file; + struct path path; + loff_t pos, eof; + size_t len; + void *data; + int ret; + + ASSERT(op != NULL); + ASSERT(page != NULL); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("%p,%p{%lx},,,", object, page, page->index); + + if (!object->backer) { + _leave(" = -ENOBUFS"); + return -ENOBUFS; + } + + ASSERT(d_is_reg(object->backer)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + /* write the page to the backing filesystem and let it store it in its + * own time */ + path.mnt = cache->mnt; + path.dentry = object->backer; + file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + } else { + pos = (loff_t) page->index << PAGE_SHIFT; + + /* we mustn't write more data than we have, so we have + * to beware of a partial page at EOF */ + eof = object->fscache.store_limit_l; + len = PAGE_SIZE; + if (eof & ~PAGE_MASK) { + ASSERTCMP(pos, <, eof); + if (eof - pos < PAGE_SIZE) { + _debug("cut short %llx to %llx", + pos, eof); + len = eof - pos; + ASSERTCMP(pos + len, ==, eof); + } + } + + data = kmap(page); + ret = __kernel_write(file, data, len, &pos); + kunmap(page); + if (ret != len) + ret = -EIO; + fput(file); + } + + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error_obj( + object, "Write page to backing file failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * detach a backing block from a page + * - cache withdrawal is prevented by the caller + */ +void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lu}", object, page->index); + + spin_unlock(&object->fscache.cookie->lock); +} diff --git a/kernel/fs/cachefiles/security.c b/kernel/fs/cachefiles/security.c new file mode 100644 index 000000000..31bbc0528 --- /dev/null +++ b/kernel/fs/cachefiles/security.c @@ -0,0 +1,116 @@ +/* CacheFiles security management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include "internal.h" + +/* + * determine the security context within which we access the cache from within + * the kernel + */ +int cachefiles_get_security_ID(struct cachefiles_cache *cache) +{ + struct cred *new; + int ret; + + _enter("{%s}", cache->secctx); + + new = prepare_kernel_cred(current); + if (!new) { + ret = -ENOMEM; + goto error; + } + + if (cache->secctx) { + ret = set_security_override_from_ctx(new, cache->secctx); + if (ret < 0) { + put_cred(new); + pr_err("Security denies permission to nominate security context: error %d\n", + ret); + goto error; + } + } + + cache->cache_cred = new; + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * see if mkdir and create can be performed in the root directory + */ +static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, + struct dentry *root) +{ + int ret; + + ret = security_inode_mkdir(d_backing_inode(root), root, 0); + if (ret < 0) { + pr_err("Security denies permission to make dirs: error %d", + ret); + return ret; + } + + ret = security_inode_create(d_backing_inode(root), root, 0); + if (ret < 0) + pr_err("Security denies permission to create files: error %d", + ret); + + return ret; +} + +/* + * check the security details of the on-disk cache + * - must be called with security override in force + * - must return with a security override in force - even in the case of an + * error + */ +int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred) +{ + struct cred *new; + int ret; + + _enter(""); + + /* duplicate the cache creds for COW (the override is currently in + * force, so we can use prepare_creds() to do this) */ + new = prepare_creds(); + if (!new) + return -ENOMEM; + + cachefiles_end_secure(cache, *_saved_cred); + + /* use the cache root dir's security context as the basis with + * which create files */ + ret = set_create_files_as(new, d_backing_inode(root)); + if (ret < 0) { + abort_creds(new); + cachefiles_begin_secure(cache, _saved_cred); + _leave(" = %d [cfa]", ret); + return ret; + } + + put_cred(cache->cache_cred); + cache->cache_cred = new; + + cachefiles_begin_secure(cache, _saved_cred); + ret = cachefiles_check_cache_dir(cache, root); + + if (ret == -EOPNOTSUPP) + ret = 0; + _leave(" = %d", ret); + return ret; +} diff --git a/kernel/fs/cachefiles/xattr.c b/kernel/fs/cachefiles/xattr.c new file mode 100644 index 000000000..d31c1a72d --- /dev/null +++ b/kernel/fs/cachefiles/xattr.c @@ -0,0 +1,324 @@ +/* CacheFiles extended attribute management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const char cachefiles_xattr_cache[] = + XATTR_USER_PREFIX "CacheFiles.cache"; + +/* + * check the type label on an object + * - done using xattrs + */ +int cachefiles_check_object_type(struct cachefiles_object *object) +{ + struct dentry *dentry = object->dentry; + char type[3], xtype[3]; + int ret; + + ASSERT(dentry); + ASSERT(d_backing_inode(dentry)); + + if (!object->fscache.cookie) + strcpy(type, "C3"); + else + snprintf(type, 3, "%02x", object->fscache.cookie->def->type); + + _enter("%p{%s}", object, type); + + /* attempt to install a type label directly */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, + XATTR_CREATE); + if (ret == 0) { + _debug("SET"); /* we succeeded */ + goto error; + } + + if (ret != -EEXIST) { + pr_err("Can't set xattr on %pd [%lu] (err %d)\n", + dentry, d_backing_inode(dentry)->i_ino, + -ret); + goto error; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); + if (ret < 0) { + if (ret == -ERANGE) + goto bad_type_length; + + pr_err("Can't read xattr on %pd [%lu] (err %d)\n", + dentry, d_backing_inode(dentry)->i_ino, + -ret); + goto error; + } + + /* check the type is what we're expecting */ + if (ret != 2) + goto bad_type_length; + + if (xtype[0] != type[0] || xtype[1] != type[1]) + goto bad_type; + + ret = 0; + +error: + _leave(" = %d", ret); + return ret; + +bad_type_length: + pr_err("Cache object %lu type xattr length incorrect\n", + d_backing_inode(dentry)->i_ino); + ret = -EIO; + goto error; + +bad_type: + xtype[2] = 0; + pr_err("Cache object %pd [%lu] type %s not %s\n", + dentry, d_backing_inode(dentry)->i_ino, + xtype, type); + ret = -EIO; + goto error; +} + +/* + * set the state xattr on a cache file + */ +int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET #%u", auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_CREATE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to set xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * update the state xattr on a cache file + */ +int cachefiles_update_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET #%u", auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to update xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * check the consistency between the backing cache and the FS-Cache cookie + */ +int cachefiles_check_auxdata(struct cachefiles_object *object) +{ + struct cachefiles_xattr *auxbuf; + enum fscache_checkaux validity; + struct dentry *dentry = object->dentry; + ssize_t xlen; + int ret; + + ASSERT(dentry); + ASSERT(d_backing_inode(dentry)); + ASSERT(object->fscache.cookie->def->check_aux); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + if (!auxbuf) + return -ENOMEM; + + xlen = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + ret = -ESTALE; + if (xlen < 1 || + auxbuf->type != object->fscache.cookie->def->type) + goto error; + + xlen--; + validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen); + if (validity != FSCACHE_CHECKAUX_OKAY) + goto error; + + ret = 0; +error: + kfree(auxbuf); + return ret; +} + +/* + * check the state xattr on a cache file + * - return -ESTALE if the object should be deleted + */ +int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct cachefiles_xattr *auxbuf; + struct dentry *dentry = object->dentry; + int ret; + + _enter("%p,#%d", object, auxdata->len); + + ASSERT(dentry); + ASSERT(d_backing_inode(dentry)); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); + if (!auxbuf) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + if (ret < 0) { + if (ret == -ENODATA) + goto stale; /* no attribute - power went off + * mid-cull? */ + + if (ret == -ERANGE) + goto bad_type_length; + + cachefiles_io_error_obj(object, + "Can't read xattr on %lu (err %d)", + d_backing_inode(dentry)->i_ino, -ret); + goto error; + } + + /* check the on-disk object */ + if (ret < 1) + goto bad_type_length; + + if (auxbuf->type != auxdata->type) + goto stale; + + auxbuf->len = ret; + + /* consult the netfs */ + if (object->fscache.cookie->def->check_aux) { + enum fscache_checkaux result; + unsigned int dlen; + + dlen = auxbuf->len - 1; + + _debug("checkaux %s #%u", + object->fscache.cookie->def->name, dlen); + + result = fscache_check_aux(&object->fscache, + &auxbuf->data, dlen); + + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + goto okay; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + goto stale; + + default: + BUG(); + } + + /* update the current label */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0) { + cachefiles_io_error_obj(object, + "Can't update xattr on %lu" + " (error %d)", + d_backing_inode(dentry)->i_ino, -ret); + goto error; + } + } + +okay: + ret = 0; + +error: + kfree(auxbuf); + _leave(" = %d", ret); + return ret; + +bad_type_length: + pr_err("Cache object %lu xattr length incorrect\n", + d_backing_inode(dentry)->i_ino); + ret = -EIO; + goto error; + +stale: + ret = -ESTALE; + goto error; +} + +/* + * remove the object's xattr to mark it stale + */ +int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + int ret; + + ret = vfs_removexattr(dentry, cachefiles_xattr_cache); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) + ret = 0; + else if (ret != -ENOMEM) + cachefiles_io_error(cache, + "Can't remove xattr from %lu" + " (error %d)", + d_backing_inode(dentry)->i_ino, -ret); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/kernel/fs/ceph/Kconfig b/kernel/fs/ceph/Kconfig new file mode 100644 index 000000000..264e9bf83 --- /dev/null +++ b/kernel/fs/ceph/Kconfig @@ -0,0 +1,40 @@ +config CEPH_FS + tristate "Ceph distributed file system" + depends on INET + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Choose Y or M here to include support for mounting the + experimental Ceph distributed file system. Ceph is an extremely + scalable file system designed to provide high performance, + reliable access to petabytes of storage. + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +if CEPH_FS +config CEPH_FSCACHE + bool "Enable Ceph client caching support" + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y + help + Choose Y here to enable persistent, read-only local + caching support for Ceph clients using FS-Cache + +endif + +config CEPH_FS_POSIX_ACL + bool "Ceph POSIX Access Control Lists" + depends on CEPH_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N diff --git a/kernel/fs/ceph/Makefile b/kernel/fs/ceph/Makefile new file mode 100644 index 000000000..85a4230b9 --- /dev/null +++ b/kernel/fs/ceph/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for CEPH filesystem. +# + +obj-$(CONFIG_CEPH_FS) += ceph.o + +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ + export.o caps.o snap.o xattr.o \ + mds_client.o mdsmap.o strings.o ceph_frag.o \ + debugfs.o + +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/kernel/fs/ceph/acl.c b/kernel/fs/ceph/acl.c new file mode 100644 index 000000000..64fa24834 --- /dev/null +++ b/kernel/fs/ceph/acl.c @@ -0,0 +1,263 @@ +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, + int type, struct posix_acl *acl) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + set_cached_acl(inode, type, acl); + spin_unlock(&ci->i_ceph_lock); +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __ceph_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __ceph_getxattr(inode, name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ERANGE || size == -ENODATA || size == 0) + acl = NULL; + else + acl = ERR_PTR(-EIO); + + kfree(value); + + if (!IS_ERR(acl)) + ceph_set_cached_acl(inode, type, acl); + + return acl; +} + +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret = 0, size = 0; + const char *name = NULL; + char *value = NULL; + struct iattr newattrs; + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + struct dentry *dentry; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &new_mode); + if (ret < 0) + goto out; + if (ret == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + ret = acl ? -EINVAL : 0; + goto out; + } + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + ret = -EINVAL; + goto out; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out_free; + } + + dentry = d_find_alias(inode); + if (new_mode != old_mode) { + newattrs.ia_mode = new_mode; + newattrs.ia_valid = ATTR_MODE; + ret = ceph_setattr(dentry, &newattrs); + if (ret) + goto out_dput; + } + + ret = __ceph_setxattr(dentry, name, value, size, 0); + if (ret) { + if (new_mode != old_mode) { + newattrs.ia_mode = old_mode; + newattrs.ia_valid = ATTR_MODE; + ceph_setattr(dentry, &newattrs); + } + goto out_dput; + } + + ceph_set_cached_acl(inode, type, acl); + +out_dput: + dput(dentry); +out_free: + kfree(value); +out: + return ret; +} + +int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acls_info *info) +{ + struct posix_acl *acl, *default_acl; + size_t val_size1 = 0, val_size2 = 0; + struct ceph_pagelist *pagelist = NULL; + void *tmp_buf = NULL; + int err; + + err = posix_acl_create(dir, mode, &default_acl, &acl); + if (err) + return err; + + if (acl) { + int ret = posix_acl_equiv_mode(acl, mode); + if (ret < 0) + goto out_err; + if (ret == 0) { + posix_acl_release(acl); + acl = NULL; + } + } + + if (!default_acl && !acl) + return 0; + + if (acl) + val_size1 = posix_acl_xattr_size(acl->a_count); + if (default_acl) + val_size2 = posix_acl_xattr_size(default_acl->a_count); + + err = -ENOMEM; + tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); + if (!tmp_buf) + goto out_err; + pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); + if (!pagelist) + goto out_err; + ceph_pagelist_init(pagelist); + + err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); + if (err) + goto out_err; + + ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1); + + if (acl) { + size_t len = strlen(POSIX_ACL_XATTR_ACCESS); + err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); + if (err) + goto out_err; + ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS, + len); + err = posix_acl_to_xattr(&init_user_ns, acl, + tmp_buf, val_size1); + if (err < 0) + goto out_err; + ceph_pagelist_encode_32(pagelist, val_size1); + ceph_pagelist_append(pagelist, tmp_buf, val_size1); + } + if (default_acl) { + size_t len = strlen(POSIX_ACL_XATTR_DEFAULT); + err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); + if (err) + goto out_err; + err = ceph_pagelist_encode_string(pagelist, + POSIX_ACL_XATTR_DEFAULT, len); + err = posix_acl_to_xattr(&init_user_ns, default_acl, + tmp_buf, val_size2); + if (err < 0) + goto out_err; + ceph_pagelist_encode_32(pagelist, val_size2); + ceph_pagelist_append(pagelist, tmp_buf, val_size2); + } + + kfree(tmp_buf); + + info->acl = acl; + info->default_acl = default_acl; + info->pagelist = pagelist; + return 0; + +out_err: + posix_acl_release(acl); + posix_acl_release(default_acl); + kfree(tmp_buf); + if (pagelist) + ceph_pagelist_release(pagelist); + return err; +} + +void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info) +{ + if (!inode) + return; + ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl); + ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl); +} + +void ceph_release_acls_info(struct ceph_acls_info *info) +{ + posix_acl_release(info->acl); + posix_acl_release(info->default_acl); + if (info->pagelist) + ceph_pagelist_release(info->pagelist); +} diff --git a/kernel/fs/ceph/addr.c b/kernel/fs/ceph/addr.c new file mode 100644 index 000000000..e162bcd10 --- /dev/null +++ b/kernel/fs/ceph/addr.c @@ -0,0 +1,1599 @@ +#include + +#include +#include +#include +#include +#include /* generic_writepages */ +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include "cache.h" +#include + +/* + * Ceph address space ops. + * + * There are a few funny things going on here. + * + * The page->private field is used to reference a struct + * ceph_snap_context for _every_ dirty page. This indicates which + * snapshot the page was logically dirtied in, and thus which snap + * context needs to be associated with the osd write during writeback. + * + * Similarly, struct ceph_inode_info maintains a set of counters to + * count dirty pages on the inode. In the absence of snapshots, + * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. + * + * When a snapshot is taken (that is, when the client receives + * notification that a snapshot was taken), each inode with caps and + * with dirty pages (dirty pages implies there is a cap) gets a new + * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending + * order, new snaps go to the tail). The i_wrbuffer_ref_head count is + * moved to capsnap->dirty. (Unless a sync write is currently in + * progress. In that case, the capsnap is said to be "pending", new + * writes cannot start, and the capsnap isn't "finalized" until the + * write completes (or fails) and a final size/mtime for the inode for + * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. + * + * On writeback, we must submit writes to the osd IN SNAP ORDER. So, + * we look for the first capsnap in i_cap_snaps and write out pages in + * that snap context _only_. Then we move on to the next capsnap, + * eventually reaching the "live" or "head" context (i.e., pages that + * are not yet snapped) and are writing the most recently dirtied + * pages. + * + * Invalidate and so forth must take care to ensure the dirty page + * accounting is preserved. + */ + +#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) +#define CONGESTION_OFF_THRESH(congestion_kb) \ + (CONGESTION_ON_THRESH(congestion_kb) - \ + (CONGESTION_ON_THRESH(congestion_kb) >> 2)) + +static inline struct ceph_snap_context *page_snap_context(struct page *page) +{ + if (PagePrivate(page)) + return (void *)page->private; + return NULL; +} + +/* + * Dirty a page. Optimistically adjust accounting, on the assumption + * that we won't race with invalidate. If we do, readjust. + */ +static int ceph_set_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_snap_context *snapc; + int ret; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + if (PageDirty(page)) { + dout("%p set_page_dirty %p idx %lu -- already dirty\n", + mapping->host, page, page->index); + BUG_ON(!PagePrivate(page)); + return 0; + } + + inode = mapping->host; + ci = ceph_inode(inode); + + /* + * Note that we're grabbing a snapc ref here without holding + * any locks! + */ + snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + + /* dirty the head */ + spin_lock(&ci->i_ceph_lock); + if (ci->i_head_snapc == NULL) + ci->i_head_snapc = ceph_get_snap_context(snapc); + ++ci->i_wrbuffer_ref_head; + if (ci->i_wrbuffer_ref == 0) + ihold(inode); + ++ci->i_wrbuffer_ref; + dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " + "snapc %p seq %lld (%d snaps)\n", + mapping->host, page, page->index, + ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + snapc, snapc->seq, snapc->num_snaps); + spin_unlock(&ci->i_ceph_lock); + + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + BUG_ON(PagePrivate(page)); + page->private = (unsigned long)snapc; + SetPagePrivate(page); + + ret = __set_page_dirty_nobuffers(page); + WARN_ON(!PageLocked(page)); + WARN_ON(!page->mapping); + + return ret; +} + +/* + * If we are truncating the full page (i.e. offset == 0), adjust the + * dirty page counters appropriately. Only called if there is private + * data on the page. + */ +static void ceph_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_snap_context *snapc = page_snap_context(page); + + inode = page->mapping->host; + ci = ceph_inode(inode); + + if (offset != 0 || length != PAGE_CACHE_SIZE) { + dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", + inode, page, page->index, offset, length); + return; + } + + ceph_invalidate_fscache_page(inode, page); + + if (!PagePrivate(page)) + return; + + /* + * We can get non-dirty pages here due to races between + * set_page_dirty and truncate_complete_page; just spit out a + * warning, in case we end up with accounting problems later. + */ + if (!PageDirty(page)) + pr_err("%p invalidatepage %p page not dirty\n", inode, page); + + ClearPageChecked(page); + + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); + + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); +} + +static int ceph_releasepage(struct page *page, gfp_t g) +{ + struct inode *inode = page->mapping ? page->mapping->host : NULL; + dout("%p releasepage %p idx %lu\n", inode, page, page->index); + WARN_ON(PageDirty(page)); + + /* Can we release the page from the cache? */ + if (!ceph_release_fscache_page(page, g)) + return 0; + + return !PagePrivate(page); +} + +/* + * read a single page, without unlocking it. + */ +static int readpage_nounlock(struct file *filp, struct page *page) +{ + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int err = 0; + u64 off = page_offset(page); + u64 len = PAGE_CACHE_SIZE; + + if (off >= i_size_read(inode)) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + /* + * Uptodate inline data should have been added + * into page cache while getting Fcr caps. + */ + if (off == 0) + return -EINVAL; + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + + err = ceph_readpage_from_fscache(inode, page); + if (err == 0) + goto out; + + dout("readpage inode %p file %p page %p index %lu\n", + inode, filp, page, page->index); + err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + off, &len, + ci->i_truncate_seq, ci->i_truncate_size, + &page, 1, 0); + if (err == -ENOENT) + err = 0; + if (err < 0) { + SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); + goto out; + } + if (err < PAGE_CACHE_SIZE) + /* zero fill remainder of page */ + zero_user_segment(page, err, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); + +out: + return err < 0 ? err : 0; +} + +static int ceph_readpage(struct file *filp, struct page *page) +{ + int r = readpage_nounlock(filp, page); + unlock_page(page); + return r; +} + +/* + * Finish an async read(ahead) op. + */ +static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) +{ + struct inode *inode = req->r_inode; + struct ceph_osd_data *osd_data; + int rc = req->r_result; + int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; + int i; + + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + + /* unlock all pages, zeroing any data we didn't read */ + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + for (i = 0; i < num_pages; i++) { + struct page *page = osd_data->pages[i]; + + if (rc < 0) + goto unlock; + if (bytes < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = bytes < 0 ? 0 : bytes; + zero_user_segment(page, s, PAGE_CACHE_SIZE); + } + dout("finish_read %p uptodate %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); +unlock: + unlock_page(page); + page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; + } + kfree(osd_data->pages); +} + +static void ceph_unlock_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + unlock_page(pages[i]); +} + +/* + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. + */ +static int start_read(struct inode *inode, struct list_head *page_list, int max) +{ + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_vino vino; + struct ceph_osd_request *req; + u64 off; + u64 len; + int i; + struct page **pages; + pgoff_t next_index; + int nr_pages = 0; + int ret; + + off = (u64) page_offset(page); + + /* count pages */ + next_index = page->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index != next_index) + break; + nr_pages++; + next_index++; + if (max && nr_pages == max) + break; + } + len = nr_pages << PAGE_CACHE_SHIFT; + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, + off, len); + vino = ceph_vino(inode); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 0, 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* build page vector */ + nr_pages = calc_pages_for(0, len); + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + ret = -ENOMEM; + if (!pages) + goto out; + for (i = 0; i < nr_pages; ++i) { + page = list_entry(page_list->prev, struct page, lru); + BUG_ON(PageLocked(page)); + list_del(&page->lru); + + dout("start_read %p adding %p idx %lu\n", inode, page, + page->index); + if (add_to_page_cache_lru(page, &inode->i_data, page->index, + GFP_NOFS)) { + ceph_fscache_uncache_page(inode, page); + page_cache_release(page); + dout("start_read %p add_to_page_cache failed %p\n", + inode, page); + nr_pages = i; + goto out_pages; + } + pages[i] = page; + } + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); + req->r_callback = finish_read; + req->r_inode = inode; + + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); + ret = ceph_osdc_start_request(osdc, req, false); + if (ret < 0) + goto out_pages; + ceph_osdc_put_request(req); + return nr_pages; + +out_pages: + ceph_unlock_page_vector(pages, nr_pages); + ceph_release_page_vector(pages, nr_pages); +out: + ceph_osdc_put_request(req); + return ret; +} + + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int rc = 0; + int max = 0; + + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) + return -EINVAL; + + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, + &nr_pages); + + if (rc == 0) + goto out; + + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + + dout("readpages %p file %p nr_pages %d max %d\n", inode, + file, nr_pages, + max); + while (!list_empty(page_list)) { + rc = start_read(inode, page_list, max); + if (rc < 0) + goto out; + BUG_ON(rc == 0); + } +out: + ceph_fscache_readpages_cancel(inode, page_list); + + dout("readpages %p file %p ret %d\n", inode, file, rc); + return rc; +} + +/* + * Get ref for the oldest snapc for an inode with dirty data... that is, the + * only snap context we are allowed to write back. + */ +static struct ceph_snap_context *get_oldest_context(struct inode *inode, + u64 *snap_size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = NULL; + struct ceph_cap_snap *capsnap = NULL; + + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, + capsnap->context, capsnap->dirty_pages); + if (capsnap->dirty_pages) { + snapc = ceph_get_snap_context(capsnap->context); + if (snap_size) + *snap_size = capsnap->size; + break; + } + } + if (!snapc && ci->i_wrbuffer_ref_head) { + snapc = ceph_get_snap_context(ci->i_head_snapc); + dout(" head snapc %p has %d dirty pages\n", + snapc, ci->i_wrbuffer_ref_head); + } + spin_unlock(&ci->i_ceph_lock); + return snapc; +} + +/* + * Write a single page, but leave the page locked. + * + * If we get a write error, set the page error bit, but still adjust the + * dirty page accounting (i.e., page is no longer dirty). + */ +static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_fs_client *fsc; + struct ceph_osd_client *osdc; + struct ceph_snap_context *snapc, *oldest; + loff_t page_off = page_offset(page); + long writeback_stat; + u64 truncate_size, snap_size = 0; + u32 truncate_seq; + int err = 0, len = PAGE_CACHE_SIZE; + + dout("writepage %p idx %lu\n", page, page->index); + + if (!page->mapping || !page->mapping->host) { + dout("writepage %p - no mapping\n", page); + return -EFAULT; + } + inode = page->mapping->host; + ci = ceph_inode(inode); + fsc = ceph_inode_to_client(inode); + osdc = &fsc->client->osdc; + + /* verify this is a writeable snap context */ + snapc = page_snap_context(page); + if (snapc == NULL) { + dout("writepage %p page %p not dirty?\n", inode, page); + goto out; + } + oldest = get_oldest_context(inode, &snap_size); + if (snapc->seq > oldest->seq) { + dout("writepage %p page %p snapc %p not writeable - noop\n", + inode, page, snapc); + /* we should only noop if called by kswapd */ + WARN_ON((current->flags & PF_MEMALLOC) == 0); + ceph_put_snap_context(oldest); + goto out; + } + ceph_put_snap_context(oldest); + + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + + /* is this a partial page at end of file? */ + if (page_off >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); + goto out; + } + if (snap_size < page_off + len) + len = snap_size - page_off; + + dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", + inode, page, page->index, page_off, len, snapc); + + writeback_stat = atomic_long_inc_return(&fsc->writeback_count); + if (writeback_stat > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + + ceph_readpage_to_fscache(inode, page); + + set_page_writeback(page); + err = ceph_osdc_writepages(osdc, ceph_vino(inode), + &ci->i_layout, snapc, + page_off, len, + truncate_seq, truncate_size, + &inode->i_mtime, &page, 1); + if (err < 0) { + dout("writepage setting page/mapping error %d %p\n", err, page); + SetPageError(page); + mapping_set_error(&inode->i_data, err); + if (wbc) + wbc->pages_skipped++; + } else { + dout("writepage cleaned page %p\n", page); + err = 0; /* vfs expects us to return 0 */ + } + page->private = 0; + ClearPagePrivate(page); + end_page_writeback(page); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); /* page's reference */ +out: + return err; +} + +static int ceph_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + struct inode *inode = page->mapping->host; + BUG_ON(!inode); + ihold(inode); + err = writepage_nounlock(page, wbc); + unlock_page(page); + iput(inode); + return err; +} + + +/* + * lame release_pages helper. release_pages() isn't exported to + * modules. + */ +static void ceph_release_pages(struct page **pages, int num) +{ + struct pagevec pvec; + int i; + + pagevec_init(&pvec, 0); + for (i = 0; i < num; i++) { + if (pagevec_add(&pvec, pages[i]) == 0) + pagevec_release(&pvec); + } + pagevec_release(&pvec); +} + +/* + * async writeback completion handler. + * + * If we get an error, set the mapping error bit, but not the individual + * page error bits. + */ +static void writepages_finish(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + struct inode *inode = req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_data *osd_data; + unsigned wrote; + struct page *page; + int num_pages; + int i; + struct ceph_snap_context *snapc = req->r_snapc; + struct address_space *mapping = inode->i_mapping; + int rc = req->r_result; + u64 bytes = req->r_ops[0].extent.length; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + long writeback_stat; + unsigned issued = ceph_caps_issued(ci); + + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + if (rc >= 0) { + /* + * Assume we wrote the pages we originally sent. The + * osd might reply with fewer pages if our writeback + * raced with a truncation and was adjusted at the osd, + * so don't believe the reply. + */ + wrote = num_pages; + } else { + wrote = 0; + mapping_set_error(mapping, rc); + } + dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", + inode, rc, bytes, wrote); + + /* clean all pages */ + for (i = 0; i < num_pages; i++) { + page = osd_data->pages[i]; + BUG_ON(!page); + WARN_ON(!PageUptodate(page)); + + writeback_stat = + atomic_long_dec_return(&fsc->writeback_count); + if (writeback_stat < + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(&fsc->backing_dev_info, + BLK_RW_ASYNC); + + ceph_put_snap_context(page_snap_context(page)); + page->private = 0; + ClearPagePrivate(page); + dout("unlocking %d %p\n", i, page); + end_page_writeback(page); + + /* + * We lost the cache cap, need to truncate the page before + * it is unlocked, otherwise we'd truncate it later in the + * page truncation thread, possibly losing some data that + * raced its way in + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + generic_error_remove_page(inode->i_mapping, page); + + unlock_page(page); + } + dout("%p wrote+cleaned %d pages\n", inode, wrote); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); + + ceph_release_pages(osd_data->pages, num_pages); + if (osd_data->pages_from_pool) + mempool_free(osd_data->pages, + ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); + else + kfree(osd_data->pages); + ceph_osdc_put_request(req); +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino = ceph_vino(inode); + pgoff_t index, start, end; + int range_whole = 0; + int should_loop = 1; + pgoff_t max_pages = 0, max_pages_ever = 0; + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; + struct pagevec pvec; + int done = 0; + int rc = 0; + unsigned wsize = 1 << inode->i_blkbits; + struct ceph_osd_request *req = NULL; + int do_sync = 0; + u64 truncate_size, snap_size; + u32 truncate_seq; + + /* + * Include a 'sync' in the OSD request if this is a data + * integrity write (e.g., O_SYNC write or fsync()), or if our + * cap is being revoked. + */ + if ((wbc->sync_mode == WB_SYNC_ALL) || + ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + do_sync = 1; + dout("writepages_start %p dosync=%d (mode=%s)\n", + inode, do_sync, + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + pr_warn("writepage_start %p on forced umount\n", inode); + return -EIO; /* we're in a forced umount, don't write! */ + } + if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + wsize = fsc->mount_options->wsize; + if (wsize < PAGE_CACHE_SIZE) + wsize = PAGE_CACHE_SIZE; + max_pages_ever = wsize >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + + /* where to start/end? */ + if (wbc->range_cyclic) { + start = mapping->writeback_index; /* Start from prev offset */ + end = -1; + dout(" cyclic, start at %lu\n", start); + } else { + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + should_loop = 0; + dout(" not cyclic, %lu to %lu\n", start, end); + } + index = start; + +retry: + /* find oldest snap context with dirty data */ + ceph_put_snap_context(snapc); + snap_size = 0; + snapc = get_oldest_context(inode, &snap_size); + if (!snapc) { + /* hmm, why does writepages get called when there + is no dirty data? */ + dout(" no snap context with dirty data?\n"); + goto out; + } + if (snap_size == 0) + snap_size = i_size_read(inode); + dout(" oldest snapc is %p seq %lld (%d snaps)\n", + snapc, snapc->seq, snapc->num_snaps); + + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + + if (last_snapc && snapc != last_snapc) { + /* if we switched to a newer snapc, restart our scan at the + * start of the original file range. */ + dout(" snapc differs from last pass, restarting at %lu\n", + index); + index = start; + } + last_snapc = snapc; + + while (!done && index <= end) { + unsigned i; + int first; + pgoff_t next; + int pvec_pages, locked_pages; + struct page **pages = NULL; + mempool_t *pool = NULL; /* Becomes non-null if mempool used */ + struct page *page; + int want; + u64 offset, len; + long writeback_stat; + + next = 0; + locked_pages = 0; + max_pages = max_pages_ever; + +get_more_pages: + first = -1; + want = min(end - index, + min((pgoff_t)PAGEVEC_SIZE, + max_pages - (pgoff_t)locked_pages) - 1) + + 1; + pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + want); + dout("pagevec_lookup_tag got %d\n", pvec_pages); + if (!pvec_pages && !locked_pages) + break; + for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { + page = pvec.pages[i]; + dout("? %p idx %lu\n", page, page->index); + if (locked_pages == 0) + lock_page(page); /* first page */ + else if (!trylock_page(page)) + break; + + /* only dirty pages, or our accounting breaks */ + if (unlikely(!PageDirty(page)) || + unlikely(page->mapping != mapping)) { + dout("!dirty or !mapping %p\n", page); + unlock_page(page); + break; + } + if (!wbc->range_cyclic && page->index > end) { + dout("end of range %p\n", page); + done = 1; + unlock_page(page); + break; + } + if (next && (page->index != next)) { + dout("not consecutive %p\n", page); + unlock_page(page); + break; + } + if (wbc->sync_mode != WB_SYNC_NONE) { + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); + } + if (page_offset(page) >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); + done = 1; + unlock_page(page); + break; + } + if (PageWriteback(page)) { + dout("%p under writeback\n", page); + unlock_page(page); + break; + } + + /* only if matching snap context */ + pgsnapc = page_snap_context(page); + if (pgsnapc->seq > snapc->seq) { + dout("page snapc %p %lld > oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); + unlock_page(page); + if (!locked_pages) + continue; /* keep looking for snap */ + break; + } + + if (!clear_page_dirty_for_io(page)) { + dout("%p !clear_page_dirty_for_io\n", page); + unlock_page(page); + break; + } + + /* + * We have something to write. If this is + * the first locked page this time through, + * allocate an osd request and a page array + * that it will use. + */ + if (locked_pages == 0) { + BUG_ON(pages); + /* prepare async write request */ + offset = (u64)page_offset(page); + len = wsize; + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, + do_sync ? 2 : 1, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, true); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + unlock_page(page); + break; + } + + if (do_sync) + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + + req->r_callback = writepages_finish; + req->r_inode = inode; + + max_pages = calc_pages_for(0, (u64)len); + pages = kmalloc(max_pages * sizeof (*pages), + GFP_NOFS); + if (!pages) { + pool = fsc->wb_pagevec_pool; + pages = mempool_alloc(pool, GFP_NOFS); + BUG_ON(!pages); + } + } + + /* note position of first page in pvec */ + if (first < 0) + first = i; + dout("%p will write page %p idx %lu\n", + inode, page, page->index); + + writeback_stat = + atomic_long_inc_return(&fsc->writeback_count); + if (writeback_stat > CONGESTION_ON_THRESH( + fsc->mount_options->congestion_kb)) { + set_bdi_congested(&fsc->backing_dev_info, + BLK_RW_ASYNC); + } + + set_page_writeback(page); + pages[locked_pages] = page; + locked_pages++; + next = page->index + 1; + } + + /* did we get anything? */ + if (!locked_pages) + goto release_pvec_pages; + if (i) { + int j; + BUG_ON(!locked_pages || first < 0); + + if (pvec_pages && i == pvec_pages && + locked_pages < max_pages) { + dout("reached end pvec, trying for more\n"); + pagevec_reinit(&pvec); + goto get_more_pages; + } + + /* shift unused pages over in the pvec... we + * will need to release them below. */ + for (j = i; j < pvec_pages; j++) { + dout(" pvec leftover page %p\n", + pvec.pages[j]); + pvec.pages[j-i+first] = pvec.pages[j]; + } + pvec.nr -= i-first; + } + + /* Format the osd request message and submit the write */ + + offset = page_offset(pages[0]); + len = min(snap_size - offset, + (u64)locked_pages << PAGE_CACHE_SHIFT); + dout("writepages got %d pages at %llu~%llu\n", + locked_pages, offset, len); + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + !!pool, false); + + pages = NULL; /* request message now owns the pages array */ + pool = NULL; + + /* Update the write op length in case we changed it */ + + osd_req_op_extent_update(req, 0, len); + + vino = ceph_vino(inode); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); + + rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); + BUG_ON(rc); + req = NULL; + + /* continue? */ + index = next; + wbc->nr_to_write -= locked_pages; + if (wbc->nr_to_write <= 0) + done = 1; + +release_pvec_pages: + dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, + pvec.nr ? pvec.pages[0] : NULL); + pagevec_release(&pvec); + + if (locked_pages && !done) + goto retry; + } + + if (should_loop && !done) { + /* more to do; loop back to beginning of file */ + dout("writepages looping back to beginning of file\n"); + should_loop = 0; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + +out: + if (req) + ceph_osdc_put_request(req); + ceph_put_snap_context(snapc); + dout("writepages done, rc = %d\n", rc); + return rc; +} + + + +/* + * See if a given @snapc is either writeable, or already written. + */ +static int context_is_writeable_or_written(struct inode *inode, + struct ceph_snap_context *snapc) +{ + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); + int ret = !oldest || snapc->seq <= oldest->seq; + + ceph_put_snap_context(oldest); + return ret; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + * + * called with page locked. + * return success with page locked, + * or any failure (incl -EAGAIN) with page unlocked. + */ +static int ceph_update_writeable_page(struct file *file, + loff_t pos, unsigned len, + struct page *page) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + loff_t page_off = pos & PAGE_CACHE_MASK; + int pos_in_page = pos & ~PAGE_CACHE_MASK; + int end_in_page = pos_in_page + len; + loff_t i_size; + int r; + struct ceph_snap_context *snapc, *oldest; + +retry_locked: + /* writepages currently holds page lock, but if we change that later, */ + wait_on_page_writeback(page); + + /* check snap context */ + BUG_ON(!ci->i_snap_realm); + down_read(&mdsc->snap_rwsem); + BUG_ON(!ci->i_snap_realm->cached_context); + snapc = page_snap_context(page); + if (snapc && snapc != ci->i_head_snapc) { + /* + * this page is already dirty in another (older) snap + * context! is it writeable now? + */ + oldest = get_oldest_context(inode, NULL); + up_read(&mdsc->snap_rwsem); + + if (snapc->seq > oldest->seq) { + ceph_put_snap_context(oldest); + dout(" page %p snapc %p not current or oldest\n", + page, snapc); + /* + * queue for writeback, and wait for snapc to + * be writeable or written + */ + snapc = ceph_get_snap_context(snapc); + unlock_page(page); + ceph_queue_writeback(inode); + r = wait_event_interruptible(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + if (r == -ERESTARTSYS) + return r; + return -EAGAIN; + } + ceph_put_snap_context(oldest); + + /* yay, writeable, do it now (without dropping page lock) */ + dout(" page %p snapc %p not current, but oldest\n", + page, snapc); + if (!clear_page_dirty_for_io(page)) + goto retry_locked; + r = writepage_nounlock(page, NULL); + if (r < 0) + goto fail_nosnap; + goto retry_locked; + } + + if (PageUptodate(page)) { + dout(" page %p already uptodate\n", page); + return 0; + } + + /* full page? */ + if (pos_in_page == 0 && len == PAGE_CACHE_SIZE) + return 0; + + /* past end of file? */ + i_size = inode->i_size; /* caller holds i_mutex */ + + if (page_off >= i_size || + (pos_in_page == 0 && (pos+len) >= i_size && + end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { + dout(" zeroing %p 0 - %d and %d - %d\n", + page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE); + zero_user_segments(page, + 0, pos_in_page, + end_in_page, PAGE_CACHE_SIZE); + return 0; + } + + /* we need to read it. */ + up_read(&mdsc->snap_rwsem); + r = readpage_nounlock(file, page); + if (r < 0) + goto fail_nosnap; + goto retry_locked; +fail_nosnap: + unlock_page(page); + return r; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + */ +static int ceph_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = file_inode(file); + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int r; + + do { + /* get a page */ + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + return -ENOMEM; + *pagep = page; + + dout("write_begin file %p inode %p page %p %d~%d\n", file, + inode, page, (int)pos, (int)len); + + r = ceph_update_writeable_page(file, pos, len, page); + if (r < 0) + page_cache_release(page); + else + *pagep = page; + } while (r == -EAGAIN); + + return r; +} + +/* + * we don't do anything in here that simple_write_end doesn't do + * except adjust dirty page accounting and drop read lock on + * mdsc->snap_rwsem. + */ +static int ceph_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int check_cap = 0; + + dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, + inode, page, (int)pos, (int)copied, (int)len); + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) + zero_user_segment(page, from+copied, len); + + /* did file size increase? */ + /* (no need for i_size_read(); we caller holds i_mutex */ + if (pos+copied > inode->i_size) + check_cap = ceph_inode_set_size(inode, pos+copied); + + if (!PageUptodate(page)) + SetPageUptodate(page); + + set_page_dirty(page); + + unlock_page(page); + up_read(&mdsc->snap_rwsem); + page_cache_release(page); + + if (check_cap) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + + return copied; +} + +/* + * we set .direct_IO to indicate direct io is supported, but since we + * intercept O_DIRECT reads and writes early, this function should + * never get called. + */ +static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos) +{ + WARN_ON(1); + return -EINVAL; +} + +const struct address_space_operations ceph_aops = { + .readpage = ceph_readpage, + .readpages = ceph_readpages, + .writepage = ceph_writepage, + .writepages = ceph_writepages_start, + .write_begin = ceph_write_begin, + .write_end = ceph_write_end, + .set_page_dirty = ceph_set_page_dirty, + .invalidatepage = ceph_invalidatepage, + .releasepage = ceph_releasepage, + .direct_IO = ceph_direct_io, +}; + + +/* + * vm ops + */ +static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + struct page *pinned_page = NULL; + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; + int want, got, ret; + + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, + -1, &got, &pinned_page); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || + ci->i_inline_version == CEPH_INLINE_NONE) + ret = filemap_fault(vma, vmf); + else + ret = -EAGAIN; + + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + if (pinned_page) + page_cache_release(pinned_page); + ceph_put_cap_refs(ci, got); + + if (ret != -EAGAIN) + return ret; + + /* read inline data */ + if (off >= PAGE_CACHE_SIZE) { + /* does not support inline data > PAGE_SIZE */ + ret = VM_FAULT_SIGBUS; + } else { + int ret1; + struct address_space *mapping = inode->i_mapping; + struct page *page = find_or_create_page(mapping, 0, + mapping_gfp_mask(mapping) & + ~__GFP_FS); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + ret1 = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret1 < 0 || off >= i_size_read(inode)) { + unlock_page(page); + page_cache_release(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + if (ret1 < PAGE_CACHE_SIZE) + zero_user_segment(page, ret1, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + SetPageUptodate(page); + vmf->page = page; + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; + } +out: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ret); + return ret; +} + +/* + * Reuse write_begin here for simplicity. + */ +static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct page *page = vmf->page; + loff_t off = page_offset(page); + loff_t size = i_size_read(inode); + size_t len; + int want, got, ret; + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + struct page *locked_page = NULL; + if (off == 0) { + lock_page(page); + locked_page = page; + } + ret = ceph_uninline_data(vma->vm_file, locked_page); + if (locked_page) + unlock_page(locked_page); + if (ret < 0) + return VM_FAULT_SIGBUS; + } + + if (off + PAGE_CACHE_SIZE <= size) + len = PAGE_CACHE_SIZE; + else + len = size & ~PAGE_CACHE_MASK; + + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", + inode, ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", + inode, off, len, ceph_cap_string(got)); + + /* Update time before taking page lock */ + file_update_time(vma->vm_file); + + lock_page(page); + + ret = VM_FAULT_NOPAGE; + if ((off > size) || + (page->mapping != inode->i_mapping)) + goto out; + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret == 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + up_read(&mdsc->snap_rwsem); + ret = VM_FAULT_LOCKED; + } else { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; + } +out: + if (ret != VM_FAULT_LOCKED) + unlock_page(page); + if (ret == VM_FAULT_LOCKED || + ci->i_inline_version != CEPH_INLINE_NONE) { + int dirty; + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + + return ret; +} + +void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (locked_page) { + page = locked_page; + } else { + if (i_size_read(inode) == 0) + return; + page = find_or_create_page(mapping, 0, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return; + if (PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return; + } + } + + dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n", + inode, ceph_vinop(inode), len, locked_page); + + if (len > 0) { + void *kaddr = kmap_atomic(page); + memcpy(kaddr, data, len); + kunmap_atomic(kaddr); + } + + if (page != locked_page) { + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } +} + +int ceph_uninline_data(struct file *filp, struct page *locked_page) +{ + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct page *page = NULL; + u64 len, inline_version; + int err = 0; + bool from_pagecache = false; + + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + dout("uninline_data %p %llx.%llx inline_version %llu\n", + inode, ceph_vinop(inode), inline_version); + + if (inline_version == 1 || /* initial version, no data */ + inline_version == CEPH_INLINE_NONE) + goto out; + + if (locked_page) { + page = locked_page; + WARN_ON(!PageUptodate(page)); + } else if (ceph_caps_issued(ci) & + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { + page = find_get_page(inode->i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + from_pagecache = true; + lock_page(page); + } else { + page_cache_release(page); + page = NULL; + } + } + } + + if (page) { + len = i_size_read(inode); + if (len > PAGE_CACHE_SIZE) + len = PAGE_CACHE_SIZE; + } else { + page = __page_cache_alloc(GFP_NOFS); + if (!page) { + err = -ENOMEM; + goto out; + } + err = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (err < 0) { + /* no inline data */ + if (err == -ENODATA) + err = 0; + goto out; + } + len = err; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 0, 1, + CEPH_OSD_OP_CREATE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + ci->i_snap_realm->cached_context, + 0, 0, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_put_request(req); + if (err < 0) + goto out; + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 1, 3, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + ci->i_snap_realm->cached_context, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + + { + __le64 xattr_buf = cpu_to_le64(inline_version); + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &xattr_buf, + sizeof(xattr_buf), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + } + + { + char xattr_buf[32]; + int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), + "%llu", inline_version); + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", + xattr_buf, xattr_len, 0, 0); + if (err) + goto out_put; + } + + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); +out_put: + ceph_osdc_put_request(req); + if (err == -ECANCELED) + err = 0; +out: + if (page && page != locked_page) { + if (from_pagecache) { + unlock_page(page); + page_cache_release(page); + } else + __free_pages(page, 0); + } + + dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", + inode, ceph_vinop(inode), inline_version, err); + return err; +} + +static struct vm_operations_struct ceph_vmops = { + .fault = ceph_filemap_fault, + .page_mkwrite = ceph_page_mkwrite, +}; + +int ceph_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ceph_vmops; + return 0; +} diff --git a/kernel/fs/ceph/cache.c b/kernel/fs/ceph/cache.c new file mode 100644 index 000000000..834f9f372 --- /dev/null +++ b/kernel/fs/ceph/cache.c @@ -0,0 +1,402 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include "super.h" +#include "cache.h" + +struct ceph_aux_inode { + struct timespec mtime; + loff_t size; +}; + +struct fscache_netfs ceph_cache_netfs = { + .name = "ceph", + .version = 0, +}; + +static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_fs_client* fsc = cookie_netfs_data; + uint16_t klen; + + klen = sizeof(fsc->client->fsid); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &fsc->client->fsid, klen); + return klen; +} + +static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { + .name = "CEPH.fsid", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = ceph_fscache_session_get_key, +}; + +int ceph_fscache_register(void) +{ + return fscache_register_netfs(&ceph_cache_netfs); +} + +void ceph_fscache_unregister(void) +{ + fscache_unregister_netfs(&ceph_cache_netfs); +} + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, + &ceph_fscache_fsid_object_def, + fsc, true); + + if (fsc->fscache == NULL) { + pr_err("Unable to resgister fsid: %p fscache cookie", fsc); + return 0; + } + + fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); + if (fsc->revalidate_wq == NULL) + return -ENOMEM; + + return 0; +} + +static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + uint16_t klen; + + /* use ceph virtual inode (id + snaphot) */ + klen = sizeof(ci->i_vino); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &ci->i_vino, klen); + return klen; +} + +static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct ceph_aux_inode aux; + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + memcpy(buffer, &aux, sizeof(aux)); + + return sizeof(aux); +} + +static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + *size = inode->i_size; +} + +static enum fscache_checkaux ceph_fscache_inode_check_aux( + void *cookie_netfs_data, const void *data, uint16_t dlen) +{ + struct ceph_aux_inode aux; + struct ceph_inode_info* ci = cookie_netfs_data; + struct inode* inode = &ci->vfs_inode; + + if (dlen != sizeof(aux)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + if (memcmp(data, &aux, sizeof(aux)) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + dout("ceph inode 0x%p cached okay", ci); + return FSCACHE_CHECKAUX_OKAY; +} + +static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) +{ + struct ceph_inode_info* ci = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dout("ceph inode 0x%p now uncached", ci); + + while (1) { + nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +static const struct fscache_cookie_def ceph_fscache_inode_object_def = { + .name = "CEPH.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = ceph_fscache_inode_get_key, + .get_attr = ceph_fscache_inode_get_attr, + .get_aux = ceph_fscache_inode_get_aux, + .check_aux = ceph_fscache_inode_check_aux, + .now_uncached = ceph_fscache_inode_now_uncached, +}; + +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci) +{ + struct inode* inode = &ci->vfs_inode; + + /* No caching for filesystem */ + if (fsc->fscache == NULL) + return; + + /* Only cache for regular files that are read only */ + if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + return; + + /* Avoid multiple racing open requests */ + mutex_lock(&inode->i_mutex); + + if (ci->fscache) + goto done; + + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci, true); + fscache_check_consistency(ci->fscache); +done: + mutex_unlock(&inode->i_mutex); + +} + +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ + struct fscache_cookie* cookie; + + if ((cookie = ci->fscache) == NULL) + return; + + ci->fscache = NULL; + + fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); + fscache_relinquish_cookie(cookie, 0); +} + +static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); +} + +static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +static inline int cache_valid(struct ceph_inode_info *ci) +{ + return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && + (ci->i_fscache_gen == ci->i_rdcache_gen)); +} + + +/* Atempt to read from the fscache, + * + * This function is called from the readpage_nounlock context. DO NOT attempt to + * unlock the page here (or in the callback). + */ +int ceph_readpage_from_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(ci->fscache, page, + ceph_vfs_readpage_complete, NULL, + GFP_KERNEL); + + switch (ret) { + case 0: /* Page found */ + dout("page read submitted\n"); + return 0; + case -ENOBUFS: /* Pages were not found, and can't be */ + case -ENODATA: /* Pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, + ceph_vfs_readpage_complete_unlock, + NULL, mapping_gfp_mask(mapping)); + + switch (ret) { + case 0: /* All pages found */ + dout("all-page read submitted\n"); + return 0; + case -ENOBUFS: /* Some pages were not found, and can't be */ + case -ENODATA: /* some pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +void ceph_readpage_to_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!PageFsCache(page)) + return; + + if (!cache_valid(ci)) + return; + + ret = fscache_write_page(ci->fscache, page, GFP_KERNEL); + if (ret) + fscache_uncache_page(ci->fscache, page); +} + +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!PageFsCache(page)) + return; + + fscache_wait_on_page_write(ci->fscache, page); + fscache_uncache_page(ci->fscache, page); +} + +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ + if (fsc->revalidate_wq) + destroy_workqueue(fsc->revalidate_wq); + + fscache_relinquish_cookie(fsc->fscache, 0); + fsc->fscache = NULL; +} + +static void ceph_revalidate_work(struct work_struct *work) +{ + int issued; + u32 orig_gen; + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_revalidate_work); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + orig_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); + + if (!(issued & CEPH_CAP_FILE_CACHE)) { + dout("revalidate_work lost cache before validation %p\n", + inode); + goto out; + } + + if (!fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + + spin_lock(&ci->i_ceph_lock); + /* Update the new valid generation (backwards sanity check too) */ + if (orig_gen > ci->i_fscache_gen) { + ci->i_fscache_gen = orig_gen; + } + spin_unlock(&ci->i_ceph_lock); + +out: + iput(&ci->vfs_inode); +} + +void ceph_queue_revalidate(struct inode *inode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + + if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + return; + + ihold(inode); + + if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, + &ci->i_revalidate_work)) { + dout("ceph_queue_revalidate %p\n", inode); + } else { + dout("ceph_queue_revalidate %p failed\n)", inode); + iput(inode); + } +} + +void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ + ci->fscache = NULL; + /* The first load is verifed cookie open time */ + ci->i_fscache_gen = 1; + INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); +} diff --git a/kernel/fs/ceph/cache.h b/kernel/fs/ceph/cache.h new file mode 100644 index 000000000..5ac591bd0 --- /dev/null +++ b/kernel/fs/ceph/cache.h @@ -0,0 +1,182 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _CEPH_CACHE_H +#define _CEPH_CACHE_H + +#ifdef CONFIG_CEPH_FSCACHE + +extern struct fscache_netfs ceph_cache_netfs; + +int ceph_fscache_register(void); +void ceph_fscache_unregister(void); + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); + +void ceph_fscache_inode_init(struct ceph_inode_info *ci); +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci); +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); + +int ceph_readpage_from_fscache(struct inode *inode, struct page *page); +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +void ceph_readpage_to_fscache(struct inode *inode, struct page *page); +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); +void ceph_queue_revalidate(struct inode *inode); + +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_attr_changed(ci->fscache); +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(ceph_inode(inode)->fscache); +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_uncache_page(ci->fscache, page); +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + struct inode* inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_maybe_release_page(ci->fscache, page, gfp); +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_readpages_cancel(ci->fscache, pages); +} + +#else + +static inline int ceph_fscache_register(void) +{ + return 0; +} + +static inline void ceph_fscache_unregister(void) +{ +} + +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + return 0; +} + +static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ +} + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ +} + +static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, + struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *pages) +{ +} + +static inline int ceph_readpage_from_fscache(struct inode* inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void ceph_readpage_to_fscache(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ +} + +static inline void ceph_invalidate_fscache_page(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + return 1; +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ +} + +static inline void ceph_queue_revalidate(struct inode *inode) +{ +} + +#endif + +#endif diff --git a/kernel/fs/ceph/caps.c b/kernel/fs/ceph/caps.c new file mode 100644 index 000000000..be5ea6af8 --- /dev/null +++ b/kernel/fs/ceph/caps.c @@ -0,0 +1,3456 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include "cache.h" +#include +#include + +/* + * Capability management + * + * The Ceph metadata servers control client access to inode metadata + * and file data by issuing capabilities, granting clients permission + * to read and/or write both inode field and file data to OSDs + * (storage nodes). Each capability consists of a set of bits + * indicating which operations are allowed. + * + * If the client holds a *_SHARED cap, the client has a coherent value + * that can be safely read from the cached inode. + * + * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the + * client is allowed to change inode attributes (e.g., file size, + * mtime), note its dirty state in the ceph_cap, and asynchronously + * flush that metadata change to the MDS. + * + * In the event of a conflicting operation (perhaps by another + * client), the MDS will revoke the conflicting client capabilities. + * + * In order for a client to cache an inode, it must hold a capability + * with at least one MDS server. When inodes are released, release + * notifications are batched and periodically sent en masse to the MDS + * cluster to release server state. + */ + + +/* + * Generate readable cap strings for debugging output. + */ +#define MAX_CAP_STR 20 +static char cap_str[MAX_CAP_STR][40]; +static DEFINE_SPINLOCK(cap_str_lock); +static int last_cap_str; + +static char *gcap_string(char *s, int c) +{ + if (c & CEPH_CAP_GSHARED) + *s++ = 's'; + if (c & CEPH_CAP_GEXCL) + *s++ = 'x'; + if (c & CEPH_CAP_GCACHE) + *s++ = 'c'; + if (c & CEPH_CAP_GRD) + *s++ = 'r'; + if (c & CEPH_CAP_GWR) + *s++ = 'w'; + if (c & CEPH_CAP_GBUFFER) + *s++ = 'b'; + if (c & CEPH_CAP_GLAZYIO) + *s++ = 'l'; + return s; +} + +const char *ceph_cap_string(int caps) +{ + int i; + char *s; + int c; + + spin_lock(&cap_str_lock); + i = last_cap_str++; + if (last_cap_str == MAX_CAP_STR) + last_cap_str = 0; + spin_unlock(&cap_str_lock); + + s = cap_str[i]; + + if (caps & CEPH_CAP_PIN) + *s++ = 'p'; + + c = (caps >> CEPH_CAP_SAUTH) & 3; + if (c) { + *s++ = 'A'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SLINK) & 3; + if (c) { + *s++ = 'L'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SXATTR) & 3; + if (c) { + *s++ = 'X'; + s = gcap_string(s, c); + } + + c = caps >> CEPH_CAP_SFILE; + if (c) { + *s++ = 'F'; + s = gcap_string(s, c); + } + + if (s == cap_str[i]) + *s++ = '-'; + *s = 0; + return cap_str[i]; +} + +void ceph_caps_init(struct ceph_mds_client *mdsc) +{ + INIT_LIST_HEAD(&mdsc->caps_list); + spin_lock_init(&mdsc->caps_list_lock); +} + +void ceph_caps_finalize(struct ceph_mds_client *mdsc) +{ + struct ceph_cap *cap; + + spin_lock(&mdsc->caps_list_lock); + while (!list_empty(&mdsc->caps_list)) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + mdsc->caps_total_count = 0; + mdsc->caps_avail_count = 0; + mdsc->caps_use_count = 0; + mdsc->caps_reserve_count = 0; + mdsc->caps_min_count = 0; + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) +{ + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_min_count += delta; + BUG_ON(mdsc->caps_min_count < 0); + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need) +{ + int i; + struct ceph_cap *cap; + int have; + int alloc = 0; + LIST_HEAD(newcaps); + + dout("reserve caps ctx=%p need=%d\n", ctx, need); + + /* first reserve any caps that are already allocated */ + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count >= need) + have = need; + else + have = mdsc->caps_avail_count; + mdsc->caps_avail_count -= have; + mdsc->caps_reserve_count += have; + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + + for (i = have; i < need; i++) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (!cap) + break; + list_add(&cap->caps_item, &newcaps); + alloc++; + } + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); + + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_total_count += alloc; + mdsc->caps_reserve_count += alloc; + list_splice(&newcaps, &mdsc->caps_list); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + + ctx->count = need; + dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", + ctx, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); +} + +int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); + if (ctx->count) { + spin_lock(&mdsc->caps_list_lock); + BUG_ON(mdsc->caps_reserve_count < ctx->count); + mdsc->caps_reserve_count -= ctx->count; + mdsc->caps_avail_count += ctx->count; + ctx->count = 0; + dout("unreserve caps %d = %d used + %d resv + %d avail\n", + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + } + return 0; +} + +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + struct ceph_cap *cap = NULL; + + /* temporary, until we do something about cap import/export */ + if (!ctx) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (cap) { + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_use_count++; + mdsc->caps_total_count++; + spin_unlock(&mdsc->caps_list_lock); + } + return cap; + } + + spin_lock(&mdsc->caps_list_lock); + dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", + ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(!ctx->count); + BUG_ON(ctx->count > mdsc->caps_reserve_count); + BUG_ON(list_empty(&mdsc->caps_list)); + + ctx->count--; + mdsc->caps_reserve_count--; + mdsc->caps_use_count++; + + cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + return cap; +} + +void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) +{ + spin_lock(&mdsc->caps_list_lock); + dout("put_cap %p %d = %d used + %d resv + %d avail\n", + cap, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + mdsc->caps_use_count--; + /* + * Keep some preallocated caps around (ceph_min_count), to + * avoid lots of free/alloc churn. + */ + if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + + mdsc->caps_min_count) { + mdsc->caps_total_count--; + kmem_cache_free(ceph_cap_cachep, cap); + } else { + mdsc->caps_avail_count++; + list_add(&cap->caps_item, &mdsc->caps_list); + } + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_reservation_status(struct ceph_fs_client *fsc, + int *total, int *avail, int *used, int *reserved, + int *min) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + if (total) + *total = mdsc->caps_total_count; + if (avail) + *avail = mdsc->caps_avail_count; + if (used) + *used = mdsc->caps_use_count; + if (reserved) + *reserved = mdsc->caps_reserve_count; + if (min) + *min = mdsc->caps_min_count; +} + +/* + * Find ceph_cap for given mds, if any. + * + * Called with i_ceph_lock held. + */ +static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + struct rb_node *n = ci->i_caps.rb_node; + + while (n) { + cap = rb_entry(n, struct ceph_cap, ci_node); + if (mds < cap->mds) + n = n->rb_left; + else if (mds > cap->mds) + n = n->rb_right; + else + return cap; + } + return NULL; +} + +struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + spin_unlock(&ci->i_ceph_lock); + return cap; +} + +/* + * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. + */ +static int __ceph_get_cap_mds(struct ceph_inode_info *ci) +{ + struct ceph_cap *cap; + int mds = -1; + struct rb_node *p; + + /* prefer mds with WR|BUFFER|EXCL caps */ + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + mds = cap->mds; + if (cap->issued & (CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_BUFFER | + CEPH_CAP_FILE_EXCL)) + break; + } + return mds; +} + +int ceph_get_cap_mds(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds; + spin_lock(&ci->i_ceph_lock); + mds = __ceph_get_cap_mds(ceph_inode(inode)); + spin_unlock(&ci->i_ceph_lock); + return mds; +} + +/* + * Called under i_ceph_lock. + */ +static void __insert_cap_node(struct ceph_inode_info *ci, + struct ceph_cap *new) +{ + struct rb_node **p = &ci->i_caps.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap *cap = NULL; + + while (*p) { + parent = *p; + cap = rb_entry(parent, struct ceph_cap, ci_node); + if (new->mds < cap->mds) + p = &(*p)->rb_left; + else if (new->mds > cap->mds) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->ci_node, parent, p); + rb_insert_color(&new->ci_node, &ci->i_caps); +} + +/* + * (re)set cap hold timeouts, which control the delayed release + * of unused caps back to the MDS. Should be called on cap use. + */ +static void __cap_set_timeouts(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + struct ceph_mount_options *ma = mdsc->fsc->mount_options; + + ci->i_hold_caps_min = round_jiffies(jiffies + + ma->caps_wanted_delay_min * HZ); + ci->i_hold_caps_max = round_jiffies(jiffies + + ma->caps_wanted_delay_max * HZ); + dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, + ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); +} + +/* + * (Re)queue cap at the end of the delayed cap release list. + * + * If I_FLUSH is set, leave the inode at the front of the list. + * + * Caller holds i_ceph_lock + * -> we take mdsc->cap_delay_lock + */ +static void __cap_delay_requeue(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + __cap_set_timeouts(mdsc, ci); + dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, + ci->i_ceph_flags, ci->i_hold_caps_max); + if (!mdsc->stopping) { + spin_lock(&mdsc->cap_delay_lock); + if (!list_empty(&ci->i_cap_delay_list)) { + if (ci->i_ceph_flags & CEPH_I_FLUSH) + goto no_change; + list_del_init(&ci->i_cap_delay_list); + } + list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); +no_change: + spin_unlock(&mdsc->cap_delay_lock); + } +} + +/* + * Queue an inode for immediate writeback. Mark inode with I_FLUSH, + * indicating we should send a cap message to flush dirty metadata + * asap, and move to the front of the delayed cap list. + */ +static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + spin_lock(&mdsc->cap_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Cancel delayed work on cap. + * + * Caller must hold i_ceph_lock. + */ +static void __cap_delay_cancel(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + if (list_empty(&ci->i_cap_delay_list)) + return; + spin_lock(&mdsc->cap_delay_lock); + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Common issue checks for add_cap, handle_cap_grant. + */ +static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, + unsigned issued) +{ + unsigned had = __ceph_caps_issued(ci, NULL); + + /* + * Each time we receive FILE_CACHE anew, we increment + * i_rdcache_gen. + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { + ci->i_rdcache_gen++; + } + + /* + * if we are newly issued FILE_SHARED, mark dir not complete; we + * don't know what happened to this directory while we didn't + * have the cap. + */ + if ((issued & CEPH_CAP_FILE_SHARED) && + (had & CEPH_CAP_FILE_SHARED) == 0) { + ci->i_shared_gen++; + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + __ceph_dir_clear_complete(ci); + } + } +} + +/* + * Add a capability under the given MDS session. + * + * Caller should hold session snap_rwsem (read) and s_mutex. + * + * @fmode is the open file mode, if we are opening a file, otherwise + * it is < 0. (This is so we can atomically add the cap and add an + * open file reference to it.) + */ +void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap **new_cap) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int mds = session->s_mds; + int actual_wanted; + + dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, + session->s_mds, cap_id, ceph_cap_string(issued), seq); + + /* + * If we are opening the file, include file mode wanted bits + * in wanted. + */ + if (fmode >= 0) + wanted |= ceph_caps_for_mode(fmode); + + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + cap = *new_cap; + *new_cap = NULL; + + cap->issued = 0; + cap->implemented = 0; + cap->mds = mds; + cap->mds_wanted = 0; + cap->mseq = 0; + + cap->ci = ci; + __insert_cap_node(ci, cap); + + /* add to session cap list */ + cap->session = session; + spin_lock(&session->s_cap_lock); + list_add_tail(&cap->session_caps, &session->s_caps); + session->s_nr_caps++; + spin_unlock(&session->s_cap_lock); + } else { + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } + + if (!ci->i_snap_realm) { + /* + * add this inode to the appropriate snap realm + */ + struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, + realmino); + if (realm) { + spin_lock(&realm->inodes_with_caps_lock); + ci->i_snap_realm = realm; + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + spin_unlock(&realm->inodes_with_caps_lock); + } else { + pr_err("ceph_add_cap: couldn't find snap realm %llx\n", + realmino); + WARN_ON(!realm); + } + } + + __check_cap_issue(ci, cap, issued); + + /* + * If we are issued caps we don't want, or the mds' wanted + * value appears to be off, queue a check so we'll release + * later and/or update the mds wanted value. + */ + actual_wanted = __ceph_caps_wanted(ci); + if ((wanted & ~actual_wanted) || + (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { + dout(" issued %s, mds wanted %s, actual %s, queueing\n", + ceph_cap_string(issued), ceph_cap_string(wanted), + ceph_cap_string(actual_wanted)); + __cap_delay_requeue(mdsc, ci); + } + + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { + ci->i_auth_cap = cap; + cap->mds_wanted = wanted; + } + } else { + WARN_ON(ci->i_auth_cap == cap); + } + + dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(issued), + ceph_cap_string(issued|cap->issued), seq, mds); + cap->cap_id = cap_id; + cap->issued = issued; + cap->implemented |= issued; + if (ceph_seq_cmp(mseq, cap->mseq) > 0) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; + cap->seq = seq; + cap->issue_seq = seq; + cap->mseq = mseq; + cap->cap_gen = session->s_cap_gen; + + if (fmode >= 0) + __ceph_get_fmode(ci, fmode); +} + +/* + * Return true if cap has not timed out and belongs to the current + * generation of the MDS session (i.e. has not gone 'stale' due to + * us losing touch with the mds). + */ +static int __cap_is_valid(struct ceph_cap *cap) +{ + unsigned long ttl; + u32 gen; + + spin_lock(&cap->session->s_gen_ttl_lock); + gen = cap->session->s_cap_gen; + ttl = cap->session->s_cap_ttl; + spin_unlock(&cap->session->s_gen_ttl_lock); + + if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { + dout("__cap_is_valid %p cap %p issued %s " + "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); + return 0; + } + + return 1; +} + +/* + * Return set of valid cap bits issued to us. Note that caps time + * out, and may be invalidated in bulk if the client session times out + * and session->s_cap_gen is bumped. + */ +int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + if (implemented) + *implemented = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + dout("__ceph_caps_issued %p cap %p issued %s\n", + &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + have |= cap->issued; + if (implemented) + *implemented |= cap->implemented; + } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } + return have; +} + +/* + * Get cap bits issued by caps other than @ocap + */ +int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (cap == ocap) + continue; + if (!__cap_is_valid(cap)) + continue; + have |= cap->issued; + } + return have; +} + +/* + * Move a cap to the end of the LRU (oldest caps at list head, newest + * at list tail). + */ +static void __touch_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *s = cap->session; + + spin_lock(&s->s_cap_lock); + if (s->s_cap_iterator == NULL) { + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + s->s_mds); + list_move_tail(&cap->session_caps, &s->s_caps); + } else { + dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", + &cap->ci->vfs_inode, cap, s->s_mds); + } + spin_unlock(&s->s_cap_lock); +} + +/* + * Check if we hold the given mask. If so, move the cap(s) to the + * front of their respective LRUs. (This is the preferred way for + * callers to check for caps they want.) + */ +int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) +{ + struct ceph_cap *cap; + struct rb_node *p; + int have = ci->i_snap_caps; + + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p snap issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(have), + ceph_cap_string(mask)); + return 1; + } + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + if ((cap->issued & mask) == mask) { + dout("__ceph_caps_issued_mask %p cap %p issued %s" + " (mask %s)\n", &ci->vfs_inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) + __touch_cap(cap); + return 1; + } + + /* does a combination of caps satisfy mask? */ + have |= cap->issued; + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p combo issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) { + struct rb_node *q; + + /* touch this + preceding caps */ + __touch_cap(cap); + for (q = rb_first(&ci->i_caps); q != p; + q = rb_next(q)) { + cap = rb_entry(q, struct ceph_cap, + ci_node); + if (!__cap_is_valid(cap)) + continue; + __touch_cap(cap); + } + } + return 1; + } + } + + return 0; +} + +/* + * Return true if mask caps are currently being revoked by an MDS. + */ +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) +{ + struct ceph_cap *cap; + struct rb_node *p; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (cap != ocap && + (cap->implemented & ~cap->issued & mask)) + return 1; + } + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); + spin_unlock(&ci->i_ceph_lock); + dout("ceph_caps_revoking %p %s = %d\n", inode, + ceph_cap_string(mask), ret); + return ret; +} + +int __ceph_caps_used(struct ceph_inode_info *ci) +{ + int used = 0; + if (ci->i_pin_ref) + used |= CEPH_CAP_PIN; + if (ci->i_rd_ref) + used |= CEPH_CAP_FILE_RD; + if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + used |= CEPH_CAP_FILE_CACHE; + if (ci->i_wr_ref) + used |= CEPH_CAP_FILE_WR; + if (ci->i_wb_ref || ci->i_wrbuffer_ref) + used |= CEPH_CAP_FILE_BUFFER; + return used; +} + +/* + * wanted, by virtue of open file modes + */ +int __ceph_caps_file_wanted(struct ceph_inode_info *ci) +{ + int want = 0; + int mode; + for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) + if (ci->i_nr_by_mode[mode]) + want |= ceph_caps_for_mode(mode); + return want; +} + +/* + * Return caps we have registered with the MDS(s) as 'wanted'. + */ +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +{ + struct ceph_cap *cap; + struct rb_node *p; + int mds_wanted = 0; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); + } + return mds_wanted; +} + +/* + * called under i_ceph_lock + */ +static int __ceph_is_any_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; +} + +static void drop_inode_snap_realm(struct ceph_inode_info *ci) +{ + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, + realm); +} + +/* + * Remove a cap. Take steps to deal with a racing iterate_session_caps. + * + * caller should hold i_ceph_lock. + * caller will not hold session s_mutex if called from destroy_inode. + */ +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) +{ + struct ceph_mds_session *session = cap->session; + struct ceph_inode_info *ci = cap->ci; + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + int removed = 0; + + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + + /* remove from session list */ + spin_lock(&session->s_cap_lock); + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || + cap->cap_gen == session->s_cap_gen)) + __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, + cap->mseq, cap->issue_seq); + + if (session->s_cap_iterator == cap) { + /* not yet, we are iterating over this very cap */ + dout("__ceph_remove_cap delaying %p removal from session %p\n", + cap, cap->session); + } else { + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + removed = 1; + } + /* protect backpointer with s_cap_lock: see iterate_session_caps */ + cap->ci = NULL; + spin_unlock(&session->s_cap_lock); + + /* remove from inode list */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + if (removed) + ceph_put_cap(mdsc, cap); + + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); + + if (!__ceph_is_any_real_caps(ci)) + __cap_delay_cancel(mdsc, ci); +} + +/* + * Build and send a cap message to the given MDS. + * + * Caller should be holding s_mutex. + */ +static int send_cap_msg(struct ceph_mds_session *session, + u64 ino, u64 cid, int op, + int caps, int wanted, int dirty, + u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, + u64 size, u64 max_size, + struct timespec *mtime, struct timespec *atime, + u64 time_warp_seq, + kuid_t uid, kgid_t gid, umode_t mode, + u64 xattr_version, + struct ceph_buffer *xattrs_buf, + u64 follows, bool inline_data) +{ + struct ceph_mds_caps *fc; + struct ceph_msg *msg; + void *p; + size_t extra_len; + + dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" + " seq %u/%u mseq %u follows %lld size %llu/%llu" + " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), + cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), + ceph_cap_string(dirty), + seq, issue_seq, mseq, follows, size, max_size, + xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); + + /* flock buffer size + inline version + inline data size */ + extra_len = 4 + 8 + 4; + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, + GFP_NOFS, false); + if (!msg) + return -ENOMEM; + + msg->hdr.tid = cpu_to_le64(flush_tid); + + fc = msg->front.iov_base; + memset(fc, 0, sizeof(*fc)); + + fc->cap_id = cpu_to_le64(cid); + fc->op = cpu_to_le32(op); + fc->seq = cpu_to_le32(seq); + fc->issue_seq = cpu_to_le32(issue_seq); + fc->migrate_seq = cpu_to_le32(mseq); + fc->caps = cpu_to_le32(caps); + fc->wanted = cpu_to_le32(wanted); + fc->dirty = cpu_to_le32(dirty); + fc->ino = cpu_to_le64(ino); + fc->snap_follows = cpu_to_le64(follows); + + fc->size = cpu_to_le64(size); + fc->max_size = cpu_to_le64(max_size); + if (mtime) + ceph_encode_timespec(&fc->mtime, mtime); + if (atime) + ceph_encode_timespec(&fc->atime, atime); + fc->time_warp_seq = cpu_to_le32(time_warp_seq); + + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); + fc->mode = cpu_to_le32(mode); + + p = fc + 1; + /* flock buffer size */ + ceph_encode_32(&p, 0); + /* inline version */ + ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); + /* inline data size */ + ceph_encode_32(&p, 0); + + fc->xattr_version = cpu_to_le64(xattr_version); + if (xattrs_buf) { + msg->middle = ceph_buffer_get(xattrs_buf); + fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); + } + + ceph_con_send(&session->s_con, msg); + return 0; +} + +void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + + BUG_ON(!session->s_num_cap_releases); + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + + dout(" adding %llx release to mds%d msg %p (%d left)\n", + ino, session->s_mds, msg, session->s_num_cap_releases); + + BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); + head = msg->front.iov_base; + le32_add_cpu(&head->num, 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(ino); + item->cap_id = cpu_to_le64(cap_id); + item->migrate_seq = cpu_to_le32(migrate_seq); + item->seq = cpu_to_le32(issue_seq); + + session->s_num_cap_releases--; + + msg->front.iov_len += sizeof(*item); + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + dout(" release msg %p full\n", msg); + list_move_tail(&msg->list_head, &session->s_cap_releases_done); + } else { + dout(" release msg %p at %d/%d (%d)\n", msg, + (int)le32_to_cpu(head->num), + (int)CEPH_CAPS_PER_RELEASE, + (int)msg->front.iov_len); + } +} + +/* + * Queue cap releases when an inode is dropped from our cache. Since + * inode is about to be destroyed, there is no need for i_ceph_lock. + */ +void ceph_queue_caps_release(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + p = rb_next(p); + __ceph_remove_cap(cap, true); + } +} + +/* + * Send a cap msg on the given inode. Update our caps state, then + * drop i_ceph_lock and send the message. + * + * Make note of max_size reported/requested from mds, revoked caps + * that have now been implemented. + * + * Make half-hearted attempt ot to invalidate page cache if we are + * dropping RDCACHE. Note that this will leave behind locked pages + * that we'll then need to deal with elsewhere. + * + * Return non-zero if delayed release, or we experienced an error + * such that the caller should requeue + retry later. + * + * called with i_ceph_lock, then drops it. + * caller should hold snap_rwsem (read), s_mutex. + */ +static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + int op, int used, int want, int retain, int flushing, + unsigned *pflush_tid) + __releases(cap->ci->i_ceph_lock) +{ + struct ceph_inode_info *ci = cap->ci; + struct inode *inode = &ci->vfs_inode; + u64 cap_id = cap->cap_id; + int held, revoking, dropping, keep; + u64 seq, issue_seq, mseq, time_warp_seq, follows; + u64 size, max_size; + struct timespec mtime, atime; + int wake = 0; + umode_t mode; + kuid_t uid; + kgid_t gid; + struct ceph_mds_session *session; + u64 xattr_version = 0; + struct ceph_buffer *xattr_blob = NULL; + int delayed = 0; + u64 flush_tid = 0; + int i; + int ret; + bool inline_data; + + held = cap->issued | cap->implemented; + revoking = cap->implemented & ~cap->issued; + retain &= ~revoking; + dropping = cap->issued & ~retain; + + dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", + inode, cap, cap->session, + ceph_cap_string(held), ceph_cap_string(held & retain), + ceph_cap_string(revoking)); + BUG_ON((retain & CEPH_CAP_PIN) == 0); + + session = cap->session; + + /* don't release wanted unless we've waited a bit. */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_min)) { + dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + want |= cap->mds_wanted; + retain |= cap->issued; + delayed = 1; + } + ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + + cap->issued &= retain; /* drop bits we don't want */ + if (cap->implemented & ~cap->issued) { + /* + * Wake up any waiters on wanted -> needed transition. + * This is due to the weird transition from buffered + * to sync IO... we need to flush dirty pages _before_ + * allowing sync writes to avoid reordering. + */ + wake = 1; + } + cap->implemented &= cap->issued | used; + cap->mds_wanted = want; + + if (flushing) { + /* + * assign a tid for flush operations so we can avoid + * flush1 -> dirty1 -> flush2 -> flushack1 -> mark + * clean type races. track latest tid for every bit + * so we can handle flush AxFw, flush Fw, and have the + * first ack clean Ax. + */ + flush_tid = ++ci->i_cap_flush_last_tid; + if (pflush_tid) + *pflush_tid = flush_tid; + dout(" cap_flush_tid %d\n", (int)flush_tid); + for (i = 0; i < CEPH_CAP_BITS; i++) + if (flushing & (1 << i)) + ci->i_cap_flush_tid[i] = flush_tid; + + follows = ci->i_head_snapc->seq; + } else { + follows = 0; + } + + keep = cap->implemented; + seq = cap->seq; + issue_seq = cap->issue_seq; + mseq = cap->mseq; + size = inode->i_size; + ci->i_reported_size = size; + max_size = ci->i_wanted_max_size; + ci->i_requested_max_size = max_size; + mtime = inode->i_mtime; + atime = inode->i_atime; + time_warp_seq = ci->i_time_warp_seq; + uid = inode->i_uid; + gid = inode->i_gid; + mode = inode->i_mode; + + if (flushing & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + xattr_blob = ci->i_xattrs.blob; + xattr_version = ci->i_xattrs.version; + } + + inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + spin_unlock(&ci->i_ceph_lock); + + ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, + op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + size, max_size, &mtime, &atime, time_warp_seq, + uid, gid, mode, xattr_version, xattr_blob, + follows, inline_data); + if (ret < 0) { + dout("error sending cap msg, must requeue %p\n", inode); + delayed = 1; + } + + if (wake) + wake_up_all(&ci->i_cap_wq); + + return delayed; +} + +/* + * When a snapshot is taken, clients accumulate dirty metadata on + * inodes with capabilities in ceph_cap_snaps to describe the file + * state at the time the snapshot was taken. This must be flushed + * asynchronously back to the MDS once sync writes complete and dirty + * data is written out. + * + * Unless @again is true, skip cap_snaps that were already sent to + * the MDS (i.e., during this session). + * + * Called under i_ceph_lock. Takes s_mutex as needed. + */ +void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession, + int again) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) +{ + struct inode *inode = &ci->vfs_inode; + int mds; + struct ceph_cap_snap *capsnap; + u32 mseq; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *session = NULL; /* if session != NULL, we hold + session->s_mutex */ + u64 next_follows = 0; /* keep track of how far we've gotten through the + i_cap_snaps list, and skip these entries next time + around to avoid an infinite loop */ + + if (psession) + session = *psession; + + dout("__flush_snaps %p\n", inode); +retry: + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + /* avoid an infiniute loop after retry */ + if (capsnap->follows < next_follows) + continue; + /* + * we need to wait for sync writes to complete and for dirty + * pages to be written out. + */ + if (capsnap->dirty_pages || capsnap->writing) + break; + + /* + * if cap writeback already occurred, we should have dropped + * the capsnap in ceph_put_wrbuffer_cap_refs. + */ + BUG_ON(capsnap->dirty == 0); + + /* pick mds, take s_mutex */ + if (ci->i_auth_cap == NULL) { + dout("no auth cap (migrating?), doing nothing\n"); + goto out; + } + + /* only flush each capsnap once */ + if (!again && !list_empty(&capsnap->flushing_item)) { + dout("already flushed %p, skipping\n", capsnap); + continue; + } + + mds = ci->i_auth_cap->session->s_mds; + mseq = ci->i_auth_cap->mseq; + + if (session && session->s_mds != mds) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + session = NULL; + } + if (!session) { + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&mdsc->mutex); + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); + if (session) { + dout("inverting session/ino locks on %p\n", + session); + mutex_lock(&session->s_mutex); + } + /* + * if session == NULL, we raced against a cap + * deletion or migration. retry, and we'll + * get a better @mds value next time. + */ + spin_lock(&ci->i_ceph_lock); + goto retry; + } + + capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + atomic_inc(&capsnap->nref); + if (!list_empty(&capsnap->flushing_item)) + list_del_init(&capsnap->flushing_item); + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); + spin_unlock(&ci->i_ceph_lock); + + dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", + inode, capsnap, capsnap->follows, capsnap->flush_tid); + send_cap_msg(session, ceph_vino(inode).ino, 0, + CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, + capsnap->size, 0, + &capsnap->mtime, &capsnap->atime, + capsnap->time_warp_seq, + capsnap->uid, capsnap->gid, capsnap->mode, + capsnap->xattr_version, capsnap->xattr_blob, + capsnap->follows, capsnap->inline_data); + + next_follows = capsnap->follows + 1; + ceph_put_cap_snap(capsnap); + + spin_lock(&ci->i_ceph_lock); + goto retry; + } + + /* we flushed them all; remove this inode from the queue */ + spin_lock(&mdsc->snap_flush_lock); + list_del_init(&ci->i_snap_flush_item); + spin_unlock(&mdsc->snap_flush_lock); + +out: + if (psession) + *psession = session; + else if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } +} + +static void ceph_flush_snaps(struct ceph_inode_info *ci) +{ + spin_lock(&ci->i_ceph_lock); + __ceph_flush_snaps(ci, NULL, 0); + spin_unlock(&ci->i_ceph_lock); +} + +/* + * Mark caps dirty. If inode is newly dirty, return the dirty flags. + * Caller is then responsible for calling __mark_inode_dirty with the + * returned flags value. + */ +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +{ + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct inode *inode = &ci->vfs_inode; + int was = ci->i_dirty_caps; + int dirty = 0; + + if (!ci->i_auth_cap) { + pr_warn("__mark_dirty_caps %p %llx mask %s, " + "but no auth cap (session was closed?)\n", + inode, ceph_ino(inode), ceph_cap_string(mask)); + return 0; + } + + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + ceph_cap_string(mask), ceph_cap_string(was), + ceph_cap_string(was | mask)); + ci->i_dirty_caps |= mask; + if (was == 0) { + if (!ci->i_head_snapc) + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + dout(" inode %p now dirty snapc %p auth cap %p\n", + &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + BUG_ON(!list_empty(&ci->i_dirty_item)); + spin_lock(&mdsc->cap_dirty_lock); + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + spin_unlock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + ihold(inode); + dirty |= I_DIRTY_SYNC; + } + } + BUG_ON(list_empty(&ci->i_dirty_item)); + if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && + (mask & CEPH_CAP_FILE_BUFFER)) + dirty |= I_DIRTY_DATASYNC; + __cap_delay_requeue(mdsc, ci); + return dirty; +} + +/* + * Add dirty inode to the flushing list. Assigned a seq number so we + * can wait for caps to flush without starving. + * + * Called under i_ceph_lock. + */ +static int __mark_caps_flushing(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int flushing; + + BUG_ON(ci->i_dirty_caps == 0); + BUG_ON(list_empty(&ci->i_dirty_item)); + + flushing = ci->i_dirty_caps; + dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", + ceph_cap_string(flushing), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps | flushing)); + ci->i_flushing_caps |= flushing; + ci->i_dirty_caps = 0; + dout(" inode %p now !dirty\n", inode); + + spin_lock(&mdsc->cap_dirty_lock); + list_del_init(&ci->i_dirty_item); + + if (list_empty(&ci->i_flushing_item)) { + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; + list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); + mdsc->num_cap_flushing++; + dout(" inode %p now flushing seq %lld\n", inode, + ci->i_cap_flush_seq); + } else { + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); + dout(" inode %p now flushing (more) seq %lld\n", inode, + ci->i_cap_flush_seq); + } + spin_unlock(&mdsc->cap_dirty_lock); + + return flushing; +} + +/* + * try to invalidate mapping pages without blocking. + */ +static int try_nonblocking_invalidate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u32 invalidating_gen = ci->i_rdcache_gen; + + spin_unlock(&ci->i_ceph_lock); + invalidate_mapping_pages(&inode->i_data, 0, -1); + spin_lock(&ci->i_ceph_lock); + + if (inode->i_data.nrpages == 0 && + invalidating_gen == ci->i_rdcache_gen) { + /* success. */ + dout("try_nonblocking_invalidate %p success\n", inode); + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; + return 0; + } + dout("try_nonblocking_invalidate %p failed\n", inode); + return -1; +} + +/* + * Swiss army knife function to examine currently used and wanted + * versus held caps. Release, flush, ack revoked caps to mds as + * appropriate. + * + * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay + * cap release further. + * CHECK_CAPS_AUTHONLY - we should only check the auth cap + * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without + * further delay. + */ +void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + int file_wanted, used, cap_used; + int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ + int issued, implemented, want, retain, revoking, flushing = 0; + int mds = -1; /* keep track of how far we've gone through i_caps list + to avoid an infinite loop on retry */ + struct rb_node *p; + int tried_invalidate = 0; + int delayed = 0, sent = 0, force_requeue = 0, num; + int queue_invalidate = 0; + int is_delayed = flags & CHECK_CAPS_NODELAY; + + /* if we are unmounting, flush any unused caps immediately. */ + if (mdsc->stopping) + is_delayed = 1; + + spin_lock(&ci->i_ceph_lock); + + if (ci->i_ceph_flags & CEPH_I_FLUSH) + flags |= CHECK_CAPS_FLUSH; + + /* flush snaps first time around only */ + if (!list_empty(&ci->i_cap_snaps)) + __ceph_flush_snaps(ci, &session, 0); + goto retry_locked; +retry: + spin_lock(&ci->i_ceph_lock); +retry_locked: + file_wanted = __ceph_caps_file_wanted(ci); + used = __ceph_caps_used(ci); + want = file_wanted | used; + issued = __ceph_caps_issued(ci, &implemented); + revoking = implemented & ~issued; + + retain = want | CEPH_CAP_PIN; + if (!mdsc->stopping && inode->i_nlink > 0) { + if (want) { + retain |= CEPH_CAP_ANY; /* be greedy */ + } else if (S_ISDIR(inode->i_mode) && + (issued & CEPH_CAP_FILE_SHARED) && + __ceph_dir_is_complete(ci)) { + /* + * If a directory is complete, we want to keep + * the exclusive cap. So that MDS does not end up + * revoking the shared cap on every create/unlink + * operation. + */ + want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + retain |= want; + } else { + + retain |= CEPH_CAP_ANY_SHARED; + /* + * keep RD only if we didn't have the file open RW, + * because then the mds would revoke it anyway to + * journal max_size=0. + */ + if (ci->i_max_size == 0) + retain |= CEPH_CAP_ANY_RD; + } + } + + dout("check_caps %p file_want %s used %s dirty %s flushing %s" + " issued %s revoking %s retain %s %s%s%s\n", inode, + ceph_cap_string(file_wanted), + ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(issued), ceph_cap_string(revoking), + ceph_cap_string(retain), + (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", + (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + + /* + * If we no longer need to hold onto old our caps, and we may + * have cached pages, but don't want them, then try to invalidate. + * If we fail, it's because pages are locked.... try again later. + */ + if ((!is_delayed || mdsc->stopping) && + ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + inode->i_data.nrpages && /* have cached pages */ + (file_wanted == 0 || /* no open files */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ + !tried_invalidate) { + dout("check_caps trying to invalidate on %p\n", inode); + if (try_nonblocking_invalidate(inode) < 0) { + if (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) { + dout("check_caps queuing invalidate\n"); + queue_invalidate = 1; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } else { + dout("check_caps failed to invalidate pages\n"); + /* we failed to invalidate pages. check these + caps again later. */ + force_requeue = 1; + __cap_set_timeouts(mdsc, ci); + } + } + tried_invalidate = 1; + goto retry_locked; + } + + num = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + num++; + + /* avoid looping forever */ + if (mds >= cap->mds || + ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) + continue; + + /* NOTE: no side-effects allowed, until we take s_mutex */ + + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + + revoking = cap->implemented & ~cap->issued; + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap_used), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); + + if (cap == ci->i_auth_cap && + (cap->issued & CEPH_CAP_FILE_WR)) { + /* request larger max_size from MDS? */ + if (ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) { + dout("requesting new max_size\n"); + goto ack; + } + + /* approaching file_max? */ + if ((inode->i_size << 1) >= ci->i_max_size && + (ci->i_reported_size << 1) < ci->i_max_size) { + dout("i_size approaching max_size\n"); + goto ack; + } + } + /* flush anything dirty? */ + if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && + ci->i_dirty_caps) { + dout("flushing dirty caps\n"); + goto ack; + } + + /* completed revocation? going down and there are no caps? */ + if (revoking && (revoking & cap_used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* want more caps from mds? */ + if (want & ~(cap->mds_wanted | cap->issued)) + goto ack; + + /* things we might delay */ + if ((cap->issued & ~retain) == 0 && + cap->mds_wanted == want) + continue; /* nope, all good */ + + if (is_delayed) + goto ack; + + /* delay? */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) { + dout(" delaying issued %s -> %s, wanted %s -> %s\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + delayed++; + continue; + } + +ack: + if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + dout(" skipping %p I_NOFLUSH set\n", inode); + continue; + } + + if (session && session != cap->session) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + session = NULL; + } + if (!session) { + session = cap->session; + if (mutex_trylock(&session->s_mutex) == 0) { + dout("inverting session/ino locks on %p\n", + session); + spin_unlock(&ci->i_ceph_lock); + if (took_snap_rwsem) { + up_read(&mdsc->snap_rwsem); + took_snap_rwsem = 0; + } + mutex_lock(&session->s_mutex); + goto retry; + } + } + /* take snap_rwsem after session mutex */ + if (!took_snap_rwsem) { + if (down_read_trylock(&mdsc->snap_rwsem) == 0) { + dout("inverting snap/in locks on %p\n", + inode); + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + took_snap_rwsem = 1; + goto retry; + } + took_snap_rwsem = 1; + } + + if (cap == ci->i_auth_cap && ci->i_dirty_caps) + flushing = __mark_caps_flushing(inode, session); + else + flushing = 0; + + mds = cap->mds; /* remember mds, so we don't repeat */ + sent++; + + /* __send_cap drops i_ceph_lock */ + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); + goto retry; /* retake i_ceph_lock and restart our cap scan. */ + } + + /* + * Reschedule delayed caps release if we delayed anything, + * otherwise cancel. + */ + if (delayed && is_delayed) + force_requeue = 1; /* __send_cap delayed release; requeue */ + if (!delayed && !is_delayed) + __cap_delay_cancel(mdsc, ci); + else if (!is_delayed || force_requeue) + __cap_delay_requeue(mdsc, ci); + + spin_unlock(&ci->i_ceph_lock); + + if (queue_invalidate) + ceph_queue_invalidate(inode); + + if (session) + mutex_unlock(&session->s_mutex); + if (took_snap_rwsem) + up_read(&mdsc->snap_rwsem); +} + +/* + * Try to flush dirty caps back to the auth mds. + */ +static int try_flush_caps(struct inode *inode, unsigned *flush_tid) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int flushing = 0; + struct ceph_mds_session *session = NULL; + +retry: + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); + goto out; + } + if (ci->i_dirty_caps && ci->i_auth_cap) { + struct ceph_cap *cap = ci->i_auth_cap; + int used = __ceph_caps_used(ci); + int want = __ceph_caps_wanted(ci); + int delayed; + + if (!session || session != cap->session) { + spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); + session = cap->session; + mutex_lock(&session->s_mutex); + goto retry; + } + if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) + goto out; + + flushing = __mark_caps_flushing(inode, session); + + /* __send_cap drops i_ceph_lock */ + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, + cap->issued | cap->implemented, flushing, + flush_tid); + if (!delayed) + goto out_unlocked; + + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + } +out: + spin_unlock(&ci->i_ceph_lock); +out_unlocked: + if (session) + mutex_unlock(&session->s_mutex); + return flushing; +} + +/* + * Return true if we've flushed caps through the given flush_tid. + */ +static int caps_are_flushed(struct inode *inode, unsigned tid) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int i, ret = 1; + + spin_lock(&ci->i_ceph_lock); + for (i = 0; i < CEPH_CAP_BITS; i++) + if ((ci->i_flushing_caps & (1 << i)) && + ci->i_cap_flush_tid[i] <= tid) { + /* still flushing this bit */ + ret = 0; + break; + } + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +/* + * Wait on any unsafe replies for the given inode. First wait on the + * newest request, and make that the upper bound. Then, if there are + * more requests, keep waiting on the oldest as long as it is still older + * than the original request. + */ +static void sync_write_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_writes; + struct ceph_osd_request *req; + u64 last_tid; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + /* set upper bound as _last_ entry in chain */ + req = list_entry(head->prev, struct ceph_osd_request, + r_unsafe_item); + last_tid = req->r_tid; + + do { + ceph_osdc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + dout("sync_write_wait on tid %llu (until %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + spin_lock(&ci->i_unsafe_lock); + ceph_osdc_put_request(req); + + /* + * from here on look at first entry in chain, since we + * only want to wait for anything older than last_tid + */ + if (list_empty(head)) + break; + req = list_entry(head->next, struct ceph_osd_request, + r_unsafe_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); +} + +int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned flush_tid; + int ret; + int dirty; + + dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); + sync_write_wait(inode); + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret < 0) + return ret; + mutex_lock(&inode->i_mutex); + + dirty = try_flush_caps(inode, &flush_tid); + dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + + /* + * only wait on non-file metadata writeback (the mds + * can recover size and mtime, so we don't need to + * wait for that) + */ + if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + dout("fsync waiting for flush_tid %u\n", flush_tid); + ret = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } + + dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * Flush any dirty caps back to the mds. If we aren't asked to wait, + * queue inode for flush but don't do so immediately, because we can + * get by with fewer MDS messages if we wait for data writeback to + * complete first. + */ +int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned flush_tid; + int err = 0; + int dirty; + int wait = wbc->sync_mode == WB_SYNC_ALL; + + dout("write_inode %p wait=%d\n", inode, wait); + if (wait) { + dirty = try_flush_caps(inode, &flush_tid); + if (dirty) + err = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } else { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_dirty(ci)) + __cap_delay_requeue_front(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + return err; +} + +/* + * After a recovering MDS goes active, we need to resend any caps + * we were flushing. + * + * Caller holds session->s_mutex. + */ +static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_cap_snap *capsnap; + + dout("kick_flushing_capsnaps mds%d\n", session->s_mds); + list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, + flushing_item) { + struct ceph_inode_info *ci = capsnap->ci; + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (cap && cap->session == session) { + dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, + cap, capsnap); + __ceph_flush_snaps(ci, &session, 1); + } else { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + } + spin_unlock(&ci->i_ceph_lock); + } +} + +void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + + kick_flushing_capsnaps(mdsc, session); + + dout("kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (cap && cap->session == session) { + dout("kick_flushing_caps %p cap %p %s\n", inode, + cap, ceph_cap_string(ci->i_flushing_caps)); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + } else { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + } + } +} + +static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, + ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + + __ceph_flush_snaps(ci, &session, 1); + + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + } else { + spin_unlock(&ci->i_ceph_lock); + } +} + + +/* + * Take references to capabilities we hold, so that we don't release + * them to the MDS prematurely. + * + * Protected by i_ceph_lock. + */ +static void __take_cap_refs(struct ceph_inode_info *ci, int got) +{ + if (got & CEPH_CAP_PIN) + ci->i_pin_ref++; + if (got & CEPH_CAP_FILE_RD) + ci->i_rd_ref++; + if (got & CEPH_CAP_FILE_CACHE) + ci->i_rdcache_ref++; + if (got & CEPH_CAP_FILE_WR) + ci->i_wr_ref++; + if (got & CEPH_CAP_FILE_BUFFER) { + if (ci->i_wb_ref == 0) + ihold(&ci->vfs_inode); + ci->i_wb_ref++; + dout("__take_cap_refs %p wb %d -> %d (?)\n", + &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); + } +} + +/* + * Try to grab cap references. Specify those refs we @want, and the + * minimal set we @need. Also include the larger offset we are writing + * to (when applicable), and check against max_size here as well. + * Note that caller is responsible for ensuring max_size increases are + * requested from the MDS. + */ +static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, + loff_t endoff, int *got, int *check_max, int *err) +{ + struct inode *inode = &ci->vfs_inode; + int ret = 0; + int have, implemented; + int file_wanted; + + dout("get_cap_refs %p need %s want %s\n", inode, + ceph_cap_string(need), ceph_cap_string(want)); + + spin_lock(&ci->i_ceph_lock); + + /* make sure file is actually open */ + file_wanted = __ceph_caps_file_wanted(ci); + if ((file_wanted & need) == 0) { + dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", + ceph_cap_string(need), ceph_cap_string(file_wanted)); + *err = -EBADF; + ret = 1; + goto out_unlock; + } + + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + __ceph_do_pending_vmtruncate(inode); + spin_lock(&ci->i_ceph_lock); + } + + have = __ceph_caps_issued(ci, &implemented); + + if (have & need & CEPH_CAP_FILE_WR) { + if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { + dout("get_cap_refs %p endoff %llu > maxsize %llu\n", + inode, endoff, ci->i_max_size); + if (endoff > ci->i_requested_max_size) { + *check_max = 1; + ret = 1; + } + goto out_unlock; + } + /* + * If a sync write is in progress, we must wait, so that we + * can get a final snapshot value for size+mtime. + */ + if (__ceph_have_pending_cap_snap(ci)) { + dout("get_cap_refs %p cap_snap_pending\n", inode); + goto out_unlock; + } + } + + if ((have & need) == need) { + /* + * Look at (implemented & ~have & not) so that we keep waiting + * on transition from wanted -> needed caps. This is needed + * for WRBUFFER|WR -> WR to avoid a new WR sync write from + * going before a prior buffered writeback happens. + */ + int not = want & ~(have & need); + int revoking = implemented & ~have; + dout("get_cap_refs %p have %s but not %s (revoking %s)\n", + inode, ceph_cap_string(have), ceph_cap_string(not), + ceph_cap_string(revoking)); + if ((revoking & not) == 0) { + *got = need | (have & want); + __take_cap_refs(ci, *got); + ret = 1; + } + } else { + int session_readonly = false; + if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { + struct ceph_mds_session *s = ci->i_auth_cap->session; + spin_lock(&s->s_cap_lock); + session_readonly = s->s_readonly; + spin_unlock(&s->s_cap_lock); + } + if (session_readonly) { + dout("get_cap_refs %p needed %s but mds%d readonly\n", + inode, ceph_cap_string(need), ci->i_auth_cap->mds); + *err = -EROFS; + ret = 1; + goto out_unlock; + } + + dout("get_cap_refs %p have %s needed %s\n", inode, + ceph_cap_string(have), ceph_cap_string(need)); + } +out_unlock: + spin_unlock(&ci->i_ceph_lock); + + dout("get_cap_refs %p ret %d got %s\n", inode, + ret, ceph_cap_string(*got)); + return ret; +} + +/* + * Check the offset we are writing up to against our current + * max_size. If necessary, tell the MDS we want to write to + * a larger offset. + */ +static void check_max_size(struct inode *inode, loff_t endoff) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int check = 0; + + /* do we need to explicitly request a larger max_size? */ + spin_lock(&ci->i_ceph_lock); + if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { + dout("write %p at large endoff %llu, req max_size\n", + inode, endoff); + ci->i_wanted_max_size = endoff; + } + /* duplicate ceph_check_caps()'s logic */ + if (ci->i_auth_cap && + (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && + ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) + check = 1; + spin_unlock(&ci->i_ceph_lock); + if (check) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); +} + +/* + * Wait for caps, and take cap references. If we can't get a WR cap + * due to a small max_size, make sure we check_max_size (and possibly + * ask the mds) so we don't get hung up indefinitely. + */ +int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, + loff_t endoff, int *got, struct page **pinned_page) +{ + int _got, check_max, ret, err = 0; + +retry: + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); + _got = 0; + check_max = 0; + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, endoff, + &_got, &check_max, &err)); + if (err) + ret = err; + if (ret < 0) + return ret; + + if (check_max) + goto retry; + + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + goto out; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while holding + * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* getattr request will bring inline data into page cache */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret < 0) + return ret; + goto retry; + } +out: + *got = _got; + return 0; +} + +/* + * Take cap refs. Caller must already know we hold at least one ref + * on the caps in question or we don't know this is safe. + */ +void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) +{ + spin_lock(&ci->i_ceph_lock); + __take_cap_refs(ci, caps); + spin_unlock(&ci->i_ceph_lock); +} + +/* + * Release cap refs. + * + * If we released the last ref on any given cap, call ceph_check_caps + * to release (or schedule a release). + * + * If we are releasing a WR cap (from a sync write), finalize any affected + * cap_snap, and wake up any waiters. + */ +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0, put = 0, flushsnaps = 0, wake = 0; + struct ceph_cap_snap *capsnap; + + spin_lock(&ci->i_ceph_lock); + if (had & CEPH_CAP_PIN) + --ci->i_pin_ref; + if (had & CEPH_CAP_FILE_RD) + if (--ci->i_rd_ref == 0) + last++; + if (had & CEPH_CAP_FILE_CACHE) + if (--ci->i_rdcache_ref == 0) + last++; + if (had & CEPH_CAP_FILE_BUFFER) { + if (--ci->i_wb_ref == 0) { + last++; + put++; + } + dout("put_cap_refs %p wb %d -> %d (?)\n", + inode, ci->i_wb_ref+1, ci->i_wb_ref); + } + if (had & CEPH_CAP_FILE_WR) + if (--ci->i_wr_ref == 0) { + last++; + if (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + if (capsnap->writing) { + capsnap->writing = 0; + flushsnaps = + __ceph_finish_cap_snap(ci, + capsnap); + wake = 1; + } + } + /* see comment in __ceph_remove_cap() */ + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + drop_inode_snap_realm(ci); + } + spin_unlock(&ci->i_ceph_lock); + + dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), + last ? " last" : "", put ? " put" : ""); + + if (last && !flushsnaps) + ceph_check_caps(ci, 0, NULL); + else if (flushsnaps) + ceph_flush_snaps(ci); + if (wake) + wake_up_all(&ci->i_cap_wq); + if (put) + iput(inode); +} + +/* + * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap + * context. Adjust per-snap dirty page accounting as appropriate. + * Once all dirty data for a cap_snap is flushed, flush snapped file + * metadata back to the MDS. If we dropped the last ref, call + * ceph_check_caps. + */ +void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + struct ceph_snap_context *snapc) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0; + int complete_capsnap = 0; + int drop_capsnap = 0; + int found = 0; + struct ceph_cap_snap *capsnap = NULL; + + spin_lock(&ci->i_ceph_lock); + ci->i_wrbuffer_ref -= nr; + last = !ci->i_wrbuffer_ref; + + if (ci->i_head_snapc == snapc) { + ci->i_wrbuffer_ref_head -= nr; + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", + inode, + ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + last ? " LAST" : ""); + } else { + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + found = 1; + break; + } + } + BUG_ON(!found); + capsnap->dirty_pages -= nr; + if (capsnap->dirty_pages == 0) { + complete_capsnap = 1; + if (capsnap->dirty == 0) + /* cap writeback completed before we created + * the cap_snap; no FLUSHSNAP is needed */ + drop_capsnap = 1; + } + dout("put_wrbuffer_cap_refs on %p cap_snap %p " + " snap %lld %d/%d -> %d/%d %s%s%s\n", + inode, capsnap, capsnap->context->seq, + ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, + ci->i_wrbuffer_ref, capsnap->dirty_pages, + last ? " (wrbuffer last)" : "", + complete_capsnap ? " (complete capsnap)" : "", + drop_capsnap ? " (drop capsnap)" : ""); + if (drop_capsnap) { + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + } + } + + spin_unlock(&ci->i_ceph_lock); + + if (last) { + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + iput(inode); + } else if (complete_capsnap) { + ceph_flush_snaps(ci); + wake_up_all(&ci->i_cap_wq); + } + if (drop_capsnap) + iput(inode); +} + +/* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. + */ +static void invalidate_aliases(struct inode *inode) +{ + struct dentry *dn, *prev = NULL; + + dout("invalidate_aliases inode %p\n", inode); + d_prune_aliases(inode); + /* + * For non-directory inode, d_find_alias() only returns + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. + * + * For directory inode, d_find_alias() can return + * unhashed dentry. But directory inode should have + * one alias at most. + */ + while ((dn = d_find_alias(inode))) { + if (dn == prev) { + dput(dn); + break; + } + d_invalidate(dn); + if (prev) + dput(prev); + prev = dn; + } + if (prev) + dput(prev); +} + +/* + * Handle a cap GRANT message from the MDS. (Note that a GRANT may + * actually be a revocation if it specifies a smaller cap set.) + * + * caller holds s_mutex and i_ceph_lock, we drop both. + */ +static void handle_cap_grant(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *grant, + u64 inline_version, + void *inline_data, int inline_len, + struct ceph_buffer *xattr_buf, + struct ceph_mds_session *session, + struct ceph_cap *cap, int issued) + __releases(ci->i_ceph_lock) + __releases(mdsc->snap_rwsem) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(grant->seq); + int newcaps = le32_to_cpu(grant->caps); + int used, wanted, dirty; + u64 size = le64_to_cpu(grant->size); + u64 max_size = le64_to_cpu(grant->max_size); + struct timespec mtime, atime, ctime; + int check_caps = 0; + bool wake = false; + bool writeback = false; + bool queue_trunc = false; + bool queue_invalidate = false; + bool queue_revalidate = false; + bool deleted_inode = false; + bool fill_inline = false; + + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); + dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, + inode->i_size); + + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + + /* + * If CACHE is being revoked, and we have no dirty buffers, + * try to invalidate (once). (If there are dirty buffers, we + * will invalidate _after_ writeback.) + */ + if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + !ci->i_wrbuffer_ref) { + if (try_nonblocking_invalidate(inode)) { + /* there were locked pages.. invalidate later + in a separate thread. */ + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + queue_invalidate = true; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } + } + + ceph_fscache_invalidate(inode); + } + + /* side effects now are allowed */ + cap->cap_gen = session->s_cap_gen; + cap->seq = seq; + + __check_cap_issue(ci, cap, newcaps); + + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(grant->mode); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); + } + + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_LINK_EXCL) == 0) { + set_nlink(inode, le32_to_cpu(grant->nlink)); + if (inode->i_nlink == 0 && + (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = true; + } + + if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { + int len = le32_to_cpu(grant->xattr_len); + u64 version = le64_to_cpu(grant->xattr_version); + + if (version > ci->i_xattrs.version) { + dout(" got new xattrs v%llu on %p len %d\n", + version, inode, len); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); + ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); + } + } + + /* Do we need to revalidate our fscache cookie. Don't bother on the + * first cache cap as we already validate at cookie creation time. */ + if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) + queue_revalidate = true; + + if (newcaps & CEPH_CAP_ANY_RD) { + /* ctime/mtime/atime? */ + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + /* max size increase? */ + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", + ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = true; + } + } + + /* check cap bits */ + wanted = __ceph_caps_wanted(ci); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + dout(" my wanted = %s, used = %s, dirty %s\n", + ceph_cap_string(wanted), + ceph_cap_string(used), + ceph_cap_string(dirty)); + if (wanted != le32_to_cpu(grant->wanted)) { + dout("mds wanted %s -> %s\n", + ceph_cap_string(le32_to_cpu(grant->wanted)), + ceph_cap_string(wanted)); + /* imported cap may not have correct mds_wanted */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) + check_caps = 1; + } + + /* revocation, grant, or no-op? */ + if (cap->issued & ~newcaps) { + int revoking = cap->issued & ~newcaps; + + dout("revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps), + ceph_cap_string(revoking)); + if (revoking & used & CEPH_CAP_FILE_BUFFER) + writeback = true; /* initiate writeback; will delay ack */ + else if (revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + queue_invalidate) + ; /* do nothing yet, invalidation will be queued */ + else if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ + cap->issued = newcaps; + cap->implemented |= newcaps; + } else if (cap->issued == newcaps) { + dout("caps unchanged: %s -> %s\n", + ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + } else { + dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + + cap->issued = newcaps; + cap->implemented |= newcaps; /* add bits only, to + * avoid stepping on a + * pending revocation */ + wake = true; + } + BUG_ON(cap->issued & ~cap->implemented); + + if (inline_version > 0 && inline_version >= ci->i_inline_version) { + ci->i_inline_version = inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) + fill_inline = true; + } + + spin_unlock(&ci->i_ceph_lock); + + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + if (newcaps & ~issued) + wake = true; + } + + if (fill_inline) + ceph_fill_inline_data(inode, NULL, inline_data, inline_len); + + if (queue_trunc) { + ceph_queue_vmtruncate(inode); + ceph_queue_revalidate(inode); + } else if (queue_revalidate) + ceph_queue_revalidate(inode); + + if (writeback) + /* + * queue inode for writeback: we can't actually call + * filemap_write_and_wait, etc. from message handler + * context. + */ + ceph_queue_writeback(inode); + if (queue_invalidate) + ceph_queue_invalidate(inode); + if (deleted_inode) + invalidate_aliases(inode); + if (wake) + wake_up_all(&ci->i_cap_wq); + + if (check_caps == 1) + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, + session); + else if (check_caps == 2) + ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); + else + mutex_unlock(&session->s_mutex); +} + +/* + * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the + * MDS has been safely committed. + */ +static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session, + struct ceph_cap *cap) + __releases(ci->i_ceph_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + unsigned seq = le32_to_cpu(m->seq); + int dirty = le32_to_cpu(m->dirty); + int cleaned = 0; + int drop = 0; + int i; + + for (i = 0; i < CEPH_CAP_BITS; i++) + if ((dirty & (1 << i)) && + (u16)flush_tid == ci->i_cap_flush_tid[i]) + cleaned |= 1 << i; + + dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," + " flushing %s -> %s\n", + inode, session->s_mds, seq, ceph_cap_string(dirty), + ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps & ~cleaned)); + + if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + goto out; + + ci->i_flushing_caps &= ~cleaned; + + spin_lock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + list_del_init(&ci->i_flushing_item); + if (!list_empty(&session->s_cap_flushing)) + dout(" mds%d still flushing cap on %p\n", + session->s_mds, + &list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item)->vfs_inode); + mdsc->num_cap_flushing--; + wake_up_all(&mdsc->cap_flushing_wq); + dout(" inode %p now !flushing\n", inode); + + if (ci->i_dirty_caps == 0) { + dout(" inode %p now clean\n", inode); + BUG_ON(!list_empty(&ci->i_dirty_item)); + drop = 1; + if (ci->i_wrbuffer_ref_head == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } else { + BUG_ON(list_empty(&ci->i_dirty_item)); + } + } + spin_unlock(&mdsc->cap_dirty_lock); + wake_up_all(&ci->i_cap_wq); + +out: + spin_unlock(&ci->i_ceph_lock); + if (drop) + iput(inode); +} + +/* + * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can + * throw away our cap_snap. + * + * Caller hold s_mutex. + */ +static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 follows = le64_to_cpu(m->snap_follows); + struct ceph_cap_snap *capsnap; + int drop = 0; + + dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", + inode, ci, session->s_mds, follows); + + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->follows == follows) { + if (capsnap->flush_tid != flush_tid) { + dout(" cap_snap %p follows %lld tid %lld !=" + " %lld\n", capsnap, follows, + flush_tid, capsnap->flush_tid); + break; + } + WARN_ON(capsnap->dirty_pages || capsnap->writing); + dout(" removing %p cap_snap %p follows %lld\n", + inode, capsnap, follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + drop = 1; + break; + } else { + dout(" skipping cap_snap %p follows %lld\n", + capsnap, capsnap->follows); + } + } + spin_unlock(&ci->i_ceph_lock); + if (drop) + iput(inode); +} + +/* + * Handle TRUNC from MDS, indicating file truncation. + * + * caller hold s_mutex. + */ +static void handle_cap_trunc(struct inode *inode, + struct ceph_mds_caps *trunc, + struct ceph_mds_session *session) + __releases(ci->i_ceph_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(trunc->seq); + u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); + u64 truncate_size = le64_to_cpu(trunc->truncate_size); + u64 size = le64_to_cpu(trunc->size); + int implemented = 0; + int dirty = __ceph_caps_dirty(ci); + int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); + int queue_trunc = 0; + + issued |= implemented | dirty; + + dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", + inode, mds, seq, truncate_size, truncate_seq); + queue_trunc = ceph_fill_file_size(inode, issued, + truncate_seq, truncate_size, size); + spin_unlock(&ci->i_ceph_lock); + + if (queue_trunc) { + ceph_queue_vmtruncate(inode); + ceph_fscache_invalidate(inode); + } +} + +/* + * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a + * different one. If we are the most recent migration we've seen (as + * indicated by mseq), make note of the migrating cap bits for the + * duration (until we see the corresponding IMPORT). + * + * caller holds s_mutex + */ +static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap, *new_cap = NULL; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 t_cap_id; + unsigned mseq = le32_to_cpu(ex->migrate_seq); + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; + + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } + + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) + goto out_unlock; + + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; + } + + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ + + issued = cap->issued; + WARN_ON(issued != cap->implemented); + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id != t_cap_id || + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->mseq = t_mseq; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) + ci->i_auth_cap = tcap; + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + } + } + __ceph_remove_cap(cap, false); + goto out_unlock; + } else if (tsession) { + /* add placeholder for the export tagert */ + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + + __ceph_remove_cap(cap, false); + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); + } + ceph_add_cap_releases(mdsc, tsession); + new_cap = ceph_get_cap(mdsc, NULL); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; + } + goto retry; + +out_unlock: + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } + if (new_cap) + ceph_put_cap(mdsc, new_cap); +} + +/* + * Handle cap IMPORT. + * + * caller holds s_mutex. acquires i_ceph_lock + */ +static void handle_cap_import(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session, + struct ceph_cap **target_cap, int *old_issued) + __acquires(ci->i_ceph_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap, *ocap, *new_cap = NULL; + int mds = session->s_mds; + int issued; + unsigned caps = le32_to_cpu(im->caps); + unsigned wanted = le32_to_cpu(im->wanted); + unsigned seq = le32_to_cpu(im->seq); + unsigned mseq = le32_to_cpu(im->migrate_seq); + u64 realmino = le64_to_cpu(im->realm); + u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; + + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } + + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); + +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (!new_cap) { + spin_unlock(&ci->i_ceph_lock); + new_cap = ceph_get_cap(mdsc, NULL); + goto retry; + } + cap = new_cap; + } else { + if (new_cap) { + ceph_put_cap(mdsc, new_cap); + new_cap = NULL; + } + } + + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + + ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + + ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (ocap && ocap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + ocap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (ocap->seq != le32_to_cpu(ph->seq) || + ocap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, ocap->seq, + ocap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); + } + __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + } + + /* make sure we re-request max_size, if necessary */ + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + + *old_issued = issued; + *target_cap = cap; +} + +/* + * Handle a caps message from the MDS. + * + * Identify the appropriate session, inode, and call the right handler + * based on the cap op. + */ +void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct super_block *sb = mdsc->fsc->sb; + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; + struct ceph_snap_realm *realm; + int mds = session->s_mds; + int op, issued; + u32 seq, mseq; + struct ceph_vino vino; + u64 cap_id; + u64 size, max_size; + u64 tid; + u64 inline_version = 0; + void *inline_data = NULL; + u32 inline_len = 0; + void *snaptrace; + size_t snaptrace_len; + void *p, *end; + + dout("handle_caps from mds%d\n", mds); + + /* decode */ + end = msg->front.iov_base + msg->front.iov_len; + tid = le64_to_cpu(msg->hdr.tid); + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = msg->front.iov_base; + op = le32_to_cpu(h->op); + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + cap_id = le64_to_cpu(h->cap_id); + seq = le32_to_cpu(h->seq); + mseq = le32_to_cpu(h->migrate_seq); + size = le64_to_cpu(h->size); + max_size = le64_to_cpu(h->max_size); + + snaptrace = h + 1; + snaptrace_len = le32_to_cpu(h->snap_trace_len); + p = snaptrace + snaptrace_len; + + if (le16_to_cpu(msg->hdr.version) >= 2) { + u32 flock_len; + ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; + p += flock_len; + } + + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + p += sizeof(*peer); + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + + if (le16_to_cpu(msg->hdr.version) >= 4) { + ceph_decode_64_safe(&p, end, inline_version, bad); + ceph_decode_32_safe(&p, end, inline_len, bad); + if (p + inline_len > end) + goto bad; + inline_data = p; + p += inline_len; + } + + /* lookup ino */ + inode = ceph_find_inode(sb, vino); + ci = ceph_inode(inode); + dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, + vino.snap, inode); + + mutex_lock(&session->s_mutex); + session->s_seq++; + dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, + (unsigned)seq); + + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + + if (!inode) { + dout(" i don't have ino %llx\n", vino.ino); + + if (op == CEPH_CAP_OP_IMPORT) { + spin_lock(&session->s_cap_lock); + __queue_cap_release(session, vino.ino, cap_id, + mseq, seq); + spin_unlock(&session->s_cap_lock); + } + goto flush_cap_releases; + } + + /* these will work even if we don't have a cap yet */ + switch (op) { + case CEPH_CAP_OP_FLUSHSNAP_ACK: + handle_cap_flushsnap_ack(inode, tid, h, session); + goto done; + + case CEPH_CAP_OP_EXPORT: + handle_cap_export(inode, h, peer, session); + goto done_unlocked; + + case CEPH_CAP_OP_IMPORT: + realm = NULL; + if (snaptrace_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } + handle_cap_import(mdsc, inode, h, peer, session, + &cap, &issued); + handle_cap_grant(mdsc, inode, h, + inline_version, inline_data, inline_len, + msg->middle, session, cap, issued); + if (realm) + ceph_put_snap_realm(mdsc, realm); + goto done_unlocked; + } + + /* the rest require a cap */ + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ceph_inode(inode), mds); + if (!cap) { + dout(" no cap on %p ino %llx.%llx from mds%d\n", + inode, ceph_ino(inode), ceph_snap(inode), mds); + spin_unlock(&ci->i_ceph_lock); + goto flush_cap_releases; + } + + /* note that each of these drops i_ceph_lock for us */ + switch (op) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + handle_cap_grant(mdsc, inode, h, + inline_version, inline_data, inline_len, + msg->middle, session, cap, issued); + goto done_unlocked; + + case CEPH_CAP_OP_FLUSH_ACK: + handle_cap_flush_ack(inode, tid, h, session, cap); + break; + + case CEPH_CAP_OP_TRUNC: + handle_cap_trunc(inode, h, session); + break; + + default: + spin_unlock(&ci->i_ceph_lock); + pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, + ceph_cap_op_name(op)); + } + + goto done; + +flush_cap_releases: + /* + * send any full release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). + */ + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); + +done: + mutex_unlock(&session->s_mutex); +done_unlocked: + iput(inode); + return; + +bad: + pr_err("ceph_handle_caps: corrupt message\n"); + ceph_msg_dump(msg); + return; +} + +/* + * Delayed work handler to process end of delayed cap release LRU list. + */ +void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + int flags = CHECK_CAPS_NODELAY; + + dout("check_delayed_caps\n"); + while (1) { + spin_lock(&mdsc->cap_delay_lock); + if (list_empty(&mdsc->cap_delay_list)) + break; + ci = list_first_entry(&mdsc->cap_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) + break; + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); + dout("check_delayed_caps on %p\n", &ci->vfs_inode); + ceph_check_caps(ci, flags, NULL); + } + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Flush all dirty caps to the mds + */ +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + struct inode *inode; + + dout("flush_dirty_caps\n"); + spin_lock(&mdsc->cap_dirty_lock); + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = &ci->vfs_inode; + ihold(inode); + dout("flush_dirty_caps %p\n", inode); + spin_unlock(&mdsc->cap_dirty_lock); + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); + spin_lock(&mdsc->cap_dirty_lock); + } + spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); +} + +/* + * Drop open file reference. If we were the last open file, + * we may need to release capabilities to the MDS (or schedule + * their delayed release). + */ +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0; + + spin_lock(&ci->i_ceph_lock); + dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, + ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); + BUG_ON(ci->i_nr_by_mode[fmode] == 0); + if (--ci->i_nr_by_mode[fmode] == 0) + last++; + spin_unlock(&ci->i_ceph_lock); + + if (last && ci->i_vino.snap == CEPH_NOSNAP) + ceph_check_caps(ci, 0, NULL); +} + +/* + * Helpers for embedding cap and dentry lease releases into mds + * requests. + * + * @force is used by dentry_release (below) to force inclusion of a + * record for the directory inode, even when there aren't any caps to + * drop. + */ +int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + struct ceph_mds_request_release *rel = *p; + int used, dirty; + int ret = 0; + + spin_lock(&ci->i_ceph_lock); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + + dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", + inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), + ceph_cap_string(unless)); + + /* only drop unused, clean caps */ + drop &= ~(used | dirty); + + cap = __get_cap_for_mds(ci, mds); + if (cap && __cap_is_valid(cap)) { + if (force || + ((cap->issued & drop) && + (cap->issued & unless) == 0)) { + if ((cap->issued & drop) && + (cap->issued & unless) == 0) { + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + + cap->issued &= ~drop; + cap->implemented &= ~drop; + cap->mds_wanted = wanted; + } else { + dout("encode_inode_release %p cap %p %s" + " (force)\n", inode, cap, + ceph_cap_string(cap->issued)); + } + + rel->ino = cpu_to_le64(ceph_ino(inode)); + rel->cap_id = cpu_to_le64(cap->cap_id); + rel->seq = cpu_to_le32(cap->seq); + rel->issue_seq = cpu_to_le32(cap->issue_seq); + rel->mseq = cpu_to_le32(cap->mseq); + rel->caps = cpu_to_le32(cap->implemented); + rel->wanted = cpu_to_le32(cap->mds_wanted); + rel->dname_len = 0; + rel->dname_seq = 0; + *p += sizeof(*rel); + ret = 1; + } else { + dout("encode_inode_release %p cap %p %s\n", + inode, cap, ceph_cap_string(cap->issued)); + } + } + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +int ceph_encode_dentry_release(void **p, struct dentry *dentry, + int mds, int drop, int unless) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct ceph_mds_request_release *rel = *p; + struct ceph_dentry_info *di = ceph_dentry(dentry); + int force = 0; + int ret; + + /* + * force an record for the directory caps if we have a dentry lease. + * this is racy (can't take i_ceph_lock and d_lock together), but it + * doesn't have to be perfect; the mds will revoke anything we don't + * release. + */ + spin_lock(&dentry->d_lock); + if (di->lease_session && di->lease_session->s_mds == mds) + force = 1; + spin_unlock(&dentry->d_lock); + + ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + + spin_lock(&dentry->d_lock); + if (ret && di->lease_session && di->lease_session->s_mds == mds) { + dout("encode_dentry_release %p mds%d seq %d\n", + dentry, mds, (int)di->lease_seq); + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + spin_unlock(&dentry->d_lock); + return ret; +} diff --git a/kernel/fs/ceph/ceph_frag.c b/kernel/fs/ceph/ceph_frag.c new file mode 100644 index 000000000..bdce8b1fb --- /dev/null +++ b/kernel/fs/ceph/ceph_frag.c @@ -0,0 +1,22 @@ +/* + * Ceph 'frag' type + */ +#include +#include + +int ceph_frag_compare(__u32 a, __u32 b) +{ + unsigned va = ceph_frag_value(a); + unsigned vb = ceph_frag_value(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + va = ceph_frag_bits(a); + vb = ceph_frag_bits(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + return 0; +} diff --git a/kernel/fs/ceph/debugfs.c b/kernel/fs/ceph/debugfs.c new file mode 100644 index 000000000..31f831471 --- /dev/null +++ b/kernel/fs/ceph/debugfs.c @@ -0,0 +1,321 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "super.h" + +#ifdef CONFIG_DEBUG_FS + +#include "mds_client.h" + +static int mdsmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_fs_client *fsc = s->private; + + if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); + seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); + seq_printf(s, "session_timeout %d\n", + fsc->mdsc->mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", + fsc->mdsc->mdsmap->m_session_autoclose); + for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { + struct ceph_entity_addr *addr = + &fsc->mdsc->mdsmap->m_info[i].addr; + int state = fsc->mdsc->mdsmap->m_info[i].state; + + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, + ceph_pr_addr(&addr->in_addr), + ceph_mds_state_name(state)); + } + return 0; +} + +/* + * mdsc debugfs + */ +static int mdsc_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct rb_node *rp; + int pathlen; + u64 pathbase; + char *path; + + mutex_lock(&mdsc->mutex); + for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { + req = rb_entry(rp, struct ceph_mds_request, r_node); + + if (req->r_request && req->r_session) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, + req->r_session->s_mds); + else if (!req->r_request) + seq_printf(s, "%lld\t(no request)\t", req->r_tid); + else + seq_printf(s, "%lld\t(no session)\t", req->r_tid); + + seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); + + if (req->r_got_unsafe) + seq_puts(s, "\t(unsafe)"); + else + seq_puts(s, "\t"); + + if (req->r_inode) { + seq_printf(s, " #%llx", ceph_ino(req->r_inode)); + } else if (req->r_dentry) { + path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &pathbase, 0); + if (IS_ERR(path)) + path = NULL; + spin_lock(&req->r_dentry->d_lock); + seq_printf(s, " #%llx/%pd (%s)", + ceph_ino(d_inode(req->r_dentry->d_parent)), + req->r_dentry, + path ? path : ""); + spin_unlock(&req->r_dentry->d_lock); + kfree(path); + } else if (req->r_path1) { + seq_printf(s, " #%llx/%s", req->r_ino1.ino, + req->r_path1); + } else { + seq_printf(s, " #%llx", req->r_ino1.ino); + } + + if (req->r_old_dentry) { + path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, + &pathbase, 0); + if (IS_ERR(path)) + path = NULL; + spin_lock(&req->r_old_dentry->d_lock); + seq_printf(s, " #%llx/%pd (%s)", + req->r_old_dentry_dir ? + ceph_ino(req->r_old_dentry_dir) : 0, + req->r_old_dentry, + path ? path : ""); + spin_unlock(&req->r_old_dentry->d_lock); + kfree(path); + } else if (req->r_path2) { + if (req->r_ino2.ino) + seq_printf(s, " #%llx/%s", req->r_ino2.ino, + req->r_path2); + else + seq_printf(s, " %s", req->r_path2); + } + + seq_puts(s, "\n"); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + +static int caps_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + int total, avail, used, reserved, min; + + ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); + seq_printf(s, "total\t\t%d\n" + "avail\t\t%d\n" + "used\t\t%d\n" + "reserved\t%d\n" + "min\t%d\n", + total, avail, used, reserved, min); + return 0; +} + +static int dentry_lru_show(struct seq_file *s, void *ptr) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_dentry_info *di; + + spin_lock(&mdsc->dentry_lru_lock); + list_for_each_entry(di, &mdsc->dentry_lru, lru) { + struct dentry *dentry = di->dentry; + seq_printf(s, "%p %p\t%pd\n", + di, dentry, dentry); + } + spin_unlock(&mdsc->dentry_lru_lock); + + return 0; +} + +static int mds_sessions_show(struct seq_file *s, void *ptr) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_auth_client *ac = fsc->client->monc.auth; + struct ceph_options *opt = fsc->client->options; + int mds = -1; + + mutex_lock(&mdsc->mutex); + + /* The 'num' portion of an 'entity name' */ + seq_printf(s, "global_id %llu\n", ac->global_id); + + /* The -o name mount argument */ + seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : ""); + + /* The list of MDS session rank+state */ + for (mds = 0; mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = + __ceph_lookup_mds_session(mdsc, mds); + if (!session) { + continue; + } + mutex_unlock(&mdsc->mutex); + seq_printf(s, "mds.%d %s\n", + session->s_mds, + ceph_session_state_name(session->s_state)); + + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + +CEPH_DEFINE_SHOW_FUNC(mdsmap_show) +CEPH_DEFINE_SHOW_FUNC(mdsc_show) +CEPH_DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) +CEPH_DEFINE_SHOW_FUNC(mds_sessions_show) + + +/* + * debugfs + */ +static int congestion_kb_set(void *data, u64 val) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + + fsc->mount_options->congestion_kb = (int)val; + return 0; +} + +static int congestion_kb_get(void *data, u64 *val) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + + *val = (u64)fsc->mount_options->congestion_kb; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, + congestion_kb_set, "%llu\n"); + + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) +{ + dout("ceph_fs_debugfs_cleanup\n"); + debugfs_remove(fsc->debugfs_bdi); + debugfs_remove(fsc->debugfs_congestion_kb); + debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_mds_sessions); + debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove(fsc->debugfs_dentry_lru); +} + +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +{ + char name[100]; + int err = -ENOMEM; + + dout("ceph_fs_debugfs_init\n"); + BUG_ON(!fsc->client->debugfs_dir); + fsc->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + fsc->client->debugfs_dir, + fsc, + &congestion_kb_fops); + if (!fsc->debugfs_congestion_kb) + goto out; + + snprintf(name, sizeof(name), "../../bdi/%s", + dev_name(fsc->backing_dev_info.dev)); + fsc->debugfs_bdi = + debugfs_create_symlink("bdi", + fsc->client->debugfs_dir, + name); + if (!fsc->debugfs_bdi) + goto out; + + fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsmap_show_fops); + if (!fsc->debugfs_mdsmap) + goto out; + + fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", + 0600, + fsc->client->debugfs_dir, + fsc, + &mds_sessions_show_fops); + if (!fsc->debugfs_mds_sessions) + goto out; + + fsc->debugfs_mdsc = debugfs_create_file("mdsc", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsc_show_fops); + if (!fsc->debugfs_mdsc) + goto out; + + fsc->debugfs_caps = debugfs_create_file("caps", + 0400, + fsc->client->debugfs_dir, + fsc, + &caps_show_fops); + if (!fsc->debugfs_caps) + goto out; + + fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + fsc->client->debugfs_dir, + fsc, + &dentry_lru_show_fops); + if (!fsc->debugfs_dentry_lru) + goto out; + + return 0; + +out: + ceph_fs_debugfs_cleanup(fsc); + return err; +} + + +#else /* CONFIG_DEBUG_FS */ + +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +{ + return 0; +} + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) +{ +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/kernel/fs/ceph/dir.c b/kernel/fs/ceph/dir.c new file mode 100644 index 000000000..4248307fe --- /dev/null +++ b/kernel/fs/ceph/dir.c @@ -0,0 +1,1411 @@ +#include + +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +/* + * Directory operations: readdir, lookup, create, link, unlink, + * rename, etc. + */ + +/* + * Ceph MDS operations are specified in terms of a base ino and + * relative path. Thus, the client can specify an operation on a + * specific inode (e.g., a getattr due to fstat(2)), or as a path + * relative to, say, the root directory. + * + * Normally, we limit ourselves to strict inode ops (no path component) + * or dentry operations (a single path component relative to an ino). The + * exception to this is open_root_dentry(), which will open the mount + * point by name. + */ + +const struct dentry_operations ceph_dentry_ops; + +/* + * Initialize ceph dentry state. + */ +int ceph_init_dentry(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + if (dentry->d_fsdata) + return 0; + + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); + if (!di) + return -ENOMEM; /* oh well */ + + spin_lock(&dentry->d_lock); + if (dentry->d_fsdata) { + /* lost a race */ + kmem_cache_free(ceph_dentry_cachep, di); + goto out_unlock; + } + + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) + d_set_d_op(dentry, &ceph_dentry_ops); + else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); + else + d_set_d_op(dentry, &ceph_snap_dentry_ops); + + di->dentry = dentry; + di->lease_session = NULL; + dentry->d_time = jiffies; + /* avoid reordering d_fsdata setup so that the check above is safe */ + smp_mb(); + dentry->d_fsdata = di; + ceph_dentry_lru_add(dentry); +out_unlock: + spin_unlock(&dentry->d_lock); + return 0; +} + +struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +{ + struct inode *inode = NULL; + + if (!dentry) + return NULL; + + spin_lock(&dentry->d_lock); + if (!IS_ROOT(dentry)) { + inode = d_inode(dentry->d_parent); + ihold(inode); + } + spin_unlock(&dentry->d_lock); + return inode; +} + + +/* + * for readdir, we encode the directory frag and offset within that + * frag into f_pos. + */ +static unsigned fpos_frag(loff_t p) +{ + return p >> 32; +} +static unsigned fpos_off(loff_t p) +{ + return p & 0xffffffff; +} + +static int fpos_cmp(loff_t l, loff_t r) +{ + int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r)); + if (v) + return v; + return (int)(fpos_off(l) - fpos_off(r)); +} + +/* + * When possible, we try to satisfy a readdir by peeking at the + * dcache. We make this work by carefully ordering dentries on + * d_child when we initially get results back from the MDS, and + * falling back to a "normal" sync readdir if any dentries in the dir + * are dropped. + * + * Complete dir indicates that we have all dentries in the dir. It is + * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by + * the MDS if/when the directory is modified). + */ +static int __dcache_readdir(struct file *file, struct dir_context *ctx, + u32 shared_gen) +{ + struct ceph_file_info *fi = file->private_data; + struct dentry *parent = file->f_path.dentry; + struct inode *dir = d_inode(parent); + struct list_head *p; + struct dentry *dentry, *last; + struct ceph_dentry_info *di; + int err = 0; + + /* claim ref on last dentry we returned */ + last = fi->dentry; + fi->dentry = NULL; + + dout("__dcache_readdir %p v%u at %llu (last %p)\n", + dir, shared_gen, ctx->pos, last); + + spin_lock(&parent->d_lock); + + /* start at beginning? */ + if (ctx->pos == 2 || last == NULL || + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { + if (list_empty(&parent->d_subdirs)) + goto out_unlock; + p = parent->d_subdirs.prev; + dout(" initial p %p/%p\n", p->prev, p->next); + } else { + p = last->d_child.prev; + } + +more: + dentry = list_entry(p, struct dentry, d_child); + di = ceph_dentry(dentry); + while (1) { + dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, + d_unhashed(dentry) ? "!hashed" : "hashed", + parent->d_subdirs.prev, parent->d_subdirs.next); + if (p == &parent->d_subdirs) { + fi->flags |= CEPH_F_ATEND; + goto out_unlock; + } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (di->lease_shared_gen == shared_gen && + !d_unhashed(dentry) && d_really_is_positive(dentry) && + ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && + ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && + fpos_cmp(ctx->pos, di->offset) <= 0) + break; + dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, + dentry, di->offset, + ctx->pos, d_unhashed(dentry) ? " unhashed" : "", + !d_inode(dentry) ? " null" : ""); + spin_unlock(&dentry->d_lock); + p = p->prev; + dentry = list_entry(p, struct dentry, d_child); + di = ceph_dentry(dentry); + } + + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); + + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete_ordered(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + + dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dentry, dentry, d_inode(dentry)); + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, + ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino), + d_inode(dentry)->i_mode >> 12)) { + if (last) { + /* remember our position */ + fi->dentry = last; + fi->next_offset = fpos_off(di->offset); + } + dput(dentry); + return 0; + } + + ctx->pos = di->offset + 1; + + if (last) + dput(last); + last = dentry; + + spin_lock(&parent->d_lock); + p = p->prev; /* advance to next dentry */ + goto more; + +out_unlock: + spin_unlock(&parent->d_lock); +out: + if (last) + dput(last); + return err; +} + +/* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. + */ +static int note_last_dentry(struct ceph_file_info *fi, const char *name, + int len) +{ + kfree(fi->last_name); + fi->last_name = kmalloc(len+1, GFP_NOFS); + if (!fi->last_name) + return -ENOMEM; + memcpy(fi->last_name, name, len); + fi->last_name[len] = 0; + dout("note_last_dentry '%s'\n", fi->last_name); + return 0; +} + +static int ceph_readdir(struct file *file, struct dir_context *ctx) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + unsigned frag = fpos_frag(ctx->pos); + int off = fpos_off(ctx->pos); + int err; + u32 ftype; + struct ceph_mds_reply_info_parsed *rinfo; + + dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + if (fi->flags & CEPH_F_ATEND) + return 0; + + /* always start with . and .. */ + if (ctx->pos == 0) { + dout("readdir off 0 -> '.'\n"); + if (!dir_emit(ctx, ".", 1, + ceph_translate_ino(inode->i_sb, inode->i_ino), + inode->i_mode >> 12)) + return 0; + ctx->pos = 1; + off = 1; + } + if (ctx->pos == 1) { + ino_t ino = parent_ino(file->f_path.dentry); + dout("readdir off 1 -> '..'\n"); + if (!dir_emit(ctx, "..", 2, + ceph_translate_ino(inode->i_sb, ino), + inode->i_mode >> 12)) + return 0; + ctx->pos = 2; + off = 2; + } + + /* can we use the dcache? */ + spin_lock(&ci->i_ceph_lock); + if ((ctx->pos == 2 || fi->dentry) && + ceph_test_mount_opt(fsc, DCACHE) && + !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && + __ceph_dir_is_complete_ordered(ci) && + __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + u32 shared_gen = ci->i_shared_gen; + spin_unlock(&ci->i_ceph_lock); + err = __dcache_readdir(file, ctx, shared_gen); + if (err != -EAGAIN) + return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); + } else { + spin_unlock(&ci->i_ceph_lock); + } + if (fi->dentry) { + err = note_last_dentry(fi, fi->dentry->d_name.name, + fi->dentry->d_name.len); + if (err) + return err; + dput(fi->dentry); + fi->dentry = NULL; + } + + /* proceed with a normal readdir */ + + if (ctx->pos == 2) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + fi->dir_release_count = atomic_read(&ci->i_release_count); + fi->dir_ordered_count = ci->i_ordered_count; + } + +more: + /* do we have the correct frag content buffered? */ + if (fi->frag != frag || fi->last_readdir == NULL) { + struct ceph_mds_request *req; + int op = ceph_snap(inode) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; + + /* discard old result, if any */ + if (fi->last_readdir) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + } + + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", + ceph_vinop(inode), frag, fi->last_name); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) { + ceph_mdsc_put_request(req); + return err; + } + /* hints to request -> mds selection code */ + req->r_direct_mode = USE_AUTH_MDS; + req->r_direct_hash = ceph_frag_value(frag); + req->r_direct_is_hash = true; + if (fi->last_name) { + req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + return -ENOMEM; + } + } + req->r_readdir_offset = fi->next_offset; + req->r_args.readdir.frag = cpu_to_le32(frag); + + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(file->f_path.dentry); + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } + dout("readdir got and parsed readdir result=%d" + " on frag %x, end=%d, complete=%d\n", err, frag, + (int)req->r_reply_info.dir_end, + (int)req->r_reply_info.dir_complete); + + if (!req->r_did_prepopulate) { + dout("readdir !did_prepopulate"); + /* preclude from marking dir complete */ + fi->dir_release_count--; + } + + /* note next offset and last dentry name */ + rinfo = &req->r_reply_info; + if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + off = fi->next_offset; + } + fi->frag = frag; + fi->offset = fi->next_offset; + fi->last_readdir = req; + + if (req->r_reply_info.dir_end) { + kfree(fi->last_name); + fi->last_name = NULL; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + } else { + err = note_last_dentry(fi, + rinfo->dir_dname[rinfo->dir_nr-1], + rinfo->dir_dname_len[rinfo->dir_nr-1]); + if (err) + return err; + fi->next_offset += rinfo->dir_nr; + } + } + + rinfo = &fi->last_readdir->r_reply_info; + dout("readdir frag %x num %d off %d chunkoff %d\n", frag, + rinfo->dir_nr, off, fi->offset); + + ctx->pos = ceph_make_fpos(frag, off); + while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { + struct ceph_mds_reply_inode *in = + rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", + off, off - fi->offset, rinfo->dir_nr, ctx->pos, + rinfo->dir_dname_len[off - fi->offset], + rinfo->dir_dname[off - fi->offset], in); + BUG_ON(!in); + ftype = le32_to_cpu(in->mode) >> 12; + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); + if (!dir_emit(ctx, + rinfo->dir_dname[off - fi->offset], + rinfo->dir_dname_len[off - fi->offset], + ceph_translate_ino(inode->i_sb, ino), ftype)) { + dout("filldir stopping us...\n"); + return 0; + } + off++; + ctx->pos++; + } + + if (fi->last_name) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + goto more; + } + + /* more frags? */ + if (!ceph_frag_is_rightmost(frag)) { + frag = ceph_frag_next(frag); + off = 0; + ctx->pos = ceph_make_fpos(frag, off); + dout("readdir next frag is %x\n", frag); + goto more; + } + fi->flags |= CEPH_F_ATEND; + + /* + * if dir_release_count still matches the dir, no dentries + * were released during the whole readdir, and we should have + * the complete dir contents in our cache. + */ + spin_lock(&ci->i_ceph_lock); + if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { + if (ci->i_ordered_count == fi->dir_ordered_count) + dout(" marking %p complete and ordered\n", inode); + else + dout(" marking %p complete\n", inode); + __ceph_dir_set_complete(ci, fi->dir_release_count, + fi->dir_ordered_count); + } + spin_unlock(&ci->i_ceph_lock); + + dout("readdir %p file %p done.\n", inode, file); + return 0; +} + +static void reset_readdir(struct ceph_file_info *fi, unsigned frag) +{ + if (fi->last_readdir) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + } + kfree(fi->last_name); + fi->last_name = NULL; + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; /* compensate for . and .. */ + else + fi->next_offset = 0; + if (fi->dentry) { + dput(fi->dentry); + fi->dentry = NULL; + } + fi->flags &= ~CEPH_F_ATEND; +} + +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_mapping->host; + loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); + loff_t retval; + + mutex_lock(&inode->i_mutex); + retval = -EINVAL; + switch (whence) { + case SEEK_END: + offset += inode->i_size + 2; /* FIXME */ + break; + case SEEK_CUR: + offset += file->f_pos; + case SEEK_SET: + break; + default: + goto out; + } + + if (offset >= 0) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + fi->flags &= ~CEPH_F_ATEND; + } + retval = offset; + + /* + * discard buffered readdir content on seekdir(0), or + * seek to new frag, or seek prior to current chunk. + */ + if (offset == 0 || + fpos_frag(offset) != fi->frag || + fpos_off(offset) < fi->offset) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi, fpos_frag(offset)); + } + + /* bump dir_release_count if we did a forward seek */ + if (fpos_cmp(offset, old_offset) > 0) + fi->dir_release_count--; + } +out: + mutex_unlock(&inode->i_mutex); + return retval; +} + +/* + * Handle lookups for the hidden .snap directory. + */ +int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ + + /* .snap dir? */ + if (err == -ENOENT && + ceph_snap(parent) == CEPH_NOSNAP && + strcmp(dentry->d_name.name, + fsc->mount_options->snapdir_name) == 0) { + struct inode *inode = ceph_get_snapdir(parent); + dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", + dentry, dentry, inode); + BUG_ON(!d_unhashed(dentry)); + d_add(dentry, inode); + err = 0; + } + return err; +} + +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ + if (err == -ENOENT) { + /* no trace? */ + err = 0; + if (!req->r_reply_info.head->is_dentry) { + dout("ENOENT and no trace, dentry %p inode %p\n", + dentry, d_inode(dentry)); + if (d_really_is_positive(dentry)) { + d_drop(dentry); + err = -ENOENT; + } else { + d_add(dentry, NULL); + } + } + } + if (err) + dentry = ERR_PTR(err); + else if (dentry != req->r_dentry) + dentry = dget(req->r_dentry); /* we got spliced */ + else + dentry = NULL; + return dentry; +} + +static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +{ + return ceph_ino(inode) == CEPH_INO_ROOT && + strncmp(dentry->d_name.name, ".ceph", 5) == 0; +} + +/* + * Look up a single dir entry. If there is a lookup intent, inform + * the MDS so that it gets our 'caps wanted' value in a single op. + */ +static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int op; + int err; + + dout("lookup %p dentry %p '%pd'\n", + dir, dentry, dentry); + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + err = ceph_init_dentry(dentry); + if (err < 0) + return ERR_PTR(err); + + /* can we conclude ENOENT locally? */ + if (d_really_is_negative(dentry)) { + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + + spin_lock(&ci->i_ceph_lock); + dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); + if (strncmp(dentry->d_name.name, + fsc->mount_options->snapdir_name, + dentry->d_name.len) && + !is_root_ceph_dentry(dir, dentry) && + ceph_test_mount_opt(fsc, DCACHE) && + __ceph_dir_is_complete(ci) && + (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + spin_unlock(&ci->i_ceph_lock); + dout(" dir %p complete, -ENOENT\n", dir); + d_add(dentry, NULL); + di->lease_shared_gen = ci->i_shared_gen; + return NULL; + } + spin_unlock(&ci->i_ceph_lock); + } + + op = ceph_snap(dir) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + /* we only need inode linkage */ + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_locked_dir = dir; + err = ceph_mdsc_do_request(mdsc, NULL, req); + err = ceph_handle_snapdir(req, dentry, err); + dentry = ceph_finish_lookup(req, dentry, err); + ceph_mdsc_put_request(req); /* will dput(dentry) */ + dout("lookup result=%p\n", dentry); + return dentry; +} + +/* + * If we do a create but get no trace back from the MDS, follow up with + * a lookup (the VFS expects us to link up the provided dentry). + */ +int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) +{ + struct dentry *result = ceph_lookup(dir, dentry, 0); + + if (result && !IS_ERR(result)) { + /* + * We created the item, then did a lookup, and found + * it was already linked to another inode we already + * had in our cache (and thus got spliced). To not + * confuse VFS (especially when inode is a directory), + * we don't link our dentry to that inode, return an + * error instead. + * + * This event should be rare and it happens only when + * we talk to old MDS. Recent MDS does not send traceless + * reply for request that creates new inode. + */ + d_drop(result); + return -ESTALE; + } + return PTR_ERR(result); +} + +static int ceph_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct ceph_acls_info acls = {}; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + err = ceph_pre_init_acls(dir, &mode, &acls); + if (err < 0) + return err; + + dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", + dir, dentry, mode, rdev); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_args.mknod.mode = cpu_to_le32(mode); + req->r_args.mknod.rdev = cpu_to_le32(rdev); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (acls.pagelist) { + req->r_pagelist = acls.pagelist; + acls.pagelist = NULL; + } + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (!err) + ceph_init_inode_acls(d_inode(dentry), &acls); + else + d_drop(dentry); + ceph_release_acls_info(&acls); + return err; +} + +static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ceph_mknod(dir, dentry, mode, 0); +} + +static int ceph_symlink(struct inode *dir, struct dentry *dentry, + const char *dest) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_path2 = kstrdup(dest, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + ceph_mdsc_put_request(req); + goto out; + } + req->r_locked_dir = dir; + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (err) + d_drop(dentry); + return err; +} + +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct ceph_acls_info acls = {}; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* mkdir .snap/foo is a MKSNAP */ + op = CEPH_MDS_OP_MKSNAP; + dout("mksnap dir %p snap '%pd' dn %p\n", dir, + dentry, dentry); + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); + op = CEPH_MDS_OP_MKDIR; + } else { + goto out; + } + + mode |= S_IFDIR; + err = ceph_pre_init_acls(dir, &mode, &acls); + if (err < 0) + goto out; + + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_args.mkdir.mode = cpu_to_le32(mode); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (acls.pagelist) { + req->r_pagelist = acls.pagelist; + acls.pagelist = NULL; + } + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && + !req->r_reply_info.head->is_target && + !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (!err) + ceph_init_inode_acls(d_inode(dentry), &acls); + else + d_drop(dentry); + ceph_release_acls_info(&acls); + return err; +} + +static int ceph_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("link in dir %p old_dentry %p dentry %p\n", dir, + old_dentry, dentry); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_SHARED on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (err) { + d_drop(dentry); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); + } + ceph_mdsc_put_request(req); + return err; +} + +/* + * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +static int drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&ci->i_ceph_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + ci->i_ceph_flags |= CEPH_I_NODELAY; + } + spin_unlock(&ci->i_ceph_lock); + return drop; +} + +/* + * rmdir and unlink are differ only by the metadata op code + */ +static int ceph_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = d_inode(dentry); + struct ceph_mds_request *req; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* rmdir .snap/foo is RMSNAP */ + dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry); + op = CEPH_MDS_OP_RMSNAP; + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("unlink/rmdir dir %p dn %p inode %p\n", + dir, dentry, inode); + op = d_is_dir(dentry) ? + CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; + } else + goto out; + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_inode_drop = drop_caps_for_unlink(inode); + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + d_delete(dentry); + ceph_mdsc_put_request(req); +out: + return err; +} + +static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int op = CEPH_MDS_OP_RENAME; + int err; + + if (ceph_snap(old_dir) != ceph_snap(new_dir)) + return -EXDEV; + if (ceph_snap(old_dir) != CEPH_NOSNAP) { + if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -EROFS; + } + dout("rename dir %p dentry %p to dir %p dentry %p\n", + old_dir, old_dentry, new_dir, new_dentry); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + ihold(old_dir); + req->r_dentry = dget(new_dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); + req->r_old_dentry_dir = old_dir; + req->r_locked_dir = new_dir; + req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_RDCACHE on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; + if (d_really_is_positive(new_dentry)) + req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry)); + err = ceph_mdsc_do_request(mdsc, old_dir, req); + if (!err && !req->r_reply_info.head->is_dentry) { + /* + * Normally d_move() is done by fill_trace (called by + * do_request, above). If there is no trace, we need + * to do it here. + */ + + d_move(old_dentry, new_dentry); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + + } + ceph_mdsc_put_request(req); + return err; +} + +/* + * Ensure a dentry lease will no longer revalidate. + */ +void ceph_invalidate_dentry_lease(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + ceph_dentry(dentry)->lease_shared_gen = 0; + spin_unlock(&dentry->d_lock); +} + +/* + * Check if dentry lease is valid. If not, delete the lease. Try to + * renew if the least is more than half up. + */ +static int dentry_lease_is_valid(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *s; + int valid = 0; + u32 gen; + unsigned long ttl; + struct ceph_mds_session *session = NULL; + struct inode *dir = NULL; + u32 seq = 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (di->lease_session) { + s = di->lease_session; + spin_lock(&s->s_gen_ttl_lock); + gen = s->s_cap_gen; + ttl = s->s_cap_ttl; + spin_unlock(&s->s_gen_ttl_lock); + + if (di->lease_gen == gen && + time_before(jiffies, dentry->d_time) && + time_before(jiffies, ttl)) { + valid = 1; + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* we should renew */ + dir = d_inode(dentry->d_parent); + session = ceph_get_mds_session(s); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; + } + } + } + spin_unlock(&dentry->d_lock); + + if (session) { + ceph_mdsc_lease_send_msg(session, dir, dentry, + CEPH_MDS_LEASE_RENEW, seq); + ceph_put_mds_session(session); + } + dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); + return valid; +} + +/* + * Check if directory-wide content lease/cap is valid. + */ +static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + int valid = 0; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_shared_gen == di->lease_shared_gen) + valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + spin_unlock(&ci->i_ceph_lock); + dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", + dir, (unsigned)ci->i_shared_gen, dentry, + (unsigned)di->lease_shared_gen, valid); + return valid; +} + +/* + * Check if cached dentry can be trusted. + */ +static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + int valid = 0; + struct inode *dir; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, + dentry, d_inode(dentry), ceph_dentry(dentry)->offset); + + dir = ceph_get_dentry_parent_inode(dentry); + + /* always trust cached snapped dentries, snapdir dentry */ + if (ceph_snap(dir) != CEPH_NOSNAP) { + dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, + dentry, d_inode(dentry)); + valid = 1; + } else if (d_really_is_positive(dentry) && + ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { + valid = 1; + } else if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) { + if (d_really_is_positive(dentry)) + valid = ceph_is_any_caps(d_inode(dentry)); + else + valid = 1; + } + + dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); + if (valid) { + ceph_dentry_lru_touch(dentry); + } else { + ceph_dir_clear_complete(dir); + } + iput(dir); + return valid; +} + +/* + * Release our ceph_dentry_info. + */ +static void ceph_d_release(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + dout("d_release %p\n", dentry); + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; +} + +static int ceph_snapdir_d_revalidate(struct dentry *dentry, + unsigned int flags) +{ + /* + * Eventually, we'll want to revalidate snapped metadata + * too... probably... + */ + return 1; +} + +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. + * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ + dout("ceph_d_prune %p\n", dentry); + + /* do we have a valid parent? */ + if (IS_ROOT(dentry)) + return; + + /* if we are not hashed, we don't affect dir's completeness */ + if (d_unhashed(dentry)) + return; + + /* + * we hold d_lock, so d_parent is stable, and d_fsdata is never + * cleared until d_release + */ + ceph_dir_clear_complete(d_inode(dentry->d_parent)); +} + +/* + * read() on a dir. This weird interface hack only works if mounted + * with '-o dirstat'. + */ +static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct ceph_file_info *cf = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int left; + const int bufsize = 1024; + + if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + return -EISDIR; + + if (!cf->dir_info) { + cf->dir_info = kmalloc(bufsize, GFP_NOFS); + if (!cf->dir_info) + return -ENOMEM; + cf->dir_info_len = + snprintf(cf->dir_info, bufsize, + "entries: %20lld\n" + " files: %20lld\n" + " subdirs: %20lld\n" + "rentries: %20lld\n" + " rfiles: %20lld\n" + " rsubdirs: %20lld\n" + "rbytes: %20lld\n" + "rctime: %10ld.%09ld\n", + ci->i_files + ci->i_subdirs, + ci->i_files, + ci->i_subdirs, + ci->i_rfiles + ci->i_rsubdirs, + ci->i_rfiles, + ci->i_rsubdirs, + ci->i_rbytes, + (long)ci->i_rctime.tv_sec, + (long)ci->i_rctime.tv_nsec); + } + + if (*ppos >= cf->dir_info_len) + return 0; + size = min_t(unsigned, size, cf->dir_info_len-*ppos); + left = copy_to_user(buf, cf->dir_info + *ppos, size); + if (left == size) + return -EFAULT; + *ppos += (size - left); + return size - left; +} + +/* + * an fsync() on a dir will wait for any uncommitted directory + * operations to commit. + */ +static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_dirops; + struct ceph_mds_request *req; + u64 last_tid; + int ret = 0; + + dout("dir_fsync %p\n", inode); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + req = list_entry(head->prev, + struct ceph_mds_request, r_unsafe_dir_item); + last_tid = req->r_tid; + + do { + ceph_mdsc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + + dout("dir_fsync %p wait on tid %llu (until %llu)\n", + inode, req->r_tid, last_tid); + if (req->r_timeout) { + unsigned long time_left = wait_for_completion_timeout( + &req->r_safe_completion, + req->r_timeout); + if (time_left > 0) + ret = 0; + else + ret = -EIO; /* timed out */ + } else { + wait_for_completion(&req->r_safe_completion); + } + ceph_mdsc_put_request(req); + + spin_lock(&ci->i_unsafe_lock); + if (ret || list_empty(head)) + break; + req = list_entry(head->next, + struct ceph_mds_request, r_unsafe_dir_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +/* + * We maintain a private dentry LRU. + * + * FIXME: this needs to be changed to a per-mds lru to be useful. + */ +void ceph_dentry_lru_add(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn); + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); +} + +void ceph_dentry_lru_touch(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn, + di->offset); + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, &mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); +} + +void ceph_dentry_lru_del(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn); + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); +} + +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. + */ +unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) +{ + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + } +} + +const struct file_operations ceph_dir_fops = { + .read = ceph_read_dir, + .iterate = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, + .unlocked_ioctl = ceph_ioctl, + .fsync = ceph_dir_fsync, +}; + +const struct file_operations ceph_snapdir_fops = { + .iterate = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, +}; + +const struct inode_operations ceph_dir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .setattr = ceph_setattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, + .mknod = ceph_mknod, + .symlink = ceph_symlink, + .mkdir = ceph_mkdir, + .link = ceph_link, + .unlink = ceph_unlink, + .rmdir = ceph_unlink, + .rename = ceph_rename, + .create = ceph_create, + .atomic_open = ceph_atomic_open, +}; + +const struct inode_operations ceph_snapdir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .mkdir = ceph_mkdir, + .rmdir = ceph_unlink, + .rename = ceph_rename, +}; + +const struct dentry_operations ceph_dentry_ops = { + .d_revalidate = ceph_d_revalidate, + .d_release = ceph_d_release, + .d_prune = ceph_d_prune, +}; + +const struct dentry_operations ceph_snapdir_dentry_ops = { + .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_d_release, +}; + +const struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_d_release, + .d_prune = ceph_d_prune, +}; diff --git a/kernel/fs/ceph/export.c b/kernel/fs/ceph/export.c new file mode 100644 index 000000000..fe02ae7f0 --- /dev/null +++ b/kernel/fs/ceph/export.c @@ -0,0 +1,250 @@ +#include + +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +/* + * Basic fh + */ +struct ceph_nfs_fh { + u64 ino; +} __attribute__ ((packed)); + +/* + * Larger fh that includes parent ino. + */ +struct ceph_nfs_confh { + u64 ino, parent_ino; +} __attribute__ ((packed)); + +static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) +{ + int type; + struct ceph_nfs_fh *fh = (void *)rawfh; + struct ceph_nfs_confh *cfh = (void *)rawfh; + int connected_handle_length = sizeof(*cfh)/4; + int handle_length = sizeof(*fh)/4; + + /* don't re-export snaps */ + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EINVAL; + + if (parent_inode && (*max_len < connected_handle_length)) { + *max_len = connected_handle_length; + return FILEID_INVALID; + } else if (*max_len < handle_length) { + *max_len = handle_length; + return FILEID_INVALID; + } + + if (parent_inode) { + dout("encode_fh %llx with parent %llx\n", + ceph_ino(inode), ceph_ino(parent_inode)); + cfh->ino = ceph_ino(inode); + cfh->parent_ino = ceph_ino(parent_inode); + *max_len = connected_handle_length; + type = FILEID_INO32_GEN_PARENT; + } else { + dout("encode_fh %llx\n", ceph_ino(inode)); + fh->ino = ceph_ino(inode); + *max_len = handle_length; + type = FILEID_INO32_GEN; + } + return type; +} + +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct inode *inode; + struct dentry *dentry; + struct ceph_vino vino; + int err; + + vino.ino = ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ESTALE); + } + + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + iput(inode); + return dentry; + } + err = ceph_init_dentry(dentry); + if (err < 0) { + dput(dentry); + return ERR_PTR(err); + } + dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); + return dentry; +} + +/* + * convert regular fh to dentry + */ +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_fh *fh = (void *)fid->raw; + + if (fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*fh) / 4) + return NULL; + + dout("fh_to_dentry %llx\n", fh->ino); + return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, + struct dentry *child, u64 ino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; + struct inode *inode; + struct dentry *dentry; + int err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + if (child) { + req->r_inode = d_inode(child); + ihold(d_inode(child)); + } else { + req->r_ino1 = (struct ceph_vino) { + .ino = ino, + .snap = CEPH_NOSNAP, + }; + } + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ENOENT); + + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + iput(inode); + return dentry; + } + err = ceph_init_dentry(dentry); + if (err < 0) { + dput(dentry); + return ERR_PTR(err); + } + dout("__get_parent ino %llx parent %p ino %llx.%llx\n", + child ? ceph_ino(d_inode(child)) : ino, + dentry, ceph_vinop(inode)); + return dentry; +} + +static struct dentry *ceph_get_parent(struct dentry *child) +{ + /* don't re-export snaps */ + if (ceph_snap(d_inode(child)) != CEPH_NOSNAP) + return ERR_PTR(-EINVAL); + + dout("get_parent %p ino %llx.%llx\n", + child, ceph_vinop(d_inode(child))); + return __get_parent(child->d_sb, child, 0); +} + +/* + * convert regular fh to parent + */ +static struct dentry *ceph_fh_to_parent(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_confh *cfh = (void *)fid->raw; + struct dentry *dentry; + + if (fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*cfh) / 4) + return NULL; + + dout("fh_to_parent %llx\n", cfh->parent_ino); + dentry = __get_parent(sb, NULL, cfh->ino); + if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) + dentry = __fh_to_dentry(sb, cfh->parent_ino); + return dentry; +} + +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + int err; + + mdsc = ceph_inode_to_client(d_inode(child))->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + mutex_lock(&d_inode(parent)->i_mutex); + + req->r_inode = d_inode(child); + ihold(d_inode(child)); + req->r_ino2 = ceph_vino(d_inode(parent)); + req->r_locked_dir = d_inode(parent); + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + + mutex_unlock(&d_inode(parent)->i_mutex); + + if (!err) { + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + dout("get_name %p ino %llx.%llx name %s\n", + child, ceph_vinop(d_inode(child)), name); + } else { + dout("get_name %p ino %llx.%llx err %d\n", + child, ceph_vinop(d_inode(child)), err); + } + + ceph_mdsc_put_request(req); + return err; +} + +const struct export_operations ceph_export_ops = { + .encode_fh = ceph_encode_fh, + .fh_to_dentry = ceph_fh_to_dentry, + .fh_to_parent = ceph_fh_to_parent, + .get_parent = ceph_get_parent, + .get_name = ceph_get_name, +}; diff --git a/kernel/fs/ceph/file.c b/kernel/fs/ceph/file.c new file mode 100644 index 000000000..3b6b522b4 --- /dev/null +++ b/kernel/fs/ceph/file.c @@ -0,0 +1,1344 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include "cache.h" + +/* + * Ceph file operations + * + * Implement basic open/close functionality, and implement + * read/write. + * + * We implement three modes of file I/O: + * - buffered uses the generic_file_aio_{read,write} helpers + * + * - synchronous is used when there is multi-client read/write + * sharing, avoids the page cache, and synchronously waits for an + * ack from the OSD. + * + * - direct io takes the variant of the sync path that references + * user pages directly. + * + * fsync() flushes and waits on dirty pages, but just queues metadata + * for writeback: since the MDS can recover size and mtime there is no + * need to wait for MDS acknowledgement. + */ + + +/* + * Prepare an open request. Preallocate ceph_cap to avoid an + * inopportune ENOMEM later. + */ +static struct ceph_mds_request * +prepare_open_request(struct super_block *sb, int flags, int create_mode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int want_auth = USE_ANY_MDS; + int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; + + if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) + want_auth = USE_AUTH_MDS; + + req = ceph_mdsc_create_request(mdsc, op, want_auth); + if (IS_ERR(req)) + goto out; + req->r_fmode = ceph_flags_to_mode(flags); + req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.mode = cpu_to_le32(create_mode); +out: + return req; +} + +/* + * initialize private struct file data. + * if we fail, clean up by dropping fmode reference on the ceph_inode + */ +static int ceph_init_file(struct inode *inode, struct file *file, int fmode) +{ + struct ceph_file_info *cf; + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + /* First file open request creates the cookie, we want to keep + * this cookie around for the filetime of the inode as not to + * have to worry about fscache register / revoke / operation + * races. + * + * Also, if we know the operation is going to invalidate data + * (non readonly) just nuke the cache right away. + */ + ceph_fscache_register_inode_cookie(mdsc->fsc, ci); + if ((fmode & CEPH_FILE_MODE_WR)) + ceph_fscache_invalidate(inode); + case S_IFDIR: + dout("init_file %p %p 0%o (regular)\n", inode, file, + inode->i_mode); + cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); + if (cf == NULL) { + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + return -ENOMEM; + } + cf->fmode = fmode; + cf->next_offset = 2; + file->private_data = cf; + BUG_ON(inode->i_fop->release != ceph_release); + break; + + case S_IFLNK: + dout("init_file %p %p 0%o (symlink)\n", inode, file, + inode->i_mode); + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + break; + + default: + dout("init_file %p %p 0%o (special)\n", inode, file, + inode->i_mode); + /* + * we need to drop the open ref now, since we don't + * have .release set to ceph_release. + */ + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + BUG_ON(inode->i_fop->release == ceph_release); + + /* call the proper open fop */ + ret = inode->i_fop->open(inode, file); + } + return ret; +} + +/* + * If we already have the requisite capabilities, we can satisfy + * the open request locally (no need to request new caps from the + * MDS). We do, however, need to inform the MDS (asynchronously) + * if our wanted caps set expands. + */ +int ceph_open(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct ceph_file_info *cf = file->private_data; + struct inode *parent_inode = NULL; + int err; + int flags, fmode, wanted; + + if (cf) { + dout("open file %p is already opened\n", file); + return 0; + } + + /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ + flags = file->f_flags & ~(O_CREAT|O_EXCL); + if (S_ISDIR(inode->i_mode)) + flags = O_DIRECTORY; /* mds likes to know */ + + dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, + ceph_vinop(inode), file, flags, file->f_flags); + fmode = ceph_flags_to_mode(flags); + wanted = ceph_caps_for_mode(fmode); + + /* snapped files are read-only */ + if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) + return -EROFS; + + /* trivially open snapdir */ + if (ceph_snap(inode) == CEPH_SNAPDIR) { + spin_lock(&ci->i_ceph_lock); + __ceph_get_fmode(ci, fmode); + spin_unlock(&ci->i_ceph_lock); + return ceph_init_file(inode, file, fmode); + } + + /* + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set + * asynchronously. + */ + spin_lock(&ci->i_ceph_lock); + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { + int mds_wanted = __ceph_caps_mds_wanted(ci); + int issued = __ceph_caps_issued(ci, NULL); + + dout("open %p fmode %d want %s issued %s using existing\n", + inode, fmode, ceph_cap_string(wanted), + ceph_cap_string(issued)); + __ceph_get_fmode(ci, fmode); + spin_unlock(&ci->i_ceph_lock); + + /* adjust wanted? */ + if ((issued & wanted) != wanted && + (mds_wanted & wanted) != wanted && + ceph_snap(inode) != CEPH_SNAPDIR) + ceph_check_caps(ci, 0, NULL); + + return ceph_init_file(inode, file, fmode); + } else if (ceph_snap(inode) != CEPH_NOSNAP && + (ci->i_snap_caps & wanted) == wanted) { + __ceph_get_fmode(ci, fmode); + spin_unlock(&ci->i_ceph_lock); + return ceph_init_file(inode, file, fmode); + } + + spin_unlock(&ci->i_ceph_lock); + + dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_inode = inode; + ihold(inode); + + req->r_num_caps = 1; + if (flags & O_CREAT) + parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry); + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); + if (!err) + err = ceph_init_file(inode, file, req->r_fmode); + ceph_mdsc_put_request(req); + dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); +out: + return err; +} + + +/* + * Do a lookup + open with a single request. If we get a non-existent + * file or symlink, return 1 so the VFS can retry. + */ +int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct dentry *dn; + struct ceph_acls_info acls = {}; + int err; + + dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", + dir, dentry, dentry, + d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); + + if (dentry->d_name.len > NAME_MAX) + return -ENAMETOOLONG; + + err = ceph_init_dentry(dentry); + if (err < 0) + return err; + + if (flags & O_CREAT) { + err = ceph_pre_init_acls(dir, &mode, &acls); + if (err < 0) + return err; + } + + /* do the open */ + req = prepare_open_request(dir->i_sb, flags, mode); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out_acl; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + if (flags & O_CREAT) { + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (acls.pagelist) { + req->r_pagelist = acls.pagelist; + acls.pagelist = NULL; + } + } + req->r_locked_dir = dir; /* caller holds dir->i_mutex */ + err = ceph_mdsc_do_request(mdsc, + (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, + req); + err = ceph_handle_snapdir(req, dentry, err); + if (err) + goto out_req; + + if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + + if (d_unhashed(dentry)) { + dn = ceph_finish_lookup(req, dentry, err); + if (IS_ERR(dn)) + err = PTR_ERR(dn); + } else { + /* we were given a hashed negative dentry */ + dn = NULL; + } + if (err) + goto out_req; + if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { + /* make vfs retry on splice, ENOENT, or symlink */ + dout("atomic_open finish_no_open on dn %p\n", dn); + err = finish_no_open(file, dn); + } else { + dout("atomic_open finish_open on dn %p\n", dn); + if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + ceph_init_inode_acls(d_inode(dentry), &acls); + *opened |= FILE_CREATED; + } + err = finish_open(file, dentry, ceph_open, opened); + } +out_req: + if (!req->r_err && req->r_target_inode) + ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); + ceph_mdsc_put_request(req); +out_acl: + ceph_release_acls_info(&acls); + dout("atomic_open result=%d\n", err); + return err; +} + +int ceph_release(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *cf = file->private_data; + + dout("release inode %p file %p\n", inode, file); + ceph_put_fmode(ci, cf->fmode); + if (cf->last_readdir) + ceph_mdsc_put_request(cf->last_readdir); + kfree(cf->last_name); + kfree(cf->dir_info); + dput(cf->dentry); + kmem_cache_free(ceph_file_cachep, cf); + + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return 0; +} + +enum { + CHECK_EOF = 1, + READ_INLINE = 2, +}; + +/* + * Read a range of bytes striped over one or more objects. Iterate over + * objects we stripe over. (That's not atomic, but good enough for now.) + * + * If we get a short result from the OSD, check against i_size; we need to + * only return a short read to the caller if we hit EOF. + */ +static int striped_read(struct inode *inode, + u64 off, u64 len, + struct page **pages, int num_pages, + int *checkeof, bool o_direct, + unsigned long buf_align) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + u64 pos, this_len, left; + int io_align, page_align; + int pages_left; + int read; + struct page **page_pos; + int ret; + bool hit_stripe, was_short; + + /* + * we may need to do multiple reads. not atomic, unfortunately. + */ + pos = off; + left = len; + page_pos = pages; + pages_left = num_pages; + read = 0; + io_align = off & ~PAGE_MASK; + +more: + if (o_direct) + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; + this_len = left; + ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, pos, &this_len, + ci->i_truncate_seq, + ci->i_truncate_size, + page_pos, pages_left, page_align); + if (ret == -ENOENT) + ret = 0; + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; + dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, + ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + + if (ret >= 0) { + int didpages; + if (was_short && (pos + ret < inode->i_size)) { + int zlen = min(this_len - ret, + inode->i_size - pos - ret); + int zoff = (o_direct ? buf_align : io_align) + + read + ret; + dout(" zero gap %llu to %llu\n", + pos + ret, pos + ret + zlen); + ceph_zero_page_vector_range(zoff, zlen, pages); + ret += zlen; + } + + didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; + pos += ret; + read = pos - off; + left -= ret; + page_pos += didpages; + pages_left -= didpages; + + /* hit stripe and need continue*/ + if (left && hit_stripe && pos < inode->i_size) + goto more; + } + + if (read > 0) { + ret = read; + /* did we bounce off eof? */ + if (pos + left > inode->i_size) + *checkeof = CHECK_EOF; + } + + dout("striped_read returns %d\n", ret); + return ret; +} + +/* + * Completely synchronous read and write methods. Direct from __user + * buffer to osd, or directly to user pages (if O_DIRECT). + * + * If the read spans object boundary, just do multiple reads. + */ +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, + int *checkeof) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct page **pages; + u64 off = iocb->ki_pos; + int num_pages, ret; + size_t len = iov_iter_count(i); + + dout("sync_read on file %p %llu~%u %s\n", file, off, + (unsigned)len, + (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + if (!len) + return 0; + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, off, + off + len); + if (ret < 0) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + while (iov_iter_count(i)) { + size_t start; + ssize_t n; + + n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start); + if (n < 0) + return n; + + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; + + ret = striped_read(inode, off, n, + pages, num_pages, checkeof, + 1, start); + + ceph_put_page_vector(pages, num_pages, true); + + if (ret <= 0) + break; + off += ret; + iov_iter_advance(i, ret); + if (ret < n) + break; + } + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof, 0, 0); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, + PAGE_SIZE - page_off, left); + l = copy_page_to_iter(pages[k++], page_off, + copy, i); + off += l; + left -= l; + if (l < copy) + break; + } + } + ceph_release_page_vector(pages, num_pages); + } + + if (off > iocb->ki_pos) { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } + + dout("sync_read result %d\n", ret); + return ret; +} + +/* + * Write commit request unsafe callback, called to tell us when a + * request is unsafe (that is, in flight--has been handed to the + * messenger to send to its target osd). It is called again when + * we've received a response message indicating the request is + * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request + * is completed early (and unsuccessfully) due to a timeout or + * interrupt. + * + * This is used if we requested both an ACK and ONDISK commit reply + * from the OSD. + */ +static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) +{ + struct ceph_inode_info *ci = ceph_inode(req->r_inode); + + dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, + unsafe ? "un" : ""); + if (unsafe) { + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + } else { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + int num_pages; + int written = 0; + int flags; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + size_t count = iov_iter_count(from); + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_direct_write on file %p %lld~%u\n", file, pos, + (unsigned)count); + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + + while (iov_iter_count(from) > 0) { + u64 len = iov_iter_single_seg_count(from); + size_t start; + ssize_t n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 0, + 2,/*include a 'startsync' command*/ + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + + n = iov_iter_get_pages_alloc(from, &pages, len, &start); + if (unlikely(n < 0)) { + ret = n; + ceph_osdc_put_request(req); + break; + } + + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+n) | (PAGE_CACHE_SIZE-1)); + osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, + false, false); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_put_page_vector(pages, num_pages, false); + + ceph_osdc_put_request(req); + if (ret) + break; + pos += n; + written += n; + iov_iter_advance(from, n); + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } + + if (ret != -EOLDSNAPC && written > 0) { + iocb->ki_pos = pos; + ret = written; + } + return ret; +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + u64 len; + int num_pages; + int written = 0; + int flags; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + size_t count = iov_iter_count(from); + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK; + + while ((len = iov_iter_count(from)) > 0) { + size_t left; + int n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 0, 1, + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + /* + * write from beginning of first page, + * regardless of io alignment + */ + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out; + } + + left = len; + for (n = 0; n < num_pages; n++) { + size_t plen = min_t(size_t, left, PAGE_SIZE); + ret = copy_page_from_iter(pages[n], 0, plen, from); + if (ret != plen) { + ret = -EFAULT; + break; + } + left -= ret; + } + + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + goto out; + } + + /* get a second commit callback */ + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + false, true); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + +out: + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + + if (ret != -EOLDSNAPC && written > 0) { + ret = written; + iocb->ki_pos = pos; + } + return ret; +} + +/* + * Wrap generic_file_aio_read with checks for cap bits on the inode. + * Atomically grab references, so that those bits are not released + * back to the MDS mid-read. + * + * Hmm, the sync read case isn't actually async... should it be? + */ +static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *filp = iocb->ki_filp; + struct ceph_file_info *fi = filp->private_data; + size_t len = iov_iter_count(to); + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *pinned_page = NULL; + ssize_t ret; + int want, got = 0; + int retry_op = 0, read = 0; + +again: + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + if (ret < 0) + return ret; + + if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_flags & IOCB_DIRECT) || + (fi->flags & CEPH_F_SYNC)) { + + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + + if (ci->i_inline_version == CEPH_INLINE_NONE) { + /* hmm, this isn't really async... */ + ret = ceph_sync_read(iocb, to, &retry_op); + } else { + retry_op = READ_INLINE; + } + } else { + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + + ret = generic_file_read_iter(iocb, to); + } + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", + inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + if (pinned_page) { + page_cache_release(pinned_page); + pinned_page = NULL; + } + ceph_put_cap_refs(ci, got); + if (retry_op && ret >= 0) { + int statret; + struct page *page = NULL; + loff_t i_size; + if (retry_op == READ_INLINE) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + } + + statret = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, !!page); + if (statret < 0) { + __free_page(page); + if (statret == -ENODATA) { + BUG_ON(retry_op != READ_INLINE); + goto again; + } + return statret; + } + + i_size = i_size_read(inode); + if (retry_op == READ_INLINE) { + BUG_ON(ret > 0 || read > 0); + if (iocb->ki_pos < i_size && + iocb->ki_pos < PAGE_CACHE_SIZE) { + loff_t end = min_t(loff_t, i_size, + iocb->ki_pos + len); + end = min_t(loff_t, end, PAGE_CACHE_SIZE); + if (statret < end) + zero_user_segment(page, statret, end); + ret = copy_page_to_iter(page, + iocb->ki_pos & ~PAGE_MASK, + end - iocb->ki_pos, to); + iocb->ki_pos += ret; + read += ret; + } + if (iocb->ki_pos < i_size && read < len) { + size_t zlen = min_t(size_t, len - read, + i_size - iocb->ki_pos); + ret = iov_iter_zero(zlen, to); + iocb->ki_pos += ret; + read += ret; + } + __free_pages(page, 0); + return read; + } + + /* hit EOF or hole? */ + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && + ret < len) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, + inode->i_size); + + read += ret; + len -= ret; + retry_op = 0; + goto again; + } + } + + if (ret >= 0) + ret += read; + + return ret; +} + +/* + * Take cap references to avoid releasing caps to MDS mid-write. + * + * If we are synchronous, and write with an old snap context, the OSD + * may return EOLDSNAPC. In that case, retry the write.. _after_ + * dropping our cap refs and allowing the pending snap to logically + * complete _before_ this write occurs. + * + * If we are near ENOSPC, write synchronously. + */ +static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + ssize_t count, written = 0; + int err, want, got; + loff_t pos; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + + err = generic_write_checks(iocb, from); + if (err <= 0) + goto out; + + pos = iocb->ki_pos; + count = iov_iter_count(from); + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + err = ceph_uninline_data(file, NULL); + if (err < 0) + goto out; + } + +retry_snap: + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + err = -ENOSPC; + goto out; + } + + dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, count, inode->i_size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + got = 0; + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + &got, NULL); + if (err < 0) + goto out; + + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct iov_iter data; + mutex_unlock(&inode->i_mutex); + /* we might need to revert back to that point */ + data = *from; + if (iocb->ki_flags & IOCB_DIRECT) + written = ceph_sync_direct_write(iocb, &data, pos); + else + written = ceph_sync_write(iocb, &data, pos); + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" + "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), + pos, (unsigned)count); + mutex_lock(&inode->i_mutex); + goto retry_snap; + } + if (written > 0) + iov_iter_advance(from, written); + } else { + loff_t old_size = inode->i_size; + /* + * No need to acquire the i_truncate_mutex. Because + * the MDS revokes Fwb caps before sending truncate + * message to us. We can't get Fwb cap while there + * are pending vmtruncate. So write and vmtruncate + * can not run at the same time + */ + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; + if (inode->i_size > old_size) + ceph_fscache_update_objectsize(inode); + mutex_unlock(&inode->i_mutex); + } + + if (written >= 0) { + int dirty; + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)count, + ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + + if (written >= 0 && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + written - 1, 1); + if (err < 0) + written = err; + } + + goto out_unlocked; + +out: + mutex_unlock(&inode->i_mutex); +out_unlocked: + current->backing_dev_info = NULL; + return written ? written : err; +} + +/* + * llseek. be sure to verify file size on SEEK_END. + */ +static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + mutex_lock(&inode->i_mutex); + + if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + if (ret < 0) { + offset = ret; + goto out; + } + } + + switch (whence) { + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) { + offset = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + case SEEK_DATA: + if (offset >= inode->i_size) { + ret = -ENXIO; + goto out; + } + break; + case SEEK_HOLE: + if (offset >= inode->i_size) { + ret = -ENXIO; + goto out; + } + offset = inode->i_size; + break; + } + + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_CACHE_SIZE) { + loff_t size = round_down(length, PAGE_CACHE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (!length) { + op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 0, 1, op, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, + &inode->i_mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; + } + ceph_osdc_put_request(req); + +out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + s32 stripe_unit = ceph_file_layout_su(ci->i_layout); + s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + s32 object_size = ceph_file_layout_object_size(ci->i_layout); + u64 object_set_size = object_size * stripe_count; + u64 nearly, t; + + /* round offset up to next period boundary */ + nearly = offset + object_set_size - 1; + t = nearly; + nearly -= do_div(t, object_set_size); + + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + return ret; +} + +static long ceph_fallocate(struct file *file, int mode, + loff_t offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int want, got = 0; + int dirty; + int ret = 0; + loff_t endoff = 0; + loff_t size; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto unlock; + } + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { + ret = -ENOSPC; + goto unlock; + } + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + ret = ceph_uninline_data(file, NULL); + if (ret < 0) + goto unlock; + } + + size = i_size_read(inode); + if (!(mode & FALLOC_FL_KEEP_SIZE)) + endoff = offset + length; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + if (ret < 0) + goto unlock; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset < size) + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); + } else if (endoff > size) { + truncate_pagecache_range(inode, size, -1); + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, NULL); + } + + if (!ret) { + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + ceph_put_cap_refs(ci, got); +unlock: + mutex_unlock(&inode->i_mutex); + return ret; +} + +const struct file_operations ceph_file_fops = { + .open = ceph_open, + .release = ceph_release, + .llseek = ceph_llseek, + .read_iter = ceph_read_iter, + .write_iter = ceph_write_iter, + .mmap = ceph_mmap, + .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .unlocked_ioctl = ceph_ioctl, + .compat_ioctl = ceph_ioctl, + .fallocate = ceph_fallocate, +}; + diff --git a/kernel/fs/ceph/inode.c b/kernel/fs/ceph/inode.c new file mode 100644 index 000000000..e876e1944 --- /dev/null +++ b/kernel/fs/ceph/inode.c @@ -0,0 +1,2016 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include "cache.h" +#include + +/* + * Ceph inode operations + * + * Implement basic inode helpers (get, alloc) and inode ops (getattr, + * setattr, etc.), xattr helpers, and helpers for assimilating + * metadata returned by the MDS into our cache. + * + * Also define helpers for doing asynchronous writeback, invalidation, + * and truncation for the benefit of those who can't afford to block + * (typically because they are in the message handler path). + */ + +static const struct inode_operations ceph_symlink_iops; + +static void ceph_invalidate_work(struct work_struct *work); +static void ceph_writeback_work(struct work_struct *work); +static void ceph_vmtruncate_work(struct work_struct *work); + +/* + * find or create an inode, given the ceph ino number + */ +static int ceph_set_ino_cb(struct inode *inode, void *data) +{ + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + return 0; +} + +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +{ + struct inode *inode; + ino_t t = ceph_vino_to_ino(vino); + + inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode == NULL) + return ERR_PTR(-ENOMEM); + if (inode->i_state & I_NEW) { + dout("get_inode created new inode %p %llx.%llx ino %llx\n", + inode, ceph_vinop(inode), (u64)inode->i_ino); + unlock_new_inode(inode); + } + + dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, + vino.snap, inode); + return inode; +} + +/* + * get/constuct snapdir inode for a given directory + */ +struct inode *ceph_get_snapdir(struct inode *parent) +{ + struct ceph_vino vino = { + .ino = ceph_ino(parent), + .snap = CEPH_SNAPDIR, + }; + struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct ceph_inode_info *ci = ceph_inode(inode); + + BUG_ON(!S_ISDIR(parent->i_mode)); + if (IS_ERR(inode)) + return inode; + inode->i_mode = parent->i_mode; + inode->i_uid = parent->i_uid; + inode->i_gid = parent->i_gid; + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ + ci->i_rbytes = 0; + return inode; +} + +const struct inode_operations ceph_file_iops = { + .permission = ceph_permission, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, +}; + + +/* + * We use a 'frag tree' to keep track of the MDS's directory fragments + * for a given inode (usually there is just a single fragment). We + * need to know when a child frag is delegated to a new MDS, or when + * it is flagged as replicated, so we can direct our requests + * accordingly. + */ + +/* + * find/create a frag in the tree + */ +static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, + u32 f) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_frag *frag; + int c; + + p = &ci->i_fragtree.rb_node; + while (*p) { + parent = *p; + frag = rb_entry(parent, struct ceph_inode_frag, node); + c = ceph_frag_compare(f, frag->frag); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else + return frag; + } + + frag = kmalloc(sizeof(*frag), GFP_NOFS); + if (!frag) { + pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " + "frag %x\n", &ci->vfs_inode, + ceph_vinop(&ci->vfs_inode), f); + return ERR_PTR(-ENOMEM); + } + frag->frag = f; + frag->split_by = 0; + frag->mds = -1; + frag->ndist = 0; + + rb_link_node(&frag->node, parent, p); + rb_insert_color(&frag->node, &ci->i_fragtree); + + dout("get_or_create_frag added %llx.%llx frag %x\n", + ceph_vinop(&ci->vfs_inode), f); + return frag; +} + +/* + * find a specific frag @f + */ +struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) +{ + struct rb_node *n = ci->i_fragtree.rb_node; + + while (n) { + struct ceph_inode_frag *frag = + rb_entry(n, struct ceph_inode_frag, node); + int c = ceph_frag_compare(f, frag->frag); + if (c < 0) + n = n->rb_left; + else if (c > 0) + n = n->rb_right; + else + return frag; + } + return NULL; +} + +/* + * Choose frag containing the given value @v. If @pfrag is + * specified, copy the frag delegation info to the caller if + * it is present. + */ +static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 t = ceph_frag_make(0, 0); + struct ceph_inode_frag *frag; + unsigned nway, i; + u32 n; + + if (found) + *found = 0; + + while (1) { + WARN_ON(!ceph_frag_contains_value(t, v)); + frag = __ceph_find_frag(ci, t); + if (!frag) + break; /* t is a leaf */ + if (frag->split_by == 0) { + if (pfrag) + memcpy(pfrag, frag, sizeof(*pfrag)); + if (found) + *found = 1; + break; + } + + /* choose child */ + nway = 1 << frag->split_by; + dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, + frag->split_by, nway); + for (i = 0; i < nway; i++) { + n = ceph_frag_make_child(t, frag->split_by, i); + if (ceph_frag_contains_value(n, v)) { + t = n; + break; + } + } + BUG_ON(i == nway); + } + dout("choose_frag(%x) = %x\n", v, t); + + return t; +} + +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 ret; + mutex_lock(&ci->i_fragtree_mutex); + ret = __ceph_choose_frag(ci, v, pfrag, found); + mutex_unlock(&ci->i_fragtree_mutex); + return ret; +} + +/* + * Process dirfrag (delegation) info from the mds. Include leaf + * fragment in tree ONLY if ndist > 0. Otherwise, only + * branches/splits are included in i_fragtree) + */ +static int ceph_fill_dirfrag(struct inode *inode, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + u32 id = le32_to_cpu(dirinfo->frag); + int mds = le32_to_cpu(dirinfo->auth); + int ndist = le32_to_cpu(dirinfo->ndist); + int diri_auth = -1; + int i; + int err = 0; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + diri_auth = ci->i_auth_cap->mds; + spin_unlock(&ci->i_ceph_lock); + + mutex_lock(&ci->i_fragtree_mutex); + if (ndist == 0 && mds == diri_auth) { + /* no delegation info needed. */ + frag = __ceph_find_frag(ci, id); + if (!frag) + goto out; + if (frag->split_by == 0) { + /* tree leaf, remove */ + dout("fill_dirfrag removed %llx.%llx frag %x" + " (no ref)\n", ceph_vinop(inode), id); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } else { + /* tree branch, keep and clear */ + dout("fill_dirfrag cleared %llx.%llx frag %x" + " referral\n", ceph_vinop(inode), id); + frag->mds = -1; + frag->ndist = 0; + } + goto out; + } + + + /* find/add this frag to store mds delegation info */ + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) { + /* this is not the end of the world; we can continue + with bad/inaccurate delegation info */ + pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", + ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); + err = -ENOMEM; + goto out; + } + + frag->mds = mds; + frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); + for (i = 0; i < frag->ndist; i++) + frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); + dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", + ceph_vinop(inode), frag->frag, frag->ndist); + +out: + mutex_unlock(&ci->i_fragtree_mutex); + return err; +} + +static int ceph_fill_fragtree(struct inode *inode, + struct ceph_frag_tree_head *fragtree, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *rb_node; + int i; + u32 id, nsplits; + bool update = false; + + mutex_lock(&ci->i_fragtree_mutex); + nsplits = le32_to_cpu(fragtree->nsplits); + if (nsplits) { + i = prandom_u32() % nsplits; + id = le32_to_cpu(fragtree->splits[i].frag); + if (!__ceph_find_frag(ci, id)) + update = true; + } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { + rb_node = rb_first(&ci->i_fragtree); + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) + update = true; + } + if (!update && dirinfo) { + id = le32_to_cpu(dirinfo->frag); + if (id != __ceph_choose_frag(ci, id, NULL, NULL)) + update = true; + } + if (!update) + goto out_unlock; + + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); + rb_node = rb_first(&ci->i_fragtree); + for (i = 0; i < nsplits; i++) { + id = le32_to_cpu(fragtree->splits[i].frag); + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } + frag->split_by = le32_to_cpu(fragtree->splits[i].by); + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } +out_unlock: + mutex_unlock(&ci->i_fragtree_mutex); + return 0; +} + +/* + * initialize a newly allocated inode. + */ +struct inode *ceph_alloc_inode(struct super_block *sb) +{ + struct ceph_inode_info *ci; + int i; + + ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + if (!ci) + return NULL; + + dout("alloc_inode %p\n", &ci->vfs_inode); + + spin_lock_init(&ci->i_ceph_lock); + + ci->i_version = 0; + ci->i_inline_version = 0; + ci->i_time_warp_seq = 0; + ci->i_ceph_flags = 0; + ci->i_ordered_count = 0; + atomic_set(&ci->i_release_count, 1); + atomic_set(&ci->i_complete_count, 0); + ci->i_symlink = NULL; + + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + + ci->i_fragtree = RB_ROOT; + mutex_init(&ci->i_fragtree_mutex); + + ci->i_xattrs.blob = NULL; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.index = RB_ROOT; + ci->i_xattrs.count = 0; + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.version = 0; + ci->i_xattrs.index_version = 0; + + ci->i_caps = RB_ROOT; + ci->i_auth_cap = NULL; + ci->i_dirty_caps = 0; + ci->i_flushing_caps = 0; + INIT_LIST_HEAD(&ci->i_dirty_item); + INIT_LIST_HEAD(&ci->i_flushing_item); + ci->i_cap_flush_seq = 0; + ci->i_cap_flush_last_tid = 0; + memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + init_waitqueue_head(&ci->i_cap_wq); + ci->i_hold_caps_min = 0; + ci->i_hold_caps_max = 0; + INIT_LIST_HEAD(&ci->i_cap_delay_list); + INIT_LIST_HEAD(&ci->i_cap_snaps); + ci->i_head_snapc = NULL; + ci->i_snap_caps = 0; + + for (i = 0; i < CEPH_FILE_MODE_NUM; i++) + ci->i_nr_by_mode[i] = 0; + + mutex_init(&ci->i_truncate_mutex); + ci->i_truncate_seq = 0; + ci->i_truncate_size = 0; + ci->i_truncate_pending = 0; + + ci->i_max_size = 0; + ci->i_reported_size = 0; + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + + ci->i_pin_ref = 0; + ci->i_rd_ref = 0; + ci->i_rdcache_ref = 0; + ci->i_wr_ref = 0; + ci->i_wb_ref = 0; + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + ci->i_shared_gen = 0; + ci->i_rdcache_gen = 0; + ci->i_rdcache_revoking = 0; + + INIT_LIST_HEAD(&ci->i_unsafe_writes); + INIT_LIST_HEAD(&ci->i_unsafe_dirops); + spin_lock_init(&ci->i_unsafe_lock); + + ci->i_snap_realm = NULL; + INIT_LIST_HEAD(&ci->i_snap_realm_item); + INIT_LIST_HEAD(&ci->i_snap_flush_item); + + INIT_WORK(&ci->i_wb_work, ceph_writeback_work); + INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work); + + INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); + + ceph_fscache_inode_init(ci); + + return &ci->vfs_inode; +} + +static void ceph_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ceph_inode_info *ci = ceph_inode(inode); + + kmem_cache_free(ceph_inode_cachep, ci); +} + +void ceph_destroy_inode(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *n; + + dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + + ceph_fscache_unregister_inode_cookie(ci); + + ceph_queue_caps_release(inode); + + /* + * we may still have a snap_realm reference if there are stray + * caps in i_snap_caps. + */ + if (ci->i_snap_realm) { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_snap_realm *realm = ci->i_snap_realm; + + dout(" dropping residual ref to snap realm %p\n", realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } + + kfree(ci->i_symlink); + while ((n = rb_first(&ci->i_fragtree)) != NULL) { + frag = rb_entry(n, struct ceph_inode_frag, node); + rb_erase(n, &ci->i_fragtree); + kfree(frag); + } + + __ceph_destroy_xattrs(ci); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + + call_rcu(&inode->i_rcu, ceph_i_callback); +} + +int ceph_drop_inode(struct inode *inode) +{ + /* + * Positve dentry and corresponding inode are always accompanied + * in MDS reply. So no need to keep inode in the cache after + * dropping all its aliases. + */ + return 1; +} + +/* + * Helpers to fill in size, ctime, mtime, and atime. We have to be + * careful because either the client or MDS may have more up to date + * info, depending on which capabilities are held, and whether + * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime + * and size are monotonically increasing, except when utimes() or + * truncate() increments the corresponding _seq values.) + */ +int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int queue_trunc = 0; + + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || + (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { + dout("size %lld -> %llu\n", inode->i_size, size); + inode->i_size = size; + inode->i_blocks = (size + (1<<9) - 1) >> 9; + ci->i_reported_size = size; + if (truncate_seq != ci->i_truncate_seq) { + dout("truncate_seq %u -> %u\n", + ci->i_truncate_seq, truncate_seq); + ci->i_truncate_seq = truncate_seq; + + /* the MDS should have revoked these caps */ + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_LAZYIO)); + /* + * If we hold relevant caps, or in the case where we're + * not the only client referencing this file and we + * don't hold those caps, then we need to check whether + * the file is either opened or mmaped + */ + if ((issued & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_BUFFER)) || + mapping_mapped(inode->i_mapping) || + __ceph_caps_file_wanted(ci)) { + ci->i_truncate_pending++; + queue_trunc = 1; + } + } + } + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && + ci->i_truncate_size != truncate_size) { + dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, + truncate_size); + ci->i_truncate_size = truncate_size; + } + + if (queue_trunc) + ceph_fscache_invalidate(inode); + + return queue_trunc; +} + +void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec *ctime, + struct timespec *mtime, struct timespec *atime) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int warn = 0; + + if (issued & (CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_WR| + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { + if (timespec_compare(ctime, &inode->i_ctime) > 0) { + dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + ctime->tv_sec, ctime->tv_nsec); + inode->i_ctime = *ctime; + } + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + /* the MDS did a utimes() */ + dout("mtime %ld.%09ld -> %ld.%09ld " + "tw %d -> %d\n", + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec, + ci->i_time_warp_seq, (int)time_warp_seq); + + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else if (time_warp_seq == ci->i_time_warp_seq) { + /* nobody did utimes(); take the max */ + if (timespec_compare(mtime, &inode->i_mtime) > 0) { + dout("mtime %ld.%09ld -> %ld.%09ld inc\n", + inode->i_mtime.tv_sec, + inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec); + inode->i_mtime = *mtime; + } + if (timespec_compare(atime, &inode->i_atime) > 0) { + dout("atime %ld.%09ld -> %ld.%09ld inc\n", + inode->i_atime.tv_sec, + inode->i_atime.tv_nsec, + atime->tv_sec, atime->tv_nsec); + inode->i_atime = *atime; + } + } else if (issued & CEPH_CAP_FILE_EXCL) { + /* we did a utimes(); ignore mds values */ + } else { + warn = 1; + } + } else { + /* we have no write|excl caps; whatever the MDS says is true */ + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { + inode->i_ctime = *ctime; + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else { + warn = 1; + } + } + if (warn) /* time_warp_seq shouldn't go backwards */ + dout("%p mds time_warp_seq %llu < %u\n", + inode, time_warp_seq, ci->i_time_warp_seq); +} + +/* + * Populate an inode based on info from mds. May be called on new or + * existing inodes. + */ +static int fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, + unsigned long ttl_from, int cap_fmode, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_reply_inode *info = iinfo->in; + struct ceph_inode_info *ci = ceph_inode(inode); + int issued = 0, implemented, new_issued; + struct timespec mtime, atime, ctime; + struct ceph_buffer *xattr_blob = NULL; + struct ceph_cap *new_cap = NULL; + int err = 0; + bool wake = false; + bool queue_trunc = false; + bool new_version = false; + bool fill_inline = false; + + dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", + inode, ceph_vinop(inode), le64_to_cpu(info->version), + ci->i_version); + + /* prealloc new cap struct */ + if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) + new_cap = ceph_get_cap(mdsc, caps_reservation); + + /* + * prealloc xattr data, if it looks like we'll need it. only + * if len > 4 (meaning there are actually xattrs; the first 4 + * bytes are the xattr count). + */ + if (iinfo->xattr_len > 4) { + xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); + if (!xattr_blob) + pr_err("fill_inode ENOMEM xattr blob %d bytes\n", + iinfo->xattr_len); + } + + spin_lock(&ci->i_ceph_lock); + + /* + * provided version will be odd if inode value is projected, + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update + */ + if (ci->i_version == 0 || + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + le64_to_cpu(info->version) > (ci->i_version & ~1))) + new_version = true; + + issued = __ceph_caps_issued(ci, &implemented); + issued |= implemented | __ceph_caps_dirty(ci); + new_issued = ~issued & le32_to_cpu(info->cap.caps); + + /* update inode */ + ci->i_version = le64_to_cpu(info->version); + inode->i_version++; + inode->i_rdev = le32_to_cpu(info->rdev); + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(info->mode); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); + } + + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + (issued & CEPH_CAP_LINK_EXCL) == 0) + set_nlink(inode, le32_to_cpu(info->nlink)); + + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + /* be careful with mtime, atime, size */ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + } + + /* xattrs */ + /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ + if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && + le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = xattr_blob; + if (xattr_blob) + memcpy(ci->i_xattrs.blob->vec.iov_base, + iinfo->xattr_data, iinfo->xattr_len); + ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); + xattr_blob = NULL; + } + + inode->i_mapping->a_ops = &ceph_aops; + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_op = &ceph_file_iops; + break; + case S_IFREG: + inode->i_op = &ceph_file_iops; + inode->i_fop = &ceph_file_fops; + break; + case S_IFLNK: + inode->i_op = &ceph_symlink_iops; + if (!ci->i_symlink) { + u32 symlen = iinfo->symlink_len; + char *sym; + + spin_unlock(&ci->i_ceph_lock); + + err = -EINVAL; + if (WARN_ON(symlen != inode->i_size)) + goto out; + + err = -ENOMEM; + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); + if (!sym) + goto out; + + spin_lock(&ci->i_ceph_lock); + if (!ci->i_symlink) + ci->i_symlink = sym; + else + kfree(sym); /* lost a race */ + } + break; + case S_IFDIR: + inode->i_op = &ceph_dir_iops; + inode->i_fop = &ceph_dir_fops; + + ci->i_dir_layout = iinfo->dir_layout; + + ci->i_files = le64_to_cpu(info->files); + ci->i_subdirs = le64_to_cpu(info->subdirs); + ci->i_rbytes = le64_to_cpu(info->rbytes); + ci->i_rfiles = le64_to_cpu(info->rfiles); + ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ceph_decode_timespec(&ci->i_rctime, &info->rctime); + break; + default: + pr_err("fill_inode %llx.%llx BAD mode 0%o\n", + ceph_vinop(inode), inode->i_mode); + } + + /* were we issued a capability? */ + if (info->cap.caps) { + if (ceph_snap(inode) == CEPH_NOSNAP) { + unsigned caps = le32_to_cpu(info->cap.caps); + ceph_add_cap(inode, session, + le64_to_cpu(info->cap.cap_id), + cap_fmode, caps, + le32_to_cpu(info->cap.wanted), + le32_to_cpu(info->cap.seq), + le32_to_cpu(info->cap.mseq), + le64_to_cpu(info->cap.realm), + info->cap.flags, &new_cap); + + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + (caps & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, + atomic_read(&ci->i_release_count), + ci->i_ordered_count); + } + + wake = true; + } else { + dout(" %p got snap_caps %s\n", inode, + ceph_cap_string(le32_to_cpu(info->cap.caps))); + ci->i_snap_caps |= le32_to_cpu(info->cap.caps); + if (cap_fmode >= 0) + __ceph_get_fmode(ci, cap_fmode); + } + } else if (cap_fmode >= 0) { + pr_warn("mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_get_fmode(ci, cap_fmode); + } + + if (iinfo->inline_version > 0 && + iinfo->inline_version >= ci->i_inline_version) { + int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + ci->i_inline_version = iinfo->inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (locked_page || + (le32_to_cpu(info->cap.caps) & cache_caps))) + fill_inline = true; + } + + spin_unlock(&ci->i_ceph_lock); + + if (fill_inline) + ceph_fill_inline_data(inode, locked_page, + iinfo->inline_data, iinfo->inline_len); + + if (wake) + wake_up_all(&ci->i_cap_wq); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + if (S_ISDIR(inode->i_mode)) + ceph_fill_fragtree(inode, &info->fragtree, dirinfo); + + /* update delegation info? */ + if (dirinfo) + ceph_fill_dirfrag(inode, dirinfo); + + err = 0; +out: + if (new_cap) + ceph_put_cap(mdsc, new_cap); + if (xattr_blob) + ceph_buffer_put(xattr_blob); + return err; +} + +/* + * caller should hold session s_mutex. + */ +static void update_dentry_lease(struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + long unsigned duration = le32_to_cpu(lease->duration_ms); + long unsigned ttl = from_time + (duration * HZ) / 1000; + long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; + struct inode *dir; + + /* only track leases on regular dentries */ + if (dentry->d_op != &ceph_dentry_ops) + return; + + spin_lock(&dentry->d_lock); + dout("update_dentry_lease %p duration %lu ms ttl %lu\n", + dentry, duration, ttl); + + /* make lease_rdcache_gen match directory */ + dir = d_inode(dentry->d_parent); + di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; + + if (duration == 0) + goto out_unlock; + + if (di->lease_gen == session->s_cap_gen && + time_before(ttl, dentry->d_time)) + goto out_unlock; /* we already have a newer lease. */ + + if (di->lease_session && di->lease_session != session) + goto out_unlock; + + ceph_dentry_lru_touch(dentry); + + if (!di->lease_session) + di->lease_session = ceph_get_mds_session(session); + di->lease_gen = session->s_cap_gen; + di->lease_seq = le32_to_cpu(lease->seq); + di->lease_renew_after = half_ttl; + di->lease_renew_from = 0; + dentry->d_time = ttl; +out_unlock: + spin_unlock(&dentry->d_lock); + return; +} + +/* + * splice a dentry to an inode. + * caller must hold directory i_mutex for this to be safe. + * + * we will only rehash the resulting dentry if @prehash is + * true; @prehash will be set to false (for the benefit of + * the caller) if we fail. + */ +static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, + bool *prehash) +{ + struct dentry *realdn; + + BUG_ON(d_inode(dn)); + + /* dn must be unhashed */ + if (!d_unhashed(dn)) + d_drop(dn); + realdn = d_splice_alias(in, dn); + if (IS_ERR(realdn)) { + pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); + if (prehash) + *prehash = false; /* don't rehash on error */ + dn = realdn; /* note realdn contains the error */ + goto out; + } else if (realdn) { + dout("dn %p (%d) spliced with %p (%d) " + "inode %p ino %llx.%llx\n", + dn, d_count(dn), + realdn, d_count(realdn), + d_inode(realdn), ceph_vinop(d_inode(realdn))); + dput(dn); + dn = realdn; + } else { + BUG_ON(!ceph_dentry(dn)); + dout("dn %p attached to %p ino %llx.%llx\n", + dn, d_inode(dn), ceph_vinop(d_inode(dn))); + } + if ((!prehash || *prehash) && d_unhashed(dn)) + d_rehash(dn); +out: + return dn; +} + +/* + * Incorporate results into the local cache. This is either just + * one inode, or a directory, dentry, and possibly linked-to inode (e.g., + * after a lookup). + * + * A reply may contain + * a directory inode along with a dentry. + * and/or a target inode + * + * Called with snap_rwsem (read). + */ +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct inode *in = NULL; + struct ceph_vino vino; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int err = 0; + + dout("fill_trace %p is_dentry %d is_target %d\n", req, + rinfo->head->is_dentry, rinfo->head->is_target); + +#if 0 + /* + * Debugging hook: + * + * If we resend completed ops to a recovering mds, we get no + * trace. Since that is very rare, pretend this is the case + * to ensure the 'no trace' handlers in the callers behave. + * + * Fill in inodes unconditionally to avoid breaking cap + * invariants. + */ + if (rinfo->head->op & CEPH_MDS_OP_WRITE) { + pr_info("fill_trace faking empty trace on %lld %s\n", + req->r_tid, ceph_mds_op_name(rinfo->head->op)); + if (rinfo->head->is_dentry) { + rinfo->head->is_dentry = 0; + err = fill_inode(req->r_locked_dir, + &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1); + } + if (rinfo->head->is_target) { + rinfo->head->is_target = 0; + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = ceph_get_inode(sb, vino); + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + req->r_fmode); + iput(in); + } + } +#endif + + if (!rinfo->head->is_target && !rinfo->head->is_dentry) { + dout("fill_trace reply is empty!\n"); + if (rinfo->head->result == 0 && req->r_locked_dir) + ceph_invalidate_dir_request(req); + return 0; + } + + if (rinfo->head->is_dentry) { + struct inode *dir = req->r_locked_dir; + + if (dir) { + err = fill_inode(dir, NULL, + &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1, + &req->r_caps_reservation); + if (err < 0) + goto done; + } else { + WARN_ON_ONCE(1); + } + + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + struct qstr dname; + struct dentry *dn, *parent; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(dir); + BUG_ON(!parent); + + dname.name = rinfo->dname; + dname.len = rinfo->dname_len; + dname.hash = full_name_hash(dname.name, dname.len); + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dput(parent); + err = -ENOMEM; + goto done; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + dput(parent); + goto done; + } + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != vino.ino || + ceph_snap(d_inode(dn)) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, d_inode(dn)); + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + req->r_dentry = dn; + dput(parent); + } + } + + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + req->r_target_inode = in; + + err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, + session, req->r_request_started, + (!req->r_aborted && rinfo->head->result == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } + } + + /* + * ignore null lease/binding on snapdir ENOENT, or else we + * will have trouble splicing in the virtual snapdir later + */ + if (rinfo->head->is_dentry && !req->r_aborted && + req->r_locked_dir && + (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, + fsc->mount_options->snapdir_name, + req->r_dentry->d_name.len))) { + /* + * lookup link rename : null -> possibly existing inode + * mknod symlink mkdir : null -> new inode + * unlink : linked -> null + */ + struct inode *dir = req->r_locked_dir; + struct dentry *dn = req->r_dentry; + bool have_dir_cap, have_lease; + + BUG_ON(!dn); + BUG_ON(!dir); + BUG_ON(d_inode(dn->d_parent) != dir); + BUG_ON(ceph_ino(dir) != + le64_to_cpu(rinfo->diri.in->ino)); + BUG_ON(ceph_snap(dir) != + le64_to_cpu(rinfo->diri.in->snapid)); + + /* do we have a lease on the whole dir? */ + have_dir_cap = + (le32_to_cpu(rinfo->diri.in->cap.caps) & + CEPH_CAP_FILE_SHARED); + + /* do we have a dn lease? */ + have_lease = have_dir_cap || + le32_to_cpu(rinfo->dlease->duration_ms); + if (!have_lease) + dout("fill_trace no dentry lease or dir cap\n"); + + /* rename? */ + if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + + dout(" src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, + req->r_old_dentry, + dn, dn); + dout("fill_trace doing d_move %p -> %p\n", + req->r_old_dentry, dn); + + d_move(req->r_old_dentry, dn); + dout(" src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, + req->r_old_dentry, + dn, dn); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + ceph_invalidate_dentry_lease(dn); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); + + dout("dn %p gets new offset %lld\n", req->r_old_dentry, + ceph_dentry(req->r_old_dentry)->offset); + + dn = req->r_old_dentry; /* use old_dentry */ + } + + /* null dentry? */ + if (!rinfo->head->is_target) { + dout("fill_trace null dentry\n"); + if (d_really_is_positive(dn)) { + ceph_dir_clear_ordered(dir); + dout("d_delete %p\n", dn); + d_delete(dn); + } else { + dout("d_instantiate %p NULL\n", dn); + d_instantiate(dn, NULL); + if (have_lease && d_unhashed(dn)) + d_rehash(dn); + update_dentry_lease(dn, rinfo->dlease, + session, + req->r_request_started); + } + goto done; + } + + /* attach proper inode */ + if (d_really_is_negative(dn)) { + ceph_dir_clear_ordered(dir); + ihold(in); + dn = splice_dentry(dn, in, &have_lease); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + goto done; + } + req->r_dentry = dn; /* may have spliced */ + } else if (d_really_is_positive(dn) && d_inode(dn) != in) { + dout(" %p links to %p %llx.%llx, not %llx.%llx\n", + dn, d_inode(dn), ceph_vinop(d_inode(dn)), + ceph_vinop(in)); + have_lease = false; + } + + if (have_lease) + update_dentry_lease(dn, rinfo->dlease, session, + req->r_request_started); + dout(" final dn %p\n", dn); + } else if (!req->r_aborted && + (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP)) { + struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; + + /* fill out a snapdir LOOKUPSNAP dentry */ + BUG_ON(!dn); + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); + dout(" linking snapped dir %p to dn %p\n", in, dn); + ceph_dir_clear_ordered(dir); + ihold(in); + dn = splice_dentry(dn, in, NULL); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + goto done; + } + req->r_dentry = dn; /* may have spliced */ + } +done: + dout("fill_trace done err=%d\n", err); + return err; +} + +/* + * Prepopulate our cache with readdir results, leases, etc. + */ +static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + int i, err = 0; + + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_vino vino; + struct inode *in; + int rc; + + vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); + vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + + in = ceph_get_inode(req->r_dentry->d_sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + dout("new_inode badness got %d\n", err); + continue; + } + rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation); + if (rc < 0) { + pr_err("fill_inode badness on %p got %d\n", in, rc); + err = rc; + continue; + } + } + + return err; +} + +int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct dentry *parent = req->r_dentry; + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct qstr dname; + struct dentry *dn; + struct inode *in; + int err = 0, ret, i; + struct inode *snapdir = NULL; + struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; + struct ceph_dentry_info *di; + u64 r_readdir_offset = req->r_readdir_offset; + u32 frag = le32_to_cpu(rhead->args.readdir.frag); + + if (rinfo->dir_dir && + le32_to_cpu(rinfo->dir_dir->frag) != frag) { + dout("readdir_prepopulate got new frag %x -> %x\n", + frag, le32_to_cpu(rinfo->dir_dir->frag)); + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + r_readdir_offset = 2; + else + r_readdir_offset = 0; + } + + if (req->r_aborted) + return readdir_prepopulate_inodes_only(req, session); + + if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { + snapdir = ceph_get_snapdir(d_inode(parent)); + parent = d_find_alias(snapdir); + dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", + rinfo->dir_nr, parent); + } else { + dout("readdir_prepopulate %d items under dn %p\n", + rinfo->dir_nr, parent); + if (rinfo->dir_dir) + ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); + } + + /* FIXME: release caps/leases if error occurs */ + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_vino vino; + + dname.name = rinfo->dir_dname[i]; + dname.len = rinfo->dir_dname_len[i]; + dname.hash = full_name_hash(dname.name, dname.len); + + vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); + vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dout("d_alloc badness\n"); + err = -ENOMEM; + goto out; + } + ret = ceph_init_dentry(dn); + if (ret < 0) { + dput(dn); + err = ret; + goto out; + } + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != vino.ino || + ceph_snap(d_inode(dn)) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, d_inode(dn)); + d_delete(dn); + dput(dn); + goto retry_lookup; + } else { + /* reorder parent's d_subdirs */ + spin_lock(&parent->d_lock); + spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&dn->d_child, &parent->d_subdirs); + spin_unlock(&dn->d_lock); + spin_unlock(&parent->d_lock); + } + + /* inode */ + if (d_really_is_positive(dn)) { + in = d_inode(dn); + } else { + in = ceph_get_inode(parent->d_sb, vino); + if (IS_ERR(in)) { + dout("new_inode badness\n"); + d_drop(dn); + dput(dn); + err = PTR_ERR(in); + goto out; + } + } + + if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation) < 0) { + pr_err("fill_inode badness on %p\n", in); + if (d_really_is_negative(dn)) + iput(in); + d_drop(dn); + goto next_item; + } + + if (d_really_is_negative(dn)) { + struct dentry *realdn = splice_dentry(dn, in, NULL); + if (IS_ERR(realdn)) { + err = PTR_ERR(realdn); + d_drop(dn); + dn = NULL; + goto next_item; + } + dn = realdn; + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); +next_item: + if (dn) + dput(dn); + } + if (err == 0) + req->r_did_prepopulate = true; + +out: + if (snapdir) { + iput(snapdir); + dput(parent); + } + dout("readdir_prepopulate done\n"); + return err; +} + +int ceph_inode_set_size(struct inode *inode, loff_t size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret = 0; + + spin_lock(&ci->i_ceph_lock); + dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); + inode->i_size = size; + inode->i_blocks = (size + (1 << 9) - 1) >> 9; + + /* tell the MDS if we are approaching max_size */ + if ((size << 1) >= ci->i_max_size && + (ci->i_reported_size << 1) < ci->i_max_size) + ret = 1; + + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +/* + * Write back inode data in a worker thread. (This can't be done + * in the message handler context.) + */ +void ceph_queue_writeback(struct inode *inode) +{ + ihold(inode); + if (queue_work(ceph_inode_to_client(inode)->wb_wq, + &ceph_inode(inode)->i_wb_work)) { + dout("ceph_queue_writeback %p\n", inode); + } else { + dout("ceph_queue_writeback %p failed\n", inode); + iput(inode); + } +} + +static void ceph_writeback_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_wb_work); + struct inode *inode = &ci->vfs_inode; + + dout("writeback %p\n", inode); + filemap_fdatawrite(&inode->i_data); + iput(inode); +} + +/* + * queue an async invalidation + */ +void ceph_queue_invalidate(struct inode *inode) +{ + ihold(inode); + if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, + &ceph_inode(inode)->i_pg_inv_work)) { + dout("ceph_queue_invalidate %p\n", inode); + } else { + dout("ceph_queue_invalidate %p failed\n", inode); + iput(inode); + } +} + +/* + * Invalidate inode pages in a worker thread. (This can't be done + * in the message handler context.) + */ +static void ceph_invalidate_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_pg_inv_work); + struct inode *inode = &ci->vfs_inode; + u32 orig_gen; + int check = 0; + + mutex_lock(&ci->i_truncate_mutex); + spin_lock(&ci->i_ceph_lock); + dout("invalidate_pages %p gen %d revoking %d\n", inode, + ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + orig_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); + + truncate_pagecache(inode, 0); + + spin_lock(&ci->i_ceph_lock); + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { + dout("invalidate_pages %p gen %d successful\n", inode, + ci->i_rdcache_gen); + ci->i_rdcache_revoking--; + check = 1; + } else { + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; + } + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); +out: + if (check) + ceph_check_caps(ci, 0, NULL); + iput(inode); +} + + +/* + * called by trunc_wq; + * + * We also truncate in a separate thread as well. + */ +static void ceph_vmtruncate_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_vmtruncate_work); + struct inode *inode = &ci->vfs_inode; + + dout("vmtruncate_work %p\n", inode); + __ceph_do_pending_vmtruncate(inode); + iput(inode); +} + +/* + * Queue an async vmtruncate. If we fail to queue work, we will handle + * the truncation the next time we call __ceph_do_pending_vmtruncate. + */ +void ceph_queue_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + ihold(inode); + + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, + &ci->i_vmtruncate_work)) { + dout("ceph_queue_vmtruncate %p\n", inode); + } else { + dout("ceph_queue_vmtruncate %p failed, pending=%d\n", + inode, ci->i_truncate_pending); + iput(inode); + } +} + +/* + * Make sure any pending truncation is applied before doing anything + * that may depend on it. + */ +void __ceph_do_pending_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 to; + int wrbuffer_refs, finish = 0; + + mutex_lock(&ci->i_truncate_mutex); +retry: + spin_lock(&ci->i_ceph_lock); + if (ci->i_truncate_pending == 0) { + dout("__do_pending_vmtruncate %p none pending\n", inode); + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); + return; + } + + /* + * make sure any dirty snapped pages are flushed before we + * possibly truncate them.. so write AND block! + */ + if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + dout("__do_pending_vmtruncate %p flushing snaps first\n", + inode); + spin_unlock(&ci->i_ceph_lock); + filemap_write_and_wait_range(&inode->i_data, 0, + inode->i_sb->s_maxbytes); + goto retry; + } + + /* there should be no reader or writer */ + WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); + + to = ci->i_truncate_size; + wrbuffer_refs = ci->i_wrbuffer_ref; + dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, + ci->i_truncate_pending, to); + spin_unlock(&ci->i_ceph_lock); + + truncate_pagecache(inode, to); + + spin_lock(&ci->i_ceph_lock); + if (to == ci->i_truncate_size) { + ci->i_truncate_pending = 0; + finish = 1; + } + spin_unlock(&ci->i_ceph_lock); + if (!finish) + goto retry; + + mutex_unlock(&ci->i_truncate_mutex); + + if (wrbuffer_refs == 0) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + + wake_up_all(&ci->i_cap_wq); +} + +/* + * symlinks + */ +static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ceph_inode_info *ci = ceph_inode(d_inode(dentry)); + nd_set_link(nd, ci->i_symlink); + return NULL; +} + +static const struct inode_operations ceph_symlink_iops = { + .readlink = generic_readlink, + .follow_link = ceph_sym_follow_link, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, +}; + +/* + * setattr + */ +int ceph_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct ceph_inode_info *ci = ceph_inode(inode); + const unsigned int ia_valid = attr->ia_valid; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + int issued; + int release = 0, dirtied = 0; + int mask = 0; + int err = 0; + int inode_dirty_flags = 0; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + err = inode_change_ok(inode, attr); + if (err != 0) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (ia_valid & ATTR_UID) { + dout("setattr %p uid %d -> %d\n", inode, + from_kuid(&init_user_ns, inode->i_uid), + from_kuid(&init_user_ns, attr->ia_uid)); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_uid = attr->ia_uid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + !uid_eq(attr->ia_uid, inode->i_uid)) { + req->r_args.setattr.uid = cpu_to_le32( + from_kuid(&init_user_ns, attr->ia_uid)); + mask |= CEPH_SETATTR_UID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_GID) { + dout("setattr %p gid %d -> %d\n", inode, + from_kgid(&init_user_ns, inode->i_gid), + from_kgid(&init_user_ns, attr->ia_gid)); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_gid = attr->ia_gid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + !gid_eq(attr->ia_gid, inode->i_gid)) { + req->r_args.setattr.gid = cpu_to_le32( + from_kgid(&init_user_ns, attr->ia_gid)); + mask |= CEPH_SETATTR_GID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_MODE) { + dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode, + attr->ia_mode); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_mode = attr->ia_mode; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_mode != inode->i_mode) { + inode->i_mode = attr->ia_mode; + req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); + mask |= CEPH_SETATTR_MODE; + release |= CEPH_CAP_AUTH_SHARED; + } + } + + if (ia_valid & ATTR_ATIME) { + dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode, + inode->i_atime.tv_sec, inode->i_atime.tv_nsec, + attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec_compare(&inode->i_atime, + &attr->ia_atime) < 0) { + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec_equal(&inode->i_atime, &attr->ia_atime)) { + ceph_encode_timespec(&req->r_args.setattr.atime, + &attr->ia_atime); + mask |= CEPH_SETATTR_ATIME; + release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_MTIME) { + dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode, + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec_compare(&inode->i_mtime, + &attr->ia_mtime) < 0) { + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) { + ceph_encode_timespec(&req->r_args.setattr.mtime, + &attr->ia_mtime); + mask |= CEPH_SETATTR_MTIME; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + inode->i_size = attr->ia_size; + inode->i_blocks = + (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_ctime = attr->ia_ctime; + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + + /* these do nothing */ + if (ia_valid & ATTR_CTIME) { + bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| + ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; + dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode, + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + only ? "ctime only" : "ignored"); + inode->i_ctime = attr->ia_ctime; + if (only) { + /* + * if kernel wants to dirty ctime but nothing else, + * we need to choose a cap to dirty under, or do + * a almost-no-op setattr + */ + if (issued & CEPH_CAP_AUTH_EXCL) + dirtied |= CEPH_CAP_AUTH_EXCL; + else if (issued & CEPH_CAP_FILE_EXCL) + dirtied |= CEPH_CAP_FILE_EXCL; + else if (issued & CEPH_CAP_XATTR_EXCL) + dirtied |= CEPH_CAP_XATTR_EXCL; + else + mask |= CEPH_SETATTR_CTIME; + } + } + if (ia_valid & ATTR_FILE) + dout("setattr %p ATTR_FILE ... hrm!\n", inode); + + if (dirtied) { + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); + inode->i_ctime = CURRENT_TIME; + } + + release &= issued; + spin_unlock(&ci->i_ceph_lock); + + if (inode_dirty_flags) + __mark_inode_dirty(inode, inode_dirty_flags); + + if (ia_valid & ATTR_MODE) { + err = posix_acl_chmod(inode, attr->ia_mode); + if (err) + goto out_put; + } + + if (mask) { + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = release; + req->r_args.setattr.mask = cpu_to_le32(mask); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + } + dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, + ceph_cap_string(dirtied), mask); + + ceph_mdsc_put_request(req); + if (mask & CEPH_SETATTR_SIZE) + __ceph_do_pending_vmtruncate(inode); + return err; +out_put: + ceph_mdsc_put_request(req); + return err; +} + +/* + * Verify that we have a lease on the given mask. If not, + * do a getattr against an mds. + */ +int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(inode) == CEPH_SNAPDIR) { + dout("do_getattr inode %p SNAPDIR\n", inode); + return 0; + } + + dout("do_getattr inode %p mask %s mode 0%o\n", + inode, ceph_cap_string(mask), inode->i_mode); + if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) + return 0; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_locked_page = locked_page; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (locked_page && err == 0) { + u64 inline_version = req->r_reply_info.targeti.inline_version; + if (inline_version == 0) { + /* the reply is supposed to contain inline data */ + err = -EINVAL; + } else if (inline_version == CEPH_INLINE_NONE) { + err = -ENODATA; + } else { + err = req->r_reply_info.targeti.inline_len; + } + } + ceph_mdsc_put_request(req); + dout("do_getattr result=%d\n", err); + return err; +} + + +/* + * Check inode permissions. We verify we have a valid value for + * the AUTH cap, then call the generic handler. + */ +int ceph_permission(struct inode *inode, int mask) +{ + int err; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); + + if (!err) + err = generic_permission(inode, mask); + return err; +} + +/* + * Get all attributes. Hopefully somedata we'll have a statlite() + * and can limit the fields we require to be accurate. + */ +int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct ceph_inode_info *ci = ceph_inode(inode); + int err; + + err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false); + if (!err) { + generic_fillattr(inode, stat); + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); + if (ceph_snap(inode) != CEPH_NOSNAP) + stat->dev = ceph_snap(inode); + else + stat->dev = 0; + if (S_ISDIR(inode->i_mode)) { + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; + else + stat->size = ci->i_files + ci->i_subdirs; + stat->blocks = 0; + stat->blksize = 65536; + } + } + return err; +} diff --git a/kernel/fs/ceph/ioctl.c b/kernel/fs/ceph/ioctl.c new file mode 100644 index 000000000..f851d8d70 --- /dev/null +++ b/kernel/fs/ceph/ioctl.c @@ -0,0 +1,295 @@ +#include +#include + +#include "super.h" +#include "mds_client.h" +#include "ioctl.h" + + +/* + * ioctls + */ + +/* + * get and set the file layout + */ +static long ceph_ioctl_get_layout(struct file *file, void __user *arg) +{ + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); + struct ceph_ioctl_layout l; + int err; + + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); + if (!err) { + l.stripe_unit = ceph_file_layout_su(ci->i_layout); + l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + l.object_size = ceph_file_layout_object_size(ci->i_layout); + l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + l.preferred_osd = (s32)-1; + if (copy_to_user(arg, &l, sizeof(l))) + return -EFAULT; + } + + return err; +} + +static long __validate_layout(struct ceph_mds_client *mdsc, + struct ceph_ioctl_layout *l) +{ + int i, err; + + /* validate striping parameters */ + if ((l->object_size & ~PAGE_MASK) || + (l->stripe_unit & ~PAGE_MASK) || + ((unsigned)l->stripe_unit != 0 && + ((unsigned)l->object_size % (unsigned)l->stripe_unit))) + return -EINVAL; + + /* make sure it's a valid data pool */ + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + + return 0; +} + +static long ceph_ioctl_set_layout(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); + struct ceph_ioctl_layout nl; + int err; + + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + /* validate changed params against current layout */ + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); + if (err) + return err; + + memset(&nl, 0, sizeof(nl)); + if (l.stripe_count) + nl.stripe_count = l.stripe_count; + else + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + if (l.stripe_unit) + nl.stripe_unit = l.stripe_unit; + else + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + if (l.object_size) + nl.object_size = l.object_size; + else + nl.object_size = ceph_file_layout_object_size(ci->i_layout); + if (l.data_pool) + nl.data_pool = l.data_pool; + else + nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); + + /* this is obsolete, and always -1 */ + nl.preferred_osd = le64_to_cpu(-1); + + err = __validate_layout(mdsc, &nl); + if (err) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Set a layout policy on a directory inode. All items in the tree + * rooted at this inode will inherit this layout on creation, + * (It doesn't apply retroactively ) + * unless a subdirectory has its own layout policy. + */ +static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + err = __validate_layout(mdsc, &l); + if (err) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, + USE_AUTH_MDS); + + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = + cpu_to_le32(l.data_pool); + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Return object name, size/offset information, and location (OSD + * number, network address) for a given file offset. + */ +static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) +{ + struct ceph_ioctl_dataloc dl; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; + u64 len = 1, olen; + u64 tmp; + struct ceph_pg pgid; + int r; + + /* copy and validate */ + if (copy_from_user(&dl, arg, sizeof(dl))) + return -EFAULT; + + down_read(&osdc->map_sem); + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, + &dl.object_no, &dl.object_offset, + &olen); + if (r < 0) { + up_read(&osdc->map_sem); + return -EIO; + } + dl.file_offset -= dl.object_offset; + dl.object_size = ceph_file_layout_object_size(ci->i_layout); + dl.block_size = ceph_file_layout_su(ci->i_layout); + + /* block_offset = object_offset % block_size */ + tmp = dl.object_offset; + dl.block_offset = do_div(tmp, dl.block_size); + + snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", + ceph_ino(inode), dl.object_no); + + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + if (r < 0) { + up_read(&osdc->map_sem); + return r; + } + + dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + if (dl.osd >= 0) { + struct ceph_entity_addr *a = + ceph_osd_addr(osdc->osdmap, dl.osd); + if (a) + memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr)); + } else { + memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); + } + up_read(&osdc->map_sem); + + /* send result back to user */ + if (copy_to_user(arg, &dl, sizeof(dl))) + return -EFAULT; + + return 0; +} + +static long ceph_ioctl_lazyio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + + if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { + spin_lock(&ci->i_ceph_lock); + ci->i_nr_by_mode[fi->fmode]--; + fi->fmode |= CEPH_FILE_MODE_LAZY; + ci->i_nr_by_mode[fi->fmode]++; + spin_unlock(&ci->i_ceph_lock); + dout("ioctl_layzio: file %p marked lazy\n", file); + + ceph_check_caps(ci, 0, NULL); + } else { + dout("ioctl_layzio: file %p already lazy\n", file); + } + return 0; +} + +static long ceph_ioctl_syncio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + + fi->flags |= CEPH_F_SYNC; + return 0; +} + +long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return ceph_ioctl_get_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT: + return ceph_ioctl_set_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT_POLICY: + return ceph_ioctl_set_layout_policy(file, (void __user *)arg); + + case CEPH_IOC_GET_DATALOC: + return ceph_ioctl_get_dataloc(file, (void __user *)arg); + + case CEPH_IOC_LAZYIO: + return ceph_ioctl_lazyio(file); + + case CEPH_IOC_SYNCIO: + return ceph_ioctl_syncio(file); + } + + return -ENOTTY; +} diff --git a/kernel/fs/ceph/ioctl.h b/kernel/fs/ceph/ioctl.h new file mode 100644 index 000000000..c77028afb --- /dev/null +++ b/kernel/fs/ceph/ioctl.h @@ -0,0 +1,100 @@ +#ifndef FS_CEPH_IOCTL_H +#define FS_CEPH_IOCTL_H + +#include +#include + +#define CEPH_IOCTL_MAGIC 0x97 + +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors. The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file. This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */ +struct ceph_ioctl_layout { + __u64 stripe_unit, stripe_count, object_size; + __u64 data_pool; + + /* obsolete. new values ignored, always return -1 */ + __s64 preferred_osd; +}; + +#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) + +/* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + * + * Extract identity, address of the OSD and object storing a given + * file offset. + */ +struct ceph_ioctl_dataloc { + __u64 file_offset; /* in+out: file offset */ + __u64 object_offset; /* out: offset in object */ + __u64 object_no; /* out: object # */ + __u64 object_size; /* out: object size */ + char object_name[64]; /* out: object name */ + __u64 block_offset; /* out: offset in block */ + __u64 block_size; /* out: block length */ + __s64 osd; /* out: osd # */ + struct sockaddr_storage osd_addr; /* out: osd address */ +}; + +#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ + struct ceph_ioctl_dataloc) + +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write). Reads and writes bypass the + * page cache and go directly to the OSD. Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */ +#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary. This is + * essentially the opposite behavior of IOC_LAZYIO. This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries). It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). + */ +#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) + +#endif diff --git a/kernel/fs/ceph/locks.c b/kernel/fs/ceph/locks.c new file mode 100644 index 000000000..4347039ec --- /dev/null +++ b/kernel/fs/ceph/locks.c @@ -0,0 +1,381 @@ +#include + +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include + +static u64 lock_secret; +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + +/** + * Implement fcntl and flock locking functions. + */ +static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, + int cmd, u8 wait, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + int err; + u64 length = 0; + u64 owner; + + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) + wait = 0; + + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + + owner = secure_addr(fl->fl_owner); + + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, + wait, fl->fl_type); + + req->r_args.filelock_change.rule = lock_type; + req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.owner = cpu_to_le64(owner); + req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); + req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); + req->r_args.filelock_change.length = cpu_to_le64(length); + req->r_args.filelock_change.wait = wait; + + if (wait) + req->r_wait_for_completion = ceph_lock_wait_for_completion; + + err = ceph_mdsc_do_request(mdsc, inode, req); + + if (operation == CEPH_MDS_OP_GETFILELOCK) { + fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_WRLCK; + else + fl->fl_type = F_UNLCK; + + fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); + length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + + le64_to_cpu(req->r_reply_info.filelock_reply->length); + if (length >= 1) + fl->fl_end = length -1; + else + fl->fl_end = 0; + + } + ceph_mdsc_put_request(req); + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type, err); + return err; +} + +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_request *intr_req; + struct inode *inode = req->r_inode; + int err, lock_type; + + BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK); + if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL) + lock_type = CEPH_LOCK_FCNTL_INTR; + else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK) + lock_type = CEPH_LOCK_FLOCK_INTR; + else + BUG_ON(1); + BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK); + + err = wait_for_completion_interruptible(&req->r_completion); + if (!err) + return 0; + + dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", + req->r_tid); + + intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, + USE_AUTH_MDS); + if (IS_ERR(intr_req)) + return PTR_ERR(intr_req); + + intr_req->r_inode = inode; + ihold(inode); + intr_req->r_num_caps = 1; + + intr_req->r_args.filelock_change = req->r_args.filelock_change; + intr_req->r_args.filelock_change.rule = lock_type; + intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK; + + err = ceph_mdsc_do_request(mdsc, inode, intr_req); + ceph_mdsc_put_request(intr_req); + + if (err && err != -ERESTARTSYS) + return err; + + wait_for_completion(&req->r_completion); + return 0; +} + +/** + * Attempt to set an fcntl lock. + * For now, this just goes away to the server. Later it may be more awesome. + */ +int ceph_lock(struct file *file, int cmd, struct file_lock *fl) +{ + u8 lock_cmd; + int err; + u8 wait = 0; + u16 op = CEPH_MDS_OP_SETFILELOCK; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_lock, fl_owner: %p", fl->fl_owner); + + /* set wait bit as appropriate, then make command as Ceph expects it*/ + if (IS_GETLK(cmd)) + op = CEPH_MDS_OP_GETFILELOCK; + else if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + if (!err) { + if (op != CEPH_MDS_OP_GETFILELOCK) { + dout("mds locked, locking locally"); + err = posix_lock_file(file, fl, NULL); + if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + /* undo! This should only happen if + * the kernel detects local + * deadlock. */ + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on posix_lock_file, undid lock", + err); + } + } + } + return err; +} + +int ceph_flock(struct file *file, int cmd, struct file_lock *fl) +{ + u8 lock_cmd; + int err; + u8 wait = 0; + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_flock, fl_file: %p", fl->fl_file); + + if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, + file, lock_cmd, wait, fl); + if (!err) { + err = flock_lock_file_wait(file, fl); + if (err) { + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on flock_lock_file_wait, undid lock", err); + } + } + return err; +} + +/* + * Fills in the passed counter variables, so you can prepare pagelist metadata + * before calling ceph_encode_locks. + */ +void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) +{ + struct file_lock *lock; + struct file_lock_context *ctx; + + *fcntl_count = 0; + *flock_count = 0; + + ctx = inode->i_flctx; + if (ctx) { + spin_lock(&ctx->flc_lock); + list_for_each_entry(lock, &ctx->flc_posix, fl_list) + ++(*fcntl_count); + list_for_each_entry(lock, &ctx->flc_flock, fl_list) + ++(*flock_count); + spin_unlock(&ctx->flc_lock); + } + dout("counted %d flock locks and %d fcntl locks", + *flock_count, *fcntl_count); +} + +/** + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with inode->i_lock already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC. + */ +int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, int num_flock_locks) +{ + struct file_lock *lock; + struct file_lock_context *ctx = inode->i_flctx; + int err = 0; + int seen_fcntl = 0; + int seen_flock = 0; + int l = 0; + + dout("encoding %d flock and %d fcntl locks", num_flock_locks, + num_fcntl_locks); + + if (!ctx) + return 0; + + spin_lock(&ctx->flc_lock); + list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; + } + list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; + } +fail: + spin_unlock(&ctx->flc_lock); + return err; +} + +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. + */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + int err = 0; + __le32 nlocks; + + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, + &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); +out_fail: + return err; +} + +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} diff --git a/kernel/fs/ceph/mds_client.c b/kernel/fs/ceph/mds_client.c new file mode 100644 index 000000000..84f37f34f --- /dev/null +++ b/kernel/fs/ceph/mds_client.c @@ -0,0 +1,3871 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +#include +#include +#include +#include +#include +#include + +/* + * A cluster of MDS (metadata server) daemons is responsible for + * managing the file system namespace (the directory hierarchy and + * inodes) and for coordinating shared access to storage. Metadata is + * partitioning hierarchically across a number of servers, and that + * partition varies over time as the cluster adjusts the distribution + * in order to balance load. + * + * The MDS client is primarily responsible to managing synchronous + * metadata requests for operations like open, unlink, and so forth. + * If there is a MDS failure, we find out about it when we (possibly + * request and) receive a new MDS map, and can resubmit affected + * requests. + * + * For the most part, though, we take advantage of a lossless + * communications channel to the MDS, and do not need to worry about + * timing out or resubmitting requests. + * + * We maintain a stateful "session" with each MDS we interact with. + * Within each session, we sent periodic heartbeat messages to ensure + * any capabilities or leases we have been issues remain valid. If + * the session times out and goes stale, our leases and capabilities + * are no longer valid. + */ + +struct ceph_reconnect_state { + int nr_caps; + struct ceph_pagelist *pagelist; + bool flock; +}; + +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head); + +static const struct ceph_connection_operations mds_con_ops; + + +/* + * mds reply parsing + */ + +/* + * parse individual inode info + */ +static int parse_reply_info_in(void **p, void *end, + struct ceph_mds_reply_info_in *info, + u64 features) +{ + int err = -EIO; + + info->in = *p; + *p += sizeof(struct ceph_mds_reply_inode) + + sizeof(*info->in->fragtree.splits) * + le32_to_cpu(info->in->fragtree.nsplits); + + ceph_decode_32_safe(p, end, info->symlink_len, bad); + ceph_decode_need(p, end, info->symlink_len, bad); + info->symlink = *p; + *p += info->symlink_len; + + if (features & CEPH_FEATURE_DIRLAYOUTHASH) + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); + else + memset(&info->dir_layout, 0, sizeof(info->dir_layout)); + + ceph_decode_32_safe(p, end, info->xattr_len, bad); + ceph_decode_need(p, end, info->xattr_len, bad); + info->xattr_data = *p; + *p += info->xattr_len; + + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + + return 0; +bad: + return err; +} + +/* + * parse a normal reply, which may contain a (dir+)dentry and/or a + * target inode. + */ +static int parse_reply_info_trace(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + int err; + + if (info->head->is_dentry) { + err = parse_reply_info_in(p, end, &info->diri, features); + if (err < 0) + goto out_bad; + + if (unlikely(*p + sizeof(*info->dirfrag) > end)) + goto bad; + info->dirfrag = *p; + *p += sizeof(*info->dirfrag) + + sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); + if (unlikely(*p > end)) + goto bad; + + ceph_decode_32_safe(p, end, info->dname_len, bad); + ceph_decode_need(p, end, info->dname_len, bad); + info->dname = *p; + *p += info->dname_len; + info->dlease = *p; + *p += sizeof(*info->dlease); + } + + if (info->head->is_target) { + err = parse_reply_info_in(p, end, &info->targeti, features); + if (err < 0) + goto out_bad; + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing mds trace %d\n", err); + return err; +} + +/* + * parse readdir results + */ +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + u32 num, i = 0; + int err; + + info->dir_dir = *p; + if (*p + sizeof(*info->dir_dir) > end) + goto bad; + *p += sizeof(*info->dir_dir) + + sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); + if (*p > end) + goto bad; + + ceph_decode_need(p, end, sizeof(num) + 2, bad); + num = ceph_decode_32(p); + info->dir_end = ceph_decode_8(p); + info->dir_complete = ceph_decode_8(p); + if (num == 0) + goto done; + + BUG_ON(!info->dir_in); + info->dir_dname = (void *)(info->dir_in + num); + info->dir_dname_len = (void *)(info->dir_dname + num); + info->dir_dlease = (void *)(info->dir_dname_len + num); + if ((unsigned long)(info->dir_dlease + num) > + (unsigned long)info->dir_in + info->dir_buf_size) { + pr_err("dir contents are larger than expected\n"); + WARN_ON(1); + goto bad; + } + + info->dir_nr = num; + while (num) { + /* dentry */ + ceph_decode_need(p, end, sizeof(u32)*2, bad); + info->dir_dname_len[i] = ceph_decode_32(p); + ceph_decode_need(p, end, info->dir_dname_len[i], bad); + info->dir_dname[i] = *p; + *p += info->dir_dname_len[i]; + dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], + info->dir_dname[i]); + info->dir_dlease[i] = *p; + *p += sizeof(struct ceph_mds_reply_lease); + + /* inode */ + err = parse_reply_info_in(p, end, &info->dir_in[i], features); + if (err < 0) + goto out_bad; + i++; + num--; + } + +done: + if (*p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing dir contents %d\n", err); + return err; +} + +/* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + if (*p + sizeof(*info->filelock_reply) > end) + goto bad; + + info->filelock_reply = *p; + *p += sizeof(*info->filelock_reply); + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (*p == end) { + info->has_create_ino = false; + } else { + info->has_create_ino = true; + info->ino = ceph_decode_64(p); + } + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + return parse_reply_info_filelock(p, end, info, features); + else if (info->head->op == CEPH_MDS_OP_READDIR || + info->head->op == CEPH_MDS_OP_LSSNAP) + return parse_reply_info_dir(p, end, info, features); + else if (info->head->op == CEPH_MDS_OP_CREATE) + return parse_reply_info_create(p, end, info, features); + else + return -EIO; +} + +/* + * parse entire mds reply + */ +static int parse_reply_info(struct ceph_msg *msg, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + void *p, *end; + u32 len; + int err; + + info->head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); + end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); + + /* trace */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + ceph_decode_need(&p, end, len, bad); + err = parse_reply_info_trace(&p, p+len, info, features); + if (err < 0) + goto out_bad; + } + + /* extra */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + ceph_decode_need(&p, end, len, bad); + err = parse_reply_info_extra(&p, p+len, info, features); + if (err < 0) + goto out_bad; + } + + /* snap blob */ + ceph_decode_32_safe(&p, end, len, bad); + info->snapblob_len = len; + info->snapblob = p; + p += len; + + if (p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("mds parse_reply err %d\n", err); + return err; +} + +static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) +{ + if (!info->dir_in) + return; + free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); +} + + +/* + * sessions + */ +const char *ceph_session_state_name(int s) +{ + switch (s) { + case CEPH_MDS_SESSION_NEW: return "new"; + case CEPH_MDS_SESSION_OPENING: return "opening"; + case CEPH_MDS_SESSION_OPEN: return "open"; + case CEPH_MDS_SESSION_HUNG: return "hung"; + case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_RESTARTING: return "restarting"; + case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + default: return "???"; + } +} + +static struct ceph_mds_session *get_session(struct ceph_mds_session *s) +{ + if (atomic_inc_not_zero(&s->s_ref)) { + dout("mdsc get_session %p %d -> %d\n", s, + atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + return s; + } else { + dout("mdsc get_session %p 0 -- FAIL", s); + return NULL; + } +} + +void ceph_put_mds_session(struct ceph_mds_session *s) +{ + dout("mdsc put_session %p %d -> %d\n", s, + atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); + if (atomic_dec_and_test(&s->s_ref)) { + if (s->s_auth.authorizer) + ceph_auth_destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_auth.authorizer); + kfree(s); + } +} + +/* + * called under mdsc->mutex + */ +struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *session; + + if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + return NULL; + session = mdsc->sessions[mds]; + dout("lookup_mds_session %p %d\n", session, + atomic_read(&session->s_ref)); + get_session(session); + return session; +} + +static bool __have_session(struct ceph_mds_client *mdsc, int mds) +{ + if (mds >= mdsc->max_sessions) + return false; + return mdsc->sessions[mds]; +} + +static int __verify_registered_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + if (s->s_mds >= mdsc->max_sessions || + mdsc->sessions[s->s_mds] != s) + return -ENOENT; + return 0; +} + +/* + * create+register a new session for given mds. + * called under mdsc->mutex. + */ +static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *s; + + if (mds >= mdsc->mdsmap->m_max_mds) + return ERR_PTR(-EINVAL); + + s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) + return ERR_PTR(-ENOMEM); + s->s_mdsc = mdsc; + s->s_mds = mds; + s->s_state = CEPH_MDS_SESSION_NEW; + s->s_ttl = 0; + s->s_seq = 0; + mutex_init(&s->s_mutex); + + ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); + + spin_lock_init(&s->s_gen_ttl_lock); + s->s_cap_gen = 0; + s->s_cap_ttl = jiffies - 1; + + spin_lock_init(&s->s_cap_lock); + s->s_renew_requested = 0; + s->s_renew_seq = 0; + INIT_LIST_HEAD(&s->s_caps); + s->s_nr_caps = 0; + s->s_trim_caps = 0; + atomic_set(&s->s_ref, 1); + INIT_LIST_HEAD(&s->s_waiting); + INIT_LIST_HEAD(&s->s_unsafe); + s->s_num_cap_releases = 0; + s->s_cap_reconnect = 0; + s->s_cap_iterator = NULL; + INIT_LIST_HEAD(&s->s_cap_releases); + INIT_LIST_HEAD(&s->s_cap_releases_done); + INIT_LIST_HEAD(&s->s_cap_flushing); + INIT_LIST_HEAD(&s->s_cap_snaps_flushing); + + dout("register_session mds%d\n", mds); + if (mds >= mdsc->max_sessions) { + int newmax = 1 << get_count_order(mds+1); + struct ceph_mds_session **sa; + + dout("register_session realloc to %d\n", newmax); + sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + if (sa == NULL) + goto fail_realloc; + if (mdsc->sessions) { + memcpy(sa, mdsc->sessions, + mdsc->max_sessions * sizeof(void *)); + kfree(mdsc->sessions); + } + mdsc->sessions = sa; + mdsc->max_sessions = newmax; + } + mdsc->sessions[mds] = s; + atomic_inc(&mdsc->num_sessions); + atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + + ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + return s; + +fail_realloc: + kfree(s); + return ERR_PTR(-ENOMEM); +} + +/* + * called under mdsc->mutex + */ +static void __unregister_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + dout("__unregister_session mds%d %p\n", s->s_mds, s); + BUG_ON(mdsc->sessions[s->s_mds] != s); + mdsc->sessions[s->s_mds] = NULL; + ceph_con_close(&s->s_con); + ceph_put_mds_session(s); + atomic_dec(&mdsc->num_sessions); +} + +/* + * drop session refs in request. + * + * should be last request ref, or hold mdsc->mutex + */ +static void put_request_session(struct ceph_mds_request *req) +{ + if (req->r_session) { + ceph_put_mds_session(req->r_session); + req->r_session = NULL; + } +} + +void ceph_mdsc_release_request(struct kref *kref) +{ + struct ceph_mds_request *req = container_of(kref, + struct ceph_mds_request, + r_kref); + destroy_reply_info(&req->r_reply_info); + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) + ceph_msg_put(req->r_reply); + if (req->r_inode) { + ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); + iput(req->r_inode); + } + if (req->r_locked_dir) + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + iput(req->r_target_inode); + if (req->r_dentry) + dput(req->r_dentry); + if (req->r_old_dentry) + dput(req->r_old_dentry); + if (req->r_old_dentry_dir) { + /* + * track (and drop pins for) r_old_dentry_dir + * separately, since r_old_dentry's d_parent may have + * changed between the dir mutex being dropped and + * this request being freed. + */ + ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); + iput(req->r_old_dentry_dir); + } + kfree(req->r_path1); + kfree(req->r_path2); + if (req->r_pagelist) + ceph_pagelist_release(req->r_pagelist); + put_request_session(req); + ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); + kfree(req); +} + +/* + * lookup session, bump ref if found. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, + u64 tid) +{ + struct ceph_mds_request *req; + struct rb_node *n = mdsc->request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mds_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else { + ceph_mdsc_get_request(req); + return req; + } + } + return NULL; +} + +static void __insert_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *new) +{ + struct rb_node **p = &mdsc->request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mds_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mds_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &mdsc->request_tree); +} + +/* + * Register an in-flight request, and assign a tid. Link to directory + * are modifying (if any). + * + * Called under mdsc->mutex. + */ +static void __register_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + struct inode *dir) +{ + req->r_tid = ++mdsc->last_tid; + if (req->r_num_caps) + ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); + dout("__register_request %p tid %lld\n", req, req->r_tid); + ceph_mdsc_get_request(req); + __insert_request(mdsc, req); + + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + + if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); + + ihold(dir); + spin_lock(&ci->i_unsafe_lock); + req->r_unsafe_dir = dir; + list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); + } +} + +static void __unregister_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + rb_erase(&req->r_node, &mdsc->request_tree); + RB_CLEAR_NODE(&req->r_node); + + if (req->r_unsafe_dir) { + struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); + + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_dir_item); + spin_unlock(&ci->i_unsafe_lock); + + iput(req->r_unsafe_dir); + req->r_unsafe_dir = NULL; + } + + complete_all(&req->r_safe_completion); + + ceph_mdsc_put_request(req); +} + +/* + * Choose mds to send request to next. If there is a hint set in the + * request (e.g., due to a prior forward hint from the mds), use that. + * Otherwise, consult frag tree and/or caps to identify the + * appropriate mds. If all else fails, choose randomly. + * + * Called under mdsc->mutex. + */ +static struct dentry *get_nonsnap_parent(struct dentry *dentry) +{ + /* + * we don't need to worry about protecting the d_parent access + * here because we never renaming inside the snapped namespace + * except to resplice to another snapdir, and either the old or new + * result is a valid result. + */ + while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) + dentry = dentry->d_parent; + return dentry; +} + +static int __choose_mds(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + int mode = req->r_direct_mode; + int mds = -1; + u32 hash = req->r_direct_hash; + bool is_hash = req->r_direct_is_hash; + + /* + * is there a specific mds we should try? ignore hint if we have + * no session and the mds is not up (active or recovering). + */ + if (req->r_resend_mds >= 0 && + (__have_session(mdsc, req->r_resend_mds) || + ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { + dout("choose_mds using resend_mds mds%d\n", + req->r_resend_mds); + return req->r_resend_mds; + } + + if (mode == USE_RANDOM_MDS) + goto random; + + inode = NULL; + if (req->r_inode) { + inode = req->r_inode; + } else if (req->r_dentry) { + /* ignore race with rename; old or new d_parent is okay */ + struct dentry *parent = req->r_dentry->d_parent; + struct inode *dir = d_inode(parent); + + if (dir->i_sb != mdsc->fsc->sb) { + /* not this fs! */ + inode = d_inode(req->r_dentry); + } else if (ceph_snap(dir) != CEPH_NOSNAP) { + /* direct snapped/virtual snapdir requests + * based on parent dir inode */ + struct dentry *dn = get_nonsnap_parent(parent); + inode = d_inode(dn); + dout("__choose_mds using nonsnap parent %p\n", inode); + } else { + /* dentry target */ + inode = d_inode(req->r_dentry); + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } + } + } + + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, + (int)hash, mode); + if (!inode) + goto random; + ci = ceph_inode(inode); + + if (is_hash && S_ISDIR(inode->i_mode)) { + struct ceph_inode_frag frag; + int found; + + ceph_choose_frag(ci, hash, &frag, &found); + if (found) { + if (mode == USE_ANY_MDS && frag.ndist > 0) { + u8 r; + + /* choose a random replica */ + get_random_bytes(&r, 1); + r %= frag.ndist; + mds = frag.dist[r]; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (%d/%d)\n", + inode, ceph_vinop(inode), + frag.frag, mds, + (int)r, frag.ndist); + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; + } + + /* since this file/dir wasn't known to be + * replicated, then we want to look for the + * authoritative mds. */ + mode = USE_AUTH_MDS; + if (frag.mds >= 0) { + /* choose auth mds */ + mds = frag.mds; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (auth)\n", + inode, ceph_vinop(inode), frag.frag, mds); + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; + } + } + } + + spin_lock(&ci->i_ceph_lock); + cap = NULL; + if (mode == USE_AUTH_MDS) + cap = ci->i_auth_cap; + if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) + cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + goto random; + } + mds = cap->session->s_mds; + dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", + inode, ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); + spin_unlock(&ci->i_ceph_lock); + return mds; + +random: + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); + dout("choose_mds chose random mds%d\n", mds); + return mds; +} + + +/* + * session messages + */ +static struct ceph_msg *create_session_msg(u32 op, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, + false); + if (!msg) { + pr_err("create_session_msg ENOMEM creating msg\n"); + return NULL; + } + h = msg->front.iov_base; + h->op = cpu_to_le32(op); + h->seq = cpu_to_le64(seq); + + return msg; +} + +/* + * session message, specialization for CEPH_SESSION_REQUEST_OPEN + * to include additional client metadata fields. + */ +static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + int i = -1; + int metadata_bytes = 0; + int metadata_key_count = 0; + struct ceph_options *opt = mdsc->fsc->client->options; + void *p; + + const char* metadata[][2] = { + {"hostname", utsname()->nodename}, + {"kernel_version", utsname()->release}, + {"entity_id", opt->name ? opt->name : ""}, + {NULL, NULL} + }; + + /* Calculate serialized length of metadata */ + metadata_bytes = 4; /* map length */ + for (i = 0; metadata[i][0] != NULL; ++i) { + metadata_bytes += 8 + strlen(metadata[i][0]) + + strlen(metadata[i][1]); + metadata_key_count++; + } + + /* Allocate the message */ + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes, + GFP_NOFS, false); + if (!msg) { + pr_err("create_session_msg ENOMEM creating msg\n"); + return NULL; + } + h = msg->front.iov_base; + h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN); + h->seq = cpu_to_le64(seq); + + /* + * Serialize client metadata into waiting buffer space, using + * the format that userspace expects for map + * + * ClientSession messages with metadata are v2 + */ + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); + + /* The write pointer, following the session_head structure */ + p = msg->front.iov_base + sizeof(*h); + + /* Number of entries in the map */ + ceph_encode_32(&p, metadata_key_count); + + /* Two length-prefixed strings for each entry in the map */ + for (i = 0; metadata[i][0] != NULL; ++i) { + size_t const key_len = strlen(metadata[i][0]); + size_t const val_len = strlen(metadata[i][1]); + + ceph_encode_32(&p, key_len); + memcpy(p, metadata[i][0], key_len); + p += key_len; + ceph_encode_32(&p, val_len); + memcpy(p, metadata[i][1], val_len); + p += val_len; + } + + return msg; +} + +/* + * send session open request. + * + * called under mdsc->mutex + */ +static int __open_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int mstate; + int mds = session->s_mds; + + /* wait for mds to go active? */ + mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); + dout("open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); + session->s_state = CEPH_MDS_SESSION_OPENING; + session->s_renew_requested = jiffies; + + /* send connect message */ + msg = create_session_open_msg(mdsc, session->s_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * open sessions for any export targets for the given mds + * + * called under mdsc->mutex + */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + dout("open_export_target_session to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + +static void __open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_info *mi; + struct ceph_mds_session *ts; + int i, mds = session->s_mds; + + if (mds >= mdsc->mdsmap->m_max_mds) + return; + + mi = &mdsc->mdsmap->m_info[mds]; + dout("open_export_target_sessions for mds%d (%d targets)\n", + session->s_mds, mi->num_export_targets); + + for (i = 0; i < mi->num_export_targets; i++) { + ts = __open_export_target_session(mdsc, mi->export_targets[i]); + if (!IS_ERR(ts)) + ceph_put_mds_session(ts); + } +} + +void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&mdsc->mutex); + __open_export_target_sessions(mdsc, session); + mutex_unlock(&mdsc->mutex); +} + +/* + * session caps + */ + +/* + * Free preallocated cap messages assigned to this session + */ +static void cleanup_cap_releases(struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + spin_unlock(&session->s_cap_lock); +} + +static void cleanup_session_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req; + struct rb_node *p; + + dout("cleanup_session_requests mds%d\n", session->s_mds); + mutex_lock(&mdsc->mutex); + while (!list_empty(&session->s_unsafe)) { + req = list_first_entry(&session->s_unsafe, + struct ceph_mds_request, r_unsafe_item); + list_del_init(&req->r_unsafe_item); + pr_info(" dropping unsafe request %llu\n", req->r_tid); + __unregister_request(mdsc, req); + } + /* zero r_attempts, so kick_requests() will re-send requests */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_session && + req->r_session->s_mds == session->s_mds) + req->r_attempts = 0; + } + mutex_unlock(&mdsc->mutex); +} + +/* + * Helper to safely iterate over all caps associated with a session, with + * special care taken to handle a racing __ceph_remove_cap(). + * + * Caller must hold session s_mutex. + */ +static int iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, struct ceph_cap *, + void *), void *arg) +{ + struct list_head *p; + struct ceph_cap *cap; + struct inode *inode, *last_inode = NULL; + struct ceph_cap *old_cap = NULL; + int ret; + + dout("iterate_session_caps %p mds%d\n", session, session->s_mds); + spin_lock(&session->s_cap_lock); + p = session->s_caps.next; + while (p != &session->s_caps) { + cap = list_entry(p, struct ceph_cap, session_caps); + inode = igrab(&cap->ci->vfs_inode); + if (!inode) { + p = p->next; + continue; + } + session->s_cap_iterator = cap; + spin_unlock(&session->s_cap_lock); + + if (last_inode) { + iput(last_inode); + last_inode = NULL; + } + if (old_cap) { + ceph_put_cap(session->s_mdsc, old_cap); + old_cap = NULL; + } + + ret = cb(inode, cap, arg); + last_inode = inode; + + spin_lock(&session->s_cap_lock); + p = p->next; + if (cap->ci == NULL) { + dout("iterate_session_caps finishing cap %p removal\n", + cap); + BUG_ON(cap->session != session); + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + old_cap = cap; /* put_cap it w/o locks held */ + } + if (ret < 0) + goto out; + } + ret = 0; +out: + session->s_cap_iterator = NULL; + spin_unlock(&session->s_cap_lock); + + iput(last_inode); + if (old_cap) + ceph_put_cap(session->s_mdsc, old_cap); + + return ret; +} + +static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = 0; + + dout("removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->vfs_inode); + spin_lock(&ci->i_ceph_lock); + __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + pr_info(" dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + drop = 1; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_info(" dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + drop = 1; + } + spin_unlock(&mdsc->cap_dirty_lock); + } + spin_unlock(&ci->i_ceph_lock); + while (drop--) + iput(inode); + return 0; +} + +/* + * caller must hold session s_mutex + */ +static void remove_session_caps(struct ceph_mds_session *session) +{ + dout("remove_session_caps on %p\n", session); + iterate_session_caps(session, remove_session_caps_cb, NULL); + + spin_lock(&session->s_cap_lock); + if (session->s_nr_caps > 0) { + struct super_block *sb = session->s_mdsc->fsc->sb; + struct inode *inode; + struct ceph_cap *cap, *prev = NULL; + struct ceph_vino vino; + /* + * iterate_session_caps() skips inodes that are being + * deleted, we need to wait until deletions are complete. + * __wait_on_freeing_inode() is designed for the job, + * but it is not exported, so use lookup inode function + * to access it. + */ + while (!list_empty(&session->s_caps)) { + cap = list_entry(session->s_caps.next, + struct ceph_cap, session_caps); + if (cap == prev) + break; + prev = cap; + vino = cap->ci->i_vino; + spin_unlock(&session->s_cap_lock); + + inode = ceph_find_inode(sb, vino); + iput(inode); + + spin_lock(&session->s_cap_lock); + } + } + spin_unlock(&session->s_cap_lock); + + BUG_ON(session->s_nr_caps > 0); + BUG_ON(!list_empty(&session->s_cap_flushing)); + cleanup_cap_releases(session); +} + +/* + * wake up any threads waiting on this session's caps. if the cap is + * old (didn't get renewed on the client reconnect), remove it now. + * + * caller must hold s_mutex. + */ +static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + wake_up_all(&ci->i_cap_wq); + if (arg) { + spin_lock(&ci->i_ceph_lock); + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&ci->i_ceph_lock); + } + return 0; +} + +static void wake_up_session_caps(struct ceph_mds_session *session, + int reconnect) +{ + dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); + iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)reconnect); +} + +/* + * Send periodic message to MDS renewing all currently held caps. The + * ack will reset the expiration for all caps from this session. + * + * caller holds s_mutex + */ +static int send_renew_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int state; + + if (time_after_eq(jiffies, session->s_cap_ttl) && + time_after_eq(session->s_cap_ttl, session->s_renew_requested)) + pr_info("mds%d caps stale\n", session->s_mds); + session->s_renew_requested = jiffies; + + /* do not try to renew caps until a recovering mds has reconnected + * with its clients. */ + state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); + if (state < CEPH_MDS_STATE_RECONNECT) { + dout("send_renew_caps ignoring mds%d (%s)\n", + session->s_mds, ceph_mds_state_name(state)); + return 0; + } + + dout("send_renew_caps to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); + msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", + session->s_mds, ceph_session_state_name(session->s_state), seq); + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + +/* + * Note new cap ttl, and any transition from stale -> not stale (fresh?). + * + * Called under session->s_mutex + */ +static void renewed_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, int is_renew) +{ + int was_stale; + int wake = 0; + + spin_lock(&session->s_cap_lock); + was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); + + session->s_cap_ttl = session->s_renew_requested + + mdsc->mdsmap->m_session_timeout*HZ; + + if (was_stale) { + if (time_before(jiffies, session->s_cap_ttl)) { + pr_info("mds%d caps renewed\n", session->s_mds); + wake = 1; + } else { + pr_info("mds%d caps still stale\n", session->s_mds); + } + } + dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", + session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", + time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); + spin_unlock(&session->s_cap_lock); + + if (wake) + wake_up_session_caps(session, 0); +} + +/* + * send a session close request + */ +static int request_close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("request_close_session mds%d state %s seq %lld\n", + session->s_mds, ceph_session_state_name(session->s_state), + session->s_seq); + msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Called with s_mutex held. + */ +static int __close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (session->s_state >= CEPH_MDS_SESSION_CLOSING) + return 0; + session->s_state = CEPH_MDS_SESSION_CLOSING; + return request_close_session(mdsc, session); +} + +/* + * Trim old(er) caps. + * + * Because we can't cache an inode without one or more caps, we do + * this indirectly: if a cap is unused, we prune its aliases, at which + * point the inode will hopefully get dropped to. + * + * Yes, this is a bit sloppy. Our only real goal here is to respond to + * memory pressure from the MDS, though, so it needn't be perfect. + */ +static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +{ + struct ceph_mds_session *session = arg; + struct ceph_inode_info *ci = ceph_inode(inode); + int used, wanted, oissued, mine; + + if (session->s_trim_caps <= 0) + return -1; + + spin_lock(&ci->i_ceph_lock); + mine = cap->issued | cap->implemented; + used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); + oissued = __ceph_caps_issued_other(ci, cap); + + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", + inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), + ceph_cap_string(used), ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps | ci->i_flushing_caps) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + } + if ((used | wanted) & ~oissued & mine) + goto out; /* we need these caps */ + + session->s_trim_caps--; + if (oissued) { + /* we aren't the only cap.. just remove us */ + __ceph_remove_cap(cap, true); + } else { + /* try to drop referring dentries */ + spin_unlock(&ci->i_ceph_lock); + d_prune_aliases(inode); + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, atomic_read(&inode->i_count)); + return 0; + } + +out: + spin_unlock(&ci->i_ceph_lock); + return 0; +} + +/* + * Trim session cap count down to some max number. + */ +static int trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps) +{ + int trim_caps = session->s_nr_caps - max_caps; + + dout("trim_caps mds%d start: %d / %d, trim %d\n", + session->s_mds, session->s_nr_caps, max_caps, trim_caps); + if (trim_caps > 0) { + session->s_trim_caps = trim_caps; + iterate_session_caps(session, trim_caps_cb, session); + dout("trim_caps mds%d done: %d / %d, trimmed %d\n", + session->s_mds, session->s_nr_caps, max_caps, + trim_caps - session->s_trim_caps); + session->s_trim_caps = 0; + } + + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); + return 0; +} + +/* + * Allocate cap_release messages. If there is a partially full message + * in the queue, try to allocate enough to cover it's remainder, so that + * we can send it immediately. + * + * Called under s_mutex. + */ +int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg, *partial = NULL; + struct ceph_mds_cap_release *head; + int err = -ENOMEM; + int extra = mdsc->fsc->mount_options->cap_release_safety; + int num; + + dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, + extra); + + spin_lock(&session->s_cap_lock); + + if (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, + list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + if (num) { + dout(" partial %p with (%d/%d)\n", msg, num, + (int)CEPH_CAPS_PER_RELEASE); + extra += CEPH_CAPS_PER_RELEASE - num; + partial = msg; + } + } + while (session->s_num_cap_releases < session->s_nr_caps + extra) { + spin_unlock(&session->s_cap_lock); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, + GFP_NOFS, false); + if (!msg) + goto out_unlocked; + dout("add_cap_releases %p msg %p now %d\n", session, msg, + (int)msg->front.iov_len); + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + spin_lock(&session->s_cap_lock); + list_add(&msg->list_head, &session->s_cap_releases); + session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; + } + + if (partial) { + head = partial->front.iov_base; + num = le32_to_cpu(head->num); + dout(" queueing partial %p with %d/%d\n", partial, num, + (int)CEPH_CAPS_PER_RELEASE); + list_move_tail(&partial->list_head, + &session->s_cap_releases_done); + session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; + } + err = 0; + spin_unlock(&session->s_cap_lock); +out_unlocked: + return err; +} + +static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + spin_lock(&ci->i_ceph_lock); + if (ci->i_flushing_caps) + ret = ci->i_cap_flush_seq >= want_flush_seq; + else + ret = 1; + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +/* + * flush all dirty inode data to disk. + * + * returns true if we've flushed through want_flush_seq + */ +static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +{ + int mds; + + dout("check_cap_flush want %lld\n", want_flush_seq); + mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = mdsc->sessions[mds]; + struct inode *inode = NULL; + + if (!session) + continue; + get_session(session); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (!list_empty(&session->s_cap_flushing)) { + struct ceph_inode_info *ci = + list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item); + + if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { + dout("check_cap_flush still flushing %p " + "seq %lld <= %lld to mds%d\n", + &ci->vfs_inode, ci->i_cap_flush_seq, + want_flush_seq, session->s_mds); + inode = igrab(&ci->vfs_inode); + } + } + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + + if (inode) { + wait_event(mdsc->cap_flushing_wq, + check_cap_flush(inode, want_flush_seq)); + iput(inode); + } + + mutex_lock(&mdsc->mutex); + } + + mutex_unlock(&mdsc->mutex); + dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); +} + +/* + * called under s_mutex + */ +void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("send_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + spin_unlock(&session->s_cap_lock); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + spin_lock(&session->s_cap_lock); + } + spin_unlock(&session->s_cap_lock); +} + +static void discard_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + unsigned num; + + dout("discard_cap_releases mds%d\n", session->s_mds); + + if (!list_empty(&session->s_cap_releases)) { + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", + session->s_mds, msg, num); + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + session->s_num_cap_releases += num; + } + + /* requeue completed messages */ + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, + num); + session->s_num_cap_releases += num; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + list_add(&msg->list_head, &session->s_cap_releases); + } +} + +/* + * requests + */ + +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; + size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + + sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + int order, num_entries; + + spin_lock(&ci->i_ceph_lock); + num_entries = ci->i_files + ci->i_subdirs; + spin_unlock(&ci->i_ceph_lock); + num_entries = max(num_entries, 1); + num_entries = min(num_entries, opt->max_readdir); + + order = get_order(size * num_entries); + while (order >= 0) { + rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + order); + if (rinfo->dir_in) + break; + order--; + } + if (!rinfo->dir_in) + return -ENOMEM; + + num_entries = (PAGE_SIZE << order) / size; + num_entries = min(num_entries, opt->max_readdir); + + rinfo->dir_buf_size = PAGE_SIZE << order; + req->r_num_caps = num_entries + 1; + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); + return 0; +} + +/* + * Create an mds request. + */ +struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) +{ + struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + + if (!req) + return ERR_PTR(-ENOMEM); + + mutex_init(&req->r_fill_mutex); + req->r_mdsc = mdsc; + req->r_started = jiffies; + req->r_resend_mds = -1; + INIT_LIST_HEAD(&req->r_unsafe_dir_item); + req->r_fmode = -1; + kref_init(&req->r_kref); + INIT_LIST_HEAD(&req->r_wait); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + + req->r_stamp = CURRENT_TIME; + + req->r_op = op; + req->r_direct_mode = mode; + return req; +} + +/* + * return oldest (lowest) request, tid in request tree, 0 if none. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) +{ + if (RB_EMPTY_ROOT(&mdsc->request_tree)) + return NULL; + return rb_entry(rb_first(&mdsc->request_tree), + struct ceph_mds_request, r_node); +} + +static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req = __get_oldest_req(mdsc); + + if (req) + return req->r_tid; + return 0; +} + +/* + * Build a dentry's path. Allocate on heap; caller must kfree. Based + * on build_path_from_dentry in fs/cifs/dir.c. + * + * If @stop_on_nosnap, generate path relative to the first non-snapped + * inode. + * + * Encode hidden .snap dirs as a double /, i.e. + * foo/.snap/bar -> foo//bar + */ +char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap) +{ + struct dentry *temp; + char *path; + int len, pos; + unsigned seq; + + if (dentry == NULL) + return ERR_PTR(-EINVAL); + +retry: + len = 0; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + for (temp = dentry; !IS_ROOT(temp);) { + struct inode *inode = d_inode(temp); + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) + len++; /* slash only */ + else if (stop_on_nosnap && inode && + ceph_snap(inode) == CEPH_NOSNAP) + break; + else + len += 1 + temp->d_name.len; + temp = temp->d_parent; + } + rcu_read_unlock(); + if (len) + len--; /* no leading '/' */ + + path = kmalloc(len+1, GFP_NOFS); + if (path == NULL) + return ERR_PTR(-ENOMEM); + pos = len; + path[pos] = 0; /* trailing null */ + rcu_read_lock(); + for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { + struct inode *inode; + + spin_lock(&temp->d_lock); + inode = d_inode(temp); + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { + dout("build_path path+%d: %p SNAPDIR\n", + pos, temp); + } else if (stop_on_nosnap && inode && + ceph_snap(inode) == CEPH_NOSNAP) { + spin_unlock(&temp->d_lock); + break; + } else { + pos -= temp->d_name.len; + if (pos < 0) { + spin_unlock(&temp->d_lock); + break; + } + strncpy(path + pos, temp->d_name.name, + temp->d_name.len); + } + spin_unlock(&temp->d_lock); + if (pos) + path[--pos] = '/'; + temp = temp->d_parent; + } + rcu_read_unlock(); + if (pos != 0 || read_seqretry(&rename_lock, seq)) { + pr_err("build_path did not end path lookup where " + "expected, namelen is %d, pos is %d\n", len, pos); + /* presumably this is only possible if racing with a + rename of one of the parent directories (we can not + lock the dentries above us to prevent this, but + retrying should be harmless) */ + kfree(path); + goto retry; + } + + *base = ceph_ino(d_inode(temp)); + *plen = len; + dout("build_path on %p %d built %llx '%.*s'\n", + dentry, d_count(dentry), *base, len, path); + return path; +} + +static int build_dentry_path(struct dentry *dentry, + const char **ppath, int *ppathlen, u64 *pino, + int *pfreepath) +{ + char *path; + + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { + *pino = ceph_ino(d_inode(dentry->d_parent)); + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + return 0; + } + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = 1; + return 0; +} + +static int build_inode_path(struct inode *inode, + const char **ppath, int *ppathlen, u64 *pino, + int *pfreepath) +{ + struct dentry *dentry; + char *path; + + if (ceph_snap(inode) == CEPH_NOSNAP) { + *pino = ceph_ino(inode); + *ppathlen = 0; + return 0; + } + dentry = d_find_alias(inode); + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + dput(dentry); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = 1; + return 0; +} + +/* + * request arguments may be specified via an inode *, a dentry *, or + * an explicit ino+path. + */ +static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, + const char *rpath, u64 rino, + const char **ppath, int *pathlen, + u64 *ino, int *freepath) +{ + int r = 0; + + if (rinode) { + r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), + ceph_snap(rinode)); + } else if (rdentry) { + r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); + dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, + *ppath); + } else if (rpath || rino) { + *ino = rino; + *ppath = rpath; + *pathlen = rpath ? strlen(rpath) : 0; + dout(" path %.*s\n", *pathlen, rpath); + } + + return r; +} + +/* + * called under mdsc->mutex + */ +static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds, bool drop_cap_releases) +{ + struct ceph_msg *msg; + struct ceph_mds_request_head *head; + const char *path1 = NULL; + const char *path2 = NULL; + u64 ino1 = 0, ino2 = 0; + int pathlen1 = 0, pathlen2 = 0; + int freepath1 = 0, freepath2 = 0; + int len; + u16 releases; + void *p, *end; + int ret; + + ret = set_request_path_attr(req->r_inode, req->r_dentry, + req->r_path1, req->r_ino1.ino, + &path1, &pathlen1, &ino1, &freepath1); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out; + } + + ret = set_request_path_attr(NULL, req->r_old_dentry, + req->r_path2, req->r_ino2.ino, + &path2, &pathlen2, &ino2, &freepath2); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out_free1; + } + + len = sizeof(*head) + + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + sizeof(struct timespec); + + /* calculate (max) length for cap releases */ + len += sizeof(struct ceph_mds_request_release) * + (!!req->r_inode_drop + !!req->r_dentry_drop + + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) + len += req->r_dentry->d_name.len; + if (req->r_old_dentry_drop) + len += req->r_old_dentry->d_name.len; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); + if (!msg) { + msg = ERR_PTR(-ENOMEM); + goto out_free2; + } + + msg->hdr.version = cpu_to_le16(2); + msg->hdr.tid = cpu_to_le64(req->r_tid); + + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + end = msg->front.iov_base + msg->front.iov_len; + + head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + head->op = cpu_to_le32(req->r_op); + head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); + head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); + head->args = req->r_args; + + ceph_encode_filepath(&p, end, ino1, path1); + ceph_encode_filepath(&p, end, ino2, path2); + + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + + /* cap releases */ + releases = 0; + if (req->r_inode_drop) + releases += ceph_encode_inode_release(&p, + req->r_inode ? req->r_inode : d_inode(req->r_dentry), + mds, req->r_inode_drop, req->r_inode_unless, 0); + if (req->r_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_dentry, + mds, req->r_dentry_drop, req->r_dentry_unless); + if (req->r_old_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + if (req->r_old_inode_drop) + releases += ceph_encode_inode_release(&p, + d_inode(req->r_old_dentry), + mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + + if (drop_cap_releases) { + releases = 0; + p = msg->front.iov_base + req->r_request_release_offset; + } + + head->num_releases = cpu_to_le16(releases); + + /* time stamp */ + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + if (req->r_pagelist) { + struct ceph_pagelist *pagelist = req->r_pagelist; + atomic_inc(&pagelist->refcnt); + ceph_msg_data_add_pagelist(msg, pagelist); + msg->hdr.data_len = cpu_to_le32(pagelist->length); + } else { + msg->hdr.data_len = 0; + } + + msg->hdr.data_off = cpu_to_le16(0); + +out_free2: + if (freepath2) + kfree((char *)path2); +out_free1: + if (freepath1) + kfree((char *)path1); +out: + return msg; +} + +/* + * called under mdsc->mutex if error, under no mutex if + * success. + */ +static void complete_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + if (req->r_callback) + req->r_callback(mdsc, req); + else + complete_all(&req->r_completion); +} + +/* + * called under mdsc->mutex + */ +static int __prepare_send_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds, bool drop_cap_releases) +{ + struct ceph_mds_request_head *rhead; + struct ceph_msg *msg; + int flags = 0; + + req->r_attempts++; + if (req->r_inode) { + struct ceph_cap *cap = + ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); + + if (cap) + req->r_sent_on_mseq = cap->mseq; + else + req->r_sent_on_mseq = -1; + } + dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + + if (req->r_got_unsafe) { + void *p; + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + + /* time stamp */ + p = msg->front.iov_base + req->r_request_release_offset; + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + return 0; + } + + if (req->r_request) { + ceph_msg_put(req->r_request); + req->r_request = NULL; + } + msg = create_request_message(mdsc, req, mds, drop_cap_releases); + if (IS_ERR(msg)) { + req->r_err = PTR_ERR(msg); + complete_request(mdsc, req); + return PTR_ERR(msg); + } + req->r_request = msg; + + rhead = msg->front.iov_base; + rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + if (req->r_got_unsafe) + flags |= CEPH_MDS_FLAG_REPLAY; + if (req->r_locked_dir) + flags |= CEPH_MDS_FLAG_WANT_DENTRY; + rhead->flags = cpu_to_le32(flags); + rhead->num_fwd = req->r_num_fwd; + rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; + + dout(" r_locked_dir = %p\n", req->r_locked_dir); + return 0; +} + +/* + * send request, or put it on the appropriate wait list. + */ +static int __do_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_session *session = NULL; + int mds = -1; + int err = -EAGAIN; + + if (req->r_err || req->r_got_result) { + if (req->r_aborted) + __unregister_request(mdsc, req); + goto out; + } + + if (req->r_timeout && + time_after_eq(jiffies, req->r_started + req->r_timeout)) { + dout("do_request timed out\n"); + err = -EIO; + goto finish; + } + + put_request_session(req); + + mds = __choose_mds(mdsc, req); + if (mds < 0 || + ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + dout("do_request no mds or not active, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto out; + } + + /* get, open session */ + session = __ceph_lookup_mds_session(mdsc, mds); + if (!session) { + session = register_session(mdsc, mds); + if (IS_ERR(session)) { + err = PTR_ERR(session); + goto finish; + } + } + req->r_session = get_session(session); + + dout("do_request mds%d session %p state %s\n", mds, session, + ceph_session_state_name(session->s_state)); + if (session->s_state != CEPH_MDS_SESSION_OPEN && + session->s_state != CEPH_MDS_SESSION_HUNG) { + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + list_add(&req->r_wait, &session->s_waiting); + goto out_session; + } + + /* send request */ + req->r_resend_mds = -1; /* forget any previous mds hint */ + + if (req->r_request_started == 0) /* note request start time */ + req->r_request_started = jiffies; + + err = __prepare_send_request(mdsc, req, mds, false); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + +out_session: + ceph_put_mds_session(session); +out: + return err; + +finish: + req->r_err = err; + complete_request(mdsc, req); + goto out; +} + +/* + * called under mdsc->mutex + */ +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head) +{ + struct ceph_mds_request *req; + LIST_HEAD(tmp_list); + + list_splice_init(head, &tmp_list); + + while (!list_empty(&tmp_list)) { + req = list_entry(tmp_list.next, + struct ceph_mds_request, r_wait); + list_del_init(&req->r_wait); + dout(" wake request %p tid %llu\n", req, req->r_tid); + __do_request(mdsc, req); + } +} + +/* + * Wake up threads with requests pending for @mds, so that they can + * resubmit their requests to a possibly different mds. + */ +static void kick_requests(struct ceph_mds_client *mdsc, int mds) +{ + struct ceph_mds_request *req; + struct rb_node *p = rb_first(&mdsc->request_tree); + + dout("kick_requests mds%d\n", mds); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_got_unsafe) + continue; + if (req->r_attempts > 0) + continue; /* only new requests */ + if (req->r_session && + req->r_session->s_mds == mds) { + dout(" kicking tid %llu\n", req->r_tid); + list_del_init(&req->r_wait); + __do_request(mdsc, req); + } + } +} + +void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("submit_request on %p\n", req); + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, NULL); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); +} + +/* + * Synchrously perform an mds request. Take care of all of the + * session setup, forwarding, retry details. + */ +int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req) +{ + int err; + + dout("do_request on %p\n", req); + + /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ + if (req->r_inode) + ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); + if (req->r_locked_dir) + ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_old_dentry_dir) + ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); + + /* issue */ + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, dir); + __do_request(mdsc, req); + + if (req->r_err) { + err = req->r_err; + __unregister_request(mdsc, req); + dout("do_request early error %d\n", err); + goto out; + } + + /* wait */ + mutex_unlock(&mdsc->mutex); + dout("do_request waiting\n"); + if (req->r_timeout) { + err = (long)wait_for_completion_killable_timeout( + &req->r_completion, req->r_timeout); + if (err == 0) + err = -EIO; + } else if (req->r_wait_for_completion) { + err = req->r_wait_for_completion(mdsc, req); + } else { + err = wait_for_completion_killable(&req->r_completion); + } + dout("do_request waited, got %d\n", err); + mutex_lock(&mdsc->mutex); + + /* only abort if we didn't race with a real reply */ + if (req->r_got_result) { + err = le32_to_cpu(req->r_reply_info.head->result); + } else if (err < 0) { + dout("aborted request %lld with %d\n", req->r_tid, err); + + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + req->r_aborted = true; + mutex_unlock(&req->r_fill_mutex); + + if (req->r_locked_dir && + (req->r_op & CEPH_MDS_OP_WRITE)) + ceph_invalidate_dir_request(req); + } else { + err = req->r_err; + } + +out: + mutex_unlock(&mdsc->mutex); + dout("do_request %p done, result %d\n", req, err); + return err; +} + +/* + * Invalidate dir's completeness, dentry lease state on an aborted MDS + * namespace request. + */ +void ceph_invalidate_dir_request(struct ceph_mds_request *req) +{ + struct inode *inode = req->r_locked_dir; + + dout("invalidate_dir_request %p (complete, lease(s))\n", inode); + + ceph_dir_clear_complete(inode); + if (req->r_dentry) + ceph_invalidate_dentry_lease(req->r_dentry); + if (req->r_old_dentry) + ceph_invalidate_dentry_lease(req->r_old_dentry); +} + +/* + * Handle mds reply. + * + * We take the session mutex and parse and process the reply immediately. + * This preserves the logical ordering of replies, capabilities, etc., sent + * by the MDS as they are applied to our local cache. + */ +static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_mds_request *req; + struct ceph_mds_reply_head *head = msg->front.iov_base; + struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + struct ceph_snap_realm *realm; + u64 tid; + int err, result; + int mds = session->s_mds; + + if (msg->front.iov_len < sizeof(*head)) { + pr_err("mdsc_handle_reply got corrupt (short) reply\n"); + ceph_msg_dump(msg); + return; + } + + /* get request, session */ + tid = le64_to_cpu(msg->hdr.tid); + mutex_lock(&mdsc->mutex); + req = __lookup_request(mdsc, tid); + if (!req) { + dout("handle_reply on unknown tid %llu\n", tid); + mutex_unlock(&mdsc->mutex); + return; + } + dout("handle_reply %p\n", req); + + /* correct session? */ + if (req->r_session != session) { + pr_err("mdsc_handle_reply got %llu on session mds%d" + " not mds%d\n", tid, session->s_mds, + req->r_session ? req->r_session->s_mds : -1); + mutex_unlock(&mdsc->mutex); + goto out; + } + + /* dup? */ + if ((req->r_got_unsafe && !head->safe) || + (req->r_got_safe && head->safe)) { + pr_warn("got a dup %s reply on %llu from mds%d\n", + head->safe ? "safe" : "unsafe", tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + if (req->r_got_safe && !head->safe) { + pr_warn("got unsafe after safe on %llu from mds%d\n", + tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + + result = le32_to_cpu(head->result); + + /* + * Handle an ESTALE + * if we're not talking to the authority, send to them + * if the authority has changed while we weren't looking, + * send to new authority + * Otherwise we just have to return an ESTALE + */ + if (result == -ESTALE) { + dout("got ESTALE on request %llu", req->r_tid); + req->r_resend_mds = -1; + if (req->r_direct_mode != USE_AUTH_MDS) { + dout("not using auth, setting for that now"); + req->r_direct_mode = USE_AUTH_MDS; + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } else { + int mds = __choose_mds(mdsc, req); + if (mds >= 0 && mds != req->r_session->s_mds) { + dout("but auth changed, so resending"); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } + } + dout("have to return ESTALE on request %llu", req->r_tid); + } + + + if (head->safe) { + req->r_got_safe = true; + __unregister_request(mdsc, req); + + if (req->r_got_unsafe) { + /* + * We already handled the unsafe response, now do the + * cleanup. No need to examine the response; the MDS + * doesn't include any result info in the safe + * response. And even if it did, there is nothing + * useful we could do with a revised return value. + */ + dout("got safe reply %llu, mds%d\n", tid, mds); + list_del_init(&req->r_unsafe_item); + + /* last unsafe request during umount? */ + if (mdsc->stopping && !__get_oldest_req(mdsc)) + complete_all(&mdsc->safe_umount_waiters); + mutex_unlock(&mdsc->mutex); + goto out; + } + } else { + req->r_got_unsafe = true; + list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); + } + + dout("handle_reply tid %lld result %d\n", tid, result); + rinfo = &req->r_reply_info; + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (err < 0) { + pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); + ceph_msg_dump(msg); + goto out_err; + } + + /* snap trace */ + realm = NULL; + if (rinfo->snapblob_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, rinfo->snapblob, + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, + &realm); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } + + /* insert trace into our cache */ + mutex_lock(&req->r_fill_mutex); + err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); + if (err == 0) { + if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || + req->r_op == CEPH_MDS_OP_LSSNAP)) + ceph_readdir_prepopulate(req, req->r_session); + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); + } + mutex_unlock(&req->r_fill_mutex); + + up_read(&mdsc->snap_rwsem); + if (realm) + ceph_put_snap_realm(mdsc, realm); +out_err: + mutex_lock(&mdsc->mutex); + if (!req->r_aborted) { + if (err) { + req->r_err = err; + } else { + req->r_reply = msg; + ceph_msg_get(msg); + req->r_got_result = true; + } + } else { + dout("reply arrived after request %lld was aborted\n", tid); + } + mutex_unlock(&mdsc->mutex); + + ceph_add_cap_releases(mdsc, req->r_session); + mutex_unlock(&session->s_mutex); + + /* kick calling process */ + complete_request(mdsc, req); +out: + ceph_mdsc_put_request(req); + return; +} + + + +/* + * handle mds notification that our request has been forwarded. + */ +static void handle_forward(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_request *req; + u64 tid = le64_to_cpu(msg->hdr.tid); + u32 next_mds; + u32 fwd_seq; + int err = -EINVAL; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + next_mds = ceph_decode_32(&p); + fwd_seq = ceph_decode_32(&p); + + mutex_lock(&mdsc->mutex); + req = __lookup_request(mdsc, tid); + if (!req) { + dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); + goto out; /* dup reply? */ + } + + if (req->r_aborted) { + dout("forward tid %llu aborted, unregistering\n", tid); + __unregister_request(mdsc, req); + } else if (fwd_seq <= req->r_num_fwd) { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } else { + /* resend. forward race not possible; mds would drop */ + dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); + BUG_ON(req->r_err); + BUG_ON(req->r_got_result); + req->r_attempts = 0; + req->r_num_fwd = fwd_seq; + req->r_resend_mds = next_mds; + put_request_session(req); + __do_request(mdsc, req); + } + ceph_mdsc_put_request(req); +out: + mutex_unlock(&mdsc->mutex); + return; + +bad: + pr_err("mdsc_handle_forward decode error err=%d\n", err); +} + +/* + * handle a mds session control message + */ +static void handle_session(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + u32 op; + u64 seq; + int mds = session->s_mds; + struct ceph_mds_session_head *h = msg->front.iov_base; + int wake = 0; + + /* decode */ + if (msg->front.iov_len != sizeof(*h)) + goto bad; + op = le32_to_cpu(h->op); + seq = le64_to_cpu(h->seq); + + mutex_lock(&mdsc->mutex); + if (op == CEPH_SESSION_CLOSE) + __unregister_session(mdsc, session); + /* FIXME: this ttl calculation is generous */ + session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + + dout("handle_session mds%d %s %p state %s seq %llu\n", + mds, ceph_session_op_name(op), session, + ceph_session_state_name(session->s_state), seq); + + if (session->s_state == CEPH_MDS_SESSION_HUNG) { + session->s_state = CEPH_MDS_SESSION_OPEN; + pr_info("mds%d came back\n", session->s_mds); + } + + switch (op) { + case CEPH_SESSION_OPEN: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect success\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_OPEN; + renewed_caps(mdsc, session, 0); + wake = 1; + if (mdsc->stopping) + __close_session(mdsc, session); + break; + + case CEPH_SESSION_RENEWCAPS: + if (session->s_renew_seq == seq) + renewed_caps(mdsc, session, 1); + break; + + case CEPH_SESSION_CLOSE: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect denied\n", session->s_mds); + cleanup_session_requests(mdsc, session); + remove_session_caps(session); + wake = 2; /* for good measure */ + wake_up_all(&mdsc->session_close_wq); + break; + + case CEPH_SESSION_STALE: + pr_info("mds%d caps went stale, renewing\n", + session->s_mds); + spin_lock(&session->s_gen_ttl_lock); + session->s_cap_gen++; + session->s_cap_ttl = jiffies - 1; + spin_unlock(&session->s_gen_ttl_lock); + send_renew_caps(mdsc, session); + break; + + case CEPH_SESSION_RECALL_STATE: + trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); + break; + + case CEPH_SESSION_FLUSHMSG: + send_flushmsg_ack(mdsc, session, seq); + break; + + case CEPH_SESSION_FORCE_RO: + dout("force_session_readonly %p\n", session); + spin_lock(&session->s_cap_lock); + session->s_readonly = true; + spin_unlock(&session->s_cap_lock); + wake_up_session_caps(session, 0); + break; + + default: + pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); + WARN_ON(1); + } + + mutex_unlock(&session->s_mutex); + if (wake) { + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + if (wake == 2) + kick_requests(mdsc, mds); + mutex_unlock(&mdsc->mutex); + } + return; + +bad: + pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, + (int)msg->front.iov_len); + ceph_msg_dump(msg); + return; +} + + +/* + * called under session->mutex. + */ +static void replay_unsafe_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req, *nreq; + struct rb_node *p; + int err; + + dout("replay_unsafe_requests mds%d\n", session->s_mds); + + mutex_lock(&mdsc->mutex); + list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { + err = __prepare_send_request(mdsc, req, session->s_mds, true); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + + /* + * also re-send old requests when MDS enters reconnect stage. So that MDS + * can process completed request in clientreplay stage. + */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_got_unsafe) + continue; + if (req->r_attempts == 0) + continue; /* only old requests */ + if (req->r_session && + req->r_session->s_mds == session->s_mds) { + err = __prepare_send_request(mdsc, req, + session->s_mds, true); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + } + mutex_unlock(&mdsc->mutex); +} + +/* + * Encode information about a cap for a reconnect with the MDS. + */ +static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + union { + struct ceph_mds_cap_reconnect v2; + struct ceph_mds_cap_reconnect_v1 v1; + } rec; + size_t reclen; + struct ceph_inode_info *ci; + struct ceph_reconnect_state *recon_state = arg; + struct ceph_pagelist *pagelist = recon_state->pagelist; + char *path; + int pathlen, err; + u64 pathbase; + struct dentry *dentry; + + ci = cap->ci; + + dout(" adding %p ino %llx.%llx cap %p %lld %s\n", + inode, ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); + if (err) + return err; + + dentry = d_find_alias(inode); + if (dentry) { + path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_dput; + } + } else { + path = NULL; + pathlen = 0; + } + err = ceph_pagelist_encode_string(pagelist, path, pathlen); + if (err) + goto out_free; + + spin_lock(&ci->i_ceph_lock); + cap->seq = 0; /* reset cap seq */ + cap->issue_seq = 0; /* and issue_seq */ + cap->mseq = 0; /* and migrate_seq */ + cap->cap_gen = cap->session->s_cap_gen; + + if (recon_state->flock) { + rec.v2.cap_id = cpu_to_le64(cap->cap_id); + rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v2.issued = cpu_to_le32(cap->issued); + rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.flock_len = 0; + reclen = sizeof(rec.v2); + } else { + rec.v1.cap_id = cpu_to_le64(cap->cap_id); + rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v1.issued = cpu_to_le32(cap->issued); + rec.v1.size = cpu_to_le64(inode->i_size); + ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v1.pathbase = cpu_to_le64(pathbase); + reclen = sizeof(rec.v1); + } + spin_unlock(&ci->i_ceph_lock); + + if (recon_state->flock) { + int num_fcntl_locks, num_flock_locks; + struct ceph_filelock *flocks; + +encode_again: + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + flocks = kmalloc((num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + if (err) { + kfree(flocks); + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + /* + * number of encoded locks is stable, so copy to pagelist + */ + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + kfree(flocks); + } else { + err = ceph_pagelist_append(pagelist, &rec, reclen); + } + + recon_state->nr_caps++; +out_free: + kfree(path); +out_dput: + dput(dentry); + return err; +} + + +/* + * If an MDS fails and recovers, clients need to reconnect in order to + * reestablish shared state. This includes all caps issued through + * this session _and_ the snap_realm hierarchy. Because it's not + * clear which snap realms the mds cares about, we send everything we + * know about.. that ensures we'll then get any new info the + * recovering MDS might have. + * + * This is a relatively heavyweight operation, but it's rare. + * + * called with mdsc->mutex held. + */ +static void send_mds_reconnect(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *reply; + struct rb_node *p; + int mds = session->s_mds; + int err = -ENOMEM; + int s_nr_caps; + struct ceph_pagelist *pagelist; + struct ceph_reconnect_state recon_state; + + pr_info("mds%d reconnect start\n", mds); + + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + if (!pagelist) + goto fail_nopagelist; + ceph_pagelist_init(pagelist); + + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); + if (!reply) + goto fail_nomsg; + + mutex_lock(&session->s_mutex); + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; + + dout("session %p state %s\n", session, + ceph_session_state_name(session->s_state)); + + spin_lock(&session->s_gen_ttl_lock); + session->s_cap_gen++; + spin_unlock(&session->s_gen_ttl_lock); + + spin_lock(&session->s_cap_lock); + /* don't know if session is readonly */ + session->s_readonly = 0; + /* + * notify __ceph_remove_cap() that we are composing cap reconnect. + * If a cap get released before being added to the cap reconnect, + * __ceph_remove_cap() should skip queuing cap release. + */ + session->s_cap_reconnect = 1; + /* drop old cap expires; we're about to reestablish that state */ + discard_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); + + /* trim unused caps to reduce MDS's cache rejoin time */ + if (mdsc->fsc->sb->s_root) + shrink_dcache_parent(mdsc->fsc->sb->s_root); + + ceph_con_close(&session->s_con); + ceph_con_open(&session->s_con, + CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); + + down_read(&mdsc->snap_rwsem); + + /* traverse this session's caps */ + s_nr_caps = session->s_nr_caps; + err = ceph_pagelist_encode_32(pagelist, s_nr_caps); + if (err) + goto fail; + + recon_state.nr_caps = 0; + recon_state.pagelist = pagelist; + recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; + err = iterate_session_caps(session, encode_caps_cb, &recon_state); + if (err < 0) + goto fail; + + spin_lock(&session->s_cap_lock); + session->s_cap_reconnect = 0; + spin_unlock(&session->s_cap_lock); + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. + */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + dout(" adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + } + + if (recon_state.flock) + reply->hdr.version = cpu_to_le16(2); + + /* raced with cap release? */ + if (s_nr_caps != recon_state.nr_caps) { + struct page *page = list_first_entry(&pagelist->head, + struct page, lru); + __le32 *addr = kmap_atomic(page); + *addr = cpu_to_le32(recon_state.nr_caps); + kunmap_atomic(addr); + } + + reply->hdr.data_len = cpu_to_le32(pagelist->length); + ceph_msg_data_add_pagelist(reply, pagelist); + ceph_con_send(&session->s_con, reply); + + mutex_unlock(&session->s_mutex); + + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + + up_read(&mdsc->snap_rwsem); + return; + +fail: + ceph_msg_put(reply); + up_read(&mdsc->snap_rwsem); + mutex_unlock(&session->s_mutex); +fail_nomsg: + ceph_pagelist_release(pagelist); +fail_nopagelist: + pr_err("error %d preparing reconnect for mds%d\n", err, mds); + return; +} + + +/* + * compare old and new mdsmaps, kicking requests + * and closing out old connections as necessary + * + * called under mdsc->mutex. + */ +static void check_new_map(struct ceph_mds_client *mdsc, + struct ceph_mdsmap *newmap, + struct ceph_mdsmap *oldmap) +{ + int i; + int oldstate, newstate; + struct ceph_mds_session *s; + + dout("check_new_map new %u old %u\n", + newmap->m_epoch, oldmap->m_epoch); + + for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i] == NULL) + continue; + s = mdsc->sessions[i]; + oldstate = ceph_mdsmap_get_state(oldmap, i); + newstate = ceph_mdsmap_get_state(newmap, i); + + dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", + i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", + ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", + ceph_session_state_name(s->s_state)); + + if (i >= newmap->m_max_mds || + memcmp(ceph_mdsmap_get_addr(oldmap, i), + ceph_mdsmap_get_addr(newmap, i), + sizeof(struct ceph_entity_addr))) { + if (s->s_state == CEPH_MDS_SESSION_OPENING) { + /* the session never opened, just close it + * out now */ + __wake_requests(mdsc, &s->s_waiting); + __unregister_session(mdsc, s); + } else { + /* just close it */ + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); + ceph_con_close(&s->s_con); + mutex_unlock(&s->s_mutex); + s->s_state = CEPH_MDS_SESSION_RESTARTING; + } + } else if (oldstate == newstate) { + continue; /* nothing new with this mds */ + } + + /* + * send reconnect? + */ + if (s->s_state == CEPH_MDS_SESSION_RESTARTING && + newstate >= CEPH_MDS_STATE_RECONNECT) { + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + mutex_lock(&mdsc->mutex); + } + + /* + * kick request on any mds that has gone active. + */ + if (oldstate < CEPH_MDS_STATE_ACTIVE && + newstate >= CEPH_MDS_STATE_ACTIVE) { + if (oldstate != CEPH_MDS_STATE_CREATING && + oldstate != CEPH_MDS_STATE_STARTING) + pr_info("mds%d recovery completed\n", s->s_mds); + kick_requests(mdsc, i); + ceph_kick_flushing_caps(mdsc, s); + wake_up_session_caps(s, 1); + } + } + + for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + s = mdsc->sessions[i]; + if (!s) + continue; + if (!ceph_mdsmap_is_laggy(newmap, i)) + continue; + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG || + s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout(" connecting to export targets of laggy mds%d\n", + i); + __open_export_target_sessions(mdsc, s); + } + } +} + + + +/* + * leases + */ + +/* + * caller must hold session s_mutex, dentry->d_lock + */ +void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + ceph_put_mds_session(di->lease_session); + di->lease_session = NULL; +} + +static void handle_lease(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + struct inode *inode; + struct dentry *parent, *dentry; + struct ceph_dentry_info *di; + int mds = session->s_mds; + struct ceph_mds_lease *h = msg->front.iov_base; + u32 seq; + struct ceph_vino vino; + struct qstr dname; + int release = 0; + + dout("handle_lease from mds%d\n", mds); + + /* decode */ + if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) + goto bad; + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + seq = le32_to_cpu(h->seq); + dname.name = (void *)h + sizeof(*h) + sizeof(u32); + dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); + if (dname.len != get_unaligned_le32(h+1)) + goto bad; + + /* lookup inode */ + inode = ceph_find_inode(sb, vino); + dout("handle_lease %s, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), vino.ino, inode, + dname.len, dname.name); + + mutex_lock(&session->s_mutex); + session->s_seq++; + + if (inode == NULL) { + dout("handle_lease no inode %llx\n", vino.ino); + goto release; + } + + /* dentry */ + parent = d_find_alias(inode); + if (!parent) { + dout("no parent dentry on inode %p\n", inode); + WARN_ON(1); + goto release; /* hrm... */ + } + dname.hash = full_name_hash(dname.name, dname.len); + dentry = d_lookup(parent, &dname); + dput(parent); + if (!dentry) + goto release; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + switch (h->action) { + case CEPH_MDS_LEASE_REVOKE: + if (di->lease_session == session) { + if (ceph_seq_cmp(di->lease_seq, seq) > 0) + h->seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + release = 1; + break; + + case CEPH_MDS_LEASE_RENEW: + if (di->lease_session == session && + di->lease_gen == session->s_cap_gen && + di->lease_renew_from && + di->lease_renew_after == 0) { + unsigned long duration = + msecs_to_jiffies(le32_to_cpu(h->duration_ms)); + + di->lease_seq = seq; + dentry->d_time = di->lease_renew_from + duration; + di->lease_renew_after = di->lease_renew_from + + (duration >> 1); + di->lease_renew_from = 0; + } + break; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + + if (!release) + goto out; + +release: + /* let's just reuse the same message */ + h->action = CEPH_MDS_LEASE_REVOKE_ACK; + ceph_msg_get(msg); + ceph_con_send(&session->s_con, msg); + +out: + iput(inode); + mutex_unlock(&session->s_mutex); + return; + +bad: + pr_err("corrupt lease message\n"); + ceph_msg_dump(msg); +} + +void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_lease *lease; + int len = sizeof(*lease) + sizeof(u32); + int dnamelen = 0; + + dout("lease_send_msg inode %p dentry %p %s to mds%d\n", + inode, dentry, ceph_lease_op_name(action), session->s_mds); + dnamelen = dentry->d_name.len; + len += dnamelen; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); + if (!msg) + return; + lease = msg->front.iov_base; + lease->action = action; + lease->ino = cpu_to_le64(ceph_vino(inode).ino); + lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); + lease->seq = cpu_to_le32(seq); + put_unaligned_le32(dnamelen, lease + 1); + memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); + + /* + * if this is a preemptive lease RELEASE, no need to + * flush request stream, since the actual request will + * soon follow. + */ + msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); + + ceph_con_send(&session->s_con, msg); +} + +/* + * Preemptively release a lease we expect to invalidate anyway. + * Pass @inode always, @dentry is optional. + */ +void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, + struct dentry *dentry) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *session; + u32 seq; + + BUG_ON(inode == NULL); + BUG_ON(dentry == NULL); + + /* is dentry lease valid? */ + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (!di || !di->lease_session || + di->lease_session->s_mds < 0 || + di->lease_gen != di->lease_session->s_cap_gen || + !time_before(jiffies, dentry->d_time)) { + dout("lease_release inode %p dentry %p -- " + "no lease\n", + inode, dentry); + spin_unlock(&dentry->d_lock); + return; + } + + /* we do have a lease on this dentry; note mds and seq */ + session = ceph_get_mds_session(di->lease_session); + seq = di->lease_seq; + __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + + dout("lease_release inode %p dentry %p to mds%d\n", + inode, dentry, session->s_mds); + ceph_mdsc_lease_send_msg(session, inode, dentry, + CEPH_MDS_LEASE_RELEASE, seq); + ceph_put_mds_session(session); +} + +/* + * drop all leases (and dentry refs) in preparation for umount + */ +static void drop_leases(struct ceph_mds_client *mdsc) +{ + int i; + + dout("drop_leases\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + + + +/* + * delayed work -- periodically trim expired leases, renew caps with mds + */ +static void schedule_delayed(struct ceph_mds_client *mdsc) +{ + int delay = 5; + unsigned hz = round_jiffies_relative(HZ * delay); + schedule_delayed_work(&mdsc->delayed_work, hz); +} + +static void delayed_work(struct work_struct *work) +{ + int i; + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, delayed_work.work); + int renew_interval; + int renew_caps; + + dout("mdsc delayed_work\n"); + ceph_check_delayed_caps(mdsc); + + mutex_lock(&mdsc->mutex); + renew_interval = mdsc->mdsmap->m_session_timeout >> 2; + renew_caps = time_after_eq(jiffies, HZ*renew_interval + + mdsc->last_renew_caps); + if (renew_caps) + mdsc->last_renew_caps = jiffies; + + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (s == NULL) + continue; + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout("resending session close request for mds%d\n", + s->s_mds); + request_close_session(mdsc, s); + ceph_put_mds_session(s); + continue; + } + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { + if (s->s_state == CEPH_MDS_SESSION_OPEN) { + s->s_state = CEPH_MDS_SESSION_HUNG; + pr_info("mds%d hung\n", s->s_mds); + } + } + if (s->s_state < CEPH_MDS_SESSION_OPEN) { + /* this mds is failed or recovering, just wait */ + ceph_put_mds_session(s); + continue; + } + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + if (renew_caps) + send_renew_caps(mdsc, s); + else + ceph_con_keepalive(&s->s_con); + ceph_add_cap_releases(mdsc, s); + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(mdsc, s); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + schedule_delayed(mdsc); +} + +int ceph_mdsc_init(struct ceph_fs_client *fsc) + +{ + struct ceph_mds_client *mdsc; + + mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); + if (!mdsc) + return -ENOMEM; + mdsc->fsc = fsc; + fsc->mdsc = mdsc; + mutex_init(&mdsc->mutex); + mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + if (mdsc->mdsmap == NULL) { + kfree(mdsc); + return -ENOMEM; + } + + init_completion(&mdsc->safe_umount_waiters); + init_waitqueue_head(&mdsc->session_close_wq); + INIT_LIST_HEAD(&mdsc->waiting_for_map); + mdsc->sessions = NULL; + atomic_set(&mdsc->num_sessions, 0); + mdsc->max_sessions = 0; + mdsc->stopping = 0; + init_rwsem(&mdsc->snap_rwsem); + mdsc->snap_realms = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snap_empty); + spin_lock_init(&mdsc->snap_empty_lock); + mdsc->last_tid = 0; + mdsc->request_tree = RB_ROOT; + INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); + mdsc->last_renew_caps = jiffies; + INIT_LIST_HEAD(&mdsc->cap_delay_list); + spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->snap_flush_list); + spin_lock_init(&mdsc->snap_flush_lock); + mdsc->cap_flush_seq = 0; + INIT_LIST_HEAD(&mdsc->cap_dirty); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); + mdsc->num_cap_flushing = 0; + spin_lock_init(&mdsc->cap_dirty_lock); + init_waitqueue_head(&mdsc->cap_flushing_wq); + spin_lock_init(&mdsc->dentry_lru_lock); + INIT_LIST_HEAD(&mdsc->dentry_lru); + + ceph_caps_init(mdsc); + ceph_adjust_min_caps(mdsc, fsc->min_caps); + + return 0; +} + +/* + * Wait for safe replies on open mds requests. If we time out, drop + * all requests from the tree to avoid dangling dentry refs. + */ +static void wait_requests(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req; + struct ceph_fs_client *fsc = mdsc->fsc; + + mutex_lock(&mdsc->mutex); + if (__get_oldest_req(mdsc)) { + mutex_unlock(&mdsc->mutex); + + dout("wait_requests waiting for requests\n"); + wait_for_completion_timeout(&mdsc->safe_umount_waiters, + fsc->client->options->mount_timeout * HZ); + + /* tear down remaining requests */ + mutex_lock(&mdsc->mutex); + while ((req = __get_oldest_req(mdsc))) { + dout("wait_requests timed out on tid %llu\n", + req->r_tid); + __unregister_request(mdsc, req); + } + } + mutex_unlock(&mdsc->mutex); + dout("wait_requests done\n"); +} + +/* + * called before mount is ro, and before dentries are torn down. + * (hmm, does this still race with new lookups?) + */ +void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) +{ + dout("pre_umount\n"); + mdsc->stopping = 1; + + drop_leases(mdsc); + ceph_flush_dirty_caps(mdsc); + wait_requests(mdsc); + + /* + * wait for reply handlers to drop their request refs and + * their inode/dcache refs + */ + ceph_msgr_flush(); +} + +/* + * wait for all write mds requests to flush. + */ +static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_mds_request *req = NULL, *nextreq; + struct rb_node *n; + + mutex_lock(&mdsc->mutex); + dout("wait_unsafe_requests want %lld\n", want_tid); +restart: + req = __get_oldest_req(mdsc); + while (req && req->r_tid <= want_tid) { + /* find next request */ + n = rb_next(&req->r_node); + if (n) + nextreq = rb_entry(n, struct ceph_mds_request, r_node); + else + nextreq = NULL; + if ((req->r_op & CEPH_MDS_OP_WRITE)) { + /* write op */ + ceph_mdsc_get_request(req); + if (nextreq) + ceph_mdsc_get_request(nextreq); + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests wait on %llu (want %llu)\n", + req->r_tid, want_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); + ceph_mdsc_put_request(req); + if (!nextreq) + break; /* next dne before, so we're done! */ + if (RB_EMPTY_NODE(&nextreq->r_node)) { + /* next request was removed from tree */ + ceph_mdsc_put_request(nextreq); + goto restart; + } + ceph_mdsc_put_request(nextreq); /* won't go away */ + } + req = nextreq; + } + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests done\n"); +} + +void ceph_mdsc_sync(struct ceph_mds_client *mdsc) +{ + u64 want_tid, want_flush; + + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + return; + + dout("sync\n"); + mutex_lock(&mdsc->mutex); + want_tid = mdsc->last_tid; + mutex_unlock(&mdsc->mutex); + + ceph_flush_dirty_caps(mdsc); + spin_lock(&mdsc->cap_dirty_lock); + want_flush = mdsc->cap_flush_seq; + spin_unlock(&mdsc->cap_dirty_lock); + + dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); + + wait_unsafe_requests(mdsc, want_tid); + wait_caps_flush(mdsc, want_flush); +} + +/* + * true if all sessions are closed, or we force unmount + */ +static bool done_closing_sessions(struct ceph_mds_client *mdsc) +{ + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + return true; + return atomic_read(&mdsc->num_sessions) == 0; +} + +/* + * called after sb is ro. + */ +void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *session; + int i; + struct ceph_fs_client *fsc = mdsc->fsc; + unsigned long timeout = fsc->client->options->mount_timeout * HZ; + + dout("close_sessions\n"); + + /* close sessions */ + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + dout("waiting for sessions to close\n"); + wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + timeout); + + /* tear down remaining sessions */ + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i]) { + session = get_session(mdsc->sessions[i]); + __unregister_session(mdsc, session); + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + remove_session_caps(session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + } + WARN_ON(!list_empty(&mdsc->cap_delay_list)); + mutex_unlock(&mdsc->mutex); + + ceph_cleanup_empty_realms(mdsc); + + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + + dout("stopped\n"); +} + +static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +{ + dout("stop\n"); + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + if (mdsc->mdsmap) + ceph_mdsmap_destroy(mdsc->mdsmap); + kfree(mdsc->sessions); + ceph_caps_finalize(mdsc); +} + +void ceph_mdsc_destroy(struct ceph_fs_client *fsc) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + dout("mdsc_destroy %p\n", mdsc); + ceph_mdsc_stop(mdsc); + + /* flush out any connection work with references to us */ + ceph_msgr_flush(); + + fsc->mdsc = NULL; + kfree(mdsc); + dout("mdsc_destroy %p done\n", mdsc); +} + + +/* + * handle mds map update. + */ +void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + u32 epoch; + u32 maplen; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mdsmap *newmap, *oldmap; + struct ceph_fsid fsid; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) + return; + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + dout("handle_map epoch %u len %d\n", epoch, (int)maplen); + + /* do we need it? */ + ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); + mutex_lock(&mdsc->mutex); + if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { + dout("handle_map epoch %u <= our %u\n", + epoch, mdsc->mdsmap->m_epoch); + mutex_unlock(&mdsc->mutex); + return; + } + + newmap = ceph_mdsmap_decode(&p, end); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad_unlock; + } + + /* swap into place */ + if (mdsc->mdsmap) { + oldmap = mdsc->mdsmap; + mdsc->mdsmap = newmap; + check_new_map(mdsc, newmap, oldmap); + ceph_mdsmap_destroy(oldmap); + } else { + mdsc->mdsmap = newmap; /* first mds map */ + } + mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + + __wake_requests(mdsc, &mdsc->waiting_for_map); + + mutex_unlock(&mdsc->mutex); + schedule_delayed(mdsc); + return; + +bad_unlock: + mutex_unlock(&mdsc->mutex); +bad: + pr_err("error decoding mdsmap %d\n", err); + return; +} + +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + if (get_session(s)) { + dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); + return con; + } + dout("mdsc con_get %p FAIL\n", s); + return NULL; +} + +static void con_put(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); + ceph_put_mds_session(s); +} + +/* + * if the client is unresponsive for long enough, the mds will kill + * the session entirely. + */ +static void peer_reset(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + + pr_warn("mds%d closed our session\n", s->s_mds); + send_mds_reconnect(mdsc, s); +} + +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + int type = le16_to_cpu(msg->hdr.type); + + mutex_lock(&mdsc->mutex); + if (__verify_registered_session(mdsc, s) < 0) { + mutex_unlock(&mdsc->mutex); + goto out; + } + mutex_unlock(&mdsc->mutex); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(mdsc, msg); + break; + case CEPH_MSG_CLIENT_SESSION: + handle_session(s, msg); + break; + case CEPH_MSG_CLIENT_REPLY: + handle_reply(s, msg); + break; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: + handle_forward(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_CAPS: + ceph_handle_caps(s, msg); + break; + case CEPH_MSG_CLIENT_SNAP: + ceph_handle_snap(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_LEASE: + handle_lease(mdsc, s, msg); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } +out: + ceph_msg_put(msg); +} + +/* + * authentication + */ + +/* + * Note: returned pointer is the address of a structure that's + * managed separately. Caller must *not* attempt to free it. + */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + int *proto, int force_new) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(ac, auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } + *proto = ac->protocol; + + return auth; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); +} + +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + + return ceph_monc_validate_auth(&mdsc->fsc->client->monc); +} + +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) +{ + struct ceph_msg *msg; + int type = (int) le16_to_cpu(hdr->type); + int front_len = (int) le32_to_cpu(hdr->front_len); + + if (con->in_msg) + return con->in_msg; + + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + + return msg; +} + +static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_sign_message(auth, msg); +} + +static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_check_message_signature(auth, msg); +} + +static const struct ceph_connection_operations mds_con_ops = { + .get = con_get, + .put = con_put, + .dispatch = dispatch, + .get_authorizer = get_authorizer, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .peer_reset = peer_reset, + .alloc_msg = mds_alloc_msg, + .sign_message = sign_message, + .check_message_signature = check_message_signature, +}; + +/* eof */ diff --git a/kernel/fs/ceph/mds_client.h b/kernel/fs/ceph/mds_client.h new file mode 100644 index 000000000..1875b5d98 --- /dev/null +++ b/kernel/fs/ceph/mds_client.h @@ -0,0 +1,406 @@ +#ifndef _FS_CEPH_MDS_CLIENT_H +#define _FS_CEPH_MDS_CLIENT_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Some lock dependencies: + * + * session->s_mutex + * mdsc->mutex + * + * mdsc->snap_rwsem + * + * ci->i_ceph_lock + * mdsc->snap_flush_lock + * mdsc->cap_delay_lock + * + */ + +struct ceph_fs_client; +struct ceph_cap; + +/* + * parsed info about a single inode. pointers are into the encoded + * on-wire structures within the mds reply message payload. + */ +struct ceph_mds_reply_info_in { + struct ceph_mds_reply_inode *in; + struct ceph_dir_layout dir_layout; + u32 symlink_len; + char *symlink; + u32 xattr_len; + char *xattr_data; + u64 inline_version; + u32 inline_len; + char *inline_data; +}; + +/* + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results). + */ +struct ceph_mds_reply_info_parsed { + struct ceph_mds_reply_head *head; + + /* trace */ + struct ceph_mds_reply_info_in diri, targeti; + struct ceph_mds_reply_dirfrag *dirfrag; + char *dname; + u32 dname_len; + struct ceph_mds_reply_lease *dlease; + + /* extra */ + union { + /* for fcntl F_GETLK results */ + struct ceph_filelock *filelock_reply; + + /* for readdir results */ + struct { + struct ceph_mds_reply_dirfrag *dir_dir; + size_t dir_buf_size; + int dir_nr; + char **dir_dname; + u32 *dir_dname_len; + struct ceph_mds_reply_lease **dir_dlease; + struct ceph_mds_reply_info_in *dir_in; + u8 dir_complete, dir_end; + }; + + /* for create results */ + struct { + bool has_create_ino; + u64 ino; + }; + }; + + /* encoded blob describing snapshot contexts for certain + operations (e.g., open) */ + void *snapblob; + int snapblob_len; +}; + + +/* + * cap releases are batched and sent to the MDS en masse. + */ +#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \ + sizeof(struct ceph_mds_cap_release)) / \ + sizeof(struct ceph_mds_cap_item)) + + +/* + * state associated with each MDS<->client session + */ +enum { + CEPH_MDS_SESSION_NEW = 1, + CEPH_MDS_SESSION_OPENING = 2, + CEPH_MDS_SESSION_OPEN = 3, + CEPH_MDS_SESSION_HUNG = 4, + CEPH_MDS_SESSION_CLOSING = 5, + CEPH_MDS_SESSION_RESTARTING = 6, + CEPH_MDS_SESSION_RECONNECTING = 7, +}; + +struct ceph_mds_session { + struct ceph_mds_client *s_mdsc; + int s_mds; + int s_state; + unsigned long s_ttl; /* time until mds kills us */ + u64 s_seq; /* incoming msg seq # */ + struct mutex s_mutex; /* serialize session messages */ + + struct ceph_connection s_con; + + struct ceph_auth_handshake s_auth; + + /* protected by s_gen_ttl_lock */ + spinlock_t s_gen_ttl_lock; + u32 s_cap_gen; /* inc each time we get mds stale msg */ + unsigned long s_cap_ttl; /* when session caps expire */ + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; + struct list_head s_caps; /* all caps issued by this session */ + int s_nr_caps, s_trim_caps; + int s_num_cap_releases; + int s_cap_reconnect; + int s_readonly; + struct list_head s_cap_releases; /* waiting cap_release messages */ + struct list_head s_cap_releases_done; /* ready to send */ + struct ceph_cap *s_cap_iterator; + + /* protected by mutex */ + struct list_head s_cap_flushing; /* inodes w/ flushing caps */ + struct list_head s_cap_snaps_flushing; + unsigned long s_renew_requested; /* last time we sent a renew req */ + u64 s_renew_seq; + + atomic_t s_ref; + struct list_head s_waiting; /* waiting requests */ + struct list_head s_unsafe; /* unsafe requests */ +}; + +/* + * modes of choosing which MDS to send a request to + */ +enum { + USE_ANY_MDS, + USE_RANDOM_MDS, + USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ +}; + +struct ceph_mds_request; +struct ceph_mds_client; + +/* + * request completion callback + */ +typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); +/* + * wait for request completion callback + */ +typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); + +/* + * an in-flight mds request + */ +struct ceph_mds_request { + u64 r_tid; /* transaction id */ + struct rb_node r_node; + struct ceph_mds_client *r_mdsc; + + int r_op; /* mds op code */ + + /* operation on what? */ + struct inode *r_inode; /* arg1 */ + struct dentry *r_dentry; /* arg1 */ + struct dentry *r_old_dentry; /* arg2: rename from or link from */ + struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ + char *r_path1, *r_path2; + struct ceph_vino r_ino1, r_ino2; + + struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_target_inode; /* resulting inode */ + + struct mutex r_fill_mutex; + + union ceph_mds_request_args r_args; + int r_fmode; /* file mode, if expecting cap */ + kuid_t r_uid; + kgid_t r_gid; + struct timespec r_stamp; + + /* for choosing which mds to send this request to */ + int r_direct_mode; + u32 r_direct_hash; /* choose dir frag based on this dentry hash */ + bool r_direct_is_hash; /* true if r_direct_hash is valid */ + + /* data payload is used for xattr ops */ + struct ceph_pagelist *r_pagelist; + + /* what caps shall we drop? */ + int r_inode_drop, r_inode_unless; + int r_dentry_drop, r_dentry_unless; + int r_old_dentry_drop, r_old_dentry_unless; + struct inode *r_old_inode; + int r_old_inode_drop, r_old_inode_unless; + + struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; + struct ceph_msg *r_reply; + struct ceph_mds_reply_info_parsed r_reply_info; + struct page *r_locked_page; + int r_err; + bool r_aborted; + + unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_started; /* start time to measure timeout against */ + unsigned long r_request_started; /* start time for mds request only, + used to measure lease durations */ + + /* link unsafe requests to parent directory, for fsync */ + struct inode *r_unsafe_dir; + struct list_head r_unsafe_dir_item; + + struct ceph_mds_session *r_session; + + int r_attempts; /* resend attempts */ + int r_num_fwd; /* number of forward attempts */ + int r_resend_mds; /* mds to resend to next, if any*/ + u32 r_sent_on_mseq; /* cap mseq request was sent at*/ + + struct kref r_kref; + struct list_head r_wait; + struct completion r_completion; + struct completion r_safe_completion; + ceph_mds_request_callback_t r_callback; + ceph_mds_request_wait_callback_t r_wait_for_completion; + struct list_head r_unsafe_item; /* per-session unsafe list item */ + bool r_got_unsafe, r_got_safe, r_got_result; + + bool r_did_prepopulate; + u32 r_readdir_offset; + + struct ceph_cap_reservation r_caps_reservation; + int r_num_caps; +}; + +/* + * mds client state + */ +struct ceph_mds_client { + struct ceph_fs_client *fsc; + struct mutex mutex; /* all nested structures */ + + struct ceph_mdsmap *mdsmap; + struct completion safe_umount_waiters; + wait_queue_head_t session_close_wq; + struct list_head waiting_for_map; + + struct ceph_mds_session **sessions; /* NULL for mds if no session */ + atomic_t num_sessions; + int max_sessions; /* len of s_mds_sessions */ + int stopping; /* true if shutting down */ + + /* + * snap_rwsem will cover cap linkage into snaprealms, and + * realm snap contexts. (later, we can do per-realm snap + * contexts locks..) the empty list contains realms with no + * references (implying they contain no inodes with caps) that + * should be destroyed. + */ + struct rw_semaphore snap_rwsem; + struct rb_root snap_realms; + struct list_head snap_empty; + spinlock_t snap_empty_lock; /* protect snap_empty */ + + u64 last_tid; /* most recent mds request */ + struct rb_root request_tree; /* pending mds requests */ + struct delayed_work delayed_work; /* delayed work */ + unsigned long last_renew_caps; /* last time we renewed our caps */ + struct list_head cap_delay_list; /* caps with delayed release */ + spinlock_t cap_delay_lock; /* protects cap_delay_list */ + struct list_head snap_flush_list; /* cap_snaps ready to flush */ + spinlock_t snap_flush_lock; + + u64 cap_flush_seq; + struct list_head cap_dirty; /* inodes with dirty caps */ + struct list_head cap_dirty_migrating; /* ...that are migration... */ + int num_cap_flushing; /* # caps we are flushing */ + spinlock_t cap_dirty_lock; /* protects above items */ + wait_queue_head_t cap_flushing_wq; + + /* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. + */ + spinlock_t caps_list_lock; + struct list_head caps_list; /* unused (reserved or + unreserved) */ + int caps_total_count; /* total caps allocated */ + int caps_use_count; /* in use */ + int caps_reserve_count; /* unused, reserved */ + int caps_avail_count; /* unused, unreserved */ + int caps_min_count; /* keep at least this many + (unreserved) */ + spinlock_t dentry_lru_lock; + struct list_head dentry_lru; + int num_dentry; +}; + +extern const char *ceph_mds_op_name(int op); + +extern struct ceph_mds_session * +__ceph_lookup_mds_session(struct ceph_mds_client *, int mds); + +static inline struct ceph_mds_session * +ceph_get_mds_session(struct ceph_mds_session *s) +{ + atomic_inc(&s->s_ref); + return s; +} + +extern const char *ceph_session_state_name(int s); + +extern void ceph_put_mds_session(struct ceph_mds_session *s); + +extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, + struct ceph_msg *msg, int mds); + +extern int ceph_mdsc_init(struct ceph_fs_client *fsc); +extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); + +extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); + +extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, + struct inode *inode, + struct dentry *dn); + +extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir); +extern struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); +extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); +extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); +static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) +{ + kref_get(&req->r_kref); +} +extern void ceph_mdsc_release_request(struct kref *kref); +static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) +{ + kref_put(&req->r_kref, ceph_mdsc_release_request); +} + +extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + +extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); + +extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap); + +extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); +extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq); + +extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); + +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); +extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + +#endif diff --git a/kernel/fs/ceph/mdsmap.c b/kernel/fs/ceph/mdsmap.c new file mode 100644 index 000000000..261531e55 --- /dev/null +++ b/kernel/fs/ceph/mdsmap.c @@ -0,0 +1,189 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "super.h" + + +/* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int n = 0; + int i; + + /* special case for one mds */ + if (1 == m->m_max_mds && m->m_info[0].state > 0) + return 0; + + /* count */ + for (i = 0; i < m->m_max_mds; i++) + if (m->m_info[i].state > 0) + n++; + if (n == 0) + return -1; + + /* pick */ + n = prandom_u32() % n; + i = 0; + for (i = 0; n > 0; i++, n--) + while (m->m_info[i].state <= 0) + i++; + + return i; +} + +/* + * Decode an MDS map + * + * Ignore any fields we don't care about (there are quite a few of + * them). + */ +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +{ + struct ceph_mdsmap *m; + const void *start = *p; + int i, j, n; + int err = -EINVAL; + u16 version; + + m = kzalloc(sizeof(*m), GFP_NOFS); + if (m == NULL) + return ERR_PTR(-ENOMEM); + + ceph_decode_16_safe(p, end, version, bad); + if (version > 3) { + pr_warn("got mdsmap version %d > 3, failing", version); + goto bad; + } + + ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); + m->m_epoch = ceph_decode_32(p); + m->m_client_epoch = ceph_decode_32(p); + m->m_last_failure = ceph_decode_32(p); + m->m_root = ceph_decode_32(p); + m->m_session_timeout = ceph_decode_32(p); + m->m_session_autoclose = ceph_decode_32(p); + m->m_max_file_size = ceph_decode_64(p); + m->m_max_mds = ceph_decode_32(p); + + m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + if (m->m_info == NULL) + goto badmem; + + /* pick out active nodes from mds_info (state > 0) */ + n = ceph_decode_32(p); + for (i = 0; i < n; i++) { + u64 global_id; + u32 namelen; + s32 mds, inc, state; + u64 state_seq; + u8 infoversion; + struct ceph_entity_addr addr; + u32 num_export_targets; + void *pexport_targets = NULL; + struct ceph_timespec laggy_since; + struct ceph_mds_info *info; + + ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + global_id = ceph_decode_64(p); + infoversion = ceph_decode_8(p); + *p += sizeof(u64); + namelen = ceph_decode_32(p); /* skip mds name */ + *p += namelen; + + ceph_decode_need(p, end, + 4*sizeof(u32) + sizeof(u64) + + sizeof(addr) + sizeof(struct ceph_timespec), + bad); + mds = ceph_decode_32(p); + inc = ceph_decode_32(p); + state = ceph_decode_32(p); + state_seq = ceph_decode_64(p); + ceph_decode_copy(p, &addr, sizeof(addr)); + ceph_decode_addr(&addr); + ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + *p += sizeof(u32); + ceph_decode_32_safe(p, end, namelen, bad); + *p += namelen; + if (infoversion >= 2) { + ceph_decode_32_safe(p, end, num_export_targets, bad); + pexport_targets = *p; + *p += num_export_targets * sizeof(u32); + } else { + num_export_targets = 0; + } + + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + i+1, n, global_id, mds, inc, + ceph_pr_addr(&addr.in_addr), + ceph_mds_state_name(state)); + + if (mds < 0 || mds >= m->m_max_mds || state <= 0) + continue; + + info = &m->m_info[mds]; + info->global_id = global_id; + info->state = state; + info->addr = addr; + info->laggy = (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + info->num_export_targets = num_export_targets; + if (num_export_targets) { + info->export_targets = kcalloc(num_export_targets, + sizeof(u32), GFP_NOFS); + if (info->export_targets == NULL) + goto badmem; + for (j = 0; j < num_export_targets; j++) + info->export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + info->export_targets = NULL; + } + } + + /* pg_pools */ + ceph_decode_32_safe(p, end, n, bad); + m->m_num_data_pg_pools = n; + m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); + if (!m->m_data_pg_pools) + goto badmem; + ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); + for (i = 0; i < n; i++) + m->m_data_pg_pools[i] = ceph_decode_64(p); + m->m_cas_pg_pool = ceph_decode_64(p); + + /* ok, we don't care about the rest. */ + dout("mdsmap_decode success epoch %u\n", m->m_epoch); + return m; + +badmem: + err = -ENOMEM; +bad: + pr_err("corrupt mdsmap\n"); + print_hex_dump(KERN_DEBUG, "mdsmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + ceph_mdsmap_destroy(m); + return ERR_PTR(err); +} + +void ceph_mdsmap_destroy(struct ceph_mdsmap *m) +{ + int i; + + for (i = 0; i < m->m_max_mds; i++) + kfree(m->m_info[i].export_targets); + kfree(m->m_info); + kfree(m->m_data_pg_pools); + kfree(m); +} diff --git a/kernel/fs/ceph/snap.c b/kernel/fs/ceph/snap.c new file mode 100644 index 000000000..a97e39f09 --- /dev/null +++ b/kernel/fs/ceph/snap.c @@ -0,0 +1,977 @@ +#include + +#include +#include + +#include "super.h" +#include "mds_client.h" + +#include + +/* + * Snapshots in ceph are driven in large part by cooperation from the + * client. In contrast to local file systems or file servers that + * implement snapshots at a single point in the system, ceph's + * distributed access to storage requires clients to help decide + * whether a write logically occurs before or after a recently created + * snapshot. + * + * This provides a perfect instantanous client-wide snapshot. Between + * clients, however, snapshots may appear to be applied at slightly + * different points in time, depending on delays in delivering the + * snapshot notification. + * + * Snapshots are _not_ file system-wide. Instead, each snapshot + * applies to the subdirectory nested beneath some directory. This + * effectively divides the hierarchy into multiple "realms," where all + * of the files contained by each realm share the same set of + * snapshots. An individual realm's snap set contains snapshots + * explicitly created on that realm, as well as any snaps in its + * parent's snap set _after_ the point at which the parent became it's + * parent (due to, say, a rename). Similarly, snaps from prior parents + * during the time intervals during which they were the parent are included. + * + * The client is spared most of this detail, fortunately... it must only + * maintains a hierarchy of realms reflecting the current parent/child + * realm relationship, and for each realm has an explicit list of snaps + * inherited from prior parents. + * + * A snap_realm struct is maintained for realms containing every inode + * with an open cap in the system. (The needed snap realm information is + * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' + * version number is used to ensure that as realm parameters change (new + * snapshot, new parent, etc.) the client's realm hierarchy is updated. + * + * The realm hierarchy drives the generation of a 'snap context' for each + * realm, which simply lists the resulting set of snaps for the realm. This + * is attached to any writes sent to OSDs. + */ +/* + * Unfortunately error handling is a bit mixed here. If we get a snap + * update, but don't have enough memory to update our realm hierarchy, + * it's not clear what we can do about it (besides complaining to the + * console). + */ + + +/* + * increase ref count for the realm + * + * caller must hold snap_rwsem for write. + */ +void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("get_realm %p %d -> %d\n", realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)+1); + /* + * since we _only_ increment realm refs or empty the empty + * list with snap_rwsem held, adjusting the empty list here is + * safe. we do need to protect against concurrent empty list + * additions, however. + */ + if (atomic_inc_return(&realm->nref) == 1) { + spin_lock(&mdsc->snap_empty_lock); + list_del_init(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + } +} + +static void __insert_snap_realm(struct rb_root *root, + struct ceph_snap_realm *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_snap_realm *r = NULL; + + while (*p) { + parent = *p; + r = rb_entry(parent, struct ceph_snap_realm, node); + if (new->ino < r->ino) + p = &(*p)->rb_left; + else if (new->ino > r->ino) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); +} + +/* + * create and get the realm rooted at @ino and bump its ref count. + * + * caller must hold snap_rwsem for write. + */ +static struct ceph_snap_realm *ceph_create_snap_realm( + struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *realm; + + realm = kzalloc(sizeof(*realm), GFP_NOFS); + if (!realm) + return ERR_PTR(-ENOMEM); + + atomic_set(&realm->nref, 1); /* for caller */ + realm->ino = ino; + INIT_LIST_HEAD(&realm->children); + INIT_LIST_HEAD(&realm->child_item); + INIT_LIST_HEAD(&realm->empty_item); + INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->inodes_with_caps); + spin_lock_init(&realm->inodes_with_caps_lock); + __insert_snap_realm(&mdsc->snap_realms, realm); + dout("create_snap_realm %llx %p\n", realm->ino, realm); + return realm; +} + +/* + * lookup the realm rooted at @ino. + * + * caller must hold snap_rwsem for write. + */ +static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct rb_node *n = mdsc->snap_realms.rb_node; + struct ceph_snap_realm *r; + + while (n) { + r = rb_entry(n, struct ceph_snap_realm, node); + if (ino < r->ino) + n = n->rb_left; + else if (ino > r->ino) + n = n->rb_right; + else { + dout("lookup_snap_realm %llx %p\n", r->ino, r); + return r; + } + } + return NULL; +} + +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *r; + r = __lookup_snap_realm(mdsc, ino); + if (r) + ceph_get_snap_realm(mdsc, r); + return r; +} + +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); + +/* + * called with snap_rwsem (write) + */ +static void __destroy_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + + rb_erase(&realm->node, &mdsc->snap_realms); + + if (realm->parent) { + list_del_init(&realm->child_item); + __put_snap_realm(mdsc, realm->parent); + } + + kfree(realm->prior_parent_snaps); + kfree(realm->snaps); + ceph_put_snap_context(realm->cached_context); + kfree(realm); +} + +/* + * caller holds snap_rwsem (write) + */ +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + if (atomic_dec_and_test(&realm->nref)) + __destroy_snap_realm(mdsc, realm); +} + +/* + * caller needn't hold any locks + */ +void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + if (!atomic_dec_and_test(&realm->nref)) + return; + + if (down_write_trylock(&mdsc->snap_rwsem)) { + __destroy_snap_realm(mdsc, realm); + up_write(&mdsc->snap_rwsem); + } else { + spin_lock(&mdsc->snap_empty_lock); + list_add(&realm->empty_item, &mdsc->snap_empty); + spin_unlock(&mdsc->snap_empty_lock); + } +} + +/* + * Clean up any realms whose ref counts have dropped to zero. Note + * that this does not include realms who were created but not yet + * used. + * + * Called under snap_rwsem (write) + */ +static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + struct ceph_snap_realm *realm; + + spin_lock(&mdsc->snap_empty_lock); + while (!list_empty(&mdsc->snap_empty)) { + realm = list_first_entry(&mdsc->snap_empty, + struct ceph_snap_realm, empty_item); + list_del(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + __destroy_snap_realm(mdsc, realm); + spin_lock(&mdsc->snap_empty_lock); + } + spin_unlock(&mdsc->snap_empty_lock); +} + +void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + down_write(&mdsc->snap_rwsem); + __cleanup_empty_realms(mdsc); + up_write(&mdsc->snap_rwsem); +} + +/* + * adjust the parent realm of a given @realm. adjust child list, and parent + * pointers, and ref counts appropriately. + * + * return true if parent was changed, 0 if unchanged, <0 on error. + * + * caller must hold snap_rwsem for write. + */ +static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, + u64 parentino) +{ + struct ceph_snap_realm *parent; + + if (realm->parent_ino == parentino) + return 0; + + parent = ceph_lookup_snap_realm(mdsc, parentino); + if (!parent) { + parent = ceph_create_snap_realm(mdsc, parentino); + if (IS_ERR(parent)) + return PTR_ERR(parent); + } + dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", + realm->ino, realm, realm->parent_ino, realm->parent, + parentino, parent); + if (realm->parent) { + list_del_init(&realm->child_item); + ceph_put_snap_realm(mdsc, realm->parent); + } + realm->parent_ino = parentino; + realm->parent = parent; + list_add(&realm->child_item, &parent->children); + return 1; +} + + +static int cmpu64_rev(const void *a, const void *b) +{ + if (*(u64 *)a < *(u64 *)b) + return 1; + if (*(u64 *)a > *(u64 *)b) + return -1; + return 0; +} + + +static struct ceph_snap_context *empty_snapc; + +/* + * build the snap context for a given realm. + */ +static int build_snap_context(struct ceph_snap_realm *realm) +{ + struct ceph_snap_realm *parent = realm->parent; + struct ceph_snap_context *snapc; + int err = 0; + u32 num = realm->num_prior_parent_snaps + realm->num_snaps; + + /* + * build parent context, if it hasn't been built. + * conservatively estimate that all parent snaps might be + * included by us. + */ + if (parent) { + if (!parent->cached_context) { + err = build_snap_context(parent); + if (err) + goto fail; + } + num += parent->cached_context->num_snaps; + } + + /* do i actually need to update? not if my context seq + matches realm seq, and my parents' does to. (this works + because we rebuild_snap_realms() works _downward_ in + hierarchy after each update.) */ + if (realm->cached_context && + realm->cached_context->seq == realm->seq && + (!parent || + realm->cached_context->seq >= parent->cached_context->seq)) { + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" + " (unchanged)\n", + realm->ino, realm, realm->cached_context, + realm->cached_context->seq, + (unsigned int) realm->cached_context->num_snaps); + return 0; + } + + if (num == 0 && realm->seq == empty_snapc->seq) { + ceph_get_snap_context(empty_snapc); + snapc = empty_snapc; + goto done; + } + + /* alloc new snap context */ + err = -ENOMEM; + if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) + goto fail; + snapc = ceph_create_snap_context(num, GFP_NOFS); + if (!snapc) + goto fail; + + /* build (reverse sorted) snap vector */ + num = 0; + snapc->seq = realm->seq; + if (parent) { + u32 i; + + /* include any of parent's snaps occurring _after_ my + parent became my parent */ + for (i = 0; i < parent->cached_context->num_snaps; i++) + if (parent->cached_context->snaps[i] >= + realm->parent_since) + snapc->snaps[num++] = + parent->cached_context->snaps[i]; + if (parent->cached_context->seq > snapc->seq) + snapc->seq = parent->cached_context->seq; + } + memcpy(snapc->snaps + num, realm->snaps, + sizeof(u64)*realm->num_snaps); + num += realm->num_snaps; + memcpy(snapc->snaps + num, realm->prior_parent_snaps, + sizeof(u64)*realm->num_prior_parent_snaps); + num += realm->num_prior_parent_snaps; + + sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); + snapc->num_snaps = num; + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", + realm->ino, realm, snapc, snapc->seq, + (unsigned int) snapc->num_snaps); + +done: + ceph_put_snap_context(realm->cached_context); + realm->cached_context = snapc; + return 0; + +fail: + /* + * if we fail, clear old (incorrect) cached_context... hopefully + * we'll have better luck building it later + */ + if (realm->cached_context) { + ceph_put_snap_context(realm->cached_context); + realm->cached_context = NULL; + } + pr_err("build_snap_context %llx %p fail %d\n", realm->ino, + realm, err); + return err; +} + +/* + * rebuild snap context for the given realm and all of its children. + */ +static void rebuild_snap_realms(struct ceph_snap_realm *realm) +{ + struct ceph_snap_realm *child; + + dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); + build_snap_context(realm); + + list_for_each_entry(child, &realm->children, child_item) + rebuild_snap_realms(child); +} + + +/* + * helper to allocate and decode an array of snapids. free prior + * instance, if any. + */ +static int dup_array(u64 **dst, __le64 *src, u32 num) +{ + u32 i; + + kfree(*dst); + if (num) { + *dst = kcalloc(num, sizeof(u64), GFP_NOFS); + if (!*dst) + return -ENOMEM; + for (i = 0; i < num; i++) + (*dst)[i] = get_unaligned_le64(src + i); + } else { + *dst = NULL; + } + return 0; +} + + +/* + * When a snapshot is applied, the size/mtime inode metadata is queued + * in a ceph_cap_snap (one for each snapshot) until writeback + * completes and the metadata can be flushed back to the MDS. + * + * However, if a (sync) write is currently in-progress when we apply + * the snapshot, we have to wait until the write succeeds or fails + * (and a final size/mtime is known). In this case the + * cap_snap->writing = 1, and is said to be "pending." When the write + * finishes, we __ceph_finish_cap_snap(). + * + * Caller must hold snap_rwsem for read (i.e., the realm topology won't + * change). + */ +void ceph_queue_cap_snap(struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap_snap *capsnap; + int used, dirty; + + capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); + return; + } + + spin_lock(&ci->i_ceph_lock); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + + /* + * If there is a write in progress, treat that as a dirty Fw, + * even though it hasn't completed yet; by the time we finish + * up this capsnap it will be. + */ + if (used & CEPH_CAP_FILE_WR) + dirty |= CEPH_CAP_FILE_WR; + + if (__ceph_have_pending_cap_snap(ci)) { + /* there is no point in queuing multiple "pending" cap_snaps, + as no new writes are allowed to start when pending, so any + writes in progress now were started before the previous + cap_snap. lucky us. */ + dout("queue_cap_snap %p already pending\n", inode); + kfree(capsnap); + } else if (ci->i_snap_realm->cached_context == empty_snapc) { + dout("queue_cap_snap %p empty snapc\n", inode); + kfree(capsnap); + } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { + struct ceph_snap_context *snapc = ci->i_head_snapc; + + /* + * if we are a sync write, we may need to go to the snaprealm + * to get the current snapc. + */ + if (!snapc) + snapc = ci->i_snap_realm->cached_context; + + dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", + inode, capsnap, snapc, ceph_cap_string(dirty)); + ihold(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; + } else { + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; + } + + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = snapc; + ci->i_head_snapc = + ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" new snapc is %p\n", ci->i_head_snapc); + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, snapc, snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + } else { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + kfree(capsnap); + } + + spin_unlock(&ci->i_ceph_lock); +} + +/* + * Finalize the size, mtime for a cap_snap.. that is, settle on final values + * to be used for the snapshot, to be flushed back to the mds. + * + * If capsnap can now be flushed, add to snap_flush list, and return 1. + * + * Caller must hold i_ceph_lock. + */ +int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + BUG_ON(capsnap->writing); + capsnap->size = inode->i_size; + capsnap->mtime = inode->i_mtime; + capsnap->atime = inode->i_atime; + capsnap->ctime = inode->i_ctime; + capsnap->time_warp_seq = ci->i_time_warp_seq; + if (capsnap->dirty_pages) { + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " + "still has %d dirty pages\n", inode, capsnap, + capsnap->context, capsnap->context->seq, + ceph_cap_string(capsnap->dirty), capsnap->size, + capsnap->dirty_pages); + return 0; + } + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", + inode, capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); + + spin_lock(&mdsc->snap_flush_lock); + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + spin_unlock(&mdsc->snap_flush_lock); + return 1; /* caller may want to ceph_flush_snaps */ +} + +/* + * Queue cap_snaps for snap writeback for this realm and its children. + * Called under snap_rwsem, so realm topology won't change. + */ +static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + struct ceph_snap_realm *child; + + dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, + i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + iput(lastinode); + + list_for_each_entry(child, &realm->children, child_item) { + dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", + realm, realm->ino, child, child->ino); + list_del_init(&child->dirty_item); + list_add(&child->dirty_item, &realm->dirty_item); + } + + list_del_init(&realm->dirty_item); + dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); +} + +/* + * Parse and apply a snapblob "snap trace" from the MDS. This specifies + * the snap realm parameters from a given realm and all of its ancestors, + * up to the root. + * + * Caller must hold snap_rwsem for write. + */ +int ceph_update_snap_trace(struct ceph_mds_client *mdsc, + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret) +{ + struct ceph_mds_snap_realm *ri; /* encoded */ + __le64 *snaps; /* encoded */ + __le64 *prior_parent_snaps; /* encoded */ + struct ceph_snap_realm *realm = NULL; + struct ceph_snap_realm *first_realm = NULL; + int invalidate = 0; + int err = -ENOMEM; + LIST_HEAD(dirty_realms); + + dout("update_snap_trace deletion=%d\n", deletion); +more: + ceph_decode_need(&p, e, sizeof(*ri), bad); + ri = p; + p += sizeof(*ri); + ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + + le32_to_cpu(ri->num_prior_parent_snaps)), bad); + snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_snaps); + prior_parent_snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); + + realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (IS_ERR(realm)) { + err = PTR_ERR(realm); + goto fail; + } + } + + /* ensure the parent is correct */ + err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); + if (err < 0) + goto fail; + invalidate += err; + + if (le64_to_cpu(ri->seq) > realm->seq) { + dout("update_snap_trace updating %llx %p %lld -> %lld\n", + realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); + /* update realm parameters, snap lists */ + realm->seq = le64_to_cpu(ri->seq); + realm->created = le64_to_cpu(ri->created); + realm->parent_since = le64_to_cpu(ri->parent_since); + + realm->num_snaps = le32_to_cpu(ri->num_snaps); + err = dup_array(&realm->snaps, snaps, realm->num_snaps); + if (err < 0) + goto fail; + + realm->num_prior_parent_snaps = + le32_to_cpu(ri->num_prior_parent_snaps); + err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, + realm->num_prior_parent_snaps); + if (err < 0) + goto fail; + + /* queue realm for cap_snap creation */ + list_add(&realm->dirty_item, &dirty_realms); + + invalidate = 1; + } else if (!realm->cached_context) { + dout("update_snap_trace %llx %p seq %lld new\n", + realm->ino, realm, realm->seq); + invalidate = 1; + } else { + dout("update_snap_trace %llx %p seq %lld unchanged\n", + realm->ino, realm, realm->seq); + } + + dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, + realm, invalidate, p, e); + + /* invalidate when we reach the _end_ (root) of the trace */ + if (invalidate && p >= e) + rebuild_snap_realms(realm); + + if (!first_realm) + first_realm = realm; + else + ceph_put_snap_realm(mdsc, realm); + + if (p < e) + goto more; + + /* + * queue cap snaps _after_ we've built the new snap contexts, + * so that i_head_snapc can be set appropriately. + */ + while (!list_empty(&dirty_realms)) { + realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, + dirty_item); + queue_realm_cap_snaps(realm); + } + + if (realm_ret) + *realm_ret = first_realm; + else + ceph_put_snap_realm(mdsc, first_realm); + + __cleanup_empty_realms(mdsc); + return 0; + +bad: + err = -EINVAL; +fail: + if (realm && !IS_ERR(realm)) + ceph_put_snap_realm(mdsc, realm); + if (first_realm) + ceph_put_snap_realm(mdsc, first_realm); + pr_err("update_snap_trace error %d\n", err); + return err; +} + + +/* + * Send any cap_snaps that are queued for flush. Try to carry + * s_mutex across multiple snap flushes to avoid locking overhead. + * + * Caller holds no locks. + */ +static void flush_snaps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + struct inode *inode; + struct ceph_mds_session *session = NULL; + + dout("flush_snaps\n"); + spin_lock(&mdsc->snap_flush_lock); + while (!list_empty(&mdsc->snap_flush_list)) { + ci = list_first_entry(&mdsc->snap_flush_list, + struct ceph_inode_info, i_snap_flush_item); + inode = &ci->vfs_inode; + ihold(inode); + spin_unlock(&mdsc->snap_flush_lock); + spin_lock(&ci->i_ceph_lock); + __ceph_flush_snaps(ci, &session, 0); + spin_unlock(&ci->i_ceph_lock); + iput(inode); + spin_lock(&mdsc->snap_flush_lock); + } + spin_unlock(&mdsc->snap_flush_lock); + + if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } + dout("flush_snaps done\n"); +} + + +/* + * Handle a snap notification from the MDS. + * + * This can take two basic forms: the simplest is just a snap creation + * or deletion notification on an existing realm. This should update the + * realm and its children. + * + * The more difficult case is realm creation, due to snap creation at a + * new point in the file hierarchy, or due to a rename that moves a file or + * directory into another realm. + */ +void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + int mds = session->s_mds; + u64 split; + int op; + int trace_len; + struct ceph_snap_realm *realm = NULL; + void *p = msg->front.iov_base; + void *e = p + msg->front.iov_len; + struct ceph_mds_snap_head *h; + int num_split_inos, num_split_realms; + __le64 *split_inos = NULL, *split_realms = NULL; + int i; + int locked_rwsem = 0; + + /* decode */ + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = p; + op = le32_to_cpu(h->op); + split = le64_to_cpu(h->split); /* non-zero if we are splitting an + * existing realm */ + num_split_inos = le32_to_cpu(h->num_split_inos); + num_split_realms = le32_to_cpu(h->num_split_realms); + trace_len = le32_to_cpu(h->trace_len); + p += sizeof(*h); + + dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, + ceph_snap_op_name(op), split, trace_len); + + mutex_lock(&session->s_mutex); + session->s_seq++; + mutex_unlock(&session->s_mutex); + + down_write(&mdsc->snap_rwsem); + locked_rwsem = 1; + + if (op == CEPH_SNAP_OP_SPLIT) { + struct ceph_mds_snap_realm *ri; + + /* + * A "split" breaks part of an existing realm off into + * a new realm. The MDS provides a list of inodes + * (with caps) and child realms that belong to the new + * child. + */ + split_inos = p; + p += sizeof(u64) * num_split_inos; + split_realms = p; + p += sizeof(u64) * num_split_realms; + ceph_decode_need(&p, e, sizeof(*ri), bad); + /* we will peek at realm info here, but will _not_ + * advance p, as the realm update will occur below in + * ceph_update_snap_trace. */ + ri = p; + + realm = ceph_lookup_snap_realm(mdsc, split); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, split); + if (IS_ERR(realm)) + goto out; + } + + dout("splitting snap_realm %llx %p\n", realm->ino, realm); + for (i = 0; i < num_split_inos; i++) { + struct ceph_vino vino = { + .ino = le64_to_cpu(split_inos[i]), + .snap = CEPH_NOSNAP, + }; + struct inode *inode = ceph_find_inode(sb, vino); + struct ceph_inode_info *ci; + struct ceph_snap_realm *oldrealm; + + if (!inode) + continue; + ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (!ci->i_snap_realm) + goto skip_inode; + /* + * If this inode belongs to a realm that was + * created after our new realm, we experienced + * a race (due to another split notifications + * arriving from a different MDS). So skip + * this inode. + */ + if (ci->i_snap_realm->created > + le64_to_cpu(ri->created)) { + dout(" leaving %p in newer realm %llx %p\n", + inode, ci->i_snap_realm->ino, + ci->i_snap_realm); + goto skip_inode; + } + dout(" will move %p to split realm %llx %p\n", + inode, realm->ino, realm); + /* + * Move the inode to the new realm + */ + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + oldrealm = ci->i_snap_realm; + ci->i_snap_realm = realm; + spin_unlock(&realm->inodes_with_caps_lock); + spin_unlock(&ci->i_ceph_lock); + + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); + + iput(inode); + continue; + +skip_inode: + spin_unlock(&ci->i_ceph_lock); + iput(inode); + } + + /* we may have taken some of the old realm's children. */ + for (i = 0; i < num_split_realms; i++) { + struct ceph_snap_realm *child = + __lookup_snap_realm(mdsc, + le64_to_cpu(split_realms[i])); + if (!child) + continue; + adjust_snap_realm_parent(mdsc, child, realm->ino); + } + } + + /* + * update using the provided snap trace. if we are deleting a + * snap, we can avoid queueing cap_snaps. + */ + ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY, NULL); + + if (op == CEPH_SNAP_OP_SPLIT) + /* we took a reference when we created the realm, above */ + ceph_put_snap_realm(mdsc, realm); + + __cleanup_empty_realms(mdsc); + + up_write(&mdsc->snap_rwsem); + + flush_snaps(mdsc); + return; + +bad: + pr_err("corrupt snap message from mds%d\n", mds); + ceph_msg_dump(msg); +out: + if (locked_rwsem) + up_write(&mdsc->snap_rwsem); + return; +} + +int __init ceph_snap_init(void) +{ + empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!empty_snapc) + return -ENOMEM; + empty_snapc->seq = 1; + return 0; +} + +void ceph_snap_exit(void) +{ + ceph_put_snap_context(empty_snapc); +} diff --git a/kernel/fs/ceph/strings.c b/kernel/fs/ceph/strings.c new file mode 100644 index 000000000..89e6bc321 --- /dev/null +++ b/kernel/fs/ceph/strings.c @@ -0,0 +1,125 @@ +/* + * Ceph fs string constants + */ +#include +#include + + +const char *ceph_mds_state_name(int s) +{ + switch (s) { + /* down and out */ + case CEPH_MDS_STATE_DNE: return "down:dne"; + case CEPH_MDS_STATE_STOPPED: return "down:stopped"; + /* up and out */ + case CEPH_MDS_STATE_BOOT: return "up:boot"; + case CEPH_MDS_STATE_STANDBY: return "up:standby"; + case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay"; + case CEPH_MDS_STATE_CREATING: return "up:creating"; + case CEPH_MDS_STATE_STARTING: return "up:starting"; + /* up and in */ + case CEPH_MDS_STATE_REPLAY: return "up:replay"; + case CEPH_MDS_STATE_RESOLVE: return "up:resolve"; + case CEPH_MDS_STATE_RECONNECT: return "up:reconnect"; + case CEPH_MDS_STATE_REJOIN: return "up:rejoin"; + case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay"; + case CEPH_MDS_STATE_ACTIVE: return "up:active"; + case CEPH_MDS_STATE_STOPPING: return "up:stopping"; + } + return "???"; +} + +const char *ceph_session_op_name(int op) +{ + switch (op) { + case CEPH_SESSION_REQUEST_OPEN: return "request_open"; + case CEPH_SESSION_OPEN: return "open"; + case CEPH_SESSION_REQUEST_CLOSE: return "request_close"; + case CEPH_SESSION_CLOSE: return "close"; + case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps"; + case CEPH_SESSION_RENEWCAPS: return "renewcaps"; + case CEPH_SESSION_STALE: return "stale"; + case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; + } + return "???"; +} + +const char *ceph_mds_op_name(int op) +{ + switch (op) { + case CEPH_MDS_OP_LOOKUP: return "lookup"; + case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; + case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; + case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_SETXATTR: return "setxattr"; + case CEPH_MDS_OP_SETATTR: return "setattr"; + case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_SETLAYOUT: return "setlayou"; + case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout"; + case CEPH_MDS_OP_READDIR: return "readdir"; + case CEPH_MDS_OP_MKNOD: return "mknod"; + case CEPH_MDS_OP_LINK: return "link"; + case CEPH_MDS_OP_UNLINK: return "unlink"; + case CEPH_MDS_OP_RENAME: return "rename"; + case CEPH_MDS_OP_MKDIR: return "mkdir"; + case CEPH_MDS_OP_RMDIR: return "rmdir"; + case CEPH_MDS_OP_SYMLINK: return "symlink"; + case CEPH_MDS_OP_CREATE: return "create"; + case CEPH_MDS_OP_OPEN: return "open"; + case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap"; + case CEPH_MDS_OP_LSSNAP: return "lssnap"; + case CEPH_MDS_OP_MKSNAP: return "mksnap"; + case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_RENAMESNAP: return "renamesnap"; + case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; + case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; + } + return "???"; +} + +const char *ceph_cap_op_name(int op) +{ + switch (op) { + case CEPH_CAP_OP_GRANT: return "grant"; + case CEPH_CAP_OP_REVOKE: return "revoke"; + case CEPH_CAP_OP_TRUNC: return "trunc"; + case CEPH_CAP_OP_EXPORT: return "export"; + case CEPH_CAP_OP_IMPORT: return "import"; + case CEPH_CAP_OP_UPDATE: return "update"; + case CEPH_CAP_OP_DROP: return "drop"; + case CEPH_CAP_OP_FLUSH: return "flush"; + case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack"; + case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap"; + case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack"; + case CEPH_CAP_OP_RELEASE: return "release"; + case CEPH_CAP_OP_RENEW: return "renew"; + } + return "???"; +} + +const char *ceph_lease_op_name(int o) +{ + switch (o) { + case CEPH_MDS_LEASE_REVOKE: return "revoke"; + case CEPH_MDS_LEASE_RELEASE: return "release"; + case CEPH_MDS_LEASE_RENEW: return "renew"; + case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack"; + } + return "???"; +} + +const char *ceph_snap_op_name(int o) +{ + switch (o) { + case CEPH_SNAP_OP_UPDATE: return "update"; + case CEPH_SNAP_OP_CREATE: return "create"; + case CEPH_SNAP_OP_DESTROY: return "destroy"; + case CEPH_SNAP_OP_SPLIT: return "split"; + } + return "???"; +} diff --git a/kernel/fs/ceph/super.c b/kernel/fs/ceph/super.c new file mode 100644 index 000000000..4e9905374 --- /dev/null +++ b/kernel/fs/ceph/super.c @@ -0,0 +1,1059 @@ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include "cache.h" + +#include +#include +#include +#include +#include + +/* + * Ceph superblock operations + * + * Handle the basics of mounting, unmounting. + */ + +/* + * super ops + */ +static void ceph_put_super(struct super_block *s) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(s); + + dout("put_super\n"); + ceph_mdsc_close_sessions(fsc->mdsc); +} + +static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); + struct ceph_monmap *monmap = fsc->client->monc.monmap; + struct ceph_statfs st; + u64 fsid; + int err; + + dout("statfs\n"); + err = ceph_monc_do_statfs(&fsc->client->monc, &st); + if (err < 0) + return err; + + /* fill in kstatfs */ + buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ + + /* + * express utilization in terms of large blocks to avoid + * overflow on 32-bit machines. + * + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! + */ + buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + + buf->f_files = le64_to_cpu(st.num_objects); + buf->f_ffree = -1; + buf->f_namelen = NAME_MAX; + + /* leave fsid little-endian, regardless of host endianness */ + fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); + buf->f_fsid.val[0] = fsid & 0xffffffff; + buf->f_fsid.val[1] = fsid >> 32; + + return 0; +} + + +static int ceph_sync_fs(struct super_block *sb, int wait) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + if (!wait) { + dout("sync_fs (non-blocking)\n"); + ceph_flush_dirty_caps(fsc->mdsc); + dout("sync_fs (non-blocking) done\n"); + return 0; + } + + dout("sync_fs (blocking)\n"); + ceph_osdc_sync(&fsc->client->osdc); + ceph_mdsc_sync(fsc->mdsc); + dout("sync_fs (blocking) done\n"); + return 0; +} + +/* + * mount options + */ +enum { + Opt_wsize, + Opt_rsize, + Opt_rasize, + Opt_caps_wanted_delay_min, + Opt_caps_wanted_delay_max, + Opt_cap_release_safety, + Opt_readdir_max_entries, + Opt_readdir_max_bytes, + Opt_congestion_kb, + Opt_last_int, + /* int args above */ + Opt_snapdirname, + Opt_last_string, + /* string args above */ + Opt_dirstat, + Opt_nodirstat, + Opt_rbytes, + Opt_norbytes, + Opt_asyncreaddir, + Opt_noasyncreaddir, + Opt_dcache, + Opt_nodcache, + Opt_ino32, + Opt_noino32, + Opt_fscache, + Opt_nofscache, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + Opt_acl, +#endif + Opt_noacl +}; + +static match_table_t fsopt_tokens = { + {Opt_wsize, "wsize=%d"}, + {Opt_rsize, "rsize=%d"}, + {Opt_rasize, "rasize=%d"}, + {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, + {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_cap_release_safety, "cap_release_safety=%d"}, + {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, + {Opt_congestion_kb, "write_congestion_kb=%d"}, + /* int args above */ + {Opt_snapdirname, "snapdirname=%s"}, + /* string args above */ + {Opt_dirstat, "dirstat"}, + {Opt_nodirstat, "nodirstat"}, + {Opt_rbytes, "rbytes"}, + {Opt_norbytes, "norbytes"}, + {Opt_asyncreaddir, "asyncreaddir"}, + {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_dcache, "dcache"}, + {Opt_nodcache, "nodcache"}, + {Opt_ino32, "ino32"}, + {Opt_noino32, "noino32"}, + {Opt_fscache, "fsc"}, + {Opt_nofscache, "nofsc"}, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + {Opt_acl, "acl"}, +#endif + {Opt_noacl, "noacl"}, + {-1, NULL} +}; + +static int parse_fsopt_token(char *c, void *private) +{ + struct ceph_mount_options *fsopt = private; + substring_t argstr[MAX_OPT_ARGS]; + int token, intval, ret; + + token = match_token((char *)c, fsopt_tokens, argstr); + if (token < 0) + return -EINVAL; + + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + return ret; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + + switch (token) { + case Opt_snapdirname: + kfree(fsopt->snapdir_name); + fsopt->snapdir_name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->snapdir_name) + return -ENOMEM; + break; + + /* misc */ + case Opt_wsize: + fsopt->wsize = intval; + break; + case Opt_rsize: + fsopt->rsize = intval; + break; + case Opt_rasize: + fsopt->rasize = intval; + break; + case Opt_caps_wanted_delay_min: + fsopt->caps_wanted_delay_min = intval; + break; + case Opt_caps_wanted_delay_max: + fsopt->caps_wanted_delay_max = intval; + break; + case Opt_readdir_max_entries: + fsopt->max_readdir = intval; + break; + case Opt_readdir_max_bytes: + fsopt->max_readdir_bytes = intval; + break; + case Opt_congestion_kb: + fsopt->congestion_kb = intval; + break; + case Opt_dirstat: + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_nodirstat: + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_rbytes: + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_norbytes: + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_asyncreaddir: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + case Opt_noasyncreaddir: + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + case Opt_dcache: + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_nodcache: + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_ino32: + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + break; + case Opt_noino32: + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; + case Opt_fscache: + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + break; + case Opt_nofscache: + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + break; +#ifdef CONFIG_CEPH_FS_POSIX_ACL + case Opt_acl: + fsopt->sb_flags |= MS_POSIXACL; + break; +#endif + case Opt_noacl: + fsopt->sb_flags &= ~MS_POSIXACL; + break; + default: + BUG_ON(token); + } + return 0; +} + +static void destroy_mount_options(struct ceph_mount_options *args) +{ + dout("destroy_mount_options %p\n", args); + kfree(args->snapdir_name); + kfree(args); +} + +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +static int compare_mount_options(struct ceph_mount_options *new_fsopt, + struct ceph_options *new_opt, + struct ceph_fs_client *fsc) +{ + struct ceph_mount_options *fsopt1 = new_fsopt; + struct ceph_mount_options *fsopt2 = fsc->mount_options; + int ofs = offsetof(struct ceph_mount_options, snapdir_name); + int ret; + + ret = memcmp(fsopt1, fsopt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + + return ceph_compare_options(new_opt, fsc->client); +} + +static int parse_mount_options(struct ceph_mount_options **pfsopt, + struct ceph_options **popt, + int flags, char *options, + const char *dev_name, + const char **path) +{ + struct ceph_mount_options *fsopt; + const char *dev_name_end; + int err; + + if (!dev_name || !*dev_name) + return -EINVAL; + + fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); + if (!fsopt) + return -ENOMEM; + + dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); + + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) { + err = -ENOMEM; + goto out; + } + + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* + * Distinguish the server list from the path in "dev_name". + * Internally we do not include the leading '/' in the path. + * + * "dev_name" will look like: + * [,...]:[] + * where + * is [:] + * is optional, but if present must begin with '/' + */ + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + /* skip over leading '/' for path */ + *path = dev_name_end + 1; + } else { + /* path is empty */ + dev_name_end = dev_name + strlen(dev_name); + *path = dev_name_end; + } + err = -EINVAL; + dev_name_end--; /* back up to ':' separator */ + if (dev_name_end < dev_name || *dev_name_end != ':') { + pr_err("device name is missing path (no : separator in %s)\n", + dev_name); + goto out; + } + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + dout("server path '%s'\n", *path); + + *popt = ceph_parse_options(options, dev_name, dev_name_end, + parse_fsopt_token, (void *)fsopt); + if (IS_ERR(*popt)) { + err = PTR_ERR(*popt); + goto out; + } + + /* success */ + *pfsopt = fsopt; + return 0; + +out: + destroy_mount_options(fsopt); + return err; +} + +/** + * ceph_show_options - Show mount options in /proc/mounts + * @m: seq_file to write to + * @root: root of that (sub)tree + */ +static int ceph_show_options(struct seq_file *m, struct dentry *root) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); + struct ceph_mount_options *fsopt = fsc->mount_options; + size_t pos; + int ret; + + /* a comma between MNT/MS and client options */ + seq_putc(m, ','); + pos = m->count; + + ret = ceph_print_client_options(m, fsc->client); + if (ret) + return ret; + + /* retract our comma if no client options */ + if (m->count == pos) + m->count--; + + if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) + seq_puts(m, ",norbytes"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) + seq_puts(m, ",noasyncreaddir"); + if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) + seq_puts(m, ",nodcache"); + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) + seq_puts(m, ",fsc"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + if (fsopt->sb_flags & MS_POSIXACL) + seq_puts(m, ",acl"); + else + seq_puts(m, ",noacl"); +#endif + + if (fsopt->wsize) + seq_printf(m, ",wsize=%d", fsopt->wsize); + if (fsopt->rsize != CEPH_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) + seq_printf(m, ",rasize=%d", fsopt->rasize); + if (fsopt->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + fsopt->caps_wanted_delay_min); + if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + fsopt->caps_wanted_delay_max); + if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + fsopt->cap_release_safety); + if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + + return 0; +} + +/* + * handle any mon messages the standard library doesn't understand. + * return error if we don't either. + */ +static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = client->private; + int type = le16_to_cpu(msg->hdr.type); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(fsc->mdsc, msg); + return 0; + + default: + return -1; + } +} + +/* + * create a new fs client + */ +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, + struct ceph_options *opt) +{ + struct ceph_fs_client *fsc; + const u64 supported_features = + CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDS_INLINE_DATA; + const u64 required_features = 0; + int page_count; + size_t size; + int err = -ENOMEM; + + fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); + if (!fsc) + return ERR_PTR(-ENOMEM); + + fsc->client = ceph_create_client(opt, fsc, supported_features, + required_features); + if (IS_ERR(fsc->client)) { + err = PTR_ERR(fsc->client); + goto fail; + } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->monc.want_mdsmap = 1; + + fsc->mount_options = fsopt; + + fsc->sb = NULL; + fsc->mount_state = CEPH_MOUNT_MOUNTING; + + atomic_long_set(&fsc->writeback_count, 0); + + err = bdi_init(&fsc->backing_dev_info); + if (err < 0) + goto fail_client; + + err = -ENOMEM; + /* + * The number of concurrent works can be high but they don't need + * to be processed in parallel, limit concurrency. + */ + fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); + if (fsc->wb_wq == NULL) + goto fail_bdi; + fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); + if (fsc->pg_inv_wq == NULL) + goto fail_wb_wq; + fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); + if (fsc->trunc_wq == NULL) + goto fail_pg_inv_wq; + + /* set up mempools */ + err = -ENOMEM; + page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; + size = sizeof (struct page *) * (page_count ? page_count : 1); + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); + if (!fsc->wb_pagevec_pool) + goto fail_trunc_wq; + + /* setup fscache */ + if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) && + (ceph_fscache_register_fs(fsc) != 0)) + goto fail_fscache; + + /* caps */ + fsc->min_caps = fsopt->max_readdir; + + return fsc; + +fail_fscache: + ceph_fscache_unregister_fs(fsc); +fail_trunc_wq: + destroy_workqueue(fsc->trunc_wq); +fail_pg_inv_wq: + destroy_workqueue(fsc->pg_inv_wq); +fail_wb_wq: + destroy_workqueue(fsc->wb_wq); +fail_bdi: + bdi_destroy(&fsc->backing_dev_info); +fail_client: + ceph_destroy_client(fsc->client); +fail: + kfree(fsc); + return ERR_PTR(err); +} + +static void destroy_fs_client(struct ceph_fs_client *fsc) +{ + dout("destroy_fs_client %p\n", fsc); + + ceph_fscache_unregister_fs(fsc); + + destroy_workqueue(fsc->wb_wq); + destroy_workqueue(fsc->pg_inv_wq); + destroy_workqueue(fsc->trunc_wq); + + bdi_destroy(&fsc->backing_dev_info); + + mempool_destroy(fsc->wb_pagevec_pool); + + destroy_mount_options(fsc->mount_options); + + ceph_fs_debugfs_cleanup(fsc); + + ceph_destroy_client(fsc->client); + + kfree(fsc); + dout("destroy_fs_client %p done\n", fsc); +} + +/* + * caches + */ +struct kmem_cache *ceph_inode_cachep; +struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_dentry_cachep; +struct kmem_cache *ceph_file_cachep; + +static void ceph_inode_init_once(void *foo) +{ + struct ceph_inode_info *ci = foo; + inode_init_once(&ci->vfs_inode); +} + +static int __init init_caches(void) +{ + int error = -ENOMEM; + + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", + sizeof(struct ceph_inode_info), + __alignof__(struct ceph_inode_info), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + ceph_inode_init_once); + if (ceph_inode_cachep == NULL) + return -ENOMEM; + + ceph_cap_cachep = KMEM_CACHE(ceph_cap, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_cachep == NULL) + goto bad_cap; + + ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_dentry_cachep == NULL) + goto bad_dentry; + + ceph_file_cachep = KMEM_CACHE(ceph_file_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) + goto bad_file; + + if ((error = ceph_fscache_register())) + goto bad_file; + + return 0; +bad_file: + kmem_cache_destroy(ceph_dentry_cachep); +bad_dentry: + kmem_cache_destroy(ceph_cap_cachep); +bad_cap: + kmem_cache_destroy(ceph_inode_cachep); + return error; +} + +static void destroy_caches(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + kmem_cache_destroy(ceph_inode_cachep); + kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_dentry_cachep); + kmem_cache_destroy(ceph_file_cachep); + + ceph_fscache_unregister(); +} + + +/* + * ceph_umount_begin - initiate forced umount. Tear down down the + * mount, skipping steps that may hang while waiting for server(s). + */ +static void ceph_umount_begin(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + dout("ceph_umount_begin - starting forced umount\n"); + if (!fsc) + return; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + return; +} + +static const struct super_operations ceph_super_ops = { + .alloc_inode = ceph_alloc_inode, + .destroy_inode = ceph_destroy_inode, + .write_inode = ceph_write_inode, + .drop_inode = ceph_drop_inode, + .sync_fs = ceph_sync_fs, + .put_super = ceph_put_super, + .show_options = ceph_show_options, + .statfs = ceph_statfs, + .umount_begin = ceph_umount_begin, +}; + +/* + * Bootstrap mount by opening the root directory. Note the mount + * @started time from caller, and time out if this takes too long. + */ +static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, + const char *path, + unsigned long started) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req = NULL; + int err; + struct dentry *root; + + /* open dir */ + dout("open_root_inode opening '%s'\n", path); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_path1 = kstrdup(path, GFP_NOFS); + if (!req->r_path1) { + root = ERR_PTR(-ENOMEM); + goto out; + } + + req->r_ino1.ino = CEPH_INO_ROOT; + req->r_ino1.snap = CEPH_NOSNAP; + req->r_started = started; + req->r_timeout = fsc->client->options->mount_timeout * HZ; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == 0) { + struct inode *inode = req->r_target_inode; + req->r_target_inode = NULL; + dout("open_root_inode success\n"); + if (ceph_ino(inode) == CEPH_INO_ROOT && + fsc->sb->s_root == NULL) { + root = d_make_root(inode); + if (!root) { + root = ERR_PTR(-ENOMEM); + goto out; + } + } else { + root = d_obtain_root(inode); + } + ceph_init_dentry(root); + dout("open_root_inode success, root dentry is %p\n", root); + } else { + root = ERR_PTR(err); + } +out: + ceph_mdsc_put_request(req); + return root; +} + + + + +/* + * mount: join the ceph cluster, and open root directory. + */ +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, + const char *path) +{ + int err; + unsigned long started = jiffies; /* note the start time */ + struct dentry *root; + int first = 0; /* first vfsmount for this super_block */ + + dout("mount start\n"); + mutex_lock(&fsc->client->mount_mutex); + + err = __ceph_open_session(fsc->client, started); + if (err < 0) + goto out; + + dout("mount opening root\n"); + root = open_root_dentry(fsc, "", started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + if (fsc->sb->s_root) { + dput(root); + } else { + fsc->sb->s_root = root; + first = 1; + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto fail; + } + + if (path[0] == 0) { + dget(root); + } else { + dout("mount opening base mountpoint\n"); + root = open_root_dentry(fsc, path, started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + } + + fsc->mount_state = CEPH_MOUNT_MOUNTED; + dout("mount success\n"); + mutex_unlock(&fsc->client->mount_mutex); + return root; + +out: + mutex_unlock(&fsc->client->mount_mutex); + return ERR_PTR(err); + +fail: + if (first) { + dput(fsc->sb->s_root); + fsc->sb->s_root = NULL; + } + goto out; +} + +static int ceph_set_super(struct super_block *s, void *data) +{ + struct ceph_fs_client *fsc = data; + int ret; + + dout("set_super %p data %p\n", s, data); + + s->s_flags = fsc->mount_options->sb_flags; + s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ + + s->s_xattr = ceph_xattr_handlers; + s->s_fs_info = fsc; + fsc->sb = s; + + s->s_op = &ceph_super_ops; + s->s_export_op = &ceph_export_ops; + + s->s_time_gran = 1000; /* 1000 ns == 1 us */ + + ret = set_anon_super(s, NULL); /* what is that second arg for? */ + if (ret != 0) + goto fail; + + return ret; + +fail: + s->s_fs_info = NULL; + fsc->sb = NULL; + return ret; +} + +/* + * share superblock if same fs AND options + */ +static int ceph_compare_super(struct super_block *sb, void *data) +{ + struct ceph_fs_client *new = data; + struct ceph_mount_options *fsopt = new->mount_options; + struct ceph_options *opt = new->client->options; + struct ceph_fs_client *other = ceph_sb_to_client(sb); + + dout("ceph_compare_super %p\n", sb); + + if (compare_mount_options(fsopt, opt, other)) { + dout("monitor(s)/mount options don't match\n"); + return 0; + } + if ((opt->flags & CEPH_OPT_FSID) && + ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + dout("fsid doesn't match\n"); + return 0; + } + if (fsopt->sb_flags != other->mount_options->sb_flags) { + dout("flags differ\n"); + return 0; + } + return 1; +} + +/* + * construct our own bdi so we can control readahead, etc. + */ +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + +static int ceph_register_bdi(struct super_block *sb, + struct ceph_fs_client *fsc) +{ + int err; + + /* set ra_pages based on rasize mount option? */ + if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) + fsc->backing_dev_info.ra_pages = + (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + else + fsc->backing_dev_info.ra_pages = + VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; + + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", + atomic_long_inc_return(&bdi_seq)); + if (!err) + sb->s_bdi = &fsc->backing_dev_info; + return err; +} + +static struct dentry *ceph_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct super_block *sb; + struct ceph_fs_client *fsc; + struct dentry *res; + int err; + int (*compare_super)(struct super_block *, void *) = ceph_compare_super; + const char *path = NULL; + struct ceph_mount_options *fsopt = NULL; + struct ceph_options *opt = NULL; + + dout("ceph_mount\n"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + flags |= MS_POSIXACL; +#endif + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + if (err < 0) { + res = ERR_PTR(err); + goto out_final; + } + + /* create client (which we may/may not use) */ + fsc = create_fs_client(fsopt, opt); + if (IS_ERR(fsc)) { + res = ERR_CAST(fsc); + destroy_mount_options(fsopt); + ceph_destroy_options(opt); + goto out_final; + } + + err = ceph_mdsc_init(fsc); + if (err < 0) { + res = ERR_PTR(err); + goto out; + } + + if (ceph_test_opt(fsc->client, NOSHARE)) + compare_super = NULL; + sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); + if (IS_ERR(sb)) { + res = ERR_CAST(sb); + goto out; + } + + if (ceph_sb_to_client(sb) != fsc) { + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + fsc = ceph_sb_to_client(sb); + dout("get_sb got existing client %p\n", fsc); + } else { + dout("get_sb using new client %p\n", fsc); + err = ceph_register_bdi(sb, fsc); + if (err < 0) { + res = ERR_PTR(err); + goto out_splat; + } + } + + res = ceph_real_mount(fsc, path); + if (IS_ERR(res)) + goto out_splat; + dout("root %p inode %p ino %llx.%llx\n", res, + d_inode(res), ceph_vinop(d_inode(res))); + return res; + +out_splat: + ceph_mdsc_close_sessions(fsc->mdsc); + deactivate_locked_super(sb); + goto out_final; + +out: + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); +out_final: + dout("ceph_mount fail %ld\n", PTR_ERR(res)); + return res; +} + +static void ceph_kill_sb(struct super_block *s) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(s); + dev_t dev = s->s_dev; + + dout("kill_sb %p\n", s); + + ceph_mdsc_pre_umount(fsc->mdsc); + generic_shutdown_super(s); + ceph_mdsc_destroy(fsc); + + destroy_fs_client(fsc); + free_anon_bdev(dev); +} + +static struct file_system_type ceph_fs_type = { + .owner = THIS_MODULE, + .name = "ceph", + .mount = ceph_mount, + .kill_sb = ceph_kill_sb, + .fs_flags = FS_RENAME_DOES_D_MOVE, +}; +MODULE_ALIAS_FS("ceph"); + +static int __init init_ceph(void) +{ + int ret = init_caches(); + if (ret) + goto out; + + ceph_flock_init(); + ceph_xattr_init(); + ret = ceph_snap_init(); + if (ret) + goto out_xattr; + ret = register_filesystem(&ceph_fs_type); + if (ret) + goto out_snap; + + pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); + + return 0; + +out_snap: + ceph_snap_exit(); +out_xattr: + ceph_xattr_exit(); + destroy_caches(); +out: + return ret; +} + +static void __exit exit_ceph(void) +{ + dout("exit_ceph\n"); + unregister_filesystem(&ceph_fs_type); + ceph_snap_exit(); + ceph_xattr_exit(); + destroy_caches(); +} + +module_init(init_ceph); +module_exit(exit_ceph); + +MODULE_AUTHOR("Sage Weil "); +MODULE_AUTHOR("Yehuda Sadeh "); +MODULE_AUTHOR("Patience Warnick "); +MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/ceph/super.h b/kernel/fs/ceph/super.h new file mode 100644 index 000000000..fa20e1318 --- /dev/null +++ b/kernel/fs/ceph/super.h @@ -0,0 +1,946 @@ +#ifndef _FS_CEPH_SUPER_H +#define _FS_CEPH_SUPER_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_CEPH_FSCACHE +#include +#endif + +/* f_type in struct statfs */ +#define CEPH_SUPER_MAGIC 0x00c36400 + +/* large granularity for statfs utilization stats to facilitate + * large volume sizes on 32-bit machines. */ +#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ +#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) + +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ +#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ + +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ + CEPH_MOUNT_OPT_DCACHE) + +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) + +#define CEPH_RSIZE_DEFAULT 0 /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" + +struct ceph_mount_options { + int flags; + int sb_flags; + + int wsize; /* max write size */ + int rsize; /* max read size */ + int rasize; /* max readahead */ + int congestion_kb; /* max writeback in flight */ + int caps_wanted_delay_min, caps_wanted_delay_max; + int cap_release_safety; + int max_readdir; /* max readdir result (entires) */ + int max_readdir_bytes; /* max readdir result (bytes) */ + + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ + + char *snapdir_name; /* default ".snap" */ +}; + +struct ceph_fs_client { + struct super_block *sb; + + struct ceph_mount_options *mount_options; + struct ceph_client *client; + + unsigned long mount_state; + int min_caps; /* min caps i added */ + + struct ceph_mds_client *mdsc; + + /* writeback */ + mempool_t *wb_pagevec_pool; + struct workqueue_struct *wb_wq; + struct workqueue_struct *pg_inv_wq; + struct workqueue_struct *trunc_wq; + atomic_long_t writeback_count; + + struct backing_dev_info backing_dev_info; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_congestion_kb; + struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; + struct dentry *debugfs_mds_sessions; +#endif + +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + struct workqueue_struct *revalidate_wq; +#endif +}; + + +/* + * File i/o capability. This tracks shared state with the metadata + * server that allows us to cache or writeback attributes or to read + * and write data. For any given inode, we should have one or more + * capabilities, one issued by each metadata server, and our + * cumulative access is the OR of all issued capabilities. + * + * Each cap is referenced by the inode's i_caps rbtree and by per-mds + * session capability lists. + */ +struct ceph_cap { + struct ceph_inode_info *ci; + struct rb_node ci_node; /* per-ci cap tree */ + struct ceph_mds_session *session; + struct list_head session_caps; /* per-session caplist */ + int mds; + u64 cap_id; /* unique cap id (mds provided) */ + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of issued (for revocation) */ + int mds_wanted; + u32 seq, issue_seq, mseq; + u32 cap_gen; /* active/stale cycle */ + unsigned long last_used; + struct list_head caps_item; +}; + +#define CHECK_CAPS_NODELAY 1 /* do not delay any further */ +#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ + +/* + * Snapped cap state that is pending flush to mds. When a snapshot occurs, + * we first complete any in-process sync writes and writeback any dirty + * data before flushing the snapped state (tracked here) back to the MDS. + */ +struct ceph_cap_snap { + atomic_t nref; + struct ceph_inode_info *ci; + struct list_head ci_item, flushing_item; + + u64 follows, flush_tid; + int issued, dirty; + struct ceph_snap_context *context; + + umode_t mode; + kuid_t uid; + kgid_t gid; + + struct ceph_buffer *xattr_blob; + u64 xattr_version; + + u64 size; + struct timespec mtime, atime, ctime; + u64 time_warp_seq; + int writing; /* a sync write is still in progress */ + int dirty_pages; /* dirty pages awaiting writeback */ + bool inline_data; +}; + +static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (atomic_dec_and_test(&capsnap->nref)) { + if (capsnap->xattr_blob) + ceph_buffer_put(capsnap->xattr_blob); + kfree(capsnap); + } +} + +/* + * The frag tree describes how a directory is fragmented, potentially across + * multiple metadata servers. It is also used to indicate points where + * metadata authority is delegated, and whether/where metadata is replicated. + * + * A _leaf_ frag will be present in the i_fragtree IFF there is + * delegation info. That is, if mds >= 0 || ndist > 0. + */ +#define CEPH_MAX_DIRFRAG_REP 4 + +struct ceph_inode_frag { + struct rb_node node; + + /* fragtree state */ + u32 frag; + int split_by; /* i.e. 2^(split_by) children */ + + /* delegation and replication info */ + int mds; /* -1 if same authority as parent */ + int ndist; /* >0 if replicated */ + int dist[CEPH_MAX_DIRFRAG_REP]; +}; + +/* + * We cache inode xattrs as an encoded blob until they are first used, + * at which point we parse them into an rbtree. + */ +struct ceph_inode_xattr { + struct rb_node node; + + const char *name; + int name_len; + const char *val; + int val_len; + int dirty; + + int should_free_name; + int should_free_val; +}; + +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + +struct ceph_inode_xattrs_info { + /* + * (still encoded) xattr blob. we avoid the overhead of parsing + * this until someone actually calls getxattr, etc. + * + * blob->vec.iov_len == 4 implies there are no xattrs; blob == + * NULL means we don't know. + */ + struct ceph_buffer *blob, *prealloc_blob; + + struct rb_root index; + bool dirty; + int count; + int names_size; + int vals_size; + u64 version, index_version; +}; + +/* + * Ceph inode. + */ +struct ceph_inode_info { + struct ceph_vino i_vino; /* ceph ino + snap */ + + spinlock_t i_ceph_lock; + + u64 i_version; + u64 i_inline_version; + u32 i_time_warp_seq; + + unsigned i_ceph_flags; + int i_ordered_count; + atomic_t i_release_count; + atomic_t i_complete_count; + + struct ceph_dir_layout i_dir_layout; + struct ceph_file_layout i_layout; + char *i_symlink; + + /* for dirs */ + struct timespec i_rctime; + u64 i_rbytes, i_rfiles, i_rsubdirs; + u64 i_files, i_subdirs; + + struct rb_root i_fragtree; + struct mutex i_fragtree_mutex; + + struct ceph_inode_xattrs_info i_xattrs; + + /* capabilities. protected _both_ by i_ceph_lock and cap->session's + * s_mutex. */ + struct rb_root i_caps; /* cap list */ + struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ + unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ + struct list_head i_dirty_item, i_flushing_item; + u64 i_cap_flush_seq; + /* we need to track cap writeback on a per-cap-bit basis, to allow + * overlapping, pipelined cap flushes to the mds. we can probably + * reduce the tid to 8 bits if we're concerned about inode size. */ + u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ + unsigned long i_hold_caps_min; /* jiffies */ + unsigned long i_hold_caps_max; /* jiffies */ + struct list_head i_cap_delay_list; /* for delayed cap release to mds */ + struct ceph_cap_reservation i_cap_migration_resv; + struct list_head i_cap_snaps; /* snapped state pending flush to mds */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or + dirty|flushing caps */ + unsigned i_snap_caps; /* cap bits for snapped files */ + + int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + + struct mutex i_truncate_mutex; + u32 i_truncate_seq; /* last truncate to smaller size */ + u64 i_truncate_size; /* and the size we last truncated down to */ + int i_truncate_pending; /* still need to call vmtruncate */ + + u64 i_max_size; /* max file size authorized by mds */ + u64 i_reported_size; /* (max_)size reported to or requested of mds */ + u64 i_wanted_max_size; /* offset we'd like to write too */ + u64 i_requested_max_size; /* max_size we've requested */ + + /* held references to caps */ + int i_pin_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; + int i_wrbuffer_ref, i_wrbuffer_ref_head; + u32 i_shared_gen; /* increment each time we get FILE_SHARED */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ + u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ + + struct list_head i_unsafe_writes; /* uncommitted sync writes */ + struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ + spinlock_t i_unsafe_lock; + + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + int i_snap_realm_counter; /* snap realm (if caps) */ + struct list_head i_snap_realm_item; + struct list_head i_snap_flush_item; + + struct work_struct i_wb_work; /* writeback work */ + struct work_struct i_pg_inv_work; /* page invalidation work */ + + struct work_struct i_vmtruncate_work; + +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + u32 i_fscache_gen; /* sequence, for delayed fscache validate */ + struct work_struct i_revalidate_work; +#endif + struct inode vfs_inode; /* at end */ +}; + +static inline struct ceph_inode_info *ceph_inode(struct inode *inode) +{ + return container_of(inode, struct ceph_inode_info, vfs_inode); +} + +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +{ + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; +} + +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +{ + return (struct ceph_fs_client *)sb->s_fs_info; +} + +static inline struct ceph_vino ceph_vino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino; +} + +/* + * ino_t is <64 bits on many architectures, blech. + * + * i_ino (kernel inode) st_ino (userspace) + * i386 32 32 + * x86_64+ino32 64 32 + * x86_64 64 64 + */ +static inline u32 ceph_ino_to_ino32(__u64 vino) +{ + u32 ino = vino & 0xffffffff; + ino ^= vino >> 32; + if (!ino) + ino = 2; + return ino; +} + +/* + * kernel i_ino value + */ +static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +{ +#if BITS_PER_LONG == 32 + return ceph_ino_to_ino32(vino.ino); +#else + return (ino_t)vino.ino; +#endif +} + +/* + * user-visible ino (stat, filldir) + */ +#if BITS_PER_LONG == 32 +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ + return ino; +} +#else +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ + if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) + ino = ceph_ino_to_ino32(ino); + return ino; +} +#endif + + +/* for printf-style formatting */ +#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap + +static inline u64 ceph_ino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.ino; +} +static inline u64 ceph_snap(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.snap; +} + +static inline int ceph_ino_compare(struct inode *inode, void *data) +{ + struct ceph_vino *pvino = (struct ceph_vino *)data; + struct ceph_inode_info *ci = ceph_inode(inode); + return ci->i_vino.ino == pvino->ino && + ci->i_vino.snap == pvino->snap; +} + +static inline struct inode *ceph_find_inode(struct super_block *sb, + struct ceph_vino vino) +{ + ino_t t = ceph_vino_to_ino(vino); + return ilookup5(sb, t, ceph_ino_compare, &vino); +} + + +/* + * Ceph inode. + */ +#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + +static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, + int release_count, int ordered_count) +{ + atomic_set(&ci->i_complete_count, release_count); + if (ci->i_ordered_count == ordered_count) + ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; + else + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; +} + +static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) +{ + atomic_inc(&ci->i_release_count); +} + +static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) +{ + return atomic_read(&ci->i_complete_count) == + atomic_read(&ci->i_release_count); +} + +static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) +{ + return __ceph_dir_is_complete(ci) && + (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); +} + +static inline void ceph_dir_clear_complete(struct inode *inode) +{ + __ceph_dir_clear_complete(ceph_inode(inode)); +} + +static inline void ceph_dir_clear_ordered(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + spin_lock(&ci->i_ceph_lock); + ci->i_ordered_count++; + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; + spin_unlock(&ci->i_ceph_lock); +} + +static inline bool ceph_dir_is_complete_ordered(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool ret; + spin_lock(&ci->i_ceph_lock); + ret = __ceph_dir_is_complete_ordered(ci); + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +/* find a specific frag @f */ +extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, + u32 f); + +/* + * choose fragment for value @v. copy frag content to pfrag, if leaf + * exists + */ +extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found); + +static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +{ + return (struct ceph_dentry_info *)dentry->d_fsdata; +} + +static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) +{ + return ((loff_t)frag << 32) | (loff_t)off; +} + +/* + * caps helpers + */ +static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); +extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); +extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, + struct ceph_cap *cap); + +static inline int ceph_caps_issued(struct ceph_inode_info *ci) +{ + int issued; + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + return issued; +} + +static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, + int touch) +{ + int r; + spin_lock(&ci->i_ceph_lock); + r = __ceph_caps_issued_mask(ci, mask, touch); + spin_unlock(&ci->i_ceph_lock); + return r; +} + +static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) +{ + return ci->i_dirty_caps | ci->i_flushing_caps; +} +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); + +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); +extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_used(struct ceph_inode_info *ci); + +extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); + +/* + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) + */ +static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) +{ + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); + if (w & CEPH_CAP_FILE_BUFFER) + w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */ + return w; +} + +/* what the mds thinks we want */ +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); + +extern void ceph_caps_init(struct ceph_mds_client *mdsc); +extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); +extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need); +extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_reservation_status(struct ceph_fs_client *client, + int *total, int *avail, int *used, + int *reserved, int *min); + + + +/* + * we keep buffered readdir results attached to file->private_data + */ +#define CEPH_F_SYNC 1 +#define CEPH_F_ATEND 2 + +struct ceph_file_info { + short fmode; /* initialized on open */ + short flags; /* CEPH_F_* */ + + /* readdir: position within the dir */ + u32 frag; + struct ceph_mds_request *last_readdir; + + /* readdir: position within a frag */ + unsigned offset; /* offset of last chunk, adjusted for . and .. */ + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ + char *last_name; /* last entry in previous chunk */ + struct dentry *dentry; /* next dentry (for dcache readdir) */ + int dir_release_count; + int dir_ordered_count; + + /* used for -o dirstat read() on directory thing */ + char *dir_info; + int dir_info_len; +}; + + + +/* + * A "snap realm" describes a subset of the file hierarchy sharing + * the same set of snapshots that apply to it. The realms themselves + * are organized into a hierarchy, such that children inherit (some of) + * the snapshots of their parents. + * + * All inodes within the realm that have capabilities are linked into a + * per-realm list. + */ +struct ceph_snap_realm { + u64 ino; + atomic_t nref; + struct rb_node node; + + u64 created, seq; + u64 parent_ino; + u64 parent_since; /* snapid when our current parent became so */ + + u64 *prior_parent_snaps; /* snaps inherited from any parents we */ + u32 num_prior_parent_snaps; /* had prior to parent_since */ + u64 *snaps; /* snaps specific to this realm */ + u32 num_snaps; + + struct ceph_snap_realm *parent; + struct list_head children; /* list of child realms */ + struct list_head child_item; + + struct list_head empty_item; /* if i have ref==0 */ + + struct list_head dirty_item; /* if realm needs new context */ + + /* the current set of snaps for this realm */ + struct ceph_snap_context *cached_context; + + struct list_head inodes_with_caps; + spinlock_t inodes_with_caps_lock; +}; + +static inline int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} + + + +/* snap.c */ +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino); +extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern int ceph_update_snap_trace(struct ceph_mds_client *m, + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret); +extern void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg); +extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); +extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap); +extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern int ceph_snap_init(void); +extern void ceph_snap_exit(void); + +/* + * a cap_snap is "pending" if it is still awaiting an in-progress + * sync write (that may/may not still update size, mtime, etc.). + */ +static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) +{ + return !list_empty(&ci->i_cap_snaps) && + list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, + ci_item)->writing; +} + +/* inode.c */ +extern const struct inode_operations ceph_file_iops; + +extern struct inode *ceph_alloc_inode(struct super_block *sb); +extern void ceph_destroy_inode(struct inode *inode); +extern int ceph_drop_inode(struct inode *inode); + +extern struct inode *ceph_get_inode(struct super_block *sb, + struct ceph_vino vino); +extern struct inode *ceph_get_snapdir(struct inode *parent); +extern int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size); +extern void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec *ctime, + struct timespec *mtime, struct timespec *atime); +extern int ceph_fill_trace(struct super_block *sb, + struct ceph_mds_request *req, + struct ceph_mds_session *session); +extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session); + +extern int ceph_inode_holds_cap(struct inode *inode, int mask); + +extern int ceph_inode_set_size(struct inode *inode, loff_t size); +extern void __ceph_do_pending_vmtruncate(struct inode *inode); +extern void ceph_queue_vmtruncate(struct inode *inode); + +extern void ceph_queue_invalidate(struct inode *inode); +extern void ceph_queue_writeback(struct inode *inode); + +extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force); +static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) +{ + return __ceph_do_getattr(inode, NULL, mask, force); +} +extern int ceph_permission(struct inode *inode, int mask); +extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); +extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +extern int ceph_setxattr(struct dentry *, const char *, const void *, + size_t, int); +int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +int __ceph_removexattr(struct dentry *, const char *); +extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); +extern int ceph_removexattr(struct dentry *, const char *); +extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); +extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern void __init ceph_xattr_init(void); +extern void ceph_xattr_exit(void); +extern const struct xattr_handler *ceph_xattr_handlers[]; + +/* acl.c */ +struct ceph_acls_info { + void *default_acl; + void *acl; + struct ceph_pagelist *pagelist; +}; + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acls_info *info); +void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info); +void ceph_release_acls_info(struct ceph_acls_info *info); + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acls_info *info) +{ + return 0; +} +static inline void ceph_init_inode_acls(struct inode *inode, + struct ceph_acls_info *info) +{ +} +static inline void ceph_release_acls_info(struct ceph_acls_info *info) +{ +} +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif + +/* caps.c */ +extern const char *ceph_cap_string(int c); +extern void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg); +extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap **new_cap); +extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void ceph_put_cap(struct ceph_mds_client *mdsc, + struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); + +extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, + u64 cap_id, u32 migrate_seq, u32 issue_seq); +extern void ceph_queue_caps_release(struct inode *inode); +extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); +extern int ceph_fsync(struct file *file, loff_t start, loff_t end, + int datasync); +extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, + int mds); +extern int ceph_get_cap_mds(struct inode *inode); +extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); +extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + struct ceph_snap_context *snapc); +extern void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession, + int again); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session); +extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); +extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); + +extern int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force); +extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + int mds, int drop, int unless); + +extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, + loff_t endoff, int *got, struct page **pinned_page); + +/* for counting open files by mode */ +static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) +{ + ci->i_nr_by_mode[mode]++; +} +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); + +/* addr.c */ +extern const struct address_space_operations ceph_aops; +extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); + +/* file.c */ +extern const struct file_operations ceph_file_fops; + +extern int ceph_open(struct inode *inode, struct file *file); +extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened); +extern int ceph_release(struct inode *inode, struct file *filp); +extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len); +int ceph_uninline_data(struct file *filp, struct page *locked_page); +/* dir.c */ +extern const struct file_operations ceph_dir_fops; +extern const struct file_operations ceph_snapdir_fops; +extern const struct inode_operations ceph_dir_iops; +extern const struct inode_operations ceph_snapdir_iops; +extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, + ceph_snapdir_dentry_ops; + +extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err); +extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err); + +extern void ceph_dentry_lru_add(struct dentry *dn); +extern void ceph_dentry_lru_touch(struct dentry *dn); +extern void ceph_dentry_lru_del(struct dentry *dn); +extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); +extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); + +/* + * our d_ops vary depending on whether the inode is live, + * snapshotted (read-only), or a virtual ".snap" directory. + */ +int ceph_init_dentry(struct dentry *dentry); + + +/* ioctl.c */ +extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* export.c */ +extern const struct export_operations ceph_export_ops; + +/* locks.c */ +extern __init void ceph_flock_init(void); +extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); +extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); +extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); +extern int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, + int num_flock_locks); +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks); +extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); + +/* debugfs.c */ +extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); + +#endif /* _FS_CEPH_SUPER_H */ diff --git a/kernel/fs/ceph/xattr.c b/kernel/fs/ceph/xattr.c new file mode 100644 index 000000000..cd7ffad40 --- /dev/null +++ b/kernel/fs/ceph/xattr.c @@ -0,0 +1,1117 @@ +#include +#include + +#include "super.h" +#include "mds_client.h" + +#include + +#include +#include +#include + +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr); + +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + +static bool ceph_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || + !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +/* + * These define virtual xattrs exposing the recursive directory + * statistics and layout metadata. + */ +struct ceph_vxattr { + char *name; + size_t name_size; /* strlen(name) + 1 (for '\0') */ + size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, + size_t size); + bool readonly, hidden; + bool (*exists_cb)(struct ceph_inode_info *ci); +}; + +/* layouts */ + +static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) +{ + size_t s; + char *p = (char *)&ci->i_layout; + + for (s = 0; s < sizeof(ci->i_layout); s++, p++) + if (*p) + return true; + return false; +} + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + char buf[128]; + + dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) { + size_t len = strlen(pool_name); + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (!size) { + ret += len; + } else if (ret + len > size) { + ret = -ERANGE; + } else { + memcpy(val, buf, ret); + memcpy(val + ret, pool_name, len); + ret += len; + } + } else { + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout), + (unsigned long long)pool); + if (size) { + if (ret <= size) + memcpy(val, buf, ret); + else + ret = -ERANGE; + } + } + up_read(&osdc->map_sem); + return ret; +} + +static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) + ret = snprintf(val, size, "%s", pool_name); + else + ret = snprintf(val, size, "%lld", (unsigned long long)pool); + up_read(&osdc->map_sem); + return ret; +} + +/* directories */ + +static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); +} + +static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_files); +} + +static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_subdirs); +} + +static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); +} + +static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rfiles); +} + +static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rsubdirs); +} + +static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rbytes); +} + +static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, + (long)ci->i_rctime.tv_nsec); +} + + +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name +#define CEPH_XATTR_NAME2(_type, _name, _name2) \ + XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 + +#define XATTR_NAME_CEPH(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = true, \ + .hidden = false, \ + .exists_cb = NULL, \ + } +#define XATTR_LAYOUT_FIELD(_type, _name, _field) \ + { \ + .name = CEPH_XATTR_NAME2(_type, _name, _field), \ + .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ + .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ + .readonly = false, \ + .hidden = true, \ + .exists_cb = ceph_vxattrcb_layout_exists, \ + } + +static struct ceph_vxattr ceph_dir_vxattrs[] = { + { + .name = "ceph.dir.layout", + .name_size = sizeof("ceph.dir.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = true, + .exists_cb = ceph_vxattrcb_layout_exists, + }, + XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), + XATTR_LAYOUT_FIELD(dir, layout, stripe_count), + XATTR_LAYOUT_FIELD(dir, layout, object_size), + XATTR_LAYOUT_FIELD(dir, layout, pool), + XATTR_NAME_CEPH(dir, entries), + XATTR_NAME_CEPH(dir, files), + XATTR_NAME_CEPH(dir, subdirs), + XATTR_NAME_CEPH(dir, rentries), + XATTR_NAME_CEPH(dir, rfiles), + XATTR_NAME_CEPH(dir, rsubdirs), + XATTR_NAME_CEPH(dir, rbytes), + XATTR_NAME_CEPH(dir, rctime), + { .name = NULL, 0 } /* Required table terminator */ +}; +static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ + +/* files */ + +static struct ceph_vxattr ceph_file_vxattrs[] = { + { + .name = "ceph.file.layout", + .name_size = sizeof("ceph.file.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = true, + .exists_cb = ceph_vxattrcb_layout_exists, + }, + XATTR_LAYOUT_FIELD(file, layout, stripe_unit), + XATTR_LAYOUT_FIELD(file, layout, stripe_count), + XATTR_LAYOUT_FIELD(file, layout, object_size), + XATTR_LAYOUT_FIELD(file, layout, pool), + { .name = NULL, 0 } /* Required table terminator */ +}; +static size_t ceph_file_vxattrs_name_size; /* total size of all names */ + +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + return ceph_dir_vxattrs; + else if (S_ISREG(inode->i_mode)) + return ceph_file_vxattrs; + return NULL; +} + +static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + if (vxattrs == ceph_dir_vxattrs) + return ceph_dir_vxattrs_name_size; + if (vxattrs == ceph_file_vxattrs) + return ceph_file_vxattrs_name_size; + BUG_ON(vxattrs); + return 0; +} + +/* + * Compute the aggregate size (including terminating '\0') of all + * virtual extended attribute names in the given vxattr table. + */ +static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + struct ceph_vxattr *vxattr; + size_t size = 0; + + for (vxattr = vxattrs; vxattr->name; vxattr++) + if (!vxattr->hidden) + size += vxattr->name_size; + + return size; +} + +/* Routines called at initialization and exit time */ + +void __init ceph_xattr_init(void) +{ + ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); + ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); +} + +void ceph_xattr_exit(void) +{ + ceph_dir_vxattrs_name_size = 0; + ceph_file_vxattrs_name_size = 0; +} + +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, + const char *name) +{ + struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); + + if (vxattr) { + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + } + + return NULL; +} + +static int __set_xattr(struct ceph_inode_info *ci, + const char *name, int name_len, + const char *val, int val_len, + int flags, int update_xattr, + struct ceph_inode_xattr **newxattr) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int c; + int new = 0; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, min(name_len, xattr->name_len)); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else { + if (name_len == xattr->name_len) + break; + else if (name_len < xattr->name_len) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + xattr = NULL; + } + + if (update_xattr) { + int err = 0; + if (xattr && (flags & XATTR_CREATE)) + err = -EEXIST; + else if (!xattr && (flags & XATTR_REPLACE)) + err = -ENODATA; + if (err) { + kfree(name); + kfree(val); + return err; + } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); + return 0; + } + } + + if (!xattr) { + new = 1; + xattr = *newxattr; + xattr->name = name; + xattr->name_len = name_len; + xattr->should_free_name = update_xattr; + + ci->i_xattrs.count++; + dout("__set_xattr count=%d\n", ci->i_xattrs.count); + } else { + kfree(*newxattr); + *newxattr = NULL; + if (xattr->should_free_val) + kfree((void *)xattr->val); + + if (update_xattr) { + kfree((void *)name); + name = xattr->name; + } + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + } + ci->i_xattrs.names_size += name_len; + ci->i_xattrs.vals_size += val_len; + if (val) + xattr->val = val; + else + xattr->val = ""; + + xattr->val_len = val_len; + xattr->dirty = update_xattr; + xattr->should_free_val = (val && update_xattr); + + if (new) { + rb_link_node(&xattr->node, parent, p); + rb_insert_color(&xattr->node, &ci->i_xattrs.index); + dout("__set_xattr_val p=%p\n", p); + } + + dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", + ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); + + return 0; +} + +static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int name_len = strlen(name); + int c; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, xattr->name_len); + if (c == 0 && name_len > xattr->name_len) + c = 1; + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else { + dout("__get_xattr %s: found %.*s\n", name, + xattr->val_len, xattr->val); + return xattr; + } + } + + dout("__get_xattr %s: not found\n", name); + + return NULL; +} + +static void __free_xattr(struct ceph_inode_xattr *xattr) +{ + BUG_ON(!xattr); + + if (xattr->should_free_name) + kfree((void *)xattr->name); + if (xattr->should_free_val) + kfree((void *)xattr->val); + + kfree(xattr); +} + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr) +{ + if (!xattr) + return -ENODATA; + + rb_erase(&xattr->node, &ci->i_xattrs.index); + + if (xattr->should_free_name) + kfree((void *)xattr->name); + if (xattr->should_free_val) + kfree((void *)xattr->val); + + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + ci->i_xattrs.count--; + kfree(xattr); + + return 0; +} + +static int __remove_xattr_by_name(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct ceph_inode_xattr *xattr; + int err; + + p = &ci->i_xattrs.index.rb_node; + xattr = __get_xattr(ci, name); + err = __remove_xattr(ci, xattr); + return err; +} + +static char *__copy_xattr_names(struct ceph_inode_info *ci, + char *dest) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + memcpy(dest, xattr->name, xattr->name_len); + dest[xattr->name_len] = '\0'; + + dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, + xattr->name_len, ci->i_xattrs.names_size); + + dest += xattr->name_len + 1; + p = rb_next(p); + } + + return dest; +} + +void __ceph_destroy_xattrs(struct ceph_inode_info *ci) +{ + struct rb_node *p, *tmp; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + + dout("__ceph_destroy_xattrs p=%p\n", p); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + tmp = p; + p = rb_next(tmp); + dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, + xattr->name_len, xattr->name); + rb_erase(tmp, &ci->i_xattrs.index); + + __free_xattr(xattr); + } + + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.index_version = 0; + ci->i_xattrs.count = 0; + ci->i_xattrs.index = RB_ROOT; +} + +static int __build_xattrs(struct inode *inode) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) +{ + u32 namelen; + u32 numattr = 0; + void *p, *end; + u32 len; + const char *name, *val; + struct ceph_inode_info *ci = ceph_inode(inode); + int xattr_version; + struct ceph_inode_xattr **xattrs = NULL; + int err = 0; + int i; + + dout("__build_xattrs() len=%d\n", + ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); + + if (ci->i_xattrs.index_version >= ci->i_xattrs.version) + return 0; /* already built */ + + __ceph_destroy_xattrs(ci); + +start: + /* updated internal xattr rb tree */ + if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { + p = ci->i_xattrs.blob->vec.iov_base; + end = p + ci->i_xattrs.blob->vec.iov_len; + ceph_decode_32_safe(&p, end, numattr, bad); + xattr_version = ci->i_xattrs.version; + spin_unlock(&ci->i_ceph_lock); + + xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *), + GFP_NOFS); + err = -ENOMEM; + if (!xattrs) + goto bad_lock; + + for (i = 0; i < numattr; i++) { + xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), + GFP_NOFS); + if (!xattrs[i]) + goto bad_lock; + } + + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.version != xattr_version) { + /* lost a race, retry */ + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + xattrs = NULL; + goto start; + } + err = -EIO; + while (numattr--) { + ceph_decode_32_safe(&p, end, len, bad); + namelen = len; + name = p; + p += len; + ceph_decode_32_safe(&p, end, len, bad); + val = p; + p += len; + + err = __set_xattr(ci, name, namelen, val, len, + 0, 0, &xattrs[numattr]); + + if (err < 0) + goto bad; + } + kfree(xattrs); + } + ci->i_xattrs.index_version = ci->i_xattrs.version; + ci->i_xattrs.dirty = false; + + return err; +bad_lock: + spin_lock(&ci->i_ceph_lock); +bad: + if (xattrs) { + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + } + ci->i_xattrs.names_size = 0; + return err; +} + +static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, + int val_size) +{ + /* + * 4 bytes for the length, and additional 4 bytes per each xattr name, + * 4 bytes per each value + */ + int size = 4 + ci->i_xattrs.count*(4 + 4) + + ci->i_xattrs.names_size + + ci->i_xattrs.vals_size; + dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", + ci->i_xattrs.count, ci->i_xattrs.names_size, + ci->i_xattrs.vals_size); + + if (name_size) + size += 4 + 4 + name_size + val_size; + + return size; +} + +/* + * If there are dirty xattrs, reencode xattrs into the prealloc_blob + * and swap into place. + */ +void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + void *dest; + + dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + if (ci->i_xattrs.dirty) { + int need = __get_required_blob_size(ci, 0, 0); + + BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); + + p = rb_first(&ci->i_xattrs.index); + dest = ci->i_xattrs.prealloc_blob->vec.iov_base; + + ceph_encode_32(&dest, ci->i_xattrs.count); + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + + ceph_encode_32(&dest, xattr->name_len); + memcpy(dest, xattr->name, xattr->name_len); + dest += xattr->name_len; + ceph_encode_32(&dest, xattr->val_len); + memcpy(dest, xattr->val, xattr->val_len); + dest += xattr->val_len; + + p = rb_next(p); + } + + /* adjust buffer len; it may be larger than we need */ + ci->i_xattrs.prealloc_blob->vec.iov_len = + dest - ci->i_xattrs.prealloc_blob->vec.iov_base; + + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.version++; + } +} + +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int err; + struct ceph_inode_xattr *xattr; + struct ceph_vxattr *vxattr = NULL; + + if (!ceph_is_valid_xattr(name)) + return -ENODATA; + + /* let's see if a virtual xattr was requested */ + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { + err = vxattr->getxattr_cb(ci, value, size); + return err; + } + + spin_lock(&ci->i_ceph_lock); + dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (ci->i_xattrs.version == 0 || + !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { + spin_unlock(&ci->i_ceph_lock); + /* get xattrs from mds (if we don't already have them) */ + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); + if (err) + return err; + spin_lock(&ci->i_ceph_lock); + } + + err = __build_xattrs(inode); + if (err < 0) + goto out; + + err = -ENODATA; /* == ENOATTR */ + xattr = __get_xattr(ci, name); + if (!xattr) + goto out; + + err = -ERANGE; + if (size && size < xattr->val_len) + goto out; + + err = xattr->val_len; + if (size == 0) + goto out; + + memcpy(value, xattr->val, xattr->val_len); + +out: + spin_unlock(&ci->i_ceph_lock); + return err; +} + +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, value, size); + + return __ceph_getxattr(d_inode(dentry), name, value, size); +} + +ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); + u32 vir_namelen = 0; + u32 namelen; + int err; + u32 len; + int i; + + spin_lock(&ci->i_ceph_lock); + dout("listxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (ci->i_xattrs.version == 0 || + !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { + spin_unlock(&ci->i_ceph_lock); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); + if (err) + return err; + spin_lock(&ci->i_ceph_lock); + } + + err = __build_xattrs(inode); + if (err < 0) + goto out; + /* + * Start with virtual dir xattr names (if any) (including + * terminating '\0' characters for each). + */ + vir_namelen = ceph_vxattrs_name_size(vxattrs); + + /* adding 1 byte per each variable due to the null termination */ + namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; + err = -ERANGE; + if (size && vir_namelen + namelen > size) + goto out; + + err = namelen + vir_namelen; + if (size == 0) + goto out; + + names = __copy_xattr_names(ci, names); + + /* virtual xattr names, too */ + err = namelen; + if (vxattrs) { + for (i = 0; vxattrs[i].name; i++) { + if (!vxattrs[i].hidden && + !(vxattrs[i].exists_cb && + !vxattrs[i].exists_cb(ci))) { + len = sprintf(names, "%s", vxattrs[i].name); + names += len + 1; + err += len + 1; + } + } + } + +out: + spin_unlock(&ci->i_ceph_lock); + return err; +} + +static int ceph_sync_setxattr(struct dentry *dentry, const char *name, + const char *value, size_t size, int flags) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct inode *inode = d_inode(dentry); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_pagelist *pagelist = NULL; + int err; + + if (size > 0) { + /* copy value into pagelist */ + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + if (!pagelist) + return -ENOMEM; + + ceph_pagelist_init(pagelist); + err = ceph_pagelist_append(pagelist, value, size); + if (err) + goto out; + } else if (!value) { + flags |= CEPH_XATTR_REMOVE; + } + + dout("setxattr value=%.*s\n", (int)size, value); + + /* do request */ + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + err = -ENOMEM; + goto out; + } + + req->r_pagelist = pagelist; + pagelist = NULL; + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); + +out: + if (pagelist) + ceph_pagelist_release(pagelist); + return err; +} + +int __ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = d_inode(dentry); + struct ceph_vxattr *vxattr; + struct ceph_inode_info *ci = ceph_inode(inode); + int issued; + int err; + int dirty = 0; + int name_len = strlen(name); + int val_len = size; + char *newname = NULL; + char *newval = NULL; + struct ceph_inode_xattr *xattr = NULL; + int required_blob_size; + + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; + + /* preallocate memory for xattr name, value, index node */ + err = -ENOMEM; + newname = kmemdup(name, name_len + 1, GFP_NOFS); + if (!newname) + goto out; + + if (val_len) { + newval = kmemdup(value, val_len, GFP_NOFS); + if (!newval) + goto out; + } + + xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); + if (!xattr) + goto out; + + spin_lock(&ci->i_ceph_lock); +retry: + issued = __ceph_caps_issued(ci, NULL); + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); + if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + __build_xattrs(inode); + + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob; + + spin_unlock(&ci->i_ceph_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + + err = __set_xattr(ci, newname, name_len, newval, val_len, + flags, value ? 1 : -1, &xattr); + + if (!err) { + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + } + + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + return err; + +do_sync: + spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: + err = ceph_sync_setxattr(dentry, name, value, size, flags); +out: + kfree(newname); + kfree(newval); + kfree(xattr); + return err; +} + +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + if (size == 0) + value = ""; /* empty EA, do not remove */ + + return __ceph_setxattr(dentry, name, value, size, flags); +} + +static int ceph_send_removexattr(struct dentry *dentry, const char *name) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = d_inode(dentry); + struct ceph_mds_request *req; + int err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) + return -ENOMEM; + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + return err; +} + +int __ceph_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = d_inode(dentry); + struct ceph_vxattr *vxattr; + struct ceph_inode_info *ci = ceph_inode(inode); + int issued; + int err; + int required_blob_size; + int dirty; + + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; + + err = -ENOMEM; + spin_lock(&ci->i_ceph_lock); +retry: + issued = __ceph_caps_issued(ci, NULL); + dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + __build_xattrs(inode); + + required_blob_size = __get_required_blob_size(ci, 0, 0); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob; + + spin_unlock(&ci->i_ceph_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + + err = __remove_xattr_by_name(ceph_inode(inode), name); + + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + return err; +do_sync: + spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: + err = ceph_send_removexattr(dentry, name); +out: + return err; +} + +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + return __ceph_removexattr(dentry, name); +} diff --git a/kernel/fs/char_dev.c b/kernel/fs/char_dev.c new file mode 100644 index 000000000..ea06a3d03 --- /dev/null +++ b/kernel/fs/char_dev.c @@ -0,0 +1,569 @@ +/* + * linux/fs/char_dev.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static struct kobj_map *cdev_map; + +static DEFINE_MUTEX(chrdevs_lock); + +static struct char_device_struct { + struct char_device_struct *next; + unsigned int major; + unsigned int baseminor; + int minorct; + char name[64]; + struct cdev *cdev; /* will die */ +} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; + +/* index in the above */ +static inline int major_to_index(unsigned major) +{ + return major % CHRDEV_MAJOR_HASH_SIZE; +} + +#ifdef CONFIG_PROC_FS + +void chrdev_show(struct seq_file *f, off_t offset) +{ + struct char_device_struct *cd; + + if (offset < CHRDEV_MAJOR_HASH_SIZE) { + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[offset]; cd; cd = cd->next) + seq_printf(f, "%3d %s\n", cd->major, cd->name); + mutex_unlock(&chrdevs_lock); + } +} + +#endif /* CONFIG_PROC_FS */ + +/* + * Register a single major with a specified minor range. + * + * If major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If major > 0 this function will attempt to reserve the passed range of + * minors and will return zero on success. + * + * Returns a -ve errno on failure. + */ +static struct char_device_struct * +__register_chrdev_region(unsigned int major, unsigned int baseminor, + int minorct, const char *name) +{ + struct char_device_struct *cd, **cp; + int ret = 0; + int i; + + cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); + if (cd == NULL) + return ERR_PTR(-ENOMEM); + + mutex_lock(&chrdevs_lock); + + /* temporary */ + if (major == 0) { + for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { + if (chrdevs[i] == NULL) + break; + } + + if (i == 0) { + ret = -EBUSY; + goto out; + } + major = i; + } + + cd->major = major; + cd->baseminor = baseminor; + cd->minorct = minorct; + strlcpy(cd->name, name, sizeof(cd->name)); + + i = major_to_index(major); + + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major > major || + ((*cp)->major == major && + (((*cp)->baseminor >= baseminor) || + ((*cp)->baseminor + (*cp)->minorct > baseminor)))) + break; + + /* Check for overlapping minor ranges. */ + if (*cp && (*cp)->major == major) { + int old_min = (*cp)->baseminor; + int old_max = (*cp)->baseminor + (*cp)->minorct - 1; + int new_min = baseminor; + int new_max = baseminor + minorct - 1; + + /* New driver overlaps from the left. */ + if (new_max >= old_min && new_max <= old_max) { + ret = -EBUSY; + goto out; + } + + /* New driver overlaps from the right. */ + if (new_min <= old_max && new_min >= old_min) { + ret = -EBUSY; + goto out; + } + } + + cd->next = *cp; + *cp = cd; + mutex_unlock(&chrdevs_lock); + return cd; +out: + mutex_unlock(&chrdevs_lock); + kfree(cd); + return ERR_PTR(ret); +} + +static struct char_device_struct * +__unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) +{ + struct char_device_struct *cd = NULL, **cp; + int i = major_to_index(major); + + mutex_lock(&chrdevs_lock); + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major == major && + (*cp)->baseminor == baseminor && + (*cp)->minorct == minorct) + break; + if (*cp) { + cd = *cp; + *cp = cd->next; + } + mutex_unlock(&chrdevs_lock); + return cd; +} + +/** + * register_chrdev_region() - register a range of device numbers + * @from: the first in the desired range of device numbers; must include + * the major number. + * @count: the number of consecutive device numbers required + * @name: the name of the device or driver. + * + * Return value is zero on success, a negative error code on failure. + */ +int register_chrdev_region(dev_t from, unsigned count, const char *name) +{ + struct char_device_struct *cd; + dev_t to = from + count; + dev_t n, next; + + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + if (next > to) + next = to; + cd = __register_chrdev_region(MAJOR(n), MINOR(n), + next - n, name); + if (IS_ERR(cd)) + goto fail; + } + return 0; +fail: + to = n; + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); + } + return PTR_ERR(cd); +} + +/** + * alloc_chrdev_region() - register a range of char device numbers + * @dev: output parameter for first assigned number + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required + * @name: the name of the associated device or driver + * + * Allocates a range of char device numbers. The major number will be + * chosen dynamically, and returned (along with the first minor number) + * in @dev. Returns zero or a negative error code. + */ +int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, + const char *name) +{ + struct char_device_struct *cd; + cd = __register_chrdev_region(0, baseminor, count, name); + if (IS_ERR(cd)) + return PTR_ERR(cd); + *dev = MKDEV(cd->major, cd->baseminor); + return 0; +} + +/** + * __register_chrdev() - create and register a cdev occupying a range of minors + * @major: major device number or 0 for dynamic allocation + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required + * @name: name of this range of devices + * @fops: file operations associated with this devices + * + * If @major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If @major > 0 this function will attempt to reserve a device with the given + * major number and will return zero on success. + * + * Returns a -ve errno on failure. + * + * The name of this device has nothing to do with the name of the device in + * /dev. It only helps to keep track of the different owners of devices. If + * your module name has only one type of devices it's ok to use e.g. the name + * of the module here. + */ +int __register_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name, + const struct file_operations *fops) +{ + struct char_device_struct *cd; + struct cdev *cdev; + int err = -ENOMEM; + + cd = __register_chrdev_region(major, baseminor, count, name); + if (IS_ERR(cd)) + return PTR_ERR(cd); + + cdev = cdev_alloc(); + if (!cdev) + goto out2; + + cdev->owner = fops->owner; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, "%s", name); + + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); + if (err) + goto out; + + cd->cdev = cdev; + + return major ? 0 : cd->major; +out: + kobject_put(&cdev->kobj); +out2: + kfree(__unregister_chrdev_region(cd->major, baseminor, count)); + return err; +} + +/** + * unregister_chrdev_region() - return a range of device numbers + * @from: the first in the range of numbers to unregister + * @count: the number of device numbers to unregister + * + * This function will unregister a range of @count device numbers, + * starting with @from. The caller should normally be the one who + * allocated those numbers in the first place... + */ +void unregister_chrdev_region(dev_t from, unsigned count) +{ + dev_t to = from + count; + dev_t n, next; + + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + if (next > to) + next = to; + kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); + } +} + +/** + * __unregister_chrdev - unregister and destroy a cdev + * @major: major device number + * @baseminor: first of the range of minor numbers + * @count: the number of minor numbers this cdev is occupying + * @name: name of this range of devices + * + * Unregister and destroy the cdev occupying the region described by + * @major, @baseminor and @count. This function undoes what + * __register_chrdev() did. + */ +void __unregister_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name) +{ + struct char_device_struct *cd; + + cd = __unregister_chrdev_region(major, baseminor, count); + if (cd && cd->cdev) + cdev_del(cd->cdev); + kfree(cd); +} + +static DEFINE_SPINLOCK(cdev_lock); + +static struct kobject *cdev_get(struct cdev *p) +{ + struct module *owner = p->owner; + struct kobject *kobj; + + if (owner && !try_module_get(owner)) + return NULL; + kobj = kobject_get(&p->kobj); + if (!kobj) + module_put(owner); + return kobj; +} + +void cdev_put(struct cdev *p) +{ + if (p) { + struct module *owner = p->owner; + kobject_put(&p->kobj); + module_put(owner); + } +} + +/* + * Called every time a character special file is opened + */ +static int chrdev_open(struct inode *inode, struct file *filp) +{ + const struct file_operations *fops; + struct cdev *p; + struct cdev *new = NULL; + int ret = 0; + + spin_lock(&cdev_lock); + p = inode->i_cdev; + if (!p) { + struct kobject *kobj; + int idx; + spin_unlock(&cdev_lock); + kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); + if (!kobj) + return -ENXIO; + new = container_of(kobj, struct cdev, kobj); + spin_lock(&cdev_lock); + /* Check i_cdev again in case somebody beat us to it while + we dropped the lock. */ + p = inode->i_cdev; + if (!p) { + inode->i_cdev = p = new; + list_add(&inode->i_devices, &p->list); + new = NULL; + } else if (!cdev_get(p)) + ret = -ENXIO; + } else if (!cdev_get(p)) + ret = -ENXIO; + spin_unlock(&cdev_lock); + cdev_put(new); + if (ret) + return ret; + + ret = -ENXIO; + fops = fops_get(p->ops); + if (!fops) + goto out_cdev_put; + + replace_fops(filp, fops); + if (filp->f_op->open) { + ret = filp->f_op->open(inode, filp); + if (ret) + goto out_cdev_put; + } + + return 0; + + out_cdev_put: + cdev_put(p); + return ret; +} + +void cd_forget(struct inode *inode) +{ + spin_lock(&cdev_lock); + list_del_init(&inode->i_devices); + inode->i_cdev = NULL; + spin_unlock(&cdev_lock); +} + +static void cdev_purge(struct cdev *cdev) +{ + spin_lock(&cdev_lock); + while (!list_empty(&cdev->list)) { + struct inode *inode; + inode = container_of(cdev->list.next, struct inode, i_devices); + list_del_init(&inode->i_devices); + inode->i_cdev = NULL; + } + spin_unlock(&cdev_lock); +} + +/* + * Dummy default file-operations: the only thing this does + * is contain the open that then fills in the correct operations + * depending on the special file... + */ +const struct file_operations def_chr_fops = { + .open = chrdev_open, + .llseek = noop_llseek, +}; + +static struct kobject *exact_match(dev_t dev, int *part, void *data) +{ + struct cdev *p = data; + return &p->kobj; +} + +static int exact_lock(dev_t dev, void *data) +{ + struct cdev *p = data; + return cdev_get(p) ? 0 : -1; +} + +/** + * cdev_add() - add a char device to the system + * @p: the cdev structure for the device + * @dev: the first device number for which this device is responsible + * @count: the number of consecutive minor numbers corresponding to this + * device + * + * cdev_add() adds the device represented by @p to the system, making it + * live immediately. A negative error code is returned on failure. + */ +int cdev_add(struct cdev *p, dev_t dev, unsigned count) +{ + int error; + + p->dev = dev; + p->count = count; + + error = kobj_map(cdev_map, dev, count, NULL, + exact_match, exact_lock, p); + if (error) + return error; + + kobject_get(p->kobj.parent); + + return 0; +} + +static void cdev_unmap(dev_t dev, unsigned count) +{ + kobj_unmap(cdev_map, dev, count); +} + +/** + * cdev_del() - remove a cdev from the system + * @p: the cdev structure to be removed + * + * cdev_del() removes @p from the system, possibly freeing the structure + * itself. + */ +void cdev_del(struct cdev *p) +{ + cdev_unmap(p->dev, p->count); + kobject_put(&p->kobj); +} + + +static void cdev_default_release(struct kobject *kobj) +{ + struct cdev *p = container_of(kobj, struct cdev, kobj); + struct kobject *parent = kobj->parent; + + cdev_purge(p); + kobject_put(parent); +} + +static void cdev_dynamic_release(struct kobject *kobj) +{ + struct cdev *p = container_of(kobj, struct cdev, kobj); + struct kobject *parent = kobj->parent; + + cdev_purge(p); + kfree(p); + kobject_put(parent); +} + +static struct kobj_type ktype_cdev_default = { + .release = cdev_default_release, +}; + +static struct kobj_type ktype_cdev_dynamic = { + .release = cdev_dynamic_release, +}; + +/** + * cdev_alloc() - allocate a cdev structure + * + * Allocates and returns a cdev structure, or NULL on failure. + */ +struct cdev *cdev_alloc(void) +{ + struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); + if (p) { + INIT_LIST_HEAD(&p->list); + kobject_init(&p->kobj, &ktype_cdev_dynamic); + } + return p; +} + +/** + * cdev_init() - initialize a cdev structure + * @cdev: the structure to initialize + * @fops: the file_operations for this device + * + * Initializes @cdev, remembering @fops, making it ready to add to the + * system with cdev_add(). + */ +void cdev_init(struct cdev *cdev, const struct file_operations *fops) +{ + memset(cdev, 0, sizeof *cdev); + INIT_LIST_HEAD(&cdev->list); + kobject_init(&cdev->kobj, &ktype_cdev_default); + cdev->ops = fops; +} + +static struct kobject *base_probe(dev_t dev, int *part, void *data) +{ + if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("char-major-%d", MAJOR(dev)); + return NULL; +} + +void __init chrdev_init(void) +{ + cdev_map = kobj_map_init(base_probe, &chrdevs_lock); +} + + +/* Let modules do char dev stuff */ +EXPORT_SYMBOL(register_chrdev_region); +EXPORT_SYMBOL(unregister_chrdev_region); +EXPORT_SYMBOL(alloc_chrdev_region); +EXPORT_SYMBOL(cdev_init); +EXPORT_SYMBOL(cdev_alloc); +EXPORT_SYMBOL(cdev_del); +EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(__register_chrdev); +EXPORT_SYMBOL(__unregister_chrdev); diff --git a/kernel/fs/cifs/Kconfig b/kernel/fs/cifs/Kconfig new file mode 100644 index 000000000..a2172f3f6 --- /dev/null +++ b/kernel/fs/cifs/Kconfig @@ -0,0 +1,202 @@ +config CIFS + tristate "CIFS support (advanced network filesystem, SMBFS successor)" + depends on INET + select NLS + select CRYPTO + select CRYPTO_MD4 + select CRYPTO_MD5 + select CRYPTO_HMAC + select CRYPTO_ARC4 + select CRYPTO_ECB + select CRYPTO_DES + select CRYPTO_SHA256 + select CRYPTO_CMAC + help + This is the client VFS module for the Common Internet File System + (CIFS) protocol which is the successor to the Server Message Block + (SMB) protocol, the native file sharing mechanism for most early + PC operating systems. The CIFS protocol is fully supported by + file servers such as Windows 2000 (including Windows 2003, Windows 2008, + NT 4 and Windows XP) as well by Samba (which provides excellent CIFS + server support for Linux and many other operating systems). Limited + support for OS/2 and Windows ME and similar servers is provided as + well. + + The module also provides optional support for the followon + protocols for CIFS including SMB3, which enables + useful performance and security features (see the description + of CONFIG_CIFS_SMB2). + + The cifs module provides an advanced network file system + client for mounting to CIFS compliant servers. It includes + support for DFS (hierarchical name space), secure per-user + session establishment via Kerberos or NTLM or NTLMv2, + safe distributed caching (oplock), optional packet + signing, Unicode and other internationalization improvements. + If you need to mount to Samba or Windows from this machine, say Y. + +config CIFS_STATS + bool "CIFS statistics" + depends on CIFS + help + Enabling this option will cause statistics for each server share + mounted by the cifs client to be displayed in /proc/fs/cifs/Stats + +config CIFS_STATS2 + bool "Extended statistics" + depends on CIFS_STATS + help + Enabling this option will allow more detailed statistics on SMB + request timing to be displayed in /proc/fs/cifs/DebugData and also + allow optional logging of slow responses to dmesg (depending on the + value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). + These additional statistics may have a minor effect on performance + and memory utilization. + + Unless you are a developer or are doing network performance analysis + or tuning, say N. + +config CIFS_WEAK_PW_HASH + bool "Support legacy servers which use weaker LANMAN security" + depends on CIFS + help + Modern CIFS servers including Samba and most Windows versions + (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) + security mechanisms. These hash the password more securely + than the mechanisms used in the older LANMAN version of the + SMB protocol but LANMAN based authentication is needed to + establish sessions with some old SMB servers. + + Enabling this option allows the cifs module to mount to older + LANMAN based servers such as OS/2 and Windows 95, but such + mounts may be less secure than mounts using NTLM or more recent + security mechanisms if you are on a public network. Unless you + have a need to access old SMB servers (and are on a private + network) you probably want to say N. Even if this support + is enabled in the kernel build, LANMAN authentication will not be + used automatically. At runtime LANMAN mounts are disabled but + can be set to required (or optional) either in + /proc/fs/cifs (see fs/cifs/README for more detail) or via an + option on the mount command. This support is disabled by + default in order to reduce the possibility of a downgrade + attack. + + If unsure, say N. + +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Enables an upcall mechanism for CIFS which accesses userspace helper + utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets + which are needed to mount to certain secure servers (for which more + secure Kerberos authentication is required). If unsure, say N. + +config CIFS_XATTR + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). CIFS maps the name of + extended attributes beginning with the user namespace prefix + to SMB/CIFS EAs. EAs are stored on Windows servers without the + user namespace prefix, but their names are seen by Linux cifs clients + prefaced by the user namespace prefix. The system namespace + (used by some filesystems to store ACLs) is not supported at + this time. + + If unsure, say N. + +config CIFS_POSIX + bool "CIFS POSIX Extensions" + depends on CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to + negotiate a newer dialect with servers, such as Samba 3.0.5 + or later, that optionally can handle more POSIX like (rather + than Windows like) file behavior. It also enables + support for POSIX ACLs (getfacl and setfacl) to servers + (such as Samba 3.10 and later) which can negotiate + CIFS POSIX ACL support. If unsure, say N. + +config CIFS_ACL + bool "Provide CIFS ACL support" + depends on CIFS_XATTR && KEYS + help + Allows fetching CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller. See the man + page for getcifsacl for more information. + +config CIFS_DEBUG + bool "Enable CIFS debugging routines" + default y + depends on CIFS + help + Enabling this option adds helpful debugging messages to + the cifs code which increases the size of the cifs module. + If unsure, say Y. +config CIFS_DEBUG2 + bool "Enable additional CIFS debugging routines" + depends on CIFS_DEBUG + help + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. + +config CIFS_DFS_UPCALL + bool "DFS feature support" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Distributed File System (DFS) support is used to access shares + transparently in an enterprise name space, even if the share + moves to a different server. This feature also enables + an upcall mechanism for CIFS which contacts userspace helper + utilities to provide server name resolution (host names to + IP addresses) which is needed for implicit mounts of DFS junction + points. If unsure, say N. + +config CIFS_NFSD_EXPORT + bool "Allow nfsd to export CIFS file system" + depends on CIFS && BROKEN + help + Allows NFS server to export a CIFS mounted share (nfsd over cifs) + +config CIFS_SMB2 + bool "SMB2 and SMB3 network file system support" + depends on CIFS && INET + select NLS + select KEYS + select FSCACHE + select DNS_RESOLVER + + help + This enables support for the Server Message Block version 2 + family of protocols, including SMB3. SMB3 support is + enabled on mount by specifying "vers=3.0" in the mount + options. These protocols are the successors to the popular + CIFS and SMB network file sharing protocols. SMB3 is the + native file sharing mechanism for the more recent + versions of Windows (Windows 8 and Windows 2012 and + later) and Samba server and many others support SMB3 well. + In general SMB3 enables better performance, security + and features, than would be possible with CIFS (Note that + when mounting to Samba, due to the CIFS POSIX extensions, + CIFS mounts can provide slightly better POSIX compatibility + than SMB3 mounts do though). Note that SMB2/SMB3 mount + options are also slightly simpler (compared to CIFS) due + to protocol improvements. + +config CIFS_FSCACHE + bool "Provide CIFS client caching support" + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. + diff --git a/kernel/fs/cifs/Makefile b/kernel/fs/cifs/Makefile new file mode 100644 index 000000000..1964d212a --- /dev/null +++ b/kernel/fs/cifs/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for Linux CIFS VFS client +# +obj-$(CONFIG_CIFS) += cifs.o + +cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ + link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ + cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ + readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o + +cifs-$(CONFIG_CIFS_ACL) += cifsacl.o + +cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o + +cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o + +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o + +cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o smb2maperror.o smb2transport.o \ + smb2misc.o smb2pdu.o smb2inode.o smb2file.o diff --git a/kernel/fs/cifs/asn1.c b/kernel/fs/cifs/asn1.c new file mode 100644 index 000000000..a3b56544c --- /dev/null +++ b/kernel/fs/cifs/asn1.c @@ -0,0 +1,623 @@ +/* + * The ASB.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in + * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsproto.h" + +/***************************************************************************** + * + * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* Class */ +#define ASN1_UNI 0 /* Universal */ +#define ASN1_APL 1 /* Application */ +#define ASN1_CTX 2 /* Context */ +#define ASN1_PRV 3 /* Private */ + +/* Tag */ +#define ASN1_EOC 0 /* End Of Contents or N/A */ +#define ASN1_BOL 1 /* Boolean */ +#define ASN1_INT 2 /* Integer */ +#define ASN1_BTS 3 /* Bit String */ +#define ASN1_OTS 4 /* Octet String */ +#define ASN1_NUL 5 /* Null */ +#define ASN1_OJI 6 /* Object Identifier */ +#define ASN1_OJD 7 /* Object Description */ +#define ASN1_EXT 8 /* External */ +#define ASN1_ENUM 10 /* Enumerated */ +#define ASN1_SEQ 16 /* Sequence */ +#define ASN1_SET 17 /* Set */ +#define ASN1_NUMSTR 18 /* Numerical String */ +#define ASN1_PRNSTR 19 /* Printable String */ +#define ASN1_TEXSTR 20 /* Teletext String */ +#define ASN1_VIDSTR 21 /* Video String */ +#define ASN1_IA5STR 22 /* IA5 String */ +#define ASN1_UNITIM 23 /* Universal Time */ +#define ASN1_GENTIM 24 /* General Time */ +#define ASN1_GRASTR 25 /* Graphical String */ +#define ASN1_VISSTR 26 /* Visible String */ +#define ASN1_GENSTR 27 /* General String */ + +/* Primitive / Constructed methods*/ +#define ASN1_PRI 0 /* Primitive */ +#define ASN1_CON 1 /* Constructed */ + +/* + * Error codes. + */ +#define ASN1_ERR_NOERROR 0 +#define ASN1_ERR_DEC_EMPTY 2 +#define ASN1_ERR_DEC_EOC_MISMATCH 3 +#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 +#define ASN1_ERR_DEC_BADVALUE 5 + +#define SPNEGO_OID_LEN 7 +#define NTLMSSP_OID_LEN 10 +#define KRB5_OID_LEN 7 +#define KRB5U2U_OID_LEN 8 +#define MSKRB5_OID_LEN 7 +static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; +static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; +static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; +static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; +static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; + +/* + * ASN.1 context. + */ +struct asn1_ctx { + int error; /* Error condition */ + unsigned char *pointer; /* Octet just to be decoded */ + unsigned char *begin; /* First octet */ + unsigned char *end; /* Octet after last octet */ +}; + +/* + * Octet string (not null terminated) + */ +struct asn1_octstr { + unsigned char *data; + unsigned int len; +}; + +static void +asn1_open(struct asn1_ctx *ctx, unsigned char *buf, unsigned int len) +{ + ctx->begin = buf; + ctx->end = buf + len; + ctx->pointer = buf; + ctx->error = ASN1_ERR_NOERROR; +} + +static unsigned char +asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) +{ + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + *ch = *(ctx->pointer)++; + return 1; +} + +#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */ +static unsigned char +asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val) +{ + unsigned char ch; + + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + + ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */ + if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ + *val = *(++(ctx->pointer)); /* value has enum value */ + else + return 0; + + ctx->pointer++; + return 1; +} +#endif + +static unsigned char +asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) +{ + unsigned char ch; + + *tag = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *tag <<= 7; + *tag |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char +asn1_id_decode(struct asn1_ctx *ctx, + unsigned int *cls, unsigned int *con, unsigned int *tag) +{ + unsigned char ch; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *cls = (ch & 0xC0) >> 6; + *con = (ch & 0x20) >> 5; + *tag = (ch & 0x1F); + + if (*tag == 0x1F) { + if (!asn1_tag_decode(ctx, tag)) + return 0; + } + return 1; +} + +static unsigned char +asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len) +{ + unsigned char ch, cnt; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch == 0x80) + *def = 0; + else { + *def = 1; + + if (ch < 0x80) + *len = ch; + else { + cnt = (unsigned char) (ch & 0x7F); + *len = 0; + + while (cnt > 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *len <<= 8; + *len |= ch; + cnt--; + } + } + } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + + return 1; +} + +static unsigned char +asn1_header_decode(struct asn1_ctx *ctx, + unsigned char **eoc, + unsigned int *cls, unsigned int *con, unsigned int *tag) +{ + unsigned int def = 0; + unsigned int len = 0; + + if (!asn1_id_decode(ctx, cls, con, tag)) + return 0; + + if (!asn1_length_decode(ctx, &def, &len)) + return 0; + + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + + if (def) + *eoc = ctx->pointer + len; + else + *eoc = NULL; + return 1; +} + +static unsigned char +asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + unsigned char ch; + + if (eoc == NULL) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + return 1; + } else { + if (ctx->pointer != eoc) { + ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; + return 0; + } + return 1; + } +} + +/* static unsigned char asn1_null_decode(struct asn1_ctx *ctx, + unsigned char *eoc) +{ + ctx->pointer = eoc; + return 1; +} + +static unsigned char asn1_long_decode(struct asn1_ctx *ctx, + unsigned char *eoc, long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = (signed char) ch; + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned int *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) + len = 0; + else + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(unsigned int)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) + len = 0; + else + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(unsigned long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char +asn1_octets_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned char **octets, unsigned int *len) +{ + unsigned char *ptr; + + *len = 0; + + *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); + if (*octets == NULL) { + return 0; + } + + ptr = *octets; + while (ctx->pointer < eoc) { + if (!asn1_octet_decode(ctx, (unsigned char *) ptr++)) { + kfree(*octets); + *octets = NULL; + return 0; + } + (*len)++; + } + return 1; +} */ + +static unsigned char +asn1_subid_decode(struct asn1_ctx *ctx, unsigned long *subid) +{ + unsigned char ch; + + *subid = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *subid <<= 7; + *subid |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static int +asn1_oid_decode(struct asn1_ctx *ctx, + unsigned char *eoc, unsigned long **oid, unsigned int *len) +{ + unsigned long subid; + unsigned int size; + unsigned long *optr; + + size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) + return 0; + + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); + if (*oid == NULL) + return 0; + + optr = *oid; + + if (!asn1_subid_decode(ctx, &subid)) { + kfree(*oid); + *oid = NULL; + return 0; + } + + if (subid < 40) { + optr[0] = 0; + optr[1] = subid; + } else if (subid < 80) { + optr[0] = 1; + optr[1] = subid - 40; + } else { + optr[0] = 2; + optr[1] = subid - 80; + } + + *len = 2; + optr += 2; + + while (ctx->pointer < eoc) { + if (++(*len) > size) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + kfree(*oid); + *oid = NULL; + return 0; + } + + if (!asn1_subid_decode(ctx, optr++)) { + kfree(*oid); + *oid = NULL; + return 0; + } + } + return 1; +} + +static int +compare_oid(unsigned long *oid1, unsigned int oid1len, + unsigned long *oid2, unsigned int oid2len) +{ + unsigned int i; + + if (oid1len != oid2len) + return 0; + else { + for (i = 0; i < oid1len; i++) { + if (oid1[i] != oid2[i]) + return 0; + } + return 1; + } +} + + /* BB check for endian conversion issues here */ + +int +decode_negTokenInit(unsigned char *security_blob, int length, + struct TCP_Server_Info *server) +{ + struct asn1_ctx ctx; + unsigned char *end; + unsigned char *sequence_end; + unsigned long *oid = NULL; + unsigned int cls, con, tag, oidlen, rc; + + /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ + + asn1_open(&ctx, security_blob, length); + + /* GSSAPI header */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cifs_dbg(FYI, "Error decoding negTokenInit header\n"); + return 0; + } else if ((cls != ASN1_APL) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag); + return 0; + } + + /* Check for SPNEGO OID -- remember to free obj->oid */ + rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); + if (rc) { + if ((tag == ASN1_OJI) && (con == ASN1_PRI) && + (cls == ASN1_UNI)) { + rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); + if (rc) { + rc = compare_oid(oid, oidlen, SPNEGO_OID, + SPNEGO_OID_LEN); + kfree(oid); + } + } else + rc = 0; + } + + /* SPNEGO OID not present or garbled -- bail out */ + if (!rc) { + cifs_dbg(FYI, "Error decoding negTokenInit header\n"); + return 0; + } + + /* SPNEGO */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cifs_dbg(FYI, "Error decoding negTokenInit\n"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", + cls, con, tag, end, *end); + return 0; + } + + /* negTokenInit */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cifs_dbg(FYI, "Error decoding negTokenInit\n"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", + cls, con, tag, end, *end); + return 0; + } + + /* sequence */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", + cls, con, tag, end, *end); + return 0; + } + + /* sequence of */ + if (asn1_header_decode + (&ctx, &sequence_end, &cls, &con, &tag) == 0) { + cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", + cls, con, tag, end, *end); + return 0; + } + + /* list of security mechanisms */ + while (!asn1_eoc_decode(&ctx, sequence_end)) { + rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); + if (!rc) { + cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n"); + return 0; + } + if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { + if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { + + cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n", + oidlen, *oid, *(oid + 1), *(oid + 2), + *(oid + 3)); + + if (compare_oid(oid, oidlen, MSKRB5_OID, + MSKRB5_OID_LEN)) + server->sec_mskerberos = true; + else if (compare_oid(oid, oidlen, KRB5U2U_OID, + KRB5U2U_OID_LEN)) + server->sec_kerberosu2u = true; + else if (compare_oid(oid, oidlen, KRB5_OID, + KRB5_OID_LEN)) + server->sec_kerberos = true; + else if (compare_oid(oid, oidlen, NTLMSSP_OID, + NTLMSSP_OID_LEN)) + server->sec_ntlmssp = true; + + kfree(oid); + } + } else { + cifs_dbg(FYI, "Should be an oid what is going on?\n"); + } + } + + /* + * We currently ignore anything at the end of the SPNEGO blob after + * the mechTypes have been parsed, since none of that info is + * used at the moment. + */ + return 1; +} diff --git a/kernel/fs/cifs/cache.c b/kernel/fs/cifs/cache.c new file mode 100644 index 000000000..6c665bf4a --- /dev/null +++ b/kernel/fs/cifs/cache.c @@ -0,0 +1,333 @@ +/* + * fs/cifs/cache.c - CIFS filesystem cache index structure definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifs_debug.h" + +/* + * CIFS filesystem definition for FS-Cache + */ +struct fscache_netfs cifs_fscache_netfs = { + .name = "cifs", + .version = 0, +}; + +/* + * Register CIFS for caching with FS-Cache + */ +int cifs_fscache_register(void) +{ + return fscache_register_netfs(&cifs_fscache_netfs); +} + +/* + * Unregister CIFS for caching + */ +void cifs_fscache_unregister(void) +{ + fscache_unregister_netfs(&cifs_fscache_netfs); +} + +/* + * Key layout of CIFS server cache index object + */ +struct cifs_server_key { + uint16_t family; /* address family */ + __be16 port; /* IP port */ + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + } addr[0]; +}; + +/* + * Server object keyed by {IPaddress,port,family} tuple + */ +static uint16_t cifs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct TCP_Server_Info *server = cookie_netfs_data; + const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; + const struct sockaddr_in *addr = (struct sockaddr_in *) sa; + const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; + struct cifs_server_key *key = buffer; + uint16_t key_len = sizeof(struct cifs_server_key); + + memset(key, 0, key_len); + + /* + * Should not be a problem as sin_family/sin6_family overlays + * sa_family field + */ + switch (sa->sa_family) { + case AF_INET: + key->family = sa->sa_family; + key->port = addr->sin_port; + key->addr[0].ipv4_addr = addr->sin_addr; + key_len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->family = sa->sa_family; + key->port = addr6->sin6_port; + key->addr[0].ipv6_addr = addr6->sin6_addr; + key_len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); + key_len = 0; + break; + } + + return key_len; +} + +/* + * Server object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_server_index_def = { + .name = "CIFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_server_get_key, +}; + +/* + * Auxiliary data attached to CIFS superblock within the cache + */ +struct cifs_fscache_super_auxdata { + u64 resource_id; /* unique server resource id */ +}; + +static char *extract_sharename(const char *treename) +{ + const char *src; + char *delim, *dst; + int len; + + /* skip double chars at the beginning */ + src = treename + 2; + + /* share name is always preceded by '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + delim++; + len = strlen(delim); + + /* caller has to free the memory */ + dst = kstrndup(delim, len, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + return dst; +} + +/* + * Superblock object currently keyed by share name + */ +static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + const struct cifs_tcon *tcon = cookie_netfs_data; + char *sharename; + uint16_t len; + + sharename = extract_sharename(tcon->treeName); + if (IS_ERR(sharename)) { + cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); + sharename = NULL; + return 0; + } + + len = strlen(sharename); + if (len > maxbuf) + return 0; + + memcpy(buffer, sharename, len); + + kfree(sharename); + + return len; +} + +static uint16_t +cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifs_tcon *tcon = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifs_tcon *tcon = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Superblock object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_super_index_def = { + .name = "CIFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_super_get_key, + .get_aux = cifs_fscache_super_get_aux, + .check_aux = cifs_fscache_super_check_aux, +}; + +/* + * Auxiliary data attached to CIFS inode within the cache + */ +struct cifs_fscache_inode_auxdata { + struct timespec last_write_time; + struct timespec last_change_time; + u64 eof; +}; + +static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + uint16_t keylen; + + /* use the UniqueId as the key */ + keylen = sizeof(cifsi->uniqueid); + if (keylen > maxbuf) + keylen = 0; + else + memcpy(buffer, &cifsi->uniqueid, keylen); + + return keylen; +} + +static void +cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + *size = cifsi->vfs_inode.i_size; +} + +static uint16_t +cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_inode_auxdata auxdata; + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_inode_auxdata auxdata; + struct cifsInodeInfo *cifsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct cifsInodeInfo *cifsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); + + for (;;) { + nr_pages = pagevec_lookup(&pvec, + cifsi->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def cifs_fscache_inode_object_def = { + .name = "CIFS.uniqueid", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = cifs_fscache_inode_get_key, + .get_attr = cifs_fscache_inode_get_attr, + .get_aux = cifs_fscache_inode_get_aux, + .check_aux = cifs_fscache_inode_check_aux, + .now_uncached = cifs_fscache_inode_now_uncached, +}; diff --git a/kernel/fs/cifs/cifs_debug.c b/kernel/fs/cifs/cifs_debug.c new file mode 100644 index 000000000..7febcf247 --- /dev/null +++ b/kernel/fs/cifs/cifs_debug.c @@ -0,0 +1,701 @@ +/* + * fs/cifs_debug.c + * + * Copyright (C) International Business Machines Corp., 2000,2005 + * + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +void +cifs_dump_mem(char *label, void *data, int length) +{ + pr_debug("%s: dump of %d bytes of data at 0x%p\n", label, length, data); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16, 4, + data, length, true); +} + +#ifdef CONFIG_CIFS_DEBUG +void cifs_vfs_err(const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("CIFS VFS: %pV", &vaf); + + va_end(args); +} +#endif + +void cifs_dump_detail(void *buf) +{ +#ifdef CONFIG_CIFS_DEBUG2 + struct smb_hdr *smb = (struct smb_hdr *)buf; + + cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n", + smb->Command, smb->Status.CifsError, + smb->Flags, smb->Flags2, smb->Mid, smb->Pid); + cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb)); +#endif /* CONFIG_CIFS_DEBUG2 */ +} + +void cifs_dump_mids(struct TCP_Server_Info *server) +{ +#ifdef CONFIG_CIFS_DEBUG2 + struct list_head *tmp; + struct mid_q_entry *mid_entry; + + if (server == NULL) + return; + + cifs_dbg(VFS, "Dump pending requests:\n"); + spin_lock(&GlobalMid_Lock); + list_for_each(tmp, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); +#ifdef CONFIG_CIFS_STATS2 + cifs_dbg(VFS, "IsLarge: %d buf: %p time rcv: %ld now: %ld\n", + mid_entry->large_buf, + mid_entry->resp_buf, + mid_entry->when_received, + jiffies); +#endif /* STATS2 */ + cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n", + mid_entry->multiRsp, mid_entry->multiEnd); + if (mid_entry->resp_buf) { + cifs_dump_detail(mid_entry->resp_buf); + cifs_dump_mem("existing buf: ", + mid_entry->resp_buf, 62); + } + } + spin_unlock(&GlobalMid_Lock); +#endif /* CONFIG_CIFS_DEBUG2 */ +} + +#ifdef CONFIG_PROC_FS +static int cifs_debug_data_proc_show(struct seq_file *m, void *v) +{ + struct list_head *tmp1, *tmp2, *tmp3; + struct mid_q_entry *mid_entry; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + int i, j; + __u32 dev_type; + + seq_puts(m, + "Display Internal CIFS Data Structures for Debugging\n" + "---------------------------------------------------\n"); + seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); + seq_printf(m, "Features:"); +#ifdef CONFIG_CIFS_DFS_UPCALL + seq_printf(m, " dfs"); +#endif +#ifdef CONFIG_CIFS_FSCACHE + seq_printf(m, " fscache"); +#endif +#ifdef CONFIG_CIFS_WEAK_PW_HASH + seq_printf(m, " lanman"); +#endif +#ifdef CONFIG_CIFS_POSIX + seq_printf(m, " posix"); +#endif +#ifdef CONFIG_CIFS_UPCALL + seq_printf(m, " spnego"); +#endif +#ifdef CONFIG_CIFS_XATTR + seq_printf(m, " xattr"); +#endif +#ifdef CONFIG_CIFS_ACL + seq_printf(m, " acl"); +#endif + seq_putc(m, '\n'); + seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); + seq_printf(m, "Servers:"); + + i = 0; + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + i++; + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + if ((ses->serverDomain == NULL) || + (ses->serverOS == NULL) || + (ses->serverNOS == NULL)) { + seq_printf(m, "\n%d) entry for %s not fully " + "displayed\n\t", i, ses->serverName); + } else { + seq_printf(m, + "\n%d) Name: %s Domain: %s Uses: %d OS:" + " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB" + " session status: %d\t", + i, ses->serverName, ses->serverDomain, + ses->ses_count, ses->serverOS, ses->serverNOS, + ses->capabilities, ses->status); + } + seq_printf(m, "TCP status: %d\n\tLocal Users To " + "Server: %d SecMode: 0x%x Req On Wire: %d", + server->tcpStatus, server->srv_count, + server->sec_mode, in_flight(server)); + +#ifdef CONFIG_CIFS_STATS2 + seq_printf(m, " In Send: %d In MaxReq Wait: %d", + atomic_read(&server->in_send), + atomic_read(&server->num_waiters)); +#endif + + seq_puts(m, "\n\tShares:"); + j = 0; + list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, struct cifs_tcon, + tcon_list); + ++j; + dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); + seq_printf(m, "\n\t%d) %s Mounts: %d ", j, + tcon->treeName, tcon->tc_count); + if (tcon->nativeFileSystem) { + seq_printf(m, "Type: %s ", + tcon->nativeFileSystem); + } + seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" + "\n\tPathComponentMax: %d Status: %d", + le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), + le32_to_cpu(tcon->fsAttrInfo.Attributes), + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), + tcon->tidStatus); + if (dev_type == FILE_DEVICE_DISK) + seq_puts(m, " type: DISK "); + else if (dev_type == FILE_DEVICE_CD_ROM) + seq_puts(m, " type: CDROM "); + else + seq_printf(m, " type: %d ", dev_type); + if (server->ops->dump_share_caps) + server->ops->dump_share_caps(m, tcon); + + if (tcon->need_reconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_putc(m, '\n'); + } + + seq_puts(m, "\n\tMIDs:\n"); + + spin_lock(&GlobalMid_Lock); + list_for_each(tmp3, &server->pending_mid_q) { + mid_entry = list_entry(tmp3, struct mid_q_entry, + qhead); + seq_printf(m, "\tState: %d com: %d pid:" + " %d cbdata: %p mid %llu\n", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); + } + spin_unlock(&GlobalMid_Lock); + } + } + spin_unlock(&cifs_tcp_ses_lock); + seq_putc(m, '\n'); + + /* BB add code to dump additional info such as TCP session info now */ + return 0; +} + +static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_debug_data_proc_show, NULL); +} + +static const struct file_operations cifs_debug_data_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_debug_data_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_CIFS_STATS +static ssize_t cifs_stats_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + bool bv; + int rc; + struct list_head *tmp1, *tmp2, *tmp3; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + rc = get_user(c, buffer); + if (rc) + return rc; + + if (strtobool(&c, &bv) == 0) { +#ifdef CONFIG_CIFS_STATS2 + atomic_set(&totBufAllocCount, 0); + atomic_set(&totSmBufAllocCount, 0); +#endif /* CONFIG_CIFS_STATS2 */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, + struct cifs_tcon, + tcon_list); + atomic_set(&tcon->num_smbs_sent, 0); + if (server->ops->clear_stats) + server->ops->clear_stats(tcon); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + } + + return count; +} + +static int cifs_stats_proc_show(struct seq_file *m, void *v) +{ + int i; + struct list_head *tmp1, *tmp2, *tmp3; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + seq_printf(m, + "Resources in use\nCIFS Session: %d\n", + sesInfoAllocCount.counter); + seq_printf(m, "Share (unique mount targets): %d\n", + tconInfoAllocCount.counter); + seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", + bufAllocCount.counter, + cifs_min_rcv + tcpSesAllocCount.counter); + seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", + smBufAllocCount.counter, cifs_min_small); +#ifdef CONFIG_CIFS_STATS2 + seq_printf(m, "Total Large %d Small %d Allocations\n", + atomic_read(&totBufAllocCount), + atomic_read(&totSmBufAllocCount)); +#endif /* CONFIG_CIFS_STATS2 */ + + seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount)); + seq_printf(m, + "\n%d session %d share reconnects\n", + tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); + + seq_printf(m, + "Total vfs operations: %d maximum at one time: %d\n", + GlobalCurrentXid, GlobalMaxActiveXid); + + i = 0; + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, + struct cifs_tcon, + tcon_list); + i++; + seq_printf(m, "\n%d) %s", i, tcon->treeName); + if (tcon->need_reconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_printf(m, "\nSMBs: %d", + atomic_read(&tcon->num_smbs_sent)); + if (server->ops->print_stats) + server->ops->print_stats(m, tcon); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + + seq_putc(m, '\n'); + return 0; +} + +static int cifs_stats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_stats_proc_show, NULL); +} + +static const struct file_operations cifs_stats_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_stats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_stats_proc_write, +}; +#endif /* STATS */ + +static struct proc_dir_entry *proc_fs_cifs; +static const struct file_operations cifsFYI_proc_fops; +static const struct file_operations cifs_lookup_cache_proc_fops; +static const struct file_operations traceSMB_proc_fops; +static const struct file_operations cifs_security_flags_proc_fops; +static const struct file_operations cifs_linux_ext_proc_fops; + +void +cifs_proc_init(void) +{ + proc_fs_cifs = proc_mkdir("fs/cifs", NULL); + if (proc_fs_cifs == NULL) + return; + + proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); + +#ifdef CONFIG_CIFS_STATS + proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); +#endif /* STATS */ + proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); + proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, + &cifs_linux_ext_proc_fops); + proc_create("SecurityFlags", 0, proc_fs_cifs, + &cifs_security_flags_proc_fops); + proc_create("LookupCacheEnabled", 0, proc_fs_cifs, + &cifs_lookup_cache_proc_fops); +} + +void +cifs_proc_clean(void) +{ + if (proc_fs_cifs == NULL) + return; + + remove_proc_entry("DebugData", proc_fs_cifs); + remove_proc_entry("cifsFYI", proc_fs_cifs); + remove_proc_entry("traceSMB", proc_fs_cifs); +#ifdef CONFIG_CIFS_STATS + remove_proc_entry("Stats", proc_fs_cifs); +#endif + remove_proc_entry("SecurityFlags", proc_fs_cifs); + remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); + remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); + remove_proc_entry("fs/cifs", NULL); +} + +static int cifsFYI_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", cifsFYI); + return 0; +} + +static int cifsFYI_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifsFYI_proc_show, NULL); +} + +static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char c; + bool bv; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (strtobool(&c, &bv) == 0) + cifsFYI = bv; + else if ((c > '1') && (c <= '9')) + cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */ + + return count; +} + +static const struct file_operations cifsFYI_proc_fops = { + .owner = THIS_MODULE, + .open = cifsFYI_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifsFYI_proc_write, +}; + +static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", linuxExtEnabled); + return 0; +} + +static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_linux_ext_proc_show, NULL); +} + +static ssize_t cifs_linux_ext_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + bool bv; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + + rc = strtobool(&c, &bv); + if (rc) + return rc; + + linuxExtEnabled = bv; + + return count; +} + +static const struct file_operations cifs_linux_ext_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_linux_ext_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_linux_ext_proc_write, +}; + +static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", lookupCacheEnabled); + return 0; +} + +static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_lookup_cache_proc_show, NULL); +} + +static ssize_t cifs_lookup_cache_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + bool bv; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + + rc = strtobool(&c, &bv); + if (rc) + return rc; + + lookupCacheEnabled = bv; + + return count; +} + +static const struct file_operations cifs_lookup_cache_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_lookup_cache_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_lookup_cache_proc_write, +}; + +static int traceSMB_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", traceSMB); + return 0; +} + +static int traceSMB_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, traceSMB_proc_show, NULL); +} + +static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char c; + bool bv; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + + rc = strtobool(&c, &bv); + if (rc) + return rc; + + traceSMB = bv; + + return count; +} + +static const struct file_operations traceSMB_proc_fops = { + .owner = THIS_MODULE, + .open = traceSMB_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = traceSMB_proc_write, +}; + +static int cifs_security_flags_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x\n", global_secflags); + return 0; +} + +static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_security_flags_proc_show, NULL); +} + +/* + * Ensure that if someone sets a MUST flag, that we disable all other MAY + * flags except for the ones corresponding to the given MUST flag. If there are + * multiple MUST flags, then try to prefer more secure ones. + */ +static void +cifs_security_flags_handle_must_flags(unsigned int *flags) +{ + unsigned int signflags = *flags & CIFSSEC_MUST_SIGN; + + if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) + *flags = CIFSSEC_MUST_KRB5; + else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) + *flags = CIFSSEC_MUST_NTLMSSP; + else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) + *flags = CIFSSEC_MUST_NTLMV2; + else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM) + *flags = CIFSSEC_MUST_NTLM; + else if (CIFSSEC_MUST_LANMAN && + (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN) + *flags = CIFSSEC_MUST_LANMAN; + else if (CIFSSEC_MUST_PLNTXT && + (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT) + *flags = CIFSSEC_MUST_PLNTXT; + + *flags |= signflags; +} + +static ssize_t cifs_security_flags_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + int rc; + unsigned int flags; + char flags_string[12]; + char c; + bool bv; + + if ((count < 1) || (count > 11)) + return -EINVAL; + + memset(flags_string, 0, 12); + + if (copy_from_user(flags_string, buffer, count)) + return -EFAULT; + + if (count < 3) { + /* single char or single char followed by null */ + c = flags_string[0]; + if (strtobool(&c, &bv) == 0) { + global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF; + return count; + } else if (!isdigit(c)) { + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", + flags_string); + return -EINVAL; + } + } + + /* else we have a number */ + rc = kstrtouint(flags_string, 0, &flags); + if (rc) { + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", + flags_string); + return rc; + } + + cifs_dbg(FYI, "sec flags 0x%x\n", flags); + + if (flags == 0) { + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string); + return -EINVAL; + } + + if (flags & ~CIFSSEC_MASK) { + cifs_dbg(VFS, "Unsupported security flags: 0x%x\n", + flags & ~CIFSSEC_MASK); + return -EINVAL; + } + + cifs_security_flags_handle_must_flags(&flags); + + /* flags look ok - update the global security flags for cifs module */ + global_secflags = flags; + if (global_secflags & CIFSSEC_MUST_SIGN) { + /* requiring signing implies signing is allowed */ + global_secflags |= CIFSSEC_MAY_SIGN; + cifs_dbg(FYI, "packet signing now required\n"); + } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) { + cifs_dbg(FYI, "packet signing disabled\n"); + } + /* BB should we turn on MAY flags for other MUST options? */ + return count; +} + +static const struct file_operations cifs_security_flags_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_security_flags_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_security_flags_proc_write, +}; +#else +inline void cifs_proc_init(void) +{ +} + +inline void cifs_proc_clean(void) +{ +} +#endif /* PROC_FS */ diff --git a/kernel/fs/cifs/cifs_debug.h b/kernel/fs/cifs/cifs_debug.h new file mode 100644 index 000000000..f40fbaca1 --- /dev/null +++ b/kernel/fs/cifs/cifs_debug.h @@ -0,0 +1,77 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000,2002 + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ + +#ifndef _H_CIFS_DEBUG +#define _H_CIFS_DEBUG + +void cifs_dump_mem(char *label, void *data, int length); +void cifs_dump_detail(void *); +void cifs_dump_mids(struct TCP_Server_Info *); +extern int traceSMB; /* flag which enables the function below */ +void dump_smb(void *, int); +#define CIFS_INFO 0x01 +#define CIFS_RC 0x02 +#define CIFS_TIMER 0x04 + +#define VFS 1 +#define FYI 2 +extern int cifsFYI; +#ifdef CONFIG_CIFS_DEBUG2 +#define NOISY 4 +#else +#define NOISY 0 +#endif + +/* + * debug ON + * -------- + */ +#ifdef CONFIG_CIFS_DEBUG + +__printf(1, 2) void cifs_vfs_err(const char *fmt, ...); + +/* information message: e.g., configuration, major event */ +#define cifs_dbg(type, fmt, ...) \ +do { \ + if (type == FYI) { \ + if (cifsFYI & CIFS_INFO) { \ + pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \ + } \ + } else if (type == VFS) { \ + cifs_vfs_err(fmt, ##__VA_ARGS__); \ + } else if (type == NOISY && type != 0) { \ + pr_debug(fmt, ##__VA_ARGS__); \ + } \ +} while (0) + +/* + * debug OFF + * --------- + */ +#else /* _CIFS_DEBUG */ +#define cifs_dbg(type, fmt, ...) \ +do { \ + if (0) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) +#endif + +#endif /* _H_CIFS_DEBUG */ diff --git a/kernel/fs/cifs/cifs_dfs_ref.c b/kernel/fs/cifs/cifs_dfs_ref.c new file mode 100644 index 000000000..7dc886c9a --- /dev/null +++ b/kernel/fs/cifs/cifs_dfs_ref.c @@ -0,0 +1,379 @@ +/* + * Contains the CIFS DFS referral mounting routines used for handling + * traversal via DFS junction point + * + * Copyright (c) 2007 Igor Mammedov + * Copyright (C) International Business Machines Corp., 2008 + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifsfs.h" +#include "dns_resolve.h" +#include "cifs_debug.h" +#include "cifs_unicode.h" + +static LIST_HEAD(cifs_dfs_automount_list); + +static void cifs_dfs_expire_automounts(struct work_struct *work); +static DECLARE_DELAYED_WORK(cifs_dfs_automount_task, + cifs_dfs_expire_automounts); +static int cifs_dfs_mountpoint_expiry_timeout = 500 * HZ; + +static void cifs_dfs_expire_automounts(struct work_struct *work) +{ + struct list_head *list = &cifs_dfs_automount_list; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); +} + +void cifs_dfs_release_automount_timer(void) +{ + BUG_ON(!list_empty(&cifs_dfs_automount_list)); + cancel_delayed_work_sync(&cifs_dfs_automount_task); +} + +/** + * cifs_build_devname - build a devicename from a UNC and optional prepath + * @nodename: pointer to UNC string + * @prepath: pointer to prefixpath (or NULL if there isn't one) + * + * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer + * big enough to hold the final thing. Copy the UNC from the nodename, and + * concatenate the prepath onto the end of it if there is one. + * + * Returns pointer to the built string, or a ERR_PTR. Caller is responsible + * for freeing the returned string. + */ +static char * +cifs_build_devname(char *nodename, const char *prepath) +{ + size_t pplen; + size_t unclen; + char *dev; + char *pos; + + /* skip over any preceding delimiters */ + nodename += strspn(nodename, "\\"); + if (!*nodename) + return ERR_PTR(-EINVAL); + + /* get length of UNC and set pos to last char */ + unclen = strlen(nodename); + pos = nodename + unclen - 1; + + /* trim off any trailing delimiters */ + while (*pos == '\\') { + --pos; + --unclen; + } + + /* allocate a buffer: + * +2 for preceding "//" + * +1 for delimiter between UNC and prepath + * +1 for trailing NULL + */ + pplen = prepath ? strlen(prepath) : 0; + dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + pos = dev; + /* add the initial "//" */ + *pos = '/'; + ++pos; + *pos = '/'; + ++pos; + + /* copy in the UNC portion from referral */ + memcpy(pos, nodename, unclen); + pos += unclen; + + /* copy the prefixpath remainder (if there is one) */ + if (pplen) { + *pos = '/'; + ++pos; + memcpy(pos, prepath, pplen); + pos += pplen; + } + + /* NULL terminator */ + *pos = '\0'; + + convert_delimiter(dev, '/'); + return dev; +} + + +/** + * cifs_compose_mount_options - creates mount options for refferral + * @sb_mountdata: parent/root DFS mount options (template) + * @fullpath: full path in UNC format + * @ref: server's referral + * @devname: pointer for saving device name + * + * creates mount options for submount based on template options sb_mountdata + * and replacing unc,ip,prefixpath options with ones we've got form ref_unc. + * + * Returns: pointer to new mount options or ERR_PTR. + * Caller is responcible for freeing retunrned value if it is not error. + */ +char *cifs_compose_mount_options(const char *sb_mountdata, + const char *fullpath, + const struct dfs_info3_param *ref, + char **devname) +{ + int rc; + char *mountdata = NULL; + const char *prepath = NULL; + int md_len; + char *tkn_e; + char *srvIP = NULL; + char sep = ','; + int off, noff; + + if (sb_mountdata == NULL) + return ERR_PTR(-EINVAL); + + if (strlen(fullpath) - ref->path_consumed) + prepath = fullpath + ref->path_consumed; + + *devname = cifs_build_devname(ref->node_name, prepath); + if (IS_ERR(*devname)) { + rc = PTR_ERR(*devname); + *devname = NULL; + goto compose_mount_options_err; + } + + rc = dns_resolve_server_name_to_ip(*devname, &srvIP); + if (rc < 0) { + cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n", + __func__, *devname, rc); + goto compose_mount_options_err; + } + + /* + * In most cases, we'll be building a shorter string than the original, + * but we do have to assume that the address in the ip= option may be + * much longer than the original. Add the max length of an address + * string to the length of the original string to allow for worst case. + */ + md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN; + mountdata = kzalloc(md_len + 1, GFP_KERNEL); + if (mountdata == NULL) { + rc = -ENOMEM; + goto compose_mount_options_err; + } + + /* copy all options except of unc,ip,prefixpath */ + off = 0; + if (strncmp(sb_mountdata, "sep=", 4) == 0) { + sep = sb_mountdata[4]; + strncpy(mountdata, sb_mountdata, 5); + off += 5; + } + + do { + tkn_e = strchr(sb_mountdata + off, sep); + if (tkn_e == NULL) + noff = strlen(sb_mountdata + off); + else + noff = tkn_e - (sb_mountdata + off) + 1; + + if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) { + off += noff; + continue; + } + if (strncasecmp(sb_mountdata + off, "ip=", 3) == 0) { + off += noff; + continue; + } + if (strncasecmp(sb_mountdata + off, "prefixpath=", 11) == 0) { + off += noff; + continue; + } + strncat(mountdata, sb_mountdata + off, noff); + off += noff; + } while (tkn_e); + strcat(mountdata, sb_mountdata + off); + mountdata[md_len] = '\0'; + + /* copy new IP and ref share name */ + if (mountdata[strlen(mountdata) - 1] != sep) + strncat(mountdata, &sep, 1); + strcat(mountdata, "ip="); + strcat(mountdata, srvIP); + + /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ + /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ + +compose_mount_options_out: + kfree(srvIP); + return mountdata; + +compose_mount_options_err: + kfree(mountdata); + mountdata = ERR_PTR(rc); + kfree(*devname); + *devname = NULL; + goto compose_mount_options_out; +} + +/** + * cifs_dfs_do_refmount - mounts specified path using provided refferal + * @cifs_sb: parent/root superblock + * @fullpath: full path in UNC format + * @ref: server's referral + */ +static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, + const char *fullpath, const struct dfs_info3_param *ref) +{ + struct vfsmount *mnt; + char *mountdata; + char *devname = NULL; + + /* strip first '\' from fullpath */ + mountdata = cifs_compose_mount_options(cifs_sb->mountdata, + fullpath + 1, ref, &devname); + + if (IS_ERR(mountdata)) + return (struct vfsmount *)mountdata; + + mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + kfree(mountdata); + kfree(devname); + return mnt; + +} + +static void dump_referral(const struct dfs_info3_param *ref) +{ + cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); + cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); + cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n", + ref->flags, ref->server_type); + cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n", + ref->ref_flag, ref->path_consumed); +} + +/* + * Create a vfsmount that we can automount + */ +static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) +{ + struct dfs_info3_param *referrals = NULL; + unsigned int num_referrals = 0; + struct cifs_sb_info *cifs_sb; + struct cifs_ses *ses; + char *full_path; + unsigned int xid; + int i; + int rc; + struct vfsmount *mnt; + struct tcon_link *tlink; + + cifs_dbg(FYI, "in %s\n", __func__); + BUG_ON(IS_ROOT(mntpt)); + + /* + * The MSDFS spec states that paths in DFS referral requests and + * responses must be prefixed by a single '\' character instead of + * the double backslashes usually used in the UNC. This function + * gives us the latter, so we must adjust the result. + */ + mnt = ERR_PTR(-ENOMEM); + full_path = build_path_from_dentry(mntpt); + if (full_path == NULL) + goto cdda_exit; + + cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + mnt = ERR_CAST(tlink); + goto free_full_path; + } + ses = tlink_tcon(tlink)->ses; + + xid = get_xid(); + rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, + &num_referrals, &referrals, + cifs_remap(cifs_sb)); + free_xid(xid); + + cifs_put_tlink(tlink); + + mnt = ERR_PTR(-ENOENT); + for (i = 0; i < num_referrals; i++) { + int len; + dump_referral(referrals + i); + /* connect to a node */ + len = strlen(referrals[i].node_name); + if (len < 2) { + cifs_dbg(VFS, "%s: Net Address path too short: %s\n", + __func__, referrals[i].node_name); + mnt = ERR_PTR(-EINVAL); + break; + } + mnt = cifs_dfs_do_refmount(cifs_sb, + full_path, referrals + i); + cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", + __func__, referrals[i].node_name, mnt); + if (!IS_ERR(mnt)) + goto success; + } + + /* no valid submounts were found; return error from get_dfs_path() by + * preference */ + if (rc != 0) + mnt = ERR_PTR(rc); + +success: + free_dfs_info_array(referrals, num_referrals); +free_full_path: + kfree(full_path); +cdda_exit: + cifs_dbg(FYI, "leaving %s\n" , __func__); + return mnt; +} + +/* + * Attempt to automount the referral + */ +struct vfsmount *cifs_dfs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + cifs_dbg(FYI, "in %s\n", __func__); + + newmnt = cifs_dfs_do_automount(path->dentry); + if (IS_ERR(newmnt)) { + cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__); + return newmnt; + } + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &cifs_dfs_automount_list); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); + cifs_dbg(FYI, "leaving %s [ok]\n" , __func__); + return newmnt; +} + +const struct inode_operations cifs_dfs_referral_inode_operations = { +}; diff --git a/kernel/fs/cifs/cifs_fs_sb.h b/kernel/fs/cifs/cifs_fs_sb.h new file mode 100644 index 000000000..3182273a3 --- /dev/null +++ b/kernel/fs/cifs/cifs_fs_sb.h @@ -0,0 +1,71 @@ +/* + * fs/cifs/cifs_fs_sb.h + * + * Copyright (c) International Business Machines Corp., 2002,2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ +#include + +#ifndef _CIFS_FS_SB_H +#define _CIFS_FS_SB_H + +#include + +#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ +#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ +#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ +#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */ +#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */ +#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */ +#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible*/ +#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */ +#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */ +#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */ +#define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */ +#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */ +#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ +#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ +#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ +#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ +#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ +#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ +#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ +#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ +#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ +#define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ + +struct cifs_sb_info { + struct rb_root tlink_tree; + spinlock_t tlink_tree_lock; + struct tcon_link *master_tlink; + struct nls_table *local_nls; + unsigned int rsize; + unsigned int wsize; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ + atomic_t active; + kuid_t mnt_uid; + kgid_t mnt_gid; + kuid_t mnt_backupuid; + kgid_t mnt_backupgid; + umode_t mnt_file_mode; + umode_t mnt_dir_mode; + unsigned int mnt_cifs_flags; + char *mountdata; /* options received at mount time or via DFS refs */ + struct backing_dev_info bdi; + struct delayed_work prune_tlinks; + struct rcu_head rcu; +}; +#endif /* _CIFS_FS_SB_H */ diff --git a/kernel/fs/cifs/cifs_spnego.c b/kernel/fs/cifs/cifs_spnego.c new file mode 100644 index 000000000..f4cf200b3 --- /dev/null +++ b/kernel/fs/cifs/cifs_spnego.c @@ -0,0 +1,179 @@ +/* + * fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS + * + * Copyright (c) 2007 Red Hat, Inc. + * Author(s): Jeff Layton (jlayton@redhat.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "cifsglob.h" +#include "cifs_spnego.h" +#include "cifs_debug.h" + +/* create a new cifs key */ +static int +cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) +{ + char *payload; + int ret; + + ret = -ENOMEM; + payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!payload) + goto error; + + /* attach the data */ + key->payload.data = payload; + ret = 0; + +error: + return ret; +} + +static void +cifs_spnego_key_destroy(struct key *key) +{ + kfree(key->payload.data); +} + + +/* + * keytype for CIFS spnego keys + */ +struct key_type cifs_spnego_key_type = { + .name = "cifs.spnego", + .instantiate = cifs_spnego_key_instantiate, + .destroy = cifs_spnego_key_destroy, + .describe = user_describe, +}; + +/* length of longest version string e.g. strlen("ver=0xFF") */ +#define MAX_VER_STR_LEN 8 + +/* length of longest security mechanism name, eg in future could have + * strlen(";sec=ntlmsspi") */ +#define MAX_MECH_STR_LEN 13 + +/* strlen of "host=" */ +#define HOST_KEY_LEN 5 + +/* strlen of ";ip4=" or ";ip6=" */ +#define IP_KEY_LEN 5 + +/* strlen of ";uid=0x" */ +#define UID_KEY_LEN 7 + +/* strlen of ";creduid=0x" */ +#define CREDUID_KEY_LEN 11 + +/* strlen of ";user=" */ +#define USER_KEY_LEN 6 + +/* strlen of ";pid=0x" */ +#define PID_KEY_LEN 7 + +/* get a key struct with a SPNEGO security blob, suitable for session setup */ +struct key * +cifs_get_spnego_key(struct cifs_ses *sesInfo) +{ + struct TCP_Server_Info *server = sesInfo->server; + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; + char *description, *dp; + size_t desc_len; + struct key *spnego_key; + const char *hostname = server->hostname; + + /* length of fields (with semicolons): ver=0xyz ip4=ipaddress + host=hostname sec=mechanism uid=0xFF user=username */ + desc_len = MAX_VER_STR_LEN + + HOST_KEY_LEN + strlen(hostname) + + IP_KEY_LEN + INET6_ADDRSTRLEN + + MAX_MECH_STR_LEN + + UID_KEY_LEN + (sizeof(uid_t) * 2) + + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; + + if (sesInfo->user_name) + desc_len += USER_KEY_LEN + strlen(sesInfo->user_name); + + spnego_key = ERR_PTR(-ENOMEM); + description = kzalloc(desc_len, GFP_KERNEL); + if (description == NULL) + goto out; + + dp = description; + /* start with version and hostname portion of UNC string */ + spnego_key = ERR_PTR(-EINVAL); + sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION, + hostname); + dp = description + strlen(description); + + /* add the server address */ + if (server->dstaddr.ss_family == AF_INET) + sprintf(dp, "ip4=%pI4", &sa->sin_addr); + else if (server->dstaddr.ss_family == AF_INET6) + sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); + else + goto out; + + dp = description + strlen(description); + + /* for now, only sec=krb5 and sec=mskrb5 are valid */ + if (server->sec_kerberos) + sprintf(dp, ";sec=krb5"); + else if (server->sec_mskerberos) + sprintf(dp, ";sec=mskrb5"); + else + goto out; + + dp = description + strlen(description); + sprintf(dp, ";uid=0x%x", + from_kuid_munged(&init_user_ns, sesInfo->linux_uid)); + + dp = description + strlen(description); + sprintf(dp, ";creduid=0x%x", + from_kuid_munged(&init_user_ns, sesInfo->cred_uid)); + + if (sesInfo->user_name) { + dp = description + strlen(description); + sprintf(dp, ";user=%s", sesInfo->user_name); + } + + dp = description + strlen(description); + sprintf(dp, ";pid=0x%x", current->pid); + + cifs_dbg(FYI, "key description = %s\n", description); + spnego_key = request_key(&cifs_spnego_key_type, description, ""); + +#ifdef CONFIG_CIFS_DEBUG2 + if (cifsFYI && !IS_ERR(spnego_key)) { + struct cifs_spnego_msg *msg = spnego_key->payload.data; + cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U, + msg->secblob_len + msg->sesskey_len)); + } +#endif /* CONFIG_CIFS_DEBUG2 */ + +out: + kfree(description); + return spnego_key; +} diff --git a/kernel/fs/cifs/cifs_spnego.h b/kernel/fs/cifs/cifs_spnego.h new file mode 100644 index 000000000..31bef9ee0 --- /dev/null +++ b/kernel/fs/cifs/cifs_spnego.h @@ -0,0 +1,47 @@ +/* + * fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS + * + * Copyright (c) 2007 Red Hat, Inc. + * Author(s): Jeff Layton (jlayton@redhat.com) + * Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFS_SPNEGO_H +#define _CIFS_SPNEGO_H + +#define CIFS_SPNEGO_UPCALL_VERSION 2 + +/* + * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. + * The flags field is for future use. The request-key callout should set + * sesskey_len and secblob_len, and then concatenate the SessKey+SecBlob + * and stuff it in the data field. + */ +struct cifs_spnego_msg { + uint32_t version; + uint32_t flags; + uint32_t sesskey_len; + uint32_t secblob_len; + uint8_t data[1]; +}; + +#ifdef __KERNEL__ +extern struct key_type cifs_spnego_key_type; +extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo); +#endif /* KERNEL */ + +#endif /* _CIFS_SPNEGO_H */ diff --git a/kernel/fs/cifs/cifs_unicode.c b/kernel/fs/cifs/cifs_unicode.c new file mode 100644 index 000000000..5a53ac6b1 --- /dev/null +++ b/kernel/fs/cifs/cifs_unicode.c @@ -0,0 +1,611 @@ +/* + * fs/cifs/cifs_unicode.c + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" +#include "cifs_uniupr.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" + +int cifs_remap(struct cifs_sb_info *cifs_sb) +{ + int map_type; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + map_type = SFM_MAP_UNI_RSVD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + map_type = SFU_MAP_UNI_RSVD; + else + map_type = NO_MAP_UNI_RSVD; + + return map_type; +} + +/* Convert character using the SFU - "Services for Unix" remapping range */ +static bool +convert_sfu_char(const __u16 src_char, char *target) +{ + /* + * BB: Cannot handle remapping UNI_SLASH until all the calls to + * build_path_from_dentry are modified, as they use slash as + * separator. + */ + switch (src_char) { + case UNI_COLON: + *target = ':'; + break; + case UNI_ASTERISK: + *target = '*'; + break; + case UNI_QUESTION: + *target = '?'; + break; + case UNI_PIPE: + *target = '|'; + break; + case UNI_GRTRTHAN: + *target = '>'; + break; + case UNI_LESSTHAN: + *target = '<'; + break; + default: + return false; + } + return true; +} + +/* Convert character using the SFM - "Services for Mac" remapping range */ +static bool +convert_sfm_char(const __u16 src_char, char *target) +{ + switch (src_char) { + case SFM_COLON: + *target = ':'; + break; + case SFM_ASTERISK: + *target = '*'; + break; + case SFM_QUESTION: + *target = '?'; + break; + case SFM_PIPE: + *target = '|'; + break; + case SFM_GRTRTHAN: + *target = '>'; + break; + case SFM_LESSTHAN: + *target = '<'; + break; + case SFM_SLASH: + *target = '\\'; + break; + default: + return false; + } + return true; +} + + +/* + * cifs_mapchar - convert a host-endian char to proper char in codepage + * @target - where converted character should be copied + * @src_char - 2 byte host-endian source character + * @cp - codepage to which character should be converted + * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2? + * + * This function handles the conversion of a single character. It is the + * responsibility of the caller to ensure that the target buffer is large + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). + */ +static int +cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, + int maptype) +{ + int len = 1; + __u16 src_char; + + src_char = *from; + + if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target)) + return len; + else if ((maptype == SFU_MAP_UNI_RSVD) && + convert_sfu_char(src_char, target)) + return len; + + /* if character not one of seven in special remap set */ + len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); + if (len <= 0) + goto surrogate_pair; + + return len; + +surrogate_pair: + /* convert SURROGATE_PAIR and IVS */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); + if (len <= 0) + goto unknown; + return len; + +unknown: + *target = '?'; + len = 1; + return len; +} + +/* + * cifs_from_utf16 - convert utf16le string to local charset + * @to - destination buffer + * @from - source buffer + * @tolen - destination buffer size (in bytes) + * @fromlen - source buffer size (in bytes) + * @codepage - codepage to which characters should be converted + * @mapchar - should characters be remapped according to the mapchars option? + * + * Convert a little-endian utf16le string (as sent by the server) to a string + * in the provided codepage. The tolen and fromlen parameters are to ensure + * that the code doesn't walk off of the end of the buffer (which is always + * a danger if the alignment of the source buffer is off). The destination + * string is always properly null terminated and fits in the destination + * buffer. Returns the length of the destination string in bytes (including + * null terminator). + * + * Note that some windows versions actually send multiword UTF-16 characters + * instead of straight UTF16-2. The linux nls routines however aren't able to + * deal with those characters properly. In the event that we get some of + * those characters, they won't be translated properly. + */ +int +cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, int map_type) +{ + int i, charlen, safelen; + int outlen = 0; + int nullsize = nls_nullsize(codepage); + int fromwords = fromlen / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ + + /* + * because the chars can be of varying widths, we need to take care + * not to overflow the destination buffer when we get close to the + * end of it. Until we get to this offset, we don't need to check + * for overflow however. + */ + safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); + + for (i = 0; i < fromwords; i++) { + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) + break; + if (i + 1 < fromwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < fromwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; + + /* + * check to see if converting this character might make the + * conversion bleed into the null terminator + */ + if (outlen >= safelen) { + charlen = cifs_mapchar(tmp, ftmp, codepage, map_type); + if ((outlen + charlen) > (tolen - nullsize)) + break; + } + + /* put converted char into 'to' buffer */ + charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type); + outlen += charlen; + + /* charlen (=bytes of UTF-8 for 1 character) + * 4bytes UTF-8(surrogate pair) is charlen=4 + * (4bytes UTF-16 code) + * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 + * (2 UTF-8 pairs divided to 2 UTF-16 pairs) */ + if (charlen == 4) + i++; + else if (charlen >= 5) + /* 5-6bytes UTF-8 */ + i += 2; + } + + /* properly null-terminate string */ + for (i = 0; i < nullsize; i++) + to[outlen++] = 0; + + return outlen; +} + +/* + * NAME: cifs_strtoUTF16() + * + * FUNCTION: Convert character string to unicode string + * + */ +int +cifs_strtoUTF16(__le16 *to, const char *from, int len, + const struct nls_table *codepage) +{ + int charlen; + int i; + wchar_t wchar_to; /* needed to quiet sparse */ + + /* special case for utf8 to handle no plane0 chars */ + if (!strcmp(codepage->charset, "utf8")) { + /* + * convert utf8 -> utf16, we assume we have enough space + * as caller should have assumed conversion does not overflow + * in destination len is length in wchar_t units (16bits) + */ + i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN, + (wchar_t *) to, len); + + /* if success terminate and exit */ + if (i >= 0) + goto success; + /* + * if fails fall back to UCS encoding as this + * function should not return negative values + * currently can fail only if source contains + * invalid encoded characters + */ + } + + for (i = 0; len && *from; i++, from += charlen, len -= charlen) { + charlen = codepage->char2uni(from, len, &wchar_to); + if (charlen < 1) { + cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n", + *from, charlen); + /* A question mark */ + wchar_to = 0x003f; + charlen = 1; + } + put_unaligned_le16(wchar_to, &to[i]); + } + +success: + put_unaligned_le16(0, &to[i]); + return i; +} + +/* + * cifs_utf16_bytes - how long will a string be after conversion? + * @utf16 - pointer to input string + * @maxbytes - don't go past this many bytes of input string + * @codepage - destination codepage + * + * Walk a utf16le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + */ +int +cifs_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp[3]; + + for (i = 0; i < maxwords; i++) { + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) + break; + if (i + 1 < maxwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < maxwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; + + charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD); + outlen += charlen; + } + + return outlen; +} + +/* + * cifs_strndup_from_utf16 - copy a string from wire format to the local + * codepage + * @src - source string + * @maxlen - don't walk past this many bytes in the source string + * @is_unicode - is this a unicode string? + * @codepage - destination codepage + * + * Take a string given by the server, convert it to the local codepage and + * put it in a new buffer. Returns a pointer to the new string or NULL on + * error. + */ +char * +cifs_strndup_from_utf16(const char *src, const int maxlen, + const bool is_unicode, const struct nls_table *codepage) +{ + int len; + char *dst; + + if (is_unicode) { + len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage); + len += nls_nullsize(codepage); + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return NULL; + cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage, + NO_MAP_UNI_RSVD); + } else { + len = strnlen(src, maxlen); + len++; + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return NULL; + strlcpy(dst, src, len); + } + + return dst; +} + +static __le16 convert_to_sfu_char(char src_char) +{ + __le16 dest_char; + + switch (src_char) { + case ':': + dest_char = cpu_to_le16(UNI_COLON); + break; + case '*': + dest_char = cpu_to_le16(UNI_ASTERISK); + break; + case '?': + dest_char = cpu_to_le16(UNI_QUESTION); + break; + case '<': + dest_char = cpu_to_le16(UNI_LESSTHAN); + break; + case '>': + dest_char = cpu_to_le16(UNI_GRTRTHAN); + break; + case '|': + dest_char = cpu_to_le16(UNI_PIPE); + break; + default: + dest_char = 0; + } + + return dest_char; +} + +static __le16 convert_to_sfm_char(char src_char) +{ + __le16 dest_char; + + switch (src_char) { + case ':': + dest_char = cpu_to_le16(SFM_COLON); + break; + case '*': + dest_char = cpu_to_le16(SFM_ASTERISK); + break; + case '?': + dest_char = cpu_to_le16(SFM_QUESTION); + break; + case '<': + dest_char = cpu_to_le16(SFM_LESSTHAN); + break; + case '>': + dest_char = cpu_to_le16(SFM_GRTRTHAN); + break; + case '|': + dest_char = cpu_to_le16(SFM_PIPE); + break; + default: + dest_char = 0; + } + + return dest_char; +} + +/* + * Convert 16 bit Unicode pathname to wire format from string in current code + * page. Conversion may involve remapping up the six characters that are + * only legal in POSIX-like OS (if they are present in the string). Path + * names are little endian 16 bit Unicode on the wire + */ +int +cifsConvertToUTF16(__le16 *target, const char *source, int srclen, + const struct nls_table *cp, int map_chars) +{ + int i, charlen; + int j = 0; + char src_char; + __le16 dst_char; + wchar_t tmp; + wchar_t *wchar_to; /* UTF-16 */ + int ret; + unicode_t u; + + if (map_chars == NO_MAP_UNI_RSVD) + return cifs_strtoUTF16(target, source, PATH_MAX, cp); + + wchar_to = kzalloc(6, GFP_KERNEL); + + for (i = 0; i < srclen; j++) { + src_char = source[i]; + charlen = 1; + + /* check if end of string */ + if (src_char == 0) + goto ctoUTF16_out; + + /* see if we must remap this char */ + if (map_chars == SFU_MAP_UNI_RSVD) + dst_char = convert_to_sfu_char(src_char); + else if (map_chars == SFM_MAP_UNI_RSVD) + dst_char = convert_to_sfm_char(src_char); + else + dst_char = 0; + /* + * FIXME: We can not handle remapping backslash (UNI_SLASH) + * until all the calls to build_path_from_dentry are modified, + * as they use backslash as separator. + */ + if (dst_char == 0) { + charlen = cp->char2uni(source + i, srclen - i, &tmp); + dst_char = cpu_to_le16(tmp); + + /* + * if no match, use question mark, which at least in + * some cases serves as wild card + */ + if (charlen > 0) + goto ctoUTF16; + + /* convert SURROGATE_PAIR */ + if (strcmp(cp->charset, "utf8") || !wchar_to) + goto unknown; + if (*(source + i) & 0x80) { + charlen = utf8_to_utf32(source + i, 6, &u); + if (charlen < 0) + goto unknown; + } else + goto unknown; + ret = utf8s_to_utf16s(source + i, charlen, + UTF16_LITTLE_ENDIAN, + wchar_to, 6); + if (ret < 0) + goto unknown; + + i += charlen; + dst_char = cpu_to_le16(*wchar_to); + if (charlen <= 3) + /* 1-3bytes UTF-8 to 2bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + else if (charlen == 4) { + /* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 + * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 + * (charlen=3+4 or 4+4) */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + } else if (charlen >= 5) { + /* 5-6bytes UTF-8 to 6bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 2)); + j++; + put_unaligned(dst_char, &target[j]); + } + continue; + +unknown: + dst_char = cpu_to_le16(0x003f); + charlen = 1; + } + +ctoUTF16: + /* + * character may take more than one byte in the source string, + * but will take exactly two bytes in the target string + */ + i += charlen; + put_unaligned(dst_char, &target[j]); + } + +ctoUTF16_out: + put_unaligned(0, &target[j]); /* Null terminate target unicode string */ + kfree(wchar_to); + return j; +} + +#ifdef CONFIG_CIFS_SMB2 +/* + * cifs_local_to_utf16_bytes - how long will a string be after conversion? + * @from - pointer to input string + * @maxbytes - don't go past this many bytes of input string + * @codepage - source codepage + * + * Walk a string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + */ + +static int +cifs_local_to_utf16_bytes(const char *from, int len, + const struct nls_table *codepage) +{ + int charlen; + int i; + wchar_t wchar_to; + + for (i = 0; len && *from; i++, from += charlen, len -= charlen) { + charlen = codepage->char2uni(from, len, &wchar_to); + /* Failed conversion defaults to a question mark */ + if (charlen < 1) + charlen = 1; + } + return 2 * i; /* UTF16 characters are two bytes */ +} + +/* + * cifs_strndup_to_utf16 - copy a string to wire format from the local codepage + * @src - source string + * @maxlen - don't walk past this many bytes in the source string + * @utf16_len - the length of the allocated string in bytes (including null) + * @cp - source codepage + * @remap - map special chars + * + * Take a string convert it from the local codepage to UTF16 and + * put it in a new buffer. Returns a pointer to the new string or NULL on + * error. + */ +__le16 * +cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len, + const struct nls_table *cp, int remap) +{ + int len; + __le16 *dst; + + len = cifs_local_to_utf16_bytes(src, maxlen, cp); + len += 2; /* NULL */ + dst = kmalloc(len, GFP_KERNEL); + if (!dst) { + *utf16_len = 0; + return NULL; + } + cifsConvertToUTF16(dst, src, strlen(src), cp, remap); + *utf16_len = len; + return dst; +} +#endif /* CONFIG_CIFS_SMB2 */ diff --git a/kernel/fs/cifs/cifs_unicode.h b/kernel/fs/cifs/cifs_unicode.h new file mode 100644 index 000000000..bdc52cb9a --- /dev/null +++ b/kernel/fs/cifs/cifs_unicode.h @@ -0,0 +1,418 @@ +/* + * cifs_unicode: Unicode kernel case support + * + * Function: + * Convert a unicode character to upper or lower case using + * compressed tables. + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Notes: + * These APIs are based on the C library functions. The semantics + * should match the C functions but with expanded size operands. + * + * The upper/lower functions are based on a table created by mkupr. + * This is a compressed table of upper and lower case conversion. + * + */ +#ifndef _CIFS_UNICODE_H +#define _CIFS_UNICODE_H + +#include +#include +#include + +#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ + +/* + * Windows maps these to the user defined 16 bit Unicode range since they are + * reserved symbols (along with \ and /), otherwise illegal to store + * in filenames in NTFS + */ +#define UNI_ASTERISK (__u16) ('*' + 0xF000) +#define UNI_QUESTION (__u16) ('?' + 0xF000) +#define UNI_COLON (__u16) (':' + 0xF000) +#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) +#define UNI_LESSTHAN (__u16) ('<' + 0xF000) +#define UNI_PIPE (__u16) ('|' + 0xF000) +#define UNI_SLASH (__u16) ('\\' + 0xF000) + +/* + * Macs use an older "SFM" mapping of the symbols above. Fortunately it does + * not conflict (although almost does) with the mapping above. + */ + +#define SFM_ASTERISK ((__u16) 0xF021) +#define SFM_QUESTION ((__u16) 0xF025) +#define SFM_COLON ((__u16) 0xF022) +#define SFM_GRTRTHAN ((__u16) 0xF024) +#define SFM_LESSTHAN ((__u16) 0xF023) +#define SFM_PIPE ((__u16) 0xF027) +#define SFM_SLASH ((__u16) 0xF026) + +/* + * Mapping mechanism to use when one of the seven reserved characters is + * encountered. We can only map using one of the mechanisms at a time + * since otherwise readdir could return directory entries which we would + * not be able to open + * + * NO_MAP_UNI_RSVD = do not perform any remapping of the character + * SFM_MAP_UNI_RSVD = map reserved characters using SFM scheme (MAC compatible) + * SFU_MAP_UNI_RSVD = map reserved characters ala SFU ("mapchars" option) + * + */ +#define NO_MAP_UNI_RSVD 0 +#define SFM_MAP_UNI_RSVD 1 +#define SFU_MAP_UNI_RSVD 2 + +/* Just define what we want from uniupr.h. We don't want to define the tables + * in each source file. + */ +#ifndef UNICASERANGE_DEFINED +struct UniCaseRange { + wchar_t start; + wchar_t end; + signed char *table; +}; +#endif /* UNICASERANGE_DEFINED */ + +#ifndef UNIUPR_NOUPPER +extern signed char CifsUniUpperTable[512]; +extern const struct UniCaseRange CifsUniUpperRange[]; +#endif /* UNIUPR_NOUPPER */ + +#ifndef UNIUPR_NOLOWER +extern signed char CifsUniLowerTable[512]; +extern const struct UniCaseRange CifsUniLowerRange[]; +#endif /* UNIUPR_NOLOWER */ + +#ifdef __KERNEL__ +int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *cp, int map_type); +int cifs_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage); +int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *); +char *cifs_strndup_from_utf16(const char *src, const int maxlen, + const bool is_unicode, + const struct nls_table *codepage); +extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen, + const struct nls_table *cp, int mapChars); +extern int cifs_remap(struct cifs_sb_info *cifs_sb); +#ifdef CONFIG_CIFS_SMB2 +extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen, + int *utf16_len, const struct nls_table *cp, + int remap); +#endif /* CONFIG_CIFS_SMB2 */ +#endif + +wchar_t cifs_toupper(wchar_t in); + +/* + * UniStrcat: Concatenate the second string to the first + * + * Returns: + * Address of the first string + */ +static inline wchar_t * +UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ + + while (*ucs1++) ; /* To end of first string */ + ucs1--; /* Return to the null */ + while ((*ucs1++ = *ucs2++)) ; /* copy string 2 over */ + return anchor; +} + +/* + * UniStrchr: Find a character in a string + * + * Returns: + * Address of first occurrence of character in string + * or NULL if the character is not in the string + */ +static inline wchar_t * +UniStrchr(const wchar_t *ucs, wchar_t uc) +{ + while ((*ucs != uc) && *ucs) + ucs++; + + if (*ucs == uc) + return (wchar_t *) ucs; + return NULL; +} + +/* + * UniStrcmp: Compare two strings + * + * Returns: + * < 0: First string is less than second + * = 0: Strings are equal + * > 0: First string is greater than second + */ +static inline int +UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) +{ + while ((*ucs1 == *ucs2) && *ucs1) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) *ucs2; +} + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t * +UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)) ; + return anchor; +} + +/* + * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) + */ +static inline size_t +UniStrlen(const wchar_t *ucs1) +{ + int i = 0; + + while (*ucs1++) + i++; + return i; +} + +/* + * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a + * string (length limited) + */ +static inline size_t +UniStrnlen(const wchar_t *ucs1, int maxlen) +{ + int i = 0; + + while (*ucs1++) { + i++; + if (i >= maxlen) + break; + } + return i; +} + +/* + * UniStrncat: Concatenate length limited string + */ +static inline wchar_t * +UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; /* save pointer to string 1 */ + + while (*ucs1++) ; + ucs1--; /* point to null terminator of s1 */ + while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ + ucs1++; + ucs2++; + } + *ucs1 = 0; /* Null terminate the result */ + return (anchor); +} + +/* + * UniStrncmp: Compare length limited string + */ +static inline int +UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == *ucs2) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) *ucs2; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int +UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline wchar_t * +UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_le: Copy length limited string with pad to little-endian + */ +static inline wchar_t * +UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrstr: Find a string in a string + * + * Returns: + * Address of first match found + * NULL if no matching string is found + */ +static inline wchar_t * +UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) +{ + const wchar_t *anchor1 = ucs1; + const wchar_t *anchor2 = ucs2; + + while (*ucs1) { + if (*ucs1 == *ucs2) { + /* Partial match found */ + ucs1++; + ucs2++; + } else { + if (!*ucs2) /* Match found */ + return (wchar_t *) anchor1; + ucs1 = ++anchor1; /* No match */ + ucs2 = anchor2; + } + } + + if (!*ucs2) /* Both end together */ + return (wchar_t *) anchor1; /* Match found */ + return NULL; /* No match */ +} + +#ifndef UNIUPR_NOUPPER +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t +UniToupper(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(CifsUniUpperTable)) { + /* Latin characters */ + return uc + CifsUniUpperTable[uc]; /* Use base tables */ + } else { + rp = CifsUniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + +/* + * UniStrupr: Upper case a unicode string + */ +static inline __le16 * +UniStrupr(register __le16 *upin) +{ + register __le16 *up; + + up = upin; + while (*up) { /* For all characters */ + *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); + up++; + } + return upin; /* Return input pointer */ +} +#endif /* UNIUPR_NOUPPER */ + +#ifndef UNIUPR_NOLOWER +/* + * UniTolower: Convert a unicode character to lower case + */ +static inline wchar_t +UniTolower(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(CifsUniLowerTable)) { + /* Latin characters */ + return uc + CifsUniLowerTable[uc]; /* Use base tables */ + } else { + rp = CifsUniLowerRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + +/* + * UniStrlwr: Lower case a unicode string + */ +static inline wchar_t * +UniStrlwr(register wchar_t *upin) +{ + register wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniTolower(*up); + up++; + } + return upin; /* Return input pointer */ +} + +#endif + +#endif /* _CIFS_UNICODE_H */ diff --git a/kernel/fs/cifs/cifs_uniupr.h b/kernel/fs/cifs/cifs_uniupr.h new file mode 100644 index 000000000..0ac7c5a86 --- /dev/null +++ b/kernel/fs/cifs/cifs_uniupr.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) International Business Machines Corp., 2000,2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * uniupr.h - Unicode compressed case ranges + * +*/ + +#ifndef UNIUPR_NOUPPER +/* + * Latin upper case + */ +signed char CifsUniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 060-06f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 0e0-0ef */ + -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, /* 3a0-3af */ + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 3b0-3bf */ + -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64, + -63, -63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 430-43f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 440-44f */ + 0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, 0, -80, -80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112, 126, 126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* ff40-ff4f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, +}; + +/* + * Upper Case Range + */ +const struct UniCaseRange CifsUniUpperRange[] = { + {0x03a0, 0x03ce, UniCaseRangeU03a0}, + {0x0430, 0x045f, UniCaseRangeU0430}, + {0x0490, 0x04cc, UniCaseRangeU0490}, + {0x1e00, 0x1ffc, UniCaseRangeU1e00}, + {0xff40, 0xff5a, UniCaseRangeUff40}, + {0} +}; +#endif + +#ifndef UNIUPR_NOLOWER +/* + * Latin lower case + */ +signed char CifsUniLowerTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 040-04f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, /* 050-05f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 0c0-0cf */ + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 0, /* 0d0-0df */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */ + 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */ + 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, 0, /* 170-17f */ + 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, 0, /* 180-18f */ + 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */ + 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */ + 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */ +}; + +/* Lower case range - Greek */ +static signed char UniCaseRangeL0380[44] = { + 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */ + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 390-39f */ + 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, +}; + +/* Lower case range - Cyrillic */ +static signed char UniCaseRangeL0400[48] = { + 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 0, 80, 80, /* 400-40f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 410-41f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 420-42f */ +}; + +/* Lower case range - Extended cyrillic */ +static signed char UniCaseRangeL0490[60] = { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */ + 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, +}; + +/* Lower case range - Extended latin and greek */ +static signed char UniCaseRangeL1e00[504] = { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */ + 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, 0, 0, /* 1fc0-1fcf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Lower case range - Wide latin */ +static signed char UniCaseRangeLff20[27] = { + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* ff20-ff2f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, +}; + +/* + * Lower Case Range + */ +const struct UniCaseRange CifsUniLowerRange[] = { + {0x0380, 0x03ab, UniCaseRangeL0380}, + {0x0400, 0x042f, UniCaseRangeL0400}, + {0x0490, 0x04cb, UniCaseRangeL0490}, + {0x1e00, 0x1ff7, UniCaseRangeL1e00}, + {0xff20, 0xff3a, UniCaseRangeLff20}, + {0} +}; +#endif diff --git a/kernel/fs/cifs/cifsacl.c b/kernel/fs/cifs/cifsacl.c new file mode 100644 index 000000000..1ea780bc6 --- /dev/null +++ b/kernel/fs/cifs/cifsacl.c @@ -0,0 +1,1119 @@ +/* + * fs/cifs/cifsacl.c + * + * Copyright (C) International Business Machines Corp., 2007,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Contains the routines for mapping CIFS/NTFS ACLs + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsacl.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +/* security id for everyone/world system group */ +static const struct cifs_sid sid_everyone = { + 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; +/* security id for Authenticated Users system group */ +static const struct cifs_sid sid_authusers = { + 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; +/* group users */ +static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; + +static const struct cred *root_cred; + +static int +cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) +{ + char *payload; + + /* + * If the payload is less than or equal to the size of a pointer, then + * an allocation here is wasteful. Just copy the data directly to the + * payload.value union member instead. + * + * With this however, you must check the datalen before trying to + * dereference payload.data! + */ + if (prep->datalen <= sizeof(key->payload)) { + key->payload.value = 0; + memcpy(&key->payload.value, prep->data, prep->datalen); + key->datalen = prep->datalen; + return 0; + } + payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!payload) + return -ENOMEM; + + key->payload.data = payload; + key->datalen = prep->datalen; + return 0; +} + +static inline void +cifs_idmap_key_destroy(struct key *key) +{ + if (key->datalen > sizeof(key->payload)) + kfree(key->payload.data); +} + +static struct key_type cifs_idmap_key_type = { + .name = "cifs.idmap", + .instantiate = cifs_idmap_key_instantiate, + .destroy = cifs_idmap_key_destroy, + .describe = user_describe, +}; + +static char * +sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) +{ + int i, len; + unsigned int saval; + char *sidstr, *strptr; + unsigned long long id_auth_val; + + /* 3 bytes for prefix */ + sidstr = kmalloc(3 + SID_STRING_BASE_SIZE + + (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth), + GFP_KERNEL); + if (!sidstr) + return sidstr; + + strptr = sidstr; + len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g', + sidptr->revision); + strptr += len; + + /* The authority field is a single 48-bit number */ + id_auth_val = (unsigned long long)sidptr->authority[5]; + id_auth_val |= (unsigned long long)sidptr->authority[4] << 8; + id_auth_val |= (unsigned long long)sidptr->authority[3] << 16; + id_auth_val |= (unsigned long long)sidptr->authority[2] << 24; + id_auth_val |= (unsigned long long)sidptr->authority[1] << 32; + id_auth_val |= (unsigned long long)sidptr->authority[0] << 48; + + /* + * MS-DTYP states that if the authority is >= 2^32, then it should be + * expressed as a hex value. + */ + if (id_auth_val <= UINT_MAX) + len = sprintf(strptr, "-%llu", id_auth_val); + else + len = sprintf(strptr, "-0x%llx", id_auth_val); + + strptr += len; + + for (i = 0; i < sidptr->num_subauth; ++i) { + saval = le32_to_cpu(sidptr->sub_auth[i]); + len = sprintf(strptr, "-%u", saval); + strptr += len; + } + + return sidstr; +} + +/* + * if the two SIDs (roughly equivalent to a UUID for a user or group) are + * the same returns zero, if they do not match returns non-zero. + */ +static int +compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) +{ + int i; + int num_subauth, num_sat, num_saw; + + if ((!ctsid) || (!cwsid)) + return 1; + + /* compare the revision */ + if (ctsid->revision != cwsid->revision) { + if (ctsid->revision > cwsid->revision) + return 1; + else + return -1; + } + + /* compare all of the six auth values */ + for (i = 0; i < NUM_AUTHS; ++i) { + if (ctsid->authority[i] != cwsid->authority[i]) { + if (ctsid->authority[i] > cwsid->authority[i]) + return 1; + else + return -1; + } + } + + /* compare all of the subauth values if any */ + num_sat = ctsid->num_subauth; + num_saw = cwsid->num_subauth; + num_subauth = num_sat < num_saw ? num_sat : num_saw; + if (num_subauth) { + for (i = 0; i < num_subauth; ++i) { + if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { + if (le32_to_cpu(ctsid->sub_auth[i]) > + le32_to_cpu(cwsid->sub_auth[i])) + return 1; + else + return -1; + } + } + } + + return 0; /* sids compare/match */ +} + +static void +cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) +{ + int i; + + dst->revision = src->revision; + dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); + for (i = 0; i < NUM_AUTHS; ++i) + dst->authority[i] = src->authority[i]; + for (i = 0; i < dst->num_subauth; ++i) + dst->sub_auth[i] = src->sub_auth[i]; +} + +static int +id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) +{ + int rc; + struct key *sidkey; + struct cifs_sid *ksid; + unsigned int ksid_size; + char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */ + const struct cred *saved_cred; + + rc = snprintf(desc, sizeof(desc), "%ci:%u", + sidtype == SIDOWNER ? 'o' : 'g', cid); + if (rc >= sizeof(desc)) + return -EINVAL; + + rc = 0; + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, desc, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n", + __func__, sidtype == SIDOWNER ? 'u' : 'g', cid); + goto out_revert_creds; + } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { + rc = -EIO; + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", + __func__, sidkey->datalen); + goto invalidate_key; + } + + /* + * A sid is usually too large to be embedded in payload.value, but if + * there are no subauthorities and the host has 8-byte pointers, then + * it could be. + */ + ksid = sidkey->datalen <= sizeof(sidkey->payload) ? + (struct cifs_sid *)&sidkey->payload.value : + (struct cifs_sid *)sidkey->payload.data; + + ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); + if (ksid_size > sidkey->datalen) { + rc = -EIO; + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu, ksid_size=%u)\n", + __func__, sidkey->datalen, ksid_size); + goto invalidate_key; + } + + cifs_copy_sid(ssid, ksid); +out_key_put: + key_put(sidkey); +out_revert_creds: + revert_creds(saved_cred); + return rc; + +invalidate_key: + key_invalidate(sidkey); + goto out_key_put; +} + +static int +sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, + struct cifs_fattr *fattr, uint sidtype) +{ + int rc; + struct key *sidkey; + char *sidstr; + const struct cred *saved_cred; + kuid_t fuid = cifs_sb->mnt_uid; + kgid_t fgid = cifs_sb->mnt_gid; + + /* + * If we have too many subauthorities, then something is really wrong. + * Just return an error. + */ + if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) { + cifs_dbg(FYI, "%s: %u subauthorities is too many!\n", + __func__, psid->num_subauth); + return -EIO; + } + + sidstr = sid_to_key_str(psid, sidtype); + if (!sidstr) + return -ENOMEM; + + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", + __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g'); + goto out_revert_creds; + } + + /* + * FIXME: Here we assume that uid_t and gid_t are same size. It's + * probably a safe assumption but might be better to check based on + * sidtype. + */ + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); + if (sidkey->datalen != sizeof(uid_t)) { + rc = -EIO; + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", + __func__, sidkey->datalen); + key_invalidate(sidkey); + goto out_key_put; + } + + if (sidtype == SIDOWNER) { + kuid_t uid; + uid_t id; + memcpy(&id, &sidkey->payload.value, sizeof(uid_t)); + uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) + fuid = uid; + } else { + kgid_t gid; + gid_t id; + memcpy(&id, &sidkey->payload.value, sizeof(gid_t)); + gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) + fgid = gid; + } + +out_key_put: + key_put(sidkey); +out_revert_creds: + revert_creds(saved_cred); + kfree(sidstr); + + /* + * Note that we return 0 here unconditionally. If the mapping + * fails then we just fall back to using the mnt_uid/mnt_gid. + */ + if (sidtype == SIDOWNER) + fattr->cf_uid = fuid; + else + fattr->cf_gid = fgid; + return 0; +} + +int +init_cifs_idmap(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + cifs_dbg(FYI, "Registering the %s key type\n", + cifs_idmap_key_type.name); + + /* create an override credential set with a special thread keyring in + * which requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). + */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = keyring_alloc(".cifs_idmap", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = register_key_type(&cifs_idmap_key_type); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + root_cred = cred; + + cifs_dbg(FYI, "cifs idmap keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void +exit_cifs_idmap(void) +{ + key_revoke(root_cred->thread_keyring); + unregister_key_type(&cifs_idmap_key_type); + put_cred(root_cred); + cifs_dbg(FYI, "Unregistered %s key type\n", cifs_idmap_key_type.name); +} + +/* copy ntsd, owner sid, and group sid from a security descriptor to another */ +static void copy_sec_desc(const struct cifs_ntsd *pntsd, + struct cifs_ntsd *pnntsd, __u32 sidsoffset) +{ + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + + /* copy security descriptor control portion */ + pnntsd->revision = pntsd->revision; + pnntsd->type = pntsd->type; + pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd)); + pnntsd->sacloffset = 0; + pnntsd->osidoffset = cpu_to_le32(sidsoffset); + pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid)); + + /* copy owner sid */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); + cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); + + /* copy group sid */ + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + + sizeof(struct cifs_sid)); + cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); + + return; +} + + +/* + change posix mode to reflect permissions + pmode is the existing mode (we only want to overwrite part of this + bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007 +*/ +static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, + umode_t *pbits_to_set) +{ + __u32 flags = le32_to_cpu(ace_flags); + /* the order of ACEs is important. The canonical order is to begin with + DENY entries followed by ALLOW, otherwise an allow entry could be + encountered first, making the subsequent deny entry like "dead code" + which would be superflous since Windows stops when a match is made + for the operation you are trying to perform for your user */ + + /* For deny ACEs we change the mask so that subsequent allow access + control entries do not turn on the bits we are denying */ + if (type == ACCESS_DENIED) { + if (flags & GENERIC_ALL) + *pbits_to_set &= ~S_IRWXUGO; + + if ((flags & GENERIC_WRITE) || + ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS)) + *pbits_to_set &= ~S_IWUGO; + if ((flags & GENERIC_READ) || + ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS)) + *pbits_to_set &= ~S_IRUGO; + if ((flags & GENERIC_EXECUTE) || + ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) + *pbits_to_set &= ~S_IXUGO; + return; + } else if (type != ACCESS_ALLOWED) { + cifs_dbg(VFS, "unknown access control type %d\n", type); + return; + } + /* else ACCESS_ALLOWED type */ + + if (flags & GENERIC_ALL) { + *pmode |= (S_IRWXUGO & (*pbits_to_set)); + cifs_dbg(NOISY, "all perms\n"); + return; + } + if ((flags & GENERIC_WRITE) || + ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS)) + *pmode |= (S_IWUGO & (*pbits_to_set)); + if ((flags & GENERIC_READ) || + ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS)) + *pmode |= (S_IRUGO & (*pbits_to_set)); + if ((flags & GENERIC_EXECUTE) || + ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) + *pmode |= (S_IXUGO & (*pbits_to_set)); + + cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode); + return; +} + +/* + Generate access flags to reflect permissions mode is the existing mode. + This function is called for every ACE in the DACL whose SID matches + with either owner or group or everyone. +*/ + +static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, + __u32 *pace_flags) +{ + /* reset access mask */ + *pace_flags = 0x0; + + /* bits to use are either S_IRWXU or S_IRWXG or S_IRWXO */ + mode &= bits_to_use; + + /* check for R/W/X UGO since we do not know whose flags + is this but we have cleared all the bits sans RWX for + either user or group or other as per bits_to_use */ + if (mode & S_IRUGO) + *pace_flags |= SET_FILE_READ_RIGHTS; + if (mode & S_IWUGO) + *pace_flags |= SET_FILE_WRITE_RIGHTS; + if (mode & S_IXUGO) + *pace_flags |= SET_FILE_EXEC_RIGHTS; + + cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n", + mode, *pace_flags); + return; +} + +static __u16 fill_ace_for_sid(struct cifs_ace *pntace, + const struct cifs_sid *psid, __u64 nmode, umode_t bits) +{ + int i; + __u16 size = 0; + __u32 access_req = 0; + + pntace->type = ACCESS_ALLOWED; + pntace->flags = 0x0; + mode_to_access_flags(nmode, bits, &access_req); + if (!access_req) + access_req = SET_MINIMUM_RIGHTS; + pntace->access_req = cpu_to_le32(access_req); + + pntace->sid.revision = psid->revision; + pntace->sid.num_subauth = psid->num_subauth; + for (i = 0; i < NUM_AUTHS; i++) + pntace->sid.authority[i] = psid->authority[i]; + for (i = 0; i < psid->num_subauth; i++) + pntace->sid.sub_auth[i] = psid->sub_auth[i]; + + size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); + pntace->size = cpu_to_le16(size); + + return size; +} + + +#ifdef CONFIG_CIFS_DEBUG2 +static void dump_ace(struct cifs_ace *pace, char *end_of_acl) +{ + int num_subauth; + + /* validate that we do not go past end of acl */ + + if (le16_to_cpu(pace->size) < 16) { + cifs_dbg(VFS, "ACE too small %d\n", le16_to_cpu(pace->size)); + return; + } + + if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { + cifs_dbg(VFS, "ACL too small to parse ACE\n"); + return; + } + + num_subauth = pace->sid.num_subauth; + if (num_subauth) { + int i; + cifs_dbg(FYI, "ACE revision %d num_auth %d type %d flags %d size %d\n", + pace->sid.revision, pace->sid.num_subauth, pace->type, + pace->flags, le16_to_cpu(pace->size)); + for (i = 0; i < num_subauth; ++i) { + cifs_dbg(FYI, "ACE sub_auth[%d]: 0x%x\n", + i, le32_to_cpu(pace->sid.sub_auth[i])); + } + + /* BB add length check to make sure that we do not have huge + num auths and therefore go off the end */ + } + + return; +} +#endif + + +static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + struct cifs_fattr *fattr) +{ + int i; + int num_aces = 0; + int acl_size; + char *acl_base; + struct cifs_ace **ppace; + + /* BB need to add parm so we can store the SID BB */ + + if (!pdacl) { + /* no DACL in the security descriptor, set + all the permissions for user/group/other */ + fattr->cf_mode |= S_IRWXUGO; + return; + } + + /* validate that we do not go past end of acl */ + if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { + cifs_dbg(VFS, "ACL too small to parse DACL\n"); + return; + } + + cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n", + le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), + le32_to_cpu(pdacl->num_aces)); + + /* reset rwx permissions for user/group/other. + Also, if num_aces is 0 i.e. DACL has no ACEs, + user/group/other have no permissions */ + fattr->cf_mode &= ~(S_IRWXUGO); + + acl_base = (char *)pdacl; + acl_size = sizeof(struct cifs_acl); + + num_aces = le32_to_cpu(pdacl->num_aces); + if (num_aces > 0) { + umode_t user_mask = S_IRWXU; + umode_t group_mask = S_IRWXG; + umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; + + if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *)) + return; + ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), + GFP_KERNEL); + if (!ppace) + return; + + for (i = 0; i < num_aces; ++i) { + ppace[i] = (struct cifs_ace *) (acl_base + acl_size); +#ifdef CONFIG_CIFS_DEBUG2 + dump_ace(ppace[i], end_of_acl); +#endif + if (compare_sids(&(ppace[i]->sid), pownersid) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &user_mask); + if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &group_mask); + if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &other_mask); + if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &other_mask); + + +/* memcpy((void *)(&(cifscred->aces[i])), + (void *)ppace[i], + sizeof(struct cifs_ace)); */ + + acl_base = (char *)ppace[i]; + acl_size = le16_to_cpu(ppace[i]->size); + } + + kfree(ppace); + } + + return; +} + + +static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid, __u64 nmode) +{ + u16 size = 0; + struct cifs_acl *pnndacl; + + pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl)); + + size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size), + pownersid, nmode, S_IRWXU); + size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), + pgrpsid, nmode, S_IRWXG); + size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), + &sid_everyone, nmode, S_IRWXO); + + pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); + pndacl->num_aces = cpu_to_le32(3); + + return 0; +} + + +static int parse_sid(struct cifs_sid *psid, char *end_of_acl) +{ + /* BB need to add parm so we can store the SID BB */ + + /* validate that we do not go past end of ACL - sid must be at least 8 + bytes long (assuming no sub-auths - e.g. the null SID */ + if (end_of_acl < (char *)psid + 8) { + cifs_dbg(VFS, "ACL too small to parse SID %p\n", psid); + return -EINVAL; + } + +#ifdef CONFIG_CIFS_DEBUG2 + if (psid->num_subauth) { + int i; + cifs_dbg(FYI, "SID revision %d num_auth %d\n", + psid->revision, psid->num_subauth); + + for (i = 0; i < psid->num_subauth; i++) { + cifs_dbg(FYI, "SID sub_auth[%d]: 0x%x\n", + i, le32_to_cpu(psid->sub_auth[i])); + } + + /* BB add length check to make sure that we do not have huge + num auths and therefore go off the end */ + cifs_dbg(FYI, "RID 0x%x\n", + le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); + } +#endif + + return 0; +} + + +/* Convert CIFS ACL to POSIX form */ +static int parse_sec_desc(struct cifs_sb_info *cifs_sb, + struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr) +{ + int rc = 0; + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_acl *dacl_ptr; /* no need for SACL ptr */ + char *end_of_acl = ((char *)pntsd) + acl_len; + __u32 dacloffset; + + if (pntsd == NULL) + return -EIO; + + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n", + pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), + le32_to_cpu(pntsd->gsidoffset), + le32_to_cpu(pntsd->sacloffset), dacloffset); +/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ + rc = parse_sid(owner_sid_ptr, end_of_acl); + if (rc) { + cifs_dbg(FYI, "%s: Error %d parsing Owner SID\n", __func__, rc); + return rc; + } + rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER); + if (rc) { + cifs_dbg(FYI, "%s: Error %d mapping Owner SID to uid\n", + __func__, rc); + return rc; + } + + rc = parse_sid(group_sid_ptr, end_of_acl); + if (rc) { + cifs_dbg(FYI, "%s: Error %d mapping Owner SID to gid\n", + __func__, rc); + return rc; + } + rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP); + if (rc) { + cifs_dbg(FYI, "%s: Error %d mapping Group SID to gid\n", + __func__, rc); + return rc; + } + + if (dacloffset) + parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, + group_sid_ptr, fattr); + else + cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */ + + return rc; +} + +/* Convert permission bits from mode to equivalent CIFS ACL */ +static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, + __u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag) +{ + int rc = 0; + __u32 dacloffset; + __u32 ndacloffset; + __u32 sidsoffset; + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ + struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ + + if (nmode != NO_CHANGE_64) { /* chmod */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = dacl_ptr->revision; + ndacl_ptr->size = 0; + ndacl_ptr->num_aces = 0; + + rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + nmode); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy sec desc control portion & owner and group sids */ + copy_sec_desc(pntsd, pnntsd, sidsoffset); + *aclflag = CIFS_ACL_DACL; + } else { + memcpy(pnntsd, pntsd, secdesclen); + if (uid_valid(uid)) { /* chown */ + uid_t id; + owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->osidoffset)); + nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!nowner_sid_ptr) + return -ENOMEM; + id = from_kuid(&init_user_ns, uid); + rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); + if (rc) { + cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n", + __func__, rc, id); + kfree(nowner_sid_ptr); + return rc; + } + cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr); + kfree(nowner_sid_ptr); + *aclflag = CIFS_ACL_OWNER; + } + if (gid_valid(gid)) { /* chgrp */ + gid_t id; + group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->gsidoffset)); + ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!ngroup_sid_ptr) + return -ENOMEM; + id = from_kgid(&init_user_ns, gid); + rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); + if (rc) { + cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n", + __func__, rc, id); + kfree(ngroup_sid_ptr); + return rc; + } + cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr); + kfree(ngroup_sid_ptr); + *aclflag = CIFS_ACL_GROUP; + } + } + + return rc; +} + +struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, + const struct cifs_fid *cifsfid, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + unsigned int xid; + int rc; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + xid = get_xid(); + rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd, + pacllen); + free_xid(xid); + + cifs_put_tlink(tlink); + + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; +} + +static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, + const char *path, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + int oplock = 0; + unsigned int xid; + int rc, create_options = 0; + struct cifs_tcon *tcon; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; + + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + tcon = tlink_tcon(tlink); + xid = get_xid(); + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = READ_CONTROL; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (!rc) { + rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid.netfid); + } + + cifs_put_tlink(tlink); + free_xid(xid); + + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; +} + +/* Retrieve an ACL from the server */ +struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, + struct inode *inode, const char *path, + u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + struct cifsFileInfo *open_file = NULL; + + if (inode) + open_file = find_readable_file(CIFS_I(inode), true); + if (!open_file) + return get_cifs_acl_by_path(cifs_sb, path, pacllen); + + pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen); + cifsFileInfo_put(open_file); + return pntsd; +} + + /* Set an ACL on the server */ +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path, int aclflag) +{ + int oplock = 0; + unsigned int xid; + int rc, access_flags, create_options = 0; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + tcon = tlink_tcon(tlink); + xid = get_xid(); + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) + access_flags = WRITE_OWNER; + else + access_flags = WRITE_DAC; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = access_flags; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) { + cifs_dbg(VFS, "Unable to open file to set ACL\n"); + goto out; + } + + rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag); + cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc); + + CIFSSMBClose(xid, tcon, fid.netfid); +out: + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +} + +/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ +int +cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + struct inode *inode, const char *path, + const struct cifs_fid *pfid) +{ + struct cifs_ntsd *pntsd = NULL; + u32 acllen = 0; + int rc = 0; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_tcon *tcon; + + cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + if (pfid && (tcon->ses->server->ops->get_acl_by_fid)) + pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid, + &acllen); + else if (tcon->ses->server->ops->get_acl) + pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, + &acllen); + else { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); + } else { + rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr); + kfree(pntsd); + if (rc) + cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc); + } + + cifs_put_tlink(tlink); + + return rc; +} + +/* Convert mode bits to an ACL so we can update the ACL on the server */ +int +id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, + kuid_t uid, kgid_t gid) +{ + int rc = 0; + int aclflag = CIFS_ACL_DACL; /* default flag to set */ + __u32 secdesclen = 0; + struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ + struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_tcon *tcon; + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + cifs_dbg(NOISY, "set ACL from mode for %s\n", path); + + /* Get the security descriptor */ + + if (tcon->ses->server->ops->get_acl == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, + &secdesclen); + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); + cifs_put_tlink(tlink); + return rc; + } + + /* + * Add three ACEs for owner, group, everyone getting rid of other ACEs + * as chmod disables ACEs and set the security descriptor. Allocate + * memory for the smb header, set security descriptor request security + * descriptor parameters, and secuirty descriptor itself + */ + secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); + pnntsd = kmalloc(secdesclen, GFP_KERNEL); + if (!pnntsd) { + kfree(pntsd); + cifs_put_tlink(tlink); + return -ENOMEM; + } + + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, + &aclflag); + + cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); + + if (tcon->ses->server->ops->set_acl == NULL) + rc = -EOPNOTSUPP; + + if (!rc) { + /* Set the security descriptor */ + rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode, + path, aclflag); + cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); + } + cifs_put_tlink(tlink); + + kfree(pnntsd); + kfree(pntsd); + return rc; +} diff --git a/kernel/fs/cifs/cifsacl.h b/kernel/fs/cifs/cifsacl.h new file mode 100644 index 000000000..4f3884835 --- /dev/null +++ b/kernel/fs/cifs/cifsacl.h @@ -0,0 +1,101 @@ +/* + * fs/cifs/cifsacl.h + * + * Copyright (c) International Business Machines Corp., 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSACL_H +#define _CIFSACL_H + + +#define NUM_AUTHS (6) /* number of authority fields */ +#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ + +#define READ_BIT 0x4 +#define WRITE_BIT 0x2 +#define EXEC_BIT 0x1 + +#define UBITSHIFT 6 +#define GBITSHIFT 3 + +#define ACCESS_ALLOWED 0 +#define ACCESS_DENIED 1 + +#define SIDOWNER 1 +#define SIDGROUP 2 + +/* + * Security Descriptor length containing DACL with 3 ACEs (one each for + * owner, group and world). + */ +#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \ + sizeof(struct cifs_acl) + \ + (sizeof(struct cifs_ace) * 3)) + +/* + * Maximum size of a string representation of a SID: + * + * The fields are unsigned values in decimal. So: + * + * u8: max 3 bytes in decimal + * u32: max 10 bytes in decimal + * + * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator + * + * For authority field, max is when all 6 values are non-zero and it must be + * represented in hex. So "-0x" + 12 hex digits. + * + * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') + */ +#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1) +#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ + +struct cifs_ntsd { + __le16 revision; /* revision level */ + __le16 type; + __le32 osidoffset; + __le32 gsidoffset; + __le32 sacloffset; + __le32 dacloffset; +} __attribute__((packed)); + +struct cifs_sid { + __u8 revision; /* revision level */ + __u8 num_subauth; + __u8 authority[NUM_AUTHS]; + __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */ +} __attribute__((packed)); + +/* size of a struct cifs_sid, sans sub_auth array */ +#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS) + +struct cifs_acl { + __le16 revision; /* revision level */ + __le16 size; + __le32 num_aces; +} __attribute__((packed)); + +struct cifs_ace { + __u8 type; + __u8 flags; + __le16 size; + __le32 access_req; + struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ +} __attribute__((packed)); + +#endif /* _CIFSACL_H */ diff --git a/kernel/fs/cifs/cifsencrypt.c b/kernel/fs/cifs/cifsencrypt.c new file mode 100644 index 000000000..aa0dc2573 --- /dev/null +++ b/kernel/fs/cifs/cifsencrypt.c @@ -0,0 +1,818 @@ +/* + * fs/cifs/cifsencrypt.c + * + * Encryption and hashing operations relating to NTLM, NTLMv2. See MS-NLMP + * for more detailed information + * + * Copyright (C) International Business Machines Corp., 2005,2013 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_unicode.h" +#include "cifsproto.h" +#include "ntlmssp.h" +#include +#include +#include + +static int +cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) +{ + int rc; + unsigned int size; + + if (server->secmech.sdescmd5 != NULL) + return 0; /* already allocated */ + + server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(server->secmech.md5)) { + cifs_dbg(VFS, "could not allocate crypto md5\n"); + rc = PTR_ERR(server->secmech.md5); + server->secmech.md5 = NULL; + return rc; + } + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.md5); + server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdescmd5) { + crypto_free_shash(server->secmech.md5); + server->secmech.md5 = NULL; + return -ENOMEM; + } + server->secmech.sdescmd5->shash.tfm = server->secmech.md5; + server->secmech.sdescmd5->shash.flags = 0x0; + + return 0; +} + +/* + * Calculate and return the CIFS signature based on the mac key and SMB PDU. + * The 16 byte signature must be allocated by the caller. Note we only use the + * 1st eight bytes and that the smb header signature field on input contains + * the sequence number before this function is called. Also, this function + * should be called with the server->srv_mutex held. + */ +static int cifs_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server, char *signature) +{ + int i; + int rc; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; + + if (iov == NULL || signature == NULL || server == NULL) + return -EINVAL; + + if (!server->secmech.sdescmd5) { + rc = cifs_crypto_shash_md5_allocate(server); + if (rc) { + cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__); + return -1; + } + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init md5\n", __func__); + return rc; + } + + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); + return rc; + } + + for (i = 0; i < n_vec; i++) { + if (iov[i].iov_len == 0) + continue; + if (iov[i].iov_base == NULL) { + cifs_dbg(VFS, "null iovec entry\n"); + return -EIO; + } + /* The first entry includes a length field (which does not get + signed that occupies the first 4 bytes before the header */ + if (i == 0) { + if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + rc = + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); + } else { + rc = + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); + return rc; + } + } + + /* now hash over the rq_pages array */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + crypto_shash_update(&server->secmech.sdescmd5->shash, + p_iov.iov_base, p_iov.iov_len); + kunmap(rqst->rq_pages[i]); + } + + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + if (rc) + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); + + return rc; +} + +/* must be called with server->srv_mutex held */ +int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + int rc = 0; + char smb_signature[20]; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base; + + if ((cifs_pdu == NULL) || (server == NULL)) + return -EINVAL; + + if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || + server->tcpStatus == CifsNeedNegotiate) + return rc; + + if (!server->session_estab) { + memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + return rc; + } + + cifs_pdu->Signature.Sequence.SequenceNumber = + cpu_to_le32(server->sequence_number); + cifs_pdu->Signature.Sequence.Reserved = 0; + + *pexpected_response_sequence_number = ++server->sequence_number; + ++server->sequence_number; + + rc = cifs_calc_signature(rqst, server, smb_signature); + if (rc) + memset(cifs_pdu->Signature.SecuritySignature, 0, 8); + else + memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); + + return rc; +} + +int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence) +{ + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = n_vec }; + + return cifs_sign_rqst(&rqst, server, pexpected_response_sequence); +} + +/* must be called with server->srv_mutex held */ +int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + struct kvec iov; + + iov.iov_base = cifs_pdu; + iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4; + + return cifs_sign_smbv(&iov, 1, server, + pexpected_response_sequence_number); +} + +int cifs_verify_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server, + __u32 expected_sequence_number) +{ + unsigned int rc; + char server_response_sig[8]; + char what_we_think_sig_should_be[20]; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base; + + if (cifs_pdu == NULL || server == NULL) + return -EINVAL; + + if (!server->session_estab) + return 0; + + if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) { + struct smb_com_lock_req *pSMB = + (struct smb_com_lock_req *)cifs_pdu; + if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE) + return 0; + } + + /* BB what if signatures are supposed to be on for session but + server does not send one? BB */ + + /* Do not need to verify session setups with signature "BSRSPYL " */ + if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) + cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n", + cifs_pdu->Command); + + /* save off the origiginal signature so we can modify the smb and check + its signature against what the server sent */ + memcpy(server_response_sig, cifs_pdu->Signature.SecuritySignature, 8); + + cifs_pdu->Signature.Sequence.SequenceNumber = + cpu_to_le32(expected_sequence_number); + cifs_pdu->Signature.Sequence.Reserved = 0; + + mutex_lock(&server->srv_mutex); + rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be); + mutex_unlock(&server->srv_mutex); + + if (rc) + return rc; + +/* cifs_dump_mem("what we think it should be: ", + what_we_think_sig_should_be, 16); */ + + if (memcmp(server_response_sig, what_we_think_sig_should_be, 8)) + return -EACCES; + else + return 0; + +} + +/* first calculate 24 bytes ntlm response and then 16 byte session key */ +int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + int rc = 0; + unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; + char temp_key[CIFS_SESS_KEY_SIZE]; + + if (!ses) + return -EINVAL; + + ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL); + if (!ses->auth_key.response) + return -ENOMEM; + + ses->auth_key.len = temp_len; + + rc = SMBNTencrypt(ses->password, ses->server->cryptkey, + ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); + if (rc) { + cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n", + __func__, rc); + return rc; + } + + rc = E_md4hash(ses->password, temp_key, nls_cp); + if (rc) { + cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n", + __func__, rc); + return rc; + } + + rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); + if (rc) + cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n", + __func__, rc); + + return rc; +} + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, + char *lnm_session_key) +{ + int i; + int rc; + char password_with_pad[CIFS_ENCPWD_SIZE]; + + memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); + if (password) + strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); + + if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { + memcpy(lnm_session_key, password_with_pad, + CIFS_ENCPWD_SIZE); + return 0; + } + + /* calculate old style session key */ + /* calling toupper is less broken than repeatedly + calling nls_toupper would be since that will never + work for UTF8, but neither handles multibyte code pages + but the only alternative would be converting to UCS-16 (Unicode) + (using a routine something like UniStrupr) then + uppercasing and then converting back from Unicode - which + would only worth doing it if we knew it were utf8. Basically + utf8 and other multibyte codepages each need their own strupper + function since a byte at a time will ont work. */ + + for (i = 0; i < CIFS_ENCPWD_SIZE; i++) + password_with_pad[i] = toupper(password_with_pad[i]); + + rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key); + + return rc; +} +#endif /* CIFS_WEAK_PW_HASH */ + +/* Build a proper attribute value/target info pairs blob. + * Fill in netbios and dns domain name and workstation name + * and client time (total five av pairs and + one end of fields indicator. + * Allocate domain name which gets freed when session struct is deallocated. + */ +static int +build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + unsigned int dlen; + unsigned int size = 2 * sizeof(struct ntlmssp2_name); + char *defdmname = "WORKGROUP"; + unsigned char *blobptr; + struct ntlmssp2_name *attrptr; + + if (!ses->domainName) { + ses->domainName = kstrdup(defdmname, GFP_KERNEL); + if (!ses->domainName) + return -ENOMEM; + } + + dlen = strlen(ses->domainName); + + /* + * The length of this blob is two times the size of a + * structure (av pair) which holds name/size + * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) + + * unicode length of a netbios domain name + */ + ses->auth_key.len = size + 2 * dlen; + ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); + if (!ses->auth_key.response) { + ses->auth_key.len = 0; + return -ENOMEM; + } + + blobptr = ses->auth_key.response; + attrptr = (struct ntlmssp2_name *) blobptr; + + /* + * As defined in MS-NTLM 3.3.2, just this av pair field + * is sufficient as part of the temp + */ + attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); + attrptr->length = cpu_to_le16(2 * dlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp); + + return 0; +} + +/* Server has provided av pairs/target info in the type 2 challenge + * packet and we have plucked it and stored within smb session. + * We parse that blob here to find netbios domain name to be used + * as part of ntlmv2 authentication (in Target String), if not already + * specified on the command line. + * If this function returns without any error but without fetching + * domain name, authentication may fail against some server but + * may not fail against other (those who are not very particular + * about target string i.e. for some, just user name might suffice. + */ +static int +find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + unsigned int attrsize; + unsigned int type; + unsigned int onesize = sizeof(struct ntlmssp2_name); + unsigned char *blobptr; + unsigned char *blobend; + struct ntlmssp2_name *attrptr; + + if (!ses->auth_key.len || !ses->auth_key.response) + return 0; + + blobptr = ses->auth_key.response; + blobend = blobptr + ses->auth_key.len; + + while (blobptr + onesize < blobend) { + attrptr = (struct ntlmssp2_name *) blobptr; + type = le16_to_cpu(attrptr->type); + if (type == NTLMSSP_AV_EOL) + break; + blobptr += 2; /* advance attr type */ + attrsize = le16_to_cpu(attrptr->length); + blobptr += 2; /* advance attr size */ + if (blobptr + attrsize > blobend) + break; + if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { + if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN) + break; + if (!ses->domainName) { + ses->domainName = + kmalloc(attrsize + 1, GFP_KERNEL); + if (!ses->domainName) + return -ENOMEM; + cifs_from_utf16(ses->domainName, + (__le16 *)blobptr, attrsize, attrsize, + nls_cp, NO_MAP_UNI_RSVD); + break; + } + } + blobptr += attrsize; /* advance attr value */ + } + + return 0; +} + +static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, + const struct nls_table *nls_cp) +{ + int rc = 0; + int len; + char nt_hash[CIFS_NTHASH_SIZE]; + __le16 *user; + wchar_t *domain; + wchar_t *server; + + if (!ses->server->secmech.sdeschmacmd5) { + cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); + return -1; + } + + /* calculate md4 hash of password */ + E_md4hash(ses->password, nt_hash, nls_cp); + + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + CIFS_NTHASH_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__); + return rc; + } + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__); + return rc; + } + + /* convert ses->user_name to unicode */ + len = ses->user_name ? strlen(ses->user_name) : 0; + user = kmalloc(2 + (len * 2), GFP_KERNEL); + if (user == NULL) { + rc = -ENOMEM; + return rc; + } + + if (len) { + len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp); + UniStrupr(user); + } else { + memset(user, '\0', 2); + } + + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)user, 2 * len); + kfree(user); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with user\n", __func__); + return rc; + } + + /* convert ses->domainName to unicode and uppercase */ + if (ses->domainName) { + len = strlen(ses->domainName); + + domain = kmalloc(2 + (len * 2), GFP_KERNEL); + if (domain == NULL) { + rc = -ENOMEM; + return rc; + } + len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, + nls_cp); + rc = + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)domain, 2 * len); + kfree(domain); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with domain\n", + __func__); + return rc; + } + } else { + /* We use ses->serverName if no domain name available */ + len = strlen(ses->serverName); + + server = kmalloc(2 + (len * 2), GFP_KERNEL); + if (server == NULL) { + rc = -ENOMEM; + return rc; + } + len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len, + nls_cp); + rc = + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)server, 2 * len); + kfree(server); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with server\n", + __func__); + return rc; + } + } + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ntlmv2_hash); + if (rc) + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); + + return rc; +} + +static int +CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) +{ + int rc; + struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *) + (ses->auth_key.response + CIFS_SESS_KEY_SIZE); + unsigned int hash_len; + + /* The MD5 hash starts at challenge_key.key */ + hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + + offsetof(struct ntlmv2_resp, challenge.key[0])); + + if (!ses->server->secmech.sdeschmacmd5) { + cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); + return -1; + } + + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", + __func__); + return rc; + } + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__); + return rc; + } + + if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) + memcpy(ntlmv2->challenge.key, + ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + else + memcpy(ntlmv2->challenge.key, + ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ntlmv2->challenge.key, hash_len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); + return rc; + } + + /* Note that the MD5 digest over writes anon.challenge_key.key */ + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ntlmv2->ntlmv2_hash); + if (rc) + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); + + return rc; +} + +static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server) +{ + int rc; + unsigned int size; + + /* check if already allocated */ + if (server->secmech.sdeschmacmd5) + return 0; + + server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); + if (IS_ERR(server->secmech.hmacmd5)) { + cifs_dbg(VFS, "could not allocate crypto hmacmd5\n"); + rc = PTR_ERR(server->secmech.hmacmd5); + server->secmech.hmacmd5 = NULL; + return rc; + } + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.hmacmd5); + server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdeschmacmd5) { + crypto_free_shash(server->secmech.hmacmd5); + server->secmech.hmacmd5 = NULL; + return -ENOMEM; + } + server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5; + server->secmech.sdeschmacmd5->shash.flags = 0x0; + + return 0; +} + +int +setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + int rc; + int baselen; + unsigned int tilen; + struct ntlmv2_resp *ntlmv2; + char ntlmv2_hash[16]; + unsigned char *tiblob = NULL; /* target info blob */ + + if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) { + if (!ses->domainName) { + rc = find_domain_name(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "error %d finding domain name\n", + rc); + goto setup_ntlmv2_rsp_ret; + } + } + } else { + rc = build_avpair_blob(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "error %d building av pair blob\n", rc); + goto setup_ntlmv2_rsp_ret; + } + } + + baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); + tilen = ses->auth_key.len; + tiblob = ses->auth_key.response; + + ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); + if (!ses->auth_key.response) { + rc = ENOMEM; + ses->auth_key.len = 0; + goto setup_ntlmv2_rsp_ret; + } + ses->auth_key.len += baselen; + + ntlmv2 = (struct ntlmv2_resp *) + (ses->auth_key.response + CIFS_SESS_KEY_SIZE); + ntlmv2->blob_signature = cpu_to_le32(0x00000101); + ntlmv2->reserved = 0; + /* Must be within 5 minutes of the server */ + ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal)); + ntlmv2->reserved2 = 0; + + memcpy(ses->auth_key.response + baselen, tiblob, tilen); + + rc = crypto_hmacmd5_alloc(ses->server); + if (rc) { + cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc); + goto setup_ntlmv2_rsp_ret; + } + + /* calculate ntlmv2_hash */ + rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); + if (rc) { + cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc); + goto setup_ntlmv2_rsp_ret; + } + + /* calculate first part of the client response (CR1) */ + rc = CalcNTLMv2_response(ses, ntlmv2_hash); + if (rc) { + cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc); + goto setup_ntlmv2_rsp_ret; + } + + /* now calculate the session key for NTLMv2 */ + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", + __func__); + goto setup_ntlmv2_rsp_ret; + } + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); + goto setup_ntlmv2_rsp_ret; + } + + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ntlmv2->ntlmv2_hash, + CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); + goto setup_ntlmv2_rsp_ret; + } + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response); + if (rc) + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); + +setup_ntlmv2_rsp_ret: + kfree(tiblob); + + return rc; +} + +int +calc_seckey(struct cifs_ses *ses) +{ + int rc; + struct crypto_blkcipher *tfm_arc4; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ + + get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); + + tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_arc4)) { + rc = PTR_ERR(tfm_arc4); + cifs_dbg(VFS, "could not allocate crypto API arc4\n"); + return rc; + } + + desc.tfm = tfm_arc4; + + rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + CIFS_SESS_KEY_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set response as a key\n", + __func__); + return rc; + } + + sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); + sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); + if (rc) { + cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc); + crypto_free_blkcipher(tfm_arc4); + return rc; + } + + /* make secondary_key/nonce as session key */ + memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE); + /* and make len as that of session key only */ + ses->auth_key.len = CIFS_SESS_KEY_SIZE; + + crypto_free_blkcipher(tfm_arc4); + + return rc; +} + +void +cifs_crypto_shash_release(struct TCP_Server_Info *server) +{ + if (server->secmech.cmacaes) { + crypto_free_shash(server->secmech.cmacaes); + server->secmech.cmacaes = NULL; + } + + if (server->secmech.hmacsha256) { + crypto_free_shash(server->secmech.hmacsha256); + server->secmech.hmacsha256 = NULL; + } + + if (server->secmech.md5) { + crypto_free_shash(server->secmech.md5); + server->secmech.md5 = NULL; + } + + if (server->secmech.hmacmd5) { + crypto_free_shash(server->secmech.hmacmd5); + server->secmech.hmacmd5 = NULL; + } + + kfree(server->secmech.sdesccmacaes); + server->secmech.sdesccmacaes = NULL; + kfree(server->secmech.sdeschmacsha256); + server->secmech.sdeschmacsha256 = NULL; + kfree(server->secmech.sdeschmacmd5); + server->secmech.sdeschmacmd5 = NULL; + kfree(server->secmech.sdescmd5); + server->secmech.sdescmd5 = NULL; +} diff --git a/kernel/fs/cifs/cifsfs.c b/kernel/fs/cifs/cifsfs.c new file mode 100644 index 000000000..0a9fb6b53 --- /dev/null +++ b/kernel/fs/cifs/cifsfs.c @@ -0,0 +1,1308 @@ +/* + * fs/cifs/cifsfs.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Common Internet FileSystem (CIFS) client + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Note that BB means BUGBUG (ie something to fix eventually) */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#define DECLARE_GLOBALS_HERE +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include +#include +#include "cifs_spnego.h" +#include "fscache.h" +#ifdef CONFIG_CIFS_SMB2 +#include "smb2pdu.h" +#endif + +int cifsFYI = 0; +int traceSMB = 0; +bool enable_oplocks = true; +unsigned int linuxExtEnabled = 1; +unsigned int lookupCacheEnabled = 1; +unsigned int global_secflags = CIFSSEC_DEF; +/* unsigned int ntlmv2_support = 0; */ +unsigned int sign_CIFS_PDUs = 1; +static const struct super_operations cifs_super_ops; +unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; +module_param(CIFSMaxBufSize, uint, 0); +MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " + "Default: 16384 Range: 8192 to 130048"); +unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; +module_param(cifs_min_rcv, uint, 0); +MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " + "1 to 64"); +unsigned int cifs_min_small = 30; +module_param(cifs_min_small, uint, 0); +MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " + "Range: 2 to 256"); +unsigned int cifs_max_pending = CIFS_MAX_REQ; +module_param(cifs_max_pending, uint, 0444); +MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " + "Default: 32767 Range: 2 to 32767."); +module_param(enable_oplocks, bool, 0644); +MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); + +extern mempool_t *cifs_sm_req_poolp; +extern mempool_t *cifs_req_poolp; +extern mempool_t *cifs_mid_poolp; + +struct workqueue_struct *cifsiod_wq; + +/* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. + */ +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + +static int +cifs_read_super(struct super_block *sb) +{ + struct inode *inode; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + int rc = 0; + + cifs_sb = CIFS_SB(sb); + tcon = cifs_sb_master_tcon(cifs_sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) + sb->s_flags |= MS_POSIXACL; + + if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; + + /* BB FIXME fix time_gran to be larger for LANMAN sessions */ + sb->s_time_gran = 100; + + sb->s_magic = CIFS_MAGIC_NUMBER; + sb->s_op = &cifs_super_ops; + sb->s_bdi = &cifs_sb->bdi; + sb->s_blocksize = CIFS_MAX_MSGSIZE; + sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ + inode = cifs_root_iget(sb); + + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + goto out_no_root; + } + + if (tcon->nocase) + sb->s_d_op = &cifs_ci_dentry_ops; + else + sb->s_d_op = &cifs_dentry_ops; + + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + rc = -ENOMEM; + goto out_no_root; + } + +#ifdef CONFIG_CIFS_NFSD_EXPORT + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifs_dbg(FYI, "export ops supported\n"); + sb->s_export_op = &cifs_export_ops; + } +#endif /* CONFIG_CIFS_NFSD_EXPORT */ + + return 0; + +out_no_root: + cifs_dbg(VFS, "%s: get root inode failed\n", __func__); + return rc; +} + +static void cifs_kill_sb(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + kill_anon_super(sb); + cifs_umount(cifs_sb); +} + +static int +cifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int xid; + int rc = 0; + + xid = get_xid(); + + /* + * PATH_MAX may be too long - it would presumably be total path, + * but note that some servers (includinng Samba 3) have a shorter + * maximum path. + * + * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO. + */ + buf->f_namelen = PATH_MAX; + buf->f_files = 0; /* undefined */ + buf->f_ffree = 0; /* unlimited */ + + if (server->ops->queryfs) + rc = server->ops->queryfs(xid, tcon, buf); + + free_xid(xid); + return 0; +} + +static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) +{ + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct TCP_Server_Info *server = tcon->ses->server; + + if (server->ops->fallocate) + return server->ops->fallocate(file, tcon, mode, off, len); + + return -EOPNOTSUPP; +} + +static int cifs_permission(struct inode *inode, int mask) +{ + struct cifs_sb_info *cifs_sb; + + cifs_sb = CIFS_SB(inode->i_sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { + if ((mask & MAY_EXEC) && !execute_ok(inode)) + return -EACCES; + else + return 0; + } else /* file mode might have been restricted at mount time + on the client (above and beyond ACL on servers) for + servers which do not support setting and viewing mode bits, + so allowing client to check permissions is useful */ + return generic_permission(inode, mask); +} + +static struct kmem_cache *cifs_inode_cachep; +static struct kmem_cache *cifs_req_cachep; +static struct kmem_cache *cifs_mid_cachep; +static struct kmem_cache *cifs_sm_req_cachep; +mempool_t *cifs_sm_req_poolp; +mempool_t *cifs_req_poolp; +mempool_t *cifs_mid_poolp; + +static struct inode * +cifs_alloc_inode(struct super_block *sb) +{ + struct cifsInodeInfo *cifs_inode; + cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL); + if (!cifs_inode) + return NULL; + cifs_inode->cifsAttrs = 0x20; /* default */ + cifs_inode->time = 0; + /* + * Until the file is open and we have gotten oplock info back from the + * server, can not assume caching of file data or metadata. + */ + cifs_set_oplock_level(cifs_inode, 0); + cifs_inode->flags = 0; + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; + cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ + cifs_inode->server_eof = 0; + cifs_inode->uniqueid = 0; + cifs_inode->createtime = 0; + cifs_inode->epoch = 0; +#ifdef CONFIG_CIFS_SMB2 + get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); +#endif + /* + * Can not set i_flags here - they get immediately overwritten to zero + * by the VFS. + */ + /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ + INIT_LIST_HEAD(&cifs_inode->openFileList); + INIT_LIST_HEAD(&cifs_inode->llist); + return &cifs_inode->vfs_inode; +} + +static void cifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); +} + +static void +cifs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, cifs_i_callback); +} + +static void +cifs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + cifs_fscache_release_inode_cookie(inode); +} + +static void +cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) +{ + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; + + seq_puts(s, ",addr="); + + switch (server->dstaddr.ss_family) { + case AF_INET: + seq_printf(s, "%pI4", &sa->sin_addr.s_addr); + break; + case AF_INET6: + seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr); + if (sa6->sin6_scope_id) + seq_printf(s, "%%%u", sa6->sin6_scope_id); + break; + default: + seq_puts(s, "(unknown)"); + } +} + +static void +cifs_show_security(struct seq_file *s, struct cifs_ses *ses) +{ + if (ses->sectype == Unspecified) + return; + + seq_puts(s, ",sec="); + + switch (ses->sectype) { + case LANMAN: + seq_puts(s, "lanman"); + break; + case NTLMv2: + seq_puts(s, "ntlmv2"); + break; + case NTLM: + seq_puts(s, "ntlm"); + break; + case Kerberos: + seq_puts(s, "krb5"); + break; + case RawNTLMSSP: + seq_puts(s, "ntlmssp"); + break; + default: + /* shouldn't ever happen */ + seq_puts(s, "unknown"); + break; + } + + if (ses->sign) + seq_puts(s, "i"); +} + +static void +cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) +{ + seq_puts(s, ",cache="); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + seq_puts(s, "strict"); + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) + seq_puts(s, "none"); + else + seq_puts(s, "loose"); +} + +static void +cifs_show_nls(struct seq_file *s, struct nls_table *cur) +{ + struct nls_table *def; + + /* Display iocharset= option if it's not default charset */ + def = load_nls_default(); + if (def != cur) + seq_printf(s, ",iocharset=%s", cur->charset); + unload_nls(def); +} + +/* + * cifs_show_options() is for displaying mount options in /proc/mounts. + * Not all settable options are displayed but most of the important + * ones are. + */ +static int +cifs_show_options(struct seq_file *s, struct dentry *root) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct sockaddr *srcaddr; + srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; + + seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); + cifs_show_security(s, tcon->ses); + cifs_show_cache_flavor(s, cifs_sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) + seq_puts(s, ",multiuser"); + else if (tcon->ses->user_name) + seq_printf(s, ",username=%s", tcon->ses->user_name); + + if (tcon->ses->domainName) + seq_printf(s, ",domain=%s", tcon->ses->domainName); + + if (srcaddr->sa_family != AF_UNSPEC) { + struct sockaddr_in *saddr4; + struct sockaddr_in6 *saddr6; + saddr4 = (struct sockaddr_in *)srcaddr; + saddr6 = (struct sockaddr_in6 *)srcaddr; + if (srcaddr->sa_family == AF_INET6) + seq_printf(s, ",srcaddr=%pI6c", + &saddr6->sin6_addr); + else if (srcaddr->sa_family == AF_INET) + seq_printf(s, ",srcaddr=%pI4", + &saddr4->sin_addr.s_addr); + else + seq_printf(s, ",srcaddr=BAD-AF:%i", + (int)(srcaddr->sa_family)); + } + + seq_printf(s, ",uid=%u", + from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid)); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + seq_puts(s, ",forceuid"); + else + seq_puts(s, ",noforceuid"); + + seq_printf(s, ",gid=%u", + from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid)); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + seq_puts(s, ",forcegid"); + else + seq_puts(s, ",noforcegid"); + + cifs_show_address(s, tcon->ses->server); + + if (!tcon->unix_ext) + seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho", + cifs_sb->mnt_file_mode, + cifs_sb->mnt_dir_mode); + + cifs_show_nls(s, cifs_sb->local_nls); + + if (tcon->seal) + seq_puts(s, ",seal"); + if (tcon->nocase) + seq_puts(s, ",nocase"); + if (tcon->retry) + seq_puts(s, ",hard"); + if (tcon->unix_ext) + seq_puts(s, ",unix"); + else + seq_puts(s, ",nounix"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + seq_puts(s, ",posixpaths"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) + seq_puts(s, ",setuids"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + seq_puts(s, ",serverino"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + seq_puts(s, ",rwpidforward"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) + seq_puts(s, ",forcemand"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + seq_puts(s, ",nouser_xattr"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + seq_puts(s, ",mapchars"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + seq_puts(s, ",mapposix"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + seq_puts(s, ",sfu"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + seq_puts(s, ",nobrl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + seq_puts(s, ",cifsacl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + seq_puts(s, ",dynperm"); + if (root->d_sb->s_flags & MS_POSIXACL) + seq_puts(s, ",acl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + seq_puts(s, ",mfsymlinks"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + seq_puts(s, ",fsc"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + seq_puts(s, ",nostrictsync"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + seq_puts(s, ",noperm"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) + seq_printf(s, ",backupuid=%u", + from_kuid_munged(&init_user_ns, + cifs_sb->mnt_backupuid)); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) + seq_printf(s, ",backupgid=%u", + from_kgid_munged(&init_user_ns, + cifs_sb->mnt_backupgid)); + + seq_printf(s, ",rsize=%u", cifs_sb->rsize); + seq_printf(s, ",wsize=%u", cifs_sb->wsize); + /* convert actimeo and display it in seconds */ + seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); + + return 0; +} + +static void cifs_umount_begin(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon; + + if (cifs_sb == NULL) + return; + + tcon = cifs_sb_master_tcon(cifs_sb); + + spin_lock(&cifs_tcp_ses_lock); + if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { + /* we have other mounts to same share or we have + already tried to force umount this and woken up + all waiting network requests, nothing to do */ + spin_unlock(&cifs_tcp_ses_lock); + return; + } else if (tcon->tc_count == 1) + tcon->tidStatus = CifsExiting; + spin_unlock(&cifs_tcp_ses_lock); + + /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ + /* cancel_notify_requests(tcon); */ + if (tcon->ses && tcon->ses->server) { + cifs_dbg(FYI, "wake up tasks now - umount begin not complete\n"); + wake_up_all(&tcon->ses->server->request_q); + wake_up_all(&tcon->ses->server->response_q); + msleep(1); /* yield */ + /* we have to kick the requests once more */ + wake_up_all(&tcon->ses->server->response_q); + msleep(1); + } + + return; +} + +#ifdef CONFIG_CIFS_STATS2 +static int cifs_show_stats(struct seq_file *s, struct dentry *root) +{ + /* BB FIXME */ + return 0; +} +#endif + +static int cifs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_NODIRATIME; + return 0; +} + +static int cifs_drop_inode(struct inode *inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + /* no serverino => unconditional eviction */ + return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || + generic_drop_inode(inode); +} + +static const struct super_operations cifs_super_ops = { + .statfs = cifs_statfs, + .alloc_inode = cifs_alloc_inode, + .destroy_inode = cifs_destroy_inode, + .drop_inode = cifs_drop_inode, + .evict_inode = cifs_evict_inode, +/* .delete_inode = cifs_delete_inode, */ /* Do not need above + function unless later we add lazy close of inodes or unless the + kernel forgets to call us with the same number of releases (closes) + as opens */ + .show_options = cifs_show_options, + .umount_begin = cifs_umount_begin, + .remount_fs = cifs_remount, +#ifdef CONFIG_CIFS_STATS2 + .show_stats = cifs_show_stats, +#endif +}; + +/* + * Get root dentry from superblock according to prefix path mount option. + * Return dentry with refcount + 1 on success and NULL otherwise. + */ +static struct dentry * +cifs_get_root(struct smb_vol *vol, struct super_block *sb) +{ + struct dentry *dentry; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + char *full_path = NULL; + char *s, *p; + char sep; + + full_path = cifs_build_path_to_root(vol, cifs_sb, + cifs_sb_master_tcon(cifs_sb)); + if (full_path == NULL) + return ERR_PTR(-ENOMEM); + + cifs_dbg(FYI, "Get root dentry for %s\n", full_path); + + sep = CIFS_DIR_SEP(cifs_sb); + dentry = dget(sb->s_root); + p = s = full_path; + + do { + struct inode *dir = d_inode(dentry); + struct dentry *child; + + if (!dir) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + break; + } + if (!S_ISDIR(dir->i_mode)) { + dput(dentry); + dentry = ERR_PTR(-ENOTDIR); + break; + } + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + mutex_lock(&dir->i_mutex); + child = lookup_one_len(p, dentry, s - p); + mutex_unlock(&dir->i_mutex); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); + kfree(full_path); + return dentry; +} + +static int cifs_set_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = data; + sb->s_fs_info = mnt_data->cifs_sb; + return set_anon_super(sb, NULL); +} + +static struct dentry * +cifs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int rc; + struct super_block *sb; + struct cifs_sb_info *cifs_sb; + struct smb_vol *volume_info; + struct cifs_mnt_data mnt_data; + struct dentry *root; + + cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags); + + volume_info = cifs_get_volume_info((char *)data, dev_name); + if (IS_ERR(volume_info)) + return ERR_CAST(volume_info); + + cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); + if (cifs_sb == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_nls; + } + + cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); + if (cifs_sb->mountdata == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_cifs_sb; + } + + cifs_setup_cifs_sb(volume_info, cifs_sb); + + rc = cifs_mount(cifs_sb, volume_info); + if (rc) { + if (!(flags & MS_SILENT)) + cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", + rc); + root = ERR_PTR(rc); + goto out_mountdata; + } + + mnt_data.vol = volume_info; + mnt_data.cifs_sb = cifs_sb; + mnt_data.flags = flags; + + /* BB should we make this contingent on mount parm? */ + flags |= MS_NODIRATIME | MS_NOATIME; + + sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); + if (IS_ERR(sb)) { + root = ERR_CAST(sb); + cifs_umount(cifs_sb); + goto out; + } + + if (sb->s_root) { + cifs_dbg(FYI, "Use existing superblock\n"); + cifs_umount(cifs_sb); + } else { + rc = cifs_read_super(sb); + if (rc) { + root = ERR_PTR(rc); + goto out_super; + } + + sb->s_flags |= MS_ACTIVE; + } + + root = cifs_get_root(volume_info, sb); + if (IS_ERR(root)) + goto out_super; + + cifs_dbg(FYI, "dentry root is: %p\n", root); + goto out; + +out_super: + deactivate_locked_super(sb); +out: + cifs_cleanup_volume_info(volume_info); + return root; + +out_mountdata: + kfree(cifs_sb->mountdata); +out_cifs_sb: + kfree(cifs_sb); +out_nls: + unload_nls(volume_info->local_nls); + goto out; +} + +static ssize_t +cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t rc; + struct inode *inode = file_inode(iocb->ki_filp); + + rc = cifs_revalidate_mapping(inode); + if (rc) + return rc; + + return generic_file_read_iter(iocb, iter); +} + +static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); + ssize_t written; + int rc; + + written = cifs_get_writer(cinode); + if (written) + return written; + + written = generic_file_write_iter(iocb, from); + + if (CIFS_CACHE_WRITE(CIFS_I(inode))) + goto out; + + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n", + rc, inode); + +out: + cifs_put_writer(cinode); + return written; +} + +static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) +{ + /* + * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate + * the cached file length + */ + if (whence != SEEK_SET && whence != SEEK_CUR) { + int rc; + struct inode *inode = file_inode(file); + + /* + * We need to be sure that all dirty pages are written and the + * server has the newest file length. + */ + if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping && + inode->i_mapping->nrpages != 0) { + rc = filemap_fdatawait(inode->i_mapping); + if (rc) { + mapping_set_error(inode->i_mapping, rc); + return rc; + } + } + /* + * Some applications poll for the file length in this strange + * way so we must seek to end on non-oplocked files by + * setting the revalidate time to zero. + */ + CIFS_I(inode)->time = 0; + + rc = cifs_revalidate_file_attr(file); + if (rc < 0) + return (loff_t)rc; + } + return generic_file_llseek(file, offset, whence); +} + +static int +cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv) +{ + /* + * Note that this is called by vfs setlease with i_lock held to + * protect *lease from going away. + */ + struct inode *inode = file_inode(file); + struct cifsFileInfo *cfile = file->private_data; + + if (!(S_ISREG(inode->i_mode))) + return -EINVAL; + + /* Check if file is oplocked if this is request for new lease */ + if (arg == F_UNLCK || + ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || + ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) + return generic_setlease(file, arg, lease, priv); + else if (tlink_tcon(cfile->tlink)->local_lease && + !CIFS_CACHE_READ(CIFS_I(inode))) + /* + * If the server claims to support oplock on this file, then we + * still need to check oplock even if the local_lease mount + * option is set, but there are servers which do not support + * oplock for which this mount option may be useful if the user + * knows that the file won't be changed on the server by anyone + * else. + */ + return generic_setlease(file, arg, lease, priv); + else + return -EAGAIN; +} + +struct file_system_type cifs_fs_type = { + .owner = THIS_MODULE, + .name = "cifs", + .mount = cifs_do_mount, + .kill_sb = cifs_kill_sb, + /* .fs_flags */ +}; +MODULE_ALIAS_FS("cifs"); +const struct inode_operations cifs_dir_inode_ops = { + .create = cifs_create, + .atomic_open = cifs_atomic_open, + .lookup = cifs_lookup, + .getattr = cifs_getattr, + .unlink = cifs_unlink, + .link = cifs_hardlink, + .mkdir = cifs_mkdir, + .rmdir = cifs_rmdir, + .rename2 = cifs_rename2, + .permission = cifs_permission, +/* revalidate:cifs_revalidate, */ + .setattr = cifs_setattr, + .symlink = cifs_symlink, + .mknod = cifs_mknod, +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct inode_operations cifs_file_inode_ops = { +/* revalidate:cifs_revalidate, */ + .setattr = cifs_setattr, + .getattr = cifs_getattr, /* do we need this anymore? */ + .permission = cifs_permission, +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct inode_operations cifs_symlink_inode_ops = { + .readlink = generic_readlink, + .follow_link = cifs_follow_link, + .put_link = kfree_put_link, + .permission = cifs_permission, + /* BB add the following two eventually */ + /* revalidate: cifs_revalidate, + setattr: cifs_notify_change, *//* BB do we need notify change */ +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct file_operations cifs_file_ops = { + .read_iter = cifs_loose_read_iter, + .write_iter = cifs_file_write_iter, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, + .fallocate = cifs_fallocate, +}; + +const struct file_operations cifs_file_strict_ops = { + .read_iter = cifs_strict_readv, + .write_iter = cifs_strict_writev, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, + .fallocate = cifs_fallocate, +}; + +const struct file_operations cifs_file_direct_ops = { + /* BB reevaluate whether they can be done with directio, no cache */ + .read_iter = cifs_user_readv, + .write_iter = cifs_user_writev, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .llseek = cifs_llseek, + .setlease = cifs_setlease, + .fallocate = cifs_fallocate, +}; + +const struct file_operations cifs_file_nobrl_ops = { + .read_iter = cifs_loose_read_iter, + .write_iter = cifs_file_write_iter, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, + .fallocate = cifs_fallocate, +}; + +const struct file_operations cifs_file_strict_nobrl_ops = { + .read_iter = cifs_strict_readv, + .write_iter = cifs_strict_writev, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, + .fallocate = cifs_fallocate, +}; + +const struct file_operations cifs_file_direct_nobrl_ops = { + /* BB reevaluate whether they can be done with directio, no cache */ + .read_iter = cifs_user_readv, + .write_iter = cifs_user_writev, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .llseek = cifs_llseek, + .setlease = cifs_setlease, + .fallocate = cifs_fallocate, +}; + +const struct file_operations cifs_dir_ops = { + .iterate = cifs_readdir, + .release = cifs_closedir, + .read = generic_read_dir, + .unlocked_ioctl = cifs_ioctl, + .llseek = generic_file_llseek, +}; + +static void +cifs_init_once(void *inode) +{ + struct cifsInodeInfo *cifsi = inode; + + inode_init_once(&cifsi->vfs_inode); + init_rwsem(&cifsi->lock_sem); +} + +static int __init +cifs_init_inodecache(void) +{ + cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", + sizeof(struct cifsInodeInfo), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + cifs_init_once); + if (cifs_inode_cachep == NULL) + return -ENOMEM; + + return 0; +} + +static void +cifs_destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(cifs_inode_cachep); +} + +static int +cifs_init_request_bufs(void) +{ + size_t max_hdr_size = MAX_CIFS_HDR_SIZE; +#ifdef CONFIG_CIFS_SMB2 + /* + * SMB2 maximum header size is bigger than CIFS one - no problems to + * allocate some more bytes for CIFS. + */ + max_hdr_size = MAX_SMB2_HDR_SIZE; +#endif + if (CIFSMaxBufSize < 8192) { + /* Buffer size can not be smaller than 2 * PATH_MAX since maximum + Unicode path name has to fit in any SMB/CIFS path based frames */ + CIFSMaxBufSize = 8192; + } else if (CIFSMaxBufSize > 1024*127) { + CIFSMaxBufSize = 1024 * 127; + } else { + CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ + } +/* + cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n", + CIFSMaxBufSize, CIFSMaxBufSize); +*/ + cifs_req_cachep = kmem_cache_create("cifs_request", + CIFSMaxBufSize + max_hdr_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (cifs_req_cachep == NULL) + return -ENOMEM; + + if (cifs_min_rcv < 1) + cifs_min_rcv = 1; + else if (cifs_min_rcv > 64) { + cifs_min_rcv = 64; + cifs_dbg(VFS, "cifs_min_rcv set to maximum (64)\n"); + } + + cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, + cifs_req_cachep); + + if (cifs_req_poolp == NULL) { + kmem_cache_destroy(cifs_req_cachep); + return -ENOMEM; + } + /* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and + almost all handle based requests (but not write response, nor is it + sufficient for path based requests). A smaller size would have + been more efficient (compacting multiple slab items on one 4k page) + for the case in which debug was on, but this larger size allows + more SMBs to use small buffer alloc and is still much more + efficient to alloc 1 per page off the slab compared to 17K (5page) + alloc of large cifs buffers even when page debugging is on */ + cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq", + MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN, + NULL); + if (cifs_sm_req_cachep == NULL) { + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + return -ENOMEM; + } + + if (cifs_min_small < 2) + cifs_min_small = 2; + else if (cifs_min_small > 256) { + cifs_min_small = 256; + cifs_dbg(FYI, "cifs_min_small set to maximum (256)\n"); + } + + cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, + cifs_sm_req_cachep); + + if (cifs_sm_req_poolp == NULL) { + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + kmem_cache_destroy(cifs_sm_req_cachep); + return -ENOMEM; + } + + return 0; +} + +static void +cifs_destroy_request_bufs(void) +{ + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + mempool_destroy(cifs_sm_req_poolp); + kmem_cache_destroy(cifs_sm_req_cachep); +} + +static int +cifs_init_mids(void) +{ + cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids", + sizeof(struct mid_q_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (cifs_mid_cachep == NULL) + return -ENOMEM; + + /* 3 is a reasonable minimum number of simultaneous operations */ + cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep); + if (cifs_mid_poolp == NULL) { + kmem_cache_destroy(cifs_mid_cachep); + return -ENOMEM; + } + + return 0; +} + +static void +cifs_destroy_mids(void) +{ + mempool_destroy(cifs_mid_poolp); + kmem_cache_destroy(cifs_mid_cachep); +} + +static int __init +init_cifs(void) +{ + int rc = 0; + cifs_proc_init(); + INIT_LIST_HEAD(&cifs_tcp_ses_list); +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ + INIT_LIST_HEAD(&GlobalDnotifyReqList); + INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ +/* + * Initialize Global counters + */ + atomic_set(&sesInfoAllocCount, 0); + atomic_set(&tconInfoAllocCount, 0); + atomic_set(&tcpSesAllocCount, 0); + atomic_set(&tcpSesReconnectCount, 0); + atomic_set(&tconInfoReconnectCount, 0); + + atomic_set(&bufAllocCount, 0); + atomic_set(&smBufAllocCount, 0); +#ifdef CONFIG_CIFS_STATS2 + atomic_set(&totBufAllocCount, 0); + atomic_set(&totSmBufAllocCount, 0); +#endif /* CONFIG_CIFS_STATS2 */ + + atomic_set(&midCount, 0); + GlobalCurrentXid = 0; + GlobalTotalActiveXid = 0; + GlobalMaxActiveXid = 0; + spin_lock_init(&cifs_tcp_ses_lock); + spin_lock_init(&cifs_file_list_lock); + spin_lock_init(&GlobalMid_Lock); + + if (cifs_max_pending < 2) { + cifs_max_pending = 2; + cifs_dbg(FYI, "cifs_max_pending set to min of 2\n"); + } else if (cifs_max_pending > CIFS_MAX_REQ) { + cifs_max_pending = CIFS_MAX_REQ; + cifs_dbg(FYI, "cifs_max_pending set to max of %u\n", + CIFS_MAX_REQ); + } + + cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!cifsiod_wq) { + rc = -ENOMEM; + goto out_clean_proc; + } + + rc = cifs_fscache_register(); + if (rc) + goto out_destroy_wq; + + rc = cifs_init_inodecache(); + if (rc) + goto out_unreg_fscache; + + rc = cifs_init_mids(); + if (rc) + goto out_destroy_inodecache; + + rc = cifs_init_request_bufs(); + if (rc) + goto out_destroy_mids; + +#ifdef CONFIG_CIFS_UPCALL + rc = register_key_type(&cifs_spnego_key_type); + if (rc) + goto out_destroy_request_bufs; +#endif /* CONFIG_CIFS_UPCALL */ + +#ifdef CONFIG_CIFS_ACL + rc = init_cifs_idmap(); + if (rc) + goto out_register_key_type; +#endif /* CONFIG_CIFS_ACL */ + + rc = register_filesystem(&cifs_fs_type); + if (rc) + goto out_init_cifs_idmap; + + return 0; + +out_init_cifs_idmap: +#ifdef CONFIG_CIFS_ACL + exit_cifs_idmap(); +out_register_key_type: +#endif +#ifdef CONFIG_CIFS_UPCALL + unregister_key_type(&cifs_spnego_key_type); +out_destroy_request_bufs: +#endif + cifs_destroy_request_bufs(); +out_destroy_mids: + cifs_destroy_mids(); +out_destroy_inodecache: + cifs_destroy_inodecache(); +out_unreg_fscache: + cifs_fscache_unregister(); +out_destroy_wq: + destroy_workqueue(cifsiod_wq); +out_clean_proc: + cifs_proc_clean(); + return rc; +} + +static void __exit +exit_cifs(void) +{ + cifs_dbg(NOISY, "exit_cifs\n"); + unregister_filesystem(&cifs_fs_type); + cifs_dfs_release_automount_timer(); +#ifdef CONFIG_CIFS_ACL + exit_cifs_idmap(); +#endif +#ifdef CONFIG_CIFS_UPCALL + unregister_key_type(&cifs_spnego_key_type); +#endif + cifs_destroy_request_bufs(); + cifs_destroy_mids(); + cifs_destroy_inodecache(); + cifs_fscache_unregister(); + destroy_workqueue(cifsiod_wq); + cifs_proc_clean(); +} + +MODULE_AUTHOR("Steve French "); +MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */ +MODULE_DESCRIPTION + ("VFS to access servers complying with the SNIA CIFS Specification " + "e.g. Samba and Windows"); +MODULE_VERSION(CIFS_VERSION); +module_init(init_cifs) +module_exit(exit_cifs) diff --git a/kernel/fs/cifs/cifsfs.h b/kernel/fs/cifs/cifsfs.h new file mode 100644 index 000000000..252f5c158 --- /dev/null +++ b/kernel/fs/cifs/cifsfs.h @@ -0,0 +1,140 @@ +/* + * fs/cifs/cifsfs.h + * + * Copyright (c) International Business Machines Corp., 2002, 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSFS_H +#define _CIFSFS_H + +#include + +#define ROOT_I 2 + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. We use hash_64 to convert the value to 31 bits, and + * then add 1, to ensure that we don't end up with a 0 as the value. + */ +#if BITS_PER_LONG == 64 +static inline ino_t +cifs_uniqueid_to_ino_t(u64 fileid) +{ + return (ino_t)fileid; +} +#else +static inline ino_t +cifs_uniqueid_to_ino_t(u64 fileid) +{ + return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; +} +#endif + +extern struct file_system_type cifs_fs_type; +extern const struct address_space_operations cifs_addr_ops; +extern const struct address_space_operations cifs_addr_ops_smallbuf; + +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + +/* Functions related to inodes */ +extern const struct inode_operations cifs_dir_inode_ops; +extern struct inode *cifs_root_iget(struct super_block *); +extern int cifs_create(struct inode *, struct dentry *, umode_t, + bool excl); +extern int cifs_atomic_open(struct inode *, struct dentry *, + struct file *, unsigned, umode_t, + int *); +extern struct dentry *cifs_lookup(struct inode *, struct dentry *, + unsigned int); +extern int cifs_unlink(struct inode *dir, struct dentry *dentry); +extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); +extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); +extern int cifs_rmdir(struct inode *, struct dentry *); +extern int cifs_rename2(struct inode *, struct dentry *, struct inode *, + struct dentry *, unsigned int); +extern int cifs_revalidate_file_attr(struct file *filp); +extern int cifs_revalidate_dentry_attr(struct dentry *); +extern int cifs_revalidate_file(struct file *filp); +extern int cifs_revalidate_dentry(struct dentry *); +extern int cifs_invalidate_mapping(struct inode *inode); +extern int cifs_revalidate_mapping(struct inode *inode); +extern int cifs_zap_mapping(struct inode *inode); +extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int cifs_setattr(struct dentry *, struct iattr *); + +extern const struct inode_operations cifs_file_inode_ops; +extern const struct inode_operations cifs_symlink_inode_ops; +extern const struct inode_operations cifs_dfs_referral_inode_operations; + + +/* Functions related to files and directories */ +extern const struct file_operations cifs_file_ops; +extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ +extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */ +extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */ +extern const struct file_operations cifs_file_direct_nobrl_ops; +extern const struct file_operations cifs_file_strict_nobrl_ops; +extern int cifs_open(struct inode *inode, struct file *file); +extern int cifs_close(struct inode *inode, struct file *file); +extern int cifs_closedir(struct inode *inode, struct file *file); +extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to); +extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); +extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); +extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); +extern int cifs_lock(struct file *, int, struct file_lock *); +extern int cifs_fsync(struct file *, loff_t, loff_t, int); +extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); +extern int cifs_flush(struct file *, fl_owner_t id); +extern int cifs_file_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); +extern const struct file_operations cifs_dir_ops; +extern int cifs_dir_open(struct inode *inode, struct file *file); +extern int cifs_readdir(struct file *file, struct dir_context *ctx); + +/* Functions related to dir entries */ +extern const struct dentry_operations cifs_dentry_ops; +extern const struct dentry_operations cifs_ci_dentry_ops; + +#ifdef CONFIG_CIFS_DFS_UPCALL +extern struct vfsmount *cifs_dfs_d_automount(struct path *path); +#else +#define cifs_dfs_d_automount NULL +#endif + +/* Functions related to symlinks */ +extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); +extern int cifs_readlink(struct dentry *direntry, char __user *buffer, + int buflen); +extern int cifs_symlink(struct inode *inode, struct dentry *direntry, + const char *symname); +extern int cifs_removexattr(struct dentry *, const char *); +extern int cifs_setxattr(struct dentry *, const char *, const void *, + size_t, int); +extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); +extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); + +#ifdef CONFIG_CIFS_NFSD_EXPORT +extern const struct export_operations cifs_export_ops; +#endif /* CONFIG_CIFS_NFSD_EXPORT */ + +#define CIFS_VERSION "2.06" +#endif /* _CIFSFS_H */ diff --git a/kernel/fs/cifs/cifsglob.h b/kernel/fs/cifs/cifsglob.h new file mode 100644 index 000000000..22b289a3b --- /dev/null +++ b/kernel/fs/cifs/cifsglob.h @@ -0,0 +1,1620 @@ +/* + * fs/cifs/cifsglob.h + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ +#ifndef _CIFS_GLOB_H +#define _CIFS_GLOB_H + +#include +#include +#include +#include +#include +#include "cifs_fs_sb.h" +#include "cifsacl.h" +#include +#include +#include +#ifdef CONFIG_CIFS_SMB2 +#include "smb2pdu.h" +#endif + +#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ + +/* + * The sizes of various internal tables and strings + */ +#define MAX_UID_INFO 16 +#define MAX_SES_INFO 2 +#define MAX_TCON_INFO 4 + +#define MAX_TREE_SIZE (2 + CIFS_NI_MAXHOST + 1 + CIFS_MAX_SHARE_LEN + 1) + +#define CIFS_MIN_RCV_POOL 4 + +#define MAX_REOPEN_ATT 5 /* these many maximum attempts to reopen a file */ +/* + * default attribute cache timeout (jiffies) + */ +#define CIFS_DEF_ACTIMEO (1 * HZ) + +/* + * max attribute cache timeout (jiffies) - 2^30 + */ +#define CIFS_MAX_ACTIMEO (1 << 30) + +/* + * MAX_REQ is the maximum number of requests that WE will send + * on one socket concurrently. + */ +#define CIFS_MAX_REQ 32767 + +#define RFC1001_NAME_LEN 15 +#define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1) + +/* currently length of NIP6_FMT */ +#define SERVER_NAME_LENGTH 40 +#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) + +/* SMB echo "timeout" -- FIXME: tunable? */ +#define SMB_ECHO_INTERVAL (60 * HZ) + +#include "cifspdu.h" + +#ifndef XATTR_DOS_ATTRIB +#define XATTR_DOS_ATTRIB "user.DOSATTRIB" +#endif + +/* + * CIFS vfs client Status information (based on what we know.) + */ + +/* associated with each tcp and smb session */ +enum statusEnum { + CifsNew = 0, + CifsGood, + CifsExiting, + CifsNeedReconnect, + CifsNeedNegotiate +}; + +enum securityEnum { + Unspecified = 0, /* not specified */ + LANMAN, /* Legacy LANMAN auth */ + NTLM, /* Legacy NTLM012 auth with NTLM hash */ + NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ + RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ + Kerberos, /* Kerberos via SPNEGO */ +}; + +struct session_key { + unsigned int len; + char *response; +}; + +/* crypto security descriptor definition */ +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + +/* crypto hashing related structure/fields, not specific to a sec mech */ +struct cifs_secmech { + struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ + struct crypto_shash *md5; /* md5 hash function */ + struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ + struct crypto_shash *cmacaes; /* block-cipher based MAC function */ + struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ + struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ + struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ + struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ +}; + +/* per smb session structure/fields */ +struct ntlmssp_auth { + bool sesskey_per_smbsess; /* whether session key is per smb session */ + __u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */ + __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */ + unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */ +}; + +struct cifs_cred { + int uid; + int gid; + int mode; + int cecount; + struct cifs_sid osid; + struct cifs_sid gsid; + struct cifs_ntace *ntaces; + struct cifs_ace *aces; +}; + +/* + ***************************************************************** + * Except the CIFS PDUs themselves all the + * globally interesting structs should go here + ***************************************************************** + */ + +/* + * A smb_rqst represents a complete request to be issued to a server. It's + * formed by a kvec array, followed by an array of pages. Page data is assumed + * to start at the beginning of the first page. + */ +struct smb_rqst { + struct kvec *rq_iov; /* array of kvecs */ + unsigned int rq_nvec; /* number of kvecs in array */ + struct page **rq_pages; /* pointer to array of page ptrs */ + unsigned int rq_npages; /* number pages in array */ + unsigned int rq_pagesz; /* page size to use */ + unsigned int rq_tailsz; /* length of last page */ +}; + +enum smb_version { + Smb_1 = 1, + Smb_20, + Smb_21, + Smb_30, + Smb_302, +}; + +struct mid_q_entry; +struct TCP_Server_Info; +struct cifsFileInfo; +struct cifs_ses; +struct cifs_tcon; +struct dfs_info3_param; +struct cifs_fattr; +struct smb_vol; +struct cifs_fid; +struct cifs_readdata; +struct cifs_writedata; +struct cifs_io_parms; +struct cifs_search_info; +struct cifsInodeInfo; +struct cifs_open_parms; + +struct smb_version_operations { + int (*send_cancel)(struct TCP_Server_Info *, void *, + struct mid_q_entry *); + bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *); + /* setup request: allocate mid, sign message */ + struct mid_q_entry *(*setup_request)(struct cifs_ses *, + struct smb_rqst *); + /* setup async request: allocate mid, sign message */ + struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *, + struct smb_rqst *); + /* check response: verify signature, map error */ + int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *, + bool); + void (*add_credits)(struct TCP_Server_Info *, const unsigned int, + const int); + void (*set_credits)(struct TCP_Server_Info *, const int); + int * (*get_credits_field)(struct TCP_Server_Info *, const int); + unsigned int (*get_credits)(struct mid_q_entry *); + __u64 (*get_next_mid)(struct TCP_Server_Info *); + /* data offset from read response message */ + unsigned int (*read_data_offset)(char *); + /* data length from read response message */ + unsigned int (*read_data_length)(char *); + /* map smb to linux error */ + int (*map_error)(char *, bool); + /* find mid corresponding to the response message */ + struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *); + void (*dump_detail)(void *); + void (*clear_stats)(struct cifs_tcon *); + void (*print_stats)(struct seq_file *m, struct cifs_tcon *); + void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); + /* verify the message */ + int (*check_message)(char *, unsigned int); + bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); + /* process transaction2 response */ + bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, + char *, int); + /* check if we need to negotiate */ + bool (*need_neg)(struct TCP_Server_Info *); + /* negotiate to the server */ + int (*negotiate)(const unsigned int, struct cifs_ses *); + /* set negotiated write size */ + unsigned int (*negotiate_wsize)(struct cifs_tcon *, struct smb_vol *); + /* set negotiated read size */ + unsigned int (*negotiate_rsize)(struct cifs_tcon *, struct smb_vol *); + /* setup smb sessionn */ + int (*sess_setup)(const unsigned int, struct cifs_ses *, + const struct nls_table *); + /* close smb session */ + int (*logoff)(const unsigned int, struct cifs_ses *); + /* connect to a server share */ + int (*tree_connect)(const unsigned int, struct cifs_ses *, const char *, + struct cifs_tcon *, const struct nls_table *); + /* close tree connecion */ + int (*tree_disconnect)(const unsigned int, struct cifs_tcon *); + /* get DFS referrals */ + int (*get_dfs_refer)(const unsigned int, struct cifs_ses *, + const char *, struct dfs_info3_param **, + unsigned int *, const struct nls_table *, int); + /* informational QFS call */ + void (*qfs_tcon)(const unsigned int, struct cifs_tcon *); + /* check if a path is accessible or not */ + int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const char *); + /* query path data from the server */ + int (*query_path_info)(const unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const char *, + FILE_ALL_INFO *, bool *, bool *); + /* query file data from the server */ + int (*query_file_info)(const unsigned int, struct cifs_tcon *, + struct cifs_fid *, FILE_ALL_INFO *); + /* get server index number */ + int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const char *, + u64 *uniqueid, FILE_ALL_INFO *); + /* set size by path */ + int (*set_path_size)(const unsigned int, struct cifs_tcon *, + const char *, __u64, struct cifs_sb_info *, bool); + /* set size by file handle */ + int (*set_file_size)(const unsigned int, struct cifs_tcon *, + struct cifsFileInfo *, __u64, bool); + /* set attributes */ + int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, + const unsigned int); + int (*set_compression)(const unsigned int, struct cifs_tcon *, + struct cifsFileInfo *); + /* check if we can send an echo or nor */ + bool (*can_echo)(struct TCP_Server_Info *); + /* send echo request */ + int (*echo)(struct TCP_Server_Info *); + /* create directory */ + int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, + struct cifs_sb_info *); + /* set info on created directory */ + void (*mkdir_setinfo)(struct inode *, const char *, + struct cifs_sb_info *, struct cifs_tcon *, + const unsigned int); + /* remove directory */ + int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *, + struct cifs_sb_info *); + /* unlink file */ + int (*unlink)(const unsigned int, struct cifs_tcon *, const char *, + struct cifs_sb_info *); + /* open, rename and delete file */ + int (*rename_pending_delete)(const char *, struct dentry *, + const unsigned int); + /* send rename request */ + int (*rename)(const unsigned int, struct cifs_tcon *, const char *, + const char *, struct cifs_sb_info *); + /* send create hardlink request */ + int (*create_hardlink)(const unsigned int, struct cifs_tcon *, + const char *, const char *, + struct cifs_sb_info *); + /* query symlink target */ + int (*query_symlink)(const unsigned int, struct cifs_tcon *, + const char *, char **, struct cifs_sb_info *); + /* open a file for non-posix mounts */ + int (*open)(const unsigned int, struct cifs_open_parms *, + __u32 *, FILE_ALL_INFO *); + /* set fid protocol-specific info */ + void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); + /* close a file */ + void (*close)(const unsigned int, struct cifs_tcon *, + struct cifs_fid *); + /* send a flush request to the server */ + int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); + /* async read from the server */ + int (*async_readv)(struct cifs_readdata *); + /* async write to the server */ + int (*async_writev)(struct cifs_writedata *, + void (*release)(struct kref *)); + /* sync read from the server */ + int (*sync_read)(const unsigned int, struct cifs_fid *, + struct cifs_io_parms *, unsigned int *, char **, + int *); + /* sync write to the server */ + int (*sync_write)(const unsigned int, struct cifs_fid *, + struct cifs_io_parms *, unsigned int *, struct kvec *, + unsigned long); + /* open dir, start readdir */ + int (*query_dir_first)(const unsigned int, struct cifs_tcon *, + const char *, struct cifs_sb_info *, + struct cifs_fid *, __u16, + struct cifs_search_info *); + /* continue readdir */ + int (*query_dir_next)(const unsigned int, struct cifs_tcon *, + struct cifs_fid *, + __u16, struct cifs_search_info *srch_inf); + /* close dir */ + int (*close_dir)(const unsigned int, struct cifs_tcon *, + struct cifs_fid *); + /* calculate a size of SMB message */ + unsigned int (*calc_smb_size)(void *); + /* check for STATUS_PENDING and process it in a positive case */ + bool (*is_status_pending)(char *, struct TCP_Server_Info *, int); + /* send oplock break response */ + int (*oplock_response)(struct cifs_tcon *, struct cifs_fid *, + struct cifsInodeInfo *); + /* query remote filesystem */ + int (*queryfs)(const unsigned int, struct cifs_tcon *, + struct kstatfs *); + /* send mandatory brlock to the server */ + int (*mand_lock)(const unsigned int, struct cifsFileInfo *, __u64, + __u64, __u32, int, int, bool); + /* unlock range of mandatory locks */ + int (*mand_unlock_range)(struct cifsFileInfo *, struct file_lock *, + const unsigned int); + /* push brlocks from the cache to the server */ + int (*push_mand_locks)(struct cifsFileInfo *); + /* get lease key of the inode */ + void (*get_lease_key)(struct inode *, struct cifs_fid *); + /* set lease key of the inode */ + void (*set_lease_key)(struct inode *, struct cifs_fid *); + /* generate new lease key */ + void (*new_lease_key)(struct cifs_fid *); + int (*generate_signingkey)(struct cifs_ses *); + int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); + int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const unsigned char *, + char *, unsigned int *); + int (*create_mf_symlink)(unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const unsigned char *, + char *, unsigned int *); + /* if we can do cache read operations */ + bool (*is_read_op)(__u32); + /* set oplock level for the inode */ + void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int, + bool *); + /* create lease context buffer for CREATE request */ + char * (*create_lease_buf)(u8 *, u8); + /* parse lease context buffer and return oplock/epoch info */ + __u8 (*parse_lease_buf)(void *, unsigned int *); + int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file, + struct cifsFileInfo *target_file, u64 src_off, u64 len, + u64 dest_off); + int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); + ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, + const unsigned char *, const unsigned char *, char *, + size_t, const struct nls_table *, int); + int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, + const char *, const void *, const __u16, + const struct nls_table *, int); + struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *, + const char *, u32 *); + struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *, + const struct cifs_fid *, u32 *); + int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, + int); + /* writepages retry size */ + unsigned int (*wp_retry_size)(struct inode *); + /* get mtu credits */ + int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, + unsigned int *, unsigned int *); + /* check if we need to issue closedir */ + bool (*dir_needs_close)(struct cifsFileInfo *); + long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t, + loff_t); +}; + +struct smb_version_values { + char *version_string; + __u16 protocol_id; + __u32 req_capabilities; + __u32 large_lock_type; + __u32 exclusive_lock_type; + __u32 shared_lock_type; + __u32 unlock_lock_type; + size_t header_size; + size_t max_header_size; + size_t read_rsp_size; + __le16 lock_cmd; + unsigned int cap_unix; + unsigned int cap_nt_find; + unsigned int cap_large_files; + __u16 signing_enabled; + __u16 signing_required; + size_t create_lease_size; +}; + +#define HEADER_SIZE(server) (server->vals->header_size) +#define MAX_HEADER_SIZE(server) (server->vals->max_header_size) + +struct smb_vol { + char *username; + char *password; + char *domainname; + char *UNC; + char *iocharset; /* local code page for mapping to and from Unicode */ + char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ + char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ + kuid_t cred_uid; + kuid_t linux_uid; + kgid_t linux_gid; + kuid_t backupuid; + kgid_t backupgid; + umode_t file_mode; + umode_t dir_mode; + enum securityEnum sectype; /* sectype requested via mnt opts */ + bool sign; /* was signing requested via mnt opts? */ + bool retry:1; + bool intr:1; + bool setuids:1; + bool override_uid:1; + bool override_gid:1; + bool dynperm:1; + bool noperm:1; + bool no_psx_acl:1; /* set if posix acl support should be disabled */ + bool cifs_acl:1; + bool backupuid_specified; /* mount option backupuid is specified */ + bool backupgid_specified; /* mount option backupgid is specified */ + bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ + bool server_ino:1; /* use inode numbers from server ie UniqueId */ + bool direct_io:1; + bool strict_io:1; /* strict cache behavior */ + bool remap:1; /* set to remap seven reserved chars in filenames */ + bool sfu_remap:1; /* remap seven reserved chars ala SFU */ + bool posix_paths:1; /* unset to not ask for posix pathnames. */ + bool no_linux_ext:1; + bool sfu_emul:1; + bool nullauth:1; /* attempt to authenticate with null user */ + bool nocase:1; /* request case insensitive filenames */ + bool nobrl:1; /* disable sending byte range locks to srv */ + bool mand_lock:1; /* send mandatory not posix byte range lock reqs */ + bool seal:1; /* request transport encryption on share */ + bool nodfs:1; /* Do not request DFS, even if available */ + bool local_lease:1; /* check leases only on local system, not remote */ + bool noblocksnd:1; + bool noautotune:1; + bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ + bool fsc:1; /* enable fscache */ + bool mfsymlinks:1; /* use Minshall+French Symlinks */ + bool multiuser:1; + bool rwpidforward:1; /* pid forward for read/write operations */ + bool nosharesock; + unsigned int rsize; + unsigned int wsize; + bool sockopt_tcp_nodelay:1; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ + struct smb_version_operations *ops; + struct smb_version_values *vals; + char *prepath; + struct sockaddr_storage dstaddr; /* destination address */ + struct sockaddr_storage srcaddr; /* allow binding to a local IP */ + struct nls_table *local_nls; +}; + +#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ + CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \ + CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \ + CIFS_MOUNT_MAP_SFM_CHR | \ + CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \ + CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \ + CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ + CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ + CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ + CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ + CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) + +#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ + MS_NODEV | MS_SYNCHRONOUS) + +struct cifs_mnt_data { + struct cifs_sb_info *cifs_sb; + struct smb_vol *vol; + int flags; +}; + +static inline unsigned int +get_rfc1002_length(void *buf) +{ + return be32_to_cpu(*((__be32 *)buf)) & 0xffffff; +} + +static inline void +inc_rfc1001_len(void *buf, int count) +{ + be32_add_cpu((__be32 *)buf, count); +} + +struct TCP_Server_Info { + struct list_head tcp_ses_list; + struct list_head smb_ses_list; + int srv_count; /* reference counter */ + /* 15 character server name + 0x20 16th byte indicating type = srv */ + char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; + struct smb_version_operations *ops; + struct smb_version_values *vals; + enum statusEnum tcpStatus; /* what we think the status is */ + char *hostname; /* hostname portion of UNC string */ + struct socket *ssocket; + struct sockaddr_storage dstaddr; + struct sockaddr_storage srcaddr; /* locally bind to this IP */ +#ifdef CONFIG_NET_NS + struct net *net; +#endif + wait_queue_head_t response_q; + wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ + struct list_head pending_mid_q; + bool noblocksnd; /* use blocking sendmsg */ + bool noautotune; /* do not autotune send buf sizes */ + bool tcp_nodelay; + int credits; /* send no more requests at once */ + unsigned int in_flight; /* number of requests on the wire to server */ + spinlock_t req_lock; /* protect the two values above */ + struct mutex srv_mutex; + struct task_struct *tsk; + char server_GUID[16]; + __u16 sec_mode; + bool sign; /* is signing enabled on this connection? */ + bool session_estab; /* mark when very first sess is established */ +#ifdef CONFIG_CIFS_SMB2 + int echo_credits; /* echo reserved slots */ + int oplock_credits; /* oplock break reserved slots */ + bool echoes:1; /* enable echoes */ + __u8 client_guid[SMB2_CLIENT_GUID_SIZE]; /* Client GUID */ +#endif + u16 dialect; /* dialect index that server chose */ + bool oplocks:1; /* enable oplocks */ + unsigned int maxReq; /* Clients should submit no more */ + /* than maxReq distinct unanswered SMBs to the server when using */ + /* multiplexed reads or writes */ + unsigned int maxBuf; /* maxBuf specifies the maximum */ + /* message size the server can send or receive for non-raw SMBs */ + /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */ + /* when socket is setup (and during reconnect) before NegProt sent */ + unsigned int max_rw; /* maxRw specifies the maximum */ + /* message size the server can send or receive for */ + /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ + unsigned int capabilities; /* selective disabling of caps by smb sess */ + int timeAdj; /* Adjust for difference in server time zone in sec */ + __u64 CurrentMid; /* multiplex id - rotating counter */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ + /* 16th byte of RFC1001 workstation name is always null */ + char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; + __u32 sequence_number; /* for signing, protected by srv_mutex */ + struct session_key session_key; + unsigned long lstrp; /* when we got last response from this server */ + struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ +#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */ +#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */ +#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */ + char negflavor; /* NEGOTIATE response flavor */ + /* extended security flavors that server supports */ + bool sec_ntlmssp; /* supports NTLMSSP */ + bool sec_kerberosu2u; /* supports U2U Kerberos */ + bool sec_kerberos; /* supports plain Kerberos */ + bool sec_mskerberos; /* supports legacy MS Kerberos */ + bool large_buf; /* is current buffer large? */ + struct delayed_work echo; /* echo ping workqueue job */ + struct kvec *iov; /* reusable kvec array for receives */ + unsigned int nr_iov; /* number of kvecs in array */ + char *smallbuf; /* pointer to current "small" buffer */ + char *bigbuf; /* pointer to current "big" buffer */ + unsigned int total_read; /* total amount of data read in this pass */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; /* client index cache cookie */ +#endif +#ifdef CONFIG_CIFS_STATS2 + atomic_t in_send; /* requests trying to send */ + atomic_t num_waiters; /* blocked waiting to get in sendrecv */ +#endif +#ifdef CONFIG_CIFS_SMB2 + unsigned int max_read; + unsigned int max_write; +#endif /* CONFIG_CIFS_SMB2 */ +}; + +static inline unsigned int +in_flight(struct TCP_Server_Info *server) +{ + unsigned int num; + spin_lock(&server->req_lock); + num = server->in_flight; + spin_unlock(&server->req_lock); + return num; +} + +static inline bool +has_credits(struct TCP_Server_Info *server, int *credits) +{ + int num; + spin_lock(&server->req_lock); + num = *credits; + spin_unlock(&server->req_lock); + return num > 0; +} + +static inline void +add_credits(struct TCP_Server_Info *server, const unsigned int add, + const int optype) +{ + server->ops->add_credits(server, add, optype); +} + +static inline void +add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add, + const int optype) +{ + if (add) { + server->ops->add_credits(server, add, optype); + wake_up(&server->request_q); + } +} + +static inline void +set_credits(struct TCP_Server_Info *server, const int val) +{ + server->ops->set_credits(server, val); +} + +static inline __le64 +get_next_mid64(struct TCP_Server_Info *server) +{ + return cpu_to_le64(server->ops->get_next_mid(server)); +} + +static inline __le16 +get_next_mid(struct TCP_Server_Info *server) +{ + __u16 mid = server->ops->get_next_mid(server); + /* + * The value in the SMB header should be little endian for easy + * on-the-wire decoding. + */ + return cpu_to_le16(mid); +} + +static inline __u16 +get_mid(const struct smb_hdr *smb) +{ + return le16_to_cpu(smb->Mid); +} + +static inline bool +compare_mid(__u16 mid, const struct smb_hdr *smb) +{ + return mid == le16_to_cpu(smb->Mid); +} + +/* + * When the server supports very large reads and writes via POSIX extensions, + * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not + * including the RFC1001 length. + * + * Note that this might make for "interesting" allocation problems during + * writeback however as we have to allocate an array of pointers for the + * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * + * For reads, there is a similar problem as we need to allocate an array + * of kvecs to handle the receive, though that should only need to be done + * once. + */ +#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) + +/* + * When the server doesn't allow large posix writes, only allow a rsize/wsize + * of 2^17-1 minus the size of the call header. That allows for a read or + * write up to the maximum size described by RFC1002. + */ +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) + +/* + * The default wsize is 1M. find_get_pages seems to return a maximum of 256 + * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill + * a single wsize request with a single call. + */ +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +/* + * Windows only supports a max of 60kb reads and 65535 byte writes. Default to + * those values when posix extensions aren't in force. In actuality here, we + * use 65536 to allow for a write that is a multiple of 4k. Most servers seem + * to be ok with the extra byte even though Windows doesn't send writes that + * are that large. + * + * Citation: + * + * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx + */ +#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) +#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536) + +/* + * Macros to allow the TCP_Server_Info->net field and related code to drop out + * when CONFIG_NET_NS isn't set. + */ + +#ifdef CONFIG_NET_NS + +static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv) +{ + return srv->net; +} + +static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) +{ + srv->net = net; +} + +#else + +static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv) +{ + return &init_net; +} + +static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) +{ +} + +#endif + +/* + * Session structure. One of these for each uid session with a particular host + */ +struct cifs_ses { + struct list_head smb_ses_list; + struct list_head tcon_list; + struct mutex session_mutex; + struct TCP_Server_Info *server; /* pointer to server info */ + int ses_count; /* reference counter */ + enum statusEnum status; + unsigned overrideSecFlg; /* if non-zero override global sec flags */ + __u16 ipc_tid; /* special tid for connection to IPC share */ + char *serverOS; /* name of operating system underlying server */ + char *serverNOS; /* name of network operating system of server */ + char *serverDomain; /* security realm of server */ + __u64 Suid; /* remote smb uid */ + kuid_t linux_uid; /* overriding owner of files on the mount */ + kuid_t cred_uid; /* owner of credentials */ + unsigned int capabilities; + char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for + TCP names - will ipv6 and sctp addresses fit? */ + char *user_name; /* must not be null except during init of sess + and after mount option parsing we fill it */ + char *domainName; + char *password; + struct session_key auth_key; + struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ + enum securityEnum sectype; /* what security flavor was specified? */ + bool sign; /* is signing required? */ + bool need_reconnect:1; /* connection reset, uid now invalid */ +#ifdef CONFIG_CIFS_SMB2 + __u16 session_flags; + char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ +#endif /* CONFIG_CIFS_SMB2 */ +}; + +static inline bool +cap_unix(struct cifs_ses *ses) +{ + return ses->server->vals->cap_unix & ses->capabilities; +} + +/* + * there is one of these for each connection to a resource on a particular + * session + */ +struct cifs_tcon { + struct list_head tcon_list; + int tc_count; + struct list_head openFileList; + struct cifs_ses *ses; /* pointer to session associated with */ + char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ + char *nativeFileSystem; + char *password; /* for share-level security */ + __u32 tid; /* The 4 byte tree id */ + __u16 Flags; /* optional support bits */ + enum statusEnum tidStatus; +#ifdef CONFIG_CIFS_STATS + atomic_t num_smbs_sent; + union { + struct { + atomic_t num_writes; + atomic_t num_reads; + atomic_t num_flushes; + atomic_t num_oplock_brks; + atomic_t num_opens; + atomic_t num_closes; + atomic_t num_deletes; + atomic_t num_mkdirs; + atomic_t num_posixopens; + atomic_t num_posixmkdirs; + atomic_t num_rmdirs; + atomic_t num_renames; + atomic_t num_t2renames; + atomic_t num_ffirst; + atomic_t num_fnext; + atomic_t num_fclose; + atomic_t num_hardlinks; + atomic_t num_symlinks; + atomic_t num_locks; + atomic_t num_acl_get; + atomic_t num_acl_set; + } cifs_stats; +#ifdef CONFIG_CIFS_SMB2 + struct { + atomic_t smb2_com_sent[NUMBER_OF_SMB2_COMMANDS]; + atomic_t smb2_com_failed[NUMBER_OF_SMB2_COMMANDS]; + } smb2_stats; +#endif /* CONFIG_CIFS_SMB2 */ + } stats; +#ifdef CONFIG_CIFS_STATS2 + unsigned long long time_writes; + unsigned long long time_reads; + unsigned long long time_opens; + unsigned long long time_deletes; + unsigned long long time_closes; + unsigned long long time_mkdirs; + unsigned long long time_rmdirs; + unsigned long long time_renames; + unsigned long long time_t2renames; + unsigned long long time_ffirst; + unsigned long long time_fnext; + unsigned long long time_fclose; +#endif /* CONFIG_CIFS_STATS2 */ + __u64 bytes_read; + __u64 bytes_written; + spinlock_t stat_lock; +#endif /* CONFIG_CIFS_STATS */ + FILE_SYSTEM_DEVICE_INFO fsDevInfo; + FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ + FILE_SYSTEM_UNIX_INFO fsUnixInfo; + bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */ + bool retry:1; + bool nocase:1; + bool seal:1; /* transport encryption for this mounted share */ + bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol + for this mount even if server would support */ + bool local_lease:1; /* check leases (only) on local system not remote */ + bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ + bool broken_sparse_sup; /* if server or share does not support sparse */ + bool need_reconnect:1; /* connection reset, tid now invalid */ +#ifdef CONFIG_CIFS_SMB2 + bool print:1; /* set if connection to printer share */ + bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ + __le32 capabilities; + __u32 share_flags; + __u32 maximal_access; + __u32 vol_serial_number; + __le64 vol_create_time; + __u32 ss_flags; /* sector size flags */ + __u32 perf_sector_size; /* best sector size for perf */ + __u32 max_chunks; + __u32 max_bytes_chunk; + __u32 max_bytes_copy; +#endif /* CONFIG_CIFS_SMB2 */ +#ifdef CONFIG_CIFS_FSCACHE + u64 resource_id; /* server resource id */ + struct fscache_cookie *fscache; /* cookie for share */ +#endif + struct list_head pending_opens; /* list of incomplete opens */ + /* BB add field for back pointer to sb struct(s)? */ +}; + +/* + * This is a refcounted and timestamped container for a tcon pointer. The + * container holds a tcon reference. It is considered safe to free one of + * these when the tl_count goes to 0. The tl_time is the time of the last + * "get" on the container. + */ +struct tcon_link { + struct rb_node tl_rbnode; + kuid_t tl_uid; + unsigned long tl_flags; +#define TCON_LINK_MASTER 0 +#define TCON_LINK_PENDING 1 +#define TCON_LINK_IN_TREE 2 + unsigned long tl_time; + atomic_t tl_count; + struct cifs_tcon *tl_tcon; +}; + +extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb); + +static inline struct cifs_tcon * +tlink_tcon(struct tcon_link *tlink) +{ + return tlink->tl_tcon; +} + +extern void cifs_put_tlink(struct tcon_link *tlink); + +static inline struct tcon_link * +cifs_get_tlink(struct tcon_link *tlink) +{ + if (tlink && !IS_ERR(tlink)) + atomic_inc(&tlink->tl_count); + return tlink; +} + +/* This function is always expected to succeed */ +extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); + +#define CIFS_OPLOCK_NO_CHANGE 0xfe + +struct cifs_pending_open { + struct list_head olist; + struct tcon_link *tlink; + __u8 lease_key[16]; + __u32 oplock; +}; + +/* + * This info hangs off the cifsFileInfo structure, pointed to by llist. + * This is used to track byte stream locks on the file + */ +struct cifsLockInfo { + struct list_head llist; /* pointer to next cifsLockInfo */ + struct list_head blist; /* pointer to locks blocked on this */ + wait_queue_head_t block_q; + __u64 offset; + __u64 length; + __u32 pid; + __u32 type; +}; + +/* + * One of these for each open instance of a file + */ +struct cifs_search_info { + loff_t index_of_last_entry; + __u16 entries_in_buffer; + __u16 info_level; + __u32 resume_key; + char *ntwrk_buf_start; + char *srch_entries_start; + char *last_entry; + const char *presume_name; + unsigned int resume_name_len; + bool endOfSearch:1; + bool emptyDir:1; + bool unicode:1; + bool smallBuf:1; /* so we know which buf_release function to call */ +}; + +struct cifs_open_parms { + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + int disposition; + int desired_access; + int create_options; + const char *path; + struct cifs_fid *fid; + bool reconnect:1; +}; + +struct cifs_fid { + __u16 netfid; +#ifdef CONFIG_CIFS_SMB2 + __u64 persistent_fid; /* persist file id for smb2 */ + __u64 volatile_fid; /* volatile file id for smb2 */ + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ +#endif + struct cifs_pending_open *pending_open; + unsigned int epoch; + bool purge_cache; +}; + +struct cifs_fid_locks { + struct list_head llist; + struct cifsFileInfo *cfile; /* fid that owns locks */ + struct list_head locks; /* locks held by fid above */ +}; + +struct cifsFileInfo { + struct list_head tlist; /* pointer to next fid owned by tcon */ + struct list_head flist; /* next fid (file instance) for this inode */ + struct cifs_fid_locks *llist; /* brlocks held by this fid */ + kuid_t uid; /* allows finding which FileInfo structure */ + __u32 pid; /* process id who opened file */ + struct cifs_fid fid; /* file id from remote */ + /* BB add lock scope info here if needed */ ; + /* lock scope id (0 if none) */ + struct dentry *dentry; + unsigned int f_flags; + struct tcon_link *tlink; + bool invalidHandle:1; /* file closed via session abend */ + bool oplock_break_cancelled:1; + int count; /* refcount protected by cifs_file_list_lock */ + struct mutex fh_mutex; /* prevents reopen race after dead ses*/ + struct cifs_search_info srch_inf; + struct work_struct oplock_break; /* work for oplock breaks */ +}; + +struct cifs_io_parms { + __u16 netfid; +#ifdef CONFIG_CIFS_SMB2 + __u64 persistent_fid; /* persist file id for smb2 */ + __u64 volatile_fid; /* volatile file id for smb2 */ +#endif + __u32 pid; + __u64 offset; + unsigned int length; + struct cifs_tcon *tcon; +}; + +struct cifs_readdata; + +/* asynchronous read support */ +struct cifs_readdata { + struct kref refcount; + struct list_head list; + struct completion done; + struct cifsFileInfo *cfile; + struct address_space *mapping; + __u64 offset; + unsigned int bytes; + unsigned int got_bytes; + pid_t pid; + int result; + struct work_struct work; + int (*read_into_pages)(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, + unsigned int len); + struct kvec iov; + unsigned int pagesz; + unsigned int tailsz; + unsigned int credits; + unsigned int nr_pages; + struct page *pages[]; +}; + +struct cifs_writedata; + +/* asynchronous write support */ +struct cifs_writedata { + struct kref refcount; + struct list_head list; + struct completion done; + enum writeback_sync_modes sync_mode; + struct work_struct work; + struct cifsFileInfo *cfile; + __u64 offset; + pid_t pid; + unsigned int bytes; + int result; + unsigned int pagesz; + unsigned int tailsz; + unsigned int credits; + unsigned int nr_pages; + struct page *pages[]; +}; + +/* + * Take a reference on the file private data. Must be called with + * cifs_file_list_lock held. + */ +static inline void +cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) +{ + ++cifs_file->count; +} + +struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); +void cifsFileInfo_put(struct cifsFileInfo *cifs_file); + +#define CIFS_CACHE_READ_FLG 1 +#define CIFS_CACHE_HANDLE_FLG 2 +#define CIFS_CACHE_RH_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_HANDLE_FLG) +#define CIFS_CACHE_WRITE_FLG 4 +#define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG) +#define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG) + +#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG) +#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG) +#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG) + +/* + * One of these for each file inode + */ + +struct cifsInodeInfo { + bool can_cache_brlcks; + struct list_head llist; /* locks helb by this inode */ + struct rw_semaphore lock_sem; /* protect the fields above */ + /* BB add in lists for dirty pages i.e. write caching info for oplock */ + struct list_head openFileList; + __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ + unsigned int oplock; /* oplock/lease level we have */ + unsigned int epoch; /* used to track lease state changes */ +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ +#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ +#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ +#define CIFS_INO_LOCK (5) /* lock bit for synchronization */ + unsigned long flags; + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ + unsigned long time; /* jiffies of last update of inode */ + u64 server_eof; /* current file size on server -- protected by i_lock */ + u64 uniqueid; /* server inode number */ + u64 createtime; /* creation time on server */ +#ifdef CONFIG_CIFS_SMB2 + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */ +#endif +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; +#endif + struct inode vfs_inode; +}; + +static inline struct cifsInodeInfo * +CIFS_I(struct inode *inode) +{ + return container_of(inode, struct cifsInodeInfo, vfs_inode); +} + +static inline struct cifs_sb_info * +CIFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct cifs_sb_info * +CIFS_FILE_SB(struct file *file) +{ + return CIFS_SB(file_inode(file)->i_sb); +} + +static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + return '/'; + else + return '\\'; +} + +static inline void +convert_delimiter(char *path, char delim) +{ + char old_delim, *pos; + + if (delim == '/') + old_delim = '\\'; + else + old_delim = '/'; + + pos = path; + while ((pos = strchr(pos, old_delim))) + *pos = delim; +} + +#ifdef CONFIG_CIFS_STATS +#define cifs_stats_inc atomic_inc + +static inline void cifs_stats_bytes_written(struct cifs_tcon *tcon, + unsigned int bytes) +{ + if (bytes) { + spin_lock(&tcon->stat_lock); + tcon->bytes_written += bytes; + spin_unlock(&tcon->stat_lock); + } +} + +static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, + unsigned int bytes) +{ + spin_lock(&tcon->stat_lock); + tcon->bytes_read += bytes; + spin_unlock(&tcon->stat_lock); +} +#else + +#define cifs_stats_inc(field) do {} while (0) +#define cifs_stats_bytes_written(tcon, bytes) do {} while (0) +#define cifs_stats_bytes_read(tcon, bytes) do {} while (0) + +#endif + + +/* + * This is the prototype for the mid receive function. This function is for + * receiving the rest of the SMB frame, starting with the WordCount (which is + * just after the MID in struct smb_hdr). Note: + * + * - This will be called by cifsd, with no locks held. + * - The mid will still be on the pending_mid_q. + * - mid->resp_buf will point to the current buffer. + * + * Returns zero on a successful receive, or an error. The receive state in + * the TCP_Server_Info will also be updated. + */ +typedef int (mid_receive_t)(struct TCP_Server_Info *server, + struct mid_q_entry *mid); + +/* + * This is the prototype for the mid callback function. This is called once the + * mid has been received off of the socket. When creating one, take special + * care to avoid deadlocks. Things to bear in mind: + * + * - it will be called by cifsd, with no locks held + * - the mid will be removed from any lists + */ +typedef void (mid_callback_t)(struct mid_q_entry *mid); + +/* one of these for every pending CIFS request to the server */ +struct mid_q_entry { + struct list_head qhead; /* mids waiting on reply from this server */ + struct TCP_Server_Info *server; /* server corresponding to this mid */ + __u64 mid; /* multiplex id */ + __u32 pid; /* process id */ + __u32 sequence_number; /* for CIFS signing */ + unsigned long when_alloc; /* when mid was created */ +#ifdef CONFIG_CIFS_STATS2 + unsigned long when_sent; /* time when smb send finished */ + unsigned long when_received; /* when demux complete (taken off wire) */ +#endif + mid_receive_t *receive; /* call receive callback */ + mid_callback_t *callback; /* call completion callback */ + void *callback_data; /* general purpose pointer for callback */ + void *resp_buf; /* pointer to received SMB header */ + int mid_state; /* wish this were enum but can not pass to wait_event */ + __le16 command; /* smb command code */ + bool large_buf:1; /* if valid response, is pointer to large buf */ + bool multiRsp:1; /* multiple trans2 responses for one request */ + bool multiEnd:1; /* both received */ +}; + +/* Make code in transport.c a little cleaner by moving + update of optional stats into function below */ +#ifdef CONFIG_CIFS_STATS2 + +static inline void cifs_in_send_inc(struct TCP_Server_Info *server) +{ + atomic_inc(&server->in_send); +} + +static inline void cifs_in_send_dec(struct TCP_Server_Info *server) +{ + atomic_dec(&server->in_send); +} + +static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) +{ + atomic_inc(&server->num_waiters); +} + +static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) +{ + atomic_dec(&server->num_waiters); +} + +static inline void cifs_save_when_sent(struct mid_q_entry *mid) +{ + mid->when_sent = jiffies; +} +#else +static inline void cifs_in_send_inc(struct TCP_Server_Info *server) +{ +} +static inline void cifs_in_send_dec(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_save_when_sent(struct mid_q_entry *mid) +{ +} +#endif + +/* for pending dnotify requests */ +struct dir_notify_req { + struct list_head lhead; + __le16 Pid; + __le16 PidHigh; + __u16 Mid; + __u16 Tid; + __u16 Uid; + __u16 netfid; + __u32 filter; /* CompletionFilter (for multishot) */ + int multishot; + struct file *pfile; +}; + +struct dfs_info3_param { + int flags; /* DFSREF_REFERRAL_SERVER, DFSREF_STORAGE_SERVER*/ + int path_consumed; + int server_type; + int ref_flag; + char *path_name; + char *node_name; +}; + +/* + * common struct for holding inode info when searching for or updating an + * inode with new info + */ + +#define CIFS_FATTR_DFS_REFERRAL 0x1 +#define CIFS_FATTR_DELETE_PENDING 0x2 +#define CIFS_FATTR_NEED_REVAL 0x4 +#define CIFS_FATTR_INO_COLLISION 0x8 +#define CIFS_FATTR_UNKNOWN_NLINK 0x10 + +struct cifs_fattr { + u32 cf_flags; + u32 cf_cifsattrs; + u64 cf_uniqueid; + u64 cf_eof; + u64 cf_bytes; + u64 cf_createtime; + kuid_t cf_uid; + kgid_t cf_gid; + umode_t cf_mode; + dev_t cf_rdev; + unsigned int cf_nlink; + unsigned int cf_dtype; + struct timespec cf_atime; + struct timespec cf_mtime; + struct timespec cf_ctime; +}; + +static inline void free_dfs_info_param(struct dfs_info3_param *param) +{ + if (param) { + kfree(param->path_name); + kfree(param->node_name); + kfree(param); + } +} + +static inline void free_dfs_info_array(struct dfs_info3_param *param, + int number_of_items) +{ + int i; + if ((number_of_items == 0) || (param == NULL)) + return; + for (i = 0; i < number_of_items; i++) { + kfree(param[i].path_name); + kfree(param[i].node_name); + } + kfree(param); +} + +#define MID_FREE 0 +#define MID_REQUEST_ALLOCATED 1 +#define MID_REQUEST_SUBMITTED 2 +#define MID_RESPONSE_RECEIVED 4 +#define MID_RETRY_NEEDED 8 /* session closed while this request out */ +#define MID_RESPONSE_MALFORMED 0x10 +#define MID_SHUTDOWN 0x20 + +/* Types of response buffer returned from SendReceive2 */ +#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ +#define CIFS_SMALL_BUFFER 1 +#define CIFS_LARGE_BUFFER 2 +#define CIFS_IOVEC 4 /* array of response buffers */ + +/* Type of Request to SendReceive2 */ +#define CIFS_BLOCKING_OP 1 /* operation can block */ +#define CIFS_ASYNC_OP 2 /* do not wait for response */ +#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */ +#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ +#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ +#define CIFS_NO_RESP 0x040 /* no response buffer required */ + +/* Type of request operation */ +#define CIFS_ECHO_OP 0x080 /* echo request */ +#define CIFS_OBREAK_OP 0x0100 /* oplock break request */ +#define CIFS_NEG_OP 0x0200 /* negotiate request */ +#define CIFS_OP_MASK 0x0380 /* mask request type */ +#define CIFS_HAS_CREDITS 0x0400 /* already has credits */ + +/* Security Flags: indicate type of session setup needed */ +#define CIFSSEC_MAY_SIGN 0x00001 +#define CIFSSEC_MAY_NTLM 0x00002 +#define CIFSSEC_MAY_NTLMV2 0x00004 +#define CIFSSEC_MAY_KRB5 0x00008 +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFSSEC_MAY_LANMAN 0x00010 +#define CIFSSEC_MAY_PLNTXT 0x00020 +#else +#define CIFSSEC_MAY_LANMAN 0 +#define CIFSSEC_MAY_PLNTXT 0 +#endif /* weak passwords */ +#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ +#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */ + +#define CIFSSEC_MUST_SIGN 0x01001 +/* note that only one of the following can be set so the +result of setting MUST flags more than once will be to +require use of the stronger protocol */ +#define CIFSSEC_MUST_NTLM 0x02002 +#define CIFSSEC_MUST_NTLMV2 0x04004 +#define CIFSSEC_MUST_KRB5 0x08008 +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFSSEC_MUST_LANMAN 0x10010 +#define CIFSSEC_MUST_PLNTXT 0x20020 +#ifdef CONFIG_CIFS_UPCALL +#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */ +#else +#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */ +#endif /* UPCALL */ +#else /* do not allow weak pw hash */ +#define CIFSSEC_MUST_LANMAN 0 +#define CIFSSEC_MUST_PLNTXT 0 +#ifdef CONFIG_CIFS_UPCALL +#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ +#else +#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */ +#endif /* UPCALL */ +#endif /* WEAK_PW_HASH */ +#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ +#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ + +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) +#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) +#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) +/* + ***************************************************************** + * All constants go here + ***************************************************************** + */ + +#define UID_HASH (16) + +/* + * Note that ONE module should define _DECLARE_GLOBALS_HERE to cause the + * following to be declared. + */ + +/**************************************************************************** + * Locking notes. All updates to global variables and lists should be + * protected by spinlocks or semaphores. + * + * Spinlocks + * --------- + * GlobalMid_Lock protects: + * list operations on pending_mid_q and oplockQ + * updates to XID counters, multiplex id and SMB sequence numbers + * cifs_file_list_lock protects: + * list operations on tcp and SMB session lists and tCon lists + * f_owner.lock protects certain per file struct operations + * mapping->page_lock protects certain per page operations + * + * Semaphores + * ---------- + * sesSem operations on smb session + * tconSem operations on tree connection + * fh_sem file handle reconnection operations + * + ****************************************************************************/ + +#ifdef DECLARE_GLOBALS_HERE +#define GLOBAL_EXTERN +#else +#define GLOBAL_EXTERN extern +#endif + +/* + * the list of TCP_Server_Info structures, ie each of the sockets + * connecting our client to a distinct server (ip address), is + * chained together by cifs_tcp_ses_list. The list of all our SMB + * sessions (and from that the tree connections) can be found + * by iterating over cifs_tcp_ses_list + */ +GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; + +/* + * This lock protects the cifs_tcp_ses_list, the list of smb sessions per + * tcp session, and the list of tcon's per smb session. It also protects + * the reference counters for the server, smb session, and tcon. Finally, + * changes to the tcon->tidStatus should be done while holding this lock. + */ +GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; + +/* + * This lock protects the cifs_file->llist and cifs_file->flist + * list operations, and updates to some flags (cifs_file->invalidHandle) + * It will be moved to either use the tcon->stat_lock or equivalent later. + * If cifs_tcp_ses_lock and the lock below are both needed to be held, then + * the cifs_tcp_ses_lock must be grabbed first and released last. + */ +GLOBAL_EXTERN spinlock_t cifs_file_list_lock; + +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ +/* Outstanding dir notify requests */ +GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; +/* DirNotify response queue */ +GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ + +/* + * Global transaction id (XID) information + */ +GLOBAL_EXTERN unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ +GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ +GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ + /* on midQ entries */ +/* + * Global counters, updated atomically + */ +GLOBAL_EXTERN atomic_t sesInfoAllocCount; +GLOBAL_EXTERN atomic_t tconInfoAllocCount; +GLOBAL_EXTERN atomic_t tcpSesAllocCount; +GLOBAL_EXTERN atomic_t tcpSesReconnectCount; +GLOBAL_EXTERN atomic_t tconInfoReconnectCount; + +/* Various Debug counters */ +GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ +#ifdef CONFIG_CIFS_STATS2 +GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ +GLOBAL_EXTERN atomic_t totSmBufAllocCount; +#endif +GLOBAL_EXTERN atomic_t smBufAllocCount; +GLOBAL_EXTERN atomic_t midCount; + +/* Misc globals */ +GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */ +GLOBAL_EXTERN unsigned int lookupCacheEnabled; +GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent + with more secure ntlmssp2 challenge/resp */ +GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ +GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ +GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ +GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ +GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ + +#ifdef CONFIG_CIFS_ACL +GLOBAL_EXTERN struct rb_root uidtree; +GLOBAL_EXTERN struct rb_root gidtree; +GLOBAL_EXTERN spinlock_t siduidlock; +GLOBAL_EXTERN spinlock_t sidgidlock; +GLOBAL_EXTERN struct rb_root siduidtree; +GLOBAL_EXTERN struct rb_root sidgidtree; +GLOBAL_EXTERN spinlock_t uidsidlock; +GLOBAL_EXTERN spinlock_t gidsidlock; +#endif /* CONFIG_CIFS_ACL */ + +void cifs_oplock_break(struct work_struct *work); + +extern const struct slow_work_ops cifs_oplock_break_ops; +extern struct workqueue_struct *cifsiod_wq; + +extern mempool_t *cifs_mid_poolp; + +/* Operations for different SMB versions */ +#define SMB1_VERSION_STRING "1.0" +extern struct smb_version_operations smb1_operations; +extern struct smb_version_values smb1_values; +#define SMB20_VERSION_STRING "2.0" +extern struct smb_version_operations smb20_operations; +extern struct smb_version_values smb20_values; +#define SMB21_VERSION_STRING "2.1" +extern struct smb_version_operations smb21_operations; +extern struct smb_version_values smb21_values; +#define SMB30_VERSION_STRING "3.0" +extern struct smb_version_operations smb30_operations; +extern struct smb_version_values smb30_values; +#define SMB302_VERSION_STRING "3.02" +/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ +extern struct smb_version_values smb302_values; +#endif /* _CIFS_GLOB_H */ diff --git a/kernel/fs/cifs/cifspdu.h b/kernel/fs/cifs/cifspdu.h new file mode 100644 index 000000000..5f9822ac0 --- /dev/null +++ b/kernel/fs/cifs/cifspdu.h @@ -0,0 +1,2739 @@ +/* + * fs/cifs/cifspdu.h + * + * Copyright (c) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSPDU_H +#define _CIFSPDU_H + +#include +#include +#include "smbfsctl.h" + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define LANMAN_PROT 0 +#define LANMAN2_PROT 1 +#define CIFS_PROT 2 +#else +#define CIFS_PROT 0 +#endif +#define POSIX_PROT (CIFS_PROT+1) +#define BAD_PROT 0xFFFF + +/* SMB command codes: + * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses + * (ie which include no useful data other than the SMB error code itself). + * This can allow us to avoid response buffer allocations and copy in some cases + */ +#define SMB_COM_CREATE_DIRECTORY 0x00 /* trivial response */ +#define SMB_COM_DELETE_DIRECTORY 0x01 /* trivial response */ +#define SMB_COM_CLOSE 0x04 /* triv req/rsp, timestamp ignored */ +#define SMB_COM_FLUSH 0x05 /* triv req/rsp */ +#define SMB_COM_DELETE 0x06 /* trivial response */ +#define SMB_COM_RENAME 0x07 /* trivial response */ +#define SMB_COM_QUERY_INFORMATION 0x08 /* aka getattr */ +#define SMB_COM_SETATTR 0x09 /* trivial response */ +#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ +#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ +#define SMB_COM_ECHO 0x2B /* echo request */ +#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ +#define SMB_COM_READ_ANDX 0x2E +#define SMB_COM_WRITE_ANDX 0x2F +#define SMB_COM_TRANSACTION2 0x32 +#define SMB_COM_TRANSACTION2_SECONDARY 0x33 +#define SMB_COM_FIND_CLOSE2 0x34 /* trivial response */ +#define SMB_COM_TREE_DISCONNECT 0x71 /* trivial response */ +#define SMB_COM_NEGOTIATE 0x72 +#define SMB_COM_SESSION_SETUP_ANDX 0x73 +#define SMB_COM_LOGOFF_ANDX 0x74 /* trivial response */ +#define SMB_COM_TREE_CONNECT_ANDX 0x75 +#define SMB_COM_NT_TRANSACT 0xA0 +#define SMB_COM_NT_TRANSACT_SECONDARY 0xA1 +#define SMB_COM_NT_CREATE_ANDX 0xA2 +#define SMB_COM_NT_CANCEL 0xA4 /* no response */ +#define SMB_COM_NT_RENAME 0xA5 /* trivial response */ + +/* Transact2 subcommand codes */ +#define TRANS2_OPEN 0x00 +#define TRANS2_FIND_FIRST 0x01 +#define TRANS2_FIND_NEXT 0x02 +#define TRANS2_QUERY_FS_INFORMATION 0x03 +#define TRANS2_SET_FS_INFORMATION 0x04 +#define TRANS2_QUERY_PATH_INFORMATION 0x05 +#define TRANS2_SET_PATH_INFORMATION 0x06 +#define TRANS2_QUERY_FILE_INFORMATION 0x07 +#define TRANS2_SET_FILE_INFORMATION 0x08 +#define TRANS2_GET_DFS_REFERRAL 0x10 +#define TRANS2_REPORT_DFS_INCOSISTENCY 0x11 + +/* SMB Transact (Named Pipe) subcommand codes */ +#define TRANS_SET_NMPIPE_STATE 0x0001 +#define TRANS_RAW_READ_NMPIPE 0x0011 +#define TRANS_QUERY_NMPIPE_STATE 0x0021 +#define TRANS_QUERY_NMPIPE_INFO 0x0022 +#define TRANS_PEEK_NMPIPE 0x0023 +#define TRANS_TRANSACT_NMPIPE 0x0026 +#define TRANS_RAW_WRITE_NMPIPE 0x0031 +#define TRANS_READ_NMPIPE 0x0036 +#define TRANS_WRITE_NMPIPE 0x0037 +#define TRANS_WAIT_NMPIPE 0x0053 +#define TRANS_CALL_NMPIPE 0x0054 + +/* NT Transact subcommand codes */ +#define NT_TRANSACT_CREATE 0x01 +#define NT_TRANSACT_IOCTL 0x02 +#define NT_TRANSACT_SET_SECURITY_DESC 0x03 +#define NT_TRANSACT_NOTIFY_CHANGE 0x04 +#define NT_TRANSACT_RENAME 0x05 +#define NT_TRANSACT_QUERY_SECURITY_DESC 0x06 +#define NT_TRANSACT_GET_USER_QUOTA 0x07 +#define NT_TRANSACT_SET_USER_QUOTA 0x08 + +#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ +/* future chained NTCreateXReadX bigger, but for time being NTCreateX biggest */ +/* among the requests (NTCreateX response is bigger with wct of 34) */ +#define MAX_CIFS_HDR_SIZE 0x58 /* 4 len + 32 hdr + (2*24 wct) + 2 bct + 2 pad */ +#define CIFS_SMALL_PATH 120 /* allows for (448-88)/3 */ + +/* internal cifs vfs structures */ +/***************************************************************** + * All constants go here + ***************************************************************** + */ + +/* + * Starting value for maximum SMB size negotiation + */ +#define CIFS_MAX_MSGSIZE (4*4096) + +/* + * Size of encrypted user password in bytes + */ +#define CIFS_ENCPWD_SIZE (16) + +/* + * Size of the crypto key returned on the negotiate SMB in bytes + */ +#define CIFS_CRYPTO_KEY_SIZE (8) + +/* + * Size of the ntlm client response + */ +#define CIFS_AUTH_RESP_SIZE (24) + +/* + * Size of the session key (crypto key encrypted with the password + */ +#define CIFS_SESS_KEY_SIZE (16) + +/* + * Size of the smb3 signing key + */ +#define SMB3_SIGN_KEY_SIZE (16) + +#define CIFS_CLIENT_CHALLENGE_SIZE (8) +#define CIFS_SERVER_CHALLENGE_SIZE (8) +#define CIFS_HMAC_MD5_HASH_SIZE (16) +#define CIFS_CPHTXT_SIZE (16) +#define CIFS_NTHASH_SIZE (16) + +/* + * Maximum user name length + */ +#define CIFS_UNLEN (20) + +/* + * Flags on SMB open + */ +#define SMBOPEN_WRITE_THROUGH 0x4000 +#define SMBOPEN_DENY_ALL 0x0010 +#define SMBOPEN_DENY_WRITE 0x0020 +#define SMBOPEN_DENY_READ 0x0030 +#define SMBOPEN_DENY_NONE 0x0040 +#define SMBOPEN_READ 0x0000 +#define SMBOPEN_WRITE 0x0001 +#define SMBOPEN_READWRITE 0x0002 +#define SMBOPEN_EXECUTE 0x0003 + +#define SMBOPEN_OCREATE 0x0010 +#define SMBOPEN_OTRUNC 0x0002 +#define SMBOPEN_OAPPEND 0x0001 + +/* + * SMB flag definitions + */ +#define SMBFLG_EXTD_LOCK 0x01 /* server supports lock-read write-unlock smb */ +#define SMBFLG_RCV_POSTED 0x02 /* obsolete */ +#define SMBFLG_RSVD 0x04 +#define SMBFLG_CASELESS 0x08 /* all pathnames treated as caseless (off + implies case sensitive file handling request) */ +#define SMBFLG_CANONICAL_PATH_FORMAT 0x10 /* obsolete */ +#define SMBFLG_OLD_OPLOCK 0x20 /* obsolete */ +#define SMBFLG_OLD_OPLOCK_NOTIFY 0x40 /* obsolete */ +#define SMBFLG_RESPONSE 0x80 /* this PDU is a response from server */ + +/* + * SMB flag2 definitions + */ +#define SMBFLG2_KNOWS_LONG_NAMES cpu_to_le16(1) /* can send long (non-8.3) + path names in response */ +#define SMBFLG2_KNOWS_EAS cpu_to_le16(2) +#define SMBFLG2_SECURITY_SIGNATURE cpu_to_le16(4) +#define SMBFLG2_COMPRESSED (8) +#define SMBFLG2_SECURITY_SIGNATURE_REQUIRED (0x10) +#define SMBFLG2_IS_LONG_NAME cpu_to_le16(0x40) +#define SMBFLG2_REPARSE_PATH (0x400) +#define SMBFLG2_EXT_SEC cpu_to_le16(0x800) +#define SMBFLG2_DFS cpu_to_le16(0x1000) +#define SMBFLG2_PAGING_IO cpu_to_le16(0x2000) +#define SMBFLG2_ERR_STATUS cpu_to_le16(0x4000) +#define SMBFLG2_UNICODE cpu_to_le16(0x8000) + +/* + * These are the file access permission bits defined in CIFS for the + * NTCreateAndX as well as the level 0x107 + * TRANS2_QUERY_PATH_INFORMATION API. The level 0x107, SMB_QUERY_FILE_ALL_INFO + * responds with the AccessFlags. + * The AccessFlags specifies the access permissions a caller has to the + * file and can have any suitable combination of the following values: + */ + +#define FILE_READ_DATA 0x00000001 /* Data can be read from the file */ +#define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */ +#define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */ +#define FILE_READ_EA 0x00000008 /* Extended attributes associated */ + /* with the file can be read */ +#define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */ + /* with the file can be written */ +#define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */ + /* the file using system paging I/O */ +#define FILE_DELETE_CHILD 0x00000040 +#define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */ + /* file can be read */ +#define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */ + /* file can be written */ +#define DELETE 0x00010000 /* The file can be deleted */ +#define READ_CONTROL 0x00020000 /* The access control list and */ + /* ownership associated with the */ + /* file can be read */ +#define WRITE_DAC 0x00040000 /* The access control list and */ + /* ownership associated with the */ + /* file can be written. */ +#define WRITE_OWNER 0x00080000 /* Ownership information associated */ + /* with the file can be written */ +#define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */ + /* synchronize with the completion */ + /* of an input/output request */ +#define GENERIC_ALL 0x10000000 +#define GENERIC_EXECUTE 0x20000000 +#define GENERIC_WRITE 0x40000000 +#define GENERIC_READ 0x80000000 + /* In summary - Relevant file */ + /* access flags from CIFS are */ + /* file_read_data, file_write_data */ + /* file_execute, file_read_attributes*/ + /* write_dac, and delete. */ + +#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES) +#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES) +#define FILE_EXEC_RIGHTS (FILE_EXECUTE) + +#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_WRITE_EA \ + | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) +#define SET_FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_READ_EA | FILE_WRITE_EA \ + | FILE_DELETE_CHILD | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) +#define SET_FILE_EXEC_RIGHTS (FILE_READ_EA | FILE_WRITE_EA | FILE_EXECUTE \ + | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) + +#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \ + | READ_CONTROL | SYNCHRONIZE) + + +/* + * Invalid readdir handle + */ +#define CIFS_NO_HANDLE 0xFFFF + +#define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL + +/* IPC$ in ASCII */ +#define CIFS_IPC_RESOURCE "\x49\x50\x43\x24" + +/* IPC$ in Unicode */ +#define CIFS_IPC_UNICODE_RESOURCE "\x00\x49\x00\x50\x00\x43\x00\x24\x00\x00" + +/* Unicode Null terminate 2 bytes of 0 */ +#define UNICODE_NULL "\x00\x00" +#define ASCII_NULL 0x00 + +/* + * Server type values (returned on EnumServer API + */ +#define CIFS_SV_TYPE_DC 0x00000008 +#define CIFS_SV_TYPE_BACKDC 0x00000010 + +/* + * Alias type flags (From EnumAlias API call + */ +#define CIFS_ALIAS_TYPE_FILE 0x0001 +#define CIFS_SHARE_TYPE_FILE 0x0000 + +/* + * File Attribute flags + */ +#define ATTR_READONLY 0x0001 +#define ATTR_HIDDEN 0x0002 +#define ATTR_SYSTEM 0x0004 +#define ATTR_VOLUME 0x0008 +#define ATTR_DIRECTORY 0x0010 +#define ATTR_ARCHIVE 0x0020 +#define ATTR_DEVICE 0x0040 +#define ATTR_NORMAL 0x0080 +#define ATTR_TEMPORARY 0x0100 +#define ATTR_SPARSE 0x0200 +#define ATTR_REPARSE 0x0400 +#define ATTR_COMPRESSED 0x0800 +#define ATTR_OFFLINE 0x1000 /* ie file not immediately available - + on offline storage */ +#define ATTR_NOT_CONTENT_INDEXED 0x2000 +#define ATTR_ENCRYPTED 0x4000 +#define ATTR_POSIX_SEMANTICS 0x01000000 +#define ATTR_BACKUP_SEMANTICS 0x02000000 +#define ATTR_DELETE_ON_CLOSE 0x04000000 +#define ATTR_SEQUENTIAL_SCAN 0x08000000 +#define ATTR_RANDOM_ACCESS 0x10000000 +#define ATTR_NO_BUFFERING 0x20000000 +#define ATTR_WRITE_THROUGH 0x80000000 + +/* ShareAccess flags */ +#define FILE_NO_SHARE 0x00000000 +#define FILE_SHARE_READ 0x00000001 +#define FILE_SHARE_WRITE 0x00000002 +#define FILE_SHARE_DELETE 0x00000004 +#define FILE_SHARE_ALL 0x00000007 + +/* CreateDisposition flags, similar to CreateAction as well */ +#define FILE_SUPERSEDE 0x00000000 +#define FILE_OPEN 0x00000001 +#define FILE_CREATE 0x00000002 +#define FILE_OPEN_IF 0x00000003 +#define FILE_OVERWRITE 0x00000004 +#define FILE_OVERWRITE_IF 0x00000005 + +/* CreateOptions */ +#define CREATE_NOT_FILE 0x00000001 /* if set must not be file */ +#define CREATE_WRITE_THROUGH 0x00000002 +#define CREATE_SEQUENTIAL 0x00000004 +#define CREATE_NO_BUFFER 0x00000008 /* should not buffer on srv */ +#define CREATE_SYNC_ALERT 0x00000010 /* MBZ */ +#define CREATE_ASYNC_ALERT 0x00000020 /* MBZ */ +#define CREATE_NOT_DIR 0x00000040 /* if set must not be directory */ +#define CREATE_TREE_CONNECTION 0x00000080 /* should be zero */ +#define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ +#define CREATE_NO_EA_KNOWLEDGE 0x00000200 +#define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete + "open for recovery" flag should + be zero in any case */ +#define CREATE_OPEN_FOR_RECOVERY 0x00000400 +#define CREATE_RANDOM_ACCESS 0x00000800 +#define CREATE_DELETE_ON_CLOSE 0x00001000 +#define CREATE_OPEN_BY_ID 0x00002000 +#define CREATE_OPEN_BACKUP_INTENT 0x00004000 +#define CREATE_NO_COMPRESSION 0x00008000 +#define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */ +#define OPEN_REPARSE_POINT 0x00200000 +#define OPEN_NO_RECALL 0x00400000 +#define OPEN_FREE_SPACE_QUERY 0x00800000 /* should be zero */ +#define CREATE_OPTIONS_MASK 0x007FFFFF +#define CREATE_OPTION_READONLY 0x10000000 +#define CREATE_OPTION_SPECIAL 0x20000000 /* system. NB not sent over wire */ + +/* ImpersonationLevel flags */ +#define SECURITY_ANONYMOUS 0 +#define SECURITY_IDENTIFICATION 1 +#define SECURITY_IMPERSONATION 2 +#define SECURITY_DELEGATION 3 + +/* SecurityFlags */ +#define SECURITY_CONTEXT_TRACKING 0x01 +#define SECURITY_EFFECTIVE_ONLY 0x02 + +/* + * Default PID value, used in all SMBs where the PID is not important + */ +#define CIFS_DFT_PID 0x1234 + +/* + * We use the same routine for Copy and Move SMBs. This flag is used to + * distinguish + */ +#define CIFS_COPY_OP 1 +#define CIFS_RENAME_OP 2 + +#define GETU16(var) (*((__u16 *)var)) /* BB check for endian issues */ +#define GETU32(var) (*((__u32 *)var)) /* BB check for endian issues */ + +struct smb_hdr { + __be32 smb_buf_length; /* BB length is only two (rarely three) bytes, + with one or two byte "type" preceding it that will be + zero - we could mask the type byte off */ + __u8 Protocol[4]; + __u8 Command; + union { + struct { + __u8 ErrorClass; + __u8 Reserved; + __le16 Error; + } __attribute__((packed)) DosError; + __le32 CifsError; + } __attribute__((packed)) Status; + __u8 Flags; + __le16 Flags2; /* note: le */ + __le16 PidHigh; + union { + struct { + __le32 SequenceNumber; /* le */ + __u32 Reserved; /* zero */ + } __attribute__((packed)) Sequence; + __u8 SecuritySignature[8]; /* le */ + } __attribute__((packed)) Signature; + __u8 pad[2]; + __u16 Tid; + __le16 Pid; + __u16 Uid; + __le16 Mid; + __u8 WordCount; +} __attribute__((packed)); + +/* given a pointer to an smb_hdr, retrieve a void pointer to the ByteCount */ +static inline void * +BCC(struct smb_hdr *smb) +{ + return (void *)smb + sizeof(*smb) + 2 * smb->WordCount; +} + +/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ +#define pByteArea(smb_var) (BCC(smb_var) + 2) + +/* get the unconverted ByteCount for a SMB packet and return it */ +static inline __u16 +get_bcc(struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + return get_unaligned_le16(bc_ptr); +} + +/* set the ByteCount for a SMB packet in little-endian */ +static inline void +put_bcc(__u16 count, struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + put_unaligned_le16(count, bc_ptr); +} + +/* + * Computer Name Length (since Netbios name was length 16 with last byte 0x20) + * No longer as important, now that TCP names are more commonly used to + * resolve hosts. + */ +#define CNLEN 15 + +/* + * Share Name Length (SNLEN) + * Note: This length was limited by the SMB used to get + * the Share info. NetShareEnum only returned 13 + * chars, including the null termination. + * This was removed because it no longer is limiting. + */ + +/* + * Comment Length + */ +#define MAXCOMMENTLEN 40 + +/* + * The OS/2 maximum path name + */ +#define MAX_PATHCONF 256 + +/* + * SMB frame definitions (following must be packed structs) + * See the SNIA CIFS Specification for details. + * + * The Naming convention is the lower case version of the + * smb command code name for the struct and this is typedef to the + * uppercase version of the same name with the prefix SMB_ removed + * for brevity. Although typedefs are not commonly used for + * structure definitions in the Linux kernel, their use in the + * CIFS standards document, which this code is based on, may + * make this one of the cases where typedefs for structures make + * sense to improve readability for readers of the standards doc. + * Typedefs can always be removed later if they are too distracting + * and they are only used for the CIFSs PDUs themselves, not + * internal cifs vfs structures + * + */ + +typedef struct negotiate_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + unsigned char DialectsArray[1]; +} __attribute__((packed)) NEGOTIATE_REQ; + +/* Dialect index is 13 for LANMAN */ + +#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */ + +typedef struct lanman_neg_rsp { + struct smb_hdr hdr; /* wct = 13 */ + __le16 DialectIndex; + __le16 SecurityMode; + __le16 MaxBufSize; + __le16 MaxMpxCount; + __le16 MaxNumberVcs; + __le16 RawMode; + __le32 SessionKey; + struct { + __le16 Time; + __le16 Date; + } __attribute__((packed)) SrvTime; + __le16 ServerTimeZone; + __le16 EncryptionKeyLength; + __le16 Reserved; + __u16 ByteCount; + unsigned char EncryptionKey[1]; +} __attribute__((packed)) LANMAN_NEG_RSP; + +#define READ_RAW_ENABLE 1 +#define WRITE_RAW_ENABLE 2 +#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE) +#define SMB1_CLIENT_GUID_SIZE (16) +typedef struct negotiate_rsp { + struct smb_hdr hdr; /* wct = 17 */ + __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */ + __u8 SecurityMode; + __le16 MaxMpxCount; + __le16 MaxNumberVcs; + __le32 MaxBufferSize; + __le32 MaxRawSize; + __le32 SessionKey; + __le32 Capabilities; /* see below */ + __le32 SystemTimeLow; + __le32 SystemTimeHigh; + __le16 ServerTimeZone; + __u8 EncryptionKeyLength; + __u16 ByteCount; + union { + unsigned char EncryptionKey[1]; /* cap extended security off */ + /* followed by Domain name - if extended security is off */ + /* followed by 16 bytes of server GUID */ + /* then security blob if cap_extended_security negotiated */ + struct { + unsigned char GUID[SMB1_CLIENT_GUID_SIZE]; + unsigned char SecurityBlob[1]; + } __attribute__((packed)) extended_response; + } __attribute__((packed)) u; +} __attribute__((packed)) NEGOTIATE_RSP; + +/* SecurityMode bits */ +#define SECMODE_USER 0x01 /* off indicates share level security */ +#define SECMODE_PW_ENCRYPT 0x02 +#define SECMODE_SIGN_ENABLED 0x04 /* SMB security signatures enabled */ +#define SECMODE_SIGN_REQUIRED 0x08 /* SMB security signatures required */ + +/* Negotiate response Capabilities */ +#define CAP_RAW_MODE 0x00000001 +#define CAP_MPX_MODE 0x00000002 +#define CAP_UNICODE 0x00000004 +#define CAP_LARGE_FILES 0x00000008 +#define CAP_NT_SMBS 0x00000010 /* implies CAP_NT_FIND */ +#define CAP_RPC_REMOTE_APIS 0x00000020 +#define CAP_STATUS32 0x00000040 +#define CAP_LEVEL_II_OPLOCKS 0x00000080 +#define CAP_LOCK_AND_READ 0x00000100 +#define CAP_NT_FIND 0x00000200 +#define CAP_DFS 0x00001000 +#define CAP_INFOLEVEL_PASSTHRU 0x00002000 +#define CAP_LARGE_READ_X 0x00004000 +#define CAP_LARGE_WRITE_X 0x00008000 +#define CAP_LWIO 0x00010000 /* support fctl_srv_req_resume_key */ +#define CAP_UNIX 0x00800000 +#define CAP_COMPRESSED_DATA 0x02000000 +#define CAP_DYNAMIC_REAUTH 0x20000000 +#define CAP_PERSISTENT_HANDLES 0x40000000 +#define CAP_EXTENDED_SECURITY 0x80000000 + +typedef union smb_com_session_setup_andx { + struct { /* request format */ + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 SecurityBlobLength; + __u32 Reserved; + __le32 Capabilities; /* see below */ + __le16 ByteCount; + unsigned char SecurityBlob[1]; /* followed by */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) req; /* NTLM request format (with + extended security */ + + struct { /* request format */ + struct smb_hdr hdr; /* wct = 13 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 CaseInsensitivePasswordLength; /* ASCII password len */ + __le16 CaseSensitivePasswordLength; /* Unicode password length*/ + __u32 Reserved; /* see below */ + __le32 Capabilities; + __le16 ByteCount; + unsigned char CaseInsensitivePassword[1]; /* followed by: */ + /* unsigned char * CaseSensitivePassword; */ + /* STRING AccountName */ + /* STRING PrimaryDomain */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) req_no_secext; /* NTLM request format (without + extended security */ + + struct { /* default (NTLM) response format */ + struct smb_hdr hdr; /* wct = 4 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Action; /* see below */ + __le16 SecurityBlobLength; + __u16 ByteCount; + unsigned char SecurityBlob[1]; /* followed by */ +/* unsigned char * NativeOS; */ +/* unsigned char * NativeLanMan; */ +/* unsigned char * PrimaryDomain; */ + } __attribute__((packed)) resp; /* NTLM response + (with or without extended sec) */ + + struct { /* request format */ + struct smb_hdr hdr; /* wct = 10 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 PasswordLength; + __u32 Reserved; /* encrypt key len and offset */ + __le16 ByteCount; + unsigned char AccountPassword[1]; /* followed by */ + /* STRING AccountName */ + /* STRING PrimaryDomain */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) old_req; /* pre-NTLM (LANMAN2.1) req format */ + + struct { /* default (NTLM) response format */ + struct smb_hdr hdr; /* wct = 3 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Action; /* see below */ + __u16 ByteCount; + unsigned char NativeOS[1]; /* followed by */ +/* unsigned char * NativeLanMan; */ +/* unsigned char * PrimaryDomain; */ + } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */ +} __attribute__((packed)) SESSION_SETUP_ANDX; + +/* format of NLTMv2 Response ie "case sensitive password" hash when NTLMv2 */ + +#define NTLMSSP_SERVER_TYPE 1 +#define NTLMSSP_DOMAIN_TYPE 2 +#define NTLMSSP_FQ_DOMAIN_TYPE 3 +#define NTLMSSP_DNS_DOMAIN_TYPE 4 +#define NTLMSSP_DNS_PARENT_TYPE 5 + +struct ntlmssp2_name { + __le16 type; + __le16 length; +/* char name[length]; */ +} __attribute__((packed)); + +struct ntlmv2_resp { + union { + char ntlmv2_hash[CIFS_ENCPWD_SIZE]; + struct { + __u8 reserved[8]; + __u8 key[CIFS_SERVER_CHALLENGE_SIZE]; + } __attribute__((packed)) challenge; + } __attribute__((packed)); + __le32 blob_signature; + __u32 reserved; + __le64 time; + __u64 client_chal; /* random */ + __u32 reserved2; + /* array of name entries could follow ending in minimum 4 byte struct */ +} __attribute__((packed)); + + +#define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux" + +/* Capabilities bits (for NTLM SessSetup request) */ +#define CAP_UNICODE 0x00000004 +#define CAP_LARGE_FILES 0x00000008 +#define CAP_NT_SMBS 0x00000010 +#define CAP_STATUS32 0x00000040 +#define CAP_LEVEL_II_OPLOCKS 0x00000080 +#define CAP_NT_FIND 0x00000200 /* reserved should be zero + (because NT_SMBs implies the same thing?) */ +#define CAP_BULK_TRANSFER 0x20000000 +#define CAP_EXTENDED_SECURITY 0x80000000 + +/* Action bits */ +#define GUEST_LOGIN 1 + +typedef struct smb_com_tconx_req { + struct smb_hdr hdr; /* wct = 4 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Flags; /* see below */ + __le16 PasswordLength; + __le16 ByteCount; + unsigned char Password[1]; /* followed by */ +/* STRING Path *//* \\server\share name */ + /* STRING Service */ +} __attribute__((packed)) TCONX_REQ; + +typedef struct smb_com_tconx_rsp { + struct smb_hdr hdr; /* wct = 3 , not extended response */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OptionalSupport; /* see below */ + __u16 ByteCount; + unsigned char Service[1]; /* always ASCII, not Unicode */ + /* STRING NativeFileSystem */ +} __attribute__((packed)) TCONX_RSP; + +typedef struct smb_com_tconx_rsp_ext { + struct smb_hdr hdr; /* wct = 7, extended response */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OptionalSupport; /* see below */ + __le32 MaximalShareAccessRights; + __le32 GuestMaximalShareAccessRights; + __u16 ByteCount; + unsigned char Service[1]; /* always ASCII, not Unicode */ + /* STRING NativeFileSystem */ +} __attribute__((packed)) TCONX_RSP_EXT; + + +/* tree connect Flags */ +#define DISCONNECT_TID 0x0001 +#define TCON_EXTENDED_SIGNATURES 0x0004 +#define TCON_EXTENDED_SECINFO 0x0008 + +/* OptionalSupport bits */ +#define SMB_SUPPORT_SEARCH_BITS 0x0001 /* "must have" directory search bits + (exclusive searches supported) */ +#define SMB_SHARE_IS_IN_DFS 0x0002 +#define SMB_CSC_MASK 0x000C +/* CSC flags defined as follows */ +#define SMB_CSC_CACHE_MANUAL_REINT 0x0000 +#define SMB_CSC_CACHE_AUTO_REINT 0x0004 +#define SMB_CSC_CACHE_VDO 0x0008 +#define SMB_CSC_NO_CACHING 0x000C +#define SMB_UNIQUE_FILE_NAME 0x0010 +#define SMB_EXTENDED_SIGNATURES 0x0020 + +/* services + * + * A: ie disk + * LPT1: ie printer + * IPC ie named pipe + * COMM + * ????? ie any type + * + */ + +typedef struct smb_com_echo_req { + struct smb_hdr hdr; + __le16 EchoCount; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_REQ; + +typedef struct smb_com_echo_rsp { + struct smb_hdr hdr; + __le16 SequenceNumber; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_RSP; + +typedef struct smb_com_logoff_andx_req { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __u16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOGOFF_ANDX_REQ; + +typedef struct smb_com_logoff_andx_rsp { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __u16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOGOFF_ANDX_RSP; + +typedef union smb_com_tree_disconnect { /* as an altetnative can use flag on + tree_connect PDU to effect disconnect */ + /* tdis is probably simplest SMB PDU */ + struct { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bcc = 0 */ + } __attribute__((packed)) req; + struct { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bcc = 0 */ + } __attribute__((packed)) resp; +} __attribute__((packed)) TREE_DISCONNECT; + +typedef struct smb_com_close_req { + struct smb_hdr hdr; /* wct = 3 */ + __u16 FileID; + __u32 LastWriteTime; /* should be zero or -1 */ + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) CLOSE_REQ; + +typedef struct smb_com_close_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) CLOSE_RSP; + +typedef struct smb_com_flush_req { + struct smb_hdr hdr; /* wct = 1 */ + __u16 FileID; + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) FLUSH_REQ; + +typedef struct smb_com_findclose_req { + struct smb_hdr hdr; /* wct = 1 */ + __u16 FileID; + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) FINDCLOSE_REQ; + +/* OpenFlags */ +#define REQ_MORE_INFO 0x00000001 /* legacy (OPEN_AND_X) only */ +#define REQ_OPLOCK 0x00000002 +#define REQ_BATCHOPLOCK 0x00000004 +#define REQ_OPENDIRONLY 0x00000008 +#define REQ_EXTENDED_INFO 0x00000010 + +/* File type */ +#define DISK_TYPE 0x0000 +#define BYTE_PIPE_TYPE 0x0001 +#define MESSAGE_PIPE_TYPE 0x0002 +#define PRINTER_TYPE 0x0003 +#define COMM_DEV_TYPE 0x0004 +#define UNKNOWN_TYPE 0xFFFF + +/* Device Type or File Status Flags */ +#define NO_EAS 0x0001 +#define NO_SUBSTREAMS 0x0002 +#define NO_REPARSETAG 0x0004 +/* following flags can apply if pipe */ +#define ICOUNT_MASK 0x00FF +#define PIPE_READ_MODE 0x0100 +#define NAMED_PIPE_TYPE 0x0400 +#define PIPE_END_POINT 0x4000 +#define BLOCKING_NAMED_PIPE 0x8000 + +typedef struct smb_com_open_req { /* also handles create */ + struct smb_hdr hdr; /* wct = 24 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 Reserved; /* Must Be Zero */ + __le16 NameLength; + __le32 OpenFlags; + __u32 RootDirectoryFid; + __le32 DesiredAccess; + __le64 AllocationSize; + __le32 FileAttributes; + __le32 ShareAccess; + __le32 CreateDisposition; + __le32 CreateOptions; + __le32 ImpersonationLevel; + __u8 SecurityFlags; + __le16 ByteCount; + char fileName[1]; +} __attribute__((packed)) OPEN_REQ; + +/* open response: oplock levels */ +#define OPLOCK_NONE 0 +#define OPLOCK_EXCLUSIVE 1 +#define OPLOCK_BATCH 2 +#define OPLOCK_READ 3 /* level 2 oplock */ + +/* open response for CreateAction shifted left */ +#define CIFS_CREATE_ACTION 0x20000 /* file created */ + +typedef struct smb_com_open_rsp { + struct smb_hdr hdr; /* wct = 34 BB */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 OplockLevel; + __u16 Fid; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 FileAttributes; + __le64 AllocationSize; + __le64 EndOfFile; + __le16 FileType; + __le16 DeviceState; + __u8 DirectoryFlag; + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) OPEN_RSP; + +typedef struct smb_com_open_rsp_ext { + struct smb_hdr hdr; /* wct = 42 but meaningless due to MS bug? */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 OplockLevel; + __u16 Fid; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 FileAttributes; + __le64 AllocationSize; + __le64 EndOfFile; + __le16 FileType; + __le16 DeviceState; + __u8 DirectoryFlag; + __u8 VolumeGUID[16]; + __u64 FileId; /* note no endian conversion - is opaque UniqueID */ + __le32 MaximalAccessRights; + __le32 GuestMaximalAccessRights; + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) OPEN_RSP_EXT; + + +/* format of legacy open request */ +typedef struct smb_com_openx_req { + struct smb_hdr hdr; /* wct = 15 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OpenFlags; + __le16 Mode; + __le16 Sattr; /* search attributes */ + __le16 FileAttributes; /* dos attrs */ + __le32 CreateTime; /* os2 format */ + __le16 OpenFunction; + __le32 EndOfFile; + __le32 Timeout; + __le32 Reserved; + __le16 ByteCount; /* file name follows */ + char fileName[1]; +} __attribute__((packed)) OPENX_REQ; + +typedef struct smb_com_openx_rsp { + struct smb_hdr hdr; /* wct = 15 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le16 FileAttributes; + __le32 LastWriteTime; /* os2 format */ + __le32 EndOfFile; + __le16 Access; + __le16 FileType; + __le16 IPCState; + __le16 Action; + __u32 FileId; + __u16 Reserved; + __u16 ByteCount; +} __attribute__((packed)) OPENX_RSP; + +/* For encoding of POSIX Open Request - see trans2 function 0x209 data struct */ + +/* Legacy write request for older servers */ +typedef struct smb_com_writex_req { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __u32 Reserved; /* Timeout */ + __le16 WriteMode; /* 1 = write through */ + __le16 Remaining; + __le16 Reserved2; + __le16 DataLengthLow; + __le16 DataOffset; + __le16 ByteCount; + __u8 Pad; /* BB check for whether padded to DWORD + boundary and optimum performance here */ + char Data[0]; +} __attribute__((packed)) WRITEX_REQ; + +typedef struct smb_com_write_req { + struct smb_hdr hdr; /* wct = 14 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __u32 Reserved; + __le16 WriteMode; + __le16 Remaining; + __le16 DataLengthHigh; + __le16 DataLengthLow; + __le16 DataOffset; + __le32 OffsetHigh; + __le16 ByteCount; + __u8 Pad; /* BB check for whether padded to DWORD + boundary and optimum performance here */ + char Data[0]; +} __attribute__((packed)) WRITE_REQ; + +typedef struct smb_com_write_rsp { + struct smb_hdr hdr; /* wct = 6 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Count; + __le16 Remaining; + __le16 CountHigh; + __u16 Reserved; + __u16 ByteCount; +} __attribute__((packed)) WRITE_RSP; + +/* legacy read request for older servers */ +typedef struct smb_com_readx_req { + struct smb_hdr hdr; /* wct = 10 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __le16 MaxCount; + __le16 MinCount; /* obsolete */ + __le32 Reserved; + __le16 Remaining; + __le16 ByteCount; +} __attribute__((packed)) READX_REQ; + +typedef struct smb_com_read_req { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __le16 MaxCount; + __le16 MinCount; /* obsolete */ + __le32 MaxCountHigh; + __le16 Remaining; + __le32 OffsetHigh; + __le16 ByteCount; +} __attribute__((packed)) READ_REQ; + +typedef struct smb_com_read_rsp { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Remaining; + __le16 DataCompactionMode; + __le16 Reserved; + __le16 DataLength; + __le16 DataOffset; + __le16 DataLengthHigh; + __u64 Reserved2; + __u16 ByteCount; + /* read response data immediately follows */ +} __attribute__((packed)) READ_RSP; + +typedef struct locking_andx_range { + __le16 Pid; + __le16 Pad; + __le32 OffsetHigh; + __le32 OffsetLow; + __le32 LengthHigh; + __le32 LengthLow; +} __attribute__((packed)) LOCKING_ANDX_RANGE; + +#define LOCKING_ANDX_SHARED_LOCK 0x01 +#define LOCKING_ANDX_OPLOCK_RELEASE 0x02 +#define LOCKING_ANDX_CHANGE_LOCKTYPE 0x04 +#define LOCKING_ANDX_CANCEL_LOCK 0x08 +#define LOCKING_ANDX_LARGE_FILES 0x10 /* always on for us */ + +typedef struct smb_com_lock_req { + struct smb_hdr hdr; /* wct = 8 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __u8 LockType; + __u8 OplockLevel; + __le32 Timeout; + __le16 NumberOfUnlocks; + __le16 NumberOfLocks; + __le16 ByteCount; + LOCKING_ANDX_RANGE Locks[1]; +} __attribute__((packed)) LOCK_REQ; + +/* lock type */ +#define CIFS_RDLCK 0 +#define CIFS_WRLCK 1 +#define CIFS_UNLCK 2 +typedef struct cifs_posix_lock { + __le16 lock_type; /* 0 = Read, 1 = Write, 2 = Unlock */ + __le16 lock_flags; /* 1 = Wait (only valid for setlock) */ + __le32 pid; + __le64 start; + __le64 length; + /* BB what about additional owner info to identify network client */ +} __attribute__((packed)) CIFS_POSIX_LOCK; + +typedef struct smb_com_lock_rsp { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOCK_RSP; + +typedef struct smb_com_rename_req { + struct smb_hdr hdr; /* wct = 1 */ + __le16 SearchAttributes; /* target file attributes */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName */ +} __attribute__((packed)) RENAME_REQ; + + /* copy request flags */ +#define COPY_MUST_BE_FILE 0x0001 +#define COPY_MUST_BE_DIR 0x0002 +#define COPY_TARGET_MODE_ASCII 0x0004 /* if not set, binary */ +#define COPY_SOURCE_MODE_ASCII 0x0008 /* if not set, binary */ +#define COPY_VERIFY_WRITES 0x0010 +#define COPY_TREE 0x0020 + +typedef struct smb_com_copy_req { + struct smb_hdr hdr; /* wct = 3 */ + __u16 Tid2; + __le16 OpenFunction; + __le16 Flags; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName string */ +} __attribute__((packed)) COPY_REQ; + +typedef struct smb_com_copy_rsp { + struct smb_hdr hdr; /* wct = 1 */ + __le16 CopyCount; /* number of files copied */ + __u16 ByteCount; /* may be zero */ + __u8 BufferFormat; /* 0x04 - only present if errored file follows */ + unsigned char ErrorFileName[1]; /* only present if error in copy */ +} __attribute__((packed)) COPY_RSP; + +#define CREATE_HARD_LINK 0x103 +#define MOVEFILE_COPY_ALLOWED 0x0002 +#define MOVEFILE_REPLACE_EXISTING 0x0001 + +typedef struct smb_com_nt_rename_req { /* A5 - also used for create hardlink */ + struct smb_hdr hdr; /* wct = 4 */ + __le16 SearchAttributes; /* target file attributes */ + __le16 Flags; /* spec says Information Level */ + __le32 ClusterCount; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName */ +} __attribute__((packed)) NT_RENAME_REQ; + +typedef struct smb_com_rename_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) RENAME_RSP; + +typedef struct smb_com_delete_file_req { + struct smb_hdr hdr; /* wct = 1 */ + __le16 SearchAttributes; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char fileName[1]; +} __attribute__((packed)) DELETE_FILE_REQ; + +typedef struct smb_com_delete_file_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) DELETE_FILE_RSP; + +typedef struct smb_com_delete_directory_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char DirName[1]; +} __attribute__((packed)) DELETE_DIRECTORY_REQ; + +typedef struct smb_com_delete_directory_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) DELETE_DIRECTORY_RSP; + +typedef struct smb_com_create_directory_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char DirName[1]; +} __attribute__((packed)) CREATE_DIRECTORY_REQ; + +typedef struct smb_com_create_directory_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) CREATE_DIRECTORY_RSP; + +typedef struct smb_com_query_information_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; /* 1 + namelen + 1 */ + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char FileName[1]; +} __attribute__((packed)) QUERY_INFORMATION_REQ; + +typedef struct smb_com_query_information_rsp { + struct smb_hdr hdr; /* wct = 10 */ + __le16 attr; + __le32 last_write_time; + __le32 size; + __u16 reserved[5]; + __le16 ByteCount; /* bcc = 0 */ +} __attribute__((packed)) QUERY_INFORMATION_RSP; + +typedef struct smb_com_setattr_req { + struct smb_hdr hdr; /* wct = 8 */ + __le16 attr; + __le16 time_low; + __le16 time_high; + __le16 reserved[5]; /* must be zero */ + __u16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char fileName[1]; +} __attribute__((packed)) SETATTR_REQ; + +typedef struct smb_com_setattr_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) SETATTR_RSP; + +/* empty wct response to setattr */ + +/*******************************************************/ +/* NT Transact structure definitions follow */ +/* Currently only ioctl, acl (get security descriptor) */ +/* and notify are implemented */ +/*******************************************************/ +typedef struct smb_com_ntransact_req { + struct smb_hdr hdr; /* wct >= 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + /* SetupCount words follow then */ + __le16 ByteCount; + __u8 Pad[3]; + __u8 Parms[0]; +} __attribute__((packed)) NTRANSACT_REQ; + +typedef struct smb_com_ntransact_rsp { + struct smb_hdr hdr; /* wct = 18 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 0 */ + __u16 ByteCount; + /* __u8 Pad[3]; */ + /* parms and data follow */ +} __attribute__((packed)) NTRANSACT_RSP; + +/* See MS-SMB 2.2.7.2.1.1 */ +struct srv_copychunk { + __le64 SourceOffset; + __le64 DestinationOffset; + __le32 CopyLength; + __u32 Reserved; +} __packed; + +typedef struct smb_com_transaction_ioctl_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + __le32 FunctionCode; + __u16 Fid; + __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */ + __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ + __le16 ByteCount; + __u8 Pad[3]; + __u8 Data[1]; +} __attribute__((packed)) TRANSACT_IOCTL_REQ; + +typedef struct smb_com_transaction_compr_ioctl_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + __le32 FunctionCode; + __u16 Fid; + __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */ + __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ + __le16 ByteCount; + __u8 Pad[3]; + __le16 compression_state; /* See below for valid flags */ +} __attribute__((packed)) TRANSACT_COMPR_IOCTL_REQ; + +/* compression state flags */ +#define COMPRESSION_FORMAT_NONE 0x0000 +#define COMPRESSION_FORMAT_DEFAULT 0x0001 +#define COMPRESSION_FORMAT_LZNT1 0x0002 + +typedef struct smb_com_transaction_ioctl_rsp { + struct smb_hdr hdr; /* wct = 19 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 1 */ + __le16 ReturnedDataLen; + __u16 ByteCount; +} __attribute__((packed)) TRANSACT_IOCTL_RSP; + +#define CIFS_ACL_OWNER 1 +#define CIFS_ACL_GROUP 2 +#define CIFS_ACL_DACL 4 +#define CIFS_ACL_SACL 8 + +typedef struct smb_com_transaction_qsec_req { + struct smb_hdr hdr; /* wct = 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* no setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 6 = QUERY_SECURITY_DESC */ + __le16 ByteCount; /* bcc = 3 + 8 */ + __u8 Pad[3]; + __u16 Fid; + __u16 Reserved2; + __le32 AclFlags; +} __attribute__((packed)) QUERY_SEC_DESC_REQ; + + +typedef struct smb_com_transaction_ssec_req { + struct smb_hdr hdr; /* wct = 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* no setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 3 = SET_SECURITY_DESC */ + __le16 ByteCount; /* bcc = 3 + 8 */ + __u8 Pad[3]; + __u16 Fid; + __u16 Reserved2; + __le32 AclFlags; +} __attribute__((packed)) SET_SEC_DESC_REQ; + +typedef struct smb_com_transaction_change_notify_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand;/* 4 = Change Notify */ + __le32 CompletionFilter; /* operation to monitor */ + __u16 Fid; + __u8 WatchTree; /* 1 = Monitor subdirectories */ + __u8 Reserved2; + __le16 ByteCount; +/* __u8 Pad[3];*/ +/* __u8 Data[1];*/ +} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; + +/* BB eventually change to use generic ntransact rsp struct + and validation routine */ +typedef struct smb_com_transaction_change_notify_rsp { + struct smb_hdr hdr; /* wct = 18 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 0 */ + __u16 ByteCount; + /* __u8 Pad[3]; */ +} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_RSP; +/* Completion Filter flags for Notify */ +#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 +#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 +#define FILE_NOTIFY_CHANGE_NAME 0x00000003 +#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 +#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 +#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 +#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 +#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 +#define FILE_NOTIFY_CHANGE_EA 0x00000080 +#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 +#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 +#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 +#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 + +#define FILE_ACTION_ADDED 0x00000001 +#define FILE_ACTION_REMOVED 0x00000002 +#define FILE_ACTION_MODIFIED 0x00000003 +#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 +#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 +#define FILE_ACTION_ADDED_STREAM 0x00000006 +#define FILE_ACTION_REMOVED_STREAM 0x00000007 +#define FILE_ACTION_MODIFIED_STREAM 0x00000008 + +/* response contains array of the following structures */ +struct file_notify_information { + __le32 NextEntryOffset; + __le32 Action; + __le32 FileNameLength; + __u8 FileName[0]; +} __attribute__((packed)); + +/* For IO_REPARSE_TAG_SYMLINK */ +struct reparse_symlink_data { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + char PathBuffer[0]; +} __attribute__((packed)); + +/* For IO_REPARSE_TAG_NFS */ +#define NFS_SPECFILE_LNK 0x00000000014B4E4C +#define NFS_SPECFILE_CHR 0x0000000000524843 +#define NFS_SPECFILE_BLK 0x00000000004B4C42 +#define NFS_SPECFILE_FIFO 0x000000004F464946 +#define NFS_SPECFILE_SOCK 0x000000004B434F53 +struct reparse_posix_data { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le64 InodeType; /* LNK, FIFO, CHR etc. */ + char PathBuffer[0]; +} __attribute__((packed)); + +struct cifs_quota_data { + __u32 rsrvd1; /* 0 */ + __u32 sid_size; + __u64 rsrvd2; /* 0 */ + __u64 space_used; + __u64 soft_limit; + __u64 hard_limit; + char sid[1]; /* variable size? */ +} __attribute__((packed)); + +/* quota sub commands */ +#define QUOTA_LIST_CONTINUE 0 +#define QUOTA_LIST_START 0x100 +#define QUOTA_FOR_SID 0x101 + +struct trans2_req { + /* struct smb_hdr hdr precedes. Set wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* 1st setup word - SetupCount words follow */ + __le16 ByteCount; +} __attribute__((packed)); + +struct smb_t2_req { + struct smb_hdr hdr; + struct trans2_req t2_req; +} __attribute__((packed)); + +struct trans2_resp { + /* struct smb_hdr hdr precedes. Note wct = 10 + setup count */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __u16 Reserved; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 ParameterDisplacement; + __le16 DataCount; + __le16 DataOffset; + __le16 DataDisplacement; + __u8 SetupCount; + __u8 Reserved1; + /* SetupWords[SetupCount]; + __u16 ByteCount; + __u16 Reserved2;*/ + /* data area follows */ +} __attribute__((packed)); + +struct smb_t2_rsp { + struct smb_hdr hdr; + struct trans2_resp t2_rsp; +} __attribute__((packed)); + +/* PathInfo/FileInfo infolevels */ +#define SMB_INFO_STANDARD 1 +#define SMB_SET_FILE_EA 2 +#define SMB_QUERY_FILE_EA_SIZE 2 +#define SMB_INFO_QUERY_EAS_FROM_LIST 3 +#define SMB_INFO_QUERY_ALL_EAS 4 +#define SMB_INFO_IS_NAME_VALID 6 +#define SMB_QUERY_FILE_BASIC_INFO 0x101 +#define SMB_QUERY_FILE_STANDARD_INFO 0x102 +#define SMB_QUERY_FILE_EA_INFO 0x103 +#define SMB_QUERY_FILE_NAME_INFO 0x104 +#define SMB_QUERY_FILE_ALLOCATION_INFO 0x105 +#define SMB_QUERY_FILE_END_OF_FILEINFO 0x106 +#define SMB_QUERY_FILE_ALL_INFO 0x107 +#define SMB_QUERY_ALT_NAME_INFO 0x108 +#define SMB_QUERY_FILE_STREAM_INFO 0x109 +#define SMB_QUERY_FILE_COMPRESSION_INFO 0x10B +#define SMB_QUERY_FILE_UNIX_BASIC 0x200 +#define SMB_QUERY_FILE_UNIX_LINK 0x201 +#define SMB_QUERY_POSIX_ACL 0x204 +#define SMB_QUERY_XATTR 0x205 /* e.g. system EA name space */ +#define SMB_QUERY_ATTR_FLAGS 0x206 /* append,immutable etc. */ +#define SMB_QUERY_POSIX_PERMISSION 0x207 +#define SMB_QUERY_POSIX_LOCK 0x208 +/* #define SMB_POSIX_OPEN 0x209 */ +/* #define SMB_POSIX_UNLINK 0x20a */ +#define SMB_QUERY_FILE__UNIX_INFO2 0x20b +#define SMB_QUERY_FILE_INTERNAL_INFO 0x3ee +#define SMB_QUERY_FILE_ACCESS_INFO 0x3f0 +#define SMB_QUERY_FILE_NAME_INFO2 0x3f1 /* 0x30 bytes */ +#define SMB_QUERY_FILE_POSITION_INFO 0x3f6 +#define SMB_QUERY_FILE_MODE_INFO 0x3f8 +#define SMB_QUERY_FILE_ALGN_INFO 0x3f9 + + +#define SMB_SET_FILE_BASIC_INFO 0x101 +#define SMB_SET_FILE_DISPOSITION_INFO 0x102 +#define SMB_SET_FILE_ALLOCATION_INFO 0x103 +#define SMB_SET_FILE_END_OF_FILE_INFO 0x104 +#define SMB_SET_FILE_UNIX_BASIC 0x200 +#define SMB_SET_FILE_UNIX_LINK 0x201 +#define SMB_SET_FILE_UNIX_HLINK 0x203 +#define SMB_SET_POSIX_ACL 0x204 +#define SMB_SET_XATTR 0x205 +#define SMB_SET_ATTR_FLAGS 0x206 /* append, immutable etc. */ +#define SMB_SET_POSIX_LOCK 0x208 +#define SMB_POSIX_OPEN 0x209 +#define SMB_POSIX_UNLINK 0x20a +#define SMB_SET_FILE_UNIX_INFO2 0x20b +#define SMB_SET_FILE_BASIC_INFO2 0x3ec +#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */ +#define SMB_FILE_ALL_INFO2 0x3fa +#define SMB_SET_FILE_ALLOCATION_INFO2 0x3fb +#define SMB_SET_FILE_END_OF_FILE_INFO2 0x3fc +#define SMB_FILE_MOVE_CLUSTER_INFO 0x407 +#define SMB_FILE_QUOTA_INFO 0x408 +#define SMB_FILE_REPARSEPOINT_INFO 0x409 +#define SMB_FILE_MAXIMUM_INFO 0x40d + +/* Find File infolevels */ +#define SMB_FIND_FILE_INFO_STANDARD 0x001 +#define SMB_FIND_FILE_QUERY_EA_SIZE 0x002 +#define SMB_FIND_FILE_QUERY_EAS_FROM_LIST 0x003 +#define SMB_FIND_FILE_DIRECTORY_INFO 0x101 +#define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102 +#define SMB_FIND_FILE_NAMES_INFO 0x103 +#define SMB_FIND_FILE_BOTH_DIRECTORY_INFO 0x104 +#define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105 +#define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106 +#define SMB_FIND_FILE_UNIX 0x202 + +typedef struct smb_com_transaction2_qpi_req { + struct smb_hdr hdr; /* wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __le16 InformationLevel; + __u32 Reserved4; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_QPI_REQ; + +typedef struct smb_com_transaction2_qpi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word is present for infolevels > 100 */ +} __attribute__((packed)) TRANSACTION2_QPI_RSP; + +typedef struct smb_com_transaction2_spi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __u16 Pad1; + __le16 InformationLevel; + __u32 Reserved4; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_SPI_REQ; + +typedef struct smb_com_transaction2_spi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word is present for infolevels > 100 */ +} __attribute__((packed)) TRANSACTION2_SPI_RSP; + +struct set_file_rename { + __le32 overwrite; /* 1 = overwrite dest */ + __u32 root_fid; /* zero */ + __le32 target_name_len; + char target_name[0]; /* Must be unicode */ +} __attribute__((packed)); + +struct smb_com_transaction2_sfi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __u16 Pad1; + __u16 Fid; + __le16 InformationLevel; + __u16 Reserved4; +} __attribute__((packed)); + +struct smb_com_transaction2_sfi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word reserved - + present for infolevels > 100 */ +} __attribute__((packed)); + +struct smb_t2_qfi_req { + struct smb_hdr hdr; + struct trans2_req t2; + __u8 Pad; + __u16 Fid; + __le16 InformationLevel; +} __attribute__((packed)); + +struct smb_t2_qfi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word reserved - + present for infolevels > 100 */ +} __attribute__((packed)); + +/* + * Flags on T2 FINDFIRST and FINDNEXT + */ +#define CIFS_SEARCH_CLOSE_ALWAYS 0x0001 +#define CIFS_SEARCH_CLOSE_AT_END 0x0002 +#define CIFS_SEARCH_RETURN_RESUME 0x0004 +#define CIFS_SEARCH_CONTINUE_FROM_LAST 0x0008 +#define CIFS_SEARCH_BACKUP_SEARCH 0x0010 + +/* + * Size of the resume key on FINDFIRST and FINDNEXT calls + */ +#define CIFS_SMB_RESUME_KEY_SIZE 4 + +typedef struct smb_com_transaction2_ffirst_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_FIND_FIRST */ + __le16 ByteCount; + __u8 Pad; + __le16 SearchAttributes; + __le16 SearchCount; + __le16 SearchFlags; + __le16 InformationLevel; + __le32 SearchStorageType; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_FFIRST_REQ; + +typedef struct smb_com_transaction2_ffirst_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_FFIRST_RSP; + +typedef struct smb_com_transaction2_ffirst_rsp_parms { + __u16 SearchHandle; + __le16 SearchCount; + __le16 EndofSearch; + __le16 EAErrorOffset; + __le16 LastNameOffset; +} __attribute__((packed)) T2_FFIRST_RSP_PARMS; + +typedef struct smb_com_transaction2_fnext_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_FIND_NEXT */ + __le16 ByteCount; + __u8 Pad; + __u16 SearchHandle; + __le16 SearchCount; + __le16 InformationLevel; + __u32 ResumeKey; + __le16 SearchFlags; + char ResumeFileName[1]; +} __attribute__((packed)) TRANSACTION2_FNEXT_REQ; + +typedef struct smb_com_transaction2_fnext_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_FNEXT_RSP; + +typedef struct smb_com_transaction2_fnext_rsp_parms { + __le16 SearchCount; + __le16 EndofSearch; + __le16 EAErrorOffset; + __le16 LastNameOffset; +} __attribute__((packed)) T2_FNEXT_RSP_PARMS; + +/* QFSInfo Levels */ +#define SMB_INFO_ALLOCATION 1 +#define SMB_INFO_VOLUME 2 +#define SMB_QUERY_FS_VOLUME_INFO 0x102 +#define SMB_QUERY_FS_SIZE_INFO 0x103 +#define SMB_QUERY_FS_DEVICE_INFO 0x104 +#define SMB_QUERY_FS_ATTRIBUTE_INFO 0x105 +#define SMB_QUERY_CIFS_UNIX_INFO 0x200 +#define SMB_QUERY_POSIX_FS_INFO 0x201 +#define SMB_QUERY_POSIX_WHO_AM_I 0x202 +#define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 +#define SMB_QUERY_FS_PROXY 0x204 /* WAFS enabled. Returns structure + FILE_SYSTEM__UNIX_INFO to tell + whether new NTIOCTL available + (0xACE) for WAN friendly SMB + operations to be carried */ +#define SMB_QUERY_LABEL_INFO 0x3ea +#define SMB_QUERY_FS_QUOTA_INFO 0x3ee +#define SMB_QUERY_FS_FULL_SIZE_INFO 0x3ef +#define SMB_QUERY_OBJECTID_INFO 0x3f0 + +typedef struct smb_com_transaction2_qfsi_req { + struct smb_hdr hdr; /* wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __le16 InformationLevel; +} __attribute__((packed)) TRANSACTION2_QFSI_REQ; + +typedef struct smb_com_transaction_qfsi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u8 Pad; /* may be three bytes? *//* followed by data area */ +} __attribute__((packed)) TRANSACTION2_QFSI_RSP; + +typedef struct whoami_rsp_data { /* Query level 0x202 */ + __u32 flags; /* 0 = Authenticated user 1 = GUEST */ + __u32 mask; /* which flags bits server understands ie 0x0001 */ + __u64 unix_user_id; + __u64 unix_user_gid; + __u32 number_of_supplementary_gids; /* may be zero */ + __u32 number_of_sids; /* may be zero */ + __u32 length_of_sid_array; /* in bytes - may be zero */ + __u32 pad; /* reserved - MBZ */ + /* __u64 gid_array[0]; */ /* may be empty */ + /* __u8 * psid_list */ /* may be empty */ +} __attribute__((packed)) WHOAMI_RSP_DATA; + +/* SETFSInfo Levels */ +#define SMB_SET_CIFS_UNIX_INFO 0x200 +/* level 0x203 is defined above in list of QFS info levels */ +/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */ + +/* Level 0x200 request structure follows */ +typedef struct smb_com_transaction2_setfsi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 FileNum; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + __le16 ClientUnixMajor; /* Data start. */ + __le16 ClientUnixMinor; + __le64 ClientUnixCap; /* Data end */ +} __attribute__((packed)) TRANSACTION2_SETFSI_REQ; + +/* level 0x203 request structure follows */ +typedef struct smb_com_transaction2_setfs_enc_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 Reserved4; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + /* NTLMSSP Blob, Data start. */ +} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ; + +/* response for setfsinfo levels 0x200 and 0x203 */ +typedef struct smb_com_transaction2_setfsi_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_SETFSI_RSP; + +typedef struct smb_com_transaction2_get_dfs_refer_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad[3]; /* Win2K has sent 0x0F01 (max response length + perhaps?) followed by one byte pad - doesn't + seem to matter though */ + __le16 MaxReferralLevel; + char RequestFileName[1]; +} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ; + +#define DFS_VERSION cpu_to_le16(0x0003) + +/* DFS server target type */ +#define DFS_TYPE_LINK 0x0000 /* also for sysvol targets */ +#define DFS_TYPE_ROOT 0x0001 + +/* Referral Entry Flags */ +#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */ +#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */ + +typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */ + __le16 VersionNumber; /* must be 3 or 4 */ + __le16 Size; + __le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */ + __le16 ReferralEntryFlags; + __le32 TimeToLive; + __le16 DfsPathOffset; + __le16 DfsAlternatePathOffset; + __le16 NetworkAddressOffset; /* offset of the link target */ + __u8 ServiceSiteGuid[16]; /* MBZ, ignored */ +} __attribute__((packed)) REFERRAL3; + +typedef struct smb_com_transaction_get_dfs_refer_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; + __u8 Pad; + __le16 PathConsumed; + __le16 NumberOfReferrals; + __le32 DFSFlags; + REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */ + /* followed by the strings pointed to by the referral structures */ +} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP; + +/* DFS Flags */ +#define DFSREF_REFERRAL_SERVER 0x00000001 /* all targets are DFS roots */ +#define DFSREF_STORAGE_SERVER 0x00000002 /* no further ref requests needed */ +#define DFSREF_TARGET_FAILBACK 0x00000004 /* only for DFS referral version 4 */ + +/* + ************************************************************************ + * All structs for everything above the SMB PDUs themselves + * (such as the T2 level specific data) go here + ************************************************************************ + */ + +/* + * Information on a server + */ + +struct serverInfo { + char name[16]; + unsigned char versionMajor; + unsigned char versionMinor; + unsigned long type; + unsigned int commentOffset; +} __attribute__((packed)); + +/* + * The following structure is the format of the data returned on a NetShareEnum + * with level "90" (x5A) + */ + +struct shareInfo { + char shareName[13]; + char pad; + unsigned short type; + unsigned int commentOffset; +} __attribute__((packed)); + +struct aliasInfo { + char aliasName[9]; + char pad; + unsigned int commentOffset; + unsigned char type[2]; +} __attribute__((packed)); + +struct aliasInfo92 { + int aliasNameOffset; + int serverNameOffset; + int shareNameOffset; +} __attribute__((packed)); + +typedef struct { + __le64 TotalAllocationUnits; + __le64 FreeAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __attribute__((packed)) FILE_SYSTEM_INFO; /* size info, level 0x103 */ + +typedef struct { + __le32 fsid; + __le32 SectorsPerAllocationUnit; + __le32 TotalAllocationUnits; + __le32 FreeAllocationUnits; + __le16 BytesPerSector; +} __attribute__((packed)) FILE_SYSTEM_ALLOC_INFO; + +typedef struct { + __le16 MajorVersionNumber; + __le16 MinorVersionNumber; + __le64 Capability; +} __attribute__((packed)) FILE_SYSTEM_UNIX_INFO; /* Unix extension level 0x200*/ + +/* Version numbers for CIFS UNIX major and minor. */ +#define CIFS_UNIX_MAJOR_VERSION 1 +#define CIFS_UNIX_MINOR_VERSION 0 + +/* Linux/Unix extensions capability flags */ +#define CIFS_UNIX_FCNTL_CAP 0x00000001 /* support for fcntl locks */ +#define CIFS_UNIX_POSIX_ACL_CAP 0x00000002 /* support getfacl/setfacl */ +#define CIFS_UNIX_XATTR_CAP 0x00000004 /* support new namespace */ +#define CIFS_UNIX_EXTATTR_CAP 0x00000008 /* support chattr/chflag */ +#define CIFS_UNIX_POSIX_PATHNAMES_CAP 0x00000010 /* Allow POSIX path chars */ +#define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based + calls including posix open + and posix unlink */ +#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up + to 0xFFFF00 */ +#define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 +#define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ +#define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ +#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and + QFS PROXY call */ +#ifdef CONFIG_CIFS_POSIX +/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send + LockingX instead of posix locking call on unix sess (and we do not expect + LockingX to use different (ie Windows) semantics than posix locking on + the same session (if WINE needs to do this later, we can add this cap + back in later */ +/* #define CIFS_UNIX_CAP_MASK 0x000000fb */ +#define CIFS_UNIX_CAP_MASK 0x000003db +#else +#define CIFS_UNIX_CAP_MASK 0x00000013 +#endif /* CONFIG_CIFS_POSIX */ + + +#define CIFS_POSIX_EXTENSIONS 0x00000010 /* support for new QFSInfo */ + +typedef struct { + /* For undefined recommended transfer size return -1 in that field */ + __le32 OptimalTransferSize; /* bsize on some os, iosize on other os */ + __le32 BlockSize; + /* The next three fields are in terms of the block size. + (above). If block size is unknown, 4096 would be a + reasonable block size for a server to report. + Note that returning the blocks/blocksavail removes need + to make a second call (to QFSInfo level 0x103 to get this info. + UserBlockAvail is typically less than or equal to BlocksAvail, + if no distinction is made return the same value in each */ + __le64 TotalBlocks; + __le64 BlocksAvail; /* bfree */ + __le64 UserBlocksAvail; /* bavail */ + /* For undefined Node fields or FSID return -1 */ + __le64 TotalFileNodes; + __le64 FreeFileNodes; + __le64 FileSysIdentifier; /* fsid */ + /* NB Namelen comes from FILE_SYSTEM_ATTRIBUTE_INFO call */ + /* NB flags can come from FILE_SYSTEM_DEVICE_INFO call */ +} __attribute__((packed)) FILE_SYSTEM_POSIX_INFO; + +/* DeviceType Flags */ +#define FILE_DEVICE_CD_ROM 0x00000002 +#define FILE_DEVICE_CD_ROM_FILE_SYSTEM 0x00000003 +#define FILE_DEVICE_DFS 0x00000006 +#define FILE_DEVICE_DISK 0x00000007 +#define FILE_DEVICE_DISK_FILE_SYSTEM 0x00000008 +#define FILE_DEVICE_FILE_SYSTEM 0x00000009 +#define FILE_DEVICE_NAMED_PIPE 0x00000011 +#define FILE_DEVICE_NETWORK 0x00000012 +#define FILE_DEVICE_NETWORK_FILE_SYSTEM 0x00000014 +#define FILE_DEVICE_NULL 0x00000015 +#define FILE_DEVICE_PARALLEL_PORT 0x00000016 +#define FILE_DEVICE_PRINTER 0x00000018 +#define FILE_DEVICE_SERIAL_PORT 0x0000001b +#define FILE_DEVICE_STREAMS 0x0000001e +#define FILE_DEVICE_TAPE 0x0000001f +#define FILE_DEVICE_TAPE_FILE_SYSTEM 0x00000020 +#define FILE_DEVICE_VIRTUAL_DISK 0x00000024 +#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028 + +typedef struct { + __le32 DeviceType; + __le32 DeviceCharacteristics; +} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */ + +/* minimum includes first three fields, and empty FS Name */ +#define MIN_FS_ATTR_INFO_SIZE 12 + + +/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ +#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 +#define FILE_SUPPORTS_USN_JOURNAL 0x02000000 +#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 +#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000 +#define FILE_SUPPORTS_HARD_LINKS 0x00400000 +#define FILE_SUPPORTS_TRANSACTIONS 0x00200000 +#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000 +#define FILE_READ_ONLY_VOLUME 0x00080000 +#define FILE_NAMED_STREAMS 0x00040000 +#define FILE_SUPPORTS_ENCRYPTION 0x00020000 +#define FILE_SUPPORTS_OBJECT_IDS 0x00010000 +#define FILE_VOLUME_IS_COMPRESSED 0x00008000 +#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 +#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 +#define FILE_SUPPORTS_SPARSE_FILES 0x00000040 +#define FILE_VOLUME_QUOTAS 0x00000020 +#define FILE_FILE_COMPRESSION 0x00000010 +#define FILE_PERSISTENT_ACLS 0x00000008 +#define FILE_UNICODE_ON_DISK 0x00000004 +#define FILE_CASE_PRESERVED_NAMES 0x00000002 +#define FILE_CASE_SENSITIVE_SEARCH 0x00000001 +typedef struct { + __le32 Attributes; + __le32 MaxPathNameComponentLength; + __le32 FileSystemNameLen; + char FileSystemName[52]; /* do not have to save this - get subset? */ +} __attribute__((packed)) FILE_SYSTEM_ATTRIBUTE_INFO; + +/******************************************************************************/ +/* QueryFileInfo/QueryPathinfo (also for SetPath/SetFile) data buffer formats */ +/******************************************************************************/ +typedef struct { /* data block encoding of response to level 263 QPathInfo */ + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; + __le64 AllocationSize; + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad2; + __le64 IndexNumber; + __le32 EASize; + __le32 AccessFlags; + __u64 IndexNumber1; + __le64 CurrentByteOffset; + __le32 Mode; + __le32 AlignmentRequirement; + __le32 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ + +/* defines for enumerating possible values of the Unix type field below */ +#define UNIX_FILE 0 +#define UNIX_DIR 1 +#define UNIX_SYMLINK 2 +#define UNIX_CHARDEV 3 +#define UNIX_BLOCKDEV 4 +#define UNIX_FIFO 5 +#define UNIX_SOCKET 6 +typedef struct { + __le64 EndOfFile; + __le64 NumOfBytes; + __le64 LastStatusChange; /*SNIA specs DCE time for the 3 time fields */ + __le64 LastAccessTime; + __le64 LastModificationTime; + __le64 Uid; + __le64 Gid; + __le32 Type; + __le64 DevMajor; + __le64 DevMinor; + __le64 UniqueId; + __le64 Permissions; + __le64 Nlinks; +} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ + +typedef struct { + char LinkDest[1]; +} __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */ + +/* The following three structures are needed only for + setting time to NT4 and some older servers via + the primitive DOS time format */ +typedef struct { + __u16 Day:5; + __u16 Month:4; + __u16 Year:7; +} __attribute__((packed)) SMB_DATE; + +typedef struct { + __u16 TwoSeconds:5; + __u16 Minutes:6; + __u16 Hours:5; +} __attribute__((packed)) SMB_TIME; + +typedef struct { + __le16 CreationDate; /* SMB Date see above */ + __le16 CreationTime; /* SMB Time */ + __le16 LastAccessDate; + __le16 LastAccessTime; + __le16 LastWriteDate; + __le16 LastWriteTime; + __le32 DataSize; /* File Size (EOF) */ + __le32 AllocationSize; + __le16 Attributes; /* verify not u32 */ + __le32 EASize; +} __attribute__((packed)) FILE_INFO_STANDARD; /* level 1 SetPath/FileInfo */ + +typedef struct { + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad; +} __attribute__((packed)) FILE_BASIC_INFO; /* size info, level 0x101 */ + +struct file_allocation_info { + __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ +} __attribute__((packed)); /* size used on disk, for level 0x103 for set, + 0x105 for query */ + +struct file_end_of_file_info { + __le64 FileSize; /* offset to end of file */ +} __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */ + +struct file_alt_name_info { + __u8 alt_name[1]; +} __attribute__((packed)); /* level 0x0108 */ + +struct file_stream_info { + __le32 number_of_streams; /* BB check sizes and verify location */ + /* followed by info on streams themselves + u64 size; + u64 allocation_size + stream info */ +}; /* level 0x109 */ + +struct file_compression_info { + __le64 compressed_size; + __le16 format; + __u8 unit_shift; + __u8 ch_shift; + __u8 cl_shift; + __u8 pad[3]; +} __attribute__((packed)); /* level 0x10b */ + +/* POSIX ACL set/query path info structures */ +#define CIFS_ACL_VERSION 1 +struct cifs_posix_ace { /* access control entry (ACE) */ + __u8 cifs_e_tag; + __u8 cifs_e_perm; + __le64 cifs_uid; /* or gid */ +} __attribute__((packed)); + +struct cifs_posix_acl { /* access conrol list (ACL) */ + __le16 version; + __le16 access_entry_count; /* access ACL - count of entries */ + __le16 default_entry_count; /* default ACL - count of entries */ + struct cifs_posix_ace ace_array[0]; + /* followed by + struct cifs_posix_ace default_ace_arraay[] */ +} __attribute__((packed)); /* level 0x204 */ + +/* types of access control entries already defined in posix_acl.h */ +/* #define CIFS_POSIX_ACL_USER_OBJ 0x01 +#define CIFS_POSIX_ACL_USER 0x02 +#define CIFS_POSIX_ACL_GROUP_OBJ 0x04 +#define CIFS_POSIX_ACL_GROUP 0x08 +#define CIFS_POSIX_ACL_MASK 0x10 +#define CIFS_POSIX_ACL_OTHER 0x20 */ + +/* types of perms */ +/* #define CIFS_POSIX_ACL_EXECUTE 0x01 +#define CIFS_POSIX_ACL_WRITE 0x02 +#define CIFS_POSIX_ACL_READ 0x04 */ + +/* end of POSIX ACL definitions */ + +/* POSIX Open Flags */ +#define SMB_O_RDONLY 0x1 +#define SMB_O_WRONLY 0x2 +#define SMB_O_RDWR 0x4 +#define SMB_O_CREAT 0x10 +#define SMB_O_EXCL 0x20 +#define SMB_O_TRUNC 0x40 +#define SMB_O_APPEND 0x80 +#define SMB_O_SYNC 0x100 +#define SMB_O_DIRECTORY 0x200 +#define SMB_O_NOFOLLOW 0x400 +#define SMB_O_DIRECT 0x800 + +typedef struct { + __le32 OpenFlags; /* same as NT CreateX */ + __le32 PosixOpenFlags; + __le64 Permissions; + __le16 Level; /* reply level requested (see QPathInfo levels) */ +} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */ + +typedef struct { + __le16 OplockFlags; + __u16 Fid; + __le32 CreateAction; + __le16 ReturnedLevel; + __le16 Pad; + /* struct following varies based on requested level */ +} __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */ + +#define SMB_POSIX_UNLINK_FILE_TARGET 0 +#define SMB_POSIX_UNLINK_DIRECTORY_TARGET 1 + +struct unlink_psx_rq { /* level 0x20a SetPathInfo */ + __le16 type; +} __attribute__((packed)); + +struct file_internal_info { + __le64 UniqueId; /* inode number */ +} __attribute__((packed)); /* level 0x3ee */ + +struct file_mode_info { + __le32 Mode; +} __attribute__((packed)); /* level 0x3f8 */ + +struct file_attrib_tag { + __le32 Attribute; + __le32 ReparseTag; +} __attribute__((packed)); /* level 0x40b */ + + +/********************************************************/ +/* FindFirst/FindNext transact2 data buffer formats */ +/********************************************************/ + +typedef struct { + __le32 NextEntryOffset; + __u32 ResumeKey; /* as with FileIndex - no need to convert */ + FILE_UNIX_BASIC_INFO basic; + char FileName[1]; +} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* length of the xattrs */ + char FileName[1]; +} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* EA size */ + __le32 Reserved; + __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ + char FileName[1]; +} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* length of the xattrs */ + __u8 ShortNameLength; + __u8 Reserved; + __u8 ShortName[12]; + char FileName[1]; +} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ + +typedef struct { + __u32 ResumeKey; + __le16 CreationDate; /* SMB Date */ + __le16 CreationTime; /* SMB Time */ + __le16 LastAccessDate; + __le16 LastAccessTime; + __le16 LastWriteDate; + __le16 LastWriteTime; + __le32 DataSize; /* File Size (EOF) */ + __le32 AllocationSize; + __le16 Attributes; /* verify not u32 */ + __u8 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */ + + +struct win_dev { + unsigned char type[8]; /* IntxCHR or IntxBLK */ + __le64 major; + __le64 minor; +} __attribute__((packed)); + +struct gea { + unsigned char name_len; + char name[1]; +} __attribute__((packed)); + +struct gealist { + unsigned long list_len; + struct gea list[1]; +} __attribute__((packed)); + +struct fea { + unsigned char EA_flags; + __u8 name_len; + __le16 value_len; + char name[1]; + /* optionally followed by value */ +} __attribute__((packed)); +/* flags for _FEA.fEA */ +#define FEA_NEEDEA 0x80 /* need EA bit */ + +struct fealist { + __le32 list_len; + struct fea list[1]; +} __attribute__((packed)); + +/* used to hold an arbitrary blob of data */ +struct data_blob { + __u8 *data; + size_t length; + void (*free) (struct data_blob *data_blob); +} __attribute__((packed)); + + +#ifdef CONFIG_CIFS_POSIX +/* + For better POSIX semantics from Linux client, (even better + than the existing CIFS Unix Extensions) we need updated PDUs for: + + 1) PosixCreateX - to set and return the mode, inode#, device info and + perhaps add a CreateDevice - to create Pipes and other special .inodes + Also note POSIX open flags + 2) Close - to return the last write time to do cache across close + more safely + 3) FindFirst return unique inode number - what about resume key, two + forms short (matches readdir) and full (enough info to cache inodes) + 4) Mkdir - set mode + + And under consideration: + 5) FindClose2 (return nanosecond timestamp ??) + 6) Use nanosecond timestamps throughout all time fields if + corresponding attribute flag is set + 7) sendfile - handle based copy + + what about fixing 64 bit alignment + + There are also various legacy SMB/CIFS requests used as is + + From existing Lanman and NTLM dialects: + -------------------------------------- + NEGOTIATE + SESSION_SETUP_ANDX (BB which?) + TREE_CONNECT_ANDX (BB which wct?) + TREE_DISCONNECT (BB add volume timestamp on response) + LOGOFF_ANDX + DELETE (note delete open file behavior) + DELETE_DIRECTORY + READ_AND_X + WRITE_AND_X + LOCKING_AND_X (note posix lock semantics) + RENAME (note rename across dirs and open file rename posix behaviors) + NT_RENAME (for hardlinks) Is this good enough for all features? + FIND_CLOSE2 + TRANSACTION2 (18 cases) + SMB_SET_FILE_END_OF_FILE_INFO2 SMB_SET_PATH_END_OF_FILE_INFO2 + (BB verify that never need to set allocation size) + SMB_SET_FILE_BASIC_INFO2 (setting times - BB can it be done via + Unix ext?) + + COPY (note support for copy across directories) - FUTURE, OPTIONAL + setting/getting OS/2 EAs - FUTURE (BB can this handle + setting Linux xattrs perfectly) - OPTIONAL + dnotify - FUTURE, OPTIONAL + quota - FUTURE, OPTIONAL + + Note that various requests implemented for NT interop such as + NT_TRANSACT (IOCTL) QueryReparseInfo + are unneeded to servers compliant with the CIFS POSIX extensions + + From CIFS Unix Extensions: + ------------------------- + T2 SET_PATH_INFO (SMB_SET_FILE_UNIX_LINK) for symlinks + T2 SET_PATH_INFO (SMB_SET_FILE_BASIC_INFO2) + T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_LINK) + T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_BASIC) BB check for missing + inode fields + Actually a need QUERY_FILE_UNIX_INFO + since has inode num + BB what about a) blksize/blkbits/blocks + b) i_version + c) i_rdev + d) notify mask? + e) generation + f) size_seqcount + T2 FIND_FIRST/FIND_NEXT FIND_FILE_UNIX + TRANS2_GET_DFS_REFERRAL - OPTIONAL but recommended + T2_QFS_INFO QueryDevice/AttributeInfo - OPTIONAL + */ + +/* xsymlink is a symlink format (used by MacOS) that can be used + to save symlink info in a regular file when + mounted to operating systems that do not + support the cifs Unix extensions or EAs (for xattr + based symlinks). For such a file to be recognized + as containing symlink data: + + 1) file size must be 1067, + 2) signature must begin file data, + 3) length field must be set to ASCII representation + of a number which is less than or equal to 1024, + 4) md5 must match that of the path data */ + +struct xsymlink { + /* 1067 bytes */ + char signature[4]; /* XSym */ /* not null terminated */ + char cr0; /* \n */ +/* ASCII representation of length (4 bytes decimal) terminated by \n not null */ + char length[4]; + char cr1; /* \n */ +/* md5 of valid subset of path ie path[0] through path[length-1] */ + __u8 md5[32]; + char cr2; /* \n */ +/* if room left, then end with \n then 0x20s by convention but not required */ + char path[1024]; +} __attribute__((packed)); + +typedef struct file_xattr_info { + /* BB do we need another field for flags? BB */ + __u32 xattr_name_len; + __u32 xattr_value_len; + char xattr_name[0]; + /* followed by xattr_value[xattr_value_len], no pad */ +} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info + level 0x205 */ + +/* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */ + +typedef struct file_chattr_info { + __le64 mask; /* list of all possible attribute bits */ + __le64 mode; /* list of actual attribute bits on this inode */ +} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes + (chattr, chflags) level 0x206 */ +#endif /* POSIX */ +#endif /* _CIFSPDU_H */ diff --git a/kernel/fs/cifs/cifsproto.h b/kernel/fs/cifs/cifsproto.h new file mode 100644 index 000000000..c63fd1dde --- /dev/null +++ b/kernel/fs/cifs/cifsproto.h @@ -0,0 +1,514 @@ +/* + * fs/cifs/cifsproto.h + * + * Copyright (c) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFSPROTO_H +#define _CIFSPROTO_H +#include + +struct statfs; +struct smb_vol; +struct smb_rqst; + +/* + ***************************************************************** + * All Prototypes + ***************************************************************** + */ + +extern struct smb_hdr *cifs_buf_get(void); +extern void cifs_buf_release(void *); +extern struct smb_hdr *cifs_small_buf_get(void); +extern void cifs_small_buf_release(void *); +extern void free_rsp_buf(int, void *); +extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, + struct kvec *iov); +extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, + unsigned int /* length */); +extern unsigned int _get_xid(void); +extern void _free_xid(unsigned int); +#define get_xid() \ +({ \ + unsigned int __xid = _get_xid(); \ + cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \ + __func__, __xid, \ + from_kuid(&init_user_ns, current_fsuid())); \ + __xid; \ +}) + +#define free_xid(curr_xid) \ +do { \ + _free_xid(curr_xid); \ + cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \ + __func__, curr_xid, (int)rc); \ +} while (0) +extern int init_cifs_idmap(void); +extern void exit_cifs_idmap(void); +extern char *build_path_from_dentry(struct dentry *); +extern char *cifs_build_path_to_root(struct smb_vol *vol, + struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon); +extern char *build_wildcard_path_from_dentry(struct dentry *direntry); +extern char *cifs_compose_mount_options(const char *sb_mountdata, + const char *fullpath, const struct dfs_info3_param *ref, + char **devname); +/* extern void renew_parental_timestamps(struct dentry *direntry);*/ +extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, + struct TCP_Server_Info *server); +extern void DeleteMidQEntry(struct mid_q_entry *midEntry); +extern void cifs_delete_mid(struct mid_q_entry *mid); +extern void cifs_wake_up_task(struct mid_q_entry *mid); +extern int cifs_call_async(struct TCP_Server_Info *server, + struct smb_rqst *rqst, + mid_receive_t *receive, mid_callback_t *callback, + void *cbdata, const int flags); +extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, + struct smb_hdr * /* input */ , + struct smb_hdr * /* out */ , + int * /* bytes returned */ , const int); +extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, + char *in_buf, int flags); +extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *, + struct smb_rqst *); +extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *, + struct smb_rqst *); +extern int cifs_check_receive(struct mid_q_entry *mid, + struct TCP_Server_Info *server, bool log_error); +extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server, + unsigned int size, unsigned int *num, + unsigned int *credits); +extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, + struct kvec *, int /* nvec to send */, + int * /* type of buf returned */ , const int flags); +extern int SendReceiveBlockingLock(const unsigned int xid, + struct cifs_tcon *ptcon, + struct smb_hdr *in_buf , + struct smb_hdr *out_buf, + int *bytes_returned); +extern int cifs_reconnect(struct TCP_Server_Info *server); +extern int checkSMB(char *buf, unsigned int length); +extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); +extern bool backup_cred(struct cifs_sb_info *); +extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); +extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written); +extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); +extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); +extern unsigned int smbCalcSize(void *buf); +extern int decode_negTokenInit(unsigned char *security_blob, int length, + struct TCP_Server_Info *server); +extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); +extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port); +extern int map_smb_to_linux_error(char *buf, bool logErr); +extern void header_assemble(struct smb_hdr *, char /* command */ , + const struct cifs_tcon *, int /* length of + fixed section (word count) in two byte units */); +extern int small_smb_init_no_tc(const int smb_cmd, const int wct, + struct cifs_ses *ses, + void **request_buf); +extern enum securityEnum select_sectype(struct TCP_Server_Info *server, + enum securityEnum requested); +extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp); +extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); +extern u64 cifs_UnixTimeToNT(struct timespec); +extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, + int offset); +extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); +extern int cifs_unlock_range(struct cifsFileInfo *cfile, + struct file_lock *flock, const unsigned int xid); +extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); + +extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, + struct file *file, + struct tcon_link *tlink, + __u32 oplock); +extern int cifs_posix_open(char *full_path, struct inode **inode, + struct super_block *sb, int mode, + unsigned int f_flags, __u32 *oplock, __u16 *netfid, + unsigned int xid); +void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); +extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, + FILE_UNIX_BASIC_INFO *info, + struct cifs_sb_info *cifs_sb); +extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *, + struct cifs_sb_info *); +extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); +extern struct inode *cifs_iget(struct super_block *sb, + struct cifs_fattr *fattr); + +extern int cifs_get_inode_info(struct inode **inode, const char *full_path, + FILE_ALL_INFO *data, struct super_block *sb, + int xid, const struct cifs_fid *fid); +extern int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *search_path, + struct super_block *sb, unsigned int xid); +extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, + unsigned int xid, char *full_path, __u32 dosattr); +extern int cifs_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid); +extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, struct inode *inode, + const char *path, const struct cifs_fid *pfid); +extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, + kuid_t, kgid_t); +extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, + const char *, u32 *); +extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, + const struct cifs_fid *, u32 *); +extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, + const char *, int); + +extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); +extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read); +extern int cifs_readv_from_socket(struct TCP_Server_Info *server, + struct kvec *iov_orig, unsigned int nr_segs, + unsigned int to_read); +extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, + struct cifs_sb_info *cifs_sb); +extern int cifs_match_super(struct super_block *, void *); +extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); +extern struct smb_vol *cifs_get_volume_info(char *mount_data, + const char *devname); +extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); +extern void cifs_umount(struct cifs_sb_info *); +extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); +extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, + __u64 length, __u8 type, + struct cifsLockInfo **conf_lock, + int rw_check); +extern void cifs_add_pending_open(struct cifs_fid *fid, + struct tcon_link *tlink, + struct cifs_pending_open *open); +extern void cifs_add_pending_open_locked(struct cifs_fid *fid, + struct tcon_link *tlink, + struct cifs_pending_open *open); +extern void cifs_del_pending_open(struct cifs_pending_open *open); + +#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) +extern void cifs_dfs_release_automount_timer(void); +#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ +#define cifs_dfs_release_automount_timer() do { } while (0) +#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ + +void cifs_proc_init(void); +void cifs_proc_clean(void); + +extern void cifs_move_llist(struct list_head *source, struct list_head *dest); +extern void cifs_free_llist(struct list_head *llist); +extern void cifs_del_lock_waiters(struct cifsLockInfo *lock); + +extern int cifs_negotiate_protocol(const unsigned int xid, + struct cifs_ses *ses); +extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + struct nls_table *nls_info); +extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required); +extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses); + +extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, + const char *tree, struct cifs_tcon *tcon, + const struct nls_table *); + +extern int CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, + const char *searchName, struct cifs_sb_info *cifs_sb, + __u16 *searchHandle, __u16 search_flags, + struct cifs_search_info *psrch_inf, + bool msearch); + +extern int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, + __u16 searchHandle, __u16 search_flags, + struct cifs_search_info *psrch_inf); + +extern int CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon, + const __u16 search_handle); + +extern int CIFSSMBQFileInfo(const unsigned int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_ALL_INFO *pFindData); +extern int CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon, + const char *search_Name, FILE_ALL_INFO *data, + int legacy /* whether to use old info level */, + const struct nls_table *nls_codepage, int remap); +extern int SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon, + const char *search_name, FILE_ALL_INFO *data, + const struct nls_table *nls_codepage, int remap); + +extern int CIFSSMBUnixQFileInfo(const unsigned int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_UNIX_BASIC_INFO *pFindData); +extern int CIFSSMBUnixQPathInfo(const unsigned int xid, + struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_UNIX_BASIC_INFO *pFindData, + const struct nls_table *nls_codepage, int remap); + +extern int CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, + const char *search_name, + struct dfs_info3_param **target_nodes, + unsigned int *num_of_nodes, + const struct nls_table *nls_codepage, int remap); + +extern int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, + const char *old_path, + const struct nls_table *nls_codepage, + unsigned int *num_referrals, + struct dfs_info3_param **referrals, int remap); +extern void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + struct smb_vol *vol); +extern int CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); +extern int SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); +extern int CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon, + __u64 cap); + +extern int CIFSSMBQFSAttributeInfo(const unsigned int xid, + struct cifs_tcon *tcon); +extern int CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon); +extern int CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon); +extern int CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); + +extern int CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, const FILE_BASIC_INFO *data, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, + const FILE_BASIC_INFO *data, __u16 fid, + __u32 pid_of_opener); +extern int CIFSSMBSetFileDisposition(const unsigned int xid, + struct cifs_tcon *tcon, + bool delete_file, __u16 fid, + __u32 pid_of_opener); +#if 0 +extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, + char *fileName, __u16 dos_attributes, + const struct nls_table *nls_codepage); +#endif /* possibly unneeded function */ +extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, + const char *file_name, __u64 size, + struct cifs_sb_info *cifs_sb, bool set_allocation); +extern int CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, __u64 size, + bool set_allocation); + +struct cifs_unix_set_info_args { + __u64 ctime; + __u64 atime; + __u64 mtime; + __u64 mode; + kuid_t uid; + kgid_t gid; + dev_t device; +}; + +extern int CIFSSMBUnixSetFileInfo(const unsigned int xid, + struct cifs_tcon *tcon, + const struct cifs_unix_set_info_args *args, + u16 fid, u32 pid_of_opener); + +extern int CIFSSMBUnixSetPathInfo(const unsigned int xid, + struct cifs_tcon *tcon, const char *file_name, + const struct cifs_unix_set_info_args *args, + const struct nls_table *nls_codepage, + int remap); + +extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); +extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); +extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, __u16 type, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); +extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); +extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon, + int netfid, const char *target_name, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); +extern int CIFSUnixCreateHardLink(const unsigned int xid, + struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSUnixCreateSymLink(const unsigned int xid, + struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap); +extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, + struct cifs_tcon *tcon, + const unsigned char *searchName, char **syminfo, + const struct nls_table *nls_codepage, int remap); +extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid, char **symlinkinfo, + const struct nls_table *nls_codepage); +extern int CIFSSMB_set_compression(const unsigned int xid, + struct cifs_tcon *tcon, __u16 fid); +extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, + int *oplock, FILE_ALL_INFO *buf); +extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, const int disposition, + const int access_flags, const int omode, + __u16 *netfid, int *pOplock, FILE_ALL_INFO *, + const struct nls_table *nls_codepage, int remap); +extern int CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon, + u32 posix_flags, __u64 mode, __u16 *netfid, + FILE_UNIX_BASIC_INFO *pRetData, + __u32 *pOplock, const char *name, + const struct nls_table *nls_codepage, int remap); +extern int CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, + const int smb_file_id); + +extern int CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, + const int smb_file_id); + +extern int CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, char **buf, + int *return_buf_type); +extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, const char *buf, + const char __user *ubuf, const int long_op); +extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, struct kvec *iov, const int nvec); +extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, + const char *search_name, __u64 *inode_number, + const struct nls_table *nls_codepage, + int remap); + +extern int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon, + const __u16 netfid, const __u8 lock_type, + const __u32 num_unlock, const __u32 num_lock, + LOCKING_ANDX_RANGE *buf); +extern int CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon, + const __u16 netfid, const __u32 netpid, const __u64 len, + const __u64 offset, const __u32 numUnlock, + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level); +extern int CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const __u32 netpid, + const loff_t start_offset, const __u64 len, + struct file_lock *, const __u16 lock_type, + const bool waitFlag); +extern int CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon); +extern int CIFSSMBEcho(struct TCP_Server_Info *server); +extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses); + +extern struct cifs_ses *sesInfoAlloc(void); +extern void sesInfoFree(struct cifs_ses *); +extern struct cifs_tcon *tconInfoAlloc(void); +extern void tconInfoFree(struct cifs_tcon *); + +extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number); +extern int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *, + __u32 *); +extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); +extern int cifs_verify_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server, + __u32 expected_sequence_number); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, + const struct nls_table *); +extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); +extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); +extern void cifs_crypto_shash_release(struct TCP_Server_Info *); +extern int calc_seckey(struct cifs_ses *); +extern int generate_smb3signingkey(struct cifs_ses *); + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +extern int calc_lanman_hash(const char *password, const char *cryptkey, + bool encrypt, char *lnm_session_key); +#endif /* CIFS_WEAK_PW_HASH */ +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ +extern int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, + const int notify_subdirs, const __u16 netfid, + __u32 filter, struct file *file, int multishot, + const struct nls_table *nls_codepage); +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ +extern int CIFSSMBCopy(unsigned int xid, + struct cifs_tcon *source_tcon, + const char *fromName, + const __u16 target_tid, + const char *toName, const int flags, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + const unsigned char *ea_name, char *EAData, + size_t bufsize, const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, const char *ea_name, + const void *ea_value, const __u16 ea_value_len, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); +extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, + struct cifs_ntsd *, __u32, int); +extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + char *acl_inf, const int buflen, const int acl_type, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const char *local_acl, const int buflen, const int acl_type, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, + const int netfid, __u64 *pExtAttrBits, __u64 *pMask); +extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); +extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr); +extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + const unsigned char *path); +extern int mdfour(unsigned char *, unsigned char *, int); +extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage); +extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, + unsigned char *p24); + +void cifs_readdata_release(struct kref *refcount); +int cifs_async_readv(struct cifs_readdata *rdata); +int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); + +int cifs_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)); +void cifs_writev_complete(struct work_struct *work); +struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, + work_func_t complete); +void cifs_writedata_release(struct kref *refcount); +int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_read); +int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_written); +#endif /* _CIFSPROTO_H */ diff --git a/kernel/fs/cifs/cifssmb.c b/kernel/fs/cifs/cifssmb.c new file mode 100644 index 000000000..f26ffbfc6 --- /dev/null +++ b/kernel/fs/cifs/cifssmb.c @@ -0,0 +1,6526 @@ +/* + * fs/cifs/cifssmb.c + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Contains the routines for constructing the SMB PDUs themselves + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ + /* These are mostly routines that operate on a pathname, or on a tree id */ + /* (mounted volume), but there are eight handle based routines which must be */ + /* treated slightly differently for reconnection purposes since we never */ + /* want to reuse a stale file handle and only the caller knows the file info */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsacl.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "fscache.h" + +#ifdef CONFIG_CIFS_POSIX +static struct { + int index; + char *name; +} protocols[] = { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + {LANMAN_PROT, "\2LM1.2X002"}, + {LANMAN2_PROT, "\2LANMAN2.1"}, +#endif /* weak password hashing for legacy clients */ + {CIFS_PROT, "\2NT LM 0.12"}, + {POSIX_PROT, "\2POSIX 2"}, + {BAD_PROT, "\2"} +}; +#else +static struct { + int index; + char *name; +} protocols[] = { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + {LANMAN_PROT, "\2LM1.2X002"}, + {LANMAN2_PROT, "\2LANMAN2.1"}, +#endif /* weak password hashing for legacy clients */ + {CIFS_PROT, "\2NT LM 0.12"}, + {BAD_PROT, "\2"} +}; +#endif + +/* define the number of elements in the cifs dialect array */ +#ifdef CONFIG_CIFS_POSIX +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFS_NUM_PROT 4 +#else +#define CIFS_NUM_PROT 2 +#endif /* CIFS_WEAK_PW_HASH */ +#else /* not posix */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFS_NUM_PROT 3 +#else +#define CIFS_NUM_PROT 1 +#endif /* CONFIG_CIFS_WEAK_PW_HASH */ +#endif /* CIFS_POSIX */ + +/* + * Mark as invalid, all open files on tree connections since they + * were closed when session to server was lost. + */ +void +cifs_mark_open_files_invalid(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *open_file = NULL; + struct list_head *tmp; + struct list_head *tmp1; + + /* list all files open on tree connection and mark them invalid */ + spin_lock(&cifs_file_list_lock); + list_for_each_safe(tmp, tmp1, &tcon->openFileList) { + open_file = list_entry(tmp, struct cifsFileInfo, tlist); + open_file->invalidHandle = true; + open_file->oplock_break_cancelled = true; + } + spin_unlock(&cifs_file_list_lock); + /* + * BB Add call to invalidate_inodes(sb) for all superblocks mounted + * to this tcon. + */ +} + +/* reconnect the socket, tcon, and smb session if needed */ +static int +cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) +{ + int rc; + struct cifs_ses *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; + + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for + * tcp and smb session status done differently for those three - in the + * calling routine + */ + if (!tcon) + return 0; + + ses = tcon->ses; + server = ses->server; + + /* + * only tree disconnect, open, and write, (and ulogoff which does not + * have tcon) are allowed as we start force umount + */ + if (tcon->tidStatus == CifsExiting) { + if (smb_command != SMB_COM_WRITE_ANDX && + smb_command != SMB_COM_OPEN_ANDX && + smb_command != SMB_COM_TREE_DISCONNECT) { + cifs_dbg(FYI, "can not send cmd %d while umounting\n", + smb_command); + return -ENODEV; + } + } + + /* + * Give demultiplex thread up to 10 seconds to reconnect, should be + * greater than cifs socket timeout which is 7 seconds + */ + while (server->tcpStatus == CifsNeedReconnect) { + wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), 10 * HZ); + + /* are we still trying to reconnect? */ + if (server->tcpStatus != CifsNeedReconnect) + break; + + /* + * on "soft" mounts we wait once. Hard mounts keep + * retrying until process is killed or server comes + * back on-line + */ + if (!tcon->retry) { + cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); + return -EHOSTDOWN; + } + } + + if (!ses->need_reconnect && !tcon->need_reconnect) + return 0; + + nls_codepage = load_nls_default(); + + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses); + if (rc == 0 && ses->need_reconnect) + rc = cifs_setup_session(0, ses, nls_codepage); + + /* do we need to reconnect tcon? */ + if (rc || !tcon->need_reconnect) { + mutex_unlock(&ses->session_mutex); + goto out; + } + + cifs_mark_open_files_invalid(tcon); + rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); + mutex_unlock(&ses->session_mutex); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); + + if (rc) + goto out; + + atomic_inc(&tconInfoReconnectCount); + + /* tell server Unix caps we support */ + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, NULL); + + /* + * Removed call to reopen open files here. It is safer (and faster) to + * reopen files one at a time as needed in read and write. + * + * FIXME: what about file locks? don't we need to reclaim them ASAP? + */ + +out: + /* + * Check if handle based operation so we know whether we can continue + * or not without returning to caller to reset file handle + */ + switch (smb_command) { + case SMB_COM_READ_ANDX: + case SMB_COM_WRITE_ANDX: + case SMB_COM_CLOSE: + case SMB_COM_FIND_CLOSE2: + case SMB_COM_LOCKING_ANDX: + rc = -EAGAIN; + } + + unload_nls(nls_codepage); + return rc; +} + +/* Allocate and return pointer to an SMB request buffer, and set basic + SMB information in the SMB header. If the return code is zero, this + function must have filled in request_buf pointer */ +static int +small_smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + *request_buf = cifs_small_buf_get(); + if (*request_buf == NULL) { + /* BB should we add a retry in here if not a writepage? */ + return -ENOMEM; + } + + header_assemble((struct smb_hdr *) *request_buf, smb_command, + tcon, wct); + + if (tcon != NULL) + cifs_stats_inc(&tcon->num_smbs_sent); + + return 0; +} + +int +small_smb_init_no_tc(const int smb_command, const int wct, + struct cifs_ses *ses, void **request_buf) +{ + int rc; + struct smb_hdr *buffer; + + rc = small_smb_init(smb_command, wct, NULL, request_buf); + if (rc) + return rc; + + buffer = (struct smb_hdr *)*request_buf; + buffer->Mid = get_next_mid(ses->server); + if (ses->capabilities & CAP_UNICODE) + buffer->Flags2 |= SMBFLG2_UNICODE; + if (ses->capabilities & CAP_STATUS32) + buffer->Flags2 |= SMBFLG2_ERR_STATUS; + + /* uid, tid can stay at zero as set in header assemble */ + + /* BB add support for turning on the signing when + this function is used after 1st of session setup requests */ + + return rc; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +__smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + *request_buf = cifs_buf_get(); + if (*request_buf == NULL) { + /* BB should we add a retry in here if not a writepage? */ + return -ENOMEM; + } + /* Although the original thought was we needed the response buf for */ + /* potential retries of smb operations it turns out we can determine */ + /* from the mid flags when the request buffer can be resent without */ + /* having to use a second distinct buffer for the response */ + if (response_buf) + *response_buf = *request_buf; + + header_assemble((struct smb_hdr *) *request_buf, smb_command, tcon, + wct); + + if (tcon != NULL) + cifs_stats_inc(&tcon->num_smbs_sent); + + return 0; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int +smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + if (tcon->ses->need_reconnect || tcon->need_reconnect) + return -EHOSTDOWN; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int validate_t2(struct smb_t2_rsp *pSMB) +{ + unsigned int total_size; + + /* check for plausible wct */ + if (pSMB->hdr.WordCount < 10) + goto vt2_err; + + /* check for parm and data offset going beyond end of smb */ + if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 || + get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024) + goto vt2_err; + + total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount); + if (total_size >= 512) + goto vt2_err; + + /* check that bcc is at least as big as parms + data, and that it is + * less than negotiated smb buffer + */ + total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount); + if (total_size > get_bcc(&pSMB->hdr) || + total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) + goto vt2_err; + + return 0; +vt2_err: + cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, + sizeof(struct smb_t2_rsp) + 16); + return -EINVAL; +} + +static int +decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr) +{ + int rc = 0; + u16 count; + char *guid = pSMBr->u.extended_response.GUID; + struct TCP_Server_Info *server = ses->server; + + count = get_bcc(&pSMBr->hdr); + if (count < SMB1_CLIENT_GUID_SIZE) + return -EIO; + + spin_lock(&cifs_tcp_ses_lock); + if (server->srv_count > 1) { + spin_unlock(&cifs_tcp_ses_lock); + if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) { + cifs_dbg(FYI, "server UID changed\n"); + memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE); + } + } else { + spin_unlock(&cifs_tcp_ses_lock); + memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE); + } + + if (count == SMB1_CLIENT_GUID_SIZE) { + server->sec_ntlmssp = true; + } else { + count -= SMB1_CLIENT_GUID_SIZE; + rc = decode_negTokenInit( + pSMBr->u.extended_response.SecurityBlob, count, server); + if (rc != 1) + return -EINVAL; + } + + return 0; +} + +int +cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) +{ + bool srv_sign_required = server->sec_mode & server->vals->signing_required; + bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled; + bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN; + + /* + * Is signing required by mnt options? If not then check + * global_secflags to see if it is there. + */ + if (!mnt_sign_required) + mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) == + CIFSSEC_MUST_SIGN); + + /* + * If signing is required then it's automatically enabled too, + * otherwise, check to see if the secflags allow it. + */ + mnt_sign_enabled = mnt_sign_required ? mnt_sign_required : + (global_secflags & CIFSSEC_MAY_SIGN); + + /* If server requires signing, does client allow it? */ + if (srv_sign_required) { + if (!mnt_sign_enabled) { + cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!"); + return -ENOTSUPP; + } + server->sign = true; + } + + /* If client requires signing, does server allow it? */ + if (mnt_sign_required) { + if (!srv_sign_enabled) { + cifs_dbg(VFS, "Server does not support signing!"); + return -ENOTSUPP; + } + server->sign = true; + } + + return 0; +} + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static int +decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) +{ + __s16 tmp; + struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; + + if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT) + return -EOPNOTSUPP; + + server->sec_mode = le16_to_cpu(rsp->SecurityMode); + server->maxReq = min_t(unsigned int, + le16_to_cpu(rsp->MaxMpxCount), + cifs_max_pending); + set_credits(server, server->maxReq); + server->maxBuf = le16_to_cpu(rsp->MaxBufSize); + /* even though we do not use raw we might as well set this + accurately, in case we ever find a need for it */ + if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { + server->max_rw = 0xFF00; + server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; + } else { + server->max_rw = 0;/* do not need to use raw anyway */ + server->capabilities = CAP_MPX_MODE; + } + tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); + if (tmp == -1) { + /* OS/2 often does not set timezone therefore + * we must use server time to calc time zone. + * Could deviate slightly from the right zone. + * Smallest defined timezone difference is 15 minutes + * (i.e. Nepal). Rounding up/down is done to match + * this requirement. + */ + int val, seconds, remain, result; + struct timespec ts, utc; + utc = CURRENT_TIME; + ts = cnvrtDosUnixTm(rsp->SrvTime.Date, + rsp->SrvTime.Time, 0); + cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n", + (int)ts.tv_sec, (int)utc.tv_sec, + (int)(utc.tv_sec - ts.tv_sec)); + val = (int)(utc.tv_sec - ts.tv_sec); + seconds = abs(val); + result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; + remain = seconds % MIN_TZ_ADJ; + if (remain >= (MIN_TZ_ADJ / 2)) + result += MIN_TZ_ADJ; + if (val < 0) + result = -result; + server->timeAdj = result; + } else { + server->timeAdj = (int)tmp; + server->timeAdj *= 60; /* also in seconds */ + } + cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj); + + + /* BB get server time for time conversions and add + code to use it and timezone since this is not UTC */ + + if (rsp->EncryptionKeyLength == + cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { + memcpy(server->cryptkey, rsp->EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { + return -EIO; /* need cryptkey unless plain text */ + } + + cifs_dbg(FYI, "LANMAN negotiated\n"); + return 0; +} +#else +static inline int +decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) +{ + cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n"); + return -EOPNOTSUPP; +} +#endif + +static bool +should_set_ext_sec_flag(enum securityEnum sectype) +{ + switch (sectype) { + case RawNTLMSSP: + case Kerberos: + return true; + case Unspecified: + if (global_secflags & + (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)) + return true; + /* Fallthrough */ + default: + return false; + } +} + +int +CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) +{ + NEGOTIATE_REQ *pSMB; + NEGOTIATE_RSP *pSMBr; + int rc = 0; + int bytes_returned; + int i; + struct TCP_Server_Info *server = ses->server; + u16 count; + + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; + } + + rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ , + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + pSMB->hdr.Mid = get_next_mid(server); + pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); + + if (should_set_ext_sec_flag(ses->sectype)) { + cifs_dbg(FYI, "Requesting extended security."); + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + } + + count = 0; + for (i = 0; i < CIFS_NUM_PROT; i++) { + strncpy(pSMB->DialectsArray+count, protocols[i].name, 16); + count += strlen(protocols[i].name) + 1; + /* null at end of source and target buffers anyway */ + } + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc != 0) + goto neg_err_exit; + + server->dialect = le16_to_cpu(pSMBr->DialectIndex); + cifs_dbg(FYI, "Dialect: %d\n", server->dialect); + /* Check wct = 1 error case */ + if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) { + /* core returns wct = 1, but we do not ask for core - otherwise + small wct just comes when dialect index is -1 indicating we + could not negotiate a common dialect */ + rc = -EOPNOTSUPP; + goto neg_err_exit; + } else if (pSMBr->hdr.WordCount == 13) { + server->negflavor = CIFS_NEGFLAVOR_LANMAN; + rc = decode_lanman_negprot_rsp(server, pSMBr); + goto signing_check; + } else if (pSMBr->hdr.WordCount != 17) { + /* unknown wct */ + rc = -EOPNOTSUPP; + goto neg_err_exit; + } + /* else wct == 17, NTLM or better */ + + server->sec_mode = pSMBr->SecurityMode; + if ((server->sec_mode & SECMODE_USER) == 0) + cifs_dbg(FYI, "share mode security\n"); + + /* one byte, so no need to convert this or EncryptionKeyLen from + little endian */ + server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount), + cifs_max_pending); + set_credits(server, server->maxReq); + /* probably no need to store and check maxvcs */ + server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); + server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); + cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); + server->capabilities = le32_to_cpu(pSMBr->Capabilities); + server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); + server->timeAdj *= 60; + + if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { + server->negflavor = CIFS_NEGFLAVOR_UNENCAP; + memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || + server->capabilities & CAP_EXTENDED_SECURITY) && + (pSMBr->EncryptionKeyLength == 0)) { + server->negflavor = CIFS_NEGFLAVOR_EXTENDED; + rc = decode_ext_sec_blob(ses, pSMBr); + } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { + rc = -EIO; /* no crypt key only if plain text pwd */ + } else { + server->negflavor = CIFS_NEGFLAVOR_UNENCAP; + server->capabilities &= ~CAP_EXTENDED_SECURITY; + } + +signing_check: + if (!rc) + rc = cifs_enable_signing(server, ses->sign); +neg_err_exit: + cifs_buf_release(pSMB); + + cifs_dbg(FYI, "negprot rc %d\n", rc); + return rc; +} + +int +CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon) +{ + struct smb_hdr *smb_buffer; + int rc = 0; + + cifs_dbg(FYI, "In tree disconnect\n"); + + /* BB: do we need to check this? These should never be NULL. */ + if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) + return -EIO; + + /* + * No need to return error on this operation if tid invalidated and + * closed on server already e.g. due to tcp session crashing. Also, + * the tcon is no longer on the list, so no need to take lock before + * checking this. + */ + if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) + return 0; + + rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon, + (void **)&smb_buffer); + if (rc) + return rc; + + rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0); + if (rc) + cifs_dbg(FYI, "Tree disconnect failed %d\n", rc); + + /* No need to return error on this operation if tid invalidated and + closed on server already e.g. due to tcp session crashing */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +/* + * This is a no-op for now. We're not really interested in the reply, but + * rather in the fact that the server sent one and that server->lstrp + * gets updated. + * + * FIXME: maybe we should consider checking that the reply matches request? + */ +static void +cifs_echo_callback(struct mid_q_entry *mid) +{ + struct TCP_Server_Info *server = mid->callback_data; + + DeleteMidQEntry(mid); + add_credits(server, 1, CIFS_ECHO_OP); +} + +int +CIFSSMBEcho(struct TCP_Server_Info *server) +{ + ECHO_REQ *smb; + int rc = 0; + struct kvec iov; + struct smb_rqst rqst = { .rq_iov = &iov, + .rq_nvec = 1 }; + + cifs_dbg(FYI, "In echo request\n"); + + rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb); + if (rc) + return rc; + + /* set up echo request */ + smb->hdr.Tid = 0xffff; + smb->hdr.WordCount = 1; + put_unaligned_le16(1, &smb->EchoCount); + put_bcc(1, &smb->hdr); + smb->Data[0] = 'a'; + inc_rfc1001_len(smb, 3); + iov.iov_base = smb; + iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, + server, CIFS_ASYNC_OP | CIFS_ECHO_OP); + if (rc) + cifs_dbg(FYI, "Echo request failed: %d\n", rc); + + cifs_small_buf_release(smb); + + return rc; +} + +int +CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses) +{ + LOGOFF_ANDX_REQ *pSMB; + int rc = 0; + + cifs_dbg(FYI, "In SMBLogoff for session disconnect\n"); + + /* + * BB: do we need to check validity of ses and server? They should + * always be valid since we have an active reference. If not, that + * should probably be a BUG() + */ + if (!ses || !ses->server) + return -EIO; + + mutex_lock(&ses->session_mutex); + if (ses->need_reconnect) + goto session_already_dead; /* no need to send SMBlogoff if uid + already closed due to reconnect */ + rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); + if (rc) { + mutex_unlock(&ses->session_mutex); + return rc; + } + + pSMB->hdr.Mid = get_next_mid(ses->server); + + if (ses->server->sign) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + pSMB->hdr.Uid = ses->Suid; + + pSMB->AndXCommand = 0xFF; + rc = SendReceiveNoRsp(xid, ses, (char *) pSMB, 0); +session_already_dead: + mutex_unlock(&ses->session_mutex); + + /* if session dead then we do not need to do ulogoff, + since server closed smb session, no sense reporting + error */ + if (rc == -EAGAIN) + rc = 0; + return rc; +} + +int +CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, __u16 type, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + struct unlink_psx_rq *pRqD; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cifs_dbg(FYI, "In POSIX delete\n"); +PsxDelete: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB add path length overrun check */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = 0; /* BB double check this with jra */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + /* Setup pointer to Request Data (inode type) */ + pRqD = (struct unlink_psx_rq *)(((char *)&pSMB->hdr.Protocol) + offset); + pRqD->type = cpu_to_le16(type); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + sizeof(struct unlink_psx_rq); + + pSMB->DataCount = cpu_to_le16(sizeof(struct unlink_psx_rq)); + pSMB->TotalDataCount = cpu_to_le16(sizeof(struct unlink_psx_rq)); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Posix delete returned %d\n", rc); + cifs_buf_release(pSMB); + + cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes); + + if (rc == -EAGAIN) + goto PsxDelete; + + return rc; +} + +int +CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + DELETE_FILE_REQ *pSMB = NULL; + DELETE_FILE_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + int remap = cifs_remap(cifs_sb); + +DelFileRetry: + rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUTF16((__le16 *) pSMB->fileName, name, + PATH_MAX, cifs_sb->local_nls, + remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, name, name_len); + } + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM); + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes); + if (rc) + cifs_dbg(FYI, "Error in RMFile = %d\n", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto DelFileRetry; + + return rc; +} + +int +CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + DELETE_DIRECTORY_REQ *pSMB = NULL; + DELETE_DIRECTORY_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + int remap = cifs_remap(cifs_sb); + + cifs_dbg(FYI, "In CIFSSMBRmDir\n"); +RmDirRetry: + rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name, + PATH_MAX, cifs_sb->local_nls, + remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->DirName, name, name_len); + } + + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_rmdirs); + if (rc) + cifs_dbg(FYI, "Error in RMDir = %d\n", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto RmDirRetry; + return rc; +} + +int +CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + int rc = 0; + CREATE_DIRECTORY_REQ *pSMB = NULL; + CREATE_DIRECTORY_RSP *pSMBr = NULL; + int bytes_returned; + int name_len; + int remap = cifs_remap(cifs_sb); + + cifs_dbg(FYI, "In CIFSSMBMkDir\n"); +MkDirRetry: + rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name, + PATH_MAX, cifs_sb->local_nls, + remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->DirName, name, name_len); + } + + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_mkdirs); + if (rc) + cifs_dbg(FYI, "Error in Mkdir = %d\n", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto MkDirRetry; + return rc; +} + +int +CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon, + __u32 posix_flags, __u64 mode, __u16 *netfid, + FILE_UNIX_BASIC_INFO *pRetData, __u32 *pOplock, + const char *name, const struct nls_table *nls_codepage, + int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count, count; + OPEN_PSX_REQ *pdata; + OPEN_PSX_RSP *psx_rsp; + + cifs_dbg(FYI, "In POSIX Create\n"); +PsxCreat: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, name, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, name, name_len); + } + + params = 6 + name_len; + count = sizeof(OPEN_PSX_REQ); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* large enough */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset); + pdata->Level = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pdata->Permissions = cpu_to_le64(mode); + pdata->PosixOpenFlags = cpu_to_le32(posix_flags); + pdata->OpenFlags = cpu_to_le32(*pOplock); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Posix create returned %d\n", rc); + goto psx_create_err; + } + + cifs_dbg(FYI, "copying inode info\n"); + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) { + rc = -EIO; /* bad smb */ + goto psx_create_err; + } + + /* copy return information to pRetData */ + psx_rsp = (OPEN_PSX_RSP *)((char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset)); + + *pOplock = le16_to_cpu(psx_rsp->OplockFlags); + if (netfid) + *netfid = psx_rsp->Fid; /* cifs fid stays in le */ + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + if (cpu_to_le32(FILE_CREATE) == psx_rsp->CreateAction) + *pOplock |= CIFS_CREATE_ACTION; + /* check to make sure response data is there */ + if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { + pRetData->Type = cpu_to_le32(-1); /* unknown */ + cifs_dbg(NOISY, "unknown type\n"); + } else { + if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP) + + sizeof(FILE_UNIX_BASIC_INFO)) { + cifs_dbg(VFS, "Open response data too small\n"); + pRetData->Type = cpu_to_le32(-1); + goto psx_create_err; + } + memcpy((char *) pRetData, + (char *)psx_rsp + sizeof(OPEN_PSX_RSP), + sizeof(FILE_UNIX_BASIC_INFO)); + } + +psx_create_err: + cifs_buf_release(pSMB); + + if (posix_flags & SMB_O_DIRECTORY) + cifs_stats_inc(&tcon->stats.cifs_stats.num_posixmkdirs); + else + cifs_stats_inc(&tcon->stats.cifs_stats.num_posixopens); + + if (rc == -EAGAIN) + goto PsxCreat; + + return rc; +} + +static __u16 convert_disposition(int disposition) +{ + __u16 ofun = 0; + + switch (disposition) { + case FILE_SUPERSEDE: + ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; + break; + case FILE_OPEN: + ofun = SMBOPEN_OAPPEND; + break; + case FILE_CREATE: + ofun = SMBOPEN_OCREATE; + break; + case FILE_OPEN_IF: + ofun = SMBOPEN_OCREATE | SMBOPEN_OAPPEND; + break; + case FILE_OVERWRITE: + ofun = SMBOPEN_OTRUNC; + break; + case FILE_OVERWRITE_IF: + ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; + break; + default: + cifs_dbg(FYI, "unknown disposition %d\n", disposition); + ofun = SMBOPEN_OAPPEND; /* regular open */ + } + return ofun; +} + +static int +access_flags_to_smbopen_mode(const int access_flags) +{ + int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE); + + if (masked_flags == GENERIC_READ) + return SMBOPEN_READ; + else if (masked_flags == GENERIC_WRITE) + return SMBOPEN_WRITE; + + /* just go for read/write */ + return SMBOPEN_READWRITE; +} + +int +SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, const int openDisposition, + const int access_flags, const int create_options, __u16 *netfid, + int *pOplock, FILE_ALL_INFO *pfile_info, + const struct nls_table *nls_codepage, int remap) +{ + int rc = -EACCES; + OPENX_REQ *pSMB = NULL; + OPENX_RSP *pSMBr = NULL; + int bytes_returned; + int name_len; + __u16 count; + +OldOpenRetry: + rc = smb_init(SMB_COM_OPEN_ANDX, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->AndXCommand = 0xFF; /* none */ + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + count = 1; /* account for one byte pad to word boundary */ + name_len = + cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1), + fileName, PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + count = 0; /* no pad */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, fileName, name_len); + } + if (*pOplock & REQ_OPLOCK) + pSMB->OpenFlags = cpu_to_le16(REQ_OPLOCK); + else if (*pOplock & REQ_BATCHOPLOCK) + pSMB->OpenFlags = cpu_to_le16(REQ_BATCHOPLOCK); + + pSMB->OpenFlags |= cpu_to_le16(REQ_MORE_INFO); + pSMB->Mode = cpu_to_le16(access_flags_to_smbopen_mode(access_flags)); + pSMB->Mode |= cpu_to_le16(0x40); /* deny none */ + /* set file as system file if special file such + as fifo and server expecting SFU style and + no Unix extensions */ + + if (create_options & CREATE_OPTION_SPECIAL) + pSMB->FileAttributes = cpu_to_le16(ATTR_SYSTEM); + else /* BB FIXME BB */ + pSMB->FileAttributes = cpu_to_le16(0/*ATTR_NORMAL*/); + + if (create_options & CREATE_OPTION_READONLY) + pSMB->FileAttributes |= cpu_to_le16(ATTR_READONLY); + + /* BB FIXME BB */ +/* pSMB->CreateOptions = cpu_to_le32(create_options & + CREATE_OPTIONS_MASK); */ + /* BB FIXME END BB */ + + pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY); + pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition)); + count += name_len; + inc_rfc1001_len(pSMB, count); + + pSMB->ByteCount = cpu_to_le16(count); + /* long_op set to 1 to allow for oplock break timeouts */ + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); + if (rc) { + cifs_dbg(FYI, "Error in Open = %d\n", rc); + } else { + /* BB verify if wct == 15 */ + +/* *pOplock = pSMBr->OplockLevel; */ /* BB take from action field*/ + + *netfid = pSMBr->Fid; /* cifs fid stays in le */ + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + /* BB FIXME BB */ +/* if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) + *pOplock |= CIFS_CREATE_ACTION; */ + /* BB FIXME END */ + + if (pfile_info) { + pfile_info->CreationTime = 0; /* BB convert CreateTime*/ + pfile_info->LastAccessTime = 0; /* BB fixme */ + pfile_info->LastWriteTime = 0; /* BB fixme */ + pfile_info->ChangeTime = 0; /* BB fixme */ + pfile_info->Attributes = + cpu_to_le32(le16_to_cpu(pSMBr->FileAttributes)); + /* the file_info buf is endian converted by caller */ + pfile_info->AllocationSize = + cpu_to_le64(le32_to_cpu(pSMBr->EndOfFile)); + pfile_info->EndOfFile = pfile_info->AllocationSize; + pfile_info->NumberOfLinks = cpu_to_le32(1); + pfile_info->DeletePending = 0; + } + } + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto OldOpenRetry; + return rc; +} + +int +CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock, + FILE_ALL_INFO *buf) +{ + int rc = -EACCES; + OPEN_REQ *req = NULL; + OPEN_RSP *rsp = NULL; + int bytes_returned; + int name_len; + __u16 count; + struct cifs_sb_info *cifs_sb = oparms->cifs_sb; + struct cifs_tcon *tcon = oparms->tcon; + int remap = cifs_remap(cifs_sb); + const struct nls_table *nls = cifs_sb->local_nls; + int create_options = oparms->create_options; + int desired_access = oparms->desired_access; + int disposition = oparms->disposition; + const char *path = oparms->path; + +openRetry: + rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req, + (void **)&rsp); + if (rc) + return rc; + + /* no commands go after this */ + req->AndXCommand = 0xFF; + + if (req->hdr.Flags2 & SMBFLG2_UNICODE) { + /* account for one byte pad to word boundary */ + count = 1; + name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1), + path, PATH_MAX, nls, remap); + /* trailing null */ + name_len++; + name_len *= 2; + req->NameLength = cpu_to_le16(name_len); + } else { + /* BB improve check for buffer overruns BB */ + /* no pad */ + count = 0; + name_len = strnlen(path, PATH_MAX); + /* trailing null */ + name_len++; + req->NameLength = cpu_to_le16(name_len); + strncpy(req->fileName, path, name_len); + } + + if (*oplock & REQ_OPLOCK) + req->OpenFlags = cpu_to_le32(REQ_OPLOCK); + else if (*oplock & REQ_BATCHOPLOCK) + req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); + + req->DesiredAccess = cpu_to_le32(desired_access); + req->AllocationSize = 0; + + /* + * Set file as system file if special file such as fifo and server + * expecting SFU style and no Unix extensions. + */ + if (create_options & CREATE_OPTION_SPECIAL) + req->FileAttributes = cpu_to_le32(ATTR_SYSTEM); + else + req->FileAttributes = cpu_to_le32(ATTR_NORMAL); + + /* + * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case + * sensitive checks for other servers such as Samba. + */ + if (tcon->ses->capabilities & CAP_UNIX) + req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); + + if (create_options & CREATE_OPTION_READONLY) + req->FileAttributes |= cpu_to_le32(ATTR_READONLY); + + req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); + req->CreateDisposition = cpu_to_le32(disposition); + req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); + + /* BB Expirement with various impersonation levels and verify */ + req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); + req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY; + + count += name_len; + inc_rfc1001_len(req, count); + + req->ByteCount = cpu_to_le16(count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req, + (struct smb_hdr *)rsp, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); + if (rc) { + cifs_dbg(FYI, "Error in Open = %d\n", rc); + cifs_buf_release(req); + if (rc == -EAGAIN) + goto openRetry; + return rc; + } + + /* 1 byte no need to le_to_cpu */ + *oplock = rsp->OplockLevel; + /* cifs fid stays in le */ + oparms->fid->netfid = rsp->Fid; + + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction) + *oplock |= CIFS_CREATE_ACTION; + + if (buf) { + /* copy from CreationTime to Attributes */ + memcpy((char *)buf, (char *)&rsp->CreationTime, 36); + /* the file_info buf is endian converted by caller */ + buf->AllocationSize = rsp->AllocationSize; + buf->EndOfFile = rsp->EndOfFile; + buf->NumberOfLinks = cpu_to_le32(1); + buf->DeletePending = 0; + } + + cifs_buf_release(req); + return rc; +} + +/* + * Discard any remaining data in the current SMB. To do this, we borrow the + * current bigbuf. + */ +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + unsigned int rfclen = get_rfc1002_length(server->smallbuf); + int remaining = rfclen + 4 - server->total_read; + struct cifs_readdata *rdata = mid->callback_data; + + while (remaining > 0) { + int length; + + length = cifs_read_from_socket(server, server->bigbuf, + min_t(unsigned int, remaining, + CIFSMaxBufSize + MAX_HEADER_SIZE(server))); + if (length < 0) + return length; + server->total_read += length; + remaining -= length; + } + + dequeue_mid(mid, rdata->result); + return 0; +} + +int +cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length, len; + unsigned int data_offset, data_len; + struct cifs_readdata *rdata = mid->callback_data; + char *buf = server->smallbuf; + unsigned int buflen = get_rfc1002_length(buf) + 4; + + cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", + __func__, mid->mid, rdata->offset, rdata->bytes); + + /* + * read the rest of READ_RSP header (sans Data array), or whatever we + * can if there's not enough data. At this point, we've read down to + * the Mid. + */ + len = min_t(unsigned int, buflen, server->vals->read_rsp_size) - + HEADER_SIZE(server) + 1; + + rdata->iov.iov_base = buf + HEADER_SIZE(server) - 1; + rdata->iov.iov_len = len; + + length = cifs_readv_from_socket(server, &rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + + /* Was the SMB read successful? */ + rdata->result = server->ops->map_error(buf, false); + if (rdata->result != 0) { + cifs_dbg(FYI, "%s: server returned error %d\n", + __func__, rdata->result); + return cifs_readv_discard(server, mid); + } + + /* Is there enough to get to the rest of the READ_RSP header? */ + if (server->total_read < server->vals->read_rsp_size) { + cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n", + __func__, server->total_read, + server->vals->read_rsp_size); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + data_offset = server->ops->read_data_offset(buf) + 4; + if (data_offset < server->total_read) { + /* + * win2k8 sometimes sends an offset of 0 when the read + * is beyond the EOF. Treat it as if the data starts just after + * the header. + */ + cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n", + __func__, data_offset); + data_offset = server->total_read; + } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { + /* data_offset is beyond the end of smallbuf */ + cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n", + __func__, data_offset); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n", + __func__, server->total_read, data_offset); + + len = data_offset - server->total_read; + if (len > 0) { + /* read any junk before data into the rest of smallbuf */ + rdata->iov.iov_base = buf + server->total_read; + rdata->iov.iov_len = len; + length = cifs_readv_from_socket(server, &rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + } + + /* set up first iov for signature check */ + rdata->iov.iov_base = buf; + rdata->iov.iov_len = server->total_read; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov.iov_base, rdata->iov.iov_len); + + /* how much data is in the response? */ + data_len = server->ops->read_data_length(buf); + if (data_offset + data_len > buflen) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + length = rdata->read_into_pages(server, rdata, data_len); + if (length < 0) + return length; + + server->total_read += length; + + cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", + server->total_read, buflen, data_len); + + /* discard anything left over */ + if (server->total_read < buflen) + return cifs_readv_discard(server, mid); + + dequeue_mid(mid, false); + return length; +} + +static void +cifs_readv_callback(struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + struct smb_rqst rqst = { .rq_iov = &rdata->iov, + .rq_nvec = 1, + .rq_pages = rdata->pages, + .rq_npages = rdata->nr_pages, + .rq_pagesz = rdata->pagesz, + .rq_tailsz = rdata->tailsz }; + + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", + __func__, mid->mid, mid->mid_state, rdata->result, + rdata->bytes); + + switch (mid->mid_state) { + case MID_RESPONSE_RECEIVED: + /* result already set, check signature */ + if (server->sign) { + int rc = 0; + + rc = cifs_verify_signature(&rqst, server, + mid->sequence_number); + if (rc) + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); + } + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); + break; + default: + rdata->result = -EIO; + } + + queue_work(cifsiod_wq, &rdata->work); + DeleteMidQEntry(mid); + add_credits(server, 1, 0); +} + +/* cifs_async_readv - send an async write, and set up mid to handle result */ +int +cifs_async_readv(struct cifs_readdata *rdata) +{ + int rc; + READ_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct smb_rqst rqst = { .rq_iov = &rdata->iov, + .rq_nvec = 1 }; + + cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", + __func__, rdata->offset, rdata->bytes); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((rdata->offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb); + if (rc) + return rc; + + smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = rdata->cfile->fid.netfid; + smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); + if (wct == 12) + smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); + smb->Remaining = 0; + smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF); + smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16); + if (wct == 12) + smb->ByteCount = 0; + else { + /* old style read */ + struct smb_com_readx_req *smbr = + (struct smb_com_readx_req *)smb; + smbr->ByteCount = 0; + } + + /* 4 for RFC1001 length + 1 for BCC */ + rdata->iov.iov_base = smb; + rdata->iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + kref_get(&rdata->refcount); + rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, + cifs_readv_callback, rdata, 0); + + if (rc == 0) + cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); + else + kref_put(&rdata->refcount, cifs_readdata_release); + + cifs_small_buf_release(smb); + return rc; +} + +int +CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, char **buf, int *pbuf_type) +{ + int rc = -EACCES; + READ_REQ *pSMB = NULL; + READ_RSP *pSMBr = NULL; + char *pReadData = NULL; + int wct; + int resp_buf_type = 0; + struct kvec iov[1]; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = io_parms->tcon; + unsigned int count = io_parms->length; + + cifs_dbg(FYI, "Reading %d bytes on fid %d\n", count, netfid); + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + *nbytes = 0; + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 12) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + + pSMB->Remaining = 0; + pSMB->MaxCount = cpu_to_le16(count & 0xFFFF); + pSMB->MaxCountHigh = cpu_to_le32(count >> 16); + if (wct == 12) + pSMB->ByteCount = 0; /* no need to do le conversion since 0 */ + else { + /* old style read */ + struct smb_com_readx_req *pSMBW = + (struct smb_com_readx_req *)pSMB; + pSMBW->ByteCount = 0; + } + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, + &resp_buf_type, CIFS_LOG_ERROR); + cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); + pSMBr = (READ_RSP *)iov[0].iov_base; + if (rc) { + cifs_dbg(VFS, "Send error in read = %d\n", rc); + } else { + int data_length = le16_to_cpu(pSMBr->DataLengthHigh); + data_length = data_length << 16; + data_length += le16_to_cpu(pSMBr->DataLength); + *nbytes = data_length; + + /*check that DataLength would not go beyond end of SMB */ + if ((data_length > CIFSMaxBufSize) + || (data_length > count)) { + cifs_dbg(FYI, "bad length %d for count %d\n", + data_length, count); + rc = -EIO; + *nbytes = 0; + } else { + pReadData = (char *) (&pSMBr->hdr.Protocol) + + le16_to_cpu(pSMBr->DataOffset); +/* if (rc = copy_to_user(buf, pReadData, data_length)) { + cifs_dbg(VFS, "Faulting on read rc = %d\n",rc); + rc = -EFAULT; + }*/ /* can not use copy_to_user when using page cache*/ + if (*buf) + memcpy(*buf, pReadData, data_length); + } + } + +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + if (*buf) { + free_rsp_buf(resp_buf_type, iov[0].iov_base); + } else if (resp_buf_type != CIFS_NO_BUFFER) { + /* return buffer to caller to free */ + *buf = iov[0].iov_base; + if (resp_buf_type == CIFS_SMALL_BUFFER) + *pbuf_type = CIFS_SMALL_BUFFER; + else if (resp_buf_type == CIFS_LARGE_BUFFER) + *pbuf_type = CIFS_LARGE_BUFFER; + } /* else no valid buffer on return - leave as null */ + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; +} + + +int +CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, const char *buf, + const char __user *ubuf, const int long_op) +{ + int rc = -EACCES; + WRITE_REQ *pSMB = NULL; + WRITE_RSP *pSMBr = NULL; + int bytes_returned, wct; + __u32 bytes_sent; + __u16 byte_count; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = io_parms->tcon; + unsigned int count = io_parms->length; + + *nbytes = 0; + + /* cifs_dbg(FYI, "write at %lld %d bytes\n", offset, count);*/ + if (tcon->ses == NULL) + return -ECONNABORTED; + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 14; + else { + wct = 12; + if ((offset >> 32) > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + + rc = smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 14) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + + pSMB->Reserved = 0xFFFFFFFF; + pSMB->WriteMode = 0; + pSMB->Remaining = 0; + + /* Can increase buffer size if buffer is big enough in some cases ie we + can send more if LARGE_WRITE_X capability returned by the server and if + our buffer is big enough or if we convert to iovecs on socket writes + and eliminate the copy to the CIFS buffer */ + if (tcon->ses->capabilities & CAP_LARGE_WRITE_X) { + bytes_sent = min_t(const unsigned int, CIFSMaxBufSize, count); + } else { + bytes_sent = (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) + & ~0xFF; + } + + if (bytes_sent > count) + bytes_sent = count; + pSMB->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + if (buf) + memcpy(pSMB->Data, buf, bytes_sent); + else if (ubuf) { + if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) { + cifs_buf_release(pSMB); + return -EFAULT; + } + } else if (count != 0) { + /* No buffer */ + cifs_buf_release(pSMB); + return -EINVAL; + } /* else setting file size with write of zero bytes */ + if (wct == 14) + byte_count = bytes_sent + 1; /* pad */ + else /* wct == 12 */ + byte_count = bytes_sent + 5; /* bigger pad, smaller smb hdr */ + + pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF); + pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16); + inc_rfc1001_len(pSMB, byte_count); + + if (wct == 14) + pSMB->ByteCount = cpu_to_le16(byte_count); + else { /* old style write has byte count 4 bytes earlier + so 4 bytes pad */ + struct smb_com_writex_req *pSMBW = + (struct smb_com_writex_req *)pSMB; + pSMBW->ByteCount = cpu_to_le16(byte_count); + } + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, long_op); + cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); + if (rc) { + cifs_dbg(FYI, "Send error in write = %d\n", rc); + } else { + *nbytes = le16_to_cpu(pSMBr->CountHigh); + *nbytes = (*nbytes) << 16; + *nbytes += le16_to_cpu(pSMBr->Count); + + /* + * Mask off high 16 bits when bytes written as returned by the + * server is greater than bytes requested by the client. Some + * OS/2 servers are known to set incorrect CountHigh values. + */ + if (*nbytes > count) + *nbytes &= 0xFFFF; + } + + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +void +cifs_writedata_release(struct kref *refcount) +{ + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); + + if (wdata->cfile) + cifsFileInfo_put(wdata->cfile); + + kfree(wdata); +} + +/* + * Write failed with a retryable error. Resend the write request. It's also + * possible that the page was redirtied so re-clean the page. + */ +static void +cifs_writev_requeue(struct cifs_writedata *wdata) +{ + int i, rc = 0; + struct inode *inode = d_inode(wdata->cfile->dentry); + struct TCP_Server_Info *server; + unsigned int rest_len; + + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + i = 0; + rest_len = wdata->bytes; + do { + struct cifs_writedata *wdata2; + unsigned int j, nr_pages, wsize, tailsz, cur_len; + + wsize = server->ops->wp_retry_size(inode); + if (wsize < rest_len) { + nr_pages = wsize / PAGE_CACHE_SIZE; + if (!nr_pages) { + rc = -ENOTSUPP; + break; + } + cur_len = nr_pages * PAGE_CACHE_SIZE; + tailsz = PAGE_CACHE_SIZE; + } else { + nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE); + cur_len = rest_len; + tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE; + } + + wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + if (!wdata2) { + rc = -ENOMEM; + break; + } + + for (j = 0; j < nr_pages; j++) { + wdata2->pages[j] = wdata->pages[i + j]; + lock_page(wdata2->pages[j]); + clear_page_dirty_for_io(wdata2->pages[j]); + } + + wdata2->sync_mode = wdata->sync_mode; + wdata2->nr_pages = nr_pages; + wdata2->offset = page_offset(wdata2->pages[0]); + wdata2->pagesz = PAGE_CACHE_SIZE; + wdata2->tailsz = tailsz; + wdata2->bytes = cur_len; + + wdata2->cfile = find_writable_file(CIFS_I(inode), false); + if (!wdata2->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + break; + } + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, cifs_writedata_release); + + for (j = 0; j < nr_pages; j++) { + unlock_page(wdata2->pages[j]); + if (rc != 0 && rc != -EAGAIN) { + SetPageError(wdata2->pages[j]); + end_page_writeback(wdata2->pages[j]); + page_cache_release(wdata2->pages[j]); + } + } + + if (rc) { + kref_put(&wdata2->refcount, cifs_writedata_release); + if (rc == -EAGAIN) + continue; + break; + } + + rest_len -= cur_len; + i += nr_pages; + } while (i < wdata->nr_pages); + + mapping_set_error(inode->i_mapping, rc); + kref_put(&wdata->refcount, cifs_writedata_release); +} + +void +cifs_writev_complete(struct work_struct *work) +{ + struct cifs_writedata *wdata = container_of(work, + struct cifs_writedata, work); + struct inode *inode = d_inode(wdata->cfile->dentry); + int i = 0; + + if (wdata->result == 0) { + spin_lock(&inode->i_lock); + cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes); + spin_unlock(&inode->i_lock); + cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink), + wdata->bytes); + } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) + return cifs_writev_requeue(wdata); + + for (i = 0; i < wdata->nr_pages; i++) { + struct page *page = wdata->pages[i]; + if (wdata->result == -EAGAIN) + __set_page_dirty_nobuffers(page); + else if (wdata->result < 0) + SetPageError(page); + end_page_writeback(page); + page_cache_release(page); + } + if (wdata->result != -EAGAIN) + mapping_set_error(inode->i_mapping, wdata->result); + kref_put(&wdata->refcount, cifs_writedata_release); +} + +struct cifs_writedata * +cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) +{ + struct cifs_writedata *wdata; + + /* writedata + number of page pointers */ + wdata = kzalloc(sizeof(*wdata) + + sizeof(struct page *) * nr_pages, GFP_NOFS); + if (wdata != NULL) { + kref_init(&wdata->refcount); + INIT_LIST_HEAD(&wdata->list); + init_completion(&wdata->done); + INIT_WORK(&wdata->work, complete); + } + return wdata; +} + +/* + * Check the mid_state and signature on received buffer (if any), and queue the + * workqueue completion task. + */ +static void +cifs_writev_callback(struct mid_q_entry *mid) +{ + struct cifs_writedata *wdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + unsigned int written; + WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; + + switch (mid->mid_state) { + case MID_RESPONSE_RECEIVED: + wdata->result = cifs_check_receive(mid, tcon->ses->server, 0); + if (wdata->result != 0) + break; + + written = le16_to_cpu(smb->CountHigh); + written <<= 16; + written += le16_to_cpu(smb->Count); + /* + * Mask off high 16 bits when bytes written as returned + * by the server is greater than bytes requested by the + * client. OS/2 servers are known to set incorrect + * CountHigh values. + */ + if (written > wdata->bytes) + written &= 0xFFFF; + + if (written < wdata->bytes) + wdata->result = -ENOSPC; + else + wdata->bytes = written; + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + wdata->result = -EAGAIN; + break; + default: + wdata->result = -EIO; + break; + } + + queue_work(cifsiod_wq, &wdata->work); + DeleteMidQEntry(mid); + add_credits(tcon->ses->server, 1, 0); +} + +/* cifs_async_writev - send an async write, and set up mid to handle result */ +int +cifs_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)) +{ + int rc = -EACCES; + WRITE_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct kvec iov; + struct smb_rqst rqst = { }; + + if (tcon->ses->capabilities & CAP_LARGE_FILES) { + wct = 14; + } else { + wct = 12; + if (wdata->offset >> 32 > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **)&smb); + if (rc) + goto async_writev_out; + + smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = wdata->cfile->fid.netfid; + smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF); + if (wct == 14) + smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32); + smb->Reserved = 0xFFFFFFFF; + smb->WriteMode = 0; + smb->Remaining = 0; + + smb->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + + /* 4 for RFC1001 length + 1 for BCC */ + iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1; + iov.iov_base = smb; + + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + rqst.rq_pages = wdata->pages; + rqst.rq_npages = wdata->nr_pages; + rqst.rq_pagesz = wdata->pagesz; + rqst.rq_tailsz = wdata->tailsz; + + cifs_dbg(FYI, "async write at %llu %u bytes\n", + wdata->offset, wdata->bytes); + + smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF); + smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16); + + if (wct == 14) { + inc_rfc1001_len(&smb->hdr, wdata->bytes + 1); + put_bcc(wdata->bytes + 1, &smb->hdr); + } else { + /* wct == 12 */ + struct smb_com_writex_req *smbw = + (struct smb_com_writex_req *)smb; + inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5); + put_bcc(wdata->bytes + 5, &smbw->hdr); + iov.iov_len += 4; /* pad bigger by four bytes */ + } + + kref_get(&wdata->refcount); + rc = cifs_call_async(tcon->ses->server, &rqst, NULL, + cifs_writev_callback, wdata, 0); + + if (rc == 0) + cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); + else + kref_put(&wdata->refcount, release); + +async_writev_out: + cifs_small_buf_release(smb); + return rc; +} + +int +CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, struct kvec *iov, int n_vec) +{ + int rc = -EACCES; + WRITE_REQ *pSMB = NULL; + int wct; + int smb_hdr_len; + int resp_buf_type = 0; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = io_parms->tcon; + unsigned int count = io_parms->length; + + *nbytes = 0; + + cifs_dbg(FYI, "write2 at %lld %d bytes\n", (long long)offset, count); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) { + wct = 14; + } else { + wct = 12; + if ((offset >> 32) > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 14) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + pSMB->Reserved = 0xFFFFFFFF; + pSMB->WriteMode = 0; + pSMB->Remaining = 0; + + pSMB->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + + pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF); + pSMB->DataLengthHigh = cpu_to_le16(count >> 16); + /* header + 1 byte pad */ + smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1; + if (wct == 14) + inc_rfc1001_len(pSMB, count + 1); + else /* wct == 12 */ + inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */ + if (wct == 14) + pSMB->ByteCount = cpu_to_le16(count + 1); + else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ { + struct smb_com_writex_req *pSMBW = + (struct smb_com_writex_req *)pSMB; + pSMBW->ByteCount = cpu_to_le16(count + 5); + } + iov[0].iov_base = pSMB; + if (wct == 14) + iov[0].iov_len = smb_hdr_len + 4; + else /* wct == 12 pad bigger by four bytes */ + iov[0].iov_len = smb_hdr_len + 8; + + + rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); + if (rc) { + cifs_dbg(FYI, "Send error Write2 = %d\n", rc); + } else if (resp_buf_type == 0) { + /* presumably this can not happen, but best to be safe */ + rc = -EIO; + } else { + WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; + *nbytes = le16_to_cpu(pSMBr->CountHigh); + *nbytes = (*nbytes) << 16; + *nbytes += le16_to_cpu(pSMBr->Count); + + /* + * Mask off high 16 bits when bytes written as returned by the + * server is greater than bytes requested by the client. OS/2 + * servers are known to set incorrect CountHigh values. + */ + if (*nbytes > count) + *nbytes &= 0xFFFF; + } + +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + free_rsp_buf(resp_buf_type, iov[0].iov_base); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon, + const __u16 netfid, const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; + struct kvec iov[2]; + int resp_buf_type; + __u16 count; + + cifs_dbg(FYI, "cifs_lockv num lock %d num unlock %d\n", + num_lock, num_unlock); + + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->Timeout = 0; + pSMB->NumberOfLocks = cpu_to_le16(num_lock); + pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock); + pSMB->LockType = lock_type; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; /* netfid stays le */ + + count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 - + (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + iov[1].iov_base = (char *)buf; + iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + + cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + if (rc) + cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc); + + return rc; +} + +int +CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const __u32 netpid, const __u64 len, + const __u64 offset, const __u32 numUnlock, + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; +/* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */ + int bytes_returned; + int flags = 0; + __u16 count; + + cifs_dbg(FYI, "CIFSSMBLock timeout %d numLock %d\n", + (int)waitFlag, numLock); + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + + if (rc) + return rc; + + if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) { + /* no response expected */ + flags = CIFS_ASYNC_OP | CIFS_OBREAK_OP; + pSMB->Timeout = 0; + } else if (waitFlag) { + flags = CIFS_BLOCKING_OP; /* blocking operation, no timeout */ + pSMB->Timeout = cpu_to_le32(-1);/* blocking - do not time out */ + } else { + pSMB->Timeout = 0; + } + + pSMB->NumberOfLocks = cpu_to_le16(numLock); + pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); + pSMB->LockType = lockType; + pSMB->OplockLevel = oplock_level; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = smb_file_id; /* netfid stays le */ + + if ((numLock != 0) || (numUnlock != 0)) { + pSMB->Locks[0].Pid = cpu_to_le16(netpid); + /* BB where to store pid high? */ + pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); + pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); + pSMB->Locks[0].OffsetLow = cpu_to_le32((u32)offset); + pSMB->Locks[0].OffsetHigh = cpu_to_le32((u32)(offset>>32)); + count = sizeof(LOCKING_ANDX_RANGE); + } else { + /* oplock break */ + count = 0; + } + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + if (waitFlag) { + rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMB, &bytes_returned); + cifs_small_buf_release(pSMB); + } else { + rc = SendReceiveNoRsp(xid, tcon->ses, (char *)pSMB, flags); + /* SMB buffer freed by function above */ + } + cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); + if (rc) + cifs_dbg(FYI, "Send error in Lock = %d\n", rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; +} + +int +CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const __u32 netpid, + const loff_t start_offset, const __u64 len, + struct file_lock *pLockData, const __u16 lock_type, + const bool waitFlag) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; + struct cifs_posix_lock *parm_data; + int rc = 0; + int timeout = 0; + int bytes_returned = 0; + int resp_buf_type = 0; + __u16 params, param_offset, offset, byte_count, count; + struct kvec iov[1]; + + cifs_dbg(FYI, "Posix Lock\n"); + + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB; + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + count = sizeof(struct cifs_posix_lock); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */ + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + if (pLockData) + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + else + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + parm_data = (struct cifs_posix_lock *) + (((char *) &pSMB->hdr.Protocol) + offset); + + parm_data->lock_type = cpu_to_le16(lock_type); + if (waitFlag) { + timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */ + parm_data->lock_flags = cpu_to_le16(1); + pSMB->Timeout = cpu_to_le32(-1); + } else + pSMB->Timeout = 0; + + parm_data->pid = cpu_to_le32(netpid); + parm_data->start = cpu_to_le64(start_offset); + parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ + + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = smb_file_id; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + if (waitFlag) { + rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned); + } else { + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, + &resp_buf_type, timeout); + pSMB = NULL; /* request buf already freed by SendReceive2. Do + not try to free it twice below on exit */ + pSMBr = (struct smb_com_transaction2_sfi_rsp *)iov[0].iov_base; + } + + if (rc) { + cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc); + } else if (pLockData) { + /* lock structure can be returned on get */ + __u16 data_offset; + __u16 data_count; + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) { + rc = -EIO; /* bad smb */ + goto plk_err_exit; + } + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + data_count = le16_to_cpu(pSMBr->t2.DataCount); + if (data_count < sizeof(struct cifs_posix_lock)) { + rc = -EIO; + goto plk_err_exit; + } + parm_data = (struct cifs_posix_lock *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) + pLockData->fl_type = F_UNLCK; + else { + if (parm_data->lock_type == + cpu_to_le16(CIFS_RDLCK)) + pLockData->fl_type = F_RDLCK; + else if (parm_data->lock_type == + cpu_to_le16(CIFS_WRLCK)) + pLockData->fl_type = F_WRLCK; + + pLockData->fl_start = le64_to_cpu(parm_data->start); + pLockData->fl_end = pLockData->fl_start + + le64_to_cpu(parm_data->length) - 1; + pLockData->fl_pid = le32_to_cpu(parm_data->pid); + } + } + +plk_err_exit: + if (pSMB) + cifs_small_buf_release(pSMB); + + free_rsp_buf(resp_buf_type, iov[0].iov_base); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + + +int +CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) +{ + int rc = 0; + CLOSE_REQ *pSMB = NULL; + cifs_dbg(FYI, "In CIFSSMBClose\n"); + +/* do not retry on dead session on close */ + rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); + if (rc == -EAGAIN) + return 0; + if (rc) + return rc; + + pSMB->FileID = (__u16) smb_file_id; + pSMB->LastWriteTime = 0xFFFFFFFF; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_closes); + if (rc) { + if (rc != -EINTR) { + /* EINTR is expected when user ctl-c to kill app */ + cifs_dbg(VFS, "Send error in Close = %d\n", rc); + } + } + + /* Since session is dead, file will be closed on server already */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +int +CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) +{ + int rc = 0; + FLUSH_REQ *pSMB = NULL; + cifs_dbg(FYI, "In CIFSSMBFlush\n"); + + rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->FileID = (__u16) smb_file_id; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes); + if (rc) + cifs_dbg(VFS, "Send error in Flush = %d\n", rc); + + return rc; +} + +int +CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) +{ + int rc = 0; + RENAME_REQ *pSMB = NULL; + RENAME_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + int remap = cifs_remap(cifs_sb); + + cifs_dbg(FYI, "In CIFSSMBRename\n"); +renameRetry: + rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->BufferFormat = 0x04; + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName, + from_name, PATH_MAX, + cifs_sb->local_nls, remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->OldFileName[name_len] = 0x04; /* pad */ + /* protocol requires ASCII signature byte on Unicode string */ + pSMB->OldFileName[name_len + 1] = 0x00; + name_len2 = + cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], + to_name, PATH_MAX, cifs_sb->local_nls, + remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(from_name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, from_name, name_len); + name_len2 = strnlen(to_name, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* 1st signature byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_renames); + if (rc) + cifs_dbg(FYI, "Send error in rename = %d\n", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto renameRetry; + + return rc; +} + +int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, + int netfid, const char *target_name, + const struct nls_table *nls_codepage, int remap) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; + struct set_file_rename *rename_info; + char *data_offset; + char dummy_string[30]; + int rc = 0; + int bytes_returned = 0; + int len_of_str; + __u16 params, param_offset, offset, count, byte_count; + + cifs_dbg(FYI, "Rename to File by handle\n"); + rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + rename_info = (struct set_file_rename *) data_offset; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */ + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + /* construct random name ".cifs_tmp" */ + rename_info->overwrite = cpu_to_le32(1); + rename_info->root_fid = 0; + /* unicode only call */ + if (target_name == NULL) { + sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid); + len_of_str = + cifsConvertToUTF16((__le16 *)rename_info->target_name, + dummy_string, 24, nls_codepage, remap); + } else { + len_of_str = + cifsConvertToUTF16((__le16 *)rename_info->target_name, + target_name, PATH_MAX, nls_codepage, + remap); + } + rename_info->target_name_len = cpu_to_le32(2 * len_of_str); + count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); + byte_count += count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->Fid = netfid; + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&pTcon->stats.cifs_stats.num_t2renames); + if (rc) + cifs_dbg(FYI, "Send error in Rename (by file handle) = %d\n", + rc); + + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBCopy(const unsigned int xid, struct cifs_tcon *tcon, + const char *fromName, const __u16 target_tid, const char *toName, + const int flags, const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + COPY_REQ *pSMB = NULL; + COPY_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + + cifs_dbg(FYI, "In CIFSSMBCopy\n"); +copyRetry: + rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->BufferFormat = 0x04; + pSMB->Tid2 = target_tid; + + pSMB->Flags = cpu_to_le16(flags & COPY_TREE); + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName, + fromName, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->OldFileName[name_len] = 0x04; /* pad */ + /* protocol requires ASCII signature byte on Unicode string */ + pSMB->OldFileName[name_len + 1] = 0x00; + name_len2 = + cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], + toName, PATH_MAX, nls_codepage, remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, fromName, name_len); + name_len2 = strnlen(toName, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* 1st signature byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in copy = %d with %d files copied\n", + rc, le16_to_cpu(pSMBr->CopyCount)); + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto copyRetry; + + return rc; +} + +int +CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + char *data_offset; + int name_len; + int name_len_target; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cifs_dbg(FYI, "In Symlink Unix style\n"); +createSymLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, fromName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fromName, name_len); + } + params = 6 + name_len; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len_target = + cifsConvertToUTF16((__le16 *) data_offset, toName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage, remap); + name_len_target++; /* trailing null */ + name_len_target *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len_target = strnlen(toName, PATH_MAX); + name_len_target++; /* trailing null */ + strncpy(data_offset, toName, name_len_target); + } + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max on data count below from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + name_len_target; + pSMB->DataCount = cpu_to_le16(name_len_target); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_symlinks); + if (rc) + cifs_dbg(FYI, "Send error in SetPathInfo create symlink = %d\n", + rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto createSymLinkRetry; + + return rc; +} + +int +CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + char *data_offset; + int name_len; + int name_len_target; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cifs_dbg(FYI, "In Create Hard link Unix style\n"); +createHardLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(toName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, toName, name_len); + } + params = 6 + name_len; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len_target = + cifsConvertToUTF16((__le16 *) data_offset, fromName, + PATH_MAX, nls_codepage, remap); + name_len_target++; /* trailing null */ + name_len_target *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len_target = strnlen(fromName, PATH_MAX); + name_len_target++; /* trailing null */ + strncpy(data_offset, fromName, name_len_target); + } + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max on data count below from sess*/ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + name_len_target; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->DataCount = cpu_to_le16(name_len_target); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks); + if (rc) + cifs_dbg(FYI, "Send error in SetPathInfo (hard link) = %d\n", + rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto createHardLinkRetry; + + return rc; +} + +int +CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) +{ + int rc = 0; + NT_RENAME_REQ *pSMB = NULL; + RENAME_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + int remap = cifs_remap(cifs_sb); + + cifs_dbg(FYI, "In CIFSCreateHardLink\n"); +winCreateHardLinkRetry: + + rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + pSMB->Flags = cpu_to_le16(CREATE_HARD_LINK); + pSMB->ClusterCount = 0; + + pSMB->BufferFormat = 0x04; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->OldFileName, from_name, + PATH_MAX, cifs_sb->local_nls, remap); + name_len++; /* trailing null */ + name_len *= 2; + + /* protocol specifies ASCII buffer format (0x04) for unicode */ + pSMB->OldFileName[name_len] = 0x04; + pSMB->OldFileName[name_len + 1] = 0x00; /* pad */ + name_len2 = + cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], + to_name, PATH_MAX, cifs_sb->local_nls, + remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(from_name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, from_name, name_len); + name_len2 = strnlen(to_name, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* string type byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks); + if (rc) + cifs_dbg(FYI, "Send error in hard link (NT rename) = %d\n", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto winCreateHardLinkRetry; + + return rc; +} + +int +CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage, int remap) +{ +/* SMB_QUERY_FILE_UNIX_LINK */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + char *data_start; + + cifs_dbg(FYI, "In QPathSymLinkInfo (Unix) for path %s\n", searchName); + +querySymLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QuerySymLinkInfo = %d\n", rc); + } else { + /* decode response */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + rc = -EIO; + else { + bool is_unicode; + u16 count = le16_to_cpu(pSMBr->t2.DataCount); + + data_start = ((char *) &pSMBr->hdr.Protocol) + + le16_to_cpu(pSMBr->t2.DataOffset); + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + + /* BB FIXME investigate remapping reserved chars here */ + *symlinkinfo = cifs_strndup_from_utf16(data_start, + count, is_unicode, nls_codepage); + if (!*symlinkinfo) + rc = -ENOMEM; + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto querySymLinkRetry; + return rc; +} + +/* + * Recent Windows versions now create symlinks more frequently + * and they use the "reparse point" mechanism below. We can of course + * do symlinks nicely to Samba and other servers which support the + * CIFS Unix Extensions and we can also do SFU symlinks and "client only" + * "MF" symlinks optionally, but for recent Windows we really need to + * reenable the code below and fix the cifs_symlink callers to handle this. + * In the interim this code has been moved to its own config option so + * it is not compiled in by default until callers fixed up and more tested. + */ +int +CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid, char **symlinkinfo, + const struct nls_table *nls_codepage) +{ + int rc = 0; + int bytes_returned; + struct smb_com_transaction_ioctl_req *pSMB; + struct smb_com_transaction_ioctl_rsp *pSMBr; + bool is_unicode; + unsigned int sub_len; + char *sub_start; + struct reparse_symlink_data *reparse_buf; + struct reparse_posix_data *posix_buf; + __u32 data_offset, data_count; + char *end_of_smb; + + cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->TotalParameterCount = 0 ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le32(2); + /* BB find exact data count max from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 4; + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT); + pSMB->IsFsctl = 1; /* FSCTL */ + pSMB->IsRootFlag = 0; + pSMB->Fid = fid; /* file handle always le */ + pSMB->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc); + goto qreparse_out; + } + + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { + /* BB also check enough total bytes returned */ + rc = -EIO; /* bad smb */ + goto qreparse_out; + } + if (!data_count || (data_count > 2048)) { + rc = -EIO; + cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); + goto qreparse_out; + } + end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; + reparse_buf = (struct reparse_symlink_data *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + if ((char *)reparse_buf >= end_of_smb) { + rc = -EIO; + goto qreparse_out; + } + if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) { + cifs_dbg(FYI, "NFS style reparse tag\n"); + posix_buf = (struct reparse_posix_data *)reparse_buf; + + if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) { + cifs_dbg(FYI, "unsupported file type 0x%llx\n", + le64_to_cpu(posix_buf->InodeType)); + rc = -EOPNOTSUPP; + goto qreparse_out; + } + is_unicode = true; + sub_len = le16_to_cpu(reparse_buf->ReparseDataLength); + if (posix_buf->PathBuffer + sub_len > end_of_smb) { + cifs_dbg(FYI, "reparse buf beyond SMB\n"); + rc = -EIO; + goto qreparse_out; + } + *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer, + sub_len, is_unicode, nls_codepage); + goto qreparse_out; + } else if (reparse_buf->ReparseTag != + cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) { + rc = -EOPNOTSUPP; + goto qreparse_out; + } + + /* Reparse tag is NTFS symlink */ + sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) + + reparse_buf->PathBuffer; + sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength); + if (sub_start + sub_len > end_of_smb) { + cifs_dbg(FYI, "reparse buf beyond SMB\n"); + rc = -EIO; + goto qreparse_out; + } + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + + /* BB FIXME investigate remapping reserved chars here */ + *symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode, + nls_codepage); + if (!*symlinkinfo) + rc = -ENOMEM; +qreparse_out: + cifs_buf_release(pSMB); + + /* + * Note: On -EAGAIN error only caller can retry on handle based calls + * since file handle passed in no longer valid. + */ + return rc; +} + +int +CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid) +{ + int rc = 0; + int bytes_returned; + struct smb_com_transaction_compr_ioctl_req *pSMB; + struct smb_com_transaction_ioctl_rsp *pSMBr; + + cifs_dbg(FYI, "Set compression for %u\n", fid); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + + pSMB->TotalParameterCount = 0; + pSMB->TotalDataCount = cpu_to_le32(2); + pSMB->MaxParameterCount = 0; + pSMB->MaxDataCount = 0; + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = cpu_to_le32(2); + pSMB->DataOffset = + cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, + compression_state) - 4); /* 84 */ + pSMB->SetupCount = 4; + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->ParameterCount = 0; + pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->IsFsctl = 1; /* FSCTL */ + pSMB->IsRootFlag = 0; + pSMB->Fid = fid; /* file handle always le */ + /* 3 byte pad, followed by 2 byte compress state */ + pSMB->ByteCount = cpu_to_le16(5); + inc_rfc1001_len(pSMB, 5); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Send error in SetCompression = %d\n", rc); + + cifs_buf_release(pSMB); + + /* + * Note: On -EAGAIN error only caller can retry on handle based calls + * since file handle passed in no longer valid. + */ + return rc; +} + + +#ifdef CONFIG_CIFS_POSIX + +/*Convert an Access Control Entry from wire format to local POSIX xattr format*/ +static void cifs_convert_ace(posix_acl_xattr_entry *ace, + struct cifs_posix_ace *cifs_ace) +{ + /* u8 cifs fields do not need le conversion */ + ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); + ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); + ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); +/* + cifs_dbg(FYI, "perm %d tag %d id %d\n", + ace->e_perm, ace->e_tag, ace->e_id); +*/ + + return; +} + +/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */ +static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, + const int acl_type, const int size_of_data_area) +{ + int size = 0; + int i; + __u16 count; + struct cifs_posix_ace *pACE; + struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; + posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt; + + if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) + return -EOPNOTSUPP; + + if (acl_type & ACL_TYPE_ACCESS) { + count = le16_to_cpu(cifs_acl->access_entry_count); + pACE = &cifs_acl->ace_array[0]; + size = sizeof(struct cifs_posix_acl); + size += sizeof(struct cifs_posix_ace) * count; + /* check if we would go beyond end of SMB */ + if (size_of_data_area < size) { + cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n", + size_of_data_area, size); + return -EINVAL; + } + } else if (acl_type & ACL_TYPE_DEFAULT) { + count = le16_to_cpu(cifs_acl->access_entry_count); + size = sizeof(struct cifs_posix_acl); + size += sizeof(struct cifs_posix_ace) * count; +/* skip past access ACEs to get to default ACEs */ + pACE = &cifs_acl->ace_array[count]; + count = le16_to_cpu(cifs_acl->default_entry_count); + size += sizeof(struct cifs_posix_ace) * count; + /* check if we would go beyond end of SMB */ + if (size_of_data_area < size) + return -EINVAL; + } else { + /* illegal type */ + return -EINVAL; + } + + size = posix_acl_xattr_size(count); + if ((buflen == 0) || (local_acl == NULL)) { + /* used to query ACL EA size */ + } else if (size > buflen) { + return -ERANGE; + } else /* buffer big enough */ { + local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + for (i = 0; i < count ; i++) { + cifs_convert_ace(&local_acl->a_entries[i], pACE); + pACE++; + } + } + return size; +} + +static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, + const posix_acl_xattr_entry *local_ace) +{ + __u16 rc = 0; /* 0 = ACL converted ok */ + + cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm); + cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag); + /* BB is there a better way to handle the large uid? */ + if (local_ace->e_id == cpu_to_le32(-1)) { + /* Probably no need to le convert -1 on any arch but can not hurt */ + cifs_ace->cifs_uid = cpu_to_le64(-1); + } else + cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); +/* + cifs_dbg(FYI, "perm %d tag %d id %d\n", + ace->e_perm, ace->e_tag, ace->e_id); +*/ + return rc; +} + +/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */ +static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, + const int buflen, const int acl_type) +{ + __u16 rc = 0; + struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; + posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL; + int count; + int i; + + if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL)) + return 0; + + count = posix_acl_xattr_count((size_t)buflen); + cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n", + count, buflen, le32_to_cpu(local_acl->a_version)); + if (le32_to_cpu(local_acl->a_version) != 2) { + cifs_dbg(FYI, "unknown POSIX ACL version %d\n", + le32_to_cpu(local_acl->a_version)); + return 0; + } + cifs_acl->version = cpu_to_le16(1); + if (acl_type == ACL_TYPE_ACCESS) { + cifs_acl->access_entry_count = cpu_to_le16(count); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); + } else if (acl_type == ACL_TYPE_DEFAULT) { + cifs_acl->default_entry_count = cpu_to_le16(count); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); + } else { + cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); + return 0; + } + for (i = 0; i < count; i++) { + rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], + &local_acl->a_entries[i]); + if (rc != 0) { + /* ACE not converted */ + break; + } + } + if (rc == 0) { + rc = (__u16)(count * sizeof(struct cifs_posix_ace)); + rc += sizeof(struct cifs_posix_acl); + /* BB add check to make sure ACL does not overflow SMB */ + } + return rc; +} + +int +CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + char *acl_inf, const int buflen, const int acl_type, + const struct nls_table *nls_codepage, int remap) +{ +/* SMB_QUERY_POSIX_ACL */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + + cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName); + +queryAclRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->FileName[name_len] = 0; + pSMB->FileName[name_len+1] = 0; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max data count below from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_qpi_req, + InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); + if (rc) { + cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc); + } else { + /* decode response */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + rc = cifs_copy_posix_acl(acl_inf, + (char *)&pSMBr->hdr.Protocol+data_offset, + buflen, acl_type, count); + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto queryAclRetry; + return rc; +} + +int +CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const char *local_acl, const int buflen, + const int acl_type, + const struct nls_table *nls_codepage, int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + char *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count, data_count, param_offset, offset; + + cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName); +setAclRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + params = 6 + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB size from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + parm_data = ((char *) &pSMB->hdr.Protocol) + offset; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + + /* convert to on the wire format for POSIX ACL */ + data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type); + + if (data_count == 0) { + rc = -EOPNOTSUPP; + goto setACLerrorExit; + } + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_ACL); + byte_count = 3 /* pad */ + params + data_count; + pSMB->DataCount = cpu_to_le16(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc); + +setACLerrorExit: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setAclRetry; + return rc; +} + +/* BB fix tabs in this function FIXME BB */ +int +CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, + const int netfid, __u64 *pExtAttrBits, __u64 *pMask) +{ + int rc = 0; + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int bytes_returned; + __u16 params, byte_count; + + cifs_dbg(FYI, "In GetExtAttr\n"); + if (tcon == NULL) + return -ENODEV; + +GetExtAttrRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(4000); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS); + pSMB->Pad = 0; + pSMB->Fid = netfid; + inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "error %d in GetExtAttr\n", rc); + } else { + /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + /* If rc should we check for EOPNOSUPP and + disable the srvino flag? or in caller? */ + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + struct file_chattr_info *pfinfo; + /* BB Do we need a cast or hash here ? */ + if (count != 16) { + cifs_dbg(FYI, "Illegal size ret in GetExtAttr\n"); + rc = -EIO; + goto GetExtAttrOut; + } + pfinfo = (struct file_chattr_info *) + (data_offset + (char *) &pSMBr->hdr.Protocol); + *pExtAttrBits = le64_to_cpu(pfinfo->mode); + *pMask = le64_to_cpu(pfinfo->mask); + } + } +GetExtAttrOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto GetExtAttrRetry; + return rc; +} + +#endif /* CONFIG_POSIX */ + +#ifdef CONFIG_CIFS_ACL +/* + * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that + * all NT TRANSACTS that we init here have total parm and data under about 400 + * bytes (to fit in small cifs buffer size), which is the case so far, it + * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of + * returned setup area) and MaxParameterCount (returned parms size) must be set + * by caller + */ +static int +smb_init_nttransact(const __u16 sub_command, const int setup_count, + const int parm_len, struct cifs_tcon *tcon, + void **ret_buf) +{ + int rc; + __u32 temp_offset; + struct smb_com_ntransact_req *pSMB; + + rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, + (void **)&pSMB); + if (rc) + return rc; + *ret_buf = (void *)pSMB; + pSMB->Reserved = 0; + pSMB->TotalParameterCount = cpu_to_le32(parm_len); + pSMB->TotalDataCount = 0; + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->DataCount = pSMB->TotalDataCount; + temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + + (setup_count * 2) - 4 /* for rfc1001 length itself */; + pSMB->ParameterOffset = cpu_to_le32(temp_offset); + pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); + pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ + pSMB->SubCommand = cpu_to_le16(sub_command); + return 0; +} + +static int +validate_ntransact(char *buf, char **ppparm, char **ppdata, + __u32 *pparmlen, __u32 *pdatalen) +{ + char *end_of_smb; + __u32 data_count, data_offset, parm_count, parm_offset; + struct smb_com_ntransact_rsp *pSMBr; + u16 bcc; + + *pdatalen = 0; + *pparmlen = 0; + + if (buf == NULL) + return -EINVAL; + + pSMBr = (struct smb_com_ntransact_rsp *)buf; + + bcc = get_bcc(&pSMBr->hdr); + end_of_smb = 2 /* sizeof byte count */ + bcc + + (char *)&pSMBr->ByteCount; + + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + parm_offset = le32_to_cpu(pSMBr->ParameterOffset); + parm_count = le32_to_cpu(pSMBr->ParameterCount); + + *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; + *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; + + /* should we also check that parm and data areas do not overlap? */ + if (*ppparm > end_of_smb) { + cifs_dbg(FYI, "parms start after end of smb\n"); + return -EINVAL; + } else if (parm_count + *ppparm > end_of_smb) { + cifs_dbg(FYI, "parm end after end of smb\n"); + return -EINVAL; + } else if (*ppdata > end_of_smb) { + cifs_dbg(FYI, "data starts after end of smb\n"); + return -EINVAL; + } else if (data_count + *ppdata > end_of_smb) { + cifs_dbg(FYI, "data %p + count %d (%p) past smb end %p start %p\n", + *ppdata, data_count, (data_count + *ppdata), + end_of_smb, pSMBr); + return -EINVAL; + } else if (parm_count + data_count > bcc) { + cifs_dbg(FYI, "parm count and data count larger than SMB\n"); + return -EINVAL; + } + *pdatalen = data_count; + *pparmlen = parm_count; + return 0; +} + +/* Get Security Descriptor (by handle) from remote server for a file or dir */ +int +CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, + struct cifs_ntsd **acl_inf, __u32 *pbuflen) +{ + int rc = 0; + int buf_type = 0; + QUERY_SEC_DESC_REQ *pSMB; + struct kvec iov[1]; + + cifs_dbg(FYI, "GetCifsACL\n"); + + *pbuflen = 0; + *acl_inf = NULL; + + rc = smb_init_nttransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0, + 8 /* parm len */, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->MaxParameterCount = cpu_to_le32(4); + /* BB TEST with big acls that might need to be e.g. larger than 16K */ + pSMB->MaxSetupCount = 0; + pSMB->Fid = fid; /* file handle always le */ + pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | + CIFS_ACL_DACL); + pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ + inc_rfc1001_len(pSMB, 11); + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, + 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); + if (rc) { + cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc); + } else { /* decode response */ + __le32 *parm; + __u32 parm_len; + __u32 acl_len; + struct smb_com_ntransact_rsp *pSMBr; + char *pdata; + +/* validate_nttransact */ + rc = validate_ntransact(iov[0].iov_base, (char **)&parm, + &pdata, &parm_len, pbuflen); + if (rc) + goto qsec_out; + pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; + + cifs_dbg(FYI, "smb %p parm %p data %p\n", + pSMBr, parm, *acl_inf); + + if (le32_to_cpu(pSMBr->ParameterCount) != 4) { + rc = -EIO; /* bad smb */ + *pbuflen = 0; + goto qsec_out; + } + +/* BB check that data area is minimum length and as big as acl_len */ + + acl_len = le32_to_cpu(*parm); + if (acl_len != *pbuflen) { + cifs_dbg(VFS, "acl length %d does not match %d\n", + acl_len, *pbuflen); + if (*pbuflen > acl_len) + *pbuflen = acl_len; + } + + /* check if buffer is big enough for the acl + header followed by the smallest SID */ + if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || + (*pbuflen >= 64 * 1024)) { + cifs_dbg(VFS, "bad acl length %d\n", *pbuflen); + rc = -EINVAL; + *pbuflen = 0; + } else { + *acl_inf = kmemdup(pdata, *pbuflen, GFP_KERNEL); + if (*acl_inf == NULL) { + *pbuflen = 0; + rc = -ENOMEM; + } + } + } +qsec_out: + free_rsp_buf(buf_type, iov[0].iov_base); +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + return rc; +} + +int +CIFSSMBSetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, + struct cifs_ntsd *pntsd, __u32 acllen, int aclflag) +{ + __u16 byte_count, param_count, data_count, param_offset, data_offset; + int rc = 0; + int bytes_returned = 0; + SET_SEC_DESC_REQ *pSMB = NULL; + void *pSMBr; + +setCifsAclRetry: + rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB, &pSMBr); + if (rc) + return rc; + + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + + param_count = 8; + param_offset = offsetof(struct smb_com_transaction_ssec_req, Fid) - 4; + data_count = acllen; + data_offset = param_offset + param_count; + byte_count = 3 /* pad */ + param_count; + + pSMB->DataCount = cpu_to_le32(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->MaxParameterCount = cpu_to_le32(4); + pSMB->MaxDataCount = cpu_to_le32(16384); + pSMB->ParameterCount = cpu_to_le32(param_count); + pSMB->ParameterOffset = cpu_to_le32(param_offset); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->DataOffset = cpu_to_le32(data_offset); + pSMB->SetupCount = 0; + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_SET_SECURITY_DESC); + pSMB->ByteCount = cpu_to_le16(byte_count+data_count); + + pSMB->Fid = fid; /* file handle always le */ + pSMB->Reserved2 = 0; + pSMB->AclFlags = cpu_to_le32(aclflag); + + if (pntsd && acllen) { + memcpy((char *)pSMBr + offsetof(struct smb_hdr, Protocol) + + data_offset, pntsd, acllen); + inc_rfc1001_len(pSMB, byte_count + data_count); + } else + inc_rfc1001_len(pSMB, byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + + cifs_dbg(FYI, "SetCIFSACL bytes_returned: %d, rc: %d\n", + bytes_returned, rc); + if (rc) + cifs_dbg(FYI, "Set CIFS ACL returned %d\n", rc); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto setCifsAclRetry; + + return (rc); +} + +#endif /* CONFIG_CIFS_ACL */ + +/* Legacy Query Path Information call for lookup to old servers such + as Win9x/WinME */ +int +SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon, + const char *search_name, FILE_ALL_INFO *data, + const struct nls_table *nls_codepage, int remap) +{ + QUERY_INFORMATION_REQ *pSMB; + QUERY_INFORMATION_RSP *pSMBr; + int rc = 0; + int bytes_returned; + int name_len; + + cifs_dbg(FYI, "In SMBQPath path %s\n", search_name); +QInfRetry: + rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, + search_name, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { + name_len = strnlen(search_name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, search_name, name_len); + } + pSMB->BufferFormat = 0x04; + name_len++; /* account for buffer type byte */ + inc_rfc1001_len(pSMB, (__u16)name_len); + pSMB->ByteCount = cpu_to_le16(name_len); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QueryInfo = %d\n", rc); + } else if (data) { + struct timespec ts; + __u32 time = le32_to_cpu(pSMBr->last_write_time); + + /* decode response */ + /* BB FIXME - add time zone adjustment BB */ + memset(data, 0, sizeof(FILE_ALL_INFO)); + ts.tv_nsec = 0; + ts.tv_sec = time; + /* decode time fields */ + data->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts)); + data->LastWriteTime = data->ChangeTime; + data->LastAccessTime = 0; + data->AllocationSize = + cpu_to_le64(le32_to_cpu(pSMBr->size)); + data->EndOfFile = data->AllocationSize; + data->Attributes = + cpu_to_le32(le16_to_cpu(pSMBr->attr)); + } else + rc = -EIO; /* bad buffer passed in */ + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QInfRetry; + + return rc; +} + +int +CIFSSMBQFileInfo(const unsigned int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_ALL_INFO *pFindData) +{ + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int rc = 0; + int bytes_returned; + __u16 params, byte_count; + +QFileInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO); + pSMB->Pad = 0; + pSMB->Fid = netfid; + inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QFileInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc) /* BB add auto retry on EOPNOTSUPP? */ + rc = -EIO; + else if (get_bcc(&pSMBr->hdr) < 40) + rc = -EIO; /* bad smb */ + else if (pFindData) { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, sizeof(FILE_ALL_INFO)); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QFileInfoRetry; + + return rc; +} + +int +CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon, + const char *search_name, FILE_ALL_INFO *data, + int legacy /* old style infolevel */, + const struct nls_table *nls_codepage, int remap) +{ + /* level 263 SMB_QUERY_FILE_ALL_INFO */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + + /* cifs_dbg(FYI, "In QPathInfo path %s\n", search_name); */ +QPathInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, search_name, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(search_name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, search_name, name_len); + } + + params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + if (legacy) + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_STANDARD); + else + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc) /* BB add auto retry on EOPNOTSUPP? */ + rc = -EIO; + else if (!legacy && get_bcc(&pSMBr->hdr) < 40) + rc = -EIO; /* bad smb */ + else if (legacy && get_bcc(&pSMBr->hdr) < 24) + rc = -EIO; /* 24 or 26 expected but we do not read + last field */ + else if (data) { + int size; + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + + /* + * On legacy responses we do not read the last field, + * EAsize, fortunately since it varies by subdialect and + * also note it differs on Set vs Get, ie two bytes or 4 + * bytes depending but we don't care here. + */ + if (legacy) + size = sizeof(FILE_INFO_STANDARD); + else + size = sizeof(FILE_ALL_INFO); + memcpy((char *) data, (char *) &pSMBr->hdr.Protocol + + data_offset, size); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QPathInfoRetry; + + return rc; +} + +int +CIFSSMBUnixQFileInfo(const unsigned int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_UNIX_BASIC_INFO *pFindData) +{ + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int rc = 0; + int bytes_returned; + __u16 params, byte_count; + +UnixQFileInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pSMB->Pad = 0; + pSMB->Fid = netfid; + inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { + cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n"); + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, + sizeof(FILE_UNIX_BASIC_INFO)); + } + } + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto UnixQFileInfoRetry; + + return rc; +} + +int +CIFSSMBUnixQPathInfo(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_UNIX_BASIC_INFO *pFindData, + const struct nls_table *nls_codepage, int remap) +{ +/* SMB_QUERY_FILE_UNIX_BASIC */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned = 0; + int name_len; + __u16 params, byte_count; + + cifs_dbg(FYI, "In QPathInfo (Unix) the path %s\n", searchName); +UnixQPathInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { + cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n"); + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, + sizeof(FILE_UNIX_BASIC_INFO)); + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto UnixQPathInfoRetry; + + return rc; +} + +/* xid, tcon, searchName and codepage are input parms, rest are returned */ +int +CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, + const char *searchName, struct cifs_sb_info *cifs_sb, + __u16 *pnetfid, __u16 search_flags, + struct cifs_search_info *psrch_inf, bool msearch) +{ +/* level 257 SMB_ */ + TRANSACTION2_FFIRST_REQ *pSMB = NULL; + TRANSACTION2_FFIRST_RSP *pSMBr = NULL; + T2_FFIRST_RSP_PARMS *parms; + int rc = 0; + int bytes_returned = 0; + int name_len, remap; + __u16 params, byte_count; + struct nls_table *nls_codepage; + + cifs_dbg(FYI, "In FindFirst for %s\n", searchName); + +findFirstRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + nls_codepage = cifs_sb->local_nls; + remap = cifs_remap(cifs_sb); + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + /* We can not add the asterik earlier in case + it got remapped to 0xF03A as if it were part of the + directory name instead of a wildcard */ + name_len *= 2; + if (msearch) { + pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[name_len+1] = 0; + pSMB->FileName[name_len+2] = '*'; + pSMB->FileName[name_len+3] = 0; + name_len += 4; /* now the trailing null */ + /* null terminate just in case */ + pSMB->FileName[name_len] = 0; + pSMB->FileName[name_len+1] = 0; + name_len += 2; + } + } else { /* BB add check for overrun of SMB buf BB */ + name_len = strnlen(searchName, PATH_MAX); +/* BB fix here and in unicode clause above ie + if (name_len > buffersize-header) + free buffer exit; BB */ + strncpy(pSMB->FileName, searchName, name_len); + if (msearch) { + pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[name_len+1] = '*'; + pSMB->FileName[name_len+2] = 0; + name_len += 3; + } + } + + params = 12 + name_len /* includes null */ ; + pSMB->TotalDataCount = 0; /* no EAs */ + pSMB->MaxParameterCount = cpu_to_le16(10); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_ffirst_req, SearchAttributes) + - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; /* one byte, no need to make endian neutral */ + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_FIRST); + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO)); + pSMB->SearchFlags = cpu_to_le16(search_flags); + pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level); + + /* BB what should we set StorageType to? Does it matter? BB */ + pSMB->SearchStorageType = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_ffirst); + + if (rc) {/* BB add logic to retry regular search if Unix search + rejected unexpectedly by server */ + /* BB Add code to handle unsupported level rc */ + cifs_dbg(FYI, "Error in FindFirst = %d\n", rc); + + cifs_buf_release(pSMB); + + /* BB eventually could optimize out free and realloc of buf */ + /* for this case */ + if (rc == -EAGAIN) + goto findFirstRetry; + } else { /* decode response */ + /* BB remember to free buffer if error BB */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc == 0) { + unsigned int lnoff; + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + psrch_inf->unicode = true; + else + psrch_inf->unicode = false; + + psrch_inf->ntwrk_buf_start = (char *)pSMBr; + psrch_inf->smallBuf = 0; + psrch_inf->srch_entries_start = + (char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset); + parms = (T2_FFIRST_RSP_PARMS *)((char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.ParameterOffset)); + + if (parms->EndofSearch) + psrch_inf->endOfSearch = true; + else + psrch_inf->endOfSearch = false; + + psrch_inf->entries_in_buffer = + le16_to_cpu(parms->SearchCount); + psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (CIFSMaxBufSize < lnoff) { + cifs_dbg(VFS, "ignoring corrupt resume name\n"); + psrch_inf->last_entry = NULL; + return rc; + } + + psrch_inf->last_entry = psrch_inf->srch_entries_start + + lnoff; + + if (pnetfid) + *pnetfid = parms->SearchHandle; + } else { + cifs_buf_release(pSMB); + } + } + + return rc; +} + +int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, + __u16 searchHandle, __u16 search_flags, + struct cifs_search_info *psrch_inf) +{ + TRANSACTION2_FNEXT_REQ *pSMB = NULL; + TRANSACTION2_FNEXT_RSP *pSMBr = NULL; + T2_FNEXT_RSP_PARMS *parms; + char *response_data; + int rc = 0; + int bytes_returned; + unsigned int name_len; + __u16 params, byte_count; + + cifs_dbg(FYI, "In FindNext\n"); + + if (psrch_inf->endOfSearch) + return -ENOENT; + + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 14; /* includes 2 bytes of null string, converted to LE below*/ + byte_count = 0; + pSMB->TotalDataCount = 0; /* no EAs */ + pSMB->MaxParameterCount = cpu_to_le16(8); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_fnext_req,SearchHandle) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_NEXT); + pSMB->SearchHandle = searchHandle; /* always kept as le */ + pSMB->SearchCount = + cpu_to_le16(CIFSMaxBufSize / sizeof(FILE_UNIX_INFO)); + pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level); + pSMB->ResumeKey = psrch_inf->resume_key; + pSMB->SearchFlags = cpu_to_le16(search_flags); + + name_len = psrch_inf->resume_name_len; + params += name_len; + if (name_len < PATH_MAX) { + memcpy(pSMB->ResumeFileName, psrch_inf->presume_name, name_len); + byte_count += name_len; + /* 14 byte parm len above enough for 2 byte null terminator */ + pSMB->ResumeFileName[name_len] = 0; + pSMB->ResumeFileName[name_len+1] = 0; + } else { + rc = -EINVAL; + goto FNext2_err_exit; + } + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_fnext); + if (rc) { + if (rc == -EBADF) { + psrch_inf->endOfSearch = true; + cifs_buf_release(pSMB); + rc = 0; /* search probably was closed at end of search*/ + } else + cifs_dbg(FYI, "FindNext returned = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc == 0) { + unsigned int lnoff; + + /* BB fixme add lock for file (srch_info) struct here */ + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + psrch_inf->unicode = true; + else + psrch_inf->unicode = false; + response_data = (char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.ParameterOffset); + parms = (T2_FNEXT_RSP_PARMS *)response_data; + response_data = (char *)&pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset); + if (psrch_inf->smallBuf) + cifs_small_buf_release( + psrch_inf->ntwrk_buf_start); + else + cifs_buf_release(psrch_inf->ntwrk_buf_start); + psrch_inf->srch_entries_start = response_data; + psrch_inf->ntwrk_buf_start = (char *)pSMB; + psrch_inf->smallBuf = 0; + if (parms->EndofSearch) + psrch_inf->endOfSearch = true; + else + psrch_inf->endOfSearch = false; + psrch_inf->entries_in_buffer = + le16_to_cpu(parms->SearchCount); + psrch_inf->index_of_last_entry += + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (CIFSMaxBufSize < lnoff) { + cifs_dbg(VFS, "ignoring corrupt resume name\n"); + psrch_inf->last_entry = NULL; + return rc; + } else + psrch_inf->last_entry = + psrch_inf->srch_entries_start + lnoff; + +/* cifs_dbg(FYI, "fnxt2 entries in buf %d index_of_last %d\n", + psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */ + + /* BB fixme add unlock here */ + } + + } + + /* BB On error, should we leave previous search buf (and count and + last entry fields) intact or free the previous one? */ + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ +FNext2_err_exit: + if (rc != 0) + cifs_buf_release(pSMB); + return rc; +} + +int +CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon, + const __u16 searchHandle) +{ + int rc = 0; + FINDCLOSE_REQ *pSMB = NULL; + + cifs_dbg(FYI, "In CIFSSMBFindClose\n"); + rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); + + /* no sense returning error if session restarted + as file handle has been closed */ + if (rc == -EAGAIN) + return 0; + if (rc) + return rc; + + pSMB->FileID = searchHandle; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + if (rc) + cifs_dbg(VFS, "Send error in FindClose = %d\n", rc); + + cifs_stats_inc(&tcon->stats.cifs_stats.num_fclose); + + /* Since session is dead, search handle closed on server already */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +int +CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, + const char *search_name, __u64 *inode_number, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int name_len, bytes_returned; + __u16 params, byte_count; + + cifs_dbg(FYI, "In GetSrvInodeNum for %s\n", search_name); + if (tcon == NULL) + return -ENODEV; + +GetInodeNumberRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, + search_name, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(search_name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, search_name, name_len); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max data count below from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "error %d in QueryInternalInfo\n", rc); + } else { + /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + /* If rc should we check for EOPNOSUPP and + disable the srvino flag? or in caller? */ + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + struct file_internal_info *pfinfo; + /* BB Do we need a cast or hash here ? */ + if (count < 8) { + cifs_dbg(FYI, "Illegal size ret in QryIntrnlInf\n"); + rc = -EIO; + goto GetInodeNumOut; + } + pfinfo = (struct file_internal_info *) + (data_offset + (char *) &pSMBr->hdr.Protocol); + *inode_number = le64_to_cpu(pfinfo->UniqueId); + } + } +GetInodeNumOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto GetInodeNumberRetry; + return rc; +} + +/* parses DFS refferal V3 structure + * caller is responsible for freeing target_nodes + * returns: + * on success - 0 + * on failure - errno + */ +static int +parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, + unsigned int *num_of_nodes, + struct dfs_info3_param **target_nodes, + const struct nls_table *nls_codepage, int remap, + const char *searchName) +{ + int i, rc = 0; + char *data_end; + bool is_unicode; + struct dfs_referral_level_3 *ref; + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); + + if (*num_of_nodes < 1) { + cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n", + *num_of_nodes); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + + ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); + if (ref->VersionNumber != cpu_to_le16(3)) { + cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n", + le16_to_cpu(ref->VersionNumber)); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + + /* get the upper boundary of the resp buffer */ + data_end = (char *)(&(pSMBr->PathConsumed)) + + le16_to_cpu(pSMBr->t2.DataCount); + + cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n", + *num_of_nodes, le32_to_cpu(pSMBr->DFSFlags)); + + *target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param), + GFP_KERNEL); + if (*target_nodes == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + /* collect necessary data from referrals */ + for (i = 0; i < *num_of_nodes; i++) { + char *temp; + int max_len; + struct dfs_info3_param *node = (*target_nodes)+i; + + node->flags = le32_to_cpu(pSMBr->DFSFlags); + if (is_unicode) { + __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, + GFP_KERNEL); + if (tmp == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + cifsConvertToUTF16((__le16 *) tmp, searchName, + PATH_MAX, nls_codepage, remap); + node->path_consumed = cifs_utf16_bytes(tmp, + le16_to_cpu(pSMBr->PathConsumed), + nls_codepage); + kfree(tmp); + } else + node->path_consumed = le16_to_cpu(pSMBr->PathConsumed); + + node->server_type = le16_to_cpu(ref->ServerType); + node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags); + + /* copy DfsPath */ + temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); + max_len = data_end - temp; + node->path_name = cifs_strndup_from_utf16(temp, max_len, + is_unicode, nls_codepage); + if (!node->path_name) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + /* copy link target UNC */ + temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); + max_len = data_end - temp; + node->node_name = cifs_strndup_from_utf16(temp, max_len, + is_unicode, nls_codepage); + if (!node->node_name) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + ref++; + } + +parse_DFS_referrals_exit: + if (rc) { + free_dfs_info_array(*target_nodes, *num_of_nodes); + *target_nodes = NULL; + *num_of_nodes = 0; + } + return rc; +} + +int +CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, + const char *search_name, struct dfs_info3_param **target_nodes, + unsigned int *num_of_nodes, + const struct nls_table *nls_codepage, int remap) +{ +/* TRANS2_GET_DFS_REFERRAL */ + TRANSACTION2_GET_DFS_REFER_REQ *pSMB = NULL; + TRANSACTION2_GET_DFS_REFER_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + *num_of_nodes = 0; + *target_nodes = NULL; + + cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name); + if (ses == NULL) + return -ENODEV; +getDFSRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, NULL, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + /* server pointer checked in called function, + but should never be null here anyway */ + pSMB->hdr.Mid = get_next_mid(ses->server); + pSMB->hdr.Tid = ses->ipc_tid; + pSMB->hdr.Uid = ses->Suid; + if (ses->capabilities & CAP_STATUS32) + pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS; + if (ses->capabilities & CAP_DFS) + pSMB->hdr.Flags2 |= SMBFLG2_DFS; + + if (ses->capabilities & CAP_UNICODE) { + pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; + name_len = + cifsConvertToUTF16((__le16 *) pSMB->RequestFileName, + search_name, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(search_name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->RequestFileName, search_name, name_len); + } + + if (ses->server->sign) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + pSMB->hdr.Uid = ses->Suid; + + params = 2 /* level */ + name_len /*includes null */ ; + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = 0; + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_get_dfs_refer_req, MaxReferralLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_GET_DFS_REFERRAL); + byte_count = params + 3 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->MaxReferralLevel = cpu_to_le16(3); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in GetDFSRefer = %d\n", rc); + goto GetDFSRefExit; + } + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + /* BB Also check if enough total bytes returned? */ + if (rc || get_bcc(&pSMBr->hdr) < 17) { + rc = -EIO; /* bad smb */ + goto GetDFSRefExit; + } + + cifs_dbg(FYI, "Decoding GetDFSRefer response BCC: %d Offset %d\n", + get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset)); + + /* parse returned result into more usable form */ + rc = parse_DFS_referrals(pSMBr, num_of_nodes, + target_nodes, nls_codepage, remap, + search_name); + +GetDFSRefExit: + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto getDFSRetry; + + return rc; +} + +/* Query File System Info such as free space to old servers such as Win 9x */ +int +SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData) +{ +/* level 0x01 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_ALLOC_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cifs_dbg(FYI, "OldQFSInfo\n"); +oldQFSInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 18) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + cifs_dbg(FYI, "qfsinf resp BCC: %d Offset %d\n", + get_bcc(&pSMBr->hdr), data_offset); + + response_data = (FILE_SYSTEM_ALLOC_INFO *) + (((char *) &pSMBr->hdr.Protocol) + data_offset); + FSData->f_bsize = + le16_to_cpu(response_data->BytesPerSector) * + le32_to_cpu(response_data-> + SectorsPerAllocationUnit); + FSData->f_blocks = + le32_to_cpu(response_data->TotalAllocationUnits); + FSData->f_bfree = FSData->f_bavail = + le32_to_cpu(response_data->FreeAllocationUnits); + cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto oldQFSInfoRetry; + + return rc; +} + +int +CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData) +{ +/* level 0x103 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cifs_dbg(FYI, "In QFSInfo\n"); +QFSInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 24) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + + response_data = + (FILE_SYSTEM_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + FSData->f_bsize = + le32_to_cpu(response_data->BytesPerSector) * + le32_to_cpu(response_data-> + SectorsPerAllocationUnit); + FSData->f_blocks = + le64_to_cpu(response_data->TotalAllocationUnits); + FSData->f_bfree = FSData->f_bavail = + le64_to_cpu(response_data->FreeAllocationUnits); + cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSInfoRetry; + + return rc; +} + +int +CIFSSMBQFSAttributeInfo(const unsigned int xid, struct cifs_tcon *tcon) +{ +/* level 0x105 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_ATTRIBUTE_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cifs_dbg(FYI, "In QFSAttributeInfo\n"); +QFSAttributeRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(VFS, "Send error in QFSAttributeInfo = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + /* BB also check if enough bytes returned */ + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_ATTRIBUTE_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsAttrInfo, response_data, + sizeof(FILE_SYSTEM_ATTRIBUTE_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSAttributeRetry; + + return rc; +} + +int +CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon) +{ +/* level 0x104 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_DEVICE_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cifs_dbg(FYI, "In QFSDeviceInfo\n"); +QFSDeviceRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QFSDeviceInfo = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < + sizeof(FILE_SYSTEM_DEVICE_INFO)) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_DEVICE_INFO *) + (((char *) &pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsDevInfo, response_data, + sizeof(FILE_SYSTEM_DEVICE_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSDeviceRetry; + + return rc; +} + +int +CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon) +{ +/* level 0x200 SMB_QUERY_CIFS_UNIX_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_UNIX_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cifs_dbg(FYI, "In QFSUnixInfo\n"); +QFSUnixRetry: + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof(struct + smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(VFS, "Send error in QFSUnixInfo = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_UNIX_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsUnixInfo, response_data, + sizeof(FILE_SYSTEM_UNIX_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSUnixRetry; + + + return rc; +} + +int +CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon, __u64 cap) +{ +/* level 0x200 SMB_SET_CIFS_UNIX_INFO */ + TRANSACTION2_SETFSI_REQ *pSMB = NULL; + TRANSACTION2_SETFSI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cifs_dbg(FYI, "In SETFSUnixInfo\n"); +SETFSUnixRetry: + /* BB switch to small buf init to save memory */ + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + params = 4; /* 2 bytes zero followed by info level. */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_setfsi_req, FileNum) + - 4; + offset = param_offset + params; + + pSMB->MaxParameterCount = cpu_to_le16(4); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FS_INFORMATION); + byte_count = 1 /* pad */ + params + 12; + + pSMB->DataCount = cpu_to_le16(12); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + + /* Params. */ + pSMB->FileNum = 0; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_CIFS_UNIX_INFO); + + /* Data. */ + pSMB->ClientUnixMajor = cpu_to_le16(CIFS_UNIX_MAJOR_VERSION); + pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION); + pSMB->ClientUnixCap = cpu_to_le64(cap); + + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(VFS, "Send error in SETFSUnixInfo = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc) + rc = -EIO; /* bad smb */ + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SETFSUnixRetry; + + return rc; +} + + + +int +CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData) +{ +/* level 0x201 SMB_QUERY_CIFS_POSIX_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_POSIX_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cifs_dbg(FYI, "In QFSPosixInfo\n"); +QFSPosixRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof(struct + smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QFSUnixInfo = %d\n", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_POSIX_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + FSData->f_bsize = + le32_to_cpu(response_data->BlockSize); + FSData->f_blocks = + le64_to_cpu(response_data->TotalBlocks); + FSData->f_bfree = + le64_to_cpu(response_data->BlocksAvail); + if (response_data->UserBlocksAvail == cpu_to_le64(-1)) { + FSData->f_bavail = FSData->f_bfree; + } else { + FSData->f_bavail = + le64_to_cpu(response_data->UserBlocksAvail); + } + if (response_data->TotalFileNodes != cpu_to_le64(-1)) + FSData->f_files = + le64_to_cpu(response_data->TotalFileNodes); + if (response_data->FreeFileNodes != cpu_to_le64(-1)) + FSData->f_ffree = + le64_to_cpu(response_data->FreeFileNodes); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSPosixRetry; + + return rc; +} + + +/* + * We can not use write of zero bytes trick to set file size due to need for + * large file support. Also note that this SetPathInfo is preferred to + * SetFileInfo based method in next routine which is only needed to work around + * a sharing violation bugin Samba which this routine can run into. + */ +int +CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, + const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb, + bool set_allocation) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + struct file_end_of_file_info *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + int remap = cifs_remap(cifs_sb); + + __u16 params, byte_count, data_count, param_offset, offset; + + cifs_dbg(FYI, "In SetEOF\n"); +SetEOFRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name, + PATH_MAX, cifs_sb->local_nls, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(file_name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, file_name, name_len); + } + params = 6 + name_len; + data_count = sizeof(struct file_end_of_file_info); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(4100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + if (set_allocation) { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO); + } else /* Set File Size */ { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO); + } + + parm_data = + (struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + data_count; + pSMB->DataCount = cpu_to_le16(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + parm_data->FileSize = cpu_to_le64(size); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "SetPathInfo (file size) returned %d\n", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetEOFRetry; + + return rc; +} + +int +CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, __u64 size, bool set_allocation) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct file_end_of_file_info *parm_data; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cifs_dbg(FYI, "SetFileSize (via SetFileInfo) %lld\n", + (long long)size); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)cfile->pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(cfile->pid >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + count = sizeof(struct file_end_of_file_info); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + parm_data = + (struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->DataOffset = cpu_to_le16(offset); + parm_data->FileSize = cpu_to_le64(size); + pSMB->Fid = cfile->fid.netfid; + if (set_allocation) { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO); + } else /* Set File Size */ { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO); + } + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + if (rc) { + cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n", + rc); + } + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +/* Some legacy servers such as NT4 require that the file times be set on + an open handle, rather than by pathname - this is awkward due to + potential access conflicts on the open, but it is unavoidable for these + old servers since the only other choice is to go from 100 nanosecond DCE + time and resort to the original setpathinfo level which takes the ancient + DOS time format with 2 second granularity */ +int +CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, + const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cifs_dbg(FYI, "Set Times (via SetFileInfo)\n"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *)pSMB + + offsetof(struct smb_hdr, Protocol) + offset; + + count = sizeof(FILE_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO2); + else + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + if (rc) + cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n", + rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cifs_dbg(FYI, "Set File Disposition (via SetFileInfo)\n"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + + count = 1; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + *data_offset = delete_file ? 1 : 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + if (rc) + cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc); + + return rc; +} + +int +CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, const FILE_BASIC_INFO *data, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + char *data_offset; + __u16 params, param_offset, offset, byte_count, count; + + cifs_dbg(FYI, "In SetTimes\n"); + +SetTimesRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + count = sizeof(FILE_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO2); + else + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "SetPathInfo (times) returned %d\n", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetTimesRetry; + + return rc; +} + +/* Can not be used to set time stamps yet (due to old DOS time format) */ +/* Can be used to set attributes */ +#if 0 /* Possibly not needed - since it turns out that strangely NT4 has a bug + handling it anyway and NT4 was what we thought it would be needed for + Do not delete it until we prove whether needed for Win9x though */ +int +CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName, + __u16 dos_attrs, const struct nls_table *nls_codepage) +{ + SETATTR_REQ *pSMB = NULL; + SETATTR_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + + cifs_dbg(FYI, "In SetAttrLegacy\n"); + +SetAttrLgcyRetry: + rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + ConvertToUTF16((__le16 *) pSMB->fileName, fileName, + PATH_MAX, nls_codepage); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, fileName, name_len); + } + pSMB->attr = cpu_to_le16(dos_attrs); + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetAttrLgcyRetry; + + return rc; +} +#endif /* temporarily unneeded SetAttr legacy function */ + +static void +cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, + const struct cifs_unix_set_info_args *args) +{ + u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64; + u64 mode = args->mode; + + if (uid_valid(args->uid)) + uid = from_kuid(&init_user_ns, args->uid); + if (gid_valid(args->gid)) + gid = from_kgid(&init_user_ns, args->gid); + + /* + * Samba server ignores set of file size to zero due to bugs in some + * older clients, but we should be precise - we use SetFileSize to + * set file size and do not want to truncate file size to zero + * accidentally as happened on one Samba server beta by putting + * zero instead of -1 here + */ + data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); + data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64); + data_offset->LastStatusChange = cpu_to_le64(args->ctime); + data_offset->LastAccessTime = cpu_to_le64(args->atime); + data_offset->LastModificationTime = cpu_to_le64(args->mtime); + data_offset->Uid = cpu_to_le64(uid); + data_offset->Gid = cpu_to_le64(gid); + /* better to leave device as zero when it is */ + data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); + data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); + data_offset->Permissions = cpu_to_le64(mode); + + if (S_ISREG(mode)) + data_offset->Type = cpu_to_le32(UNIX_FILE); + else if (S_ISDIR(mode)) + data_offset->Type = cpu_to_le32(UNIX_DIR); + else if (S_ISLNK(mode)) + data_offset->Type = cpu_to_le32(UNIX_SYMLINK); + else if (S_ISCHR(mode)) + data_offset->Type = cpu_to_le32(UNIX_CHARDEV); + else if (S_ISBLK(mode)) + data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV); + else if (S_ISFIFO(mode)) + data_offset->Type = cpu_to_le32(UNIX_FIFO); + else if (S_ISSOCK(mode)) + data_offset->Type = cpu_to_le32(UNIX_SOCKET); +} + +int +CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, + const struct cifs_unix_set_info_args *args, + u16 fid, u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + u16 params, param_offset, offset, byte_count, count; + + cifs_dbg(FYI, "Set Unix Info (via SetFileInfo)\n"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *)pSMB + + offsetof(struct smb_hdr, Protocol) + offset; + + count = sizeof(FILE_UNIX_BASIC_INFO); + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + cifs_fill_unix_set_info((FILE_UNIX_BASIC_INFO *)data_offset, args); + + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + if (rc) + cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n", + rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, + const char *file_name, + const struct cifs_unix_set_info_args *args, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + FILE_UNIX_BASIC_INFO *data_offset; + __u16 params, param_offset, offset, count, byte_count; + + cifs_dbg(FYI, "In SetUID/GID/Mode\n"); +setPermsRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(file_name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, file_name, name_len); + } + + params = 6 + name_len; + count = sizeof(FILE_UNIX_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + data_offset = + (FILE_UNIX_BASIC_INFO *) ((char *) &pSMB->hdr.Protocol + + offset); + memset(data_offset, 0, count); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->DataCount = cpu_to_le16(count); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + + cifs_fill_unix_set_info(data_offset, args); + + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "SetPathInfo (perms) returned %d\n", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setPermsRetry; + return rc; +} + +#ifdef CONFIG_CIFS_XATTR +/* + * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common + * function used by listxattr and getxattr type calls. When ea_name is set, + * it looks for that attribute name and stuffs that value into the EAData + * buffer. When ea_name is NULL, it stuffs a list of attribute names into the + * buffer. In both cases, the return value is either the length of the + * resulting data or a negative error code. If EAData is a NULL pointer then + * the data isn't copied to it, but the length is returned. + */ +ssize_t +CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, const unsigned char *ea_name, + char *EAData, size_t buf_size, + const struct nls_table *nls_codepage, int remap) +{ + /* BB assumes one setup word */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int list_len; + struct fealist *ea_response_data; + struct fea *temp_fea; + char *temp_ptr; + char *end_of_smb; + __u16 params, byte_count, data_offset; + unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0; + + cifs_dbg(FYI, "In Query All EAs path %s\n", searchName); +QAllEAsRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + list_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + list_len++; /* trailing null */ + list_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + list_len = strnlen(searchName, PATH_MAX); + list_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, list_len); + } + + params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cifs_dbg(FYI, "Send error in QueryAllEAs = %d\n", rc); + goto QAllEAsOut; + } + + + /* BB also check enough total bytes returned */ + /* BB we need to improve the validity checking + of these trans2 responses */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc || get_bcc(&pSMBr->hdr) < 4) { + rc = -EIO; /* bad smb */ + goto QAllEAsOut; + } + + /* check that length of list is not more than bcc */ + /* check that each entry does not go beyond length + of list */ + /* check that each element of each entry does not + go beyond end of list */ + /* validate_trans2_offsets() */ + /* BB check if start of smb + data_offset > &bcc+ bcc */ + + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + ea_response_data = (struct fealist *) + (((char *) &pSMBr->hdr.Protocol) + data_offset); + + list_len = le32_to_cpu(ea_response_data->list_len); + cifs_dbg(FYI, "ea length %d\n", list_len); + if (list_len <= 8) { + cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; + goto QAllEAsOut; + } + + /* make sure list_len doesn't go past end of SMB */ + end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr); + if ((char *)ea_response_data + list_len > end_of_smb) { + cifs_dbg(FYI, "EA list appears to go beyond SMB\n"); + rc = -EIO; + goto QAllEAsOut; + } + + /* account for ea list len */ + list_len -= 4; + temp_fea = ea_response_data->list; + temp_ptr = (char *)temp_fea; + while (list_len > 0) { + unsigned int name_len; + __u16 value_len; + + list_len -= 4; + temp_ptr += 4; + /* make sure we can read name_len and value_len */ + if (list_len < 0) { + cifs_dbg(FYI, "EA entry goes beyond length of list\n"); + rc = -EIO; + goto QAllEAsOut; + } + + name_len = temp_fea->name_len; + value_len = le16_to_cpu(temp_fea->value_len); + list_len -= name_len + 1 + value_len; + if (list_len < 0) { + cifs_dbg(FYI, "EA entry goes beyond length of list\n"); + rc = -EIO; + goto QAllEAsOut; + } + + if (ea_name) { + if (ea_name_len == name_len && + memcmp(ea_name, temp_ptr, name_len) == 0) { + temp_ptr += name_len + 1; + rc = value_len; + if (buf_size == 0) + goto QAllEAsOut; + if ((size_t)value_len > buf_size) { + rc = -ERANGE; + goto QAllEAsOut; + } + memcpy(EAData, temp_ptr, value_len); + goto QAllEAsOut; + } + } else { + /* account for prefix user. and trailing null */ + rc += (5 + 1 + name_len); + if (rc < (int) buf_size) { + memcpy(EAData, "user.", 5); + EAData += 5; + memcpy(EAData, temp_ptr, name_len); + EAData += name_len; + /* null terminate name */ + *EAData = 0; + ++EAData; + } else if (buf_size == 0) { + /* skip copy - calc size only */ + } else { + /* stop before overrun buffer */ + rc = -ERANGE; + break; + } + } + temp_ptr += name_len + 1 + value_len; + temp_fea = (struct fea *)temp_ptr; + } + + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; + +QAllEAsOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QAllEAsRetry; + + return (ssize_t)rc; +} + +int +CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, const char *ea_name, const void *ea_value, + const __u16 ea_value_len, const struct nls_table *nls_codepage, + int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + struct fealist *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, byte_count, offset, count; + + cifs_dbg(FYI, "In SetEA\n"); +SetEARetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + + /* done calculating parms using name_len of file name, + now use name_len to calculate length of ea name + we are going to create in the inode xattrs */ + if (ea_name == NULL) + name_len = 0; + else + name_len = strnlen(ea_name, 255); + + count = sizeof(*parm_data) + ea_value_len + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_EA); + + parm_data = + (struct fealist *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + parm_data->list_len = cpu_to_le32(count); + parm_data->list[0].EA_flags = 0; + /* we checked above that name len is less than 255 */ + parm_data->list[0].name_len = (__u8)name_len; + /* EA names are always ASCII */ + if (ea_name) + strncpy(parm_data->list[0].name, ea_name, name_len); + parm_data->list[0].name[name_len] = 0; + parm_data->list[0].value_len = cpu_to_le16(ea_value_len); + /* caller ensures that ea_value_len is less than 64K but + we need to ensure that it fits within the smb */ + + /*BB add length check to see if it would fit in + negotiated SMB buffer size BB */ + /* if (ea_value_len > buffer_size - 512 (enough for header)) */ + if (ea_value_len) + memcpy(parm_data->list[0].name+name_len+1, + ea_value, ea_value_len); + + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "SetPathInfo (EA) returned %d\n", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetEARetry; + + return rc; +} +#endif + +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */ +/* + * Years ago the kernel added a "dnotify" function for Samba server, + * to allow network clients (such as Windows) to display updated + * lists of files in directory listings automatically when + * files are added by one user when another user has the + * same directory open on their desktop. The Linux cifs kernel + * client hooked into the kernel side of this interface for + * the same reason, but ironically when the VFS moved from + * "dnotify" to "inotify" it became harder to plug in Linux + * network file system clients (the most obvious use case + * for notify interfaces is when multiple users can update + * the contents of the same directory - exactly what network + * file systems can do) although the server (Samba) could + * still use it. For the short term we leave the worker + * function ifdeffed out (below) until inotify is fixed + * in the VFS to make it easier to plug in network file + * system clients. If inotify turns out to be permanently + * incompatible for network fs clients, we could instead simply + * expose this config flag by adding a future cifs (and smb2) notify ioctl. + */ +int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, + const int notify_subdirs, const __u16 netfid, + __u32 filter, struct file *pfile, int multishot, + const struct nls_table *nls_codepage) +{ + int rc = 0; + struct smb_com_transaction_change_notify_req *pSMB = NULL; + struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL; + struct dir_notify_req *dnotify_req; + int bytes_returned; + + cifs_dbg(FYI, "In CIFSSMBNotify for file handle %d\n", (int)netfid); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->TotalParameterCount = 0 ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le32(2); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 4; /* single byte does not need le conversion */ + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE); + pSMB->ParameterCount = pSMB->TotalParameterCount; + if (notify_subdirs) + pSMB->WatchTree = 1; /* one byte - no le conversion needed */ + pSMB->Reserved2 = 0; + pSMB->CompletionFilter = cpu_to_le32(filter); + pSMB->Fid = netfid; /* file handle always le */ + pSMB->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, + CIFS_ASYNC_OP); + if (rc) { + cifs_dbg(FYI, "Error in Notify = %d\n", rc); + } else { + /* Add file to outstanding requests */ + /* BB change to kmem cache alloc */ + dnotify_req = kmalloc( + sizeof(struct dir_notify_req), + GFP_KERNEL); + if (dnotify_req) { + dnotify_req->Pid = pSMB->hdr.Pid; + dnotify_req->PidHigh = pSMB->hdr.PidHigh; + dnotify_req->Mid = pSMB->hdr.Mid; + dnotify_req->Tid = pSMB->hdr.Tid; + dnotify_req->Uid = pSMB->hdr.Uid; + dnotify_req->netfid = netfid; + dnotify_req->pfile = pfile; + dnotify_req->filter = filter; + dnotify_req->multishot = multishot; + spin_lock(&GlobalMid_Lock); + list_add_tail(&dnotify_req->lhead, + &GlobalDnotifyReqList); + spin_unlock(&GlobalMid_Lock); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + return rc; +} +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ diff --git a/kernel/fs/cifs/connect.c b/kernel/fs/cifs/connect.c new file mode 100644 index 000000000..8383d5ea4 --- /dev/null +++ b/kernel/fs/cifs/connect.c @@ -0,0 +1,4137 @@ +/* + * fs/cifs/connect.c + * + * Copyright (C) International Business Machines Corp., 2002,2011 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "ntlmssp.h" +#include "nterr.h" +#include "rfc1002pdu.h" +#include "fscache.h" + +#define CIFS_PORT 445 +#define RFC1001_PORT 139 + +extern mempool_t *cifs_req_poolp; + +/* FIXME: should these be tunable? */ +#define TLINK_ERROR_EXPIRE (1 * HZ) +#define TLINK_IDLE_EXPIRE (600 * HZ) + +enum { + + /* Mount options that take no arguments */ + Opt_user_xattr, Opt_nouser_xattr, + Opt_forceuid, Opt_noforceuid, + Opt_forcegid, Opt_noforcegid, + Opt_noblocksend, Opt_noautotune, + Opt_hard, Opt_soft, Opt_perm, Opt_noperm, + Opt_mapposix, Opt_nomapposix, + Opt_mapchars, Opt_nomapchars, Opt_sfu, + Opt_nosfu, Opt_nodfs, Opt_posixpaths, + Opt_noposixpaths, Opt_nounix, + Opt_nocase, + Opt_brl, Opt_nobrl, + Opt_forcemandatorylock, Opt_setuids, + Opt_nosetuids, Opt_dynperm, Opt_nodynperm, + Opt_nohard, Opt_nosoft, + Opt_nointr, Opt_intr, + Opt_nostrictsync, Opt_strictsync, + Opt_serverino, Opt_noserverino, + Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl, + Opt_acl, Opt_noacl, Opt_locallease, + Opt_sign, Opt_seal, Opt_noac, + Opt_fsc, Opt_mfsymlinks, + Opt_multiuser, Opt_sloppy, Opt_nosharesock, + + /* Mount options which take numeric value */ + Opt_backupuid, Opt_backupgid, Opt_uid, + Opt_cruid, Opt_gid, Opt_file_mode, + Opt_dirmode, Opt_port, + Opt_rsize, Opt_wsize, Opt_actimeo, + + /* Mount options which take string value */ + Opt_user, Opt_pass, Opt_ip, + Opt_domain, Opt_srcaddr, Opt_iocharset, + Opt_netbiosname, Opt_servern, + Opt_ver, Opt_vers, Opt_sec, Opt_cache, + + /* Mount options to be ignored */ + Opt_ignore, + + /* Options which could be blank */ + Opt_blank_pass, + Opt_blank_user, + Opt_blank_ip, + + Opt_err +}; + +static const match_table_t cifs_mount_option_tokens = { + + { Opt_user_xattr, "user_xattr" }, + { Opt_nouser_xattr, "nouser_xattr" }, + { Opt_forceuid, "forceuid" }, + { Opt_noforceuid, "noforceuid" }, + { Opt_forcegid, "forcegid" }, + { Opt_noforcegid, "noforcegid" }, + { Opt_noblocksend, "noblocksend" }, + { Opt_noautotune, "noautotune" }, + { Opt_hard, "hard" }, + { Opt_soft, "soft" }, + { Opt_perm, "perm" }, + { Opt_noperm, "noperm" }, + { Opt_mapchars, "mapchars" }, /* SFU style */ + { Opt_nomapchars, "nomapchars" }, + { Opt_mapposix, "mapposix" }, /* SFM style */ + { Opt_nomapposix, "nomapposix" }, + { Opt_sfu, "sfu" }, + { Opt_nosfu, "nosfu" }, + { Opt_nodfs, "nodfs" }, + { Opt_posixpaths, "posixpaths" }, + { Opt_noposixpaths, "noposixpaths" }, + { Opt_nounix, "nounix" }, + { Opt_nounix, "nolinux" }, + { Opt_nocase, "nocase" }, + { Opt_nocase, "ignorecase" }, + { Opt_brl, "brl" }, + { Opt_nobrl, "nobrl" }, + { Opt_nobrl, "nolock" }, + { Opt_forcemandatorylock, "forcemandatorylock" }, + { Opt_forcemandatorylock, "forcemand" }, + { Opt_setuids, "setuids" }, + { Opt_nosetuids, "nosetuids" }, + { Opt_dynperm, "dynperm" }, + { Opt_nodynperm, "nodynperm" }, + { Opt_nohard, "nohard" }, + { Opt_nosoft, "nosoft" }, + { Opt_nointr, "nointr" }, + { Opt_intr, "intr" }, + { Opt_nostrictsync, "nostrictsync" }, + { Opt_strictsync, "strictsync" }, + { Opt_serverino, "serverino" }, + { Opt_noserverino, "noserverino" }, + { Opt_rwpidforward, "rwpidforward" }, + { Opt_cifsacl, "cifsacl" }, + { Opt_nocifsacl, "nocifsacl" }, + { Opt_acl, "acl" }, + { Opt_noacl, "noacl" }, + { Opt_locallease, "locallease" }, + { Opt_sign, "sign" }, + { Opt_seal, "seal" }, + { Opt_noac, "noac" }, + { Opt_fsc, "fsc" }, + { Opt_mfsymlinks, "mfsymlinks" }, + { Opt_multiuser, "multiuser" }, + { Opt_sloppy, "sloppy" }, + { Opt_nosharesock, "nosharesock" }, + + { Opt_backupuid, "backupuid=%s" }, + { Opt_backupgid, "backupgid=%s" }, + { Opt_uid, "uid=%s" }, + { Opt_cruid, "cruid=%s" }, + { Opt_gid, "gid=%s" }, + { Opt_file_mode, "file_mode=%s" }, + { Opt_dirmode, "dirmode=%s" }, + { Opt_dirmode, "dir_mode=%s" }, + { Opt_port, "port=%s" }, + { Opt_rsize, "rsize=%s" }, + { Opt_wsize, "wsize=%s" }, + { Opt_actimeo, "actimeo=%s" }, + + { Opt_blank_user, "user=" }, + { Opt_blank_user, "username=" }, + { Opt_user, "user=%s" }, + { Opt_user, "username=%s" }, + { Opt_blank_pass, "pass=" }, + { Opt_blank_pass, "password=" }, + { Opt_pass, "pass=%s" }, + { Opt_pass, "password=%s" }, + { Opt_blank_ip, "ip=" }, + { Opt_blank_ip, "addr=" }, + { Opt_ip, "ip=%s" }, + { Opt_ip, "addr=%s" }, + { Opt_ignore, "unc=%s" }, + { Opt_ignore, "target=%s" }, + { Opt_ignore, "path=%s" }, + { Opt_domain, "dom=%s" }, + { Opt_domain, "domain=%s" }, + { Opt_domain, "workgroup=%s" }, + { Opt_srcaddr, "srcaddr=%s" }, + { Opt_ignore, "prefixpath=%s" }, + { Opt_iocharset, "iocharset=%s" }, + { Opt_netbiosname, "netbiosname=%s" }, + { Opt_servern, "servern=%s" }, + { Opt_ver, "ver=%s" }, + { Opt_vers, "vers=%s" }, + { Opt_sec, "sec=%s" }, + { Opt_cache, "cache=%s" }, + + { Opt_ignore, "cred" }, + { Opt_ignore, "credentials" }, + { Opt_ignore, "cred=%s" }, + { Opt_ignore, "credentials=%s" }, + { Opt_ignore, "guest" }, + { Opt_ignore, "rw" }, + { Opt_ignore, "ro" }, + { Opt_ignore, "suid" }, + { Opt_ignore, "nosuid" }, + { Opt_ignore, "exec" }, + { Opt_ignore, "noexec" }, + { Opt_ignore, "nodev" }, + { Opt_ignore, "noauto" }, + { Opt_ignore, "dev" }, + { Opt_ignore, "mand" }, + { Opt_ignore, "nomand" }, + { Opt_ignore, "_netdev" }, + + { Opt_err, NULL } +}; + +enum { + Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, + Opt_sec_ntlmsspi, Opt_sec_ntlmssp, + Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2, + Opt_sec_ntlmv2i, Opt_sec_lanman, + Opt_sec_none, + + Opt_sec_err +}; + +static const match_table_t cifs_secflavor_tokens = { + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + { Opt_sec_ntlmsspi, "ntlmsspi" }, + { Opt_sec_ntlmssp, "ntlmssp" }, + { Opt_ntlm, "ntlm" }, + { Opt_sec_ntlmi, "ntlmi" }, + { Opt_sec_ntlmv2, "nontlm" }, + { Opt_sec_ntlmv2, "ntlmv2" }, + { Opt_sec_ntlmv2i, "ntlmv2i" }, + { Opt_sec_lanman, "lanman" }, + { Opt_sec_none, "none" }, + + { Opt_sec_err, NULL } +}; + +/* cache flavors */ +enum { + Opt_cache_loose, + Opt_cache_strict, + Opt_cache_none, + Opt_cache_err +}; + +static const match_table_t cifs_cacheflavor_tokens = { + { Opt_cache_loose, "loose" }, + { Opt_cache_strict, "strict" }, + { Opt_cache_none, "none" }, + { Opt_cache_err, NULL } +}; + +static const match_table_t cifs_smb_version_tokens = { + { Smb_1, SMB1_VERSION_STRING }, + { Smb_20, SMB20_VERSION_STRING}, + { Smb_21, SMB21_VERSION_STRING }, + { Smb_30, SMB30_VERSION_STRING }, + { Smb_302, SMB302_VERSION_STRING }, +}; + +static int ip_connect(struct TCP_Server_Info *server); +static int generic_ip_connect(struct TCP_Server_Info *server); +static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); +static void cifs_prune_tlinks(struct work_struct *work); +static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname); + +/* + * cifs tcp session reconnection + * + * mark tcp session as reconnecting so temporarily locked + * mark all smb sessions as reconnecting for tcp session + * reconnect tcp session + * wake up waiters on reconnection? - (not needed currently) + */ +int +cifs_reconnect(struct TCP_Server_Info *server) +{ + int rc = 0; + struct list_head *tmp, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct mid_q_entry *mid_entry; + struct list_head retry_list; + + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus == CifsExiting) { + /* the demux thread will exit normally + next time through the loop */ + spin_unlock(&GlobalMid_Lock); + return rc; + } else + server->tcpStatus = CifsNeedReconnect; + spin_unlock(&GlobalMid_Lock); + server->maxBuf = 0; +#ifdef CONFIG_CIFS_SMB2 + server->max_read = 0; +#endif + + cifs_dbg(FYI, "Reconnecting tcp session\n"); + + /* before reconnecting the tcp session, mark the smb session (uid) + and the tid bad so they are not used until reconnected */ + cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n", + __func__); + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + ses->need_reconnect = true; + ses->ipc_tid = 0; + list_for_each(tmp2, &ses->tcon_list) { + tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); + tcon->need_reconnect = true; + } + } + spin_unlock(&cifs_tcp_ses_lock); + + /* do not want to be sending data on a socket we are freeing */ + cifs_dbg(FYI, "%s: tearing down socket\n", __func__); + mutex_lock(&server->srv_mutex); + if (server->ssocket) { + cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", + server->ssocket->state, server->ssocket->flags); + kernel_sock_shutdown(server->ssocket, SHUT_WR); + cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n", + server->ssocket->state, server->ssocket->flags); + sock_release(server->ssocket); + server->ssocket = NULL; + } + server->sequence_number = 0; + server->session_estab = false; + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; + server->lstrp = jiffies; + mutex_unlock(&server->srv_mutex); + + /* mark submitted MIDs for retry and issue callback */ + INIT_LIST_HEAD(&retry_list); + cifs_dbg(FYI, "%s: moving mids to private list\n", __func__); + spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + if (mid_entry->mid_state == MID_REQUEST_SUBMITTED) + mid_entry->mid_state = MID_RETRY_NEEDED; + list_move(&mid_entry->qhead, &retry_list); + } + spin_unlock(&GlobalMid_Lock); + + cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); + list_for_each_safe(tmp, tmp2, &retry_list) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); + } + + do { + try_to_freeze(); + + /* we should try only the port we connected to before */ + mutex_lock(&server->srv_mutex); + rc = generic_ip_connect(server); + if (rc) { + cifs_dbg(FYI, "reconnect error %d\n", rc); + mutex_unlock(&server->srv_mutex); + msleep(3000); + } else { + atomic_inc(&tcpSesReconnectCount); + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsNeedNegotiate; + spin_unlock(&GlobalMid_Lock); + mutex_unlock(&server->srv_mutex); + } + } while (server->tcpStatus == CifsNeedReconnect); + + return rc; +} + +static void +cifs_echo_request(struct work_struct *work) +{ + int rc; + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, echo.work); + + /* + * We cannot send an echo if it is disabled or until the + * NEGOTIATE_PROTOCOL request is done, which is indicated by + * server->ops->need_neg() == true. Also, no need to ping if + * we got a response recently. + */ + if (!server->ops->need_neg || server->ops->need_neg(server) || + (server->ops->can_echo && !server->ops->can_echo(server)) || + time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + goto requeue_echo; + + rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; + if (rc) + cifs_dbg(FYI, "Unable to send echo request to server: %s\n", + server->hostname); + +requeue_echo: + queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); +} + +static bool +allocate_buffers(struct TCP_Server_Info *server) +{ + if (!server->bigbuf) { + server->bigbuf = (char *)cifs_buf_get(); + if (!server->bigbuf) { + cifs_dbg(VFS, "No memory for large SMB response\n"); + msleep(3000); + /* retry will check if exiting */ + return false; + } + } else if (server->large_buf) { + /* we are reusing a dirty large buf, clear its start */ + memset(server->bigbuf, 0, HEADER_SIZE(server)); + } + + if (!server->smallbuf) { + server->smallbuf = (char *)cifs_small_buf_get(); + if (!server->smallbuf) { + cifs_dbg(VFS, "No memory for SMB response\n"); + msleep(1000); + /* retry will check if exiting */ + return false; + } + /* beginning of smb buffer is cleared in our buf_get */ + } else { + /* if existing small buf clear beginning */ + memset(server->smallbuf, 0, HEADER_SIZE(server)); + } + + return true; +} + +static bool +server_unresponsive(struct TCP_Server_Info *server) +{ + /* + * We need to wait 2 echo intervals to make sure we handle such + * situations right: + * 1s client sends a normal SMB request + * 2s client gets a response + * 30s echo workqueue job pops, and decides we got a response recently + * and don't need to send another + * ... + * 65s kernel_recvmsg times out, and we see that we haven't gotten + * a response in >60s. + */ + if (server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { + cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n", + server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ); + cifs_reconnect(server); + wake_up(&server->response_q); + return true; + } + + return false; +} + +/* + * kvec_array_init - clone a kvec array, and advance into it + * @new: pointer to memory for cloned array + * @iov: pointer to original array + * @nr_segs: number of members in original array + * @bytes: number of bytes to advance into the cloned array + * + * This function will copy the array provided in iov to a section of memory + * and advance the specified number of bytes into the new array. It returns + * the number of segments in the new array. "new" must be at least as big as + * the original iov array. + */ +static unsigned int +kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, + size_t bytes) +{ + size_t base = 0; + + while (bytes || !iov->iov_len) { + int copy = min(bytes, iov->iov_len); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + memcpy(new, iov, sizeof(*iov) * nr_segs); + new->iov_base += base; + new->iov_len -= base; + return nr_segs; +} + +static struct kvec * +get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) +{ + struct kvec *new_iov; + + if (server->iov && nr_segs <= server->nr_iov) + return server->iov; + + /* not big enough -- allocate a new one and release the old */ + new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS); + if (new_iov) { + kfree(server->iov); + server->iov = new_iov; + server->nr_iov = nr_segs; + } + return new_iov; +} + +int +cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) +{ + int length = 0; + int total_read; + unsigned int segs; + struct msghdr smb_msg; + struct kvec *iov; + + iov = get_server_iovec(server, nr_segs); + if (!iov) + return -ENOMEM; + + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + + for (total_read = 0; to_read; total_read += length, to_read -= length) { + try_to_freeze(); + + if (server_unresponsive(server)) { + total_read = -ECONNABORTED; + break; + } + + segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); + + length = kernel_recvmsg(server->ssocket, &smb_msg, + iov, segs, to_read, 0); + + if (server->tcpStatus == CifsExiting) { + total_read = -ESHUTDOWN; + break; + } else if (server->tcpStatus == CifsNeedReconnect) { + cifs_reconnect(server); + total_read = -ECONNABORTED; + break; + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { + /* + * Minimum sleep to prevent looping, allowing socket + * to clear and app threads to set tcpStatus + * CifsNeedReconnect if server hung. + */ + usleep_range(1000, 2000); + length = 0; + continue; + } else if (length <= 0) { + cifs_dbg(FYI, "Received no data or error: expecting %d\n" + "got %d", to_read, length); + cifs_reconnect(server); + total_read = -ECONNABORTED; + break; + } + } + return total_read; +} + +int +cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) +{ + struct kvec iov; + + iov.iov_base = buf; + iov.iov_len = to_read; + + return cifs_readv_from_socket(server, &iov, 1, to_read); +} + +static bool +is_smb_response(struct TCP_Server_Info *server, unsigned char type) +{ + /* + * The first byte big endian of the length field, + * is actually not part of the length but the type + * with the most common, zero, as regular data. + */ + switch (type) { + case RFC1002_SESSION_MESSAGE: + /* Regular SMB response */ + return true; + case RFC1002_SESSION_KEEP_ALIVE: + cifs_dbg(FYI, "RFC 1002 session keep alive\n"); + break; + case RFC1002_POSITIVE_SESSION_RESPONSE: + cifs_dbg(FYI, "RFC 1002 positive session response\n"); + break; + case RFC1002_NEGATIVE_SESSION_RESPONSE: + /* + * We get this from Windows 98 instead of an error on + * SMB negprot response. + */ + cifs_dbg(FYI, "RFC 1002 negative session response\n"); + /* give server a second to clean up */ + msleep(1000); + /* + * Always try 445 first on reconnect since we get NACK + * on some if we ever connected to port 139 (the NACK + * is since we do not begin with RFC1001 session + * initialize frame). + */ + cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); + cifs_reconnect(server); + wake_up(&server->response_q); + break; + default: + cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type); + cifs_reconnect(server); + } + + return false; +} + +void +dequeue_mid(struct mid_q_entry *mid, bool malformed) +{ +#ifdef CONFIG_CIFS_STATS2 + mid->when_received = jiffies; +#endif + spin_lock(&GlobalMid_Lock); + if (!malformed) + mid->mid_state = MID_RESPONSE_RECEIVED; + else + mid->mid_state = MID_RESPONSE_MALFORMED; + list_del_init(&mid->qhead); + spin_unlock(&GlobalMid_Lock); +} + +static void +handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, + char *buf, int malformed) +{ + if (server->ops->check_trans2 && + server->ops->check_trans2(mid, server, buf, malformed)) + return; + mid->resp_buf = buf; + mid->large_buf = server->large_buf; + /* Was previous buf put in mpx struct for multi-rsp? */ + if (!mid->multiRsp) { + /* smb buffer will be freed by user thread */ + if (server->large_buf) + server->bigbuf = NULL; + else + server->smallbuf = NULL; + } + dequeue_mid(mid, malformed); +} + +static void clean_demultiplex_info(struct TCP_Server_Info *server) +{ + int length; + + /* take it off the list, if it's not already */ + spin_lock(&cifs_tcp_ses_lock); + list_del_init(&server->tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&GlobalMid_Lock); + server->tcpStatus = CifsExiting; + spin_unlock(&GlobalMid_Lock); + wake_up_all(&server->response_q); + + /* check if we have blocked requests that need to free */ + spin_lock(&server->req_lock); + if (server->credits <= 0) + server->credits = 1; + spin_unlock(&server->req_lock); + /* + * Although there should not be any requests blocked on this queue it + * can not hurt to be paranoid and try to wake up requests that may + * haven been blocked when more than 50 at time were on the wire to the + * same server - they now will see the session is in exit state and get + * out of SendReceive. + */ + wake_up_all(&server->request_q); + /* give those requests time to exit */ + msleep(125); + + if (server->ssocket) { + sock_release(server->ssocket); + server->ssocket = NULL; + } + + if (!list_empty(&server->pending_mid_q)) { + struct list_head dispose_list; + struct mid_q_entry *mid_entry; + struct list_head *tmp, *tmp2; + + INIT_LIST_HEAD(&dispose_list); + spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid); + mid_entry->mid_state = MID_SHUTDOWN; + list_move(&mid_entry->qhead, &dispose_list); + } + spin_unlock(&GlobalMid_Lock); + + /* now walk dispose list and issue callbacks */ + list_for_each_safe(tmp, tmp2, &dispose_list) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid); + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); + } + /* 1/8th of sec is more than enough time for them to exit */ + msleep(125); + } + + if (!list_empty(&server->pending_mid_q)) { + /* + * mpx threads have not exited yet give them at least the smb + * send timeout time for long ops. + * + * Due to delays on oplock break requests, we need to wait at + * least 45 seconds before giving up on a request getting a + * response and going ahead and killing cifsd. + */ + cifs_dbg(FYI, "Wait for exit from demultiplex thread\n"); + msleep(46000); + /* + * If threads still have not exited they are probably never + * coming home not much else we can do but free the memory. + */ + } + + kfree(server->hostname); + kfree(server->iov); + kfree(server); + + length = atomic_dec_return(&tcpSesAllocCount); + if (length > 0) + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); +} + +static int +standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + char *buf = server->smallbuf; + unsigned int pdu_length = get_rfc1002_length(buf); + + /* make sure this will fit in a large buffer */ + if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) { + cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + return -ECONNABORTED; + } + + /* switch to large buffer if too big for a small one */ + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { + server->large_buf = true; + memcpy(server->bigbuf, buf, server->total_read); + buf = server->bigbuf; + } + + /* now read the rest */ + length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, + pdu_length - HEADER_SIZE(server) + 1 + 4); + if (length < 0) + return length; + server->total_read += length; + + dump_smb(buf, server->total_read); + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. + */ + length = server->ops->check_message(buf, server->total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", buf, + min_t(unsigned int, server->total_read, 48)); + + if (server->ops->is_status_pending && + server->ops->is_status_pending(buf, server, length)) + return -1; + + if (!mid) + return length; + + handle_mid(mid, server, buf, length); + return 0; +} + +static int +cifs_demultiplex_thread(void *p) +{ + int length; + struct TCP_Server_Info *server = p; + unsigned int pdu_length; + char *buf = NULL; + struct task_struct *task_to_wake = NULL; + struct mid_q_entry *mid_entry; + + current->flags |= PF_MEMALLOC; + cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); + + length = atomic_inc_return(&tcpSesAllocCount); + if (length > 1) + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); + + set_freezable(); + while (server->tcpStatus != CifsExiting) { + if (try_to_freeze()) + continue; + + if (!allocate_buffers(server)) + continue; + + server->large_buf = false; + buf = server->smallbuf; + pdu_length = 4; /* enough to get RFC1001 header */ + + length = cifs_read_from_socket(server, buf, pdu_length); + if (length < 0) + continue; + server->total_read = length; + + /* + * The right amount was read from socket - 4 bytes, + * so we can now interpret the length field. + */ + pdu_length = get_rfc1002_length(buf); + + cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); + if (!is_smb_response(server, buf[0])) + continue; + + /* make sure we have enough to get to the MID */ + if (pdu_length < HEADER_SIZE(server) - 1 - 4) { + cifs_dbg(VFS, "SMB response too short (%u bytes)\n", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + continue; + } + + /* read down to the MID */ + length = cifs_read_from_socket(server, buf + 4, + HEADER_SIZE(server) - 1 - 4); + if (length < 0) + continue; + server->total_read += length; + + mid_entry = server->ops->find_mid(server, buf); + + if (!mid_entry || !mid_entry->receive) + length = standard_receive3(server, mid_entry); + else + length = mid_entry->receive(server, mid_entry); + + if (length < 0) + continue; + + if (server->large_buf) + buf = server->bigbuf; + + server->lstrp = jiffies; + if (mid_entry != NULL) { + if (!mid_entry->multiRsp || mid_entry->multiEnd) + mid_entry->callback(mid_entry); + } else if (!server->ops->is_oplock_break || + !server->ops->is_oplock_break(buf, server)) { + cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n", + atomic_read(&midCount)); + cifs_dump_mem("Received Data is: ", buf, + HEADER_SIZE(server)); +#ifdef CONFIG_CIFS_DEBUG2 + if (server->ops->dump_detail) + server->ops->dump_detail(buf); + cifs_dump_mids(server); +#endif /* CIFS_DEBUG2 */ + + } + } /* end while !EXITING */ + + /* buffer usually freed in free_mid - need to free it here on exit */ + cifs_buf_release(server->bigbuf); + if (server->smallbuf) /* no sense logging a debug message if NULL */ + cifs_small_buf_release(server->smallbuf); + + task_to_wake = xchg(&server->tsk, NULL); + clean_demultiplex_info(server); + + /* if server->tsk was NULL then wait for a signal before exiting */ + if (!task_to_wake) { + set_current_state(TASK_INTERRUPTIBLE); + while (!signal_pending(current)) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + } + + module_put_and_exit(0); +} + +/* extract the host portion of the UNC string */ +static char * +extract_hostname(const char *unc) +{ + const char *src; + char *dst, *delim; + unsigned int len; + + /* skip double chars at beginning of string */ + /* BB: check validity of these bytes? */ + src = unc + 2; + + /* delimiter between hostname and sharename is always '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + + len = delim - src; + dst = kmalloc((len + 1), GFP_KERNEL); + if (dst == NULL) + return ERR_PTR(-ENOMEM); + + memcpy(dst, src, len); + dst[len] = '\0'; + + return dst; +} + +static int get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = kstrtoul(string, 0, option); + kfree(string); + + return rc; +} + +static int get_option_uid(substring_t args[], kuid_t *result) +{ + unsigned long value; + kuid_t uid; + int rc; + + rc = get_option_ul(args, &value); + if (rc) + return rc; + + uid = make_kuid(current_user_ns(), value); + if (!uid_valid(uid)) + return -EINVAL; + + *result = uid; + return 0; +} + +static int get_option_gid(substring_t args[], kgid_t *result) +{ + unsigned long value; + kgid_t gid; + int rc; + + rc = get_option_ul(args, &value); + if (rc) + return rc; + + gid = make_kgid(current_user_ns(), value); + if (!gid_valid(gid)) + return -EINVAL; + + *result = gid; + return 0; +} + +static int cifs_parse_security_flavors(char *value, + struct smb_vol *vol) +{ + + substring_t args[MAX_OPT_ARGS]; + + /* + * With mount options, the last one should win. Reset any existing + * settings back to default. + */ + vol->sectype = Unspecified; + vol->sign = false; + + switch (match_token(value, cifs_secflavor_tokens, args)) { + case Opt_sec_krb5p: + cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + return 1; + case Opt_sec_krb5i: + vol->sign = true; + /* Fallthrough */ + case Opt_sec_krb5: + vol->sectype = Kerberos; + break; + case Opt_sec_ntlmsspi: + vol->sign = true; + /* Fallthrough */ + case Opt_sec_ntlmssp: + vol->sectype = RawNTLMSSP; + break; + case Opt_sec_ntlmi: + vol->sign = true; + /* Fallthrough */ + case Opt_ntlm: + vol->sectype = NTLM; + break; + case Opt_sec_ntlmv2i: + vol->sign = true; + /* Fallthrough */ + case Opt_sec_ntlmv2: + vol->sectype = NTLMv2; + break; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + case Opt_sec_lanman: + vol->sectype = LANMAN; + break; +#endif + case Opt_sec_none: + vol->nullauth = 1; + break; + default: + cifs_dbg(VFS, "bad security option: %s\n", value); + return 1; + } + + return 0; +} + +static int +cifs_parse_cache_flavor(char *value, struct smb_vol *vol) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_cacheflavor_tokens, args)) { + case Opt_cache_loose: + vol->direct_io = false; + vol->strict_io = false; + break; + case Opt_cache_strict: + vol->direct_io = false; + vol->strict_io = true; + break; + case Opt_cache_none: + vol->direct_io = true; + vol->strict_io = false; + break; + default: + cifs_dbg(VFS, "bad cache= option: %s\n", value); + return 1; + } + return 0; +} + +static int +cifs_parse_smb_version(char *value, struct smb_vol *vol) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_smb_version_tokens, args)) { + case Smb_1: + vol->ops = &smb1_operations; + vol->vals = &smb1_values; + break; +#ifdef CONFIG_CIFS_SMB2 + case Smb_20: + vol->ops = &smb20_operations; + vol->vals = &smb20_values; + break; + case Smb_21: + vol->ops = &smb21_operations; + vol->vals = &smb21_values; + break; + case Smb_30: + vol->ops = &smb30_operations; + vol->vals = &smb30_values; + break; + case Smb_302: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb302_values; + break; +#endif + default: + cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); + return 1; + } + return 0; +} + +/* + * Parse a devname into substrings and populate the vol->UNC and vol->prepath + * fields with the result. Returns 0 on success and an error otherwise. + */ +static int +cifs_parse_devname(const char *devname, struct smb_vol *vol) +{ + char *pos; + const char *delims = "/\\"; + size_t len; + + /* make sure we have a valid UNC double delimiter prefix */ + len = strspn(devname, delims); + if (len != 2) + return -EINVAL; + + /* find delimiter between host and sharename */ + pos = strpbrk(devname + 2, delims); + if (!pos) + return -EINVAL; + + /* skip past delimiter */ + ++pos; + + /* now go until next delimiter or end of string */ + len = strcspn(pos, delims); + + /* move "pos" up to delimiter or NULL */ + pos += len; + vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); + if (!vol->UNC) + return -ENOMEM; + + convert_delimiter(vol->UNC, '\\'); + + /* If pos is NULL, or is a bogus trailing delimiter then no prepath */ + if (!*pos++ || !*pos) + return 0; + + vol->prepath = kstrdup(pos, GFP_KERNEL); + if (!vol->prepath) + return -ENOMEM; + + return 0; +} + +static int +cifs_parse_mount_options(const char *mountdata, const char *devname, + struct smb_vol *vol) +{ + char *data, *end; + char *mountdata_copy = NULL, *options; + unsigned int temp_len, i, j; + char separator[2]; + short int override_uid = -1; + short int override_gid = -1; + bool uid_specified = false; + bool gid_specified = false; + bool sloppy = false; + char *invalid = NULL; + char *nodename = utsname()->nodename; + char *string = NULL; + char *tmp_end, *value; + char delim; + bool got_ip = false; + unsigned short port = 0; + struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; + + separator[0] = ','; + separator[1] = 0; + delim = separator[0]; + + /* ensure we always start with zeroed-out smb_vol */ + memset(vol, 0, sizeof(*vol)); + + /* + * does not have to be perfect mapping since field is + * informational, only used for servers that do not support + * port 445 and it can be overridden at mount time + */ + memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); + for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) + vol->source_rfc1001_name[i] = toupper(nodename[i]); + + vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0; + /* null target name indicates to use *SMBSERVR default called name + if we end up sending RFC1001 session initialize */ + vol->target_rfc1001_name[0] = 0; + vol->cred_uid = current_uid(); + vol->linux_uid = current_uid(); + vol->linux_gid = current_gid(); + + /* + * default to SFM style remapping of seven reserved characters + * unless user overrides it or we negotiate CIFS POSIX where + * it is unnecessary. Can not simultaneously use more than one mapping + * since then readdir could list files that open could not open + */ + vol->remap = true; + + /* default to only allowing write access to owner of the mount */ + vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; + + /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ + /* default is always to request posix paths. */ + vol->posix_paths = 1; + /* default to using server inode numbers where available */ + vol->server_ino = 1; + + /* default is to use strict cifs caching semantics */ + vol->strict_io = true; + + vol->actimeo = CIFS_DEF_ACTIMEO; + + /* FIXME: add autonegotiation -- for now, SMB1 is default */ + vol->ops = &smb1_operations; + vol->vals = &smb1_values; + + if (!mountdata) + goto cifs_parse_mount_err; + + mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL); + if (!mountdata_copy) + goto cifs_parse_mount_err; + + options = mountdata_copy; + end = options + strlen(options); + + if (strncmp(options, "sep=", 4) == 0) { + if (options[4] != 0) { + separator[0] = options[4]; + options += 5; + } else { + cifs_dbg(FYI, "Null separator not allowed\n"); + } + } + vol->backupuid_specified = false; /* no backup intent for a user */ + vol->backupgid_specified = false; /* no backup intent for a group */ + + switch (cifs_parse_devname(devname, vol)) { + case 0: + break; + case -ENOMEM: + cifs_dbg(VFS, "Unable to allocate memory for devname.\n"); + goto cifs_parse_mount_err; + case -EINVAL: + cifs_dbg(VFS, "Malformed UNC in devname.\n"); + goto cifs_parse_mount_err; + default: + cifs_dbg(VFS, "Unknown error parsing devname.\n"); + goto cifs_parse_mount_err; + } + + while ((data = strsep(&options, separator)) != NULL) { + substring_t args[MAX_OPT_ARGS]; + unsigned long option; + int token; + + if (!*data) + continue; + + token = match_token(data, cifs_mount_option_tokens, args); + + switch (token) { + + /* Ingnore the following */ + case Opt_ignore: + break; + + /* Boolean values */ + case Opt_user_xattr: + vol->no_xattr = 0; + break; + case Opt_nouser_xattr: + vol->no_xattr = 1; + break; + case Opt_forceuid: + override_uid = 1; + break; + case Opt_noforceuid: + override_uid = 0; + break; + case Opt_forcegid: + override_gid = 1; + break; + case Opt_noforcegid: + override_gid = 0; + break; + case Opt_noblocksend: + vol->noblocksnd = 1; + break; + case Opt_noautotune: + vol->noautotune = 1; + break; + case Opt_hard: + vol->retry = 1; + break; + case Opt_soft: + vol->retry = 0; + break; + case Opt_perm: + vol->noperm = 0; + break; + case Opt_noperm: + vol->noperm = 1; + break; + case Opt_mapchars: + vol->sfu_remap = true; + vol->remap = false; /* disable SFM mapping */ + break; + case Opt_nomapchars: + vol->sfu_remap = false; + break; + case Opt_mapposix: + vol->remap = true; + vol->sfu_remap = false; /* disable SFU mapping */ + break; + case Opt_nomapposix: + vol->remap = false; + break; + case Opt_sfu: + vol->sfu_emul = 1; + break; + case Opt_nosfu: + vol->sfu_emul = 0; + break; + case Opt_nodfs: + vol->nodfs = 1; + break; + case Opt_posixpaths: + vol->posix_paths = 1; + break; + case Opt_noposixpaths: + vol->posix_paths = 0; + break; + case Opt_nounix: + vol->no_linux_ext = 1; + break; + case Opt_nocase: + vol->nocase = 1; + break; + case Opt_brl: + vol->nobrl = 0; + break; + case Opt_nobrl: + vol->nobrl = 1; + /* + * turn off mandatory locking in mode + * if remote locking is turned off since the + * local vfs will do advisory + */ + if (vol->file_mode == + (S_IALLUGO & ~(S_ISUID | S_IXGRP))) + vol->file_mode = S_IALLUGO; + break; + case Opt_forcemandatorylock: + vol->mand_lock = 1; + break; + case Opt_setuids: + vol->setuids = 1; + break; + case Opt_nosetuids: + vol->setuids = 0; + break; + case Opt_dynperm: + vol->dynperm = true; + break; + case Opt_nodynperm: + vol->dynperm = false; + break; + case Opt_nohard: + vol->retry = 0; + break; + case Opt_nosoft: + vol->retry = 1; + break; + case Opt_nointr: + vol->intr = 0; + break; + case Opt_intr: + vol->intr = 1; + break; + case Opt_nostrictsync: + vol->nostrictsync = 1; + break; + case Opt_strictsync: + vol->nostrictsync = 0; + break; + case Opt_serverino: + vol->server_ino = 1; + break; + case Opt_noserverino: + vol->server_ino = 0; + break; + case Opt_rwpidforward: + vol->rwpidforward = 1; + break; + case Opt_cifsacl: + vol->cifs_acl = 1; + break; + case Opt_nocifsacl: + vol->cifs_acl = 0; + break; + case Opt_acl: + vol->no_psx_acl = 0; + break; + case Opt_noacl: + vol->no_psx_acl = 1; + break; + case Opt_locallease: + vol->local_lease = 1; + break; + case Opt_sign: + vol->sign = true; + break; + case Opt_seal: + /* we do not do the following in secFlags because seal + * is a per tree connection (mount) not a per socket + * or per-smb connection option in the protocol + * vol->secFlg |= CIFSSEC_MUST_SEAL; + */ + vol->seal = 1; + break; + case Opt_noac: + pr_warn("CIFS: Mount option noac not supported. Instead set /proc/fs/cifs/LookupCacheEnabled to 0\n"); + break; + case Opt_fsc: +#ifndef CONFIG_CIFS_FSCACHE + cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); + goto cifs_parse_mount_err; +#endif + vol->fsc = true; + break; + case Opt_mfsymlinks: + vol->mfsymlinks = true; + break; + case Opt_multiuser: + vol->multiuser = true; + break; + case Opt_sloppy: + sloppy = true; + break; + case Opt_nosharesock: + vol->nosharesock = true; + break; + + /* Numeric Values */ + case Opt_backupuid: + if (get_option_uid(args, &vol->backupuid)) { + cifs_dbg(VFS, "%s: Invalid backupuid value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->backupuid_specified = true; + break; + case Opt_backupgid: + if (get_option_gid(args, &vol->backupgid)) { + cifs_dbg(VFS, "%s: Invalid backupgid value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->backupgid_specified = true; + break; + case Opt_uid: + if (get_option_uid(args, &vol->linux_uid)) { + cifs_dbg(VFS, "%s: Invalid uid value\n", + __func__); + goto cifs_parse_mount_err; + } + uid_specified = true; + break; + case Opt_cruid: + if (get_option_uid(args, &vol->cred_uid)) { + cifs_dbg(VFS, "%s: Invalid cruid value\n", + __func__); + goto cifs_parse_mount_err; + } + break; + case Opt_gid: + if (get_option_gid(args, &vol->linux_gid)) { + cifs_dbg(VFS, "%s: Invalid gid value\n", + __func__); + goto cifs_parse_mount_err; + } + gid_specified = true; + break; + case Opt_file_mode: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid file_mode value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->file_mode = option; + break; + case Opt_dirmode: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid dir_mode value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->dir_mode = option; + break; + case Opt_port: + if (get_option_ul(args, &option) || + option > USHRT_MAX) { + cifs_dbg(VFS, "%s: Invalid port value\n", + __func__); + goto cifs_parse_mount_err; + } + port = (unsigned short)option; + break; + case Opt_rsize: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid rsize value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->rsize = option; + break; + case Opt_wsize: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid wsize value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->wsize = option; + break; + case Opt_actimeo: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid actimeo value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->actimeo = HZ * option; + if (vol->actimeo > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "attribute cache timeout too large\n"); + goto cifs_parse_mount_err; + } + break; + + /* String Arguments */ + + case Opt_blank_user: + /* null user, ie. anonymous authentication */ + vol->nullauth = 1; + vol->username = NULL; + break; + case Opt_user: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (strnlen(string, CIFS_MAX_USERNAME_LEN) > + CIFS_MAX_USERNAME_LEN) { + pr_warn("CIFS: username too long\n"); + goto cifs_parse_mount_err; + } + + kfree(vol->username); + vol->username = kstrdup(string, GFP_KERNEL); + if (!vol->username) + goto cifs_parse_mount_err; + break; + case Opt_blank_pass: + /* passwords have to be handled differently + * to allow the character used for deliminator + * to be passed within them + */ + + /* + * Check if this is a case where the password + * starts with a delimiter + */ + tmp_end = strchr(data, '='); + tmp_end++; + if (!(tmp_end < end && tmp_end[1] == delim)) { + /* No it is not. Set the password to NULL */ + kfree(vol->password); + vol->password = NULL; + break; + } + /* Yes it is. Drop down to Opt_pass below.*/ + case Opt_pass: + /* Obtain the value string */ + value = strchr(data, '='); + value++; + + /* Set tmp_end to end of the string */ + tmp_end = (char *) value + strlen(value); + + /* Check if following character is the deliminator + * If yes, we have encountered a double deliminator + * reset the NULL character to the deliminator + */ + if (tmp_end < end && tmp_end[1] == delim) { + tmp_end[0] = delim; + + /* Keep iterating until we get to a single + * deliminator OR the end + */ + while ((tmp_end = strchr(tmp_end, delim)) + != NULL && (tmp_end[1] == delim)) { + tmp_end = (char *) &tmp_end[2]; + } + + /* Reset var options to point to next element */ + if (tmp_end) { + tmp_end[0] = '\0'; + options = (char *) &tmp_end[1]; + } else + /* Reached the end of the mount option + * string */ + options = end; + } + + kfree(vol->password); + /* Now build new password string */ + temp_len = strlen(value); + vol->password = kzalloc(temp_len+1, GFP_KERNEL); + if (vol->password == NULL) { + pr_warn("CIFS: no memory for password\n"); + goto cifs_parse_mount_err; + } + + for (i = 0, j = 0; i < temp_len; i++, j++) { + vol->password[j] = value[i]; + if ((value[i] == delim) && + value[i+1] == delim) + /* skip the second deliminator */ + i++; + } + vol->password[j] = '\0'; + break; + case Opt_blank_ip: + /* FIXME: should this be an error instead? */ + got_ip = false; + break; + case Opt_ip: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!cifs_convert_address(dstaddr, string, + strlen(string))) { + pr_err("CIFS: bad ip= option (%s).\n", string); + goto cifs_parse_mount_err; + } + got_ip = true; + break; + case Opt_domain: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN) + == CIFS_MAX_DOMAINNAME_LEN) { + pr_warn("CIFS: domain name too long\n"); + goto cifs_parse_mount_err; + } + + kfree(vol->domainname); + vol->domainname = kstrdup(string, GFP_KERNEL); + if (!vol->domainname) { + pr_warn("CIFS: no memory for domainname\n"); + goto cifs_parse_mount_err; + } + cifs_dbg(FYI, "Domain name set\n"); + break; + case Opt_srcaddr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!cifs_convert_address( + (struct sockaddr *)&vol->srcaddr, + string, strlen(string))) { + pr_warn("CIFS: Could not parse srcaddr: %s\n", + string); + goto cifs_parse_mount_err; + } + break; + case Opt_iocharset: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (strnlen(string, 1024) >= 65) { + pr_warn("CIFS: iocharset name too long.\n"); + goto cifs_parse_mount_err; + } + + if (strncasecmp(string, "default", 7) != 0) { + kfree(vol->iocharset); + vol->iocharset = kstrdup(string, + GFP_KERNEL); + if (!vol->iocharset) { + pr_warn("CIFS: no memory for charset\n"); + goto cifs_parse_mount_err; + } + } + /* if iocharset not set then load_nls_default + * is used by caller + */ + cifs_dbg(FYI, "iocharset set to %s\n", string); + break; + case Opt_netbiosname: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + memset(vol->source_rfc1001_name, 0x20, + RFC1001_NAME_LEN); + /* + * FIXME: are there cases in which a comma can + * be valid in workstation netbios name (and + * need special handling)? + */ + for (i = 0; i < RFC1001_NAME_LEN; i++) { + /* don't ucase netbiosname for user */ + if (string[i] == 0) + break; + vol->source_rfc1001_name[i] = string[i]; + } + /* The string has 16th byte zero still from + * set at top of the function + */ + if (i == RFC1001_NAME_LEN && string[i] != 0) + pr_warn("CIFS: netbiosname longer than 15 truncated.\n"); + break; + case Opt_servern: + /* servernetbiosname specified override *SMBSERVER */ + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + /* last byte, type, is 0x20 for servr type */ + memset(vol->target_rfc1001_name, 0x20, + RFC1001_NAME_LEN_WITH_NULL); + + /* BB are there cases in which a comma can be + valid in this workstation netbios name + (and need special handling)? */ + + /* user or mount helper must uppercase the + netbios name */ + for (i = 0; i < 15; i++) { + if (string[i] == 0) + break; + vol->target_rfc1001_name[i] = string[i]; + } + /* The string has 16th byte zero still from + set at top of the function */ + if (i == RFC1001_NAME_LEN && string[i] != 0) + pr_warn("CIFS: server netbiosname longer than 15 truncated.\n"); + break; + case Opt_ver: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (strncasecmp(string, "1", 1) == 0) { + /* This is the default */ + break; + } + /* For all other value, error */ + pr_warn("CIFS: Invalid version specified\n"); + goto cifs_parse_mount_err; + case Opt_vers: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (cifs_parse_smb_version(string, vol) != 0) + goto cifs_parse_mount_err; + break; + case Opt_sec: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (cifs_parse_security_flavors(string, vol) != 0) + goto cifs_parse_mount_err; + break; + case Opt_cache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (cifs_parse_cache_flavor(string, vol) != 0) + goto cifs_parse_mount_err; + break; + default: + /* + * An option we don't recognize. Save it off for later + * if we haven't already found one + */ + if (!invalid) + invalid = data; + break; + } + /* Free up any allocated string */ + kfree(string); + string = NULL; + } + + if (!sloppy && invalid) { + pr_err("CIFS: Unknown mount option \"%s\"\n", invalid); + goto cifs_parse_mount_err; + } + +#ifndef CONFIG_KEYS + /* Muliuser mounts require CONFIG_KEYS support */ + if (vol->multiuser) { + cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); + goto cifs_parse_mount_err; + } +#endif + if (!vol->UNC) { + cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n"); + goto cifs_parse_mount_err; + } + + /* make sure UNC has a share name */ + if (!strchr(vol->UNC + 3, '\\')) { + cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n"); + goto cifs_parse_mount_err; + } + + if (!got_ip) { + /* No ip= option specified? Try to get it from UNC */ + if (!cifs_convert_address(dstaddr, &vol->UNC[2], + strlen(&vol->UNC[2]))) { + pr_err("Unable to determine destination address.\n"); + goto cifs_parse_mount_err; + } + } + + /* set the port that we got earlier */ + cifs_set_port(dstaddr, port); + + if (uid_specified) + vol->override_uid = override_uid; + else if (override_uid == 1) + pr_notice("CIFS: ignoring forceuid mount option specified with no uid= option.\n"); + + if (gid_specified) + vol->override_gid = override_gid; + else if (override_gid == 1) + pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n"); + + kfree(mountdata_copy); + return 0; + +out_nomem: + pr_warn("Could not allocate temporary buffer\n"); +cifs_parse_mount_err: + kfree(string); + kfree(mountdata_copy); + return 1; +} + +/** Returns true if srcaddr isn't specified and rhs isn't + * specified, or if srcaddr is specified and + * matches the IP address of the rhs argument. + */ +static bool +srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) +{ + switch (srcaddr->sa_family) { + case AF_UNSPEC: + return (rhs->sa_family == AF_UNSPEC); + case AF_INET: { + struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr; + struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs; + return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr); + } + case AF_INET6: { + struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; + return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); + } + default: + WARN_ON(1); + return false; /* don't expect to be here */ + } +} + +/* + * If no port is specified in addr structure, we try to match with 445 port + * and if it fails - with 139 ports. It should be called only if address + * families of server and addr are equal. + */ +static bool +match_port(struct TCP_Server_Info *server, struct sockaddr *addr) +{ + __be16 port, *sport; + + switch (addr->sa_family) { + case AF_INET: + sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port; + port = ((struct sockaddr_in *) addr)->sin_port; + break; + case AF_INET6: + sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port; + port = ((struct sockaddr_in6 *) addr)->sin6_port; + break; + default: + WARN_ON(1); + return false; + } + + if (!port) { + port = htons(CIFS_PORT); + if (port == *sport) + return true; + + port = htons(RFC1001_PORT); + } + + return port == *sport; +} + +static bool +match_address(struct TCP_Server_Info *server, struct sockaddr *addr, + struct sockaddr *srcaddr) +{ + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + struct sockaddr_in *srv_addr4 = + (struct sockaddr_in *)&server->dstaddr; + + if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr) + return false; + break; + } + case AF_INET6: { + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + struct sockaddr_in6 *srv_addr6 = + (struct sockaddr_in6 *)&server->dstaddr; + + if (!ipv6_addr_equal(&addr6->sin6_addr, + &srv_addr6->sin6_addr)) + return false; + if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id) + return false; + break; + } + default: + WARN_ON(1); + return false; /* don't expect to be here */ + } + + if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) + return false; + + return true; +} + +static bool +match_security(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + /* + * The select_sectype function should either return the vol->sectype + * that was specified, or "Unspecified" if that sectype was not + * compatible with the given NEGOTIATE request. + */ + if (select_sectype(server, vol->sectype) == Unspecified) + return false; + + /* + * Now check if signing mode is acceptable. No need to check + * global_secflags at this point since if MUST_SIGN is set then + * the server->sign had better be too. + */ + if (vol->sign && !server->sign) + return false; + + return true; +} + +static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr; + + if (vol->nosharesock) + return 0; + + if ((server->vals != vol->vals) || (server->ops != vol->ops)) + return 0; + + if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) + return 0; + + if (!match_address(server, addr, + (struct sockaddr *)&vol->srcaddr)) + return 0; + + if (!match_port(server, addr)) + return 0; + + if (!match_security(server, vol)) + return 0; + + return 1; +} + +static struct TCP_Server_Info * +cifs_find_tcp_session(struct smb_vol *vol) +{ + struct TCP_Server_Info *server; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + if (!match_server(server, vol)) + continue; + + ++server->srv_count; + spin_unlock(&cifs_tcp_ses_lock); + cifs_dbg(FYI, "Existing tcp session with server found\n"); + return server; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_tcp_session(struct TCP_Server_Info *server) +{ + struct task_struct *task; + + spin_lock(&cifs_tcp_ses_lock); + if (--server->srv_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + put_net(cifs_net_ns(server)); + + list_del_init(&server->tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + cancel_delayed_work_sync(&server->echo); + + spin_lock(&GlobalMid_Lock); + server->tcpStatus = CifsExiting; + spin_unlock(&GlobalMid_Lock); + + cifs_crypto_shash_release(server); + cifs_fscache_release_client_cookie(server); + + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; + + task = xchg(&server->tsk, NULL); + if (task) + force_sig(SIGKILL, task); +} + +static struct TCP_Server_Info * +cifs_get_tcp_session(struct smb_vol *volume_info) +{ + struct TCP_Server_Info *tcp_ses = NULL; + int rc; + + cifs_dbg(FYI, "UNC: %s\n", volume_info->UNC); + + /* see if we already have a matching tcp_ses */ + tcp_ses = cifs_find_tcp_session(volume_info); + if (tcp_ses) + return tcp_ses; + + tcp_ses = kzalloc(sizeof(struct TCP_Server_Info), GFP_KERNEL); + if (!tcp_ses) { + rc = -ENOMEM; + goto out_err; + } + + tcp_ses->ops = volume_info->ops; + tcp_ses->vals = volume_info->vals; + cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); + tcp_ses->hostname = extract_hostname(volume_info->UNC); + if (IS_ERR(tcp_ses->hostname)) { + rc = PTR_ERR(tcp_ses->hostname); + goto out_err_crypto_release; + } + + tcp_ses->noblocksnd = volume_info->noblocksnd; + tcp_ses->noautotune = volume_info->noautotune; + tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; + tcp_ses->in_flight = 0; + tcp_ses->credits = 1; + init_waitqueue_head(&tcp_ses->response_q); + init_waitqueue_head(&tcp_ses->request_q); + INIT_LIST_HEAD(&tcp_ses->pending_mid_q); + mutex_init(&tcp_ses->srv_mutex); + memcpy(tcp_ses->workstation_RFC1001_name, + volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + memcpy(tcp_ses->server_RFC1001_name, + volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + tcp_ses->session_estab = false; + tcp_ses->sequence_number = 0; + tcp_ses->lstrp = jiffies; + spin_lock_init(&tcp_ses->req_lock); + INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); + INIT_LIST_HEAD(&tcp_ses->smb_ses_list); + INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); + memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, + sizeof(tcp_ses->srcaddr)); + memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, + sizeof(tcp_ses->dstaddr)); +#ifdef CONFIG_CIFS_SMB2 + get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE); +#endif + /* + * at this point we are the only ones with the pointer + * to the struct since the kernel thread not created yet + * no need to spinlock this init of tcpStatus or srv_count + */ + tcp_ses->tcpStatus = CifsNew; + ++tcp_ses->srv_count; + + rc = ip_connect(tcp_ses); + if (rc < 0) { + cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n"); + goto out_err_crypto_release; + } + + /* + * since we're in a cifs function already, we know that + * this will succeed. No need for try_module_get(). + */ + __module_get(THIS_MODULE); + tcp_ses->tsk = kthread_run(cifs_demultiplex_thread, + tcp_ses, "cifsd"); + if (IS_ERR(tcp_ses->tsk)) { + rc = PTR_ERR(tcp_ses->tsk); + cifs_dbg(VFS, "error %d create cifsd thread\n", rc); + module_put(THIS_MODULE); + goto out_err_crypto_release; + } + tcp_ses->tcpStatus = CifsNeedNegotiate; + + /* thread spawned, put it on the list */ + spin_lock(&cifs_tcp_ses_lock); + list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + cifs_fscache_get_client_cookie(tcp_ses); + + /* queue echo request delayed work */ + queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + + return tcp_ses; + +out_err_crypto_release: + cifs_crypto_shash_release(tcp_ses); + + put_net(cifs_net_ns(tcp_ses)); + +out_err: + if (tcp_ses) { + if (!IS_ERR(tcp_ses->hostname)) + kfree(tcp_ses->hostname); + if (tcp_ses->ssocket) + sock_release(tcp_ses->ssocket); + kfree(tcp_ses); + } + return ERR_PTR(rc); +} + +static int match_session(struct cifs_ses *ses, struct smb_vol *vol) +{ + if (vol->sectype != Unspecified && + vol->sectype != ses->sectype) + return 0; + + switch (ses->sectype) { + case Kerberos: + if (!uid_eq(vol->cred_uid, ses->cred_uid)) + return 0; + break; + default: + /* NULL username means anonymous session */ + if (ses->user_name == NULL) { + if (!vol->nullauth) + return 0; + break; + } + + /* anything else takes username/password */ + if (strncmp(ses->user_name, + vol->username ? vol->username : "", + CIFS_MAX_USERNAME_LEN)) + return 0; + if ((vol->username && strlen(vol->username) != 0) && + ses->password != NULL && + strncmp(ses->password, + vol->password ? vol->password : "", + CIFS_MAX_PASSWORD_LEN)) + return 0; + } + return 1; +} + +static struct cifs_ses * +cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + struct cifs_ses *ses; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->status == CifsExiting) + continue; + if (!match_session(ses, vol)) + continue; + ++ses->ses_count; + spin_unlock(&cifs_tcp_ses_lock); + return ses; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_smb_ses(struct cifs_ses *ses) +{ + unsigned int rc, xid; + struct TCP_Server_Info *server = ses->server; + + cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); + + spin_lock(&cifs_tcp_ses_lock); + if (ses->status == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + if (--ses->ses_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + if (ses->status == CifsGood) + ses->status = CifsExiting; + spin_unlock(&cifs_tcp_ses_lock); + + if (ses->status == CifsExiting && server->ops->logoff) { + xid = get_xid(); + rc = server->ops->logoff(xid, ses); + if (rc) + cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n", + __func__, rc); + _free_xid(xid); + } + + spin_lock(&cifs_tcp_ses_lock); + list_del_init(&ses->smb_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + sesInfoFree(ses); + cifs_put_tcp_session(server); +} + +#ifdef CONFIG_KEYS + +/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */ +#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1) + +/* Populate username and pw fields from keyring if possible */ +static int +cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) +{ + int rc = 0; + char *desc, *delim, *payload; + ssize_t len; + struct key *key; + struct TCP_Server_Info *server = ses->server; + struct sockaddr_in *sa; + struct sockaddr_in6 *sa6; + struct user_key_payload *upayload; + + desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL); + if (!desc) + return -ENOMEM; + + /* try to find an address key first */ + switch (server->dstaddr.ss_family) { + case AF_INET: + sa = (struct sockaddr_in *)&server->dstaddr; + sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr); + break; + case AF_INET6: + sa6 = (struct sockaddr_in6 *)&server->dstaddr; + sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr); + break; + default: + cifs_dbg(FYI, "Bad ss_family (%hu)\n", + server->dstaddr.ss_family); + rc = -EINVAL; + goto out_err; + } + + cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); + key = request_key(&key_type_logon, desc, ""); + if (IS_ERR(key)) { + if (!ses->domainName) { + cifs_dbg(FYI, "domainName is NULL\n"); + rc = PTR_ERR(key); + goto out_err; + } + + /* didn't work, try to find a domain key */ + sprintf(desc, "cifs:d:%s", ses->domainName); + cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); + key = request_key(&key_type_logon, desc, ""); + if (IS_ERR(key)) { + rc = PTR_ERR(key); + goto out_err; + } + } + + down_read(&key->sem); + upayload = key->payload.data; + if (IS_ERR_OR_NULL(upayload)) { + rc = upayload ? PTR_ERR(upayload) : -EINVAL; + goto out_key_put; + } + + /* find first : in payload */ + payload = (char *)upayload->data; + delim = strnchr(payload, upayload->datalen, ':'); + cifs_dbg(FYI, "payload=%s\n", payload); + if (!delim) { + cifs_dbg(FYI, "Unable to find ':' in payload (datalen=%d)\n", + upayload->datalen); + rc = -EINVAL; + goto out_key_put; + } + + len = delim - payload; + if (len > CIFS_MAX_USERNAME_LEN || len <= 0) { + cifs_dbg(FYI, "Bad value from username search (len=%zd)\n", + len); + rc = -EINVAL; + goto out_key_put; + } + + vol->username = kstrndup(payload, len, GFP_KERNEL); + if (!vol->username) { + cifs_dbg(FYI, "Unable to allocate %zd bytes for username\n", + len); + rc = -ENOMEM; + goto out_key_put; + } + cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username); + + len = key->datalen - (len + 1); + if (len > CIFS_MAX_PASSWORD_LEN || len <= 0) { + cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len); + rc = -EINVAL; + kfree(vol->username); + vol->username = NULL; + goto out_key_put; + } + + ++delim; + vol->password = kstrndup(delim, len, GFP_KERNEL); + if (!vol->password) { + cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n", + len); + rc = -ENOMEM; + kfree(vol->username); + vol->username = NULL; + goto out_key_put; + } + +out_key_put: + up_read(&key->sem); + key_put(key); +out_err: + kfree(desc); + cifs_dbg(FYI, "%s: returning %d\n", __func__, rc); + return rc; +} +#else /* ! CONFIG_KEYS */ +static inline int +cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)), + struct cifs_ses *ses __attribute__((unused))) +{ + return -ENOSYS; +} +#endif /* CONFIG_KEYS */ + +static struct cifs_ses * +cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) +{ + int rc = -ENOMEM; + unsigned int xid; + struct cifs_ses *ses; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + + xid = get_xid(); + + ses = cifs_find_smb_ses(server, volume_info); + if (ses) { + cifs_dbg(FYI, "Existing smb sess found (status=%d)\n", + ses->status); + + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses); + if (rc) { + mutex_unlock(&ses->session_mutex); + /* problem -- put our ses reference */ + cifs_put_smb_ses(ses); + free_xid(xid); + return ERR_PTR(rc); + } + if (ses->need_reconnect) { + cifs_dbg(FYI, "Session needs reconnect\n"); + rc = cifs_setup_session(xid, ses, + volume_info->local_nls); + if (rc) { + mutex_unlock(&ses->session_mutex); + /* problem -- put our reference */ + cifs_put_smb_ses(ses); + free_xid(xid); + return ERR_PTR(rc); + } + } + mutex_unlock(&ses->session_mutex); + + /* existing SMB ses has a server reference already */ + cifs_put_tcp_session(server); + free_xid(xid); + return ses; + } + + cifs_dbg(FYI, "Existing smb sess not found\n"); + ses = sesInfoAlloc(); + if (ses == NULL) + goto get_ses_fail; + + /* new SMB session uses our server ref */ + ses->server = server; + if (server->dstaddr.ss_family == AF_INET6) + sprintf(ses->serverName, "%pI6", &addr6->sin6_addr); + else + sprintf(ses->serverName, "%pI4", &addr->sin_addr); + + if (volume_info->username) { + ses->user_name = kstrdup(volume_info->username, GFP_KERNEL); + if (!ses->user_name) + goto get_ses_fail; + } + + /* volume_info->password freed at unmount */ + if (volume_info->password) { + ses->password = kstrdup(volume_info->password, GFP_KERNEL); + if (!ses->password) + goto get_ses_fail; + } + if (volume_info->domainname) { + ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL); + if (!ses->domainName) + goto get_ses_fail; + } + ses->cred_uid = volume_info->cred_uid; + ses->linux_uid = volume_info->linux_uid; + + ses->sectype = volume_info->sectype; + ses->sign = volume_info->sign; + + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses); + if (!rc) + rc = cifs_setup_session(xid, ses, volume_info->local_nls); + mutex_unlock(&ses->session_mutex); + if (rc) + goto get_ses_fail; + + /* success, put it on the list */ + spin_lock(&cifs_tcp_ses_lock); + list_add(&ses->smb_ses_list, &server->smb_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + free_xid(xid); + return ses; + +get_ses_fail: + sesInfoFree(ses); + free_xid(xid); + return ERR_PTR(rc); +} + +static int match_tcon(struct cifs_tcon *tcon, const char *unc) +{ + if (tcon->tidStatus == CifsExiting) + return 0; + if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE)) + return 0; + return 1; +} + +static struct cifs_tcon * +cifs_find_tcon(struct cifs_ses *ses, const char *unc) +{ + struct list_head *tmp; + struct cifs_tcon *tcon; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &ses->tcon_list) { + tcon = list_entry(tmp, struct cifs_tcon, tcon_list); + if (!match_tcon(tcon, unc)) + continue; + ++tcon->tc_count; + spin_unlock(&cifs_tcp_ses_lock); + return tcon; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_tcon(struct cifs_tcon *tcon) +{ + unsigned int xid; + struct cifs_ses *ses = tcon->ses; + + cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count); + spin_lock(&cifs_tcp_ses_lock); + if (--tcon->tc_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + list_del_init(&tcon->tcon_list); + spin_unlock(&cifs_tcp_ses_lock); + + xid = get_xid(); + if (ses->server->ops->tree_disconnect) + ses->server->ops->tree_disconnect(xid, tcon); + _free_xid(xid); + + cifs_fscache_release_super_cookie(tcon); + tconInfoFree(tcon); + cifs_put_smb_ses(ses); +} + +static struct cifs_tcon * +cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) +{ + int rc, xid; + struct cifs_tcon *tcon; + + tcon = cifs_find_tcon(ses, volume_info->UNC); + if (tcon) { + cifs_dbg(FYI, "Found match on UNC path\n"); + /* existing tcon already has a reference */ + cifs_put_smb_ses(ses); + if (tcon->seal != volume_info->seal) + cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n"); + return tcon; + } + + if (!ses->server->ops->tree_connect) { + rc = -ENOSYS; + goto out_fail; + } + + tcon = tconInfoAlloc(); + if (tcon == NULL) { + rc = -ENOMEM; + goto out_fail; + } + + tcon->ses = ses; + if (volume_info->password) { + tcon->password = kstrdup(volume_info->password, GFP_KERNEL); + if (!tcon->password) { + rc = -ENOMEM; + goto out_fail; + } + } + + /* + * BB Do we need to wrap session_mutex around this TCon call and Unix + * SetFS as we do on SessSetup and reconnect? + */ + xid = get_xid(); + rc = ses->server->ops->tree_connect(xid, ses, volume_info->UNC, tcon, + volume_info->local_nls); + free_xid(xid); + cifs_dbg(FYI, "Tcon rc = %d\n", rc); + if (rc) + goto out_fail; + + if (volume_info->nodfs) { + tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; + cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags); + } + tcon->seal = volume_info->seal; + /* + * We can have only one retry value for a connection to a share so for + * resources mounted more than once to the same server share the last + * value passed in for the retry flag is used. + */ + tcon->retry = volume_info->retry; + tcon->nocase = volume_info->nocase; + tcon->local_lease = volume_info->local_lease; + INIT_LIST_HEAD(&tcon->pending_opens); + + spin_lock(&cifs_tcp_ses_lock); + list_add(&tcon->tcon_list, &ses->tcon_list); + spin_unlock(&cifs_tcp_ses_lock); + + cifs_fscache_get_super_cookie(tcon); + + return tcon; + +out_fail: + tconInfoFree(tcon); + return ERR_PTR(rc); +} + +void +cifs_put_tlink(struct tcon_link *tlink) +{ + if (!tlink || IS_ERR(tlink)) + return; + + if (!atomic_dec_and_test(&tlink->tl_count) || + test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) { + tlink->tl_time = jiffies; + return; + } + + if (!IS_ERR(tlink_tcon(tlink))) + cifs_put_tcon(tlink_tcon(tlink)); + kfree(tlink); + return; +} + +static inline struct tcon_link * +cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) +{ + return cifs_sb->master_tlink; +} + +static int +compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) +{ + struct cifs_sb_info *old = CIFS_SB(sb); + struct cifs_sb_info *new = mnt_data->cifs_sb; + + if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK)) + return 0; + + if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) != + (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) + return 0; + + /* + * We want to share sb only if we don't specify an r/wsize or + * specified r/wsize is greater than or equal to existing one. + */ + if (new->wsize && new->wsize < old->wsize) + return 0; + + if (new->rsize && new->rsize < old->rsize) + return 0; + + if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid)) + return 0; + + if (old->mnt_file_mode != new->mnt_file_mode || + old->mnt_dir_mode != new->mnt_dir_mode) + return 0; + + if (strcmp(old->local_nls->charset, new->local_nls->charset)) + return 0; + + if (old->actimeo != new->actimeo) + return 0; + + return 1; +} + +int +cifs_match_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data; + struct smb_vol *volume_info; + struct cifs_sb_info *cifs_sb; + struct TCP_Server_Info *tcp_srv; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + int rc = 0; + + spin_lock(&cifs_tcp_ses_lock); + cifs_sb = CIFS_SB(sb); + tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); + if (IS_ERR(tlink)) { + spin_unlock(&cifs_tcp_ses_lock); + return rc; + } + tcon = tlink_tcon(tlink); + ses = tcon->ses; + tcp_srv = ses->server; + + volume_info = mnt_data->vol; + + if (!match_server(tcp_srv, volume_info) || + !match_session(ses, volume_info) || + !match_tcon(tcon, volume_info->UNC)) { + rc = 0; + goto out; + } + + rc = compare_mount_options(sb, mnt_data); +out: + spin_unlock(&cifs_tcp_ses_lock); + cifs_put_tlink(tlink); + return rc; +} + +int +get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path, + const struct nls_table *nls_codepage, unsigned int *num_referrals, + struct dfs_info3_param **referrals, int remap) +{ + char *temp_unc; + int rc = 0; + + if (!ses->server->ops->tree_connect || !ses->server->ops->get_dfs_refer) + return -ENOSYS; + + *num_referrals = 0; + *referrals = NULL; + + if (ses->ipc_tid == 0) { + temp_unc = kmalloc(2 /* for slashes */ + + strnlen(ses->serverName, SERVER_NAME_LEN_WITH_NULL * 2) + + 1 + 4 /* slash IPC$ */ + 2, GFP_KERNEL); + if (temp_unc == NULL) + return -ENOMEM; + temp_unc[0] = '\\'; + temp_unc[1] = '\\'; + strcpy(temp_unc + 2, ses->serverName); + strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$"); + rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL, + nls_codepage); + cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid); + kfree(temp_unc); + } + if (rc == 0) + rc = ses->server->ops->get_dfs_refer(xid, ses, old_path, + referrals, num_referrals, + nls_codepage, remap); + /* + * BB - map targetUNCs to dfs_info3 structures, here or in + * ses->server->ops->get_dfs_refer. + */ + + return rc; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key cifs_key[2]; +static struct lock_class_key cifs_slock_key[2]; + +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", + &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", + &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); +} +#else +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ +} +#endif + +/* See RFC1001 section 14 on representation of Netbios names */ +static void rfc1002mangle(char *target, char *source, unsigned int length) +{ + unsigned int i, j; + + for (i = 0, j = 0; i < (length); i++) { + /* mask a nibble at a time and encode */ + target[j] = 'A' + (0x0F & (source[i] >> 4)); + target[j+1] = 'A' + (0x0F & source[i]); + j += 2; + } + +} + +static int +bind_socket(struct TCP_Server_Info *server) +{ + int rc = 0; + if (server->srcaddr.ss_family != AF_UNSPEC) { + /* Bind to the specified local IP address */ + struct socket *socket = server->ssocket; + rc = socket->ops->bind(socket, + (struct sockaddr *) &server->srcaddr, + sizeof(server->srcaddr)); + if (rc < 0) { + struct sockaddr_in *saddr4; + struct sockaddr_in6 *saddr6; + saddr4 = (struct sockaddr_in *)&server->srcaddr; + saddr6 = (struct sockaddr_in6 *)&server->srcaddr; + if (saddr6->sin6_family == AF_INET6) + cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n", + &saddr6->sin6_addr, rc); + else + cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n", + &saddr4->sin_addr.s_addr, rc); + } + } + return rc; +} + +static int +ip_rfc1001_connect(struct TCP_Server_Info *server) +{ + int rc = 0; + /* + * some servers require RFC1001 sessinit before sending + * negprot - BB check reconnection in case where second + * sessinit is sent but no second negprot + */ + struct rfc1002_session_packet *ses_init_buf; + struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), + GFP_KERNEL); + if (ses_init_buf) { + ses_init_buf->trailer.session_req.called_len = 32; + + if (server->server_RFC1001_name && + server->server_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + server->server_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + DEFAULT_CIFS_CALLED_NAME, + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.calling_len = 32; + + /* + * calling name ends in null (byte 16) from old smb + * convention. + */ + if (server->workstation_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + server->workstation_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + "LINUX_CIFS_CLNT", + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.scope1 = 0; + ses_init_buf->trailer.session_req.scope2 = 0; + smb_buf = (struct smb_hdr *)ses_init_buf; + + /* sizeof RFC1002_SESSION_REQUEST with no scope */ + smb_buf->smb_buf_length = cpu_to_be32(0x81000044); + rc = smb_send(server, smb_buf, 0x44); + kfree(ses_init_buf); + /* + * RFC1001 layer in at least one server + * requires very short break before negprot + * presumably because not expecting negprot + * to follow so fast. This is a simple + * solution that works without + * complicating the code and causes no + * significant slowing down on mount + * for everyone else + */ + usleep_range(1000, 2000); + } + /* + * else the negprot may still work without this + * even though malloc failed + */ + + return rc; +} + +static int +generic_ip_connect(struct TCP_Server_Info *server) +{ + int rc = 0; + __be16 sport; + int slen, sfamily; + struct socket *socket = server->ssocket; + struct sockaddr *saddr; + + saddr = (struct sockaddr *) &server->dstaddr; + + if (server->dstaddr.ss_family == AF_INET6) { + sport = ((struct sockaddr_in6 *) saddr)->sin6_port; + slen = sizeof(struct sockaddr_in6); + sfamily = AF_INET6; + } else { + sport = ((struct sockaddr_in *) saddr)->sin_port; + slen = sizeof(struct sockaddr_in); + sfamily = AF_INET; + } + + if (socket == NULL) { + rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, + IPPROTO_TCP, &socket, 1); + if (rc < 0) { + cifs_dbg(VFS, "Error %d creating socket\n", rc); + server->ssocket = NULL; + return rc; + } + + /* BB other socket options to set KEEPALIVE, NODELAY? */ + cifs_dbg(FYI, "Socket created\n"); + server->ssocket = socket; + socket->sk->sk_allocation = GFP_NOFS; + if (sfamily == AF_INET6) + cifs_reclassify_socket6(socket); + else + cifs_reclassify_socket4(socket); + } + + rc = bind_socket(server); + if (rc < 0) + return rc; + + /* + * Eventually check for other socket options to change from + * the default. sock_setsockopt not used because it expects + * user space buffer + */ + socket->sk->sk_rcvtimeo = 7 * HZ; + socket->sk->sk_sndtimeo = 5 * HZ; + + /* make the bufsizes depend on wsize/rsize and max requests */ + if (server->noautotune) { + if (socket->sk->sk_sndbuf < (200 * 1024)) + socket->sk->sk_sndbuf = 200 * 1024; + if (socket->sk->sk_rcvbuf < (140 * 1024)) + socket->sk->sk_rcvbuf = 140 * 1024; + } + + if (server->tcp_nodelay) { + int val = 1; + rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + if (rc) + cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n", + rc); + } + + cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n", + socket->sk->sk_sndbuf, + socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); + + rc = socket->ops->connect(socket, saddr, slen, 0); + if (rc < 0) { + cifs_dbg(FYI, "Error %d connecting to server\n", rc); + sock_release(socket); + server->ssocket = NULL; + return rc; + } + + if (sport == htons(RFC1001_PORT)) + rc = ip_rfc1001_connect(server); + + return rc; +} + +static int +ip_connect(struct TCP_Server_Info *server) +{ + __be16 *sport; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + + if (server->dstaddr.ss_family == AF_INET6) + sport = &addr6->sin6_port; + else + sport = &addr->sin_port; + + if (*sport == 0) { + int rc; + + /* try with 445 port at first */ + *sport = htons(CIFS_PORT); + + rc = generic_ip_connect(server); + if (rc >= 0) + return rc; + + /* if it failed, try with 139 port */ + *sport = htons(RFC1001_PORT); + } + + return generic_ip_connect(server); +} + +void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info) +{ + /* if we are reconnecting then should we check to see if + * any requested capabilities changed locally e.g. via + * remount but we can not do much about it here + * if they have (even if we could detect it by the following) + * Perhaps we could add a backpointer to array of sb from tcon + * or if we change to make all sb to same share the same + * sb as NFS - then we only have one backpointer to sb. + * What if we wanted to mount the server share twice once with + * and once without posixacls or posix paths? */ + __u64 saved_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + + if (vol_info && vol_info->no_linux_ext) { + tcon->fsUnixInfo.Capability = 0; + tcon->unix_ext = 0; /* Unix Extensions disabled */ + cifs_dbg(FYI, "Linux protocol extensions disabled\n"); + return; + } else if (vol_info) + tcon->unix_ext = 1; /* Unix Extensions supported */ + + if (tcon->unix_ext == 0) { + cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n"); + return; + } + + if (!CIFSSMBQFSUnixInfo(xid, tcon)) { + __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + cifs_dbg(FYI, "unix caps which server supports %lld\n", cap); + /* check for reconnect case in which we do not + want to change the mount behavior if we can avoid it */ + if (vol_info == NULL) { + /* turn off POSIX ACL and PATHNAMES if not set + originally at mount time */ + if ((saved_cap & CIFS_UNIX_POSIX_ACL_CAP) == 0) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { + if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) + cifs_dbg(VFS, "POSIXPATH support change\n"); + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { + cifs_dbg(VFS, "possible reconnect error\n"); + cifs_dbg(VFS, "server disabled POSIX path support\n"); + } + } + + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) + cifs_dbg(VFS, "per-share encryption not supported yet\n"); + + cap &= CIFS_UNIX_CAP_MASK; + if (vol_info && vol_info->no_psx_acl) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { + cifs_dbg(FYI, "negotiated posix acl support\n"); + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIXACL; + } + + if (vol_info && vol_info->posix_paths == 0) + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { + cifs_dbg(FYI, "negotiate posix pathnames\n"); + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIX_PATHS; + } + + cifs_dbg(FYI, "Negotiate caps 0x%x\n", (int)cap); +#ifdef CONFIG_CIFS_DEBUG2 + if (cap & CIFS_UNIX_FCNTL_CAP) + cifs_dbg(FYI, "FCNTL cap\n"); + if (cap & CIFS_UNIX_EXTATTR_CAP) + cifs_dbg(FYI, "EXTATTR cap\n"); + if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) + cifs_dbg(FYI, "POSIX path cap\n"); + if (cap & CIFS_UNIX_XATTR_CAP) + cifs_dbg(FYI, "XATTR cap\n"); + if (cap & CIFS_UNIX_POSIX_ACL_CAP) + cifs_dbg(FYI, "POSIX ACL cap\n"); + if (cap & CIFS_UNIX_LARGE_READ_CAP) + cifs_dbg(FYI, "very large read cap\n"); + if (cap & CIFS_UNIX_LARGE_WRITE_CAP) + cifs_dbg(FYI, "very large write cap\n"); + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP) + cifs_dbg(FYI, "transport encryption cap\n"); + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) + cifs_dbg(FYI, "mandatory transport encryption cap\n"); +#endif /* CIFS_DEBUG2 */ + if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { + if (vol_info == NULL) { + cifs_dbg(FYI, "resetting capabilities failed\n"); + } else + cifs_dbg(VFS, "Negotiating Unix capabilities with the server failed. Consider mounting with the Unix Extensions disabled if problems are found by specifying the nounix mount option.\n"); + + } + } +} + +void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, + struct cifs_sb_info *cifs_sb) +{ + INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + + spin_lock_init(&cifs_sb->tlink_tree_lock); + cifs_sb->tlink_tree = RB_ROOT; + + /* + * Temporarily set r/wsize for matching superblock. If we end up using + * new sb then client will later negotiate it downward if needed. + */ + cifs_sb->rsize = pvolume_info->rsize; + cifs_sb->wsize = pvolume_info->wsize; + + cifs_sb->mnt_uid = pvolume_info->linux_uid; + cifs_sb->mnt_gid = pvolume_info->linux_gid; + cifs_sb->mnt_file_mode = pvolume_info->file_mode; + cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; + cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n", + cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + + cifs_sb->actimeo = pvolume_info->actimeo; + cifs_sb->local_nls = pvolume_info->local_nls; + + if (pvolume_info->noperm) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; + if (pvolume_info->setuids) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; + if (pvolume_info->server_ino) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; + if (pvolume_info->remap) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SFM_CHR; + if (pvolume_info->sfu_remap) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; + if (pvolume_info->no_xattr) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; + if (pvolume_info->sfu_emul) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; + if (pvolume_info->nobrl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; + if (pvolume_info->nostrictsync) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; + if (pvolume_info->mand_lock) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; + if (pvolume_info->rwpidforward) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; + if (pvolume_info->cifs_acl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; + if (pvolume_info->backupuid_specified) { + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; + cifs_sb->mnt_backupuid = pvolume_info->backupuid; + } + if (pvolume_info->backupgid_specified) { + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; + cifs_sb->mnt_backupgid = pvolume_info->backupgid; + } + if (pvolume_info->override_uid) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; + if (pvolume_info->override_gid) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; + if (pvolume_info->dynperm) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; + if (pvolume_info->fsc) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; + if (pvolume_info->multiuser) + cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | + CIFS_MOUNT_NO_PERM); + if (pvolume_info->strict_io) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; + if (pvolume_info->direct_io) { + cifs_dbg(FYI, "mounting share using direct i/o\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; + } + if (pvolume_info->mfsymlinks) { + if (pvolume_info->sfu_emul) { + /* + * Our SFU ("Services for Unix" emulation does not allow + * creating symlinks but does allow reading existing SFU + * symlinks (it does allow both creating and reading SFU + * style mknod and FIFOs though). When "mfsymlinks" and + * "sfu" are both enabled at the same time, it allows + * reading both types of symlinks, but will only create + * them with mfsymlinks format. This allows better + * Apple compatibility (probably better for Samba too) + * while still recognizing old Windows style symlinks. + */ + cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); + } + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; + } + + if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) + cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n"); +} + +static void +cleanup_volume_info_contents(struct smb_vol *volume_info) +{ + kfree(volume_info->username); + kzfree(volume_info->password); + kfree(volume_info->UNC); + kfree(volume_info->domainname); + kfree(volume_info->iocharset); + kfree(volume_info->prepath); +} + +void +cifs_cleanup_volume_info(struct smb_vol *volume_info) +{ + if (!volume_info) + return; + cleanup_volume_info_contents(volume_info); + kfree(volume_info); +} + + +#ifdef CONFIG_CIFS_DFS_UPCALL +/* + * cifs_build_path_to_root returns full path to root when we do not have an + * exiting connection (tcon) + */ +static char * +build_unc_path_to_root(const struct smb_vol *vol, + const struct cifs_sb_info *cifs_sb) +{ + char *full_path, *pos; + unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; + unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); + + full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); + if (full_path == NULL) + return ERR_PTR(-ENOMEM); + + strncpy(full_path, vol->UNC, unc_len); + pos = full_path + unc_len; + + if (pplen) { + *pos = CIFS_DIR_SEP(cifs_sb); + strncpy(pos + 1, vol->prepath, pplen); + pos += pplen; + } + + *pos = '\0'; /* add trailing null */ + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path); + return full_path; +} + +/* + * Perform a dfs referral query for a share and (optionally) prefix + * + * If a referral is found, cifs_sb->mountdata will be (re-)allocated + * to a string containing updated options for the submount. Otherwise it + * will be left untouched. + * + * Returns the rc from get_dfs_path to the caller, which can be used to + * determine whether there were referrals. + */ +static int +expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, + struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb, + int check_prefix) +{ + int rc; + unsigned int num_referrals = 0; + struct dfs_info3_param *referrals = NULL; + char *full_path = NULL, *ref_path = NULL, *mdata = NULL; + + full_path = build_unc_path_to_root(volume_info, cifs_sb); + if (IS_ERR(full_path)) + return PTR_ERR(full_path); + + /* For DFS paths, skip the first '\' of the UNC */ + ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1; + + rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls, + &num_referrals, &referrals, cifs_remap(cifs_sb)); + + if (!rc && num_referrals > 0) { + char *fake_devname = NULL; + + mdata = cifs_compose_mount_options(cifs_sb->mountdata, + full_path + 1, referrals, + &fake_devname); + + free_dfs_info_array(referrals, num_referrals); + + if (IS_ERR(mdata)) { + rc = PTR_ERR(mdata); + mdata = NULL; + } else { + cleanup_volume_info_contents(volume_info); + rc = cifs_setup_volume_info(volume_info, mdata, + fake_devname); + } + kfree(fake_devname); + kfree(cifs_sb->mountdata); + cifs_sb->mountdata = mdata; + } + kfree(full_path); + return rc; +} +#endif + +static int +cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname) +{ + int rc = 0; + + if (cifs_parse_mount_options(mount_data, devname, volume_info)) + return -EINVAL; + + if (volume_info->nullauth) { + cifs_dbg(FYI, "Anonymous login\n"); + kfree(volume_info->username); + volume_info->username = NULL; + } else if (volume_info->username) { + /* BB fixme parse for domain name here */ + cifs_dbg(FYI, "Username: %s\n", volume_info->username); + } else { + cifs_dbg(VFS, "No username specified\n"); + /* In userspace mount helper we can get user name from alternate + locations such as env variables and files on disk */ + return -EINVAL; + } + + /* this is needed for ASCII cp to Unicode converts */ + if (volume_info->iocharset == NULL) { + /* load_nls_default cannot return null */ + volume_info->local_nls = load_nls_default(); + } else { + volume_info->local_nls = load_nls(volume_info->iocharset); + if (volume_info->local_nls == NULL) { + cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n", + volume_info->iocharset); + return -ELIBACC; + } + } + + return rc; +} + +struct smb_vol * +cifs_get_volume_info(char *mount_data, const char *devname) +{ + int rc; + struct smb_vol *volume_info; + + volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL); + if (!volume_info) + return ERR_PTR(-ENOMEM); + + rc = cifs_setup_volume_info(volume_info, mount_data, devname); + if (rc) { + cifs_cleanup_volume_info(volume_info); + volume_info = ERR_PTR(rc); + } + + return volume_info; +} + +int +cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) +{ + int rc; + unsigned int xid; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + char *full_path; + struct tcon_link *tlink; +#ifdef CONFIG_CIFS_DFS_UPCALL + int referral_walks_count = 0; +#endif + + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); + if (rc) + return rc; + +#ifdef CONFIG_CIFS_DFS_UPCALL +try_mount_again: + /* cleanup activities if we're chasing a referral */ + if (referral_walks_count) { + if (tcon) + cifs_put_tcon(tcon); + else if (ses) + cifs_put_smb_ses(ses); + + free_xid(xid); + } +#endif + rc = 0; + tcon = NULL; + ses = NULL; + server = NULL; + full_path = NULL; + tlink = NULL; + + xid = get_xid(); + + /* get a reference to a tcp session */ + server = cifs_get_tcp_session(volume_info); + if (IS_ERR(server)) { + rc = PTR_ERR(server); + bdi_destroy(&cifs_sb->bdi); + goto out; + } + + /* get a reference to a SMB session */ + ses = cifs_get_smb_ses(server, volume_info); + if (IS_ERR(ses)) { + rc = PTR_ERR(ses); + ses = NULL; + goto mount_fail_check; + } + + /* search for existing tcon to this server share */ + tcon = cifs_get_tcon(ses, volume_info); + if (IS_ERR(tcon)) { + rc = PTR_ERR(tcon); + tcon = NULL; + goto remote_path_check; + } + + /* tell server which Unix caps we support */ + if (cap_unix(tcon->ses)) { + /* reset of caps checks mount to see if unix extensions + disabled for just this mount */ + reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info); + if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && + (le64_to_cpu(tcon->fsUnixInfo.Capability) & + CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { + rc = -EACCES; + goto mount_fail_check; + } + } else + tcon->unix_ext = 0; /* server does not support them */ + + /* do not care if a following call succeed - informational */ + if (!tcon->ipc && server->ops->qfs_tcon) + server->ops->qfs_tcon(xid, tcon); + + cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); + cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info); + + /* tune readahead according to rsize */ + cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; + +remote_path_check: +#ifdef CONFIG_CIFS_DFS_UPCALL + /* + * Perform an unconditional check for whether there are DFS + * referrals for this path without prefix, to provide support + * for DFS referrals from w2k8 servers which don't seem to respond + * with PATH_NOT_COVERED to requests that include the prefix. + * Chase the referral if found, otherwise continue normally. + */ + if (referral_walks_count == 0) { + int refrc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, + false); + if (!refrc) { + referral_walks_count++; + goto try_mount_again; + } + } +#endif + + /* check if a whole path is not remote */ + if (!rc && tcon) { + if (!server->ops->is_path_accessible) { + rc = -ENOSYS; + goto mount_fail_check; + } + /* + * cifs_build_path_to_root works only when we have a valid tcon + */ + full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon); + if (full_path == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, + full_path); + if (rc != 0 && rc != -EREMOTE) { + kfree(full_path); + goto mount_fail_check; + } + kfree(full_path); + } + + /* get referral if needed */ + if (rc == -EREMOTE) { +#ifdef CONFIG_CIFS_DFS_UPCALL + if (referral_walks_count > MAX_NESTED_LINKS) { + /* + * BB: when we implement proper loop detection, + * we will remove this check. But now we need it + * to prevent an indefinite loop if 'DFS tree' is + * misconfigured (i.e. has loops). + */ + rc = -ELOOP; + goto mount_fail_check; + } + + rc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, true); + + if (!rc) { + referral_walks_count++; + goto try_mount_again; + } + goto mount_fail_check; +#else /* No DFS support, return error on mount */ + rc = -EOPNOTSUPP; +#endif + } + + if (rc) + goto mount_fail_check; + + /* now, hang the tcon off of the superblock */ + tlink = kzalloc(sizeof *tlink, GFP_KERNEL); + if (tlink == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + + tlink->tl_uid = ses->linux_uid; + tlink->tl_tcon = tcon; + tlink->tl_time = jiffies; + set_bit(TCON_LINK_MASTER, &tlink->tl_flags); + set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + + cifs_sb->master_tlink = tlink; + spin_lock(&cifs_sb->tlink_tree_lock); + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + + queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, + TLINK_IDLE_EXPIRE); + +mount_fail_check: + /* on error free sesinfo and tcon struct if needed */ + if (rc) { + /* If find_unc succeeded then rc == 0 so we can not end */ + /* up accidentally freeing someone elses tcon struct */ + if (tcon) + cifs_put_tcon(tcon); + else if (ses) + cifs_put_smb_ses(ses); + else + cifs_put_tcp_session(server); + bdi_destroy(&cifs_sb->bdi); + } + +out: + free_xid(xid); + return rc; +} + +/* + * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon + * pointer may be NULL. + */ +int +CIFSTCon(const unsigned int xid, struct cifs_ses *ses, + const char *tree, struct cifs_tcon *tcon, + const struct nls_table *nls_codepage) +{ + struct smb_hdr *smb_buffer; + struct smb_hdr *smb_buffer_response; + TCONX_REQ *pSMB; + TCONX_RSP *pSMBr; + unsigned char *bcc_ptr; + int rc = 0; + int length; + __u16 bytes_left, count; + + if (ses == NULL) + return -EIO; + + smb_buffer = cifs_buf_get(); + if (smb_buffer == NULL) + return -ENOMEM; + + smb_buffer_response = smb_buffer; + + header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, + NULL /*no tid */ , 4 /*wct */ ); + + smb_buffer->Mid = get_next_mid(ses->server); + smb_buffer->Uid = ses->Suid; + pSMB = (TCONX_REQ *) smb_buffer; + pSMBr = (TCONX_RSP *) smb_buffer_response; + + pSMB->AndXCommand = 0xFF; + pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); + bcc_ptr = &pSMB->Password[0]; + if (!tcon || (ses->server->sec_mode & SECMODE_USER)) { + pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ + *bcc_ptr = 0; /* password is null byte */ + bcc_ptr++; /* skip password */ + /* already aligned so no need to do it below */ + } else { + pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + /* BB FIXME add code to fail this if NTLMv2 or Kerberos + specified as required (when that support is added to + the vfs in the future) as only NTLM or the much + weaker LANMAN (which we do not send by default) is accepted + by Samba (not sure whether other servers allow + NTLMv2 password here) */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + if ((global_secflags & CIFSSEC_MAY_LANMAN) && + (ses->sectype == LANMAN)) + calc_lanman_hash(tcon->password, ses->server->cryptkey, + ses->server->sec_mode & + SECMODE_PW_ENCRYPT ? true : false, + bcc_ptr); + else +#endif /* CIFS_WEAK_PW_HASH */ + rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, + bcc_ptr, nls_codepage); + if (rc) { + cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n", + __func__, rc); + cifs_buf_release(smb_buffer); + return rc; + } + + bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (ses->capabilities & CAP_UNICODE) { + /* must align unicode strings */ + *bcc_ptr = 0; /* null byte password */ + bcc_ptr++; + } + } + + if (ses->server->sign) + smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + if (ses->capabilities & CAP_STATUS32) { + smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; + } + if (ses->capabilities & CAP_DFS) { + smb_buffer->Flags2 |= SMBFLG2_DFS; + } + if (ses->capabilities & CAP_UNICODE) { + smb_buffer->Flags2 |= SMBFLG2_UNICODE; + length = + cifs_strtoUTF16((__le16 *) bcc_ptr, tree, + 6 /* max utf8 char length in bytes */ * + (/* server len*/ + 256 /* share len */), nls_codepage); + bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */ + bcc_ptr += 2; /* skip trailing null */ + } else { /* ASCII */ + strcpy(bcc_ptr, tree); + bcc_ptr += strlen(tree) + 1; + } + strcpy(bcc_ptr, "?????"); + bcc_ptr += strlen("?????"); + bcc_ptr += 1; + count = bcc_ptr - &pSMB->Password[0]; + pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu( + pSMB->hdr.smb_buf_length) + count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, + 0); + + /* above now done in SendReceive */ + if ((rc == 0) && (tcon != NULL)) { + bool is_unicode; + + tcon->tidStatus = CifsGood; + tcon->need_reconnect = false; + tcon->tid = smb_buffer_response->Tid; + bcc_ptr = pByteArea(smb_buffer_response); + bytes_left = get_bcc(smb_buffer_response); + length = strnlen(bcc_ptr, bytes_left - 2); + if (smb_buffer->Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + + + /* skip service field (NB: this field is always ASCII) */ + if (length == 3) { + if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && + (bcc_ptr[2] == 'C')) { + cifs_dbg(FYI, "IPC connection\n"); + tcon->ipc = 1; + } + } else if (length == 2) { + if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { + /* the most common case */ + cifs_dbg(FYI, "disk share connection\n"); + } + } + bcc_ptr += length + 1; + bytes_left -= (length + 1); + strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); + + /* mostly informational -- no need to fail on error here */ + kfree(tcon->nativeFileSystem); + tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr, + bytes_left, is_unicode, + nls_codepage); + + cifs_dbg(FYI, "nativeFileSystem=%s\n", tcon->nativeFileSystem); + + if ((smb_buffer_response->WordCount == 3) || + (smb_buffer_response->WordCount == 7)) + /* field is in same location */ + tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); + else + tcon->Flags = 0; + cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags); + } else if ((rc == 0) && tcon == NULL) { + /* all we need to save for IPC$ connection */ + ses->ipc_tid = smb_buffer_response->Tid; + } + + cifs_buf_release(smb_buffer); + return rc; +} + +static void delayed_free(struct rcu_head *p) +{ + struct cifs_sb_info *sbi = container_of(p, struct cifs_sb_info, rcu); + unload_nls(sbi->local_nls); + kfree(sbi); +} + +void +cifs_umount(struct cifs_sb_info *cifs_sb) +{ + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; + + cancel_delayed_work_sync(&cifs_sb->prune_tlinks); + + spin_lock(&cifs_sb->tlink_tree_lock); + while ((node = rb_first(root))) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(node, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); + + bdi_destroy(&cifs_sb->bdi); + kfree(cifs_sb->mountdata); + call_rcu(&cifs_sb->rcu, delayed_free); +} + +int +cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + + if (!server->ops->need_neg || !server->ops->negotiate) + return -ENOSYS; + + /* only send once per connect */ + if (!server->ops->need_neg(server)) + return 0; + + set_credits(server, 1); + + rc = server->ops->negotiate(xid, ses); + if (rc == 0) { + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus == CifsNeedNegotiate) + server->tcpStatus = CifsGood; + else + rc = -EHOSTDOWN; + spin_unlock(&GlobalMid_Lock); + } + + return rc; +} + +int +cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + struct nls_table *nls_info) +{ + int rc = -ENOSYS; + struct TCP_Server_Info *server = ses->server; + + ses->capabilities = server->capabilities; + if (linuxExtEnabled == 0) + ses->capabilities &= (~server->vals->cap_unix); + + cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", + server->sec_mode, server->capabilities, server->timeAdj); + + if (server->ops->sess_setup) + rc = server->ops->sess_setup(xid, ses, nls_info); + + if (rc) + cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc); + + return rc; +} + +static int +cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) +{ + vol->sectype = ses->sectype; + + /* krb5 is special, since we don't need username or pw */ + if (vol->sectype == Kerberos) + return 0; + + return cifs_set_cifscreds(vol, ses); +} + +static struct cifs_tcon * +cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) +{ + int rc; + struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); + struct cifs_ses *ses; + struct cifs_tcon *tcon = NULL; + struct smb_vol *vol_info; + + vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL); + if (vol_info == NULL) + return ERR_PTR(-ENOMEM); + + vol_info->local_nls = cifs_sb->local_nls; + vol_info->linux_uid = fsuid; + vol_info->cred_uid = fsuid; + vol_info->UNC = master_tcon->treeName; + vol_info->retry = master_tcon->retry; + vol_info->nocase = master_tcon->nocase; + vol_info->local_lease = master_tcon->local_lease; + vol_info->no_linux_ext = !master_tcon->unix_ext; + vol_info->sectype = master_tcon->ses->sectype; + vol_info->sign = master_tcon->ses->sign; + + rc = cifs_set_vol_auth(vol_info, master_tcon->ses); + if (rc) { + tcon = ERR_PTR(rc); + goto out; + } + + /* get a reference for the same TCP session */ + spin_lock(&cifs_tcp_ses_lock); + ++master_tcon->ses->server->srv_count; + spin_unlock(&cifs_tcp_ses_lock); + + ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info); + if (IS_ERR(ses)) { + tcon = (struct cifs_tcon *)ses; + cifs_put_tcp_session(master_tcon->ses->server); + goto out; + } + + tcon = cifs_get_tcon(ses, vol_info); + if (IS_ERR(tcon)) { + cifs_put_smb_ses(ses); + goto out; + } + + if (cap_unix(ses)) + reset_cifs_unix_caps(0, tcon, NULL, vol_info); +out: + kfree(vol_info->username); + kfree(vol_info->password); + kfree(vol_info); + + return tcon; +} + +struct cifs_tcon * +cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) +{ + return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); +} + +/* find and return a tlink with given uid */ +static struct tcon_link * +tlink_rb_search(struct rb_root *root, kuid_t uid) +{ + struct rb_node *node = root->rb_node; + struct tcon_link *tlink; + + while (node) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + + if (uid_gt(tlink->tl_uid, uid)) + node = node->rb_left; + else if (uid_lt(tlink->tl_uid, uid)) + node = node->rb_right; + else + return tlink; + } + return NULL; +} + +/* insert a tcon_link into the tree */ +static void +tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tcon_link *tlink; + + while (*new) { + tlink = rb_entry(*new, struct tcon_link, tl_rbnode); + parent = *new; + + if (uid_gt(tlink->tl_uid, new_tlink->tl_uid)) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_tlink->tl_rbnode, parent, new); + rb_insert_color(&new_tlink->tl_rbnode, root); +} + +/* + * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the + * current task. + * + * If the superblock doesn't refer to a multiuser mount, then just return + * the master tcon for the mount. + * + * First, search the rbtree for an existing tcon for this fsuid. If one + * exists, then check to see if it's pending construction. If it is then wait + * for construction to complete. Once it's no longer pending, check to see if + * it failed and either return an error or retry construction, depending on + * the timeout. + * + * If one doesn't exist then insert a new tcon_link struct into the tree and + * try to construct a new one. + */ +struct tcon_link * +cifs_sb_tlink(struct cifs_sb_info *cifs_sb) +{ + int ret; + kuid_t fsuid = current_fsuid(); + struct tcon_link *tlink, *newtlink; + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); + + spin_lock(&cifs_sb->tlink_tree_lock); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); + if (tlink) + cifs_get_tlink(tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + + if (tlink == NULL) { + newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); + if (newtlink == NULL) + return ERR_PTR(-ENOMEM); + newtlink->tl_uid = fsuid; + newtlink->tl_tcon = ERR_PTR(-EACCES); + set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); + set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); + cifs_get_tlink(newtlink); + + spin_lock(&cifs_sb->tlink_tree_lock); + /* was one inserted after previous search? */ + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); + if (tlink) { + cifs_get_tlink(tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + kfree(newtlink); + goto wait_for_construction; + } + tlink = newtlink; + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + } else { +wait_for_construction: + ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, + TASK_INTERRUPTIBLE); + if (ret) { + cifs_put_tlink(tlink); + return ERR_PTR(-ERESTARTSYS); + } + + /* if it's good, return it */ + if (!IS_ERR(tlink->tl_tcon)) + return tlink; + + /* return error if we tried this already recently */ + if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) { + cifs_put_tlink(tlink); + return ERR_PTR(-EACCES); + } + + if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags)) + goto wait_for_construction; + } + + tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid); + clear_bit(TCON_LINK_PENDING, &tlink->tl_flags); + wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING); + + if (IS_ERR(tlink->tl_tcon)) { + cifs_put_tlink(tlink); + return ERR_PTR(-EACCES); + } + + return tlink; +} + +/* + * periodic workqueue job that scans tcon_tree for a superblock and closes + * out tcons. + */ +static void +cifs_prune_tlinks(struct work_struct *work) +{ + struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, + prune_tlinks.work); + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node = rb_first(root); + struct rb_node *tmp; + struct tcon_link *tlink; + + /* + * Because we drop the spinlock in the loop in order to put the tlink + * it's not guarded against removal of links from the tree. The only + * places that remove entries from the tree are this function and + * umounts. Because this function is non-reentrant and is canceled + * before umount can proceed, this is safe. + */ + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + tlink = rb_entry(tmp, struct tcon_link, tl_rbnode); + + if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) || + atomic_read(&tlink->tl_count) != 0 || + time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies)) + continue; + + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(tmp, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); + + queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, + TLINK_IDLE_EXPIRE); +} diff --git a/kernel/fs/cifs/dir.c b/kernel/fs/cifs/dir.c new file mode 100644 index 000000000..c3eb998a9 --- /dev/null +++ b/kernel/fs/cifs/dir.c @@ -0,0 +1,924 @@ +/* + * fs/cifs/dir.c + * + * vfs operations that deal with dentries + * + * Copyright (C) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" + +static void +renew_parental_timestamps(struct dentry *direntry) +{ + /* BB check if there is a way to get the kernel to do this or if we + really need this */ + do { + direntry->d_time = jiffies; + direntry = direntry->d_parent; + } while (!IS_ROOT(direntry)); +} + +char * +cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon) +{ + int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; + int dfsplen; + char *full_path = NULL; + + /* if no prefix path, simply set path to the root of share to "" */ + if (pplen == 0) { + full_path = kzalloc(1, GFP_KERNEL); + return full_path; + } + + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; + + full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); + if (full_path == NULL) + return full_path; + + if (dfsplen) + strncpy(full_path, tcon->treeName, dfsplen); + full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); + strncpy(full_path + dfsplen + 1, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + full_path[dfsplen + pplen] = 0; /* add trailing null */ + return full_path; +} + +/* Note: caller must free return buffer */ +char * +build_path_from_dentry(struct dentry *direntry) +{ + struct dentry *temp; + int namelen; + int dfsplen; + char *full_path; + char dirsep; + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + unsigned seq; + + dirsep = CIFS_DIR_SEP(cifs_sb); + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; +cifs_bp_rename_retry: + namelen = dfsplen; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + for (temp = direntry; !IS_ROOT(temp);) { + namelen += (1 + temp->d_name.len); + temp = temp->d_parent; + if (temp == NULL) { + cifs_dbg(VFS, "corrupt dentry\n"); + rcu_read_unlock(); + return NULL; + } + } + rcu_read_unlock(); + + full_path = kmalloc(namelen+1, GFP_KERNEL); + if (full_path == NULL) + return full_path; + full_path[namelen] = 0; /* trailing null */ + rcu_read_lock(); + for (temp = direntry; !IS_ROOT(temp);) { + spin_lock(&temp->d_lock); + namelen -= 1 + temp->d_name.len; + if (namelen < 0) { + spin_unlock(&temp->d_lock); + break; + } else { + full_path[namelen] = dirsep; + strncpy(full_path + namelen + 1, temp->d_name.name, + temp->d_name.len); + cifs_dbg(FYI, "name: %s\n", full_path + namelen); + } + spin_unlock(&temp->d_lock); + temp = temp->d_parent; + if (temp == NULL) { + cifs_dbg(VFS, "corrupt dentry\n"); + rcu_read_unlock(); + kfree(full_path); + return NULL; + } + } + rcu_read_unlock(); + if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { + cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n", + namelen, dfsplen); + /* presumably this is only possible if racing with a rename + of one of the parent directories (we can not lock the dentries + above us to prevent this, but retrying should be harmless) */ + kfree(full_path); + goto cifs_bp_rename_retry; + } + /* DIR_SEP already set for byte 0 / vs \ but not for + subsequent slashes in prepath which currently must + be entered the right way - not sure if there is an alternative + since the '\' is a valid posix character so we can not switch + those safely to '/' if any are found in the middle of the prepath */ + /* BB test paths to Windows with '/' in the midst of prepath */ + + if (dfsplen) { + strncpy(full_path, tcon->treeName, dfsplen); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { + int i; + for (i = 0; i < dfsplen; i++) { + if (full_path[i] == '\\') + full_path[i] = '/'; + } + } + } + return full_path; +} + +/* + * Don't allow the separator character in a path component. + * The VFS will not allow "/", but "\" is allowed by posix. + */ +static int +check_name(struct dentry *direntry) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + int i; + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + for (i = 0; i < direntry->d_name.len; i++) { + if (direntry->d_name.name[i] == '\\') { + cifs_dbg(FYI, "Invalid file name\n"); + return -EINVAL; + } + } + } + return 0; +} + + +/* Inode operations in similar order to how they appear in Linux file fs.h */ + +static int +cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, + struct tcon_link *tlink, unsigned oflags, umode_t mode, + __u32 *oplock, struct cifs_fid *fid) +{ + int rc = -ENOENT; + int create_options = CREATE_NOT_DIR; + int desired_access; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = tlink_tcon(tlink); + char *full_path = NULL; + FILE_ALL_INFO *buf = NULL; + struct inode *newinode = NULL; + int disposition; + struct TCP_Server_Info *server = tcon->ses->server; + struct cifs_open_parms oparms; + + *oplock = 0; + if (tcon->ses->server->oplocks) + *oplock = REQ_OPLOCK; + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = cifs_posix_open(full_path, &newinode, inode->i_sb, mode, + oflags, oplock, &fid->netfid, xid); + switch (rc) { + case 0: + if (newinode == NULL) { + /* query inode info */ + goto cifs_create_get_file_info; + } + + if (!S_ISREG(newinode->i_mode)) { + /* + * The server may allow us to open things like + * FIFOs, but the client isn't set up to deal + * with that. If it's not a regular file, just + * close it and proceed as if it were a normal + * lookup. + */ + CIFSSMBClose(xid, tcon, fid->netfid); + goto cifs_create_get_file_info; + } + /* success, no need to query */ + goto cifs_create_set_dentry; + + case -ENOENT: + goto cifs_create_get_file_info; + + case -EIO: + case -EINVAL: + /* + * EIO could indicate that (posix open) operation is not + * supported, despite what server claimed in capability + * negotiation. + * + * POSIX open in samba versions 3.3.1 and earlier could + * incorrectly fail with invalid parameter. + */ + tcon->broken_posix_open = true; + break; + + case -EREMOTE: + case -EOPNOTSUPP: + /* + * EREMOTE indicates DFS junction, which is not handled + * in posix open. If either that or op not supported + * returned, follow the normal lookup. + */ + break; + + default: + goto out; + } + /* + * fallthrough to retry, using older open call, this is case + * where server does not support this SMB level, and falsely + * claims capability (also get here for DFS case which should be + * rare for path not covered on files) + */ + } + + desired_access = 0; + if (OPEN_FMODE(oflags) & FMODE_READ) + desired_access |= GENERIC_READ; /* is this too little? */ + if (OPEN_FMODE(oflags) & FMODE_WRITE) + desired_access |= GENERIC_WRITE; + + disposition = FILE_OVERWRITE_IF; + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + disposition = FILE_CREATE; + else if ((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + disposition = FILE_OVERWRITE_IF; + else if ((oflags & O_CREAT) == O_CREAT) + disposition = FILE_OPEN_IF; + else + cifs_dbg(FYI, "Create flag not set in create function\n"); + + /* + * BB add processing to set equivalent of mode - e.g. via CreateX with + * ACLs + */ + + if (!server->ops->open) { + rc = -ENOSYS; + goto out; + } + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + /* + * if we're not using unix extensions, see if we need to set + * ATTR_READONLY on the create call + */ + if (!tcon->unix_ext && (mode & S_IWUGO) == 0) + create_options |= CREATE_OPTION_READONLY; + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = desired_access; + oparms.create_options = create_options; + oparms.disposition = disposition; + oparms.path = full_path; + oparms.fid = fid; + oparms.reconnect = false; + + rc = server->ops->open(xid, &oparms, oplock, buf); + if (rc) { + cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); + goto out; + } + + /* + * If Open reported that we actually created a file then we now have to + * set the mode if possible. + */ + if ((tcon->unix_ext) && (*oplock & CIFS_CREATE_ACTION)) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = inode->i_gid; + else + args.gid = current_fsgid(); + } else { + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ + } + CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid, + current->tgid); + } else { + /* + * BB implement mode setting via Windows security + * descriptors e.g. + */ + /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ + + /* Could set r/o dos attribute if mode & 0222 == 0 */ + } + +cifs_create_get_file_info: + /* server might mask mode so we have to query for it */ + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, + xid); + else { + rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, + xid, fid); + if (newinode) { + if (server->ops->set_lease_key) + server->ops->set_lease_key(newinode, fid); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if ((*oplock & CIFS_CREATE_ACTION) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); + } + } + } + +cifs_create_set_dentry: + if (rc != 0) { + cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n", + rc); + if (server->ops->close) + server->ops->close(xid, tcon, fid); + goto out; + } + d_drop(direntry); + d_add(direntry, newinode); + +out: + kfree(buf); + kfree(full_path); + return rc; +} + +int +cifs_atomic_open(struct inode *inode, struct dentry *direntry, + struct file *file, unsigned oflags, umode_t mode, + int *opened) +{ + int rc; + unsigned int xid; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct cifs_fid fid; + struct cifs_pending_open open; + __u32 oplock; + struct cifsFileInfo *file_info; + + /* + * Posix open is only called (at lookup time) for file create now. For + * opens (rather than creates), because we do not know if it is a file + * or directory yet, and current Samba no longer allows us to do posix + * open on dirs, we could end up wasting an open call on what turns out + * to be a dir. For file opens, we wait to call posix open till + * cifs_open. It could be added to atomic_open in the future but the + * performance tradeoff of the extra network request when EISDIR or + * EACCES is returned would have to be weighed against the 50% reduction + * in network traffic in the other paths. + */ + if (!(oflags & O_CREAT)) { + struct dentry *res; + + /* + * Check for hashed negative dentry. We have already revalidated + * the dentry and it is fine. No need to perform another lookup. + */ + if (!d_unhashed(direntry)) + return -ENOENT; + + res = cifs_lookup(inode, direntry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + return finish_no_open(file, res); + } + + rc = check_name(direntry); + if (rc) + return rc; + + xid = get_xid(); + + cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", + inode, direntry, direntry); + + tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto out_free_xid; + } + + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + if (server->ops->new_lease_key) + server->ops->new_lease_key(&fid); + + cifs_add_pending_open(&fid, tlink, &open); + + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, + &oplock, &fid); + + if (rc) { + cifs_del_pending_open(&open); + goto out; + } + + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + *opened |= FILE_CREATED; + + rc = finish_open(file, direntry, generic_file_open, opened); + if (rc) { + if (server->ops->close) + server->ops->close(xid, tcon, &fid); + cifs_del_pending_open(&open); + goto out; + } + + if (file->f_flags & O_DIRECT && + CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + file->f_op = &cifs_file_direct_nobrl_ops; + else + file->f_op = &cifs_file_direct_ops; + } + + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); + if (file_info == NULL) { + if (server->ops->close) + server->ops->close(xid, tcon, &fid); + cifs_del_pending_open(&open); + fput(file); + rc = -ENOMEM; + } + +out: + cifs_put_tlink(tlink); +out_free_xid: + free_xid(xid); + return rc; +} + +int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, + bool excl) +{ + int rc; + unsigned int xid = get_xid(); + /* + * BB below access is probably too much for mknod to request + * but we have to do query and setpathinfo so requesting + * less could fail (unless we want to request getatr and setatr + * permissions (only). At least for POSIX we do not have to + * request so much. + */ + unsigned oflags = O_EXCL | O_CREAT | O_RDWR; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct cifs_fid fid; + __u32 oplock; + + cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", + inode, direntry, direntry); + + tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); + rc = PTR_ERR(tlink); + if (IS_ERR(tlink)) + goto out_free_xid; + + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + if (server->ops->new_lease_key) + server->ops->new_lease_key(&fid); + + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, + &oplock, &fid); + if (!rc && server->ops->close) + server->ops->close(xid, tcon, &fid); + + cifs_put_tlink(tlink); +out_free_xid: + free_xid(xid); + return rc; +} + +int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, + dev_t device_number) +{ + int rc = -EPERM; + unsigned int xid; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct cifs_io_parms io_parms; + char *full_path = NULL; + struct inode *newinode = NULL; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + FILE_ALL_INFO *buf = NULL; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + if (!old_valid_dev(device_number)) + return -EINVAL; + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + tcon = tlink_tcon(tlink); + + xid = get_xid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto mknod_out; + } + + if (tcon->unix_ext) { + struct cifs_unix_set_info_args args = { + .mode = mode & ~current_umask(), + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = device_number, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = current_fsuid(); + args.gid = current_fsgid(); + } else { + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ + } + rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc) + goto mknod_out; + + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + + if (rc == 0) + d_instantiate(direntry, newinode); + goto mknod_out; + } + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto mknod_out; + + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + kfree(full_path); + rc = -ENOMEM; + free_xid(xid); + return rc; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto mknod_out; + + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(device_number)); + pdev->minor = cpu_to_le64(MINOR(device_number)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(device_number)); + pdev->minor = cpu_to_le64(MINOR(device_number)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } /* else if (S_ISFIFO) */ + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(direntry); + + /* FIXME: add code here to set EAs */ + +mknod_out: + kfree(full_path); + kfree(buf); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +} + +struct dentry * +cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, + unsigned int flags) +{ + unsigned int xid; + int rc = 0; /* to get around spurious gcc warning, set to zero here */ + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct inode *newInode = NULL; + char *full_path = NULL; + + xid = get_xid(); + + cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", + parent_dir_inode, direntry, direntry); + + /* check whether path exists */ + + cifs_sb = CIFS_SB(parent_dir_inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + free_xid(xid); + return (struct dentry *)tlink; + } + pTcon = tlink_tcon(tlink); + + rc = check_name(direntry); + if (rc) + goto lookup_out; + + /* can not grab the rename sem here since it would + deadlock in the cases (beginning of sys_rename itself) + in which we already have the sb rename sem */ + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto lookup_out; + } + + if (d_really_is_positive(direntry)) { + cifs_dbg(FYI, "non-NULL inode in lookup\n"); + } else { + cifs_dbg(FYI, "NULL inode in lookup\n"); + } + cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", + full_path, d_inode(direntry)); + + if (pTcon->unix_ext) { + rc = cifs_get_inode_info_unix(&newInode, full_path, + parent_dir_inode->i_sb, xid); + } else { + rc = cifs_get_inode_info(&newInode, full_path, NULL, + parent_dir_inode->i_sb, xid, NULL); + } + + if ((rc == 0) && (newInode != NULL)) { + d_add(direntry, newInode); + /* since paths are not looked up by component - the parent + directories are presumed to be good here */ + renew_parental_timestamps(direntry); + + } else if (rc == -ENOENT) { + rc = 0; + direntry->d_time = jiffies; + d_add(direntry, NULL); + /* if it was once a directory (but how can we tell?) we could do + shrink_dcache_parent(direntry); */ + } else if (rc != -EACCES) { + cifs_dbg(FYI, "Unexpected lookup error %d\n", rc); + /* We special case check for Access Denied - since that + is a common return code */ + } + +lookup_out: + kfree(full_path); + cifs_put_tlink(tlink); + free_xid(xid); + return ERR_PTR(rc); +} + +static int +cifs_d_revalidate(struct dentry *direntry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (d_really_is_positive(direntry)) { + if (cifs_revalidate_dentry(direntry)) + return 0; + else { + /* + * If the inode wasn't known to be a dfs entry when + * the dentry was instantiated, such as when created + * via ->readdir(), it needs to be set now since the + * attributes will have been updated by + * cifs_revalidate_dentry(). + */ + if (IS_AUTOMOUNT(d_inode(direntry)) && + !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) { + spin_lock(&direntry->d_lock); + direntry->d_flags |= DCACHE_NEED_AUTOMOUNT; + spin_unlock(&direntry->d_lock); + } + + return 1; + } + } + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!flags) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + + if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) + return 0; + + return 1; +} + +/* static int cifs_d_delete(struct dentry *direntry) +{ + int rc = 0; + + cifs_dbg(FYI, "In cifs d_delete, name = %pd\n", direntry); + + return rc; +} */ + +const struct dentry_operations cifs_dentry_ops = { + .d_revalidate = cifs_d_revalidate, + .d_automount = cifs_dfs_d_automount, +/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ +}; + +static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) +{ + struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; + unsigned long hash; + wchar_t c; + int i, charlen; + + hash = init_name_hash(); + for (i = 0; i < q->len; i += charlen) { + charlen = codepage->char2uni(&q->name[i], q->len - i, &c); + /* error out if we can't convert the character */ + if (unlikely(charlen < 0)) + return charlen; + hash = partial_name_hash(cifs_toupper(c), hash); + } + q->hash = end_name_hash(hash); + + return 0; +} + +static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls; + wchar_t c1, c2; + int i, l1, l2; + + /* + * We make the assumption here that uppercase characters in the local + * codepage are always the same length as their lowercase counterparts. + * + * If that's ever not the case, then this will fail to match it. + */ + if (name->len != len) + return 1; + + for (i = 0; i < len; i += l1) { + /* Convert characters in both strings to UTF-16. */ + l1 = codepage->char2uni(&str[i], len - i, &c1); + l2 = codepage->char2uni(&name->name[i], name->len - i, &c2); + + /* + * If we can't convert either character, just declare it to + * be 1 byte long and compare the original byte. + */ + if (unlikely(l1 < 0 && l2 < 0)) { + if (str[i] != name->name[i]) + return 1; + l1 = 1; + continue; + } + + /* + * Here, we again ass|u|me that upper/lowercase versions of + * a character are the same length in the local NLS. + */ + if (l1 != l2) + return 1; + + /* Now compare uppercase versions of these characters */ + if (cifs_toupper(c1) != cifs_toupper(c2)) + return 1; + } + + return 0; +} + +const struct dentry_operations cifs_ci_dentry_ops = { + .d_revalidate = cifs_d_revalidate, + .d_hash = cifs_ci_hash, + .d_compare = cifs_ci_compare, + .d_automount = cifs_dfs_d_automount, +}; diff --git a/kernel/fs/cifs/dns_resolve.c b/kernel/fs/cifs/dns_resolve.c new file mode 100644 index 000000000..7ede73065 --- /dev/null +++ b/kernel/fs/cifs/dns_resolve.c @@ -0,0 +1,99 @@ +/* + * fs/cifs/dns_resolve.c + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * Contains the CIFS DFS upcall routines used for hostname to + * IP address translation. + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "dns_resolve.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +/** + * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. + * @unc: UNC path specifying the server (with '/' as delimiter) + * @ip_addr: Where to return the IP address. + * + * The IP address will be returned in string form, and the caller is + * responsible for freeing it. + * + * Returns length of result on success, -ve on error. + */ +int +dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) +{ + struct sockaddr_storage ss; + const char *hostname, *sep; + char *name; + int len, rc; + + if (!ip_addr || !unc) + return -EINVAL; + + len = strlen(unc); + if (len < 3) { + cifs_dbg(FYI, "%s: unc is too short: %s\n", __func__, unc); + return -EINVAL; + } + + /* Discount leading slashes for cifs */ + len -= 2; + hostname = unc + 2; + + /* Search for server name delimiter */ + sep = memchr(hostname, '/', len); + if (sep) + len = sep - hostname; + else + cifs_dbg(FYI, "%s: probably server name is whole unc: %s\n", + __func__, unc); + + /* Try to interpret hostname as an IPv4 or IPv6 address */ + rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len); + if (rc > 0) + goto name_is_IP_address; + + /* Perform the upcall */ + rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); + if (rc < 0) + cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n", + __func__, len, len, hostname); + else + cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n", + __func__, len, len, hostname, *ip_addr); + return rc; + +name_is_IP_address: + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + memcpy(name, hostname, len); + name[len] = 0; + cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n", + __func__, name); + *ip_addr = name; + return 0; +} diff --git a/kernel/fs/cifs/dns_resolve.h b/kernel/fs/cifs/dns_resolve.h new file mode 100644 index 000000000..d3f5d27f4 --- /dev/null +++ b/kernel/fs/cifs/dns_resolve.h @@ -0,0 +1,30 @@ +/* + * fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS + * Handles host name to IP address resolution + * + * Copyright (c) International Business Machines Corp., 2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _DNS_RESOLVE_H +#define _DNS_RESOLVE_H + +#ifdef __KERNEL__ +extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); +#endif /* KERNEL */ + +#endif /* _DNS_RESOLVE_H */ diff --git a/kernel/fs/cifs/export.c b/kernel/fs/cifs/export.c new file mode 100644 index 000000000..ce8b7f677 --- /dev/null +++ b/kernel/fs/cifs/export.c @@ -0,0 +1,67 @@ +/* + * fs/cifs/export.c + * + * Copyright (C) International Business Machines Corp., 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Common Internet FileSystem (CIFS) client + * + * Operations related to support for exporting files via NFSD + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + /* + * See Documentation/filesystems/nfs/Exporting + * and examples in fs/exportfs + * + * Since cifs is a network file system, an "fsid" must be included for + * any nfs exports file entries which refer to cifs paths. In addition + * the cifs mount must be mounted with the "serverino" option (ie use stable + * server inode numbers instead of locally generated temporary ones). + * Although cifs inodes do not use generation numbers (have generation number + * of zero) - the inode number alone should be good enough for simple cases + * in which users want to export cifs shares with NFS. The decode and encode + * could be improved by using a new routine which expects 64 bit inode numbers + * instead of the default 32 bit routines in fs/exportfs + * + */ + +#include +#include +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +#ifdef CONFIG_CIFS_NFSD_EXPORT +static struct dentry *cifs_get_parent(struct dentry *dentry) +{ + /* BB need to add code here eventually to enable export via NFSD */ + cifs_dbg(FYI, "get parent for %p\n", dentry); + return ERR_PTR(-EACCES); +} + +const struct export_operations cifs_export_ops = { + .get_parent = cifs_get_parent, +/* Following five export operations are unneeded so far and can default: + .get_dentry = + .get_name = + .find_exported_dentry = + .decode_fh = + .encode_fs = */ +}; + +#endif /* CONFIG_CIFS_NFSD_EXPORT */ + diff --git a/kernel/fs/cifs/file.c b/kernel/fs/cifs/file.c new file mode 100644 index 000000000..3f50cee79 --- /dev/null +++ b/kernel/fs/cifs/file.c @@ -0,0 +1,3896 @@ +/* + * fs/cifs/file.c + * + * vfs operations that deal with files + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "fscache.h" + + +static inline int cifs_convert_flags(unsigned int flags) +{ + if ((flags & O_ACCMODE) == O_RDONLY) + return GENERIC_READ; + else if ((flags & O_ACCMODE) == O_WRONLY) + return GENERIC_WRITE; + else if ((flags & O_ACCMODE) == O_RDWR) { + /* GENERIC_ALL is too much permission to request + can cause unnecessary access denied on create */ + /* return GENERIC_ALL; */ + return (GENERIC_READ | GENERIC_WRITE); + } + + return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES | + FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA | + FILE_READ_DATA); +} + +static u32 cifs_posix_convert_flags(unsigned int flags) +{ + u32 posix_flags = 0; + + if ((flags & O_ACCMODE) == O_RDONLY) + posix_flags = SMB_O_RDONLY; + else if ((flags & O_ACCMODE) == O_WRONLY) + posix_flags = SMB_O_WRONLY; + else if ((flags & O_ACCMODE) == O_RDWR) + posix_flags = SMB_O_RDWR; + + if (flags & O_CREAT) { + posix_flags |= SMB_O_CREAT; + if (flags & O_EXCL) + posix_flags |= SMB_O_EXCL; + } else if (flags & O_EXCL) + cifs_dbg(FYI, "Application %s pid %d has incorrectly set O_EXCL flag but not O_CREAT on file open. Ignoring O_EXCL\n", + current->comm, current->tgid); + + if (flags & O_TRUNC) + posix_flags |= SMB_O_TRUNC; + /* be safe and imply O_SYNC for O_DSYNC */ + if (flags & O_DSYNC) + posix_flags |= SMB_O_SYNC; + if (flags & O_DIRECTORY) + posix_flags |= SMB_O_DIRECTORY; + if (flags & O_NOFOLLOW) + posix_flags |= SMB_O_NOFOLLOW; + if (flags & O_DIRECT) + posix_flags |= SMB_O_DIRECT; + + return posix_flags; +} + +static inline int cifs_get_disposition(unsigned int flags) +{ + if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + return FILE_CREATE; + else if ((flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + return FILE_OVERWRITE_IF; + else if ((flags & O_CREAT) == O_CREAT) + return FILE_OPEN_IF; + else if ((flags & O_TRUNC) == O_TRUNC) + return FILE_OVERWRITE; + else + return FILE_OPEN; +} + +int cifs_posix_open(char *full_path, struct inode **pinode, + struct super_block *sb, int mode, unsigned int f_flags, + __u32 *poplock, __u16 *pnetfid, unsigned int xid) +{ + int rc; + FILE_UNIX_BASIC_INFO *presp_data; + __u32 posix_flags = 0; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_fattr fattr; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + + cifs_dbg(FYI, "posix open %s\n", full_path); + + presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (presp_data == NULL) + return -ENOMEM; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto posix_open_ret; + } + + tcon = tlink_tcon(tlink); + mode &= ~current_umask(); + + posix_flags = cifs_posix_convert_flags(f_flags); + rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data, + poplock, full_path, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + cifs_put_tlink(tlink); + + if (rc) + goto posix_open_ret; + + if (presp_data->Type == cpu_to_le32(-1)) + goto posix_open_ret; /* open ok, caller does qpathinfo */ + + if (!pinode) + goto posix_open_ret; /* caller does not need info */ + + cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb); + + /* get new inode and set it up */ + if (*pinode == NULL) { + cifs_fill_uniqueid(sb, &fattr); + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) { + rc = -ENOMEM; + goto posix_open_ret; + } + } else { + cifs_fattr_to_inode(*pinode, &fattr); + } + +posix_open_ret: + kfree(presp_data); + return rc; +} + +static int +cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, + struct cifs_fid *fid, unsigned int xid) +{ + int rc; + int desired_access; + int disposition; + int create_options = CREATE_NOT_DIR; + FILE_ALL_INFO *buf; + struct TCP_Server_Info *server = tcon->ses->server; + struct cifs_open_parms oparms; + + if (!server->ops->open) + return -ENOSYS; + + desired_access = cifs_convert_flags(f_flags); + +/********************************************************************* + * open flag mapping table: + * + * POSIX Flag CIFS Disposition + * ---------- ---------------- + * O_CREAT FILE_OPEN_IF + * O_CREAT | O_EXCL FILE_CREATE + * O_CREAT | O_TRUNC FILE_OVERWRITE_IF + * O_TRUNC FILE_OVERWRITE + * none of the above FILE_OPEN + * + * Note that there is not a direct match between disposition + * FILE_SUPERSEDE (ie create whether or not file exists although + * O_CREAT | O_TRUNC is similar but truncates the existing + * file rather than creating a new file as FILE_SUPERSEDE does + * (which uses the attributes / metadata passed in on open call) + *? + *? O_SYNC is a reasonable match to CIFS writethrough flag + *? and the read write flags match reasonably. O_LARGEFILE + *? is irrelevant because largefile support is always used + *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY, + * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation + *********************************************************************/ + + disposition = cifs_get_disposition(f_flags); + + /* BB pass O_SYNC flag through on file attributes .. BB */ + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = desired_access; + oparms.create_options = create_options; + oparms.disposition = disposition; + oparms.path = full_path; + oparms.fid = fid; + oparms.reconnect = false; + + rc = server->ops->open(xid, &oparms, oplock, buf); + + if (rc) + goto out; + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, + xid); + else + rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, + xid, fid); + +out: + kfree(buf); + return rc; +} + +static bool +cifs_has_mand_locks(struct cifsInodeInfo *cinode) +{ + struct cifs_fid_locks *cur; + bool has_locks = false; + + down_read(&cinode->lock_sem); + list_for_each_entry(cur, &cinode->llist, llist) { + if (!list_empty(&cur->locks)) { + has_locks = true; + break; + } + } + up_read(&cinode->lock_sem); + return has_locks; +} + +struct cifsFileInfo * +cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifsFileInfo *cfile; + struct cifs_fid_locks *fdlocks; + struct cifs_tcon *tcon = tlink_tcon(tlink); + struct TCP_Server_Info *server = tcon->ses->server; + + cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (cfile == NULL) + return cfile; + + fdlocks = kzalloc(sizeof(struct cifs_fid_locks), GFP_KERNEL); + if (!fdlocks) { + kfree(cfile); + return NULL; + } + + INIT_LIST_HEAD(&fdlocks->locks); + fdlocks->cfile = cfile; + cfile->llist = fdlocks; + down_write(&cinode->lock_sem); + list_add(&fdlocks->llist, &cinode->llist); + up_write(&cinode->lock_sem); + + cfile->count = 1; + cfile->pid = current->tgid; + cfile->uid = current_fsuid(); + cfile->dentry = dget(dentry); + cfile->f_flags = file->f_flags; + cfile->invalidHandle = false; + cfile->tlink = cifs_get_tlink(tlink); + INIT_WORK(&cfile->oplock_break, cifs_oplock_break); + mutex_init(&cfile->fh_mutex); + + cifs_sb_active(inode->i_sb); + + /* + * If the server returned a read oplock and we have mandatory brlocks, + * set oplock level to None. + */ + if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) { + cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); + oplock = 0; + } + + spin_lock(&cifs_file_list_lock); + if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) + oplock = fid->pending_open->oplock; + list_del(&fid->pending_open->olist); + + fid->purge_cache = false; + server->ops->set_fid(cfile, fid, oplock); + + list_add(&cfile->tlist, &tcon->openFileList); + /* if readable file instance put first in list*/ + if (file->f_mode & FMODE_READ) + list_add(&cfile->flist, &cinode->openFileList); + else + list_add_tail(&cfile->flist, &cinode->openFileList); + spin_unlock(&cifs_file_list_lock); + + if (fid->purge_cache) + cifs_zap_mapping(inode); + + file->private_data = cfile; + return cfile; +} + +struct cifsFileInfo * +cifsFileInfo_get(struct cifsFileInfo *cifs_file) +{ + spin_lock(&cifs_file_list_lock); + cifsFileInfo_get_locked(cifs_file); + spin_unlock(&cifs_file_list_lock); + return cifs_file; +} + +/* + * Release a reference on the file private data. This may involve closing + * the filehandle out on the server. Must be called without holding + * cifs_file_list_lock. + */ +void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + struct inode *inode = d_inode(cifs_file->dentry); + struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct super_block *sb = inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifsLockInfo *li, *tmp; + struct cifs_fid fid; + struct cifs_pending_open open; + bool oplock_break_cancelled; + + spin_lock(&cifs_file_list_lock); + if (--cifs_file->count > 0) { + spin_unlock(&cifs_file_list_lock); + return; + } + + if (server->ops->get_lease_key) + server->ops->get_lease_key(inode, &fid); + + /* store open in pending opens to make sure we don't miss lease break */ + cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open); + + /* remove it from the lists */ + list_del(&cifs_file->flist); + list_del(&cifs_file->tlist); + + if (list_empty(&cifsi->openFileList)) { + cifs_dbg(FYI, "closing last open instance for inode %p\n", + d_inode(cifs_file->dentry)); + /* + * In strict cache mode we need invalidate mapping on the last + * close because it may cause a error when we open this file + * again and get at least level II oplock. + */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); + cifs_set_oplock_level(cifsi, 0); + } + spin_unlock(&cifs_file_list_lock); + + oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); + + if (!tcon->need_reconnect && !cifs_file->invalidHandle) { + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int xid; + + xid = get_xid(); + if (server->ops->close) + server->ops->close(xid, tcon, &cifs_file->fid); + _free_xid(xid); + } + + if (oplock_break_cancelled) + cifs_done_oplock_break(cifsi); + + cifs_del_pending_open(&open); + + /* + * Delete any outstanding lock records. We'll lose them when the file + * is closed anyway. + */ + down_write(&cifsi->lock_sem); + list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); + } + list_del(&cifs_file->llist->llist); + kfree(cifs_file->llist); + up_write(&cifsi->lock_sem); + + cifs_put_tlink(cifs_file->tlink); + dput(cifs_file->dentry); + cifs_sb_deactive(sb); + kfree(cifs_file); +} + +int cifs_open(struct inode *inode, struct file *file) + +{ + int rc = -EACCES; + unsigned int xid; + __u32 oplock; + struct cifs_sb_info *cifs_sb; + struct TCP_Server_Info *server; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + struct cifsFileInfo *cfile = NULL; + char *full_path = NULL; + bool posix_open_ok = false; + struct cifs_fid fid; + struct cifs_pending_open open; + + xid = get_xid(); + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + free_xid(xid); + return PTR_ERR(tlink); + } + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + full_path = build_path_from_dentry(file->f_path.dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", + inode, file->f_flags, full_path); + + if (file->f_flags & O_DIRECT && + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + file->f_op = &cifs_file_direct_nobrl_ops; + else + file->f_op = &cifs_file_direct_ops; + } + + if (server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + + if (!tcon->broken_posix_open && tcon->unix_ext && + cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + /* can not refresh inode info since size could be stale */ + rc = cifs_posix_open(full_path, &inode, inode->i_sb, + cifs_sb->mnt_file_mode /* ignored */, + file->f_flags, &oplock, &fid.netfid, xid); + if (rc == 0) { + cifs_dbg(FYI, "posix open succeeded\n"); + posix_open_ok = true; + } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + if (tcon->ses->serverNOS) + cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n", + tcon->ses->serverName, + tcon->ses->serverNOS); + tcon->broken_posix_open = true; + } else if ((rc != -EIO) && (rc != -EREMOTE) && + (rc != -EOPNOTSUPP)) /* path not found or net err */ + goto out; + /* + * Else fallthrough to retry open the old way on network i/o + * or DFS errors. + */ + } + + if (server->ops->get_lease_key) + server->ops->get_lease_key(inode, &fid); + + cifs_add_pending_open(&fid, tlink, &open); + + if (!posix_open_ok) { + if (server->ops->get_lease_key) + server->ops->get_lease_key(inode, &fid); + + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, + file->f_flags, &oplock, &fid, xid); + if (rc) { + cifs_del_pending_open(&open); + goto out; + } + } + + cfile = cifs_new_fileinfo(&fid, file, tlink, oplock); + if (cfile == NULL) { + if (server->ops->close) + server->ops->close(xid, tcon, &fid); + cifs_del_pending_open(&open); + rc = -ENOMEM; + goto out; + } + + cifs_fscache_set_inode_cookie(inode, file); + + if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { + /* + * Time to set mode which we can not set earlier due to + * problems creating new read-only files. + */ + struct cifs_unix_set_info_args args = { + .mode = inode->i_mode, + .uid = INVALID_UID, /* no change */ + .gid = INVALID_GID, /* no change */ + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid.netfid, + cfile->pid); + } + +out: + kfree(full_path); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +} + +static int cifs_push_posix_locks(struct cifsFileInfo *cfile); + +/* + * Try to reacquire byte range locks that were released when session + * to server was lost. + */ +static int +cifs_relock_file(struct cifsFileInfo *cfile) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + int rc = 0; + + down_read(&cinode->lock_sem); + if (cinode->can_cache_brlcks) { + /* can cache locks - no need to relock */ + up_read(&cinode->lock_sem); + return rc; + } + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + rc = cifs_push_posix_locks(cfile); + else + rc = tcon->ses->server->ops->push_mand_locks(cfile); + + up_read(&cinode->lock_sem); + return rc; +} + +static int +cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) +{ + int rc = -EACCES; + unsigned int xid; + __u32 oplock; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct cifsInodeInfo *cinode; + struct inode *inode; + char *full_path = NULL; + int desired_access; + int disposition = FILE_OPEN; + int create_options = CREATE_NOT_DIR; + struct cifs_open_parms oparms; + + xid = get_xid(); + mutex_lock(&cfile->fh_mutex); + if (!cfile->invalidHandle) { + mutex_unlock(&cfile->fh_mutex); + rc = 0; + free_xid(xid); + return rc; + } + + inode = d_inode(cfile->dentry); + cifs_sb = CIFS_SB(inode->i_sb); + tcon = tlink_tcon(cfile->tlink); + server = tcon->ses->server; + + /* + * Can not grab rename sem here because various ops, including those + * that already have the rename sem can end up causing writepage to get + * called and if the server was down that means we end up here, and we + * can never tell if the caller already has the rename_sem. + */ + full_path = build_path_from_dentry(cfile->dentry); + if (full_path == NULL) { + rc = -ENOMEM; + mutex_unlock(&cfile->fh_mutex); + free_xid(xid); + return rc; + } + + cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n", + inode, cfile->f_flags, full_path); + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + + if (tcon->unix_ext && cap_unix(tcon->ses) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + /* + * O_CREAT, O_EXCL and O_TRUNC already had their effect on the + * original open. Must mask them off for a reopen. + */ + unsigned int oflags = cfile->f_flags & + ~(O_CREAT | O_EXCL | O_TRUNC); + + rc = cifs_posix_open(full_path, NULL, inode->i_sb, + cifs_sb->mnt_file_mode /* ignored */, + oflags, &oplock, &cfile->fid.netfid, xid); + if (rc == 0) { + cifs_dbg(FYI, "posix reopen succeeded\n"); + oparms.reconnect = true; + goto reopen_success; + } + /* + * fallthrough to retry open the old way on errors, especially + * in the reconnect path it is important to retry hard + */ + } + + desired_access = cifs_convert_flags(cfile->f_flags); + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + if (server->ops->get_lease_key) + server->ops->get_lease_key(inode, &cfile->fid); + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = desired_access; + oparms.create_options = create_options; + oparms.disposition = disposition; + oparms.path = full_path; + oparms.fid = &cfile->fid; + oparms.reconnect = true; + + /* + * Can not refresh inode by passing in file_info buf to be returned by + * ops->open and then calling get_inode_info with returned buf since + * file might have write behind data that needs to be flushed and server + * version of file size can be stale. If we knew for sure that inode was + * not dirty locally we could do this. + */ + rc = server->ops->open(xid, &oparms, &oplock, NULL); + if (rc == -ENOENT && oparms.reconnect == false) { + /* durable handle timeout is expired - open the file again */ + rc = server->ops->open(xid, &oparms, &oplock, NULL); + /* indicate that we need to relock the file */ + oparms.reconnect = true; + } + + if (rc) { + mutex_unlock(&cfile->fh_mutex); + cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc); + cifs_dbg(FYI, "oplock: %d\n", oplock); + goto reopen_error_exit; + } + +reopen_success: + cfile->invalidHandle = false; + mutex_unlock(&cfile->fh_mutex); + cinode = CIFS_I(inode); + + if (can_flush) { + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, + inode->i_sb, xid); + else + rc = cifs_get_inode_info(&inode, full_path, NULL, + inode->i_sb, xid, NULL); + } + /* + * Else we are writing out data to server already and could deadlock if + * we tried to flush data, and since we do not know if we have data that + * would invalidate the current end of file on the server we can not go + * to the server to get the new inode info. + */ + + server->ops->set_fid(cfile, &cfile->fid, oplock); + if (oparms.reconnect) + cifs_relock_file(cfile); + +reopen_error_exit: + kfree(full_path); + free_xid(xid); + return rc; +} + +int cifs_close(struct inode *inode, struct file *file) +{ + if (file->private_data != NULL) { + cifsFileInfo_put(file->private_data); + file->private_data = NULL; + } + + /* return code from the ->release op is always ignored */ + return 0; +} + +int cifs_closedir(struct inode *inode, struct file *file) +{ + int rc = 0; + unsigned int xid; + struct cifsFileInfo *cfile = file->private_data; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + char *buf; + + cifs_dbg(FYI, "Closedir inode = 0x%p\n", inode); + + if (cfile == NULL) + return rc; + + xid = get_xid(); + tcon = tlink_tcon(cfile->tlink); + server = tcon->ses->server; + + cifs_dbg(FYI, "Freeing private data in close dir\n"); + spin_lock(&cifs_file_list_lock); + if (server->ops->dir_needs_close(cfile)) { + cfile->invalidHandle = true; + spin_unlock(&cifs_file_list_lock); + if (server->ops->close_dir) + rc = server->ops->close_dir(xid, tcon, &cfile->fid); + else + rc = -ENOSYS; + cifs_dbg(FYI, "Closing uncompleted readdir with rc %d\n", rc); + /* not much we can do if it fails anyway, ignore rc */ + rc = 0; + } else + spin_unlock(&cifs_file_list_lock); + + buf = cfile->srch_inf.ntwrk_buf_start; + if (buf) { + cifs_dbg(FYI, "closedir free smb buf in srch struct\n"); + cfile->srch_inf.ntwrk_buf_start = NULL; + if (cfile->srch_inf.smallBuf) + cifs_small_buf_release(buf); + else + cifs_buf_release(buf); + } + + cifs_put_tlink(cfile->tlink); + kfree(file->private_data); + file->private_data = NULL; + /* BB can we lock the filestruct while this is going on? */ + free_xid(xid); + return rc; +} + +static struct cifsLockInfo * +cifs_lock_init(__u64 offset, __u64 length, __u8 type) +{ + struct cifsLockInfo *lock = + kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); + if (!lock) + return lock; + lock->offset = offset; + lock->length = length; + lock->type = type; + lock->pid = current->tgid; + INIT_LIST_HEAD(&lock->blist); + init_waitqueue_head(&lock->block_q); + return lock; +} + +void +cifs_del_lock_waiters(struct cifsLockInfo *lock) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, &lock->blist, blist) { + list_del_init(&li->blist); + wake_up(&li->block_q); + } +} + +#define CIFS_LOCK_OP 0 +#define CIFS_READ_OP 1 +#define CIFS_WRITE_OP 2 + +/* @rw_check : 0 - no op, 1 - read, 2 - write */ +static bool +cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, + __u64 length, __u8 type, struct cifsFileInfo *cfile, + struct cifsLockInfo **conf_lock, int rw_check) +{ + struct cifsLockInfo *li; + struct cifsFileInfo *cur_cfile = fdlocks->cfile; + struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + + list_for_each_entry(li, &fdlocks->locks, llist) { + if (offset + length <= li->offset || + offset >= li->offset + li->length) + continue; + if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid && + server->ops->compare_fids(cfile, cur_cfile)) { + /* shared lock prevents write op through the same fid */ + if (!(li->type & server->vals->shared_lock_type) || + rw_check != CIFS_WRITE_OP) + continue; + } + if ((type & server->vals->shared_lock_type) && + ((server->ops->compare_fids(cfile, cur_cfile) && + current->tgid == li->pid) || type == li->type)) + continue; + if (conf_lock) + *conf_lock = li; + return true; + } + return false; +} + +bool +cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, + __u8 type, struct cifsLockInfo **conf_lock, + int rw_check) +{ + bool rc = false; + struct cifs_fid_locks *cur; + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + + list_for_each_entry(cur, &cinode->llist, llist) { + rc = cifs_find_fid_lock_conflict(cur, offset, length, type, + cfile, conf_lock, rw_check); + if (rc) + break; + } + + return rc; +} + +/* + * Check if there is another lock that prevents us to set the lock (mandatory + * style). If such a lock exists, update the flock structure with its + * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks + * or leave it the same if we can't. Returns 0 if we don't need to request to + * the server or 1 otherwise. + */ +static int +cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, + __u8 type, struct file_lock *flock) +{ + int rc = 0; + struct cifsLockInfo *conf_lock; + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + bool exist; + + down_read(&cinode->lock_sem); + + exist = cifs_find_lock_conflict(cfile, offset, length, type, + &conf_lock, CIFS_LOCK_OP); + if (exist) { + flock->fl_start = conf_lock->offset; + flock->fl_end = conf_lock->offset + conf_lock->length - 1; + flock->fl_pid = conf_lock->pid; + if (conf_lock->type & server->vals->shared_lock_type) + flock->fl_type = F_RDLCK; + else + flock->fl_type = F_WRLCK; + } else if (!cinode->can_cache_brlcks) + rc = 1; + else + flock->fl_type = F_UNLCK; + + up_read(&cinode->lock_sem); + return rc; +} + +static void +cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) +{ + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + down_write(&cinode->lock_sem); + list_add_tail(&lock->llist, &cfile->llist->locks); + up_write(&cinode->lock_sem); +} + +/* + * Set the byte-range lock (mandatory style). Returns: + * 1) 0, if we set the lock and don't need to request to the server; + * 2) 1, if no locks prevent us but we need to request to the server; + * 3) -EACCESS, if there is a lock that prevents us and wait is false. + */ +static int +cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, + bool wait) +{ + struct cifsLockInfo *conf_lock; + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + bool exist; + int rc = 0; + +try_again: + exist = false; + down_write(&cinode->lock_sem); + + exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, + lock->type, &conf_lock, CIFS_LOCK_OP); + if (!exist && cinode->can_cache_brlcks) { + list_add_tail(&lock->llist, &cfile->llist->locks); + up_write(&cinode->lock_sem); + return rc; + } + + if (!exist) + rc = 1; + else if (!wait) + rc = -EACCES; + else { + list_add_tail(&lock->blist, &conf_lock->blist); + up_write(&cinode->lock_sem); + rc = wait_event_interruptible(lock->block_q, + (lock->blist.prev == &lock->blist) && + (lock->blist.next == &lock->blist)); + if (!rc) + goto try_again; + down_write(&cinode->lock_sem); + list_del_init(&lock->blist); + } + + up_write(&cinode->lock_sem); + return rc; +} + +/* + * Check if there is another lock that prevents us to set the lock (posix + * style). If such a lock exists, update the flock structure with its + * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks + * or leave it the same if we can't. Returns 0 if we don't need to request to + * the server or 1 otherwise. + */ +static int +cifs_posix_lock_test(struct file *file, struct file_lock *flock) +{ + int rc = 0; + struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); + unsigned char saved_type = flock->fl_type; + + if ((flock->fl_flags & FL_POSIX) == 0) + return 1; + + down_read(&cinode->lock_sem); + posix_test_lock(file, flock); + + if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { + flock->fl_type = saved_type; + rc = 1; + } + + up_read(&cinode->lock_sem); + return rc; +} + +/* + * Set the byte-range lock (posix style). Returns: + * 1) 0, if we set the lock and don't need to request to the server; + * 2) 1, if we need to request to the server; + * 3) <0, if the error occurs while setting the lock. + */ +static int +cifs_posix_lock_set(struct file *file, struct file_lock *flock) +{ + struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); + int rc = 1; + + if ((flock->fl_flags & FL_POSIX) == 0) + return rc; + +try_again: + down_write(&cinode->lock_sem); + if (!cinode->can_cache_brlcks) { + up_write(&cinode->lock_sem); + return rc; + } + + rc = posix_lock_file(file, flock, NULL); + up_write(&cinode->lock_sem); + if (rc == FILE_LOCK_DEFERRED) { + rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next); + if (!rc) + goto try_again; + posix_unblock_lock(flock); + } + return rc; +} + +int +cifs_push_mandatory_locks(struct cifsFileInfo *cfile) +{ + unsigned int xid; + int rc = 0, stored_rc; + struct cifsLockInfo *li, *tmp; + struct cifs_tcon *tcon; + unsigned int num, max_num, max_buf; + LOCKING_ANDX_RANGE *buf, *cur; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + int i; + + xid = get_xid(); + tcon = tlink_tcon(cfile->tlink); + + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. + */ + max_buf = tcon->ses->server->maxBuf; + if (!max_buf) { + free_xid(xid); + return -EINVAL; + } + + max_num = (max_buf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) { + free_xid(xid); + return -ENOMEM; + } + + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { + if (li->type != types[i]) + continue; + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, + cfile->fid.netfid, + (__u8)li->type, 0, num, + buf); + if (stored_rc) + rc = stored_rc; + cur = buf; + num = 0; + } else + cur++; + } + + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid, + (__u8)types[i], 0, num, buf); + if (stored_rc) + rc = stored_rc; + } + } + + kfree(buf); + free_xid(xid); + return rc; +} + +struct lock_to_push { + struct list_head llist; + __u64 offset; + __u64 length; + __u32 pid; + __u16 netfid; + __u8 type; +}; + +static int +cifs_push_posix_locks(struct cifsFileInfo *cfile) +{ + struct inode *inode = d_inode(cfile->dentry); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct file_lock *flock; + struct file_lock_context *flctx = inode->i_flctx; + unsigned int count = 0, i; + int rc = 0, xid, type; + struct list_head locks_to_send, *el; + struct lock_to_push *lck, *tmp; + __u64 length; + + xid = get_xid(); + + if (!flctx) + goto out; + + spin_lock(&flctx->flc_lock); + list_for_each(el, &flctx->flc_posix) { + count++; + } + spin_unlock(&flctx->flc_lock); + + INIT_LIST_HEAD(&locks_to_send); + + /* + * Allocating count locks is enough because no FL_POSIX locks can be + * added to the list while we are holding cinode->lock_sem that + * protects locking operations of this inode. + */ + for (i = 0; i < count; i++) { + lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); + if (!lck) { + rc = -ENOMEM; + goto err_out; + } + list_add_tail(&lck->llist, &locks_to_send); + } + + el = locks_to_send.next; + spin_lock(&flctx->flc_lock); + list_for_each_entry(flock, &flctx->flc_posix, fl_list) { + if (el == &locks_to_send) { + /* + * The list ended. We don't have enough allocated + * structures - something is really wrong. + */ + cifs_dbg(VFS, "Can't push all brlocks!\n"); + break; + } + length = 1 + flock->fl_end - flock->fl_start; + if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) + type = CIFS_RDLCK; + else + type = CIFS_WRLCK; + lck = list_entry(el, struct lock_to_push, llist); + lck->pid = flock->fl_pid; + lck->netfid = cfile->fid.netfid; + lck->length = length; + lck->type = type; + lck->offset = flock->fl_start; + } + spin_unlock(&flctx->flc_lock); + + list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { + int stored_rc; + + stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid, + lck->offset, lck->length, NULL, + lck->type, 0); + if (stored_rc) + rc = stored_rc; + list_del(&lck->llist); + kfree(lck); + } + +out: + free_xid(xid); + return rc; +err_out: + list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { + list_del(&lck->llist); + kfree(lck); + } + goto out; +} + +static int +cifs_push_locks(struct cifsFileInfo *cfile) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + int rc = 0; + + /* we are going to update can_cache_brlcks here - need a write access */ + down_write(&cinode->lock_sem); + if (!cinode->can_cache_brlcks) { + up_write(&cinode->lock_sem); + return rc; + } + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + rc = cifs_push_posix_locks(cfile); + else + rc = tcon->ses->server->ops->push_mand_locks(cfile); + + cinode->can_cache_brlcks = false; + up_write(&cinode->lock_sem); + return rc; +} + +static void +cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, + bool *wait_flag, struct TCP_Server_Info *server) +{ + if (flock->fl_flags & FL_POSIX) + cifs_dbg(FYI, "Posix\n"); + if (flock->fl_flags & FL_FLOCK) + cifs_dbg(FYI, "Flock\n"); + if (flock->fl_flags & FL_SLEEP) { + cifs_dbg(FYI, "Blocking lock\n"); + *wait_flag = true; + } + if (flock->fl_flags & FL_ACCESS) + cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n"); + if (flock->fl_flags & FL_LEASE) + cifs_dbg(FYI, "Lease on file - not implemented yet\n"); + if (flock->fl_flags & + (~(FL_POSIX | FL_FLOCK | FL_SLEEP | + FL_ACCESS | FL_LEASE | FL_CLOSE))) + cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags); + + *type = server->vals->large_lock_type; + if (flock->fl_type == F_WRLCK) { + cifs_dbg(FYI, "F_WRLCK\n"); + *type |= server->vals->exclusive_lock_type; + *lock = 1; + } else if (flock->fl_type == F_UNLCK) { + cifs_dbg(FYI, "F_UNLCK\n"); + *type |= server->vals->unlock_lock_type; + *unlock = 1; + /* Check if unlock includes more than one lock range */ + } else if (flock->fl_type == F_RDLCK) { + cifs_dbg(FYI, "F_RDLCK\n"); + *type |= server->vals->shared_lock_type; + *lock = 1; + } else if (flock->fl_type == F_EXLCK) { + cifs_dbg(FYI, "F_EXLCK\n"); + *type |= server->vals->exclusive_lock_type; + *lock = 1; + } else if (flock->fl_type == F_SHLCK) { + cifs_dbg(FYI, "F_SHLCK\n"); + *type |= server->vals->shared_lock_type; + *lock = 1; + } else + cifs_dbg(FYI, "Unknown type of lock\n"); +} + +static int +cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, + bool wait_flag, bool posix_lck, unsigned int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + __u16 netfid = cfile->fid.netfid; + + if (posix_lck) { + int posix_lock_type; + + rc = cifs_posix_lock_test(file, flock); + if (!rc) + return rc; + + if (type & server->vals->shared_lock_type) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + flock->fl_start, length, flock, + posix_lock_type, wait_flag); + return rc; + } + + rc = cifs_lock_test(cfile, flock->fl_start, length, type, flock); + if (!rc) + return rc; + + /* BB we could chain these into one lock request BB */ + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, + 1, 0, false); + if (rc == 0) { + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, + type, 0, 1, false); + flock->fl_type = F_UNLCK; + if (rc != 0) + cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", + rc); + return 0; + } + + if (type & server->vals->shared_lock_type) { + flock->fl_type = F_WRLCK; + return 0; + } + + type &= ~server->vals->exclusive_lock_type; + + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, + type | server->vals->shared_lock_type, + 1, 0, false); + if (rc == 0) { + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, + type | server->vals->shared_lock_type, 0, 1, false); + flock->fl_type = F_RDLCK; + if (rc != 0) + cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", + rc); + } else + flock->fl_type = F_WRLCK; + + return 0; +} + +void +cifs_move_llist(struct list_head *source, struct list_head *dest) +{ + struct list_head *li, *tmp; + list_for_each_safe(li, tmp, source) + list_move(li, dest); +} + +void +cifs_free_llist(struct list_head *llist) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, llist, llist) { + cifs_del_lock_waiters(li); + list_del(&li->llist); + kfree(li); + } +} + +int +cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, + unsigned int xid) +{ + int rc = 0, stored_rc; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + unsigned int i; + unsigned int max_num, num, max_buf; + LOCKING_ANDX_RANGE *buf, *cur; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + struct cifsLockInfo *li, *tmp; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct list_head tmp_llist; + + INIT_LIST_HEAD(&tmp_llist); + + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. + */ + max_buf = tcon->ses->server->maxBuf; + if (!max_buf) + return -EINVAL; + + max_num = (max_buf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + down_write(&cinode->lock_sem); + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + if (types[i] != li->type) + continue; + if (cinode->can_cache_brlcks) { + /* + * We can cache brlock requests - simply remove + * a lock from the file's list. + */ + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); + continue; + } + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + /* + * We need to save a lock here to let us add it again to + * the file's list if the unlock range request fails on + * the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, + cfile->fid.netfid, + li->type, num, 0, buf); + if (stored_rc) { + /* + * We failed on the unlock range + * request - add all locks from the tmp + * list to the head of the file's list. + */ + cifs_move_llist(&tmp_llist, + &cfile->llist->locks); + rc = stored_rc; + } else + /* + * The unlock range request succeed - + * free the tmp list. + */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; + } + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid, + types[i], num, 0, buf); + if (stored_rc) { + cifs_move_llist(&tmp_llist, + &cfile->llist->locks); + rc = stored_rc; + } else + cifs_free_llist(&tmp_llist); + } + } + + up_write(&cinode->lock_sem); + kfree(buf); + return rc; +} + +static int +cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, + bool wait_flag, bool posix_lck, int lock, int unlock, + unsigned int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + struct inode *inode = d_inode(cfile->dentry); + + if (posix_lck) { + int posix_lock_type; + + rc = cifs_posix_lock_set(file, flock); + if (!rc || rc < 0) + return rc; + + if (type & server->vals->shared_lock_type) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + + if (unlock == 1) + posix_lock_type = CIFS_UNLCK; + + rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid, + current->tgid, flock->fl_start, length, + NULL, posix_lock_type, wait_flag); + goto out; + } + + if (lock) { + struct cifsLockInfo *lock; + + lock = cifs_lock_init(flock->fl_start, length, type); + if (!lock) + return -ENOMEM; + + rc = cifs_lock_add_if(cfile, lock, wait_flag); + if (rc < 0) { + kfree(lock); + return rc; + } + if (!rc) + goto out; + + /* + * Windows 7 server can delay breaking lease from read to None + * if we set a byte-range lock on a file - break it explicitly + * before sending the lock to the server to be sure the next + * read won't conflict with non-overlapted locks due to + * pagereading. + */ + if (!CIFS_CACHE_WRITE(CIFS_I(inode)) && + CIFS_CACHE_READ(CIFS_I(inode))) { + cifs_zap_mapping(inode); + cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n", + inode); + CIFS_I(inode)->oplock = 0; + } + + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, + type, 1, 0, wait_flag); + if (rc) { + kfree(lock); + return rc; + } + + cifs_lock_add(cfile, lock); + } else if (unlock) + rc = server->ops->mand_unlock_range(cfile, flock, xid); + +out: + if (flock->fl_flags & FL_POSIX && !rc) + rc = posix_lock_file_wait(file, flock); + return rc; +} + +int cifs_lock(struct file *file, int cmd, struct file_lock *flock) +{ + int rc, xid; + int lock = 0, unlock = 0; + bool wait_flag = false; + bool posix_lck = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + __u16 netfid; + __u32 type; + + rc = -EACCES; + xid = get_xid(); + + cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n", + cmd, flock->fl_flags, flock->fl_type, + flock->fl_start, flock->fl_end); + + cfile = (struct cifsFileInfo *)file->private_data; + tcon = tlink_tcon(cfile->tlink); + + cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag, + tcon->ses->server); + + cifs_sb = CIFS_FILE_SB(file); + netfid = cfile->fid.netfid; + cinode = CIFS_I(file_inode(file)); + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_lck = true; + /* + * BB add code here to normalize offset and length to account for + * negative length which we can not accept over the wire. + */ + if (IS_GETLK(cmd)) { + rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid); + free_xid(xid); + return rc; + } + + if (!lock && !unlock) { + /* + * if no lock or unlock then nothing to do since we do not + * know what it is + */ + free_xid(xid); + return -EOPNOTSUPP; + } + + rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, + xid); + free_xid(xid); + return rc; +} + +/* + * update the file size (if needed) after a write. Should be called with + * the inode->i_lock held + */ +void +cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written) +{ + loff_t end_of_write = offset + bytes_written; + + if (end_of_write > cifsi->server_eof) + cifsi->server_eof = end_of_write; +} + +static ssize_t +cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, + size_t write_size, loff_t *offset) +{ + int rc = 0; + unsigned int bytes_written = 0; + unsigned int total_written; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + unsigned int xid; + struct dentry *dentry = open_file->dentry; + struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry)); + struct cifs_io_parms io_parms; + + cifs_sb = CIFS_SB(dentry->d_sb); + + cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n", + write_size, *offset, dentry); + + tcon = tlink_tcon(open_file->tlink); + server = tcon->ses->server; + + if (!server->ops->sync_write) + return -ENOSYS; + + xid = get_xid(); + + for (total_written = 0; write_size > total_written; + total_written += bytes_written) { + rc = -EAGAIN; + while (rc == -EAGAIN) { + struct kvec iov[2]; + unsigned int len; + + if (open_file->invalidHandle) { + /* we could deadlock if we called + filemap_fdatawait from here so tell + reopen_file not to flush data to + server now */ + rc = cifs_reopen_file(open_file, false); + if (rc != 0) + break; + } + + len = min(server->ops->wp_retry_size(d_inode(dentry)), + (unsigned int)write_size - total_written); + /* iov[0] is reserved for smb header */ + iov[1].iov_base = (char *)write_data + total_written; + iov[1].iov_len = len; + io_parms.pid = pid; + io_parms.tcon = tcon; + io_parms.offset = *offset; + io_parms.length = len; + rc = server->ops->sync_write(xid, &open_file->fid, + &io_parms, &bytes_written, iov, 1); + } + if (rc || (bytes_written == 0)) { + if (total_written) + break; + else { + free_xid(xid); + return rc; + } + } else { + spin_lock(&d_inode(dentry)->i_lock); + cifs_update_eof(cifsi, *offset, bytes_written); + spin_unlock(&d_inode(dentry)->i_lock); + *offset += bytes_written; + } + } + + cifs_stats_bytes_written(tcon, total_written); + + if (total_written > 0) { + spin_lock(&d_inode(dentry)->i_lock); + if (*offset > d_inode(dentry)->i_size) + i_size_write(d_inode(dentry), *offset); + spin_unlock(&d_inode(dentry)->i_lock); + } + mark_inode_dirty_sync(d_inode(dentry)); + free_xid(xid); + return total_written; +} + +struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, + bool fsuid_only) +{ + struct cifsFileInfo *open_file = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + + /* only filter by fsuid on multiuser mounts */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + fsuid_only = false; + + spin_lock(&cifs_file_list_lock); + /* we could simply get the first_list_entry since write-only entries + are always at the end of the list but since the first entry might + have a close pending, we go through the whole list */ + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) + continue; + if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { + if (!open_file->invalidHandle) { + /* found a good file */ + /* lock it so it will not be closed on us */ + cifsFileInfo_get_locked(open_file); + spin_unlock(&cifs_file_list_lock); + return open_file; + } /* else might as well continue, and look for + another, or simply have the caller reopen it + again rather than trying to fix this handle */ + } else /* write only file */ + break; /* write only files are last so must be done */ + } + spin_unlock(&cifs_file_list_lock); + return NULL; +} + +struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, + bool fsuid_only) +{ + struct cifsFileInfo *open_file, *inv_file = NULL; + struct cifs_sb_info *cifs_sb; + bool any_available = false; + int rc; + unsigned int refind = 0; + + /* Having a null inode here (because mapping->host was set to zero by + the VFS or MM) should not happen but we had reports of on oops (due to + it being zero) during stress testcases so we need to check for it */ + + if (cifs_inode == NULL) { + cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n"); + dump_stack(); + return NULL; + } + + cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + + /* only filter by fsuid on multiuser mounts */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + fsuid_only = false; + + spin_lock(&cifs_file_list_lock); +refind_writable: + if (refind > MAX_REOPEN_ATT) { + spin_unlock(&cifs_file_list_lock); + return NULL; + } + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (!any_available && open_file->pid != current->tgid) + continue; + if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) + continue; + if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { + if (!open_file->invalidHandle) { + /* found a good writable file */ + cifsFileInfo_get_locked(open_file); + spin_unlock(&cifs_file_list_lock); + return open_file; + } else { + if (!inv_file) + inv_file = open_file; + } + } + } + /* couldn't find useable FH with same pid, try any available */ + if (!any_available) { + any_available = true; + goto refind_writable; + } + + if (inv_file) { + any_available = false; + cifsFileInfo_get_locked(inv_file); + } + + spin_unlock(&cifs_file_list_lock); + + if (inv_file) { + rc = cifs_reopen_file(inv_file, false); + if (!rc) + return inv_file; + else { + spin_lock(&cifs_file_list_lock); + list_move_tail(&inv_file->flist, + &cifs_inode->openFileList); + spin_unlock(&cifs_file_list_lock); + cifsFileInfo_put(inv_file); + spin_lock(&cifs_file_list_lock); + ++refind; + inv_file = NULL; + goto refind_writable; + } + } + + return NULL; +} + +static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) +{ + struct address_space *mapping = page->mapping; + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + char *write_data; + int rc = -EFAULT; + int bytes_written = 0; + struct inode *inode; + struct cifsFileInfo *open_file; + + if (!mapping || !mapping->host) + return -EFAULT; + + inode = page->mapping->host; + + offset += (loff_t)from; + write_data = kmap(page); + write_data += from; + + if ((to > PAGE_CACHE_SIZE) || (from > to)) { + kunmap(page); + return -EIO; + } + + /* racing with truncate? */ + if (offset > mapping->host->i_size) { + kunmap(page); + return 0; /* don't care */ + } + + /* check to make sure that we are not extending the file */ + if (mapping->host->i_size - offset < (loff_t)to) + to = (unsigned)(mapping->host->i_size - offset); + + open_file = find_writable_file(CIFS_I(mapping->host), false); + if (open_file) { + bytes_written = cifs_write(open_file, open_file->pid, + write_data, to - from, &offset); + cifsFileInfo_put(open_file); + /* Does mm or vfs already set times? */ + inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); + if ((bytes_written > 0) && (offset)) + rc = 0; + else if (bytes_written < 0) + rc = bytes_written; + } else { + cifs_dbg(FYI, "No writeable filehandles for inode\n"); + rc = -EIO; + } + + kunmap(page); + return rc; +} + +static struct cifs_writedata * +wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, + pgoff_t end, pgoff_t *index, + unsigned int *found_pages) +{ + unsigned int nr_pages; + struct page **pages; + struct cifs_writedata *wdata; + + wdata = cifs_writedata_alloc((unsigned int)tofind, + cifs_writev_complete); + if (!wdata) + return NULL; + + /* + * find_get_pages_tag seems to return a max of 256 on each + * iteration, so we must call it several times in order to + * fill the array or the wsize is effectively limited to + * 256 * PAGE_CACHE_SIZE. + */ + *found_pages = 0; + pages = wdata->pages; + do { + nr_pages = find_get_pages_tag(mapping, index, + PAGECACHE_TAG_DIRTY, tofind, + pages); + *found_pages += nr_pages; + tofind -= nr_pages; + pages += nr_pages; + } while (nr_pages && tofind && *index <= end); + + return wdata; +} + +static unsigned int +wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, + struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) +{ + unsigned int nr_pages = 0, i; + struct page *page; + + for (i = 0; i < found_pages; i++) { + page = wdata->pages[i]; + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + if (nr_pages == 0) + lock_page(page); + else if (!trylock_page(page)) + break; + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + break; + } + + if (!wbc->range_cyclic && page->index > end) { + *done = true; + unlock_page(page); + break; + } + + if (*next && (page->index != *next)) { + /* Not next consecutive page */ + unlock_page(page); + break; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + break; + } + + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. + */ + set_page_writeback(page); + if (page_offset(page) >= i_size_read(mapping->host)) { + *done = true; + unlock_page(page); + end_page_writeback(page); + break; + } + + wdata->pages[i] = page; + *next = page->index + 1; + ++nr_pages; + } + + /* reset index to refind any pages skipped */ + if (nr_pages == 0) + *index = wdata->pages[0]->index + 1; + + /* put any pages we aren't going to use */ + for (i = nr_pages; i < found_pages; i++) { + page_cache_release(wdata->pages[i]); + wdata->pages[i] = NULL; + } + + return nr_pages; +} + +static int +wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, + struct address_space *mapping, struct writeback_control *wbc) +{ + int rc = 0; + struct TCP_Server_Info *server; + unsigned int i; + + wdata->sync_mode = wbc->sync_mode; + wdata->nr_pages = nr_pages; + wdata->offset = page_offset(wdata->pages[0]); + wdata->pagesz = PAGE_CACHE_SIZE; + wdata->tailsz = min(i_size_read(mapping->host) - + page_offset(wdata->pages[nr_pages - 1]), + (loff_t)PAGE_CACHE_SIZE); + wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; + + if (wdata->cfile != NULL) + cifsFileInfo_put(wdata->cfile); + wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); + if (!wdata->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + } else { + wdata->pid = wdata->cfile->pid; + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + rc = server->ops->async_writev(wdata, cifs_writedata_release); + } + + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); + + return rc; +} + +static int cifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + struct TCP_Server_Info *server; + bool done = false, scanned = false, range_whole = false; + pgoff_t end, index; + struct cifs_writedata *wdata; + int rc = 0; + + /* + * If wsize is smaller than the page cache size, default to writing + * one page at a time via cifs_writepage + */ + if (cifs_sb->wsize < PAGE_CACHE_SIZE) + return generic_writepages(mapping, wbc); + + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = true; + scanned = true; + } + server = cifs_sb_master_tcon(cifs_sb)->ses->server; +retry: + while (!done && index <= end) { + unsigned int i, nr_pages, found_pages, wsize, credits; + pgoff_t next = 0, tofind, saved_index = index; + + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; + + tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1; + + wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, + &found_pages); + if (!wdata) { + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; + } + + if (found_pages == 0) { + kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); + break; + } + + nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, + end, &index, &next, &done); + + /* nothing to write? */ + if (nr_pages == 0) { + kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); + continue; + } + + wdata->credits = credits; + + rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); + + /* send failure -- clean up the mess */ + if (rc != 0) { + add_credits_and_wake_if(server, wdata->credits, 0); + for (i = 0; i < nr_pages; ++i) { + if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, + wdata->pages[i]); + else + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + page_cache_release(wdata->pages[i]); + } + if (rc != -EAGAIN) + mapping_set_error(mapping, rc); + } + kref_put(&wdata->refcount, cifs_writedata_release); + + if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { + index = saved_index; + continue; + } + + wbc->nr_to_write -= nr_pages; + if (wbc->nr_to_write <= 0) + done = true; + + index = next; + } + + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = true; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + + return rc; +} + +static int +cifs_writepage_locked(struct page *page, struct writeback_control *wbc) +{ + int rc; + unsigned int xid; + + xid = get_xid(); +/* BB add check for wbc flags */ + page_cache_get(page); + if (!PageUptodate(page)) + cifs_dbg(FYI, "ppw - page not up to date\n"); + + /* + * Set the "writeback" flag, and clear "dirty" in the radix tree. + * + * A writepage() implementation always needs to do either this, + * or re-dirty the page with "redirty_page_for_writepage()" in + * the case of a failure. + * + * Just unlocking the page will cause the radix tree tag-bits + * to fail to update with the state of the page correctly. + */ + set_page_writeback(page); +retry_write: + rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE); + if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL) + goto retry_write; + else if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, page); + else if (rc != 0) + SetPageError(page); + else + SetPageUptodate(page); + end_page_writeback(page); + page_cache_release(page); + free_xid(xid); + return rc; +} + +static int cifs_writepage(struct page *page, struct writeback_control *wbc) +{ + int rc = cifs_writepage_locked(page, wbc); + unlock_page(page); + return rc; +} + +static int cifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int rc; + struct inode *inode = mapping->host; + struct cifsFileInfo *cfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + __u32 pid; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = cfile->pid; + else + pid = current->tgid; + + cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n", + page, pos, copied); + + if (PageChecked(page)) { + if (copied == len) + SetPageUptodate(page); + ClearPageChecked(page); + } else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (!PageUptodate(page)) { + char *page_data; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int xid; + + xid = get_xid(); + /* this is probably better than directly calling + partialpage_write since in this function the file handle is + known which we might as well leverage */ + /* BB check if anything else missing out of ppw + such as updating last write time */ + page_data = kmap(page); + rc = cifs_write(cfile, pid, page_data + offset, copied, &pos); + /* if (rc < 0) should we set writebehind rc? */ + kunmap(page); + + free_xid(xid); + } else { + rc = copied; + pos += copied; + set_page_dirty(page); + } + + if (rc > 0) { + spin_lock(&inode->i_lock); + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + unlock_page(page); + page_cache_release(page); + + return rc; +} + +int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + unsigned int xid; + int rc = 0; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct cifsFileInfo *smbfile = file->private_data; + struct inode *inode = file_inode(file); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (rc) + return rc; + mutex_lock(&inode->i_mutex); + + xid = get_xid(); + + cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n", + file, datasync); + + if (!CIFS_CACHE_READ(CIFS_I(inode))) { + rc = cifs_zap_mapping(inode); + if (rc) { + cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc); + rc = 0; /* don't care about it in fsync */ + } + } + + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { + server = tcon->ses->server; + if (server->ops->flush) + rc = server->ops->flush(xid, tcon, &smbfile->fid); + else + rc = -ENOSYS; + } + + free_xid(xid); + mutex_unlock(&inode->i_mutex); + return rc; +} + +int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + unsigned int xid; + int rc = 0; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct cifsFileInfo *smbfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct inode *inode = file->f_mapping->host; + + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (rc) + return rc; + mutex_lock(&inode->i_mutex); + + xid = get_xid(); + + cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n", + file, datasync); + + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { + server = tcon->ses->server; + if (server->ops->flush) + rc = server->ops->flush(xid, tcon, &smbfile->fid); + else + rc = -ENOSYS; + } + + free_xid(xid); + mutex_unlock(&inode->i_mutex); + return rc; +} + +/* + * As file closes, flush all cached write data for this inode checking + * for write behind errors. + */ +int cifs_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + int rc = 0; + + if (file->f_mode & FMODE_WRITE) + rc = filemap_write_and_wait(inode->i_mapping); + + cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc); + + return rc; +} + +static int +cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) +{ + int rc = 0; + unsigned long i; + + for (i = 0; i < num_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!pages[i]) { + /* + * save number of pages we have already allocated and + * return with ENOMEM error + */ + num_pages = i; + rc = -ENOMEM; + break; + } + } + + if (rc) { + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + } + return rc; +} + +static inline +size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) +{ + size_t num_pages; + size_t clen; + + clen = min_t(const size_t, len, wsize); + num_pages = DIV_ROUND_UP(clen, PAGE_SIZE); + + if (cur_len) + *cur_len = clen; + + return num_pages; +} + +static void +cifs_uncached_writedata_release(struct kref *refcount) +{ + int i; + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); + + for (i = 0; i < wdata->nr_pages; i++) + put_page(wdata->pages[i]); + cifs_writedata_release(refcount); +} + +static void +cifs_uncached_writev_complete(struct work_struct *work) +{ + struct cifs_writedata *wdata = container_of(work, + struct cifs_writedata, work); + struct inode *inode = d_inode(wdata->cfile->dentry); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + spin_lock(&inode->i_lock); + cifs_update_eof(cifsi, wdata->offset, wdata->bytes); + if (cifsi->server_eof > inode->i_size) + i_size_write(inode, cifsi->server_eof); + spin_unlock(&inode->i_lock); + + complete(&wdata->done); + + kref_put(&wdata->refcount, cifs_uncached_writedata_release); +} + +static int +wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, + size_t *len, unsigned long *num_pages) +{ + size_t save_len, copied, bytes, cur_len = *len; + unsigned long i, nr_pages = *num_pages; + + save_len = cur_len; + for (i = 0; i < nr_pages; i++) { + bytes = min_t(const size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); + cur_len -= copied; + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; + } + cur_len = save_len - cur_len; + *len = cur_len; + + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Return -EFAULT and let + * the caller free anything we allocated and bail out. + */ + if (!cur_len) + return -EFAULT; + + /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. + */ + *num_pages = i + 1; + return 0; +} + +static int +cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, + struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) +{ + int rc = 0; + size_t cur_len; + unsigned long nr_pages, num_pages, i; + struct cifs_writedata *wdata; + struct iov_iter saved_from; + loff_t saved_offset = offset; + pid_t pid; + struct TCP_Server_Info *server; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + server = tlink_tcon(open_file->tlink)->ses->server; + memcpy(&saved_from, from, sizeof(struct iov_iter)); + + do { + unsigned int wsize, credits; + + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; + + nr_pages = get_numpages(wsize, len, &cur_len); + wdata = cifs_writedata_alloc(nr_pages, + cifs_uncached_writev_complete); + if (!wdata) { + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; + } + + rc = cifs_write_allocate_pages(wdata->pages, nr_pages); + if (rc) { + kfree(wdata); + add_credits_and_wake_if(server, credits, 0); + break; + } + + num_pages = nr_pages; + rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages); + if (rc) { + for (i = 0; i < nr_pages; i++) + put_page(wdata->pages[i]); + kfree(wdata); + add_credits_and_wake_if(server, credits, 0); + break; + } + + /* + * Bring nr_pages down to the number of pages we actually used, + * and free any pages that we didn't use. + */ + for ( ; nr_pages > num_pages; nr_pages--) + put_page(wdata->pages[nr_pages - 1]); + + wdata->sync_mode = WB_SYNC_ALL; + wdata->nr_pages = nr_pages; + wdata->offset = (__u64)offset; + wdata->cfile = cifsFileInfo_get(open_file); + wdata->pid = pid; + wdata->bytes = cur_len; + wdata->pagesz = PAGE_SIZE; + wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); + wdata->credits = credits; + + if (!wdata->cfile->invalidHandle || + !cifs_reopen_file(wdata->cfile, false)) + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); + if (rc) { + add_credits_and_wake_if(server, wdata->credits, 0); + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); + if (rc == -EAGAIN) { + memcpy(from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(from, offset - saved_offset); + continue; + } + break; + } + + list_add_tail(&wdata->list, wdata_list); + offset += cur_len; + len -= cur_len; + } while (len > 0); + + return rc; +} + +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + ssize_t total_written = 0; + struct cifsFileInfo *open_file; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_writedata *wdata, *tmp; + struct list_head wdata_list; + struct iov_iter saved_from; + int rc; + + /* + * BB - optimize the way when signing is disabled. We can drop this + * extra memory-to-memory copying and use iovec buffers for constructing + * write request. + */ + + rc = generic_write_checks(iocb, from); + if (rc <= 0) + return rc; + + INIT_LIST_HEAD(&wdata_list); + cifs_sb = CIFS_FILE_SB(file); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + memcpy(&saved_from, from, sizeof(struct iov_iter)); + + rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from, + open_file, cifs_sb, &wdata_list); + + /* + * If at least one write was successfully sent, then discard any rc + * value from the later writes. If the other write succeeds, then + * we'll end up returning whatever was written. If it fails, then + * we'll get a new rc value from that. + */ + if (!list_empty(&wdata_list)) + rc = 0; + + /* + * Wait for and collect replies for any successful sends in order of + * increasing offset. Once an error is hit or we get a fatal signal + * while waiting, then return without waiting for any more replies. + */ +restart_loop: + list_for_each_entry_safe(wdata, tmp, &wdata_list, list) { + if (!rc) { + /* FIXME: freezable too? */ + rc = wait_for_completion_killable(&wdata->done); + if (rc) + rc = -EINTR; + else if (wdata->result) + rc = wdata->result; + else + total_written += wdata->bytes; + + /* resend call if it's a retryable error */ + if (rc == -EAGAIN) { + struct list_head tmp_list; + struct iov_iter tmp_from; + + INIT_LIST_HEAD(&tmp_list); + list_del_init(&wdata->list); + + memcpy(&tmp_from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(&tmp_from, + wdata->offset - iocb->ki_pos); + + rc = cifs_write_from_iter(wdata->offset, + wdata->bytes, &tmp_from, + open_file, cifs_sb, &tmp_list); + + list_splice(&tmp_list, &wdata_list); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); + goto restart_loop; + } + } + list_del_init(&wdata->list); + kref_put(&wdata->refcount, cifs_uncached_writedata_release); + } + + if (unlikely(!total_written)) + return rc; + + iocb->ki_pos += total_written; + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags); + cifs_stats_bytes_written(tcon, total_written); + return total_written; +} + +static ssize_t +cifs_writev(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct inode *inode = file->f_mapping->host; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + ssize_t rc; + + /* + * We need to hold the sem to be sure nobody modifies lock list + * with a brlock that prevents writing. + */ + down_read(&cinode->lock_sem); + mutex_lock(&inode->i_mutex); + + rc = generic_write_checks(iocb, from); + if (rc <= 0) + goto out; + + if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), + server->vals->exclusive_lock_type, NULL, + CIFS_WRITE_OP)) + rc = __generic_file_write_iter(iocb, from); + else + rc = -EACCES; +out: + mutex_unlock(&inode->i_mutex); + + if (rc > 0) { + ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) + rc = err; + } + up_read(&cinode->lock_sem); + return rc; +} + +ssize_t +cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = (struct cifsFileInfo *) + iocb->ki_filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + ssize_t written; + + written = cifs_get_writer(cinode); + if (written) + return written; + + if (CIFS_CACHE_WRITE(cinode)) { + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_write_iter(iocb, from); + goto out; + } + written = cifs_writev(iocb, from); + goto out; + } + /* + * For non-oplocked files in strict cache mode we need to write the data + * to the server exactly from the pos to pos+len-1 rather than flush all + * affected pages because it may cause a error with mandatory locks on + * these pages but not on the region from pos to ppos+len-1. + */ + written = cifs_user_writev(iocb, from); + if (written > 0 && CIFS_CACHE_READ(cinode)) { + /* + * Windows 7 server can delay breaking level2 oplock if a write + * request comes - break it on the client to prevent reading + * an old data. + */ + cifs_zap_mapping(inode); + cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", + inode); + cinode->oplock = 0; + } +out: + cifs_put_writer(cinode); + return written; +} + +static struct cifs_readdata * +cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) +{ + struct cifs_readdata *rdata; + + rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages), + GFP_KERNEL); + if (rdata != NULL) { + kref_init(&rdata->refcount); + INIT_LIST_HEAD(&rdata->list); + init_completion(&rdata->done); + INIT_WORK(&rdata->work, complete); + } + + return rdata; +} + +void +cifs_readdata_release(struct kref *refcount) +{ + struct cifs_readdata *rdata = container_of(refcount, + struct cifs_readdata, refcount); + + if (rdata->cfile) + cifsFileInfo_put(rdata->cfile); + + kfree(rdata); +} + +static int +cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) +{ + int rc = 0; + struct page *page; + unsigned int i; + + for (i = 0; i < nr_pages; i++) { + page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!page) { + rc = -ENOMEM; + break; + } + rdata->pages[i] = page; + } + + if (rc) { + for (i = 0; i < nr_pages; i++) { + put_page(rdata->pages[i]); + rdata->pages[i] = NULL; + } + } + return rc; +} + +static void +cifs_uncached_readdata_release(struct kref *refcount) +{ + struct cifs_readdata *rdata = container_of(refcount, + struct cifs_readdata, refcount); + unsigned int i; + + for (i = 0; i < rdata->nr_pages; i++) { + put_page(rdata->pages[i]); + rdata->pages[i] = NULL; + } + cifs_readdata_release(refcount); +} + +/** + * cifs_readdata_to_iov - copy data from pages in response to an iovec + * @rdata: the readdata response with list of pages holding data + * @iter: destination for our data + * + * This function copies data from a list of pages in a readdata response into + * an array of iovecs. It will first calculate where the data should go + * based on the info in the readdata and then copy the data into that spot. + */ +static int +cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) +{ + size_t remaining = rdata->got_bytes; + unsigned int i; + + for (i = 0; i < rdata->nr_pages; i++) { + struct page *page = rdata->pages[i]; + size_t copy = min_t(size_t, remaining, PAGE_SIZE); + size_t written = copy_page_to_iter(page, 0, copy, iter); + remaining -= written; + if (written < copy && iov_iter_count(iter) > 0) + break; + } + return remaining ? -EFAULT : 0; +} + +static void +cifs_uncached_readv_complete(struct work_struct *work) +{ + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + + complete(&rdata->done); + kref_put(&rdata->refcount, cifs_uncached_readdata_release); +} + +static int +cifs_uncached_read_into_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, unsigned int len) +{ + int result = 0; + unsigned int i; + unsigned int nr_pages = rdata->nr_pages; + struct kvec iov; + + rdata->got_bytes = 0; + rdata->tailsz = PAGE_SIZE; + for (i = 0; i < nr_pages; i++) { + struct page *page = rdata->pages[i]; + + if (len >= PAGE_SIZE) { + /* enough data to fill the page */ + iov.iov_base = kmap(page); + iov.iov_len = PAGE_SIZE; + cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n", + i, iov.iov_base, iov.iov_len); + len -= PAGE_SIZE; + } else if (len > 0) { + /* enough for partial page, fill and zero the rest */ + iov.iov_base = kmap(page); + iov.iov_len = len; + cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n", + i, iov.iov_base, iov.iov_len); + memset(iov.iov_base + len, '\0', PAGE_SIZE - len); + rdata->tailsz = len; + len = 0; + } else { + /* no need to hold page hostage */ + rdata->pages[i] = NULL; + rdata->nr_pages--; + put_page(page); + continue; + } + + result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len); + kunmap(page); + if (result < 0) + break; + + rdata->got_bytes += result; + } + + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; +} + +static int +cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) +{ + struct cifs_readdata *rdata; + unsigned int npages, rsize, credits; + size_t cur_len; + int rc; + pid_t pid; + struct TCP_Server_Info *server; + + server = tlink_tcon(open_file->tlink)->ses->server; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + do { + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; + + cur_len = min_t(const size_t, len, rsize); + npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); + + /* allocate a readdata struct */ + rdata = cifs_readdata_alloc(npages, + cifs_uncached_readv_complete); + if (!rdata) { + add_credits_and_wake_if(server, credits, 0); + rc = -ENOMEM; + break; + } + + rc = cifs_read_allocate_pages(rdata, npages); + if (rc) + goto error; + + rdata->cfile = cifsFileInfo_get(open_file); + rdata->nr_pages = npages; + rdata->offset = offset; + rdata->bytes = cur_len; + rdata->pid = pid; + rdata->pagesz = PAGE_SIZE; + rdata->read_into_pages = cifs_uncached_read_into_pages; + rdata->credits = credits; + + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); +error: + if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + if (rc == -EAGAIN) + continue; + break; + } + + list_add_tail(&rdata->list, rdata_list); + offset += cur_len; + len -= cur_len; + } while (len > 0); + + return rc; +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *open_file; + struct cifs_readdata *rdata, *tmp; + struct list_head rdata_list; + + len = iov_iter_count(to); + if (!len) + return 0; + + INIT_LIST_HEAD(&rdata_list); + cifs_sb = CIFS_FILE_SB(file); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list); + + /* if at least one read request send succeeded, then reset rc */ + if (!list_empty(&rdata_list)) + rc = 0; + + len = iov_iter_count(to); + /* the loop below should proceed in the order of increasing offsets */ +again: + list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + if (!rc) { + /* FIXME: freezable sleep too? */ + rc = wait_for_completion_killable(&rdata->done); + if (rc) + rc = -EINTR; + else if (rdata->result == -EAGAIN) { + /* resend call if it's a retryable error */ + struct list_head tmp_list; + unsigned int got_bytes = rdata->got_bytes; + + list_del_init(&rdata->list); + INIT_LIST_HEAD(&tmp_list); + + /* + * Got a part of data and then reconnect has + * happened -- fill the buffer and continue + * reading. + */ + if (got_bytes && got_bytes < rdata->bytes) { + rc = cifs_readdata_to_iov(rdata, to); + if (rc) { + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + continue; + } + } + + rc = cifs_send_async_read( + rdata->offset + got_bytes, + rdata->bytes - got_bytes, + rdata->cfile, cifs_sb, + &tmp_list); + + list_splice(&tmp_list, &rdata_list); + + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + goto again; + } else if (rdata->result) + rc = rdata->result; + else + rc = cifs_readdata_to_iov(rdata, to); + + /* if there was a short read -- discard anything left */ + if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) + rc = -ENODATA; + } + list_del_init(&rdata->list); + kref_put(&rdata->refcount, cifs_uncached_readdata_release); + } + + total_read = len - iov_iter_count(to); + + cifs_stats_bytes_read(tcon, total_read); + + /* mask nodata case */ + if (rc == -ENODATA) + rc = 0; + + if (total_read) { + iocb->ki_pos += total_read; + return total_read; + } + return rc; +} + +ssize_t +cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = (struct cifsFileInfo *) + iocb->ki_filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + int rc = -EACCES; + + /* + * In strict cache mode we need to read from the server all the time + * if we don't have level II oplock because the server can delay mtime + * change - so we can't make a decision about inode invalidating. + * And we can also fail with pagereading if there are mandatory locks + * on pages affected by this read but not on the region from pos to + * pos+len-1. + */ + if (!CIFS_CACHE_READ(cinode)) + return cifs_user_readv(iocb, to); + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return generic_file_read_iter(iocb, to); + + /* + * We need to hold the sem to be sure nobody modifies lock list + * with a brlock that prevents reading. + */ + down_read(&cinode->lock_sem); + if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to), + tcon->ses->server->vals->shared_lock_type, + NULL, CIFS_READ_OP)) + rc = generic_file_read_iter(iocb, to); + up_read(&cinode->lock_sem); + return rc; +} + +static ssize_t +cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) +{ + int rc = -EACCES; + unsigned int bytes_read = 0; + unsigned int total_read; + unsigned int current_read_size; + unsigned int rsize; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + unsigned int xid; + char *cur_offset; + struct cifsFileInfo *open_file; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + __u32 pid; + + xid = get_xid(); + cifs_sb = CIFS_FILE_SB(file); + + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + + if (file->private_data == NULL) { + rc = -EBADF; + free_xid(xid); + return rc; + } + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + server = tcon->ses->server; + + if (!server->ops->sync_read) { + free_xid(xid); + return -ENOSYS; + } + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + for (total_read = 0, cur_offset = read_data; read_size > total_read; + total_read += bytes_read, cur_offset += bytes_read) { + do { + current_read_size = min_t(uint, read_size - total_read, + rsize); + /* + * For windows me and 9x we do not want to request more + * than it negotiated since it will refuse the read + * then. + */ + if ((tcon->ses) && !(tcon->ses->capabilities & + tcon->ses->server->vals->cap_large_files)) { + current_read_size = min_t(uint, + current_read_size, CIFSMaxBufSize); + } + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc != 0) + break; + } + io_parms.pid = pid; + io_parms.tcon = tcon; + io_parms.offset = *offset; + io_parms.length = current_read_size; + rc = server->ops->sync_read(xid, &open_file->fid, &io_parms, + &bytes_read, &cur_offset, + &buf_type); + } while (rc == -EAGAIN); + + if (rc || (bytes_read == 0)) { + if (total_read) { + break; + } else { + free_xid(xid); + return rc; + } + } else { + cifs_stats_bytes_read(tcon, total_read); + *offset += bytes_read; + } + } + free_xid(xid); + return total_read; +} + +/* + * If the page is mmap'ed into a process' page tables, then we need to make + * sure that it doesn't change while being written back. + */ +static int +cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + + lock_page(page); + return VM_FAULT_LOCKED; +} + +static struct vm_operations_struct cifs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = cifs_page_mkwrite, +}; + +int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc, xid; + struct inode *inode = file_inode(file); + + xid = get_xid(); + + if (!CIFS_CACHE_READ(CIFS_I(inode))) { + rc = cifs_zap_mapping(inode); + if (rc) + return rc; + } + + rc = generic_file_mmap(file, vma); + if (rc == 0) + vma->vm_ops = &cifs_file_vm_ops; + free_xid(xid); + return rc; +} + +int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc, xid; + + xid = get_xid(); + rc = cifs_revalidate_file(file); + if (rc) { + cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n", + rc); + free_xid(xid); + return rc; + } + rc = generic_file_mmap(file, vma); + if (rc == 0) + vma->vm_ops = &cifs_file_vm_ops; + free_xid(xid); + return rc; +} + +static void +cifs_readv_complete(struct work_struct *work) +{ + unsigned int i, got_bytes; + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + + got_bytes = rdata->got_bytes; + for (i = 0; i < rdata->nr_pages; i++) { + struct page *page = rdata->pages[i]; + + lru_cache_add_file(page); + + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) { + flush_dcache_page(page); + SetPageUptodate(page); + } + + unlock_page(page); + + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) + cifs_readpage_to_fscache(rdata->mapping->host, page); + + got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes); + + page_cache_release(page); + rdata->pages[i] = NULL; + } + kref_put(&rdata->refcount, cifs_readdata_release); +} + +static int +cifs_readpages_read_into_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, unsigned int len) +{ + int result = 0; + unsigned int i; + u64 eof; + pgoff_t eof_index; + unsigned int nr_pages = rdata->nr_pages; + struct kvec iov; + + /* determine the eof that the server (probably) has */ + eof = CIFS_I(rdata->mapping->host)->server_eof; + eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; + cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); + + rdata->got_bytes = 0; + rdata->tailsz = PAGE_CACHE_SIZE; + for (i = 0; i < nr_pages; i++) { + struct page *page = rdata->pages[i]; + + if (len >= PAGE_CACHE_SIZE) { + /* enough data to fill the page */ + iov.iov_base = kmap(page); + iov.iov_len = PAGE_CACHE_SIZE; + cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n", + i, page->index, iov.iov_base, iov.iov_len); + len -= PAGE_CACHE_SIZE; + } else if (len > 0) { + /* enough for partial page, fill and zero the rest */ + iov.iov_base = kmap(page); + iov.iov_len = len; + cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n", + i, page->index, iov.iov_base, iov.iov_len); + memset(iov.iov_base + len, + '\0', PAGE_CACHE_SIZE - len); + rdata->tailsz = len; + len = 0; + } else if (page->index > eof_index) { + /* + * The VFS will not try to do readahead past the + * i_size, but it's possible that we have outstanding + * writes with gaps in the middle and the i_size hasn't + * caught up yet. Populate those with zeroed out pages + * to prevent the VFS from repeatedly attempting to + * fill them until the writes are flushed. + */ + zero_user(page, 0, PAGE_CACHE_SIZE); + lru_cache_add_file(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + rdata->pages[i] = NULL; + rdata->nr_pages--; + continue; + } else { + /* no need to hold page hostage */ + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + rdata->pages[i] = NULL; + rdata->nr_pages--; + continue; + } + + result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len); + kunmap(page); + if (result < 0) + break; + + rdata->got_bytes += result; + } + + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; +} + +static int +readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + unsigned int rsize, struct list_head *tmplist, + unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) +{ + struct page *page, *tpage; + unsigned int expected_index; + int rc; + + INIT_LIST_HEAD(tmplist); + + page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + return rc; + } + + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + *bytes = PAGE_CACHE_SIZE; + *nr_pages = 1; + list_move_tail(&page->lru, tmplist); + + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? */ + if (*bytes + PAGE_CACHE_SIZE > rsize) + break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, page->index, + GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_CACHE_SIZE; + expected_index++; + (*nr_pages)++; + } + return rc; +} + +static int cifs_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned num_pages) +{ + int rc; + struct list_head tmplist; + struct cifsFileInfo *open_file = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct TCP_Server_Info *server; + pid_t pid; + + /* + * Reads as many pages as possible from fscache. Returns -ENOBUFS + * immediately if the cookie is negative + * + * After this point, every page in the list might have PG_fscache set, + * so we will need to clean that up off of every page we don't use. + */ + rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, + &num_pages); + if (rc == 0) + return rc; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + rc = 0; + server = tlink_tcon(open_file->tlink)->ses->server; + + cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", + __func__, file, mapping, num_pages); + + /* + * Start with the page at end of list and move it to private + * list. Do the same with any following pages until we hit + * the rsize limit, hit an index discontinuity, or run out of + * pages. Issue the async read and then start the loop again + * until the list is empty. + * + * Note that list order is important. The page_list is in + * the order of declining indexes. When we put the pages in + * the rdata->pages, then we want them in increasing order. + */ + while (!list_empty(page_list)) { + unsigned int i, nr_pages, bytes, rsize; + loff_t offset; + struct page *page, *tpage; + struct cifs_readdata *rdata; + unsigned credits; + + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; + + /* + * Give up immediately if rsize is too small to read an entire + * page. The VFS will fall back to readpage. We should never + * reach this point however since we set ra_pages to 0 when the + * rsize is smaller than a cache page. + */ + if (unlikely(rsize < PAGE_CACHE_SIZE)) { + add_credits_and_wake_if(server, credits, 0); + return 0; + } + + rc = readpages_get_pages(mapping, page_list, rsize, &tmplist, + &nr_pages, &offset, &bytes); + if (rc) { + add_credits_and_wake_if(server, credits, 0); + break; + } + + rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); + if (!rdata) { + /* best to give up if we're out of mem */ + list_for_each_entry_safe(page, tpage, &tmplist, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; + } + + rdata->cfile = cifsFileInfo_get(open_file); + rdata->mapping = mapping; + rdata->offset = offset; + rdata->bytes = bytes; + rdata->pid = pid; + rdata->pagesz = PAGE_CACHE_SIZE; + rdata->read_into_pages = cifs_readpages_read_into_pages; + rdata->credits = credits; + + list_for_each_entry_safe(page, tpage, &tmplist, lru) { + list_del(&page->lru); + rdata->pages[rdata->nr_pages++] = page; + } + + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); + if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); + for (i = 0; i < rdata->nr_pages; i++) { + page = rdata->pages[i]; + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + /* Fallback to the readpage in error/reconnect cases */ + kref_put(&rdata->refcount, cifs_readdata_release); + break; + } + + kref_put(&rdata->refcount, cifs_readdata_release); + } + + /* Any pages that have been shown to fscache but didn't get added to + * the pagecache must be uncached before they get returned to the + * allocator. + */ + cifs_fscache_readpages_cancel(mapping->host, page_list); + return rc; +} + +/* + * cifs_readpage_worker must be called with the page pinned + */ +static int cifs_readpage_worker(struct file *file, struct page *page, + loff_t *poffset) +{ + char *read_data; + int rc; + + /* Is the page cached? */ + rc = cifs_readpage_from_fscache(file_inode(file), page); + if (rc == 0) + goto read_complete; + + read_data = kmap(page); + /* for reads over a certain size could initiate async read ahead */ + + rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset); + + if (rc < 0) + goto io_error; + else + cifs_dbg(FYI, "Bytes read %d\n", rc); + + file_inode(file)->i_atime = + current_fs_time(file_inode(file)->i_sb); + + if (PAGE_CACHE_SIZE > rc) + memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc); + + flush_dcache_page(page); + SetPageUptodate(page); + + /* send this page to the cache */ + cifs_readpage_to_fscache(file_inode(file), page); + + rc = 0; + +io_error: + kunmap(page); + unlock_page(page); + +read_complete: + return rc; +} + +static int cifs_readpage(struct file *file, struct page *page) +{ + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + int rc = -EACCES; + unsigned int xid; + + xid = get_xid(); + + if (file->private_data == NULL) { + rc = -EBADF; + free_xid(xid); + return rc; + } + + cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n", + page, (int)offset, (int)offset); + + rc = cifs_readpage_worker(file, page, &offset); + + free_xid(xid); + return rc; +} + +static int is_inode_writable(struct cifsInodeInfo *cifs_inode) +{ + struct cifsFileInfo *open_file; + + spin_lock(&cifs_file_list_lock); + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { + spin_unlock(&cifs_file_list_lock); + return 1; + } + } + spin_unlock(&cifs_file_list_lock); + return 0; +} + +/* We do not want to update the file size from server for inodes + open for write - to avoid races with writepage extending + the file - in the future we could consider allowing + refreshing the inode only on increases in the file size + but this is tricky to do without racing with writebehind + page caching in the current Linux kernel design */ +bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) +{ + if (!cifsInode) + return true; + + if (is_inode_writable(cifsInode)) { + /* This inode is open for write at least once */ + struct cifs_sb_info *cifs_sb; + + cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { + /* since no page cache to corrupt on directio + we can change size safely */ + return true; + } + + if (i_size_read(&cifsInode->vfs_inode) < end_of_file) + return true; + + return false; + } else + return true; +} + +static int cifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int oncethru = 0; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + loff_t offset = pos & (PAGE_CACHE_SIZE - 1); + loff_t page_start = pos & PAGE_MASK; + loff_t i_size; + struct page *page; + int rc = 0; + + cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len); + +start: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + rc = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) + goto out; + + /* + * If we write a full page it will be up to date, no need to read from + * the server. If the write is short, we'll end up doing a sync write + * instead. + */ + if (len == PAGE_CACHE_SIZE) + goto out; + + /* + * optimize away the read when we have an oplock, and we're not + * expecting to use any of the data we'd be reading in. That + * is, when the page lies beyond the EOF, or straddles the EOF + * and the write will cover all of the existing data. + */ + if (CIFS_CACHE_READ(CIFS_I(mapping->host))) { + i_size = i_size_read(mapping->host); + if (page_start >= i_size || + (offset == 0 && (pos + len) >= i_size)) { + zero_user_segments(page, 0, offset, + offset + len, + PAGE_CACHE_SIZE); + /* + * PageChecked means that the parts of the page + * to which we're not writing are considered up + * to date. Once the data is copied to the + * page, it can be set uptodate. + */ + SetPageChecked(page); + goto out; + } + } + + if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) { + /* + * might as well read a page, it is fast enough. If we get + * an error, we don't need to return it. cifs_write_end will + * do a sync write instead since PG_uptodate isn't set. + */ + cifs_readpage_worker(file, page, &page_start); + page_cache_release(page); + oncethru = 1; + goto start; + } else { + /* we could try using another file handle if there is one - + but how would we lock it to prevent close of that handle + racing with this read? In any case + this will be written out by write_end so is fine */ + } +out: + *pagep = page; + return rc; +} + +static int cifs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + + return cifs_fscache_release_page(page, gfp); +} + +static void cifs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) +{ + struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); + + if (offset == 0 && length == PAGE_CACHE_SIZE) + cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); +} + +static int cifs_launder_page(struct page *page) +{ + int rc = 0; + loff_t range_start = page_offset(page); + loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_end, + }; + + cifs_dbg(FYI, "Launder page: %p\n", page); + + if (clear_page_dirty_for_io(page)) + rc = cifs_writepage_locked(page, &wbc); + + cifs_fscache_invalidate_page(page, page->mapping->host); + return rc; +} + +void cifs_oplock_break(struct work_struct *work) +{ + struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, + oplock_break); + struct inode *inode = d_inode(cfile->dentry); + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + int rc = 0; + + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && + cifs_has_mand_locks(cinode)) { + cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", + inode); + cinode->oplock = 0; + } + + if (inode && S_ISREG(inode->i_mode)) { + if (CIFS_CACHE_READ(cinode)) + break_lease(inode, O_RDONLY); + else + break_lease(inode, O_WRONLY); + rc = filemap_fdatawrite(inode->i_mapping); + if (!CIFS_CACHE_READ(cinode)) { + rc = filemap_fdatawait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + cifs_zap_mapping(inode); + } + cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc); + } + + rc = cifs_push_locks(cfile); + if (rc) + cifs_dbg(VFS, "Push locks rc = %d\n", rc); + + /* + * releasing stale oplock after recent reconnect of smb session using + * a now incorrect file handle is not a data integrity issue but do + * not bother sending an oplock release if session to server still is + * disconnected since oplock already released by the server + */ + if (!cfile->oplock_break_cancelled) { + rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid, + cinode); + cifs_dbg(FYI, "Oplock release rc = %d\n", rc); + } + cifs_done_oplock_break(cinode); +} + +/* + * The presence of cifs_direct_io() in the address space ops vector + * allowes open() O_DIRECT flags which would have failed otherwise. + * + * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests + * so this method should never be called. + * + * Direct IO is not yet supported in the cached mode. + */ +static ssize_t +cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +{ + /* + * FIXME + * Eventually need to support direct IO for non forcedirectio mounts + */ + return -EINVAL; +} + + +const struct address_space_operations cifs_addr_ops = { + .readpage = cifs_readpage, + .readpages = cifs_readpages, + .writepage = cifs_writepage, + .writepages = cifs_writepages, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, + .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .direct_IO = cifs_direct_io, + .invalidatepage = cifs_invalidate_page, + .launder_page = cifs_launder_page, +}; + +/* + * cifs_readpages requires the server to support a buffer large enough to + * contain the header plus one complete page of data. Otherwise, we need + * to leave cifs_readpages out of the address space operations. + */ +const struct address_space_operations cifs_addr_ops_smallbuf = { + .readpage = cifs_readpage, + .writepage = cifs_writepage, + .writepages = cifs_writepages, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, + .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, + .launder_page = cifs_launder_page, +}; diff --git a/kernel/fs/cifs/fscache.c b/kernel/fs/cifs/fscache.c new file mode 100644 index 000000000..8d4b7bc8a --- /dev/null +++ b/kernel/fs/cifs/fscache.c @@ -0,0 +1,242 @@ +/* + * fs/cifs/fscache.c - CIFS filesystem cache interface + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) +{ + server->fscache = + fscache_acquire_cookie(cifs_fscache_netfs.primary_index, + &cifs_fscache_server_index_def, server, true); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server, server->fscache); +} + +void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) +{ + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server, server->fscache); + fscache_relinquish_cookie(server->fscache, 0); + server->fscache = NULL; +} + +void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) +{ + struct TCP_Server_Info *server = tcon->ses->server; + + tcon->fscache = + fscache_acquire_cookie(server->fscache, + &cifs_fscache_super_index_def, tcon, true); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server->fscache, tcon->fscache); +} + +void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) +{ + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache); + fscache_relinquish_cookie(tcon->fscache, 0); + tcon->fscache = NULL; +} + +static void cifs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + if (cifsi->fscache) + return; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { + cifsi->fscache = fscache_acquire_cookie(tcon->fscache, + &cifs_fscache_inode_object_def, cifsi, true); + cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", + __func__, tcon->fscache, cifsi->fscache); + } +} + +void cifs_fscache_release_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 0); + cifsi->fscache = NULL; + } +} + +static void cifs_fscache_disable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); + fscache_uncache_all_inode_pages(cifsi->fscache, inode); + fscache_relinquish_cookie(cifsi->fscache, 1); + cifsi->fscache = NULL; + } +} + +void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + cifs_fscache_disable_inode_cookie(inode); + else + cifs_fscache_enable_inode_cookie(inode); +} + +void cifs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct fscache_cookie *old = cifsi->fscache; + + if (cifsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(cifsi->fscache, 1); + + cifsi->fscache = fscache_acquire_cookie( + cifs_sb_master_tcon(cifs_sb)->fscache, + &cifs_fscache_inode_object_def, + cifsi, true); + cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", + __func__, cifsi->fscache, old); + } +} + +int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct inode *inode = page->mapping->host; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, page, cifsi->fscache); + if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) + return 0; + } + + return 1; +} + +static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error); + if (!error) + SetPageUptodate(page); + unlock_page(page); +} + +/* + * Retrieve a page from FS-Cache + */ +int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", + __func__, CIFS_I(inode)->fscache, page, inode); + ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, + cifs_readpage_from_fscache_complete, + NULL, + GFP_KERNEL); + switch (ret) { + + case 0: /* page found in fscache, read submitted */ + cifs_dbg(FYI, "%s: submitted\n", __func__); + return ret; + case -ENOBUFS: /* page won't be cached */ + case -ENODATA: /* page not in cache */ + cifs_dbg(FYI, "%s: %d\n", __func__, ret); + return 1; + + default: + cifs_dbg(VFS, "unknown error ret = %d\n", ret); + } + return ret; +} + +/* + * Retrieve a set of pages from FS-Cache + */ +int __cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + + cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n", + __func__, CIFS_I(inode)->fscache, *nr_pages, inode); + ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, + pages, nr_pages, + cifs_readpage_from_fscache_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + cifs_dbg(FYI, "%s: submitted\n", __func__); + return ret; + + case -ENOBUFS: /* some pages are not cached and can't be */ + case -ENODATA: /* some pages are not cached */ + cifs_dbg(FYI, "%s: no page\n", __func__); + return 1; + + default: + cifs_dbg(FYI, "unknown error ret = %d\n", ret); + } + + return ret; +} + +void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", + __func__, CIFS_I(inode)->fscache, page, inode); + ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); + if (ret != 0) + fscache_uncache_page(CIFS_I(inode)->fscache, page); +} + +void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) +{ + cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n", + __func__, CIFS_I(inode)->fscache, inode); + fscache_readpages_cancel(CIFS_I(inode)->fscache, pages); +} + +void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifsi->fscache; + + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); + fscache_wait_on_page_write(cookie, page); + fscache_uncache_page(cookie, page); +} + diff --git a/kernel/fs/cifs/fscache.h b/kernel/fs/cifs/fscache.h new file mode 100644 index 000000000..24794b6cd --- /dev/null +++ b/kernel/fs/cifs/fscache.h @@ -0,0 +1,149 @@ +/* + * fs/cifs/fscache.h - CIFS filesystem cache interface definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFS_FSCACHE_H +#define _CIFS_FSCACHE_H + +#include + +#include "cifsglob.h" + +#ifdef CONFIG_CIFS_FSCACHE + +extern struct fscache_netfs cifs_fscache_netfs; +extern const struct fscache_cookie_def cifs_fscache_server_index_def; +extern const struct fscache_cookie_def cifs_fscache_super_index_def; +extern const struct fscache_cookie_def cifs_fscache_inode_object_def; + +extern int cifs_fscache_register(void); +extern void cifs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_get_super_cookie(struct cifs_tcon *); +extern void cifs_fscache_release_super_cookie(struct cifs_tcon *); + +extern void cifs_fscache_release_inode_cookie(struct inode *); +extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void cifs_fscache_reset_inode_cookie(struct inode *); + +extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); +extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); +extern int __cifs_readpage_from_fscache(struct inode *, struct page *); +extern int __cifs_readpages_from_fscache(struct inode *, + struct address_space *, + struct list_head *, + unsigned *); +extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *); + +extern void __cifs_readpage_to_fscache(struct inode *, struct page *); + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __cifs_fscache_invalidate_page(page, inode); +} + +static inline int cifs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpage_from_fscache(inode, page); + + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpages_from_fscache(inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __cifs_readpage_to_fscache(inode, page); +} + +static inline void cifs_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ + if (CIFS_I(inode)->fscache) + return __cifs_fscache_readpages_cancel(inode, pages); +} + +#else /* CONFIG_CIFS_FSCACHE */ +static inline int cifs_fscache_register(void) { return 0; } +static inline void cifs_fscache_unregister(void) {} + +static inline void +cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} +static inline void +cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} +static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {} +static inline void +cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} + +static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* May release page */ +} + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline int +cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) {} + +static inline void cifs_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ +} + +#endif /* CONFIG_CIFS_FSCACHE */ + +#endif /* _CIFS_FSCACHE_H */ diff --git a/kernel/fs/cifs/inode.c b/kernel/fs/cifs/inode.c new file mode 100644 index 000000000..f621b44cb --- /dev/null +++ b/kernel/fs/cifs/inode.c @@ -0,0 +1,2454 @@ +/* + * fs/cifs/inode.c + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" +#include "fscache.h" + + +static void cifs_set_ops(struct inode *inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &cifs_file_inode_ops; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_direct_nobrl_ops; + else + inode->i_fop = &cifs_file_direct_ops; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_strict_nobrl_ops; + else + inode->i_fop = &cifs_file_strict_ops; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_nobrl_ops; + else { /* not direct, send byte range locks */ + inode->i_fop = &cifs_file_ops; + } + + /* check if server can support readpages */ + if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf < + PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) + inode->i_data.a_ops = &cifs_addr_ops_smallbuf; + else + inode->i_data.a_ops = &cifs_addr_ops; + break; + case S_IFDIR: +#ifdef CONFIG_CIFS_DFS_UPCALL + if (IS_AUTOMOUNT(inode)) { + inode->i_op = &cifs_dfs_referral_inode_operations; + } else { +#else /* NO DFS support, treat as a directory */ + { +#endif + inode->i_op = &cifs_dir_inode_ops; + inode->i_fop = &cifs_dir_ops; + } + break; + case S_IFLNK: + inode->i_op = &cifs_symlink_inode_ops; + break; + default: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } +} + +/* check inode attributes against fattr. If they don't match, tag the + * inode for cache invalidation + */ +static void +cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + + cifs_dbg(FYI, "%s: revalidating inode %llu\n", + __func__, cifs_i->uniqueid); + + if (inode->i_state & I_NEW) { + cifs_dbg(FYI, "%s: inode %llu is new\n", + __func__, cifs_i->uniqueid); + return; + } + + /* don't bother with revalidation if we have an oplock */ + if (CIFS_CACHE_READ(cifs_i)) { + cifs_dbg(FYI, "%s: inode %llu is oplocked\n", + __func__, cifs_i->uniqueid); + return; + } + + /* revalidate if mtime or size have changed */ + if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && + cifs_i->server_eof == fattr->cf_eof) { + cifs_dbg(FYI, "%s: inode %llu is unchanged\n", + __func__, cifs_i->uniqueid); + return; + } + + cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n", + __func__, cifs_i->uniqueid); + set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); +} + +/* + * copy nlink to the inode, unless it wasn't provided. Provide + * sane values if we don't have an existing one and none was provided + */ +static void +cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) +{ + /* + * if we're in a situation where we can't trust what we + * got from the server (readdir, some non-unix cases) + * fake reasonable values + */ + if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) { + /* only provide fake values on a new inode */ + if (inode->i_state & I_NEW) { + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) + set_nlink(inode, 2); + else + set_nlink(inode, 1); + } + return; + } + + /* we trust the server, so update it */ + set_nlink(inode, fattr->cf_nlink); +} + +/* populate an inode with info from a cifs_fattr struct */ +void +cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + cifs_revalidate_cache(inode, fattr); + + spin_lock(&inode->i_lock); + inode->i_atime = fattr->cf_atime; + inode->i_mtime = fattr->cf_mtime; + inode->i_ctime = fattr->cf_ctime; + inode->i_rdev = fattr->cf_rdev; + cifs_nlink_fattr_to_inode(inode, fattr); + inode->i_uid = fattr->cf_uid; + inode->i_gid = fattr->cf_gid; + + /* if dynperm is set, don't clobber existing mode */ + if (inode->i_state & I_NEW || + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) + inode->i_mode = fattr->cf_mode; + + cifs_i->cifsAttrs = fattr->cf_cifsattrs; + + if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) + cifs_i->time = 0; + else + cifs_i->time = jiffies; + + if (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + set_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); + else + clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); + + cifs_i->server_eof = fattr->cf_eof; + /* + * Can't safely change the file size here if the client is writing to + * it due to potential races. + */ + if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) { + i_size_write(inode, fattr->cf_eof); + + /* + * i_blocks is not related to (i_size / i_blksize), + * but instead 512 byte (2**9) size is required for + * calculating num blocks. + */ + inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; + } + spin_unlock(&inode->i_lock); + + if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) + inode->i_flags |= S_AUTOMOUNT; + if (inode->i_state & I_NEW) + cifs_set_ops(inode); +} + +void +cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + return; + + fattr->cf_uniqueid = iunique(sb, ROOT_I); +} + +/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */ +void +cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_uniqueid = le64_to_cpu(info->UniqueId); + fattr->cf_bytes = le64_to_cpu(info->NumOfBytes); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange); + fattr->cf_mode = le64_to_cpu(info->Permissions); + + /* + * Since we set the inode type below we need to mask off + * to avoid strange results if bits set above. + */ + fattr->cf_mode &= ~S_IFMT; + switch (le32_to_cpu(info->Type)) { + case UNIX_FILE: + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + break; + case UNIX_SYMLINK: + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + break; + case UNIX_DIR: + fattr->cf_mode |= S_IFDIR; + fattr->cf_dtype = DT_DIR; + break; + case UNIX_CHARDEV: + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor), + le64_to_cpu(info->DevMinor) & MINORMASK); + break; + case UNIX_BLOCKDEV: + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor), + le64_to_cpu(info->DevMinor) & MINORMASK); + break; + case UNIX_FIFO: + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + break; + case UNIX_SOCKET: + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; + break; + default: + /* safest to call it a file if we do not know */ + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + cifs_dbg(FYI, "unknown type %d\n", le32_to_cpu(info->Type)); + break; + } + + fattr->cf_uid = cifs_sb->mnt_uid; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) { + u64 id = le64_to_cpu(info->Uid); + if (id < ((uid_t)-1)) { + kuid_t uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) + fattr->cf_uid = uid; + } + } + + fattr->cf_gid = cifs_sb->mnt_gid; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) { + u64 id = le64_to_cpu(info->Gid); + if (id < ((gid_t)-1)) { + kgid_t gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) + fattr->cf_gid = gid; + } + } + + fattr->cf_nlink = le64_to_cpu(info->Nlinks); +} + +/* + * Fill a cifs_fattr struct with fake inode info. + * + * Needed to setup cifs_fattr data for the directory which is the + * junction to the new submount (ie to setup the fake directory + * which represents a DFS referral). + */ +static void +cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cifs_dbg(FYI, "creating fake fattr for DFS referral\n"); + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; + fattr->cf_atime = CURRENT_TIME; + fattr->cf_ctime = CURRENT_TIME; + fattr->cf_mtime = CURRENT_TIME; + fattr->cf_nlink = 2; + fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; +} + +static int +cifs_get_file_info_unix(struct file *filp) +{ + int rc; + unsigned int xid; + FILE_UNIX_BASIC_INFO find_data; + struct cifs_fattr fattr; + struct inode *inode = file_inode(filp); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + xid = get_xid(); + rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data); + if (!rc) { + cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + } + + cifs_fattr_to_inode(inode, &fattr); + free_xid(xid); + return rc; +} + +int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *full_path, + struct super_block *sb, unsigned int xid) +{ + int rc; + FILE_UNIX_BASIC_INFO find_data; + struct cifs_fattr fattr; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cifs_dbg(FYI, "Getting info on %s\n", full_path); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + /* could have done a find first instead but this returns more info */ + rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + cifs_put_tlink(tlink); + + if (!rc) { + cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, sb); + rc = 0; + } else { + return rc; + } + + /* check for Minshall+French symlinks */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, + full_path); + if (tmprc) + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); + } + + if (*pinode == NULL) { + /* get new inode */ + cifs_fill_uniqueid(sb, &fattr); + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) + rc = -ENOMEM; + } else { + /* we already have inode, update it */ + + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgiiu_exit; + } + + /* if filetype is different, return error */ + if (unlikely(((*pinode)->i_mode & S_IFMT) != + (fattr.cf_mode & S_IFMT))) { + rc = -ESTALE; + goto cgiiu_exit; + } + + cifs_fattr_to_inode(*pinode, &fattr); + } + +cgiiu_exit: + return rc; +} + +static int +cifs_sfu_type(struct cifs_fattr *fattr, const char *path, + struct cifs_sb_info *cifs_sb, unsigned int xid) +{ + int rc; + __u32 oplock; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + char buf[24]; + unsigned int bytes_read; + char *pbuf; + int buf_type = CIFS_NO_BUFFER; + + pbuf = buf; + + fattr->cf_mode &= ~S_IFMT; + + if (fattr->cf_eof == 0) { + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + return 0; + } else if (fattr->cf_eof < 8) { + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + return -EINVAL; /* EOPNOTSUPP? */ + } + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); + if (rc) { + cifs_dbg(FYI, "check sfu type of %s, open rc = %d\n", path, rc); + cifs_put_tlink(tlink); + return rc; + } + + /* Read header */ + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = 24; + + rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms, + &bytes_read, &pbuf, &buf_type); + if ((rc == 0) && (bytes_read >= 8)) { + if (memcmp("IntxBLK", pbuf, 8) == 0) { + cifs_dbg(FYI, "Block device\n"); + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxCHR", pbuf, 8) == 0) { + cifs_dbg(FYI, "Char device\n"); + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxLNK", pbuf, 7) == 0) { + cifs_dbg(FYI, "Symlink\n"); + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + } else { + fattr->cf_mode |= S_IFREG; /* file? */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; + } + } else { + fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; /* or some unknown SFU type */ + } + + tcon->ses->server->ops->close(xid, tcon, &fid); + cifs_put_tlink(tlink); + return rc; +} + +#define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID) /* SETFILEBITS valid bits */ + +/* + * Fetch mode bits as provided by SFU. + * + * FIXME: Doesn't this clobber the type bit we got from cifs_sfu_type ? + */ +static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, + struct cifs_sb_info *cifs_sb, unsigned int xid) +{ +#ifdef CONFIG_CIFS_XATTR + ssize_t rc; + char ea_value[4]; + __u32 mode; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + if (tcon->ses->server->ops->query_all_EAs == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, + "SETFILEBITS", ea_value, 4 /* size of buf */, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + cifs_put_tlink(tlink); + if (rc < 0) + return (int)rc; + else if (rc > 3) { + mode = le32_to_cpu(*((__le32 *)ea_value)); + fattr->cf_mode &= ~SFBITS_MASK; + cifs_dbg(FYI, "special bits 0%o org mode 0%o\n", + mode, fattr->cf_mode); + fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; + cifs_dbg(FYI, "special mode bits 0%o\n", mode); + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ +static void +cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, + struct cifs_sb_info *cifs_sb, bool adjust_tz, + bool symlink) +{ + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(info->Attributes); + if (info->DeletePending) + fattr->cf_flags |= CIFS_FATTR_DELETE_PENDING; + + if (info->LastAccessTime) + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + else + fattr->cf_atime = CURRENT_TIME; + + fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + + if (adjust_tz) { + fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj; + fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; + } + + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); + + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + + if (symlink) { + fattr->cf_mode = S_IFLNK; + fattr->cf_dtype = DT_LNK; + } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; + fattr->cf_dtype = DT_DIR; + /* + * Server can return wrong NumberOfLinks value for directories + * when Unix extensions are disabled - fake it. + */ + if (!tcon->unix_ext) + fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; + } else { + fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_REG; + + /* clear write bits if ATTR_READONLY is set */ + if (fattr->cf_cifsattrs & ATTR_READONLY) + fattr->cf_mode &= ~(S_IWUGO); + + /* + * Don't accept zero nlink from non-unix servers unless + * delete is pending. Instead mark it as unknown. + */ + if ((fattr->cf_nlink < 1) && !tcon->unix_ext && + !info->DeletePending) { + cifs_dbg(1, "bogus file nlink value %u\n", + fattr->cf_nlink); + fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; + } + } + + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; +} + +static int +cifs_get_file_info(struct file *filp) +{ + int rc; + unsigned int xid; + FILE_ALL_INFO find_data; + struct cifs_fattr fattr; + struct inode *inode = file_inode(filp); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + + if (!server->ops->query_file_info) + return -ENOSYS; + + xid = get_xid(); + rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false, + false); + break; + case -EREMOTE: + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + break; + case -EOPNOTSUPP: + case -EINVAL: + /* + * FIXME: legacy server -- fall back to path-based call? + * for now, just skip revalidating and mark inode for + * immediate reval. + */ + rc = 0; + CIFS_I(inode)->time = 0; + default: + goto cgfi_exit; + } + + /* + * don't bother with SFU junk here -- just mark inode as needing + * revalidation. + */ + fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; + cifs_fattr_to_inode(inode, &fattr); +cgfi_exit: + free_xid(xid); + return rc; +} + +int +cifs_get_inode_info(struct inode **inode, const char *full_path, + FILE_ALL_INFO *data, struct super_block *sb, int xid, + const struct cifs_fid *fid) +{ + bool validinum = false; + __u16 srchflgs; + int rc = 0, tmprc = ENOSYS; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct tcon_link *tlink; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + char *buf = NULL; + bool adjust_tz = false; + struct cifs_fattr fattr; + struct cifs_search_info *srchinf = NULL; + bool symlink = false; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + cifs_dbg(FYI, "Getting info on %s\n", full_path); + + if ((data == NULL) && (*inode != NULL)) { + if (CIFS_CACHE_READ(CIFS_I(*inode))) { + cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); + goto cgii_exit; + } + } + + /* if inode info is not passed, get it from server */ + if (data == NULL) { + if (!server->ops->query_path_info) { + rc = -ENOSYS; + goto cgii_exit; + } + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } + data = (FILE_ALL_INFO *)buf; + rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, + data, &adjust_tz, &symlink); + } + + if (!rc) { + cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz, + symlink); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, sb); + rc = 0; + } else if (rc == -EACCES && backup_cred(cifs_sb)) { + srchinf = kzalloc(sizeof(struct cifs_search_info), + GFP_KERNEL); + if (srchinf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } + + srchinf->endOfSearch = false; + srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + + srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | + CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_BACKUP_SEARCH; + + rc = CIFSFindFirst(xid, tcon, full_path, + cifs_sb, NULL, srchflgs, srchinf, false); + if (!rc) { + data = + (FILE_ALL_INFO *)srchinf->srch_entries_start; + + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)data, cifs_sb); + fattr.cf_uniqueid = le64_to_cpu( + ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); + validinum = true; + + cifs_buf_release(srchinf->ntwrk_buf_start); + } + kfree(srchinf); + if (rc) + goto cgii_exit; + } else + goto cgii_exit; + + /* + * If an inode wasn't passed in, then get the inode number + * + * Is an i_ino of zero legal? Can we use that to check if the server + * supports returning inode numbers? Are there other sanity checks we + * can use to ensure that the server is really filling in that field? + */ + if (*inode == NULL) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + if (validinum == false) { + if (server->ops->get_srv_inum) + tmprc = server->ops->get_srv_inum(xid, + tcon, cifs_sb, full_path, + &fattr.cf_uniqueid, data); + if (tmprc) { + cifs_dbg(FYI, "GetSrvInodeNum rc %d\n", + tmprc); + fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } + } + } else + fattr.cf_uniqueid = iunique(sb, ROOT_I); + } else + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + + /* query for SFU type info if supported and needed */ + if (fattr.cf_cifsattrs & ATTR_SYSTEM && + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); + if (tmprc) + cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc); + } + +#ifdef CONFIG_CIFS_ACL + /* fill in 0777 bits from ACL */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid); + if (rc) { + cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n", + __func__, rc); + goto cgii_exit; + } + } +#endif /* CONFIG_CIFS_ACL */ + + /* fill in remaining high mode bits e.g. SUID, VTX */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + cifs_sfu_mode(&fattr, full_path, cifs_sb, xid); + + /* check for Minshall+French symlinks */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, + full_path); + if (tmprc) + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); + } + + if (!*inode) { + *inode = cifs_iget(sb, &fattr); + if (!*inode) + rc = -ENOMEM; + } else { + /* we already have inode, update it */ + + /* if filetype is different, return error */ + if (unlikely(((*inode)->i_mode & S_IFMT) != + (fattr.cf_mode & S_IFMT))) { + rc = -ESTALE; + goto cgii_exit; + } + + cifs_fattr_to_inode(*inode, &fattr); + } + +cgii_exit: + kfree(buf); + cifs_put_tlink(tlink); + return rc; +} + +static const struct inode_operations cifs_ipc_inode_ops = { + .lookup = cifs_lookup, +}; + +static int +cifs_find_inode(struct inode *inode, void *opaque) +{ + struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + + /* don't match inode with different uniqueid */ + if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) + return 0; + + /* use createtime like an i_generation field */ + if (CIFS_I(inode)->createtime != fattr->cf_createtime) + return 0; + + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + return 0; + + /* if it's not a directory or has no dentries, then flag it */ + if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) + fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; + + return 1; +} + +static int +cifs_init_inode(struct inode *inode, void *opaque) +{ + struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + + CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; + CIFS_I(inode)->createtime = fattr->cf_createtime; + return 0; +} + +/* + * walk dentry list for an inode and report whether it has aliases that + * are hashed. We use this to determine if a directory inode can actually + * be used. + */ +static bool +inode_has_hashed_dentries(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + if (!d_unhashed(dentry) || IS_ROOT(dentry)) { + spin_unlock(&inode->i_lock); + return true; + } + } + spin_unlock(&inode->i_lock); + return false; +} + +/* Given fattrs, get a corresponding inode */ +struct inode * +cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) +{ + unsigned long hash; + struct inode *inode; + +retry_iget5_locked: + cifs_dbg(FYI, "looking for uniqueid=%llu\n", fattr->cf_uniqueid); + + /* hash down to 32-bits on 32-bit arch */ + hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); + + inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); + if (inode) { + /* was there a potentially problematic inode collision? */ + if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { + fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; + + if (inode_has_hashed_dentries(inode)) { + cifs_autodisable_serverino(CIFS_SB(sb)); + iput(inode); + fattr->cf_uniqueid = iunique(sb, ROOT_I); + goto retry_iget5_locked; + } + } + + cifs_fattr_to_inode(inode, fattr); + if (sb->s_flags & MS_NOATIME) + inode->i_flags |= S_NOATIME | S_NOCMTIME; + if (inode->i_state & I_NEW) { + inode->i_ino = hash; +#ifdef CONFIG_CIFS_FSCACHE + /* initialize per-inode cache cookie pointer */ + CIFS_I(inode)->fscache = NULL; +#endif + unlock_new_inode(inode); + } + } + + return inode; +} + +/* gets root inode */ +struct inode *cifs_root_iget(struct super_block *sb) +{ + unsigned int xid; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct inode *inode = NULL; + long rc; + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + xid = get_xid(); + if (tcon->unix_ext) { + rc = cifs_get_inode_info_unix(&inode, "", sb, xid); + /* some servers mistakenly claim POSIX support */ + if (rc != -EOPNOTSUPP) + goto iget_no_retry; + cifs_dbg(VFS, "server does not support POSIX extensions"); + tcon->unix_ext = false; + } + + rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); + +iget_no_retry: + if (!inode) { + inode = ERR_PTR(rc); + goto out; + } + +#ifdef CONFIG_CIFS_FSCACHE + /* populate tcon->resource_id */ + tcon->resource_id = CIFS_I(inode)->uniqueid; +#endif + + if (rc && tcon->ipc) { + cifs_dbg(FYI, "ipc connection - fake read inode\n"); + spin_lock(&inode->i_lock); + inode->i_mode |= S_IFDIR; + set_nlink(inode, 2); + inode->i_op = &cifs_ipc_inode_ops; + inode->i_fop = &simple_dir_operations; + inode->i_uid = cifs_sb->mnt_uid; + inode->i_gid = cifs_sb->mnt_gid; + spin_unlock(&inode->i_lock); + } else if (rc) { + iget_failed(inode); + inode = ERR_PTR(rc); + } + +out: + /* can not call macro free_xid here since in a void func + * TODO: This is no longer true + */ + _free_xid(xid); + return inode; +} + +int +cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, + char *full_path, __u32 dosattr) +{ + bool set_time = false; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct TCP_Server_Info *server; + FILE_BASIC_INFO info_buf; + + if (attrs == NULL) + return -EINVAL; + + server = cifs_sb_master_tcon(cifs_sb)->ses->server; + if (!server->ops->set_file_info) + return -ENOSYS; + + if (attrs->ia_valid & ATTR_ATIME) { + set_time = true; + info_buf.LastAccessTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); + } else + info_buf.LastAccessTime = 0; + + if (attrs->ia_valid & ATTR_MTIME) { + set_time = true; + info_buf.LastWriteTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); + } else + info_buf.LastWriteTime = 0; + + /* + * Samba throws this field away, but windows may actually use it. + * Do not set ctime unless other time stamps are changed explicitly + * (i.e. by utimes()) since we would then have a mix of client and + * server times. + */ + if (set_time && (attrs->ia_valid & ATTR_CTIME)) { + cifs_dbg(FYI, "CIFS - CTIME changed\n"); + info_buf.ChangeTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); + } else + info_buf.ChangeTime = 0; + + info_buf.CreationTime = 0; /* don't change */ + info_buf.Attributes = cpu_to_le32(dosattr); + + return server->ops->set_file_info(inode, full_path, &info_buf, xid); +} + +/* + * Open the given file (if it isn't already), set the DELETE_ON_CLOSE bit + * and rename it to a random name that hopefully won't conflict with + * anything else. + */ +int +cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, + const unsigned int xid) +{ + int oplock = 0; + int rc; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct inode *inode = d_inode(dentry); + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; + __u32 dosattr, origattr; + FILE_BASIC_INFO *info_buf = NULL; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + /* + * We cannot rename the file if the server doesn't support + * CAP_INFOLEVEL_PASSTHRU + */ + if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) { + rc = -EBUSY; + goto out; + } + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc != 0) + goto out; + + origattr = cifsInode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + + dosattr = origattr & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + /* set ATTR_HIDDEN and clear ATTR_READONLY, but only if needed */ + if (dosattr != origattr) { + info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL); + if (info_buf == NULL) { + rc = -ENOMEM; + goto out_close; + } + info_buf->Attributes = cpu_to_le32(dosattr); + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid, + current->tgid); + /* although we would like to mark the file hidden + if that fails we will still try to rename it */ + if (!rc) + cifsInode->cifsAttrs = dosattr; + else + dosattr = origattr; /* since not able to change them */ + } + + /* rename the file */ + rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc != 0) { + rc = -EBUSY; + goto undo_setattr; + } + + /* try to set DELETE_ON_CLOSE */ + if (!test_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags)) { + rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid, + current->tgid); + /* + * some samba versions return -ENOENT when we try to set the + * file disposition here. Likely a samba bug, but work around + * it for now. This means that some cifsXXX files may hang + * around after they shouldn't. + * + * BB: remove this hack after more servers have the fix + */ + if (rc == -ENOENT) + rc = 0; + else if (rc != 0) { + rc = -EBUSY; + goto undo_rename; + } + set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags); + } + +out_close: + CIFSSMBClose(xid, tcon, fid.netfid); +out: + kfree(info_buf); + cifs_put_tlink(tlink); + return rc; + + /* + * reset everything back to the original state. Don't bother + * dealing with errors here since we can't do anything about + * them anyway. + */ +undo_rename: + CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name, + cifs_sb->local_nls, cifs_remap(cifs_sb)); +undo_setattr: + if (dosattr != origattr) { + info_buf->Attributes = cpu_to_le32(origattr); + if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid, + current->tgid)) + cifsInode->cifsAttrs = origattr; + } + + goto out_close; +} + +/* copied from fs/nfs/dir.c with small changes */ +static void +cifs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&inode->i_lock); +} + +/* + * If d_inode(dentry) is null (usually meaning the cached dentry + * is a negative dentry) then we would attempt a standard SMB delete, but + * if that fails we can not attempt the fall back mechanisms on EACCESS + * but will return the EACCESS to the caller. Note that the VFS does not call + * unlink on negative dentries currently. + */ +int cifs_unlink(struct inode *dir, struct dentry *dentry) +{ + int rc = 0; + unsigned int xid; + char *full_path = NULL; + struct inode *inode = d_inode(dentry); + struct cifsInodeInfo *cifs_inode; + struct super_block *sb = dir->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct iattr *attrs = NULL; + __u32 dosattr = 0, origattr = 0; + + cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + xid = get_xid(); + + /* Unlink can be called from rename so we can not take the + * sb->s_vfs_rename_mutex here */ + full_path = build_path_from_dentry(dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto unlink_out; + } + + if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = CIFSPOSIXDelFile(xid, tcon, full_path, + SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + cifs_dbg(FYI, "posix del rc %d\n", rc); + if ((rc == 0) || (rc == -ENOENT)) + goto psx_del_no_retry; + } + +retry_std_delete: + if (!server->ops->unlink) { + rc = -ENOSYS; + goto psx_del_no_retry; + } + + rc = server->ops->unlink(xid, tcon, full_path, cifs_sb); + +psx_del_no_retry: + if (!rc) { + if (inode) + cifs_drop_nlink(inode); + } else if (rc == -ENOENT) { + d_drop(dentry); + } else if (rc == -EBUSY) { + if (server->ops->rename_pending_delete) { + rc = server->ops->rename_pending_delete(full_path, + dentry, xid); + if (rc == 0) + cifs_drop_nlink(inode); + } + } else if ((rc == -EACCES) && (dosattr == 0) && inode) { + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (attrs == NULL) { + rc = -ENOMEM; + goto out_reval; + } + + /* try to reset dos attributes */ + cifs_inode = CIFS_I(inode); + origattr = cifs_inode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + dosattr = origattr & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + if (rc != 0) + goto out_reval; + + goto retry_std_delete; + } + + /* undo the setattr if we errored out and it's needed */ + if (rc != 0 && dosattr != 0) + cifs_set_file_info(inode, attrs, xid, full_path, origattr); + +out_reval: + if (inode) { + cifs_inode = CIFS_I(inode); + cifs_inode->time = 0; /* will force revalidate to get info + when needed */ + inode->i_ctime = current_fs_time(sb); + } + dir->i_ctime = dir->i_mtime = current_fs_time(sb); + cifs_inode = CIFS_I(dir); + CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ +unlink_out: + kfree(full_path); + kfree(attrs); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +} + +static int +cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, + const char *full_path, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, const unsigned int xid) +{ + int rc = 0; + struct inode *inode = NULL; + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb, + xid); + else + rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb, + xid, NULL); + + if (rc) + return rc; + + /* + * setting nlink not necessary except in cases where we failed to get it + * from the server or was set bogus. Also, since this is a brand new + * inode, no need to grab the i_lock before setting the i_nlink. + */ + if (inode->i_nlink < 2) + set_nlink(inode, 2); + mode &= ~current_umask(); + /* must turn on setgid bit if parent dir has it */ + if (parent->i_mode & S_ISGID) + mode |= S_ISGID; + + if (tcon->unix_ext) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = current_fsuid(); + if (parent->i_mode & S_ISGID) + args.gid = parent->i_gid; + else + args.gid = current_fsgid(); + } else { + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ + } + CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + } else { + struct TCP_Server_Info *server = tcon->ses->server; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && + (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo) + server->ops->mkdir_setinfo(inode, full_path, cifs_sb, + tcon, xid); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + inode->i_mode = (mode | S_IFDIR); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + inode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + inode->i_gid = parent->i_gid; + else + inode->i_gid = current_fsgid(); + } + } + d_instantiate(dentry, inode); + return rc; +} + +static int +cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, + const char *full_path, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, const unsigned int xid) +{ + int rc = 0; + u32 oplock = 0; + FILE_UNIX_BASIC_INFO *info = NULL; + struct inode *newinode = NULL; + struct cifs_fattr fattr; + + info = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (info == NULL) { + rc = -ENOMEM; + goto posix_mkdir_out; + } + + mode &= ~current_umask(); + rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, + NULL /* netfid */, info, &oplock, full_path, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (rc == -EOPNOTSUPP) + goto posix_mkdir_out; + else if (rc) { + cifs_dbg(FYI, "posix mkdir returned 0x%x\n", rc); + d_drop(dentry); + goto posix_mkdir_out; + } + + if (info->Type == cpu_to_le32(-1)) + /* no return info, go query for it */ + goto posix_mkdir_get_info; + /* + * BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if + * need to set uid/gid. + */ + + cifs_unix_basic_to_fattr(&fattr, info, cifs_sb); + cifs_fill_uniqueid(inode->i_sb, &fattr); + newinode = cifs_iget(inode->i_sb, &fattr); + if (!newinode) + goto posix_mkdir_get_info; + + d_instantiate(dentry, newinode); + +#ifdef CONFIG_CIFS_DEBUG2 + cifs_dbg(FYI, "instantiated dentry %p %pd to inode %p\n", + dentry, dentry, newinode); + + if (newinode->i_nlink != 2) + cifs_dbg(FYI, "unexpected number of links %d\n", + newinode->i_nlink); +#endif + +posix_mkdir_out: + kfree(info); + return rc; +posix_mkdir_get_info: + rc = cifs_mkdir_qinfo(inode, dentry, mode, full_path, cifs_sb, tcon, + xid); + goto posix_mkdir_out; +} + +int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) +{ + int rc = 0; + unsigned int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + char *full_path; + + cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n", + mode, inode); + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + xid = get_xid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto mkdir_out; + } + + if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb, + tcon, xid); + if (rc != -EOPNOTSUPP) + goto mkdir_out; + } + + server = tcon->ses->server; + + if (!server->ops->mkdir) { + rc = -ENOSYS; + goto mkdir_out; + } + + /* BB add setting the equivalent of mode via CreateX w/ACLs */ + rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); + if (rc) { + cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc); + d_drop(direntry); + goto mkdir_out; + } + + rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon, + xid); +mkdir_out: + /* + * Force revalidate to get parent dir info when needed since cached + * attributes are invalid now. + */ + CIFS_I(inode)->time = 0; + kfree(full_path); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +} + +int cifs_rmdir(struct inode *inode, struct dentry *direntry) +{ + int rc = 0; + unsigned int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + char *full_path = NULL; + struct cifsInodeInfo *cifsInode; + + cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode); + + xid = get_xid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto rmdir_exit; + } + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto rmdir_exit; + } + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + if (!server->ops->rmdir) { + rc = -ENOSYS; + cifs_put_tlink(tlink); + goto rmdir_exit; + } + + rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb); + cifs_put_tlink(tlink); + + if (!rc) { + spin_lock(&d_inode(direntry)->i_lock); + i_size_write(d_inode(direntry), 0); + clear_nlink(d_inode(direntry)); + spin_unlock(&d_inode(direntry)->i_lock); + } + + cifsInode = CIFS_I(d_inode(direntry)); + /* force revalidate to go get info when needed */ + cifsInode->time = 0; + + cifsInode = CIFS_I(inode); + /* + * Force revalidate to get parent dir info when needed since cached + * attributes are invalid now. + */ + cifsInode->time = 0; + + d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + +rmdir_exit: + kfree(full_path); + free_xid(xid); + return rc; +} + +static int +cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, + const char *from_path, struct dentry *to_dentry, + const char *to_path) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct cifs_fid fid; + struct cifs_open_parms oparms; + int oplock, rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + if (!server->ops->rename) + return -ENOSYS; + + /* try path-based rename first */ + rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb); + + /* + * Don't bother with rename by filehandle unless file is busy and + * source. Note that cross directory moves do not work with + * rename by filehandle to various Windows servers. + */ + if (rc == 0 || rc != -EBUSY) + goto do_rename_exit; + + /* open-file renames don't work across directories */ + if (to_dentry->d_parent != from_dentry->d_parent) + goto do_rename_exit; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + /* open the file to be renamed -- we need DELETE perms */ + oparms.desired_access = DELETE; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = from_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc == 0) { + rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, + (const char *) to_dentry->d_name.name, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + CIFSSMBClose(xid, tcon, fid.netfid); + } +do_rename_exit: + cifs_put_tlink(tlink); + return rc; +} + +int +cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, + struct inode *target_dir, struct dentry *target_dentry, + unsigned int flags) +{ + char *from_name = NULL; + char *to_name = NULL; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + FILE_UNIX_BASIC_INFO *info_buf_source = NULL; + FILE_UNIX_BASIC_INFO *info_buf_target; + unsigned int xid; + int rc, tmprc; + + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + cifs_sb = CIFS_SB(source_dir->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + xid = get_xid(); + + /* + * we already have the rename sem so we do not need to + * grab it again here to protect the path integrity + */ + from_name = build_path_from_dentry(source_dentry); + if (from_name == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + to_name = build_path_from_dentry(target_dentry); + if (to_name == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, + to_name); + + /* + * No-replace is the natural behavior for CIFS, so skip unlink hacks. + */ + if (flags & RENAME_NOREPLACE) + goto cifs_rename_exit; + + if (rc == -EEXIST && tcon->unix_ext) { + /* + * Are src and dst hardlinks of same inode? We can only tell + * with unix extensions enabled. + */ + info_buf_source = + kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), + GFP_KERNEL); + if (info_buf_source == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + info_buf_target = info_buf_source + 1; + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name, + info_buf_source, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (tmprc != 0) + goto unlink_target; + + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name, + info_buf_target, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + + if (tmprc == 0 && (info_buf_source->UniqueId == + info_buf_target->UniqueId)) { + /* same file, POSIX says that this is a noop */ + rc = 0; + goto cifs_rename_exit; + } + } + /* + * else ... BB we could add the same check for Windows by + * checking the UniqueId via FILE_INTERNAL_INFO + */ + +unlink_target: + /* Try unlinking the target dentry if it's not negative */ + if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { + if (d_is_dir(target_dentry)) + tmprc = cifs_rmdir(target_dir, target_dentry); + else + tmprc = cifs_unlink(target_dir, target_dentry); + if (tmprc) + goto cifs_rename_exit; + rc = cifs_do_rename(xid, source_dentry, from_name, + target_dentry, to_name); + } + + /* force revalidate to go get info when needed */ + CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; + + source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = + target_dir->i_mtime = current_fs_time(source_dir->i_sb); + +cifs_rename_exit: + kfree(info_buf_source); + kfree(from_name); + kfree(to_name); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +} + +static bool +cifs_inode_needs_reval(struct inode *inode) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + if (CIFS_CACHE_READ(cifs_i)) + return false; + + if (!lookupCacheEnabled) + return true; + + if (cifs_i->time == 0) + return true; + + if (!cifs_sb->actimeo) + return true; + + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->actimeo)) + return true; + + /* hardlinked files w/ noserverino get "special" treatment */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + S_ISREG(inode->i_mode) && inode->i_nlink != 1) + return true; + + return false; +} + +/* + * Zap the cache. Called when invalid_mapping flag is set. + */ +int +cifs_invalidate_mapping(struct inode *inode) +{ + int rc = 0; + + if (inode->i_mapping && inode->i_mapping->nrpages != 0) { + rc = invalidate_inode_pages2(inode->i_mapping); + if (rc) + cifs_dbg(VFS, "%s: could not invalidate inode %p\n", + __func__, inode); + } + + cifs_fscache_reset_inode_cookie(inode); + return rc; +} + +/** + * cifs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +static int +cifs_wait_bit_killable(struct wait_bit_key *key) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + freezable_schedule_unsafe(); + return 0; +} + +int +cifs_revalidate_mapping(struct inode *inode) +{ + int rc; + unsigned long *flags = &CIFS_I(inode)->flags; + + rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, + TASK_KILLABLE); + if (rc) + return rc; + + if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) { + rc = cifs_invalidate_mapping(inode); + if (rc) + set_bit(CIFS_INO_INVALID_MAPPING, flags); + } + + clear_bit_unlock(CIFS_INO_LOCK, flags); + smp_mb__after_atomic(); + wake_up_bit(flags, CIFS_INO_LOCK); + + return rc; +} + +int +cifs_zap_mapping(struct inode *inode) +{ + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags); + return cifs_revalidate_mapping(inode); +} + +int cifs_revalidate_file_attr(struct file *filp) +{ + int rc = 0; + struct inode *inode = file_inode(filp); + struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + + if (!cifs_inode_needs_reval(inode)) + return rc; + + if (tlink_tcon(cfile->tlink)->unix_ext) + rc = cifs_get_file_info_unix(filp); + else + rc = cifs_get_file_info(filp); + + return rc; +} + +int cifs_revalidate_dentry_attr(struct dentry *dentry) +{ + unsigned int xid; + int rc = 0; + struct inode *inode = d_inode(dentry); + struct super_block *sb = dentry->d_sb; + char *full_path = NULL; + + if (inode == NULL) + return -ENOENT; + + if (!cifs_inode_needs_reval(inode)) + return rc; + + xid = get_xid(); + + /* can not safely grab the rename sem here if rename calls revalidate + since that would deadlock */ + full_path = build_path_from_dentry(dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", + full_path, inode, inode->i_count.counter, + dentry, dentry->d_time, jiffies); + + if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); + else + rc = cifs_get_inode_info(&inode, full_path, NULL, sb, + xid, NULL); + +out: + kfree(full_path); + free_xid(xid); + return rc; +} + +int cifs_revalidate_file(struct file *filp) +{ + int rc; + struct inode *inode = file_inode(filp); + + rc = cifs_revalidate_file_attr(filp); + if (rc) + return rc; + + return cifs_revalidate_mapping(inode); +} + +/* revalidate a dentry's inode attributes */ +int cifs_revalidate_dentry(struct dentry *dentry) +{ + int rc; + struct inode *inode = d_inode(dentry); + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + return cifs_revalidate_mapping(inode); +} + +int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct inode *inode = d_inode(dentry); + int rc; + + /* + * We need to be sure that all dirty pages are written and the server + * has actual ctime, mtime and file length. + */ + if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping && + inode->i_mapping->nrpages != 0) { + rc = filemap_fdatawait(inode->i_mapping); + if (rc) { + mapping_set_error(inode->i_mapping, rc); + return rc; + } + } + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + generic_fillattr(inode, stat); + stat->blksize = CIFS_MAX_MSGSIZE; + stat->ino = CIFS_I(inode)->uniqueid; + + /* + * If on a multiuser mount without unix extensions or cifsacl being + * enabled, and the admin hasn't overridden them, set the ownership + * to the fsuid/fsgid of the current process. + */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && + !tcon->unix_ext) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) + stat->uid = current_fsuid(); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) + stat->gid = current_fsgid(); + } + return rc; +} + +static int cifs_truncate_page(struct address_space *mapping, loff_t from) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE - 1); + struct page *page; + int rc = 0; + + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + unlock_page(page); + page_cache_release(page); + return rc; +} + +static void cifs_setsize(struct inode *inode, loff_t offset) +{ + spin_lock(&inode->i_lock); + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, offset); +} + +static int +cifs_set_file_size(struct inode *inode, struct iattr *attrs, + unsigned int xid, char *full_path) +{ + int rc; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *tcon = NULL; + struct TCP_Server_Info *server; + struct cifs_io_parms io_parms; + + /* + * To avoid spurious oplock breaks from server, in the case of + * inodes that we already have open, avoid doing path based + * setting of file size if we can do it by handle. + * This keeps our caching token (oplock) and avoids timeouts + * when the local oplock break takes longer to flush + * writebehind data than the SMB timeout for the SetPathInfo + * request would allow + */ + open_file = find_writable_file(cifsInode, true); + if (open_file) { + tcon = tlink_tcon(open_file->tlink); + server = tcon->ses->server; + if (server->ops->set_file_size) + rc = server->ops->set_file_size(xid, tcon, open_file, + attrs->ia_size, false); + else + rc = -ENOSYS; + cifsFileInfo_put(open_file); + cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + unsigned int bytes_written; + + io_parms.netfid = open_file->fid.netfid; + io_parms.pid = open_file->pid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = attrs->ia_size; + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, + NULL, NULL, 1); + cifs_dbg(FYI, "Wrt seteof rc %d\n", rc); + } + } else + rc = -EINVAL; + + if (!rc) + goto set_size_out; + + if (tcon == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + } + + /* + * Set file size by pathname rather than by handle either because no + * valid, writeable file handle for it was found or because there was + * an error setting it by handle. + */ + if (server->ops->set_path_size) + rc = server->ops->set_path_size(xid, tcon, full_path, + attrs->ia_size, cifs_sb, false); + else + rc = -ENOSYS; + cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + __u16 netfid; + int oplock = 0; + + rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN, + GENERIC_WRITE, CREATE_NOT_DIR, &netfid, + &oplock, NULL, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc == 0) { + unsigned int bytes_written; + + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = attrs->ia_size; + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL, + NULL, 1); + cifs_dbg(FYI, "wrt seteof rc %d\n", rc); + CIFSSMBClose(xid, tcon, netfid); + } + } + if (tlink) + cifs_put_tlink(tlink); + +set_size_out: + if (rc == 0) { + cifsInode->server_eof = attrs->ia_size; + cifs_setsize(inode, attrs->ia_size); + cifs_truncate_page(inode->i_mapping, inode->i_size); + } + + return rc; +} + +static int +cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) +{ + int rc; + unsigned int xid; + char *full_path = NULL; + struct inode *inode = d_inode(direntry); + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifs_unix_set_info_args *args = NULL; + struct cifsFileInfo *open_file; + + cifs_dbg(FYI, "setattr_unix on file %pd attrs->ia_valid=0x%x\n", + direntry, attrs->ia_valid); + + xid = get_xid(); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + attrs->ia_valid |= ATTR_FORCE; + + rc = inode_change_ok(inode, attrs); + if (rc < 0) + goto out; + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + rc = 0; + + if (attrs->ia_valid & ATTR_SIZE) { + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) + goto out; + } + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attrs->ia_valid &= ~ATTR_MODE; + + args = kmalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) { + rc = -ENOMEM; + goto out; + } + + /* set up the struct */ + if (attrs->ia_valid & ATTR_MODE) + args->mode = attrs->ia_mode; + else + args->mode = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_UID) + args->uid = attrs->ia_uid; + else + args->uid = INVALID_UID; /* no change */ + + if (attrs->ia_valid & ATTR_GID) + args->gid = attrs->ia_gid; + else + args->gid = INVALID_GID; /* no change */ + + if (attrs->ia_valid & ATTR_ATIME) + args->atime = cifs_UnixTimeToNT(attrs->ia_atime); + else + args->atime = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_MTIME) + args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime); + else + args->mtime = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_CTIME) + args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime); + else + args->ctime = NO_CHANGE_64; + + args->device = 0; + open_file = find_writable_file(cifsInode, true); + if (open_file) { + u16 nfid = open_file->fid.netfid; + u32 npid = open_file->pid; + pTcon = tlink_tcon(open_file->tlink); + rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); + cifsFileInfo_put(open_file); + } else { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto out; + } + pTcon = tlink_tcon(tlink); + rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + cifs_put_tlink(tlink); + } + + if (rc) + goto out; + + if ((attrs->ia_valid & ATTR_SIZE) && + attrs->ia_size != i_size_read(inode)) + truncate_setsize(inode, attrs->ia_size); + + setattr_copy(inode, attrs); + mark_inode_dirty(inode); + + /* force revalidate when any of these times are set since some + of the fs types (eg ext3, fat) do not have fine enough + time granularity to match protocol, and we do not have a + a way (yet) to query the server fs's time granularity (and + whether it rounds times down). + */ + if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + cifsInode->time = 0; +out: + kfree(args); + kfree(full_path); + free_xid(xid); + return rc; +} + +static int +cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) +{ + unsigned int xid; + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; + struct inode *inode = d_inode(direntry); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + char *full_path = NULL; + int rc = -EACCES; + __u32 dosattr = 0; + __u64 mode = NO_CHANGE_64; + + xid = get_xid(); + + cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n", + direntry, attrs->ia_valid); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + attrs->ia_valid |= ATTR_FORCE; + + rc = inode_change_ok(inode, attrs); + if (rc < 0) { + free_xid(xid); + return rc; + } + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + free_xid(xid); + return rc; + } + + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + rc = 0; + + if (attrs->ia_valid & ATTR_SIZE) { + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) + goto cifs_setattr_exit; + } + + if (attrs->ia_valid & ATTR_UID) + uid = attrs->ia_uid; + + if (attrs->ia_valid & ATTR_GID) + gid = attrs->ia_gid; + +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + if (uid_valid(uid) || gid_valid(gid)) { + rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, + uid, gid); + if (rc) { + cifs_dbg(FYI, "%s: Setting id failed with error: %d\n", + __func__, rc); + goto cifs_setattr_exit; + } + } + } else +#endif /* CONFIG_CIFS_ACL */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) + attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attrs->ia_valid &= ~ATTR_MODE; + + if (attrs->ia_valid & ATTR_MODE) { + mode = attrs->ia_mode; + rc = 0; +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = id_mode_to_cifs_acl(inode, full_path, mode, + INVALID_UID, INVALID_GID); + if (rc) { + cifs_dbg(FYI, "%s: Setting ACL failed with error: %d\n", + __func__, rc); + goto cifs_setattr_exit; + } + } else +#endif /* CONFIG_CIFS_ACL */ + if (((mode & S_IWUGO) == 0) && + (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { + + dosattr = cifsInode->cifsAttrs | ATTR_READONLY; + + /* fix up mode if we're not using dynperm */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + attrs->ia_mode = inode->i_mode & ~S_IWUGO; + } else if ((mode & S_IWUGO) && + (cifsInode->cifsAttrs & ATTR_READONLY)) { + + dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY; + /* Attributes of 0 are ignored */ + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + + /* reset local inode permissions to normal */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + attrs->ia_mode &= ~(S_IALLUGO); + if (S_ISDIR(inode->i_mode)) + attrs->ia_mode |= + cifs_sb->mnt_dir_mode; + else + attrs->ia_mode |= + cifs_sb->mnt_file_mode; + } + } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + /* ignore mode change - ATTR_READONLY hasn't changed */ + attrs->ia_valid &= ~ATTR_MODE; + } + } + + if (attrs->ia_valid & (ATTR_MTIME|ATTR_ATIME|ATTR_CTIME) || + ((attrs->ia_valid & ATTR_MODE) && dosattr)) { + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + /* BB: check for rc = -EOPNOTSUPP and switch to legacy mode */ + + /* Even if error on time set, no sense failing the call if + the server would set the time to a reasonable value anyway, + and this check ensures that we are not being called from + sys_utimes in which case we ought to fail the call back to + the user when the server rejects the call */ + if ((rc) && (attrs->ia_valid & + (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE))) + rc = 0; + } + + /* do not need local check to inode_check_ok since the server does + that */ + if (rc) + goto cifs_setattr_exit; + + if ((attrs->ia_valid & ATTR_SIZE) && + attrs->ia_size != i_size_read(inode)) + truncate_setsize(inode, attrs->ia_size); + + setattr_copy(inode, attrs); + mark_inode_dirty(inode); + +cifs_setattr_exit: + kfree(full_path); + free_xid(xid); + return rc; +} + +int +cifs_setattr(struct dentry *direntry, struct iattr *attrs) +{ + struct inode *inode = d_inode(direntry); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); + + if (pTcon->unix_ext) + return cifs_setattr_unix(direntry, attrs); + + return cifs_setattr_nounix(direntry, attrs); + + /* BB: add cifs_setattr_legacy for really old servers */ +} + +#if 0 +void cifs_delete_inode(struct inode *inode) +{ + cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode); + /* may have to add back in if and when safe distributed caching of + directories added e.g. via FindNotify */ +} +#endif diff --git a/kernel/fs/cifs/ioctl.c b/kernel/fs/cifs/ioctl.c new file mode 100644 index 000000000..8b7898b76 --- /dev/null +++ b/kernel/fs/cifs/ioctl.c @@ -0,0 +1,217 @@ +/* + * fs/cifs/ioctl.c + * + * vfs operations that deal with io control + * + * Copyright (C) International Business Machines Corp., 2005,2013 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +#define CIFS_IOCTL_MAGIC 0xCF +#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) + +static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, + unsigned long srcfd, u64 off, u64 len, u64 destoff) +{ + int rc; + struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct inode *target_inode = file_inode(dst_file); + struct cifs_tcon *target_tcon; + struct fd src_file; + struct cifsFileInfo *smb_file_src; + struct inode *src_inode; + struct cifs_tcon *src_tcon; + + cifs_dbg(FYI, "ioctl clone range\n"); + /* the destination must be opened for writing */ + if (!(dst_file->f_mode & FMODE_WRITE)) { + cifs_dbg(FYI, "file target not open for write\n"); + return -EINVAL; + } + + /* check if target volume is readonly and take reference */ + rc = mnt_want_write_file(dst_file); + if (rc) { + cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc); + return rc; + } + + src_file = fdget(srcfd); + if (!src_file.file) { + rc = -EBADF; + goto out_drop_write; + } + + if ((!src_file.file->private_data) || (!dst_file->private_data)) { + rc = -EBADF; + cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); + goto out_fput; + } + + rc = -EXDEV; + smb_file_target = dst_file->private_data; + smb_file_src = src_file.file->private_data; + src_tcon = tlink_tcon(smb_file_src->tlink); + target_tcon = tlink_tcon(smb_file_target->tlink); + + /* check if source and target are on same tree connection */ + if (src_tcon != target_tcon) { + cifs_dbg(VFS, "file copy src and target on different volume\n"); + goto out_fput; + } + + src_inode = file_inode(src_file.file); + rc = -EINVAL; + if (S_ISDIR(src_inode->i_mode)) + goto out_fput; + + /* + * Note: cifs case is easier than btrfs since server responsible for + * checks for proper open modes and file type and if it wants + * server could even support copy of range where source = target + */ + lock_two_nondirectories(target_inode, src_inode); + + /* determine range to clone */ + rc = -EINVAL; + if (off + len > src_inode->i_size || off + len < off) + goto out_unlock; + if (len == 0) + len = src_inode->i_size - off; + + cifs_dbg(FYI, "about to flush pages\n"); + /* should we flush first and last page first */ + truncate_inode_pages_range(&target_inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len)-1); + + if (target_tcon->ses->server->ops->clone_range) + rc = target_tcon->ses->server->ops->clone_range(xid, + smb_file_src, smb_file_target, off, len, destoff); + + /* force revalidate of size and timestamps of target file now + that target is updated on the server */ + CIFS_I(target_inode)->time = 0; +out_unlock: + /* although unlocking in the reverse order from locking is not + strictly necessary here it is a little cleaner to be consistent */ + unlock_two_nondirectories(src_inode, target_inode); +out_fput: + fdput(src_file); +out_drop_write: + mnt_drop_write_file(dst_file); + return rc; +} + +long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) +{ + struct inode *inode = file_inode(filep); + int rc = -ENOTTY; /* strange error - but the precedent */ + unsigned int xid; + struct cifs_sb_info *cifs_sb; + struct cifsFileInfo *pSMBFile = filep->private_data; + struct cifs_tcon *tcon; + __u64 ExtAttrBits = 0; + __u64 caps; + + xid = get_xid(); + + cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg); + + cifs_sb = CIFS_SB(inode->i_sb); + + switch (command) { + case FS_IOC_GETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); +#ifdef CONFIG_CIFS_POSIX + if (CIFS_UNIX_EXTATTR_CAP & caps) { + __u64 ExtAttrMask = 0; + rc = CIFSGetExtAttr(xid, tcon, + pSMBFile->fid.netfid, + &ExtAttrBits, &ExtAttrMask); + if (rc == 0) + rc = put_user(ExtAttrBits & + FS_FL_USER_VISIBLE, + (int __user *)arg); + if (rc != EOPNOTSUPP) + break; + } +#endif /* CONFIG_CIFS_POSIX */ + rc = 0; + if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) { + /* add in the compressed bit */ + ExtAttrBits = FS_COMPR_FL; + rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE, + (int __user *)arg); + } + break; + case FS_IOC_SETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + + if (get_user(ExtAttrBits, (int __user *)arg)) { + rc = -EFAULT; + break; + } + + /* + * if (CIFS_UNIX_EXTATTR_CAP & caps) + * rc = CIFSSetExtAttr(xid, tcon, + * pSMBFile->fid.netfid, + * extAttrBits, + * &ExtAttrMask); + * if (rc != EOPNOTSUPP) + * break; + */ + + /* Currently only flag we can set is compressed flag */ + if ((ExtAttrBits & FS_COMPR_FL) == 0) + break; + + /* Try to set compress flag */ + if (tcon->ses->server->ops->set_compression) { + rc = tcon->ses->server->ops->set_compression( + xid, tcon, pSMBFile); + cifs_dbg(FYI, "set compress flag rc %d\n", rc); + } + break; + case CIFS_IOC_COPYCHUNK_FILE: + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0); + break; + default: + cifs_dbg(FYI, "unsupported ioctl\n"); + break; + } + + free_xid(xid); + return rc; +} diff --git a/kernel/fs/cifs/link.c b/kernel/fs/cifs/link.c new file mode 100644 index 000000000..e6c707cc6 --- /dev/null +++ b/kernel/fs/cifs/link.c @@ -0,0 +1,746 @@ +/* + * fs/cifs/link.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" +#ifdef CONFIG_CIFS_SMB2 +#include "smb2proto.h" +#endif + +/* + * M-F Symlink Functions - Begin + */ + +#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) +#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) +#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1)) +#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024) +#define CIFS_MF_SYMLINK_FILE_SIZE \ + (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN) + +#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n" +#define CIFS_MF_SYMLINK_MD5_FORMAT \ + "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n" +#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \ + md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \ + md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \ + md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\ + md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] + +static int +symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) +{ + int rc; + unsigned int size; + struct crypto_shash *md5; + struct sdesc *sdescmd5; + + md5 = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(md5)) { + rc = PTR_ERR(md5); + cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n", + __func__, rc); + return rc; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); + sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!sdescmd5) { + rc = -ENOMEM; + goto symlink_hash_err; + } + sdescmd5->shash.tfm = md5; + sdescmd5->shash.flags = 0x0; + + rc = crypto_shash_init(&sdescmd5->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__); + goto symlink_hash_err; + } + rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); + goto symlink_hash_err; + } + rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + if (rc) + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); + +symlink_hash_err: + crypto_free_shash(md5); + kfree(sdescmd5); + + return rc; +} + +static int +parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, + char **_link_str) +{ + int rc; + unsigned int link_len; + const char *md5_str1; + const char *link_str; + u8 md5_hash[16]; + char md5_str2[34]; + + if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) + return -EINVAL; + + md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET]; + link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET]; + + rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len); + if (rc != 1) + return -EINVAL; + + rc = symlink_hash(link_len, link_str, md5_hash); + if (rc) { + cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); + return rc; + } + + snprintf(md5_str2, sizeof(md5_str2), + CIFS_MF_SYMLINK_MD5_FORMAT, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + + if (strncmp(md5_str1, md5_str2, 17) != 0) + return -EINVAL; + + if (_link_str) { + *_link_str = kstrndup(link_str, link_len, GFP_KERNEL); + if (!*_link_str) + return -ENOMEM; + } + + *_link_len = link_len; + return 0; +} + +static int +format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) +{ + int rc; + unsigned int link_len; + unsigned int ofs; + u8 md5_hash[16]; + + if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) + return -EINVAL; + + link_len = strlen(link_str); + + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -ENAMETOOLONG; + + rc = symlink_hash(link_len, link_str, md5_hash); + if (rc) { + cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); + return rc; + } + + snprintf(buf, buf_len, + CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, + link_len, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + + ofs = CIFS_MF_SYMLINK_LINK_OFFSET; + memcpy(buf + ofs, link_str, link_len); + + ofs += link_len; + if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) { + buf[ofs] = '\n'; + ofs++; + } + + while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) { + buf[ofs] = ' '; + ofs++; + } + + return 0; +} + +bool +couldbe_mf_symlink(const struct cifs_fattr *fattr) +{ + if (!S_ISREG(fattr->cf_mode)) + /* it's not a symlink */ + return false; + + if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) + /* it's not a symlink */ + return false; + + return true; +} + +static int +create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *fromName, + const char *toName) +{ + int rc; + u8 *buf; + unsigned int bytes_written = 0; + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); + if (rc) + goto out; + + if (tcon->ses->server->ops->create_mf_symlink) + rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, + cifs_sb, fromName, buf, &bytes_written); + else + rc = -EOPNOTSUPP; + + if (rc) + goto out; + + if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE) + rc = -EIO; +out: + kfree(buf); + return rc; +} + +static int +query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char **symlinkinfo) +{ + int rc; + u8 *buf = NULL; + unsigned int link_len = 0; + unsigned int bytes_read = 0; + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); + else + rc = -ENOSYS; + + if (rc) + goto out; + + if (bytes_read == 0) { /* not a symlink */ + rc = -EINVAL; + goto out; + } + + rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo); +out: + kfree(buf); + return rc; +} + +int +check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + const unsigned char *path) +{ + int rc; + u8 *buf = NULL; + unsigned int link_len = 0; + unsigned int bytes_read = 0; + + if (!couldbe_mf_symlink(fattr)) + /* it's not a symlink */ + return 0; + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); + else + rc = -ENOSYS; + + if (rc) + goto out; + + if (bytes_read == 0) /* not a symlink */ + goto out; + + rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); + if (rc == -EINVAL) { + /* it's not a symlink */ + rc = 0; + goto out; + } + + if (rc != 0) + goto out; + + /* it is a symlink */ + fattr->cf_eof = link_len; + fattr->cf_mode &= ~S_IFMT; + fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; + fattr->cf_dtype = DT_LNK; +out: + kfree(buf); + return rc; +} + +/* + * SMB 1.0 Protocol specific functions + */ + +int +cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_read) +{ + int rc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + FILE_ALL_INFO file_info; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, &file_info); + if (rc) + return rc; + + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + rc = -ENOENT; + /* it's not a symlink */ + goto out; + } + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type); +out: + CIFSSMBClose(xid, tcon, fid.netfid); + return rc; +} + +int +cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_written) +{ + int rc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int create_options = CREATE_NOT_DIR; + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) + return rc; + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0); + CIFSSMBClose(xid, tcon, fid.netfid); + return rc; +} + +/* + * SMB 2.1/SMB3 Protocol specific functions + */ +#ifdef CONFIG_CIFS_SMB2 +int +smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_read) +{ + int rc; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_II; + struct smb2_file_all_info *pfile_info = NULL; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + if (backup_cred(cifs_sb)) + oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; + oparms.disposition = FILE_OPEN; + oparms.fid = &fid; + oparms.reconnect = false; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (utf16_path == NULL) + return -ENOMEM; + + pfile_info = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, + GFP_KERNEL); + + if (pfile_info == NULL) { + kfree(utf16_path); + return -ENOMEM; + } + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL); + if (rc) + goto qmf_out_open_fail; + + if (pfile_info->EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + /* it's not a symlink */ + rc = -ENOENT; /* Is there a better rc to return? */ + goto qmf_out; + } + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + io_parms.persistent_fid = fid.persistent_fid; + io_parms.volatile_fid = fid.volatile_fid; + rc = SMB2_read(xid, &io_parms, pbytes_read, &pbuf, &buf_type); +qmf_out: + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); +qmf_out_open_fail: + kfree(utf16_path); + kfree(pfile_info); + return rc; +} + +int +smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_written) +{ + int rc; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int create_options = CREATE_NOT_DIR; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE; + struct kvec iov[2]; + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, path); + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + if (rc) { + kfree(utf16_path); + return rc; + } + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + io_parms.persistent_fid = fid.persistent_fid; + io_parms.volatile_fid = fid.volatile_fid; + + /* iov[0] is reserved for smb header */ + iov[1].iov_base = pbuf; + iov[1].iov_len = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = SMB2_write(xid, &io_parms, pbytes_written, iov, 1); + + /* Make sure we wrote all of the symlink data */ + if ((rc == 0) && (*pbytes_written != CIFS_MF_SYMLINK_FILE_SIZE)) + rc = -EIO; + + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + + kfree(utf16_path); + return rc; +} +#endif /* CONFIG_CIFS_SMB2 */ + +/* + * M-F Symlink Functions - End + */ + +int +cifs_hardlink(struct dentry *old_file, struct inode *inode, + struct dentry *direntry) +{ + int rc = -EACCES; + unsigned int xid; + char *from_name = NULL; + char *to_name = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct cifsInodeInfo *cifsInode; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + xid = get_xid(); + + from_name = build_path_from_dentry(old_file); + to_name = build_path_from_dentry(direntry); + if ((from_name == NULL) || (to_name == NULL)) { + rc = -ENOMEM; + goto cifs_hl_exit; + } + + if (tcon->unix_ext) + rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + else { + server = tcon->ses->server; + if (!server->ops->create_hardlink) { + rc = -ENOSYS; + goto cifs_hl_exit; + } + rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, + cifs_sb); + if ((rc == -EIO) || (rc == -EINVAL)) + rc = -EOPNOTSUPP; + } + + d_drop(direntry); /* force new lookup from server of target */ + + /* + * if source file is cached (oplocked) revalidate will not go to server + * until the file is closed or oplock broken so update nlinks locally + */ + if (d_really_is_positive(old_file)) { + cifsInode = CIFS_I(d_inode(old_file)); + if (rc == 0) { + spin_lock(&d_inode(old_file)->i_lock); + inc_nlink(d_inode(old_file)); + spin_unlock(&d_inode(old_file)->i_lock); + + /* + * parent dir timestamps will update from srv within a + * second, would it really be worth it to set the parent + * dir cifs inode time to zero to force revalidate + * (faster) for it too? + */ + } + /* + * if not oplocked will force revalidate to get info on source + * file from srv. Note Samba server prior to 4.2 has bug - + * not updating src file ctime on hardlinks but Windows servers + * handle it properly + */ + cifsInode->time = 0; + + /* + * Will update parent dir timestamps from srv within a second. + * Would it really be worth it to set the parent dir (cifs + * inode) time field to zero to force revalidate on parent + * directory faster ie + * + * CIFS_I(inode)->time = 0; + */ + } + +cifs_hl_exit: + kfree(from_name); + kfree(to_name); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +} + +void * +cifs_follow_link(struct dentry *direntry, struct nameidata *nd) +{ + struct inode *inode = d_inode(direntry); + int rc = -ENOMEM; + unsigned int xid; + char *full_path = NULL; + char *target_path = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + + xid = get_xid(); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + full_path = build_path_from_dentry(direntry); + if (!full_path) + goto out; + + cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); + + rc = -EACCES; + /* + * First try Minshall+French Symlinks, if configured + * and fallback to UNIX Extensions Symlinks. + */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, + &target_path); + + if (rc != 0 && server->ops->query_symlink) + rc = server->ops->query_symlink(xid, tcon, full_path, + &target_path, cifs_sb); + + kfree(full_path); +out: + if (rc != 0) { + kfree(target_path); + target_path = ERR_PTR(rc); + } + + free_xid(xid); + if (tlink) + cifs_put_tlink(tlink); + nd_set_link(nd, target_path); + return NULL; +} + +int +cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) +{ + int rc = -EOPNOTSUPP; + unsigned int xid; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + char *full_path = NULL; + struct inode *newinode = NULL; + + xid = get_xid(); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto symlink_exit; + } + pTcon = tlink_tcon(tlink); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto symlink_exit; + } + + cifs_dbg(FYI, "Full path: %s\n", full_path); + cifs_dbg(FYI, "symname is %s\n", symname); + + /* BB what if DFS and this volume is on different share? BB */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); + else if (pTcon->unix_ext) + rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + /* else + rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, + cifs_sb_target->local_nls); */ + + if (rc == 0) { + if (pTcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else + rc = cifs_get_inode_info(&newinode, full_path, NULL, + inode->i_sb, xid, NULL); + + if (rc != 0) { + cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n", + rc); + } else { + d_instantiate(direntry, newinode); + } + } +symlink_exit: + kfree(full_path); + cifs_put_tlink(tlink); + free_xid(xid); + return rc; +} diff --git a/kernel/fs/cifs/misc.c b/kernel/fs/cifs/misc.c new file mode 100644 index 000000000..8442b8b8e --- /dev/null +++ b/kernel/fs/cifs/misc.c @@ -0,0 +1,641 @@ +/* + * fs/cifs/misc.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "smberr.h" +#include "nterr.h" +#include "cifs_unicode.h" +#ifdef CONFIG_CIFS_SMB2 +#include "smb2pdu.h" +#endif + +extern mempool_t *cifs_sm_req_poolp; +extern mempool_t *cifs_req_poolp; + +/* The xid serves as a useful identifier for each incoming vfs request, + in a similar way to the mid which is useful to track each sent smb, + and CurrentXid can also provide a running counter (although it + will eventually wrap past zero) of the total vfs operations handled + since the cifs fs was mounted */ + +unsigned int +_get_xid(void) +{ + unsigned int xid; + + spin_lock(&GlobalMid_Lock); + GlobalTotalActiveXid++; + + /* keep high water mark for number of simultaneous ops in filesystem */ + if (GlobalTotalActiveXid > GlobalMaxActiveXid) + GlobalMaxActiveXid = GlobalTotalActiveXid; + if (GlobalTotalActiveXid > 65000) + cifs_dbg(FYI, "warning: more than 65000 requests active\n"); + xid = GlobalCurrentXid++; + spin_unlock(&GlobalMid_Lock); + return xid; +} + +void +_free_xid(unsigned int xid) +{ + spin_lock(&GlobalMid_Lock); + /* if (GlobalTotalActiveXid == 0) + BUG(); */ + GlobalTotalActiveXid--; + spin_unlock(&GlobalMid_Lock); +} + +struct cifs_ses * +sesInfoAlloc(void) +{ + struct cifs_ses *ret_buf; + + ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL); + if (ret_buf) { + atomic_inc(&sesInfoAllocCount); + ret_buf->status = CifsNew; + ++ret_buf->ses_count; + INIT_LIST_HEAD(&ret_buf->smb_ses_list); + INIT_LIST_HEAD(&ret_buf->tcon_list); + mutex_init(&ret_buf->session_mutex); + } + return ret_buf; +} + +void +sesInfoFree(struct cifs_ses *buf_to_free) +{ + if (buf_to_free == NULL) { + cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n"); + return; + } + + atomic_dec(&sesInfoAllocCount); + kfree(buf_to_free->serverOS); + kfree(buf_to_free->serverDomain); + kfree(buf_to_free->serverNOS); + if (buf_to_free->password) { + memset(buf_to_free->password, 0, strlen(buf_to_free->password)); + kfree(buf_to_free->password); + } + kfree(buf_to_free->user_name); + kfree(buf_to_free->domainName); + kfree(buf_to_free->auth_key.response); + kfree(buf_to_free); +} + +struct cifs_tcon * +tconInfoAlloc(void) +{ + struct cifs_tcon *ret_buf; + ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + if (ret_buf) { + atomic_inc(&tconInfoAllocCount); + ret_buf->tidStatus = CifsNew; + ++ret_buf->tc_count; + INIT_LIST_HEAD(&ret_buf->openFileList); + INIT_LIST_HEAD(&ret_buf->tcon_list); +#ifdef CONFIG_CIFS_STATS + spin_lock_init(&ret_buf->stat_lock); +#endif + } + return ret_buf; +} + +void +tconInfoFree(struct cifs_tcon *buf_to_free) +{ + if (buf_to_free == NULL) { + cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n"); + return; + } + atomic_dec(&tconInfoAllocCount); + kfree(buf_to_free->nativeFileSystem); + if (buf_to_free->password) { + memset(buf_to_free->password, 0, strlen(buf_to_free->password)); + kfree(buf_to_free->password); + } + kfree(buf_to_free); +} + +struct smb_hdr * +cifs_buf_get(void) +{ + struct smb_hdr *ret_buf = NULL; + size_t buf_size = sizeof(struct smb_hdr); + +#ifdef CONFIG_CIFS_SMB2 + /* + * SMB2 header is bigger than CIFS one - no problems to clean some + * more bytes for CIFS. + */ + buf_size = sizeof(struct smb2_hdr); +#endif + /* + * We could use negotiated size instead of max_msgsize - + * but it may be more efficient to always alloc same size + * albeit slightly larger than necessary and maxbuffersize + * defaults to this and can not be bigger. + */ + ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS); + + /* clear the first few header bytes */ + /* for most paths, more is cleared in header_assemble */ + if (ret_buf) { + memset(ret_buf, 0, buf_size + 3); + atomic_inc(&bufAllocCount); +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&totBufAllocCount); +#endif /* CONFIG_CIFS_STATS2 */ + } + + return ret_buf; +} + +void +cifs_buf_release(void *buf_to_free) +{ + if (buf_to_free == NULL) { + /* cifs_dbg(FYI, "Null buffer passed to cifs_buf_release\n");*/ + return; + } + mempool_free(buf_to_free, cifs_req_poolp); + + atomic_dec(&bufAllocCount); + return; +} + +struct smb_hdr * +cifs_small_buf_get(void) +{ + struct smb_hdr *ret_buf = NULL; + +/* We could use negotiated size instead of max_msgsize - + but it may be more efficient to always alloc same size + albeit slightly larger than necessary and maxbuffersize + defaults to this and can not be bigger */ + ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); + if (ret_buf) { + /* No need to clear memory here, cleared in header assemble */ + /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ + atomic_inc(&smBufAllocCount); +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&totSmBufAllocCount); +#endif /* CONFIG_CIFS_STATS2 */ + + } + return ret_buf; +} + +void +cifs_small_buf_release(void *buf_to_free) +{ + + if (buf_to_free == NULL) { + cifs_dbg(FYI, "Null buffer passed to cifs_small_buf_release\n"); + return; + } + mempool_free(buf_to_free, cifs_sm_req_poolp); + + atomic_dec(&smBufAllocCount); + return; +} + +void +free_rsp_buf(int resp_buftype, void *rsp) +{ + if (resp_buftype == CIFS_SMALL_BUFFER) + cifs_small_buf_release(rsp); + else if (resp_buftype == CIFS_LARGE_BUFFER) + cifs_buf_release(rsp); +} + +/* NB: MID can not be set if treeCon not passed in, in that + case it is responsbility of caller to set the mid */ +void +header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , + const struct cifs_tcon *treeCon, int word_count + /* length of fixed section (word count) in two byte units */) +{ + char *temp = (char *) buffer; + + memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */ + + buffer->smb_buf_length = cpu_to_be32( + (2 * word_count) + sizeof(struct smb_hdr) - + 4 /* RFC 1001 length field does not count */ + + 2 /* for bcc field itself */) ; + + buffer->Protocol[0] = 0xFF; + buffer->Protocol[1] = 'S'; + buffer->Protocol[2] = 'M'; + buffer->Protocol[3] = 'B'; + buffer->Command = smb_command; + buffer->Flags = 0x00; /* case sensitive */ + buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES; + buffer->Pid = cpu_to_le16((__u16)current->tgid); + buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16)); + if (treeCon) { + buffer->Tid = treeCon->tid; + if (treeCon->ses) { + if (treeCon->ses->capabilities & CAP_UNICODE) + buffer->Flags2 |= SMBFLG2_UNICODE; + if (treeCon->ses->capabilities & CAP_STATUS32) + buffer->Flags2 |= SMBFLG2_ERR_STATUS; + + /* Uid is not converted */ + buffer->Uid = treeCon->ses->Suid; + buffer->Mid = get_next_mid(treeCon->ses->server); + } + if (treeCon->Flags & SMB_SHARE_IS_IN_DFS) + buffer->Flags2 |= SMBFLG2_DFS; + if (treeCon->nocase) + buffer->Flags |= SMBFLG_CASELESS; + if ((treeCon->ses) && (treeCon->ses->server)) + if (treeCon->ses->server->sign) + buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + } + +/* endian conversion of flags is now done just before sending */ + buffer->WordCount = (char) word_count; + return; +} + +static int +check_smb_hdr(struct smb_hdr *smb) +{ + /* does it have the right SMB "signature" ? */ + if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { + cifs_dbg(VFS, "Bad protocol string signature header 0x%x\n", + *(unsigned int *)smb->Protocol); + return 1; + } + + /* if it's a response then accept */ + if (smb->Flags & SMBFLG_RESPONSE) + return 0; + + /* only one valid case where server sends us request */ + if (smb->Command == SMB_COM_LOCKING_ANDX) + return 0; + + cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", + get_mid(smb)); + return 1; +} + +int +checkSMB(char *buf, unsigned int total_read) +{ + struct smb_hdr *smb = (struct smb_hdr *)buf; + __u32 rfclen = be32_to_cpu(smb->smb_buf_length); + __u32 clc_len; /* calculated length */ + cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n", + total_read, rfclen); + + /* is this frame too small to even get to a BCC? */ + if (total_read < 2 + sizeof(struct smb_hdr)) { + if ((total_read >= sizeof(struct smb_hdr) - 1) + && (smb->Status.CifsError != 0)) { + /* it's an error return */ + smb->WordCount = 0; + /* some error cases do not return wct and bcc */ + return 0; + } else if ((total_read == sizeof(struct smb_hdr) + 1) && + (smb->WordCount == 0)) { + char *tmp = (char *)smb; + /* Need to work around a bug in two servers here */ + /* First, check if the part of bcc they sent was zero */ + if (tmp[sizeof(struct smb_hdr)] == 0) { + /* some servers return only half of bcc + * on simple responses (wct, bcc both zero) + * in particular have seen this on + * ulogoffX and FindClose. This leaves + * one byte of bcc potentially unitialized + */ + /* zero rest of bcc */ + tmp[sizeof(struct smb_hdr)+1] = 0; + return 0; + } + cifs_dbg(VFS, "rcvd invalid byte count (bcc)\n"); + } else { + cifs_dbg(VFS, "Length less than smb header size\n"); + } + return -EIO; + } + + /* otherwise, there is enough to get to the BCC */ + if (check_smb_hdr(smb)) + return -EIO; + clc_len = smbCalcSize(smb); + + if (4 + rfclen != total_read) { + cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n", + rfclen); + return -EIO; + } + + if (4 + rfclen != clc_len) { + __u16 mid = get_mid(smb); + /* check if bcc wrapped around for large read responses */ + if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { + /* check if lengths match mod 64K */ + if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) + return 0; /* bcc wrapped */ + } + cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n", + clc_len, 4 + rfclen, mid); + + if (4 + rfclen < clc_len) { + cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n", + rfclen, mid); + return -EIO; + } else if (rfclen > clc_len + 512) { + /* + * Some servers (Windows XP in particular) send more + * data than the lengths in the SMB packet would + * indicate on certain calls (byte range locks and + * trans2 find first calls in particular). While the + * client can handle such a frame by ignoring the + * trailing data, we choose limit the amount of extra + * data to 512 bytes. + */ + cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n", + rfclen, mid); + return -EIO; + } + } + return 0; +} + +bool +is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) +{ + struct smb_hdr *buf = (struct smb_hdr *)buffer; + struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; + struct list_head *tmp, *tmp1, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cifsInodeInfo *pCifsInode; + struct cifsFileInfo *netfile; + + cifs_dbg(FYI, "Checking for oplock break or dnotify response\n"); + if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && + (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { + struct smb_com_transaction_change_notify_rsp *pSMBr = + (struct smb_com_transaction_change_notify_rsp *)buf; + struct file_notify_information *pnotify; + __u32 data_offset = 0; + if (get_bcc(buf) > sizeof(struct file_notify_information)) { + data_offset = le32_to_cpu(pSMBr->DataOffset); + + pnotify = (struct file_notify_information *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n", + pnotify->FileName, pnotify->Action); + /* cifs_dump_mem("Rcvd notify Data: ",buf, + sizeof(struct smb_hdr)+60); */ + return true; + } + if (pSMBr->hdr.Status.CifsError) { + cifs_dbg(FYI, "notify err 0x%x\n", + pSMBr->hdr.Status.CifsError); + return true; + } + return false; + } + if (pSMB->hdr.Command != SMB_COM_LOCKING_ANDX) + return false; + if (pSMB->hdr.Flags & SMBFLG_RESPONSE) { + /* no sense logging error on invalid handle on oplock + break - harmless race between close request and oplock + break response is expected from time to time writing out + large dirty files cached on the client */ + if ((NT_STATUS_INVALID_HANDLE) == + le32_to_cpu(pSMB->hdr.Status.CifsError)) { + cifs_dbg(FYI, "invalid handle on oplock break\n"); + return true; + } else if (ERRbadfid == + le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { + return true; + } else { + return false; /* on valid oplock brk we get "request" */ + } + } + if (pSMB->hdr.WordCount != 8) + return false; + + cifs_dbg(FYI, "oplock type 0x%x level 0x%x\n", + pSMB->LockType, pSMB->OplockLevel); + if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) + return false; + + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &srv->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + if (tcon->tid != buf->Tid) + continue; + + cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); + spin_lock(&cifs_file_list_lock); + list_for_each(tmp2, &tcon->openFileList) { + netfile = list_entry(tmp2, struct cifsFileInfo, + tlist); + if (pSMB->Fid != netfile->fid.netfid) + continue; + + cifs_dbg(FYI, "file id match, oplock break\n"); + pCifsInode = CIFS_I(d_inode(netfile->dentry)); + + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + + queue_work(cifsiod_wq, + &netfile->oplock_break); + netfile->oplock_break_cancelled = false; + + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + cifs_dbg(FYI, "No matching file for oplock break\n"); + return true; + } + } + spin_unlock(&cifs_tcp_ses_lock); + cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); + return true; +} + +void +dump_smb(void *buf, int smb_buf_length) +{ + if (traceSMB == 0) + return; + + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 8, 2, buf, + smb_buf_length, true); +} + +void +cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n", + cifs_sb_master_tcon(cifs_sb)->treeName); + } +} + +void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) +{ + oplock &= 0xF; + + if (oplock == OPLOCK_EXCLUSIVE) { + cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG; + cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else if (oplock == OPLOCK_READ) { + cinode->oplock = CIFS_CACHE_READ_FLG; + cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else + cinode->oplock = 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + +bool +backup_cred(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { + if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid())) + return true; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { + if (in_group_p(cifs_sb->mnt_backupgid)) + return true; + } + + return false; +} + +void +cifs_del_pending_open(struct cifs_pending_open *open) +{ + spin_lock(&cifs_file_list_lock); + list_del(&open->olist); + spin_unlock(&cifs_file_list_lock); +} + +void +cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, + struct cifs_pending_open *open) +{ +#ifdef CONFIG_CIFS_SMB2 + memcpy(open->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE); +#endif + open->oplock = CIFS_OPLOCK_NO_CHANGE; + open->tlink = tlink; + fid->pending_open = open; + list_add_tail(&open->olist, &tlink_tcon(tlink)->pending_opens); +} + +void +cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, + struct cifs_pending_open *open) +{ + spin_lock(&cifs_file_list_lock); + cifs_add_pending_open_locked(fid, tlink, open); + spin_unlock(&cifs_file_list_lock); +} diff --git a/kernel/fs/cifs/netmisc.c b/kernel/fs/cifs/netmisc.c new file mode 100644 index 000000000..abae6dd2c --- /dev/null +++ b/kernel/fs/cifs/netmisc.c @@ -0,0 +1,1014 @@ +/* + * fs/cifs/netmisc.c + * + * Copyright (c) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Error mapping routines from Samba libsmb/errormap.c + * Copyright (C) Andrew Tridgell 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "smberr.h" +#include "cifs_debug.h" +#include "nterr.h" + +struct smb_to_posix_error { + __u16 smb_err; + int posix_code; +}; + +static const struct smb_to_posix_error mapping_table_ERRDOS[] = { + {ERRbadfunc, -EINVAL}, + {ERRbadfile, -ENOENT}, + {ERRbadpath, -ENOTDIR}, + {ERRnofids, -EMFILE}, + {ERRnoaccess, -EACCES}, + {ERRbadfid, -EBADF}, + {ERRbadmcb, -EIO}, + {ERRnomem, -EREMOTEIO}, + {ERRbadmem, -EFAULT}, + {ERRbadenv, -EFAULT}, + {ERRbadformat, -EINVAL}, + {ERRbadaccess, -EACCES}, + {ERRbaddata, -EIO}, + {ERRbaddrive, -ENXIO}, + {ERRremcd, -EACCES}, + {ERRdiffdevice, -EXDEV}, + {ERRnofiles, -ENOENT}, + {ERRwriteprot, -EROFS}, + {ERRbadshare, -EBUSY}, + {ERRlock, -EACCES}, + {ERRunsup, -EINVAL}, + {ERRnosuchshare, -ENXIO}, + {ERRfilexists, -EEXIST}, + {ERRinvparm, -EINVAL}, + {ERRdiskfull, -ENOSPC}, + {ERRinvname, -ENOENT}, + {ERRinvlevel, -EOPNOTSUPP}, + {ERRdirnotempty, -ENOTEMPTY}, + {ERRnotlocked, -ENOLCK}, + {ERRcancelviolation, -ENOLCK}, + {ERRalreadyexists, -EEXIST}, + {ERRmoredata, -EOVERFLOW}, + {ERReasnotsupported, -EOPNOTSUPP}, + {ErrQuota, -EDQUOT}, + {ErrNotALink, -ENOLINK}, + {ERRnetlogonNotStarted, -ENOPROTOOPT}, + {ERRsymlink, -EOPNOTSUPP}, + {ErrTooManyLinks, -EMLINK}, + {0, 0} +}; + +static const struct smb_to_posix_error mapping_table_ERRSRV[] = { + {ERRerror, -EIO}, + {ERRbadpw, -EACCES}, /* was EPERM */ + {ERRbadtype, -EREMOTE}, + {ERRaccess, -EACCES}, + {ERRinvtid, -ENXIO}, + {ERRinvnetname, -ENXIO}, + {ERRinvdevice, -ENXIO}, + {ERRqfull, -ENOSPC}, + {ERRqtoobig, -ENOSPC}, + {ERRqeof, -EIO}, + {ERRinvpfid, -EBADF}, + {ERRsmbcmd, -EBADRQC}, + {ERRsrverror, -EIO}, + {ERRbadBID, -EIO}, + {ERRfilespecs, -EINVAL}, + {ERRbadLink, -EIO}, + {ERRbadpermits, -EINVAL}, + {ERRbadPID, -ESRCH}, + {ERRsetattrmode, -EINVAL}, + {ERRpaused, -EHOSTDOWN}, + {ERRmsgoff, -EHOSTDOWN}, + {ERRnoroom, -ENOSPC}, + {ERRrmuns, -EUSERS}, + {ERRtimeout, -ETIME}, + {ERRnoresource, -EREMOTEIO}, + {ERRtoomanyuids, -EUSERS}, + {ERRbaduid, -EACCES}, + {ERRusempx, -EIO}, + {ERRusestd, -EIO}, + {ERR_NOTIFY_ENUM_DIR, -ENOBUFS}, + {ERRnoSuchUser, -EACCES}, +/* {ERRaccountexpired, -EACCES}, + {ERRbadclient, -EACCES}, + {ERRbadLogonTime, -EACCES}, + {ERRpasswordExpired, -EACCES},*/ + {ERRaccountexpired, -EKEYEXPIRED}, + {ERRbadclient, -EACCES}, + {ERRbadLogonTime, -EACCES}, + {ERRpasswordExpired, -EKEYEXPIRED}, + + {ERRnosupport, -EINVAL}, + {0, 0} +}; + +static const struct smb_to_posix_error mapping_table_ERRHRD[] = { + {0, 0} +}; + +/* + * Convert a string containing text IPv4 or IPv6 address to binary form. + * + * Returns 0 on failure. + */ +static int +cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) +{ + int ret = 0; + + /* calculate length by finding first slash or NULL */ + if (address_family == AF_INET) + ret = in4_pton(cp, len, dst, '\\', NULL); + else if (address_family == AF_INET6) + ret = in6_pton(cp, len, dst , '\\', NULL); + + cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n", + ret, len, len, cp); + if (ret > 0) + ret = 1; + return ret; +} + +/* + * Try to convert a string to an IPv4 address and then attempt to convert + * it to an IPv6 address if that fails. Set the family field if either + * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to + * treat the part following it as a numeric sin6_scope_id. + * + * Returns 0 on failure. + */ +int +cifs_convert_address(struct sockaddr *dst, const char *src, int len) +{ + int rc, alen, slen; + const char *pct; + char scope_id[13]; + struct sockaddr_in *s4 = (struct sockaddr_in *) dst; + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; + + /* IPv4 address */ + if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) { + s4->sin_family = AF_INET; + return 1; + } + + /* attempt to exclude the scope ID from the address part */ + pct = memchr(src, '%', len); + alen = pct ? pct - src : len; + + rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr); + if (!rc) + return rc; + + s6->sin6_family = AF_INET6; + if (pct) { + /* grab the scope ID */ + slen = len - (alen + 1); + if (slen <= 0 || slen > 12) + return 0; + memcpy(scope_id, pct + 1, slen); + scope_id[slen] = '\0'; + + rc = kstrtouint(scope_id, 0, &s6->sin6_scope_id); + rc = (rc == 0) ? 1 : 0; + } + + return rc; +} + +void +cifs_set_port(struct sockaddr *addr, const unsigned short int port) +{ + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *)addr)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); + break; + } +} + +/***************************************************************************** +convert a NT status code to a dos class/code + *****************************************************************************/ +/* NT status -> dos error map */ +static const struct { + __u8 dos_class; + __u16 dos_code; + __u32 ntstatus; +} ntstatus_to_dos_map[] = { + { + ERRDOS, ERRgeneral, NT_STATUS_UNSUCCESSFUL}, { + ERRDOS, ERRbadfunc, NT_STATUS_NOT_IMPLEMENTED}, { + ERRDOS, ERRinvlevel, NT_STATUS_INVALID_INFO_CLASS}, { + ERRDOS, 24, NT_STATUS_INFO_LENGTH_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_ACCESS_VIOLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_IN_PAGE_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA}, { + ERRDOS, ERRbadfid, NT_STATUS_INVALID_HANDLE}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_INITIAL_STACK}, { + ERRDOS, 193, NT_STATUS_BAD_INITIAL_PC}, { + ERRDOS, 87, NT_STATUS_INVALID_CID}, { + ERRHRD, ERRgeneral, NT_STATUS_TIMER_NOT_CANCELED}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER}, { + ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_DEVICE}, { + ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_FILE}, { + ERRDOS, ERRbadfunc, NT_STATUS_INVALID_DEVICE_REQUEST}, { + ERRDOS, 38, NT_STATUS_END_OF_FILE}, { + ERRDOS, 34, NT_STATUS_WRONG_VOLUME}, { + ERRDOS, 21, NT_STATUS_NO_MEDIA_IN_DEVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_MEDIA}, { + ERRDOS, 27, NT_STATUS_NONEXISTENT_SECTOR}, +/* { This NT error code was 'sqashed' + from NT_STATUS_MORE_PROCESSING_REQUIRED to NT_STATUS_OK + during the session setup } */ + { + ERRDOS, ERRnomem, NT_STATUS_NO_MEMORY}, { + ERRDOS, 487, NT_STATUS_CONFLICTING_ADDRESSES}, { + ERRDOS, 487, NT_STATUS_NOT_MAPPED_VIEW}, { + ERRDOS, 87, NT_STATUS_UNABLE_TO_FREE_VM}, { + ERRDOS, 87, NT_STATUS_UNABLE_TO_DELETE_SECTION}, { + ERRDOS, 2142, NT_STATUS_INVALID_SYSTEM_SERVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_INSTRUCTION}, { + ERRDOS, ERRnoaccess, NT_STATUS_INVALID_LOCK_SEQUENCE}, { + ERRDOS, ERRnoaccess, NT_STATUS_INVALID_VIEW_SIZE}, { + ERRDOS, 193, NT_STATUS_INVALID_FILE_FOR_SECTION}, { + ERRDOS, ERRnoaccess, NT_STATUS_ALREADY_COMMITTED}, +/* { This NT error code was 'sqashed' + from NT_STATUS_ACCESS_DENIED to NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_ACCESS_DENIED}, { + ERRDOS, 111, NT_STATUS_BUFFER_TOO_SMALL}, { + ERRDOS, ERRbadfid, NT_STATUS_OBJECT_TYPE_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_NONCONTINUABLE_EXCEPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DISPOSITION}, { + ERRHRD, ERRgeneral, NT_STATUS_UNWIND}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_STACK}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_UNWIND_TARGET}, { + ERRDOS, 158, NT_STATUS_NOT_LOCKED}, { + ERRHRD, ERRgeneral, NT_STATUS_PARITY_ERROR}, { + ERRDOS, 487, NT_STATUS_UNABLE_TO_DECOMMIT_VM}, { + ERRDOS, 487, NT_STATUS_NOT_COMMITTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PORT_ATTRIBUTES}, { + ERRHRD, ERRgeneral, NT_STATUS_PORT_MESSAGE_TOO_LONG}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_MIX}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_QUOTA_LOWER}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_CORRUPT_ERROR}, { + /* mapping changed since shell does lookup on * expects FileNotFound */ + ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_INVALID}, { + ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_NOT_FOUND}, { + ERRDOS, ERRalreadyexists, NT_STATUS_OBJECT_NAME_COLLISION}, { + ERRHRD, ERRgeneral, NT_STATUS_HANDLE_NOT_WAITABLE}, { + ERRDOS, ERRbadfid, NT_STATUS_PORT_DISCONNECTED}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_ALREADY_ATTACHED}, { + ERRDOS, 161, NT_STATUS_OBJECT_PATH_INVALID}, { + ERRDOS, ERRbadpath, NT_STATUS_OBJECT_PATH_NOT_FOUND}, { + ERRDOS, 161, NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_OVERRUN}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_LATE_ERROR}, { + ERRDOS, 23, NT_STATUS_DATA_ERROR}, { + ERRDOS, 23, NT_STATUS_CRC_ERROR}, { + ERRDOS, ERRnomem, NT_STATUS_SECTION_TOO_BIG}, { + ERRDOS, ERRnoaccess, NT_STATUS_PORT_CONNECTION_REFUSED}, { + ERRDOS, ERRbadfid, NT_STATUS_INVALID_PORT_HANDLE}, { + ERRDOS, ERRbadshare, NT_STATUS_SHARING_VIOLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_QUOTA_EXCEEDED}, { + ERRDOS, 87, NT_STATUS_INVALID_PAGE_PROTECTION}, { + ERRDOS, 288, NT_STATUS_MUTANT_NOT_OWNED}, { + ERRDOS, 298, NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, { + ERRDOS, 87, NT_STATUS_PORT_ALREADY_SET}, { + ERRDOS, 87, NT_STATUS_SECTION_NOT_IMAGE}, { + ERRDOS, 156, NT_STATUS_SUSPEND_COUNT_EXCEEDED}, { + ERRDOS, ERRnoaccess, NT_STATUS_THREAD_IS_TERMINATING}, { + ERRDOS, 87, NT_STATUS_BAD_WORKING_SET_LIMIT}, { + ERRDOS, 87, NT_STATUS_INCOMPATIBLE_FILE_MAP}, { + ERRDOS, 87, NT_STATUS_SECTION_PROTECTION}, { + ERRDOS, ERReasnotsupported, NT_STATUS_EAS_NOT_SUPPORTED}, { + ERRDOS, 255, NT_STATUS_EA_TOO_LARGE}, { + ERRHRD, ERRgeneral, NT_STATUS_NONEXISTENT_EA_ENTRY}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_EAS_ON_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_EA_CORRUPT_ERROR}, { + ERRDOS, ERRlock, NT_STATUS_FILE_LOCK_CONFLICT}, { + ERRDOS, ERRlock, NT_STATUS_LOCK_NOT_GRANTED}, { + ERRDOS, ERRbadfile, NT_STATUS_DELETE_PENDING}, { + ERRDOS, ERRunsup, NT_STATUS_CTL_FILE_NOT_SUPPORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_UNKNOWN_REVISION}, { + ERRHRD, ERRgeneral, NT_STATUS_REVISION_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_OWNER}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PRIMARY_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_IMPERSONATION_TOKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_DISABLE_MANDATORY}, { + ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, { + ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, { + ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS}, +/* { This NT error code was 'sqashed' + from NT_STATUS_NO_SUCH_USER to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_NO_SUCH_USER}, { /* could map to 2238 */ + ERRHRD, ERRgeneral, NT_STATUS_GROUP_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_LAST_ADMIN}, +/* { This NT error code was 'sqashed' + from NT_STATUS_WRONG_PASSWORD to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRSRV, ERRbadpw, NT_STATUS_WRONG_PASSWORD}, { + ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_PASSWORD}, { + ERRHRD, ERRgeneral, NT_STATUS_PASSWORD_RESTRICTION}, { + ERRDOS, ERRnoaccess, NT_STATUS_LOGON_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, { + ERRSRV, ERRbadLogonTime, NT_STATUS_INVALID_LOGON_HOURS}, { + ERRSRV, ERRbadclient, NT_STATUS_INVALID_WORKSTATION}, { + ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, { + ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_DISABLED}, { + ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, { + ERRHRD, ERRgeneral, NT_STATUS_LUIDS_EXHAUSTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SUB_AUTHORITY}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACL}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SID}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SECURITY_DESCR}, { + ERRDOS, 127, NT_STATUS_PROCEDURE_NOT_FOUND}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_TOKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_INHERITANCE_ACL}, { + ERRDOS, 158, NT_STATUS_RANGE_NOT_LOCKED}, { + ERRDOS, 112, NT_STATUS_DISK_FULL}, { + ERRHRD, ERRgeneral, NT_STATUS_SERVER_DISABLED}, { + ERRHRD, ERRgeneral, NT_STATUS_SERVER_NOT_DISABLED}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, { + ERRDOS, 259, NT_STATUS_GUIDS_EXHAUSTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ID_AUTHORITY}, { + ERRDOS, 259, NT_STATUS_AGENTS_EXHAUSTED}, { + ERRDOS, 154, NT_STATUS_INVALID_VOLUME_LABEL}, { + ERRDOS, 14, NT_STATUS_SECTION_NOT_EXTENDED}, { + ERRDOS, 487, NT_STATUS_NOT_MAPPED_DATA}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_DATA_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_NAME_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DENORMAL_OPERAND}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INEXACT_RESULT}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INVALID_OPERATION}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_STACK_CHECK}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_UNDERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, { + ERRDOS, 534, NT_STATUS_INTEGER_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_PRIVILEGED_INSTRUCTION}, { + ERRDOS, ERRnomem, NT_STATUS_TOO_MANY_PAGING_FILES}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_INVALID}, { + ERRHRD, ERRgeneral, NT_STATUS_ALLOTTED_SPACE_EXCEEDED}, +/* { This NT error code was 'sqashed' + from NT_STATUS_INSUFFICIENT_RESOURCES to + NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */ + { + ERRDOS, ERRnoresource, NT_STATUS_INSUFFICIENT_RESOURCES}, { + ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, { + ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, { + ERRDOS, 21, NT_STATUS_DEVICE_POWER_FAILURE}, { + ERRDOS, 487, NT_STATUS_FREE_VM_NOT_AT_BASE}, { + ERRDOS, 487, NT_STATUS_MEMORY_NOT_ALLOCATED}, { + ERRHRD, ERRgeneral, NT_STATUS_WORKING_SET_QUOTA}, { + ERRDOS, 19, NT_STATUS_MEDIA_WRITE_PROTECTED}, { + ERRDOS, 21, NT_STATUS_DEVICE_NOT_READY}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_GROUP_ATTRIBUTES}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_IMPERSONATION_LEVEL}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_OPEN_ANONYMOUS}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_VALIDATION_CLASS}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_TOKEN_TYPE}, { + ERRDOS, 87, NT_STATUS_BAD_MASTER_BOOT_RECORD}, { + ERRHRD, ERRgeneral, NT_STATUS_INSTRUCTION_MISALIGNMENT}, { + ERRDOS, ERRpipebusy, NT_STATUS_INSTANCE_NOT_AVAILABLE}, { + ERRDOS, ERRpipebusy, NT_STATUS_PIPE_NOT_AVAILABLE}, { + ERRDOS, ERRbadpipe, NT_STATUS_INVALID_PIPE_STATE}, { + ERRDOS, ERRpipebusy, NT_STATUS_PIPE_BUSY}, { + ERRDOS, ERRbadfunc, NT_STATUS_ILLEGAL_FUNCTION}, { + ERRDOS, ERRnotconnected, NT_STATUS_PIPE_DISCONNECTED}, { + ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_CLOSING}, { + ERRHRD, ERRgeneral, NT_STATUS_PIPE_CONNECTED}, { + ERRHRD, ERRgeneral, NT_STATUS_PIPE_LISTENING}, { + ERRDOS, ERRbadpipe, NT_STATUS_INVALID_READ_MODE}, { + ERRDOS, 121, NT_STATUS_IO_TIMEOUT}, { + ERRDOS, 38, NT_STATUS_FILE_FORCED_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STOPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_COULD_NOT_INTERPRET}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_IS_A_DIRECTORY}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_SUPPORTED}, { + ERRDOS, 51, NT_STATUS_REMOTE_NOT_LISTENING}, { + ERRDOS, 52, NT_STATUS_DUPLICATE_NAME}, { + ERRDOS, 53, NT_STATUS_BAD_NETWORK_PATH}, { + ERRDOS, 54, NT_STATUS_NETWORK_BUSY}, { + ERRDOS, 55, NT_STATUS_DEVICE_DOES_NOT_EXIST}, { + ERRDOS, 56, NT_STATUS_TOO_MANY_COMMANDS}, { + ERRDOS, 57, NT_STATUS_ADAPTER_HARDWARE_ERROR}, { + ERRDOS, 58, NT_STATUS_INVALID_NETWORK_RESPONSE}, { + ERRDOS, 59, NT_STATUS_UNEXPECTED_NETWORK_ERROR}, { + ERRDOS, 60, NT_STATUS_BAD_REMOTE_ADAPTER}, { + ERRDOS, 61, NT_STATUS_PRINT_QUEUE_FULL}, { + ERRDOS, 62, NT_STATUS_NO_SPOOL_SPACE}, { + ERRDOS, 63, NT_STATUS_PRINT_CANCELLED}, { + ERRDOS, 64, NT_STATUS_NETWORK_NAME_DELETED}, { + ERRDOS, 65, NT_STATUS_NETWORK_ACCESS_DENIED}, { + ERRDOS, 66, NT_STATUS_BAD_DEVICE_TYPE}, { + ERRDOS, ERRnosuchshare, NT_STATUS_BAD_NETWORK_NAME}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_NAMES}, { + ERRDOS, 69, NT_STATUS_TOO_MANY_SESSIONS}, { + ERRDOS, 70, NT_STATUS_SHARING_PAUSED}, { + ERRDOS, 71, NT_STATUS_REQUEST_NOT_ACCEPTED}, { + ERRDOS, 72, NT_STATUS_REDIRECTOR_PAUSED}, { + ERRDOS, 88, NT_STATUS_NET_WRITE_FAULT}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_AT_LIMIT}, { + ERRDOS, ERRdiffdevice, NT_STATUS_NOT_SAME_DEVICE}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_RENAMED}, { + ERRDOS, 240, NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SECURITY_ON_OBJECT}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_WAIT}, { + ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_EMPTY}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_TERMINATE_SELF}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SERVER_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_ROLE}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_DOMAIN}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, { + ERRDOS, 300, NT_STATUS_OPLOCK_NOT_GRANTED}, { + ERRDOS, 301, NT_STATUS_INVALID_OPLOCK_PROTOCOL}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_CORRUPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_GENERIC_NOT_MAPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_DESCRIPTOR_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_USER_BUFFER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_IO_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_LOGON_PROCESS}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_EXISTS}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_1}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_2}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_3}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_4}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_5}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_6}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_7}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_8}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_9}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_10}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_11}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_12}, { + ERRDOS, ERRbadpath, NT_STATUS_REDIRECTOR_NOT_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_REDIRECTOR_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PACKAGE}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_FUNCTION_TABLE}, { + ERRDOS, 203, 0xc0000100}, { + ERRDOS, 145, NT_STATUS_DIRECTORY_NOT_EMPTY}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_CORRUPT_ERROR}, { + ERRDOS, 267, NT_STATUS_NOT_A_DIRECTORY}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_LOGON_SESSION_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_COLLISION}, { + ERRDOS, 206, NT_STATUS_NAME_TOO_LONG}, { + ERRDOS, 2401, NT_STATUS_FILES_OPEN}, { + ERRDOS, 2404, NT_STATUS_CONNECTION_IN_USE}, { + ERRHRD, ERRgeneral, NT_STATUS_MESSAGE_NOT_FOUND}, { + ERRDOS, ERRnoaccess, NT_STATUS_PROCESS_IS_TERMINATING}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LOGON_TYPE}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_GUID_TRANSLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_CANNOT_IMPERSONATE}, { + ERRHRD, ERRgeneral, NT_STATUS_IMAGE_ALREADY_LOADED}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_PRESENT}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_NOT_EXIST}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_ALREADY_OWNED}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_LID_OWNER}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_COMMAND}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_LID}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_SELECTOR}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_LDT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_SIZE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_OFFSET}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_DESCRIPTOR}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NE_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_RXACT_INVALID_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_RXACT_COMMIT_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_MAPPED_FILE_SIZE_ZERO}, { + ERRDOS, ERRnofids, NT_STATUS_TOO_MANY_OPENED_FILES}, { + ERRHRD, ERRgeneral, NT_STATUS_CANCELLED}, { + ERRDOS, ERRnoaccess, NT_STATUS_CANNOT_DELETE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_COMPUTER_NAME}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_ACCOUNT}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_USER}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBERS_PRIMARY_GROUP}, { + ERRDOS, ERRbadfid, NT_STATUS_FILE_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_THREADS}, { + ERRHRD, ERRgeneral, NT_STATUS_THREAD_NOT_IN_PROCESS}, { + ERRHRD, ERRgeneral, NT_STATUS_TOKEN_ALREADY_IN_USE}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_COMMITMENT_LIMIT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_LE_FORMAT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NOT_MZ}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_PROTECT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_WIN_16}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SERVER_CONFLICT}, { + ERRHRD, ERRgeneral, NT_STATUS_TIME_DIFFERENCE_AT_DC}, { + ERRHRD, ERRgeneral, NT_STATUS_SYNCHRONIZATION_REQUIRED}, { + ERRDOS, 126, NT_STATUS_DLL_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_OPEN_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_IO_PRIVILEGE_FAILED}, { + ERRDOS, 182, NT_STATUS_ORDINAL_NOT_FOUND}, { + ERRDOS, 127, NT_STATUS_ENTRYPOINT_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_CONTROL_C_EXIT}, { + ERRDOS, 64, NT_STATUS_LOCAL_DISCONNECT}, { + ERRDOS, 64, NT_STATUS_REMOTE_DISCONNECT}, { + ERRDOS, 51, NT_STATUS_REMOTE_RESOURCES}, { + ERRDOS, 59, NT_STATUS_LINK_FAILED}, { + ERRDOS, 59, NT_STATUS_LINK_TIMEOUT}, { + ERRDOS, 59, NT_STATUS_INVALID_CONNECTION}, { + ERRDOS, 59, NT_STATUS_INVALID_ADDRESS}, { + ERRHRD, ERRgeneral, NT_STATUS_DLL_INIT_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_MISSING_SYSTEMFILE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNHANDLED_EXCEPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_APP_INIT_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_CREATE_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_PAGEFILE}, { + ERRDOS, 124, NT_STATUS_INVALID_LEVEL}, { + ERRDOS, 86, NT_STATUS_WRONG_PASSWORD_CORE}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, { + ERRDOS, 109, NT_STATUS_PIPE_BROKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_CORRUPT}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_IO_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_EVENT_PAIR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_VOLUME}, { + ERRHRD, ERRgeneral, NT_STATUS_SERIAL_NO_DEVICE_INITED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_ALIAS_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_NOT_GRANTED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SECRETS}, { + ERRHRD, ERRgeneral, NT_STATUS_SECRET_TOO_LONG}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FULLSCREEN_MODE}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_CONTEXT_IDS}, { + ERRDOS, ERRnoaccess, NT_STATUS_LOGON_TYPE_NOT_GRANTED}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_REGISTRY_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FT_MISSING_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNMAPPABLE_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNDEFINED_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_VOLUME}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_WRONG_CYLINDER}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_UNKNOWN_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_BAD_REGISTERS}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_RECALIBRATE_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_OPERATION_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_RESET_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_SHARED_IRQ_BUSY}, { + ERRHRD, ERRgeneral, NT_STATUS_FT_ORPHANING}, { + ERRHRD, ERRgeneral, 0xc000016e}, { + ERRHRD, ERRgeneral, 0xc000016f}, { + ERRHRD, ERRgeneral, 0xc0000170}, { + ERRHRD, ERRgeneral, 0xc0000171}, { + ERRHRD, ERRgeneral, NT_STATUS_PARTITION_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_BLOCK_LENGTH}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_PARTITIONED}, { + ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_LOCK_MEDIA}, { + ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, { + ERRHRD, ERRgeneral, NT_STATUS_EOM_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_MEDIA}, { + ERRHRD, ERRgeneral, 0xc0000179}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_KEY_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_LOG_SPACE}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SIDS}, { + ERRHRD, ERRgeneral, NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_KEY_HAS_CHILDREN}, { + ERRHRD, ERRgeneral, NT_STATUS_CHILD_MUST_BE_VOLATILE}, { + ERRDOS, 87, NT_STATUS_DEVICE_CONFIGURATION_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DRIVER_INTERNAL_ERROR}, { + ERRDOS, 22, NT_STATUS_INVALID_DEVICE_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_IO_DEVICE_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_PROTOCOL_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_BACKUP_CONTROLLER}, { + ERRHRD, ERRgeneral, NT_STATUS_LOG_FILE_FULL}, { + ERRDOS, 19, NT_STATUS_TOO_LATE}, { + ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_LSA_SECRET}, +/* { This NT error code was 'sqashed' + from NT_STATUS_NO_TRUST_SAM_ACCOUNT to + NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_SAM_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_DOMAIN_FAILURE}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CORRUPT}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_CANT_START}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUST_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_MUTANT_LIMIT_EXCEEDED}, { + ERRDOS, ERRnetlogonNotStarted, NT_STATUS_NETLOGON_NOT_STARTED}, { + ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_EXPIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_POSSIBLE_DEADLOCK}, { + ERRHRD, ERRgeneral, NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, { + ERRHRD, ERRgeneral, NT_STATUS_REMOTE_SESSION_LIMIT}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CHANGED}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT}, +/* { This NT error code was 'sqashed' + from NT_STATUS_DOMAIN_TRUST_INCONSISTENT to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, { + ERRHRD, ERRgeneral, NT_STATUS_FS_DRIVER_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, { + ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, { + ERRDOS, ERRnoresource, NT_STATUS_INSUFF_SERVER_RESOURCES}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_ADDRESSES}, { + ERRDOS, 52, NT_STATUS_ADDRESS_ALREADY_EXISTS}, { + ERRDOS, 64, NT_STATUS_ADDRESS_CLOSED}, { + ERRDOS, 64, NT_STATUS_CONNECTION_DISCONNECTED}, { + ERRDOS, 64, NT_STATUS_CONNECTION_RESET}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_NODES}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_ABORTED}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_TIMED_OUT}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_NO_RELEASE}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_NO_MATCH}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_RESPONDED}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_ID}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_TYPE}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_SERVER_SESSION}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_CLIENT_SESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_DEBUG_ATTACH_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_SYSTEM_PROCESS_TERMINATED}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_NOT_ACCEPTED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_BROWSER_SERVERS_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_VDM_HARD_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DRIVER_CANCEL_TIMEOUT}, { + ERRHRD, ERRgeneral, NT_STATUS_REPLY_MESSAGE_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_MAPPED_ALIGNMENT}, { + ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, { + ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, { + ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, { + ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW_READ}, { + ERRHRD, ERRgeneral, NT_STATUS_FAIL_CHECK}, { + ERRHRD, ERRgeneral, NT_STATUS_DUPLICATE_OBJECTID}, { + ERRHRD, ERRgeneral, NT_STATUS_OBJECTID_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_CONVERT_TO_LARGE}, { + ERRHRD, ERRgeneral, NT_STATUS_RETRY}, { + ERRHRD, ERRgeneral, NT_STATUS_FOUND_OUT_OF_SCOPE}, { + ERRHRD, ERRgeneral, NT_STATUS_ALLOCATE_BUCKET}, { + ERRHRD, ERRgeneral, NT_STATUS_PROPSET_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_MARSHALL_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_VARIANT}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, { + ERRDOS, ERRnoaccess, NT_STATUS_ACCOUNT_LOCKED_OUT}, { + ERRDOS, ERRbadfid, NT_STATUS_HANDLE_NOT_CLOSABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_REFUSED}, { + ERRHRD, ERRgeneral, NT_STATUS_GRACEFUL_DISCONNECT}, { + ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, { + ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_NOT_ASSOCIATED}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_INVALID}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ACTIVE}, { + ERRHRD, ERRgeneral, NT_STATUS_NETWORK_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_HOST_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_PROTOCOL_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_PORT_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_REQUEST_ABORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ABORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_COMPRESSION_BUFFER}, { + ERRHRD, ERRgeneral, NT_STATUS_USER_MAPPED_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_AUDIT_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_TIMER_RESOLUTION_NOT_SET}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_COUNT_LIMIT}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGIN_TIME_RESTRICTION}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGIN_WKSTA_RESTRICTION}, { + ERRDOS, 193, NT_STATUS_IMAGE_MP_UP_MISMATCH}, { + ERRHRD, ERRgeneral, 0xc000024a}, { + ERRHRD, ERRgeneral, 0xc000024b}, { + ERRHRD, ERRgeneral, 0xc000024c}, { + ERRHRD, ERRgeneral, 0xc000024d}, { + ERRHRD, ERRgeneral, 0xc000024e}, { + ERRHRD, ERRgeneral, 0xc000024f}, { + ERRHRD, ERRgeneral, NT_STATUS_INSUFFICIENT_LOGON_INFO}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_DLL_ENTRYPOINT}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_SERVICE_ENTRYPOINT}, { + ERRHRD, ERRgeneral, NT_STATUS_LPC_REPLY_LOST}, { + ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT1}, { + ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT2}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_QUOTA_LIMIT}, { + ERRSRV, 3, NT_STATUS_PATH_NOT_COVERED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_CALLBACK_ACTIVE}, { + ERRHRD, ERRgeneral, NT_STATUS_LICENSE_QUOTA_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_SHORT}, { + ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_RECENT}, { + ERRHRD, ERRgeneral, NT_STATUS_PWD_HISTORY_CONFLICT}, { + ERRHRD, ERRgeneral, 0xc000025d}, { + ERRHRD, ERRgeneral, NT_STATUS_PLUGPLAY_NO_DEVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNSUPPORTED_COMPRESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_HW_PROFILE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, { + ERRDOS, 182, NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, { + ERRDOS, 127, NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, { + ERRDOS, 288, NT_STATUS_RESOURCE_NOT_OWNED}, { + ERRDOS, ErrTooManyLinks, NT_STATUS_TOO_MANY_LINKS}, { + ERRHRD, ERRgeneral, NT_STATUS_QUOTA_LIST_INCONSISTENT}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_IS_OFFLINE}, { + ERRDOS, 21, 0xc000026e}, { + ERRDOS, 161, 0xc0000281}, { + ERRDOS, ERRnoaccess, 0xc000028a}, { + ERRDOS, ERRnoaccess, 0xc000028b}, { + ERRHRD, ERRgeneral, 0xc000028c}, { + ERRDOS, ERRnoaccess, 0xc000028d}, { + ERRDOS, ERRnoaccess, 0xc000028e}, { + ERRDOS, ERRnoaccess, 0xc000028f}, { + ERRDOS, ERRnoaccess, 0xc0000290}, { + ERRDOS, ERRbadfunc, 0xc000029c}, { + ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { + ERRDOS, ERRinvlevel, 0x007c0001}, { + 0, 0, 0 } +}; + +/***************************************************************************** + Print an error message from the status code + *****************************************************************************/ +static void +cifs_print_status(__u32 status_code) +{ + int idx = 0; + + while (nt_errs[idx].nt_errstr != NULL) { + if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) == + (status_code & 0xFFFFFF)) { + pr_notice("Status code returned 0x%08x %s\n", + status_code, nt_errs[idx].nt_errstr); + } + idx++; + } + return; +} + + +static void +ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode) +{ + int i; + if (ntstatus == 0) { + *eclass = 0; + *ecode = 0; + return; + } + for (i = 0; ntstatus_to_dos_map[i].ntstatus; i++) { + if (ntstatus == ntstatus_to_dos_map[i].ntstatus) { + *eclass = ntstatus_to_dos_map[i].dos_class; + *ecode = ntstatus_to_dos_map[i].dos_code; + return; + } + } + *eclass = ERRHRD; + *ecode = ERRgeneral; +} + +int +map_smb_to_linux_error(char *buf, bool logErr) +{ + struct smb_hdr *smb = (struct smb_hdr *)buf; + unsigned int i; + int rc = -EIO; /* if transport error smb error may not be set */ + __u8 smberrclass; + __u16 smberrcode; + + /* BB if NT Status codes - map NT BB */ + + /* old style smb error codes */ + if (smb->Status.CifsError == 0) + return 0; + + if (smb->Flags2 & SMBFLG2_ERR_STATUS) { + /* translate the newer STATUS codes to old style SMB errors + * and then to POSIX errors */ + __u32 err = le32_to_cpu(smb->Status.CifsError); + if (logErr && (err != (NT_STATUS_MORE_PROCESSING_REQUIRED))) + cifs_print_status(err); + else if (cifsFYI & CIFS_RC) + cifs_print_status(err); + ntstatus_to_dos(err, &smberrclass, &smberrcode); + } else { + smberrclass = smb->Status.DosError.ErrorClass; + smberrcode = le16_to_cpu(smb->Status.DosError.Error); + } + + /* old style errors */ + + /* DOS class smb error codes - map DOS */ + if (smberrclass == ERRDOS) { + /* 1 byte field no need to byte reverse */ + for (i = 0; + i < + sizeof(mapping_table_ERRDOS) / + sizeof(struct smb_to_posix_error); i++) { + if (mapping_table_ERRDOS[i].smb_err == 0) + break; + else if (mapping_table_ERRDOS[i].smb_err == + smberrcode) { + rc = mapping_table_ERRDOS[i].posix_code; + break; + } + /* else try next error mapping one to see if match */ + } + } else if (smberrclass == ERRSRV) { + /* server class of error codes */ + for (i = 0; + i < + sizeof(mapping_table_ERRSRV) / + sizeof(struct smb_to_posix_error); i++) { + if (mapping_table_ERRSRV[i].smb_err == 0) + break; + else if (mapping_table_ERRSRV[i].smb_err == + smberrcode) { + rc = mapping_table_ERRSRV[i].posix_code; + break; + } + /* else try next error mapping to see if match */ + } + } + /* else ERRHRD class errors or junk - return EIO */ + + cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n", + le32_to_cpu(smb->Status.CifsError), rc); + + /* generic corrective action e.g. reconnect SMB session on + * ERRbaduid could be added */ + + return rc; +} + +/* + * calculate the size of the SMB message based on the fixed header + * portion, the number of word parameters and the data portion of the message + */ +unsigned int +smbCalcSize(void *buf) +{ + struct smb_hdr *ptr = (struct smb_hdr *)buf; + return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + + 2 /* size of the bcc field */ + get_bcc(ptr)); +} + +/* The following are taken from fs/ntfs/util.c */ + +#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) + +/* + * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) + * into Unix UTC (based 1970-01-01, in seconds). + */ +struct timespec +cifs_NTtimeToUnix(__le64 ntutc) +{ + struct timespec ts; + /* BB what about the timezone? BB */ + + /* Subtract the NTFS time offset, then convert to 1s intervals. */ + s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; + u64 abs_t; + + /* + * Unfortunately can not use normal 64 bit division on 32 bit arch, but + * the alternative, do_div, does not work with negative numbers so have + * to special case them + */ + if (t < 0) { + abs_t = -t; + ts.tv_nsec = (long)(do_div(abs_t, 10000000) * 100); + ts.tv_nsec = -ts.tv_nsec; + ts.tv_sec = -abs_t; + } else { + abs_t = t; + ts.tv_nsec = (long)do_div(abs_t, 10000000) * 100; + ts.tv_sec = abs_t; + } + + return ts; +} + +/* Convert the Unix UTC into NT UTC. */ +u64 +cifs_UnixTimeToNT(struct timespec t) +{ + /* Convert to 100ns intervals and then add the NTFS time offset. */ + return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET; +} + +static const int total_days_of_prev_months[] = { + 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 +}; + +struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) +{ + struct timespec ts; + int sec, min, days, month, year; + u16 date = le16_to_cpu(le_date); + u16 time = le16_to_cpu(le_time); + SMB_TIME *st = (SMB_TIME *)&time; + SMB_DATE *sd = (SMB_DATE *)&date; + + cifs_dbg(FYI, "date %d time %d\n", date, time); + + sec = 2 * st->TwoSeconds; + min = st->Minutes; + if ((sec > 59) || (min > 59)) + cifs_dbg(VFS, "illegal time min %d sec %d\n", min, sec); + sec += (min * 60); + sec += 60 * 60 * st->Hours; + if (st->Hours > 24) + cifs_dbg(VFS, "illegal hours %d\n", st->Hours); + days = sd->Day; + month = sd->Month; + if ((days > 31) || (month > 12)) { + cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days); + if (month > 12) + month = 12; + } + month -= 1; + days += total_days_of_prev_months[month]; + days += 3652; /* account for difference in days between 1980 and 1970 */ + year = sd->Year; + days += year * 365; + days += (year/4); /* leap year */ + /* generalized leap year calculation is more complex, ie no leap year + for years/100 except for years/400, but since the maximum number for DOS + year is 2**7, the last year is 1980+127, which means we need only + consider 2 special case years, ie the years 2000 and 2100, and only + adjust for the lack of leap year for the year 2100, as 2000 was a + leap year (divisable by 400) */ + if (year >= 120) /* the year 2100 */ + days = days - 1; /* do not count leap year for the year 2100 */ + + /* adjust for leap year where we are still before leap day */ + if (year != 120) + days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); + sec += 24 * 60 * 60 * days; + + ts.tv_sec = sec + offset; + + /* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */ + + ts.tv_nsec = 0; + return ts; +} diff --git a/kernel/fs/cifs/nterr.c b/kernel/fs/cifs/nterr.c new file mode 100644 index 000000000..b6023c646 --- /dev/null +++ b/kernel/fs/cifs/nterr.c @@ -0,0 +1,687 @@ +/* + * Unix SMB/Netbios implementation. + * Version 1.9. + * RPC Pipe client / server routines + * Copyright (C) Luke Kenneth Casson Leighton 1997-2001. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* NT error codes - see nterr.h */ +#include +#include +#include "nterr.h" + +const struct nt_err_code_struct nt_errs[] = { + {"NT_STATUS_OK", NT_STATUS_OK}, + {"NT_STATUS_UNSUCCESSFUL", NT_STATUS_UNSUCCESSFUL}, + {"NT_STATUS_NOT_IMPLEMENTED", NT_STATUS_NOT_IMPLEMENTED}, + {"NT_STATUS_INVALID_INFO_CLASS", NT_STATUS_INVALID_INFO_CLASS}, + {"NT_STATUS_INFO_LENGTH_MISMATCH", NT_STATUS_INFO_LENGTH_MISMATCH}, + {"NT_STATUS_ACCESS_VIOLATION", NT_STATUS_ACCESS_VIOLATION}, + {"NT_STATUS_BUFFER_OVERFLOW", NT_STATUS_BUFFER_OVERFLOW}, + {"NT_STATUS_IN_PAGE_ERROR", NT_STATUS_IN_PAGE_ERROR}, + {"NT_STATUS_PAGEFILE_QUOTA", NT_STATUS_PAGEFILE_QUOTA}, + {"NT_STATUS_INVALID_HANDLE", NT_STATUS_INVALID_HANDLE}, + {"NT_STATUS_BAD_INITIAL_STACK", NT_STATUS_BAD_INITIAL_STACK}, + {"NT_STATUS_BAD_INITIAL_PC", NT_STATUS_BAD_INITIAL_PC}, + {"NT_STATUS_INVALID_CID", NT_STATUS_INVALID_CID}, + {"NT_STATUS_TIMER_NOT_CANCELED", NT_STATUS_TIMER_NOT_CANCELED}, + {"NT_STATUS_INVALID_PARAMETER", NT_STATUS_INVALID_PARAMETER}, + {"NT_STATUS_NO_SUCH_DEVICE", NT_STATUS_NO_SUCH_DEVICE}, + {"NT_STATUS_NO_SUCH_FILE", NT_STATUS_NO_SUCH_FILE}, + {"NT_STATUS_INVALID_DEVICE_REQUEST", + NT_STATUS_INVALID_DEVICE_REQUEST}, + {"NT_STATUS_END_OF_FILE", NT_STATUS_END_OF_FILE}, + {"NT_STATUS_WRONG_VOLUME", NT_STATUS_WRONG_VOLUME}, + {"NT_STATUS_NO_MEDIA_IN_DEVICE", NT_STATUS_NO_MEDIA_IN_DEVICE}, + {"NT_STATUS_UNRECOGNIZED_MEDIA", NT_STATUS_UNRECOGNIZED_MEDIA}, + {"NT_STATUS_NONEXISTENT_SECTOR", NT_STATUS_NONEXISTENT_SECTOR}, + {"NT_STATUS_MORE_PROCESSING_REQUIRED", + NT_STATUS_MORE_PROCESSING_REQUIRED}, + {"NT_STATUS_NO_MEMORY", NT_STATUS_NO_MEMORY}, + {"NT_STATUS_CONFLICTING_ADDRESSES", + NT_STATUS_CONFLICTING_ADDRESSES}, + {"NT_STATUS_NOT_MAPPED_VIEW", NT_STATUS_NOT_MAPPED_VIEW}, + {"NT_STATUS_UNABLE_TO_FREE_VM", NT_STATUS_UNABLE_TO_FREE_VM}, + {"NT_STATUS_UNABLE_TO_DELETE_SECTION", + NT_STATUS_UNABLE_TO_DELETE_SECTION}, + {"NT_STATUS_INVALID_SYSTEM_SERVICE", + NT_STATUS_INVALID_SYSTEM_SERVICE}, + {"NT_STATUS_ILLEGAL_INSTRUCTION", NT_STATUS_ILLEGAL_INSTRUCTION}, + {"NT_STATUS_INVALID_LOCK_SEQUENCE", + NT_STATUS_INVALID_LOCK_SEQUENCE}, + {"NT_STATUS_INVALID_VIEW_SIZE", NT_STATUS_INVALID_VIEW_SIZE}, + {"NT_STATUS_INVALID_FILE_FOR_SECTION", + NT_STATUS_INVALID_FILE_FOR_SECTION}, + {"NT_STATUS_ALREADY_COMMITTED", NT_STATUS_ALREADY_COMMITTED}, + {"NT_STATUS_ACCESS_DENIED", NT_STATUS_ACCESS_DENIED}, + {"NT_STATUS_BUFFER_TOO_SMALL", NT_STATUS_BUFFER_TOO_SMALL}, + {"NT_STATUS_OBJECT_TYPE_MISMATCH", NT_STATUS_OBJECT_TYPE_MISMATCH}, + {"NT_STATUS_NONCONTINUABLE_EXCEPTION", + NT_STATUS_NONCONTINUABLE_EXCEPTION}, + {"NT_STATUS_INVALID_DISPOSITION", NT_STATUS_INVALID_DISPOSITION}, + {"NT_STATUS_UNWIND", NT_STATUS_UNWIND}, + {"NT_STATUS_BAD_STACK", NT_STATUS_BAD_STACK}, + {"NT_STATUS_INVALID_UNWIND_TARGET", + NT_STATUS_INVALID_UNWIND_TARGET}, + {"NT_STATUS_NOT_LOCKED", NT_STATUS_NOT_LOCKED}, + {"NT_STATUS_PARITY_ERROR", NT_STATUS_PARITY_ERROR}, + {"NT_STATUS_UNABLE_TO_DECOMMIT_VM", + NT_STATUS_UNABLE_TO_DECOMMIT_VM}, + {"NT_STATUS_NOT_COMMITTED", NT_STATUS_NOT_COMMITTED}, + {"NT_STATUS_INVALID_PORT_ATTRIBUTES", + NT_STATUS_INVALID_PORT_ATTRIBUTES}, + {"NT_STATUS_PORT_MESSAGE_TOO_LONG", + NT_STATUS_PORT_MESSAGE_TOO_LONG}, + {"NT_STATUS_INVALID_PARAMETER_MIX", + NT_STATUS_INVALID_PARAMETER_MIX}, + {"NT_STATUS_INVALID_QUOTA_LOWER", NT_STATUS_INVALID_QUOTA_LOWER}, + {"NT_STATUS_DISK_CORRUPT_ERROR", NT_STATUS_DISK_CORRUPT_ERROR}, + {"NT_STATUS_OBJECT_NAME_INVALID", NT_STATUS_OBJECT_NAME_INVALID}, + {"NT_STATUS_OBJECT_NAME_NOT_FOUND", + NT_STATUS_OBJECT_NAME_NOT_FOUND}, + {"NT_STATUS_OBJECT_NAME_COLLISION", + NT_STATUS_OBJECT_NAME_COLLISION}, + {"NT_STATUS_HANDLE_NOT_WAITABLE", NT_STATUS_HANDLE_NOT_WAITABLE}, + {"NT_STATUS_PORT_DISCONNECTED", NT_STATUS_PORT_DISCONNECTED}, + {"NT_STATUS_DEVICE_ALREADY_ATTACHED", + NT_STATUS_DEVICE_ALREADY_ATTACHED}, + {"NT_STATUS_OBJECT_PATH_INVALID", NT_STATUS_OBJECT_PATH_INVALID}, + {"NT_STATUS_OBJECT_PATH_NOT_FOUND", + NT_STATUS_OBJECT_PATH_NOT_FOUND}, + {"NT_STATUS_OBJECT_PATH_SYNTAX_BAD", + NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, + {"NT_STATUS_DATA_OVERRUN", NT_STATUS_DATA_OVERRUN}, + {"NT_STATUS_DATA_LATE_ERROR", NT_STATUS_DATA_LATE_ERROR}, + {"NT_STATUS_DATA_ERROR", NT_STATUS_DATA_ERROR}, + {"NT_STATUS_CRC_ERROR", NT_STATUS_CRC_ERROR}, + {"NT_STATUS_SECTION_TOO_BIG", NT_STATUS_SECTION_TOO_BIG}, + {"NT_STATUS_PORT_CONNECTION_REFUSED", + NT_STATUS_PORT_CONNECTION_REFUSED}, + {"NT_STATUS_INVALID_PORT_HANDLE", NT_STATUS_INVALID_PORT_HANDLE}, + {"NT_STATUS_SHARING_VIOLATION", NT_STATUS_SHARING_VIOLATION}, + {"NT_STATUS_QUOTA_EXCEEDED", NT_STATUS_QUOTA_EXCEEDED}, + {"NT_STATUS_INVALID_PAGE_PROTECTION", + NT_STATUS_INVALID_PAGE_PROTECTION}, + {"NT_STATUS_MUTANT_NOT_OWNED", NT_STATUS_MUTANT_NOT_OWNED}, + {"NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED", + NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, + {"NT_STATUS_PORT_ALREADY_SET", NT_STATUS_PORT_ALREADY_SET}, + {"NT_STATUS_SECTION_NOT_IMAGE", NT_STATUS_SECTION_NOT_IMAGE}, + {"NT_STATUS_SUSPEND_COUNT_EXCEEDED", + NT_STATUS_SUSPEND_COUNT_EXCEEDED}, + {"NT_STATUS_THREAD_IS_TERMINATING", + NT_STATUS_THREAD_IS_TERMINATING}, + {"NT_STATUS_BAD_WORKING_SET_LIMIT", + NT_STATUS_BAD_WORKING_SET_LIMIT}, + {"NT_STATUS_INCOMPATIBLE_FILE_MAP", + NT_STATUS_INCOMPATIBLE_FILE_MAP}, + {"NT_STATUS_SECTION_PROTECTION", NT_STATUS_SECTION_PROTECTION}, + {"NT_STATUS_EAS_NOT_SUPPORTED", NT_STATUS_EAS_NOT_SUPPORTED}, + {"NT_STATUS_EA_TOO_LARGE", NT_STATUS_EA_TOO_LARGE}, + {"NT_STATUS_NONEXISTENT_EA_ENTRY", NT_STATUS_NONEXISTENT_EA_ENTRY}, + {"NT_STATUS_NO_EAS_ON_FILE", NT_STATUS_NO_EAS_ON_FILE}, + {"NT_STATUS_EA_CORRUPT_ERROR", NT_STATUS_EA_CORRUPT_ERROR}, + {"NT_STATUS_FILE_LOCK_CONFLICT", NT_STATUS_FILE_LOCK_CONFLICT}, + {"NT_STATUS_LOCK_NOT_GRANTED", NT_STATUS_LOCK_NOT_GRANTED}, + {"NT_STATUS_DELETE_PENDING", NT_STATUS_DELETE_PENDING}, + {"NT_STATUS_CTL_FILE_NOT_SUPPORTED", + NT_STATUS_CTL_FILE_NOT_SUPPORTED}, + {"NT_STATUS_UNKNOWN_REVISION", NT_STATUS_UNKNOWN_REVISION}, + {"NT_STATUS_REVISION_MISMATCH", NT_STATUS_REVISION_MISMATCH}, + {"NT_STATUS_INVALID_OWNER", NT_STATUS_INVALID_OWNER}, + {"NT_STATUS_INVALID_PRIMARY_GROUP", + NT_STATUS_INVALID_PRIMARY_GROUP}, + {"NT_STATUS_NO_IMPERSONATION_TOKEN", + NT_STATUS_NO_IMPERSONATION_TOKEN}, + {"NT_STATUS_CANT_DISABLE_MANDATORY", + NT_STATUS_CANT_DISABLE_MANDATORY}, + {"NT_STATUS_NO_LOGON_SERVERS", NT_STATUS_NO_LOGON_SERVERS}, + {"NT_STATUS_NO_SUCH_LOGON_SESSION", + NT_STATUS_NO_SUCH_LOGON_SESSION}, + {"NT_STATUS_NO_SUCH_PRIVILEGE", NT_STATUS_NO_SUCH_PRIVILEGE}, + {"NT_STATUS_PRIVILEGE_NOT_HELD", NT_STATUS_PRIVILEGE_NOT_HELD}, + {"NT_STATUS_INVALID_ACCOUNT_NAME", NT_STATUS_INVALID_ACCOUNT_NAME}, + {"NT_STATUS_USER_EXISTS", NT_STATUS_USER_EXISTS}, + {"NT_STATUS_NO_SUCH_USER", NT_STATUS_NO_SUCH_USER}, + {"NT_STATUS_GROUP_EXISTS", NT_STATUS_GROUP_EXISTS}, + {"NT_STATUS_NO_SUCH_GROUP", NT_STATUS_NO_SUCH_GROUP}, + {"NT_STATUS_MEMBER_IN_GROUP", NT_STATUS_MEMBER_IN_GROUP}, + {"NT_STATUS_MEMBER_NOT_IN_GROUP", NT_STATUS_MEMBER_NOT_IN_GROUP}, + {"NT_STATUS_LAST_ADMIN", NT_STATUS_LAST_ADMIN}, + {"NT_STATUS_WRONG_PASSWORD", NT_STATUS_WRONG_PASSWORD}, + {"NT_STATUS_ILL_FORMED_PASSWORD", NT_STATUS_ILL_FORMED_PASSWORD}, + {"NT_STATUS_PASSWORD_RESTRICTION", NT_STATUS_PASSWORD_RESTRICTION}, + {"NT_STATUS_LOGON_FAILURE", NT_STATUS_LOGON_FAILURE}, + {"NT_STATUS_ACCOUNT_RESTRICTION", NT_STATUS_ACCOUNT_RESTRICTION}, + {"NT_STATUS_INVALID_LOGON_HOURS", NT_STATUS_INVALID_LOGON_HOURS}, + {"NT_STATUS_INVALID_WORKSTATION", NT_STATUS_INVALID_WORKSTATION}, + {"NT_STATUS_PASSWORD_EXPIRED", NT_STATUS_PASSWORD_EXPIRED}, + {"NT_STATUS_ACCOUNT_DISABLED", NT_STATUS_ACCOUNT_DISABLED}, + {"NT_STATUS_NONE_MAPPED", NT_STATUS_NONE_MAPPED}, + {"NT_STATUS_TOO_MANY_LUIDS_REQUESTED", + NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, + {"NT_STATUS_LUIDS_EXHAUSTED", NT_STATUS_LUIDS_EXHAUSTED}, + {"NT_STATUS_INVALID_SUB_AUTHORITY", + NT_STATUS_INVALID_SUB_AUTHORITY}, + {"NT_STATUS_INVALID_ACL", NT_STATUS_INVALID_ACL}, + {"NT_STATUS_INVALID_SID", NT_STATUS_INVALID_SID}, + {"NT_STATUS_INVALID_SECURITY_DESCR", + NT_STATUS_INVALID_SECURITY_DESCR}, + {"NT_STATUS_PROCEDURE_NOT_FOUND", NT_STATUS_PROCEDURE_NOT_FOUND}, + {"NT_STATUS_INVALID_IMAGE_FORMAT", NT_STATUS_INVALID_IMAGE_FORMAT}, + {"NT_STATUS_NO_TOKEN", NT_STATUS_NO_TOKEN}, + {"NT_STATUS_BAD_INHERITANCE_ACL", NT_STATUS_BAD_INHERITANCE_ACL}, + {"NT_STATUS_RANGE_NOT_LOCKED", NT_STATUS_RANGE_NOT_LOCKED}, + {"NT_STATUS_DISK_FULL", NT_STATUS_DISK_FULL}, + {"NT_STATUS_SERVER_DISABLED", NT_STATUS_SERVER_DISABLED}, + {"NT_STATUS_SERVER_NOT_DISABLED", NT_STATUS_SERVER_NOT_DISABLED}, + {"NT_STATUS_TOO_MANY_GUIDS_REQUESTED", + NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, + {"NT_STATUS_GUIDS_EXHAUSTED", NT_STATUS_GUIDS_EXHAUSTED}, + {"NT_STATUS_INVALID_ID_AUTHORITY", NT_STATUS_INVALID_ID_AUTHORITY}, + {"NT_STATUS_AGENTS_EXHAUSTED", NT_STATUS_AGENTS_EXHAUSTED}, + {"NT_STATUS_INVALID_VOLUME_LABEL", NT_STATUS_INVALID_VOLUME_LABEL}, + {"NT_STATUS_SECTION_NOT_EXTENDED", NT_STATUS_SECTION_NOT_EXTENDED}, + {"NT_STATUS_NOT_MAPPED_DATA", NT_STATUS_NOT_MAPPED_DATA}, + {"NT_STATUS_RESOURCE_DATA_NOT_FOUND", + NT_STATUS_RESOURCE_DATA_NOT_FOUND}, + {"NT_STATUS_RESOURCE_TYPE_NOT_FOUND", + NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, + {"NT_STATUS_RESOURCE_NAME_NOT_FOUND", + NT_STATUS_RESOURCE_NAME_NOT_FOUND}, + {"NT_STATUS_ARRAY_BOUNDS_EXCEEDED", + NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, + {"NT_STATUS_FLOAT_DENORMAL_OPERAND", + NT_STATUS_FLOAT_DENORMAL_OPERAND}, + {"NT_STATUS_FLOAT_DIVIDE_BY_ZERO", NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, + {"NT_STATUS_FLOAT_INEXACT_RESULT", NT_STATUS_FLOAT_INEXACT_RESULT}, + {"NT_STATUS_FLOAT_INVALID_OPERATION", + NT_STATUS_FLOAT_INVALID_OPERATION}, + {"NT_STATUS_FLOAT_OVERFLOW", NT_STATUS_FLOAT_OVERFLOW}, + {"NT_STATUS_FLOAT_STACK_CHECK", NT_STATUS_FLOAT_STACK_CHECK}, + {"NT_STATUS_FLOAT_UNDERFLOW", NT_STATUS_FLOAT_UNDERFLOW}, + {"NT_STATUS_INTEGER_DIVIDE_BY_ZERO", + NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, + {"NT_STATUS_INTEGER_OVERFLOW", NT_STATUS_INTEGER_OVERFLOW}, + {"NT_STATUS_PRIVILEGED_INSTRUCTION", + NT_STATUS_PRIVILEGED_INSTRUCTION}, + {"NT_STATUS_TOO_MANY_PAGING_FILES", + NT_STATUS_TOO_MANY_PAGING_FILES}, + {"NT_STATUS_FILE_INVALID", NT_STATUS_FILE_INVALID}, + {"NT_STATUS_ALLOTTED_SPACE_EXCEEDED", + NT_STATUS_ALLOTTED_SPACE_EXCEEDED}, + {"NT_STATUS_INSUFFICIENT_RESOURCES", + NT_STATUS_INSUFFICIENT_RESOURCES}, + {"NT_STATUS_DFS_EXIT_PATH_FOUND", NT_STATUS_DFS_EXIT_PATH_FOUND}, + {"NT_STATUS_DEVICE_DATA_ERROR", NT_STATUS_DEVICE_DATA_ERROR}, + {"NT_STATUS_DEVICE_NOT_CONNECTED", NT_STATUS_DEVICE_NOT_CONNECTED}, + {"NT_STATUS_DEVICE_POWER_FAILURE", NT_STATUS_DEVICE_POWER_FAILURE}, + {"NT_STATUS_FREE_VM_NOT_AT_BASE", NT_STATUS_FREE_VM_NOT_AT_BASE}, + {"NT_STATUS_MEMORY_NOT_ALLOCATED", NT_STATUS_MEMORY_NOT_ALLOCATED}, + {"NT_STATUS_WORKING_SET_QUOTA", NT_STATUS_WORKING_SET_QUOTA}, + {"NT_STATUS_MEDIA_WRITE_PROTECTED", + NT_STATUS_MEDIA_WRITE_PROTECTED}, + {"NT_STATUS_DEVICE_NOT_READY", NT_STATUS_DEVICE_NOT_READY}, + {"NT_STATUS_INVALID_GROUP_ATTRIBUTES", + NT_STATUS_INVALID_GROUP_ATTRIBUTES}, + {"NT_STATUS_BAD_IMPERSONATION_LEVEL", + NT_STATUS_BAD_IMPERSONATION_LEVEL}, + {"NT_STATUS_CANT_OPEN_ANONYMOUS", NT_STATUS_CANT_OPEN_ANONYMOUS}, + {"NT_STATUS_BAD_VALIDATION_CLASS", NT_STATUS_BAD_VALIDATION_CLASS}, + {"NT_STATUS_BAD_TOKEN_TYPE", NT_STATUS_BAD_TOKEN_TYPE}, + {"NT_STATUS_BAD_MASTER_BOOT_RECORD", + NT_STATUS_BAD_MASTER_BOOT_RECORD}, + {"NT_STATUS_INSTRUCTION_MISALIGNMENT", + NT_STATUS_INSTRUCTION_MISALIGNMENT}, + {"NT_STATUS_INSTANCE_NOT_AVAILABLE", + NT_STATUS_INSTANCE_NOT_AVAILABLE}, + {"NT_STATUS_PIPE_NOT_AVAILABLE", NT_STATUS_PIPE_NOT_AVAILABLE}, + {"NT_STATUS_INVALID_PIPE_STATE", NT_STATUS_INVALID_PIPE_STATE}, + {"NT_STATUS_PIPE_BUSY", NT_STATUS_PIPE_BUSY}, + {"NT_STATUS_ILLEGAL_FUNCTION", NT_STATUS_ILLEGAL_FUNCTION}, + {"NT_STATUS_PIPE_DISCONNECTED", NT_STATUS_PIPE_DISCONNECTED}, + {"NT_STATUS_PIPE_CLOSING", NT_STATUS_PIPE_CLOSING}, + {"NT_STATUS_PIPE_CONNECTED", NT_STATUS_PIPE_CONNECTED}, + {"NT_STATUS_PIPE_LISTENING", NT_STATUS_PIPE_LISTENING}, + {"NT_STATUS_INVALID_READ_MODE", NT_STATUS_INVALID_READ_MODE}, + {"NT_STATUS_IO_TIMEOUT", NT_STATUS_IO_TIMEOUT}, + {"NT_STATUS_FILE_FORCED_CLOSED", NT_STATUS_FILE_FORCED_CLOSED}, + {"NT_STATUS_PROFILING_NOT_STARTED", + NT_STATUS_PROFILING_NOT_STARTED}, + {"NT_STATUS_PROFILING_NOT_STOPPED", + NT_STATUS_PROFILING_NOT_STOPPED}, + {"NT_STATUS_COULD_NOT_INTERPRET", NT_STATUS_COULD_NOT_INTERPRET}, + {"NT_STATUS_FILE_IS_A_DIRECTORY", NT_STATUS_FILE_IS_A_DIRECTORY}, + {"NT_STATUS_NOT_SUPPORTED", NT_STATUS_NOT_SUPPORTED}, + {"NT_STATUS_REMOTE_NOT_LISTENING", NT_STATUS_REMOTE_NOT_LISTENING}, + {"NT_STATUS_DUPLICATE_NAME", NT_STATUS_DUPLICATE_NAME}, + {"NT_STATUS_BAD_NETWORK_PATH", NT_STATUS_BAD_NETWORK_PATH}, + {"NT_STATUS_NETWORK_BUSY", NT_STATUS_NETWORK_BUSY}, + {"NT_STATUS_DEVICE_DOES_NOT_EXIST", + NT_STATUS_DEVICE_DOES_NOT_EXIST}, + {"NT_STATUS_TOO_MANY_COMMANDS", NT_STATUS_TOO_MANY_COMMANDS}, + {"NT_STATUS_ADAPTER_HARDWARE_ERROR", + NT_STATUS_ADAPTER_HARDWARE_ERROR}, + {"NT_STATUS_INVALID_NETWORK_RESPONSE", + NT_STATUS_INVALID_NETWORK_RESPONSE}, + {"NT_STATUS_UNEXPECTED_NETWORK_ERROR", + NT_STATUS_UNEXPECTED_NETWORK_ERROR}, + {"NT_STATUS_BAD_REMOTE_ADAPTER", NT_STATUS_BAD_REMOTE_ADAPTER}, + {"NT_STATUS_PRINT_QUEUE_FULL", NT_STATUS_PRINT_QUEUE_FULL}, + {"NT_STATUS_NO_SPOOL_SPACE", NT_STATUS_NO_SPOOL_SPACE}, + {"NT_STATUS_PRINT_CANCELLED", NT_STATUS_PRINT_CANCELLED}, + {"NT_STATUS_NETWORK_NAME_DELETED", NT_STATUS_NETWORK_NAME_DELETED}, + {"NT_STATUS_NETWORK_ACCESS_DENIED", + NT_STATUS_NETWORK_ACCESS_DENIED}, + {"NT_STATUS_BAD_DEVICE_TYPE", NT_STATUS_BAD_DEVICE_TYPE}, + {"NT_STATUS_BAD_NETWORK_NAME", NT_STATUS_BAD_NETWORK_NAME}, + {"NT_STATUS_TOO_MANY_NAMES", NT_STATUS_TOO_MANY_NAMES}, + {"NT_STATUS_TOO_MANY_SESSIONS", NT_STATUS_TOO_MANY_SESSIONS}, + {"NT_STATUS_SHARING_PAUSED", NT_STATUS_SHARING_PAUSED}, + {"NT_STATUS_REQUEST_NOT_ACCEPTED", NT_STATUS_REQUEST_NOT_ACCEPTED}, + {"NT_STATUS_REDIRECTOR_PAUSED", NT_STATUS_REDIRECTOR_PAUSED}, + {"NT_STATUS_NET_WRITE_FAULT", NT_STATUS_NET_WRITE_FAULT}, + {"NT_STATUS_PROFILING_AT_LIMIT", NT_STATUS_PROFILING_AT_LIMIT}, + {"NT_STATUS_NOT_SAME_DEVICE", NT_STATUS_NOT_SAME_DEVICE}, + {"NT_STATUS_FILE_RENAMED", NT_STATUS_FILE_RENAMED}, + {"NT_STATUS_VIRTUAL_CIRCUIT_CLOSED", + NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, + {"NT_STATUS_NO_SECURITY_ON_OBJECT", + NT_STATUS_NO_SECURITY_ON_OBJECT}, + {"NT_STATUS_CANT_WAIT", NT_STATUS_CANT_WAIT}, + {"NT_STATUS_PIPE_EMPTY", NT_STATUS_PIPE_EMPTY}, + {"NT_STATUS_CANT_ACCESS_DOMAIN_INFO", + NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, + {"NT_STATUS_CANT_TERMINATE_SELF", NT_STATUS_CANT_TERMINATE_SELF}, + {"NT_STATUS_INVALID_SERVER_STATE", NT_STATUS_INVALID_SERVER_STATE}, + {"NT_STATUS_INVALID_DOMAIN_STATE", NT_STATUS_INVALID_DOMAIN_STATE}, + {"NT_STATUS_INVALID_DOMAIN_ROLE", NT_STATUS_INVALID_DOMAIN_ROLE}, + {"NT_STATUS_NO_SUCH_DOMAIN", NT_STATUS_NO_SUCH_DOMAIN}, + {"NT_STATUS_DOMAIN_EXISTS", NT_STATUS_DOMAIN_EXISTS}, + {"NT_STATUS_DOMAIN_LIMIT_EXCEEDED", + NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, + {"NT_STATUS_OPLOCK_NOT_GRANTED", NT_STATUS_OPLOCK_NOT_GRANTED}, + {"NT_STATUS_INVALID_OPLOCK_PROTOCOL", + NT_STATUS_INVALID_OPLOCK_PROTOCOL}, + {"NT_STATUS_INTERNAL_DB_CORRUPTION", + NT_STATUS_INTERNAL_DB_CORRUPTION}, + {"NT_STATUS_INTERNAL_ERROR", NT_STATUS_INTERNAL_ERROR}, + {"NT_STATUS_GENERIC_NOT_MAPPED", NT_STATUS_GENERIC_NOT_MAPPED}, + {"NT_STATUS_BAD_DESCRIPTOR_FORMAT", + NT_STATUS_BAD_DESCRIPTOR_FORMAT}, + {"NT_STATUS_INVALID_USER_BUFFER", NT_STATUS_INVALID_USER_BUFFER}, + {"NT_STATUS_UNEXPECTED_IO_ERROR", NT_STATUS_UNEXPECTED_IO_ERROR}, + {"NT_STATUS_UNEXPECTED_MM_CREATE_ERR", + NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, + {"NT_STATUS_UNEXPECTED_MM_MAP_ERROR", + NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, + {"NT_STATUS_UNEXPECTED_MM_EXTEND_ERR", + NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, + {"NT_STATUS_NOT_LOGON_PROCESS", NT_STATUS_NOT_LOGON_PROCESS}, + {"NT_STATUS_LOGON_SESSION_EXISTS", NT_STATUS_LOGON_SESSION_EXISTS}, + {"NT_STATUS_INVALID_PARAMETER_1", NT_STATUS_INVALID_PARAMETER_1}, + {"NT_STATUS_INVALID_PARAMETER_2", NT_STATUS_INVALID_PARAMETER_2}, + {"NT_STATUS_INVALID_PARAMETER_3", NT_STATUS_INVALID_PARAMETER_3}, + {"NT_STATUS_INVALID_PARAMETER_4", NT_STATUS_INVALID_PARAMETER_4}, + {"NT_STATUS_INVALID_PARAMETER_5", NT_STATUS_INVALID_PARAMETER_5}, + {"NT_STATUS_INVALID_PARAMETER_6", NT_STATUS_INVALID_PARAMETER_6}, + {"NT_STATUS_INVALID_PARAMETER_7", NT_STATUS_INVALID_PARAMETER_7}, + {"NT_STATUS_INVALID_PARAMETER_8", NT_STATUS_INVALID_PARAMETER_8}, + {"NT_STATUS_INVALID_PARAMETER_9", NT_STATUS_INVALID_PARAMETER_9}, + {"NT_STATUS_INVALID_PARAMETER_10", NT_STATUS_INVALID_PARAMETER_10}, + {"NT_STATUS_INVALID_PARAMETER_11", NT_STATUS_INVALID_PARAMETER_11}, + {"NT_STATUS_INVALID_PARAMETER_12", NT_STATUS_INVALID_PARAMETER_12}, + {"NT_STATUS_REDIRECTOR_NOT_STARTED", + NT_STATUS_REDIRECTOR_NOT_STARTED}, + {"NT_STATUS_REDIRECTOR_STARTED", NT_STATUS_REDIRECTOR_STARTED}, + {"NT_STATUS_STACK_OVERFLOW", NT_STATUS_STACK_OVERFLOW}, + {"NT_STATUS_NO_SUCH_PACKAGE", NT_STATUS_NO_SUCH_PACKAGE}, + {"NT_STATUS_BAD_FUNCTION_TABLE", NT_STATUS_BAD_FUNCTION_TABLE}, + {"NT_STATUS_DIRECTORY_NOT_EMPTY", NT_STATUS_DIRECTORY_NOT_EMPTY}, + {"NT_STATUS_FILE_CORRUPT_ERROR", NT_STATUS_FILE_CORRUPT_ERROR}, + {"NT_STATUS_NOT_A_DIRECTORY", NT_STATUS_NOT_A_DIRECTORY}, + {"NT_STATUS_BAD_LOGON_SESSION_STATE", + NT_STATUS_BAD_LOGON_SESSION_STATE}, + {"NT_STATUS_LOGON_SESSION_COLLISION", + NT_STATUS_LOGON_SESSION_COLLISION}, + {"NT_STATUS_NAME_TOO_LONG", NT_STATUS_NAME_TOO_LONG}, + {"NT_STATUS_FILES_OPEN", NT_STATUS_FILES_OPEN}, + {"NT_STATUS_CONNECTION_IN_USE", NT_STATUS_CONNECTION_IN_USE}, + {"NT_STATUS_MESSAGE_NOT_FOUND", NT_STATUS_MESSAGE_NOT_FOUND}, + {"NT_STATUS_PROCESS_IS_TERMINATING", + NT_STATUS_PROCESS_IS_TERMINATING}, + {"NT_STATUS_INVALID_LOGON_TYPE", NT_STATUS_INVALID_LOGON_TYPE}, + {"NT_STATUS_NO_GUID_TRANSLATION", NT_STATUS_NO_GUID_TRANSLATION}, + {"NT_STATUS_CANNOT_IMPERSONATE", NT_STATUS_CANNOT_IMPERSONATE}, + {"NT_STATUS_IMAGE_ALREADY_LOADED", NT_STATUS_IMAGE_ALREADY_LOADED}, + {"NT_STATUS_ABIOS_NOT_PRESENT", NT_STATUS_ABIOS_NOT_PRESENT}, + {"NT_STATUS_ABIOS_LID_NOT_EXIST", NT_STATUS_ABIOS_LID_NOT_EXIST}, + {"NT_STATUS_ABIOS_LID_ALREADY_OWNED", + NT_STATUS_ABIOS_LID_ALREADY_OWNED}, + {"NT_STATUS_ABIOS_NOT_LID_OWNER", NT_STATUS_ABIOS_NOT_LID_OWNER}, + {"NT_STATUS_ABIOS_INVALID_COMMAND", + NT_STATUS_ABIOS_INVALID_COMMAND}, + {"NT_STATUS_ABIOS_INVALID_LID", NT_STATUS_ABIOS_INVALID_LID}, + {"NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE", + NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, + {"NT_STATUS_ABIOS_INVALID_SELECTOR", + NT_STATUS_ABIOS_INVALID_SELECTOR}, + {"NT_STATUS_NO_LDT", NT_STATUS_NO_LDT}, + {"NT_STATUS_INVALID_LDT_SIZE", NT_STATUS_INVALID_LDT_SIZE}, + {"NT_STATUS_INVALID_LDT_OFFSET", NT_STATUS_INVALID_LDT_OFFSET}, + {"NT_STATUS_INVALID_LDT_DESCRIPTOR", + NT_STATUS_INVALID_LDT_DESCRIPTOR}, + {"NT_STATUS_INVALID_IMAGE_NE_FORMAT", + NT_STATUS_INVALID_IMAGE_NE_FORMAT}, + {"NT_STATUS_RXACT_INVALID_STATE", NT_STATUS_RXACT_INVALID_STATE}, + {"NT_STATUS_RXACT_COMMIT_FAILURE", NT_STATUS_RXACT_COMMIT_FAILURE}, + {"NT_STATUS_MAPPED_FILE_SIZE_ZERO", + NT_STATUS_MAPPED_FILE_SIZE_ZERO}, + {"NT_STATUS_TOO_MANY_OPENED_FILES", + NT_STATUS_TOO_MANY_OPENED_FILES}, + {"NT_STATUS_CANCELLED", NT_STATUS_CANCELLED}, + {"NT_STATUS_CANNOT_DELETE", NT_STATUS_CANNOT_DELETE}, + {"NT_STATUS_INVALID_COMPUTER_NAME", + NT_STATUS_INVALID_COMPUTER_NAME}, + {"NT_STATUS_FILE_DELETED", NT_STATUS_FILE_DELETED}, + {"NT_STATUS_SPECIAL_ACCOUNT", NT_STATUS_SPECIAL_ACCOUNT}, + {"NT_STATUS_SPECIAL_GROUP", NT_STATUS_SPECIAL_GROUP}, + {"NT_STATUS_SPECIAL_USER", NT_STATUS_SPECIAL_USER}, + {"NT_STATUS_MEMBERS_PRIMARY_GROUP", + NT_STATUS_MEMBERS_PRIMARY_GROUP}, + {"NT_STATUS_FILE_CLOSED", NT_STATUS_FILE_CLOSED}, + {"NT_STATUS_TOO_MANY_THREADS", NT_STATUS_TOO_MANY_THREADS}, + {"NT_STATUS_THREAD_NOT_IN_PROCESS", + NT_STATUS_THREAD_NOT_IN_PROCESS}, + {"NT_STATUS_TOKEN_ALREADY_IN_USE", NT_STATUS_TOKEN_ALREADY_IN_USE}, + {"NT_STATUS_PAGEFILE_QUOTA_EXCEEDED", + NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, + {"NT_STATUS_COMMITMENT_LIMIT", NT_STATUS_COMMITMENT_LIMIT}, + {"NT_STATUS_INVALID_IMAGE_LE_FORMAT", + NT_STATUS_INVALID_IMAGE_LE_FORMAT}, + {"NT_STATUS_INVALID_IMAGE_NOT_MZ", NT_STATUS_INVALID_IMAGE_NOT_MZ}, + {"NT_STATUS_INVALID_IMAGE_PROTECT", + NT_STATUS_INVALID_IMAGE_PROTECT}, + {"NT_STATUS_INVALID_IMAGE_WIN_16", NT_STATUS_INVALID_IMAGE_WIN_16}, + {"NT_STATUS_LOGON_SERVER_CONFLICT", + NT_STATUS_LOGON_SERVER_CONFLICT}, + {"NT_STATUS_TIME_DIFFERENCE_AT_DC", + NT_STATUS_TIME_DIFFERENCE_AT_DC}, + {"NT_STATUS_SYNCHRONIZATION_REQUIRED", + NT_STATUS_SYNCHRONIZATION_REQUIRED}, + {"NT_STATUS_DLL_NOT_FOUND", NT_STATUS_DLL_NOT_FOUND}, + {"NT_STATUS_OPEN_FAILED", NT_STATUS_OPEN_FAILED}, + {"NT_STATUS_IO_PRIVILEGE_FAILED", NT_STATUS_IO_PRIVILEGE_FAILED}, + {"NT_STATUS_ORDINAL_NOT_FOUND", NT_STATUS_ORDINAL_NOT_FOUND}, + {"NT_STATUS_ENTRYPOINT_NOT_FOUND", NT_STATUS_ENTRYPOINT_NOT_FOUND}, + {"NT_STATUS_CONTROL_C_EXIT", NT_STATUS_CONTROL_C_EXIT}, + {"NT_STATUS_LOCAL_DISCONNECT", NT_STATUS_LOCAL_DISCONNECT}, + {"NT_STATUS_REMOTE_DISCONNECT", NT_STATUS_REMOTE_DISCONNECT}, + {"NT_STATUS_REMOTE_RESOURCES", NT_STATUS_REMOTE_RESOURCES}, + {"NT_STATUS_LINK_FAILED", NT_STATUS_LINK_FAILED}, + {"NT_STATUS_LINK_TIMEOUT", NT_STATUS_LINK_TIMEOUT}, + {"NT_STATUS_INVALID_CONNECTION", NT_STATUS_INVALID_CONNECTION}, + {"NT_STATUS_INVALID_ADDRESS", NT_STATUS_INVALID_ADDRESS}, + {"NT_STATUS_DLL_INIT_FAILED", NT_STATUS_DLL_INIT_FAILED}, + {"NT_STATUS_MISSING_SYSTEMFILE", NT_STATUS_MISSING_SYSTEMFILE}, + {"NT_STATUS_UNHANDLED_EXCEPTION", NT_STATUS_UNHANDLED_EXCEPTION}, + {"NT_STATUS_APP_INIT_FAILURE", NT_STATUS_APP_INIT_FAILURE}, + {"NT_STATUS_PAGEFILE_CREATE_FAILED", + NT_STATUS_PAGEFILE_CREATE_FAILED}, + {"NT_STATUS_NO_PAGEFILE", NT_STATUS_NO_PAGEFILE}, + {"NT_STATUS_INVALID_LEVEL", NT_STATUS_INVALID_LEVEL}, + {"NT_STATUS_WRONG_PASSWORD_CORE", NT_STATUS_WRONG_PASSWORD_CORE}, + {"NT_STATUS_ILLEGAL_FLOAT_CONTEXT", + NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, + {"NT_STATUS_PIPE_BROKEN", NT_STATUS_PIPE_BROKEN}, + {"NT_STATUS_REGISTRY_CORRUPT", NT_STATUS_REGISTRY_CORRUPT}, + {"NT_STATUS_REGISTRY_IO_FAILED", NT_STATUS_REGISTRY_IO_FAILED}, + {"NT_STATUS_NO_EVENT_PAIR", NT_STATUS_NO_EVENT_PAIR}, + {"NT_STATUS_UNRECOGNIZED_VOLUME", NT_STATUS_UNRECOGNIZED_VOLUME}, + {"NT_STATUS_SERIAL_NO_DEVICE_INITED", + NT_STATUS_SERIAL_NO_DEVICE_INITED}, + {"NT_STATUS_NO_SUCH_ALIAS", NT_STATUS_NO_SUCH_ALIAS}, + {"NT_STATUS_MEMBER_NOT_IN_ALIAS", NT_STATUS_MEMBER_NOT_IN_ALIAS}, + {"NT_STATUS_MEMBER_IN_ALIAS", NT_STATUS_MEMBER_IN_ALIAS}, + {"NT_STATUS_ALIAS_EXISTS", NT_STATUS_ALIAS_EXISTS}, + {"NT_STATUS_LOGON_NOT_GRANTED", NT_STATUS_LOGON_NOT_GRANTED}, + {"NT_STATUS_TOO_MANY_SECRETS", NT_STATUS_TOO_MANY_SECRETS}, + {"NT_STATUS_SECRET_TOO_LONG", NT_STATUS_SECRET_TOO_LONG}, + {"NT_STATUS_INTERNAL_DB_ERROR", NT_STATUS_INTERNAL_DB_ERROR}, + {"NT_STATUS_FULLSCREEN_MODE", NT_STATUS_FULLSCREEN_MODE}, + {"NT_STATUS_TOO_MANY_CONTEXT_IDS", NT_STATUS_TOO_MANY_CONTEXT_IDS}, + {"NT_STATUS_LOGON_TYPE_NOT_GRANTED", + NT_STATUS_LOGON_TYPE_NOT_GRANTED}, + {"NT_STATUS_NOT_REGISTRY_FILE", NT_STATUS_NOT_REGISTRY_FILE}, + {"NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED", + NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, + {"NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR", + NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, + {"NT_STATUS_FT_MISSING_MEMBER", NT_STATUS_FT_MISSING_MEMBER}, + {"NT_STATUS_ILL_FORMED_SERVICE_ENTRY", + NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, + {"NT_STATUS_ILLEGAL_CHARACTER", NT_STATUS_ILLEGAL_CHARACTER}, + {"NT_STATUS_UNMAPPABLE_CHARACTER", NT_STATUS_UNMAPPABLE_CHARACTER}, + {"NT_STATUS_UNDEFINED_CHARACTER", NT_STATUS_UNDEFINED_CHARACTER}, + {"NT_STATUS_FLOPPY_VOLUME", NT_STATUS_FLOPPY_VOLUME}, + {"NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND", + NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, + {"NT_STATUS_FLOPPY_WRONG_CYLINDER", + NT_STATUS_FLOPPY_WRONG_CYLINDER}, + {"NT_STATUS_FLOPPY_UNKNOWN_ERROR", NT_STATUS_FLOPPY_UNKNOWN_ERROR}, + {"NT_STATUS_FLOPPY_BAD_REGISTERS", NT_STATUS_FLOPPY_BAD_REGISTERS}, + {"NT_STATUS_DISK_RECALIBRATE_FAILED", + NT_STATUS_DISK_RECALIBRATE_FAILED}, + {"NT_STATUS_DISK_OPERATION_FAILED", + NT_STATUS_DISK_OPERATION_FAILED}, + {"NT_STATUS_DISK_RESET_FAILED", NT_STATUS_DISK_RESET_FAILED}, + {"NT_STATUS_SHARED_IRQ_BUSY", NT_STATUS_SHARED_IRQ_BUSY}, + {"NT_STATUS_FT_ORPHANING", NT_STATUS_FT_ORPHANING}, + {"NT_STATUS_PARTITION_FAILURE", NT_STATUS_PARTITION_FAILURE}, + {"NT_STATUS_INVALID_BLOCK_LENGTH", NT_STATUS_INVALID_BLOCK_LENGTH}, + {"NT_STATUS_DEVICE_NOT_PARTITIONED", + NT_STATUS_DEVICE_NOT_PARTITIONED}, + {"NT_STATUS_UNABLE_TO_LOCK_MEDIA", NT_STATUS_UNABLE_TO_LOCK_MEDIA}, + {"NT_STATUS_UNABLE_TO_UNLOAD_MEDIA", + NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, + {"NT_STATUS_EOM_OVERFLOW", NT_STATUS_EOM_OVERFLOW}, + {"NT_STATUS_NO_MEDIA", NT_STATUS_NO_MEDIA}, + {"NT_STATUS_NO_SUCH_MEMBER", NT_STATUS_NO_SUCH_MEMBER}, + {"NT_STATUS_INVALID_MEMBER", NT_STATUS_INVALID_MEMBER}, + {"NT_STATUS_KEY_DELETED", NT_STATUS_KEY_DELETED}, + {"NT_STATUS_NO_LOG_SPACE", NT_STATUS_NO_LOG_SPACE}, + {"NT_STATUS_TOO_MANY_SIDS", NT_STATUS_TOO_MANY_SIDS}, + {"NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED", + NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, + {"NT_STATUS_KEY_HAS_CHILDREN", NT_STATUS_KEY_HAS_CHILDREN}, + {"NT_STATUS_CHILD_MUST_BE_VOLATILE", + NT_STATUS_CHILD_MUST_BE_VOLATILE}, + {"NT_STATUS_DEVICE_CONFIGURATION_ERROR", + NT_STATUS_DEVICE_CONFIGURATION_ERROR}, + {"NT_STATUS_DRIVER_INTERNAL_ERROR", + NT_STATUS_DRIVER_INTERNAL_ERROR}, + {"NT_STATUS_INVALID_DEVICE_STATE", NT_STATUS_INVALID_DEVICE_STATE}, + {"NT_STATUS_IO_DEVICE_ERROR", NT_STATUS_IO_DEVICE_ERROR}, + {"NT_STATUS_DEVICE_PROTOCOL_ERROR", + NT_STATUS_DEVICE_PROTOCOL_ERROR}, + {"NT_STATUS_BACKUP_CONTROLLER", NT_STATUS_BACKUP_CONTROLLER}, + {"NT_STATUS_LOG_FILE_FULL", NT_STATUS_LOG_FILE_FULL}, + {"NT_STATUS_TOO_LATE", NT_STATUS_TOO_LATE}, + {"NT_STATUS_NO_TRUST_LSA_SECRET", NT_STATUS_NO_TRUST_LSA_SECRET}, + {"NT_STATUS_NO_TRUST_SAM_ACCOUNT", NT_STATUS_NO_TRUST_SAM_ACCOUNT}, + {"NT_STATUS_TRUSTED_DOMAIN_FAILURE", + NT_STATUS_TRUSTED_DOMAIN_FAILURE}, + {"NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE", + NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, + {"NT_STATUS_EVENTLOG_FILE_CORRUPT", + NT_STATUS_EVENTLOG_FILE_CORRUPT}, + {"NT_STATUS_EVENTLOG_CANT_START", NT_STATUS_EVENTLOG_CANT_START}, + {"NT_STATUS_TRUST_FAILURE", NT_STATUS_TRUST_FAILURE}, + {"NT_STATUS_MUTANT_LIMIT_EXCEEDED", + NT_STATUS_MUTANT_LIMIT_EXCEEDED}, + {"NT_STATUS_NETLOGON_NOT_STARTED", NT_STATUS_NETLOGON_NOT_STARTED}, + {"NT_STATUS_ACCOUNT_EXPIRED", NT_STATUS_ACCOUNT_EXPIRED}, + {"NT_STATUS_POSSIBLE_DEADLOCK", NT_STATUS_POSSIBLE_DEADLOCK}, + {"NT_STATUS_NETWORK_CREDENTIAL_CONFLICT", + NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, + {"NT_STATUS_REMOTE_SESSION_LIMIT", NT_STATUS_REMOTE_SESSION_LIMIT}, + {"NT_STATUS_EVENTLOG_FILE_CHANGED", + NT_STATUS_EVENTLOG_FILE_CHANGED}, + {"NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, + {"NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, + {"NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT}, + {"NT_STATUS_DOMAIN_TRUST_INCONSISTENT", + NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, + {"NT_STATUS_FS_DRIVER_REQUIRED", NT_STATUS_FS_DRIVER_REQUIRED}, + {"NT_STATUS_NO_USER_SESSION_KEY", NT_STATUS_NO_USER_SESSION_KEY}, + {"NT_STATUS_USER_SESSION_DELETED", NT_STATUS_USER_SESSION_DELETED}, + {"NT_STATUS_RESOURCE_LANG_NOT_FOUND", + NT_STATUS_RESOURCE_LANG_NOT_FOUND}, + {"NT_STATUS_INSUFF_SERVER_RESOURCES", + NT_STATUS_INSUFF_SERVER_RESOURCES}, + {"NT_STATUS_INVALID_BUFFER_SIZE", NT_STATUS_INVALID_BUFFER_SIZE}, + {"NT_STATUS_INVALID_ADDRESS_COMPONENT", + NT_STATUS_INVALID_ADDRESS_COMPONENT}, + {"NT_STATUS_INVALID_ADDRESS_WILDCARD", + NT_STATUS_INVALID_ADDRESS_WILDCARD}, + {"NT_STATUS_TOO_MANY_ADDRESSES", NT_STATUS_TOO_MANY_ADDRESSES}, + {"NT_STATUS_ADDRESS_ALREADY_EXISTS", + NT_STATUS_ADDRESS_ALREADY_EXISTS}, + {"NT_STATUS_ADDRESS_CLOSED", NT_STATUS_ADDRESS_CLOSED}, + {"NT_STATUS_CONNECTION_DISCONNECTED", + NT_STATUS_CONNECTION_DISCONNECTED}, + {"NT_STATUS_CONNECTION_RESET", NT_STATUS_CONNECTION_RESET}, + {"NT_STATUS_TOO_MANY_NODES", NT_STATUS_TOO_MANY_NODES}, + {"NT_STATUS_TRANSACTION_ABORTED", NT_STATUS_TRANSACTION_ABORTED}, + {"NT_STATUS_TRANSACTION_TIMED_OUT", + NT_STATUS_TRANSACTION_TIMED_OUT}, + {"NT_STATUS_TRANSACTION_NO_RELEASE", + NT_STATUS_TRANSACTION_NO_RELEASE}, + {"NT_STATUS_TRANSACTION_NO_MATCH", NT_STATUS_TRANSACTION_NO_MATCH}, + {"NT_STATUS_TRANSACTION_RESPONDED", + NT_STATUS_TRANSACTION_RESPONDED}, + {"NT_STATUS_TRANSACTION_INVALID_ID", + NT_STATUS_TRANSACTION_INVALID_ID}, + {"NT_STATUS_TRANSACTION_INVALID_TYPE", + NT_STATUS_TRANSACTION_INVALID_TYPE}, + {"NT_STATUS_NOT_SERVER_SESSION", NT_STATUS_NOT_SERVER_SESSION}, + {"NT_STATUS_NOT_CLIENT_SESSION", NT_STATUS_NOT_CLIENT_SESSION}, + {"NT_STATUS_CANNOT_LOAD_REGISTRY_FILE", + NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, + {"NT_STATUS_DEBUG_ATTACH_FAILED", NT_STATUS_DEBUG_ATTACH_FAILED}, + {"NT_STATUS_SYSTEM_PROCESS_TERMINATED", + NT_STATUS_SYSTEM_PROCESS_TERMINATED}, + {"NT_STATUS_DATA_NOT_ACCEPTED", NT_STATUS_DATA_NOT_ACCEPTED}, + {"NT_STATUS_NO_BROWSER_SERVERS_FOUND", + NT_STATUS_NO_BROWSER_SERVERS_FOUND}, + {"NT_STATUS_VDM_HARD_ERROR", NT_STATUS_VDM_HARD_ERROR}, + {"NT_STATUS_DRIVER_CANCEL_TIMEOUT", + NT_STATUS_DRIVER_CANCEL_TIMEOUT}, + {"NT_STATUS_REPLY_MESSAGE_MISMATCH", + NT_STATUS_REPLY_MESSAGE_MISMATCH}, + {"NT_STATUS_MAPPED_ALIGNMENT", NT_STATUS_MAPPED_ALIGNMENT}, + {"NT_STATUS_IMAGE_CHECKSUM_MISMATCH", + NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, + {"NT_STATUS_LOST_WRITEBEHIND_DATA", + NT_STATUS_LOST_WRITEBEHIND_DATA}, + {"NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID", + NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, + {"NT_STATUS_PASSWORD_MUST_CHANGE", NT_STATUS_PASSWORD_MUST_CHANGE}, + {"NT_STATUS_NOT_FOUND", NT_STATUS_NOT_FOUND}, + {"NT_STATUS_NOT_TINY_STREAM", NT_STATUS_NOT_TINY_STREAM}, + {"NT_STATUS_RECOVERY_FAILURE", NT_STATUS_RECOVERY_FAILURE}, + {"NT_STATUS_STACK_OVERFLOW_READ", NT_STATUS_STACK_OVERFLOW_READ}, + {"NT_STATUS_FAIL_CHECK", NT_STATUS_FAIL_CHECK}, + {"NT_STATUS_DUPLICATE_OBJECTID", NT_STATUS_DUPLICATE_OBJECTID}, + {"NT_STATUS_OBJECTID_EXISTS", NT_STATUS_OBJECTID_EXISTS}, + {"NT_STATUS_CONVERT_TO_LARGE", NT_STATUS_CONVERT_TO_LARGE}, + {"NT_STATUS_RETRY", NT_STATUS_RETRY}, + {"NT_STATUS_FOUND_OUT_OF_SCOPE", NT_STATUS_FOUND_OUT_OF_SCOPE}, + {"NT_STATUS_ALLOCATE_BUCKET", NT_STATUS_ALLOCATE_BUCKET}, + {"NT_STATUS_PROPSET_NOT_FOUND", NT_STATUS_PROPSET_NOT_FOUND}, + {"NT_STATUS_MARSHALL_OVERFLOW", NT_STATUS_MARSHALL_OVERFLOW}, + {"NT_STATUS_INVALID_VARIANT", NT_STATUS_INVALID_VARIANT}, + {"NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND", + NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, + {"NT_STATUS_ACCOUNT_LOCKED_OUT", NT_STATUS_ACCOUNT_LOCKED_OUT}, + {"NT_STATUS_HANDLE_NOT_CLOSABLE", NT_STATUS_HANDLE_NOT_CLOSABLE}, + {"NT_STATUS_CONNECTION_REFUSED", NT_STATUS_CONNECTION_REFUSED}, + {"NT_STATUS_GRACEFUL_DISCONNECT", NT_STATUS_GRACEFUL_DISCONNECT}, + {"NT_STATUS_ADDRESS_ALREADY_ASSOCIATED", + NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, + {"NT_STATUS_ADDRESS_NOT_ASSOCIATED", + NT_STATUS_ADDRESS_NOT_ASSOCIATED}, + {"NT_STATUS_CONNECTION_INVALID", NT_STATUS_CONNECTION_INVALID}, + {"NT_STATUS_CONNECTION_ACTIVE", NT_STATUS_CONNECTION_ACTIVE}, + {"NT_STATUS_NETWORK_UNREACHABLE", NT_STATUS_NETWORK_UNREACHABLE}, + {"NT_STATUS_HOST_UNREACHABLE", NT_STATUS_HOST_UNREACHABLE}, + {"NT_STATUS_PROTOCOL_UNREACHABLE", NT_STATUS_PROTOCOL_UNREACHABLE}, + {"NT_STATUS_PORT_UNREACHABLE", NT_STATUS_PORT_UNREACHABLE}, + {"NT_STATUS_REQUEST_ABORTED", NT_STATUS_REQUEST_ABORTED}, + {"NT_STATUS_CONNECTION_ABORTED", NT_STATUS_CONNECTION_ABORTED}, + {"NT_STATUS_BAD_COMPRESSION_BUFFER", + NT_STATUS_BAD_COMPRESSION_BUFFER}, + {"NT_STATUS_USER_MAPPED_FILE", NT_STATUS_USER_MAPPED_FILE}, + {"NT_STATUS_AUDIT_FAILED", NT_STATUS_AUDIT_FAILED}, + {"NT_STATUS_TIMER_RESOLUTION_NOT_SET", + NT_STATUS_TIMER_RESOLUTION_NOT_SET}, + {"NT_STATUS_CONNECTION_COUNT_LIMIT", + NT_STATUS_CONNECTION_COUNT_LIMIT}, + {"NT_STATUS_LOGIN_TIME_RESTRICTION", + NT_STATUS_LOGIN_TIME_RESTRICTION}, + {"NT_STATUS_LOGIN_WKSTA_RESTRICTION", + NT_STATUS_LOGIN_WKSTA_RESTRICTION}, + {"NT_STATUS_IMAGE_MP_UP_MISMATCH", NT_STATUS_IMAGE_MP_UP_MISMATCH}, + {"NT_STATUS_INSUFFICIENT_LOGON_INFO", + NT_STATUS_INSUFFICIENT_LOGON_INFO}, + {"NT_STATUS_BAD_DLL_ENTRYPOINT", NT_STATUS_BAD_DLL_ENTRYPOINT}, + {"NT_STATUS_BAD_SERVICE_ENTRYPOINT", + NT_STATUS_BAD_SERVICE_ENTRYPOINT}, + {"NT_STATUS_LPC_REPLY_LOST", NT_STATUS_LPC_REPLY_LOST}, + {"NT_STATUS_IP_ADDRESS_CONFLICT1", NT_STATUS_IP_ADDRESS_CONFLICT1}, + {"NT_STATUS_IP_ADDRESS_CONFLICT2", NT_STATUS_IP_ADDRESS_CONFLICT2}, + {"NT_STATUS_REGISTRY_QUOTA_LIMIT", NT_STATUS_REGISTRY_QUOTA_LIMIT}, + {"NT_STATUS_PATH_NOT_COVERED", NT_STATUS_PATH_NOT_COVERED}, + {"NT_STATUS_NO_CALLBACK_ACTIVE", NT_STATUS_NO_CALLBACK_ACTIVE}, + {"NT_STATUS_LICENSE_QUOTA_EXCEEDED", + NT_STATUS_LICENSE_QUOTA_EXCEEDED}, + {"NT_STATUS_PWD_TOO_SHORT", NT_STATUS_PWD_TOO_SHORT}, + {"NT_STATUS_PWD_TOO_RECENT", NT_STATUS_PWD_TOO_RECENT}, + {"NT_STATUS_PWD_HISTORY_CONFLICT", NT_STATUS_PWD_HISTORY_CONFLICT}, + {"NT_STATUS_PLUGPLAY_NO_DEVICE", NT_STATUS_PLUGPLAY_NO_DEVICE}, + {"NT_STATUS_UNSUPPORTED_COMPRESSION", + NT_STATUS_UNSUPPORTED_COMPRESSION}, + {"NT_STATUS_INVALID_HW_PROFILE", NT_STATUS_INVALID_HW_PROFILE}, + {"NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH", + NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, + {"NT_STATUS_DRIVER_ORDINAL_NOT_FOUND", + NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, + {"NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND", + NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, + {"NT_STATUS_RESOURCE_NOT_OWNED", NT_STATUS_RESOURCE_NOT_OWNED}, + {"NT_STATUS_TOO_MANY_LINKS", NT_STATUS_TOO_MANY_LINKS}, + {"NT_STATUS_QUOTA_LIST_INCONSISTENT", + NT_STATUS_QUOTA_LIST_INCONSISTENT}, + {"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE}, + {"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES}, + {"NT_STATUS_MORE_ENTRIES", NT_STATUS_MORE_ENTRIES}, + {"NT_STATUS_SOME_UNMAPPED", NT_STATUS_SOME_UNMAPPED}, + {NULL, 0} +}; diff --git a/kernel/fs/cifs/nterr.h b/kernel/fs/cifs/nterr.h new file mode 100644 index 000000000..7a0eae5ae --- /dev/null +++ b/kernel/fs/cifs/nterr.h @@ -0,0 +1,563 @@ +/* + Unix SMB/Netbios implementation. + Version 1.9. + NT error code constants + Copyright (C) Andrew Tridgell 1992-2000 + Copyright (C) John H Terpstra 1996-2000 + Copyright (C) Luke Kenneth Casson Leighton 1996-2000 + Copyright (C) Paul Ashton 1998-2000 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + + + +#ifndef _NTERR_H +#define _NTERR_H + +struct nt_err_code_struct { + char *nt_errstr; + __u32 nt_errcode; +}; + +extern const struct nt_err_code_struct nt_errs[]; + +/* Win32 Status codes. */ +#define NT_STATUS_MORE_ENTRIES 0x0105 +#define NT_ERROR_INVALID_PARAMETER 0x0057 +#define NT_ERROR_INSUFFICIENT_BUFFER 0x007a +#define NT_STATUS_1804 0x070c +#define NT_STATUS_NOTIFY_ENUM_DIR 0x010c + +/* + * Win32 Error codes extracted using a loop in smbclient then printing a netmon + * sniff to a file. + */ + +#define NT_STATUS_OK 0x0000 +#define NT_STATUS_SOME_UNMAPPED 0x0107 +#define NT_STATUS_BUFFER_OVERFLOW 0x80000005 +#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a +#define NT_STATUS_MEDIA_CHANGED 0x8000001c +#define NT_STATUS_END_OF_MEDIA 0x8000001e +#define NT_STATUS_MEDIA_CHECK 0x80000020 +#define NT_STATUS_NO_DATA_DETECTED 0x8000001c +#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d +#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288 +#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000288 +#define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001 +#define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002 +#define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003 +#define NT_STATUS_INFO_LENGTH_MISMATCH 0xC0000000 | 0x0004 +#define NT_STATUS_ACCESS_VIOLATION 0xC0000000 | 0x0005 +#define NT_STATUS_IN_PAGE_ERROR 0xC0000000 | 0x0006 +#define NT_STATUS_PAGEFILE_QUOTA 0xC0000000 | 0x0007 +#define NT_STATUS_INVALID_HANDLE 0xC0000000 | 0x0008 +#define NT_STATUS_BAD_INITIAL_STACK 0xC0000000 | 0x0009 +#define NT_STATUS_BAD_INITIAL_PC 0xC0000000 | 0x000a +#define NT_STATUS_INVALID_CID 0xC0000000 | 0x000b +#define NT_STATUS_TIMER_NOT_CANCELED 0xC0000000 | 0x000c +#define NT_STATUS_INVALID_PARAMETER 0xC0000000 | 0x000d +#define NT_STATUS_NO_SUCH_DEVICE 0xC0000000 | 0x000e +#define NT_STATUS_NO_SUCH_FILE 0xC0000000 | 0x000f +#define NT_STATUS_INVALID_DEVICE_REQUEST 0xC0000000 | 0x0010 +#define NT_STATUS_END_OF_FILE 0xC0000000 | 0x0011 +#define NT_STATUS_WRONG_VOLUME 0xC0000000 | 0x0012 +#define NT_STATUS_NO_MEDIA_IN_DEVICE 0xC0000000 | 0x0013 +#define NT_STATUS_UNRECOGNIZED_MEDIA 0xC0000000 | 0x0014 +#define NT_STATUS_NONEXISTENT_SECTOR 0xC0000000 | 0x0015 +#define NT_STATUS_MORE_PROCESSING_REQUIRED 0xC0000000 | 0x0016 +#define NT_STATUS_NO_MEMORY 0xC0000000 | 0x0017 +#define NT_STATUS_CONFLICTING_ADDRESSES 0xC0000000 | 0x0018 +#define NT_STATUS_NOT_MAPPED_VIEW 0xC0000000 | 0x0019 +#define NT_STATUS_UNABLE_TO_FREE_VM 0x80000000 | 0x001a +#define NT_STATUS_UNABLE_TO_DELETE_SECTION 0xC0000000 | 0x001b +#define NT_STATUS_INVALID_SYSTEM_SERVICE 0xC0000000 | 0x001c +#define NT_STATUS_ILLEGAL_INSTRUCTION 0xC0000000 | 0x001d +#define NT_STATUS_INVALID_LOCK_SEQUENCE 0xC0000000 | 0x001e +#define NT_STATUS_INVALID_VIEW_SIZE 0xC0000000 | 0x001f +#define NT_STATUS_INVALID_FILE_FOR_SECTION 0xC0000000 | 0x0020 +#define NT_STATUS_ALREADY_COMMITTED 0xC0000000 | 0x0021 +#define NT_STATUS_ACCESS_DENIED 0xC0000000 | 0x0022 +#define NT_STATUS_BUFFER_TOO_SMALL 0xC0000000 | 0x0023 +#define NT_STATUS_OBJECT_TYPE_MISMATCH 0xC0000000 | 0x0024 +#define NT_STATUS_NONCONTINUABLE_EXCEPTION 0xC0000000 | 0x0025 +#define NT_STATUS_INVALID_DISPOSITION 0xC0000000 | 0x0026 +#define NT_STATUS_UNWIND 0xC0000000 | 0x0027 +#define NT_STATUS_BAD_STACK 0xC0000000 | 0x0028 +#define NT_STATUS_INVALID_UNWIND_TARGET 0xC0000000 | 0x0029 +#define NT_STATUS_NOT_LOCKED 0xC0000000 | 0x002a +#define NT_STATUS_PARITY_ERROR 0xC0000000 | 0x002b +#define NT_STATUS_UNABLE_TO_DECOMMIT_VM 0xC0000000 | 0x002c +#define NT_STATUS_NOT_COMMITTED 0xC0000000 | 0x002d +#define NT_STATUS_INVALID_PORT_ATTRIBUTES 0xC0000000 | 0x002e +#define NT_STATUS_PORT_MESSAGE_TOO_LONG 0xC0000000 | 0x002f +#define NT_STATUS_INVALID_PARAMETER_MIX 0xC0000000 | 0x0030 +#define NT_STATUS_INVALID_QUOTA_LOWER 0xC0000000 | 0x0031 +#define NT_STATUS_DISK_CORRUPT_ERROR 0xC0000000 | 0x0032 +#define NT_STATUS_OBJECT_NAME_INVALID 0xC0000000 | 0x0033 +#define NT_STATUS_OBJECT_NAME_NOT_FOUND 0xC0000000 | 0x0034 +#define NT_STATUS_OBJECT_NAME_COLLISION 0xC0000000 | 0x0035 +#define NT_STATUS_HANDLE_NOT_WAITABLE 0xC0000000 | 0x0036 +#define NT_STATUS_PORT_DISCONNECTED 0xC0000000 | 0x0037 +#define NT_STATUS_DEVICE_ALREADY_ATTACHED 0xC0000000 | 0x0038 +#define NT_STATUS_OBJECT_PATH_INVALID 0xC0000000 | 0x0039 +#define NT_STATUS_OBJECT_PATH_NOT_FOUND 0xC0000000 | 0x003a +#define NT_STATUS_OBJECT_PATH_SYNTAX_BAD 0xC0000000 | 0x003b +#define NT_STATUS_DATA_OVERRUN 0xC0000000 | 0x003c +#define NT_STATUS_DATA_LATE_ERROR 0xC0000000 | 0x003d +#define NT_STATUS_DATA_ERROR 0xC0000000 | 0x003e +#define NT_STATUS_CRC_ERROR 0xC0000000 | 0x003f +#define NT_STATUS_SECTION_TOO_BIG 0xC0000000 | 0x0040 +#define NT_STATUS_PORT_CONNECTION_REFUSED 0xC0000000 | 0x0041 +#define NT_STATUS_INVALID_PORT_HANDLE 0xC0000000 | 0x0042 +#define NT_STATUS_SHARING_VIOLATION 0xC0000000 | 0x0043 +#define NT_STATUS_QUOTA_EXCEEDED 0xC0000000 | 0x0044 +#define NT_STATUS_INVALID_PAGE_PROTECTION 0xC0000000 | 0x0045 +#define NT_STATUS_MUTANT_NOT_OWNED 0xC0000000 | 0x0046 +#define NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED 0xC0000000 | 0x0047 +#define NT_STATUS_PORT_ALREADY_SET 0xC0000000 | 0x0048 +#define NT_STATUS_SECTION_NOT_IMAGE 0xC0000000 | 0x0049 +#define NT_STATUS_SUSPEND_COUNT_EXCEEDED 0xC0000000 | 0x004a +#define NT_STATUS_THREAD_IS_TERMINATING 0xC0000000 | 0x004b +#define NT_STATUS_BAD_WORKING_SET_LIMIT 0xC0000000 | 0x004c +#define NT_STATUS_INCOMPATIBLE_FILE_MAP 0xC0000000 | 0x004d +#define NT_STATUS_SECTION_PROTECTION 0xC0000000 | 0x004e +#define NT_STATUS_EAS_NOT_SUPPORTED 0xC0000000 | 0x004f +#define NT_STATUS_EA_TOO_LARGE 0xC0000000 | 0x0050 +#define NT_STATUS_NONEXISTENT_EA_ENTRY 0xC0000000 | 0x0051 +#define NT_STATUS_NO_EAS_ON_FILE 0xC0000000 | 0x0052 +#define NT_STATUS_EA_CORRUPT_ERROR 0xC0000000 | 0x0053 +#define NT_STATUS_FILE_LOCK_CONFLICT 0xC0000000 | 0x0054 +#define NT_STATUS_LOCK_NOT_GRANTED 0xC0000000 | 0x0055 +#define NT_STATUS_DELETE_PENDING 0xC0000000 | 0x0056 +#define NT_STATUS_CTL_FILE_NOT_SUPPORTED 0xC0000000 | 0x0057 +#define NT_STATUS_UNKNOWN_REVISION 0xC0000000 | 0x0058 +#define NT_STATUS_REVISION_MISMATCH 0xC0000000 | 0x0059 +#define NT_STATUS_INVALID_OWNER 0xC0000000 | 0x005a +#define NT_STATUS_INVALID_PRIMARY_GROUP 0xC0000000 | 0x005b +#define NT_STATUS_NO_IMPERSONATION_TOKEN 0xC0000000 | 0x005c +#define NT_STATUS_CANT_DISABLE_MANDATORY 0xC0000000 | 0x005d +#define NT_STATUS_NO_LOGON_SERVERS 0xC0000000 | 0x005e +#define NT_STATUS_NO_SUCH_LOGON_SESSION 0xC0000000 | 0x005f +#define NT_STATUS_NO_SUCH_PRIVILEGE 0xC0000000 | 0x0060 +#define NT_STATUS_PRIVILEGE_NOT_HELD 0xC0000000 | 0x0061 +#define NT_STATUS_INVALID_ACCOUNT_NAME 0xC0000000 | 0x0062 +#define NT_STATUS_USER_EXISTS 0xC0000000 | 0x0063 +#define NT_STATUS_NO_SUCH_USER 0xC0000000 | 0x0064 +#define NT_STATUS_GROUP_EXISTS 0xC0000000 | 0x0065 +#define NT_STATUS_NO_SUCH_GROUP 0xC0000000 | 0x0066 +#define NT_STATUS_MEMBER_IN_GROUP 0xC0000000 | 0x0067 +#define NT_STATUS_MEMBER_NOT_IN_GROUP 0xC0000000 | 0x0068 +#define NT_STATUS_LAST_ADMIN 0xC0000000 | 0x0069 +#define NT_STATUS_WRONG_PASSWORD 0xC0000000 | 0x006a +#define NT_STATUS_ILL_FORMED_PASSWORD 0xC0000000 | 0x006b +#define NT_STATUS_PASSWORD_RESTRICTION 0xC0000000 | 0x006c +#define NT_STATUS_LOGON_FAILURE 0xC0000000 | 0x006d +#define NT_STATUS_ACCOUNT_RESTRICTION 0xC0000000 | 0x006e +#define NT_STATUS_INVALID_LOGON_HOURS 0xC0000000 | 0x006f +#define NT_STATUS_INVALID_WORKSTATION 0xC0000000 | 0x0070 +#define NT_STATUS_PASSWORD_EXPIRED 0xC0000000 | 0x0071 +#define NT_STATUS_ACCOUNT_DISABLED 0xC0000000 | 0x0072 +#define NT_STATUS_NONE_MAPPED 0xC0000000 | 0x0073 +#define NT_STATUS_TOO_MANY_LUIDS_REQUESTED 0xC0000000 | 0x0074 +#define NT_STATUS_LUIDS_EXHAUSTED 0xC0000000 | 0x0075 +#define NT_STATUS_INVALID_SUB_AUTHORITY 0xC0000000 | 0x0076 +#define NT_STATUS_INVALID_ACL 0xC0000000 | 0x0077 +#define NT_STATUS_INVALID_SID 0xC0000000 | 0x0078 +#define NT_STATUS_INVALID_SECURITY_DESCR 0xC0000000 | 0x0079 +#define NT_STATUS_PROCEDURE_NOT_FOUND 0xC0000000 | 0x007a +#define NT_STATUS_INVALID_IMAGE_FORMAT 0xC0000000 | 0x007b +#define NT_STATUS_NO_TOKEN 0xC0000000 | 0x007c +#define NT_STATUS_BAD_INHERITANCE_ACL 0xC0000000 | 0x007d +#define NT_STATUS_RANGE_NOT_LOCKED 0xC0000000 | 0x007e +#define NT_STATUS_DISK_FULL 0xC0000000 | 0x007f +#define NT_STATUS_SERVER_DISABLED 0xC0000000 | 0x0080 +#define NT_STATUS_SERVER_NOT_DISABLED 0xC0000000 | 0x0081 +#define NT_STATUS_TOO_MANY_GUIDS_REQUESTED 0xC0000000 | 0x0082 +#define NT_STATUS_GUIDS_EXHAUSTED 0xC0000000 | 0x0083 +#define NT_STATUS_INVALID_ID_AUTHORITY 0xC0000000 | 0x0084 +#define NT_STATUS_AGENTS_EXHAUSTED 0xC0000000 | 0x0085 +#define NT_STATUS_INVALID_VOLUME_LABEL 0xC0000000 | 0x0086 +#define NT_STATUS_SECTION_NOT_EXTENDED 0xC0000000 | 0x0087 +#define NT_STATUS_NOT_MAPPED_DATA 0xC0000000 | 0x0088 +#define NT_STATUS_RESOURCE_DATA_NOT_FOUND 0xC0000000 | 0x0089 +#define NT_STATUS_RESOURCE_TYPE_NOT_FOUND 0xC0000000 | 0x008a +#define NT_STATUS_RESOURCE_NAME_NOT_FOUND 0xC0000000 | 0x008b +#define NT_STATUS_ARRAY_BOUNDS_EXCEEDED 0xC0000000 | 0x008c +#define NT_STATUS_FLOAT_DENORMAL_OPERAND 0xC0000000 | 0x008d +#define NT_STATUS_FLOAT_DIVIDE_BY_ZERO 0xC0000000 | 0x008e +#define NT_STATUS_FLOAT_INEXACT_RESULT 0xC0000000 | 0x008f +#define NT_STATUS_FLOAT_INVALID_OPERATION 0xC0000000 | 0x0090 +#define NT_STATUS_FLOAT_OVERFLOW 0xC0000000 | 0x0091 +#define NT_STATUS_FLOAT_STACK_CHECK 0xC0000000 | 0x0092 +#define NT_STATUS_FLOAT_UNDERFLOW 0xC0000000 | 0x0093 +#define NT_STATUS_INTEGER_DIVIDE_BY_ZERO 0xC0000000 | 0x0094 +#define NT_STATUS_INTEGER_OVERFLOW 0xC0000000 | 0x0095 +#define NT_STATUS_PRIVILEGED_INSTRUCTION 0xC0000000 | 0x0096 +#define NT_STATUS_TOO_MANY_PAGING_FILES 0xC0000000 | 0x0097 +#define NT_STATUS_FILE_INVALID 0xC0000000 | 0x0098 +#define NT_STATUS_ALLOTTED_SPACE_EXCEEDED 0xC0000000 | 0x0099 +#define NT_STATUS_INSUFFICIENT_RESOURCES 0xC0000000 | 0x009a +#define NT_STATUS_DFS_EXIT_PATH_FOUND 0xC0000000 | 0x009b +#define NT_STATUS_DEVICE_DATA_ERROR 0xC0000000 | 0x009c +#define NT_STATUS_DEVICE_NOT_CONNECTED 0xC0000000 | 0x009d +#define NT_STATUS_DEVICE_POWER_FAILURE 0xC0000000 | 0x009e +#define NT_STATUS_FREE_VM_NOT_AT_BASE 0xC0000000 | 0x009f +#define NT_STATUS_MEMORY_NOT_ALLOCATED 0xC0000000 | 0x00a0 +#define NT_STATUS_WORKING_SET_QUOTA 0xC0000000 | 0x00a1 +#define NT_STATUS_MEDIA_WRITE_PROTECTED 0xC0000000 | 0x00a2 +#define NT_STATUS_DEVICE_NOT_READY 0xC0000000 | 0x00a3 +#define NT_STATUS_INVALID_GROUP_ATTRIBUTES 0xC0000000 | 0x00a4 +#define NT_STATUS_BAD_IMPERSONATION_LEVEL 0xC0000000 | 0x00a5 +#define NT_STATUS_CANT_OPEN_ANONYMOUS 0xC0000000 | 0x00a6 +#define NT_STATUS_BAD_VALIDATION_CLASS 0xC0000000 | 0x00a7 +#define NT_STATUS_BAD_TOKEN_TYPE 0xC0000000 | 0x00a8 +#define NT_STATUS_BAD_MASTER_BOOT_RECORD 0xC0000000 | 0x00a9 +#define NT_STATUS_INSTRUCTION_MISALIGNMENT 0xC0000000 | 0x00aa +#define NT_STATUS_INSTANCE_NOT_AVAILABLE 0xC0000000 | 0x00ab +#define NT_STATUS_PIPE_NOT_AVAILABLE 0xC0000000 | 0x00ac +#define NT_STATUS_INVALID_PIPE_STATE 0xC0000000 | 0x00ad +#define NT_STATUS_PIPE_BUSY 0xC0000000 | 0x00ae +#define NT_STATUS_ILLEGAL_FUNCTION 0xC0000000 | 0x00af +#define NT_STATUS_PIPE_DISCONNECTED 0xC0000000 | 0x00b0 +#define NT_STATUS_PIPE_CLOSING 0xC0000000 | 0x00b1 +#define NT_STATUS_PIPE_CONNECTED 0xC0000000 | 0x00b2 +#define NT_STATUS_PIPE_LISTENING 0xC0000000 | 0x00b3 +#define NT_STATUS_INVALID_READ_MODE 0xC0000000 | 0x00b4 +#define NT_STATUS_IO_TIMEOUT 0xC0000000 | 0x00b5 +#define NT_STATUS_FILE_FORCED_CLOSED 0xC0000000 | 0x00b6 +#define NT_STATUS_PROFILING_NOT_STARTED 0xC0000000 | 0x00b7 +#define NT_STATUS_PROFILING_NOT_STOPPED 0xC0000000 | 0x00b8 +#define NT_STATUS_COULD_NOT_INTERPRET 0xC0000000 | 0x00b9 +#define NT_STATUS_FILE_IS_A_DIRECTORY 0xC0000000 | 0x00ba +#define NT_STATUS_NOT_SUPPORTED 0xC0000000 | 0x00bb +#define NT_STATUS_REMOTE_NOT_LISTENING 0xC0000000 | 0x00bc +#define NT_STATUS_DUPLICATE_NAME 0xC0000000 | 0x00bd +#define NT_STATUS_BAD_NETWORK_PATH 0xC0000000 | 0x00be +#define NT_STATUS_NETWORK_BUSY 0xC0000000 | 0x00bf +#define NT_STATUS_DEVICE_DOES_NOT_EXIST 0xC0000000 | 0x00c0 +#define NT_STATUS_TOO_MANY_COMMANDS 0xC0000000 | 0x00c1 +#define NT_STATUS_ADAPTER_HARDWARE_ERROR 0xC0000000 | 0x00c2 +#define NT_STATUS_INVALID_NETWORK_RESPONSE 0xC0000000 | 0x00c3 +#define NT_STATUS_UNEXPECTED_NETWORK_ERROR 0xC0000000 | 0x00c4 +#define NT_STATUS_BAD_REMOTE_ADAPTER 0xC0000000 | 0x00c5 +#define NT_STATUS_PRINT_QUEUE_FULL 0xC0000000 | 0x00c6 +#define NT_STATUS_NO_SPOOL_SPACE 0xC0000000 | 0x00c7 +#define NT_STATUS_PRINT_CANCELLED 0xC0000000 | 0x00c8 +#define NT_STATUS_NETWORK_NAME_DELETED 0xC0000000 | 0x00c9 +#define NT_STATUS_NETWORK_ACCESS_DENIED 0xC0000000 | 0x00ca +#define NT_STATUS_BAD_DEVICE_TYPE 0xC0000000 | 0x00cb +#define NT_STATUS_BAD_NETWORK_NAME 0xC0000000 | 0x00cc +#define NT_STATUS_TOO_MANY_NAMES 0xC0000000 | 0x00cd +#define NT_STATUS_TOO_MANY_SESSIONS 0xC0000000 | 0x00ce +#define NT_STATUS_SHARING_PAUSED 0xC0000000 | 0x00cf +#define NT_STATUS_REQUEST_NOT_ACCEPTED 0xC0000000 | 0x00d0 +#define NT_STATUS_REDIRECTOR_PAUSED 0xC0000000 | 0x00d1 +#define NT_STATUS_NET_WRITE_FAULT 0xC0000000 | 0x00d2 +#define NT_STATUS_PROFILING_AT_LIMIT 0xC0000000 | 0x00d3 +#define NT_STATUS_NOT_SAME_DEVICE 0xC0000000 | 0x00d4 +#define NT_STATUS_FILE_RENAMED 0xC0000000 | 0x00d5 +#define NT_STATUS_VIRTUAL_CIRCUIT_CLOSED 0xC0000000 | 0x00d6 +#define NT_STATUS_NO_SECURITY_ON_OBJECT 0xC0000000 | 0x00d7 +#define NT_STATUS_CANT_WAIT 0xC0000000 | 0x00d8 +#define NT_STATUS_PIPE_EMPTY 0xC0000000 | 0x00d9 +#define NT_STATUS_CANT_ACCESS_DOMAIN_INFO 0xC0000000 | 0x00da +#define NT_STATUS_CANT_TERMINATE_SELF 0xC0000000 | 0x00db +#define NT_STATUS_INVALID_SERVER_STATE 0xC0000000 | 0x00dc +#define NT_STATUS_INVALID_DOMAIN_STATE 0xC0000000 | 0x00dd +#define NT_STATUS_INVALID_DOMAIN_ROLE 0xC0000000 | 0x00de +#define NT_STATUS_NO_SUCH_DOMAIN 0xC0000000 | 0x00df +#define NT_STATUS_DOMAIN_EXISTS 0xC0000000 | 0x00e0 +#define NT_STATUS_DOMAIN_LIMIT_EXCEEDED 0xC0000000 | 0x00e1 +#define NT_STATUS_OPLOCK_NOT_GRANTED 0xC0000000 | 0x00e2 +#define NT_STATUS_INVALID_OPLOCK_PROTOCOL 0xC0000000 | 0x00e3 +#define NT_STATUS_INTERNAL_DB_CORRUPTION 0xC0000000 | 0x00e4 +#define NT_STATUS_INTERNAL_ERROR 0xC0000000 | 0x00e5 +#define NT_STATUS_GENERIC_NOT_MAPPED 0xC0000000 | 0x00e6 +#define NT_STATUS_BAD_DESCRIPTOR_FORMAT 0xC0000000 | 0x00e7 +#define NT_STATUS_INVALID_USER_BUFFER 0xC0000000 | 0x00e8 +#define NT_STATUS_UNEXPECTED_IO_ERROR 0xC0000000 | 0x00e9 +#define NT_STATUS_UNEXPECTED_MM_CREATE_ERR 0xC0000000 | 0x00ea +#define NT_STATUS_UNEXPECTED_MM_MAP_ERROR 0xC0000000 | 0x00eb +#define NT_STATUS_UNEXPECTED_MM_EXTEND_ERR 0xC0000000 | 0x00ec +#define NT_STATUS_NOT_LOGON_PROCESS 0xC0000000 | 0x00ed +#define NT_STATUS_LOGON_SESSION_EXISTS 0xC0000000 | 0x00ee +#define NT_STATUS_INVALID_PARAMETER_1 0xC0000000 | 0x00ef +#define NT_STATUS_INVALID_PARAMETER_2 0xC0000000 | 0x00f0 +#define NT_STATUS_INVALID_PARAMETER_3 0xC0000000 | 0x00f1 +#define NT_STATUS_INVALID_PARAMETER_4 0xC0000000 | 0x00f2 +#define NT_STATUS_INVALID_PARAMETER_5 0xC0000000 | 0x00f3 +#define NT_STATUS_INVALID_PARAMETER_6 0xC0000000 | 0x00f4 +#define NT_STATUS_INVALID_PARAMETER_7 0xC0000000 | 0x00f5 +#define NT_STATUS_INVALID_PARAMETER_8 0xC0000000 | 0x00f6 +#define NT_STATUS_INVALID_PARAMETER_9 0xC0000000 | 0x00f7 +#define NT_STATUS_INVALID_PARAMETER_10 0xC0000000 | 0x00f8 +#define NT_STATUS_INVALID_PARAMETER_11 0xC0000000 | 0x00f9 +#define NT_STATUS_INVALID_PARAMETER_12 0xC0000000 | 0x00fa +#define NT_STATUS_REDIRECTOR_NOT_STARTED 0xC0000000 | 0x00fb +#define NT_STATUS_REDIRECTOR_STARTED 0xC0000000 | 0x00fc +#define NT_STATUS_STACK_OVERFLOW 0xC0000000 | 0x00fd +#define NT_STATUS_NO_SUCH_PACKAGE 0xC0000000 | 0x00fe +#define NT_STATUS_BAD_FUNCTION_TABLE 0xC0000000 | 0x00ff +#define NT_STATUS_DIRECTORY_NOT_EMPTY 0xC0000000 | 0x0101 +#define NT_STATUS_FILE_CORRUPT_ERROR 0xC0000000 | 0x0102 +#define NT_STATUS_NOT_A_DIRECTORY 0xC0000000 | 0x0103 +#define NT_STATUS_BAD_LOGON_SESSION_STATE 0xC0000000 | 0x0104 +#define NT_STATUS_LOGON_SESSION_COLLISION 0xC0000000 | 0x0105 +#define NT_STATUS_NAME_TOO_LONG 0xC0000000 | 0x0106 +#define NT_STATUS_FILES_OPEN 0xC0000000 | 0x0107 +#define NT_STATUS_CONNECTION_IN_USE 0xC0000000 | 0x0108 +#define NT_STATUS_MESSAGE_NOT_FOUND 0xC0000000 | 0x0109 +#define NT_STATUS_PROCESS_IS_TERMINATING 0xC0000000 | 0x010a +#define NT_STATUS_INVALID_LOGON_TYPE 0xC0000000 | 0x010b +#define NT_STATUS_NO_GUID_TRANSLATION 0xC0000000 | 0x010c +#define NT_STATUS_CANNOT_IMPERSONATE 0xC0000000 | 0x010d +#define NT_STATUS_IMAGE_ALREADY_LOADED 0xC0000000 | 0x010e +#define NT_STATUS_ABIOS_NOT_PRESENT 0xC0000000 | 0x010f +#define NT_STATUS_ABIOS_LID_NOT_EXIST 0xC0000000 | 0x0110 +#define NT_STATUS_ABIOS_LID_ALREADY_OWNED 0xC0000000 | 0x0111 +#define NT_STATUS_ABIOS_NOT_LID_OWNER 0xC0000000 | 0x0112 +#define NT_STATUS_ABIOS_INVALID_COMMAND 0xC0000000 | 0x0113 +#define NT_STATUS_ABIOS_INVALID_LID 0xC0000000 | 0x0114 +#define NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE 0xC0000000 | 0x0115 +#define NT_STATUS_ABIOS_INVALID_SELECTOR 0xC0000000 | 0x0116 +#define NT_STATUS_NO_LDT 0xC0000000 | 0x0117 +#define NT_STATUS_INVALID_LDT_SIZE 0xC0000000 | 0x0118 +#define NT_STATUS_INVALID_LDT_OFFSET 0xC0000000 | 0x0119 +#define NT_STATUS_INVALID_LDT_DESCRIPTOR 0xC0000000 | 0x011a +#define NT_STATUS_INVALID_IMAGE_NE_FORMAT 0xC0000000 | 0x011b +#define NT_STATUS_RXACT_INVALID_STATE 0xC0000000 | 0x011c +#define NT_STATUS_RXACT_COMMIT_FAILURE 0xC0000000 | 0x011d +#define NT_STATUS_MAPPED_FILE_SIZE_ZERO 0xC0000000 | 0x011e +#define NT_STATUS_TOO_MANY_OPENED_FILES 0xC0000000 | 0x011f +#define NT_STATUS_CANCELLED 0xC0000000 | 0x0120 +#define NT_STATUS_CANNOT_DELETE 0xC0000000 | 0x0121 +#define NT_STATUS_INVALID_COMPUTER_NAME 0xC0000000 | 0x0122 +#define NT_STATUS_FILE_DELETED 0xC0000000 | 0x0123 +#define NT_STATUS_SPECIAL_ACCOUNT 0xC0000000 | 0x0124 +#define NT_STATUS_SPECIAL_GROUP 0xC0000000 | 0x0125 +#define NT_STATUS_SPECIAL_USER 0xC0000000 | 0x0126 +#define NT_STATUS_MEMBERS_PRIMARY_GROUP 0xC0000000 | 0x0127 +#define NT_STATUS_FILE_CLOSED 0xC0000000 | 0x0128 +#define NT_STATUS_TOO_MANY_THREADS 0xC0000000 | 0x0129 +#define NT_STATUS_THREAD_NOT_IN_PROCESS 0xC0000000 | 0x012a +#define NT_STATUS_TOKEN_ALREADY_IN_USE 0xC0000000 | 0x012b +#define NT_STATUS_PAGEFILE_QUOTA_EXCEEDED 0xC0000000 | 0x012c +#define NT_STATUS_COMMITMENT_LIMIT 0xC0000000 | 0x012d +#define NT_STATUS_INVALID_IMAGE_LE_FORMAT 0xC0000000 | 0x012e +#define NT_STATUS_INVALID_IMAGE_NOT_MZ 0xC0000000 | 0x012f +#define NT_STATUS_INVALID_IMAGE_PROTECT 0xC0000000 | 0x0130 +#define NT_STATUS_INVALID_IMAGE_WIN_16 0xC0000000 | 0x0131 +#define NT_STATUS_LOGON_SERVER_CONFLICT 0xC0000000 | 0x0132 +#define NT_STATUS_TIME_DIFFERENCE_AT_DC 0xC0000000 | 0x0133 +#define NT_STATUS_SYNCHRONIZATION_REQUIRED 0xC0000000 | 0x0134 +#define NT_STATUS_DLL_NOT_FOUND 0xC0000000 | 0x0135 +#define NT_STATUS_OPEN_FAILED 0xC0000000 | 0x0136 +#define NT_STATUS_IO_PRIVILEGE_FAILED 0xC0000000 | 0x0137 +#define NT_STATUS_ORDINAL_NOT_FOUND 0xC0000000 | 0x0138 +#define NT_STATUS_ENTRYPOINT_NOT_FOUND 0xC0000000 | 0x0139 +#define NT_STATUS_CONTROL_C_EXIT 0xC0000000 | 0x013a +#define NT_STATUS_LOCAL_DISCONNECT 0xC0000000 | 0x013b +#define NT_STATUS_REMOTE_DISCONNECT 0xC0000000 | 0x013c +#define NT_STATUS_REMOTE_RESOURCES 0xC0000000 | 0x013d +#define NT_STATUS_LINK_FAILED 0xC0000000 | 0x013e +#define NT_STATUS_LINK_TIMEOUT 0xC0000000 | 0x013f +#define NT_STATUS_INVALID_CONNECTION 0xC0000000 | 0x0140 +#define NT_STATUS_INVALID_ADDRESS 0xC0000000 | 0x0141 +#define NT_STATUS_DLL_INIT_FAILED 0xC0000000 | 0x0142 +#define NT_STATUS_MISSING_SYSTEMFILE 0xC0000000 | 0x0143 +#define NT_STATUS_UNHANDLED_EXCEPTION 0xC0000000 | 0x0144 +#define NT_STATUS_APP_INIT_FAILURE 0xC0000000 | 0x0145 +#define NT_STATUS_PAGEFILE_CREATE_FAILED 0xC0000000 | 0x0146 +#define NT_STATUS_NO_PAGEFILE 0xC0000000 | 0x0147 +#define NT_STATUS_INVALID_LEVEL 0xC0000000 | 0x0148 +#define NT_STATUS_WRONG_PASSWORD_CORE 0xC0000000 | 0x0149 +#define NT_STATUS_ILLEGAL_FLOAT_CONTEXT 0xC0000000 | 0x014a +#define NT_STATUS_PIPE_BROKEN 0xC0000000 | 0x014b +#define NT_STATUS_REGISTRY_CORRUPT 0xC0000000 | 0x014c +#define NT_STATUS_REGISTRY_IO_FAILED 0xC0000000 | 0x014d +#define NT_STATUS_NO_EVENT_PAIR 0xC0000000 | 0x014e +#define NT_STATUS_UNRECOGNIZED_VOLUME 0xC0000000 | 0x014f +#define NT_STATUS_SERIAL_NO_DEVICE_INITED 0xC0000000 | 0x0150 +#define NT_STATUS_NO_SUCH_ALIAS 0xC0000000 | 0x0151 +#define NT_STATUS_MEMBER_NOT_IN_ALIAS 0xC0000000 | 0x0152 +#define NT_STATUS_MEMBER_IN_ALIAS 0xC0000000 | 0x0153 +#define NT_STATUS_ALIAS_EXISTS 0xC0000000 | 0x0154 +#define NT_STATUS_LOGON_NOT_GRANTED 0xC0000000 | 0x0155 +#define NT_STATUS_TOO_MANY_SECRETS 0xC0000000 | 0x0156 +#define NT_STATUS_SECRET_TOO_LONG 0xC0000000 | 0x0157 +#define NT_STATUS_INTERNAL_DB_ERROR 0xC0000000 | 0x0158 +#define NT_STATUS_FULLSCREEN_MODE 0xC0000000 | 0x0159 +#define NT_STATUS_TOO_MANY_CONTEXT_IDS 0xC0000000 | 0x015a +#define NT_STATUS_LOGON_TYPE_NOT_GRANTED 0xC0000000 | 0x015b +#define NT_STATUS_NOT_REGISTRY_FILE 0xC0000000 | 0x015c +#define NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED 0xC0000000 | 0x015d +#define NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR 0xC0000000 | 0x015e +#define NT_STATUS_FT_MISSING_MEMBER 0xC0000000 | 0x015f +#define NT_STATUS_ILL_FORMED_SERVICE_ENTRY 0xC0000000 | 0x0160 +#define NT_STATUS_ILLEGAL_CHARACTER 0xC0000000 | 0x0161 +#define NT_STATUS_UNMAPPABLE_CHARACTER 0xC0000000 | 0x0162 +#define NT_STATUS_UNDEFINED_CHARACTER 0xC0000000 | 0x0163 +#define NT_STATUS_FLOPPY_VOLUME 0xC0000000 | 0x0164 +#define NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND 0xC0000000 | 0x0165 +#define NT_STATUS_FLOPPY_WRONG_CYLINDER 0xC0000000 | 0x0166 +#define NT_STATUS_FLOPPY_UNKNOWN_ERROR 0xC0000000 | 0x0167 +#define NT_STATUS_FLOPPY_BAD_REGISTERS 0xC0000000 | 0x0168 +#define NT_STATUS_DISK_RECALIBRATE_FAILED 0xC0000000 | 0x0169 +#define NT_STATUS_DISK_OPERATION_FAILED 0xC0000000 | 0x016a +#define NT_STATUS_DISK_RESET_FAILED 0xC0000000 | 0x016b +#define NT_STATUS_SHARED_IRQ_BUSY 0xC0000000 | 0x016c +#define NT_STATUS_FT_ORPHANING 0xC0000000 | 0x016d +#define NT_STATUS_PARTITION_FAILURE 0xC0000000 | 0x0172 +#define NT_STATUS_INVALID_BLOCK_LENGTH 0xC0000000 | 0x0173 +#define NT_STATUS_DEVICE_NOT_PARTITIONED 0xC0000000 | 0x0174 +#define NT_STATUS_UNABLE_TO_LOCK_MEDIA 0xC0000000 | 0x0175 +#define NT_STATUS_UNABLE_TO_UNLOAD_MEDIA 0xC0000000 | 0x0176 +#define NT_STATUS_EOM_OVERFLOW 0xC0000000 | 0x0177 +#define NT_STATUS_NO_MEDIA 0xC0000000 | 0x0178 +#define NT_STATUS_NO_SUCH_MEMBER 0xC0000000 | 0x017a +#define NT_STATUS_INVALID_MEMBER 0xC0000000 | 0x017b +#define NT_STATUS_KEY_DELETED 0xC0000000 | 0x017c +#define NT_STATUS_NO_LOG_SPACE 0xC0000000 | 0x017d +#define NT_STATUS_TOO_MANY_SIDS 0xC0000000 | 0x017e +#define NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED 0xC0000000 | 0x017f +#define NT_STATUS_KEY_HAS_CHILDREN 0xC0000000 | 0x0180 +#define NT_STATUS_CHILD_MUST_BE_VOLATILE 0xC0000000 | 0x0181 +#define NT_STATUS_DEVICE_CONFIGURATION_ERROR 0xC0000000 | 0x0182 +#define NT_STATUS_DRIVER_INTERNAL_ERROR 0xC0000000 | 0x0183 +#define NT_STATUS_INVALID_DEVICE_STATE 0xC0000000 | 0x0184 +#define NT_STATUS_IO_DEVICE_ERROR 0xC0000000 | 0x0185 +#define NT_STATUS_DEVICE_PROTOCOL_ERROR 0xC0000000 | 0x0186 +#define NT_STATUS_BACKUP_CONTROLLER 0xC0000000 | 0x0187 +#define NT_STATUS_LOG_FILE_FULL 0xC0000000 | 0x0188 +#define NT_STATUS_TOO_LATE 0xC0000000 | 0x0189 +#define NT_STATUS_NO_TRUST_LSA_SECRET 0xC0000000 | 0x018a +#define NT_STATUS_NO_TRUST_SAM_ACCOUNT 0xC0000000 | 0x018b +#define NT_STATUS_TRUSTED_DOMAIN_FAILURE 0xC0000000 | 0x018c +#define NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE 0xC0000000 | 0x018d +#define NT_STATUS_EVENTLOG_FILE_CORRUPT 0xC0000000 | 0x018e +#define NT_STATUS_EVENTLOG_CANT_START 0xC0000000 | 0x018f +#define NT_STATUS_TRUST_FAILURE 0xC0000000 | 0x0190 +#define NT_STATUS_MUTANT_LIMIT_EXCEEDED 0xC0000000 | 0x0191 +#define NT_STATUS_NETLOGON_NOT_STARTED 0xC0000000 | 0x0192 +#define NT_STATUS_ACCOUNT_EXPIRED 0xC0000000 | 0x0193 +#define NT_STATUS_POSSIBLE_DEADLOCK 0xC0000000 | 0x0194 +#define NT_STATUS_NETWORK_CREDENTIAL_CONFLICT 0xC0000000 | 0x0195 +#define NT_STATUS_REMOTE_SESSION_LIMIT 0xC0000000 | 0x0196 +#define NT_STATUS_EVENTLOG_FILE_CHANGED 0xC0000000 | 0x0197 +#define NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT 0xC0000000 | 0x0198 +#define NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT 0xC0000000 | 0x0199 +#define NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT 0xC0000000 | 0x019a +#define NT_STATUS_DOMAIN_TRUST_INCONSISTENT 0xC0000000 | 0x019b +#define NT_STATUS_FS_DRIVER_REQUIRED 0xC0000000 | 0x019c +#define NT_STATUS_NO_USER_SESSION_KEY 0xC0000000 | 0x0202 +#define NT_STATUS_USER_SESSION_DELETED 0xC0000000 | 0x0203 +#define NT_STATUS_RESOURCE_LANG_NOT_FOUND 0xC0000000 | 0x0204 +#define NT_STATUS_INSUFF_SERVER_RESOURCES 0xC0000000 | 0x0205 +#define NT_STATUS_INVALID_BUFFER_SIZE 0xC0000000 | 0x0206 +#define NT_STATUS_INVALID_ADDRESS_COMPONENT 0xC0000000 | 0x0207 +#define NT_STATUS_INVALID_ADDRESS_WILDCARD 0xC0000000 | 0x0208 +#define NT_STATUS_TOO_MANY_ADDRESSES 0xC0000000 | 0x0209 +#define NT_STATUS_ADDRESS_ALREADY_EXISTS 0xC0000000 | 0x020a +#define NT_STATUS_ADDRESS_CLOSED 0xC0000000 | 0x020b +#define NT_STATUS_CONNECTION_DISCONNECTED 0xC0000000 | 0x020c +#define NT_STATUS_CONNECTION_RESET 0xC0000000 | 0x020d +#define NT_STATUS_TOO_MANY_NODES 0xC0000000 | 0x020e +#define NT_STATUS_TRANSACTION_ABORTED 0xC0000000 | 0x020f +#define NT_STATUS_TRANSACTION_TIMED_OUT 0xC0000000 | 0x0210 +#define NT_STATUS_TRANSACTION_NO_RELEASE 0xC0000000 | 0x0211 +#define NT_STATUS_TRANSACTION_NO_MATCH 0xC0000000 | 0x0212 +#define NT_STATUS_TRANSACTION_RESPONDED 0xC0000000 | 0x0213 +#define NT_STATUS_TRANSACTION_INVALID_ID 0xC0000000 | 0x0214 +#define NT_STATUS_TRANSACTION_INVALID_TYPE 0xC0000000 | 0x0215 +#define NT_STATUS_NOT_SERVER_SESSION 0xC0000000 | 0x0216 +#define NT_STATUS_NOT_CLIENT_SESSION 0xC0000000 | 0x0217 +#define NT_STATUS_CANNOT_LOAD_REGISTRY_FILE 0xC0000000 | 0x0218 +#define NT_STATUS_DEBUG_ATTACH_FAILED 0xC0000000 | 0x0219 +#define NT_STATUS_SYSTEM_PROCESS_TERMINATED 0xC0000000 | 0x021a +#define NT_STATUS_DATA_NOT_ACCEPTED 0xC0000000 | 0x021b +#define NT_STATUS_NO_BROWSER_SERVERS_FOUND 0xC0000000 | 0x021c +#define NT_STATUS_VDM_HARD_ERROR 0xC0000000 | 0x021d +#define NT_STATUS_DRIVER_CANCEL_TIMEOUT 0xC0000000 | 0x021e +#define NT_STATUS_REPLY_MESSAGE_MISMATCH 0xC0000000 | 0x021f +#define NT_STATUS_MAPPED_ALIGNMENT 0xC0000000 | 0x0220 +#define NT_STATUS_IMAGE_CHECKSUM_MISMATCH 0xC0000000 | 0x0221 +#define NT_STATUS_LOST_WRITEBEHIND_DATA 0xC0000000 | 0x0222 +#define NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID 0xC0000000 | 0x0223 +#define NT_STATUS_PASSWORD_MUST_CHANGE 0xC0000000 | 0x0224 +#define NT_STATUS_NOT_FOUND 0xC0000000 | 0x0225 +#define NT_STATUS_NOT_TINY_STREAM 0xC0000000 | 0x0226 +#define NT_STATUS_RECOVERY_FAILURE 0xC0000000 | 0x0227 +#define NT_STATUS_STACK_OVERFLOW_READ 0xC0000000 | 0x0228 +#define NT_STATUS_FAIL_CHECK 0xC0000000 | 0x0229 +#define NT_STATUS_DUPLICATE_OBJECTID 0xC0000000 | 0x022a +#define NT_STATUS_OBJECTID_EXISTS 0xC0000000 | 0x022b +#define NT_STATUS_CONVERT_TO_LARGE 0xC0000000 | 0x022c +#define NT_STATUS_RETRY 0xC0000000 | 0x022d +#define NT_STATUS_FOUND_OUT_OF_SCOPE 0xC0000000 | 0x022e +#define NT_STATUS_ALLOCATE_BUCKET 0xC0000000 | 0x022f +#define NT_STATUS_PROPSET_NOT_FOUND 0xC0000000 | 0x0230 +#define NT_STATUS_MARSHALL_OVERFLOW 0xC0000000 | 0x0231 +#define NT_STATUS_INVALID_VARIANT 0xC0000000 | 0x0232 +#define NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND 0xC0000000 | 0x0233 +#define NT_STATUS_ACCOUNT_LOCKED_OUT 0xC0000000 | 0x0234 +#define NT_STATUS_HANDLE_NOT_CLOSABLE 0xC0000000 | 0x0235 +#define NT_STATUS_CONNECTION_REFUSED 0xC0000000 | 0x0236 +#define NT_STATUS_GRACEFUL_DISCONNECT 0xC0000000 | 0x0237 +#define NT_STATUS_ADDRESS_ALREADY_ASSOCIATED 0xC0000000 | 0x0238 +#define NT_STATUS_ADDRESS_NOT_ASSOCIATED 0xC0000000 | 0x0239 +#define NT_STATUS_CONNECTION_INVALID 0xC0000000 | 0x023a +#define NT_STATUS_CONNECTION_ACTIVE 0xC0000000 | 0x023b +#define NT_STATUS_NETWORK_UNREACHABLE 0xC0000000 | 0x023c +#define NT_STATUS_HOST_UNREACHABLE 0xC0000000 | 0x023d +#define NT_STATUS_PROTOCOL_UNREACHABLE 0xC0000000 | 0x023e +#define NT_STATUS_PORT_UNREACHABLE 0xC0000000 | 0x023f +#define NT_STATUS_REQUEST_ABORTED 0xC0000000 | 0x0240 +#define NT_STATUS_CONNECTION_ABORTED 0xC0000000 | 0x0241 +#define NT_STATUS_BAD_COMPRESSION_BUFFER 0xC0000000 | 0x0242 +#define NT_STATUS_USER_MAPPED_FILE 0xC0000000 | 0x0243 +#define NT_STATUS_AUDIT_FAILED 0xC0000000 | 0x0244 +#define NT_STATUS_TIMER_RESOLUTION_NOT_SET 0xC0000000 | 0x0245 +#define NT_STATUS_CONNECTION_COUNT_LIMIT 0xC0000000 | 0x0246 +#define NT_STATUS_LOGIN_TIME_RESTRICTION 0xC0000000 | 0x0247 +#define NT_STATUS_LOGIN_WKSTA_RESTRICTION 0xC0000000 | 0x0248 +#define NT_STATUS_IMAGE_MP_UP_MISMATCH 0xC0000000 | 0x0249 +#define NT_STATUS_INSUFFICIENT_LOGON_INFO 0xC0000000 | 0x0250 +#define NT_STATUS_BAD_DLL_ENTRYPOINT 0xC0000000 | 0x0251 +#define NT_STATUS_BAD_SERVICE_ENTRYPOINT 0xC0000000 | 0x0252 +#define NT_STATUS_LPC_REPLY_LOST 0xC0000000 | 0x0253 +#define NT_STATUS_IP_ADDRESS_CONFLICT1 0xC0000000 | 0x0254 +#define NT_STATUS_IP_ADDRESS_CONFLICT2 0xC0000000 | 0x0255 +#define NT_STATUS_REGISTRY_QUOTA_LIMIT 0xC0000000 | 0x0256 +#define NT_STATUS_PATH_NOT_COVERED 0xC0000000 | 0x0257 +#define NT_STATUS_NO_CALLBACK_ACTIVE 0xC0000000 | 0x0258 +#define NT_STATUS_LICENSE_QUOTA_EXCEEDED 0xC0000000 | 0x0259 +#define NT_STATUS_PWD_TOO_SHORT 0xC0000000 | 0x025a +#define NT_STATUS_PWD_TOO_RECENT 0xC0000000 | 0x025b +#define NT_STATUS_PWD_HISTORY_CONFLICT 0xC0000000 | 0x025c +#define NT_STATUS_PLUGPLAY_NO_DEVICE 0xC0000000 | 0x025e +#define NT_STATUS_UNSUPPORTED_COMPRESSION 0xC0000000 | 0x025f +#define NT_STATUS_INVALID_HW_PROFILE 0xC0000000 | 0x0260 +#define NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH 0xC0000000 | 0x0261 +#define NT_STATUS_DRIVER_ORDINAL_NOT_FOUND 0xC0000000 | 0x0262 +#define NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND 0xC0000000 | 0x0263 +#define NT_STATUS_RESOURCE_NOT_OWNED 0xC0000000 | 0x0264 +#define NT_STATUS_TOO_MANY_LINKS 0xC0000000 | 0x0265 +#define NT_STATUS_QUOTA_LIST_INCONSISTENT 0xC0000000 | 0x0266 +#define NT_STATUS_FILE_IS_OFFLINE 0xC0000000 | 0x0267 +#define NT_STATUS_NO_SUCH_JOB 0xC0000000 | 0xEDE /* scheduler */ + +#endif /* _NTERR_H */ diff --git a/kernel/fs/cifs/ntlmssp.h b/kernel/fs/cifs/ntlmssp.h new file mode 100644 index 000000000..848249fa1 --- /dev/null +++ b/kernel/fs/cifs/ntlmssp.h @@ -0,0 +1,138 @@ +/* + * fs/cifs/ntlmssp.h + * + * Copyright (c) International Business Machines Corp., 2002,2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define NTLMSSP_SIGNATURE "NTLMSSP" +/* Message Types */ +#define NtLmNegotiate cpu_to_le32(1) +#define NtLmChallenge cpu_to_le32(2) +#define NtLmAuthenticate cpu_to_le32(3) +#define UnknownMessage cpu_to_le32(8) + +/* Negotiate Flags */ +#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */ +#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */ +#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */ +/* define reserved9 0x08 */ +#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */ +#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */ +#define NTLMSSP_NEGOTIATE_DGRAM 0x0040 +#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */ +/* defined reserved 8 0x0100 */ +#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */ +#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */ +#define NTLMSSP_ANONYMOUS 0x0800 +#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */ +#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000 +#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */ +#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. All security levels */ +#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000 +#define NTLMSSP_TARGET_TYPE_SERVER 0x20000 +#define NTLMSSP_TARGET_TYPE_SHARE 0x40000 +#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/ +/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */ +#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000 +#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */ +#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000 +#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 +/* #define reserved4 0x1000000 */ +#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */ +/* #define reserved3 0x4000000 */ +/* #define reserved2 0x8000000 */ +/* #define reserved1 0x10000000 */ +#define NTLMSSP_NEGOTIATE_128 0x20000000 +#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 +#define NTLMSSP_NEGOTIATE_56 0x80000000 + +/* Define AV Pair Field IDs */ +enum av_field_type { + NTLMSSP_AV_EOL = 0, + NTLMSSP_AV_NB_COMPUTER_NAME, + NTLMSSP_AV_NB_DOMAIN_NAME, + NTLMSSP_AV_DNS_COMPUTER_NAME, + NTLMSSP_AV_DNS_DOMAIN_NAME, + NTLMSSP_AV_DNS_TREE_NAME, + NTLMSSP_AV_FLAGS, + NTLMSSP_AV_TIMESTAMP, + NTLMSSP_AV_RESTRICTION, + NTLMSSP_AV_TARGET_NAME, + NTLMSSP_AV_CHANNEL_BINDINGS +}; + +/* Although typedefs are not commonly used for structure definitions */ +/* in the Linux kernel, in this particular case they are useful */ +/* to more closely match the standards document for NTLMSSP from */ +/* OpenGroup and to make the code more closely match the standard in */ +/* appearance */ + +typedef struct _SECURITY_BUFFER { + __le16 Length; + __le16 MaximumLength; + __le32 BufferOffset; /* offset to buffer */ +} __attribute__((packed)) SECURITY_BUFFER; + +typedef struct _NEGOTIATE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmNegotiate = 1 */ + __le32 NegotiateFlags; + SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ + SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ + char DomainString[0]; + /* followed by WorkstationString */ +} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; + +typedef struct _CHALLENGE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmChallenge = 2 */ + SECURITY_BUFFER TargetName; + __le32 NegotiateFlags; + __u8 Challenge[CIFS_CRYPTO_KEY_SIZE]; + __u8 Reserved[8]; + SECURITY_BUFFER TargetInfoArray; + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ +} __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE; + +typedef struct _AUTHENTICATE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmsAuthenticate = 3 */ + SECURITY_BUFFER LmChallengeResponse; + SECURITY_BUFFER NtChallengeResponse; + SECURITY_BUFFER DomainName; + SECURITY_BUFFER UserName; + SECURITY_BUFFER WorkstationName; + SECURITY_BUFFER SessionKey; + __le32 NegotiateFlags; + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ + char UserString[0]; +} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; + +/* + * Size of the session key (crypto key encrypted with the password + */ + +int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses); +void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses); +int build_ntlmssp_auth_blob(unsigned char *pbuffer, u16 *buflen, + struct cifs_ses *ses, + const struct nls_table *nls_cp); diff --git a/kernel/fs/cifs/readdir.c b/kernel/fs/cifs/readdir.c new file mode 100644 index 000000000..b1eede367 --- /dev/null +++ b/kernel/fs/cifs/readdir.c @@ -0,0 +1,875 @@ +/* + * fs/cifs/readdir.c + * + * Directory search handling + * + * Copyright (C) International Business Machines Corp., 2004, 2008 + * Copyright (C) Red Hat, Inc., 2011 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifsfs.h" + +/* + * To be safe - for UCS to UTF-8 with strings loaded with the rare long + * characters alloc more to account for such multibyte target UTF-8 + * characters. + */ +#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2) + +#ifdef CONFIG_CIFS_DEBUG2 +static void dump_cifs_file_struct(struct file *file, char *label) +{ + struct cifsFileInfo *cf; + + if (file) { + cf = file->private_data; + if (cf == NULL) { + cifs_dbg(FYI, "empty cifs private file data\n"); + return; + } + if (cf->invalidHandle) + cifs_dbg(FYI, "invalid handle\n"); + if (cf->srch_inf.endOfSearch) + cifs_dbg(FYI, "end of search\n"); + if (cf->srch_inf.emptyDir) + cifs_dbg(FYI, "empty dir\n"); + } +} +#else +static inline void dump_cifs_file_struct(struct file *file, char *label) +{ +} +#endif /* DEBUG2 */ + +/* + * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT + * + * Find the dentry that matches "name". If there isn't one, create one. If it's + * a negative dentry or the uniqueid or filetype(mode) changed, + * then drop it and recreate it. + */ +static void +cifs_prime_dcache(struct dentry *parent, struct qstr *name, + struct cifs_fattr *fattr) +{ + struct dentry *dentry, *alias; + struct inode *inode; + struct super_block *sb = d_inode(parent)->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); + + dentry = d_hash_and_lookup(parent, name); + if (unlikely(IS_ERR(dentry))) + return; + + if (dentry) { + inode = d_inode(dentry); + if (inode) { + if (d_mountpoint(dentry)) + goto out; + /* + * If we're generating inode numbers, then we don't + * want to clobber the existing one with the one that + * the readdir code created. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) + fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; + + /* update inode in place + * if both i_ino and i_mode didn't change */ + if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid && + (inode->i_mode & S_IFMT) == + (fattr->cf_mode & S_IFMT)) { + cifs_fattr_to_inode(inode, fattr); + goto out; + } + } + d_invalidate(dentry); + dput(dentry); + } + + /* + * If we know that the inode will need to be revalidated immediately, + * then don't create a new dentry for it. We'll end up doing an on + * the wire call either way and this spares us an invalidation. + */ + if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) + return; + + dentry = d_alloc(parent, name); + if (!dentry) + return; + + inode = cifs_iget(sb, fattr); + if (!inode) + goto out; + + alias = d_splice_alias(inode, dentry); + if (alias && !IS_ERR(alias)) + dput(alias); +out: + dput(dentry); +} + +static void +cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) +{ + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; + fattr->cf_dtype = DT_DIR; + } else { + fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_REG; + } + + /* + * We need to revalidate it further to make a decision about whether it + * is a symbolic link, DFS referral or a reparse point with a direct + * access like junctions, deduplicated files, NFS symlinks. + */ + if (fattr->cf_cifsattrs & ATTR_REPARSE) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + + /* non-unix readdir doesn't provide nlink */ + fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; + + if (fattr->cf_cifsattrs & ATTR_READONLY) + fattr->cf_mode &= ~S_IWUGO; + + /* + * We of course don't get ACL info in FIND_FIRST/NEXT results, so + * mark it for revalidation so that "ls -l" will look right. It might + * be super-slow, but if we don't do this then the ownership of files + * may look wrong since the inodes may not have timed out by the time + * "ls" does a stat() call on them. + */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && + fattr->cf_cifsattrs & ATTR_SYSTEM) { + if (fattr->cf_eof == 0) { + fattr->cf_mode &= ~S_IFMT; + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + } else { + /* + * trying to get the type and mode via SFU can be slow, + * so just call those regular files for now, and mark + * for reval + */ + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + } + } +} + +void +cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + + cifs_fill_common_info(fattr, cifs_sb); +} + +static void +cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj; + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, + info->LastAccessTime, offset); + fattr->cf_ctime = cnvrtDosUnixTm(info->LastWriteDate, + info->LastWriteTime, offset); + fattr->cf_mtime = cnvrtDosUnixTm(info->LastWriteDate, + info->LastWriteTime, offset); + + fattr->cf_cifsattrs = le16_to_cpu(info->Attributes); + fattr->cf_bytes = le32_to_cpu(info->AllocationSize); + fattr->cf_eof = le32_to_cpu(info->DataSize); + + cifs_fill_common_info(fattr, cifs_sb); +} + +/* BB eventually need to add the following helper function to + resolve NT_STATUS_STOPPED_ON_SYMLINK return code when + we try to do FindFirst on (NTFS) directory symlinks */ +/* +int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, + unsigned int xid) +{ + __u16 fid; + int len; + int oplock = 0; + int rc; + struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb); + char *tmpbuffer; + + rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, + OPEN_REPARSE_POINT, &fid, &oplock, NULL, + cifs_sb->local_nls, + cifs_remap(cifs_sb); + if (!rc) { + tmpbuffer = kmalloc(maxpath); + rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path, + tmpbuffer, + maxpath -1, + fid, + cifs_sb->local_nls); + if (CIFSSMBClose(xid, ptcon, fid)) { + cifs_dbg(FYI, "Error closing temporary reparsepoint open\n"); + } + } +} + */ + +static int +initiate_cifs_search(const unsigned int xid, struct file *file) +{ + __u16 search_flags; + int rc = 0; + char *full_path = NULL; + struct cifsFileInfo *cifsFile; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct tcon_link *tlink = NULL; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + + if (file->private_data == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (cifsFile == NULL) { + rc = -ENOMEM; + goto error_exit; + } + file->private_data = cifsFile; + cifsFile->tlink = cifs_get_tlink(tlink); + tcon = tlink_tcon(tlink); + } else { + cifsFile = file->private_data; + tcon = tlink_tcon(cifsFile->tlink); + } + + server = tcon->ses->server; + + if (!server->ops->query_dir_first) { + rc = -ENOSYS; + goto error_exit; + } + + cifsFile->invalidHandle = true; + cifsFile->srch_inf.endOfSearch = false; + + full_path = build_path_from_dentry(file->f_path.dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto error_exit; + } + + cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos); + +ffirst_retry: + /* test for Unix extensions */ + /* but now check for them on the share/mount not on the SMB session */ + /* if (cap_unix(tcon->ses) { */ + if (tcon->unix_ext) + cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX; + else if ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + } else /* not srvinos - BB fixme add check for backlevel? */ { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + } + + search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME; + if (backup_cred(cifs_sb)) + search_flags |= CIFS_SEARCH_BACKUP_SEARCH; + + rc = server->ops->query_dir_first(xid, tcon, full_path, cifs_sb, + &cifsFile->fid, search_flags, + &cifsFile->srch_inf); + + if (rc == 0) + cifsFile->invalidHandle = false; + /* BB add following call to handle readdir on new NTFS symlink errors + else if STATUS_STOPPED_ON_SYMLINK + call get_symlink_reparse_path and retry with new path */ + else if ((rc == -EOPNOTSUPP) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + goto ffirst_retry; + } +error_exit: + kfree(full_path); + cifs_put_tlink(tlink); + return rc; +} + +/* return length of unicode string in bytes */ +static int cifs_unicode_bytelen(const char *str) +{ + int len; + const __le16 *ustr = (const __le16 *)str; + + for (len = 0; len <= PATH_MAX; len++) { + if (ustr[len] == 0) + return len << 1; + } + cifs_dbg(FYI, "Unicode string longer than PATH_MAX found\n"); + return len << 1; +} + +static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) +{ + char *new_entry; + FILE_DIRECTORY_INFO *pDirInfo = (FILE_DIRECTORY_INFO *)old_entry; + + if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pfData; + pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo; + + new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + + pfData->FileNameLength; + } else + new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); + cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry); + /* validate that new_entry is not past end of SMB */ + if (new_entry >= end_of_smb) { + cifs_dbg(VFS, "search entry %p began after end of SMB %p old entry %p\n", + new_entry, end_of_smb, old_entry); + return NULL; + } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && + (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) + || ((level != SMB_FIND_FILE_INFO_STANDARD) && + (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { + cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n", + new_entry, end_of_smb); + return NULL; + } else + return new_entry; + +} + +struct cifs_dirent { + const char *name; + size_t namelen; + u32 resume_key; + u64 ino; +}; + +static void cifs_fill_dirent_unix(struct cifs_dirent *de, + const FILE_UNIX_INFO *info, bool is_unicode) +{ + de->name = &info->FileName[0]; + if (is_unicode) + de->namelen = cifs_unicode_bytelen(de->name); + else + de->namelen = strnlen(de->name, PATH_MAX); + de->resume_key = info->ResumeKey; + de->ino = le64_to_cpu(info->basic.UniqueId); +} + +static void cifs_fill_dirent_dir(struct cifs_dirent *de, + const FILE_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_full(struct cifs_dirent *de, + const FILE_FULL_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_search(struct cifs_dirent *de, + const SEARCH_ID_FULL_DIR_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; + de->ino = le64_to_cpu(info->UniqueId); +} + +static void cifs_fill_dirent_both(struct cifs_dirent *de, + const FILE_BOTH_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_std(struct cifs_dirent *de, + const FIND_FILE_STANDARD_INFO *info) +{ + de->name = &info->FileName[0]; + /* one byte length, no endianess conversion */ + de->namelen = info->FileNameLength; + de->resume_key = info->ResumeKey; +} + +static int cifs_fill_dirent(struct cifs_dirent *de, const void *info, + u16 level, bool is_unicode) +{ + memset(de, 0, sizeof(*de)); + + switch (level) { + case SMB_FIND_FILE_UNIX: + cifs_fill_dirent_unix(de, info, is_unicode); + break; + case SMB_FIND_FILE_DIRECTORY_INFO: + cifs_fill_dirent_dir(de, info); + break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + cifs_fill_dirent_full(de, info); + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + cifs_fill_dirent_search(de, info); + break; + case SMB_FIND_FILE_BOTH_DIRECTORY_INFO: + cifs_fill_dirent_both(de, info); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_fill_dirent_std(de, info); + break; + default: + cifs_dbg(FYI, "Unknown findfirst level %d\n", level); + return -EINVAL; + } + + return 0; +} + +#define UNICODE_DOT cpu_to_le16(0x2e) + +/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ +static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) +{ + int rc = 0; + + if (!de->name) + return 0; + + if (is_unicode) { + __le16 *ufilename = (__le16 *)de->name; + if (de->namelen == 2) { + /* check for . */ + if (ufilename[0] == UNICODE_DOT) + rc = 1; + } else if (de->namelen == 4) { + /* check for .. */ + if (ufilename[0] == UNICODE_DOT && + ufilename[1] == UNICODE_DOT) + rc = 2; + } + } else /* ASCII */ { + if (de->namelen == 1) { + if (de->name[0] == '.') + rc = 1; + } else if (de->namelen == 2) { + if (de->name[0] == '.' && de->name[1] == '.') + rc = 2; + } + } + + return rc; +} + +/* Check if directory that we are searching has changed so we can decide + whether we can use the cached search results from the previous search */ +static int is_dir_changed(struct file *file) +{ + struct inode *inode = file_inode(file); + struct cifsInodeInfo *cifsInfo = CIFS_I(inode); + + if (cifsInfo->time == 0) + return 1; /* directory was changed, perhaps due to unlink */ + else + return 0; + +} + +static int cifs_save_resume_key(const char *current_entry, + struct cifsFileInfo *file_info) +{ + struct cifs_dirent de; + int rc; + + rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (!rc) { + file_info->srch_inf.presume_name = de.name; + file_info->srch_inf.resume_name_len = de.namelen; + file_info->srch_inf.resume_key = de.resume_key; + } + return rc; +} + +/* + * Find the corresponding entry in the search. Note that the SMB server returns + * search entries for . and .. which complicates logic here if we choose to + * parse for them and we do not assume that they are located in the findfirst + * return buffer. We start counting in the buffer with entry 2 and increment for + * every entry (do not increment for . or .. entry). + */ +static int +find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, + struct file *file, char **current_entry, int *num_to_ret) +{ + __u16 search_flags; + int rc = 0; + int pos_in_buf = 0; + loff_t first_entry_in_buffer; + loff_t index_to_find = pos; + struct cifsFileInfo *cfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct TCP_Server_Info *server = tcon->ses->server; + /* check if index in the buffer */ + + if (!server->ops->query_dir_first || !server->ops->query_dir_next) + return -ENOSYS; + + if ((cfile == NULL) || (current_entry == NULL) || (num_to_ret == NULL)) + return -ENOENT; + + *current_entry = NULL; + first_entry_in_buffer = cfile->srch_inf.index_of_last_entry - + cfile->srch_inf.entries_in_buffer; + + /* + * If first entry in buf is zero then is first buffer + * in search response data which means it is likely . and .. + * will be in this buffer, although some servers do not return + * . and .. for the root of a drive and for those we need + * to start two entries earlier. + */ + + dump_cifs_file_struct(file, "In fce "); + if (((index_to_find < cfile->srch_inf.index_of_last_entry) && + is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { + /* close and restart search */ + cifs_dbg(FYI, "search backing up - close and restart search\n"); + spin_lock(&cifs_file_list_lock); + if (server->ops->dir_needs_close(cfile)) { + cfile->invalidHandle = true; + spin_unlock(&cifs_file_list_lock); + if (server->ops->close_dir) + server->ops->close_dir(xid, tcon, &cfile->fid); + } else + spin_unlock(&cifs_file_list_lock); + if (cfile->srch_inf.ntwrk_buf_start) { + cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n"); + if (cfile->srch_inf.smallBuf) + cifs_small_buf_release(cfile->srch_inf. + ntwrk_buf_start); + else + cifs_buf_release(cfile->srch_inf. + ntwrk_buf_start); + cfile->srch_inf.ntwrk_buf_start = NULL; + } + rc = initiate_cifs_search(xid, file); + if (rc) { + cifs_dbg(FYI, "error %d reinitiating a search on rewind\n", + rc); + return rc; + } + /* FindFirst/Next set last_entry to NULL on malformed reply */ + if (cfile->srch_inf.last_entry) + cifs_save_resume_key(cfile->srch_inf.last_entry, cfile); + } + + search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME; + if (backup_cred(cifs_sb)) + search_flags |= CIFS_SEARCH_BACKUP_SEARCH; + + while ((index_to_find >= cfile->srch_inf.index_of_last_entry) && + (rc == 0) && !cfile->srch_inf.endOfSearch) { + cifs_dbg(FYI, "calling findnext2\n"); + rc = server->ops->query_dir_next(xid, tcon, &cfile->fid, + search_flags, + &cfile->srch_inf); + /* FindFirst/Next set last_entry to NULL on malformed reply */ + if (cfile->srch_inf.last_entry) + cifs_save_resume_key(cfile->srch_inf.last_entry, cfile); + if (rc) + return -ENOENT; + } + if (index_to_find < cfile->srch_inf.index_of_last_entry) { + /* we found the buffer that contains the entry */ + /* scan and find it */ + int i; + char *cur_ent; + char *end_of_smb = cfile->srch_inf.ntwrk_buf_start + + server->ops->calc_smb_size( + cfile->srch_inf.ntwrk_buf_start); + + cur_ent = cfile->srch_inf.srch_entries_start; + first_entry_in_buffer = cfile->srch_inf.index_of_last_entry + - cfile->srch_inf.entries_in_buffer; + pos_in_buf = index_to_find - first_entry_in_buffer; + cifs_dbg(FYI, "found entry - pos_in_buf %d\n", pos_in_buf); + + for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) { + /* go entry by entry figuring out which is first */ + cur_ent = nxt_dir_entry(cur_ent, end_of_smb, + cfile->srch_inf.info_level); + } + if ((cur_ent == NULL) && (i < pos_in_buf)) { + /* BB fixme - check if we should flag this error */ + cifs_dbg(VFS, "reached end of buf searching for pos in buf %d index to find %lld rc %d\n", + pos_in_buf, index_to_find, rc); + } + rc = 0; + *current_entry = cur_ent; + } else { + cifs_dbg(FYI, "index not in buffer - could not findnext into it\n"); + return 0; + } + + if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) { + cifs_dbg(FYI, "can not return entries pos_in_buf beyond last\n"); + *num_to_ret = 0; + } else + *num_to_ret = cfile->srch_inf.entries_in_buffer - pos_in_buf; + + return rc; +} + +static int cifs_filldir(char *find_entry, struct file *file, + struct dir_context *ctx, + char *scratch_buf, unsigned int max_len) +{ + struct cifsFileInfo *file_info = file->private_data; + struct super_block *sb = file_inode(file)->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_dirent de = { NULL, }; + struct cifs_fattr fattr; + struct qstr name; + int rc = 0; + ino_t ino; + + rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (rc) + return rc; + + if (de.namelen > max_len) { + cifs_dbg(VFS, "bad search response length %zd past smb end\n", + de.namelen); + return -EINVAL; + } + + /* skip . and .. since we added them first */ + if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode)) + return 0; + + if (file_info->srch_inf.unicode) { + struct nls_table *nlt = cifs_sb->local_nls; + int map_type; + + map_type = cifs_remap(cifs_sb); + name.name = scratch_buf; + name.len = + cifs_from_utf16((char *)name.name, (__le16 *)de.name, + UNICODE_NAME_MAX, + min_t(size_t, de.namelen, + (size_t)max_len), nlt, map_type); + name.len -= nls_nullsize(nlt); + } else { + name.name = de.name; + name.len = de.namelen; + } + + switch (file_info->srch_inf.info_level) { + case SMB_FIND_FILE_UNIX: + cifs_unix_basic_to_fattr(&fattr, + &((FILE_UNIX_INFO *)find_entry)->basic, + cifs_sb); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_std_info_to_fattr(&fattr, + (FIND_FILE_STANDARD_INFO *)find_entry, + cifs_sb); + break; + default: + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)find_entry, + cifs_sb); + break; + } + + if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + fattr.cf_uniqueid = de.ino; + } else { + fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } + + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && + couldbe_mf_symlink(&fattr)) + /* + * trying to get the type and mode can be slow, + * so just call those regular files for now, and mark + * for reval + */ + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; + + cifs_prime_dcache(file->f_path.dentry, &name, &fattr); + + ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); + return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype); +} + + +int cifs_readdir(struct file *file, struct dir_context *ctx) +{ + int rc = 0; + unsigned int xid; + int i; + struct cifs_tcon *tcon; + struct cifsFileInfo *cifsFile = NULL; + char *current_entry; + int num_to_fill = 0; + char *tmp_buf = NULL; + char *end_of_smb; + unsigned int max_len; + + xid = get_xid(); + + /* + * Ensure FindFirst doesn't fail before doing filldir() for '.' and + * '..'. Otherwise we won't be able to notify VFS in case of failure. + */ + if (file->private_data == NULL) { + rc = initiate_cifs_search(xid, file); + cifs_dbg(FYI, "initiate cifs search rc %d\n", rc); + if (rc) + goto rddir2_exit; + } + + if (!dir_emit_dots(file, ctx)) + goto rddir2_exit; + + /* 1) If search is active, + is in current search buffer? + if it before then restart search + if after then keep searching till find it */ + + cifsFile = file->private_data; + if (cifsFile->srch_inf.endOfSearch) { + if (cifsFile->srch_inf.emptyDir) { + cifs_dbg(FYI, "End of search, empty dir\n"); + rc = 0; + goto rddir2_exit; + } + } /* else { + cifsFile->invalidHandle = true; + tcon->ses->server->close(xid, tcon, &cifsFile->fid); + } */ + + tcon = tlink_tcon(cifsFile->tlink); + rc = find_cifs_entry(xid, tcon, ctx->pos, file, ¤t_entry, + &num_to_fill); + if (rc) { + cifs_dbg(FYI, "fce error %d\n", rc); + goto rddir2_exit; + } else if (current_entry != NULL) { + cifs_dbg(FYI, "entry %lld found\n", ctx->pos); + } else { + cifs_dbg(FYI, "could not find entry\n"); + goto rddir2_exit; + } + cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", + num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); + max_len = tcon->ses->server->ops->calc_smb_size( + cifsFile->srch_inf.ntwrk_buf_start); + end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; + + tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); + if (tmp_buf == NULL) { + rc = -ENOMEM; + goto rddir2_exit; + } + + for (i = 0; i < num_to_fill; i++) { + if (current_entry == NULL) { + /* evaluate whether this case is an error */ + cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n", + num_to_fill, i); + break; + } + /* + * if buggy server returns . and .. late do we want to + * check for that here? + */ + rc = cifs_filldir(current_entry, file, ctx, + tmp_buf, max_len); + if (rc) { + if (rc > 0) + rc = 0; + break; + } + + ctx->pos++; + if (ctx->pos == + cifsFile->srch_inf.index_of_last_entry) { + cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", + ctx->pos, tmp_buf); + cifs_save_resume_key(current_entry, cifsFile); + break; + } else + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); + } + kfree(tmp_buf); + +rddir2_exit: + free_xid(xid); + return rc; +} diff --git a/kernel/fs/cifs/rfc1002pdu.h b/kernel/fs/cifs/rfc1002pdu.h new file mode 100644 index 000000000..8b69fcceb --- /dev/null +++ b/kernel/fs/cifs/rfc1002pdu.h @@ -0,0 +1,74 @@ +/* + * fs/cifs/rfc1002pdu.h + * + * Protocol Data Unit definitions for RFC 1001/1002 support + * + * Copyright (c) International Business Machines Corp., 2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */ + + /* RFC 1002 session packet types */ +#define RFC1002_SESSION_MESSAGE 0x00 +#define RFC1002_SESSION_REQUEST 0x81 +#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82 +#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83 +#define RFC1002_RETARGET_SESSION_RESPONSE 0x84 +#define RFC1002_SESSION_KEEP_ALIVE 0x85 + + /* RFC 1002 flags (only one defined */ +#define RFC1002_LENGTH_EXTEND 0x80 /* high order bit of length (ie +64K) */ + +struct rfc1002_session_packet { + __u8 type; + __u8 flags; + __u16 length; + union { + struct { + __u8 called_len; + __u8 called_name[32]; + __u8 scope1; /* null */ + __u8 calling_len; + __u8 calling_name[32]; + __u8 scope2; /* null */ + } __attribute__((packed)) session_req; + struct { + __u32 retarget_ip_addr; + __u16 port; + } __attribute__((packed)) retarget_resp; + __u8 neg_ses_resp_error_code; + /* POSITIVE_SESSION_RESPONSE packet does not include trailer. + SESSION_KEEP_ALIVE packet also does not include a trailer. + Trailer for the SESSION_MESSAGE packet is SMB/CIFS header */ + } __attribute__((packed)) trailer; +} __attribute__((packed)); + +/* Negative Session Response error codes */ +#define RFC1002_NOT_LISTENING_CALLED 0x80 /* not listening on called name */ +#define RFC1002_NOT_LISTENING_CALLING 0x81 /* not listening on calling name */ +#define RFC1002_NOT_PRESENT 0x82 /* called name not present */ +#define RFC1002_INSUFFICIENT_RESOURCE 0x83 +#define RFC1002_UNSPECIFIED_ERROR 0x8F + +/* RFC 1002 Datagram service packets are not defined here as they +are not needed for the network filesystem client unless we plan on +implementing broadcast resolution of the server ip address (from +server netbios name). Currently server names are resolved only via DNS +(tcp name) or ip address or an /etc/hosts equivalent mapping to ip address.*/ + +#define DEFAULT_CIFS_CALLED_NAME "*SMBSERVER " diff --git a/kernel/fs/cifs/sess.c b/kernel/fs/cifs/sess.c new file mode 100644 index 000000000..bce6fdcd5 --- /dev/null +++ b/kernel/fs/cifs/sess.c @@ -0,0 +1,1442 @@ +/* + * fs/cifs/sess.c + * + * SMB/CIFS session setup handling routines + * + * Copyright (c) International Business Machines Corp., 2006, 2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "ntlmssp.h" +#include "nterr.h" +#include +#include +#include "cifs_spnego.h" + +static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) +{ + __u32 capabilities = 0; + + /* init fields common to all four types of SessSetup */ + /* Note that offsets for first seven fields in req struct are same */ + /* in CIFS Specs so does not matter which of 3 forms of struct */ + /* that we use in next few lines */ + /* Note that header is initialized to zero in header_assemble */ + pSMB->req.AndXCommand = 0xFF; + pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, + USHRT_MAX)); + pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + pSMB->req.VcNumber = cpu_to_le16(1); + + /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ + + /* BB verify whether signing required on neg or just on auth frame + (and NTLM case) */ + + capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | + CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; + + if (ses->server->sign) + pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + if (ses->capabilities & CAP_UNICODE) { + pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE; + capabilities |= CAP_UNICODE; + } + if (ses->capabilities & CAP_STATUS32) { + pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS; + capabilities |= CAP_STATUS32; + } + if (ses->capabilities & CAP_DFS) { + pSMB->req.hdr.Flags2 |= SMBFLG2_DFS; + capabilities |= CAP_DFS; + } + if (ses->capabilities & CAP_UNIX) + capabilities |= CAP_UNIX; + + return capabilities; +} + +static void +unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* Copy OS version */ + bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32, + nls_cp); + bcc_ptr += 2 * bytes_ret; + bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release, + 32, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* trailing null */ + + bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, + 32, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* trailing null */ + + *pbcc_area = bcc_ptr; +} + +static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* copy domain */ + if (ses->domainName == NULL) { + /* Sending null domain better than using a bogus domain name (as + we did briefly in 2.6.18) since server will use its default */ + *bcc_ptr = 0; + *(bcc_ptr+1) = 0; + bytes_ret = 0; + } else + bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName, + CIFS_MAX_DOMAINNAME_LEN, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* account for null terminator */ + + *pbcc_area = bcc_ptr; +} + + +static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* BB FIXME add check that strings total less + than 335 or will need to send them as arrays */ + + /* unicode strings, must be word aligned before the call */ +/* if ((long) bcc_ptr % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } */ + /* copy user */ + if (ses->user_name == NULL) { + /* null user mount */ + *bcc_ptr = 0; + *(bcc_ptr+1) = 0; + } else { + bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name, + CIFS_MAX_USERNAME_LEN, nls_cp); + } + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* account for null termination */ + + unicode_domain_string(&bcc_ptr, ses, nls_cp); + unicode_oslm_strings(&bcc_ptr, nls_cp); + + *pbcc_area = bcc_ptr; +} + +static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + + /* copy user */ + /* BB what about null user mounts - check that we do this BB */ + /* copy user */ + if (ses->user_name != NULL) { + strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN); + bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN); + } + /* else null user mount */ + *bcc_ptr = 0; + bcc_ptr++; /* account for null termination */ + + /* copy domain */ + if (ses->domainName != NULL) { + strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + } /* else we will send a null domain name + so the server will default to its own domain */ + *bcc_ptr = 0; + bcc_ptr++; + + /* BB check for overflow here */ + + strcpy(bcc_ptr, "Linux version "); + bcc_ptr += strlen("Linux version "); + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; + + strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); + bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + + *pbcc_area = bcc_ptr; +} + +static void +decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int len; + char *data = *pbcc_area; + + cifs_dbg(FYI, "bleft %d\n", bleft); + + kfree(ses->serverOS); + ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp); + cifs_dbg(FYI, "serverOS=%s\n", ses->serverOS); + len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; + data += len; + bleft -= len; + if (bleft <= 0) + return; + + kfree(ses->serverNOS); + ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp); + cifs_dbg(FYI, "serverNOS=%s\n", ses->serverNOS); + len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; + data += len; + bleft -= len; + if (bleft <= 0) + return; + + kfree(ses->serverDomain); + ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp); + cifs_dbg(FYI, "serverDomain=%s\n", ses->serverDomain); + + return; +} + +static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, + struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int len; + char *bcc_ptr = *pbcc_area; + + cifs_dbg(FYI, "decode sessetup ascii. bleft %d\n", bleft); + + len = strnlen(bcc_ptr, bleft); + if (len >= bleft) + return; + + kfree(ses->serverOS); + + ses->serverOS = kzalloc(len + 1, GFP_KERNEL); + if (ses->serverOS) { + strncpy(ses->serverOS, bcc_ptr, len); + if (strncmp(ses->serverOS, "OS/2", 4) == 0) + cifs_dbg(FYI, "OS/2 server\n"); + } + + bcc_ptr += len + 1; + bleft -= len + 1; + + len = strnlen(bcc_ptr, bleft); + if (len >= bleft) + return; + + kfree(ses->serverNOS); + + ses->serverNOS = kzalloc(len + 1, GFP_KERNEL); + if (ses->serverNOS) + strncpy(ses->serverNOS, bcc_ptr, len); + + bcc_ptr += len + 1; + bleft -= len + 1; + + len = strnlen(bcc_ptr, bleft); + if (len > bleft) + return; + + /* No domain field in LANMAN case. Domain is + returned by old servers in the SMB negprot response */ + /* BB For newer servers which do not support Unicode, + but thus do return domain here we could add parsing + for it later, but it is not very important */ + cifs_dbg(FYI, "ascii: bytes left %d\n", bleft); +} + +int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, + struct cifs_ses *ses) +{ + unsigned int tioffset; /* challenge message target info area */ + unsigned int tilen; /* challenge message target info area length */ + + CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; + + if (blob_len < sizeof(CHALLENGE_MESSAGE)) { + cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len); + return -EINVAL; + } + + if (memcmp(pblob->Signature, "NTLMSSP", 8)) { + cifs_dbg(VFS, "blob signature incorrect %s\n", + pblob->Signature); + return -EINVAL; + } + if (pblob->MessageType != NtLmChallenge) { + cifs_dbg(VFS, "Incorrect message type %d\n", + pblob->MessageType); + return -EINVAL; + } + + memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); + /* BB we could decode pblob->NegotiateFlags; some may be useful */ + /* In particular we can examine sign flags */ + /* BB spec says that if AvId field of MsvAvTimestamp is populated then + we must set the MIC field of the AUTHENTICATE_MESSAGE */ + ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); + tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); + tilen = le16_to_cpu(pblob->TargetInfoArray.Length); + if (tioffset > blob_len || tioffset + tilen > blob_len) { + cifs_dbg(VFS, "tioffset + tilen too high %u + %u", + tioffset, tilen); + return -EINVAL; + } + if (tilen) { + ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, "Challenge target info alloc failure"); + return -ENOMEM; + } + ses->auth_key.len = tilen; + } + + return 0; +} + +/* BB Move to ntlmssp.c eventually */ + +/* We do not malloc the blob, it is passed in pbuffer, because + it is fixed size, and small, making this approach cleaner */ +void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, + struct cifs_ses *ses) +{ + NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; + __u32 flags; + + memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE)); + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmNegotiate; + + /* BB is NTLMV2 session security format easier to use here? */ + flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (ses->server->sign) { + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (!ses->server->session_estab || + ses->ntlmssp->sesskey_per_smbsess) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + } + + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + sec_blob->WorkstationName.BufferOffset = 0; + sec_blob->WorkstationName.Length = 0; + sec_blob->WorkstationName.MaximumLength = 0; + + /* Domain name is sent on the Challenge not Negotiate NTLMSSP request */ + sec_blob->DomainName.BufferOffset = 0; + sec_blob->DomainName.Length = 0; + sec_blob->DomainName.MaximumLength = 0; +} + +/* We do not malloc the blob, it is passed in pbuffer, because its + maximum possible size is fixed and small, making this approach cleaner. + This function returns the length of the data in the blob */ +int build_ntlmssp_auth_blob(unsigned char *pbuffer, + u16 *buflen, + struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc; + AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; + __u32 flags; + unsigned char *tmp; + + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmAuthenticate; + + flags = NTLMSSP_NEGOTIATE_56 | + NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (ses->server->sign) { + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (!ses->server->session_estab || + ses->ntlmssp->sesskey_per_smbsess) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + } + + tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + sec_blob->LmChallengeResponse.BufferOffset = + cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); + sec_blob->LmChallengeResponse.Length = 0; + sec_blob->LmChallengeResponse.MaximumLength = 0; + + sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); + goto setup_ntlmv2_ret; + } + memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + sec_blob->NtChallengeResponse.Length = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + sec_blob->NtChallengeResponse.MaximumLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->domainName == NULL) { + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.Length = 0; + sec_blob->DomainName.MaximumLength = 0; + tmp += 2; + } else { + int len; + len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, + CIFS_MAX_USERNAME_LEN, nls_cp); + len *= 2; /* unicode is 2 bytes each */ + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.Length = cpu_to_le16(len); + sec_blob->DomainName.MaximumLength = cpu_to_le16(len); + tmp += len; + } + + if (ses->user_name == NULL) { + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.Length = 0; + sec_blob->UserName.MaximumLength = 0; + tmp += 2; + } else { + int len; + len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, + CIFS_MAX_USERNAME_LEN, nls_cp); + len *= 2; /* unicode is 2 bytes each */ + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.Length = cpu_to_le16(len); + sec_blob->UserName.MaximumLength = cpu_to_le16(len); + tmp += len; + } + + sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->WorkstationName.Length = 0; + sec_blob->WorkstationName.MaximumLength = 0; + tmp += 2; + + if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) || + (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) + && !calc_seckey(ses)) { + memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.MaximumLength = + cpu_to_le16(CIFS_CPHTXT_SIZE); + tmp += CIFS_CPHTXT_SIZE; + } else { + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = 0; + sec_blob->SessionKey.MaximumLength = 0; + } + +setup_ntlmv2_ret: + *buflen = tmp - pbuffer; + return rc; +} + +enum securityEnum +select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) +{ + switch (server->negflavor) { + case CIFS_NEGFLAVOR_EXTENDED: + switch (requested) { + case Kerberos: + case RawNTLMSSP: + return requested; + case Unspecified: + if (server->sec_ntlmssp && + (global_secflags & CIFSSEC_MAY_NTLMSSP)) + return RawNTLMSSP; + if ((server->sec_kerberos || server->sec_mskerberos) && + (global_secflags & CIFSSEC_MAY_KRB5)) + return Kerberos; + /* Fallthrough */ + default: + return Unspecified; + } + case CIFS_NEGFLAVOR_UNENCAP: + switch (requested) { + case NTLM: + case NTLMv2: + return requested; + case Unspecified: + if (global_secflags & CIFSSEC_MAY_NTLMV2) + return NTLMv2; + if (global_secflags & CIFSSEC_MAY_NTLM) + return NTLM; + default: + /* Fallthrough to attempt LANMAN authentication next */ + break; + } + case CIFS_NEGFLAVOR_LANMAN: + switch (requested) { + case LANMAN: + return requested; + case Unspecified: + if (global_secflags & CIFSSEC_MAY_LANMAN) + return LANMAN; + /* Fallthrough */ + default: + return Unspecified; + } + default: + return Unspecified; + } +} + +struct sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct sess_data *); + int result; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[3]; +}; + +static int +sess_alloc_buffer(struct sess_data *sess_data, int wct) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb_hdr *smb_buf; + + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf); + + if (rc) + return rc; + + sess_data->iov[0].iov_base = (char *)smb_buf; + sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL); + if (!sess_data->iov[2].iov_base) { + rc = -ENOMEM; + goto out_free_smb_buf; + } + + return 0; + +out_free_smb_buf: + kfree(smb_buf); + sess_data->iov[0].iov_base = NULL; + sess_data->iov[0].iov_len = 0; + sess_data->buf0_type = CIFS_NO_BUFFER; + return rc; +} + +static void +sess_free_buffer(struct sess_data *sess_data) +{ + + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; + kfree(sess_data->iov[2].iov_base); +} + +static int +sess_establish_session(struct sess_data *sess_data) +{ + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + mutex_unlock(&ses->server->srv_mutex); + return -ENOMEM; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; + } + mutex_unlock(&ses->server->srv_mutex); + + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + + return 0; +} + +static int +sess_sendreceive(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base; + __u16 count; + + count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + put_bcc(count, smb_buf); + + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 3 /* num_iovecs */, + &sess_data->buf0_type, + CIFS_LOG_ERROR); + + return rc; +} + +/* + * LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 + */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + __u32 capabilities; + __u16 bytes_remaining; + + /* lanman 2 style sessionsetup */ + /* wct = 10 */ + rc = sess_alloc_buffer(sess_data, 10); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key); + + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* + * can not sign if LANMAN negotiated so no need + * to calculate signing key? but what if server + * changed to do higher than lanman dialect and + * we reconnected would we ever calc signing_key? + */ + + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + /* lanman response has a word count of 3 */ + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); +} + +#endif + +static void +sess_auth_ntlm(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLM authentication\n", + rc); + goto out; + } + + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (sess_data->iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } else { + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } + + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +static void +sess_auth_ntlmv2(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); + goto out; + } + + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. + */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->capabilities & CAP_UNICODE) { + if (sess_data->iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } else { + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } + + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + struct key *spnego_key = NULL; + struct cifs_spnego_msg *msg; + u16 blob_len; + + /* extended security */ + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; + } + + msg = spnego_key->payload.data; + /* + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form + */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "incorrect version of cifs.upcall (expected %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); + } else { + /* BB: is this right? */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out_put_spnego_key; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_put_spnego_key; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#endif /* ! CONFIG_CIFS_UPCALL */ + +/* + * The required kvec buffers have to be allocated before calling this + * function. + */ +static int +_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) +{ + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + char *bcc_ptr; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); + return -ENOSYS; + } + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + + bcc_ptr = sess_data->iov[2].iov_base; + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + return 0; +} + +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data); + +static void +sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n"); + + /* + * if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out; + } + ses->ntlmssp->sesskey_per_smbsess = false; + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + + /* Build security blob before we assemble the request */ + build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses); + sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + sess_data->iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out; + + rc = sess_sendreceive(sess_data); + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)) + rc = 0; + + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out; + } + + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); +out: + sess_free_buffer(sess_data); + + if (!rc) { + sess_data->func = sess_auth_rawntlmssp_authenticate; + return; + } + + /* Else error. Cleanup */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} + +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + char *ntlmsspblob = NULL; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + /* Build security blob before we assemble the request */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. + */ + ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + rc = -ENOMEM; + goto out; + } + + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, sess_data->nls_cp); + if (rc) + goto out_free_ntlmsspblob; + sess_data->iov[1].iov_len = blob_len; + sess_data->iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out_free_ntlmsspblob; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + if (ses->Suid != smb_buf->Uid) { + ses->Suid = smb_buf->Uid; + cifs_dbg(FYI, "UID changed! new UID = %llu\n", ses->Suid); + } + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_free_ntlmsspblob; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + +out_free_ntlmsspblob: + kfree(ntlmsspblob); +out: + sess_free_buffer(sess_data); + + if (!rc) + rc = sess_establish_session(sess_data); + + /* Cleanup */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} + +static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) +{ + int type; + + type = select_sectype(ses->server, ses->sectype); + cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); + return -EINVAL; + } + + switch (type) { + case LANMAN: + /* LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + sess_data->func = sess_auth_lanman; + break; +#else + return -EOPNOTSUPP; +#endif + case NTLM: + sess_data->func = sess_auth_ntlm; + break; + case NTLMv2: + sess_data->func = sess_auth_ntlmv2; + break; + case Kerberos: +#ifdef CONFIG_CIFS_UPCALL + sess_data->func = sess_auth_kerberos; + break; +#else + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + return -ENOSYS; + break; +#endif /* CONFIG_CIFS_UPCALL */ + case RawNTLMSSP: + sess_data->func = sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", type); + return -ENOSYS; + } + + return 0; +} + +int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct sess_data *sess_data; + + if (ses == NULL) { + WARN(1, "%s: ses == NULL!", __func__); + return -EINVAL; + } + + sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = select_sec(ses, sess_data); + if (rc) + goto out; + + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + /* Store result before we free sess_data */ + rc = sess_data->result; + +out: + kfree(sess_data); + return rc; +} diff --git a/kernel/fs/cifs/smb1ops.c b/kernel/fs/cifs/smb1ops.c new file mode 100644 index 000000000..fc537c290 --- /dev/null +++ b/kernel/fs/cifs/smb1ops.c @@ -0,0 +1,1116 @@ +/* + * SMB1 (CIFS) version specific operations + * + * Copyright (c) 2012, Jeff Layton + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2 as published + * by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifspdu.h" +#include "cifs_unicode.h" + +/* + * An NT cancel request header looks just like the original request except: + * + * The Command is SMB_COM_NT_CANCEL + * The WordCount is zeroed out + * The ByteCount is zeroed out + * + * This function mangles an existing request buffer into a + * SMB_COM_NT_CANCEL request and then sends it. + */ +static int +send_nt_cancel(struct TCP_Server_Info *server, void *buf, + struct mid_q_entry *mid) +{ + int rc = 0; + struct smb_hdr *in_buf = (struct smb_hdr *)buf; + + /* -4 for RFC1001 length and +2 for BCC field */ + in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2); + in_buf->Command = SMB_COM_NT_CANCEL; + in_buf->WordCount = 0; + put_bcc(0, in_buf); + + mutex_lock(&server->srv_mutex); + rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + return rc; + } + + /* + * The response to this call was already factored into the sequence + * number when the call went out, so we must adjust it back downward + * after signing here. + */ + --server->sequence_number; + rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + if (rc < 0) + server->sequence_number--; + + mutex_unlock(&server->srv_mutex); + + cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n", + get_mid(in_buf), rc); + + return rc; +} + +static bool +cifs_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2) +{ + return ob1->fid.netfid == ob2->fid.netfid; +} + +static unsigned int +cifs_read_data_offset(char *buf) +{ + READ_RSP *rsp = (READ_RSP *)buf; + return le16_to_cpu(rsp->DataOffset); +} + +static unsigned int +cifs_read_data_length(char *buf) +{ + READ_RSP *rsp = (READ_RSP *)buf; + return (le16_to_cpu(rsp->DataLengthHigh) << 16) + + le16_to_cpu(rsp->DataLength); +} + +static struct mid_q_entry * +cifs_find_mid(struct TCP_Server_Info *server, char *buffer) +{ + struct smb_hdr *buf = (struct smb_hdr *)buffer; + struct mid_q_entry *mid; + + spin_lock(&GlobalMid_Lock); + list_for_each_entry(mid, &server->pending_mid_q, qhead) { + if (compare_mid(mid->mid, buf) && + mid->mid_state == MID_REQUEST_SUBMITTED && + le16_to_cpu(mid->command) == buf->Command) { + spin_unlock(&GlobalMid_Lock); + return mid; + } + } + spin_unlock(&GlobalMid_Lock); + return NULL; +} + +static void +cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add, + const int optype) +{ + spin_lock(&server->req_lock); + server->credits += add; + server->in_flight--; + spin_unlock(&server->req_lock); + wake_up(&server->request_q); +} + +static void +cifs_set_credits(struct TCP_Server_Info *server, const int val) +{ + spin_lock(&server->req_lock); + server->credits = val; + server->oplocks = val > 1 ? enable_oplocks : false; + spin_unlock(&server->req_lock); +} + +static int * +cifs_get_credits_field(struct TCP_Server_Info *server, const int optype) +{ + return &server->credits; +} + +static unsigned int +cifs_get_credits(struct mid_q_entry *mid) +{ + return 1; +} + +/* + * Find a free multiplex id (SMB mid). Otherwise there could be + * mid collisions which might cause problems, demultiplexing the + * wrong response to this request. Multiplex ids could collide if + * one of a series requests takes much longer than the others, or + * if a very large number of long lived requests (byte range + * locks or FindNotify requests) are pending. No more than + * 64K-1 requests can be outstanding at one time. If no + * mids are available, return zero. A future optimization + * could make the combination of mids and uid the key we use + * to demultiplex on (rather than mid alone). + * In addition to the above check, the cifs demultiplex + * code already used the command code as a secondary + * check of the frame and if signing is negotiated the + * response would be discarded if the mid were the same + * but the signature was wrong. Since the mid is not put in the + * pending queue until later (when it is about to be dispatched) + * we do have to limit the number of outstanding requests + * to somewhat less than 64K-1 although it is hard to imagine + * so many threads being in the vfs at one time. + */ +static __u64 +cifs_get_next_mid(struct TCP_Server_Info *server) +{ + __u64 mid = 0; + __u16 last_mid, cur_mid; + bool collision; + + spin_lock(&GlobalMid_Lock); + + /* mid is 16 bit only for CIFS/SMB */ + cur_mid = (__u16)((server->CurrentMid) & 0xffff); + /* we do not want to loop forever */ + last_mid = cur_mid; + cur_mid++; + + /* + * This nested loop looks more expensive than it is. + * In practice the list of pending requests is short, + * fewer than 50, and the mids are likely to be unique + * on the first pass through the loop unless some request + * takes longer than the 64 thousand requests before it + * (and it would also have to have been a request that + * did not time out). + */ + while (cur_mid != last_mid) { + struct mid_q_entry *mid_entry; + unsigned int num_mids; + + collision = false; + if (cur_mid == 0) + cur_mid++; + + num_mids = 0; + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { + ++num_mids; + if (mid_entry->mid == cur_mid && + mid_entry->mid_state == MID_REQUEST_SUBMITTED) { + /* This mid is in use, try a different one */ + collision = true; + break; + } + } + + /* + * if we have more than 32k mids in the list, then something + * is very wrong. Possibly a local user is trying to DoS the + * box by issuing long-running calls and SIGKILL'ing them. If + * we get to 2^16 mids then we're in big trouble as this + * function could loop forever. + * + * Go ahead and assign out the mid in this situation, but force + * an eventual reconnect to clean out the pending_mid_q. + */ + if (num_mids > 32768) + server->tcpStatus = CifsNeedReconnect; + + if (!collision) { + mid = (__u64)cur_mid; + server->CurrentMid = mid; + break; + } + cur_mid++; + } + spin_unlock(&GlobalMid_Lock); + return mid; +} + +/* + return codes: + 0 not a transact2, or all data present + >0 transact2 with that much data missing + -EINVAL invalid transact2 + */ +static int +check2ndT2(char *buf) +{ + struct smb_hdr *pSMB = (struct smb_hdr *)buf; + struct smb_t2_rsp *pSMBt; + int remaining; + __u16 total_data_size, data_in_this_rsp; + + if (pSMB->Command != SMB_COM_TRANSACTION2) + return 0; + + /* check for plausible wct, bcc and t2 data and parm sizes */ + /* check for parm and data offset going beyond end of smb */ + if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ + cifs_dbg(FYI, "invalid transact2 word count\n"); + return -EINVAL; + } + + pSMBt = (struct smb_t2_rsp *)pSMB; + + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); + + if (total_data_size == data_in_this_rsp) + return 0; + else if (total_data_size < data_in_this_rsp) { + cifs_dbg(FYI, "total data %d smaller than data in frame %d\n", + total_data_size, data_in_this_rsp); + return -EINVAL; + } + + remaining = total_data_size - data_in_this_rsp; + + cifs_dbg(FYI, "missing %d bytes from transact2, check next response\n", + remaining); + if (total_data_size > CIFSMaxBufSize) { + cifs_dbg(VFS, "TotalDataSize %d is over maximum buffer %d\n", + total_data_size, CIFSMaxBufSize); + return -EINVAL; + } + return remaining; +} + +static int +coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) +{ + struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)second_buf; + struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)target_hdr; + char *data_area_of_tgt; + char *data_area_of_src; + int remaining; + unsigned int byte_count, total_in_tgt; + __u16 tgt_total_cnt, src_total_cnt, total_in_src; + + src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount); + tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + + if (tgt_total_cnt != src_total_cnt) + cifs_dbg(FYI, "total data count of primary and secondary t2 differ source=%hu target=%hu\n", + src_total_cnt, tgt_total_cnt); + + total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); + + remaining = tgt_total_cnt - total_in_tgt; + + if (remaining < 0) { + cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n", + tgt_total_cnt, total_in_tgt); + return -EPROTO; + } + + if (remaining == 0) { + /* nothing to do, ignore */ + cifs_dbg(FYI, "no more data remains\n"); + return 0; + } + + total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount); + if (remaining < total_in_src) + cifs_dbg(FYI, "transact2 2nd response contains too much data\n"); + + /* find end of first SMB data area */ + data_area_of_tgt = (char *)&pSMBt->hdr.Protocol + + get_unaligned_le16(&pSMBt->t2_rsp.DataOffset); + + /* validate target area */ + data_area_of_src = (char *)&pSMBs->hdr.Protocol + + get_unaligned_le16(&pSMBs->t2_rsp.DataOffset); + + data_area_of_tgt += total_in_tgt; + + total_in_tgt += total_in_src; + /* is the result too big for the field? */ + if (total_in_tgt > USHRT_MAX) { + cifs_dbg(FYI, "coalesced DataCount too large (%u)\n", + total_in_tgt); + return -EPROTO; + } + put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount); + + /* fix up the BCC */ + byte_count = get_bcc(target_hdr); + byte_count += total_in_src; + /* is the result too big for the field? */ + if (byte_count > USHRT_MAX) { + cifs_dbg(FYI, "coalesced BCC too large (%u)\n", byte_count); + return -EPROTO; + } + put_bcc(byte_count, target_hdr); + + byte_count = be32_to_cpu(target_hdr->smb_buf_length); + byte_count += total_in_src; + /* don't allow buffer to overflow */ + if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cifs_dbg(FYI, "coalesced BCC exceeds buffer size (%u)\n", + byte_count); + return -ENOBUFS; + } + target_hdr->smb_buf_length = cpu_to_be32(byte_count); + + /* copy second buffer into end of first buffer */ + memcpy(data_area_of_tgt, data_area_of_src, total_in_src); + + if (remaining != total_in_src) { + /* more responses to go */ + cifs_dbg(FYI, "waiting for more secondary responses\n"); + return 1; + } + + /* we are done */ + cifs_dbg(FYI, "found the last secondary response\n"); + return 0; +} + +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + +static bool +cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, + char *buf, int malformed) +{ + if (malformed) + return false; + if (check2ndT2(buf) <= 0) + return false; + mid->multiRsp = true; + if (mid->resp_buf) { + /* merge response - fix up 1st*/ + malformed = coalesce_t2(buf, mid->resp_buf); + if (malformed > 0) + return true; + /* All parts received or packet is malformed. */ + mid->multiEnd = true; + dequeue_mid(mid, malformed); + return true; + } + if (!server->large_buf) { + /*FIXME: switch to already allocated largebuf?*/ + cifs_dbg(VFS, "1st trans2 resp needs bigbuf\n"); + } else { + /* Have first buffer */ + mid->resp_buf = buf; + mid->large_buf = true; + server->bigbuf = NULL; + } + return true; +} + +static bool +cifs_need_neg(struct TCP_Server_Info *server) +{ + return server->maxBuf == 0; +} + +static int +cifs_negotiate(const unsigned int xid, struct cifs_ses *ses) +{ + int rc; + rc = CIFSSMBNegotiate(xid, ses); + if (rc == -EAGAIN) { + /* retry only once on 1st time connection */ + set_credits(ses->server, 1); + rc = CIFSSMBNegotiate(xid, ses); + if (rc == -EAGAIN) + rc = -EHOSTDOWN; + } + return rc; +} + +static unsigned int +cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int wsize; + + /* start with specified wsize, or default */ + if (volume_info->wsize) + wsize = volume_info->wsize; + else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) + wsize = CIFS_DEFAULT_IOSIZE; + else + wsize = CIFS_DEFAULT_NON_POSIX_WSIZE; + + /* can server support 24-bit write sizes? (via UNIX extensions) */ + if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) + wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE); + + /* + * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set? + * Limit it to max buffer offered by the server, minus the size of the + * WRITEX header, not including the 4 byte RFC1001 length. + */ + if (!(server->capabilities & CAP_LARGE_WRITE_X) || + (!(server->capabilities & CAP_UNIX) && server->sign)) + wsize = min_t(unsigned int, wsize, + server->maxBuf - sizeof(WRITE_REQ) + 4); + + /* hard limit of CIFS_MAX_WSIZE */ + wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); + + return wsize; +} + +static unsigned int +cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize, defsize; + + /* + * Set default value... + * + * HACK alert! Ancient servers have very small buffers. Even though + * MS-CIFS indicates that servers are only limited by the client's + * bufsize for reads, testing against win98se shows that it throws + * INVALID_PARAMETER errors if you try to request too large a read. + * OS/2 just sends back short reads. + * + * If the server doesn't advertise CAP_LARGE_READ_X, then assume that + * it can't handle a read request larger than its MaxBufferSize either. + */ + if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) + defsize = CIFS_DEFAULT_IOSIZE; + else if (server->capabilities & CAP_LARGE_READ_X) + defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; + else + defsize = server->maxBuf - sizeof(READ_RSP); + + rsize = volume_info->rsize ? volume_info->rsize : defsize; + + /* + * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to + * the client's MaxBufferSize. + */ + if (!(server->capabilities & CAP_LARGE_READ_X)) + rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + + /* hard limit of CIFS_MAX_RSIZE */ + rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); + + return rsize; +} + +static void +cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +{ + CIFSSMBQFSDeviceInfo(xid, tcon); + CIFSSMBQFSAttributeInfo(xid, tcon); +} + +static int +cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path) +{ + int rc; + FILE_ALL_INFO *file_info; + + file_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (file_info == NULL) + return -ENOMEM; + + rc = CIFSSMBQPathInfo(xid, tcon, full_path, file_info, + 0 /* not legacy */, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + + if (rc == -EOPNOTSUPP || rc == -EINVAL) + rc = SMBQueryInformation(xid, tcon, full_path, file_info, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + kfree(file_info); + return rc; +} + +static int +cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink) +{ + int rc; + + *symlink = false; + + /* could do find first instead but this returns more info */ + rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + /* + * BB optimize code so we do not make the above call when server claims + * no NT SMB support and the above call failed at least once - set flag + * in tcon or mount. + */ + if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { + rc = SMBQueryInformation(xid, tcon, full_path, data, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + *adjustTZ = true; + } + + if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { + int tmprc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.create_options = 0; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + /* Need to check if this is a symbolic link or not */ + tmprc = CIFS_open(xid, &oparms, &oplock, NULL); + if (tmprc == -EOPNOTSUPP) + *symlink = true; + else if (tmprc == 0) + CIFSSMBClose(xid, tcon, fid.netfid); + } + + return rc; +} + +static int +cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, FILE_ALL_INFO *data) +{ + /* + * We can not use the IndexNumber field by default from Windows or + * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA + * CIFS spec claims that this value is unique within the scope of a + * share, and the windows docs hint that it's actually unique + * per-machine. + * + * There may be higher info levels that work but are there Windows + * server or network appliances for which IndexNumber field is not + * guaranteed unique? + */ + return CIFSGetSrvInodeNumber(xid, tcon, full_path, uniqueid, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); +} + +static int +cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid, FILE_ALL_INFO *data) +{ + return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); +} + +static void +cifs_clear_stats(struct cifs_tcon *tcon) +{ +#ifdef CONFIG_CIFS_STATS + atomic_set(&tcon->stats.cifs_stats.num_writes, 0); + atomic_set(&tcon->stats.cifs_stats.num_reads, 0); + atomic_set(&tcon->stats.cifs_stats.num_flushes, 0); + atomic_set(&tcon->stats.cifs_stats.num_oplock_brks, 0); + atomic_set(&tcon->stats.cifs_stats.num_opens, 0); + atomic_set(&tcon->stats.cifs_stats.num_posixopens, 0); + atomic_set(&tcon->stats.cifs_stats.num_posixmkdirs, 0); + atomic_set(&tcon->stats.cifs_stats.num_closes, 0); + atomic_set(&tcon->stats.cifs_stats.num_deletes, 0); + atomic_set(&tcon->stats.cifs_stats.num_mkdirs, 0); + atomic_set(&tcon->stats.cifs_stats.num_rmdirs, 0); + atomic_set(&tcon->stats.cifs_stats.num_renames, 0); + atomic_set(&tcon->stats.cifs_stats.num_t2renames, 0); + atomic_set(&tcon->stats.cifs_stats.num_ffirst, 0); + atomic_set(&tcon->stats.cifs_stats.num_fnext, 0); + atomic_set(&tcon->stats.cifs_stats.num_fclose, 0); + atomic_set(&tcon->stats.cifs_stats.num_hardlinks, 0); + atomic_set(&tcon->stats.cifs_stats.num_symlinks, 0); + atomic_set(&tcon->stats.cifs_stats.num_locks, 0); + atomic_set(&tcon->stats.cifs_stats.num_acl_get, 0); + atomic_set(&tcon->stats.cifs_stats.num_acl_set, 0); +#endif +} + +static void +cifs_print_stats(struct seq_file *m, struct cifs_tcon *tcon) +{ +#ifdef CONFIG_CIFS_STATS + seq_printf(m, " Oplocks breaks: %d", + atomic_read(&tcon->stats.cifs_stats.num_oplock_brks)); + seq_printf(m, "\nReads: %d Bytes: %llu", + atomic_read(&tcon->stats.cifs_stats.num_reads), + (long long)(tcon->bytes_read)); + seq_printf(m, "\nWrites: %d Bytes: %llu", + atomic_read(&tcon->stats.cifs_stats.num_writes), + (long long)(tcon->bytes_written)); + seq_printf(m, "\nFlushes: %d", + atomic_read(&tcon->stats.cifs_stats.num_flushes)); + seq_printf(m, "\nLocks: %d HardLinks: %d Symlinks: %d", + atomic_read(&tcon->stats.cifs_stats.num_locks), + atomic_read(&tcon->stats.cifs_stats.num_hardlinks), + atomic_read(&tcon->stats.cifs_stats.num_symlinks)); + seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d", + atomic_read(&tcon->stats.cifs_stats.num_opens), + atomic_read(&tcon->stats.cifs_stats.num_closes), + atomic_read(&tcon->stats.cifs_stats.num_deletes)); + seq_printf(m, "\nPosix Opens: %d Posix Mkdirs: %d", + atomic_read(&tcon->stats.cifs_stats.num_posixopens), + atomic_read(&tcon->stats.cifs_stats.num_posixmkdirs)); + seq_printf(m, "\nMkdirs: %d Rmdirs: %d", + atomic_read(&tcon->stats.cifs_stats.num_mkdirs), + atomic_read(&tcon->stats.cifs_stats.num_rmdirs)); + seq_printf(m, "\nRenames: %d T2 Renames %d", + atomic_read(&tcon->stats.cifs_stats.num_renames), + atomic_read(&tcon->stats.cifs_stats.num_t2renames)); + seq_printf(m, "\nFindFirst: %d FNext %d FClose %d", + atomic_read(&tcon->stats.cifs_stats.num_ffirst), + atomic_read(&tcon->stats.cifs_stats.num_fnext), + atomic_read(&tcon->stats.cifs_stats.num_fclose)); +#endif +} + +static void +cifs_mkdir_setinfo(struct inode *inode, const char *full_path, + struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, + const unsigned int xid) +{ + FILE_BASIC_INFO info; + struct cifsInodeInfo *cifsInode; + u32 dosattrs; + int rc; + + memset(&info, 0, sizeof(info)); + cifsInode = CIFS_I(inode); + dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; + info.Attributes = cpu_to_le32(dosattrs); + rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc == 0) + cifsInode->cifsAttrs = dosattrs; +} + +static int +cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, + __u32 *oplock, FILE_ALL_INFO *buf) +{ + if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS)) + return SMBLegacyOpen(xid, oparms->tcon, oparms->path, + oparms->disposition, + oparms->desired_access, + oparms->create_options, + &oparms->fid->netfid, oplock, buf, + oparms->cifs_sb->local_nls, + cifs_remap(oparms->cifs_sb)); + return CIFS_open(xid, oparms, oplock, buf); +} + +static void +cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) +{ + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + cfile->fid.netfid = fid->netfid; + cifs_set_oplock_level(cinode, oplock); + cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); +} + +static void +cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + CIFSSMBClose(xid, tcon, fid->netfid); +} + +static int +cifs_flush_file(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + return CIFSSMBFlush(xid, tcon, fid->netfid); +} + +static int +cifs_sync_read(const unsigned int xid, struct cifs_fid *pfid, + struct cifs_io_parms *parms, unsigned int *bytes_read, + char **buf, int *buf_type) +{ + parms->netfid = pfid->netfid; + return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type); +} + +static int +cifs_sync_write(const unsigned int xid, struct cifs_fid *pfid, + struct cifs_io_parms *parms, unsigned int *written, + struct kvec *iov, unsigned long nr_segs) +{ + + parms->netfid = pfid->netfid; + return CIFSSMBWrite2(xid, parms, written, iov, nr_segs); +} + +static int +smb_set_file_info(struct inode *inode, const char *full_path, + FILE_BASIC_INFO *buf, const unsigned int xid) +{ + int oplock = 0; + int rc; + __u32 netpid; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *tcon; + + /* if the file is already open for write, just use that fileid */ + open_file = find_writable_file(cinode, true); + if (open_file) { + fid.netfid = open_file->fid.netfid; + netpid = open_file->pid; + tcon = tlink_tcon(open_file->tlink); + goto set_via_filehandle; + } + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + tcon = tlink_tcon(tlink); + + rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc == 0) { + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + goto out; + } else if (rc != -EOPNOTSUPP && rc != -EINVAL) { + goto out; + } + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc != 0) { + if (rc == -EIO) + rc = -EINVAL; + goto out; + } + + netpid = current->tgid; + +set_via_filehandle: + rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid); + if (!rc) + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + + if (open_file == NULL) + CIFSSMBClose(xid, tcon, fid.netfid); + else + cifsFileInfo_put(open_file); +out: + if (tlink != NULL) + cifs_put_tlink(tlink); + return rc; +} + +static int +cifs_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + return CIFSSMB_set_compression(xid, tcon, cfile->fid.netfid); +} + +static int +cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, + const char *path, struct cifs_sb_info *cifs_sb, + struct cifs_fid *fid, __u16 search_flags, + struct cifs_search_info *srch_inf) +{ + return CIFSFindFirst(xid, tcon, path, cifs_sb, + &fid->netfid, search_flags, srch_inf, true); +} + +static int +cifs_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid, __u16 search_flags, + struct cifs_search_info *srch_inf) +{ + return CIFSFindNext(xid, tcon, fid->netfid, search_flags, srch_inf); +} + +static int +cifs_close_dir(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + return CIFSFindClose(xid, tcon, fid->netfid); +} + +static int +cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, + struct cifsInodeInfo *cinode) +{ + return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0, + LOCKING_ANDX_OPLOCK_RELEASE, false, + CIFS_CACHE_READ(cinode) ? 1 : 0); +} + +static int +cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *buf) +{ + int rc = -EOPNOTSUPP; + + buf->f_type = CIFS_MAGIC_NUMBER; + + /* + * We could add a second check for a QFS Unix capability bit + */ + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability))) + rc = CIFSSMBQFSPosixInfo(xid, tcon, buf); + + /* + * Only need to call the old QFSInfo if failed on newer one, + * e.g. by OS/2. + **/ + if (rc && (tcon->ses->capabilities & CAP_NT_SMBS)) + rc = CIFSSMBQFSInfo(xid, tcon, buf); + + /* + * Some old Windows servers also do not support level 103, retry with + * older level one if old server failed the previous call or we + * bypassed it because we detected that this was an older LANMAN sess + */ + if (rc) + rc = SMBOldQFSInfo(xid, tcon, buf); + return rc; +} + +static int +cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, + __u64 length, __u32 type, int lock, int unlock, bool wait) +{ + return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->fid.netfid, + current->tgid, length, offset, unlock, lock, + (__u8)type, wait, 0); +} + +static int +cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage) +{ +#ifdef CONFIG_CIFS_DFS_UPCALL + int rc; + unsigned int num_referrals = 0; + struct dfs_info3_param *referrals = NULL; + + rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage, + &num_referrals, &referrals, 0); + + if (!rc && num_referrals > 0) { + *symlinkinfo = kstrndup(referrals->node_name, + strlen(referrals->node_name), + GFP_KERNEL); + if (!*symlinkinfo) + rc = -ENOMEM; + free_dfs_info_array(referrals, num_referrals); + } + return rc; +#else /* No DFS support */ + return -EREMOTE; +#endif +} + +static int +cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, char **target_path, + struct cifs_sb_info *cifs_sb) +{ + int rc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + /* Check for unix extensions */ + if (cap_unix(tcon->ses)) { + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc == -EREMOTE) + rc = cifs_unix_dfs_readlink(xid, tcon, full_path, + target_path, + cifs_sb->local_nls); + + goto out; + } + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.create_options = OPEN_REPARSE_POINT; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) + goto out; + + rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path, + cifs_sb->local_nls); + if (rc) + goto out_close; + + convert_delimiter(*target_path, '/'); +out_close: + CIFSSMBClose(xid, tcon, fid.netfid); +out: + if (!rc) + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + return rc; +} + +static bool +cifs_is_read_op(__u32 oplock) +{ + return oplock == OPLOCK_READ; +} + +static unsigned int +cifs_wp_retry_size(struct inode *inode) +{ + return CIFS_SB(inode->i_sb)->wsize; +} + +static bool +cifs_dir_needs_close(struct cifsFileInfo *cfile) +{ + return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; +} + +struct smb_version_operations smb1_operations = { + .send_cancel = send_nt_cancel, + .compare_fids = cifs_compare_fids, + .setup_request = cifs_setup_request, + .setup_async_request = cifs_setup_async_request, + .check_receive = cifs_check_receive, + .add_credits = cifs_add_credits, + .set_credits = cifs_set_credits, + .get_credits_field = cifs_get_credits_field, + .get_credits = cifs_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, + .get_next_mid = cifs_get_next_mid, + .read_data_offset = cifs_read_data_offset, + .read_data_length = cifs_read_data_length, + .map_error = map_smb_to_linux_error, + .find_mid = cifs_find_mid, + .check_message = checkSMB, + .dump_detail = cifs_dump_detail, + .clear_stats = cifs_clear_stats, + .print_stats = cifs_print_stats, + .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, + .check_trans2 = cifs_check_trans2, + .need_neg = cifs_need_neg, + .negotiate = cifs_negotiate, + .negotiate_wsize = cifs_negotiate_wsize, + .negotiate_rsize = cifs_negotiate_rsize, + .sess_setup = CIFS_SessSetup, + .logoff = CIFSSMBLogoff, + .tree_connect = CIFSTCon, + .tree_disconnect = CIFSSMBTDis, + .get_dfs_refer = CIFSGetDFSRefer, + .qfs_tcon = cifs_qfs_tcon, + .is_path_accessible = cifs_is_path_accessible, + .query_path_info = cifs_query_path_info, + .query_file_info = cifs_query_file_info, + .get_srv_inum = cifs_get_srv_inum, + .set_path_size = CIFSSMBSetEOF, + .set_file_size = CIFSSMBSetFileSize, + .set_file_info = smb_set_file_info, + .set_compression = cifs_set_compression, + .echo = CIFSSMBEcho, + .mkdir = CIFSSMBMkDir, + .mkdir_setinfo = cifs_mkdir_setinfo, + .rmdir = CIFSSMBRmDir, + .unlink = CIFSSMBDelFile, + .rename_pending_delete = cifs_rename_pending_delete, + .rename = CIFSSMBRename, + .create_hardlink = CIFSCreateHardLink, + .query_symlink = cifs_query_symlink, + .open = cifs_open_file, + .set_fid = cifs_set_fid, + .close = cifs_close_file, + .flush = cifs_flush_file, + .async_readv = cifs_async_readv, + .async_writev = cifs_async_writev, + .sync_read = cifs_sync_read, + .sync_write = cifs_sync_write, + .query_dir_first = cifs_query_dir_first, + .query_dir_next = cifs_query_dir_next, + .close_dir = cifs_close_dir, + .calc_smb_size = smbCalcSize, + .oplock_response = cifs_oplock_response, + .queryfs = cifs_queryfs, + .mand_lock = cifs_mand_lock, + .mand_unlock_range = cifs_unlock_range, + .push_mand_locks = cifs_push_mandatory_locks, + .query_mf_symlink = cifs_query_mf_symlink, + .create_mf_symlink = cifs_create_mf_symlink, + .is_read_op = cifs_is_read_op, + .wp_retry_size = cifs_wp_retry_size, + .dir_needs_close = cifs_dir_needs_close, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = CIFSSMBQAllEAs, + .set_EA = CIFSSMBSetEA, +#endif /* CIFS_XATTR */ +#ifdef CONFIG_CIFS_ACL + .get_acl = get_cifs_acl, + .get_acl_by_fid = get_cifs_acl_by_fid, + .set_acl = set_cifs_acl, +#endif /* CIFS_ACL */ +}; + +struct smb_version_values smb1_values = { + .version_string = SMB1_VERSION_STRING, + .large_lock_type = LOCKING_ANDX_LARGE_FILES, + .exclusive_lock_type = 0, + .shared_lock_type = LOCKING_ANDX_SHARED_LOCK, + .unlock_lock_type = 0, + .header_size = sizeof(struct smb_hdr), + .max_header_size = MAX_CIFS_HDR_SIZE, + .read_rsp_size = sizeof(READ_RSP), + .lock_cmd = cpu_to_le16(SMB_COM_LOCKING_ANDX), + .cap_unix = CAP_UNIX, + .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, + .cap_large_files = CAP_LARGE_FILES, + .signing_enabled = SECMODE_SIGN_ENABLED, + .signing_required = SECMODE_SIGN_REQUIRED, +}; diff --git a/kernel/fs/cifs/smb2file.c b/kernel/fs/cifs/smb2file.c new file mode 100644 index 000000000..2ab297dae --- /dev/null +++ b/kernel/fs/cifs/smb2file.c @@ -0,0 +1,265 @@ +/* + * fs/cifs/smb2file.c + * + * Copyright (C) International Business Machines Corp., 2002, 2011 + * Author(s): Steve French (sfrench@us.ibm.com), + * Pavel Shilovsky ((pshilovsky@samba.org) 2012 + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" +#include "fscache.h" +#include "smb2proto.h" + +int +smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, + __u32 *oplock, FILE_ALL_INFO *buf) +{ + int rc; + __le16 *smb2_path; + struct smb2_file_all_info *smb2_data = NULL; + __u8 smb2_oplock[17]; + struct cifs_fid *fid = oparms->fid; + + smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); + if (smb2_path == NULL) { + rc = -ENOMEM; + goto out; + } + + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, + GFP_KERNEL); + if (smb2_data == NULL) { + rc = -ENOMEM; + goto out; + } + + oparms->desired_access |= FILE_READ_ATTRIBUTES; + *smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; + + if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) + memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE); + + rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL); + if (rc) + goto out; + + if (buf) { + /* open response does not have IndexNumber field - get it */ + rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid, + fid->volatile_fid, + &smb2_data->IndexNumber); + if (rc) { + /* let get_inode_info disable server inode numbers */ + smb2_data->IndexNumber = 0; + rc = 0; + } + move_smb2_info_to_cifs(buf, smb2_data); + } + + *oplock = *smb2_oplock; +out: + kfree(smb2_data); + kfree(smb2_path); + return rc; +} + +int +smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, + const unsigned int xid) +{ + int rc = 0, stored_rc; + unsigned int max_num, num = 0, max_buf; + struct smb2_lock_element *buf, *cur; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + struct cifsLockInfo *li, *tmp; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct list_head tmp_llist; + + INIT_LIST_HEAD(&tmp_llist); + + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. + */ + max_buf = tcon->ses->server->maxBuf; + if (!max_buf) + return -EINVAL; + + max_num = max_buf / sizeof(struct smb2_lock_element); + buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + cur = buf; + + down_write(&cinode->lock_sem); + list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + if (cinode->can_cache_brlcks) { + /* + * We can cache brlock requests - simply remove a lock + * from the file's list. + */ + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); + continue; + } + cur->Length = cpu_to_le64(li->length); + cur->Offset = cpu_to_le64(li->offset); + cur->Flags = cpu_to_le32(SMB2_LOCKFLAG_UNLOCK); + /* + * We need to save a lock here to let us add it again to the + * file's list if the unlock range request fails on the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = smb2_lockv(xid, tcon, + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, num, buf); + if (stored_rc) { + /* + * We failed on the unlock range request - add + * all locks from the tmp list to the head of + * the file's list. + */ + cifs_move_llist(&tmp_llist, + &cfile->llist->locks); + rc = stored_rc; + } else + /* + * The unlock range request succeed - free the + * tmp list. + */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; + } + if (num) { + stored_rc = smb2_lockv(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, current->tgid, + num, buf); + if (stored_rc) { + cifs_move_llist(&tmp_llist, &cfile->llist->locks); + rc = stored_rc; + } else + cifs_free_llist(&tmp_llist); + } + up_write(&cinode->lock_sem); + + kfree(buf); + return rc; +} + +static int +smb2_push_mand_fdlocks(struct cifs_fid_locks *fdlocks, const unsigned int xid, + struct smb2_lock_element *buf, unsigned int max_num) +{ + int rc = 0, stored_rc; + struct cifsFileInfo *cfile = fdlocks->cfile; + struct cifsLockInfo *li; + unsigned int num = 0; + struct smb2_lock_element *cur = buf; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + list_for_each_entry(li, &fdlocks->locks, llist) { + cur->Length = cpu_to_le64(li->length); + cur->Offset = cpu_to_le64(li->offset); + cur->Flags = cpu_to_le32(li->type | + SMB2_LOCKFLAG_FAIL_IMMEDIATELY); + if (++num == max_num) { + stored_rc = smb2_lockv(xid, tcon, + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, num, buf); + if (stored_rc) + rc = stored_rc; + cur = buf; + num = 0; + } else + cur++; + } + if (num) { + stored_rc = smb2_lockv(xid, tcon, + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, num, buf); + if (stored_rc) + rc = stored_rc; + } + + return rc; +} + +int +smb2_push_mandatory_locks(struct cifsFileInfo *cfile) +{ + int rc = 0, stored_rc; + unsigned int xid; + unsigned int max_num, max_buf; + struct smb2_lock_element *buf; + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + struct cifs_fid_locks *fdlocks; + + xid = get_xid(); + + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. + */ + max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; + if (!max_buf) { + free_xid(xid); + return -EINVAL; + } + + max_num = max_buf / sizeof(struct smb2_lock_element); + buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); + if (!buf) { + free_xid(xid); + return -ENOMEM; + } + + list_for_each_entry(fdlocks, &cinode->llist, llist) { + stored_rc = smb2_push_mand_fdlocks(fdlocks, xid, buf, max_num); + if (stored_rc) + rc = stored_rc; + } + + kfree(buf); + free_xid(xid); + return rc; +} diff --git a/kernel/fs/cifs/smb2glob.h b/kernel/fs/cifs/smb2glob.h new file mode 100644 index 000000000..bc0bb9c34 --- /dev/null +++ b/kernel/fs/cifs/smb2glob.h @@ -0,0 +1,63 @@ +/* + * fs/cifs/smb2glob.h + * + * Definitions for various global variables and structures + * + * Copyright (C) International Business Machines Corp., 2002, 2011 + * Etersoft, 2012 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * Pavel Shilovsky (pshilovsky@samba.org) 2012 + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ +#ifndef _SMB2_GLOB_H +#define _SMB2_GLOB_H + +#define SMB2_MAGIC_NUMBER 0xFE534D42 + +/* + ***************************************************************** + * Constants go here + ***************************************************************** + */ + +/* + * Identifiers for functions that use the open, operation, close pattern + * in smb2inode.c:smb2_open_op_close() + */ +#define SMB2_OP_SET_DELETE 1 +#define SMB2_OP_SET_INFO 2 +#define SMB2_OP_QUERY_INFO 3 +#define SMB2_OP_QUERY_DIR 4 +#define SMB2_OP_MKDIR 5 +#define SMB2_OP_RENAME 6 +#define SMB2_OP_DELETE 7 +#define SMB2_OP_HARDLINK 8 +#define SMB2_OP_SET_EOF 9 + +/* Used when constructing chained read requests. */ +#define CHAINED_REQUEST 1 +#define START_OF_CHAIN 2 +#define END_OF_CHAIN 4 +#define RELATED_REQUEST 8 + +#define SMB2_SIGNATURE_SIZE (16) +#define SMB2_NTLMV2_SESSKEY_SIZE (16) +#define SMB2_HMACSHA256_SIZE (32) +#define SMB2_CMACAES_SIZE (16) +#define SMB3_SIGNKEY_SIZE (16) + +/* Maximum buffer size value we can send with 1 credit */ +#define SMB2_MAX_BUFFER_SIZE 65536 + +#endif /* _SMB2_GLOB_H */ diff --git a/kernel/fs/cifs/smb2inode.c b/kernel/fs/cifs/smb2inode.c new file mode 100644 index 000000000..899bbc86f --- /dev/null +++ b/kernel/fs/cifs/smb2inode.c @@ -0,0 +1,273 @@ +/* + * fs/cifs/smb2inode.c + * + * Copyright (C) International Business Machines Corp., 2002, 2011 + * Etersoft, 2012 + * Author(s): Pavel Shilovsky (pshilovsky@samba.org), + * Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" +#include "fscache.h" +#include "smb2glob.h" +#include "smb2pdu.h" +#include "smb2proto.h" + +static int +smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 desired_access, __u32 create_disposition, + __u32 create_options, void *data, int command) +{ + int rc, tmprc = 0; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = desired_access; + oparms.disposition = create_disposition; + oparms.create_options = create_options; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + if (rc) { + kfree(utf16_path); + return rc; + } + + switch (command) { + case SMB2_OP_DELETE: + break; + case SMB2_OP_QUERY_INFO: + tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, + (struct smb2_file_all_info *)data); + break; + case SMB2_OP_MKDIR: + /* + * Directories are created through parameters in the + * SMB2_open() call. + */ + break; + case SMB2_OP_RENAME: + tmprc = SMB2_rename(xid, tcon, fid.persistent_fid, + fid.volatile_fid, (__le16 *)data); + break; + case SMB2_OP_HARDLINK: + tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid, + fid.volatile_fid, (__le16 *)data); + break; + case SMB2_OP_SET_EOF: + tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid, + fid.volatile_fid, current->tgid, + (__le64 *)data, false); + break; + case SMB2_OP_SET_INFO: + tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, + (FILE_BASIC_INFO *)data); + break; + default: + cifs_dbg(VFS, "Invalid command\n"); + break; + } + + rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + if (tmprc) + rc = tmprc; + kfree(utf16_path); + return rc; +} + +void +move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) +{ + memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src); + dst->CurrentByteOffset = src->CurrentByteOffset; + dst->Mode = src->Mode; + dst->AlignmentRequirement = src->AlignmentRequirement; + dst->IndexNumber1 = 0; /* we don't use it */ +} + +int +smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink) +{ + int rc; + struct smb2_file_all_info *smb2_data; + + *adjust_tz = false; + *symlink = false; + + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, + GFP_KERNEL); + if (smb2_data == NULL) + return -ENOMEM; + + rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, 0, + smb2_data, SMB2_OP_QUERY_INFO); + if (rc == -EOPNOTSUPP) { + *symlink = true; + /* Failed on a symbolic link - query a reparse point info */ + rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + OPEN_REPARSE_POINT, smb2_data, + SMB2_OP_QUERY_INFO); + } + if (rc) + goto out; + + move_smb2_info_to_cifs(data, smb2_data); +out: + kfree(smb2_data); + return rc; +} + +int +smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_open_op_close(xid, tcon, cifs_sb, name, + FILE_WRITE_ATTRIBUTES, FILE_CREATE, + CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR); +} + +void +smb2_mkdir_setinfo(struct inode *inode, const char *name, + struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, + const unsigned int xid) +{ + FILE_BASIC_INFO data; + struct cifsInodeInfo *cifs_i; + u32 dosattrs; + int tmprc; + + memset(&data, 0, sizeof(data)); + cifs_i = CIFS_I(inode); + dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; + data.Attributes = cpu_to_le32(dosattrs); + tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name, + FILE_WRITE_ATTRIBUTES, FILE_CREATE, + CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO); + if (tmprc == 0) + cifs_i->cifsAttrs = dosattrs; +} + +int +smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, + CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE, + NULL, SMB2_OP_DELETE); +} + +int +smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, + CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, + NULL, SMB2_OP_DELETE); +} + +static int +smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb, __u32 access, int command) +{ + __le16 *smb2_to_name = NULL; + int rc; + + smb2_to_name = cifs_convert_path_to_utf16(to_name, cifs_sb); + if (smb2_to_name == NULL) { + rc = -ENOMEM; + goto smb2_rename_path; + } + + rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access, + FILE_OPEN, 0, smb2_to_name, command); +smb2_rename_path: + kfree(smb2_to_name); + return rc; +} + +int +smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, + DELETE, SMB2_OP_RENAME); +} + +int +smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, + FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK); +} + +int +smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, __u64 size, + struct cifs_sb_info *cifs_sb, bool set_alloc) +{ + __le64 eof = cpu_to_le64(size); + return smb2_open_op_close(xid, tcon, cifs_sb, full_path, + FILE_WRITE_DATA, FILE_OPEN, 0, &eof, + SMB2_OP_SET_EOF); +} + +int +smb2_set_file_info(struct inode *inode, const char *full_path, + FILE_BASIC_INFO *buf, const unsigned int xid) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + int rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, + FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, + SMB2_OP_SET_INFO); + cifs_put_tlink(tlink); + return rc; +} diff --git a/kernel/fs/cifs/smb2maperror.c b/kernel/fs/cifs/smb2maperror.c new file mode 100644 index 000000000..8257a5a97 --- /dev/null +++ b/kernel/fs/cifs/smb2maperror.c @@ -0,0 +1,2481 @@ +/* + * fs/smb2/smb2maperror.c + * + * Functions which do error mapping of SMB2 status codes to POSIX errors + * + * Copyright (C) International Business Machines Corp., 2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include "cifsglob.h" +#include "cifs_debug.h" +#include "smb2pdu.h" +#include "smb2proto.h" +#include "smb2status.h" + +struct status_to_posix_error { + __le32 smb2_status; + int posix_error; + char *status_string; +}; + +static const struct status_to_posix_error smb2_error_map_table[] = { + {STATUS_SUCCESS, 0, "STATUS_SUCCESS"}, + {STATUS_WAIT_0, 0, "STATUS_WAIT_0"}, + {STATUS_WAIT_1, -EIO, "STATUS_WAIT_1"}, + {STATUS_WAIT_2, -EIO, "STATUS_WAIT_2"}, + {STATUS_WAIT_3, -EIO, "STATUS_WAIT_3"}, + {STATUS_WAIT_63, -EIO, "STATUS_WAIT_63"}, + {STATUS_ABANDONED, -EIO, "STATUS_ABANDONED"}, + {STATUS_ABANDONED_WAIT_0, -EIO, "STATUS_ABANDONED_WAIT_0"}, + {STATUS_ABANDONED_WAIT_63, -EIO, "STATUS_ABANDONED_WAIT_63"}, + {STATUS_USER_APC, -EIO, "STATUS_USER_APC"}, + {STATUS_KERNEL_APC, -EIO, "STATUS_KERNEL_APC"}, + {STATUS_ALERTED, -EIO, "STATUS_ALERTED"}, + {STATUS_TIMEOUT, -ETIMEDOUT, "STATUS_TIMEOUT"}, + {STATUS_PENDING, -EIO, "STATUS_PENDING"}, + {STATUS_REPARSE, -EIO, "STATUS_REPARSE"}, + {STATUS_MORE_ENTRIES, -EIO, "STATUS_MORE_ENTRIES"}, + {STATUS_NOT_ALL_ASSIGNED, -EIO, "STATUS_NOT_ALL_ASSIGNED"}, + {STATUS_SOME_NOT_MAPPED, -EIO, "STATUS_SOME_NOT_MAPPED"}, + {STATUS_OPLOCK_BREAK_IN_PROGRESS, -EIO, + "STATUS_OPLOCK_BREAK_IN_PROGRESS"}, + {STATUS_VOLUME_MOUNTED, -EIO, "STATUS_VOLUME_MOUNTED"}, + {STATUS_RXACT_COMMITTED, -EIO, "STATUS_RXACT_COMMITTED"}, + {STATUS_NOTIFY_CLEANUP, -EIO, "STATUS_NOTIFY_CLEANUP"}, + {STATUS_NOTIFY_ENUM_DIR, -EIO, "STATUS_NOTIFY_ENUM_DIR"}, + {STATUS_NO_QUOTAS_FOR_ACCOUNT, -EIO, "STATUS_NO_QUOTAS_FOR_ACCOUNT"}, + {STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED, -EIO, + "STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED"}, + {STATUS_PAGE_FAULT_TRANSITION, -EIO, "STATUS_PAGE_FAULT_TRANSITION"}, + {STATUS_PAGE_FAULT_DEMAND_ZERO, -EIO, "STATUS_PAGE_FAULT_DEMAND_ZERO"}, + {STATUS_PAGE_FAULT_COPY_ON_WRITE, -EIO, + "STATUS_PAGE_FAULT_COPY_ON_WRITE"}, + {STATUS_PAGE_FAULT_GUARD_PAGE, -EIO, "STATUS_PAGE_FAULT_GUARD_PAGE"}, + {STATUS_PAGE_FAULT_PAGING_FILE, -EIO, "STATUS_PAGE_FAULT_PAGING_FILE"}, + {STATUS_CACHE_PAGE_LOCKED, -EIO, "STATUS_CACHE_PAGE_LOCKED"}, + {STATUS_CRASH_DUMP, -EIO, "STATUS_CRASH_DUMP"}, + {STATUS_BUFFER_ALL_ZEROS, -EIO, "STATUS_BUFFER_ALL_ZEROS"}, + {STATUS_REPARSE_OBJECT, -EIO, "STATUS_REPARSE_OBJECT"}, + {STATUS_RESOURCE_REQUIREMENTS_CHANGED, -EIO, + "STATUS_RESOURCE_REQUIREMENTS_CHANGED"}, + {STATUS_TRANSLATION_COMPLETE, -EIO, "STATUS_TRANSLATION_COMPLETE"}, + {STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY, -EIO, + "STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY"}, + {STATUS_NOTHING_TO_TERMINATE, -EIO, "STATUS_NOTHING_TO_TERMINATE"}, + {STATUS_PROCESS_NOT_IN_JOB, -EIO, "STATUS_PROCESS_NOT_IN_JOB"}, + {STATUS_PROCESS_IN_JOB, -EIO, "STATUS_PROCESS_IN_JOB"}, + {STATUS_VOLSNAP_HIBERNATE_READY, -EIO, + "STATUS_VOLSNAP_HIBERNATE_READY"}, + {STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY, -EIO, + "STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY"}, + {STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED, -EIO, + "STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED"}, + {STATUS_INTERRUPT_STILL_CONNECTED, -EIO, + "STATUS_INTERRUPT_STILL_CONNECTED"}, + {STATUS_PROCESS_CLONED, -EIO, "STATUS_PROCESS_CLONED"}, + {STATUS_FILE_LOCKED_WITH_ONLY_READERS, -EIO, + "STATUS_FILE_LOCKED_WITH_ONLY_READERS"}, + {STATUS_FILE_LOCKED_WITH_WRITERS, -EIO, + "STATUS_FILE_LOCKED_WITH_WRITERS"}, + {STATUS_RESOURCEMANAGER_READ_ONLY, -EROFS, + "STATUS_RESOURCEMANAGER_READ_ONLY"}, + {STATUS_WAIT_FOR_OPLOCK, -EIO, "STATUS_WAIT_FOR_OPLOCK"}, + {DBG_EXCEPTION_HANDLED, -EIO, "DBG_EXCEPTION_HANDLED"}, + {DBG_CONTINUE, -EIO, "DBG_CONTINUE"}, + {STATUS_FLT_IO_COMPLETE, -EIO, "STATUS_FLT_IO_COMPLETE"}, + {STATUS_OBJECT_NAME_EXISTS, -EIO, "STATUS_OBJECT_NAME_EXISTS"}, + {STATUS_THREAD_WAS_SUSPENDED, -EIO, "STATUS_THREAD_WAS_SUSPENDED"}, + {STATUS_WORKING_SET_LIMIT_RANGE, -EIO, + "STATUS_WORKING_SET_LIMIT_RANGE"}, + {STATUS_IMAGE_NOT_AT_BASE, -EIO, "STATUS_IMAGE_NOT_AT_BASE"}, + {STATUS_RXACT_STATE_CREATED, -EIO, "STATUS_RXACT_STATE_CREATED"}, + {STATUS_SEGMENT_NOTIFICATION, -EIO, "STATUS_SEGMENT_NOTIFICATION"}, + {STATUS_LOCAL_USER_SESSION_KEY, -EIO, "STATUS_LOCAL_USER_SESSION_KEY"}, + {STATUS_BAD_CURRENT_DIRECTORY, -EIO, "STATUS_BAD_CURRENT_DIRECTORY"}, + {STATUS_SERIAL_MORE_WRITES, -EIO, "STATUS_SERIAL_MORE_WRITES"}, + {STATUS_REGISTRY_RECOVERED, -EIO, "STATUS_REGISTRY_RECOVERED"}, + {STATUS_FT_READ_RECOVERY_FROM_BACKUP, -EIO, + "STATUS_FT_READ_RECOVERY_FROM_BACKUP"}, + {STATUS_FT_WRITE_RECOVERY, -EIO, "STATUS_FT_WRITE_RECOVERY"}, + {STATUS_SERIAL_COUNTER_TIMEOUT, -ETIMEDOUT, + "STATUS_SERIAL_COUNTER_TIMEOUT"}, + {STATUS_NULL_LM_PASSWORD, -EIO, "STATUS_NULL_LM_PASSWORD"}, + {STATUS_IMAGE_MACHINE_TYPE_MISMATCH, -EIO, + "STATUS_IMAGE_MACHINE_TYPE_MISMATCH"}, + {STATUS_RECEIVE_PARTIAL, -EIO, "STATUS_RECEIVE_PARTIAL"}, + {STATUS_RECEIVE_EXPEDITED, -EIO, "STATUS_RECEIVE_EXPEDITED"}, + {STATUS_RECEIVE_PARTIAL_EXPEDITED, -EIO, + "STATUS_RECEIVE_PARTIAL_EXPEDITED"}, + {STATUS_EVENT_DONE, -EIO, "STATUS_EVENT_DONE"}, + {STATUS_EVENT_PENDING, -EIO, "STATUS_EVENT_PENDING"}, + {STATUS_CHECKING_FILE_SYSTEM, -EIO, "STATUS_CHECKING_FILE_SYSTEM"}, + {STATUS_FATAL_APP_EXIT, -EIO, "STATUS_FATAL_APP_EXIT"}, + {STATUS_PREDEFINED_HANDLE, -EIO, "STATUS_PREDEFINED_HANDLE"}, + {STATUS_WAS_UNLOCKED, -EIO, "STATUS_WAS_UNLOCKED"}, + {STATUS_SERVICE_NOTIFICATION, -EIO, "STATUS_SERVICE_NOTIFICATION"}, + {STATUS_WAS_LOCKED, -EIO, "STATUS_WAS_LOCKED"}, + {STATUS_LOG_HARD_ERROR, -EIO, "STATUS_LOG_HARD_ERROR"}, + {STATUS_ALREADY_WIN32, -EIO, "STATUS_ALREADY_WIN32"}, + {STATUS_WX86_UNSIMULATE, -EIO, "STATUS_WX86_UNSIMULATE"}, + {STATUS_WX86_CONTINUE, -EIO, "STATUS_WX86_CONTINUE"}, + {STATUS_WX86_SINGLE_STEP, -EIO, "STATUS_WX86_SINGLE_STEP"}, + {STATUS_WX86_BREAKPOINT, -EIO, "STATUS_WX86_BREAKPOINT"}, + {STATUS_WX86_EXCEPTION_CONTINUE, -EIO, + "STATUS_WX86_EXCEPTION_CONTINUE"}, + {STATUS_WX86_EXCEPTION_LASTCHANCE, -EIO, + "STATUS_WX86_EXCEPTION_LASTCHANCE"}, + {STATUS_WX86_EXCEPTION_CHAIN, -EIO, "STATUS_WX86_EXCEPTION_CHAIN"}, + {STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE, -EIO, + "STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE"}, + {STATUS_NO_YIELD_PERFORMED, -EIO, "STATUS_NO_YIELD_PERFORMED"}, + {STATUS_TIMER_RESUME_IGNORED, -EIO, "STATUS_TIMER_RESUME_IGNORED"}, + {STATUS_ARBITRATION_UNHANDLED, -EIO, "STATUS_ARBITRATION_UNHANDLED"}, + {STATUS_CARDBUS_NOT_SUPPORTED, -ENOSYS, "STATUS_CARDBUS_NOT_SUPPORTED"}, + {STATUS_WX86_CREATEWX86TIB, -EIO, "STATUS_WX86_CREATEWX86TIB"}, + {STATUS_MP_PROCESSOR_MISMATCH, -EIO, "STATUS_MP_PROCESSOR_MISMATCH"}, + {STATUS_HIBERNATED, -EIO, "STATUS_HIBERNATED"}, + {STATUS_RESUME_HIBERNATION, -EIO, "STATUS_RESUME_HIBERNATION"}, + {STATUS_FIRMWARE_UPDATED, -EIO, "STATUS_FIRMWARE_UPDATED"}, + {STATUS_DRIVERS_LEAKING_LOCKED_PAGES, -EIO, + "STATUS_DRIVERS_LEAKING_LOCKED_PAGES"}, + {STATUS_MESSAGE_RETRIEVED, -EIO, "STATUS_MESSAGE_RETRIEVED"}, + {STATUS_SYSTEM_POWERSTATE_TRANSITION, -EIO, + "STATUS_SYSTEM_POWERSTATE_TRANSITION"}, + {STATUS_ALPC_CHECK_COMPLETION_LIST, -EIO, + "STATUS_ALPC_CHECK_COMPLETION_LIST"}, + {STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION, -EIO, + "STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION"}, + {STATUS_ACCESS_AUDIT_BY_POLICY, -EIO, "STATUS_ACCESS_AUDIT_BY_POLICY"}, + {STATUS_ABANDON_HIBERFILE, -EIO, "STATUS_ABANDON_HIBERFILE"}, + {STATUS_BIZRULES_NOT_ENABLED, -EIO, "STATUS_BIZRULES_NOT_ENABLED"}, + {STATUS_WAKE_SYSTEM, -EIO, "STATUS_WAKE_SYSTEM"}, + {STATUS_DS_SHUTTING_DOWN, -EIO, "STATUS_DS_SHUTTING_DOWN"}, + {DBG_REPLY_LATER, -EIO, "DBG_REPLY_LATER"}, + {DBG_UNABLE_TO_PROVIDE_HANDLE, -EIO, "DBG_UNABLE_TO_PROVIDE_HANDLE"}, + {DBG_TERMINATE_THREAD, -EIO, "DBG_TERMINATE_THREAD"}, + {DBG_TERMINATE_PROCESS, -EIO, "DBG_TERMINATE_PROCESS"}, + {DBG_CONTROL_C, -EIO, "DBG_CONTROL_C"}, + {DBG_PRINTEXCEPTION_C, -EIO, "DBG_PRINTEXCEPTION_C"}, + {DBG_RIPEXCEPTION, -EIO, "DBG_RIPEXCEPTION"}, + {DBG_CONTROL_BREAK, -EIO, "DBG_CONTROL_BREAK"}, + {DBG_COMMAND_EXCEPTION, -EIO, "DBG_COMMAND_EXCEPTION"}, + {RPC_NT_UUID_LOCAL_ONLY, -EIO, "RPC_NT_UUID_LOCAL_ONLY"}, + {RPC_NT_SEND_INCOMPLETE, -EIO, "RPC_NT_SEND_INCOMPLETE"}, + {STATUS_CTX_CDM_CONNECT, -EIO, "STATUS_CTX_CDM_CONNECT"}, + {STATUS_CTX_CDM_DISCONNECT, -EIO, "STATUS_CTX_CDM_DISCONNECT"}, + {STATUS_SXS_RELEASE_ACTIVATION_CONTEXT, -EIO, + "STATUS_SXS_RELEASE_ACTIVATION_CONTEXT"}, + {STATUS_RECOVERY_NOT_NEEDED, -EIO, "STATUS_RECOVERY_NOT_NEEDED"}, + {STATUS_RM_ALREADY_STARTED, -EIO, "STATUS_RM_ALREADY_STARTED"}, + {STATUS_LOG_NO_RESTART, -EIO, "STATUS_LOG_NO_RESTART"}, + {STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST, -EIO, + "STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST"}, + {STATUS_GRAPHICS_PARTIAL_DATA_POPULATED, -EIO, + "STATUS_GRAPHICS_PARTIAL_DATA_POPULATED"}, + {STATUS_GRAPHICS_DRIVER_MISMATCH, -EIO, + "STATUS_GRAPHICS_DRIVER_MISMATCH"}, + {STATUS_GRAPHICS_MODE_NOT_PINNED, -EIO, + "STATUS_GRAPHICS_MODE_NOT_PINNED"}, + {STATUS_GRAPHICS_NO_PREFERRED_MODE, -EIO, + "STATUS_GRAPHICS_NO_PREFERRED_MODE"}, + {STATUS_GRAPHICS_DATASET_IS_EMPTY, -EIO, + "STATUS_GRAPHICS_DATASET_IS_EMPTY"}, + {STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET, -EIO, + "STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET"}, + {STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED, -EIO, + "STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED"}, + {STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS, -EIO, + "STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS"}, + {STATUS_GRAPHICS_LEADLINK_START_DEFERRED, -EIO, + "STATUS_GRAPHICS_LEADLINK_START_DEFERRED"}, + {STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY, -EIO, + "STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY"}, + {STATUS_GRAPHICS_START_DEFERRED, -EIO, + "STATUS_GRAPHICS_START_DEFERRED"}, + {STATUS_NDIS_INDICATION_REQUIRED, -EIO, + "STATUS_NDIS_INDICATION_REQUIRED"}, + {STATUS_GUARD_PAGE_VIOLATION, -EIO, "STATUS_GUARD_PAGE_VIOLATION"}, + {STATUS_DATATYPE_MISALIGNMENT, -EIO, "STATUS_DATATYPE_MISALIGNMENT"}, + {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, + {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, + {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, + {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"}, + {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, + {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, + {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, + {STATUS_GUID_SUBSTITUTION_MADE, -EIO, "STATUS_GUID_SUBSTITUTION_MADE"}, + {STATUS_PARTIAL_COPY, -EIO, "STATUS_PARTIAL_COPY"}, + {STATUS_DEVICE_PAPER_EMPTY, -EIO, "STATUS_DEVICE_PAPER_EMPTY"}, + {STATUS_DEVICE_POWERED_OFF, -EIO, "STATUS_DEVICE_POWERED_OFF"}, + {STATUS_DEVICE_OFF_LINE, -EIO, "STATUS_DEVICE_OFF_LINE"}, + {STATUS_DEVICE_BUSY, -EBUSY, "STATUS_DEVICE_BUSY"}, + {STATUS_NO_MORE_EAS, -EIO, "STATUS_NO_MORE_EAS"}, + {STATUS_INVALID_EA_NAME, -EINVAL, "STATUS_INVALID_EA_NAME"}, + {STATUS_EA_LIST_INCONSISTENT, -EIO, "STATUS_EA_LIST_INCONSISTENT"}, + {STATUS_INVALID_EA_FLAG, -EINVAL, "STATUS_INVALID_EA_FLAG"}, + {STATUS_VERIFY_REQUIRED, -EIO, "STATUS_VERIFY_REQUIRED"}, + {STATUS_EXTRANEOUS_INFORMATION, -EIO, "STATUS_EXTRANEOUS_INFORMATION"}, + {STATUS_RXACT_COMMIT_NECESSARY, -EIO, "STATUS_RXACT_COMMIT_NECESSARY"}, + {STATUS_NO_MORE_ENTRIES, -EIO, "STATUS_NO_MORE_ENTRIES"}, + {STATUS_FILEMARK_DETECTED, -EIO, "STATUS_FILEMARK_DETECTED"}, + {STATUS_MEDIA_CHANGED, -EIO, "STATUS_MEDIA_CHANGED"}, + {STATUS_BUS_RESET, -EIO, "STATUS_BUS_RESET"}, + {STATUS_END_OF_MEDIA, -EIO, "STATUS_END_OF_MEDIA"}, + {STATUS_BEGINNING_OF_MEDIA, -EIO, "STATUS_BEGINNING_OF_MEDIA"}, + {STATUS_MEDIA_CHECK, -EIO, "STATUS_MEDIA_CHECK"}, + {STATUS_SETMARK_DETECTED, -EIO, "STATUS_SETMARK_DETECTED"}, + {STATUS_NO_DATA_DETECTED, -EIO, "STATUS_NO_DATA_DETECTED"}, + {STATUS_REDIRECTOR_HAS_OPEN_HANDLES, -EIO, + "STATUS_REDIRECTOR_HAS_OPEN_HANDLES"}, + {STATUS_SERVER_HAS_OPEN_HANDLES, -EIO, + "STATUS_SERVER_HAS_OPEN_HANDLES"}, + {STATUS_ALREADY_DISCONNECTED, -EIO, "STATUS_ALREADY_DISCONNECTED"}, + {STATUS_LONGJUMP, -EIO, "STATUS_LONGJUMP"}, + {STATUS_CLEANER_CARTRIDGE_INSTALLED, -EIO, + "STATUS_CLEANER_CARTRIDGE_INSTALLED"}, + {STATUS_PLUGPLAY_QUERY_VETOED, -EIO, "STATUS_PLUGPLAY_QUERY_VETOED"}, + {STATUS_UNWIND_CONSOLIDATE, -EIO, "STATUS_UNWIND_CONSOLIDATE"}, + {STATUS_REGISTRY_HIVE_RECOVERED, -EIO, + "STATUS_REGISTRY_HIVE_RECOVERED"}, + {STATUS_DLL_MIGHT_BE_INSECURE, -EIO, "STATUS_DLL_MIGHT_BE_INSECURE"}, + {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO, + "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"}, + {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"}, + {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP, + "STATUS_REPARSE_NOT_HANDLED"}, + {STATUS_DEVICE_REQUIRES_CLEANING, -EIO, + "STATUS_DEVICE_REQUIRES_CLEANING"}, + {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"}, + {STATUS_DATA_LOST_REPAIR, -EIO, "STATUS_DATA_LOST_REPAIR"}, + {DBG_EXCEPTION_NOT_HANDLED, -EIO, "DBG_EXCEPTION_NOT_HANDLED"}, + {STATUS_CLUSTER_NODE_ALREADY_UP, -EIO, + "STATUS_CLUSTER_NODE_ALREADY_UP"}, + {STATUS_CLUSTER_NODE_ALREADY_DOWN, -EIO, + "STATUS_CLUSTER_NODE_ALREADY_DOWN"}, + {STATUS_CLUSTER_NETWORK_ALREADY_ONLINE, -EIO, + "STATUS_CLUSTER_NETWORK_ALREADY_ONLINE"}, + {STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE, -EIO, + "STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE"}, + {STATUS_CLUSTER_NODE_ALREADY_MEMBER, -EIO, + "STATUS_CLUSTER_NODE_ALREADY_MEMBER"}, + {STATUS_COULD_NOT_RESIZE_LOG, -EIO, "STATUS_COULD_NOT_RESIZE_LOG"}, + {STATUS_NO_TXF_METADATA, -EIO, "STATUS_NO_TXF_METADATA"}, + {STATUS_CANT_RECOVER_WITH_HANDLE_OPEN, -EIO, + "STATUS_CANT_RECOVER_WITH_HANDLE_OPEN"}, + {STATUS_TXF_METADATA_ALREADY_PRESENT, -EIO, + "STATUS_TXF_METADATA_ALREADY_PRESENT"}, + {STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET, -EIO, + "STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET"}, + {STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED, -EIO, + "STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED"}, + {STATUS_FLT_BUFFER_TOO_SMALL, -ENOBUFS, "STATUS_FLT_BUFFER_TOO_SMALL"}, + {STATUS_FVE_PARTIAL_METADATA, -EIO, "STATUS_FVE_PARTIAL_METADATA"}, + {STATUS_UNSUCCESSFUL, -EIO, "STATUS_UNSUCCESSFUL"}, + {STATUS_NOT_IMPLEMENTED, -ENOSYS, "STATUS_NOT_IMPLEMENTED"}, + {STATUS_INVALID_INFO_CLASS, -EIO, "STATUS_INVALID_INFO_CLASS"}, + {STATUS_INFO_LENGTH_MISMATCH, -EIO, "STATUS_INFO_LENGTH_MISMATCH"}, + {STATUS_ACCESS_VIOLATION, -EACCES, "STATUS_ACCESS_VIOLATION"}, + {STATUS_IN_PAGE_ERROR, -EFAULT, "STATUS_IN_PAGE_ERROR"}, + {STATUS_PAGEFILE_QUOTA, -EDQUOT, "STATUS_PAGEFILE_QUOTA"}, + {STATUS_INVALID_HANDLE, -EBADF, "STATUS_INVALID_HANDLE"}, + {STATUS_BAD_INITIAL_STACK, -EIO, "STATUS_BAD_INITIAL_STACK"}, + {STATUS_BAD_INITIAL_PC, -EIO, "STATUS_BAD_INITIAL_PC"}, + {STATUS_INVALID_CID, -EIO, "STATUS_INVALID_CID"}, + {STATUS_TIMER_NOT_CANCELED, -EIO, "STATUS_TIMER_NOT_CANCELED"}, + {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"}, + {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"}, + {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"}, + {STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"}, + {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"}, + {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"}, + {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"}, + {STATUS_UNRECOGNIZED_MEDIA, -EIO, "STATUS_UNRECOGNIZED_MEDIA"}, + {STATUS_NONEXISTENT_SECTOR, -EIO, "STATUS_NONEXISTENT_SECTOR"}, + {STATUS_MORE_PROCESSING_REQUIRED, -EIO, + "STATUS_MORE_PROCESSING_REQUIRED"}, + {STATUS_NO_MEMORY, -EREMOTEIO, "STATUS_NO_MEMORY"}, + {STATUS_CONFLICTING_ADDRESSES, -EADDRINUSE, + "STATUS_CONFLICTING_ADDRESSES"}, + {STATUS_NOT_MAPPED_VIEW, -EIO, "STATUS_NOT_MAPPED_VIEW"}, + {STATUS_UNABLE_TO_FREE_VM, -EIO, "STATUS_UNABLE_TO_FREE_VM"}, + {STATUS_UNABLE_TO_DELETE_SECTION, -EIO, + "STATUS_UNABLE_TO_DELETE_SECTION"}, + {STATUS_INVALID_SYSTEM_SERVICE, -EIO, "STATUS_INVALID_SYSTEM_SERVICE"}, + {STATUS_ILLEGAL_INSTRUCTION, -EIO, "STATUS_ILLEGAL_INSTRUCTION"}, + {STATUS_INVALID_LOCK_SEQUENCE, -EIO, "STATUS_INVALID_LOCK_SEQUENCE"}, + {STATUS_INVALID_VIEW_SIZE, -EIO, "STATUS_INVALID_VIEW_SIZE"}, + {STATUS_INVALID_FILE_FOR_SECTION, -EIO, + "STATUS_INVALID_FILE_FOR_SECTION"}, + {STATUS_ALREADY_COMMITTED, -EIO, "STATUS_ALREADY_COMMITTED"}, + {STATUS_ACCESS_DENIED, -EACCES, "STATUS_ACCESS_DENIED"}, + {STATUS_BUFFER_TOO_SMALL, -EIO, "STATUS_BUFFER_TOO_SMALL"}, + {STATUS_OBJECT_TYPE_MISMATCH, -EIO, "STATUS_OBJECT_TYPE_MISMATCH"}, + {STATUS_NONCONTINUABLE_EXCEPTION, -EIO, + "STATUS_NONCONTINUABLE_EXCEPTION"}, + {STATUS_INVALID_DISPOSITION, -EIO, "STATUS_INVALID_DISPOSITION"}, + {STATUS_UNWIND, -EIO, "STATUS_UNWIND"}, + {STATUS_BAD_STACK, -EIO, "STATUS_BAD_STACK"}, + {STATUS_INVALID_UNWIND_TARGET, -EIO, "STATUS_INVALID_UNWIND_TARGET"}, + {STATUS_NOT_LOCKED, -EIO, "STATUS_NOT_LOCKED"}, + {STATUS_PARITY_ERROR, -EIO, "STATUS_PARITY_ERROR"}, + {STATUS_UNABLE_TO_DECOMMIT_VM, -EIO, "STATUS_UNABLE_TO_DECOMMIT_VM"}, + {STATUS_NOT_COMMITTED, -EIO, "STATUS_NOT_COMMITTED"}, + {STATUS_INVALID_PORT_ATTRIBUTES, -EIO, + "STATUS_INVALID_PORT_ATTRIBUTES"}, + {STATUS_PORT_MESSAGE_TOO_LONG, -EIO, "STATUS_PORT_MESSAGE_TOO_LONG"}, + {STATUS_INVALID_PARAMETER_MIX, -EINVAL, "STATUS_INVALID_PARAMETER_MIX"}, + {STATUS_INVALID_QUOTA_LOWER, -EIO, "STATUS_INVALID_QUOTA_LOWER"}, + {STATUS_DISK_CORRUPT_ERROR, -EIO, "STATUS_DISK_CORRUPT_ERROR"}, + {STATUS_OBJECT_NAME_INVALID, -ENOENT, "STATUS_OBJECT_NAME_INVALID"}, + {STATUS_OBJECT_NAME_NOT_FOUND, -ENOENT, "STATUS_OBJECT_NAME_NOT_FOUND"}, + {STATUS_OBJECT_NAME_COLLISION, -EEXIST, "STATUS_OBJECT_NAME_COLLISION"}, + {STATUS_PORT_DISCONNECTED, -EIO, "STATUS_PORT_DISCONNECTED"}, + {STATUS_DEVICE_ALREADY_ATTACHED, -EIO, + "STATUS_DEVICE_ALREADY_ATTACHED"}, + {STATUS_OBJECT_PATH_INVALID, -ENOTDIR, "STATUS_OBJECT_PATH_INVALID"}, + {STATUS_OBJECT_PATH_NOT_FOUND, -ENOENT, "STATUS_OBJECT_PATH_NOT_FOUND"}, + {STATUS_OBJECT_PATH_SYNTAX_BAD, -EIO, "STATUS_OBJECT_PATH_SYNTAX_BAD"}, + {STATUS_DATA_OVERRUN, -EIO, "STATUS_DATA_OVERRUN"}, + {STATUS_DATA_LATE_ERROR, -EIO, "STATUS_DATA_LATE_ERROR"}, + {STATUS_DATA_ERROR, -EIO, "STATUS_DATA_ERROR"}, + {STATUS_CRC_ERROR, -EIO, "STATUS_CRC_ERROR"}, + {STATUS_SECTION_TOO_BIG, -EIO, "STATUS_SECTION_TOO_BIG"}, + {STATUS_PORT_CONNECTION_REFUSED, -ECONNREFUSED, + "STATUS_PORT_CONNECTION_REFUSED"}, + {STATUS_INVALID_PORT_HANDLE, -EIO, "STATUS_INVALID_PORT_HANDLE"}, + {STATUS_SHARING_VIOLATION, -EBUSY, "STATUS_SHARING_VIOLATION"}, + {STATUS_QUOTA_EXCEEDED, -EDQUOT, "STATUS_QUOTA_EXCEEDED"}, + {STATUS_INVALID_PAGE_PROTECTION, -EIO, + "STATUS_INVALID_PAGE_PROTECTION"}, + {STATUS_MUTANT_NOT_OWNED, -EIO, "STATUS_MUTANT_NOT_OWNED"}, + {STATUS_SEMAPHORE_LIMIT_EXCEEDED, -EIO, + "STATUS_SEMAPHORE_LIMIT_EXCEEDED"}, + {STATUS_PORT_ALREADY_SET, -EIO, "STATUS_PORT_ALREADY_SET"}, + {STATUS_SECTION_NOT_IMAGE, -EIO, "STATUS_SECTION_NOT_IMAGE"}, + {STATUS_SUSPEND_COUNT_EXCEEDED, -EIO, "STATUS_SUSPEND_COUNT_EXCEEDED"}, + {STATUS_THREAD_IS_TERMINATING, -EIO, "STATUS_THREAD_IS_TERMINATING"}, + {STATUS_BAD_WORKING_SET_LIMIT, -EIO, "STATUS_BAD_WORKING_SET_LIMIT"}, + {STATUS_INCOMPATIBLE_FILE_MAP, -EIO, "STATUS_INCOMPATIBLE_FILE_MAP"}, + {STATUS_SECTION_PROTECTION, -EIO, "STATUS_SECTION_PROTECTION"}, + {STATUS_EAS_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_EAS_NOT_SUPPORTED"}, + {STATUS_EA_TOO_LARGE, -EIO, "STATUS_EA_TOO_LARGE"}, + {STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"}, + {STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"}, + {STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"}, + {STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"}, + {STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"}, + {STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"}, + {STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS, + "STATUS_CTL_FILE_NOT_SUPPORTED"}, + {STATUS_UNKNOWN_REVISION, -EIO, "STATUS_UNKNOWN_REVISION"}, + {STATUS_REVISION_MISMATCH, -EIO, "STATUS_REVISION_MISMATCH"}, + {STATUS_INVALID_OWNER, -EIO, "STATUS_INVALID_OWNER"}, + {STATUS_INVALID_PRIMARY_GROUP, -EIO, "STATUS_INVALID_PRIMARY_GROUP"}, + {STATUS_NO_IMPERSONATION_TOKEN, -EIO, "STATUS_NO_IMPERSONATION_TOKEN"}, + {STATUS_CANT_DISABLE_MANDATORY, -EIO, "STATUS_CANT_DISABLE_MANDATORY"}, + {STATUS_NO_LOGON_SERVERS, -EIO, "STATUS_NO_LOGON_SERVERS"}, + {STATUS_NO_SUCH_LOGON_SESSION, -EIO, "STATUS_NO_SUCH_LOGON_SESSION"}, + {STATUS_NO_SUCH_PRIVILEGE, -EIO, "STATUS_NO_SUCH_PRIVILEGE"}, + {STATUS_PRIVILEGE_NOT_HELD, -EIO, "STATUS_PRIVILEGE_NOT_HELD"}, + {STATUS_INVALID_ACCOUNT_NAME, -EIO, "STATUS_INVALID_ACCOUNT_NAME"}, + {STATUS_USER_EXISTS, -EIO, "STATUS_USER_EXISTS"}, + {STATUS_NO_SUCH_USER, -EIO, "STATUS_NO_SUCH_USER"}, + {STATUS_GROUP_EXISTS, -EIO, "STATUS_GROUP_EXISTS"}, + {STATUS_NO_SUCH_GROUP, -EIO, "STATUS_NO_SUCH_GROUP"}, + {STATUS_MEMBER_IN_GROUP, -EIO, "STATUS_MEMBER_IN_GROUP"}, + {STATUS_MEMBER_NOT_IN_GROUP, -EIO, "STATUS_MEMBER_NOT_IN_GROUP"}, + {STATUS_LAST_ADMIN, -EIO, "STATUS_LAST_ADMIN"}, + {STATUS_WRONG_PASSWORD, -EACCES, "STATUS_WRONG_PASSWORD"}, + {STATUS_ILL_FORMED_PASSWORD, -EINVAL, "STATUS_ILL_FORMED_PASSWORD"}, + {STATUS_PASSWORD_RESTRICTION, -EACCES, "STATUS_PASSWORD_RESTRICTION"}, + {STATUS_LOGON_FAILURE, -EACCES, "STATUS_LOGON_FAILURE"}, + {STATUS_ACCOUNT_RESTRICTION, -EACCES, "STATUS_ACCOUNT_RESTRICTION"}, + {STATUS_INVALID_LOGON_HOURS, -EACCES, "STATUS_INVALID_LOGON_HOURS"}, + {STATUS_INVALID_WORKSTATION, -EACCES, "STATUS_INVALID_WORKSTATION"}, + {STATUS_PASSWORD_EXPIRED, -EKEYEXPIRED, "STATUS_PASSWORD_EXPIRED"}, + {STATUS_ACCOUNT_DISABLED, -EKEYREVOKED, "STATUS_ACCOUNT_DISABLED"}, + {STATUS_NONE_MAPPED, -EIO, "STATUS_NONE_MAPPED"}, + {STATUS_TOO_MANY_LUIDS_REQUESTED, -EIO, + "STATUS_TOO_MANY_LUIDS_REQUESTED"}, + {STATUS_LUIDS_EXHAUSTED, -EIO, "STATUS_LUIDS_EXHAUSTED"}, + {STATUS_INVALID_SUB_AUTHORITY, -EIO, "STATUS_INVALID_SUB_AUTHORITY"}, + {STATUS_INVALID_ACL, -EIO, "STATUS_INVALID_ACL"}, + {STATUS_INVALID_SID, -EIO, "STATUS_INVALID_SID"}, + {STATUS_INVALID_SECURITY_DESCR, -EIO, "STATUS_INVALID_SECURITY_DESCR"}, + {STATUS_PROCEDURE_NOT_FOUND, -EIO, "STATUS_PROCEDURE_NOT_FOUND"}, + {STATUS_INVALID_IMAGE_FORMAT, -EIO, "STATUS_INVALID_IMAGE_FORMAT"}, + {STATUS_NO_TOKEN, -EIO, "STATUS_NO_TOKEN"}, + {STATUS_BAD_INHERITANCE_ACL, -EIO, "STATUS_BAD_INHERITANCE_ACL"}, + {STATUS_RANGE_NOT_LOCKED, -EIO, "STATUS_RANGE_NOT_LOCKED"}, + {STATUS_DISK_FULL, -ENOSPC, "STATUS_DISK_FULL"}, + {STATUS_SERVER_DISABLED, -EIO, "STATUS_SERVER_DISABLED"}, + {STATUS_SERVER_NOT_DISABLED, -EIO, "STATUS_SERVER_NOT_DISABLED"}, + {STATUS_TOO_MANY_GUIDS_REQUESTED, -EIO, + "STATUS_TOO_MANY_GUIDS_REQUESTED"}, + {STATUS_GUIDS_EXHAUSTED, -EIO, "STATUS_GUIDS_EXHAUSTED"}, + {STATUS_INVALID_ID_AUTHORITY, -EIO, "STATUS_INVALID_ID_AUTHORITY"}, + {STATUS_AGENTS_EXHAUSTED, -EIO, "STATUS_AGENTS_EXHAUSTED"}, + {STATUS_INVALID_VOLUME_LABEL, -EIO, "STATUS_INVALID_VOLUME_LABEL"}, + {STATUS_SECTION_NOT_EXTENDED, -EIO, "STATUS_SECTION_NOT_EXTENDED"}, + {STATUS_NOT_MAPPED_DATA, -EIO, "STATUS_NOT_MAPPED_DATA"}, + {STATUS_RESOURCE_DATA_NOT_FOUND, -EIO, + "STATUS_RESOURCE_DATA_NOT_FOUND"}, + {STATUS_RESOURCE_TYPE_NOT_FOUND, -EIO, + "STATUS_RESOURCE_TYPE_NOT_FOUND"}, + {STATUS_RESOURCE_NAME_NOT_FOUND, -EIO, + "STATUS_RESOURCE_NAME_NOT_FOUND"}, + {STATUS_ARRAY_BOUNDS_EXCEEDED, -EIO, "STATUS_ARRAY_BOUNDS_EXCEEDED"}, + {STATUS_FLOAT_DENORMAL_OPERAND, -EIO, "STATUS_FLOAT_DENORMAL_OPERAND"}, + {STATUS_FLOAT_DIVIDE_BY_ZERO, -EIO, "STATUS_FLOAT_DIVIDE_BY_ZERO"}, + {STATUS_FLOAT_INEXACT_RESULT, -EIO, "STATUS_FLOAT_INEXACT_RESULT"}, + {STATUS_FLOAT_INVALID_OPERATION, -EIO, + "STATUS_FLOAT_INVALID_OPERATION"}, + {STATUS_FLOAT_OVERFLOW, -EIO, "STATUS_FLOAT_OVERFLOW"}, + {STATUS_FLOAT_STACK_CHECK, -EIO, "STATUS_FLOAT_STACK_CHECK"}, + {STATUS_FLOAT_UNDERFLOW, -EIO, "STATUS_FLOAT_UNDERFLOW"}, + {STATUS_INTEGER_DIVIDE_BY_ZERO, -EIO, "STATUS_INTEGER_DIVIDE_BY_ZERO"}, + {STATUS_INTEGER_OVERFLOW, -EIO, "STATUS_INTEGER_OVERFLOW"}, + {STATUS_PRIVILEGED_INSTRUCTION, -EIO, "STATUS_PRIVILEGED_INSTRUCTION"}, + {STATUS_TOO_MANY_PAGING_FILES, -EIO, "STATUS_TOO_MANY_PAGING_FILES"}, + {STATUS_FILE_INVALID, -EIO, "STATUS_FILE_INVALID"}, + {STATUS_ALLOTTED_SPACE_EXCEEDED, -EIO, + "STATUS_ALLOTTED_SPACE_EXCEEDED"}, + {STATUS_INSUFFICIENT_RESOURCES, -EREMOTEIO, + "STATUS_INSUFFICIENT_RESOURCES"}, + {STATUS_DFS_EXIT_PATH_FOUND, -EIO, "STATUS_DFS_EXIT_PATH_FOUND"}, + {STATUS_DEVICE_DATA_ERROR, -EIO, "STATUS_DEVICE_DATA_ERROR"}, + {STATUS_DEVICE_NOT_CONNECTED, -EIO, "STATUS_DEVICE_NOT_CONNECTED"}, + {STATUS_DEVICE_POWER_FAILURE, -EIO, "STATUS_DEVICE_POWER_FAILURE"}, + {STATUS_FREE_VM_NOT_AT_BASE, -EIO, "STATUS_FREE_VM_NOT_AT_BASE"}, + {STATUS_MEMORY_NOT_ALLOCATED, -EFAULT, "STATUS_MEMORY_NOT_ALLOCATED"}, + {STATUS_WORKING_SET_QUOTA, -EIO, "STATUS_WORKING_SET_QUOTA"}, + {STATUS_MEDIA_WRITE_PROTECTED, -EROFS, "STATUS_MEDIA_WRITE_PROTECTED"}, + {STATUS_DEVICE_NOT_READY, -EIO, "STATUS_DEVICE_NOT_READY"}, + {STATUS_INVALID_GROUP_ATTRIBUTES, -EIO, + "STATUS_INVALID_GROUP_ATTRIBUTES"}, + {STATUS_BAD_IMPERSONATION_LEVEL, -EIO, + "STATUS_BAD_IMPERSONATION_LEVEL"}, + {STATUS_CANT_OPEN_ANONYMOUS, -EIO, "STATUS_CANT_OPEN_ANONYMOUS"}, + {STATUS_BAD_VALIDATION_CLASS, -EIO, "STATUS_BAD_VALIDATION_CLASS"}, + {STATUS_BAD_TOKEN_TYPE, -EIO, "STATUS_BAD_TOKEN_TYPE"}, + {STATUS_BAD_MASTER_BOOT_RECORD, -EIO, "STATUS_BAD_MASTER_BOOT_RECORD"}, + {STATUS_INSTRUCTION_MISALIGNMENT, -EIO, + "STATUS_INSTRUCTION_MISALIGNMENT"}, + {STATUS_INSTANCE_NOT_AVAILABLE, -EIO, "STATUS_INSTANCE_NOT_AVAILABLE"}, + {STATUS_PIPE_NOT_AVAILABLE, -EIO, "STATUS_PIPE_NOT_AVAILABLE"}, + {STATUS_INVALID_PIPE_STATE, -EIO, "STATUS_INVALID_PIPE_STATE"}, + {STATUS_PIPE_BUSY, -EBUSY, "STATUS_PIPE_BUSY"}, + {STATUS_ILLEGAL_FUNCTION, -EIO, "STATUS_ILLEGAL_FUNCTION"}, + {STATUS_PIPE_DISCONNECTED, -EPIPE, "STATUS_PIPE_DISCONNECTED"}, + {STATUS_PIPE_CLOSING, -EIO, "STATUS_PIPE_CLOSING"}, + {STATUS_PIPE_CONNECTED, -EIO, "STATUS_PIPE_CONNECTED"}, + {STATUS_PIPE_LISTENING, -EIO, "STATUS_PIPE_LISTENING"}, + {STATUS_INVALID_READ_MODE, -EIO, "STATUS_INVALID_READ_MODE"}, + {STATUS_IO_TIMEOUT, -ETIMEDOUT, "STATUS_IO_TIMEOUT"}, + {STATUS_FILE_FORCED_CLOSED, -EIO, "STATUS_FILE_FORCED_CLOSED"}, + {STATUS_PROFILING_NOT_STARTED, -EIO, "STATUS_PROFILING_NOT_STARTED"}, + {STATUS_PROFILING_NOT_STOPPED, -EIO, "STATUS_PROFILING_NOT_STOPPED"}, + {STATUS_COULD_NOT_INTERPRET, -EIO, "STATUS_COULD_NOT_INTERPRET"}, + {STATUS_FILE_IS_A_DIRECTORY, -EISDIR, "STATUS_FILE_IS_A_DIRECTORY"}, + {STATUS_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_NOT_SUPPORTED"}, + {STATUS_REMOTE_NOT_LISTENING, -EHOSTDOWN, + "STATUS_REMOTE_NOT_LISTENING"}, + {STATUS_DUPLICATE_NAME, -ENOTUNIQ, "STATUS_DUPLICATE_NAME"}, + {STATUS_BAD_NETWORK_PATH, -EINVAL, "STATUS_BAD_NETWORK_PATH"}, + {STATUS_NETWORK_BUSY, -EBUSY, "STATUS_NETWORK_BUSY"}, + {STATUS_DEVICE_DOES_NOT_EXIST, -ENODEV, "STATUS_DEVICE_DOES_NOT_EXIST"}, + {STATUS_TOO_MANY_COMMANDS, -EIO, "STATUS_TOO_MANY_COMMANDS"}, + {STATUS_ADAPTER_HARDWARE_ERROR, -EIO, "STATUS_ADAPTER_HARDWARE_ERROR"}, + {STATUS_INVALID_NETWORK_RESPONSE, -EIO, + "STATUS_INVALID_NETWORK_RESPONSE"}, + {STATUS_UNEXPECTED_NETWORK_ERROR, -EIO, + "STATUS_UNEXPECTED_NETWORK_ERROR"}, + {STATUS_BAD_REMOTE_ADAPTER, -EIO, "STATUS_BAD_REMOTE_ADAPTER"}, + {STATUS_PRINT_QUEUE_FULL, -EIO, "STATUS_PRINT_QUEUE_FULL"}, + {STATUS_NO_SPOOL_SPACE, -EIO, "STATUS_NO_SPOOL_SPACE"}, + {STATUS_PRINT_CANCELLED, -EIO, "STATUS_PRINT_CANCELLED"}, + {STATUS_NETWORK_NAME_DELETED, -EIO, "STATUS_NETWORK_NAME_DELETED"}, + {STATUS_NETWORK_ACCESS_DENIED, -EACCES, "STATUS_NETWORK_ACCESS_DENIED"}, + {STATUS_BAD_DEVICE_TYPE, -EIO, "STATUS_BAD_DEVICE_TYPE"}, + {STATUS_BAD_NETWORK_NAME, -ENOENT, "STATUS_BAD_NETWORK_NAME"}, + {STATUS_TOO_MANY_NAMES, -EIO, "STATUS_TOO_MANY_NAMES"}, + {STATUS_TOO_MANY_SESSIONS, -EIO, "STATUS_TOO_MANY_SESSIONS"}, + {STATUS_SHARING_PAUSED, -EIO, "STATUS_SHARING_PAUSED"}, + {STATUS_REQUEST_NOT_ACCEPTED, -EIO, "STATUS_REQUEST_NOT_ACCEPTED"}, + {STATUS_REDIRECTOR_PAUSED, -EIO, "STATUS_REDIRECTOR_PAUSED"}, + {STATUS_NET_WRITE_FAULT, -EIO, "STATUS_NET_WRITE_FAULT"}, + {STATUS_PROFILING_AT_LIMIT, -EIO, "STATUS_PROFILING_AT_LIMIT"}, + {STATUS_NOT_SAME_DEVICE, -EXDEV, "STATUS_NOT_SAME_DEVICE"}, + {STATUS_FILE_RENAMED, -EIO, "STATUS_FILE_RENAMED"}, + {STATUS_VIRTUAL_CIRCUIT_CLOSED, -EIO, "STATUS_VIRTUAL_CIRCUIT_CLOSED"}, + {STATUS_NO_SECURITY_ON_OBJECT, -EIO, "STATUS_NO_SECURITY_ON_OBJECT"}, + {STATUS_CANT_WAIT, -EIO, "STATUS_CANT_WAIT"}, + {STATUS_PIPE_EMPTY, -EIO, "STATUS_PIPE_EMPTY"}, + {STATUS_CANT_ACCESS_DOMAIN_INFO, -EIO, + "STATUS_CANT_ACCESS_DOMAIN_INFO"}, + {STATUS_CANT_TERMINATE_SELF, -EIO, "STATUS_CANT_TERMINATE_SELF"}, + {STATUS_INVALID_SERVER_STATE, -EIO, "STATUS_INVALID_SERVER_STATE"}, + {STATUS_INVALID_DOMAIN_STATE, -EIO, "STATUS_INVALID_DOMAIN_STATE"}, + {STATUS_INVALID_DOMAIN_ROLE, -EIO, "STATUS_INVALID_DOMAIN_ROLE"}, + {STATUS_NO_SUCH_DOMAIN, -EIO, "STATUS_NO_SUCH_DOMAIN"}, + {STATUS_DOMAIN_EXISTS, -EIO, "STATUS_DOMAIN_EXISTS"}, + {STATUS_DOMAIN_LIMIT_EXCEEDED, -EIO, "STATUS_DOMAIN_LIMIT_EXCEEDED"}, + {STATUS_OPLOCK_NOT_GRANTED, -EIO, "STATUS_OPLOCK_NOT_GRANTED"}, + {STATUS_INVALID_OPLOCK_PROTOCOL, -EIO, + "STATUS_INVALID_OPLOCK_PROTOCOL"}, + {STATUS_INTERNAL_DB_CORRUPTION, -EIO, "STATUS_INTERNAL_DB_CORRUPTION"}, + {STATUS_INTERNAL_ERROR, -EIO, "STATUS_INTERNAL_ERROR"}, + {STATUS_GENERIC_NOT_MAPPED, -EIO, "STATUS_GENERIC_NOT_MAPPED"}, + {STATUS_BAD_DESCRIPTOR_FORMAT, -EIO, "STATUS_BAD_DESCRIPTOR_FORMAT"}, + {STATUS_INVALID_USER_BUFFER, -EIO, "STATUS_INVALID_USER_BUFFER"}, + {STATUS_UNEXPECTED_IO_ERROR, -EIO, "STATUS_UNEXPECTED_IO_ERROR"}, + {STATUS_UNEXPECTED_MM_CREATE_ERR, -EIO, + "STATUS_UNEXPECTED_MM_CREATE_ERR"}, + {STATUS_UNEXPECTED_MM_MAP_ERROR, -EIO, + "STATUS_UNEXPECTED_MM_MAP_ERROR"}, + {STATUS_UNEXPECTED_MM_EXTEND_ERR, -EIO, + "STATUS_UNEXPECTED_MM_EXTEND_ERR"}, + {STATUS_NOT_LOGON_PROCESS, -EIO, "STATUS_NOT_LOGON_PROCESS"}, + {STATUS_LOGON_SESSION_EXISTS, -EIO, "STATUS_LOGON_SESSION_EXISTS"}, + {STATUS_INVALID_PARAMETER_1, -EINVAL, "STATUS_INVALID_PARAMETER_1"}, + {STATUS_INVALID_PARAMETER_2, -EINVAL, "STATUS_INVALID_PARAMETER_2"}, + {STATUS_INVALID_PARAMETER_3, -EINVAL, "STATUS_INVALID_PARAMETER_3"}, + {STATUS_INVALID_PARAMETER_4, -EINVAL, "STATUS_INVALID_PARAMETER_4"}, + {STATUS_INVALID_PARAMETER_5, -EINVAL, "STATUS_INVALID_PARAMETER_5"}, + {STATUS_INVALID_PARAMETER_6, -EINVAL, "STATUS_INVALID_PARAMETER_6"}, + {STATUS_INVALID_PARAMETER_7, -EINVAL, "STATUS_INVALID_PARAMETER_7"}, + {STATUS_INVALID_PARAMETER_8, -EINVAL, "STATUS_INVALID_PARAMETER_8"}, + {STATUS_INVALID_PARAMETER_9, -EINVAL, "STATUS_INVALID_PARAMETER_9"}, + {STATUS_INVALID_PARAMETER_10, -EINVAL, "STATUS_INVALID_PARAMETER_10"}, + {STATUS_INVALID_PARAMETER_11, -EINVAL, "STATUS_INVALID_PARAMETER_11"}, + {STATUS_INVALID_PARAMETER_12, -EINVAL, "STATUS_INVALID_PARAMETER_12"}, + {STATUS_REDIRECTOR_NOT_STARTED, -EIO, "STATUS_REDIRECTOR_NOT_STARTED"}, + {STATUS_REDIRECTOR_STARTED, -EIO, "STATUS_REDIRECTOR_STARTED"}, + {STATUS_STACK_OVERFLOW, -EIO, "STATUS_STACK_OVERFLOW"}, + {STATUS_NO_SUCH_PACKAGE, -EIO, "STATUS_NO_SUCH_PACKAGE"}, + {STATUS_BAD_FUNCTION_TABLE, -EIO, "STATUS_BAD_FUNCTION_TABLE"}, + {STATUS_VARIABLE_NOT_FOUND, -EIO, "STATUS_VARIABLE_NOT_FOUND"}, + {STATUS_DIRECTORY_NOT_EMPTY, -ENOTEMPTY, "STATUS_DIRECTORY_NOT_EMPTY"}, + {STATUS_FILE_CORRUPT_ERROR, -EIO, "STATUS_FILE_CORRUPT_ERROR"}, + {STATUS_NOT_A_DIRECTORY, -ENOTDIR, "STATUS_NOT_A_DIRECTORY"}, + {STATUS_BAD_LOGON_SESSION_STATE, -EIO, + "STATUS_BAD_LOGON_SESSION_STATE"}, + {STATUS_LOGON_SESSION_COLLISION, -EIO, + "STATUS_LOGON_SESSION_COLLISION"}, + {STATUS_NAME_TOO_LONG, -ENAMETOOLONG, "STATUS_NAME_TOO_LONG"}, + {STATUS_FILES_OPEN, -EIO, "STATUS_FILES_OPEN"}, + {STATUS_CONNECTION_IN_USE, -EIO, "STATUS_CONNECTION_IN_USE"}, + {STATUS_MESSAGE_NOT_FOUND, -EIO, "STATUS_MESSAGE_NOT_FOUND"}, + {STATUS_PROCESS_IS_TERMINATING, -EIO, "STATUS_PROCESS_IS_TERMINATING"}, + {STATUS_INVALID_LOGON_TYPE, -EIO, "STATUS_INVALID_LOGON_TYPE"}, + {STATUS_NO_GUID_TRANSLATION, -EIO, "STATUS_NO_GUID_TRANSLATION"}, + {STATUS_CANNOT_IMPERSONATE, -EIO, "STATUS_CANNOT_IMPERSONATE"}, + {STATUS_IMAGE_ALREADY_LOADED, -EIO, "STATUS_IMAGE_ALREADY_LOADED"}, + {STATUS_ABIOS_NOT_PRESENT, -EIO, "STATUS_ABIOS_NOT_PRESENT"}, + {STATUS_ABIOS_LID_NOT_EXIST, -EIO, "STATUS_ABIOS_LID_NOT_EXIST"}, + {STATUS_ABIOS_LID_ALREADY_OWNED, -EIO, + "STATUS_ABIOS_LID_ALREADY_OWNED"}, + {STATUS_ABIOS_NOT_LID_OWNER, -EIO, "STATUS_ABIOS_NOT_LID_OWNER"}, + {STATUS_ABIOS_INVALID_COMMAND, -EIO, "STATUS_ABIOS_INVALID_COMMAND"}, + {STATUS_ABIOS_INVALID_LID, -EIO, "STATUS_ABIOS_INVALID_LID"}, + {STATUS_ABIOS_SELECTOR_NOT_AVAILABLE, -EIO, + "STATUS_ABIOS_SELECTOR_NOT_AVAILABLE"}, + {STATUS_ABIOS_INVALID_SELECTOR, -EIO, "STATUS_ABIOS_INVALID_SELECTOR"}, + {STATUS_NO_LDT, -EIO, "STATUS_NO_LDT"}, + {STATUS_INVALID_LDT_SIZE, -EIO, "STATUS_INVALID_LDT_SIZE"}, + {STATUS_INVALID_LDT_OFFSET, -EIO, "STATUS_INVALID_LDT_OFFSET"}, + {STATUS_INVALID_LDT_DESCRIPTOR, -EIO, "STATUS_INVALID_LDT_DESCRIPTOR"}, + {STATUS_INVALID_IMAGE_NE_FORMAT, -EIO, + "STATUS_INVALID_IMAGE_NE_FORMAT"}, + {STATUS_RXACT_INVALID_STATE, -EIO, "STATUS_RXACT_INVALID_STATE"}, + {STATUS_RXACT_COMMIT_FAILURE, -EIO, "STATUS_RXACT_COMMIT_FAILURE"}, + {STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"}, + {STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"}, + {STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"}, + {STATUS_CANNOT_DELETE, -EACCES, "STATUS_CANNOT_DELETE"}, + {STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"}, + {STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"}, + {STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"}, + {STATUS_SPECIAL_GROUP, -EIO, "STATUS_SPECIAL_GROUP"}, + {STATUS_SPECIAL_USER, -EIO, "STATUS_SPECIAL_USER"}, + {STATUS_MEMBERS_PRIMARY_GROUP, -EIO, "STATUS_MEMBERS_PRIMARY_GROUP"}, + {STATUS_FILE_CLOSED, -EBADF, "STATUS_FILE_CLOSED"}, + {STATUS_TOO_MANY_THREADS, -EIO, "STATUS_TOO_MANY_THREADS"}, + {STATUS_THREAD_NOT_IN_PROCESS, -EIO, "STATUS_THREAD_NOT_IN_PROCESS"}, + {STATUS_TOKEN_ALREADY_IN_USE, -EIO, "STATUS_TOKEN_ALREADY_IN_USE"}, + {STATUS_PAGEFILE_QUOTA_EXCEEDED, -EDQUOT, + "STATUS_PAGEFILE_QUOTA_EXCEEDED"}, + {STATUS_COMMITMENT_LIMIT, -EIO, "STATUS_COMMITMENT_LIMIT"}, + {STATUS_INVALID_IMAGE_LE_FORMAT, -EIO, + "STATUS_INVALID_IMAGE_LE_FORMAT"}, + {STATUS_INVALID_IMAGE_NOT_MZ, -EIO, "STATUS_INVALID_IMAGE_NOT_MZ"}, + {STATUS_INVALID_IMAGE_PROTECT, -EIO, "STATUS_INVALID_IMAGE_PROTECT"}, + {STATUS_INVALID_IMAGE_WIN_16, -EIO, "STATUS_INVALID_IMAGE_WIN_16"}, + {STATUS_LOGON_SERVER_CONFLICT, -EIO, "STATUS_LOGON_SERVER_CONFLICT"}, + {STATUS_TIME_DIFFERENCE_AT_DC, -EIO, "STATUS_TIME_DIFFERENCE_AT_DC"}, + {STATUS_SYNCHRONIZATION_REQUIRED, -EIO, + "STATUS_SYNCHRONIZATION_REQUIRED"}, + {STATUS_DLL_NOT_FOUND, -ENOENT, "STATUS_DLL_NOT_FOUND"}, + {STATUS_OPEN_FAILED, -EIO, "STATUS_OPEN_FAILED"}, + {STATUS_IO_PRIVILEGE_FAILED, -EIO, "STATUS_IO_PRIVILEGE_FAILED"}, + {STATUS_ORDINAL_NOT_FOUND, -EIO, "STATUS_ORDINAL_NOT_FOUND"}, + {STATUS_ENTRYPOINT_NOT_FOUND, -EIO, "STATUS_ENTRYPOINT_NOT_FOUND"}, + {STATUS_CONTROL_C_EXIT, -EIO, "STATUS_CONTROL_C_EXIT"}, + {STATUS_LOCAL_DISCONNECT, -EIO, "STATUS_LOCAL_DISCONNECT"}, + {STATUS_REMOTE_DISCONNECT, -ESHUTDOWN, "STATUS_REMOTE_DISCONNECT"}, + {STATUS_REMOTE_RESOURCES, -EIO, "STATUS_REMOTE_RESOURCES"}, + {STATUS_LINK_FAILED, -EXDEV, "STATUS_LINK_FAILED"}, + {STATUS_LINK_TIMEOUT, -ETIMEDOUT, "STATUS_LINK_TIMEOUT"}, + {STATUS_INVALID_CONNECTION, -EIO, "STATUS_INVALID_CONNECTION"}, + {STATUS_INVALID_ADDRESS, -EIO, "STATUS_INVALID_ADDRESS"}, + {STATUS_DLL_INIT_FAILED, -EIO, "STATUS_DLL_INIT_FAILED"}, + {STATUS_MISSING_SYSTEMFILE, -EIO, "STATUS_MISSING_SYSTEMFILE"}, + {STATUS_UNHANDLED_EXCEPTION, -EIO, "STATUS_UNHANDLED_EXCEPTION"}, + {STATUS_APP_INIT_FAILURE, -EIO, "STATUS_APP_INIT_FAILURE"}, + {STATUS_PAGEFILE_CREATE_FAILED, -EIO, "STATUS_PAGEFILE_CREATE_FAILED"}, + {STATUS_NO_PAGEFILE, -EIO, "STATUS_NO_PAGEFILE"}, + {STATUS_INVALID_LEVEL, -EIO, "STATUS_INVALID_LEVEL"}, + {STATUS_WRONG_PASSWORD_CORE, -EIO, "STATUS_WRONG_PASSWORD_CORE"}, + {STATUS_ILLEGAL_FLOAT_CONTEXT, -EIO, "STATUS_ILLEGAL_FLOAT_CONTEXT"}, + {STATUS_PIPE_BROKEN, -EPIPE, "STATUS_PIPE_BROKEN"}, + {STATUS_REGISTRY_CORRUPT, -EIO, "STATUS_REGISTRY_CORRUPT"}, + {STATUS_REGISTRY_IO_FAILED, -EIO, "STATUS_REGISTRY_IO_FAILED"}, + {STATUS_NO_EVENT_PAIR, -EIO, "STATUS_NO_EVENT_PAIR"}, + {STATUS_UNRECOGNIZED_VOLUME, -EIO, "STATUS_UNRECOGNIZED_VOLUME"}, + {STATUS_SERIAL_NO_DEVICE_INITED, -EIO, + "STATUS_SERIAL_NO_DEVICE_INITED"}, + {STATUS_NO_SUCH_ALIAS, -EIO, "STATUS_NO_SUCH_ALIAS"}, + {STATUS_MEMBER_NOT_IN_ALIAS, -EIO, "STATUS_MEMBER_NOT_IN_ALIAS"}, + {STATUS_MEMBER_IN_ALIAS, -EIO, "STATUS_MEMBER_IN_ALIAS"}, + {STATUS_ALIAS_EXISTS, -EIO, "STATUS_ALIAS_EXISTS"}, + {STATUS_LOGON_NOT_GRANTED, -EIO, "STATUS_LOGON_NOT_GRANTED"}, + {STATUS_TOO_MANY_SECRETS, -EIO, "STATUS_TOO_MANY_SECRETS"}, + {STATUS_SECRET_TOO_LONG, -EIO, "STATUS_SECRET_TOO_LONG"}, + {STATUS_INTERNAL_DB_ERROR, -EIO, "STATUS_INTERNAL_DB_ERROR"}, + {STATUS_FULLSCREEN_MODE, -EIO, "STATUS_FULLSCREEN_MODE"}, + {STATUS_TOO_MANY_CONTEXT_IDS, -EIO, "STATUS_TOO_MANY_CONTEXT_IDS"}, + {STATUS_LOGON_TYPE_NOT_GRANTED, -EIO, "STATUS_LOGON_TYPE_NOT_GRANTED"}, + {STATUS_NOT_REGISTRY_FILE, -EIO, "STATUS_NOT_REGISTRY_FILE"}, + {STATUS_NT_CROSS_ENCRYPTION_REQUIRED, -EIO, + "STATUS_NT_CROSS_ENCRYPTION_REQUIRED"}, + {STATUS_DOMAIN_CTRLR_CONFIG_ERROR, -EIO, + "STATUS_DOMAIN_CTRLR_CONFIG_ERROR"}, + {STATUS_FT_MISSING_MEMBER, -EIO, "STATUS_FT_MISSING_MEMBER"}, + {STATUS_ILL_FORMED_SERVICE_ENTRY, -EIO, + "STATUS_ILL_FORMED_SERVICE_ENTRY"}, + {STATUS_ILLEGAL_CHARACTER, -EIO, "STATUS_ILLEGAL_CHARACTER"}, + {STATUS_UNMAPPABLE_CHARACTER, -EIO, "STATUS_UNMAPPABLE_CHARACTER"}, + {STATUS_UNDEFINED_CHARACTER, -EIO, "STATUS_UNDEFINED_CHARACTER"}, + {STATUS_FLOPPY_VOLUME, -EIO, "STATUS_FLOPPY_VOLUME"}, + {STATUS_FLOPPY_ID_MARK_NOT_FOUND, -EIO, + "STATUS_FLOPPY_ID_MARK_NOT_FOUND"}, + {STATUS_FLOPPY_WRONG_CYLINDER, -EIO, "STATUS_FLOPPY_WRONG_CYLINDER"}, + {STATUS_FLOPPY_UNKNOWN_ERROR, -EIO, "STATUS_FLOPPY_UNKNOWN_ERROR"}, + {STATUS_FLOPPY_BAD_REGISTERS, -EIO, "STATUS_FLOPPY_BAD_REGISTERS"}, + {STATUS_DISK_RECALIBRATE_FAILED, -EIO, + "STATUS_DISK_RECALIBRATE_FAILED"}, + {STATUS_DISK_OPERATION_FAILED, -EIO, "STATUS_DISK_OPERATION_FAILED"}, + {STATUS_DISK_RESET_FAILED, -EIO, "STATUS_DISK_RESET_FAILED"}, + {STATUS_SHARED_IRQ_BUSY, -EBUSY, "STATUS_SHARED_IRQ_BUSY"}, + {STATUS_FT_ORPHANING, -EIO, "STATUS_FT_ORPHANING"}, + {STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT, -EIO, + "STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT"}, + {STATUS_PARTITION_FAILURE, -EIO, "STATUS_PARTITION_FAILURE"}, + {STATUS_INVALID_BLOCK_LENGTH, -EIO, "STATUS_INVALID_BLOCK_LENGTH"}, + {STATUS_DEVICE_NOT_PARTITIONED, -EIO, "STATUS_DEVICE_NOT_PARTITIONED"}, + {STATUS_UNABLE_TO_LOCK_MEDIA, -EIO, "STATUS_UNABLE_TO_LOCK_MEDIA"}, + {STATUS_UNABLE_TO_UNLOAD_MEDIA, -EIO, "STATUS_UNABLE_TO_UNLOAD_MEDIA"}, + {STATUS_EOM_OVERFLOW, -EIO, "STATUS_EOM_OVERFLOW"}, + {STATUS_NO_MEDIA, -EIO, "STATUS_NO_MEDIA"}, + {STATUS_NO_SUCH_MEMBER, -EIO, "STATUS_NO_SUCH_MEMBER"}, + {STATUS_INVALID_MEMBER, -EIO, "STATUS_INVALID_MEMBER"}, + {STATUS_KEY_DELETED, -EIO, "STATUS_KEY_DELETED"}, + {STATUS_NO_LOG_SPACE, -EIO, "STATUS_NO_LOG_SPACE"}, + {STATUS_TOO_MANY_SIDS, -EIO, "STATUS_TOO_MANY_SIDS"}, + {STATUS_LM_CROSS_ENCRYPTION_REQUIRED, -EIO, + "STATUS_LM_CROSS_ENCRYPTION_REQUIRED"}, + {STATUS_KEY_HAS_CHILDREN, -EIO, "STATUS_KEY_HAS_CHILDREN"}, + {STATUS_CHILD_MUST_BE_VOLATILE, -EIO, "STATUS_CHILD_MUST_BE_VOLATILE"}, + {STATUS_DEVICE_CONFIGURATION_ERROR, -EIO, + "STATUS_DEVICE_CONFIGURATION_ERROR"}, + {STATUS_DRIVER_INTERNAL_ERROR, -EIO, "STATUS_DRIVER_INTERNAL_ERROR"}, + {STATUS_INVALID_DEVICE_STATE, -EIO, "STATUS_INVALID_DEVICE_STATE"}, + {STATUS_IO_DEVICE_ERROR, -EIO, "STATUS_IO_DEVICE_ERROR"}, + {STATUS_DEVICE_PROTOCOL_ERROR, -EIO, "STATUS_DEVICE_PROTOCOL_ERROR"}, + {STATUS_BACKUP_CONTROLLER, -EIO, "STATUS_BACKUP_CONTROLLER"}, + {STATUS_LOG_FILE_FULL, -EIO, "STATUS_LOG_FILE_FULL"}, + {STATUS_TOO_LATE, -EIO, "STATUS_TOO_LATE"}, + {STATUS_NO_TRUST_LSA_SECRET, -EIO, "STATUS_NO_TRUST_LSA_SECRET"}, + {STATUS_NO_TRUST_SAM_ACCOUNT, -EIO, "STATUS_NO_TRUST_SAM_ACCOUNT"}, + {STATUS_TRUSTED_DOMAIN_FAILURE, -EIO, "STATUS_TRUSTED_DOMAIN_FAILURE"}, + {STATUS_TRUSTED_RELATIONSHIP_FAILURE, -EIO, + "STATUS_TRUSTED_RELATIONSHIP_FAILURE"}, + {STATUS_EVENTLOG_FILE_CORRUPT, -EIO, "STATUS_EVENTLOG_FILE_CORRUPT"}, + {STATUS_EVENTLOG_CANT_START, -EIO, "STATUS_EVENTLOG_CANT_START"}, + {STATUS_TRUST_FAILURE, -EIO, "STATUS_TRUST_FAILURE"}, + {STATUS_MUTANT_LIMIT_EXCEEDED, -EIO, "STATUS_MUTANT_LIMIT_EXCEEDED"}, + {STATUS_NETLOGON_NOT_STARTED, -EIO, "STATUS_NETLOGON_NOT_STARTED"}, + {STATUS_ACCOUNT_EXPIRED, -EKEYEXPIRED, "STATUS_ACCOUNT_EXPIRED"}, + {STATUS_POSSIBLE_DEADLOCK, -EIO, "STATUS_POSSIBLE_DEADLOCK"}, + {STATUS_NETWORK_CREDENTIAL_CONFLICT, -EIO, + "STATUS_NETWORK_CREDENTIAL_CONFLICT"}, + {STATUS_REMOTE_SESSION_LIMIT, -EIO, "STATUS_REMOTE_SESSION_LIMIT"}, + {STATUS_EVENTLOG_FILE_CHANGED, -EIO, "STATUS_EVENTLOG_FILE_CHANGED"}, + {STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT, -EIO, + "STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT"}, + {STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT, -EIO, + "STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT"}, + {STATUS_NOLOGON_SERVER_TRUST_ACCOUNT, -EIO, + "STATUS_NOLOGON_SERVER_TRUST_ACCOUNT"}, + {STATUS_DOMAIN_TRUST_INCONSISTENT, -EIO, + "STATUS_DOMAIN_TRUST_INCONSISTENT"}, + {STATUS_FS_DRIVER_REQUIRED, -EIO, "STATUS_FS_DRIVER_REQUIRED"}, + {STATUS_IMAGE_ALREADY_LOADED_AS_DLL, -EIO, + "STATUS_IMAGE_ALREADY_LOADED_AS_DLL"}, + {STATUS_NETWORK_OPEN_RESTRICTION, -EIO, + "STATUS_NETWORK_OPEN_RESTRICTION"}, + {STATUS_NO_USER_SESSION_KEY, -EIO, "STATUS_NO_USER_SESSION_KEY"}, + {STATUS_USER_SESSION_DELETED, -EIO, "STATUS_USER_SESSION_DELETED"}, + {STATUS_RESOURCE_LANG_NOT_FOUND, -EIO, + "STATUS_RESOURCE_LANG_NOT_FOUND"}, + {STATUS_INSUFF_SERVER_RESOURCES, -EIO, + "STATUS_INSUFF_SERVER_RESOURCES"}, + {STATUS_INVALID_BUFFER_SIZE, -EIO, "STATUS_INVALID_BUFFER_SIZE"}, + {STATUS_INVALID_ADDRESS_COMPONENT, -EIO, + "STATUS_INVALID_ADDRESS_COMPONENT"}, + {STATUS_INVALID_ADDRESS_WILDCARD, -EIO, + "STATUS_INVALID_ADDRESS_WILDCARD"}, + {STATUS_TOO_MANY_ADDRESSES, -EIO, "STATUS_TOO_MANY_ADDRESSES"}, + {STATUS_ADDRESS_ALREADY_EXISTS, -EADDRINUSE, + "STATUS_ADDRESS_ALREADY_EXISTS"}, + {STATUS_ADDRESS_CLOSED, -EIO, "STATUS_ADDRESS_CLOSED"}, + {STATUS_CONNECTION_DISCONNECTED, -ECONNABORTED, + "STATUS_CONNECTION_DISCONNECTED"}, + {STATUS_CONNECTION_RESET, -ENETRESET, "STATUS_CONNECTION_RESET"}, + {STATUS_TOO_MANY_NODES, -EIO, "STATUS_TOO_MANY_NODES"}, + {STATUS_TRANSACTION_ABORTED, -EIO, "STATUS_TRANSACTION_ABORTED"}, + {STATUS_TRANSACTION_TIMED_OUT, -EIO, "STATUS_TRANSACTION_TIMED_OUT"}, + {STATUS_TRANSACTION_NO_RELEASE, -EIO, "STATUS_TRANSACTION_NO_RELEASE"}, + {STATUS_TRANSACTION_NO_MATCH, -EIO, "STATUS_TRANSACTION_NO_MATCH"}, + {STATUS_TRANSACTION_RESPONDED, -EIO, "STATUS_TRANSACTION_RESPONDED"}, + {STATUS_TRANSACTION_INVALID_ID, -EIO, "STATUS_TRANSACTION_INVALID_ID"}, + {STATUS_TRANSACTION_INVALID_TYPE, -EIO, + "STATUS_TRANSACTION_INVALID_TYPE"}, + {STATUS_NOT_SERVER_SESSION, -EIO, "STATUS_NOT_SERVER_SESSION"}, + {STATUS_NOT_CLIENT_SESSION, -EIO, "STATUS_NOT_CLIENT_SESSION"}, + {STATUS_CANNOT_LOAD_REGISTRY_FILE, -EIO, + "STATUS_CANNOT_LOAD_REGISTRY_FILE"}, + {STATUS_DEBUG_ATTACH_FAILED, -EIO, "STATUS_DEBUG_ATTACH_FAILED"}, + {STATUS_SYSTEM_PROCESS_TERMINATED, -EIO, + "STATUS_SYSTEM_PROCESS_TERMINATED"}, + {STATUS_DATA_NOT_ACCEPTED, -EIO, "STATUS_DATA_NOT_ACCEPTED"}, + {STATUS_NO_BROWSER_SERVERS_FOUND, -EIO, + "STATUS_NO_BROWSER_SERVERS_FOUND"}, + {STATUS_VDM_HARD_ERROR, -EIO, "STATUS_VDM_HARD_ERROR"}, + {STATUS_DRIVER_CANCEL_TIMEOUT, -EIO, "STATUS_DRIVER_CANCEL_TIMEOUT"}, + {STATUS_REPLY_MESSAGE_MISMATCH, -EIO, "STATUS_REPLY_MESSAGE_MISMATCH"}, + {STATUS_MAPPED_ALIGNMENT, -EIO, "STATUS_MAPPED_ALIGNMENT"}, + {STATUS_IMAGE_CHECKSUM_MISMATCH, -EIO, + "STATUS_IMAGE_CHECKSUM_MISMATCH"}, + {STATUS_LOST_WRITEBEHIND_DATA, -EIO, "STATUS_LOST_WRITEBEHIND_DATA"}, + {STATUS_CLIENT_SERVER_PARAMETERS_INVALID, -EIO, + "STATUS_CLIENT_SERVER_PARAMETERS_INVALID"}, + {STATUS_PASSWORD_MUST_CHANGE, -EIO, "STATUS_PASSWORD_MUST_CHANGE"}, + {STATUS_NOT_FOUND, -ENOENT, "STATUS_NOT_FOUND"}, + {STATUS_NOT_TINY_STREAM, -EIO, "STATUS_NOT_TINY_STREAM"}, + {STATUS_RECOVERY_FAILURE, -EIO, "STATUS_RECOVERY_FAILURE"}, + {STATUS_STACK_OVERFLOW_READ, -EIO, "STATUS_STACK_OVERFLOW_READ"}, + {STATUS_FAIL_CHECK, -EIO, "STATUS_FAIL_CHECK"}, + {STATUS_DUPLICATE_OBJECTID, -EIO, "STATUS_DUPLICATE_OBJECTID"}, + {STATUS_OBJECTID_EXISTS, -EIO, "STATUS_OBJECTID_EXISTS"}, + {STATUS_CONVERT_TO_LARGE, -EIO, "STATUS_CONVERT_TO_LARGE"}, + {STATUS_RETRY, -EAGAIN, "STATUS_RETRY"}, + {STATUS_FOUND_OUT_OF_SCOPE, -EIO, "STATUS_FOUND_OUT_OF_SCOPE"}, + {STATUS_ALLOCATE_BUCKET, -EIO, "STATUS_ALLOCATE_BUCKET"}, + {STATUS_PROPSET_NOT_FOUND, -EIO, "STATUS_PROPSET_NOT_FOUND"}, + {STATUS_MARSHALL_OVERFLOW, -EIO, "STATUS_MARSHALL_OVERFLOW"}, + {STATUS_INVALID_VARIANT, -EIO, "STATUS_INVALID_VARIANT"}, + {STATUS_DOMAIN_CONTROLLER_NOT_FOUND, -EIO, + "STATUS_DOMAIN_CONTROLLER_NOT_FOUND"}, + {STATUS_ACCOUNT_LOCKED_OUT, -EIO, "STATUS_ACCOUNT_LOCKED_OUT"}, + {STATUS_HANDLE_NOT_CLOSABLE, -EIO, "STATUS_HANDLE_NOT_CLOSABLE"}, + {STATUS_CONNECTION_REFUSED, -EIO, "STATUS_CONNECTION_REFUSED"}, + {STATUS_GRACEFUL_DISCONNECT, -EIO, "STATUS_GRACEFUL_DISCONNECT"}, + {STATUS_ADDRESS_ALREADY_ASSOCIATED, -EIO, + "STATUS_ADDRESS_ALREADY_ASSOCIATED"}, + {STATUS_ADDRESS_NOT_ASSOCIATED, -EIO, "STATUS_ADDRESS_NOT_ASSOCIATED"}, + {STATUS_CONNECTION_INVALID, -EIO, "STATUS_CONNECTION_INVALID"}, + {STATUS_CONNECTION_ACTIVE, -EIO, "STATUS_CONNECTION_ACTIVE"}, + {STATUS_NETWORK_UNREACHABLE, -ENETUNREACH, + "STATUS_NETWORK_UNREACHABLE"}, + {STATUS_HOST_UNREACHABLE, -EHOSTDOWN, "STATUS_HOST_UNREACHABLE"}, + {STATUS_PROTOCOL_UNREACHABLE, -ENETUNREACH, + "STATUS_PROTOCOL_UNREACHABLE"}, + {STATUS_PORT_UNREACHABLE, -ENETUNREACH, "STATUS_PORT_UNREACHABLE"}, + {STATUS_REQUEST_ABORTED, -EIO, "STATUS_REQUEST_ABORTED"}, + {STATUS_CONNECTION_ABORTED, -ECONNABORTED, "STATUS_CONNECTION_ABORTED"}, + {STATUS_BAD_COMPRESSION_BUFFER, -EIO, "STATUS_BAD_COMPRESSION_BUFFER"}, + {STATUS_USER_MAPPED_FILE, -EIO, "STATUS_USER_MAPPED_FILE"}, + {STATUS_AUDIT_FAILED, -EIO, "STATUS_AUDIT_FAILED"}, + {STATUS_TIMER_RESOLUTION_NOT_SET, -EIO, + "STATUS_TIMER_RESOLUTION_NOT_SET"}, + {STATUS_CONNECTION_COUNT_LIMIT, -EIO, "STATUS_CONNECTION_COUNT_LIMIT"}, + {STATUS_LOGIN_TIME_RESTRICTION, -EACCES, + "STATUS_LOGIN_TIME_RESTRICTION"}, + {STATUS_LOGIN_WKSTA_RESTRICTION, -EACCES, + "STATUS_LOGIN_WKSTA_RESTRICTION"}, + {STATUS_IMAGE_MP_UP_MISMATCH, -EIO, "STATUS_IMAGE_MP_UP_MISMATCH"}, + {STATUS_INSUFFICIENT_LOGON_INFO, -EIO, + "STATUS_INSUFFICIENT_LOGON_INFO"}, + {STATUS_BAD_DLL_ENTRYPOINT, -EIO, "STATUS_BAD_DLL_ENTRYPOINT"}, + {STATUS_BAD_SERVICE_ENTRYPOINT, -EIO, "STATUS_BAD_SERVICE_ENTRYPOINT"}, + {STATUS_LPC_REPLY_LOST, -EIO, "STATUS_LPC_REPLY_LOST"}, + {STATUS_IP_ADDRESS_CONFLICT1, -EIO, "STATUS_IP_ADDRESS_CONFLICT1"}, + {STATUS_IP_ADDRESS_CONFLICT2, -EIO, "STATUS_IP_ADDRESS_CONFLICT2"}, + {STATUS_REGISTRY_QUOTA_LIMIT, -EDQUOT, "STATUS_REGISTRY_QUOTA_LIMIT"}, + {STATUS_PATH_NOT_COVERED, -EREMOTE, "STATUS_PATH_NOT_COVERED"}, + {STATUS_NO_CALLBACK_ACTIVE, -EIO, "STATUS_NO_CALLBACK_ACTIVE"}, + {STATUS_LICENSE_QUOTA_EXCEEDED, -EACCES, + "STATUS_LICENSE_QUOTA_EXCEEDED"}, + {STATUS_PWD_TOO_SHORT, -EIO, "STATUS_PWD_TOO_SHORT"}, + {STATUS_PWD_TOO_RECENT, -EIO, "STATUS_PWD_TOO_RECENT"}, + {STATUS_PWD_HISTORY_CONFLICT, -EIO, "STATUS_PWD_HISTORY_CONFLICT"}, + {STATUS_PLUGPLAY_NO_DEVICE, -EIO, "STATUS_PLUGPLAY_NO_DEVICE"}, + {STATUS_UNSUPPORTED_COMPRESSION, -EIO, + "STATUS_UNSUPPORTED_COMPRESSION"}, + {STATUS_INVALID_HW_PROFILE, -EIO, "STATUS_INVALID_HW_PROFILE"}, + {STATUS_INVALID_PLUGPLAY_DEVICE_PATH, -EIO, + "STATUS_INVALID_PLUGPLAY_DEVICE_PATH"}, + {STATUS_DRIVER_ORDINAL_NOT_FOUND, -EIO, + "STATUS_DRIVER_ORDINAL_NOT_FOUND"}, + {STATUS_DRIVER_ENTRYPOINT_NOT_FOUND, -EIO, + "STATUS_DRIVER_ENTRYPOINT_NOT_FOUND"}, + {STATUS_RESOURCE_NOT_OWNED, -EIO, "STATUS_RESOURCE_NOT_OWNED"}, + {STATUS_TOO_MANY_LINKS, -EMLINK, "STATUS_TOO_MANY_LINKS"}, + {STATUS_QUOTA_LIST_INCONSISTENT, -EIO, + "STATUS_QUOTA_LIST_INCONSISTENT"}, + {STATUS_FILE_IS_OFFLINE, -EIO, "STATUS_FILE_IS_OFFLINE"}, + {STATUS_EVALUATION_EXPIRATION, -EIO, "STATUS_EVALUATION_EXPIRATION"}, + {STATUS_ILLEGAL_DLL_RELOCATION, -EIO, "STATUS_ILLEGAL_DLL_RELOCATION"}, + {STATUS_LICENSE_VIOLATION, -EIO, "STATUS_LICENSE_VIOLATION"}, + {STATUS_DLL_INIT_FAILED_LOGOFF, -EIO, "STATUS_DLL_INIT_FAILED_LOGOFF"}, + {STATUS_DRIVER_UNABLE_TO_LOAD, -EIO, "STATUS_DRIVER_UNABLE_TO_LOAD"}, + {STATUS_DFS_UNAVAILABLE, -EIO, "STATUS_DFS_UNAVAILABLE"}, + {STATUS_VOLUME_DISMOUNTED, -EIO, "STATUS_VOLUME_DISMOUNTED"}, + {STATUS_WX86_INTERNAL_ERROR, -EIO, "STATUS_WX86_INTERNAL_ERROR"}, + {STATUS_WX86_FLOAT_STACK_CHECK, -EIO, "STATUS_WX86_FLOAT_STACK_CHECK"}, + {STATUS_VALIDATE_CONTINUE, -EIO, "STATUS_VALIDATE_CONTINUE"}, + {STATUS_NO_MATCH, -EIO, "STATUS_NO_MATCH"}, + {STATUS_NO_MORE_MATCHES, -EIO, "STATUS_NO_MORE_MATCHES"}, + {STATUS_NOT_A_REPARSE_POINT, -EIO, "STATUS_NOT_A_REPARSE_POINT"}, + {STATUS_IO_REPARSE_TAG_INVALID, -EIO, "STATUS_IO_REPARSE_TAG_INVALID"}, + {STATUS_IO_REPARSE_TAG_MISMATCH, -EIO, + "STATUS_IO_REPARSE_TAG_MISMATCH"}, + {STATUS_IO_REPARSE_DATA_INVALID, -EIO, + "STATUS_IO_REPARSE_DATA_INVALID"}, + {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO, + "STATUS_IO_REPARSE_TAG_NOT_HANDLED"}, + {STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO, + "STATUS_REPARSE_POINT_NOT_RESOLVED"}, + {STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO, + "STATUS_DIRECTORY_IS_A_REPARSE_POINT"}, + {STATUS_RANGE_LIST_CONFLICT, -EIO, "STATUS_RANGE_LIST_CONFLICT"}, + {STATUS_SOURCE_ELEMENT_EMPTY, -EIO, "STATUS_SOURCE_ELEMENT_EMPTY"}, + {STATUS_DESTINATION_ELEMENT_FULL, -EIO, + "STATUS_DESTINATION_ELEMENT_FULL"}, + {STATUS_ILLEGAL_ELEMENT_ADDRESS, -EIO, + "STATUS_ILLEGAL_ELEMENT_ADDRESS"}, + {STATUS_MAGAZINE_NOT_PRESENT, -EIO, "STATUS_MAGAZINE_NOT_PRESENT"}, + {STATUS_REINITIALIZATION_NEEDED, -EIO, + "STATUS_REINITIALIZATION_NEEDED"}, + {STATUS_ENCRYPTION_FAILED, -EIO, "STATUS_ENCRYPTION_FAILED"}, + {STATUS_DECRYPTION_FAILED, -EIO, "STATUS_DECRYPTION_FAILED"}, + {STATUS_RANGE_NOT_FOUND, -EIO, "STATUS_RANGE_NOT_FOUND"}, + {STATUS_NO_RECOVERY_POLICY, -EIO, "STATUS_NO_RECOVERY_POLICY"}, + {STATUS_NO_EFS, -EIO, "STATUS_NO_EFS"}, + {STATUS_WRONG_EFS, -EIO, "STATUS_WRONG_EFS"}, + {STATUS_NO_USER_KEYS, -EIO, "STATUS_NO_USER_KEYS"}, + {STATUS_FILE_NOT_ENCRYPTED, -EIO, "STATUS_FILE_NOT_ENCRYPTED"}, + {STATUS_NOT_EXPORT_FORMAT, -EIO, "STATUS_NOT_EXPORT_FORMAT"}, + {STATUS_FILE_ENCRYPTED, -EIO, "STATUS_FILE_ENCRYPTED"}, + {STATUS_WMI_GUID_NOT_FOUND, -EIO, "STATUS_WMI_GUID_NOT_FOUND"}, + {STATUS_WMI_INSTANCE_NOT_FOUND, -EIO, "STATUS_WMI_INSTANCE_NOT_FOUND"}, + {STATUS_WMI_ITEMID_NOT_FOUND, -EIO, "STATUS_WMI_ITEMID_NOT_FOUND"}, + {STATUS_WMI_TRY_AGAIN, -EIO, "STATUS_WMI_TRY_AGAIN"}, + {STATUS_SHARED_POLICY, -EIO, "STATUS_SHARED_POLICY"}, + {STATUS_POLICY_OBJECT_NOT_FOUND, -EIO, + "STATUS_POLICY_OBJECT_NOT_FOUND"}, + {STATUS_POLICY_ONLY_IN_DS, -EIO, "STATUS_POLICY_ONLY_IN_DS"}, + {STATUS_VOLUME_NOT_UPGRADED, -EIO, "STATUS_VOLUME_NOT_UPGRADED"}, + {STATUS_REMOTE_STORAGE_NOT_ACTIVE, -EIO, + "STATUS_REMOTE_STORAGE_NOT_ACTIVE"}, + {STATUS_REMOTE_STORAGE_MEDIA_ERROR, -EIO, + "STATUS_REMOTE_STORAGE_MEDIA_ERROR"}, + {STATUS_NO_TRACKING_SERVICE, -EIO, "STATUS_NO_TRACKING_SERVICE"}, + {STATUS_SERVER_SID_MISMATCH, -EIO, "STATUS_SERVER_SID_MISMATCH"}, + {STATUS_DS_NO_ATTRIBUTE_OR_VALUE, -EIO, + "STATUS_DS_NO_ATTRIBUTE_OR_VALUE"}, + {STATUS_DS_INVALID_ATTRIBUTE_SYNTAX, -EIO, + "STATUS_DS_INVALID_ATTRIBUTE_SYNTAX"}, + {STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED, -EIO, + "STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED"}, + {STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS, -EIO, + "STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS"}, + {STATUS_DS_BUSY, -EBUSY, "STATUS_DS_BUSY"}, + {STATUS_DS_UNAVAILABLE, -EIO, "STATUS_DS_UNAVAILABLE"}, + {STATUS_DS_NO_RIDS_ALLOCATED, -EIO, "STATUS_DS_NO_RIDS_ALLOCATED"}, + {STATUS_DS_NO_MORE_RIDS, -EIO, "STATUS_DS_NO_MORE_RIDS"}, + {STATUS_DS_INCORRECT_ROLE_OWNER, -EIO, + "STATUS_DS_INCORRECT_ROLE_OWNER"}, + {STATUS_DS_RIDMGR_INIT_ERROR, -EIO, "STATUS_DS_RIDMGR_INIT_ERROR"}, + {STATUS_DS_OBJ_CLASS_VIOLATION, -EIO, "STATUS_DS_OBJ_CLASS_VIOLATION"}, + {STATUS_DS_CANT_ON_NON_LEAF, -EIO, "STATUS_DS_CANT_ON_NON_LEAF"}, + {STATUS_DS_CANT_ON_RDN, -EIO, "STATUS_DS_CANT_ON_RDN"}, + {STATUS_DS_CANT_MOD_OBJ_CLASS, -EIO, "STATUS_DS_CANT_MOD_OBJ_CLASS"}, + {STATUS_DS_CROSS_DOM_MOVE_FAILED, -EIO, + "STATUS_DS_CROSS_DOM_MOVE_FAILED"}, + {STATUS_DS_GC_NOT_AVAILABLE, -EIO, "STATUS_DS_GC_NOT_AVAILABLE"}, + {STATUS_DIRECTORY_SERVICE_REQUIRED, -EIO, + "STATUS_DIRECTORY_SERVICE_REQUIRED"}, + {STATUS_REPARSE_ATTRIBUTE_CONFLICT, -EIO, + "STATUS_REPARSE_ATTRIBUTE_CONFLICT"}, + {STATUS_CANT_ENABLE_DENY_ONLY, -EIO, "STATUS_CANT_ENABLE_DENY_ONLY"}, + {STATUS_FLOAT_MULTIPLE_FAULTS, -EIO, "STATUS_FLOAT_MULTIPLE_FAULTS"}, + {STATUS_FLOAT_MULTIPLE_TRAPS, -EIO, "STATUS_FLOAT_MULTIPLE_TRAPS"}, + {STATUS_DEVICE_REMOVED, -EIO, "STATUS_DEVICE_REMOVED"}, + {STATUS_JOURNAL_DELETE_IN_PROGRESS, -EIO, + "STATUS_JOURNAL_DELETE_IN_PROGRESS"}, + {STATUS_JOURNAL_NOT_ACTIVE, -EIO, "STATUS_JOURNAL_NOT_ACTIVE"}, + {STATUS_NOINTERFACE, -EIO, "STATUS_NOINTERFACE"}, + {STATUS_DS_ADMIN_LIMIT_EXCEEDED, -EIO, + "STATUS_DS_ADMIN_LIMIT_EXCEEDED"}, + {STATUS_DRIVER_FAILED_SLEEP, -EIO, "STATUS_DRIVER_FAILED_SLEEP"}, + {STATUS_MUTUAL_AUTHENTICATION_FAILED, -EIO, + "STATUS_MUTUAL_AUTHENTICATION_FAILED"}, + {STATUS_CORRUPT_SYSTEM_FILE, -EIO, "STATUS_CORRUPT_SYSTEM_FILE"}, + {STATUS_DATATYPE_MISALIGNMENT_ERROR, -EIO, + "STATUS_DATATYPE_MISALIGNMENT_ERROR"}, + {STATUS_WMI_READ_ONLY, -EROFS, "STATUS_WMI_READ_ONLY"}, + {STATUS_WMI_SET_FAILURE, -EIO, "STATUS_WMI_SET_FAILURE"}, + {STATUS_COMMITMENT_MINIMUM, -EIO, "STATUS_COMMITMENT_MINIMUM"}, + {STATUS_REG_NAT_CONSUMPTION, -EIO, "STATUS_REG_NAT_CONSUMPTION"}, + {STATUS_TRANSPORT_FULL, -EIO, "STATUS_TRANSPORT_FULL"}, + {STATUS_DS_SAM_INIT_FAILURE, -EIO, "STATUS_DS_SAM_INIT_FAILURE"}, + {STATUS_ONLY_IF_CONNECTED, -EIO, "STATUS_ONLY_IF_CONNECTED"}, + {STATUS_DS_SENSITIVE_GROUP_VIOLATION, -EIO, + "STATUS_DS_SENSITIVE_GROUP_VIOLATION"}, + {STATUS_PNP_RESTART_ENUMERATION, -EIO, + "STATUS_PNP_RESTART_ENUMERATION"}, + {STATUS_JOURNAL_ENTRY_DELETED, -EIO, "STATUS_JOURNAL_ENTRY_DELETED"}, + {STATUS_DS_CANT_MOD_PRIMARYGROUPID, -EIO, + "STATUS_DS_CANT_MOD_PRIMARYGROUPID"}, + {STATUS_SYSTEM_IMAGE_BAD_SIGNATURE, -EIO, + "STATUS_SYSTEM_IMAGE_BAD_SIGNATURE"}, + {STATUS_PNP_REBOOT_REQUIRED, -EIO, "STATUS_PNP_REBOOT_REQUIRED"}, + {STATUS_POWER_STATE_INVALID, -EIO, "STATUS_POWER_STATE_INVALID"}, + {STATUS_DS_INVALID_GROUP_TYPE, -EIO, "STATUS_DS_INVALID_GROUP_TYPE"}, + {STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN, -EIO, + "STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN"}, + {STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN, -EIO, + "STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN"}, + {STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER, -EIO, + "STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER"}, + {STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER, -EIO, + "STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER"}, + {STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER, -EIO, + "STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER"}, + {STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER, -EIO, + "STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER"}, + {STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER, -EIO, + "STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER"}, + {STATUS_DS_HAVE_PRIMARY_MEMBERS, -EIO, + "STATUS_DS_HAVE_PRIMARY_MEMBERS"}, + {STATUS_WMI_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_WMI_NOT_SUPPORTED"}, + {STATUS_INSUFFICIENT_POWER, -EIO, "STATUS_INSUFFICIENT_POWER"}, + {STATUS_SAM_NEED_BOOTKEY_PASSWORD, -EIO, + "STATUS_SAM_NEED_BOOTKEY_PASSWORD"}, + {STATUS_SAM_NEED_BOOTKEY_FLOPPY, -EIO, + "STATUS_SAM_NEED_BOOTKEY_FLOPPY"}, + {STATUS_DS_CANT_START, -EIO, "STATUS_DS_CANT_START"}, + {STATUS_DS_INIT_FAILURE, -EIO, "STATUS_DS_INIT_FAILURE"}, + {STATUS_SAM_INIT_FAILURE, -EIO, "STATUS_SAM_INIT_FAILURE"}, + {STATUS_DS_GC_REQUIRED, -EIO, "STATUS_DS_GC_REQUIRED"}, + {STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY, -EIO, + "STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY"}, + {STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS, -EIO, + "STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS"}, + {STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED, -EDQUOT, + "STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED"}, + {STATUS_MULTIPLE_FAULT_VIOLATION, -EIO, + "STATUS_MULTIPLE_FAULT_VIOLATION"}, + {STATUS_CURRENT_DOMAIN_NOT_ALLOWED, -EIO, + "STATUS_CURRENT_DOMAIN_NOT_ALLOWED"}, + {STATUS_CANNOT_MAKE, -EIO, "STATUS_CANNOT_MAKE"}, + {STATUS_SYSTEM_SHUTDOWN, -EIO, "STATUS_SYSTEM_SHUTDOWN"}, + {STATUS_DS_INIT_FAILURE_CONSOLE, -EIO, + "STATUS_DS_INIT_FAILURE_CONSOLE"}, + {STATUS_DS_SAM_INIT_FAILURE_CONSOLE, -EIO, + "STATUS_DS_SAM_INIT_FAILURE_CONSOLE"}, + {STATUS_UNFINISHED_CONTEXT_DELETED, -EIO, + "STATUS_UNFINISHED_CONTEXT_DELETED"}, + {STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"}, + {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"}, + {STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"}, + {STATUS_WRONG_CREDENTIAL_HANDLE, -EIO, + "STATUS_WRONG_CREDENTIAL_HANDLE"}, + {STATUS_CRYPTO_SYSTEM_INVALID, -EIO, "STATUS_CRYPTO_SYSTEM_INVALID"}, + {STATUS_MAX_REFERRALS_EXCEEDED, -EIO, "STATUS_MAX_REFERRALS_EXCEEDED"}, + {STATUS_MUST_BE_KDC, -EIO, "STATUS_MUST_BE_KDC"}, + {STATUS_STRONG_CRYPTO_NOT_SUPPORTED, -EIO, + "STATUS_STRONG_CRYPTO_NOT_SUPPORTED"}, + {STATUS_TOO_MANY_PRINCIPALS, -EIO, "STATUS_TOO_MANY_PRINCIPALS"}, + {STATUS_NO_PA_DATA, -EIO, "STATUS_NO_PA_DATA"}, + {STATUS_PKINIT_NAME_MISMATCH, -EIO, "STATUS_PKINIT_NAME_MISMATCH"}, + {STATUS_SMARTCARD_LOGON_REQUIRED, -EIO, + "STATUS_SMARTCARD_LOGON_REQUIRED"}, + {STATUS_KDC_INVALID_REQUEST, -EIO, "STATUS_KDC_INVALID_REQUEST"}, + {STATUS_KDC_UNABLE_TO_REFER, -EIO, "STATUS_KDC_UNABLE_TO_REFER"}, + {STATUS_KDC_UNKNOWN_ETYPE, -EIO, "STATUS_KDC_UNKNOWN_ETYPE"}, + {STATUS_SHUTDOWN_IN_PROGRESS, -EIO, "STATUS_SHUTDOWN_IN_PROGRESS"}, + {STATUS_SERVER_SHUTDOWN_IN_PROGRESS, -EIO, + "STATUS_SERVER_SHUTDOWN_IN_PROGRESS"}, + {STATUS_NOT_SUPPORTED_ON_SBS, -EOPNOTSUPP, + "STATUS_NOT_SUPPORTED_ON_SBS"}, + {STATUS_WMI_GUID_DISCONNECTED, -EIO, "STATUS_WMI_GUID_DISCONNECTED"}, + {STATUS_WMI_ALREADY_DISABLED, -EIO, "STATUS_WMI_ALREADY_DISABLED"}, + {STATUS_WMI_ALREADY_ENABLED, -EIO, "STATUS_WMI_ALREADY_ENABLED"}, + {STATUS_MFT_TOO_FRAGMENTED, -EIO, "STATUS_MFT_TOO_FRAGMENTED"}, + {STATUS_COPY_PROTECTION_FAILURE, -EIO, + "STATUS_COPY_PROTECTION_FAILURE"}, + {STATUS_CSS_AUTHENTICATION_FAILURE, -EIO, + "STATUS_CSS_AUTHENTICATION_FAILURE"}, + {STATUS_CSS_KEY_NOT_PRESENT, -EIO, "STATUS_CSS_KEY_NOT_PRESENT"}, + {STATUS_CSS_KEY_NOT_ESTABLISHED, -EIO, + "STATUS_CSS_KEY_NOT_ESTABLISHED"}, + {STATUS_CSS_SCRAMBLED_SECTOR, -EIO, "STATUS_CSS_SCRAMBLED_SECTOR"}, + {STATUS_CSS_REGION_MISMATCH, -EIO, "STATUS_CSS_REGION_MISMATCH"}, + {STATUS_CSS_RESETS_EXHAUSTED, -EIO, "STATUS_CSS_RESETS_EXHAUSTED"}, + {STATUS_PKINIT_FAILURE, -EIO, "STATUS_PKINIT_FAILURE"}, + {STATUS_SMARTCARD_SUBSYSTEM_FAILURE, -EIO, + "STATUS_SMARTCARD_SUBSYSTEM_FAILURE"}, + {STATUS_NO_KERB_KEY, -EIO, "STATUS_NO_KERB_KEY"}, + {STATUS_HOST_DOWN, -EIO, "STATUS_HOST_DOWN"}, + {STATUS_UNSUPPORTED_PREAUTH, -EIO, "STATUS_UNSUPPORTED_PREAUTH"}, + {STATUS_EFS_ALG_BLOB_TOO_BIG, -EIO, "STATUS_EFS_ALG_BLOB_TOO_BIG"}, + {STATUS_PORT_NOT_SET, -EIO, "STATUS_PORT_NOT_SET"}, + {STATUS_DEBUGGER_INACTIVE, -EIO, "STATUS_DEBUGGER_INACTIVE"}, + {STATUS_DS_VERSION_CHECK_FAILURE, -EIO, + "STATUS_DS_VERSION_CHECK_FAILURE"}, + {STATUS_AUDITING_DISABLED, -EIO, "STATUS_AUDITING_DISABLED"}, + {STATUS_PRENT4_MACHINE_ACCOUNT, -EIO, "STATUS_PRENT4_MACHINE_ACCOUNT"}, + {STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER, -EIO, + "STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER"}, + {STATUS_INVALID_IMAGE_WIN_32, -EIO, "STATUS_INVALID_IMAGE_WIN_32"}, + {STATUS_INVALID_IMAGE_WIN_64, -EIO, "STATUS_INVALID_IMAGE_WIN_64"}, + {STATUS_BAD_BINDINGS, -EIO, "STATUS_BAD_BINDINGS"}, + {STATUS_NETWORK_SESSION_EXPIRED, -EIO, + "STATUS_NETWORK_SESSION_EXPIRED"}, + {STATUS_APPHELP_BLOCK, -EIO, "STATUS_APPHELP_BLOCK"}, + {STATUS_ALL_SIDS_FILTERED, -EIO, "STATUS_ALL_SIDS_FILTERED"}, + {STATUS_NOT_SAFE_MODE_DRIVER, -EIO, "STATUS_NOT_SAFE_MODE_DRIVER"}, + {STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT, -EACCES, + "STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT"}, + {STATUS_ACCESS_DISABLED_BY_POLICY_PATH, -EACCES, + "STATUS_ACCESS_DISABLED_BY_POLICY_PATH"}, + {STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER, -EACCES, + "STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER"}, + {STATUS_ACCESS_DISABLED_BY_POLICY_OTHER, -EACCES, + "STATUS_ACCESS_DISABLED_BY_POLICY_OTHER"}, + {STATUS_FAILED_DRIVER_ENTRY, -EIO, "STATUS_FAILED_DRIVER_ENTRY"}, + {STATUS_DEVICE_ENUMERATION_ERROR, -EIO, + "STATUS_DEVICE_ENUMERATION_ERROR"}, + {STATUS_MOUNT_POINT_NOT_RESOLVED, -EIO, + "STATUS_MOUNT_POINT_NOT_RESOLVED"}, + {STATUS_INVALID_DEVICE_OBJECT_PARAMETER, -EIO, + "STATUS_INVALID_DEVICE_OBJECT_PARAMETER"}, + {STATUS_MCA_OCCURED, -EIO, "STATUS_MCA_OCCURED"}, + {STATUS_DRIVER_BLOCKED_CRITICAL, -EIO, + "STATUS_DRIVER_BLOCKED_CRITICAL"}, + {STATUS_DRIVER_BLOCKED, -EIO, "STATUS_DRIVER_BLOCKED"}, + {STATUS_DRIVER_DATABASE_ERROR, -EIO, "STATUS_DRIVER_DATABASE_ERROR"}, + {STATUS_SYSTEM_HIVE_TOO_LARGE, -EIO, "STATUS_SYSTEM_HIVE_TOO_LARGE"}, + {STATUS_INVALID_IMPORT_OF_NON_DLL, -EIO, + "STATUS_INVALID_IMPORT_OF_NON_DLL"}, + {STATUS_NO_SECRETS, -EIO, "STATUS_NO_SECRETS"}, + {STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY, -EACCES, + "STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY"}, + {STATUS_FAILED_STACK_SWITCH, -EIO, "STATUS_FAILED_STACK_SWITCH"}, + {STATUS_HEAP_CORRUPTION, -EIO, "STATUS_HEAP_CORRUPTION"}, + {STATUS_SMARTCARD_WRONG_PIN, -EIO, "STATUS_SMARTCARD_WRONG_PIN"}, + {STATUS_SMARTCARD_CARD_BLOCKED, -EIO, "STATUS_SMARTCARD_CARD_BLOCKED"}, + {STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED, -EIO, + "STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED"}, + {STATUS_SMARTCARD_NO_CARD, -EIO, "STATUS_SMARTCARD_NO_CARD"}, + {STATUS_SMARTCARD_NO_KEY_CONTAINER, -EIO, + "STATUS_SMARTCARD_NO_KEY_CONTAINER"}, + {STATUS_SMARTCARD_NO_CERTIFICATE, -EIO, + "STATUS_SMARTCARD_NO_CERTIFICATE"}, + {STATUS_SMARTCARD_NO_KEYSET, -EIO, "STATUS_SMARTCARD_NO_KEYSET"}, + {STATUS_SMARTCARD_IO_ERROR, -EIO, "STATUS_SMARTCARD_IO_ERROR"}, + {STATUS_DOWNGRADE_DETECTED, -EIO, "STATUS_DOWNGRADE_DETECTED"}, + {STATUS_SMARTCARD_CERT_REVOKED, -EIO, "STATUS_SMARTCARD_CERT_REVOKED"}, + {STATUS_ISSUING_CA_UNTRUSTED, -EIO, "STATUS_ISSUING_CA_UNTRUSTED"}, + {STATUS_REVOCATION_OFFLINE_C, -EIO, "STATUS_REVOCATION_OFFLINE_C"}, + {STATUS_PKINIT_CLIENT_FAILURE, -EIO, "STATUS_PKINIT_CLIENT_FAILURE"}, + {STATUS_SMARTCARD_CERT_EXPIRED, -EIO, "STATUS_SMARTCARD_CERT_EXPIRED"}, + {STATUS_DRIVER_FAILED_PRIOR_UNLOAD, -EIO, + "STATUS_DRIVER_FAILED_PRIOR_UNLOAD"}, + {STATUS_SMARTCARD_SILENT_CONTEXT, -EIO, + "STATUS_SMARTCARD_SILENT_CONTEXT"}, + {STATUS_PER_USER_TRUST_QUOTA_EXCEEDED, -EDQUOT, + "STATUS_PER_USER_TRUST_QUOTA_EXCEEDED"}, + {STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED, -EDQUOT, + "STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED"}, + {STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED, -EDQUOT, + "STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED"}, + {STATUS_DS_NAME_NOT_UNIQUE, -EIO, "STATUS_DS_NAME_NOT_UNIQUE"}, + {STATUS_DS_DUPLICATE_ID_FOUND, -EIO, "STATUS_DS_DUPLICATE_ID_FOUND"}, + {STATUS_DS_GROUP_CONVERSION_ERROR, -EIO, + "STATUS_DS_GROUP_CONVERSION_ERROR"}, + {STATUS_VOLSNAP_PREPARE_HIBERNATE, -EIO, + "STATUS_VOLSNAP_PREPARE_HIBERNATE"}, + {STATUS_USER2USER_REQUIRED, -EIO, "STATUS_USER2USER_REQUIRED"}, + {STATUS_STACK_BUFFER_OVERRUN, -EIO, "STATUS_STACK_BUFFER_OVERRUN"}, + {STATUS_NO_S4U_PROT_SUPPORT, -EIO, "STATUS_NO_S4U_PROT_SUPPORT"}, + {STATUS_CROSSREALM_DELEGATION_FAILURE, -EIO, + "STATUS_CROSSREALM_DELEGATION_FAILURE"}, + {STATUS_REVOCATION_OFFLINE_KDC, -EIO, "STATUS_REVOCATION_OFFLINE_KDC"}, + {STATUS_ISSUING_CA_UNTRUSTED_KDC, -EIO, + "STATUS_ISSUING_CA_UNTRUSTED_KDC"}, + {STATUS_KDC_CERT_EXPIRED, -EIO, "STATUS_KDC_CERT_EXPIRED"}, + {STATUS_KDC_CERT_REVOKED, -EIO, "STATUS_KDC_CERT_REVOKED"}, + {STATUS_PARAMETER_QUOTA_EXCEEDED, -EDQUOT, + "STATUS_PARAMETER_QUOTA_EXCEEDED"}, + {STATUS_HIBERNATION_FAILURE, -EIO, "STATUS_HIBERNATION_FAILURE"}, + {STATUS_DELAY_LOAD_FAILED, -EIO, "STATUS_DELAY_LOAD_FAILED"}, + {STATUS_AUTHENTICATION_FIREWALL_FAILED, -EIO, + "STATUS_AUTHENTICATION_FIREWALL_FAILED"}, + {STATUS_VDM_DISALLOWED, -EIO, "STATUS_VDM_DISALLOWED"}, + {STATUS_HUNG_DISPLAY_DRIVER_THREAD, -EIO, + "STATUS_HUNG_DISPLAY_DRIVER_THREAD"}, + {STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE, -EIO, + "STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE"}, + {STATUS_INVALID_CRUNTIME_PARAMETER, -EIO, + "STATUS_INVALID_CRUNTIME_PARAMETER"}, + {STATUS_NTLM_BLOCKED, -EIO, "STATUS_NTLM_BLOCKED"}, + {STATUS_ASSERTION_FAILURE, -EIO, "STATUS_ASSERTION_FAILURE"}, + {STATUS_VERIFIER_STOP, -EIO, "STATUS_VERIFIER_STOP"}, + {STATUS_CALLBACK_POP_STACK, -EIO, "STATUS_CALLBACK_POP_STACK"}, + {STATUS_INCOMPATIBLE_DRIVER_BLOCKED, -EIO, + "STATUS_INCOMPATIBLE_DRIVER_BLOCKED"}, + {STATUS_HIVE_UNLOADED, -EIO, "STATUS_HIVE_UNLOADED"}, + {STATUS_COMPRESSION_DISABLED, -EIO, "STATUS_COMPRESSION_DISABLED"}, + {STATUS_FILE_SYSTEM_LIMITATION, -EIO, "STATUS_FILE_SYSTEM_LIMITATION"}, + {STATUS_INVALID_IMAGE_HASH, -EIO, "STATUS_INVALID_IMAGE_HASH"}, + {STATUS_NOT_CAPABLE, -EIO, "STATUS_NOT_CAPABLE"}, + {STATUS_REQUEST_OUT_OF_SEQUENCE, -EIO, + "STATUS_REQUEST_OUT_OF_SEQUENCE"}, + {STATUS_IMPLEMENTATION_LIMIT, -EIO, "STATUS_IMPLEMENTATION_LIMIT"}, + {STATUS_ELEVATION_REQUIRED, -EIO, "STATUS_ELEVATION_REQUIRED"}, + {STATUS_BEYOND_VDL, -EIO, "STATUS_BEYOND_VDL"}, + {STATUS_ENCOUNTERED_WRITE_IN_PROGRESS, -EIO, + "STATUS_ENCOUNTERED_WRITE_IN_PROGRESS"}, + {STATUS_PTE_CHANGED, -EIO, "STATUS_PTE_CHANGED"}, + {STATUS_PURGE_FAILED, -EIO, "STATUS_PURGE_FAILED"}, + {STATUS_CRED_REQUIRES_CONFIRMATION, -EIO, + "STATUS_CRED_REQUIRES_CONFIRMATION"}, + {STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE, -EIO, + "STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE"}, + {STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER, -EIO, + "STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER"}, + {STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE, -EIO, + "STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE"}, + {STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE, -EIO, + "STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE"}, + {STATUS_CS_ENCRYPTION_FILE_NOT_CSE, -EIO, + "STATUS_CS_ENCRYPTION_FILE_NOT_CSE"}, + {STATUS_INVALID_LABEL, -EIO, "STATUS_INVALID_LABEL"}, + {STATUS_DRIVER_PROCESS_TERMINATED, -EIO, + "STATUS_DRIVER_PROCESS_TERMINATED"}, + {STATUS_AMBIGUOUS_SYSTEM_DEVICE, -EIO, + "STATUS_AMBIGUOUS_SYSTEM_DEVICE"}, + {STATUS_SYSTEM_DEVICE_NOT_FOUND, -EIO, + "STATUS_SYSTEM_DEVICE_NOT_FOUND"}, + {STATUS_RESTART_BOOT_APPLICATION, -EIO, + "STATUS_RESTART_BOOT_APPLICATION"}, + {STATUS_INVALID_TASK_NAME, -EIO, "STATUS_INVALID_TASK_NAME"}, + {STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"}, + {STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"}, + {STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"}, + {STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"}, + {STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"}, + {STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"}, + {STATUS_REQUEST_CANCELED, -EIO, "STATUS_REQUEST_CANCELED"}, + {STATUS_RECURSIVE_DISPATCH, -EIO, "STATUS_RECURSIVE_DISPATCH"}, + {STATUS_LPC_RECEIVE_BUFFER_EXPECTED, -EIO, + "STATUS_LPC_RECEIVE_BUFFER_EXPECTED"}, + {STATUS_LPC_INVALID_CONNECTION_USAGE, -EIO, + "STATUS_LPC_INVALID_CONNECTION_USAGE"}, + {STATUS_LPC_REQUESTS_NOT_ALLOWED, -EIO, + "STATUS_LPC_REQUESTS_NOT_ALLOWED"}, + {STATUS_RESOURCE_IN_USE, -EIO, "STATUS_RESOURCE_IN_USE"}, + {STATUS_HARDWARE_MEMORY_ERROR, -EIO, "STATUS_HARDWARE_MEMORY_ERROR"}, + {STATUS_THREADPOOL_HANDLE_EXCEPTION, -EIO, + "STATUS_THREADPOOL_HANDLE_EXCEPTION"}, + {STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED, -EIO, + "STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED"}, + {STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED, -EIO, + "STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED"}, + {STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED, -EIO, + "STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED"}, + {STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED, -EIO, + "STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED"}, + {STATUS_THREADPOOL_RELEASED_DURING_OPERATION, -EIO, + "STATUS_THREADPOOL_RELEASED_DURING_OPERATION"}, + {STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING, -EIO, + "STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING"}, + {STATUS_APC_RETURNED_WHILE_IMPERSONATING, -EIO, + "STATUS_APC_RETURNED_WHILE_IMPERSONATING"}, + {STATUS_PROCESS_IS_PROTECTED, -EIO, "STATUS_PROCESS_IS_PROTECTED"}, + {STATUS_MCA_EXCEPTION, -EIO, "STATUS_MCA_EXCEPTION"}, + {STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE, -EIO, + "STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE"}, + {STATUS_SYMLINK_CLASS_DISABLED, -EIO, "STATUS_SYMLINK_CLASS_DISABLED"}, + {STATUS_INVALID_IDN_NORMALIZATION, -EIO, + "STATUS_INVALID_IDN_NORMALIZATION"}, + {STATUS_NO_UNICODE_TRANSLATION, -EIO, "STATUS_NO_UNICODE_TRANSLATION"}, + {STATUS_ALREADY_REGISTERED, -EIO, "STATUS_ALREADY_REGISTERED"}, + {STATUS_CONTEXT_MISMATCH, -EIO, "STATUS_CONTEXT_MISMATCH"}, + {STATUS_PORT_ALREADY_HAS_COMPLETION_LIST, -EIO, + "STATUS_PORT_ALREADY_HAS_COMPLETION_LIST"}, + {STATUS_CALLBACK_RETURNED_THREAD_PRIORITY, -EIO, + "STATUS_CALLBACK_RETURNED_THREAD_PRIORITY"}, + {STATUS_INVALID_THREAD, -EIO, "STATUS_INVALID_THREAD"}, + {STATUS_CALLBACK_RETURNED_TRANSACTION, -EIO, + "STATUS_CALLBACK_RETURNED_TRANSACTION"}, + {STATUS_CALLBACK_RETURNED_LDR_LOCK, -EIO, + "STATUS_CALLBACK_RETURNED_LDR_LOCK"}, + {STATUS_CALLBACK_RETURNED_LANG, -EIO, "STATUS_CALLBACK_RETURNED_LANG"}, + {STATUS_CALLBACK_RETURNED_PRI_BACK, -EIO, + "STATUS_CALLBACK_RETURNED_PRI_BACK"}, + {STATUS_CALLBACK_RETURNED_THREAD_AFFINITY, -EIO, + "STATUS_CALLBACK_RETURNED_THREAD_AFFINITY"}, + {STATUS_DISK_REPAIR_DISABLED, -EIO, "STATUS_DISK_REPAIR_DISABLED"}, + {STATUS_DS_DOMAIN_RENAME_IN_PROGRESS, -EIO, + "STATUS_DS_DOMAIN_RENAME_IN_PROGRESS"}, + {STATUS_DISK_QUOTA_EXCEEDED, -EDQUOT, "STATUS_DISK_QUOTA_EXCEEDED"}, + {STATUS_CONTENT_BLOCKED, -EIO, "STATUS_CONTENT_BLOCKED"}, + {STATUS_BAD_CLUSTERS, -EIO, "STATUS_BAD_CLUSTERS"}, + {STATUS_VOLUME_DIRTY, -EIO, "STATUS_VOLUME_DIRTY"}, + {STATUS_FILE_CHECKED_OUT, -EIO, "STATUS_FILE_CHECKED_OUT"}, + {STATUS_CHECKOUT_REQUIRED, -EIO, "STATUS_CHECKOUT_REQUIRED"}, + {STATUS_BAD_FILE_TYPE, -EIO, "STATUS_BAD_FILE_TYPE"}, + {STATUS_FILE_TOO_LARGE, -EIO, "STATUS_FILE_TOO_LARGE"}, + {STATUS_FORMS_AUTH_REQUIRED, -EIO, "STATUS_FORMS_AUTH_REQUIRED"}, + {STATUS_VIRUS_INFECTED, -EIO, "STATUS_VIRUS_INFECTED"}, + {STATUS_VIRUS_DELETED, -EIO, "STATUS_VIRUS_DELETED"}, + {STATUS_BAD_MCFG_TABLE, -EIO, "STATUS_BAD_MCFG_TABLE"}, + {STATUS_WOW_ASSERTION, -EIO, "STATUS_WOW_ASSERTION"}, + {STATUS_INVALID_SIGNATURE, -EIO, "STATUS_INVALID_SIGNATURE"}, + {STATUS_HMAC_NOT_SUPPORTED, -EIO, "STATUS_HMAC_NOT_SUPPORTED"}, + {STATUS_IPSEC_QUEUE_OVERFLOW, -EIO, "STATUS_IPSEC_QUEUE_OVERFLOW"}, + {STATUS_ND_QUEUE_OVERFLOW, -EIO, "STATUS_ND_QUEUE_OVERFLOW"}, + {STATUS_HOPLIMIT_EXCEEDED, -EIO, "STATUS_HOPLIMIT_EXCEEDED"}, + {STATUS_PROTOCOL_NOT_SUPPORTED, -EOPNOTSUPP, + "STATUS_PROTOCOL_NOT_SUPPORTED"}, + {STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED, -EIO, + "STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED"}, + {STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR, -EIO, + "STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR"}, + {STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR, -EIO, + "STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR"}, + {STATUS_XML_PARSE_ERROR, -EIO, "STATUS_XML_PARSE_ERROR"}, + {STATUS_XMLDSIG_ERROR, -EIO, "STATUS_XMLDSIG_ERROR"}, + {STATUS_WRONG_COMPARTMENT, -EIO, "STATUS_WRONG_COMPARTMENT"}, + {STATUS_AUTHIP_FAILURE, -EIO, "STATUS_AUTHIP_FAILURE"}, + {DBG_NO_STATE_CHANGE, -EIO, "DBG_NO_STATE_CHANGE"}, + {DBG_APP_NOT_IDLE, -EIO, "DBG_APP_NOT_IDLE"}, + {RPC_NT_INVALID_STRING_BINDING, -EIO, "RPC_NT_INVALID_STRING_BINDING"}, + {RPC_NT_WRONG_KIND_OF_BINDING, -EIO, "RPC_NT_WRONG_KIND_OF_BINDING"}, + {RPC_NT_INVALID_BINDING, -EIO, "RPC_NT_INVALID_BINDING"}, + {RPC_NT_PROTSEQ_NOT_SUPPORTED, -EOPNOTSUPP, + "RPC_NT_PROTSEQ_NOT_SUPPORTED"}, + {RPC_NT_INVALID_RPC_PROTSEQ, -EIO, "RPC_NT_INVALID_RPC_PROTSEQ"}, + {RPC_NT_INVALID_STRING_UUID, -EIO, "RPC_NT_INVALID_STRING_UUID"}, + {RPC_NT_INVALID_ENDPOINT_FORMAT, -EIO, + "RPC_NT_INVALID_ENDPOINT_FORMAT"}, + {RPC_NT_INVALID_NET_ADDR, -EIO, "RPC_NT_INVALID_NET_ADDR"}, + {RPC_NT_NO_ENDPOINT_FOUND, -EIO, "RPC_NT_NO_ENDPOINT_FOUND"}, + {RPC_NT_INVALID_TIMEOUT, -EINVAL, "RPC_NT_INVALID_TIMEOUT"}, + {RPC_NT_OBJECT_NOT_FOUND, -ENOENT, "RPC_NT_OBJECT_NOT_FOUND"}, + {RPC_NT_ALREADY_REGISTERED, -EIO, "RPC_NT_ALREADY_REGISTERED"}, + {RPC_NT_TYPE_ALREADY_REGISTERED, -EIO, + "RPC_NT_TYPE_ALREADY_REGISTERED"}, + {RPC_NT_ALREADY_LISTENING, -EIO, "RPC_NT_ALREADY_LISTENING"}, + {RPC_NT_NO_PROTSEQS_REGISTERED, -EIO, "RPC_NT_NO_PROTSEQS_REGISTERED"}, + {RPC_NT_NOT_LISTENING, -EIO, "RPC_NT_NOT_LISTENING"}, + {RPC_NT_UNKNOWN_MGR_TYPE, -EIO, "RPC_NT_UNKNOWN_MGR_TYPE"}, + {RPC_NT_UNKNOWN_IF, -EIO, "RPC_NT_UNKNOWN_IF"}, + {RPC_NT_NO_BINDINGS, -EIO, "RPC_NT_NO_BINDINGS"}, + {RPC_NT_NO_PROTSEQS, -EIO, "RPC_NT_NO_PROTSEQS"}, + {RPC_NT_CANT_CREATE_ENDPOINT, -EIO, "RPC_NT_CANT_CREATE_ENDPOINT"}, + {RPC_NT_OUT_OF_RESOURCES, -EIO, "RPC_NT_OUT_OF_RESOURCES"}, + {RPC_NT_SERVER_UNAVAILABLE, -EIO, "RPC_NT_SERVER_UNAVAILABLE"}, + {RPC_NT_SERVER_TOO_BUSY, -EBUSY, "RPC_NT_SERVER_TOO_BUSY"}, + {RPC_NT_INVALID_NETWORK_OPTIONS, -EIO, + "RPC_NT_INVALID_NETWORK_OPTIONS"}, + {RPC_NT_NO_CALL_ACTIVE, -EIO, "RPC_NT_NO_CALL_ACTIVE"}, + {RPC_NT_CALL_FAILED, -EIO, "RPC_NT_CALL_FAILED"}, + {RPC_NT_CALL_FAILED_DNE, -EIO, "RPC_NT_CALL_FAILED_DNE"}, + {RPC_NT_PROTOCOL_ERROR, -EIO, "RPC_NT_PROTOCOL_ERROR"}, + {RPC_NT_UNSUPPORTED_TRANS_SYN, -EIO, "RPC_NT_UNSUPPORTED_TRANS_SYN"}, + {RPC_NT_UNSUPPORTED_TYPE, -EIO, "RPC_NT_UNSUPPORTED_TYPE"}, + {RPC_NT_INVALID_TAG, -EIO, "RPC_NT_INVALID_TAG"}, + {RPC_NT_INVALID_BOUND, -EIO, "RPC_NT_INVALID_BOUND"}, + {RPC_NT_NO_ENTRY_NAME, -EIO, "RPC_NT_NO_ENTRY_NAME"}, + {RPC_NT_INVALID_NAME_SYNTAX, -EIO, "RPC_NT_INVALID_NAME_SYNTAX"}, + {RPC_NT_UNSUPPORTED_NAME_SYNTAX, -EIO, + "RPC_NT_UNSUPPORTED_NAME_SYNTAX"}, + {RPC_NT_UUID_NO_ADDRESS, -EIO, "RPC_NT_UUID_NO_ADDRESS"}, + {RPC_NT_DUPLICATE_ENDPOINT, -ENOTUNIQ, "RPC_NT_DUPLICATE_ENDPOINT"}, + {RPC_NT_UNKNOWN_AUTHN_TYPE, -EIO, "RPC_NT_UNKNOWN_AUTHN_TYPE"}, + {RPC_NT_MAX_CALLS_TOO_SMALL, -EIO, "RPC_NT_MAX_CALLS_TOO_SMALL"}, + {RPC_NT_STRING_TOO_LONG, -EIO, "RPC_NT_STRING_TOO_LONG"}, + {RPC_NT_PROTSEQ_NOT_FOUND, -EIO, "RPC_NT_PROTSEQ_NOT_FOUND"}, + {RPC_NT_PROCNUM_OUT_OF_RANGE, -EIO, "RPC_NT_PROCNUM_OUT_OF_RANGE"}, + {RPC_NT_BINDING_HAS_NO_AUTH, -EIO, "RPC_NT_BINDING_HAS_NO_AUTH"}, + {RPC_NT_UNKNOWN_AUTHN_SERVICE, -EIO, "RPC_NT_UNKNOWN_AUTHN_SERVICE"}, + {RPC_NT_UNKNOWN_AUTHN_LEVEL, -EIO, "RPC_NT_UNKNOWN_AUTHN_LEVEL"}, + {RPC_NT_INVALID_AUTH_IDENTITY, -EIO, "RPC_NT_INVALID_AUTH_IDENTITY"}, + {RPC_NT_UNKNOWN_AUTHZ_SERVICE, -EIO, "RPC_NT_UNKNOWN_AUTHZ_SERVICE"}, + {EPT_NT_INVALID_ENTRY, -EIO, "EPT_NT_INVALID_ENTRY"}, + {EPT_NT_CANT_PERFORM_OP, -EIO, "EPT_NT_CANT_PERFORM_OP"}, + {EPT_NT_NOT_REGISTERED, -EIO, "EPT_NT_NOT_REGISTERED"}, + {RPC_NT_NOTHING_TO_EXPORT, -EIO, "RPC_NT_NOTHING_TO_EXPORT"}, + {RPC_NT_INCOMPLETE_NAME, -EIO, "RPC_NT_INCOMPLETE_NAME"}, + {RPC_NT_INVALID_VERS_OPTION, -EIO, "RPC_NT_INVALID_VERS_OPTION"}, + {RPC_NT_NO_MORE_MEMBERS, -EIO, "RPC_NT_NO_MORE_MEMBERS"}, + {RPC_NT_NOT_ALL_OBJS_UNEXPORTED, -EIO, + "RPC_NT_NOT_ALL_OBJS_UNEXPORTED"}, + {RPC_NT_INTERFACE_NOT_FOUND, -EIO, "RPC_NT_INTERFACE_NOT_FOUND"}, + {RPC_NT_ENTRY_ALREADY_EXISTS, -EIO, "RPC_NT_ENTRY_ALREADY_EXISTS"}, + {RPC_NT_ENTRY_NOT_FOUND, -EIO, "RPC_NT_ENTRY_NOT_FOUND"}, + {RPC_NT_NAME_SERVICE_UNAVAILABLE, -EIO, + "RPC_NT_NAME_SERVICE_UNAVAILABLE"}, + {RPC_NT_INVALID_NAF_ID, -EIO, "RPC_NT_INVALID_NAF_ID"}, + {RPC_NT_CANNOT_SUPPORT, -EOPNOTSUPP, "RPC_NT_CANNOT_SUPPORT"}, + {RPC_NT_NO_CONTEXT_AVAILABLE, -EIO, "RPC_NT_NO_CONTEXT_AVAILABLE"}, + {RPC_NT_INTERNAL_ERROR, -EIO, "RPC_NT_INTERNAL_ERROR"}, + {RPC_NT_ZERO_DIVIDE, -EIO, "RPC_NT_ZERO_DIVIDE"}, + {RPC_NT_ADDRESS_ERROR, -EIO, "RPC_NT_ADDRESS_ERROR"}, + {RPC_NT_FP_DIV_ZERO, -EIO, "RPC_NT_FP_DIV_ZERO"}, + {RPC_NT_FP_UNDERFLOW, -EIO, "RPC_NT_FP_UNDERFLOW"}, + {RPC_NT_FP_OVERFLOW, -EIO, "RPC_NT_FP_OVERFLOW"}, + {RPC_NT_CALL_IN_PROGRESS, -EIO, "RPC_NT_CALL_IN_PROGRESS"}, + {RPC_NT_NO_MORE_BINDINGS, -EIO, "RPC_NT_NO_MORE_BINDINGS"}, + {RPC_NT_GROUP_MEMBER_NOT_FOUND, -EIO, "RPC_NT_GROUP_MEMBER_NOT_FOUND"}, + {EPT_NT_CANT_CREATE, -EIO, "EPT_NT_CANT_CREATE"}, + {RPC_NT_INVALID_OBJECT, -EIO, "RPC_NT_INVALID_OBJECT"}, + {RPC_NT_NO_INTERFACES, -EIO, "RPC_NT_NO_INTERFACES"}, + {RPC_NT_CALL_CANCELLED, -EIO, "RPC_NT_CALL_CANCELLED"}, + {RPC_NT_BINDING_INCOMPLETE, -EIO, "RPC_NT_BINDING_INCOMPLETE"}, + {RPC_NT_COMM_FAILURE, -EIO, "RPC_NT_COMM_FAILURE"}, + {RPC_NT_UNSUPPORTED_AUTHN_LEVEL, -EIO, + "RPC_NT_UNSUPPORTED_AUTHN_LEVEL"}, + {RPC_NT_NO_PRINC_NAME, -EIO, "RPC_NT_NO_PRINC_NAME"}, + {RPC_NT_NOT_RPC_ERROR, -EIO, "RPC_NT_NOT_RPC_ERROR"}, + {RPC_NT_SEC_PKG_ERROR, -EIO, "RPC_NT_SEC_PKG_ERROR"}, + {RPC_NT_NOT_CANCELLED, -EIO, "RPC_NT_NOT_CANCELLED"}, + {RPC_NT_INVALID_ASYNC_HANDLE, -EIO, "RPC_NT_INVALID_ASYNC_HANDLE"}, + {RPC_NT_INVALID_ASYNC_CALL, -EIO, "RPC_NT_INVALID_ASYNC_CALL"}, + {RPC_NT_PROXY_ACCESS_DENIED, -EACCES, "RPC_NT_PROXY_ACCESS_DENIED"}, + {RPC_NT_NO_MORE_ENTRIES, -EIO, "RPC_NT_NO_MORE_ENTRIES"}, + {RPC_NT_SS_CHAR_TRANS_OPEN_FAIL, -EIO, + "RPC_NT_SS_CHAR_TRANS_OPEN_FAIL"}, + {RPC_NT_SS_CHAR_TRANS_SHORT_FILE, -EIO, + "RPC_NT_SS_CHAR_TRANS_SHORT_FILE"}, + {RPC_NT_SS_IN_NULL_CONTEXT, -EIO, "RPC_NT_SS_IN_NULL_CONTEXT"}, + {RPC_NT_SS_CONTEXT_MISMATCH, -EIO, "RPC_NT_SS_CONTEXT_MISMATCH"}, + {RPC_NT_SS_CONTEXT_DAMAGED, -EIO, "RPC_NT_SS_CONTEXT_DAMAGED"}, + {RPC_NT_SS_HANDLES_MISMATCH, -EIO, "RPC_NT_SS_HANDLES_MISMATCH"}, + {RPC_NT_SS_CANNOT_GET_CALL_HANDLE, -EIO, + "RPC_NT_SS_CANNOT_GET_CALL_HANDLE"}, + {RPC_NT_NULL_REF_POINTER, -EIO, "RPC_NT_NULL_REF_POINTER"}, + {RPC_NT_ENUM_VALUE_OUT_OF_RANGE, -EIO, + "RPC_NT_ENUM_VALUE_OUT_OF_RANGE"}, + {RPC_NT_BYTE_COUNT_TOO_SMALL, -EIO, "RPC_NT_BYTE_COUNT_TOO_SMALL"}, + {RPC_NT_BAD_STUB_DATA, -EIO, "RPC_NT_BAD_STUB_DATA"}, + {RPC_NT_INVALID_ES_ACTION, -EIO, "RPC_NT_INVALID_ES_ACTION"}, + {RPC_NT_WRONG_ES_VERSION, -EIO, "RPC_NT_WRONG_ES_VERSION"}, + {RPC_NT_WRONG_STUB_VERSION, -EIO, "RPC_NT_WRONG_STUB_VERSION"}, + {RPC_NT_INVALID_PIPE_OBJECT, -EIO, "RPC_NT_INVALID_PIPE_OBJECT"}, + {RPC_NT_INVALID_PIPE_OPERATION, -EIO, "RPC_NT_INVALID_PIPE_OPERATION"}, + {RPC_NT_WRONG_PIPE_VERSION, -EIO, "RPC_NT_WRONG_PIPE_VERSION"}, + {RPC_NT_PIPE_CLOSED, -EIO, "RPC_NT_PIPE_CLOSED"}, + {RPC_NT_PIPE_DISCIPLINE_ERROR, -EIO, "RPC_NT_PIPE_DISCIPLINE_ERROR"}, + {RPC_NT_PIPE_EMPTY, -EIO, "RPC_NT_PIPE_EMPTY"}, + {STATUS_PNP_BAD_MPS_TABLE, -EIO, "STATUS_PNP_BAD_MPS_TABLE"}, + {STATUS_PNP_TRANSLATION_FAILED, -EIO, "STATUS_PNP_TRANSLATION_FAILED"}, + {STATUS_PNP_IRQ_TRANSLATION_FAILED, -EIO, + "STATUS_PNP_IRQ_TRANSLATION_FAILED"}, + {STATUS_PNP_INVALID_ID, -EIO, "STATUS_PNP_INVALID_ID"}, + {STATUS_IO_REISSUE_AS_CACHED, -EIO, "STATUS_IO_REISSUE_AS_CACHED"}, + {STATUS_CTX_WINSTATION_NAME_INVALID, -EIO, + "STATUS_CTX_WINSTATION_NAME_INVALID"}, + {STATUS_CTX_INVALID_PD, -EIO, "STATUS_CTX_INVALID_PD"}, + {STATUS_CTX_PD_NOT_FOUND, -EIO, "STATUS_CTX_PD_NOT_FOUND"}, + {STATUS_CTX_CLOSE_PENDING, -EIO, "STATUS_CTX_CLOSE_PENDING"}, + {STATUS_CTX_NO_OUTBUF, -EIO, "STATUS_CTX_NO_OUTBUF"}, + {STATUS_CTX_MODEM_INF_NOT_FOUND, -EIO, + "STATUS_CTX_MODEM_INF_NOT_FOUND"}, + {STATUS_CTX_INVALID_MODEMNAME, -EIO, "STATUS_CTX_INVALID_MODEMNAME"}, + {STATUS_CTX_RESPONSE_ERROR, -EIO, "STATUS_CTX_RESPONSE_ERROR"}, + {STATUS_CTX_MODEM_RESPONSE_TIMEOUT, -ETIMEDOUT, + "STATUS_CTX_MODEM_RESPONSE_TIMEOUT"}, + {STATUS_CTX_MODEM_RESPONSE_NO_CARRIER, -EIO, + "STATUS_CTX_MODEM_RESPONSE_NO_CARRIER"}, + {STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE, -EIO, + "STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE"}, + {STATUS_CTX_MODEM_RESPONSE_BUSY, -EBUSY, + "STATUS_CTX_MODEM_RESPONSE_BUSY"}, + {STATUS_CTX_MODEM_RESPONSE_VOICE, -EIO, + "STATUS_CTX_MODEM_RESPONSE_VOICE"}, + {STATUS_CTX_TD_ERROR, -EIO, "STATUS_CTX_TD_ERROR"}, + {STATUS_CTX_LICENSE_CLIENT_INVALID, -EIO, + "STATUS_CTX_LICENSE_CLIENT_INVALID"}, + {STATUS_CTX_LICENSE_NOT_AVAILABLE, -EIO, + "STATUS_CTX_LICENSE_NOT_AVAILABLE"}, + {STATUS_CTX_LICENSE_EXPIRED, -EIO, "STATUS_CTX_LICENSE_EXPIRED"}, + {STATUS_CTX_WINSTATION_NOT_FOUND, -EIO, + "STATUS_CTX_WINSTATION_NOT_FOUND"}, + {STATUS_CTX_WINSTATION_NAME_COLLISION, -EIO, + "STATUS_CTX_WINSTATION_NAME_COLLISION"}, + {STATUS_CTX_WINSTATION_BUSY, -EBUSY, "STATUS_CTX_WINSTATION_BUSY"}, + {STATUS_CTX_BAD_VIDEO_MODE, -EIO, "STATUS_CTX_BAD_VIDEO_MODE"}, + {STATUS_CTX_GRAPHICS_INVALID, -EIO, "STATUS_CTX_GRAPHICS_INVALID"}, + {STATUS_CTX_NOT_CONSOLE, -EIO, "STATUS_CTX_NOT_CONSOLE"}, + {STATUS_CTX_CLIENT_QUERY_TIMEOUT, -EIO, + "STATUS_CTX_CLIENT_QUERY_TIMEOUT"}, + {STATUS_CTX_CONSOLE_DISCONNECT, -EIO, "STATUS_CTX_CONSOLE_DISCONNECT"}, + {STATUS_CTX_CONSOLE_CONNECT, -EIO, "STATUS_CTX_CONSOLE_CONNECT"}, + {STATUS_CTX_SHADOW_DENIED, -EIO, "STATUS_CTX_SHADOW_DENIED"}, + {STATUS_CTX_WINSTATION_ACCESS_DENIED, -EACCES, + "STATUS_CTX_WINSTATION_ACCESS_DENIED"}, + {STATUS_CTX_INVALID_WD, -EIO, "STATUS_CTX_INVALID_WD"}, + {STATUS_CTX_WD_NOT_FOUND, -EIO, "STATUS_CTX_WD_NOT_FOUND"}, + {STATUS_CTX_SHADOW_INVALID, -EIO, "STATUS_CTX_SHADOW_INVALID"}, + {STATUS_CTX_SHADOW_DISABLED, -EIO, "STATUS_CTX_SHADOW_DISABLED"}, + {STATUS_RDP_PROTOCOL_ERROR, -EIO, "STATUS_RDP_PROTOCOL_ERROR"}, + {STATUS_CTX_CLIENT_LICENSE_NOT_SET, -EIO, + "STATUS_CTX_CLIENT_LICENSE_NOT_SET"}, + {STATUS_CTX_CLIENT_LICENSE_IN_USE, -EIO, + "STATUS_CTX_CLIENT_LICENSE_IN_USE"}, + {STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE, -EIO, + "STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE"}, + {STATUS_CTX_SHADOW_NOT_RUNNING, -EIO, "STATUS_CTX_SHADOW_NOT_RUNNING"}, + {STATUS_CTX_LOGON_DISABLED, -EIO, "STATUS_CTX_LOGON_DISABLED"}, + {STATUS_CTX_SECURITY_LAYER_ERROR, -EIO, + "STATUS_CTX_SECURITY_LAYER_ERROR"}, + {STATUS_TS_INCOMPATIBLE_SESSIONS, -EIO, + "STATUS_TS_INCOMPATIBLE_SESSIONS"}, + {STATUS_MUI_FILE_NOT_FOUND, -EIO, "STATUS_MUI_FILE_NOT_FOUND"}, + {STATUS_MUI_INVALID_FILE, -EIO, "STATUS_MUI_INVALID_FILE"}, + {STATUS_MUI_INVALID_RC_CONFIG, -EIO, "STATUS_MUI_INVALID_RC_CONFIG"}, + {STATUS_MUI_INVALID_LOCALE_NAME, -EIO, + "STATUS_MUI_INVALID_LOCALE_NAME"}, + {STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME, -EIO, + "STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME"}, + {STATUS_MUI_FILE_NOT_LOADED, -EIO, "STATUS_MUI_FILE_NOT_LOADED"}, + {STATUS_RESOURCE_ENUM_USER_STOP, -EIO, + "STATUS_RESOURCE_ENUM_USER_STOP"}, + {STATUS_CLUSTER_INVALID_NODE, -EIO, "STATUS_CLUSTER_INVALID_NODE"}, + {STATUS_CLUSTER_NODE_EXISTS, -EIO, "STATUS_CLUSTER_NODE_EXISTS"}, + {STATUS_CLUSTER_JOIN_IN_PROGRESS, -EIO, + "STATUS_CLUSTER_JOIN_IN_PROGRESS"}, + {STATUS_CLUSTER_NODE_NOT_FOUND, -EIO, "STATUS_CLUSTER_NODE_NOT_FOUND"}, + {STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND, -EIO, + "STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND"}, + {STATUS_CLUSTER_NETWORK_EXISTS, -EIO, "STATUS_CLUSTER_NETWORK_EXISTS"}, + {STATUS_CLUSTER_NETWORK_NOT_FOUND, -EIO, + "STATUS_CLUSTER_NETWORK_NOT_FOUND"}, + {STATUS_CLUSTER_NETINTERFACE_EXISTS, -EIO, + "STATUS_CLUSTER_NETINTERFACE_EXISTS"}, + {STATUS_CLUSTER_NETINTERFACE_NOT_FOUND, -EIO, + "STATUS_CLUSTER_NETINTERFACE_NOT_FOUND"}, + {STATUS_CLUSTER_INVALID_REQUEST, -EIO, + "STATUS_CLUSTER_INVALID_REQUEST"}, + {STATUS_CLUSTER_INVALID_NETWORK_PROVIDER, -EIO, + "STATUS_CLUSTER_INVALID_NETWORK_PROVIDER"}, + {STATUS_CLUSTER_NODE_DOWN, -EIO, "STATUS_CLUSTER_NODE_DOWN"}, + {STATUS_CLUSTER_NODE_UNREACHABLE, -EIO, + "STATUS_CLUSTER_NODE_UNREACHABLE"}, + {STATUS_CLUSTER_NODE_NOT_MEMBER, -EIO, + "STATUS_CLUSTER_NODE_NOT_MEMBER"}, + {STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS, -EIO, + "STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS"}, + {STATUS_CLUSTER_INVALID_NETWORK, -EIO, + "STATUS_CLUSTER_INVALID_NETWORK"}, + {STATUS_CLUSTER_NO_NET_ADAPTERS, -EIO, + "STATUS_CLUSTER_NO_NET_ADAPTERS"}, + {STATUS_CLUSTER_NODE_UP, -EIO, "STATUS_CLUSTER_NODE_UP"}, + {STATUS_CLUSTER_NODE_PAUSED, -EIO, "STATUS_CLUSTER_NODE_PAUSED"}, + {STATUS_CLUSTER_NODE_NOT_PAUSED, -EIO, + "STATUS_CLUSTER_NODE_NOT_PAUSED"}, + {STATUS_CLUSTER_NO_SECURITY_CONTEXT, -EIO, + "STATUS_CLUSTER_NO_SECURITY_CONTEXT"}, + {STATUS_CLUSTER_NETWORK_NOT_INTERNAL, -EIO, + "STATUS_CLUSTER_NETWORK_NOT_INTERNAL"}, + {STATUS_CLUSTER_POISONED, -EIO, "STATUS_CLUSTER_POISONED"}, + {STATUS_ACPI_INVALID_OPCODE, -EIO, "STATUS_ACPI_INVALID_OPCODE"}, + {STATUS_ACPI_STACK_OVERFLOW, -EIO, "STATUS_ACPI_STACK_OVERFLOW"}, + {STATUS_ACPI_ASSERT_FAILED, -EIO, "STATUS_ACPI_ASSERT_FAILED"}, + {STATUS_ACPI_INVALID_INDEX, -EIO, "STATUS_ACPI_INVALID_INDEX"}, + {STATUS_ACPI_INVALID_ARGUMENT, -EIO, "STATUS_ACPI_INVALID_ARGUMENT"}, + {STATUS_ACPI_FATAL, -EIO, "STATUS_ACPI_FATAL"}, + {STATUS_ACPI_INVALID_SUPERNAME, -EIO, "STATUS_ACPI_INVALID_SUPERNAME"}, + {STATUS_ACPI_INVALID_ARGTYPE, -EIO, "STATUS_ACPI_INVALID_ARGTYPE"}, + {STATUS_ACPI_INVALID_OBJTYPE, -EIO, "STATUS_ACPI_INVALID_OBJTYPE"}, + {STATUS_ACPI_INVALID_TARGETTYPE, -EIO, + "STATUS_ACPI_INVALID_TARGETTYPE"}, + {STATUS_ACPI_INCORRECT_ARGUMENT_COUNT, -EIO, + "STATUS_ACPI_INCORRECT_ARGUMENT_COUNT"}, + {STATUS_ACPI_ADDRESS_NOT_MAPPED, -EIO, + "STATUS_ACPI_ADDRESS_NOT_MAPPED"}, + {STATUS_ACPI_INVALID_EVENTTYPE, -EIO, "STATUS_ACPI_INVALID_EVENTTYPE"}, + {STATUS_ACPI_HANDLER_COLLISION, -EIO, "STATUS_ACPI_HANDLER_COLLISION"}, + {STATUS_ACPI_INVALID_DATA, -EIO, "STATUS_ACPI_INVALID_DATA"}, + {STATUS_ACPI_INVALID_REGION, -EIO, "STATUS_ACPI_INVALID_REGION"}, + {STATUS_ACPI_INVALID_ACCESS_SIZE, -EIO, + "STATUS_ACPI_INVALID_ACCESS_SIZE"}, + {STATUS_ACPI_ACQUIRE_GLOBAL_LOCK, -EIO, + "STATUS_ACPI_ACQUIRE_GLOBAL_LOCK"}, + {STATUS_ACPI_ALREADY_INITIALIZED, -EIO, + "STATUS_ACPI_ALREADY_INITIALIZED"}, + {STATUS_ACPI_NOT_INITIALIZED, -EIO, "STATUS_ACPI_NOT_INITIALIZED"}, + {STATUS_ACPI_INVALID_MUTEX_LEVEL, -EIO, + "STATUS_ACPI_INVALID_MUTEX_LEVEL"}, + {STATUS_ACPI_MUTEX_NOT_OWNED, -EIO, "STATUS_ACPI_MUTEX_NOT_OWNED"}, + {STATUS_ACPI_MUTEX_NOT_OWNER, -EIO, "STATUS_ACPI_MUTEX_NOT_OWNER"}, + {STATUS_ACPI_RS_ACCESS, -EIO, "STATUS_ACPI_RS_ACCESS"}, + {STATUS_ACPI_INVALID_TABLE, -EIO, "STATUS_ACPI_INVALID_TABLE"}, + {STATUS_ACPI_REG_HANDLER_FAILED, -EIO, + "STATUS_ACPI_REG_HANDLER_FAILED"}, + {STATUS_ACPI_POWER_REQUEST_FAILED, -EIO, + "STATUS_ACPI_POWER_REQUEST_FAILED"}, + {STATUS_SXS_SECTION_NOT_FOUND, -EIO, "STATUS_SXS_SECTION_NOT_FOUND"}, + {STATUS_SXS_CANT_GEN_ACTCTX, -EIO, "STATUS_SXS_CANT_GEN_ACTCTX"}, + {STATUS_SXS_INVALID_ACTCTXDATA_FORMAT, -EIO, + "STATUS_SXS_INVALID_ACTCTXDATA_FORMAT"}, + {STATUS_SXS_ASSEMBLY_NOT_FOUND, -EIO, "STATUS_SXS_ASSEMBLY_NOT_FOUND"}, + {STATUS_SXS_MANIFEST_FORMAT_ERROR, -EIO, + "STATUS_SXS_MANIFEST_FORMAT_ERROR"}, + {STATUS_SXS_MANIFEST_PARSE_ERROR, -EIO, + "STATUS_SXS_MANIFEST_PARSE_ERROR"}, + {STATUS_SXS_ACTIVATION_CONTEXT_DISABLED, -EIO, + "STATUS_SXS_ACTIVATION_CONTEXT_DISABLED"}, + {STATUS_SXS_KEY_NOT_FOUND, -EIO, "STATUS_SXS_KEY_NOT_FOUND"}, + {STATUS_SXS_VERSION_CONFLICT, -EIO, "STATUS_SXS_VERSION_CONFLICT"}, + {STATUS_SXS_WRONG_SECTION_TYPE, -EIO, "STATUS_SXS_WRONG_SECTION_TYPE"}, + {STATUS_SXS_THREAD_QUERIES_DISABLED, -EIO, + "STATUS_SXS_THREAD_QUERIES_DISABLED"}, + {STATUS_SXS_ASSEMBLY_MISSING, -EIO, "STATUS_SXS_ASSEMBLY_MISSING"}, + {STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET, -EIO, + "STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET"}, + {STATUS_SXS_EARLY_DEACTIVATION, -EIO, "STATUS_SXS_EARLY_DEACTIVATION"}, + {STATUS_SXS_INVALID_DEACTIVATION, -EIO, + "STATUS_SXS_INVALID_DEACTIVATION"}, + {STATUS_SXS_MULTIPLE_DEACTIVATION, -EIO, + "STATUS_SXS_MULTIPLE_DEACTIVATION"}, + {STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY, -EIO, + "STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY"}, + {STATUS_SXS_PROCESS_TERMINATION_REQUESTED, -EIO, + "STATUS_SXS_PROCESS_TERMINATION_REQUESTED"}, + {STATUS_SXS_CORRUPT_ACTIVATION_STACK, -EIO, + "STATUS_SXS_CORRUPT_ACTIVATION_STACK"}, + {STATUS_SXS_CORRUPTION, -EIO, "STATUS_SXS_CORRUPTION"}, + {STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE, -EIO, + "STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE"}, + {STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME, -EIO, + "STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME"}, + {STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE, -EIO, + "STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE"}, + {STATUS_SXS_IDENTITY_PARSE_ERROR, -EIO, + "STATUS_SXS_IDENTITY_PARSE_ERROR"}, + {STATUS_SXS_COMPONENT_STORE_CORRUPT, -EIO, + "STATUS_SXS_COMPONENT_STORE_CORRUPT"}, + {STATUS_SXS_FILE_HASH_MISMATCH, -EIO, "STATUS_SXS_FILE_HASH_MISMATCH"}, + {STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT, -EIO, + "STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT"}, + {STATUS_SXS_IDENTITIES_DIFFERENT, -EIO, + "STATUS_SXS_IDENTITIES_DIFFERENT"}, + {STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT, -EIO, + "STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT"}, + {STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY, -EIO, + "STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY"}, + {STATUS_ADVANCED_INSTALLER_FAILED, -EIO, + "STATUS_ADVANCED_INSTALLER_FAILED"}, + {STATUS_XML_ENCODING_MISMATCH, -EIO, "STATUS_XML_ENCODING_MISMATCH"}, + {STATUS_SXS_MANIFEST_TOO_BIG, -EIO, "STATUS_SXS_MANIFEST_TOO_BIG"}, + {STATUS_SXS_SETTING_NOT_REGISTERED, -EIO, + "STATUS_SXS_SETTING_NOT_REGISTERED"}, + {STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE, -EIO, + "STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE"}, + {STATUS_SMI_PRIMITIVE_INSTALLER_FAILED, -EIO, + "STATUS_SMI_PRIMITIVE_INSTALLER_FAILED"}, + {STATUS_GENERIC_COMMAND_FAILED, -EIO, "STATUS_GENERIC_COMMAND_FAILED"}, + {STATUS_SXS_FILE_HASH_MISSING, -EIO, "STATUS_SXS_FILE_HASH_MISSING"}, + {STATUS_TRANSACTIONAL_CONFLICT, -EIO, "STATUS_TRANSACTIONAL_CONFLICT"}, + {STATUS_INVALID_TRANSACTION, -EIO, "STATUS_INVALID_TRANSACTION"}, + {STATUS_TRANSACTION_NOT_ACTIVE, -EIO, "STATUS_TRANSACTION_NOT_ACTIVE"}, + {STATUS_TM_INITIALIZATION_FAILED, -EIO, + "STATUS_TM_INITIALIZATION_FAILED"}, + {STATUS_RM_NOT_ACTIVE, -EIO, "STATUS_RM_NOT_ACTIVE"}, + {STATUS_RM_METADATA_CORRUPT, -EIO, "STATUS_RM_METADATA_CORRUPT"}, + {STATUS_TRANSACTION_NOT_JOINED, -EIO, "STATUS_TRANSACTION_NOT_JOINED"}, + {STATUS_DIRECTORY_NOT_RM, -EIO, "STATUS_DIRECTORY_NOT_RM"}, + {STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE, -EIO, + "STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE"}, + {STATUS_LOG_RESIZE_INVALID_SIZE, -EIO, + "STATUS_LOG_RESIZE_INVALID_SIZE"}, + {STATUS_REMOTE_FILE_VERSION_MISMATCH, -EIO, + "STATUS_REMOTE_FILE_VERSION_MISMATCH"}, + {STATUS_CRM_PROTOCOL_ALREADY_EXISTS, -EIO, + "STATUS_CRM_PROTOCOL_ALREADY_EXISTS"}, + {STATUS_TRANSACTION_PROPAGATION_FAILED, -EIO, + "STATUS_TRANSACTION_PROPAGATION_FAILED"}, + {STATUS_CRM_PROTOCOL_NOT_FOUND, -EIO, "STATUS_CRM_PROTOCOL_NOT_FOUND"}, + {STATUS_TRANSACTION_SUPERIOR_EXISTS, -EIO, + "STATUS_TRANSACTION_SUPERIOR_EXISTS"}, + {STATUS_TRANSACTION_REQUEST_NOT_VALID, -EIO, + "STATUS_TRANSACTION_REQUEST_NOT_VALID"}, + {STATUS_TRANSACTION_NOT_REQUESTED, -EIO, + "STATUS_TRANSACTION_NOT_REQUESTED"}, + {STATUS_TRANSACTION_ALREADY_ABORTED, -EIO, + "STATUS_TRANSACTION_ALREADY_ABORTED"}, + {STATUS_TRANSACTION_ALREADY_COMMITTED, -EIO, + "STATUS_TRANSACTION_ALREADY_COMMITTED"}, + {STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER, -EIO, + "STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER"}, + {STATUS_CURRENT_TRANSACTION_NOT_VALID, -EIO, + "STATUS_CURRENT_TRANSACTION_NOT_VALID"}, + {STATUS_LOG_GROWTH_FAILED, -EIO, "STATUS_LOG_GROWTH_FAILED"}, + {STATUS_OBJECT_NO_LONGER_EXISTS, -EIO, + "STATUS_OBJECT_NO_LONGER_EXISTS"}, + {STATUS_STREAM_MINIVERSION_NOT_FOUND, -EIO, + "STATUS_STREAM_MINIVERSION_NOT_FOUND"}, + {STATUS_STREAM_MINIVERSION_NOT_VALID, -EIO, + "STATUS_STREAM_MINIVERSION_NOT_VALID"}, + {STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION, -EIO, + "STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION"}, + {STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT, -EIO, + "STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT"}, + {STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS, -EIO, + "STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS"}, + {STATUS_HANDLE_NO_LONGER_VALID, -EIO, "STATUS_HANDLE_NO_LONGER_VALID"}, + {STATUS_LOG_CORRUPTION_DETECTED, -EIO, + "STATUS_LOG_CORRUPTION_DETECTED"}, + {STATUS_RM_DISCONNECTED, -EIO, "STATUS_RM_DISCONNECTED"}, + {STATUS_ENLISTMENT_NOT_SUPERIOR, -EIO, + "STATUS_ENLISTMENT_NOT_SUPERIOR"}, + {STATUS_FILE_IDENTITY_NOT_PERSISTENT, -EIO, + "STATUS_FILE_IDENTITY_NOT_PERSISTENT"}, + {STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY, -EIO, + "STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY"}, + {STATUS_CANT_CROSS_RM_BOUNDARY, -EIO, "STATUS_CANT_CROSS_RM_BOUNDARY"}, + {STATUS_TXF_DIR_NOT_EMPTY, -EIO, "STATUS_TXF_DIR_NOT_EMPTY"}, + {STATUS_INDOUBT_TRANSACTIONS_EXIST, -EIO, + "STATUS_INDOUBT_TRANSACTIONS_EXIST"}, + {STATUS_TM_VOLATILE, -EIO, "STATUS_TM_VOLATILE"}, + {STATUS_ROLLBACK_TIMER_EXPIRED, -EIO, "STATUS_ROLLBACK_TIMER_EXPIRED"}, + {STATUS_TXF_ATTRIBUTE_CORRUPT, -EIO, "STATUS_TXF_ATTRIBUTE_CORRUPT"}, + {STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION, -EIO, + "STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION"}, + {STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED, -EIO, + "STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED"}, + {STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE, -EIO, + "STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE"}, + {STATUS_TRANSACTION_REQUIRED_PROMOTION, -EIO, + "STATUS_TRANSACTION_REQUIRED_PROMOTION"}, + {STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION, -EIO, + "STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION"}, + {STATUS_TRANSACTIONS_NOT_FROZEN, -EIO, + "STATUS_TRANSACTIONS_NOT_FROZEN"}, + {STATUS_TRANSACTION_FREEZE_IN_PROGRESS, -EIO, + "STATUS_TRANSACTION_FREEZE_IN_PROGRESS"}, + {STATUS_NOT_SNAPSHOT_VOLUME, -EIO, "STATUS_NOT_SNAPSHOT_VOLUME"}, + {STATUS_NO_SAVEPOINT_WITH_OPEN_FILES, -EIO, + "STATUS_NO_SAVEPOINT_WITH_OPEN_FILES"}, + {STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION, -EIO, + "STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION"}, + {STATUS_TM_IDENTITY_MISMATCH, -EIO, "STATUS_TM_IDENTITY_MISMATCH"}, + {STATUS_FLOATED_SECTION, -EIO, "STATUS_FLOATED_SECTION"}, + {STATUS_CANNOT_ACCEPT_TRANSACTED_WORK, -EIO, + "STATUS_CANNOT_ACCEPT_TRANSACTED_WORK"}, + {STATUS_CANNOT_ABORT_TRANSACTIONS, -EIO, + "STATUS_CANNOT_ABORT_TRANSACTIONS"}, + {STATUS_TRANSACTION_NOT_FOUND, -EIO, "STATUS_TRANSACTION_NOT_FOUND"}, + {STATUS_RESOURCEMANAGER_NOT_FOUND, -EIO, + "STATUS_RESOURCEMANAGER_NOT_FOUND"}, + {STATUS_ENLISTMENT_NOT_FOUND, -EIO, "STATUS_ENLISTMENT_NOT_FOUND"}, + {STATUS_TRANSACTIONMANAGER_NOT_FOUND, -EIO, + "STATUS_TRANSACTIONMANAGER_NOT_FOUND"}, + {STATUS_TRANSACTIONMANAGER_NOT_ONLINE, -EIO, + "STATUS_TRANSACTIONMANAGER_NOT_ONLINE"}, + {STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION, -EIO, + "STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION"}, + {STATUS_TRANSACTION_NOT_ROOT, -EIO, "STATUS_TRANSACTION_NOT_ROOT"}, + {STATUS_TRANSACTION_OBJECT_EXPIRED, -EIO, + "STATUS_TRANSACTION_OBJECT_EXPIRED"}, + {STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION, -EIO, + "STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION"}, + {STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED, -EIO, + "STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED"}, + {STATUS_TRANSACTION_RECORD_TOO_LONG, -EIO, + "STATUS_TRANSACTION_RECORD_TOO_LONG"}, + {STATUS_NO_LINK_TRACKING_IN_TRANSACTION, -EIO, + "STATUS_NO_LINK_TRACKING_IN_TRANSACTION"}, + {STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION, -EOPNOTSUPP, + "STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION"}, + {STATUS_TRANSACTION_INTEGRITY_VIOLATED, -EIO, + "STATUS_TRANSACTION_INTEGRITY_VIOLATED"}, + {STATUS_LOG_SECTOR_INVALID, -EIO, "STATUS_LOG_SECTOR_INVALID"}, + {STATUS_LOG_SECTOR_PARITY_INVALID, -EIO, + "STATUS_LOG_SECTOR_PARITY_INVALID"}, + {STATUS_LOG_SECTOR_REMAPPED, -EIO, "STATUS_LOG_SECTOR_REMAPPED"}, + {STATUS_LOG_BLOCK_INCOMPLETE, -EIO, "STATUS_LOG_BLOCK_INCOMPLETE"}, + {STATUS_LOG_INVALID_RANGE, -EIO, "STATUS_LOG_INVALID_RANGE"}, + {STATUS_LOG_BLOCKS_EXHAUSTED, -EIO, "STATUS_LOG_BLOCKS_EXHAUSTED"}, + {STATUS_LOG_READ_CONTEXT_INVALID, -EIO, + "STATUS_LOG_READ_CONTEXT_INVALID"}, + {STATUS_LOG_RESTART_INVALID, -EIO, "STATUS_LOG_RESTART_INVALID"}, + {STATUS_LOG_BLOCK_VERSION, -EIO, "STATUS_LOG_BLOCK_VERSION"}, + {STATUS_LOG_BLOCK_INVALID, -EIO, "STATUS_LOG_BLOCK_INVALID"}, + {STATUS_LOG_READ_MODE_INVALID, -EIO, "STATUS_LOG_READ_MODE_INVALID"}, + {STATUS_LOG_METADATA_CORRUPT, -EIO, "STATUS_LOG_METADATA_CORRUPT"}, + {STATUS_LOG_METADATA_INVALID, -EIO, "STATUS_LOG_METADATA_INVALID"}, + {STATUS_LOG_METADATA_INCONSISTENT, -EIO, + "STATUS_LOG_METADATA_INCONSISTENT"}, + {STATUS_LOG_RESERVATION_INVALID, -EIO, + "STATUS_LOG_RESERVATION_INVALID"}, + {STATUS_LOG_CANT_DELETE, -EIO, "STATUS_LOG_CANT_DELETE"}, + {STATUS_LOG_CONTAINER_LIMIT_EXCEEDED, -EIO, + "STATUS_LOG_CONTAINER_LIMIT_EXCEEDED"}, + {STATUS_LOG_START_OF_LOG, -EIO, "STATUS_LOG_START_OF_LOG"}, + {STATUS_LOG_POLICY_ALREADY_INSTALLED, -EIO, + "STATUS_LOG_POLICY_ALREADY_INSTALLED"}, + {STATUS_LOG_POLICY_NOT_INSTALLED, -EIO, + "STATUS_LOG_POLICY_NOT_INSTALLED"}, + {STATUS_LOG_POLICY_INVALID, -EIO, "STATUS_LOG_POLICY_INVALID"}, + {STATUS_LOG_POLICY_CONFLICT, -EIO, "STATUS_LOG_POLICY_CONFLICT"}, + {STATUS_LOG_PINNED_ARCHIVE_TAIL, -EIO, + "STATUS_LOG_PINNED_ARCHIVE_TAIL"}, + {STATUS_LOG_RECORD_NONEXISTENT, -EIO, "STATUS_LOG_RECORD_NONEXISTENT"}, + {STATUS_LOG_RECORDS_RESERVED_INVALID, -EIO, + "STATUS_LOG_RECORDS_RESERVED_INVALID"}, + {STATUS_LOG_SPACE_RESERVED_INVALID, -EIO, + "STATUS_LOG_SPACE_RESERVED_INVALID"}, + {STATUS_LOG_TAIL_INVALID, -EIO, "STATUS_LOG_TAIL_INVALID"}, + {STATUS_LOG_FULL, -EIO, "STATUS_LOG_FULL"}, + {STATUS_LOG_MULTIPLEXED, -EIO, "STATUS_LOG_MULTIPLEXED"}, + {STATUS_LOG_DEDICATED, -EIO, "STATUS_LOG_DEDICATED"}, + {STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS, -EIO, + "STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS"}, + {STATUS_LOG_ARCHIVE_IN_PROGRESS, -EIO, + "STATUS_LOG_ARCHIVE_IN_PROGRESS"}, + {STATUS_LOG_EPHEMERAL, -EIO, "STATUS_LOG_EPHEMERAL"}, + {STATUS_LOG_NOT_ENOUGH_CONTAINERS, -EIO, + "STATUS_LOG_NOT_ENOUGH_CONTAINERS"}, + {STATUS_LOG_CLIENT_ALREADY_REGISTERED, -EIO, + "STATUS_LOG_CLIENT_ALREADY_REGISTERED"}, + {STATUS_LOG_CLIENT_NOT_REGISTERED, -EIO, + "STATUS_LOG_CLIENT_NOT_REGISTERED"}, + {STATUS_LOG_FULL_HANDLER_IN_PROGRESS, -EIO, + "STATUS_LOG_FULL_HANDLER_IN_PROGRESS"}, + {STATUS_LOG_CONTAINER_READ_FAILED, -EIO, + "STATUS_LOG_CONTAINER_READ_FAILED"}, + {STATUS_LOG_CONTAINER_WRITE_FAILED, -EIO, + "STATUS_LOG_CONTAINER_WRITE_FAILED"}, + {STATUS_LOG_CONTAINER_OPEN_FAILED, -EIO, + "STATUS_LOG_CONTAINER_OPEN_FAILED"}, + {STATUS_LOG_CONTAINER_STATE_INVALID, -EIO, + "STATUS_LOG_CONTAINER_STATE_INVALID"}, + {STATUS_LOG_STATE_INVALID, -EIO, "STATUS_LOG_STATE_INVALID"}, + {STATUS_LOG_PINNED, -EIO, "STATUS_LOG_PINNED"}, + {STATUS_LOG_METADATA_FLUSH_FAILED, -EIO, + "STATUS_LOG_METADATA_FLUSH_FAILED"}, + {STATUS_LOG_INCONSISTENT_SECURITY, -EIO, + "STATUS_LOG_INCONSISTENT_SECURITY"}, + {STATUS_LOG_APPENDED_FLUSH_FAILED, -EIO, + "STATUS_LOG_APPENDED_FLUSH_FAILED"}, + {STATUS_LOG_PINNED_RESERVATION, -EIO, "STATUS_LOG_PINNED_RESERVATION"}, + {STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD, -EIO, + "STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD"}, + {STATUS_FLT_NO_HANDLER_DEFINED, -EIO, "STATUS_FLT_NO_HANDLER_DEFINED"}, + {STATUS_FLT_CONTEXT_ALREADY_DEFINED, -EIO, + "STATUS_FLT_CONTEXT_ALREADY_DEFINED"}, + {STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST, -EIO, + "STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST"}, + {STATUS_FLT_DISALLOW_FAST_IO, -EIO, "STATUS_FLT_DISALLOW_FAST_IO"}, + {STATUS_FLT_INVALID_NAME_REQUEST, -EIO, + "STATUS_FLT_INVALID_NAME_REQUEST"}, + {STATUS_FLT_NOT_SAFE_TO_POST_OPERATION, -EIO, + "STATUS_FLT_NOT_SAFE_TO_POST_OPERATION"}, + {STATUS_FLT_NOT_INITIALIZED, -EIO, "STATUS_FLT_NOT_INITIALIZED"}, + {STATUS_FLT_FILTER_NOT_READY, -EIO, "STATUS_FLT_FILTER_NOT_READY"}, + {STATUS_FLT_POST_OPERATION_CLEANUP, -EIO, + "STATUS_FLT_POST_OPERATION_CLEANUP"}, + {STATUS_FLT_INTERNAL_ERROR, -EIO, "STATUS_FLT_INTERNAL_ERROR"}, + {STATUS_FLT_DELETING_OBJECT, -EIO, "STATUS_FLT_DELETING_OBJECT"}, + {STATUS_FLT_MUST_BE_NONPAGED_POOL, -EIO, + "STATUS_FLT_MUST_BE_NONPAGED_POOL"}, + {STATUS_FLT_DUPLICATE_ENTRY, -EIO, "STATUS_FLT_DUPLICATE_ENTRY"}, + {STATUS_FLT_CBDQ_DISABLED, -EIO, "STATUS_FLT_CBDQ_DISABLED"}, + {STATUS_FLT_DO_NOT_ATTACH, -EIO, "STATUS_FLT_DO_NOT_ATTACH"}, + {STATUS_FLT_DO_NOT_DETACH, -EIO, "STATUS_FLT_DO_NOT_DETACH"}, + {STATUS_FLT_INSTANCE_ALTITUDE_COLLISION, -EIO, + "STATUS_FLT_INSTANCE_ALTITUDE_COLLISION"}, + {STATUS_FLT_INSTANCE_NAME_COLLISION, -EIO, + "STATUS_FLT_INSTANCE_NAME_COLLISION"}, + {STATUS_FLT_FILTER_NOT_FOUND, -EIO, "STATUS_FLT_FILTER_NOT_FOUND"}, + {STATUS_FLT_VOLUME_NOT_FOUND, -EIO, "STATUS_FLT_VOLUME_NOT_FOUND"}, + {STATUS_FLT_INSTANCE_NOT_FOUND, -EIO, "STATUS_FLT_INSTANCE_NOT_FOUND"}, + {STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND, -EIO, + "STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND"}, + {STATUS_FLT_INVALID_CONTEXT_REGISTRATION, -EIO, + "STATUS_FLT_INVALID_CONTEXT_REGISTRATION"}, + {STATUS_FLT_NAME_CACHE_MISS, -EIO, "STATUS_FLT_NAME_CACHE_MISS"}, + {STATUS_FLT_NO_DEVICE_OBJECT, -EIO, "STATUS_FLT_NO_DEVICE_OBJECT"}, + {STATUS_FLT_VOLUME_ALREADY_MOUNTED, -EIO, + "STATUS_FLT_VOLUME_ALREADY_MOUNTED"}, + {STATUS_FLT_ALREADY_ENLISTED, -EIO, "STATUS_FLT_ALREADY_ENLISTED"}, + {STATUS_FLT_CONTEXT_ALREADY_LINKED, -EIO, + "STATUS_FLT_CONTEXT_ALREADY_LINKED"}, + {STATUS_FLT_NO_WAITER_FOR_REPLY, -EIO, + "STATUS_FLT_NO_WAITER_FOR_REPLY"}, + {STATUS_MONITOR_NO_DESCRIPTOR, -EIO, "STATUS_MONITOR_NO_DESCRIPTOR"}, + {STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT, -EIO, + "STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT"}, + {STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM, -EIO, + "STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM"}, + {STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK, -EIO, + "STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK"}, + {STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED, -EIO, + "STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED"}, + {STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK, -EIO, + "STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK"}, + {STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK, -EIO, + "STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK"}, + {STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA, -EIO, + "STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA"}, + {STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK, -EIO, + "STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK"}, + {STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER, -EIO, + "STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER"}, + {STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER, -EIO, + "STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER"}, + {STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER, -EIO, + "STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER"}, + {STATUS_GRAPHICS_ADAPTER_WAS_RESET, -EIO, + "STATUS_GRAPHICS_ADAPTER_WAS_RESET"}, + {STATUS_GRAPHICS_INVALID_DRIVER_MODEL, -EIO, + "STATUS_GRAPHICS_INVALID_DRIVER_MODEL"}, + {STATUS_GRAPHICS_PRESENT_MODE_CHANGED, -EIO, + "STATUS_GRAPHICS_PRESENT_MODE_CHANGED"}, + {STATUS_GRAPHICS_PRESENT_OCCLUDED, -EIO, + "STATUS_GRAPHICS_PRESENT_OCCLUDED"}, + {STATUS_GRAPHICS_PRESENT_DENIED, -EIO, + "STATUS_GRAPHICS_PRESENT_DENIED"}, + {STATUS_GRAPHICS_CANNOTCOLORCONVERT, -EIO, + "STATUS_GRAPHICS_CANNOTCOLORCONVERT"}, + {STATUS_GRAPHICS_NO_VIDEO_MEMORY, -EIO, + "STATUS_GRAPHICS_NO_VIDEO_MEMORY"}, + {STATUS_GRAPHICS_CANT_LOCK_MEMORY, -EIO, + "STATUS_GRAPHICS_CANT_LOCK_MEMORY"}, + {STATUS_GRAPHICS_ALLOCATION_BUSY, -EBUSY, + "STATUS_GRAPHICS_ALLOCATION_BUSY"}, + {STATUS_GRAPHICS_TOO_MANY_REFERENCES, -EIO, + "STATUS_GRAPHICS_TOO_MANY_REFERENCES"}, + {STATUS_GRAPHICS_TRY_AGAIN_LATER, -EIO, + "STATUS_GRAPHICS_TRY_AGAIN_LATER"}, + {STATUS_GRAPHICS_TRY_AGAIN_NOW, -EIO, "STATUS_GRAPHICS_TRY_AGAIN_NOW"}, + {STATUS_GRAPHICS_ALLOCATION_INVALID, -EIO, + "STATUS_GRAPHICS_ALLOCATION_INVALID"}, + {STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE, -EIO, + "STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE"}, + {STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED, -EIO, + "STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED"}, + {STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION, -EIO, + "STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION"}, + {STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE, -EIO, + "STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE"}, + {STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION, -EIO, + "STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION"}, + {STATUS_GRAPHICS_ALLOCATION_CLOSED, -EIO, + "STATUS_GRAPHICS_ALLOCATION_CLOSED"}, + {STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE, -EIO, + "STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE"}, + {STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE, -EIO, + "STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE"}, + {STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE, -EIO, + "STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE"}, + {STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST, -EIO, + "STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST"}, + {STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE, -EIO, + "STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE"}, + {STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY, -EIO, + "STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY"}, + {STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_INVALID_VIDPN, -EIO, "STATUS_GRAPHICS_INVALID_VIDPN"}, + {STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE, -EIO, + "STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE"}, + {STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET, -EIO, + "STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET"}, + {STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET, -EIO, + "STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET"}, + {STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET, -EIO, + "STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET"}, + {STATUS_GRAPHICS_INVALID_FREQUENCY, -EIO, + "STATUS_GRAPHICS_INVALID_FREQUENCY"}, + {STATUS_GRAPHICS_INVALID_ACTIVE_REGION, -EIO, + "STATUS_GRAPHICS_INVALID_ACTIVE_REGION"}, + {STATUS_GRAPHICS_INVALID_TOTAL_REGION, -EIO, + "STATUS_GRAPHICS_INVALID_TOTAL_REGION"}, + {STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE, -EIO, + "STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE"}, + {STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE, -EIO, + "STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE"}, + {STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET, -EIO, + "STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET"}, + {STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY, -EIO, + "STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY"}, + {STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET, -EIO, + "STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET"}, + {STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET, -EIO, + "STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET"}, + {STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET, -EIO, + "STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET"}, + {STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET, -EIO, + "STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET"}, + {STATUS_GRAPHICS_TARGET_ALREADY_IN_SET, -EIO, + "STATUS_GRAPHICS_TARGET_ALREADY_IN_SET"}, + {STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH, -EIO, + "STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH"}, + {STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY, -EIO, + "STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY"}, + {STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET, -EIO, + "STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET"}, + {STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE, -EIO, + "STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE"}, + {STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET, -EIO, + "STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET"}, + {STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET, -EIO, + "STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET"}, + {STATUS_GRAPHICS_STALE_MODESET, -EIO, "STATUS_GRAPHICS_STALE_MODESET"}, + {STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET, -EIO, + "STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET"}, + {STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE, -EIO, + "STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE"}, + {STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN, -EIO, + "STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN"}, + {STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE, -EIO, + "STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE"}, + {STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION, -EIO, + "STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION"}, + {STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES, -EIO, + "STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES"}, + {STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY, -EIO, + "STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY"}, + {STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE, -EIO, + "STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE"}, + {STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET, -EIO, + "STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET"}, + {STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET, -EIO, + "STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET"}, + {STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR, -EIO, + "STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR"}, + {STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET, -EIO, + "STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET"}, + {STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET, -EIO, + "STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET"}, + {STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE, -EIO, + "STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE"}, + {STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE, -EIO, + "STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE"}, + {STATUS_GRAPHICS_RESOURCES_NOT_RELATED, -EIO, + "STATUS_GRAPHICS_RESOURCES_NOT_RELATED"}, + {STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE, -EIO, + "STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE"}, + {STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE, -EIO, + "STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE"}, + {STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET, -EIO, + "STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET"}, + {STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER, -EIO, + "STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER"}, + {STATUS_GRAPHICS_NO_VIDPNMGR, -EIO, "STATUS_GRAPHICS_NO_VIDPNMGR"}, + {STATUS_GRAPHICS_NO_ACTIVE_VIDPN, -EIO, + "STATUS_GRAPHICS_NO_ACTIVE_VIDPN"}, + {STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY, -EIO, + "STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY"}, + {STATUS_GRAPHICS_MONITOR_NOT_CONNECTED, -EIO, + "STATUS_GRAPHICS_MONITOR_NOT_CONNECTED"}, + {STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY, -EIO, + "STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY"}, + {STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE, -EIO, + "STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE"}, + {STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE, -EIO, + "STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE"}, + {STATUS_GRAPHICS_INVALID_STRIDE, -EIO, + "STATUS_GRAPHICS_INVALID_STRIDE"}, + {STATUS_GRAPHICS_INVALID_PIXELFORMAT, -EIO, + "STATUS_GRAPHICS_INVALID_PIXELFORMAT"}, + {STATUS_GRAPHICS_INVALID_COLORBASIS, -EIO, + "STATUS_GRAPHICS_INVALID_COLORBASIS"}, + {STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE, -EIO, + "STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE"}, + {STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY, -EIO, + "STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY"}, + {STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT, -EIO, + "STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT"}, + {STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE, -EIO, + "STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE"}, + {STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN, -EIO, + "STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN"}, + {STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL, -EIO, + "STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL"}, + {STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION, -EIO, + "STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION"}, + {STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED, + -EIO, + "STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_INVALID_GAMMA_RAMP, -EIO, + "STATUS_GRAPHICS_INVALID_GAMMA_RAMP"}, + {STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_MODE_NOT_IN_MODESET, -EIO, + "STATUS_GRAPHICS_MODE_NOT_IN_MODESET"}, + {STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON, -EIO, + "STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON"}, + {STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE, -EIO, + "STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE"}, + {STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE, -EIO, + "STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE"}, + {STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS, -EIO, + "STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS"}, + {STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING, -EIO, + "STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING"}, + {STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED, -EIO, + "STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED"}, + {STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS, -EIO, + "STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS"}, + {STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT, -EIO, + "STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT"}, + {STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM, -EIO, + "STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM"}, + {STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN, -EIO, + "STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN"}, + {STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT, -EIO, + "STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT"}, + {STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED, -EIO, + "STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED"}, + {STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION, -EIO, + "STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION"}, + {STATUS_GRAPHICS_INVALID_CLIENT_TYPE, -EIO, + "STATUS_GRAPHICS_INVALID_CLIENT_TYPE"}, + {STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET, -EIO, + "STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET"}, + {STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED, -EIO, + "STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED"}, + {STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER, -EIO, + "STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER"}, + {STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED, -EIO, + "STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED"}, + {STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED, -EIO, + "STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED"}, + {STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY, -EIO, + "STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY"}, + {STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED, -EIO, + "STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED"}, + {STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON, -EIO, + "STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON"}, + {STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE, -EIO, + "STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE"}, + {STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER, -EIO, + "STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER"}, + {STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED, -EIO, + "STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED"}, + {STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS, + -EIO, + "STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS"}, + {STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST, -EIO, + "STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST"}, + {STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR, -EIO, + "STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR"}, + {STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS, -EIO, + "STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS"}, + {STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST, -EIO, + "STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST"}, + {STATUS_GRAPHICS_OPM_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_OPM_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_COPP_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_COPP_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_UAB_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_UAB_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS, -EIO, + "STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS"}, + {STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL, -EIO, + "STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL"}, + {STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST, -EIO, + "STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST"}, + {STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME, -EIO, + "STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME"}, + {STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP, -EIO, + "STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP"}, + {STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_OPM_INVALID_POINTER, -EIO, + "STATUS_GRAPHICS_OPM_INVALID_POINTER"}, + {STATUS_GRAPHICS_OPM_INTERNAL_ERROR, -EIO, + "STATUS_GRAPHICS_OPM_INTERNAL_ERROR"}, + {STATUS_GRAPHICS_OPM_INVALID_HANDLE, -EIO, + "STATUS_GRAPHICS_OPM_INVALID_HANDLE"}, + {STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE, -EIO, + "STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE"}, + {STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH, -EIO, + "STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH"}, + {STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED, -EIO, + "STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED"}, + {STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED, -EIO, + "STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED"}, + {STATUS_GRAPHICS_PVP_HFS_FAILED, -EIO, + "STATUS_GRAPHICS_PVP_HFS_FAILED"}, + {STATUS_GRAPHICS_OPM_INVALID_SRM, -EIO, + "STATUS_GRAPHICS_OPM_INVALID_SRM"}, + {STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP, -EIO, + "STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP"}, + {STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP, -EIO, + "STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP"}, + {STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA, -EIO, + "STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA"}, + {STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET, -EIO, + "STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET"}, + {STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH, -EIO, + "STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH"}, + {STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE, -EIO, + "STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE"}, + {STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS, -EIO, + "STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS"}, + {STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS, -EIO, + "STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS"}, + {STATUS_GRAPHICS_I2C_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_I2C_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST, -EIO, + "STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST"}, + {STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA, -EIO, + "STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA"}, + {STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA, -EIO, + "STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA"}, + {STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_DDCCI_INVALID_DATA, -EIO, + "STATUS_GRAPHICS_DDCCI_INVALID_DATA"}, + {STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE, + -EIO, + "STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE"}, + {STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING, -EIO, + "STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING"}, + {STATUS_GRAPHICS_MCA_INTERNAL_ERROR, -EIO, + "STATUS_GRAPHICS_MCA_INTERNAL_ERROR"}, + {STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND, -EIO, + "STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND"}, + {STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH, -EIO, + "STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH"}, + {STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM, -EIO, + "STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM"}, + {STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE, -EIO, + "STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE"}, + {STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS, -EIO, + "STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS"}, + {STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED, -EIO, + "STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED"}, + {STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME, -EIO, + "STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME"}, + {STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP, -EIO, + "STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP"}, + {STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED, -EIO, + "STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED"}, + {STATUS_GRAPHICS_INVALID_POINTER, -EIO, + "STATUS_GRAPHICS_INVALID_POINTER"}, + {STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE, -EIO, + "STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE"}, + {STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL, -EIO, + "STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL"}, + {STATUS_GRAPHICS_INTERNAL_ERROR, -EIO, + "STATUS_GRAPHICS_INTERNAL_ERROR"}, + {STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS, -EIO, + "STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS"}, + {STATUS_FVE_LOCKED_VOLUME, -EIO, "STATUS_FVE_LOCKED_VOLUME"}, + {STATUS_FVE_NOT_ENCRYPTED, -EIO, "STATUS_FVE_NOT_ENCRYPTED"}, + {STATUS_FVE_BAD_INFORMATION, -EIO, "STATUS_FVE_BAD_INFORMATION"}, + {STATUS_FVE_TOO_SMALL, -EIO, "STATUS_FVE_TOO_SMALL"}, + {STATUS_FVE_FAILED_WRONG_FS, -EIO, "STATUS_FVE_FAILED_WRONG_FS"}, + {STATUS_FVE_FAILED_BAD_FS, -EIO, "STATUS_FVE_FAILED_BAD_FS"}, + {STATUS_FVE_FS_NOT_EXTENDED, -EIO, "STATUS_FVE_FS_NOT_EXTENDED"}, + {STATUS_FVE_FS_MOUNTED, -EIO, "STATUS_FVE_FS_MOUNTED"}, + {STATUS_FVE_NO_LICENSE, -EIO, "STATUS_FVE_NO_LICENSE"}, + {STATUS_FVE_ACTION_NOT_ALLOWED, -EIO, "STATUS_FVE_ACTION_NOT_ALLOWED"}, + {STATUS_FVE_BAD_DATA, -EIO, "STATUS_FVE_BAD_DATA"}, + {STATUS_FVE_VOLUME_NOT_BOUND, -EIO, "STATUS_FVE_VOLUME_NOT_BOUND"}, + {STATUS_FVE_NOT_DATA_VOLUME, -EIO, "STATUS_FVE_NOT_DATA_VOLUME"}, + {STATUS_FVE_CONV_READ_ERROR, -EIO, "STATUS_FVE_CONV_READ_ERROR"}, + {STATUS_FVE_CONV_WRITE_ERROR, -EIO, "STATUS_FVE_CONV_WRITE_ERROR"}, + {STATUS_FVE_OVERLAPPED_UPDATE, -EIO, "STATUS_FVE_OVERLAPPED_UPDATE"}, + {STATUS_FVE_FAILED_SECTOR_SIZE, -EIO, "STATUS_FVE_FAILED_SECTOR_SIZE"}, + {STATUS_FVE_FAILED_AUTHENTICATION, -EIO, + "STATUS_FVE_FAILED_AUTHENTICATION"}, + {STATUS_FVE_NOT_OS_VOLUME, -EIO, "STATUS_FVE_NOT_OS_VOLUME"}, + {STATUS_FVE_KEYFILE_NOT_FOUND, -EIO, "STATUS_FVE_KEYFILE_NOT_FOUND"}, + {STATUS_FVE_KEYFILE_INVALID, -EIO, "STATUS_FVE_KEYFILE_INVALID"}, + {STATUS_FVE_KEYFILE_NO_VMK, -EIO, "STATUS_FVE_KEYFILE_NO_VMK"}, + {STATUS_FVE_TPM_DISABLED, -EIO, "STATUS_FVE_TPM_DISABLED"}, + {STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO, -EIO, + "STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO"}, + {STATUS_FVE_TPM_INVALID_PCR, -EIO, "STATUS_FVE_TPM_INVALID_PCR"}, + {STATUS_FVE_TPM_NO_VMK, -EIO, "STATUS_FVE_TPM_NO_VMK"}, + {STATUS_FVE_PIN_INVALID, -EIO, "STATUS_FVE_PIN_INVALID"}, + {STATUS_FVE_AUTH_INVALID_APPLICATION, -EIO, + "STATUS_FVE_AUTH_INVALID_APPLICATION"}, + {STATUS_FVE_AUTH_INVALID_CONFIG, -EIO, + "STATUS_FVE_AUTH_INVALID_CONFIG"}, + {STATUS_FVE_DEBUGGER_ENABLED, -EIO, "STATUS_FVE_DEBUGGER_ENABLED"}, + {STATUS_FVE_DRY_RUN_FAILED, -EIO, "STATUS_FVE_DRY_RUN_FAILED"}, + {STATUS_FVE_BAD_METADATA_POINTER, -EIO, + "STATUS_FVE_BAD_METADATA_POINTER"}, + {STATUS_FVE_OLD_METADATA_COPY, -EIO, "STATUS_FVE_OLD_METADATA_COPY"}, + {STATUS_FVE_REBOOT_REQUIRED, -EIO, "STATUS_FVE_REBOOT_REQUIRED"}, + {STATUS_FVE_RAW_ACCESS, -EIO, "STATUS_FVE_RAW_ACCESS"}, + {STATUS_FVE_RAW_BLOCKED, -EIO, "STATUS_FVE_RAW_BLOCKED"}, + {STATUS_FWP_CALLOUT_NOT_FOUND, -EIO, "STATUS_FWP_CALLOUT_NOT_FOUND"}, + {STATUS_FWP_CONDITION_NOT_FOUND, -EIO, + "STATUS_FWP_CONDITION_NOT_FOUND"}, + {STATUS_FWP_FILTER_NOT_FOUND, -EIO, "STATUS_FWP_FILTER_NOT_FOUND"}, + {STATUS_FWP_LAYER_NOT_FOUND, -EIO, "STATUS_FWP_LAYER_NOT_FOUND"}, + {STATUS_FWP_PROVIDER_NOT_FOUND, -EIO, "STATUS_FWP_PROVIDER_NOT_FOUND"}, + {STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND, -EIO, + "STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND"}, + {STATUS_FWP_SUBLAYER_NOT_FOUND, -EIO, "STATUS_FWP_SUBLAYER_NOT_FOUND"}, + {STATUS_FWP_NOT_FOUND, -EIO, "STATUS_FWP_NOT_FOUND"}, + {STATUS_FWP_ALREADY_EXISTS, -EIO, "STATUS_FWP_ALREADY_EXISTS"}, + {STATUS_FWP_IN_USE, -EIO, "STATUS_FWP_IN_USE"}, + {STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS, -EIO, + "STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS"}, + {STATUS_FWP_WRONG_SESSION, -EIO, "STATUS_FWP_WRONG_SESSION"}, + {STATUS_FWP_NO_TXN_IN_PROGRESS, -EIO, "STATUS_FWP_NO_TXN_IN_PROGRESS"}, + {STATUS_FWP_TXN_IN_PROGRESS, -EIO, "STATUS_FWP_TXN_IN_PROGRESS"}, + {STATUS_FWP_TXN_ABORTED, -EIO, "STATUS_FWP_TXN_ABORTED"}, + {STATUS_FWP_SESSION_ABORTED, -EIO, "STATUS_FWP_SESSION_ABORTED"}, + {STATUS_FWP_INCOMPATIBLE_TXN, -EIO, "STATUS_FWP_INCOMPATIBLE_TXN"}, + {STATUS_FWP_TIMEOUT, -ETIMEDOUT, "STATUS_FWP_TIMEOUT"}, + {STATUS_FWP_NET_EVENTS_DISABLED, -EIO, + "STATUS_FWP_NET_EVENTS_DISABLED"}, + {STATUS_FWP_INCOMPATIBLE_LAYER, -EIO, "STATUS_FWP_INCOMPATIBLE_LAYER"}, + {STATUS_FWP_KM_CLIENTS_ONLY, -EIO, "STATUS_FWP_KM_CLIENTS_ONLY"}, + {STATUS_FWP_LIFETIME_MISMATCH, -EIO, "STATUS_FWP_LIFETIME_MISMATCH"}, + {STATUS_FWP_BUILTIN_OBJECT, -EIO, "STATUS_FWP_BUILTIN_OBJECT"}, + {STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS, -EIO, + "STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS"}, + {STATUS_FWP_TOO_MANY_CALLOUTS, -EIO, "STATUS_FWP_TOO_MANY_CALLOUTS"}, + {STATUS_FWP_NOTIFICATION_DROPPED, -EIO, + "STATUS_FWP_NOTIFICATION_DROPPED"}, + {STATUS_FWP_TRAFFIC_MISMATCH, -EIO, "STATUS_FWP_TRAFFIC_MISMATCH"}, + {STATUS_FWP_INCOMPATIBLE_SA_STATE, -EIO, + "STATUS_FWP_INCOMPATIBLE_SA_STATE"}, + {STATUS_FWP_NULL_POINTER, -EIO, "STATUS_FWP_NULL_POINTER"}, + {STATUS_FWP_INVALID_ENUMERATOR, -EIO, "STATUS_FWP_INVALID_ENUMERATOR"}, + {STATUS_FWP_INVALID_FLAGS, -EIO, "STATUS_FWP_INVALID_FLAGS"}, + {STATUS_FWP_INVALID_NET_MASK, -EIO, "STATUS_FWP_INVALID_NET_MASK"}, + {STATUS_FWP_INVALID_RANGE, -EIO, "STATUS_FWP_INVALID_RANGE"}, + {STATUS_FWP_INVALID_INTERVAL, -EIO, "STATUS_FWP_INVALID_INTERVAL"}, + {STATUS_FWP_ZERO_LENGTH_ARRAY, -EIO, "STATUS_FWP_ZERO_LENGTH_ARRAY"}, + {STATUS_FWP_NULL_DISPLAY_NAME, -EIO, "STATUS_FWP_NULL_DISPLAY_NAME"}, + {STATUS_FWP_INVALID_ACTION_TYPE, -EIO, + "STATUS_FWP_INVALID_ACTION_TYPE"}, + {STATUS_FWP_INVALID_WEIGHT, -EIO, "STATUS_FWP_INVALID_WEIGHT"}, + {STATUS_FWP_MATCH_TYPE_MISMATCH, -EIO, + "STATUS_FWP_MATCH_TYPE_MISMATCH"}, + {STATUS_FWP_TYPE_MISMATCH, -EIO, "STATUS_FWP_TYPE_MISMATCH"}, + {STATUS_FWP_OUT_OF_BOUNDS, -EIO, "STATUS_FWP_OUT_OF_BOUNDS"}, + {STATUS_FWP_RESERVED, -EIO, "STATUS_FWP_RESERVED"}, + {STATUS_FWP_DUPLICATE_CONDITION, -EIO, + "STATUS_FWP_DUPLICATE_CONDITION"}, + {STATUS_FWP_DUPLICATE_KEYMOD, -EIO, "STATUS_FWP_DUPLICATE_KEYMOD"}, + {STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER, -EIO, + "STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER"}, + {STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER, -EIO, + "STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER"}, + {STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER, -EIO, + "STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER"}, + {STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT, -EIO, + "STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT"}, + {STATUS_FWP_INCOMPATIBLE_AUTH_METHOD, -EIO, + "STATUS_FWP_INCOMPATIBLE_AUTH_METHOD"}, + {STATUS_FWP_INCOMPATIBLE_DH_GROUP, -EIO, + "STATUS_FWP_INCOMPATIBLE_DH_GROUP"}, + {STATUS_FWP_EM_NOT_SUPPORTED, -EOPNOTSUPP, + "STATUS_FWP_EM_NOT_SUPPORTED"}, + {STATUS_FWP_NEVER_MATCH, -EIO, "STATUS_FWP_NEVER_MATCH"}, + {STATUS_FWP_PROVIDER_CONTEXT_MISMATCH, -EIO, + "STATUS_FWP_PROVIDER_CONTEXT_MISMATCH"}, + {STATUS_FWP_INVALID_PARAMETER, -EIO, "STATUS_FWP_INVALID_PARAMETER"}, + {STATUS_FWP_TOO_MANY_SUBLAYERS, -EIO, "STATUS_FWP_TOO_MANY_SUBLAYERS"}, + {STATUS_FWP_CALLOUT_NOTIFICATION_FAILED, -EIO, + "STATUS_FWP_CALLOUT_NOTIFICATION_FAILED"}, + {STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG, -EIO, + "STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG"}, + {STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG, -EIO, + "STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG"}, + {STATUS_FWP_TCPIP_NOT_READY, -EIO, "STATUS_FWP_TCPIP_NOT_READY"}, + {STATUS_FWP_INJECT_HANDLE_CLOSING, -EIO, + "STATUS_FWP_INJECT_HANDLE_CLOSING"}, + {STATUS_FWP_INJECT_HANDLE_STALE, -EIO, + "STATUS_FWP_INJECT_HANDLE_STALE"}, + {STATUS_FWP_CANNOT_PEND, -EIO, "STATUS_FWP_CANNOT_PEND"}, + {STATUS_NDIS_CLOSING, -EIO, "STATUS_NDIS_CLOSING"}, + {STATUS_NDIS_BAD_VERSION, -EIO, "STATUS_NDIS_BAD_VERSION"}, + {STATUS_NDIS_BAD_CHARACTERISTICS, -EIO, + "STATUS_NDIS_BAD_CHARACTERISTICS"}, + {STATUS_NDIS_ADAPTER_NOT_FOUND, -EIO, "STATUS_NDIS_ADAPTER_NOT_FOUND"}, + {STATUS_NDIS_OPEN_FAILED, -EIO, "STATUS_NDIS_OPEN_FAILED"}, + {STATUS_NDIS_DEVICE_FAILED, -EIO, "STATUS_NDIS_DEVICE_FAILED"}, + {STATUS_NDIS_MULTICAST_FULL, -EIO, "STATUS_NDIS_MULTICAST_FULL"}, + {STATUS_NDIS_MULTICAST_EXISTS, -EIO, "STATUS_NDIS_MULTICAST_EXISTS"}, + {STATUS_NDIS_MULTICAST_NOT_FOUND, -EIO, + "STATUS_NDIS_MULTICAST_NOT_FOUND"}, + {STATUS_NDIS_REQUEST_ABORTED, -EIO, "STATUS_NDIS_REQUEST_ABORTED"}, + {STATUS_NDIS_RESET_IN_PROGRESS, -EIO, "STATUS_NDIS_RESET_IN_PROGRESS"}, + {STATUS_NDIS_INVALID_PACKET, -EIO, "STATUS_NDIS_INVALID_PACKET"}, + {STATUS_NDIS_INVALID_DEVICE_REQUEST, -EIO, + "STATUS_NDIS_INVALID_DEVICE_REQUEST"}, + {STATUS_NDIS_ADAPTER_NOT_READY, -EIO, "STATUS_NDIS_ADAPTER_NOT_READY"}, + {STATUS_NDIS_INVALID_LENGTH, -EIO, "STATUS_NDIS_INVALID_LENGTH"}, + {STATUS_NDIS_INVALID_DATA, -EIO, "STATUS_NDIS_INVALID_DATA"}, + {STATUS_NDIS_BUFFER_TOO_SHORT, -ENOBUFS, + "STATUS_NDIS_BUFFER_TOO_SHORT"}, + {STATUS_NDIS_INVALID_OID, -EIO, "STATUS_NDIS_INVALID_OID"}, + {STATUS_NDIS_ADAPTER_REMOVED, -EIO, "STATUS_NDIS_ADAPTER_REMOVED"}, + {STATUS_NDIS_UNSUPPORTED_MEDIA, -EIO, "STATUS_NDIS_UNSUPPORTED_MEDIA"}, + {STATUS_NDIS_GROUP_ADDRESS_IN_USE, -EIO, + "STATUS_NDIS_GROUP_ADDRESS_IN_USE"}, + {STATUS_NDIS_FILE_NOT_FOUND, -EIO, "STATUS_NDIS_FILE_NOT_FOUND"}, + {STATUS_NDIS_ERROR_READING_FILE, -EIO, + "STATUS_NDIS_ERROR_READING_FILE"}, + {STATUS_NDIS_ALREADY_MAPPED, -EIO, "STATUS_NDIS_ALREADY_MAPPED"}, + {STATUS_NDIS_RESOURCE_CONFLICT, -EIO, "STATUS_NDIS_RESOURCE_CONFLICT"}, + {STATUS_NDIS_MEDIA_DISCONNECTED, -EIO, + "STATUS_NDIS_MEDIA_DISCONNECTED"}, + {STATUS_NDIS_INVALID_ADDRESS, -EIO, "STATUS_NDIS_INVALID_ADDRESS"}, + {STATUS_NDIS_PAUSED, -EIO, "STATUS_NDIS_PAUSED"}, + {STATUS_NDIS_INTERFACE_NOT_FOUND, -EIO, + "STATUS_NDIS_INTERFACE_NOT_FOUND"}, + {STATUS_NDIS_UNSUPPORTED_REVISION, -EIO, + "STATUS_NDIS_UNSUPPORTED_REVISION"}, + {STATUS_NDIS_INVALID_PORT, -EIO, "STATUS_NDIS_INVALID_PORT"}, + {STATUS_NDIS_INVALID_PORT_STATE, -EIO, + "STATUS_NDIS_INVALID_PORT_STATE"}, + {STATUS_NDIS_LOW_POWER_STATE, -EIO, "STATUS_NDIS_LOW_POWER_STATE"}, + {STATUS_NDIS_NOT_SUPPORTED, -ENOSYS, "STATUS_NDIS_NOT_SUPPORTED"}, + {STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED, -EIO, + "STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED"}, + {STATUS_NDIS_DOT11_MEDIA_IN_USE, -EIO, + "STATUS_NDIS_DOT11_MEDIA_IN_USE"}, + {STATUS_NDIS_DOT11_POWER_STATE_INVALID, -EIO, + "STATUS_NDIS_DOT11_POWER_STATE_INVALID"}, + {STATUS_IPSEC_BAD_SPI, -EIO, "STATUS_IPSEC_BAD_SPI"}, + {STATUS_IPSEC_SA_LIFETIME_EXPIRED, -EIO, + "STATUS_IPSEC_SA_LIFETIME_EXPIRED"}, + {STATUS_IPSEC_WRONG_SA, -EIO, "STATUS_IPSEC_WRONG_SA"}, + {STATUS_IPSEC_REPLAY_CHECK_FAILED, -EIO, + "STATUS_IPSEC_REPLAY_CHECK_FAILED"}, + {STATUS_IPSEC_INVALID_PACKET, -EIO, "STATUS_IPSEC_INVALID_PACKET"}, + {STATUS_IPSEC_INTEGRITY_CHECK_FAILED, -EIO, + "STATUS_IPSEC_INTEGRITY_CHECK_FAILED"}, + {STATUS_IPSEC_CLEAR_TEXT_DROP, -EIO, "STATUS_IPSEC_CLEAR_TEXT_DROP"}, + {0, 0, NULL} +}; + +/***************************************************************************** + Print an error message from the status code + *****************************************************************************/ +static void +smb2_print_status(__le32 status) +{ + int idx = 0; + + while (smb2_error_map_table[idx].status_string != NULL) { + if ((smb2_error_map_table[idx].smb2_status) == status) { + pr_notice("Status code returned 0x%08x %s\n", status, + smb2_error_map_table[idx].status_string); + } + idx++; + } + return; +} + +int +map_smb2_to_linux_error(char *buf, bool log_err) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + unsigned int i; + int rc = -EIO; + __le32 smb2err = hdr->Status; + + if (smb2err == 0) + return 0; + + /* mask facility */ + if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) && + (smb2err != STATUS_END_OF_FILE)) + smb2_print_status(smb2err); + else if (cifsFYI & CIFS_RC) + smb2_print_status(smb2err); + + for (i = 0; i < sizeof(smb2_error_map_table) / + sizeof(struct status_to_posix_error); i++) { + if (smb2_error_map_table[i].smb2_status == smb2err) { + rc = smb2_error_map_table[i].posix_error; + break; + } + } + + /* on error mapping not found - return EIO */ + + cifs_dbg(FYI, "Mapping SMB2 status code %d to POSIX err %d\n", + smb2err, rc); + + return rc; +} diff --git a/kernel/fs/cifs/smb2misc.c b/kernel/fs/cifs/smb2misc.c new file mode 100644 index 000000000..1c5907019 --- /dev/null +++ b/kernel/fs/cifs/smb2misc.c @@ -0,0 +1,632 @@ +/* + * fs/cifs/smb2misc.c + * + * Copyright (C) International Business Machines Corp., 2002,2011 + * Etersoft, 2012 + * Author(s): Steve French (sfrench@us.ibm.com) + * Pavel Shilovsky (pshilovsky@samba.org) 2012 + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include "smb2pdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "smb2proto.h" +#include "cifs_debug.h" +#include "cifs_unicode.h" +#include "smb2status.h" + +static int +check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) +{ + __u64 wire_mid = le64_to_cpu(hdr->MessageId); + + /* + * Make sure that this really is an SMB, that it is a response, + * and that the message ids match. + */ + if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && + (mid == wire_mid)) { + if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) + return 0; + else { + /* only one valid case where server sends us request */ + if (hdr->Command == SMB2_OPLOCK_BREAK) + return 0; + else + cifs_dbg(VFS, "Received Request not response\n"); + } + } else { /* bad signature or mid */ + if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) + cifs_dbg(VFS, "Bad protocol string signature header %x\n", + *(unsigned int *) hdr->ProtocolId); + if (mid != wire_mid) + cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", + mid, wire_mid); + } + cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", wire_mid); + return 1; +} + +/* + * The following table defines the expected "StructureSize" of SMB2 responses + * in order by SMB2 command. This is similar to "wct" in SMB/CIFS responses. + * + * Note that commands are defined in smb2pdu.h in le16 but the array below is + * indexed by command in host byte order + */ +static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { + /* SMB2_NEGOTIATE */ cpu_to_le16(65), + /* SMB2_SESSION_SETUP */ cpu_to_le16(9), + /* SMB2_LOGOFF */ cpu_to_le16(4), + /* SMB2_TREE_CONNECT */ cpu_to_le16(16), + /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4), + /* SMB2_CREATE */ cpu_to_le16(89), + /* SMB2_CLOSE */ cpu_to_le16(60), + /* SMB2_FLUSH */ cpu_to_le16(4), + /* SMB2_READ */ cpu_to_le16(17), + /* SMB2_WRITE */ cpu_to_le16(17), + /* SMB2_LOCK */ cpu_to_le16(4), + /* SMB2_IOCTL */ cpu_to_le16(49), + /* BB CHECK this ... not listed in documentation */ + /* SMB2_CANCEL */ cpu_to_le16(0), + /* SMB2_ECHO */ cpu_to_le16(4), + /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9), + /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9), + /* SMB2_QUERY_INFO */ cpu_to_le16(9), + /* SMB2_SET_INFO */ cpu_to_le16(2), + /* BB FIXME can also be 44 for lease break */ + /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) +}; + +int +smb2_check_message(char *buf, unsigned int length) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; + __u64 mid = le64_to_cpu(hdr->MessageId); + __u32 len = get_rfc1002_length(buf); + __u32 clc_len; /* calculated length */ + int command; + + /* BB disable following printk later */ + cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n", + __func__, length, len); + + /* + * Add function to do table lookup of StructureSize by command + * ie Validate the wct via smb2_struct_sizes table above + */ + + if (length < sizeof(struct smb2_pdu)) { + if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) { + pdu->StructureSize2 = 0; + /* + * As with SMB/CIFS, on some error cases servers may + * not return wct properly + */ + return 0; + } else { + cifs_dbg(VFS, "Length less than SMB header size\n"); + } + return 1; + } + if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) { + cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n", + mid); + return 1; + } + + if (check_smb2_hdr(hdr, mid)) + return 1; + + if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { + cifs_dbg(VFS, "Illegal structure size %u\n", + le16_to_cpu(hdr->StructureSize)); + return 1; + } + + command = le16_to_cpu(hdr->Command); + if (command >= NUMBER_OF_SMB2_COMMANDS) { + cifs_dbg(VFS, "Illegal SMB2 command %d\n", command); + return 1; + } + + if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) { + if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 || + pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) { + /* error packets have 9 byte structure size */ + cifs_dbg(VFS, "Illegal response size %u for command %d\n", + le16_to_cpu(pdu->StructureSize2), command); + return 1; + } else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0) + && (le16_to_cpu(pdu->StructureSize2) != 44) + && (le16_to_cpu(pdu->StructureSize2) != 36)) { + /* special case for SMB2.1 lease break message */ + cifs_dbg(VFS, "Illegal response size %d for oplock break\n", + le16_to_cpu(pdu->StructureSize2)); + return 1; + } + } + + if (4 + len != length) { + cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n", + length, 4 + len, mid); + return 1; + } + + clc_len = smb2_calc_size(hdr); + + if (4 + len != clc_len) { + cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", + clc_len, 4 + len, mid); + /* create failed on symlink */ + if (command == SMB2_CREATE_HE && + hdr->Status == STATUS_STOPPED_ON_SYMLINK) + return 0; + /* Windows 7 server returns 24 bytes more */ + if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) + return 0; + /* server can return one byte more due to implied bcc[0] */ + if (clc_len == 4 + len + 1) + return 0; + + /* + * MacOS server pads after SMB2.1 write response with 3 bytes + * of junk. Other servers match RFC1001 len to actual + * SMB2/SMB3 frame length (header + smb2 response specific data) + * Log the server error (once), but allow it and continue + * since the frame is parseable. + */ + if (clc_len < 4 /* RFC1001 header size */ + len) { + printk_once(KERN_WARNING + "SMB2 server sent bad RFC1001 len %d not %d\n", + len, clc_len - 4); + return 0; + } + + return 1; + } + return 0; +} + +/* + * The size of the variable area depends on the offset and length fields + * located in different fields for various SMB2 responses. SMB2 responses + * with no variable length info, show an offset of zero for the offset field. + */ +static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { + /* SMB2_NEGOTIATE */ true, + /* SMB2_SESSION_SETUP */ true, + /* SMB2_LOGOFF */ false, + /* SMB2_TREE_CONNECT */ false, + /* SMB2_TREE_DISCONNECT */ false, + /* SMB2_CREATE */ true, + /* SMB2_CLOSE */ false, + /* SMB2_FLUSH */ false, + /* SMB2_READ */ true, + /* SMB2_WRITE */ false, + /* SMB2_LOCK */ false, + /* SMB2_IOCTL */ true, + /* SMB2_CANCEL */ false, /* BB CHECK this not listed in documentation */ + /* SMB2_ECHO */ false, + /* SMB2_QUERY_DIRECTORY */ true, + /* SMB2_CHANGE_NOTIFY */ true, + /* SMB2_QUERY_INFO */ true, + /* SMB2_SET_INFO */ false, + /* SMB2_OPLOCK_BREAK */ false +}; + +/* + * Returns the pointer to the beginning of the data area. Length of the data + * area and the offset to it (from the beginning of the smb are also returned. + */ +char * +smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) +{ + *off = 0; + *len = 0; + + /* error responses do not have data area */ + if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED && + (((struct smb2_err_rsp *)hdr)->StructureSize) == + SMB2_ERROR_STRUCTURE_SIZE2) + return NULL; + + /* + * Following commands have data areas so we have to get the location + * of the data buffer offset and data buffer length for the particular + * command. + */ + switch (hdr->Command) { + case SMB2_NEGOTIATE: + *off = le16_to_cpu( + ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset); + *len = le16_to_cpu( + ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferLength); + break; + case SMB2_SESSION_SETUP: + *off = le16_to_cpu( + ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferOffset); + *len = le16_to_cpu( + ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferLength); + break; + case SMB2_CREATE: + *off = le32_to_cpu( + ((struct smb2_create_rsp *)hdr)->CreateContextsOffset); + *len = le32_to_cpu( + ((struct smb2_create_rsp *)hdr)->CreateContextsLength); + break; + case SMB2_QUERY_INFO: + *off = le16_to_cpu( + ((struct smb2_query_info_rsp *)hdr)->OutputBufferOffset); + *len = le32_to_cpu( + ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength); + break; + case SMB2_READ: + *off = ((struct smb2_read_rsp *)hdr)->DataOffset; + *len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength); + break; + case SMB2_QUERY_DIRECTORY: + *off = le16_to_cpu( + ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset); + *len = le32_to_cpu( + ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength); + break; + case SMB2_IOCTL: + *off = le32_to_cpu( + ((struct smb2_ioctl_rsp *)hdr)->OutputOffset); + *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount); + break; + case SMB2_CHANGE_NOTIFY: + default: + /* BB FIXME for unimplemented cases above */ + cifs_dbg(VFS, "no length check for command\n"); + break; + } + + /* + * Invalid length or offset probably means data area is invalid, but + * we have little choice but to ignore the data area in this case. + */ + if (*off > 4096) { + cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off); + *len = 0; + *off = 0; + } else if (*off < 0) { + cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n", + *off); + *off = 0; + *len = 0; + } else if (*len < 0) { + cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n", + *len); + *len = 0; + } else if (*len > 128 * 1024) { + cifs_dbg(VFS, "data area larger than 128K: %d\n", *len); + *len = 0; + } + + /* return pointer to beginning of data area, ie offset from SMB start */ + if ((*off != 0) && (*len != 0)) + return (char *)(&hdr->ProtocolId[0]) + *off; + else + return NULL; +} + +/* + * Calculate the size of the SMB message based on the fixed header + * portion, the number of word parameters and the data portion of the message. + */ +unsigned int +smb2_calc_size(void *buf) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; + int offset; /* the offset from the beginning of SMB to data area */ + int data_length; /* the length of the variable length data area */ + /* Structure Size has already been checked to make sure it is 64 */ + int len = 4 + le16_to_cpu(pdu->hdr.StructureSize); + + /* + * StructureSize2, ie length of fixed parameter area has already + * been checked to make sure it is the correct length. + */ + len += le16_to_cpu(pdu->StructureSize2); + + if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false) + goto calc_size_exit; + + smb2_get_data_area_len(&offset, &data_length, hdr); + cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset); + + if (data_length > 0) { + /* + * Check to make sure that data area begins after fixed area, + * Note that last byte of the fixed area is part of data area + * for some commands, typically those with odd StructureSize, + * so we must add one to the calculation (and 4 to account for + * the size of the RFC1001 hdr. + */ + if (offset + 4 + 1 < len) { + cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n", + offset + 4 + 1, len); + data_length = 0; + } else { + len = 4 + offset + data_length; + } + } +calc_size_exit: + cifs_dbg(FYI, "SMB2 len %d\n", len); + return len; +} + +/* Note: caller must free return buffer */ +__le16 * +cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) +{ + int len; + const char *start_of_path; + __le16 *to; + int map_type; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + map_type = SFM_MAP_UNI_RSVD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + map_type = SFU_MAP_UNI_RSVD; + else + map_type = NO_MAP_UNI_RSVD; + + /* Windows doesn't allow paths beginning with \ */ + if (from[0] == '\\') + start_of_path = from + 1; + else + start_of_path = from; + to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len, + cifs_sb->local_nls, map_type); + return to; +} + +__le32 +smb2_get_lease_state(struct cifsInodeInfo *cinode) +{ + __le32 lease = 0; + + if (CIFS_CACHE_WRITE(cinode)) + lease |= SMB2_LEASE_WRITE_CACHING; + if (CIFS_CACHE_HANDLE(cinode)) + lease |= SMB2_LEASE_HANDLE_CACHING; + if (CIFS_CACHE_READ(cinode)) + lease |= SMB2_LEASE_READ_CACHING; + return lease; +} + +struct smb2_lease_break_work { + struct work_struct lease_break; + struct tcon_link *tlink; + __u8 lease_key[16]; + __le32 lease_state; +}; + +static void +cifs_ses_oplock_break(struct work_struct *work) +{ + struct smb2_lease_break_work *lw = container_of(work, + struct smb2_lease_break_work, lease_break); + int rc; + + rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key, + lw->lease_state); + cifs_dbg(FYI, "Lease release rc %d\n", rc); + cifs_put_tlink(lw->tlink); + kfree(lw); +} + +static bool +smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, + struct smb2_lease_break_work *lw) +{ + bool found; + __u8 lease_state; + struct list_head *tmp; + struct cifsFileInfo *cfile; + struct TCP_Server_Info *server = tcon->ses->server; + struct cifs_pending_open *open; + struct cifsInodeInfo *cinode; + int ack_req = le32_to_cpu(rsp->Flags & + SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); + + lease_state = le32_to_cpu(rsp->NewLeaseState); + + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, tlist); + cinode = CIFS_I(d_inode(cfile->dentry)); + + if (memcmp(cinode->lease_key, rsp->LeaseKey, + SMB2_LEASE_KEY_SIZE)) + continue; + + cifs_dbg(FYI, "found in the open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%x\n", + le32_to_cpu(rsp->NewLeaseState)); + + server->ops->set_oplock_level(cinode, lease_state, 0, NULL); + + if (ack_req) + cfile->oplock_break_cancelled = false; + else + cfile->oplock_break_cancelled = true; + + queue_work(cifsiod_wq, &cfile->oplock_break); + kfree(lw); + return true; + } + + found = false; + list_for_each_entry(open, &tcon->pending_opens, olist) { + if (memcmp(open->lease_key, rsp->LeaseKey, + SMB2_LEASE_KEY_SIZE)) + continue; + + if (!found && ack_req) { + found = true; + memcpy(lw->lease_key, open->lease_key, + SMB2_LEASE_KEY_SIZE); + lw->tlink = cifs_get_tlink(open->tlink); + queue_work(cifsiod_wq, &lw->lease_break); + } + + cifs_dbg(FYI, "found in the pending open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%x\n", + le32_to_cpu(rsp->NewLeaseState)); + + open->oplock = lease_state; + } + return found; +} + +static bool +smb2_is_valid_lease_break(char *buffer) +{ + struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; + struct list_head *tmp, *tmp1, *tmp2; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct smb2_lease_break_work *lw; + + lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); + if (!lw) + return false; + + INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); + lw->lease_state = rsp->NewLeaseState; + + cifs_dbg(FYI, "Checking for lease break\n"); + + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &cifs_tcp_ses_list) { + server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list); + + list_for_each(tmp1, &server->smb_ses_list) { + ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); + + spin_lock(&cifs_file_list_lock); + list_for_each(tmp2, &ses->tcon_list) { + tcon = list_entry(tmp2, struct cifs_tcon, + tcon_list); + cifs_stats_inc( + &tcon->stats.cifs_stats.num_oplock_brks); + if (smb2_tcon_has_lease(tcon, rsp, lw)) { + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + } + spin_unlock(&cifs_file_list_lock); + } + } + spin_unlock(&cifs_tcp_ses_lock); + kfree(lw); + cifs_dbg(FYI, "Can not process lease break - no lease matched\n"); + return false; +} + +bool +smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; + struct list_head *tmp, *tmp1, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + + cifs_dbg(FYI, "Checking for oplock break\n"); + + if (rsp->hdr.Command != SMB2_OPLOCK_BREAK) + return false; + + if (rsp->StructureSize != + smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { + if (le16_to_cpu(rsp->StructureSize) == 44) + return smb2_is_valid_lease_break(buffer); + else + return false; + } + + cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel); + + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + + cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); + spin_lock(&cifs_file_list_lock); + list_for_each(tmp2, &tcon->openFileList) { + cfile = list_entry(tmp2, struct cifsFileInfo, + tlist); + if (rsp->PersistentFid != + cfile->fid.persistent_fid || + rsp->VolatileFid != + cfile->fid.volatile_fid) + continue; + + cifs_dbg(FYI, "file id match, oplock break\n"); + cinode = CIFS_I(d_inode(cfile->dentry)); + + if (!CIFS_CACHE_WRITE(cinode) && + rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) + cfile->oplock_break_cancelled = true; + else + cfile->oplock_break_cancelled = false; + + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + + queue_work(cifsiod_wq, &cfile->oplock_break); + + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + cifs_dbg(FYI, "No matching file for oplock break\n"); + return true; + } + } + spin_unlock(&cifs_tcp_ses_lock); + cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); + return false; +} diff --git a/kernel/fs/cifs/smb2ops.c b/kernel/fs/cifs/smb2ops.c new file mode 100644 index 000000000..54daee5ad --- /dev/null +++ b/kernel/fs/cifs/smb2ops.c @@ -0,0 +1,1716 @@ +/* + * SMB2 version specific operations + * + * Copyright (c) 2012, Jeff Layton + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2 as published + * by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "cifsglob.h" +#include "smb2pdu.h" +#include "smb2proto.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_unicode.h" +#include "smb2status.h" +#include "smb2glob.h" + +static int +change_conf(struct TCP_Server_Info *server) +{ + server->credits += server->echo_credits + server->oplock_credits; + server->oplock_credits = server->echo_credits = 0; + switch (server->credits) { + case 0: + return -1; + case 1: + server->echoes = false; + server->oplocks = false; + cifs_dbg(VFS, "disabling echoes and oplocks\n"); + break; + case 2: + server->echoes = true; + server->oplocks = false; + server->echo_credits = 1; + cifs_dbg(FYI, "disabling oplocks\n"); + break; + default: + server->echoes = true; + server->oplocks = true; + server->echo_credits = 1; + server->oplock_credits = 1; + } + server->credits -= server->echo_credits + server->oplock_credits; + return 0; +} + +static void +smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, + const int optype) +{ + int *val, rc = 0; + spin_lock(&server->req_lock); + val = server->ops->get_credits_field(server, optype); + *val += add; + server->in_flight--; + if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) + rc = change_conf(server); + /* + * Sometimes server returns 0 credits on oplock break ack - we need to + * rebalance credits in this case. + */ + else if (server->in_flight > 0 && server->oplock_credits == 0 && + server->oplocks) { + if (server->credits > 1) { + server->credits--; + server->oplock_credits++; + } + } + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + if (rc) + cifs_reconnect(server); +} + +static void +smb2_set_credits(struct TCP_Server_Info *server, const int val) +{ + spin_lock(&server->req_lock); + server->credits = val; + spin_unlock(&server->req_lock); +} + +static int * +smb2_get_credits_field(struct TCP_Server_Info *server, const int optype) +{ + switch (optype) { + case CIFS_ECHO_OP: + return &server->echo_credits; + case CIFS_OBREAK_OP: + return &server->oplock_credits; + default: + return &server->credits; + } +} + +static unsigned int +smb2_get_credits(struct mid_q_entry *mid) +{ + return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); +} + +static int +smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + int rc = 0; + unsigned int scredits; + + spin_lock(&server->req_lock); + while (1) { + if (server->credits <= 0) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable(server->request_q, + has_credits(server, &server->credits)); + cifs_num_waiters_dec(server); + if (rc) + return rc; + spin_lock(&server->req_lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->req_lock); + return -ENOENT; + } + + scredits = server->credits; + /* can deadlock with reopen */ + if (scredits == 1) { + *num = SMB2_MAX_BUFFER_SIZE; + *credits = 0; + break; + } + + /* leave one credit for a possible reopen */ + scredits--; + *num = min_t(unsigned int, size, + scredits * SMB2_MAX_BUFFER_SIZE); + + *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); + server->credits -= *credits; + server->in_flight++; + break; + } + } + spin_unlock(&server->req_lock); + return rc; +} + +static __u64 +smb2_get_next_mid(struct TCP_Server_Info *server) +{ + __u64 mid; + /* for SMB2 we need the current value */ + spin_lock(&GlobalMid_Lock); + mid = server->CurrentMid++; + spin_unlock(&GlobalMid_Lock); + return mid; +} + +static struct mid_q_entry * +smb2_find_mid(struct TCP_Server_Info *server, char *buf) +{ + struct mid_q_entry *mid; + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + __u64 wire_mid = le64_to_cpu(hdr->MessageId); + + spin_lock(&GlobalMid_Lock); + list_for_each_entry(mid, &server->pending_mid_q, qhead) { + if ((mid->mid == wire_mid) && + (mid->mid_state == MID_REQUEST_SUBMITTED) && + (mid->command == hdr->Command)) { + spin_unlock(&GlobalMid_Lock); + return mid; + } + } + spin_unlock(&GlobalMid_Lock); + return NULL; +} + +static void +smb2_dump_detail(void *buf) +{ +#ifdef CONFIG_CIFS_DEBUG2 + struct smb2_hdr *smb = (struct smb2_hdr *)buf; + + cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", + smb->Command, smb->Status, smb->Flags, smb->MessageId, + smb->ProcessId); + cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb)); +#endif +} + +static bool +smb2_need_neg(struct TCP_Server_Info *server) +{ + return server->max_read == 0; +} + +static int +smb2_negotiate(const unsigned int xid, struct cifs_ses *ses) +{ + int rc; + ses->server->CurrentMid = 0; + rc = SMB2_negotiate(xid, ses); + /* BB we probably don't need to retry with modern servers */ + if (rc == -EAGAIN) + rc = -EHOSTDOWN; + return rc; +} + +static unsigned int +smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int wsize; + + /* start with specified wsize, or default */ + wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; + wsize = min_t(unsigned int, wsize, server->max_write); + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); + + return wsize; +} + +static unsigned int +smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize; + + /* start with specified rsize, or default */ + rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; + rsize = min_t(unsigned int, rsize, server->max_read); + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); + + return rsize; +} + +#ifdef CONFIG_CIFS_STATS2 +static int +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + unsigned int ret_data_len = 0; + struct network_interface_info_ioctl_rsp *out_buf; + + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, + NULL /* no data input */, 0 /* no data input */, + (char **)&out_buf, &ret_data_len); + if (rc != 0) + cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); + else if (ret_data_len < sizeof(struct network_interface_info_ioctl_rsp)) { + cifs_dbg(VFS, "server returned bad net interface info buf\n"); + rc = -EINVAL; + } else { + /* Dump info on first interface */ + cifs_dbg(FYI, "Adapter Capability 0x%x\t", + le32_to_cpu(out_buf->Capability)); + cifs_dbg(FYI, "Link Speed %lld\n", + le64_to_cpu(out_buf->LinkSpeed)); + } + + return rc; +} +#endif /* STATS2 */ + +static void +smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + __le16 srch_path = 0; /* Null - open root of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (rc) + return; + +#ifdef CONFIG_CIFS_STATS2 + SMB3_request_interfaces(xid, tcon); +#endif /* STATS2 */ + + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_ATTRIBUTE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_DEVICE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */ + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return; +} + +static void +smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + __le16 srch_path = 0; /* Null - open root of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (rc) + return; + + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_ATTRIBUTE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_DEVICE_INFORMATION); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return; +} + +static int +smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path) +{ + int rc; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + if (rc) { + kfree(utf16_path); + return rc; + } + + rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + kfree(utf16_path); + return rc; +} + +static int +smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, FILE_ALL_INFO *data) +{ + *uniqueid = le64_to_cpu(data->IndexNumber); + return 0; +} + +static int +smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid, FILE_ALL_INFO *data) +{ + int rc; + struct smb2_file_all_info *smb2_data; + + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, + GFP_KERNEL); + if (smb2_data == NULL) + return -ENOMEM; + + rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, + smb2_data); + if (!rc) + move_smb2_info_to_cifs(data, smb2_data); + kfree(smb2_data); + return rc; +} + +static bool +smb2_can_echo(struct TCP_Server_Info *server) +{ + return server->echoes; +} + +static void +smb2_clear_stats(struct cifs_tcon *tcon) +{ +#ifdef CONFIG_CIFS_STATS + int i; + for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) { + atomic_set(&tcon->stats.smb2_stats.smb2_com_sent[i], 0); + atomic_set(&tcon->stats.smb2_stats.smb2_com_failed[i], 0); + } +#endif +} + +static void +smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon) +{ + seq_puts(m, "\n\tShare Capabilities:"); + if (tcon->capabilities & SMB2_SHARE_CAP_DFS) + seq_puts(m, " DFS,"); + if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY) + seq_puts(m, " CONTINUOUS AVAILABILITY,"); + if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT) + seq_puts(m, " SCALEOUT,"); + if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER) + seq_puts(m, " CLUSTER,"); + if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC) + seq_puts(m, " ASYMMETRIC,"); + if (tcon->capabilities == 0) + seq_puts(m, " None"); + if (tcon->ss_flags & SSINFO_FLAGS_ALIGNED_DEVICE) + seq_puts(m, " Aligned,"); + if (tcon->ss_flags & SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE) + seq_puts(m, " Partition Aligned,"); + if (tcon->ss_flags & SSINFO_FLAGS_NO_SEEK_PENALTY) + seq_puts(m, " SSD,"); + if (tcon->ss_flags & SSINFO_FLAGS_TRIM_ENABLED) + seq_puts(m, " TRIM-support,"); + + seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags); + if (tcon->perf_sector_size) + seq_printf(m, "\tOptimal sector size: 0x%x", + tcon->perf_sector_size); +} + +static void +smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) +{ +#ifdef CONFIG_CIFS_STATS + atomic_t *sent = tcon->stats.smb2_stats.smb2_com_sent; + atomic_t *failed = tcon->stats.smb2_stats.smb2_com_failed; + seq_printf(m, "\nNegotiates: %d sent %d failed", + atomic_read(&sent[SMB2_NEGOTIATE_HE]), + atomic_read(&failed[SMB2_NEGOTIATE_HE])); + seq_printf(m, "\nSessionSetups: %d sent %d failed", + atomic_read(&sent[SMB2_SESSION_SETUP_HE]), + atomic_read(&failed[SMB2_SESSION_SETUP_HE])); + seq_printf(m, "\nLogoffs: %d sent %d failed", + atomic_read(&sent[SMB2_LOGOFF_HE]), + atomic_read(&failed[SMB2_LOGOFF_HE])); + seq_printf(m, "\nTreeConnects: %d sent %d failed", + atomic_read(&sent[SMB2_TREE_CONNECT_HE]), + atomic_read(&failed[SMB2_TREE_CONNECT_HE])); + seq_printf(m, "\nTreeDisconnects: %d sent %d failed", + atomic_read(&sent[SMB2_TREE_DISCONNECT_HE]), + atomic_read(&failed[SMB2_TREE_DISCONNECT_HE])); + seq_printf(m, "\nCreates: %d sent %d failed", + atomic_read(&sent[SMB2_CREATE_HE]), + atomic_read(&failed[SMB2_CREATE_HE])); + seq_printf(m, "\nCloses: %d sent %d failed", + atomic_read(&sent[SMB2_CLOSE_HE]), + atomic_read(&failed[SMB2_CLOSE_HE])); + seq_printf(m, "\nFlushes: %d sent %d failed", + atomic_read(&sent[SMB2_FLUSH_HE]), + atomic_read(&failed[SMB2_FLUSH_HE])); + seq_printf(m, "\nReads: %d sent %d failed", + atomic_read(&sent[SMB2_READ_HE]), + atomic_read(&failed[SMB2_READ_HE])); + seq_printf(m, "\nWrites: %d sent %d failed", + atomic_read(&sent[SMB2_WRITE_HE]), + atomic_read(&failed[SMB2_WRITE_HE])); + seq_printf(m, "\nLocks: %d sent %d failed", + atomic_read(&sent[SMB2_LOCK_HE]), + atomic_read(&failed[SMB2_LOCK_HE])); + seq_printf(m, "\nIOCTLs: %d sent %d failed", + atomic_read(&sent[SMB2_IOCTL_HE]), + atomic_read(&failed[SMB2_IOCTL_HE])); + seq_printf(m, "\nCancels: %d sent %d failed", + atomic_read(&sent[SMB2_CANCEL_HE]), + atomic_read(&failed[SMB2_CANCEL_HE])); + seq_printf(m, "\nEchos: %d sent %d failed", + atomic_read(&sent[SMB2_ECHO_HE]), + atomic_read(&failed[SMB2_ECHO_HE])); + seq_printf(m, "\nQueryDirectories: %d sent %d failed", + atomic_read(&sent[SMB2_QUERY_DIRECTORY_HE]), + atomic_read(&failed[SMB2_QUERY_DIRECTORY_HE])); + seq_printf(m, "\nChangeNotifies: %d sent %d failed", + atomic_read(&sent[SMB2_CHANGE_NOTIFY_HE]), + atomic_read(&failed[SMB2_CHANGE_NOTIFY_HE])); + seq_printf(m, "\nQueryInfos: %d sent %d failed", + atomic_read(&sent[SMB2_QUERY_INFO_HE]), + atomic_read(&failed[SMB2_QUERY_INFO_HE])); + seq_printf(m, "\nSetInfos: %d sent %d failed", + atomic_read(&sent[SMB2_SET_INFO_HE]), + atomic_read(&failed[SMB2_SET_INFO_HE])); + seq_printf(m, "\nOplockBreaks: %d sent %d failed", + atomic_read(&sent[SMB2_OPLOCK_BREAK_HE]), + atomic_read(&failed[SMB2_OPLOCK_BREAK_HE])); +#endif +} + +static void +smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) +{ + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); + struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + + cfile->fid.persistent_fid = fid->persistent_fid; + cfile->fid.volatile_fid = fid->volatile_fid; + server->ops->set_oplock_level(cinode, oplock, fid->epoch, + &fid->purge_cache); + cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); +} + +static void +smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); +} + +static int +SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct copychunk_ioctl *pcchunk) +{ + int rc; + unsigned int ret_data_len; + struct resume_key_req *res_key; + + rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, + FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, + NULL, 0 /* no input */, + (char **)&res_key, &ret_data_len); + + if (rc) { + cifs_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); + goto req_res_key_exit; + } + if (ret_data_len < sizeof(struct resume_key_req)) { + cifs_dbg(VFS, "Invalid refcopy resume key length\n"); + rc = -EINVAL; + goto req_res_key_exit; + } + memcpy(pcchunk->SourceKey, res_key->ResumeKey, COPY_CHUNK_RES_KEY_SIZE); + +req_res_key_exit: + kfree(res_key); + return rc; +} + +static int +smb2_clone_range(const unsigned int xid, + struct cifsFileInfo *srcfile, + struct cifsFileInfo *trgtfile, u64 src_off, + u64 len, u64 dest_off) +{ + int rc; + unsigned int ret_data_len; + struct copychunk_ioctl *pcchunk; + struct copychunk_ioctl_rsp *retbuf = NULL; + struct cifs_tcon *tcon; + int chunks_copied = 0; + bool chunk_sizes_updated = false; + + pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); + + if (pcchunk == NULL) + return -ENOMEM; + + cifs_dbg(FYI, "in smb2_clone_range - about to call request res key\n"); + /* Request a key from the server to identify the source of the copy */ + rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink), + srcfile->fid.persistent_fid, + srcfile->fid.volatile_fid, pcchunk); + + /* Note: request_res_key sets res_key null only if rc !=0 */ + if (rc) + goto cchunk_out; + + /* For now array only one chunk long, will make more flexible later */ + pcchunk->ChunkCount = cpu_to_le32(1); + pcchunk->Reserved = 0; + pcchunk->Reserved2 = 0; + + tcon = tlink_tcon(trgtfile->tlink); + + while (len > 0) { + pcchunk->SourceOffset = cpu_to_le64(src_off); + pcchunk->TargetOffset = cpu_to_le64(dest_off); + pcchunk->Length = + cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); + + /* Request server copy to target from src identified by key */ + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, + true /* is_fsctl */, (char *)pcchunk, + sizeof(struct copychunk_ioctl), (char **)&retbuf, + &ret_data_len); + if (rc == 0) { + if (ret_data_len != + sizeof(struct copychunk_ioctl_rsp)) { + cifs_dbg(VFS, "invalid cchunk response size\n"); + rc = -EIO; + goto cchunk_out; + } + if (retbuf->TotalBytesWritten == 0) { + cifs_dbg(FYI, "no bytes copied\n"); + rc = -EIO; + goto cchunk_out; + } + /* + * Check if server claimed to write more than we asked + */ + if (le32_to_cpu(retbuf->TotalBytesWritten) > + le32_to_cpu(pcchunk->Length)) { + cifs_dbg(VFS, "invalid copy chunk response\n"); + rc = -EIO; + goto cchunk_out; + } + if (le32_to_cpu(retbuf->ChunksWritten) != 1) { + cifs_dbg(VFS, "invalid num chunks written\n"); + rc = -EIO; + goto cchunk_out; + } + chunks_copied++; + + src_off += le32_to_cpu(retbuf->TotalBytesWritten); + dest_off += le32_to_cpu(retbuf->TotalBytesWritten); + len -= le32_to_cpu(retbuf->TotalBytesWritten); + + cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n", + le32_to_cpu(retbuf->ChunksWritten), + le32_to_cpu(retbuf->ChunkBytesWritten), + le32_to_cpu(retbuf->TotalBytesWritten)); + } else if (rc == -EINVAL) { + if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) + goto cchunk_out; + + cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n", + le32_to_cpu(retbuf->ChunksWritten), + le32_to_cpu(retbuf->ChunkBytesWritten), + le32_to_cpu(retbuf->TotalBytesWritten)); + + /* + * Check if this is the first request using these sizes, + * (ie check if copy succeed once with original sizes + * and check if the server gave us different sizes after + * we already updated max sizes on previous request). + * if not then why is the server returning an error now + */ + if ((chunks_copied != 0) || chunk_sizes_updated) + goto cchunk_out; + + /* Check that server is not asking us to grow size */ + if (le32_to_cpu(retbuf->ChunkBytesWritten) < + tcon->max_bytes_chunk) + tcon->max_bytes_chunk = + le32_to_cpu(retbuf->ChunkBytesWritten); + else + goto cchunk_out; /* server gave us bogus size */ + + /* No need to change MaxChunks since already set to 1 */ + chunk_sizes_updated = true; + } else + goto cchunk_out; + } + +cchunk_out: + kfree(pcchunk); + return rc; +} + +static int +smb2_flush_file(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + return SMB2_flush(xid, tcon, fid->persistent_fid, fid->volatile_fid); +} + +static unsigned int +smb2_read_data_offset(char *buf) +{ + struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf; + return rsp->DataOffset; +} + +static unsigned int +smb2_read_data_length(char *buf) +{ + struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf; + return le32_to_cpu(rsp->DataLength); +} + + +static int +smb2_sync_read(const unsigned int xid, struct cifs_fid *pfid, + struct cifs_io_parms *parms, unsigned int *bytes_read, + char **buf, int *buf_type) +{ + parms->persistent_fid = pfid->persistent_fid; + parms->volatile_fid = pfid->volatile_fid; + return SMB2_read(xid, parms, bytes_read, buf, buf_type); +} + +static int +smb2_sync_write(const unsigned int xid, struct cifs_fid *pfid, + struct cifs_io_parms *parms, unsigned int *written, + struct kvec *iov, unsigned long nr_segs) +{ + + parms->persistent_fid = pfid->persistent_fid; + parms->volatile_fid = pfid->volatile_fid; + return SMB2_write(xid, parms, written, iov, nr_segs); +} + +/* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */ +static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse) +{ + struct cifsInodeInfo *cifsi; + int rc; + + cifsi = CIFS_I(inode); + + /* if file already sparse don't bother setting sparse again */ + if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse) + return true; /* already sparse */ + + if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse) + return true; /* already not sparse */ + + /* + * Can't check for sparse support on share the usual way via the + * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share + * since Samba server doesn't set the flag on the share, yet + * supports the set sparse FSCTL and returns sparse correctly + * in the file attributes. If we fail setting sparse though we + * mark that server does not support sparse files for this share + * to avoid repeatedly sending the unsupported fsctl to server + * if the file is repeatedly extended. + */ + if (tcon->broken_sparse_sup) + return false; + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_SPARSE, + true /* is_fctl */, &setsparse, 1, NULL, NULL); + if (rc) { + tcon->broken_sparse_sup = true; + cifs_dbg(FYI, "set sparse rc = %d\n", rc); + return false; + } + + if (setsparse) + cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE; + else + cifsi->cifsAttrs &= (~FILE_ATTRIBUTE_SPARSE_FILE); + + return true; +} + +static int +smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, __u64 size, bool set_alloc) +{ + __le64 eof = cpu_to_le64(size); + struct inode *inode; + + /* + * If extending file more than one page make sparse. Many Linux fs + * make files sparse by default when extending via ftruncate + */ + inode = d_inode(cfile->dentry); + + if (!set_alloc && (size > inode->i_size + 8192)) { + __u8 set_sparse = 1; + + /* whether set sparse succeeds or not, extend the file */ + smb2_set_sparse(xid, tcon, cfile, inode, set_sparse); + } + + return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof, false); +} + +static int +smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + return SMB2_set_compression(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid); +} + +static int +smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, + const char *path, struct cifs_sb_info *cifs_sb, + struct cifs_fid *fid, __u16 search_flags, + struct cifs_search_info *srch_inf) +{ + __le16 *utf16_path; + int rc; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + kfree(utf16_path); + if (rc) { + cifs_dbg(VFS, "open dir failed\n"); + return rc; + } + + srch_inf->entries_in_buffer = 0; + srch_inf->index_of_last_entry = 0; + + rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, + fid->volatile_fid, 0, srch_inf); + if (rc) { + cifs_dbg(VFS, "query directory failed\n"); + SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); + } + return rc; +} + +static int +smb2_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid, __u16 search_flags, + struct cifs_search_info *srch_inf) +{ + return SMB2_query_directory(xid, tcon, fid->persistent_fid, + fid->volatile_fid, 0, srch_inf); +} + +static int +smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); +} + +/* +* If we negotiate SMB2 protocol and get STATUS_PENDING - update +* the number of credits and return true. Otherwise - return false. +*/ +static bool +smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + + if (hdr->Status != STATUS_PENDING) + return false; + + if (!length) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(hdr->CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } + + return true; +} + +static int +smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, + struct cifsInodeInfo *cinode) +{ + if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) + return SMB2_lease_break(0, tcon, cinode->lease_key, + smb2_get_lease_state(cinode)); + + return SMB2_oplock_break(0, tcon, fid->persistent_fid, + fid->volatile_fid, + CIFS_CACHE_READ(cinode) ? 1 : 0); +} + +static int +smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *buf) +{ + int rc; + __le16 srch_path = 0; /* Null - open root of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (rc) + return rc; + buf->f_type = SMB2_MAGIC_NUMBER; + rc = SMB2_QFS_info(xid, tcon, fid.persistent_fid, fid.volatile_fid, + buf); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return rc; +} + +static bool +smb2_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2) +{ + return ob1->fid.persistent_fid == ob2->fid.persistent_fid && + ob1->fid.volatile_fid == ob2->fid.volatile_fid; +} + +static int +smb2_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, + __u64 length, __u32 type, int lock, int unlock, bool wait) +{ + if (unlock && !lock) + type = SMB2_LOCKFLAG_UNLOCK; + return SMB2_lock(xid, tlink_tcon(cfile->tlink), + cfile->fid.persistent_fid, cfile->fid.volatile_fid, + current->tgid, length, offset, type, wait); +} + +static void +smb2_get_lease_key(struct inode *inode, struct cifs_fid *fid) +{ + memcpy(fid->lease_key, CIFS_I(inode)->lease_key, SMB2_LEASE_KEY_SIZE); +} + +static void +smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid) +{ + memcpy(CIFS_I(inode)->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE); +} + +static void +smb2_new_lease_key(struct cifs_fid *fid) +{ + get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); +} + +static int +smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, char **target_path, + struct cifs_sb_info *cifs_sb) +{ + int rc; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct smb2_err_rsp *err_buf = NULL; + struct smb2_symlink_err_rsp *symlink; + unsigned int sub_len, sub_offset; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf); + + if (!rc || !err_buf) { + kfree(utf16_path); + return -ENOENT; + } + /* open must fail on symlink - reset rc */ + rc = 0; + symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; + sub_len = le16_to_cpu(symlink->SubstituteNameLength); + sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); + *target_path = cifs_strndup_from_utf16( + (char *)symlink->PathBuffer + sub_offset, + sub_len, true, cifs_sb->local_nls); + if (!(*target_path)) { + kfree(utf16_path); + return -ENOMEM; + } + convert_delimiter(*target_path, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + kfree(utf16_path); + return rc; +} + +static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, + loff_t offset, loff_t len, bool keep_size) +{ + struct inode *inode; + struct cifsInodeInfo *cifsi; + struct cifsFileInfo *cfile = file->private_data; + struct file_zero_data_information fsctl_buf; + long rc; + unsigned int xid; + + xid = get_xid(); + + inode = d_inode(cfile->dentry); + cifsi = CIFS_I(inode); + + /* if file not oplocked can't be sure whether asking to extend size */ + if (!CIFS_CACHE_READ(cifsi)) + if (keep_size == false) + return -EOPNOTSUPP; + + /* + * Must check if file sparse since fallocate -z (zero range) assumes + * non-sparse allocation + */ + if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) + return -EOPNOTSUPP; + + /* + * need to make sure we are not asked to extend the file since the SMB3 + * fsctl does not change the file size. In the future we could change + * this to zero the first part of the range then set the file size + * which for a non sparse file would zero the newly extended range + */ + if (keep_size == false) + if (i_size_read(inode) < offset + len) + return -EOPNOTSUPP; + + cifs_dbg(FYI, "offset %lld len %lld", offset, len); + + fsctl_buf.FileOffset = cpu_to_le64(offset); + fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), NULL, NULL); + free_xid(xid); + return rc; +} + +static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, + loff_t offset, loff_t len) +{ + struct inode *inode; + struct cifsInodeInfo *cifsi; + struct cifsFileInfo *cfile = file->private_data; + struct file_zero_data_information fsctl_buf; + long rc; + unsigned int xid; + __u8 set_sparse = 1; + + xid = get_xid(); + + inode = d_inode(cfile->dentry); + cifsi = CIFS_I(inode); + + /* Need to make file sparse, if not already, before freeing range. */ + /* Consider adding equivalent for compressed since it could also work */ + if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) + return -EOPNOTSUPP; + + cifs_dbg(FYI, "offset %lld len %lld", offset, len); + + fsctl_buf.FileOffset = cpu_to_le64(offset); + fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), NULL, NULL); + free_xid(xid); + return rc; +} + +static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, + loff_t off, loff_t len, bool keep_size) +{ + struct inode *inode; + struct cifsInodeInfo *cifsi; + struct cifsFileInfo *cfile = file->private_data; + long rc = -EOPNOTSUPP; + unsigned int xid; + + xid = get_xid(); + + inode = d_inode(cfile->dentry); + cifsi = CIFS_I(inode); + + /* if file not oplocked can't be sure whether asking to extend size */ + if (!CIFS_CACHE_READ(cifsi)) + if (keep_size == false) + return -EOPNOTSUPP; + + /* + * Files are non-sparse by default so falloc may be a no-op + * Must check if file sparse. If not sparse, and not extending + * then no need to do anything since file already allocated + */ + if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) { + if (keep_size == true) + return 0; + /* check if extending file */ + else if (i_size_read(inode) >= off + len) + /* not extending file and already not sparse */ + return 0; + /* BB: in future add else clause to extend file */ + else + return -EOPNOTSUPP; + } + + if ((keep_size == true) || (i_size_read(inode) >= off + len)) { + /* + * Check if falloc starts within first few pages of file + * and ends within a few pages of the end of file to + * ensure that most of file is being forced to be + * fallocated now. If so then setting whole file sparse + * ie potentially making a few extra pages at the beginning + * or end of the file non-sparse via set_sparse is harmless. + */ + if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) + return -EOPNOTSUPP; + + rc = smb2_set_sparse(xid, tcon, cfile, inode, false); + } + /* BB: else ... in future add code to extend file and set sparse */ + + + free_xid(xid); + return rc; +} + + +static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, + loff_t off, loff_t len) +{ + /* KEEP_SIZE already checked for by do_fallocate */ + if (mode & FALLOC_FL_PUNCH_HOLE) + return smb3_punch_hole(file, tcon, off, len); + else if (mode & FALLOC_FL_ZERO_RANGE) { + if (mode & FALLOC_FL_KEEP_SIZE) + return smb3_zero_range(file, tcon, off, len, true); + return smb3_zero_range(file, tcon, off, len, false); + } else if (mode == FALLOC_FL_KEEP_SIZE) + return smb3_simple_falloc(file, tcon, off, len, true); + else if (mode == 0) + return smb3_simple_falloc(file, tcon, off, len, false); + + return -EOPNOTSUPP; +} + +static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + +static void +smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + oplock &= 0xFF; + if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) + return; + if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { + cinode->oplock = CIFS_CACHE_RHW_FLG; + cifs_dbg(FYI, "Batch Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { + cinode->oplock = CIFS_CACHE_RW_FLG; + cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else if (oplock == SMB2_OPLOCK_LEVEL_II) { + cinode->oplock = CIFS_CACHE_READ_FLG; + cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else + cinode->oplock = 0; +} + +static void +smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + char message[5] = {0}; + + oplock &= 0xFF; + if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) + return; + + cinode->oplock = 0; + if (oplock & SMB2_LEASE_READ_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_READ_FLG; + strcat(message, "R"); + } + if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_HANDLE_FLG; + strcat(message, "H"); + } + if (oplock & SMB2_LEASE_WRITE_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_WRITE_FLG; + strcat(message, "W"); + } + if (!cinode->oplock) + strcat(message, "None"); + cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, + &cinode->vfs_inode); +} + +static void +smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + unsigned int old_oplock = cinode->oplock; + + smb21_set_oplock_level(cinode, oplock, epoch, purge_cache); + + if (purge_cache) { + *purge_cache = false; + if (old_oplock == CIFS_CACHE_READ_FLG) { + if (cinode->oplock == CIFS_CACHE_READ_FLG && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RH_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RHW_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + else if (cinode->oplock == 0 && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + } else if (old_oplock == CIFS_CACHE_RH_FLG) { + if (cinode->oplock == CIFS_CACHE_RH_FLG && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RHW_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + } + cinode->epoch = epoch; + } +} + +static bool +smb2_is_read_op(__u32 oplock) +{ + return oplock == SMB2_OPLOCK_LEVEL_II; +} + +static bool +smb21_is_read_op(__u32 oplock) +{ + return (oplock & SMB2_LEASE_READ_CACHING_HE) && + !(oplock & SMB2_LEASE_WRITE_CACHING_HE); +} + +static __le32 +map_oplock_to_lease(u8 oplock) +{ + if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) + return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; + else if (oplock == SMB2_OPLOCK_LEVEL_II) + return SMB2_LEASE_READ_CACHING; + else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) + return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING | + SMB2_LEASE_WRITE_CACHING; + return 0; +} + +static char * +smb2_create_lease_buf(u8 *lease_key, u8 oplock) +{ + struct create_lease *buf; + + buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL); + if (!buf) + return NULL; + + buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); + buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_lease, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_lease, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */ + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + return (char *)buf; +} + +static char * +smb3_create_lease_buf(u8 *lease_key, u8 oplock) +{ + struct create_lease_v2 *buf; + + buf = kzalloc(sizeof(struct create_lease_v2), GFP_KERNEL); + if (!buf) + return NULL; + + buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); + buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_lease_v2, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_lease_v2, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */ + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + return (char *)buf; +} + +static __u8 +smb2_parse_lease_buf(void *buf, unsigned int *epoch) +{ + struct create_lease *lc = (struct create_lease *)buf; + + *epoch = 0; /* not used */ + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + return SMB2_OPLOCK_LEVEL_NOCHANGE; + return le32_to_cpu(lc->lcontext.LeaseState); +} + +static __u8 +smb3_parse_lease_buf(void *buf, unsigned int *epoch) +{ + struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; + + *epoch = le16_to_cpu(lc->lcontext.Epoch); + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + return SMB2_OPLOCK_LEVEL_NOCHANGE; + return le32_to_cpu(lc->lcontext.LeaseState); +} + +static unsigned int +smb2_wp_retry_size(struct inode *inode) +{ + return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize, + SMB2_MAX_BUFFER_SIZE); +} + +static bool +smb2_dir_needs_close(struct cifsFileInfo *cfile) +{ + return !cfile->invalidHandle; +} + +struct smb_version_operations smb20_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb2_qfs_tcon, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .calc_signature = smb2_calc_signature, + .is_read_op = smb2_is_read_op, + .set_oplock_level = smb2_set_oplock_level, + .create_lease_buf = smb2_create_lease_buf, + .parse_lease_buf = smb2_parse_lease_buf, + .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, +}; + +struct smb_version_operations smb21_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb2_qfs_tcon, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .calc_signature = smb2_calc_signature, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb21_set_oplock_level, + .create_lease_buf = smb2_create_lease_buf, + .parse_lease_buf = smb2_parse_lease_buf, + .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, +}; + +struct smb_version_operations smb30_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .dump_share_caps = smb2_dump_share_caps, + .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb3_qfs_tcon, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .generate_signingkey = generate_smb3signingkey, + .calc_signature = smb3_calc_signature, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb3_set_oplock_level, + .create_lease_buf = smb3_create_lease_buf, + .parse_lease_buf = smb3_parse_lease_buf, + .clone_range = smb2_clone_range, + .validate_negotiate = smb3_validate_negotiate, + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, + .fallocate = smb3_fallocate, +}; + +struct smb_version_values smb20_values = { + .version_string = SMB20_VERSION_STRING, + .protocol_id = SMB20_PROT_ID, + .req_capabilities = 0, /* MBZ */ + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease), +}; + +struct smb_version_values smb21_values = { + .version_string = SMB21_VERSION_STRING, + .protocol_id = SMB21_PROT_ID, + .req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */ + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease), +}; + +struct smb_version_values smb30_values = { + .version_string = SMB30_VERSION_STRING, + .protocol_id = SMB30_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; + +struct smb_version_values smb302_values = { + .version_string = SMB302_VERSION_STRING, + .protocol_id = SMB302_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; diff --git a/kernel/fs/cifs/smb2pdu.c b/kernel/fs/cifs/smb2pdu.c new file mode 100644 index 000000000..54cbe19d9 --- /dev/null +++ b/kernel/fs/cifs/smb2pdu.c @@ -0,0 +1,2661 @@ +/* + * fs/cifs/smb2pdu.c + * + * Copyright (C) International Business Machines Corp., 2009, 2013 + * Etersoft, 2012 + * Author(s): Steve French (sfrench@us.ibm.com) + * Pavel Shilovsky (pshilovsky@samba.org) 2012 + * + * Contains the routines for constructing the SMB2 PDUs themselves + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + /* SMB2 PDU handling routines here - except for leftovers (eg session setup) */ + /* Note that there are handle based routines which must be */ + /* treated slightly differently for reconnection purposes since we never */ + /* want to reuse a stale file handle and only the caller knows the file info */ + +#include +#include +#include +#include +#include +#include +#include +#include "smb2pdu.h" +#include "cifsglob.h" +#include "cifsacl.h" +#include "cifsproto.h" +#include "smb2proto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "ntlmssp.h" +#include "smb2status.h" +#include "smb2glob.h" +#include "cifspdu.h" + +/* + * The following table defines the expected "StructureSize" of SMB2 requests + * in order by SMB2 command. This is similar to "wct" in SMB/CIFS requests. + * + * Note that commands are defined in smb2pdu.h in le16 but the array below is + * indexed by command in host byte order. + */ +static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { + /* SMB2_NEGOTIATE */ 36, + /* SMB2_SESSION_SETUP */ 25, + /* SMB2_LOGOFF */ 4, + /* SMB2_TREE_CONNECT */ 9, + /* SMB2_TREE_DISCONNECT */ 4, + /* SMB2_CREATE */ 57, + /* SMB2_CLOSE */ 24, + /* SMB2_FLUSH */ 24, + /* SMB2_READ */ 49, + /* SMB2_WRITE */ 49, + /* SMB2_LOCK */ 48, + /* SMB2_IOCTL */ 57, + /* SMB2_CANCEL */ 4, + /* SMB2_ECHO */ 4, + /* SMB2_QUERY_DIRECTORY */ 33, + /* SMB2_CHANGE_NOTIFY */ 32, + /* SMB2_QUERY_INFO */ 41, + /* SMB2_SET_INFO */ 33, + /* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */ +}; + + +static void +smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , + const struct cifs_tcon *tcon) +{ + struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; + char *temp = (char *)hdr; + /* lookup word count ie StructureSize from table */ + __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_cmd)]; + + /* + * smaller than SMALL_BUFFER_SIZE but bigger than fixed area of + * largest operations (Create) + */ + memset(temp, 0, 256); + + /* Note this is only network field converted to big endian */ + hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr) + - 4 /* RFC 1001 length field itself not counted */); + + hdr->ProtocolId[0] = 0xFE; + hdr->ProtocolId[1] = 'S'; + hdr->ProtocolId[2] = 'M'; + hdr->ProtocolId[3] = 'B'; + hdr->StructureSize = cpu_to_le16(64); + hdr->Command = smb2_cmd; + hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ + hdr->ProcessId = cpu_to_le32((__u16)current->tgid); + + if (!tcon) + goto out; + + /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ + /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ + if ((tcon->ses) && (tcon->ses->server) && + (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + hdr->CreditCharge = cpu_to_le16(1); + /* else CreditCharge MBZ */ + + hdr->TreeId = tcon->tid; + /* Uid is not converted */ + if (tcon->ses) + hdr->SessionId = tcon->ses->Suid; + + /* + * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have + * to pass the path on the Open SMB prefixed by \\server\share. + * Not sure when we would need to do the augmented path (if ever) and + * setting this flag breaks the SMB2 open operation since it is + * illegal to send an empty path name (without \\server\share prefix) + * when the DFS flag is set in the SMB open header. We could + * consider setting the flag on all operations other than open + * but it is safer to net set it for now. + */ +/* if (tcon->share_flags & SHI1005_FLAGS_DFS) + hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */ + + if (tcon->ses && tcon->ses->server && tcon->ses->server->sign) + hdr->Flags |= SMB2_FLAGS_SIGNED; +out: + pdu->StructureSize2 = cpu_to_le16(parmsize); + return; +} + +static int +smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) +{ + int rc = 0; + struct nls_table *nls_codepage; + struct cifs_ses *ses; + struct TCP_Server_Info *server; + + /* + * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so + * check for tcp and smb session status done differently + * for those three - in the calling routine. + */ + if (tcon == NULL) + return rc; + + if (smb2_command == SMB2_TREE_CONNECT) + return rc; + + if (tcon->tidStatus == CifsExiting) { + /* + * only tree disconnect, open, and write, + * (and ulogoff which does not have tcon) + * are allowed as we start force umount. + */ + if ((smb2_command != SMB2_WRITE) && + (smb2_command != SMB2_CREATE) && + (smb2_command != SMB2_TREE_DISCONNECT)) { + cifs_dbg(FYI, "can not send cmd %d while umounting\n", + smb2_command); + return -ENODEV; + } + } + if ((!tcon->ses) || (tcon->ses->status == CifsExiting) || + (!tcon->ses->server)) + return -EIO; + + ses = tcon->ses; + server = ses->server; + + /* + * Give demultiplex thread up to 10 seconds to reconnect, should be + * greater than cifs socket timeout which is 7 seconds + */ + while (server->tcpStatus == CifsNeedReconnect) { + /* + * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE + * here since they are implicitly done when session drops. + */ + switch (smb2_command) { + /* + * BB Should we keep oplock break and add flush to exceptions? + */ + case SMB2_TREE_DISCONNECT: + case SMB2_CANCEL: + case SMB2_CLOSE: + case SMB2_OPLOCK_BREAK: + return -EAGAIN; + } + + wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), 10 * HZ); + + /* are we still trying to reconnect? */ + if (server->tcpStatus != CifsNeedReconnect) + break; + + /* + * on "soft" mounts we wait once. Hard mounts keep + * retrying until process is killed or server comes + * back on-line + */ + if (!tcon->retry) { + cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); + return -EHOSTDOWN; + } + } + + if (!tcon->ses->need_reconnect && !tcon->need_reconnect) + return rc; + + nls_codepage = load_nls_default(); + + /* + * need to prevent multiple threads trying to simultaneously reconnect + * the same SMB session + */ + mutex_lock(&tcon->ses->session_mutex); + rc = cifs_negotiate_protocol(0, tcon->ses); + if (!rc && tcon->ses->need_reconnect) + rc = cifs_setup_session(0, tcon->ses, nls_codepage); + + if (rc || !tcon->need_reconnect) { + mutex_unlock(&tcon->ses->session_mutex); + goto out; + } + + cifs_mark_open_files_invalid(tcon); + rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); + mutex_unlock(&tcon->ses->session_mutex); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); + if (rc) + goto out; + atomic_inc(&tconInfoReconnectCount); +out: + /* + * Check if handle based operation so we know whether we can continue + * or not without returning to caller to reset file handle. + */ + /* + * BB Is flush done by server on drop of tcp session? Should we special + * case it and skip above? + */ + switch (smb2_command) { + case SMB2_FLUSH: + case SMB2_READ: + case SMB2_WRITE: + case SMB2_LOCK: + case SMB2_IOCTL: + case SMB2_QUERY_DIRECTORY: + case SMB2_CHANGE_NOTIFY: + case SMB2_QUERY_INFO: + case SMB2_SET_INFO: + return -EAGAIN; + } + unload_nls(nls_codepage); + return rc; +} + +/* + * Allocate and return pointer to an SMB request hdr, and set basic + * SMB information in the SMB header. If the return code is zero, this + * function must have filled in request_buf pointer. + */ +static int +small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, + void **request_buf) +{ + int rc = 0; + + rc = smb2_reconnect(smb2_command, tcon); + if (rc) + return rc; + + /* BB eventually switch this to SMB2 specific small buf size */ + *request_buf = cifs_small_buf_get(); + if (*request_buf == NULL) { + /* BB should we add a retry in here if not a writepage? */ + return -ENOMEM; + } + + smb2_hdr_assemble((struct smb2_hdr *) *request_buf, smb2_command, tcon); + + if (tcon != NULL) { +#ifdef CONFIG_CIFS_STATS2 + uint16_t com_code = le16_to_cpu(smb2_command); + cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]); +#endif + cifs_stats_inc(&tcon->num_smbs_sent); + } + + return rc; +} + +/* + * + * SMB2 Worker functions follow: + * + * The general structure of the worker functions is: + * 1) Call smb2_init (assembles SMB2 header) + * 2) Initialize SMB2 command specific fields in fixed length area of SMB + * 3) Call smb_sendrcv2 (sends request on socket and waits for response) + * 4) Decode SMB2 command specific fields in the fixed length area + * 5) Decode variable length data area (if any for this SMB2 command type) + * 6) Call free smb buffer + * 7) return + * + */ + +int +SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) +{ + struct smb2_negotiate_req *req; + struct smb2_negotiate_rsp *rsp; + struct kvec iov[1]; + int rc = 0; + int resp_buftype; + struct TCP_Server_Info *server = ses->server; + int blob_offset, blob_length; + char *security_blob; + int flags = CIFS_NEG_OP; + + cifs_dbg(FYI, "Negotiate protocol\n"); + + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; + } + + rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req); + if (rc) + return rc; + + req->hdr.SessionId = 0; + + req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); + + req->DialectCount = cpu_to_le16(1); /* One vers= at a time for now */ + inc_rfc1001_len(req, 2); + + /* only one of SMB2 signing flags may be set in SMB2 request */ + if (ses->sign) + req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); + else if (global_secflags & CIFSSEC_MAY_SIGN) + req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); + else + req->SecurityMode = 0; + + req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities); + + /* ClientGUID must be zero for SMB2.02 dialect */ + if (ses->server->vals->protocol_id == SMB20_PROT_ID) + memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE); + else + memcpy(req->ClientGUID, server->client_guid, + SMB2_CLIENT_GUID_SIZE); + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags); + + rsp = (struct smb2_negotiate_rsp *)iov[0].iov_base; + /* + * No tcon so can't do + * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); + */ + if (rc != 0) + goto neg_exit; + + cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); + + /* BB we may eventually want to match the negotiated vs. requested + dialect, even though we are only requesting one at a time */ + if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) + cifs_dbg(FYI, "negotiated smb2.0 dialect\n"); + else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) + cifs_dbg(FYI, "negotiated smb2.1 dialect\n"); + else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID)) + cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); + else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID)) + cifs_dbg(FYI, "negotiated smb3.02 dialect\n"); + else { + cifs_dbg(VFS, "Illegal dialect returned by server %d\n", + le16_to_cpu(rsp->DialectRevision)); + rc = -EIO; + goto neg_exit; + } + server->dialect = le16_to_cpu(rsp->DialectRevision); + + /* SMB2 only has an extended negflavor */ + server->negflavor = CIFS_NEGFLAVOR_EXTENDED; + /* set it to the maximum buffer size value we can send with 1 credit */ + server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize), + SMB2_MAX_BUFFER_SIZE); + server->max_read = le32_to_cpu(rsp->MaxReadSize); + server->max_write = le32_to_cpu(rsp->MaxWriteSize); + /* BB Do we need to validate the SecurityMode? */ + server->sec_mode = le16_to_cpu(rsp->SecurityMode); + server->capabilities = le32_to_cpu(rsp->Capabilities); + /* Internal types */ + server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES; + + security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, + &rsp->hdr); + /* + * See MS-SMB2 section 2.2.4: if no blob, client picks default which + * for us will be + * ses->sectype = RawNTLMSSP; + * but for time being this is our only auth choice so doesn't matter. + * We just found a server which sets blob length to zero expecting raw. + */ + if (blob_length == 0) + cifs_dbg(FYI, "missing security blob on negprot\n"); + + rc = cifs_enable_signing(server, ses->sign); +#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */ + if (rc) + goto neg_exit; + if (blob_length) + rc = decode_negTokenInit(security_blob, blob_length, server); + if (rc == 1) + rc = 0; + else if (rc == 0) { + rc = -EIO; + goto neg_exit; + } +#endif + +neg_exit: + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc = 0; + struct validate_negotiate_info_req vneg_inbuf; + struct validate_negotiate_info_rsp *pneg_rsp; + u32 rsplen; + + cifs_dbg(FYI, "validate negotiate\n"); + + /* + * validation ioctl must be signed, so no point sending this if we + * can not sign it. We could eventually change this to selectively + * sign just this, the first and only signed request on a connection. + * This is good enough for now since a user who wants better security + * would also enable signing on the mount. Having validation of + * negotiate info for signed connections helps reduce attack vectors + */ + if (tcon->ses->server->sign == false) + return 0; /* validation requires signing */ + + vneg_inbuf.Capabilities = + cpu_to_le32(tcon->ses->server->vals->req_capabilities); + memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, + SMB2_CLIENT_GUID_SIZE); + + if (tcon->ses->sign) + vneg_inbuf.SecurityMode = + cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); + else if (global_secflags & CIFSSEC_MAY_SIGN) + vneg_inbuf.SecurityMode = + cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); + else + vneg_inbuf.SecurityMode = 0; + + vneg_inbuf.DialectCount = cpu_to_le16(1); + vneg_inbuf.Dialects[0] = + cpu_to_le16(tcon->ses->server->vals->protocol_id); + + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, + (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), + (char **)&pneg_rsp, &rsplen); + + if (rc != 0) { + cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); + return -EIO; + } + + if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { + cifs_dbg(VFS, "invalid size of protocol negotiate response\n"); + return -EIO; + } + + /* check validate negotiate info response matches what we got earlier */ + if (pneg_rsp->Dialect != + cpu_to_le16(tcon->ses->server->vals->protocol_id)) + goto vneg_out; + + if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode)) + goto vneg_out; + + /* do not validate server guid because not saved at negprot time yet */ + + if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND | + SMB2_LARGE_FILES) != tcon->ses->server->capabilities) + goto vneg_out; + + /* validate negotiate successful */ + cifs_dbg(FYI, "validate negotiate info successful\n"); + return 0; + +vneg_out: + cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n"); + return -EIO; +} + +int +SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + struct smb2_sess_setup_req *req; + struct smb2_sess_setup_rsp *rsp = NULL; + struct kvec iov[2]; + int rc = 0; + int resp_buftype = CIFS_NO_BUFFER; + __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ + struct TCP_Server_Info *server = ses->server; + u16 blob_length = 0; + char *security_blob; + char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + + cifs_dbg(FYI, "Session Setup\n"); + + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; + } + + /* + * If we are here due to reconnect, free per-smb session key + * in case signing was required. + */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + + /* + * If memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) + return -ENOMEM; + ses->ntlmssp->sesskey_per_smbsess = true; + + /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ + ses->sectype = RawNTLMSSP; + +ssetup_ntlmssp_authenticate: + if (phase == NtLmChallenge) + phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + + rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req); + if (rc) + return rc; + + req->hdr.SessionId = 0; /* First session, not a reauthenticate */ + req->VcNumber = 0; /* MBZ */ + /* to enable echos and oplocks */ + req->hdr.CreditRequest = cpu_to_le16(3); + + /* only one of SMB2 signing flags may be set in SMB2 request */ + if (server->sign) + req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED; + else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */ + req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED; + else + req->SecurityMode = 0; + + req->Capabilities = 0; + req->Channel = 0; /* MBZ */ + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field and 1 for pad */ + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + if (phase == NtLmNegotiate) { + ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), + GFP_KERNEL); + if (ntlmssp_blob == NULL) { + rc = -ENOMEM; + goto ssetup_exit; + } + build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); + if (use_spnego) { + /* blob_length = build_spnego_ntlmssp_blob( + &security_blob, + sizeof(struct _NEGOTIATE_MESSAGE), + ntlmssp_blob); */ + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + kfree(ntlmssp_blob); + goto ssetup_exit; + } else { + blob_length = sizeof(struct _NEGOTIATE_MESSAGE); + /* with raw NTLMSSP we don't encapsulate in SPNEGO */ + security_blob = ntlmssp_blob; + } + } else if (phase == NtLmAuthenticate) { + req->hdr.SessionId = ses->Suid; + ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500, + GFP_KERNEL); + if (ntlmssp_blob == NULL) { + rc = -ENOMEM; + goto ssetup_exit; + } + rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses, + nls_cp); + if (rc) { + cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", + rc); + goto ssetup_exit; /* BB double check error handling */ + } + if (use_spnego) { + /* blob_length = build_spnego_ntlmssp_blob( + &security_blob, + blob_length, + ntlmssp_blob); */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + kfree(ntlmssp_blob); + goto ssetup_exit; + } else { + security_blob = ntlmssp_blob; + } + } else { + cifs_dbg(VFS, "illegal ntlmssp phase\n"); + rc = -EIO; + goto ssetup_exit; + } + + /* Testing shows that buffer offset must be at location of Buffer[0] */ + req->SecurityBufferOffset = + cpu_to_le16(sizeof(struct smb2_sess_setup_req) - + 1 /* pad */ - 4 /* rfc1001 len */); + req->SecurityBufferLength = cpu_to_le16(blob_length); + iov[1].iov_base = security_blob; + iov[1].iov_len = blob_length; + + inc_rfc1001_len(req, blob_length - 1 /* pad */); + + /* BB add code to build os and lm fields */ + + rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, + CIFS_LOG_ERROR | CIFS_NEG_OP); + + kfree(security_blob); + rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; + if (resp_buftype != CIFS_NO_BUFFER && + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { + if (phase != NtLmNegotiate) { + cifs_dbg(VFS, "Unexpected more processing error\n"); + goto ssetup_exit; + } + if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != + le16_to_cpu(rsp->SecurityBufferOffset)) { + cifs_dbg(VFS, "Invalid security buffer offset %d\n", + le16_to_cpu(rsp->SecurityBufferOffset)); + rc = -EIO; + goto ssetup_exit; + } + + /* NTLMSSP Negotiate sent now processing challenge (response) */ + phase = NtLmChallenge; /* process ntlmssp challenge */ + rc = 0; /* MORE_PROCESSING is not an error here but expected */ + ses->Suid = rsp->hdr.SessionId; + rc = decode_ntlmssp_challenge(rsp->Buffer, + le16_to_cpu(rsp->SecurityBufferLength), ses); + } + + /* + * BB eventually add code for SPNEGO decoding of NtlmChallenge blob, + * but at least the raw NTLMSSP case works. + */ + /* + * No tcon so can't do + * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); + */ + if (rc != 0) + goto ssetup_exit; + + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); +ssetup_exit: + free_rsp_buf(resp_buftype, rsp); + + /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ + if ((phase == NtLmChallenge) && (rc == 0)) + goto ssetup_ntlmssp_authenticate; + + if (!rc) { + mutex_lock(&server->srv_mutex); + if (server->sign && server->ops->generate_signingkey) { + rc = server->ops->generate_signingkey(ses); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + if (rc) { + cifs_dbg(FYI, + "SMB3 session key generation failed\n"); + mutex_unlock(&server->srv_mutex); + goto keygen_exit; + } + } + if (!server->session_estab) { + server->sequence_number = 0x2; + server->session_estab = true; + } + mutex_unlock(&server->srv_mutex); + + cifs_dbg(FYI, "SMB2/3 session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + +keygen_exit: + if (!server->sign) { + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + } + kfree(ses->ntlmssp); + + return rc; +} + +int +SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) +{ + struct smb2_logoff_req *req; /* response is also trivial struct */ + int rc = 0; + struct TCP_Server_Info *server; + + cifs_dbg(FYI, "disconnect session %p\n", ses); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + /* no need to send SMB logoff if uid already closed due to reconnect */ + if (ses->need_reconnect) + goto smb2_session_already_dead; + + rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req); + if (rc) + return rc; + + /* since no tcon, smb2_init can not do this, so do here */ + req->hdr.SessionId = ses->Suid; + if (server->sign) + req->hdr.Flags |= SMB2_FLAGS_SIGNED; + + rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0); + /* + * No tcon so can't do + * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); + */ + +smb2_session_already_dead: + return rc; +} + +static inline void cifs_stats_fail_inc(struct cifs_tcon *tcon, uint16_t code) +{ + cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_failed[code]); +} + +#define MAX_SHARENAME_LENGTH (255 /* server */ + 80 /* share */ + 1 /* NULL */) + +/* These are similar values to what Windows uses */ +static inline void init_copy_chunk_defaults(struct cifs_tcon *tcon) +{ + tcon->max_chunks = 256; + tcon->max_bytes_chunk = 1048576; + tcon->max_bytes_copy = 16777216; +} + +int +SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, + struct cifs_tcon *tcon, const struct nls_table *cp) +{ + struct smb2_tree_connect_req *req; + struct smb2_tree_connect_rsp *rsp = NULL; + struct kvec iov[2]; + int rc = 0; + int resp_buftype; + int unc_path_len; + struct TCP_Server_Info *server; + __le16 *unc_path = NULL; + + cifs_dbg(FYI, "TCON\n"); + + if ((ses->server) && tree) + server = ses->server; + else + return -EIO; + + if (tcon && tcon->bad_network_name) + return -ENOENT; + + unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL); + if (unc_path == NULL) + return -ENOMEM; + + unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp) + 1; + unc_path_len *= 2; + if (unc_path_len < 2) { + kfree(unc_path); + return -EINVAL; + } + + rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req); + if (rc) { + kfree(unc_path); + return rc; + } + + if (tcon == NULL) { + /* since no tcon, smb2_init can not do this, so do here */ + req->hdr.SessionId = ses->Suid; + /* if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED) + req->hdr.Flags |= SMB2_FLAGS_SIGNED; */ + } + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field and 1 for pad */ + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + + /* Testing shows that buffer offset must be at location of Buffer[0] */ + req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req) + - 1 /* pad */ - 4 /* do not count rfc1001 len field */); + req->PathLength = cpu_to_le16(unc_path_len - 2); + iov[1].iov_base = unc_path; + iov[1].iov_len = unc_path_len; + + inc_rfc1001_len(req, unc_path_len - 1 /* pad */); + + rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0); + rsp = (struct smb2_tree_connect_rsp *)iov[0].iov_base; + + if (rc != 0) { + if (tcon) { + cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE); + tcon->need_reconnect = true; + } + goto tcon_error_exit; + } + + if (tcon == NULL) { + ses->ipc_tid = rsp->hdr.TreeId; + goto tcon_exit; + } + + if (rsp->ShareType & SMB2_SHARE_TYPE_DISK) + cifs_dbg(FYI, "connection to disk share\n"); + else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) { + tcon->ipc = true; + cifs_dbg(FYI, "connection to pipe share\n"); + } else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) { + tcon->print = true; + cifs_dbg(FYI, "connection to printer\n"); + } else { + cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType); + rc = -EOPNOTSUPP; + goto tcon_error_exit; + } + + tcon->share_flags = le32_to_cpu(rsp->ShareFlags); + tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ + tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); + tcon->tidStatus = CifsGood; + tcon->need_reconnect = false; + tcon->tid = rsp->hdr.TreeId; + strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); + + if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && + ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) + cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); + init_copy_chunk_defaults(tcon); + if (tcon->ses->server->ops->validate_negotiate) + rc = tcon->ses->server->ops->validate_negotiate(xid, tcon); +tcon_exit: + free_rsp_buf(resp_buftype, rsp); + kfree(unc_path); + return rc; + +tcon_error_exit: + if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { + cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); + if (tcon) + tcon->bad_network_name = true; + } + goto tcon_exit; +} + +int +SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) +{ + struct smb2_tree_disconnect_req *req; /* response is trivial */ + int rc = 0; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + + cifs_dbg(FYI, "Tree Disconnect\n"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) + return 0; + + rc = small_smb2_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req); + if (rc) + return rc; + + rc = SendReceiveNoRsp(xid, ses, (char *)&req->hdr, 0); + if (rc) + cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); + + return rc; +} + + +static struct create_durable * +create_durable_buf(void) +{ + struct create_durable *buf; + + buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL); + if (!buf) + return NULL; + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_durable, Data)); + buf->ccontext.DataLength = cpu_to_le32(16); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_durable, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DHnQ" */ + buf->Name[0] = 'D'; + buf->Name[1] = 'H'; + buf->Name[2] = 'n'; + buf->Name[3] = 'Q'; + return buf; +} + +static struct create_durable * +create_reconnect_durable_buf(struct cifs_fid *fid) +{ + struct create_durable *buf; + + buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL); + if (!buf) + return NULL; + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_durable, Data)); + buf->ccontext.DataLength = cpu_to_le32(16); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_durable, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + buf->Data.Fid.PersistentFileId = fid->persistent_fid; + buf->Data.Fid.VolatileFileId = fid->volatile_fid; + /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT is "DHnC" */ + buf->Name[0] = 'D'; + buf->Name[1] = 'H'; + buf->Name[2] = 'n'; + buf->Name[3] = 'C'; + return buf; +} + +static __u8 +parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, + unsigned int *epoch) +{ + char *data_offset; + struct create_context *cc; + unsigned int next = 0; + char *name; + + data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset); + cc = (struct create_context *)data_offset; + do { + cc = (struct create_context *)((char *)cc + next); + name = le16_to_cpu(cc->NameOffset) + (char *)cc; + if (le16_to_cpu(cc->NameLength) != 4 || + strncmp(name, "RqLs", 4)) { + next = le32_to_cpu(cc->Next); + continue; + } + return server->ops->parse_lease_buf(cc, epoch); + } while (next != 0); + + return 0; +} + +static int +add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, + unsigned int *num_iovec, __u8 *oplock) +{ + struct smb2_create_req *req = iov[0].iov_base; + unsigned int num = *num_iovec; + + iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock); + if (iov[num].iov_base == NULL) + return -ENOMEM; + iov[num].iov_len = server->vals->create_lease_size; + req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; + if (!req->CreateContextsOffset) + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) - 4 + + iov[num - 1].iov_len); + le32_add_cpu(&req->CreateContextsLength, + server->vals->create_lease_size); + inc_rfc1001_len(&req->hdr, server->vals->create_lease_size); + *num_iovec = num + 1; + return 0; +} + +static int +add_durable_context(struct kvec *iov, unsigned int *num_iovec, + struct cifs_open_parms *oparms) +{ + struct smb2_create_req *req = iov[0].iov_base; + unsigned int num = *num_iovec; + + if (oparms->reconnect) { + iov[num].iov_base = create_reconnect_durable_buf(oparms->fid); + /* indicate that we don't need to relock the file */ + oparms->reconnect = false; + } else + iov[num].iov_base = create_durable_buf(); + if (iov[num].iov_base == NULL) + return -ENOMEM; + iov[num].iov_len = sizeof(struct create_durable); + if (!req->CreateContextsOffset) + req->CreateContextsOffset = + cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + iov[1].iov_len); + le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable)); + inc_rfc1001_len(&req->hdr, sizeof(struct create_durable)); + *num_iovec = num + 1; + return 0; +} + +int +SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, + __u8 *oplock, struct smb2_file_all_info *buf, + struct smb2_err_rsp **err_buf) +{ + struct smb2_create_req *req; + struct smb2_create_rsp *rsp; + struct TCP_Server_Info *server; + struct cifs_tcon *tcon = oparms->tcon; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[4]; + int resp_buftype; + int uni_path_len; + __le16 *copy_path = NULL; + int copy_size; + int rc = 0; + unsigned int num_iovecs = 2; + __u32 file_attributes = 0; + char *dhc_buf = NULL, *lc_buf = NULL; + + cifs_dbg(FYI, "create/open\n"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_CREATE, tcon, (void **) &req); + if (rc) + return rc; + + if (oparms->create_options & CREATE_OPTION_READONLY) + file_attributes |= ATTR_READONLY; + if (oparms->create_options & CREATE_OPTION_SPECIAL) + file_attributes |= ATTR_SYSTEM; + + req->ImpersonationLevel = IL_IMPERSONATION; + req->DesiredAccess = cpu_to_le32(oparms->desired_access); + /* File attributes ignored on open (used in create though) */ + req->FileAttributes = cpu_to_le32(file_attributes); + req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(oparms->disposition); + req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); + uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; + /* do not count rfc1001 len field */ + req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4); + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + /* MUST set path len (NameLength) to 0 opening root of share */ + req->NameLength = cpu_to_le16(uni_path_len - 2); + /* -1 since last byte is buf[0] which is sent below (path) */ + iov[0].iov_len--; + if (uni_path_len % 8 != 0) { + copy_size = uni_path_len / 8 * 8; + if (copy_size < uni_path_len) + copy_size += 8; + + copy_path = kzalloc(copy_size, GFP_KERNEL); + if (!copy_path) + return -ENOMEM; + memcpy((char *)copy_path, (const char *)path, + uni_path_len); + uni_path_len = copy_size; + path = copy_path; + } + + iov[1].iov_len = uni_path_len; + iov[1].iov_base = path; + /* -1 since last byte is buf[0] which was counted in smb2_buf_len */ + inc_rfc1001_len(req, uni_path_len - 1); + + if (!server->oplocks) + *oplock = SMB2_OPLOCK_LEVEL_NONE; + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) || + *oplock == SMB2_OPLOCK_LEVEL_NONE) + req->RequestedOplockLevel = *oplock; + else { + rc = add_lease_context(server, iov, &num_iovecs, oplock); + if (rc) { + cifs_small_buf_release(req); + kfree(copy_path); + return rc; + } + lc_buf = iov[num_iovecs-1].iov_base; + } + + if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { + /* need to set Next field of lease context if we request it */ + if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) { + struct create_context *ccontext = + (struct create_context *)iov[num_iovecs-1].iov_base; + ccontext->Next = + cpu_to_le32(server->vals->create_lease_size); + } + rc = add_durable_context(iov, &num_iovecs, oparms); + if (rc) { + cifs_small_buf_release(req); + kfree(copy_path); + kfree(lc_buf); + return rc; + } + dhc_buf = iov[num_iovecs-1].iov_base; + } + + rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); + rsp = (struct smb2_create_rsp *)iov[0].iov_base; + + if (rc != 0) { + cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); + if (err_buf) + *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4, + GFP_KERNEL); + goto creat_exit; + } + + oparms->fid->persistent_fid = rsp->PersistentFileId; + oparms->fid->volatile_fid = rsp->VolatileFileId; + + if (buf) { + memcpy(buf, &rsp->CreationTime, 32); + buf->AllocationSize = rsp->AllocationSize; + buf->EndOfFile = rsp->EndofFile; + buf->Attributes = rsp->FileAttributes; + buf->NumberOfLinks = cpu_to_le32(1); + buf->DeletePending = 0; + } + + if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) + *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch); + else + *oplock = rsp->OplockLevel; +creat_exit: + kfree(copy_path); + kfree(lc_buf); + kfree(dhc_buf); + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +/* + * SMB2 IOCTL is used for both IOCTLs and FSCTLs + */ +int +SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, + u32 indatalen, char **out_data, u32 *plen /* returned data len */) +{ + struct smb2_ioctl_req *req; + struct smb2_ioctl_rsp *rsp; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct kvec iov[2]; + int resp_buftype; + int num_iovecs; + int rc = 0; + + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + + if (tcon) + ses = tcon->ses; + else + return -EIO; + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req); + if (rc) + return rc; + + req->CtlCode = cpu_to_le32(opcode); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + if (indatalen) { + req->InputCount = cpu_to_le32(indatalen); + /* do not set InputOffset if no input data */ + req->InputOffset = + cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4); + iov[1].iov_base = in_data; + iov[1].iov_len = indatalen; + num_iovecs = 2; + } else + num_iovecs = 1; + + req->OutputOffset = 0; + req->OutputCount = 0; /* MBZ */ + + /* + * Could increase MaxOutputResponse, but that would require more + * than one credit. Windows typically sets this smaller, but for some + * ioctls it may be useful to allow server to send more. No point + * limiting what the server can send as long as fits in one credit + */ + req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */ + + if (is_fsctl) + req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); + else + req->Flags = 0; + + iov[0].iov_base = (char *)req; + + /* + * If no input data, the size of ioctl struct in + * protocol spec still includes a 1 byte data buffer, + * but if input data passed to ioctl, we do not + * want to double count this, so we do not send + * the dummy one byte of data in iovec[0] if sending + * input data (in iovec[1]). We also must add 4 bytes + * in first iovec to allow for rfc1002 length field. + */ + + if (indatalen) { + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + inc_rfc1001_len(req, indatalen - 1); + } else + iov[0].iov_len = get_rfc1002_length(req) + 4; + + + rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); + rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; + + if ((rc != 0) && (rc != -EINVAL)) { + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + goto ioctl_exit; + } else if (rc == -EINVAL) { + if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) && + (opcode != FSCTL_SRV_COPYCHUNK)) { + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + goto ioctl_exit; + } + } + + /* check if caller wants to look at return data or just return rc */ + if ((plen == NULL) || (out_data == NULL)) + goto ioctl_exit; + + *plen = le32_to_cpu(rsp->OutputCount); + + /* We check for obvious errors in the output buffer length and offset */ + if (*plen == 0) + goto ioctl_exit; /* server returned no data */ + else if (*plen > 0xFF00) { + cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen); + *plen = 0; + rc = -EIO; + goto ioctl_exit; + } + + if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) { + cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen, + le32_to_cpu(rsp->OutputOffset)); + *plen = 0; + rc = -EIO; + goto ioctl_exit; + } + + *out_data = kmalloc(*plen, GFP_KERNEL); + if (*out_data == NULL) { + rc = -ENOMEM; + goto ioctl_exit; + } + + memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), + *plen); +ioctl_exit: + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +/* + * Individual callers to ioctl worker function follow + */ + +int +SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + int rc; + struct compress_ioctl fsctl_input; + char *ret_data = NULL; + + fsctl_input.CompressionState = + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + + rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, + FSCTL_SET_COMPRESSION, true /* is_fsctl */, + (char *)&fsctl_input /* data input */, + 2 /* in data len */, &ret_data /* out data */, NULL); + + cifs_dbg(FYI, "set compression rc %d\n", rc); + + return rc; +} + +int +SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + struct smb2_close_req *req; + struct smb2_close_rsp *rsp; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[1]; + int resp_buftype; + int rc = 0; + + cifs_dbg(FYI, "Close\n"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_CLOSE, tcon, (void **) &req); + if (rc) + return rc; + + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); + rsp = (struct smb2_close_rsp *)iov[0].iov_base; + + if (rc != 0) { + cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); + goto close_exit; + } + + /* BB FIXME - decode close response, update inode for caching */ + +close_exit: + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +static int +validate_buf(unsigned int offset, unsigned int buffer_length, + struct smb2_hdr *hdr, unsigned int min_buf_size) + +{ + unsigned int smb_len = be32_to_cpu(hdr->smb2_buf_length); + char *end_of_smb = smb_len + 4 /* RFC1001 length field */ + (char *)hdr; + char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr; + char *end_of_buf = begin_of_buf + buffer_length; + + + if (buffer_length < min_buf_size) { + cifs_dbg(VFS, "buffer length %d smaller than minimum size %d\n", + buffer_length, min_buf_size); + return -EINVAL; + } + + /* check if beyond RFC1001 maximum length */ + if ((smb_len > 0x7FFFFF) || (buffer_length > 0x7FFFFF)) { + cifs_dbg(VFS, "buffer length %d or smb length %d too large\n", + buffer_length, smb_len); + return -EINVAL; + } + + if ((begin_of_buf > end_of_smb) || (end_of_buf > end_of_smb)) { + cifs_dbg(VFS, "illegal server response, bad offset to data\n"); + return -EINVAL; + } + + return 0; +} + +/* + * If SMB buffer fields are valid, copy into temporary buffer to hold result. + * Caller must free buffer. + */ +static int +validate_and_copy_buf(unsigned int offset, unsigned int buffer_length, + struct smb2_hdr *hdr, unsigned int minbufsize, + char *data) + +{ + char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr; + int rc; + + if (!data) + return -EINVAL; + + rc = validate_buf(offset, buffer_length, hdr, minbufsize); + if (rc) + return rc; + + memcpy(data, begin_of_buf, buffer_length); + + return 0; +} + +static int +query_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u8 info_class, + size_t output_len, size_t min_len, void *data) +{ + struct smb2_query_info_req *req; + struct smb2_query_info_rsp *rsp = NULL; + struct kvec iov[2]; + int rc = 0; + int resp_buftype; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + + cifs_dbg(FYI, "Query Info\n"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req); + if (rc) + return rc; + + req->InfoType = SMB2_O_INFO_FILE; + req->FileInfoClass = info_class; + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + /* 4 for rfc1002 length field and 1 for Buffer */ + req->InputBufferOffset = + cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); + req->OutputBufferLength = cpu_to_le32(output_len); + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); + rsp = (struct smb2_query_info_rsp *)iov[0].iov_base; + + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); + goto qinf_exit; + } + + rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), + &rsp->hdr, min_len, data); + +qinf_exit: + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +int +SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_all_info *data) +{ + return query_info(xid, tcon, persistent_fid, volatile_fid, + FILE_ALL_INFORMATION, + sizeof(struct smb2_file_all_info) + PATH_MAX * 2, + sizeof(struct smb2_file_all_info), data); +} + +int +SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid) +{ + return query_info(xid, tcon, persistent_fid, volatile_fid, + FILE_INTERNAL_INFORMATION, + sizeof(struct smb2_file_internal_info), + sizeof(struct smb2_file_internal_info), uniqueid); +} + +/* + * This is a no-op for now. We're not really interested in the reply, but + * rather in the fact that the server sent one and that server->lstrp + * gets updated. + * + * FIXME: maybe we should consider checking that the reply matches request? + */ +static void +smb2_echo_callback(struct mid_q_entry *mid) +{ + struct TCP_Server_Info *server = mid->callback_data; + struct smb2_echo_rsp *smb2 = (struct smb2_echo_rsp *)mid->resp_buf; + unsigned int credits_received = 1; + + if (mid->mid_state == MID_RESPONSE_RECEIVED) + credits_received = le16_to_cpu(smb2->hdr.CreditRequest); + + DeleteMidQEntry(mid); + add_credits(server, credits_received, CIFS_ECHO_OP); +} + +int +SMB2_echo(struct TCP_Server_Info *server) +{ + struct smb2_echo_req *req; + int rc = 0; + struct kvec iov; + struct smb_rqst rqst = { .rq_iov = &iov, + .rq_nvec = 1 }; + + cifs_dbg(FYI, "In echo request\n"); + + rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); + if (rc) + return rc; + + req->hdr.CreditRequest = cpu_to_le16(1); + + iov.iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov.iov_len = get_rfc1002_length(req) + 4; + + rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server, + CIFS_ECHO_OP); + if (rc) + cifs_dbg(FYI, "Echo request failed: %d\n", rc); + + cifs_small_buf_release(req); + return rc; +} + +int +SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid) +{ + struct smb2_flush_req *req; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[1]; + int resp_buftype; + int rc = 0; + + cifs_dbg(FYI, "Flush\n"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req); + if (rc) + return rc; + + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); + + if (rc != 0) + cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); + + free_rsp_buf(resp_buftype, iov[0].iov_base); + return rc; +} + +/* + * To form a chain of read requests, any read requests after the first should + * have the end_of_chain boolean set to true. + */ +static int +smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms, + unsigned int remaining_bytes, int request_type) +{ + int rc = -EACCES; + struct smb2_read_req *req = NULL; + + rc = small_smb2_init(SMB2_READ, io_parms->tcon, (void **) &req); + if (rc) + return rc; + if (io_parms->tcon->ses->server == NULL) + return -ECONNABORTED; + + req->hdr.ProcessId = cpu_to_le32(io_parms->pid); + + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; + req->ReadChannelInfoOffset = 0; /* reserved */ + req->ReadChannelInfoLength = 0; /* reserved */ + req->Channel = 0; /* reserved */ + req->MinimumCount = 0; + req->Length = cpu_to_le32(io_parms->length); + req->Offset = cpu_to_le64(io_parms->offset); + + if (request_type & CHAINED_REQUEST) { + if (!(request_type & END_OF_CHAIN)) { + /* 4 for rfc1002 length field */ + req->hdr.NextCommand = + cpu_to_le32(get_rfc1002_length(req) + 4); + } else /* END_OF_CHAIN */ + req->hdr.NextCommand = 0; + if (request_type & RELATED_REQUEST) { + req->hdr.Flags |= SMB2_FLAGS_RELATED_OPERATIONS; + /* + * Related requests use info from previous read request + * in chain. + */ + req->hdr.SessionId = 0xFFFFFFFF; + req->hdr.TreeId = 0xFFFFFFFF; + req->PersistentFileId = 0xFFFFFFFF; + req->VolatileFileId = 0xFFFFFFFF; + } + } + if (remaining_bytes > io_parms->length) + req->RemainingBytes = cpu_to_le32(remaining_bytes); + else + req->RemainingBytes = 0; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + return rc; +} + +static void +smb2_readv_callback(struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + struct smb2_hdr *buf = (struct smb2_hdr *)rdata->iov.iov_base; + unsigned int credits_received = 1; + struct smb_rqst rqst = { .rq_iov = &rdata->iov, + .rq_nvec = 1, + .rq_pages = rdata->pages, + .rq_npages = rdata->nr_pages, + .rq_pagesz = rdata->pagesz, + .rq_tailsz = rdata->tailsz }; + + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", + __func__, mid->mid, mid->mid_state, rdata->result, + rdata->bytes); + + switch (mid->mid_state) { + case MID_RESPONSE_RECEIVED: + credits_received = le16_to_cpu(buf->CreditRequest); + /* result already set, check signature */ + if (server->sign) { + int rc; + + rc = smb2_verify_signature(&rqst, server); + if (rc) + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); + } + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); + break; + default: + if (rdata->result != -ENODATA) + rdata->result = -EIO; + } + + if (rdata->result) + cifs_stats_fail_inc(tcon, SMB2_READ_HE); + + queue_work(cifsiod_wq, &rdata->work); + DeleteMidQEntry(mid); + add_credits(server, credits_received, 0); +} + +/* smb2_async_readv - send an async write, and set up mid to handle result */ +int +smb2_async_readv(struct cifs_readdata *rdata) +{ + int rc, flags = 0; + struct smb2_hdr *buf; + struct cifs_io_parms io_parms; + struct smb_rqst rqst = { .rq_iov = &rdata->iov, + .rq_nvec = 1 }; + struct TCP_Server_Info *server; + + cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", + __func__, rdata->offset, rdata->bytes); + + io_parms.tcon = tlink_tcon(rdata->cfile->tlink); + io_parms.offset = rdata->offset; + io_parms.length = rdata->bytes; + io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; + io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; + io_parms.pid = rdata->pid; + + server = io_parms.tcon->ses->server; + + rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); + if (rc) { + if (rc == -EAGAIN && rdata->credits) { + /* credits was reset by reconnect */ + rdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } + return rc; + } + + buf = (struct smb2_hdr *)rdata->iov.iov_base; + /* 4 for rfc1002 length field */ + rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4; + + if (rdata->credits) { + buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += rdata->credits - + le16_to_cpu(buf->CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + + kref_get(&rdata->refcount); + rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, + cifs_readv_receive, smb2_readv_callback, + rdata, flags); + if (rc) { + kref_put(&rdata->refcount, cifs_readdata_release); + cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); + } + + cifs_small_buf_release(buf); + return rc; +} + +int +SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, char **buf, int *buf_type) +{ + int resp_buftype, rc = -EACCES; + struct smb2_read_rsp *rsp = NULL; + struct kvec iov[1]; + + *nbytes = 0; + rc = smb2_new_read_req(iov, io_parms, 0, 0); + if (rc) + return rc; + + rc = SendReceive2(xid, io_parms->tcon->ses, iov, 1, + &resp_buftype, CIFS_LOG_ERROR); + + rsp = (struct smb2_read_rsp *)iov[0].iov_base; + + if (rsp->hdr.Status == STATUS_END_OF_FILE) { + free_rsp_buf(resp_buftype, iov[0].iov_base); + return 0; + } + + if (rc) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); + cifs_dbg(VFS, "Send error in read = %d\n", rc); + } else { + *nbytes = le32_to_cpu(rsp->DataLength); + if ((*nbytes > CIFS_MAX_MSGSIZE) || + (*nbytes > io_parms->length)) { + cifs_dbg(FYI, "bad length %d for count %d\n", + *nbytes, io_parms->length); + rc = -EIO; + *nbytes = 0; + } + } + + if (*buf) { + memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset, + *nbytes); + free_rsp_buf(resp_buftype, iov[0].iov_base); + } else if (resp_buftype != CIFS_NO_BUFFER) { + *buf = iov[0].iov_base; + if (resp_buftype == CIFS_SMALL_BUFFER) + *buf_type = CIFS_SMALL_BUFFER; + else if (resp_buftype == CIFS_LARGE_BUFFER) + *buf_type = CIFS_LARGE_BUFFER; + } + return rc; +} + +/* + * Check the mid_state and signature on received buffer (if any), and queue the + * workqueue completion task. + */ +static void +smb2_writev_callback(struct mid_q_entry *mid) +{ + struct cifs_writedata *wdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + unsigned int written; + struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; + unsigned int credits_received = 1; + + switch (mid->mid_state) { + case MID_RESPONSE_RECEIVED: + credits_received = le16_to_cpu(rsp->hdr.CreditRequest); + wdata->result = smb2_check_receive(mid, tcon->ses->server, 0); + if (wdata->result != 0) + break; + + written = le32_to_cpu(rsp->DataLength); + /* + * Mask off high 16 bits when bytes written as returned + * by the server is greater than bytes requested by the + * client. OS/2 servers are known to set incorrect + * CountHigh values. + */ + if (written > wdata->bytes) + written &= 0xFFFF; + + if (written < wdata->bytes) + wdata->result = -ENOSPC; + else + wdata->bytes = written; + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + wdata->result = -EAGAIN; + break; + default: + wdata->result = -EIO; + break; + } + + if (wdata->result) + cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + + queue_work(cifsiod_wq, &wdata->work); + DeleteMidQEntry(mid); + add_credits(tcon->ses->server, credits_received, 0); +} + +/* smb2_async_writev - send an async write, and set up mid to handle result */ +int +smb2_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)) +{ + int rc = -EACCES, flags = 0; + struct smb2_write_req *req = NULL; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + struct kvec iov; + struct smb_rqst rqst; + + rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); + if (rc) { + if (rc == -EAGAIN && wdata->credits) { + /* credits was reset by reconnect */ + wdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } + goto async_writev_out; + } + + req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid); + + req->PersistentFileId = wdata->cfile->fid.persistent_fid; + req->VolatileFileId = wdata->cfile->fid.volatile_fid; + req->WriteChannelInfoOffset = 0; + req->WriteChannelInfoLength = 0; + req->Channel = 0; + req->Offset = cpu_to_le64(wdata->offset); + /* 4 for rfc1002 length field */ + req->DataOffset = cpu_to_le16( + offsetof(struct smb2_write_req, Buffer) - 4); + req->RemainingBytes = 0; + + /* 4 for rfc1002 length field and 1 for Buffer */ + iov.iov_len = get_rfc1002_length(req) + 4 - 1; + iov.iov_base = req; + + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + rqst.rq_pages = wdata->pages; + rqst.rq_npages = wdata->nr_pages; + rqst.rq_pagesz = wdata->pagesz; + rqst.rq_tailsz = wdata->tailsz; + + cifs_dbg(FYI, "async write at %llu %u bytes\n", + wdata->offset, wdata->bytes); + + req->Length = cpu_to_le32(wdata->bytes); + + inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); + + if (wdata->credits) { + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += wdata->credits - + le16_to_cpu(req->hdr.CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + + kref_get(&wdata->refcount); + rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata, + flags); + + if (rc) { + kref_put(&wdata->refcount, release); + cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + } + +async_writev_out: + cifs_small_buf_release(req); + return rc; +} + +/* + * SMB2_write function gets iov pointer to kvec array with n_vec as a length. + * The length field from io_parms must be at least 1 and indicates a number of + * elements with data to write that begins with position 1 in iov array. All + * data length is specified by count. + */ +int +SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, struct kvec *iov, int n_vec) +{ + int rc = 0; + struct smb2_write_req *req = NULL; + struct smb2_write_rsp *rsp = NULL; + int resp_buftype; + *nbytes = 0; + + if (n_vec < 1) + return rc; + + rc = small_smb2_init(SMB2_WRITE, io_parms->tcon, (void **) &req); + if (rc) + return rc; + + if (io_parms->tcon->ses->server == NULL) + return -ECONNABORTED; + + req->hdr.ProcessId = cpu_to_le32(io_parms->pid); + + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; + req->WriteChannelInfoOffset = 0; + req->WriteChannelInfoLength = 0; + req->Channel = 0; + req->Length = cpu_to_le32(io_parms->length); + req->Offset = cpu_to_le64(io_parms->offset); + /* 4 for rfc1002 length field */ + req->DataOffset = cpu_to_le16( + offsetof(struct smb2_write_req, Buffer) - 4); + req->RemainingBytes = 0; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field and 1 for Buffer */ + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + + /* length of entire message including data to be written */ + inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */); + + rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1, + &resp_buftype, 0); + rsp = (struct smb2_write_rsp *)iov[0].iov_base; + + if (rc) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE); + cifs_dbg(VFS, "Send error in write = %d\n", rc); + } else + *nbytes = le32_to_cpu(rsp->DataLength); + + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +static unsigned int +num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) +{ + int len; + unsigned int entrycount = 0; + unsigned int next_offset = 0; + FILE_DIRECTORY_INFO *entryptr; + + if (bufstart == NULL) + return 0; + + entryptr = (FILE_DIRECTORY_INFO *)bufstart; + + while (1) { + entryptr = (FILE_DIRECTORY_INFO *) + ((char *)entryptr + next_offset); + + if ((char *)entryptr + size > end_of_buf) { + cifs_dbg(VFS, "malformed search entry would overflow\n"); + break; + } + + len = le32_to_cpu(entryptr->FileNameLength); + if ((char *)entryptr + len + size > end_of_buf) { + cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n", + end_of_buf); + break; + } + + *lastentry = (char *)entryptr; + entrycount++; + + next_offset = le32_to_cpu(entryptr->NextEntryOffset); + if (!next_offset) + break; + } + + return entrycount; +} + +/* + * Readdir/FindFirst + */ +int +SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int index, + struct cifs_search_info *srch_inf) +{ + struct smb2_query_directory_req *req; + struct smb2_query_directory_rsp *rsp = NULL; + struct kvec iov[2]; + int rc = 0; + int len; + int resp_buftype = CIFS_NO_BUFFER; + unsigned char *bufptr; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + __le16 asteriks = cpu_to_le16('*'); + char *end_of_smb; + unsigned int output_size = CIFSMaxBufSize; + size_t info_buf_size; + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req); + if (rc) + return rc; + + switch (srch_inf->info_level) { + case SMB_FIND_FILE_DIRECTORY_INFO: + req->FileInformationClass = FILE_DIRECTORY_INFORMATION; + info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION; + info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; + break; + default: + cifs_dbg(VFS, "info level %u isn't supported\n", + srch_inf->info_level); + rc = -EINVAL; + goto qdir_exit; + } + + req->FileIndex = cpu_to_le32(index); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + len = 0x2; + bufptr = req->Buffer; + memcpy(bufptr, &asteriks, len); + + req->FileNameOffset = + cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1 - 4); + req->FileNameLength = cpu_to_le16(len); + /* + * BB could be 30 bytes or so longer if we used SMB2 specific + * buffer lengths, but this is safe and close enough. + */ + output_size = min_t(unsigned int, output_size, server->maxBuf); + output_size = min_t(unsigned int, output_size, 2 << 15); + req->OutputBufferLength = cpu_to_le32(output_size); + + iov[0].iov_base = (char *)req; + /* 4 for RFC1001 length and 1 for Buffer */ + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + + iov[1].iov_base = (char *)(req->Buffer); + iov[1].iov_len = len; + + inc_rfc1001_len(req, len - 1 /* Buffer */); + + rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0); + rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; + + if (rc) { + if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) { + srch_inf->endOfSearch = true; + rc = 0; + } + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + goto qdir_exit; + } + + rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, + info_buf_size); + if (rc) + goto qdir_exit; + + srch_inf->unicode = true; + + if (srch_inf->ntwrk_buf_start) { + if (srch_inf->smallBuf) + cifs_small_buf_release(srch_inf->ntwrk_buf_start); + else + cifs_buf_release(srch_inf->ntwrk_buf_start); + } + srch_inf->ntwrk_buf_start = (char *)rsp; + srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ + + (char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset); + /* 4 for rfc1002 length field */ + end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr; + srch_inf->entries_in_buffer = + num_entries(srch_inf->srch_entries_start, end_of_smb, + &srch_inf->last_entry, info_buf_size); + srch_inf->index_of_last_entry += srch_inf->entries_in_buffer; + cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n", + srch_inf->entries_in_buffer, srch_inf->index_of_last_entry, + srch_inf->srch_entries_start, srch_inf->last_entry); + if (resp_buftype == CIFS_LARGE_BUFFER) + srch_inf->smallBuf = false; + else if (resp_buftype == CIFS_SMALL_BUFFER) + srch_inf->smallBuf = true; + else + cifs_dbg(VFS, "illegal search buffer type\n"); + + return rc; + +qdir_exit: + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +static int +send_set_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 pid, int info_class, + unsigned int num, void **data, unsigned int *size) +{ + struct smb2_set_info_req *req; + struct smb2_set_info_rsp *rsp = NULL; + struct kvec *iov; + int rc = 0; + int resp_buftype; + unsigned int i; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + if (!num) + return -EINVAL; + + iov = kmalloc(sizeof(struct kvec) * num, GFP_KERNEL); + if (!iov) + return -ENOMEM; + + rc = small_smb2_init(SMB2_SET_INFO, tcon, (void **) &req); + if (rc) { + kfree(iov); + return rc; + } + + req->hdr.ProcessId = cpu_to_le32(pid); + + req->InfoType = SMB2_O_INFO_FILE; + req->FileInfoClass = info_class; + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + /* 4 for RFC1001 length and 1 for Buffer */ + req->BufferOffset = + cpu_to_le16(sizeof(struct smb2_set_info_req) - 1 - 4); + req->BufferLength = cpu_to_le32(*size); + + inc_rfc1001_len(req, *size - 1 /* Buffer */); + + memcpy(req->Buffer, *data, *size); + + iov[0].iov_base = (char *)req; + /* 4 for RFC1001 length */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + for (i = 1; i < num; i++) { + inc_rfc1001_len(req, size[i]); + le32_add_cpu(&req->BufferLength, size[i]); + iov[i].iov_base = (char *)data[i]; + iov[i].iov_len = size[i]; + } + + rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0); + rsp = (struct smb2_set_info_rsp *)iov[0].iov_base; + + if (rc != 0) + cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE); + + free_rsp_buf(resp_buftype, rsp); + kfree(iov); + return rc; +} + +int +SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, __le16 *target_file) +{ + struct smb2_file_rename_info info; + void **data; + unsigned int size[2]; + int rc; + int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); + + data = kmalloc(sizeof(void *) * 2, GFP_KERNEL); + if (!data) + return -ENOMEM; + + info.ReplaceIfExists = 1; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */ + info.FileNameLength = cpu_to_le32(len); + + data[0] = &info; + size[0] = sizeof(struct smb2_file_rename_info); + + data[1] = target_file; + size[1] = len + 2 /* null */; + + rc = send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_RENAME_INFORMATION, 2, data, + size); + kfree(data); + return rc; +} + +int +SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, __le16 *target_file) +{ + struct smb2_file_link_info info; + void **data; + unsigned int size[2]; + int rc; + int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); + + data = kmalloc(sizeof(void *) * 2, GFP_KERNEL); + if (!data) + return -ENOMEM; + + info.ReplaceIfExists = 0; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */ + info.FileNameLength = cpu_to_le32(len); + + data[0] = &info; + size[0] = sizeof(struct smb2_file_link_info); + + data[1] = target_file; + size[1] = len + 2 /* null */; + + rc = send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_LINK_INFORMATION, 2, data, size); + kfree(data); + return rc; +} + +int +SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc) +{ + struct smb2_file_eof_info info; + void *data; + unsigned int size; + + info.EndOfFile = *eof; + + data = &info; + size = sizeof(struct smb2_file_eof_info); + + if (is_falloc) + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size); + else + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size); +} + +int +SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf) +{ + unsigned int size; + size = sizeof(FILE_BASIC_INFO); + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_BASIC_INFORMATION, 1, + (void **)&buf, &size); +} + +int +SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, + const u64 persistent_fid, const u64 volatile_fid, + __u8 oplock_level) +{ + int rc; + struct smb2_oplock_break *req = NULL; + + cifs_dbg(FYI, "SMB2_oplock_break\n"); + rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); + + if (rc) + return rc; + + req->VolatileFid = volatile_fid; + req->PersistentFid = persistent_fid; + req->OplockLevel = oplock_level; + req->hdr.CreditRequest = cpu_to_le16(1); + + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP); + /* SMB2 buffer freed by function above */ + + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); + cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc); + } + + return rc; +} + +static void +copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf, + struct kstatfs *kst) +{ + kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) * + le32_to_cpu(pfs_inf->SectorsPerAllocationUnit); + kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits); + kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits); + kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); + return; +} + +static int +build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, + int outbuf_len, u64 persistent_fid, u64 volatile_fid) +{ + int rc; + struct smb2_query_info_req *req; + + cifs_dbg(FYI, "Query FSInfo level %d\n", level); + + if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) + return -EIO; + + rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req); + if (rc) + return rc; + + req->InfoType = SMB2_O_INFO_FILESYSTEM; + req->FileInfoClass = level; + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + /* 4 for rfc1002 length field and 1 for pad */ + req->InputBufferOffset = + cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); + req->OutputBufferLength = cpu_to_le32( + outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4); + + iov->iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov->iov_len = get_rfc1002_length(req) + 4; + return 0; +} + +int +SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) +{ + struct smb2_query_info_rsp *rsp = NULL; + struct kvec iov; + int rc = 0; + int resp_buftype; + struct cifs_ses *ses = tcon->ses; + struct smb2_fs_full_size_info *info = NULL; + + rc = build_qfs_info_req(&iov, tcon, FS_FULL_SIZE_INFORMATION, + sizeof(struct smb2_fs_full_size_info), + persistent_fid, volatile_fid); + if (rc) + return rc; + + rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); + goto qfsinf_exit; + } + rsp = (struct smb2_query_info_rsp *)iov.iov_base; + + info = (struct smb2_fs_full_size_info *)(4 /* RFC1001 len */ + + le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr); + rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, + sizeof(struct smb2_fs_full_size_info)); + if (!rc) + copy_fs_info_to_kstatfs(info, fsdata); + +qfsinf_exit: + free_rsp_buf(resp_buftype, iov.iov_base); + return rc; +} + +int +SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int level) +{ + struct smb2_query_info_rsp *rsp = NULL; + struct kvec iov; + int rc = 0; + int resp_buftype, max_len, min_len; + struct cifs_ses *ses = tcon->ses; + unsigned int rsp_len, offset; + + if (level == FS_DEVICE_INFORMATION) { + max_len = sizeof(FILE_SYSTEM_DEVICE_INFO); + min_len = sizeof(FILE_SYSTEM_DEVICE_INFO); + } else if (level == FS_ATTRIBUTE_INFORMATION) { + max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO); + min_len = MIN_FS_ATTR_INFO_SIZE; + } else if (level == FS_SECTOR_SIZE_INFORMATION) { + max_len = sizeof(struct smb3_fs_ss_info); + min_len = sizeof(struct smb3_fs_ss_info); + } else { + cifs_dbg(FYI, "Invalid qfsinfo level %d\n", level); + return -EINVAL; + } + + rc = build_qfs_info_req(&iov, tcon, level, max_len, + persistent_fid, volatile_fid); + if (rc) + return rc; + + rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); + goto qfsattr_exit; + } + rsp = (struct smb2_query_info_rsp *)iov.iov_base; + + rsp_len = le32_to_cpu(rsp->OutputBufferLength); + offset = le16_to_cpu(rsp->OutputBufferOffset); + rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len); + if (rc) + goto qfsattr_exit; + + if (level == FS_ATTRIBUTE_INFORMATION) + memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset + + (char *)&rsp->hdr, min_t(unsigned int, + rsp_len, max_len)); + else if (level == FS_DEVICE_INFORMATION) + memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset + + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO)); + else if (level == FS_SECTOR_SIZE_INFORMATION) { + struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *) + (4 /* RFC1001 len */ + offset + (char *)&rsp->hdr); + tcon->ss_flags = le32_to_cpu(ss_info->Flags); + tcon->perf_sector_size = + le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf); + } + +qfsattr_exit: + free_rsp_buf(resp_buftype, iov.iov_base); + return rc; +} + +int +smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, + const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, + const __u32 num_lock, struct smb2_lock_element *buf) +{ + int rc = 0; + struct smb2_lock_req *req = NULL; + struct kvec iov[2]; + int resp_buf_type; + unsigned int count; + + cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); + + rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req); + if (rc) + return rc; + + req->hdr.ProcessId = cpu_to_le32(pid); + req->LockCount = cpu_to_le16(num_lock); + + req->PersistentFileId = persist_fid; + req->VolatileFileId = volatile_fid; + + count = num_lock * sizeof(struct smb2_lock_element); + inc_rfc1001_len(req, count - sizeof(struct smb2_lock_element)); + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field and count for all locks */ + iov[0].iov_len = get_rfc1002_length(req) + 4 - count; + iov[1].iov_base = (char *)buf; + iov[1].iov_len = count; + + cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + if (rc) { + cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc); + cifs_stats_fail_inc(tcon, SMB2_LOCK_HE); + } + + return rc; +} + +int +SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon, + const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, + const __u64 length, const __u64 offset, const __u32 lock_flags, + const bool wait) +{ + struct smb2_lock_element lock; + + lock.Offset = cpu_to_le64(offset); + lock.Length = cpu_to_le64(length); + lock.Flags = cpu_to_le32(lock_flags); + if (!wait && lock_flags != SMB2_LOCKFLAG_UNLOCK) + lock.Flags |= cpu_to_le32(SMB2_LOCKFLAG_FAIL_IMMEDIATELY); + + return smb2_lockv(xid, tcon, persist_fid, volatile_fid, pid, 1, &lock); +} + +int +SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, + __u8 *lease_key, const __le32 lease_state) +{ + int rc; + struct smb2_lease_ack *req = NULL; + + cifs_dbg(FYI, "SMB2_lease_break\n"); + rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); + + if (rc) + return rc; + + req->hdr.CreditRequest = cpu_to_le16(1); + req->StructureSize = cpu_to_le16(36); + inc_rfc1001_len(req, 12); + + memcpy(req->LeaseKey, lease_key, 16); + req->LeaseState = lease_state; + + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP); + /* SMB2 buffer freed by function above */ + + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); + cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc); + } + + return rc; +} diff --git a/kernel/fs/cifs/smb2pdu.h b/kernel/fs/cifs/smb2pdu.h new file mode 100644 index 000000000..70867d54f --- /dev/null +++ b/kernel/fs/cifs/smb2pdu.h @@ -0,0 +1,1080 @@ +/* + * fs/cifs/smb2pdu.h + * + * Copyright (c) International Business Machines Corp., 2009, 2013 + * Etersoft, 2012 + * Author(s): Steve French (sfrench@us.ibm.com) + * Pavel Shilovsky (pshilovsky@samba.org) 2012 + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _SMB2PDU_H +#define _SMB2PDU_H + +#include + +/* + * Note that, due to trying to use names similar to the protocol specifications, + * there are many mixed case field names in the structures below. Although + * this does not match typical Linux kernel style, it is necessary to be + * be able to match against the protocol specfication. + * + * SMB2 commands + * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses + * (ie no useful data other than the SMB error code itself) and are marked such. + * Knowing this helps avoid response buffer allocations and copy in some cases. + */ + +/* List of commands in host endian */ +#define SMB2_NEGOTIATE_HE 0x0000 +#define SMB2_SESSION_SETUP_HE 0x0001 +#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */ +#define SMB2_TREE_CONNECT_HE 0x0003 +#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */ +#define SMB2_CREATE_HE 0x0005 +#define SMB2_CLOSE_HE 0x0006 +#define SMB2_FLUSH_HE 0x0007 /* trivial resp */ +#define SMB2_READ_HE 0x0008 +#define SMB2_WRITE_HE 0x0009 +#define SMB2_LOCK_HE 0x000A +#define SMB2_IOCTL_HE 0x000B +#define SMB2_CANCEL_HE 0x000C +#define SMB2_ECHO_HE 0x000D +#define SMB2_QUERY_DIRECTORY_HE 0x000E +#define SMB2_CHANGE_NOTIFY_HE 0x000F +#define SMB2_QUERY_INFO_HE 0x0010 +#define SMB2_SET_INFO_HE 0x0011 +#define SMB2_OPLOCK_BREAK_HE 0x0012 + +/* The same list in little endian */ +#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) +#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE) +#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE) +#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE) +#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE) +#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE) +#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE) +#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE) +#define SMB2_READ cpu_to_le16(SMB2_READ_HE) +#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE) +#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE) +#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE) +#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE) +#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE) +#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE) +#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE) +#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE) +#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) +#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) + +#define NUMBER_OF_SMB2_COMMANDS 0x0013 + +/* BB FIXME - analyze following length BB */ +#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ + +#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) + +/* + * SMB2 Header Definition + * + * "MBZ" : Must be Zero + * "BB" : BugBug, Something to check/review/analyze later + * "PDU" : "Protocol Data Unit" (ie a network "frame") + * + */ + +#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) + +struct smb2_hdr { + __be32 smb2_buf_length; /* big endian on wire */ + /* length is only two or three bytes - with + one or two byte type preceding it that MBZ */ + __u8 ProtocolId[4]; /* 0xFE 'S' 'M' 'B' */ + __le16 StructureSize; /* 64 */ + __le16 CreditCharge; /* MBZ */ + __le32 Status; /* Error from server */ + __le16 Command; + __le16 CreditRequest; /* CreditResponse */ + __le32 Flags; + __le32 NextCommand; + __le64 MessageId; + __le32 ProcessId; + __u32 TreeId; /* opaque - so do not make little endian */ + __u64 SessionId; /* opaque - so do not make little endian */ + __u8 Signature[16]; +} __packed; + +struct smb2_pdu { + struct smb2_hdr hdr; + __le16 StructureSize2; /* size of wct area (varies, request specific) */ +} __packed; + +struct smb2_transform_hdr { + __be32 smb2_buf_length; /* big endian on wire */ + /* length is only two or three bytes - with + one or two byte type preceding it that MBZ */ + __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */ + __u8 Signature[16]; + __u8 Nonce[11]; + __u8 Reserved[5]; + __le32 OriginalMessageSize; + __u16 Reserved1; + __le16 EncryptionAlgorithm; + __u64 SessionId; +} __packed; + +/* Encryption Algorithms */ +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) + +/* + * SMB2 flag definitions + */ +#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) +#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) +#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) +#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) + +/* + * Definitions for SMB2 Protocol Data Units (network frames) + * + * See MS-SMB2.PDF specification for protocol details. + * The Naming convention is the lower case version of the SMB2 + * command code name for the struct. Note that structures must be packed. + * + */ + +#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) + +struct smb2_err_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Reserved; /* MBZ */ + __le32 ByteCount; /* even if zero, at least one byte follows */ + __u8 ErrorData[1]; /* variable length */ +} __packed; + +struct smb2_symlink_err_rsp { + __le32 SymLinkLength; + __le32 SymLinkErrorTag; + __le32 ReparseTag; + __le16 ReparseDataLength; + __le16 UnparsedPathLength; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + __u8 PathBuffer[0]; +} __packed; + +#define SMB2_CLIENT_GUID_SIZE 16 + +struct smb2_negotiate_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 DialectCount; + __le16 SecurityMode; + __le16 Reserved; /* MBZ */ + __le32 Capabilities; + __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; + __le64 ClientStartTime; /* MBZ */ + __le16 Dialects[1]; /* One dialect (vers=) at a time for now */ +} __packed; + +/* Dialects */ +#define SMB20_PROT_ID 0x0202 +#define SMB21_PROT_ID 0x0210 +#define SMB30_PROT_ID 0x0300 +#define SMB302_PROT_ID 0x0302 +#define BAD_PROT_ID 0xFFFF + +/* SecurityMode flags */ +#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 +#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 +/* Capabilities flags */ +#define SMB2_GLOBAL_CAP_DFS 0x00000001 +#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ +/* Internal types */ +#define SMB2_NT_FIND 0x00100000 +#define SMB2_LARGE_FILES 0x00200000 + +struct smb2_negotiate_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 65 */ + __le16 SecurityMode; + __le16 DialectRevision; + __le16 Reserved; /* MBZ */ + __u8 ServerGUID[16]; + __le32 Capabilities; + __le32 MaxTransactSize; + __le32 MaxReadSize; + __le32 MaxWriteSize; + __le64 SystemTime; /* MBZ */ + __le64 ServerStartTime; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le32 Reserved2; /* may be any value, ignore */ + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +struct smb2_sess_setup_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 25 */ + __u8 VcNumber; + __u8 SecurityMode; + __le32 Capabilities; + __le32 Channel; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le64 PreviousSessionId; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +/* Currently defined SessionFlags */ +#define SMB2_SESSION_FLAG_IS_GUEST 0x0001 +#define SMB2_SESSION_FLAG_IS_NULL 0x0002 +#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 +struct smb2_sess_setup_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 SessionFlags; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +struct smb2_logoff_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_logoff_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_tree_connect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 Reserved; + __le16 PathOffset; + __le16 PathLength; + __u8 Buffer[1]; /* variable length */ +} __packed; + +struct smb2_tree_connect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 16 */ + __u8 ShareType; /* see below */ + __u8 Reserved; + __le32 ShareFlags; /* see below */ + __le32 Capabilities; /* see below */ + __le32 MaximalAccess; +} __packed; + +/* Possible ShareType values */ +#define SMB2_SHARE_TYPE_DISK 0x01 +#define SMB2_SHARE_TYPE_PIPE 0x02 +#define SMB2_SHARE_TYPE_PRINT 0x03 + +/* + * Possible ShareFlags - exactly one and only one of the first 4 caching flags + * must be set (any of the remaining, SHI1005, flags may be set individually + * or in combination. + */ +#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000 +#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010 +#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020 +#define SMB2_SHAREFLAG_NO_CACHING 0x00000030 +#define SHI1005_FLAGS_DFS 0x00000001 +#define SHI1005_FLAGS_DFS_ROOT 0x00000002 +#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 +#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 +#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 +#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 +#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 +#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 +#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 +#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 +#define SHI1005_FLAGS_ALL 0x0000FF33 + +/* Possible share capabilities */ +#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ +#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ +#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ +#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ +#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ + +struct smb2_tree_disconnect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_tree_disconnect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +/* File Attrubutes */ +#define FILE_ATTRIBUTE_READONLY 0x00000001 +#define FILE_ATTRIBUTE_HIDDEN 0x00000002 +#define FILE_ATTRIBUTE_SYSTEM 0x00000004 +#define FILE_ATTRIBUTE_DIRECTORY 0x00000010 +#define FILE_ATTRIBUTE_ARCHIVE 0x00000020 +#define FILE_ATTRIBUTE_NORMAL 0x00000080 +#define FILE_ATTRIBUTE_TEMPORARY 0x00000100 +#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200 +#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400 +#define FILE_ATTRIBUTE_COMPRESSED 0x00000800 +#define FILE_ATTRIBUTE_OFFLINE 0x00001000 +#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000 +#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000 +#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000 +#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000 + +/* Oplock levels */ +#define SMB2_OPLOCK_LEVEL_NONE 0x00 +#define SMB2_OPLOCK_LEVEL_II 0x01 +#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 +#define SMB2_OPLOCK_LEVEL_BATCH 0x09 +#define SMB2_OPLOCK_LEVEL_LEASE 0xFF +/* Non-spec internal type */ +#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 + +/* Desired Access Flags */ +#define FILE_READ_DATA_LE cpu_to_le32(0x00000001) +#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002) +#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004) +#define FILE_READ_EA_LE cpu_to_le32(0x00000008) +#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010) +#define FILE_EXECUTE_LE cpu_to_le32(0x00000020) +#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080) +#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100) +#define FILE_DELETE_LE cpu_to_le32(0x00010000) +#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000) +#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000) +#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000) +#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000) +#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000) +#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000) +#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000) +#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000) +#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000) +#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000) + +/* ShareAccess Flags */ +#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001) +#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002) +#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004) +#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007) + +/* CreateDisposition Flags */ +#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000) +#define FILE_OPEN_LE cpu_to_le32(0x00000001) +#define FILE_CREATE_LE cpu_to_le32(0x00000002) +#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003) +#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004) +#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005) + +/* CreateOptions Flags */ +#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001) +/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */ +#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) +#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) +#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008) +#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) +#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE cpu_to_le32(0x00000020) +#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) +#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) +#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) +#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) +#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) +#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) +#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) +#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) +#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000) +#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) +#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) +#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000) + +#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ + | FILE_READ_ATTRIBUTES_LE) +#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \ + | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE) +#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE) + +/* Impersonation Levels */ +#define IL_ANONYMOUS cpu_to_le32(0x00000000) +#define IL_IDENTIFICATION cpu_to_le32(0x00000001) +#define IL_IMPERSONATION cpu_to_le32(0x00000002) +#define IL_DELEGATE cpu_to_le32(0x00000003) + +/* Create Context Values */ +#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */ +#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" +#define SMB2_CREATE_ALLOCATION_SIZE "AISi" +#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" +#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" +#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" +#define SMB2_CREATE_REQUEST_LEASE "RqLs" +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" +#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 +#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9 + +struct smb2_create_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u8 SecurityFlags; + __u8 RequestedOplockLevel; + __le32 ImpersonationLevel; + __le64 SmbCreateFlags; + __le64 Reserved; + __le32 DesiredAccess; + __le32 FileAttributes; + __le32 ShareAccess; + __le32 CreateDisposition; + __le32 CreateOptions; + __le16 NameOffset; + __le16 NameLength; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[0]; +} __packed; + +struct smb2_create_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 89 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; + __le64 EndofFile; + __le32 FileAttributes; + __le32 Reserved2; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[1]; +} __packed; + +struct create_context { + __le32 Next; + __le16 NameOffset; + __le16 NameLength; + __le16 Reserved; + __le16 DataOffset; + __le32 DataLength; + __u8 Buffer[0]; +} __packed; + +#define SMB2_LEASE_READ_CACHING_HE 0x01 +#define SMB2_LEASE_HANDLE_CACHING_HE 0x02 +#define SMB2_LEASE_WRITE_CACHING_HE 0x04 + +#define SMB2_LEASE_NONE cpu_to_le32(0x00) +#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01) +#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02) +#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04) + +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02) + +#define SMB2_LEASE_KEY_SIZE 16 + +struct lease_context { + __le64 LeaseKeyLow; + __le64 LeaseKeyHigh; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; +} __packed; + +struct lease_context_v2 { + __le64 LeaseKeyLow; + __le64 LeaseKeyHigh; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; + __le64 ParentLeaseKeyLow; + __le64 ParentLeaseKeyHigh; + __le16 Epoch; + __le16 Reserved; +} __packed; + +struct create_lease { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context lcontext; +} __packed; + +struct create_lease_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context_v2 lcontext; + __u8 Pad[4]; +} __packed; + +struct create_durable { + struct create_context ccontext; + __u8 Name[8]; + union { + __u8 Reserved[16]; + struct { + __u64 PersistentFileId; + __u64 VolatileFileId; + } Fid; + } Data; +} __packed; + +#define COPY_CHUNK_RES_KEY_SIZE 24 +struct resume_key_req { + char ResumeKey[COPY_CHUNK_RES_KEY_SIZE]; + __le32 ContextLength; /* MBZ */ + char Context[0]; /* ignored, Windows sets to 4 bytes of zero */ +} __packed; + +/* this goes in the ioctl buffer when doing a copychunk request */ +struct copychunk_ioctl { + char SourceKey[COPY_CHUNK_RES_KEY_SIZE]; + __le32 ChunkCount; /* we are only sending 1 */ + __le32 Reserved; + /* array will only be one chunk long for us */ + __le64 SourceOffset; + __le64 TargetOffset; + __le32 Length; /* how many bytes to copy */ + __u32 Reserved2; +} __packed; + +/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */ +struct file_zero_data_information { + __le64 FileOffset; + __le64 BeyondFinalZero; +} __packed; + +struct copychunk_ioctl_rsp { + __le32 ChunksWritten; + __le32 ChunkBytesWritten; + __le32 TotalBytesWritten; +} __packed; + +struct validate_negotiate_info_req { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 DialectCount; + __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */ +} __packed; + +struct validate_negotiate_info_rsp { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 Dialect; /* Dialect in use for the connection */ +} __packed; + +#define RSS_CAPABLE 0x00000001 +#define RDMA_CAPABLE 0x00000002 + +struct network_interface_info_ioctl_rsp { + __le32 Next; /* next interface. zero if this is last one */ + __le32 IfIndex; + __le32 Capability; /* RSS or RDMA Capable */ + __le32 Reserved; + __le64 LinkSpeed; + char SockAddr_Storage[128]; +} __packed; + +#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */ + +struct compress_ioctl { + __le16 CompressionState; /* See cifspdu.h for possible flag values */ +} __packed; + +struct smb2_ioctl_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u16 Reserved; + __le32 CtlCode; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 InputOffset; + __le32 InputCount; + __le32 MaxInputResponse; + __le32 OutputOffset; + __le32 OutputCount; + __le32 MaxOutputResponse; + __le32 Flags; + __u32 Reserved2; + __u8 Buffer[0]; +} __packed; + +struct smb2_ioctl_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u16 Reserved; + __le32 CtlCode; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 InputOffset; + __le32 InputCount; + __le32 OutputOffset; + __le32 OutputCount; + __le32 Flags; + __u32 Reserved2; + /* char * buffer[] */ +} __packed; + +/* Currently defined values for close flags */ +#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) +struct smb2_close_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Flags; + __le32 Reserved; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ +} __packed; + +struct smb2_close_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* 60 */ + __le16 Flags; + __le32 Reserved; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; + __le32 Attributes; +} __packed; + +struct smb2_flush_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Reserved1; + __le32 Reserved2; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ +} __packed; + +struct smb2_flush_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Reserved; +} __packed; + +/* For read request Flags field below, following flag is defined for SMB3.02 */ +#define SMB2_READFLAG_READ_UNBUFFERED 0x01 + +/* Channel field for read and write: exactly one of following flags can be set*/ +#define SMB2_CHANNEL_NONE 0x00000000 +#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */ +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */ + +struct smb2_read_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __u8 Padding; /* offset from start of SMB2 header to place read */ + __u8 Flags; /* MBZ unless SMB3.02 or later */ + __le32 Length; + __le64 Offset; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 MinimumCount; + __le32 Channel; /* MBZ except for SMB3 or later */ + __le32 RemainingBytes; + __le16 ReadChannelInfoOffset; /* Reserved MBZ */ + __le16 ReadChannelInfoLength; /* Reserved MBZ */ + __u8 Buffer[1]; +} __packed; + +struct smb2_read_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + +/* For write request Flags field below the following flags are defined: */ +#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ +#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ + +struct smb2_write_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 DataOffset; /* offset from start of SMB2 header to write data */ + __le32 Length; + __le64 Offset; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 Channel; /* Reserved MBZ */ + __le32 RemainingBytes; + __le16 WriteChannelInfoOffset; /* Reserved MBZ */ + __le16 WriteChannelInfoLength; /* Reserved MBZ */ + __le32 Flags; + __u8 Buffer[1]; +} __packed; + +struct smb2_write_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + +#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001 +#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002 +#define SMB2_LOCKFLAG_UNLOCK 0x0004 +#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 + +struct smb2_lock_element { + __le64 Offset; + __le64 Length; + __le32 Flags; + __le32 Reserved; +} __packed; + +struct smb2_lock_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 48 */ + __le16 LockCount; + __le32 Reserved; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + /* Followed by at least one */ + struct smb2_lock_element locks[1]; +} __packed; + +struct smb2_lock_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_echo_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __u16 Reserved; +} __packed; + +struct smb2_echo_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __u16 Reserved; +} __packed; + +/* search (query_directory) Flags field */ +#define SMB2_RESTART_SCANS 0x01 +#define SMB2_RETURN_SINGLE_ENTRY 0x02 +#define SMB2_INDEX_SPECIFIED 0x04 +#define SMB2_REOPEN 0x10 + +struct smb2_query_directory_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 FileInformationClass; + __u8 Flags; + __le32 FileIndex; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le16 FileNameOffset; + __le16 FileNameLength; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +struct smb2_query_directory_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +/* Possible InfoType values */ +#define SMB2_O_INFO_FILE 0x01 +#define SMB2_O_INFO_FILESYSTEM 0x02 +#define SMB2_O_INFO_SECURITY 0x03 +#define SMB2_O_INFO_QUOTA 0x04 + +/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */ +#define OWNER_SECINFO 0x00000001 +#define GROUP_SECINFO 0x00000002 +#define DACL_SECINFO 0x00000004 +#define SACL_SECINFO 0x00000008 +#define LABEL_SECINFO 0x00000010 +#define ATTRIBUTE_SECINFO 0x00000020 +#define SCOPE_SECINFO 0x00000040 +#define BACKUP_SECINFO 0x00010000 +#define UNPROTECTED_SACL_SECINFO 0x10000000 +#define UNPROTECTED_DACL_SECINFO 0x20000000 +#define PROTECTED_SACL_SECINFO 0x40000000 +#define PROTECTED_DACL_SECINFO 0x80000000 + +/* Flags used for FileFullEAinfo */ +#define SL_RESTART_SCAN 0x00000001 +#define SL_RETURN_SINGLE_ENTRY 0x00000002 +#define SL_INDEX_SPECIFIED 0x00000004 + +struct smb2_query_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 41 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 OutputBufferLength; + __le16 InputBufferOffset; + __u16 Reserved; + __le32 InputBufferLength; + __le32 AdditionalInformation; + __le32 Flags; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __u8 Buffer[1]; +} __packed; + +struct smb2_query_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +struct smb2_set_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 BufferLength; + __le16 BufferOffset; + __u16 Reserved; + __le32 AdditionalInformation; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __u8 Buffer[1]; +} __packed; + +struct smb2_set_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 2 */ +} __packed; + +struct smb2_oplock_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 Reserved2; + __u64 PersistentFid; + __u64 VolatileFid; +} __packed; + +#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) + +struct smb2_lease_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 44 */ + __le16 Reserved; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 CurrentLeaseState; + __le32 NewLeaseState; + __le32 BreakReason; + __le32 AccessMaskHint; + __le32 ShareMaskHint; +} __packed; + +struct smb2_lease_ack { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 Reserved; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 LeaseState; + __le64 LeaseDuration; +} __packed; + +/* + * PDU infolevel structure definitions + * BB consider moving to a different header + */ + +/* File System Information Classes */ +#define FS_VOLUME_INFORMATION 1 /* Query */ +#define FS_LABEL_INFORMATION 2 /* Local only */ +#define FS_SIZE_INFORMATION 3 /* Query */ +#define FS_DEVICE_INFORMATION 4 /* Query */ +#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ +#define FS_CONTROL_INFORMATION 6 /* Query, Set */ +#define FS_FULL_SIZE_INFORMATION 7 /* Query */ +#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ +#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */ +#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */ +#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ + +struct smb2_fs_full_size_info { + __le64 TotalAllocationUnits; + __le64 CallerAvailableAllocationUnits; + __le64 ActualAvailableAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __packed; + +#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 +#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 +#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 +#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 + +/* sector size info struct */ +struct smb3_fs_ss_info { + __le32 LogicalBytesPerSector; + __le32 PhysicalBytesPerSectorForAtomicity; + __le32 PhysicalBytesPerSectorForPerf; + __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity; + __le32 Flags; + __le32 ByteOffsetForSectorAlignment; + __le32 ByteOffsetForPartitionAlignment; +} __packed; + +/* partial list of QUERY INFO levels */ +#define FILE_DIRECTORY_INFORMATION 1 +#define FILE_FULL_DIRECTORY_INFORMATION 2 +#define FILE_BOTH_DIRECTORY_INFORMATION 3 +#define FILE_BASIC_INFORMATION 4 +#define FILE_STANDARD_INFORMATION 5 +#define FILE_INTERNAL_INFORMATION 6 +#define FILE_EA_INFORMATION 7 +#define FILE_ACCESS_INFORMATION 8 +#define FILE_NAME_INFORMATION 9 +#define FILE_RENAME_INFORMATION 10 +#define FILE_LINK_INFORMATION 11 +#define FILE_NAMES_INFORMATION 12 +#define FILE_DISPOSITION_INFORMATION 13 +#define FILE_POSITION_INFORMATION 14 +#define FILE_FULL_EA_INFORMATION 15 +#define FILE_MODE_INFORMATION 16 +#define FILE_ALIGNMENT_INFORMATION 17 +#define FILE_ALL_INFORMATION 18 +#define FILE_ALLOCATION_INFORMATION 19 +#define FILE_END_OF_FILE_INFORMATION 20 +#define FILE_ALTERNATE_NAME_INFORMATION 21 +#define FILE_STREAM_INFORMATION 22 +#define FILE_PIPE_INFORMATION 23 +#define FILE_PIPE_LOCAL_INFORMATION 24 +#define FILE_PIPE_REMOTE_INFORMATION 25 +#define FILE_MAILSLOT_QUERY_INFORMATION 26 +#define FILE_MAILSLOT_SET_INFORMATION 27 +#define FILE_COMPRESSION_INFORMATION 28 +#define FILE_OBJECT_ID_INFORMATION 29 +/* Number 30 not defined in documents */ +#define FILE_MOVE_CLUSTER_INFORMATION 31 +#define FILE_QUOTA_INFORMATION 32 +#define FILE_REPARSE_POINT_INFORMATION 33 +#define FILE_NETWORK_OPEN_INFORMATION 34 +#define FILE_ATTRIBUTE_TAG_INFORMATION 35 +#define FILE_TRACKING_INFORMATION 36 +#define FILEID_BOTH_DIRECTORY_INFORMATION 37 +#define FILEID_FULL_DIRECTORY_INFORMATION 38 +#define FILE_VALID_DATA_LENGTH_INFORMATION 39 +#define FILE_SHORT_NAME_INFORMATION 40 +#define FILE_SFIO_RESERVE_INFORMATION 44 +#define FILE_SFIO_VOLUME_INFORMATION 45 +#define FILE_HARD_LINK_INFORMATION 46 +#define FILE_NORMALIZED_NAME_INFORMATION 48 +#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 +#define FILE_STANDARD_LINK_INFORMATION 54 + +struct smb2_file_internal_info { + __le64 IndexNumber; +} __packed; /* level 6 Query */ + +struct smb2_file_rename_info { /* encoding of request for level 10 */ + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[0]; /* New name to be assigned */ +} __packed; /* level 10 Set */ + +struct smb2_file_link_info { /* encoding of request for level 11 */ + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[0]; /* Name to be assigned to new link */ +} __packed; /* level 11 Set */ + +/* + * This level 18, although with struct with same name is different from cifs + * level 0x107. Level 0x107 has an extra u64 between AccessFlags and + * CurrentByteOffset. + */ +struct smb2_file_all_info { /* data block encoding of response to level 18 */ + __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */ + __le64 IndexNumber; + __le32 EASize; + __le32 AccessFlags; + __le64 CurrentByteOffset; + __le32 Mode; + __le32 AlignmentRequirement; + __le32 FileNameLength; + char FileName[1]; +} __packed; /* level 18 Query */ + +struct smb2_file_eof_info { /* encoding of request for level 10 */ + __le64 EndOfFile; /* new end of file value */ +} __packed; /* level 20 Set */ + +#endif /* _SMB2PDU_H */ diff --git a/kernel/fs/cifs/smb2proto.h b/kernel/fs/cifs/smb2proto.h new file mode 100644 index 000000000..79dc650c1 --- /dev/null +++ b/kernel/fs/cifs/smb2proto.h @@ -0,0 +1,174 @@ +/* + * fs/cifs/smb2proto.h + * + * Copyright (c) International Business Machines Corp., 2002, 2011 + * Etersoft, 2012 + * Author(s): Steve French (sfrench@us.ibm.com) + * Pavel Shilovsky (pshilovsky@samba.org) 2012 + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _SMB2PROTO_H +#define _SMB2PROTO_H +#include +#include + +struct statfs; +struct smb_rqst; + +/* + ***************************************************************** + * All Prototypes + ***************************************************************** + */ +extern int map_smb2_to_linux_error(char *buf, bool log_err); +extern int smb2_check_message(char *buf, unsigned int length); +extern unsigned int smb2_calc_size(void *buf); +extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); +extern __le16 *cifs_convert_path_to_utf16(const char *from, + struct cifs_sb_info *cifs_sb); + +extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *); +extern int smb2_check_receive(struct mid_q_entry *mid, + struct TCP_Server_Info *server, bool log_error); +extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, + struct smb_rqst *rqst); +extern struct mid_q_entry *smb2_setup_async_request( + struct TCP_Server_Info *server, struct smb_rqst *rqst); +extern int smb2_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server); +extern int smb3_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server); +extern void smb2_echo_request(struct work_struct *work); +extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); +extern bool smb2_is_valid_oplock_break(char *buffer, + struct TCP_Server_Info *srv); + +extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, + struct smb2_file_all_info *src); +extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, FILE_ALL_INFO *data, + bool *adjust_tz, bool *symlink); +extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, __u64 size, + struct cifs_sb_info *cifs_sb, bool set_alloc); +extern int smb2_set_file_info(struct inode *inode, const char *full_path, + FILE_BASIC_INFO *buf, const unsigned int xid); +extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); +extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, + struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, const unsigned int xid); +extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); +extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); +extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); +extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); +extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_written); +extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_read); +extern int smb2_open_file(const unsigned int xid, + struct cifs_open_parms *oparms, + __u32 *oplock, FILE_ALL_INFO *buf); +extern int smb2_unlock_range(struct cifsFileInfo *cfile, + struct file_lock *flock, const unsigned int xid); +extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); + +/* + * SMB2 Worker functions - most of protocol specific implementation details + * are contained within these calls. + */ +extern int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses); +extern int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp); +extern int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses); +extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, + const char *tree, struct cifs_tcon *tcon, + const struct nls_table *); +extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); +extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, + __le16 *path, __u8 *oplock, + struct smb2_file_all_info *buf, + struct smb2_err_rsp **err_buf); +extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen, + char **out_data, u32 *plen /* returned data len */); +extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id); +extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id); +extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, + struct smb2_file_all_info *data); +extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + __le64 *uniqueid); +extern int smb2_async_readv(struct cifs_readdata *rdata); +extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, char **buf, int *buf_type); +extern int smb2_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)); +extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, struct kvec *iov, int n_vec); +extern int SMB2_echo(struct TCP_Server_Info *server); +extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int index, + struct cifs_search_info *srch_inf); +extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + __le16 *target_file); +extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + __le16 *target_file); +extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 pid, + __le64 *eof, bool is_fallocate); +extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + FILE_BASIC_INFO *buf); +extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid); +extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, + const u64 persistent_fid, const u64 volatile_fid, + const __u8 oplock_level); +extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, + struct kstatfs *FSData); +extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, int lvl); +extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon, + const __u64 persist_fid, const __u64 volatile_fid, + const __u32 pid, const __u64 length, const __u64 offset, + const __u32 lockFlags, const bool wait); +extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, + const __u64 persist_fid, const __u64 volatile_fid, + const __u32 pid, const __u32 num_lock, + struct smb2_lock_element *buf); +extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, + __u8 *lease_key, const __le32 lease_state); +extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); + +#endif /* _SMB2PROTO_H */ diff --git a/kernel/fs/cifs/smb2status.h b/kernel/fs/cifs/smb2status.h new file mode 100644 index 000000000..3d5f62150 --- /dev/null +++ b/kernel/fs/cifs/smb2status.h @@ -0,0 +1,1782 @@ +/* + * fs/cifs/smb2status.h + * + * SMB2 Status code (network error) definitions + * Definitions are from MS-ERREF + * + * Copyright (c) International Business Machines Corp., 2009,2011 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * 0 1 2 3 4 5 6 7 8 9 0 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F + * SEV C N <-------Facility--------> <------Error Status Code------> + * + * C is set if "customer defined" error, N bit is reserved and MBZ + */ + +#define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000) +#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001) +#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002) +#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003) + +struct ntstatus { + /* Facility is the high 12 bits of the following field */ + __le32 Facility; /* low 2 bits Severity, next is Customer, then rsrvd */ + __le32 Code; +}; + +#define STATUS_SUCCESS __constant_cpu_to_le32(0x00000000) +#define STATUS_WAIT_0 __constant_cpu_to_le32(0x00000000) +#define STATUS_WAIT_1 __constant_cpu_to_le32(0x00000001) +#define STATUS_WAIT_2 __constant_cpu_to_le32(0x00000002) +#define STATUS_WAIT_3 __constant_cpu_to_le32(0x00000003) +#define STATUS_WAIT_63 __constant_cpu_to_le32(0x0000003F) +#define STATUS_ABANDONED __constant_cpu_to_le32(0x00000080) +#define STATUS_ABANDONED_WAIT_0 __constant_cpu_to_le32(0x00000080) +#define STATUS_ABANDONED_WAIT_63 __constant_cpu_to_le32(0x000000BF) +#define STATUS_USER_APC __constant_cpu_to_le32(0x000000C0) +#define STATUS_KERNEL_APC __constant_cpu_to_le32(0x00000100) +#define STATUS_ALERTED __constant_cpu_to_le32(0x00000101) +#define STATUS_TIMEOUT __constant_cpu_to_le32(0x00000102) +#define STATUS_PENDING __constant_cpu_to_le32(0x00000103) +#define STATUS_REPARSE __constant_cpu_to_le32(0x00000104) +#define STATUS_MORE_ENTRIES __constant_cpu_to_le32(0x00000105) +#define STATUS_NOT_ALL_ASSIGNED __constant_cpu_to_le32(0x00000106) +#define STATUS_SOME_NOT_MAPPED __constant_cpu_to_le32(0x00000107) +#define STATUS_OPLOCK_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x00000108) +#define STATUS_VOLUME_MOUNTED __constant_cpu_to_le32(0x00000109) +#define STATUS_RXACT_COMMITTED __constant_cpu_to_le32(0x0000010A) +#define STATUS_NOTIFY_CLEANUP __constant_cpu_to_le32(0x0000010B) +#define STATUS_NOTIFY_ENUM_DIR __constant_cpu_to_le32(0x0000010C) +#define STATUS_NO_QUOTAS_FOR_ACCOUNT __constant_cpu_to_le32(0x0000010D) +#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED __constant_cpu_to_le32(0x0000010E) +#define STATUS_PAGE_FAULT_TRANSITION __constant_cpu_to_le32(0x00000110) +#define STATUS_PAGE_FAULT_DEMAND_ZERO __constant_cpu_to_le32(0x00000111) +#define STATUS_PAGE_FAULT_COPY_ON_WRITE __constant_cpu_to_le32(0x00000112) +#define STATUS_PAGE_FAULT_GUARD_PAGE __constant_cpu_to_le32(0x00000113) +#define STATUS_PAGE_FAULT_PAGING_FILE __constant_cpu_to_le32(0x00000114) +#define STATUS_CACHE_PAGE_LOCKED __constant_cpu_to_le32(0x00000115) +#define STATUS_CRASH_DUMP __constant_cpu_to_le32(0x00000116) +#define STATUS_BUFFER_ALL_ZEROS __constant_cpu_to_le32(0x00000117) +#define STATUS_REPARSE_OBJECT __constant_cpu_to_le32(0x00000118) +#define STATUS_RESOURCE_REQUIREMENTS_CHANGED __constant_cpu_to_le32(0x00000119) +#define STATUS_TRANSLATION_COMPLETE __constant_cpu_to_le32(0x00000120) +#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY __constant_cpu_to_le32(0x00000121) +#define STATUS_NOTHING_TO_TERMINATE __constant_cpu_to_le32(0x00000122) +#define STATUS_PROCESS_NOT_IN_JOB __constant_cpu_to_le32(0x00000123) +#define STATUS_PROCESS_IN_JOB __constant_cpu_to_le32(0x00000124) +#define STATUS_VOLSNAP_HIBERNATE_READY __constant_cpu_to_le32(0x00000125) +#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY __constant_cpu_to_le32(0x00000126) +#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED __constant_cpu_to_le32(0x00000127) +#define STATUS_INTERRUPT_STILL_CONNECTED __constant_cpu_to_le32(0x00000128) +#define STATUS_PROCESS_CLONED __constant_cpu_to_le32(0x00000129) +#define STATUS_FILE_LOCKED_WITH_ONLY_READERS __constant_cpu_to_le32(0x0000012A) +#define STATUS_FILE_LOCKED_WITH_WRITERS __constant_cpu_to_le32(0x0000012B) +#define STATUS_RESOURCEMANAGER_READ_ONLY __constant_cpu_to_le32(0x00000202) +#define STATUS_WAIT_FOR_OPLOCK __constant_cpu_to_le32(0x00000367) +#define DBG_EXCEPTION_HANDLED __constant_cpu_to_le32(0x00010001) +#define DBG_CONTINUE __constant_cpu_to_le32(0x00010002) +#define STATUS_FLT_IO_COMPLETE __constant_cpu_to_le32(0x001C0001) +#define STATUS_OBJECT_NAME_EXISTS __constant_cpu_to_le32(0x40000000) +#define STATUS_THREAD_WAS_SUSPENDED __constant_cpu_to_le32(0x40000001) +#define STATUS_WORKING_SET_LIMIT_RANGE __constant_cpu_to_le32(0x40000002) +#define STATUS_IMAGE_NOT_AT_BASE __constant_cpu_to_le32(0x40000003) +#define STATUS_RXACT_STATE_CREATED __constant_cpu_to_le32(0x40000004) +#define STATUS_SEGMENT_NOTIFICATION __constant_cpu_to_le32(0x40000005) +#define STATUS_LOCAL_USER_SESSION_KEY __constant_cpu_to_le32(0x40000006) +#define STATUS_BAD_CURRENT_DIRECTORY __constant_cpu_to_le32(0x40000007) +#define STATUS_SERIAL_MORE_WRITES __constant_cpu_to_le32(0x40000008) +#define STATUS_REGISTRY_RECOVERED __constant_cpu_to_le32(0x40000009) +#define STATUS_FT_READ_RECOVERY_FROM_BACKUP __constant_cpu_to_le32(0x4000000A) +#define STATUS_FT_WRITE_RECOVERY __constant_cpu_to_le32(0x4000000B) +#define STATUS_SERIAL_COUNTER_TIMEOUT __constant_cpu_to_le32(0x4000000C) +#define STATUS_NULL_LM_PASSWORD __constant_cpu_to_le32(0x4000000D) +#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH __constant_cpu_to_le32(0x4000000E) +#define STATUS_RECEIVE_PARTIAL __constant_cpu_to_le32(0x4000000F) +#define STATUS_RECEIVE_EXPEDITED __constant_cpu_to_le32(0x40000010) +#define STATUS_RECEIVE_PARTIAL_EXPEDITED __constant_cpu_to_le32(0x40000011) +#define STATUS_EVENT_DONE __constant_cpu_to_le32(0x40000012) +#define STATUS_EVENT_PENDING __constant_cpu_to_le32(0x40000013) +#define STATUS_CHECKING_FILE_SYSTEM __constant_cpu_to_le32(0x40000014) +#define STATUS_FATAL_APP_EXIT __constant_cpu_to_le32(0x40000015) +#define STATUS_PREDEFINED_HANDLE __constant_cpu_to_le32(0x40000016) +#define STATUS_WAS_UNLOCKED __constant_cpu_to_le32(0x40000017) +#define STATUS_SERVICE_NOTIFICATION __constant_cpu_to_le32(0x40000018) +#define STATUS_WAS_LOCKED __constant_cpu_to_le32(0x40000019) +#define STATUS_LOG_HARD_ERROR __constant_cpu_to_le32(0x4000001A) +#define STATUS_ALREADY_WIN32 __constant_cpu_to_le32(0x4000001B) +#define STATUS_WX86_UNSIMULATE __constant_cpu_to_le32(0x4000001C) +#define STATUS_WX86_CONTINUE __constant_cpu_to_le32(0x4000001D) +#define STATUS_WX86_SINGLE_STEP __constant_cpu_to_le32(0x4000001E) +#define STATUS_WX86_BREAKPOINT __constant_cpu_to_le32(0x4000001F) +#define STATUS_WX86_EXCEPTION_CONTINUE __constant_cpu_to_le32(0x40000020) +#define STATUS_WX86_EXCEPTION_LASTCHANCE __constant_cpu_to_le32(0x40000021) +#define STATUS_WX86_EXCEPTION_CHAIN __constant_cpu_to_le32(0x40000022) +#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE __constant_cpu_to_le32(0x40000023) +#define STATUS_NO_YIELD_PERFORMED __constant_cpu_to_le32(0x40000024) +#define STATUS_TIMER_RESUME_IGNORED __constant_cpu_to_le32(0x40000025) +#define STATUS_ARBITRATION_UNHANDLED __constant_cpu_to_le32(0x40000026) +#define STATUS_CARDBUS_NOT_SUPPORTED __constant_cpu_to_le32(0x40000027) +#define STATUS_WX86_CREATEWX86TIB __constant_cpu_to_le32(0x40000028) +#define STATUS_MP_PROCESSOR_MISMATCH __constant_cpu_to_le32(0x40000029) +#define STATUS_HIBERNATED __constant_cpu_to_le32(0x4000002A) +#define STATUS_RESUME_HIBERNATION __constant_cpu_to_le32(0x4000002B) +#define STATUS_FIRMWARE_UPDATED __constant_cpu_to_le32(0x4000002C) +#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES __constant_cpu_to_le32(0x4000002D) +#define STATUS_MESSAGE_RETRIEVED __constant_cpu_to_le32(0x4000002E) +#define STATUS_SYSTEM_POWERSTATE_TRANSITION __constant_cpu_to_le32(0x4000002F) +#define STATUS_ALPC_CHECK_COMPLETION_LIST __constant_cpu_to_le32(0x40000030) +#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION __constant_cpu_to_le32(0x40000031) +#define STATUS_ACCESS_AUDIT_BY_POLICY __constant_cpu_to_le32(0x40000032) +#define STATUS_ABANDON_HIBERFILE __constant_cpu_to_le32(0x40000033) +#define STATUS_BIZRULES_NOT_ENABLED __constant_cpu_to_le32(0x40000034) +#define STATUS_WAKE_SYSTEM __constant_cpu_to_le32(0x40000294) +#define STATUS_DS_SHUTTING_DOWN __constant_cpu_to_le32(0x40000370) +#define DBG_REPLY_LATER __constant_cpu_to_le32(0x40010001) +#define DBG_UNABLE_TO_PROVIDE_HANDLE __constant_cpu_to_le32(0x40010002) +#define DBG_TERMINATE_THREAD __constant_cpu_to_le32(0x40010003) +#define DBG_TERMINATE_PROCESS __constant_cpu_to_le32(0x40010004) +#define DBG_CONTROL_C __constant_cpu_to_le32(0x40010005) +#define DBG_PRINTEXCEPTION_C __constant_cpu_to_le32(0x40010006) +#define DBG_RIPEXCEPTION __constant_cpu_to_le32(0x40010007) +#define DBG_CONTROL_BREAK __constant_cpu_to_le32(0x40010008) +#define DBG_COMMAND_EXCEPTION __constant_cpu_to_le32(0x40010009) +#define RPC_NT_UUID_LOCAL_ONLY __constant_cpu_to_le32(0x40020056) +#define RPC_NT_SEND_INCOMPLETE __constant_cpu_to_le32(0x400200AF) +#define STATUS_CTX_CDM_CONNECT __constant_cpu_to_le32(0x400A0004) +#define STATUS_CTX_CDM_DISCONNECT __constant_cpu_to_le32(0x400A0005) +#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT __constant_cpu_to_le32(0x4015000D) +#define STATUS_RECOVERY_NOT_NEEDED __constant_cpu_to_le32(0x40190034) +#define STATUS_RM_ALREADY_STARTED __constant_cpu_to_le32(0x40190035) +#define STATUS_LOG_NO_RESTART __constant_cpu_to_le32(0x401A000C) +#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST __constant_cpu_to_le32(0x401B00EC) +#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED __constant_cpu_to_le32(0x401E000A) +#define STATUS_GRAPHICS_DRIVER_MISMATCH __constant_cpu_to_le32(0x401E0117) +#define STATUS_GRAPHICS_MODE_NOT_PINNED __constant_cpu_to_le32(0x401E0307) +#define STATUS_GRAPHICS_NO_PREFERRED_MODE __constant_cpu_to_le32(0x401E031E) +#define STATUS_GRAPHICS_DATASET_IS_EMPTY __constant_cpu_to_le32(0x401E034B) +#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET __constant_cpu_to_le32(0x401E034C) +#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED __constant_cpu_to_le32(0x401E0351) +#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS __constant_cpu_to_le32(0x401E042F) +#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED __constant_cpu_to_le32(0x401E0437) +#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY __constant_cpu_to_le32(0x401E0439) +#define STATUS_GRAPHICS_START_DEFERRED __constant_cpu_to_le32(0x401E043A) +#define STATUS_NDIS_INDICATION_REQUIRED __constant_cpu_to_le32(0x40230001) +#define STATUS_GUARD_PAGE_VIOLATION __constant_cpu_to_le32(0x80000001) +#define STATUS_DATATYPE_MISALIGNMENT __constant_cpu_to_le32(0x80000002) +#define STATUS_BREAKPOINT __constant_cpu_to_le32(0x80000003) +#define STATUS_SINGLE_STEP __constant_cpu_to_le32(0x80000004) +#define STATUS_BUFFER_OVERFLOW __constant_cpu_to_le32(0x80000005) +#define STATUS_NO_MORE_FILES __constant_cpu_to_le32(0x80000006) +#define STATUS_WAKE_SYSTEM_DEBUGGER __constant_cpu_to_le32(0x80000007) +#define STATUS_HANDLES_CLOSED __constant_cpu_to_le32(0x8000000A) +#define STATUS_NO_INHERITANCE __constant_cpu_to_le32(0x8000000B) +#define STATUS_GUID_SUBSTITUTION_MADE __constant_cpu_to_le32(0x8000000C) +#define STATUS_PARTIAL_COPY __constant_cpu_to_le32(0x8000000D) +#define STATUS_DEVICE_PAPER_EMPTY __constant_cpu_to_le32(0x8000000E) +#define STATUS_DEVICE_POWERED_OFF __constant_cpu_to_le32(0x8000000F) +#define STATUS_DEVICE_OFF_LINE __constant_cpu_to_le32(0x80000010) +#define STATUS_DEVICE_BUSY __constant_cpu_to_le32(0x80000011) +#define STATUS_NO_MORE_EAS __constant_cpu_to_le32(0x80000012) +#define STATUS_INVALID_EA_NAME __constant_cpu_to_le32(0x80000013) +#define STATUS_EA_LIST_INCONSISTENT __constant_cpu_to_le32(0x80000014) +#define STATUS_INVALID_EA_FLAG __constant_cpu_to_le32(0x80000015) +#define STATUS_VERIFY_REQUIRED __constant_cpu_to_le32(0x80000016) +#define STATUS_EXTRANEOUS_INFORMATION __constant_cpu_to_le32(0x80000017) +#define STATUS_RXACT_COMMIT_NECESSARY __constant_cpu_to_le32(0x80000018) +#define STATUS_NO_MORE_ENTRIES __constant_cpu_to_le32(0x8000001A) +#define STATUS_FILEMARK_DETECTED __constant_cpu_to_le32(0x8000001B) +#define STATUS_MEDIA_CHANGED __constant_cpu_to_le32(0x8000001C) +#define STATUS_BUS_RESET __constant_cpu_to_le32(0x8000001D) +#define STATUS_END_OF_MEDIA __constant_cpu_to_le32(0x8000001E) +#define STATUS_BEGINNING_OF_MEDIA __constant_cpu_to_le32(0x8000001F) +#define STATUS_MEDIA_CHECK __constant_cpu_to_le32(0x80000020) +#define STATUS_SETMARK_DETECTED __constant_cpu_to_le32(0x80000021) +#define STATUS_NO_DATA_DETECTED __constant_cpu_to_le32(0x80000022) +#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES __constant_cpu_to_le32(0x80000023) +#define STATUS_SERVER_HAS_OPEN_HANDLES __constant_cpu_to_le32(0x80000024) +#define STATUS_ALREADY_DISCONNECTED __constant_cpu_to_le32(0x80000025) +#define STATUS_LONGJUMP __constant_cpu_to_le32(0x80000026) +#define STATUS_CLEANER_CARTRIDGE_INSTALLED __constant_cpu_to_le32(0x80000027) +#define STATUS_PLUGPLAY_QUERY_VETOED __constant_cpu_to_le32(0x80000028) +#define STATUS_UNWIND_CONSOLIDATE __constant_cpu_to_le32(0x80000029) +#define STATUS_REGISTRY_HIVE_RECOVERED __constant_cpu_to_le32(0x8000002A) +#define STATUS_DLL_MIGHT_BE_INSECURE __constant_cpu_to_le32(0x8000002B) +#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE __constant_cpu_to_le32(0x8000002C) +#define STATUS_STOPPED_ON_SYMLINK __constant_cpu_to_le32(0x8000002D) +#define STATUS_DEVICE_REQUIRES_CLEANING __constant_cpu_to_le32(0x80000288) +#define STATUS_DEVICE_DOOR_OPEN __constant_cpu_to_le32(0x80000289) +#define STATUS_DATA_LOST_REPAIR __constant_cpu_to_le32(0x80000803) +#define DBG_EXCEPTION_NOT_HANDLED __constant_cpu_to_le32(0x80010001) +#define STATUS_CLUSTER_NODE_ALREADY_UP __constant_cpu_to_le32(0x80130001) +#define STATUS_CLUSTER_NODE_ALREADY_DOWN __constant_cpu_to_le32(0x80130002) +#define STATUS_CLUSTER_NETWORK_ALREADY_ONLINE __constant_cpu_to_le32(0x80130003) +#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE __constant_cpu_to_le32(0x80130004) +#define STATUS_CLUSTER_NODE_ALREADY_MEMBER __constant_cpu_to_le32(0x80130005) +#define STATUS_COULD_NOT_RESIZE_LOG __constant_cpu_to_le32(0x80190009) +#define STATUS_NO_TXF_METADATA __constant_cpu_to_le32(0x80190029) +#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN __constant_cpu_to_le32(0x80190031) +#define STATUS_TXF_METADATA_ALREADY_PRESENT __constant_cpu_to_le32(0x80190041) +#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET __constant_cpu_to_le32(0x80190042) +#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED __constant_cpu_to_le32(0x801B00EB) +#define STATUS_FLT_BUFFER_TOO_SMALL __constant_cpu_to_le32(0x801C0001) +#define STATUS_FVE_PARTIAL_METADATA __constant_cpu_to_le32(0x80210001) +#define STATUS_UNSUCCESSFUL __constant_cpu_to_le32(0xC0000001) +#define STATUS_NOT_IMPLEMENTED __constant_cpu_to_le32(0xC0000002) +#define STATUS_INVALID_INFO_CLASS __constant_cpu_to_le32(0xC0000003) +#define STATUS_INFO_LENGTH_MISMATCH __constant_cpu_to_le32(0xC0000004) +#define STATUS_ACCESS_VIOLATION __constant_cpu_to_le32(0xC0000005) +#define STATUS_IN_PAGE_ERROR __constant_cpu_to_le32(0xC0000006) +#define STATUS_PAGEFILE_QUOTA __constant_cpu_to_le32(0xC0000007) +#define STATUS_INVALID_HANDLE __constant_cpu_to_le32(0xC0000008) +#define STATUS_BAD_INITIAL_STACK __constant_cpu_to_le32(0xC0000009) +#define STATUS_BAD_INITIAL_PC __constant_cpu_to_le32(0xC000000A) +#define STATUS_INVALID_CID __constant_cpu_to_le32(0xC000000B) +#define STATUS_TIMER_NOT_CANCELED __constant_cpu_to_le32(0xC000000C) +#define STATUS_INVALID_PARAMETER __constant_cpu_to_le32(0xC000000D) +#define STATUS_NO_SUCH_DEVICE __constant_cpu_to_le32(0xC000000E) +#define STATUS_NO_SUCH_FILE __constant_cpu_to_le32(0xC000000F) +#define STATUS_INVALID_DEVICE_REQUEST __constant_cpu_to_le32(0xC0000010) +#define STATUS_END_OF_FILE __constant_cpu_to_le32(0xC0000011) +#define STATUS_WRONG_VOLUME __constant_cpu_to_le32(0xC0000012) +#define STATUS_NO_MEDIA_IN_DEVICE __constant_cpu_to_le32(0xC0000013) +#define STATUS_UNRECOGNIZED_MEDIA __constant_cpu_to_le32(0xC0000014) +#define STATUS_NONEXISTENT_SECTOR __constant_cpu_to_le32(0xC0000015) +#define STATUS_MORE_PROCESSING_REQUIRED __constant_cpu_to_le32(0xC0000016) +#define STATUS_NO_MEMORY __constant_cpu_to_le32(0xC0000017) +#define STATUS_CONFLICTING_ADDRESSES __constant_cpu_to_le32(0xC0000018) +#define STATUS_NOT_MAPPED_VIEW __constant_cpu_to_le32(0xC0000019) +#define STATUS_UNABLE_TO_FREE_VM __constant_cpu_to_le32(0xC000001A) +#define STATUS_UNABLE_TO_DELETE_SECTION __constant_cpu_to_le32(0xC000001B) +#define STATUS_INVALID_SYSTEM_SERVICE __constant_cpu_to_le32(0xC000001C) +#define STATUS_ILLEGAL_INSTRUCTION __constant_cpu_to_le32(0xC000001D) +#define STATUS_INVALID_LOCK_SEQUENCE __constant_cpu_to_le32(0xC000001E) +#define STATUS_INVALID_VIEW_SIZE __constant_cpu_to_le32(0xC000001F) +#define STATUS_INVALID_FILE_FOR_SECTION __constant_cpu_to_le32(0xC0000020) +#define STATUS_ALREADY_COMMITTED __constant_cpu_to_le32(0xC0000021) +#define STATUS_ACCESS_DENIED __constant_cpu_to_le32(0xC0000022) +#define STATUS_BUFFER_TOO_SMALL __constant_cpu_to_le32(0xC0000023) +#define STATUS_OBJECT_TYPE_MISMATCH __constant_cpu_to_le32(0xC0000024) +#define STATUS_NONCONTINUABLE_EXCEPTION __constant_cpu_to_le32(0xC0000025) +#define STATUS_INVALID_DISPOSITION __constant_cpu_to_le32(0xC0000026) +#define STATUS_UNWIND __constant_cpu_to_le32(0xC0000027) +#define STATUS_BAD_STACK __constant_cpu_to_le32(0xC0000028) +#define STATUS_INVALID_UNWIND_TARGET __constant_cpu_to_le32(0xC0000029) +#define STATUS_NOT_LOCKED __constant_cpu_to_le32(0xC000002A) +#define STATUS_PARITY_ERROR __constant_cpu_to_le32(0xC000002B) +#define STATUS_UNABLE_TO_DECOMMIT_VM __constant_cpu_to_le32(0xC000002C) +#define STATUS_NOT_COMMITTED __constant_cpu_to_le32(0xC000002D) +#define STATUS_INVALID_PORT_ATTRIBUTES __constant_cpu_to_le32(0xC000002E) +#define STATUS_PORT_MESSAGE_TOO_LONG __constant_cpu_to_le32(0xC000002F) +#define STATUS_INVALID_PARAMETER_MIX __constant_cpu_to_le32(0xC0000030) +#define STATUS_INVALID_QUOTA_LOWER __constant_cpu_to_le32(0xC0000031) +#define STATUS_DISK_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000032) +#define STATUS_OBJECT_NAME_INVALID __constant_cpu_to_le32(0xC0000033) +#define STATUS_OBJECT_NAME_NOT_FOUND __constant_cpu_to_le32(0xC0000034) +#define STATUS_OBJECT_NAME_COLLISION __constant_cpu_to_le32(0xC0000035) +#define STATUS_PORT_DISCONNECTED __constant_cpu_to_le32(0xC0000037) +#define STATUS_DEVICE_ALREADY_ATTACHED __constant_cpu_to_le32(0xC0000038) +#define STATUS_OBJECT_PATH_INVALID __constant_cpu_to_le32(0xC0000039) +#define STATUS_OBJECT_PATH_NOT_FOUND __constant_cpu_to_le32(0xC000003A) +#define STATUS_OBJECT_PATH_SYNTAX_BAD __constant_cpu_to_le32(0xC000003B) +#define STATUS_DATA_OVERRUN __constant_cpu_to_le32(0xC000003C) +#define STATUS_DATA_LATE_ERROR __constant_cpu_to_le32(0xC000003D) +#define STATUS_DATA_ERROR __constant_cpu_to_le32(0xC000003E) +#define STATUS_CRC_ERROR __constant_cpu_to_le32(0xC000003F) +#define STATUS_SECTION_TOO_BIG __constant_cpu_to_le32(0xC0000040) +#define STATUS_PORT_CONNECTION_REFUSED __constant_cpu_to_le32(0xC0000041) +#define STATUS_INVALID_PORT_HANDLE __constant_cpu_to_le32(0xC0000042) +#define STATUS_SHARING_VIOLATION __constant_cpu_to_le32(0xC0000043) +#define STATUS_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000044) +#define STATUS_INVALID_PAGE_PROTECTION __constant_cpu_to_le32(0xC0000045) +#define STATUS_MUTANT_NOT_OWNED __constant_cpu_to_le32(0xC0000046) +#define STATUS_SEMAPHORE_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC0000047) +#define STATUS_PORT_ALREADY_SET __constant_cpu_to_le32(0xC0000048) +#define STATUS_SECTION_NOT_IMAGE __constant_cpu_to_le32(0xC0000049) +#define STATUS_SUSPEND_COUNT_EXCEEDED __constant_cpu_to_le32(0xC000004A) +#define STATUS_THREAD_IS_TERMINATING __constant_cpu_to_le32(0xC000004B) +#define STATUS_BAD_WORKING_SET_LIMIT __constant_cpu_to_le32(0xC000004C) +#define STATUS_INCOMPATIBLE_FILE_MAP __constant_cpu_to_le32(0xC000004D) +#define STATUS_SECTION_PROTECTION __constant_cpu_to_le32(0xC000004E) +#define STATUS_EAS_NOT_SUPPORTED __constant_cpu_to_le32(0xC000004F) +#define STATUS_EA_TOO_LARGE __constant_cpu_to_le32(0xC0000050) +#define STATUS_NONEXISTENT_EA_ENTRY __constant_cpu_to_le32(0xC0000051) +#define STATUS_NO_EAS_ON_FILE __constant_cpu_to_le32(0xC0000052) +#define STATUS_EA_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000053) +#define STATUS_FILE_LOCK_CONFLICT __constant_cpu_to_le32(0xC0000054) +#define STATUS_LOCK_NOT_GRANTED __constant_cpu_to_le32(0xC0000055) +#define STATUS_DELETE_PENDING __constant_cpu_to_le32(0xC0000056) +#define STATUS_CTL_FILE_NOT_SUPPORTED __constant_cpu_to_le32(0xC0000057) +#define STATUS_UNKNOWN_REVISION __constant_cpu_to_le32(0xC0000058) +#define STATUS_REVISION_MISMATCH __constant_cpu_to_le32(0xC0000059) +#define STATUS_INVALID_OWNER __constant_cpu_to_le32(0xC000005A) +#define STATUS_INVALID_PRIMARY_GROUP __constant_cpu_to_le32(0xC000005B) +#define STATUS_NO_IMPERSONATION_TOKEN __constant_cpu_to_le32(0xC000005C) +#define STATUS_CANT_DISABLE_MANDATORY __constant_cpu_to_le32(0xC000005D) +#define STATUS_NO_LOGON_SERVERS __constant_cpu_to_le32(0xC000005E) +#define STATUS_NO_SUCH_LOGON_SESSION __constant_cpu_to_le32(0xC000005F) +#define STATUS_NO_SUCH_PRIVILEGE __constant_cpu_to_le32(0xC0000060) +#define STATUS_PRIVILEGE_NOT_HELD __constant_cpu_to_le32(0xC0000061) +#define STATUS_INVALID_ACCOUNT_NAME __constant_cpu_to_le32(0xC0000062) +#define STATUS_USER_EXISTS __constant_cpu_to_le32(0xC0000063) +#define STATUS_NO_SUCH_USER __constant_cpu_to_le32(0xC0000064) +#define STATUS_GROUP_EXISTS __constant_cpu_to_le32(0xC0000065) +#define STATUS_NO_SUCH_GROUP __constant_cpu_to_le32(0xC0000066) +#define STATUS_MEMBER_IN_GROUP __constant_cpu_to_le32(0xC0000067) +#define STATUS_MEMBER_NOT_IN_GROUP __constant_cpu_to_le32(0xC0000068) +#define STATUS_LAST_ADMIN __constant_cpu_to_le32(0xC0000069) +#define STATUS_WRONG_PASSWORD __constant_cpu_to_le32(0xC000006A) +#define STATUS_ILL_FORMED_PASSWORD __constant_cpu_to_le32(0xC000006B) +#define STATUS_PASSWORD_RESTRICTION __constant_cpu_to_le32(0xC000006C) +#define STATUS_LOGON_FAILURE __constant_cpu_to_le32(0xC000006D) +#define STATUS_ACCOUNT_RESTRICTION __constant_cpu_to_le32(0xC000006E) +#define STATUS_INVALID_LOGON_HOURS __constant_cpu_to_le32(0xC000006F) +#define STATUS_INVALID_WORKSTATION __constant_cpu_to_le32(0xC0000070) +#define STATUS_PASSWORD_EXPIRED __constant_cpu_to_le32(0xC0000071) +#define STATUS_ACCOUNT_DISABLED __constant_cpu_to_le32(0xC0000072) +#define STATUS_NONE_MAPPED __constant_cpu_to_le32(0xC0000073) +#define STATUS_TOO_MANY_LUIDS_REQUESTED __constant_cpu_to_le32(0xC0000074) +#define STATUS_LUIDS_EXHAUSTED __constant_cpu_to_le32(0xC0000075) +#define STATUS_INVALID_SUB_AUTHORITY __constant_cpu_to_le32(0xC0000076) +#define STATUS_INVALID_ACL __constant_cpu_to_le32(0xC0000077) +#define STATUS_INVALID_SID __constant_cpu_to_le32(0xC0000078) +#define STATUS_INVALID_SECURITY_DESCR __constant_cpu_to_le32(0xC0000079) +#define STATUS_PROCEDURE_NOT_FOUND __constant_cpu_to_le32(0xC000007A) +#define STATUS_INVALID_IMAGE_FORMAT __constant_cpu_to_le32(0xC000007B) +#define STATUS_NO_TOKEN __constant_cpu_to_le32(0xC000007C) +#define STATUS_BAD_INHERITANCE_ACL __constant_cpu_to_le32(0xC000007D) +#define STATUS_RANGE_NOT_LOCKED __constant_cpu_to_le32(0xC000007E) +#define STATUS_DISK_FULL __constant_cpu_to_le32(0xC000007F) +#define STATUS_SERVER_DISABLED __constant_cpu_to_le32(0xC0000080) +#define STATUS_SERVER_NOT_DISABLED __constant_cpu_to_le32(0xC0000081) +#define STATUS_TOO_MANY_GUIDS_REQUESTED __constant_cpu_to_le32(0xC0000082) +#define STATUS_GUIDS_EXHAUSTED __constant_cpu_to_le32(0xC0000083) +#define STATUS_INVALID_ID_AUTHORITY __constant_cpu_to_le32(0xC0000084) +#define STATUS_AGENTS_EXHAUSTED __constant_cpu_to_le32(0xC0000085) +#define STATUS_INVALID_VOLUME_LABEL __constant_cpu_to_le32(0xC0000086) +#define STATUS_SECTION_NOT_EXTENDED __constant_cpu_to_le32(0xC0000087) +#define STATUS_NOT_MAPPED_DATA __constant_cpu_to_le32(0xC0000088) +#define STATUS_RESOURCE_DATA_NOT_FOUND __constant_cpu_to_le32(0xC0000089) +#define STATUS_RESOURCE_TYPE_NOT_FOUND __constant_cpu_to_le32(0xC000008A) +#define STATUS_RESOURCE_NAME_NOT_FOUND __constant_cpu_to_le32(0xC000008B) +#define STATUS_ARRAY_BOUNDS_EXCEEDED __constant_cpu_to_le32(0xC000008C) +#define STATUS_FLOAT_DENORMAL_OPERAND __constant_cpu_to_le32(0xC000008D) +#define STATUS_FLOAT_DIVIDE_BY_ZERO __constant_cpu_to_le32(0xC000008E) +#define STATUS_FLOAT_INEXACT_RESULT __constant_cpu_to_le32(0xC000008F) +#define STATUS_FLOAT_INVALID_OPERATION __constant_cpu_to_le32(0xC0000090) +#define STATUS_FLOAT_OVERFLOW __constant_cpu_to_le32(0xC0000091) +#define STATUS_FLOAT_STACK_CHECK __constant_cpu_to_le32(0xC0000092) +#define STATUS_FLOAT_UNDERFLOW __constant_cpu_to_le32(0xC0000093) +#define STATUS_INTEGER_DIVIDE_BY_ZERO __constant_cpu_to_le32(0xC0000094) +#define STATUS_INTEGER_OVERFLOW __constant_cpu_to_le32(0xC0000095) +#define STATUS_PRIVILEGED_INSTRUCTION __constant_cpu_to_le32(0xC0000096) +#define STATUS_TOO_MANY_PAGING_FILES __constant_cpu_to_le32(0xC0000097) +#define STATUS_FILE_INVALID __constant_cpu_to_le32(0xC0000098) +#define STATUS_ALLOTTED_SPACE_EXCEEDED __constant_cpu_to_le32(0xC0000099) +#define STATUS_INSUFFICIENT_RESOURCES __constant_cpu_to_le32(0xC000009A) +#define STATUS_DFS_EXIT_PATH_FOUND __constant_cpu_to_le32(0xC000009B) +#define STATUS_DEVICE_DATA_ERROR __constant_cpu_to_le32(0xC000009C) +#define STATUS_DEVICE_NOT_CONNECTED __constant_cpu_to_le32(0xC000009D) +#define STATUS_DEVICE_POWER_FAILURE __constant_cpu_to_le32(0xC000009E) +#define STATUS_FREE_VM_NOT_AT_BASE __constant_cpu_to_le32(0xC000009F) +#define STATUS_MEMORY_NOT_ALLOCATED __constant_cpu_to_le32(0xC00000A0) +#define STATUS_WORKING_SET_QUOTA __constant_cpu_to_le32(0xC00000A1) +#define STATUS_MEDIA_WRITE_PROTECTED __constant_cpu_to_le32(0xC00000A2) +#define STATUS_DEVICE_NOT_READY __constant_cpu_to_le32(0xC00000A3) +#define STATUS_INVALID_GROUP_ATTRIBUTES __constant_cpu_to_le32(0xC00000A4) +#define STATUS_BAD_IMPERSONATION_LEVEL __constant_cpu_to_le32(0xC00000A5) +#define STATUS_CANT_OPEN_ANONYMOUS __constant_cpu_to_le32(0xC00000A6) +#define STATUS_BAD_VALIDATION_CLASS __constant_cpu_to_le32(0xC00000A7) +#define STATUS_BAD_TOKEN_TYPE __constant_cpu_to_le32(0xC00000A8) +#define STATUS_BAD_MASTER_BOOT_RECORD __constant_cpu_to_le32(0xC00000A9) +#define STATUS_INSTRUCTION_MISALIGNMENT __constant_cpu_to_le32(0xC00000AA) +#define STATUS_INSTANCE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00000AB) +#define STATUS_PIPE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00000AC) +#define STATUS_INVALID_PIPE_STATE __constant_cpu_to_le32(0xC00000AD) +#define STATUS_PIPE_BUSY __constant_cpu_to_le32(0xC00000AE) +#define STATUS_ILLEGAL_FUNCTION __constant_cpu_to_le32(0xC00000AF) +#define STATUS_PIPE_DISCONNECTED __constant_cpu_to_le32(0xC00000B0) +#define STATUS_PIPE_CLOSING __constant_cpu_to_le32(0xC00000B1) +#define STATUS_PIPE_CONNECTED __constant_cpu_to_le32(0xC00000B2) +#define STATUS_PIPE_LISTENING __constant_cpu_to_le32(0xC00000B3) +#define STATUS_INVALID_READ_MODE __constant_cpu_to_le32(0xC00000B4) +#define STATUS_IO_TIMEOUT __constant_cpu_to_le32(0xC00000B5) +#define STATUS_FILE_FORCED_CLOSED __constant_cpu_to_le32(0xC00000B6) +#define STATUS_PROFILING_NOT_STARTED __constant_cpu_to_le32(0xC00000B7) +#define STATUS_PROFILING_NOT_STOPPED __constant_cpu_to_le32(0xC00000B8) +#define STATUS_COULD_NOT_INTERPRET __constant_cpu_to_le32(0xC00000B9) +#define STATUS_FILE_IS_A_DIRECTORY __constant_cpu_to_le32(0xC00000BA) +#define STATUS_NOT_SUPPORTED __constant_cpu_to_le32(0xC00000BB) +#define STATUS_REMOTE_NOT_LISTENING __constant_cpu_to_le32(0xC00000BC) +#define STATUS_DUPLICATE_NAME __constant_cpu_to_le32(0xC00000BD) +#define STATUS_BAD_NETWORK_PATH __constant_cpu_to_le32(0xC00000BE) +#define STATUS_NETWORK_BUSY __constant_cpu_to_le32(0xC00000BF) +#define STATUS_DEVICE_DOES_NOT_EXIST __constant_cpu_to_le32(0xC00000C0) +#define STATUS_TOO_MANY_COMMANDS __constant_cpu_to_le32(0xC00000C1) +#define STATUS_ADAPTER_HARDWARE_ERROR __constant_cpu_to_le32(0xC00000C2) +#define STATUS_INVALID_NETWORK_RESPONSE __constant_cpu_to_le32(0xC00000C3) +#define STATUS_UNEXPECTED_NETWORK_ERROR __constant_cpu_to_le32(0xC00000C4) +#define STATUS_BAD_REMOTE_ADAPTER __constant_cpu_to_le32(0xC00000C5) +#define STATUS_PRINT_QUEUE_FULL __constant_cpu_to_le32(0xC00000C6) +#define STATUS_NO_SPOOL_SPACE __constant_cpu_to_le32(0xC00000C7) +#define STATUS_PRINT_CANCELLED __constant_cpu_to_le32(0xC00000C8) +#define STATUS_NETWORK_NAME_DELETED __constant_cpu_to_le32(0xC00000C9) +#define STATUS_NETWORK_ACCESS_DENIED __constant_cpu_to_le32(0xC00000CA) +#define STATUS_BAD_DEVICE_TYPE __constant_cpu_to_le32(0xC00000CB) +#define STATUS_BAD_NETWORK_NAME __constant_cpu_to_le32(0xC00000CC) +#define STATUS_TOO_MANY_NAMES __constant_cpu_to_le32(0xC00000CD) +#define STATUS_TOO_MANY_SESSIONS __constant_cpu_to_le32(0xC00000CE) +#define STATUS_SHARING_PAUSED __constant_cpu_to_le32(0xC00000CF) +#define STATUS_REQUEST_NOT_ACCEPTED __constant_cpu_to_le32(0xC00000D0) +#define STATUS_REDIRECTOR_PAUSED __constant_cpu_to_le32(0xC00000D1) +#define STATUS_NET_WRITE_FAULT __constant_cpu_to_le32(0xC00000D2) +#define STATUS_PROFILING_AT_LIMIT __constant_cpu_to_le32(0xC00000D3) +#define STATUS_NOT_SAME_DEVICE __constant_cpu_to_le32(0xC00000D4) +#define STATUS_FILE_RENAMED __constant_cpu_to_le32(0xC00000D5) +#define STATUS_VIRTUAL_CIRCUIT_CLOSED __constant_cpu_to_le32(0xC00000D6) +#define STATUS_NO_SECURITY_ON_OBJECT __constant_cpu_to_le32(0xC00000D7) +#define STATUS_CANT_WAIT __constant_cpu_to_le32(0xC00000D8) +#define STATUS_PIPE_EMPTY __constant_cpu_to_le32(0xC00000D9) +#define STATUS_CANT_ACCESS_DOMAIN_INFO __constant_cpu_to_le32(0xC00000DA) +#define STATUS_CANT_TERMINATE_SELF __constant_cpu_to_le32(0xC00000DB) +#define STATUS_INVALID_SERVER_STATE __constant_cpu_to_le32(0xC00000DC) +#define STATUS_INVALID_DOMAIN_STATE __constant_cpu_to_le32(0xC00000DD) +#define STATUS_INVALID_DOMAIN_ROLE __constant_cpu_to_le32(0xC00000DE) +#define STATUS_NO_SUCH_DOMAIN __constant_cpu_to_le32(0xC00000DF) +#define STATUS_DOMAIN_EXISTS __constant_cpu_to_le32(0xC00000E0) +#define STATUS_DOMAIN_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC00000E1) +#define STATUS_OPLOCK_NOT_GRANTED __constant_cpu_to_le32(0xC00000E2) +#define STATUS_INVALID_OPLOCK_PROTOCOL __constant_cpu_to_le32(0xC00000E3) +#define STATUS_INTERNAL_DB_CORRUPTION __constant_cpu_to_le32(0xC00000E4) +#define STATUS_INTERNAL_ERROR __constant_cpu_to_le32(0xC00000E5) +#define STATUS_GENERIC_NOT_MAPPED __constant_cpu_to_le32(0xC00000E6) +#define STATUS_BAD_DESCRIPTOR_FORMAT __constant_cpu_to_le32(0xC00000E7) +#define STATUS_INVALID_USER_BUFFER __constant_cpu_to_le32(0xC00000E8) +#define STATUS_UNEXPECTED_IO_ERROR __constant_cpu_to_le32(0xC00000E9) +#define STATUS_UNEXPECTED_MM_CREATE_ERR __constant_cpu_to_le32(0xC00000EA) +#define STATUS_UNEXPECTED_MM_MAP_ERROR __constant_cpu_to_le32(0xC00000EB) +#define STATUS_UNEXPECTED_MM_EXTEND_ERR __constant_cpu_to_le32(0xC00000EC) +#define STATUS_NOT_LOGON_PROCESS __constant_cpu_to_le32(0xC00000ED) +#define STATUS_LOGON_SESSION_EXISTS __constant_cpu_to_le32(0xC00000EE) +#define STATUS_INVALID_PARAMETER_1 __constant_cpu_to_le32(0xC00000EF) +#define STATUS_INVALID_PARAMETER_2 __constant_cpu_to_le32(0xC00000F0) +#define STATUS_INVALID_PARAMETER_3 __constant_cpu_to_le32(0xC00000F1) +#define STATUS_INVALID_PARAMETER_4 __constant_cpu_to_le32(0xC00000F2) +#define STATUS_INVALID_PARAMETER_5 __constant_cpu_to_le32(0xC00000F3) +#define STATUS_INVALID_PARAMETER_6 __constant_cpu_to_le32(0xC00000F4) +#define STATUS_INVALID_PARAMETER_7 __constant_cpu_to_le32(0xC00000F5) +#define STATUS_INVALID_PARAMETER_8 __constant_cpu_to_le32(0xC00000F6) +#define STATUS_INVALID_PARAMETER_9 __constant_cpu_to_le32(0xC00000F7) +#define STATUS_INVALID_PARAMETER_10 __constant_cpu_to_le32(0xC00000F8) +#define STATUS_INVALID_PARAMETER_11 __constant_cpu_to_le32(0xC00000F9) +#define STATUS_INVALID_PARAMETER_12 __constant_cpu_to_le32(0xC00000FA) +#define STATUS_REDIRECTOR_NOT_STARTED __constant_cpu_to_le32(0xC00000FB) +#define STATUS_REDIRECTOR_STARTED __constant_cpu_to_le32(0xC00000FC) +#define STATUS_STACK_OVERFLOW __constant_cpu_to_le32(0xC00000FD) +#define STATUS_NO_SUCH_PACKAGE __constant_cpu_to_le32(0xC00000FE) +#define STATUS_BAD_FUNCTION_TABLE __constant_cpu_to_le32(0xC00000FF) +#define STATUS_VARIABLE_NOT_FOUND __constant_cpu_to_le32(0xC0000100) +#define STATUS_DIRECTORY_NOT_EMPTY __constant_cpu_to_le32(0xC0000101) +#define STATUS_FILE_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000102) +#define STATUS_NOT_A_DIRECTORY __constant_cpu_to_le32(0xC0000103) +#define STATUS_BAD_LOGON_SESSION_STATE __constant_cpu_to_le32(0xC0000104) +#define STATUS_LOGON_SESSION_COLLISION __constant_cpu_to_le32(0xC0000105) +#define STATUS_NAME_TOO_LONG __constant_cpu_to_le32(0xC0000106) +#define STATUS_FILES_OPEN __constant_cpu_to_le32(0xC0000107) +#define STATUS_CONNECTION_IN_USE __constant_cpu_to_le32(0xC0000108) +#define STATUS_MESSAGE_NOT_FOUND __constant_cpu_to_le32(0xC0000109) +#define STATUS_PROCESS_IS_TERMINATING __constant_cpu_to_le32(0xC000010A) +#define STATUS_INVALID_LOGON_TYPE __constant_cpu_to_le32(0xC000010B) +#define STATUS_NO_GUID_TRANSLATION __constant_cpu_to_le32(0xC000010C) +#define STATUS_CANNOT_IMPERSONATE __constant_cpu_to_le32(0xC000010D) +#define STATUS_IMAGE_ALREADY_LOADED __constant_cpu_to_le32(0xC000010E) +#define STATUS_ABIOS_NOT_PRESENT __constant_cpu_to_le32(0xC000010F) +#define STATUS_ABIOS_LID_NOT_EXIST __constant_cpu_to_le32(0xC0000110) +#define STATUS_ABIOS_LID_ALREADY_OWNED __constant_cpu_to_le32(0xC0000111) +#define STATUS_ABIOS_NOT_LID_OWNER __constant_cpu_to_le32(0xC0000112) +#define STATUS_ABIOS_INVALID_COMMAND __constant_cpu_to_le32(0xC0000113) +#define STATUS_ABIOS_INVALID_LID __constant_cpu_to_le32(0xC0000114) +#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE __constant_cpu_to_le32(0xC0000115) +#define STATUS_ABIOS_INVALID_SELECTOR __constant_cpu_to_le32(0xC0000116) +#define STATUS_NO_LDT __constant_cpu_to_le32(0xC0000117) +#define STATUS_INVALID_LDT_SIZE __constant_cpu_to_le32(0xC0000118) +#define STATUS_INVALID_LDT_OFFSET __constant_cpu_to_le32(0xC0000119) +#define STATUS_INVALID_LDT_DESCRIPTOR __constant_cpu_to_le32(0xC000011A) +#define STATUS_INVALID_IMAGE_NE_FORMAT __constant_cpu_to_le32(0xC000011B) +#define STATUS_RXACT_INVALID_STATE __constant_cpu_to_le32(0xC000011C) +#define STATUS_RXACT_COMMIT_FAILURE __constant_cpu_to_le32(0xC000011D) +#define STATUS_MAPPED_FILE_SIZE_ZERO __constant_cpu_to_le32(0xC000011E) +#define STATUS_TOO_MANY_OPENED_FILES __constant_cpu_to_le32(0xC000011F) +#define STATUS_CANCELLED __constant_cpu_to_le32(0xC0000120) +#define STATUS_CANNOT_DELETE __constant_cpu_to_le32(0xC0000121) +#define STATUS_INVALID_COMPUTER_NAME __constant_cpu_to_le32(0xC0000122) +#define STATUS_FILE_DELETED __constant_cpu_to_le32(0xC0000123) +#define STATUS_SPECIAL_ACCOUNT __constant_cpu_to_le32(0xC0000124) +#define STATUS_SPECIAL_GROUP __constant_cpu_to_le32(0xC0000125) +#define STATUS_SPECIAL_USER __constant_cpu_to_le32(0xC0000126) +#define STATUS_MEMBERS_PRIMARY_GROUP __constant_cpu_to_le32(0xC0000127) +#define STATUS_FILE_CLOSED __constant_cpu_to_le32(0xC0000128) +#define STATUS_TOO_MANY_THREADS __constant_cpu_to_le32(0xC0000129) +#define STATUS_THREAD_NOT_IN_PROCESS __constant_cpu_to_le32(0xC000012A) +#define STATUS_TOKEN_ALREADY_IN_USE __constant_cpu_to_le32(0xC000012B) +#define STATUS_PAGEFILE_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC000012C) +#define STATUS_COMMITMENT_LIMIT __constant_cpu_to_le32(0xC000012D) +#define STATUS_INVALID_IMAGE_LE_FORMAT __constant_cpu_to_le32(0xC000012E) +#define STATUS_INVALID_IMAGE_NOT_MZ __constant_cpu_to_le32(0xC000012F) +#define STATUS_INVALID_IMAGE_PROTECT __constant_cpu_to_le32(0xC0000130) +#define STATUS_INVALID_IMAGE_WIN_16 __constant_cpu_to_le32(0xC0000131) +#define STATUS_LOGON_SERVER_CONFLICT __constant_cpu_to_le32(0xC0000132) +#define STATUS_TIME_DIFFERENCE_AT_DC __constant_cpu_to_le32(0xC0000133) +#define STATUS_SYNCHRONIZATION_REQUIRED __constant_cpu_to_le32(0xC0000134) +#define STATUS_DLL_NOT_FOUND __constant_cpu_to_le32(0xC0000135) +#define STATUS_OPEN_FAILED __constant_cpu_to_le32(0xC0000136) +#define STATUS_IO_PRIVILEGE_FAILED __constant_cpu_to_le32(0xC0000137) +#define STATUS_ORDINAL_NOT_FOUND __constant_cpu_to_le32(0xC0000138) +#define STATUS_ENTRYPOINT_NOT_FOUND __constant_cpu_to_le32(0xC0000139) +#define STATUS_CONTROL_C_EXIT __constant_cpu_to_le32(0xC000013A) +#define STATUS_LOCAL_DISCONNECT __constant_cpu_to_le32(0xC000013B) +#define STATUS_REMOTE_DISCONNECT __constant_cpu_to_le32(0xC000013C) +#define STATUS_REMOTE_RESOURCES __constant_cpu_to_le32(0xC000013D) +#define STATUS_LINK_FAILED __constant_cpu_to_le32(0xC000013E) +#define STATUS_LINK_TIMEOUT __constant_cpu_to_le32(0xC000013F) +#define STATUS_INVALID_CONNECTION __constant_cpu_to_le32(0xC0000140) +#define STATUS_INVALID_ADDRESS __constant_cpu_to_le32(0xC0000141) +#define STATUS_DLL_INIT_FAILED __constant_cpu_to_le32(0xC0000142) +#define STATUS_MISSING_SYSTEMFILE __constant_cpu_to_le32(0xC0000143) +#define STATUS_UNHANDLED_EXCEPTION __constant_cpu_to_le32(0xC0000144) +#define STATUS_APP_INIT_FAILURE __constant_cpu_to_le32(0xC0000145) +#define STATUS_PAGEFILE_CREATE_FAILED __constant_cpu_to_le32(0xC0000146) +#define STATUS_NO_PAGEFILE __constant_cpu_to_le32(0xC0000147) +#define STATUS_INVALID_LEVEL __constant_cpu_to_le32(0xC0000148) +#define STATUS_WRONG_PASSWORD_CORE __constant_cpu_to_le32(0xC0000149) +#define STATUS_ILLEGAL_FLOAT_CONTEXT __constant_cpu_to_le32(0xC000014A) +#define STATUS_PIPE_BROKEN __constant_cpu_to_le32(0xC000014B) +#define STATUS_REGISTRY_CORRUPT __constant_cpu_to_le32(0xC000014C) +#define STATUS_REGISTRY_IO_FAILED __constant_cpu_to_le32(0xC000014D) +#define STATUS_NO_EVENT_PAIR __constant_cpu_to_le32(0xC000014E) +#define STATUS_UNRECOGNIZED_VOLUME __constant_cpu_to_le32(0xC000014F) +#define STATUS_SERIAL_NO_DEVICE_INITED __constant_cpu_to_le32(0xC0000150) +#define STATUS_NO_SUCH_ALIAS __constant_cpu_to_le32(0xC0000151) +#define STATUS_MEMBER_NOT_IN_ALIAS __constant_cpu_to_le32(0xC0000152) +#define STATUS_MEMBER_IN_ALIAS __constant_cpu_to_le32(0xC0000153) +#define STATUS_ALIAS_EXISTS __constant_cpu_to_le32(0xC0000154) +#define STATUS_LOGON_NOT_GRANTED __constant_cpu_to_le32(0xC0000155) +#define STATUS_TOO_MANY_SECRETS __constant_cpu_to_le32(0xC0000156) +#define STATUS_SECRET_TOO_LONG __constant_cpu_to_le32(0xC0000157) +#define STATUS_INTERNAL_DB_ERROR __constant_cpu_to_le32(0xC0000158) +#define STATUS_FULLSCREEN_MODE __constant_cpu_to_le32(0xC0000159) +#define STATUS_TOO_MANY_CONTEXT_IDS __constant_cpu_to_le32(0xC000015A) +#define STATUS_LOGON_TYPE_NOT_GRANTED __constant_cpu_to_le32(0xC000015B) +#define STATUS_NOT_REGISTRY_FILE __constant_cpu_to_le32(0xC000015C) +#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED __constant_cpu_to_le32(0xC000015D) +#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR __constant_cpu_to_le32(0xC000015E) +#define STATUS_FT_MISSING_MEMBER __constant_cpu_to_le32(0xC000015F) +#define STATUS_ILL_FORMED_SERVICE_ENTRY __constant_cpu_to_le32(0xC0000160) +#define STATUS_ILLEGAL_CHARACTER __constant_cpu_to_le32(0xC0000161) +#define STATUS_UNMAPPABLE_CHARACTER __constant_cpu_to_le32(0xC0000162) +#define STATUS_UNDEFINED_CHARACTER __constant_cpu_to_le32(0xC0000163) +#define STATUS_FLOPPY_VOLUME __constant_cpu_to_le32(0xC0000164) +#define STATUS_FLOPPY_ID_MARK_NOT_FOUND __constant_cpu_to_le32(0xC0000165) +#define STATUS_FLOPPY_WRONG_CYLINDER __constant_cpu_to_le32(0xC0000166) +#define STATUS_FLOPPY_UNKNOWN_ERROR __constant_cpu_to_le32(0xC0000167) +#define STATUS_FLOPPY_BAD_REGISTERS __constant_cpu_to_le32(0xC0000168) +#define STATUS_DISK_RECALIBRATE_FAILED __constant_cpu_to_le32(0xC0000169) +#define STATUS_DISK_OPERATION_FAILED __constant_cpu_to_le32(0xC000016A) +#define STATUS_DISK_RESET_FAILED __constant_cpu_to_le32(0xC000016B) +#define STATUS_SHARED_IRQ_BUSY __constant_cpu_to_le32(0xC000016C) +#define STATUS_FT_ORPHANING __constant_cpu_to_le32(0xC000016D) +#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT __constant_cpu_to_le32(0xC000016E) +#define STATUS_PARTITION_FAILURE __constant_cpu_to_le32(0xC0000172) +#define STATUS_INVALID_BLOCK_LENGTH __constant_cpu_to_le32(0xC0000173) +#define STATUS_DEVICE_NOT_PARTITIONED __constant_cpu_to_le32(0xC0000174) +#define STATUS_UNABLE_TO_LOCK_MEDIA __constant_cpu_to_le32(0xC0000175) +#define STATUS_UNABLE_TO_UNLOAD_MEDIA __constant_cpu_to_le32(0xC0000176) +#define STATUS_EOM_OVERFLOW __constant_cpu_to_le32(0xC0000177) +#define STATUS_NO_MEDIA __constant_cpu_to_le32(0xC0000178) +#define STATUS_NO_SUCH_MEMBER __constant_cpu_to_le32(0xC000017A) +#define STATUS_INVALID_MEMBER __constant_cpu_to_le32(0xC000017B) +#define STATUS_KEY_DELETED __constant_cpu_to_le32(0xC000017C) +#define STATUS_NO_LOG_SPACE __constant_cpu_to_le32(0xC000017D) +#define STATUS_TOO_MANY_SIDS __constant_cpu_to_le32(0xC000017E) +#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED __constant_cpu_to_le32(0xC000017F) +#define STATUS_KEY_HAS_CHILDREN __constant_cpu_to_le32(0xC0000180) +#define STATUS_CHILD_MUST_BE_VOLATILE __constant_cpu_to_le32(0xC0000181) +#define STATUS_DEVICE_CONFIGURATION_ERROR __constant_cpu_to_le32(0xC0000182) +#define STATUS_DRIVER_INTERNAL_ERROR __constant_cpu_to_le32(0xC0000183) +#define STATUS_INVALID_DEVICE_STATE __constant_cpu_to_le32(0xC0000184) +#define STATUS_IO_DEVICE_ERROR __constant_cpu_to_le32(0xC0000185) +#define STATUS_DEVICE_PROTOCOL_ERROR __constant_cpu_to_le32(0xC0000186) +#define STATUS_BACKUP_CONTROLLER __constant_cpu_to_le32(0xC0000187) +#define STATUS_LOG_FILE_FULL __constant_cpu_to_le32(0xC0000188) +#define STATUS_TOO_LATE __constant_cpu_to_le32(0xC0000189) +#define STATUS_NO_TRUST_LSA_SECRET __constant_cpu_to_le32(0xC000018A) +#define STATUS_NO_TRUST_SAM_ACCOUNT __constant_cpu_to_le32(0xC000018B) +#define STATUS_TRUSTED_DOMAIN_FAILURE __constant_cpu_to_le32(0xC000018C) +#define STATUS_TRUSTED_RELATIONSHIP_FAILURE __constant_cpu_to_le32(0xC000018D) +#define STATUS_EVENTLOG_FILE_CORRUPT __constant_cpu_to_le32(0xC000018E) +#define STATUS_EVENTLOG_CANT_START __constant_cpu_to_le32(0xC000018F) +#define STATUS_TRUST_FAILURE __constant_cpu_to_le32(0xC0000190) +#define STATUS_MUTANT_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC0000191) +#define STATUS_NETLOGON_NOT_STARTED __constant_cpu_to_le32(0xC0000192) +#define STATUS_ACCOUNT_EXPIRED __constant_cpu_to_le32(0xC0000193) +#define STATUS_POSSIBLE_DEADLOCK __constant_cpu_to_le32(0xC0000194) +#define STATUS_NETWORK_CREDENTIAL_CONFLICT __constant_cpu_to_le32(0xC0000195) +#define STATUS_REMOTE_SESSION_LIMIT __constant_cpu_to_le32(0xC0000196) +#define STATUS_EVENTLOG_FILE_CHANGED __constant_cpu_to_le32(0xC0000197) +#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT __constant_cpu_to_le32(0xC0000198) +#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT __constant_cpu_to_le32(0xC0000199) +#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT __constant_cpu_to_le32(0xC000019A) +#define STATUS_DOMAIN_TRUST_INCONSISTENT __constant_cpu_to_le32(0xC000019B) +#define STATUS_FS_DRIVER_REQUIRED __constant_cpu_to_le32(0xC000019C) +#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL __constant_cpu_to_le32(0xC000019D) +#define STATUS_NETWORK_OPEN_RESTRICTION __constant_cpu_to_le32(0xC0000201) +#define STATUS_NO_USER_SESSION_KEY __constant_cpu_to_le32(0xC0000202) +#define STATUS_USER_SESSION_DELETED __constant_cpu_to_le32(0xC0000203) +#define STATUS_RESOURCE_LANG_NOT_FOUND __constant_cpu_to_le32(0xC0000204) +#define STATUS_INSUFF_SERVER_RESOURCES __constant_cpu_to_le32(0xC0000205) +#define STATUS_INVALID_BUFFER_SIZE __constant_cpu_to_le32(0xC0000206) +#define STATUS_INVALID_ADDRESS_COMPONENT __constant_cpu_to_le32(0xC0000207) +#define STATUS_INVALID_ADDRESS_WILDCARD __constant_cpu_to_le32(0xC0000208) +#define STATUS_TOO_MANY_ADDRESSES __constant_cpu_to_le32(0xC0000209) +#define STATUS_ADDRESS_ALREADY_EXISTS __constant_cpu_to_le32(0xC000020A) +#define STATUS_ADDRESS_CLOSED __constant_cpu_to_le32(0xC000020B) +#define STATUS_CONNECTION_DISCONNECTED __constant_cpu_to_le32(0xC000020C) +#define STATUS_CONNECTION_RESET __constant_cpu_to_le32(0xC000020D) +#define STATUS_TOO_MANY_NODES __constant_cpu_to_le32(0xC000020E) +#define STATUS_TRANSACTION_ABORTED __constant_cpu_to_le32(0xC000020F) +#define STATUS_TRANSACTION_TIMED_OUT __constant_cpu_to_le32(0xC0000210) +#define STATUS_TRANSACTION_NO_RELEASE __constant_cpu_to_le32(0xC0000211) +#define STATUS_TRANSACTION_NO_MATCH __constant_cpu_to_le32(0xC0000212) +#define STATUS_TRANSACTION_RESPONDED __constant_cpu_to_le32(0xC0000213) +#define STATUS_TRANSACTION_INVALID_ID __constant_cpu_to_le32(0xC0000214) +#define STATUS_TRANSACTION_INVALID_TYPE __constant_cpu_to_le32(0xC0000215) +#define STATUS_NOT_SERVER_SESSION __constant_cpu_to_le32(0xC0000216) +#define STATUS_NOT_CLIENT_SESSION __constant_cpu_to_le32(0xC0000217) +#define STATUS_CANNOT_LOAD_REGISTRY_FILE __constant_cpu_to_le32(0xC0000218) +#define STATUS_DEBUG_ATTACH_FAILED __constant_cpu_to_le32(0xC0000219) +#define STATUS_SYSTEM_PROCESS_TERMINATED __constant_cpu_to_le32(0xC000021A) +#define STATUS_DATA_NOT_ACCEPTED __constant_cpu_to_le32(0xC000021B) +#define STATUS_NO_BROWSER_SERVERS_FOUND __constant_cpu_to_le32(0xC000021C) +#define STATUS_VDM_HARD_ERROR __constant_cpu_to_le32(0xC000021D) +#define STATUS_DRIVER_CANCEL_TIMEOUT __constant_cpu_to_le32(0xC000021E) +#define STATUS_REPLY_MESSAGE_MISMATCH __constant_cpu_to_le32(0xC000021F) +#define STATUS_MAPPED_ALIGNMENT __constant_cpu_to_le32(0xC0000220) +#define STATUS_IMAGE_CHECKSUM_MISMATCH __constant_cpu_to_le32(0xC0000221) +#define STATUS_LOST_WRITEBEHIND_DATA __constant_cpu_to_le32(0xC0000222) +#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID __constant_cpu_to_le32(0xC0000223) +#define STATUS_PASSWORD_MUST_CHANGE __constant_cpu_to_le32(0xC0000224) +#define STATUS_NOT_FOUND __constant_cpu_to_le32(0xC0000225) +#define STATUS_NOT_TINY_STREAM __constant_cpu_to_le32(0xC0000226) +#define STATUS_RECOVERY_FAILURE __constant_cpu_to_le32(0xC0000227) +#define STATUS_STACK_OVERFLOW_READ __constant_cpu_to_le32(0xC0000228) +#define STATUS_FAIL_CHECK __constant_cpu_to_le32(0xC0000229) +#define STATUS_DUPLICATE_OBJECTID __constant_cpu_to_le32(0xC000022A) +#define STATUS_OBJECTID_EXISTS __constant_cpu_to_le32(0xC000022B) +#define STATUS_CONVERT_TO_LARGE __constant_cpu_to_le32(0xC000022C) +#define STATUS_RETRY __constant_cpu_to_le32(0xC000022D) +#define STATUS_FOUND_OUT_OF_SCOPE __constant_cpu_to_le32(0xC000022E) +#define STATUS_ALLOCATE_BUCKET __constant_cpu_to_le32(0xC000022F) +#define STATUS_PROPSET_NOT_FOUND __constant_cpu_to_le32(0xC0000230) +#define STATUS_MARSHALL_OVERFLOW __constant_cpu_to_le32(0xC0000231) +#define STATUS_INVALID_VARIANT __constant_cpu_to_le32(0xC0000232) +#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND __constant_cpu_to_le32(0xC0000233) +#define STATUS_ACCOUNT_LOCKED_OUT __constant_cpu_to_le32(0xC0000234) +#define STATUS_HANDLE_NOT_CLOSABLE __constant_cpu_to_le32(0xC0000235) +#define STATUS_CONNECTION_REFUSED __constant_cpu_to_le32(0xC0000236) +#define STATUS_GRACEFUL_DISCONNECT __constant_cpu_to_le32(0xC0000237) +#define STATUS_ADDRESS_ALREADY_ASSOCIATED __constant_cpu_to_le32(0xC0000238) +#define STATUS_ADDRESS_NOT_ASSOCIATED __constant_cpu_to_le32(0xC0000239) +#define STATUS_CONNECTION_INVALID __constant_cpu_to_le32(0xC000023A) +#define STATUS_CONNECTION_ACTIVE __constant_cpu_to_le32(0xC000023B) +#define STATUS_NETWORK_UNREACHABLE __constant_cpu_to_le32(0xC000023C) +#define STATUS_HOST_UNREACHABLE __constant_cpu_to_le32(0xC000023D) +#define STATUS_PROTOCOL_UNREACHABLE __constant_cpu_to_le32(0xC000023E) +#define STATUS_PORT_UNREACHABLE __constant_cpu_to_le32(0xC000023F) +#define STATUS_REQUEST_ABORTED __constant_cpu_to_le32(0xC0000240) +#define STATUS_CONNECTION_ABORTED __constant_cpu_to_le32(0xC0000241) +#define STATUS_BAD_COMPRESSION_BUFFER __constant_cpu_to_le32(0xC0000242) +#define STATUS_USER_MAPPED_FILE __constant_cpu_to_le32(0xC0000243) +#define STATUS_AUDIT_FAILED __constant_cpu_to_le32(0xC0000244) +#define STATUS_TIMER_RESOLUTION_NOT_SET __constant_cpu_to_le32(0xC0000245) +#define STATUS_CONNECTION_COUNT_LIMIT __constant_cpu_to_le32(0xC0000246) +#define STATUS_LOGIN_TIME_RESTRICTION __constant_cpu_to_le32(0xC0000247) +#define STATUS_LOGIN_WKSTA_RESTRICTION __constant_cpu_to_le32(0xC0000248) +#define STATUS_IMAGE_MP_UP_MISMATCH __constant_cpu_to_le32(0xC0000249) +#define STATUS_INSUFFICIENT_LOGON_INFO __constant_cpu_to_le32(0xC0000250) +#define STATUS_BAD_DLL_ENTRYPOINT __constant_cpu_to_le32(0xC0000251) +#define STATUS_BAD_SERVICE_ENTRYPOINT __constant_cpu_to_le32(0xC0000252) +#define STATUS_LPC_REPLY_LOST __constant_cpu_to_le32(0xC0000253) +#define STATUS_IP_ADDRESS_CONFLICT1 __constant_cpu_to_le32(0xC0000254) +#define STATUS_IP_ADDRESS_CONFLICT2 __constant_cpu_to_le32(0xC0000255) +#define STATUS_REGISTRY_QUOTA_LIMIT __constant_cpu_to_le32(0xC0000256) +#define STATUS_PATH_NOT_COVERED __constant_cpu_to_le32(0xC0000257) +#define STATUS_NO_CALLBACK_ACTIVE __constant_cpu_to_le32(0xC0000258) +#define STATUS_LICENSE_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000259) +#define STATUS_PWD_TOO_SHORT __constant_cpu_to_le32(0xC000025A) +#define STATUS_PWD_TOO_RECENT __constant_cpu_to_le32(0xC000025B) +#define STATUS_PWD_HISTORY_CONFLICT __constant_cpu_to_le32(0xC000025C) +#define STATUS_PLUGPLAY_NO_DEVICE __constant_cpu_to_le32(0xC000025E) +#define STATUS_UNSUPPORTED_COMPRESSION __constant_cpu_to_le32(0xC000025F) +#define STATUS_INVALID_HW_PROFILE __constant_cpu_to_le32(0xC0000260) +#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH __constant_cpu_to_le32(0xC0000261) +#define STATUS_DRIVER_ORDINAL_NOT_FOUND __constant_cpu_to_le32(0xC0000262) +#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND __constant_cpu_to_le32(0xC0000263) +#define STATUS_RESOURCE_NOT_OWNED __constant_cpu_to_le32(0xC0000264) +#define STATUS_TOO_MANY_LINKS __constant_cpu_to_le32(0xC0000265) +#define STATUS_QUOTA_LIST_INCONSISTENT __constant_cpu_to_le32(0xC0000266) +#define STATUS_FILE_IS_OFFLINE __constant_cpu_to_le32(0xC0000267) +#define STATUS_EVALUATION_EXPIRATION __constant_cpu_to_le32(0xC0000268) +#define STATUS_ILLEGAL_DLL_RELOCATION __constant_cpu_to_le32(0xC0000269) +#define STATUS_LICENSE_VIOLATION __constant_cpu_to_le32(0xC000026A) +#define STATUS_DLL_INIT_FAILED_LOGOFF __constant_cpu_to_le32(0xC000026B) +#define STATUS_DRIVER_UNABLE_TO_LOAD __constant_cpu_to_le32(0xC000026C) +#define STATUS_DFS_UNAVAILABLE __constant_cpu_to_le32(0xC000026D) +#define STATUS_VOLUME_DISMOUNTED __constant_cpu_to_le32(0xC000026E) +#define STATUS_WX86_INTERNAL_ERROR __constant_cpu_to_le32(0xC000026F) +#define STATUS_WX86_FLOAT_STACK_CHECK __constant_cpu_to_le32(0xC0000270) +#define STATUS_VALIDATE_CONTINUE __constant_cpu_to_le32(0xC0000271) +#define STATUS_NO_MATCH __constant_cpu_to_le32(0xC0000272) +#define STATUS_NO_MORE_MATCHES __constant_cpu_to_le32(0xC0000273) +#define STATUS_NOT_A_REPARSE_POINT __constant_cpu_to_le32(0xC0000275) +#define STATUS_IO_REPARSE_TAG_INVALID __constant_cpu_to_le32(0xC0000276) +#define STATUS_IO_REPARSE_TAG_MISMATCH __constant_cpu_to_le32(0xC0000277) +#define STATUS_IO_REPARSE_DATA_INVALID __constant_cpu_to_le32(0xC0000278) +#define STATUS_IO_REPARSE_TAG_NOT_HANDLED __constant_cpu_to_le32(0xC0000279) +#define STATUS_REPARSE_POINT_NOT_RESOLVED __constant_cpu_to_le32(0xC0000280) +#define STATUS_DIRECTORY_IS_A_REPARSE_POINT __constant_cpu_to_le32(0xC0000281) +#define STATUS_RANGE_LIST_CONFLICT __constant_cpu_to_le32(0xC0000282) +#define STATUS_SOURCE_ELEMENT_EMPTY __constant_cpu_to_le32(0xC0000283) +#define STATUS_DESTINATION_ELEMENT_FULL __constant_cpu_to_le32(0xC0000284) +#define STATUS_ILLEGAL_ELEMENT_ADDRESS __constant_cpu_to_le32(0xC0000285) +#define STATUS_MAGAZINE_NOT_PRESENT __constant_cpu_to_le32(0xC0000286) +#define STATUS_REINITIALIZATION_NEEDED __constant_cpu_to_le32(0xC0000287) +#define STATUS_ENCRYPTION_FAILED __constant_cpu_to_le32(0xC000028A) +#define STATUS_DECRYPTION_FAILED __constant_cpu_to_le32(0xC000028B) +#define STATUS_RANGE_NOT_FOUND __constant_cpu_to_le32(0xC000028C) +#define STATUS_NO_RECOVERY_POLICY __constant_cpu_to_le32(0xC000028D) +#define STATUS_NO_EFS __constant_cpu_to_le32(0xC000028E) +#define STATUS_WRONG_EFS __constant_cpu_to_le32(0xC000028F) +#define STATUS_NO_USER_KEYS __constant_cpu_to_le32(0xC0000290) +#define STATUS_FILE_NOT_ENCRYPTED __constant_cpu_to_le32(0xC0000291) +#define STATUS_NOT_EXPORT_FORMAT __constant_cpu_to_le32(0xC0000292) +#define STATUS_FILE_ENCRYPTED __constant_cpu_to_le32(0xC0000293) +#define STATUS_WMI_GUID_NOT_FOUND __constant_cpu_to_le32(0xC0000295) +#define STATUS_WMI_INSTANCE_NOT_FOUND __constant_cpu_to_le32(0xC0000296) +#define STATUS_WMI_ITEMID_NOT_FOUND __constant_cpu_to_le32(0xC0000297) +#define STATUS_WMI_TRY_AGAIN __constant_cpu_to_le32(0xC0000298) +#define STATUS_SHARED_POLICY __constant_cpu_to_le32(0xC0000299) +#define STATUS_POLICY_OBJECT_NOT_FOUND __constant_cpu_to_le32(0xC000029A) +#define STATUS_POLICY_ONLY_IN_DS __constant_cpu_to_le32(0xC000029B) +#define STATUS_VOLUME_NOT_UPGRADED __constant_cpu_to_le32(0xC000029C) +#define STATUS_REMOTE_STORAGE_NOT_ACTIVE __constant_cpu_to_le32(0xC000029D) +#define STATUS_REMOTE_STORAGE_MEDIA_ERROR __constant_cpu_to_le32(0xC000029E) +#define STATUS_NO_TRACKING_SERVICE __constant_cpu_to_le32(0xC000029F) +#define STATUS_SERVER_SID_MISMATCH __constant_cpu_to_le32(0xC00002A0) +#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE __constant_cpu_to_le32(0xC00002A1) +#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX __constant_cpu_to_le32(0xC00002A2) +#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED __constant_cpu_to_le32(0xC00002A3) +#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS __constant_cpu_to_le32(0xC00002A4) +#define STATUS_DS_BUSY __constant_cpu_to_le32(0xC00002A5) +#define STATUS_DS_UNAVAILABLE __constant_cpu_to_le32(0xC00002A6) +#define STATUS_DS_NO_RIDS_ALLOCATED __constant_cpu_to_le32(0xC00002A7) +#define STATUS_DS_NO_MORE_RIDS __constant_cpu_to_le32(0xC00002A8) +#define STATUS_DS_INCORRECT_ROLE_OWNER __constant_cpu_to_le32(0xC00002A9) +#define STATUS_DS_RIDMGR_INIT_ERROR __constant_cpu_to_le32(0xC00002AA) +#define STATUS_DS_OBJ_CLASS_VIOLATION __constant_cpu_to_le32(0xC00002AB) +#define STATUS_DS_CANT_ON_NON_LEAF __constant_cpu_to_le32(0xC00002AC) +#define STATUS_DS_CANT_ON_RDN __constant_cpu_to_le32(0xC00002AD) +#define STATUS_DS_CANT_MOD_OBJ_CLASS __constant_cpu_to_le32(0xC00002AE) +#define STATUS_DS_CROSS_DOM_MOVE_FAILED __constant_cpu_to_le32(0xC00002AF) +#define STATUS_DS_GC_NOT_AVAILABLE __constant_cpu_to_le32(0xC00002B0) +#define STATUS_DIRECTORY_SERVICE_REQUIRED __constant_cpu_to_le32(0xC00002B1) +#define STATUS_REPARSE_ATTRIBUTE_CONFLICT __constant_cpu_to_le32(0xC00002B2) +#define STATUS_CANT_ENABLE_DENY_ONLY __constant_cpu_to_le32(0xC00002B3) +#define STATUS_FLOAT_MULTIPLE_FAULTS __constant_cpu_to_le32(0xC00002B4) +#define STATUS_FLOAT_MULTIPLE_TRAPS __constant_cpu_to_le32(0xC00002B5) +#define STATUS_DEVICE_REMOVED __constant_cpu_to_le32(0xC00002B6) +#define STATUS_JOURNAL_DELETE_IN_PROGRESS __constant_cpu_to_le32(0xC00002B7) +#define STATUS_JOURNAL_NOT_ACTIVE __constant_cpu_to_le32(0xC00002B8) +#define STATUS_NOINTERFACE __constant_cpu_to_le32(0xC00002B9) +#define STATUS_DS_ADMIN_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC00002C1) +#define STATUS_DRIVER_FAILED_SLEEP __constant_cpu_to_le32(0xC00002C2) +#define STATUS_MUTUAL_AUTHENTICATION_FAILED __constant_cpu_to_le32(0xC00002C3) +#define STATUS_CORRUPT_SYSTEM_FILE __constant_cpu_to_le32(0xC00002C4) +#define STATUS_DATATYPE_MISALIGNMENT_ERROR __constant_cpu_to_le32(0xC00002C5) +#define STATUS_WMI_READ_ONLY __constant_cpu_to_le32(0xC00002C6) +#define STATUS_WMI_SET_FAILURE __constant_cpu_to_le32(0xC00002C7) +#define STATUS_COMMITMENT_MINIMUM __constant_cpu_to_le32(0xC00002C8) +#define STATUS_REG_NAT_CONSUMPTION __constant_cpu_to_le32(0xC00002C9) +#define STATUS_TRANSPORT_FULL __constant_cpu_to_le32(0xC00002CA) +#define STATUS_DS_SAM_INIT_FAILURE __constant_cpu_to_le32(0xC00002CB) +#define STATUS_ONLY_IF_CONNECTED __constant_cpu_to_le32(0xC00002CC) +#define STATUS_DS_SENSITIVE_GROUP_VIOLATION __constant_cpu_to_le32(0xC00002CD) +#define STATUS_PNP_RESTART_ENUMERATION __constant_cpu_to_le32(0xC00002CE) +#define STATUS_JOURNAL_ENTRY_DELETED __constant_cpu_to_le32(0xC00002CF) +#define STATUS_DS_CANT_MOD_PRIMARYGROUPID __constant_cpu_to_le32(0xC00002D0) +#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE __constant_cpu_to_le32(0xC00002D1) +#define STATUS_PNP_REBOOT_REQUIRED __constant_cpu_to_le32(0xC00002D2) +#define STATUS_POWER_STATE_INVALID __constant_cpu_to_le32(0xC00002D3) +#define STATUS_DS_INVALID_GROUP_TYPE __constant_cpu_to_le32(0xC00002D4) +#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN __constant_cpu_to_le32(0xC00002D5) +#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN __constant_cpu_to_le32(0xC00002D6) +#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002D7) +#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER __constant_cpu_to_le32(0xC00002D8) +#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002D9) +#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER __constant_cpu_to_le32(0xC00002DA) +#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002DB) +#define STATUS_DS_HAVE_PRIMARY_MEMBERS __constant_cpu_to_le32(0xC00002DC) +#define STATUS_WMI_NOT_SUPPORTED __constant_cpu_to_le32(0xC00002DD) +#define STATUS_INSUFFICIENT_POWER __constant_cpu_to_le32(0xC00002DE) +#define STATUS_SAM_NEED_BOOTKEY_PASSWORD __constant_cpu_to_le32(0xC00002DF) +#define STATUS_SAM_NEED_BOOTKEY_FLOPPY __constant_cpu_to_le32(0xC00002E0) +#define STATUS_DS_CANT_START __constant_cpu_to_le32(0xC00002E1) +#define STATUS_DS_INIT_FAILURE __constant_cpu_to_le32(0xC00002E2) +#define STATUS_SAM_INIT_FAILURE __constant_cpu_to_le32(0xC00002E3) +#define STATUS_DS_GC_REQUIRED __constant_cpu_to_le32(0xC00002E4) +#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY __constant_cpu_to_le32(0xC00002E5) +#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS __constant_cpu_to_le32(0xC00002E6) +#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC00002E7) +#define STATUS_MULTIPLE_FAULT_VIOLATION __constant_cpu_to_le32(0xC00002E8) +#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED __constant_cpu_to_le32(0xC00002E9) +#define STATUS_CANNOT_MAKE __constant_cpu_to_le32(0xC00002EA) +#define STATUS_SYSTEM_SHUTDOWN __constant_cpu_to_le32(0xC00002EB) +#define STATUS_DS_INIT_FAILURE_CONSOLE __constant_cpu_to_le32(0xC00002EC) +#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE __constant_cpu_to_le32(0xC00002ED) +#define STATUS_UNFINISHED_CONTEXT_DELETED __constant_cpu_to_le32(0xC00002EE) +#define STATUS_NO_TGT_REPLY __constant_cpu_to_le32(0xC00002EF) +#define STATUS_OBJECTID_NOT_FOUND __constant_cpu_to_le32(0xC00002F0) +#define STATUS_NO_IP_ADDRESSES __constant_cpu_to_le32(0xC00002F1) +#define STATUS_WRONG_CREDENTIAL_HANDLE __constant_cpu_to_le32(0xC00002F2) +#define STATUS_CRYPTO_SYSTEM_INVALID __constant_cpu_to_le32(0xC00002F3) +#define STATUS_MAX_REFERRALS_EXCEEDED __constant_cpu_to_le32(0xC00002F4) +#define STATUS_MUST_BE_KDC __constant_cpu_to_le32(0xC00002F5) +#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED __constant_cpu_to_le32(0xC00002F6) +#define STATUS_TOO_MANY_PRINCIPALS __constant_cpu_to_le32(0xC00002F7) +#define STATUS_NO_PA_DATA __constant_cpu_to_le32(0xC00002F8) +#define STATUS_PKINIT_NAME_MISMATCH __constant_cpu_to_le32(0xC00002F9) +#define STATUS_SMARTCARD_LOGON_REQUIRED __constant_cpu_to_le32(0xC00002FA) +#define STATUS_KDC_INVALID_REQUEST __constant_cpu_to_le32(0xC00002FB) +#define STATUS_KDC_UNABLE_TO_REFER __constant_cpu_to_le32(0xC00002FC) +#define STATUS_KDC_UNKNOWN_ETYPE __constant_cpu_to_le32(0xC00002FD) +#define STATUS_SHUTDOWN_IN_PROGRESS __constant_cpu_to_le32(0xC00002FE) +#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS __constant_cpu_to_le32(0xC00002FF) +#define STATUS_NOT_SUPPORTED_ON_SBS __constant_cpu_to_le32(0xC0000300) +#define STATUS_WMI_GUID_DISCONNECTED __constant_cpu_to_le32(0xC0000301) +#define STATUS_WMI_ALREADY_DISABLED __constant_cpu_to_le32(0xC0000302) +#define STATUS_WMI_ALREADY_ENABLED __constant_cpu_to_le32(0xC0000303) +#define STATUS_MFT_TOO_FRAGMENTED __constant_cpu_to_le32(0xC0000304) +#define STATUS_COPY_PROTECTION_FAILURE __constant_cpu_to_le32(0xC0000305) +#define STATUS_CSS_AUTHENTICATION_FAILURE __constant_cpu_to_le32(0xC0000306) +#define STATUS_CSS_KEY_NOT_PRESENT __constant_cpu_to_le32(0xC0000307) +#define STATUS_CSS_KEY_NOT_ESTABLISHED __constant_cpu_to_le32(0xC0000308) +#define STATUS_CSS_SCRAMBLED_SECTOR __constant_cpu_to_le32(0xC0000309) +#define STATUS_CSS_REGION_MISMATCH __constant_cpu_to_le32(0xC000030A) +#define STATUS_CSS_RESETS_EXHAUSTED __constant_cpu_to_le32(0xC000030B) +#define STATUS_PKINIT_FAILURE __constant_cpu_to_le32(0xC0000320) +#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE __constant_cpu_to_le32(0xC0000321) +#define STATUS_NO_KERB_KEY __constant_cpu_to_le32(0xC0000322) +#define STATUS_HOST_DOWN __constant_cpu_to_le32(0xC0000350) +#define STATUS_UNSUPPORTED_PREAUTH __constant_cpu_to_le32(0xC0000351) +#define STATUS_EFS_ALG_BLOB_TOO_BIG __constant_cpu_to_le32(0xC0000352) +#define STATUS_PORT_NOT_SET __constant_cpu_to_le32(0xC0000353) +#define STATUS_DEBUGGER_INACTIVE __constant_cpu_to_le32(0xC0000354) +#define STATUS_DS_VERSION_CHECK_FAILURE __constant_cpu_to_le32(0xC0000355) +#define STATUS_AUDITING_DISABLED __constant_cpu_to_le32(0xC0000356) +#define STATUS_PRENT4_MACHINE_ACCOUNT __constant_cpu_to_le32(0xC0000357) +#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER __constant_cpu_to_le32(0xC0000358) +#define STATUS_INVALID_IMAGE_WIN_32 __constant_cpu_to_le32(0xC0000359) +#define STATUS_INVALID_IMAGE_WIN_64 __constant_cpu_to_le32(0xC000035A) +#define STATUS_BAD_BINDINGS __constant_cpu_to_le32(0xC000035B) +#define STATUS_NETWORK_SESSION_EXPIRED __constant_cpu_to_le32(0xC000035C) +#define STATUS_APPHELP_BLOCK __constant_cpu_to_le32(0xC000035D) +#define STATUS_ALL_SIDS_FILTERED __constant_cpu_to_le32(0xC000035E) +#define STATUS_NOT_SAFE_MODE_DRIVER __constant_cpu_to_le32(0xC000035F) +#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT __constant_cpu_to_le32(0xC0000361) +#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH __constant_cpu_to_le32(0xC0000362) +#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER __constant_cpu_to_le32(0xC0000363) +#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER __constant_cpu_to_le32(0xC0000364) +#define STATUS_FAILED_DRIVER_ENTRY __constant_cpu_to_le32(0xC0000365) +#define STATUS_DEVICE_ENUMERATION_ERROR __constant_cpu_to_le32(0xC0000366) +#define STATUS_MOUNT_POINT_NOT_RESOLVED __constant_cpu_to_le32(0xC0000368) +#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER __constant_cpu_to_le32(0xC0000369) +#define STATUS_MCA_OCCURED __constant_cpu_to_le32(0xC000036A) +#define STATUS_DRIVER_BLOCKED_CRITICAL __constant_cpu_to_le32(0xC000036B) +#define STATUS_DRIVER_BLOCKED __constant_cpu_to_le32(0xC000036C) +#define STATUS_DRIVER_DATABASE_ERROR __constant_cpu_to_le32(0xC000036D) +#define STATUS_SYSTEM_HIVE_TOO_LARGE __constant_cpu_to_le32(0xC000036E) +#define STATUS_INVALID_IMPORT_OF_NON_DLL __constant_cpu_to_le32(0xC000036F) +#define STATUS_NO_SECRETS __constant_cpu_to_le32(0xC0000371) +#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY __constant_cpu_to_le32(0xC0000372) +#define STATUS_FAILED_STACK_SWITCH __constant_cpu_to_le32(0xC0000373) +#define STATUS_HEAP_CORRUPTION __constant_cpu_to_le32(0xC0000374) +#define STATUS_SMARTCARD_WRONG_PIN __constant_cpu_to_le32(0xC0000380) +#define STATUS_SMARTCARD_CARD_BLOCKED __constant_cpu_to_le32(0xC0000381) +#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED __constant_cpu_to_le32(0xC0000382) +#define STATUS_SMARTCARD_NO_CARD __constant_cpu_to_le32(0xC0000383) +#define STATUS_SMARTCARD_NO_KEY_CONTAINER __constant_cpu_to_le32(0xC0000384) +#define STATUS_SMARTCARD_NO_CERTIFICATE __constant_cpu_to_le32(0xC0000385) +#define STATUS_SMARTCARD_NO_KEYSET __constant_cpu_to_le32(0xC0000386) +#define STATUS_SMARTCARD_IO_ERROR __constant_cpu_to_le32(0xC0000387) +#define STATUS_DOWNGRADE_DETECTED __constant_cpu_to_le32(0xC0000388) +#define STATUS_SMARTCARD_CERT_REVOKED __constant_cpu_to_le32(0xC0000389) +#define STATUS_ISSUING_CA_UNTRUSTED __constant_cpu_to_le32(0xC000038A) +#define STATUS_REVOCATION_OFFLINE_C __constant_cpu_to_le32(0xC000038B) +#define STATUS_PKINIT_CLIENT_FAILURE __constant_cpu_to_le32(0xC000038C) +#define STATUS_SMARTCARD_CERT_EXPIRED __constant_cpu_to_le32(0xC000038D) +#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD __constant_cpu_to_le32(0xC000038E) +#define STATUS_SMARTCARD_SILENT_CONTEXT __constant_cpu_to_le32(0xC000038F) +#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000401) +#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000402) +#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000403) +#define STATUS_DS_NAME_NOT_UNIQUE __constant_cpu_to_le32(0xC0000404) +#define STATUS_DS_DUPLICATE_ID_FOUND __constant_cpu_to_le32(0xC0000405) +#define STATUS_DS_GROUP_CONVERSION_ERROR __constant_cpu_to_le32(0xC0000406) +#define STATUS_VOLSNAP_PREPARE_HIBERNATE __constant_cpu_to_le32(0xC0000407) +#define STATUS_USER2USER_REQUIRED __constant_cpu_to_le32(0xC0000408) +#define STATUS_STACK_BUFFER_OVERRUN __constant_cpu_to_le32(0xC0000409) +#define STATUS_NO_S4U_PROT_SUPPORT __constant_cpu_to_le32(0xC000040A) +#define STATUS_CROSSREALM_DELEGATION_FAILURE __constant_cpu_to_le32(0xC000040B) +#define STATUS_REVOCATION_OFFLINE_KDC __constant_cpu_to_le32(0xC000040C) +#define STATUS_ISSUING_CA_UNTRUSTED_KDC __constant_cpu_to_le32(0xC000040D) +#define STATUS_KDC_CERT_EXPIRED __constant_cpu_to_le32(0xC000040E) +#define STATUS_KDC_CERT_REVOKED __constant_cpu_to_le32(0xC000040F) +#define STATUS_PARAMETER_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000410) +#define STATUS_HIBERNATION_FAILURE __constant_cpu_to_le32(0xC0000411) +#define STATUS_DELAY_LOAD_FAILED __constant_cpu_to_le32(0xC0000412) +#define STATUS_AUTHENTICATION_FIREWALL_FAILED __constant_cpu_to_le32(0xC0000413) +#define STATUS_VDM_DISALLOWED __constant_cpu_to_le32(0xC0000414) +#define STATUS_HUNG_DISPLAY_DRIVER_THREAD __constant_cpu_to_le32(0xC0000415) +#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE __constant_cpu_to_le32(0xC0000416) +#define STATUS_INVALID_CRUNTIME_PARAMETER __constant_cpu_to_le32(0xC0000417) +#define STATUS_NTLM_BLOCKED __constant_cpu_to_le32(0xC0000418) +#define STATUS_ASSERTION_FAILURE __constant_cpu_to_le32(0xC0000420) +#define STATUS_VERIFIER_STOP __constant_cpu_to_le32(0xC0000421) +#define STATUS_CALLBACK_POP_STACK __constant_cpu_to_le32(0xC0000423) +#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED __constant_cpu_to_le32(0xC0000424) +#define STATUS_HIVE_UNLOADED __constant_cpu_to_le32(0xC0000425) +#define STATUS_COMPRESSION_DISABLED __constant_cpu_to_le32(0xC0000426) +#define STATUS_FILE_SYSTEM_LIMITATION __constant_cpu_to_le32(0xC0000427) +#define STATUS_INVALID_IMAGE_HASH __constant_cpu_to_le32(0xC0000428) +#define STATUS_NOT_CAPABLE __constant_cpu_to_le32(0xC0000429) +#define STATUS_REQUEST_OUT_OF_SEQUENCE __constant_cpu_to_le32(0xC000042A) +#define STATUS_IMPLEMENTATION_LIMIT __constant_cpu_to_le32(0xC000042B) +#define STATUS_ELEVATION_REQUIRED __constant_cpu_to_le32(0xC000042C) +#define STATUS_BEYOND_VDL __constant_cpu_to_le32(0xC0000432) +#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS __constant_cpu_to_le32(0xC0000433) +#define STATUS_PTE_CHANGED __constant_cpu_to_le32(0xC0000434) +#define STATUS_PURGE_FAILED __constant_cpu_to_le32(0xC0000435) +#define STATUS_CRED_REQUIRES_CONFIRMATION __constant_cpu_to_le32(0xC0000440) +#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE __constant_cpu_to_le32(0xC0000441) +#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER __constant_cpu_to_le32(0xC0000442) +#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE __constant_cpu_to_le32(0xC0000443) +#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE __constant_cpu_to_le32(0xC0000444) +#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE __constant_cpu_to_le32(0xC0000445) +#define STATUS_INVALID_LABEL __constant_cpu_to_le32(0xC0000446) +#define STATUS_DRIVER_PROCESS_TERMINATED __constant_cpu_to_le32(0xC0000450) +#define STATUS_AMBIGUOUS_SYSTEM_DEVICE __constant_cpu_to_le32(0xC0000451) +#define STATUS_SYSTEM_DEVICE_NOT_FOUND __constant_cpu_to_le32(0xC0000452) +#define STATUS_RESTART_BOOT_APPLICATION __constant_cpu_to_le32(0xC0000453) +#define STATUS_INVALID_TASK_NAME __constant_cpu_to_le32(0xC0000500) +#define STATUS_INVALID_TASK_INDEX __constant_cpu_to_le32(0xC0000501) +#define STATUS_THREAD_ALREADY_IN_TASK __constant_cpu_to_le32(0xC0000502) +#define STATUS_CALLBACK_BYPASS __constant_cpu_to_le32(0xC0000503) +#define STATUS_PORT_CLOSED __constant_cpu_to_le32(0xC0000700) +#define STATUS_MESSAGE_LOST __constant_cpu_to_le32(0xC0000701) +#define STATUS_INVALID_MESSAGE __constant_cpu_to_le32(0xC0000702) +#define STATUS_REQUEST_CANCELED __constant_cpu_to_le32(0xC0000703) +#define STATUS_RECURSIVE_DISPATCH __constant_cpu_to_le32(0xC0000704) +#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED __constant_cpu_to_le32(0xC0000705) +#define STATUS_LPC_INVALID_CONNECTION_USAGE __constant_cpu_to_le32(0xC0000706) +#define STATUS_LPC_REQUESTS_NOT_ALLOWED __constant_cpu_to_le32(0xC0000707) +#define STATUS_RESOURCE_IN_USE __constant_cpu_to_le32(0xC0000708) +#define STATUS_HARDWARE_MEMORY_ERROR __constant_cpu_to_le32(0xC0000709) +#define STATUS_THREADPOOL_HANDLE_EXCEPTION __constant_cpu_to_le32(0xC000070A) +#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070B) +#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070C) +#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070D) +#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070E) +#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION __constant_cpu_to_le32(0xC000070F) +#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING __constant_cpu_to_le32(0xC0000710) +#define STATUS_APC_RETURNED_WHILE_IMPERSONATING __constant_cpu_to_le32(0xC0000711) +#define STATUS_PROCESS_IS_PROTECTED __constant_cpu_to_le32(0xC0000712) +#define STATUS_MCA_EXCEPTION __constant_cpu_to_le32(0xC0000713) +#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE __constant_cpu_to_le32(0xC0000714) +#define STATUS_SYMLINK_CLASS_DISABLED __constant_cpu_to_le32(0xC0000715) +#define STATUS_INVALID_IDN_NORMALIZATION __constant_cpu_to_le32(0xC0000716) +#define STATUS_NO_UNICODE_TRANSLATION __constant_cpu_to_le32(0xC0000717) +#define STATUS_ALREADY_REGISTERED __constant_cpu_to_le32(0xC0000718) +#define STATUS_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0000719) +#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST __constant_cpu_to_le32(0xC000071A) +#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY __constant_cpu_to_le32(0xC000071B) +#define STATUS_INVALID_THREAD __constant_cpu_to_le32(0xC000071C) +#define STATUS_CALLBACK_RETURNED_TRANSACTION __constant_cpu_to_le32(0xC000071D) +#define STATUS_CALLBACK_RETURNED_LDR_LOCK __constant_cpu_to_le32(0xC000071E) +#define STATUS_CALLBACK_RETURNED_LANG __constant_cpu_to_le32(0xC000071F) +#define STATUS_CALLBACK_RETURNED_PRI_BACK __constant_cpu_to_le32(0xC0000720) +#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY __constant_cpu_to_le32(0xC0000721) +#define STATUS_DISK_REPAIR_DISABLED __constant_cpu_to_le32(0xC0000800) +#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS __constant_cpu_to_le32(0xC0000801) +#define STATUS_DISK_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000802) +#define STATUS_CONTENT_BLOCKED __constant_cpu_to_le32(0xC0000804) +#define STATUS_BAD_CLUSTERS __constant_cpu_to_le32(0xC0000805) +#define STATUS_VOLUME_DIRTY __constant_cpu_to_le32(0xC0000806) +#define STATUS_FILE_CHECKED_OUT __constant_cpu_to_le32(0xC0000901) +#define STATUS_CHECKOUT_REQUIRED __constant_cpu_to_le32(0xC0000902) +#define STATUS_BAD_FILE_TYPE __constant_cpu_to_le32(0xC0000903) +#define STATUS_FILE_TOO_LARGE __constant_cpu_to_le32(0xC0000904) +#define STATUS_FORMS_AUTH_REQUIRED __constant_cpu_to_le32(0xC0000905) +#define STATUS_VIRUS_INFECTED __constant_cpu_to_le32(0xC0000906) +#define STATUS_VIRUS_DELETED __constant_cpu_to_le32(0xC0000907) +#define STATUS_BAD_MCFG_TABLE __constant_cpu_to_le32(0xC0000908) +#define STATUS_WOW_ASSERTION __constant_cpu_to_le32(0xC0009898) +#define STATUS_INVALID_SIGNATURE __constant_cpu_to_le32(0xC000A000) +#define STATUS_HMAC_NOT_SUPPORTED __constant_cpu_to_le32(0xC000A001) +#define STATUS_IPSEC_QUEUE_OVERFLOW __constant_cpu_to_le32(0xC000A010) +#define STATUS_ND_QUEUE_OVERFLOW __constant_cpu_to_le32(0xC000A011) +#define STATUS_HOPLIMIT_EXCEEDED __constant_cpu_to_le32(0xC000A012) +#define STATUS_PROTOCOL_NOT_SUPPORTED __constant_cpu_to_le32(0xC000A013) +#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED __constant_cpu_to_le32(0xC000A080) +#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR __constant_cpu_to_le32(0xC000A081) +#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR __constant_cpu_to_le32(0xC000A082) +#define STATUS_XML_PARSE_ERROR __constant_cpu_to_le32(0xC000A083) +#define STATUS_XMLDSIG_ERROR __constant_cpu_to_le32(0xC000A084) +#define STATUS_WRONG_COMPARTMENT __constant_cpu_to_le32(0xC000A085) +#define STATUS_AUTHIP_FAILURE __constant_cpu_to_le32(0xC000A086) +#define DBG_NO_STATE_CHANGE __constant_cpu_to_le32(0xC0010001) +#define DBG_APP_NOT_IDLE __constant_cpu_to_le32(0xC0010002) +#define RPC_NT_INVALID_STRING_BINDING __constant_cpu_to_le32(0xC0020001) +#define RPC_NT_WRONG_KIND_OF_BINDING __constant_cpu_to_le32(0xC0020002) +#define RPC_NT_INVALID_BINDING __constant_cpu_to_le32(0xC0020003) +#define RPC_NT_PROTSEQ_NOT_SUPPORTED __constant_cpu_to_le32(0xC0020004) +#define RPC_NT_INVALID_RPC_PROTSEQ __constant_cpu_to_le32(0xC0020005) +#define RPC_NT_INVALID_STRING_UUID __constant_cpu_to_le32(0xC0020006) +#define RPC_NT_INVALID_ENDPOINT_FORMAT __constant_cpu_to_le32(0xC0020007) +#define RPC_NT_INVALID_NET_ADDR __constant_cpu_to_le32(0xC0020008) +#define RPC_NT_NO_ENDPOINT_FOUND __constant_cpu_to_le32(0xC0020009) +#define RPC_NT_INVALID_TIMEOUT __constant_cpu_to_le32(0xC002000A) +#define RPC_NT_OBJECT_NOT_FOUND __constant_cpu_to_le32(0xC002000B) +#define RPC_NT_ALREADY_REGISTERED __constant_cpu_to_le32(0xC002000C) +#define RPC_NT_TYPE_ALREADY_REGISTERED __constant_cpu_to_le32(0xC002000D) +#define RPC_NT_ALREADY_LISTENING __constant_cpu_to_le32(0xC002000E) +#define RPC_NT_NO_PROTSEQS_REGISTERED __constant_cpu_to_le32(0xC002000F) +#define RPC_NT_NOT_LISTENING __constant_cpu_to_le32(0xC0020010) +#define RPC_NT_UNKNOWN_MGR_TYPE __constant_cpu_to_le32(0xC0020011) +#define RPC_NT_UNKNOWN_IF __constant_cpu_to_le32(0xC0020012) +#define RPC_NT_NO_BINDINGS __constant_cpu_to_le32(0xC0020013) +#define RPC_NT_NO_PROTSEQS __constant_cpu_to_le32(0xC0020014) +#define RPC_NT_CANT_CREATE_ENDPOINT __constant_cpu_to_le32(0xC0020015) +#define RPC_NT_OUT_OF_RESOURCES __constant_cpu_to_le32(0xC0020016) +#define RPC_NT_SERVER_UNAVAILABLE __constant_cpu_to_le32(0xC0020017) +#define RPC_NT_SERVER_TOO_BUSY __constant_cpu_to_le32(0xC0020018) +#define RPC_NT_INVALID_NETWORK_OPTIONS __constant_cpu_to_le32(0xC0020019) +#define RPC_NT_NO_CALL_ACTIVE __constant_cpu_to_le32(0xC002001A) +#define RPC_NT_CALL_FAILED __constant_cpu_to_le32(0xC002001B) +#define RPC_NT_CALL_FAILED_DNE __constant_cpu_to_le32(0xC002001C) +#define RPC_NT_PROTOCOL_ERROR __constant_cpu_to_le32(0xC002001D) +#define RPC_NT_UNSUPPORTED_TRANS_SYN __constant_cpu_to_le32(0xC002001F) +#define RPC_NT_UNSUPPORTED_TYPE __constant_cpu_to_le32(0xC0020021) +#define RPC_NT_INVALID_TAG __constant_cpu_to_le32(0xC0020022) +#define RPC_NT_INVALID_BOUND __constant_cpu_to_le32(0xC0020023) +#define RPC_NT_NO_ENTRY_NAME __constant_cpu_to_le32(0xC0020024) +#define RPC_NT_INVALID_NAME_SYNTAX __constant_cpu_to_le32(0xC0020025) +#define RPC_NT_UNSUPPORTED_NAME_SYNTAX __constant_cpu_to_le32(0xC0020026) +#define RPC_NT_UUID_NO_ADDRESS __constant_cpu_to_le32(0xC0020028) +#define RPC_NT_DUPLICATE_ENDPOINT __constant_cpu_to_le32(0xC0020029) +#define RPC_NT_UNKNOWN_AUTHN_TYPE __constant_cpu_to_le32(0xC002002A) +#define RPC_NT_MAX_CALLS_TOO_SMALL __constant_cpu_to_le32(0xC002002B) +#define RPC_NT_STRING_TOO_LONG __constant_cpu_to_le32(0xC002002C) +#define RPC_NT_PROTSEQ_NOT_FOUND __constant_cpu_to_le32(0xC002002D) +#define RPC_NT_PROCNUM_OUT_OF_RANGE __constant_cpu_to_le32(0xC002002E) +#define RPC_NT_BINDING_HAS_NO_AUTH __constant_cpu_to_le32(0xC002002F) +#define RPC_NT_UNKNOWN_AUTHN_SERVICE __constant_cpu_to_le32(0xC0020030) +#define RPC_NT_UNKNOWN_AUTHN_LEVEL __constant_cpu_to_le32(0xC0020031) +#define RPC_NT_INVALID_AUTH_IDENTITY __constant_cpu_to_le32(0xC0020032) +#define RPC_NT_UNKNOWN_AUTHZ_SERVICE __constant_cpu_to_le32(0xC0020033) +#define EPT_NT_INVALID_ENTRY __constant_cpu_to_le32(0xC0020034) +#define EPT_NT_CANT_PERFORM_OP __constant_cpu_to_le32(0xC0020035) +#define EPT_NT_NOT_REGISTERED __constant_cpu_to_le32(0xC0020036) +#define RPC_NT_NOTHING_TO_EXPORT __constant_cpu_to_le32(0xC0020037) +#define RPC_NT_INCOMPLETE_NAME __constant_cpu_to_le32(0xC0020038) +#define RPC_NT_INVALID_VERS_OPTION __constant_cpu_to_le32(0xC0020039) +#define RPC_NT_NO_MORE_MEMBERS __constant_cpu_to_le32(0xC002003A) +#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED __constant_cpu_to_le32(0xC002003B) +#define RPC_NT_INTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC002003C) +#define RPC_NT_ENTRY_ALREADY_EXISTS __constant_cpu_to_le32(0xC002003D) +#define RPC_NT_ENTRY_NOT_FOUND __constant_cpu_to_le32(0xC002003E) +#define RPC_NT_NAME_SERVICE_UNAVAILABLE __constant_cpu_to_le32(0xC002003F) +#define RPC_NT_INVALID_NAF_ID __constant_cpu_to_le32(0xC0020040) +#define RPC_NT_CANNOT_SUPPORT __constant_cpu_to_le32(0xC0020041) +#define RPC_NT_NO_CONTEXT_AVAILABLE __constant_cpu_to_le32(0xC0020042) +#define RPC_NT_INTERNAL_ERROR __constant_cpu_to_le32(0xC0020043) +#define RPC_NT_ZERO_DIVIDE __constant_cpu_to_le32(0xC0020044) +#define RPC_NT_ADDRESS_ERROR __constant_cpu_to_le32(0xC0020045) +#define RPC_NT_FP_DIV_ZERO __constant_cpu_to_le32(0xC0020046) +#define RPC_NT_FP_UNDERFLOW __constant_cpu_to_le32(0xC0020047) +#define RPC_NT_FP_OVERFLOW __constant_cpu_to_le32(0xC0020048) +#define RPC_NT_CALL_IN_PROGRESS __constant_cpu_to_le32(0xC0020049) +#define RPC_NT_NO_MORE_BINDINGS __constant_cpu_to_le32(0xC002004A) +#define RPC_NT_GROUP_MEMBER_NOT_FOUND __constant_cpu_to_le32(0xC002004B) +#define EPT_NT_CANT_CREATE __constant_cpu_to_le32(0xC002004C) +#define RPC_NT_INVALID_OBJECT __constant_cpu_to_le32(0xC002004D) +#define RPC_NT_NO_INTERFACES __constant_cpu_to_le32(0xC002004F) +#define RPC_NT_CALL_CANCELLED __constant_cpu_to_le32(0xC0020050) +#define RPC_NT_BINDING_INCOMPLETE __constant_cpu_to_le32(0xC0020051) +#define RPC_NT_COMM_FAILURE __constant_cpu_to_le32(0xC0020052) +#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL __constant_cpu_to_le32(0xC0020053) +#define RPC_NT_NO_PRINC_NAME __constant_cpu_to_le32(0xC0020054) +#define RPC_NT_NOT_RPC_ERROR __constant_cpu_to_le32(0xC0020055) +#define RPC_NT_SEC_PKG_ERROR __constant_cpu_to_le32(0xC0020057) +#define RPC_NT_NOT_CANCELLED __constant_cpu_to_le32(0xC0020058) +#define RPC_NT_INVALID_ASYNC_HANDLE __constant_cpu_to_le32(0xC0020062) +#define RPC_NT_INVALID_ASYNC_CALL __constant_cpu_to_le32(0xC0020063) +#define RPC_NT_PROXY_ACCESS_DENIED __constant_cpu_to_le32(0xC0020064) +#define RPC_NT_NO_MORE_ENTRIES __constant_cpu_to_le32(0xC0030001) +#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL __constant_cpu_to_le32(0xC0030002) +#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE __constant_cpu_to_le32(0xC0030003) +#define RPC_NT_SS_IN_NULL_CONTEXT __constant_cpu_to_le32(0xC0030004) +#define RPC_NT_SS_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0030005) +#define RPC_NT_SS_CONTEXT_DAMAGED __constant_cpu_to_le32(0xC0030006) +#define RPC_NT_SS_HANDLES_MISMATCH __constant_cpu_to_le32(0xC0030007) +#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE __constant_cpu_to_le32(0xC0030008) +#define RPC_NT_NULL_REF_POINTER __constant_cpu_to_le32(0xC0030009) +#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE __constant_cpu_to_le32(0xC003000A) +#define RPC_NT_BYTE_COUNT_TOO_SMALL __constant_cpu_to_le32(0xC003000B) +#define RPC_NT_BAD_STUB_DATA __constant_cpu_to_le32(0xC003000C) +#define RPC_NT_INVALID_ES_ACTION __constant_cpu_to_le32(0xC0030059) +#define RPC_NT_WRONG_ES_VERSION __constant_cpu_to_le32(0xC003005A) +#define RPC_NT_WRONG_STUB_VERSION __constant_cpu_to_le32(0xC003005B) +#define RPC_NT_INVALID_PIPE_OBJECT __constant_cpu_to_le32(0xC003005C) +#define RPC_NT_INVALID_PIPE_OPERATION __constant_cpu_to_le32(0xC003005D) +#define RPC_NT_WRONG_PIPE_VERSION __constant_cpu_to_le32(0xC003005E) +#define RPC_NT_PIPE_CLOSED __constant_cpu_to_le32(0xC003005F) +#define RPC_NT_PIPE_DISCIPLINE_ERROR __constant_cpu_to_le32(0xC0030060) +#define RPC_NT_PIPE_EMPTY __constant_cpu_to_le32(0xC0030061) +#define STATUS_PNP_BAD_MPS_TABLE __constant_cpu_to_le32(0xC0040035) +#define STATUS_PNP_TRANSLATION_FAILED __constant_cpu_to_le32(0xC0040036) +#define STATUS_PNP_IRQ_TRANSLATION_FAILED __constant_cpu_to_le32(0xC0040037) +#define STATUS_PNP_INVALID_ID __constant_cpu_to_le32(0xC0040038) +#define STATUS_IO_REISSUE_AS_CACHED __constant_cpu_to_le32(0xC0040039) +#define STATUS_CTX_WINSTATION_NAME_INVALID __constant_cpu_to_le32(0xC00A0001) +#define STATUS_CTX_INVALID_PD __constant_cpu_to_le32(0xC00A0002) +#define STATUS_CTX_PD_NOT_FOUND __constant_cpu_to_le32(0xC00A0003) +#define STATUS_CTX_CLOSE_PENDING __constant_cpu_to_le32(0xC00A0006) +#define STATUS_CTX_NO_OUTBUF __constant_cpu_to_le32(0xC00A0007) +#define STATUS_CTX_MODEM_INF_NOT_FOUND __constant_cpu_to_le32(0xC00A0008) +#define STATUS_CTX_INVALID_MODEMNAME __constant_cpu_to_le32(0xC00A0009) +#define STATUS_CTX_RESPONSE_ERROR __constant_cpu_to_le32(0xC00A000A) +#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT __constant_cpu_to_le32(0xC00A000B) +#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER __constant_cpu_to_le32(0xC00A000C) +#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE __constant_cpu_to_le32(0xC00A000D) +#define STATUS_CTX_MODEM_RESPONSE_BUSY __constant_cpu_to_le32(0xC00A000E) +#define STATUS_CTX_MODEM_RESPONSE_VOICE __constant_cpu_to_le32(0xC00A000F) +#define STATUS_CTX_TD_ERROR __constant_cpu_to_le32(0xC00A0010) +#define STATUS_CTX_LICENSE_CLIENT_INVALID __constant_cpu_to_le32(0xC00A0012) +#define STATUS_CTX_LICENSE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00A0013) +#define STATUS_CTX_LICENSE_EXPIRED __constant_cpu_to_le32(0xC00A0014) +#define STATUS_CTX_WINSTATION_NOT_FOUND __constant_cpu_to_le32(0xC00A0015) +#define STATUS_CTX_WINSTATION_NAME_COLLISION __constant_cpu_to_le32(0xC00A0016) +#define STATUS_CTX_WINSTATION_BUSY __constant_cpu_to_le32(0xC00A0017) +#define STATUS_CTX_BAD_VIDEO_MODE __constant_cpu_to_le32(0xC00A0018) +#define STATUS_CTX_GRAPHICS_INVALID __constant_cpu_to_le32(0xC00A0022) +#define STATUS_CTX_NOT_CONSOLE __constant_cpu_to_le32(0xC00A0024) +#define STATUS_CTX_CLIENT_QUERY_TIMEOUT __constant_cpu_to_le32(0xC00A0026) +#define STATUS_CTX_CONSOLE_DISCONNECT __constant_cpu_to_le32(0xC00A0027) +#define STATUS_CTX_CONSOLE_CONNECT __constant_cpu_to_le32(0xC00A0028) +#define STATUS_CTX_SHADOW_DENIED __constant_cpu_to_le32(0xC00A002A) +#define STATUS_CTX_WINSTATION_ACCESS_DENIED __constant_cpu_to_le32(0xC00A002B) +#define STATUS_CTX_INVALID_WD __constant_cpu_to_le32(0xC00A002E) +#define STATUS_CTX_WD_NOT_FOUND __constant_cpu_to_le32(0xC00A002F) +#define STATUS_CTX_SHADOW_INVALID __constant_cpu_to_le32(0xC00A0030) +#define STATUS_CTX_SHADOW_DISABLED __constant_cpu_to_le32(0xC00A0031) +#define STATUS_RDP_PROTOCOL_ERROR __constant_cpu_to_le32(0xC00A0032) +#define STATUS_CTX_CLIENT_LICENSE_NOT_SET __constant_cpu_to_le32(0xC00A0033) +#define STATUS_CTX_CLIENT_LICENSE_IN_USE __constant_cpu_to_le32(0xC00A0034) +#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE __constant_cpu_to_le32(0xC00A0035) +#define STATUS_CTX_SHADOW_NOT_RUNNING __constant_cpu_to_le32(0xC00A0036) +#define STATUS_CTX_LOGON_DISABLED __constant_cpu_to_le32(0xC00A0037) +#define STATUS_CTX_SECURITY_LAYER_ERROR __constant_cpu_to_le32(0xC00A0038) +#define STATUS_TS_INCOMPATIBLE_SESSIONS __constant_cpu_to_le32(0xC00A0039) +#define STATUS_MUI_FILE_NOT_FOUND __constant_cpu_to_le32(0xC00B0001) +#define STATUS_MUI_INVALID_FILE __constant_cpu_to_le32(0xC00B0002) +#define STATUS_MUI_INVALID_RC_CONFIG __constant_cpu_to_le32(0xC00B0003) +#define STATUS_MUI_INVALID_LOCALE_NAME __constant_cpu_to_le32(0xC00B0004) +#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME __constant_cpu_to_le32(0xC00B0005) +#define STATUS_MUI_FILE_NOT_LOADED __constant_cpu_to_le32(0xC00B0006) +#define STATUS_RESOURCE_ENUM_USER_STOP __constant_cpu_to_le32(0xC00B0007) +#define STATUS_CLUSTER_INVALID_NODE __constant_cpu_to_le32(0xC0130001) +#define STATUS_CLUSTER_NODE_EXISTS __constant_cpu_to_le32(0xC0130002) +#define STATUS_CLUSTER_JOIN_IN_PROGRESS __constant_cpu_to_le32(0xC0130003) +#define STATUS_CLUSTER_NODE_NOT_FOUND __constant_cpu_to_le32(0xC0130004) +#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND __constant_cpu_to_le32(0xC0130005) +#define STATUS_CLUSTER_NETWORK_EXISTS __constant_cpu_to_le32(0xC0130006) +#define STATUS_CLUSTER_NETWORK_NOT_FOUND __constant_cpu_to_le32(0xC0130007) +#define STATUS_CLUSTER_NETINTERFACE_EXISTS __constant_cpu_to_le32(0xC0130008) +#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC0130009) +#define STATUS_CLUSTER_INVALID_REQUEST __constant_cpu_to_le32(0xC013000A) +#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER __constant_cpu_to_le32(0xC013000B) +#define STATUS_CLUSTER_NODE_DOWN __constant_cpu_to_le32(0xC013000C) +#define STATUS_CLUSTER_NODE_UNREACHABLE __constant_cpu_to_le32(0xC013000D) +#define STATUS_CLUSTER_NODE_NOT_MEMBER __constant_cpu_to_le32(0xC013000E) +#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS __constant_cpu_to_le32(0xC013000F) +#define STATUS_CLUSTER_INVALID_NETWORK __constant_cpu_to_le32(0xC0130010) +#define STATUS_CLUSTER_NO_NET_ADAPTERS __constant_cpu_to_le32(0xC0130011) +#define STATUS_CLUSTER_NODE_UP __constant_cpu_to_le32(0xC0130012) +#define STATUS_CLUSTER_NODE_PAUSED __constant_cpu_to_le32(0xC0130013) +#define STATUS_CLUSTER_NODE_NOT_PAUSED __constant_cpu_to_le32(0xC0130014) +#define STATUS_CLUSTER_NO_SECURITY_CONTEXT __constant_cpu_to_le32(0xC0130015) +#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL __constant_cpu_to_le32(0xC0130016) +#define STATUS_CLUSTER_POISONED __constant_cpu_to_le32(0xC0130017) +#define STATUS_ACPI_INVALID_OPCODE __constant_cpu_to_le32(0xC0140001) +#define STATUS_ACPI_STACK_OVERFLOW __constant_cpu_to_le32(0xC0140002) +#define STATUS_ACPI_ASSERT_FAILED __constant_cpu_to_le32(0xC0140003) +#define STATUS_ACPI_INVALID_INDEX __constant_cpu_to_le32(0xC0140004) +#define STATUS_ACPI_INVALID_ARGUMENT __constant_cpu_to_le32(0xC0140005) +#define STATUS_ACPI_FATAL __constant_cpu_to_le32(0xC0140006) +#define STATUS_ACPI_INVALID_SUPERNAME __constant_cpu_to_le32(0xC0140007) +#define STATUS_ACPI_INVALID_ARGTYPE __constant_cpu_to_le32(0xC0140008) +#define STATUS_ACPI_INVALID_OBJTYPE __constant_cpu_to_le32(0xC0140009) +#define STATUS_ACPI_INVALID_TARGETTYPE __constant_cpu_to_le32(0xC014000A) +#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT __constant_cpu_to_le32(0xC014000B) +#define STATUS_ACPI_ADDRESS_NOT_MAPPED __constant_cpu_to_le32(0xC014000C) +#define STATUS_ACPI_INVALID_EVENTTYPE __constant_cpu_to_le32(0xC014000D) +#define STATUS_ACPI_HANDLER_COLLISION __constant_cpu_to_le32(0xC014000E) +#define STATUS_ACPI_INVALID_DATA __constant_cpu_to_le32(0xC014000F) +#define STATUS_ACPI_INVALID_REGION __constant_cpu_to_le32(0xC0140010) +#define STATUS_ACPI_INVALID_ACCESS_SIZE __constant_cpu_to_le32(0xC0140011) +#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK __constant_cpu_to_le32(0xC0140012) +#define STATUS_ACPI_ALREADY_INITIALIZED __constant_cpu_to_le32(0xC0140013) +#define STATUS_ACPI_NOT_INITIALIZED __constant_cpu_to_le32(0xC0140014) +#define STATUS_ACPI_INVALID_MUTEX_LEVEL __constant_cpu_to_le32(0xC0140015) +#define STATUS_ACPI_MUTEX_NOT_OWNED __constant_cpu_to_le32(0xC0140016) +#define STATUS_ACPI_MUTEX_NOT_OWNER __constant_cpu_to_le32(0xC0140017) +#define STATUS_ACPI_RS_ACCESS __constant_cpu_to_le32(0xC0140018) +#define STATUS_ACPI_INVALID_TABLE __constant_cpu_to_le32(0xC0140019) +#define STATUS_ACPI_REG_HANDLER_FAILED __constant_cpu_to_le32(0xC0140020) +#define STATUS_ACPI_POWER_REQUEST_FAILED __constant_cpu_to_le32(0xC0140021) +#define STATUS_SXS_SECTION_NOT_FOUND __constant_cpu_to_le32(0xC0150001) +#define STATUS_SXS_CANT_GEN_ACTCTX __constant_cpu_to_le32(0xC0150002) +#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT __constant_cpu_to_le32(0xC0150003) +#define STATUS_SXS_ASSEMBLY_NOT_FOUND __constant_cpu_to_le32(0xC0150004) +#define STATUS_SXS_MANIFEST_FORMAT_ERROR __constant_cpu_to_le32(0xC0150005) +#define STATUS_SXS_MANIFEST_PARSE_ERROR __constant_cpu_to_le32(0xC0150006) +#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED __constant_cpu_to_le32(0xC0150007) +#define STATUS_SXS_KEY_NOT_FOUND __constant_cpu_to_le32(0xC0150008) +#define STATUS_SXS_VERSION_CONFLICT __constant_cpu_to_le32(0xC0150009) +#define STATUS_SXS_WRONG_SECTION_TYPE __constant_cpu_to_le32(0xC015000A) +#define STATUS_SXS_THREAD_QUERIES_DISABLED __constant_cpu_to_le32(0xC015000B) +#define STATUS_SXS_ASSEMBLY_MISSING __constant_cpu_to_le32(0xC015000C) +#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET __constant_cpu_to_le32(0xC015000E) +#define STATUS_SXS_EARLY_DEACTIVATION __constant_cpu_to_le32(0xC015000F) +#define STATUS_SXS_INVALID_DEACTIVATION __constant_cpu_to_le32(0xC0150010) +#define STATUS_SXS_MULTIPLE_DEACTIVATION __constant_cpu_to_le32(0xC0150011) +#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY __constant_cpu_to_le32(0xC0150012) +#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED __constant_cpu_to_le32(0xC0150013) +#define STATUS_SXS_CORRUPT_ACTIVATION_STACK __constant_cpu_to_le32(0xC0150014) +#define STATUS_SXS_CORRUPTION __constant_cpu_to_le32(0xC0150015) +#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE __constant_cpu_to_le32(0xC0150016) +#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME __constant_cpu_to_le32(0xC0150017) +#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE __constant_cpu_to_le32(0xC0150018) +#define STATUS_SXS_IDENTITY_PARSE_ERROR __constant_cpu_to_le32(0xC0150019) +#define STATUS_SXS_COMPONENT_STORE_CORRUPT __constant_cpu_to_le32(0xC015001A) +#define STATUS_SXS_FILE_HASH_MISMATCH __constant_cpu_to_le32(0xC015001B) +#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT __constant_cpu_to_le32(0xC015001C) +#define STATUS_SXS_IDENTITIES_DIFFERENT __constant_cpu_to_le32(0xC015001D) +#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT __constant_cpu_to_le32(0xC015001E) +#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY __constant_cpu_to_le32(0xC015001F) +#define STATUS_ADVANCED_INSTALLER_FAILED __constant_cpu_to_le32(0xC0150020) +#define STATUS_XML_ENCODING_MISMATCH __constant_cpu_to_le32(0xC0150021) +#define STATUS_SXS_MANIFEST_TOO_BIG __constant_cpu_to_le32(0xC0150022) +#define STATUS_SXS_SETTING_NOT_REGISTERED __constant_cpu_to_le32(0xC0150023) +#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE __constant_cpu_to_le32(0xC0150024) +#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED __constant_cpu_to_le32(0xC0150025) +#define STATUS_GENERIC_COMMAND_FAILED __constant_cpu_to_le32(0xC0150026) +#define STATUS_SXS_FILE_HASH_MISSING __constant_cpu_to_le32(0xC0150027) +#define STATUS_TRANSACTIONAL_CONFLICT __constant_cpu_to_le32(0xC0190001) +#define STATUS_INVALID_TRANSACTION __constant_cpu_to_le32(0xC0190002) +#define STATUS_TRANSACTION_NOT_ACTIVE __constant_cpu_to_le32(0xC0190003) +#define STATUS_TM_INITIALIZATION_FAILED __constant_cpu_to_le32(0xC0190004) +#define STATUS_RM_NOT_ACTIVE __constant_cpu_to_le32(0xC0190005) +#define STATUS_RM_METADATA_CORRUPT __constant_cpu_to_le32(0xC0190006) +#define STATUS_TRANSACTION_NOT_JOINED __constant_cpu_to_le32(0xC0190007) +#define STATUS_DIRECTORY_NOT_RM __constant_cpu_to_le32(0xC0190008) +#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE __constant_cpu_to_le32(0xC019000A) +#define STATUS_LOG_RESIZE_INVALID_SIZE __constant_cpu_to_le32(0xC019000B) +#define STATUS_REMOTE_FILE_VERSION_MISMATCH __constant_cpu_to_le32(0xC019000C) +#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS __constant_cpu_to_le32(0xC019000F) +#define STATUS_TRANSACTION_PROPAGATION_FAILED __constant_cpu_to_le32(0xC0190010) +#define STATUS_CRM_PROTOCOL_NOT_FOUND __constant_cpu_to_le32(0xC0190011) +#define STATUS_TRANSACTION_SUPERIOR_EXISTS __constant_cpu_to_le32(0xC0190012) +#define STATUS_TRANSACTION_REQUEST_NOT_VALID __constant_cpu_to_le32(0xC0190013) +#define STATUS_TRANSACTION_NOT_REQUESTED __constant_cpu_to_le32(0xC0190014) +#define STATUS_TRANSACTION_ALREADY_ABORTED __constant_cpu_to_le32(0xC0190015) +#define STATUS_TRANSACTION_ALREADY_COMMITTED __constant_cpu_to_le32(0xC0190016) +#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER __constant_cpu_to_le32(0xC0190017) +#define STATUS_CURRENT_TRANSACTION_NOT_VALID __constant_cpu_to_le32(0xC0190018) +#define STATUS_LOG_GROWTH_FAILED __constant_cpu_to_le32(0xC0190019) +#define STATUS_OBJECT_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC0190021) +#define STATUS_STREAM_MINIVERSION_NOT_FOUND __constant_cpu_to_le32(0xC0190022) +#define STATUS_STREAM_MINIVERSION_NOT_VALID __constant_cpu_to_le32(0xC0190023) +#define STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION __constant_cpu_to_le32(0xC0190024) +#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT __constant_cpu_to_le32(0xC0190025) +#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS __constant_cpu_to_le32(0xC0190026) +#define STATUS_HANDLE_NO_LONGER_VALID __constant_cpu_to_le32(0xC0190028) +#define STATUS_LOG_CORRUPTION_DETECTED __constant_cpu_to_le32(0xC0190030) +#define STATUS_RM_DISCONNECTED __constant_cpu_to_le32(0xC0190032) +#define STATUS_ENLISTMENT_NOT_SUPERIOR __constant_cpu_to_le32(0xC0190033) +#define STATUS_FILE_IDENTITY_NOT_PERSISTENT __constant_cpu_to_le32(0xC0190036) +#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY __constant_cpu_to_le32(0xC0190037) +#define STATUS_CANT_CROSS_RM_BOUNDARY __constant_cpu_to_le32(0xC0190038) +#define STATUS_TXF_DIR_NOT_EMPTY __constant_cpu_to_le32(0xC0190039) +#define STATUS_INDOUBT_TRANSACTIONS_EXIST __constant_cpu_to_le32(0xC019003A) +#define STATUS_TM_VOLATILE __constant_cpu_to_le32(0xC019003B) +#define STATUS_ROLLBACK_TIMER_EXPIRED __constant_cpu_to_le32(0xC019003C) +#define STATUS_TXF_ATTRIBUTE_CORRUPT __constant_cpu_to_le32(0xC019003D) +#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC019003E) +#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED __constant_cpu_to_le32(0xC019003F) +#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE __constant_cpu_to_le32(0xC0190040) +#define STATUS_TRANSACTION_REQUIRED_PROMOTION __constant_cpu_to_le32(0xC0190043) +#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION __constant_cpu_to_le32(0xC0190044) +#define STATUS_TRANSACTIONS_NOT_FROZEN __constant_cpu_to_le32(0xC0190045) +#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS __constant_cpu_to_le32(0xC0190046) +#define STATUS_NOT_SNAPSHOT_VOLUME __constant_cpu_to_le32(0xC0190047) +#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES __constant_cpu_to_le32(0xC0190048) +#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC0190049) +#define STATUS_TM_IDENTITY_MISMATCH __constant_cpu_to_le32(0xC019004A) +#define STATUS_FLOATED_SECTION __constant_cpu_to_le32(0xC019004B) +#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK __constant_cpu_to_le32(0xC019004C) +#define STATUS_CANNOT_ABORT_TRANSACTIONS __constant_cpu_to_le32(0xC019004D) +#define STATUS_TRANSACTION_NOT_FOUND __constant_cpu_to_le32(0xC019004E) +#define STATUS_RESOURCEMANAGER_NOT_FOUND __constant_cpu_to_le32(0xC019004F) +#define STATUS_ENLISTMENT_NOT_FOUND __constant_cpu_to_le32(0xC0190050) +#define STATUS_TRANSACTIONMANAGER_NOT_FOUND __constant_cpu_to_le32(0xC0190051) +#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE __constant_cpu_to_le32(0xC0190052) +#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION __constant_cpu_to_le32(0xC0190053) +#define STATUS_TRANSACTION_NOT_ROOT __constant_cpu_to_le32(0xC0190054) +#define STATUS_TRANSACTION_OBJECT_EXPIRED __constant_cpu_to_le32(0xC0190055) +#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC0190056) +#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED __constant_cpu_to_le32(0xC0190057) +#define STATUS_TRANSACTION_RECORD_TOO_LONG __constant_cpu_to_le32(0xC0190058) +#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION __constant_cpu_to_le32(0xC0190059) +#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION __constant_cpu_to_le32(0xC019005A) +#define STATUS_TRANSACTION_INTEGRITY_VIOLATED __constant_cpu_to_le32(0xC019005B) +#define STATUS_LOG_SECTOR_INVALID __constant_cpu_to_le32(0xC01A0001) +#define STATUS_LOG_SECTOR_PARITY_INVALID __constant_cpu_to_le32(0xC01A0002) +#define STATUS_LOG_SECTOR_REMAPPED __constant_cpu_to_le32(0xC01A0003) +#define STATUS_LOG_BLOCK_INCOMPLETE __constant_cpu_to_le32(0xC01A0004) +#define STATUS_LOG_INVALID_RANGE __constant_cpu_to_le32(0xC01A0005) +#define STATUS_LOG_BLOCKS_EXHAUSTED __constant_cpu_to_le32(0xC01A0006) +#define STATUS_LOG_READ_CONTEXT_INVALID __constant_cpu_to_le32(0xC01A0007) +#define STATUS_LOG_RESTART_INVALID __constant_cpu_to_le32(0xC01A0008) +#define STATUS_LOG_BLOCK_VERSION __constant_cpu_to_le32(0xC01A0009) +#define STATUS_LOG_BLOCK_INVALID __constant_cpu_to_le32(0xC01A000A) +#define STATUS_LOG_READ_MODE_INVALID __constant_cpu_to_le32(0xC01A000B) +#define STATUS_LOG_METADATA_CORRUPT __constant_cpu_to_le32(0xC01A000D) +#define STATUS_LOG_METADATA_INVALID __constant_cpu_to_le32(0xC01A000E) +#define STATUS_LOG_METADATA_INCONSISTENT __constant_cpu_to_le32(0xC01A000F) +#define STATUS_LOG_RESERVATION_INVALID __constant_cpu_to_le32(0xC01A0010) +#define STATUS_LOG_CANT_DELETE __constant_cpu_to_le32(0xC01A0011) +#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC01A0012) +#define STATUS_LOG_START_OF_LOG __constant_cpu_to_le32(0xC01A0013) +#define STATUS_LOG_POLICY_ALREADY_INSTALLED __constant_cpu_to_le32(0xC01A0014) +#define STATUS_LOG_POLICY_NOT_INSTALLED __constant_cpu_to_le32(0xC01A0015) +#define STATUS_LOG_POLICY_INVALID __constant_cpu_to_le32(0xC01A0016) +#define STATUS_LOG_POLICY_CONFLICT __constant_cpu_to_le32(0xC01A0017) +#define STATUS_LOG_PINNED_ARCHIVE_TAIL __constant_cpu_to_le32(0xC01A0018) +#define STATUS_LOG_RECORD_NONEXISTENT __constant_cpu_to_le32(0xC01A0019) +#define STATUS_LOG_RECORDS_RESERVED_INVALID __constant_cpu_to_le32(0xC01A001A) +#define STATUS_LOG_SPACE_RESERVED_INVALID __constant_cpu_to_le32(0xC01A001B) +#define STATUS_LOG_TAIL_INVALID __constant_cpu_to_le32(0xC01A001C) +#define STATUS_LOG_FULL __constant_cpu_to_le32(0xC01A001D) +#define STATUS_LOG_MULTIPLEXED __constant_cpu_to_le32(0xC01A001E) +#define STATUS_LOG_DEDICATED __constant_cpu_to_le32(0xC01A001F) +#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS __constant_cpu_to_le32(0xC01A0020) +#define STATUS_LOG_ARCHIVE_IN_PROGRESS __constant_cpu_to_le32(0xC01A0021) +#define STATUS_LOG_EPHEMERAL __constant_cpu_to_le32(0xC01A0022) +#define STATUS_LOG_NOT_ENOUGH_CONTAINERS __constant_cpu_to_le32(0xC01A0023) +#define STATUS_LOG_CLIENT_ALREADY_REGISTERED __constant_cpu_to_le32(0xC01A0024) +#define STATUS_LOG_CLIENT_NOT_REGISTERED __constant_cpu_to_le32(0xC01A0025) +#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS __constant_cpu_to_le32(0xC01A0026) +#define STATUS_LOG_CONTAINER_READ_FAILED __constant_cpu_to_le32(0xC01A0027) +#define STATUS_LOG_CONTAINER_WRITE_FAILED __constant_cpu_to_le32(0xC01A0028) +#define STATUS_LOG_CONTAINER_OPEN_FAILED __constant_cpu_to_le32(0xC01A0029) +#define STATUS_LOG_CONTAINER_STATE_INVALID __constant_cpu_to_le32(0xC01A002A) +#define STATUS_LOG_STATE_INVALID __constant_cpu_to_le32(0xC01A002B) +#define STATUS_LOG_PINNED __constant_cpu_to_le32(0xC01A002C) +#define STATUS_LOG_METADATA_FLUSH_FAILED __constant_cpu_to_le32(0xC01A002D) +#define STATUS_LOG_INCONSISTENT_SECURITY __constant_cpu_to_le32(0xC01A002E) +#define STATUS_LOG_APPENDED_FLUSH_FAILED __constant_cpu_to_le32(0xC01A002F) +#define STATUS_LOG_PINNED_RESERVATION __constant_cpu_to_le32(0xC01A0030) +#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD __constant_cpu_to_le32(0xC01B00EA) +#define STATUS_FLT_NO_HANDLER_DEFINED __constant_cpu_to_le32(0xC01C0001) +#define STATUS_FLT_CONTEXT_ALREADY_DEFINED __constant_cpu_to_le32(0xC01C0002) +#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST __constant_cpu_to_le32(0xC01C0003) +#define STATUS_FLT_DISALLOW_FAST_IO __constant_cpu_to_le32(0xC01C0004) +#define STATUS_FLT_INVALID_NAME_REQUEST __constant_cpu_to_le32(0xC01C0005) +#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION __constant_cpu_to_le32(0xC01C0006) +#define STATUS_FLT_NOT_INITIALIZED __constant_cpu_to_le32(0xC01C0007) +#define STATUS_FLT_FILTER_NOT_READY __constant_cpu_to_le32(0xC01C0008) +#define STATUS_FLT_POST_OPERATION_CLEANUP __constant_cpu_to_le32(0xC01C0009) +#define STATUS_FLT_INTERNAL_ERROR __constant_cpu_to_le32(0xC01C000A) +#define STATUS_FLT_DELETING_OBJECT __constant_cpu_to_le32(0xC01C000B) +#define STATUS_FLT_MUST_BE_NONPAGED_POOL __constant_cpu_to_le32(0xC01C000C) +#define STATUS_FLT_DUPLICATE_ENTRY __constant_cpu_to_le32(0xC01C000D) +#define STATUS_FLT_CBDQ_DISABLED __constant_cpu_to_le32(0xC01C000E) +#define STATUS_FLT_DO_NOT_ATTACH __constant_cpu_to_le32(0xC01C000F) +#define STATUS_FLT_DO_NOT_DETACH __constant_cpu_to_le32(0xC01C0010) +#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION __constant_cpu_to_le32(0xC01C0011) +#define STATUS_FLT_INSTANCE_NAME_COLLISION __constant_cpu_to_le32(0xC01C0012) +#define STATUS_FLT_FILTER_NOT_FOUND __constant_cpu_to_le32(0xC01C0013) +#define STATUS_FLT_VOLUME_NOT_FOUND __constant_cpu_to_le32(0xC01C0014) +#define STATUS_FLT_INSTANCE_NOT_FOUND __constant_cpu_to_le32(0xC01C0015) +#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND __constant_cpu_to_le32(0xC01C0016) +#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION __constant_cpu_to_le32(0xC01C0017) +#define STATUS_FLT_NAME_CACHE_MISS __constant_cpu_to_le32(0xC01C0018) +#define STATUS_FLT_NO_DEVICE_OBJECT __constant_cpu_to_le32(0xC01C0019) +#define STATUS_FLT_VOLUME_ALREADY_MOUNTED __constant_cpu_to_le32(0xC01C001A) +#define STATUS_FLT_ALREADY_ENLISTED __constant_cpu_to_le32(0xC01C001B) +#define STATUS_FLT_CONTEXT_ALREADY_LINKED __constant_cpu_to_le32(0xC01C001C) +#define STATUS_FLT_NO_WAITER_FOR_REPLY __constant_cpu_to_le32(0xC01C0020) +#define STATUS_MONITOR_NO_DESCRIPTOR __constant_cpu_to_le32(0xC01D0001) +#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT __constant_cpu_to_le32(0xC01D0002) +#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM __constant_cpu_to_le32(0xC01D0003) +#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK __constant_cpu_to_le32(0xC01D0004) +#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED __constant_cpu_to_le32(0xC01D0005) +#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK __constant_cpu_to_le32(0xC01D0006) +#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK __constant_cpu_to_le32(0xC01D0007) +#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA __constant_cpu_to_le32(0xC01D0008) +#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK __constant_cpu_to_le32(0xC01D0009) +#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER __constant_cpu_to_le32(0xC01E0000) +#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER __constant_cpu_to_le32(0xC01E0001) +#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER __constant_cpu_to_le32(0xC01E0002) +#define STATUS_GRAPHICS_ADAPTER_WAS_RESET __constant_cpu_to_le32(0xC01E0003) +#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL __constant_cpu_to_le32(0xC01E0004) +#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED __constant_cpu_to_le32(0xC01E0005) +#define STATUS_GRAPHICS_PRESENT_OCCLUDED __constant_cpu_to_le32(0xC01E0006) +#define STATUS_GRAPHICS_PRESENT_DENIED __constant_cpu_to_le32(0xC01E0007) +#define STATUS_GRAPHICS_CANNOTCOLORCONVERT __constant_cpu_to_le32(0xC01E0008) +#define STATUS_GRAPHICS_NO_VIDEO_MEMORY __constant_cpu_to_le32(0xC01E0100) +#define STATUS_GRAPHICS_CANT_LOCK_MEMORY __constant_cpu_to_le32(0xC01E0101) +#define STATUS_GRAPHICS_ALLOCATION_BUSY __constant_cpu_to_le32(0xC01E0102) +#define STATUS_GRAPHICS_TOO_MANY_REFERENCES __constant_cpu_to_le32(0xC01E0103) +#define STATUS_GRAPHICS_TRY_AGAIN_LATER __constant_cpu_to_le32(0xC01E0104) +#define STATUS_GRAPHICS_TRY_AGAIN_NOW __constant_cpu_to_le32(0xC01E0105) +#define STATUS_GRAPHICS_ALLOCATION_INVALID __constant_cpu_to_le32(0xC01E0106) +#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE __constant_cpu_to_le32(0xC01E0107) +#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED __constant_cpu_to_le32(0xC01E0108) +#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION __constant_cpu_to_le32(0xC01E0109) +#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE __constant_cpu_to_le32(0xC01E0110) +#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION __constant_cpu_to_le32(0xC01E0111) +#define STATUS_GRAPHICS_ALLOCATION_CLOSED __constant_cpu_to_le32(0xC01E0112) +#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE __constant_cpu_to_le32(0xC01E0113) +#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE __constant_cpu_to_le32(0xC01E0114) +#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE __constant_cpu_to_le32(0xC01E0115) +#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST __constant_cpu_to_le32(0xC01E0116) +#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE __constant_cpu_to_le32(0xC01E0200) +#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E0300) +#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0301) +#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0302) +#define STATUS_GRAPHICS_INVALID_VIDPN __constant_cpu_to_le32(0xC01E0303) +#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE __constant_cpu_to_le32(0xC01E0304) +#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET __constant_cpu_to_le32(0xC01E0305) +#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0306) +#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET __constant_cpu_to_le32(0xC01E0308) +#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET __constant_cpu_to_le32(0xC01E0309) +#define STATUS_GRAPHICS_INVALID_FREQUENCY __constant_cpu_to_le32(0xC01E030A) +#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION __constant_cpu_to_le32(0xC01E030B) +#define STATUS_GRAPHICS_INVALID_TOTAL_REGION __constant_cpu_to_le32(0xC01E030C) +#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE __constant_cpu_to_le32(0xC01E0310) +#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE __constant_cpu_to_le32(0xC01E0311) +#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET __constant_cpu_to_le32(0xC01E0312) +#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0313) +#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET __constant_cpu_to_le32(0xC01E0314) +#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET __constant_cpu_to_le32(0xC01E0315) +#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET __constant_cpu_to_le32(0xC01E0316) +#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E0317) +#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E0318) +#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH __constant_cpu_to_le32(0xC01E0319) +#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E031A) +#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET __constant_cpu_to_le32(0xC01E031B) +#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE __constant_cpu_to_le32(0xC01E031C) +#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET __constant_cpu_to_le32(0xC01E031D) +#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E031F) +#define STATUS_GRAPHICS_STALE_MODESET __constant_cpu_to_le32(0xC01E0320) +#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET __constant_cpu_to_le32(0xC01E0321) +#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE __constant_cpu_to_le32(0xC01E0322) +#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN __constant_cpu_to_le32(0xC01E0323) +#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0324) +#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION __constant_cpu_to_le32(0xC01E0325) +#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES __constant_cpu_to_le32(0xC01E0326) +#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0327) +#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE __constant_cpu_to_le32(0xC01E0328) +#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET __constant_cpu_to_le32(0xC01E0329) +#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET __constant_cpu_to_le32(0xC01E032A) +#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR __constant_cpu_to_le32(0xC01E032B) +#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET __constant_cpu_to_le32(0xC01E032C) +#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E032D) +#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E032E) +#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE __constant_cpu_to_le32(0xC01E032F) +#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED __constant_cpu_to_le32(0xC01E0330) +#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0331) +#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0332) +#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET __constant_cpu_to_le32(0xC01E0333) +#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER __constant_cpu_to_le32(0xC01E0334) +#define STATUS_GRAPHICS_NO_VIDPNMGR __constant_cpu_to_le32(0xC01E0335) +#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN __constant_cpu_to_le32(0xC01E0336) +#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E0337) +#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED __constant_cpu_to_le32(0xC01E0338) +#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0339) +#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE __constant_cpu_to_le32(0xC01E033A) +#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE __constant_cpu_to_le32(0xC01E033B) +#define STATUS_GRAPHICS_INVALID_STRIDE __constant_cpu_to_le32(0xC01E033C) +#define STATUS_GRAPHICS_INVALID_PIXELFORMAT __constant_cpu_to_le32(0xC01E033D) +#define STATUS_GRAPHICS_INVALID_COLORBASIS __constant_cpu_to_le32(0xC01E033E) +#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE __constant_cpu_to_le32(0xC01E033F) +#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0340) +#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT __constant_cpu_to_le32(0xC01E0341) +#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE __constant_cpu_to_le32(0xC01E0342) +#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN __constant_cpu_to_le32(0xC01E0343) +#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL __constant_cpu_to_le32(0xC01E0344) +#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION __constant_cpu_to_le32(0xC01E0345) +#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0346) +#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP __constant_cpu_to_le32(0xC01E0347) +#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0348) +#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0349) +#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET __constant_cpu_to_le32(0xC01E034A) +#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON __constant_cpu_to_le32(0xC01E034D) +#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE __constant_cpu_to_le32(0xC01E034E) +#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE __constant_cpu_to_le32(0xC01E034F) +#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS __constant_cpu_to_le32(0xC01E0350) +#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING __constant_cpu_to_le32(0xC01E0352) +#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED __constant_cpu_to_le32(0xC01E0353) +#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS __constant_cpu_to_le32(0xC01E0354) +#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT __constant_cpu_to_le32(0xC01E0355) +#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM __constant_cpu_to_le32(0xC01E0356) +#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN __constant_cpu_to_le32(0xC01E0357) +#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT __constant_cpu_to_le32(0xC01E0358) +#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED __constant_cpu_to_le32(0xC01E0359) +#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION __constant_cpu_to_le32(0xC01E035A) +#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE __constant_cpu_to_le32(0xC01E035B) +#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET __constant_cpu_to_le32(0xC01E035C) +#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED __constant_cpu_to_le32(0xC01E0400) +#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0401) +#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER __constant_cpu_to_le32(0xC01E0430) +#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED __constant_cpu_to_le32(0xC01E0431) +#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED __constant_cpu_to_le32(0xC01E0432) +#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY __constant_cpu_to_le32(0xC01E0433) +#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED __constant_cpu_to_le32(0xC01E0434) +#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON __constant_cpu_to_le32(0xC01E0435) +#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE __constant_cpu_to_le32(0xC01E0436) +#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER __constant_cpu_to_le32(0xC01E0438) +#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED __constant_cpu_to_le32(0xC01E043B) +#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS __constant_cpu_to_le32(0xC01E051C) +#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST __constant_cpu_to_le32(0xC01E051D) +#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E051E) +#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS __constant_cpu_to_le32(0xC01E051F) +#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0520) +#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST __constant_cpu_to_le32(0xC01E0521) +#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0500) +#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0501) +#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0502) +#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS __constant_cpu_to_le32(0xC01E0503) +#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL __constant_cpu_to_le32(0xC01E0504) +#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST __constant_cpu_to_le32(0xC01E0505) +#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME __constant_cpu_to_le32(0xC01E0506) +#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP __constant_cpu_to_le32(0xC01E0507) +#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0508) +#define STATUS_GRAPHICS_OPM_INVALID_POINTER __constant_cpu_to_le32(0xC01E050A) +#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E050B) +#define STATUS_GRAPHICS_OPM_INVALID_HANDLE __constant_cpu_to_le32(0xC01E050C) +#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE __constant_cpu_to_le32(0xC01E050D) +#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH __constant_cpu_to_le32(0xC01E050E) +#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED __constant_cpu_to_le32(0xC01E050F) +#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED __constant_cpu_to_le32(0xC01E0510) +#define STATUS_GRAPHICS_PVP_HFS_FAILED __constant_cpu_to_le32(0xC01E0511) +#define STATUS_GRAPHICS_OPM_INVALID_SRM __constant_cpu_to_le32(0xC01E0512) +#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP __constant_cpu_to_le32(0xC01E0513) +#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP __constant_cpu_to_le32(0xC01E0514) +#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA __constant_cpu_to_le32(0xC01E0515) +#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET __constant_cpu_to_le32(0xC01E0516) +#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH __constant_cpu_to_le32(0xC01E0517) +#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE __constant_cpu_to_le32(0xC01E0518) +#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC01E051A) +#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS __constant_cpu_to_le32(0xC01E051B) +#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0580) +#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST __constant_cpu_to_le32(0xC01E0581) +#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA __constant_cpu_to_le32(0xC01E0582) +#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA __constant_cpu_to_le32(0xC01E0583) +#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0584) +#define STATUS_GRAPHICS_DDCCI_INVALID_DATA __constant_cpu_to_le32(0xC01E0585) +#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE __constant_cpu_to_le32(0xC01E0586) +#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING __constant_cpu_to_le32(0xC01E0587) +#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E0588) +#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND __constant_cpu_to_le32(0xC01E0589) +#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH __constant_cpu_to_le32(0xC01E058A) +#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM __constant_cpu_to_le32(0xC01E058B) +#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE __constant_cpu_to_le32(0xC01E058C) +#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC01E058D) +#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED __constant_cpu_to_le32(0xC01E05E0) +#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME __constant_cpu_to_le32(0xC01E05E1) +#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP __constant_cpu_to_le32(0xC01E05E2) +#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E05E3) +#define STATUS_GRAPHICS_INVALID_POINTER __constant_cpu_to_le32(0xC01E05E4) +#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE __constant_cpu_to_le32(0xC01E05E5) +#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL __constant_cpu_to_le32(0xC01E05E6) +#define STATUS_GRAPHICS_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E05E7) +#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS __constant_cpu_to_le32(0xC01E05E8) +#define STATUS_FVE_LOCKED_VOLUME __constant_cpu_to_le32(0xC0210000) +#define STATUS_FVE_NOT_ENCRYPTED __constant_cpu_to_le32(0xC0210001) +#define STATUS_FVE_BAD_INFORMATION __constant_cpu_to_le32(0xC0210002) +#define STATUS_FVE_TOO_SMALL __constant_cpu_to_le32(0xC0210003) +#define STATUS_FVE_FAILED_WRONG_FS __constant_cpu_to_le32(0xC0210004) +#define STATUS_FVE_FAILED_BAD_FS __constant_cpu_to_le32(0xC0210005) +#define STATUS_FVE_FS_NOT_EXTENDED __constant_cpu_to_le32(0xC0210006) +#define STATUS_FVE_FS_MOUNTED __constant_cpu_to_le32(0xC0210007) +#define STATUS_FVE_NO_LICENSE __constant_cpu_to_le32(0xC0210008) +#define STATUS_FVE_ACTION_NOT_ALLOWED __constant_cpu_to_le32(0xC0210009) +#define STATUS_FVE_BAD_DATA __constant_cpu_to_le32(0xC021000A) +#define STATUS_FVE_VOLUME_NOT_BOUND __constant_cpu_to_le32(0xC021000B) +#define STATUS_FVE_NOT_DATA_VOLUME __constant_cpu_to_le32(0xC021000C) +#define STATUS_FVE_CONV_READ_ERROR __constant_cpu_to_le32(0xC021000D) +#define STATUS_FVE_CONV_WRITE_ERROR __constant_cpu_to_le32(0xC021000E) +#define STATUS_FVE_OVERLAPPED_UPDATE __constant_cpu_to_le32(0xC021000F) +#define STATUS_FVE_FAILED_SECTOR_SIZE __constant_cpu_to_le32(0xC0210010) +#define STATUS_FVE_FAILED_AUTHENTICATION __constant_cpu_to_le32(0xC0210011) +#define STATUS_FVE_NOT_OS_VOLUME __constant_cpu_to_le32(0xC0210012) +#define STATUS_FVE_KEYFILE_NOT_FOUND __constant_cpu_to_le32(0xC0210013) +#define STATUS_FVE_KEYFILE_INVALID __constant_cpu_to_le32(0xC0210014) +#define STATUS_FVE_KEYFILE_NO_VMK __constant_cpu_to_le32(0xC0210015) +#define STATUS_FVE_TPM_DISABLED __constant_cpu_to_le32(0xC0210016) +#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO __constant_cpu_to_le32(0xC0210017) +#define STATUS_FVE_TPM_INVALID_PCR __constant_cpu_to_le32(0xC0210018) +#define STATUS_FVE_TPM_NO_VMK __constant_cpu_to_le32(0xC0210019) +#define STATUS_FVE_PIN_INVALID __constant_cpu_to_le32(0xC021001A) +#define STATUS_FVE_AUTH_INVALID_APPLICATION __constant_cpu_to_le32(0xC021001B) +#define STATUS_FVE_AUTH_INVALID_CONFIG __constant_cpu_to_le32(0xC021001C) +#define STATUS_FVE_DEBUGGER_ENABLED __constant_cpu_to_le32(0xC021001D) +#define STATUS_FVE_DRY_RUN_FAILED __constant_cpu_to_le32(0xC021001E) +#define STATUS_FVE_BAD_METADATA_POINTER __constant_cpu_to_le32(0xC021001F) +#define STATUS_FVE_OLD_METADATA_COPY __constant_cpu_to_le32(0xC0210020) +#define STATUS_FVE_REBOOT_REQUIRED __constant_cpu_to_le32(0xC0210021) +#define STATUS_FVE_RAW_ACCESS __constant_cpu_to_le32(0xC0210022) +#define STATUS_FVE_RAW_BLOCKED __constant_cpu_to_le32(0xC0210023) +#define STATUS_FWP_CALLOUT_NOT_FOUND __constant_cpu_to_le32(0xC0220001) +#define STATUS_FWP_CONDITION_NOT_FOUND __constant_cpu_to_le32(0xC0220002) +#define STATUS_FWP_FILTER_NOT_FOUND __constant_cpu_to_le32(0xC0220003) +#define STATUS_FWP_LAYER_NOT_FOUND __constant_cpu_to_le32(0xC0220004) +#define STATUS_FWP_PROVIDER_NOT_FOUND __constant_cpu_to_le32(0xC0220005) +#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND __constant_cpu_to_le32(0xC0220006) +#define STATUS_FWP_SUBLAYER_NOT_FOUND __constant_cpu_to_le32(0xC0220007) +#define STATUS_FWP_NOT_FOUND __constant_cpu_to_le32(0xC0220008) +#define STATUS_FWP_ALREADY_EXISTS __constant_cpu_to_le32(0xC0220009) +#define STATUS_FWP_IN_USE __constant_cpu_to_le32(0xC022000A) +#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS __constant_cpu_to_le32(0xC022000B) +#define STATUS_FWP_WRONG_SESSION __constant_cpu_to_le32(0xC022000C) +#define STATUS_FWP_NO_TXN_IN_PROGRESS __constant_cpu_to_le32(0xC022000D) +#define STATUS_FWP_TXN_IN_PROGRESS __constant_cpu_to_le32(0xC022000E) +#define STATUS_FWP_TXN_ABORTED __constant_cpu_to_le32(0xC022000F) +#define STATUS_FWP_SESSION_ABORTED __constant_cpu_to_le32(0xC0220010) +#define STATUS_FWP_INCOMPATIBLE_TXN __constant_cpu_to_le32(0xC0220011) +#define STATUS_FWP_TIMEOUT __constant_cpu_to_le32(0xC0220012) +#define STATUS_FWP_NET_EVENTS_DISABLED __constant_cpu_to_le32(0xC0220013) +#define STATUS_FWP_INCOMPATIBLE_LAYER __constant_cpu_to_le32(0xC0220014) +#define STATUS_FWP_KM_CLIENTS_ONLY __constant_cpu_to_le32(0xC0220015) +#define STATUS_FWP_LIFETIME_MISMATCH __constant_cpu_to_le32(0xC0220016) +#define STATUS_FWP_BUILTIN_OBJECT __constant_cpu_to_le32(0xC0220017) +#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS __constant_cpu_to_le32(0xC0220018) +#define STATUS_FWP_TOO_MANY_CALLOUTS __constant_cpu_to_le32(0xC0220018) +#define STATUS_FWP_NOTIFICATION_DROPPED __constant_cpu_to_le32(0xC0220019) +#define STATUS_FWP_TRAFFIC_MISMATCH __constant_cpu_to_le32(0xC022001A) +#define STATUS_FWP_INCOMPATIBLE_SA_STATE __constant_cpu_to_le32(0xC022001B) +#define STATUS_FWP_NULL_POINTER __constant_cpu_to_le32(0xC022001C) +#define STATUS_FWP_INVALID_ENUMERATOR __constant_cpu_to_le32(0xC022001D) +#define STATUS_FWP_INVALID_FLAGS __constant_cpu_to_le32(0xC022001E) +#define STATUS_FWP_INVALID_NET_MASK __constant_cpu_to_le32(0xC022001F) +#define STATUS_FWP_INVALID_RANGE __constant_cpu_to_le32(0xC0220020) +#define STATUS_FWP_INVALID_INTERVAL __constant_cpu_to_le32(0xC0220021) +#define STATUS_FWP_ZERO_LENGTH_ARRAY __constant_cpu_to_le32(0xC0220022) +#define STATUS_FWP_NULL_DISPLAY_NAME __constant_cpu_to_le32(0xC0220023) +#define STATUS_FWP_INVALID_ACTION_TYPE __constant_cpu_to_le32(0xC0220024) +#define STATUS_FWP_INVALID_WEIGHT __constant_cpu_to_le32(0xC0220025) +#define STATUS_FWP_MATCH_TYPE_MISMATCH __constant_cpu_to_le32(0xC0220026) +#define STATUS_FWP_TYPE_MISMATCH __constant_cpu_to_le32(0xC0220027) +#define STATUS_FWP_OUT_OF_BOUNDS __constant_cpu_to_le32(0xC0220028) +#define STATUS_FWP_RESERVED __constant_cpu_to_le32(0xC0220029) +#define STATUS_FWP_DUPLICATE_CONDITION __constant_cpu_to_le32(0xC022002A) +#define STATUS_FWP_DUPLICATE_KEYMOD __constant_cpu_to_le32(0xC022002B) +#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER __constant_cpu_to_le32(0xC022002C) +#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER __constant_cpu_to_le32(0xC022002D) +#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER __constant_cpu_to_le32(0xC022002E) +#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT __constant_cpu_to_le32(0xC022002F) +#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD __constant_cpu_to_le32(0xC0220030) +#define STATUS_FWP_INCOMPATIBLE_DH_GROUP __constant_cpu_to_le32(0xC0220031) +#define STATUS_FWP_EM_NOT_SUPPORTED __constant_cpu_to_le32(0xC0220032) +#define STATUS_FWP_NEVER_MATCH __constant_cpu_to_le32(0xC0220033) +#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0220034) +#define STATUS_FWP_INVALID_PARAMETER __constant_cpu_to_le32(0xC0220035) +#define STATUS_FWP_TOO_MANY_SUBLAYERS __constant_cpu_to_le32(0xC0220036) +#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED __constant_cpu_to_le32(0xC0220037) +#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG __constant_cpu_to_le32(0xC0220038) +#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG __constant_cpu_to_le32(0xC0220039) +#define STATUS_FWP_TCPIP_NOT_READY __constant_cpu_to_le32(0xC0220100) +#define STATUS_FWP_INJECT_HANDLE_CLOSING __constant_cpu_to_le32(0xC0220101) +#define STATUS_FWP_INJECT_HANDLE_STALE __constant_cpu_to_le32(0xC0220102) +#define STATUS_FWP_CANNOT_PEND __constant_cpu_to_le32(0xC0220103) +#define STATUS_NDIS_CLOSING __constant_cpu_to_le32(0xC0230002) +#define STATUS_NDIS_BAD_VERSION __constant_cpu_to_le32(0xC0230004) +#define STATUS_NDIS_BAD_CHARACTERISTICS __constant_cpu_to_le32(0xC0230005) +#define STATUS_NDIS_ADAPTER_NOT_FOUND __constant_cpu_to_le32(0xC0230006) +#define STATUS_NDIS_OPEN_FAILED __constant_cpu_to_le32(0xC0230007) +#define STATUS_NDIS_DEVICE_FAILED __constant_cpu_to_le32(0xC0230008) +#define STATUS_NDIS_MULTICAST_FULL __constant_cpu_to_le32(0xC0230009) +#define STATUS_NDIS_MULTICAST_EXISTS __constant_cpu_to_le32(0xC023000A) +#define STATUS_NDIS_MULTICAST_NOT_FOUND __constant_cpu_to_le32(0xC023000B) +#define STATUS_NDIS_REQUEST_ABORTED __constant_cpu_to_le32(0xC023000C) +#define STATUS_NDIS_RESET_IN_PROGRESS __constant_cpu_to_le32(0xC023000D) +#define STATUS_NDIS_INVALID_PACKET __constant_cpu_to_le32(0xC023000F) +#define STATUS_NDIS_INVALID_DEVICE_REQUEST __constant_cpu_to_le32(0xC0230010) +#define STATUS_NDIS_ADAPTER_NOT_READY __constant_cpu_to_le32(0xC0230011) +#define STATUS_NDIS_INVALID_LENGTH __constant_cpu_to_le32(0xC0230014) +#define STATUS_NDIS_INVALID_DATA __constant_cpu_to_le32(0xC0230015) +#define STATUS_NDIS_BUFFER_TOO_SHORT __constant_cpu_to_le32(0xC0230016) +#define STATUS_NDIS_INVALID_OID __constant_cpu_to_le32(0xC0230017) +#define STATUS_NDIS_ADAPTER_REMOVED __constant_cpu_to_le32(0xC0230018) +#define STATUS_NDIS_UNSUPPORTED_MEDIA __constant_cpu_to_le32(0xC0230019) +#define STATUS_NDIS_GROUP_ADDRESS_IN_USE __constant_cpu_to_le32(0xC023001A) +#define STATUS_NDIS_FILE_NOT_FOUND __constant_cpu_to_le32(0xC023001B) +#define STATUS_NDIS_ERROR_READING_FILE __constant_cpu_to_le32(0xC023001C) +#define STATUS_NDIS_ALREADY_MAPPED __constant_cpu_to_le32(0xC023001D) +#define STATUS_NDIS_RESOURCE_CONFLICT __constant_cpu_to_le32(0xC023001E) +#define STATUS_NDIS_MEDIA_DISCONNECTED __constant_cpu_to_le32(0xC023001F) +#define STATUS_NDIS_INVALID_ADDRESS __constant_cpu_to_le32(0xC0230022) +#define STATUS_NDIS_PAUSED __constant_cpu_to_le32(0xC023002A) +#define STATUS_NDIS_INTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC023002B) +#define STATUS_NDIS_UNSUPPORTED_REVISION __constant_cpu_to_le32(0xC023002C) +#define STATUS_NDIS_INVALID_PORT __constant_cpu_to_le32(0xC023002D) +#define STATUS_NDIS_INVALID_PORT_STATE __constant_cpu_to_le32(0xC023002E) +#define STATUS_NDIS_LOW_POWER_STATE __constant_cpu_to_le32(0xC023002F) +#define STATUS_NDIS_NOT_SUPPORTED __constant_cpu_to_le32(0xC02300BB) +#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED __constant_cpu_to_le32(0xC0232000) +#define STATUS_NDIS_DOT11_MEDIA_IN_USE __constant_cpu_to_le32(0xC0232001) +#define STATUS_NDIS_DOT11_POWER_STATE_INVALID __constant_cpu_to_le32(0xC0232002) +#define STATUS_IPSEC_BAD_SPI __constant_cpu_to_le32(0xC0360001) +#define STATUS_IPSEC_SA_LIFETIME_EXPIRED __constant_cpu_to_le32(0xC0360002) +#define STATUS_IPSEC_WRONG_SA __constant_cpu_to_le32(0xC0360003) +#define STATUS_IPSEC_REPLAY_CHECK_FAILED __constant_cpu_to_le32(0xC0360004) +#define STATUS_IPSEC_INVALID_PACKET __constant_cpu_to_le32(0xC0360005) +#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED __constant_cpu_to_le32(0xC0360006) +#define STATUS_IPSEC_CLEAR_TEXT_DROP __constant_cpu_to_le32(0xC0360007) diff --git a/kernel/fs/cifs/smb2transport.c b/kernel/fs/cifs/smb2transport.c new file mode 100644 index 000000000..d4c5b6f10 --- /dev/null +++ b/kernel/fs/cifs/smb2transport.c @@ -0,0 +1,612 @@ +/* + * fs/cifs/smb2transport.c + * + * Copyright (C) International Business Machines Corp., 2002, 2011 + * Etersoft, 2012 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) 2006 + * Pavel Shilovsky (pshilovsky@samba.org) 2012 + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "smb2pdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "smb2proto.h" +#include "cifs_debug.h" +#include "smb2status.h" +#include "smb2glob.h" + +static int +smb2_crypto_shash_allocate(struct TCP_Server_Info *server) +{ + int rc; + unsigned int size; + + if (server->secmech.sdeschmacsha256 != NULL) + return 0; /* already allocated */ + + server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); + if (IS_ERR(server->secmech.hmacsha256)) { + cifs_dbg(VFS, "could not allocate crypto hmacsha256\n"); + rc = PTR_ERR(server->secmech.hmacsha256); + server->secmech.hmacsha256 = NULL; + return rc; + } + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.hmacsha256); + server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdeschmacsha256) { + crypto_free_shash(server->secmech.hmacsha256); + server->secmech.hmacsha256 = NULL; + return -ENOMEM; + } + server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256; + server->secmech.sdeschmacsha256->shash.flags = 0x0; + + return 0; +} + +static int +smb3_crypto_shash_allocate(struct TCP_Server_Info *server) +{ + unsigned int size; + int rc; + + if (server->secmech.sdesccmacaes != NULL) + return 0; /* already allocated */ + + rc = smb2_crypto_shash_allocate(server); + if (rc) + return rc; + + server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0); + if (IS_ERR(server->secmech.cmacaes)) { + cifs_dbg(VFS, "could not allocate crypto cmac-aes"); + kfree(server->secmech.sdeschmacsha256); + server->secmech.sdeschmacsha256 = NULL; + crypto_free_shash(server->secmech.hmacsha256); + server->secmech.hmacsha256 = NULL; + rc = PTR_ERR(server->secmech.cmacaes); + server->secmech.cmacaes = NULL; + return rc; + } + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.cmacaes); + server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdesccmacaes) { + cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__); + kfree(server->secmech.sdeschmacsha256); + server->secmech.sdeschmacsha256 = NULL; + crypto_free_shash(server->secmech.hmacsha256); + crypto_free_shash(server->secmech.cmacaes); + server->secmech.hmacsha256 = NULL; + server->secmech.cmacaes = NULL; + return -ENOMEM; + } + server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes; + server->secmech.sdesccmacaes->shash.flags = 0x0; + + return 0; +} + +static struct cifs_ses * +smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server) +{ + struct cifs_ses *ses; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->Suid != smb2hdr->SessionId) + continue; + spin_unlock(&cifs_tcp_ses_lock); + return ses; + } + spin_unlock(&cifs_tcp_ses_lock); + + return NULL; +} + + +int +smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + int i, rc; + unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; + unsigned char *sigptr = smb2_signature; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; + struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + struct cifs_ses *ses; + + ses = smb2_find_smb_ses(smb2_pdu, server); + if (!ses) { + cifs_dbg(VFS, "%s: Could not find session\n", __func__); + return 0; + } + + memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); + memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); + + rc = smb2_crypto_shash_allocate(server); + if (rc) { + cifs_dbg(VFS, "%s: shah256 alloc failed\n", __func__); + return rc; + } + + rc = crypto_shash_setkey(server->secmech.hmacsha256, + ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); + return rc; + } + + rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init sha256", __func__); + return rc; + } + + for (i = 0; i < n_vec; i++) { + if (iov[i].iov_len == 0) + continue; + if (iov[i].iov_base == NULL) { + cifs_dbg(VFS, "null iovec entry\n"); + return -EIO; + } + /* + * The first entry includes a length field (which does not get + * signed that occupies the first 4 bytes before the header). + */ + if (i == 0) { + if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + rc = + crypto_shash_update( + &server->secmech.sdeschmacsha256->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); + } else { + rc = + crypto_shash_update( + &server->secmech.sdeschmacsha256->shash, + iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); + return rc; + } + } + + /* now hash over the rq_pages array */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + p_iov.iov_base, p_iov.iov_len); + kunmap(rqst->rq_pages[i]); + } + + rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, + sigptr); + if (rc) + cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); + + memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + + return rc; +} + +int +generate_smb3signingkey(struct cifs_ses *ses) +{ + unsigned char zero = 0x0; + __u8 i[4] = {0, 0, 0, 1}; + __u8 L[4] = {0, 0, 0, 128}; + int rc = 0; + unsigned char prfhash[SMB2_HMACSHA256_SIZE]; + unsigned char *hashptr = prfhash; + + memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); + memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + + rc = smb3_crypto_shash_allocate(ses->server); + if (rc) { + cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_setkey(ses->server->secmech.hmacsha256, + ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set with session key\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + i, 4); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with n\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + "SMB2AESCMAC", 12); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with label\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + &zero, 1); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with zero\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + "SmbSign", 8); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with context\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, + L, 4); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with L\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash, + hashptr); + if (rc) { + cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); + goto smb3signkey_ret; + } + + memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + +smb3signkey_ret: + return rc; +} + +int +smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + int i; + int rc = 0; + unsigned char smb3_signature[SMB2_CMACAES_SIZE]; + unsigned char *sigptr = smb3_signature; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; + struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + struct cifs_ses *ses; + + ses = smb2_find_smb_ses(smb2_pdu, server); + if (!ses) { + cifs_dbg(VFS, "%s: Could not find session\n", __func__); + return 0; + } + + memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); + memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); + + rc = crypto_shash_setkey(server->secmech.cmacaes, + ses->smb3signingkey, SMB2_CMACAES_SIZE); + + if (rc) { + cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); + return rc; + } + + /* + * we already allocate sdesccmacaes when we init smb3 signing key, + * so unlike smb2 case we do not have to check here if secmech are + * initialized + */ + rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); + return rc; + } + + for (i = 0; i < n_vec; i++) { + if (iov[i].iov_len == 0) + continue; + if (iov[i].iov_base == NULL) { + cifs_dbg(VFS, "null iovec entry"); + return -EIO; + } + /* + * The first entry includes a length field (which does not get + * signed that occupies the first 4 bytes before the header). + */ + if (i == 0) { + if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + rc = + crypto_shash_update( + &server->secmech.sdesccmacaes->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); + } else { + rc = + crypto_shash_update( + &server->secmech.sdesccmacaes->shash, + iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n", + __func__); + return rc; + } + } + + /* now hash over the rq_pages array */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + crypto_shash_update(&server->secmech.sdesccmacaes->shash, + p_iov.iov_base, p_iov.iov_len); + kunmap(rqst->rq_pages[i]); + } + + rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash, + sigptr); + if (rc) + cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__); + + memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + + return rc; +} + +/* must be called with server->srv_mutex held */ +static int +smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + int rc = 0; + struct smb2_hdr *smb2_pdu = rqst->rq_iov[0].iov_base; + + if (!(smb2_pdu->Flags & SMB2_FLAGS_SIGNED) || + server->tcpStatus == CifsNeedNegotiate) + return rc; + + if (!server->session_estab) { + strncpy(smb2_pdu->Signature, "BSRSPYL", 8); + return rc; + } + + rc = server->ops->calc_signature(rqst, server); + + return rc; +} + +int +smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + unsigned int rc; + char server_response_sig[16]; + struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; + + if ((smb2_pdu->Command == SMB2_NEGOTIATE) || + (smb2_pdu->Command == SMB2_SESSION_SETUP) || + (smb2_pdu->Command == SMB2_OPLOCK_BREAK) || + (!server->session_estab)) + return 0; + + /* + * BB what if signatures are supposed to be on for session but + * server does not send one? BB + */ + + /* Do not need to verify session setups with signature "BSRSPYL " */ + if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0) + cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n", + smb2_pdu->Command); + + /* + * Save off the origiginal signature so we can modify the smb and check + * our calculated signature against what the server sent. + */ + memcpy(server_response_sig, smb2_pdu->Signature, SMB2_SIGNATURE_SIZE); + + memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); + + mutex_lock(&server->srv_mutex); + rc = server->ops->calc_signature(rqst, server); + mutex_unlock(&server->srv_mutex); + + if (rc) + return rc; + + if (memcmp(server_response_sig, smb2_pdu->Signature, + SMB2_SIGNATURE_SIZE)) + return -EACCES; + else + return 0; +} + +/* + * Set message id for the request. Should be called after wait_for_free_request + * and when srv_mutex is held. + */ +static inline void +smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr) +{ + unsigned int i, num = le16_to_cpu(hdr->CreditCharge); + + hdr->MessageId = get_next_mid64(server); + /* skip message numbers according to CreditCharge field */ + for (i = 1; i < num; i++) + get_next_mid(server); +} + +static struct mid_q_entry * +smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer, + struct TCP_Server_Info *server) +{ + struct mid_q_entry *temp; + + if (server == NULL) { + cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n"); + return NULL; + } + + temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); + if (temp == NULL) + return temp; + else { + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = le64_to_cpu(smb_buffer->MessageId); + temp->pid = current->pid; + temp->command = smb_buffer->Command; /* Always LE */ + temp->when_alloc = jiffies; + temp->server = server; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. + */ + temp->callback = cifs_wake_up_task; + temp->callback_data = current; + } + + atomic_inc(&midCount); + temp->mid_state = MID_REQUEST_ALLOCATED; + return temp; +} + +static int +smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf, + struct mid_q_entry **mid) +{ + if (ses->server->tcpStatus == CifsExiting) + return -ENOENT; + + if (ses->server->tcpStatus == CifsNeedReconnect) { + cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); + return -EAGAIN; + } + + if (ses->status == CifsNew) { + if ((buf->Command != SMB2_SESSION_SETUP) && + (buf->Command != SMB2_NEGOTIATE)) + return -EAGAIN; + /* else ok - we are setting up session */ + } + + if (ses->status == CifsExiting) { + if (buf->Command != SMB2_LOGOFF) + return -EAGAIN; + /* else ok - we are shutting down the session */ + } + + *mid = smb2_mid_entry_alloc(buf, ses->server); + if (*mid == NULL) + return -ENOMEM; + spin_lock(&GlobalMid_Lock); + list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + return 0; +} + +int +smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, + bool log_error) +{ + unsigned int len = get_rfc1002_length(mid->resp_buf); + struct kvec iov; + struct smb_rqst rqst = { .rq_iov = &iov, + .rq_nvec = 1 }; + + iov.iov_base = (char *)mid->resp_buf; + iov.iov_len = get_rfc1002_length(mid->resp_buf) + 4; + + dump_smb(mid->resp_buf, min_t(u32, 80, len)); + /* convert the length into a more usable form */ + if (len > 24 && server->sign) { + int rc; + + rc = smb2_verify_signature(&rqst, server); + if (rc) + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); + } + + return map_smb2_to_linux_error(mid->resp_buf, log_error); +} + +struct mid_q_entry * +smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) +{ + int rc; + struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; + struct mid_q_entry *mid; + + smb2_seq_num_into_buf(ses->server, hdr); + + rc = smb2_get_mid_entry(ses, hdr, &mid); + if (rc) + return ERR_PTR(rc); + rc = smb2_sign_rqst(rqst, ses->server); + if (rc) { + cifs_delete_mid(mid); + return ERR_PTR(rc); + } + return mid; +} + +struct mid_q_entry * +smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) +{ + int rc; + struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; + struct mid_q_entry *mid; + + smb2_seq_num_into_buf(server, hdr); + + mid = smb2_mid_entry_alloc(hdr, server); + if (mid == NULL) + return ERR_PTR(-ENOMEM); + + rc = smb2_sign_rqst(rqst, server); + if (rc) { + DeleteMidQEntry(mid); + return ERR_PTR(rc); + } + + return mid; +} diff --git a/kernel/fs/cifs/smbencrypt.c b/kernel/fs/cifs/smbencrypt.c new file mode 100644 index 000000000..a4232ec4f --- /dev/null +++ b/kernel/fs/cifs/smbencrypt.c @@ -0,0 +1,249 @@ +/* + Unix SMB/Netbios implementation. + Version 1.9. + SMB parameters and setup + Copyright (C) Andrew Tridgell 1992-2000 + Copyright (C) Luke Kenneth Casson Leighton 1996-2000 + Modified by Jeremy Allison 1995. + Copyright (C) Andrew Bartlett 2002-2003 + Modified by Steve French (sfrench@us.ibm.com) 2002-2003 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include +#include +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsproto.h" + +#ifndef false +#define false 0 +#endif +#ifndef true +#define true 1 +#endif + +/* following came from the other byteorder.h to avoid include conflicts */ +#define CVAL(buf,pos) (((unsigned char *)(buf))[pos]) +#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) +#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) + +static void +str_to_key(unsigned char *str, unsigned char *key) +{ + int i; + + key[0] = str[0] >> 1; + key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); + key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); + key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); + key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); + key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); + key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); + key[7] = str[6] & 0x7F; + for (i = 0; i < 8; i++) + key[i] = (key[i] << 1); +} + +static int +smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) +{ + int rc; + unsigned char key2[8]; + struct crypto_blkcipher *tfm_des; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + + str_to_key(key, key2); + + tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_des)) { + rc = PTR_ERR(tfm_des); + cifs_dbg(VFS, "could not allocate des crypto API\n"); + goto smbhash_err; + } + + desc.tfm = tfm_des; + + crypto_blkcipher_setkey(tfm_des, key2, 8); + + sg_init_one(&sgin, in, 8); + sg_init_one(&sgout, out, 8); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); + if (rc) + cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc); + + crypto_free_blkcipher(tfm_des); +smbhash_err: + return rc; +} + +static int +E_P16(unsigned char *p14, unsigned char *p16) +{ + int rc; + unsigned char sp8[8] = + { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 }; + + rc = smbhash(p16, sp8, p14); + if (rc) + return rc; + rc = smbhash(p16 + 8, sp8, p14 + 7); + return rc; +} + +static int +E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) +{ + int rc; + + rc = smbhash(p24, c8, p21); + if (rc) + return rc; + rc = smbhash(p24 + 8, c8, p21 + 7); + if (rc) + return rc; + rc = smbhash(p24 + 16, c8, p21 + 14); + return rc; +} + +/* produce a md4 message digest from data of length n bytes */ +int +mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) +{ + int rc; + unsigned int size; + struct crypto_shash *md4; + struct sdesc *sdescmd4; + + md4 = crypto_alloc_shash("md4", 0, 0); + if (IS_ERR(md4)) { + rc = PTR_ERR(md4); + cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n", + __func__, rc); + return rc; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); + sdescmd4 = kmalloc(size, GFP_KERNEL); + if (!sdescmd4) { + rc = -ENOMEM; + goto mdfour_err; + } + sdescmd4->shash.tfm = md4; + sdescmd4->shash.flags = 0x0; + + rc = crypto_shash_init(&sdescmd4->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__); + goto mdfour_err; + } + rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); + goto mdfour_err; + } + rc = crypto_shash_final(&sdescmd4->shash, md4_hash); + if (rc) + cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__); + +mdfour_err: + crypto_free_shash(md4); + kfree(sdescmd4); + + return rc; +} + +/* + This implements the X/Open SMB password encryption + It takes a password, a 8 byte "crypt key" and puts 24 bytes of + encrypted password into p24 */ +/* Note that password must be uppercased and null terminated */ +int +SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) +{ + int rc; + unsigned char p14[14], p16[16], p21[21]; + + memset(p14, '\0', 14); + memset(p16, '\0', 16); + memset(p21, '\0', 21); + + memcpy(p14, passwd, 14); + rc = E_P16(p14, p16); + if (rc) + return rc; + + memcpy(p21, p16, 16); + rc = E_P24(p21, c8, p24); + + return rc; +} + +/* + * Creates the MD4 Hash of the users password in NT UNICODE. + */ + +int +E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage) +{ + int rc; + int len; + __le16 wpwd[129]; + + /* Password cannot be longer than 128 characters */ + if (passwd) /* Password must be converted to NT unicode */ + len = cifs_strtoUTF16(wpwd, passwd, 128, codepage); + else { + len = 0; + *wpwd = 0; /* Ensure string is null terminated */ + } + + rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); + memzero_explicit(wpwd, sizeof(wpwd)); + + return rc; +} + +/* Does the NT MD4 hash then des encryption. */ +int +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, + const struct nls_table *codepage) +{ + int rc; + unsigned char p16[16], p21[21]; + + memset(p16, '\0', 16); + memset(p21, '\0', 21); + + rc = E_md4hash(passwd, p16, codepage); + if (rc) { + cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n", + __func__, rc); + return rc; + } + memcpy(p21, p16, 16); + rc = E_P24(p21, c8, p24); + return rc; +} diff --git a/kernel/fs/cifs/smberr.h b/kernel/fs/cifs/smberr.h new file mode 100644 index 000000000..7f16cb825 --- /dev/null +++ b/kernel/fs/cifs/smberr.h @@ -0,0 +1,184 @@ +/* + * fs/cifs/smberr.h + * + * Copyright (c) International Business Machines Corp., 2002,2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * See Error Codes section of the SNIA CIFS Specification + * for more information + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define SUCCESS 0x00 /* The request was successful. */ +#define ERRDOS 0x01 /* Error is from the core DOS operating system set */ +#define ERRSRV 0x02 /* Error is generated by the file server daemon */ +#define ERRHRD 0x03 /* Error is a hardware error. */ +#define ERRCMD 0xFF /* Command was not in the "SMB" format. */ + +/* The following error codes may be generated with the SUCCESS error class.*/ + +/*#define SUCCESS 0 The request was successful. */ + +/* The following error codes may be generated with the ERRDOS error class.*/ + +#define ERRbadfunc 1 /* Invalid function. The server did not + recognize or could not perform a + system call generated by the server, + e.g., set the DIRECTORY attribute on + a data file, invalid seek mode. */ +#define ERRbadfile 2 /* File not found. The last component + of a file's pathname could not be + found. */ +#define ERRbadpath 3 /* Directory invalid. A directory + component in a pathname could not be + found. */ +#define ERRnofids 4 /* Too many open files. The server has + no file handles available. */ +#define ERRnoaccess 5 /* Access denied, the client's context + does not permit the requested + function. This includes the + following conditions: invalid rename + command, write to Fid open for read + only, read on Fid open for write + only, attempt to delete a non-empty + directory */ +#define ERRbadfid 6 /* Invalid file handle. The file handle + specified was not recognized by the + server. */ +#define ERRbadmcb 7 /* Memory control blocks destroyed. */ +#define ERRnomem 8 /* Insufficient server memory to + perform the requested function. */ +#define ERRbadmem 9 /* Invalid memory block address. */ +#define ERRbadenv 10 /* Invalid environment. */ +#define ERRbadformat 11 /* Invalid format. */ +#define ERRbadaccess 12 /* Invalid open mode. */ +#define ERRbaddata 13 /* Invalid data (generated only by + IOCTL calls within the server). */ +#define ERRbaddrive 15 /* Invalid drive specified. */ +#define ERRremcd 16 /* A Delete Directory request attempted + to remove the server's current + directory. */ +#define ERRdiffdevice 17 /* Not same device (e.g., a cross + volume rename was attempted */ +#define ERRnofiles 18 /* A File Search command can find no + more files matching the specified + criteria. */ +#define ERRwriteprot 19 /* media is write protected */ +#define ERRgeneral 31 +#define ERRbadshare 32 /* The sharing mode specified for an + Open conflicts with existing FIDs on + the file. */ +#define ERRlock 33 /* A Lock request conflicted with an + existing lock or specified an + invalid mode, or an Unlock requested + attempted to remove a lock held by + another process. */ +#define ERRunsup 50 +#define ERRnosuchshare 67 +#define ERRfilexists 80 /* The file named in the request + already exists. */ +#define ERRinvparm 87 +#define ERRdiskfull 112 +#define ERRinvname 123 +#define ERRinvlevel 124 +#define ERRdirnotempty 145 +#define ERRnotlocked 158 +#define ERRcancelviolation 173 +#define ERRalreadyexists 183 +#define ERRbadpipe 230 +#define ERRpipebusy 231 +#define ERRpipeclosing 232 +#define ERRnotconnected 233 +#define ERRmoredata 234 +#define ERReasnotsupported 282 +#define ErrQuota 0x200 /* The operation would cause a quota + limit to be exceeded. */ +#define ErrNotALink 0x201 /* A link operation was performed on a + pathname that was not a link. */ + +/* Below errors are used internally (do not come over the wire) for passthrough + from STATUS codes to POSIX only */ +#define ERRsymlink 0xFFFD +#define ErrTooManyLinks 0xFFFE + +/* Following error codes may be generated with the ERRSRV error class.*/ + +#define ERRerror 1 /* Non-specific error code. It is + returned under the following + conditions: resource other than disk + space exhausted (e.g. TIDs), first + SMB command was not negotiate, + multiple negotiates attempted, and + internal server error. */ +#define ERRbadpw 2 /* Bad password - name/password pair in + a TreeConnect or Session Setup are + invalid. */ +#define ERRbadtype 3 /* used for indicating DFS referral + needed */ +#define ERRaccess 4 /* The client does not have the + necessary access rights within the + specified context for requested + function. */ +#define ERRinvtid 5 /* The Tid specified in a command was + invalid. */ +#define ERRinvnetname 6 /* Invalid network name in tree + connect. */ +#define ERRinvdevice 7 /* Invalid device - printer request + made to non-printer connection or + non-printer request made to printer + connection. */ +#define ERRqfull 49 /* Print queue full (files) -- returned + by open print file. */ +#define ERRqtoobig 50 /* Print queue full -- no space. */ +#define ERRqeof 51 /* EOF on print queue dump */ +#define ERRinvpfid 52 /* Invalid print file FID. */ +#define ERRsmbcmd 64 /* The server did not recognize the + command received. */ +#define ERRsrverror 65 /* The server encountered an internal + error, e.g., system file + unavailable. */ +#define ERRbadBID 66 /* (obsolete) */ +#define ERRfilespecs 67 /* The Fid and pathname parameters + contained an invalid combination of + values. */ +#define ERRbadLink 68 /* (obsolete) */ +#define ERRbadpermits 69 /* The access permissions specified for + a file or directory are not a valid + combination. */ +#define ERRbadPID 70 +#define ERRsetattrmode 71 /* attribute (mode) is invalid */ +#define ERRpaused 81 /* Server is paused */ +#define ERRmsgoff 82 /* reserved - messaging off */ +#define ERRnoroom 83 /* reserved - no room for message */ +#define ERRrmuns 87 /* reserved - too many remote names */ +#define ERRtimeout 88 /* operation timed out */ +#define ERRnoresource 89 /* No resources available for request + */ +#define ERRtoomanyuids 90 /* Too many UIDs active on this session + */ +#define ERRbaduid 91 /* The UID is not known as a valid user + */ +#define ERRusempx 250 /* temporarily unable to use raw */ +#define ERRusestd 251 /* temporarily unable to use either raw + or mpx */ +#define ERR_NOTIFY_ENUM_DIR 1024 +#define ERRnoSuchUser 2238 /* user account does not exist */ +#define ERRaccountexpired 2239 +#define ERRbadclient 2240 /* can not logon from this client */ +#define ERRbadLogonTime 2241 /* logon hours do not allow this */ +#define ERRpasswordExpired 2242 +#define ERRnetlogonNotStarted 2455 +#define ERRnosupport 0xFFFF diff --git a/kernel/fs/cifs/smbfsctl.h b/kernel/fs/cifs/smbfsctl.h new file mode 100644 index 000000000..83efa5953 --- /dev/null +++ b/kernel/fs/cifs/smbfsctl.h @@ -0,0 +1,121 @@ +/* + * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions + * + * Copyright (c) International Business Machines Corp., 2002,2013 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* IOCTL information */ +/* + * List of ioctl/fsctl function codes that are or could be useful in the + * future to remote clients like cifs or SMB2/SMB3 client. This is probably + * a slightly larger set of fsctls that NTFS local filesystem could handle, + * including the seven below that we do not have struct definitions for. + * Even with protocol definitions for most of these now available, we still + * need to do some experimentation to identify which are practical to do + * remotely. Some of the following, such as the encryption/compression ones + * could be invoked from tools via a specialized hook into the VFS rather + * than via the standard vfs entry points + * + * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are + * below). Additional detail on less common ones can be found in MS-FSCC + * section 2.3. + */ +#define FSCTL_DFS_GET_REFERRALS 0x00060194 +#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0 +#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 +#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 +#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 +#define FSCTL_LOCK_VOLUME 0x00090018 +#define FSCTL_UNLOCK_VOLUME 0x0009001C +#define FSCTL_IS_PATHNAME_VALID 0x0009002C /* BB add struct */ +#define FSCTL_GET_COMPRESSION 0x0009003C /* BB add struct */ +#define FSCTL_SET_COMPRESSION 0x0009C040 /* BB add struct */ +#define FSCTL_QUERY_FAT_BPB 0x00090058 /* BB add struct */ +/* Verify the next FSCTL number, we had it as 0x00090090 before */ +#define FSCTL_FILESYSTEM_GET_STATS 0x00090060 /* BB add struct */ +#define FSCTL_GET_NTFS_VOLUME_DATA 0x00090064 /* BB add struct */ +#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */ +#define FSCTL_IS_VOLUME_DIRTY 0x00090078 /* BB add struct */ +#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */ +#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C +#define FSCTL_FIND_FILES_BY_SID 0x0009008F /* BB add struct */ +#define FSCTL_SET_OBJECT_ID 0x00090098 /* BB add struct */ +#define FSCTL_GET_OBJECT_ID 0x0009009C /* BB add struct */ +#define FSCTL_DELETE_OBJECT_ID 0x000900A0 /* BB add struct */ +#define FSCTL_SET_REPARSE_POINT 0x000900A4 /* BB add struct */ +#define FSCTL_GET_REPARSE_POINT 0x000900A8 /* BB add struct */ +#define FSCTL_DELETE_REPARSE_POINT 0x000900AC /* BB add struct */ +#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ +#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ +#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ +#define FSCTL_SET_ZERO_DATA 0x000980C8 +#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ +#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ +#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ +#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */ +#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */ +#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */ +#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */ +#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */ +#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ +#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ +#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ +#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ +#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ +#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ +#define FSCTL_SIS_LINK_FILES 0x0009C104 +#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ +#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ +/* strange that the number for this op is not sequential with previous op */ +#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ +/* Enumerate previous versions of a file */ +#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064 +/* Retrieve an opaque file reference for server-side data movement ie copy */ +#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078 +#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */ +#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ +#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ +#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 +/* Perform server-side data movement */ +#define FSCTL_SRV_COPYCHUNK 0x001440F2 +#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2 +#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */ +#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */ + +/* See FSCC 2.1.2.5 */ +#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 +#define IO_REPARSE_TAG_HSM 0xC0000004 +#define IO_REPARSE_TAG_SIS 0x80000007 +#define IO_REPARSE_TAG_HSM2 0x80000006 +#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005 +/* Used by the DFS filter. See MS-DFSC */ +#define IO_REPARSE_TAG_DFS 0x8000000A +/* Used by the DFS filter See MS-DFSC */ +#define IO_REPARSE_TAG_DFSR 0x80000012 +#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B +/* See section MS-FSCC 2.1.2.4 */ +#define IO_REPARSE_TAG_SYMLINK 0xA000000C +#define IO_REPARSE_TAG_DEDUP 0x80000013 +#define IO_REPARSE_APPXSTREAM 0xC0000014 +/* NFS symlinks, Win 8/SMB3 and later */ +#define IO_REPARSE_TAG_NFS 0x80000014 + +/* fsctl flags */ +/* If Flags is set to this value, the request is an FSCTL not ioctl request */ +#define SMB2_0_IOCTL_IS_FSCTL 0x00000001 + diff --git a/kernel/fs/cifs/transport.c b/kernel/fs/cifs/transport.c new file mode 100644 index 000000000..126f46b88 --- /dev/null +++ b/kernel/fs/cifs/transport.c @@ -0,0 +1,1113 @@ +/* + * fs/cifs/transport.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) 2006. + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +void +cifs_wake_up_task(struct mid_q_entry *mid) +{ + wake_up_process(mid->callback_data); +} + +struct mid_q_entry * +AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) +{ + struct mid_q_entry *temp; + + if (server == NULL) { + cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n"); + return NULL; + } + + temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); + if (temp == NULL) + return temp; + else { + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = get_mid(smb_buffer); + temp->pid = current->pid; + temp->command = cpu_to_le16(smb_buffer->Command); + cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); + /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ + /* when mid allocated can be before when sent */ + temp->when_alloc = jiffies; + temp->server = server; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. + */ + temp->callback = cifs_wake_up_task; + temp->callback_data = current; + } + + atomic_inc(&midCount); + temp->mid_state = MID_REQUEST_ALLOCATED; + return temp; +} + +void +DeleteMidQEntry(struct mid_q_entry *midEntry) +{ +#ifdef CONFIG_CIFS_STATS2 + __le16 command = midEntry->server->vals->lock_cmd; + unsigned long now; +#endif + midEntry->mid_state = MID_FREE; + atomic_dec(&midCount); + if (midEntry->large_buf) + cifs_buf_release(midEntry->resp_buf); + else + cifs_small_buf_release(midEntry->resp_buf); +#ifdef CONFIG_CIFS_STATS2 + now = jiffies; + /* commands taking longer than one second are indications that + something is wrong, unless it is quite a slow link or server */ + if ((now - midEntry->when_alloc) > HZ) { + if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) { + pr_debug(" CIFS slow rsp: cmd %d mid %llu", + midEntry->command, midEntry->mid); + pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n", + now - midEntry->when_alloc, + now - midEntry->when_sent, + now - midEntry->when_received); + } + } +#endif + mempool_free(midEntry, cifs_mid_poolp); +} + +void +cifs_delete_mid(struct mid_q_entry *mid) +{ + spin_lock(&GlobalMid_Lock); + list_del(&mid->qhead); + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); +} + +/* + * smb_send_kvec - send an array of kvecs to the server + * @server: Server to send the data to + * @iov: Pointer to array of kvecs + * @n_vec: length of kvec array + * @sent: amount of data sent on socket is stored here + * + * Our basic "send data to server" function. Should be called with srv_mutex + * held. The caller is responsible for handling the results. + */ +static int +smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, + size_t *sent) +{ + int rc = 0; + int i = 0; + struct msghdr smb_msg; + unsigned int remaining; + size_t first_vec = 0; + struct socket *ssocket = server->ssocket; + + *sent = 0; + + smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; + smb_msg.msg_namelen = sizeof(struct sockaddr); + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + if (server->noblocksnd) + smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; + else + smb_msg.msg_flags = MSG_NOSIGNAL; + + remaining = 0; + for (i = 0; i < n_vec; i++) + remaining += iov[i].iov_len; + + i = 0; + while (remaining) { + /* + * If blocking send, we try 3 times, since each can block + * for 5 seconds. For nonblocking we have to try more + * but wait increasing amounts of time allowing time for + * socket to clear. The overall time we wait in either + * case to send on the socket is about 15 seconds. + * Similarly we wait for 15 seconds for a response from + * the server in SendReceive[2] for the server to send + * a response back for most types of requests (except + * SMB Write past end of file which can be slow, and + * blocking lock operations). NFS waits slightly longer + * than CIFS, but this can make it take longer for + * nonresponsive servers to be detected and 15 seconds + * is more than enough time for modern networks to + * send a packet. In most cases if we fail to send + * after the retries we will kill the socket and + * reconnect which may clear the network problem. + */ + rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], + n_vec - first_vec, remaining); + if (rc == -EAGAIN) { + i++; + if (i >= 14 || (!server->noblocksnd && (i > 2))) { + cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n", + ssocket); + rc = -EAGAIN; + break; + } + msleep(1 << i); + continue; + } + + if (rc < 0) + break; + + /* send was at least partially successful */ + *sent += rc; + + if (rc == remaining) { + remaining = 0; + break; + } + + if (rc > remaining) { + cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining); + break; + } + + if (rc == 0) { + /* should never happen, letting socket clear before + retrying is our only obvious option here */ + cifs_dbg(VFS, "tcp sent no data\n"); + msleep(500); + continue; + } + + remaining -= rc; + + /* the line below resets i */ + for (i = first_vec; i < n_vec; i++) { + if (iov[i].iov_len) { + if (rc > iov[i].iov_len) { + rc -= iov[i].iov_len; + iov[i].iov_len = 0; + } else { + iov[i].iov_base += rc; + iov[i].iov_len -= rc; + first_vec = i; + break; + } + } + } + + i = 0; /* in case we get ENOSPC on the next send */ + rc = 0; + } + return rc; +} + +/** + * rqst_page_to_kvec - Turn a slot in the smb_rqst page array into a kvec + * @rqst: pointer to smb_rqst + * @idx: index into the array of the page + * @iov: pointer to struct kvec that will hold the result + * + * Helper function to convert a slot in the rqst->rq_pages array into a kvec. + * The page will be kmapped and the address placed into iov_base. The length + * will then be adjusted according to the ptailoff. + */ +void +cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, + struct kvec *iov) +{ + /* + * FIXME: We could avoid this kmap altogether if we used + * kernel_sendpage instead of kernel_sendmsg. That will only + * work if signing is disabled though as sendpage inlines the + * page directly into the fraglist. If userspace modifies the + * page after we calculate the signature, then the server will + * reject it and may break the connection. kernel_sendmsg does + * an extra copy of the data and avoids that issue. + */ + iov->iov_base = kmap(rqst->rq_pages[idx]); + + /* if last page, don't send beyond this offset into page */ + if (idx == (rqst->rq_npages - 1)) + iov->iov_len = rqst->rq_tailsz; + else + iov->iov_len = rqst->rq_pagesz; +} + +static unsigned long +rqst_len(struct smb_rqst *rqst) +{ + unsigned int i; + struct kvec *iov = rqst->rq_iov; + unsigned long buflen = 0; + + /* total up iov array first */ + for (i = 0; i < rqst->rq_nvec; i++) + buflen += iov[i].iov_len; + + /* add in the page array if there is one */ + if (rqst->rq_npages) { + buflen += rqst->rq_pagesz * (rqst->rq_npages - 1); + buflen += rqst->rq_tailsz; + } + + return buflen; +} + +static int +smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) +{ + int rc; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; + unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); + unsigned long send_length; + unsigned int i; + size_t total_len = 0, sent; + struct socket *ssocket = server->ssocket; + int val = 1; + + if (ssocket == NULL) + return -ENOTSOCK; + + /* sanity check send length */ + send_length = rqst_len(rqst); + if (send_length != smb_buf_length + 4) { + WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n", + send_length, smb_buf_length); + return -EIO; + } + + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); + dump_smb(iov[0].iov_base, iov[0].iov_len); + + /* cork the socket */ + kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, + (char *)&val, sizeof(val)); + + rc = smb_send_kvec(server, iov, n_vec, &sent); + if (rc < 0) + goto uncork; + + total_len += sent; + + /* now walk the page array and send each page in it */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + rc = smb_send_kvec(server, &p_iov, 1, &sent); + kunmap(rqst->rq_pages[i]); + if (rc < 0) + break; + + total_len += sent; + } + +uncork: + /* uncork it */ + val = 0; + kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, + (char *)&val, sizeof(val)); + + if ((total_len > 0) && (total_len != smb_buf_length + 4)) { + cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", + smb_buf_length + 4, total_len); + /* + * If we have only sent part of an SMB then the next SMB could + * be taken as the remainder of this one. We need to kill the + * socket so the server throws away the partial SMB + */ + server->tcpStatus = CifsNeedReconnect; + } + + if (rc < 0 && rc != -EINTR) + cifs_dbg(VFS, "Error %d sending data on socket to server\n", + rc); + else + rc = 0; + + return rc; +} + +static int +smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) +{ + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = n_vec }; + + return smb_send_rqst(server, &rqst); +} + +int +smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, + unsigned int smb_buf_length) +{ + struct kvec iov; + + iov.iov_base = smb_buffer; + iov.iov_len = smb_buf_length + 4; + + return smb_sendv(server, &iov, 1); +} + +static int +wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, + int *credits) +{ + int rc; + + spin_lock(&server->req_lock); + if (timeout == CIFS_ASYNC_OP) { + /* oplock breaks must not be held up */ + server->in_flight++; + *credits -= 1; + spin_unlock(&server->req_lock); + return 0; + } + + while (1) { + if (*credits <= 0) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable(server->request_q, + has_credits(server, credits)); + cifs_num_waiters_dec(server); + if (rc) + return rc; + spin_lock(&server->req_lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->req_lock); + return -ENOENT; + } + + /* + * Can not count locking commands against total + * as they are allowed to block on server. + */ + + /* update # of requests on the wire to server */ + if (timeout != CIFS_BLOCKING_OP) { + *credits -= 1; + server->in_flight++; + } + spin_unlock(&server->req_lock); + break; + } + } + return 0; +} + +static int +wait_for_free_request(struct TCP_Server_Info *server, const int timeout, + const int optype) +{ + int *val; + + val = server->ops->get_credits_field(server, optype); + /* Since an echo is already inflight, no need to wait to send another */ + if (*val <= 0 && optype == CIFS_ECHO_OP) + return -EAGAIN; + return wait_for_free_credits(server, timeout, val); +} + +int +cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + *num = size; + *credits = 0; + return 0; +} + +static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, + struct mid_q_entry **ppmidQ) +{ + if (ses->server->tcpStatus == CifsExiting) { + return -ENOENT; + } + + if (ses->server->tcpStatus == CifsNeedReconnect) { + cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); + return -EAGAIN; + } + + if (ses->status == CifsNew) { + if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && + (in_buf->Command != SMB_COM_NEGOTIATE)) + return -EAGAIN; + /* else ok - we are setting up session */ + } + + if (ses->status == CifsExiting) { + /* check if SMB session is bad because we are setting it up */ + if (in_buf->Command != SMB_COM_LOGOFF_ANDX) + return -EAGAIN; + /* else ok - we are shutting down session */ + } + + *ppmidQ = AllocMidQEntry(in_buf, ses->server); + if (*ppmidQ == NULL) + return -ENOMEM; + spin_lock(&GlobalMid_Lock); + list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + return 0; +} + +static int +wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) +{ + int error; + + error = wait_event_freezekillable_unsafe(server->response_q, + midQ->mid_state != MID_REQUEST_SUBMITTED); + if (error < 0) + return -ERESTARTSYS; + + return 0; +} + +struct mid_q_entry * +cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) +{ + int rc; + struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base; + struct mid_q_entry *mid; + + /* enable signing if server requires it */ + if (server->sign) + hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + mid = AllocMidQEntry(hdr, server); + if (mid == NULL) + return ERR_PTR(-ENOMEM); + + rc = cifs_sign_rqst(rqst, server, &mid->sequence_number); + if (rc) { + DeleteMidQEntry(mid); + return ERR_PTR(rc); + } + + return mid; +} + +/* + * Send a SMB request and set the callback function in the mid to handle + * the result. Caller is responsible for dealing with timeouts. + */ +int +cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, + mid_receive_t *receive, mid_callback_t *callback, + void *cbdata, const int flags) +{ + int rc, timeout, optype; + struct mid_q_entry *mid; + unsigned int credits = 0; + + timeout = flags & CIFS_TIMEOUT_MASK; + optype = flags & CIFS_OP_MASK; + + if ((flags & CIFS_HAS_CREDITS) == 0) { + rc = wait_for_free_request(server, timeout, optype); + if (rc) + return rc; + credits = 1; + } + + mutex_lock(&server->srv_mutex); + mid = server->ops->setup_async_request(server, rqst); + if (IS_ERR(mid)) { + mutex_unlock(&server->srv_mutex); + add_credits_and_wake_if(server, credits, optype); + return PTR_ERR(mid); + } + + mid->receive = receive; + mid->callback = callback; + mid->callback_data = cbdata; + mid->mid_state = MID_REQUEST_SUBMITTED; + + /* put it on the pending_mid_q */ + spin_lock(&GlobalMid_Lock); + list_add_tail(&mid->qhead, &server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + + + cifs_in_send_inc(server); + rc = smb_send_rqst(server, rqst); + cifs_in_send_dec(server); + cifs_save_when_sent(mid); + + if (rc < 0) + server->sequence_number -= 2; + mutex_unlock(&server->srv_mutex); + + if (rc == 0) + return 0; + + cifs_delete_mid(mid); + add_credits_and_wake_if(server, credits, optype); + return rc; +} + +/* + * + * Send an SMB Request. No response info (other than return code) + * needs to be parsed. + * + * flags indicate the type of request buffer and how long to wait + * and whether to log NT STATUS code (error) before mapping it to POSIX error + * + */ +int +SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, + char *in_buf, int flags) +{ + int rc; + struct kvec iov[1]; + int resp_buf_type; + + iov[0].iov_base = in_buf; + iov[0].iov_len = get_rfc1002_length(in_buf) + 4; + flags |= CIFS_NO_RESP; + rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); + cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc); + + return rc; +} + +static int +cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) +{ + int rc = 0; + + cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n", + __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state); + + spin_lock(&GlobalMid_Lock); + switch (mid->mid_state) { + case MID_RESPONSE_RECEIVED: + spin_unlock(&GlobalMid_Lock); + return rc; + case MID_RETRY_NEEDED: + rc = -EAGAIN; + break; + case MID_RESPONSE_MALFORMED: + rc = -EIO; + break; + case MID_SHUTDOWN: + rc = -EHOSTDOWN; + break; + default: + list_del_init(&mid->qhead); + cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n", + __func__, mid->mid, mid->mid_state); + rc = -EIO; + } + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); + return rc; +} + +static inline int +send_cancel(struct TCP_Server_Info *server, void *buf, struct mid_q_entry *mid) +{ + return server->ops->send_cancel ? + server->ops->send_cancel(server, buf, mid) : 0; +} + +int +cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, + bool log_error) +{ + unsigned int len = get_rfc1002_length(mid->resp_buf) + 4; + + dump_smb(mid->resp_buf, min_t(u32, 92, len)); + + /* convert the length into a more usable form */ + if (server->sign) { + struct kvec iov; + int rc = 0; + struct smb_rqst rqst = { .rq_iov = &iov, + .rq_nvec = 1 }; + + iov.iov_base = mid->resp_buf; + iov.iov_len = len; + /* FIXME: add code to kill session */ + rc = cifs_verify_signature(&rqst, server, + mid->sequence_number); + if (rc) + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); + } + + /* BB special case reconnect tid and uid here? */ + return map_smb_to_linux_error(mid->resp_buf, log_error); +} + +struct mid_q_entry * +cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) +{ + int rc; + struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base; + struct mid_q_entry *mid; + + rc = allocate_mid(ses, hdr, &mid); + if (rc) + return ERR_PTR(rc); + rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number); + if (rc) { + cifs_delete_mid(mid); + return ERR_PTR(rc); + } + return mid; +} + +int +SendReceive2(const unsigned int xid, struct cifs_ses *ses, + struct kvec *iov, int n_vec, int *resp_buf_type /* ret */, + const int flags) +{ + int rc = 0; + int timeout, optype; + struct mid_q_entry *midQ; + char *buf = iov[0].iov_base; + unsigned int credits = 1; + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = n_vec }; + + timeout = flags & CIFS_TIMEOUT_MASK; + optype = flags & CIFS_OP_MASK; + + *resp_buf_type = CIFS_NO_BUFFER; /* no response buf yet */ + + if ((ses == NULL) || (ses->server == NULL)) { + cifs_small_buf_release(buf); + cifs_dbg(VFS, "Null session\n"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) { + cifs_small_buf_release(buf); + return -ENOENT; + } + + /* + * Ensure that we do not send more than 50 overlapping requests + * to the same server. We may make this configurable later or + * use ses->maxReq. + */ + + rc = wait_for_free_request(ses->server, timeout, optype); + if (rc) { + cifs_small_buf_release(buf); + return rc; + } + + /* + * Make sure that we sign in the same order that we send on this socket + * and avoid races inside tcp sendmsg code that could cause corruption + * of smb data. + */ + + mutex_lock(&ses->server->srv_mutex); + + midQ = ses->server->ops->setup_request(ses, &rqst); + if (IS_ERR(midQ)) { + mutex_unlock(&ses->server->srv_mutex); + cifs_small_buf_release(buf); + /* Update # of requests on wire to server */ + add_credits(ses->server, 1, optype); + return PTR_ERR(midQ); + } + + midQ->mid_state = MID_REQUEST_SUBMITTED; + cifs_in_send_inc(ses->server); + rc = smb_sendv(ses->server, iov, n_vec); + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); + + if (rc < 0) + ses->server->sequence_number -= 2; + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) { + cifs_small_buf_release(buf); + goto out; + } + + if (timeout == CIFS_ASYNC_OP) { + cifs_small_buf_release(buf); + goto out; + } + + rc = wait_for_response(ses->server, midQ); + if (rc != 0) { + send_cancel(ses->server, buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + cifs_small_buf_release(buf); + add_credits(ses->server, 1, optype); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + cifs_small_buf_release(buf); + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) { + add_credits(ses->server, 1, optype); + return rc; + } + + if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cifs_dbg(FYI, "Bad MID state?\n"); + goto out; + } + + buf = (char *)midQ->resp_buf; + iov[0].iov_base = buf; + iov[0].iov_len = get_rfc1002_length(buf) + 4; + if (midQ->large_buf) + *resp_buf_type = CIFS_LARGE_BUFFER; + else + *resp_buf_type = CIFS_SMALL_BUFFER; + + credits = ses->server->ops->get_credits(midQ); + + rc = ses->server->ops->check_receive(midQ, ses->server, + flags & CIFS_LOG_ERROR); + + /* mark it so buf will not be freed by cifs_delete_mid */ + if ((flags & CIFS_NO_RESP) == 0) + midQ->resp_buf = NULL; +out: + cifs_delete_mid(midQ); + add_credits(ses->server, credits, optype); + + return rc; +} + +int +SendReceive(const unsigned int xid, struct cifs_ses *ses, + struct smb_hdr *in_buf, struct smb_hdr *out_buf, + int *pbytes_returned, const int timeout) +{ + int rc = 0; + struct mid_q_entry *midQ; + + if (ses == NULL) { + cifs_dbg(VFS, "Null smb session\n"); + return -EIO; + } + if (ses->server == NULL) { + cifs_dbg(VFS, "Null tcp session\n"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) + return -ENOENT; + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE - 4) { + cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", + be32_to_cpu(in_buf->smb_buf_length)); + return -EIO; + } + + rc = wait_for_free_request(ses->server, timeout, 0); + if (rc) + return rc; + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + mutex_lock(&ses->server->srv_mutex); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + /* Update # of requests on wire to server */ + add_credits(ses->server, 1, 0); + return rc; + } + + rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + goto out; + } + + midQ->mid_state = MID_REQUEST_SUBMITTED; + + cifs_in_send_inc(ses->server); + rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); + + if (rc < 0) + ses->server->sequence_number -= 2; + + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) + goto out; + + if (timeout == CIFS_ASYNC_OP) + goto out; + + rc = wait_for_response(ses->server, midQ); + if (rc != 0) { + send_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + add_credits(ses->server, 1, 0); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) { + add_credits(ses->server, 1, 0); + return rc; + } + + if (!midQ->resp_buf || !out_buf || + midQ->mid_state != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cifs_dbg(VFS, "Bad MID state?\n"); + goto out; + } + + *pbytes_returned = get_rfc1002_length(midQ->resp_buf); + memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); + rc = cifs_check_receive(midQ, ses->server, 0); +out: + cifs_delete_mid(midQ); + add_credits(ses->server, 1, 0); + + return rc; +} + +/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows + blocking lock to return. */ + +static int +send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon, + struct smb_hdr *in_buf, + struct smb_hdr *out_buf) +{ + int bytes_returned; + struct cifs_ses *ses = tcon->ses; + LOCK_REQ *pSMB = (LOCK_REQ *)in_buf; + + /* We just modify the current in_buf to change + the type of lock from LOCKING_ANDX_SHARED_LOCK + or LOCKING_ANDX_EXCLUSIVE_LOCK to + LOCKING_ANDX_CANCEL_LOCK. */ + + pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES; + pSMB->Timeout = 0; + pSMB->hdr.Mid = get_next_mid(ses->server); + + return SendReceive(xid, ses, in_buf, out_buf, + &bytes_returned, 0); +} + +int +SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, + struct smb_hdr *in_buf, struct smb_hdr *out_buf, + int *pbytes_returned) +{ + int rc = 0; + int rstart = 0; + struct mid_q_entry *midQ; + struct cifs_ses *ses; + + if (tcon == NULL || tcon->ses == NULL) { + cifs_dbg(VFS, "Null smb session\n"); + return -EIO; + } + ses = tcon->ses; + + if (ses->server == NULL) { + cifs_dbg(VFS, "Null tcp session\n"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) + return -ENOENT; + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE - 4) { + cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", + be32_to_cpu(in_buf->smb_buf_length)); + return -EIO; + } + + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0); + if (rc) + return rc; + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + mutex_lock(&ses->server->srv_mutex); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + return rc; + } + + rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + if (rc) { + cifs_delete_mid(midQ); + mutex_unlock(&ses->server->srv_mutex); + return rc; + } + + midQ->mid_state = MID_REQUEST_SUBMITTED; + cifs_in_send_inc(ses->server); + rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); + + if (rc < 0) + ses->server->sequence_number -= 2; + + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) { + cifs_delete_mid(midQ); + return rc; + } + + /* Wait for a reply - allow signals to interrupt. */ + rc = wait_event_interruptible(ses->server->response_q, + (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) || + ((ses->server->tcpStatus != CifsGood) && + (ses->server->tcpStatus != CifsNew))); + + /* Were we interrupted by a signal ? */ + if ((rc == -ERESTARTSYS) && + (midQ->mid_state == MID_REQUEST_SUBMITTED) && + ((ses->server->tcpStatus == CifsGood) || + (ses->server->tcpStatus == CifsNew))) { + + if (in_buf->Command == SMB_COM_TRANSACTION2) { + /* POSIX lock. We send a NT_CANCEL SMB to cause the + blocking lock to return. */ + rc = send_cancel(ses->server, in_buf, midQ); + if (rc) { + cifs_delete_mid(midQ); + return rc; + } + } else { + /* Windows lock. We send a LOCKINGX_CANCEL_LOCK + to cause the blocking lock to return. */ + + rc = send_lock_cancel(xid, tcon, in_buf, out_buf); + + /* If we get -ENOLCK back the lock may have + already been removed. Don't exit in this case. */ + if (rc && rc != -ENOLCK) { + cifs_delete_mid(midQ); + return rc; + } + } + + rc = wait_for_response(ses->server, midQ); + if (rc) { + send_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + /* We got the response - restart system call. */ + rstart = 1; + } + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) + return rc; + + /* rcvd frame is ok */ + if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cifs_dbg(VFS, "Bad MID state?\n"); + goto out; + } + + *pbytes_returned = get_rfc1002_length(midQ->resp_buf); + memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); + rc = cifs_check_receive(midQ, ses->server, 0); +out: + cifs_delete_mid(midQ); + if (rstart && rc == -EACCES) + return -ERESTARTSYS; + return rc; +} diff --git a/kernel/fs/cifs/winucase.c b/kernel/fs/cifs/winucase.c new file mode 100644 index 000000000..1506d4fdd --- /dev/null +++ b/kernel/fs/cifs/winucase.c @@ -0,0 +1,663 @@ +/* + * fs/cifs/winucase.c + * + * Copyright (c) Jeffrey Layton , 2013 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * The const tables in this file were converted from the following info + * provided by Microsoft: + * + * 3.1.5.3 Mapping UTF-16 Strings to Upper Case: + * + * http://msdn.microsoft.com/en-us/library/hh877830.aspx + * http://www.microsoft.com/en-us/download/details.aspx?displaylang=en&id=10921 + * + * In particular, the table in "Windows 8 Upper Case Mapping Table.txt" was + * post-processed using the winucase_convert.pl script. + */ + +#include + +wchar_t cifs_toupper(wchar_t in); /* quiet sparse */ + +static const wchar_t t2_00[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0000, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178, +}; + +static const wchar_t t2_01[256] = { + 0x0000, 0x0100, 0x0000, 0x0102, 0x0000, 0x0104, 0x0000, 0x0106, + 0x0000, 0x0108, 0x0000, 0x010a, 0x0000, 0x010c, 0x0000, 0x010e, + 0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0000, 0x0116, + 0x0000, 0x0118, 0x0000, 0x011a, 0x0000, 0x011c, 0x0000, 0x011e, + 0x0000, 0x0120, 0x0000, 0x0122, 0x0000, 0x0124, 0x0000, 0x0126, + 0x0000, 0x0128, 0x0000, 0x012a, 0x0000, 0x012c, 0x0000, 0x012e, + 0x0000, 0x0000, 0x0000, 0x0132, 0x0000, 0x0134, 0x0000, 0x0136, + 0x0000, 0x0000, 0x0139, 0x0000, 0x013b, 0x0000, 0x013d, 0x0000, + 0x013f, 0x0000, 0x0141, 0x0000, 0x0143, 0x0000, 0x0145, 0x0000, + 0x0147, 0x0000, 0x0000, 0x014a, 0x0000, 0x014c, 0x0000, 0x014e, + 0x0000, 0x0150, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156, + 0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x0000, 0x015e, + 0x0000, 0x0160, 0x0000, 0x0162, 0x0000, 0x0164, 0x0000, 0x0166, + 0x0000, 0x0168, 0x0000, 0x016a, 0x0000, 0x016c, 0x0000, 0x016e, + 0x0000, 0x0170, 0x0000, 0x0172, 0x0000, 0x0174, 0x0000, 0x0176, + 0x0000, 0x0000, 0x0179, 0x0000, 0x017b, 0x0000, 0x017d, 0x0000, + 0x0243, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0000, 0x0000, + 0x0187, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x01f6, 0x0000, 0x0000, + 0x0000, 0x0198, 0x023d, 0x0000, 0x0000, 0x0000, 0x0220, 0x0000, + 0x0000, 0x01a0, 0x0000, 0x01a2, 0x0000, 0x01a4, 0x0000, 0x0000, + 0x01a7, 0x0000, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0000, + 0x01af, 0x0000, 0x0000, 0x0000, 0x01b3, 0x0000, 0x01b5, 0x0000, + 0x0000, 0x01b8, 0x0000, 0x0000, 0x0000, 0x01bc, 0x0000, 0x01f7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01c4, 0x0000, + 0x0000, 0x01c7, 0x0000, 0x0000, 0x01ca, 0x0000, 0x01cd, 0x0000, + 0x01cf, 0x0000, 0x01d1, 0x0000, 0x01d3, 0x0000, 0x01d5, 0x0000, + 0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x018e, 0x0000, 0x01de, + 0x0000, 0x01e0, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6, + 0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee, + 0x0000, 0x0000, 0x0000, 0x01f1, 0x0000, 0x01f4, 0x0000, 0x0000, + 0x0000, 0x01f8, 0x0000, 0x01fa, 0x0000, 0x01fc, 0x0000, 0x01fe, +}; + +static const wchar_t t2_02[256] = { + 0x0000, 0x0200, 0x0000, 0x0202, 0x0000, 0x0204, 0x0000, 0x0206, + 0x0000, 0x0208, 0x0000, 0x020a, 0x0000, 0x020c, 0x0000, 0x020e, + 0x0000, 0x0210, 0x0000, 0x0212, 0x0000, 0x0214, 0x0000, 0x0216, + 0x0000, 0x0218, 0x0000, 0x021a, 0x0000, 0x021c, 0x0000, 0x021e, + 0x0000, 0x0000, 0x0000, 0x0222, 0x0000, 0x0224, 0x0000, 0x0226, + 0x0000, 0x0228, 0x0000, 0x022a, 0x0000, 0x022c, 0x0000, 0x022e, + 0x0000, 0x0230, 0x0000, 0x0232, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x023b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0241, 0x0000, 0x0000, 0x0000, 0x0000, 0x0246, + 0x0000, 0x0248, 0x0000, 0x024a, 0x0000, 0x024c, 0x0000, 0x024e, + 0x2c6f, 0x2c6d, 0x0000, 0x0181, 0x0186, 0x0000, 0x0189, 0x018a, + 0x0000, 0x018f, 0x0000, 0x0190, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0193, 0x0000, 0x0000, 0x0194, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0197, 0x0196, 0x0000, 0x2c62, 0x0000, 0x0000, 0x0000, 0x019c, + 0x0000, 0x2c6e, 0x019d, 0x0000, 0x0000, 0x019f, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c64, 0x0000, 0x0000, + 0x01a6, 0x0000, 0x0000, 0x01a9, 0x0000, 0x0000, 0x0000, 0x0000, + 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x01b7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_03[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0370, 0x0000, 0x0372, 0x0000, 0x0000, 0x0000, 0x0376, + 0x0000, 0x0000, 0x0000, 0x03fd, 0x03fe, 0x03ff, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0388, 0x0389, 0x038a, + 0x0000, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, + 0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x03cf, + 0x0000, 0x03d8, 0x0000, 0x03da, 0x0000, 0x03dc, 0x0000, 0x03de, + 0x0000, 0x03e0, 0x0000, 0x03e2, 0x0000, 0x03e4, 0x0000, 0x03e6, + 0x0000, 0x03e8, 0x0000, 0x03ea, 0x0000, 0x03ec, 0x0000, 0x03ee, + 0x0000, 0x0000, 0x03f9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x03f7, 0x0000, 0x0000, 0x03fa, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_04[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, + 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, + 0x0000, 0x0460, 0x0000, 0x0462, 0x0000, 0x0464, 0x0000, 0x0466, + 0x0000, 0x0468, 0x0000, 0x046a, 0x0000, 0x046c, 0x0000, 0x046e, + 0x0000, 0x0470, 0x0000, 0x0472, 0x0000, 0x0474, 0x0000, 0x0476, + 0x0000, 0x0478, 0x0000, 0x047a, 0x0000, 0x047c, 0x0000, 0x047e, + 0x0000, 0x0480, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x048a, 0x0000, 0x048c, 0x0000, 0x048e, + 0x0000, 0x0490, 0x0000, 0x0492, 0x0000, 0x0494, 0x0000, 0x0496, + 0x0000, 0x0498, 0x0000, 0x049a, 0x0000, 0x049c, 0x0000, 0x049e, + 0x0000, 0x04a0, 0x0000, 0x04a2, 0x0000, 0x04a4, 0x0000, 0x04a6, + 0x0000, 0x04a8, 0x0000, 0x04aa, 0x0000, 0x04ac, 0x0000, 0x04ae, + 0x0000, 0x04b0, 0x0000, 0x04b2, 0x0000, 0x04b4, 0x0000, 0x04b6, + 0x0000, 0x04b8, 0x0000, 0x04ba, 0x0000, 0x04bc, 0x0000, 0x04be, + 0x0000, 0x0000, 0x04c1, 0x0000, 0x04c3, 0x0000, 0x04c5, 0x0000, + 0x04c7, 0x0000, 0x04c9, 0x0000, 0x04cb, 0x0000, 0x04cd, 0x04c0, + 0x0000, 0x04d0, 0x0000, 0x04d2, 0x0000, 0x04d4, 0x0000, 0x04d6, + 0x0000, 0x04d8, 0x0000, 0x04da, 0x0000, 0x04dc, 0x0000, 0x04de, + 0x0000, 0x04e0, 0x0000, 0x04e2, 0x0000, 0x04e4, 0x0000, 0x04e6, + 0x0000, 0x04e8, 0x0000, 0x04ea, 0x0000, 0x04ec, 0x0000, 0x04ee, + 0x0000, 0x04f0, 0x0000, 0x04f2, 0x0000, 0x04f4, 0x0000, 0x04f6, + 0x0000, 0x04f8, 0x0000, 0x04fa, 0x0000, 0x04fc, 0x0000, 0x04fe, +}; + +static const wchar_t t2_05[256] = { + 0x0000, 0x0500, 0x0000, 0x0502, 0x0000, 0x0504, 0x0000, 0x0506, + 0x0000, 0x0508, 0x0000, 0x050a, 0x0000, 0x050c, 0x0000, 0x050e, + 0x0000, 0x0510, 0x0000, 0x0512, 0x0000, 0x0514, 0x0000, 0x0516, + 0x0000, 0x0518, 0x0000, 0x051a, 0x0000, 0x051c, 0x0000, 0x051e, + 0x0000, 0x0520, 0x0000, 0x0522, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, + 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, + 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, + 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_1d[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa77d, 0x0000, 0x0000, 0x0000, 0x2c63, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_1e[256] = { + 0x0000, 0x1e00, 0x0000, 0x1e02, 0x0000, 0x1e04, 0x0000, 0x1e06, + 0x0000, 0x1e08, 0x0000, 0x1e0a, 0x0000, 0x1e0c, 0x0000, 0x1e0e, + 0x0000, 0x1e10, 0x0000, 0x1e12, 0x0000, 0x1e14, 0x0000, 0x1e16, + 0x0000, 0x1e18, 0x0000, 0x1e1a, 0x0000, 0x1e1c, 0x0000, 0x1e1e, + 0x0000, 0x1e20, 0x0000, 0x1e22, 0x0000, 0x1e24, 0x0000, 0x1e26, + 0x0000, 0x1e28, 0x0000, 0x1e2a, 0x0000, 0x1e2c, 0x0000, 0x1e2e, + 0x0000, 0x1e30, 0x0000, 0x1e32, 0x0000, 0x1e34, 0x0000, 0x1e36, + 0x0000, 0x1e38, 0x0000, 0x1e3a, 0x0000, 0x1e3c, 0x0000, 0x1e3e, + 0x0000, 0x1e40, 0x0000, 0x1e42, 0x0000, 0x1e44, 0x0000, 0x1e46, + 0x0000, 0x1e48, 0x0000, 0x1e4a, 0x0000, 0x1e4c, 0x0000, 0x1e4e, + 0x0000, 0x1e50, 0x0000, 0x1e52, 0x0000, 0x1e54, 0x0000, 0x1e56, + 0x0000, 0x1e58, 0x0000, 0x1e5a, 0x0000, 0x1e5c, 0x0000, 0x1e5e, + 0x0000, 0x1e60, 0x0000, 0x1e62, 0x0000, 0x1e64, 0x0000, 0x1e66, + 0x0000, 0x1e68, 0x0000, 0x1e6a, 0x0000, 0x1e6c, 0x0000, 0x1e6e, + 0x0000, 0x1e70, 0x0000, 0x1e72, 0x0000, 0x1e74, 0x0000, 0x1e76, + 0x0000, 0x1e78, 0x0000, 0x1e7a, 0x0000, 0x1e7c, 0x0000, 0x1e7e, + 0x0000, 0x1e80, 0x0000, 0x1e82, 0x0000, 0x1e84, 0x0000, 0x1e86, + 0x0000, 0x1e88, 0x0000, 0x1e8a, 0x0000, 0x1e8c, 0x0000, 0x1e8e, + 0x0000, 0x1e90, 0x0000, 0x1e92, 0x0000, 0x1e94, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1ea0, 0x0000, 0x1ea2, 0x0000, 0x1ea4, 0x0000, 0x1ea6, + 0x0000, 0x1ea8, 0x0000, 0x1eaa, 0x0000, 0x1eac, 0x0000, 0x1eae, + 0x0000, 0x1eb0, 0x0000, 0x1eb2, 0x0000, 0x1eb4, 0x0000, 0x1eb6, + 0x0000, 0x1eb8, 0x0000, 0x1eba, 0x0000, 0x1ebc, 0x0000, 0x1ebe, + 0x0000, 0x1ec0, 0x0000, 0x1ec2, 0x0000, 0x1ec4, 0x0000, 0x1ec6, + 0x0000, 0x1ec8, 0x0000, 0x1eca, 0x0000, 0x1ecc, 0x0000, 0x1ece, + 0x0000, 0x1ed0, 0x0000, 0x1ed2, 0x0000, 0x1ed4, 0x0000, 0x1ed6, + 0x0000, 0x1ed8, 0x0000, 0x1eda, 0x0000, 0x1edc, 0x0000, 0x1ede, + 0x0000, 0x1ee0, 0x0000, 0x1ee2, 0x0000, 0x1ee4, 0x0000, 0x1ee6, + 0x0000, 0x1ee8, 0x0000, 0x1eea, 0x0000, 0x1eec, 0x0000, 0x1eee, + 0x0000, 0x1ef0, 0x0000, 0x1ef2, 0x0000, 0x1ef4, 0x0000, 0x1ef6, + 0x0000, 0x1ef8, 0x0000, 0x1efa, 0x0000, 0x1efc, 0x0000, 0x1efe, +}; + +static const wchar_t t2_1f[256] = { + 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1f59, 0x0000, 0x1f5b, 0x0000, 0x1f5d, 0x0000, 0x1f5f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fda, 0x1fdb, + 0x1ff8, 0x1ff9, 0x1fea, 0x1feb, 0x1ffa, 0x1ffb, 0x0000, 0x0000, + 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fb8, 0x1fb9, 0x0000, 0x1fbc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x1fcc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fd8, 0x1fd9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fe8, 0x1fe9, 0x0000, 0x0000, 0x0000, 0x1fec, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x1ffc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_21[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2132, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, + 0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2183, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_24[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, 0x24bb, 0x24bc, 0x24bd, + 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, 0x24c3, 0x24c4, 0x24c5, + 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, 0x24cb, 0x24cc, 0x24cd, + 0x24ce, 0x24cf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_2c[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2c00, 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07, + 0x2c08, 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f, + 0x2c10, 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17, + 0x2c18, 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f, + 0x2c20, 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27, + 0x2c28, 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x0000, + 0x0000, 0x2c60, 0x0000, 0x0000, 0x0000, 0x023a, 0x023e, 0x0000, + 0x2c67, 0x0000, 0x2c69, 0x0000, 0x2c6b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2c72, 0x0000, 0x0000, 0x2c75, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2c80, 0x0000, 0x2c82, 0x0000, 0x2c84, 0x0000, 0x2c86, + 0x0000, 0x2c88, 0x0000, 0x2c8a, 0x0000, 0x2c8c, 0x0000, 0x2c8e, + 0x0000, 0x2c90, 0x0000, 0x2c92, 0x0000, 0x2c94, 0x0000, 0x2c96, + 0x0000, 0x2c98, 0x0000, 0x2c9a, 0x0000, 0x2c9c, 0x0000, 0x2c9e, + 0x0000, 0x2ca0, 0x0000, 0x2ca2, 0x0000, 0x2ca4, 0x0000, 0x2ca6, + 0x0000, 0x2ca8, 0x0000, 0x2caa, 0x0000, 0x2cac, 0x0000, 0x2cae, + 0x0000, 0x2cb0, 0x0000, 0x2cb2, 0x0000, 0x2cb4, 0x0000, 0x2cb6, + 0x0000, 0x2cb8, 0x0000, 0x2cba, 0x0000, 0x2cbc, 0x0000, 0x2cbe, + 0x0000, 0x2cc0, 0x0000, 0x2cc2, 0x0000, 0x2cc4, 0x0000, 0x2cc6, + 0x0000, 0x2cc8, 0x0000, 0x2cca, 0x0000, 0x2ccc, 0x0000, 0x2cce, + 0x0000, 0x2cd0, 0x0000, 0x2cd2, 0x0000, 0x2cd4, 0x0000, 0x2cd6, + 0x0000, 0x2cd8, 0x0000, 0x2cda, 0x0000, 0x2cdc, 0x0000, 0x2cde, + 0x0000, 0x2ce0, 0x0000, 0x2ce2, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_2d[256] = { + 0x10a0, 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7, + 0x10a8, 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af, + 0x10b0, 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7, + 0x10b8, 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf, + 0x10c0, 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_a6[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa640, 0x0000, 0xa642, 0x0000, 0xa644, 0x0000, 0xa646, + 0x0000, 0xa648, 0x0000, 0xa64a, 0x0000, 0xa64c, 0x0000, 0xa64e, + 0x0000, 0xa650, 0x0000, 0xa652, 0x0000, 0xa654, 0x0000, 0xa656, + 0x0000, 0xa658, 0x0000, 0xa65a, 0x0000, 0xa65c, 0x0000, 0xa65e, + 0x0000, 0x0000, 0x0000, 0xa662, 0x0000, 0xa664, 0x0000, 0xa666, + 0x0000, 0xa668, 0x0000, 0xa66a, 0x0000, 0xa66c, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa680, 0x0000, 0xa682, 0x0000, 0xa684, 0x0000, 0xa686, + 0x0000, 0xa688, 0x0000, 0xa68a, 0x0000, 0xa68c, 0x0000, 0xa68e, + 0x0000, 0xa690, 0x0000, 0xa692, 0x0000, 0xa694, 0x0000, 0xa696, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_a7[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0xa722, 0x0000, 0xa724, 0x0000, 0xa726, + 0x0000, 0xa728, 0x0000, 0xa72a, 0x0000, 0xa72c, 0x0000, 0xa72e, + 0x0000, 0x0000, 0x0000, 0xa732, 0x0000, 0xa734, 0x0000, 0xa736, + 0x0000, 0xa738, 0x0000, 0xa73a, 0x0000, 0xa73c, 0x0000, 0xa73e, + 0x0000, 0xa740, 0x0000, 0xa742, 0x0000, 0xa744, 0x0000, 0xa746, + 0x0000, 0xa748, 0x0000, 0xa74a, 0x0000, 0xa74c, 0x0000, 0xa74e, + 0x0000, 0xa750, 0x0000, 0xa752, 0x0000, 0xa754, 0x0000, 0xa756, + 0x0000, 0xa758, 0x0000, 0xa75a, 0x0000, 0xa75c, 0x0000, 0xa75e, + 0x0000, 0xa760, 0x0000, 0xa762, 0x0000, 0xa764, 0x0000, 0xa766, + 0x0000, 0xa768, 0x0000, 0xa76a, 0x0000, 0xa76c, 0x0000, 0xa76e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0xa779, 0x0000, 0xa77b, 0x0000, 0x0000, 0xa77e, + 0x0000, 0xa780, 0x0000, 0xa782, 0x0000, 0xa784, 0x0000, 0xa786, + 0x0000, 0x0000, 0x0000, 0x0000, 0xa78b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_ff[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, + 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f, + 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, + 0xff38, 0xff39, 0xff3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t *const toplevel[256] = { + t2_00, t2_01, t2_02, t2_03, t2_04, t2_05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, t2_1d, t2_1e, t2_1f, + NULL, t2_21, NULL, NULL, t2_24, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, t2_2c, t2_2d, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, t2_a6, t2_a7, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, t2_ff, +}; + +/** + * cifs_toupper - convert a wchar_t from lower to uppercase + * @in: character to convert from lower to uppercase + * + * This function consults the static tables above to convert a wchar_t from + * lower to uppercase. In the event that there is no mapping, the original + * "in" character is returned. + */ +wchar_t +cifs_toupper(wchar_t in) +{ + unsigned char idx; + const wchar_t *tbl; + wchar_t out; + + /* grab upper byte */ + idx = (in & 0xff00) >> 8; + + /* find pointer to 2nd layer table */ + tbl = toplevel[idx]; + if (!tbl) + return in; + + /* grab lower byte */ + idx = in & 0xff; + + /* look up character in table */ + out = tbl[idx]; + if (out) + return out; + + return in; +} diff --git a/kernel/fs/cifs/xattr.c b/kernel/fs/cifs/xattr.c new file mode 100644 index 000000000..ff9e1f8b1 --- /dev/null +++ b/kernel/fs/cifs/xattr.c @@ -0,0 +1,424 @@ +/* + * fs/cifs/xattr.c + * + * Copyright (c) International Business Machines Corp., 2003, 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" + +#define MAX_EA_VALUE_SIZE 65535 +#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" +#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" + +/* BB need to add server (Samba e.g) support for security and trusted prefix */ + +int cifs_removexattr(struct dentry *direntry, const char *ea_name) +{ + int rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + unsigned int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path = NULL; + + if (direntry == NULL) + return -EIO; + if (d_really_is_negative(direntry)) + return -EIO; + sb = d_inode(direntry)->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto remove_ea_exit; + } + if (ea_name == NULL) { + cifs_dbg(FYI, "Null xattr names not supported\n"); + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { + cifs_dbg(FYI, + "illegal xattr request %s (only user namespace supported)\n", + ea_name); + /* BB what if no namespace prefix? */ + /* Should we just pass them to server, except for + system and perhaps security prefixes? */ + } else { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto remove_ea_exit; + + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, NULL, (__u16)0, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + } +remove_ea_exit: + kfree(full_path); + free_xid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +int cifs_setxattr(struct dentry *direntry, const char *ea_name, + const void *ea_value, size_t value_size, int flags) +{ + int rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + unsigned int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + + if (direntry == NULL) + return -EIO; + if (d_really_is_negative(direntry)) + return -EIO; + sb = d_inode(direntry)->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto set_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + if (value_size > MAX_EA_VALUE_SIZE) { + cifs_dbg(FYI, "size of EA value too large\n"); + rc = -EOPNOTSUPP; + goto set_ea_exit; + } + + if (ea_name == NULL) { + cifs_dbg(FYI, "Null xattr names not supported\n"); + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto set_ea_exit; + if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) + cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); + + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) + == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto set_ea_exit; + + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL + struct cifs_ntsd *pacl; + pacl = kmalloc(value_size, GFP_KERNEL); + if (!pacl) { + rc = -ENOMEM; + } else { + memcpy(pacl, ea_value, value_size); + if (pTcon->ses->server->ops->set_acl) + rc = pTcon->ses->server->ops->set_acl(pacl, + value_size, d_inode(direntry), + full_path, CIFS_ACL_DACL); + else + rc = -EOPNOTSUPP; + if (rc == 0) /* force revalidate of the inode */ + CIFS_I(d_inode(direntry))->time = 0; + kfree(pacl); + } +#else + cifs_dbg(FYI, "Set CIFS ACL not supported yet\n"); +#endif /* CONFIG_CIFS_ACL */ + } else { + int temp; + temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + if (temp == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, + ea_value, (const int)value_size, + ACL_TYPE_ACCESS, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc); +#else + cifs_dbg(FYI, "set POSIX ACL not supported\n"); +#endif + } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, + ea_value, (const int)value_size, + ACL_TYPE_DEFAULT, cifs_sb->local_nls, + cifs_remap(cifs_sb)); + cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc); +#else + cifs_dbg(FYI, "set default POSIX ACL not supported\n"); +#endif + } else { + cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n", + ea_name); + /* BB what if no namespace prefix? */ + /* Should we just pass them to server, except for + system and perhaps security prefixes? */ + } + } + +set_ea_exit: + kfree(full_path); + free_xid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, + void *ea_value, size_t buf_size) +{ + ssize_t rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + unsigned int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + + if (direntry == NULL) + return -EIO; + if (d_really_is_negative(direntry)) + return -EIO; + sb = d_inode(direntry)->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto get_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + if (ea_name == NULL) { + cifs_dbg(FYI, "Null xattr names not supported\n"); + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto get_ea_exit; + + if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) { + cifs_dbg(FYI, "attempt to query cifs inode metadata\n"); + /* revalidate/getattr then populate from inode */ + } /* BB add else when above is implemented */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto get_ea_exit; + + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, + ea_value, buf_size, ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); +#else + cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, + ea_value, buf_size, ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); +#else + cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL + u32 acllen; + struct cifs_ntsd *pacl; + + if (pTcon->ses->server->ops->get_acl == NULL) + goto get_ea_exit; /* rc already EOPNOTSUPP */ + + pacl = pTcon->ses->server->ops->get_acl(cifs_sb, + d_inode(direntry), full_path, &acllen); + if (IS_ERR(pacl)) { + rc = PTR_ERR(pacl); + cifs_dbg(VFS, "%s: error %zd getting sec desc\n", + __func__, rc); + } else { + if (ea_value) { + if (acllen > buf_size) + acllen = -ERANGE; + else + memcpy(ea_value, pacl, acllen); + } + rc = acllen; + kfree(pacl); + } +#else + cifs_dbg(FYI, "Query CIFS ACL not supported yet\n"); +#endif /* CONFIG_CIFS_ACL */ + } else if (strncmp(ea_name, + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { + cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n"); + } else if (strncmp(ea_name, + XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { + cifs_dbg(FYI, "Security xattr namespace not supported yet\n"); + } else + cifs_dbg(FYI, + "illegal xattr request %s (only user namespace supported)\n", + ea_name); + + /* We could add an additional check for streams ie + if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + + if (rc == -EINVAL) + rc = -EOPNOTSUPP; + +get_ea_exit: + kfree(full_path); + free_xid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) +{ + ssize_t rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + unsigned int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + + if (direntry == NULL) + return -EIO; + if (d_really_is_negative(direntry)) + return -EIO; + sb = d_inode(direntry)->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + return -EOPNOTSUPP; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto list_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, NULL, data, buf_size, + cifs_sb->local_nls, cifs_remap(cifs_sb)); +list_ea_exit: + kfree(full_path); + free_xid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} diff --git a/kernel/fs/coda/Kconfig b/kernel/fs/coda/Kconfig new file mode 100644 index 000000000..c0e5a7fad --- /dev/null +++ b/kernel/fs/coda/Kconfig @@ -0,0 +1,21 @@ +config CODA_FS + tristate "Coda file system support (advanced network fs)" + depends on INET + help + Coda is an advanced network file system, similar to NFS in that it + enables you to mount file systems of a remote server and access them + with regular Unix commands as if they were sitting on your hard + disk. Coda has several advantages over NFS: support for + disconnected operation (e.g. for laptops), read/write server + replication, security model for authentication and encryption, + persistent client caches and write back caching. + + If you say Y here, your Linux box will be able to act as a Coda + *client*. You will need user level code as well, both for the + client and server. Servers are currently user level, i.e. they need + no kernel support. Please read + and check out the Coda + home page . + + To compile the coda client support as a module, choose M here: the + module will be called coda. diff --git a/kernel/fs/coda/Makefile b/kernel/fs/coda/Makefile new file mode 100644 index 000000000..1bab69a0d --- /dev/null +++ b/kernel/fs/coda/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the Linux Coda filesystem routines. +# + +obj-$(CONFIG_CODA_FS) += coda.o + +coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \ + coda_linux.o symlink.o pioctl.o sysctl.o + +# If you want debugging output, please uncomment the following line. + +# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1 diff --git a/kernel/fs/coda/cache.c b/kernel/fs/coda/cache.c new file mode 100644 index 000000000..5bb630a76 --- /dev/null +++ b/kernel/fs/coda/cache.c @@ -0,0 +1,118 @@ +/* + * Cache operations for Coda. + * For Linux 2.1: (C) 1997 Carnegie Mellon University + * For Linux 2.3: (C) 2000 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project http://www.coda.cs.cmu.edu/ . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +static atomic_t permission_epoch = ATOMIC_INIT(0); + +/* replace or extend an acl cache hit */ +void coda_cache_enter(struct inode *inode, int mask) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_cached_epoch = atomic_read(&permission_epoch); + if (!uid_eq(cii->c_uid, current_fsuid())) { + cii->c_uid = current_fsuid(); + cii->c_cached_perm = mask; + } else + cii->c_cached_perm |= mask; + spin_unlock(&cii->c_lock); +} + +/* remove cached acl from an inode */ +void coda_cache_clear_inode(struct inode *inode) +{ + struct coda_inode_info *cii = ITOC(inode); + spin_lock(&cii->c_lock); + cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; + spin_unlock(&cii->c_lock); +} + +/* remove all acl caches */ +void coda_cache_clear_all(struct super_block *sb) +{ + atomic_inc(&permission_epoch); +} + + +/* check if the mask has been matched against the acl already */ +int coda_cache_check(struct inode *inode, int mask) +{ + struct coda_inode_info *cii = ITOC(inode); + int hit; + + spin_lock(&cii->c_lock); + hit = (mask & cii->c_cached_perm) == mask && + uid_eq(cii->c_uid, current_fsuid()) && + cii->c_cached_epoch == atomic_read(&permission_epoch); + spin_unlock(&cii->c_lock); + + return hit; +} + + +/* Purging dentries and children */ +/* The following routines drop dentries which are not + in use and flag dentries which are in use to be + zapped later. + + The flags are detected by: + - coda_dentry_revalidate (for lookups) if the flag is C_PURGE + - coda_dentry_delete: to remove dentry from the cache when d_count + falls to zero + - an inode method coda_revalidate (for attributes) if the + flag is C_VATTR +*/ + +/* this won't do any harm: just flag all children */ +static void coda_flag_children(struct dentry *parent, int flag) +{ + struct dentry *de; + + spin_lock(&parent->d_lock); + list_for_each_entry(de, &parent->d_subdirs, d_child) { + /* don't know what to do with negative dentries */ + if (d_inode(de) ) + coda_flag_inode(d_inode(de), flag); + } + spin_unlock(&parent->d_lock); + return; +} + +void coda_flag_inode_children(struct inode *inode, int flag) +{ + struct dentry *alias_de; + + if ( !inode || !S_ISDIR(inode->i_mode)) + return; + + alias_de = d_find_alias(inode); + if (!alias_de) + return; + coda_flag_children(alias_de, flag); + shrink_dcache_parent(alias_de); + dput(alias_de); +} + diff --git a/kernel/fs/coda/cnode.c b/kernel/fs/coda/cnode.c new file mode 100644 index 000000000..7740b1c87 --- /dev/null +++ b/kernel/fs/coda/cnode.c @@ -0,0 +1,168 @@ +/* cnode related routines for the coda kernel code + (C) 1996 Peter Braam + */ + +#include +#include +#include + +#include +#include +#include "coda_linux.h" + +static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) +{ + return memcmp(fid1, fid2, sizeof(*fid1)) == 0; +} + +static const struct inode_operations coda_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = coda_setattr, +}; + +/* cnode.c */ +static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr) +{ + coda_vattr_to_iattr(inode, attr); + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &coda_file_inode_operations; + inode->i_fop = &coda_file_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &coda_dir_inode_operations; + inode->i_fop = &coda_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &coda_symlink_inode_operations; + inode->i_data.a_ops = &coda_symlink_aops; + inode->i_mapping = &inode->i_data; + } else + init_special_inode(inode, inode->i_mode, huge_decode_dev(attr->va_rdev)); +} + +static int coda_test_inode(struct inode *inode, void *data) +{ + struct CodaFid *fid = (struct CodaFid *)data; + struct coda_inode_info *cii = ITOC(inode); + return coda_fideq(&cii->c_fid, fid); +} + +static int coda_set_inode(struct inode *inode, void *data) +{ + struct CodaFid *fid = (struct CodaFid *)data; + struct coda_inode_info *cii = ITOC(inode); + cii->c_fid = *fid; + return 0; +} + +struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, + struct coda_vattr * attr) +{ + struct inode *inode; + struct coda_inode_info *cii; + unsigned long hash = coda_f2i(fid); + + inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + cii = ITOC(inode); + /* we still need to set i_ino for things like stat(2) */ + inode->i_ino = hash; + /* inode is locked and unique, no need to grab cii->c_lock */ + cii->c_mapcount = 0; + unlock_new_inode(inode); + } + + /* always replace the attributes, type might have changed */ + coda_fill_inode(inode, attr); + return inode; +} + +/* this is effectively coda_iget: + - get attributes (might be cached) + - get the inode for the fid using vfs iget + - link the two up if this is needed + - fill in the attributes +*/ +struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb) +{ + struct coda_vattr attr; + struct inode *inode; + int error; + + /* We get inode numbers from Venus -- see venus source */ + error = venus_getattr(sb, fid, &attr); + if (error) + return ERR_PTR(error); + + inode = coda_iget(sb, fid, &attr); + if (IS_ERR(inode)) + pr_warn("%s: coda_iget failed\n", __func__); + return inode; +} + + +/* Although we treat Coda file identifiers as immutable, there is one + * special case for files created during a disconnection where they may + * not be globally unique. When an identifier collision is detected we + * first try to flush the cached inode from the kernel and finally + * resort to renaming/rehashing in-place. Userspace remembers both old + * and new values of the identifier to handle any in-flight upcalls. + * The real solution is to use globally unique UUIDs as identifiers, but + * retrofitting the existing userspace code for this is non-trivial. */ +void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, + struct CodaFid *newfid) +{ + struct coda_inode_info *cii = ITOC(inode); + unsigned long hash = coda_f2i(newfid); + + BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); + + /* replace fid and rehash inode */ + /* XXX we probably need to hold some lock here! */ + remove_inode_hash(inode); + cii->c_fid = *newfid; + inode->i_ino = hash; + __insert_inode_hash(inode, hash); +} + +/* convert a fid to an inode. */ +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) +{ + struct inode *inode; + unsigned long hash = coda_f2i(fid); + + if ( !sb ) { + pr_warn("%s: no sb!\n", __func__); + return NULL; + } + + inode = ilookup5(sb, hash, coda_test_inode, fid); + if ( !inode ) + return NULL; + + /* we should never see newly created inodes because we intentionally + * fail in the initialization callback */ + BUG_ON(inode->i_state & I_NEW); + + return inode; +} + +/* the CONTROL inode is made without asking attributes from Venus */ +struct inode *coda_cnode_makectl(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + if (inode) { + inode->i_ino = CTL_INO; + inode->i_op = &coda_ioctl_inode_operations; + inode->i_fop = &coda_ioctl_operations; + inode->i_mode = 0444; + return inode; + } + return ERR_PTR(-ENOMEM); +} + diff --git a/kernel/fs/coda/coda_cache.h b/kernel/fs/coda/coda_cache.h new file mode 100644 index 000000000..c910b5eb1 --- /dev/null +++ b/kernel/fs/coda/coda_cache.h @@ -0,0 +1,22 @@ +/* Coda filesystem -- Linux Minicache + * + * Copyright (C) 1989 - 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. Contact Peter Braam + * + */ + +#ifndef _CFSNC_HEADER_ +#define _CFSNC_HEADER_ + +/* credential cache */ +void coda_cache_enter(struct inode *inode, int mask); +void coda_cache_clear_inode(struct inode *); +void coda_cache_clear_all(struct super_block *sb); +int coda_cache_check(struct inode *inode, int mask); + +/* for downcalls and attributes and lookups */ +void coda_flag_inode_children(struct inode *inode, int flag); + +#endif /* _CFSNC_HEADER_ */ diff --git a/kernel/fs/coda/coda_fs_i.h b/kernel/fs/coda/coda_fs_i.h new file mode 100644 index 000000000..c64075213 --- /dev/null +++ b/kernel/fs/coda/coda_fs_i.h @@ -0,0 +1,58 @@ +/* + * coda_fs_i.h + * + * Copyright (C) 1998 Carnegie Mellon University + * + */ + +#ifndef _LINUX_CODA_FS_I +#define _LINUX_CODA_FS_I + +#include +#include +#include +#include + +/* + * coda fs inode data + * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and + * c_cached_perm. + * vfs_inode is set only when the inode is created and never changes. + * c_fid is set when the inode is created and should be considered immutable. + */ +struct coda_inode_info { + struct CodaFid c_fid; /* Coda identifier */ + u_short c_flags; /* flags (see below) */ + unsigned int c_mapcount; /* nr of times this inode is mapped */ + unsigned int c_cached_epoch; /* epoch for cached permissions */ + kuid_t c_uid; /* fsuid for cached permissions */ + unsigned int c_cached_perm; /* cached access permissions */ + spinlock_t c_lock; + struct inode vfs_inode; +}; + +/* + * coda fs file private data + */ +#define CODA_MAGIC 0xC0DAC0DA +struct coda_file_info { + int cfi_magic; /* magic number */ + struct file *cfi_container; /* container file for this cnode */ + unsigned int cfi_mapcount; /* nr of times this file is mapped */ +}; + +#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data)) + +/* flags */ +#define C_VATTR 0x1 /* Validity of vattr in inode */ +#define C_FLUSH 0x2 /* used after a flush */ +#define C_DYING 0x4 /* from venus (which died) */ +#define C_PURGE 0x8 + +struct inode *coda_cnode_make(struct CodaFid *, struct super_block *); +struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); +struct inode *coda_cnode_makectl(struct super_block *sb); +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); +void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); + +#endif diff --git a/kernel/fs/coda/coda_int.h b/kernel/fs/coda/coda_int.h new file mode 100644 index 000000000..381c993b1 --- /dev/null +++ b/kernel/fs/coda/coda_int.h @@ -0,0 +1,20 @@ +#ifndef _CODA_INT_ +#define _CODA_INT_ + +struct dentry; +struct file; + +extern struct file_system_type coda_fs_type; +extern unsigned long coda_timeout; +extern int coda_hard; +extern int coda_fake_statfs; + +void coda_destroy_inodecache(void); +int __init coda_init_inodecache(void); +int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync); +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#endif /* _CODA_INT_ */ + + diff --git a/kernel/fs/coda/coda_linux.c b/kernel/fs/coda/coda_linux.c new file mode 100644 index 000000000..f1714cfb5 --- /dev/null +++ b/kernel/fs/coda/coda_linux.c @@ -0,0 +1,186 @@ +/* + * Inode operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" + +/* initialize the debugging variables */ +int coda_fake_statfs; + +/* print a fid */ +char * coda_f2s(struct CodaFid *f) +{ + static char s[60]; + + sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]); + + return s; +} + +/* recognize special .CONTROL name */ +int coda_iscontrol(const char *name, size_t length) +{ + return ((CODA_CONTROLLEN == length) && + (strncmp(name, CODA_CONTROL, CODA_CONTROLLEN) == 0)); +} + +unsigned short coda_flags_to_cflags(unsigned short flags) +{ + unsigned short coda_flags = 0; + + if ((flags & O_ACCMODE) == O_RDONLY) + coda_flags |= C_O_READ; + + if ((flags & O_ACCMODE) == O_RDWR) + coda_flags |= C_O_READ | C_O_WRITE; + + if ((flags & O_ACCMODE) == O_WRONLY) + coda_flags |= C_O_WRITE; + + if (flags & O_TRUNC) + coda_flags |= C_O_TRUNC; + + if (flags & O_CREAT) + coda_flags |= C_O_CREAT; + + if (flags & O_EXCL) + coda_flags |= C_O_EXCL; + + return coda_flags; +} + + +/* utility functions below */ +void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) +{ + int inode_type; + /* inode's i_flags, i_ino are set by iget + XXX: is this all we need ?? + */ + switch (attr->va_type) { + case C_VNON: + inode_type = 0; + break; + case C_VREG: + inode_type = S_IFREG; + break; + case C_VDIR: + inode_type = S_IFDIR; + break; + case C_VLNK: + inode_type = S_IFLNK; + break; + default: + inode_type = 0; + } + inode->i_mode |= inode_type; + + if (attr->va_mode != (u_short) -1) + inode->i_mode = attr->va_mode | inode_type; + if (attr->va_uid != -1) + inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid); + if (attr->va_gid != -1) + inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid); + if (attr->va_nlink != -1) + set_nlink(inode, attr->va_nlink); + if (attr->va_size != -1) + inode->i_size = attr->va_size; + if (attr->va_size != -1) + inode->i_blocks = (attr->va_size + 511) >> 9; + if (attr->va_atime.tv_sec != -1) + inode->i_atime = attr->va_atime; + if (attr->va_mtime.tv_sec != -1) + inode->i_mtime = attr->va_mtime; + if (attr->va_ctime.tv_sec != -1) + inode->i_ctime = attr->va_ctime; +} + + +/* + * BSD sets attributes that need not be modified to -1. + * Linux uses the valid field to indicate what should be + * looked at. The BSD type field needs to be deduced from linux + * mode. + * So we have to do some translations here. + */ + +void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) +{ + unsigned int valid; + + /* clean out */ + vattr->va_mode = -1; + vattr->va_uid = (vuid_t) -1; + vattr->va_gid = (vgid_t) -1; + vattr->va_size = (off_t) -1; + vattr->va_atime.tv_sec = (time_t) -1; + vattr->va_atime.tv_nsec = (time_t) -1; + vattr->va_mtime.tv_sec = (time_t) -1; + vattr->va_mtime.tv_nsec = (time_t) -1; + vattr->va_ctime.tv_sec = (time_t) -1; + vattr->va_ctime.tv_nsec = (time_t) -1; + vattr->va_type = C_VNON; + vattr->va_fileid = -1; + vattr->va_gen = -1; + vattr->va_bytes = -1; + vattr->va_nlink = -1; + vattr->va_blocksize = -1; + vattr->va_rdev = -1; + vattr->va_flags = 0; + + /* determine the type */ +#if 0 + mode = iattr->ia_mode; + if ( S_ISDIR(mode) ) { + vattr->va_type = C_VDIR; + } else if ( S_ISREG(mode) ) { + vattr->va_type = C_VREG; + } else if ( S_ISLNK(mode) ) { + vattr->va_type = C_VLNK; + } else { + /* don't do others */ + vattr->va_type = C_VNON; + } +#endif + + /* set those vattrs that need change */ + valid = iattr->ia_valid; + if ( valid & ATTR_MODE ) { + vattr->va_mode = iattr->ia_mode; + } + if ( valid & ATTR_UID ) { + vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid); + } + if ( valid & ATTR_GID ) { + vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid); + } + if ( valid & ATTR_SIZE ) { + vattr->va_size = iattr->ia_size; + } + if ( valid & ATTR_ATIME ) { + vattr->va_atime = iattr->ia_atime; + } + if ( valid & ATTR_MTIME ) { + vattr->va_mtime = iattr->ia_mtime; + } + if ( valid & ATTR_CTIME ) { + vattr->va_ctime = iattr->ia_ctime; + } +} + diff --git a/kernel/fs/coda/coda_linux.h b/kernel/fs/coda/coda_linux.h new file mode 100644 index 000000000..d6f7a76a1 --- /dev/null +++ b/kernel/fs/coda/coda_linux.h @@ -0,0 +1,105 @@ +/* + * Coda File System, Linux Kernel module + * + * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University + * Linux modifications (C) 1996, Peter J. Braam + * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. + */ + +#ifndef _LINUX_CODA_FS +#define _LINUX_CODA_FS + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "coda_fs_i.h" + +/* operations */ +extern const struct inode_operations coda_dir_inode_operations; +extern const struct inode_operations coda_file_inode_operations; +extern const struct inode_operations coda_ioctl_inode_operations; + +extern const struct dentry_operations coda_dentry_operations; + +extern const struct address_space_operations coda_file_aops; +extern const struct address_space_operations coda_symlink_aops; + +extern const struct file_operations coda_dir_operations; +extern const struct file_operations coda_file_operations; +extern const struct file_operations coda_ioctl_operations; + +/* operations shared over more than one file */ +int coda_open(struct inode *i, struct file *f); +int coda_release(struct inode *i, struct file *f); +int coda_permission(struct inode *inode, int mask); +int coda_revalidate_inode(struct inode *); +int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int coda_setattr(struct dentry *, struct iattr *); + +/* this file: heloers */ +char *coda_f2s(struct CodaFid *f); +int coda_iscontrol(const char *name, size_t length); + +void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); +void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); +unsigned short coda_flags_to_cflags(unsigned short); + +/* sysctl.h */ +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#define CODA_ALLOC(ptr, cast, size) do { \ + if (size < PAGE_SIZE) \ + ptr = kzalloc((unsigned long) size, GFP_KERNEL); \ + else \ + ptr = (cast)vzalloc((unsigned long) size); \ + if (!ptr) \ + pr_warn("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ +} while (0) + + +#define CODA_FREE(ptr,size) \ + do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) + +/* inode to cnode access functions */ + +static inline struct coda_inode_info *ITOC(struct inode *inode) +{ + return list_entry(inode, struct coda_inode_info, vfs_inode); +} + +static __inline__ struct CodaFid *coda_i2f(struct inode *inode) +{ + return &(ITOC(inode)->c_fid); +} + +static __inline__ char *coda_i2s(struct inode *inode) +{ + return coda_f2s(&(ITOC(inode)->c_fid)); +} + +/* this will not zap the inode away */ +static __inline__ void coda_flag_inode(struct inode *inode, int flag) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_flags |= flag; + spin_unlock(&cii->c_lock); +} + +#endif diff --git a/kernel/fs/coda/dir.c b/kernel/fs/coda/dir.c new file mode 100644 index 000000000..fda9f4311 --- /dev/null +++ b/kernel/fs/coda/dir.c @@ -0,0 +1,579 @@ + +/* + * Directory operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +/* same as fs/bad_inode.c */ +static int coda_return_EIO(void) +{ + return -EIO; +} +#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) + +/* inode operations for directories */ +/* access routines: lookup, readlink, permission */ +static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) +{ + struct super_block *sb = dir->i_sb; + const char *name = entry->d_name.name; + size_t length = entry->d_name.len; + struct inode *inode; + int type = 0; + + if (length > CODA_MAXNAMLEN) { + pr_err("name too long: lookup, %s (%*s)\n", + coda_i2s(dir), (int)length, name); + return ERR_PTR(-ENAMETOOLONG); + } + + /* control object, create inode on the fly */ + if (is_root_inode(dir) && coda_iscontrol(name, length)) { + inode = coda_cnode_makectl(sb); + type = CODA_NOCACHE; + } else { + struct CodaFid fid = { { 0, } }; + int error = venus_lookup(sb, coda_i2f(dir), name, length, + &type, &fid); + inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error); + } + + if (!IS_ERR(inode) && (type & CODA_NOCACHE)) + coda_flag_inode(inode, C_VATTR | C_PURGE); + + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; + + return d_splice_alias(inode, entry); +} + + +int coda_permission(struct inode *inode, int mask) +{ + int error; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + + if (!mask) + return 0; + + if ((mask & MAY_EXEC) && !execute_ok(inode)) + return -EACCES; + + if (coda_cache_check(inode, mask)) + return 0; + + error = venus_access(inode->i_sb, coda_i2f(inode), mask); + + if (!error) + coda_cache_enter(inode, mask); + + return error; +} + + +static inline void coda_dir_update_mtime(struct inode *dir) +{ +#ifdef REQUERY_VENUS_FOR_MTIME + /* invalidate the directory cnode's attributes so we refetch the + * attributes from venus next time the inode is referenced */ + coda_flag_inode(dir, C_VATTR); +#else + /* optimistically we can also act as if our nose bleeds. The + * granularity of the mtime is coarse anyways so we might actually be + * right most of the time. Note: we only do this for directories. */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; +#endif +} + +/* we have to wrap inc_nlink/drop_nlink because sometimes userspace uses a + * trick to fool GNU find's optimizations. If we can't be sure of the link + * (because of volume mount points) we set i_nlink to 1 which forces find + * to consider every child as a possible directory. We should also never + * see an increment or decrement for deleted directories where i_nlink == 0 */ +static inline void coda_dir_inc_nlink(struct inode *dir) +{ + if (dir->i_nlink >= 2) + inc_nlink(dir); +} + +static inline void coda_dir_drop_nlink(struct inode *dir) +{ + if (dir->i_nlink > 2) + drop_nlink(dir); +} + +/* creation routines: create, mknod, mkdir, link, symlink */ +static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool excl) +{ + int error; + const char *name=de->d_name.name; + int length=de->d_name.len; + struct inode *inode; + struct CodaFid newfid; + struct coda_vattr attrs; + + if (is_root_inode(dir) && coda_iscontrol(name, length)) + return -EPERM; + + error = venus_create(dir->i_sb, coda_i2f(dir), name, length, + 0, mode, &newfid, &attrs); + if (error) + goto err_out; + + inode = coda_iget(dir->i_sb, &newfid, &attrs); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; + } + + /* invalidate the directory cnode's attributes */ + coda_dir_update_mtime(dir); + d_instantiate(de, inode); + return 0; +err_out: + d_drop(de); + return error; +} + +static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode) +{ + struct inode *inode; + struct coda_vattr attrs; + const char *name = de->d_name.name; + int len = de->d_name.len; + int error; + struct CodaFid newfid; + + if (is_root_inode(dir) && coda_iscontrol(name, len)) + return -EPERM; + + attrs.va_mode = mode; + error = venus_mkdir(dir->i_sb, coda_i2f(dir), + name, len, &newfid, &attrs); + if (error) + goto err_out; + + inode = coda_iget(dir->i_sb, &newfid, &attrs); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; + } + + /* invalidate the directory cnode's attributes */ + coda_dir_inc_nlink(dir); + coda_dir_update_mtime(dir); + d_instantiate(de, inode); + return 0; +err_out: + d_drop(de); + return error; +} + +/* try to make de an entry in dir_inodde linked to source_de */ +static int coda_link(struct dentry *source_de, struct inode *dir_inode, + struct dentry *de) +{ + struct inode *inode = d_inode(source_de); + const char * name = de->d_name.name; + int len = de->d_name.len; + int error; + + if (is_root_inode(dir_inode) && coda_iscontrol(name, len)) + return -EPERM; + + error = venus_link(dir_inode->i_sb, coda_i2f(inode), + coda_i2f(dir_inode), (const char *)name, len); + if (error) { + d_drop(de); + return error; + } + + coda_dir_update_mtime(dir_inode); + ihold(inode); + d_instantiate(de, inode); + inc_nlink(inode); + return 0; +} + + +static int coda_symlink(struct inode *dir_inode, struct dentry *de, + const char *symname) +{ + const char *name = de->d_name.name; + int len = de->d_name.len; + int symlen; + int error; + + if (is_root_inode(dir_inode) && coda_iscontrol(name, len)) + return -EPERM; + + symlen = strlen(symname); + if (symlen > CODA_MAXPATHLEN) + return -ENAMETOOLONG; + + /* + * This entry is now negative. Since we do not create + * an inode for the entry we have to drop it. + */ + d_drop(de); + error = venus_symlink(dir_inode->i_sb, coda_i2f(dir_inode), name, len, + symname, symlen); + + /* mtime is no good anymore */ + if (!error) + coda_dir_update_mtime(dir_inode); + + return error; +} + +/* destruction routines: unlink, rmdir */ +static int coda_unlink(struct inode *dir, struct dentry *de) +{ + int error; + const char *name = de->d_name.name; + int len = de->d_name.len; + + error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); + if (error) + return error; + + coda_dir_update_mtime(dir); + drop_nlink(d_inode(de)); + return 0; +} + +static int coda_rmdir(struct inode *dir, struct dentry *de) +{ + const char *name = de->d_name.name; + int len = de->d_name.len; + int error; + + error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); + if (!error) { + /* VFS may delete the child */ + if (d_really_is_positive(de)) + clear_nlink(d_inode(de)); + + /* fix the link count of the parent */ + coda_dir_drop_nlink(dir); + coda_dir_update_mtime(dir); + } + return error; +} + +/* rename */ +static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + const char *old_name = old_dentry->d_name.name; + const char *new_name = new_dentry->d_name.name; + int old_length = old_dentry->d_name.len; + int new_length = new_dentry->d_name.len; + int error; + + error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), + coda_i2f(new_dir), old_length, new_length, + (const char *) old_name, (const char *)new_name); + if (!error) { + if (d_really_is_positive(new_dentry)) { + if (d_is_dir(new_dentry)) { + coda_dir_drop_nlink(old_dir); + coda_dir_inc_nlink(new_dir); + } + coda_dir_update_mtime(old_dir); + coda_dir_update_mtime(new_dir); + coda_flag_inode(d_inode(new_dentry), C_VATTR); + } else { + coda_flag_inode(old_dir, C_VATTR); + coda_flag_inode(new_dir, C_VATTR); + } + } + return error; +} + +static inline unsigned int CDT2DT(unsigned char cdt) +{ + unsigned int dt; + + switch(cdt) { + case CDT_UNKNOWN: dt = DT_UNKNOWN; break; + case CDT_FIFO: dt = DT_FIFO; break; + case CDT_CHR: dt = DT_CHR; break; + case CDT_DIR: dt = DT_DIR; break; + case CDT_BLK: dt = DT_BLK; break; + case CDT_REG: dt = DT_REG; break; + case CDT_LNK: dt = DT_LNK; break; + case CDT_SOCK: dt = DT_SOCK; break; + case CDT_WHT: dt = DT_WHT; break; + default: dt = DT_UNKNOWN; break; + } + return dt; +} + +/* support routines */ +static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) +{ + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct file *host_file; + struct venus_dirent *vdir; + unsigned long vdir_size = offsetof(struct venus_dirent, d_name); + unsigned int type; + struct qstr name; + ino_t ino; + int ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + cii = ITOC(file_inode(coda_file)); + + vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); + if (!vdir) return -ENOMEM; + + if (!dir_emit_dots(coda_file, ctx)) + goto out; + + while (1) { + /* read entries from the directory file */ + ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir, + sizeof(*vdir)); + if (ret < 0) { + pr_err("%s: read dir %s failed %d\n", + __func__, coda_f2s(&cii->c_fid), ret); + break; + } + if (ret == 0) break; /* end of directory file reached */ + + /* catch truncated reads */ + if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) { + pr_err("%s: short read on %s\n", + __func__, coda_f2s(&cii->c_fid)); + ret = -EBADF; + break; + } + /* validate whether the directory file actually makes sense */ + if (vdir->d_reclen < vdir_size + vdir->d_namlen) { + pr_err("%s: invalid dir %s\n", + __func__, coda_f2s(&cii->c_fid)); + ret = -EBADF; + break; + } + + name.len = vdir->d_namlen; + name.name = vdir->d_name; + + /* Make sure we skip '.' and '..', we already got those */ + if (name.name[0] == '.' && (name.len == 1 || + (name.name[1] == '.' && name.len == 2))) + vdir->d_fileno = name.len = 0; + + /* skip null entries */ + if (vdir->d_fileno && name.len) { + ino = vdir->d_fileno; + type = CDT2DT(vdir->d_type); + if (!dir_emit(ctx, name.name, name.len, ino, type)) + break; + } + /* we'll always have progress because d_reclen is unsigned and + * we've already established it is non-zero. */ + ctx->pos += vdir->d_reclen; + } +out: + kfree(vdir); + return 0; +} + +/* file operations for directories */ +static int coda_readdir(struct file *coda_file, struct dir_context *ctx) +{ + struct coda_file_info *cfi; + struct file *host_file; + int ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (host_file->f_op->iterate) { + struct inode *host_inode = file_inode(host_file); + + mutex_lock(&host_inode->i_mutex); + ret = -ENOENT; + if (!IS_DEADDIR(host_inode)) { + ret = host_file->f_op->iterate(host_file, ctx); + file_accessed(host_file); + } + mutex_unlock(&host_inode->i_mutex); + return ret; + } + /* Venus: we must read Venus dirents from a file */ + return coda_venus_readdir(coda_file, ctx); +} + +/* called when a cache lookup succeeds */ +static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) +{ + struct inode *inode; + struct coda_inode_info *cii; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = d_inode(de); + if (!inode || is_root_inode(inode)) + goto out; + if (is_bad_inode(inode)) + goto bad; + + cii = ITOC(d_inode(de)); + if (!(cii->c_flags & (C_PURGE | C_FLUSH))) + goto out; + + shrink_dcache_parent(de); + + /* propagate for a flush */ + if (cii->c_flags & C_FLUSH) + coda_flag_inode_children(inode, C_FLUSH); + + if (d_count(de) > 1) + /* pretend it's valid, but don't change the flags */ + goto out; + + /* clear the flags. */ + spin_lock(&cii->c_lock); + cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); + spin_unlock(&cii->c_lock); +bad: + return 0; +out: + return 1; +} + +/* + * This is the callback from dput() when d_count is going to 0. + * We use this to unhash dentries with bad inodes. + */ +static int coda_dentry_delete(const struct dentry * dentry) +{ + int flags; + + if (d_really_is_negative(dentry)) + return 0; + + flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE; + if (is_bad_inode(d_inode(dentry)) || flags) { + return 1; + } + return 0; +} + + + +/* + * This is called when we want to check if the inode has + * changed on the server. Coda makes this easy since the + * cache manager Venus issues a downcall to the kernel when this + * happens + */ +int coda_revalidate_inode(struct inode *inode) +{ + struct coda_vattr attr; + int error; + int old_mode; + ino_t old_ino; + struct coda_inode_info *cii = ITOC(inode); + + if (!cii->c_flags) + return 0; + + if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { + error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); + if (error) + return -EIO; + + /* this inode may be lost if: + - it's ino changed + - type changes must be permitted for repair and + missing mount points. + */ + old_mode = inode->i_mode; + old_ino = inode->i_ino; + coda_vattr_to_iattr(inode, &attr); + + if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) { + pr_warn("inode %ld, fid %s changed type!\n", + inode->i_ino, coda_f2s(&(cii->c_fid))); + } + + /* the following can happen when a local fid is replaced + with a global one, here we lose and declare the inode bad */ + if (inode->i_ino != old_ino) + return -EIO; + + coda_flag_inode_children(inode, C_FLUSH); + + spin_lock(&cii->c_lock); + cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); + spin_unlock(&cii->c_lock); + } + return 0; +} + +const struct dentry_operations coda_dentry_operations = { + .d_revalidate = coda_dentry_revalidate, + .d_delete = coda_dentry_delete, +}; + +const struct inode_operations coda_dir_inode_operations = { + .create = coda_create, + .lookup = coda_lookup, + .link = coda_link, + .unlink = coda_unlink, + .symlink = coda_symlink, + .mkdir = coda_mkdir, + .rmdir = coda_rmdir, + .mknod = CODA_EIO_ERROR, + .rename = coda_rename, + .permission = coda_permission, + .getattr = coda_getattr, + .setattr = coda_setattr, +}; + +const struct file_operations coda_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = coda_readdir, + .open = coda_open, + .release = coda_release, + .fsync = coda_fsync, +}; diff --git a/kernel/fs/coda/file.c b/kernel/fs/coda/file.c new file mode 100644 index 000000000..1da3805f3 --- /dev/null +++ b/kernel/fs/coda/file.c @@ -0,0 +1,230 @@ +/* + * File operations for Coda. + * Original version: (C) 1996 Peter Braam + * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project. Contact Peter Braam . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" +#include "coda_int.h" + +static ssize_t +coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *coda_file = iocb->ki_filp; + struct coda_file_info *cfi = CODA_FTOC(coda_file); + + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + + return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos); +} + +static ssize_t +coda_file_splice_read(struct file *coda_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); + struct coda_file_info *cfi; + struct file *host_file; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + splice_read = host_file->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; + + return splice_read(host_file, ppos, pipe, count, flags); +} + +static ssize_t +coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *coda_file = iocb->ki_filp; + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi = CODA_FTOC(coda_file); + struct file *host_file; + ssize_t ret; + + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + + host_file = cfi->cfi_container; + file_start_write(host_file); + mutex_lock(&coda_inode->i_mutex); + ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); + coda_inode->i_size = file_inode(host_file)->i_size; + coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; + coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; + mutex_unlock(&coda_inode->i_mutex); + file_end_write(host_file); + return ret; +} + +static int +coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) +{ + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct file *host_file; + struct inode *coda_inode, *host_inode; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op->mmap) + return -ENODEV; + + coda_inode = file_inode(coda_file); + host_inode = file_inode(host_file); + + cii = ITOC(coda_inode); + spin_lock(&cii->c_lock); + coda_file->f_mapping = host_file->f_mapping; + if (coda_inode->i_mapping == &coda_inode->i_data) + coda_inode->i_mapping = host_inode->i_mapping; + + /* only allow additional mmaps as long as userspace isn't changing + * the container file on us! */ + else if (coda_inode->i_mapping != host_inode->i_mapping) { + spin_unlock(&cii->c_lock); + return -EBUSY; + } + + /* keep track of how often the coda_inode/host_file has been mmapped */ + cii->c_mapcount++; + cfi->cfi_mapcount++; + spin_unlock(&cii->c_lock); + + return host_file->f_op->mmap(host_file, vma); +} + +int coda_open(struct inode *coda_inode, struct file *coda_file) +{ + struct file *host_file = NULL; + int error; + unsigned short flags = coda_file->f_flags & (~O_EXCL); + unsigned short coda_flags = coda_flags_to_cflags(flags); + struct coda_file_info *cfi; + + cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL); + if (!cfi) + return -ENOMEM; + + error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, + &host_file); + if (!host_file) + error = -EIO; + + if (error) { + kfree(cfi); + return error; + } + + host_file->f_flags |= coda_file->f_flags & (O_APPEND | O_SYNC); + + cfi->cfi_magic = CODA_MAGIC; + cfi->cfi_mapcount = 0; + cfi->cfi_container = host_file; + + BUG_ON(coda_file->private_data != NULL); + coda_file->private_data = cfi; + return 0; +} + +int coda_release(struct inode *coda_inode, struct file *coda_file) +{ + unsigned short flags = (coda_file->f_flags) & (~O_EXCL); + unsigned short coda_flags = coda_flags_to_cflags(flags); + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct inode *host_inode; + int err; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + + err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode), + coda_flags, coda_file->f_cred->fsuid); + + host_inode = file_inode(cfi->cfi_container); + cii = ITOC(coda_inode); + + /* did we mmap this file? */ + spin_lock(&cii->c_lock); + if (coda_inode->i_mapping == &host_inode->i_data) { + cii->c_mapcount -= cfi->cfi_mapcount; + if (!cii->c_mapcount) + coda_inode->i_mapping = &coda_inode->i_data; + } + spin_unlock(&cii->c_lock); + + fput(cfi->cfi_container); + kfree(coda_file->private_data); + coda_file->private_data = NULL; + + /* VFS fput ignores the return value from file_operations->release, so + * there is no use returning an error here */ + return 0; +} + +int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) +{ + struct file *host_file; + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi; + int err; + + if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || + S_ISLNK(coda_inode->i_mode))) + return -EINVAL; + + err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end); + if (err) + return err; + mutex_lock(&coda_inode->i_mutex); + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + err = vfs_fsync(host_file, datasync); + if (!err && !datasync) + err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); + mutex_unlock(&coda_inode->i_mutex); + + return err; +} + +const struct file_operations coda_file_operations = { + .llseek = generic_file_llseek, + .read_iter = coda_file_read_iter, + .write_iter = coda_file_write_iter, + .mmap = coda_file_mmap, + .open = coda_open, + .release = coda_release, + .fsync = coda_fsync, + .splice_read = coda_file_splice_read, +}; + diff --git a/kernel/fs/coda/inode.c b/kernel/fs/coda/inode.c new file mode 100644 index 000000000..cac1390b8 --- /dev/null +++ b/kernel/fs/coda/inode.c @@ -0,0 +1,333 @@ +/* + * Super block/filesystem wide operations + * + * Copyright (C) 1996 Peter J. Braam and + * Michael Callahan + * + * Rewritten for Linux 2.1. Peter Braam + * Copyright (C) Carnegie Mellon University + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +/* VFS super_block ops */ +static void coda_evict_inode(struct inode *); +static void coda_put_super(struct super_block *); +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf); + +static struct kmem_cache * coda_inode_cachep; + +static struct inode *coda_alloc_inode(struct super_block *sb) +{ + struct coda_inode_info *ei; + ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + memset(&ei->c_fid, 0, sizeof(struct CodaFid)); + ei->c_flags = 0; + ei->c_uid = GLOBAL_ROOT_UID; + ei->c_cached_perm = 0; + spin_lock_init(&ei->c_lock); + return &ei->vfs_inode; +} + +static void coda_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(coda_inode_cachep, ITOC(inode)); +} + +static void coda_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, coda_i_callback); +} + +static void init_once(void *foo) +{ + struct coda_inode_info *ei = (struct coda_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +int __init coda_init_inodecache(void) +{ + coda_inode_cachep = kmem_cache_create("coda_inode_cache", + sizeof(struct coda_inode_info), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (coda_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +void coda_destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(coda_inode_cachep); +} + +static int coda_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_NOATIME; + return 0; +} + +/* exported operations */ +static const struct super_operations coda_super_operations = +{ + .alloc_inode = coda_alloc_inode, + .destroy_inode = coda_destroy_inode, + .evict_inode = coda_evict_inode, + .put_super = coda_put_super, + .statfs = coda_statfs, + .remount_fs = coda_remount, +}; + +static int get_device_index(struct coda_mount_data *data) +{ + struct fd f; + struct inode *inode; + int idx; + + if (data == NULL) { + pr_warn("%s: Bad mount data\n", __func__); + return -1; + } + + if (data->version != CODA_MOUNT_VERSION) { + pr_warn("%s: Bad mount version\n", __func__); + return -1; + } + + f = fdget(data->fd); + if (!f.file) + goto Ebadf; + inode = file_inode(f.file); + if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { + fdput(f); + goto Ebadf; + } + + idx = iminor(inode); + fdput(f); + + if (idx < 0 || idx >= MAX_CODADEVS) { + pr_warn("%s: Bad minor number\n", __func__); + return -1; + } + + return idx; +Ebadf: + pr_warn("%s: Bad file\n", __func__); + return -1; +} + +static int coda_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root = NULL; + struct venus_comm *vc; + struct CodaFid fid; + int error; + int idx; + + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + + idx = get_device_index((struct coda_mount_data *) data); + + /* Ignore errors in data, for backward compatibility */ + if(idx == -1) + idx = 0; + + pr_info("%s: device index: %i\n", __func__, idx); + + vc = &coda_comms[idx]; + mutex_lock(&vc->vc_mutex); + + if (!vc->vc_inuse) { + pr_warn("%s: No pseudo device\n", __func__); + error = -EINVAL; + goto unlock_out; + } + + if (vc->vc_sb) { + pr_warn("%s: Device already mounted\n", __func__); + error = -EBUSY; + goto unlock_out; + } + + error = bdi_setup_and_register(&vc->bdi, "coda"); + if (error) + goto unlock_out; + + vc->vc_sb = sb; + mutex_unlock(&vc->vc_mutex); + + sb->s_fs_info = vc; + sb->s_flags |= MS_NOATIME; + sb->s_blocksize = 4096; /* XXXXX what do we put here?? */ + sb->s_blocksize_bits = 12; + sb->s_magic = CODA_SUPER_MAGIC; + sb->s_op = &coda_super_operations; + sb->s_d_op = &coda_dentry_operations; + sb->s_bdi = &vc->bdi; + + /* get root fid from Venus: this needs the root inode */ + error = venus_rootfid(sb, &fid); + if ( error ) { + pr_warn("%s: coda_get_rootfid failed with %d\n", + __func__, error); + goto error; + } + pr_info("%s: rootfid is %s\n", __func__, coda_f2s(&fid)); + + /* make root inode */ + root = coda_cnode_make(&fid, sb); + if (IS_ERR(root)) { + error = PTR_ERR(root); + pr_warn("Failure of coda_cnode_make for root: error %d\n", + error); + goto error; + } + + pr_info("%s: rootinode is %ld dev %s\n", + __func__, root->i_ino, root->i_sb->s_id); + sb->s_root = d_make_root(root); + if (!sb->s_root) { + error = -EINVAL; + goto error; + } + return 0; + +error: + mutex_lock(&vc->vc_mutex); + bdi_destroy(&vc->bdi); + vc->vc_sb = NULL; + sb->s_fs_info = NULL; +unlock_out: + mutex_unlock(&vc->vc_mutex); + return error; +} + +static void coda_put_super(struct super_block *sb) +{ + struct venus_comm *vcp = coda_vcp(sb); + mutex_lock(&vcp->vc_mutex); + bdi_destroy(&vcp->bdi); + vcp->vc_sb = NULL; + sb->s_fs_info = NULL; + mutex_unlock(&vcp->vc_mutex); + + pr_info("Bye bye.\n"); +} + +static void coda_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + coda_cache_clear_inode(inode); +} + +int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + int err = coda_revalidate_inode(d_inode(dentry)); + if (!err) + generic_fillattr(d_inode(dentry), stat); + return err; +} + +int coda_setattr(struct dentry *de, struct iattr *iattr) +{ + struct inode *inode = d_inode(de); + struct coda_vattr vattr; + int error; + + memset(&vattr, 0, sizeof(vattr)); + + inode->i_ctime = CURRENT_TIME_SEC; + coda_iattr_to_vattr(iattr, &vattr); + vattr.va_type = C_VNON; /* cannot set type */ + + /* Venus is responsible for truncating the container-file!!! */ + error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr); + + if (!error) { + coda_vattr_to_iattr(inode, &vattr); + coda_cache_clear_inode(inode); + } + return error; +} + +const struct inode_operations coda_file_inode_operations = { + .permission = coda_permission, + .getattr = coda_getattr, + .setattr = coda_setattr, +}; + +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int error; + + error = venus_statfs(dentry, buf); + + if (error) { + /* fake something like AFS does */ + buf->f_blocks = 9000000; + buf->f_bfree = 9000000; + buf->f_bavail = 9000000; + buf->f_files = 9000000; + buf->f_ffree = 9000000; + } + + /* and fill in the rest */ + buf->f_type = CODA_SUPER_MAGIC; + buf->f_bsize = 4096; + buf->f_namelen = CODA_MAXNAMLEN; + + return 0; +} + +/* init_coda: used by filesystems.c to register coda */ + +static struct dentry *coda_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, coda_fill_super); +} + +struct file_system_type coda_fs_type = { + .owner = THIS_MODULE, + .name = "coda", + .mount = coda_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_BINARY_MOUNTDATA, +}; +MODULE_ALIAS_FS("coda"); + diff --git a/kernel/fs/coda/pioctl.c b/kernel/fs/coda/pioctl.c new file mode 100644 index 000000000..f36a4040a --- /dev/null +++ b/kernel/fs/coda/pioctl.c @@ -0,0 +1,90 @@ +/* + * Pioctl operations for Coda. + * Original version: (C) 1996 Peter Braam + * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project. Contact Peter Braam . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +/* pioctl ops */ +static int coda_ioctl_permission(struct inode *inode, int mask); +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data); + +/* exported from this file */ +const struct inode_operations coda_ioctl_inode_operations = { + .permission = coda_ioctl_permission, + .setattr = coda_setattr, +}; + +const struct file_operations coda_ioctl_operations = { + .owner = THIS_MODULE, + .unlocked_ioctl = coda_pioctl, + .llseek = noop_llseek, +}; + +/* the coda pioctl inode ops */ +static int coda_ioctl_permission(struct inode *inode, int mask) +{ + return (mask & MAY_EXEC) ? -EACCES : 0; +} + +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data) +{ + struct path path; + int error; + struct PioctlData data; + struct inode *inode = file_inode(filp); + struct inode *target_inode = NULL; + struct coda_inode_info *cnp; + + /* get the Pioctl data arguments from user space */ + if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) + return -EINVAL; + + /* + * Look up the pathname. Note that the pathname is in + * user memory, and namei takes care of this + */ + if (data.follow) + error = user_path(data.path, &path); + else + error = user_lpath(data.path, &path); + + if (error) + return error; + + target_inode = d_inode(path.dentry); + + /* return if it is not a Coda inode */ + if (target_inode->i_sb != inode->i_sb) { + error = -EINVAL; + goto out; + } + + /* now proceed to make the upcall */ + cnp = ITOC(target_inode); + + error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); +out: + path_put(&path); + return error; +} diff --git a/kernel/fs/coda/psdev.c b/kernel/fs/coda/psdev.c new file mode 100644 index 000000000..822629126 --- /dev/null +++ b/kernel/fs/coda/psdev.c @@ -0,0 +1,437 @@ +/* + * An implementation of a loadable kernel mode driver providing + * multiple kernel/user space bidirectional communications links. + * + * Author: Alan Cox + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Adapted to become the Linux 2.0 Coda pseudo device + * Peter Braam + * Michael Callahan + * + * Changes for Linux 2.1 + * Copyright (c) 1997 Carnegie-Mellon University + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +#include "coda_int.h" + +/* statistics */ +int coda_hard; /* allows signals during upcalls */ +unsigned long coda_timeout = 30; /* .. secs, then signals will dequeue */ + + +struct venus_comm coda_comms[MAX_CODADEVS]; +static struct class *coda_psdev_class; + +/* + * Device operations + */ + +static unsigned int coda_psdev_poll(struct file *file, poll_table * wait) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + unsigned int mask = POLLOUT | POLLWRNORM; + + poll_wait(file, &vcp->vc_waitq, wait); + mutex_lock(&vcp->vc_mutex); + if (!list_empty(&vcp->vc_pending)) + mask |= POLLIN | POLLRDNORM; + mutex_unlock(&vcp->vc_mutex); + + return mask; +} + +static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg) +{ + unsigned int data; + + switch(cmd) { + case CIOC_KERNEL_VERSION: + data = CODA_KERNEL_VERSION; + return put_user(data, (int __user *) arg); + default: + return -ENOTTY; + } + + return 0; +} + +/* + * Receive a message written by Venus to the psdev + */ + +static ssize_t coda_psdev_write(struct file *file, const char __user *buf, + size_t nbytes, loff_t *off) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req = NULL; + struct upc_req *tmp; + struct list_head *lh; + struct coda_in_hdr hdr; + ssize_t retval = 0, count = 0; + int error; + + /* Peek at the opcode, uniquefier */ + if (copy_from_user(&hdr, buf, 2 * sizeof(u_long))) + return -EFAULT; + + if (DOWNCALL(hdr.opcode)) { + union outputArgs *dcbuf; + int size = sizeof(*dcbuf); + + if ( nbytes < sizeof(struct coda_out_hdr) ) { + pr_warn("coda_downcall opc %d uniq %d, not enough!\n", + hdr.opcode, hdr.unique); + count = nbytes; + goto out; + } + if ( nbytes > size ) { + pr_warn("downcall opc %d, uniq %d, too much!", + hdr.opcode, hdr.unique); + nbytes = size; + } + CODA_ALLOC(dcbuf, union outputArgs *, nbytes); + if (copy_from_user(dcbuf, buf, nbytes)) { + CODA_FREE(dcbuf, nbytes); + retval = -EFAULT; + goto out; + } + + /* what downcall errors does Venus handle ? */ + error = coda_downcall(vcp, hdr.opcode, dcbuf); + + CODA_FREE(dcbuf, nbytes); + if (error) { + pr_warn("%s: coda_downcall error: %d\n", + __func__, error); + retval = error; + goto out; + } + count = nbytes; + goto out; + } + + /* Look for the message on the processing queue. */ + mutex_lock(&vcp->vc_mutex); + list_for_each(lh, &vcp->vc_processing) { + tmp = list_entry(lh, struct upc_req , uc_chain); + if (tmp->uc_unique == hdr.unique) { + req = tmp; + list_del(&req->uc_chain); + break; + } + } + mutex_unlock(&vcp->vc_mutex); + + if (!req) { + pr_warn("%s: msg (%d, %d) not found\n", + __func__, hdr.opcode, hdr.unique); + retval = -ESRCH; + goto out; + } + + /* move data into response buffer. */ + if (req->uc_outSize < nbytes) { + pr_warn("%s: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n", + __func__, req->uc_outSize, (long)nbytes, + hdr.opcode, hdr.unique); + nbytes = req->uc_outSize; /* don't have more space! */ + } + if (copy_from_user(req->uc_data, buf, nbytes)) { + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + retval = -EFAULT; + goto out; + } + + /* adjust outsize. is this useful ?? */ + req->uc_outSize = nbytes; + req->uc_flags |= CODA_REQ_WRITE; + count = nbytes; + + /* Convert filedescriptor into a file handle */ + if (req->uc_opcode == CODA_OPEN_BY_FD) { + struct coda_open_by_fd_out *outp = + (struct coda_open_by_fd_out *)req->uc_data; + if (!outp->oh.result) + outp->fh = fget(outp->fd); + } + + wake_up(&req->uc_sleep); +out: + return(count ? count : retval); +} + +/* + * Read a message from the kernel to Venus + */ + +static ssize_t coda_psdev_read(struct file * file, char __user * buf, + size_t nbytes, loff_t *off) +{ + DECLARE_WAITQUEUE(wait, current); + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req; + ssize_t retval = 0, count = 0; + + if (nbytes == 0) + return 0; + + mutex_lock(&vcp->vc_mutex); + + add_wait_queue(&vcp->vc_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + while (list_empty(&vcp->vc_pending)) { + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + break; + } + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + mutex_unlock(&vcp->vc_mutex); + schedule(); + mutex_lock(&vcp->vc_mutex); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&vcp->vc_waitq, &wait); + + if (retval) + goto out; + + req = list_entry(vcp->vc_pending.next, struct upc_req,uc_chain); + list_del(&req->uc_chain); + + /* Move the input args into userspace */ + count = req->uc_inSize; + if (nbytes < req->uc_inSize) { + pr_warn("%s: Venus read %ld bytes of %d in message\n", + __func__, (long)nbytes, req->uc_inSize); + count = nbytes; + } + + if (copy_to_user(buf, req->uc_data, count)) + retval = -EFAULT; + + /* If request was not a signal, enqueue and don't free */ + if (!(req->uc_flags & CODA_REQ_ASYNC)) { + req->uc_flags |= CODA_REQ_READ; + list_add_tail(&(req->uc_chain), &vcp->vc_processing); + goto out; + } + + CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); + kfree(req); +out: + mutex_unlock(&vcp->vc_mutex); + return (count ? count : retval); +} + +static int coda_psdev_open(struct inode * inode, struct file * file) +{ + struct venus_comm *vcp; + int idx, err; + + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + + if (current_user_ns() != &init_user_ns) + return -EINVAL; + + idx = iminor(inode); + if (idx < 0 || idx >= MAX_CODADEVS) + return -ENODEV; + + err = -EBUSY; + vcp = &coda_comms[idx]; + mutex_lock(&vcp->vc_mutex); + + if (!vcp->vc_inuse) { + vcp->vc_inuse++; + + INIT_LIST_HEAD(&vcp->vc_pending); + INIT_LIST_HEAD(&vcp->vc_processing); + init_waitqueue_head(&vcp->vc_waitq); + vcp->vc_sb = NULL; + vcp->vc_seq = 0; + + file->private_data = vcp; + err = 0; + } + + mutex_unlock(&vcp->vc_mutex); + return err; +} + + +static int coda_psdev_release(struct inode * inode, struct file * file) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req, *tmp; + + if (!vcp || !vcp->vc_inuse ) { + pr_warn("%s: Not open.\n", __func__); + return -1; + } + + mutex_lock(&vcp->vc_mutex); + + /* Wakeup clients so they can return. */ + list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) { + list_del(&req->uc_chain); + + /* Async requests need to be freed here */ + if (req->uc_flags & CODA_REQ_ASYNC) { + CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); + kfree(req); + continue; + } + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + } + + list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) { + list_del(&req->uc_chain); + + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + } + + file->private_data = NULL; + vcp->vc_inuse--; + mutex_unlock(&vcp->vc_mutex); + return 0; +} + + +static const struct file_operations coda_psdev_fops = { + .owner = THIS_MODULE, + .read = coda_psdev_read, + .write = coda_psdev_write, + .poll = coda_psdev_poll, + .unlocked_ioctl = coda_psdev_ioctl, + .open = coda_psdev_open, + .release = coda_psdev_release, + .llseek = noop_llseek, +}; + +static int init_coda_psdev(void) +{ + int i, err = 0; + if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) { + pr_err("%s: unable to get major %d\n", + __func__, CODA_PSDEV_MAJOR); + return -EIO; + } + coda_psdev_class = class_create(THIS_MODULE, "coda"); + if (IS_ERR(coda_psdev_class)) { + err = PTR_ERR(coda_psdev_class); + goto out_chrdev; + } + for (i = 0; i < MAX_CODADEVS; i++) { + mutex_init(&(&coda_comms[i])->vc_mutex); + device_create(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); + } + coda_sysctl_init(); + goto out; + +out_chrdev: + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); +out: + return err; +} + +MODULE_AUTHOR("Jan Harkes, Peter J. Braam"); +MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); +MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); +MODULE_LICENSE("GPL"); +MODULE_VERSION("6.6"); + +static int __init init_coda(void) +{ + int status; + int i; + + status = coda_init_inodecache(); + if (status) + goto out2; + status = init_coda_psdev(); + if ( status ) { + pr_warn("Problem (%d) in init_coda_psdev\n", status); + goto out1; + } + + status = register_filesystem(&coda_fs_type); + if (status) { + pr_warn("failed to register filesystem!\n"); + goto out; + } + return 0; +out: + for (i = 0; i < MAX_CODADEVS; i++) + device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + class_destroy(coda_psdev_class); + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); + coda_sysctl_clean(); +out1: + coda_destroy_inodecache(); +out2: + return status; +} + +static void __exit exit_coda(void) +{ + int err, i; + + err = unregister_filesystem(&coda_fs_type); + if (err != 0) + pr_warn("failed to unregister filesystem\n"); + for (i = 0; i < MAX_CODADEVS; i++) + device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + class_destroy(coda_psdev_class); + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); + coda_sysctl_clean(); + coda_destroy_inodecache(); +} + +module_init(init_coda); +module_exit(exit_coda); + diff --git a/kernel/fs/coda/symlink.c b/kernel/fs/coda/symlink.c new file mode 100644 index 000000000..ab94ef63c --- /dev/null +++ b/kernel/fs/coda/symlink.c @@ -0,0 +1,50 @@ +/* + * Symlink inode operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +static int coda_symlink_filler(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error; + struct coda_inode_info *cii; + unsigned int len = PAGE_SIZE; + char *p = kmap(page); + + cii = ITOC(inode); + + error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); + if (error) + goto fail; + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return error; +} + +const struct address_space_operations coda_symlink_aops = { + .readpage = coda_symlink_filler, +}; diff --git a/kernel/fs/coda/sysctl.c b/kernel/fs/coda/sysctl.c new file mode 100644 index 000000000..34218a8a2 --- /dev/null +++ b/kernel/fs/coda/sysctl.c @@ -0,0 +1,73 @@ +/* + * Sysctl operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). + */ + +#include + +#include "coda_int.h" + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fs_table_header; + +static struct ctl_table coda_table[] = { + { + .procname = "timeout", + .data = &coda_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "hard", + .data = &coda_hard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "fake_statfs", + .data = &coda_fake_statfs, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec + }, + {} +}; + +static struct ctl_table fs_table[] = { + { + .procname = "coda", + .mode = 0555, + .child = coda_table + }, + {} +}; + +void coda_sysctl_init(void) +{ + if ( !fs_table_header ) + fs_table_header = register_sysctl_table(fs_table); +} + +void coda_sysctl_clean(void) +{ + if ( fs_table_header ) { + unregister_sysctl_table(fs_table_header); + fs_table_header = NULL; + } +} + +#else +void coda_sysctl_init(void) +{ +} + +void coda_sysctl_clean(void) +{ +} +#endif diff --git a/kernel/fs/coda/upcall.c b/kernel/fs/coda/upcall.c new file mode 100644 index 000000000..9b1ffaa05 --- /dev/null +++ b/kernel/fs/coda/upcall.c @@ -0,0 +1,882 @@ +/* + * Mostly platform independent upcall operations to Venus: + * -- upcalls + * -- upcall routines + * + * Linux 2.0 version + * Copyright (C) 1996 Peter J. Braam , + * Michael Callahan + * + * Redone for Linux 2.1 + * Copyright (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this code to contribute + * improvements to the Coda project. Contact Peter Braam . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +static int coda_upcall(struct venus_comm *vc, int inSize, int *outSize, + union inputArgs *buffer); + +static void *alloc_upcall(int opcode, int size) +{ + union inputArgs *inp; + + CODA_ALLOC(inp, union inputArgs *, size); + if (!inp) + return ERR_PTR(-ENOMEM); + + inp->ih.opcode = opcode; + inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns); + inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns); + inp->ih.uid = from_kuid(&init_user_ns, current_fsuid()); + + return (void*)inp; +} + +#define UPARG(op)\ +do {\ + inp = (union inputArgs *)alloc_upcall(op, insize); \ + if (IS_ERR(inp)) { return PTR_ERR(inp); }\ + outp = (union outputArgs *)(inp); \ + outsize = insize; \ +} while (0) + +#define INSIZE(tag) sizeof(struct coda_ ## tag ## _in) +#define OUTSIZE(tag) sizeof(struct coda_ ## tag ## _out) +#define SIZE(tag) max_t(unsigned int, INSIZE(tag), OUTSIZE(tag)) + + +/* the upcalls */ +int venus_rootfid(struct super_block *sb, struct CodaFid *fidp) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(root); + UPARG(CODA_ROOT); + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *fidp = outp->coda_root.VFid; + + CODA_FREE(inp, insize); + return error; +} + +int venus_getattr(struct super_block *sb, struct CodaFid *fid, + struct coda_vattr *attr) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(getattr); + UPARG(CODA_GETATTR); + inp->coda_getattr.VFid = *fid; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *attr = outp->coda_getattr.attr; + + CODA_FREE(inp, insize); + return error; +} + +int venus_setattr(struct super_block *sb, struct CodaFid *fid, + struct coda_vattr *vattr) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(setattr); + UPARG(CODA_SETATTR); + + inp->coda_setattr.VFid = *fid; + inp->coda_setattr.attr = *vattr; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_lookup(struct super_block *sb, struct CodaFid *fid, + const char *name, int length, int * type, + struct CodaFid *resfid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(lookup); + insize = max_t(unsigned int, offset + length +1, OUTSIZE(lookup)); + UPARG(CODA_LOOKUP); + + inp->coda_lookup.VFid = *fid; + inp->coda_lookup.name = offset; + inp->coda_lookup.flags = CLU_CASE_SENSITIVE; + /* send Venus a null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *resfid = outp->coda_lookup.VFid; + *type = outp->coda_lookup.vtype; + } + + CODA_FREE(inp, insize); + return error; +} + +int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, + kuid_t uid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(release); + UPARG(CODA_CLOSE); + + inp->ih.uid = from_kuid(&init_user_ns, uid); + inp->coda_close.VFid = *fid; + inp->coda_close.flags = flags; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_open(struct super_block *sb, struct CodaFid *fid, + int flags, struct file **fh) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(open_by_fd); + UPARG(CODA_OPEN_BY_FD); + + inp->coda_open_by_fd.VFid = *fid; + inp->coda_open_by_fd.flags = flags; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *fh = outp->coda_open_by_fd.fh; + + CODA_FREE(inp, insize); + return error; +} + +int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, + struct CodaFid *newfid, struct coda_vattr *attrs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(mkdir); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(mkdir)); + UPARG(CODA_MKDIR); + + inp->coda_mkdir.VFid = *dirfid; + inp->coda_mkdir.attr = *attrs; + inp->coda_mkdir.name = offset; + /* Venus must get null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *attrs = outp->coda_mkdir.attr; + *newfid = outp->coda_mkdir.VFid; + } + + CODA_FREE(inp, insize); + return error; +} + + +int venus_rename(struct super_block *sb, struct CodaFid *old_fid, + struct CodaFid *new_fid, size_t old_length, + size_t new_length, const char *old_name, + const char *new_name) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset, s; + + offset = INSIZE(rename); + insize = max_t(unsigned int, offset + new_length + old_length + 8, + OUTSIZE(rename)); + UPARG(CODA_RENAME); + + inp->coda_rename.sourceFid = *old_fid; + inp->coda_rename.destFid = *new_fid; + inp->coda_rename.srcname = offset; + + /* Venus must receive an null terminated string */ + s = ( old_length & ~0x3) +4; /* round up to word boundary */ + memcpy((char *)(inp) + offset, old_name, old_length); + *((char *)inp + offset + old_length) = '\0'; + + /* another null terminated string for Venus */ + offset += s; + inp->coda_rename.destname = offset; + s = ( new_length & ~0x3) +4; /* round up to word boundary */ + memcpy((char *)(inp) + offset, new_name, new_length); + *((char *)inp + offset + new_length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_create(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, int excl, int mode, + struct CodaFid *newfid, struct coda_vattr *attrs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(create); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(create)); + UPARG(CODA_CREATE); + + inp->coda_create.VFid = *dirfid; + inp->coda_create.attr.va_mode = mode; + inp->coda_create.excl = excl; + inp->coda_create.mode = mode; + inp->coda_create.name = offset; + + /* Venus must get null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *attrs = outp->coda_create.attr; + *newfid = outp->coda_create.VFid; + } + + CODA_FREE(inp, insize); + return error; +} + +int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(rmdir); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(rmdir)); + UPARG(CODA_RMDIR); + + inp->coda_rmdir.VFid = *dirfid; + inp->coda_rmdir.name = offset; + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_remove(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length) +{ + union inputArgs *inp; + union outputArgs *outp; + int error=0, insize, outsize, offset; + + offset = INSIZE(remove); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(remove)); + UPARG(CODA_REMOVE); + + inp->coda_remove.VFid = *dirfid; + inp->coda_remove.name = offset; + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_readlink(struct super_block *sb, struct CodaFid *fid, + char *buffer, int *length) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int retlen; + char *result; + + insize = max_t(unsigned int, + INSIZE(readlink), OUTSIZE(readlink)+ *length + 1); + UPARG(CODA_READLINK); + + inp->coda_readlink.VFid = *fid; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + retlen = outp->coda_readlink.count; + if ( retlen > *length ) + retlen = *length; + *length = retlen; + result = (char *)outp + (long)outp->coda_readlink.data; + memcpy(buffer, result, retlen); + *(buffer + retlen) = '\0'; + } + + CODA_FREE(inp, insize); + return error; +} + + + +int venus_link(struct super_block *sb, struct CodaFid *fid, + struct CodaFid *dirfid, const char *name, int len ) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(link); + insize = max_t(unsigned int, offset + len + 1, OUTSIZE(link)); + UPARG(CODA_LINK); + + inp->coda_link.sourceFid = *fid; + inp->coda_link.destFid = *dirfid; + inp->coda_link.tname = offset; + + /* make sure strings are null terminated */ + memcpy((char *)(inp) + offset, name, len); + *((char *)inp + offset + len) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_symlink(struct super_block *sb, struct CodaFid *fid, + const char *name, int len, + const char *symname, int symlen) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset, s; + + offset = INSIZE(symlink); + insize = max_t(unsigned int, offset + len + symlen + 8, OUTSIZE(symlink)); + UPARG(CODA_SYMLINK); + + /* inp->coda_symlink.attr = *tva; XXXXXX */ + inp->coda_symlink.VFid = *fid; + + /* Round up to word boundary and null terminate */ + inp->coda_symlink.srcname = offset; + s = ( symlen & ~0x3 ) + 4; + memcpy((char *)(inp) + offset, symname, symlen); + *((char *)inp + offset + symlen) = '\0'; + + /* Round up to word boundary and null terminate */ + offset += s; + inp->coda_symlink.tname = offset; + s = (len & ~0x3) + 4; + memcpy((char *)(inp) + offset, name, len); + *((char *)inp + offset + len) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_fsync(struct super_block *sb, struct CodaFid *fid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize=SIZE(fsync); + UPARG(CODA_FSYNC); + + inp->coda_fsync.VFid = *fid; + error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs), + &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_access(struct super_block *sb, struct CodaFid *fid, int mask) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(access); + UPARG(CODA_ACCESS); + + inp->coda_access.VFid = *fid; + inp->coda_access.flags = mask; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + + +int venus_pioctl(struct super_block *sb, struct CodaFid *fid, + unsigned int cmd, struct PioctlData *data) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int iocsize; + + insize = VC_MAXMSGSIZE; + UPARG(CODA_IOCTL); + + /* build packet for Venus */ + if (data->vi.in_size > VC_MAXDATASIZE) { + error = -EINVAL; + goto exit; + } + + if (data->vi.out_size > VC_MAXDATASIZE) { + error = -EINVAL; + goto exit; + } + + inp->coda_ioctl.VFid = *fid; + + /* the cmd field was mutated by increasing its size field to + * reflect the path and follow args. We need to subtract that + * out before sending the command to Venus. */ + inp->coda_ioctl.cmd = (cmd & ~(PIOCPARM_MASK << 16)); + iocsize = ((cmd >> 16) & PIOCPARM_MASK) - sizeof(char *) - sizeof(int); + inp->coda_ioctl.cmd |= (iocsize & PIOCPARM_MASK) << 16; + + /* in->coda_ioctl.rwflag = flag; */ + inp->coda_ioctl.len = data->vi.in_size; + inp->coda_ioctl.data = (char *)(INSIZE(ioctl)); + + /* get the data out of user space */ + if (copy_from_user((char *)inp + (long)inp->coda_ioctl.data, + data->vi.in, data->vi.in_size)) { + error = -EINVAL; + goto exit; + } + + error = coda_upcall(coda_vcp(sb), SIZE(ioctl) + data->vi.in_size, + &outsize, inp); + + if (error) { + pr_warn("%s: Venus returns: %d for %s\n", + __func__, error, coda_f2s(fid)); + goto exit; + } + + if (outsize < (long)outp->coda_ioctl.data + outp->coda_ioctl.len) { + error = -EINVAL; + goto exit; + } + + /* Copy out the OUT buffer. */ + if (outp->coda_ioctl.len > data->vi.out_size) { + error = -EINVAL; + goto exit; + } + + /* Copy out the OUT buffer. */ + if (copy_to_user(data->vi.out, + (char *)outp + (long)outp->coda_ioctl.data, + outp->coda_ioctl.len)) { + error = -EFAULT; + goto exit; + } + + exit: + CODA_FREE(inp, insize); + return error; +} + +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs)); + UPARG(CODA_STATFS); + + error = coda_upcall(coda_vcp(dentry->d_sb), insize, &outsize, inp); + if (!error) { + sfs->f_blocks = outp->coda_statfs.stat.f_blocks; + sfs->f_bfree = outp->coda_statfs.stat.f_bfree; + sfs->f_bavail = outp->coda_statfs.stat.f_bavail; + sfs->f_files = outp->coda_statfs.stat.f_files; + sfs->f_ffree = outp->coda_statfs.stat.f_ffree; + } + + CODA_FREE(inp, insize); + return error; +} + +/* + * coda_upcall and coda_downcall routines. + */ +static void coda_block_signals(sigset_t *old) +{ + spin_lock_irq(¤t->sighand->siglock); + *old = current->blocked; + + sigfillset(¤t->blocked); + sigdelset(¤t->blocked, SIGKILL); + sigdelset(¤t->blocked, SIGSTOP); + sigdelset(¤t->blocked, SIGINT); + + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + +static void coda_unblock_signals(sigset_t *old) +{ + spin_lock_irq(¤t->sighand->siglock); + current->blocked = *old; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + +/* Don't allow signals to interrupt the following upcalls before venus + * has seen them, + * - CODA_CLOSE or CODA_RELEASE upcall (to avoid reference count problems) + * - CODA_STORE (to avoid data loss) + */ +#define CODA_INTERRUPTIBLE(r) (!coda_hard && \ + (((r)->uc_opcode != CODA_CLOSE && \ + (r)->uc_opcode != CODA_STORE && \ + (r)->uc_opcode != CODA_RELEASE) || \ + (r)->uc_flags & CODA_REQ_READ)) + +static inline void coda_waitfor_upcall(struct venus_comm *vcp, + struct upc_req *req) +{ + DECLARE_WAITQUEUE(wait, current); + unsigned long timeout = jiffies + coda_timeout * HZ; + sigset_t old; + int blocked; + + coda_block_signals(&old); + blocked = 1; + + add_wait_queue(&req->uc_sleep, &wait); + for (;;) { + if (CODA_INTERRUPTIBLE(req)) + set_current_state(TASK_INTERRUPTIBLE); + else + set_current_state(TASK_UNINTERRUPTIBLE); + + /* got a reply */ + if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT)) + break; + + if (blocked && time_after(jiffies, timeout) && + CODA_INTERRUPTIBLE(req)) + { + coda_unblock_signals(&old); + blocked = 0; + } + + if (signal_pending(current)) { + list_del(&req->uc_chain); + break; + } + + mutex_unlock(&vcp->vc_mutex); + if (blocked) + schedule_timeout(HZ); + else + schedule(); + mutex_lock(&vcp->vc_mutex); + } + if (blocked) + coda_unblock_signals(&old); + + remove_wait_queue(&req->uc_sleep, &wait); + set_current_state(TASK_RUNNING); +} + + +/* + * coda_upcall will return an error in the case of + * failed communication with Venus _or_ will peek at Venus + * reply and return Venus' error. + * + * As venus has 2 types of errors, normal errors (positive) and internal + * errors (negative), normal errors are negated, while internal errors + * are all mapped to -EINTR, while showing a nice warning message. (jh) + */ +static int coda_upcall(struct venus_comm *vcp, + int inSize, int *outSize, + union inputArgs *buffer) +{ + union outputArgs *out; + union inputArgs *sig_inputArgs; + struct upc_req *req = NULL, *sig_req; + int error; + + mutex_lock(&vcp->vc_mutex); + + if (!vcp->vc_inuse) { + pr_notice("Venus dead, not sending upcall\n"); + error = -ENXIO; + goto exit; + } + + /* Format the request message. */ + req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); + if (!req) { + error = -ENOMEM; + goto exit; + } + + req->uc_data = (void *)buffer; + req->uc_flags = 0; + req->uc_inSize = inSize; + req->uc_outSize = *outSize ? *outSize : inSize; + req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode; + req->uc_unique = ++vcp->vc_seq; + init_waitqueue_head(&req->uc_sleep); + + /* Fill in the common input args. */ + ((union inputArgs *)buffer)->ih.unique = req->uc_unique; + + /* Append msg to pending queue and poke Venus. */ + list_add_tail(&req->uc_chain, &vcp->vc_pending); + + wake_up_interruptible(&vcp->vc_waitq); + /* We can be interrupted while we wait for Venus to process + * our request. If the interrupt occurs before Venus has read + * the request, we dequeue and return. If it occurs after the + * read but before the reply, we dequeue, send a signal + * message, and return. If it occurs after the reply we ignore + * it. In no case do we want to restart the syscall. If it + * was interrupted by a venus shutdown (psdev_close), return + * ENODEV. */ + + /* Go to sleep. Wake up on signals only after the timeout. */ + coda_waitfor_upcall(vcp, req); + + /* Op went through, interrupt or not... */ + if (req->uc_flags & CODA_REQ_WRITE) { + out = (union outputArgs *)req->uc_data; + /* here we map positive Venus errors to kernel errors */ + error = -out->oh.result; + *outSize = req->uc_outSize; + goto exit; + } + + error = -EINTR; + if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) { + pr_warn("Unexpected interruption.\n"); + goto exit; + } + + /* Interrupted before venus read it. */ + if (!(req->uc_flags & CODA_REQ_READ)) + goto exit; + + /* Venus saw the upcall, make sure we can send interrupt signal */ + if (!vcp->vc_inuse) { + pr_info("Venus dead, not sending signal.\n"); + goto exit; + } + + error = -ENOMEM; + sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); + if (!sig_req) goto exit; + + CODA_ALLOC((sig_req->uc_data), char *, sizeof(struct coda_in_hdr)); + if (!sig_req->uc_data) { + kfree(sig_req); + goto exit; + } + + error = -EINTR; + sig_inputArgs = (union inputArgs *)sig_req->uc_data; + sig_inputArgs->ih.opcode = CODA_SIGNAL; + sig_inputArgs->ih.unique = req->uc_unique; + + sig_req->uc_flags = CODA_REQ_ASYNC; + sig_req->uc_opcode = sig_inputArgs->ih.opcode; + sig_req->uc_unique = sig_inputArgs->ih.unique; + sig_req->uc_inSize = sizeof(struct coda_in_hdr); + sig_req->uc_outSize = sizeof(struct coda_in_hdr); + + /* insert at head of queue! */ + list_add(&(sig_req->uc_chain), &vcp->vc_pending); + wake_up_interruptible(&vcp->vc_waitq); + +exit: + kfree(req); + mutex_unlock(&vcp->vc_mutex); + return error; +} + +/* + The statements below are part of the Coda opportunistic + programming -- taken from the Mach/BSD kernel code for Coda. + You don't get correct semantics by stating what needs to be + done without guaranteeing the invariants needed for it to happen. + When will be have time to find out what exactly is going on? (pjb) +*/ + + +/* + * There are 7 cases where cache invalidations occur. The semantics + * of each is listed here: + * + * CODA_FLUSH -- flush all entries from the name cache and the cnode cache. + * CODA_PURGEUSER -- flush all entries from the name cache for a specific user + * This call is a result of token expiration. + * + * The next arise as the result of callbacks on a file or directory. + * CODA_ZAPFILE -- flush the cached attributes for a file. + + * CODA_ZAPDIR -- flush the attributes for the dir and + * force a new lookup for all the children + of this dir. + + * + * The next is a result of Venus detecting an inconsistent file. + * CODA_PURGEFID -- flush the attribute for the file + * purge it and its children from the dcache + * + * The last allows Venus to replace local fids with global ones + * during reintegration. + * + * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ + +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) +{ + struct inode *inode = NULL; + struct CodaFid *fid = NULL, *newfid; + struct super_block *sb; + + /* Handle invalidation requests. */ + mutex_lock(&vcp->vc_mutex); + sb = vcp->vc_sb; + if (!sb || !sb->s_root) + goto unlock_out; + + switch (opcode) { + case CODA_FLUSH: + coda_cache_clear_all(sb); + shrink_dcache_sb(sb); + if (d_really_is_positive(sb->s_root)) + coda_flag_inode(d_inode(sb->s_root), C_FLUSH); + break; + + case CODA_PURGEUSER: + coda_cache_clear_all(sb); + break; + + case CODA_ZAPDIR: + fid = &out->coda_zapdir.CodaFid; + break; + + case CODA_ZAPFILE: + fid = &out->coda_zapfile.CodaFid; + break; + + case CODA_PURGEFID: + fid = &out->coda_purgefid.CodaFid; + break; + + case CODA_REPLACE: + fid = &out->coda_replace.OldFid; + break; + } + if (fid) + inode = coda_fid_to_inode(fid, sb); + +unlock_out: + mutex_unlock(&vcp->vc_mutex); + + if (!inode) + return 0; + + switch (opcode) { + case CODA_ZAPDIR: + coda_flag_inode_children(inode, C_PURGE); + coda_flag_inode(inode, C_VATTR); + break; + + case CODA_ZAPFILE: + coda_flag_inode(inode, C_VATTR); + break; + + case CODA_PURGEFID: + coda_flag_inode_children(inode, C_PURGE); + + /* catch the dentries later if some are still busy */ + coda_flag_inode(inode, C_PURGE); + d_prune_aliases(inode); + break; + + case CODA_REPLACE: + newfid = &out->coda_replace.NewFid; + coda_replace_fid(inode, fid, newfid); + break; + } + iput(inode); + return 0; +} + diff --git a/kernel/fs/compat.c b/kernel/fs/compat.c new file mode 100644 index 000000000..6fd272d45 --- /dev/null +++ b/kernel/fs/compat.c @@ -0,0 +1,1481 @@ +/* + * linux/fs/compat.c + * + * Kernel compatibililty routines for e.g. 32 bit syscall support + * on 64 bit kernels. + * + * Copyright (C) 2002 Stephen Rothwell, IBM Corporation + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +int compat_log = 1; + +int compat_printk(const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = vprintk(fmt, ap); + va_end(ap); + return ret; +} + +/* + * Not all architectures have sys_utime, so implement this in terms + * of sys_utimes. + */ +COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, + struct compat_utimbuf __user *, t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t->actime) || + get_user(tv[1].tv_sec, &t->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); +} + +COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) +{ + struct timespec tv[2]; + + if (t) { + if (compat_get_timespec(&tv[0], &t[0]) || + compat_get_timespec(&tv[1], &t[1])) + return -EFAULT; + + if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) + return 0; + } + return do_utimes(dfd, filename, t ? tv : NULL, flags); +} + +COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t[0].tv_sec) || + get_user(tv[0].tv_nsec, &t[0].tv_usec) || + get_user(tv[1].tv_sec, &t[1].tv_sec) || + get_user(tv[1].tv_nsec, &t[1].tv_usec)) + return -EFAULT; + if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || + tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) + return -EINVAL; + tv[0].tv_nsec *= 1000; + tv[1].tv_nsec *= 1000; + } + return do_utimes(dfd, filename, t ? tv : NULL, 0); +} + +COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) +{ + return compat_sys_futimesat(AT_FDCWD, filename, t); +} + +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + struct compat_stat tmp; + + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + + memset(&tmp, 0, sizeof(tmp)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); + tmp.st_rdev = old_encode_dev(stat->rdev); + if ((u64) stat->size > MAX_NON_LFS) + return -EOVERFLOW; + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; +} + +COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +#ifndef __ARCH_WANT_STAT64 +COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, + const char __user *, filename, + struct compat_stat __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} +#endif + +COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} + +static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) +{ + + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +/* + * The following statfs calls are copies of code from fs/statfs.c and + * should be checked against those from time to time + */ +COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) +{ + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) +{ + struct kstatfs tmp; + int error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +/* + * This is a copy of sys_ustat, just dealing with a structure layout. + * Given how simple this syscall is that apporach is more maintainable + * than the various conversion hacks. + */ +COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) +{ + struct compat_ustat tmp; + struct kstatfs sbuf; + int err = vfs_ustat(new_decode_dev(dev), &sbuf); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct compat_ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) + return -EFAULT; + return 0; +} + +static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 +static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 +static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +static unsigned int +convert_fcntl_cmd(unsigned int cmd) +{ + switch (cmd) { + case F_GETLK64: + return F_GETLK; + case F_SETLK64: + return F_SETLK; + case F_SETLKW64: + return F_SETLKW; + } + + return cmd; +} + +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + mm_segment_t old_fs; + struct flock f; + long ret; + unsigned int conv_cmd; + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + ret = get_compat_flock(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl(fd, cmd, (unsigned long)&f); + set_fs(old_fs); + if (cmd == F_GETLK && ret == 0) { + /* GETLK was successful and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case. + * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway + */ + + if (f.l_start > COMPAT_OFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; + if (ret == 0) + ret = put_compat_flock(&f, compat_ptr(arg)); + } + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + ret = get_compat_flock64(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + conv_cmd = convert_fcntl_cmd(cmd); + ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); + set_fs(old_fs); + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; + if (ret == 0) + ret = put_compat_flock64(&f, compat_ptr(arg)); + } + break; + + default: + ret = sys_fcntl(fd, cmd, arg); + break; + } + return ret; +} + +COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + switch (cmd) { + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + return -EINVAL; + } + return compat_sys_fcntl64(fd, cmd, arg); +} + +COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) +{ + long ret; + aio_context_t ctx64; + + mm_segment_t oldfs = get_fs(); + if (unlikely(get_user(ctx64, ctx32p))) + return -EFAULT; + + set_fs(KERNEL_DS); + /* The __user pointer cast is valid because of the set_fs() */ + ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64); + set_fs(oldfs); + /* truncating is ok because it's a user address */ + if (!ret) + ret = put_user((u32) ctx64, ctx32p); + return ret; +} + +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) +{ + struct timespec t; + struct timespec __user *ut = NULL; + + if (timeout) { + if (compat_get_timespec(&t, timeout)) + return -EFAULT; + + ut = compat_alloc_user_space(sizeof(*ut)); + if (copy_to_user(ut, &t, sizeof(t)) ) + return -EFAULT; + } + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); +} + +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) + +ssize_t compat_rw_copy_check_uvector(int type, + const struct compat_iovec __user *uvector, unsigned long nr_segs, + unsigned long fast_segs, struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + compat_ssize_t tot_len; + struct iovec *iov = *ret_pointer = fast_pointer; + ssize_t ret = 0; + int seg; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) + goto out; + + ret = -EINVAL; + if (nr_segs > UIO_MAXIOV || nr_segs < 0) + goto out; + if (nr_segs > fast_segs) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) + goto out; + } + *ret_pointer = iov; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. + * + * In Linux, the total length is limited to MAX_RW_COUNT, there is + * no overflow possibility. + */ + tot_len = 0; + ret = -EINVAL; + for (seg = 0; seg < nr_segs; seg++) { + compat_uptr_t buf; + compat_ssize_t len; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting in compat_ssize_t .. */ + goto out; + if (type >= 0 && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - tot_len) + len = MAX_RW_COUNT - tot_len; + tot_len += len; + iov->iov_base = compat_ptr(buf); + iov->iov_len = (compat_size_t) len; + uvector++; + iov++; + } + ret = tot_len; + +out: + return ret; +} + +static inline long +copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +{ + compat_uptr_t uptr; + int i; + + for (i = 0; i < nr; ++i) { + if (get_user(uptr, ptr32 + i)) + return -EFAULT; + if (put_user(compat_ptr(uptr), ptr64 + i)) + return -EFAULT; + } + return 0; +} + +#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) +{ + struct iocb __user * __user *iocb64; + long ret; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (nr > MAX_AIO_SUBMITS) + nr = MAX_AIO_SUBMITS; + + iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); + ret = copy_iocb(nr, iocb, iocb64); + if (!ret) + ret = do_io_submit(ctx_id, nr, iocb64, 1); + return ret; +} + +struct compat_ncp_mount_data { + compat_int_t version; + compat_uint_t ncp_fd; + __compat_uid_t mounted_uid; + compat_pid_t wdog_pid; + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + compat_uint_t time_out; + compat_uint_t retry_count; + compat_uint_t flags; + __compat_uid_t uid; + __compat_gid_t gid; + compat_mode_t file_mode; + compat_mode_t dir_mode; +}; + +struct compat_ncp_mount_data_v4 { + compat_int_t version; + compat_ulong_t flags; + compat_ulong_t mounted_uid; + compat_long_t wdog_pid; + compat_uint_t ncp_fd; + compat_uint_t time_out; + compat_uint_t retry_count; + compat_ulong_t uid; + compat_ulong_t gid; + compat_ulong_t file_mode; + compat_ulong_t dir_mode; +}; + +static void *do_ncp_super_data_conv(void *raw_data) +{ + int version = *(unsigned int *)raw_data; + + if (version == 3) { + struct compat_ncp_mount_data *c_n = raw_data; + struct ncp_mount_data *n = raw_data; + + n->dir_mode = c_n->dir_mode; + n->file_mode = c_n->file_mode; + n->gid = c_n->gid; + n->uid = c_n->uid; + memmove (n->mounted_vol, c_n->mounted_vol, (sizeof (c_n->mounted_vol) + 3 * sizeof (unsigned int))); + n->wdog_pid = c_n->wdog_pid; + n->mounted_uid = c_n->mounted_uid; + } else if (version == 4) { + struct compat_ncp_mount_data_v4 *c_n = raw_data; + struct ncp_mount_data_v4 *n = raw_data; + + n->dir_mode = c_n->dir_mode; + n->file_mode = c_n->file_mode; + n->gid = c_n->gid; + n->uid = c_n->uid; + n->retry_count = c_n->retry_count; + n->time_out = c_n->time_out; + n->ncp_fd = c_n->ncp_fd; + n->wdog_pid = c_n->wdog_pid; + n->mounted_uid = c_n->mounted_uid; + n->flags = c_n->flags; + } else if (version != 5) { + return NULL; + } + + return raw_data; +} + + +struct compat_nfs_string { + compat_uint_t len; + compat_uptr_t data; +}; + +static inline void compat_nfs_string(struct nfs_string *dst, + struct compat_nfs_string *src) +{ + dst->data = compat_ptr(src->data); + dst->len = src->len; +} + +struct compat_nfs4_mount_data_v1 { + compat_int_t version; + compat_int_t flags; + compat_int_t rsize; + compat_int_t wsize; + compat_int_t timeo; + compat_int_t retrans; + compat_int_t acregmin; + compat_int_t acregmax; + compat_int_t acdirmin; + compat_int_t acdirmax; + struct compat_nfs_string client_addr; + struct compat_nfs_string mnt_path; + struct compat_nfs_string hostname; + compat_uint_t host_addrlen; + compat_uptr_t host_addr; + compat_int_t proto; + compat_int_t auth_flavourlen; + compat_uptr_t auth_flavours; +}; + +static int do_nfs4_super_data_conv(void *raw_data) +{ + int version = *(compat_uint_t *) raw_data; + + if (version == 1) { + struct compat_nfs4_mount_data_v1 *raw = raw_data; + struct nfs4_mount_data *real = raw_data; + + /* copy the fields backwards */ + real->auth_flavours = compat_ptr(raw->auth_flavours); + real->auth_flavourlen = raw->auth_flavourlen; + real->proto = raw->proto; + real->host_addr = compat_ptr(raw->host_addr); + real->host_addrlen = raw->host_addrlen; + compat_nfs_string(&real->hostname, &raw->hostname); + compat_nfs_string(&real->mnt_path, &raw->mnt_path); + compat_nfs_string(&real->client_addr, &raw->client_addr); + real->acdirmax = raw->acdirmax; + real->acdirmin = raw->acdirmin; + real->acregmax = raw->acregmax; + real->acregmin = raw->acregmin; + real->retrans = raw->retrans; + real->timeo = raw->timeo; + real->wsize = raw->wsize; + real->rsize = raw->rsize; + real->flags = raw->flags; + real->version = raw->version; + } + + return 0; +} + +#define NCPFS_NAME "ncpfs" +#define NFS4_NAME "nfs4" + +COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, + const char __user *, dir_name, + const char __user *, type, compat_ulong_t, flags, + const void __user *, data) +{ + char *kernel_type; + unsigned long data_page; + char *kernel_dev; + int retval; + + kernel_type = copy_mount_string(type); + retval = PTR_ERR(kernel_type); + if (IS_ERR(kernel_type)) + goto out; + + kernel_dev = copy_mount_string(dev_name); + retval = PTR_ERR(kernel_dev); + if (IS_ERR(kernel_dev)) + goto out1; + + retval = copy_mount_options(data, &data_page); + if (retval < 0) + goto out2; + + retval = -EINVAL; + + if (kernel_type && data_page) { + if (!strcmp(kernel_type, NCPFS_NAME)) { + do_ncp_super_data_conv((void *)data_page); + } else if (!strcmp(kernel_type, NFS4_NAME)) { + if (do_nfs4_super_data_conv((void *) data_page)) + goto out3; + } + } + + retval = do_mount(kernel_dev, dir_name, kernel_type, + flags, (void*)data_page); + + out3: + free_page(data_page); + out2: + kfree(kernel_dev); + out1: + kfree(kernel_type); + out: + return retval; +} + +struct compat_old_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct compat_readdir_callback { + struct dir_context ctx; + struct compat_old_linux_dirent __user *dirent; + int result; +}; + +static int compat_fillonedir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct compat_readdir_callback *buf = + container_of(ctx, struct compat_readdir_callback, ctx); + struct compat_old_linux_dirent __user *dirent; + compat_ulong_t d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct compat_old_linux_dirent __user *, dirent, unsigned int, count) +{ + int error; + struct fd f = fdget(fd); + struct compat_readdir_callback buf = { + .ctx.actor = compat_fillonedir, + .dirent = dirent + }; + + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (buf.result) + error = buf.result; + + fdput(f); + return error; +} + +struct compat_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct compat_getdents_callback { + struct dir_context ctx; + struct compat_linux_dirent __user *current_dir; + struct compat_linux_dirent __user *previous; + int count; + int error; +}; + +static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct compat_linux_dirent __user * dirent; + struct compat_getdents_callback *buf = + container_of(ctx, struct compat_getdents_callback, ctx); + compat_ulong_t d_ino; + int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + + namlen + 2, sizeof(compat_long_t)); + + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct compat_linux_dirent __user *, dirent, unsigned int, count) +{ + struct fd f; + struct compat_linux_dirent __user * lastdirent; + struct compat_getdents_callback buf = { + .ctx.actor = compat_filldir, + .current_dir = dirent, + .count = count + }; + int error; + + if (!access_ok(VERIFY_WRITE, dirent, count)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(buf.ctx.pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fdput(f); + return error; +} + +#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 + +struct compat_getdents_callback64 { + struct dir_context ctx; + struct linux_dirent64 __user *current_dir; + struct linux_dirent64 __user *previous; + int count; + int error; +}; + +static int compat_filldir64(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct linux_dirent64 __user *dirent; + struct compat_getdents_callback64 *buf = + container_of(ctx, struct compat_getdents_callback64, ctx); + int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, + sizeof(u64)); + u64 off; + + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; + dirent = buf->previous; + + if (dirent) { + if (__put_user_unaligned(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user_unaligned(ino, &dirent->d_ino)) + goto efault; + off = 0; + if (__put_user_unaligned(off, &dirent->d_off)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (__put_user(d_type, &dirent->d_type)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) +{ + struct fd f; + struct linux_dirent64 __user * lastdirent; + struct compat_getdents_callback64 buf = { + .ctx.actor = compat_filldir64, + .current_dir = dirent, + .count = count + }; + int error; + + if (!access_ok(VERIFY_WRITE, dirent, count)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + typeof(lastdirent->d_off) d_off = buf.ctx.pos; + if (__put_user_unaligned(d_off, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fdput(f); + return error; +} +#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ + +/* + * Exactly like fs/open.c:sys_open(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(AT_FDCWD, filename, flags, mode); +} + +/* + * Exactly like fs/open.c:sys_openat(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(dfd, filename, flags, mode); +} + +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +/* + * Ooo, nasty. We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd && __get_user(*fdset, ufdset)) + return -EFAULT; + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. + */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static +int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return 0; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + nr -= 2; + } + if (odd && __put_user(*fdset, ufdset)) + return -EFAULT; + return 0; +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +int compat_core_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int size, max_fds, ret = -EINVAL; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + bits = kmalloc(6 * size, GFP_KERNEL); + ret = -ENOMEM; + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (compat_set_fd_set(n, inp, fds.res_in) || + compat_set_fd_set(n, outp, fds.res_out) || + compat_set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct compat_timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + compat_uptr_t tvp; +}; + +COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +static long do_compat_pselect(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timespec __user *, tsp, void __user *, sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize); +} + +COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, + unsigned int, nfds, struct compat_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} + +#ifdef CONFIG_FHANDLE +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, int, flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/kernel/fs/compat_binfmt_elf.c b/kernel/fs/compat_binfmt_elf.c new file mode 100644 index 000000000..4d24d17bc --- /dev/null +++ b/kernel/fs/compat_binfmt_elf.c @@ -0,0 +1,145 @@ +/* + * 32-bit compatibility support for ELF format executables and core dumps. + * + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * Red Hat Author: Roland McGrath. + * + * This file is used in a 64-bit kernel that wants to support 32-bit ELF. + * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros + * used below, with definitions appropriate for 32-bit ABI compatibility. + * + * We use macros to rename the ABI types and machine-dependent + * functions used in binfmt_elf.c to compat versions. + */ + +#include +#include + +/* + * Rename the basic ELF layout types to refer to the 32-bit class of files. + */ +#undef ELF_CLASS +#define ELF_CLASS ELFCLASS32 + +#undef elfhdr +#undef elf_phdr +#undef elf_shdr +#undef elf_note +#undef elf_addr_t +#define elfhdr elf32_hdr +#define elf_phdr elf32_phdr +#define elf_shdr elf32_shdr +#define elf_note elf32_note +#define elf_addr_t Elf32_Addr + +/* + * Some data types as stored in coredump. + */ +#define user_long_t compat_long_t +#define user_siginfo_t compat_siginfo_t +#define copy_siginfo_to_user copy_siginfo_to_user32 + +/* + * The machine-dependent core note format types are defined in elfcore-compat.h, + * which requires asm/elf.h to define compat_elf_gregset_t et al. + */ +#define elf_prstatus compat_elf_prstatus +#define elf_prpsinfo compat_elf_prpsinfo + +/* + * Compat version of cputime_to_compat_timeval, perhaps this + * should be an inline in . + */ +static void cputime_to_compat_timeval(const cputime_t cputime, + struct compat_timeval *value) +{ + struct timeval tv; + cputime_to_timeval(cputime, &tv); + value->tv_sec = tv.tv_sec; + value->tv_usec = tv.tv_usec; +} + +#undef cputime_to_timeval +#define cputime_to_timeval cputime_to_compat_timeval + + +/* + * To use this file, asm/elf.h must define compat_elf_check_arch. + * The other following macros can be defined if the compat versions + * differ from the native ones, or omitted when they match. + */ + +#undef ELF_ARCH +#undef elf_check_arch +#define elf_check_arch compat_elf_check_arch + +#ifdef COMPAT_ELF_PLATFORM +#undef ELF_PLATFORM +#define ELF_PLATFORM COMPAT_ELF_PLATFORM +#endif + +#ifdef COMPAT_ELF_HWCAP +#undef ELF_HWCAP +#define ELF_HWCAP COMPAT_ELF_HWCAP +#endif + +#ifdef COMPAT_ELF_HWCAP2 +#undef ELF_HWCAP2 +#define ELF_HWCAP2 COMPAT_ELF_HWCAP2 +#endif + +#ifdef COMPAT_ARCH_DLINFO +#undef ARCH_DLINFO +#define ARCH_DLINFO COMPAT_ARCH_DLINFO +#endif + +#ifdef COMPAT_ELF_ET_DYN_BASE +#undef ELF_ET_DYN_BASE +#define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE +#endif + +#ifdef COMPAT_ELF_EXEC_PAGESIZE +#undef ELF_EXEC_PAGESIZE +#define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE +#endif + +#ifdef COMPAT_ELF_PLAT_INIT +#undef ELF_PLAT_INIT +#define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT +#endif + +#ifdef COMPAT_SET_PERSONALITY +#undef SET_PERSONALITY +#define SET_PERSONALITY COMPAT_SET_PERSONALITY +#endif + +#ifdef compat_start_thread +#undef start_thread +#define start_thread compat_start_thread +#endif + +#ifdef compat_arch_setup_additional_pages +#undef ARCH_HAS_SETUP_ADDITIONAL_PAGES +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 +#undef arch_setup_additional_pages +#define arch_setup_additional_pages compat_arch_setup_additional_pages +#endif + +/* + * Rename a few of the symbols that binfmt_elf.c will define. + * These are all local so the names don't really matter, but it + * might make some debugging less confusing not to duplicate them. + */ +#define elf_format compat_elf_format +#define init_elf_binfmt init_compat_elf_binfmt +#define exit_elf_binfmt exit_compat_elf_binfmt + +/* + * We share all the actual code with the native (64-bit) version. + */ +#include "binfmt_elf.c" diff --git a/kernel/fs/compat_ioctl.c b/kernel/fs/compat_ioctl.c new file mode 100644 index 000000000..6b8e2f091 --- /dev/null +++ b/kernel/fs/compat_ioctl.c @@ -0,0 +1,1638 @@ +/* + * ioctl32.c: Conversion between 32bit and 64bit native ioctls. + * + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) + * + * These routines maintain argument size conversion between 32bit and 64bit + * ioctls. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#ifdef CONFIG_BLOCK +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#define __DVB_CORE__ +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SPARC +#include +#endif + +static int w_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) +{ + mm_segment_t old_fs = get_fs(); + int err; + unsigned long val; + + set_fs (KERNEL_DS); + err = sys_ioctl(fd, cmd, (unsigned long)&val); + set_fs (old_fs); + if (!err && put_user(val, argp)) + return -EFAULT; + return err; +} + +struct compat_video_event { + int32_t type; + compat_time_t timestamp; + union { + video_size_t size; + unsigned int frame_rate; + } u; +}; + +static int do_video_get_event(unsigned int fd, unsigned int cmd, + struct compat_video_event __user *up) +{ + struct video_event kevent; + mm_segment_t old_fs = get_fs(); + int err; + + set_fs(KERNEL_DS); + err = sys_ioctl(fd, cmd, (unsigned long) &kevent); + set_fs(old_fs); + + if (!err) { + err = put_user(kevent.type, &up->type); + err |= put_user(kevent.timestamp, &up->timestamp); + err |= put_user(kevent.u.size.w, &up->u.size.w); + err |= put_user(kevent.u.size.h, &up->u.size.h); + err |= put_user(kevent.u.size.aspect_ratio, + &up->u.size.aspect_ratio); + if (err) + err = -EFAULT; + } + + return err; +} + +struct compat_video_still_picture { + compat_uptr_t iFrame; + int32_t size; +}; + +static int do_video_stillpicture(unsigned int fd, unsigned int cmd, + struct compat_video_still_picture __user *up) +{ + struct video_still_picture __user *up_native; + compat_uptr_t fp; + int32_t size; + int err; + + err = get_user(fp, &up->iFrame); + err |= get_user(size, &up->size); + if (err) + return -EFAULT; + + up_native = + compat_alloc_user_space(sizeof(struct video_still_picture)); + + err = put_user(compat_ptr(fp), &up_native->iFrame); + err |= put_user(size, &up_native->size); + if (err) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) up_native); + + return err; +} + +struct compat_video_spu_palette { + int length; + compat_uptr_t palette; +}; + +static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, + struct compat_video_spu_palette __user *up) +{ + struct video_spu_palette __user *up_native; + compat_uptr_t palp; + int length, err; + + err = get_user(palp, &up->palette); + err |= get_user(length, &up->length); + if (err) + return -EFAULT; + + up_native = compat_alloc_user_space(sizeof(struct video_spu_palette)); + err = put_user(compat_ptr(palp), &up_native->palette); + err |= put_user(length, &up_native->length); + if (err) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) up_native); + + return err; +} + +#ifdef CONFIG_BLOCK +typedef struct sg_io_hdr32 { + compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ + compat_int_t dxfer_direction; /* [i] data transfer direction */ + unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ + unsigned char mx_sb_len; /* [i] max length to write to sbp */ + unsigned short iovec_count; /* [i] 0 implies no scatter gather */ + compat_uint_t dxfer_len; /* [i] byte count of data transfer */ + compat_uint_t dxferp; /* [i], [*io] points to data transfer memory + or scatter gather list */ + compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ + compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ + compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ + compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */ + compat_int_t pack_id; /* [i->o] unused internally (normally) */ + compat_uptr_t usr_ptr; /* [i->o] unused internally */ + unsigned char status; /* [o] scsi status */ + unsigned char masked_status; /* [o] shifted, masked scsi status */ + unsigned char msg_status; /* [o] messaging level data (optional) */ + unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ + unsigned short host_status; /* [o] errors from host adapter */ + unsigned short driver_status; /* [o] errors from software driver */ + compat_int_t resid; /* [o] dxfer_len - actual_transferred */ + compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ + compat_uint_t info; /* [o] auxiliary information */ +} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ + +typedef struct sg_iovec32 { + compat_uint_t iov_base; + compat_uint_t iov_len; +} sg_iovec32_t; + +static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) +{ + sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); + sg_iovec32_t __user *iov32 = dxferp; + int i; + + for (i = 0; i < iovec_count; i++) { + u32 base, len; + + if (get_user(base, &iov32[i].iov_base) || + get_user(len, &iov32[i].iov_len) || + put_user(compat_ptr(base), &iov[i].iov_base) || + put_user(len, &iov[i].iov_len)) + return -EFAULT; + } + + if (put_user(iov, &sgio->dxferp)) + return -EFAULT; + return 0; +} + +static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, + sg_io_hdr32_t __user *sgio32) +{ + sg_io_hdr_t __user *sgio; + u16 iovec_count; + u32 data; + void __user *dxferp; + int err; + int interface_id; + + if (get_user(interface_id, &sgio32->interface_id)) + return -EFAULT; + if (interface_id != 'S') + return sys_ioctl(fd, cmd, (unsigned long)sgio32); + + if (get_user(iovec_count, &sgio32->iovec_count)) + return -EFAULT; + + { + void __user *top = compat_alloc_user_space(0); + void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + + (iovec_count * sizeof(sg_iovec_t))); + if (new > top) + return -EINVAL; + + sgio = new; + } + + /* Ok, now construct. */ + if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, + (2 * sizeof(int)) + + (2 * sizeof(unsigned char)) + + (1 * sizeof(unsigned short)) + + (1 * sizeof(unsigned int)))) + return -EFAULT; + + if (get_user(data, &sgio32->dxferp)) + return -EFAULT; + dxferp = compat_ptr(data); + if (iovec_count) { + if (sg_build_iovec(sgio, dxferp, iovec_count)) + return -EFAULT; + } else { + if (put_user(dxferp, &sgio->dxferp)) + return -EFAULT; + } + + { + unsigned char __user *cmdp; + unsigned char __user *sbp; + + if (get_user(data, &sgio32->cmdp)) + return -EFAULT; + cmdp = compat_ptr(data); + + if (get_user(data, &sgio32->sbp)) + return -EFAULT; + sbp = compat_ptr(data); + + if (put_user(cmdp, &sgio->cmdp) || + put_user(sbp, &sgio->sbp)) + return -EFAULT; + } + + if (copy_in_user(&sgio->timeout, &sgio32->timeout, + 3 * sizeof(int))) + return -EFAULT; + + if (get_user(data, &sgio32->usr_ptr)) + return -EFAULT; + if (put_user(compat_ptr(data), &sgio->usr_ptr)) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) sgio); + + if (err >= 0) { + void __user *datap; + + if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, + sizeof(int)) || + get_user(datap, &sgio->usr_ptr) || + put_user((u32)(unsigned long)datap, + &sgio32->usr_ptr) || + copy_in_user(&sgio32->status, &sgio->status, + (4 * sizeof(unsigned char)) + + (2 * sizeof(unsigned short)) + + (3 * sizeof(int)))) + err = -EFAULT; + } + + return err; +} + +struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ + char req_state; + char orphan; + char sg_io_owned; + char problem; + int pack_id; + compat_uptr_t usr_ptr; + unsigned int duration; + int unused; +}; + +static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct + compat_sg_req_info __user *o) +{ + int err, i; + sg_req_info_t __user *r; + r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); + err = sys_ioctl(fd,cmd,(unsigned long)r); + if (err < 0) + return err; + for (i = 0; i < SG_MAX_QUEUE; i++) { + void __user *ptr; + int d; + + if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) || + get_user(ptr, &r[i].usr_ptr) || + get_user(d, &r[i].duration) || + put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) || + put_user(d, &o[i].duration)) + return -EFAULT; + } + return err; +} +#endif /* CONFIG_BLOCK */ + +struct sock_fprog32 { + unsigned short len; + compat_caddr_t filter; +}; + +#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) +#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) + +static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, + struct sock_fprog32 __user *u_fprog32) +{ + struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); + void __user *fptr64; + u32 fptr32; + u16 flen; + + if (get_user(flen, &u_fprog32->len) || + get_user(fptr32, &u_fprog32->filter)) + return -EFAULT; + + fptr64 = compat_ptr(fptr32); + + if (put_user(flen, &u_fprog64->len) || + put_user(fptr64, &u_fprog64->filter)) + return -EFAULT; + + if (cmd == PPPIOCSPASS32) + cmd = PPPIOCSPASS; + else + cmd = PPPIOCSACTIVE; + + return sys_ioctl(fd, cmd, (unsigned long) u_fprog64); +} + +struct ppp_option_data32 { + compat_caddr_t ptr; + u32 length; + compat_int_t transmit; +}; +#define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32) + +struct ppp_idle32 { + compat_time_t xmit_idle; + compat_time_t recv_idle; +}; +#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) + +static int ppp_gidle(unsigned int fd, unsigned int cmd, + struct ppp_idle32 __user *idle32) +{ + struct ppp_idle __user *idle; + __kernel_time_t xmit, recv; + int err; + + idle = compat_alloc_user_space(sizeof(*idle)); + + err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); + + if (!err) { + if (get_user(xmit, &idle->xmit_idle) || + get_user(recv, &idle->recv_idle) || + put_user(xmit, &idle32->xmit_idle) || + put_user(recv, &idle32->recv_idle)) + err = -EFAULT; + } + return err; +} + +static int ppp_scompress(unsigned int fd, unsigned int cmd, + struct ppp_option_data32 __user *odata32) +{ + struct ppp_option_data __user *odata; + __u32 data; + void __user *datap; + + odata = compat_alloc_user_space(sizeof(*odata)); + + if (get_user(data, &odata32->ptr)) + return -EFAULT; + + datap = compat_ptr(data); + if (put_user(datap, &odata->ptr)) + return -EFAULT; + + if (copy_in_user(&odata->length, &odata32->length, + sizeof(__u32) + sizeof(int))) + return -EFAULT; + + return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); +} + +#ifdef CONFIG_BLOCK +struct mtget32 { + compat_long_t mt_type; + compat_long_t mt_resid; + compat_long_t mt_dsreg; + compat_long_t mt_gstat; + compat_long_t mt_erreg; + compat_daddr_t mt_fileno; + compat_daddr_t mt_blkno; +}; +#define MTIOCGET32 _IOR('m', 2, struct mtget32) + +struct mtpos32 { + compat_long_t mt_blkno; +}; +#define MTIOCPOS32 _IOR('m', 3, struct mtpos32) + +static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) +{ + mm_segment_t old_fs = get_fs(); + struct mtget get; + struct mtget32 __user *umget32; + struct mtpos pos; + struct mtpos32 __user *upos32; + unsigned long kcmd; + void *karg; + int err = 0; + + switch(cmd) { + case MTIOCPOS32: + kcmd = MTIOCPOS; + karg = &pos; + break; + default: /* MTIOCGET32 */ + kcmd = MTIOCGET; + karg = &get; + break; + } + set_fs (KERNEL_DS); + err = sys_ioctl (fd, kcmd, (unsigned long)karg); + set_fs (old_fs); + if (err) + return err; + switch (cmd) { + case MTIOCPOS32: + upos32 = argp; + err = __put_user(pos.mt_blkno, &upos32->mt_blkno); + break; + case MTIOCGET32: + umget32 = argp; + err = __put_user(get.mt_type, &umget32->mt_type); + err |= __put_user(get.mt_resid, &umget32->mt_resid); + err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); + err |= __put_user(get.mt_gstat, &umget32->mt_gstat); + err |= __put_user(get.mt_erreg, &umget32->mt_erreg); + err |= __put_user(get.mt_fileno, &umget32->mt_fileno); + err |= __put_user(get.mt_blkno, &umget32->mt_blkno); + break; + } + return err ? -EFAULT: 0; +} + +#endif /* CONFIG_BLOCK */ + +/* Bluetooth ioctls */ +#define HCIUARTSETPROTO _IOW('U', 200, int) +#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTGETDEVICE _IOR('U', 202, int) +#define HCIUARTSETFLAGS _IOW('U', 203, int) +#define HCIUARTGETFLAGS _IOR('U', 204, int) + +#define BNEPCONNADD _IOW('B', 200, int) +#define BNEPCONNDEL _IOW('B', 201, int) +#define BNEPGETCONNLIST _IOR('B', 210, int) +#define BNEPGETCONNINFO _IOR('B', 211, int) +#define BNEPGETSUPPFEAT _IOR('B', 212, int) + +#define CMTPCONNADD _IOW('C', 200, int) +#define CMTPCONNDEL _IOW('C', 201, int) +#define CMTPGETCONNLIST _IOR('C', 210, int) +#define CMTPGETCONNINFO _IOR('C', 211, int) + +#define HIDPCONNADD _IOW('H', 200, int) +#define HIDPCONNDEL _IOW('H', 201, int) +#define HIDPGETCONNLIST _IOR('H', 210, int) +#define HIDPGETCONNINFO _IOR('H', 211, int) + + +struct serial_struct32 { + compat_int_t type; + compat_int_t line; + compat_uint_t port; + compat_int_t irq; + compat_int_t flags; + compat_int_t xmit_fifo_size; + compat_int_t custom_divisor; + compat_int_t baud_base; + unsigned short close_delay; + char io_type; + char reserved_char[1]; + compat_int_t hub6; + unsigned short closing_wait; /* time to wait before closing */ + unsigned short closing_wait2; /* no longer used... */ + compat_uint_t iomem_base; + unsigned short iomem_reg_shift; + unsigned int port_high; + /* compat_ulong_t iomap_base FIXME */ + compat_int_t reserved[1]; +}; + +static int serial_struct_ioctl(unsigned fd, unsigned cmd, + struct serial_struct32 __user *ss32) +{ + typedef struct serial_struct32 SS32; + int err; + struct serial_struct ss; + mm_segment_t oldseg = get_fs(); + __u32 udata; + unsigned int base; + + if (cmd == TIOCSSERIAL) { + if (!access_ok(VERIFY_READ, ss32, sizeof(SS32))) + return -EFAULT; + if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base))) + return -EFAULT; + if (__get_user(udata, &ss32->iomem_base)) + return -EFAULT; + ss.iomem_base = compat_ptr(udata); + if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || + __get_user(ss.port_high, &ss32->port_high)) + return -EFAULT; + ss.iomap_base = 0UL; + } + set_fs(KERNEL_DS); + err = sys_ioctl(fd,cmd,(unsigned long)(&ss)); + set_fs(oldseg); + if (cmd == TIOCGSERIAL && err >= 0) { + if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) + return -EFAULT; + if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base))) + return -EFAULT; + base = (unsigned long)ss.iomem_base >> 32 ? + 0xffffffff : (unsigned)(unsigned long)ss.iomem_base; + if (__put_user(base, &ss32->iomem_base) || + __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || + __put_user(ss.port_high, &ss32->port_high)) + return -EFAULT; + } + return err; +} + +/* + * I2C layer ioctls + */ + +struct i2c_msg32 { + u16 addr; + u16 flags; + u16 len; + compat_caddr_t buf; +}; + +struct i2c_rdwr_ioctl_data32 { + compat_caddr_t msgs; /* struct i2c_msg __user *msgs */ + u32 nmsgs; +}; + +struct i2c_smbus_ioctl_data32 { + u8 read_write; + u8 command; + u32 size; + compat_caddr_t data; /* union i2c_smbus_data *data */ +}; + +struct i2c_rdwr_aligned { + struct i2c_rdwr_ioctl_data cmd; + struct i2c_msg msgs[0]; +}; + +static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_rdwr_ioctl_data32 __user *udata) +{ + struct i2c_rdwr_aligned __user *tdata; + struct i2c_msg __user *tmsgs; + struct i2c_msg32 __user *umsgs; + compat_caddr_t datap; + u32 nmsgs; + int i; + + if (get_user(nmsgs, &udata->nmsgs)) + return -EFAULT; + if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS) + return -EINVAL; + + if (get_user(datap, &udata->msgs)) + return -EFAULT; + umsgs = compat_ptr(datap); + + tdata = compat_alloc_user_space(sizeof(*tdata) + + nmsgs * sizeof(struct i2c_msg)); + tmsgs = &tdata->msgs[0]; + + if (put_user(nmsgs, &tdata->cmd.nmsgs) || + put_user(tmsgs, &tdata->cmd.msgs)) + return -EFAULT; + + for (i = 0; i < nmsgs; i++) { + if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16))) + return -EFAULT; + if (get_user(datap, &umsgs[i].buf) || + put_user(compat_ptr(datap), &tmsgs[i].buf)) + return -EFAULT; + } + return sys_ioctl(fd, cmd, (unsigned long)tdata); +} + +static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_smbus_ioctl_data32 __user *udata) +{ + struct i2c_smbus_ioctl_data __user *tdata; + compat_caddr_t datap; + + tdata = compat_alloc_user_space(sizeof(*tdata)); + if (tdata == NULL) + return -ENOMEM; + if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) + return -EFAULT; + + if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) + return -EFAULT; + + if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8))) + return -EFAULT; + if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32))) + return -EFAULT; + if (__get_user(datap, &udata->data) || + __put_user(compat_ptr(datap), &tdata->data)) + return -EFAULT; + + return sys_ioctl(fd, cmd, (unsigned long)tdata); +} + +#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) +#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) +#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) +#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) + +static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) +{ + mm_segment_t oldfs = get_fs(); + compat_ulong_t val32; + unsigned long kval; + int ret; + + switch (cmd) { + case RTC_IRQP_READ32: + case RTC_EPOCH_READ32: + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? + RTC_IRQP_READ : RTC_EPOCH_READ, + (unsigned long)&kval); + set_fs(oldfs); + if (ret) + return ret; + val32 = kval; + return put_user(val32, (unsigned int __user *)argp); + case RTC_IRQP_SET32: + return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); + case RTC_EPOCH_SET32: + return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); + } + + return -ENOIOCTLCMD; +} + +/* on ia32 l_start is on a 32-bit boundary */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +struct space_resv_32 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32) +#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) + +/* just account for different alignment */ +static int compat_ioctl_preallocate(struct file *file, + struct space_resv_32 __user *p32) +{ + struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); + + if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || + copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || + copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || + copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || + copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || + copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || + copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) + return -EFAULT; + + return ioctl_preallocate(file, p); +} +#endif + +/* + * simple reversible transform to make our table more evenly + * distributed after sorting. + */ +#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) + +#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), +/* ioctl should not be warned about even if it's not implemented. + Valid reasons to use this: + - It is implemented with ->compat_ioctl on some device, but programs + call it on others too. + - The ioctl is not implemented in the native kernel, but programs + call it commonly anyways. + Most other reasons are not valid. */ +#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) + +static unsigned int ioctl_pointer[] = { +/* compatible ioctls first */ +COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ +COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ + +/* Big T */ +COMPATIBLE_IOCTL(TCGETA) +COMPATIBLE_IOCTL(TCSETA) +COMPATIBLE_IOCTL(TCSETAW) +COMPATIBLE_IOCTL(TCSETAF) +COMPATIBLE_IOCTL(TCSBRK) +COMPATIBLE_IOCTL(TCXONC) +COMPATIBLE_IOCTL(TCFLSH) +COMPATIBLE_IOCTL(TCGETS) +COMPATIBLE_IOCTL(TCSETS) +COMPATIBLE_IOCTL(TCSETSW) +COMPATIBLE_IOCTL(TCSETSF) +COMPATIBLE_IOCTL(TIOCLINUX) +COMPATIBLE_IOCTL(TIOCSBRK) +COMPATIBLE_IOCTL(TIOCGDEV) +COMPATIBLE_IOCTL(TIOCCBRK) +COMPATIBLE_IOCTL(TIOCGSID) +COMPATIBLE_IOCTL(TIOCGICOUNT) +COMPATIBLE_IOCTL(TIOCGPKT) +COMPATIBLE_IOCTL(TIOCGPTLCK) +COMPATIBLE_IOCTL(TIOCGEXCL) +/* Little t */ +COMPATIBLE_IOCTL(TIOCGETD) +COMPATIBLE_IOCTL(TIOCSETD) +COMPATIBLE_IOCTL(TIOCEXCL) +COMPATIBLE_IOCTL(TIOCNXCL) +COMPATIBLE_IOCTL(TIOCCONS) +COMPATIBLE_IOCTL(TIOCGSOFTCAR) +COMPATIBLE_IOCTL(TIOCSSOFTCAR) +COMPATIBLE_IOCTL(TIOCSWINSZ) +COMPATIBLE_IOCTL(TIOCGWINSZ) +COMPATIBLE_IOCTL(TIOCMGET) +COMPATIBLE_IOCTL(TIOCMBIC) +COMPATIBLE_IOCTL(TIOCMBIS) +COMPATIBLE_IOCTL(TIOCMSET) +COMPATIBLE_IOCTL(TIOCPKT) +COMPATIBLE_IOCTL(TIOCNOTTY) +COMPATIBLE_IOCTL(TIOCSTI) +COMPATIBLE_IOCTL(TIOCOUTQ) +COMPATIBLE_IOCTL(TIOCSPGRP) +COMPATIBLE_IOCTL(TIOCGPGRP) +COMPATIBLE_IOCTL(TIOCGPTN) +COMPATIBLE_IOCTL(TIOCSPTLCK) +COMPATIBLE_IOCTL(TIOCSERGETLSR) +COMPATIBLE_IOCTL(TIOCSIG) +#ifdef TIOCSRS485 +COMPATIBLE_IOCTL(TIOCSRS485) +#endif +#ifdef TIOCGRS485 +COMPATIBLE_IOCTL(TIOCGRS485) +#endif +#ifdef TCGETS2 +COMPATIBLE_IOCTL(TCGETS2) +COMPATIBLE_IOCTL(TCSETS2) +COMPATIBLE_IOCTL(TCSETSW2) +COMPATIBLE_IOCTL(TCSETSF2) +#endif +/* Little f */ +COMPATIBLE_IOCTL(FIOCLEX) +COMPATIBLE_IOCTL(FIONCLEX) +COMPATIBLE_IOCTL(FIOASYNC) +COMPATIBLE_IOCTL(FIONBIO) +COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ +COMPATIBLE_IOCTL(FS_IOC_FIEMAP) +/* 0x00 */ +COMPATIBLE_IOCTL(FIBMAP) +COMPATIBLE_IOCTL(FIGETBSZ) +/* 'X' - originally XFS but some now in the VFS */ +COMPATIBLE_IOCTL(FIFREEZE) +COMPATIBLE_IOCTL(FITHAW) +COMPATIBLE_IOCTL(KDGETKEYCODE) +COMPATIBLE_IOCTL(KDSETKEYCODE) +COMPATIBLE_IOCTL(KDGKBTYPE) +COMPATIBLE_IOCTL(KDGETMODE) +COMPATIBLE_IOCTL(KDGKBMODE) +COMPATIBLE_IOCTL(KDGKBMETA) +COMPATIBLE_IOCTL(KDGKBENT) +COMPATIBLE_IOCTL(KDSKBENT) +COMPATIBLE_IOCTL(KDGKBSENT) +COMPATIBLE_IOCTL(KDSKBSENT) +COMPATIBLE_IOCTL(KDGKBDIACR) +COMPATIBLE_IOCTL(KDSKBDIACR) +COMPATIBLE_IOCTL(KDGKBDIACRUC) +COMPATIBLE_IOCTL(KDSKBDIACRUC) +COMPATIBLE_IOCTL(KDKBDREP) +COMPATIBLE_IOCTL(KDGKBLED) +COMPATIBLE_IOCTL(KDGETLED) +#ifdef CONFIG_BLOCK +/* Big S */ +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) +COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) +COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK) +COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY) +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) +COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) +COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) +#endif +/* Big V (don't complain on serial console) */ +IGNORE_IOCTL(VT_OPENQRY) +IGNORE_IOCTL(VT_GETMODE) +/* Little p (/dev/rtc, /dev/envctrl, etc.) */ +COMPATIBLE_IOCTL(RTC_AIE_ON) +COMPATIBLE_IOCTL(RTC_AIE_OFF) +COMPATIBLE_IOCTL(RTC_UIE_ON) +COMPATIBLE_IOCTL(RTC_UIE_OFF) +COMPATIBLE_IOCTL(RTC_PIE_ON) +COMPATIBLE_IOCTL(RTC_PIE_OFF) +COMPATIBLE_IOCTL(RTC_WIE_ON) +COMPATIBLE_IOCTL(RTC_WIE_OFF) +COMPATIBLE_IOCTL(RTC_ALM_SET) +COMPATIBLE_IOCTL(RTC_ALM_READ) +COMPATIBLE_IOCTL(RTC_RD_TIME) +COMPATIBLE_IOCTL(RTC_SET_TIME) +COMPATIBLE_IOCTL(RTC_WKALM_SET) +COMPATIBLE_IOCTL(RTC_WKALM_RD) +/* + * These two are only for the sbus rtc driver, but + * hwclock tries them on every rtc device first when + * running on sparc. On other architectures the entries + * are useless but harmless. + */ +COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */ +COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */ +/* Little m */ +COMPATIBLE_IOCTL(MTIOCTOP) +/* Socket level stuff */ +COMPATIBLE_IOCTL(FIOQSIZE) +#ifdef CONFIG_BLOCK +/* md calls this on random blockdevs */ +IGNORE_IOCTL(RAID_VERSION) +/* qemu/qemu-img might call these two on plain files for probing */ +IGNORE_IOCTL(CDROM_DRIVE_STATUS) +IGNORE_IOCTL(FDGETPRM32) +/* SG stuff */ +COMPATIBLE_IOCTL(SG_SET_TIMEOUT) +COMPATIBLE_IOCTL(SG_GET_TIMEOUT) +COMPATIBLE_IOCTL(SG_EMULATED_HOST) +COMPATIBLE_IOCTL(SG_GET_TRANSFORM) +COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) +COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) +COMPATIBLE_IOCTL(SG_GET_SCSI_ID) +COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA) +COMPATIBLE_IOCTL(SG_GET_LOW_DMA) +COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID) +COMPATIBLE_IOCTL(SG_GET_PACK_ID) +COMPATIBLE_IOCTL(SG_GET_NUM_WAITING) +COMPATIBLE_IOCTL(SG_SET_DEBUG) +COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE) +COMPATIBLE_IOCTL(SG_GET_COMMAND_Q) +COMPATIBLE_IOCTL(SG_SET_COMMAND_Q) +COMPATIBLE_IOCTL(SG_GET_VERSION_NUM) +COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN) +COMPATIBLE_IOCTL(SG_SCSI_RESET) +COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) +COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) +COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) +#endif +/* PPP stuff */ +COMPATIBLE_IOCTL(PPPIOCGFLAGS) +COMPATIBLE_IOCTL(PPPIOCSFLAGS) +COMPATIBLE_IOCTL(PPPIOCGASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCGUNIT) +COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCGMRU) +COMPATIBLE_IOCTL(PPPIOCSMRU) +COMPATIBLE_IOCTL(PPPIOCSMAXCID) +COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCXFERUNIT) +/* PPPIOCSCOMPRESS is translated */ +COMPATIBLE_IOCTL(PPPIOCGNPMODE) +COMPATIBLE_IOCTL(PPPIOCSNPMODE) +COMPATIBLE_IOCTL(PPPIOCGDEBUG) +COMPATIBLE_IOCTL(PPPIOCSDEBUG) +/* PPPIOCSPASS is translated */ +/* PPPIOCSACTIVE is translated */ +/* PPPIOCGIDLE is translated */ +COMPATIBLE_IOCTL(PPPIOCNEWUNIT) +COMPATIBLE_IOCTL(PPPIOCATTACH) +COMPATIBLE_IOCTL(PPPIOCDETACH) +COMPATIBLE_IOCTL(PPPIOCSMRRU) +COMPATIBLE_IOCTL(PPPIOCCONNECT) +COMPATIBLE_IOCTL(PPPIOCDISCONN) +COMPATIBLE_IOCTL(PPPIOCATTCHAN) +COMPATIBLE_IOCTL(PPPIOCGCHAN) +COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) +/* PPPOX */ +COMPATIBLE_IOCTL(PPPOEIOCSFWD) +COMPATIBLE_IOCTL(PPPOEIOCDFWD) +/* ppdev */ +COMPATIBLE_IOCTL(PPSETMODE) +COMPATIBLE_IOCTL(PPRSTATUS) +COMPATIBLE_IOCTL(PPRCONTROL) +COMPATIBLE_IOCTL(PPWCONTROL) +COMPATIBLE_IOCTL(PPFCONTROL) +COMPATIBLE_IOCTL(PPRDATA) +COMPATIBLE_IOCTL(PPWDATA) +COMPATIBLE_IOCTL(PPCLAIM) +COMPATIBLE_IOCTL(PPRELEASE) +COMPATIBLE_IOCTL(PPYIELD) +COMPATIBLE_IOCTL(PPEXCL) +COMPATIBLE_IOCTL(PPDATADIR) +COMPATIBLE_IOCTL(PPNEGOT) +COMPATIBLE_IOCTL(PPWCTLONIRQ) +COMPATIBLE_IOCTL(PPCLRIRQ) +COMPATIBLE_IOCTL(PPSETPHASE) +COMPATIBLE_IOCTL(PPGETMODES) +COMPATIBLE_IOCTL(PPGETMODE) +COMPATIBLE_IOCTL(PPGETPHASE) +COMPATIBLE_IOCTL(PPGETFLAGS) +COMPATIBLE_IOCTL(PPSETFLAGS) +/* Big A */ +/* sparc only */ +/* Big Q for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET) +COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO) +COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT) +COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE) +COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR) +COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI) +COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES) +COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS) +COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS) +COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO) +COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL) +COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE) +COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC) +COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE) +/* Big T for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE) +COMPATIBLE_IOCTL(SNDCTL_TMR_START) +COMPATIBLE_IOCTL(SNDCTL_TMR_STOP) +COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE) +COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO) +COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE) +COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME) +COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT) +/* Little m for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME) +COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE) +COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD) +/* Big P for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_DSP_RESET) +COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC) +COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED) +COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE) +COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS) +COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER) +COMPATIBLE_IOCTL(SNDCTL_DSP_POST) +COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE) +COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR) +/* SNDCTL_DSP_MAPINBUF, XXX needs translation */ +/* SNDCTL_DSP_MAPOUTBUF, XXX needs translation */ +COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY) +COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE) +COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE) +COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS) +COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS) +COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER) +/* Big C for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_COPR_RESET) +COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD) +COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA) +COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE) +COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA) +COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE) +COMPATIBLE_IOCTL(SNDCTL_COPR_RUN) +COMPATIBLE_IOCTL(SNDCTL_COPR_HALT) +COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG) +COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG) +/* Big M for sound/OSS */ +COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR)) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE) +/* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */ +/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */ +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR)) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE) +/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */ +/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */ +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC) +COMPATIBLE_IOCTL(SOUND_MIXER_INFO) +COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO) +COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS) +COMPATIBLE_IOCTL(SOUND_MIXER_AGC) +COMPATIBLE_IOCTL(SOUND_MIXER_3DSE) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) +COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) +COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) +COMPATIBLE_IOCTL(OSS_GETVERSION) +/* Raw devices */ +COMPATIBLE_IOCTL(RAW_SETBIND) +COMPATIBLE_IOCTL(RAW_GETBIND) +/* Watchdog */ +COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) +COMPATIBLE_IOCTL(WDIOC_GETSTATUS) +COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) +COMPATIBLE_IOCTL(WDIOC_GETTEMP) +COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) +COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) +COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) +/* Big R */ +COMPATIBLE_IOCTL(RNDGETENTCNT) +COMPATIBLE_IOCTL(RNDADDTOENTCNT) +COMPATIBLE_IOCTL(RNDGETPOOL) +COMPATIBLE_IOCTL(RNDADDENTROPY) +COMPATIBLE_IOCTL(RNDZAPENTCNT) +COMPATIBLE_IOCTL(RNDCLEARPOOL) +/* Bluetooth */ +COMPATIBLE_IOCTL(HCIDEVUP) +COMPATIBLE_IOCTL(HCIDEVDOWN) +COMPATIBLE_IOCTL(HCIDEVRESET) +COMPATIBLE_IOCTL(HCIDEVRESTAT) +COMPATIBLE_IOCTL(HCIGETDEVLIST) +COMPATIBLE_IOCTL(HCIGETDEVINFO) +COMPATIBLE_IOCTL(HCIGETCONNLIST) +COMPATIBLE_IOCTL(HCIGETCONNINFO) +COMPATIBLE_IOCTL(HCIGETAUTHINFO) +COMPATIBLE_IOCTL(HCISETRAW) +COMPATIBLE_IOCTL(HCISETSCAN) +COMPATIBLE_IOCTL(HCISETAUTH) +COMPATIBLE_IOCTL(HCISETENCRYPT) +COMPATIBLE_IOCTL(HCISETPTYPE) +COMPATIBLE_IOCTL(HCISETLINKPOL) +COMPATIBLE_IOCTL(HCISETLINKMODE) +COMPATIBLE_IOCTL(HCISETACLMTU) +COMPATIBLE_IOCTL(HCISETSCOMTU) +COMPATIBLE_IOCTL(HCIBLOCKADDR) +COMPATIBLE_IOCTL(HCIUNBLOCKADDR) +COMPATIBLE_IOCTL(HCIINQUIRY) +COMPATIBLE_IOCTL(HCIUARTSETPROTO) +COMPATIBLE_IOCTL(HCIUARTGETPROTO) +COMPATIBLE_IOCTL(RFCOMMCREATEDEV) +COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) +COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) +COMPATIBLE_IOCTL(RFCOMMGETDEVINFO) +COMPATIBLE_IOCTL(RFCOMMSTEALDLC) +COMPATIBLE_IOCTL(BNEPCONNADD) +COMPATIBLE_IOCTL(BNEPCONNDEL) +COMPATIBLE_IOCTL(BNEPGETCONNLIST) +COMPATIBLE_IOCTL(BNEPGETCONNINFO) +COMPATIBLE_IOCTL(BNEPGETSUPPFEAT) +COMPATIBLE_IOCTL(CMTPCONNADD) +COMPATIBLE_IOCTL(CMTPCONNDEL) +COMPATIBLE_IOCTL(CMTPGETCONNLIST) +COMPATIBLE_IOCTL(CMTPGETCONNINFO) +COMPATIBLE_IOCTL(HIDPCONNADD) +COMPATIBLE_IOCTL(HIDPCONNDEL) +COMPATIBLE_IOCTL(HIDPGETCONNLIST) +COMPATIBLE_IOCTL(HIDPGETCONNINFO) +/* CAPI */ +COMPATIBLE_IOCTL(CAPI_REGISTER) +COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER) +COMPATIBLE_IOCTL(CAPI_GET_VERSION) +COMPATIBLE_IOCTL(CAPI_GET_SERIAL) +COMPATIBLE_IOCTL(CAPI_GET_PROFILE) +COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD) +COMPATIBLE_IOCTL(CAPI_GET_ERRCODE) +COMPATIBLE_IOCTL(CAPI_INSTALLED) +COMPATIBLE_IOCTL(CAPI_GET_FLAGS) +COMPATIBLE_IOCTL(CAPI_SET_FLAGS) +COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) +COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) +COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) +/* Siemens Gigaset */ +COMPATIBLE_IOCTL(GIGASET_REDIR) +COMPATIBLE_IOCTL(GIGASET_CONFIG) +COMPATIBLE_IOCTL(GIGASET_BRKCHARS) +COMPATIBLE_IOCTL(GIGASET_VERSION) +/* Misc. */ +COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ +COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ +COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) +COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) +COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) +COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) +/* NBD */ +COMPATIBLE_IOCTL(NBD_DO_IT) +COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) +COMPATIBLE_IOCTL(NBD_CLEAR_QUE) +COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) +COMPATIBLE_IOCTL(NBD_DISCONNECT) +/* i2c */ +COMPATIBLE_IOCTL(I2C_SLAVE) +COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) +COMPATIBLE_IOCTL(I2C_TENBIT) +COMPATIBLE_IOCTL(I2C_PEC) +COMPATIBLE_IOCTL(I2C_RETRIES) +COMPATIBLE_IOCTL(I2C_TIMEOUT) +/* hiddev */ +COMPATIBLE_IOCTL(HIDIOCGVERSION) +COMPATIBLE_IOCTL(HIDIOCAPPLICATION) +COMPATIBLE_IOCTL(HIDIOCGDEVINFO) +COMPATIBLE_IOCTL(HIDIOCGSTRING) +COMPATIBLE_IOCTL(HIDIOCINITREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORT) +COMPATIBLE_IOCTL(HIDIOCSREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) +COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) +COMPATIBLE_IOCTL(HIDIOCGUSAGE) +COMPATIBLE_IOCTL(HIDIOCSUSAGE) +COMPATIBLE_IOCTL(HIDIOCGUCODE) +COMPATIBLE_IOCTL(HIDIOCGFLAG) +COMPATIBLE_IOCTL(HIDIOCSFLAG) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) +/* dvb */ +COMPATIBLE_IOCTL(AUDIO_STOP) +COMPATIBLE_IOCTL(AUDIO_PLAY) +COMPATIBLE_IOCTL(AUDIO_PAUSE) +COMPATIBLE_IOCTL(AUDIO_CONTINUE) +COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE) +COMPATIBLE_IOCTL(AUDIO_SET_MUTE) +COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC) +COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE) +COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT) +COMPATIBLE_IOCTL(AUDIO_GET_STATUS) +COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES) +COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER) +COMPATIBLE_IOCTL(AUDIO_SET_ID) +COMPATIBLE_IOCTL(AUDIO_SET_MIXER) +COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE) +COMPATIBLE_IOCTL(AUDIO_SET_EXT_ID) +COMPATIBLE_IOCTL(AUDIO_SET_ATTRIBUTES) +COMPATIBLE_IOCTL(AUDIO_SET_KARAOKE) +COMPATIBLE_IOCTL(DMX_START) +COMPATIBLE_IOCTL(DMX_STOP) +COMPATIBLE_IOCTL(DMX_SET_FILTER) +COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) +COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) +COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) +COMPATIBLE_IOCTL(DMX_GET_CAPS) +COMPATIBLE_IOCTL(DMX_SET_SOURCE) +COMPATIBLE_IOCTL(DMX_GET_STC) +COMPATIBLE_IOCTL(FE_GET_INFO) +COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD) +COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD) +COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY) +COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST) +COMPATIBLE_IOCTL(FE_SET_TONE) +COMPATIBLE_IOCTL(FE_SET_VOLTAGE) +COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE) +COMPATIBLE_IOCTL(FE_READ_STATUS) +COMPATIBLE_IOCTL(FE_READ_BER) +COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH) +COMPATIBLE_IOCTL(FE_READ_SNR) +COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS) +COMPATIBLE_IOCTL(FE_SET_FRONTEND) +COMPATIBLE_IOCTL(FE_GET_FRONTEND) +COMPATIBLE_IOCTL(FE_GET_EVENT) +COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD) +COMPATIBLE_IOCTL(VIDEO_STOP) +COMPATIBLE_IOCTL(VIDEO_PLAY) +COMPATIBLE_IOCTL(VIDEO_FREEZE) +COMPATIBLE_IOCTL(VIDEO_CONTINUE) +COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE) +COMPATIBLE_IOCTL(VIDEO_SET_BLANK) +COMPATIBLE_IOCTL(VIDEO_GET_STATUS) +COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT) +COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD) +COMPATIBLE_IOCTL(VIDEO_SLOWMOTION) +COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES) +COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER) +COMPATIBLE_IOCTL(VIDEO_SET_ID) +COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE) +COMPATIBLE_IOCTL(VIDEO_SET_FORMAT) +COMPATIBLE_IOCTL(VIDEO_SET_SYSTEM) +COMPATIBLE_IOCTL(VIDEO_SET_HIGHLIGHT) +COMPATIBLE_IOCTL(VIDEO_SET_SPU) +COMPATIBLE_IOCTL(VIDEO_GET_NAVI) +COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES) +COMPATIBLE_IOCTL(VIDEO_GET_SIZE) +COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE) + +/* joystick */ +COMPATIBLE_IOCTL(JSIOCGVERSION) +COMPATIBLE_IOCTL(JSIOCGAXES) +COMPATIBLE_IOCTL(JSIOCGBUTTONS) +COMPATIBLE_IOCTL(JSIOCGNAME(0)) + +#ifdef TIOCGLTC +COMPATIBLE_IOCTL(TIOCGLTC) +COMPATIBLE_IOCTL(TIOCSLTC) +#endif +#ifdef TIOCSTART +/* + * For these two we have definitions in ioctls.h and/or termios.h on + * some architectures but no actual implemention. Some applications + * like bash call them if they are defined in the headers, so we provide + * entries here to avoid syslog message spew. + */ +COMPATIBLE_IOCTL(TIOCSTART) +COMPATIBLE_IOCTL(TIOCSTOP) +#endif + +/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, + but we don't want warnings on other file systems. So declare + them as compatible here. */ +#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) +#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) + +IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) +IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) + +#ifdef CONFIG_SPARC +/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ +IGNORE_IOCTL(FBIOGTYPE) +IGNORE_IOCTL(FBIOSATTR) +IGNORE_IOCTL(FBIOGATTR) +IGNORE_IOCTL(FBIOSVIDEO) +IGNORE_IOCTL(FBIOGVIDEO) +IGNORE_IOCTL(FBIOSCURPOS) +IGNORE_IOCTL(FBIOGCURPOS) +IGNORE_IOCTL(FBIOGCURMAX) +IGNORE_IOCTL(FBIOPUTCMAP32) +IGNORE_IOCTL(FBIOGETCMAP32) +IGNORE_IOCTL(FBIOSCURSOR32) +IGNORE_IOCTL(FBIOGCURSOR32) +#endif +}; + +/* + * Convert common ioctl arguments based on their command number + * + * Please do not add any code in here. Instead, implement + * a compat_ioctl operation in the place that handleѕ the + * ioctl for the native case. + */ +static long do_ioctl_trans(int fd, unsigned int cmd, + unsigned long arg, struct file *file) +{ + void __user *argp = compat_ptr(arg); + + switch (cmd) { + case PPPIOCGIDLE32: + return ppp_gidle(fd, cmd, argp); + case PPPIOCSCOMPRESS32: + return ppp_scompress(fd, cmd, argp); + case PPPIOCSPASS32: + case PPPIOCSACTIVE32: + return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); +#ifdef CONFIG_BLOCK + case SG_IO: + return sg_ioctl_trans(fd, cmd, argp); + case SG_GET_REQUEST_TABLE: + return sg_grt_trans(fd, cmd, argp); + case MTIOCGET32: + case MTIOCPOS32: + return mt_ioctl_trans(fd, cmd, argp); +#endif + /* Serial */ + case TIOCGSERIAL: + case TIOCSSERIAL: + return serial_struct_ioctl(fd, cmd, argp); + /* i2c */ + case I2C_FUNCS: + return w_long(fd, cmd, argp); + case I2C_RDWR: + return do_i2c_rdwr_ioctl(fd, cmd, argp); + case I2C_SMBUS: + return do_i2c_smbus_ioctl(fd, cmd, argp); + /* Not implemented in the native kernel */ + case RTC_IRQP_READ32: + case RTC_IRQP_SET32: + case RTC_EPOCH_READ32: + case RTC_EPOCH_SET32: + return rtc_ioctl(fd, cmd, argp); + + /* dvb */ + case VIDEO_GET_EVENT: + return do_video_get_event(fd, cmd, argp); + case VIDEO_STILLPICTURE: + return do_video_stillpicture(fd, cmd, argp); + case VIDEO_SET_SPU_PALETTE: + return do_video_set_spu_palette(fd, cmd, argp); + } + + /* + * These take an integer instead of a pointer as 'arg', + * so we must not do a compat_ptr() translation. + */ + switch (cmd) { + /* Big T */ + case TCSBRKP: + case TIOCMIWAIT: + case TIOCSCTTY: + /* RAID */ + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* Big K */ + case KDSIGACCEPT: + case KIOCSOUND: + case KDMKTONE: + case KDSETMODE: + case KDSKBMODE: + case KDSKBMETA: + case KDSKBLED: + case KDSETLED: + /* NBD */ + case NBD_SET_SOCK: + case NBD_SET_BLKSIZE: + case NBD_SET_SIZE: + case NBD_SET_SIZE_BLOCKS: + return do_vfs_ioctl(file, fd, cmd, arg); + } + + return -ENOIOCTLCMD; +} + +static int compat_ioctl_check_table(unsigned int xcmd) +{ + int i; + const int max = ARRAY_SIZE(ioctl_pointer) - 1; + + BUILD_BUG_ON(max >= (1 << 16)); + + /* guess initial offset into table, assuming a + normalized distribution */ + i = ((xcmd >> 16) * max) >> 16; + + /* do linear search up first, until greater or equal */ + while (ioctl_pointer[i] < xcmd && i < max) + i++; + + /* then do linear search down */ + while (ioctl_pointer[i] > xcmd && i > 0) + i--; + + return ioctl_pointer[i] == xcmd; +} + +COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg32) +{ + unsigned long arg = arg32; + struct fd f = fdget(fd); + int error = -EBADF; + if (!f.file) + goto out; + + /* RED-PEN how should LSM module know it's handling 32bit? */ + error = security_file_ioctl(f.file, cmd, arg); + if (error) + goto out_fput; + + /* + * To allow the compat_ioctl handlers to be self contained + * we need to check the common ioctls here first. + * Just handle them with the standard handlers below. + */ + switch (cmd) { + case FIOCLEX: + case FIONCLEX: + case FIONBIO: + case FIOASYNC: + case FIOQSIZE: + break; + +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) + case FS_IOC_RESVSP_32: + case FS_IOC_RESVSP64_32: + error = compat_ioctl_preallocate(f.file, compat_ptr(arg)); + goto out_fput; +#else + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + error = ioctl_preallocate(f.file, compat_ptr(arg)); + goto out_fput; +#endif + + case FIBMAP: + case FIGETBSZ: + case FIONREAD: + if (S_ISREG(file_inode(f.file)->i_mode)) + break; + /*FALL THROUGH*/ + + default: + if (f.file->f_op->compat_ioctl) { + error = f.file->f_op->compat_ioctl(f.file, cmd, arg); + if (error != -ENOIOCTLCMD) + goto out_fput; + } + + if (!f.file->f_op->unlocked_ioctl) + goto do_ioctl; + break; + } + + if (compat_ioctl_check_table(XFORM(cmd))) + goto found_handler; + + error = do_ioctl_trans(fd, cmd, arg, f.file); + if (error == -ENOIOCTLCMD) + error = -ENOTTY; + + goto out_fput; + + found_handler: + arg = (unsigned long)compat_ptr(arg); + do_ioctl: + error = do_vfs_ioctl(f.file, fd, cmd, arg); + out_fput: + fdput(f); + out: + return error; +} + +static int __init init_sys32_ioctl_cmp(const void *p, const void *q) +{ + unsigned int a, b; + a = *(unsigned int *)p; + b = *(unsigned int *)q; + if (a > b) + return 1; + if (a < b) + return -1; + return 0; +} + +static int __init init_sys32_ioctl(void) +{ + sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), + init_sys32_ioctl_cmp, NULL); + return 0; +} +__initcall(init_sys32_ioctl); diff --git a/kernel/fs/configfs/Kconfig b/kernel/fs/configfs/Kconfig new file mode 100644 index 000000000..9febcdefd --- /dev/null +++ b/kernel/fs/configfs/Kconfig @@ -0,0 +1,11 @@ +config CONFIGFS_FS + tristate "Userspace-driven configuration filesystem" + select SYSFS + help + configfs is a RAM-based filesystem that provides the converse + of sysfs's functionality. Where sysfs is a filesystem-based + view of kernel objects, configfs is a filesystem-based manager + of kernel objects, or config_items. + + Both sysfs and configfs can and should exist together on the + same system. One is not a replacement for the other. diff --git a/kernel/fs/configfs/Makefile b/kernel/fs/configfs/Makefile new file mode 100644 index 000000000..00ffb278e --- /dev/null +++ b/kernel/fs/configfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the configfs virtual filesystem +# + +obj-$(CONFIG_CONFIGFS_FS) += configfs.o + +configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o diff --git a/kernel/fs/configfs/configfs_internal.h b/kernel/fs/configfs/configfs_internal.h new file mode 100644 index 000000000..b65d1ef53 --- /dev/null +++ b/kernel/fs/configfs/configfs_internal.h @@ -0,0 +1,163 @@ +/* -*- mode: c; c-basic-offset:8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * configfs_internal.h - Internal stuff for configfs + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +struct configfs_dirent { + atomic_t s_count; + int s_dependent_count; + struct list_head s_sibling; + struct list_head s_children; + struct list_head s_links; + void * s_element; + int s_type; + umode_t s_mode; + struct dentry * s_dentry; + struct iattr * s_iattr; +#ifdef CONFIG_LOCKDEP + int s_depth; +#endif +}; + +#define CONFIGFS_ROOT 0x0001 +#define CONFIGFS_DIR 0x0002 +#define CONFIGFS_ITEM_ATTR 0x0004 +#define CONFIGFS_ITEM_LINK 0x0020 +#define CONFIGFS_USET_DIR 0x0040 +#define CONFIGFS_USET_DEFAULT 0x0080 +#define CONFIGFS_USET_DROPPING 0x0100 +#define CONFIGFS_USET_IN_MKDIR 0x0200 +#define CONFIGFS_USET_CREATING 0x0400 +#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) + +extern struct mutex configfs_symlink_mutex; +extern spinlock_t configfs_dirent_lock; + +extern struct kmem_cache *configfs_dir_cachep; + +extern int configfs_is_root(struct config_item *item); + +extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); +extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *)); + +extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); +extern int configfs_make_dirent(struct configfs_dirent *, + struct dentry *, void *, umode_t, int); +extern int configfs_dirent_is_ready(struct configfs_dirent *); + +extern void configfs_hash_and_remove(struct dentry * dir, const char * name); + +extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); +extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); +extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); + +extern struct dentry *configfs_pin_fs(void); +extern void configfs_release_fs(void); + +extern struct rw_semaphore configfs_rename_sem; +extern const struct file_operations configfs_dir_operations; +extern const struct file_operations configfs_file_operations; +extern const struct file_operations bin_fops; +extern const struct inode_operations configfs_dir_inode_operations; +extern const struct inode_operations configfs_root_inode_operations; +extern const struct inode_operations configfs_symlink_inode_operations; +extern const struct dentry_operations configfs_dentry_ops; + +extern int configfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname); +extern int configfs_unlink(struct inode *dir, struct dentry *dentry); + +struct configfs_symlink { + struct list_head sl_list; + struct config_item *sl_target; +}; + +extern int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry); + +static inline struct config_item * to_item(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return ((struct config_item *) sd->s_element); +} + +static inline struct configfs_attribute * to_attr(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return ((struct configfs_attribute *) sd->s_element); +} + +static inline struct config_item *configfs_get_config_item(struct dentry *dentry) +{ + struct config_item * item = NULL; + + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) { + struct configfs_dirent * sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_ITEM_LINK) { + struct configfs_symlink * sl = sd->s_element; + item = config_item_get(sl->sl_target); + } else + item = config_item_get(sd->s_element); + } + spin_unlock(&dentry->d_lock); + + return item; +} + +static inline void release_configfs_dirent(struct configfs_dirent * sd) +{ + if (!(sd->s_type & CONFIGFS_ROOT)) { + kfree(sd->s_iattr); + kmem_cache_free(configfs_dir_cachep, sd); + } +} + +static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } + return sd; +} + +static inline void configfs_put(struct configfs_dirent * sd) +{ + WARN_ON(!atomic_read(&sd->s_count)); + if (atomic_dec_and_test(&sd->s_count)) + release_configfs_dirent(sd); +} + diff --git a/kernel/fs/configfs/dir.c b/kernel/fs/configfs/dir.c new file mode 100644 index 000000000..c81ce7f20 --- /dev/null +++ b/kernel/fs/configfs/dir.c @@ -0,0 +1,1724 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c - Operations for configfs directories. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +DECLARE_RWSEM(configfs_rename_sem); +/* + * Protects mutations of configfs_dirent linkage together with proper i_mutex + * Also protects mutations of symlinks linkage to target configfs_dirent + * Mutators of configfs_dirent linkage must *both* have the proper inode locked + * and configfs_dirent_lock locked, in that order. + * This allows one to safely traverse configfs_dirent trees and symlinks without + * having to lock inodes. + * + * Protects setting of CONFIGFS_USET_DROPPING: checking the flag + * unlocked is not reliable unless in detach_groups() called from + * rmdir()/unregister() and from configfs_attach_group() + */ +DEFINE_SPINLOCK(configfs_dirent_lock); + +static void configfs_d_iput(struct dentry * dentry, + struct inode * inode) +{ + struct configfs_dirent *sd = dentry->d_fsdata; + + if (sd) { + /* Coordinate with configfs_readdir */ + spin_lock(&configfs_dirent_lock); + /* Coordinate with configfs_attach_attr where will increase + * sd->s_count and update sd->s_dentry to new allocated one. + * Only set sd->dentry to null when this dentry is the only + * sd owner. + * If not do so, configfs_d_iput may run just after + * configfs_attach_attr and set sd->s_dentry to null + * even it's still in use. + */ + if (atomic_read(&sd->s_count) <= 2) + sd->s_dentry = NULL; + + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + iput(inode); +} + +const struct dentry_operations configfs_dentry_ops = { + .d_iput = configfs_d_iput, + .d_delete = always_delete_dentry, +}; + +#ifdef CONFIG_LOCKDEP + +/* + * Helpers to make lockdep happy with our recursive locking of default groups' + * inodes (see configfs_attach_group() and configfs_detach_group()). + * We put default groups i_mutexes in separate classes according to their depth + * from the youngest non-default group ancestor. + * + * For a non-default group A having default groups A/B, A/C, and A/C/D, default + * groups A/B and A/C will have their inode's mutex in class + * default_group_class[0], and default group A/C/D will be in + * default_group_class[1]. + * + * The lock classes are declared and assigned in inode.c, according to the + * s_depth value. + * The s_depth value is initialized to -1, adjusted to >= 0 when attaching + * default groups, and reset to -1 when all default groups are attached. During + * attachment, if configfs_create() sees s_depth > 0, the lock class of the new + * inode's mutex is set to default_group_class[s_depth - 1]. + */ + +static void configfs_init_dirent_depth(struct configfs_dirent *sd) +{ + sd->s_depth = -1; +} + +static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, + struct configfs_dirent *sd) +{ + int parent_depth = parent_sd->s_depth; + + if (parent_depth >= 0) + sd->s_depth = parent_depth + 1; +} + +static void +configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) +{ + /* + * item's i_mutex class is already setup, so s_depth is now only + * used to set new sub-directories s_depth, which is always done + * with item's i_mutex locked. + */ + /* + * sd->s_depth == -1 iff we are a non default group. + * else (we are a default group) sd->s_depth > 0 (see + * create_dir()). + */ + if (sd->s_depth == -1) + /* + * We are a non default group and we are going to create + * default groups. + */ + sd->s_depth = 0; +} + +static void +configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) +{ + /* We will not create default groups anymore. */ + sd->s_depth = -1; +} + +#else /* CONFIG_LOCKDEP */ + +static void configfs_init_dirent_depth(struct configfs_dirent *sd) +{ +} + +static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, + struct configfs_dirent *sd) +{ +} + +static void +configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) +{ +} + +static void +configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) +{ +} + +#endif /* CONFIG_LOCKDEP */ + +/* + * Allocates a new configfs_dirent and links it to the parent configfs_dirent + */ +static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, + void *element, int type) +{ + struct configfs_dirent * sd; + + sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); + if (!sd) + return ERR_PTR(-ENOMEM); + + atomic_set(&sd->s_count, 1); + INIT_LIST_HEAD(&sd->s_links); + INIT_LIST_HEAD(&sd->s_children); + sd->s_element = element; + sd->s_type = type; + configfs_init_dirent_depth(sd); + spin_lock(&configfs_dirent_lock); + if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + kmem_cache_free(configfs_dir_cachep, sd); + return ERR_PTR(-ENOENT); + } + list_add(&sd->s_sibling, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + + return sd; +} + +/* + * + * Return -EEXIST if there is already a configfs element with the same + * name for the same parent. + * + * called with parent inode's i_mutex held + */ +static int configfs_dirent_exists(struct configfs_dirent *parent_sd, + const unsigned char *new) +{ + struct configfs_dirent * sd; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_element) { + const unsigned char *existing = configfs_get_name(sd); + if (strcmp(existing, new)) + continue; + else + return -EEXIST; + } + } + + return 0; +} + + +int configfs_make_dirent(struct configfs_dirent * parent_sd, + struct dentry * dentry, void * element, + umode_t mode, int type) +{ + struct configfs_dirent * sd; + + sd = configfs_new_dirent(parent_sd, element, type); + if (IS_ERR(sd)) + return PTR_ERR(sd); + + sd->s_mode = mode; + sd->s_dentry = dentry; + if (dentry) + dentry->d_fsdata = configfs_get(sd); + + return 0; +} + +static void init_dir(struct inode * inode) +{ + inode->i_op = &configfs_dir_inode_operations; + inode->i_fop = &configfs_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); +} + +static void configfs_init_file(struct inode * inode) +{ + inode->i_size = PAGE_SIZE; + inode->i_fop = &configfs_file_operations; +} + +static void init_symlink(struct inode * inode) +{ + inode->i_op = &configfs_symlink_inode_operations; +} + +/** + * configfs_create_dir - create a directory for an config_item. + * @item: config_itemwe're creating directory for. + * @dentry: config_item's dentry. + * + * Note: user-created entries won't be allowed under this new directory + * until it is validated by configfs_dir_set_ready() + */ + +static int configfs_create_dir(struct config_item *item, struct dentry *dentry) +{ + int error; + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + struct dentry *p = dentry->d_parent; + + BUG_ON(!item); + + error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name); + if (unlikely(error)) + return error; + + error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, + CONFIGFS_DIR | CONFIGFS_USET_CREATING); + if (unlikely(error)) + return error; + + configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata); + error = configfs_create(dentry, mode, init_dir); + if (!error) { + inc_nlink(d_inode(p)); + item->ci_dentry = dentry; + } else { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + } + return error; +} + +/* + * Allow userspace to create new entries under a new directory created with + * configfs_create_dir(), and under all of its chidlren directories recursively. + * @sd configfs_dirent of the new directory to validate + * + * Caller must hold configfs_dirent_lock. + */ +static void configfs_dir_set_ready(struct configfs_dirent *sd) +{ + struct configfs_dirent *child_sd; + + sd->s_type &= ~CONFIGFS_USET_CREATING; + list_for_each_entry(child_sd, &sd->s_children, s_sibling) + if (child_sd->s_type & CONFIGFS_USET_CREATING) + configfs_dir_set_ready(child_sd); +} + +/* + * Check that a directory does not belong to a directory hierarchy being + * attached and not validated yet. + * @sd configfs_dirent of the directory to check + * + * @return non-zero iff the directory was validated + * + * Note: takes configfs_dirent_lock, so the result may change from false to true + * in two consecutive calls, but never from true to false. + */ +int configfs_dirent_is_ready(struct configfs_dirent *sd) +{ + int ret; + + spin_lock(&configfs_dirent_lock); + ret = !(sd->s_type & CONFIGFS_USET_CREATING); + spin_unlock(&configfs_dirent_lock); + + return ret; +} + +int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry) +{ + int err = 0; + umode_t mode = S_IFLNK | S_IRWXUGO; + + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, + CONFIGFS_ITEM_LINK); + if (!err) { + err = configfs_create(dentry, mode, init_symlink); + if (err) { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + } + } + return err; +} + +static void remove_dir(struct dentry * d) +{ + struct dentry * parent = dget(d->d_parent); + struct configfs_dirent * sd; + + sd = d->d_fsdata; + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + if (d_really_is_positive(d)) + simple_rmdir(d_inode(parent),d); + + pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); + + dput(parent); +} + +/** + * configfs_remove_dir - remove an config_item's directory. + * @item: config_item we're removing. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be configfs_rmdir() below, instead of calling separately. + * + * Caller holds the mutex of the item's inode + */ + +static void configfs_remove_dir(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + + if (!dentry) + return; + + remove_dir(dentry); + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + + +/* attaches attribute's configfs_dirent to the dentry corresponding to the + * attribute file + */ +static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) +{ + struct configfs_attribute * attr = sd->s_element; + int error; + + spin_lock(&configfs_dirent_lock); + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; + spin_unlock(&configfs_dirent_lock); + + error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, + configfs_init_file); + if (error) { + configfs_put(sd); + return error; + } + + d_rehash(dentry); + + return 0; +} + +static struct dentry * configfs_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; + struct configfs_dirent * sd; + int found = 0; + int err; + + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + * + * This forbids userspace to read/write attributes of items which may + * not complete their initialization, since the dentries of the + * attributes won't be instantiated. + */ + err = -ENOENT; + if (!configfs_dirent_is_ready(parent_sd)) + goto out; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_NOT_PINNED) { + const unsigned char * name = configfs_get_name(sd); + + if (strcmp(name, dentry->d_name.name)) + continue; + + found = 1; + err = configfs_attach_attr(sd, dentry); + break; + } + } + + if (!found) { + /* + * If it doesn't exist and it isn't a NOT_PINNED item, + * it must be negative. + */ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; + } + +out: + return ERR_PTR(err); +} + +/* + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes and are removed by rmdir(). We recurse, setting + * CONFIGFS_USET_DROPPING on all children that are candidates for + * default detach. + * If there is an error, the caller will reset the flags via + * configfs_detach_rollback(). + */ +static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + int ret; + + /* Mark that we're trying to drop the group */ + parent_sd->s_type |= CONFIGFS_USET_DROPPING; + + ret = -EBUSY; + if (!list_empty(&parent_sd->s_links)) + goto out; + + ret = 0; + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || + (sd->s_type & CONFIGFS_NOT_PINNED)) + continue; + if (sd->s_type & CONFIGFS_USET_DEFAULT) { + /* Abort if racing with mkdir() */ + if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { + if (wait_mutex) + *wait_mutex = &d_inode(sd->s_dentry)->i_mutex; + return -EAGAIN; + } + + /* + * Yup, recursive. If there's a problem, blame + * deep nesting of default_groups + */ + ret = configfs_detach_prep(sd->s_dentry, wait_mutex); + if (!ret) + continue; + } else + ret = -ENOTEMPTY; + + break; + } + +out: + return ret; +} + +/* + * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was + * set. + */ +static void configfs_detach_rollback(struct dentry *dentry) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + + parent_sd->s_type &= ~CONFIGFS_USET_DROPPING; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) + if (sd->s_type & CONFIGFS_USET_DEFAULT) + configfs_detach_rollback(sd->s_dentry); +} + +static void detach_attrs(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + struct configfs_dirent * parent_sd; + struct configfs_dirent * sd, * tmp; + + if (!dentry) + return; + + pr_debug("configfs %s: dropping attrs for dir\n", + dentry->d_name.name); + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) + continue; + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dentry); + configfs_put(sd); + } + + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + +static int populate_attrs(struct config_item *item) +{ + struct config_item_type *t = item->ci_type; + struct configfs_attribute *attr; + int error = 0; + int i; + + if (!t) + return -EINVAL; + if (t->ct_attrs) { + for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { + if ((error = configfs_create_file(item, attr))) + break; + } + } + + if (error) + detach_attrs(item); + + return error; +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry); +static void configfs_detach_group(struct config_item *item); + +static void detach_groups(struct config_group *group) +{ + struct dentry * dentry = dget(group->cg_item.ci_dentry); + struct dentry *child; + struct configfs_dirent *parent_sd; + struct configfs_dirent *sd, *tmp; + + if (!dentry) + return; + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || + !(sd->s_type & CONFIGFS_USET_DEFAULT)) + continue; + + child = sd->s_dentry; + + mutex_lock(&d_inode(child)->i_mutex); + + configfs_detach_group(sd->s_element); + d_inode(child)->i_flags |= S_DEAD; + dont_mount(child); + + mutex_unlock(&d_inode(child)->i_mutex); + + d_delete(child); + dput(child); + } + + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + +/* + * This fakes mkdir(2) on a default_groups[] entry. It + * creates a dentry, attachs it, and then does fixup + * on the sd->s_type. + * + * We could, perhaps, tweak our parent's ->mkdir for a minute and + * try using vfs_mkdir. Just a thought. + */ +static int create_default_group(struct config_group *parent_group, + struct config_group *group) +{ + int ret; + struct configfs_dirent *sd; + /* We trust the caller holds a reference to parent */ + struct dentry *child, *parent = parent_group->cg_item.ci_dentry; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + + ret = -ENOMEM; + child = d_alloc_name(parent, group->cg_item.ci_name); + if (child) { + d_add(child, NULL); + + ret = configfs_attach_group(&parent_group->cg_item, + &group->cg_item, child); + if (!ret) { + sd = child->d_fsdata; + sd->s_type |= CONFIGFS_USET_DEFAULT; + } else { + BUG_ON(d_inode(child)); + d_drop(child); + dput(child); + } + } + + return ret; +} + +static int populate_groups(struct config_group *group) +{ + struct config_group *new_group; + int ret = 0; + int i; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + + ret = create_default_group(group, new_group); + if (ret) { + detach_groups(group); + break; + } + } + } + + return ret; +} + +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_mutex is held. + */ + +static void unlink_obj(struct config_item *item) +{ + struct config_group *group; + + group = item->ci_group; + if (group) { + list_del_init(&item->ci_entry); + + item->ci_group = NULL; + item->ci_parent = NULL; + + /* Drop the reference for ci_entry */ + config_item_put(item); + + /* Drop the reference for ci_parent */ + config_group_put(group); + } +} + +static void link_obj(struct config_item *parent_item, struct config_item *item) +{ + /* + * Parent seems redundant with group, but it makes certain + * traversals much nicer. + */ + item->ci_parent = parent_item; + + /* + * We hold a reference on the parent for the child's ci_parent + * link. + */ + item->ci_group = config_group_get(to_config_group(parent_item)); + list_add_tail(&item->ci_entry, &item->ci_group->cg_children); + + /* + * We hold a reference on the child for ci_entry on the parent's + * cg_children + */ + config_item_get(item); +} + +static void unlink_group(struct config_group *group) +{ + int i; + struct config_group *new_group; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + unlink_group(new_group); + } + } + + group->cg_subsys = NULL; + unlink_obj(&group->cg_item); +} + +static void link_group(struct config_group *parent_group, struct config_group *group) +{ + int i; + struct config_group *new_group; + struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ + + link_obj(&parent_group->cg_item, &group->cg_item); + + if (parent_group->cg_subsys) + subsys = parent_group->cg_subsys; + else if (configfs_is_root(&parent_group->cg_item)) + subsys = to_configfs_subsystem(group); + else + BUG(); + group->cg_subsys = subsys; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + link_group(group, new_group); + } + } +} + +/* + * The goal is that configfs_attach_item() (and + * configfs_attach_group()) can be called from either the VFS or this + * module. That is, they assume that the items have been created, + * the dentry allocated, and the dcache is all ready to go. + * + * If they fail, they must clean up after themselves as if they + * had never been called. The caller (VFS or local function) will + * handle cleaning up the dcache bits. + * + * configfs_detach_group() and configfs_detach_item() behave similarly on + * the way out. They assume that the proper semaphores are held, they + * clean up the configfs items, and they expect their callers will + * handle the dcache bits. + */ +static int configfs_attach_item(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + + ret = configfs_create_dir(item, dentry); + if (!ret) { + ret = populate_attrs(item); + if (ret) { + /* + * We are going to remove an inode and its dentry but + * the VFS may already have hit and used them. Thus, + * we must lock them as rmdir() would. + */ + mutex_lock(&d_inode(dentry)->i_mutex); + configfs_remove_dir(item); + d_inode(dentry)->i_flags |= S_DEAD; + dont_mount(dentry); + mutex_unlock(&d_inode(dentry)->i_mutex); + d_delete(dentry); + } + } + + return ret; +} + +/* Caller holds the mutex of the item's inode */ +static void configfs_detach_item(struct config_item *item) +{ + detach_attrs(item); + configfs_remove_dir(item); +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + struct configfs_dirent *sd; + + ret = configfs_attach_item(parent_item, item, dentry); + if (!ret) { + sd = dentry->d_fsdata; + sd->s_type |= CONFIGFS_USET_DIR; + + /* + * FYI, we're faking mkdir in populate_groups() + * We must lock the group's inode to avoid races with the VFS + * which can already hit the inode and try to add/remove entries + * under it. + * + * We must also lock the inode to remove it safely in case of + * error, as rmdir() would. + */ + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + configfs_adjust_dir_dirent_depth_before_populate(sd); + ret = populate_groups(to_config_group(item)); + if (ret) { + configfs_detach_item(item); + d_inode(dentry)->i_flags |= S_DEAD; + dont_mount(dentry); + } + configfs_adjust_dir_dirent_depth_after_populate(sd); + mutex_unlock(&d_inode(dentry)->i_mutex); + if (ret) + d_delete(dentry); + } + + return ret; +} + +/* Caller holds the mutex of the group's inode */ +static void configfs_detach_group(struct config_item *item) +{ + detach_groups(to_config_group(item)); + configfs_detach_item(item); +} + +/* + * After the item has been detached from the filesystem view, we are + * ready to tear it out of the hierarchy. Notify the client before + * we do that so they can perform any cleanup that requires + * navigating the hierarchy. A client does not need to provide this + * callback. The subsystem semaphore MUST be held by the caller, and + * references must be valid for both items. It also assumes the + * caller has validated ci_type. + */ +static void client_disconnect_notify(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) + type->ct_group_ops->disconnect_notify(to_config_group(parent_item), + item); +} + +/* + * Drop the initial reference from make_item()/make_group() + * This function assumes that reference is held on item + * and that item holds a valid reference to the parent. Also, it + * assumes the caller has validated ci_type. + */ +static void client_drop_item(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + /* + * If ->drop_item() exists, it is responsible for the + * config_item_put(). + */ + if (type->ct_group_ops && type->ct_group_ops->drop_item) + type->ct_group_ops->drop_item(to_config_group(parent_item), + item); + else + config_item_put(item); +} + +#ifdef DEBUG +static void configfs_dump_one(struct configfs_dirent *sd, int level) +{ + pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd)); + +#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type); + type_print(CONFIGFS_ROOT); + type_print(CONFIGFS_DIR); + type_print(CONFIGFS_ITEM_ATTR); + type_print(CONFIGFS_ITEM_LINK); + type_print(CONFIGFS_USET_DIR); + type_print(CONFIGFS_USET_DEFAULT); + type_print(CONFIGFS_USET_DROPPING); +#undef type_print +} + +static int configfs_dump(struct configfs_dirent *sd, int level) +{ + struct configfs_dirent *child_sd; + int ret = 0; + + configfs_dump_one(sd, level); + + if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT))) + return 0; + + list_for_each_entry(child_sd, &sd->s_children, s_sibling) { + ret = configfs_dump(child_sd, level + 2); + if (ret) + break; + } + + return ret; +} +#endif + + +/* + * configfs_depend_item() and configfs_undepend_item() + * + * WARNING: Do not call these from a configfs callback! + * + * This describes these functions and their helpers. + * + * Allow another kernel system to depend on a config_item. If this + * happens, the item cannot go away until the dependent can live without + * it. The idea is to give client modules as simple an interface as + * possible. When a system asks them to depend on an item, they just + * call configfs_depend_item(). If the item is live and the client + * driver is in good shape, we'll happily do the work for them. + * + * Why is the locking complex? Because configfs uses the VFS to handle + * all locking, but this function is called outside the normal + * VFS->configfs path. So it must take VFS locks to prevent the + * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is + * why you can't call these functions underneath configfs callbacks. + * + * Note, btw, that this can be called at *any* time, even when a configfs + * subsystem isn't registered, or when configfs is loading or unloading. + * Just like configfs_register_subsystem(). So we take the same + * precautions. We pin the filesystem. We lock configfs_dirent_lock. + * If we can find the target item in the + * configfs tree, it must be part of the subsystem tree as well, so we + * do not need the subsystem semaphore. Holding configfs_dirent_lock helps + * locking out mkdir() and rmdir(), who might be racing us. + */ + +/* + * configfs_depend_prep() + * + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes. This is similar but not the same to configfs_detach_prep(). + * Note that configfs_detach_prep() expects the parent to be locked when it + * is called, but we lock the parent *inside* configfs_depend_prep(). We + * do that so we can unlock it if we find nothing. + * + * Here we do a depth-first search of the dentry hierarchy looking for + * our object. + * We deliberately ignore items tagged as dropping since they are virtually + * dead, as well as items in the middle of attachment since they virtually + * do not exist yet. This completes the locking out of racing mkdir() and + * rmdir(). + * Note: subdirectories in the middle of attachment start with s_type = + * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When + * CONFIGFS_USET_CREATING is set, we ignore the item. The actual set of + * s_type is in configfs_new_dirent(), which has configfs_dirent_lock. + * + * If the target is not found, -ENOENT is bubbled up. + * + * This adds a requirement that all config_items be unique! + * + * This is recursive. There isn't + * much on the stack, though, so folks that need this function - be careful + * about your stack! Patches will be accepted to make it iterative. + */ +static int configfs_depend_prep(struct dentry *origin, + struct config_item *target) +{ + struct configfs_dirent *child_sd, *sd; + int ret = 0; + + BUG_ON(!origin || !origin->d_fsdata); + sd = origin->d_fsdata; + + if (sd->s_element == target) /* Boo-yah */ + goto out; + + list_for_each_entry(child_sd, &sd->s_children, s_sibling) { + if ((child_sd->s_type & CONFIGFS_DIR) && + !(child_sd->s_type & CONFIGFS_USET_DROPPING) && + !(child_sd->s_type & CONFIGFS_USET_CREATING)) { + ret = configfs_depend_prep(child_sd->s_dentry, + target); + if (!ret) + goto out; /* Child path boo-yah */ + } + } + + /* We looped all our children and didn't find target */ + ret = -ENOENT; + +out: + return ret; +} + +int configfs_depend_item(struct configfs_subsystem *subsys, + struct config_item *target) +{ + int ret; + struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; + struct config_item *s_item = &subsys->su_group.cg_item; + struct dentry *root; + + /* + * Pin the configfs filesystem. This means we can safely access + * the root of the configfs filesystem. + */ + root = configfs_pin_fs(); + if (IS_ERR(root)) + return PTR_ERR(root); + + /* + * Next, lock the root directory. We're going to check that the + * subsystem is really registered, and so we need to lock out + * configfs_[un]register_subsystem(). + */ + mutex_lock(&d_inode(root)->i_mutex); + + root_sd = root->d_fsdata; + + list_for_each_entry(p, &root_sd->s_children, s_sibling) { + if (p->s_type & CONFIGFS_DIR) { + if (p->s_element == s_item) { + subsys_sd = p; + break; + } + } + } + + if (!subsys_sd) { + ret = -ENOENT; + goto out_unlock_fs; + } + + /* Ok, now we can trust subsys/s_item */ + + spin_lock(&configfs_dirent_lock); + /* Scan the tree, return 0 if found */ + ret = configfs_depend_prep(subsys_sd->s_dentry, target); + if (ret) + goto out_unlock_dirent_lock; + + /* + * We are sure that the item is not about to be removed by rmdir(), and + * not in the middle of attachment by mkdir(). + */ + p = target->ci_dentry->d_fsdata; + p->s_dependent_count += 1; + +out_unlock_dirent_lock: + spin_unlock(&configfs_dirent_lock); +out_unlock_fs: + mutex_unlock(&d_inode(root)->i_mutex); + + /* + * If we succeeded, the fs is pinned via other methods. If not, + * we're done with it anyway. So release_fs() is always right. + */ + configfs_release_fs(); + + return ret; +} +EXPORT_SYMBOL(configfs_depend_item); + +/* + * Release the dependent linkage. This is much simpler than + * configfs_depend_item() because we know that that the client driver is + * pinned, thus the subsystem is pinned, and therefore configfs is pinned. + */ +void configfs_undepend_item(struct configfs_subsystem *subsys, + struct config_item *target) +{ + struct configfs_dirent *sd; + + /* + * Since we can trust everything is pinned, we just need + * configfs_dirent_lock. + */ + spin_lock(&configfs_dirent_lock); + + sd = target->ci_dentry->d_fsdata; + BUG_ON(sd->s_dependent_count < 1); + + sd->s_dependent_count -= 1; + + /* + * After this unlock, we cannot trust the item to stay alive! + * DO NOT REFERENCE item after this unlock. + */ + spin_unlock(&configfs_dirent_lock); +} +EXPORT_SYMBOL(configfs_undepend_item); + +static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int ret = 0; + int module_got = 0; + struct config_group *group = NULL; + struct config_item *item = NULL; + struct config_item *parent_item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct config_item_type *type; + struct module *subsys_owner = NULL, *new_item_owner = NULL; + char *name; + + sd = dentry->d_parent->d_fsdata; + + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + if (!configfs_dirent_is_ready(sd)) { + ret = -ENOENT; + goto out; + } + + if (!(sd->s_type & CONFIGFS_USET_DIR)) { + ret = -EPERM; + goto out; + } + + /* Get a working ref for the duration of this function */ + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!type || !type->ct_group_ops || + (!type->ct_group_ops->make_group && + !type->ct_group_ops->make_item)) { + ret = -EPERM; /* Lack-of-mkdir returns -EPERM */ + goto out_put; + } + + /* + * The subsystem may belong to a different module than the item + * being created. We don't want to safely pin the new item but + * fail to pin the subsystem it sits under. + */ + if (!subsys->su_group.cg_item.ci_type) { + ret = -EINVAL; + goto out_put; + } + subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; + if (!try_module_get(subsys_owner)) { + ret = -EINVAL; + goto out_put; + } + + name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); + if (!name) { + ret = -ENOMEM; + goto out_subsys_put; + } + + snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); + + mutex_lock(&subsys->su_mutex); + if (type->ct_group_ops->make_group) { + group = type->ct_group_ops->make_group(to_config_group(parent_item), name); + if (!group) + group = ERR_PTR(-ENOMEM); + if (!IS_ERR(group)) { + link_group(to_config_group(parent_item), group); + item = &group->cg_item; + } else + ret = PTR_ERR(group); + } else { + item = type->ct_group_ops->make_item(to_config_group(parent_item), name); + if (!item) + item = ERR_PTR(-ENOMEM); + if (!IS_ERR(item)) + link_obj(parent_item, item); + else + ret = PTR_ERR(item); + } + mutex_unlock(&subsys->su_mutex); + + kfree(name); + if (ret) { + /* + * If ret != 0, then link_obj() was never called. + * There are no extra references to clean up. + */ + goto out_subsys_put; + } + + /* + * link_obj() has been called (via link_group() for groups). + * From here on out, errors must clean that up. + */ + + type = item->ci_type; + if (!type) { + ret = -EINVAL; + goto out_unlink; + } + + new_item_owner = type->ct_owner; + if (!try_module_get(new_item_owner)) { + ret = -EINVAL; + goto out_unlink; + } + + /* + * I hate doing it this way, but if there is + * an error, module_put() probably should + * happen after any cleanup. + */ + module_got = 1; + + /* + * Make racing rmdir() fail if it did not tag parent with + * CONFIGFS_USET_DROPPING + * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will + * fail and let rmdir() terminate correctly + */ + spin_lock(&configfs_dirent_lock); + /* This will make configfs_detach_prep() fail */ + sd->s_type |= CONFIGFS_USET_IN_MKDIR; + spin_unlock(&configfs_dirent_lock); + + if (group) + ret = configfs_attach_group(parent_item, item, dentry); + else + ret = configfs_attach_item(parent_item, item, dentry); + + spin_lock(&configfs_dirent_lock); + sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; + if (!ret) + configfs_dir_set_ready(dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + +out_unlink: + if (ret) { + /* Tear down everything we built up */ + mutex_lock(&subsys->su_mutex); + + client_disconnect_notify(parent_item, item); + if (group) + unlink_group(group); + else + unlink_obj(item); + client_drop_item(parent_item, item); + + mutex_unlock(&subsys->su_mutex); + + if (module_got) + module_put(new_item_owner); + } + +out_subsys_put: + if (ret) + module_put(subsys_owner); + +out_put: + /* + * link_obj()/link_group() took a reference from child->parent, + * so the parent is safely pinned. We can drop our working + * reference. + */ + config_item_put(parent_item); + +out: + return ret; +} + +static int configfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct config_item *parent_item; + struct config_item *item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct module *subsys_owner = NULL, *dead_item_owner = NULL; + int ret; + + sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_USET_DEFAULT) + return -EPERM; + + /* Get a working ref until we have the child */ + parent_item = configfs_get_config_item(dentry->d_parent); + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!parent_item->ci_type) { + config_item_put(parent_item); + return -EINVAL; + } + + /* configfs_mkdir() shouldn't have allowed this */ + BUG_ON(!subsys->su_group.cg_item.ci_type); + subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; + + /* + * Ensure that no racing symlink() will make detach_prep() fail while + * the new link is temporarily attached + */ + do { + struct mutex *wait_mutex; + + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); + /* + * Here's where we check for dependents. We're protected by + * configfs_dirent_lock. + * If no dependent, atomically tag the item as dropping. + */ + ret = sd->s_dependent_count ? -EBUSY : 0; + if (!ret) { + ret = configfs_detach_prep(dentry, &wait_mutex); + if (ret) + configfs_detach_rollback(dentry); + } + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + + if (ret) { + if (ret != -EAGAIN) { + config_item_put(parent_item); + return ret; + } + + /* Wait until the racing operation terminates */ + mutex_lock(wait_mutex); + mutex_unlock(wait_mutex); + } + } while (ret == -EAGAIN); + + /* Get a working ref for the duration of this function */ + item = configfs_get_config_item(dentry); + + /* Drop reference from above, item already holds one. */ + config_item_put(parent_item); + + if (item->ci_type) + dead_item_owner = item->ci_type->ct_owner; + + if (sd->s_type & CONFIGFS_USET_DIR) { + configfs_detach_group(item); + + mutex_lock(&subsys->su_mutex); + client_disconnect_notify(parent_item, item); + unlink_group(to_config_group(item)); + } else { + configfs_detach_item(item); + + mutex_lock(&subsys->su_mutex); + client_disconnect_notify(parent_item, item); + unlink_obj(item); + } + + client_drop_item(parent_item, item); + mutex_unlock(&subsys->su_mutex); + + /* Drop our reference from above */ + config_item_put(item); + + module_put(dead_item_owner); + module_put(subsys_owner); + + return 0; +} + +const struct inode_operations configfs_dir_inode_operations = { + .mkdir = configfs_mkdir, + .rmdir = configfs_rmdir, + .symlink = configfs_symlink, + .unlink = configfs_unlink, + .lookup = configfs_lookup, + .setattr = configfs_setattr, +}; + +const struct inode_operations configfs_root_inode_operations = { + .lookup = configfs_lookup, + .setattr = configfs_setattr, +}; + +#if 0 +int configfs_rename_dir(struct config_item * item, const char *new_name) +{ + int error = 0; + struct dentry * new_dentry, * parent; + + if (!strcmp(config_item_name(item), new_name)) + return -EINVAL; + + if (!item->parent) + return -EINVAL; + + down_write(&configfs_rename_sem); + parent = item->parent->dentry; + + mutex_lock(&d_inode(parent)->i_mutex); + + new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); + if (!IS_ERR(new_dentry)) { + if (d_really_is_negative(new_dentry)) { + error = config_item_set_name(item, "%s", new_name); + if (!error) { + d_add(new_dentry, NULL); + d_move(item->dentry, new_dentry); + } + else + d_delete(new_dentry); + } else + error = -EEXIST; + dput(new_dentry); + } + mutex_unlock(&d_inode(parent)->i_mutex); + up_write(&configfs_rename_sem); + + return error; +} +#endif + +static int configfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_path.dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + int err; + + mutex_lock(&d_inode(dentry)->i_mutex); + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + err = -ENOENT; + if (configfs_dirent_is_ready(parent_sd)) { + file->private_data = configfs_new_dirent(parent_sd, NULL, 0); + if (IS_ERR(file->private_data)) + err = PTR_ERR(file->private_data); + else + err = 0; + } + mutex_unlock(&d_inode(dentry)->i_mutex); + + return err; +} + +static int configfs_dir_close(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_path.dentry; + struct configfs_dirent * cursor = file->private_data; + + mutex_lock(&d_inode(dentry)->i_mutex); + spin_lock(&configfs_dirent_lock); + list_del_init(&cursor->s_sibling); + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&d_inode(dentry)->i_mutex); + + release_configfs_dirent(cursor); + + return 0; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct configfs_dirent *sd) +{ + return (sd->s_mode >> 12) & 15; +} + +static int configfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct super_block *sb = dentry->d_sb; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + struct configfs_dirent *cursor = file->private_data; + struct list_head *p, *q = &cursor->s_sibling; + ino_t ino = 0; + + if (!dir_emit_dots(file, ctx)) + return 0; + if (ctx->pos == 2) { + spin_lock(&configfs_dirent_lock); + list_move(q, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + } + for (p = q->next; p != &parent_sd->s_children; p = p->next) { + struct configfs_dirent *next; + const char *name; + int len; + struct inode *inode = NULL; + + next = list_entry(p, struct configfs_dirent, s_sibling); + if (!next->s_element) + continue; + + name = configfs_get_name(next); + len = strlen(name); + + /* + * We'll have a dentry and an inode for + * PINNED items and for open attribute + * files. We lock here to prevent a race + * with configfs_d_iput() clearing + * s_dentry before calling iput(). + * + * Why do we go to the trouble? If + * someone has an attribute file open, + * the inode number should match until + * they close it. Beyond that, we don't + * care. + */ + spin_lock(&configfs_dirent_lock); + dentry = next->s_dentry; + if (dentry) + inode = d_inode(dentry); + if (inode) + ino = inode->i_ino; + spin_unlock(&configfs_dirent_lock); + if (!inode) + ino = iunique(sb, 2); + + if (!dir_emit(ctx, name, len, ino, dt_type(next))) + return 0; + + spin_lock(&configfs_dirent_lock); + list_move(q, p); + spin_unlock(&configfs_dirent_lock); + p = q; + ctx->pos++; + } + return 0; +} + +static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) +{ + struct dentry * dentry = file->f_path.dentry; + + mutex_lock(&d_inode(dentry)->i_mutex); + switch (whence) { + case 1: + offset += file->f_pos; + case 0: + if (offset >= 0) + break; + default: + mutex_unlock(&d_inode(dentry)->i_mutex); + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + if (file->f_pos >= 2) { + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_dirent *cursor = file->private_data; + struct list_head *p; + loff_t n = file->f_pos - 2; + + spin_lock(&configfs_dirent_lock); + list_del(&cursor->s_sibling); + p = sd->s_children.next; + while (n && p != &sd->s_children) { + struct configfs_dirent *next; + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (next->s_element) + n--; + p = p->next; + } + list_add_tail(&cursor->s_sibling, p); + spin_unlock(&configfs_dirent_lock); + } + } + mutex_unlock(&d_inode(dentry)->i_mutex); + return offset; +} + +const struct file_operations configfs_dir_operations = { + .open = configfs_dir_open, + .release = configfs_dir_close, + .llseek = configfs_dir_lseek, + .read = generic_read_dir, + .iterate = configfs_readdir, +}; + +int configfs_register_subsystem(struct configfs_subsystem *subsys) +{ + int err; + struct config_group *group = &subsys->su_group; + struct dentry *dentry; + struct dentry *root; + struct configfs_dirent *sd; + + root = configfs_pin_fs(); + if (IS_ERR(root)) + return PTR_ERR(root); + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + + sd = root->d_fsdata; + link_group(to_config_group(sd->s_element), group); + + mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); + + err = -ENOMEM; + dentry = d_alloc_name(root, group->cg_item.ci_name); + if (dentry) { + d_add(dentry, NULL); + + err = configfs_attach_group(sd->s_element, &group->cg_item, + dentry); + if (err) { + BUG_ON(d_inode(dentry)); + d_drop(dentry); + dput(dentry); + } else { + spin_lock(&configfs_dirent_lock); + configfs_dir_set_ready(dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + } + } + + mutex_unlock(&d_inode(root)->i_mutex); + + if (err) { + unlink_group(group); + configfs_release_fs(); + } + + return err; +} + +void configfs_unregister_subsystem(struct configfs_subsystem *subsys) +{ + struct config_group *group = &subsys->su_group; + struct dentry *dentry = group->cg_item.ci_dentry; + struct dentry *root = dentry->d_sb->s_root; + + if (dentry->d_parent != root) { + pr_err("Tried to unregister non-subsystem!\n"); + return; + } + + mutex_lock_nested(&d_inode(root)->i_mutex, + I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); + if (configfs_detach_prep(dentry, NULL)) { + pr_err("Tried to unregister non-empty subsystem!\n"); + } + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + configfs_detach_group(&group->cg_item); + d_inode(dentry)->i_flags |= S_DEAD; + dont_mount(dentry); + mutex_unlock(&d_inode(dentry)->i_mutex); + + d_delete(dentry); + + mutex_unlock(&d_inode(root)->i_mutex); + + dput(dentry); + + unlink_group(group); + configfs_release_fs(); +} + +EXPORT_SYMBOL(configfs_register_subsystem); +EXPORT_SYMBOL(configfs_unregister_subsystem); diff --git a/kernel/fs/configfs/file.c b/kernel/fs/configfs/file.c new file mode 100644 index 000000000..403269ffc --- /dev/null +++ b/kernel/fs/configfs/file.c @@ -0,0 +1,336 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c - operations for regular (text) files. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* + * A simple attribute can only be 4096 characters. Why 4k? Because the + * original code limited it to PAGE_SIZE. That's a bad idea, though, + * because an attribute of 16k on ia64 won't work on x86. So we limit to + * 4k, our minimum common page size. + */ +#define SIMPLE_ATTR_SIZE 4096 + +struct configfs_buffer { + size_t count; + loff_t pos; + char * page; + struct configfs_item_operations * ops; + struct mutex mutex; + int needs_read_fill; +}; + + +/** + * fill_read_buffer - allocate and fill buffer from item. + * @dentry: dentry pointer. + * @buffer: data buffer for file. + * + * Allocate @buffer->page, if it hasn't been already, then call the + * config_item's show() method to fill the buffer with this attribute's + * data. + * This is called only once, on the file's first read. + */ +static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + int ret = 0; + ssize_t count; + + if (!buffer->page) + buffer->page = (char *) get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + count = ops->show_attribute(item,attr,buffer->page); + buffer->needs_read_fill = 0; + BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); + if (count >= 0) + buffer->count = count; + else + ret = count; + return ret; +} + +/** + * configfs_read_file - read an attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read an attribute file. The attribute descriptor + * is in the file's ->d_fsdata. The target item is in the directory's + * ->d_fsdata. + * + * We call fill_read_buffer() to allocate and fill the buffer from the + * item's show() method exactly once (if the read is happening from + * the beginning of the file). That should fill the entire buffer with + * all the data the item has to offer for that attribute. + * We then call flush_read_buffer() to copy the buffer to userspace + * in the increments specified. + */ + +static ssize_t +configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + ssize_t retval = 0; + + mutex_lock(&buffer->mutex); + if (buffer->needs_read_fill) { + if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) + goto out; + } + pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", + __func__, count, *ppos, buffer->page); + retval = simple_read_from_buffer(buf, count, ppos, buffer->page, + buffer->count); +out: + mutex_unlock(&buffer->mutex); + return retval; +} + + +/** + * fill_write_buffer - copy buffer from userspace. + * @buffer: data buffer for file. + * @buf: data from user. + * @count: number of bytes in @userbuf. + * + * Allocate @buffer->page if it hasn't been already, then + * copy the user-supplied buffer into it. + */ + +static int +fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count) +{ + int error; + + if (!buffer->page) + buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); + if (!buffer->page) + return -ENOMEM; + + if (count >= SIMPLE_ATTR_SIZE) + count = SIMPLE_ATTR_SIZE - 1; + error = copy_from_user(buffer->page,buf,count); + buffer->needs_read_fill = 1; + /* if buf is assumed to contain a string, terminate it by \0, + * so e.g. sscanf() can scan the string easily */ + buffer->page[count] = 0; + return error ? -EFAULT : count; +} + + +/** + * flush_write_buffer - push buffer to config_item. + * @dentry: dentry to the attribute + * @buffer: data buffer for file. + * @count: number of bytes + * + * Get the correct pointers for the config_item and the attribute we're + * dealing with, then call the store() method for the attribute, + * passing the buffer that we acquired in fill_write_buffer(). + */ + +static int +flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + + return ops->store_attribute(item,attr,buffer->page,count); +} + + +/** + * configfs_write_file - write an attribute. + * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Similar to configfs_read_file(), though working in the opposite direction. + * We allocate and fill the data from the user in fill_write_buffer(), + * then push it to the config_item in flush_write_buffer(). + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come + * on the first write. + * Hint: if you're writing a value, first read the file, modify only the + * the value you're changing, then write entire buffer back. + */ + +static ssize_t +configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + ssize_t len; + + mutex_lock(&buffer->mutex); + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_path.dentry, buffer, len); + if (len > 0) + *ppos += len; + mutex_unlock(&buffer->mutex); + return len; +} + +static int check_perm(struct inode * inode, struct file * file) +{ + struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); + struct configfs_attribute * attr = to_attr(file->f_path.dentry); + struct configfs_buffer * buffer; + struct configfs_item_operations * ops = NULL; + int error = 0; + + if (!item || !attr) + goto Einval; + + /* Grab the module reference for this attribute if we have one */ + if (!try_module_get(attr->ca_owner)) { + error = -ENODEV; + goto Done; + } + + if (item->ci_type) + ops = item->ci_type->ct_item_ops; + else + goto Eaccess; + + /* File needs write support. + * The inode's perms must say it's ok, + * and we must have a store method. + */ + if (file->f_mode & FMODE_WRITE) { + + if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute) + goto Eaccess; + + } + + /* File needs read support. + * The inode's perms must say it's ok, and we there + * must be a show method for it. + */ + if (file->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute) + goto Eaccess; + } + + /* No error? Great, allocate a buffer for the file, and store it + * it in file->private_data for easy access. + */ + buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL); + if (!buffer) { + error = -ENOMEM; + goto Enomem; + } + mutex_init(&buffer->mutex); + buffer->needs_read_fill = 1; + buffer->ops = ops; + file->private_data = buffer; + goto Done; + + Einval: + error = -EINVAL; + goto Done; + Eaccess: + error = -EACCES; + Enomem: + module_put(attr->ca_owner); + Done: + if (error && item) + config_item_put(item); + return error; +} + +static int configfs_open_file(struct inode * inode, struct file * filp) +{ + return check_perm(inode,filp); +} + +static int configfs_release(struct inode * inode, struct file * filp) +{ + struct config_item * item = to_item(filp->f_path.dentry->d_parent); + struct configfs_attribute * attr = to_attr(filp->f_path.dentry); + struct module * owner = attr->ca_owner; + struct configfs_buffer * buffer = filp->private_data; + + if (item) + config_item_put(item); + /* After this point, attr should not be accessed. */ + module_put(owner); + + if (buffer) { + if (buffer->page) + free_page((unsigned long)buffer->page); + mutex_destroy(&buffer->mutex); + kfree(buffer); + } + return 0; +} + +const struct file_operations configfs_file_operations = { + .read = configfs_read_file, + .write = configfs_write_file, + .llseek = generic_file_llseek, + .open = configfs_open_file, + .release = configfs_release, +}; + +/** + * configfs_create_file - create an attribute file for an item. + * @item: item we're creating for. + * @attr: atrribute descriptor. + */ + +int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr) +{ + struct dentry *dir = item->ci_dentry; + struct configfs_dirent *parent_sd = dir->d_fsdata; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; + + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, + CONFIGFS_ITEM_ATTR); + mutex_unlock(&d_inode(dir)->i_mutex); + + return error; +} + diff --git a/kernel/fs/configfs/inode.c b/kernel/fs/configfs/inode.c new file mode 100644 index 000000000..8d89f5fd0 --- /dev/null +++ b/kernel/fs/configfs/inode.c @@ -0,0 +1,272 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c - basic inode and dentry operations. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see Documentation/filesystems/configfs/configfs.txt for more + * information. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key default_group_class[MAX_LOCK_DEPTH]; +#endif + +static const struct address_space_operations configfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static const struct inode_operations configfs_inode_operations ={ + .setattr = configfs_setattr, +}; + +int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +{ + struct inode * inode = d_inode(dentry); + struct configfs_dirent * sd = dentry->d_fsdata; + struct iattr * sd_iattr; + unsigned int ia_valid = iattr->ia_valid; + int error; + + if (!sd) + return -EINVAL; + + sd_iattr = sd->s_iattr; + if (!sd_iattr) { + /* setting attributes for the first time, allocate now */ + sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); + if (!sd_iattr) + return -ENOMEM; + /* assign default attributes */ + sd_iattr->ia_mode = sd->s_mode; + sd_iattr->ia_uid = GLOBAL_ROOT_UID; + sd_iattr->ia_gid = GLOBAL_ROOT_GID; + sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd->s_iattr = sd_iattr; + } + /* attributes were changed atleast once in past */ + + error = simple_setattr(dentry, iattr); + if (error) + return error; + + if (ia_valid & ATTR_UID) + sd_iattr->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + sd_iattr->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + sd_iattr->ia_mode = sd->s_mode = mode; + } + + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, umode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_mode = iattr->ia_mode; + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, + struct super_block *s) +{ + struct inode * inode = new_inode(s); + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mapping->a_ops = &configfs_aops; + inode->i_op = &configfs_inode_operations; + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, mode); + } + return inode; +} + +#ifdef CONFIG_LOCKDEP + +static void configfs_set_inode_lock_class(struct configfs_dirent *sd, + struct inode *inode) +{ + int depth = sd->s_depth; + + if (depth > 0) { + if (depth <= ARRAY_SIZE(default_group_class)) { + lockdep_set_class(&inode->i_mutex, + &default_group_class[depth - 1]); + } else { + /* + * In practice the maximum level of locking depth is + * already reached. Just inform about possible reasons. + */ + pr_info("Too many levels of inodes for the locking correctness validator.\n"); + pr_info("Spurious warnings may appear.\n"); + } + } +} + +#else /* CONFIG_LOCKDEP */ + +static void configfs_set_inode_lock_class(struct configfs_dirent *sd, + struct inode *inode) +{ +} + +#endif /* CONFIG_LOCKDEP */ + +int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct inode *)) +{ + int error = 0; + struct inode *inode = NULL; + struct configfs_dirent *sd; + struct inode *p_inode; + + if (!dentry) + return -ENOENT; + + if (d_really_is_positive(dentry)) + return -EEXIST; + + sd = dentry->d_fsdata; + inode = configfs_new_inode(mode, sd, dentry->d_sb); + if (!inode) + return -ENOMEM; + + p_inode = d_inode(dentry->d_parent); + p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + configfs_set_inode_lock_class(sd, inode); + + init(inode); + d_instantiate(dentry, inode); + if (S_ISDIR(mode) || S_ISLNK(mode)) + dget(dentry); /* pin link and directory dentries in core */ + return error; +} + +/* + * Get the name for corresponding element represented by the given configfs_dirent + */ +const unsigned char * configfs_get_name(struct configfs_dirent *sd) +{ + struct configfs_attribute *attr; + + BUG_ON(!sd || !sd->s_element); + + /* These always have a dentry, so use that */ + if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) + return sd->s_dentry->d_name.name; + + if (sd->s_type & CONFIGFS_ITEM_ATTR) { + attr = sd->s_element; + return attr->ca_name; + } + return NULL; +} + + +/* + * Unhashes the dentry corresponding to given configfs_dirent + * Called with parent inode's i_mutex held. + */ +void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) +{ + struct dentry * dentry = sd->s_dentry; + + if (dentry) { + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry) && d_really_is_positive(dentry)) { + dget_dlock(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + simple_unlink(d_inode(parent), dentry); + } else + spin_unlock(&dentry->d_lock); + } +} + +void configfs_hash_and_remove(struct dentry * dir, const char * name) +{ + struct configfs_dirent * sd; + struct configfs_dirent * parent_sd = dir->d_fsdata; + + if (d_really_is_negative(dir)) + /* no inode means this hasn't been made visible yet */ + return; + + mutex_lock(&d_inode(dir)->i_mutex); + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element) + continue; + if (!strcmp(configfs_get_name(sd), name)) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dir); + configfs_put(sd); + break; + } + } + mutex_unlock(&d_inode(dir)->i_mutex); +} diff --git a/kernel/fs/configfs/item.c b/kernel/fs/configfs/item.c new file mode 100644 index 000000000..e65f9ffbb --- /dev/null +++ b/kernel/fs/configfs/item.c @@ -0,0 +1,214 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * item.c - library routines for handling generic config items + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on kobject: + * kobject is Copyright (c) 2002-2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see the file Documentation/filesystems/configfs/configfs.txt for + * critical information about using the config_item interface. + */ + +#include +#include +#include +#include + +#include + + +static inline struct config_item *to_item(struct list_head *entry) +{ + return container_of(entry, struct config_item, ci_entry); +} + +/* Evil kernel */ +static void config_item_release(struct kref *kref); + +/** + * config_item_init - initialize item. + * @item: item in question. + */ +void config_item_init(struct config_item *item) +{ + kref_init(&item->ci_kref); + INIT_LIST_HEAD(&item->ci_entry); +} +EXPORT_SYMBOL(config_item_init); + +/** + * config_item_set_name - Set the name of an item + * @item: item. + * @fmt: The vsnprintf()'s format string. + * + * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a + * dynamically allocated string that @item->ci_name points to. + * Otherwise, use the static @item->ci_namebuf array. + */ +int config_item_set_name(struct config_item *item, const char *fmt, ...) +{ + int error = 0; + int limit = CONFIGFS_ITEM_NAME_LEN; + int need; + va_list args; + char *name; + + /* + * First, try the static array + */ + va_start(args, fmt); + need = vsnprintf(item->ci_namebuf, limit, fmt, args); + va_end(args); + if (need < limit) + name = item->ci_namebuf; + else { + /* + * Need more space? Allocate it and try again + */ + limit = need + 1; + name = kmalloc(limit, GFP_KERNEL); + if (!name) { + error = -ENOMEM; + goto Done; + } + va_start(args, fmt); + need = vsnprintf(name, limit, fmt, args); + va_end(args); + + /* Still? Give up. */ + if (need >= limit) { + kfree(name); + error = -EFAULT; + goto Done; + } + } + + /* Free the old name, if necessary. */ + if (item->ci_name && item->ci_name != item->ci_namebuf) + kfree(item->ci_name); + + /* Now, set the new name */ + item->ci_name = name; + Done: + return error; +} +EXPORT_SYMBOL(config_item_set_name); + +void config_item_init_type_name(struct config_item *item, + const char *name, + struct config_item_type *type) +{ + config_item_set_name(item, name); + item->ci_type = type; + config_item_init(item); +} +EXPORT_SYMBOL(config_item_init_type_name); + +void config_group_init_type_name(struct config_group *group, const char *name, + struct config_item_type *type) +{ + config_item_set_name(&group->cg_item, name); + group->cg_item.ci_type = type; + config_group_init(group); +} +EXPORT_SYMBOL(config_group_init_type_name); + +struct config_item *config_item_get(struct config_item *item) +{ + if (item) + kref_get(&item->ci_kref); + return item; +} +EXPORT_SYMBOL(config_item_get); + +static void config_item_cleanup(struct config_item *item) +{ + struct config_item_type *t = item->ci_type; + struct config_group *s = item->ci_group; + struct config_item *parent = item->ci_parent; + + pr_debug("config_item %s: cleaning up\n", config_item_name(item)); + if (item->ci_name != item->ci_namebuf) + kfree(item->ci_name); + item->ci_name = NULL; + if (t && t->ct_item_ops && t->ct_item_ops->release) + t->ct_item_ops->release(item); + if (s) + config_group_put(s); + if (parent) + config_item_put(parent); +} + +static void config_item_release(struct kref *kref) +{ + config_item_cleanup(container_of(kref, struct config_item, ci_kref)); +} + +/** + * config_item_put - decrement refcount for item. + * @item: item. + * + * Decrement the refcount, and if 0, call config_item_cleanup(). + */ +void config_item_put(struct config_item *item) +{ + if (item) + kref_put(&item->ci_kref, config_item_release); +} +EXPORT_SYMBOL(config_item_put); + +/** + * config_group_init - initialize a group for use + * @group: config_group + */ +void config_group_init(struct config_group *group) +{ + config_item_init(&group->cg_item); + INIT_LIST_HEAD(&group->cg_children); +} +EXPORT_SYMBOL(config_group_init); + +/** + * config_group_find_item - search for item in group. + * @group: group we're looking in. + * @name: item's name. + * + * Iterate over @group->cg_list, looking for a matching config_item. + * If matching item is found take a reference and return the item. + * Caller must have locked group via @group->cg_subsys->su_mtx. + */ +struct config_item *config_group_find_item(struct config_group *group, + const char *name) +{ + struct list_head *entry; + struct config_item *ret = NULL; + + list_for_each(entry, &group->cg_children) { + struct config_item *item = to_item(entry); + if (config_item_name(item) && + !strcmp(config_item_name(item), name)) { + ret = config_item_get(item); + break; + } + } + return ret; +} +EXPORT_SYMBOL(config_group_find_item); diff --git a/kernel/fs/configfs/mount.c b/kernel/fs/configfs/mount.c new file mode 100644 index 000000000..a8f3b589a --- /dev/null +++ b/kernel/fs/configfs/mount.c @@ -0,0 +1,175 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mount.c - operations for initializing and mounting configfs. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* Random magic number */ +#define CONFIGFS_MAGIC 0x62656570 + +static struct vfsmount *configfs_mount = NULL; +struct kmem_cache *configfs_dir_cachep; +static int configfs_mnt_count = 0; + +static const struct super_operations configfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, +}; + +static struct config_group configfs_root_group = { + .cg_item = { + .ci_namebuf = "root", + .ci_name = configfs_root_group.cg_item.ci_namebuf, + }, +}; + +int configfs_is_root(struct config_item *item) +{ + return item == &configfs_root_group.cg_item; +} + +static struct configfs_dirent configfs_root = { + .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling), + .s_children = LIST_HEAD_INIT(configfs_root.s_children), + .s_element = &configfs_root_group.cg_item, + .s_type = CONFIGFS_ROOT, + .s_iattr = NULL, +}; + +static int configfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = CONFIGFS_MAGIC; + sb->s_op = &configfs_ops; + sb->s_time_gran = 1; + + inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &configfs_root, sb); + if (inode) { + inode->i_op = &configfs_root_inode_operations; + inode->i_fop = &configfs_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + } else { + pr_debug("could not get root inode\n"); + return -ENOMEM; + } + + root = d_make_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n",__func__); + return -ENOMEM; + } + config_group_init(&configfs_root_group); + configfs_root_group.cg_item.ci_dentry = root; + root->d_fsdata = &configfs_root; + sb->s_root = root; + sb->s_d_op = &configfs_dentry_ops; /* the rest get that */ + return 0; +} + +static struct dentry *configfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, configfs_fill_super); +} + +static struct file_system_type configfs_fs_type = { + .owner = THIS_MODULE, + .name = "configfs", + .mount = configfs_do_mount, + .kill_sb = kill_litter_super, +}; +MODULE_ALIAS_FS("configfs"); + +struct dentry *configfs_pin_fs(void) +{ + int err = simple_pin_fs(&configfs_fs_type, &configfs_mount, + &configfs_mnt_count); + return err ? ERR_PTR(err) : configfs_mount->mnt_root; +} + +void configfs_release_fs(void) +{ + simple_release_fs(&configfs_mount, &configfs_mnt_count); +} + + +static int __init configfs_init(void) +{ + int err = -ENOMEM; + + configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", + sizeof(struct configfs_dirent), + 0, 0, NULL); + if (!configfs_dir_cachep) + goto out; + + err = sysfs_create_mount_point(kernel_kobj, "config"); + if (err) + goto out2; + + err = register_filesystem(&configfs_fs_type); + if (err) + goto out3; + + return 0; +out3: + pr_err("Unable to register filesystem!\n"); + sysfs_remove_mount_point(kernel_kobj, "config"); +out2: + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; +out: + return err; +} + +static void __exit configfs_exit(void) +{ + unregister_filesystem(&configfs_fs_type); + sysfs_remove_mount_point(kernel_kobj, "config"); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.0.2"); +MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); + +core_initcall(configfs_init); +module_exit(configfs_exit); diff --git a/kernel/fs/configfs/symlink.c b/kernel/fs/configfs/symlink.c new file mode 100644 index 000000000..cc9f2546e --- /dev/null +++ b/kernel/fs/configfs/symlink.c @@ -0,0 +1,314 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.c - operations for configfs symlinks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* Protects attachments of new symlinks */ +DEFINE_MUTEX(configfs_symlink_mutex); + +static int item_depth(struct config_item * item) +{ + struct config_item * p = item; + int depth = 0; + do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p)); + return depth; +} + +static int item_path_length(struct config_item * item) +{ + struct config_item * p = item; + int length = 1; + do { + length += strlen(config_item_name(p)) + 1; + p = p->ci_parent; + } while (p && !configfs_is_root(p)); + return length; +} + +static void fill_item_path(struct config_item * item, char * buffer, int length) +{ + struct config_item * p; + + --length; + for (p = item; p && !configfs_is_root(p); p = p->ci_parent) { + int cur = strlen(config_item_name(p)); + + /* back up enough to print this bus id with '/' */ + length -= cur; + strncpy(buffer + length,config_item_name(p),cur); + *(buffer + --length) = '/'; + } +} + +static int create_link(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata; + struct configfs_symlink *sl; + int ret; + + ret = -ENOENT; + if (!configfs_dirent_is_ready(target_sd)) + goto out; + ret = -ENOMEM; + sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); + if (sl) { + sl->sl_target = config_item_get(item); + spin_lock(&configfs_dirent_lock); + if (target_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + config_item_put(item); + kfree(sl); + return -ENOENT; + } + list_add(&sl->sl_list, &target_sd->s_links); + spin_unlock(&configfs_dirent_lock); + ret = configfs_create_link(sl, parent_item->ci_dentry, + dentry); + if (ret) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); + config_item_put(item); + kfree(sl); + } + } + +out: + return ret; +} + + +static int get_target(const char *symname, struct path *path, + struct config_item **target, struct super_block *sb) +{ + int ret; + + ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path); + if (!ret) { + if (path->dentry->d_sb == sb) { + *target = configfs_get_config_item(path->dentry); + if (!*target) { + ret = -ENOENT; + path_put(path); + } + } else { + ret = -EPERM; + path_put(path); + } + } + + return ret; +} + + +int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + int ret; + struct path path; + struct configfs_dirent *sd; + struct config_item *parent_item; + struct config_item *target_item = NULL; + struct config_item_type *type; + + sd = dentry->d_parent->d_fsdata; + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + ret = -ENOENT; + if (!configfs_dirent_is_ready(sd)) + goto out; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + + ret = -EPERM; + if (!type || !type->ct_item_ops || + !type->ct_item_ops->allow_link) + goto out_put; + + ret = get_target(symname, &path, &target_item, dentry->d_sb); + if (ret) + goto out_put; + + ret = type->ct_item_ops->allow_link(parent_item, target_item); + if (!ret) { + mutex_lock(&configfs_symlink_mutex); + ret = create_link(parent_item, target_item, dentry); + mutex_unlock(&configfs_symlink_mutex); + if (ret && type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + target_item); + } + + config_item_put(target_item); + path_put(&path); + +out_put: + config_item_put(parent_item); + +out: + return ret; +} + +int configfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_symlink *sl; + struct config_item *parent_item; + struct config_item_type *type; + int ret; + + ret = -EPERM; /* What lack-of-symlink returns */ + if (!(sd->s_type & CONFIGFS_ITEM_LINK)) + goto out; + + sl = sd->s_element; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dentry->d_parent); + dput(dentry); + configfs_put(sd); + + /* + * drop_link() must be called before + * list_del_init(&sl->sl_list), so that the order of + * drop_link(this, target) and drop_item(target) is preserved. + */ + if (type && type->ct_item_ops && + type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + sl->sl_target); + + spin_lock(&configfs_dirent_lock); + list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); + + /* Put reference from create_link() */ + config_item_put(sl->sl_target); + kfree(sl); + + config_item_put(parent_item); + + ret = 0; + +out: + return ret; +} + +static int configfs_get_target_path(struct config_item * item, struct config_item * target, + char *path) +{ + char * s; + int depth, size; + + depth = item_depth(item); + size = item_path_length(target) + depth * 3 - 1; + if (size > PATH_MAX) + return -ENAMETOOLONG; + + pr_debug("%s: depth = %d, size = %d\n", __func__, depth, size); + + for (s = path; depth--; s += 3) + strcpy(s,"../"); + + fill_item_path(target, path, size); + pr_debug("%s: path = '%s'\n", __func__, path); + + return 0; +} + +static int configfs_getlink(struct dentry *dentry, char * path) +{ + struct config_item *item, *target_item; + int error = 0; + + item = configfs_get_config_item(dentry->d_parent); + if (!item) + return -EINVAL; + + target_item = configfs_get_config_item(dentry); + if (!target_item) { + config_item_put(item); + return -EINVAL; + } + + down_read(&configfs_rename_sem); + error = configfs_get_target_path(item, target_item, path); + up_read(&configfs_rename_sem); + + config_item_put(item); + config_item_put(target_item); + return error; + +} + +static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + + if (page) { + error = configfs_getlink(dentry, (char *)page); + if (!error) { + nd_set_link(nd, (char *)page); + return (void *)page; + } + } + + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + if (cookie) { + unsigned long page = (unsigned long)cookie; + free_page(page); + } +} + +const struct inode_operations configfs_symlink_inode_operations = { + .follow_link = configfs_follow_link, + .readlink = generic_readlink, + .put_link = configfs_put_link, + .setattr = configfs_setattr, +}; + diff --git a/kernel/fs/coredump.c b/kernel/fs/coredump.c new file mode 100644 index 000000000..bbbe139ab --- /dev/null +++ b/kernel/fs/coredump.c @@ -0,0 +1,751 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include "internal.h" + +#include + +int core_uses_pid; +unsigned int core_pipe_limit; +char core_pattern[CORENAME_MAX_SIZE] = "core"; +static int core_name_size = CORENAME_MAX_SIZE; + +struct core_name { + char *corename; + int used, size; +}; + +/* The maximal length of core_pattern is also specified in sysctl.c */ + +static int expand_corename(struct core_name *cn, int size) +{ + char *corename = krealloc(cn->corename, size, GFP_KERNEL); + + if (!corename) + return -ENOMEM; + + if (size > core_name_size) /* racy but harmless */ + core_name_size = size; + + cn->size = ksize(corename); + cn->corename = corename; + return 0; +} + +static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) +{ + int free, need; + va_list arg_copy; + +again: + free = cn->size - cn->used; + + va_copy(arg_copy, arg); + need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); + va_end(arg_copy); + + if (need < free) { + cn->used += need; + return 0; + } + + if (!expand_corename(cn, cn->size + need - free + 1)) + goto again; + + return -ENOMEM; +} + +static int cn_printf(struct core_name *cn, const char *fmt, ...) +{ + va_list arg; + int ret; + + va_start(arg, fmt); + ret = cn_vprintf(cn, fmt, arg); + va_end(arg); + + return ret; +} + +static int cn_esc_printf(struct core_name *cn, const char *fmt, ...) +{ + int cur = cn->used; + va_list arg; + int ret; + + va_start(arg, fmt); + ret = cn_vprintf(cn, fmt, arg); + va_end(arg); + + for (; cur < cn->used; ++cur) { + if (cn->corename[cur] == '/') + cn->corename[cur] = '!'; + } + return ret; +} + +static int cn_print_exe_file(struct core_name *cn) +{ + struct file *exe_file; + char *pathbuf, *path; + int ret; + + exe_file = get_mm_exe_file(current->mm); + if (!exe_file) + return cn_esc_printf(cn, "%s (path unknown)", current->comm); + + pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + if (!pathbuf) { + ret = -ENOMEM; + goto put_exe_file; + } + + path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + goto free_buf; + } + + ret = cn_esc_printf(cn, "%s", path); + +free_buf: + kfree(pathbuf); +put_exe_file: + fput(exe_file); + return ret; +} + +/* format_corename will inspect the pattern parameter, and output a + * name into corename, which must have space for at least + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. + */ +static int format_corename(struct core_name *cn, struct coredump_params *cprm) +{ + const struct cred *cred = current_cred(); + const char *pat_ptr = core_pattern; + int ispipe = (*pat_ptr == '|'); + int pid_in_pattern = 0; + int err = 0; + + cn->used = 0; + cn->corename = NULL; + if (expand_corename(cn, core_name_size)) + return -ENOMEM; + cn->corename[0] = '\0'; + + if (ispipe) + ++pat_ptr; + + /* Repeat as long as we have more pattern to process and more output + space */ + while (*pat_ptr) { + if (*pat_ptr != '%') { + err = cn_printf(cn, "%c", *pat_ptr++); + } else { + switch (*++pat_ptr) { + /* single % at the end, drop that */ + case 0: + goto out; + /* Double percent, output one percent */ + case '%': + err = cn_printf(cn, "%c", '%'); + break; + /* pid */ + case 'p': + pid_in_pattern = 1; + err = cn_printf(cn, "%d", + task_tgid_vnr(current)); + break; + /* global pid */ + case 'P': + err = cn_printf(cn, "%d", + task_tgid_nr(current)); + break; + case 'i': + err = cn_printf(cn, "%d", + task_pid_vnr(current)); + break; + case 'I': + err = cn_printf(cn, "%d", + task_pid_nr(current)); + break; + /* uid */ + case 'u': + err = cn_printf(cn, "%d", cred->uid); + break; + /* gid */ + case 'g': + err = cn_printf(cn, "%d", cred->gid); + break; + case 'd': + err = cn_printf(cn, "%d", + __get_dumpable(cprm->mm_flags)); + break; + /* signal that caused the coredump */ + case 's': + err = cn_printf(cn, "%ld", cprm->siginfo->si_signo); + break; + /* UNIX time of coredump */ + case 't': { + struct timeval tv; + do_gettimeofday(&tv); + err = cn_printf(cn, "%lu", tv.tv_sec); + break; + } + /* hostname */ + case 'h': + down_read(&uts_sem); + err = cn_esc_printf(cn, "%s", + utsname()->nodename); + up_read(&uts_sem); + break; + /* executable */ + case 'e': + err = cn_esc_printf(cn, "%s", current->comm); + break; + case 'E': + err = cn_print_exe_file(cn); + break; + /* core limit size */ + case 'c': + err = cn_printf(cn, "%lu", + rlimit(RLIMIT_CORE)); + break; + default: + break; + } + ++pat_ptr; + } + + if (err) + return err; + } + +out: + /* Backward compatibility with core_uses_pid: + * + * If core_pattern does not include a %p (as is the default) + * and core_uses_pid is set, then .%pid will be appended to + * the filename. Do not do this for piped commands. */ + if (!ispipe && !pid_in_pattern && core_uses_pid) { + err = cn_printf(cn, ".%d", task_tgid_vnr(current)); + if (err) + return err; + } + return ispipe; +} + +static int zap_process(struct task_struct *start, int exit_code) +{ + struct task_struct *t; + int nr = 0; + + start->signal->group_exit_code = exit_code; + start->signal->group_stop_count = 0; + + t = start; + do { + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); + if (t != current && t->mm) { + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); + nr++; + } + } while_each_thread(start, t); + + return nr; +} + +static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + struct core_state *core_state, int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int nr = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!signal_group_exit(tsk->signal)) { + mm->core_state = core_state; + nr = zap_process(tsk, exit_code); + tsk->signal->group_exit_task = tsk; + /* ignore all signals except SIGKILL, see prepare_signal() */ + tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); + } + spin_unlock_irq(&tsk->sighand->siglock); + if (unlikely(nr < 0)) + return nr; + + tsk->flags |= PF_DUMPCORE; + if (atomic_read(&mm->mm_users) == nr + 1) + goto done; + /* + * We should find and kill all tasks which use this mm, and we should + * count them correctly into ->nr_threads. We don't take tasklist + * lock, but this is safe wrt: + * + * fork: + * None of sub-threads can fork after zap_process(leader). All + * processes which were created before this point should be + * visible to zap_threads() because copy_process() adds the new + * process to the tail of init_task.tasks list, and lock/unlock + * of ->siglock provides a memory barrier. + * + * do_exit: + * The caller holds mm->mmap_sem. This means that the task which + * uses this mm can't pass exit_mm(), so it can't exit or clear + * its ->mm. + * + * de_thread: + * It does list_replace_rcu(&leader->tasks, ¤t->tasks), + * we must see either old or new leader, this does not matter. + * However, it can change p->sighand, so lock_task_sighand(p) + * must be used. Since p->mm != NULL and we hold ->mmap_sem + * it can't fail. + * + * Note also that "g" can be the old leader with ->mm == NULL + * and already unhashed and thus removed from ->thread_group. + * This is OK, __unhash_process()->list_del_rcu() does not + * clear the ->next pointer, we will find the new leader via + * next_thread(). + */ + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + if (g->flags & PF_KTHREAD) + continue; + p = g; + do { + if (p->mm) { + if (unlikely(p->mm == mm)) { + lock_task_sighand(p, &flags); + nr += zap_process(p, exit_code); + p->signal->flags = SIGNAL_GROUP_EXIT; + unlock_task_sighand(p, &flags); + } + break; + } + } while_each_thread(g, p); + } + rcu_read_unlock(); +done: + atomic_set(&core_state->nr_threads, nr); + return nr; +} + +static int coredump_wait(int exit_code, struct core_state *core_state) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + int core_waiters = -EBUSY; + + init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; + + down_write(&mm->mmap_sem); + if (!mm->core_state) + core_waiters = zap_threads(tsk, mm, core_state, exit_code); + up_write(&mm->mmap_sem); + + if (core_waiters > 0) { + struct core_thread *ptr; + + wait_for_completion(&core_state->startup); + /* + * Wait for all the threads to become inactive, so that + * all the thread context (extended register state, like + * fpu etc) gets copied to the memory. + */ + ptr = core_state->dumper.next; + while (ptr != NULL) { + wait_task_inactive(ptr->task, 0); + ptr = ptr->next; + } + } + + return core_waiters; +} + +static void coredump_finish(struct mm_struct *mm, bool core_dumped) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + spin_lock_irq(¤t->sighand->siglock); + if (core_dumped && !__fatal_signal_pending(current)) + current->signal->group_exit_code |= 0x80; + current->signal->group_exit_task = NULL; + current->signal->flags = SIGNAL_GROUP_EXIT; + spin_unlock_irq(¤t->sighand->siglock); + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + +static bool dump_interrupted(void) +{ + /* + * SIGKILL or freezing() interrupt the coredumping. Perhaps we + * can do try_to_freeze() and check __fatal_signal_pending(), + * but then we need to teach dump_write() to restart and clear + * TIF_SIGPENDING. + */ + return signal_pending(current); +} + +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe = file->private_data; + + pipe_lock(pipe); + pipe->readers++; + pipe->writers--; + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_unlock(pipe); + + /* + * We actually want wait_event_freezable() but then we need + * to clear TIF_SIGPENDING and improve dump_interrupted(). + */ + wait_event_interruptible(pipe->wait, pipe->readers == 1); + + pipe_lock(pipe); + pipe->readers--; + pipe->writers++; + pipe_unlock(pipe); +} + +/* + * umh_pipe_setup + * helper function to customize the process used + * to collect the core in userspace. Specifically + * it sets up a pipe and installs it as fd 0 (stdin) + * for the process. Returns 0 on success, or + * PTR_ERR on failure. + * Note that it also sets the core limit to 1. This + * is a special value that we use to trap recursive + * core dumps + */ +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ + struct file *files[2]; + struct coredump_params *cp = (struct coredump_params *)info->data; + int err = create_pipe_files(files, 0); + if (err) + return err; + + cp->file = files[1]; + + err = replace_fd(0, files[0], 0); + fput(files[0]); + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + + return err; +} + +void do_coredump(const siginfo_t *siginfo) +{ + struct core_state core_state; + struct core_name cn; + struct mm_struct *mm = current->mm; + struct linux_binfmt * binfmt; + const struct cred *old_cred; + struct cred *cred; + int retval = 0; + int flag = 0; + int ispipe; + struct files_struct *displaced; + bool need_nonrelative = false; + bool core_dumped = false; + static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .siginfo = siginfo, + .regs = signal_pt_regs(), + .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + */ + .mm_flags = mm->flags, + }; + + audit_core_dumps(siginfo->si_signo); + + binfmt = mm->binfmt; + if (!binfmt || !binfmt->core_dump) + goto fail; + if (!__get_dumpable(cprm.mm_flags)) + goto fail; + + cred = prepare_creds(); + if (!cred) + goto fail; + /* + * We cannot trust fsuid as being the "true" uid of the process + * nor do we know its entire history. We only know it was tainted + * so we dump it as root in mode 2, and only into a controlled + * environment (pipe handler or fully qualified path). + */ + if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { + /* Setuid core dump mode */ + flag = O_EXCL; /* Stop rewrite attacks */ + cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ + need_nonrelative = true; + } + + retval = coredump_wait(siginfo->si_signo, &core_state); + if (retval < 0) + goto fail_creds; + + old_cred = override_creds(cred); + + ispipe = format_corename(&cn, &cprm); + + if (ispipe) { + int dump_count; + char **helper_argv; + struct subprocess_info *sub_info; + + if (ispipe < 0) { + printk(KERN_WARNING "format_corename failed\n"); + printk(KERN_WARNING "Aborting core\n"); + goto fail_unlock; + } + + if (cprm.limit == 1) { + /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. + * + * Normally core limits are irrelevant to pipes, since + * we're not writing to the file system, but we use + * cprm.limit of 1 here as a special value, this is a + * consistent way to catch recursive crashes. + * We can still crash if the core_pattern binary sets + * RLIM_CORE = !1, but it runs as root, and can do + * lots of stupid things. + * + * Note that we use task_tgid_vnr here to grab the pid + * of the process group leader. That way we get the + * right pid if a thread in a multi-threaded + * core_pattern process dies. + */ + printk(KERN_WARNING + "Process %d(%s) has RLIMIT_CORE set to 1\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Aborting core\n"); + goto fail_unlock; + } + cprm.limit = RLIM_INFINITY; + + dump_count = atomic_inc_return(&core_dump_count); + if (core_pipe_limit && (core_pipe_limit < dump_count)) { + printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Skipping core dump\n"); + goto fail_dropcount; + } + + helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL); + if (!helper_argv) { + printk(KERN_WARNING "%s failed to allocate memory\n", + __func__); + goto fail_dropcount; + } + + retval = -ENOMEM; + sub_info = call_usermodehelper_setup(helper_argv[0], + helper_argv, NULL, GFP_KERNEL, + umh_pipe_setup, NULL, &cprm); + if (sub_info) + retval = call_usermodehelper_exec(sub_info, + UMH_WAIT_EXEC); + + argv_free(helper_argv); + if (retval) { + printk(KERN_INFO "Core dump to |%s pipe failed\n", + cn.corename); + goto close_fail; + } + } else { + struct inode *inode; + + if (cprm.limit < binfmt->min_coredump) + goto fail_unlock; + + if (need_nonrelative && cn.corename[0] != '/') { + printk(KERN_WARNING "Pid %d(%s) can only dump core "\ + "to fully qualified path!\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Skipping core dump\n"); + goto fail_unlock; + } + + cprm.file = filp_open(cn.corename, + O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, + 0600); + if (IS_ERR(cprm.file)) + goto fail_unlock; + + inode = file_inode(cprm.file); + if (inode->i_nlink > 1) + goto close_fail; + if (d_unhashed(cprm.file->f_path.dentry)) + goto close_fail; + /* + * AK: actually i see no reason to not allow this for named + * pipes etc, but keep the previous behaviour for now. + */ + if (!S_ISREG(inode->i_mode)) + goto close_fail; + /* + * Dont allow local users get cute and trick others to coredump + * into their pre-created files. + */ + if (!uid_eq(inode->i_uid, current_fsuid())) + goto close_fail; + if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) + goto close_fail; + if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + goto close_fail; + } + + /* get us an unshared descriptor table; almost always a no-op */ + retval = unshare_files(&displaced); + if (retval) + goto close_fail; + if (displaced) + put_files_struct(displaced); + if (!dump_interrupted()) { + file_start_write(cprm.file); + core_dumped = binfmt->core_dump(&cprm); + file_end_write(cprm.file); + } + if (ispipe && core_pipe_limit) + wait_for_dump_helpers(cprm.file); +close_fail: + if (cprm.file) + filp_close(cprm.file, NULL); +fail_dropcount: + if (ispipe) + atomic_dec(&core_dump_count); +fail_unlock: + kfree(cn.corename); + coredump_finish(mm, core_dumped); + revert_creds(old_cred); +fail_creds: + put_cred(cred); +fail: + return; +} + +/* + * Core dumping helper functions. These are the only things you should + * do on a core-file: use only these functions to write out all the + * necessary info. + */ +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +{ + struct file *file = cprm->file; + loff_t pos = file->f_pos; + ssize_t n; + if (cprm->written + nr > cprm->limit) + return 0; + while (nr) { + if (dump_interrupted()) + return 0; + n = __kernel_write(file, addr, nr, &pos); + if (n <= 0) + return 0; + file->f_pos = pos; + cprm->written += n; + nr -= n; + } + return 1; +} +EXPORT_SYMBOL(dump_emit); + +int dump_skip(struct coredump_params *cprm, size_t nr) +{ + static char zeroes[PAGE_SIZE]; + struct file *file = cprm->file; + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (cprm->written + nr > cprm->limit) + return 0; + if (dump_interrupted() || + file->f_op->llseek(file, nr, SEEK_CUR) < 0) + return 0; + cprm->written += nr; + return 1; + } else { + while (nr > PAGE_SIZE) { + if (!dump_emit(cprm, zeroes, PAGE_SIZE)) + return 0; + nr -= PAGE_SIZE; + } + return dump_emit(cprm, zeroes, nr); + } +} +EXPORT_SYMBOL(dump_skip); + +int dump_align(struct coredump_params *cprm, int align) +{ + unsigned mod = cprm->written & (align - 1); + if (align & (align - 1)) + return 0; + return mod ? dump_skip(cprm, align - mod) : 1; +} +EXPORT_SYMBOL(dump_align); diff --git a/kernel/fs/cramfs/Kconfig b/kernel/fs/cramfs/Kconfig new file mode 100644 index 000000000..11b29d491 --- /dev/null +++ b/kernel/fs/cramfs/Kconfig @@ -0,0 +1,22 @@ +config CRAMFS + tristate "Compressed ROM file system support (cramfs) (OBSOLETE)" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for CramFs (Compressed ROM File + System). CramFs is designed to be a simple, small, and compressed + file system for ROM based embedded systems. CramFs is read-only, + limited to 256MB file systems (with 16MB files), and doesn't support + 16/32 bits uid/gid, hard links and timestamps. + + See and + for further information. + + To compile this as a module, choose M here: the module will be called + cramfs. Note that the root file system (the one containing the + directory /) cannot be compiled as a module. + + This filesystem is obsoleted by SquashFS, which is much better + in terms of performance and features. + + If unsure, say N. diff --git a/kernel/fs/cramfs/Makefile b/kernel/fs/cramfs/Makefile new file mode 100644 index 000000000..92ebb464a --- /dev/null +++ b/kernel/fs/cramfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux cramfs routines. +# + +obj-$(CONFIG_CRAMFS) += cramfs.o + +cramfs-objs := inode.o uncompress.o diff --git a/kernel/fs/cramfs/README b/kernel/fs/cramfs/README new file mode 100644 index 000000000..445d1c2d7 --- /dev/null +++ b/kernel/fs/cramfs/README @@ -0,0 +1,168 @@ +Notes on Filesystem Layout +-------------------------- + +These notes describe what mkcramfs generates. Kernel requirements are +a bit looser, e.g. it doesn't care if the items are +swapped around (though it does care that directory entries (inodes) in +a given directory are contiguous, as this is used by readdir). + +All data is currently in host-endian format; neither mkcramfs nor the +kernel ever do swabbing. (See section `Block Size' below.) + +: + + + + +: struct cramfs_super (see cramfs_fs.h). + +: + For each file: + struct cramfs_inode (see cramfs_fs.h). + Filename. Not generally null-terminated, but it is + null-padded to a multiple of 4 bytes. + +The order of inode traversal is described as "width-first" (not to be +confused with breadth-first); i.e. like depth-first but listing all of +a directory's entries before recursing down its subdirectories: the +same order as `ls -AUR' (but without the /^\..*:$/ directory header +lines); put another way, the same order as `find -type d -exec +ls -AU1 {} \;'. + +Beginning in 2.4.7, directory entries are sorted. This optimization +allows cramfs_lookup to return more quickly when a filename does not +exist, speeds up user-space directory sorts, etc. + +: + One for each file that's either a symlink or a + regular file of non-zero st_size. + +: + nblocks * + (where nblocks = (st_size - 1) / blksize + 1) + nblocks * + padding to multiple of 4 bytes + +The i'th for a file stores the byte offset of the +*end* of the i'th (i.e. one past the last byte, which is the +same as the start of the (i+1)'th if there is one). The first + immediately follows the last for the file. +s are each 32 bits long. + +The order of 's is a depth-first descent of the directory +tree, i.e. the same order as `find -size +0 \( -type f -o -type l \) +-print'. + + +: The i'th is the output of zlib's compress function +applied to the i'th blksize-sized chunk of the input data. +(For the last of the file, the input may of course be smaller.) +Each may be a different size. (See above.) +s are merely byte-aligned, not generally u32-aligned. + + +Holes +----- + +This kernel supports cramfs holes (i.e. [efficient representation of] +blocks in uncompressed data consisting entirely of NUL bytes), but by +default mkcramfs doesn't test for & create holes, since cramfs in +kernels up to at least 2.3.39 didn't support holes. Run mkcramfs +with -z if you want it to create files that can have holes in them. + + +Tools +----- + +The cramfs user-space tools, including mkcramfs and cramfsck, are +located at . + + +Future Development +================== + +Block Size +---------- + +(Block size in cramfs refers to the size of input data that is +compressed at a time. It's intended to be somewhere around +PAGE_CACHE_SIZE for cramfs_readpage's convenience.) + +The superblock ought to indicate the block size that the fs was +written for, since comments in indicate that +PAGE_CACHE_SIZE may grow in future (if I interpret the comment +correctly). + +Currently, mkcramfs #define's PAGE_CACHE_SIZE as 4096 and uses that +for blksize, whereas Linux-2.3.39 uses its PAGE_CACHE_SIZE, which in +turn is defined as PAGE_SIZE (which can be as large as 32KB on arm). +This discrepancy is a bug, though it's not clear which should be +changed. + +One option is to change mkcramfs to take its PAGE_CACHE_SIZE from +. Personally I don't like this option, but it does +require the least amount of change: just change `#define +PAGE_CACHE_SIZE (4096)' to `#include '. The disadvantage +is that the generated cramfs cannot always be shared between different +kernels, not even necessarily kernels of the same architecture if +PAGE_CACHE_SIZE is subject to change between kernel versions +(currently possible with arm and ia64). + +The remaining options try to make cramfs more sharable. + +One part of that is addressing endianness. The two options here are +`always use little-endian' (like ext2fs) or `writer chooses +endianness; kernel adapts at runtime'. Little-endian wins because of +code simplicity and little CPU overhead even on big-endian machines. + +The cost of swabbing is changing the code to use the le32_to_cpu +etc. macros as used by ext2fs. We don't need to swab the compressed +data, only the superblock, inodes and block pointers. + + +The other part of making cramfs more sharable is choosing a block +size. The options are: + + 1. Always 4096 bytes. + + 2. Writer chooses blocksize; kernel adapts but rejects blocksize > + PAGE_CACHE_SIZE. + + 3. Writer chooses blocksize; kernel adapts even to blocksize > + PAGE_CACHE_SIZE. + +It's easy enough to change the kernel to use a smaller value than +PAGE_CACHE_SIZE: just make cramfs_readpage read multiple blocks. + +The cost of option 1 is that kernels with a larger PAGE_CACHE_SIZE +value don't get as good compression as they can. + +The cost of option 2 relative to option 1 is that the code uses +variables instead of #define'd constants. The gain is that people +with kernels having larger PAGE_CACHE_SIZE can make use of that if +they don't mind their cramfs being inaccessible to kernels with +smaller PAGE_CACHE_SIZE values. + +Option 3 is easy to implement if we don't mind being CPU-inefficient: +e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which +must be no larger than 32KB) and discard what it doesn't need. +Getting readpage to read into all the covered pages is harder. + +The main advantage of option 3 over 1, 2, is better compression. The +cost is greater complexity. Probably not worth it, but I hope someone +will disagree. (If it is implemented, then I'll re-use that code in +e2compr.) + + +Another cost of 2 and 3 over 1 is making mkcramfs use a different +block size, but that just means adding and parsing a -b option. + + +Inode Size +---------- + +Given that cramfs will probably be used for CDs etc. as well as just +silicon ROMs, it might make sense to expand the inode a little from +its current 12 bytes. Inodes other than the root inode are followed +by filename, so the expansion doesn't even have to be a multiple of 4 +bytes. diff --git a/kernel/fs/cramfs/inode.c b/kernel/fs/cramfs/inode.c new file mode 100644 index 000000000..355c522f3 --- /dev/null +++ b/kernel/fs/cramfs/inode.c @@ -0,0 +1,611 @@ +/* + * Compressed rom filesystem for Linux. + * + * Copyright (C) 1999 Linus Torvalds. + * + * This file is released under the GPL. + */ + +/* + * These are the VFS interfaces to the compressed rom filesystem. + * The actual compression is based on zlib, see the other files. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * cramfs super-block data in memory + */ +struct cramfs_sb_info { + unsigned long magic; + unsigned long size; + unsigned long blocks; + unsigned long files; + unsigned long flags; +}; + +static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static const struct super_operations cramfs_ops; +static const struct inode_operations cramfs_dir_inode_operations; +static const struct file_operations cramfs_directory_operations; +static const struct address_space_operations cramfs_aops; + +static DEFINE_MUTEX(read_mutex); + + +/* These macros may change in future, to provide better st_ino semantics. */ +#define OFFSET(x) ((x)->i_ino) + +static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset) +{ + if (!cino->offset) + return offset + 1; + if (!cino->size) + return offset + 1; + + /* + * The file mode test fixes buggy mkcramfs implementations where + * cramfs_inode->offset is set to a non zero value for entries + * which did not contain data, like devices node and fifos. + */ + switch (cino->mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + return cino->offset << 2; + default: + break; + } + return offset + 1; +} + +static struct inode *get_cramfs_inode(struct super_block *sb, + const struct cramfs_inode *cramfs_inode, unsigned int offset) +{ + struct inode *inode; + static struct timespec zerotime; + + inode = iget_locked(sb, cramino(cramfs_inode, offset)); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + switch (cramfs_inode->mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + break; + case S_IFDIR: + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + break; + default: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); + } + + inode->i_mode = cramfs_inode->mode; + i_uid_write(inode, cramfs_inode->uid); + i_gid_write(inode, cramfs_inode->gid); + + /* if the lower 2 bits are zero, the inode contains data */ + if (!(inode->i_ino & 3)) { + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + } + + /* Struct copy intentional */ + inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + /* inode->i_nlink is left 1 - arguably wrong for directories, + but it's the best we can do without reading the directory + contents. 1 yields the right result in GNU find, even + without -noleaf option. */ + + unlock_new_inode(inode); + + return inode; +} + +/* + * We have our own block cache: don't fill up the buffer cache + * with the rom-image, because the way the filesystem is set + * up the accesses should be fairly regular and cached in the + * page cache and dentry tree anyway.. + * + * This also acts as a way to guarantee contiguous areas of up to + * BLKS_PER_BUF*PAGE_CACHE_SIZE, so that the caller doesn't need to + * worry about end-of-buffer issues even when decompressing a full + * page cache. + */ +#define READ_BUFFERS (2) +/* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ +#define NEXT_BUFFER(_ix) ((_ix) ^ 1) + +/* + * BLKS_PER_BUF_SHIFT should be at least 2 to allow for "compressed" + * data that takes up more space than the original and with unlucky + * alignment. + */ +#define BLKS_PER_BUF_SHIFT (2) +#define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT) +#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_CACHE_SIZE) + +static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; +static unsigned buffer_blocknr[READ_BUFFERS]; +static struct super_block *buffer_dev[READ_BUFFERS]; +static int next_buffer; + +/* + * Returns a pointer to a buffer containing at least LEN bytes of + * filesystem starting at byte offset OFFSET into the filesystem. + */ +static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) +{ + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct page *pages[BLKS_PER_BUF]; + unsigned i, blocknr, buffer; + unsigned long devsize; + char *data; + + if (!len) + return NULL; + blocknr = offset >> PAGE_CACHE_SHIFT; + offset &= PAGE_CACHE_SIZE - 1; + + /* Check if an existing buffer already has the data.. */ + for (i = 0; i < READ_BUFFERS; i++) { + unsigned int blk_offset; + + if (buffer_dev[i] != sb) + continue; + if (blocknr < buffer_blocknr[i]) + continue; + blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT; + blk_offset += offset; + if (blk_offset + len > BUFFER_SIZE) + continue; + return read_buffers[i] + blk_offset; + } + + devsize = mapping->host->i_size >> PAGE_CACHE_SHIFT; + + /* Ok, read in BLKS_PER_BUF pages completely first. */ + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = NULL; + + if (blocknr + i < devsize) { + page = read_mapping_page(mapping, blocknr + i, NULL); + /* synchronous error? */ + if (IS_ERR(page)) + page = NULL; + } + pages[i] = page; + } + + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = pages[i]; + + if (page) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + /* asynchronous error */ + page_cache_release(page); + pages[i] = NULL; + } + } + } + + buffer = next_buffer; + next_buffer = NEXT_BUFFER(buffer); + buffer_blocknr[buffer] = blocknr; + buffer_dev[buffer] = sb; + + data = read_buffers[buffer]; + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = pages[i]; + + if (page) { + memcpy(data, kmap(page), PAGE_CACHE_SIZE); + kunmap(page); + page_cache_release(page); + } else + memset(data, 0, PAGE_CACHE_SIZE); + data += PAGE_CACHE_SIZE; + } + return read_buffers[buffer] + offset; +} + +static void cramfs_kill_sb(struct super_block *sb) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + + kill_block_super(sb); + kfree(sbi); +} + +static int cramfs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_RDONLY; + return 0; +} + +static int cramfs_fill_super(struct super_block *sb, void *data, int silent) +{ + int i; + struct cramfs_super super; + unsigned long root_offset; + struct cramfs_sb_info *sbi; + struct inode *root; + + sb->s_flags |= MS_RDONLY; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Invalidate the read buffers on mount: think disk change.. */ + mutex_lock(&read_mutex); + for (i = 0; i < READ_BUFFERS; i++) + buffer_blocknr[i] = -1; + + /* Read the first block and get the superblock from it */ + memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super)); + mutex_unlock(&read_mutex); + + /* Do sanity checks on the superblock */ + if (super.magic != CRAMFS_MAGIC) { + /* check for wrong endianness */ + if (super.magic == CRAMFS_MAGIC_WEND) { + if (!silent) + pr_err("wrong endianness\n"); + return -EINVAL; + } + + /* check at 512 byte offset */ + mutex_lock(&read_mutex); + memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super)); + mutex_unlock(&read_mutex); + if (super.magic != CRAMFS_MAGIC) { + if (super.magic == CRAMFS_MAGIC_WEND && !silent) + pr_err("wrong endianness\n"); + else if (!silent) + pr_err("wrong magic\n"); + return -EINVAL; + } + } + + /* get feature flags first */ + if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { + pr_err("unsupported filesystem features\n"); + return -EINVAL; + } + + /* Check that the root inode is in a sane state */ + if (!S_ISDIR(super.root.mode)) { + pr_err("root is not a directory\n"); + return -EINVAL; + } + /* correct strange, hard-coded permissions of mkcramfs */ + super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + + root_offset = super.root.offset << 2; + if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { + sbi->size = super.size; + sbi->blocks = super.fsid.blocks; + sbi->files = super.fsid.files; + } else { + sbi->size = 1<<28; + sbi->blocks = 0; + sbi->files = 0; + } + sbi->magic = super.magic; + sbi->flags = super.flags; + if (root_offset == 0) + pr_info("empty filesystem"); + else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && + ((root_offset != sizeof(struct cramfs_super)) && + (root_offset != 512 + sizeof(struct cramfs_super)))) + { + pr_err("bad root offset %lu\n", root_offset); + return -EINVAL; + } + + /* Set it all up.. */ + sb->s_op = &cramfs_ops; + root = get_cramfs_inode(sb, &super.root, 0); + if (IS_ERR(root)) + return PTR_ERR(root); + sb->s_root = d_make_root(root); + if (!sb->s_root) + return -ENOMEM; + return 0; +} + +static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = CRAMFS_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_blocks = CRAMFS_SB(sb)->blocks; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = CRAMFS_SB(sb)->files; + buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = CRAMFS_MAXPATHLEN; + return 0; +} + +/* + * Read a cramfs directory entry. + */ +static int cramfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + char *buf; + unsigned int offset; + + /* Offset within the thing. */ + if (ctx->pos >= inode->i_size) + return 0; + offset = ctx->pos; + /* Directory entries are always 4-byte aligned */ + if (offset & 3) + return -EINVAL; + + buf = kmalloc(CRAMFS_MAXPATHLEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + while (offset < inode->i_size) { + struct cramfs_inode *de; + unsigned long nextoffset; + char *name; + ino_t ino; + umode_t mode; + int namelen; + + mutex_lock(&read_mutex); + de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); + name = (char *)(de+1); + + /* + * Namelengths on disk are shifted by two + * and the name padded out to 4-byte boundaries + * with zeroes. + */ + namelen = de->namelen << 2; + memcpy(buf, name, namelen); + ino = cramino(de, OFFSET(inode) + offset); + mode = de->mode; + mutex_unlock(&read_mutex); + nextoffset = offset + sizeof(*de) + namelen; + for (;;) { + if (!namelen) { + kfree(buf); + return -EIO; + } + if (buf[namelen-1]) + break; + namelen--; + } + if (!dir_emit(ctx, buf, namelen, ino, mode >> 12)) + break; + + ctx->pos = offset = nextoffset; + } + kfree(buf); + return 0; +} + +/* + * Lookup and fill in the inode data.. + */ +static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + unsigned int offset = 0; + struct inode *inode = NULL; + int sorted; + + mutex_lock(&read_mutex); + sorted = CRAMFS_SB(dir->i_sb)->flags & CRAMFS_FLAG_SORTED_DIRS; + while (offset < dir->i_size) { + struct cramfs_inode *de; + char *name; + int namelen, retval; + int dir_off = OFFSET(dir) + offset; + + de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN); + name = (char *)(de+1); + + /* Try to take advantage of sorted directories */ + if (sorted && (dentry->d_name.name[0] < name[0])) + break; + + namelen = de->namelen << 2; + offset += sizeof(*de) + namelen; + + /* Quick check that the name is roughly the right length */ + if (((dentry->d_name.len + 3) & ~3) != namelen) + continue; + + for (;;) { + if (!namelen) { + inode = ERR_PTR(-EIO); + goto out; + } + if (name[namelen-1]) + break; + namelen--; + } + if (namelen != dentry->d_name.len) + continue; + retval = memcmp(dentry->d_name.name, name, namelen); + if (retval > 0) + continue; + if (!retval) { + inode = get_cramfs_inode(dir->i_sb, de, dir_off); + break; + } + /* else (retval < 0) */ + if (sorted) + break; + } +out: + mutex_unlock(&read_mutex); + if (IS_ERR(inode)) + return ERR_CAST(inode); + d_add(dentry, inode); + return NULL; +} + +static int cramfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + u32 maxblock; + int bytes_filled; + void *pgdata; + + maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + bytes_filled = 0; + pgdata = kmap(page); + + if (page->index < maxblock) { + struct super_block *sb = inode->i_sb; + u32 blkptr_offset = OFFSET(inode) + page->index*4; + u32 start_offset, compr_len; + + start_offset = OFFSET(inode) + maxblock*4; + mutex_lock(&read_mutex); + if (page->index) + start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, + 4); + compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - + start_offset); + mutex_unlock(&read_mutex); + + if (compr_len == 0) + ; /* hole */ + else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { + pr_err("bad compressed blocksize %u\n", + compr_len); + goto err; + } else { + mutex_lock(&read_mutex); + bytes_filled = cramfs_uncompress_block(pgdata, + PAGE_CACHE_SIZE, + cramfs_read(sb, start_offset, compr_len), + compr_len); + mutex_unlock(&read_mutex); + if (unlikely(bytes_filled < 0)) + goto err; + } + } + + memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + return 0; + +err: + kunmap(page); + ClearPageUptodate(page); + SetPageError(page); + unlock_page(page); + return 0; +} + +static const struct address_space_operations cramfs_aops = { + .readpage = cramfs_readpage +}; + +/* + * Our operations: + */ + +/* + * A directory can only readdir + */ +static const struct file_operations cramfs_directory_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = cramfs_readdir, +}; + +static const struct inode_operations cramfs_dir_inode_operations = { + .lookup = cramfs_lookup, +}; + +static const struct super_operations cramfs_ops = { + .remount_fs = cramfs_remount, + .statfs = cramfs_statfs, +}; + +static struct dentry *cramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); +} + +static struct file_system_type cramfs_fs_type = { + .owner = THIS_MODULE, + .name = "cramfs", + .mount = cramfs_mount, + .kill_sb = cramfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("cramfs"); + +static int __init init_cramfs_fs(void) +{ + int rv; + + rv = cramfs_uncompress_init(); + if (rv < 0) + return rv; + rv = register_filesystem(&cramfs_fs_type); + if (rv < 0) + cramfs_uncompress_exit(); + return rv; +} + +static void __exit exit_cramfs_fs(void) +{ + cramfs_uncompress_exit(); + unregister_filesystem(&cramfs_fs_type); +} + +module_init(init_cramfs_fs) +module_exit(exit_cramfs_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/cramfs/internal.h b/kernel/fs/cramfs/internal.h new file mode 100644 index 000000000..349d71272 --- /dev/null +++ b/kernel/fs/cramfs/internal.h @@ -0,0 +1,4 @@ +/* Uncompression interfaces to the underlying zlib */ +int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen); +int cramfs_uncompress_init(void); +void cramfs_uncompress_exit(void); diff --git a/kernel/fs/cramfs/uncompress.c b/kernel/fs/cramfs/uncompress.c new file mode 100644 index 000000000..ec4f1d4fd --- /dev/null +++ b/kernel/fs/cramfs/uncompress.c @@ -0,0 +1,79 @@ +/* + * uncompress.c + * + * (C) Copyright 1999 Linus Torvalds + * + * cramfs interfaces to the uncompression library. There's really just + * three entrypoints: + * + * - cramfs_uncompress_init() - called to initialize the thing. + * - cramfs_uncompress_exit() - tell me when you're done + * - cramfs_uncompress_block() - uncompress a block. + * + * NOTE NOTE NOTE! The uncompression is entirely single-threaded. We + * only have one stream, and we'll initialize it only once even if it + * then is used by multiple filesystems. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "internal.h" + +static z_stream stream; +static int initialized; + +/* Returns length of decompressed data. */ +int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) +{ + int err; + + stream.next_in = src; + stream.avail_in = srclen; + + stream.next_out = dst; + stream.avail_out = dstlen; + + err = zlib_inflateReset(&stream); + if (err != Z_OK) { + pr_err("zlib_inflateReset error %d\n", err); + zlib_inflateEnd(&stream); + zlib_inflateInit(&stream); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto err; + return stream.total_out; + +err: + pr_err("Error %d while decompressing!\n", err); + pr_err("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); + return -EIO; +} + +int cramfs_uncompress_init(void) +{ + if (!initialized++) { + stream.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!stream.workspace) { + initialized = 0; + return -ENOMEM; + } + stream.next_in = NULL; + stream.avail_in = 0; + zlib_inflateInit(&stream); + } + return 0; +} + +void cramfs_uncompress_exit(void) +{ + if (!--initialized) { + zlib_inflateEnd(&stream); + vfree(stream.workspace); + } +} diff --git a/kernel/fs/dax.c b/kernel/fs/dax.c new file mode 100644 index 000000000..6f65f00e5 --- /dev/null +++ b/kernel/fs/dax.c @@ -0,0 +1,550 @@ +/* + * fs/dax.c - Direct Access filesystem code + * Copyright (c) 2013-2014 Intel Corporation + * Author: Matthew Wilcox + * Author: Ross Zwisler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int dax_clear_blocks(struct inode *inode, sector_t block, long size) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + sector_t sector = block << (inode->i_blkbits - 9); + + might_sleep(); + do { + void *addr; + unsigned long pfn; + long count; + + count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + if (count < 0) + return count; + BUG_ON(size < count); + while (count > 0) { + unsigned pgsz = PAGE_SIZE - offset_in_page(addr); + if (pgsz > count) + pgsz = count; + if (pgsz < PAGE_SIZE) + memset(addr, 0, pgsz); + else + clear_page(addr); + addr += pgsz; + size -= pgsz; + count -= pgsz; + BUG_ON(pgsz & 511); + sector += pgsz / 512; + cond_resched(); + } + } while (size); + + return 0; +} +EXPORT_SYMBOL_GPL(dax_clear_blocks); + +static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +{ + unsigned long pfn; + sector_t sector = bh->b_blocknr << (blkbits - 9); + return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); +} + +static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, + loff_t end) +{ + loff_t final = end - pos + first; /* The final byte of the buffer */ + + if (first > 0) + memset(addr, 0, first); + if (final < size) + memset(addr + final, 0, size - final); +} + +static bool buffer_written(struct buffer_head *bh) +{ + return buffer_mapped(bh) && !buffer_unwritten(bh); +} + +/* + * When ext4 encounters a hole, it returns without modifying the buffer_head + * which means that we can't trust b_size. To cope with this, we set b_state + * to 0 before calling get_block and, if any bit is set, we know we can trust + * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is + * and would save us time calling get_block repeatedly. + */ +static bool buffer_size_valid(struct buffer_head *bh) +{ + return bh->b_state != 0; +} + +static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, + loff_t start, loff_t end, get_block_t get_block, + struct buffer_head *bh) +{ + ssize_t retval = 0; + loff_t pos = start; + loff_t max = start; + loff_t bh_max = start; + void *addr; + bool hole = false; + + if (iov_iter_rw(iter) != WRITE) + end = min(end, i_size_read(inode)); + + while (pos < end) { + unsigned len; + if (pos == max) { + unsigned blkbits = inode->i_blkbits; + sector_t block = pos >> blkbits; + unsigned first = pos - (block << blkbits); + long size; + + if (pos == bh_max) { + bh->b_size = PAGE_ALIGN(end - pos); + bh->b_state = 0; + retval = get_block(inode, block, bh, + iov_iter_rw(iter) == WRITE); + if (retval) + break; + if (!buffer_size_valid(bh)) + bh->b_size = 1 << blkbits; + bh_max = pos - first + bh->b_size; + } else { + unsigned done = bh->b_size - + (bh_max - (pos - first)); + bh->b_blocknr += done >> blkbits; + bh->b_size -= done; + } + + hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); + if (hole) { + addr = NULL; + size = bh->b_size - first; + } else { + retval = dax_get_addr(bh, &addr, blkbits); + if (retval < 0) + break; + if (buffer_unwritten(bh) || buffer_new(bh)) + dax_new_buf(addr, retval, first, pos, + end); + addr += first; + size = retval - first; + } + max = min(pos + size, end); + } + + if (iov_iter_rw(iter) == WRITE) + len = copy_from_iter(addr, max - pos, iter); + else if (!hole) + len = copy_to_iter(addr, max - pos, iter); + else + len = iov_iter_zero(max - pos, iter); + + if (!len) + break; + + pos += len; + addr += len; + } + + return (pos == start) ? retval : pos - start; +} + +/** + * dax_do_io - Perform I/O to a DAX file + * @iocb: The control block for this I/O + * @inode: The file which the I/O is directed at + * @iter: The addresses to do I/O from or to + * @pos: The file offset where the I/O starts + * @get_block: The filesystem method used to translate file offsets to blocks + * @end_io: A filesystem callback for I/O completion + * @flags: See below + * + * This function uses the same locking scheme as do_blockdev_direct_IO: + * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the + * caller for writes. For reads, we take and release the i_mutex ourselves. + * If DIO_LOCKING is not set, the filesystem takes care of its own locking. + * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O + * is in progress. + */ +ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, + struct iov_iter *iter, loff_t pos, get_block_t get_block, + dio_iodone_t end_io, int flags) +{ + struct buffer_head bh; + ssize_t retval = -EINVAL; + loff_t end = pos + iov_iter_count(iter); + + memset(&bh, 0, sizeof(bh)); + + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { + struct address_space *mapping = inode->i_mapping; + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, pos, end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + /* Protects against truncate */ + inode_dio_begin(inode); + + retval = dax_io(inode, iter, pos, end, get_block, &bh); + + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) + mutex_unlock(&inode->i_mutex); + + if ((retval > 0) && end_io) + end_io(iocb, pos, retval, bh.b_private); + + inode_dio_end(inode); + out: + return retval; +} +EXPORT_SYMBOL_GPL(dax_do_io); + +/* + * The user has performed a load from a hole in the file. Allocating + * a new page in the file would cause excessive storage usage for + * workloads with sparse files. We allocate a page cache page instead. + * We'll kick it out of the page cache if it's ever written to, + * otherwise it will simply fall out of the page cache under memory + * pressure without ever having been dirtied. + */ +static int dax_load_hole(struct address_space *mapping, struct page *page, + struct vm_fault *vmf) +{ + unsigned long size; + struct inode *inode = mapping->host; + if (!page) + page = find_or_create_page(mapping, vmf->pgoff, + GFP_KERNEL | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + /* Recheck i_size under page lock to avoid truncate race */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static int copy_user_bh(struct page *to, struct buffer_head *bh, + unsigned blkbits, unsigned long vaddr) +{ + void *vfrom, *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) + return -EIO; + vto = kmap_atomic(to); + copy_user_page(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + return 0; +} + +static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping = inode->i_mapping; + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + unsigned long vaddr = (unsigned long)vmf->virtual_address; + void *addr; + unsigned long pfn; + pgoff_t size; + int error; + + i_mmap_lock_read(mapping); + + /* + * Check truncate didn't happen while we were allocating a block. + * If it did, this block may or may not be still allocated to the + * file. We can't tell the filesystem to free it because we can't + * take i_mutex here. In the worst case, the file still has blocks + * allocated past the end of the file. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + error = -EIO; + goto out; + } + + error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); + if (error < 0) + goto out; + if (error < PAGE_SIZE) { + error = -EIO; + goto out; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) + clear_page(addr); + + error = vm_insert_mixed(vma, vaddr, pfn); + + out: + i_mmap_unlock_read(mapping); + + if (bh->b_end_io) + bh->b_end_io(bh, 1); + + return error; +} + +static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head bh; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned blkbits = inode->i_blkbits; + sector_t block; + pgoff_t size; + int error; + int major = 0; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_size = PAGE_SIZE; + + repeat: + page = find_get_page(mapping, vmf->pgoff); + if (page) { + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return VM_FAULT_RETRY; + } + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + /* + * We have a struct page covering a hole in the file + * from a read fault and we've raced with a truncate + */ + error = -EIO; + goto unlock_page; + } + } + + error = get_block(inode, block, &bh, 0); + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; /* fs corruption? */ + if (error) + goto unlock_page; + + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_page; + } else { + return dax_load_hole(mapping, page, vmf); + } + } + + if (vmf->cow_page) { + struct page *new_page = vmf->cow_page; + if (buffer_written(&bh)) + error = copy_user_bh(new_page, &bh, blkbits, vaddr); + else + clear_user_highpage(new_page, vaddr); + if (error) + goto unlock_page; + vmf->page = page; + if (!page) { + i_mmap_lock_read(mapping); + /* Check we didn't race with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + if (vmf->pgoff >= size) { + i_mmap_unlock_read(mapping); + error = -EIO; + goto out; + } + } + return VM_FAULT_LOCKED; + } + + /* Check we didn't race with a read fault installing a new page */ + if (!page && major) + page = find_lock_page(mapping, vmf->pgoff); + + if (page) { + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_CACHE_SIZE, 0); + delete_from_page_cache(page); + unlock_page(page); + page_cache_release(page); + } + + error = dax_insert_mapping(inode, &bh, vma, vmf); + + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if ((error < 0) && (error != -EBUSY)) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; + + unlock_page: + if (page) { + unlock_page(page); + page_cache_release(page); + } + goto out; +} + +/** + * dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. + */ +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = do_dax_fault(vma, vmf, get_block); + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_fault); + +/** + * dax_pfn_mkwrite - handle first write to DAX page + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * + */ +int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + sb_end_pagefault(sb); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); + +/** + * dax_zero_page_range - zero a range within a page of a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @length: The number of bytes to zero + * @get_block: The filesystem method used to translate file offsets to blocks + * + * This function can be called by a filesystem when it is zeroing part of a + * page in a DAX file. This is intended for hole-punch operations. If + * you are truncating a file, the helper function dax_truncate_page() may be + * more convenient. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, + get_block_t get_block) +{ + struct buffer_head bh; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + int err; + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + BUG_ON((offset + length) > PAGE_CACHE_SIZE); + + memset(&bh, 0, sizeof(bh)); + bh.b_size = PAGE_CACHE_SIZE; + err = get_block(inode, index, &bh, 0); + if (err < 0) + return err; + if (buffer_written(&bh)) { + void *addr; + err = dax_get_addr(&bh, &addr, inode->i_blkbits); + if (err < 0) + return err; + memset(addr + offset, 0, length); + } + + return 0; +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating a DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + unsigned length = PAGE_CACHE_ALIGN(from) - from; + return dax_zero_page_range(inode, from, length, get_block); +} +EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/kernel/fs/dcache.c b/kernel/fs/dcache.c new file mode 100644 index 000000000..e1d20e11e --- /dev/null +++ b/kernel/fs/dcache.c @@ -0,0 +1,3459 @@ +/* + * fs/dcache.c + * + * Complete reimplementation + * (C) 1997 Thomas Schoebel-Theuer, + * with heavy changes by Linus Torvalds + */ + +/* + * Notes on the allocation strategy: + * + * The dcache is a master of the icache - whenever a dcache entry + * exists, the inode will always exist. "iput()" is done either when + * the dcache entry is deleted or garbage collected. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "mount.h" + +/* + * Usage: + * dcache->d_inode->i_lock protects: + * - i_dentry, d_u.d_alias, d_inode of aliases + * dcache_hash_bucket lock protects: + * - the dcache hash table + * s_anon bl list spinlock protects: + * - the s_anon list (see __d_drop) + * dentry->d_sb->s_dentry_lru_lock protects: + * - the dcache lru lists and counters + * d_lock protects: + * - d_flags + * - d_name + * - d_lru + * - d_count + * - d_unhashed() + * - d_parent and d_subdirs + * - childrens' d_child and d_parent + * - d_u.d_alias, d_inode + * + * Ordering: + * dentry->d_inode->i_lock + * dentry->d_lock + * dentry->d_sb->s_dentry_lru_lock + * dcache_hash_bucket lock + * s_anon lock + * + * If there is an ancestor relationship: + * dentry->d_parent->...->d_parent->d_lock + * ... + * dentry->d_parent->d_lock + * dentry->d_lock + * + * If no ancestor relationship: + * if (dentry1 < dentry2) + * dentry1->d_lock + * dentry2->d_lock + */ +int sysctl_vfs_cache_pressure __read_mostly = 100; +EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + +__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); + +EXPORT_SYMBOL(rename_lock); + +static struct kmem_cache *dentry_cache __read_mostly; + +/* + * This is the single most critical data structure when it comes + * to the dcache: the hashtable for lookups. Somebody should try + * to make this good - I've just made it work. + * + * This hash-function tries to avoid losing too many bits of hash + * information, yet avoid using a prime hash-size or similar. + */ + +static unsigned int d_hash_mask __read_mostly; +static unsigned int d_hash_shift __read_mostly; + +static struct hlist_bl_head *dentry_hashtable __read_mostly; + +static inline struct hlist_bl_head *d_hash(const struct dentry *parent, + unsigned int hash) +{ + hash += (unsigned long) parent / L1_CACHE_BYTES; + return dentry_hashtable + hash_32(hash, d_hash_shift); +} + +/* Statistics gathering. */ +struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; + +static DEFINE_PER_CPU(long, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry_unused); + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) + +/* + * Here we resort to our own counters instead of using generic per-cpu counters + * for consistency with what the vfs inode code does. We are expected to harvest + * better code and performance by having our own specialized counters. + * + * Please note that the loop is done over all possible CPUs, not over all online + * CPUs. The reason for this is that we don't want to play games with CPUs going + * on and off. If one of them goes off, we will just keep their counters. + * + * glommer: See cffbc8a for details, and if you ever intend to change this, + * please update all vfs counters to match. + */ +static long get_nr_dentry(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry, i); + return sum < 0 ? 0 : sum; +} + +static long get_nr_dentry_unused(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_unused, i); + return sum < 0 ? 0 : sum; +} + +int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + dentry_stat.nr_dentry = get_nr_dentry(); + dentry_stat.nr_unused = get_nr_dentry_unused(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +#ifdef CONFIG_DCACHE_WORD_ACCESS + +#include +/* + * NOTE! 'cs' and 'scount' come from a dentry, so it has a + * aligned allocation for this particular component. We don't + * strictly need the load_unaligned_zeropad() safety, but it + * doesn't hurt either. + * + * In contrast, 'ct' and 'tcount' can be from a pathname, and do + * need the careful unaligned handling. + */ +static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) +{ + unsigned long a,b,mask; + + for (;;) { + a = *(unsigned long *)cs; + b = load_unaligned_zeropad(ct); + if (tcount < sizeof(unsigned long)) + break; + if (unlikely(a != b)) + return 1; + cs += sizeof(unsigned long); + ct += sizeof(unsigned long); + tcount -= sizeof(unsigned long); + if (!tcount) + return 0; + } + mask = bytemask_from_count(tcount); + return unlikely(!!((a ^ b) & mask)); +} + +#else + +static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) +{ + do { + if (*cs != *ct) + return 1; + cs++; + ct++; + tcount--; + } while (tcount); + return 0; +} + +#endif + +static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) +{ + const unsigned char *cs; + /* + * Be careful about RCU walk racing with rename: + * use ACCESS_ONCE to fetch the name pointer. + * + * NOTE! Even if a rename will mean that the length + * was not loaded atomically, we don't care. The + * RCU walk will check the sequence count eventually, + * and catch it. And we won't overrun the buffer, + * because we're reading the name pointer atomically, + * and a dentry name is guaranteed to be properly + * terminated with a NUL byte. + * + * End result: even if 'len' is wrong, we'll exit + * early because the data cannot match (there can + * be no NUL in the ct/tcount data) + */ + cs = ACCESS_ONCE(dentry->d_name.name); + smp_read_barrier_depends(); + return dentry_string_cmp(cs, ct, tcount); +} + +struct external_name { + union { + atomic_t count; + struct rcu_head head; + } u; + unsigned char name[]; +}; + +static inline struct external_name *external_name(struct dentry *dentry) +{ + return container_of(dentry->d_name.name, struct external_name, name[0]); +} + +static void __d_free(struct rcu_head *head) +{ + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + + kmem_cache_free(dentry_cache, dentry); +} + +static void __d_free_external(struct rcu_head *head) +{ + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + kfree(external_name(dentry)); + kmem_cache_free(dentry_cache, dentry); +} + +static inline int dname_external(const struct dentry *dentry) +{ + return dentry->d_name.name != dentry->d_iname; +} + +/* + * Make sure other CPUs see the inode attached before the type is set. + */ +static inline void __d_set_inode_and_type(struct dentry *dentry, + struct inode *inode, + unsigned type_flags) +{ + unsigned flags; + + dentry->d_inode = inode; + smp_wmb(); + flags = READ_ONCE(dentry->d_flags); + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags |= type_flags; + WRITE_ONCE(dentry->d_flags, flags); +} + +/* + * Ideally, we want to make sure that other CPUs see the flags cleared before + * the inode is detached, but this is really a violation of RCU principles + * since the ordering suggests we should always set inode before flags. + * + * We should instead replace or discard the entire dentry - but that sucks + * performancewise on mass deletion/rename. + */ +static inline void __d_clear_type_and_inode(struct dentry *dentry) +{ + unsigned flags = READ_ONCE(dentry->d_flags); + + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + WRITE_ONCE(dentry->d_flags, flags); + smp_wmb(); + dentry->d_inode = NULL; +} + +static void dentry_free(struct dentry *dentry) +{ + WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + if (unlikely(dname_external(dentry))) { + struct external_name *p = external_name(dentry); + if (likely(atomic_dec_and_test(&p->u.count))) { + call_rcu(&dentry->d_u.d_rcu, __d_free_external); + return; + } + } + /* if dentry was never visible to RCU, immediate free is OK */ + if (!(dentry->d_flags & DCACHE_RCUACCESS)) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); +} + +/** + * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * @dentry: the target dentry + * After this call, in-progress rcu-walk path lookup will fail. This + * should be called after unhashing, and after changing d_inode (if + * the dentry has not already been unhashed). + */ +static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +{ + assert_spin_locked(&dentry->d_lock); + /* Go through a barrier */ + write_seqcount_barrier(&dentry->d_seq); +} + +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. Dentry has no refcount + * and is unhashed. + */ +static void dentry_iput(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dentry->d_inode->i_lock) +{ + struct inode *inode = dentry->d_inode; + if (inode) { + __d_clear_type_and_inode(dentry); + hlist_del_init(&dentry->d_u.d_alias); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); + } else { + spin_unlock(&dentry->d_lock); + } +} + +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. dentry remains in-use. + */ +static void dentry_unlink_inode(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dentry->d_inode->i_lock) +{ + struct inode *inode = dentry->d_inode; + __d_clear_type_and_inode(dentry); + hlist_del_init(&dentry->d_u.d_alias); + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); +} + +/* + * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry + * is in use - which includes both the "real" per-superblock + * LRU list _and_ the DCACHE_SHRINK_LIST use. + * + * The DCACHE_SHRINK_LIST bit is set whenever the dentry is + * on the shrink list (ie not on the superblock LRU list). + * + * The per-cpu "nr_dentry_unused" counters are updated with + * the DCACHE_LRU_LIST bit. + * + * These helper functions make sure we always follow the + * rules. d_lock must be held by the caller. + */ +#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) +static void d_lru_add(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, 0); + dentry->d_flags |= DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); + WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_lru_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_shrink_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + this_cpu_dec(nr_dentry_unused); +} + +static void d_shrink_add(struct dentry *dentry, struct list_head *list) +{ + D_FLAG_VERIFY(dentry, 0); + list_add(&dentry->d_lru, list); + dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); +} + +/* + * These can only be called under the global LRU lock, ie during the + * callback for freeing the LRU list. "isolate" removes it from the + * LRU lists entirely, while shrink_move moves it to the indicated + * private list. + */ +static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + list_lru_isolate(lru, &dentry->d_lru); +} + +static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, + struct list_head *list) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_lru_isolate_move(lru, &dentry->d_lru, list); +} + +/* + * dentry_lru_(add|del)_list) must be called with d_lock held. + */ +static void dentry_lru_add(struct dentry *dentry) +{ + if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + d_lru_add(dentry); +} + +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent dentry hashes, so that it won't + * be found through a VFS lookup any more. Note that this is different from + * deleting the dentry - d_delete will try to mark the dentry negative if + * possible, giving a successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants to invalidate a dentry for some + * reason (NFS timeouts or autofs deletes). + * + * __d_drop requires dentry->d_lock. + */ +void __d_drop(struct dentry *dentry) +{ + if (!d_unhashed(dentry)) { + struct hlist_bl_head *b; + /* + * Hashed dentries are normally on the dentry hashtable, + * with the exception of those newly allocated by + * d_obtain_alias, which are always IS_ROOT: + */ + if (unlikely(IS_ROOT(dentry))) + b = &dentry->d_sb->s_anon; + else + b = d_hash(dentry->d_parent, dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + dentry->d_hash.pprev = NULL; + hlist_bl_unlock(b); + dentry_rcuwalk_barrier(dentry); + } +} +EXPORT_SYMBOL(__d_drop); + +void d_drop(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_drop); + +static void __dentry_kill(struct dentry *dentry) +{ + struct dentry *parent = NULL; + bool can_free = true; + if (!IS_ROOT(dentry)) + parent = dentry->d_parent; + + /* + * The dentry is now unrecoverably dead to the world. + */ + lockref_mark_dead(&dentry->d_lockref); + + /* + * inform the fs via d_prune that this dentry is about to be + * unhashed and destroyed. + */ + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } + /* if it was on the hash then remove it */ + __d_drop(dentry); + __list_del_entry(&dentry->d_child); + /* + * Inform d_walk() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + BUG_ON(dentry->d_lockref.count > 0); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); +} + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static struct dentry *dentry_kill(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto failed; + + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + if (unlikely(!spin_trylock(&parent->d_lock))) { + if (inode) + spin_unlock(&inode->i_lock); + goto failed; + } + } + + __dentry_kill(dentry); + return parent; + +failed: + spin_unlock(&dentry->d_lock); + cpu_chill(); + return dentry; /* try again with same dentry */ +} + +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + if (IS_ROOT(dentry)) + return NULL; + if (unlikely(dentry->d_lockref.count < 0)) + return NULL; + if (likely(spin_trylock(&parent->d_lock))) + return parent; + rcu_read_lock(); + spin_unlock(&dentry->d_lock); +again: + parent = ACCESS_ONCE(dentry->d_parent); + spin_lock(&parent->d_lock); + /* + * We can't blindly lock dentry until we are sure + * that we won't violate the locking order. + * Any changes of dentry->d_parent must have + * been done with parent->d_lock held, so + * spin_lock() above is enough of a barrier + * for checking if it's still our child. + */ + if (unlikely(parent != dentry->d_parent)) { + spin_unlock(&parent->d_lock); + goto again; + } + rcu_read_unlock(); + if (parent != dentry) + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + else + parent = NULL; + return parent; +} + +/* + * Try to do a lockless dput(), and return whether that was successful. + * + * If unsuccessful, we return false, having already taken the dentry lock. + * + * The caller needs to hold the RCU read lock, so that the dentry is + * guaranteed to stay around even if the refcount goes down to zero! + */ +static inline bool fast_dput(struct dentry *dentry) +{ + int ret; + unsigned int d_flags; + + /* + * If we have a d_op->d_delete() operation, we sould not + * let the dentry count go to zero, so use "put__or_lock". + */ + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) + return lockref_put_or_lock(&dentry->d_lockref); + + /* + * .. otherwise, we can try to just decrement the + * lockref optimistically. + */ + ret = lockref_put_return(&dentry->d_lockref); + + /* + * If the lockref_put_return() failed due to the lock being held + * by somebody else, the fast path has failed. We will need to + * get the lock, and then check the count again. + */ + if (unlikely(ret < 0)) { + spin_lock(&dentry->d_lock); + if (dentry->d_lockref.count > 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + return 1; + } + return 0; + } + + /* + * If we weren't the last ref, we're done. + */ + if (ret) + return 1; + + /* + * Careful, careful. The reference count went down + * to zero, but we don't hold the dentry lock, so + * somebody else could get it again, and do another + * dput(), and we need to not race with that. + * + * However, there is a very special and common case + * where we don't care, because there is nothing to + * do: the dentry is still hashed, it does not have + * a 'delete' op, and it's referenced and already on + * the LRU list. + * + * NOTE! Since we aren't locked, these values are + * not "stable". However, it is sufficient that at + * some point after we dropped the reference the + * dentry was hashed and the flags had the proper + * value. Other dentry users may have re-gotten + * a reference to the dentry and change that, but + * our work is done - we can leave the dentry + * around with a zero refcount. + */ + smp_rmb(); + d_flags = ACCESS_ONCE(dentry->d_flags); + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST; + + /* Nothing to do? Dropping the reference was all we needed? */ + if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) + return 1; + + /* + * Not the fast normal case? Get the lock. We've already decremented + * the refcount, but we'll need to re-check the situation after + * getting the lock. + */ + spin_lock(&dentry->d_lock); + + /* + * Did somebody else grab a reference to it in the meantime, and + * we're no longer the last user after all? Alternatively, somebody + * else could have killed it and marked it dead. Either way, we + * don't need to do anything else. + */ + if (dentry->d_lockref.count) { + spin_unlock(&dentry->d_lock); + return 1; + } + + /* + * Re-get the reference we optimistically dropped. We hold the + * lock, and we just tested that it was zero, so we can just + * set it to 1. + */ + dentry->d_lockref.count = 1; + return 0; +} + + +/* + * This is dput + * + * This is complicated by the fact that we do not want to put + * dentries that are no longer on any hash chain on the unused + * list: we'd much rather just get rid of them immediately. + * + * However, that implies that we have to traverse the dentry + * tree upwards to the parents which might _also_ now be + * scheduled for deletion (it may have been only waiting for + * its last child to go away). + * + * This tail recursion is done by hand as we don't want to depend + * on the compiler to always get this right (gcc generally doesn't). + * Real recursion would eat up our stack space. + */ + +/* + * dput - release a dentry + * @dentry: dentry to release + * + * Release a dentry. This will drop the usage count and if appropriate + * call the dentry unlink method as well as removing it from the queues and + * releasing its resources. If the parent dentries were scheduled for release + * they too may now get deleted. + */ +void dput(struct dentry *dentry) +{ + if (unlikely(!dentry)) + return; + +repeat: + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); + return; + } + + /* Slow case: now with the dentry lock held */ + rcu_read_unlock(); + + /* Unreachable? Get rid of it */ + if (unlikely(d_unhashed(dentry))) + goto kill_it; + + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { + if (dentry->d_op->d_delete(dentry)) + goto kill_it; + } + + if (!(dentry->d_flags & DCACHE_REFERENCED)) + dentry->d_flags |= DCACHE_REFERENCED; + dentry_lru_add(dentry); + + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + return; + +kill_it: + dentry = dentry_kill(dentry); + if (dentry) + goto repeat; +} +EXPORT_SYMBOL(dput); + + +/* This must be called with d_lock held */ +static inline void __dget_dlock(struct dentry *dentry) +{ + dentry->d_lockref.count++; +} + +static inline void __dget(struct dentry *dentry) +{ + lockref_get(&dentry->d_lockref); +} + +struct dentry *dget_parent(struct dentry *dentry) +{ + int gotref; + struct dentry *ret; + + /* + * Do optimistic parent lookup without any + * locking. + */ + rcu_read_lock(); + ret = ACCESS_ONCE(dentry->d_parent); + gotref = lockref_get_not_zero(&ret->d_lockref); + rcu_read_unlock(); + if (likely(gotref)) { + if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + return ret; + dput(ret); + } + +repeat: + /* + * Don't need rcu_dereference because we re-check it was correct under + * the lock. + */ + rcu_read_lock(); + ret = dentry->d_parent; + spin_lock(&ret->d_lock); + if (unlikely(ret != dentry->d_parent)) { + spin_unlock(&ret->d_lock); + rcu_read_unlock(); + goto repeat; + } + rcu_read_unlock(); + BUG_ON(!ret->d_lockref.count); + ret->d_lockref.count++; + spin_unlock(&ret->d_lock); + return ret; +} +EXPORT_SYMBOL(dget_parent); + +/** + * d_find_alias - grab a hashed alias of inode + * @inode: inode in question + * + * If inode has a hashed alias, or is a directory and has any alias, + * acquire the reference to alias and return it. Otherwise return NULL. + * Notice that if inode is a directory there can be only one alias and + * it can be unhashed only if it has no children, or if it is the root + * of a filesystem, or if the directory was renamed and d_revalidate + * was the first vfs operation to notice. + * + * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer + * any other hashed alias over that one. + */ +static struct dentry *__d_find_alias(struct inode *inode) +{ + struct dentry *alias, *discon_alias; + +again: + discon_alias = NULL; + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + spin_lock(&alias->d_lock); + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + if (IS_ROOT(alias) && + (alias->d_flags & DCACHE_DISCONNECTED)) { + discon_alias = alias; + } else { + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; + } + } + spin_unlock(&alias->d_lock); + } + if (discon_alias) { + alias = discon_alias; + spin_lock(&alias->d_lock); + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; + } + spin_unlock(&alias->d_lock); + goto again; + } + return NULL; +} + +struct dentry *d_find_alias(struct inode *inode) +{ + struct dentry *de = NULL; + + if (!hlist_empty(&inode->i_dentry)) { + spin_lock(&inode->i_lock); + de = __d_find_alias(inode); + spin_unlock(&inode->i_lock); + } + return de; +} +EXPORT_SYMBOL(d_find_alias); + +/* + * Try to kill dentries associated with this inode. + * WARNING: you must own a reference to inode. + */ +void d_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; +restart: + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (!dentry->d_lockref.count) { + struct dentry *parent = lock_parent(dentry); + if (likely(!dentry->d_lockref.count)) { + __dentry_kill(dentry); + dput(parent); + goto restart; + } + if (parent) + spin_unlock(&parent->d_lock); + } + spin_unlock(&dentry->d_lock); + } + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_prune_aliases); + +static void shrink_dentry_list(struct list_head *list) +{ + struct dentry *dentry, *parent; + + while (!list_empty(list)) { + struct inode *inode; + dentry = list_entry(list->prev, struct dentry, d_lru); + spin_lock(&dentry->d_lock); + parent = lock_parent(dentry); + + /* + * The dispose list is isolated and dentries are not accounted + * to the LRU here, so we can simply remove it from the list + * here regardless of whether it is referenced or not. + */ + d_shrink_del(dentry); + + /* + * We found an inuse dentry which was not removed from + * the LRU because of laziness during lookup. Do not free it. + */ + if (dentry->d_lockref.count > 0) { + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + continue; + } + + + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + bool can_free = dentry->d_flags & DCACHE_MAY_FREE; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + if (can_free) + dentry_free(dentry); + continue; + } + + inode = dentry->d_inode; + if (inode && unlikely(!spin_trylock(&inode->i_lock))) { + d_shrink_add(dentry, list); + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + continue; + } + + __dentry_kill(dentry); + + /* + * We need to prune ancestors too. This is necessary to prevent + * quadratic behavior of shrink_dcache_parent(), but is also + * expected to be beneficial in reducing dentry cache + * fragmentation. + */ + dentry = parent; + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) { + parent = lock_parent(dentry); + if (dentry->d_lockref.count != 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + break; + } + inode = dentry->d_inode; /* can't be NULL */ + if (unlikely(!spin_trylock(&inode->i_lock))) { + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + cpu_relax(); + continue; + } + __dentry_kill(dentry); + dentry = parent; + } + } +} + +static enum lru_status dentry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + /* + * Referenced dentries are still in use. If they have active + * counts, just remove them from the LRU. Otherwise give them + * another pass through the LRU. + */ + if (dentry->d_lockref.count) { + d_lru_isolate(lru, dentry); + spin_unlock(&dentry->d_lock); + return LRU_REMOVED; + } + + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + spin_unlock(&dentry->d_lock); + + /* + * The list move itself will be made by the common LRU code. At + * this point, we've dropped the dentry->d_lock but keep the + * lru lock. This is safe to do, since every list movement is + * protected by the lru lock even if both locks are held. + * + * This is guaranteed by the fact that all LRU management + * functions are intermediated by the LRU API calls like + * list_lru_add and list_lru_del. List movement in this file + * only ever occur through this functions or through callbacks + * like this one, that are called from the LRU API. + * + * The only exceptions to this are functions like + * shrink_dentry_list, and code that first checks for the + * DCACHE_SHRINK_LIST flag. Those are guaranteed to be + * operating only with stack provided lists after they are + * properly isolated from the main list. It is thus, always a + * local access. + */ + return LRU_ROTATE; + } + + d_lru_shrink_move(lru, dentry, freeable); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; +} + +/** + * prune_dcache_sb - shrink the dcache + * @sb: superblock + * @sc: shrink control, passed to list_lru_shrink_walk() + * + * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This + * is done when we need more memory and called from the superblock shrinker + * function. + * + * This function may fail to free any resources if all the dentries are in + * use. + */ +long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + long freed; + + freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, + dentry_lru_isolate, &dispose); + shrink_dentry_list(&dispose); + return freed; +} + +static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + d_lru_shrink_move(lru, dentry, freeable); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; +} + + +/** + * shrink_dcache_sb - shrink dcache for a superblock + * @sb: superblock + * + * Shrink the dcache for the specified super block. This is used to free + * the dcache before unmounting a file system. + */ +void shrink_dcache_sb(struct super_block *sb) +{ + long freed; + + do { + LIST_HEAD(dispose); + + freed = list_lru_walk(&sb->s_dentry_lru, + dentry_lru_isolate_shrink, &dispose, UINT_MAX); + + this_cpu_sub(nr_dentry_unused, freed); + shrink_dentry_list(&dispose); + } while (freed > 0); +} +EXPORT_SYMBOL(shrink_dcache_sb); + +/** + * enum d_walk_ret - action to talke during tree walk + * @D_WALK_CONTINUE: contrinue walk + * @D_WALK_QUIT: quit walk + * @D_WALK_NORETRY: quit when retry is needed + * @D_WALK_SKIP: skip this dentry and its children + */ +enum d_walk_ret { + D_WALK_CONTINUE, + D_WALK_QUIT, + D_WALK_NORETRY, + D_WALK_SKIP, +}; + +/** + * d_walk - walk the dentry tree + * @parent: start of walk + * @data: data passed to @enter() and @finish() + * @enter: callback when first entering the dentry + * @finish: callback when successfully finished the walk + * + * The @enter() and @finish() callbacks are called with d_lock held. + */ +static void d_walk(struct dentry *parent, void *data, + enum d_walk_ret (*enter)(void *, struct dentry *), + void (*finish)(void *)) +{ + struct dentry *this_parent; + struct list_head *next; + unsigned seq = 0; + enum d_walk_ret ret; + bool retry = true; + +again: + read_seqbegin_or_lock(&rename_lock, &seq); + this_parent = parent; + spin_lock(&this_parent->d_lock); + + ret = enter(data, this_parent); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: + case D_WALK_SKIP: + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + } +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + ret = enter(data, dentry); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: + spin_unlock(&dentry->d_lock); + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + case D_WALK_SKIP: + spin_unlock(&dentry->d_lock); + continue; + } + + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); + goto repeat; + } + spin_unlock(&dentry->d_lock); + } + /* + * All done at this level ... ascend and resume the search. + */ + rcu_read_lock(); +ascend: + if (this_parent != parent) { + struct dentry *child = this_parent; + this_parent = child->d_parent; + + spin_unlock(&child->d_lock); + spin_lock(&this_parent->d_lock); + + /* might go back up the wrong parent if we have had a rename. */ + if (need_seqretry(&rename_lock, seq)) + goto rename_retry; + /* go into the first sibling still alive */ + do { + next = child->d_child.next; + if (next == &this_parent->d_subdirs) + goto ascend; + child = list_entry(next, struct dentry, d_child); + } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); + rcu_read_unlock(); + goto resume; + } + if (need_seqretry(&rename_lock, seq)) + goto rename_retry; + rcu_read_unlock(); + if (finish) + finish(data); + +out_unlock: + spin_unlock(&this_parent->d_lock); + done_seqretry(&rename_lock, seq); + return; + +rename_retry: + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + BUG_ON(seq & 1); + if (!retry) + return; + seq = 1; + goto again; +} + +/* + * Search for at least 1 mount point in the dentry's subdirs. + * We descend to the next level whenever the d_subdirs + * list is non-empty and continue searching. + */ + +static enum d_walk_ret check_mount(void *data, struct dentry *dentry) +{ + int *ret = data; + if (d_mountpoint(dentry)) { + *ret = 1; + return D_WALK_QUIT; + } + return D_WALK_CONTINUE; +} + +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. + * + * Return true if the parent or its subdirectories contain + * a mount point + */ +int have_submounts(struct dentry *parent) +{ + int ret = 0; + + d_walk(parent, &ret, check_mount, NULL); + + return ret; +} +EXPORT_SYMBOL(have_submounts); + +/* + * Called by mount code to set a mountpoint and check if the mountpoint is + * reachable (e.g. NFS can unhash a directory dentry and then the complete + * subtree can become unreachable). + * + * Only one of d_invalidate() and d_set_mounted() must succeed. For + * this reason take rename_lock and d_lock on dentry and ancestors. + */ +int d_set_mounted(struct dentry *dentry) +{ + struct dentry *p; + int ret = -ENOENT; + write_seqlock(&rename_lock); + for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { + /* Need exclusion wrt. d_invalidate() */ + spin_lock(&p->d_lock); + if (unlikely(d_unhashed(p))) { + spin_unlock(&p->d_lock); + goto out; + } + spin_unlock(&p->d_lock); + } + spin_lock(&dentry->d_lock); + if (!d_unlinked(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } + spin_unlock(&dentry->d_lock); +out: + write_sequnlock(&rename_lock); + return ret; +} + +/* + * Search the dentry child list of the specified parent, + * and move any unused dentries to the end of the unused + * list for prune_dcache(). We descend to the next level + * whenever the d_subdirs list is non-empty and continue + * searching. + * + * It returns zero iff there are no unused children, + * otherwise it returns the number of children moved to + * the end of the unused list. This may not be the total + * number of unused children, because select_parent can + * drop the lock and return early due to latency + * constraints. + */ + +struct select_data { + struct dentry *start; + struct list_head dispose; + int found; +}; + +static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + enum d_walk_ret ret = D_WALK_CONTINUE; + + if (data->start == dentry) + goto out; + + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + data->found++; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) { + d_shrink_add(dentry, &data->dispose); + data->found++; + } + } + /* + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. + */ + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; +out: + return ret; +} + +/** + * shrink_dcache_parent - prune dcache + * @parent: parent of entries to prune + * + * Prune the dcache to remove unused children of the parent dentry. + */ +void shrink_dcache_parent(struct dentry *parent) +{ + for (;;) { + struct select_data data; + + INIT_LIST_HEAD(&data.dispose); + data.start = parent; + data.found = 0; + + d_walk(parent, &data, select_collect, NULL); + if (!data.found) + break; + + shrink_dentry_list(&data.dispose); + cond_resched(); + } +} +EXPORT_SYMBOL(shrink_dcache_parent); + +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) +{ + /* it has busy descendents; complain about those instead */ + if (!list_empty(&dentry->d_subdirs)) + return D_WALK_CONTINUE; + + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", + dentry, + dentry->d_inode ? + dentry->d_inode->i_ino : 0UL, + dentry, + dentry->d_lockref.count, + dentry->d_sb->s_type->name, + dentry->d_sb->s_id); + WARN_ON(1); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + d_walk(dentry, dentry, umount_check, NULL); + d_drop(dentry); + dput(dentry); +} + +/* + * destroy the dentries attached to a superblock on unmounting + */ +void shrink_dcache_for_umount(struct super_block *sb) +{ + struct dentry *dentry; + + WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); + + dentry = sb->s_root; + sb->s_root = NULL; + do_one_tree(dentry); + + while (!hlist_bl_empty(&sb->s_anon)) { + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + do_one_tree(dentry); + } +} + +struct detach_data { + struct select_data select; + struct dentry *mountpoint; +}; +static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry) +{ + struct detach_data *data = _data; + + if (d_mountpoint(dentry)) { + __dget_dlock(dentry); + data->mountpoint = dentry; + return D_WALK_QUIT; + } + + return select_collect(&data->select, dentry); +} + +static void check_and_drop(void *_data) +{ + struct detach_data *data = _data; + + if (!data->mountpoint && !data->select.found) + __d_drop(data->select.start); +} + +/** + * d_invalidate - detach submounts, prune dcache, and drop + * @dentry: dentry to invalidate (aka detach, prune and drop) + * + * no dcache lock. + * + * The final d_drop is done as an atomic operation relative to + * rename_lock ensuring there are no races with d_set_mounted. This + * ensures there are no unhashed dentries on the path to a mountpoint. + */ +void d_invalidate(struct dentry *dentry) +{ + /* + * If it's already been dropped, return OK. + */ + spin_lock(&dentry->d_lock); + if (d_unhashed(dentry)) { + spin_unlock(&dentry->d_lock); + return; + } + spin_unlock(&dentry->d_lock); + + /* Negative dentries can be dropped without further checks */ + if (!dentry->d_inode) { + d_drop(dentry); + return; + } + + for (;;) { + struct detach_data data; + + data.mountpoint = NULL; + INIT_LIST_HEAD(&data.select.dispose); + data.select.start = dentry; + data.select.found = 0; + + d_walk(dentry, &data, detach_and_collect, check_and_drop); + + if (data.select.found) + shrink_dentry_list(&data.select.dispose); + + if (data.mountpoint) { + detach_mounts(data.mountpoint); + dput(data.mountpoint); + } + + if (!data.mountpoint && !data.select.found) + break; + + cond_resched(); + } +} +EXPORT_SYMBOL(d_invalidate); + +/** + * __d_alloc - allocate a dcache entry + * @sb: filesystem it will belong to + * @name: qstr of the name + * + * Allocates a dentry. It returns %NULL if there is insufficient memory + * available. On a success the dentry is returned. The name passed in is + * copied and the copy passed in may be reused after this call. + */ + +struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) +{ + struct dentry *dentry; + char *dname; + + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) + return NULL; + + /* + * We guarantee that the inline name is always NUL-terminated. + * This way the memcpy() done by the name switching in rename + * will still always have a NUL at the end, even if we might + * be overwriting an internal NUL character + */ + dentry->d_iname[DNAME_INLINE_LEN-1] = 0; + if (name->len > DNAME_INLINE_LEN-1) { + size_t size = offsetof(struct external_name, name[1]); + struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); + if (!p) { + kmem_cache_free(dentry_cache, dentry); + return NULL; + } + atomic_set(&p->u.count, 1); + dname = p->name; + if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS)) + kasan_unpoison_shadow(dname, + round_up(name->len + 1, sizeof(unsigned long))); + } else { + dname = dentry->d_iname; + } + + dentry->d_name.len = name->len; + dentry->d_name.hash = name->hash; + memcpy(dname, name->name, name->len); + dname[name->len] = 0; + + /* Make sure we always see the terminating NUL character */ + smp_wmb(); + dentry->d_name.name = dname; + + dentry->d_lockref.count = 1; + dentry->d_flags = 0; + spin_lock_init(&dentry->d_lock); + seqcount_init(&dentry->d_seq); + dentry->d_inode = NULL; + dentry->d_parent = dentry; + dentry->d_sb = sb; + dentry->d_op = NULL; + dentry->d_fsdata = NULL; + INIT_HLIST_BL_NODE(&dentry->d_hash); + INIT_LIST_HEAD(&dentry->d_lru); + INIT_LIST_HEAD(&dentry->d_subdirs); + INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_LIST_HEAD(&dentry->d_child); + d_set_d_op(dentry, dentry->d_sb->s_d_op); + + this_cpu_inc(nr_dentry); + + return dentry; +} + +/** + * d_alloc - allocate a dcache entry + * @parent: parent of entry to allocate + * @name: qstr of the name + * + * Allocates a dentry. It returns %NULL if there is insufficient memory + * available. On a success the dentry is returned. The name passed in is + * copied and the copy passed in may be reused after this call. + */ +struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) +{ + struct dentry *dentry = __d_alloc(parent->d_sb, name); + if (!dentry) + return NULL; + + spin_lock(&parent->d_lock); + /* + * don't need child lock because it is not subject + * to concurrency here + */ + __dget_dlock(parent); + dentry->d_parent = parent; + list_add(&dentry->d_child, &parent->d_subdirs); + spin_unlock(&parent->d_lock); + + return dentry; +} +EXPORT_SYMBOL(d_alloc); + +/** + * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) + * @sb: the superblock + * @name: qstr of the name + * + * For a filesystem that just pins its dentries in memory and never + * performs lookups at all, return an unhashed IS_ROOT dentry. + */ +struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) +{ + return __d_alloc(sb, name); +} +EXPORT_SYMBOL(d_alloc_pseudo); + +struct dentry *d_alloc_name(struct dentry *parent, const char *name) +{ + struct qstr q; + + q.name = name; + q.len = strlen(name); + q.hash = full_name_hash(q.name, q.len); + return d_alloc(parent, &q); +} +EXPORT_SYMBOL(d_alloc_name); + +void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +{ + WARN_ON_ONCE(dentry->d_op); + WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | + DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | + DCACHE_OP_WEAK_REVALIDATE | + DCACHE_OP_DELETE )); + dentry->d_op = op; + if (!op) + return; + if (op->d_hash) + dentry->d_flags |= DCACHE_OP_HASH; + if (op->d_compare) + dentry->d_flags |= DCACHE_OP_COMPARE; + if (op->d_revalidate) + dentry->d_flags |= DCACHE_OP_REVALIDATE; + if (op->d_weak_revalidate) + dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; + if (op->d_delete) + dentry->d_flags |= DCACHE_OP_DELETE; + if (op->d_prune) + dentry->d_flags |= DCACHE_OP_PRUNE; + +} +EXPORT_SYMBOL(d_set_d_op); + + +/* + * d_set_fallthru - Mark a dentry as falling through to a lower layer + * @dentry - The dentry to mark + * + * Mark a dentry as falling through to the lower layer (as set with + * d_pin_lower()). This flag may be recorded on the medium. + */ +void d_set_fallthru(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_FALLTHRU; + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_set_fallthru); + +static unsigned d_flags_for_inode(struct inode *inode) +{ + unsigned add_flags = DCACHE_REGULAR_TYPE; + + if (!inode) + return DCACHE_MISS_TYPE; + + if (S_ISDIR(inode->i_mode)) { + add_flags = DCACHE_DIRECTORY_TYPE; + if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { + if (unlikely(!inode->i_op->lookup)) + add_flags = DCACHE_AUTODIR_TYPE; + else + inode->i_opflags |= IOP_LOOKUP; + } + goto type_determined; + } + + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (unlikely(inode->i_op->follow_link)) { + add_flags = DCACHE_SYMLINK_TYPE; + goto type_determined; + } + inode->i_opflags |= IOP_NOFOLLOW; + } + + if (unlikely(!S_ISREG(inode->i_mode))) + add_flags = DCACHE_SPECIAL_TYPE; + +type_determined: + if (unlikely(IS_AUTOMOUNT(inode))) + add_flags |= DCACHE_NEED_AUTOMOUNT; + return add_flags; +} + +static void __d_instantiate(struct dentry *dentry, struct inode *inode) +{ + unsigned add_flags = d_flags_for_inode(inode); + + spin_lock(&dentry->d_lock); + if (inode) + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + __d_set_inode_and_type(dentry, inode, add_flags); + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + fsnotify_d_instantiate(dentry, inode); +} + +/** + * d_instantiate - fill in inode information for a dentry + * @entry: dentry to complete + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. + * + * This turns negative dentries into productive full members + * of society. + * + * NOTE! This assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. + */ + +void d_instantiate(struct dentry *entry, struct inode * inode) +{ + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + if (inode) + spin_lock(&inode->i_lock); + __d_instantiate(entry, inode); + if (inode) + spin_unlock(&inode->i_lock); + security_d_instantiate(entry, inode); +} +EXPORT_SYMBOL(d_instantiate); + +/** + * d_instantiate_unique - instantiate a non-aliased dentry + * @entry: dentry to instantiate + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. On success, it returns NULL. + * If an unhashed alias of "entry" already exists, then we return the + * aliased dentry instead and drop one reference to inode. + * + * Note that in order to avoid conflicts with rename() etc, the caller + * had better be holding the parent directory semaphore. + * + * This also assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. + */ +static struct dentry *__d_instantiate_unique(struct dentry *entry, + struct inode *inode) +{ + struct dentry *alias; + int len = entry->d_name.len; + const char *name = entry->d_name.name; + unsigned int hash = entry->d_name.hash; + + if (!inode) { + __d_instantiate(entry, NULL); + return NULL; + } + + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + /* + * Don't need alias->d_lock here, because aliases with + * d_parent == entry->d_parent are not subject to name or + * parent changes, because the parent inode i_mutex is held. + */ + if (alias->d_name.hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (alias->d_name.len != len) + continue; + if (dentry_cmp(alias, name, len)) + continue; + __dget(alias); + return alias; + } + + __d_instantiate(entry, inode); + return NULL; +} + +struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) +{ + struct dentry *result; + + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + + if (inode) + spin_lock(&inode->i_lock); + result = __d_instantiate_unique(entry, inode); + if (inode) + spin_unlock(&inode->i_lock); + + if (!result) { + security_d_instantiate(entry, inode); + return NULL; + } + + BUG_ON(!d_unhashed(result)); + iput(inode); + return result; +} + +EXPORT_SYMBOL(d_instantiate_unique); + +/** + * d_instantiate_no_diralias - instantiate a non-aliased dentry + * @entry: dentry to complete + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. If a directory alias is found, then + * return an error (and drop inode). Together with d_materialise_unique() this + * guarantees that a directory inode may never have more than one alias. + */ +int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode) +{ + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + + spin_lock(&inode->i_lock); + if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) { + spin_unlock(&inode->i_lock); + iput(inode); + return -EBUSY; + } + __d_instantiate(entry, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(entry, inode); + + return 0; +} +EXPORT_SYMBOL(d_instantiate_no_diralias); + +struct dentry *d_make_root(struct inode *root_inode) +{ + struct dentry *res = NULL; + + if (root_inode) { + static const struct qstr name = QSTR_INIT("/", 1); + + res = __d_alloc(root_inode->i_sb, &name); + if (res) + d_instantiate(res, root_inode); + else + iput(root_inode); + } + return res; +} +EXPORT_SYMBOL(d_make_root); + +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + __dget(alias); + return alias; +} + +/** + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for + * + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. + */ +struct dentry *d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} +EXPORT_SYMBOL(d_find_any_alias); + +static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) +{ + static const struct qstr anonstring = QSTR_INIT("/", 1); + struct dentry *tmp; + struct dentry *res; + unsigned add_flags; + + if (!inode) + return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + res = d_find_any_alias(inode); + if (res) + goto out_iput; + + tmp = __d_alloc(inode->i_sb, &anonstring); + if (!tmp) { + res = ERR_PTR(-ENOMEM); + goto out_iput; + } + + spin_lock(&inode->i_lock); + res = __d_find_any_alias(inode); + if (res) { + spin_unlock(&inode->i_lock); + dput(tmp); + goto out_iput; + } + + /* attach a disconnected dentry */ + add_flags = d_flags_for_inode(inode); + + if (disconnected) + add_flags |= DCACHE_DISCONNECTED; + + spin_lock(&tmp->d_lock); + __d_set_inode_and_type(tmp, inode, add_flags); + hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry); + hlist_bl_lock(&tmp->d_sb->s_anon); + hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); + hlist_bl_unlock(&tmp->d_sb->s_anon); + spin_unlock(&tmp->d_lock); + spin_unlock(&inode->i_lock); + security_d_instantiate(tmp, inode); + + return tmp; + + out_iput: + if (res && !IS_ERR(res)) + security_d_instantiate(res, inode); + iput(inode); + return res; +} + +/** + * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode + * @inode: inode to allocate the dentry for + * + * Obtain a dentry for an inode resulting from NFS filehandle conversion or + * similar open by handle operations. The returned dentry may be anonymous, + * or may have a full name (if the inode was already in the cache). + * + * When called on a directory inode, we must ensure that the inode only ever + * has one dentry. If a dentry is found, that is returned instead of + * allocating a new one. + * + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is released. + * To make it easier to use in export operations a %NULL or IS_ERR inode may + * be passed in and the error will be propagated to the return value, + * with a %NULL @inode replaced by ERR_PTR(-ESTALE). + */ +struct dentry *d_obtain_alias(struct inode *inode) +{ + return __d_obtain_alias(inode, 1); +} +EXPORT_SYMBOL(d_obtain_alias); + +/** + * d_obtain_root - find or allocate a dentry for a given inode + * @inode: inode to allocate the dentry for + * + * Obtain an IS_ROOT dentry for the root of a filesystem. + * + * We must ensure that directory inodes only ever have one dentry. If a + * dentry is found, that is returned instead of allocating a new one. + * + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is + * released. A %NULL or IS_ERR inode may be passed in and will be the + * error will be propagate to the return value, with a %NULL @inode + * replaced by ERR_PTR(-ESTALE). + */ +struct dentry *d_obtain_root(struct inode *inode) +{ + return __d_obtain_alias(inode, 0); +} +EXPORT_SYMBOL(d_obtain_root); + +/** + * d_add_ci - lookup or allocate new dentry with case-exact name + * @inode: the inode case-insensitive lookup has found + * @dentry: the negative dentry that was passed to the parent's lookup func + * @name: the case-exact name to be associated with the returned dentry + * + * This is to avoid filling the dcache with case-insensitive names to the + * same inode, only the actual correct case is stored in the dcache for + * case-insensitive filesystems. + * + * For a case-insensitive lookup match and if the the case-exact dentry + * already exists in in the dcache, use it and return it. + * + * If no entry exists with the exact case name, allocate new dentry with + * the exact case, and return the spliced entry. + */ +struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, + struct qstr *name) +{ + struct dentry *found; + struct dentry *new; + + /* + * First check if a dentry matching the name already exists, + * if not go ahead and create it now. + */ + found = d_hash_and_lookup(dentry->d_parent, name); + if (!found) { + new = d_alloc(dentry->d_parent, name); + if (!new) { + found = ERR_PTR(-ENOMEM); + } else { + found = d_splice_alias(inode, new); + if (found) { + dput(new); + return found; + } + return new; + } + } + iput(inode); + return found; +} +EXPORT_SYMBOL(d_add_ci); + +/* + * Do the slow-case of the dentry name compare. + * + * Unlike the dentry_cmp() function, we need to atomically + * load the name and length information, so that the + * filesystem can rely on them, and can use the 'name' and + * 'len' information without worrying about walking off the + * end of memory etc. + * + * Thus the read_seqcount_retry() and the "duplicate" info + * in arguments (the low-level filesystem should not look + * at the dentry inode or name contents directly, since + * rename can change them while we're in RCU mode). + */ +enum slow_d_compare { + D_COMP_OK, + D_COMP_NOMATCH, + D_COMP_SEQRETRY, +}; + +static noinline enum slow_d_compare slow_dentry_cmp( + const struct dentry *parent, + struct dentry *dentry, + unsigned int seq, + const struct qstr *name) +{ + int tlen = dentry->d_name.len; + const char *tname = dentry->d_name.name; + + if (read_seqcount_retry(&dentry->d_seq, seq)) { + cpu_relax(); + return D_COMP_SEQRETRY; + } + if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) + return D_COMP_NOMATCH; + return D_COMP_OK; +} + +/** + * __d_lookup_rcu - search for a dentry (racy, store-free) + * @parent: parent dentry + * @name: qstr of name we wish to find + * @seqp: returns d_seq value at the point where the dentry was found + * Returns: dentry, or NULL + * + * __d_lookup_rcu is the dcache lookup function for rcu-walk name + * resolution (store-free path walking) design described in + * Documentation/filesystems/path-lookup.txt. + * + * This is not to be used outside core vfs. + * + * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock + * held, and rcu_read_lock held. The returned dentry must not be stored into + * without taking d_lock and checking d_seq sequence count against @seq + * returned here. + * + * A refcount may be taken on the found dentry with the d_rcu_to_refcount + * function. + * + * Alternatively, __d_lookup_rcu may be called again to look up the child of + * the returned dentry, so long as its parent's seqlock is checked after the + * child is looked up. Thus, an interlocking stepping of sequence lock checks + * is formed, giving integrity down the path walk. + * + * NOTE! The caller *has* to check the resulting dentry against the sequence + * number we've returned before using any of the resulting dentry state! + */ +struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, + unsigned *seqp) +{ + u64 hashlen = name->hash_len; + const unsigned char *str = name->name; + struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen)); + struct hlist_bl_node *node; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Carefully use d_seq when comparing a candidate dentry, to avoid + * races with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/filesystems/path-lookup.txt for more details. + */ + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + unsigned seq; + +seqretry: + /* + * The dentry sequence count protects us from concurrent + * renames, and thus protects parent and name fields. + * + * The caller must perform a seqcount check in order + * to do anything useful with the returned dentry. + * + * NOTE! We do a "raw" seqcount_begin here. That means that + * we don't wait for the sequence count to stabilize if it + * is in the middle of a sequence change. If we do the slow + * dentry compare, we will do seqretries until it is stable, + * and if we end up with a successful lookup, we actually + * want to exit RCU lookup anyway. + */ + seq = raw_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + + if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { + if (dentry->d_name.hash != hashlen_hash(hashlen)) + continue; + *seqp = seq; + switch (slow_dentry_cmp(parent, dentry, seq, name)) { + case D_COMP_OK: + return dentry; + case D_COMP_NOMATCH: + continue; + default: + goto seqretry; + } + } + + if (dentry->d_name.hash_len != hashlen) + continue; + *seqp = seq; + if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) + return dentry; + } + return NULL; +} + +/** + * d_lookup - search for a dentry + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * d_lookup searches the children of the parent dentry for the name in + * question. If the dentry is found its reference count is incremented and the + * dentry is returned. The caller must use dput to free the entry when it has + * finished using it. %NULL is returned if the dentry does not exist. + */ +struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) +{ + struct dentry *dentry; + unsigned seq; + + do { + seq = read_seqbegin(&rename_lock); + dentry = __d_lookup(parent, name); + if (dentry) + break; + } while (read_seqretry(&rename_lock, seq)); + return dentry; +} +EXPORT_SYMBOL(d_lookup); + +/** + * __d_lookup - search for a dentry (racy) + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * __d_lookup is like d_lookup, however it may (rarely) return a + * false-negative result due to unrelated rename activity. + * + * __d_lookup is slightly faster by avoiding rename_lock read seqlock, + * however it must be used carefully, eg. with a following d_lookup in + * the case of failure. + * + * __d_lookup callers must be commented. + */ +struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *found = NULL; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Take d_lock when comparing a candidate dentry, to avoid races + * with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/filesystems/path-lookup.txt for more details. + */ + rcu_read_lock(); + + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + + if (dentry->d_name.hash != hash) + continue; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent != parent) + goto next; + if (d_unhashed(dentry)) + goto next; + + /* + * It is safe to compare names since d_move() cannot + * change the qstr (protected by d_lock). + */ + if (parent->d_flags & DCACHE_OP_COMPARE) { + int tlen = dentry->d_name.len; + const char *tname = dentry->d_name.name; + if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) + goto next; + } else { + if (dentry->d_name.len != len) + goto next; + if (dentry_cmp(dentry, str, len)) + goto next; + } + + dentry->d_lockref.count++; + found = dentry; + spin_unlock(&dentry->d_lock); + break; +next: + spin_unlock(&dentry->d_lock); + } + rcu_read_unlock(); + + return found; +} + +/** + * d_hash_and_lookup - hash the qstr then search for a dentry + * @dir: Directory to search in + * @name: qstr of name we wish to find + * + * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) + */ +struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) +{ + /* + * Check for a fs-specific hash function. Note that we must + * calculate the standard hash first, as the d_op->d_hash() + * routine may choose to leave the hash value unchanged. + */ + name->hash = full_name_hash(name->name, name->len); + if (dir->d_flags & DCACHE_OP_HASH) { + int err = dir->d_op->d_hash(dir, name); + if (unlikely(err < 0)) + return ERR_PTR(err); + } + return d_lookup(dir, name); +} +EXPORT_SYMBOL(d_hash_and_lookup); + +/* + * When a file is deleted, we have two options: + * - turn this dentry into a negative dentry + * - unhash this dentry and free it. + * + * Usually, we want to just turn this into + * a negative dentry, but if anybody else is + * currently using the dentry or the inode + * we can't do that and we fall back on removing + * it from the hash queues and waiting for + * it to be deleted later when it has no users + */ + +/** + * d_delete - delete a dentry + * @dentry: The dentry to delete + * + * Turn the dentry into a negative dentry if possible, otherwise + * remove it from the hash queues so it can be deleted later + */ + +void d_delete(struct dentry * dentry) +{ + struct inode *inode; + int isdir = 0; + /* + * Are we the only user? + */ +again: + spin_lock(&dentry->d_lock); + inode = dentry->d_inode; + isdir = S_ISDIR(inode->i_mode); + if (dentry->d_lockref.count == 1) { + if (!spin_trylock(&inode->i_lock)) { + spin_unlock(&dentry->d_lock); + cpu_chill(); + goto again; + } + dentry->d_flags &= ~DCACHE_CANT_MOUNT; + dentry_unlink_inode(dentry); + fsnotify_nameremove(dentry, isdir); + return; + } + + if (!d_unhashed(dentry)) + __d_drop(dentry); + + spin_unlock(&dentry->d_lock); + + fsnotify_nameremove(dentry, isdir); +} +EXPORT_SYMBOL(d_delete); + +static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) +{ + BUG_ON(!d_unhashed(entry)); + hlist_bl_lock(b); + entry->d_flags |= DCACHE_RCUACCESS; + hlist_bl_add_head_rcu(&entry->d_hash, b); + hlist_bl_unlock(b); +} + +static void _d_rehash(struct dentry * entry) +{ + __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); +} + +/** + * d_rehash - add an entry back to the hash + * @entry: dentry to add to the hash + * + * Adds a dentry to the hash according to its name. + */ + +void d_rehash(struct dentry * entry) +{ + spin_lock(&entry->d_lock); + _d_rehash(entry); + spin_unlock(&entry->d_lock); +} +EXPORT_SYMBOL(d_rehash); + +/** + * dentry_update_name_case - update case insensitive dentry with a new name + * @dentry: dentry to be updated + * @name: new name + * + * Update a case insensitive dentry with new case of name. + * + * dentry must have been returned by d_lookup with name @name. Old and new + * name lengths must match (ie. no d_compare which allows mismatched name + * lengths). + * + * Parent inode i_mutex must be held over d_lookup and into this call (to + * keep renames and concurrent inserts, and readdir(2) away). + */ +void dentry_update_name_case(struct dentry *dentry, struct qstr *name) +{ + BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ + + spin_lock(&dentry->d_lock); + write_seqcount_begin(&dentry->d_seq); + memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); + write_seqcount_end(&dentry->d_seq); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(dentry_update_name_case); + +static void swap_names(struct dentry *dentry, struct dentry *target) +{ + if (unlikely(dname_external(target))) { + if (unlikely(dname_external(dentry))) { + /* + * Both external: swap the pointers + */ + swap(target->d_name.name, dentry->d_name.name); + } else { + /* + * dentry:internal, target:external. Steal target's + * storage and make target internal. + */ + memcpy(target->d_iname, dentry->d_name.name, + dentry->d_name.len + 1); + dentry->d_name.name = target->d_name.name; + target->d_name.name = target->d_iname; + } + } else { + if (unlikely(dname_external(dentry))) { + /* + * dentry:external, target:internal. Give dentry's + * storage to target and make dentry internal + */ + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + target->d_name.name = dentry->d_name.name; + dentry->d_name.name = dentry->d_iname; + } else { + /* + * Both are internal. + */ + unsigned int i; + BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); + kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN); + kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN); + for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { + swap(((long *) &dentry->d_iname)[i], + ((long *) &target->d_iname)[i]); + } + } + } + swap(dentry->d_name.hash_len, target->d_name.hash_len); +} + +static void copy_name(struct dentry *dentry, struct dentry *target) +{ + struct external_name *old_name = NULL; + if (unlikely(dname_external(dentry))) + old_name = external_name(dentry); + if (unlikely(dname_external(target))) { + atomic_inc(&external_name(target)->u.count); + dentry->d_name = target->d_name; + } else { + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + dentry->d_name.name = dentry->d_iname; + dentry->d_name.hash_len = target->d_name.hash_len; + } + if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) + kfree_rcu(old_name, u.head); +} + +static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) +{ + /* + * XXXX: do we really need to take target->d_lock? + */ + if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent) + spin_lock(&target->d_parent->d_lock); + else { + if (d_ancestor(dentry->d_parent, target->d_parent)) { + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&target->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } else { + spin_lock(&target->d_parent->d_lock); + spin_lock_nested(&dentry->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } + } + if (target < dentry) { + spin_lock_nested(&target->d_lock, 2); + spin_lock_nested(&dentry->d_lock, 3); + } else { + spin_lock_nested(&dentry->d_lock, 2); + spin_lock_nested(&target->d_lock, 3); + } +} + +static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target) +{ + if (target->d_parent != dentry->d_parent) + spin_unlock(&dentry->d_parent->d_lock); + if (target->d_parent != target) + spin_unlock(&target->d_parent->d_lock); + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); +} + +/* + * When switching names, the actual string doesn't strictly have to + * be preserved in the target - because we're dropping the target + * anyway. As such, we can just do a simple memcpy() to copy over + * the new name before we switch, unless we are going to rehash + * it. Note that if we *do* unhash the target, we are not allowed + * to rehash it without giving it a new name/hash key - whether + * we swap or overwrite the names here, resulting name won't match + * the reality in filesystem; it's only there for d_path() purposes. + * Note that all of this is happening under rename_lock, so the + * any hash lookup seeing it in the middle of manipulations will + * be discarded anyway. So we do not care what happens to the hash + * key in that case. + */ +/* + * __d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * @exchange: exchange the two dentries + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. Caller must hold + * rename_lock, the i_mutex of the source and target directories, + * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). + */ +static void __d_move(struct dentry *dentry, struct dentry *target, + bool exchange) +{ + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + + BUG_ON(d_ancestor(dentry, target)); + BUG_ON(d_ancestor(target, dentry)); + + dentry_lock_for_move(dentry, target); + + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); + + /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ + + /* + * Move the dentry to the target hash queue. Don't bother checking + * for the same hash queue because of how unlikely it is. + */ + __d_drop(dentry); + __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); + + /* + * Unhash the target (d_delete() is not usable here). If exchanging + * the two dentries, then rehash onto the other's hash queue. + */ + __d_drop(target); + if (exchange) { + __d_rehash(target, + d_hash(dentry->d_parent, dentry->d_name.hash)); + } + + /* Switch the names.. */ + if (exchange) + swap_names(dentry, target); + else + copy_name(dentry, target); + + /* ... and switch them in the tree */ + if (IS_ROOT(dentry)) { + /* splicing a tree */ + dentry->d_parent = target->d_parent; + target->d_parent = target; + list_del_init(&target->d_child); + list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); + } else { + /* swapping two dentries */ + swap(dentry->d_parent, target->d_parent); + list_move(&target->d_child, &target->d_parent->d_subdirs); + list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); + if (exchange) + fsnotify_d_move(target); + fsnotify_d_move(dentry); + } + + write_seqcount_end(&target->d_seq); + write_seqcount_end(&dentry->d_seq); + + dentry_unlock_for_move(dentry, target); +} + +/* + * d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. See the locking + * requirements for __d_move. + */ +void d_move(struct dentry *dentry, struct dentry *target) +{ + write_seqlock(&rename_lock); + __d_move(dentry, target, false); + write_sequnlock(&rename_lock); +} +EXPORT_SYMBOL(d_move); + +/* + * d_exchange - exchange two dentries + * @dentry1: first dentry + * @dentry2: second dentry + */ +void d_exchange(struct dentry *dentry1, struct dentry *dentry2) +{ + write_seqlock(&rename_lock); + + WARN_ON(!dentry1->d_inode); + WARN_ON(!dentry2->d_inode); + WARN_ON(IS_ROOT(dentry1)); + WARN_ON(IS_ROOT(dentry2)); + + __d_move(dentry1, dentry2, true); + + write_sequnlock(&rename_lock); +} + +/** + * d_ancestor - search for an ancestor + * @p1: ancestor dentry + * @p2: child dentry + * + * Returns the ancestor dentry of p2 which is a child of p1, if p1 is + * an ancestor of p2, else NULL. + */ +struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p; + + for (p = p2; !IS_ROOT(p); p = p->d_parent) { + if (p->d_parent == p1) + return p; + } + return NULL; +} + +/* + * This helper attempts to cope with remotely renamed directories + * + * It assumes that the caller is already holding + * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * + * Note: If ever the locking in lock_rename() changes, then please + * remember to update this too... + */ +static int __d_unalias(struct inode *inode, + struct dentry *dentry, struct dentry *alias) +{ + struct mutex *m1 = NULL, *m2 = NULL; + int ret = -ESTALE; + + /* If alias and dentry share a parent, then no extra locks required */ + if (alias->d_parent == dentry->d_parent) + goto out_unalias; + + /* See lock_rename() */ + if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) + goto out_err; + m1 = &dentry->d_sb->s_vfs_rename_mutex; + if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + goto out_err; + m2 = &alias->d_parent->d_inode->i_mutex; +out_unalias: + __d_move(alias, dentry, false); + ret = 0; +out_err: + spin_unlock(&inode->i_lock); + if (m2) + mutex_unlock(m2); + if (m1) + mutex_unlock(m1); + return ret; +} + +/** + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. + * + * If inode is a directory and has an IS_ROOT alias, then d_move that in + * place of the given dentry and return it, else simply d_add the inode + * to the dentry and return NULL. + * + * If a non-IS_ROOT directory is found, the filesystem is corrupt, and + * we should error out: directories can't have multiple aliases. + * + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. + * + * If a dentry was found and moved, then it is returned. Otherwise NULL + * is returned. This matches the expected return value of ->lookup. + * + * Cluster filesystems may call this function with a negative, hashed dentry. + * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. + */ +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +{ + if (IS_ERR(inode)) + return ERR_CAST(inode); + + BUG_ON(!d_unhashed(dentry)); + + if (!inode) { + __d_instantiate(dentry, NULL); + goto out; + } + spin_lock(&inode->i_lock); + if (S_ISDIR(inode->i_mode)) { + struct dentry *new = __d_find_any_alias(inode); + if (unlikely(new)) { + write_seqlock(&rename_lock); + if (unlikely(d_ancestor(new, dentry))) { + write_sequnlock(&rename_lock); + spin_unlock(&inode->i_lock); + dput(new); + new = ERR_PTR(-ELOOP); + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); + } else if (!IS_ROOT(new)) { + int err = __d_unalias(inode, dentry, new); + write_sequnlock(&rename_lock); + if (err) { + dput(new); + new = ERR_PTR(err); + } + } else { + __d_move(new, dentry, false); + write_sequnlock(&rename_lock); + spin_unlock(&inode->i_lock); + security_d_instantiate(new, inode); + } + iput(inode); + return new; + } + } + /* already taking inode->i_lock, so d_add() by hand */ + __d_instantiate(dentry, inode); + spin_unlock(&inode->i_lock); +out: + security_d_instantiate(dentry, inode); + d_rehash(dentry); + return NULL; +} +EXPORT_SYMBOL(d_splice_alias); + +static int prepend(char **buffer, int *buflen, const char *str, int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) + return -ENAMETOOLONG; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + +/** + * prepend_name - prepend a pathname in front of current buffer pointer + * @buffer: buffer pointer + * @buflen: allocated length of the buffer + * @name: name string and length qstr structure + * + * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to + * make sure that either the old or the new name pointer and length are + * fetched. However, there may be mismatch between length and pointer. + * The length cannot be trusted, we need to copy it byte-by-byte until + * the length is reached or a null byte is found. It also prepends "/" at + * the beginning of the name. The sequence number check at the caller will + * retry it again when a d_move() does happen. So any garbage in the buffer + * due to mismatched pointer and length will be discarded. + * + * Data dependency barrier is needed to make sure that we see that terminating + * NUL. Alpha strikes again, film at 11... + */ +static int prepend_name(char **buffer, int *buflen, struct qstr *name) +{ + const char *dname = ACCESS_ONCE(name->name); + u32 dlen = ACCESS_ONCE(name->len); + char *p; + + smp_read_barrier_depends(); + + *buflen -= dlen + 1; + if (*buflen < 0) + return -ENAMETOOLONG; + p = *buffer -= dlen + 1; + *p++ = '/'; + while (dlen--) { + char c = *dname++; + if (!c) + break; + *p++ = c; + } + return 0; +} + +/** + * prepend_path - Prepend path string to a buffer + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry + * @buffer: pointer to the end of the buffer + * @buflen: pointer to buffer length + * + * The function will first try to write out the pathname without taking any + * lock other than the RCU read lock to make sure that dentries won't go away. + * It only checks the sequence number of the global rename_lock as any change + * in the dentry's d_seq will be preceded by changes in the rename_lock + * sequence number. If the sequence number had been changed, it will restart + * the whole pathname back-tracing sequence again by taking the rename_lock. + * In this case, there is no need to take the RCU read lock as the recursive + * parent pointer references will keep the dentry chain alive as long as no + * rename operation is performed. + */ +static int prepend_path(const struct path *path, + const struct path *root, + char **buffer, int *buflen) +{ + struct dentry *dentry; + struct vfsmount *vfsmnt; + struct mount *mnt; + int error = 0; + unsigned seq, m_seq = 0; + char *bptr; + int blen; + + rcu_read_lock(); +restart_mnt: + read_seqbegin_or_lock(&mount_lock, &m_seq); + seq = 0; + rcu_read_lock(); +restart: + bptr = *buffer; + blen = *buflen; + error = 0; + dentry = path->dentry; + vfsmnt = path->mnt; + mnt = real_mount(vfsmnt); + read_seqbegin_or_lock(&rename_lock, &seq); + while (dentry != root->dentry || vfsmnt != root->mnt) { + struct dentry * parent; + + if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { + struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + /* Global root? */ + if (mnt != parent) { + dentry = ACCESS_ONCE(mnt->mnt_mountpoint); + mnt = parent; + vfsmnt = &mnt->mnt; + continue; + } + if (!error) + error = is_mounted(vfsmnt) ? 1 : 2; + break; + } + parent = dentry->d_parent; + prefetch(parent); + error = prepend_name(&bptr, &blen, &dentry->d_name); + if (error) + break; + + dentry = parent; + } + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; + goto restart; + } + done_seqretry(&rename_lock, seq); + + if (!(m_seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&mount_lock, m_seq)) { + m_seq = 1; + goto restart_mnt; + } + done_seqretry(&mount_lock, m_seq); + + if (error >= 0 && bptr == *buffer) { + if (--blen < 0) + error = -ENAMETOOLONG; + else + *--bptr = '/'; + } + *buffer = bptr; + *buflen = blen; + return error; +} + +/** + * __d_path - return the path of a dentry + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry + * @buf: buffer to return value in + * @buflen: buffer length + * + * Convert a dentry into an ASCII path name. + * + * Returns a pointer into the buffer or an error code if the + * path was too long. + * + * "buflen" should be positive. + * + * If the path is not reachable from the supplied root, return %NULL. + */ +char *__d_path(const struct path *path, + const struct path *root, + char *buf, int buflen) +{ + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + error = prepend_path(path, root, &res, &buflen); + + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + error = prepend_path(path, &root, &res, &buflen); + + if (error > 1) + error = -EINVAL; + if (error < 0) + return ERR_PTR(error); + return res; +} + +/* + * same as __d_path but appends "(deleted)" for unlinked files. + */ +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) +{ + prepend(buf, buflen, "\0", 1); + if (d_unlinked(path->dentry)) { + int error = prepend(buf, buflen, " (deleted)", 10); + if (error) + return error; + } + + return prepend_path(path, root, buf, buflen); +} + +static int prepend_unreachable(char **buffer, int *buflen) +{ + return prepend(buffer, buflen, "(unreachable)", 13); +} + +static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + } while (read_seqcount_retry(&fs->seq, seq)); +} + +/** + * d_path - return the path of a dentry + * @path: path to report + * @buf: buffer to return value in + * @buflen: buffer length + * + * Convert a dentry into an ASCII path name. If the entry has been deleted + * the string " (deleted)" is appended. Note that this is ambiguous. + * + * Returns a pointer into the buffer or an error code if the path was + * too long. Note: Callers should use the returned pointer, not the passed + * in buffer, to use the name! The implementation often starts at an offset + * into the buffer, and may leave 0 bytes at the start. + * + * "buflen" should be positive. + */ +char *d_path(const struct path *path, char *buf, int buflen) +{ + char *res = buf + buflen; + struct path root; + int error; + + /* + * We have various synthetic filesystems that never get mounted. On + * these filesystems dentries are never used for lookup purposes, and + * thus don't need to be hashed. They also don't need a name until a + * user wants to identify the object in /proc/pid/fd/. The little hack + * below allows us to generate a name for these objects on demand: + * + * Some pseudo inodes are mountable. When they are mounted + * path->dentry == path->mnt->mnt_root. In that case don't call d_dname + * and instead have d_path return the mounted path. + */ + if (path->dentry->d_op && path->dentry->d_op->d_dname && + (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) + return path->dentry->d_op->d_dname(path->dentry, buf, buflen); + + rcu_read_lock(); + get_fs_root_rcu(current->fs, &root); + error = path_with_deleted(path, &root, &res, &buflen); + rcu_read_unlock(); + + if (error < 0) + res = ERR_PTR(error); + return res; +} +EXPORT_SYMBOL(d_path); + +/* + * Helper function for dentry_operations.d_dname() members + */ +char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, + const char *fmt, ...) +{ + va_list args; + char temp[64]; + int sz; + + va_start(args, fmt); + sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; + va_end(args); + + if (sz > sizeof(temp) || sz > buflen) + return ERR_PTR(-ENAMETOOLONG); + + buffer += buflen - sz; + return memcpy(buffer, temp, sz); +} + +char *simple_dname(struct dentry *dentry, char *buffer, int buflen) +{ + char *end = buffer + buflen; + /* these dentries are never renamed, so d_lock is not needed */ + if (prepend(&end, &buflen, " (deleted)", 11) || + prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) || + prepend(&end, &buflen, "/", 1)) + end = ERR_PTR(-ENAMETOOLONG); + return end; +} +EXPORT_SYMBOL(simple_dname); + +/* + * Write full pathname from the root of the filesystem into the buffer. + */ +static char *__dentry_path(struct dentry *d, char *buf, int buflen) +{ + struct dentry *dentry; + char *end, *retval; + int len, seq = 0; + int error = 0; + + if (buflen < 2) + goto Elong; + + rcu_read_lock(); +restart: + dentry = d; + end = buf + buflen; + len = buflen; + prepend(&end, &len, "\0", 1); + /* Get '/' right */ + retval = end-1; + *retval = '/'; + read_seqbegin_or_lock(&rename_lock, &seq); + while (!IS_ROOT(dentry)) { + struct dentry *parent = dentry->d_parent; + + prefetch(parent); + error = prepend_name(&end, &len, &dentry->d_name); + if (error) + break; + + retval = end; + dentry = parent; + } + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; + goto restart; + } + done_seqretry(&rename_lock, seq); + if (error) + goto Elong; + return retval; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) +{ + return __dentry_path(dentry, buf, buflen); +} +EXPORT_SYMBOL(dentry_path_raw); + +char *dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *p = NULL; + char *retval; + + if (d_unlinked(dentry)) { + p = buf + buflen; + if (prepend(&p, &buflen, "//deleted", 10) != 0) + goto Elong; + buflen++; + } + retval = __dentry_path(dentry, buf, buflen); + if (!IS_ERR(retval) && p) + *p = '/'; /* restore '/' overriden with '\0' */ + return retval; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, + struct path *pwd) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + *pwd = fs->pwd; + } while (read_seqcount_retry(&fs->seq, seq)); +} + +/* + * NOTE! The user-level library version returns a + * character pointer. The kernel system call just + * returns the length of the buffer filled (which + * includes the ending '\0' character), or a negative + * error value. So libc would do something like + * + * char *getcwd(char * buf, size_t size) + * { + * int retval; + * + * retval = sys_getcwd(buf, size); + * if (retval >= 0) + * return buf; + * errno = -retval; + * return NULL; + * } + */ +SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) +{ + int error; + struct path pwd, root; + char *page = __getname(); + + if (!page) + return -ENOMEM; + + rcu_read_lock(); + get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); + + error = -ENOENT; + if (!d_unlinked(pwd.dentry)) { + unsigned long len; + char *cwd = page + PATH_MAX; + int buflen = PATH_MAX; + + prepend(&cwd, &buflen, "\0", 1); + error = prepend_path(&pwd, &root, &cwd, &buflen); + rcu_read_unlock(); + + if (error < 0) + goto out; + + /* Unreachable from current root */ + if (error > 0) { + error = prepend_unreachable(&cwd, &buflen); + if (error) + goto out; + } + + error = -ERANGE; + len = PATH_MAX + page - cwd; + if (len <= size) { + error = len; + if (copy_to_user(buf, cwd, len)) + error = -EFAULT; + } + } else { + rcu_read_unlock(); + } + +out: + __putname(page); + return error; +} + +/* + * Test whether new_dentry is a subdirectory of old_dentry. + * + * Trivially implemented using the dcache structure + */ + +/** + * is_subdir - is new dentry a subdirectory of old_dentry + * @new_dentry: new dentry + * @old_dentry: old dentry + * + * Returns 1 if new_dentry is a subdirectory of the parent (at any depth). + * Returns 0 otherwise. + * Caller must ensure that "new_dentry" is pinned before calling is_subdir() + */ + +int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) +{ + int result; + unsigned seq; + + if (new_dentry == old_dentry) + return 1; + + do { + /* for restarting inner loop in case of seq retry */ + seq = read_seqbegin(&rename_lock); + /* + * Need rcu_readlock to protect against the d_parent trashing + * due to d_move + */ + rcu_read_lock(); + if (d_ancestor(old_dentry, new_dentry)) + result = 1; + else + result = 0; + rcu_read_unlock(); + } while (read_seqretry(&rename_lock, seq)); + + return result; +} + +static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) +{ + struct dentry *root = data; + if (dentry != root) { + if (d_unhashed(dentry) || !dentry->d_inode) + return D_WALK_SKIP; + + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_lockref.count--; + } + } + return D_WALK_CONTINUE; +} + +void d_genocide(struct dentry *parent) +{ + d_walk(parent, parent, d_genocide_kill, NULL); +} + +void d_tmpfile(struct dentry *dentry, struct inode *inode) +{ + inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_u.d_alias) || + !d_unlinked(dentry)); + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); + d_instantiate(dentry, inode); +} +EXPORT_SYMBOL(d_tmpfile); + +static __initdata unsigned long dhash_entries; +static int __init set_dhash_entries(char *str) +{ + if (!str) + return 0; + dhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("dhash_entries=", set_dhash_entries); + +static void __init dcache_init_early(void) +{ + unsigned int loop; + + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_bl_head), + dhash_entries, + 13, + HASH_EARLY, + &d_hash_shift, + &d_hash_mask, + 0, + 0); + + for (loop = 0; loop < (1U << d_hash_shift); loop++) + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); +} + +static void __init dcache_init(void) +{ + unsigned int loop; + + /* + * A constructor could be added for stable state like the lists, + * but it is probably not worth it because of the cache nature + * of the dcache. + */ + dentry_cache = KMEM_CACHE(dentry, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + + /* Hash may have been set up in dcache_init_early */ + if (!hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_bl_head), + dhash_entries, + 13, + 0, + &d_hash_shift, + &d_hash_mask, + 0, + 0); + + for (loop = 0; loop < (1U << d_hash_shift); loop++) + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); +} + +/* SLAB cache for __getname() consumers */ +struct kmem_cache *names_cachep __read_mostly; +EXPORT_SYMBOL(names_cachep); + +EXPORT_SYMBOL(d_genocide); + +void __init vfs_caches_init_early(void) +{ + dcache_init_early(); + inode_init_early(); +} + +void __init vfs_caches_init(unsigned long mempages) +{ + unsigned long reserve; + + /* Base hash sizes on available memory, with a reserve equal to + 150% of current kernel size */ + + reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); + mempages -= reserve; + + names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + dcache_init(); + inode_init(); + files_init(mempages); + mnt_init(); + bdev_cache_init(); + chrdev_init(); +} diff --git a/kernel/fs/dcookies.c b/kernel/fs/dcookies.c new file mode 100644 index 000000000..ac44a69fb --- /dev/null +++ b/kernel/fs/dcookies.c @@ -0,0 +1,350 @@ +/* + * dcookies.c + * + * Copyright 2002 John Levon + * + * Persistent cookie-path mappings. These are used by + * profilers to convert a per-task EIP value into something + * non-transitory that can be processed at a later date. + * This is done by locking the dentry/vfsmnt pair in the + * kernel until released by the tasks needing the persistent + * objects. The tag is simply an unsigned long that refers + * to the pair and can be looked up from userspace. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The dcookies are allocated from a kmem_cache and + * hashed onto a small number of lists. None of the + * code here is particularly performance critical + */ +struct dcookie_struct { + struct path path; + struct list_head hash_list; +}; + +static LIST_HEAD(dcookie_users); +static DEFINE_MUTEX(dcookie_mutex); +static struct kmem_cache *dcookie_cache __read_mostly; +static struct list_head *dcookie_hashtable __read_mostly; +static size_t hash_size __read_mostly; + +static inline int is_live(void) +{ + return !(list_empty(&dcookie_users)); +} + + +/* The dentry is locked, its address will do for the cookie */ +static inline unsigned long dcookie_value(struct dcookie_struct * dcs) +{ + return (unsigned long)dcs->path.dentry; +} + + +static size_t dcookie_hash(unsigned long dcookie) +{ + return (dcookie >> L1_CACHE_SHIFT) & (hash_size - 1); +} + + +static struct dcookie_struct * find_dcookie(unsigned long dcookie) +{ + struct dcookie_struct *found = NULL; + struct dcookie_struct * dcs; + struct list_head * pos; + struct list_head * list; + + list = dcookie_hashtable + dcookie_hash(dcookie); + + list_for_each(pos, list) { + dcs = list_entry(pos, struct dcookie_struct, hash_list); + if (dcookie_value(dcs) == dcookie) { + found = dcs; + break; + } + } + + return found; +} + + +static void hash_dcookie(struct dcookie_struct * dcs) +{ + struct list_head * list = dcookie_hashtable + dcookie_hash(dcookie_value(dcs)); + list_add(&dcs->hash_list, list); +} + + +static struct dcookie_struct *alloc_dcookie(struct path *path) +{ + struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, + GFP_KERNEL); + struct dentry *d; + if (!dcs) + return NULL; + + d = path->dentry; + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_COOKIE; + spin_unlock(&d->d_lock); + + dcs->path = *path; + path_get(path); + hash_dcookie(dcs); + return dcs; +} + + +/* This is the main kernel-side routine that retrieves the cookie + * value for a dentry/vfsmnt pair. + */ +int get_dcookie(struct path *path, unsigned long *cookie) +{ + int err = 0; + struct dcookie_struct * dcs; + + mutex_lock(&dcookie_mutex); + + if (!is_live()) { + err = -EINVAL; + goto out; + } + + if (path->dentry->d_flags & DCACHE_COOKIE) { + dcs = find_dcookie((unsigned long)path->dentry); + } else { + dcs = alloc_dcookie(path); + if (!dcs) { + err = -ENOMEM; + goto out; + } + } + + *cookie = dcookie_value(dcs); + +out: + mutex_unlock(&dcookie_mutex); + return err; +} + + +/* And here is where the userspace process can look up the cookie value + * to retrieve the path. + */ +SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) +{ + unsigned long cookie = (unsigned long)cookie64; + int err = -EINVAL; + char * kbuf; + char * path; + size_t pathlen; + struct dcookie_struct * dcs; + + /* we could leak path information to users + * without dir read permission without this + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&dcookie_mutex); + + if (!is_live()) { + err = -EINVAL; + goto out; + } + + if (!(dcs = find_dcookie(cookie))) + goto out; + + err = -ENOMEM; + kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kbuf) + goto out; + + /* FIXME: (deleted) ? */ + path = d_path(&dcs->path, kbuf, PAGE_SIZE); + + mutex_unlock(&dcookie_mutex); + + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_free; + } + + err = -ERANGE; + + pathlen = kbuf + PAGE_SIZE - path; + if (pathlen <= len) { + err = pathlen; + if (copy_to_user(buf, path, pathlen)) + err = -EFAULT; + } + +out_free: + kfree(kbuf); + return err; +out: + mutex_unlock(&dcookie_mutex); + return err; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) +{ +#ifdef __BIG_ENDIAN + return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); +#else + return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); +#endif +} +#endif + +static int dcookie_init(void) +{ + struct list_head * d; + unsigned int i, hash_bits; + int err = -ENOMEM; + + dcookie_cache = kmem_cache_create("dcookie_cache", + sizeof(struct dcookie_struct), + 0, 0, NULL); + + if (!dcookie_cache) + goto out; + + dcookie_hashtable = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!dcookie_hashtable) + goto out_kmem; + + err = 0; + + /* + * Find the power-of-two list-heads that can fit into the allocation.. + * We don't guarantee that "sizeof(struct list_head)" is necessarily + * a power-of-two. + */ + hash_size = PAGE_SIZE / sizeof(struct list_head); + hash_bits = 0; + do { + hash_bits++; + } while ((hash_size >> hash_bits) != 0); + hash_bits--; + + /* + * Re-calculate the actual number of entries and the mask + * from the number of bits we can fit. + */ + hash_size = 1UL << hash_bits; + + /* And initialize the newly allocated array */ + d = dcookie_hashtable; + i = hash_size; + do { + INIT_LIST_HEAD(d); + d++; + i--; + } while (i); + +out: + return err; +out_kmem: + kmem_cache_destroy(dcookie_cache); + goto out; +} + + +static void free_dcookie(struct dcookie_struct * dcs) +{ + struct dentry *d = dcs->path.dentry; + + spin_lock(&d->d_lock); + d->d_flags &= ~DCACHE_COOKIE; + spin_unlock(&d->d_lock); + + path_put(&dcs->path); + kmem_cache_free(dcookie_cache, dcs); +} + + +static void dcookie_exit(void) +{ + struct list_head * list; + struct list_head * pos; + struct list_head * pos2; + struct dcookie_struct * dcs; + size_t i; + + for (i = 0; i < hash_size; ++i) { + list = dcookie_hashtable + i; + list_for_each_safe(pos, pos2, list) { + dcs = list_entry(pos, struct dcookie_struct, hash_list); + list_del(&dcs->hash_list); + free_dcookie(dcs); + } + } + + kfree(dcookie_hashtable); + kmem_cache_destroy(dcookie_cache); +} + + +struct dcookie_user { + struct list_head next; +}; + +struct dcookie_user * dcookie_register(void) +{ + struct dcookie_user * user; + + mutex_lock(&dcookie_mutex); + + user = kmalloc(sizeof(struct dcookie_user), GFP_KERNEL); + if (!user) + goto out; + + if (!is_live() && dcookie_init()) + goto out_free; + + list_add(&user->next, &dcookie_users); + +out: + mutex_unlock(&dcookie_mutex); + return user; +out_free: + kfree(user); + user = NULL; + goto out; +} + + +void dcookie_unregister(struct dcookie_user * user) +{ + mutex_lock(&dcookie_mutex); + + list_del(&user->next); + kfree(user); + + if (!is_live()) + dcookie_exit(); + + mutex_unlock(&dcookie_mutex); +} + +EXPORT_SYMBOL_GPL(dcookie_register); +EXPORT_SYMBOL_GPL(dcookie_unregister); +EXPORT_SYMBOL_GPL(get_dcookie); diff --git a/kernel/fs/debugfs/Makefile b/kernel/fs/debugfs/Makefile new file mode 100644 index 000000000..840c45696 --- /dev/null +++ b/kernel/fs/debugfs/Makefile @@ -0,0 +1,4 @@ +debugfs-objs := inode.o file.o + +obj-$(CONFIG_DEBUG_FS) += debugfs.o + diff --git a/kernel/fs/debugfs/file.c b/kernel/fs/debugfs/file.c new file mode 100644 index 000000000..830a7e76f --- /dev/null +++ b/kernel/fs/debugfs/file.c @@ -0,0 +1,818 @@ +/* + * file.c - part of debugfs, a tiny little debug file system + * + * Copyright (C) 2004 Greg Kroah-Hartman + * Copyright (C) 2004 IBM Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * debugfs is for people to use instead of /proc or /sys. + * See Documentation/DocBook/filesystems for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static ssize_t default_read_file(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t default_write_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return count; +} + +const struct file_operations debugfs_file_operations = { + .read = default_read_file, + .write = default_write_file, + .open = simple_open, + .llseek = noop_llseek, +}; + +static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, d_inode(dentry)->i_private); + return NULL; +} + +const struct inode_operations debugfs_link_operations = { + .readlink = generic_readlink, + .follow_link = debugfs_follow_link, +}; + +static int debugfs_u8_set(void *data, u64 val) +{ + *(u8 *)data = val; + return 0; +} +static int debugfs_u8_get(void *data, u64 *val) +{ + *val = *(u8 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); + +/** + * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_u8(const char *name, umode_t mode, + struct dentry *parent, u8 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u8); +} +EXPORT_SYMBOL_GPL(debugfs_create_u8); + +static int debugfs_u16_set(void *data, u64 val) +{ + *(u16 *)data = val; + return 0; +} +static int debugfs_u16_get(void *data, u64 *val) +{ + *val = *(u16 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); + +/** + * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_u16(const char *name, umode_t mode, + struct dentry *parent, u16 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u16); +} +EXPORT_SYMBOL_GPL(debugfs_create_u16); + +static int debugfs_u32_set(void *data, u64 val) +{ + *(u32 *)data = val; + return 0; +} +static int debugfs_u32_get(void *data, u64 *val) +{ + *val = *(u32 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); + +/** + * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_u32(const char *name, umode_t mode, + struct dentry *parent, u32 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u32); +} +EXPORT_SYMBOL_GPL(debugfs_create_u32); + +static int debugfs_u64_set(void *data, u64 val) +{ + *(u64 *)data = val; + return 0; +} + +static int debugfs_u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); + +/** + * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_u64(const char *name, umode_t mode, + struct dentry *parent, u64 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u64); +} +EXPORT_SYMBOL_GPL(debugfs_create_u64); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); + +/* + * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value + * + * These functions are exactly the same as the above functions (but use a hex + * output for the decimal challenged). For details look at the above unsigned + * decimal functions. + */ + +/** + * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x8(const char *name, umode_t mode, + struct dentry *parent, u8 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x8); +} +EXPORT_SYMBOL_GPL(debugfs_create_x8); + +/** + * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x16(const char *name, umode_t mode, + struct dentry *parent, u16 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x16); +} +EXPORT_SYMBOL_GPL(debugfs_create_x16); + +/** + * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x32(const char *name, umode_t mode, + struct dentry *parent, u32 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x32); +} +EXPORT_SYMBOL_GPL(debugfs_create_x32); + +/** + * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x64(const char *name, umode_t mode, + struct dentry *parent, u64 *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_x64); +} +EXPORT_SYMBOL_GPL(debugfs_create_x64); + + +static int debugfs_size_t_set(void *data, u64 val) +{ + *(size_t *)data = val; + return 0; +} +static int debugfs_size_t_get(void *data, u64 *val) +{ + *val = *(size_t *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, + "%llu\n"); /* %llu and %zu are more or less the same */ + +/** + * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_size_t(const char *name, umode_t mode, + struct dentry *parent, size_t *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_size_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_size_t); + +static int debugfs_atomic_t_set(void *data, u64 val) +{ + atomic_set((atomic_t *)data, val); + return 0; +} +static int debugfs_atomic_t_get(void *data, u64 *val) +{ + *val = atomic_read((atomic_t *)data); + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, + debugfs_atomic_t_set, "%lld\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); + +/** + * debugfs_create_atomic_t - create a debugfs file that is used to read and + * write an atomic_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, + struct dentry *parent, atomic_t *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, + &fops_atomic_t_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, + &fops_atomic_t_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_atomic_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); + +static ssize_t read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[3]; + u32 *val = file->private_data; + + if (*val) + buf[0] = 'Y'; + else + buf[0] = 'N'; + buf[1] = '\n'; + buf[2] = 0x00; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + size_t buf_size; + bool bv; + u32 *val = file->private_data; + + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &bv) == 0) + *val = bv; + + return count; +} + +static const struct file_operations fops_bool = { + .read = read_file_bool, + .write = write_file_bool, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_bool(const char *name, umode_t mode, + struct dentry *parent, u32 *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_bool); +} +EXPORT_SYMBOL_GPL(debugfs_create_bool); + +static ssize_t read_file_blob(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct debugfs_blob_wrapper *blob = file->private_data; + return simple_read_from_buffer(user_buf, count, ppos, blob->data, + blob->size); +} + +static const struct file_operations fops_blob = { + .read = read_file_blob, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_blob - create a debugfs file that is used to read a binary blob + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer + * to the blob data and the size of the data. + * + * This function creates a file in debugfs with the given name that exports + * @blob->data as a binary blob. If the @mode variable is so set it can be + * read from. Writing is not supported. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_blob(const char *name, umode_t mode, + struct dentry *parent, + struct debugfs_blob_wrapper *blob) +{ + return debugfs_create_file(name, mode, parent, blob, &fops_blob); +} +EXPORT_SYMBOL_GPL(debugfs_create_blob); + +struct array_data { + void *array; + u32 elements; +}; + +static size_t u32_format_array(char *buf, size_t bufsize, + u32 *array, int array_size) +{ + size_t ret = 0; + + while (--array_size >= 0) { + size_t len; + char term = array_size ? ' ' : '\n'; + + len = snprintf(buf, bufsize, "%u%c", *array++, term); + ret += len; + + buf += len; + bufsize -= len; + } + return ret; +} + +static int u32_array_open(struct inode *inode, struct file *file) +{ + struct array_data *data = inode->i_private; + int size, elements = data->elements; + char *buf; + + /* + * Max size: + * - 10 digits + ' '/'\n' = 11 bytes per number + * - terminating NUL character + */ + size = elements*11; + buf = kmalloc(size+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + buf[size] = 0; + + file->private_data = buf; + u32_format_array(buf, size, data->array, data->elements); + + return nonseekable_open(inode, file); +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + size_t size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, + file->private_data, size); +} + +static int u32_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static const struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release = u32_array_release, + .read = u32_array_read, + .llseek = no_llseek, +}; + +/** + * debugfs_create_u32_array - create a debugfs file that is used to read u32 + * array. + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @array: u32 array that provides data. + * @elements: total number of elements in the array. + * + * This function creates a file in debugfs with the given name that exports + * @array as data. If the @mode variable is so set it can be read from. + * Writing is not supported. Seek within the file is also not supported. + * Once array is created its size can not be changed. + * + * The function returns a pointer to dentry on success. If debugfs is not + * enabled in the kernel, the value -%ENODEV will be returned. + */ +struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, + struct dentry *parent, + u32 *array, u32 elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} +EXPORT_SYMBOL_GPL(debugfs_create_u32_array); + +#ifdef CONFIG_HAS_IOMEM + +/* + * The regset32 stuff is used to print 32-bit registers using the + * seq_file utilities. We offer printing a register set in an already-opened + * sequential file or create a debugfs file that only prints a regset32. + */ + +/** + * debugfs_print_regs32 - use seq_print to describe a set of registers + * @s: the seq_file structure being used to generate output + * @regs: an array if struct debugfs_reg32 structures + * @nregs: the length of the above array + * @base: the base address to be used in reading the registers + * @prefix: a string to be prefixed to every output line + * + * This function outputs a text block describing the current values of + * some 32-bit hardware registers. It is meant to be used within debugfs + * files based on seq_file that need to show registers, intermixed with other + * information. The prefix argument may be used to specify a leading string, + * because some peripherals have several blocks of identical registers, + * for example configuration of dma channels + */ +void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, + int nregs, void __iomem *base, char *prefix) +{ + int i; + + for (i = 0; i < nregs; i++, regs++) { + if (prefix) + seq_printf(s, "%s", prefix); + seq_printf(s, "%s = 0x%08x\n", regs->name, + readl(base + regs->offset)); + if (seq_has_overflowed(s)) + break; + } +} +EXPORT_SYMBOL_GPL(debugfs_print_regs32); + +static int debugfs_show_regset32(struct seq_file *s, void *data) +{ + struct debugfs_regset32 *regset = s->private; + + debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); + return 0; +} + +static int debugfs_open_regset32(struct inode *inode, struct file *file) +{ + return single_open(file, debugfs_show_regset32, inode->i_private); +} + +static const struct file_operations fops_regset32 = { + .open = debugfs_open_regset32, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * debugfs_create_regset32 - create a debugfs file that returns register values + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @regset: a pointer to a struct debugfs_regset32, which contains a pointer + * to an array of register definitions, the array size and the base + * address where the register bank is to be found. + * + * This function creates a file in debugfs with the given name that reports + * the names and values of a set of 32-bit registers. If the @mode variable + * is so set it can be read from. Writing is not supported. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_regset32(const char *name, umode_t mode, + struct dentry *parent, + struct debugfs_regset32 *regset) +{ + return debugfs_create_file(name, mode, parent, regset, &fops_regset32); +} +EXPORT_SYMBOL_GPL(debugfs_create_regset32); + +#endif /* CONFIG_HAS_IOMEM */ + +struct debugfs_devm_entry { + int (*read)(struct seq_file *seq, void *data); + struct device *dev; +}; + +static int debugfs_devm_entry_open(struct inode *inode, struct file *f) +{ + struct debugfs_devm_entry *entry = inode->i_private; + + return single_open(f, entry->read, entry->dev); +} + +static const struct file_operations debugfs_devm_entry_ops = { + .owner = THIS_MODULE, + .open = debugfs_devm_entry_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek +}; + +/** + * debugfs_create_devm_seqfile - create a debugfs file that is bound to device. + * + * @dev: device related to this debugfs file. + * @name: name of the debugfs file. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @read_fn: function pointer called to print the seq_file content. + */ +struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, + void *data)) +{ + struct debugfs_devm_entry *entry; + + if (IS_ERR(parent)) + return ERR_PTR(-ENOENT); + + entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + entry->read = read_fn; + entry->dev = dev; + + return debugfs_create_file(name, S_IRUGO, parent, entry, + &debugfs_devm_entry_ops); +} +EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); + diff --git a/kernel/fs/debugfs/inode.c b/kernel/fs/debugfs/inode.c new file mode 100644 index 000000000..12756040c --- /dev/null +++ b/kernel/fs/debugfs/inode.c @@ -0,0 +1,736 @@ +/* + * inode.c - part of debugfs, a tiny little debug file system + * + * Copyright (C) 2004 Greg Kroah-Hartman + * Copyright (C) 2004 IBM Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * debugfs is for people to use instead of /proc or /sys. + * See Documentation/DocBook/kernel-api for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUGFS_DEFAULT_MODE 0700 + +static struct vfsmount *debugfs_mount; +static int debugfs_mount_count; +static bool debugfs_registered; + +static struct inode *debugfs_get_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + } + return inode; +} + +static inline int debugfs_positive(struct dentry *dentry) +{ + return d_really_is_positive(dentry) && !d_unhashed(dentry); +} + +struct debugfs_mount_opts { + kuid_t uid; + kgid_t gid; + umode_t mode; +}; + +enum { + Opt_uid, + Opt_gid, + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct debugfs_fs_info { + struct debugfs_mount_opts mount_opts; +}; + +static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + kuid_t uid; + kgid_t gid; + char *p; + + opts->mode = DEBUGFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) + return -EINVAL; + opts->uid = uid; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return -EINVAL; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) + return -EINVAL; + opts->gid = gid; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* + * We might like to report bad mount options here; + * but traditionally debugfs has ignored all mount options + */ + } + } + + return 0; +} + +static int debugfs_apply_options(struct super_block *sb) +{ + struct debugfs_fs_info *fsi = sb->s_fs_info; + struct inode *inode = d_inode(sb->s_root); + struct debugfs_mount_opts *opts = &fsi->mount_opts; + + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; + + return 0; +} + +static int debugfs_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct debugfs_fs_info *fsi = sb->s_fs_info; + + sync_filesystem(sb); + err = debugfs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + debugfs_apply_options(sb); + +fail: + return err; +} + +static int debugfs_show_options(struct seq_file *m, struct dentry *root) +{ + struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; + struct debugfs_mount_opts *opts = &fsi->mount_opts; + + if (!uid_eq(opts->uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); + if (!gid_eq(opts->gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); + if (opts->mode != DEBUGFS_DEFAULT_MODE) + seq_printf(m, ",mode=%o", opts->mode); + + return 0; +} + +static void debugfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_private); +} + +static const struct super_operations debugfs_super_operations = { + .statfs = simple_statfs, + .remount_fs = debugfs_remount, + .show_options = debugfs_show_options, + .evict_inode = debugfs_evict_inode, +}; + +static struct vfsmount *debugfs_automount(struct path *path) +{ + struct vfsmount *(*f)(void *); + f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; + return f(d_inode(path->dentry)->i_private); +} + +static const struct dentry_operations debugfs_dops = { + .d_delete = always_delete_dentry, + .d_automount = debugfs_automount, +}; + +static int debug_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr debug_files[] = {{""}}; + struct debugfs_fs_info *fsi; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; + if (!fsi) { + err = -ENOMEM; + goto fail; + } + + err = debugfs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); + if (err) + goto fail; + + sb->s_op = &debugfs_super_operations; + sb->s_d_op = &debugfs_dops; + + debugfs_apply_options(sb); + + return 0; + +fail: + kfree(fsi); + sb->s_fs_info = NULL; + return err; +} + +static struct dentry *debug_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_single(fs_type, flags, data, debug_fill_super); +} + +static struct file_system_type debug_fs_type = { + .owner = THIS_MODULE, + .name = "debugfs", + .mount = debug_mount, + .kill_sb = kill_litter_super, +}; +MODULE_ALIAS_FS("debugfs"); + +static struct dentry *start_creating(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + int error; + + pr_debug("debugfs: creating file '%s'\n",name); + + if (IS_ERR(parent)) + return parent; + + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, + &debugfs_mount_count); + if (error) + return ERR_PTR(error); + + /* If the parent is not specified, we create it in the root. + * We need the root dentry to do this, which is in the super + * block. A pointer to that is in the struct vfsmount that we + * have around. + */ + if (!parent) + parent = debugfs_mount->mnt_root; + + mutex_lock(&d_inode(parent)->i_mutex); + dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + } + if (IS_ERR(dentry)) + mutex_unlock(&d_inode(parent)->i_mutex); + return dentry; +} + +static struct dentry *failed_creating(struct dentry *dentry) +{ + mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + dput(dentry); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + return NULL; +} + +static struct dentry *end_creating(struct dentry *dentry) +{ + mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + return dentry; +} + +/** + * debugfs_create_file - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * + * This is the basic "create a file" function for debugfs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the debugfs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) +{ + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + BUG_ON(!S_ISREG(mode)); + dentry = start_creating(name, parent); + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = mode; + inode->i_fop = fops ? fops : &debugfs_file_operations; + inode->i_private = data; + d_instantiate(dentry, inode); + fsnotify_create(d_inode(dentry->d_parent), dentry); + return end_creating(dentry); +} +EXPORT_SYMBOL_GPL(debugfs_create_file); + +/** + * debugfs_create_file_size - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * @file_size: initial file size + * + * This is the basic "create a file" function for debugfs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the debugfs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_file_size(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops, + loff_t file_size) +{ + struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); + + if (de) + d_inode(de)->i_size = file_size; + return de; +} +EXPORT_SYMBOL_GPL(debugfs_create_file_size); + +/** + * debugfs_create_dir - create a directory in the debugfs filesystem + * @name: a pointer to a string containing the name of the directory to + * create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * directory will be created in the root of the debugfs filesystem. + * + * This function creates a directory in debugfs with the given name. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); + return end_creating(dentry); +} +EXPORT_SYMBOL_GPL(debugfs_create_dir); + +/** + * debugfs_create_automount - create automount point in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @f: function to be called when pathname resolution steps on that one. + * @data: opaque argument to pass to f(). + * + * @f should return what ->d_automount() would. + */ +struct dentry *debugfs_create_automount(const char *name, + struct dentry *parent, + struct vfsmount *(*f)(void *), + void *data) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_flags |= S_AUTOMOUNT; + inode->i_private = data; + dentry->d_fsdata = (void *)f; + d_instantiate(dentry, inode); + return end_creating(dentry); +} +EXPORT_SYMBOL(debugfs_create_automount); + +/** + * debugfs_create_symlink- create a symbolic link in the debugfs filesystem + * @name: a pointer to a string containing the name of the symbolic link to + * create. + * @parent: a pointer to the parent dentry for this symbolic link. This + * should be a directory dentry if set. If this parameter is NULL, + * then the symbolic link will be created in the root of the debugfs + * filesystem. + * @target: a pointer to a string containing the path to the target of the + * symbolic link. + * + * This function creates a symbolic link with the given name in debugfs that + * links to the given target path. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the symbolic + * link is to be removed (no automatic cleanup happens if your module is + * unloaded, you are responsible here.) If an error occurs, %NULL will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, + const char *target) +{ + struct dentry *dentry; + struct inode *inode; + char *link = kstrdup(target, GFP_KERNEL); + if (!link) + return NULL; + + dentry = start_creating(name, parent); + if (IS_ERR(dentry)) { + kfree(link); + return NULL; + } + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) { + kfree(link); + return failed_creating(dentry); + } + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_op = &debugfs_link_operations; + inode->i_private = link; + d_instantiate(dentry, inode); + return end_creating(dentry); +} +EXPORT_SYMBOL_GPL(debugfs_create_symlink); + +static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) +{ + int ret = 0; + + if (debugfs_positive(dentry)) { + dget(dentry); + if (d_is_dir(dentry)) + ret = simple_rmdir(d_inode(parent), dentry); + else + simple_unlink(d_inode(parent), dentry); + if (!ret) + d_delete(dentry); + dput(dentry); + } + return ret; +} + +/** + * debugfs_remove - removes a file or directory from the debugfs filesystem + * @dentry: a pointer to a the dentry of the file or directory to be + * removed. + * + * This function removes a file or directory in debugfs that was previously + * created with a call to another debugfs function (like + * debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove(struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + if (IS_ERR_OR_NULL(dentry)) + return; + + parent = dentry->d_parent; + if (!parent || d_really_is_negative(parent)) + return; + + mutex_lock(&d_inode(parent)->i_mutex); + ret = __debugfs_remove(dentry, parent); + mutex_unlock(&d_inode(parent)->i_mutex); + if (!ret) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); +} +EXPORT_SYMBOL_GPL(debugfs_remove); + +/** + * debugfs_remove_recursive - recursively removes a directory + * @dentry: a pointer to a the dentry of the directory to be removed. + * + * This function recursively removes a directory tree in debugfs that + * was previously created with a call to another debugfs function + * (like debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove_recursive(struct dentry *dentry) +{ + struct dentry *child, *parent; + + if (IS_ERR_OR_NULL(dentry)) + return; + + parent = dentry->d_parent; + if (!parent || d_really_is_negative(parent)) + return; + + parent = dentry; + down: + mutex_lock(&d_inode(parent)->i_mutex); + loop: + /* + * The parent->d_subdirs is protected by the d_lock. Outside that + * lock, the child can be unlinked and set to be freed which can + * use the d_u.d_child as the rcu head and corrupt this list. + */ + spin_lock(&parent->d_lock); + list_for_each_entry(child, &parent->d_subdirs, d_child) { + if (!debugfs_positive(child)) + continue; + + /* perhaps simple_empty(child) makes more sense */ + if (!list_empty(&child->d_subdirs)) { + spin_unlock(&parent->d_lock); + mutex_unlock(&d_inode(parent)->i_mutex); + parent = child; + goto down; + } + + spin_unlock(&parent->d_lock); + + if (!__debugfs_remove(child, parent)) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + + /* + * The parent->d_lock protects agaist child from unlinking + * from d_subdirs. When releasing the parent->d_lock we can + * no longer trust that the next pointer is valid. + * Restart the loop. We'll skip this one with the + * debugfs_positive() check. + */ + goto loop; + } + spin_unlock(&parent->d_lock); + + mutex_unlock(&d_inode(parent)->i_mutex); + child = parent; + parent = parent->d_parent; + mutex_lock(&d_inode(parent)->i_mutex); + + if (child != dentry) + /* go up */ + goto loop; + + if (!__debugfs_remove(child, parent)) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + mutex_unlock(&d_inode(parent)->i_mutex); +} +EXPORT_SYMBOL_GPL(debugfs_remove_recursive); + +/** + * debugfs_rename - rename a file/directory in the debugfs filesystem + * @old_dir: a pointer to the parent dentry for the renamed object. This + * should be a directory dentry. + * @old_dentry: dentry of an object to be renamed. + * @new_dir: a pointer to the parent dentry where the object should be + * moved. This should be a directory dentry. + * @new_name: a pointer to a string containing the target name. + * + * This function renames a file/directory in debugfs. The target must not + * exist for rename to succeed. + * + * This function will return a pointer to old_dentry (which is updated to + * reflect renaming) if it succeeds. If an error occurs, %NULL will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, + struct dentry *new_dir, const char *new_name) +{ + int error; + struct dentry *dentry = NULL, *trap; + const char *old_name; + + trap = lock_rename(new_dir, old_dir); + /* Source or destination directories don't exist? */ + if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) + goto exit; + /* Source does not exist, cyclic rename, or mountpoint? */ + if (d_really_is_negative(old_dentry) || old_dentry == trap || + d_mountpoint(old_dentry)) + goto exit; + dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); + /* Lookup failed, cyclic rename or target exists? */ + if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) + goto exit; + + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + + error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), + dentry); + if (error) { + fsnotify_oldname_free(old_name); + goto exit; + } + d_move(old_dentry, dentry); + fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name, + d_is_dir(old_dentry), + NULL, old_dentry); + fsnotify_oldname_free(old_name); + unlock_rename(new_dir, old_dir); + dput(dentry); + return old_dentry; +exit: + if (dentry && !IS_ERR(dentry)) + dput(dentry); + unlock_rename(new_dir, old_dir); + return NULL; +} +EXPORT_SYMBOL_GPL(debugfs_rename); + +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + +static int __init debugfs_init(void) +{ + int retval; + + retval = sysfs_create_mount_point(kernel_kobj, "debug"); + if (retval) + return retval; + + retval = register_filesystem(&debug_fs_type); + if (retval) + sysfs_remove_mount_point(kernel_kobj, "debug"); + else + debugfs_registered = true; + + return retval; +} +core_initcall(debugfs_init); + diff --git a/kernel/fs/devpts/Makefile b/kernel/fs/devpts/Makefile new file mode 100644 index 000000000..236696efc --- /dev/null +++ b/kernel/fs/devpts/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux /dev/pts virtual filesystem. +# + +obj-$(CONFIG_UNIX98_PTYS) += devpts.o + +devpts-$(CONFIG_UNIX98_PTYS) := inode.o diff --git a/kernel/fs/devpts/inode.c b/kernel/fs/devpts/inode.c new file mode 100644 index 000000000..add566303 --- /dev/null +++ b/kernel/fs/devpts/inode.c @@ -0,0 +1,689 @@ +/* -*- linux-c -*- --------------------------------------------------------- * + * + * linux/fs/devpts/inode.c + * + * Copyright 1998-2004 H. Peter Anvin -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEVPTS_DEFAULT_MODE 0600 +/* + * ptmx is a new node in /dev/pts and will be unused in legacy (single- + * instance) mode. To prevent surprises in user space, set permissions of + * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful + * permissions. + */ +#define DEVPTS_DEFAULT_PTMX_MODE 0000 +#define PTMX_MINOR 2 + +/* + * sysctl support for setting limits on the number of Unix98 ptys allocated. + * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly. + */ +static int pty_limit = NR_UNIX98_PTY_DEFAULT; +static int pty_reserve = NR_UNIX98_PTY_RESERVE; +static int pty_limit_min; +static int pty_limit_max = INT_MAX; +static int pty_count; + +static struct ctl_table pty_table[] = { + { + .procname = "max", + .maxlen = sizeof(int), + .mode = 0644, + .data = &pty_limit, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pty_limit_min, + .extra2 = &pty_limit_max, + }, { + .procname = "reserve", + .maxlen = sizeof(int), + .mode = 0644, + .data = &pty_reserve, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pty_limit_min, + .extra2 = &pty_limit_max, + }, { + .procname = "nr", + .maxlen = sizeof(int), + .mode = 0444, + .data = &pty_count, + .proc_handler = proc_dointvec, + }, + {} +}; + +static struct ctl_table pty_kern_table[] = { + { + .procname = "pty", + .mode = 0555, + .child = pty_table, + }, + {} +}; + +static struct ctl_table pty_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = pty_kern_table, + }, + {} +}; + +static DEFINE_MUTEX(allocated_ptys_lock); + +static struct vfsmount *devpts_mnt; + +struct pts_mount_opts { + int setuid; + int setgid; + kuid_t uid; + kgid_t gid; + umode_t mode; + umode_t ptmxmode; + int newinstance; + int max; +}; + +enum { + Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, Opt_max, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%o"}, +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + {Opt_ptmxmode, "ptmxmode=%o"}, + {Opt_newinstance, "newinstance"}, + {Opt_max, "max=%d"}, +#endif + {Opt_err, NULL} +}; + +struct pts_fs_info { + struct ida allocated_ptys; + struct pts_mount_opts mount_opts; + struct dentry *ptmx_dentry; +}; + +static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct super_block *pts_sb_from_inode(struct inode *inode) +{ +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + return inode->i_sb; +#endif + return devpts_mnt->mnt_sb; +} + +#define PARSE_MOUNT 0 +#define PARSE_REMOUNT 1 + +/* + * parse_mount_options(): + * Set @opts to mount options specified in @data. If an option is not + * specified in @data, set it to its default value. The exception is + * 'newinstance' option which can only be set/cleared on a mount (i.e. + * cannot be changed during remount). + * + * Note: @data may be NULL (in which case all options are set to default). + */ +static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) +{ + char *p; + kuid_t uid; + kgid_t gid; + + opts->setuid = 0; + opts->setgid = 0; + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; + opts->mode = DEVPTS_DEFAULT_MODE; + opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + opts->max = NR_UNIX98_PTY_MAX; + + /* newinstance makes sense only on initial mount */ + if (op == PARSE_MOUNT) + opts->newinstance = 0; + + while ((p = strsep(&data, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + int option; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) + return -EINVAL; + opts->uid = uid; + opts->setuid = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return -EINVAL; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) + return -EINVAL; + opts->gid = gid; + opts->setgid = 1; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + case Opt_ptmxmode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->ptmxmode = option & S_IALLUGO; + break; + case Opt_newinstance: + /* newinstance makes sense only on initial mount */ + if (op == PARSE_MOUNT) + opts->newinstance = 1; + break; + case Opt_max: + if (match_int(&args[0], &option) || + option < 0 || option > NR_UNIX98_PTY_MAX) + return -EINVAL; + opts->max = option; + break; +#endif + default: + pr_err("called with bogus options\n"); + return -EINVAL; + } + } + + return 0; +} + +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +static int mknod_ptmx(struct super_block *sb) +{ + int mode; + int rc = -ENOMEM; + struct dentry *dentry; + struct inode *inode; + struct dentry *root = sb->s_root; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + kuid_t root_uid; + kgid_t root_gid; + + root_uid = make_kuid(current_user_ns(), 0); + root_gid = make_kgid(current_user_ns(), 0); + if (!uid_valid(root_uid) || !gid_valid(root_gid)) + return -EINVAL; + + mutex_lock(&d_inode(root)->i_mutex); + + /* If we have already created ptmx node, return */ + if (fsi->ptmx_dentry) { + rc = 0; + goto out; + } + + dentry = d_alloc_name(root, "ptmx"); + if (!dentry) { + pr_err("Unable to alloc dentry for ptmx node\n"); + goto out; + } + + /* + * Create a new 'ptmx' node in this mount of devpts. + */ + inode = new_inode(sb); + if (!inode) { + pr_err("Unable to alloc inode for ptmx node\n"); + dput(dentry); + goto out; + } + + inode->i_ino = 2; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + mode = S_IFCHR|opts->ptmxmode; + init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); + inode->i_uid = root_uid; + inode->i_gid = root_gid; + + d_add(dentry, inode); + + fsi->ptmx_dentry = dentry; + rc = 0; +out: + mutex_unlock(&d_inode(root)->i_mutex); + return rc; +} + +static void update_ptmx_mode(struct pts_fs_info *fsi) +{ + struct inode *inode; + if (fsi->ptmx_dentry) { + inode = d_inode(fsi->ptmx_dentry); + inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; + } +} +#else +static inline void update_ptmx_mode(struct pts_fs_info *fsi) +{ + return; +} +#endif + +static int devpts_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + sync_filesystem(sb); + err = parse_mount_options(data, PARSE_REMOUNT, opts); + + /* + * parse_mount_options() restores options to default values + * before parsing and may have changed ptmxmode. So, update the + * mode in the inode too. Bogus options don't fail the remount, + * so do this even on error return. + */ + update_ptmx_mode(fsi); + + return err; +} + +static int devpts_show_options(struct seq_file *seq, struct dentry *root) +{ + struct pts_fs_info *fsi = DEVPTS_SB(root->d_sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + if (opts->setuid) + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); + if (opts->setgid) + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); + seq_printf(seq, ",mode=%03o", opts->mode); +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); + if (opts->max < NR_UNIX98_PTY_MAX) + seq_printf(seq, ",max=%d", opts->max); +#endif + + return 0; +} + +static const struct super_operations devpts_sops = { + .statfs = simple_statfs, + .remount_fs = devpts_remount, + .show_options = devpts_show_options, +}; + +static void *new_pts_fs_info(void) +{ + struct pts_fs_info *fsi; + + fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); + if (!fsi) + return NULL; + + ida_init(&fsi->allocated_ptys); + fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; + fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + + return fsi; +} + +static int +devpts_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *inode; + + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = DEVPTS_SUPER_MAGIC; + s->s_op = &devpts_sops; + s->s_time_gran = 1; + + s->s_fs_info = new_pts_fs_info(); + if (!s->s_fs_info) + goto fail; + + inode = new_inode(s); + if (!inode) + goto fail; + inode->i_ino = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + + s->s_root = d_make_root(inode); + if (s->s_root) + return 0; + + pr_err("get root dentry failed\n"); + +fail: + return -ENOMEM; +} + +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +static int compare_init_pts_sb(struct super_block *s, void *p) +{ + if (devpts_mnt) + return devpts_mnt->mnt_sb == s; + return 0; +} + +/* + * devpts_mount() + * + * If the '-o newinstance' mount option was specified, mount a new + * (private) instance of devpts. PTYs created in this instance are + * independent of the PTYs in other devpts instances. + * + * If the '-o newinstance' option was not specified, mount/remount the + * initial kernel mount of devpts. This type of mount gives the + * legacy, single-instance semantics. + * + * The 'newinstance' option is needed to support multiple namespace + * semantics in devpts while preserving backward compatibility of the + * current 'single-namespace' semantics. i.e all mounts of devpts + * without the 'newinstance' mount option should bind to the initial + * kernel mount, like mount_single(). + * + * Mounts with 'newinstance' option create a new, private namespace. + * + * NOTE: + * + * For single-mount semantics, devpts cannot use mount_single(), + * because mount_single()/sget() find and use the super-block from + * the most recent mount of devpts. But that recent mount may be a + * 'newinstance' mount and mount_single() would pick the newinstance + * super-block instead of the initial super-block. + */ +static struct dentry *devpts_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int error; + struct pts_mount_opts opts; + struct super_block *s; + + error = parse_mount_options(data, PARSE_MOUNT, &opts); + if (error) + return ERR_PTR(error); + + /* Require newinstance for all user namespace mounts to ensure + * the mount options are not changed. + */ + if ((current_user_ns() != &init_user_ns) && !opts.newinstance) + return ERR_PTR(-EINVAL); + + if (opts.newinstance) + s = sget(fs_type, NULL, set_anon_super, flags, NULL); + else + s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, + NULL); + + if (IS_ERR(s)) + return ERR_CAST(s); + + if (!s->s_root) { + error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) + goto out_undo_sget; + s->s_flags |= MS_ACTIVE; + } + + memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); + + error = mknod_ptmx(s); + if (error) + goto out_undo_sget; + + return dget(s->s_root); + +out_undo_sget: + deactivate_locked_super(s); + return ERR_PTR(error); +} + +#else +/* + * This supports only the legacy single-instance semantics (no + * multiple-instance semantics) + */ +static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, devpts_fill_super); +} +#endif + +static void devpts_kill_sb(struct super_block *sb) +{ + struct pts_fs_info *fsi = DEVPTS_SB(sb); + + ida_destroy(&fsi->allocated_ptys); + kfree(fsi); + kill_litter_super(sb); +} + +static struct file_system_type devpts_fs_type = { + .name = "devpts", + .mount = devpts_mount, + .kill_sb = devpts_kill_sb, +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, +#endif +}; + +/* + * The normal naming convention is simply /dev/pts/; this conforms + * to the System V naming convention + */ + +int devpts_new_index(struct inode *ptmx_inode) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct pts_fs_info *fsi = DEVPTS_SB(sb); + int index; + int ida_ret; + +retry: + if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) + return -ENOMEM; + + mutex_lock(&allocated_ptys_lock); + if (pty_count >= pty_limit - + (fsi->mount_opts.newinstance ? pty_reserve : 0)) { + mutex_unlock(&allocated_ptys_lock); + return -ENOSPC; + } + + ida_ret = ida_get_new(&fsi->allocated_ptys, &index); + if (ida_ret < 0) { + mutex_unlock(&allocated_ptys_lock); + if (ida_ret == -EAGAIN) + goto retry; + return -EIO; + } + + if (index >= fsi->mount_opts.max) { + ida_remove(&fsi->allocated_ptys, index); + mutex_unlock(&allocated_ptys_lock); + return -ENOSPC; + } + pty_count++; + mutex_unlock(&allocated_ptys_lock); + return index; +} + +void devpts_kill_index(struct inode *ptmx_inode, int idx) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct pts_fs_info *fsi = DEVPTS_SB(sb); + + mutex_lock(&allocated_ptys_lock); + ida_remove(&fsi->allocated_ptys, idx); + pty_count--; + mutex_unlock(&allocated_ptys_lock); +} + +/** + * devpts_pty_new -- create a new inode in /dev/pts/ + * @ptmx_inode: inode of the master + * @device: major+minor of the node to be created + * @index: used as a name of the node + * @priv: what's given back by devpts_get_priv + * + * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. + */ +struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, + void *priv) +{ + struct dentry *dentry; + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct inode *inode; + struct dentry *root = sb->s_root; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + char s[12]; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = index + 3; + inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); + inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + init_special_inode(inode, S_IFCHR|opts->mode, device); + inode->i_private = priv; + + sprintf(s, "%d", index); + + mutex_lock(&d_inode(root)->i_mutex); + + dentry = d_alloc_name(root, s); + if (dentry) { + d_add(dentry, inode); + fsnotify_create(d_inode(root), dentry); + } else { + iput(inode); + inode = ERR_PTR(-ENOMEM); + } + + mutex_unlock(&d_inode(root)->i_mutex); + + return inode; +} + +/** + * devpts_get_priv -- get private data for a slave + * @pts_inode: inode of the slave + * + * Returns whatever was passed as priv in devpts_pty_new for a given inode. + */ +void *devpts_get_priv(struct inode *pts_inode) +{ + struct dentry *dentry; + void *priv = NULL; + + BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + + /* Ensure dentry has not been deleted by devpts_pty_kill() */ + dentry = d_find_alias(pts_inode); + if (!dentry) + return NULL; + + if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + priv = pts_inode->i_private; + + dput(dentry); + + return priv; +} + +/** + * devpts_pty_kill -- remove inode form /dev/pts/ + * @inode: inode of the slave to be removed + * + * This is an inverse operation of devpts_pty_new. + */ +void devpts_pty_kill(struct inode *inode) +{ + struct super_block *sb = pts_sb_from_inode(inode); + struct dentry *root = sb->s_root; + struct dentry *dentry; + + BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + + mutex_lock(&d_inode(root)->i_mutex); + + dentry = d_find_alias(inode); + + drop_nlink(inode); + d_delete(dentry); + dput(dentry); /* d_alloc_name() in devpts_pty_new() */ + dput(dentry); /* d_find_alias above */ + + mutex_unlock(&d_inode(root)->i_mutex); +} + +static int __init init_devpts_fs(void) +{ + int err = register_filesystem(&devpts_fs_type); + struct ctl_table_header *table; + + if (!err) { + table = register_sysctl_table(pty_root_table); + devpts_mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(devpts_mnt)) { + err = PTR_ERR(devpts_mnt); + unregister_filesystem(&devpts_fs_type); + unregister_sysctl_table(table); + } + } + return err; +} +module_init(init_devpts_fs) diff --git a/kernel/fs/direct-io.c b/kernel/fs/direct-io.c new file mode 100644 index 000000000..745d23426 --- /dev/null +++ b/kernel/fs/direct-io.c @@ -0,0 +1,1332 @@ +/* + * fs/direct-io.c + * + * Copyright (C) 2002, Linus Torvalds. + * + * O_DIRECT + * + * 04Jul2002 Andrew Morton + * Initial version + * 11Sep2002 janetinc@us.ibm.com + * added readv/writev support. + * 29Oct2002 Andrew Morton + * rewrote bio_add_page() support. + * 30Oct2002 pbadari@us.ibm.com + * added support for non-aligned IO. + * 06Nov2002 pbadari@us.ibm.com + * added asynchronous IO support. + * 21Jul2003 nathans@sgi.com + * added IO completion notifier. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * How many user pages to map in one call to get_user_pages(). This determines + * the size of a structure in the slab cache + */ +#define DIO_PAGES 64 + +/* + * This code generally works in units of "dio_blocks". A dio_block is + * somewhere between the hard sector size and the filesystem block size. it + * is determined on a per-invocation basis. When talking to the filesystem + * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity + * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted + * to bio_block quantities by shifting left by blkfactor. + * + * If blkfactor is zero then the user's request was aligned to the filesystem's + * blocksize. + */ + +/* dio_state only used in the submission path */ + +struct dio_submit { + struct bio *bio; /* bio under assembly */ + unsigned blkbits; /* doesn't change */ + unsigned blkfactor; /* When we're using an alignment which + is finer than the filesystem's soft + blocksize, this specifies how much + finer. blkfactor=2 means 1/4-block + alignment. Does not change */ + unsigned start_zero_done; /* flag: sub-blocksize zeroing has + been performed at the start of a + write */ + int pages_in_io; /* approximate total IO pages */ + sector_t block_in_file; /* Current offset into the underlying + file in dio_block units. */ + unsigned blocks_available; /* At block_in_file. changes */ + int reap_counter; /* rate limit reaping */ + sector_t final_block_in_request;/* doesn't change */ + int boundary; /* prev block is at a boundary */ + get_block_t *get_block; /* block mapping function */ + dio_submit_t *submit_io; /* IO submition function */ + + loff_t logical_offset_in_bio; /* current first logical block in bio */ + sector_t final_block_in_bio; /* current final block in bio + 1 */ + sector_t next_block_for_io; /* next block to be put under IO, + in dio_blocks units */ + + /* + * Deferred addition of a page to the dio. These variables are + * private to dio_send_cur_page(), submit_page_section() and + * dio_bio_add_page(). + */ + struct page *cur_page; /* The page */ + unsigned cur_page_offset; /* Offset into it, in bytes */ + unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ + sector_t cur_page_block; /* Where it starts */ + loff_t cur_page_fs_offset; /* Offset in file */ + + struct iov_iter *iter; + /* + * Page queue. These variables belong to dio_refill_pages() and + * dio_get_page(). + */ + unsigned head; /* next page to process */ + unsigned tail; /* last valid page + 1 */ + size_t from, to; +}; + +/* dio_state communicated between submission path and end_io */ +struct dio { + int flags; /* doesn't change */ + int rw; + struct inode *inode; + loff_t i_size; /* i_size when submitted */ + dio_iodone_t *end_io; /* IO completion function */ + + void *private; /* copy from map_bh.b_private */ + + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ + int page_errors; /* errno from get_user_pages() */ + int is_async; /* is IO async ? */ + bool defer_completion; /* defer AIO completion to workqueue? */ + int io_error; /* IO error in completion path */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + ssize_t result; /* IO result */ + + /* + * pages[] (and any fields placed after it) are not zeroed out at + * allocation time. Don't add new fields after pages[] unless you + * wish that they not be zeroed. + */ + union { + struct page *pages[DIO_PAGES]; /* page buffer */ + struct work_struct complete_work;/* deferred AIO completion */ + }; +} ____cacheline_aligned_in_smp; + +static struct kmem_cache *dio_cache __read_mostly; + +/* + * How many pages are in the queue? + */ +static inline unsigned dio_pages_present(struct dio_submit *sdio) +{ + return sdio->tail - sdio->head; +} + +/* + * Go grab and pin some userspace pages. Typically we'll get 64 at a time. + */ +static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) +{ + ssize_t ret; + + ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, + &sdio->from); + + if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { + struct page *page = ZERO_PAGE(0); + /* + * A memory fault, but the filesystem has some outstanding + * mapped blocks. We need to use those blocks up to avoid + * leaking stale data in the file. + */ + if (dio->page_errors == 0) + dio->page_errors = ret; + page_cache_get(page); + dio->pages[0] = page; + sdio->head = 0; + sdio->tail = 1; + sdio->from = 0; + sdio->to = PAGE_SIZE; + return 0; + } + + if (ret >= 0) { + iov_iter_advance(sdio->iter, ret); + ret += sdio->from; + sdio->head = 0; + sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1; + return 0; + } + return ret; +} + +/* + * Get another userspace page. Returns an ERR_PTR on error. Pages are + * buffered inside the dio so that we can call get_user_pages() against a + * decent number of pages, less frequently. To provide nicer use of the + * L1 cache. + */ +static inline struct page *dio_get_page(struct dio *dio, + struct dio_submit *sdio) +{ + if (dio_pages_present(sdio) == 0) { + int ret; + + ret = dio_refill_pages(dio, sdio); + if (ret) + return ERR_PTR(ret); + BUG_ON(dio_pages_present(sdio) == 0); + } + return dio->pages[sdio->head]; +} + +/** + * dio_complete() - called when all DIO BIO I/O has been completed + * @offset: the byte offset in the file of the completed operation + * + * This drops i_dio_count, lets interested parties know that a DIO operation + * has completed, and calculates the resulting return code for the operation. + * + * It lets the filesystem know if it registered an interest earlier via + * get_block. Pass the private field of the map buffer_head so that + * filesystems can use it to hold additional state between get_block calls and + * dio_complete. + */ +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, + bool is_async) +{ + ssize_t transferred = 0; + + /* + * AIO submission can race with bio completion to get here while + * expecting to have the last io completed by bio completion. + * In that case -EIOCBQUEUED is in fact not an error we want + * to preserve through this call. + */ + if (ret == -EIOCBQUEUED) + ret = 0; + + if (dio->result) { + transferred = dio->result; + + /* Check for short read case */ + if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) + transferred = dio->i_size - offset; + } + + if (ret == 0) + ret = dio->page_errors; + if (ret == 0) + ret = dio->io_error; + if (ret == 0) + ret = transferred; + + if (dio->end_io && dio->result) + dio->end_io(dio->iocb, offset, transferred, dio->private); + + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(dio->inode); + + if (is_async) { + if (dio->rw & WRITE) { + int err; + + err = generic_write_sync(dio->iocb->ki_filp, offset, + transferred); + if (err < 0 && ret > 0) + ret = err; + } + + dio->iocb->ki_complete(dio->iocb, ret, 0); + } + + kmem_cache_free(dio_cache, dio); + return ret; +} + +static void dio_aio_complete_work(struct work_struct *work) +{ + struct dio *dio = container_of(work, struct dio, complete_work); + + dio_complete(dio, dio->iocb->ki_pos, 0, true); +} + +static int dio_bio_complete(struct dio *dio, struct bio *bio); + +/* + * Asynchronous IO callback. + */ +static void dio_bio_end_aio(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + unsigned long remaining; + unsigned long flags; + + /* cleanup the bio */ + dio_bio_complete(dio, bio); + + spin_lock_irqsave(&dio->bio_lock, flags); + remaining = --dio->refcount; + if (remaining == 1 && dio->waiter) + wake_up_process(dio->waiter); + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (remaining == 0) { + if (dio->result && dio->defer_completion) { + INIT_WORK(&dio->complete_work, dio_aio_complete_work); + queue_work(dio->inode->i_sb->s_dio_done_wq, + &dio->complete_work); + } else { + dio_complete(dio, dio->iocb->ki_pos, 0, true); + } + } +} + +/* + * The BIO completion handler simply queues the BIO up for the process-context + * handler. + * + * During I/O bi_private points at the dio. After I/O, bi_private is used to + * implement a singly-linked list of completed BIOs, at dio->bio_list. + */ +static void dio_bio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + unsigned long flags; + + spin_lock_irqsave(&dio->bio_lock, flags); + bio->bi_private = dio->bio_list; + dio->bio_list = bio; + if (--dio->refcount == 1 && dio->waiter) + wake_up_process(dio->waiter); + spin_unlock_irqrestore(&dio->bio_lock, flags); +} + +/** + * dio_end_io - handle the end io action for the given bio + * @bio: The direct io bio thats being completed + * @error: Error if there was one + * + * This is meant to be called by any filesystem that uses their own dio_submit_t + * so that the DIO specific endio actions are dealt with after the filesystem + * has done it's completion work. + */ +void dio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + + if (dio->is_async) + dio_bio_end_aio(bio, error); + else + dio_bio_end_io(bio, error); +} +EXPORT_SYMBOL_GPL(dio_end_io); + +static inline void +dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, + struct block_device *bdev, + sector_t first_sector, int nr_vecs) +{ + struct bio *bio; + + /* + * bio_alloc() is guaranteed to return a bio when called with + * __GFP_WAIT and we request a valid number of vectors. + */ + bio = bio_alloc(GFP_KERNEL, nr_vecs); + + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = first_sector; + if (dio->is_async) + bio->bi_end_io = dio_bio_end_aio; + else + bio->bi_end_io = dio_bio_end_io; + + sdio->bio = bio; + sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; +} + +/* + * In the AIO read case we speculatively dirty the pages before starting IO. + * During IO completion, any of these pages which happen to have been written + * back will be redirtied by bio_check_pages_dirty(). + * + * bios hold a dio reference between submit_bio and ->end_io. + */ +static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) +{ + struct bio *bio = sdio->bio; + unsigned long flags; + + bio->bi_private = dio; + + spin_lock_irqsave(&dio->bio_lock, flags); + dio->refcount++; + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (dio->is_async && dio->rw == READ) + bio_set_pages_dirty(bio); + + if (sdio->submit_io) + sdio->submit_io(dio->rw, bio, dio->inode, + sdio->logical_offset_in_bio); + else + submit_bio(dio->rw, bio); + + sdio->bio = NULL; + sdio->boundary = 0; + sdio->logical_offset_in_bio = 0; +} + +/* + * Release any resources in case of a failure + */ +static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) +{ + while (sdio->head < sdio->tail) + page_cache_release(dio->pages[sdio->head++]); +} + +/* + * Wait for the next BIO to complete. Remove it and return it. NULL is + * returned once all BIOs have been completed. This must only be called once + * all bios have been issued so that dio->refcount can only decrease. This + * requires that that the caller hold a reference on the dio. + */ +static struct bio *dio_await_one(struct dio *dio) +{ + unsigned long flags; + struct bio *bio = NULL; + + spin_lock_irqsave(&dio->bio_lock, flags); + + /* + * Wait as long as the list is empty and there are bios in flight. bio + * completion drops the count, maybe adds to the list, and wakes while + * holding the bio_lock so we don't need set_current_state()'s barrier + * and can call it after testing our condition. + */ + while (dio->refcount > 1 && dio->bio_list == NULL) { + __set_current_state(TASK_UNINTERRUPTIBLE); + dio->waiter = current; + spin_unlock_irqrestore(&dio->bio_lock, flags); + io_schedule(); + /* wake up sets us TASK_RUNNING */ + spin_lock_irqsave(&dio->bio_lock, flags); + dio->waiter = NULL; + } + if (dio->bio_list) { + bio = dio->bio_list; + dio->bio_list = bio->bi_private; + } + spin_unlock_irqrestore(&dio->bio_lock, flags); + return bio; +} + +/* + * Process one completed BIO. No locks are held. + */ +static int dio_bio_complete(struct dio *dio, struct bio *bio) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; + unsigned i; + + if (!uptodate) + dio->io_error = -EIO; + + if (dio->is_async && dio->rw == READ) { + bio_check_pages_dirty(bio); /* transfers ownership */ + } else { + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (dio->rw == READ && !PageCompound(page)) + set_page_dirty_lock(page); + page_cache_release(page); + } + bio_put(bio); + } + return uptodate ? 0 : -EIO; +} + +/* + * Wait on and process all in-flight BIOs. This must only be called once + * all bios have been issued so that the refcount can only decrease. + * This just waits for all bios to make it through dio_bio_complete. IO + * errors are propagated through dio->io_error and should be propagated via + * dio_complete(). + */ +static void dio_await_completion(struct dio *dio) +{ + struct bio *bio; + do { + bio = dio_await_one(dio); + if (bio) + dio_bio_complete(dio, bio); + } while (bio); +} + +/* + * A really large O_DIRECT read or write can generate a lot of BIOs. So + * to keep the memory consumption sane we periodically reap any completed BIOs + * during the BIO generation phase. + * + * This also helps to limit the peak amount of pinned userspace memory. + */ +static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) +{ + int ret = 0; + + if (sdio->reap_counter++ >= 64) { + while (dio->bio_list) { + unsigned long flags; + struct bio *bio; + int ret2; + + spin_lock_irqsave(&dio->bio_lock, flags); + bio = dio->bio_list; + dio->bio_list = bio->bi_private; + spin_unlock_irqrestore(&dio->bio_lock, flags); + ret2 = dio_bio_complete(dio, bio); + if (ret == 0) + ret = ret2; + } + sdio->reap_counter = 0; + } + return ret; +} + +/* + * Create workqueue for deferred direct IO completions. We allocate the + * workqueue when it's first needed. This avoids creating workqueue for + * filesystems that don't need it and also allows us to create the workqueue + * late enough so the we can include s_id in the name of the workqueue. + */ +static int sb_init_dio_done_wq(struct super_block *sb) +{ + struct workqueue_struct *old; + struct workqueue_struct *wq = alloc_workqueue("dio/%s", + WQ_MEM_RECLAIM, 0, + sb->s_id); + if (!wq) + return -ENOMEM; + /* + * This has to be atomic as more DIOs can race to create the workqueue + */ + old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); + /* Someone created workqueue before us? Free ours... */ + if (old) + destroy_workqueue(wq); + return 0; +} + +static int dio_set_defer_completion(struct dio *dio) +{ + struct super_block *sb = dio->inode->i_sb; + + if (dio->defer_completion) + return 0; + dio->defer_completion = true; + if (!sb->s_dio_done_wq) + return sb_init_dio_done_wq(sb); + return 0; +} + +/* + * Call into the fs to map some more disk blocks. We record the current number + * of available blocks at sdio->blocks_available. These are in units of the + * fs blocksize, (1 << inode->i_blkbits). + * + * The fs is allowed to map lots of blocks at once. If it wants to do that, + * it uses the passed inode-relative block number as the file offset, as usual. + * + * get_block() is passed the number of i_blkbits-sized blocks which direct_io + * has remaining to do. The fs should not map more than this number of blocks. + * + * If the fs has mapped a lot of blocks, it should populate bh->b_size to + * indicate how much contiguous disk space has been made available at + * bh->b_blocknr. + * + * If *any* of the mapped blocks are new, then the fs must set buffer_new(). + * This isn't very efficient... + * + * In the case of filesystem holes: the fs may return an arbitrarily-large + * hole by returning an appropriate value in b_size and by clearing + * buffer_mapped(). However the direct-io code will only process holes one + * block at a time - it will repeatedly call get_block() as it walks the hole. + */ +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) +{ + int ret; + sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ + unsigned long fs_count; /* Number of filesystem-sized blocks */ + int create; + unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; + + /* + * If there was a memory error and we've overwritten all the + * mapped blocks then we can now return that memory error + */ + ret = dio->page_errors; + if (ret == 0) { + BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); + fs_startblk = sdio->block_in_file >> sdio->blkfactor; + fs_endblk = (sdio->final_block_in_request - 1) >> + sdio->blkfactor; + fs_count = fs_endblk - fs_startblk + 1; + + map_bh->b_state = 0; + map_bh->b_size = fs_count << i_blkbits; + + /* + * For writes inside i_size on a DIO_SKIP_HOLES filesystem we + * forbid block creations: only overwrites are permitted. + * We will return early to the caller once we see an + * unmapped buffer head returned, and the caller will fall + * back to buffered I/O. + * + * Otherwise the decision is left to the get_blocks method, + * which may decide to handle it or also return an unmapped + * buffer head. + */ + create = dio->rw & WRITE; + if (dio->flags & DIO_SKIP_HOLES) { + if (sdio->block_in_file < (i_size_read(dio->inode) >> + sdio->blkbits)) + create = 0; + } + + ret = (*sdio->get_block)(dio->inode, fs_startblk, + map_bh, create); + + /* Store for completion */ + dio->private = map_bh->b_private; + + if (ret == 0 && buffer_defer_completion(map_bh)) + ret = dio_set_defer_completion(dio); + } + return ret; +} + +/* + * There is no bio. Make one now. + */ +static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, + sector_t start_sector, struct buffer_head *map_bh) +{ + sector_t sector; + int ret, nr_pages; + + ret = dio_bio_reap(dio, sdio); + if (ret) + goto out; + sector = start_sector << (sdio->blkbits - 9); + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); + BUG_ON(nr_pages <= 0); + dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); + sdio->boundary = 0; +out: + return ret; +} + +/* + * Attempt to put the current chunk of 'cur_page' into the current BIO. If + * that was successful then update final_block_in_bio and take a ref against + * the just-added page. + * + * Return zero on success. Non-zero means the caller needs to start a new BIO. + */ +static inline int dio_bio_add_page(struct dio_submit *sdio) +{ + int ret; + + ret = bio_add_page(sdio->bio, sdio->cur_page, + sdio->cur_page_len, sdio->cur_page_offset); + if (ret == sdio->cur_page_len) { + /* + * Decrement count only, if we are done with this page + */ + if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) + sdio->pages_in_io--; + page_cache_get(sdio->cur_page); + sdio->final_block_in_bio = sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits); + ret = 0; + } else { + ret = 1; + } + return ret; +} + +/* + * Put cur_page under IO. The section of cur_page which is described by + * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page + * starts on-disk at cur_page_block. + * + * We take a ref against the page here (on behalf of its presence in the bio). + * + * The caller of this function is responsible for removing cur_page from the + * dio, and for dropping the refcount which came from that presence. + */ +static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) +{ + int ret = 0; + + if (sdio->bio) { + loff_t cur_offset = sdio->cur_page_fs_offset; + loff_t bio_next_offset = sdio->logical_offset_in_bio + + sdio->bio->bi_iter.bi_size; + + /* + * See whether this new request is contiguous with the old. + * + * Btrfs cannot handle having logically non-contiguous requests + * submitted. For example if you have + * + * Logical: [0-4095][HOLE][8192-12287] + * Physical: [0-4095] [4096-8191] + * + * We cannot submit those pages together as one BIO. So if our + * current logical offset in the file does not equal what would + * be the next logical offset in the bio, submit the bio we + * have. + */ + if (sdio->final_block_in_bio != sdio->cur_page_block || + cur_offset != bio_next_offset) + dio_bio_submit(dio, sdio); + } + + if (sdio->bio == NULL) { + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); + if (ret) + goto out; + } + + if (dio_bio_add_page(sdio) != 0) { + dio_bio_submit(dio, sdio); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); + if (ret == 0) { + ret = dio_bio_add_page(sdio); + BUG_ON(ret != 0); + } + } +out: + return ret; +} + +/* + * An autonomous function to put a chunk of a page under deferred IO. + * + * The caller doesn't actually know (or care) whether this piece of page is in + * a BIO, or is under IO or whatever. We just take care of all possible + * situations here. The separation between the logic of do_direct_IO() and + * that of submit_page_section() is important for clarity. Please don't break. + * + * The chunk of page starts on-disk at blocknr. + * + * We perform deferred IO, by recording the last-submitted page inside our + * private part of the dio structure. If possible, we just expand the IO + * across that page here. + * + * If that doesn't work out then we put the old page into the bio and add this + * page to the dio instead. + */ +static inline int +submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, + unsigned offset, unsigned len, sector_t blocknr, + struct buffer_head *map_bh) +{ + int ret = 0; + + if (dio->rw & WRITE) { + /* + * Read accounting is performed in submit_bio() + */ + task_io_account_write(len); + } + + /* + * Can we just grow the current page's presence in the dio? + */ + if (sdio->cur_page == page && + sdio->cur_page_offset + sdio->cur_page_len == offset && + sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { + sdio->cur_page_len += len; + goto out; + } + + /* + * If there's a deferred page already there then send it. + */ + if (sdio->cur_page) { + ret = dio_send_cur_page(dio, sdio, map_bh); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; + if (ret) + return ret; + } + + page_cache_get(page); /* It is in dio */ + sdio->cur_page = page; + sdio->cur_page_offset = offset; + sdio->cur_page_len = len; + sdio->cur_page_block = blocknr; + sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; +out: + /* + * If sdio->boundary then we want to schedule the IO now to + * avoid metadata seeks. + */ + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio, map_bh); + dio_bio_submit(dio, sdio); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; + } + return ret; +} + +/* + * Clean any dirty buffers in the blockdev mapping which alias newly-created + * file blocks. Only called for S_ISREG files - blockdevs do not set + * buffer_new + */ +static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) +{ + unsigned i; + unsigned nblocks; + + nblocks = map_bh->b_size >> dio->inode->i_blkbits; + + for (i = 0; i < nblocks; i++) { + unmap_underlying_metadata(map_bh->b_bdev, + map_bh->b_blocknr + i); + } +} + +/* + * If we are not writing the entire block and get_block() allocated + * the block for us, we need to fill-in the unused portion of the + * block with zeros. This happens only if user-buffer, fileoffset or + * io length is not filesystem block-size multiple. + * + * `end' is zero if we're doing the start of the IO, 1 at the end of the + * IO. + */ +static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, + int end, struct buffer_head *map_bh) +{ + unsigned dio_blocks_per_fs_block; + unsigned this_chunk_blocks; /* In dio_blocks */ + unsigned this_chunk_bytes; + struct page *page; + + sdio->start_zero_done = 1; + if (!sdio->blkfactor || !buffer_new(map_bh)) + return; + + dio_blocks_per_fs_block = 1 << sdio->blkfactor; + this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); + + if (!this_chunk_blocks) + return; + + /* + * We need to zero out part of an fs block. It is either at the + * beginning or the end of the fs block. + */ + if (end) + this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; + + this_chunk_bytes = this_chunk_blocks << sdio->blkbits; + + page = ZERO_PAGE(0); + if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, + sdio->next_block_for_io, map_bh)) + return; + + sdio->next_block_for_io += this_chunk_blocks; +} + +/* + * Walk the user pages, and the file, mapping blocks to disk and generating + * a sequence of (page,offset,len,block) mappings. These mappings are injected + * into submit_page_section(), which takes care of the next stage of submission + * + * Direct IO against a blockdev is different from a file. Because we can + * happily perform page-sized but 512-byte aligned IOs. It is important that + * blockdev IO be able to have fine alignment and large sizes. + * + * So what we do is to permit the ->get_block function to populate bh.b_size + * with the size of IO which is permitted at this offset and this i_blkbits. + * + * For best results, the blockdev should be set up with 512-byte i_blkbits and + * it should set b_size to PAGE_SIZE or more inside get_block(). This gives + * fine alignment but still allows this function to work in PAGE_SIZE units. + */ +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) +{ + const unsigned blkbits = sdio->blkbits; + int ret = 0; + + while (sdio->block_in_file < sdio->final_block_in_request) { + struct page *page; + size_t from, to; + + page = dio_get_page(dio, sdio); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + from = sdio->head ? 0 : sdio->from; + to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE; + sdio->head++; + + while (from < to) { + unsigned this_chunk_bytes; /* # of bytes mapped */ + unsigned this_chunk_blocks; /* # of blocks */ + unsigned u; + + if (sdio->blocks_available == 0) { + /* + * Need to go and map some more disk + */ + unsigned long blkmask; + unsigned long dio_remainder; + + ret = get_more_blocks(dio, sdio, map_bh); + if (ret) { + page_cache_release(page); + goto out; + } + if (!buffer_mapped(map_bh)) + goto do_holes; + + sdio->blocks_available = + map_bh->b_size >> sdio->blkbits; + sdio->next_block_for_io = + map_bh->b_blocknr << sdio->blkfactor; + if (buffer_new(map_bh)) + clean_blockdev_aliases(dio, map_bh); + + if (!sdio->blkfactor) + goto do_holes; + + blkmask = (1 << sdio->blkfactor) - 1; + dio_remainder = (sdio->block_in_file & blkmask); + + /* + * If we are at the start of IO and that IO + * starts partway into a fs-block, + * dio_remainder will be non-zero. If the IO + * is a read then we can simply advance the IO + * cursor to the first block which is to be + * read. But if the IO is a write and the + * block was newly allocated we cannot do that; + * the start of the fs block must be zeroed out + * on-disk + */ + if (!buffer_new(map_bh)) + sdio->next_block_for_io += dio_remainder; + sdio->blocks_available -= dio_remainder; + } +do_holes: + /* Handle holes */ + if (!buffer_mapped(map_bh)) { + loff_t i_size_aligned; + + /* AKPM: eargh, -ENOTBLK is a hack */ + if (dio->rw & WRITE) { + page_cache_release(page); + return -ENOTBLK; + } + + /* + * Be sure to account for a partial block as the + * last block in the file + */ + i_size_aligned = ALIGN(i_size_read(dio->inode), + 1 << blkbits); + if (sdio->block_in_file >= + i_size_aligned >> blkbits) { + /* We hit eof */ + page_cache_release(page); + goto out; + } + zero_user(page, from, 1 << blkbits); + sdio->block_in_file++; + from += 1 << blkbits; + dio->result += 1 << blkbits; + goto next_block; + } + + /* + * If we're performing IO which has an alignment which + * is finer than the underlying fs, go check to see if + * we must zero out the start of this block. + */ + if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) + dio_zero_block(dio, sdio, 0, map_bh); + + /* + * Work out, in this_chunk_blocks, how much disk we + * can add to this page + */ + this_chunk_blocks = sdio->blocks_available; + u = (to - from) >> blkbits; + if (this_chunk_blocks > u) + this_chunk_blocks = u; + u = sdio->final_block_in_request - sdio->block_in_file; + if (this_chunk_blocks > u) + this_chunk_blocks = u; + this_chunk_bytes = this_chunk_blocks << blkbits; + BUG_ON(this_chunk_bytes == 0); + + if (this_chunk_blocks == sdio->blocks_available) + sdio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, sdio, page, + from, + this_chunk_bytes, + sdio->next_block_for_io, + map_bh); + if (ret) { + page_cache_release(page); + goto out; + } + sdio->next_block_for_io += this_chunk_blocks; + + sdio->block_in_file += this_chunk_blocks; + from += this_chunk_bytes; + dio->result += this_chunk_bytes; + sdio->blocks_available -= this_chunk_blocks; +next_block: + BUG_ON(sdio->block_in_file > sdio->final_block_in_request); + if (sdio->block_in_file == sdio->final_block_in_request) + break; + } + + /* Drop the ref which was taken in get_user_pages() */ + page_cache_release(page); + } +out: + return ret; +} + +static inline int drop_refcount(struct dio *dio) +{ + int ret2; + unsigned long flags; + + /* + * Sync will always be dropping the final ref and completing the + * operation. AIO can if it was a broken operation described above or + * in fact if all the bios race to complete before we get here. In + * that case dio_complete() translates the EIOCBQUEUED into the proper + * return code that the caller will hand to ->complete(). + * + * This is managed by the bio_lock instead of being an atomic_t so that + * completion paths can drop their ref and use the remaining count to + * decide to wake the submission path atomically. + */ + spin_lock_irqsave(&dio->bio_lock, flags); + ret2 = --dio->refcount; + spin_unlock_irqrestore(&dio->bio_lock, flags); + return ret2; +} + +/* + * This is a library function for use by filesystem drivers. + * + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. + * For writes this function is called under i_mutex and returns with + * i_mutex held, for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * + * To help with locking against truncate we incremented the i_dio_count + * counter before starting direct I/O, and decrement it once we are done. + * Truncate can wait for it to reach zero to provide exclusion. It is + * expected that filesystem provide exclusion between new direct I/O + * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, + * but other filesystems need to take care of this on their own. + * + * NOTE: if you pass "sdio" to anything by pointer make sure that function + * is always inlined. Otherwise gcc is unable to split the structure into + * individual fields and will generate much worse code. This is important + * for the whole file. + */ +static inline ssize_t +do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, struct iov_iter *iter, + loff_t offset, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; + unsigned blocksize_mask = (1 << blkbits) - 1; + ssize_t retval = -EINVAL; + size_t count = iov_iter_count(iter); + loff_t end = offset + count; + struct dio *dio; + struct dio_submit sdio = { 0, }; + struct buffer_head map_bh = { 0, }; + struct blk_plug plug; + unsigned long align = offset | iov_iter_alignment(iter); + + /* + * Avoid references to bdev if not absolutely needed to give + * the early prefetch in the caller enough time. + */ + + if (align & blocksize_mask) { + if (bdev) + blkbits = blksize_bits(bdev_logical_block_size(bdev)); + blocksize_mask = (1 << blkbits) - 1; + if (align & blocksize_mask) + goto out; + } + + /* watch out for a 0 len io from a tricksy fs */ + if (iov_iter_rw(iter) == READ && !iov_iter_count(iter)) + return 0; + + dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); + retval = -ENOMEM; + if (!dio) + goto out; + /* + * Believe it or not, zeroing out the page array caused a .5% + * performance regression in a database benchmark. So, we take + * care to only zero out what's needed. + */ + memset(dio, 0, offsetof(struct dio, pages)); + + dio->flags = flags; + if (dio->flags & DIO_LOCKING) { + if (iov_iter_rw(iter) == READ) { + struct address_space *mapping = + iocb->ki_filp->f_mapping; + + /* will be released by direct_io_worker */ + mutex_lock(&inode->i_mutex); + + retval = filemap_write_and_wait_range(mapping, offset, + end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + kmem_cache_free(dio_cache, dio); + goto out; + } + } + } + + /* + * For file extending writes updating i_size before data writeouts + * complete can expose uninitialized blocks in dumb filesystems. + * In that case we need to wait for I/O completion even if asked + * for an asynchronous write. + */ + if (is_sync_kiocb(iocb)) + dio->is_async = false; + else if (!(dio->flags & DIO_ASYNC_EXTEND) && + iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) + dio->is_async = false; + else + dio->is_async = true; + + dio->inode = inode; + dio->rw = iov_iter_rw(iter) == WRITE ? WRITE_ODIRECT : READ; + + /* + * For AIO O_(D)SYNC writes we need to defer completions to a workqueue + * so that we can call ->fsync. + */ + if (dio->is_async && iov_iter_rw(iter) == WRITE && + ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host))) { + retval = dio_set_defer_completion(dio); + if (retval) { + /* + * We grab i_mutex only for reads so we don't have + * to release it here + */ + kmem_cache_free(dio_cache, dio); + goto out; + } + } + + /* + * Will be decremented at I/O completion time. + */ + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); + + retval = 0; + sdio.blkbits = blkbits; + sdio.blkfactor = i_blkbits - blkbits; + sdio.block_in_file = offset >> blkbits; + + sdio.get_block = get_block; + dio->end_io = end_io; + sdio.submit_io = submit_io; + sdio.final_block_in_bio = -1; + sdio.next_block_for_io = -1; + + dio->iocb = iocb; + dio->i_size = i_size_read(inode); + + spin_lock_init(&dio->bio_lock); + dio->refcount = 1; + + sdio.iter = iter; + sdio.final_block_in_request = + (offset + iov_iter_count(iter)) >> blkbits; + + /* + * In case of non-aligned buffers, we may need 2 more + * pages since we need to zero out first and last block. + */ + if (unlikely(sdio.blkfactor)) + sdio.pages_in_io = 2; + + sdio.pages_in_io += iov_iter_npages(iter, INT_MAX); + + blk_start_plug(&plug); + + retval = do_direct_IO(dio, &sdio, &map_bh); + if (retval) + dio_cleanup(dio, &sdio); + + if (retval == -ENOTBLK) { + /* + * The remaining part of the request will be + * be handled by buffered I/O when we return + */ + retval = 0; + } + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. + */ + dio_zero_block(dio, &sdio, 1, &map_bh); + + if (sdio.cur_page) { + ssize_t ret2; + + ret2 = dio_send_cur_page(dio, &sdio, &map_bh); + if (retval == 0) + retval = ret2; + page_cache_release(sdio.cur_page); + sdio.cur_page = NULL; + } + if (sdio.bio) + dio_bio_submit(dio, &sdio); + + blk_finish_plug(&plug); + + /* + * It is possible that, we return short IO due to end of file. + * In that case, we need to release all the pages we got hold on. + */ + dio_cleanup(dio, &sdio); + + /* + * All block lookups have been performed. For READ requests + * we can let i_mutex go now that its achieved its purpose + * of protecting us from looking up uninitialized blocks. + */ + if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) + mutex_unlock(&dio->inode->i_mutex); + + /* + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write have been setup. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. + */ + BUG_ON(retval == -EIOCBQUEUED); + if (dio->is_async && retval == 0 && dio->result && + (iov_iter_rw(iter) == READ || dio->result == count)) + retval = -EIOCBQUEUED; + else + dio_await_completion(dio); + + if (drop_refcount(dio) == 0) { + retval = dio_complete(dio, offset, retval, false); + } else + BUG_ON(retval != -EIOCBQUEUED); + +out: + return retval; +} + +ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, struct iov_iter *iter, + loff_t offset, get_block_t get_block, + dio_iodone_t end_io, dio_submit_t submit_io, + int flags) +{ + /* + * The block device state is needed in the end to finally + * submit everything. Since it's likely to be cache cold + * prefetch it here as first thing to hide some of the + * latency. + * + * Attempt to prefetch the pieces we likely need later. + */ + prefetch(&bdev->bd_disk->part_tbl); + prefetch(bdev->bd_queue); + prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + + return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block, + end_io, submit_io, flags); +} + +EXPORT_SYMBOL(__blockdev_direct_IO); + +static __init int dio_init(void) +{ + dio_cache = KMEM_CACHE(dio, SLAB_PANIC); + return 0; +} +module_init(dio_init) diff --git a/kernel/fs/dlm/Kconfig b/kernel/fs/dlm/Kconfig new file mode 100644 index 000000000..e4242c3f8 --- /dev/null +++ b/kernel/fs/dlm/Kconfig @@ -0,0 +1,16 @@ +menuconfig DLM + tristate "Distributed Lock Manager (DLM)" + depends on INET + depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) + select IP_SCTP + help + A general purpose distributed lock manager for kernel or userspace + applications. + +config DLM_DEBUG + bool "DLM debugging" + depends on DLM + help + Under the debugfs mount point, the name of each lockspace will + appear as a file in the "dlm" directory. The output is the + list of resource and locks the local node knows about. diff --git a/kernel/fs/dlm/Makefile b/kernel/fs/dlm/Makefile new file mode 100644 index 000000000..ca1c9124c --- /dev/null +++ b/kernel/fs/dlm/Makefile @@ -0,0 +1,21 @@ +obj-$(CONFIG_DLM) += dlm.o +dlm-y := ast.o \ + config.o \ + dir.o \ + lock.o \ + lockspace.o \ + main.o \ + member.o \ + memory.o \ + midcomms.o \ + netlink.o \ + lowcomms.o \ + plock.o \ + rcom.o \ + recover.o \ + recoverd.o \ + requestqueue.o \ + user.o \ + util.o +dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o + diff --git a/kernel/fs/dlm/ast.c b/kernel/fs/dlm/ast.c new file mode 100644 index 000000000..dcea1e37a --- /dev/null +++ b/kernel/fs/dlm/ast.c @@ -0,0 +1,314 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lock.h" +#include "user.h" +#include "ast.h" + +static uint64_t dlm_cb_seq; +static DEFINE_SPINLOCK(dlm_cb_seq_spin); + +static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) +{ + int i; + + log_print("last_bast %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_last_bast.seq, + lkb->lkb_last_bast.flags, + lkb->lkb_last_bast.mode, + lkb->lkb_last_bast.sb_status, + lkb->lkb_last_bast.sb_flags); + + log_print("last_cast %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_last_cast.seq, + lkb->lkb_last_cast.flags, + lkb->lkb_last_cast.mode, + lkb->lkb_last_cast.sb_status, + lkb->lkb_last_cast.sb_flags); + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + log_print("cb %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_callbacks[i].seq, + lkb->lkb_callbacks[i].flags, + lkb->lkb_callbacks[i].mode, + lkb->lkb_callbacks[i].sb_status, + lkb->lkb_callbacks[i].sb_flags); + } +} + +int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + uint64_t prev_seq; + int prev_mode; + int i, rv; + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (lkb->lkb_callbacks[i].seq) + continue; + + /* + * Suppress some redundant basts here, do more on removal. + * Don't even add a bast if the callback just before it + * is a bast for the same mode or a more restrictive mode. + * (the addional > PR check is needed for PR/CW inversion) + */ + + if ((i > 0) && (flags & DLM_CB_BAST) && + (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) { + + prev_seq = lkb->lkb_callbacks[i-1].seq; + prev_mode = lkb->lkb_callbacks[i-1].mode; + + if ((prev_mode == mode) || + (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { + + log_debug(ls, "skip %x add bast %llu mode %d " + "for bast %llu mode %d", + lkb->lkb_id, + (unsigned long long)seq, + mode, + (unsigned long long)prev_seq, + prev_mode); + rv = 0; + goto out; + } + } + + lkb->lkb_callbacks[i].seq = seq; + lkb->lkb_callbacks[i].flags = flags; + lkb->lkb_callbacks[i].mode = mode; + lkb->lkb_callbacks[i].sb_status = status; + lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); + rv = 0; + break; + } + + if (i == DLM_CALLBACKS_SIZE) { + log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, (unsigned long long)seq, + flags, mode, status, sbflags); + dlm_dump_lkb_callbacks(lkb); + rv = -1; + goto out; + } + out: + return rv; +} + +int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_callback *cb, int *resid) +{ + int i, rv; + + *resid = 0; + + if (!lkb->lkb_callbacks[0].seq) { + rv = -ENOENT; + goto out; + } + + /* oldest undelivered cb is callbacks[0] */ + + memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback)); + memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback)); + + /* shift others down */ + + for (i = 1; i < DLM_CALLBACKS_SIZE; i++) { + if (!lkb->lkb_callbacks[i].seq) + break; + memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i], + sizeof(struct dlm_callback)); + memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback)); + (*resid)++; + } + + /* if cb is a bast, it should be skipped if the blocking mode is + compatible with the last granted mode */ + + if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) { + if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) { + cb->flags |= DLM_CB_SKIP; + + log_debug(ls, "skip %x bast %llu mode %d " + "for cast %llu mode %d", + lkb->lkb_id, + (unsigned long long)cb->seq, + cb->mode, + (unsigned long long)lkb->lkb_last_cast.seq, + lkb->lkb_last_cast.mode); + rv = 0; + goto out; + } + } + + if (cb->flags & DLM_CB_CAST) { + memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); + lkb->lkb_last_cast_time = ktime_get(); + } + + if (cb->flags & DLM_CB_BAST) { + memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); + lkb->lkb_last_bast_time = ktime_get(); + } + rv = 0; + out: + return rv; +} + +void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + uint64_t new_seq, prev_seq; + int rv; + + spin_lock(&dlm_cb_seq_spin); + new_seq = ++dlm_cb_seq; + spin_unlock(&dlm_cb_seq_spin); + + if (lkb->lkb_flags & DLM_IFL_USER) { + dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq); + return; + } + + mutex_lock(&lkb->lkb_cb_mutex); + prev_seq = lkb->lkb_callbacks[0].seq; + + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); + if (rv < 0) + goto out; + + if (!prev_seq) { + kref_get(&lkb->lkb_ref); + + if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { + mutex_lock(&ls->ls_cb_mutex); + list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); + mutex_unlock(&ls->ls_cb_mutex); + } else { + queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); + } + } + out: + mutex_unlock(&lkb->lkb_cb_mutex); +} + +void dlm_callback_work(struct work_struct *work) +{ + struct dlm_lkb *lkb = container_of(work, struct dlm_lkb, lkb_cb_work); + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + void (*castfn) (void *astparam); + void (*bastfn) (void *astparam, int mode); + struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; + int i, rv, resid; + + memset(&callbacks, 0, sizeof(callbacks)); + + mutex_lock(&lkb->lkb_cb_mutex); + if (!lkb->lkb_callbacks[0].seq) { + /* no callback work exists, shouldn't happen */ + log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); + dlm_print_lkb(lkb); + dlm_dump_lkb_callbacks(lkb); + } + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); + if (rv < 0) + break; + } + + if (resid) { + /* cbs remain, loop should have removed all, shouldn't happen */ + log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id, + resid); + dlm_print_lkb(lkb); + dlm_dump_lkb_callbacks(lkb); + } + mutex_unlock(&lkb->lkb_cb_mutex); + + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (!callbacks[i].seq) + break; + if (callbacks[i].flags & DLM_CB_SKIP) { + continue; + } else if (callbacks[i].flags & DLM_CB_BAST) { + bastfn(lkb->lkb_astparam, callbacks[i].mode); + } else if (callbacks[i].flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = callbacks[i].sb_status; + lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + castfn(lkb->lkb_astparam); + } + } + + /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ + dlm_put_lkb(lkb); +} + +int dlm_callback_start(struct dlm_ls *ls) +{ + ls->ls_callback_wq = alloc_workqueue("dlm_callback", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + if (!ls->ls_callback_wq) { + log_print("can't start dlm_callback workqueue"); + return -ENOMEM; + } + return 0; +} + +void dlm_callback_stop(struct dlm_ls *ls) +{ + if (ls->ls_callback_wq) + destroy_workqueue(ls->ls_callback_wq); +} + +void dlm_callback_suspend(struct dlm_ls *ls) +{ + set_bit(LSFL_CB_DELAY, &ls->ls_flags); + + if (ls->ls_callback_wq) + flush_workqueue(ls->ls_callback_wq); +} + +void dlm_callback_resume(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + int count = 0; + + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); + + if (!ls->ls_callback_wq) + return; + + mutex_lock(&ls->ls_cb_mutex); + list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { + list_del_init(&lkb->lkb_cb_list); + queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); + count++; + } + mutex_unlock(&ls->ls_cb_mutex); + + if (count) + log_rinfo(ls, "dlm_callback_resume %d", count); +} + diff --git a/kernel/fs/dlm/ast.h b/kernel/fs/dlm/ast.h new file mode 100644 index 000000000..757b551c6 --- /dev/null +++ b/kernel/fs/dlm/ast.h @@ -0,0 +1,32 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __ASTD_DOT_H__ +#define __ASTD_DOT_H__ + +void dlm_del_ast(struct dlm_lkb *lkb); +int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq); +int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_callback *cb, int *resid); +void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags); + +void dlm_callback_work(struct work_struct *work); +int dlm_callback_start(struct dlm_ls *ls); +void dlm_callback_stop(struct dlm_ls *ls); +void dlm_callback_suspend(struct dlm_ls *ls); +void dlm_callback_resume(struct dlm_ls *ls); + +#endif + + diff --git a/kernel/fs/dlm/config.c b/kernel/fs/dlm/config.c new file mode 100644 index 000000000..d521bddf8 --- /dev/null +++ b/kernel/fs/dlm/config.c @@ -0,0 +1,1040 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "lowcomms.h" + +/* + * /config/dlm//spaces//nodes//nodeid + * /config/dlm//spaces//nodes//weight + * /config/dlm//comms//nodeid + * /config/dlm//comms//local + * /config/dlm//comms//addr (write only) + * /config/dlm//comms//addr_list (read only) + * The level is useless, but I haven't figured out how to avoid it. + */ + +static struct config_group *space_list; +static struct config_group *comm_list; +static struct dlm_comm *local_comm; +static uint32_t dlm_comm_count; + +struct dlm_clusters; +struct dlm_cluster; +struct dlm_spaces; +struct dlm_space; +struct dlm_comms; +struct dlm_comm; +struct dlm_nodes; +struct dlm_node; + +static struct config_group *make_cluster(struct config_group *, const char *); +static void drop_cluster(struct config_group *, struct config_item *); +static void release_cluster(struct config_item *); +static struct config_group *make_space(struct config_group *, const char *); +static void drop_space(struct config_group *, struct config_item *); +static void release_space(struct config_item *); +static struct config_item *make_comm(struct config_group *, const char *); +static void drop_comm(struct config_group *, struct config_item *); +static void release_comm(struct config_item *); +static struct config_item *make_node(struct config_group *, const char *); +static void drop_node(struct config_group *, struct config_item *); +static void release_node(struct config_item *); + +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len); +static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); + +static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf); +static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_local_read(struct dlm_comm *cm, char *buf); +static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf); +static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf); +static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, + size_t len); +static ssize_t node_weight_read(struct dlm_node *nd, char *buf); +static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, + size_t len); + +struct dlm_cluster { + struct config_group group; + unsigned int cl_tcp_port; + unsigned int cl_buffer_size; + unsigned int cl_rsbtbl_size; + unsigned int cl_recover_timer; + unsigned int cl_toss_secs; + unsigned int cl_scan_secs; + unsigned int cl_log_debug; + unsigned int cl_protocol; + unsigned int cl_timewarn_cs; + unsigned int cl_waitwarn_us; + unsigned int cl_new_rsb_count; + unsigned int cl_recover_callbacks; + char cl_cluster_name[DLM_LOCKSPACE_LEN]; +}; + +enum { + CLUSTER_ATTR_TCP_PORT = 0, + CLUSTER_ATTR_BUFFER_SIZE, + CLUSTER_ATTR_RSBTBL_SIZE, + CLUSTER_ATTR_RECOVER_TIMER, + CLUSTER_ATTR_TOSS_SECS, + CLUSTER_ATTR_SCAN_SECS, + CLUSTER_ATTR_LOG_DEBUG, + CLUSTER_ATTR_PROTOCOL, + CLUSTER_ATTR_TIMEWARN_CS, + CLUSTER_ATTR_WAITWARN_US, + CLUSTER_ATTR_NEW_RSB_COUNT, + CLUSTER_ATTR_RECOVER_CALLBACKS, + CLUSTER_ATTR_CLUSTER_NAME, +}; + +struct cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_cluster *, char *); + ssize_t (*store)(struct dlm_cluster *, const char *, size_t); +}; + +static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) +{ + return sprintf(buf, "%s\n", cl->cl_cluster_name); +} + +static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, + const char *buf, size_t len) +{ + strlcpy(dlm_config.ci_cluster_name, buf, + sizeof(dlm_config.ci_cluster_name)); + strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); + return len; +} + +static struct cluster_attribute cluster_attr_cluster_name = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "cluster_name", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = cluster_cluster_name_read, + .store = cluster_cluster_name_write, +}; + +static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, + int *info_field, int check_zero, + const char *buf, size_t len) +{ + unsigned int x; + int rc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + rc = kstrtouint(buf, 0, &x); + if (rc) + return rc; + + if (check_zero && !x) + return -EINVAL; + + *cl_field = x; + *info_field = x; + + return len; +} + +#define CLUSTER_ATTR(name, check_zero) \ +static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \ +{ \ + return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ + check_zero, buf, len); \ +} \ +static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ +} \ +static struct cluster_attribute cluster_attr_##name = \ +__CONFIGFS_ATTR(name, 0644, name##_read, name##_write) + +CLUSTER_ATTR(tcp_port, 1); +CLUSTER_ATTR(buffer_size, 1); +CLUSTER_ATTR(rsbtbl_size, 1); +CLUSTER_ATTR(recover_timer, 1); +CLUSTER_ATTR(toss_secs, 1); +CLUSTER_ATTR(scan_secs, 1); +CLUSTER_ATTR(log_debug, 0); +CLUSTER_ATTR(protocol, 0); +CLUSTER_ATTR(timewarn_cs, 1); +CLUSTER_ATTR(waitwarn_us, 0); +CLUSTER_ATTR(new_rsb_count, 0); +CLUSTER_ATTR(recover_callbacks, 0); + +static struct configfs_attribute *cluster_attrs[] = { + [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, + [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, + [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, + [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, + [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, + [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, + [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, + [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, + [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, + [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, + [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, + [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr, + [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr, + NULL, +}; + +enum { + COMM_ATTR_NODEID = 0, + COMM_ATTR_LOCAL, + COMM_ATTR_ADDR, + COMM_ATTR_ADDR_LIST, +}; + +struct comm_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_comm *, char *); + ssize_t (*store)(struct dlm_comm *, const char *, size_t); +}; + +static struct comm_attribute comm_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "nodeid", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_nodeid_read, + .store = comm_nodeid_write, +}; + +static struct comm_attribute comm_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_local_read, + .store = comm_local_write, +}; + +static struct comm_attribute comm_attr_addr = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "addr", + .ca_mode = S_IWUSR }, + .store = comm_addr_write, +}; + +static struct comm_attribute comm_attr_addr_list = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "addr_list", + .ca_mode = S_IRUGO }, + .show = comm_addr_list_read, +}; + +static struct configfs_attribute *comm_attrs[] = { + [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr, + [COMM_ATTR_LOCAL] = &comm_attr_local.attr, + [COMM_ATTR_ADDR] = &comm_attr_addr.attr, + [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr, + NULL, +}; + +enum { + NODE_ATTR_NODEID = 0, + NODE_ATTR_WEIGHT, +}; + +struct node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_node *, char *); + ssize_t (*store)(struct dlm_node *, const char *, size_t); +}; + +static struct node_attribute node_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "nodeid", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = node_nodeid_read, + .store = node_nodeid_write, +}; + +static struct node_attribute node_attr_weight = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "weight", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = node_weight_read, + .store = node_weight_write, +}; + +static struct configfs_attribute *node_attrs[] = { + [NODE_ATTR_NODEID] = &node_attr_nodeid.attr, + [NODE_ATTR_WEIGHT] = &node_attr_weight.attr, + NULL, +}; + +struct dlm_clusters { + struct configfs_subsystem subsys; +}; + +struct dlm_spaces { + struct config_group ss_group; +}; + +struct dlm_space { + struct config_group group; + struct list_head members; + struct mutex members_lock; + int members_count; +}; + +struct dlm_comms { + struct config_group cs_group; +}; + +struct dlm_comm { + struct config_item item; + int seq; + int nodeid; + int local; + int addr_count; + struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; +}; + +struct dlm_nodes { + struct config_group ns_group; +}; + +struct dlm_node { + struct config_item item; + struct list_head list; /* space->members */ + int nodeid; + int weight; + int new; + int comm_seq; /* copy of cm->seq when nd->nodeid is set */ +}; + +static struct configfs_group_operations clusters_ops = { + .make_group = make_cluster, + .drop_item = drop_cluster, +}; + +static struct configfs_item_operations cluster_ops = { + .release = release_cluster, + .show_attribute = show_cluster, + .store_attribute = store_cluster, +}; + +static struct configfs_group_operations spaces_ops = { + .make_group = make_space, + .drop_item = drop_space, +}; + +static struct configfs_item_operations space_ops = { + .release = release_space, +}; + +static struct configfs_group_operations comms_ops = { + .make_item = make_comm, + .drop_item = drop_comm, +}; + +static struct configfs_item_operations comm_ops = { + .release = release_comm, + .show_attribute = show_comm, + .store_attribute = store_comm, +}; + +static struct configfs_group_operations nodes_ops = { + .make_item = make_node, + .drop_item = drop_node, +}; + +static struct configfs_item_operations node_ops = { + .release = release_node, + .show_attribute = show_node, + .store_attribute = store_node, +}; + +static struct config_item_type clusters_type = { + .ct_group_ops = &clusters_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type cluster_type = { + .ct_item_ops = &cluster_ops, + .ct_attrs = cluster_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type spaces_type = { + .ct_group_ops = &spaces_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type space_type = { + .ct_item_ops = &space_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type comms_type = { + .ct_group_ops = &comms_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type comm_type = { + .ct_item_ops = &comm_ops, + .ct_attrs = comm_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type nodes_type = { + .ct_group_ops = &nodes_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type node_type = { + .ct_item_ops = &node_ops, + .ct_attrs = node_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct dlm_cluster *config_item_to_cluster(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct dlm_cluster, group) : + NULL; +} + +static struct dlm_space *config_item_to_space(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct dlm_space, group) : + NULL; +} + +static struct dlm_comm *config_item_to_comm(struct config_item *i) +{ + return i ? container_of(i, struct dlm_comm, item) : NULL; +} + +static struct dlm_node *config_item_to_node(struct config_item *i) +{ + return i ? container_of(i, struct dlm_node, item) : NULL; +} + +static struct config_group *make_cluster(struct config_group *g, + const char *name) +{ + struct dlm_cluster *cl = NULL; + struct dlm_spaces *sps = NULL; + struct dlm_comms *cms = NULL; + void *gps = NULL; + + cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); + gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS); + sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); + cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); + + if (!cl || !gps || !sps || !cms) + goto fail; + + config_group_init_type_name(&cl->group, name, &cluster_type); + config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); + config_group_init_type_name(&cms->cs_group, "comms", &comms_type); + + cl->group.default_groups = gps; + cl->group.default_groups[0] = &sps->ss_group; + cl->group.default_groups[1] = &cms->cs_group; + cl->group.default_groups[2] = NULL; + + cl->cl_tcp_port = dlm_config.ci_tcp_port; + cl->cl_buffer_size = dlm_config.ci_buffer_size; + cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; + cl->cl_recover_timer = dlm_config.ci_recover_timer; + cl->cl_toss_secs = dlm_config.ci_toss_secs; + cl->cl_scan_secs = dlm_config.ci_scan_secs; + cl->cl_log_debug = dlm_config.ci_log_debug; + cl->cl_protocol = dlm_config.ci_protocol; + cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; + cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; + cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; + cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; + memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, + DLM_LOCKSPACE_LEN); + + space_list = &sps->ss_group; + comm_list = &cms->cs_group; + return &cl->group; + + fail: + kfree(cl); + kfree(gps); + kfree(sps); + kfree(cms); + return ERR_PTR(-ENOMEM); +} + +static void drop_cluster(struct config_group *g, struct config_item *i) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct config_item *tmp; + int j; + + for (j = 0; cl->group.default_groups[j]; j++) { + tmp = &cl->group.default_groups[j]->cg_item; + cl->group.default_groups[j] = NULL; + config_item_put(tmp); + } + + space_list = NULL; + comm_list = NULL; + + config_item_put(i); +} + +static void release_cluster(struct config_item *i) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + kfree(cl->group.default_groups); + kfree(cl); +} + +static struct config_group *make_space(struct config_group *g, const char *name) +{ + struct dlm_space *sp = NULL; + struct dlm_nodes *nds = NULL; + void *gps = NULL; + + sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); + gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS); + nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); + + if (!sp || !gps || !nds) + goto fail; + + config_group_init_type_name(&sp->group, name, &space_type); + config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); + + sp->group.default_groups = gps; + sp->group.default_groups[0] = &nds->ns_group; + sp->group.default_groups[1] = NULL; + + INIT_LIST_HEAD(&sp->members); + mutex_init(&sp->members_lock); + sp->members_count = 0; + return &sp->group; + + fail: + kfree(sp); + kfree(gps); + kfree(nds); + return ERR_PTR(-ENOMEM); +} + +static void drop_space(struct config_group *g, struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(i); + struct config_item *tmp; + int j; + + /* assert list_empty(&sp->members) */ + + for (j = 0; sp->group.default_groups[j]; j++) { + tmp = &sp->group.default_groups[j]->cg_item; + sp->group.default_groups[j] = NULL; + config_item_put(tmp); + } + + config_item_put(i); +} + +static void release_space(struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(i); + kfree(sp->group.default_groups); + kfree(sp); +} + +static struct config_item *make_comm(struct config_group *g, const char *name) +{ + struct dlm_comm *cm; + + cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); + if (!cm) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&cm->item, name, &comm_type); + + cm->seq = dlm_comm_count++; + if (!cm->seq) + cm->seq = dlm_comm_count++; + + cm->nodeid = -1; + cm->local = 0; + cm->addr_count = 0; + return &cm->item; +} + +static void drop_comm(struct config_group *g, struct config_item *i) +{ + struct dlm_comm *cm = config_item_to_comm(i); + if (local_comm == cm) + local_comm = NULL; + dlm_lowcomms_close(cm->nodeid); + while (cm->addr_count--) + kfree(cm->addr[cm->addr_count]); + config_item_put(i); +} + +static void release_comm(struct config_item *i) +{ + struct dlm_comm *cm = config_item_to_comm(i); + kfree(cm); +} + +static struct config_item *make_node(struct config_group *g, const char *name) +{ + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd; + + nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); + if (!nd) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&nd->item, name, &node_type); + nd->nodeid = -1; + nd->weight = 1; /* default weight of 1 if none is set */ + nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */ + + mutex_lock(&sp->members_lock); + list_add(&nd->list, &sp->members); + sp->members_count++; + mutex_unlock(&sp->members_lock); + + return &nd->item; +} + +static void drop_node(struct config_group *g, struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd = config_item_to_node(i); + + mutex_lock(&sp->members_lock); + list_del(&nd->list); + sp->members_count--; + mutex_unlock(&sp->members_lock); + + config_item_put(i); +} + +static void release_node(struct config_item *i) +{ + struct dlm_node *nd = config_item_to_node(i); + kfree(nd); +} + +static struct dlm_clusters clusters_root = { + .subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "dlm", + .ci_type = &clusters_type, + }, + }, + }, +}; + +int __init dlm_config_init(void) +{ + config_group_init(&clusters_root.subsys.su_group); + mutex_init(&clusters_root.subsys.su_mutex); + return configfs_register_subsystem(&clusters_root.subsys); +} + +void dlm_config_exit(void) +{ + configfs_unregister_subsystem(&clusters_root.subsys); +} + +/* + * Functions for user space to read/write attributes + */ + +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->show ? cla->show(cl, buf) : 0; +} + +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->store ? cla->store(cl, buf, len) : -EINVAL; +} + +static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_comm *cm = config_item_to_comm(i); + struct comm_attribute *cma = + container_of(a, struct comm_attribute, attr); + return cma->show ? cma->show(cm, buf) : 0; +} + +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_comm *cm = config_item_to_comm(i); + struct comm_attribute *cma = + container_of(a, struct comm_attribute, attr); + return cma->store ? cma->store(cm, buf, len) : -EINVAL; +} + +static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) +{ + return sprintf(buf, "%d\n", cm->nodeid); +} + +static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, + size_t len) +{ + int rc = kstrtoint(buf, 0, &cm->nodeid); + + if (rc) + return rc; + return len; +} + +static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) +{ + return sprintf(buf, "%d\n", cm->local); +} + +static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, + size_t len) +{ + int rc = kstrtoint(buf, 0, &cm->local); + + if (rc) + return rc; + if (cm->local && !local_comm) + local_comm = cm; + return len; +} + +static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) +{ + struct sockaddr_storage *addr; + int rv; + + if (len != sizeof(struct sockaddr_storage)) + return -EINVAL; + + if (cm->addr_count >= DLM_MAX_ADDR_COUNT) + return -ENOSPC; + + addr = kzalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + return -ENOMEM; + + memcpy(addr, buf, len); + + rv = dlm_lowcomms_addr(cm->nodeid, addr, len); + if (rv) { + kfree(addr); + return rv; + } + + cm->addr[cm->addr_count++] = addr; + return len; +} + +static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf) +{ + ssize_t s; + ssize_t allowance; + int i; + struct sockaddr_storage *addr; + struct sockaddr_in *addr_in; + struct sockaddr_in6 *addr_in6; + + /* Taken from ip6_addr_string() defined in lib/vsprintf.c */ + char buf0[sizeof("AF_INET6 xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255\n")]; + + + /* Derived from SIMPLE_ATTR_SIZE of fs/configfs/file.c */ + allowance = 4096; + buf[0] = '\0'; + + for (i = 0; i < cm->addr_count; i++) { + addr = cm->addr[i]; + + switch(addr->ss_family) { + case AF_INET: + addr_in = (struct sockaddr_in *)addr; + s = sprintf(buf0, "AF_INET %pI4\n", &addr_in->sin_addr.s_addr); + break; + case AF_INET6: + addr_in6 = (struct sockaddr_in6 *)addr; + s = sprintf(buf0, "AF_INET6 %pI6\n", &addr_in6->sin6_addr); + break; + default: + s = sprintf(buf0, "%s\n", ""); + break; + } + allowance -= s; + if (allowance >= 0) + strcat(buf, buf0); + else { + allowance += s; + break; + } + } + return 4096 - allowance; +} + +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_node *nd = config_item_to_node(i); + struct node_attribute *nda = + container_of(a, struct node_attribute, attr); + return nda->show ? nda->show(nd, buf) : 0; +} + +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_node *nd = config_item_to_node(i); + struct node_attribute *nda = + container_of(a, struct node_attribute, attr); + return nda->store ? nda->store(nd, buf, len) : -EINVAL; +} + +static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) +{ + return sprintf(buf, "%d\n", nd->nodeid); +} + +static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, + size_t len) +{ + uint32_t seq = 0; + int rc = kstrtoint(buf, 0, &nd->nodeid); + + if (rc) + return rc; + dlm_comm_seq(nd->nodeid, &seq); + nd->comm_seq = seq; + return len; +} + +static ssize_t node_weight_read(struct dlm_node *nd, char *buf) +{ + return sprintf(buf, "%d\n", nd->weight); +} + +static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, + size_t len) +{ + int rc = kstrtoint(buf, 0, &nd->weight); + + if (rc) + return rc; + return len; +} + +/* + * Functions for the dlm to get the info that's been configured + */ + +static struct dlm_space *get_space(char *name) +{ + struct config_item *i; + + if (!space_list) + return NULL; + + mutex_lock(&space_list->cg_subsys->su_mutex); + i = config_group_find_item(space_list, name); + mutex_unlock(&space_list->cg_subsys->su_mutex); + + return config_item_to_space(i); +} + +static void put_space(struct dlm_space *sp) +{ + config_item_put(&sp->group.cg_item); +} + +static struct dlm_comm *get_comm(int nodeid) +{ + struct config_item *i; + struct dlm_comm *cm = NULL; + int found = 0; + + if (!comm_list) + return NULL; + + mutex_lock(&clusters_root.subsys.su_mutex); + + list_for_each_entry(i, &comm_list->cg_children, ci_entry) { + cm = config_item_to_comm(i); + + if (cm->nodeid != nodeid) + continue; + found = 1; + config_item_get(i); + break; + } + mutex_unlock(&clusters_root.subsys.su_mutex); + + if (!found) + cm = NULL; + return cm; +} + +static void put_comm(struct dlm_comm *cm) +{ + config_item_put(&cm->item); +} + +/* caller must free mem */ +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out) +{ + struct dlm_space *sp; + struct dlm_node *nd; + struct dlm_config_node *nodes, *node; + int rv, count; + + sp = get_space(lsname); + if (!sp) + return -EEXIST; + + mutex_lock(&sp->members_lock); + if (!sp->members_count) { + rv = -EINVAL; + printk(KERN_ERR "dlm: zero members_count\n"); + goto out; + } + + count = sp->members_count; + + nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); + if (!nodes) { + rv = -ENOMEM; + goto out; + } + + node = nodes; + list_for_each_entry(nd, &sp->members, list) { + node->nodeid = nd->nodeid; + node->weight = nd->weight; + node->new = nd->new; + node->comm_seq = nd->comm_seq; + node++; + + nd->new = 0; + } + + *count_out = count; + *nodes_out = nodes; + rv = 0; + out: + mutex_unlock(&sp->members_lock); + put_space(sp); + return rv; +} + +int dlm_comm_seq(int nodeid, uint32_t *seq) +{ + struct dlm_comm *cm = get_comm(nodeid); + if (!cm) + return -EEXIST; + *seq = cm->seq; + put_comm(cm); + return 0; +} + +int dlm_our_nodeid(void) +{ + return local_comm ? local_comm->nodeid : 0; +} + +/* num 0 is first addr, num 1 is second addr */ +int dlm_our_addr(struct sockaddr_storage *addr, int num) +{ + if (!local_comm) + return -1; + if (num + 1 > local_comm->addr_count) + return -1; + memcpy(addr, local_comm->addr[num], sizeof(*addr)); + return 0; +} + +/* Config file defaults */ +#define DEFAULT_TCP_PORT 21064 +#define DEFAULT_BUFFER_SIZE 4096 +#define DEFAULT_RSBTBL_SIZE 1024 +#define DEFAULT_RECOVER_TIMER 5 +#define DEFAULT_TOSS_SECS 10 +#define DEFAULT_SCAN_SECS 5 +#define DEFAULT_LOG_DEBUG 0 +#define DEFAULT_PROTOCOL 0 +#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ +#define DEFAULT_WAITWARN_US 0 +#define DEFAULT_NEW_RSB_COUNT 128 +#define DEFAULT_RECOVER_CALLBACKS 0 +#define DEFAULT_CLUSTER_NAME "" + +struct dlm_config_info dlm_config = { + .ci_tcp_port = DEFAULT_TCP_PORT, + .ci_buffer_size = DEFAULT_BUFFER_SIZE, + .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .ci_recover_timer = DEFAULT_RECOVER_TIMER, + .ci_toss_secs = DEFAULT_TOSS_SECS, + .ci_scan_secs = DEFAULT_SCAN_SECS, + .ci_log_debug = DEFAULT_LOG_DEBUG, + .ci_protocol = DEFAULT_PROTOCOL, + .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, + .ci_waitwarn_us = DEFAULT_WAITWARN_US, + .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, + .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, + .ci_cluster_name = DEFAULT_CLUSTER_NAME +}; + diff --git a/kernel/fs/dlm/config.h b/kernel/fs/dlm/config.h new file mode 100644 index 000000000..f30697bc2 --- /dev/null +++ b/kernel/fs/dlm/config.h @@ -0,0 +1,53 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __CONFIG_DOT_H__ +#define __CONFIG_DOT_H__ + +struct dlm_config_node { + int nodeid; + int weight; + int new; + uint32_t comm_seq; +}; + +#define DLM_MAX_ADDR_COUNT 3 + +struct dlm_config_info { + int ci_tcp_port; + int ci_buffer_size; + int ci_rsbtbl_size; + int ci_recover_timer; + int ci_toss_secs; + int ci_scan_secs; + int ci_log_debug; + int ci_protocol; + int ci_timewarn_cs; + int ci_waitwarn_us; + int ci_new_rsb_count; + int ci_recover_callbacks; + char ci_cluster_name[DLM_LOCKSPACE_LEN]; +}; + +extern struct dlm_config_info dlm_config; + +int dlm_config_init(void); +void dlm_config_exit(void); +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out); +int dlm_comm_seq(int nodeid, uint32_t *seq); +int dlm_our_nodeid(void); +int dlm_our_addr(struct sockaddr_storage *addr, int num); + +#endif /* __CONFIG_DOT_H__ */ + diff --git a/kernel/fs/dlm/debug_fs.c b/kernel/fs/dlm/debug_fs.c new file mode 100644 index 000000000..eea64912c --- /dev/null +++ b/kernel/fs/dlm/debug_fs.c @@ -0,0 +1,791 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lock.h" + +#define DLM_DEBUG_BUF_LEN 4096 +static char debug_buf[DLM_DEBUG_BUF_LEN]; +static struct mutex debug_buf_lock; + +static struct dentry *dlm_root; + +static char *print_lockmode(int mode) +{ + switch (mode) { + case DLM_LOCK_IV: + return "--"; + case DLM_LOCK_NL: + return "NL"; + case DLM_LOCK_CR: + return "CR"; + case DLM_LOCK_CW: + return "CW"; + case DLM_LOCK_PR: + return "PR"; + case DLM_LOCK_PW: + return "PW"; + case DLM_LOCK_EX: + return "EX"; + default: + return "??"; + } +} + +static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) +{ + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); + + if (lkb->lkb_status == DLM_LKSTS_CONVERT || + lkb->lkb_status == DLM_LKSTS_WAITING) + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); + + if (lkb->lkb_nodeid) { + if (lkb->lkb_nodeid != res->res_nodeid) + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid, + lkb->lkb_remid); + else + seq_printf(s, " Master: %08x", lkb->lkb_remid); + } + + if (lkb->lkb_wait_type) + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); + + seq_puts(s, "\n"); +} + +static void print_format1(struct dlm_rsb *res, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; + + lock_rsb(res); + + seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length); + + for (i = 0; i < res->res_length; i++) { + if (isprint(res->res_name[i])) + seq_printf(s, "%c", res->res_name[i]); + else + seq_printf(s, "%c", '.'); + } + + if (res->res_nodeid > 0) + seq_printf(s, "\"\nLocal Copy, Master is node %d\n", + res->res_nodeid); + else if (res->res_nodeid == 0) + seq_puts(s, "\"\nMaster Copy\n"); + else if (res->res_nodeid == -1) + seq_printf(s, "\"\nLooking up master (lkid %x)\n", + res->res_first_lkid); + else + seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid); + if (seq_has_overflowed(s)) + goto out; + + /* Print the LVB: */ + if (res->res_lvbptr) { + seq_puts(s, "LVB: "); + for (i = 0; i < lvblen; i++) { + if (i == lvblen / 2) + seq_puts(s, "\n "); + seq_printf(s, "%02x ", + (unsigned char) res->res_lvbptr[i]); + } + if (rsb_flag(res, RSB_VALNOTVALID)) + seq_puts(s, " (INVALID)"); + seq_puts(s, "\n"); + if (seq_has_overflowed(s)) + goto out; + } + + root_list = !list_empty(&res->res_root_list); + recover_list = !list_empty(&res->res_recover_list); + + if (root_list || recover_list) { + seq_printf(s, "Recovery: root %d recover %d flags %lx count %d\n", + root_list, recover_list, + res->res_flags, res->res_recover_locks_count); + } + + /* Print the locks attached to this resource */ + seq_puts(s, "Granted Queue\n"); + list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) { + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) + goto out; + } + + seq_puts(s, "Conversion Queue\n"); + list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) { + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) + goto out; + } + + seq_puts(s, "Waiting Queue\n"); + list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) { + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) + goto out; + } + + if (list_empty(&res->res_lookup)) + goto out; + + seq_puts(s, "Lookup Queue\n"); + list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { + seq_printf(s, "%08x %s", + lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); + if (lkb->lkb_wait_type) + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); + seq_puts(s, "\n"); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(res); +} + +static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *r) +{ + u64 xid = 0; + u64 us; + + if (lkb->lkb_flags & DLM_IFL_USER) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + /* microseconds since lkb was added to current queue */ + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp)); + + /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us + r_nodeid r_len r_name */ + + seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + (unsigned long long)us, + r->res_nodeid, + r->res_length, + r->res_name); +} + +static void print_format2(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(r); +} + +static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, + int rsb_lookup) +{ + u64 xid = 0; + + if (lkb->lkb_flags & DLM_IFL_USER) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_last_bast.mode, + rsb_lookup, + lkb->lkb_wait_type, + lkb->lkb_lvbseq, + (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), + (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time)); +} + +static void print_format3(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = r->res_ls->ls_lvblen; + int print_name = 1; + + lock_rsb(r); + + seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", + r, + r->res_nodeid, + r->res_first_lkid, + r->res_flags, + !list_empty(&r->res_root_list), + !list_empty(&r->res_recover_list), + r->res_recover_locks_count, + r->res_length); + if (seq_has_overflowed(s)) + goto out; + + for (i = 0; i < r->res_length; i++) { + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) + print_name = 0; + } + + seq_puts(s, print_name ? "str " : "hex"); + + for (i = 0; i < r->res_length; i++) { + if (print_name) + seq_printf(s, "%c", r->res_name[i]); + else + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); + } + seq_puts(s, "\n"); + if (seq_has_overflowed(s)) + goto out; + + if (!r->res_lvbptr) + goto do_locks; + + seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen); + + for (i = 0; i < lvblen; i++) + seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); + seq_puts(s, "\n"); + if (seq_has_overflowed(s)) + goto out; + + do_locks: + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) { + print_format3_lock(s, lkb, 1); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(r); +} + +static void print_format4(struct dlm_rsb *r, struct seq_file *s) +{ + int our_nodeid = dlm_our_nodeid(); + int print_name = 1; + int i; + + lock_rsb(r); + + seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ", + r, + r->res_nodeid, + r->res_master_nodeid, + r->res_dir_nodeid, + our_nodeid, + r->res_toss_time, + r->res_flags, + r->res_length); + + for (i = 0; i < r->res_length; i++) { + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) + print_name = 0; + } + + seq_puts(s, print_name ? "str " : "hex"); + + for (i = 0; i < r->res_length; i++) { + if (print_name) + seq_printf(s, "%c", r->res_name[i]); + else + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); + } + seq_puts(s, "\n"); + + unlock_rsb(r); +} + +struct rsbtbl_iter { + struct dlm_rsb *rsb; + unsigned bucket; + int format; + int header; +}; + +/* + * If the buffer is full, seq_printf can be called again, but it + * does nothing. So, the these printing routines periodically check + * seq_has_overflowed to avoid wasting too much time trying to print to + * a full buffer. + */ + +static int table_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct rsbtbl_iter *ri = iter_ptr; + + switch (ri->format) { + case 1: + print_format1(ri->rsb, seq); + break; + case 2: + if (ri->header) { + seq_puts(seq, "id nodeid remid pid xid exflags flags sts grmode rqmode time_ms r_nodeid r_len r_name\n"); + ri->header = 0; + } + print_format2(ri->rsb, seq); + break; + case 3: + if (ri->header) { + seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + ri->header = 0; + } + print_format3(ri->rsb, seq); + break; + case 4: + if (ri->header) { + seq_puts(seq, "version 4 rsb 2\n"); + ri->header = 0; + } + print_format4(ri->rsb, seq); + break; + } + + return 0; +} + +static const struct seq_operations format1_seq_ops; +static const struct seq_operations format2_seq_ops; +static const struct seq_operations format3_seq_ops; +static const struct seq_operations format4_seq_ops; + +static void *table_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct rb_root *tree; + struct rb_node *node; + struct dlm_ls *ls = seq->private; + struct rsbtbl_iter *ri; + struct dlm_rsb *r; + loff_t n = *pos; + unsigned bucket, entry; + int toss = (seq->op == &format4_seq_ops); + + bucket = n >> 32; + entry = n & ((1LL << 32) - 1); + + if (bucket >= ls->ls_rsbtbl_size) + return NULL; + + ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS); + if (!ri) + return NULL; + if (n == 0) + ri->header = 1; + if (seq->op == &format1_seq_ops) + ri->format = 1; + if (seq->op == &format2_seq_ops) + ri->format = 2; + if (seq->op == &format3_seq_ops) + ri->format = 3; + if (seq->op == &format4_seq_ops) + ri->format = 4; + + tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!RB_EMPTY_ROOT(tree)) { + for (node = rb_first(tree); node; node = rb_next(node)) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); + if (!entry--) { + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return ri; + } + } + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + + /* + * move to the first rsb in the next non-empty bucket + */ + + /* zero the entry */ + n &= ~((1LL << 32) - 1); + + while (1) { + bucket++; + n += 1LL << 32; + + if (bucket >= ls->ls_rsbtbl_size) { + kfree(ri); + return NULL; + } + tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!RB_EMPTY_ROOT(tree)) { + node = rb_first(tree); + r = rb_entry(node, struct dlm_rsb, res_hashnode); + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + *pos = n; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + } +} + +static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) +{ + struct dlm_ls *ls = seq->private; + struct rsbtbl_iter *ri = iter_ptr; + struct rb_root *tree; + struct rb_node *next; + struct dlm_rsb *r, *rp; + loff_t n = *pos; + unsigned bucket; + int toss = (seq->op == &format4_seq_ops); + + bucket = n >> 32; + + /* + * move to the next rsb in the same bucket + */ + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + rp = ri->rsb; + next = rb_next(&rp->res_hashnode); + + if (next) { + r = rb_entry(next, struct dlm_rsb, res_hashnode); + dlm_hold_rsb(r); + ri->rsb = r; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_put_rsb(rp); + ++*pos; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_put_rsb(rp); + + /* + * move to the first rsb in the next non-empty bucket + */ + + /* zero the entry */ + n &= ~((1LL << 32) - 1); + + while (1) { + bucket++; + n += 1LL << 32; + + if (bucket >= ls->ls_rsbtbl_size) { + kfree(ri); + return NULL; + } + tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!RB_EMPTY_ROOT(tree)) { + next = rb_first(tree); + r = rb_entry(next, struct dlm_rsb, res_hashnode); + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + *pos = n; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + } +} + +static void table_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct rsbtbl_iter *ri = iter_ptr; + + if (ri) { + dlm_put_rsb(ri->rsb); + kfree(ri); + } +} + +static const struct seq_operations format1_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format2_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format3_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format4_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct file_operations format1_fops; +static const struct file_operations format2_fops; +static const struct file_operations format3_fops; +static const struct file_operations format4_fops; + +static int table_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret = -1; + + if (file->f_op == &format1_fops) + ret = seq_open(file, &format1_seq_ops); + else if (file->f_op == &format2_fops) + ret = seq_open(file, &format2_seq_ops); + else if (file->f_op == &format3_fops) + ret = seq_open(file, &format3_seq_ops); + else if (file->f_op == &format4_fops) + ret = seq_open(file, &format4_seq_ops); + + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static const struct file_operations format1_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format2_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format3_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format4_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * dump lkb's on the ls_waiters list + */ +static ssize_t waiters_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct dlm_ls *ls = file->private_data; + struct dlm_lkb *lkb; + size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv; + + mutex_lock(&debug_buf_lock); + mutex_lock(&ls->ls_waiters_mutex); + memset(debug_buf, 0, sizeof(debug_buf)); + + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n", + lkb->lkb_id, lkb->lkb_wait_type, + lkb->lkb_nodeid, lkb->lkb_resource->res_name); + if (ret >= len - pos) + break; + pos += ret; + } + mutex_unlock(&ls->ls_waiters_mutex); + + rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos); + mutex_unlock(&debug_buf_lock); + return rv; +} + +static const struct file_operations waiters_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = waiters_read, + .llseek = default_llseek, +}; + +void dlm_delete_debug_file(struct dlm_ls *ls) +{ + debugfs_remove(ls->ls_debug_rsb_dentry); + debugfs_remove(ls->ls_debug_waiters_dentry); + debugfs_remove(ls->ls_debug_locks_dentry); + debugfs_remove(ls->ls_debug_all_dentry); + debugfs_remove(ls->ls_debug_toss_dentry); +} + +int dlm_create_debug_file(struct dlm_ls *ls) +{ + char name[DLM_LOCKSPACE_LEN+8]; + + /* format 1 */ + + ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format1_fops); + if (!ls->ls_debug_rsb_dentry) + goto fail; + + /* format 2 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); + + ls->ls_debug_locks_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format2_fops); + if (!ls->ls_debug_locks_dentry) + goto fail; + + /* format 3 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); + + ls->ls_debug_all_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format3_fops); + if (!ls->ls_debug_all_dentry) + goto fail; + + /* format 4 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_toss", ls->ls_name); + + ls->ls_debug_toss_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format4_fops); + if (!ls->ls_debug_toss_dentry) + goto fail; + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); + + ls->ls_debug_waiters_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &waiters_fops); + if (!ls->ls_debug_waiters_dentry) + goto fail; + + return 0; + + fail: + dlm_delete_debug_file(ls); + return -ENOMEM; +} + +int __init dlm_register_debugfs(void) +{ + mutex_init(&debug_buf_lock); + dlm_root = debugfs_create_dir("dlm", NULL); + return dlm_root ? 0 : -ENOMEM; +} + +void dlm_unregister_debugfs(void) +{ + debugfs_remove(dlm_root); +} + diff --git a/kernel/fs/dlm/dir.c b/kernel/fs/dlm/dir.c new file mode 100644 index 000000000..d975851a7 --- /dev/null +++ b/kernel/fs/dlm/dir.c @@ -0,0 +1,308 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "lowcomms.h" +#include "rcom.h" +#include "config.h" +#include "memory.h" +#include "recover.h" +#include "util.h" +#include "lock.h" +#include "dir.h" + +/* + * We use the upper 16 bits of the hash value to select the directory node. + * Low bits are used for distribution of rsb's among hash buckets on each node. + * + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of + * num_nodes to the hash value. This value in the desired range is used as an + * offset into the sorted list of nodeid's to give the particular nodeid. + */ + +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash) +{ + uint32_t node; + + if (ls->ls_num_nodes == 1) + return dlm_our_nodeid(); + else { + node = (hash >> 16) % ls->ls_total_weight; + return ls->ls_node_array[node]; + } +} + +int dlm_dir_nodeid(struct dlm_rsb *r) +{ + return r->res_dir_nodeid; +} + +void dlm_recover_dir_nodeid(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + r->res_dir_nodeid = dlm_hash2nodeid(ls, r->res_hash); + } + up_read(&ls->ls_root_sem); +} + +int dlm_recover_directory(struct dlm_ls *ls) +{ + struct dlm_member *memb; + char *b, *last_name = NULL; + int error = -ENOMEM, last_len, nodeid, result; + uint16_t namelen; + unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0; + + log_rinfo(ls, "dlm_recover_directory"); + + if (dlm_no_directory(ls)) + goto out_status; + + last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); + if (!last_name) + goto out; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == dlm_our_nodeid()) + continue; + + memset(last_name, 0, DLM_RESNAME_MAXLEN); + last_len = 0; + + for (;;) { + int left; + error = dlm_recovery_stopped(ls); + if (error) + goto out_free; + + error = dlm_rcom_names(ls, memb->nodeid, + last_name, last_len); + if (error) + goto out_free; + + cond_resched(); + + /* + * pick namelen/name pairs out of received buffer + */ + + b = ls->ls_recover_buf->rc_buf; + left = ls->ls_recover_buf->rc_header.h_length; + left -= sizeof(struct dlm_rcom); + + for (;;) { + __be16 v; + + error = -EINVAL; + if (left < sizeof(__be16)) + goto out_free; + + memcpy(&v, b, sizeof(__be16)); + namelen = be16_to_cpu(v); + b += sizeof(__be16); + left -= sizeof(__be16); + + /* namelen of 0xFFFFF marks end of names for + this node; namelen of 0 marks end of the + buffer */ + + if (namelen == 0xFFFF) + goto done; + if (!namelen) + break; + + if (namelen > left) + goto out_free; + + if (namelen > DLM_RESNAME_MAXLEN) + goto out_free; + + error = dlm_master_lookup(ls, memb->nodeid, + b, namelen, + DLM_LU_RECOVER_DIR, + &nodeid, &result); + if (error) { + log_error(ls, "recover_dir lookup %d", + error); + goto out_free; + } + + /* The name was found in rsbtbl, but the + * master nodeid is different from + * memb->nodeid which says it is the master. + * This should not happen. */ + + if (result == DLM_LU_MATCH && + nodeid != memb->nodeid) { + count_bad++; + log_error(ls, "recover_dir lookup %d " + "nodeid %d memb %d bad %u", + result, nodeid, memb->nodeid, + count_bad); + print_hex_dump_bytes("dlm_recover_dir ", + DUMP_PREFIX_NONE, + b, namelen); + } + + /* The name was found in rsbtbl, and the + * master nodeid matches memb->nodeid. */ + + if (result == DLM_LU_MATCH && + nodeid == memb->nodeid) { + count_match++; + } + + /* The name was not found in rsbtbl and was + * added with memb->nodeid as the master. */ + + if (result == DLM_LU_ADD) { + count_add++; + } + + last_len = namelen; + memcpy(last_name, b, namelen); + b += namelen; + left -= namelen; + count++; + } + } + done: + ; + } + + out_status: + error = 0; + dlm_set_recover_status(ls, DLM_RS_DIR); + + log_rinfo(ls, "dlm_recover_directory %u in %u new", + count, count_add); + out_free: + kfree(last_name); + out: + return error; +} + +static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) +{ + struct dlm_rsb *r; + uint32_t hash, bucket; + int rv; + + hash = jhash(name, len, 0); + bucket = hash & (ls->ls_rsbtbl_size - 1); + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, &r); + if (rv) + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].toss, + name, len, &r); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + + if (!rv) + return r; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (len == r->res_length && !memcmp(name, r->res_name, len)) { + up_read(&ls->ls_root_sem); + log_debug(ls, "find_rsb_root revert to root_list %s", + r->res_name); + return r; + } + } + up_read(&ls->ls_root_sem); + return NULL; +} + +/* Find the rsb where we left off (or start again), then send rsb names + for rsb's we're master of and whose directory node matches the requesting + node. inbuf is the rsb name last sent, inlen is the name's length */ + +void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid) +{ + struct list_head *list; + struct dlm_rsb *r; + int offset = 0, dir_nodeid; + __be16 be_namelen; + + down_read(&ls->ls_root_sem); + + if (inlen > 1) { + r = find_rsb_root(ls, inbuf, inlen); + if (!r) { + inbuf[inlen - 1] = '\0'; + log_error(ls, "copy_master_names from %d start %d %s", + nodeid, inlen, inbuf); + goto out; + } + list = r->res_root_list.next; + } else { + list = ls->ls_root_list.next; + } + + for (offset = 0; list != &ls->ls_root_list; list = list->next) { + r = list_entry(list, struct dlm_rsb, res_root_list); + if (r->res_nodeid) + continue; + + dir_nodeid = dlm_dir_nodeid(r); + if (dir_nodeid != nodeid) + continue; + + /* + * The block ends when we can't fit the following in the + * remaining buffer space: + * namelen (uint16_t) + + * name (r->res_length) + + * end-of-block record 0x0000 (uint16_t) + */ + + if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { + /* Write end-of-block record */ + be_namelen = cpu_to_be16(0); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + ls->ls_recover_dir_sent_msg++; + goto out; + } + + be_namelen = cpu_to_be16(r->res_length); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + memcpy(outbuf + offset, r->res_name, r->res_length); + offset += r->res_length; + ls->ls_recover_dir_sent_res++; + } + + /* + * If we've reached the end of the list (and there's room) write a + * terminating record. + */ + + if ((list == &ls->ls_root_list) && + (offset + sizeof(uint16_t) <= outlen)) { + be_namelen = cpu_to_be16(0xFFFF); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + ls->ls_recover_dir_sent_msg++; + } + out: + up_read(&ls->ls_root_sem); +} + diff --git a/kernel/fs/dlm/dir.h b/kernel/fs/dlm/dir.h new file mode 100644 index 000000000..417506344 --- /dev/null +++ b/kernel/fs/dlm/dir.h @@ -0,0 +1,25 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +int dlm_dir_nodeid(struct dlm_rsb *rsb); +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); +void dlm_recover_dir_nodeid(struct dlm_ls *ls); +int dlm_recover_directory(struct dlm_ls *ls); +void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid); + +#endif /* __DIR_DOT_H__ */ + diff --git a/kernel/fs/dlm/dlm_internal.h b/kernel/fs/dlm/dlm_internal.h new file mode 100644 index 000000000..5eff6ea3e --- /dev/null +++ b/kernel/fs/dlm/dlm_internal.h @@ -0,0 +1,729 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DLM_INTERNAL_DOT_H__ +#define __DLM_INTERNAL_DOT_H__ + +/* + * This is the main header file to be included in each DLM source file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "config.h" + +/* Size of the temp buffer midcomms allocates on the stack. + We try to make this large enough so most messages fit. + FIXME: should sctp make this unnecessary? */ + +#define DLM_INBUF_LEN 148 + +struct dlm_ls; +struct dlm_lkb; +struct dlm_rsb; +struct dlm_member; +struct dlm_rsbtable; +struct dlm_recover; +struct dlm_header; +struct dlm_message; +struct dlm_rcom; +struct dlm_mhandle; + +#define log_print(fmt, args...) \ + printk(KERN_ERR "dlm: "fmt"\n" , ##args) +#define log_error(ls, fmt, args...) \ + printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) +#define log_rinfo(ls, fmt, args...) \ + printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args); + +#define log_debug(ls, fmt, args...) \ +do { \ + if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + +#define log_limit(ls, fmt, args...) \ +do { \ + if (dlm_config.ci_log_debug) \ + printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + +#define DLM_ASSERT(x, do) \ +{ \ + if (!(x)) \ + { \ + printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \ + "DLM: assertion: \"%s\"\n" \ + "DLM: time = %lu\n", \ + __LINE__, __FILE__, #x, jiffies); \ + {do} \ + printk("\n"); \ + BUG(); \ + panic("DLM: Record message above and reboot.\n"); \ + } \ +} + + +#define DLM_RTF_SHRINK 0x00000001 + +struct dlm_rsbtable { + struct rb_root keep; + struct rb_root toss; + spinlock_t lock; + uint32_t flags; +}; + + +/* + * Lockspace member (per node in a ls) + */ + +struct dlm_member { + struct list_head list; + int nodeid; + int weight; + int slot; + int slot_prev; + int comm_seq; + uint32_t generation; +}; + +/* + * Save and manage recovery state for a lockspace. + */ + +struct dlm_recover { + struct list_head list; + struct dlm_config_node *nodes; + int nodes_count; + uint64_t seq; +}; + +/* + * Pass input args to second stage locking function. + */ + +struct dlm_args { + uint32_t flags; + void (*astfn) (void *astparam); + void *astparam; + void (*bastfn) (void *astparam, int mode); + int mode; + struct dlm_lksb *lksb; + unsigned long timeout; +}; + + +/* + * Lock block + * + * A lock can be one of three types: + * + * local copy lock is mastered locally + * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set) + * process copy lock is mastered on a remote node + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set) + * master copy master node's copy of a lock owned by remote node + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set) + * + * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or + * dlm_unlock. The dlm does not modify these or use any private flags in + * this field; it only contains DLM_LKF_ flags from dlm.h. These flags + * are sent as-is to the remote master when the lock is remote. + * + * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h. + * Some internal flags are shared between the master and process nodes; + * these shared flags are kept in the lower two bytes. One of these + * flags set on the master copy will be propagated to the process copy + * and v.v. Other internal flags are private to the master or process + * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes. + * + * lkb_sbflags: status block flags. These flags are copied directly into + * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion + * ast. All defined in dlm.h with DLM_SBF_ prefix. + * + * lkb_status: the lock status indicates which rsb queue the lock is + * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT + * + * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a + * reply is needed. Only set when the lkb is on the lockspace waiters + * list awaiting a reply from a remote node. + * + * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb + * is a master copy, nodeid specifies the remote lock holder, when the + * lkb is a process copy, the nodeid specifies the lock master. + */ + +/* lkb_status */ + +#define DLM_LKSTS_WAITING 1 +#define DLM_LKSTS_GRANTED 2 +#define DLM_LKSTS_CONVERT 3 + +/* lkb_flags */ + +#define DLM_IFL_MSTCPY 0x00010000 +#define DLM_IFL_RESEND 0x00020000 +#define DLM_IFL_DEAD 0x00040000 +#define DLM_IFL_OVERLAP_UNLOCK 0x00080000 +#define DLM_IFL_OVERLAP_CANCEL 0x00100000 +#define DLM_IFL_ENDOFLIFE 0x00200000 +#define DLM_IFL_WATCH_TIMEWARN 0x00400000 +#define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#define DLM_IFL_DEADLOCK_CANCEL 0x01000000 +#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +#define DLM_IFL_USER 0x00000001 +#define DLM_IFL_ORPHAN 0x00000002 + +#define DLM_CALLBACKS_SIZE 6 + +#define DLM_CB_CAST 0x00000001 +#define DLM_CB_BAST 0x00000002 +#define DLM_CB_SKIP 0x00000004 + +struct dlm_callback { + uint64_t seq; + uint32_t flags; /* DLM_CBF_ */ + int sb_status; /* copy to lksb status */ + uint8_t sb_flags; /* copy to lksb flags */ + int8_t mode; /* rq mode of bast, gr mode of cast */ +}; + +struct dlm_lkb { + struct dlm_rsb *lkb_resource; /* the rsb */ + struct kref lkb_ref; + int lkb_nodeid; /* copied from rsb */ + int lkb_ownpid; /* pid of lock owner */ + uint32_t lkb_id; /* our lock ID */ + uint32_t lkb_remid; /* lock ID on remote partner */ + uint32_t lkb_exflags; /* external flags from caller */ + uint32_t lkb_sbflags; /* lksb flags */ + uint32_t lkb_flags; /* internal flags */ + uint32_t lkb_lvbseq; /* lvb sequence number */ + + int8_t lkb_status; /* granted, waiting, convert */ + int8_t lkb_rqmode; /* requested lock mode */ + int8_t lkb_grmode; /* granted lock mode */ + int8_t lkb_highbast; /* highest mode bast sent for */ + + int8_t lkb_wait_type; /* type of reply waiting for */ + int8_t lkb_wait_count; + int lkb_wait_nodeid; /* for debugging */ + + struct list_head lkb_statequeue; /* rsb g/c/w list */ + struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ + struct list_head lkb_wait_reply; /* waiting for remote reply */ + struct list_head lkb_ownqueue; /* list of locks for a process */ + struct list_head lkb_time_list; + ktime_t lkb_timestamp; + ktime_t lkb_wait_time; + unsigned long lkb_timeout_cs; + + struct mutex lkb_cb_mutex; + struct work_struct lkb_cb_work; + struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ + struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; + struct dlm_callback lkb_last_cast; + struct dlm_callback lkb_last_bast; + ktime_t lkb_last_cast_time; /* for debugging */ + ktime_t lkb_last_bast_time; /* for debugging */ + + uint64_t lkb_recover_seq; /* from ls_recover_seq */ + + char *lkb_lvbptr; + struct dlm_lksb *lkb_lksb; /* caller's status block */ + void (*lkb_astfn) (void *astparam); + void (*lkb_bastfn) (void *astparam, int mode); + union { + void *lkb_astparam; /* caller's ast arg */ + struct dlm_user_args *lkb_ua; + }; +}; + +/* + * res_master_nodeid is "normal": 0 is unset/invalid, non-zero is the real + * nodeid, even when nodeid is our_nodeid. + * + * res_nodeid is "odd": -1 is unset/invalid, zero means our_nodeid, + * greater than zero when another nodeid. + * + * (TODO: remove res_nodeid and only use res_master_nodeid) + */ + +struct dlm_rsb { + struct dlm_ls *res_ls; /* the lockspace */ + struct kref res_ref; + struct mutex res_mutex; + unsigned long res_flags; + int res_length; /* length of rsb name */ + int res_nodeid; + int res_master_nodeid; + int res_dir_nodeid; + int res_id; /* for ls_recover_idr */ + uint32_t res_lvbseq; + uint32_t res_hash; + uint32_t res_bucket; /* rsbtbl */ + unsigned long res_toss_time; + uint32_t res_first_lkid; + struct list_head res_lookup; /* lkbs waiting on first */ + union { + struct list_head res_hashchain; + struct rb_node res_hashnode; /* rsbtbl */ + }; + struct list_head res_grantqueue; + struct list_head res_convertqueue; + struct list_head res_waitqueue; + + struct list_head res_root_list; /* used for recovery */ + struct list_head res_recover_list; /* used for recovery */ + int res_recover_locks_count; + + char *res_lvbptr; + char res_name[DLM_RESNAME_MAXLEN+1]; +}; + +/* dlm_master_lookup() flags */ + +#define DLM_LU_RECOVER_DIR 1 +#define DLM_LU_RECOVER_MASTER 2 + +/* dlm_master_lookup() results */ + +#define DLM_LU_MATCH 1 +#define DLM_LU_ADD 2 + +/* find_rsb() flags */ + +#define R_REQUEST 0x00000001 +#define R_RECEIVE_REQUEST 0x00000002 +#define R_RECEIVE_RECOVER 0x00000004 + +/* rsb_flags */ + +enum rsb_flags { + RSB_MASTER_UNCERTAIN, + RSB_VALNOTVALID, + RSB_VALNOTVALID_PREV, + RSB_NEW_MASTER, + RSB_NEW_MASTER2, + RSB_RECOVER_CONVERT, + RSB_RECOVER_GRANT, + RSB_RECOVER_LVB_INVAL, +}; + +static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __set_bit(flag, &r->res_flags); +} + +static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __clear_bit(flag, &r->res_flags); +} + +static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + return test_bit(flag, &r->res_flags); +} + + +/* dlm_header is first element of all structs sent between nodes */ + +#define DLM_HEADER_MAJOR 0x00030000 +#define DLM_HEADER_MINOR 0x00000001 + +#define DLM_HEADER_SLOTS 0x00000001 + +#define DLM_MSG 1 +#define DLM_RCOM 2 + +struct dlm_header { + uint32_t h_version; + uint32_t h_lockspace; + uint32_t h_nodeid; /* nodeid of sender */ + uint16_t h_length; + uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */ + uint8_t h_pad; +}; + + +#define DLM_MSG_REQUEST 1 +#define DLM_MSG_CONVERT 2 +#define DLM_MSG_UNLOCK 3 +#define DLM_MSG_CANCEL 4 +#define DLM_MSG_REQUEST_REPLY 5 +#define DLM_MSG_CONVERT_REPLY 6 +#define DLM_MSG_UNLOCK_REPLY 7 +#define DLM_MSG_CANCEL_REPLY 8 +#define DLM_MSG_GRANT 9 +#define DLM_MSG_BAST 10 +#define DLM_MSG_LOOKUP 11 +#define DLM_MSG_REMOVE 12 +#define DLM_MSG_LOOKUP_REPLY 13 +#define DLM_MSG_PURGE 14 + +struct dlm_message { + struct dlm_header m_header; + uint32_t m_type; /* DLM_MSG_ */ + uint32_t m_nodeid; + uint32_t m_pid; + uint32_t m_lkid; /* lkid on sender */ + uint32_t m_remid; /* lkid on receiver */ + uint32_t m_parent_lkid; + uint32_t m_parent_remid; + uint32_t m_exflags; + uint32_t m_sbflags; + uint32_t m_flags; + uint32_t m_lvbseq; + uint32_t m_hash; + int m_status; + int m_grmode; + int m_rqmode; + int m_bastmode; + int m_asts; + int m_result; /* 0 or -EXXX */ + char m_extra[0]; /* name or lvb */ +}; + + +#define DLM_RS_NODES 0x00000001 +#define DLM_RS_NODES_ALL 0x00000002 +#define DLM_RS_DIR 0x00000004 +#define DLM_RS_DIR_ALL 0x00000008 +#define DLM_RS_LOCKS 0x00000010 +#define DLM_RS_LOCKS_ALL 0x00000020 +#define DLM_RS_DONE 0x00000040 +#define DLM_RS_DONE_ALL 0x00000080 + +#define DLM_RCOM_STATUS 1 +#define DLM_RCOM_NAMES 2 +#define DLM_RCOM_LOOKUP 3 +#define DLM_RCOM_LOCK 4 +#define DLM_RCOM_STATUS_REPLY 5 +#define DLM_RCOM_NAMES_REPLY 6 +#define DLM_RCOM_LOOKUP_REPLY 7 +#define DLM_RCOM_LOCK_REPLY 8 + +struct dlm_rcom { + struct dlm_header rc_header; + uint32_t rc_type; /* DLM_RCOM_ */ + int rc_result; /* multi-purpose */ + uint64_t rc_id; /* match reply with request */ + uint64_t rc_seq; /* sender's ls_recover_seq */ + uint64_t rc_seq_reply; /* remote ls_recover_seq */ + char rc_buf[0]; +}; + +union dlm_packet { + struct dlm_header header; /* common to other two */ + struct dlm_message message; + struct dlm_rcom rcom; +}; + +#define DLM_RSF_NEED_SLOTS 0x00000001 + +/* RCOM_STATUS data */ +struct rcom_status { + __le32 rs_flags; + __le32 rs_unused1; + __le64 rs_unused2; +}; + +/* RCOM_STATUS_REPLY data */ +struct rcom_config { + __le32 rf_lvblen; + __le32 rf_lsflags; + + /* DLM_HEADER_SLOTS adds: */ + __le32 rf_flags; + __le16 rf_our_slot; + __le16 rf_num_slots; + __le32 rf_generation; + __le32 rf_unused1; + __le64 rf_unused2; +}; + +struct rcom_slot { + __le32 ro_nodeid; + __le16 ro_slot; + __le16 ro_unused1; + __le64 ro_unused2; +}; + +struct rcom_lock { + __le32 rl_ownpid; + __le32 rl_lkid; + __le32 rl_remid; + __le32 rl_parent_lkid; + __le32 rl_parent_remid; + __le32 rl_exflags; + __le32 rl_flags; + __le32 rl_lvbseq; + __le32 rl_result; + int8_t rl_rqmode; + int8_t rl_grmode; + int8_t rl_status; + int8_t rl_asts; + __le16 rl_wait_type; + __le16 rl_namelen; + char rl_name[DLM_RESNAME_MAXLEN]; + char rl_lvb[0]; +}; + +/* + * The max number of resources per rsbtbl bucket that shrink will attempt + * to remove in each iteration. + */ + +#define DLM_REMOVE_NAMES_MAX 8 + +struct dlm_ls { + struct list_head ls_list; /* list of lockspaces */ + dlm_lockspace_t *ls_local_handle; + uint32_t ls_global_id; /* global unique lockspace ID */ + uint32_t ls_generation; + uint32_t ls_exflags; + int ls_lvblen; + int ls_count; /* refcount of processes in + the dlm using this ls */ + int ls_create_count; /* create/release refcount */ + unsigned long ls_flags; /* LSFL_ */ + unsigned long ls_scan_time; + struct kobject ls_kobj; + + struct idr ls_lkbidr; + spinlock_t ls_lkbidr_spin; + + struct dlm_rsbtable *ls_rsbtbl; + uint32_t ls_rsbtbl_size; + + struct mutex ls_waiters_mutex; + struct list_head ls_waiters; /* lkbs needing a reply */ + + struct mutex ls_orphans_mutex; + struct list_head ls_orphans; + + struct mutex ls_timeout_mutex; + struct list_head ls_timeout; + + spinlock_t ls_new_rsb_spin; + int ls_new_rsb_count; + struct list_head ls_new_rsb; /* new rsb structs */ + + spinlock_t ls_remove_spin; + char ls_remove_name[DLM_RESNAME_MAXLEN+1]; + char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; + int ls_remove_len; + int ls_remove_lens[DLM_REMOVE_NAMES_MAX]; + + struct list_head ls_nodes; /* current nodes in ls */ + struct list_head ls_nodes_gone; /* dead node list, recovery */ + int ls_num_nodes; /* number of nodes in ls */ + int ls_low_nodeid; + int ls_total_weight; + int *ls_node_array; + + int ls_slot; + int ls_num_slots; + int ls_slots_size; + struct dlm_slot *ls_slots; + + struct dlm_rsb ls_stub_rsb; /* for returning errors */ + struct dlm_lkb ls_stub_lkb; /* for returning errors */ + struct dlm_message ls_stub_ms; /* for faking a reply */ + + struct dentry *ls_debug_rsb_dentry; /* debugfs */ + struct dentry *ls_debug_waiters_dentry; /* debugfs */ + struct dentry *ls_debug_locks_dentry; /* debugfs */ + struct dentry *ls_debug_all_dentry; /* debugfs */ + struct dentry *ls_debug_toss_dentry; /* debugfs */ + + wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ + int ls_uevent_result; + struct completion ls_members_done; + int ls_members_result; + + struct miscdevice ls_device; + + struct workqueue_struct *ls_callback_wq; + + /* recovery related */ + + struct mutex ls_cb_mutex; + struct list_head ls_cb_delay; /* save for queue_work later */ + struct timer_list ls_timer; + struct task_struct *ls_recoverd_task; + struct mutex ls_recoverd_active; + spinlock_t ls_recover_lock; + unsigned long ls_recover_begin; /* jiffies timestamp */ + uint32_t ls_recover_status; /* DLM_RS_ */ + uint64_t ls_recover_seq; + struct dlm_recover *ls_recover_args; + struct rw_semaphore ls_in_recovery; /* block local requests */ + struct rw_semaphore ls_recv_active; /* block dlm_recv */ + struct list_head ls_requestqueue;/* queue remote requests */ + struct mutex ls_requestqueue_mutex; + struct dlm_rcom *ls_recover_buf; + int ls_recover_nodeid; /* for debugging */ + unsigned int ls_recover_dir_sent_res; /* for log info */ + unsigned int ls_recover_dir_sent_msg; /* for log info */ + unsigned int ls_recover_locks_in; /* for log info */ + uint64_t ls_rcom_seq; + spinlock_t ls_rcom_spin; + struct list_head ls_recover_list; + spinlock_t ls_recover_list_lock; + int ls_recover_list_count; + struct idr ls_recover_idr; + spinlock_t ls_recover_idr_lock; + wait_queue_head_t ls_wait_general; + wait_queue_head_t ls_recover_lock_wait; + struct mutex ls_clear_proc_locks; + + struct list_head ls_root_list; /* root resources */ + struct rw_semaphore ls_root_sem; /* protect root_list */ + + const struct dlm_lockspace_ops *ls_ops; + void *ls_ops_arg; + + int ls_namelen; + char ls_name[1]; +}; + +/* + * LSFL_RECOVER_STOP - dlm_ls_stop() sets this to tell dlm recovery routines + * that they should abort what they're doing so new recovery can be started. + * + * LSFL_RECOVER_DOWN - dlm_ls_stop() sets this to tell dlm_recoverd that it + * should do down_write() on the in_recovery rw_semaphore. (doing down_write + * within dlm_ls_stop causes complaints about the lock acquired/released + * in different contexts.) + * + * LSFL_RECOVER_LOCK - dlm_recoverd holds the in_recovery rw_semaphore. + * It sets this after it is done with down_write() on the in_recovery + * rw_semaphore and clears it after it has released the rw_semaphore. + * + * LSFL_RECOVER_WORK - dlm_ls_start() sets this to tell dlm_recoverd that it + * should begin recovery of the lockspace. + * + * LSFL_RUNNING - set when normal locking activity is enabled. + * dlm_ls_stop() clears this to tell dlm locking routines that they should + * quit what they are doing so recovery can run. dlm_recoverd sets + * this after recovery is finished. + */ + +#define LSFL_RECOVER_STOP 0 +#define LSFL_RECOVER_DOWN 1 +#define LSFL_RECOVER_LOCK 2 +#define LSFL_RECOVER_WORK 3 +#define LSFL_RUNNING 4 + +#define LSFL_RCOM_READY 5 +#define LSFL_RCOM_WAIT 6 +#define LSFL_UEVENT_WAIT 7 +#define LSFL_TIMEWARN 8 +#define LSFL_CB_DELAY 9 +#define LSFL_NODIR 10 + +/* much of this is just saving user space pointers associated with the + lock that we pass back to the user lib with an ast */ + +struct dlm_user_args { + struct dlm_user_proc *proc; /* each process that opens the lockspace + device has private data + (dlm_user_proc) on the struct file, + the process's locks point back to it*/ + struct dlm_lksb lksb; + struct dlm_lksb __user *user_lksb; + void __user *castparam; + void __user *castaddr; + void __user *bastparam; + void __user *bastaddr; + uint64_t xid; +}; + +#define DLM_PROC_FLAGS_CLOSING 1 +#define DLM_PROC_FLAGS_COMPAT 2 + +/* locks list is kept so we can remove all a process's locks when it + exits (or orphan those that are persistent) */ + +struct dlm_user_proc { + dlm_lockspace_t *lockspace; + unsigned long flags; /* DLM_PROC_FLAGS */ + struct list_head asts; + spinlock_t asts_spin; + struct list_head locks; + spinlock_t locks_spin; + struct list_head unlocking; + wait_queue_head_t wait; +}; + +static inline int dlm_locking_stopped(struct dlm_ls *ls) +{ + return !test_bit(LSFL_RUNNING, &ls->ls_flags); +} + +static inline int dlm_recovery_stopped(struct dlm_ls *ls) +{ + return test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); +} + +static inline int dlm_no_directory(struct dlm_ls *ls) +{ + return test_bit(LSFL_NODIR, &ls->ls_flags); +} + +int dlm_netlink_init(void); +void dlm_netlink_exit(void); +void dlm_timeout_warn(struct dlm_lkb *lkb); +int dlm_plock_init(void); +void dlm_plock_exit(void); + +#ifdef CONFIG_DLM_DEBUG +int dlm_register_debugfs(void); +void dlm_unregister_debugfs(void); +int dlm_create_debug_file(struct dlm_ls *ls); +void dlm_delete_debug_file(struct dlm_ls *ls); +#else +static inline int dlm_register_debugfs(void) { return 0; } +static inline void dlm_unregister_debugfs(void) { } +static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } +static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +#endif + +#endif /* __DLM_INTERNAL_DOT_H__ */ + diff --git a/kernel/fs/dlm/lock.c b/kernel/fs/dlm/lock.c new file mode 100644 index 000000000..35502d404 --- /dev/null +++ b/kernel/fs/dlm/lock.c @@ -0,0 +1,6304 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* Central locking logic has four stages: + + dlm_lock() + dlm_unlock() + + request_lock(ls, lkb) + convert_lock(ls, lkb) + unlock_lock(ls, lkb) + cancel_lock(ls, lkb) + + _request_lock(r, lkb) + _convert_lock(r, lkb) + _unlock_lock(r, lkb) + _cancel_lock(r, lkb) + + do_request(r, lkb) + do_convert(r, lkb) + do_unlock(r, lkb) + do_cancel(r, lkb) + + Stage 1 (lock, unlock) is mainly about checking input args and + splitting into one of the four main operations: + + dlm_lock = request_lock + dlm_lock+CONVERT = convert_lock + dlm_unlock = unlock_lock + dlm_unlock+CANCEL = cancel_lock + + Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is + provided to the next stage. + + Stage 3, _xxxx_lock(), determines if the operation is local or remote. + When remote, it calls send_xxxx(), when local it calls do_xxxx(). + + Stage 4, do_xxxx(), is the guts of the operation. It manipulates the + given rsb and lkb and queues callbacks. + + For remote operations, send_xxxx() results in the corresponding do_xxxx() + function being executed on the remote node. The connecting send/receive + calls on local (L) and remote (R) nodes: + + L: send_xxxx() -> R: receive_xxxx() + R: do_xxxx() + L: receive_xxxx_reply() <- R: send_xxxx_reply() +*/ +#include +#include +#include +#include "dlm_internal.h" +#include +#include "memory.h" +#include "lowcomms.h" +#include "requestqueue.h" +#include "util.h" +#include "dir.h" +#include "member.h" +#include "lockspace.h" +#include "ast.h" +#include "lock.h" +#include "rcom.h" +#include "recover.h" +#include "lvb_table.h" +#include "user.h" +#include "config.h" + +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode); +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_remove(struct dlm_rsb *r); +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms); +static int receive_extralen(struct dlm_message *ms); +static void do_purge(struct dlm_ls *ls, int nodeid, int pid); +static void del_timeout(struct dlm_lkb *lkb); +static void toss_rsb(struct kref *kref); + +/* + * Lock compatibilty matrix - thanks Steve + * UN = Unlocked state. Not really a state, used as a flag + * PD = Padding. Used to make the matrix a nice power of two in size + * Other states are the same as the VMS DLM. + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) + */ + +static const int __dlm_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +/* + * This defines the direction of transfer of LVB data. + * Granted mode is the row; requested mode is the column. + * Usage: matrix[grmode+1][rqmode+1] + * 1 = LVB is returned to the caller + * 0 = LVB is written to the resource + * -1 = nothing happens to the LVB + */ + +const int dlm_lvb_operations[8][8] = { + /* UN NL CR CW PR PW EX PD*/ + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */ + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */ + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */ + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */ + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */ + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */ + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */ +}; + +#define modes_compat(gr, rq) \ + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1] + +int dlm_modes_compat(int mode1, int mode2) +{ + return __dlm_compat_matrix[mode1 + 1][mode2 + 1]; +} + +/* + * Compatibility matrix for conversions with QUECVT set. + * Granted mode is the row; requested mode is the column. + * Usage: matrix[grmode+1][rqmode+1] + */ + +static const int __quecvt_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */ + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */ + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */ + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +void dlm_print_lkb(struct dlm_lkb *lkb) +{ + printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x " + "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n", + lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, + lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, + lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid, + (unsigned long long)lkb->lkb_recover_seq); +} + +static void dlm_print_rsb(struct dlm_rsb *r) +{ + printk(KERN_ERR "rsb: nodeid %d master %d dir %d flags %lx first %x " + "rlc %d name %s\n", + r->res_nodeid, r->res_master_nodeid, r->res_dir_nodeid, + r->res_flags, r->res_first_lkid, r->res_recover_locks_count, + r->res_name); +} + +void dlm_dump_rsb(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb; + + dlm_print_rsb(r); + + printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n", + list_empty(&r->res_root_list), list_empty(&r->res_recover_list)); + printk(KERN_ERR "rsb lookup list\n"); + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb grant queue:\n"); + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb convert queue:\n"); + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb wait queue:\n"); + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) + dlm_print_lkb(lkb); +} + +/* Threads cannot use the lockspace while it's being recovered */ + +static inline void dlm_lock_recovery(struct dlm_ls *ls) +{ + down_read(&ls->ls_in_recovery); +} + +void dlm_unlock_recovery(struct dlm_ls *ls) +{ + up_read(&ls->ls_in_recovery); +} + +int dlm_lock_recovery_try(struct dlm_ls *ls) +{ + return down_read_trylock(&ls->ls_in_recovery); +} + +static inline int can_be_queued(struct dlm_lkb *lkb) +{ + return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE); +} + +static inline int force_blocking_asts(struct dlm_lkb *lkb) +{ + return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST); +} + +static inline int is_demoted(struct dlm_lkb *lkb) +{ + return (lkb->lkb_sbflags & DLM_SBF_DEMOTED); +} + +static inline int is_altmode(struct dlm_lkb *lkb) +{ + return (lkb->lkb_sbflags & DLM_SBF_ALTMODE); +} + +static inline int is_granted(struct dlm_lkb *lkb) +{ + return (lkb->lkb_status == DLM_LKSTS_GRANTED); +} + +static inline int is_remote(struct dlm_rsb *r) +{ + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r);); + return !!r->res_nodeid; +} + +static inline int is_process_copy(struct dlm_lkb *lkb) +{ + return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY)); +} + +static inline int is_master_copy(struct dlm_lkb *lkb) +{ + return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0; +} + +static inline int middle_conversion(struct dlm_lkb *lkb) +{ + if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) || + (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW)) + return 1; + return 0; +} + +static inline int down_conversion(struct dlm_lkb *lkb) +{ + return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode); +} + +static inline int is_overlap_unlock(struct dlm_lkb *lkb) +{ + return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK; +} + +static inline int is_overlap_cancel(struct dlm_lkb *lkb) +{ + return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL; +} + +static inline int is_overlap(struct dlm_lkb *lkb) +{ + return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK | + DLM_IFL_OVERLAP_CANCEL)); +} + +static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + if (is_master_copy(lkb)) + return; + + del_timeout(lkb); + + DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); + + /* if the operation was a cancel, then return -DLM_ECANCEL, if a + timeout caused the cancel then return -ETIMEDOUT */ + if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { + lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; + rv = -ETIMEDOUT; + } + + if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { + lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; + rv = -EDEADLK; + } + + dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); +} + +static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + queue_cast(r, lkb, + is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL); +} + +static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) +{ + if (is_master_copy(lkb)) { + send_bast(r, lkb, rqmode); + } else { + dlm_add_cb(lkb, DLM_CB_BAST, rqmode, 0, 0); + } +} + +/* + * Basic operations on rsb's and lkb's + */ + +/* This is only called to add a reference when the code already holds + a valid reference to the rsb, so there's no need for locking. */ + +static inline void hold_rsb(struct dlm_rsb *r) +{ + kref_get(&r->res_ref); +} + +void dlm_hold_rsb(struct dlm_rsb *r) +{ + hold_rsb(r); +} + +/* When all references to the rsb are gone it's transferred to + the tossed list for later disposal. */ + +static void put_rsb(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + uint32_t bucket = r->res_bucket; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + kref_put(&r->res_ref, toss_rsb); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); +} + +void dlm_put_rsb(struct dlm_rsb *r) +{ + put_rsb(r); +} + +static int pre_rsb_struct(struct dlm_ls *ls) +{ + struct dlm_rsb *r1, *r2; + int count = 0; + + spin_lock(&ls->ls_new_rsb_spin); + if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) { + spin_unlock(&ls->ls_new_rsb_spin); + return 0; + } + spin_unlock(&ls->ls_new_rsb_spin); + + r1 = dlm_allocate_rsb(ls); + r2 = dlm_allocate_rsb(ls); + + spin_lock(&ls->ls_new_rsb_spin); + if (r1) { + list_add(&r1->res_hashchain, &ls->ls_new_rsb); + ls->ls_new_rsb_count++; + } + if (r2) { + list_add(&r2->res_hashchain, &ls->ls_new_rsb); + ls->ls_new_rsb_count++; + } + count = ls->ls_new_rsb_count; + spin_unlock(&ls->ls_new_rsb_spin); + + if (!count) + return -ENOMEM; + return 0; +} + +/* If ls->ls_new_rsb is empty, return -EAGAIN, so the caller can + unlock any spinlocks, go back and call pre_rsb_struct again. + Otherwise, take an rsb off the list and return it. */ + +static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, + struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r; + int count; + + spin_lock(&ls->ls_new_rsb_spin); + if (list_empty(&ls->ls_new_rsb)) { + count = ls->ls_new_rsb_count; + spin_unlock(&ls->ls_new_rsb_spin); + log_debug(ls, "find_rsb retry %d %d %s", + count, dlm_config.ci_new_rsb_count, name); + return -EAGAIN; + } + + r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); + list_del(&r->res_hashchain); + /* Convert the empty list_head to a NULL rb_node for tree usage: */ + memset(&r->res_hashnode, 0, sizeof(struct rb_node)); + ls->ls_new_rsb_count--; + spin_unlock(&ls->ls_new_rsb_spin); + + r->res_ls = ls; + r->res_length = len; + memcpy(r->res_name, name, len); + mutex_init(&r->res_mutex); + + INIT_LIST_HEAD(&r->res_lookup); + INIT_LIST_HEAD(&r->res_grantqueue); + INIT_LIST_HEAD(&r->res_convertqueue); + INIT_LIST_HEAD(&r->res_waitqueue); + INIT_LIST_HEAD(&r->res_root_list); + INIT_LIST_HEAD(&r->res_recover_list); + + *r_ret = r; + return 0; +} + +static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) +{ + char maxname[DLM_RESNAME_MAXLEN]; + + memset(maxname, 0, DLM_RESNAME_MAXLEN); + memcpy(maxname, name, nlen); + return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); +} + +int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, + struct dlm_rsb **r_ret) +{ + struct rb_node *node = tree->rb_node; + struct dlm_rsb *r; + int rc; + + while (node) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); + rc = rsb_cmp(r, name, len); + if (rc < 0) + node = node->rb_left; + else if (rc > 0) + node = node->rb_right; + else + goto found; + } + *r_ret = NULL; + return -EBADR; + + found: + *r_ret = r; + return 0; +} + +static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) +{ + struct rb_node **newn = &tree->rb_node; + struct rb_node *parent = NULL; + int rc; + + while (*newn) { + struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb, + res_hashnode); + + parent = *newn; + rc = rsb_cmp(cur, rsb->res_name, rsb->res_length); + if (rc < 0) + newn = &parent->rb_left; + else if (rc > 0) + newn = &parent->rb_right; + else { + log_print("rsb_insert match"); + dlm_dump_rsb(rsb); + dlm_dump_rsb(cur); + return -EEXIST; + } + } + + rb_link_node(&rsb->res_hashnode, parent, newn); + rb_insert_color(&rsb->res_hashnode, tree); + return 0; +} + +/* + * Find rsb in rsbtbl and potentially create/add one + * + * Delaying the release of rsb's has a similar benefit to applications keeping + * NL locks on an rsb, but without the guarantee that the cached master value + * will still be valid when the rsb is reused. Apps aren't always smart enough + * to keep NL locks on an rsb that they may lock again shortly; this can lead + * to excessive master lookups and removals if we don't delay the release. + * + * Searching for an rsb means looking through both the normal list and toss + * list. When found on the toss list the rsb is moved to the normal list with + * ref count of 1; when found on normal list the ref count is incremented. + * + * rsb's on the keep list are being used locally and refcounted. + * rsb's on the toss list are not being used locally, and are not refcounted. + * + * The toss list rsb's were either + * - previously used locally but not any more (were on keep list, then + * moved to toss list when last refcount dropped) + * - created and put on toss list as a directory record for a lookup + * (we are the dir node for the res, but are not using the res right now, + * but some other node is) + * + * The purpose of find_rsb() is to return a refcounted rsb for local use. + * So, if the given rsb is on the toss list, it is moved to the keep list + * before being returned. + * + * toss_rsb() happens when all local usage of the rsb is done, i.e. no + * more refcounts exist, so the rsb is moved from the keep list to the + * toss list. + * + * rsb's on both keep and toss lists are used for doing a name to master + * lookups. rsb's that are in use locally (and being refcounted) are on + * the keep list, rsb's that are not in use locally (not refcounted) and + * only exist for name/master lookups are on the toss list. + * + * rsb's on the toss list who's dir_nodeid is not local can have stale + * name/master mappings. So, remote requests on such rsb's can potentially + * return with an error, which means the mapping is stale and needs to + * be updated with a new lookup. (The idea behind MASTER UNCERTAIN and + * first_lkid is to keep only a single outstanding request on an rsb + * while that rsb has a potentially stale master.) + */ + +static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, + uint32_t hash, uint32_t b, + int dir_nodeid, int from_nodeid, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r = NULL; + int our_nodeid = dlm_our_nodeid(); + int from_local = 0; + int from_other = 0; + int from_dir = 0; + int create = 0; + int error; + + if (flags & R_RECEIVE_REQUEST) { + if (from_nodeid == dir_nodeid) + from_dir = 1; + else + from_other = 1; + } else if (flags & R_REQUEST) { + from_local = 1; + } + + /* + * flags & R_RECEIVE_RECOVER is from dlm_recover_master_copy, so + * from_nodeid has sent us a lock in dlm_recover_locks, believing + * we're the new master. Our local recovery may not have set + * res_master_nodeid to our_nodeid yet, so allow either. Don't + * create the rsb; dlm_recover_process_copy() will handle EBADR + * by resending. + * + * If someone sends us a request, we are the dir node, and we do + * not find the rsb anywhere, then recreate it. This happens if + * someone sends us a request after we have removed/freed an rsb + * from our toss list. (They sent a request instead of lookup + * because they are using an rsb from their toss list.) + */ + + if (from_local || from_dir || + (from_other && (dir_nodeid == our_nodeid))) { + create = 1; + } + + retry: + if (create) { + error = pre_rsb_struct(ls); + if (error < 0) + goto out; + } + + spin_lock(&ls->ls_rsbtbl[b].lock); + + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (error) + goto do_toss; + + /* + * rsb is active, so we can't check master_nodeid without lock_rsb. + */ + + kref_get(&r->res_ref); + error = 0; + goto out_unlock; + + + do_toss: + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (error) + goto do_new; + + /* + * rsb found inactive (master_nodeid may be out of date unless + * we are the dir_nodeid or were the master) No other thread + * is using this rsb because it's on the toss list, so we can + * look at or update res_master_nodeid without lock_rsb. + */ + + if ((r->res_master_nodeid != our_nodeid) && from_other) { + /* our rsb was not master, and another node (not the dir node) + has sent us a request */ + log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s", + from_nodeid, r->res_master_nodeid, dir_nodeid, + r->res_name); + error = -ENOTBLK; + goto out_unlock; + } + + if ((r->res_master_nodeid != our_nodeid) && from_dir) { + /* don't think this should ever happen */ + log_error(ls, "find_rsb toss from_dir %d master %d", + from_nodeid, r->res_master_nodeid); + dlm_print_rsb(r); + /* fix it and go on */ + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = 0; + } + + if (from_local && (r->res_master_nodeid != our_nodeid)) { + /* Because we have held no locks on this rsb, + res_master_nodeid could have become stale. */ + rsb_set_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = 0; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + goto out_unlock; + + + do_new: + /* + * rsb not found + */ + + if (error == -EBADR && !create) + goto out_unlock; + + error = get_rsb_struct(ls, name, len, &r); + if (error == -EAGAIN) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + goto retry; + } + if (error) + goto out_unlock; + + r->res_hash = hash; + r->res_bucket = b; + r->res_dir_nodeid = dir_nodeid; + kref_init(&r->res_ref); + + if (from_dir) { + /* want to see how often this happens */ + log_debug(ls, "find_rsb new from_dir %d recreate %s", + from_nodeid, r->res_name); + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + goto out_add; + } + + if (from_other && (dir_nodeid != our_nodeid)) { + /* should never happen */ + log_error(ls, "find_rsb new from_other %d dir %d our %d %s", + from_nodeid, dir_nodeid, our_nodeid, r->res_name); + dlm_free_rsb(r); + r = NULL; + error = -ENOTBLK; + goto out_unlock; + } + + if (from_other) { + log_debug(ls, "find_rsb new from_other %d dir %d %s", + from_nodeid, dir_nodeid, r->res_name); + } + + if (dir_nodeid == our_nodeid) { + /* When we are the dir nodeid, we can set the master + node immediately */ + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + } else { + /* set_master will send_lookup to dir_nodeid */ + r->res_master_nodeid = 0; + r->res_nodeid = -1; + } + + out_add: + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + out_unlock: + spin_unlock(&ls->ls_rsbtbl[b].lock); + out: + *r_ret = r; + return error; +} + +/* During recovery, other nodes can send us new MSTCPY locks (from + dlm_recover_locks) before we've made ourself master (in + dlm_recover_masters). */ + +static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len, + uint32_t hash, uint32_t b, + int dir_nodeid, int from_nodeid, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r = NULL; + int our_nodeid = dlm_our_nodeid(); + int recover = (flags & R_RECEIVE_RECOVER); + int error; + + retry: + error = pre_rsb_struct(ls); + if (error < 0) + goto out; + + spin_lock(&ls->ls_rsbtbl[b].lock); + + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (error) + goto do_toss; + + /* + * rsb is active, so we can't check master_nodeid without lock_rsb. + */ + + kref_get(&r->res_ref); + goto out_unlock; + + + do_toss: + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (error) + goto do_new; + + /* + * rsb found inactive. No other thread is using this rsb because + * it's on the toss list, so we can look at or update + * res_master_nodeid without lock_rsb. + */ + + if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) { + /* our rsb is not master, and another node has sent us a + request; this should never happen */ + log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d", + from_nodeid, r->res_master_nodeid, dir_nodeid); + dlm_print_rsb(r); + error = -ENOTBLK; + goto out_unlock; + } + + if (!recover && (r->res_master_nodeid != our_nodeid) && + (dir_nodeid == our_nodeid)) { + /* our rsb is not master, and we are dir; may as well fix it; + this should never happen */ + log_error(ls, "find_rsb toss our %d master %d dir %d", + our_nodeid, r->res_master_nodeid, dir_nodeid); + dlm_print_rsb(r); + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + goto out_unlock; + + + do_new: + /* + * rsb not found + */ + + error = get_rsb_struct(ls, name, len, &r); + if (error == -EAGAIN) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + goto retry; + } + if (error) + goto out_unlock; + + r->res_hash = hash; + r->res_bucket = b; + r->res_dir_nodeid = dir_nodeid; + r->res_master_nodeid = dir_nodeid; + r->res_nodeid = (dir_nodeid == our_nodeid) ? 0 : dir_nodeid; + kref_init(&r->res_ref); + + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + out_unlock: + spin_unlock(&ls->ls_rsbtbl[b].lock); + out: + *r_ret = r; + return error; +} + +static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid, + unsigned int flags, struct dlm_rsb **r_ret) +{ + uint32_t hash, b; + int dir_nodeid; + + if (len > DLM_RESNAME_MAXLEN) + return -EINVAL; + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + dir_nodeid = dlm_hash2nodeid(ls, hash); + + if (dlm_no_directory(ls)) + return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid, + from_nodeid, flags, r_ret); + else + return find_rsb_dir(ls, name, len, hash, b, dir_nodeid, + from_nodeid, flags, r_ret); +} + +/* we have received a request and found that res_master_nodeid != our_nodeid, + so we need to return an error or make ourself the master */ + +static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r, + int from_nodeid) +{ + if (dlm_no_directory(ls)) { + log_error(ls, "find_rsb keep from_nodeid %d master %d dir %d", + from_nodeid, r->res_master_nodeid, + r->res_dir_nodeid); + dlm_print_rsb(r); + return -ENOTBLK; + } + + if (from_nodeid != r->res_dir_nodeid) { + /* our rsb is not master, and another node (not the dir node) + has sent us a request. this is much more common when our + master_nodeid is zero, so limit debug to non-zero. */ + + if (r->res_master_nodeid) { + log_debug(ls, "validate master from_other %d master %d " + "dir %d first %x %s", from_nodeid, + r->res_master_nodeid, r->res_dir_nodeid, + r->res_first_lkid, r->res_name); + } + return -ENOTBLK; + } else { + /* our rsb is not master, but the dir nodeid has sent us a + request; this could happen with master 0 / res_nodeid -1 */ + + if (r->res_master_nodeid) { + log_error(ls, "validate master from_dir %d master %d " + "first %x %s", + from_nodeid, r->res_master_nodeid, + r->res_first_lkid, r->res_name); + } + + r->res_master_nodeid = dlm_our_nodeid(); + r->res_nodeid = 0; + return 0; + } +} + +/* + * We're the dir node for this res and another node wants to know the + * master nodeid. During normal operation (non recovery) this is only + * called from receive_lookup(); master lookups when the local node is + * the dir node are done by find_rsb(). + * + * normal operation, we are the dir node for a resource + * . _request_lock + * . set_master + * . send_lookup + * . receive_lookup + * . dlm_master_lookup flags 0 + * + * recover directory, we are rebuilding dir for all resources + * . dlm_recover_directory + * . dlm_rcom_names + * remote node sends back the rsb names it is master of and we are dir of + * . dlm_master_lookup RECOVER_DIR (fix_master 0, from_master 1) + * we either create new rsb setting remote node as master, or find existing + * rsb and set master to be the remote node. + * + * recover masters, we are finding the new master for resources + * . dlm_recover_masters + * . recover_master + * . dlm_send_rcom_lookup + * . receive_rcom_lookup + * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0) + */ + +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, + unsigned int flags, int *r_nodeid, int *result) +{ + struct dlm_rsb *r = NULL; + uint32_t hash, b; + int from_master = (flags & DLM_LU_RECOVER_DIR); + int fix_master = (flags & DLM_LU_RECOVER_MASTER); + int our_nodeid = dlm_our_nodeid(); + int dir_nodeid, error, toss_list = 0; + + if (len > DLM_RESNAME_MAXLEN) + return -EINVAL; + + if (from_nodeid == our_nodeid) { + log_error(ls, "dlm_master_lookup from our_nodeid %d flags %x", + our_nodeid, flags); + return -EINVAL; + } + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + dir_nodeid = dlm_hash2nodeid(ls, hash); + if (dir_nodeid != our_nodeid) { + log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d", + from_nodeid, dir_nodeid, our_nodeid, hash, + ls->ls_num_nodes); + *r_nodeid = -1; + return -EINVAL; + } + + retry: + error = pre_rsb_struct(ls); + if (error < 0) + return error; + + spin_lock(&ls->ls_rsbtbl[b].lock); + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (!error) { + /* because the rsb is active, we need to lock_rsb before + checking/changing re_master_nodeid */ + + hold_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + lock_rsb(r); + goto found; + } + + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (error) + goto not_found; + + /* because the rsb is inactive (on toss list), it's not refcounted + and lock_rsb is not used, but is protected by the rsbtbl lock */ + + toss_list = 1; + found: + if (r->res_dir_nodeid != our_nodeid) { + /* should not happen, but may as well fix it and carry on */ + log_error(ls, "dlm_master_lookup res_dir %d our %d %s", + r->res_dir_nodeid, our_nodeid, r->res_name); + r->res_dir_nodeid = our_nodeid; + } + + if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) { + /* Recovery uses this function to set a new master when + the previous master failed. Setting NEW_MASTER will + force dlm_recover_masters to call recover_master on this + rsb even though the res_nodeid is no longer removed. */ + + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + rsb_set_flag(r, RSB_NEW_MASTER); + + if (toss_list) { + /* I don't think we should ever find it on toss list. */ + log_error(ls, "dlm_master_lookup fix_master on toss"); + dlm_dump_rsb(r); + } + } + + if (from_master && (r->res_master_nodeid != from_nodeid)) { + /* this will happen if from_nodeid became master during + a previous recovery cycle, and we aborted the previous + cycle before recovering this master value */ + + log_limit(ls, "dlm_master_lookup from_master %d " + "master_nodeid %d res_nodeid %d first %x %s", + from_nodeid, r->res_master_nodeid, r->res_nodeid, + r->res_first_lkid, r->res_name); + + if (r->res_master_nodeid == our_nodeid) { + log_error(ls, "from_master %d our_master", from_nodeid); + dlm_dump_rsb(r); + dlm_send_rcom_lookup_dump(r, from_nodeid); + goto out_found; + } + + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + rsb_set_flag(r, RSB_NEW_MASTER); + } + + if (!r->res_master_nodeid) { + /* this will happen if recovery happens while we're looking + up the master for this rsb */ + + log_debug(ls, "dlm_master_lookup master 0 to %d first %x %s", + from_nodeid, r->res_first_lkid, r->res_name); + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + } + + if (!from_master && !fix_master && + (r->res_master_nodeid == from_nodeid)) { + /* this can happen when the master sends remove, the dir node + finds the rsb on the keep list and ignores the remove, + and the former master sends a lookup */ + + log_limit(ls, "dlm_master_lookup from master %d flags %x " + "first %x %s", from_nodeid, flags, + r->res_first_lkid, r->res_name); + } + + out_found: + *r_nodeid = r->res_master_nodeid; + if (result) + *result = DLM_LU_MATCH; + + if (toss_list) { + r->res_toss_time = jiffies; + /* the rsb was inactive (on toss list) */ + spin_unlock(&ls->ls_rsbtbl[b].lock); + } else { + /* the rsb was active */ + unlock_rsb(r); + put_rsb(r); + } + return 0; + + not_found: + error = get_rsb_struct(ls, name, len, &r); + if (error == -EAGAIN) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + goto retry; + } + if (error) + goto out_unlock; + + r->res_hash = hash; + r->res_bucket = b; + r->res_dir_nodeid = our_nodeid; + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + kref_init(&r->res_ref); + r->res_toss_time = jiffies; + + error = rsb_insert(r, &ls->ls_rsbtbl[b].toss); + if (error) { + /* should never happen */ + dlm_free_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + goto retry; + } + + if (result) + *result = DLM_LU_ADD; + *r_nodeid = from_nodeid; + error = 0; + out_unlock: + spin_unlock(&ls->ls_rsbtbl[b].lock); + return error; +} + +static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) +{ + struct rb_node *n; + struct dlm_rsb *r; + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); + if (r->res_hash == hash) + dlm_dump_rsb(r); + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } +} + +void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len) +{ + struct dlm_rsb *r = NULL; + uint32_t hash, b; + int error; + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + spin_lock(&ls->ls_rsbtbl[b].lock); + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (!error) + goto out_dump; + + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (error) + goto out; + out_dump: + dlm_dump_rsb(r); + out: + spin_unlock(&ls->ls_rsbtbl[b].lock); +} + +static void toss_rsb(struct kref *kref) +{ + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); + struct dlm_ls *ls = r->res_ls; + + DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); + kref_init(&r->res_ref); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); + rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); + r->res_toss_time = jiffies; + ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK; + if (r->res_lvbptr) { + dlm_free_lvb(r->res_lvbptr); + r->res_lvbptr = NULL; + } +} + +/* See comment for unhold_lkb */ + +static void unhold_rsb(struct dlm_rsb *r) +{ + int rv; + rv = kref_put(&r->res_ref, toss_rsb); + DLM_ASSERT(!rv, dlm_dump_rsb(r);); +} + +static void kill_rsb(struct kref *kref) +{ + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); + + /* All work is done after the return from kref_put() so we + can release the write_lock before the remove and free. */ + + DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r);); +} + +/* Attaching/detaching lkb's from rsb's is for rsb reference counting. + The rsb must exist as long as any lkb's for it do. */ + +static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + hold_rsb(r); + lkb->lkb_resource = r; +} + +static void detach_lkb(struct dlm_lkb *lkb) +{ + if (lkb->lkb_resource) { + put_rsb(lkb->lkb_resource); + lkb->lkb_resource = NULL; + } +} + +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +{ + struct dlm_lkb *lkb; + int rv; + + lkb = dlm_allocate_lkb(ls); + if (!lkb) + return -ENOMEM; + + lkb->lkb_nodeid = -1; + lkb->lkb_grmode = DLM_LOCK_IV; + kref_init(&lkb->lkb_ref); + INIT_LIST_HEAD(&lkb->lkb_ownqueue); + INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); + INIT_LIST_HEAD(&lkb->lkb_time_list); + INIT_LIST_HEAD(&lkb->lkb_cb_list); + mutex_init(&lkb->lkb_cb_mutex); + INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); + + idr_preload(GFP_NOFS); + spin_lock(&ls->ls_lkbidr_spin); + rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); + if (rv >= 0) + lkb->lkb_id = rv; + spin_unlock(&ls->ls_lkbidr_spin); + idr_preload_end(); + + if (rv < 0) { + log_error(ls, "create_lkb idr error %d", rv); + return rv; + } + + *lkb_ret = lkb; + return 0; +} + +static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) +{ + struct dlm_lkb *lkb; + + spin_lock(&ls->ls_lkbidr_spin); + lkb = idr_find(&ls->ls_lkbidr, lkid); + if (lkb) + kref_get(&lkb->lkb_ref); + spin_unlock(&ls->ls_lkbidr_spin); + + *lkb_ret = lkb; + return lkb ? 0 : -ENOENT; +} + +static void kill_lkb(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + /* All work is done after the return from kref_put() so we + can release the write_lock before the detach_lkb */ + + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); +} + +/* __put_lkb() is used when an lkb may not have an rsb attached to + it so we need to provide the lockspace explicitly */ + +static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + uint32_t lkid = lkb->lkb_id; + + spin_lock(&ls->ls_lkbidr_spin); + if (kref_put(&lkb->lkb_ref, kill_lkb)) { + idr_remove(&ls->ls_lkbidr, lkid); + spin_unlock(&ls->ls_lkbidr_spin); + + detach_lkb(lkb); + + /* for local/process lkbs, lvbptr points to caller's lksb */ + if (lkb->lkb_lvbptr && is_master_copy(lkb)) + dlm_free_lvb(lkb->lkb_lvbptr); + dlm_free_lkb(lkb); + return 1; + } else { + spin_unlock(&ls->ls_lkbidr_spin); + return 0; + } +} + +int dlm_put_lkb(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls; + + DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb);); + DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb);); + + ls = lkb->lkb_resource->res_ls; + return __put_lkb(ls, lkb); +} + +/* This is only called to add a reference when the code already holds + a valid reference to the lkb, so there's no need for locking. */ + +static inline void hold_lkb(struct dlm_lkb *lkb) +{ + kref_get(&lkb->lkb_ref); +} + +/* This is called when we need to remove a reference and are certain + it's not the last ref. e.g. del_lkb is always called between a + find_lkb/put_lkb and is always the inverse of a previous add_lkb. + put_lkb would work fine, but would involve unnecessary locking */ + +static inline void unhold_lkb(struct dlm_lkb *lkb) +{ + int rv; + rv = kref_put(&lkb->lkb_ref, kill_lkb); + DLM_ASSERT(!rv, dlm_print_lkb(lkb);); +} + +static void lkb_add_ordered(struct list_head *new, struct list_head *head, + int mode) +{ + struct dlm_lkb *lkb = NULL; + + list_for_each_entry(lkb, head, lkb_statequeue) + if (lkb->lkb_rqmode < mode) + break; + + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); +} + +/* add/remove lkb to rsb's grant/convert/wait queue */ + +static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status) +{ + kref_get(&lkb->lkb_ref); + + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); + + lkb->lkb_timestamp = ktime_get(); + + lkb->lkb_status = status; + + switch (status) { + case DLM_LKSTS_WAITING: + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_waitqueue); + else + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue); + break; + case DLM_LKSTS_GRANTED: + /* convention says granted locks kept in order of grmode */ + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue, + lkb->lkb_grmode); + break; + case DLM_LKSTS_CONVERT: + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_convertqueue); + else + list_add_tail(&lkb->lkb_statequeue, + &r->res_convertqueue); + break; + default: + DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status);); + } +} + +static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + lkb->lkb_status = 0; + list_del(&lkb->lkb_statequeue); + unhold_lkb(lkb); +} + +static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts) +{ + hold_lkb(lkb); + del_lkb(r, lkb); + add_lkb(r, lkb, sts); + unhold_lkb(lkb); +} + +static int msg_reply_type(int mstype) +{ + switch (mstype) { + case DLM_MSG_REQUEST: + return DLM_MSG_REQUEST_REPLY; + case DLM_MSG_CONVERT: + return DLM_MSG_CONVERT_REPLY; + case DLM_MSG_UNLOCK: + return DLM_MSG_UNLOCK_REPLY; + case DLM_MSG_CANCEL: + return DLM_MSG_CANCEL_REPLY; + case DLM_MSG_LOOKUP: + return DLM_MSG_LOOKUP_REPLY; + } + return -1; +} + +static int nodeid_warned(int nodeid, int num_nodes, int *warned) +{ + int i; + + for (i = 0; i < num_nodes; i++) { + if (!warned[i]) { + warned[i] = nodeid; + return 0; + } + if (warned[i] == nodeid) + return 1; + } + return 0; +} + +void dlm_scan_waiters(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + ktime_t zero = ktime_set(0, 0); + s64 us; + s64 debug_maxus = 0; + u32 debug_scanned = 0; + u32 debug_expired = 0; + int num_nodes = 0; + int *warned = NULL; + + if (!dlm_config.ci_waitwarn_us) + return; + + mutex_lock(&ls->ls_waiters_mutex); + + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (ktime_equal(lkb->lkb_wait_time, zero)) + continue; + + debug_scanned++; + + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); + + if (us < dlm_config.ci_waitwarn_us) + continue; + + lkb->lkb_wait_time = zero; + + debug_expired++; + if (us > debug_maxus) + debug_maxus = us; + + if (!num_nodes) { + num_nodes = ls->ls_num_nodes; + warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL); + } + if (!warned) + continue; + if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) + continue; + + log_error(ls, "waitwarn %x %lld %d us check connection to " + "node %d", lkb->lkb_id, (long long)us, + dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); + } + mutex_unlock(&ls->ls_waiters_mutex); + kfree(warned); + + if (debug_expired) + log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", + debug_scanned, debug_expired, + dlm_config.ci_waitwarn_us, (long long)debug_maxus); +} + +/* add/remove lkb from global waiters list of lkb's waiting for + a reply from a remote node */ + +static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error = 0; + + mutex_lock(&ls->ls_waiters_mutex); + + if (is_overlap_unlock(lkb) || + (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) { + error = -EINVAL; + goto out; + } + + if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) { + switch (mstype) { + case DLM_MSG_UNLOCK: + lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + break; + case DLM_MSG_CANCEL: + lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + break; + default: + error = -EBUSY; + goto out; + } + lkb->lkb_wait_count++; + hold_lkb(lkb); + + log_debug(ls, "addwait %x cur %d overlap %d count %d f %x", + lkb->lkb_id, lkb->lkb_wait_type, mstype, + lkb->lkb_wait_count, lkb->lkb_flags); + goto out; + } + + DLM_ASSERT(!lkb->lkb_wait_count, + dlm_print_lkb(lkb); + printk("wait_count %d\n", lkb->lkb_wait_count);); + + lkb->lkb_wait_count++; + lkb->lkb_wait_type = mstype; + lkb->lkb_wait_time = ktime_get(); + lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ + hold_lkb(lkb); + list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); + out: + if (error) + log_error(ls, "addwait error %x %d flags %x %d %d %s", + lkb->lkb_id, error, lkb->lkb_flags, mstype, + lkb->lkb_wait_type, lkb->lkb_resource->res_name); + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +/* We clear the RESEND flag because we might be taking an lkb off the waiters + list as part of process_requestqueue (e.g. a lookup that has an optimized + request reply on the requestqueue) between dlm_recover_waiters_pre() which + set RESEND and dlm_recover_waiters_post() */ + +static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, + struct dlm_message *ms) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int overlap_done = 0; + + if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) { + log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + overlap_done = 1; + goto out_del; + } + + if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) { + log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + overlap_done = 1; + goto out_del; + } + + /* Cancel state was preemptively cleared by a successful convert, + see next comment, nothing to do. */ + + if ((mstype == DLM_MSG_CANCEL_REPLY) && + (lkb->lkb_wait_type != DLM_MSG_CANCEL)) { + log_debug(ls, "remwait %x cancel_reply wait_type %d", + lkb->lkb_id, lkb->lkb_wait_type); + return -1; + } + + /* Remove for the convert reply, and premptively remove for the + cancel reply. A convert has been granted while there's still + an outstanding cancel on it (the cancel is moot and the result + in the cancel reply should be 0). We preempt the cancel reply + because the app gets the convert result and then can follow up + with another op, like convert. This subsequent op would see the + lingering state of the cancel and fail with -EBUSY. */ + + if ((mstype == DLM_MSG_CONVERT_REPLY) && + (lkb->lkb_wait_type == DLM_MSG_CONVERT) && + is_overlap_cancel(lkb) && ms && !ms->m_result) { + log_debug(ls, "remwait %x convert_reply zap overlap_cancel", + lkb->lkb_id); + lkb->lkb_wait_type = 0; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_wait_count--; + goto out_del; + } + + /* N.B. type of reply may not always correspond to type of original + msg due to lookup->request optimization, verify others? */ + + if (lkb->lkb_wait_type) { + lkb->lkb_wait_type = 0; + goto out_del; + } + + log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait", + lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid, + mstype, lkb->lkb_flags); + return -1; + + out_del: + /* the force-unlock/cancel has completed and we haven't recvd a reply + to the op that was in progress prior to the unlock/cancel; we + give up on any reply to the earlier op. FIXME: not sure when/how + this would happen */ + + if (overlap_done && lkb->lkb_wait_type) { + log_error(ls, "remwait error %x reply %d wait_type %d overlap", + lkb->lkb_id, mstype, lkb->lkb_wait_type); + lkb->lkb_wait_count--; + lkb->lkb_wait_type = 0; + } + + DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb);); + + lkb->lkb_flags &= ~DLM_IFL_RESEND; + lkb->lkb_wait_count--; + if (!lkb->lkb_wait_count) + list_del_init(&lkb->lkb_wait_reply); + unhold_lkb(lkb); + return 0; +} + +static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error; + + mutex_lock(&ls->ls_waiters_mutex); + error = _remove_from_waiters(lkb, mstype, NULL); + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +/* Handles situations where we might be processing a "fake" or "stub" reply in + which we can't try to take waiters_mutex again. */ + +static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error; + + if (ms->m_flags != DLM_IFL_STUB_MS) + mutex_lock(&ls->ls_waiters_mutex); + error = _remove_from_waiters(lkb, ms->m_type, ms); + if (ms->m_flags != DLM_IFL_STUB_MS) + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +/* If there's an rsb for the same resource being removed, ensure + that the remove message is sent before the new lookup message. + It should be rare to need a delay here, but if not, then it may + be worthwhile to add a proper wait mechanism rather than a delay. */ + +static void wait_pending_remove(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + restart: + spin_lock(&ls->ls_remove_spin); + if (ls->ls_remove_len && + !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) { + log_debug(ls, "delay lookup for remove dir %d %s", + r->res_dir_nodeid, r->res_name); + spin_unlock(&ls->ls_remove_spin); + msleep(1); + goto restart; + } + spin_unlock(&ls->ls_remove_spin); +} + +/* + * ls_remove_spin protects ls_remove_name and ls_remove_len which are + * read by other threads in wait_pending_remove. ls_remove_names + * and ls_remove_lens are only used by the scan thread, so they do + * not need protection. + */ + +static void shrink_bucket(struct dlm_ls *ls, int b) +{ + struct rb_node *n, *next; + struct dlm_rsb *r; + char *name; + int our_nodeid = dlm_our_nodeid(); + int remote_count = 0; + int need_shrink = 0; + int i, len, rv; + + memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); + + spin_lock(&ls->ls_rsbtbl[b].lock); + + if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + + for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) { + next = rb_next(n); + r = rb_entry(n, struct dlm_rsb, res_hashnode); + + /* If we're the directory record for this rsb, and + we're not the master of it, then we need to wait + for the master node to send us a dir remove for + before removing the dir record. */ + + if (!dlm_no_directory(ls) && + (r->res_master_nodeid != our_nodeid) && + (dlm_dir_nodeid(r) == our_nodeid)) { + continue; + } + + need_shrink = 1; + + if (!time_after_eq(jiffies, r->res_toss_time + + dlm_config.ci_toss_secs * HZ)) { + continue; + } + + if (!dlm_no_directory(ls) && + (r->res_master_nodeid == our_nodeid) && + (dlm_dir_nodeid(r) != our_nodeid)) { + + /* We're the master of this rsb but we're not + the directory record, so we need to tell the + dir node to remove the dir record. */ + + ls->ls_remove_lens[remote_count] = r->res_length; + memcpy(ls->ls_remove_names[remote_count], r->res_name, + DLM_RESNAME_MAXLEN); + remote_count++; + + if (remote_count >= DLM_REMOVE_NAMES_MAX) + break; + continue; + } + + if (!kref_put(&r->res_ref, kill_rsb)) { + log_error(ls, "tossed rsb in use %s", r->res_name); + continue; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + dlm_free_rsb(r); + } + + if (need_shrink) + ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK; + else + ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK; + spin_unlock(&ls->ls_rsbtbl[b].lock); + + /* + * While searching for rsb's to free, we found some that require + * remote removal. We leave them in place and find them again here + * so there is a very small gap between removing them from the toss + * list and sending the removal. Keeping this gap small is + * important to keep us (the master node) from being out of sync + * with the remote dir node for very long. + * + * From the time the rsb is removed from toss until just after + * send_remove, the rsb name is saved in ls_remove_name. A new + * lookup checks this to ensure that a new lookup message for the + * same resource name is not sent just before the remove message. + */ + + for (i = 0; i < remote_count; i++) { + name = ls->ls_remove_names[i]; + len = ls->ls_remove_lens[i]; + + spin_lock(&ls->ls_rsbtbl[b].lock); + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (rv) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name not toss %s", name); + continue; + } + + if (r->res_master_nodeid != our_nodeid) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name master %d dir %d our %d %s", + r->res_master_nodeid, r->res_dir_nodeid, + our_nodeid, name); + continue; + } + + if (r->res_dir_nodeid == our_nodeid) { + /* should never happen */ + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "remove_name dir %d master %d our %d %s", + r->res_dir_nodeid, r->res_master_nodeid, + our_nodeid, name); + continue; + } + + if (!time_after_eq(jiffies, r->res_toss_time + + dlm_config.ci_toss_secs * HZ)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name toss_time %lu now %lu %s", + r->res_toss_time, jiffies, name); + continue; + } + + if (!kref_put(&r->res_ref, kill_rsb)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "remove_name in use %s", name); + continue; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + + /* block lookup of same name until we've sent remove */ + spin_lock(&ls->ls_remove_spin); + ls->ls_remove_len = len; + memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); + spin_unlock(&ls->ls_remove_spin); + spin_unlock(&ls->ls_rsbtbl[b].lock); + + send_remove(r); + + /* allow lookup of name again */ + spin_lock(&ls->ls_remove_spin); + ls->ls_remove_len = 0; + memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); + spin_unlock(&ls->ls_remove_spin); + + dlm_free_rsb(r); + } +} + +void dlm_scan_rsbs(struct dlm_ls *ls) +{ + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + shrink_bucket(ls, i); + if (dlm_locking_stopped(ls)) + break; + cond_resched(); + } +} + +static void add_timeout(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + + if (is_master_copy(lkb)) + return; + + if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && + !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN; + goto add_it; + } + if (lkb->lkb_exflags & DLM_LKF_TIMEOUT) + goto add_it; + return; + + add_it: + DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); + mutex_lock(&ls->ls_timeout_mutex); + hold_lkb(lkb); + list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); + mutex_unlock(&ls->ls_timeout_mutex); +} + +static void del_timeout(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + + mutex_lock(&ls->ls_timeout_mutex); + if (!list_empty(&lkb->lkb_time_list)) { + list_del_init(&lkb->lkb_time_list); + unhold_lkb(lkb); + } + mutex_unlock(&ls->ls_timeout_mutex); +} + +/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and + lkb_lksb_timeout without lock_rsb? Note: we can't lock timeout_mutex + and then lock rsb because of lock ordering in add_timeout. We may need + to specify some special timeout-related bits in the lkb that are just to + be accessed under the timeout_mutex. */ + +void dlm_scan_timeout(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + struct dlm_lkb *lkb; + int do_cancel, do_warn; + s64 wait_us; + + for (;;) { + if (dlm_locking_stopped(ls)) + break; + + do_cancel = 0; + do_warn = 0; + mutex_lock(&ls->ls_timeout_mutex); + list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { + + wait_us = ktime_to_us(ktime_sub(ktime_get(), + lkb->lkb_timestamp)); + + if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && + wait_us >= (lkb->lkb_timeout_cs * 10000)) + do_cancel = 1; + + if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && + wait_us >= dlm_config.ci_timewarn_cs * 10000) + do_warn = 1; + + if (!do_cancel && !do_warn) + continue; + hold_lkb(lkb); + break; + } + mutex_unlock(&ls->ls_timeout_mutex); + + if (!do_cancel && !do_warn) + break; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + if (do_warn) { + /* clear flag so we only warn once */ + lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; + if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT)) + del_timeout(lkb); + dlm_timeout_warn(lkb); + } + + if (do_cancel) { + log_debug(ls, "timeout cancel %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; + lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL; + del_timeout(lkb); + _cancel_lock(r, lkb); + } + + unlock_rsb(r); + unhold_rsb(r); + dlm_put_lkb(lkb); + } +} + +/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping + dlm_recoverd before checking/setting ls_recover_begin. */ + +void dlm_adjust_timeouts(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin); + + ls->ls_recover_begin = 0; + mutex_lock(&ls->ls_timeout_mutex); + list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) + lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); + mutex_unlock(&ls->ls_timeout_mutex); + + if (!dlm_config.ci_waitwarn_us) + return; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (ktime_to_us(lkb->lkb_wait_time)) + lkb->lkb_wait_time = ktime_get(); + } + mutex_unlock(&ls->ls_waiters_mutex); +} + +/* lkb is master or local copy */ + +static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int b, len = r->res_ls->ls_lvblen; + + /* b=1 lvb returned to caller + b=0 lvb written to rsb or invalidated + b=-1 do nothing */ + + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + + if (b == 1) { + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + return; + + memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len); + lkb->lkb_lvbseq = r->res_lvbseq; + + } else if (b == 0) { + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) { + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + + if (!r->res_lvbptr) + return; + + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len); + r->res_lvbseq++; + lkb->lkb_lvbseq = r->res_lvbseq; + rsb_clear_flag(r, RSB_VALNOTVALID); + } + + if (rsb_flag(r, RSB_VALNOTVALID)) + lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID; +} + +static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + if (lkb->lkb_grmode < DLM_LOCK_PW) + return; + + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) { + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + + if (!r->res_lvbptr) + return; + + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); + r->res_lvbseq++; + rsb_clear_flag(r, RSB_VALNOTVALID); +} + +/* lkb is process copy (pc) */ + +static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + int b; + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + if (b == 1) { + int len = receive_extralen(ms); + if (len > r->res_ls->ls_lvblen) + len = r->res_ls->ls_lvblen; + memcpy(lkb->lkb_lvbptr, ms->m_extra, len); + lkb->lkb_lvbseq = ms->m_lvbseq; + } +} + +/* Manipulate lkb's on rsb's convert/granted/waiting queues + remove_lock -- used for unlock, removes lkb from granted + revert_lock -- used for cancel, moves lkb from convert to granted + grant_lock -- used for request and convert, adds lkb to granted or + moves lkb from convert or waiting to granted + + Each of these is used for master or local copy lkb's. There is + also a _pc() variation used to make the corresponding change on + a process copy (pc) lkb. */ + +static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + del_lkb(r, lkb); + lkb->lkb_grmode = DLM_LOCK_IV; + /* this unhold undoes the original ref from create_lkb() + so this leads to the lkb being freed */ + unhold_lkb(lkb); +} + +static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + set_lvb_unlock(r, lkb); + _remove_lock(r, lkb); +} + +static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + _remove_lock(r, lkb); +} + +/* returns: 0 did nothing + 1 moved lock to granted + -1 removed lock */ + +static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int rv = 0; + + lkb->lkb_rqmode = DLM_LOCK_IV; + + switch (lkb->lkb_status) { + case DLM_LKSTS_GRANTED: + break; + case DLM_LKSTS_CONVERT: + move_lkb(r, lkb, DLM_LKSTS_GRANTED); + rv = 1; + break; + case DLM_LKSTS_WAITING: + del_lkb(r, lkb); + lkb->lkb_grmode = DLM_LOCK_IV; + /* this unhold undoes the original ref from create_lkb() + so this leads to the lkb being freed */ + unhold_lkb(lkb); + rv = -1; + break; + default: + log_print("invalid status for revert %d", lkb->lkb_status); + } + return rv; +} + +static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return revert_lock(r, lkb); +} + +static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + if (lkb->lkb_grmode != lkb->lkb_rqmode) { + lkb->lkb_grmode = lkb->lkb_rqmode; + if (lkb->lkb_status) + move_lkb(r, lkb, DLM_LKSTS_GRANTED); + else + add_lkb(r, lkb, DLM_LKSTS_GRANTED); + } + + lkb->lkb_rqmode = DLM_LOCK_IV; + lkb->lkb_highbast = 0; +} + +static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + set_lvb_lock(r, lkb); + _grant_lock(r, lkb); +} + +static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + set_lvb_lock_pc(r, lkb, ms); + _grant_lock(r, lkb); +} + +/* called by grant_pending_locks() which means an async grant message must + be sent to the requesting node in addition to granting the lock if the + lkb belongs to a remote node. */ + +static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + grant_lock(r, lkb); + if (is_master_copy(lkb)) + send_grant(r, lkb); + else + queue_cast(r, lkb, 0); +} + +/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to + change the granted/requested modes. We're munging things accordingly in + the process copy. + CONVDEADLK: our grmode may have been forced down to NL to resolve a + conversion deadlock + ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become + compatible with other granted locks */ + +static void munge_demoted(struct dlm_lkb *lkb) +{ + if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { + log_print("munge_demoted %x invalid modes gr %d rq %d", + lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); + return; + } + + lkb->lkb_grmode = DLM_LOCK_NL; +} + +static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + if (ms->m_type != DLM_MSG_REQUEST_REPLY && + ms->m_type != DLM_MSG_GRANT) { + log_print("munge_altmode %x invalid reply type %d", + lkb->lkb_id, ms->m_type); + return; + } + + if (lkb->lkb_exflags & DLM_LKF_ALTPR) + lkb->lkb_rqmode = DLM_LOCK_PR; + else if (lkb->lkb_exflags & DLM_LKF_ALTCW) + lkb->lkb_rqmode = DLM_LOCK_CW; + else { + log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags); + dlm_print_lkb(lkb); + } +} + +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) +{ + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, + lkb_statequeue); + if (lkb->lkb_id == first->lkb_id) + return 1; + + return 0; +} + +/* Check if the given lkb conflicts with another lkb on the queue. */ + +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) +{ + struct dlm_lkb *this; + + list_for_each_entry(this, head, lkb_statequeue) { + if (this == lkb) + continue; + if (!modes_compat(this, lkb)) + return 1; + } + return 0; +} + +/* + * "A conversion deadlock arises with a pair of lock requests in the converting + * queue for one resource. The granted mode of each lock blocks the requested + * mode of the other lock." + * + * Part 2: if the granted mode of lkb is preventing an earlier lkb in the + * convert queue from being granted, then deadlk/demote lkb. + * + * Example: + * Granted Queue: empty + * Convert Queue: NL->EX (first lock) + * PR->EX (second lock) + * + * The first lock can't be granted because of the granted mode of the second + * lock and the second lock can't be granted because it's not first in the + * list. We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we + * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK + * flag set and return DEMOTED in the lksb flags. + * + * Originally, this function detected conv-deadlk in a more limited scope: + * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or + * - if lkb1 was the first entry in the queue (not just earlier), and was + * blocked by the granted mode of lkb2, and there was nothing on the + * granted queue preventing lkb1 from being granted immediately, i.e. + * lkb2 was the only thing preventing lkb1 from being granted. + * + * That second condition meant we'd only say there was conv-deadlk if + * resolving it (by demotion) would lead to the first lock on the convert + * queue being granted right away. It allowed conversion deadlocks to exist + * between locks on the convert queue while they couldn't be granted anyway. + * + * Now, we detect and take action on conversion deadlocks immediately when + * they're created, even if they may not be immediately consequential. If + * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted + * mode that would prevent lkb1's conversion from being granted, we do a + * deadlk/demote on lkb2 right away and don't let it onto the convert queue. + * I think this means that the lkb_is_ahead condition below should always + * be zero, i.e. there will never be conv-deadlk between two locks that are + * both already on the convert queue. + */ + +static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2) +{ + struct dlm_lkb *lkb1; + int lkb_is_ahead = 0; + + list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) { + if (lkb1 == lkb2) { + lkb_is_ahead = 1; + continue; + } + + if (!lkb_is_ahead) { + if (!modes_compat(lkb2, lkb1)) + return 1; + } else { + if (!modes_compat(lkb2, lkb1) && + !modes_compat(lkb1, lkb2)) + return 1; + } + } + return 0; +} + +/* + * Return 1 if the lock can be granted, 0 otherwise. + * Also detect and resolve conversion deadlocks. + * + * lkb is the lock to be granted + * + * now is 1 if the function is being called in the context of the + * immediate request, it is 0 if called later, after the lock has been + * queued. + * + * recover is 1 if dlm_recover_grant() is trying to grant conversions + * after recovery. + * + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis + */ + +static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, + int recover) +{ + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); + + /* + * 6-10: Version 5.4 introduced an option to address the phenomenon of + * a new request for a NL mode lock being blocked. + * + * 6-11: If the optional EXPEDITE flag is used with the new NL mode + * request, then it would be granted. In essence, the use of this flag + * tells the Lock Manager to expedite theis request by not considering + * what may be in the CONVERTING or WAITING queues... As of this + * writing, the EXPEDITE flag can be used only with new requests for NL + * mode locks. This flag is not valid for conversion requests. + * + * A shortcut. Earlier checks return an error if EXPEDITE is used in a + * conversion or used with a non-NL requested mode. We also know an + * EXPEDITE request is always granted immediately, so now must always + * be 1. The full condition to grant an expedite request: (now && + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can + * therefore be shortened to just checking the flag. + */ + + if (lkb->lkb_exflags & DLM_LKF_EXPEDITE) + return 1; + + /* + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be + * added to the remaining conditions. + */ + + if (queue_conflict(&r->res_grantqueue, lkb)) + return 0; + + /* + * 6-3: By default, a conversion request is immediately granted if the + * requested mode is compatible with the modes of all other granted + * locks + */ + + if (queue_conflict(&r->res_convertqueue, lkb)) + return 0; + + /* + * The RECOVER_GRANT flag means dlm_recover_grant() is granting + * locks for a recovered rsb, on which lkb's have been rebuilt. + * The lkb's may have been rebuilt on the queues in a different + * order than they were in on the previous master. So, granting + * queued conversions in order after recovery doesn't make sense + * since the order hasn't been preserved anyway. The new order + * could also have created a new "in place" conversion deadlock. + * (e.g. old, failed master held granted EX, with PR->EX, NL->EX. + * After recovery, there would be no granted locks, and possibly + * NL->EX, PR->EX, an in-place conversion deadlock.) So, after + * recovery, grant conversions without considering order. + */ + + if (conv && recover) + return 1; + + /* + * 6-5: But the default algorithm for deciding whether to grant or + * queue conversion requests does not by itself guarantee that such + * requests are serviced on a "first come first serve" basis. This, in + * turn, can lead to a phenomenon known as "indefinate postponement". + * + * 6-7: This issue is dealt with by using the optional QUECVT flag with + * the system service employed to request a lock conversion. This flag + * forces certain conversion requests to be queued, even if they are + * compatible with the granted modes of other locks on the same + * resource. Thus, the use of this flag results in conversion requests + * being ordered on a "first come first servce" basis. + * + * DCT: This condition is all about new conversions being able to occur + * "in place" while the lock remains on the granted queue (assuming + * nothing else conflicts.) IOW if QUECVT isn't set, a conversion + * doesn't _have_ to go onto the convert queue where it's processed in + * order. The "now" variable is necessary to distinguish converts + * being received and processed for the first time now, because once a + * convert is moved to the conversion queue the condition below applies + * requiring fifo granting. + */ + + if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT)) + return 1; + + /* + * Even if the convert is compat with all granted locks, + * QUECVT forces it behind other locks on the convert queue. + */ + + if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) { + if (list_empty(&r->res_convertqueue)) + return 1; + else + return 0; + } + + /* + * The NOORDER flag is set to avoid the standard vms rules on grant + * order. + */ + + if (lkb->lkb_exflags & DLM_LKF_NOORDER) + return 1; + + /* + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be + * granted until all other conversion requests ahead of it are granted + * and/or canceled. + */ + + if (!now && conv && first_in_list(lkb, &r->res_convertqueue)) + return 1; + + /* + * 6-4: By default, a new request is immediately granted only if all + * three of the following conditions are satisfied when the request is + * issued: + * - The queue of ungranted conversion requests for the resource is + * empty. + * - The queue of ungranted new requests for the resource is empty. + * - The mode of the new request is compatible with the most + * restrictive mode of all granted locks on the resource. + */ + + if (now && !conv && list_empty(&r->res_convertqueue) && + list_empty(&r->res_waitqueue)) + return 1; + + /* + * 6-4: Once a lock request is in the queue of ungranted new requests, + * it cannot be granted until the queue of ungranted conversion + * requests is empty, all ungranted new requests ahead of it are + * granted and/or canceled, and it is compatible with the granted mode + * of the most restrictive lock granted on the resource. + */ + + if (!now && !conv && list_empty(&r->res_convertqueue) && + first_in_list(lkb, &r->res_waitqueue)) + return 1; + + return 0; +} + +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, + int recover, int *err) +{ + int rv; + int8_t alt = 0, rqmode = lkb->lkb_rqmode; + int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV); + + if (err) + *err = 0; + + rv = _can_be_granted(r, lkb, now, recover); + if (rv) + goto out; + + /* + * The CONVDEADLK flag is non-standard and tells the dlm to resolve + * conversion deadlocks by demoting grmode to NL, otherwise the dlm + * cancels one of the locks. + */ + + if (is_convert && can_be_queued(lkb) && + conversion_deadlock_detect(r, lkb)) { + if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { + lkb->lkb_grmode = DLM_LOCK_NL; + lkb->lkb_sbflags |= DLM_SBF_DEMOTED; + } else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + if (err) + *err = -EDEADLK; + else { + log_print("can_be_granted deadlock %x now %d", + lkb->lkb_id, now); + dlm_dump_rsb(r); + } + } + goto out; + } + + /* + * The ALTPR and ALTCW flags are non-standard and tell the dlm to try + * to grant a request in a mode other than the normal rqmode. It's a + * simple way to provide a big optimization to applications that can + * use them. + */ + + if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR)) + alt = DLM_LOCK_PR; + else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW)) + alt = DLM_LOCK_CW; + + if (alt) { + lkb->lkb_rqmode = alt; + rv = _can_be_granted(r, lkb, now, 0); + if (rv) + lkb->lkb_sbflags |= DLM_SBF_ALTMODE; + else + lkb->lkb_rqmode = rqmode; + } + out: + return rv; +} + +/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock + for locks pending on the convert list. Once verified (watch for these + log_prints), we should be able to just call _can_be_granted() and not + bother with the demote/deadlk cases here (and there's no easy way to deal + with a deadlk here, we'd have to generate something like grant_lock with + the deadlk error.) */ + +/* Returns the highest requested mode of all blocked conversions; sets + cw if there's a blocked conversion to DLM_LOCK_CW. */ + +static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, + unsigned int *count) +{ + struct dlm_lkb *lkb, *s; + int recover = rsb_flag(r, RSB_RECOVER_GRANT); + int hi, demoted, quit, grant_restart, demote_restart; + int deadlk; + + quit = 0; + restart: + grant_restart = 0; + demote_restart = 0; + hi = DLM_LOCK_IV; + + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { + demoted = is_demoted(lkb); + deadlk = 0; + + if (can_be_granted(r, lkb, 0, recover, &deadlk)) { + grant_lock_pending(r, lkb); + grant_restart = 1; + if (count) + (*count)++; + continue; + } + + if (!demoted && is_demoted(lkb)) { + log_print("WARN: pending demoted %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + demote_restart = 1; + continue; + } + + if (deadlk) { + log_print("WARN: pending deadlock %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + dlm_dump_rsb(r); + continue; + } + + hi = max_t(int, lkb->lkb_rqmode, hi); + + if (cw && lkb->lkb_rqmode == DLM_LOCK_CW) + *cw = 1; + } + + if (grant_restart) + goto restart; + if (demote_restart && !quit) { + quit = 1; + goto restart; + } + + return max_t(int, high, hi); +} + +static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw, + unsigned int *count) +{ + struct dlm_lkb *lkb, *s; + + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { + if (can_be_granted(r, lkb, 0, 0, NULL)) { + grant_lock_pending(r, lkb); + if (count) + (*count)++; + } else { + high = max_t(int, lkb->lkb_rqmode, high); + if (lkb->lkb_rqmode == DLM_LOCK_CW) + *cw = 1; + } + } + + return high; +} + +/* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked + on either the convert or waiting queue. + high is the largest rqmode of all locks blocked on the convert or + waiting queue. */ + +static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw) +{ + if (gr->lkb_grmode == DLM_LOCK_PR && cw) { + if (gr->lkb_highbast < DLM_LOCK_EX) + return 1; + return 0; + } + + if (gr->lkb_highbast < high && + !__dlm_compat_matrix[gr->lkb_grmode+1][high+1]) + return 1; + return 0; +} + +static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count) +{ + struct dlm_lkb *lkb, *s; + int high = DLM_LOCK_IV; + int cw = 0; + + if (!is_master(r)) { + log_print("grant_pending_locks r nodeid %d", r->res_nodeid); + dlm_dump_rsb(r); + return; + } + + high = grant_pending_convert(r, high, &cw, count); + high = grant_pending_wait(r, high, &cw, count); + + if (high == DLM_LOCK_IV) + return; + + /* + * If there are locks left on the wait/convert queue then send blocking + * ASTs to granted locks based on the largest requested mode (high) + * found above. + */ + + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { + if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) { + if (cw && high == DLM_LOCK_PR && + lkb->lkb_grmode == DLM_LOCK_PR) + queue_bast(r, lkb, DLM_LOCK_CW); + else + queue_bast(r, lkb, high); + lkb->lkb_highbast = high; + } + } +} + +static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq) +{ + if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) || + (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) { + if (gr->lkb_highbast < DLM_LOCK_EX) + return 1; + return 0; + } + + if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq)) + return 1; + return 0; +} + +static void send_bast_queue(struct dlm_rsb *r, struct list_head *head, + struct dlm_lkb *lkb) +{ + struct dlm_lkb *gr; + + list_for_each_entry(gr, head, lkb_statequeue) { + /* skip self when sending basts to convertqueue */ + if (gr == lkb) + continue; + if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { + queue_bast(r, gr, lkb->lkb_rqmode); + gr->lkb_highbast = lkb->lkb_rqmode; + } + } +} + +static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + send_bast_queue(r, &r->res_grantqueue, lkb); +} + +static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + send_bast_queue(r, &r->res_grantqueue, lkb); + send_bast_queue(r, &r->res_convertqueue, lkb); +} + +/* set_master(r, lkb) -- set the master nodeid of a resource + + The purpose of this function is to set the nodeid field in the given + lkb using the nodeid field in the given rsb. If the rsb's nodeid is + known, it can just be copied to the lkb and the function will return + 0. If the rsb's nodeid is _not_ known, it needs to be looked up + before it can be copied to the lkb. + + When the rsb nodeid is being looked up remotely, the initial lkb + causing the lookup is kept on the ls_waiters list waiting for the + lookup reply. Other lkb's waiting for the same rsb lookup are kept + on the rsb's res_lookup list until the master is verified. + + Return values: + 0: nodeid is set in rsb/lkb and the caller should go ahead and use it + 1: the rsb master is not available and the lkb has been placed on + a wait queue +*/ + +static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int our_nodeid = dlm_our_nodeid(); + + if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = lkb->lkb_id; + lkb->lkb_nodeid = r->res_nodeid; + return 0; + } + + if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) { + list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup); + return 1; + } + + if (r->res_master_nodeid == our_nodeid) { + lkb->lkb_nodeid = 0; + return 0; + } + + if (r->res_master_nodeid) { + lkb->lkb_nodeid = r->res_master_nodeid; + return 0; + } + + if (dlm_dir_nodeid(r) == our_nodeid) { + /* This is a somewhat unusual case; find_rsb will usually + have set res_master_nodeid when dir nodeid is local, but + there are cases where we become the dir node after we've + past find_rsb and go through _request_lock again. + confirm_master() or process_lookup_list() needs to be + called after this. */ + log_debug(r->res_ls, "set_master %x self master %d dir %d %s", + lkb->lkb_id, r->res_master_nodeid, r->res_dir_nodeid, + r->res_name); + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + lkb->lkb_nodeid = 0; + return 0; + } + + wait_pending_remove(r); + + r->res_first_lkid = lkb->lkb_id; + send_lookup(r, lkb); + return 1; +} + +static void process_lookup_list(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) { + list_del_init(&lkb->lkb_rsb_lookup); + _request_lock(r, lkb); + schedule(); + } +} + +/* confirm_master -- confirm (or deny) an rsb's master nodeid */ + +static void confirm_master(struct dlm_rsb *r, int error) +{ + struct dlm_lkb *lkb; + + if (!r->res_first_lkid) + return; + + switch (error) { + case 0: + case -EINPROGRESS: + r->res_first_lkid = 0; + process_lookup_list(r); + break; + + case -EAGAIN: + case -EBADR: + case -ENOTBLK: + /* the remote request failed and won't be retried (it was + a NOQUEUE, or has been canceled/unlocked); make a waiting + lkb the first_lkid */ + + r->res_first_lkid = 0; + + if (!list_empty(&r->res_lookup)) { + lkb = list_entry(r->res_lookup.next, struct dlm_lkb, + lkb_rsb_lookup); + list_del_init(&lkb->lkb_rsb_lookup); + r->res_first_lkid = lkb->lkb_id; + _request_lock(r, lkb); + } + break; + + default: + log_error(r->res_ls, "confirm_master unknown error %d", error); + } +} + +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, unsigned long timeout_cs, + void (*ast) (void *astparam), + void *astparam, + void (*bast) (void *astparam, int mode), + struct dlm_args *args) +{ + int rv = -EINVAL; + + /* check for invalid arg usage */ + + if (mode < 0 || mode > DLM_LOCK_EX) + goto out; + + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN)) + goto out; + + if (flags & DLM_LKF_CANCEL) + goto out; + + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL) + goto out; + + if (!ast || !lksb) + goto out; + + if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr) + goto out; + + if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid) + goto out; + + /* these args will be copied to the lkb in validate_lock_args, + it cannot be done now because when converting locks, fields in + an active lkb cannot be modified before locking the rsb */ + + args->flags = flags; + args->astfn = ast; + args->astparam = astparam; + args->bastfn = bast; + args->timeout = timeout_cs; + args->mode = mode; + args->lksb = lksb; + rv = 0; + out: + return rv; +} + +static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args) +{ + if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK | + DLM_LKF_FORCEUNLOCK)) + return -EINVAL; + + if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK) + return -EINVAL; + + args->flags = flags; + args->astparam = astarg; + return 0; +} + +static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + int rv = -EINVAL; + + if (args->flags & DLM_LKF_CONVERT) { + if (lkb->lkb_flags & DLM_IFL_MSTCPY) + goto out; + + if (args->flags & DLM_LKF_QUECVT && + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) + goto out; + + rv = -EBUSY; + if (lkb->lkb_status != DLM_LKSTS_GRANTED) + goto out; + + if (lkb->lkb_wait_type) + goto out; + + if (is_overlap(lkb)) + goto out; + } + + lkb->lkb_exflags = args->flags; + lkb->lkb_sbflags = 0; + lkb->lkb_astfn = args->astfn; + lkb->lkb_astparam = args->astparam; + lkb->lkb_bastfn = args->bastfn; + lkb->lkb_rqmode = args->mode; + lkb->lkb_lksb = args->lksb; + lkb->lkb_lvbptr = args->lksb->sb_lvbptr; + lkb->lkb_ownpid = (int) current->pid; + lkb->lkb_timeout_cs = args->timeout; + rv = 0; + out: + if (rv) + log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s", + rv, lkb->lkb_id, lkb->lkb_flags, args->flags, + lkb->lkb_status, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + return rv; +} + +/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0 + for success */ + +/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here + because there may be a lookup in progress and it's valid to do + cancel/unlockf on it */ + +static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int rv = -EINVAL; + + if (lkb->lkb_flags & DLM_IFL_MSTCPY) { + log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); + dlm_print_lkb(lkb); + goto out; + } + + /* an lkb may still exist even though the lock is EOL'ed due to a + cancel, unlock or failed noqueue request; an app can't use these + locks; return same error as if the lkid had not been found at all */ + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); + rv = -ENOENT; + goto out; + } + + /* an lkb may be waiting for an rsb lookup to complete where the + lookup was initiated by another lock */ + + if (!list_empty(&lkb->lkb_rsb_lookup)) { + if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { + log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); + list_del_init(&lkb->lkb_rsb_lookup); + queue_cast(lkb->lkb_resource, lkb, + args->flags & DLM_LKF_CANCEL ? + -DLM_ECANCEL : -DLM_EUNLOCK); + unhold_lkb(lkb); /* undoes create_lkb() */ + } + /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ + rv = -EBUSY; + goto out; + } + + /* cancel not allowed with another cancel/unlock in progress */ + + if (args->flags & DLM_LKF_CANCEL) { + if (lkb->lkb_exflags & DLM_LKF_CANCEL) + goto out; + + if (is_overlap(lkb)) + goto out; + + /* don't let scand try to do a cancel */ + del_timeout(lkb); + + if (lkb->lkb_flags & DLM_IFL_RESEND) { + lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + rv = -EBUSY; + goto out; + } + + /* there's nothing to cancel */ + if (lkb->lkb_status == DLM_LKSTS_GRANTED && + !lkb->lkb_wait_type) { + rv = -EBUSY; + goto out; + } + + switch (lkb->lkb_wait_type) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + rv = -EBUSY; + goto out; + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + goto out; + } + /* add_to_waiters() will set OVERLAP_CANCEL */ + goto out_ok; + } + + /* do we need to allow a force-unlock if there's a normal unlock + already in progress? in what conditions could the normal unlock + fail such that we'd want to send a force-unlock to be sure? */ + + if (args->flags & DLM_LKF_FORCEUNLOCK) { + if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK) + goto out; + + if (is_overlap_unlock(lkb)) + goto out; + + /* don't let scand try to do a cancel */ + del_timeout(lkb); + + if (lkb->lkb_flags & DLM_IFL_RESEND) { + lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + rv = -EBUSY; + goto out; + } + + switch (lkb->lkb_wait_type) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + rv = -EBUSY; + goto out; + case DLM_MSG_UNLOCK: + goto out; + } + /* add_to_waiters() will set OVERLAP_UNLOCK */ + goto out_ok; + } + + /* normal unlock not allowed if there's any op in progress */ + rv = -EBUSY; + if (lkb->lkb_wait_type || lkb->lkb_wait_count) + goto out; + + out_ok: + /* an overlapping op shouldn't blow away exflags from other op */ + lkb->lkb_exflags |= args->flags; + lkb->lkb_sbflags = 0; + lkb->lkb_astparam = args->astparam; + rv = 0; + out: + if (rv) + log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv, + lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, + args->flags, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + return rv; +} + +/* + * Four stage 4 varieties: + * do_request(), do_convert(), do_unlock(), do_cancel() + * These are called on the master node for the given lock and + * from the central locking logic. + */ + +static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error = 0; + + if (can_be_granted(r, lkb, 1, 0, NULL)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + add_lkb(r, lkb, DLM_LKSTS_WAITING); + add_timeout(lkb); + goto out; + } + + error = -EAGAIN; + queue_cast(r, lkb, -EAGAIN); + out: + return error; +} + +static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + +static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error = 0; + int deadlk = 0; + + /* changing an existing lock may allow others to be granted */ + + if (can_be_granted(r, lkb, 1, 0, &deadlk)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + + /* can_be_granted() detected that this lock would block in a conversion + deadlock, so we leave it on the granted queue and return EDEADLK in + the ast for the convert. */ + + if (deadlk) { + /* it's left on the granted queue */ + revert_lock(r, lkb); + queue_cast(r, lkb, -EDEADLK); + error = -EDEADLK; + goto out; + } + + /* is_demoted() means the can_be_granted() above set the grmode + to NL, and left us on the granted queue. This auto-demotion + (due to CONVDEADLK) might mean other locks, and/or this lock, are + now grantable. We have to try to grant other converting locks + before we try again to grant this one. */ + + if (is_demoted(lkb)) { + grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL); + if (_can_be_granted(r, lkb, 1, 0)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + /* else fall through and move to convert queue */ + } + + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); + add_timeout(lkb); + goto out; + } + + error = -EAGAIN; + queue_cast(r, lkb, -EAGAIN); + out: + return error; +} + +static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case 0: + grant_pending_locks(r, NULL); + /* grant_pending_locks also sends basts */ + break; + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + +static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + remove_lock(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); + return -DLM_EUNLOCK; +} + +static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + grant_pending_locks(r, NULL); +} + +/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ + +static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + error = revert_lock(r, lkb); + if (error) { + queue_cast(r, lkb, -DLM_ECANCEL); + return -DLM_ECANCEL; + } + return 0; +} + +static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + if (error) + grant_pending_locks(r, NULL); +} + +/* + * Four stage 3 varieties: + * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() + */ + +/* add a new lkb to a possibly new rsb, called by requesting process */ + +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + /* set_master: sets lkb nodeid from r */ + + error = set_master(r, lkb); + if (error < 0) + goto out; + if (error) { + error = 0; + goto out; + } + + if (is_remote(r)) { + /* receive_request() calls do_request() on remote node */ + error = send_request(r, lkb); + } else { + error = do_request(r, lkb); + /* for remote locks the request_reply is sent + between do_request and do_request_effects */ + do_request_effects(r, lkb, error); + } + out: + return error; +} + +/* change some property of an existing lkb, e.g. mode */ + +static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_convert() calls do_convert() on remote node */ + error = send_convert(r, lkb); + } else { + error = do_convert(r, lkb); + /* for remote locks the convert_reply is sent + between do_convert and do_convert_effects */ + do_convert_effects(r, lkb, error); + } + + return error; +} + +/* remove an existing lkb from the granted queue */ + +static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_unlock() calls do_unlock() on remote node */ + error = send_unlock(r, lkb); + } else { + error = do_unlock(r, lkb); + /* for remote locks the unlock_reply is sent + between do_unlock and do_unlock_effects */ + do_unlock_effects(r, lkb, error); + } + + return error; +} + +/* remove an existing lkb from the convert or wait queue */ + +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_cancel() calls do_cancel() on remote node */ + error = send_cancel(r, lkb); + } else { + error = do_cancel(r, lkb); + /* for remote locks the cancel_reply is sent + between do_cancel and do_cancel_effects */ + do_cancel_effects(r, lkb, error); + } + + return error; +} + +/* + * Four stage 2 varieties: + * request_lock(), convert_lock(), unlock_lock(), cancel_lock() + */ + +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name, + int len, struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + error = validate_lock_args(ls, lkb, args); + if (error) + return error; + + error = find_rsb(ls, name, len, 0, R_REQUEST, &r); + if (error) + return error; + + lock_rsb(r); + + attach_lkb(r, lkb); + lkb->lkb_lksb->sb_lkid = lkb->lkb_id; + + error = _request_lock(r, lkb); + + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_lock_args(ls, lkb, args); + if (error) + goto out; + + error = _convert_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, args); + if (error) + goto out; + + error = _unlock_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, args); + if (error) + goto out; + + error = _cancel_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +/* + * Two stage 1 varieties: dlm_lock() and dlm_unlock() + */ + +int dlm_lock(dlm_lockspace_t *lockspace, + int mode, + struct dlm_lksb *lksb, + uint32_t flags, + void *name, + unsigned int namelen, + uint32_t parent_lkid, + void (*ast) (void *astarg), + void *astarg, + void (*bast) (void *astarg, int mode)) +{ + struct dlm_ls *ls; + struct dlm_lkb *lkb; + struct dlm_args args; + int error, convert = flags & DLM_LKF_CONVERT; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + dlm_lock_recovery(ls); + + if (convert) + error = find_lkb(ls, lksb->sb_lkid, &lkb); + else + error = create_lkb(ls, &lkb); + + if (error) + goto out; + + error = set_lock_args(mode, lksb, flags, namelen, 0, ast, + astarg, bast, &args); + if (error) + goto out_put; + + if (convert) + error = convert_lock(ls, lkb, &args); + else + error = request_lock(ls, lkb, name, namelen, &args); + + if (error == -EINPROGRESS) + error = 0; + out_put: + if (convert || error) + __put_lkb(ls, lkb); + if (error == -EAGAIN || error == -EDEADLK) + error = 0; + out: + dlm_unlock_recovery(ls); + dlm_put_lockspace(ls); + return error; +} + +int dlm_unlock(dlm_lockspace_t *lockspace, + uint32_t lkid, + uint32_t flags, + struct dlm_lksb *lksb, + void *astarg) +{ + struct dlm_ls *ls; + struct dlm_lkb *lkb; + struct dlm_args args; + int error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + error = set_unlock_args(flags, astarg, &args); + if (error) + goto out_put; + + if (flags & DLM_LKF_CANCEL) + error = cancel_lock(ls, lkb, &args); + else + error = unlock_lock(ls, lkb, &args); + + if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL) + error = 0; + if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK))) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + dlm_put_lockspace(ls); + return error; +} + +/* + * send/receive routines for remote operations and replies + * + * send_args + * send_common + * send_request receive_request + * send_convert receive_convert + * send_unlock receive_unlock + * send_cancel receive_cancel + * send_grant receive_grant + * send_bast receive_bast + * send_lookup receive_lookup + * send_remove receive_remove + * + * send_common_reply + * receive_request_reply send_request_reply + * receive_convert_reply send_convert_reply + * receive_unlock_reply send_unlock_reply + * receive_cancel_reply send_cancel_reply + * receive_lookup_reply send_lookup_reply + */ + +static int _create_message(struct dlm_ls *ls, int mb_len, + int to_nodeid, int mstype, + struct dlm_message **ms_ret, + struct dlm_mhandle **mh_ret) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + char *mb; + + /* get_buffer gives us a message handle (mh) that we need to + pass into lowcomms_commit and a message buffer (mb) that we + write our data into */ + + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) + return -ENOBUFS; + + memset(mb, 0, mb_len); + + ms = (struct dlm_message *) mb; + + ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + ms->m_header.h_lockspace = ls->ls_global_id; + ms->m_header.h_nodeid = dlm_our_nodeid(); + ms->m_header.h_length = mb_len; + ms->m_header.h_cmd = DLM_MSG; + + ms->m_type = mstype; + + *mh_ret = mh; + *ms_ret = ms; + return 0; +} + +static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, + int to_nodeid, int mstype, + struct dlm_message **ms_ret, + struct dlm_mhandle **mh_ret) +{ + int mb_len = sizeof(struct dlm_message); + + switch (mstype) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + case DLM_MSG_REMOVE: + mb_len += r->res_length; + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (lkb && lkb->lkb_lvbptr) + mb_len += r->res_ls->ls_lvblen; + break; + } + + return _create_message(r->res_ls, mb_len, to_nodeid, mstype, + ms_ret, mh_ret); +} + +/* further lowcomms enhancements or alternate implementations may make + the return value from this function useful at some point */ + +static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) +{ + dlm_message_out(ms); + dlm_lowcomms_commit_buffer(mh); + return 0; +} + +static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + ms->m_nodeid = lkb->lkb_nodeid; + ms->m_pid = lkb->lkb_ownpid; + ms->m_lkid = lkb->lkb_id; + ms->m_remid = lkb->lkb_remid; + ms->m_exflags = lkb->lkb_exflags; + ms->m_sbflags = lkb->lkb_sbflags; + ms->m_flags = lkb->lkb_flags; + ms->m_lvbseq = lkb->lkb_lvbseq; + ms->m_status = lkb->lkb_status; + ms->m_grmode = lkb->lkb_grmode; + ms->m_rqmode = lkb->lkb_rqmode; + ms->m_hash = r->res_hash; + + /* m_result and m_bastmode are set from function args, + not from lkb fields */ + + if (lkb->lkb_bastfn) + ms->m_asts |= DLM_CB_BAST; + if (lkb->lkb_astfn) + ms->m_asts |= DLM_CB_CAST; + + /* compare with switch in create_message; send_remove() doesn't + use send_args() */ + + switch (ms->m_type) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + memcpy(ms->m_extra, r->res_name, r->res_length); + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (!lkb->lkb_lvbptr) + break; + memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); + break; + } +} + +static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = r->res_nodeid; + + error = add_to_waiters(lkb, mstype, to_nodeid); + if (error) + return error; + + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + if (error) + goto fail; + + send_args(r, lkb, ms); + + error = send_message(mh, ms); + if (error) + goto fail; + return 0; + + fail: + remove_from_waiters(lkb, msg_reply_type(mstype)); + return error; +} + +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_REQUEST); +} + +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + error = send_common(r, lkb, DLM_MSG_CONVERT); + + /* down conversions go without a reply from the master */ + if (!error && down_conversion(lkb)) { + remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); + r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS; + r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; + r->res_ls->ls_stub_ms.m_result = 0; + __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); + } + + return error; +} + +/* FIXME: if this lkb is the only lock we hold on the rsb, then set + MASTER_UNCERTAIN to force the next request on the rsb to confirm + that the master is still correct. */ + +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_UNLOCK); +} + +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_CANCEL); +} + +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_result = 0; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_bastmode = mode; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = dlm_dir_nodeid(r); + + error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); + if (error) + return error; + + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); + if (error) + goto fail; + + send_args(r, lkb, ms); + + error = send_message(mh, ms); + if (error) + goto fail; + return 0; + + fail: + remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY); + return error; +} + +static int send_remove(struct dlm_rsb *r) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = dlm_dir_nodeid(r); + + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh); + if (error) + goto out; + + memcpy(ms->m_extra, r->res_name, r->res_length); + ms->m_hash = r->res_hash; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + int mstype, int rv) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_result = rv; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv); +} + +static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv); +} + +static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv); +} + +static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv); +} + +static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, + int ret_nodeid, int rv) +{ + struct dlm_rsb *r = &ls->ls_stub_rsb; + struct dlm_message *ms; + struct dlm_mhandle *mh; + int error, nodeid = ms_in->m_header.h_nodeid; + + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh); + if (error) + goto out; + + ms->m_lkid = ms_in->m_lkid; + ms->m_result = rv; + ms->m_nodeid = ret_nodeid; + + error = send_message(mh, ms); + out: + return error; +} + +/* which args we save from a received message depends heavily on the type + of message, unlike the send side where we can safely send everything about + the lkb for any type of message */ + +static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + lkb->lkb_exflags = ms->m_exflags; + lkb->lkb_sbflags = ms->m_sbflags; + lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | + (ms->m_flags & 0x0000FFFF); +} + +static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + if (ms->m_flags == DLM_IFL_STUB_MS) + return; + + lkb->lkb_sbflags = ms->m_sbflags; + lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | + (ms->m_flags & 0x0000FFFF); +} + +static int receive_extralen(struct dlm_message *ms) +{ + return (ms->m_header.h_length - sizeof(struct dlm_message)); +} + +static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + int len; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + if (!lkb->lkb_lvbptr) + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + len = receive_extralen(ms); + if (len > ls->ls_lvblen) + len = ls->ls_lvblen; + memcpy(lkb->lkb_lvbptr, ms->m_extra, len); + } + return 0; +} + +static void fake_bastfn(void *astparam, int mode) +{ + log_print("fake_bastfn should not be called"); +} + +static void fake_astfn(void *astparam) +{ + log_print("fake_astfn should not be called"); +} + +static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + lkb->lkb_nodeid = ms->m_header.h_nodeid; + lkb->lkb_ownpid = ms->m_pid; + lkb->lkb_remid = ms->m_lkid; + lkb->lkb_grmode = DLM_LOCK_IV; + lkb->lkb_rqmode = ms->m_rqmode; + + lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + /* lkb was just created so there won't be an lvb yet */ + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + } + + return 0; +} + +static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + if (lkb->lkb_status != DLM_LKSTS_GRANTED) + return -EBUSY; + + if (receive_lvb(ls, lkb, ms)) + return -ENOMEM; + + lkb->lkb_rqmode = ms->m_rqmode; + lkb->lkb_lvbseq = ms->m_lvbseq; + + return 0; +} + +static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + if (receive_lvb(ls, lkb, ms)) + return -ENOMEM; + return 0; +} + +/* We fill in the stub-lkb fields with the info that send_xxxx_reply() + uses to send a reply and that the remote end uses to process the reply. */ + +static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb = &ls->ls_stub_lkb; + lkb->lkb_nodeid = ms->m_header.h_nodeid; + lkb->lkb_remid = ms->m_lkid; +} + +/* This is called after the rsb is locked so that we can safely inspect + fields in the lkb. */ + +static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + int from = ms->m_header.h_nodeid; + int error = 0; + + switch (ms->m_type) { + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_UNLOCK_REPLY: + case DLM_MSG_CANCEL_REPLY: + case DLM_MSG_GRANT: + case DLM_MSG_BAST: + if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case DLM_MSG_REQUEST_REPLY: + if (!is_process_copy(lkb)) + error = -EINVAL; + else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + default: + error = -EINVAL; + } + + if (error) + log_error(lkb->lkb_resource->res_ls, + "ignore invalid message %d from %d %x %x %x %d", + ms->m_type, from, lkb->lkb_id, lkb->lkb_remid, + lkb->lkb_flags, lkb->lkb_nodeid); + return error; +} + +static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) +{ + char name[DLM_RESNAME_MAXLEN + 1]; + struct dlm_message *ms; + struct dlm_mhandle *mh; + struct dlm_rsb *r; + uint32_t hash, b; + int rv, dir_nodeid; + + memset(name, 0, sizeof(name)); + memcpy(name, ms_name, len); + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + dir_nodeid = dlm_hash2nodeid(ls, hash); + + log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name); + + spin_lock(&ls->ls_rsbtbl[b].lock); + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (!rv) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "repeat_remove on keep %s", name); + return; + } + + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (!rv) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "repeat_remove on toss %s", name); + return; + } + + /* use ls->remove_name2 to avoid conflict with shrink? */ + + spin_lock(&ls->ls_remove_spin); + ls->ls_remove_len = len; + memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); + spin_unlock(&ls->ls_remove_spin); + spin_unlock(&ls->ls_rsbtbl[b].lock); + + rv = _create_message(ls, sizeof(struct dlm_message) + len, + dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); + if (rv) + return; + + memcpy(ms->m_extra, name, len); + ms->m_hash = hash; + + send_message(mh, ms); + + spin_lock(&ls->ls_remove_spin); + ls->ls_remove_len = 0; + memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); + spin_unlock(&ls->ls_remove_spin); +} + +static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int from_nodeid; + int error, namelen = 0; + + from_nodeid = ms->m_header.h_nodeid; + + error = create_lkb(ls, &lkb); + if (error) + goto fail; + + receive_flags(lkb, ms); + lkb->lkb_flags |= DLM_IFL_MSTCPY; + error = receive_request_args(ls, lkb, ms); + if (error) { + __put_lkb(ls, lkb); + goto fail; + } + + /* The dir node is the authority on whether we are the master + for this rsb or not, so if the master sends us a request, we should + recreate the rsb if we've destroyed it. This race happens when we + send a remove message to the dir node at the same time that the dir + node sends us a request for the rsb. */ + + namelen = receive_extralen(ms); + + error = find_rsb(ls, ms->m_extra, namelen, from_nodeid, + R_RECEIVE_REQUEST, &r); + if (error) { + __put_lkb(ls, lkb); + goto fail; + } + + lock_rsb(r); + + if (r->res_master_nodeid != dlm_our_nodeid()) { + error = validate_master_nodeid(ls, r, from_nodeid); + if (error) { + unlock_rsb(r); + put_rsb(r); + __put_lkb(ls, lkb); + goto fail; + } + } + + attach_lkb(r, lkb); + error = do_request(r, lkb); + send_request_reply(r, lkb, error); + do_request_effects(r, lkb, error); + + unlock_rsb(r); + put_rsb(r); + + if (error == -EINPROGRESS) + error = 0; + if (error) + dlm_put_lkb(lkb); + return 0; + + fail: + /* TODO: instead of returning ENOTBLK, add the lkb to res_lookup + and do this receive_request again from process_lookup_list once + we get the lookup reply. This would avoid a many repeated + ENOTBLK request failures when the lookup reply designating us + as master is delayed. */ + + /* We could repeatedly return -EBADR here if our send_remove() is + delayed in being sent/arriving/being processed on the dir node. + Another node would repeatedly lookup up the master, and the dir + node would continue returning our nodeid until our send_remove + took effect. + + We send another remove message in case our previous send_remove + was lost/ignored/missed somehow. */ + + if (error != -ENOTBLK) { + log_limit(ls, "receive_request %x from %d %d", + ms->m_lkid, from_nodeid, error); + } + + if (namelen && error == -EBADR) { + send_repeat_remove(ls, ms->m_extra, namelen); + msleep(1000); + } + + setup_stub_lkb(ls, ms); + send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; +} + +static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, reply = 1; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + goto fail; + + if (lkb->lkb_remid != ms->m_lkid) { + log_error(ls, "receive_convert %x remid %x recover_seq %llu " + "remote %d %x", lkb->lkb_id, lkb->lkb_remid, + (unsigned long long)lkb->lkb_recover_seq, + ms->m_header.h_nodeid, ms->m_lkid); + error = -ENOENT; + goto fail; + } + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags(lkb, ms); + + error = receive_convert_args(ls, lkb, ms); + if (error) { + send_convert_reply(r, lkb, error); + goto out; + } + + reply = !down_conversion(lkb); + + error = do_convert(r, lkb); + if (reply) + send_convert_reply(r, lkb, error); + do_convert_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; + + fail: + setup_stub_lkb(ls, ms); + send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; +} + +static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + goto fail; + + if (lkb->lkb_remid != ms->m_lkid) { + log_error(ls, "receive_unlock %x remid %x remote %d %x", + lkb->lkb_id, lkb->lkb_remid, + ms->m_header.h_nodeid, ms->m_lkid); + error = -ENOENT; + goto fail; + } + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags(lkb, ms); + + error = receive_unlock_args(ls, lkb, ms); + if (error) { + send_unlock_reply(r, lkb, error); + goto out; + } + + error = do_unlock(r, lkb); + send_unlock_reply(r, lkb, error); + do_unlock_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; + + fail: + setup_stub_lkb(ls, ms); + send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; +} + +static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + goto fail; + + receive_flags(lkb, ms); + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + error = do_cancel(r, lkb); + send_cancel_reply(r, lkb, error); + do_cancel_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; + + fail: + setup_stub_lkb(ls, ms); + send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; +} + +static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + return error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags_reply(lkb, ms); + if (is_altmode(lkb)) + munge_altmode(lkb, ms); + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; +} + +static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + return error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + queue_bast(r, lkb, ms->m_bastmode); + lkb->lkb_highbast = ms->m_bastmode; + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; +} + +static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) +{ + int len, error, ret_nodeid, from_nodeid, our_nodeid; + + from_nodeid = ms->m_header.h_nodeid; + our_nodeid = dlm_our_nodeid(); + + len = receive_extralen(ms); + + error = dlm_master_lookup(ls, from_nodeid, ms->m_extra, len, 0, + &ret_nodeid, NULL); + + /* Optimization: we're master so treat lookup as a request */ + if (!error && ret_nodeid == our_nodeid) { + receive_request(ls, ms); + return; + } + send_lookup_reply(ls, ms, ret_nodeid, error); +} + +static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) +{ + char name[DLM_RESNAME_MAXLEN+1]; + struct dlm_rsb *r; + uint32_t hash, b; + int rv, len, dir_nodeid, from_nodeid; + + from_nodeid = ms->m_header.h_nodeid; + + len = receive_extralen(ms); + + if (len > DLM_RESNAME_MAXLEN) { + log_error(ls, "receive_remove from %d bad len %d", + from_nodeid, len); + return; + } + + dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash); + if (dir_nodeid != dlm_our_nodeid()) { + log_error(ls, "receive_remove from %d bad nodeid %d", + from_nodeid, dir_nodeid); + return; + } + + /* Look for name on rsbtbl.toss, if it's there, kill it. + If it's on rsbtbl.keep, it's being used, and we should ignore this + message. This is an expected race between the dir node sending a + request to the master node at the same time as the master node sends + a remove to the dir node. The resolution to that race is for the + dir node to ignore the remove message, and the master node to + recreate the master rsb when it gets a request from the dir node for + an rsb it doesn't have. */ + + memset(name, 0, sizeof(name)); + memcpy(name, ms->m_extra, len); + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + spin_lock(&ls->ls_rsbtbl[b].lock); + + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (rv) { + /* verify the rsb is on keep list per comment above */ + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (rv) { + /* should not happen */ + log_error(ls, "receive_remove from %d not found %s", + from_nodeid, name); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + if (r->res_master_nodeid != from_nodeid) { + /* should not happen */ + log_error(ls, "receive_remove keep from %d master %d", + from_nodeid, r->res_master_nodeid); + dlm_print_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + + log_debug(ls, "receive_remove from %d master %d first %x %s", + from_nodeid, r->res_master_nodeid, r->res_first_lkid, + name); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + + if (r->res_master_nodeid != from_nodeid) { + log_error(ls, "receive_remove toss from %d master %d", + from_nodeid, r->res_master_nodeid); + dlm_print_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + + if (kref_put(&r->res_ref, kill_rsb)) { + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + spin_unlock(&ls->ls_rsbtbl[b].lock); + dlm_free_rsb(r); + } else { + log_error(ls, "receive_remove from %d rsb ref error", + from_nodeid); + dlm_print_rsb(r); + spin_unlock(&ls->ls_rsbtbl[b].lock); + } +} + +static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) +{ + do_purge(ls, ms->m_nodeid, ms->m_pid); +} + +static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, mstype, result; + int from_nodeid = ms->m_header.h_nodeid; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + return error; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + mstype = lkb->lkb_wait_type; + error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); + if (error) { + log_error(ls, "receive_request_reply %x remote %d %x result %d", + lkb->lkb_id, from_nodeid, ms->m_lkid, ms->m_result); + dlm_dump_rsb(r); + goto out; + } + + /* Optimization: the dir node was also the master, so it took our + lookup as a request and sent request reply instead of lookup reply */ + if (mstype == DLM_MSG_LOOKUP) { + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + lkb->lkb_nodeid = from_nodeid; + } + + /* this is the value returned from do_request() on the master */ + result = ms->m_result; + + switch (result) { + case -EAGAIN: + /* request would block (be queued) on remote master */ + queue_cast(r, lkb, -EAGAIN); + confirm_master(r, -EAGAIN); + unhold_lkb(lkb); /* undoes create_lkb() */ + break; + + case -EINPROGRESS: + case 0: + /* request was queued or granted on remote master */ + receive_flags_reply(lkb, ms); + lkb->lkb_remid = ms->m_lkid; + if (is_altmode(lkb)) + munge_altmode(lkb, ms); + if (result) { + add_lkb(r, lkb, DLM_LKSTS_WAITING); + add_timeout(lkb); + } else { + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + } + confirm_master(r, result); + break; + + case -EBADR: + case -ENOTBLK: + /* find_rsb failed to find rsb or rsb wasn't master */ + log_limit(ls, "receive_request_reply %x from %d %d " + "master %d dir %d first %x %s", lkb->lkb_id, + from_nodeid, result, r->res_master_nodeid, + r->res_dir_nodeid, r->res_first_lkid, r->res_name); + + if (r->res_dir_nodeid != dlm_our_nodeid() && + r->res_master_nodeid != dlm_our_nodeid()) { + /* cause _request_lock->set_master->send_lookup */ + r->res_master_nodeid = 0; + r->res_nodeid = -1; + lkb->lkb_nodeid = -1; + } + + if (is_overlap(lkb)) { + /* we'll ignore error in cancel/unlock reply */ + queue_cast_overlap(r, lkb); + confirm_master(r, result); + unhold_lkb(lkb); /* undoes create_lkb() */ + } else { + _request_lock(r, lkb); + + if (r->res_master_nodeid == dlm_our_nodeid()) + confirm_master(r, 0); + } + break; + + default: + log_error(ls, "receive_request_reply %x error %d", + lkb->lkb_id, result); + } + + if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) { + log_debug(ls, "receive_request_reply %x result %d unlock", + lkb->lkb_id, result); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + send_unlock(r, lkb); + } else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) { + log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + send_cancel(r, lkb); + } else { + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + } + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return 0; +} + +static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + /* this is the value returned from do_convert() on the master */ + switch (ms->m_result) { + case -EAGAIN: + /* convert would block (be queued) on remote master */ + queue_cast(r, lkb, -EAGAIN); + break; + + case -EDEADLK: + receive_flags_reply(lkb, ms); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -EDEADLK); + break; + + case -EINPROGRESS: + /* convert was queued on remote master */ + receive_flags_reply(lkb, ms); + if (is_demoted(lkb)) + munge_demoted(lkb); + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); + add_timeout(lkb); + break; + + case 0: + /* convert was granted on remote master */ + receive_flags_reply(lkb, ms); + if (is_demoted(lkb)) + munge_demoted(lkb); + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + break; + + default: + log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d", + lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_result); + dlm_print_rsb(r); + dlm_print_lkb(lkb); + } +} + +static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + __receive_convert_reply(r, lkb, ms); + out: + unlock_rsb(r); + put_rsb(r); +} + +static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + return error; + + _receive_convert_reply(lkb, ms); + dlm_put_lkb(lkb); + return 0; +} + +static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + /* this is the value returned from do_unlock() on the master */ + + switch (ms->m_result) { + case -DLM_EUNLOCK: + receive_flags_reply(lkb, ms); + remove_lock_pc(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); + break; + case -ENOENT: + break; + default: + log_error(r->res_ls, "receive_unlock_reply %x error %d", + lkb->lkb_id, ms->m_result); + } + out: + unlock_rsb(r); + put_rsb(r); +} + +static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + return error; + + _receive_unlock_reply(lkb, ms); + dlm_put_lkb(lkb); + return 0; +} + +static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + /* this is the value returned from do_cancel() on the master */ + + switch (ms->m_result) { + case -DLM_ECANCEL: + receive_flags_reply(lkb, ms); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -DLM_ECANCEL); + break; + case 0: + break; + default: + log_error(r->res_ls, "receive_cancel_reply %x error %d", + lkb->lkb_id, ms->m_result); + } + out: + unlock_rsb(r); + put_rsb(r); +} + +static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + return error; + + _receive_cancel_reply(lkb, ms); + dlm_put_lkb(lkb); + return 0; +} + +static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, ret_nodeid; + int do_lookup_list = 0; + + error = find_lkb(ls, ms->m_lkid, &lkb); + if (error) { + log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid); + return; + } + + /* ms->m_result is the value returned by dlm_master_lookup on dir node + FIXME: will a non-zero error ever be returned? */ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY); + if (error) + goto out; + + ret_nodeid = ms->m_nodeid; + + /* We sometimes receive a request from the dir node for this + rsb before we've received the dir node's loookup_reply for it. + The request from the dir node implies we're the master, so we set + ourself as master in receive_request_reply, and verify here that + we are indeed the master. */ + + if (r->res_master_nodeid && (r->res_master_nodeid != ret_nodeid)) { + /* This should never happen */ + log_error(ls, "receive_lookup_reply %x from %d ret %d " + "master %d dir %d our %d first %x %s", + lkb->lkb_id, ms->m_header.h_nodeid, ret_nodeid, + r->res_master_nodeid, r->res_dir_nodeid, + dlm_our_nodeid(), r->res_first_lkid, r->res_name); + } + + if (ret_nodeid == dlm_our_nodeid()) { + r->res_master_nodeid = ret_nodeid; + r->res_nodeid = 0; + do_lookup_list = 1; + r->res_first_lkid = 0; + } else if (ret_nodeid == -1) { + /* the remote node doesn't believe it's the dir node */ + log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid", + lkb->lkb_id, ms->m_header.h_nodeid); + r->res_master_nodeid = 0; + r->res_nodeid = -1; + lkb->lkb_nodeid = -1; + } else { + /* set_master() will set lkb_nodeid from r */ + r->res_master_nodeid = ret_nodeid; + r->res_nodeid = ret_nodeid; + } + + if (is_overlap(lkb)) { + log_debug(ls, "receive_lookup_reply %x unlock %x", + lkb->lkb_id, lkb->lkb_flags); + queue_cast_overlap(r, lkb); + unhold_lkb(lkb); /* undoes create_lkb() */ + goto out_list; + } + + _request_lock(r, lkb); + + out_list: + if (do_lookup_list) + process_lookup_list(r); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq) +{ + int error = 0, noent = 0; + + if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { + log_limit(ls, "receive %d from non-member %d %x %x %d", + ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_remid, ms->m_result); + return; + } + + switch (ms->m_type) { + + /* messages sent to a master node */ + + case DLM_MSG_REQUEST: + error = receive_request(ls, ms); + break; + + case DLM_MSG_CONVERT: + error = receive_convert(ls, ms); + break; + + case DLM_MSG_UNLOCK: + error = receive_unlock(ls, ms); + break; + + case DLM_MSG_CANCEL: + noent = 1; + error = receive_cancel(ls, ms); + break; + + /* messages sent from a master node (replies to above) */ + + case DLM_MSG_REQUEST_REPLY: + error = receive_request_reply(ls, ms); + break; + + case DLM_MSG_CONVERT_REPLY: + error = receive_convert_reply(ls, ms); + break; + + case DLM_MSG_UNLOCK_REPLY: + error = receive_unlock_reply(ls, ms); + break; + + case DLM_MSG_CANCEL_REPLY: + error = receive_cancel_reply(ls, ms); + break; + + /* messages sent from a master node (only two types of async msg) */ + + case DLM_MSG_GRANT: + noent = 1; + error = receive_grant(ls, ms); + break; + + case DLM_MSG_BAST: + noent = 1; + error = receive_bast(ls, ms); + break; + + /* messages sent to a dir node */ + + case DLM_MSG_LOOKUP: + receive_lookup(ls, ms); + break; + + case DLM_MSG_REMOVE: + receive_remove(ls, ms); + break; + + /* messages sent from a dir node (remove has no reply) */ + + case DLM_MSG_LOOKUP_REPLY: + receive_lookup_reply(ls, ms); + break; + + /* other messages */ + + case DLM_MSG_PURGE: + receive_purge(ls, ms); + break; + + default: + log_error(ls, "unknown message type %d", ms->m_type); + } + + /* + * When checking for ENOENT, we're checking the result of + * find_lkb(m_remid): + * + * The lock id referenced in the message wasn't found. This may + * happen in normal usage for the async messages and cancel, so + * only use log_debug for them. + * + * Some errors are expected and normal. + */ + + if (error == -ENOENT && noent) { + log_debug(ls, "receive %d no %x remote %d %x saved_seq %u", + ms->m_type, ms->m_remid, ms->m_header.h_nodeid, + ms->m_lkid, saved_seq); + } else if (error == -ENOENT) { + log_error(ls, "receive %d no %x remote %d %x saved_seq %u", + ms->m_type, ms->m_remid, ms->m_header.h_nodeid, + ms->m_lkid, saved_seq); + + if (ms->m_type == DLM_MSG_CONVERT) + dlm_dump_rsb_hash(ls, ms->m_hash); + } + + if (error == -EINVAL) { + log_error(ls, "receive %d inval from %d lkid %x remid %x " + "saved_seq %u", + ms->m_type, ms->m_header.h_nodeid, + ms->m_lkid, ms->m_remid, saved_seq); + } +} + +/* If the lockspace is in recovery mode (locking stopped), then normal + messages are saved on the requestqueue for processing after recovery is + done. When not in recovery mode, we wait for dlm_recoverd to drain saved + messages off the requestqueue before we process new ones. This occurs right + after recovery completes when we transition from saving all messages on + requestqueue, to processing all the saved messages, to processing new + messages as they arrive. */ + +static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, + int nodeid) +{ + if (dlm_locking_stopped(ls)) { + /* If we were a member of this lockspace, left, and rejoined, + other nodes may still be sending us messages from the + lockspace generation before we left. */ + if (!ls->ls_generation) { + log_limit(ls, "receive %d from %d ignore old gen", + ms->m_type, nodeid); + return; + } + + dlm_add_requestqueue(ls, nodeid, ms); + } else { + dlm_wait_requestqueue(ls); + _receive_message(ls, ms, 0); + } +} + +/* This is called by dlm_recoverd to process messages that were saved on + the requestqueue. */ + +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq) +{ + _receive_message(ls, ms, saved_seq); +} + +/* This is called by the midcomms layer when something is received for + the lockspace. It could be either a MSG (normal message sent as part of + standard locking activity) or an RCOM (recovery message sent as part of + lockspace recovery). */ + +void dlm_receive_buffer(union dlm_packet *p, int nodeid) +{ + struct dlm_header *hd = &p->header; + struct dlm_ls *ls; + int type = 0; + + switch (hd->h_cmd) { + case DLM_MSG: + dlm_message_in(&p->message); + type = p->message.m_type; + break; + case DLM_RCOM: + dlm_rcom_in(&p->rcom); + type = p->rcom.rc_type; + break; + default: + log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid); + return; + } + + if (hd->h_nodeid != nodeid) { + log_print("invalid h_nodeid %d from %d lockspace %x", + hd->h_nodeid, nodeid, hd->h_lockspace); + return; + } + + ls = dlm_find_lockspace_global(hd->h_lockspace); + if (!ls) { + if (dlm_config.ci_log_debug) { + printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " + "%u from %d cmd %d type %d\n", + hd->h_lockspace, nodeid, hd->h_cmd, type); + } + + if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) + dlm_send_ls_not_ready(nodeid, &p->rcom); + return; + } + + /* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to + be inactive (in this ls) before transitioning to recovery mode */ + + down_read(&ls->ls_recv_active); + if (hd->h_cmd == DLM_MSG) + dlm_receive_message(ls, &p->message, nodeid); + else + dlm_receive_rcom(ls, &p->rcom, nodeid); + up_read(&ls->ls_recv_active); + + dlm_put_lockspace(ls); +} + +static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms_stub) +{ + if (middle_conversion(lkb)) { + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CONVERT_REPLY; + ms_stub->m_result = -EINPROGRESS; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_convert_reply(lkb, ms_stub); + + /* Same special case as in receive_rcom_lock_args() */ + lkb->lkb_grmode = DLM_LOCK_IV; + rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT); + unhold_lkb(lkb); + + } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) { + lkb->lkb_flags |= DLM_IFL_RESEND; + } + + /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down + conversions are async; there's no reply from the remote master */ +} + +/* A waiting lkb needs recovery if the master node has failed, or + the master node is changing (only when no directory is used) */ + +static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb, + int dir_nodeid) +{ + if (dlm_no_directory(ls)) + return 1; + + if (dlm_is_removed(ls, lkb->lkb_wait_nodeid)) + return 1; + + return 0; +} + +/* Recovery for locks that are waiting for replies from nodes that are now + gone. We can just complete unlocks and cancels by faking a reply from the + dead node. Requests and up-conversions we flag to be resent after + recovery. Down-conversions can just be completed with a fake reply like + unlocks. Conversions between PR and CW need special attention. */ + +void dlm_recover_waiters_pre(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_message *ms_stub; + int wait_type, stub_unlock_result, stub_cancel_result; + int dir_nodeid; + + ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); + if (!ms_stub) { + log_error(ls, "dlm_recover_waiters_pre no mem"); + return; + } + + mutex_lock(&ls->ls_waiters_mutex); + + list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { + + dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource); + + /* exclude debug messages about unlocks because there can be so + many and they aren't very interesting */ + + if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { + log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " + "lkb_nodeid %d wait_nodeid %d dir_nodeid %d", + lkb->lkb_id, + lkb->lkb_remid, + lkb->lkb_wait_type, + lkb->lkb_resource->res_nodeid, + lkb->lkb_nodeid, + lkb->lkb_wait_nodeid, + dir_nodeid); + } + + /* all outstanding lookups, regardless of destination will be + resent after recovery is done */ + + if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) { + lkb->lkb_flags |= DLM_IFL_RESEND; + continue; + } + + if (!waiter_needs_recovery(ls, lkb, dir_nodeid)) + continue; + + wait_type = lkb->lkb_wait_type; + stub_unlock_result = -DLM_EUNLOCK; + stub_cancel_result = -DLM_ECANCEL; + + /* Main reply may have been received leaving a zero wait_type, + but a reply for the overlapping op may not have been + received. In that case we need to fake the appropriate + reply for the overlap op. */ + + if (!wait_type) { + if (is_overlap_cancel(lkb)) { + wait_type = DLM_MSG_CANCEL; + if (lkb->lkb_grmode == DLM_LOCK_IV) + stub_cancel_result = 0; + } + if (is_overlap_unlock(lkb)) { + wait_type = DLM_MSG_UNLOCK; + if (lkb->lkb_grmode == DLM_LOCK_IV) + stub_unlock_result = -ENOENT; + } + + log_debug(ls, "rwpre overlap %x %x %d %d %d", + lkb->lkb_id, lkb->lkb_flags, wait_type, + stub_cancel_result, stub_unlock_result); + } + + switch (wait_type) { + + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_RESEND; + break; + + case DLM_MSG_CONVERT: + recover_convert_waiter(ls, lkb, ms_stub); + break; + + case DLM_MSG_UNLOCK: + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_UNLOCK_REPLY; + ms_stub->m_result = stub_unlock_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_unlock_reply(lkb, ms_stub); + dlm_put_lkb(lkb); + break; + + case DLM_MSG_CANCEL: + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CANCEL_REPLY; + ms_stub->m_result = stub_cancel_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_cancel_reply(lkb, ms_stub); + dlm_put_lkb(lkb); + break; + + default: + log_error(ls, "invalid lkb wait_type %d %d", + lkb->lkb_wait_type, wait_type); + } + schedule(); + } + mutex_unlock(&ls->ls_waiters_mutex); + kfree(ms_stub); +} + +static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + int found = 0; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (lkb->lkb_flags & DLM_IFL_RESEND) { + hold_lkb(lkb); + found = 1; + break; + } + } + mutex_unlock(&ls->ls_waiters_mutex); + + if (!found) + lkb = NULL; + return lkb; +} + +/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the + master or dir-node for r. Processing the lkb may result in it being placed + back on waiters. */ + +/* We do this after normal locking has been enabled and any saved messages + (in requestqueue) have been processed. We should be confident that at + this point we won't get or process a reply to any of these waiting + operations. But, new ops may be coming in on the rsbs/locks here from + userspace or remotely. */ + +/* there may have been an overlap unlock/cancel prior to recovery or after + recovery. if before, the lkb may still have a pos wait_count; if after, the + overlap flag would just have been set and nothing new sent. we can be + confident here than any replies to either the initial op or overlap ops + prior to recovery have been received. */ + +int dlm_recover_waiters_post(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error = 0, mstype, err, oc, ou; + + while (1) { + if (dlm_locking_stopped(ls)) { + log_debug(ls, "recover_waiters_post aborted"); + error = -EINTR; + break; + } + + lkb = find_resend_waiter(ls); + if (!lkb) + break; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + mstype = lkb->lkb_wait_type; + oc = is_overlap_cancel(lkb); + ou = is_overlap_unlock(lkb); + err = 0; + + log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " + "lkb_nodeid %d wait_nodeid %d dir_nodeid %d " + "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype, + r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid, + dlm_dir_nodeid(r), oc, ou); + + /* At this point we assume that we won't get a reply to any + previous op or overlap op on this lock. First, do a big + remove_from_waiters() for all previous ops. */ + + lkb->lkb_flags &= ~DLM_IFL_RESEND; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_wait_type = 0; + lkb->lkb_wait_count = 0; + mutex_lock(&ls->ls_waiters_mutex); + list_del_init(&lkb->lkb_wait_reply); + mutex_unlock(&ls->ls_waiters_mutex); + unhold_lkb(lkb); /* for waiters list */ + + if (oc || ou) { + /* do an unlock or cancel instead of resending */ + switch (mstype) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + queue_cast(r, lkb, ou ? -DLM_EUNLOCK : + -DLM_ECANCEL); + unhold_lkb(lkb); /* undoes create_lkb() */ + break; + case DLM_MSG_CONVERT: + if (oc) { + queue_cast(r, lkb, -DLM_ECANCEL); + } else { + lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK; + _unlock_lock(r, lkb); + } + break; + default: + err = 1; + } + } else { + switch (mstype) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + _request_lock(r, lkb); + if (is_master(r)) + confirm_master(r, 0); + break; + case DLM_MSG_CONVERT: + _convert_lock(r, lkb); + break; + default: + err = 1; + } + } + + if (err) { + log_error(ls, "waiter %x msg %d r_nodeid %d " + "dir_nodeid %d overlap %d %d", + lkb->lkb_id, mstype, r->res_nodeid, + dlm_dir_nodeid(r), oc, ou); + } + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + } + + return error; +} + +static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r, + struct list_head *list) +{ + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { + if (!is_master_copy(lkb)) + continue; + + /* don't purge lkbs we've added in recover_master_copy for + the current recovery seq */ + + if (lkb->lkb_recover_seq == ls->ls_recover_seq) + continue; + + del_lkb(r, lkb); + + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged mstcpy lkb not released"); + } +} + +void dlm_purge_mstcpy_locks(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + purge_mstcpy_list(ls, r, &r->res_grantqueue); + purge_mstcpy_list(ls, r, &r->res_convertqueue); + purge_mstcpy_list(ls, r, &r->res_waitqueue); +} + +static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, + struct list_head *list, + int nodeid_gone, unsigned int *count) +{ + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { + if (!is_master_copy(lkb)) + continue; + + if ((lkb->lkb_nodeid == nodeid_gone) || + dlm_is_removed(ls, lkb->lkb_nodeid)) { + + /* tell recover_lvb to invalidate the lvb + because a node holding EX/PW failed */ + if ((lkb->lkb_exflags & DLM_LKF_VALBLK) && + (lkb->lkb_grmode >= DLM_LOCK_PW)) { + rsb_set_flag(r, RSB_RECOVER_LVB_INVAL); + } + + del_lkb(r, lkb); + + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged dead lkb not released"); + + rsb_set_flag(r, RSB_RECOVER_GRANT); + + (*count)++; + } + } +} + +/* Get rid of locks held by nodes that are gone. */ + +void dlm_recover_purge(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + struct dlm_member *memb; + int nodes_count = 0; + int nodeid_gone = 0; + unsigned int lkb_count = 0; + + /* cache one removed nodeid to optimize the common + case of a single node removed */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + nodes_count++; + nodeid_gone = memb->nodeid; + } + + if (!nodes_count) + return; + + down_write(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + hold_rsb(r); + lock_rsb(r); + if (is_master(r)) { + purge_dead_list(ls, r, &r->res_grantqueue, + nodeid_gone, &lkb_count); + purge_dead_list(ls, r, &r->res_convertqueue, + nodeid_gone, &lkb_count); + purge_dead_list(ls, r, &r->res_waitqueue, + nodeid_gone, &lkb_count); + } + unlock_rsb(r); + unhold_rsb(r); + cond_resched(); + } + up_write(&ls->ls_root_sem); + + if (lkb_count) + log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes", + lkb_count, nodes_count); +} + +static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) +{ + struct rb_node *n; + struct dlm_rsb *r; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); + + if (!rsb_flag(r, RSB_RECOVER_GRANT)) + continue; + if (!is_master(r)) { + rsb_clear_flag(r, RSB_RECOVER_GRANT); + continue; + } + hold_rsb(r); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return r; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return NULL; +} + +/* + * Attempt to grant locks on resources that we are the master of. + * Locks may have become grantable during recovery because locks + * from departed nodes have been purged (or not rebuilt), allowing + * previously blocked locks to now be granted. The subset of rsb's + * we are interested in are those with lkb's on either the convert or + * waiting queues. + * + * Simplest would be to go through each master rsb and check for non-empty + * convert or waiting queues, and attempt to grant on those rsbs. + * Checking the queues requires lock_rsb, though, for which we'd need + * to release the rsbtbl lock. This would make iterating through all + * rsb's very inefficient. So, we rely on earlier recovery routines + * to set RECOVER_GRANT on any rsb's that we should attempt to grant + * locks for. + */ + +void dlm_recover_grant(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int bucket = 0; + unsigned int count = 0; + unsigned int rsb_count = 0; + unsigned int lkb_count = 0; + + while (1) { + r = find_grant_rsb(ls, bucket); + if (!r) { + if (bucket == ls->ls_rsbtbl_size - 1) + break; + bucket++; + continue; + } + rsb_count++; + count = 0; + lock_rsb(r); + /* the RECOVER_GRANT flag is checked in the grant path */ + grant_pending_locks(r, &count); + rsb_clear_flag(r, RSB_RECOVER_GRANT); + lkb_count += count; + confirm_master(r, 0); + unlock_rsb(r); + put_rsb(r); + cond_resched(); + } + + if (lkb_count) + log_rinfo(ls, "dlm_recover_grant %u locks on %u resources", + lkb_count, rsb_count); +} + +static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, + uint32_t remid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, head, lkb_statequeue) { + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) + return lkb; + } + return NULL; +} + +static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, + uint32_t remid) +{ + struct dlm_lkb *lkb; + + lkb = search_remid_list(&r->res_grantqueue, nodeid, remid); + if (lkb) + return lkb; + lkb = search_remid_list(&r->res_convertqueue, nodeid, remid); + if (lkb) + return lkb; + lkb = search_remid_list(&r->res_waitqueue, nodeid, remid); + if (lkb) + return lkb; + return NULL; +} + +/* needs at least dlm_rcom + rcom_lock */ +static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_rsb *r, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + + lkb->lkb_nodeid = rc->rc_header.h_nodeid; + lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid); + lkb->lkb_remid = le32_to_cpu(rl->rl_lkid); + lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags); + lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF; + lkb->lkb_flags |= DLM_IFL_MSTCPY; + lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq); + lkb->lkb_rqmode = rl->rl_rqmode; + lkb->lkb_grmode = rl->rl_grmode; + /* don't set lkb_status because add_lkb wants to itself */ + + lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - + sizeof(struct rcom_lock); + if (lvblen > ls->ls_lvblen) + return -EINVAL; + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen); + } + + /* Conversions between PR and CW (middle modes) need special handling. + The real granted mode of these converting locks cannot be determined + until all locks have been rebuilt on the rsb (recover_conversion) */ + + if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) && + middle_conversion(lkb)) { + rl->rl_status = DLM_LKSTS_CONVERT; + lkb->lkb_grmode = DLM_LOCK_IV; + rsb_set_flag(r, RSB_RECOVER_CONVERT); + } + + return 0; +} + +/* This lkb may have been recovered in a previous aborted recovery so we need + to check if the rsb already has an lkb with the given remote nodeid/lkid. + If so we just send back a standard reply. If not, we create a new lkb with + the given values and send back our lkid. We send back our lkid by sending + back the rcom_lock struct we got but with the remid field filled in. */ + +/* needs at least dlm_rcom + rcom_lock */ +int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + struct dlm_rsb *r; + struct dlm_lkb *lkb; + uint32_t remid = 0; + int from_nodeid = rc->rc_header.h_nodeid; + int error; + + if (rl->rl_parent_lkid) { + error = -EOPNOTSUPP; + goto out; + } + + remid = le32_to_cpu(rl->rl_lkid); + + /* In general we expect the rsb returned to be R_MASTER, but we don't + have to require it. Recovery of masters on one node can overlap + recovery of locks on another node, so one node can send us MSTCPY + locks before we've made ourselves master of this rsb. We can still + add new MSTCPY locks that we receive here without any harm; when + we make ourselves master, dlm_recover_masters() won't touch the + MSTCPY locks we've received early. */ + + error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), + from_nodeid, R_RECEIVE_RECOVER, &r); + if (error) + goto out; + + lock_rsb(r); + + if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) { + log_error(ls, "dlm_recover_master_copy remote %d %x not dir", + from_nodeid, remid); + error = -EBADR; + goto out_unlock; + } + + lkb = search_remid(r, from_nodeid, remid); + if (lkb) { + error = -EEXIST; + goto out_remid; + } + + error = create_lkb(ls, &lkb); + if (error) + goto out_unlock; + + error = receive_rcom_lock_args(ls, lkb, r, rc); + if (error) { + __put_lkb(ls, lkb); + goto out_unlock; + } + + attach_lkb(r, lkb); + add_lkb(r, lkb, rl->rl_status); + error = 0; + ls->ls_recover_locks_in++; + + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) + rsb_set_flag(r, RSB_RECOVER_GRANT); + + out_remid: + /* this is the new value returned to the lock holder for + saving in its process-copy lkb */ + rl->rl_remid = cpu_to_le32(lkb->lkb_id); + + lkb->lkb_recover_seq = ls->ls_recover_seq; + + out_unlock: + unlock_rsb(r); + put_rsb(r); + out: + if (error && error != -EEXIST) + log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d", + from_nodeid, remid, error); + rl->rl_result = cpu_to_le32(error); + return error; +} + +/* needs at least dlm_rcom + rcom_lock */ +int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + struct dlm_rsb *r; + struct dlm_lkb *lkb; + uint32_t lkid, remid; + int error, result; + + lkid = le32_to_cpu(rl->rl_lkid); + remid = le32_to_cpu(rl->rl_remid); + result = le32_to_cpu(rl->rl_result); + + error = find_lkb(ls, lkid, &lkb); + if (error) { + log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); + return error; + } + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + if (!is_process_copy(lkb)) { + log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); + dlm_dump_rsb(r); + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return -EINVAL; + } + + switch (result) { + case -EBADR: + /* There's a chance the new master received our lock before + dlm_recover_master_reply(), this wouldn't happen if we did + a barrier between recover_masters and recover_locks. */ + + log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); + + dlm_send_rcom_lock(r, lkb); + goto out; + case -EEXIST: + case 0: + lkb->lkb_remid = remid; + break; + default: + log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk", + lkid, rc->rc_header.h_nodeid, remid, result); + } + + /* an ack for dlm_recover_locks() which waits for replies from + all the locks it sends to new masters */ + dlm_recovered_lock(r); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + + return 0; +} + +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + int error; + + dlm_lock_recovery(ls); + + error = create_lkb(ls, &lkb); + if (error) { + kfree(ua); + goto out; + } + + if (flags & DLM_LKF_VALBLK) { + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + kfree(ua); + __put_lkb(ls, lkb); + error = -ENOMEM; + goto out; + } + } + + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + When DLM_IFL_USER is set, the dlm knows that this is a userspace + lock and that lkb_astparam is the dlm_user_args structure. */ + + error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, + fake_astfn, ua, fake_bastfn, &args); + lkb->lkb_flags |= DLM_IFL_USER; + + if (error) { + __put_lkb(ls, lkb); + goto out; + } + + error = request_lock(ls, lkb, name, namelen, &args); + + switch (error) { + case 0: + break; + case -EINPROGRESS: + error = 0; + break; + case -EAGAIN: + error = 0; + /* fall through */ + default: + __put_lkb(ls, lkb); + goto out; + } + + /* add this new lkb to the per-process list of locks */ + spin_lock(&ua->proc->locks_spin); + hold_lkb(lkb); + list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); + spin_unlock(&ua->proc->locks_spin); + out: + dlm_unlock_recovery(ls); + return error; +} + +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in, + unsigned long timeout_cs) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + /* user can change the params on its lock when it converts it, or + add an lvb that didn't exist before */ + + ua = lkb->lkb_ua; + + if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + error = -ENOMEM; + goto out_put; + } + } + if (lvb_in && ua->lksb.sb_lvbptr) + memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + + ua->xid = ua_tmp->xid; + ua->castparam = ua_tmp->castparam; + ua->castaddr = ua_tmp->castaddr; + ua->bastparam = ua_tmp->bastparam; + ua->bastaddr = ua_tmp->bastaddr; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, + fake_astfn, ua, fake_bastfn, &args); + if (error) + goto out_put; + + error = convert_lock(ls, lkb, &args); + + if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +/* + * The caller asks for an orphan lock on a given resource with a given mode. + * If a matching lock exists, it's moved to the owner's list of locks and + * the lkid is returned. + */ + +int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs, uint32_t *lkid) +{ + struct dlm_lkb *lkb; + struct dlm_user_args *ua; + int found_other_mode = 0; + int found = 0; + int rv = 0; + + mutex_lock(&ls->ls_orphans_mutex); + list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) { + if (lkb->lkb_resource->res_length != namelen) + continue; + if (memcmp(lkb->lkb_resource->res_name, name, namelen)) + continue; + if (lkb->lkb_grmode != mode) { + found_other_mode = 1; + continue; + } + + found = 1; + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags &= ~DLM_IFL_ORPHAN; + *lkid = lkb->lkb_id; + break; + } + mutex_unlock(&ls->ls_orphans_mutex); + + if (!found && found_other_mode) { + rv = -EAGAIN; + goto out; + } + + if (!found) { + rv = -ENOENT; + goto out; + } + + lkb->lkb_exflags = flags; + lkb->lkb_ownpid = (int) current->pid; + + ua = lkb->lkb_ua; + + ua->proc = ua_tmp->proc; + ua->xid = ua_tmp->xid; + ua->castparam = ua_tmp->castparam; + ua->castaddr = ua_tmp->castaddr; + ua->bastparam = ua_tmp->bastparam; + ua->bastaddr = ua_tmp->bastaddr; + ua->user_lksb = ua_tmp->user_lksb; + + /* + * The lkb reference from the ls_orphans list was not + * removed above, and is now considered the reference + * for the proc locks list. + */ + + spin_lock(&ua->proc->locks_spin); + list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); + spin_unlock(&ua->proc->locks_spin); + out: + kfree(ua_tmp); + return rv; +} + +int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid, char *lvb_in) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + + if (lvb_in && ua->lksb.sb_lvbptr) + memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + if (ua_tmp->castparam) + ua->castparam = ua_tmp->castparam; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + error = unlock_lock(ls, lkb, &args); + + if (error == -DLM_EUNLOCK) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK)) + error = 0; + if (error) + goto out_put; + + spin_lock(&ua->proc->locks_spin); + /* dlm_user_add_cb() may have already taken lkb off the proc list */ + if (!list_empty(&lkb->lkb_ownqueue)) + list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); + spin_unlock(&ua->proc->locks_spin); + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + if (ua_tmp->castparam) + ua->castparam = ua_tmp->castparam; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + error = cancel_lock(ls, lkb, &args); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + struct dlm_rsb *r; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + /* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, &args); + if (error) + goto out_r; + lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL; + + error = _cancel_lock(r, lkb); + out_r: + unlock_rsb(r); + put_rsb(r); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + return error; +} + +/* lkb's that are removed from the waiters list by revert are just left on the + orphans list with the granted orphan locks, to be freed by purge */ + +static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_args args; + int error; + + hold_lkb(lkb); /* reference for the ls_orphans list */ + mutex_lock(&ls->ls_orphans_mutex); + list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans); + mutex_unlock(&ls->ls_orphans_mutex); + + set_unlock_args(0, lkb->lkb_ua, &args); + + error = cancel_lock(ls, lkb, &args); + if (error == -DLM_ECANCEL) + error = 0; + return error; +} + +/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't + granted. Regardless of what rsb queue the lock is on, it's removed and + freed. The IVVALBLK flag causes the lvb on the resource to be invalidated + if our lock is PW/EX (it's ignored if our granted mode is smaller.) */ + +static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_args args; + int error; + + set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK, + lkb->lkb_ua, &args); + + error = unlock_lock(ls, lkb, &args); + if (error == -DLM_EUNLOCK) + error = 0; + return error; +} + +/* We have to release clear_proc_locks mutex before calling unlock_proc_lock() + (which does lock_rsb) due to deadlock with receiving a message that does + lock_rsb followed by dlm_user_add_cb() */ + +static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, + struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb = NULL; + + mutex_lock(&ls->ls_clear_proc_locks); + if (list_empty(&proc->locks)) + goto out; + + lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue); + list_del_init(&lkb->lkb_ownqueue); + + if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) + lkb->lkb_flags |= DLM_IFL_ORPHAN; + else + lkb->lkb_flags |= DLM_IFL_DEAD; + out: + mutex_unlock(&ls->ls_clear_proc_locks); + return lkb; +} + +/* The ls_clear_proc_locks mutex protects against dlm_user_add_cb() which + 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts, + which we clear here. */ + +/* proc CLOSING flag is set so no more device_reads should look at proc->asts + list, and no more device_writes should add lkb's to proc->locks list; so we + shouldn't need to take asts_spin or locks_spin here. this assumes that + device reads/writes/closes are serialized -- FIXME: we may need to serialize + them ourself. */ + +void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb, *safe; + + dlm_lock_recovery(ls); + + while (1) { + lkb = del_proc_lock(ls, proc); + if (!lkb) + break; + del_timeout(lkb); + if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) + orphan_proc_lock(ls, lkb); + else + unlock_proc_lock(ls, lkb); + + /* this removes the reference for the proc->locks list + added by dlm_user_request, it may result in the lkb + being freed */ + + dlm_put_lkb(lkb); + } + + mutex_lock(&ls->ls_clear_proc_locks); + + /* in-progress unlocks */ + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags |= DLM_IFL_DEAD; + dlm_put_lkb(lkb); + } + + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { + memset(&lkb->lkb_callbacks, 0, + sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + list_del_init(&lkb->lkb_cb_list); + dlm_put_lkb(lkb); + } + + mutex_unlock(&ls->ls_clear_proc_locks); + dlm_unlock_recovery(ls); +} + +static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb, *safe; + + while (1) { + lkb = NULL; + spin_lock(&proc->locks_spin); + if (!list_empty(&proc->locks)) { + lkb = list_entry(proc->locks.next, struct dlm_lkb, + lkb_ownqueue); + list_del_init(&lkb->lkb_ownqueue); + } + spin_unlock(&proc->locks_spin); + + if (!lkb) + break; + + lkb->lkb_flags |= DLM_IFL_DEAD; + unlock_proc_lock(ls, lkb); + dlm_put_lkb(lkb); /* ref from proc->locks list */ + } + + spin_lock(&proc->locks_spin); + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags |= DLM_IFL_DEAD; + dlm_put_lkb(lkb); + } + spin_unlock(&proc->locks_spin); + + spin_lock(&proc->asts_spin); + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { + memset(&lkb->lkb_callbacks, 0, + sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + list_del_init(&lkb->lkb_cb_list); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->asts_spin); +} + +/* pid of 0 means purge all orphans */ + +static void do_purge(struct dlm_ls *ls, int nodeid, int pid) +{ + struct dlm_lkb *lkb, *safe; + + mutex_lock(&ls->ls_orphans_mutex); + list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) { + if (pid && lkb->lkb_ownpid != pid) + continue; + unlock_proc_lock(ls, lkb); + list_del_init(&lkb->lkb_ownqueue); + dlm_put_lkb(lkb); + } + mutex_unlock(&ls->ls_orphans_mutex); +} + +static int send_purge(struct dlm_ls *ls, int nodeid, int pid) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int error; + + error = _create_message(ls, sizeof(struct dlm_message), nodeid, + DLM_MSG_PURGE, &ms, &mh); + if (error) + return error; + ms->m_nodeid = nodeid; + ms->m_pid = pid; + + return send_message(mh, ms); +} + +int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, + int nodeid, int pid) +{ + int error = 0; + + if (nodeid && (nodeid != dlm_our_nodeid())) { + error = send_purge(ls, nodeid, pid); + } else { + dlm_lock_recovery(ls); + if (pid == current->pid) + purge_proc_locks(ls, proc); + else + do_purge(ls, nodeid, pid); + dlm_unlock_recovery(ls); + } + return error; +} + diff --git a/kernel/fs/dlm/lock.h b/kernel/fs/dlm/lock.h new file mode 100644 index 000000000..ed8ebd3a8 --- /dev/null +++ b/kernel/fs/dlm/lock.h @@ -0,0 +1,80 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCK_DOT_H__ +#define __LOCK_DOT_H__ + +void dlm_dump_rsb(struct dlm_rsb *r); +void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len); +void dlm_print_lkb(struct dlm_lkb *lkb); +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq); +void dlm_receive_buffer(union dlm_packet *p, int nodeid); +int dlm_modes_compat(int mode1, int mode2); +void dlm_put_rsb(struct dlm_rsb *r); +void dlm_hold_rsb(struct dlm_rsb *r); +int dlm_put_lkb(struct dlm_lkb *lkb); +void dlm_scan_rsbs(struct dlm_ls *ls); +int dlm_lock_recovery_try(struct dlm_ls *ls); +void dlm_unlock_recovery(struct dlm_ls *ls); +void dlm_scan_waiters(struct dlm_ls *ls); +void dlm_scan_timeout(struct dlm_ls *ls); +void dlm_adjust_timeouts(struct dlm_ls *ls); +int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, + unsigned int flags, int *r_nodeid, int *result); + +int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, + struct dlm_rsb **r_ret); + +void dlm_recover_purge(struct dlm_ls *ls); +void dlm_purge_mstcpy_locks(struct dlm_rsb *r); +void dlm_recover_grant(struct dlm_ls *ls); +int dlm_recover_waiters_post(struct dlm_ls *ls); +void dlm_recover_waiters_pre(struct dlm_ls *ls); +int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); + +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in, + unsigned long timeout_cs); +int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs, uint32_t *lkid); +int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid, char *lvb_in); +int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid); +int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, + int nodeid, int pid); +int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); +void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); + +static inline int is_master(struct dlm_rsb *r) +{ + return !r->res_nodeid; +} + +static inline void lock_rsb(struct dlm_rsb *r) +{ + mutex_lock(&r->res_mutex); +} + +static inline void unlock_rsb(struct dlm_rsb *r) +{ + mutex_unlock(&r->res_mutex); +} + +#endif + diff --git a/kernel/fs/dlm/lockspace.c b/kernel/fs/dlm/lockspace.c new file mode 100644 index 000000000..f3e72787e --- /dev/null +++ b/kernel/fs/dlm/lockspace.c @@ -0,0 +1,917 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "recoverd.h" +#include "dir.h" +#include "lowcomms.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "recover.h" +#include "requestqueue.h" +#include "user.h" +#include "ast.h" + +static int ls_count; +static struct mutex ls_lock; +static struct list_head lslist; +static spinlock_t lslist_lock; +static struct task_struct * scand_task; + + +static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int n; + int rc = kstrtoint(buf, 0, &n); + + if (rc) + return rc; + ls = dlm_find_lockspace_local(ls->ls_local_handle); + if (!ls) + return -EINVAL; + + switch (n) { + case 0: + dlm_ls_stop(ls); + break; + case 1: + dlm_ls_start(ls); + break; + default: + ret = -EINVAL; + } + dlm_put_lockspace(ls); + return ret; +} + +static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + int rc = kstrtoint(buf, 0, &ls->ls_uevent_result); + + if (rc) + return rc; + set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags); + wake_up(&ls->ls_uevent_wait); + return len; +} + +static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id); +} + +static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + int rc = kstrtouint(buf, 0, &ls->ls_global_id); + + if (rc) + return rc; + return len; +} + +static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", dlm_no_directory(ls)); +} + +static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + int val; + int rc = kstrtoint(buf, 0, &val); + + if (rc) + return rc; + if (val == 1) + set_bit(LSFL_NODIR, &ls->ls_flags); + return len; +} + +static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf) +{ + uint32_t status = dlm_recover_status(ls); + return snprintf(buf, PAGE_SIZE, "%x\n", status); +} + +static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid); +} + +struct dlm_attr { + struct attribute attr; + ssize_t (*show)(struct dlm_ls *, char *); + ssize_t (*store)(struct dlm_ls *, const char *, size_t); +}; + +static struct dlm_attr dlm_attr_control = { + .attr = {.name = "control", .mode = S_IWUSR}, + .store = dlm_control_store +}; + +static struct dlm_attr dlm_attr_event = { + .attr = {.name = "event_done", .mode = S_IWUSR}, + .store = dlm_event_store +}; + +static struct dlm_attr dlm_attr_id = { + .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR}, + .show = dlm_id_show, + .store = dlm_id_store +}; + +static struct dlm_attr dlm_attr_nodir = { + .attr = {.name = "nodir", .mode = S_IRUGO | S_IWUSR}, + .show = dlm_nodir_show, + .store = dlm_nodir_store +}; + +static struct dlm_attr dlm_attr_recover_status = { + .attr = {.name = "recover_status", .mode = S_IRUGO}, + .show = dlm_recover_status_show +}; + +static struct dlm_attr dlm_attr_recover_nodeid = { + .attr = {.name = "recover_nodeid", .mode = S_IRUGO}, + .show = dlm_recover_nodeid_show +}; + +static struct attribute *dlm_attrs[] = { + &dlm_attr_control.attr, + &dlm_attr_event.attr, + &dlm_attr_id.attr, + &dlm_attr_nodir.attr, + &dlm_attr_recover_status.attr, + &dlm_attr_recover_nodeid.attr, + NULL, +}; + +static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a->show ? a->show(ls, buf) : 0; +} + +static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a->store ? a->store(ls, buf, len) : len; +} + +static void lockspace_kobj_release(struct kobject *k) +{ + struct dlm_ls *ls = container_of(k, struct dlm_ls, ls_kobj); + kfree(ls); +} + +static const struct sysfs_ops dlm_attr_ops = { + .show = dlm_attr_show, + .store = dlm_attr_store, +}; + +static struct kobj_type dlm_ktype = { + .default_attrs = dlm_attrs, + .sysfs_ops = &dlm_attr_ops, + .release = lockspace_kobj_release, +}; + +static struct kset *dlm_kset; + +static int do_uevent(struct dlm_ls *ls, int in) +{ + int error; + + if (in) + kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); + else + kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); + + log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving"); + + /* dlm_controld will see the uevent, do the necessary group management + and then write to sysfs to wake us */ + + error = wait_event_interruptible(ls->ls_uevent_wait, + test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); + + log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result); + + if (error) + goto out; + + error = ls->ls_uevent_result; + out: + if (error) + log_error(ls, "group %s failed %d %d", in ? "join" : "leave", + error, ls->ls_uevent_result); + return error; +} + +static int dlm_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + + add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name); + return 0; +} + +static struct kset_uevent_ops dlm_uevent_ops = { + .uevent = dlm_uevent, +}; + +int __init dlm_lockspace_init(void) +{ + ls_count = 0; + mutex_init(&ls_lock); + INIT_LIST_HEAD(&lslist); + spin_lock_init(&lslist_lock); + + dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj); + if (!dlm_kset) { + printk(KERN_WARNING "%s: can not create kset\n", __func__); + return -ENOMEM; + } + return 0; +} + +void dlm_lockspace_exit(void) +{ + kset_unregister(dlm_kset); +} + +static struct dlm_ls *find_ls_to_scan(void) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (time_after_eq(jiffies, ls->ls_scan_time + + dlm_config.ci_scan_secs * HZ)) { + spin_unlock(&lslist_lock); + return ls; + } + } + spin_unlock(&lslist_lock); + return NULL; +} + +static int dlm_scand(void *data) +{ + struct dlm_ls *ls; + + while (!kthread_should_stop()) { + ls = find_ls_to_scan(); + if (ls) { + if (dlm_lock_recovery_try(ls)) { + ls->ls_scan_time = jiffies; + dlm_scan_rsbs(ls); + dlm_scan_timeout(ls); + dlm_scan_waiters(ls); + dlm_unlock_recovery(ls); + } else { + ls->ls_scan_time += HZ; + } + continue; + } + schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); + } + return 0; +} + +static int dlm_scand_start(void) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(dlm_scand, NULL, "dlm_scand"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + scand_task = p; + return error; +} + +static void dlm_scand_stop(void) +{ + kthread_stop(scand_task); +} + +struct dlm_ls *dlm_find_lockspace_global(uint32_t id) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_global_id == id) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_local_handle == lockspace) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *dlm_find_lockspace_device(int minor) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_device.minor == minor) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +void dlm_put_lockspace(struct dlm_ls *ls) +{ + spin_lock(&lslist_lock); + ls->ls_count--; + spin_unlock(&lslist_lock); +} + +static void remove_lockspace(struct dlm_ls *ls) +{ + for (;;) { + spin_lock(&lslist_lock); + if (ls->ls_count == 0) { + WARN_ON(ls->ls_create_count != 0); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + return; + } + spin_unlock(&lslist_lock); + ssleep(1); + } +} + +static int threads_start(void) +{ + int error; + + error = dlm_scand_start(); + if (error) { + log_print("cannot start dlm_scand thread %d", error); + goto fail; + } + + /* Thread for sending/receiving messages for all lockspace's */ + error = dlm_lowcomms_start(); + if (error) { + log_print("cannot start dlm lowcomms %d", error); + goto scand_fail; + } + + return 0; + + scand_fail: + dlm_scand_stop(); + fail: + return error; +} + +static void threads_stop(void) +{ + dlm_scand_stop(); + dlm_lowcomms_stop(); +} + +static int new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) +{ + struct dlm_ls *ls; + int i, size, error; + int do_unreg = 0; + int namelen = strlen(name); + + if (namelen > DLM_LOCKSPACE_LEN) + return -EINVAL; + + if (!lvblen || (lvblen % 8)) + return -EINVAL; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + if (!dlm_user_daemon_available()) { + log_print("dlm user daemon not available"); + error = -EUNATCH; + goto out; + } + + if (ops && ops_result) { + if (!dlm_config.ci_recover_callbacks) + *ops_result = -EOPNOTSUPP; + else + *ops_result = 0; + } + + if (dlm_config.ci_recover_callbacks && cluster && + strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) { + log_print("dlm cluster name %s mismatch %s", + dlm_config.ci_cluster_name, cluster); + error = -EBADR; + goto out; + } + + error = 0; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + WARN_ON(ls->ls_create_count <= 0); + if (ls->ls_namelen != namelen) + continue; + if (memcmp(ls->ls_name, name, namelen)) + continue; + if (flags & DLM_LSFL_NEWEXCL) { + error = -EEXIST; + break; + } + ls->ls_create_count++; + *lockspace = ls; + error = 1; + break; + } + spin_unlock(&lslist_lock); + + if (error) + goto out; + + error = -ENOMEM; + + ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); + if (!ls) + goto out; + memcpy(ls->ls_name, name, namelen); + ls->ls_namelen = namelen; + ls->ls_lvblen = lvblen; + ls->ls_count = 0; + ls->ls_flags = 0; + ls->ls_scan_time = jiffies; + + if (ops && dlm_config.ci_recover_callbacks) { + ls->ls_ops = ops; + ls->ls_ops_arg = ops_arg; + } + + if (flags & DLM_LSFL_TIMEWARN) + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + + /* ls_exflags are forced to match among nodes, and we don't + need to require all nodes to have some flags set */ + ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | + DLM_LSFL_NEWEXCL)); + + size = dlm_config.ci_rsbtbl_size; + ls->ls_rsbtbl_size = size; + + ls->ls_rsbtbl = vmalloc(sizeof(struct dlm_rsbtable) * size); + if (!ls->ls_rsbtbl) + goto out_lsfree; + for (i = 0; i < size; i++) { + ls->ls_rsbtbl[i].keep.rb_node = NULL; + ls->ls_rsbtbl[i].toss.rb_node = NULL; + spin_lock_init(&ls->ls_rsbtbl[i].lock); + } + + spin_lock_init(&ls->ls_remove_spin); + + for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { + ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, + GFP_KERNEL); + if (!ls->ls_remove_names[i]) + goto out_rsbtbl; + } + + idr_init(&ls->ls_lkbidr); + spin_lock_init(&ls->ls_lkbidr_spin); + + INIT_LIST_HEAD(&ls->ls_waiters); + mutex_init(&ls->ls_waiters_mutex); + INIT_LIST_HEAD(&ls->ls_orphans); + mutex_init(&ls->ls_orphans_mutex); + INIT_LIST_HEAD(&ls->ls_timeout); + mutex_init(&ls->ls_timeout_mutex); + + INIT_LIST_HEAD(&ls->ls_new_rsb); + spin_lock_init(&ls->ls_new_rsb_spin); + + INIT_LIST_HEAD(&ls->ls_nodes); + INIT_LIST_HEAD(&ls->ls_nodes_gone); + ls->ls_num_nodes = 0; + ls->ls_low_nodeid = 0; + ls->ls_total_weight = 0; + ls->ls_node_array = NULL; + + memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb)); + ls->ls_stub_rsb.res_ls = ls; + + ls->ls_debug_rsb_dentry = NULL; + ls->ls_debug_waiters_dentry = NULL; + + init_waitqueue_head(&ls->ls_uevent_wait); + ls->ls_uevent_result = 0; + init_completion(&ls->ls_members_done); + ls->ls_members_result = -1; + + mutex_init(&ls->ls_cb_mutex); + INIT_LIST_HEAD(&ls->ls_cb_delay); + + ls->ls_recoverd_task = NULL; + mutex_init(&ls->ls_recoverd_active); + spin_lock_init(&ls->ls_recover_lock); + spin_lock_init(&ls->ls_rcom_spin); + get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); + ls->ls_recover_status = 0; + ls->ls_recover_seq = 0; + ls->ls_recover_args = NULL; + init_rwsem(&ls->ls_in_recovery); + init_rwsem(&ls->ls_recv_active); + INIT_LIST_HEAD(&ls->ls_requestqueue); + mutex_init(&ls->ls_requestqueue_mutex); + mutex_init(&ls->ls_clear_proc_locks); + + ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); + if (!ls->ls_recover_buf) + goto out_lkbidr; + + ls->ls_slot = 0; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; + ls->ls_slots = NULL; + + INIT_LIST_HEAD(&ls->ls_recover_list); + spin_lock_init(&ls->ls_recover_list_lock); + idr_init(&ls->ls_recover_idr); + spin_lock_init(&ls->ls_recover_idr_lock); + ls->ls_recover_list_count = 0; + ls->ls_local_handle = ls; + init_waitqueue_head(&ls->ls_wait_general); + INIT_LIST_HEAD(&ls->ls_root_list); + init_rwsem(&ls->ls_root_sem); + + spin_lock(&lslist_lock); + ls->ls_create_count = 1; + list_add(&ls->ls_list, &lslist); + spin_unlock(&lslist_lock); + + if (flags & DLM_LSFL_FS) { + error = dlm_callback_start(ls); + if (error) { + log_error(ls, "can't start dlm_callback %d", error); + goto out_delist; + } + } + + init_waitqueue_head(&ls->ls_recover_lock_wait); + + /* + * Once started, dlm_recoverd first looks for ls in lslist, then + * initializes ls_in_recovery as locked in "down" mode. We need + * to wait for the wakeup from dlm_recoverd because in_recovery + * has to start out in down mode. + */ + + error = dlm_recoverd_start(ls); + if (error) { + log_error(ls, "can't start dlm_recoverd %d", error); + goto out_callback; + } + + wait_event(ls->ls_recover_lock_wait, + test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); + + ls->ls_kobj.kset = dlm_kset; + error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, + "%s", ls->ls_name); + if (error) + goto out_recoverd; + kobject_uevent(&ls->ls_kobj, KOBJ_ADD); + + /* let kobject handle freeing of ls if there's an error */ + do_unreg = 1; + + /* This uevent triggers dlm_controld in userspace to add us to the + group of nodes that are members of this lockspace (managed by the + cluster infrastructure.) Once it's done that, it tells us who the + current lockspace members are (via configfs) and then tells the + lockspace to start running (via sysfs) in dlm_ls_start(). */ + + error = do_uevent(ls, 1); + if (error) + goto out_recoverd; + + wait_for_completion(&ls->ls_members_done); + error = ls->ls_members_result; + if (error) + goto out_members; + + dlm_create_debug_file(ls); + + log_rinfo(ls, "join complete"); + *lockspace = ls; + return 0; + + out_members: + do_uevent(ls, 0); + dlm_clear_members(ls); + kfree(ls->ls_node_array); + out_recoverd: + dlm_recoverd_stop(ls); + out_callback: + dlm_callback_stop(ls); + out_delist: + spin_lock(&lslist_lock); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + idr_destroy(&ls->ls_recover_idr); + kfree(ls->ls_recover_buf); + out_lkbidr: + idr_destroy(&ls->ls_lkbidr); + for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { + if (ls->ls_remove_names[i]) + kfree(ls->ls_remove_names[i]); + } + out_rsbtbl: + vfree(ls->ls_rsbtbl); + out_lsfree: + if (do_unreg) + kobject_put(&ls->ls_kobj); + else + kfree(ls); + out: + module_put(THIS_MODULE); + return error; +} + +int dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) +{ + int error = 0; + + mutex_lock(&ls_lock); + if (!ls_count) + error = threads_start(); + if (error) + goto out; + + error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg, + ops_result, lockspace); + if (!error) + ls_count++; + if (error > 0) + error = 0; + if (!ls_count) + threads_stop(); + out: + mutex_unlock(&ls_lock); + return error; +} + +static int lkb_idr_is_local(int id, void *p, void *data) +{ + struct dlm_lkb *lkb = p; + + return lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV; +} + +static int lkb_idr_is_any(int id, void *p, void *data) +{ + return 1; +} + +static int lkb_idr_free(int id, void *p, void *data) +{ + struct dlm_lkb *lkb = p; + + if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) + dlm_free_lvb(lkb->lkb_lvbptr); + + dlm_free_lkb(lkb); + return 0; +} + +/* NOTE: We check the lkbidr here rather than the resource table. + This is because there may be LKBs queued as ASTs that have been unlinked + from their RSBs and are pending deletion once the AST has been delivered */ + +static int lockspace_busy(struct dlm_ls *ls, int force) +{ + int rv; + + spin_lock(&ls->ls_lkbidr_spin); + if (force == 0) { + rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_any, ls); + } else if (force == 1) { + rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_local, ls); + } else { + rv = 0; + } + spin_unlock(&ls->ls_lkbidr_spin); + return rv; +} + +static int release_lockspace(struct dlm_ls *ls, int force) +{ + struct dlm_rsb *rsb; + struct rb_node *n; + int i, busy, rv; + + busy = lockspace_busy(ls, force); + + spin_lock(&lslist_lock); + if (ls->ls_create_count == 1) { + if (busy) { + rv = -EBUSY; + } else { + /* remove_lockspace takes ls off lslist */ + ls->ls_create_count = 0; + rv = 0; + } + } else if (ls->ls_create_count > 1) { + rv = --ls->ls_create_count; + } else { + rv = -EINVAL; + } + spin_unlock(&lslist_lock); + + if (rv) { + log_debug(ls, "release_lockspace no remove %d", rv); + return rv; + } + + dlm_device_deregister(ls); + + if (force < 3 && dlm_user_daemon_available()) + do_uevent(ls, 0); + + dlm_recoverd_stop(ls); + + dlm_callback_stop(ls); + + remove_lockspace(ls); + + dlm_delete_debug_file(ls); + + kfree(ls->ls_recover_buf); + + /* + * Free all lkb's in idr + */ + + idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls); + idr_destroy(&ls->ls_lkbidr); + + /* + * Free all rsb's on rsbtbl[] lists + */ + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].keep); + dlm_free_rsb(rsb); + } + + while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].toss); + dlm_free_rsb(rsb); + } + } + + vfree(ls->ls_rsbtbl); + + for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) + kfree(ls->ls_remove_names[i]); + + while (!list_empty(&ls->ls_new_rsb)) { + rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, + res_hashchain); + list_del(&rsb->res_hashchain); + dlm_free_rsb(rsb); + } + + /* + * Free structures on any other lists + */ + + dlm_purge_requestqueue(ls); + kfree(ls->ls_recover_args); + dlm_clear_members(ls); + dlm_clear_members_gone(ls); + kfree(ls->ls_node_array); + log_rinfo(ls, "release_lockspace final free"); + kobject_put(&ls->ls_kobj); + /* The ls structure will be freed when the kobject is done with */ + + module_put(THIS_MODULE); + return 0; +} + +/* + * Called when a system has released all its locks and is not going to use the + * lockspace any longer. We free everything we're managing for this lockspace. + * Remaining nodes will go through the recovery process as if we'd died. The + * lockspace must continue to function as usual, participating in recoveries, + * until this returns. + * + * Force has 4 possible values: + * 0 - don't destroy locksapce if it has any LKBs + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs + * 2 - destroy lockspace regardless of LKBs + * 3 - destroy lockspace as part of a forced shutdown + */ + +int dlm_release_lockspace(void *lockspace, int force) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + dlm_put_lockspace(ls); + + mutex_lock(&ls_lock); + error = release_lockspace(ls, force); + if (!error) + ls_count--; + if (!ls_count) + threads_stop(); + mutex_unlock(&ls_lock); + + return error; +} + +void dlm_stop_lockspaces(void) +{ + struct dlm_ls *ls; + int count; + + restart: + count = 0; + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) { + count++; + continue; + } + spin_unlock(&lslist_lock); + log_error(ls, "no userland control daemon, stopping lockspace"); + dlm_ls_stop(ls); + goto restart; + } + spin_unlock(&lslist_lock); + + if (count) + log_print("dlm user daemon left %d lockspaces", count); +} + diff --git a/kernel/fs/dlm/lockspace.h b/kernel/fs/dlm/lockspace.h new file mode 100644 index 000000000..f879f8790 --- /dev/null +++ b/kernel/fs/dlm/lockspace.h @@ -0,0 +1,26 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKSPACE_DOT_H__ +#define __LOCKSPACE_DOT_H__ + +int dlm_lockspace_init(void); +void dlm_lockspace_exit(void); +struct dlm_ls *dlm_find_lockspace_global(uint32_t id); +struct dlm_ls *dlm_find_lockspace_local(void *id); +struct dlm_ls *dlm_find_lockspace_device(int minor); +void dlm_put_lockspace(struct dlm_ls *ls); +void dlm_stop_lockspaces(void); + +#endif /* __LOCKSPACE_DOT_H__ */ + diff --git a/kernel/fs/dlm/lowcomms.c b/kernel/fs/dlm/lowcomms.c new file mode 100644 index 000000000..d08e079ea --- /dev/null +++ b/kernel/fs/dlm/lowcomms.c @@ -0,0 +1,1830 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is its + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. + * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * lowcomms will choose to use either TCP or SCTP as its transport layer + * depending on the configuration variable 'protocol'. This should be set + * to 0 (default) for TCP or 1 for SCTP. It should be configured using a + * cluster-wide mechanism as it must be the same on all nodes of the cluster + * for the DLM to function. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "config.h" + +#define NEEDED_RMEM (4*1024*1024) +#define CONN_HASH_SIZE 32 + +/* Number of messages to send before rescheduling */ +#define MAX_SEND_MSG_COUNT 25 + +struct cbuf { + unsigned int base; + unsigned int len; + unsigned int mask; +}; + +static void cbuf_add(struct cbuf *cb, int n) +{ + cb->len += n; +} + +static int cbuf_data(struct cbuf *cb) +{ + return ((cb->base + cb->len) & cb->mask); +} + +static void cbuf_init(struct cbuf *cb, int size) +{ + cb->base = cb->len = 0; + cb->mask = size-1; +} + +static void cbuf_eat(struct cbuf *cb, int n) +{ + cb->len -= n; + cb->base += n; + cb->base &= cb->mask; +} + +static bool cbuf_empty(struct cbuf *cb) +{ + return cb->len == 0; +} + +struct connection { + struct socket *sock; /* NULL if not connected */ + uint32_t nodeid; /* So we know who we are in the list */ + struct mutex sock_mutex; + unsigned long flags; +#define CF_READ_PENDING 1 +#define CF_WRITE_PENDING 2 +#define CF_CONNECT_PENDING 3 +#define CF_INIT_PENDING 4 +#define CF_IS_OTHERCON 5 +#define CF_CLOSE 6 +#define CF_APP_LIMITED 7 + struct list_head writequeue; /* List of outgoing writequeue_entries */ + spinlock_t writequeue_lock; + int (*rx_action) (struct connection *); /* What to do when active */ + void (*connect_action) (struct connection *); /* What to do to connect */ + struct page *rx_page; + struct cbuf cb; + int retries; +#define MAX_CONNECT_RETRIES 3 + int sctp_assoc; + struct hlist_node list; + struct connection *othercon; + struct work_struct rwork; /* Receive workqueue */ + struct work_struct swork; /* Send workqueue */ + bool try_new_addr; +}; +#define sock2con(x) ((struct connection *)(x)->sk_user_data) + +/* An entry waiting to be sent */ +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + struct connection *con; +}; + +struct dlm_node_addr { + struct list_head list; + int nodeid; + int addr_count; + int curr_addr_index; + struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; +}; + +static LIST_HEAD(dlm_node_addrs); +static DEFINE_SPINLOCK(dlm_node_addrs_spin); + +static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; +static int dlm_local_count; +static int dlm_allow_conn; + +/* Work queues */ +static struct workqueue_struct *recv_workqueue; +static struct workqueue_struct *send_workqueue; + +static struct hlist_head connection_hash[CONN_HASH_SIZE]; +static DEFINE_MUTEX(connections_lock); +static struct kmem_cache *con_cache; + +static void process_recv_sockets(struct work_struct *work); +static void process_send_sockets(struct work_struct *work); + + +/* This is deliberately very simple because most clusters have simple + sequential nodeids, so we should be able to go straight to a connection + struct in the array */ +static inline int nodeid_hash(int nodeid) +{ + return nodeid & (CONN_HASH_SIZE-1); +} + +static struct connection *__find_con(int nodeid) +{ + int r; + struct connection *con; + + r = nodeid_hash(nodeid); + + hlist_for_each_entry(con, &connection_hash[r], list) { + if (con->nodeid == nodeid) + return con; + } + return NULL; +} + +/* + * If 'allocation' is zero then we don't attempt to create a new + * connection structure for this node. + */ +static struct connection *__nodeid2con(int nodeid, gfp_t alloc) +{ + struct connection *con = NULL; + int r; + + con = __find_con(nodeid); + if (con || !alloc) + return con; + + con = kmem_cache_zalloc(con_cache, alloc); + if (!con) + return NULL; + + r = nodeid_hash(nodeid); + hlist_add_head(&con->list, &connection_hash[r]); + + con->nodeid = nodeid; + mutex_init(&con->sock_mutex); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + INIT_WORK(&con->swork, process_send_sockets); + INIT_WORK(&con->rwork, process_recv_sockets); + + /* Setup action pointers for child sockets */ + if (con->nodeid) { + struct connection *zerocon = __find_con(0); + + con->connect_action = zerocon->connect_action; + if (!con->rx_action) + con->rx_action = zerocon->rx_action; + } + + return con; +} + +/* Loop round all connections */ +static void foreach_conn(void (*conn_func)(struct connection *c)) +{ + int i; + struct hlist_node *n; + struct connection *con; + + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_safe(con, n, &connection_hash[i], list) + conn_func(con); + } +} + +static struct connection *nodeid2con(int nodeid, gfp_t allocation) +{ + struct connection *con; + + mutex_lock(&connections_lock); + con = __nodeid2con(nodeid, allocation); + mutex_unlock(&connections_lock); + + return con; +} + +/* This is a bit drastic, but only called when things go wrong */ +static struct connection *assoc2con(int assoc_id) +{ + int i; + struct connection *con; + + mutex_lock(&connections_lock); + + for (i = 0 ; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry(con, &connection_hash[i], list) { + if (con->sctp_assoc == assoc_id) { + mutex_unlock(&connections_lock); + return con; + } + } + } + mutex_unlock(&connections_lock); + return NULL; +} + +static struct dlm_node_addr *find_node_addr(int nodeid) +{ + struct dlm_node_addr *na; + + list_for_each_entry(na, &dlm_node_addrs, list) { + if (na->nodeid == nodeid) + return na; + } + return NULL; +} + +static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) +{ + switch (x->ss_family) { + case AF_INET: { + struct sockaddr_in *sinx = (struct sockaddr_in *)x; + struct sockaddr_in *siny = (struct sockaddr_in *)y; + if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) + return 0; + if (sinx->sin_port != siny->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; + struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; + if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) + return 0; + if (sinx->sin6_port != siny->sin6_port) + return 0; + break; + } + default: + return 0; + } + return 1; +} + +static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, + struct sockaddr *sa_out, bool try_new_addr) +{ + struct sockaddr_storage sas; + struct dlm_node_addr *na; + + if (!dlm_local_count) + return -1; + + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (na && na->addr_count) { + if (try_new_addr) { + na->curr_addr_index++; + if (na->curr_addr_index == na->addr_count) + na->curr_addr_index = 0; + } + + memcpy(&sas, na->addr[na->curr_addr_index ], + sizeof(struct sockaddr_storage)); + } + spin_unlock(&dlm_node_addrs_spin); + + if (!na) + return -EEXIST; + + if (!na->addr_count) + return -ENOENT; + + if (sas_out) + memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); + + if (!sa_out) + return 0; + + if (dlm_local_addr[0]->ss_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; + struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; + } else { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas; + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out; + ret6->sin6_addr = in6->sin6_addr; + } + + return 0; +} + +static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) +{ + struct dlm_node_addr *na; + int rv = -EEXIST; + int addr_i; + + spin_lock(&dlm_node_addrs_spin); + list_for_each_entry(na, &dlm_node_addrs, list) { + if (!na->addr_count) + continue; + + for (addr_i = 0; addr_i < na->addr_count; addr_i++) { + if (addr_compare(na->addr[addr_i], addr)) { + *nodeid = na->nodeid; + rv = 0; + goto unlock; + } + } + } +unlock: + spin_unlock(&dlm_node_addrs_spin); + return rv; +} + +int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) +{ + struct sockaddr_storage *new_addr; + struct dlm_node_addr *new_node, *na; + + new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS); + if (!new_node) + return -ENOMEM; + + new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS); + if (!new_addr) { + kfree(new_node); + return -ENOMEM; + } + + memcpy(new_addr, addr, len); + + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (!na) { + new_node->nodeid = nodeid; + new_node->addr[0] = new_addr; + new_node->addr_count = 1; + list_add(&new_node->list, &dlm_node_addrs); + spin_unlock(&dlm_node_addrs_spin); + return 0; + } + + if (na->addr_count >= DLM_MAX_ADDR_COUNT) { + spin_unlock(&dlm_node_addrs_spin); + kfree(new_addr); + kfree(new_node); + return -ENOSPC; + } + + na->addr[na->addr_count++] = new_addr; + spin_unlock(&dlm_node_addrs_spin); + kfree(new_node); + return 0; +} + +/* Data available on socket or listen socket received a connect */ +static void lowcomms_data_ready(struct sock *sk) +{ + struct connection *con = sock2con(sk); + if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); +} + +static void lowcomms_write_space(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + if (!con) + return; + + clear_bit(SOCK_NOSPACE, &con->sock->flags); + + if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { + con->sock->sk->sk_write_pending--; + clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags); + } + + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); +} + +static inline void lowcomms_connect_sock(struct connection *con) +{ + if (test_bit(CF_CLOSE, &con->flags)) + return; + if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); +} + +static void lowcomms_state_change(struct sock *sk) +{ + if (sk->sk_state == TCP_ESTABLISHED) + lowcomms_write_space(sk); +} + +int dlm_lowcomms_connect_node(int nodeid) +{ + struct connection *con; + + /* with sctp there's no connecting without sending */ + if (dlm_config.ci_protocol != 0) + return 0; + + if (nodeid == dlm_our_nodeid()) + return 0; + + con = nodeid2con(nodeid, GFP_NOFS); + if (!con) + return -ENOMEM; + lowcomms_connect_sock(con); + return 0; +} + +/* Make a socket active */ +static void add_sock(struct socket *sock, struct connection *con) +{ + con->sock = sock; + + /* Install a data_ready callback */ + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->sock->sk->sk_write_space = lowcomms_write_space; + con->sock->sk->sk_state_change = lowcomms_state_change; + con->sock->sk->sk_user_data = con; + con->sock->sk->sk_allocation = GFP_NOFS; +} + +/* Add the port number to an IPv6 or 4 sockaddr and return the address + length */ +static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, + int *addr_len) +{ + saddr->ss_family = dlm_local_addr[0]->ss_family; + if (saddr->ss_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in); + memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); + } else { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; + in6_addr->sin6_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in6); + } + memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); +} + +/* Close a remote connection and tidy up */ +static void close_connection(struct connection *con, bool and_other) +{ + mutex_lock(&con->sock_mutex); + + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + if (con->othercon && and_other) { + /* Will only re-enter once. */ + close_connection(con->othercon, false); + } + if (con->rx_page) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + con->retries = 0; + mutex_unlock(&con->sock_mutex); +} + +/* We only send shutdown messages to nodes that are not part of the cluster */ +static void sctp_send_shutdown(sctp_assoc_t associd) +{ + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct msghdr outmessage; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + int ret; + struct connection *con; + + con = nodeid2con(0,0); + BUG_ON(con == NULL); + + outmessage.msg_name = NULL; + outmessage.msg_namelen = 0; + outmessage.msg_control = outcmsg; + outmessage.msg_controllen = sizeof(outcmsg); + outmessage.msg_flags = MSG_EOR; + + cmsg = CMSG_FIRSTHDR(&outmessage); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + outmessage.msg_controllen = cmsg->cmsg_len; + sinfo = CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + + sinfo->sinfo_flags |= MSG_EOF; + sinfo->sinfo_assoc_id = associd; + + ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0); + + if (ret != 0) + log_print("send EOF to node failed: %d", ret); +} + +static void sctp_init_failed_foreach(struct connection *con) +{ + + /* + * Don't try to recover base con and handle race where the + * other node's assoc init creates a assoc and we get that + * notification, then we get a notification that our attempt + * failed due. This happens when we are still trying the primary + * address, but the other node has already tried secondary addrs + * and found one that worked. + */ + if (!con->nodeid || con->sctp_assoc) + return; + + log_print("Retrying SCTP association init for node %d\n", con->nodeid); + + con->try_new_addr = true; + con->sctp_assoc = 0; + if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) { + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); + } +} + +/* INIT failed but we don't know which node... + restart INIT on all pending nodes */ +static void sctp_init_failed(void) +{ + mutex_lock(&connections_lock); + + foreach_conn(sctp_init_failed_foreach); + + mutex_unlock(&connections_lock); +} + +static void retry_failed_sctp_send(struct connection *recv_con, + struct sctp_send_failed *sn_send_failed, + char *buf) +{ + int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed); + struct dlm_mhandle *mh; + struct connection *con; + char *retry_buf; + int nodeid = sn_send_failed->ssf_info.sinfo_ppid; + + log_print("Retry sending %d bytes to node id %d", len, nodeid); + + if (!nodeid) { + log_print("Shouldn't resend data via listening connection."); + return; + } + + con = nodeid2con(nodeid, 0); + if (!con) { + log_print("Could not look up con for nodeid %d\n", + nodeid); + return; + } + + mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf); + if (!mh) { + log_print("Could not allocate buf for retry."); + return; + } + memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len); + dlm_lowcomms_commit_buffer(mh); + + /* + * If we got a assoc changed event before the send failed event then + * we only need to retry the send. + */ + if (con->sctp_assoc) { + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); + } else + sctp_init_failed_foreach(con); +} + +/* Something happened to an association */ +static void process_sctp_notification(struct connection *con, + struct msghdr *msg, char *buf) +{ + union sctp_notification *sn = (union sctp_notification *)buf; + struct linger linger; + + switch (sn->sn_header.sn_type) { + case SCTP_SEND_FAILED: + retry_failed_sctp_send(con, &sn->sn_send_failed, buf); + break; + case SCTP_ASSOC_CHANGE: + switch (sn->sn_assoc_change.sac_state) { + case SCTP_COMM_UP: + case SCTP_RESTART: + { + /* Check that the new node is in the lockspace */ + struct sctp_prim prim; + int nodeid; + int prim_len, ret; + int addr_len; + struct connection *new_con; + + /* + * We get this before any data for an association. + * We verify that the node is in the cluster and + * then peel off a socket for it. + */ + if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { + log_print("COMM_UP for invalid assoc ID %d", + (int)sn->sn_assoc_change.sac_assoc_id); + sctp_init_failed(); + return; + } + memset(&prim, 0, sizeof(struct sctp_prim)); + prim_len = sizeof(struct sctp_prim); + prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id; + + ret = kernel_getsockopt(con->sock, + IPPROTO_SCTP, + SCTP_PRIMARY_ADDR, + (char*)&prim, + &prim_len); + if (ret < 0) { + log_print("getsockopt/sctp_primary_addr on " + "new assoc %d failed : %d", + (int)sn->sn_assoc_change.sac_assoc_id, + ret); + + /* Retry INIT later */ + new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id); + if (new_con) + clear_bit(CF_CONNECT_PENDING, &con->flags); + return; + } + make_sockaddr(&prim.ssp_addr, 0, &addr_len); + if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + unsigned char *b=(unsigned char *)&prim.ssp_addr; + log_print("reject connect from unknown addr"); + print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, + b, sizeof(struct sockaddr_storage)); + sctp_send_shutdown(prim.ssp_assoc_id); + return; + } + + new_con = nodeid2con(nodeid, GFP_NOFS); + if (!new_con) + return; + + /* Peel off a new sock */ + lock_sock(con->sock->sk); + ret = sctp_do_peeloff(con->sock->sk, + sn->sn_assoc_change.sac_assoc_id, + &new_con->sock); + release_sock(con->sock->sk); + if (ret < 0) { + log_print("Can't peel off a socket for " + "connection %d to node %d: err=%d", + (int)sn->sn_assoc_change.sac_assoc_id, + nodeid, ret); + return; + } + add_sock(new_con->sock, new_con); + + linger.l_onoff = 1; + linger.l_linger = 0; + ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (ret < 0) + log_print("set socket option SO_LINGER failed"); + + log_print("connecting to %d sctp association %d", + nodeid, (int)sn->sn_assoc_change.sac_assoc_id); + + new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id; + new_con->try_new_addr = false; + /* Send any pending writes */ + clear_bit(CF_CONNECT_PENDING, &new_con->flags); + clear_bit(CF_INIT_PENDING, &new_con->flags); + if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { + queue_work(send_workqueue, &new_con->swork); + } + if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags)) + queue_work(recv_workqueue, &new_con->rwork); + } + break; + + case SCTP_COMM_LOST: + case SCTP_SHUTDOWN_COMP: + { + con = assoc2con(sn->sn_assoc_change.sac_assoc_id); + if (con) { + con->sctp_assoc = 0; + } + } + break; + + case SCTP_CANT_STR_ASSOC: + { + /* Will retry init when we get the send failed notification */ + log_print("Can't start SCTP association - retrying"); + } + break; + + default: + log_print("unexpected SCTP assoc change id=%d state=%d", + (int)sn->sn_assoc_change.sac_assoc_id, + sn->sn_assoc_change.sac_state); + } + default: + ; /* fall through */ + } +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con) +{ + int ret = 0; + struct msghdr msg = {}; + struct kvec iov[2]; + unsigned len; + int r; + int call_again_soon = 0; + int nvec; + char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + + mutex_lock(&con->sock_mutex); + + if (con->sock == NULL) { + ret = -EAGAIN; + goto out_close; + } + + if (con->rx_page == NULL) { + /* + * This doesn't need to be atomic, but I think it should + * improve performance if it is. + */ + con->rx_page = alloc_page(GFP_ATOMIC); + if (con->rx_page == NULL) + goto out_resched; + cbuf_init(&con->cb, PAGE_CACHE_SIZE); + } + + /* Only SCTP needs these really */ + memset(&incmsg, 0, sizeof(incmsg)); + msg.msg_control = incmsg; + msg.msg_controllen = sizeof(incmsg); + + /* + * iov[0] is the bit of the circular buffer between the current end + * point (cb.base + cb.len) and the end of the buffer. + */ + iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); + iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); + iov[1].iov_len = 0; + nvec = 1; + + /* + * iov[1] is the bit of the circular buffer between the start of the + * buffer and the start of the currently used section (cb.base) + */ + if (cbuf_data(&con->cb) >= con->cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb); + iov[1].iov_len = con->cb.base; + iov[1].iov_base = page_address(con->rx_page); + nvec = 2; + } + len = iov[0].iov_len + iov[1].iov_len; + + r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len, + MSG_DONTWAIT | MSG_NOSIGNAL); + if (ret <= 0) + goto out_close; + + /* Process SCTP notifications */ + if (msg.msg_flags & MSG_NOTIFICATION) { + msg.msg_control = incmsg; + msg.msg_controllen = sizeof(incmsg); + + process_sctp_notification(con, &msg, + page_address(con->rx_page) + con->cb.base); + mutex_unlock(&con->sock_mutex); + return 0; + } + BUG_ON(con->nodeid == 0); + + if (ret == len) + call_again_soon = 1; + cbuf_add(&con->cb, ret); + ret = dlm_process_incoming_buffer(con->nodeid, + page_address(con->rx_page), + con->cb.base, con->cb.len, + PAGE_CACHE_SIZE); + if (ret == -EBADMSG) { + log_print("lowcomms: addr=%p, base=%u, len=%u, " + "iov_len=%u, iov_base[0]=%p, read=%d", + page_address(con->rx_page), con->cb.base, con->cb.len, + len, iov[0].iov_base, r); + } + if (ret < 0) + goto out_close; + cbuf_eat(&con->cb, ret); + + if (cbuf_empty(&con->cb) && !call_again_soon) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + if (call_again_soon) + goto out_resched; + mutex_unlock(&con->sock_mutex); + return 0; + +out_resched: + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); + mutex_unlock(&con->sock_mutex); + return -EAGAIN; + +out_close: + mutex_unlock(&con->sock_mutex); + if (ret != -EAGAIN) { + close_connection(con, false); + /* Reconnect when there is something to send */ + } + /* Don't return success if we really got EOF */ + if (ret == 0) + ret = -EAGAIN; + + return ret; +} + +/* Listening socket is busy, accept a connection */ +static int tcp_accept_from_sock(struct connection *con) +{ + int result; + struct sockaddr_storage peeraddr; + struct socket *newsock; + int len; + int nodeid; + struct connection *newcon; + struct connection *addcon; + + mutex_lock(&connections_lock); + if (!dlm_allow_conn) { + mutex_unlock(&connections_lock); + return -1; + } + mutex_unlock(&connections_lock); + + memset(&peeraddr, 0, sizeof(peeraddr)); + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &newsock); + if (result < 0) + return -ENOMEM; + + mutex_lock_nested(&con->sock_mutex, 0); + + result = -ENOTCONN; + if (con->sock == NULL) + goto accept_err; + + newsock->type = con->sock->type; + newsock->ops = con->sock->ops; + + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + if (result < 0) + goto accept_err; + + /* Get the connected socket's peer */ + memset(&peeraddr, 0, sizeof(peeraddr)); + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, + &len, 2)) { + result = -ECONNABORTED; + goto accept_err; + } + + /* Get the new node's NODEID */ + make_sockaddr(&peeraddr, 0, &len); + if (addr_to_nodeid(&peeraddr, &nodeid)) { + unsigned char *b=(unsigned char *)&peeraddr; + log_print("connect from non cluster node"); + print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, + b, sizeof(struct sockaddr_storage)); + sock_release(newsock); + mutex_unlock(&con->sock_mutex); + return -1; + } + + log_print("got connection from %d", nodeid); + + /* Check to see if we already have a connection to this node. This + * could happen if the two nodes initiate a connection at roughly + * the same time and the connections cross on the wire. + * In this case we store the incoming one in "othercon" + */ + newcon = nodeid2con(nodeid, GFP_NOFS); + if (!newcon) { + result = -ENOMEM; + goto accept_err; + } + mutex_lock_nested(&newcon->sock_mutex, 1); + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + if (!othercon) { + log_print("failed to allocate incoming socket"); + mutex_unlock(&newcon->sock_mutex); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + mutex_init(&othercon->sock_mutex); + INIT_WORK(&othercon->swork, process_send_sockets); + INIT_WORK(&othercon->rwork, process_recv_sockets); + set_bit(CF_IS_OTHERCON, &othercon->flags); + } + if (!othercon->sock) { + newcon->othercon = othercon; + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + addcon = othercon; + } + else { + printk("Extra connection from node %d attempted\n", nodeid); + result = -EAGAIN; + mutex_unlock(&newcon->sock_mutex); + goto accept_err; + } + } + else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + addcon = newcon; + } + + mutex_unlock(&newcon->sock_mutex); + + /* + * Add it to the active queue in case we got data + * between processing the accept adding the socket + * to the read_sockets list + */ + if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) + queue_work(recv_workqueue, &addcon->rwork); + mutex_unlock(&con->sock_mutex); + + return 0; + +accept_err: + mutex_unlock(&con->sock_mutex); + sock_release(newsock); + + if (result != -EAGAIN) + log_print("error accepting connection from node: %d", result); + return result; +} + +static void free_entry(struct writequeue_entry *e) +{ + __free_page(e->page); + kfree(e); +} + +/* + * writequeue_entry_complete - try to delete and free write queue entry + * @e: write queue entry to try to delete + * @completed: bytes completed + * + * writequeue_lock must be held. + */ +static void writequeue_entry_complete(struct writequeue_entry *e, int completed) +{ + e->offset += completed; + e->len -= completed; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + } +} + +/* Initiate an SCTP association. + This is a special case of send_to_sock() in that we don't yet have a + peeled-off socket for this association, so we use the listening socket + and add the primary IP address of the remote node. + */ +static void sctp_init_assoc(struct connection *con) +{ + struct sockaddr_storage rem_addr; + char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct msghdr outmessage; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + struct connection *base_con; + struct writequeue_entry *e; + int len, offset; + int ret; + int addrlen; + struct kvec iov[1]; + + mutex_lock(&con->sock_mutex); + if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) + goto unlock; + + if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr, + con->try_new_addr)) { + log_print("no address for nodeid %d", con->nodeid); + goto unlock; + } + base_con = nodeid2con(0, 0); + BUG_ON(base_con == NULL); + + make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen); + + outmessage.msg_name = &rem_addr; + outmessage.msg_namelen = addrlen; + outmessage.msg_control = outcmsg; + outmessage.msg_controllen = sizeof(outcmsg); + outmessage.msg_flags = MSG_EOR; + + spin_lock(&con->writequeue_lock); + + if (list_empty(&con->writequeue)) { + spin_unlock(&con->writequeue_lock); + log_print("writequeue empty for nodeid %d", con->nodeid); + goto unlock; + } + + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); + len = e->len; + offset = e->offset; + + /* Send the first block off the write queue */ + iov[0].iov_base = page_address(e->page)+offset; + iov[0].iov_len = len; + spin_unlock(&con->writequeue_lock); + + if (rem_addr.ss_family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr; + log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr); + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr; + log_print("Trying to connect to %pI6", &sin6->sin6_addr); + } + + cmsg = CMSG_FIRSTHDR(&outmessage); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + sinfo = CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + sinfo->sinfo_ppid = cpu_to_le32(con->nodeid); + outmessage.msg_controllen = cmsg->cmsg_len; + sinfo->sinfo_flags |= SCTP_ADDR_OVER; + + ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); + if (ret < 0) { + log_print("Send first packet to node %d failed: %d", + con->nodeid, ret); + + /* Try again later */ + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_INIT_PENDING, &con->flags); + } + else { + spin_lock(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + spin_unlock(&con->writequeue_lock); + } + +unlock: + mutex_unlock(&con->sock_mutex); +} + +/* Connect a new socket to its peer */ +static void tcp_connect_to_sock(struct connection *con) +{ + struct sockaddr_storage saddr, src_addr; + int addr_len; + struct socket *sock = NULL; + int one = 1; + int result; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return; + } + + mutex_lock(&con->sock_mutex); + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + /* Some odd races can cause double-connects, ignore them */ + if (con->sock) + goto out; + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (result < 0) + goto out_err; + + memset(&saddr, 0, sizeof(saddr)); + result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); + if (result < 0) { + log_print("no address for nodeid %d", con->nodeid); + goto out_err; + } + + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + con->connect_action = tcp_connect_to_sock; + add_sock(sock, con); + + /* Bind to our cluster-known address connecting to avoid + routing problems */ + memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + make_sockaddr(&src_addr, 0, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, + addr_len); + if (result < 0) { + log_print("could not bind for connect: %d", result); + /* This *may* not indicate a critical error */ + } + + make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); + + log_print("connecting to %d", con->nodeid); + + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + + result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result == 0) + goto out; + +out_err: + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } else if (sock) { + sock_release(sock); + } + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. + */ + if (result != -EHOSTUNREACH && + result != -ENETUNREACH && + result != -ENETDOWN && + result != -EINVAL && + result != -EPROTONOSUPPORT) { + log_print("connect %d try %d error %d", con->nodeid, + con->retries, result); + mutex_unlock(&con->sock_mutex); + msleep(1000); + lowcomms_connect_sock(con); + return; + } +out: + mutex_unlock(&con->sock_mutex); + return; +} + +static struct socket *tcp_create_listen_sock(struct connection *con, + struct sockaddr_storage *saddr) +{ + struct socket *sock = NULL; + int result = 0; + int one = 1; + int addr_len; + + if (dlm_local_addr[0]->ss_family == AF_INET) + addr_len = sizeof(struct sockaddr_in); + else + addr_len = sizeof(struct sockaddr_in6); + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (result < 0) { + log_print("Can't create listening comms socket"); + goto create_out; + } + + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + + result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&one, sizeof(one)); + + if (result < 0) { + log_print("Failed to set SO_REUSEADDR on socket: %d", result); + } + con->rx_action = tcp_accept_from_sock; + con->connect_action = tcp_connect_to_sock; + + /* Bind to our port */ + make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); + if (result < 0) { + log_print("Can't bind to port %d", dlm_config.ci_tcp_port); + sock_release(sock); + sock = NULL; + con->sock = NULL; + goto create_out; + } + result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&one, sizeof(one)); + if (result < 0) { + log_print("Set keepalive failed: %d", result); + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + log_print("Can't listen on port %d", dlm_config.ci_tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + +create_out: + return sock; +} + +/* Get local addresses */ +static void init_local(void) +{ + struct sockaddr_storage sas, *addr; + int i; + + dlm_local_count = 0; + for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) { + if (dlm_our_addr(&sas, i)) + break; + + addr = kmalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + break; + memcpy(addr, &sas, sizeof(*addr)); + dlm_local_addr[dlm_local_count++] = addr; + } +} + +/* Bind to an IP address. SCTP allows multiple address so it can do + multi-homing */ +static int add_sctp_bind_addr(struct connection *sctp_con, + struct sockaddr_storage *addr, + int addr_len, int num) +{ + int result = 0; + + if (num == 1) + result = kernel_bind(sctp_con->sock, + (struct sockaddr *) addr, + addr_len); + else + result = kernel_setsockopt(sctp_con->sock, SOL_SCTP, + SCTP_SOCKOPT_BINDX_ADD, + (char *)addr, addr_len); + + if (result < 0) + log_print("Can't bind to port %d addr number %d", + dlm_config.ci_tcp_port, num); + + return result; +} + +/* Initialise SCTP socket and bind to all interfaces */ +static int sctp_listen_for_all(void) +{ + struct socket *sock = NULL; + struct sockaddr_storage localaddr; + struct sctp_event_subscribe subscribe; + int result = -EINVAL, num = 1, i, addr_len; + struct connection *con = nodeid2con(0, GFP_NOFS); + int bufsize = NEEDED_RMEM; + int one = 1; + + if (!con) + return -ENOMEM; + + log_print("Using SCTP for communications"); + + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, + IPPROTO_SCTP, &sock); + if (result < 0) { + log_print("Can't create comms socket, check SCTP is loaded"); + goto out; + } + + /* Listen for events */ + memset(&subscribe, 0, sizeof(subscribe)); + subscribe.sctp_data_io_event = 1; + subscribe.sctp_association_event = 1; + subscribe.sctp_send_failure_event = 1; + subscribe.sctp_shutdown_event = 1; + subscribe.sctp_partial_delivery_event = 1; + + result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, + (char *)&bufsize, sizeof(bufsize)); + if (result) + log_print("Error increasing buffer space on socket %d", result); + + result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS, + (char *)&subscribe, sizeof(subscribe)); + if (result < 0) { + log_print("Failed to set SCTP_EVENTS on socket: result=%d", + result); + goto create_delsock; + } + + result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, + sizeof(one)); + if (result < 0) + log_print("Could not set SCTP NODELAY error %d\n", result); + + /* Init con struct */ + sock->sk->sk_user_data = con; + con->sock = sock; + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->rx_action = receive_from_sock; + con->connect_action = sctp_init_assoc; + + /* Bind to all interfaces. */ + for (i = 0; i < dlm_local_count; i++) { + memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len); + + result = add_sctp_bind_addr(con, &localaddr, addr_len, num); + if (result) + goto create_delsock; + ++num; + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + log_print("Can't set socket listening"); + goto create_delsock; + } + + return 0; + +create_delsock: + sock_release(sock); + con->sock = NULL; +out: + return result; +} + +static int tcp_listen_for_all(void) +{ + struct socket *sock = NULL; + struct connection *con = nodeid2con(0, GFP_NOFS); + int result = -EINVAL; + + if (!con) + return -ENOMEM; + + /* We don't support multi-homed hosts */ + if (dlm_local_addr[1] != NULL) { + log_print("TCP protocol can't handle multi-homed hosts, " + "try SCTP"); + return -EINVAL; + } + + log_print("Using TCP for communications"); + + sock = tcp_create_listen_sock(con, dlm_local_addr[0]); + if (sock) { + add_sock(sock, con); + result = 0; + } + else { + result = -EADDRINUSE; + } + + return result; +} + + + +static struct writequeue_entry *new_writequeue_entry(struct connection *con, + gfp_t allocation) +{ + struct writequeue_entry *entry; + + entry = kmalloc(sizeof(struct writequeue_entry), allocation); + if (!entry) + return NULL; + + entry->page = alloc_page(allocation); + if (!entry->page) { + kfree(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->users = 0; + entry->con = con; + + return entry; +} + +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +{ + struct connection *con; + struct writequeue_entry *e; + int offset = 0; + + con = nodeid2con(nodeid, allocation); + if (!con) + return NULL; + + spin_lock(&con->writequeue_lock); + e = list_entry(con->writequeue.prev, struct writequeue_entry, list); + if ((&e->list == &con->writequeue) || + (PAGE_CACHE_SIZE - e->end < len)) { + e = NULL; + } else { + offset = e->end; + e->end += len; + e->users++; + } + spin_unlock(&con->writequeue_lock); + + if (e) { + got_one: + *ppc = page_address(e->page) + offset; + return e; + } + + e = new_writequeue_entry(con, allocation); + if (e) { + spin_lock(&con->writequeue_lock); + offset = e->end; + e->end += len; + e->users++; + list_add_tail(&e->list, &con->writequeue); + spin_unlock(&con->writequeue_lock); + goto got_one; + } + return NULL; +} + +void dlm_lowcomms_commit_buffer(void *mh) +{ + struct writequeue_entry *e = (struct writequeue_entry *)mh; + struct connection *con = e->con; + int users; + + spin_lock(&con->writequeue_lock); + users = --e->users; + if (users) + goto out; + e->len = e->end - e->offset; + spin_unlock(&con->writequeue_lock); + + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { + queue_work(send_workqueue, &con->swork); + } + return; + +out: + spin_unlock(&con->writequeue_lock); + return; +} + +/* Send a message */ +static void send_to_sock(struct connection *con) +{ + int ret = 0; + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + struct writequeue_entry *e; + int len, offset; + int count = 0; + + mutex_lock(&con->sock_mutex); + if (con->sock == NULL) + goto out_connect; + + spin_lock(&con->writequeue_lock); + for (;;) { + e = list_entry(con->writequeue.next, struct writequeue_entry, + list); + if ((struct list_head *) e == &con->writequeue) + break; + + len = e->len; + offset = e->offset; + BUG_ON(len == 0 && e->users == 0); + spin_unlock(&con->writequeue_lock); + + ret = 0; + if (len) { + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) { + if (ret == -EAGAIN && + test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. + */ + set_bit(SOCK_NOSPACE, &con->sock->flags); + con->sock->sk->sk_write_pending++; + } + cond_resched(); + goto out; + } else if (ret < 0) + goto send_error; + } + + /* Don't starve people filling buffers */ + if (++count >= MAX_SEND_MSG_COUNT) { + cond_resched(); + count = 0; + } + + spin_lock(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + } + spin_unlock(&con->writequeue_lock); +out: + mutex_unlock(&con->sock_mutex); + return; + +send_error: + mutex_unlock(&con->sock_mutex); + close_connection(con, false); + lowcomms_connect_sock(con); + return; + +out_connect: + mutex_unlock(&con->sock_mutex); + if (!test_bit(CF_INIT_PENDING, &con->flags)) + lowcomms_connect_sock(con); +} + +static void clean_one_writequeue(struct connection *con) +{ + struct writequeue_entry *e, *safe; + + spin_lock(&con->writequeue_lock); + list_for_each_entry_safe(e, safe, &con->writequeue, list) { + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); +} + +/* Called from recovery when it knows that a node has + left the cluster */ +int dlm_lowcomms_close(int nodeid) +{ + struct connection *con; + struct dlm_node_addr *na; + + log_print("closing connection to node %d", nodeid); + con = nodeid2con(nodeid, 0); + if (con) { + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_WRITE_PENDING, &con->flags); + set_bit(CF_CLOSE, &con->flags); + if (cancel_work_sync(&con->swork)) + log_print("canceled swork for node %d", nodeid); + if (cancel_work_sync(&con->rwork)) + log_print("canceled rwork for node %d", nodeid); + clean_one_writequeue(con); + close_connection(con, true); + } + + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (na) { + list_del(&na->list); + while (na->addr_count--) + kfree(na->addr[na->addr_count]); + kfree(na); + } + spin_unlock(&dlm_node_addrs_spin); + + return 0; +} + +/* Receive workqueue function */ +static void process_recv_sockets(struct work_struct *work) +{ + struct connection *con = container_of(work, struct connection, rwork); + int err; + + clear_bit(CF_READ_PENDING, &con->flags); + do { + err = con->rx_action(con); + } while (!err); +} + +/* Send workqueue function */ +static void process_send_sockets(struct work_struct *work) +{ + struct connection *con = container_of(work, struct connection, swork); + + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + con->connect_action(con); + set_bit(CF_WRITE_PENDING, &con->flags); + } + if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) + send_to_sock(con); +} + + +/* Discard all entries on the write queues */ +static void clean_writequeues(void) +{ + foreach_conn(clean_one_writequeue); +} + +static void work_stop(void) +{ + destroy_workqueue(recv_workqueue); + destroy_workqueue(send_workqueue); +} + +static int work_start(void) +{ + recv_workqueue = alloc_workqueue("dlm_recv", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + if (!recv_workqueue) { + log_print("can't start dlm_recv"); + return -ENOMEM; + } + + send_workqueue = alloc_workqueue("dlm_send", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + if (!send_workqueue) { + log_print("can't start dlm_send"); + destroy_workqueue(recv_workqueue); + return -ENOMEM; + } + + return 0; +} + +static void stop_conn(struct connection *con) +{ + con->flags |= 0x0F; + if (con->sock && con->sock->sk) + con->sock->sk->sk_user_data = NULL; +} + +static void free_conn(struct connection *con) +{ + close_connection(con, true); + if (con->othercon) + kmem_cache_free(con_cache, con->othercon); + hlist_del(&con->list); + kmem_cache_free(con_cache, con); +} + +void dlm_lowcomms_stop(void) +{ + /* Set all the flags to prevent any + socket activity. + */ + mutex_lock(&connections_lock); + dlm_allow_conn = 0; + foreach_conn(stop_conn); + mutex_unlock(&connections_lock); + + work_stop(); + + mutex_lock(&connections_lock); + clean_writequeues(); + + foreach_conn(free_conn); + + mutex_unlock(&connections_lock); + kmem_cache_destroy(con_cache); +} + +int dlm_lowcomms_start(void) +{ + int error = -EINVAL; + struct connection *con; + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&connection_hash[i]); + + init_local(); + if (!dlm_local_count) { + error = -ENOTCONN; + log_print("no local IP address has been set"); + goto fail; + } + + error = -ENOMEM; + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), + __alignof__(struct connection), 0, + NULL); + if (!con_cache) + goto fail; + + error = work_start(); + if (error) + goto fail_destroy; + + dlm_allow_conn = 1; + + /* Start listening */ + if (dlm_config.ci_protocol == 0) + error = tcp_listen_for_all(); + else + error = sctp_listen_for_all(); + if (error) + goto fail_unlisten; + + return 0; + +fail_unlisten: + dlm_allow_conn = 0; + con = nodeid2con(0,0); + if (con) { + close_connection(con, false); + kmem_cache_free(con_cache, con); + } +fail_destroy: + kmem_cache_destroy(con_cache); +fail: + return error; +} + +void dlm_lowcomms_exit(void) +{ + struct dlm_node_addr *na, *safe; + + spin_lock(&dlm_node_addrs_spin); + list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) { + list_del(&na->list); + while (na->addr_count--) + kfree(na->addr[na->addr_count]); + kfree(na); + } + spin_unlock(&dlm_node_addrs_spin); +} diff --git a/kernel/fs/dlm/lowcomms.h b/kernel/fs/dlm/lowcomms.h new file mode 100644 index 000000000..67462e54f --- /dev/null +++ b/kernel/fs/dlm/lowcomms.h @@ -0,0 +1,27 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOWCOMMS_DOT_H__ +#define __LOWCOMMS_DOT_H__ + +int dlm_lowcomms_start(void); +void dlm_lowcomms_stop(void); +void dlm_lowcomms_exit(void); +int dlm_lowcomms_close(int nodeid); +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); +void dlm_lowcomms_commit_buffer(void *mh); +int dlm_lowcomms_connect_node(int nodeid); +int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); + +#endif /* __LOWCOMMS_DOT_H__ */ + diff --git a/kernel/fs/dlm/lvb_table.h b/kernel/fs/dlm/lvb_table.h new file mode 100644 index 000000000..cc3e92f3f --- /dev/null +++ b/kernel/fs/dlm/lvb_table.h @@ -0,0 +1,18 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LVB_TABLE_DOT_H__ +#define __LVB_TABLE_DOT_H__ + +extern const int dlm_lvb_operations[8][8]; + +#endif diff --git a/kernel/fs/dlm/main.c b/kernel/fs/dlm/main.c new file mode 100644 index 000000000..079c0bd71 --- /dev/null +++ b/kernel/fs/dlm/main.c @@ -0,0 +1,97 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lock.h" +#include "user.h" +#include "memory.h" +#include "config.h" +#include "lowcomms.h" + +static int __init init_dlm(void) +{ + int error; + + error = dlm_memory_init(); + if (error) + goto out; + + error = dlm_lockspace_init(); + if (error) + goto out_mem; + + error = dlm_config_init(); + if (error) + goto out_lockspace; + + error = dlm_register_debugfs(); + if (error) + goto out_config; + + error = dlm_user_init(); + if (error) + goto out_debug; + + error = dlm_netlink_init(); + if (error) + goto out_user; + + error = dlm_plock_init(); + if (error) + goto out_netlink; + + printk("DLM installed\n"); + + return 0; + + out_netlink: + dlm_netlink_exit(); + out_user: + dlm_user_exit(); + out_debug: + dlm_unregister_debugfs(); + out_config: + dlm_config_exit(); + out_lockspace: + dlm_lockspace_exit(); + out_mem: + dlm_memory_exit(); + out: + return error; +} + +static void __exit exit_dlm(void) +{ + dlm_plock_exit(); + dlm_netlink_exit(); + dlm_user_exit(); + dlm_config_exit(); + dlm_memory_exit(); + dlm_lockspace_exit(); + dlm_lowcomms_exit(); + dlm_unregister_debugfs(); +} + +module_init(init_dlm); +module_exit(exit_dlm); + +MODULE_DESCRIPTION("Distributed Lock Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL_GPL(dlm_new_lockspace); +EXPORT_SYMBOL_GPL(dlm_release_lockspace); +EXPORT_SYMBOL_GPL(dlm_lock); +EXPORT_SYMBOL_GPL(dlm_unlock); + diff --git a/kernel/fs/dlm/member.c b/kernel/fs/dlm/member.c new file mode 100644 index 000000000..9c47f1c14 --- /dev/null +++ b/kernel/fs/dlm/member.c @@ -0,0 +1,722 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "recoverd.h" +#include "recover.h" +#include "rcom.h" +#include "config.h" +#include "lowcomms.h" + +int dlm_slots_version(struct dlm_header *h) +{ + if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS) + return 0; + return 1; +} + +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb) +{ + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + + if (!dlm_slots_version(&rc->rc_header)) + return; + + memb->slot = le16_to_cpu(rf->rf_our_slot); + memb->generation = le32_to_cpu(rf->rf_generation); +} + +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct dlm_slot *slot; + struct rcom_slot *ro; + int i; + + ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + /* ls_slots array is sparse, but not rcom_slots */ + + for (i = 0; i < ls->ls_slots_size; i++) { + slot = &ls->ls_slots[i]; + if (!slot->nodeid) + continue; + ro->ro_nodeid = cpu_to_le32(slot->nodeid); + ro->ro_slot = cpu_to_le16(slot->slot); + ro++; + } +} + +#define SLOT_DEBUG_LINE 128 + +static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, + struct rcom_slot *ro0, struct dlm_slot *array, + int array_size) +{ + char line[SLOT_DEBUG_LINE]; + int len = SLOT_DEBUG_LINE - 1; + int pos = 0; + int ret, i; + + memset(line, 0, sizeof(line)); + + if (array) { + for (i = 0; i < array_size; i++) { + if (!array[i].nodeid) + continue; + + ret = snprintf(line + pos, len - pos, " %d:%d", + array[i].slot, array[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } else if (ro0) { + for (i = 0; i < num_slots; i++) { + ret = snprintf(line + pos, len - pos, " %d:%d", + ro0[i].ro_slot, ro0[i].ro_nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } + + log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line); +} + +int dlm_slots_copy_in(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_rcom *rc = ls->ls_recover_buf; + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + struct rcom_slot *ro0, *ro; + int our_nodeid = dlm_our_nodeid(); + int i, num_slots; + uint32_t gen; + + if (!dlm_slots_version(&rc->rc_header)) + return -1; + + gen = le32_to_cpu(rf->rf_generation); + if (gen <= ls->ls_generation) { + log_error(ls, "dlm_slots_copy_in gen %u old %u", + gen, ls->ls_generation); + } + ls->ls_generation = gen; + + num_slots = le16_to_cpu(rf->rf_num_slots); + if (!num_slots) + return -1; + + ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid); + ro->ro_slot = le16_to_cpu(ro->ro_slot); + } + + log_slots(ls, gen, num_slots, ro0, NULL, 0); + + list_for_each_entry(memb, &ls->ls_nodes, list) { + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + if (ro->ro_nodeid != memb->nodeid) + continue; + memb->slot = ro->ro_slot; + memb->slot_prev = memb->slot; + break; + } + + if (memb->nodeid == our_nodeid) { + if (ls->ls_slot && ls->ls_slot != memb->slot) { + log_error(ls, "dlm_slots_copy_in our slot " + "changed %d %d", ls->ls_slot, + memb->slot); + return -1; + } + + if (!ls->ls_slot) + ls->ls_slot = memb->slot; + } + + if (!memb->slot) { + log_error(ls, "dlm_slots_copy_in nodeid %d no slot", + memb->nodeid); + return -1; + } + } + + return 0; +} + +/* for any nodes that do not support slots, we will not have set memb->slot + in wait_status_all(), so memb->slot will remain -1, and we will not + assign slots or set ls_num_slots here */ + +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out) +{ + struct dlm_member *memb; + struct dlm_slot *array; + int our_nodeid = dlm_our_nodeid(); + int array_size, max_slots, i; + int need = 0; + int max = 0; + int num = 0; + uint32_t gen = 0; + + /* our own memb struct will have slot -1 gen 0 */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == our_nodeid) { + memb->slot = ls->ls_slot; + memb->generation = ls->ls_generation; + break; + } + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->generation > gen) + gen = memb->generation; + + /* node doesn't support slots */ + + if (memb->slot == -1) + return -1; + + /* node needs a slot assigned */ + + if (!memb->slot) + need++; + + /* node has a slot assigned */ + + num++; + + if (!max || max < memb->slot) + max = memb->slot; + + /* sanity check, once slot is assigned it shouldn't change */ + + if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) { + log_error(ls, "nodeid %d slot changed %d %d", + memb->nodeid, memb->slot_prev, memb->slot); + return -1; + } + memb->slot_prev = memb->slot; + } + + array_size = max + need; + + array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS); + if (!array) + return -ENOMEM; + + num = 0; + + /* fill in slots (offsets) that are used */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!memb->slot) + continue; + + if (memb->slot > array_size) { + log_error(ls, "invalid slot number %d", memb->slot); + kfree(array); + return -1; + } + + array[memb->slot - 1].nodeid = memb->nodeid; + array[memb->slot - 1].slot = memb->slot; + num++; + } + + /* assign new slots from unused offsets */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->slot) + continue; + + for (i = 0; i < array_size; i++) { + if (array[i].nodeid) + continue; + + memb->slot = i + 1; + memb->slot_prev = memb->slot; + array[i].nodeid = memb->nodeid; + array[i].slot = memb->slot; + num++; + + if (!ls->ls_slot && memb->nodeid == our_nodeid) + ls->ls_slot = memb->slot; + break; + } + + if (!memb->slot) { + log_error(ls, "no free slot found"); + kfree(array); + return -1; + } + } + + gen++; + + log_slots(ls, gen, num, NULL, array, array_size); + + max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - + sizeof(struct rcom_config)) / sizeof(struct rcom_slot); + + if (num > max_slots) { + log_error(ls, "num_slots %d exceeds max_slots %d", + num, max_slots); + kfree(array); + return -1; + } + + *gen_out = gen; + *slots_out = array; + *slots_size = array_size; + *num_slots = num; + return 0; +} + +static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) +{ + struct dlm_member *memb = NULL; + struct list_head *tmp; + struct list_head *newlist = &new->list; + struct list_head *head = &ls->ls_nodes; + + list_for_each(tmp, head) { + memb = list_entry(tmp, struct dlm_member, list); + if (new->nodeid < memb->nodeid) + break; + } + + if (!memb) + list_add_tail(newlist, head); + else { + /* FIXME: can use list macro here */ + newlist->prev = tmp->prev; + newlist->next = tmp; + tmp->prev->next = newlist; + tmp->prev = newlist; + } +} + +static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) +{ + struct dlm_member *memb; + int error; + + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + if (!memb) + return -ENOMEM; + + error = dlm_lowcomms_connect_node(node->nodeid); + if (error < 0) { + kfree(memb); + return error; + } + + memb->nodeid = node->nodeid; + memb->weight = node->weight; + memb->comm_seq = node->comm_seq; + add_ordered_member(ls, memb); + ls->ls_num_nodes++; + return 0; +} + +static struct dlm_member *find_memb(struct list_head *head, int nodeid) +{ + struct dlm_member *memb; + + list_for_each_entry(memb, head, list) { + if (memb->nodeid == nodeid) + return memb; + } + return NULL; +} + +int dlm_is_member(struct dlm_ls *ls, int nodeid) +{ + if (find_memb(&ls->ls_nodes, nodeid)) + return 1; + return 0; +} + +int dlm_is_removed(struct dlm_ls *ls, int nodeid) +{ + if (find_memb(&ls->ls_nodes_gone, nodeid)) + return 1; + return 0; +} + +static void clear_memb_list(struct list_head *head) +{ + struct dlm_member *memb; + + while (!list_empty(head)) { + memb = list_entry(head->next, struct dlm_member, list); + list_del(&memb->list); + kfree(memb); + } +} + +void dlm_clear_members(struct dlm_ls *ls) +{ + clear_memb_list(&ls->ls_nodes); + ls->ls_num_nodes = 0; +} + +void dlm_clear_members_gone(struct dlm_ls *ls) +{ + clear_memb_list(&ls->ls_nodes_gone); +} + +static void make_member_array(struct dlm_ls *ls) +{ + struct dlm_member *memb; + int i, w, x = 0, total = 0, all_zero = 0, *array; + + kfree(ls->ls_node_array); + ls->ls_node_array = NULL; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->weight) + total += memb->weight; + } + + /* all nodes revert to weight of 1 if all have weight 0 */ + + if (!total) { + total = ls->ls_num_nodes; + all_zero = 1; + } + + ls->ls_total_weight = total; + + array = kmalloc(sizeof(int) * total, GFP_NOFS); + if (!array) + return; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!all_zero && !memb->weight) + continue; + + if (all_zero) + w = 1; + else + w = memb->weight; + + DLM_ASSERT(x < total, printk("total %d x %d\n", total, x);); + + for (i = 0; i < w; i++) + array[x++] = memb->nodeid; + } + + ls->ls_node_array = array; +} + +/* send a status request to all members just to establish comms connections */ + +static int ping_members(struct dlm_ls *ls) +{ + struct dlm_member *memb; + int error = 0; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + error = dlm_recovery_stopped(ls); + if (error) + break; + error = dlm_rcom_status(ls, memb->nodeid, 0); + if (error) + break; + } + if (error) + log_rinfo(ls, "ping_members aborted %d last nodeid %d", + error, ls->ls_recover_nodeid); + return error; +} + +static void dlm_lsop_recover_prep(struct dlm_ls *ls) +{ + if (!ls->ls_ops || !ls->ls_ops->recover_prep) + return; + ls->ls_ops->recover_prep(ls->ls_ops_arg); +} + +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +{ + struct dlm_slot slot; + uint32_t seq; + int error; + + if (!ls->ls_ops || !ls->ls_ops->recover_slot) + return; + + /* if there is no comms connection with this node + or the present comms connection is newer + than the one when this member was added, then + we consider the node to have failed (versus + being removed due to dlm_release_lockspace) */ + + error = dlm_comm_seq(memb->nodeid, &seq); + + if (!error && seq == memb->comm_seq) + return; + + slot.nodeid = memb->nodeid; + slot.slot = memb->slot; + + ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot); +} + +void dlm_lsop_recover_done(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_slot *slots; + int i, num; + + if (!ls->ls_ops || !ls->ls_ops->recover_done) + return; + + num = ls->ls_num_nodes; + + slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL); + if (!slots) + return; + + i = 0; + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (i == num) { + log_error(ls, "dlm_lsop_recover_done bad num %d", num); + goto out; + } + slots[i].nodeid = memb->nodeid; + slots[i].slot = memb->slot; + i++; + } + + ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num, + ls->ls_slot, ls->ls_generation); + out: + kfree(slots); +} + +static struct dlm_config_node *find_config_node(struct dlm_recover *rv, + int nodeid) +{ + int i; + + for (i = 0; i < rv->nodes_count; i++) { + if (rv->nodes[i].nodeid == nodeid) + return &rv->nodes[i]; + } + return NULL; +} + +int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) +{ + struct dlm_member *memb, *safe; + struct dlm_config_node *node; + int i, error, neg = 0, low = -1; + + /* previously removed members that we've not finished removing need to + count as a negative change so the "neg" recovery steps will happen */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + log_rinfo(ls, "prev removed member %d", memb->nodeid); + neg++; + } + + /* move departed members from ls_nodes to ls_nodes_gone */ + + list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { + node = find_config_node(rv, memb->nodeid); + if (node && !node->new) + continue; + + if (!node) { + log_rinfo(ls, "remove member %d", memb->nodeid); + } else { + /* removed and re-added */ + log_rinfo(ls, "remove member %d comm_seq %u %u", + memb->nodeid, memb->comm_seq, node->comm_seq); + } + + neg++; + list_move(&memb->list, &ls->ls_nodes_gone); + ls->ls_num_nodes--; + dlm_lsop_recover_slot(ls, memb); + } + + /* add new members to ls_nodes */ + + for (i = 0; i < rv->nodes_count; i++) { + node = &rv->nodes[i]; + if (dlm_is_member(ls, node->nodeid)) + continue; + dlm_add_member(ls, node); + log_rinfo(ls, "add member %d", node->nodeid); + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (low == -1 || memb->nodeid < low) + low = memb->nodeid; + } + ls->ls_low_nodeid = low; + + make_member_array(ls); + *neg_out = neg; + + error = ping_members(ls); + if (!error || error == -EPROTO) { + /* new_lockspace() may be waiting to know if the config + is good or bad */ + ls->ls_members_result = error; + complete(&ls->ls_members_done); + } + + log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); + return error; +} + +/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before + dlm_ls_start() is called on any of them to start the new recovery. */ + +int dlm_ls_stop(struct dlm_ls *ls) +{ + int new; + + /* + * Prevent dlm_recv from being in the middle of something when we do + * the stop. This includes ensuring dlm_recv isn't processing a + * recovery message (rcom), while dlm_recoverd is aborting and + * resetting things from an in-progress recovery. i.e. we want + * dlm_recoverd to abort its recovery without worrying about dlm_recv + * processing an rcom at the same time. Stopping dlm_recv also makes + * it easy for dlm_receive_message() to check locking stopped and add a + * message to the requestqueue without races. + */ + + down_write(&ls->ls_recv_active); + + /* + * Abort any recovery that's in progress (see RECOVER_STOP, + * dlm_recovery_stopped()) and tell any other threads running in the + * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). + */ + + spin_lock(&ls->ls_recover_lock); + set_bit(LSFL_RECOVER_STOP, &ls->ls_flags); + new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); + ls->ls_recover_seq++; + spin_unlock(&ls->ls_recover_lock); + + /* + * Let dlm_recv run again, now any normal messages will be saved on the + * requestqueue for later. + */ + + up_write(&ls->ls_recv_active); + + /* + * This in_recovery lock does two things: + * 1) Keeps this function from returning until all threads are out + * of locking routines and locking is truly stopped. + * 2) Keeps any new requests from being processed until it's unlocked + * when recovery is complete. + */ + + if (new) { + set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags); + wake_up_process(ls->ls_recoverd_task); + wait_event(ls->ls_recover_lock_wait, + test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); + } + + /* + * The recoverd suspend/resume makes sure that dlm_recoverd (if + * running) has noticed RECOVER_STOP above and quit processing the + * previous recovery. + */ + + dlm_recoverd_suspend(ls); + + spin_lock(&ls->ls_recover_lock); + kfree(ls->ls_slots); + ls->ls_slots = NULL; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; + ls->ls_recover_status = 0; + spin_unlock(&ls->ls_recover_lock); + + dlm_recoverd_resume(ls); + + if (!ls->ls_recover_begin) + ls->ls_recover_begin = jiffies; + + dlm_lsop_recover_prep(ls); + return 0; +} + +int dlm_ls_start(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL, *rv_old; + struct dlm_config_node *nodes; + int error, count; + + rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); + if (!rv) + return -ENOMEM; + + error = dlm_config_nodes(ls->ls_name, &nodes, &count); + if (error < 0) + goto fail; + + spin_lock(&ls->ls_recover_lock); + + /* the lockspace needs to be stopped before it can be started */ + + if (!dlm_locking_stopped(ls)) { + spin_unlock(&ls->ls_recover_lock); + log_error(ls, "start ignored: lockspace running"); + error = -EINVAL; + goto fail; + } + + rv->nodes = nodes; + rv->nodes_count = count; + rv->seq = ++ls->ls_recover_seq; + rv_old = ls->ls_recover_args; + ls->ls_recover_args = rv; + spin_unlock(&ls->ls_recover_lock); + + if (rv_old) { + log_error(ls, "unused recovery %llx %d", + (unsigned long long)rv_old->seq, rv_old->nodes_count); + kfree(rv_old->nodes); + kfree(rv_old); + } + + set_bit(LSFL_RECOVER_WORK, &ls->ls_flags); + wake_up_process(ls->ls_recoverd_task); + return 0; + + fail: + kfree(rv); + kfree(nodes); + return error; +} + diff --git a/kernel/fs/dlm/member.h b/kernel/fs/dlm/member.h new file mode 100644 index 000000000..3deb70661 --- /dev/null +++ b/kernel/fs/dlm/member.h @@ -0,0 +1,33 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMBER_DOT_H__ +#define __MEMBER_DOT_H__ + +int dlm_ls_stop(struct dlm_ls *ls); +int dlm_ls_start(struct dlm_ls *ls); +void dlm_clear_members(struct dlm_ls *ls); +void dlm_clear_members_gone(struct dlm_ls *ls); +int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); +int dlm_is_removed(struct dlm_ls *ls, int nodeid); +int dlm_is_member(struct dlm_ls *ls, int nodeid); +int dlm_slots_version(struct dlm_header *h); +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb); +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_slots_copy_in(struct dlm_ls *ls); +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out); +void dlm_lsop_recover_done(struct dlm_ls *ls); + +#endif /* __MEMBER_DOT_H__ */ + diff --git a/kernel/fs/dlm/memory.c b/kernel/fs/dlm/memory.c new file mode 100644 index 000000000..7cd24bccd --- /dev/null +++ b/kernel/fs/dlm/memory.c @@ -0,0 +1,96 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "config.h" +#include "memory.h" + +static struct kmem_cache *lkb_cache; +static struct kmem_cache *rsb_cache; + + +int __init dlm_memory_init(void) +{ + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), + __alignof__(struct dlm_lkb), 0, NULL); + if (!lkb_cache) + return -ENOMEM; + + rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), + __alignof__(struct dlm_rsb), 0, NULL); + if (!rsb_cache) { + kmem_cache_destroy(lkb_cache); + return -ENOMEM; + } + + return 0; +} + +void dlm_memory_exit(void) +{ + if (lkb_cache) + kmem_cache_destroy(lkb_cache); + if (rsb_cache) + kmem_cache_destroy(rsb_cache); +} + +char *dlm_allocate_lvb(struct dlm_ls *ls) +{ + char *p; + + p = kzalloc(ls->ls_lvblen, GFP_NOFS); + return p; +} + +void dlm_free_lvb(char *p) +{ + kfree(p); +} + +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + + r = kmem_cache_zalloc(rsb_cache, GFP_NOFS); + return r; +} + +void dlm_free_rsb(struct dlm_rsb *r) +{ + if (r->res_lvbptr) + dlm_free_lvb(r->res_lvbptr); + kmem_cache_free(rsb_cache, r); +} + +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + + lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS); + return lkb; +} + +void dlm_free_lkb(struct dlm_lkb *lkb) +{ + if (lkb->lkb_flags & DLM_IFL_USER) { + struct dlm_user_args *ua; + ua = lkb->lkb_ua; + if (ua) { + if (ua->lksb.sb_lvbptr) + kfree(ua->lksb.sb_lvbptr); + kfree(ua); + } + } + kmem_cache_free(lkb_cache, lkb); +} + diff --git a/kernel/fs/dlm/memory.h b/kernel/fs/dlm/memory.h new file mode 100644 index 000000000..177c11cbb --- /dev/null +++ b/kernel/fs/dlm/memory.h @@ -0,0 +1,27 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMORY_DOT_H__ +#define __MEMORY_DOT_H__ + +int dlm_memory_init(void); +void dlm_memory_exit(void); +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls); +void dlm_free_rsb(struct dlm_rsb *r); +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); +void dlm_free_lkb(struct dlm_lkb *l); +char *dlm_allocate_lvb(struct dlm_ls *ls); +void dlm_free_lvb(char *l); + +#endif /* __MEMORY_DOT_H__ */ + diff --git a/kernel/fs/dlm/midcomms.c b/kernel/fs/dlm/midcomms.c new file mode 100644 index 000000000..f3396c622 --- /dev/null +++ b/kernel/fs/dlm/midcomms.c @@ -0,0 +1,137 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * midcomms.c + * + * This is the appallingly named "mid-level" comms layer. + * + * Its purpose is to take packets from the "real" comms layer, + * split them up into packets and pass them to the interested + * part of the locking mechanism. + * + * It also takes messages from the locking layer, formats them + * into packets and sends them to the comms layer. + */ + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "config.h" +#include "lock.h" +#include "midcomms.h" + + +static void copy_from_cb(void *dst, const void *base, unsigned offset, + unsigned len, unsigned limit) +{ + unsigned copy = len; + + if ((copy + offset) > limit) + copy = limit - offset; + memcpy(dst, base + offset, copy); + len -= copy; + if (len) + memcpy(dst + copy, base, len); +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. + * + * Only complete messages are processed here, any "spare" bytes from + * the end of a buffer are saved and tacked onto the front of the next + * message that comes in. I doubt this will happen very often but we + * need to be able to cope with it and I don't want the task to be waiting + * for packets to come in when there is useful work to be done. + */ + +int dlm_process_incoming_buffer(int nodeid, const void *base, + unsigned offset, unsigned len, unsigned limit) +{ + union { + unsigned char __buf[DLM_INBUF_LEN]; + /* this is to force proper alignment on some arches */ + union dlm_packet p; + } __tmp; + union dlm_packet *p = &__tmp.p; + int ret = 0; + int err = 0; + uint16_t msglen; + uint32_t lockspace; + + while (len > sizeof(struct dlm_header)) { + + /* Copy just the header to check the total length. The + message may wrap around the end of the buffer back to the + start, so we need to use a temp buffer and copy_from_cb. */ + + copy_from_cb(p, base, offset, sizeof(struct dlm_header), + limit); + + msglen = le16_to_cpu(p->header.h_length); + lockspace = p->header.h_lockspace; + + err = -EINVAL; + if (msglen < sizeof(struct dlm_header)) + break; + if (p->header.h_cmd == DLM_MSG) { + if (msglen < sizeof(struct dlm_message)) + break; + } else { + if (msglen < sizeof(struct dlm_rcom)) + break; + } + err = -E2BIG; + if (msglen > dlm_config.ci_buffer_size) { + log_print("message size %d from %d too big, buf len %d", + msglen, nodeid, len); + break; + } + err = 0; + + /* If only part of the full message is contained in this + buffer, then do nothing and wait for lowcomms to call + us again later with more data. We return 0 meaning + we've consumed none of the input buffer. */ + + if (msglen > len) + break; + + /* Allocate a larger temp buffer if the full message won't fit + in the buffer on the stack (which should work for most + ordinary messages). */ + + if (msglen > sizeof(__tmp) && p == &__tmp.p) { + p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); + if (p == NULL) + return ret; + } + + copy_from_cb(p, base, offset, msglen, limit); + + BUG_ON(lockspace != p->header.h_lockspace); + + ret += msglen; + offset += msglen; + offset &= (limit - 1); + len -= msglen; + + dlm_receive_buffer(p, nodeid); + } + + if (p != &__tmp.p) + kfree(p); + + return err ? err : ret; +} + diff --git a/kernel/fs/dlm/midcomms.h b/kernel/fs/dlm/midcomms.h new file mode 100644 index 000000000..95852a5f1 --- /dev/null +++ b/kernel/fs/dlm/midcomms.h @@ -0,0 +1,21 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MIDCOMMS_DOT_H__ +#define __MIDCOMMS_DOT_H__ + +int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, + unsigned len, unsigned limit); + +#endif /* __MIDCOMMS_DOT_H__ */ + diff --git a/kernel/fs/dlm/netlink.c b/kernel/fs/dlm/netlink.c new file mode 100644 index 000000000..1e6e22713 --- /dev/null +++ b/kernel/fs/dlm/netlink.c @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include + +#include "dlm_internal.h" + +static uint32_t dlm_nl_seqnum; +static uint32_t listener_nlportid; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, +}; + +static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) +{ + struct sk_buff *skb; + void *data; + + skb = genlmsg_new(size, GFP_NOFS); + if (!skb) + return -ENOMEM; + + /* add the message headers */ + data = genlmsg_put(skb, 0, dlm_nl_seqnum++, &family, 0, cmd); + if (!data) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + return 0; +} + +static struct dlm_lock_data *mk_data(struct sk_buff *skb) +{ + struct nlattr *ret; + + ret = nla_reserve(skb, DLM_TYPE_LOCK, sizeof(struct dlm_lock_data)); + if (!ret) + return NULL; + return nla_data(ret); +} + +static int send_data(struct sk_buff *skb) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *data = genlmsg_data(genlhdr); + + genlmsg_end(skb, data); + + return genlmsg_unicast(&init_net, skb, listener_nlportid); +} + +static int user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + listener_nlportid = info->snd_portid; + printk("user_cmd nlpid %u\n", listener_nlportid); + return 0; +} + +static struct genl_ops dlm_nl_ops[] = { + { + .cmd = DLM_CMD_HELLO, + .doit = user_cmd, + }, +}; + +int __init dlm_netlink_init(void) +{ + return genl_register_family_with_ops(&family, dlm_nl_ops); +} + +void dlm_netlink_exit(void) +{ + genl_unregister_family(&family); +} + +static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) +{ + struct dlm_rsb *r = lkb->lkb_resource; + + memset(data, 0, sizeof(struct dlm_lock_data)); + + data->version = DLM_LOCK_DATA_VERSION; + data->nodeid = lkb->lkb_nodeid; + data->ownpid = lkb->lkb_ownpid; + data->id = lkb->lkb_id; + data->remid = lkb->lkb_remid; + data->status = lkb->lkb_status; + data->grmode = lkb->lkb_grmode; + data->rqmode = lkb->lkb_rqmode; + if (lkb->lkb_ua) + data->xid = lkb->lkb_ua->xid; + if (r) { + data->lockspace_id = r->res_ls->ls_global_id; + data->resource_namelen = r->res_length; + memcpy(data->resource_name, r->res_name, r->res_length); + } +} + +void dlm_timeout_warn(struct dlm_lkb *lkb) +{ + struct sk_buff *uninitialized_var(send_skb); + struct dlm_lock_data *data; + size_t size; + int rv; + + size = nla_total_size(sizeof(struct dlm_lock_data)) + + nla_total_size(0); /* why this? */ + + rv = prepare_data(DLM_CMD_TIMEOUT, &send_skb, size); + if (rv < 0) + return; + + data = mk_data(send_skb); + if (!data) { + nlmsg_free(send_skb); + return; + } + + fill_data(data, lkb); + + send_data(send_skb); +} + diff --git a/kernel/fs/dlm/plock.c b/kernel/fs/dlm/plock.c new file mode 100644 index 000000000..e0ab3a93e --- /dev/null +++ b/kernel/fs/dlm/plock.c @@ -0,0 +1,515 @@ +/* + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lockspace.h" + +static spinlock_t ops_lock; +static struct list_head send_list; +static struct list_head recv_list; +static wait_queue_head_t send_wq; +static wait_queue_head_t recv_wq; + +struct plock_op { + struct list_head list; + int done; + struct dlm_plock_info info; +}; + +struct plock_xop { + struct plock_op xop; + int (*callback)(struct file_lock *fl, int result); + void *fl; + void *file; + struct file_lock flc; +}; + + +static inline void set_version(struct dlm_plock_info *info) +{ + info->version[0] = DLM_PLOCK_VERSION_MAJOR; + info->version[1] = DLM_PLOCK_VERSION_MINOR; + info->version[2] = DLM_PLOCK_VERSION_PATCH; +} + +static int check_version(struct dlm_plock_info *info) +{ + if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || + (DLM_PLOCK_VERSION_MINOR < info->version[1])) { + log_print("plock device version mismatch: " + "kernel (%u.%u.%u), user (%u.%u.%u)", + DLM_PLOCK_VERSION_MAJOR, + DLM_PLOCK_VERSION_MINOR, + DLM_PLOCK_VERSION_PATCH, + info->version[0], + info->version[1], + info->version[2]); + return -EINVAL; + } + return 0; +} + +static void send_op(struct plock_op *op) +{ + set_version(&op->info); + INIT_LIST_HEAD(&op->list); + spin_lock(&ops_lock); + list_add_tail(&op->list, &send_list); + spin_unlock(&ops_lock); + wake_up(&send_wq); +} + +/* If a process was killed while waiting for the only plock on a file, + locks_remove_posix will not see any lock on the file so it won't + send an unlock-close to us to pass on to userspace to clean up the + abandoned waiter. So, we have to insert the unlock-close when the + lock call is interrupted. */ + +static void do_unlock_close(struct dlm_ls *ls, u64 number, + struct file *file, struct file_lock *fl) +{ + struct plock_op *op; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) + return; + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = 0; + op->info.end = OFFSET_MAX; + if (fl->fl_lmops && fl->fl_lmops->lm_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); +} + +int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + int cmd, struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + struct plock_xop *xop; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + xop = kzalloc(sizeof(*xop), GFP_NOFS); + if (!xop) { + rv = -ENOMEM; + goto out; + } + + op = &xop->xop; + op->info.optype = DLM_PLOCK_OP_LOCK; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.wait = IS_SETLKW(cmd); + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { + /* fl_owner is lockd which doesn't distinguish + processes on the nfs client */ + op->info.owner = (__u64) fl->fl_pid; + xop->callback = fl->fl_lmops->lm_grant; + locks_init_lock(&xop->flc); + locks_copy_lock(&xop->flc, fl); + xop->fl = fl; + xop->file = file; + } else { + op->info.owner = (__u64)(long) fl->fl_owner; + xop->callback = NULL; + } + + send_op(op); + + if (xop->callback == NULL) { + rv = wait_event_killable(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + log_debug(ls, "dlm_posix_lock: wait killed %llx", + (unsigned long long)number); + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + kfree(xop); + do_unlock_close(ls, number, file, fl); + goto out; + } + } else { + rv = FILE_LOCK_DEFERRED; + goto out; + } + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_lock: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (!rv) { + if (posix_lock_file_wait(file, fl) < 0) + log_error(ls, "dlm_posix_lock: vfs lock error %llx", + (unsigned long long)number); + } + + kfree(xop); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_lock); + +/* Returns failure iff a successful lock operation should be canceled */ +static int dlm_plock_callback(struct plock_op *op) +{ + struct file *file; + struct file_lock *fl; + struct file_lock *flc; + int (*notify)(struct file_lock *fl, int result) = NULL; + struct plock_xop *xop = (struct plock_xop *)op; + int rv = 0; + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_print("dlm_plock_callback: op on list %llx", + (unsigned long long)op->info.number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + /* check if the following 2 are still valid or make a copy */ + file = xop->file; + flc = &xop->flc; + fl = xop->fl; + notify = xop->callback; + + if (op->info.rv) { + notify(fl, op->info.rv); + goto out; + } + + /* got fs lock; bookkeep locally as well: */ + flc->fl_flags &= ~FL_SLEEP; + if (posix_lock_file(file, flc, NULL)) { + /* + * This can only happen in the case of kmalloc() failure. + * The filesystem's own lock is the authoritative lock, + * so a failure to get the lock locally is not a disaster. + * As long as the fs cannot reliably cancel locks (especially + * in a low-memory situation), we're better off ignoring + * this failure than trying to recover. + */ + log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p", + (unsigned long long)op->info.number, file, fl); + } + + rv = notify(fl, 0); + if (rv) { + /* XXX: We need to cancel the fs lock here: */ + log_print("dlm_plock_callback: lock granted after lock request " + "failed; dangling lock!\n"); + goto out; + } + +out: + kfree(xop); + return rv; +} + +int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + int rv; + unsigned char fl_flags = fl->fl_flags; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + /* cause the vfs unlock to return ENOENT if lock is not found */ + fl->fl_flags |= FL_EXISTS; + + rv = posix_lock_file_wait(file, fl); + if (rv == -ENOENT) { + rv = 0; + goto out_free; + } + if (rv < 0) { + log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx", + rv, (unsigned long long)number); + } + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->lm_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + if (fl->fl_flags & FL_CLOSE) { + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); + rv = 0; + goto out; + } + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_unlock: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (rv == -ENOENT) + rv = 0; + +out_free: + kfree(op); +out: + dlm_put_lockspace(ls); + fl->fl_flags = fl_flags; + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_unlock); + +int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + op->info.optype = DLM_PLOCK_OP_GET; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->lm_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_get: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + /* info.rv from userspace is 1 for conflict, 0 for no-conflict, + -ENOENT if there are no locks on the file */ + + rv = op->info.rv; + + fl->fl_type = F_UNLCK; + if (rv == -ENOENT) + rv = 0; + else if (rv > 0) { + locks_init_lock(fl); + fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; + fl->fl_flags = FL_POSIX; + fl->fl_pid = op->info.pid; + fl->fl_start = op->info.start; + fl->fl_end = op->info.end; + rv = 0; + } + + kfree(op); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_get); + +/* a read copies out one plock request from the send list */ +static ssize_t dev_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) +{ + struct dlm_plock_info info; + struct plock_op *op = NULL; + + if (count < sizeof(info)) + return -EINVAL; + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) { + op = list_entry(send_list.next, struct plock_op, list); + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + list_del(&op->list); + else + list_move(&op->list, &recv_list); + memcpy(&info, &op->info, sizeof(info)); + } + spin_unlock(&ops_lock); + + if (!op) + return -EAGAIN; + + /* there is no need to get a reply from userspace for unlocks + that were generated by the vfs cleaning up for a close + (the process did not make an unlock call). */ + + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + kfree(op); + + if (copy_to_user(u, &info, sizeof(info))) + return -EFAULT; + return sizeof(info); +} + +/* a write copies in one plock result that should match a plock_op + on the recv list */ +static ssize_t dev_write(struct file *file, const char __user *u, size_t count, + loff_t *ppos) +{ + struct dlm_plock_info info; + struct plock_op *op; + int found = 0, do_callback = 0; + + if (count != sizeof(info)) + return -EINVAL; + + if (copy_from_user(&info, u, sizeof(info))) + return -EFAULT; + + if (check_version(&info)) + return -EINVAL; + + spin_lock(&ops_lock); + list_for_each_entry(op, &recv_list, list) { + if (op->info.fsid == info.fsid && + op->info.number == info.number && + op->info.owner == info.owner) { + struct plock_xop *xop = (struct plock_xop *)op; + list_del_init(&op->list); + memcpy(&op->info, &info, sizeof(info)); + if (xop->callback) + do_callback = 1; + else + op->done = 1; + found = 1; + break; + } + } + spin_unlock(&ops_lock); + + if (found) { + if (do_callback) + dlm_plock_callback(op); + else + wake_up(&recv_wq); + } else + log_print("dev_write no op %x %llx", info.fsid, + (unsigned long long)info.number); + return count; +} + +static unsigned int dev_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = 0; + + poll_wait(file, &send_wq, wait); + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) + mask = POLLIN | POLLRDNORM; + spin_unlock(&ops_lock); + + return mask; +} + +static const struct file_operations dev_fops = { + .read = dev_read, + .write = dev_write, + .poll = dev_poll, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice plock_dev_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DLM_PLOCK_MISC_NAME, + .fops = &dev_fops +}; + +int dlm_plock_init(void) +{ + int rv; + + spin_lock_init(&ops_lock); + INIT_LIST_HEAD(&send_list); + INIT_LIST_HEAD(&recv_list); + init_waitqueue_head(&send_wq); + init_waitqueue_head(&recv_wq); + + rv = misc_register(&plock_dev_misc); + if (rv) + log_print("dlm_plock_init: misc_register failed %d", rv); + return rv; +} + +void dlm_plock_exit(void) +{ + if (misc_deregister(&plock_dev_misc) < 0) + log_print("dlm_plock_exit: misc_deregister failed"); +} + diff --git a/kernel/fs/dlm/rcom.c b/kernel/fs/dlm/rcom.c new file mode 100644 index 000000000..f3f5e72a2 --- /dev/null +++ b/kernel/fs/dlm/rcom.c @@ -0,0 +1,656 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "rcom.h" +#include "recover.h" +#include "dir.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "util.h" + +static int rcom_response(struct dlm_ls *ls) +{ + return test_bit(LSFL_RCOM_READY, &ls->ls_flags); +} + +static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + char *mb; + int mb_len = sizeof(struct dlm_rcom) + len; + + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) { + log_print("create_rcom to %d type %d len %d ENOBUFS", + to_nodeid, type, len); + return -ENOBUFS; + } + memset(mb, 0, mb_len); + + rc = (struct dlm_rcom *) mb; + + rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.h_lockspace = ls->ls_global_id; + rc->rc_header.h_nodeid = dlm_our_nodeid(); + rc->rc_header.h_length = mb_len; + rc->rc_header.h_cmd = DLM_RCOM; + + rc->rc_type = type; + + spin_lock(&ls->ls_recover_lock); + rc->rc_seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + + *mh_ret = mh; + *rc_ret = rc; + return 0; +} + +static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, + struct dlm_rcom *rc) +{ + dlm_rcom_out(rc); + dlm_lowcomms_commit_buffer(mh); +} + +static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs, + uint32_t flags) +{ + rs->rs_flags = cpu_to_le32(flags); +} + +/* When replying to a status request, a node also sends back its + configuration values. The requesting node then checks that the remote + node is configured the same way as itself. */ + +static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf, + uint32_t num_slots) +{ + rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); + rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); + + rf->rf_our_slot = cpu_to_le16(ls->ls_slot); + rf->rf_num_slots = cpu_to_le16(num_slots); + rf->rf_generation = cpu_to_le32(ls->ls_generation); +} + +static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +{ + struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; + + if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { + log_error(ls, "version mismatch: %x nodeid %d: %x", + DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, + rc->rc_header.h_version); + return -EPROTO; + } + + if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || + le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { + log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", + ls->ls_lvblen, ls->ls_exflags, nodeid, + le32_to_cpu(rf->rf_lvblen), + le32_to_cpu(rf->rf_lsflags)); + return -EPROTO; + } + return 0; +} + +static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq) +{ + spin_lock(&ls->ls_rcom_spin); + *new_seq = ++ls->ls_rcom_seq; + set_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +static void disallow_sync_reply(struct dlm_ls *ls) +{ + spin_lock(&ls->ls_rcom_spin); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +/* + * low nodeid gathers one slot value at a time from each node. + * it sets need_slots=0, and saves rf_our_slot returned from each + * rcom_config. + * + * other nodes gather all slot values at once from the low nodeid. + * they set need_slots=1, and ignore the rf_our_slot returned from each + * rcom_config. they use the rf_num_slots returned from the low + * node's rcom_config. + */ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error = 0; + + ls->ls_recover_nodeid = nodeid; + + if (nodeid == dlm_our_nodeid()) { + rc = ls->ls_recover_buf; + rc->rc_result = dlm_recover_status(ls); + goto out; + } + + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, + sizeof(struct rcom_status), &rc, &mh); + if (error) + goto out; + + set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); + + send_rcom(ls, mh, rc); + + error = dlm_wait_function(ls, &rcom_response); + disallow_sync_reply(ls); + if (error) + goto out; + + rc = ls->ls_recover_buf; + + if (rc->rc_result == -ESRCH) { + /* we pretend the remote lockspace exists with 0 status */ + log_debug(ls, "remote node %d not ready", nodeid); + rc->rc_result = 0; + error = 0; + } else { + error = check_rcom_config(ls, rc, nodeid); + } + + /* the caller looks at rc_result for the remote recovery status */ + out: + return error; +} + +static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct rcom_status *rs; + uint32_t status; + int nodeid = rc_in->rc_header.h_nodeid; + int len = sizeof(struct rcom_config); + int num_slots = 0; + int error; + + if (!dlm_slots_version(&rc_in->rc_header)) { + status = dlm_recover_status(ls); + goto do_create; + } + + rs = (struct rcom_status *)rc_in->rc_buf; + + if (!(le32_to_cpu(rs->rs_flags) & DLM_RSF_NEED_SLOTS)) { + status = dlm_recover_status(ls); + goto do_create; + } + + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + num_slots = ls->ls_num_slots; + spin_unlock(&ls->ls_recover_lock); + len += num_slots * sizeof(struct rcom_slot); + + do_create: + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, + len, &rc, &mh); + if (error) + return; + + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + rc->rc_result = status; + + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots); + + if (!num_slots) + goto do_send; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_num_slots != num_slots) { + spin_unlock(&ls->ls_recover_lock); + log_debug(ls, "receive_rcom_status num_slots %d to %d", + num_slots, ls->ls_num_slots); + rc->rc_result = 0; + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0); + goto do_send; + } + + dlm_slots_copy_out(ls, rc); + spin_unlock(&ls->ls_recover_lock); + + do_send: + send_rcom(ls, mh, rc); +} + +static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + spin_lock(&ls->ls_rcom_spin); + if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || + rc_in->rc_id != ls->ls_rcom_seq) { + log_debug(ls, "reject reply %d from %d seq %llx expect %llx", + rc_in->rc_type, rc_in->rc_header.h_nodeid, + (unsigned long long)rc_in->rc_id, + (unsigned long long)ls->ls_rcom_seq); + goto out; + } + memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length); + set_bit(LSFL_RCOM_READY, &ls->ls_flags); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + wake_up(&ls->ls_wait_general); + out: + spin_unlock(&ls->ls_rcom_spin); +} + +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error = 0; + + ls->ls_recover_nodeid = nodeid; + + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); + if (error) + goto out; + memcpy(rc->rc_buf, last_name, last_len); + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); + + send_rcom(ls, mh, rc); + + error = dlm_wait_function(ls, &rcom_response); + disallow_sync_reply(ls); + out: + return error; +} + +static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, inlen, outlen, nodeid; + + nodeid = rc_in->rc_header.h_nodeid; + inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); + outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); + + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); + if (error) + return; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, + nodeid); + send_rcom(ls, mh, rc); +} + +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct dlm_ls *ls = r->res_ls; + int error; + + error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length, + &rc, &mh); + if (error) + goto out; + memcpy(rc->rc_buf, r->res_name, r->res_length); + rc->rc_id = (unsigned long) r->res_id; + + send_rcom(ls, mh, rc); + out: + return error; +} + +int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct dlm_ls *ls = r->res_ls; + int error; + + error = create_rcom(ls, to_nodeid, DLM_RCOM_LOOKUP, r->res_length, + &rc, &mh); + if (error) + goto out; + memcpy(rc->rc_buf, r->res_name, r->res_length); + rc->rc_id = 0xFFFFFFFF; + + send_rcom(ls, mh, rc); + out: + return error; +} + +static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid; + int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); + + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + if (error) + return; + + if (rc_in->rc_id == 0xFFFFFFFF) { + log_error(ls, "receive_rcom_lookup dump from %d", nodeid); + dlm_dump_rsb_name(ls, rc_in->rc_buf, len); + return; + } + + error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len, + DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL); + if (error) + ret_nodeid = error; + rc->rc_result = ret_nodeid; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + send_rcom(ls, mh, rc); +} + +static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + dlm_recover_master_reply(ls, rc_in); +} + +static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct rcom_lock *rl) +{ + memset(rl, 0, sizeof(*rl)); + + rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid); + rl->rl_lkid = cpu_to_le32(lkb->lkb_id); + rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags); + rl->rl_flags = cpu_to_le32(lkb->lkb_flags); + rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq); + rl->rl_rqmode = lkb->lkb_rqmode; + rl->rl_grmode = lkb->lkb_grmode; + rl->rl_status = lkb->lkb_status; + rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); + + if (lkb->lkb_bastfn) + rl->rl_asts |= DLM_CB_BAST; + if (lkb->lkb_astfn) + rl->rl_asts |= DLM_CB_CAST; + + rl->rl_namelen = cpu_to_le16(r->res_length); + memcpy(rl->rl_name, r->res_name, r->res_length); + + /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ? + If so, receive_rcom_lock_args() won't take this copy. */ + + if (lkb->lkb_lvbptr) + memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); +} + +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = r->res_ls; + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct rcom_lock *rl; + int error, len = sizeof(struct rcom_lock); + + if (lkb->lkb_lvbptr) + len += ls->ls_lvblen; + + error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh); + if (error) + goto out; + + rl = (struct rcom_lock *) rc->rc_buf; + pack_rcom_lock(r, lkb, rl); + rc->rc_id = (unsigned long) r; + + send_rcom(ls, mh, rc); + out: + return error; +} + +/* needs at least dlm_rcom + rcom_lock */ +static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, nodeid = rc_in->rc_header.h_nodeid; + + dlm_recover_master_copy(ls, rc_in); + + error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY, + sizeof(struct rcom_lock), &rc, &mh); + if (error) + return; + + /* We send back the same rcom_lock struct we received, but + dlm_recover_master_copy() has filled in rl_remid and rl_result */ + + memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + send_rcom(ls, mh, rc); +} + +/* If the lockspace doesn't exist then still send a status message + back; it's possible that it just doesn't have its global_id yet. */ + +int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct rcom_config *rf; + struct dlm_mhandle *mh; + char *mb; + int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); + + mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) + return -ENOBUFS; + memset(mb, 0, mb_len); + + rc = (struct dlm_rcom *) mb; + + rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace; + rc->rc_header.h_nodeid = dlm_our_nodeid(); + rc->rc_header.h_length = mb_len; + rc->rc_header.h_cmd = DLM_RCOM; + + rc->rc_type = DLM_RCOM_STATUS_REPLY; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + rc->rc_result = -ESRCH; + + rf = (struct rcom_config *) rc->rc_buf; + rf->rf_lvblen = cpu_to_le32(~0U); + + dlm_rcom_out(rc); + dlm_lowcomms_commit_buffer(mh); + + return 0; +} + +/* + * Ignore messages for stage Y before we set + * recover_status bit for stage X: + * + * recover_status = 0 + * + * dlm_recover_members() + * - send nothing + * - recv nothing + * - ignore NAMES, NAMES_REPLY + * - ignore LOOKUP, LOOKUP_REPLY + * - ignore LOCK, LOCK_REPLY + * + * recover_status |= NODES + * + * dlm_recover_members_wait() + * + * dlm_recover_directory() + * - send NAMES + * - recv NAMES_REPLY + * - ignore LOOKUP, LOOKUP_REPLY + * - ignore LOCK, LOCK_REPLY + * + * recover_status |= DIR + * + * dlm_recover_directory_wait() + * + * dlm_recover_masters() + * - send LOOKUP + * - recv LOOKUP_REPLY + * + * dlm_recover_locks() + * - send LOCKS + * - recv LOCKS_REPLY + * + * recover_status |= LOCKS + * + * dlm_recover_locks_wait() + * + * recover_status |= DONE + */ + +/* Called by dlm_recv; corresponds to dlm_receive_message() but special + recovery-only comms are sent through here. */ + +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +{ + int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + int stop, reply = 0, names = 0, lookup = 0, lock = 0; + uint32_t status; + uint64_t seq; + + switch (rc->rc_type) { + case DLM_RCOM_STATUS_REPLY: + reply = 1; + break; + case DLM_RCOM_NAMES: + names = 1; + break; + case DLM_RCOM_NAMES_REPLY: + names = 1; + reply = 1; + break; + case DLM_RCOM_LOOKUP: + lookup = 1; + break; + case DLM_RCOM_LOOKUP_REPLY: + lookup = 1; + reply = 1; + break; + case DLM_RCOM_LOCK: + lock = 1; + break; + case DLM_RCOM_LOCK_REPLY: + lock = 1; + reply = 1; + break; + }; + + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); + seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + + if (stop && (rc->rc_type != DLM_RCOM_STATUS)) + goto ignore; + + if (reply && (rc->rc_seq_reply != seq)) + goto ignore; + + if (!(status & DLM_RS_NODES) && (names || lookup || lock)) + goto ignore; + + if (!(status & DLM_RS_DIR) && (lookup || lock)) + goto ignore; + + switch (rc->rc_type) { + case DLM_RCOM_STATUS: + receive_rcom_status(ls, rc); + break; + + case DLM_RCOM_NAMES: + receive_rcom_names(ls, rc); + break; + + case DLM_RCOM_LOOKUP: + receive_rcom_lookup(ls, rc); + break; + + case DLM_RCOM_LOCK: + if (rc->rc_header.h_length < lock_size) + goto Eshort; + receive_rcom_lock(ls, rc); + break; + + case DLM_RCOM_STATUS_REPLY: + receive_sync_reply(ls, rc); + break; + + case DLM_RCOM_NAMES_REPLY: + receive_sync_reply(ls, rc); + break; + + case DLM_RCOM_LOOKUP_REPLY: + receive_rcom_lookup_reply(ls, rc); + break; + + case DLM_RCOM_LOCK_REPLY: + if (rc->rc_header.h_length < lock_size) + goto Eshort; + dlm_recover_process_copy(ls, rc); + break; + + default: + log_error(ls, "receive_rcom bad type %d", rc->rc_type); + } + return; + +ignore: + log_limit(ls, "dlm_receive_rcom ignore msg %d " + "from %d %llu %llu recover seq %llu sts %x gen %u", + rc->rc_type, + nodeid, + (unsigned long long)rc->rc_seq, + (unsigned long long)rc->rc_seq_reply, + (unsigned long long)seq, + status, ls->ls_generation); + return; +Eshort: + log_error(ls, "recovery message %d from %d is too short", + rc->rc_type, nodeid); +} + diff --git a/kernel/fs/dlm/rcom.h b/kernel/fs/dlm/rcom.h new file mode 100644 index 000000000..f8e243463 --- /dev/null +++ b/kernel/fs/dlm/rcom.h @@ -0,0 +1,26 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RCOM_DOT_H__ +#define __RCOM_DOT_H__ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); +int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid); +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); +int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); + +#endif + diff --git a/kernel/fs/dlm/recover.c b/kernel/fs/dlm/recover.c new file mode 100644 index 000000000..eaea789bf --- /dev/null +++ b/kernel/fs/dlm/recover.c @@ -0,0 +1,955 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "dir.h" +#include "config.h" +#include "ast.h" +#include "memory.h" +#include "rcom.h" +#include "lock.h" +#include "lowcomms.h" +#include "member.h" +#include "recover.h" + + +/* + * Recovery waiting routines: these functions wait for a particular reply from + * a remote node, or for the remote node to report a certain status. They need + * to abort if the lockspace is stopped indicating a node has failed (perhaps + * the one being waited for). + */ + +/* + * Wait until given function returns non-zero or lockspace is stopped + * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another + * function thinks it could have completed the waited-on task, they should wake + * up ls_wait_general to get an immediate response rather than waiting for the + * timeout. This uses a timeout so it can check periodically if the wait + * should abort due to node failure (which doesn't cause a wake_up). + * This should only be called by the dlm_recoverd thread. + */ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) +{ + int error = 0; + int rv; + + while (1) { + rv = wait_event_timeout(ls->ls_wait_general, + testfn(ls) || dlm_recovery_stopped(ls), + dlm_config.ci_recover_timer * HZ); + if (rv) + break; + } + + if (dlm_recovery_stopped(ls)) { + log_debug(ls, "dlm_wait_function aborted"); + error = -EINTR; + } + return error; +} + +/* + * An efficient way for all nodes to wait for all others to have a certain + * status. The node with the lowest nodeid polls all the others for their + * status (wait_status_all) and all the others poll the node with the low id + * for its accumulated result (wait_status_low). When all nodes have set + * status flag X, then status flag X_ALL will be set on the low nodeid. + */ + +uint32_t dlm_recover_status(struct dlm_ls *ls) +{ + uint32_t status; + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + spin_unlock(&ls->ls_recover_lock); + return status; +} + +static void _set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + ls->ls_recover_status |= status; +} + +void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + spin_lock(&ls->ls_recover_lock); + _set_recover_status(ls, status); + spin_unlock(&ls->ls_recover_lock); +} + +static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, + int save_slots) +{ + struct dlm_rcom *rc = ls->ls_recover_buf; + struct dlm_member *memb; + int error = 0, delay; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + delay = 0; + for (;;) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out; + } + + error = dlm_rcom_status(ls, memb->nodeid, 0); + if (error) + goto out; + + if (save_slots) + dlm_slot_save(ls, rc, memb); + + if (rc->rc_result & wait_status) + break; + if (delay < 1000) + delay += 20; + msleep(delay); + } + } + out: + return error; +} + +static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, + uint32_t status_flags) +{ + struct dlm_rcom *rc = ls->ls_recover_buf; + int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; + + for (;;) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out; + } + + error = dlm_rcom_status(ls, nodeid, status_flags); + if (error) + break; + + if (rc->rc_result & wait_status) + break; + if (delay < 1000) + delay += 20; + msleep(delay); + } + out: + return error; +} + +static int wait_status(struct dlm_ls *ls, uint32_t status) +{ + uint32_t status_all = status << 1; + int error; + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, status, 0); + if (!error) + dlm_set_recover_status(ls, status_all); + } else + error = wait_status_low(ls, status_all, 0); + + return error; +} + +int dlm_recover_members_wait(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_slot *slots; + int num_slots, slots_size; + int error, rv; + uint32_t gen; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + memb->slot = -1; + memb->generation = 0; + } + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, DLM_RS_NODES, 1); + if (error) + goto out; + + /* slots array is sparse, slots_size may be > num_slots */ + + rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen); + if (!rv) { + spin_lock(&ls->ls_recover_lock); + _set_recover_status(ls, DLM_RS_NODES_ALL); + ls->ls_num_slots = num_slots; + ls->ls_slots_size = slots_size; + ls->ls_slots = slots; + ls->ls_generation = gen; + spin_unlock(&ls->ls_recover_lock); + } else { + dlm_set_recover_status(ls, DLM_RS_NODES_ALL); + } + } else { + error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS); + if (error) + goto out; + + dlm_slots_copy_in(ls); + } + out: + return error; +} + +int dlm_recover_directory_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_DIR); +} + +int dlm_recover_locks_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_LOCKS); +} + +int dlm_recover_done_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_DONE); +} + +/* + * The recover_list contains all the rsb's for which we've requested the new + * master nodeid. As replies are returned from the resource directories the + * rsb's are removed from the list. When the list is empty we're done. + * + * The recover_list is later similarly used for all rsb's for which we've sent + * new lkb's and need to receive new corresponding lkid's. + * + * We use the address of the rsb struct as a simple local identifier for the + * rsb so we can match an rcom reply with the rsb it was sent for. + */ + +static int recover_list_empty(struct dlm_ls *ls) +{ + int empty; + + spin_lock(&ls->ls_recover_list_lock); + empty = list_empty(&ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); + + return empty; +} + +static void recover_list_add(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + if (list_empty(&r->res_recover_list)) { + list_add_tail(&r->res_recover_list, &ls->ls_recover_list); + ls->ls_recover_list_count++; + dlm_hold_rsb(r); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +static void recover_list_del(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + list_del_init(&r->res_recover_list); + ls->ls_recover_list_count--; + spin_unlock(&ls->ls_recover_list_lock); + + dlm_put_rsb(r); +} + +static void recover_list_clear(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *s; + + spin_lock(&ls->ls_recover_list_lock); + list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) { + list_del_init(&r->res_recover_list); + r->res_recover_locks_count = 0; + dlm_put_rsb(r); + ls->ls_recover_list_count--; + } + + if (ls->ls_recover_list_count != 0) { + log_error(ls, "warning: recover_list_count %d", + ls->ls_recover_list_count); + ls->ls_recover_list_count = 0; + } + spin_unlock(&ls->ls_recover_list_lock); +} + +static int recover_idr_empty(struct dlm_ls *ls) +{ + int empty = 1; + + spin_lock(&ls->ls_recover_idr_lock); + if (ls->ls_recover_list_count) + empty = 0; + spin_unlock(&ls->ls_recover_idr_lock); + + return empty; +} + +static int recover_idr_add(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + int rv; + + idr_preload(GFP_NOFS); + spin_lock(&ls->ls_recover_idr_lock); + if (r->res_id) { + rv = -1; + goto out_unlock; + } + rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT); + if (rv < 0) + goto out_unlock; + + r->res_id = rv; + ls->ls_recover_list_count++; + dlm_hold_rsb(r); + rv = 0; +out_unlock: + spin_unlock(&ls->ls_recover_idr_lock); + idr_preload_end(); + return rv; +} + +static void recover_idr_del(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_idr_lock); + idr_remove(&ls->ls_recover_idr, r->res_id); + r->res_id = 0; + ls->ls_recover_list_count--; + spin_unlock(&ls->ls_recover_idr_lock); + + dlm_put_rsb(r); +} + +static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id) +{ + struct dlm_rsb *r; + + spin_lock(&ls->ls_recover_idr_lock); + r = idr_find(&ls->ls_recover_idr, (int)id); + spin_unlock(&ls->ls_recover_idr_lock); + return r; +} + +static void recover_idr_clear(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int id; + + spin_lock(&ls->ls_recover_idr_lock); + + idr_for_each_entry(&ls->ls_recover_idr, r, id) { + idr_remove(&ls->ls_recover_idr, id); + r->res_id = 0; + r->res_recover_locks_count = 0; + ls->ls_recover_list_count--; + + dlm_put_rsb(r); + } + + if (ls->ls_recover_list_count != 0) { + log_error(ls, "warning: recover_list_count %d", + ls->ls_recover_list_count); + ls->ls_recover_list_count = 0; + } + spin_unlock(&ls->ls_recover_idr_lock); +} + + +/* Master recovery: find new master node for rsb's that were + mastered on nodes that have been removed. + + dlm_recover_masters + recover_master + dlm_send_rcom_lookup -> receive_rcom_lookup + dlm_dir_lookup + receive_rcom_lookup_reply <- + dlm_recover_master_reply + set_new_master + set_master_lkbs + set_lock_master +*/ + +/* + * Set the lock master for all LKBs in a lock queue + * If we are the new master of the rsb, we may have received new + * MSTCPY locks from other nodes already which we need to ignore + * when setting the new nodeid. + */ + +static void set_lock_master(struct list_head *queue, int nodeid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, queue, lkb_statequeue) { + if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) { + lkb->lkb_nodeid = nodeid; + lkb->lkb_remid = 0; + } + } +} + +static void set_master_lkbs(struct dlm_rsb *r) +{ + set_lock_master(&r->res_grantqueue, r->res_nodeid); + set_lock_master(&r->res_convertqueue, r->res_nodeid); + set_lock_master(&r->res_waitqueue, r->res_nodeid); +} + +/* + * Propagate the new master nodeid to locks + * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. + * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which + * rsb's to consider. + */ + +static void set_new_master(struct dlm_rsb *r) +{ + set_master_lkbs(r); + rsb_set_flag(r, RSB_NEW_MASTER); + rsb_set_flag(r, RSB_NEW_MASTER2); +} + +/* + * We do async lookups on rsb's that need new masters. The rsb's + * waiting for a lookup reply are kept on the recover_list. + * + * Another node recovering the master may have sent us a rcom lookup, + * and our dlm_master_lookup() set it as the new master, along with + * NEW_MASTER so that we'll recover it here (this implies dir_nodeid + * equals our_nodeid below). + */ + +static int recover_master(struct dlm_rsb *r, unsigned int *count) +{ + struct dlm_ls *ls = r->res_ls; + int our_nodeid, dir_nodeid; + int is_removed = 0; + int error; + + if (is_master(r)) + return 0; + + is_removed = dlm_is_removed(ls, r->res_nodeid); + + if (!is_removed && !rsb_flag(r, RSB_NEW_MASTER)) + return 0; + + our_nodeid = dlm_our_nodeid(); + dir_nodeid = dlm_dir_nodeid(r); + + if (dir_nodeid == our_nodeid) { + if (is_removed) { + r->res_master_nodeid = our_nodeid; + r->res_nodeid = 0; + } + + /* set master of lkbs to ourself when is_removed, or to + another new master which we set along with NEW_MASTER + in dlm_master_lookup */ + set_new_master(r); + error = 0; + } else { + recover_idr_add(r); + error = dlm_send_rcom_lookup(r, dir_nodeid); + } + + (*count)++; + return error; +} + +/* + * All MSTCPY locks are purged and rebuilt, even if the master stayed the same. + * This is necessary because recovery can be started, aborted and restarted, + * causing the master nodeid to briefly change during the aborted recovery, and + * change back to the original value in the second recovery. The MSTCPY locks + * may or may not have been purged during the aborted recovery. Another node + * with an outstanding request in waiters list and a request reply saved in the + * requestqueue, cannot know whether it should ignore the reply and resend the + * request, or accept the reply and complete the request. It must do the + * former if the remote node purged MSTCPY locks, and it must do the later if + * the remote node did not. This is solved by always purging MSTCPY locks, in + * which case, the request reply would always be ignored and the request + * resent. + */ + +static int recover_master_static(struct dlm_rsb *r, unsigned int *count) +{ + int dir_nodeid = dlm_dir_nodeid(r); + int new_master = dir_nodeid; + + if (dir_nodeid == dlm_our_nodeid()) + new_master = 0; + + dlm_purge_mstcpy_locks(r); + r->res_master_nodeid = dir_nodeid; + r->res_nodeid = new_master; + set_new_master(r); + (*count)++; + return 0; +} + +/* + * Go through local root resources and for each rsb which has a master which + * has departed, get the new master nodeid from the directory. The dir will + * assign mastery to the first node to look up the new master. That means + * we'll discover in this lookup if we're the new master of any rsb's. + * + * We fire off all the dir lookup requests individually and asynchronously to + * the correct dir node. + */ + +int dlm_recover_masters(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + unsigned int total = 0; + unsigned int count = 0; + int nodir = dlm_no_directory(ls); + int error; + + log_rinfo(ls, "dlm_recover_masters"); + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (dlm_recovery_stopped(ls)) { + up_read(&ls->ls_root_sem); + error = -EINTR; + goto out; + } + + lock_rsb(r); + if (nodir) + error = recover_master_static(r, &count); + else + error = recover_master(r, &count); + unlock_rsb(r); + cond_resched(); + total++; + + if (error) { + up_read(&ls->ls_root_sem); + goto out; + } + } + up_read(&ls->ls_root_sem); + + log_rinfo(ls, "dlm_recover_masters %u of %u", count, total); + + error = dlm_wait_function(ls, &recover_idr_empty); + out: + if (error) + recover_idr_clear(ls); + return error; +} + +int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct dlm_rsb *r; + int ret_nodeid, new_master; + + r = recover_idr_find(ls, rc->rc_id); + if (!r) { + log_error(ls, "dlm_recover_master_reply no id %llx", + (unsigned long long)rc->rc_id); + goto out; + } + + ret_nodeid = rc->rc_result; + + if (ret_nodeid == dlm_our_nodeid()) + new_master = 0; + else + new_master = ret_nodeid; + + lock_rsb(r); + r->res_master_nodeid = ret_nodeid; + r->res_nodeid = new_master; + set_new_master(r); + unlock_rsb(r); + recover_idr_del(r); + + if (recover_idr_empty(ls)) + wake_up(&ls->ls_wait_general); + out: + return 0; +} + + +/* Lock recovery: rebuild the process-copy locks we hold on a + remastered rsb on the new rsb master. + + dlm_recover_locks + recover_locks + recover_locks_queue + dlm_send_rcom_lock -> receive_rcom_lock + dlm_recover_master_copy + receive_rcom_lock_reply <- + dlm_recover_process_copy +*/ + + +/* + * keep a count of the number of lkb's we send to the new master; when we get + * an equal number of replies then recovery for the rsb is done + */ + +static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) +{ + struct dlm_lkb *lkb; + int error = 0; + + list_for_each_entry(lkb, head, lkb_statequeue) { + error = dlm_send_rcom_lock(r, lkb); + if (error) + break; + r->res_recover_locks_count++; + } + + return error; +} + +static int recover_locks(struct dlm_rsb *r) +{ + int error = 0; + + lock_rsb(r); + + DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r);); + + error = recover_locks_queue(r, &r->res_grantqueue); + if (error) + goto out; + error = recover_locks_queue(r, &r->res_convertqueue); + if (error) + goto out; + error = recover_locks_queue(r, &r->res_waitqueue); + if (error) + goto out; + + if (r->res_recover_locks_count) + recover_list_add(r); + else + rsb_clear_flag(r, RSB_NEW_MASTER); + out: + unlock_rsb(r); + return error; +} + +int dlm_recover_locks(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int error, count = 0; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (is_master(r)) { + rsb_clear_flag(r, RSB_NEW_MASTER); + continue; + } + + if (!rsb_flag(r, RSB_NEW_MASTER)) + continue; + + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + up_read(&ls->ls_root_sem); + goto out; + } + + error = recover_locks(r); + if (error) { + up_read(&ls->ls_root_sem); + goto out; + } + + count += r->res_recover_locks_count; + } + up_read(&ls->ls_root_sem); + + log_rinfo(ls, "dlm_recover_locks %d out", count); + + error = dlm_wait_function(ls, &recover_list_empty); + out: + if (error) + recover_list_clear(ls); + return error; +} + +void dlm_recovered_lock(struct dlm_rsb *r) +{ + DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r);); + + r->res_recover_locks_count--; + if (!r->res_recover_locks_count) { + rsb_clear_flag(r, RSB_NEW_MASTER); + recover_list_del(r); + } + + if (recover_list_empty(r->res_ls)) + wake_up(&r->res_ls->ls_wait_general); +} + +/* + * The lvb needs to be recovered on all master rsb's. This includes setting + * the VALNOTVALID flag if necessary, and determining the correct lvb contents + * based on the lvb's of the locks held on the rsb. + * + * RSB_VALNOTVALID is set in two cases: + * + * 1. we are master, but not new, and we purged an EX/PW lock held by a + * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL) + * + * 2. we are a new master, and there are only NL/CR locks left. + * (We could probably improve this by only invaliding in this way when + * the previous master left uncleanly. VMS docs mention that.) + * + * The LVB contents are only considered for changing when this is a new master + * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with + * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken + * from the lkb with the largest lvb sequence number. + */ + +static void recover_lvb(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *high_lkb = NULL; + uint32_t high_seq = 0; + int lock_lvb_exists = 0; + int big_lock_exists = 0; + int lvblen = r->res_ls->ls_lvblen; + + if (!rsb_flag(r, RSB_NEW_MASTER2) && + rsb_flag(r, RSB_RECOVER_LVB_INVAL)) { + /* case 1 above */ + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!rsb_flag(r, RSB_NEW_MASTER2)) + return; + + /* we are the new master, so figure out if VALNOTVALID should + be set, and set the rsb lvb from the best lkb available. */ + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + continue; + + lock_lvb_exists = 1; + + if (lkb->lkb_grmode > DLM_LOCK_CR) { + big_lock_exists = 1; + goto setflag; + } + + if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = lkb; + high_seq = lkb->lkb_lvbseq; + } + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + continue; + + lock_lvb_exists = 1; + + if (lkb->lkb_grmode > DLM_LOCK_CR) { + big_lock_exists = 1; + goto setflag; + } + + if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = lkb; + high_seq = lkb->lkb_lvbseq; + } + } + + setflag: + if (!lock_lvb_exists) + goto out; + + /* lvb is invalidated if only NL/CR locks remain */ + if (!big_lock_exists) + rsb_set_flag(r, RSB_VALNOTVALID); + + if (!r->res_lvbptr) { + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + if (!r->res_lvbptr) + goto out; + } + + if (big_lock_exists) { + r->res_lvbseq = lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen); + } else if (high_lkb) { + r->res_lvbseq = high_lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen); + } else { + r->res_lvbseq = 0; + memset(r->res_lvbptr, 0, lvblen); + } + out: + return; +} + +/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks + converting PR->CW or CW->PR need to have their lkb_grmode set. */ + +static void recover_conversion(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + struct dlm_lkb *lkb; + int grmode = -1; + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + if (lkb->lkb_grmode == DLM_LOCK_PR || + lkb->lkb_grmode == DLM_LOCK_CW) { + grmode = lkb->lkb_grmode; + break; + } + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + if (lkb->lkb_grmode != DLM_LOCK_IV) + continue; + if (grmode == -1) { + log_debug(ls, "recover_conversion %x set gr to rq %d", + lkb->lkb_id, lkb->lkb_rqmode); + lkb->lkb_grmode = lkb->lkb_rqmode; + } else { + log_debug(ls, "recover_conversion %x set gr %d", + lkb->lkb_id, grmode); + lkb->lkb_grmode = grmode; + } + } +} + +/* We've become the new master for this rsb and waiting/converting locks may + need to be granted in dlm_recover_grant() due to locks that may have + existed from a removed node. */ + +static void recover_grant(struct dlm_rsb *r) +{ + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) + rsb_set_flag(r, RSB_RECOVER_GRANT); +} + +void dlm_recover_rsbs(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + unsigned int count = 0; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + lock_rsb(r); + if (is_master(r)) { + if (rsb_flag(r, RSB_RECOVER_CONVERT)) + recover_conversion(r); + + /* recover lvb before granting locks so the updated + lvb/VALNOTVALID is presented in the completion */ + recover_lvb(r); + + if (rsb_flag(r, RSB_NEW_MASTER2)) + recover_grant(r); + count++; + } else { + rsb_clear_flag(r, RSB_VALNOTVALID); + } + rsb_clear_flag(r, RSB_RECOVER_CONVERT); + rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL); + rsb_clear_flag(r, RSB_NEW_MASTER2); + unlock_rsb(r); + } + up_read(&ls->ls_root_sem); + + if (count) + log_rinfo(ls, "dlm_recover_rsbs %d done", count); +} + +/* Create a single list of all root rsb's to be used during recovery */ + +int dlm_create_root_list(struct dlm_ls *ls) +{ + struct rb_node *n; + struct dlm_rsb *r; + int i, error = 0; + + down_write(&ls->ls_root_sem); + if (!list_empty(&ls->ls_root_list)) { + log_error(ls, "root list not empty"); + error = -EINVAL; + goto out; + } + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); + list_add(&r->res_root_list, &ls->ls_root_list); + dlm_hold_rsb(r); + } + + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[i].toss)) + log_error(ls, "dlm_create_root_list toss not empty"); + spin_unlock(&ls->ls_rsbtbl[i].lock); + } + out: + up_write(&ls->ls_root_sem); + return error; +} + +void dlm_release_root_list(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *safe; + + down_write(&ls->ls_root_sem); + list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) { + list_del_init(&r->res_root_list); + dlm_put_rsb(r); + } + up_write(&ls->ls_root_sem); +} + +void dlm_clear_toss(struct dlm_ls *ls) +{ + struct rb_node *n, *next; + struct dlm_rsb *r; + unsigned int count = 0; + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) { + next = rb_next(n); + r = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].toss); + dlm_free_rsb(r); + count++; + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } + + if (count) + log_rinfo(ls, "dlm_clear_toss %u done", count); +} + diff --git a/kernel/fs/dlm/recover.h b/kernel/fs/dlm/recover.h new file mode 100644 index 000000000..d8c8738c7 --- /dev/null +++ b/kernel/fs/dlm/recover.h @@ -0,0 +1,34 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVER_DOT_H__ +#define __RECOVER_DOT_H__ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)); +uint32_t dlm_recover_status(struct dlm_ls *ls); +void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status); +int dlm_recover_members_wait(struct dlm_ls *ls); +int dlm_recover_directory_wait(struct dlm_ls *ls); +int dlm_recover_locks_wait(struct dlm_ls *ls); +int dlm_recover_done_wait(struct dlm_ls *ls); +int dlm_recover_masters(struct dlm_ls *ls); +int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_locks(struct dlm_ls *ls); +void dlm_recovered_lock(struct dlm_rsb *r); +int dlm_create_root_list(struct dlm_ls *ls); +void dlm_release_root_list(struct dlm_ls *ls); +void dlm_clear_toss(struct dlm_ls *ls); +void dlm_recover_rsbs(struct dlm_ls *ls); + +#endif /* __RECOVER_DOT_H__ */ + diff --git a/kernel/fs/dlm/recoverd.c b/kernel/fs/dlm/recoverd.c new file mode 100644 index 000000000..6859b4bf9 --- /dev/null +++ b/kernel/fs/dlm/recoverd.c @@ -0,0 +1,342 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "dir.h" +#include "ast.h" +#include "recover.h" +#include "lowcomms.h" +#include "lock.h" +#include "requestqueue.h" +#include "recoverd.h" + + +/* If the start for which we're re-enabling locking (seq) has been superseded + by a newer stop (ls_recover_seq), we need to leave locking disabled. + + We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees + locking stopped and b) adds a message to the requestqueue, but dlm_recoverd + enables locking and clears the requestqueue between a and b. */ + +static int enable_locking(struct dlm_ls *ls, uint64_t seq) +{ + int error = -EINTR; + + down_write(&ls->ls_recv_active); + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_recover_seq == seq) { + set_bit(LSFL_RUNNING, &ls->ls_flags); + /* unblocks processes waiting to enter the dlm */ + up_write(&ls->ls_in_recovery); + clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); + error = 0; + } + spin_unlock(&ls->ls_recover_lock); + + up_write(&ls->ls_recv_active); + return error; +} + +static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) +{ + unsigned long start; + int error, neg = 0; + + log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq); + + mutex_lock(&ls->ls_recoverd_active); + + dlm_callback_suspend(ls); + + dlm_clear_toss(ls); + + /* + * This list of root rsb's will be the basis of most of the recovery + * routines. + */ + + dlm_create_root_list(ls); + + /* + * Add or remove nodes from the lockspace's ls_nodes list. + */ + + error = dlm_recover_members(ls, rv, &neg); + if (error) { + log_rinfo(ls, "dlm_recover_members error %d", error); + goto fail; + } + + dlm_recover_dir_nodeid(ls); + + ls->ls_recover_dir_sent_res = 0; + ls->ls_recover_dir_sent_msg = 0; + ls->ls_recover_locks_in = 0; + + dlm_set_recover_status(ls, DLM_RS_NODES); + + error = dlm_recover_members_wait(ls); + if (error) { + log_rinfo(ls, "dlm_recover_members_wait error %d", error); + goto fail; + } + + start = jiffies; + + /* + * Rebuild our own share of the directory by collecting from all other + * nodes their master rsb names that hash to us. + */ + + error = dlm_recover_directory(ls); + if (error) { + log_rinfo(ls, "dlm_recover_directory error %d", error); + goto fail; + } + + dlm_set_recover_status(ls, DLM_RS_DIR); + + error = dlm_recover_directory_wait(ls); + if (error) { + log_rinfo(ls, "dlm_recover_directory_wait error %d", error); + goto fail; + } + + log_rinfo(ls, "dlm_recover_directory %u out %u messages", + ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg); + + /* + * We may have outstanding operations that are waiting for a reply from + * a failed node. Mark these to be resent after recovery. Unlock and + * cancel ops can just be completed. + */ + + dlm_recover_waiters_pre(ls); + + error = dlm_recovery_stopped(ls); + if (error) + goto fail; + + if (neg || dlm_no_directory(ls)) { + /* + * Clear lkb's for departed nodes. + */ + + dlm_recover_purge(ls); + + /* + * Get new master nodeid's for rsb's that were mastered on + * departed nodes. + */ + + error = dlm_recover_masters(ls); + if (error) { + log_rinfo(ls, "dlm_recover_masters error %d", error); + goto fail; + } + + /* + * Send our locks on remastered rsb's to the new masters. + */ + + error = dlm_recover_locks(ls); + if (error) { + log_rinfo(ls, "dlm_recover_locks error %d", error); + goto fail; + } + + dlm_set_recover_status(ls, DLM_RS_LOCKS); + + error = dlm_recover_locks_wait(ls); + if (error) { + log_rinfo(ls, "dlm_recover_locks_wait error %d", error); + goto fail; + } + + log_rinfo(ls, "dlm_recover_locks %u in", + ls->ls_recover_locks_in); + + /* + * Finalize state in master rsb's now that all locks can be + * checked. This includes conversion resolution and lvb + * settings. + */ + + dlm_recover_rsbs(ls); + } else { + /* + * Other lockspace members may be going through the "neg" steps + * while also adding us to the lockspace, in which case they'll + * be doing the recover_locks (RS_LOCKS) barrier. + */ + dlm_set_recover_status(ls, DLM_RS_LOCKS); + + error = dlm_recover_locks_wait(ls); + if (error) { + log_rinfo(ls, "dlm_recover_locks_wait error %d", error); + goto fail; + } + } + + dlm_release_root_list(ls); + + /* + * Purge directory-related requests that are saved in requestqueue. + * All dir requests from before recovery are invalid now due to the dir + * rebuild and will be resent by the requesting nodes. + */ + + dlm_purge_requestqueue(ls); + + dlm_set_recover_status(ls, DLM_RS_DONE); + + error = dlm_recover_done_wait(ls); + if (error) { + log_rinfo(ls, "dlm_recover_done_wait error %d", error); + goto fail; + } + + dlm_clear_members_gone(ls); + + dlm_adjust_timeouts(ls); + + dlm_callback_resume(ls); + + error = enable_locking(ls, rv->seq); + if (error) { + log_rinfo(ls, "enable_locking error %d", error); + goto fail; + } + + error = dlm_process_requestqueue(ls); + if (error) { + log_rinfo(ls, "dlm_process_requestqueue error %d", error); + goto fail; + } + + error = dlm_recover_waiters_post(ls); + if (error) { + log_rinfo(ls, "dlm_recover_waiters_post error %d", error); + goto fail; + } + + dlm_recover_grant(ls); + + log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms", + (unsigned long long)rv->seq, ls->ls_generation, + jiffies_to_msecs(jiffies - start)); + mutex_unlock(&ls->ls_recoverd_active); + + dlm_lsop_recover_done(ls); + return 0; + + fail: + dlm_release_root_list(ls); + log_rinfo(ls, "dlm_recover %llu error %d", + (unsigned long long)rv->seq, error); + mutex_unlock(&ls->ls_recoverd_active); + return error; +} + +/* The dlm_ls_start() that created the rv we take here may already have been + stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP + flag set. */ + +static void do_ls_recovery(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL; + + spin_lock(&ls->ls_recover_lock); + rv = ls->ls_recover_args; + ls->ls_recover_args = NULL; + if (rv && ls->ls_recover_seq == rv->seq) + clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + if (rv) { + ls_recover(ls, rv); + kfree(rv->nodes); + kfree(rv); + } +} + +static int dlm_recoverd(void *arg) +{ + struct dlm_ls *ls; + + ls = dlm_find_lockspace_local(arg); + if (!ls) { + log_print("dlm_recoverd: no lockspace %p", arg); + return -1; + } + + down_write(&ls->ls_in_recovery); + set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); + wake_up(&ls->ls_recover_lock_wait); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) && + !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) + schedule(); + set_current_state(TASK_RUNNING); + + if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { + down_write(&ls->ls_in_recovery); + set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); + wake_up(&ls->ls_recover_lock_wait); + } + + if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags)) + do_ls_recovery(ls); + } + + if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)) + up_write(&ls->ls_in_recovery); + + dlm_put_lockspace(ls); + return 0; +} + +int dlm_recoverd_start(struct dlm_ls *ls) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(dlm_recoverd, ls, "dlm_recoverd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + ls->ls_recoverd_task = p; + return error; +} + +void dlm_recoverd_stop(struct dlm_ls *ls) +{ + kthread_stop(ls->ls_recoverd_task); +} + +void dlm_recoverd_suspend(struct dlm_ls *ls) +{ + wake_up(&ls->ls_wait_general); + mutex_lock(&ls->ls_recoverd_active); +} + +void dlm_recoverd_resume(struct dlm_ls *ls) +{ + mutex_unlock(&ls->ls_recoverd_active); +} + diff --git a/kernel/fs/dlm/recoverd.h b/kernel/fs/dlm/recoverd.h new file mode 100644 index 000000000..885607973 --- /dev/null +++ b/kernel/fs/dlm/recoverd.h @@ -0,0 +1,23 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVERD_DOT_H__ +#define __RECOVERD_DOT_H__ + +void dlm_recoverd_stop(struct dlm_ls *ls); +int dlm_recoverd_start(struct dlm_ls *ls); +void dlm_recoverd_suspend(struct dlm_ls *ls); +void dlm_recoverd_resume(struct dlm_ls *ls); + +#endif /* __RECOVERD_DOT_H__ */ + diff --git a/kernel/fs/dlm/requestqueue.c b/kernel/fs/dlm/requestqueue.c new file mode 100644 index 000000000..1695f1b0d --- /dev/null +++ b/kernel/fs/dlm/requestqueue.c @@ -0,0 +1,171 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "member.h" +#include "lock.h" +#include "dir.h" +#include "config.h" +#include "requestqueue.h" + +struct rq_entry { + struct list_head list; + uint32_t recover_seq; + int nodeid; + struct dlm_message request; +}; + +/* + * Requests received while the lockspace is in recovery get added to the + * request queue and processed when recovery is complete. This happens when + * the lockspace is suspended on some nodes before it is on others, or the + * lockspace is enabled on some while still suspended on others. + */ + +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) +{ + struct rq_entry *e; + int length = ms->m_header.h_length - sizeof(struct dlm_message); + + e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS); + if (!e) { + log_print("dlm_add_requestqueue: out of memory len %d", length); + return; + } + + e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; + e->nodeid = nodeid; + memcpy(&e->request, ms, ms->m_header.h_length); + + mutex_lock(&ls->ls_requestqueue_mutex); + list_add_tail(&e->list, &ls->ls_requestqueue); + mutex_unlock(&ls->ls_requestqueue_mutex); +} + +/* + * Called by dlm_recoverd to process normal messages saved while recovery was + * happening. Normal locking has been enabled before this is called. dlm_recv + * upon receiving a message, will wait for all saved messages to be drained + * here before processing the message it got. If a new dlm_ls_stop() arrives + * while we're processing these saved messages, it may block trying to suspend + * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue. In that + * case, we don't abort since locking_stopped is still 0. If dlm_recv is not + * waiting for us, then this processing may be aborted due to locking_stopped. + */ + +int dlm_process_requestqueue(struct dlm_ls *ls) +{ + struct rq_entry *e; + struct dlm_message *ms; + int error = 0; + + mutex_lock(&ls->ls_requestqueue_mutex); + + for (;;) { + if (list_empty(&ls->ls_requestqueue)) { + mutex_unlock(&ls->ls_requestqueue_mutex); + error = 0; + break; + } + e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); + mutex_unlock(&ls->ls_requestqueue_mutex); + + ms = &e->request; + + log_limit(ls, "dlm_process_requestqueue msg %d from %d " + "lkid %x remid %x result %d seq %u", + ms->m_type, ms->m_header.h_nodeid, + ms->m_lkid, ms->m_remid, ms->m_result, + e->recover_seq); + + dlm_receive_message_saved(ls, &e->request, e->recover_seq); + + mutex_lock(&ls->ls_requestqueue_mutex); + list_del(&e->list); + kfree(e); + + if (dlm_locking_stopped(ls)) { + log_debug(ls, "process_requestqueue abort running"); + mutex_unlock(&ls->ls_requestqueue_mutex); + error = -EINTR; + break; + } + schedule(); + } + + return error; +} + +/* + * After recovery is done, locking is resumed and dlm_recoverd takes all the + * saved requests and processes them as they would have been by dlm_recv. At + * the same time, dlm_recv will start receiving new requests from remote nodes. + * We want to delay dlm_recv processing new requests until dlm_recoverd has + * finished processing the old saved requests. We don't check for locking + * stopped here because dlm_ls_stop won't stop locking until it's suspended us + * (dlm_recv). + */ + +void dlm_wait_requestqueue(struct dlm_ls *ls) +{ + for (;;) { + mutex_lock(&ls->ls_requestqueue_mutex); + if (list_empty(&ls->ls_requestqueue)) + break; + mutex_unlock(&ls->ls_requestqueue_mutex); + schedule(); + } + mutex_unlock(&ls->ls_requestqueue_mutex); +} + +static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) +{ + uint32_t type = ms->m_type; + + /* the ls is being cleaned up and freed by release_lockspace */ + if (!ls->ls_count) + return 1; + + if (dlm_is_removed(ls, nodeid)) + return 1; + + /* directory operations are always purged because the directory is + always rebuilt during recovery and the lookups resent */ + + if (type == DLM_MSG_REMOVE || + type == DLM_MSG_LOOKUP || + type == DLM_MSG_LOOKUP_REPLY) + return 1; + + if (!dlm_no_directory(ls)) + return 0; + + return 1; +} + +void dlm_purge_requestqueue(struct dlm_ls *ls) +{ + struct dlm_message *ms; + struct rq_entry *e, *safe; + + mutex_lock(&ls->ls_requestqueue_mutex); + list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) { + ms = &e->request; + + if (purge_request(ls, ms, e->nodeid)) { + list_del(&e->list); + kfree(e); + } + } + mutex_unlock(&ls->ls_requestqueue_mutex); +} + diff --git a/kernel/fs/dlm/requestqueue.h b/kernel/fs/dlm/requestqueue.h new file mode 100644 index 000000000..10ce449b7 --- /dev/null +++ b/kernel/fs/dlm/requestqueue.h @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __REQUESTQUEUE_DOT_H__ +#define __REQUESTQUEUE_DOT_H__ + +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms); +int dlm_process_requestqueue(struct dlm_ls *ls); +void dlm_wait_requestqueue(struct dlm_ls *ls); +void dlm_purge_requestqueue(struct dlm_ls *ls); + +#endif + diff --git a/kernel/fs/dlm/user.c b/kernel/fs/dlm/user.c new file mode 100644 index 000000000..fb85f32e9 --- /dev/null +++ b/kernel/fs/dlm/user.c @@ -0,0 +1,1015 @@ +/* + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lock.h" +#include "lvb_table.h" +#include "user.h" +#include "ast.h" + +static const char name_prefix[] = "dlm"; +static const struct file_operations device_fops; +static atomic_t dlm_monitor_opened; +static int dlm_monitor_unused = 1; + +#ifdef CONFIG_COMPAT + +struct dlm_lock_params32 { + __u8 mode; + __u8 namelen; + __u16 unused; + __u32 flags; + __u32 lkid; + __u32 parent; + __u64 xid; + __u64 timeout; + __u32 castparam; + __u32 castaddr; + __u32 bastparam; + __u32 bastaddr; + __u32 lksb; + char lvb[DLM_USER_LVB_LEN]; + char name[0]; +}; + +struct dlm_write_request32 { + __u32 version[3]; + __u8 cmd; + __u8 is64bit; + __u8 unused[2]; + + union { + struct dlm_lock_params32 lock; + struct dlm_lspace_params lspace; + struct dlm_purge_params purge; + } i; +}; + +struct dlm_lksb32 { + __u32 sb_status; + __u32 sb_lkid; + __u8 sb_flags; + __u32 sb_lvbptr; +}; + +struct dlm_lock_result32 { + __u32 version[3]; + __u32 length; + __u32 user_astaddr; + __u32 user_astparam; + __u32 user_lksb; + struct dlm_lksb32 lksb; + __u8 bast_mode; + __u8 unused[3]; + /* Offsets may be zero if no data is present */ + __u32 lvb_offset; +}; + +static void compat_input(struct dlm_write_request *kb, + struct dlm_write_request32 *kb32, + int namelen) +{ + kb->version[0] = kb32->version[0]; + kb->version[1] = kb32->version[1]; + kb->version[2] = kb32->version[2]; + + kb->cmd = kb32->cmd; + kb->is64bit = kb32->is64bit; + if (kb->cmd == DLM_USER_CREATE_LOCKSPACE || + kb->cmd == DLM_USER_REMOVE_LOCKSPACE) { + kb->i.lspace.flags = kb32->i.lspace.flags; + kb->i.lspace.minor = kb32->i.lspace.minor; + memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen); + } else if (kb->cmd == DLM_USER_PURGE) { + kb->i.purge.nodeid = kb32->i.purge.nodeid; + kb->i.purge.pid = kb32->i.purge.pid; + } else { + kb->i.lock.mode = kb32->i.lock.mode; + kb->i.lock.namelen = kb32->i.lock.namelen; + kb->i.lock.flags = kb32->i.lock.flags; + kb->i.lock.lkid = kb32->i.lock.lkid; + kb->i.lock.parent = kb32->i.lock.parent; + kb->i.lock.xid = kb32->i.lock.xid; + kb->i.lock.timeout = kb32->i.lock.timeout; + kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam; + kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr; + kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam; + kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; + kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; + memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); + memcpy(kb->i.lock.name, kb32->i.lock.name, namelen); + } +} + +static void compat_output(struct dlm_lock_result *res, + struct dlm_lock_result32 *res32) +{ + res32->version[0] = res->version[0]; + res32->version[1] = res->version[1]; + res32->version[2] = res->version[2]; + + res32->user_astaddr = (__u32)(long)res->user_astaddr; + res32->user_astparam = (__u32)(long)res->user_astparam; + res32->user_lksb = (__u32)(long)res->user_lksb; + res32->bast_mode = res->bast_mode; + + res32->lvb_offset = res->lvb_offset; + res32->length = res->length; + + res32->lksb.sb_status = res->lksb.sb_status; + res32->lksb.sb_flags = res->lksb.sb_flags; + res32->lksb.sb_lkid = res->lksb.sb_lkid; + res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr; +} +#endif + +/* Figure out if this lock is at the end of its life and no longer + available for the application to use. The lkb still exists until + the final ast is read. A lock becomes EOL in three situations: + 1. a noqueue request fails with EAGAIN + 2. an unlock completes with EUNLOCK + 3. a cancel of a waiting request completes with ECANCEL/EDEADLK + An EOL lock needs to be removed from the process's list of locks. + And we can't allow any new operation on an EOL lock. This is + not related to the lifetime of the lkb struct which is managed + entirely by refcount. */ + +static int lkb_is_endoflife(int mode, int status) +{ + switch (status) { + case -DLM_EUNLOCK: + return 1; + case -DLM_ECANCEL: + case -ETIMEDOUT: + case -EDEADLK: + case -EAGAIN: + if (mode == DLM_LOCK_IV) + return 1; + break; + } + return 0; +} + +/* we could possibly check if the cancel of an orphan has resulted in the lkb + being removed and then remove that lkb from the orphans list and free it */ + +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + struct dlm_user_proc *proc; + int rv; + + if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) + return; + + ls = lkb->lkb_resource->res_ls; + mutex_lock(&ls->ls_clear_proc_locks); + + /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast + can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed + lkb->ua so we can't try to use it. This second check is necessary + for cases where a completion ast is received for an operation that + began before clear_proc_locks did its cancel/unlock. */ + + if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) + goto out; + + DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb);); + ua = lkb->lkb_ua; + proc = ua->proc; + + if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL) + goto out; + + if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status)) + lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; + + spin_lock(&proc->asts_spin); + + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); + if (rv < 0) { + spin_unlock(&proc->asts_spin); + goto out; + } + + if (list_empty(&lkb->lkb_cb_list)) { + kref_get(&lkb->lkb_ref); + list_add_tail(&lkb->lkb_cb_list, &proc->asts); + wake_up_interruptible(&proc->wait); + } + spin_unlock(&proc->asts_spin); + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + /* N.B. spin_lock locks_spin, not asts_spin */ + spin_lock(&proc->locks_spin); + if (!list_empty(&lkb->lkb_ownqueue)) { + list_del_init(&lkb->lkb_ownqueue); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->locks_spin); + } + out: + mutex_unlock(&ls->ls_clear_proc_locks); +} + +static int device_user_lock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + uint32_t lkid; + int error = -ENOMEM; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + if (!params->castaddr || !params->lksb) { + error = -EINVAL; + goto out; + } + + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; + ua->user_lksb = params->lksb; + ua->castparam = params->castparam; + ua->castaddr = params->castaddr; + ua->bastparam = params->bastparam; + ua->bastaddr = params->bastaddr; + ua->xid = params->xid; + + if (params->flags & DLM_LKF_CONVERT) { + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb, + (unsigned long) params->timeout); + } else if (params->flags & DLM_LKF_ORPHAN) { + error = dlm_user_adopt_orphan(ls, ua, + params->mode, params->flags, + params->name, params->namelen, + (unsigned long) params->timeout, + &lkid); + if (!error) + error = lkid; + } else { + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen, + (unsigned long) params->timeout); + if (!error) + error = ua->lksb.sb_lkid; + } + out: + dlm_put_lockspace(ls); + return error; +} + +static int device_user_unlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + int error = -ENOMEM; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; + ua->user_lksb = params->lksb; + ua->castparam = params->castparam; + ua->castaddr = params->castaddr; + + if (params->flags & DLM_LKF_CANCEL) + error = dlm_user_cancel(ls, ua, params->flags, params->lkid); + else + error = dlm_user_unlock(ls, ua, params->flags, params->lkid, + params->lvb); + out: + dlm_put_lockspace(ls); + return error; +} + +static int device_user_deadlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_deadlock(ls, params->flags, params->lkid); + + dlm_put_lockspace(ls); + return error; +} + +static int dlm_device_register(struct dlm_ls *ls, char *name) +{ + int error, len; + + /* The device is already registered. This happens when the + lockspace is created multiple times from userspace. */ + if (ls->ls_device.name) + return 0; + + error = -ENOMEM; + len = strlen(name) + strlen(name_prefix) + 2; + ls->ls_device.name = kzalloc(len, GFP_NOFS); + if (!ls->ls_device.name) + goto fail; + + snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix, + name); + ls->ls_device.fops = &device_fops; + ls->ls_device.minor = MISC_DYNAMIC_MINOR; + + error = misc_register(&ls->ls_device); + if (error) { + kfree(ls->ls_device.name); + } +fail: + return error; +} + +int dlm_device_deregister(struct dlm_ls *ls) +{ + int error; + + /* The device is not registered. This happens when the lockspace + was never used from userspace, or when device_create_lockspace() + calls dlm_release_lockspace() after the register fails. */ + if (!ls->ls_device.name) + return 0; + + error = misc_deregister(&ls->ls_device); + if (!error) + kfree(ls->ls_device.name); + return error; +} + +static int device_user_purge(struct dlm_user_proc *proc, + struct dlm_purge_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_purge(ls, proc, params->nodeid, params->pid); + + dlm_put_lockspace(ls); + return error; +} + +static int device_create_lockspace(struct dlm_lspace_params *params) +{ + dlm_lockspace_t *lockspace; + struct dlm_ls *ls; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = dlm_new_lockspace(params->name, NULL, params->flags, + DLM_USER_LVB_LEN, NULL, NULL, NULL, + &lockspace); + if (error) + return error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -ENOENT; + + error = dlm_device_register(ls, params->name); + dlm_put_lockspace(ls); + + if (error) + dlm_release_lockspace(lockspace, 0); + else + error = ls->ls_device.minor; + + return error; +} + +static int device_remove_lockspace(struct dlm_lspace_params *params) +{ + dlm_lockspace_t *lockspace; + struct dlm_ls *ls; + int error, force = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ls = dlm_find_lockspace_device(params->minor); + if (!ls) + return -ENOENT; + + if (params->flags & DLM_USER_LSFLG_FORCEFREE) + force = 2; + + lockspace = ls->ls_local_handle; + dlm_put_lockspace(ls); + + /* The final dlm_release_lockspace waits for references to go to + zero, so all processes will need to close their device for the + ls before the release will proceed. release also calls the + device_deregister above. Converting a positive return value + from release to zero means that userspace won't know when its + release was the final one, but it shouldn't need to know. */ + + error = dlm_release_lockspace(lockspace, force); + if (error > 0) + error = 0; + return error; +} + +/* Check the user's version matches ours */ +static int check_version(struct dlm_write_request *req) +{ + if (req->version[0] != DLM_DEVICE_VERSION_MAJOR || + (req->version[0] == DLM_DEVICE_VERSION_MAJOR && + req->version[1] > DLM_DEVICE_VERSION_MINOR)) { + + printk(KERN_DEBUG "dlm: process %s (%d) version mismatch " + "user (%d.%d.%d) kernel (%d.%d.%d)\n", + current->comm, + task_pid_nr(current), + req->version[0], + req->version[1], + req->version[2], + DLM_DEVICE_VERSION_MAJOR, + DLM_DEVICE_VERSION_MINOR, + DLM_DEVICE_VERSION_PATCH); + return -EINVAL; + } + return 0; +} + +/* + * device_write + * + * device_user_lock + * dlm_user_request -> request_lock + * dlm_user_convert -> convert_lock + * + * device_user_unlock + * dlm_user_unlock -> unlock_lock + * dlm_user_cancel -> cancel_lock + * + * device_create_lockspace + * dlm_new_lockspace + * + * device_remove_lockspace + * dlm_release_lockspace + */ + +/* a write to a lockspace device is a lock or unlock request, a write + to the control device is to create/remove a lockspace */ + +static ssize_t device_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_write_request *kbuf; + int error; + +#ifdef CONFIG_COMPAT + if (count < sizeof(struct dlm_write_request32)) +#else + if (count < sizeof(struct dlm_write_request)) +#endif + return -EINVAL; + + /* + * can't compare against COMPAT/dlm_write_request32 because + * we don't yet know if is64bit is zero + */ + if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) + return -EINVAL; + + kbuf = kzalloc(count + 1, GFP_NOFS); + if (!kbuf) + return -ENOMEM; + + if (copy_from_user(kbuf, buf, count)) { + error = -EFAULT; + goto out_free; + } + + if (check_version(kbuf)) { + error = -EBADE; + goto out_free; + } + +#ifdef CONFIG_COMPAT + if (!kbuf->is64bit) { + struct dlm_write_request32 *k32buf; + int namelen = 0; + + if (count > sizeof(struct dlm_write_request32)) + namelen = count - sizeof(struct dlm_write_request32); + + k32buf = (struct dlm_write_request32 *)kbuf; + + /* add 1 after namelen so that the name string is terminated */ + kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, + GFP_NOFS); + if (!kbuf) { + kfree(k32buf); + return -ENOMEM; + } + + if (proc) + set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); + + compat_input(kbuf, k32buf, namelen); + kfree(k32buf); + } +#endif + + /* do we really need this? can a write happen after a close? */ + if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && + (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) { + error = -EINVAL; + goto out_free; + } + + error = -EINVAL; + + switch (kbuf->cmd) + { + case DLM_USER_LOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_free; + } + error = device_user_lock(proc, &kbuf->i.lock); + break; + + case DLM_USER_UNLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_free; + } + error = device_user_unlock(proc, &kbuf->i.lock); + break; + + case DLM_USER_DEADLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_free; + } + error = device_user_deadlock(proc, &kbuf->i.lock); + break; + + case DLM_USER_CREATE_LOCKSPACE: + if (proc) { + log_print("create/remove only on control device"); + goto out_free; + } + error = device_create_lockspace(&kbuf->i.lspace); + break; + + case DLM_USER_REMOVE_LOCKSPACE: + if (proc) { + log_print("create/remove only on control device"); + goto out_free; + } + error = device_remove_lockspace(&kbuf->i.lspace); + break; + + case DLM_USER_PURGE: + if (!proc) { + log_print("no locking on control device"); + goto out_free; + } + error = device_user_purge(proc, &kbuf->i.purge); + break; + + default: + log_print("Unknown command passed to DLM device : %d\n", + kbuf->cmd); + } + + out_free: + kfree(kbuf); + return error; +} + +/* Every process that opens the lockspace device has its own "proc" structure + hanging off the open file that's used to keep track of locks owned by the + process and asts that need to be delivered to the process. */ + +static int device_open(struct inode *inode, struct file *file) +{ + struct dlm_user_proc *proc; + struct dlm_ls *ls; + + ls = dlm_find_lockspace_device(iminor(inode)); + if (!ls) + return -ENOENT; + + proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS); + if (!proc) { + dlm_put_lockspace(ls); + return -ENOMEM; + } + + proc->lockspace = ls->ls_local_handle; + INIT_LIST_HEAD(&proc->asts); + INIT_LIST_HEAD(&proc->locks); + INIT_LIST_HEAD(&proc->unlocking); + spin_lock_init(&proc->asts_spin); + spin_lock_init(&proc->locks_spin); + init_waitqueue_head(&proc->wait); + file->private_data = proc; + + return 0; +} + +static int device_close(struct inode *inode, struct file *file) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_ls *ls; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); + + dlm_clear_proc_locks(ls, proc); + + /* at this point no more lkb's should exist for this lockspace, + so there's no chance of dlm_user_add_ast() being called and + looking for lkb->ua->proc */ + + kfree(proc); + file->private_data = NULL; + + dlm_put_lockspace(ls); + dlm_put_lockspace(ls); /* for the find in device_open() */ + + /* FIXME: AUTOFREE: if this ls is no longer used do + device_remove_lockspace() */ + + return 0; +} + +static int copy_result_to_user(struct dlm_user_args *ua, int compat, + uint32_t flags, int mode, int copy_lvb, + char __user *buf, size_t count) +{ +#ifdef CONFIG_COMPAT + struct dlm_lock_result32 result32; +#endif + struct dlm_lock_result result; + void *resultptr; + int error=0; + int len; + int struct_len; + + memset(&result, 0, sizeof(struct dlm_lock_result)); + result.version[0] = DLM_DEVICE_VERSION_MAJOR; + result.version[1] = DLM_DEVICE_VERSION_MINOR; + result.version[2] = DLM_DEVICE_VERSION_PATCH; + memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb)); + result.user_lksb = ua->user_lksb; + + /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated + in a conversion unless the conversion is successful. See code + in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though, + notes that a new blocking AST address and parameter are set even if + the conversion fails, so maybe we should just do that. */ + + if (flags & DLM_CB_BAST) { + result.user_astaddr = ua->bastaddr; + result.user_astparam = ua->bastparam; + result.bast_mode = mode; + } else { + result.user_astaddr = ua->castaddr; + result.user_astparam = ua->castparam; + } + +#ifdef CONFIG_COMPAT + if (compat) + len = sizeof(struct dlm_lock_result32); + else +#endif + len = sizeof(struct dlm_lock_result); + struct_len = len; + + /* copy lvb to userspace if there is one, it's been updated, and + the user buffer has space for it */ + + if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) { + if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, + DLM_USER_LVB_LEN)) { + error = -EFAULT; + goto out; + } + + result.lvb_offset = len; + len += DLM_USER_LVB_LEN; + } + + result.length = len; + resultptr = &result; +#ifdef CONFIG_COMPAT + if (compat) { + compat_output(&result, &result32); + resultptr = &result32; + } +#endif + + if (copy_to_user(buf, resultptr, struct_len)) + error = -EFAULT; + else + error = len; + out: + return error; +} + +static int copy_version_to_user(char __user *buf, size_t count) +{ + struct dlm_device_version ver; + + memset(&ver, 0, sizeof(struct dlm_device_version)); + ver.version[0] = DLM_DEVICE_VERSION_MAJOR; + ver.version[1] = DLM_DEVICE_VERSION_MINOR; + ver.version[2] = DLM_DEVICE_VERSION_PATCH; + + if (copy_to_user(buf, &ver, sizeof(struct dlm_device_version))) + return -EFAULT; + return sizeof(struct dlm_device_version); +} + +/* a read returns a single ast described in a struct dlm_lock_result */ + +static ssize_t device_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_lkb *lkb; + DECLARE_WAITQUEUE(wait, current); + struct dlm_callback cb; + int rv, resid, copy_lvb = 0; + + if (count == sizeof(struct dlm_device_version)) { + rv = copy_version_to_user(buf, count); + return rv; + } + + if (!proc) { + log_print("non-version read from control device %zu", count); + return -EINVAL; + } + +#ifdef CONFIG_COMPAT + if (count < sizeof(struct dlm_lock_result32)) +#else + if (count < sizeof(struct dlm_lock_result)) +#endif + return -EINVAL; + + try_another: + + /* do we really need this? can a read happen after a close? */ + if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) + return -EINVAL; + + spin_lock(&proc->asts_spin); + if (list_empty(&proc->asts)) { + if (file->f_flags & O_NONBLOCK) { + spin_unlock(&proc->asts_spin); + return -EAGAIN; + } + + add_wait_queue(&proc->wait, &wait); + + repeat: + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&proc->asts) && !signal_pending(current)) { + spin_unlock(&proc->asts_spin); + schedule(); + spin_lock(&proc->asts_spin); + goto repeat; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&proc->wait, &wait); + + if (signal_pending(current)) { + spin_unlock(&proc->asts_spin); + return -ERESTARTSYS; + } + } + + /* if we empty lkb_callbacks, we don't want to unlock the spinlock + without removing lkb_cb_list; so empty lkb_cb_list is always + consistent with empty lkb_callbacks */ + + lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); + + rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); + if (rv < 0) { + /* this shouldn't happen; lkb should have been removed from + list when resid was zero */ + log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); + list_del_init(&lkb->lkb_cb_list); + spin_unlock(&proc->asts_spin); + /* removes ref for proc->asts, may cause lkb to be freed */ + dlm_put_lkb(lkb); + goto try_another; + } + if (!resid) + list_del_init(&lkb->lkb_cb_list); + spin_unlock(&proc->asts_spin); + + if (cb.flags & DLM_CB_SKIP) { + /* removes ref for proc->asts, may cause lkb to be freed */ + if (!resid) + dlm_put_lkb(lkb); + goto try_another; + } + + if (cb.flags & DLM_CB_CAST) { + int old_mode, new_mode; + + old_mode = lkb->lkb_last_cast.mode; + new_mode = cb.mode; + + if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && + dlm_lvb_operations[old_mode + 1][new_mode + 1]) + copy_lvb = 1; + + lkb->lkb_lksb->sb_status = cb.sb_status; + lkb->lkb_lksb->sb_flags = cb.sb_flags; + } + + rv = copy_result_to_user(lkb->lkb_ua, + test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), + cb.flags, cb.mode, copy_lvb, buf, count); + + /* removes ref for proc->asts, may cause lkb to be freed */ + if (!resid) + dlm_put_lkb(lkb); + + return rv; +} + +static unsigned int device_poll(struct file *file, poll_table *wait) +{ + struct dlm_user_proc *proc = file->private_data; + + poll_wait(file, &proc->wait, wait); + + spin_lock(&proc->asts_spin); + if (!list_empty(&proc->asts)) { + spin_unlock(&proc->asts_spin); + return POLLIN | POLLRDNORM; + } + spin_unlock(&proc->asts_spin); + return 0; +} + +int dlm_user_daemon_available(void) +{ + /* dlm_controld hasn't started (or, has started, but not + properly populated configfs) */ + + if (!dlm_our_nodeid()) + return 0; + + /* This is to deal with versions of dlm_controld that don't + know about the monitor device. We assume that if the + dlm_controld was started (above), but the monitor device + was never opened, that it's an old version. dlm_controld + should open the monitor device before populating configfs. */ + + if (dlm_monitor_unused) + return 1; + + return atomic_read(&dlm_monitor_opened) ? 1 : 0; +} + +static int ctl_device_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return 0; +} + +static int ctl_device_close(struct inode *inode, struct file *file) +{ + return 0; +} + +static int monitor_device_open(struct inode *inode, struct file *file) +{ + atomic_inc(&dlm_monitor_opened); + dlm_monitor_unused = 0; + return 0; +} + +static int monitor_device_close(struct inode *inode, struct file *file) +{ + if (atomic_dec_and_test(&dlm_monitor_opened)) + dlm_stop_lockspaces(); + return 0; +} + +static const struct file_operations device_fops = { + .open = device_open, + .release = device_close, + .read = device_read, + .write = device_write, + .poll = device_poll, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static const struct file_operations ctl_device_fops = { + .open = ctl_device_open, + .release = ctl_device_close, + .read = device_read, + .write = device_write, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice ctl_device = { + .name = "dlm-control", + .fops = &ctl_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +static const struct file_operations monitor_device_fops = { + .open = monitor_device_open, + .release = monitor_device_close, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice monitor_device = { + .name = "dlm-monitor", + .fops = &monitor_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +int __init dlm_user_init(void) +{ + int error; + + atomic_set(&dlm_monitor_opened, 0); + + error = misc_register(&ctl_device); + if (error) { + log_print("misc_register failed for control device"); + goto out; + } + + error = misc_register(&monitor_device); + if (error) { + log_print("misc_register failed for monitor device"); + misc_deregister(&ctl_device); + } + out: + return error; +} + +void dlm_user_exit(void) +{ + misc_deregister(&ctl_device); + misc_deregister(&monitor_device); +} + diff --git a/kernel/fs/dlm/user.h b/kernel/fs/dlm/user.h new file mode 100644 index 000000000..00499ab88 --- /dev/null +++ b/kernel/fs/dlm/user.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __USER_DOT_H__ +#define __USER_DOT_H__ + +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq); +int dlm_user_init(void); +void dlm_user_exit(void); +int dlm_device_deregister(struct dlm_ls *ls); +int dlm_user_daemon_available(void); + +#endif diff --git a/kernel/fs/dlm/util.c b/kernel/fs/dlm/util.c new file mode 100644 index 000000000..e36520af7 --- /dev/null +++ b/kernel/fs/dlm/util.c @@ -0,0 +1,154 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "rcom.h" +#include "util.h" + +#define DLM_ERRNO_EDEADLK 35 +#define DLM_ERRNO_EBADR 53 +#define DLM_ERRNO_EBADSLT 57 +#define DLM_ERRNO_EPROTO 71 +#define DLM_ERRNO_EOPNOTSUPP 95 +#define DLM_ERRNO_ETIMEDOUT 110 +#define DLM_ERRNO_EINPROGRESS 115 + +static void header_out(struct dlm_header *hd) +{ + hd->h_version = cpu_to_le32(hd->h_version); + hd->h_lockspace = cpu_to_le32(hd->h_lockspace); + hd->h_nodeid = cpu_to_le32(hd->h_nodeid); + hd->h_length = cpu_to_le16(hd->h_length); +} + +static void header_in(struct dlm_header *hd) +{ + hd->h_version = le32_to_cpu(hd->h_version); + hd->h_lockspace = le32_to_cpu(hd->h_lockspace); + hd->h_nodeid = le32_to_cpu(hd->h_nodeid); + hd->h_length = le16_to_cpu(hd->h_length); +} + +/* higher errno values are inconsistent across architectures, so select + one set of values for on the wire */ + +static int to_dlm_errno(int err) +{ + switch (err) { + case -EDEADLK: + return -DLM_ERRNO_EDEADLK; + case -EBADR: + return -DLM_ERRNO_EBADR; + case -EBADSLT: + return -DLM_ERRNO_EBADSLT; + case -EPROTO: + return -DLM_ERRNO_EPROTO; + case -EOPNOTSUPP: + return -DLM_ERRNO_EOPNOTSUPP; + case -ETIMEDOUT: + return -DLM_ERRNO_ETIMEDOUT; + case -EINPROGRESS: + return -DLM_ERRNO_EINPROGRESS; + } + return err; +} + +static int from_dlm_errno(int err) +{ + switch (err) { + case -DLM_ERRNO_EDEADLK: + return -EDEADLK; + case -DLM_ERRNO_EBADR: + return -EBADR; + case -DLM_ERRNO_EBADSLT: + return -EBADSLT; + case -DLM_ERRNO_EPROTO: + return -EPROTO; + case -DLM_ERRNO_EOPNOTSUPP: + return -EOPNOTSUPP; + case -DLM_ERRNO_ETIMEDOUT: + return -ETIMEDOUT; + case -DLM_ERRNO_EINPROGRESS: + return -EINPROGRESS; + } + return err; +} + +void dlm_message_out(struct dlm_message *ms) +{ + header_out(&ms->m_header); + + ms->m_type = cpu_to_le32(ms->m_type); + ms->m_nodeid = cpu_to_le32(ms->m_nodeid); + ms->m_pid = cpu_to_le32(ms->m_pid); + ms->m_lkid = cpu_to_le32(ms->m_lkid); + ms->m_remid = cpu_to_le32(ms->m_remid); + ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid); + ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid); + ms->m_exflags = cpu_to_le32(ms->m_exflags); + ms->m_sbflags = cpu_to_le32(ms->m_sbflags); + ms->m_flags = cpu_to_le32(ms->m_flags); + ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq); + ms->m_hash = cpu_to_le32(ms->m_hash); + ms->m_status = cpu_to_le32(ms->m_status); + ms->m_grmode = cpu_to_le32(ms->m_grmode); + ms->m_rqmode = cpu_to_le32(ms->m_rqmode); + ms->m_bastmode = cpu_to_le32(ms->m_bastmode); + ms->m_asts = cpu_to_le32(ms->m_asts); + ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result)); +} + +void dlm_message_in(struct dlm_message *ms) +{ + header_in(&ms->m_header); + + ms->m_type = le32_to_cpu(ms->m_type); + ms->m_nodeid = le32_to_cpu(ms->m_nodeid); + ms->m_pid = le32_to_cpu(ms->m_pid); + ms->m_lkid = le32_to_cpu(ms->m_lkid); + ms->m_remid = le32_to_cpu(ms->m_remid); + ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid); + ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid); + ms->m_exflags = le32_to_cpu(ms->m_exflags); + ms->m_sbflags = le32_to_cpu(ms->m_sbflags); + ms->m_flags = le32_to_cpu(ms->m_flags); + ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq); + ms->m_hash = le32_to_cpu(ms->m_hash); + ms->m_status = le32_to_cpu(ms->m_status); + ms->m_grmode = le32_to_cpu(ms->m_grmode); + ms->m_rqmode = le32_to_cpu(ms->m_rqmode); + ms->m_bastmode = le32_to_cpu(ms->m_bastmode); + ms->m_asts = le32_to_cpu(ms->m_asts); + ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result)); +} + +void dlm_rcom_out(struct dlm_rcom *rc) +{ + header_out(&rc->rc_header); + + rc->rc_type = cpu_to_le32(rc->rc_type); + rc->rc_result = cpu_to_le32(rc->rc_result); + rc->rc_id = cpu_to_le64(rc->rc_id); + rc->rc_seq = cpu_to_le64(rc->rc_seq); + rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); +} + +void dlm_rcom_in(struct dlm_rcom *rc) +{ + header_in(&rc->rc_header); + + rc->rc_type = le32_to_cpu(rc->rc_type); + rc->rc_result = le32_to_cpu(rc->rc_result); + rc->rc_id = le64_to_cpu(rc->rc_id); + rc->rc_seq = le64_to_cpu(rc->rc_seq); + rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); +} diff --git a/kernel/fs/dlm/util.h b/kernel/fs/dlm/util.h new file mode 100644 index 000000000..2b2591516 --- /dev/null +++ b/kernel/fs/dlm/util.h @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +void dlm_message_out(struct dlm_message *ms); +void dlm_message_in(struct dlm_message *ms); +void dlm_rcom_out(struct dlm_rcom *rc); +void dlm_rcom_in(struct dlm_rcom *rc); + +#endif + diff --git a/kernel/fs/drop_caches.c b/kernel/fs/drop_caches.c new file mode 100644 index 000000000..5718cb9f7 --- /dev/null +++ b/kernel/fs/drop_caches.c @@ -0,0 +1,67 @@ +/* + * Implement the manual drop-all-pagecache function + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* A global variable is a bit ugly, but it keeps the code simple */ +int sysctl_drop_caches; + +static void drop_pagecache_sb(struct super_block *sb, void *unused) +{ + struct inode *inode, *toput_inode = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + invalidate_mapping_pages(inode->i_mapping, 0, -1); + iput(toput_inode); + toput_inode = inode; + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(toput_inode); +} + +int drop_caches_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; + if (write) { + static int stfu; + + if (sysctl_drop_caches & 1) { + iterate_supers(drop_pagecache_sb, NULL); + count_vm_event(DROP_PAGECACHE); + } + if (sysctl_drop_caches & 2) { + drop_slab(); + count_vm_event(DROP_SLAB); + } + if (!stfu) { + pr_info("%s (%d): drop_caches: %d\n", + current->comm, task_pid_nr(current), + sysctl_drop_caches); + } + stfu |= sysctl_drop_caches & 4; + } + return 0; +} diff --git a/kernel/fs/ecryptfs/Kconfig b/kernel/fs/ecryptfs/Kconfig new file mode 100644 index 000000000..434aa313f --- /dev/null +++ b/kernel/fs/ecryptfs/Kconfig @@ -0,0 +1,22 @@ +config ECRYPT_FS + tristate "eCrypt filesystem layer support" + depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) + select CRYPTO_ECB + select CRYPTO_CBC + select CRYPTO_MD5 + help + Encrypted filesystem that operates on the VFS layer. See + to learn more about + eCryptfs. Userspace components are required and can be + obtained from . + + To compile this file system support as a module, choose M here: the + module will be called ecryptfs. + +config ECRYPT_FS_MESSAGING + bool "Enable notifications for userspace key wrap/unwrap" + depends on ECRYPT_FS + help + Enables the /dev/ecryptfs entry for use by ecryptfsd. This allows + for userspace to wrap/unwrap file encryption keys by other + backends, like OpenSSL. diff --git a/kernel/fs/ecryptfs/Makefile b/kernel/fs/ecryptfs/Makefile new file mode 100644 index 000000000..49678a699 --- /dev/null +++ b/kernel/fs/ecryptfs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux eCryptfs +# + +obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o + +ecryptfs-y := dentry.o file.o inode.o main.o super.o mmap.o read_write.o \ + crypto.o keystore.o kthread.o debug.o + +ecryptfs-$(CONFIG_ECRYPT_FS_MESSAGING) += messaging.o miscdev.o diff --git a/kernel/fs/ecryptfs/crypto.c b/kernel/fs/ecryptfs/crypto.c new file mode 100644 index 000000000..97315f2f6 --- /dev/null +++ b/kernel/fs/ecryptfs/crypto.c @@ -0,0 +1,2166 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +#define DECRYPT 0 +#define ENCRYPT 1 + +/** + * ecryptfs_to_hex + * @dst: Buffer to take hex character representation of contents of + * src; must be at least of size (src_size * 2) + * @src: Buffer to be converted to a hex string respresentation + * @src_size: number of bytes to convert + */ +void ecryptfs_to_hex(char *dst, char *src, size_t src_size) +{ + int x; + + for (x = 0; x < src_size; x++) + sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]); +} + +/** + * ecryptfs_from_hex + * @dst: Buffer to take the bytes from src hex; must be at least of + * size (src_size / 2) + * @src: Buffer to be converted from a hex string respresentation to raw value + * @dst_size: size of dst buffer, or number of hex characters pairs to convert + */ +void ecryptfs_from_hex(char *dst, char *src, int dst_size) +{ + int x; + char tmp[3] = { 0, }; + + for (x = 0; x < dst_size; x++) { + tmp[0] = src[x * 2]; + tmp[1] = src[x * 2 + 1]; + dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16); + } +} + +/** + * ecryptfs_calculate_md5 - calculates the md5 of @src + * @dst: Pointer to 16 bytes of allocated memory + * @crypt_stat: Pointer to crypt_stat struct for the current inode + * @src: Data to be md5'd + * @len: Length of @src + * + * Uses the allocated crypto context that crypt_stat references to + * generate the MD5 sum of the contents of src. + */ +static int ecryptfs_calculate_md5(char *dst, + struct ecryptfs_crypt_stat *crypt_stat, + char *src, int len) +{ + struct scatterlist sg; + struct hash_desc desc = { + .tfm = crypt_stat->hash_tfm, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + mutex_lock(&crypt_stat->cs_hash_tfm_mutex); + sg_init_one(&sg, (u8 *)src, len); + if (!desc.tfm) { + desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) { + rc = PTR_ERR(desc.tfm); + ecryptfs_printk(KERN_ERR, "Error attempting to " + "allocate crypto context; rc = [%d]\n", + rc); + goto out; + } + crypt_stat->hash_tfm = desc.tfm; + } + rc = crypto_hash_init(&desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } + rc = crypto_hash_update(&desc, &sg, len); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } + rc = crypto_hash_final(&desc, dst); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } +out: + mutex_unlock(&crypt_stat->cs_hash_tfm_mutex); + return rc; +} + +static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, + char *cipher_name, + char *chaining_modifier) +{ + int cipher_name_len = strlen(cipher_name); + int chaining_modifier_len = strlen(chaining_modifier); + int algified_name_len; + int rc; + + algified_name_len = (chaining_modifier_len + cipher_name_len + 3); + (*algified_name) = kmalloc(algified_name_len, GFP_KERNEL); + if (!(*algified_name)) { + rc = -ENOMEM; + goto out; + } + snprintf((*algified_name), algified_name_len, "%s(%s)", + chaining_modifier, cipher_name); + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_derive_iv + * @iv: destination for the derived iv vale + * @crypt_stat: Pointer to crypt_stat struct for the current inode + * @offset: Offset of the extent whose IV we are to derive + * + * Generate the initialization vector from the given root IV and page + * offset. + * + * Returns zero on success; non-zero on error. + */ +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) +{ + int rc = 0; + char dst[MD5_DIGEST_SIZE]; + char src[ECRYPTFS_MAX_IV_BYTES + 16]; + + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "root iv:\n"); + ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes); + } + /* TODO: It is probably secure to just cast the least + * significant bits of the root IV into an unsigned long and + * add the offset to that rather than go through all this + * hashing business. -Halcrow */ + memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes); + memset((src + crypt_stat->iv_bytes), 0, 16); + snprintf((src + crypt_stat->iv_bytes), 16, "%lld", offset); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "source:\n"); + ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16)); + } + rc = ecryptfs_calculate_md5(dst, crypt_stat, src, + (crypt_stat->iv_bytes + 16)); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to compute " + "MD5 while generating IV for a page\n"); + goto out; + } + memcpy(iv, dst, crypt_stat->iv_bytes); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "derived iv:\n"); + ecryptfs_dump_hex(iv, crypt_stat->iv_bytes); + } +out: + return rc; +} + +/** + * ecryptfs_init_crypt_stat + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * + * Initialize the crypt_stat structure. + */ +void +ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +{ + memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); + INIT_LIST_HEAD(&crypt_stat->keysig_list); + mutex_init(&crypt_stat->keysig_list_mutex); + mutex_init(&crypt_stat->cs_mutex); + mutex_init(&crypt_stat->cs_tfm_mutex); + mutex_init(&crypt_stat->cs_hash_tfm_mutex); + crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; +} + +/** + * ecryptfs_destroy_crypt_stat + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * + * Releases all memory associated with a crypt_stat struct. + */ +void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +{ + struct ecryptfs_key_sig *key_sig, *key_sig_tmp; + + if (crypt_stat->tfm) + crypto_free_ablkcipher(crypt_stat->tfm); + if (crypt_stat->hash_tfm) + crypto_free_hash(crypt_stat->hash_tfm); + list_for_each_entry_safe(key_sig, key_sig_tmp, + &crypt_stat->keysig_list, crypt_stat_list) { + list_del(&key_sig->crypt_stat_list); + kmem_cache_free(ecryptfs_key_sig_cache, key_sig); + } + memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); +} + +void ecryptfs_destroy_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *auth_tok, *auth_tok_tmp; + + if (!(mount_crypt_stat->flags & ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED)) + return; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry_safe(auth_tok, auth_tok_tmp, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + list_del(&auth_tok->mount_crypt_stat_list); + if (auth_tok->global_auth_tok_key + && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) + key_put(auth_tok->global_auth_tok_key); + kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok); + } + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat)); +} + +/** + * virt_to_scatterlist + * @addr: Virtual address + * @size: Size of data; should be an even multiple of the block size + * @sg: Pointer to scatterlist array; set to NULL to obtain only + * the number of scatterlist structs required in array + * @sg_size: Max array size + * + * Fills in a scatterlist array with page references for a passed + * virtual address. + * + * Returns the number of scatterlist structs in array used + */ +int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, + int sg_size) +{ + int i = 0; + struct page *pg; + int offset; + int remainder_of_page; + + sg_init_table(sg, sg_size); + + while (size > 0 && i < sg_size) { + pg = virt_to_page(addr); + offset = offset_in_page(addr); + sg_set_page(&sg[i], pg, 0, offset); + remainder_of_page = PAGE_CACHE_SIZE - offset; + if (size >= remainder_of_page) { + sg[i].length = remainder_of_page; + addr += remainder_of_page; + size -= remainder_of_page; + } else { + sg[i].length = size; + addr += size; + size = 0; + } + i++; + } + if (size > 0) + return -ENOMEM; + return i; +} + +struct extent_crypt_result { + struct completion completion; + int rc; +}; + +static void extent_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct extent_crypt_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->rc = rc; + complete(&ecr->completion); +} + +/** + * crypt_scatterlist + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * @dst_sg: Destination of the data after performing the crypto operation + * @src_sg: Data to be encrypted or decrypted + * @size: Length of data + * @iv: IV to use + * @op: ENCRYPT or DECRYPT to indicate the desired operation + * + * Returns the number of bytes encrypted or decrypted; negative value on error + */ +static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, + struct scatterlist *dst_sg, + struct scatterlist *src_sg, int size, + unsigned char *iv, int op) +{ + struct ablkcipher_request *req = NULL; + struct extent_crypt_result ecr; + int rc = 0; + + BUG_ON(!crypt_stat || !crypt_stat->tfm + || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", + crypt_stat->key_size); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } + + init_completion(&ecr.completion); + + mutex_lock(&crypt_stat->cs_tfm_mutex); + req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); + if (!req) { + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -ENOMEM; + goto out; + } + + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + extent_crypt_complete, &ecr); + /* Consider doing this once, when the file is opened */ + if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { + rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_ERR, + "Error setting key; rc = [%d]\n", + rc); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -EINVAL; + goto out; + } + crypt_stat->flags |= ECRYPTFS_KEY_SET; + } + mutex_unlock(&crypt_stat->cs_tfm_mutex); + ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); + rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) : + crypto_ablkcipher_decrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) { + struct extent_crypt_result *ecr = req->base.data; + + wait_for_completion(&ecr->completion); + rc = ecr->rc; + reinit_completion(&ecr->completion); + } +out: + ablkcipher_request_free(req); + return rc; +} + +/** + * lower_offset_for_page + * + * Convert an eCryptfs page index into a lower byte offset + */ +static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, + struct page *page) +{ + return ecryptfs_lower_header_size(crypt_stat) + + ((loff_t)page->index << PAGE_CACHE_SHIFT); +} + +/** + * crypt_extent + * @crypt_stat: crypt_stat containing cryptographic context for the + * encryption operation + * @dst_page: The page to write the result into + * @src_page: The page to read from + * @extent_offset: Page extent offset for use in generating IV + * @op: ENCRYPT or DECRYPT to indicate the desired operation + * + * Encrypts or decrypts one extent of data. + * + * Return zero on success; non-zero otherwise + */ +static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, + struct page *src_page, + unsigned long extent_offset, int op) +{ + pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index; + loff_t extent_base; + char extent_iv[ECRYPTFS_MAX_IV_BYTES]; + struct scatterlist src_sg, dst_sg; + size_t extent_size = crypt_stat->extent_size; + int rc; + + extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size)); + rc = ecryptfs_derive_iv(extent_iv, crypt_stat, + (extent_base + extent_offset)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); + goto out; + } + + sg_init_table(&src_sg, 1); + sg_init_table(&dst_sg, 1); + + sg_set_page(&src_sg, src_page, extent_size, + extent_offset * extent_size); + sg_set_page(&dst_sg, dst_page, extent_size, + extent_offset * extent_size); + + rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size, + extent_iv, op); + if (rc < 0) { + printk(KERN_ERR "%s: Error attempting to crypt page with " + "page_index = [%ld], extent_offset = [%ld]; " + "rc = [%d]\n", __func__, page_index, extent_offset, rc); + goto out; + } + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_encrypt_page + * @page: Page mapped from the eCryptfs inode for the file; contains + * decrypted content that needs to be encrypted (to a temporary + * page; not in place) and written out to the lower file + * + * Encrypt an eCryptfs page. This is done on a per-extent basis. Note + * that eCryptfs pages may straddle the lower pages -- for instance, + * if the file was created on a machine with an 8K page size + * (resulting in an 8K header), and then the file is copied onto a + * host with a 32K page size, then when reading page 0 of the eCryptfs + * file, 24K of page 0 of the lower file will be read and decrypted, + * and then 8K of page 1 of the lower file will be read and decrypted. + * + * Returns zero on success; negative on error + */ +int ecryptfs_encrypt_page(struct page *page) +{ + struct inode *ecryptfs_inode; + struct ecryptfs_crypt_stat *crypt_stat; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; + loff_t extent_offset; + loff_t lower_offset; + int rc = 0; + + ecryptfs_inode = page->mapping->host; + crypt_stat = + &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error allocating memory for " + "encrypted extent\n"); + goto out; + } + + for (extent_offset = 0; + extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); + extent_offset++) { + rc = crypt_extent(crypt_stat, enc_extent_page, page, + extent_offset, ENCRYPT); + if (rc) { + printk(KERN_ERR "%s: Error encrypting extent; " + "rc = [%d]\n", __func__, rc); + goto out; + } + } + + lower_offset = lower_offset_for_page(crypt_stat, page); + enc_extent_virt = kmap(enc_extent_page); + rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, + PAGE_CACHE_SIZE); + kunmap(enc_extent_page); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, + "Error attempting to write lower page; rc = [%d]\n", + rc); + goto out; + } + rc = 0; +out: + if (enc_extent_page) { + __free_page(enc_extent_page); + } + return rc; +} + +/** + * ecryptfs_decrypt_page + * @page: Page mapped from the eCryptfs inode for the file; data read + * and decrypted from the lower file will be written into this + * page + * + * Decrypt an eCryptfs page. This is done on a per-extent basis. Note + * that eCryptfs pages may straddle the lower pages -- for instance, + * if the file was created on a machine with an 8K page size + * (resulting in an 8K header), and then the file is copied onto a + * host with a 32K page size, then when reading page 0 of the eCryptfs + * file, 24K of page 0 of the lower file will be read and decrypted, + * and then 8K of page 1 of the lower file will be read and decrypted. + * + * Returns zero on success; negative on error + */ +int ecryptfs_decrypt_page(struct page *page) +{ + struct inode *ecryptfs_inode; + struct ecryptfs_crypt_stat *crypt_stat; + char *page_virt; + unsigned long extent_offset; + loff_t lower_offset; + int rc = 0; + + ecryptfs_inode = page->mapping->host; + crypt_stat = + &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + + lower_offset = lower_offset_for_page(crypt_stat, page); + page_virt = kmap(page); + rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE, + ecryptfs_inode); + kunmap(page); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, + "Error attempting to read lower page; rc = [%d]\n", + rc); + goto out; + } + + for (extent_offset = 0; + extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); + extent_offset++) { + rc = crypt_extent(crypt_stat, page, page, + extent_offset, DECRYPT); + if (rc) { + printk(KERN_ERR "%s: Error encrypting extent; " + "rc = [%d]\n", __func__, rc); + goto out; + } + } +out: + return rc; +} + +#define ECRYPTFS_MAX_SCATTERLIST_LEN 4 + +/** + * ecryptfs_init_crypt_ctx + * @crypt_stat: Uninitialized crypt stats structure + * + * Initialize the crypto context. + * + * TODO: Performance: Keep a cache of initialized cipher contexts; + * only init if needed + */ +int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) +{ + char *full_alg_name; + int rc = -EINVAL; + + ecryptfs_printk(KERN_DEBUG, + "Initializing cipher [%s]; strlen = [%d]; " + "key_size_bits = [%zd]\n", + crypt_stat->cipher, (int)strlen(crypt_stat->cipher), + crypt_stat->key_size << 3); + mutex_lock(&crypt_stat->cs_tfm_mutex); + if (crypt_stat->tfm) { + rc = 0; + goto out_unlock; + } + rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, + crypt_stat->cipher, "cbc"); + if (rc) + goto out_unlock; + crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0); + if (IS_ERR(crypt_stat->tfm)) { + rc = PTR_ERR(crypt_stat->tfm); + crypt_stat->tfm = NULL; + ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " + "Error initializing cipher [%s]\n", + full_alg_name); + goto out_free; + } + crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + rc = 0; +out_free: + kfree(full_alg_name); +out_unlock: + mutex_unlock(&crypt_stat->cs_tfm_mutex); + return rc; +} + +static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat) +{ + int extent_size_tmp; + + crypt_stat->extent_mask = 0xFFFFFFFF; + crypt_stat->extent_shift = 0; + if (crypt_stat->extent_size == 0) + return; + extent_size_tmp = crypt_stat->extent_size; + while ((extent_size_tmp & 0x01) == 0) { + extent_size_tmp >>= 1; + crypt_stat->extent_mask <<= 1; + crypt_stat->extent_shift++; + } +} + +void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) +{ + /* Default values; may be overwritten as we are parsing the + * packets. */ + crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; + set_extent_mask_and_shift(crypt_stat); + crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + else { + if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) + crypt_stat->metadata_size = + ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + else + crypt_stat->metadata_size = PAGE_CACHE_SIZE; + } +} + +/** + * ecryptfs_compute_root_iv + * @crypt_stats + * + * On error, sets the root IV to all 0's. + */ +int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) +{ + int rc = 0; + char dst[MD5_DIGEST_SIZE]; + + BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); + BUG_ON(crypt_stat->iv_bytes <= 0); + if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, "Session key not valid; " + "cannot generate root IV\n"); + goto out; + } + rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to compute " + "MD5 while generating root IV\n"); + goto out; + } + memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); +out: + if (rc) { + memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); + crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING; + } + return rc; +} + +static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) +{ + get_random_bytes(crypt_stat->key, crypt_stat->key_size); + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + ecryptfs_compute_root_iv(crypt_stat); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n"); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +} + +/** + * ecryptfs_copy_mount_wide_flags_to_inode_flags + * @crypt_stat: The inode's cryptographic context + * @mount_crypt_stat: The mount point's cryptographic context + * + * This function propagates the mount-wide flags to individual inode + * flags. + */ +static void ecryptfs_copy_mount_wide_flags_to_inode_flags( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES; + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK; + else if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_FEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK; + } +} + +static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + int rc = 0; + + mutex_lock(&crypt_stat->keysig_list_mutex); + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + + list_for_each_entry(global_auth_tok, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK) + continue; + rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); + if (rc) { + printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); + goto out; + } + } + +out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + mutex_unlock(&crypt_stat->keysig_list_mutex); + return rc; +} + +/** + * ecryptfs_set_default_crypt_stat_vals + * @crypt_stat: The inode's cryptographic context + * @mount_crypt_stat: The mount point's cryptographic context + * + * Default values in the event that policy does not override them. + */ +static void ecryptfs_set_default_crypt_stat_vals( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + ecryptfs_set_default_sizes(crypt_stat); + strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER); + crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES; + crypt_stat->flags &= ~(ECRYPTFS_KEY_VALID); + crypt_stat->file_version = ECRYPTFS_FILE_VERSION; + crypt_stat->mount_crypt_stat = mount_crypt_stat; +} + +/** + * ecryptfs_new_file_context + * @ecryptfs_inode: The eCryptfs inode + * + * If the crypto context for the file has not yet been established, + * this is where we do that. Establishing a new crypto context + * involves the following decisions: + * - What cipher to use? + * - What set of authentication tokens to use? + * Here we just worry about getting enough information into the + * authentication tokens so that we know that they are available. + * We associate the available authentication tokens with the new file + * via the set of signatures in the crypt_stat struct. Later, when + * the headers are actually written out, we may again defer to + * userspace to perform the encryption of the session key; for the + * foreseeable future, this will be the case with public key packets. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_new_file_context(struct inode *ecryptfs_inode) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_inode->i_sb)->mount_crypt_stat; + int cipher_name_len; + int rc = 0; + + ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat); + crypt_stat->flags |= (ECRYPTFS_ENCRYPTED | ECRYPTFS_KEY_VALID); + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + rc = ecryptfs_copy_mount_wide_sigs_to_inode_sigs(crypt_stat, + mount_crypt_stat); + if (rc) { + printk(KERN_ERR "Error attempting to copy mount-wide key sigs " + "to the inode key sigs; rc = [%d]\n", rc); + goto out; + } + cipher_name_len = + strlen(mount_crypt_stat->global_default_cipher_name); + memcpy(crypt_stat->cipher, + mount_crypt_stat->global_default_cipher_name, + cipher_name_len); + crypt_stat->cipher[cipher_name_len] = '\0'; + crypt_stat->key_size = + mount_crypt_stat->global_default_cipher_key_size; + ecryptfs_generate_new_key(crypt_stat); + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) + ecryptfs_printk(KERN_ERR, "Error initializing cryptographic " + "context for cipher [%s]: rc = [%d]\n", + crypt_stat->cipher, rc); +out: + return rc; +} + +/** + * ecryptfs_validate_marker - check for the ecryptfs marker + * @data: The data block in which to check + * + * Returns zero if marker found; -EINVAL if not found + */ +static int ecryptfs_validate_marker(char *data) +{ + u32 m_1, m_2; + + m_1 = get_unaligned_be32(data); + m_2 = get_unaligned_be32(data + 4); + if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) + return 0; + ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " + "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2, + MAGIC_ECRYPTFS_MARKER); + ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = " + "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER)); + return -EINVAL; +} + +struct ecryptfs_flag_map_elem { + u32 file_flag; + u32 local_flag; +}; + +/* Add support for additional flags by adding elements here. */ +static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { + {0x00000001, ECRYPTFS_ENABLE_HMAC}, + {0x00000002, ECRYPTFS_ENCRYPTED}, + {0x00000004, ECRYPTFS_METADATA_IN_XATTR}, + {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES} +}; + +/** + * ecryptfs_process_flags + * @crypt_stat: The cryptographic context + * @page_virt: Source data to be parsed + * @bytes_read: Updated with the number of bytes read + * + * Returns zero on success; non-zero if the flag set is invalid + */ +static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, + char *page_virt, int *bytes_read) +{ + int rc = 0; + int i; + u32 flags; + + flags = get_unaligned_be32(page_virt); + for (i = 0; i < ((sizeof(ecryptfs_flag_map) + / sizeof(struct ecryptfs_flag_map_elem))); i++) + if (flags & ecryptfs_flag_map[i].file_flag) { + crypt_stat->flags |= ecryptfs_flag_map[i].local_flag; + } else + crypt_stat->flags &= ~(ecryptfs_flag_map[i].local_flag); + /* Version is in top 8 bits of the 32-bit flag vector */ + crypt_stat->file_version = ((flags >> 24) & 0xFF); + (*bytes_read) = 4; + return rc; +} + +/** + * write_ecryptfs_marker + * @page_virt: The pointer to in a page to begin writing the marker + * @written: Number of bytes written + * + * Marker = 0x3c81b7f5 + */ +static void write_ecryptfs_marker(char *page_virt, size_t *written) +{ + u32 m_1, m_2; + + get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER); + put_unaligned_be32(m_1, page_virt); + page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2); + put_unaligned_be32(m_2, page_virt); + (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; +} + +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) +{ + u32 flags = 0; + int i; + + for (i = 0; i < ((sizeof(ecryptfs_flag_map) + / sizeof(struct ecryptfs_flag_map_elem))); i++) + if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag) + flags |= ecryptfs_flag_map[i].file_flag; + /* Version is in top 8 bits of the 32-bit flag vector */ + flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); + put_unaligned_be32(flags, page_virt); + (*written) = 4; +} + +struct ecryptfs_cipher_code_str_map_elem { + char cipher_str[16]; + u8 cipher_code; +}; + +/* Add support for additional ciphers by adding elements here. The + * cipher_code is whatever OpenPGP applicatoins use to identify the + * ciphers. List in order of probability. */ +static struct ecryptfs_cipher_code_str_map_elem +ecryptfs_cipher_code_str_map[] = { + {"aes",RFC2440_CIPHER_AES_128 }, + {"blowfish", RFC2440_CIPHER_BLOWFISH}, + {"des3_ede", RFC2440_CIPHER_DES3_EDE}, + {"cast5", RFC2440_CIPHER_CAST_5}, + {"twofish", RFC2440_CIPHER_TWOFISH}, + {"cast6", RFC2440_CIPHER_CAST_6}, + {"aes", RFC2440_CIPHER_AES_192}, + {"aes", RFC2440_CIPHER_AES_256} +}; + +/** + * ecryptfs_code_for_cipher_string + * @cipher_name: The string alias for the cipher + * @key_bytes: Length of key in bytes; used for AES code selection + * + * Returns zero on no match, or the cipher code on match + */ +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes) +{ + int i; + u8 code = 0; + struct ecryptfs_cipher_code_str_map_elem *map = + ecryptfs_cipher_code_str_map; + + if (strcmp(cipher_name, "aes") == 0) { + switch (key_bytes) { + case 16: + code = RFC2440_CIPHER_AES_128; + break; + case 24: + code = RFC2440_CIPHER_AES_192; + break; + case 32: + code = RFC2440_CIPHER_AES_256; + } + } else { + for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) + if (strcmp(cipher_name, map[i].cipher_str) == 0) { + code = map[i].cipher_code; + break; + } + } + return code; +} + +/** + * ecryptfs_cipher_code_to_string + * @str: Destination to write out the cipher name + * @cipher_code: The code to convert to cipher name string + * + * Returns zero on success + */ +int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code) +{ + int rc = 0; + int i; + + str[0] = '\0'; + for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) + if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code) + strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str); + if (str[0] == '\0') { + ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: " + "[%d]\n", cipher_code); + rc = -EINVAL; + } + return rc; +} + +int ecryptfs_read_and_validate_header_region(struct inode *inode) +{ + u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES]; + u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES; + int rc; + + rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES, + inode); + if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return rc >= 0 ? -EINVAL : rc; + rc = ecryptfs_validate_marker(marker); + if (!rc) + ecryptfs_i_size_init(file_size, inode); + return rc; +} + +void +ecryptfs_write_header_metadata(char *virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) +{ + u32 header_extent_size; + u16 num_header_extents_at_front; + + header_extent_size = (u32)crypt_stat->extent_size; + num_header_extents_at_front = + (u16)(crypt_stat->metadata_size / crypt_stat->extent_size); + put_unaligned_be32(header_extent_size, virt); + virt += 4; + put_unaligned_be16(num_header_extents_at_front, virt); + (*written) = 6; +} + +struct kmem_cache *ecryptfs_header_cache; + +/** + * ecryptfs_write_headers_virt + * @page_virt: The virtual address to write the headers to + * @max: The size of memory allocated at page_virt + * @size: Set to the number of bytes written by this function + * @crypt_stat: The cryptographic context + * @ecryptfs_dentry: The eCryptfs dentry + * + * Format version: 1 + * + * Header Extent: + * Octets 0-7: Unencrypted file size (big-endian) + * Octets 8-15: eCryptfs special marker + * Octets 16-19: Flags + * Octet 16: File format version number (between 0 and 255) + * Octets 17-18: Reserved + * Octet 19: Bit 1 (lsb): Reserved + * Bit 2: Encrypted? + * Bits 3-8: Reserved + * Octets 20-23: Header extent size (big-endian) + * Octets 24-25: Number of header extents at front of file + * (big-endian) + * Octet 26: Begin RFC 2440 authentication token packet set + * Data Extent 0: + * Lower data (CBC encrypted) + * Data Extent 1: + * Lower data (CBC encrypted) + * ... + * + * Returns zero on success + */ +static int ecryptfs_write_headers_virt(char *page_virt, size_t max, + size_t *size, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry) +{ + int rc; + size_t written; + size_t offset; + + offset = ECRYPTFS_FILE_SIZE_BYTES; + write_ecryptfs_marker((page_virt + offset), &written); + offset += written; + ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat, + &written); + offset += written; + ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, + &written); + offset += written; + rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat, + ecryptfs_dentry, &written, + max - offset); + if (rc) + ecryptfs_printk(KERN_WARNING, "Error generating key packet " + "set; rc = [%d]\n", rc); + if (size) { + offset += written; + *size = offset; + } + return rc; +} + +static int +ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, + char *virt, size_t virt_len) +{ + int rc; + + rc = ecryptfs_write_lower(ecryptfs_inode, virt, + 0, virt_len); + if (rc < 0) + printk(KERN_ERR "%s: Error attempting to write header " + "information to lower file; rc = [%d]\n", __func__, rc); + else + rc = 0; + return rc; +} + +static int +ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, + char *page_virt, size_t size) +{ + int rc; + + rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt, + size, 0); + return rc; +} + +static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (page) + return (unsigned long) page_address(page); + return 0; +} + +/** + * ecryptfs_write_metadata + * @ecryptfs_dentry: The eCryptfs dentry, which should be negative + * @ecryptfs_inode: The newly created eCryptfs inode + * + * Write the file headers out. This will likely involve a userspace + * callout, in which the session key is encrypted with one or more + * public keys and/or the passphrase necessary to do the encryption is + * retrieved via a prompt. Exactly what happens at this point should + * be policy-dependent. + * + * Returns zero on success; non-zero on error + */ +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + unsigned int order; + char *virt; + size_t virt_len; + size_t size = 0; + int rc = 0; + + if (likely(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + printk(KERN_ERR "Key is invalid; bailing out\n"); + rc = -EINVAL; + goto out; + } + } else { + printk(KERN_WARNING "%s: Encrypted flag not set\n", + __func__); + rc = -EINVAL; + goto out; + } + virt_len = crypt_stat->metadata_size; + order = get_order(virt_len); + /* Released in this function */ + virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); + if (!virt) { + printk(KERN_ERR "%s: Out of memory\n", __func__); + rc = -ENOMEM; + goto out; + } + /* Zeroed page ensures the in-header unencrypted i_size is set to 0 */ + rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, + ecryptfs_dentry); + if (unlikely(rc)) { + printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", + __func__, rc); + goto out_free; + } + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, + size); + else + rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, + virt_len); + if (rc) { + printk(KERN_ERR "%s: Error writing metadata out to lower file; " + "rc = [%d]\n", __func__, rc); + goto out_free; + } +out_free: + free_pages((unsigned long)virt, order); +out: + return rc; +} + +#define ECRYPTFS_DONT_VALIDATE_HEADER_SIZE 0 +#define ECRYPTFS_VALIDATE_HEADER_SIZE 1 +static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, + char *virt, int *bytes_read, + int validate_header_size) +{ + int rc = 0; + u32 header_extent_size; + u16 num_header_extents_at_front; + + header_extent_size = get_unaligned_be32(virt); + virt += sizeof(__be32); + num_header_extents_at_front = get_unaligned_be16(virt); + crypt_stat->metadata_size = (((size_t)num_header_extents_at_front + * (size_t)header_extent_size)); + (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); + if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) + && (crypt_stat->metadata_size + < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { + rc = -EINVAL; + printk(KERN_WARNING "Invalid header size: [%zd]\n", + crypt_stat->metadata_size); + } + return rc; +} + +/** + * set_default_header_data + * @crypt_stat: The cryptographic context + * + * For version 0 file format; this function is only for backwards + * compatibility for files created with the prior versions of + * eCryptfs. + */ +static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) +{ + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; +} + +void ecryptfs_i_size_init(const char *page_virt, struct inode *inode) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; + u64 file_size; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mount_crypt_stat = + &ecryptfs_superblock_to_private(inode->i_sb)->mount_crypt_stat; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { + file_size = i_size_read(ecryptfs_inode_to_lower(inode)); + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + file_size += crypt_stat->metadata_size; + } else + file_size = get_unaligned_be64(page_virt); + i_size_write(inode, (loff_t)file_size); + crypt_stat->flags |= ECRYPTFS_I_SIZE_INITIALIZED; +} + +/** + * ecryptfs_read_headers_virt + * @page_virt: The virtual address into which to read the headers + * @crypt_stat: The cryptographic context + * @ecryptfs_dentry: The eCryptfs dentry + * @validate_header_size: Whether to validate the header size while reading + * + * Read/parse the header data. The header format is detailed in the + * comment block for the ecryptfs_write_headers_virt() function. + * + * Returns zero on success + */ +static int ecryptfs_read_headers_virt(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, + int validate_header_size) +{ + int rc = 0; + int offset; + int bytes_read; + + ecryptfs_set_default_sizes(crypt_stat); + crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + offset = ECRYPTFS_FILE_SIZE_BYTES; + rc = ecryptfs_validate_marker(page_virt + offset); + if (rc) + goto out; + if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) + ecryptfs_i_size_init(page_virt, d_inode(ecryptfs_dentry)); + offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; + rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), + &bytes_read); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error processing flags\n"); + goto out; + } + if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) { + ecryptfs_printk(KERN_WARNING, "File version is [%d]; only " + "file version [%d] is supported by this " + "version of eCryptfs\n", + crypt_stat->file_version, + ECRYPTFS_SUPPORTED_FILE_VERSION); + rc = -EINVAL; + goto out; + } + offset += bytes_read; + if (crypt_stat->file_version >= 1) { + rc = parse_header_metadata(crypt_stat, (page_virt + offset), + &bytes_read, validate_header_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error reading header " + "metadata; rc = [%d]\n", rc); + } + offset += bytes_read; + } else + set_default_header_data(crypt_stat); + rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset), + ecryptfs_dentry); +out: + return rc; +} + +/** + * ecryptfs_read_xattr_region + * @page_virt: The vitual address into which to read the xattr data + * @ecryptfs_inode: The eCryptfs inode + * + * Attempts to read the crypto metadata from the extended attribute + * region of the lower file. + * + * Returns zero on success; non-zero on error + */ +int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode) +{ + struct dentry *lower_dentry = + ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry; + ssize_t size; + int rc = 0; + + size = ecryptfs_getxattr_lower(lower_dentry, ECRYPTFS_XATTR_NAME, + page_virt, ECRYPTFS_DEFAULT_EXTENT_SIZE); + if (size < 0) { + if (unlikely(ecryptfs_verbosity > 0)) + printk(KERN_INFO "Error attempting to read the [%s] " + "xattr from the lower file; return value = " + "[%zd]\n", ECRYPTFS_XATTR_NAME, size); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, + struct inode *inode) +{ + u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES]; + u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES; + int rc; + + rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), + ECRYPTFS_XATTR_NAME, file_size, + ECRYPTFS_SIZE_AND_MARKER_BYTES); + if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return rc >= 0 ? -EINVAL : rc; + rc = ecryptfs_validate_marker(marker); + if (!rc) + ecryptfs_i_size_init(file_size, inode); + return rc; +} + +/** + * ecryptfs_read_metadata + * + * Common entry point for reading file metadata. From here, we could + * retrieve the header information from the header region of the file, + * the xattr region of the file, or some other repostory that is + * stored separately from the file itself. The current implementation + * supports retrieving the metadata information from the file contents + * and from the xattr region. + * + * Returns zero if valid headers found and parsed; non-zero otherwise + */ +int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) +{ + int rc; + char *page_virt; + struct inode *ecryptfs_inode = d_inode(ecryptfs_dentry); + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + /* Read the first page from the underlying file */ + page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER); + if (!page_virt) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Unable to allocate page_virt\n", + __func__); + goto out; + } + rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, + ecryptfs_inode); + if (rc >= 0) + rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry, + ECRYPTFS_VALIDATE_HEADER_SIZE); + if (rc) { + /* metadata is not in the file header, so try xattrs */ + memset(page_virt, 0, PAGE_CACHE_SIZE); + rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); + if (rc) { + printk(KERN_DEBUG "Valid eCryptfs headers not found in " + "file header region or xattr region, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry, + ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); + if (rc) { + printk(KERN_DEBUG "Valid eCryptfs headers not found in " + "file xattr region either, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + } + if (crypt_stat->mount_crypt_stat->flags + & ECRYPTFS_XATTR_METADATA_ENABLED) { + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } else { + printk(KERN_WARNING "Attempt to access file with " + "crypto metadata only in the extended attribute " + "region, but eCryptfs was mounted without " + "xattr support enabled. eCryptfs will not treat " + "this like an encrypted file, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + } + } +out: + if (page_virt) { + memset(page_virt, 0, PAGE_CACHE_SIZE); + kmem_cache_free(ecryptfs_header_cache, page_virt); + } + return rc; +} + +/** + * ecryptfs_encrypt_filename - encrypt filename + * + * CBC-encrypts the filename. We do not want to encrypt the same + * filename with the same key and IV, which may happen with hard + * links, so we prepend random bits to each filename. + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + int rc = 0; + + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + size_t packet_size; + size_t remaining_bytes; + + rc = ecryptfs_write_tag_70_packet( + NULL, NULL, + &filename->encrypted_filename_size, + mount_crypt_stat, NULL, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to get packet " + "size for tag 72; rc = [%d]\n", __func__, + rc); + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename = + kmalloc(filename->encrypted_filename_size, GFP_KERNEL); + if (!filename->encrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + filename->encrypted_filename_size); + rc = -ENOMEM; + goto out; + } + remaining_bytes = filename->encrypted_filename_size; + rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename, + &remaining_bytes, + &packet_size, + mount_crypt_stat, + filename->filename, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to generate " + "tag 70 packet; rc = [%d]\n", __func__, + rc); + kfree(filename->encrypted_filename); + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename_size = packet_size; + } else { + printk(KERN_ERR "%s: No support for requested filename " + "encryption method in this release\n", __func__); + rc = -EOPNOTSUPP; + goto out; + } +out: + return rc; +} + +static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, + const char *name, size_t name_size) +{ + int rc = 0; + + (*copied_name) = kmalloc((name_size + 1), GFP_KERNEL); + if (!(*copied_name)) { + rc = -ENOMEM; + goto out; + } + memcpy((void *)(*copied_name), (void *)name, name_size); + (*copied_name)[(name_size)] = '\0'; /* Only for convenience + * in printing out the + * string in debug + * messages */ + (*copied_name_size) = name_size; +out: + return rc; +} + +/** + * ecryptfs_process_key_cipher - Perform key cipher initialization. + * @key_tfm: Crypto context for key material, set by this function + * @cipher_name: Name of the cipher + * @key_size: Size of the key in bytes + * + * Returns zero on success. Any crypto_tfm structs allocated here + * should be released by other functions, such as on a superblock put + * event, regardless of whether this function succeeds for fails. + */ +static int +ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, + char *cipher_name, size_t *key_size) +{ + char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; + char *full_alg_name = NULL; + int rc; + + *key_tfm = NULL; + if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { + rc = -EINVAL; + printk(KERN_ERR "Requested key size is [%zd] bytes; maximum " + "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); + goto out; + } + rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, cipher_name, + "ecb"); + if (rc) + goto out; + *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*key_tfm)) { + rc = PTR_ERR(*key_tfm); + printk(KERN_ERR "Unable to allocate crypto cipher with name " + "[%s]; rc = [%d]\n", full_alg_name, rc); + goto out; + } + crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); + if (*key_size == 0) { + struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm); + + *key_size = alg->max_keysize; + } + get_random_bytes(dummy_key, *key_size); + rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); + if (rc) { + printk(KERN_ERR "Error attempting to set key of size [%zd] for " + "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name, + rc); + rc = -EINVAL; + goto out; + } +out: + kfree(full_alg_name); + return rc; +} + +struct kmem_cache *ecryptfs_key_tfm_cache; +static struct list_head key_tfm_list; +struct mutex key_tfm_list_mutex; + +int __init ecryptfs_init_crypto(void) +{ + mutex_init(&key_tfm_list_mutex); + INIT_LIST_HEAD(&key_tfm_list); + return 0; +} + +/** + * ecryptfs_destroy_crypto - free all cached key_tfms on key_tfm_list + * + * Called only at module unload time + */ +int ecryptfs_destroy_crypto(void) +{ + struct ecryptfs_key_tfm *key_tfm, *key_tfm_tmp; + + mutex_lock(&key_tfm_list_mutex); + list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list, + key_tfm_list) { + list_del(&key_tfm->key_tfm_list); + if (key_tfm->key_tfm) + crypto_free_blkcipher(key_tfm->key_tfm); + kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm); + } + mutex_unlock(&key_tfm_list_mutex); + return 0; +} + +int +ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, + size_t key_size) +{ + struct ecryptfs_key_tfm *tmp_tfm; + int rc = 0; + + BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); + + tmp_tfm = kmem_cache_alloc(ecryptfs_key_tfm_cache, GFP_KERNEL); + if (key_tfm != NULL) + (*key_tfm) = tmp_tfm; + if (!tmp_tfm) { + rc = -ENOMEM; + printk(KERN_ERR "Error attempting to allocate from " + "ecryptfs_key_tfm_cache\n"); + goto out; + } + mutex_init(&tmp_tfm->key_tfm_mutex); + strncpy(tmp_tfm->cipher_name, cipher_name, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + tmp_tfm->cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + tmp_tfm->key_size = key_size; + rc = ecryptfs_process_key_cipher(&tmp_tfm->key_tfm, + tmp_tfm->cipher_name, + &tmp_tfm->key_size); + if (rc) { + printk(KERN_ERR "Error attempting to initialize key TFM " + "cipher with name = [%s]; rc = [%d]\n", + tmp_tfm->cipher_name, rc); + kmem_cache_free(ecryptfs_key_tfm_cache, tmp_tfm); + if (key_tfm != NULL) + (*key_tfm) = NULL; + goto out; + } + list_add(&tmp_tfm->key_tfm_list, &key_tfm_list); +out: + return rc; +} + +/** + * ecryptfs_tfm_exists - Search for existing tfm for cipher_name. + * @cipher_name: the name of the cipher to search for + * @key_tfm: set to corresponding tfm if found + * + * Searches for cached key_tfm matching @cipher_name + * Must be called with &key_tfm_list_mutex held + * Returns 1 if found, with @key_tfm set + * Returns 0 if not found, with @key_tfm set to NULL + */ +int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm) +{ + struct ecryptfs_key_tfm *tmp_key_tfm; + + BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); + + list_for_each_entry(tmp_key_tfm, &key_tfm_list, key_tfm_list) { + if (strcmp(tmp_key_tfm->cipher_name, cipher_name) == 0) { + if (key_tfm) + (*key_tfm) = tmp_key_tfm; + return 1; + } + } + if (key_tfm) + (*key_tfm) = NULL; + return 0; +} + +/** + * ecryptfs_get_tfm_and_mutex_for_cipher_name + * + * @tfm: set to cached tfm found, or new tfm created + * @tfm_mutex: set to mutex for cached tfm found, or new tfm created + * @cipher_name: the name of the cipher to search for and/or add + * + * Sets pointers to @tfm & @tfm_mutex matching @cipher_name. + * Searches for cached item first, and creates new if not found. + * Returns 0 on success, non-zero if adding new cipher failed + */ +int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, + struct mutex **tfm_mutex, + char *cipher_name) +{ + struct ecryptfs_key_tfm *key_tfm; + int rc = 0; + + (*tfm) = NULL; + (*tfm_mutex) = NULL; + + mutex_lock(&key_tfm_list_mutex); + if (!ecryptfs_tfm_exists(cipher_name, &key_tfm)) { + rc = ecryptfs_add_new_key_tfm(&key_tfm, cipher_name, 0); + if (rc) { + printk(KERN_ERR "Error adding new key_tfm to list; " + "rc = [%d]\n", rc); + goto out; + } + } + (*tfm) = key_tfm->key_tfm; + (*tfm_mutex) = &key_tfm->key_tfm_mutex; +out: + mutex_unlock(&key_tfm_list_mutex); + return rc; +} + +/* 64 characters forming a 6-bit target field */ +static unsigned char *portable_filename_chars = ("-.0123456789ABCD" + "EFGHIJKLMNOPQRST" + "UVWXYZabcdefghij" + "klmnopqrstuvwxyz"); + +/* We could either offset on every reverse map or just pad some 0x00's + * at the front here */ +static const unsigned char filename_rev_map[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */ + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */ + 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */ + 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */ + 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */ + 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */ + 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */ + 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ + 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ + 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ + 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ +}; + +/** + * ecryptfs_encode_for_filename + * @dst: Destination location for encoded filename + * @dst_size: Size of the encoded filename in bytes + * @src: Source location for the filename to encode + * @src_size: Size of the source in bytes + */ +static void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, + unsigned char *src, size_t src_size) +{ + size_t num_blocks; + size_t block_num = 0; + size_t dst_offset = 0; + unsigned char last_block[3]; + + if (src_size == 0) { + (*dst_size) = 0; + goto out; + } + num_blocks = (src_size / 3); + if ((src_size % 3) == 0) { + memcpy(last_block, (&src[src_size - 3]), 3); + } else { + num_blocks++; + last_block[2] = 0x00; + switch (src_size % 3) { + case 1: + last_block[0] = src[src_size - 1]; + last_block[1] = 0x00; + break; + case 2: + last_block[0] = src[src_size - 2]; + last_block[1] = src[src_size - 1]; + } + } + (*dst_size) = (num_blocks * 4); + if (!dst) + goto out; + while (block_num < num_blocks) { + unsigned char *src_block; + unsigned char dst_block[4]; + + if (block_num == (num_blocks - 1)) + src_block = last_block; + else + src_block = &src[block_num * 3]; + dst_block[0] = ((src_block[0] >> 2) & 0x3F); + dst_block[1] = (((src_block[0] << 4) & 0x30) + | ((src_block[1] >> 4) & 0x0F)); + dst_block[2] = (((src_block[1] << 2) & 0x3C) + | ((src_block[2] >> 6) & 0x03)); + dst_block[3] = (src_block[2] & 0x3F); + dst[dst_offset++] = portable_filename_chars[dst_block[0]]; + dst[dst_offset++] = portable_filename_chars[dst_block[1]]; + dst[dst_offset++] = portable_filename_chars[dst_block[2]]; + dst[dst_offset++] = portable_filename_chars[dst_block[3]]; + block_num++; + } +out: + return; +} + +static size_t ecryptfs_max_decoded_size(size_t encoded_size) +{ + /* Not exact; conservatively long. Every block of 4 + * encoded characters decodes into a block of 3 + * decoded characters. This segment of code provides + * the caller with the maximum amount of allocated + * space that @dst will need to point to in a + * subsequent call. */ + return ((encoded_size + 1) * 3) / 4; +} + +/** + * ecryptfs_decode_from_filename + * @dst: If NULL, this function only sets @dst_size and returns. If + * non-NULL, this function decodes the encoded octets in @src + * into the memory that @dst points to. + * @dst_size: Set to the size of the decoded string. + * @src: The encoded set of octets to decode. + * @src_size: The size of the encoded set of octets to decode. + */ +static void +ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, + const unsigned char *src, size_t src_size) +{ + u8 current_bit_offset = 0; + size_t src_byte_offset = 0; + size_t dst_byte_offset = 0; + + if (dst == NULL) { + (*dst_size) = ecryptfs_max_decoded_size(src_size); + goto out; + } + while (src_byte_offset < src_size) { + unsigned char src_byte = + filename_rev_map[(int)src[src_byte_offset]]; + + switch (current_bit_offset) { + case 0: + dst[dst_byte_offset] = (src_byte << 2); + current_bit_offset = 6; + break; + case 6: + dst[dst_byte_offset++] |= (src_byte >> 4); + dst[dst_byte_offset] = ((src_byte & 0xF) + << 4); + current_bit_offset = 4; + break; + case 4: + dst[dst_byte_offset++] |= (src_byte >> 2); + dst[dst_byte_offset] = (src_byte << 6); + current_bit_offset = 2; + break; + case 2: + dst[dst_byte_offset++] |= (src_byte); + current_bit_offset = 0; + break; + } + src_byte_offset++; + } + (*dst_size) = dst_byte_offset; +out: + return; +} + +/** + * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text + * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @name: The plaintext name + * @length: The length of the plaintext + * @encoded_name: The encypted name + * + * Encrypts and encodes a filename into something that constitutes a + * valid filename for a filesystem, with printable characters. + * + * We assume that we have a properly initialized crypto context, + * pointed to by crypt_stat->tfm. + * + * Returns zero on success; non-zero on otherwise + */ +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size) +{ + size_t encoded_name_no_prefix_size; + int rc = 0; + + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { + struct ecryptfs_filename *filename; + + filename = kzalloc(sizeof(*filename), GFP_KERNEL); + if (!filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + sizeof(*filename)); + rc = -ENOMEM; + goto out; + } + filename->filename = (char *)name; + filename->filename_size = name_size; + rc = ecryptfs_encrypt_filename(filename, crypt_stat, + mount_crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt " + "filename; rc = [%d]\n", __func__, rc); + kfree(filename); + goto out; + } + ecryptfs_encode_for_filename( + NULL, &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + else + (*encoded_name_size) = + (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); + if (!(*encoded_name)) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + (*encoded_name_size)); + rc = -ENOMEM; + kfree(filename->encrypted_filename); + kfree(filename); + goto out; + } + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + memcpy((*encoded_name), + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); + ecryptfs_encode_for_filename( + ((*encoded_name) + + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE), + &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name)[(*encoded_name_size)] = '\0'; + } else { + rc = -EOPNOTSUPP; + } + if (rc) { + printk(KERN_ERR "%s: Error attempting to encode " + "encrypted filename; rc = [%d]\n", __func__, + rc); + kfree((*encoded_name)); + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + } + kfree(filename->encrypted_filename); + kfree(filename); + } else { + rc = ecryptfs_copy_filename(encoded_name, + encoded_name_size, + name, name_size); + } +out: + return rc; +} + +/** + * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext + * @plaintext_name: The plaintext name + * @plaintext_name_size: The plaintext name size + * @ecryptfs_dir_dentry: eCryptfs directory dentry + * @name: The filename in cipher text + * @name_size: The cipher text name size + * + * Decrypts and decodes the filename. + * + * Returns zero on error; non-zero otherwise + */ +int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, + size_t *plaintext_name_size, + struct super_block *sb, + const char *name, size_t name_size) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; + char *decoded_name; + size_t decoded_name_size; + size_t packet_size; + int rc = 0; + + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { + const char *orig_name = name; + size_t orig_name_size = name_size; + + name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + ecryptfs_decode_from_filename(NULL, &decoded_name_size, + name, name_size); + decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); + if (!decoded_name) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + decoded_name_size); + rc = -ENOMEM; + goto out; + } + ecryptfs_decode_from_filename(decoded_name, &decoded_name_size, + name, name_size); + rc = ecryptfs_parse_tag_70_packet(plaintext_name, + plaintext_name_size, + &packet_size, + mount_crypt_stat, + decoded_name, + decoded_name_size); + if (rc) { + printk(KERN_INFO "%s: Could not parse tag 70 packet " + "from filename; copying through filename " + "as-is\n", __func__); + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + orig_name, orig_name_size); + goto out_free; + } + } else { + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + name, name_size); + goto out; + } +out_free: + kfree(decoded_name); +out: + return rc; +} + +#define ENC_NAME_MAX_BLOCKLEN_8_OR_16 143 + +int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct blkcipher_desc desc; + struct mutex *tfm_mutex; + size_t cipher_blocksize; + int rc; + + if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { + (*namelen) = lower_namelen; + return 0; + } + + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + (*namelen) = 0; + return rc; + } + + mutex_lock(tfm_mutex); + cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm); + mutex_unlock(tfm_mutex); + + /* Return an exact amount for the common cases */ + if (lower_namelen == NAME_MAX + && (cipher_blocksize == 8 || cipher_blocksize == 16)) { + (*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16; + return 0; + } + + /* Return a safe estimate for the uncommon cases */ + (*namelen) = lower_namelen; + (*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + /* Since this is the max decoded size, subtract 1 "decoded block" len */ + (*namelen) = ecryptfs_max_decoded_size(*namelen) - 3; + (*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE; + (*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES; + /* Worst case is that the filename is padded nearly a full block size */ + (*namelen) -= cipher_blocksize - 1; + + if ((*namelen) < 0) + (*namelen) = 0; + + return 0; +} diff --git a/kernel/fs/ecryptfs/debug.c b/kernel/fs/ecryptfs/debug.c new file mode 100644 index 000000000..3d2bdf546 --- /dev/null +++ b/kernel/fs/ecryptfs/debug.c @@ -0,0 +1,121 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * Functions only useful for debugging. + * + * Copyright (C) 2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_dump_auth_tok - debug function to print auth toks + * + * This function will print the contents of an ecryptfs authentication + * token. + */ +void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok) +{ + char salt[ECRYPTFS_SALT_SIZE * 2 + 1]; + char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; + + ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n", + auth_tok); + if (auth_tok->flags & ECRYPTFS_PRIVATE_KEY) { + ecryptfs_printk(KERN_DEBUG, " * private key type\n"); + } else { + ecryptfs_printk(KERN_DEBUG, " * passphrase type\n"); + ecryptfs_to_hex(salt, auth_tok->token.password.salt, + ECRYPTFS_SALT_SIZE); + salt[ECRYPTFS_SALT_SIZE * 2] = '\0'; + ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt); + if (auth_tok->token.password.flags & + ECRYPTFS_PERSISTENT_PASSWORD) { + ecryptfs_printk(KERN_DEBUG, " * persistent\n"); + } + memcpy(sig, auth_tok->token.password.signature, + ECRYPTFS_SIG_SIZE_HEX); + sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig); + } + ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n", + auth_tok->session_key.flags); + if (auth_tok->session_key.flags + & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT) + ecryptfs_printk(KERN_DEBUG, + " * Userspace decrypt request set\n"); + if (auth_tok->session_key.flags + & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT) + ecryptfs_printk(KERN_DEBUG, + " * Userspace encrypt request set\n"); + if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) { + ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n"); + ecryptfs_printk(KERN_DEBUG, + " * session_key.decrypted_key_size = [0x%x]\n", + auth_tok->session_key.decrypted_key_size); + ecryptfs_printk(KERN_DEBUG, " * Decrypted session key " + "dump:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(auth_tok->session_key.decrypted_key, + ECRYPTFS_DEFAULT_KEY_BYTES); + } + if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) { + ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n"); + ecryptfs_printk(KERN_DEBUG, + " * session_key.encrypted_key_size = [0x%x]\n", + auth_tok->session_key.encrypted_key_size); + ecryptfs_printk(KERN_DEBUG, " * Encrypted session key " + "dump:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(auth_tok->session_key.encrypted_key, + auth_tok->session_key. + encrypted_key_size); + } +} + +/** + * ecryptfs_dump_hex - debug hex printer + * @data: string of bytes to be printed + * @bytes: number of bytes to print + * + * Dump hexadecimal representation of char array + */ +void ecryptfs_dump_hex(char *data, int bytes) +{ + int i = 0; + int add_newline = 1; + + if (ecryptfs_verbosity < 1) + return; + if (bytes != 0) { + printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]); + i++; + } + while (i < bytes) { + printk("0x%.2x.", (unsigned char)data[i]); + i++; + if (i % 16 == 0) { + printk("\n"); + add_newline = 0; + } else + add_newline = 1; + } + if (add_newline) + printk("\n"); +} + diff --git a/kernel/fs/ecryptfs/dentry.c b/kernel/fs/ecryptfs/dentry.c new file mode 100644 index 000000000..8db0b4644 --- /dev/null +++ b/kernel/fs/ecryptfs/dentry.c @@ -0,0 +1,92 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_d_revalidate - revalidate an ecryptfs dentry + * @dentry: The ecryptfs dentry + * @flags: lookup flags + * + * Called when the VFS needs to revalidate a dentry. This + * is called whenever a name lookup finds a dentry in the + * dcache. Most filesystems leave this as NULL, because all their + * dentries in the dcache are valid. + * + * Returns 1 if valid, 0 otherwise. + * + */ +static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + int rc; + + if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) + return 1; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (d_really_is_positive(dentry)) { + struct inode *lower_inode = + ecryptfs_inode_to_lower(d_inode(dentry)); + + fsstack_copy_attr_all(d_inode(dentry), lower_inode); + } + return rc; +} + +struct kmem_cache *ecryptfs_dentry_info_cache; + +static void ecryptfs_dentry_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(ecryptfs_dentry_info_cache, + container_of(head, struct ecryptfs_dentry_info, rcu)); +} + +/** + * ecryptfs_d_release + * @dentry: The ecryptfs dentry + * + * Called when a dentry is really deallocated. + */ +static void ecryptfs_d_release(struct dentry *dentry) +{ + struct ecryptfs_dentry_info *p = dentry->d_fsdata; + if (p) { + path_put(&p->lower_path); + call_rcu(&p->rcu, ecryptfs_dentry_free_rcu); + } +} + +const struct dentry_operations ecryptfs_dops = { + .d_revalidate = ecryptfs_d_revalidate, + .d_release = ecryptfs_d_release, +}; diff --git a/kernel/fs/ecryptfs/ecryptfs_kernel.h b/kernel/fs/ecryptfs/ecryptfs_kernel.h new file mode 100644 index 000000000..5ba029e62 --- /dev/null +++ b/kernel/fs/ecryptfs/ecryptfs_kernel.h @@ -0,0 +1,721 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * Kernel declarations. + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Trevor S. Highland + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#ifndef ECRYPTFS_KERNEL_H +#define ECRYPTFS_KERNEL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ECRYPTFS_DEFAULT_IV_BYTES 16 +#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 +#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 +#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 +#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ +#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) +#define ECRYPTFS_DEFAULT_NUM_USERS 4 +#define ECRYPTFS_MAX_NUM_USERS 32768 +#define ECRYPTFS_XATTR_NAME "user.ecryptfs" + +void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); +extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); +extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); + +struct ecryptfs_key_record { + unsigned char type; + size_t enc_key_size; + unsigned char sig[ECRYPTFS_SIG_SIZE]; + unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; +}; + +struct ecryptfs_auth_tok_list { + struct ecryptfs_auth_tok *auth_tok; + struct list_head list; +}; + +struct ecryptfs_crypt_stat; +struct ecryptfs_mount_crypt_stat; + +struct ecryptfs_page_crypt_context { + struct page *page; +#define ECRYPTFS_PREPARE_COMMIT_MODE 0 +#define ECRYPTFS_WRITEPAGE_MODE 1 + unsigned int mode; + union { + struct file *lower_file; + struct writeback_control *wbc; + } param; +}; + +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + if (key->type == &key_type_encrypted) + return (struct ecryptfs_auth_tok *) + (&((struct encrypted_key_payload *)key->payload.data)->payload_data); + else + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return request_key(&key_type_encrypted, sig, NULL); +} + +#else +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return ERR_PTR(-ENOKEY); +} + +#endif /* CONFIG_ENCRYPTED_KEYS */ + +static inline struct ecryptfs_auth_tok * +ecryptfs_get_key_payload_data(struct key *key) +{ + struct ecryptfs_auth_tok *auth_tok; + + auth_tok = ecryptfs_get_encrypted_key_payload_data(key); + if (!auth_tok) + return (struct ecryptfs_auth_tok *) + (((struct user_key_payload *)key->payload.data)->data); + else + return auth_tok; +} + +#define ECRYPTFS_MAX_KEYSET_SIZE 1024 +#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31 +#define ECRYPTFS_MAX_NUM_ENC_KEYS 64 +#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ +#define ECRYPTFS_SALT_BYTES 2 +#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5 +#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */ +#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64)) +#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \ + + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) +#define ECRYPTFS_DEFAULT_CIPHER "aes" +#define ECRYPTFS_DEFAULT_KEY_BYTES 16 +#define ECRYPTFS_DEFAULT_HASH "md5" +#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH +#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 +#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C +#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED +#define ECRYPTFS_TAG_64_PACKET_TYPE 0x40 +#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 +#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 +#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 +#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename + * as dentry name */ +#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in + * metadata */ +#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as + * dentry name */ +#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as + * metadata */ +#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */ +#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to + * ecryptfs_parse_packet_length() and + * ecryptfs_write_packet_length() + */ +/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= + * ECRYPTFS_MAX_IV_BYTES */ +#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 +#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ +#define MD5_DIGEST_SIZE 16 +#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ + + ECRYPTFS_SIG_SIZE + 1 + 1) +#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ + + ECRYPTFS_SIG_SIZE + 1 + 1) +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 +#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) + +#ifdef CONFIG_ECRYPT_FS_MESSAGING +# define ECRYPTFS_VERSIONING_MASK_MESSAGING (ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_PUBKEY) +#else +# define ECRYPTFS_VERSIONING_MASK_MESSAGING 0 +#endif + +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ + | ECRYPTFS_VERSIONING_XATTR \ + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_MASK_MESSAGING \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) +struct ecryptfs_key_sig { + struct list_head crypt_stat_list; + char keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +struct ecryptfs_filename { + struct list_head crypt_stat_list; +#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 + u32 flags; + u32 seq_no; + char *filename; + char *encrypted_filename; + size_t filename_size; + size_t encrypted_filename_size; + char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; + char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; +}; + +/** + * This is the primary struct associated with each encrypted file. + * + * TODO: cache align/pack? + */ +struct ecryptfs_crypt_stat { +#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 +#define ECRYPTFS_POLICY_APPLIED 0x00000002 +#define ECRYPTFS_ENCRYPTED 0x00000004 +#define ECRYPTFS_SECURITY_WARNING 0x00000008 +#define ECRYPTFS_ENABLE_HMAC 0x00000010 +#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000020 +#define ECRYPTFS_KEY_VALID 0x00000040 +#define ECRYPTFS_METADATA_IN_XATTR 0x00000080 +#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000100 +#define ECRYPTFS_KEY_SET 0x00000200 +#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000400 +#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800 +#define ECRYPTFS_ENCFN_USE_FEK 0x00001000 +#define ECRYPTFS_UNLINK_SIGS 0x00002000 +#define ECRYPTFS_I_SIZE_INITIALIZED 0x00004000 + u32 flags; + unsigned int file_version; + size_t iv_bytes; + size_t metadata_size; + size_t extent_size; /* Data extent size; default is 4096 */ + size_t key_size; + size_t extent_shift; + unsigned int extent_mask; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct crypto_ablkcipher *tfm; + struct crypto_hash *hash_tfm; /* Crypto context for generating + * the initialization vectors */ + unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; + unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; + struct list_head keysig_list; + struct mutex keysig_list_mutex; + struct mutex cs_tfm_mutex; + struct mutex cs_hash_tfm_mutex; + struct mutex cs_mutex; +}; + +/* inode private data. */ +struct ecryptfs_inode_info { + struct inode vfs_inode; + struct inode *wii_inode; + struct mutex lower_file_mutex; + atomic_t lower_file_count; + struct file *lower_file; + struct ecryptfs_crypt_stat crypt_stat; +}; + +/* dentry private data. Each dentry must keep track of a lower + * vfsmount too. */ +struct ecryptfs_dentry_info { + struct path lower_path; + union { + struct ecryptfs_crypt_stat *crypt_stat; + struct rcu_head rcu; + }; +}; + +/** + * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint + * @flags: Status flags + * @mount_crypt_stat_list: These auth_toks hang off the mount-wide + * cryptographic context. Every time a new + * inode comes into existence, eCryptfs copies + * the auth_toks on that list to the set of + * auth_toks on the inode's crypt_stat + * @global_auth_tok_key: The key from the user's keyring for the sig + * @global_auth_tok: The key contents + * @sig: The key identifier + * + * ecryptfs_global_auth_tok structs refer to authentication token keys + * in the user keyring that apply to newly created files. A list of + * these objects hangs off of the mount_crypt_stat struct for any + * given eCryptfs mount. This struct maintains a reference to both the + * key contents and the key itself so that the key can be put on + * unmount. + */ +struct ecryptfs_global_auth_tok { +#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 +#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002 + u32 flags; + struct list_head mount_crypt_stat_list; + struct key *global_auth_tok_key; + unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +/** + * ecryptfs_key_tfm - Persistent key tfm + * @key_tfm: crypto API handle to the key + * @key_size: Key size in bytes + * @key_tfm_mutex: Mutex to ensure only one operation in eCryptfs is + * using the persistent TFM at any point in time + * @key_tfm_list: Handle to hang this off the module-wide TFM list + * @cipher_name: String name for the cipher for this TFM + * + * Typically, eCryptfs will use the same ciphers repeatedly throughout + * the course of its operations. In order to avoid unnecessarily + * destroying and initializing the same cipher repeatedly, eCryptfs + * keeps a list of crypto API contexts around to use when needed. + */ +struct ecryptfs_key_tfm { + struct crypto_blkcipher *key_tfm; + size_t key_size; + struct mutex key_tfm_mutex; + struct list_head key_tfm_list; + unsigned char cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; +}; + +extern struct mutex key_tfm_list_mutex; + +/** + * This struct is to enable a mount-wide passphrase/salt combo. This + * is more or less a stopgap to provide similar functionality to other + * crypto filesystems like EncFS or CFS until full policy support is + * implemented in eCryptfs. + */ +struct ecryptfs_mount_crypt_stat { + /* Pointers to memory we do not own, do not free these */ +#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001 +#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 +#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 +#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 +#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 +#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 +#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 +#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080 + u32 flags; + struct list_head global_auth_tok_list; + struct mutex global_auth_tok_list_mutex; + size_t global_default_cipher_key_size; + size_t global_default_fn_cipher_key_bytes; + unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + + 1]; + unsigned char global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +/* superblock private data. */ +struct ecryptfs_sb_info { + struct super_block *wsi_sb; + struct ecryptfs_mount_crypt_stat mount_crypt_stat; + struct backing_dev_info bdi; +}; + +/* file private data. */ +struct ecryptfs_file_info { + struct file *wfi_file; + struct ecryptfs_crypt_stat *crypt_stat; +}; + +/* auth_tok <=> encrypted_session_key mappings */ +struct ecryptfs_auth_tok_list_item { + unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES]; + struct list_head list; + struct ecryptfs_auth_tok auth_tok; +}; + +struct ecryptfs_message { + /* Can never be greater than ecryptfs_message_buf_len */ + /* Used to find the parent msg_ctx */ + /* Inherits from msg_ctx->index */ + u32 index; + u32 data_len; + u8 data[]; +}; + +struct ecryptfs_msg_ctx { +#define ECRYPTFS_MSG_CTX_STATE_FREE 0x01 +#define ECRYPTFS_MSG_CTX_STATE_PENDING 0x02 +#define ECRYPTFS_MSG_CTX_STATE_DONE 0x03 +#define ECRYPTFS_MSG_CTX_STATE_NO_REPLY 0x04 + u8 state; +#define ECRYPTFS_MSG_HELO 100 +#define ECRYPTFS_MSG_QUIT 101 +#define ECRYPTFS_MSG_REQUEST 102 +#define ECRYPTFS_MSG_RESPONSE 103 + u8 type; + u32 index; + /* Counter converts to a sequence number. Each message sent + * out for which we expect a response has an associated + * sequence number. The response must have the same sequence + * number as the counter for the msg_stc for the message to be + * valid. */ + u32 counter; + size_t msg_size; + struct ecryptfs_message *msg; + struct task_struct *task; + struct list_head node; + struct list_head daemon_out_list; + struct mutex mux; +}; + +struct ecryptfs_daemon { +#define ECRYPTFS_DAEMON_IN_READ 0x00000001 +#define ECRYPTFS_DAEMON_IN_POLL 0x00000002 +#define ECRYPTFS_DAEMON_ZOMBIE 0x00000004 +#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008 + u32 flags; + u32 num_queued_msg_ctx; + struct file *file; + struct mutex mux; + struct list_head msg_ctx_out_queue; + wait_queue_head_t wait; + struct hlist_node euid_chain; +}; + +#ifdef CONFIG_ECRYPT_FS_MESSAGING +extern struct mutex ecryptfs_daemon_hash_mux; +#endif + +static inline size_t +ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return 0; + return crypt_stat->metadata_size; +} + +static inline struct ecryptfs_file_info * +ecryptfs_file_to_private(struct file *file) +{ + return file->private_data; +} + +static inline void +ecryptfs_set_file_private(struct file *file, + struct ecryptfs_file_info *file_info) +{ + file->private_data = file_info; +} + +static inline struct file *ecryptfs_file_to_lower(struct file *file) +{ + return ((struct ecryptfs_file_info *)file->private_data)->wfi_file; +} + +static inline void +ecryptfs_set_file_lower(struct file *file, struct file *lower_file) +{ + ((struct ecryptfs_file_info *)file->private_data)->wfi_file = + lower_file; +} + +static inline struct ecryptfs_inode_info * +ecryptfs_inode_to_private(struct inode *inode) +{ + return container_of(inode, struct ecryptfs_inode_info, vfs_inode); +} + +static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode) +{ + return ecryptfs_inode_to_private(inode)->wii_inode; +} + +static inline void +ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode) +{ + ecryptfs_inode_to_private(inode)->wii_inode = lower_inode; +} + +static inline struct ecryptfs_sb_info * +ecryptfs_superblock_to_private(struct super_block *sb) +{ + return (struct ecryptfs_sb_info *)sb->s_fs_info; +} + +static inline void +ecryptfs_set_superblock_private(struct super_block *sb, + struct ecryptfs_sb_info *sb_info) +{ + sb->s_fs_info = sb_info; +} + +static inline struct super_block * +ecryptfs_superblock_to_lower(struct super_block *sb) +{ + return ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb; +} + +static inline void +ecryptfs_set_superblock_lower(struct super_block *sb, + struct super_block *lower_sb) +{ + ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb; +} + +static inline struct ecryptfs_dentry_info * +ecryptfs_dentry_to_private(struct dentry *dentry) +{ + return (struct ecryptfs_dentry_info *)dentry->d_fsdata; +} + +static inline void +ecryptfs_set_dentry_private(struct dentry *dentry, + struct ecryptfs_dentry_info *dentry_info) +{ + dentry->d_fsdata = dentry_info; +} + +static inline struct dentry * +ecryptfs_dentry_to_lower(struct dentry *dentry) +{ + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; +} + +static inline struct vfsmount * +ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) +{ + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt; +} + +static inline struct path * +ecryptfs_dentry_to_lower_path(struct dentry *dentry) +{ + return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; +} + +#define ecryptfs_printk(type, fmt, arg...) \ + __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); +__printf(1, 2) +void __ecryptfs_printk(const char *fmt, ...); + +extern const struct file_operations ecryptfs_main_fops; +extern const struct file_operations ecryptfs_dir_fops; +extern const struct inode_operations ecryptfs_main_iops; +extern const struct inode_operations ecryptfs_dir_iops; +extern const struct inode_operations ecryptfs_symlink_iops; +extern const struct super_operations ecryptfs_sops; +extern const struct dentry_operations ecryptfs_dops; +extern const struct address_space_operations ecryptfs_aops; +extern int ecryptfs_verbosity; +extern unsigned int ecryptfs_message_buf_len; +extern signed long ecryptfs_message_wait_timeout; +extern unsigned int ecryptfs_number_of_users; + +extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; +extern struct kmem_cache *ecryptfs_file_info_cache; +extern struct kmem_cache *ecryptfs_dentry_info_cache; +extern struct kmem_cache *ecryptfs_inode_info_cache; +extern struct kmem_cache *ecryptfs_sb_info_cache; +extern struct kmem_cache *ecryptfs_header_cache; +extern struct kmem_cache *ecryptfs_xattr_cache; +extern struct kmem_cache *ecryptfs_key_record_cache; +extern struct kmem_cache *ecryptfs_key_sig_cache; +extern struct kmem_cache *ecryptfs_global_auth_tok_cache; +extern struct kmem_cache *ecryptfs_key_tfm_cache; + +struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb); +void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); +int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode); +int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, + size_t *decrypted_name_size, + struct super_block *sb, + const char *name, size_t name_size); +int ecryptfs_fill_zeros(struct file *file, loff_t new_length); +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size); +struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); +void ecryptfs_dump_hex(char *data, int bytes); +int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, + int sg_size); +int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_rotate_iv(unsigned char *iv); +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_destroy_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat); +int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); +int ecryptfs_encrypt_page(struct page *page); +int ecryptfs_decrypt_page(struct page *page); +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode); +int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_new_file_context(struct inode *ecryptfs_inode); +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); +int ecryptfs_read_and_validate_header_region(struct inode *inode); +int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, + struct inode *inode); +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); +int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); +void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_generate_key_packet_set(char *dest_base, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, + size_t *len, size_t max); +int +ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *src, struct dentry *ecryptfs_dentry); +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); +ssize_t +ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, + void *value, size_t size); +int +ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); +#ifdef CONFIG_ECRYPT_FS_MESSAGING +int ecryptfs_process_response(struct ecryptfs_daemon *daemon, + struct ecryptfs_message *msg, u32 seq); +int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx); +int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **emsg); +int ecryptfs_init_messaging(void); +void ecryptfs_release_messaging(void); +#else +static inline int ecryptfs_init_messaging(void) +{ + return 0; +} +static inline void ecryptfs_release_messaging(void) +{ } +static inline int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + return -ENOTCONN; +} +static inline int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **emsg) +{ + return -ENOMSG; +} +#endif + +void +ecryptfs_write_header_metadata(char *virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); +int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); +int +ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig, u32 global_auth_tok_flags); +int ecryptfs_get_global_auth_tok_for_sig( + struct ecryptfs_global_auth_tok **global_auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); +int +ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, + size_t key_size); +int ecryptfs_init_crypto(void); +int ecryptfs_destroy_crypto(void); +int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm); +int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, + struct mutex **tfm_mutex, + char *cipher_name); +int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + char *sig); +int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, + loff_t offset, size_t size); +int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, + struct page *page_for_lower, + size_t offset_in_page, size_t size); +int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); +int ecryptfs_read_lower(char *data, loff_t offset, size_t size, + struct inode *ecryptfs_inode); +int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, + pgoff_t page_index, + size_t offset_in_page, size_t size, + struct inode *ecryptfs_inode); +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size); +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length); +#ifdef CONFIG_ECRYPT_FS_MESSAGING +int ecryptfs_init_ecryptfs_miscdev(void); +void ecryptfs_destroy_ecryptfs_miscdev(void); +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon); +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file); +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); +#endif +int ecryptfs_init_kthread(void); +void ecryptfs_destroy_kthread(void); +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt, + const struct cred *cred); +int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode); +void ecryptfs_put_lower_file(struct inode *inode); +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size); +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size); +int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat); +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); + +#endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/kernel/fs/ecryptfs/file.c b/kernel/fs/ecryptfs/file.c new file mode 100644 index 000000000..72afcc629 --- /dev/null +++ b/kernel/fs/ecryptfs/file.c @@ -0,0 +1,375 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_read_update_atime + * + * generic_file_read updates the atime of upper layer inode. But, it + * doesn't give us a chance to update the atime of the lower layer + * inode. This function is a wrapper to generic_file_read. It + * updates the atime of the lower level inode if generic_file_read + * returns without any errors. This is to be used only for file reads. + * The function to be used for directory reads is ecryptfs_read. + */ +static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, + struct iov_iter *to) +{ + ssize_t rc; + struct path *path; + struct file *file = iocb->ki_filp; + + rc = generic_file_read_iter(iocb, to); + if (rc >= 0) { + path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); + touch_atime(path); + } + return rc; +} + +struct ecryptfs_getdents_callback { + struct dir_context ctx; + struct dir_context *caller; + struct super_block *sb; + int filldir_called; + int entries_written; +}; + +/* Inspired by generic filldir in fs/readdir.c */ +static int +ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, + int lower_namelen, loff_t offset, u64 ino, unsigned int d_type) +{ + struct ecryptfs_getdents_callback *buf = + container_of(ctx, struct ecryptfs_getdents_callback, ctx); + size_t name_size; + char *name; + int rc; + + buf->filldir_called++; + rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->sb, lower_name, + lower_namelen); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and decrypt " + "filename [%s]; rc = [%d]\n", __func__, lower_name, + rc); + goto out; + } + buf->caller->pos = buf->ctx.pos; + rc = !dir_emit(buf->caller, name, name_size, ino, d_type); + kfree(name); + if (!rc) + buf->entries_written++; +out: + return rc; +} + +/** + * ecryptfs_readdir + * @file: The eCryptfs directory file + * @ctx: The actor to feed the entries to + */ +static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) +{ + int rc; + struct file *lower_file; + struct inode *inode = file_inode(file); + struct ecryptfs_getdents_callback buf = { + .ctx.actor = ecryptfs_filldir, + .caller = ctx, + .sb = inode->i_sb, + }; + lower_file = ecryptfs_file_to_lower(file); + lower_file->f_pos = ctx->pos; + rc = iterate_dir(lower_file, &buf.ctx); + ctx->pos = buf.ctx.pos; + if (rc < 0) + goto out; + if (buf.filldir_called && !buf.entries_written) + goto out; + if (rc >= 0) + fsstack_copy_attr_atime(inode, + file_inode(lower_file)); +out: + return rc; +} + +struct kmem_cache *ecryptfs_file_info_cache; + +static int read_or_initialize_metadata(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; + int rc; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mount_crypt_stat = &ecryptfs_superblock_to_private( + inode->i_sb)->mount_crypt_stat; + mutex_lock(&crypt_stat->cs_mutex); + + if (crypt_stat->flags & ECRYPTFS_POLICY_APPLIED && + crypt_stat->flags & ECRYPTFS_KEY_VALID) { + rc = 0; + goto out; + } + + rc = ecryptfs_read_metadata(dentry); + if (!rc) + goto out; + + if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) { + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); + rc = 0; + goto out; + } + + if (!(mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) && + !i_size_read(ecryptfs_inode_to_lower(inode))) { + rc = ecryptfs_initialize_file(dentry, inode); + if (!rc) + goto out; + } + + rc = -EIO; +out: + mutex_unlock(&crypt_stat->cs_mutex); + return rc; +} + +/** + * ecryptfs_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_open(struct inode *inode, struct file *file) +{ + int rc = 0; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (!file_info) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mutex_lock(&crypt_stat->cs_mutex); + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) { + ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n"); + /* Policy code enabled in future release */ + crypt_stat->flags |= (ECRYPTFS_POLICY_APPLIED + | ECRYPTFS_ENCRYPTED); + } + mutex_unlock(&crypt_stat->cs_mutex); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%d]\n", __func__, + ecryptfs_dentry, rc); + goto out_free; + } + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) + == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out_put; + } + ecryptfs_set_file_lower( + file, ecryptfs_inode_to_private(inode)->lower_file); + if (d_is_dir(ecryptfs_dentry)) { + ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + mutex_lock(&crypt_stat->cs_mutex); + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); + rc = 0; + goto out; + } + rc = read_or_initialize_metadata(ecryptfs_dentry); + if (rc) + goto out_put; + ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " + "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + (unsigned long long)i_size_read(inode)); + goto out; +out_put: + ecryptfs_put_lower_file(inode); +out_free: + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); +out: + return rc; +} + +static int ecryptfs_flush(struct file *file, fl_owner_t td) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + + if (lower_file->f_op->flush) { + filemap_write_and_wait(file->f_mapping); + return lower_file->f_op->flush(lower_file, td); + } + + return 0; +} + +static int ecryptfs_release(struct inode *inode, struct file *file) +{ + ecryptfs_put_lower_file(inode); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static int +ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + int rc; + + rc = filemap_write_and_wait(file->f_mapping); + if (rc) + return rc; + + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); +} + +static int ecryptfs_fasync(int fd, struct file *file, int flag) +{ + int rc = 0; + struct file *lower_file = NULL; + + lower_file = ecryptfs_file_to_lower(file); + if (lower_file->f_op->fasync) + rc = lower_file->f_op->fasync(fd, lower_file, flag); + return rc; +} + +static long +ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + long rc = -ENOTTY; + + if (!lower_file->f_op->unlocked_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + case FS_IOC_GETVERSION: + case FS_IOC_SETVERSION: + rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } +} + +#ifdef CONFIG_COMPAT +static long +ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + long rc = -ENOIOCTLCMD; + + if (!lower_file->f_op->compat_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC32_GETFLAGS: + case FS_IOC32_SETFLAGS: + case FS_IOC32_GETVERSION: + case FS_IOC32_SETVERSION: + rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } +} +#endif + +const struct file_operations ecryptfs_dir_fops = { + .iterate = ecryptfs_readdir, + .read = generic_read_dir, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, + .fsync = ecryptfs_fsync, + .fasync = ecryptfs_fasync, + .splice_read = generic_file_splice_read, + .llseek = default_llseek, +}; + +const struct file_operations ecryptfs_main_fops = { + .llseek = generic_file_llseek, + .read_iter = ecryptfs_read_update_atime, + .write_iter = generic_file_write_iter, + .iterate = ecryptfs_readdir, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, + .fsync = ecryptfs_fsync, + .fasync = ecryptfs_fasync, + .splice_read = generic_file_splice_read, +}; diff --git a/kernel/fs/ecryptfs/inode.c b/kernel/fs/ecryptfs/inode.c new file mode 100644 index 000000000..fc850b55d --- /dev/null +++ b/kernel/fs/ecryptfs/inode.c @@ -0,0 +1,1138 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompsion + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *dir; + + dir = dget_parent(dentry); + mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); + return dir; +} + +static void unlock_dir(struct dentry *dir) +{ + mutex_unlock(&d_inode(dir)->i_mutex); + dput(dir); +} + +static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) +{ + return ecryptfs_inode_to_lower(inode) == lower_inode; +} + +static int ecryptfs_inode_set(struct inode *inode, void *opaque) +{ + struct inode *lower_inode = opaque; + + ecryptfs_set_inode_lower(inode, lower_inode); + fsstack_copy_attr_all(inode, lower_inode); + /* i_size will be overwritten for encrypted regular files */ + fsstack_copy_inode_size(inode, lower_inode); + inode->i_ino = lower_inode->i_ino; + inode->i_version++; + inode->i_mapping->a_ops = &ecryptfs_aops; + + if (S_ISLNK(inode->i_mode)) + inode->i_op = &ecryptfs_symlink_iops; + else if (S_ISDIR(inode->i_mode)) + inode->i_op = &ecryptfs_dir_iops; + else + inode->i_op = &ecryptfs_main_iops; + + if (S_ISDIR(inode->i_mode)) + inode->i_fop = &ecryptfs_dir_fops; + else if (special_file(inode->i_mode)) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + else + inode->i_fop = &ecryptfs_main_fops; + + return 0; +} + +static struct inode *__ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) +{ + struct inode *inode; + + if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) + return ERR_PTR(-EXDEV); + if (!igrab(lower_inode)) + return ERR_PTR(-ESTALE); + inode = iget5_locked(sb, (unsigned long)lower_inode, + ecryptfs_inode_test, ecryptfs_inode_set, + lower_inode); + if (!inode) { + iput(lower_inode); + return ERR_PTR(-EACCES); + } + if (!(inode->i_state & I_NEW)) + iput(lower_inode); + + return inode; +} + +struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) +{ + struct inode *inode = __ecryptfs_get_inode(lower_inode, sb); + + if (!IS_ERR(inode) && (inode->i_state & I_NEW)) + unlock_new_inode(inode); + + return inode; +} + +/** + * ecryptfs_interpose + * @lower_dentry: Existing dentry in the lower filesystem + * @dentry: ecryptfs' dentry + * @sb: ecryptfs's super_block + * + * Interposes upper and lower dentries. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_interpose(struct dentry *lower_dentry, + struct dentry *dentry, struct super_block *sb) +{ + struct inode *inode = ecryptfs_get_inode(d_inode(lower_dentry), sb); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + + return 0; +} + +static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); + struct dentry *lower_dir_dentry; + int rc; + + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); + if (rc) { + printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); + goto out_unlock; + } + fsstack_copy_attr_times(dir, lower_dir_inode); + set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); + inode->i_ctime = dir->i_ctime; + d_drop(dentry); +out_unlock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + return rc; +} + +/** + * ecryptfs_do_create + * @directory_inode: inode of the new file's dentry's parent in ecryptfs + * @ecryptfs_dentry: New file's dentry in ecryptfs + * @mode: The mode of the new file + * @nd: nameidata of ecryptfs' parent's dentry & vfsmount + * + * Creates the underlying file and the eCryptfs inode which will link to + * it. It will also update the eCryptfs directory inode to mimic the + * stat of the lower directory inode. + * + * Returns the new eCryptfs inode on success; an ERR_PTR on error condition + */ +static struct inode * +ecryptfs_do_create(struct inode *directory_inode, + struct dentry *ecryptfs_dentry, umode_t mode) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + struct inode *inode; + + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true); + if (rc) { + printk(KERN_ERR "%s: Failure to create dentry in lower fs; " + "rc = [%d]\n", __func__, rc); + inode = ERR_PTR(rc); + goto out_lock; + } + inode = __ecryptfs_get_inode(d_inode(lower_dentry), + directory_inode->i_sb); + if (IS_ERR(inode)) { + vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL); + goto out_lock; + } + fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry)); +out_lock: + unlock_dir(lower_dir_dentry); + return inode; +} + +/** + * ecryptfs_initialize_file + * + * Cause the file to be changed from a basic empty file to an ecryptfs + * file with a header and first data page. + * + * Returns zero on success + */ +int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + int rc = 0; + + if (S_ISDIR(ecryptfs_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); + rc = ecryptfs_new_file_context(ecryptfs_inode); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error creating new file " + "context; rc = [%d]\n", rc); + goto out; + } + rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%d]\n", __func__, + ecryptfs_dentry, rc); + goto out; + } + rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); + if (rc) + printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); + ecryptfs_put_lower_file(ecryptfs_inode); +out: + return rc; +} + +/** + * ecryptfs_create + * @dir: The inode of the directory in which to create the file. + * @dentry: The eCryptfs dentry + * @mode: The mode of the new file. + * + * Creates a new file. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, + umode_t mode, bool excl) +{ + struct inode *ecryptfs_inode; + int rc; + + ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, + mode); + if (unlikely(IS_ERR(ecryptfs_inode))) { + ecryptfs_printk(KERN_WARNING, "Failed to create file in" + "lower filesystem\n"); + rc = PTR_ERR(ecryptfs_inode); + goto out; + } + /* At this point, a file exists on "disk"; we need to make sure + * that this on disk file is prepared to be an ecryptfs file */ + rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); + if (rc) { + ecryptfs_do_unlink(directory_inode, ecryptfs_dentry, + ecryptfs_inode); + make_bad_inode(ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); + iput(ecryptfs_inode); + goto out; + } + unlock_new_inode(ecryptfs_inode); + d_instantiate(ecryptfs_dentry, ecryptfs_inode); +out: + return rc; +} + +static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) +{ + struct ecryptfs_crypt_stat *crypt_stat; + int rc; + + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%d]\n", __func__, + dentry, rc); + return rc; + } + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); + + rc = ecryptfs_read_and_validate_header_region(inode); + ecryptfs_put_lower_file(inode); + if (rc) { + rc = ecryptfs_read_and_validate_xattr_region(dentry, inode); + if (!rc) + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } + + /* Must return 0 to allow non-eCryptfs files to be looked up, too */ + return 0; +} + +/** + * ecryptfs_lookup_interpose - Dentry interposition for a lookup + */ +static int ecryptfs_lookup_interpose(struct dentry *dentry, + struct dentry *lower_dentry, + struct inode *dir_inode) +{ + struct inode *inode, *lower_inode = d_inode(lower_dentry); + struct ecryptfs_dentry_info *dentry_info; + struct vfsmount *lower_mnt; + int rc = 0; + + dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + if (!dentry_info) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to allocate ecryptfs_dentry_info struct\n", + __func__); + dput(lower_dentry); + return -ENOMEM; + } + + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); + fsstack_copy_attr_atime(dir_inode, d_inode(lower_dentry->d_parent)); + BUG_ON(!d_count(lower_dentry)); + + ecryptfs_set_dentry_private(dentry, dentry_info); + dentry_info->lower_path.mnt = lower_mnt; + dentry_info->lower_path.dentry = lower_dentry; + + if (d_really_is_negative(lower_dentry)) { + /* We want to add because we couldn't find in lower */ + d_add(dentry, NULL); + return 0; + } + inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb); + if (IS_ERR(inode)) { + printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n", + __func__, PTR_ERR(inode)); + return PTR_ERR(inode); + } + if (S_ISREG(inode->i_mode)) { + rc = ecryptfs_i_size_read(dentry, inode); + if (rc) { + make_bad_inode(inode); + return rc; + } + } + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + d_add(dentry, inode); + + return rc; +} + +/** + * ecryptfs_lookup + * @ecryptfs_dir_inode: The eCryptfs directory inode + * @ecryptfs_dentry: The eCryptfs dentry that we are looking up + * @ecryptfs_nd: nameidata; may be NULL + * + * Find a file on disk. If the file does not exist, then we'll add it to the + * dentry cache and continue on to read it from the disk. + */ +static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, + struct dentry *ecryptfs_dentry, + unsigned int flags) +{ + char *encrypted_and_encoded_name = NULL; + size_t encrypted_and_encoded_name_size; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + struct dentry *lower_dir_dentry, *lower_dentry; + int rc = 0; + + lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dir_dentry, + ecryptfs_dentry->d_name.len); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%pd]\n", __func__, rc, + ecryptfs_dentry); + goto out; + } + if (d_really_is_positive(lower_dentry)) + goto interpose; + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(mount_crypt_stat + && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + goto interpose; + dput(lower_dentry); + rc = ecryptfs_encrypt_and_encode_filename( + &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, + NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, + ecryptfs_dentry->d_name.len); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt and encode " + "filename; rc = [%d]\n", __func__, rc); + goto out; + } + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dir_dentry, + encrypted_and_encoded_name_size); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); + goto out; + } +interpose: + rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, + ecryptfs_dir_inode); +out: + kfree(encrypted_and_encoded_name); + return ERR_PTR(rc); +} + +static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_dir_dentry; + u64 file_size_save; + int rc; + + file_size_save = i_size_read(d_inode(old_dentry)); + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); + dget(lower_old_dentry); + dget(lower_new_dentry); + lower_dir_dentry = lock_parent(lower_new_dentry); + rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry), + lower_new_dentry, NULL); + if (rc || d_really_is_negative(lower_new_dentry)) + goto out_lock; + rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); + if (rc) + goto out_lock; + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(d_inode(old_dentry), + ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); + i_size_write(d_inode(new_dentry), file_size_save); +out_lock: + unlock_dir(lower_dir_dentry); + dput(lower_new_dentry); + dput(lower_old_dentry); + return rc; +} + +static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) +{ + return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); +} + +static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + char *encoded_symname; + size_t encoded_symlen; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + mount_crypt_stat = &ecryptfs_superblock_to_private( + dir->i_sb)->mount_crypt_stat; + rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, + &encoded_symlen, + NULL, + mount_crypt_stat, symname, + strlen(symname)); + if (rc) + goto out_lock; + rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry, + encoded_symname); + kfree(encoded_symname); + if (rc || d_really_is_negative(lower_dentry)) + goto out_lock; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out_lock; + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); +out_lock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + if (d_really_is_negative(dentry)) + d_drop(dentry); + return rc; +} + +static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode); + if (rc || d_really_is_negative(lower_dentry)) + goto out; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out; + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); +out: + unlock_dir(lower_dir_dentry); + if (d_really_is_negative(dentry)) + d_drop(dentry); + return rc; +} + +static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + int rc; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + dget(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + dget(lower_dentry); + rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); + dput(lower_dentry); + if (!rc && d_really_is_positive(dentry)) + clear_nlink(d_inode(dentry)); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); + unlock_dir(lower_dir_dentry); + if (!rc) + d_drop(dentry); + dput(dentry); + return rc; +} + +static int +ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev); + if (rc || d_really_is_negative(lower_dentry)) + goto out; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out; + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); +out: + unlock_dir(lower_dir_dentry); + if (d_really_is_negative(dentry)) + d_drop(dentry); + return rc; +} + +static int +ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int rc; + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_old_dir_dentry; + struct dentry *lower_new_dir_dentry; + struct dentry *trap = NULL; + struct inode *target_inode; + + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); + dget(lower_old_dentry); + dget(lower_new_dentry); + lower_old_dir_dentry = dget_parent(lower_old_dentry); + lower_new_dir_dentry = dget_parent(lower_new_dentry); + target_inode = d_inode(new_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + /* source should not be ancestor of target */ + if (trap == lower_old_dentry) { + rc = -EINVAL; + goto out_lock; + } + /* target should not be ancestor of source */ + if (trap == lower_new_dentry) { + rc = -ENOTEMPTY; + goto out_lock; + } + rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, + d_inode(lower_new_dir_dentry), lower_new_dentry, + NULL, 0); + if (rc) + goto out_lock; + if (target_inode) + fsstack_copy_attr_all(target_inode, + ecryptfs_inode_to_lower(target_inode)); + fsstack_copy_attr_all(new_dir, d_inode(lower_new_dir_dentry)); + if (new_dir != old_dir) + fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); +out_lock: + unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + dput(lower_new_dir_dentry); + dput(lower_old_dir_dentry); + dput(lower_new_dentry); + dput(lower_old_dentry); + return rc; +} + +static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + char *lower_buf; + char *buf; + mm_segment_t old_fs; + int rc; + + lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!lower_buf) + return ERR_PTR(-ENOMEM); + old_fs = get_fs(); + set_fs(get_ds()); + rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry, + (char __user *)lower_buf, + PATH_MAX); + set_fs(old_fs); + if (rc < 0) + goto out; + rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb, + lower_buf, rc); +out: + kfree(lower_buf); + return rc ? ERR_PTR(rc) : buf; +} + +static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + size_t len; + char *buf = ecryptfs_readlink_lower(dentry, &len); + if (IS_ERR(buf)) + goto out; + fsstack_copy_attr_atime(d_inode(dentry), + d_inode(ecryptfs_dentry_to_lower(dentry))); + buf[len] = '\0'; +out: + nd_set_link(nd, buf); + return NULL; +} + +/** + * upper_size_to_lower_size + * @crypt_stat: Crypt_stat associated with file + * @upper_size: Size of the upper file + * + * Calculate the required size of the lower file based on the + * specified size of the upper file. This calculation is based on the + * number of headers in the underlying file and the extent size. + * + * Returns Calculated size of the lower file. + */ +static loff_t +upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, + loff_t upper_size) +{ + loff_t lower_size; + + lower_size = ecryptfs_lower_header_size(crypt_stat); + if (upper_size != 0) { + loff_t num_extents; + + num_extents = upper_size >> crypt_stat->extent_shift; + if (upper_size & ~crypt_stat->extent_mask) + num_extents++; + lower_size += (num_extents * crypt_stat->extent_size); + } + return lower_size; +} + +/** + * truncate_upper + * @dentry: The ecryptfs layer dentry + * @ia: Address of the ecryptfs inode's attributes + * @lower_ia: Address of the lower inode's attributes + * + * Function to handle truncations modifying the size of the file. Note + * that the file sizes are interpolated. When expanding, we are simply + * writing strings of 0's out. When truncating, we truncate the upper + * inode and update the lower_ia according to the page index + * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return, + * the caller must use lower_ia in a call to notify_change() to perform + * the truncation of the lower inode. + * + * Returns zero on success; non-zero otherwise + */ +static int truncate_upper(struct dentry *dentry, struct iattr *ia, + struct iattr *lower_ia) +{ + int rc = 0; + struct inode *inode = d_inode(dentry); + struct ecryptfs_crypt_stat *crypt_stat; + loff_t i_size = i_size_read(inode); + loff_t lower_size_before_truncate; + loff_t lower_size_after_truncate; + + if (unlikely((ia->ia_size == i_size))) { + lower_ia->ia_valid &= ~ATTR_SIZE; + return 0; + } + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) + return rc; + crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; + /* Switch on growing or shrinking file */ + if (ia->ia_size > i_size) { + char zero[] = { 0x00 }; + + lower_ia->ia_valid &= ~ATTR_SIZE; + /* Write a single 0 at the last position of the file; + * this triggers code that will fill in 0's throughout + * the intermediate portion of the previous end of the + * file and the new and of the file */ + rc = ecryptfs_write(inode, zero, + (ia->ia_size - 1), 1); + } else { /* ia->ia_size < i_size_read(inode) */ + /* We're chopping off all the pages down to the page + * in which ia->ia_size is located. Fill in the end of + * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to + * PAGE_CACHE_SIZE with zeros. */ + size_t num_zeros = (PAGE_CACHE_SIZE + - (ia->ia_size & ~PAGE_CACHE_MASK)); + + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + truncate_setsize(inode, ia->ia_size); + lower_ia->ia_size = ia->ia_size; + lower_ia->ia_valid |= ATTR_SIZE; + goto out; + } + if (num_zeros) { + char *zeros_virt; + + zeros_virt = kzalloc(num_zeros, GFP_KERNEL); + if (!zeros_virt) { + rc = -ENOMEM; + goto out; + } + rc = ecryptfs_write(inode, zeros_virt, + ia->ia_size, num_zeros); + kfree(zeros_virt); + if (rc) { + printk(KERN_ERR "Error attempting to zero out " + "the remainder of the end page on " + "reducing truncate; rc = [%d]\n", rc); + goto out; + } + } + truncate_setsize(inode, ia->ia_size); + rc = ecryptfs_write_inode_size_to_metadata(inode); + if (rc) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc); + goto out; + } + /* We are reducing the size of the ecryptfs file, and need to + * know if we need to reduce the size of the lower file. */ + lower_size_before_truncate = + upper_size_to_lower_size(crypt_stat, i_size); + lower_size_after_truncate = + upper_size_to_lower_size(crypt_stat, ia->ia_size); + if (lower_size_after_truncate < lower_size_before_truncate) { + lower_ia->ia_size = lower_size_after_truncate; + lower_ia->ia_valid |= ATTR_SIZE; + } else + lower_ia->ia_valid &= ~ATTR_SIZE; + } +out: + ecryptfs_put_lower_file(inode); + return rc; +} + +static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset) +{ + struct ecryptfs_crypt_stat *crypt_stat; + loff_t lower_oldsize, lower_newsize; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + lower_oldsize = upper_size_to_lower_size(crypt_stat, + i_size_read(inode)); + lower_newsize = upper_size_to_lower_size(crypt_stat, offset); + if (lower_newsize > lower_oldsize) { + /* + * The eCryptfs inode and the new *lower* size are mixed here + * because we may not have the lower i_mutex held and/or it may + * not be appropriate to call inode_newsize_ok() with inodes + * from other filesystems. + */ + return inode_newsize_ok(inode, lower_newsize); + } + + return 0; +} + +/** + * ecryptfs_truncate + * @dentry: The ecryptfs layer dentry + * @new_length: The length to expand the file to + * + * Simple function that handles the truncation of an eCryptfs inode and + * its corresponding lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +{ + struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length }; + struct iattr lower_ia = { .ia_valid = 0 }; + int rc; + + rc = ecryptfs_inode_newsize_ok(d_inode(dentry), new_length); + if (rc) + return rc; + + rc = truncate_upper(dentry, &ia, &lower_ia); + if (!rc && lower_ia.ia_valid & ATTR_SIZE) { + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = notify_change(lower_dentry, &lower_ia, NULL); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); + } + return rc; +} + +static int +ecryptfs_permission(struct inode *inode, int mask) +{ + return inode_permission(ecryptfs_inode_to_lower(inode), mask); +} + +/** + * ecryptfs_setattr + * @dentry: dentry handle to the inode to modify + * @ia: Structure with flags of what to change and values + * + * Updates the metadata of an inode. If the update is to the size + * i.e. truncation, then ecryptfs_truncate will handle the size modification + * of both the ecryptfs inode and the lower inode. + * + * All other metadata changes will be passed right to the lower filesystem, + * and we will just update our inode to look like the lower. + */ +static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int rc = 0; + struct dentry *lower_dentry; + struct iattr lower_ia; + struct inode *inode; + struct inode *lower_inode; + struct ecryptfs_crypt_stat *crypt_stat; + + crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + ecryptfs_init_crypt_stat(crypt_stat); + inode = d_inode(dentry); + lower_inode = ecryptfs_inode_to_lower(inode); + lower_dentry = ecryptfs_dentry_to_lower(dentry); + mutex_lock(&crypt_stat->cs_mutex); + if (d_is_dir(dentry)) + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + else if (d_is_reg(dentry) + && (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) + || !(crypt_stat->flags & ECRYPTFS_KEY_VALID))) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) { + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + rc = ecryptfs_read_metadata(dentry); + ecryptfs_put_lower_file(inode); + if (rc) { + if (!(mount_crypt_stat->flags + & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { + rc = -EIO; + printk(KERN_WARNING "Either the lower file " + "is not in a valid eCryptfs format, " + "or the key could not be retrieved. " + "Plaintext passthrough mode is not " + "enabled; returning -EIO\n"); + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + rc = 0; + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); + } + } + mutex_unlock(&crypt_stat->cs_mutex); + + rc = inode_change_ok(inode, ia); + if (rc) + goto out; + if (ia->ia_valid & ATTR_SIZE) { + rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size); + if (rc) + goto out; + } + + memcpy(&lower_ia, ia, sizeof(lower_ia)); + if (ia->ia_valid & ATTR_FILE) + lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); + if (ia->ia_valid & ATTR_SIZE) { + rc = truncate_upper(dentry, ia, &lower_ia); + if (rc < 0) + goto out; + } + + /* + * mode change is for clearing setuid/setgid bits. Allow lower fs + * to interpret this in its own way. + */ + if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + lower_ia.ia_valid &= ~ATTR_MODE; + + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = notify_change(lower_dentry, &lower_ia, NULL); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); +out: + fsstack_copy_attr_all(inode, lower_inode); + return rc; +} + +static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + int rc = 0; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + generic_fillattr(d_inode(dentry), stat); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + char *target; + size_t targetsiz; + + target = ecryptfs_readlink_lower(dentry, &targetsiz); + if (!IS_ERR(target)) { + kfree(target); + stat->size = targetsiz; + } else { + rc = PTR_ERR(target); + } + } + return rc; +} + +static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kstat lower_stat; + int rc; + + rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat); + if (!rc) { + fsstack_copy_attr_all(d_inode(dentry), + ecryptfs_inode_to_lower(d_inode(dentry))); + generic_fillattr(d_inode(dentry), stat); + stat->blocks = lower_stat.blocks; + } + return rc; +} + +int +ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!d_inode(lower_dentry)->i_op->setxattr) { + rc = -EOPNOTSUPP; + goto out; + } + + rc = vfs_setxattr(lower_dentry, name, value, size, flags); + if (!rc && d_really_is_positive(dentry)) + fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); +out: + return rc; +} + +ssize_t +ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, + void *value, size_t size) +{ + int rc = 0; + + if (!d_inode(lower_dentry)->i_op->getxattr) { + rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, + size); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); +out: + return rc; +} + +static ssize_t +ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), name, + value, size); +} + +static ssize_t +ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!d_inode(lower_dentry)->i_op->listxattr) { + rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); +out: + return rc; +} + +static int ecryptfs_removexattr(struct dentry *dentry, const char *name) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!d_inode(lower_dentry)->i_op->removexattr) { + rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); +out: + return rc; +} + +const struct inode_operations ecryptfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = ecryptfs_follow_link, + .put_link = kfree_put_link, + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr_link, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; + +const struct inode_operations ecryptfs_dir_iops = { + .create = ecryptfs_create, + .lookup = ecryptfs_lookup, + .link = ecryptfs_link, + .unlink = ecryptfs_unlink, + .symlink = ecryptfs_symlink, + .mkdir = ecryptfs_mkdir, + .rmdir = ecryptfs_rmdir, + .mknod = ecryptfs_mknod, + .rename = ecryptfs_rename, + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; + +const struct inode_operations ecryptfs_main_iops = { + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; diff --git a/kernel/fs/ecryptfs/keystore.c b/kernel/fs/ecryptfs/keystore.c new file mode 100644 index 000000000..6bd67e201 --- /dev/null +++ b/kernel/fs/ecryptfs/keystore.c @@ -0,0 +1,2529 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * In-kernel key management code. Includes functions to parse and + * write authentication token-related packets with the underlying + * file. + * + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * Trevor S. Highland + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * request_key returned an error instead of a valid key address; + * determine the type of error, make appropriate log entries, and + * return an error code. + */ +static int process_request_key_err(long err_code) +{ + int rc = 0; + + switch (err_code) { + case -ENOKEY: + ecryptfs_printk(KERN_WARNING, "No key\n"); + rc = -ENOENT; + break; + case -EKEYEXPIRED: + ecryptfs_printk(KERN_WARNING, "Key expired\n"); + rc = -ETIME; + break; + case -EKEYREVOKED: + ecryptfs_printk(KERN_WARNING, "Key revoked\n"); + rc = -EINVAL; + break; + default: + ecryptfs_printk(KERN_WARNING, "Unknown error code: " + "[0x%.16lx]\n", err_code); + rc = -EINVAL; + } + return rc; +} + +static int process_find_global_auth_tok_for_sig_err(int err_code) +{ + int rc = err_code; + + switch (err_code) { + case -ENOENT: + ecryptfs_printk(KERN_WARNING, "Missing auth tok\n"); + break; + case -EINVAL: + ecryptfs_printk(KERN_WARNING, "Invalid auth tok\n"); + break; + default: + rc = process_request_key_err(err_code); + break; + } + return rc; +} + +/** + * ecryptfs_parse_packet_length + * @data: Pointer to memory containing length at offset + * @size: This function writes the decoded size to this memory + * address; zero on error + * @length_size: The number of bytes occupied by the encoded length + * + * Returns zero on success; non-zero on error + */ +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size) +{ + int rc = 0; + + (*length_size) = 0; + (*size) = 0; + if (data[0] < 192) { + /* One-byte length */ + (*size) = data[0]; + (*length_size) = 1; + } else if (data[0] < 224) { + /* Two-byte length */ + (*size) = (data[0] - 192) * 256; + (*size) += data[1] + 192; + (*length_size) = 2; + } else if (data[0] == 255) { + /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */ + ecryptfs_printk(KERN_ERR, "Five-byte packet length not " + "supported\n"); + rc = -EINVAL; + goto out; + } else { + ecryptfs_printk(KERN_ERR, "Error parsing packet length\n"); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_write_packet_length + * @dest: The byte array target into which to write the length. Must + * have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated. + * @size: The length to write. + * @packet_size_length: The number of bytes used to encode the packet + * length is written to this address. + * + * Returns zero on success; non-zero on error. + */ +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length) +{ + int rc = 0; + + if (size < 192) { + dest[0] = size; + (*packet_size_length) = 1; + } else if (size < 65536) { + dest[0] = (((size - 192) / 256) + 192); + dest[1] = ((size - 192) % 256); + (*packet_size_length) = 2; + } else { + /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */ + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, + "Unsupported packet size: [%zd]\n", size); + } + return rc; +} + +static int +write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key, + char **packet, size_t *packet_len) +{ + size_t i = 0; + size_t data_len; + size_t packet_size_len; + char *message; + int rc; + + /* + * ***** TAG 64 Packet Format ***** + * | Content Type | 1 byte | + * | Key Identifier Size | 1 or 2 bytes | + * | Key Identifier | arbitrary | + * | Encrypted File Encryption Key Size | 1 or 2 bytes | + * | Encrypted File Encryption Key | arbitrary | + */ + data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + + session_key->encrypted_key_size); + *packet = kmalloc(data_len, GFP_KERNEL); + message = *packet; + if (!message) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + message[i++] = ECRYPTFS_TAG_64_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); + i += ECRYPTFS_SIG_SIZE_HEX; + rc = ecryptfs_write_packet_length(&message[i], + session_key->encrypted_key_size, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], session_key->encrypted_key, + session_key->encrypted_key_size); + i += session_key->encrypted_key_size; + *packet_len = i; +out: + return rc; +} + +static int +parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code, + struct ecryptfs_message *msg) +{ + size_t i = 0; + char *data; + size_t data_len; + size_t m_size; + size_t message_len; + u16 checksum = 0; + u16 expected_checksum = 0; + int rc; + + /* + * ***** TAG 65 Packet Format ***** + * | Content Type | 1 byte | + * | Status Indicator | 1 byte | + * | File Encryption Key Size | 1 or 2 bytes | + * | File Encryption Key | arbitrary | + */ + message_len = msg->data_len; + data = msg->data; + if (message_len < 4) { + rc = -EIO; + goto out; + } + if (data[i++] != ECRYPTFS_TAG_65_PACKET_TYPE) { + ecryptfs_printk(KERN_ERR, "Type should be ECRYPTFS_TAG_65\n"); + rc = -EIO; + goto out; + } + if (data[i++]) { + ecryptfs_printk(KERN_ERR, "Status indicator has non-zero value " + "[%d]\n", data[i-1]); + rc = -EIO; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[i], &m_size, &data_len); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out; + } + i += data_len; + if (message_len < (i + m_size)) { + ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd " + "is shorter than expected\n"); + rc = -EIO; + goto out; + } + if (m_size < 3) { + ecryptfs_printk(KERN_ERR, + "The decrypted key is not long enough to " + "include a cipher code and checksum\n"); + rc = -EIO; + goto out; + } + *cipher_code = data[i++]; + /* The decrypted key includes 1 byte cipher code and 2 byte checksum */ + session_key->decrypted_key_size = m_size - 3; + if (session_key->decrypted_key_size > ECRYPTFS_MAX_KEY_BYTES) { + ecryptfs_printk(KERN_ERR, "key_size [%d] larger than " + "the maximum key size [%d]\n", + session_key->decrypted_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); + rc = -EIO; + goto out; + } + memcpy(session_key->decrypted_key, &data[i], + session_key->decrypted_key_size); + i += session_key->decrypted_key_size; + expected_checksum += (unsigned char)(data[i++]) << 8; + expected_checksum += (unsigned char)(data[i++]); + for (i = 0; i < session_key->decrypted_key_size; i++) + checksum += session_key->decrypted_key[i]; + if (expected_checksum != checksum) { + ecryptfs_printk(KERN_ERR, "Invalid checksum for file " + "encryption key; expected [%x]; calculated " + "[%x]\n", expected_checksum, checksum); + rc = -EIO; + } +out: + return rc; +} + + +static int +write_tag_66_packet(char *signature, u8 cipher_code, + struct ecryptfs_crypt_stat *crypt_stat, char **packet, + size_t *packet_len) +{ + size_t i = 0; + size_t j; + size_t data_len; + size_t checksum = 0; + size_t packet_size_len; + char *message; + int rc; + + /* + * ***** TAG 66 Packet Format ***** + * | Content Type | 1 byte | + * | Key Identifier Size | 1 or 2 bytes | + * | Key Identifier | arbitrary | + * | File Encryption Key Size | 1 or 2 bytes | + * | File Encryption Key | arbitrary | + */ + data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + crypt_stat->key_size); + *packet = kmalloc(data_len, GFP_KERNEL); + message = *packet; + if (!message) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + message[i++] = ECRYPTFS_TAG_66_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); + i += ECRYPTFS_SIG_SIZE_HEX; + /* The encrypted key includes 1 byte cipher code and 2 byte checksum */ + rc = ecryptfs_write_packet_length(&message[i], crypt_stat->key_size + 3, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + message[i++] = cipher_code; + memcpy(&message[i], crypt_stat->key, crypt_stat->key_size); + i += crypt_stat->key_size; + for (j = 0; j < crypt_stat->key_size; j++) + checksum += crypt_stat->key[j]; + message[i++] = (checksum / 256) % 256; + message[i++] = (checksum % 256); + *packet_len = i; +out: + return rc; +} + +static int +parse_tag_67_packet(struct ecryptfs_key_record *key_rec, + struct ecryptfs_message *msg) +{ + size_t i = 0; + char *data; + size_t data_len; + size_t message_len; + int rc; + + /* + * ***** TAG 65 Packet Format ***** + * | Content Type | 1 byte | + * | Status Indicator | 1 byte | + * | Encrypted File Encryption Key Size | 1 or 2 bytes | + * | Encrypted File Encryption Key | arbitrary | + */ + message_len = msg->data_len; + data = msg->data; + /* verify that everything through the encrypted FEK size is present */ + if (message_len < 4) { + rc = -EIO; + printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable " + "message length is [%d]\n", __func__, message_len, 4); + goto out; + } + if (data[i++] != ECRYPTFS_TAG_67_PACKET_TYPE) { + rc = -EIO; + printk(KERN_ERR "%s: Type should be ECRYPTFS_TAG_67\n", + __func__); + goto out; + } + if (data[i++]) { + rc = -EIO; + printk(KERN_ERR "%s: Status indicator has non zero " + "value [%d]\n", __func__, data[i-1]); + + goto out; + } + rc = ecryptfs_parse_packet_length(&data[i], &key_rec->enc_key_size, + &data_len); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out; + } + i += data_len; + if (message_len < (i + key_rec->enc_key_size)) { + rc = -EIO; + printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n", + __func__, message_len, (i + key_rec->enc_key_size)); + goto out; + } + if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + rc = -EIO; + printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than " + "the maximum key size [%d]\n", __func__, + key_rec->enc_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); + goto out; + } + memcpy(key_rec->enc_key, &data[i], key_rec->enc_key_size); +out: + return rc; +} + +/** + * ecryptfs_verify_version + * @version: The version number to confirm + * + * Returns zero on good version; non-zero otherwise + */ +static int ecryptfs_verify_version(u16 version) +{ + int rc = 0; + unsigned char major; + unsigned char minor; + + major = ((version >> 8) & 0xFF); + minor = (version & 0xFF); + if (major != ECRYPTFS_VERSION_MAJOR) { + ecryptfs_printk(KERN_ERR, "Major version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MAJOR, major); + rc = -EINVAL; + goto out; + } + if (minor != ECRYPTFS_VERSION_MINOR) { + ecryptfs_printk(KERN_ERR, "Minor version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MINOR, minor); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_verify_auth_tok_from_key + * @auth_tok_key: key containing the authentication token + * @auth_tok: authentication token + * + * Returns zero on valid auth tok; -EINVAL otherwise + */ +static int +ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok **auth_tok) +{ + int rc = 0; + + (*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key); + if (ecryptfs_verify_version((*auth_tok)->version)) { + printk(KERN_ERR "Data structure version mismatch. Userspace " + "tools must match eCryptfs kernel module with major " + "version [%d] and minor version [%d]\n", + ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR); + rc = -EINVAL; + goto out; + } + if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD + && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { + printk(KERN_ERR "Invalid auth_tok structure " + "returned from key query\n"); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +static int +ecryptfs_find_global_auth_tok_for_sig( + struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) +{ + struct ecryptfs_global_auth_tok *walker; + int rc = 0; + + (*auth_tok_key) = NULL; + (*auth_tok) = NULL; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX)) + continue; + + if (walker->flags & ECRYPTFS_AUTH_TOK_INVALID) { + rc = -EINVAL; + goto out; + } + + rc = key_validate(walker->global_auth_tok_key); + if (rc) { + if (rc == -EKEYEXPIRED) + goto out; + goto out_invalid_auth_tok; + } + + down_write(&(walker->global_auth_tok_key->sem)); + rc = ecryptfs_verify_auth_tok_from_key( + walker->global_auth_tok_key, auth_tok); + if (rc) + goto out_invalid_auth_tok_unlock; + + (*auth_tok_key) = walker->global_auth_tok_key; + key_get(*auth_tok_key); + goto out; + } + rc = -ENOENT; + goto out; +out_invalid_auth_tok_unlock: + up_write(&(walker->global_auth_tok_key->sem)); +out_invalid_auth_tok: + printk(KERN_WARNING "Invalidating auth tok with sig = [%s]\n", sig); + walker->flags |= ECRYPTFS_AUTH_TOK_INVALID; + key_put(walker->global_auth_tok_key); + walker->global_auth_tok_key = NULL; +out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + return rc; +} + +/** + * ecryptfs_find_auth_tok_for_sig + * @auth_tok: Set to the matching auth_tok; NULL if not found + * @crypt_stat: inode crypt_stat crypto context + * @sig: Sig of auth_tok to find + * + * For now, this function simply looks at the registered auth_tok's + * linked off the mount_crypt_stat, so all the auth_toks that can be + * used must be registered at mount time. This function could + * potentially try a lot harder to find auth_tok's (e.g., by calling + * out to ecryptfsd to dynamically retrieve an auth_tok object) so + * that static registration of auth_tok's will no longer be necessary. + * + * Returns zero on no error; non-zero on error + */ +static int +ecryptfs_find_auth_tok_for_sig( + struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig) +{ + int rc = 0; + + rc = ecryptfs_find_global_auth_tok_for_sig(auth_tok_key, auth_tok, + mount_crypt_stat, sig); + if (rc == -ENOENT) { + /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the + * mount_crypt_stat structure, we prevent to use auth toks that + * are not inserted through the ecryptfs_add_global_auth_tok + * function. + */ + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + return -EINVAL; + + rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok, + sig); + } + return rc; +} + +/** + * write_tag_70_packet can gobble a lot of stack space. We stuff most + * of the function's parameters in a kmalloc'd struct to help reduce + * eCryptfs' overall stack usage. + */ +struct ecryptfs_write_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + size_t j; + size_t num_rand_bytes; + struct mutex *tfm_mutex; + char *block_aligned_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg[2]; + struct scatterlist dst_sg[2]; + struct blkcipher_desc desc; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + struct hash_desc hash_desc; + struct scatterlist hash_sg; +}; + +/** + * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK + * @filename: NULL-terminated filename string + * + * This is the simplest mechanism for achieving filename encryption in + * eCryptfs. It encrypts the given filename with the mount-wide + * filename encryption key (FNEK) and stores it in a packet to @dest, + * which the callee will encode and write directly into the dentry + * name. + */ +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size) +{ + struct ecryptfs_write_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; + int rc = 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + (*packet_size) = 0; + rc = ecryptfs_find_auth_tok_for_sig( + &auth_tok_key, + &s->auth_tok, mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, + mount_crypt_stat->global_default_fnek_sig, rc); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( + &s->desc.tfm, + &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); + /* Plus one for the \0 separator between the random prefix + * and the plaintext filename */ + s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); + s->block_aligned_filename_size = (s->num_rand_bytes + filename_size); + if ((s->block_aligned_filename_size % s->block_size) != 0) { + s->num_rand_bytes += (s->block_size + - (s->block_aligned_filename_size + % s->block_size)); + s->block_aligned_filename_size = (s->num_rand_bytes + + filename_size); + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random characters, a \0 + * separator, and then the filename */ + s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE + + s->block_aligned_filename_size); + if (dest == NULL) { + (*packet_size) = s->max_packet_size; + goto out_unlock; + } + if (s->max_packet_size > (*remaining_bytes)) { + printk(KERN_WARNING "%s: Require [%zd] bytes to write; only " + "[%zd] available\n", __func__, s->max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out_unlock; + } + s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->block_aligned_filename) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "kzalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + s->i = 0; + dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[s->i], + (ECRYPTFS_SIG_SIZE + + 1 /* Cipher code */ + + s->block_aligned_filename_size), + &s->packet_size_len); + if (rc) { + printk(KERN_ERR "%s: Error generating tag 70 packet " + "header; cannot generate packet length; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + s->i += s->packet_size_len; + ecryptfs_from_hex(&dest[s->i], + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_SIG_SIZE); + s->i += ECRYPTFS_SIG_SIZE; + s->cipher_code = ecryptfs_code_for_cipher_string( + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (s->cipher_code == 0) { + printk(KERN_WARNING "%s: Unable to generate code for " + "cipher [%s] with key bytes [%zd]\n", __func__, + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + rc = -EINVAL; + goto out_free_unlock; + } + dest[s->i++] = s->cipher_code; + /* TODO: Support other key modules than passphrase for + * filename encryption */ + if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { + rc = -EOPNOTSUPP; + printk(KERN_INFO "%s: Filename encryption only supports " + "password tokens\n", __func__); + goto out_free_unlock; + } + sg_init_one( + &s->hash_sg, + (u8 *)s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes); + s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(s->hash_desc.tfm)) { + rc = PTR_ERR(s->hash_desc.tfm); + printk(KERN_ERR "%s: Error attempting to " + "allocate hash crypto context; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update( + &s->hash_desc, &s->hash_sg, + s->auth_tok->token.password.session_key_encryption_key_bytes); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { + s->block_aligned_filename[s->j] = + s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; + if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) + == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { + sg_init_one(&s->hash_sg, (u8 *)s->hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, + ECRYPTFS_TAG_70_DIGEST_SIZE); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + memcpy(s->hash, s->tmp_hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + } + if (s->block_aligned_filename[s->j] == '\0') + s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; + } + memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, + filename_size); + rc = virt_to_scatterlist(s->block_aligned_filename, + s->block_aligned_filename_size, s->src_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert filename memory to scatterlist; rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, + s->dst_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_release_free_unlock; + } + /* The characters in the first block effectively do the job + * of the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_release_free_unlock; + } + rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + s->i += s->block_aligned_filename_size; + (*packet_size) = s->i; + (*remaining_bytes) -= (*packet_size); +out_release_free_unlock: + crypto_free_hash(s->hash_desc.tfm); +out_free_unlock: + kzfree(s->block_aligned_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + } + kfree(s); + return rc; +} + +struct ecryptfs_parse_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t parsed_tag_70_packet_size; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + struct mutex *tfm_mutex; + char *decrypted_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg[2]; + struct scatterlist dst_sg[2]; + struct blkcipher_desc desc; + char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; +}; + +/** + * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet + * @filename: This function kmalloc's the memory for the filename + * @filename_size: This function sets this to the amount of memory + * kmalloc'd for the filename + * @packet_size: This function sets this to the the number of octets + * in the packet parsed + * @mount_crypt_stat: The mount-wide cryptographic context + * @data: The memory location containing the start of the tag 70 + * packet + * @max_packet_size: The maximum legal size of the packet to be parsed + * from @data + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size) +{ + struct ecryptfs_parse_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; + int rc = 0; + + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " + "at least [%d]\n", __func__, max_packet_size, + ECRYPTFS_TAG_70_MIN_METADATA_SIZE); + rc = -EINVAL; + goto out; + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random numbers, a \0 + * separator, and then the filename */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) { + printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be " + "tag [0x%.2x]\n", __func__, + data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], + &s->parsed_tag_70_packet_size, + &s->packet_size_len); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + goto out; + } + s->block_aligned_filename_size = (s->parsed_tag_70_packet_size + - ECRYPTFS_SIG_SIZE - 1); + if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size) + > max_packet_size) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet " + "size is [%zd]\n", __func__, max_packet_size, + (1 + s->packet_size_len + 1 + + s->block_aligned_filename_size)); + rc = -EINVAL; + goto out; + } + (*packet_size) += s->packet_size_len; + ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)], + ECRYPTFS_SIG_SIZE); + s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + (*packet_size) += ECRYPTFS_SIG_SIZE; + s->cipher_code = data[(*packet_size)++]; + rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code); + if (rc) { + printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n", + __func__, s->cipher_code); + goto out; + } + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &s->auth_tok, mount_crypt_stat, + s->fnek_sig_hex); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, + rc); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, + &s->tfm_mutex, + s->cipher_string); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + s->cipher_string, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + rc = virt_to_scatterlist(&data[(*packet_size)], + s->block_aligned_filename_size, s->src_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_unlock; + } + (*packet_size) += s->block_aligned_filename_size; + s->decrypted_filename = kmalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->decrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + rc = virt_to_scatterlist(s->decrypted_filename, + s->block_aligned_filename_size, s->dst_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert decrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_free_unlock; + } + /* The characters in the first block effectively do the job of + * the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + /* TODO: Support other key modules than passphrase for + * filename encryption */ + if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { + rc = -EOPNOTSUPP; + printk(KERN_INFO "%s: Filename encryption only supports " + "password tokens\n", __func__); + goto out_free_unlock; + } + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_free_unlock; + } + rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_free_unlock; + } + s->i = 0; + while (s->decrypted_filename[s->i] != '\0' + && s->i < s->block_aligned_filename_size) + s->i++; + if (s->i == s->block_aligned_filename_size) { + printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " + "find valid separator between random characters and " + "the filename\n", __func__); + rc = -EINVAL; + goto out_free_unlock; + } + s->i++; + (*filename_size) = (s->block_aligned_filename_size - s->i); + if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) { + printk(KERN_WARNING "%s: Filename size is [%zd], which is " + "invalid\n", __func__, (*filename_size)); + rc = -EINVAL; + goto out_free_unlock; + } + (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); + if (!(*filename)) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + ((*filename_size) + 1)); + rc = -ENOMEM; + goto out_free_unlock; + } + memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size)); + (*filename)[(*filename_size)] = '\0'; +out_free_unlock: + kfree(s->decrypted_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (rc) { + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + } + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + } + kfree(s); + return rc; +} + +static int +ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) +{ + int rc = 0; + + (*sig) = NULL; + switch (auth_tok->token_type) { + case ECRYPTFS_PASSWORD: + (*sig) = auth_tok->token.password.signature; + break; + case ECRYPTFS_PRIVATE_KEY: + (*sig) = auth_tok->token.private_key.signature; + break; + default: + printk(KERN_ERR "Cannot get sig for auth_tok of type [%d]\n", + auth_tok->token_type); + rc = -EINVAL; + } + return rc; +} + +/** + * decrypt_pki_encrypted_session_key - Decrypt the session key with the given auth_tok. + * @auth_tok: The key authentication token used to decrypt the session key + * @crypt_stat: The cryptographic context + * + * Returns zero on success; non-zero error otherwise. + */ +static int +decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat) +{ + u8 cipher_code = 0; + struct ecryptfs_msg_ctx *msg_ctx; + struct ecryptfs_message *msg = NULL; + char *auth_tok_sig; + char *payload = NULL; + size_t payload_len = 0; + int rc; + + rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); + if (rc) { + printk(KERN_ERR "Unrecognized auth tok type: [%d]\n", + auth_tok->token_type); + goto out; + } + rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), + &payload, &payload_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); + goto out; + } + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd: %d\n", rc); + goto out; + } + rc = ecryptfs_wait_for_response(msg_ctx, &msg); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to receive tag 65 packet " + "from the user space daemon\n"); + rc = -EIO; + goto out; + } + rc = parse_tag_65_packet(&(auth_tok->session_key), + &cipher_code, msg); + if (rc) { + printk(KERN_ERR "Failed to parse tag 65 packet; rc = [%d]\n", + rc); + goto out; + } + auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; + memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size); + crypt_stat->key_size = auth_tok->session_key.decrypted_key_size; + rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code); + if (rc) { + ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n", + cipher_code) + goto out; + } + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n"); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +out: + kfree(msg); + kfree(payload); + return rc; +} + +static void wipe_auth_tok_list(struct list_head *auth_tok_list_head) +{ + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; + + list_for_each_entry_safe(auth_tok_list_item, auth_tok_list_item_tmp, + auth_tok_list_head, list) { + list_del(&auth_tok_list_item->list); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); + } +} + +struct kmem_cache *ecryptfs_auth_tok_list_item_cache; + +/** + * parse_tag_1_packet + * @crypt_stat: The cryptographic context to modify based on packet contents + * @data: The raw bytes of the packet. + * @auth_tok_list: eCryptfs parses packets into authentication tokens; + * a new authentication token will be placed at the + * end of this list for this packet. + * @new_auth_tok: Pointer to a pointer to memory that this function + * allocates; sets the memory address of the pointer to + * NULL on error. This object is added to the + * auth_tok_list. + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error. + * @max_packet_size: The maximum allowable packet size + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *data, struct list_head *auth_tok_list, + struct ecryptfs_auth_tok **new_auth_tok, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*new_auth_tok) = NULL; + /** + * This format is inspired by OpenPGP; see RFC 2440 + * packet tag 1 + * + * Tag 1 identifier (1 byte) + * Max Tag 1 packet size (max 3 bytes) + * Version (1 byte) + * Key identifier (8 bytes; ECRYPTFS_SIG_SIZE) + * Cipher identifier (1 byte) + * Encrypted key size (arbitrary) + * + * 12 bytes minimum packet size + */ + if (unlikely(max_packet_size < 12)) { + printk(KERN_ERR "Invalid max packet size; must be >=12\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_1_PACKET_TYPE) { + printk(KERN_ERR "Enter w/ first byte != 0x%.2x\n", + ECRYPTFS_TAG_1_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or + * at end of function upon failure */ + auth_tok_list_item = + kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache, + GFP_KERNEL); + if (!auth_tok_list_item) { + printk(KERN_ERR "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + (*new_auth_tok) = &auth_tok_list_item->auth_tok; + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out_free; + } + if (unlikely(body_size < (ECRYPTFS_SIG_SIZE + 2))) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out_free; + } + (*packet_size) += length_size; + if (unlikely((*packet_size) + body_size > max_packet_size)) { + printk(KERN_WARNING "Packet size exceeds max\n"); + rc = -EINVAL; + goto out_free; + } + if (unlikely(data[(*packet_size)++] != 0x03)) { + printk(KERN_WARNING "Unknown version number [%d]\n", + data[(*packet_size) - 1]); + rc = -EINVAL; + goto out_free; + } + ecryptfs_to_hex((*new_auth_tok)->token.private_key.signature, + &data[(*packet_size)], ECRYPTFS_SIG_SIZE); + *packet_size += ECRYPTFS_SIG_SIZE; + /* This byte is skipped because the kernel does not need to + * know which public key encryption algorithm was used */ + (*packet_size)++; + (*new_auth_tok)->session_key.encrypted_key_size = + body_size - (ECRYPTFS_SIG_SIZE + 2); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 1 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES"); + rc = -EINVAL; + goto out; + } + memcpy((*new_auth_tok)->session_key.encrypted_key, + &data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2))); + (*packet_size) += (*new_auth_tok)->session_key.encrypted_key_size; + (*new_auth_tok)->session_key.flags &= + ~ECRYPTFS_CONTAINS_DECRYPTED_KEY; + (*new_auth_tok)->session_key.flags |= + ECRYPTFS_CONTAINS_ENCRYPTED_KEY; + (*new_auth_tok)->token_type = ECRYPTFS_PRIVATE_KEY; + (*new_auth_tok)->flags = 0; + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + list_add(&auth_tok_list_item->list, auth_tok_list); + goto out; +out_free: + (*new_auth_tok) = NULL; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); +out: + if (rc) + (*packet_size) = 0; + return rc; +} + +/** + * parse_tag_3_packet + * @crypt_stat: The cryptographic context to modify based on packet + * contents. + * @data: The raw bytes of the packet. + * @auth_tok_list: eCryptfs parses packets into authentication tokens; + * a new authentication token will be placed at the end + * of this list for this packet. + * @new_auth_tok: Pointer to a pointer to memory that this function + * allocates; sets the memory address of the pointer to + * NULL on error. This object is added to the + * auth_tok_list. + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error. + * @max_packet_size: maximum number of bytes to parse + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *data, struct list_head *auth_tok_list, + struct ecryptfs_auth_tok **new_auth_tok, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*new_auth_tok) = NULL; + /** + *This format is inspired by OpenPGP; see RFC 2440 + * packet tag 3 + * + * Tag 3 identifier (1 byte) + * Max Tag 3 packet size (max 3 bytes) + * Version (1 byte) + * Cipher code (1 byte) + * S2K specifier (1 byte) + * Hash identifier (1 byte) + * Salt (ECRYPTFS_SALT_SIZE) + * Hash iterations (1 byte) + * Encrypted key (arbitrary) + * + * (ECRYPTFS_SALT_SIZE + 7) minimum packet size + */ + if (max_packet_size < (ECRYPTFS_SALT_SIZE + 7)) { + printk(KERN_ERR "Max packet size too large\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) { + printk(KERN_ERR "First byte != 0x%.2x; invalid packet\n", + ECRYPTFS_TAG_3_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or + * at end of function upon failure */ + auth_tok_list_item = + kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache, GFP_KERNEL); + if (!auth_tok_list_item) { + printk(KERN_ERR "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + (*new_auth_tok) = &auth_tok_list_item->auth_tok; + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Error parsing packet length; rc = [%d]\n", + rc); + goto out_free; + } + if (unlikely(body_size < (ECRYPTFS_SALT_SIZE + 5))) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out_free; + } + (*packet_size) += length_size; + if (unlikely((*packet_size) + body_size > max_packet_size)) { + printk(KERN_ERR "Packet size exceeds max\n"); + rc = -EINVAL; + goto out_free; + } + (*new_auth_tok)->session_key.encrypted_key_size = + (body_size - (ECRYPTFS_SALT_SIZE + 5)); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 3 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); + rc = -EINVAL; + goto out_free; + } + if (unlikely(data[(*packet_size)++] != 0x04)) { + printk(KERN_WARNING "Unknown version number [%d]\n", + data[(*packet_size) - 1]); + rc = -EINVAL; + goto out_free; + } + rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, + (u16)data[(*packet_size)]); + if (rc) + goto out_free; + /* A little extra work to differentiate among the AES key + * sizes; see RFC2440 */ + switch(data[(*packet_size)++]) { + case RFC2440_CIPHER_AES_192: + crypt_stat->key_size = 24; + break; + default: + crypt_stat->key_size = + (*new_auth_tok)->session_key.encrypted_key_size; + } + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) + goto out_free; + if (unlikely(data[(*packet_size)++] != 0x03)) { + printk(KERN_WARNING "Only S2K ID 3 is currently supported\n"); + rc = -ENOSYS; + goto out_free; + } + /* TODO: finish the hash mapping */ + switch (data[(*packet_size)++]) { + case 0x01: /* See RFC2440 for these numbers and their mappings */ + /* Choose MD5 */ + memcpy((*new_auth_tok)->token.password.salt, + &data[(*packet_size)], ECRYPTFS_SALT_SIZE); + (*packet_size) += ECRYPTFS_SALT_SIZE; + /* This conversion was taken straight from RFC2440 */ + (*new_auth_tok)->token.password.hash_iterations = + ((u32) 16 + (data[(*packet_size)] & 15)) + << ((data[(*packet_size)] >> 4) + 6); + (*packet_size)++; + /* Friendly reminder: + * (*new_auth_tok)->session_key.encrypted_key_size = + * (body_size - (ECRYPTFS_SALT_SIZE + 5)); */ + memcpy((*new_auth_tok)->session_key.encrypted_key, + &data[(*packet_size)], + (*new_auth_tok)->session_key.encrypted_key_size); + (*packet_size) += + (*new_auth_tok)->session_key.encrypted_key_size; + (*new_auth_tok)->session_key.flags &= + ~ECRYPTFS_CONTAINS_DECRYPTED_KEY; + (*new_auth_tok)->session_key.flags |= + ECRYPTFS_CONTAINS_ENCRYPTED_KEY; + (*new_auth_tok)->token.password.hash_algo = 0x01; /* MD5 */ + break; + default: + ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: " + "[%d]\n", data[(*packet_size) - 1]); + rc = -ENOSYS; + goto out_free; + } + (*new_auth_tok)->token_type = ECRYPTFS_PASSWORD; + /* TODO: Parametarize; we might actually want userspace to + * decrypt the session key. */ + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + list_add(&auth_tok_list_item->list, auth_tok_list); + goto out; +out_free: + (*new_auth_tok) = NULL; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); +out: + if (rc) + (*packet_size) = 0; + return rc; +} + +/** + * parse_tag_11_packet + * @data: The raw bytes of the packet + * @contents: This function writes the data contents of the literal + * packet into this memory location + * @max_contents_bytes: The maximum number of bytes that this function + * is allowed to write into contents + * @tag_11_contents_size: This function writes the size of the parsed + * contents into this memory location; zero on + * error + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error + * @max_packet_size: maximum number of bytes to parse + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_11_packet(unsigned char *data, unsigned char *contents, + size_t max_contents_bytes, size_t *tag_11_contents_size, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*tag_11_contents_size) = 0; + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 11 + * + * Tag 11 identifier (1 byte) + * Max Tag 11 packet size (max 3 bytes) + * Binary format specifier (1 byte) + * Filename length (1 byte) + * Filename ("_CONSOLE") (8 bytes) + * Modification date (4 bytes) + * Literal data (arbitrary) + * + * We need at least 16 bytes of data for the packet to even be + * valid. + */ + if (max_packet_size < 16) { + printk(KERN_ERR "Maximum packet size too small\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) { + printk(KERN_WARNING "Invalid tag 11 packet format\n"); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Invalid tag 11 packet format\n"); + goto out; + } + if (body_size < 14) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out; + } + (*packet_size) += length_size; + (*tag_11_contents_size) = (body_size - 14); + if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) { + printk(KERN_ERR "Packet size exceeds max\n"); + rc = -EINVAL; + goto out; + } + if (unlikely((*tag_11_contents_size) > max_contents_bytes)) { + printk(KERN_ERR "Literal data section in tag 11 packet exceeds " + "expected size\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != 0x62) { + printk(KERN_WARNING "Unrecognizable packet\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != 0x08) { + printk(KERN_WARNING "Unrecognizable packet\n"); + rc = -EINVAL; + goto out; + } + (*packet_size) += 12; /* Ignore filename and modification date */ + memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size)); + (*packet_size) += (*tag_11_contents_size); +out: + if (rc) { + (*packet_size) = 0; + (*tag_11_contents_size) = 0; + } + return rc; +} + +int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + char *sig) +{ + int rc = 0; + + (*auth_tok_key) = request_key(&key_type_user, sig, NULL); + if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + (*auth_tok_key) = ecryptfs_get_encrypted_key(sig); + if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + printk(KERN_ERR "Could not find key with description: [%s]\n", + sig); + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; + goto out; + } + } + down_write(&(*auth_tok_key)->sem); + rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); + if (rc) { + up_write(&(*auth_tok_key)->sem); + key_put(*auth_tok_key); + (*auth_tok_key) = NULL; + goto out; + } +out: + return rc; +} + +/** + * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. + * @auth_tok: The passphrase authentication token to use to encrypt the FEK + * @crypt_stat: The cryptographic context + * + * Returns zero on success; non-zero error otherwise + */ +static int +decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat) +{ + struct scatterlist dst_sg[2]; + struct scatterlist src_sg[2]; + struct mutex *tfm_mutex; + struct blkcipher_desc desc = { + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk( + KERN_DEBUG, "Session key encryption key (size [%d]):\n", + auth_tok->token.password.session_key_encryption_key_bytes); + ecryptfs_dump_hex( + auth_tok->token.password.session_key_encryption_key, + auth_tok->token.password.session_key_encryption_key_bytes); + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + crypt_stat->cipher); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + goto out; + } + rc = virt_to_scatterlist(auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size, + src_sg, 2); + if (rc < 1 || rc > 2) { + printk(KERN_ERR "Internal error whilst attempting to convert " + "auth_tok->session_key.encrypted_key to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "auth_tok->session_key.encrypted_key_size = [%d]\n", rc, + auth_tok->session_key.encrypted_key_size); + goto out; + } + auth_tok->session_key.decrypted_key_size = + auth_tok->session_key.encrypted_key_size; + rc = virt_to_scatterlist(auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size, + dst_sg, 2); + if (rc < 1 || rc > 2) { + printk(KERN_ERR "Internal error whilst attempting to convert " + "auth_tok->session_key.decrypted_key to scatterlist; " + "expected rc = 1; got rc = [%d]\n", rc); + goto out; + } + mutex_lock(tfm_mutex); + rc = crypto_blkcipher_setkey( + desc.tfm, auth_tok->token.password.session_key_encryption_key, + crypt_stat->key_size); + if (unlikely(rc < 0)) { + mutex_unlock(tfm_mutex); + printk(KERN_ERR "Error setting key for crypto context\n"); + rc = -EINVAL; + goto out; + } + rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg, + auth_tok->session_key.encrypted_key_size); + mutex_unlock(tfm_mutex); + if (unlikely(rc)) { + printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc); + goto out; + } + auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; + memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size); + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n", + crypt_stat->key_size); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +out: + return rc; +} + +/** + * ecryptfs_parse_packet_set + * @crypt_stat: The cryptographic context + * @src: Virtual address of region of memory containing the packets + * @ecryptfs_dentry: The eCryptfs dentry associated with the packet set + * + * Get crypt_stat to have the file's session key if the requisite key + * is available to decrypt the session key. + * + * Returns Zero if a valid authentication token was retrieved and + * processed; negative value for file not encrypted or for error + * conditions. + */ +int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *src, + struct dentry *ecryptfs_dentry) +{ + size_t i = 0; + size_t found_auth_tok; + size_t next_packet_is_auth_tok_packet; + struct list_head auth_tok_list; + struct ecryptfs_auth_tok *matching_auth_tok; + struct ecryptfs_auth_tok *candidate_auth_tok; + char *candidate_auth_tok_sig; + size_t packet_size; + struct ecryptfs_auth_tok *new_auth_tok; + unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE]; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t tag_11_contents_size; + size_t tag_11_packet_size; + struct key *auth_tok_key = NULL; + int rc = 0; + + INIT_LIST_HEAD(&auth_tok_list); + /* Parse the header to find as many packets as we can; these will be + * added the our &auth_tok_list */ + next_packet_is_auth_tok_packet = 1; + while (next_packet_is_auth_tok_packet) { + size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i); + + switch (src[i]) { + case ECRYPTFS_TAG_3_PACKET_TYPE: + rc = parse_tag_3_packet(crypt_stat, + (unsigned char *)&src[i], + &auth_tok_list, &new_auth_tok, + &packet_size, max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error parsing " + "tag 3 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += packet_size; + rc = parse_tag_11_packet((unsigned char *)&src[i], + sig_tmp_space, + ECRYPTFS_SIG_SIZE, + &tag_11_contents_size, + &tag_11_packet_size, + max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "No valid " + "(ecryptfs-specific) literal " + "packet containing " + "authentication token " + "signature found after " + "tag 3 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += tag_11_packet_size; + if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { + ecryptfs_printk(KERN_ERR, "Expected " + "signature of size [%d]; " + "read size [%zd]\n", + ECRYPTFS_SIG_SIZE, + tag_11_contents_size); + rc = -EIO; + goto out_wipe_list; + } + ecryptfs_to_hex(new_auth_tok->token.password.signature, + sig_tmp_space, tag_11_contents_size); + new_auth_tok->token.password.signature[ + ECRYPTFS_PASSWORD_SIG_SIZE] = '\0'; + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; + break; + case ECRYPTFS_TAG_1_PACKET_TYPE: + rc = parse_tag_1_packet(crypt_stat, + (unsigned char *)&src[i], + &auth_tok_list, &new_auth_tok, + &packet_size, max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error parsing " + "tag 1 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += packet_size; + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; + break; + case ECRYPTFS_TAG_11_PACKET_TYPE: + ecryptfs_printk(KERN_WARNING, "Invalid packet set " + "(Tag 11 not allowed by itself)\n"); + rc = -EIO; + goto out_wipe_list; + default: + ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " + "of the file header; hex value of " + "character is [0x%.2x]\n", i, src[i]); + next_packet_is_auth_tok_packet = 0; + } + } + if (list_empty(&auth_tok_list)) { + printk(KERN_ERR "The lower file appears to be a non-encrypted " + "eCryptfs file; this is not supported in this version " + "of the eCryptfs kernel module\n"); + rc = -EINVAL; + goto out; + } + /* auth_tok_list contains the set of authentication tokens + * parsed from the metadata. We need to find a matching + * authentication token that has the secret component(s) + * necessary to decrypt the EFEK in the auth_tok parsed from + * the metadata. There may be several potential matches, but + * just one will be sufficient to decrypt to get the FEK. */ +find_next_matching_auth_tok: + found_auth_tok = 0; + list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { + candidate_auth_tok = &auth_tok_list_item->auth_tok; + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, + "Considering cadidate auth tok:\n"); + ecryptfs_dump_auth_tok(candidate_auth_tok); + } + rc = ecryptfs_get_auth_tok_sig(&candidate_auth_tok_sig, + candidate_auth_tok); + if (rc) { + printk(KERN_ERR + "Unrecognized candidate auth tok type: [%d]\n", + candidate_auth_tok->token_type); + rc = -EINVAL; + goto out_wipe_list; + } + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &matching_auth_tok, + crypt_stat->mount_crypt_stat, + candidate_auth_tok_sig); + if (!rc) { + found_auth_tok = 1; + goto found_matching_auth_tok; + } + } + if (!found_auth_tok) { + ecryptfs_printk(KERN_ERR, "Could not find a usable " + "authentication token\n"); + rc = -EIO; + goto out_wipe_list; + } +found_matching_auth_tok: + if (candidate_auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { + memcpy(&(candidate_auth_tok->token.private_key), + &(matching_auth_tok->token.private_key), + sizeof(struct ecryptfs_private_key)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = decrypt_pki_encrypted_session_key(candidate_auth_tok, + crypt_stat); + } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { + memcpy(&(candidate_auth_tok->token.password), + &(matching_auth_tok->token.password), + sizeof(struct ecryptfs_password)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = decrypt_passphrase_encrypted_session_key( + candidate_auth_tok, crypt_stat); + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = -EINVAL; + } + if (rc) { + struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; + + ecryptfs_printk(KERN_WARNING, "Error decrypting the " + "session key for authentication token with sig " + "[%.*s]; rc = [%d]. Removing auth tok " + "candidate from the list and searching for " + "the next match.\n", ECRYPTFS_SIG_SIZE_HEX, + candidate_auth_tok_sig, rc); + list_for_each_entry_safe(auth_tok_list_item, + auth_tok_list_item_tmp, + &auth_tok_list, list) { + if (candidate_auth_tok + == &auth_tok_list_item->auth_tok) { + list_del(&auth_tok_list_item->list); + kmem_cache_free( + ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); + goto find_next_matching_auth_tok; + } + } + BUG(); + } + rc = ecryptfs_compute_root_iv(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error computing " + "the root IV\n"); + goto out_wipe_list; + } + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error initializing crypto " + "context for cipher [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + } +out_wipe_list: + wipe_auth_tok_list(&auth_tok_list); +out: + return rc; +} + +static int +pki_encrypt_session_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec) +{ + struct ecryptfs_msg_ctx *msg_ctx = NULL; + char *payload = NULL; + size_t payload_len = 0; + struct ecryptfs_message *msg; + int rc; + + rc = write_tag_66_packet(auth_tok->token.private_key.signature, + ecryptfs_code_for_cipher_string( + crypt_stat->cipher, + crypt_stat->key_size), + crypt_stat, &payload, &payload_len); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); + goto out; + } + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd: %d\n", rc); + goto out; + } + rc = ecryptfs_wait_for_response(msg_ctx, &msg); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to receive tag 67 packet " + "from the user space daemon\n"); + rc = -EIO; + goto out; + } + rc = parse_tag_67_packet(key_rec, msg); + if (rc) + ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n"); + kfree(msg); +out: + kfree(payload); + return rc; +} +/** + * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet + * @dest: Buffer into which to write the packet + * @remaining_bytes: Maximum number of bytes that can be writtn + * @auth_tok_key: The authentication token key to unlock and put when done with + * @auth_tok + * @auth_tok: The authentication token used for generating the tag 1 packet + * @crypt_stat: The cryptographic context + * @key_rec: The key record struct for the tag 1 packet + * @packet_size: This function will write the number of bytes that end + * up constituting the packet; set to zero on error + * + * Returns zero on success; non-zero on error. + */ +static int +write_tag_1_packet(char *dest, size_t *remaining_bytes, + struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec, size_t *packet_size) +{ + size_t i; + size_t encrypted_session_key_valid = 0; + size_t packet_size_length; + size_t max_packet_size; + int rc = 0; + + (*packet_size) = 0; + ecryptfs_from_hex(key_rec->sig, auth_tok->token.private_key.signature, + ECRYPTFS_SIG_SIZE); + encrypted_session_key_valid = 0; + for (i = 0; i < crypt_stat->key_size; i++) + encrypted_session_key_valid |= + auth_tok->session_key.encrypted_key[i]; + if (encrypted_session_key_valid) { + memcpy(key_rec->enc_key, + auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + goto encrypted_session_key_set; + } + if (auth_tok->session_key.encrypted_key_size == 0) + auth_tok->session_key.encrypted_key_size = + auth_tok->token.private_key.key_size; + rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat, + key_rec); + if (rc) { + printk(KERN_ERR "Failed to encrypt session key via a key " + "module; rc = [%d]\n", rc); + goto out; + } + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "Encrypted key:\n"); + ecryptfs_dump_hex(key_rec->enc_key, key_rec->enc_key_size); + } +encrypted_session_key_set: + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 1 */ + max_packet_size = (1 /* Tag 1 identifier */ + + 3 /* Max Tag 1 packet size */ + + 1 /* Version */ + + ECRYPTFS_SIG_SIZE /* Key identifier */ + + 1 /* Cipher identifier */ + + key_rec->enc_key_size); /* Encrypted key size */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet length larger than maximum allowable; " + "need up to [%td] bytes, but there are only [%td] " + "available\n", max_packet_size, (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = ECRYPTFS_TAG_1_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 1 packet " + "header; cannot generate packet length\n"); + goto out; + } + (*packet_size) += packet_size_length; + dest[(*packet_size)++] = 0x03; /* version 3 */ + memcpy(&dest[(*packet_size)], key_rec->sig, ECRYPTFS_SIG_SIZE); + (*packet_size) += ECRYPTFS_SIG_SIZE; + dest[(*packet_size)++] = RFC2440_CIPHER_RSA; + memcpy(&dest[(*packet_size)], key_rec->enc_key, + key_rec->enc_key_size); + (*packet_size) += key_rec->enc_key_size; +out: + if (rc) + (*packet_size) = 0; + else + (*remaining_bytes) -= (*packet_size); + return rc; +} + +/** + * write_tag_11_packet + * @dest: Target into which Tag 11 packet is to be written + * @remaining_bytes: Maximum packet length + * @contents: Byte array of contents to copy in + * @contents_length: Number of bytes in contents + * @packet_length: Length of the Tag 11 packet written; zero on error + * + * Returns zero on success; non-zero on error. + */ +static int +write_tag_11_packet(char *dest, size_t *remaining_bytes, char *contents, + size_t contents_length, size_t *packet_length) +{ + size_t packet_size_length; + size_t max_packet_size; + int rc = 0; + + (*packet_length) = 0; + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 11 */ + max_packet_size = (1 /* Tag 11 identifier */ + + 3 /* Max Tag 11 packet size */ + + 1 /* Binary format specifier */ + + 1 /* Filename length */ + + 8 /* Filename ("_CONSOLE") */ + + 4 /* Modification date */ + + contents_length); /* Literal data */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet length larger than maximum allowable; " + "need up to [%td] bytes, but there are only [%td] " + "available\n", max_packet_size, (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[(*packet_length)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + printk(KERN_ERR "Error generating tag 11 packet header; cannot " + "generate packet length. rc = [%d]\n", rc); + goto out; + } + (*packet_length) += packet_size_length; + dest[(*packet_length)++] = 0x62; /* binary data format specifier */ + dest[(*packet_length)++] = 8; + memcpy(&dest[(*packet_length)], "_CONSOLE", 8); + (*packet_length) += 8; + memset(&dest[(*packet_length)], 0x00, 4); + (*packet_length) += 4; + memcpy(&dest[(*packet_length)], contents, contents_length); + (*packet_length) += contents_length; + out: + if (rc) + (*packet_length) = 0; + else + (*remaining_bytes) -= (*packet_length); + return rc; +} + +/** + * write_tag_3_packet + * @dest: Buffer into which to write the packet + * @remaining_bytes: Maximum number of bytes that can be written + * @auth_tok: Authentication token + * @crypt_stat: The cryptographic context + * @key_rec: encrypted key + * @packet_size: This function will write the number of bytes that end + * up constituting the packet; set to zero on error + * + * Returns zero on success; non-zero on error. + */ +static int +write_tag_3_packet(char *dest, size_t *remaining_bytes, + struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec, size_t *packet_size) +{ + size_t i; + size_t encrypted_session_key_valid = 0; + char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; + struct scatterlist dst_sg[2]; + struct scatterlist src_sg[2]; + struct mutex *tfm_mutex = NULL; + u8 cipher_code; + size_t packet_size_length; + size_t max_packet_size; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + crypt_stat->mount_crypt_stat; + struct blkcipher_desc desc = { + .tfm = NULL, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + (*packet_size) = 0; + ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature, + ECRYPTFS_SIG_SIZE); + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + crypt_stat->cipher); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + goto out; + } + if (mount_crypt_stat->global_default_cipher_key_size == 0) { + struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm); + + printk(KERN_WARNING "No key size specified at mount; " + "defaulting to [%d]\n", alg->max_keysize); + mount_crypt_stat->global_default_cipher_key_size = + alg->max_keysize; + } + if (crypt_stat->key_size == 0) + crypt_stat->key_size = + mount_crypt_stat->global_default_cipher_key_size; + if (auth_tok->session_key.encrypted_key_size == 0) + auth_tok->session_key.encrypted_key_size = + crypt_stat->key_size; + if (crypt_stat->key_size == 24 + && strcmp("aes", crypt_stat->cipher) == 0) { + memset((crypt_stat->key + 24), 0, 8); + auth_tok->session_key.encrypted_key_size = 32; + } else + auth_tok->session_key.encrypted_key_size = crypt_stat->key_size; + key_rec->enc_key_size = + auth_tok->session_key.encrypted_key_size; + encrypted_session_key_valid = 0; + for (i = 0; i < auth_tok->session_key.encrypted_key_size; i++) + encrypted_session_key_valid |= + auth_tok->session_key.encrypted_key[i]; + if (encrypted_session_key_valid) { + ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " + "using auth_tok->session_key.encrypted_key, " + "where key_rec->enc_key_size = [%zd]\n", + key_rec->enc_key_size); + memcpy(key_rec->enc_key, + auth_tok->session_key.encrypted_key, + key_rec->enc_key_size); + goto encrypted_session_key_set; + } + if (auth_tok->token.password.flags & + ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET) { + ecryptfs_printk(KERN_DEBUG, "Using previously generated " + "session key encryption key of size [%d]\n", + auth_tok->token.password. + session_key_encryption_key_bytes); + memcpy(session_key_encryption_key, + auth_tok->token.password.session_key_encryption_key, + crypt_stat->key_size); + ecryptfs_printk(KERN_DEBUG, + "Cached session key encryption key:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(session_key_encryption_key, 16); + } + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n"); + ecryptfs_dump_hex(session_key_encryption_key, 16); + } + rc = virt_to_scatterlist(crypt_stat->key, key_rec->enc_key_size, + src_sg, 2); + if (rc < 1 || rc > 2) { + ecryptfs_printk(KERN_ERR, "Error generating scatterlist " + "for crypt_stat session key; expected rc = 1; " + "got rc = [%d]. key_rec->enc_key_size = [%zd]\n", + rc, key_rec->enc_key_size); + rc = -ENOMEM; + goto out; + } + rc = virt_to_scatterlist(key_rec->enc_key, key_rec->enc_key_size, + dst_sg, 2); + if (rc < 1 || rc > 2) { + ecryptfs_printk(KERN_ERR, "Error generating scatterlist " + "for crypt_stat encrypted session key; " + "expected rc = 1; got rc = [%d]. " + "key_rec->enc_key_size = [%zd]\n", rc, + key_rec->enc_key_size); + rc = -ENOMEM; + goto out; + } + mutex_lock(tfm_mutex); + rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key, + crypt_stat->key_size); + if (rc < 0) { + mutex_unlock(tfm_mutex); + ecryptfs_printk(KERN_ERR, "Error setting key for crypto " + "context; rc = [%d]\n", rc); + goto out; + } + rc = 0; + ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", + crypt_stat->key_size); + rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, + (*key_rec).enc_key_size); + mutex_unlock(tfm_mutex); + if (rc) { + printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc); + goto out; + } + ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n", + key_rec->enc_key_size); + ecryptfs_dump_hex(key_rec->enc_key, + key_rec->enc_key_size); + } +encrypted_session_key_set: + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 3 */ + max_packet_size = (1 /* Tag 3 identifier */ + + 3 /* Max Tag 3 packet size */ + + 1 /* Version */ + + 1 /* Cipher code */ + + 1 /* S2K specifier */ + + 1 /* Hash identifier */ + + ECRYPTFS_SALT_SIZE /* Salt */ + + 1 /* Hash iterations */ + + key_rec->enc_key_size); /* Encrypted key size */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet too large; need up to [%td] bytes, but " + "there are only [%td] available\n", max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE; + /* Chop off the Tag 3 identifier(1) and Tag 3 packet size(3) + * to get the number of octets in the actual Tag 3 packet */ + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + printk(KERN_ERR "Error generating tag 3 packet header; cannot " + "generate packet length. rc = [%d]\n", rc); + goto out; + } + (*packet_size) += packet_size_length; + dest[(*packet_size)++] = 0x04; /* version 4 */ + /* TODO: Break from RFC2440 so that arbitrary ciphers can be + * specified with strings */ + cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher, + crypt_stat->key_size); + if (cipher_code == 0) { + ecryptfs_printk(KERN_WARNING, "Unable to generate code for " + "cipher [%s]\n", crypt_stat->cipher); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = cipher_code; + dest[(*packet_size)++] = 0x03; /* S2K */ + dest[(*packet_size)++] = 0x01; /* MD5 (TODO: parameterize) */ + memcpy(&dest[(*packet_size)], auth_tok->token.password.salt, + ECRYPTFS_SALT_SIZE); + (*packet_size) += ECRYPTFS_SALT_SIZE; /* salt */ + dest[(*packet_size)++] = 0x60; /* hash iterations (65536) */ + memcpy(&dest[(*packet_size)], key_rec->enc_key, + key_rec->enc_key_size); + (*packet_size) += key_rec->enc_key_size; +out: + if (rc) + (*packet_size) = 0; + else + (*remaining_bytes) -= (*packet_size); + return rc; +} + +struct kmem_cache *ecryptfs_key_record_cache; + +/** + * ecryptfs_generate_key_packet_set + * @dest_base: Virtual address from which to write the key record set + * @crypt_stat: The cryptographic context from which the + * authentication tokens will be retrieved + * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat + * for the global parameters + * @len: The amount written + * @max: The maximum amount of data allowed to be written + * + * Generates a key packet set and writes it to the virtual address + * passed in. + * + * Returns zero on success; non-zero on error. + */ +int +ecryptfs_generate_key_packet_set(char *dest_base, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, size_t *len, + size_t max) +{ + struct ecryptfs_auth_tok *auth_tok; + struct key *auth_tok_key = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + size_t written; + struct ecryptfs_key_record *key_rec; + struct ecryptfs_key_sig *key_sig; + int rc = 0; + + (*len) = 0; + mutex_lock(&crypt_stat->keysig_list_mutex); + key_rec = kmem_cache_alloc(ecryptfs_key_record_cache, GFP_KERNEL); + if (!key_rec) { + rc = -ENOMEM; + goto out; + } + list_for_each_entry(key_sig, &crypt_stat->keysig_list, + crypt_stat_list) { + memset(key_rec, 0, sizeof(*key_rec)); + rc = ecryptfs_find_global_auth_tok_for_sig(&auth_tok_key, + &auth_tok, + mount_crypt_stat, + key_sig->keysig); + if (rc) { + printk(KERN_WARNING "Unable to retrieve auth tok with " + "sig = [%s]\n", key_sig->keysig); + rc = process_find_global_auth_tok_for_sig_err(rc); + goto out_free; + } + if (auth_tok->token_type == ECRYPTFS_PASSWORD) { + rc = write_tag_3_packet((dest_base + (*len)), + &max, auth_tok, + crypt_stat, key_rec, + &written); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "writing tag 3 packet\n"); + goto out_free; + } + (*len) += written; + /* Write auth tok signature packet */ + rc = write_tag_11_packet((dest_base + (*len)), &max, + key_rec->sig, + ECRYPTFS_SIG_SIZE, &written); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error writing " + "auth tok signature packet\n"); + goto out_free; + } + (*len) += written; + } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { + rc = write_tag_1_packet(dest_base + (*len), &max, + auth_tok_key, auth_tok, + crypt_stat, key_rec, &written); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "writing tag 1 packet\n"); + goto out_free; + } + (*len) += written; + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + ecryptfs_printk(KERN_WARNING, "Unsupported " + "authentication token type\n"); + rc = -EINVAL; + goto out_free; + } + } + if (likely(max > 0)) { + dest_base[(*len)] = 0x00; + } else { + ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n"); + rc = -EIO; + } +out_free: + kmem_cache_free(ecryptfs_key_record_cache, key_rec); +out: + if (rc) + (*len) = 0; + mutex_unlock(&crypt_stat->keysig_list_mutex); + return rc; +} + +struct kmem_cache *ecryptfs_key_sig_cache; + +int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) +{ + struct ecryptfs_key_sig *new_key_sig; + + new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); + if (!new_key_sig) { + printk(KERN_ERR + "Error allocating from ecryptfs_key_sig_cache\n"); + return -ENOMEM; + } + memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + /* Caller must hold keysig_list_mutex */ + list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); + + return 0; +} + +struct kmem_cache *ecryptfs_global_auth_tok_cache; + +int +ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig, u32 global_auth_tok_flags) +{ + struct ecryptfs_global_auth_tok *new_auth_tok; + int rc = 0; + + new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache, + GFP_KERNEL); + if (!new_auth_tok) { + rc = -ENOMEM; + printk(KERN_ERR "Error allocating from " + "ecryptfs_global_auth_tok_cache\n"); + goto out; + } + memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_auth_tok->flags = global_auth_tok_flags; + new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_add(&new_auth_tok->mount_crypt_stat_list, + &mount_crypt_stat->global_auth_tok_list); + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); +out: + return rc; +} + diff --git a/kernel/fs/ecryptfs/kthread.c b/kernel/fs/ecryptfs/kthread.c new file mode 100644 index 000000000..866bb18ef --- /dev/null +++ b/kernel/fs/ecryptfs/kthread.c @@ -0,0 +1,172 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +struct ecryptfs_open_req { + struct file **lower_file; + struct path path; + struct completion done; + struct list_head kthread_ctl_list; +}; + +static struct ecryptfs_kthread_ctl { +#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001 + u32 flags; + struct mutex mux; + struct list_head req_list; + wait_queue_head_t wait; +} ecryptfs_kthread_ctl; + +static struct task_struct *ecryptfs_kthread; + +/** + * ecryptfs_threadfn + * @ignored: ignored + * + * The eCryptfs kernel thread that has the responsibility of getting + * the lower file with RW permissions. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_threadfn(void *ignored) +{ + set_freezable(); + while (1) { + struct ecryptfs_open_req *req; + + wait_event_freezable( + ecryptfs_kthread_ctl.wait, + (!list_empty(&ecryptfs_kthread_ctl.req_list) + || kthread_should_stop())); + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + mutex_unlock(&ecryptfs_kthread_ctl.mux); + goto out; + } + while (!list_empty(&ecryptfs_kthread_ctl.req_list)) { + req = list_first_entry(&ecryptfs_kthread_ctl.req_list, + struct ecryptfs_open_req, + kthread_ctl_list); + list_del(&req->kthread_ctl_list); + *req->lower_file = dentry_open(&req->path, + (O_RDWR | O_LARGEFILE), current_cred()); + complete(&req->done); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + } +out: + return 0; +} + +int __init ecryptfs_init_kthread(void) +{ + int rc = 0; + + mutex_init(&ecryptfs_kthread_ctl.mux); + init_waitqueue_head(&ecryptfs_kthread_ctl.wait); + INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list); + ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL, + "ecryptfs-kthread"); + if (IS_ERR(ecryptfs_kthread)) { + rc = PTR_ERR(ecryptfs_kthread); + printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]" + "\n", __func__, rc); + } + return rc; +} + +void ecryptfs_destroy_kthread(void) +{ + struct ecryptfs_open_req *req, *tmp; + + mutex_lock(&ecryptfs_kthread_ctl.mux); + ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; + list_for_each_entry_safe(req, tmp, &ecryptfs_kthread_ctl.req_list, + kthread_ctl_list) { + list_del(&req->kthread_ctl_list); + *req->lower_file = ERR_PTR(-EIO); + complete(&req->done); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + kthread_stop(ecryptfs_kthread); + wake_up(&ecryptfs_kthread_ctl.wait); +} + +/** + * ecryptfs_privileged_open + * @lower_file: Result of dentry_open by root on lower dentry + * @lower_dentry: Lower dentry for file to open + * @lower_mnt: Lower vfsmount for file to open + * + * This function gets a r/w file opened againt the lower dentry. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt, + const struct cred *cred) +{ + struct ecryptfs_open_req req; + int flags = O_LARGEFILE; + int rc = 0; + + init_completion(&req.done); + req.lower_file = lower_file; + req.path.dentry = lower_dentry; + req.path.mnt = lower_mnt; + + /* Corresponding dput() and mntput() are done when the + * lower file is fput() when all eCryptfs files for the inode are + * released. */ + flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; + (*lower_file) = dentry_open(&req.path, flags, cred); + if (!IS_ERR(*lower_file)) + goto out; + if ((flags & O_ACCMODE) == O_RDONLY) { + rc = PTR_ERR((*lower_file)); + goto out; + } + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + rc = -EIO; + mutex_unlock(&ecryptfs_kthread_ctl.mux); + printk(KERN_ERR "%s: We are in the middle of shutting down; " + "aborting privileged request to open lower file\n", + __func__); + goto out; + } + list_add_tail(&req.kthread_ctl_list, &ecryptfs_kthread_ctl.req_list); + mutex_unlock(&ecryptfs_kthread_ctl.mux); + wake_up(&ecryptfs_kthread_ctl.wait); + wait_for_completion(&req.done); + if (IS_ERR(*lower_file)) + rc = PTR_ERR(*lower_file); +out: + return rc; +} diff --git a/kernel/fs/ecryptfs/main.c b/kernel/fs/ecryptfs/main.c new file mode 100644 index 000000000..4f4d0474b --- /dev/null +++ b/kernel/fs/ecryptfs/main.c @@ -0,0 +1,906 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * Module parameter that defines the ecryptfs_verbosity level. + */ +int ecryptfs_verbosity = 0; + +module_param(ecryptfs_verbosity, int, 0); +MODULE_PARM_DESC(ecryptfs_verbosity, + "Initial verbosity level (0 or 1; defaults to " + "0, which is Quiet)"); + +/** + * Module parameter that defines the number of message buffer elements + */ +unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; + +module_param(ecryptfs_message_buf_len, uint, 0); +MODULE_PARM_DESC(ecryptfs_message_buf_len, + "Number of message buffer elements"); + +/** + * Module parameter that defines the maximum guaranteed amount of time to wait + * for a response from ecryptfsd. The actual sleep time will be, more than + * likely, a small amount greater than this specified value, but only less if + * the message successfully arrives. + */ +signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; + +module_param(ecryptfs_message_wait_timeout, long, 0); +MODULE_PARM_DESC(ecryptfs_message_wait_timeout, + "Maximum number of seconds that an operation will " + "sleep while waiting for a message response from " + "userspace"); + +/** + * Module parameter that is an estimate of the maximum number of users + * that will be concurrently using eCryptfs. Set this to the right + * value to balance performance and memory use. + */ +unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; + +module_param(ecryptfs_number_of_users, uint, 0); +MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " + "concurrent users of eCryptfs"); + +void __ecryptfs_printk(const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + if (fmt[1] == '7') { /* KERN_DEBUG */ + if (ecryptfs_verbosity >= 1) + vprintk(fmt, args); + } else + vprintk(fmt, args); + va_end(args); +} + +/** + * ecryptfs_init_lower_file + * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with + * the lower dentry and the lower mount set + * + * eCryptfs only ever keeps a single open file for every lower + * inode. All I/O operations to the lower inode occur through that + * file. When the first eCryptfs dentry that interposes with the first + * lower dentry for that inode is created, this function creates the + * lower file struct and associates it with the eCryptfs + * inode. When all eCryptfs files associated with the inode are released, the + * file is closed. + * + * The lower file will be opened with read/write permissions, if + * possible. Otherwise, it is opened read-only. + * + * This function does nothing if a lower file is already + * associated with the eCryptfs inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_init_lower_file(struct dentry *dentry, + struct file **lower_file) +{ + const struct cred *cred = current_cred(); + struct path *path = ecryptfs_dentry_to_lower_path(dentry); + int rc; + + rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, + cred); + if (rc) { + printk(KERN_ERR "Error opening lower file " + "for lower_dentry [0x%p] and lower_mnt [0x%p]; " + "rc = [%d]\n", path->dentry, path->mnt, rc); + (*lower_file) = NULL; + } + return rc; +} + +int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + int count, rc = 0; + + inode_info = ecryptfs_inode_to_private(inode); + mutex_lock(&inode_info->lower_file_mutex); + count = atomic_inc_return(&inode_info->lower_file_count); + if (WARN_ON_ONCE(count < 1)) + rc = -EINVAL; + else if (count == 1) { + rc = ecryptfs_init_lower_file(dentry, + &inode_info->lower_file); + if (rc) + atomic_set(&inode_info->lower_file_count, 0); + } + mutex_unlock(&inode_info->lower_file_mutex); + return rc; +} + +void ecryptfs_put_lower_file(struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + + inode_info = ecryptfs_inode_to_private(inode); + if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, + &inode_info->lower_file_mutex)) { + filemap_write_and_wait(inode->i_mapping); + fput(inode_info->lower_file); + inode_info->lower_file = NULL; + mutex_unlock(&inode_info->lower_file_mutex); + } +} + +enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, + ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, + ecryptfs_opt_ecryptfs_key_bytes, + ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, + ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, + ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, + ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_check_dev_ruid, + ecryptfs_opt_err }; + +static const match_table_t tokens = { + {ecryptfs_opt_sig, "sig=%s"}, + {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, + {ecryptfs_opt_cipher, "cipher=%s"}, + {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, + {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, + {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, + {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, + {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, + {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, + {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, + {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, + {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, + {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, + {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, + {ecryptfs_opt_err, NULL} +}; + +static int ecryptfs_init_global_auth_toks( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + struct ecryptfs_auth_tok *auth_tok; + int rc = 0; + + list_for_each_entry(global_auth_tok, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + rc = ecryptfs_keyring_auth_tok_for_sig( + &global_auth_tok->global_auth_tok_key, &auth_tok, + global_auth_tok->sig); + if (rc) { + printk(KERN_ERR "Could not find valid key in user " + "session keyring for sig specified in mount " + "option: [%s]\n", global_auth_tok->sig); + global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; + goto out; + } else { + global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; + up_write(&(global_auth_tok->global_auth_tok_key)->sem); + } + } +out: + return rc; +} + +static void ecryptfs_init_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + memset((void *)mount_crypt_stat, 0, + sizeof(struct ecryptfs_mount_crypt_stat)); + INIT_LIST_HEAD(&mount_crypt_stat->global_auth_tok_list); + mutex_init(&mount_crypt_stat->global_auth_tok_list_mutex); + mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED; +} + +/** + * ecryptfs_parse_options + * @sb: The ecryptfs super block + * @options: The options passed to the kernel + * @check_ruid: set to 1 if device uid should be checked against the ruid + * + * Parse mount options: + * debug=N - ecryptfs_verbosity level for debug output + * sig=XXX - description(signature) of the key to use + * + * Returns the dentry object of the lower-level (lower/interposed) + * directory; We want to mount our stackable file system on top of + * that lower directory. + * + * The signature of the key to use must be the description of a key + * already in the keyring. Mounting will fail if the key can not be + * found. + * + * Returns zero on success; non-zero on error + */ +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + uid_t *check_ruid) +{ + char *p; + int rc = 0; + int sig_set = 0; + int cipher_name_set = 0; + int fn_cipher_name_set = 0; + int cipher_key_bytes; + int cipher_key_bytes_set = 0; + int fn_cipher_key_bytes; + int fn_cipher_key_bytes_set = 0; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &sbi->mount_crypt_stat; + substring_t args[MAX_OPT_ARGS]; + int token; + char *sig_src; + char *cipher_name_dst; + char *cipher_name_src; + char *fn_cipher_name_dst; + char *fn_cipher_name_src; + char *fnek_dst; + char *fnek_src; + char *cipher_key_bytes_src; + char *fn_cipher_key_bytes_src; + u8 cipher_code; + + *check_ruid = 0; + + if (!options) { + rc = -EINVAL; + goto out; + } + ecryptfs_init_mount_crypt_stat(mount_crypt_stat); + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + token = match_token(p, tokens, args); + switch (token) { + case ecryptfs_opt_sig: + case ecryptfs_opt_ecryptfs_sig: + sig_src = args[0].from; + rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, + sig_src, 0); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global sig; rc = [%d]\n", rc); + goto out; + } + sig_set = 1; + break; + case ecryptfs_opt_cipher: + case ecryptfs_opt_ecryptfs_cipher: + cipher_name_src = args[0].from; + cipher_name_dst = + mount_crypt_stat-> + global_default_cipher_name; + strncpy(cipher_name_dst, cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + cipher_name_set = 1; + break; + case ecryptfs_opt_ecryptfs_key_bytes: + cipher_key_bytes_src = args[0].from; + cipher_key_bytes = + (int)simple_strtol(cipher_key_bytes_src, + &cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_cipher_key_size = + cipher_key_bytes; + cipher_key_bytes_set = 1; + break; + case ecryptfs_opt_passthrough: + mount_crypt_stat->flags |= + ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; + break; + case ecryptfs_opt_xattr_metadata: + mount_crypt_stat->flags |= + ECRYPTFS_XATTR_METADATA_ENABLED; + break; + case ecryptfs_opt_encrypted_view: + mount_crypt_stat->flags |= + ECRYPTFS_XATTR_METADATA_ENABLED; + mount_crypt_stat->flags |= + ECRYPTFS_ENCRYPTED_VIEW_ENABLED; + break; + case ecryptfs_opt_fnek_sig: + fnek_src = args[0].from; + fnek_dst = + mount_crypt_stat->global_default_fnek_sig; + strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX); + mount_crypt_stat->global_default_fnek_sig[ + ECRYPTFS_SIG_SIZE_HEX] = '\0'; + rc = ecryptfs_add_global_auth_tok( + mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_AUTH_TOK_FNEK); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global fnek sig [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fnek_sig, + rc); + goto out; + } + mount_crypt_stat->flags |= + (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES + | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); + break; + case ecryptfs_opt_fn_cipher: + fn_cipher_name_src = args[0].from; + fn_cipher_name_dst = + mount_crypt_stat->global_default_fn_cipher_name; + strncpy(fn_cipher_name_dst, fn_cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + mount_crypt_stat->global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + fn_cipher_name_set = 1; + break; + case ecryptfs_opt_fn_cipher_key_bytes: + fn_cipher_key_bytes_src = args[0].from; + fn_cipher_key_bytes = + (int)simple_strtol(fn_cipher_key_bytes_src, + &fn_cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_fn_cipher_key_bytes = + fn_cipher_key_bytes; + fn_cipher_key_bytes_set = 1; + break; + case ecryptfs_opt_unlink_sigs: + mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; + break; + case ecryptfs_opt_mount_auth_tok_only: + mount_crypt_stat->flags |= + ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; + break; + case ecryptfs_opt_check_dev_ruid: + *check_ruid = 1; + break; + case ecryptfs_opt_err: + default: + printk(KERN_WARNING + "%s: eCryptfs: unrecognized option [%s]\n", + __func__, p); + } + } + if (!sig_set) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "You must supply at least one valid " + "auth tok signature as a mount " + "parameter; see the eCryptfs README\n"); + goto out; + } + if (!cipher_name_set) { + int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); + + BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); + strcpy(mount_crypt_stat->global_default_cipher_name, + ECRYPTFS_DEFAULT_CIPHER); + } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_name_set) + strcpy(mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_cipher_name); + if (!cipher_key_bytes_set) + mount_crypt_stat->global_default_cipher_key_size = 0; + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_key_bytes_set) + mount_crypt_stat->global_default_fn_cipher_key_bytes = + mount_crypt_stat->global_default_cipher_key_size; + + cipher_code = ecryptfs_code_for_cipher_string( + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size); + if (!cipher_code) { + ecryptfs_printk(KERN_ERR, + "eCryptfs doesn't support cipher: %s", + mount_crypt_stat->global_default_cipher_name); + rc = -EINVAL; + goto out; + } + + mutex_lock(&key_tfm_list_mutex); + if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, + NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !ecryptfs_tfm_exists( + mount_crypt_stat->global_default_fn_cipher_name, NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + mutex_unlock(&key_tfm_list_mutex); + rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); + if (rc) + printk(KERN_WARNING "One or more global auth toks could not " + "properly register; rc = [%d]\n", rc); +out: + return rc; +} + +struct kmem_cache *ecryptfs_sb_info_cache; +static struct file_system_type ecryptfs_fs_type; + +/** + * ecryptfs_get_sb + * @fs_type + * @flags + * @dev_name: The path to mount over + * @raw_data: The options passed into the kernel + */ +static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct super_block *s; + struct ecryptfs_sb_info *sbi; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_dentry_info *root_info; + const char *err = "Getting sb failed"; + struct inode *inode; + struct path path; + uid_t check_ruid; + int rc; + + sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); + if (!sbi) { + rc = -ENOMEM; + goto out; + } + + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); + if (rc) { + err = "Error parsing options"; + goto out; + } + mount_crypt_stat = &sbi->mount_crypt_stat; + + s = sget(fs_type, NULL, set_anon_super, flags, NULL); + if (IS_ERR(s)) { + rc = PTR_ERR(s); + goto out; + } + + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs"); + if (rc) + goto out1; + + ecryptfs_set_superblock_private(s, sbi); + s->s_bdi = &sbi->bdi; + + /* ->kill_sb() will take care of sbi after that point */ + sbi = NULL; + s->s_op = &ecryptfs_sops; + s->s_d_op = &ecryptfs_dops; + + err = "Reading sb failed"; + rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); + if (rc) { + ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); + goto out1; + } + if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { + rc = -EINVAL; + printk(KERN_ERR "Mount on filesystem of type " + "eCryptfs explicitly disallowed due to " + "known incompatibilities\n"); + goto out_free; + } + + if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { + rc = -EPERM; + printk(KERN_ERR "Mount of device (uid: %d) not owned by " + "requested user (uid: %d)\n", + i_uid_read(d_inode(path.dentry)), + from_kuid(&init_user_ns, current_uid())); + goto out_free; + } + + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); + + /** + * Set the POSIX ACL flag based on whether they're enabled in the lower + * mount. + */ + s->s_flags = flags & ~MS_POSIXACL; + s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; + + /** + * Force a read-only eCryptfs mount when: + * 1) The lower mount is ro + * 2) The ecryptfs_encrypted_view mount option is specified + */ + if (path.dentry->d_sb->s_flags & MS_RDONLY || + mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + s->s_flags |= MS_RDONLY; + + s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + s->s_blocksize = path.dentry->d_sb->s_blocksize; + s->s_magic = ECRYPTFS_SUPER_MAGIC; + s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; + + rc = -EINVAL; + if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); + goto out_free; + } + + inode = ecryptfs_get_inode(d_inode(path.dentry), s); + rc = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_free; + + s->s_root = d_make_root(inode); + if (!s->s_root) { + rc = -ENOMEM; + goto out_free; + } + + rc = -ENOMEM; + root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + if (!root_info) + goto out_free; + + /* ->kill_sb() will take care of root_info */ + ecryptfs_set_dentry_private(s->s_root, root_info); + root_info->lower_path = path; + + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); + +out_free: + path_put(&path); +out1: + deactivate_locked_super(s); +out: + if (sbi) { + ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); + kmem_cache_free(ecryptfs_sb_info_cache, sbi); + } + printk(KERN_ERR "%s; rc = [%d]\n", err, rc); + return ERR_PTR(rc); +} + +/** + * ecryptfs_kill_block_super + * @sb: The ecryptfs super block + * + * Used to bring the superblock down and free the private data. + */ +static void ecryptfs_kill_block_super(struct super_block *sb) +{ + struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); + kill_anon_super(sb); + if (!sb_info) + return; + ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); + bdi_destroy(&sb_info->bdi); + kmem_cache_free(ecryptfs_sb_info_cache, sb_info); +} + +static struct file_system_type ecryptfs_fs_type = { + .owner = THIS_MODULE, + .name = "ecryptfs", + .mount = ecryptfs_mount, + .kill_sb = ecryptfs_kill_block_super, + .fs_flags = 0 +}; +MODULE_ALIAS_FS("ecryptfs"); + +/** + * inode_info_init_once + * + * Initializes the ecryptfs_inode_info_cache when it is created + */ +static void +inode_info_init_once(void *vptr) +{ + struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; + + inode_init_once(&ei->vfs_inode); +} + +static struct ecryptfs_cache_info { + struct kmem_cache **cache; + const char *name; + size_t size; + void (*ctor)(void *obj); +} ecryptfs_cache_infos[] = { + { + .cache = &ecryptfs_auth_tok_list_item_cache, + .name = "ecryptfs_auth_tok_list_item", + .size = sizeof(struct ecryptfs_auth_tok_list_item), + }, + { + .cache = &ecryptfs_file_info_cache, + .name = "ecryptfs_file_cache", + .size = sizeof(struct ecryptfs_file_info), + }, + { + .cache = &ecryptfs_dentry_info_cache, + .name = "ecryptfs_dentry_info_cache", + .size = sizeof(struct ecryptfs_dentry_info), + }, + { + .cache = &ecryptfs_inode_info_cache, + .name = "ecryptfs_inode_cache", + .size = sizeof(struct ecryptfs_inode_info), + .ctor = inode_info_init_once, + }, + { + .cache = &ecryptfs_sb_info_cache, + .name = "ecryptfs_sb_cache", + .size = sizeof(struct ecryptfs_sb_info), + }, + { + .cache = &ecryptfs_header_cache, + .name = "ecryptfs_headers", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_xattr_cache, + .name = "ecryptfs_xattr_cache", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_key_record_cache, + .name = "ecryptfs_key_record_cache", + .size = sizeof(struct ecryptfs_key_record), + }, + { + .cache = &ecryptfs_key_sig_cache, + .name = "ecryptfs_key_sig_cache", + .size = sizeof(struct ecryptfs_key_sig), + }, + { + .cache = &ecryptfs_global_auth_tok_cache, + .name = "ecryptfs_global_auth_tok_cache", + .size = sizeof(struct ecryptfs_global_auth_tok), + }, + { + .cache = &ecryptfs_key_tfm_cache, + .name = "ecryptfs_key_tfm_cache", + .size = sizeof(struct ecryptfs_key_tfm), + }, +}; + +static void ecryptfs_free_kmem_caches(void) +{ + int i; + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { + struct ecryptfs_cache_info *info; + + info = &ecryptfs_cache_infos[i]; + if (*(info->cache)) + kmem_cache_destroy(*(info->cache)); + } +} + +/** + * ecryptfs_init_kmem_caches + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_init_kmem_caches(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { + struct ecryptfs_cache_info *info; + + info = &ecryptfs_cache_infos[i]; + *(info->cache) = kmem_cache_create(info->name, info->size, + 0, SLAB_HWCACHE_ALIGN, info->ctor); + if (!*(info->cache)) { + ecryptfs_free_kmem_caches(); + ecryptfs_printk(KERN_WARNING, "%s: " + "kmem_cache_create failed\n", + info->name); + return -ENOMEM; + } + } + return 0; +} + +static struct kobject *ecryptfs_kobj; + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) +{ + return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); +} + +static struct kobj_attribute version_attr = __ATTR_RO(version); + +static struct attribute *attributes[] = { + &version_attr.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attributes, +}; + +static int do_sysfs_registration(void) +{ + int rc; + + ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj); + if (!ecryptfs_kobj) { + printk(KERN_ERR "Unable to create ecryptfs kset\n"); + rc = -ENOMEM; + goto out; + } + rc = sysfs_create_group(ecryptfs_kobj, &attr_group); + if (rc) { + printk(KERN_ERR + "Unable to create ecryptfs version attributes\n"); + kobject_put(ecryptfs_kobj); + } +out: + return rc; +} + +static void do_sysfs_unregistration(void) +{ + sysfs_remove_group(ecryptfs_kobj, &attr_group); + kobject_put(ecryptfs_kobj); +} + +static int __init ecryptfs_init(void) +{ + int rc; + + if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " + "larger than the host's page size, and so " + "eCryptfs cannot run on this system. The " + "default eCryptfs extent size is [%u] bytes; " + "the page size is [%lu] bytes.\n", + ECRYPTFS_DEFAULT_EXTENT_SIZE, + (unsigned long)PAGE_CACHE_SIZE); + goto out; + } + rc = ecryptfs_init_kmem_caches(); + if (rc) { + printk(KERN_ERR + "Failed to allocate one or more kmem_cache objects\n"); + goto out; + } + rc = do_sysfs_registration(); + if (rc) { + printk(KERN_ERR "sysfs registration failed\n"); + goto out_free_kmem_caches; + } + rc = ecryptfs_init_kthread(); + if (rc) { + printk(KERN_ERR "%s: kthread initialization failed; " + "rc = [%d]\n", __func__, rc); + goto out_do_sysfs_unregistration; + } + rc = ecryptfs_init_messaging(); + if (rc) { + printk(KERN_ERR "Failure occurred while attempting to " + "initialize the communications channel to " + "ecryptfsd\n"); + goto out_destroy_kthread; + } + rc = ecryptfs_init_crypto(); + if (rc) { + printk(KERN_ERR "Failure whilst attempting to init crypto; " + "rc = [%d]\n", rc); + goto out_release_messaging; + } + rc = register_filesystem(&ecryptfs_fs_type); + if (rc) { + printk(KERN_ERR "Failed to register filesystem\n"); + goto out_destroy_crypto; + } + if (ecryptfs_verbosity > 0) + printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values " + "will be written to the syslog!\n", ecryptfs_verbosity); + + goto out; +out_destroy_crypto: + ecryptfs_destroy_crypto(); +out_release_messaging: + ecryptfs_release_messaging(); +out_destroy_kthread: + ecryptfs_destroy_kthread(); +out_do_sysfs_unregistration: + do_sysfs_unregistration(); +out_free_kmem_caches: + ecryptfs_free_kmem_caches(); +out: + return rc; +} + +static void __exit ecryptfs_exit(void) +{ + int rc; + + rc = ecryptfs_destroy_crypto(); + if (rc) + printk(KERN_ERR "Failure whilst attempting to destroy crypto; " + "rc = [%d]\n", rc); + ecryptfs_release_messaging(); + ecryptfs_destroy_kthread(); + do_sysfs_unregistration(); + unregister_filesystem(&ecryptfs_fs_type); + ecryptfs_free_kmem_caches(); +} + +MODULE_AUTHOR("Michael A. Halcrow "); +MODULE_DESCRIPTION("eCryptfs"); + +MODULE_LICENSE("GPL"); + +module_init(ecryptfs_init) +module_exit(ecryptfs_exit) diff --git a/kernel/fs/ecryptfs/messaging.c b/kernel/fs/ecryptfs/messaging.c new file mode 100644 index 000000000..286f10b03 --- /dev/null +++ b/kernel/fs/ecryptfs/messaging.c @@ -0,0 +1,468 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2004-2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static LIST_HEAD(ecryptfs_msg_ctx_free_list); +static LIST_HEAD(ecryptfs_msg_ctx_alloc_list); +static struct mutex ecryptfs_msg_ctx_lists_mux; + +static struct hlist_head *ecryptfs_daemon_hash; +struct mutex ecryptfs_daemon_hash_mux; +static int ecryptfs_hash_bits; +#define ecryptfs_current_euid_hash(uid) \ + hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits) + +static u32 ecryptfs_msg_counter; +static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; + +/** + * ecryptfs_acquire_free_msg_ctx + * @msg_ctx: The context that was acquired from the free list + * + * Acquires a context element from the free list and locks the mutex + * on the context. Sets the msg_ctx task to current. Returns zero on + * success; non-zero on error or upon failure to acquire a free + * context element. Must be called with ecryptfs_msg_ctx_lists_mux + * held. + */ +static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx) +{ + struct list_head *p; + int rc; + + if (list_empty(&ecryptfs_msg_ctx_free_list)) { + printk(KERN_WARNING "%s: The eCryptfs free " + "context list is empty. It may be helpful to " + "specify the ecryptfs_message_buf_len " + "parameter to be greater than the current " + "value of [%d]\n", __func__, ecryptfs_message_buf_len); + rc = -ENOMEM; + goto out; + } + list_for_each(p, &ecryptfs_msg_ctx_free_list) { + *msg_ctx = list_entry(p, struct ecryptfs_msg_ctx, node); + if (mutex_trylock(&(*msg_ctx)->mux)) { + (*msg_ctx)->task = current; + rc = 0; + goto out; + } + } + rc = -ENOMEM; +out: + return rc; +} + +/** + * ecryptfs_msg_ctx_free_to_alloc + * @msg_ctx: The context to move from the free list to the alloc list + * + * Must be called with ecryptfs_msg_ctx_lists_mux held. + */ +static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) +{ + list_move(&msg_ctx->node, &ecryptfs_msg_ctx_alloc_list); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_PENDING; + msg_ctx->counter = ++ecryptfs_msg_counter; +} + +/** + * ecryptfs_msg_ctx_alloc_to_free + * @msg_ctx: The context to move from the alloc list to the free list + * + * Must be called with ecryptfs_msg_ctx_lists_mux held. + */ +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) +{ + list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); + kfree(msg_ctx->msg); + msg_ctx->msg = NULL; + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; +} + +/** + * ecryptfs_find_daemon_by_euid + * @daemon: If return value is zero, points to the desired daemon pointer + * + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Search the hash list for the current effective user id. + * + * Returns zero if the user id exists in the list; non-zero otherwise. + */ +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon) +{ + int rc; + + hlist_for_each_entry(*daemon, + &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()], + euid_chain) { + if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) { + rc = 0; + goto out; + } + } + rc = -EINVAL; +out: + return rc; +} + +/** + * ecryptfs_spawn_daemon - Create and initialize a new daemon struct + * @daemon: Pointer to set to newly allocated daemon struct + * @file: File used when opening /dev/ecryptfs + * + * Must be called ceremoniously while in possession of + * ecryptfs_sacred_daemon_hash_mux + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file) +{ + int rc = 0; + + (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); + if (!(*daemon)) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " + "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); + goto out; + } + (*daemon)->file = file; + mutex_init(&(*daemon)->mux); + INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue); + init_waitqueue_head(&(*daemon)->wait); + (*daemon)->num_queued_msg_ctx = 0; + hlist_add_head(&(*daemon)->euid_chain, + &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()]); +out: + return rc; +} + +/** + * ecryptfs_exorcise_daemon - Destroy the daemon struct + * + * Must be called ceremoniously while in possession of + * ecryptfs_daemon_hash_mux and the daemon's own mux. + */ +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) +{ + struct ecryptfs_msg_ctx *msg_ctx, *msg_ctx_tmp; + int rc = 0; + + mutex_lock(&daemon->mux); + if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ) + || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) { + rc = -EBUSY; + mutex_unlock(&daemon->mux); + goto out; + } + list_for_each_entry_safe(msg_ctx, msg_ctx_tmp, + &daemon->msg_ctx_out_queue, daemon_out_list) { + list_del(&msg_ctx->daemon_out_list); + daemon->num_queued_msg_ctx--; + printk(KERN_WARNING "%s: Warning: dropping message that is in " + "the out queue of a dying daemon\n", __func__); + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + } + hlist_del(&daemon->euid_chain); + mutex_unlock(&daemon->mux); + kzfree(daemon); +out: + return rc; +} + +/** + * ecryptfs_process_reponse + * @msg: The ecryptfs message received; the caller should sanity check + * msg->data_len and free the memory + * @seq: The sequence number of the message; must match the sequence + * number for the existing message context waiting for this + * response + * + * Processes a response message after sending an operation request to + * userspace. Some other process is awaiting this response. Before + * sending out its first communications, the other process allocated a + * msg_ctx from the ecryptfs_msg_ctx_arr at a particular index. The + * response message contains this index so that we can copy over the + * response message into the msg_ctx that the process holds a + * reference to. The other process is going to wake up, check to see + * that msg_ctx->state == ECRYPTFS_MSG_CTX_STATE_DONE, and then + * proceed to read off and process the response message. Returns zero + * upon delivery to desired context element; non-zero upon delivery + * failure or error. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_process_response(struct ecryptfs_daemon *daemon, + struct ecryptfs_message *msg, u32 seq) +{ + struct ecryptfs_msg_ctx *msg_ctx; + size_t msg_size; + int rc; + + if (msg->index >= ecryptfs_message_buf_len) { + rc = -EINVAL; + printk(KERN_ERR "%s: Attempt to reference " + "context buffer at index [%d]; maximum " + "allowable is [%d]\n", __func__, msg->index, + (ecryptfs_message_buf_len - 1)); + goto out; + } + msg_ctx = &ecryptfs_msg_ctx_arr[msg->index]; + mutex_lock(&msg_ctx->mux); + if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { + rc = -EINVAL; + printk(KERN_WARNING "%s: Desired context element is not " + "pending a response\n", __func__); + goto unlock; + } else if (msg_ctx->counter != seq) { + rc = -EINVAL; + printk(KERN_WARNING "%s: Invalid message sequence; " + "expected [%d]; received [%d]\n", __func__, + msg_ctx->counter, seq); + goto unlock; + } + msg_size = (sizeof(*msg) + msg->data_len); + msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " + "GFP_KERNEL memory\n", __func__, msg_size); + goto unlock; + } + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; + wake_up_process(msg_ctx->task); + rc = 0; +unlock: + mutex_unlock(&msg_ctx->mux); +out: + return rc; +} + +/** + * ecryptfs_send_message_locked + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + * + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx) +{ + struct ecryptfs_daemon *daemon; + int rc; + + rc = ecryptfs_find_daemon_by_euid(&daemon); + if (rc) { + rc = -ENOTCONN; + goto out; + } + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_acquire_free_msg_ctx(msg_ctx); + if (rc) { + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + printk(KERN_WARNING "%s: Could not claim a free " + "context element\n", __func__); + goto out; + } + ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); + mutex_unlock(&(*msg_ctx)->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0, + daemon); + if (rc) + printk(KERN_ERR "%s: Error attempting to send message to " + "userspace daemon; rc = [%d]\n", __func__, rc); +out: + return rc; +} + +/** + * ecryptfs_send_message + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + * + * Grabs ecryptfs_daemon_hash_mux. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST, + msg_ctx); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_wait_for_response + * @msg_ctx: The context that was assigned when sending a message + * @msg: The incoming message from userspace; not set if rc != 0 + * + * Sleeps until awaken by ecryptfs_receive_message or until the amount + * of time exceeds ecryptfs_message_wait_timeout. If zero is + * returned, msg will point to a valid message from userspace; a + * non-zero value is returned upon failure to receive a message or an + * error occurs. Callee must free @msg on success. + */ +int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **msg) +{ + signed long timeout = ecryptfs_message_wait_timeout * HZ; + int rc = 0; + +sleep: + timeout = schedule_timeout_interruptible(timeout); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_DONE) { + if (timeout) { + mutex_unlock(&msg_ctx->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + goto sleep; + } + rc = -ENOMSG; + } else { + *msg = msg_ctx->msg; + msg_ctx->msg = NULL; + } + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + mutex_unlock(&msg_ctx->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + return rc; +} + +int __init ecryptfs_init_messaging(void) +{ + int i; + int rc = 0; + + if (ecryptfs_number_of_users > ECRYPTFS_MAX_NUM_USERS) { + ecryptfs_number_of_users = ECRYPTFS_MAX_NUM_USERS; + printk(KERN_WARNING "%s: Specified number of users is " + "too large, defaulting to [%d] users\n", __func__, + ecryptfs_number_of_users); + } + mutex_init(&ecryptfs_daemon_hash_mux); + mutex_lock(&ecryptfs_daemon_hash_mux); + ecryptfs_hash_bits = 1; + while (ecryptfs_number_of_users >> ecryptfs_hash_bits) + ecryptfs_hash_bits++; + ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) + * (1 << ecryptfs_hash_bits)), + GFP_KERNEL); + if (!ecryptfs_daemon_hash) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); + mutex_unlock(&ecryptfs_daemon_hash_mux); + goto out; + } + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) + INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); + mutex_unlock(&ecryptfs_daemon_hash_mux); + ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) + * ecryptfs_message_buf_len), + GFP_KERNEL); + if (!ecryptfs_msg_ctx_arr) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); + goto out; + } + mutex_init(&ecryptfs_msg_ctx_lists_mux); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + ecryptfs_msg_counter = 0; + for (i = 0; i < ecryptfs_message_buf_len; i++) { + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].node); + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].daemon_out_list); + mutex_init(&ecryptfs_msg_ctx_arr[i].mux); + mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); + ecryptfs_msg_ctx_arr[i].index = i; + ecryptfs_msg_ctx_arr[i].state = ECRYPTFS_MSG_CTX_STATE_FREE; + ecryptfs_msg_ctx_arr[i].counter = 0; + ecryptfs_msg_ctx_arr[i].task = NULL; + ecryptfs_msg_ctx_arr[i].msg = NULL; + list_add_tail(&ecryptfs_msg_ctx_arr[i].node, + &ecryptfs_msg_ctx_free_list); + mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); + } + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_init_ecryptfs_miscdev(); + if (rc) + ecryptfs_release_messaging(); +out: + return rc; +} + +void ecryptfs_release_messaging(void) +{ + if (ecryptfs_msg_ctx_arr) { + int i; + + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + for (i = 0; i < ecryptfs_message_buf_len; i++) { + mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); + kfree(ecryptfs_msg_ctx_arr[i].msg); + mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); + } + kfree(ecryptfs_msg_ctx_arr); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + } + if (ecryptfs_daemon_hash) { + struct ecryptfs_daemon *daemon; + int i; + + mutex_lock(&ecryptfs_daemon_hash_mux); + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { + int rc; + + hlist_for_each_entry(daemon, + &ecryptfs_daemon_hash[i], + euid_chain) { + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) + printk(KERN_ERR "%s: Error whilst " + "attempting to destroy daemon; " + "rc = [%d]. Dazed and confused, " + "but trying to continue.\n", + __func__, rc); + } + } + kfree(ecryptfs_daemon_hash); + mutex_unlock(&ecryptfs_daemon_hash_mux); + } + ecryptfs_destroy_ecryptfs_miscdev(); + return; +} diff --git a/kernel/fs/ecryptfs/miscdev.c b/kernel/fs/ecryptfs/miscdev.c new file mode 100644 index 000000000..e4141f257 --- /dev/null +++ b/kernel/fs/ecryptfs/miscdev.c @@ -0,0 +1,511 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static atomic_t ecryptfs_num_miscdev_opens; + +/** + * ecryptfs_miscdev_poll + * @file: dev file + * @pt: dev poll table (ignored) + * + * Returns the poll mask + */ +static unsigned int +ecryptfs_miscdev_poll(struct file *file, poll_table *pt) +{ + struct ecryptfs_daemon *daemon = file->private_data; + unsigned int mask = 0; + + mutex_lock(&daemon->mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + printk(KERN_WARNING "%s: Attempt to poll on zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) + goto out_unlock_daemon; + if (daemon->flags & ECRYPTFS_DAEMON_IN_POLL) + goto out_unlock_daemon; + daemon->flags |= ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + poll_wait(file, &daemon->wait, pt); + mutex_lock(&daemon->mux); + if (!list_empty(&daemon->msg_ctx_out_queue)) + mask |= POLLIN | POLLRDNORM; +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + return mask; +} + +/** + * ecryptfs_miscdev_open + * @inode: inode of miscdev handle (ignored) + * @file: file for miscdev handle + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_open(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon); + if (!rc) { + rc = -EINVAL; + goto out_unlock_daemon_list; + } + rc = ecryptfs_spawn_daemon(&daemon, file); + if (rc) { + printk(KERN_ERR "%s: Error attempting to spawn daemon; " + "rc = [%d]\n", __func__, rc); + goto out_unlock_daemon_list; + } + mutex_lock(&daemon->mux); + if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { + rc = -EBUSY; + goto out_unlock_daemon; + } + daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; + file->private_data = daemon; + atomic_inc(&ecryptfs_num_miscdev_opens); +out_unlock_daemon: + mutex_unlock(&daemon->mux); +out_unlock_daemon_list: + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_miscdev_release + * @inode: inode of fs/ecryptfs/euid handle (ignored) + * @file: file for fs/ecryptfs/euid handle + * + * This keeps the daemon registered until the daemon sends another + * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters. + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_release(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = file->private_data; + int rc; + + mutex_lock(&daemon->mux); + BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); + daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_dec(&ecryptfs_num_miscdev_opens); + mutex_unlock(&daemon->mux); + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_exorcise_daemon(daemon); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (rc) { + printk(KERN_CRIT "%s: Fatal error whilst attempting to " + "shut down daemon; rc = [%d]. Please report this " + "bug.\n", __func__, rc); + BUG(); + } + return rc; +} + +/** + * ecryptfs_send_miscdev + * @data: Data to send to daemon; may be NULL + * @data_size: Amount of data to send to daemon + * @msg_ctx: Message context, which is used to handle the reply. If + * this is NULL, then we do not expect a reply. + * @msg_type: Type of message + * @msg_flags: Flags for message + * @daemon: eCryptfs daemon object + * + * Add msg_ctx to queue and then, if it exists, notify the blocked + * miscdevess about the data being available. Must be called with + * ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon) +{ + struct ecryptfs_message *msg; + + msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL); + if (!msg) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, + (sizeof(*msg) + data_size)); + return -ENOMEM; + } + + mutex_lock(&msg_ctx->mux); + msg_ctx->msg = msg; + msg_ctx->msg->index = msg_ctx->index; + msg_ctx->msg->data_len = data_size; + msg_ctx->type = msg_type; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); + list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); + mutex_unlock(&msg_ctx->mux); + + mutex_lock(&daemon->mux); + daemon->num_queued_msg_ctx++; + wake_up_interruptible(&daemon->wait); + mutex_unlock(&daemon->mux); + + return 0; +} + +/* + * miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Octets 5-N1 not written if the packet type does not include a message + */ +#define PKT_TYPE_SIZE 1 +#define PKT_CTR_SIZE 4 +#define MIN_NON_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE) +#define MIN_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \ + + ECRYPTFS_MIN_PKT_LEN_SIZE) +/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */ +#define MAX_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \ + + ECRYPTFS_MAX_PKT_LEN_SIZE \ + + sizeof(struct ecryptfs_message) \ + + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) +#define PKT_TYPE_OFFSET 0 +#define PKT_CTR_OFFSET PKT_TYPE_SIZE +#define PKT_LEN_OFFSET (PKT_TYPE_SIZE + PKT_CTR_SIZE) + +/** + * ecryptfs_miscdev_read - format and send message from queue + * @file: miscdevfs handle + * @buf: User buffer into which to copy the next message on the daemon queue + * @count: Amount of space available in @buf + * @ppos: Offset in file (ignored) + * + * Pulls the most recent message from the daemon queue, formats it for + * being sent via a miscdevfs handle, and copies it into @buf + * + * Returns the number of bytes copied into the user buffer + */ +static ssize_t +ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct ecryptfs_daemon *daemon = file->private_data; + struct ecryptfs_msg_ctx *msg_ctx; + size_t packet_length_size; + char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE]; + size_t i; + size_t total_length; + int rc; + + mutex_lock(&daemon->mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + printk(KERN_WARNING "%s: Attempt to read from zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) { + rc = 0; + goto out_unlock_daemon; + } + /* This daemon will not go away so long as this flag is set */ + daemon->flags |= ECRYPTFS_DAEMON_IN_READ; +check_list: + if (list_empty(&daemon->msg_ctx_out_queue)) { + mutex_unlock(&daemon->mux); + rc = wait_event_interruptible( + daemon->wait, !list_empty(&daemon->msg_ctx_out_queue)); + mutex_lock(&daemon->mux); + if (rc < 0) { + rc = 0; + goto out_unlock_daemon; + } + } + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + goto out_unlock_daemon; + } + if (list_empty(&daemon->msg_ctx_out_queue)) { + /* Something else jumped in since the + * wait_event_interruptable() and removed the + * message from the queue; try again */ + goto check_list; + } + msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, + struct ecryptfs_msg_ctx, daemon_out_list); + BUG_ON(!msg_ctx); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->msg) { + rc = ecryptfs_write_packet_length(packet_length, + msg_ctx->msg_size, + &packet_length_size); + if (rc) { + rc = 0; + printk(KERN_WARNING "%s: Error writing packet length; " + "rc = [%d]\n", __func__, rc); + goto out_unlock_msg_ctx; + } + } else { + packet_length_size = 0; + msg_ctx->msg_size = 0; + } + total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size + + msg_ctx->msg_size); + if (count < total_length) { + rc = 0; + printk(KERN_WARNING "%s: Only given user buffer of " + "size [%zd], but we need [%zd] to read the " + "pending message\n", __func__, count, total_length); + goto out_unlock_msg_ctx; + } + rc = -EFAULT; + if (put_user(msg_ctx->type, buf)) + goto out_unlock_msg_ctx; + if (put_user(cpu_to_be32(msg_ctx->counter), + (__be32 __user *)(&buf[PKT_CTR_OFFSET]))) + goto out_unlock_msg_ctx; + i = PKT_TYPE_SIZE + PKT_CTR_SIZE; + if (msg_ctx->msg) { + if (copy_to_user(&buf[i], packet_length, packet_length_size)) + goto out_unlock_msg_ctx; + i += packet_length_size; + if (copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size)) + goto out_unlock_msg_ctx; + i += msg_ctx->msg_size; + } + rc = i; + list_del(&msg_ctx->daemon_out_list); + kfree(msg_ctx->msg); + msg_ctx->msg = NULL; + /* We do not expect a reply from the userspace daemon for any + * message type other than ECRYPTFS_MSG_REQUEST */ + if (msg_ctx->type != ECRYPTFS_MSG_REQUEST) + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); +out_unlock_msg_ctx: + mutex_unlock(&msg_ctx->mux); +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&daemon->mux); + return rc; +} + +/** + * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon + * @data: Bytes comprising struct ecryptfs_message + * @data_size: sizeof(struct ecryptfs_message) + data len + * @seq: Sequence number for miscdev response packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_response(struct ecryptfs_daemon *daemon, char *data, + size_t data_size, u32 seq) +{ + struct ecryptfs_message *msg = (struct ecryptfs_message *)data; + int rc; + + if ((sizeof(*msg) + msg->data_len) != data_size) { + printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " + "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__, + (sizeof(*msg) + msg->data_len), data_size); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_process_response(daemon, msg, seq); + if (rc) + printk(KERN_ERR + "Error processing response message; rc = [%d]\n", rc); +out: + return rc; +} + +/** + * ecryptfs_miscdev_write - handle write to daemon miscdev handle + * @file: File for misc dev handle + * @buf: Buffer containing user data + * @count: Amount of data in @buf + * @ppos: Pointer to offset in file (ignored) + * + * Returns the number of bytes read from @buf + */ +static ssize_t +ecryptfs_miscdev_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + __be32 counter_nbo; + u32 seq; + size_t packet_size, packet_size_length; + char *data; + unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE]; + ssize_t rc; + + if (count == 0) { + return 0; + } else if (count == MIN_NON_MSG_PKT_SIZE) { + /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */ + goto memdup; + } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) { + printk(KERN_WARNING "%s: Acceptable packet size range is " + "[%d-%zu], but amount of data written is [%zu].", + __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count); + return -EINVAL; + } + + if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET], + sizeof(packet_size_peek))) { + printk(KERN_WARNING "%s: Error while inspecting packet size\n", + __func__); + return -EFAULT; + } + + rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size, + &packet_size_length); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%zd]\n", __func__, rc); + return rc; + } + + if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size) + != count) { + printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__, + packet_size); + return -EINVAL; + } + +memdup: + data = memdup_user(buf, count); + if (IS_ERR(data)) { + printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", + __func__, PTR_ERR(data)); + return PTR_ERR(data); + } + switch (data[PKT_TYPE_OFFSET]) { + case ECRYPTFS_MSG_RESPONSE: + if (count < (MIN_MSG_PKT_SIZE + + sizeof(struct ecryptfs_message))) { + printk(KERN_WARNING "%s: Minimum acceptable packet " + "size is [%zd], but amount of data written is " + "only [%zd]. Discarding response packet.\n", + __func__, + (MIN_MSG_PKT_SIZE + + sizeof(struct ecryptfs_message)), count); + rc = -EINVAL; + goto out_free; + } + memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE); + seq = be32_to_cpu(counter_nbo); + rc = ecryptfs_miscdev_response(file->private_data, + &data[PKT_LEN_OFFSET + packet_size_length], + packet_size, seq); + if (rc) { + printk(KERN_WARNING "%s: Failed to deliver miscdev " + "response to requesting operation; rc = [%zd]\n", + __func__, rc); + goto out_free; + } + break; + case ECRYPTFS_MSG_HELO: + case ECRYPTFS_MSG_QUIT: + break; + default: + ecryptfs_printk(KERN_WARNING, "Dropping miscdev " + "message of unrecognized type [%d]\n", + data[0]); + rc = -EINVAL; + goto out_free; + } + rc = count; +out_free: + kfree(data); + return rc; +} + + +static const struct file_operations ecryptfs_miscdev_fops = { + .owner = THIS_MODULE, + .open = ecryptfs_miscdev_open, + .poll = ecryptfs_miscdev_poll, + .read = ecryptfs_miscdev_read, + .write = ecryptfs_miscdev_write, + .release = ecryptfs_miscdev_release, + .llseek = noop_llseek, +}; + +static struct miscdevice ecryptfs_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ecryptfs", + .fops = &ecryptfs_miscdev_fops +}; + +/** + * ecryptfs_init_ecryptfs_miscdev + * + * Messages sent to the userspace daemon from the kernel are placed on + * a queue associated with the daemon. The next read against the + * miscdev handle by that daemon will return the oldest message placed + * on the message queue for the daemon. + * + * Returns zero on success; non-zero otherwise + */ +int __init ecryptfs_init_ecryptfs_miscdev(void) +{ + int rc; + + atomic_set(&ecryptfs_num_miscdev_opens, 0); + rc = misc_register(&ecryptfs_miscdev); + if (rc) + printk(KERN_ERR "%s: Failed to register miscellaneous device " + "for communications with userspace daemons; rc = [%d]\n", + __func__, rc); + return rc; +} + +/** + * ecryptfs_destroy_ecryptfs_miscdev + * + * All of the daemons must be exorcised prior to calling this + * function. + */ +void ecryptfs_destroy_ecryptfs_miscdev(void) +{ + BUG_ON(atomic_read(&ecryptfs_num_miscdev_opens) != 0); + misc_deregister(&ecryptfs_miscdev); +} diff --git a/kernel/fs/ecryptfs/mmap.c b/kernel/fs/ecryptfs/mmap.c new file mode 100644 index 000000000..cf2085229 --- /dev/null +++ b/kernel/fs/ecryptfs/mmap.c @@ -0,0 +1,561 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * This is where eCryptfs coordinates the symmetric encryption and + * decryption of the file data as it passes between the lower + * encrypted file and the upper decrypted file. + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_get_locked_page + * + * Get one page from cache or lower f/s, return error otherwise. + * + * Returns locked and up-to-date page (if ok), with increased + * refcnt. + */ +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) +{ + struct page *page = read_mapping_page(inode->i_mapping, index, NULL); + if (!IS_ERR(page)) + lock_page(page); + return page; +} + +/** + * ecryptfs_writepage + * @page: Page that is locked before this call is made + * + * Returns zero on success; non-zero otherwise + * + * This is where we encrypt the data and pass the encrypted data to + * the lower filesystem. In OpenPGP-compatible mode, we operate on + * entire underlying packets. + */ +static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int rc; + + rc = ecryptfs_encrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error encrypting " + "page (upper index [0x%.16lx])\n", page->index); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); +out: + unlock_page(page); + return rc; +} + +static void strip_xattr_flag(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + size_t written; + + crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR; + ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat, + &written); + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } +} + +/** + * Header Extent: + * Octets 0-7: Unencrypted file size (big-endian) + * Octets 8-15: eCryptfs special marker + * Octets 16-19: Flags + * Octet 16: File format version number (between 0 and 255) + * Octets 17-18: Reserved + * Octet 19: Bit 1 (lsb): Reserved + * Bit 2: Encrypted? + * Bits 3-8: Reserved + * Octets 20-23: Header extent size (big-endian) + * Octets 24-25: Number of header extents at front of file + * (big-endian) + * Octet 26: Begin RFC 2440 authentication token packet set + */ + +/** + * ecryptfs_copy_up_encrypted_with_header + * @page: Sort of a ``virtual'' representation of the encrypted lower + * file. The actual lower file does not have the metadata in + * the header. This is locked. + * @crypt_stat: The eCryptfs inode's cryptographic context + * + * The ``view'' is the version of the file that userspace winds up + * seeing, with the header information inserted. + */ +static int +ecryptfs_copy_up_encrypted_with_header(struct page *page, + struct ecryptfs_crypt_stat *crypt_stat) +{ + loff_t extent_num_in_page = 0; + loff_t num_extents_per_page = (PAGE_CACHE_SIZE + / crypt_stat->extent_size); + int rc = 0; + + while (extent_num_in_page < num_extents_per_page) { + loff_t view_extent_num = ((((loff_t)page->index) + * num_extents_per_page) + + extent_num_in_page); + size_t num_header_extents_at_front = + (crypt_stat->metadata_size / crypt_stat->extent_size); + + if (view_extent_num < num_header_extents_at_front) { + /* This is a header extent */ + char *page_virt; + + page_virt = kmap_atomic(page); + memset(page_virt, 0, PAGE_CACHE_SIZE); + /* TODO: Support more than one header extent */ + if (view_extent_num == 0) { + size_t written; + + rc = ecryptfs_read_xattr_region( + page_virt, page->mapping->host); + strip_xattr_flag(page_virt + 16, crypt_stat); + ecryptfs_write_header_metadata(page_virt + 20, + crypt_stat, + &written); + } + kunmap_atomic(page_virt); + flush_dcache_page(page); + if (rc) { + printk(KERN_ERR "%s: Error reading xattr " + "region; rc = [%d]\n", __func__, rc); + goto out; + } + } else { + /* This is an encrypted data extent */ + loff_t lower_offset = + ((view_extent_num * crypt_stat->extent_size) + - crypt_stat->metadata_size); + + rc = ecryptfs_read_lower_page_segment( + page, (lower_offset >> PAGE_CACHE_SHIFT), + (lower_offset & ~PAGE_CACHE_MASK), + crypt_stat->extent_size, page->mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error attempting to read " + "extent at offset [%lld] in the lower " + "file; rc = [%d]\n", __func__, + lower_offset, rc); + goto out; + } + } + extent_num_in_page++; + } +out: + return rc; +} + +/** + * ecryptfs_readpage + * @file: An eCryptfs file + * @page: Page from eCryptfs inode mapping into which to stick the read data + * + * Read in a page, decrypting if necessary. + * + * Returns zero on success; non-zero on error. + */ +static int ecryptfs_readpage(struct file *file, struct page *page) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; + int rc = 0; + + if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_read_lower_page_segment(page, page->index, 0, + PAGE_CACHE_SIZE, + page->mapping->host); + } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + rc = ecryptfs_copy_up_encrypted_with_header(page, + crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting to copy " + "the encrypted content from the lower " + "file whilst inserting the metadata " + "from the xattr into the header; rc = " + "[%d]\n", __func__, rc); + goto out; + } + + } else { + rc = ecryptfs_read_lower_page_segment( + page, page->index, 0, PAGE_CACHE_SIZE, + page->mapping->host); + if (rc) { + printk(KERN_ERR "Error reading page; rc = " + "[%d]\n", rc); + goto out; + } + } + } else { + rc = ecryptfs_decrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error decrypting page; " + "rc = [%d]\n", rc); + goto out; + } + } +out: + if (rc) + ClearPageUptodate(page); + else + SetPageUptodate(page); + ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n", + page->index); + unlock_page(page); + return rc; +} + +/** + * Called with lower inode mutex held. + */ +static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) +{ + struct inode *inode = page->mapping->host; + int end_byte_in_page; + + if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index) + goto out; + end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; + if (to > end_byte_in_page) + end_byte_in_page = to; + zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE); +out: + return 0; +} + +/** + * ecryptfs_write_begin + * @file: The eCryptfs file + * @mapping: The eCryptfs object + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused) + * + * This function must zero any hole we create + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + loff_t prev_page_end_size; + int rc = 0; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); + if (!PageUptodate(page)) { + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(mapping->host)->crypt_stat; + + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_read_lower_page_segment( + page, index, 0, PAGE_CACHE_SIZE, mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error attemping to read " + "lower page segment; rc = [%d]\n", + __func__, rc); + ClearPageUptodate(page); + goto out; + } else + SetPageUptodate(page); + } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + rc = ecryptfs_copy_up_encrypted_with_header( + page, crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting " + "to copy the encrypted content " + "from the lower file whilst " + "inserting the metadata from " + "the xattr into the header; rc " + "= [%d]\n", __func__, rc); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); + } else { + rc = ecryptfs_read_lower_page_segment( + page, index, 0, PAGE_CACHE_SIZE, + mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error reading " + "page; rc = [%d]\n", + __func__, rc); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); + } + } else { + if (prev_page_end_size + >= i_size_read(page->mapping->host)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (len < PAGE_CACHE_SIZE) { + rc = ecryptfs_decrypt_page(page); + if (rc) { + printk(KERN_ERR "%s: Error decrypting " + "page at index [%ld]; " + "rc = [%d]\n", + __func__, page->index, rc); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); + } + } + } + /* If creating a page or more of holes, zero them out via truncate. + * Note, this will increase i_size. */ + if (index != 0) { + if (prev_page_end_size > i_size_read(page->mapping->host)) { + rc = ecryptfs_truncate(file->f_path.dentry, + prev_page_end_size); + if (rc) { + printk(KERN_ERR "%s: Error on attempt to " + "truncate to (higher) offset [%lld];" + " rc = [%d]\n", __func__, + prev_page_end_size, rc); + goto out; + } + } + } + /* Writing to a new page, and creating a small hole from start + * of page? Zero it out. */ + if ((i_size_read(mapping->host) == prev_page_end_size) + && (pos != 0)) + zero_user(page, 0, PAGE_CACHE_SIZE); +out: + if (unlikely(rc)) { + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + } + return rc; +} + +/** + * ecryptfs_write_inode_size_to_header + * + * Writes the lower file size to the first 8 bytes of the header. + * + * Returns zero on success; non-zero on error. + */ +static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) +{ + char *file_size_virt; + int rc; + + file_size_virt = kmalloc(sizeof(u64), GFP_KERNEL); + if (!file_size_virt) { + rc = -ENOMEM; + goto out; + } + put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt); + rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, + sizeof(u64)); + kfree(file_size_virt); + if (rc < 0) + printk(KERN_ERR "%s: Error writing file size to header; " + "rc = [%d]\n", __func__, rc); + else + rc = 0; +out: + return rc; +} + +struct kmem_cache *ecryptfs_xattr_cache; + +static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) +{ + ssize_t size; + void *xattr_virt; + struct dentry *lower_dentry = + ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry; + struct inode *lower_inode = d_inode(lower_dentry); + int rc; + + if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { + printk(KERN_WARNING + "No support for setting xattr in lower filesystem\n"); + rc = -ENOSYS; + goto out; + } + xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL); + if (!xattr_virt) { + printk(KERN_ERR "Out of memory whilst attempting to write " + "inode size to xattr\n"); + rc = -ENOMEM; + goto out; + } + mutex_lock(&lower_inode->i_mutex); + size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + xattr_virt, PAGE_CACHE_SIZE); + if (size < 0) + size = 8; + put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); + rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + xattr_virt, size, 0); + mutex_unlock(&lower_inode->i_mutex); + if (rc) + printk(KERN_ERR "Error whilst attempting to write inode size " + "to lower file xattr; rc = [%d]\n", rc); + kmem_cache_free(ecryptfs_xattr_cache, xattr_virt); +out: + return rc; +} + +int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) +{ + struct ecryptfs_crypt_stat *crypt_stat; + + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); + else + return ecryptfs_write_inode_size_to_header(ecryptfs_inode); +} + +/** + * ecryptfs_write_end + * @file: The eCryptfs file object + * @mapping: The eCryptfs object + * @pos: The file position + * @len: The length of the data (unused) + * @copied: The amount of data copied + * @page: The eCryptfs page + * @fsdata: The fsdata (unused) + */ +static int ecryptfs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + copied; + struct inode *ecryptfs_inode = mapping->host; + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + int rc; + + ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" + "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, + to); + if (!rc) { + rc = copied; + fsstack_copy_inode_size(ecryptfs_inode, + ecryptfs_inode_to_lower(ecryptfs_inode)); + } + goto out; + } + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + rc = 0; + goto out; + } + SetPageUptodate(page); + } + /* Fills in zeros if 'to' goes beyond inode size */ + rc = fill_zeros_to_end_of_page(page, to); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to fill " + "zeros in page with index = [0x%.16lx]\n", index); + goto out; + } + rc = ecryptfs_encrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " + "index [0x%.16lx])\n", index); + goto out; + } + if (pos + copied > i_size_read(ecryptfs_inode)) { + i_size_write(ecryptfs_inode, pos + copied); + ecryptfs_printk(KERN_DEBUG, "Expanded file size to " + "[0x%.16llx]\n", + (unsigned long long)i_size_read(ecryptfs_inode)); + } + rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); + if (rc) + printk(KERN_ERR "Error writing inode size to metadata; " + "rc = [%d]\n", rc); + else + rc = copied; +out: + unlock_page(page); + page_cache_release(page); + return rc; +} + +static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) +{ + int rc = 0; + struct inode *inode; + struct inode *lower_inode; + + inode = (struct inode *)mapping->host; + lower_inode = ecryptfs_inode_to_lower(inode); + if (lower_inode->i_mapping->a_ops->bmap) + rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping, + block); + return rc; +} + +const struct address_space_operations ecryptfs_aops = { + .writepage = ecryptfs_writepage, + .readpage = ecryptfs_readpage, + .write_begin = ecryptfs_write_begin, + .write_end = ecryptfs_write_end, + .bmap = ecryptfs_bmap, +}; diff --git a/kernel/fs/ecryptfs/read_write.c b/kernel/fs/ecryptfs/read_write.c new file mode 100644 index 000000000..09fe62227 --- /dev/null +++ b/kernel/fs/ecryptfs/read_write.c @@ -0,0 +1,273 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_write_lower + * @ecryptfs_inode: The eCryptfs inode + * @data: Data to write + * @offset: Byte offset in the lower file to which to write the data + * @size: Number of bytes from @data to write at @offset in the lower + * file + * + * Write data to the lower file. + * + * Returns bytes written on success; less than zero on error + */ +int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, + loff_t offset, size_t size) +{ + struct file *lower_file; + ssize_t rc; + + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; + rc = kernel_write(lower_file, data, size, offset); + mark_inode_dirty_sync(ecryptfs_inode); + return rc; +} + +/** + * ecryptfs_write_lower_page_segment + * @ecryptfs_inode: The eCryptfs inode + * @page_for_lower: The page containing the data to be written to the + * lower file + * @offset_in_page: The offset in the @page_for_lower from which to + * start writing the data + * @size: The amount of data from @page_for_lower to write to the + * lower file + * + * Determines the byte offset in the file for the given page and + * offset within the page, maps the page, and makes the call to write + * the contents of @page_for_lower to the lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, + struct page *page_for_lower, + size_t offset_in_page, size_t size) +{ + char *virt; + loff_t offset; + int rc; + + offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT) + + offset_in_page); + virt = kmap(page_for_lower); + rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); + if (rc > 0) + rc = 0; + kunmap(page_for_lower); + return rc; +} + +/** + * ecryptfs_write + * @ecryptfs_inode: The eCryptfs file into which to write + * @data: Virtual address where data to write is located + * @offset: Offset in the eCryptfs file at which to begin writing the + * data from @data + * @size: The number of bytes to write from @data + * + * Write an arbitrary amount of data to an arbitrary location in the + * eCryptfs inode page cache. This is done on a page-by-page, and then + * by an extent-by-extent, basis; individual extents are encrypted and + * written to the lower page cache (via VFS writes). This function + * takes care of all the address translation to locations in the lower + * filesystem; it also handles truncate events, writing out zeros + * where necessary. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, + size_t size) +{ + struct page *ecryptfs_page; + struct ecryptfs_crypt_stat *crypt_stat; + char *ecryptfs_page_virt; + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); + loff_t data_offset = 0; + loff_t pos; + int rc = 0; + + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + /* + * if we are writing beyond current size, then start pos + * at the current size - we'll fill in zeros from there. + */ + if (offset > ecryptfs_file_size) + pos = ecryptfs_file_size; + else + pos = offset; + while (pos < (offset + size)) { + pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); + size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); + size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); + loff_t total_remaining_bytes = ((offset + size) - pos); + + if (fatal_signal_pending(current)) { + rc = -EINTR; + break; + } + + if (num_bytes > total_remaining_bytes) + num_bytes = total_remaining_bytes; + if (pos < offset) { + /* remaining zeros to write, up to destination offset */ + loff_t total_remaining_zeros = (offset - pos); + + if (num_bytes > total_remaining_zeros) + num_bytes = total_remaining_zeros; + } + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, + ecryptfs_page_idx); + if (IS_ERR(ecryptfs_page)) { + rc = PTR_ERR(ecryptfs_page); + printk(KERN_ERR "%s: Error getting page at " + "index [%ld] from eCryptfs inode " + "mapping; rc = [%d]\n", __func__, + ecryptfs_page_idx, rc); + goto out; + } + ecryptfs_page_virt = kmap_atomic(ecryptfs_page); + + /* + * pos: where we're now writing, offset: where the request was + * If current pos is before request, we are filling zeros + * If we are at or beyond request, we are writing the *data* + * If we're in a fresh page beyond eof, zero it in either case + */ + if (pos < offset || !start_offset_in_page) { + /* We are extending past the previous end of the file. + * Fill in zero values to the end of the page */ + memset(((char *)ecryptfs_page_virt + + start_offset_in_page), 0, + PAGE_CACHE_SIZE - start_offset_in_page); + } + + /* pos >= offset, we are now writing the data request */ + if (pos >= offset) { + memcpy(((char *)ecryptfs_page_virt + + start_offset_in_page), + (data + data_offset), num_bytes); + data_offset += num_bytes; + } + kunmap_atomic(ecryptfs_page_virt); + flush_dcache_page(ecryptfs_page); + SetPageUptodate(ecryptfs_page); + unlock_page(ecryptfs_page); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) + rc = ecryptfs_encrypt_page(ecryptfs_page); + else + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + ecryptfs_page, + start_offset_in_page, + data_offset); + page_cache_release(ecryptfs_page); + if (rc) { + printk(KERN_ERR "%s: Error encrypting " + "page; rc = [%d]\n", __func__, rc); + goto out; + } + pos += num_bytes; + } + if (pos > ecryptfs_file_size) { + i_size_write(ecryptfs_inode, pos); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { + int rc2; + + rc2 = ecryptfs_write_inode_size_to_metadata( + ecryptfs_inode); + if (rc2) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc2); + if (!rc) + rc = rc2; + goto out; + } + } + } +out: + return rc; +} + +/** + * ecryptfs_read_lower + * @data: The read data is stored here by this function + * @offset: Byte offset in the lower file from which to read the data + * @size: Number of bytes to read from @offset of the lower file and + * store into @data + * @ecryptfs_inode: The eCryptfs inode + * + * Read @size bytes of data at byte offset @offset from the lower + * inode into memory location @data. + * + * Returns bytes read on success; 0 on EOF; less than zero on error + */ +int ecryptfs_read_lower(char *data, loff_t offset, size_t size, + struct inode *ecryptfs_inode) +{ + struct file *lower_file; + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; + return kernel_read(lower_file, offset, data, size); +} + +/** + * ecryptfs_read_lower_page_segment + * @page_for_ecryptfs: The page into which data for eCryptfs will be + * written + * @offset_in_page: Offset in @page_for_ecryptfs from which to start + * writing + * @size: The number of bytes to write into @page_for_ecryptfs + * @ecryptfs_inode: The eCryptfs inode + * + * Determines the byte offset in the file for the given page and + * offset within the page, maps the page, and makes the call to read + * the contents of @page_for_ecryptfs from the lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, + pgoff_t page_index, + size_t offset_in_page, size_t size, + struct inode *ecryptfs_inode) +{ + char *virt; + loff_t offset; + int rc; + + offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page); + virt = kmap(page_for_ecryptfs); + rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); + if (rc > 0) + rc = 0; + kunmap(page_for_ecryptfs); + flush_dcache_page(page_for_ecryptfs); + return rc; +} diff --git a/kernel/fs/ecryptfs/super.c b/kernel/fs/ecryptfs/super.c new file mode 100644 index 000000000..afa1b81c3 --- /dev/null +++ b/kernel/fs/ecryptfs/super.c @@ -0,0 +1,191 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_inode_info_cache; + +/** + * ecryptfs_alloc_inode - allocate an ecryptfs inode + * @sb: Pointer to the ecryptfs super block + * + * Called to bring an inode into existence. + * + * Only handle allocation, setting up structures should be done in + * ecryptfs_read_inode. This is because the kernel, between now and + * then, will 0 out the private data pointer. + * + * Returns a pointer to a newly allocated inode, NULL otherwise + */ +static struct inode *ecryptfs_alloc_inode(struct super_block *sb) +{ + struct ecryptfs_inode_info *inode_info; + struct inode *inode = NULL; + + inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL); + if (unlikely(!inode_info)) + goto out; + ecryptfs_init_crypt_stat(&inode_info->crypt_stat); + mutex_init(&inode_info->lower_file_mutex); + atomic_set(&inode_info->lower_file_count, 0); + inode_info->lower_file = NULL; + inode = &inode_info->vfs_inode; +out: + return inode; +} + +static void ecryptfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ecryptfs_inode_info *inode_info; + inode_info = ecryptfs_inode_to_private(inode); + + kmem_cache_free(ecryptfs_inode_info_cache, inode_info); +} + +/** + * ecryptfs_destroy_inode + * @inode: The ecryptfs inode + * + * This is used during the final destruction of the inode. All + * allocation of memory related to the inode, including allocated + * memory in the crypt_stat struct, will be released here. + * There should be no chance that this deallocation will be missed. + */ +static void ecryptfs_destroy_inode(struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + + inode_info = ecryptfs_inode_to_private(inode); + BUG_ON(inode_info->lower_file); + ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); + call_rcu(&inode->i_rcu, ecryptfs_i_callback); +} + +/** + * ecryptfs_statfs + * @sb: The ecryptfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. Currently, we let this pass right through + * to the lower filesystem and take no action ourselves. + */ +static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + int rc; + + if (!lower_dentry->d_sb->s_op->statfs) + return -ENOSYS; + + rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); + if (rc) + return rc; + + buf->f_type = ECRYPTFS_SUPER_MAGIC; + rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen, + &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat); + + return rc; +} + +/** + * ecryptfs_evict_inode + * @inode - The ecryptfs inode + * + * Called by iput() when the inode reference count reached zero + * and the inode is not hashed anywhere. Used to clear anything + * that needs to be, before the inode is completely destroyed and put + * on the inode free list. We use this to drop out reference to the + * lower inode. + */ +static void ecryptfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + iput(ecryptfs_inode_to_lower(inode)); +} + +/** + * ecryptfs_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ecryptfs_show_options(struct seq_file *m, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; + struct ecryptfs_global_auth_tok *walker; + + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK) + seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig); + else + seq_printf(m, ",ecryptfs_sig=%s", walker->sig); + } + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + + seq_printf(m, ",ecryptfs_cipher=%s", + mount_crypt_stat->global_default_cipher_name); + + if (mount_crypt_stat->global_default_cipher_key_size) + seq_printf(m, ",ecryptfs_key_bytes=%zd", + mount_crypt_stat->global_default_cipher_key_size); + if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) + seq_printf(m, ",ecryptfs_passthrough"); + if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) + seq_printf(m, ",ecryptfs_xattr_metadata"); + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + seq_printf(m, ",ecryptfs_encrypted_view"); + if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) + seq_printf(m, ",ecryptfs_unlink_sigs"); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + seq_printf(m, ",ecryptfs_mount_auth_tok_only"); + + return 0; +} + +const struct super_operations ecryptfs_sops = { + .alloc_inode = ecryptfs_alloc_inode, + .destroy_inode = ecryptfs_destroy_inode, + .statfs = ecryptfs_statfs, + .remount_fs = NULL, + .evict_inode = ecryptfs_evict_inode, + .show_options = ecryptfs_show_options +}; diff --git a/kernel/fs/efivarfs/Kconfig b/kernel/fs/efivarfs/Kconfig new file mode 100644 index 000000000..c2499ef17 --- /dev/null +++ b/kernel/fs/efivarfs/Kconfig @@ -0,0 +1,13 @@ +config EFIVAR_FS + tristate "EFI Variable filesystem" + depends on EFI + default m + help + efivarfs is a replacement filesystem for the old EFI + variable support via sysfs, as it doesn't suffer from the + same 1024-byte variable size limit. + + To compile this file system support as a module, choose M + here. The module will be called efivarfs. + + If unsure, say N. diff --git a/kernel/fs/efivarfs/Makefile b/kernel/fs/efivarfs/Makefile new file mode 100644 index 000000000..955d47817 --- /dev/null +++ b/kernel/fs/efivarfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the efivarfs filesystem +# + +obj-$(CONFIG_EFIVAR_FS) += efivarfs.o + +efivarfs-objs := inode.o file.o super.o diff --git a/kernel/fs/efivarfs/file.c b/kernel/fs/efivarfs/file.c new file mode 100644 index 000000000..90001da9a --- /dev/null +++ b/kernel/fs/efivarfs/file.c @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include "internal.h" + +static ssize_t efivarfs_file_write(struct file *file, + const char __user *userbuf, size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + void *data; + u32 attributes; + struct inode *inode = file->f_mapping->host; + unsigned long datasize = count - sizeof(attributes); + ssize_t bytes; + bool set = false; + + if (count < sizeof(attributes)) + return -EINVAL; + + if (copy_from_user(&attributes, userbuf, sizeof(attributes))) + return -EFAULT; + + if (attributes & ~(EFI_VARIABLE_MASK)) + return -EINVAL; + + data = memdup_user(userbuf + sizeof(attributes), datasize); + if (IS_ERR(data)) + return PTR_ERR(data); + + bytes = efivar_entry_set_get_size(var, attributes, &datasize, + data, &set); + if (!set && bytes) { + if (bytes == -ENOENT) + bytes = -EIO; + goto out; + } + + if (bytes == -ENOENT) { + drop_nlink(inode); + d_delete(file->f_path.dentry); + dput(file->f_path.dentry); + } else { + mutex_lock(&inode->i_mutex); + i_size_write(inode, datasize + sizeof(attributes)); + mutex_unlock(&inode->i_mutex); + } + + bytes = count; + +out: + kfree(data); + + return bytes; +} + +static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + unsigned long datasize = 0; + u32 attributes; + void *data; + ssize_t size = 0; + int err; + + err = efivar_entry_size(var, &datasize); + + /* + * efivarfs represents uncommitted variables with + * zero-length files. Reading them should return EOF. + */ + if (err == -ENOENT) + return 0; + else if (err) + return err; + + data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + size = efivar_entry_get(var, &attributes, &datasize, + data + sizeof(attributes)); + if (size) + goto out_free; + + memcpy(data, &attributes, sizeof(attributes)); + size = simple_read_from_buffer(userbuf, count, ppos, + data, datasize + sizeof(attributes)); +out_free: + kfree(data); + + return size; +} + +const struct file_operations efivarfs_file_operations = { + .open = simple_open, + .read = efivarfs_file_read, + .write = efivarfs_file_write, + .llseek = no_llseek, +}; diff --git a/kernel/fs/efivarfs/inode.c b/kernel/fs/efivarfs/inode.c new file mode 100644 index 000000000..3381b9da9 --- /dev/null +++ b/kernel/fs/efivarfs/inode.c @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include "internal.h" + +struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &efivarfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &efivarfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + } + } + return inode; +} + +/* + * Return true if 'str' is a valid efivarfs filename of the form, + * + * VariableName-12345678-1234-1234-1234-1234567891bc + */ +bool efivarfs_valid_name(const char *str, int len) +{ + static const char dashes[EFI_VARIABLE_GUID_LEN] = { + [8] = 1, [13] = 1, [18] = 1, [23] = 1 + }; + const char *s = str + len - EFI_VARIABLE_GUID_LEN; + int i; + + /* + * We need a GUID, plus at least one letter for the variable name, + * plus the '-' separator + */ + if (len < EFI_VARIABLE_GUID_LEN + 2) + return false; + + /* GUID must be preceded by a '-' */ + if (*(s - 1) != '-') + return false; + + /* + * Validate that 's' is of the correct format, e.g. + * + * 12345678-1234-1234-1234-123456789abc + */ + for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) { + if (dashes[i]) { + if (*s++ != '-') + return false; + } else { + if (!isxdigit(*s++)) + return false; + } + } + + return true; +} + +static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) +{ + guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]); + guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]); + guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]); + guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]); + guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]); + guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]); + guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]); + guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]); + guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]); + guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]); + guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]); + guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]); + guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]); + guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]); + guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]); + guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]); +} + +static int efivarfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct inode *inode; + struct efivar_entry *var; + int namelen, i = 0, err = 0; + + if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len)) + return -EINVAL; + + inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOMEM; + + var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL); + if (!var) { + err = -ENOMEM; + goto out; + } + + /* length of the variable name itself: remove GUID and separator */ + namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; + + efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, + &var->var.VendorGuid); + + for (i = 0; i < namelen; i++) + var->var.VariableName[i] = dentry->d_name.name[i]; + + var->var.VariableName[i] = '\0'; + + inode->i_private = var; + + efivar_entry_add(var, &efivarfs_list); + d_instantiate(dentry, inode); + dget(dentry); +out: + if (err) { + kfree(var); + iput(inode); + } + return err; +} + +static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct efivar_entry *var = d_inode(dentry)->i_private; + + if (efivar_entry_delete(var)) + return -EINVAL; + + drop_nlink(d_inode(dentry)); + dput(dentry); + return 0; +}; + +const struct inode_operations efivarfs_dir_inode_operations = { + .lookup = simple_lookup, + .unlink = efivarfs_unlink, + .create = efivarfs_create, +}; diff --git a/kernel/fs/efivarfs/internal.h b/kernel/fs/efivarfs/internal.h new file mode 100644 index 000000000..b5ff16add --- /dev/null +++ b/kernel/fs/efivarfs/internal.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef EFIVAR_FS_INTERNAL_H +#define EFIVAR_FS_INTERNAL_H + +#include + +extern const struct file_operations efivarfs_file_operations; +extern const struct inode_operations efivarfs_dir_inode_operations; +extern bool efivarfs_valid_name(const char *str, int len); +extern struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev); + +extern struct list_head efivarfs_list; + +#endif /* EFIVAR_FS_INTERNAL_H */ diff --git a/kernel/fs/efivarfs/super.c b/kernel/fs/efivarfs/super.c new file mode 100644 index 000000000..86a212182 --- /dev/null +++ b/kernel/fs/efivarfs/super.c @@ -0,0 +1,267 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +LIST_HEAD(efivarfs_list); + +static void efivarfs_evict_inode(struct inode *inode) +{ + clear_inode(inode); +} + +static const struct super_operations efivarfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = efivarfs_evict_inode, + .show_options = generic_show_options, +}; + +static struct super_block *efivarfs_sb; + +/* + * Compare two efivarfs file names. + * + * An efivarfs filename is composed of two parts, + * + * 1. A case-sensitive variable name + * 2. A case-insensitive GUID + * + * So we need to perform a case-sensitive match on part 1 and a + * case-insensitive match on part 2. + */ +static int efivarfs_d_compare(const struct dentry *parent, + const struct dentry *dentry, + unsigned int len, const char *str, + const struct qstr *name) +{ + int guid = len - EFI_VARIABLE_GUID_LEN; + + if (name->len != len) + return 1; + + /* Case-sensitive compare for the variable name */ + if (memcmp(str, name->name, guid)) + return 1; + + /* Case-insensitive compare for the GUID */ + return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); +} + +static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) +{ + unsigned long hash = init_name_hash(); + const unsigned char *s = qstr->name; + unsigned int len = qstr->len; + + if (!efivarfs_valid_name(s, len)) + return -EINVAL; + + while (len-- > EFI_VARIABLE_GUID_LEN) + hash = partial_name_hash(*s++, hash); + + /* GUID is case-insensitive. */ + while (len--) + hash = partial_name_hash(tolower(*s++), hash); + + qstr->hash = end_name_hash(hash); + return 0; +} + +static const struct dentry_operations efivarfs_d_ops = { + .d_compare = efivarfs_d_compare, + .d_hash = efivarfs_d_hash, + .d_delete = always_delete_dentry, +}; + +static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) +{ + struct dentry *d; + struct qstr q; + int err; + + q.name = name; + q.len = strlen(name); + + err = efivarfs_d_hash(NULL, &q); + if (err) + return ERR_PTR(err); + + d = d_alloc(parent, &q); + if (d) + return d; + + return ERR_PTR(-ENOMEM); +} + +static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, + unsigned long name_size, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct efivar_entry *entry; + struct inode *inode = NULL; + struct dentry *dentry, *root = sb->s_root; + unsigned long size = 0; + char *name; + int len, i; + int err = -ENOMEM; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return err; + + memcpy(entry->var.VariableName, name16, name_size); + memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); + + len = ucs2_strlen(entry->var.VariableName); + + /* name, plus '-', plus GUID, plus NUL*/ + name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); + if (!name) + goto fail; + + for (i = 0; i < len; i++) + name[i] = entry->var.VariableName[i] & 0xFF; + + name[len] = '-'; + + efi_guid_to_str(&entry->var.VendorGuid, name + len + 1); + + name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0); + if (!inode) + goto fail_name; + + dentry = efivarfs_alloc_dentry(root, name); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto fail_inode; + } + + /* copied by the above to local storage in the dentry. */ + kfree(name); + + efivar_entry_size(entry, &size); + efivar_entry_add(entry, &efivarfs_list); + + mutex_lock(&inode->i_mutex); + inode->i_private = entry; + i_size_write(inode, size + sizeof(entry->var.Attributes)); + mutex_unlock(&inode->i_mutex); + d_add(dentry, inode); + + return 0; + +fail_inode: + iput(inode); +fail_name: + kfree(name); +fail: + kfree(entry); + return err; +} + +static int efivarfs_destroy(struct efivar_entry *entry, void *data) +{ + efivar_entry_remove(entry); + kfree(entry); + return 0; +} + +static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode = NULL; + struct dentry *root; + int err; + + efivarfs_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = EFIVARFS_MAGIC; + sb->s_op = &efivarfs_ops; + sb->s_d_op = &efivarfs_d_ops; + sb->s_time_gran = 1; + + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0); + if (!inode) + return -ENOMEM; + inode->i_op = &efivarfs_dir_inode_operations; + + root = d_make_root(inode); + sb->s_root = root; + if (!root) + return -ENOMEM; + + INIT_LIST_HEAD(&efivarfs_list); + + err = efivar_init(efivarfs_callback, (void *)sb, false, + true, &efivarfs_list); + if (err) + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + + return err; +} + +static struct dentry *efivarfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, efivarfs_fill_super); +} + +static void efivarfs_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + efivarfs_sb = NULL; + + /* Remove all entries and destroy */ + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); +} + +static struct file_system_type efivarfs_type = { + .owner = THIS_MODULE, + .name = "efivarfs", + .mount = efivarfs_mount, + .kill_sb = efivarfs_kill_sb, +}; + +static __init int efivarfs_init(void) +{ + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -ENODEV; + + if (!efivars_kobject()) + return -ENODEV; + + return register_filesystem(&efivarfs_type); +} + +static __exit void efivarfs_exit(void) +{ + unregister_filesystem(&efivarfs_type); +} + +MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr"); +MODULE_DESCRIPTION("EFI Variable Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS("efivarfs"); + +module_init(efivarfs_init); +module_exit(efivarfs_exit); diff --git a/kernel/fs/efs/Kconfig b/kernel/fs/efs/Kconfig new file mode 100644 index 000000000..d020e3c30 --- /dev/null +++ b/kernel/fs/efs/Kconfig @@ -0,0 +1,14 @@ +config EFS_FS + tristate "EFS file system support (read only)" + depends on BLOCK + help + EFS is an older file system used for non-ISO9660 CD-ROMs and hard + disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer + uses the XFS file system for hard disk partitions however). + + This implementation only offers read-only access. If you don't know + what all this is about, it's safe to say N. For more information + about EFS see its home page at . + + To compile the EFS file system support as a module, choose M here: the + module will be called efs. diff --git a/kernel/fs/efs/Makefile b/kernel/fs/efs/Makefile new file mode 100644 index 000000000..963543d46 --- /dev/null +++ b/kernel/fs/efs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux efs-filesystem routines. +# + +obj-$(CONFIG_EFS_FS) += efs.o + +efs-objs := super.o inode.o namei.o dir.o file.o symlink.o diff --git a/kernel/fs/efs/dir.c b/kernel/fs/efs/dir.c new file mode 100644 index 000000000..ce63b24f7 --- /dev/null +++ b/kernel/fs/efs/dir.c @@ -0,0 +1,103 @@ +/* + * dir.c + * + * Copyright (c) 1999 Al Smith + */ + +#include +#include "efs.h" + +static int efs_readdir(struct file *, struct dir_context *); + +const struct file_operations efs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = efs_readdir, +}; + +const struct inode_operations efs_dir_inode_operations = { + .lookup = efs_lookup, +}; + +static int efs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + efs_block_t block; + int slot; + + if (inode->i_size & (EFS_DIRBSIZE-1)) + pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", + __func__); + + /* work out where this entry can be found */ + block = ctx->pos >> EFS_DIRBSIZE_BITS; + + /* each block contains at most 256 slots */ + slot = ctx->pos & 0xff; + + /* look at all blocks */ + while (block < inode->i_blocks) { + struct efs_dir *dirblock; + struct buffer_head *bh; + + /* read the dir block */ + bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); + + if (!bh) { + pr_err("%s(): failed to read dir block %d\n", + __func__, block); + break; + } + + dirblock = (struct efs_dir *) bh->b_data; + + if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { + pr_err("%s(): invalid directory block\n", __func__); + brelse(bh); + break; + } + + for (; slot < dirblock->slots; slot++) { + struct efs_dentry *dirslot; + efs_ino_t inodenum; + const char *nameptr; + int namelen; + + if (dirblock->space[slot] == 0) + continue; + + dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); + + inodenum = be32_to_cpu(dirslot->inode); + namelen = dirslot->namelen; + nameptr = dirslot->name; + pr_debug("%s(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", + __func__, block, slot, dirblock->slots-1, + inodenum, nameptr, namelen); + if (!namelen) + continue; + /* found the next entry */ + ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; + + /* sanity check */ + if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { + pr_warn("directory entry %d exceeds directory block\n", + slot); + continue; + } + + /* copy filename and data in dirslot */ + if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) { + brelse(bh); + return 0; + } + } + brelse(bh); + + slot = 0; + block++; + } + ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; + return 0; +} + diff --git a/kernel/fs/efs/efs.h b/kernel/fs/efs/efs.h new file mode 100644 index 000000000..5bbf96121 --- /dev/null +++ b/kernel/fs/efs/efs.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + * Portions derived from IRIX header files (c) 1988 Silicon Graphics + */ +#ifndef _EFS_EFS_H_ +#define _EFS_EFS_H_ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#define EFS_VERSION "1.0a" + +static const char cprt[] = "EFS: "EFS_VERSION" - (c) 1999 Al Smith "; + + +/* 1 block is 512 bytes */ +#define EFS_BLOCKSIZE_BITS 9 +#define EFS_BLOCKSIZE (1 << EFS_BLOCKSIZE_BITS) + +typedef int32_t efs_block_t; +typedef uint32_t efs_ino_t; + +#define EFS_DIRECTEXTENTS 12 + +/* + * layout of an extent, in memory and on disk. 8 bytes exactly. + */ +typedef union extent_u { + unsigned char raw[8]; + struct extent_s { + unsigned int ex_magic:8; /* magic # (zero) */ + unsigned int ex_bn:24; /* basic block */ + unsigned int ex_length:8; /* numblocks in this extent */ + unsigned int ex_offset:24; /* logical offset into file */ + } cooked; +} efs_extent; + +typedef struct edevs { + __be16 odev; + __be32 ndev; +} efs_devs; + +/* + * extent based filesystem inode as it appears on disk. The efs inode + * is exactly 128 bytes long. + */ +struct efs_dinode { + __be16 di_mode; /* mode and type of file */ + __be16 di_nlink; /* number of links to file */ + __be16 di_uid; /* owner's user id */ + __be16 di_gid; /* owner's group id */ + __be32 di_size; /* number of bytes in file */ + __be32 di_atime; /* time last accessed */ + __be32 di_mtime; /* time last modified */ + __be32 di_ctime; /* time created */ + __be32 di_gen; /* generation number */ + __be16 di_numextents; /* # of extents */ + u_char di_version; /* version of inode */ + u_char di_spare; /* spare - used by AFS */ + union di_addr { + efs_extent di_extents[EFS_DIRECTEXTENTS]; + efs_devs di_dev; /* device for IFCHR/IFBLK */ + } di_u; +}; + +/* efs inode storage in memory */ +struct efs_inode_info { + int numextents; + int lastextent; + + efs_extent extents[EFS_DIRECTEXTENTS]; + struct inode vfs_inode; +}; + +#include + +#define EFS_DIRBSIZE_BITS EFS_BLOCKSIZE_BITS +#define EFS_DIRBSIZE (1 << EFS_DIRBSIZE_BITS) + +struct efs_dentry { + __be32 inode; + unsigned char namelen; + char name[3]; +}; + +#define EFS_DENTSIZE (sizeof(struct efs_dentry) - 3 + 1) +#define EFS_MAXNAMELEN ((1 << (sizeof(char) * 8)) - 1) + +#define EFS_DIRBLK_HEADERSIZE 4 +#define EFS_DIRBLK_MAGIC 0xbeef /* moo */ + +struct efs_dir { + __be16 magic; + unsigned char firstused; + unsigned char slots; + + unsigned char space[EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE]; +}; + +#define EFS_MAXENTS \ + ((EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE) / \ + (EFS_DENTSIZE + sizeof(char))) + +#define EFS_SLOTAT(dir, slot) EFS_REALOFF((dir)->space[slot]) + +#define EFS_REALOFF(offset) ((offset << 1)) + + +static inline struct efs_inode_info *INODE_INFO(struct inode *inode) +{ + return container_of(inode, struct efs_inode_info, vfs_inode); +} + +static inline struct efs_sb_info *SUPER_INFO(struct super_block *sb) +{ + return sb->s_fs_info; +} + +struct statfs; +struct fid; + +extern const struct inode_operations efs_dir_inode_operations; +extern const struct file_operations efs_dir_operations; +extern const struct address_space_operations efs_symlink_aops; + +extern struct inode *efs_iget(struct super_block *, unsigned long); +extern efs_block_t efs_map_block(struct inode *, efs_block_t); +extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int); + +extern struct dentry *efs_lookup(struct inode *, struct dentry *, unsigned int); +extern struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *efs_get_parent(struct dentry *); +extern int efs_bmap(struct inode *, int); + +#endif /* _EFS_EFS_H_ */ diff --git a/kernel/fs/efs/file.c b/kernel/fs/efs/file.c new file mode 100644 index 000000000..a37dcee46 --- /dev/null +++ b/kernel/fs/efs/file.c @@ -0,0 +1,56 @@ +/* + * file.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include "efs.h" + +int efs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int error = -EROFS; + long phys; + + if (create) + return error; + if (iblock >= inode->i_blocks) { +#ifdef DEBUG + /* + * i have no idea why this happens as often as it does + */ + pr_warn("%s(): block %d >= %ld (filesize %ld)\n", + __func__, block, inode->i_blocks, inode->i_size); +#endif + return 0; + } + phys = efs_map_block(inode, iblock); + if (phys) + map_bh(bh_result, inode->i_sb, phys); + return 0; +} + +int efs_bmap(struct inode *inode, efs_block_t block) { + + if (block < 0) { + pr_warn("%s(): block < 0\n", __func__); + return 0; + } + + /* are we about to read past the end of a file ? */ + if (!(block < inode->i_blocks)) { +#ifdef DEBUG + /* + * i have no idea why this happens as often as it does + */ + pr_warn("%s(): block %d >= %ld (filesize %ld)\n", + __func__, block, inode->i_blocks, inode->i_size); +#endif + return 0; + } + + return efs_map_block(inode, block); +} diff --git a/kernel/fs/efs/inode.c b/kernel/fs/efs/inode.c new file mode 100644 index 000000000..079d20306 --- /dev/null +++ b/kernel/fs/efs/inode.c @@ -0,0 +1,311 @@ +/* + * inode.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang, + * and from work (c) 1998 Mike Shaver. + */ + +#include +#include +#include +#include "efs.h" +#include + +static int efs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,efs_get_block); +} +static sector_t _efs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,efs_get_block); +} +static const struct address_space_operations efs_aops = { + .readpage = efs_readpage, + .bmap = _efs_bmap +}; + +static inline void extent_copy(efs_extent *src, efs_extent *dst) { + /* + * this is slightly evil. it doesn't just copy + * efs_extent from src to dst, it also mangles + * the bits so that dst ends up in cpu byte-order. + */ + + dst->cooked.ex_magic = (unsigned int) src->raw[0]; + dst->cooked.ex_bn = ((unsigned int) src->raw[1] << 16) | + ((unsigned int) src->raw[2] << 8) | + ((unsigned int) src->raw[3] << 0); + dst->cooked.ex_length = (unsigned int) src->raw[4]; + dst->cooked.ex_offset = ((unsigned int) src->raw[5] << 16) | + ((unsigned int) src->raw[6] << 8) | + ((unsigned int) src->raw[7] << 0); + return; +} + +struct inode *efs_iget(struct super_block *super, unsigned long ino) +{ + int i, inode_index; + dev_t device; + u32 rdev; + struct buffer_head *bh; + struct efs_sb_info *sb = SUPER_INFO(super); + struct efs_inode_info *in; + efs_block_t block, offset; + struct efs_dinode *efs_inode; + struct inode *inode; + + inode = iget_locked(super, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + in = INODE_INFO(inode); + + /* + ** EFS layout: + ** + ** | cylinder group | cylinder group | cylinder group ..etc + ** |inodes|data |inodes|data |inodes|data ..etc + ** + ** work out the inode block index, (considering initially that the + ** inodes are stored as consecutive blocks). then work out the block + ** number of that inode given the above layout, and finally the + ** offset of the inode within that block. + */ + + inode_index = inode->i_ino / + (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); + + block = sb->fs_start + sb->first_block + + (sb->group_size * (inode_index / sb->inode_blocks)) + + (inode_index % sb->inode_blocks); + + offset = (inode->i_ino % + (EFS_BLOCKSIZE / sizeof(struct efs_dinode))) * + sizeof(struct efs_dinode); + + bh = sb_bread(inode->i_sb, block); + if (!bh) { + pr_warn("%s() failed at block %d\n", __func__, block); + goto read_inode_error; + } + + efs_inode = (struct efs_dinode *) (bh->b_data + offset); + + inode->i_mode = be16_to_cpu(efs_inode->di_mode); + set_nlink(inode, be16_to_cpu(efs_inode->di_nlink)); + i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid)); + i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid)); + inode->i_size = be32_to_cpu(efs_inode->di_size); + inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); + inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); + inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime); + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + + /* this is the number of blocks in the file */ + if (inode->i_size == 0) { + inode->i_blocks = 0; + } else { + inode->i_blocks = ((inode->i_size - 1) >> EFS_BLOCKSIZE_BITS) + 1; + } + + rdev = be16_to_cpu(efs_inode->di_u.di_dev.odev); + if (rdev == 0xffff) { + rdev = be32_to_cpu(efs_inode->di_u.di_dev.ndev); + if (sysv_major(rdev) > 0xfff) + device = 0; + else + device = MKDEV(sysv_major(rdev), sysv_minor(rdev)); + } else + device = old_decode_dev(rdev); + + /* get the number of extents for this object */ + in->numextents = be16_to_cpu(efs_inode->di_numextents); + in->lastextent = 0; + + /* copy the extents contained within the inode to memory */ + for(i = 0; i < EFS_DIRECTEXTENTS; i++) { + extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); + if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { + pr_warn("extent %d has bad magic number in inode %lu\n", + i, inode->i_ino); + brelse(bh); + goto read_inode_error; + } + } + + brelse(bh); + pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n", + inode->i_ino, in->numextents, inode->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &efs_dir_inode_operations; + inode->i_fop = &efs_dir_operations; + break; + case S_IFREG: + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &efs_aops; + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &efs_symlink_aops; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + init_special_inode(inode, inode->i_mode, device); + break; + default: + pr_warn("unsupported inode mode %o\n", inode->i_mode); + goto read_inode_error; + break; + } + + unlock_new_inode(inode); + return inode; + +read_inode_error: + pr_warn("failed to read inode %lu\n", inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static inline efs_block_t +efs_extent_check(efs_extent *ptr, efs_block_t block, struct efs_sb_info *sb) { + efs_block_t start; + efs_block_t length; + efs_block_t offset; + + /* + * given an extent and a logical block within a file, + * can this block be found within this extent ? + */ + start = ptr->cooked.ex_bn; + length = ptr->cooked.ex_length; + offset = ptr->cooked.ex_offset; + + if ((block >= offset) && (block < offset+length)) { + return(sb->fs_start + start + block - offset); + } else { + return 0; + } +} + +efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { + struct efs_sb_info *sb = SUPER_INFO(inode->i_sb); + struct efs_inode_info *in = INODE_INFO(inode); + struct buffer_head *bh = NULL; + + int cur, last, first = 1; + int ibase, ioffset, dirext, direxts, indext, indexts; + efs_block_t iblock, result = 0, lastblock = 0; + efs_extent ext, *exts; + + last = in->lastextent; + + if (in->numextents <= EFS_DIRECTEXTENTS) { + /* first check the last extent we returned */ + if ((result = efs_extent_check(&in->extents[last], block, sb))) + return result; + + /* if we only have one extent then nothing can be found */ + if (in->numextents == 1) { + pr_err("%s() failed to map (1 extent)\n", __func__); + return 0; + } + + direxts = in->numextents; + + /* + * check the stored extents in the inode + * start with next extent and check forwards + */ + for(dirext = 1; dirext < direxts; dirext++) { + cur = (last + dirext) % in->numextents; + if ((result = efs_extent_check(&in->extents[cur], block, sb))) { + in->lastextent = cur; + return result; + } + } + + pr_err("%s() failed to map block %u (dir)\n", __func__, block); + return 0; + } + + pr_debug("%s(): indirect search for logical block %u\n", + __func__, block); + direxts = in->extents[0].cooked.ex_offset; + indexts = in->numextents; + + for(indext = 0; indext < indexts; indext++) { + cur = (last + indext) % indexts; + + /* + * work out which direct extent contains `cur'. + * + * also compute ibase: i.e. the number of the first + * indirect extent contained within direct extent `cur'. + * + */ + ibase = 0; + for(dirext = 0; cur < ibase && dirext < direxts; dirext++) { + ibase += in->extents[dirext].cooked.ex_length * + (EFS_BLOCKSIZE / sizeof(efs_extent)); + } + + if (dirext == direxts) { + /* should never happen */ + pr_err("couldn't find direct extent for indirect extent %d (block %u)\n", + cur, block); + if (bh) brelse(bh); + return 0; + } + + /* work out block number and offset of this indirect extent */ + iblock = sb->fs_start + in->extents[dirext].cooked.ex_bn + + (cur - ibase) / + (EFS_BLOCKSIZE / sizeof(efs_extent)); + ioffset = (cur - ibase) % + (EFS_BLOCKSIZE / sizeof(efs_extent)); + + if (first || lastblock != iblock) { + if (bh) brelse(bh); + + bh = sb_bread(inode->i_sb, iblock); + if (!bh) { + pr_err("%s() failed at block %d\n", + __func__, iblock); + return 0; + } + pr_debug("%s(): read indirect extent block %d\n", + __func__, iblock); + first = 0; + lastblock = iblock; + } + + exts = (efs_extent *) bh->b_data; + + extent_copy(&(exts[ioffset]), &ext); + + if (ext.cooked.ex_magic != 0) { + pr_err("extent %d has bad magic number in block %d\n", + cur, iblock); + if (bh) brelse(bh); + return 0; + } + + if ((result = efs_extent_check(&ext, block, sb))) { + if (bh) brelse(bh); + in->lastextent = cur; + return result; + } + } + if (bh) brelse(bh); + pr_err("%s() failed to map block %u (indir)\n", __func__, block); + return 0; +} + +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/efs/namei.c b/kernel/fs/efs/namei.c new file mode 100644 index 000000000..40ba9cc41 --- /dev/null +++ b/kernel/fs/efs/namei.c @@ -0,0 +1,119 @@ +/* + * namei.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include "efs.h" + + +static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) +{ + struct buffer_head *bh; + + int slot, namelen; + char *nameptr; + struct efs_dir *dirblock; + struct efs_dentry *dirslot; + efs_ino_t inodenum; + efs_block_t block; + + if (inode->i_size & (EFS_DIRBSIZE-1)) + pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", + __func__); + + for(block = 0; block < inode->i_blocks; block++) { + + bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); + if (!bh) { + pr_err("%s(): failed to read dir block %d\n", + __func__, block); + return 0; + } + + dirblock = (struct efs_dir *) bh->b_data; + + if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { + pr_err("%s(): invalid directory block\n", __func__); + brelse(bh); + return 0; + } + + for (slot = 0; slot < dirblock->slots; slot++) { + dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); + + namelen = dirslot->namelen; + nameptr = dirslot->name; + + if ((namelen == len) && (!memcmp(name, nameptr, len))) { + inodenum = be32_to_cpu(dirslot->inode); + brelse(bh); + return inodenum; + } + } + brelse(bh); + } + return 0; +} + +struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + efs_ino_t inodenum; + struct inode *inode = NULL; + + inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); + if (inodenum) + inode = efs_iget(dir->i_sb, inodenum); + + return d_splice_alias(inode, dentry); +} + +static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, + u32 generation) +{ + struct inode *inode; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = efs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + efs_nfs_get_inode); +} + +struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + efs_nfs_get_inode); +} + +struct dentry *efs_get_parent(struct dentry *child) +{ + struct dentry *parent = ERR_PTR(-ENOENT); + efs_ino_t ino; + + ino = efs_find_entry(d_inode(child), "..", 2); + if (ino) + parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino)); + + return parent; +} diff --git a/kernel/fs/efs/super.c b/kernel/fs/efs/super.c new file mode 100644 index 000000000..7fca462ea --- /dev/null +++ b/kernel/fs/efs/super.c @@ -0,0 +1,353 @@ +/* + * super.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include +#include +#include + +#include "efs.h" +#include +#include + +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); +static int efs_fill_super(struct super_block *s, void *d, int silent); + +static struct dentry *efs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); +} + +static void efs_kill_sb(struct super_block *s) +{ + struct efs_sb_info *sbi = SUPER_INFO(s); + kill_block_super(s); + kfree(sbi); +} + +static struct file_system_type efs_fs_type = { + .owner = THIS_MODULE, + .name = "efs", + .mount = efs_mount, + .kill_sb = efs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("efs"); + +static struct pt_types sgi_pt_types[] = { + {0x00, "SGI vh"}, + {0x01, "SGI trkrepl"}, + {0x02, "SGI secrepl"}, + {0x03, "SGI raw"}, + {0x04, "SGI bsd"}, + {SGI_SYSV, "SGI sysv"}, + {0x06, "SGI vol"}, + {SGI_EFS, "SGI efs"}, + {0x08, "SGI lv"}, + {0x09, "SGI rlv"}, + {0x0A, "SGI xfs"}, + {0x0B, "SGI xfslog"}, + {0x0C, "SGI xlv"}, + {0x82, "Linux swap"}, + {0x83, "Linux native"}, + {0, NULL} +}; + + +static struct kmem_cache * efs_inode_cachep; + +static struct inode *efs_alloc_inode(struct super_block *sb) +{ + struct efs_inode_info *ei; + ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void efs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); +} + +static void efs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, efs_i_callback); +} + +static void init_once(void *foo) +{ + struct efs_inode_info *ei = (struct efs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + efs_inode_cachep = kmem_cache_create("efs_inode_cache", + sizeof(struct efs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (efs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(efs_inode_cachep); +} + +static int efs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations efs_superblock_operations = { + .alloc_inode = efs_alloc_inode, + .destroy_inode = efs_destroy_inode, + .statfs = efs_statfs, + .remount_fs = efs_remount, +}; + +static const struct export_operations efs_export_ops = { + .fh_to_dentry = efs_fh_to_dentry, + .fh_to_parent = efs_fh_to_parent, + .get_parent = efs_get_parent, +}; + +static int __init init_efs_fs(void) { + int err; + pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&efs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_efs_fs(void) { + unregister_filesystem(&efs_fs_type); + destroy_inodecache(); +} + +module_init(init_efs_fs) +module_exit(exit_efs_fs) + +static efs_block_t efs_validate_vh(struct volume_header *vh) { + int i; + __be32 cs, *ui; + int csum; + efs_block_t sblock = 0; /* shuts up gcc */ + struct pt_types *pt_entry; + int pt_type, slice = -1; + + if (be32_to_cpu(vh->vh_magic) != VHMAGIC) { + /* + * assume that we're dealing with a partition and allow + * read_super() to try and detect a valid superblock + * on the next block. + */ + return 0; + } + + ui = ((__be32 *) (vh + 1)) - 1; + for(csum = 0; ui >= ((__be32 *) vh);) { + cs = *ui--; + csum += be32_to_cpu(cs); + } + if (csum) { + pr_warn("SGI disklabel: checksum bad, label corrupted\n"); + return 0; + } + +#ifdef DEBUG + pr_debug("bf: \"%16s\"\n", vh->vh_bootfile); + + for(i = 0; i < NVDIR; i++) { + int j; + char name[VDNAMESIZE+1]; + + for(j = 0; j < VDNAMESIZE; j++) { + name[j] = vh->vh_vd[i].vd_name[j]; + } + name[j] = (char) 0; + + if (name[0]) { + pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n", + name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), + (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); + } + } +#endif + + for(i = 0; i < NPARTAB; i++) { + pt_type = (int) be32_to_cpu(vh->vh_pt[i].pt_type); + for(pt_entry = sgi_pt_types; pt_entry->pt_name; pt_entry++) { + if (pt_type == pt_entry->pt_type) break; + } +#ifdef DEBUG + if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { + pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", + i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn), + (int)be32_to_cpu(vh->vh_pt[i].pt_nblks), + pt_type, (pt_entry->pt_name) ? + pt_entry->pt_name : "unknown"); + } +#endif + if (IS_EFS(pt_type)) { + sblock = be32_to_cpu(vh->vh_pt[i].pt_firstlbn); + slice = i; + } + } + + if (slice == -1) { + pr_notice("partition table contained no EFS partitions\n"); +#ifdef DEBUG + } else { + pr_info("using slice %d (type %s, offset 0x%x)\n", slice, + (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", + sblock); +#endif + } + return sblock; +} + +static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { + + if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic))) + return -1; + + sb->fs_magic = be32_to_cpu(super->fs_magic); + sb->total_blocks = be32_to_cpu(super->fs_size); + sb->first_block = be32_to_cpu(super->fs_firstcg); + sb->group_size = be32_to_cpu(super->fs_cgfsize); + sb->data_free = be32_to_cpu(super->fs_tfree); + sb->inode_free = be32_to_cpu(super->fs_tinode); + sb->inode_blocks = be16_to_cpu(super->fs_cgisize); + sb->total_groups = be16_to_cpu(super->fs_ncg); + + return 0; +} + +static int efs_fill_super(struct super_block *s, void *d, int silent) +{ + struct efs_sb_info *sb; + struct buffer_head *bh; + struct inode *root; + + sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); + if (!sb) + return -ENOMEM; + s->s_fs_info = sb; + + s->s_magic = EFS_SUPER_MAGIC; + if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { + pr_err("device does not support %d byte blocks\n", + EFS_BLOCKSIZE); + return -EINVAL; + } + + /* read the vh (volume header) block */ + bh = sb_bread(s, 0); + + if (!bh) { + pr_err("cannot read volume header\n"); + return -EINVAL; + } + + /* + * if this returns zero then we didn't find any partition table. + * this isn't (yet) an error - just assume for the moment that + * the device is valid and go on to search for a superblock. + */ + sb->fs_start = efs_validate_vh((struct volume_header *) bh->b_data); + brelse(bh); + + if (sb->fs_start == -1) { + return -EINVAL; + } + + bh = sb_bread(s, sb->fs_start + EFS_SUPER); + if (!bh) { + pr_err("cannot read superblock\n"); + return -EINVAL; + } + + if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { +#ifdef DEBUG + pr_warn("invalid superblock at block %u\n", + sb->fs_start + EFS_SUPER); +#endif + brelse(bh); + return -EINVAL; + } + brelse(bh); + + if (!(s->s_flags & MS_RDONLY)) { +#ifdef DEBUG + pr_info("forcing read-only mode\n"); +#endif + s->s_flags |= MS_RDONLY; + } + s->s_op = &efs_superblock_operations; + s->s_export_op = &efs_export_ops; + root = efs_iget(s, EFS_ROOTINODE); + if (IS_ERR(root)) { + pr_err("get root inode failed\n"); + return PTR_ERR(root); + } + + s->s_root = d_make_root(root); + if (!(s->s_root)) { + pr_err("get root dentry failed\n"); + return -ENOMEM; + } + + return 0; +} + +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + struct efs_sb_info *sbi = SUPER_INFO(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ + buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ + buf->f_blocks = sbi->total_groups * /* total data blocks */ + (sbi->group_size - sbi->inode_blocks); + buf->f_bfree = sbi->data_free; /* free data blocks */ + buf->f_bavail = sbi->data_free; /* free blocks for non-root */ + buf->f_files = sbi->total_groups * /* total inodes */ + sbi->inode_blocks * + (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); + buf->f_ffree = sbi->inode_free; /* free inodes */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ + + return 0; +} + diff --git a/kernel/fs/efs/symlink.c b/kernel/fs/efs/symlink.c new file mode 100644 index 000000000..75117d0da --- /dev/null +++ b/kernel/fs/efs/symlink.c @@ -0,0 +1,54 @@ +/* + * symlink.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include "efs.h" + +static int efs_symlink_readpage(struct file *file, struct page *page) +{ + char *link = kmap(page); + struct buffer_head * bh; + struct inode * inode = page->mapping->host; + efs_block_t size = inode->i_size; + int err; + + err = -ENAMETOOLONG; + if (size > 2 * EFS_BLOCKSIZE) + goto fail; + + /* read first 512 bytes of link target */ + err = -EIO; + bh = sb_bread(inode->i_sb, efs_bmap(inode, 0)); + if (!bh) + goto fail; + memcpy(link, bh->b_data, (size > EFS_BLOCKSIZE) ? EFS_BLOCKSIZE : size); + brelse(bh); + if (size > EFS_BLOCKSIZE) { + bh = sb_bread(inode->i_sb, efs_bmap(inode, 1)); + if (!bh) + goto fail; + memcpy(link + EFS_BLOCKSIZE, bh->b_data, size - EFS_BLOCKSIZE); + brelse(bh); + } + link[size] = '\0'; + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations efs_symlink_aops = { + .readpage = efs_symlink_readpage +}; diff --git a/kernel/fs/eventfd.c b/kernel/fs/eventfd.c new file mode 100644 index 000000000..8d0c0df01 --- /dev/null +++ b/kernel/fs/eventfd.c @@ -0,0 +1,449 @@ +/* + * fs/eventfd.c + * + * Copyright (C) 2007 Davide Libenzi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct eventfd_ctx { + struct kref kref; + wait_queue_head_t wqh; + /* + * Every time that a write(2) is performed on an eventfd, the + * value of the __u64 being written is added to "count" and a + * wakeup is performed on "wqh". A read(2) will return the "count" + * value to userspace, and will reset "count" to zero. The kernel + * side eventfd_signal() also, adds to the "count" counter and + * issue a wakeup. + */ + __u64 count; + unsigned int flags; +}; + +/** + * eventfd_signal - Adds @n to the eventfd counter. + * @ctx: [in] Pointer to the eventfd context. + * @n: [in] Value of the counter to be added to the eventfd internal counter. + * The value cannot be negative. + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returining a POLLERR + * to poll(2). + * + * Returns the amount by which the counter was incrememnted. This will be less + * than @n if the counter has overflowed. + */ +__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ULLONG_MAX - ctx->count < n) + n = ULLONG_MAX - ctx->count; + ctx->count += n; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLIN); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return n; +} +EXPORT_SYMBOL_GPL(eventfd_signal); + +static void eventfd_free_ctx(struct eventfd_ctx *ctx) +{ + kfree(ctx); +} + +static void eventfd_free(struct kref *kref) +{ + struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); + + eventfd_free_ctx(ctx); +} + +/** + * eventfd_ctx_get - Acquires a reference to the internal eventfd context. + * @ctx: [in] Pointer to the eventfd context. + * + * Returns: In case of success, returns a pointer to the eventfd context. + */ +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx) +{ + kref_get(&ctx->kref); + return ctx; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_get); + +/** + * eventfd_ctx_put - Releases a reference to the internal eventfd context. + * @ctx: [in] Pointer to eventfd context. + * + * The eventfd context reference must have been previously acquired either + * with eventfd_ctx_get() or eventfd_ctx_fdget(). + */ +void eventfd_ctx_put(struct eventfd_ctx *ctx) +{ + kref_put(&ctx->kref, eventfd_free); +} +EXPORT_SYMBOL_GPL(eventfd_ctx_put); + +static int eventfd_release(struct inode *inode, struct file *file) +{ + struct eventfd_ctx *ctx = file->private_data; + + wake_up_poll(&ctx->wqh, POLLHUP); + eventfd_ctx_put(ctx); + return 0; +} + +static unsigned int eventfd_poll(struct file *file, poll_table *wait) +{ + struct eventfd_ctx *ctx = file->private_data; + unsigned int events = 0; + u64 count; + + poll_wait(file, &ctx->wqh, wait); + smp_rmb(); + count = ctx->count; + + if (count > 0) + events |= POLLIN; + if (count == ULLONG_MAX) + events |= POLLERR; + if (ULLONG_MAX - 1 > count) + events |= POLLOUT; + + return events; +} + +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= *cnt; +} + +/** + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. + * @ctx: [in] Pointer to eventfd context. + * @wait: [in] Wait queue to be removed. + * @cnt: [out] Pointer to the 64-bit counter value. + * + * Returns %0 if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked. + * + * This is used to atomically remove a wait queue entry from the eventfd wait + * queue head, and read/reset the counter value. + */ +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + eventfd_ctx_do_read(ctx, cnt); + __remove_wait_queue(&ctx->wqh, wait); + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return *cnt != 0 ? 0 : -EAGAIN; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); + +/** + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. + * @ctx: [in] Pointer to eventfd context. + * @no_wait: [in] Different from zero if the operation should not block. + * @cnt: [out] Pointer to the 64-bit counter value. + * + * Returns %0 if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked but @no_wait was non-zero. + * -ERESTARTSYS : A signal interrupted the wait operation. + * + * If @no_wait is zero, the function might sleep until the eventfd internal + * counter becomes greater than zero. + */ +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) +{ + ssize_t res; + DECLARE_WAITQUEUE(wait, current); + + spin_lock_irq(&ctx->wqh.lock); + *cnt = 0; + res = -EAGAIN; + if (ctx->count > 0) + res = 0; + else if (!no_wait) { + __add_wait_queue(&ctx->wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (ctx->count > 0) { + res = 0; + break; + } + if (signal_pending(current)) { + res = -ERESTARTSYS; + break; + } + spin_unlock_irq(&ctx->wqh.lock); + schedule(); + spin_lock_irq(&ctx->wqh.lock); + } + __remove_wait_queue(&ctx->wqh, &wait); + __set_current_state(TASK_RUNNING); + } + if (likely(res == 0)) { + eventfd_ctx_do_read(ctx, cnt); + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + } + spin_unlock_irq(&ctx->wqh.lock); + + return res; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_read); + +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 cnt; + + if (count < sizeof(cnt)) + return -EINVAL; + res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); + if (res < 0) + return res; + + return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt); +} + +static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 ucnt; + DECLARE_WAITQUEUE(wait, current); + + if (count < sizeof(ucnt)) + return -EINVAL; + if (copy_from_user(&ucnt, buf, sizeof(ucnt))) + return -EFAULT; + if (ucnt == ULLONG_MAX) + return -EINVAL; + spin_lock_irq(&ctx->wqh.lock); + res = -EAGAIN; + if (ULLONG_MAX - ctx->count > ucnt) + res = sizeof(ucnt); + else if (!(file->f_flags & O_NONBLOCK)) { + __add_wait_queue(&ctx->wqh, &wait); + for (res = 0;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (ULLONG_MAX - ctx->count > ucnt) { + res = sizeof(ucnt); + break; + } + if (signal_pending(current)) { + res = -ERESTARTSYS; + break; + } + spin_unlock_irq(&ctx->wqh.lock); + schedule(); + spin_lock_irq(&ctx->wqh.lock); + } + __remove_wait_queue(&ctx->wqh, &wait); + __set_current_state(TASK_RUNNING); + } + if (likely(res > 0)) { + ctx->count += ucnt; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLIN); + } + spin_unlock_irq(&ctx->wqh.lock); + + return res; +} + +#ifdef CONFIG_PROC_FS +static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct eventfd_ctx *ctx = f->private_data; + + spin_lock_irq(&ctx->wqh.lock); + seq_printf(m, "eventfd-count: %16llx\n", + (unsigned long long)ctx->count); + spin_unlock_irq(&ctx->wqh.lock); +} +#endif + +static const struct file_operations eventfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = eventfd_show_fdinfo, +#endif + .release = eventfd_release, + .poll = eventfd_poll, + .read = eventfd_read, + .write = eventfd_write, + .llseek = noop_llseek, +}; + +/** + * eventfd_fget - Acquire a reference of an eventfd file descriptor. + * @fd: [in] Eventfd file descriptor. + * + * Returns a pointer to the eventfd file structure in case of success, or the + * following error pointer: + * + * -EBADF : Invalid @fd file descriptor. + * -EINVAL : The @fd file descriptor is not an eventfd file. + */ +struct file *eventfd_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + if (file->f_op != &eventfd_fops) { + fput(file); + return ERR_PTR(-EINVAL); + } + + return file; +} +EXPORT_SYMBOL_GPL(eventfd_fget); + +/** + * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. + * @fd: [in] Eventfd file descriptor. + * + * Returns a pointer to the internal eventfd context, otherwise the error + * pointers returned by the following functions: + * + * eventfd_fget + */ +struct eventfd_ctx *eventfd_ctx_fdget(int fd) +{ + struct eventfd_ctx *ctx; + struct fd f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + ctx = eventfd_ctx_fileget(f.file); + fdput(f); + return ctx; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); + +/** + * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. + * @file: [in] Eventfd file pointer. + * + * Returns a pointer to the internal eventfd context, otherwise the error + * pointer: + * + * -EINVAL : The @fd file descriptor is not an eventfd file. + */ +struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) +{ + if (file->f_op != &eventfd_fops) + return ERR_PTR(-EINVAL); + + return eventfd_ctx_get(file->private_data); +} +EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); + +/** + * eventfd_file_create - Creates an eventfd file pointer. + * @count: Initial eventfd counter value. + * @flags: Flags for the eventfd file. + * + * This function creates an eventfd file pointer, w/out installing it into + * the fd table. This is useful when the eventfd file is used during the + * initialization of data structures that require extra setup after the eventfd + * creation. So the eventfd creation is split into the file pointer creation + * phase, and the file descriptor installation phase. + * In this way races with userspace closing the newly installed file descriptor + * can be avoided. + * Returns an eventfd file pointer, or a proper error pointer. + */ +struct file *eventfd_file_create(unsigned int count, int flags) +{ + struct file *file; + struct eventfd_ctx *ctx; + + /* Check the EFD_* constants for consistency. */ + BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~EFD_FLAGS_SET) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + kref_init(&ctx->kref); + init_waitqueue_head(&ctx->wqh); + ctx->count = count; + ctx->flags = flags; + + file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, + O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); + if (IS_ERR(file)) + eventfd_free_ctx(ctx); + + return file; +} + +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +{ + int fd, error; + struct file *file; + + error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS); + if (error < 0) + return error; + fd = error; + + file = eventfd_file_create(count, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + + return error; +} + +SYSCALL_DEFINE1(eventfd, unsigned int, count) +{ + return sys_eventfd2(count, 0); +} + diff --git a/kernel/fs/eventpoll.c b/kernel/fs/eventpoll.c new file mode 100644 index 000000000..d0c12504d --- /dev/null +++ b/kernel/fs/eventpoll.c @@ -0,0 +1,2133 @@ +/* + * fs/eventpoll.c (Efficient event retrieval implementation) + * Copyright (C) 2001,...,2009 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Davide Libenzi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * LOCKING: + * There are three level of locking required by epoll : + * + * 1) epmutex (mutex) + * 2) ep->mtx (mutex) + * 3) ep->lock (spinlock) + * + * The acquire order is the one listed above, from 1 to 3. + * We need a spinlock (ep->lock) because we manipulate objects + * from inside the poll callback, that might be triggered from + * a wake_up() that in turn might be called from IRQ context. + * So we can't sleep inside the poll callback and hence we need + * a spinlock. During the event transfer loop (from kernel to + * user space) we could end up sleeping due a copy_to_user(), so + * we need a lock that will allow us to sleep. This lock is a + * mutex (ep->mtx). It is acquired during the event transfer loop, + * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). + * Then we also need a global mutex to serialize eventpoll_release_file() + * and ep_free(). + * This mutex is acquired by ep_free() during the epoll file + * cleanup path and it is also acquired by eventpoll_release_file() + * if a file has been pushed inside an epoll set and it is then + * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). + * It is also acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. We need a global mutex to prevent two + * simultaneous inserts (A into B and B into A) from racing and + * constructing a cycle without either insert observing that it is + * going to. + * It is necessary to acquire multiple "ep->mtx"es at once in the + * case when one epoll fd is added to another. In this case, we + * always acquire the locks in the order of nesting (i.e. after + * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired + * before e2->mtx). Since we disallow cycles of epoll file + * descriptors, this ensures that the mutexes are well-ordered. In + * order to communicate this nesting to lockdep, when walking a tree + * of epoll file descriptors, we use the current recursion depth as + * the lockdep subkey. + * It is possible to drop the "ep->mtx" and to use the global + * mutex "epmutex" (together with "ep->lock") to have it working, + * but having "ep->mtx" will make the interface more scalable. + * Events that require holding "epmutex" are very rare, while for + * normal operations the epoll private "ep->mtx" will guarantee + * a better scalability. + */ + +/* Epoll private bits inside the event mask */ +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) + +/* Maximum number of nesting allowed inside epoll sets */ +#define EP_MAX_NESTS 4 + +#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) + +#define EP_UNACTIVE_PTR ((void *) -1L) + +#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) + +struct epoll_filefd { + struct file *file; + int fd; +} __packed; + +/* + * Structure used to track possible nested calls, for too deep recursions + * and loop cycles. + */ +struct nested_call_node { + struct list_head llink; + void *cookie; + void *ctx; +}; + +/* + * This structure is used as collector for nested calls, to check for + * maximum recursion dept and loop cycles. + */ +struct nested_calls { + struct list_head tasks_call_list; + spinlock_t lock; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the "rbr" RB tree. + * Avoid increasing the size of this struct, there can be many thousands + * of these on a server and we do not want this to take another cache line. + */ +struct epitem { + union { + /* RB tree node links this structure to the eventpoll RB tree */ + struct rb_node rbn; + /* Used to free the struct epitem */ + struct rcu_head rcu; + }; + + /* List header used to link this structure to the eventpoll ready list */ + struct list_head rdllink; + + /* + * Works together "struct eventpoll"->ovflist in keeping the + * single linked chain of items. + */ + struct epitem *next; + + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + + /* Number of active wait queue attached to poll operations */ + int nwait; + + /* List containing poll wait queues */ + struct list_head pwqlist; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* List header used to link this item to the "struct file" items list */ + struct list_head fllink; + + /* wakeup_source used when EPOLLWAKEUP is set */ + struct wakeup_source __rcu *ws; + + /* The structure that describe the interested events and the source fd */ + struct epoll_event event; +}; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and represents the main data structure for the eventpoll + * interface. + */ +struct eventpoll { + /* Protect the access to this structure */ + spinlock_t lock; + + /* + * This mutex is used to ensure that files are not removed + * while epoll is using them. This is held during the event + * collection loop, the file cleanup path, the epoll file exit + * code and the ctl operations. + */ + struct mutex mtx; + + /* Wait queue used by sys_epoll_wait() */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* List of ready file descriptors */ + struct list_head rdllist; + + /* RB tree root used to store monitored fd structs */ + struct rb_root rbr; + + /* + * This is a single linked list that chains all the "struct epitem" that + * happened while transferring ready events to userspace w/out + * holding ->lock. + */ + struct epitem *ovflist; + + /* wakeup_source used when ep_scan_ready_list is running */ + struct wakeup_source *ws; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; +}; + +/* Wait structure used by the poll hooks */ +struct eppoll_entry { + /* List header used to link this structure to the "struct epitem" */ + struct list_head llink; + + /* The "base" pointer is set to the container "struct epitem" */ + struct epitem *base; + + /* + * Wait queue item that will be linked to the target file wait + * queue head. + */ + wait_queue_t wait; + + /* The wait queue head that linked the "wait" wait queue item */ + wait_queue_head_t *whead; +}; + +/* Wrapper struct used by poll queueing */ +struct ep_pqueue { + poll_table pt; + struct epitem *epi; +}; + +/* Used by the ep_send_events() function as callback private data */ +struct ep_send_events_data { + int maxevents; + struct epoll_event __user *events; +}; + +/* + * Configuration options available inside /proc/sys/fs/epoll/ + */ +/* Maximum number of epoll watched descriptors, per user */ +static long max_user_watches __read_mostly; + +/* + * This mutex is used to serialize ep_free() and eventpoll_release_file(). + */ +static DEFINE_MUTEX(epmutex); + +/* Used to check for epoll file descriptor inclusion loops */ +static struct nested_calls poll_loop_ncalls; + +/* Used for safe wake up implementation */ +static struct nested_calls poll_safewake_ncalls; + +/* Used to call file's f_op->poll() under the nested calls boundaries */ +static struct nested_calls poll_readywalk_ncalls; + +/* Slab cache used to allocate "struct epitem" */ +static struct kmem_cache *epi_cache __read_mostly; + +/* Slab cache used to allocate "struct eppoll_entry" */ +static struct kmem_cache *pwq_cache __read_mostly; + +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + +#ifdef CONFIG_SYSCTL + +#include + +static long zero; +static long long_max = LONG_MAX; + +struct ctl_table epoll_table[] = { + { + .procname = "max_user_watches", + .data = &max_user_watches, + .maxlen = sizeof(max_user_watches), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero, + .extra2 = &long_max, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} + +/* Setup the structure that is used as key for the RB tree */ +static inline void ep_set_ffd(struct epoll_filefd *ffd, + struct file *file, int fd) +{ + ffd->file = file; + ffd->fd = fd; +} + +/* Compare RB tree keys */ +static inline int ep_cmp_ffd(struct epoll_filefd *p1, + struct epoll_filefd *p2) +{ + return (p1->file > p2->file ? +1: + (p1->file < p2->file ? -1 : p1->fd - p2->fd)); +} + +/* Tells us if the item is currently linked */ +static inline int ep_is_linked(struct list_head *p) +{ + return !list_empty(p); +} + +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait); +} + +/* Get the "struct epitem" from a wait queue pointer */ +static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait)->base; +} + +/* Get the "struct epitem" from an epoll queue wrapper */ +static inline struct epitem *ep_item_from_epqueue(poll_table *p) +{ + return container_of(p, struct ep_pqueue, pt)->epi; +} + +/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ +static inline int ep_op_has_event(int op) +{ + return op != EPOLL_CTL_DEL; +} + +/* Initialize the poll safe wake up structure */ +static void ep_nested_calls_init(struct nested_calls *ncalls) +{ + INIT_LIST_HEAD(&ncalls->tasks_call_list); + spin_lock_init(&ncalls->lock); +} + +/** + * ep_events_available - Checks if ready events might be available. + * + * @ep: Pointer to the eventpoll context. + * + * Returns: Returns a value different than zero if ready events are available, + * or zero otherwise. + */ +static inline int ep_events_available(struct eventpoll *ep) +{ + return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; +} + +/** + * ep_call_nested - Perform a bound (possibly) nested call, by checking + * that the recursion limit is not exceeded, and that + * the same nested call (by the meaning of same cookie) is + * no re-entered. + * + * @ncalls: Pointer to the nested_calls structure to be used for this call. + * @max_nests: Maximum number of allowed nesting calls. + * @nproc: Nested call core function pointer. + * @priv: Opaque data to be passed to the @nproc callback. + * @cookie: Cookie to be used to identify this nested call. + * @ctx: This instance context. + * + * Returns: Returns the code returned by the @nproc callback, or -1 if + * the maximum recursion limit has been exceeded. + */ +static int ep_call_nested(struct nested_calls *ncalls, int max_nests, + int (*nproc)(void *, void *, int), void *priv, + void *cookie, void *ctx) +{ + int error, call_nests = 0; + unsigned long flags; + struct list_head *lsthead = &ncalls->tasks_call_list; + struct nested_call_node *tncur; + struct nested_call_node tnode; + + spin_lock_irqsave(&ncalls->lock, flags); + + /* + * Try to see if the current task is already inside this wakeup call. + * We use a list here, since the population inside this set is always + * very much limited. + */ + list_for_each_entry(tncur, lsthead, llink) { + if (tncur->ctx == ctx && + (tncur->cookie == cookie || ++call_nests > max_nests)) { + /* + * Ops ... loop detected or maximum nest level reached. + * We abort this wake by breaking the cycle itself. + */ + error = -1; + goto out_unlock; + } + } + + /* Add the current task and cookie to the list */ + tnode.ctx = ctx; + tnode.cookie = cookie; + list_add(&tnode.llink, lsthead); + + spin_unlock_irqrestore(&ncalls->lock, flags); + + /* Call the nested function */ + error = (*nproc)(priv, cookie, call_nests); + + /* Remove the current task from the list */ + spin_lock_irqsave(&ncalls->lock, flags); + list_del(&tnode.llink); +out_unlock: + spin_unlock_irqrestore(&ncalls->lock, flags); + + return error; +} + +/* + * As described in commit 0ccf831cb lockdep: annotate epoll + * the use of wait queues used by epoll is done in a very controlled + * manner. Wake ups can nest inside each other, but are never done + * with the same locking. For example: + * + * dfd = socket(...); + * efd1 = epoll_create(); + * efd2 = epoll_create(); + * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); + * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); + * + * When a packet arrives to the device underneath "dfd", the net code will + * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a + * callback wakeup entry on that queue, and the wake_up() performed by the + * "dfd" net code will end up in ep_poll_callback(). At this point epoll + * (efd1) notices that it may have some event ready, so it needs to wake up + * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() + * that ends up in another wake_up(), after having checked about the + * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to + * avoid stack blasting. + * + * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle + * this special case of epoll. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + unsigned long flags; + + spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); + wake_up_locked_poll(wqueue, events); + spin_unlock_irqrestore(&wqueue->lock, flags); +} +#else +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + wake_up_poll(wqueue, events); +} +#endif + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) +{ + ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, + 1 + call_nests); + return 0; +} + +/* + * Perform a safe wake up of the poll wait list. The problem is that + * with the new callback'd wake up system, it is possible that the + * poll callback is reentered from inside the call to wake_up() done + * on the poll wait queue head. The rule is that we cannot reenter the + * wake up code from the same task more than EP_MAX_NESTS times, + * and we cannot reenter the same wait queue head at all. This will + * enable to have a hierarchy of epoll file descriptor of no more than + * EP_MAX_NESTS deep. + */ +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + int this_cpu = get_cpu_light(); + + ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, + ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); + + put_cpu_light(); +} + +static void ep_remove_wait_queue(struct eppoll_entry *pwq) +{ + wait_queue_head_t *whead; + + rcu_read_lock(); + /* If it is cleared by POLLFREE, it should be rcu-safe */ + whead = rcu_dereference(pwq->whead); + if (whead) + remove_wait_queue(whead, &pwq->wait); + rcu_read_unlock(); +} + +/* + * This function unregisters poll callbacks from the associated file + * descriptor. Must be called with "mtx" held (or "epmutex" if called from + * ep_free). + */ +static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) +{ + struct list_head *lsthead = &epi->pwqlist; + struct eppoll_entry *pwq; + + while (!list_empty(lsthead)) { + pwq = list_first_entry(lsthead, struct eppoll_entry, llink); + + list_del(&pwq->llink); + ep_remove_wait_queue(pwq); + kmem_cache_free(pwq_cache, pwq); + } +} + +/* call only when ep->mtx is held */ +static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) +{ + return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); +} + +/* call only when ep->mtx is held */ +static inline void ep_pm_stay_awake(struct epitem *epi) +{ + struct wakeup_source *ws = ep_wakeup_source(epi); + + if (ws) + __pm_stay_awake(ws); +} + +static inline bool ep_has_wakeup_source(struct epitem *epi) +{ + return rcu_access_pointer(epi->ws) ? true : false; +} + +/* call when ep->mtx cannot be held (ep_poll_callback) */ +static inline void ep_pm_stay_awake_rcu(struct epitem *epi) +{ + struct wakeup_source *ws; + + rcu_read_lock(); + ws = rcu_dereference(epi->ws); + if (ws) + __pm_stay_awake(ws); + rcu_read_unlock(); +} + +/** + * ep_scan_ready_list - Scans the ready list in a way that makes possible for + * the scan code, to call f_op->poll(). Also allows for + * O(NumReady) performance. + * + * @ep: Pointer to the epoll private data structure. + * @sproc: Pointer to the scan callback. + * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. + * @ep_locked: caller already holds ep->mtx + * + * Returns: The same integer error code returned by the @sproc callback. + */ +static int ep_scan_ready_list(struct eventpoll *ep, + int (*sproc)(struct eventpoll *, + struct list_head *, void *), + void *priv, int depth, bool ep_locked) +{ + int error, pwake = 0; + unsigned long flags; + struct epitem *epi, *nepi; + LIST_HEAD(txlist); + + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() and epoll_ctl(). + */ + + if (!ep_locked) + mutex_lock_nested(&ep->mtx, depth); + + /* + * Steal the ready list, and re-init the original one to the + * empty list. Also, set ep->ovflist to NULL so that events + * happening while looping w/out locks, are not lost. We cannot + * have the poll callback to queue directly on ep->rdllist, + * because we want the "sproc" callback to be able to do it + * in a lockless way. + */ + spin_lock_irqsave(&ep->lock, flags); + list_splice_init(&ep->rdllist, &txlist); + ep->ovflist = NULL; + spin_unlock_irqrestore(&ep->lock, flags); + + /* + * Now call the callback function. + */ + error = (*sproc)(ep, &txlist, priv); + + spin_lock_irqsave(&ep->lock, flags); + /* + * During the time we spent inside the "sproc" callback, some + * other events might have been queued by the poll callback. + * We re-insert them inside the main ready-list here. + */ + for (nepi = ep->ovflist; (epi = nepi) != NULL; + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { + /* + * We need to check if the item is already in the list. + * During the "sproc" callback execution time, items are + * queued into ->ovflist but the "txlist" might already + * contain them, and the list_splice() below takes care of them. + */ + if (!ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); + } + } + /* + * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after + * releasing the lock, events will be queued in the normal way inside + * ep->rdllist. + */ + ep->ovflist = EP_UNACTIVE_PTR; + + /* + * Quickly re-inject items left on "txlist". + */ + list_splice(&txlist, &ep->rdllist); + __pm_relax(ep->ws); + + if (!list_empty(&ep->rdllist)) { + /* + * Wake up (if active) both the eventpoll wait list and + * the ->poll() wait list (delayed after we release the lock). + */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + spin_unlock_irqrestore(&ep->lock, flags); + + if (!ep_locked) + mutex_unlock(&ep->mtx); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return error; +} + +static void epi_rcu_free(struct rcu_head *head) +{ + struct epitem *epi = container_of(head, struct epitem, rcu); + kmem_cache_free(epi_cache, epi); +} + +/* + * Removes a "struct epitem" from the eventpoll RB tree and deallocates + * all the associated resources. Must be called with "mtx" held. + */ +static int ep_remove(struct eventpoll *ep, struct epitem *epi) +{ + unsigned long flags; + struct file *file = epi->ffd.file; + + /* + * Removes poll wait queue hooks. We _have_ to do this without holding + * the "ep->lock" otherwise a deadlock might occur. This because of the + * sequence of the lock acquisition. Here we do "ep->lock" then the wait + * queue head lock when unregistering the wait queue. The wakeup callback + * will run by holding the wait queue head lock and will call our callback + * that will try to get "ep->lock". + */ + ep_unregister_pollwait(ep, epi); + + /* Remove the current item from the list of epoll hooks */ + spin_lock(&file->f_lock); + list_del_rcu(&epi->fllink); + spin_unlock(&file->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + + spin_lock_irqsave(&ep->lock, flags); + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + spin_unlock_irqrestore(&ep->lock, flags); + + wakeup_source_unregister(ep_wakeup_source(epi)); + /* + * At this point it is safe to free the eventpoll item. Use the union + * field epi->rcu, since we are trying to minimize the size of + * 'struct epitem'. The 'rbn' field is no longer in use. Protected by + * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make + * use of the rbn field. + */ + call_rcu(&epi->rcu, epi_rcu_free); + + atomic_long_dec(&ep->user->epoll_watches); + + return 0; +} + +static void ep_free(struct eventpoll *ep) +{ + struct rb_node *rbp; + struct epitem *epi; + + /* We need to release all tasks waiting for these file */ + if (waitqueue_active(&ep->poll_wait)) + ep_poll_safewake(&ep->poll_wait); + + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() while we're freeing the "struct eventpoll". + * We do not need to hold "ep->mtx" here because the epoll file + * is on the way to be removed and no one has references to it + * anymore. The only hit might come from eventpoll_release_file() but + * holding "epmutex" is sufficient here. + */ + mutex_lock(&epmutex); + + /* + * Walks through the whole tree by unregistering poll callbacks. + */ + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + + ep_unregister_pollwait(ep, epi); + cond_resched(); + } + + /* + * Walks through the whole tree by freeing each "struct epitem". At this + * point we are sure no poll callbacks will be lingering around, and also by + * holding "epmutex" we can be sure that no file cleanup code will hit + * us during this operation. So we can avoid the lock on "ep->lock". + * We do not need to lock ep->mtx, either, we only do it to prevent + * a lockdep warning. + */ + mutex_lock(&ep->mtx); + while ((rbp = rb_first(&ep->rbr)) != NULL) { + epi = rb_entry(rbp, struct epitem, rbn); + ep_remove(ep, epi); + cond_resched(); + } + mutex_unlock(&ep->mtx); + + mutex_unlock(&epmutex); + mutex_destroy(&ep->mtx); + free_uid(ep->user); + wakeup_source_unregister(ep->ws); + kfree(ep); +} + +static int ep_eventpoll_release(struct inode *inode, struct file *file) +{ + struct eventpoll *ep = file->private_data; + + if (ep) + ep_free(ep); + + return 0; +} + +static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +{ + pt->_key = epi->event.events; + + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; +} + +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) +{ + struct epitem *epi, *tmp; + poll_table pt; + + init_poll_funcptr(&pt, NULL); + + list_for_each_entry_safe(epi, tmp, head, rdllink) { + if (ep_item_poll(epi, &pt)) + return POLLIN | POLLRDNORM; + else { + /* + * Item has been dropped into the ready list by the poll + * callback, but it's not actually ready, as far as + * caller requested events goes. We can remove it here. + */ + __pm_relax(ep_wakeup_source(epi)); + list_del_init(&epi->rdllink); + } + } + + return 0; +} + +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + +struct readyevents_arg { + struct eventpoll *ep; + bool locked; +}; + +static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) +{ + struct readyevents_arg *arg = priv; + + return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, + call_nests + 1, arg->locked); +} + +static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) +{ + int pollflags; + struct eventpoll *ep = file->private_data; + struct readyevents_arg arg; + + /* + * During ep_insert() we already hold the ep->mtx for the tfile. + * Prevent re-aquisition. + */ + arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); + arg.ep = ep; + + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + + /* + * Proceed to find out if wanted events are really available inside + * the ready list. This need to be done under ep_call_nested() + * supervision, since the call to f_op->poll() done on listed files + * could re-enter here. + */ + pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, + ep_poll_readyevents_proc, &arg, ep, current); + + return pollflags != -1 ? pollflags : 0; +} + +#ifdef CONFIG_PROC_FS +static void ep_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct eventpoll *ep = f->private_data; + struct rb_node *rbp; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + struct epitem *epi = rb_entry(rbp, struct epitem, rbn); + + seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", + epi->ffd.fd, epi->event.events, + (long long)epi->event.data); + if (seq_has_overflowed(m)) + break; + } + mutex_unlock(&ep->mtx); +} +#endif + +/* File callbacks that implement the eventpoll file behaviour */ +static const struct file_operations eventpoll_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = ep_show_fdinfo, +#endif + .release = ep_eventpoll_release, + .poll = ep_eventpoll_poll, + .llseek = noop_llseek, +}; + +/* + * This is called from eventpoll_release() to unlink files from the eventpoll + * interface. We need to have this facility to cleanup correctly files that are + * closed without being removed from the eventpoll interface. + */ +void eventpoll_release_file(struct file *file) +{ + struct eventpoll *ep; + struct epitem *epi, *next; + + /* + * We don't want to get "file->f_lock" because it is not + * necessary. It is not necessary because we're in the "struct file" + * cleanup path, and this means that no one is using this file anymore. + * So, for example, epoll_ctl() cannot hit here since if we reach this + * point, the file counter already went to zero and fget() would fail. + * The only hit might come from ep_free() but by holding the mutex + * will correctly serialize the operation. We do need to acquire + * "ep->mtx" after "epmutex" because ep_remove() requires it when called + * from anywhere but ep_free(). + * + * Besides, ep_remove() acquires the lock, so we can't hold it here. + */ + mutex_lock(&epmutex); + list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { + ep = epi->ep; + mutex_lock_nested(&ep->mtx, 0); + ep_remove(ep, epi); + mutex_unlock(&ep->mtx); + } + mutex_unlock(&epmutex); +} + +static int ep_alloc(struct eventpoll **pep) +{ + int error; + struct user_struct *user; + struct eventpoll *ep; + + user = get_current_user(); + error = -ENOMEM; + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (unlikely(!ep)) + goto free_uid; + + spin_lock_init(&ep->lock); + mutex_init(&ep->mtx); + init_waitqueue_head(&ep->wq); + init_waitqueue_head(&ep->poll_wait); + INIT_LIST_HEAD(&ep->rdllist); + ep->rbr = RB_ROOT; + ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; + + *pep = ep; + + return 0; + +free_uid: + free_uid(user); + return error; +} + +/* + * Search the file inside the eventpoll tree. The RB tree operations + * are protected by the "mtx" mutex, and ep_find() must be called with + * "mtx" held. + */ +static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) +{ + int kcmp; + struct rb_node *rbp; + struct epitem *epi, *epir = NULL; + struct epoll_filefd ffd; + + ep_set_ffd(&ffd, file, fd); + for (rbp = ep->rbr.rb_node; rbp; ) { + epi = rb_entry(rbp, struct epitem, rbn); + kcmp = ep_cmp_ffd(&ffd, &epi->ffd); + if (kcmp > 0) + rbp = rbp->rb_right; + else if (kcmp < 0) + rbp = rbp->rb_left; + else { + epir = epi; + break; + } + } + + return epir; +} + +/* + * This is the callback that is passed to the wait queue wakeup + * mechanism. It is called by the stored file descriptors when they + * have events to report. + */ +static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int pwake = 0; + unsigned long flags; + struct epitem *epi = ep_item_from_wait(wait); + struct eventpoll *ep = epi->ep; + + if ((unsigned long)key & POLLFREE) { + ep_pwq_from_wait(wait)->whead = NULL; + /* + * whead = NULL above can race with ep_remove_wait_queue() + * which can do another remove_wait_queue() after us, so we + * can't use __remove_wait_queue(). whead->lock is held by + * the caller. + */ + list_del_init(&wait->task_list); + } + + spin_lock_irqsave(&ep->lock, flags); + + /* + * If the event mask does not contain any poll(2) event, we consider the + * descriptor to be disabled. This condition is likely the effect of the + * EPOLLONESHOT bit that disables the descriptor when an event is received, + * until the next EPOLL_CTL_MOD will be issued. + */ + if (!(epi->event.events & ~EP_PRIVATE_BITS)) + goto out_unlock; + + /* + * Check the events coming with the callback. At this stage, not + * every device reports the events in the "key" parameter of the + * callback. We need to be able to handle both cases here, hence the + * test for "key" != NULL before the event match test. + */ + if (key && !((unsigned long) key & epi->event.events)) + goto out_unlock; + + /* + * If we are transferring events to userspace, we can hold no locks + * (because we're accessing user memory, and because of linux f_op->poll() + * semantics). All the events that happen during that period of time are + * chained in ep->ovflist and requeued later on. + */ + if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = ep->ovflist; + ep->ovflist = epi; + if (epi->ws) { + /* + * Activate ep->ws since epi->ws may get + * deactivated at any time. + */ + __pm_stay_awake(ep->ws); + } + + } + goto out_unlock; + } + + /* If this file is already in the ready list we exit soon */ + if (!ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake_rcu(epi); + } + + /* + * Wake up ( if active ) both the eventpoll wait list and the ->poll() + * wait list. + */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + +out_unlock: + spin_unlock_irqrestore(&ep->lock, flags); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 1; +} + +/* + * This is the callback that is used to add our wait queue to the + * target file wakeup lists. + */ +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt) +{ + struct epitem *epi = ep_item_from_epqueue(pt); + struct eppoll_entry *pwq; + + if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { + init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); + pwq->whead = whead; + pwq->base = epi; + add_wait_queue(whead, &pwq->wait); + list_add_tail(&pwq->llink, &epi->pwqlist); + epi->nwait++; + } else { + /* We have to signal that an error occurred */ + epi->nwait = -1; + } +} + +static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) +{ + int kcmp; + struct rb_node **p = &ep->rbr.rb_node, *parent = NULL; + struct epitem *epic; + + while (*p) { + parent = *p; + epic = rb_entry(parent, struct epitem, rbn); + kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); + if (kcmp > 0) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&epi->rbn, parent, p); + rb_insert_color(&epi->rbn, &ep->rbr); +} + + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + /* Allow an arbitrary number of depth 1 paths */ + if (nests == 0) + return 0; + + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + /* CTL_DEL can remove links here, but that can't increase our count */ + rcu_read_lock(); + list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + rcu_read_unlock(); + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. + */ +static int reverse_path_check(void) +{ + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + +static int ep_create_wakeup_source(struct epitem *epi) +{ + const char *name; + struct wakeup_source *ws; + + if (!epi->ep->ws) { + epi->ep->ws = wakeup_source_register("eventpoll"); + if (!epi->ep->ws) + return -ENOMEM; + } + + name = epi->ffd.file->f_path.dentry->d_name.name; + ws = wakeup_source_register(name); + + if (!ws) + return -ENOMEM; + rcu_assign_pointer(epi->ws, ws); + + return 0; +} + +/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ +static noinline void ep_destroy_wakeup_source(struct epitem *epi) +{ + struct wakeup_source *ws = ep_wakeup_source(epi); + + RCU_INIT_POINTER(epi->ws, NULL); + + /* + * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is + * used internally by wakeup_source_remove, too (called by + * wakeup_source_unregister), so we cannot use call_rcu + */ + synchronize_rcu(); + wakeup_source_unregister(ws); +} + +/* + * Must be called with "mtx" held. + */ +static int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd, int full_check) +{ + int error, revents, pwake = 0; + unsigned long flags; + long user_watches; + struct epitem *epi; + struct ep_pqueue epq; + + user_watches = atomic_long_read(&ep->user->epoll_watches); + if (unlikely(user_watches >= max_user_watches)) + return -ENOSPC; + if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) + return -ENOMEM; + + /* Item initialization follow here ... */ + INIT_LIST_HEAD(&epi->rdllink); + INIT_LIST_HEAD(&epi->fllink); + INIT_LIST_HEAD(&epi->pwqlist); + epi->ep = ep; + ep_set_ffd(&epi->ffd, tfile, fd); + epi->event = *event; + epi->nwait = 0; + epi->next = EP_UNACTIVE_PTR; + if (epi->event.events & EPOLLWAKEUP) { + error = ep_create_wakeup_source(epi); + if (error) + goto error_create_wakeup_source; + } else { + RCU_INIT_POINTER(epi->ws, NULL); + } + + /* Initialize the poll table using the queue callback */ + epq.epi = epi; + init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); + + /* + * Attach the item to the poll hooks and get current event bits. + * We can safely use the file* here because its usage count has + * been increased by the caller of this function. Note that after + * this operation completes, the poll callback can start hitting + * the new item. + */ + revents = ep_item_poll(epi, &epq.pt); + + /* + * We have to check if something went wrong during the poll wait queue + * install process. Namely an allocation for a wait queue failed due + * high memory pressure. + */ + error = -ENOMEM; + if (epi->nwait < 0) + goto error_unregister; + + /* Add the current item to the list of active epoll hook for this file */ + spin_lock(&tfile->f_lock); + list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); + spin_unlock(&tfile->f_lock); + + /* + * Add the current item to the RB tree. All RB tree operations are + * protected by "mtx", and ep_insert() is called with "mtx" held. + */ + ep_rbtree_insert(ep, epi); + + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (full_check && reverse_path_check()) + goto error_remove_epi; + + /* We have to drop the new item inside our item list to keep track of it */ + spin_lock_irqsave(&ep->lock, flags); + + /* If the file is already "ready" we drop it inside the ready list */ + if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); + + /* Notify waiting tasks that events are available */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + + spin_unlock_irqrestore(&ep->lock, flags); + + atomic_long_inc(&ep->user->epoll_watches); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 0; + +error_remove_epi: + spin_lock(&tfile->f_lock); + list_del_rcu(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + +error_unregister: + ep_unregister_pollwait(ep, epi); + + /* + * We need to do this because an event could have been arrived on some + * allocated wait queue. Note that we don't care about the ep->ovflist + * list, since that is used/cleaned only inside a section bound by "mtx". + * And ep_insert() is called with "mtx" held. + */ + spin_lock_irqsave(&ep->lock, flags); + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + spin_unlock_irqrestore(&ep->lock, flags); + + wakeup_source_unregister(ep_wakeup_source(epi)); + +error_create_wakeup_source: + kmem_cache_free(epi_cache, epi); + + return error; +} + +/* + * Modify the interest event mask by dropping an event if the new mask + * has a match in the current file status. Must be called with "mtx" held. + */ +static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event) +{ + int pwake = 0; + unsigned int revents; + poll_table pt; + + init_poll_funcptr(&pt, NULL); + + /* + * Set the new event interest mask before calling f_op->poll(); + * otherwise we might miss an event that happens between the + * f_op->poll() call and the new event set registering. + */ + epi->event.events = event->events; /* need barrier below */ + epi->event.data = event->data; /* protected by mtx */ + if (epi->event.events & EPOLLWAKEUP) { + if (!ep_has_wakeup_source(epi)) + ep_create_wakeup_source(epi); + } else if (ep_has_wakeup_source(epi)) { + ep_destroy_wakeup_source(epi); + } + + /* + * The following barrier has two effects: + * + * 1) Flush epi changes above to other CPUs. This ensures + * we do not miss events from ep_poll_callback if an + * event occurs immediately after we call f_op->poll(). + * We need this because we did not take ep->lock while + * changing epi above (but ep_poll_callback does take + * ep->lock). + * + * 2) We also need to ensure we do not miss _past_ events + * when calling f_op->poll(). This barrier also + * pairs with the barrier in wq_has_sleeper (see + * comments for wq_has_sleeper). + * + * This barrier will now guarantee ep_poll_callback or f_op->poll + * (or both) will notice the readiness of an item. + */ + smp_mb(); + + /* + * Get current event bits. We can safely use the file* here because + * its usage count has been increased by the caller of this function. + */ + revents = ep_item_poll(epi, &pt); + + /* + * If the item is "hot" and it is not registered inside the ready + * list, push it inside. + */ + if (revents & event->events) { + spin_lock_irq(&ep->lock); + if (!ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); + + /* Notify waiting tasks that events are available */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + spin_unlock_irq(&ep->lock); + } + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 0; +} + +static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) +{ + struct ep_send_events_data *esed = priv; + int eventcnt; + unsigned int revents; + struct epitem *epi; + struct epoll_event __user *uevent; + struct wakeup_source *ws; + poll_table pt; + + init_poll_funcptr(&pt, NULL); + + /* + * We can loop without lock because we are passed a task private list. + * Items cannot vanish during the loop because ep_scan_ready_list() is + * holding "mtx" during this call. + */ + for (eventcnt = 0, uevent = esed->events; + !list_empty(head) && eventcnt < esed->maxevents;) { + epi = list_first_entry(head, struct epitem, rdllink); + + /* + * Activate ep->ws before deactivating epi->ws to prevent + * triggering auto-suspend here (in case we reactive epi->ws + * below). + * + * This could be rearranged to delay the deactivation of epi->ws + * instead, but then epi->ws would temporarily be out of sync + * with ep_is_linked(). + */ + ws = ep_wakeup_source(epi); + if (ws) { + if (ws->active) + __pm_stay_awake(ep->ws); + __pm_relax(ws); + } + + list_del_init(&epi->rdllink); + + revents = ep_item_poll(epi, &pt); + + /* + * If the event mask intersect the caller-requested one, + * deliver the event to userspace. Again, ep_scan_ready_list() + * is holding "mtx", so no operations coming from userspace + * can change the item. + */ + if (revents) { + if (__put_user(revents, &uevent->events) || + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); + ep_pm_stay_awake(epi); + return eventcnt ? eventcnt : -EFAULT; + } + eventcnt++; + uevent++; + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, no one can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. + */ + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); + } + } + } + + return eventcnt; +} + +static int ep_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + struct ep_send_events_data esed; + + esed.maxevents = maxevents; + esed.events = events; + + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); +} + +static inline struct timespec ep_set_mstimeout(long ms) +{ + struct timespec now, ts = { + .tv_sec = ms / MSEC_PER_SEC, + .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), + }; + + ktime_get_ts(&now); + return timespec_add_safe(now, ts); +} + +/** + * ep_poll - Retrieves ready events, and delivers them to the caller supplied + * event buffer. + * + * @ep: Pointer to the eventpoll context. + * @events: Pointer to the userspace buffer where the ready events should be + * stored. + * @maxevents: Size (in terms of number of events) of the caller event buffer. + * @timeout: Maximum timeout for the ready events fetch operation, in + * milliseconds. If the @timeout is zero, the function will not block, + * while if the @timeout is less than zero, the function will block + * until at least one event has been retrieved (or an error + * occurred). + * + * Returns: Returns the number of ready events which have been fetched, or an + * error code, in case of error. + */ +static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, + int maxevents, long timeout) +{ + int res = 0, eavail, timed_out = 0; + unsigned long flags; + long slack = 0; + wait_queue_t wait; + ktime_t expires, *to = NULL; + + if (timeout > 0) { + struct timespec end_time = ep_set_mstimeout(timeout); + + slack = select_estimate_accuracy(&end_time); + to = &expires; + *to = timespec_to_ktime(end_time); + } else if (timeout == 0) { + /* + * Avoid the unnecessary trip to the wait queue loop, if the + * caller specified a non blocking operation. + */ + timed_out = 1; + spin_lock_irqsave(&ep->lock, flags); + goto check_events; + } + +fetch_events: + spin_lock_irqsave(&ep->lock, flags); + + if (!ep_events_available(ep)) { + /* + * We don't have any available event to return to the caller. + * We need to sleep here, and we will be wake up by + * ep_poll_callback() when events will become available. + */ + init_waitqueue_entry(&wait, current); + __add_wait_queue_exclusive(&ep->wq, &wait); + + for (;;) { + /* + * We don't want to sleep if the ep_poll_callback() sends us + * a wakeup in between. That's why we set the task state + * to TASK_INTERRUPTIBLE before doing the checks. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (ep_events_available(ep) || timed_out) + break; + if (signal_pending(current)) { + res = -EINTR; + break; + } + + spin_unlock_irqrestore(&ep->lock, flags); + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; + + spin_lock_irqsave(&ep->lock, flags); + } + + __remove_wait_queue(&ep->wq, &wait); + __set_current_state(TASK_RUNNING); + } +check_events: + /* Is it worth to try to dig for events ? */ + eavail = ep_events_available(ep); + + spin_unlock_irqrestore(&ep->lock, flags); + + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of + * more luck. + */ + if (!res && eavail && + !(res = ep_send_events(ep, events, maxevents)) && !timed_out) + goto fetch_events; + + return res; +} + +/** + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() + * API, to verify that adding an epoll file inside another + * epoll structure, does not violate the constraints, in + * terms of closed loops, or too deep chains (which can + * result in excessive stack usage). + * + * @priv: Pointer to the epoll file to be currently checked. + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll + * data structure pointer. + * @call_nests: Current dept of the @ep_call_nested() call stack. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); + if (error != 0) + break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); + } + } + mutex_unlock(&ep->mtx); + + return error; +} + +/** + * ep_loop_check - Performs a check to verify that adding an epoll file (@file) + * another epoll file (represented by @ep) does not create + * closed loops or too deep chains. + * + * @ep: Pointer to the epoll private data structure. + * @file: Pointer to the epoll file to be checked. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check(struct eventpoll *ep, struct file *file) +{ + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); +} + +/* + * Open an eventpoll file descriptor. + */ +SYSCALL_DEFINE1(epoll_create1, int, flags) +{ + int error, fd; + struct eventpoll *ep = NULL; + struct file *file; + + /* Check the EPOLL_* constant for consistency. */ + BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); + + if (flags & ~EPOLL_CLOEXEC) + return -EINVAL; + /* + * Create the internal data structure ("struct eventpoll"). + */ + error = ep_alloc(&ep); + if (error < 0) + return error; + /* + * Creates all the items needed to setup an eventpoll file. That is, + * a file structure and a free file descriptor. + */ + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, + O_RDWR | (flags & O_CLOEXEC)); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + ep->file = file; + fd_install(fd, file); + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); + return error; +} + +SYSCALL_DEFINE1(epoll_create, int, size) +{ + if (size <= 0) + return -EINVAL; + + return sys_epoll_create1(0); +} + +/* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + int error; + int full_check = 0; + struct fd f, tf; + struct eventpoll *ep; + struct epitem *epi; + struct epoll_event epds; + struct eventpoll *tep = NULL; + + error = -EFAULT; + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + goto error_return; + + error = -EBADF; + f = fdget(epfd); + if (!f.file) + goto error_return; + + /* Get the "struct file *" for the target file */ + tf = fdget(fd); + if (!tf.file) + goto error_fput; + + /* The target file descriptor must support poll */ + error = -EPERM; + if (!tf.file->f_op->poll) + goto error_tgt_fput; + + /* Check if EPOLLWAKEUP is allowed */ + if (ep_op_has_event(op)) + ep_take_care_of_epollwakeup(&epds); + + /* + * We have to check that the file structure underneath the file descriptor + * the user passed to us _is_ an eventpoll file. And also we do not permit + * adding an epoll file descriptor inside itself. + */ + error = -EINVAL; + if (f.file == tf.file || !is_file_epoll(f.file)) + goto error_tgt_fput; + + /* + * At this point it is safe to assume that the "private_data" contains + * our own data structure. + */ + ep = f.file->private_data; + + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. + * + * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when + * the epoll file descriptor is attaching directly to a wakeup source, + * unless the epoll file descriptor is nested. The purpose of taking the + * 'epmutex' on add is to prevent complex toplogies such as loops and + * deep wakeup paths from forming in parallel through multiple + * EPOLL_CTL_ADD operations. + */ + mutex_lock_nested(&ep->mtx, 0); + if (op == EPOLL_CTL_ADD) { + if (!list_empty(&f.file->f_ep_links) || + is_file_epoll(tf.file)) { + full_check = 1; + mutex_unlock(&ep->mtx); + mutex_lock(&epmutex); + if (is_file_epoll(tf.file)) { + error = -ELOOP; + if (ep_loop_check(ep, tf.file) != 0) { + clear_tfile_check_list(); + goto error_tgt_fput; + } + } else + list_add(&tf.file->f_tfile_llink, + &tfile_check_list); + mutex_lock_nested(&ep->mtx, 0); + if (is_file_epoll(tf.file)) { + tep = tf.file->private_data; + mutex_lock_nested(&tep->mtx, 1); + } + } + } + + /* + * Try to lookup the file inside our RB tree, Since we grabbed "mtx" + * above, we can be sure to be able to use the item looked up by + * ep_find() till we release the mutex. + */ + epi = ep_find(ep, tf.file, fd); + + error = -EINVAL; + switch (op) { + case EPOLL_CTL_ADD: + if (!epi) { + epds.events |= POLLERR | POLLHUP; + error = ep_insert(ep, &epds, tf.file, fd, full_check); + } else + error = -EEXIST; + if (full_check) + clear_tfile_check_list(); + break; + case EPOLL_CTL_DEL: + if (epi) + error = ep_remove(ep, epi); + else + error = -ENOENT; + break; + case EPOLL_CTL_MOD: + if (epi) { + epds.events |= POLLERR | POLLHUP; + error = ep_modify(ep, epi, &epds); + } else + error = -ENOENT; + break; + } + if (tep != NULL) + mutex_unlock(&tep->mtx); + mutex_unlock(&ep->mtx); + +error_tgt_fput: + if (full_check) + mutex_unlock(&epmutex); + + fdput(tf); +error_fput: + fdput(f); +error_return: + + return error; +} + +/* + * Implement the event wait interface for the eventpoll file. It is the kernel + * part of the user space epoll_wait(2). + */ +SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout) +{ + int error; + struct fd f; + struct eventpoll *ep; + + /* The maximum number of event must be greater than zero */ + if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) + return -EINVAL; + + /* Verify that the area passed by the user is writeable */ + if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) + return -EFAULT; + + /* Get the "struct file *" for the eventpoll file */ + f = fdget(epfd); + if (!f.file) + return -EBADF; + + /* + * We have to check that the file structure underneath the fd + * the user passed to us _is_ an eventpoll file. + */ + error = -EINVAL; + if (!is_file_epoll(f.file)) + goto error_fput; + + /* + * At this point it is safe to assume that the "private_data" contains + * our own data structure. + */ + ep = f.file->private_data; + + /* Time to fish for events ... */ + error = ep_poll(ep, events, maxevents, timeout); + +error_fput: + fdput(f); + return error; +} + +/* + * Implement the event wait interface for the eventpoll file. It is the kernel + * part of the user space epoll_pwait(2). + */ +SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout, const sigset_t __user *, sigmask, + size_t, sigsetsize) +{ + int error; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. + */ + if (sigmask) { + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) + return -EFAULT; + sigsaved = current->blocked; + set_current_blocked(&ksigmask); + } + + error = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. + * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. + */ + if (sigmask) { + if (error == -EINTR) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + set_current_blocked(&sigsaved); + } + + return error; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, + struct epoll_event __user *, events, + int, maxevents, int, timeout, + const compat_sigset_t __user *, sigmask, + compat_size_t, sigsetsize) +{ + long err; + compat_sigset_t csigmask; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. + */ + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + return -EFAULT; + sigset_from_compat(&ksigmask, &csigmask); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); + } + + err = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. + * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. + */ + if (sigmask) { + if (err == -EINTR) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + set_current_blocked(&sigsaved); + } + + return err; +} +#endif + +static int __init eventpoll_init(void) +{ + struct sysinfo si; + + si_meminfo(&si); + /* + * Allows top 4% of lomem to be allocated for epoll watches (per user). + */ + max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / + EP_ITEM_COST; + BUG_ON(max_user_watches < 0); + + /* + * Initialize the structure used to perform epoll file descriptor + * inclusion loops checks. + */ + ep_nested_calls_init(&poll_loop_ncalls); + + /* Initialize the structure used to perform safe poll wait head wake ups */ + ep_nested_calls_init(&poll_safewake_ncalls); + + /* Initialize the structure used to perform file's f_op->poll() calls */ + ep_nested_calls_init(&poll_readywalk_ncalls); + + /* + * We can have many thousands of epitems, so prevent this from + * using an extra cache line on 64-bit (and smaller) CPUs + */ + BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); + + /* Allocates slab cache used to allocate "struct epitem" items */ + epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + /* Allocates slab cache used to allocate "struct eppoll_entry" */ + pwq_cache = kmem_cache_create("eventpoll_pwq", + sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + + return 0; +} +fs_initcall(eventpoll_init); diff --git a/kernel/fs/exec.c b/kernel/fs/exec.c new file mode 100644 index 000000000..0e7125be0 --- /dev/null +++ b/kernel/fs/exec.c @@ -0,0 +1,1747 @@ +/* + * linux/fs/exec.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * #!-checking implemented by tytso. + */ +/* + * Demand-loading implemented 01.12.91 - no need to read anything but + * the header into memory. The inode of the executable is put into + * "current->executable", and page faults do the actual loading. Clean. + * + * Once more I can proudly say that linux stood up to being changed: it + * was less than 2 hours work to get demand-loading completely implemented. + * + * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, + * current->executable is only used by the procfs. This allows a dispatch + * table to check for several different types of binary formats. We keep + * trying until we recognize the file or we run out of supported binary + * formats. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "internal.h" + +#include + +int suid_dumpable = 0; + +static LIST_HEAD(formats); +static DEFINE_RWLOCK(binfmt_lock); + +void __register_binfmt(struct linux_binfmt * fmt, int insert) +{ + BUG_ON(!fmt); + if (WARN_ON(!fmt->load_binary)) + return; + write_lock(&binfmt_lock); + insert ? list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); + write_unlock(&binfmt_lock); +} + +EXPORT_SYMBOL(__register_binfmt); + +void unregister_binfmt(struct linux_binfmt * fmt) +{ + write_lock(&binfmt_lock); + list_del(&fmt->lh); + write_unlock(&binfmt_lock); +} + +EXPORT_SYMBOL(unregister_binfmt); + +static inline void put_binfmt(struct linux_binfmt * fmt) +{ + module_put(fmt->module); +} + +#ifdef CONFIG_USELIB +/* + * Note that a shared library must be both readable and executable due to + * security reasons. + * + * Also note that we take the address to load from from the file itself. + */ +SYSCALL_DEFINE1(uselib, const char __user *, library) +{ + struct linux_binfmt *fmt; + struct file *file; + struct filename *tmp = getname(library); + int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN, + .lookup_flags = LOOKUP_FOLLOW, + }; + + if (IS_ERR(tmp)) + goto out; + + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); + putname(tmp); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + error = -EINVAL; + if (!S_ISREG(file_inode(file)->i_mode)) + goto exit; + + error = -EACCES; + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + fsnotify_open(file); + + error = -ENOEXEC; + + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!fmt->load_shlib) + continue; + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + error = fmt->load_shlib(file); + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (error != -ENOEXEC) + break; + } + read_unlock(&binfmt_lock); +exit: + fput(file); +out: + return error; +} +#endif /* #ifdef CONFIG_USELIB */ + +#ifdef CONFIG_MMU +/* + * The nascent bprm->mm is not visible until exec_mmap() but it can + * use a lot of memory, account these pages in current->mm temporary + * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we + * change the counter back via acct_arg_size(0). + */ +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + add_mm_counter(mm, MM_ANONPAGES, diff); +} + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + int ret; + +#ifdef CONFIG_STACK_GROWSUP + if (write) { + ret = expand_downwards(bprm->vma, pos); + if (ret < 0) + return NULL; + } +#endif + ret = get_user_pages(current, bprm->mm, pos, + 1, write, 1, &page, NULL); + if (ret <= 0) + return NULL; + + if (write) { + unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + struct rlimit *rlim; + + acct_arg_size(bprm, size / PAGE_SIZE); + + /* + * We've historically supported up to 32 pages (ARG_MAX) + * of argument strings even with small stacks + */ + if (size <= ARG_MAX) + return page; + + /* + * Limit to 1/4-th the stack size for the argv+env strings. + * This ensures that: + * - the remaining binfmt code will not run out of stack space, + * - the program will have a reasonable amount of stack left + * to work from. + */ + rlim = current->signal->rlim; + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { + put_page(page); + return NULL; + } + } + + return page; +} + +static void put_arg_page(struct page *page) +{ + put_page(page); +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ + flush_cache_page(bprm->vma, pos, page_to_pfn(page)); +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct vm_area_struct *vma = NULL; + struct mm_struct *mm = bprm->mm; + + bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + return -ENOMEM; + + down_write(&mm->mmap_sem); + vma->vm_mm = mm; + + /* + * Place the stack at the largest stack address the architecture + * supports. Later, we'll move this to an appropriate place. We don't + * use STACK_TOP because that can depend on attributes which aren't + * configured yet. + */ + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + INIT_LIST_HEAD(&vma->anon_vma_chain); + + err = insert_vm_struct(mm, vma); + if (err) + goto err; + + mm->stack_vm = mm->total_vm = 1; + arch_bprm_mm_init(mm, vma); + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); + return 0; +err: + up_write(&mm->mmap_sem); + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); + return err; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= MAX_ARG_STRLEN; +} + +#else + +static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + + page = bprm->page[pos / PAGE_SIZE]; + if (!page && write) { + page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); + if (!page) + return NULL; + bprm->page[pos / PAGE_SIZE] = page; + } + + return page; +} + +static void put_arg_page(struct page *page) +{ +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ + if (bprm->page[i]) { + __free_page(bprm->page[i]); + bprm->page[i] = NULL; + } +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ + int i; + + for (i = 0; i < MAX_ARG_PAGES; i++) + free_arg_page(bprm, i); +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); + return 0; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= bprm->p; +} + +#endif /* CONFIG_MMU */ + +/* + * Create a new mm_struct and populate it with a temporary stack + * vm_area_struct. We don't have enough context at this point to set the stack + * flags, permissions, and offset, so we use temporary values. We'll update + * them later in setup_arg_pages(). + */ +static int bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct mm_struct *mm = NULL; + + bprm->mm = mm = mm_alloc(); + err = -ENOMEM; + if (!mm) + goto err; + + err = __bprm_mm_init(bprm); + if (err) + goto err; + + return 0; + +err: + if (mm) { + bprm->mm = NULL; + mmdrop(mm); + } + + return err; +} + +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + const compat_uptr_t __user *compat; +#endif + } ptr; +}; + +static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) +{ + const char __user *native; + +#ifdef CONFIG_COMPAT + if (unlikely(argv.is_compat)) { + compat_uptr_t compat; + + if (get_user(compat, argv.ptr.compat + nr)) + return ERR_PTR(-EFAULT); + + return compat_ptr(compat); + } +#endif + + if (get_user(native, argv.ptr.native + nr)) + return ERR_PTR(-EFAULT); + + return native; +} + +/* + * count() counts the number of strings in array ARGV. + */ +static int count(struct user_arg_ptr argv, int max) +{ + int i = 0; + + if (argv.ptr.native != NULL) { + for (;;) { + const char __user *p = get_user_arg_ptr(argv, i); + + if (!p) + break; + + if (IS_ERR(p)) + return -EFAULT; + + if (i >= max) + return -E2BIG; + ++i; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); + } + } + return i; +} + +/* + * 'copy_strings()' copies argument/environment strings from the old + * processes's memory to the new process's stack. The call to get_user_pages() + * ensures the destination page is created and not swapped out. + */ +static int copy_strings(int argc, struct user_arg_ptr argv, + struct linux_binprm *bprm) +{ + struct page *kmapped_page = NULL; + char *kaddr = NULL; + unsigned long kpos = 0; + int ret; + + while (argc-- > 0) { + const char __user *str; + int len; + unsigned long pos; + + ret = -EFAULT; + str = get_user_arg_ptr(argv, argc); + if (IS_ERR(str)) + goto out; + + len = strnlen_user(str, MAX_ARG_STRLEN); + if (!len) + goto out; + + ret = -E2BIG; + if (!valid_arg_len(bprm, len)) + goto out; + + /* We're going to work our way backwords. */ + pos = bprm->p; + str += len; + bprm->p -= len; + + while (len > 0) { + int offset, bytes_to_copy; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + + offset = pos % PAGE_SIZE; + if (offset == 0) + offset = PAGE_SIZE; + + bytes_to_copy = offset; + if (bytes_to_copy > len) + bytes_to_copy = len; + + offset -= bytes_to_copy; + pos -= bytes_to_copy; + str -= bytes_to_copy; + len -= bytes_to_copy; + + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + + page = get_arg_page(bprm, pos, 1); + if (!page) { + ret = -E2BIG; + goto out; + } + + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); + kunmap(kmapped_page); + put_arg_page(kmapped_page); + } + kmapped_page = page; + kaddr = kmap(kmapped_page); + kpos = pos & PAGE_MASK; + flush_arg_page(bprm, kpos, kmapped_page); + } + if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { + ret = -EFAULT; + goto out; + } + } + } + ret = 0; +out: + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); + kunmap(kmapped_page); + put_arg_page(kmapped_page); + } + return ret; +} + +/* + * Like copy_strings, but get argv and its values from kernel memory. + */ +int copy_strings_kernel(int argc, const char *const *__argv, + struct linux_binprm *bprm) +{ + int r; + mm_segment_t oldfs = get_fs(); + struct user_arg_ptr argv = { + .ptr.native = (const char __user *const __user *)__argv, + }; + + set_fs(KERNEL_DS); + r = copy_strings(argc, argv, bprm); + set_fs(oldfs); + + return r; +} +EXPORT_SYMBOL(copy_strings_kernel); + +#ifdef CONFIG_MMU + +/* + * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once + * the binfmt code determines where the new stack should reside, we shift it to + * its final location. The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. + */ +static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + struct mmu_gather tlb; + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != find_vma(mm, new_start)) + return -EFAULT; + + /* + * cover the whole range: [new_start, old_end) + */ + if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + if (length != move_page_tables(vma, old_start, + vma, new_start, length, false)) + return -ENOMEM; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, old_start, old_end); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + } + tlb_finish_mmu(&tlb, old_start, old_end); + + /* + * Shrink the vma to just the new range. Always succeeds. + */ + vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); + + return 0; +} + +/* + * Finalizes the stack vm_area_struct. The flags and permissions are updated, + * the stack is optionally relocated, and some extra space is added. + */ +int setup_arg_pages(struct linux_binprm *bprm, + unsigned long stack_top, + int executable_stack) +{ + unsigned long ret; + unsigned long stack_shift; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = bprm->vma; + struct vm_area_struct *prev = NULL; + unsigned long vm_flags; + unsigned long stack_base; + unsigned long stack_size; + unsigned long stack_expand; + unsigned long rlim_stack; + +#ifdef CONFIG_STACK_GROWSUP + /* Limit stack size */ + stack_base = rlimit_max(RLIMIT_STACK); + if (stack_base > STACK_SIZE_MAX) + stack_base = STACK_SIZE_MAX; + + /* Add space for stack randomization. */ + stack_base += (STACK_RND_MASK << PAGE_SHIFT); + + /* Make sure we didn't let the argument array grow too large. */ + if (vma->vm_end - vma->vm_start > stack_base) + return -ENOMEM; + + stack_base = PAGE_ALIGN(stack_top - stack_base); + + stack_shift = vma->vm_start - stack_base; + mm->arg_start = bprm->p - stack_shift; + bprm->p = vma->vm_end - stack_shift; +#else + stack_top = arch_align_stack(stack_top); + stack_top = PAGE_ALIGN(stack_top); + + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; + + stack_shift = vma->vm_end - stack_top; + + bprm->p -= stack_shift; + mm->arg_start = bprm->p; +#endif + + if (bprm->loader) + bprm->loader -= stack_shift; + bprm->exec -= stack_shift; + + down_write(&mm->mmap_sem); + vm_flags = VM_STACK_FLAGS; + + /* + * Adjust stack execute permissions; explicitly enable for + * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone + * (arch default) otherwise. + */ + if (unlikely(executable_stack == EXSTACK_ENABLE_X)) + vm_flags |= VM_EXEC; + else if (executable_stack == EXSTACK_DISABLE_X) + vm_flags &= ~VM_EXEC; + vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; + + ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + vm_flags); + if (ret) + goto out_unlock; + BUG_ON(prev != vma); + + /* Move stack pages down in memory. */ + if (stack_shift) { + ret = shift_arg_pages(vma, stack_shift); + if (ret) + goto out_unlock; + } + + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ + stack_size = vma->vm_end - vma->vm_start; + /* + * Align this down to a page boundary as expand_stack + * will align it up. + */ + rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; +#ifdef CONFIG_STACK_GROWSUP + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_start + rlim_stack; + else + stack_base = vma->vm_end + stack_expand; +#else + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_end - rlim_stack; + else + stack_base = vma->vm_start - stack_expand; +#endif + current->mm->start_stack = bprm->p; + ret = expand_stack(vma, stack_base); + if (ret) + ret = -EFAULT; + +out_unlock: + up_write(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(setup_arg_pages); + +#endif /* CONFIG_MMU */ + +static struct file *do_open_execat(int fd, struct filename *name, int flags) +{ + struct file *file; + int err; + struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN, + .lookup_flags = LOOKUP_FOLLOW, + }; + + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return ERR_PTR(-EINVAL); + if (flags & AT_SYMLINK_NOFOLLOW) + open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + open_exec_flags.lookup_flags |= LOOKUP_EMPTY; + + file = do_filp_open(fd, name, &open_exec_flags); + if (IS_ERR(file)) + goto out; + + err = -EACCES; + if (!S_ISREG(file_inode(file)->i_mode)) + goto exit; + + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + err = deny_write_access(file); + if (err) + goto exit; + + if (name->name[0] != '\0') + fsnotify_open(file); + +out: + return file; + +exit: + fput(file); + return ERR_PTR(err); +} + +struct file *open_exec(const char *name) +{ + struct filename *filename = getname_kernel(name); + struct file *f = ERR_CAST(filename); + + if (!IS_ERR(filename)) { + f = do_open_execat(AT_FDCWD, filename, 0); + putname(filename); + } + return f; +} +EXPORT_SYMBOL(open_exec); + +int kernel_read(struct file *file, loff_t offset, + char *addr, unsigned long count) +{ + mm_segment_t old_fs; + loff_t pos = offset; + int result; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + result = vfs_read(file, (void __user *)addr, count, &pos); + set_fs(old_fs); + return result; +} + +EXPORT_SYMBOL(kernel_read); + +ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) +{ + ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); + if (res > 0) + flush_icache_range(addr, addr + len); + return res; +} +EXPORT_SYMBOL(read_code); + +static int exec_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk; + struct mm_struct *old_mm, *active_mm; + + /* Notify parent that we're no longer interested in the old VM */ + tsk = current; + old_mm = current->mm; + mm_release(tsk, old_mm); + + if (old_mm) { + sync_mm_rss(old_mm); + /* + * Make sure that if there is a core dump in progress + * for the old mm, we get out and die instead of going + * through with the exec. We must hold mmap_sem around + * checking core_state and changing tsk->mm. + */ + down_read(&old_mm->mmap_sem); + if (unlikely(old_mm->core_state)) { + up_read(&old_mm->mmap_sem); + return -EINTR; + } + } + task_lock(tsk); + preempt_disable_rt(); + active_mm = tsk->active_mm; + tsk->mm = mm; + tsk->active_mm = mm; + activate_mm(active_mm, mm); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); + preempt_enable_rt(); + task_unlock(tsk); + if (old_mm) { + up_read(&old_mm->mmap_sem); + BUG_ON(active_mm != old_mm); + setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); + mm_update_next_owner(old_mm); + mmput(old_mm); + return 0; + } + mmdrop(active_mm); + return 0; +} + +/* + * This function makes sure the current process has its own signal table, + * so that flush_signal_handlers can later reset the handlers without + * disturbing other processes. (Other processes might share the signal + * table via the CLONE_SIGHAND option to clone().) + */ +static int de_thread(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct sighand_struct *oldsighand = tsk->sighand; + spinlock_t *lock = &oldsighand->siglock; + + if (thread_group_empty(tsk)) + goto no_thread_group; + + /* + * Kill all other threads in the thread group. + */ + spin_lock_irq(lock); + if (signal_group_exit(sig)) { + /* + * Another group action in progress, just + * return so that the signal is processed. + */ + spin_unlock_irq(lock); + return -EAGAIN; + } + + sig->group_exit_task = tsk; + sig->notify_count = zap_other_threads(tsk); + if (!thread_group_leader(tsk)) + sig->notify_count--; + + while (sig->notify_count) { + __set_current_state(TASK_KILLABLE); + spin_unlock_irq(lock); + schedule(); + if (unlikely(__fatal_signal_pending(tsk))) + goto killed; + spin_lock_irq(lock); + } + spin_unlock_irq(lock); + + /* + * At this point all other threads have exited, all we have to + * do is to wait for the thread group leader to become inactive, + * and to assume its PID: + */ + if (!thread_group_leader(tsk)) { + struct task_struct *leader = tsk->group_leader; + + for (;;) { + threadgroup_change_begin(tsk); + write_lock_irq(&tasklist_lock); + /* + * Do this under tasklist_lock to ensure that + * exit_notify() can't miss ->group_exit_task + */ + sig->notify_count = -1; + if (likely(leader->exit_state)) + break; + __set_current_state(TASK_KILLABLE); + write_unlock_irq(&tasklist_lock); + threadgroup_change_end(tsk); + schedule(); + if (unlikely(__fatal_signal_pending(tsk))) + goto killed; + } + + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). + */ + tsk->start_time = leader->start_time; + tsk->real_start_time = leader->real_start_time; + + BUG_ON(!same_thread_group(leader, tsk)); + BUG_ON(has_group_leader_pid(tsk)); + /* + * An exec() starts a new thread group with the + * TGID of the previous thread group. Rehash the + * two threads with a switched PID, and release + * the former thread group leader: + */ + + /* Become a process group leader with the old leader's pid. + * The old leader becomes a thread of the this thread group. + * Note: The old leader also uses this pid until release_task + * is called. Odd but simple and correct. + */ + tsk->pid = leader->pid; + change_pid(tsk, PIDTYPE_PID, task_pid(leader)); + transfer_pid(leader, tsk, PIDTYPE_PGID); + transfer_pid(leader, tsk, PIDTYPE_SID); + + list_replace_rcu(&leader->tasks, &tsk->tasks); + list_replace_init(&leader->sibling, &tsk->sibling); + + tsk->group_leader = tsk; + leader->group_leader = tsk; + + tsk->exit_signal = SIGCHLD; + leader->exit_signal = -1; + + BUG_ON(leader->exit_state != EXIT_ZOMBIE); + leader->exit_state = EXIT_DEAD; + + /* + * We are going to release_task()->ptrace_unlink() silently, + * the tracer can sleep in do_wait(). EXIT_DEAD guarantees + * the tracer wont't block again waiting for this thread. + */ + if (unlikely(leader->ptrace)) + __wake_up_parent(leader, leader->parent); + write_unlock_irq(&tasklist_lock); + threadgroup_change_end(tsk); + + release_task(leader); + } + + sig->group_exit_task = NULL; + sig->notify_count = 0; + +no_thread_group: + /* we have changed execution domain */ + tsk->exit_signal = SIGCHLD; + + exit_itimers(sig); + flush_itimer_signals(); + + if (atomic_read(&oldsighand->count) != 1) { + struct sighand_struct *newsighand; + /* + * This ->sighand is shared with the CLONE_SIGHAND + * but not CLONE_THREAD task, switch to the new one. + */ + newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + if (!newsighand) + return -ENOMEM; + + atomic_set(&newsighand->count, 1); + memcpy(newsighand->action, oldsighand->action, + sizeof(newsighand->action)); + + write_lock_irq(&tasklist_lock); + spin_lock(&oldsighand->siglock); + rcu_assign_pointer(tsk->sighand, newsighand); + spin_unlock(&oldsighand->siglock); + write_unlock_irq(&tasklist_lock); + + __cleanup_sighand(oldsighand); + } + + BUG_ON(!thread_group_leader(tsk)); + return 0; + +killed: + /* protects against exit_notify() and __exit_signal() */ + read_lock(&tasklist_lock); + sig->group_exit_task = NULL; + sig->notify_count = 0; + read_unlock(&tasklist_lock); + return -EAGAIN; +} + +char *get_task_comm(char *buf, struct task_struct *tsk) +{ + /* buf must be at least sizeof(tsk->comm) in size */ + task_lock(tsk); + strncpy(buf, tsk->comm, sizeof(tsk->comm)); + task_unlock(tsk); + return buf; +} +EXPORT_SYMBOL_GPL(get_task_comm); + +/* + * These functions flushes out all traces of the currently running executable + * so that a new one can be started + */ + +void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) +{ + task_lock(tsk); + trace_task_rename(tsk, buf); + strlcpy(tsk->comm, buf, sizeof(tsk->comm)); + task_unlock(tsk); + perf_event_comm(tsk, exec); +} + +int flush_old_exec(struct linux_binprm * bprm) +{ + int retval; + + /* + * Make sure we have a private signal table and that + * we are unassociated from the previous thread group. + */ + retval = de_thread(current); + if (retval) + goto out; + + /* + * Must be called _before_ exec_mmap() as bprm->mm is + * not visibile until then. This also enables the update + * to be lockless. + */ + set_mm_exe_file(bprm->mm, bprm->file); + + /* + * Release all of the old mmap stuff + */ + acct_arg_size(bprm, 0); + retval = exec_mmap(bprm->mm); + if (retval) + goto out; + + bprm->mm = NULL; /* We're using it now */ + + set_fs(USER_DS); + current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + PF_NOFREEZE | PF_NO_SETAFFINITY); + flush_thread(); + current->personality &= ~bprm->per_clear; + + return 0; + +out: + return retval; +} +EXPORT_SYMBOL(flush_old_exec); + +void would_dump(struct linux_binprm *bprm, struct file *file) +{ + if (inode_permission(file_inode(file), MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; +} +EXPORT_SYMBOL(would_dump); + +void setup_new_exec(struct linux_binprm * bprm) +{ + arch_pick_mmap_layout(current->mm); + + /* This is the point of no return */ + current->sas_ss_sp = current->sas_ss_size = 0; + + if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) + set_dumpable(current->mm, SUID_DUMP_USER); + else + set_dumpable(current->mm, suid_dumpable); + + perf_event_exec(); + __set_task_comm(current, kbasename(bprm->filename), true); + + /* Set the new mm task size. We have to do that late because it may + * depend on TIF_32BIT which is only updated in flush_thread() on + * some architectures like powerpc + */ + current->mm->task_size = TASK_SIZE; + + /* install the new credentials */ + if (!uid_eq(bprm->cred->uid, current_euid()) || + !gid_eq(bprm->cred->gid, current_egid())) { + current->pdeath_signal = 0; + } else { + would_dump(bprm, bprm->file); + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) + set_dumpable(current->mm, suid_dumpable); + } + + /* An exec changes our domain. We are no longer part of the thread + group */ + current->self_exec_id++; + flush_signal_handlers(current, 0); + do_close_on_exec(current->files); +} +EXPORT_SYMBOL(setup_new_exec); + +/* + * Prepare credentials and lock ->cred_guard_mutex. + * install_exec_creds() commits the new creds and drops the lock. + * Or, if exec fails before, free_bprm() should release ->cred and + * and unlock. + */ +int prepare_bprm_creds(struct linux_binprm *bprm) +{ + if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) + return -ERESTARTNOINTR; + + bprm->cred = prepare_exec_creds(); + if (likely(bprm->cred)) + return 0; + + mutex_unlock(¤t->signal->cred_guard_mutex); + return -ENOMEM; +} + +static void free_bprm(struct linux_binprm *bprm) +{ + free_arg_pages(bprm); + if (bprm->cred) { + mutex_unlock(¤t->signal->cred_guard_mutex); + abort_creds(bprm->cred); + } + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } + /* If a binfmt changed the interp, free it. */ + if (bprm->interp != bprm->filename) + kfree(bprm->interp); + kfree(bprm); +} + +int bprm_change_interp(char *interp, struct linux_binprm *bprm) +{ + /* If a binfmt changed the interp, free it first. */ + if (bprm->interp != bprm->filename) + kfree(bprm->interp); + bprm->interp = kstrdup(interp, GFP_KERNEL); + if (!bprm->interp) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(bprm_change_interp); + +/* + * install the new credentials for this executable + */ +void install_exec_creds(struct linux_binprm *bprm) +{ + security_bprm_committing_creds(bprm); + + commit_creds(bprm->cred); + bprm->cred = NULL; + + /* + * Disable monitoring for regular users + * when executing setuid binaries. Must + * wait until new credentials are committed + * by commit_creds() above + */ + if (get_dumpable(current->mm) != SUID_DUMP_USER) + perf_event_exit_task(current); + /* + * cred_guard_mutex must be held at least to this point to prevent + * ptrace_attach() from altering our determination of the task's + * credentials; any time after this it may be unlocked. + */ + security_bprm_committed_creds(bprm); + mutex_unlock(¤t->signal->cred_guard_mutex); +} +EXPORT_SYMBOL(install_exec_creds); + +/* + * determine how safe it is to execute the proposed program + * - the caller must hold ->cred_guard_mutex to protect against + * PTRACE_ATTACH or seccomp thread-sync + */ +static void check_unsafe_exec(struct linux_binprm *bprm) +{ + struct task_struct *p = current, *t; + unsigned n_fs; + + if (p->ptrace) { + if (p->ptrace & PT_PTRACE_CAP) + bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; + else + bprm->unsafe |= LSM_UNSAFE_PTRACE; + } + + /* + * This isn't strictly necessary, but it makes it harder for LSMs to + * mess up. + */ + if (task_no_new_privs(current)) + bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + + t = p; + n_fs = 1; + spin_lock(&p->fs->lock); + rcu_read_lock(); + while_each_thread(p, t) { + if (t->fs == p->fs) + n_fs++; + } + rcu_read_unlock(); + + if (p->fs->users > n_fs) + bprm->unsafe |= LSM_UNSAFE_SHARE; + else + p->fs->in_exec = 1; + spin_unlock(&p->fs->lock); +} + +static void bprm_fill_uid(struct linux_binprm *bprm) +{ + struct inode *inode; + unsigned int mode; + kuid_t uid; + kgid_t gid; + + /* clear any previous set[ug]id data from a previous binary */ + bprm->cred->euid = current_euid(); + bprm->cred->egid = current_egid(); + + if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + return; + + if (task_no_new_privs(current)) + return; + + inode = file_inode(bprm->file); + mode = READ_ONCE(inode->i_mode); + if (!(mode & (S_ISUID|S_ISGID))) + return; + + /* Be careful if suid/sgid is set */ + mutex_lock(&inode->i_mutex); + + /* reload atomically mode/uid/gid now that lock held */ + mode = inode->i_mode; + uid = inode->i_uid; + gid = inode->i_gid; + mutex_unlock(&inode->i_mutex); + + /* We ignore suid/sgid if there are no mappings for them in the ns */ + if (!kuid_has_mapping(bprm->cred->user_ns, uid) || + !kgid_has_mapping(bprm->cred->user_ns, gid)) + return; + + if (mode & S_ISUID) { + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->euid = uid; + } + + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->egid = gid; + } +} + +/* + * Fill the binprm structure from the inode. + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * + * This may be called multiple times for binary chains (scripts for example). + */ +int prepare_binprm(struct linux_binprm *bprm) +{ + int retval; + + bprm_fill_uid(bprm); + + /* fill in binprm security blob */ + retval = security_bprm_set_creds(bprm); + if (retval) + return retval; + bprm->cred_prepared = 1; + + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); +} + +EXPORT_SYMBOL(prepare_binprm); + +/* + * Arguments are '\0' separated strings found at the location bprm->p + * points to; chop off the first by relocating brpm->p to right after + * the first '\0' encountered. + */ +int remove_arg_zero(struct linux_binprm *bprm) +{ + int ret = 0; + unsigned long offset; + char *kaddr; + struct page *page; + + if (!bprm->argc) + return 0; + + do { + offset = bprm->p & ~PAGE_MASK; + page = get_arg_page(bprm, bprm->p, 0); + if (!page) { + ret = -EFAULT; + goto out; + } + kaddr = kmap_atomic(page); + + for (; offset < PAGE_SIZE && kaddr[offset]; + offset++, bprm->p++) + ; + + kunmap_atomic(kaddr); + put_arg_page(page); + + if (offset == PAGE_SIZE) + free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1); + } while (offset == PAGE_SIZE); + + bprm->p++; + bprm->argc--; + ret = 0; + +out: + return ret; +} +EXPORT_SYMBOL(remove_arg_zero); + +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) +/* + * cycle the list of binary formats handler, until one recognizes the image + */ +int search_binary_handler(struct linux_binprm *bprm) +{ + bool need_retry = IS_ENABLED(CONFIG_MODULES); + struct linux_binfmt *fmt; + int retval; + + /* This allows 4 levels of binfmt rewrites before failing hard. */ + if (bprm->recursion_depth > 5) + return -ELOOP; + + retval = security_bprm_check(bprm); + if (retval) + return retval; + + retval = -ENOENT; + retry: + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + bprm->recursion_depth++; + retval = fmt->load_binary(bprm); + read_lock(&binfmt_lock); + put_binfmt(fmt); + bprm->recursion_depth--; + if (retval < 0 && !bprm->mm) { + /* we got to flush_old_exec() and failed after it */ + read_unlock(&binfmt_lock); + force_sigsegv(SIGSEGV, current); + return retval; + } + if (retval != -ENOEXEC || !bprm->file) { + read_unlock(&binfmt_lock); + return retval; + } + } + read_unlock(&binfmt_lock); + + if (need_retry) { + if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && + printable(bprm->buf[2]) && printable(bprm->buf[3])) + return retval; + if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) + return retval; + need_retry = false; + goto retry; + } + + return retval; +} +EXPORT_SYMBOL(search_binary_handler); + +static int exec_binprm(struct linux_binprm *bprm) +{ + pid_t old_pid, old_vpid; + int ret; + + /* Need to fetch pid before load_binary changes it */ + old_pid = current->pid; + rcu_read_lock(); + old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); + rcu_read_unlock(); + + ret = search_binary_handler(bprm); + if (ret >= 0) { + audit_bprm(bprm); + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + proc_exec_connector(current); + } + + return ret; +} + +/* + * sys_execve() executes a new program. + */ +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) +{ + char *pathbuf = NULL; + struct linux_binprm *bprm; + struct file *file; + struct files_struct *displaced; + int retval; + + if (IS_ERR(filename)) + return PTR_ERR(filename); + + /* + * We move the actual failure in case of RLIMIT_NPROC excess from + * set*uid() to execve() because too many poorly written programs + * don't check setuid() return code. Here we additionally recheck + * whether NPROC limit is still exceeded. + */ + if ((current->flags & PF_NPROC_EXCEEDED) && + atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { + retval = -EAGAIN; + goto out_ret; + } + + /* We're below the limit (still or again), so we don't want to make + * further execve() calls fail. */ + current->flags &= ~PF_NPROC_EXCEEDED; + + retval = unshare_files(&displaced); + if (retval) + goto out_ret; + + retval = -ENOMEM; + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + goto out_files; + + retval = prepare_bprm_creds(bprm); + if (retval) + goto out_free; + + check_unsafe_exec(bprm); + current->in_execve = 1; + + file = do_open_execat(fd, filename, flags); + retval = PTR_ERR(file); + if (IS_ERR(file)) + goto out_unmark; + + sched_exec(); + + bprm->file = file; + if (fd == AT_FDCWD || filename->name[0] == '/') { + bprm->filename = filename->name; + } else { + if (filename->name[0] == '\0') + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); + else + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", + fd, filename->name); + if (!pathbuf) { + retval = -ENOMEM; + goto out_unmark; + } + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. Relies on having exclusive access to + * current->files (due to unshare_files above). + */ + if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = pathbuf; + } + bprm->interp = bprm->filename; + + retval = bprm_mm_init(bprm); + if (retval) + goto out_unmark; + + bprm->argc = count(argv, MAX_ARG_STRINGS); + if ((retval = bprm->argc) < 0) + goto out; + + bprm->envc = count(envp, MAX_ARG_STRINGS); + if ((retval = bprm->envc) < 0) + goto out; + + retval = prepare_binprm(bprm); + if (retval < 0) + goto out; + + retval = copy_strings_kernel(1, &bprm->filename, bprm); + if (retval < 0) + goto out; + + bprm->exec = bprm->p; + retval = copy_strings(bprm->envc, envp, bprm); + if (retval < 0) + goto out; + + retval = copy_strings(bprm->argc, argv, bprm); + if (retval < 0) + goto out; + + retval = exec_binprm(bprm); + if (retval < 0) + goto out; + + /* execve succeeded */ + current->fs->in_exec = 0; + current->in_execve = 0; + acct_update_integrals(current); + task_numa_free(current); + free_bprm(bprm); + kfree(pathbuf); + putname(filename); + if (displaced) + put_files_struct(displaced); + return retval; + +out: + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } + +out_unmark: + current->fs->in_exec = 0; + current->in_execve = 0; + +out_free: + free_bprm(bprm); + kfree(pathbuf); + +out_files: + if (displaced) + reset_files_struct(displaced); +out_ret: + putname(filename); + return retval; +} + +int do_execve(struct filename *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +int do_execveat(int fd, struct filename *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + + return do_execveat_common(fd, filename, argv, envp, flags); +} + +#ifdef CONFIG_COMPAT +static int compat_do_execve(struct filename *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +static int compat_do_execveat(int fd, struct filename *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execveat_common(fd, filename, argv, envp, flags); +} +#endif + +void set_binfmt(struct linux_binfmt *new) +{ + struct mm_struct *mm = current->mm; + + if (mm->binfmt) + module_put(mm->binfmt->module); + + mm->binfmt = new; + if (new) + __module_get(new->module); +} +EXPORT_SYMBOL(set_binfmt); + +/* + * set_dumpable stores three-value SUID_DUMP_* into mm->flags. + */ +void set_dumpable(struct mm_struct *mm, int value) +{ + unsigned long old, new; + + if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) + return; + + do { + old = ACCESS_ONCE(mm->flags); + new = (old & ~MMF_DUMPABLE_MASK) | value; + } while (cmpxchg(&mm->flags, old, new) != old); +} + +SYSCALL_DEFINE3(execve, + const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp) +{ + return do_execve(getname(filename), argv, envp); +} + +SYSCALL_DEFINE5(execveat, + int, fd, const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp) +{ + return compat_do_execve(getname(filename), argv, envp); +} + +COMPAT_SYSCALL_DEFINE5(execveat, int, fd, + const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return compat_do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} +#endif diff --git a/kernel/fs/exofs/BUGS b/kernel/fs/exofs/BUGS new file mode 100644 index 000000000..1b2d4c63a --- /dev/null +++ b/kernel/fs/exofs/BUGS @@ -0,0 +1,3 @@ +- Out-of-space may cause a severe problem if the object (and directory entry) + were written, but the inode attributes failed. Then if the filesystem was + unmounted and mounted the kernel can get into an endless loop doing a readdir. diff --git a/kernel/fs/exofs/Kbuild b/kernel/fs/exofs/Kbuild new file mode 100644 index 000000000..b47c7b8dc --- /dev/null +++ b/kernel/fs/exofs/Kbuild @@ -0,0 +1,20 @@ +# +# Kbuild for the EXOFS module +# +# Copyright (C) 2008 Panasas Inc. All rights reserved. +# +# Authors: +# Boaz Harrosh +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 +# +# Kbuild - Gets included from the Kernels Makefile and build system +# + +# ore module library +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o + +exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o +obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/kernel/fs/exofs/Kconfig b/kernel/fs/exofs/Kconfig new file mode 100644 index 000000000..86194b2f7 --- /dev/null +++ b/kernel/fs/exofs/Kconfig @@ -0,0 +1,13 @@ +config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. + +# Debugging-related stuff +config EXOFS_DEBUG + bool "Enable debugging" + depends on EXOFS_FS + help + This option enables EXOFS debug prints. diff --git a/kernel/fs/exofs/Kconfig.ore b/kernel/fs/exofs/Kconfig.ore new file mode 100644 index 000000000..2daf2329c --- /dev/null +++ b/kernel/fs/exofs/Kconfig.ore @@ -0,0 +1,14 @@ +# ORE - Objects Raid Engine (libore.ko) +# +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. +config ORE + tristate + depends on EXOFS_FS || PNFS_OBJLAYOUT + select ASYNC_XOR + select RAID6_PQ + select ASYNC_PQ + default SCSI_OSD_ULD diff --git a/kernel/fs/exofs/common.h b/kernel/fs/exofs/common.h new file mode 100644 index 000000000..7d88ef566 --- /dev/null +++ b/kernel/fs/exofs/common.h @@ -0,0 +1,262 @@ +/* + * common.h - Common definitions for both Kernel and user-mode utilities + * + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __EXOFS_COM_H__ +#define __EXOFS_COM_H__ + +#include + +#include +#include +#include + +/**************************************************************************** + * Object ID related defines + * NOTE: inode# = object ID - EXOFS_OBJ_OFF + ****************************************************************************/ +#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ +#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ +#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ +#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */ +#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ + +/* exofs Application specific page/attribute */ +/* Inode attrs */ +# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) +# define EXOFS_ATTR_INODE_DATA 1 +# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 +# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 +/* Partition attrs */ +# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3) +# define EXOFS_ATTR_SB_STATS 1 + +/* + * The maximum number of files we can have is limited by the size of the + * inode number. This is the largest object ID that the file system supports. + * Object IDs 0, 1, and 2 are always in use (see above defines). + */ +enum { + EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX : + (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)), + EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF), +}; + +/**************************************************************************** + * Misc. + ****************************************************************************/ +#define EXOFS_BLKSHIFT 12 +#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT) + +/**************************************************************************** + * superblock-related things + ****************************************************************************/ +#define EXOFS_SUPER_MAGIC 0x5DF5 + +/* + * The file system control block - stored in object EXOFS_SUPER_ID's data. + * This is where the in-memory superblock is stored on disk. + */ +enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; +struct exofs_fscb { + __le64 s_nextid; /* Only used after mkfs */ + __le64 s_numfiles; /* Only used after mkfs */ + __le32 s_version; /* == EXOFS_FSCB_VER */ + __le16 s_magic; /* Magic signature */ + __le16 s_newfs; /* Non-zero if this is a new fs */ + + /* From here on it's a static part, only written by mkexofs */ + __le64 s_dev_table_oid; /* Resurved, not used */ + __le64 s_dev_table_count; /* == 0 means no dev_table */ +} __packed; + +/* + * This struct is set on the FS partition's attributes. + * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together + * with the create command, to atomically persist the sb writeable information. + */ +struct exofs_sb_stats { + __le64 s_nextid; /* Highest object ID used */ + __le64 s_numfiles; /* Number of files on fs */ +} __packed; + +/* + * Describes the raid used in the FS. It is part of the device table. + * This here is taken from the pNFS-objects definition. In exofs we + * use one raid policy through-out the filesystem. (NOTE: the funny + * alignment at beginning. We take care of it at exofs_device_table. + */ +struct exofs_dt_data_map { + __le32 cb_num_comps; + __le64 cb_stripe_unit; + __le32 cb_group_width; + __le32 cb_group_depth; + __le32 cb_mirror_cnt; + __le32 cb_raid_algorithm; +} __packed; + +/* + * This is an osd device information descriptor. It is a single entry in + * the exofs device table. It describes an osd target lun which + * contains data belonging to this FS. (Same partition_id on all devices) + */ +struct exofs_dt_device_info { + __le32 systemid_len; + u8 systemid[OSD_SYSTEMID_LEN]; + __le64 long_name_offset; /* If !0 then offset-in-file */ + __le32 osdname_len; /* */ + u8 osdname[44]; /* Embbeded, Usually an asci uuid */ +} __packed; + +/* + * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data. + * It contains the raid used for this multy-device FS and an array of + * participating devices. + */ +struct exofs_device_table { + __le32 dt_version; /* == EXOFS_DT_VER */ + struct exofs_dt_data_map dt_data_map; /* Raid policy to use */ + + /* Resurved space For future use. Total includeing this: + * (8 * sizeof(le64)) + */ + __le64 __Resurved[4]; + + __le64 dt_num_devices; /* Array size */ + struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */ +} __packed; + +/**************************************************************************** + * inode-related things + ****************************************************************************/ +#define EXOFS_IDATA 5 + +/* + * The file control block - stored in an object's attributes. This is where + * the in-memory inode is stored on disk. + */ +struct exofs_fcb { + __le64 i_size; /* Size of the file */ + __le16 i_mode; /* File mode */ + __le16 i_links_count; /* Links count */ + __le32 i_uid; /* Owner Uid */ + __le32 i_gid; /* Group Id */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_flags; /* File flags (unused for now)*/ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */ +}; + +#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb) + +/* This is the Attribute the fcb is stored in */ +static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DATA, + EXOFS_INO_ATTR_SIZE); + +/**************************************************************************** + * dentry-related things + ****************************************************************************/ +#define EXOFS_NAME_LEN 255 + +/* + * The on-disk directory entry + */ +struct exofs_dir_entry { + __le64 inode_no; /* inode number */ + __le16 rec_len; /* directory entry length */ + u8 name_len; /* name length */ + u8 file_type; /* umm...file type */ + char name[EXOFS_NAME_LEN]; /* file name */ +}; + +enum { + EXOFS_FT_UNKNOWN, + EXOFS_FT_REG_FILE, + EXOFS_FT_DIR, + EXOFS_FT_CHRDEV, + EXOFS_FT_BLKDEV, + EXOFS_FT_FIFO, + EXOFS_FT_SOCK, + EXOFS_FT_SYMLINK, + EXOFS_FT_MAX +}; + +#define EXOFS_DIR_PAD 4 +#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1) +#define EXOFS_DIR_REC_LEN(name_len) \ + (((name_len) + offsetof(struct exofs_dir_entry, name) + \ + EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) + +/* + * The on-disk (optional) layout structure. + * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT + * attribute, attached to any inode, usually to a directory. + */ + +enum exofs_inode_layout_gen_functions { + LAYOUT_MOVING_WINDOW = 0, + LAYOUT_IMPLICT = 1, +}; + +struct exofs_on_disk_inode_layout { + __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ + __le16 pad; + union { + /* gen_func == LAYOUT_MOVING_WINDOW (default) */ + struct exofs_layout_sliding_window { + __le32 num_devices; /* first n devices in global-table*/ + } sliding_window __packed; + + /* gen_func == LAYOUT_IMPLICT */ + struct exofs_layout_implict_list { + struct exofs_dt_data_map data_map; + /* Variable array of size data_map.cb_num_comps. These + * are device indexes of the devices in the global table + */ + __le32 dev_indexes[]; + } implict __packed; + }; +} __packed; + +static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) +{ + return sizeof(struct exofs_on_disk_inode_layout) + + max_devs * sizeof(__le32); +} + +#endif /*ifndef __EXOFS_COM_H__*/ diff --git a/kernel/fs/exofs/dir.c b/kernel/fs/exofs/dir.c new file mode 100644 index 000000000..4deb0b05b --- /dev/null +++ b/kernel/fs/exofs/dir.c @@ -0,0 +1,667 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline unsigned exofs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void exofs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* Accesses dir's inode->i_size must be called under inode lock */ +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) +{ + loff_t last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + + if (!PageUptodate(page)) + SetPageUptodate(page); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + set_page_dirty(page); + + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + + return err; +} + +static void exofs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + unsigned chunk_size = exofs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct exofs_dir_entry *p; + char *error; + + /* if the page is the last one in the directory */ + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct exofs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < EXOFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < EXOFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + +Ebadsize: + EXOFS_ERR("ERROR [exofs_check_page]: " + "size of directory(0x%lx) is not a multiple of chunk size\n", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +bad_entry: + EXOFS_ERR( + "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " + "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", + dir->i_ino, error, (page->index<inode_no)), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct exofs_dir_entry *)(kaddr + offs); + EXOFS_ERR("ERROR [exofs_check_page]: " + "entry in directory(0x%lx) spans the page boundary" + "offset=%lu, inode=0x%llx\n", + dir->i_ino, (page->index<inode_no))); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *exofs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + exofs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + exofs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline int exofs_match(int len, const unsigned char *name, + struct exofs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode_no) + return 0; + return !memcmp(name, de->name, len); +} + +static inline +struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p) +{ + return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static inline unsigned +exofs_validate_entry(char *base, unsigned offset, unsigned mask) +{ + struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset); + struct exofs_dir_entry *p = + (struct exofs_dir_entry *)(base + (offset&mask)); + while ((char *)p < (char *)de) { + if (p->rec_len == 0) + break; + p = exofs_next_entry(p); + } + return (char *)p - base; +} + +static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = { + [EXOFS_FT_UNKNOWN] = DT_UNKNOWN, + [EXOFS_FT_REG_FILE] = DT_REG, + [EXOFS_FT_DIR] = DT_DIR, + [EXOFS_FT_CHRDEV] = DT_CHR, + [EXOFS_FT_BLKDEV] = DT_BLK, + [EXOFS_FT_FIFO] = DT_FIFO, + [EXOFS_FT_SOCK] = DT_SOCK, + [EXOFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK, +}; + +static inline +void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) +{ + umode_t mode = inode->i_mode; + de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +static int +exofs_readdir(struct file *file, struct dir_context *ctx) +{ + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); + int need_revalidate = (file->f_version != inode->i_version); + + if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) + return 0; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct exofs_dir_entry *de; + struct page *page = exofs_get_page(inode, n); + + if (IS_ERR(page)) { + EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", + inode->i_ino); + ctx->pos += PAGE_CACHE_SIZE - offset; + return PTR_ERR(page); + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = exofs_validate_entry(kaddr, offset, + chunk_mask); + ctx->pos = (n<f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct exofs_dir_entry *)(kaddr + offset); + limit = kaddr + exofs_last_byte(inode, n) - + EXOFS_DIR_REC_LEN(1); + for (; (char *)de <= limit; de = exofs_next_entry(de)) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: " + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + exofs_put_page(page); + return -EIO; + } + if (de->inode_no) { + unsigned char t; + + if (de->file_type < EXOFS_FT_MAX) + t = exofs_filetype_table[de->file_type]; + else + t = DT_UNKNOWN; + + if (!dir_emit(ctx, de->name, de->name_len, + le64_to_cpu(de->inode_no), + t)) { + exofs_put_page(page); + return 0; + } + } + ctx->pos += le16_to_cpu(de->rec_len); + } + exofs_put_page(page); + } + return 0; +} + +struct exofs_dir_entry *exofs_find_entry(struct inode *dir, + struct dentry *dentry, struct page **res_page) +{ + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct exofs_i_info *oi = exofs_i(dir); + struct exofs_dir_entry *de; + + if (npages == 0) + goto out; + + *res_page = NULL; + + start = oi->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = exofs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct exofs_dir_entry *) kaddr; + kaddr += exofs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: zero-length entry in " + "directory(0x%lx)\n", + dir->i_ino); + exofs_put_page(page); + goto out; + } + if (exofs_match(namelen, name, de)) + goto found; + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + oi->i_dir_start_lookup = n; + return de; +} + +struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = exofs_get_page(dir, 0); + struct exofs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = exofs_next_entry( + (struct exofs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t exofs_parent_ino(struct dentry *child) +{ + struct page *page; + struct exofs_dir_entry *de; + ino_t ino; + + de = exofs_dotdot(d_inode(child), &page); + if (!de) + return 0; + + ino = le64_to_cpu(de->inode_no); + exofs_put_page(page); + return ino; +} + +ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct exofs_dir_entry *de; + struct page *page; + + de = exofs_find_entry(dir, dentry, &page); + if (de) { + res = le64_to_cpu(de->inode_no); + exofs_put_page(page); + } + return res; +} + +int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, + struct page *page, struct inode *inode) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = le16_to_cpu(de->rec_len); + int err; + + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, len, + AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (err) + EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", + err); + + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + if (likely(!err)) + err = exofs_commit_chunk(page, pos, len); + exofs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + return err; +} + +int exofs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = exofs_chunk_size(dir); + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct exofs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = exofs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + exofs_last_byte(dir, n); + de = (struct exofs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode_no = 0; + goto got_it; + } + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_add_link: " + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (exofs_match(namelen, name, de)) + goto out_unlock; + name_len = EXOFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode_no && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct exofs_dir_entry *) ((char *) de + rec_len); + } + unlock_page(page); + exofs_put_page(page); + } + + EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n", + dentry, inode->i_ino); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char *)de - (char *)page_address(page); + err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0, + &page, NULL); + if (err) + goto out_unlock; + if (de->inode_no) { + struct exofs_dir_entry *de1 = + (struct exofs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + err = exofs_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + sbi->s_numfiles++; + +out_put: + exofs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1); + unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + loff_t pos; + struct exofs_dir_entry *pde = NULL; + struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from); + int err; + + while (de < dir) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_delete_entry:" + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + err = -EIO; + goto out; + } + pde = de; + de = exofs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + pos = page_offset(page) + from; + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, + &page, NULL); + if (err) + EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n", + err); + if (pde) + pde->rec_len = cpu_to_le16(to - from); + dir->inode_no = 0; + if (likely(!err)) + err = exofs_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty(inode); + sbi->s_numfiles--; +out: + exofs_put_page(page); + return err; +} + +/* kept aligned on 4 bytes */ +#define THIS_DIR ".\0\0" +#define PARENT_DIR "..\0" + +int exofs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = exofs_chunk_size(inode); + struct exofs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0, + &page, NULL); + if (err) { + unlock_page(page); + goto fail; + } + + kaddr = kmap_atomic(page); + de = (struct exofs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); + memcpy(de->name, THIS_DIR, sizeof(THIS_DIR)); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + + de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1)); + de->inode_no = cpu_to_le64(parent->i_ino); + memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); + exofs_set_de_type(de, inode); + kunmap_atomic(kaddr); + err = exofs_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +int exofs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct exofs_dir_entry *de; + page = exofs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct exofs_dir_entry *)kaddr; + kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_empty_dir: " + "zero-length directory entry" + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->inode_no != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (le64_to_cpu(de->inode_no) != + inode->i_ino) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + return 1; + +not_empty: + exofs_put_page(page); + return 0; +} + +const struct file_operations exofs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = exofs_readdir, +}; diff --git a/kernel/fs/exofs/exofs.h b/kernel/fs/exofs/exofs.h new file mode 100644 index 000000000..ad9cac670 --- /dev/null +++ b/kernel/fs/exofs/exofs.h @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __EXOFS_H__ +#define __EXOFS_H__ + +#include +#include +#include +#include + +#include "common.h" + +#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define EXOFS_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define EXOFS_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +struct exofs_dev { + struct ore_dev ored; + unsigned did; + unsigned urilen; + uint8_t *uri; + struct kobject ed_kobj; +}; +/* + * our extension to the in-memory superblock + */ +struct exofs_sb_info { + struct backing_dev_info bdi; /* register our bdi with VFS */ + struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ + int s_timeout; /* timeout for OSD operations */ + uint64_t s_nextid; /* highest object ID used */ + uint32_t s_numfiles; /* number of files on fs */ + spinlock_t s_next_gen_lock; /* spinlock for gen # update */ + u32 s_next_generation; /* next gen # to use */ + atomic_t s_curr_pending; /* number of pending commands */ + + struct ore_layout layout; /* Default files layout */ + struct ore_comp one_comp; /* id & cred of partition id=0*/ + struct ore_components oc; /* comps for the partition */ + struct kobject s_kobj; /* holds per-sbi kobject */ +}; + +/* + * our extension to the in-memory inode + */ +struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ + uint64_t i_commit_size; /* the object's written length */ + struct ore_comp one_comp; /* same component for all devices */ + struct ore_components oc; /* inode view of the device table */ +}; + +static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) +{ + return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; +} + +/* + * our inode flags + */ +#define OBJ_2BCREATED 0 /* object will be created soon*/ +#define OBJ_CREATED 1 /* object has been created on the osd*/ + +static inline int obj_2bcreated(struct exofs_i_info *oi) +{ + return test_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline void set_obj_2bcreated(struct exofs_i_info *oi) +{ + set_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline int obj_created(struct exofs_i_info *oi) +{ + return test_bit(OBJ_CREATED, &oi->i_flags); +} + +static inline void set_obj_created(struct exofs_i_info *oi) +{ + set_bit(OBJ_CREATED, &oi->i_flags); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi); +static inline int wait_obj_created(struct exofs_i_info *oi) +{ + if (likely(obj_created(oi))) + return 0; + + return __exofs_wait_obj_created(oi); +} + +/* + * get to our inode from the vfs inode + */ +static inline struct exofs_i_info *exofs_i(struct inode *inode) +{ + return container_of(inode, struct exofs_i_info, vfs_inode); +} + +/* + * Maximum count of links to a file + */ +#define EXOFS_LINK_MAX 32000 + +/************************* + * function declarations * + *************************/ + +/* inode.c */ +unsigned exofs_max_io_pages(struct ore_layout *layout, + unsigned expected_pages); +int exofs_setattr(struct dentry *, struct iattr *); +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern struct inode *exofs_iget(struct super_block *, unsigned long); +struct inode *exofs_new_inode(struct inode *, umode_t); +extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); +extern void exofs_evict_inode(struct inode *); + +/* dir.c: */ +int exofs_add_link(struct dentry *, struct inode *); +ino_t exofs_inode_by_name(struct inode *, struct dentry *); +int exofs_delete_entry(struct exofs_dir_entry *, struct page *); +int exofs_make_empty(struct inode *, struct inode *); +struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *, + struct page **); +int exofs_empty_dir(struct inode *); +struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **); +ino_t exofs_parent_ino(struct dentry *child); +int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, + struct inode *); + +/* super.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); +int exofs_sbi_write_stats(struct exofs_sb_info *sbi); + +/* sys.c */ +int exofs_sysfs_init(void); +void exofs_sysfs_uninit(void); +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev); +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi); +int exofs_sysfs_odev_add(struct exofs_dev *edev, + struct exofs_sb_info *sbi); +void exofs_sysfs_dbg_print(void); + +/********************* + * operation vectors * + *********************/ +/* dir.c: */ +extern const struct file_operations exofs_dir_operations; + +/* file.c */ +extern const struct inode_operations exofs_file_inode_operations; +extern const struct file_operations exofs_file_operations; + +/* inode.c */ +extern const struct address_space_operations exofs_aops; + +/* namei.c */ +extern const struct inode_operations exofs_dir_inode_operations; +extern const struct inode_operations exofs_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations exofs_symlink_inode_operations; +extern const struct inode_operations exofs_fast_symlink_inode_operations; + +/* exofs_init_comps will initialize an ore_components device array + * pointing to a single ore_comp struct, and a round-robin view + * of the device table. + * The first device of each inode is the [inode->ino % num_devices] + * and the rest of the devices sequentially following where the + * first device is after the last device. + * It is assumed that the global device array at @sbi is twice + * bigger and that the device table repeats twice. + * See: exofs_read_lookup_dev_table() + */ +static inline void exofs_init_comps(struct ore_components *oc, + struct ore_comp *one_comp, + struct exofs_sb_info *sbi, osd_id oid) +{ + unsigned dev_mod = (unsigned)oid, first_dev; + + one_comp->obj.partition = sbi->one_comp.obj.partition; + one_comp->obj.id = oid; + exofs_make_credential(one_comp->cred, &one_comp->obj); + + oc->first_dev = 0; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; + + /* Round robin device view of the table */ + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = &sbi->oc.ods[first_dev]; +} + +#endif diff --git a/kernel/fs/exofs/file.c b/kernel/fs/exofs/file.c new file mode 100644 index 000000000..906de66e8 --- /dev/null +++ b/kernel/fs/exofs/file.c @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "exofs.h" + +static int exofs_release_file(struct inode *inode, struct file *filp) +{ + return 0; +} + +/* exofs_file_fsync - flush the inode to disk + * + * Note, in exofs all metadata is written as part of inode, regardless. + * The writeout is synchronous + */ +static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int ret; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + ret = sync_inode_metadata(filp->f_mapping->host, 1); + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int exofs_flush(struct file *file, fl_owner_t id) +{ + int ret = vfs_fsync(file, 0); + /* TODO: Flush the OSD target */ + return ret; +} + +const struct file_operations exofs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .open = generic_file_open, + .release = exofs_release_file, + .fsync = exofs_file_fsync, + .flush = exofs_flush, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +}; + +const struct inode_operations exofs_file_inode_operations = { + .setattr = exofs_setattr, +}; diff --git a/kernel/fs/exofs/inode.c b/kernel/fs/exofs/inode.c new file mode 100644 index 000000000..786e4cc8c --- /dev/null +++ b/kernel/fs/exofs/inode.c @@ -0,0 +1,1524 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "exofs.h" + +#define EXOFS_DBGMSG2(M...) do {} while (0) + +unsigned exofs_max_io_pages(struct ore_layout *layout, + unsigned expected_pages) +{ + unsigned pages = min_t(unsigned, expected_pages, + layout->max_io_length / PAGE_SIZE); + + return pages; +} + +struct page_collect { + struct exofs_sb_info *sbi; + struct inode *inode; + unsigned expected_pages; + struct ore_io_state *ios; + + struct page **pages; + unsigned alloc_pages; + unsigned nr_pages; + unsigned long length; + loff_t pg_first; /* keep 64bit also in 32-arches */ + bool read_4_write; /* This means two things: that the read is sync + * And the pages should not be unlocked. + */ + struct page *that_locked_page; +}; + +static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, + struct inode *inode) +{ + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + + pcol->sbi = sbi; + pcol->inode = inode; + pcol->expected_pages = expected_pages; + + pcol->ios = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + pcol->read_4_write = false; + pcol->that_locked_page = NULL; +} + +static void _pcol_reset(struct page_collect *pcol) +{ + pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); + + pcol->pages = NULL; + pcol->alloc_pages = 0; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + pcol->ios = NULL; + pcol->that_locked_page = NULL; + + /* this is probably the end of the loop but in writes + * it might not end here. don't be left with nothing + */ + if (!pcol->expected_pages) + pcol->expected_pages = + exofs_max_io_pages(&pcol->sbi->layout, ~0); +} + +static int pcol_try_alloc(struct page_collect *pcol) +{ + unsigned pages; + + /* TODO: easily support bio chaining */ + pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); + + for (; pages; pages >>= 1) { + pcol->pages = kmalloc(pages * sizeof(struct page *), + GFP_KERNEL); + if (likely(pcol->pages)) { + pcol->alloc_pages = pages; + return 0; + } + } + + EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", + pcol->expected_pages); + return -ENOMEM; +} + +static void pcol_free(struct page_collect *pcol) +{ + kfree(pcol->pages); + pcol->pages = NULL; + + if (pcol->ios) { + ore_put_io_state(pcol->ios); + pcol->ios = NULL; + } +} + +static int pcol_add_page(struct page_collect *pcol, struct page *page, + unsigned len) +{ + if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) + return -ENOMEM; + + pcol->pages[pcol->nr_pages++] = page; + pcol->length += len; + return 0; +} + +enum {PAGE_WAS_NOT_IN_IO = 17}; +static int update_read_page(struct page *page, int ret) +{ + switch (ret) { + case 0: + /* Everything is OK */ + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + break; + case -EFAULT: + /* In this case we were trying to read something that wasn't on + * disk yet - return a page full of zeroes. This should be OK, + * because the object should be empty (if there was a write + * before this read, the read would be waiting with the page + * locked */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + EXOFS_DBGMSG("recovered read error\n"); + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: + SetPageError(page); + } + return ret; +} + +static void update_write_page(struct page *page, int ret) +{ + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + + if (ret) { + mapping_set_error(page->mapping, ret); + SetPageError(page); + } + end_page_writeback(page); +} + +/* Called at the end of reads, to optionally unlock pages and update their + * status. + */ +static int __readpages_done(struct page_collect *pcol) +{ + int i; + u64 good_bytes; + u64 length = 0; + int ret = ore_check_io(pcol->ios, NULL); + + if (likely(!ret)) { + good_bytes = pcol->length; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } + + EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages at end */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", + inode->i_ino, page->index, + page_stat ? "bad_bytes" : "good_bytes"); + + ret = update_read_page(page, page_stat); + if (!pcol->read_4_write) + unlock_page(page); + length += PAGE_SIZE; + } + + pcol_free(pcol); + EXOFS_DBGMSG2("readpages_done END\n"); + return ret; +} + +/* callback of async reads */ +static void readpages_done(struct ore_io_state *ios, void *p) +{ + struct page_collect *pcol = p; + + __readpages_done(pcol); + atomic_dec(&pcol->sbi->s_curr_pending); + kfree(pcol); +} + +static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) +{ + int i; + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + + if (rw == READ) + update_read_page(page, ret); + else + update_write_page(page, ret); + + unlock_page(page); + } +} + +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + +static int read_exec(struct page_collect *pcol) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct ore_io_state *ios; + struct page_collect *pcol_copy = NULL; + int ret; + + if (!pcol->pages) + return 0; + + if (!pcol->ios) { + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, + pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->length, &pcol->ios); + + if (ret) + return ret; + } + + ios = pcol->ios; + ios->pages = pcol->pages; + + if (pcol->read_4_write) { + ore_read(pcol->ios); + return __readpages_done(pcol); + } + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + ios->done = readpages_done; + ios->private = pcol_copy; + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + + ret = ore_read(ios); + if (unlikely(ret)) + goto err; + + atomic_inc(&pcol->sbi->s_curr_pending); + + return 0; + +err: + if (!pcol_copy) /* Failed before ownership transfer */ + pcol_copy = pcol; + _unlock_pcol_pages(pcol_copy, ret, READ); + pcol_free(pcol_copy); + kfree(pcol_copy); + + return ret; +} + +/* readpage_strip is called either directly from readpage() or by the VFS from + * within read_cache_pages(), to add one more page to be read. It will try to + * collect as many contiguous pages as posible. If a discontinuity is + * encountered, or it runs out of resources, it will submit the previous segment + * and will start a new collection. Eventually caller must submit the last + * segment if present. + */ +static int readpage_strip(void *data, struct page *page) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + BUG_ON(!PageLocked(page)); + + /* FIXME: Just for debugging, will be removed */ + if (PageUptodate(page)) + EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, + page->index); + + pcol->that_locked_page = page; + + if (page->index < end_index) + len = PAGE_CACHE_SIZE; + else if (page->index == end_index) + len = i_size & ~PAGE_CACHE_MASK; + else + len = 0; + + if (!len || !obj_created(oi)) { + /* this will be out of bounds, or doesn't exist yet. + * Current page is cleared and the request is split + */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + + if (!pcol->read_4_write) + unlock_page(page); + EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx " + "read_4_write=%d index=0x%lx end_index=0x%lx " + "splitting\n", inode->i_ino, len, + pcol->read_4_write, page->index, end_index); + + return read_exec(pcol); + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = read_exec(pcol); + if (unlikely(ret)) + goto fail; + goto try_again; + } + + if (!pcol->pages) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + if (len != PAGE_CACHE_SIZE) + zero_user(page, len, PAGE_CACHE_SIZE - len); + + EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (ret) { + EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " + "this_len=0x%zx nr_pages=%u length=0x%lx\n", + page, len, pcol->nr_pages, pcol->length); + + /* split the request, and start again with current page */ + ret = read_exec(pcol); + if (unlikely(ret)) + goto fail; + + goto try_again; + } + + return 0; + +fail: + /* SetPageError(page); ??? */ + unlock_page(page); + return ret; +} + +static int exofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, nr_pages, mapping->host); + + ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); + if (ret) { + EXOFS_ERR("read_cache_pages => %d\n", ret); + return ret; + } + + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + + return read_exec(&pcol); +} + +static int _readpage(struct page *page, bool read_4_write) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + pcol.read_4_write = read_4_write; + ret = readpage_strip(&pcol, page); + if (ret) { + EXOFS_ERR("_readpage => %d\n", ret); + return ret; + } + + return read_exec(&pcol); +} + +/* + * We don't need the file + */ +static int exofs_readpage(struct file *file, struct page *page) +{ + return _readpage(page, false); +} + +/* Callback for osd_write. All writes are asynchronous */ +static void writepages_done(struct ore_io_state *ios, void *p) +{ + struct page_collect *pcol = p; + int i; + u64 good_bytes; + u64 length = 0; + int ret = ore_check_io(ios, NULL); + + atomic_dec(&pcol->sbi->s_curr_pending); + + if (likely(!ret)) { + good_bytes = pcol->length; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } + + EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages to a bio */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + update_write_page(page, page_stat); + unlock_page(page); + EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", + inode->i_ino, page->index, page_stat); + + length += PAGE_SIZE; + } + + pcol_free(pcol); + kfree(pcol); + EXOFS_DBGMSG2("writepages_done END\n"); +} + +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct page_collect *pcol = priv; + pgoff_t index = offset / PAGE_SIZE; + + if (!pcol->that_locked_page || + (pcol->that_locked_page->index != index)) { + struct page *page; + loff_t i_size = i_size_read(pcol->inode); + + if (offset >= i_size) { + *uptodate = true; + EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); + return ZERO_PAGE(0); + } + + page = find_get_page(pcol->inode->i_mapping, index); + if (!page) { + page = find_or_create_page(pcol->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + EXOFS_DBGMSG("grab_cache_page Failed " + "index=0x%llx\n", _LLU(index)); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); + return page; + } else { + EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", + pcol->that_locked_page->index); + *uptodate = true; + return pcol->that_locked_page; + } +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + struct page_collect *pcol = priv; + + if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { + EXOFS_DBGMSG2("index=0x%lx\n", page->index); + page_cache_release(page); + return; + } + EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", + ZERO_PAGE(0) == page ? -1 : page->index); +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + +static int write_exec(struct page_collect *pcol) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct ore_io_state *ios; + struct page_collect *pcol_copy = NULL; + int ret; + + if (!pcol->pages) + return 0; + + BUG_ON(pcol->ios); + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, + pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->length, &pcol->ios); + if (unlikely(ret)) + goto err; + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + + ios = pcol->ios; + ios->pages = pcol_copy->pages; + ios->done = writepages_done; + ios->r4w = &_r4w_op; + ios->private = pcol_copy; + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + + ret = ore_write(ios); + if (unlikely(ret)) { + EXOFS_ERR("write_exec: ore_write() Failed\n"); + goto err; + } + + atomic_inc(&pcol->sbi->s_curr_pending); + return 0; + +err: + if (!pcol_copy) /* Failed before ownership transfer */ + pcol_copy = pcol; + _unlock_pcol_pages(pcol_copy, ret, WRITE); + pcol_free(pcol_copy); + kfree(pcol_copy); + + return ret; +} + +/* writepage_strip is called either directly from writepage() or by the VFS from + * within write_cache_pages(), to add one more page to be written to storage. + * It will try to collect as many contiguous pages as possible. If a + * discontinuity is encountered or it runs out of resources it will submit the + * previous segment and will start a new collection. + * Eventually caller must submit the last segment if present. + */ +static int writepage_strip(struct page *page, + struct writeback_control *wbc_unused, void *data) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + BUG_ON(!PageLocked(page)); + + ret = wait_obj_created(oi); + if (unlikely(ret)) + goto fail; + + if (page->index < end_index) + /* in this case, the page is within the limits of the file */ + len = PAGE_CACHE_SIZE; + else { + len = i_size & ~PAGE_CACHE_MASK; + + if (page->index > end_index || !len) { + /* in this case, the page is outside the limits + * (truncate in progress) + */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + if (PageError(page)) + ClearPageError(page); + unlock_page(page); + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " + "outside the limits\n", + inode->i_ino, page->index); + return 0; + } + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", + inode->i_ino, page->index); + goto try_again; + } + + if (!pcol->pages) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (unlikely(ret)) { + EXOFS_DBGMSG2("Failed pcol_add_page " + "nr_pages=%u total_length=0x%lx\n", + pcol->nr_pages, pcol->length); + + /* split the request, next loop will start again */ + ret = write_exec(pcol); + if (unlikely(ret)) { + EXOFS_DBGMSG("write_exec failed => %d", ret); + goto fail; + } + + goto try_again; + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + return 0; + +fail: + EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", + inode->i_ino, page->index, ret); + set_bit(AS_EIO, &page->mapping->flags); + unlock_page(page); + return ret; +} + +static int exofs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct page_collect pcol; + long start, end, expected_pages; + int ret; + + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = (wbc->range_end == LLONG_MAX) ? + start + mapping->nrpages : + wbc->range_end >> PAGE_CACHE_SHIFT; + + if (start || end) + expected_pages = end - start + 1; + else + expected_pages = mapping->nrpages; + + if (expected_pages < 32L) + expected_pages = 32L; + + EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " + "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", + mapping->host->i_ino, wbc->range_start, wbc->range_end, + mapping->nrpages, start, end, expected_pages); + + _pcol_init(&pcol, expected_pages, mapping->host); + + ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); + if (unlikely(ret)) { + EXOFS_ERR("write_cache_pages => %d\n", ret); + return ret; + } + + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; +} + +/* +static int exofs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + ret = writepage_strip(page, NULL, &pcol); + if (ret) { + EXOFS_ERR("exofs_writepage => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} +*/ +/* i_mutex held using inode->i_size directly */ +static void _write_failed(struct inode *inode, loff_t to) +{ + if (to > inode->i_size) + truncate_pagecache(inode, inode->i_size); +} + +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret = 0; + struct page *page; + + page = *pagep; + if (page == NULL) { + ret = simple_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); + if (ret) { + EXOFS_DBGMSG("simple_write_begin failed\n"); + goto out; + } + + page = *pagep; + } + + /* read modify write */ + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + loff_t i_size = i_size_read(mapping->host); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t rlen; + + if (page->index < end_index) + rlen = PAGE_CACHE_SIZE; + else if (page->index == end_index) + rlen = i_size & ~PAGE_CACHE_MASK; + else + rlen = 0; + + if (!rlen) { + clear_highpage(page); + SetPageUptodate(page); + goto out; + } + + ret = _readpage(page, true); + if (ret) { + /*SetPageError was done by _readpage. Is it ok?*/ + unlock_page(page); + EXOFS_DBGMSG("__readpage failed\n"); + } + } +out: + if (unlikely(ret)) + _write_failed(mapping->host, pos + len); + + return ret; +} + +static int exofs_write_begin_export(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + + return exofs_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); +} + +static int exofs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + /* According to comment in simple_write_end i_mutex is held */ + loff_t i_size = inode->i_size; + int ret; + + ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); + if (unlikely(ret)) + _write_failed(inode, pos + len); + + /* TODO: once simple_write_end marks inode dirty remove */ + if (i_size != inode->i_size) + mark_inode_dirty(inode); + return ret; +} + +static int exofs_releasepage(struct page *page, gfp_t gfp) +{ + EXOFS_DBGMSG("page 0x%lx\n", page->index); + WARN_ON(1); + return 0; +} + +static void exofs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", + page->index, offset, length); + WARN_ON(1); +} + + + /* TODO: Should be easy enough to do proprly */ +static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + return 0; +} + +const struct address_space_operations exofs_aops = { + .readpage = exofs_readpage, + .readpages = exofs_readpages, + .writepage = NULL, + .writepages = exofs_writepages, + .write_begin = exofs_write_begin_export, + .write_end = exofs_write_end, + .releasepage = exofs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .invalidatepage = exofs_invalidatepage, + + /* Not implemented Yet */ + .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ + .direct_IO = exofs_direct_IO, + + /* With these NULL has special meaning or default is not exported */ + .migratepage = NULL, + .launder_page = NULL, + .is_partially_uptodate = NULL, + .error_remove_page = NULL, +}; + +/****************************************************************************** + * INODE OPERATIONS + *****************************************************************************/ + +/* + * Test whether an inode is a fast symlink. + */ +static inline int exofs_inode_is_fast_symlink(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + + return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); +} + +static int _do_truncate(struct inode *inode, loff_t newsize) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + int ret; + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); + if (likely(!ret)) + truncate_setsize(inode, newsize); + + EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", + inode->i_ino, newsize, ret); + return ret; +} + +/* + * Set inode attributes - update size attribute on OSD if needed, + * otherwise just call generic functions. + */ +int exofs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int error; + + /* if we are about to modify an object, and it hasn't been + * created yet, wait + */ + error = wait_obj_created(exofs_i(inode)); + if (unlikely(error)) + return error; + + error = inode_change_ok(inode, iattr); + if (unlikely(error)) + return error; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + error = _do_truncate(inode, iattr->ia_size); + if (unlikely(error)) + return error; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + return 0; +} + +static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_FILE_LAYOUT, + 0); +static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DIR_LAYOUT, + 0); + +/* + * Read the Linux inode info from the OSD, and return it as is. In exofs the + * inode info is in an application specific page/attribute of the osd-object. + */ +static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, + struct exofs_fcb *inode) +{ + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_attr attrs[] = { + [0] = g_attr_inode_data, + [1] = g_attr_inode_file_layout, + [2] = g_attr_inode_dir_layout, + }; + struct ore_io_state *ios; + struct exofs_on_disk_inode_layout *layout; + int ret; + + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); + return ret; + } + + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = ore_read(ios); + if (unlikely(ret)) { + EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", + _LLU(oi->one_comp.obj.id), ret); + memset(inode, 0, sizeof(*inode)); + inode->i_mode = 0040000 | (0777 & ~022); + /* If object is lost on target we might as well enable it's + * delete. + */ + ret = 0; + goto out; + } + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (ret) { + EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); + goto out; + } + WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); + memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); + + ret = extract_attr_from_ios(ios, &attrs[1]); + if (ret) { + EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); + goto out; + } + if (attrs[1].len) { + layout = attrs[1].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported files layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + ret = extract_attr_from_ios(ios, &attrs[2]); + if (ret) { + EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); + goto out; + } + if (attrs[2].len) { + layout = attrs[2].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported meta-data layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + +out: + ore_put_io_state(ios); + return ret; +} + +static void __oi_init(struct exofs_i_info *oi) +{ + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; +} +/* + * Fill in an inode read from the OSD and set it up for use + */ +struct inode *exofs_iget(struct super_block *sb, unsigned long ino) +{ + struct exofs_i_info *oi; + struct exofs_fcb fcb; + struct inode *inode; + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + oi = exofs_i(inode); + __oi_init(oi); + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, + exofs_oi_objno(oi)); + + /* read the inode from the osd */ + ret = exofs_get_inode(sb, oi, &fcb); + if (ret) + goto bad_inode; + + set_obj_created(oi); + + /* copy stuff from on-disk struct to in-memory struct */ + inode->i_mode = le16_to_cpu(fcb.i_mode); + i_uid_write(inode, le32_to_cpu(fcb.i_uid)); + i_gid_write(inode, le32_to_cpu(fcb.i_gid)); + set_nlink(inode, le16_to_cpu(fcb.i_links_count)); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); + inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); + inode->i_ctime.tv_nsec = + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; + oi->i_commit_size = le64_to_cpu(fcb.i_size); + i_size_write(inode, oi->i_commit_size); + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_generation = le32_to_cpu(fcb.i_generation); + + oi->i_dir_start_lookup = 0; + + if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (fcb.i_data[0]) + inode->i_rdev = + old_decode_dev(le32_to_cpu(fcb.i_data[0])); + else + inode->i_rdev = + new_decode_dev(le32_to_cpu(fcb.i_data[1])); + } else { + memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); + } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (exofs_inode_is_fast_symlink(inode)) + inode->i_op = &exofs_fast_symlink_inode_operations; + else { + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + } + } else { + inode->i_op = &exofs_special_inode_operations; + if (fcb.i_data[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(fcb.i_data[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(fcb.i_data[1]))); + } + + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi) +{ + if (!obj_created(oi)) { + EXOFS_DBGMSG("!obj_created\n"); + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + EXOFS_DBGMSG("wait_event done\n"); + } + return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; +} + +/* + * Callback function from exofs_new_inode(). The important thing is that we + * set the obj_created flag so that other methods know that the object exists on + * the OSD. + */ +static void create_done(struct ore_io_state *ios, void *p) +{ + struct inode *inode = p; + struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + int ret; + + ret = ore_check_io(ios, NULL); + ore_put_io_state(ios); + + atomic_dec(&sbi->s_curr_pending); + + if (unlikely(ret)) { + EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", + _LLU(exofs_oi_objno(oi)), + _LLU(oi->one_comp.obj.partition)); + /*TODO: When FS is corrupted creation can fail, object already + * exist. Get rid of this asynchronous creation, if exist + * increment the obj counter and try the next object. Until we + * succeed. All these dangling objects will be made into lost + * files by chkfs.exofs + */ + } + + set_obj_created(oi); + + wake_up(&oi->i_wq); +} + +/* + * Set up a new inode and create an object for it on the OSD + */ +struct inode *exofs_new_inode(struct inode *dir, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct inode *inode; + struct exofs_i_info *oi; + struct ore_io_state *ios; + int ret; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + oi = exofs_i(inode); + __oi_init(oi); + + set_obj_2bcreated(oi); + + inode_init_owner(inode, dir, mode); + inode->i_ino = sbi->s_nextid++; + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + oi->i_commit_size = inode->i_size = 0; + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, + exofs_oi_objno(oi)); + exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ + + mark_inode_dirty(inode); + + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); + if (unlikely(ret)) { + EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); + return ERR_PTR(ret); + } + + ios->done = create_done; + ios->private = inode; + + ret = ore_create(ios); + if (ret) { + ore_put_io_state(ios); + return ERR_PTR(ret); + } + atomic_inc(&sbi->s_curr_pending); + + return inode; +} + +/* + * struct to pass two arguments to update_inode's callback + */ +struct updatei_args { + struct exofs_sb_info *sbi; + struct exofs_fcb fcb; +}; + +/* + * Callback function from exofs_update_inode(). + */ +static void updatei_done(struct ore_io_state *ios, void *p) +{ + struct updatei_args *args = p; + + ore_put_io_state(ios); + + atomic_dec(&args->sbi->s_curr_pending); + + kfree(args); +} + +/* + * Write the inode to the OSD. Just fill up the struct, and set the attribute + * synchronously or asynchronously depending on the do_sync flag. + */ +static int exofs_update_inode(struct inode *inode, int do_sync) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct ore_io_state *ios; + struct osd_attr attr; + struct exofs_fcb *fcb; + struct updatei_args *args; + int ret; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) { + EXOFS_DBGMSG("Failed kzalloc of args\n"); + return -ENOMEM; + } + + fcb = &args->fcb; + + fcb->i_mode = cpu_to_le16(inode->i_mode); + fcb->i_uid = cpu_to_le32(i_uid_read(inode)); + fcb->i_gid = cpu_to_le32(i_gid_read(inode)); + fcb->i_links_count = cpu_to_le16(inode->i_nlink); + fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + oi->i_commit_size = i_size_read(inode); + fcb->i_size = cpu_to_le64(oi->i_commit_size); + fcb->i_generation = cpu_to_le32(inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + fcb->i_data[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + fcb->i_data[1] = 0; + } else { + fcb->i_data[0] = 0; + fcb->i_data[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + fcb->i_data[2] = 0; + } + } else + memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); + + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); + goto free_args; + } + + attr = g_attr_inode_data; + attr.val_ptr = fcb; + ios->out_attr_len = 1; + ios->out_attr = &attr; + + wait_obj_created(oi); + + if (!do_sync) { + args->sbi = sbi; + ios->done = updatei_done; + ios->private = args; + } + + ret = ore_write(ios); + if (!do_sync && !ret) { + atomic_inc(&sbi->s_curr_pending); + goto out; /* deallocation in updatei_done */ + } + + ore_put_io_state(ios); +free_args: + kfree(args); +out: + EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", + inode->i_ino, do_sync, ret); + return ret; +} + +int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */ + return exofs_update_inode(inode, 1); +} + +/* + * Callback function from exofs_delete_inode() - don't have much cleaning up to + * do. + */ +static void delete_done(struct ore_io_state *ios, void *p) +{ + struct exofs_sb_info *sbi = p; + + ore_put_io_state(ios); + + atomic_dec(&sbi->s_curr_pending); +} + +/* + * Called when the refcount of an inode reaches zero. We remove the object + * from the OSD here. We make sure the object was created before we try and + * delete it. + */ +void exofs_evict_inode(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct ore_io_state *ios; + int ret; + + truncate_inode_pages_final(&inode->i_data); + + /* TODO: should do better here */ + if (inode->i_nlink || is_bad_inode(inode)) + goto no_delete; + + inode->i_size = 0; + clear_inode(inode); + + /* if we are deleting an obj that hasn't been created yet, wait. + * This also makes sure that create_done cannot be called with an + * already evicted inode. + */ + wait_obj_created(oi); + /* ignore the error, attempt a remove anyway */ + + /* Now Remove the OSD objects */ + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); + return; + } + + ios->done = delete_done; + ios->private = sbi; + + ret = ore_remove(ios); + if (ret) { + EXOFS_ERR("%s: ore_remove failed\n", __func__); + ore_put_io_state(ios); + return; + } + atomic_inc(&sbi->s_curr_pending); + + return; + +no_delete: + clear_inode(inode); +} diff --git a/kernel/fs/exofs/namei.c b/kernel/fs/exofs/namei.c new file mode 100644 index 000000000..5ae25e431 --- /dev/null +++ b/kernel/fs/exofs/namei.c @@ -0,0 +1,320 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = exofs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > EXOFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = exofs_inode_by_name(dir, dentry); + inode = ino ? exofs_iget(dir->i_sb, ino) : NULL; + return d_splice_alias(inode, dentry); +} + +static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode = exofs_new_inode(dir, mode); + int err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + struct inode *inode; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + inode = exofs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct super_block *sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode *inode; + struct exofs_i_info *oi; + + if (l > sb->s_blocksize) + goto out; + + inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + oi = exofs_i(inode); + if (l > sizeof(oi->i_data)) { + /* slow symlink */ + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + memset(oi->i_data, 0, sizeof(oi->i_data)); + + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &exofs_fast_symlink_inode_operations; + memcpy(oi->i_data, symname, l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = exofs_add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int exofs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + ihold(inode); + + return exofs_add_nondir(dentry, inode); +} + +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + int err; + + inode_inc_link_count(dir); + + inode = exofs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + + inode_inc_link_count(inode); + + err = exofs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = exofs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int exofs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct exofs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = exofs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + err = exofs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int exofs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int err = -ENOTEMPTY; + + if (exofs_empty_dir(inode)) { + err = exofs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct page *dir_page = NULL; + struct exofs_dir_entry *dir_de = NULL; + struct page *old_page; + struct exofs_dir_entry *old_de; + int err = -ENOENT; + + old_de = exofs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = exofs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct exofs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !exofs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = exofs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + err = exofs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + if (err) + goto out_dir; + } else { + err = exofs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + old_inode->i_ctime = CURRENT_TIME; + + exofs_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + if (err) + goto out_dir; + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations exofs_dir_inode_operations = { + .create = exofs_create, + .lookup = exofs_lookup, + .link = exofs_link, + .unlink = exofs_unlink, + .symlink = exofs_symlink, + .mkdir = exofs_mkdir, + .rmdir = exofs_rmdir, + .mknod = exofs_mknod, + .rename = exofs_rename, + .setattr = exofs_setattr, +}; + +const struct inode_operations exofs_special_inode_operations = { + .setattr = exofs_setattr, +}; diff --git a/kernel/fs/exofs/ore.c b/kernel/fs/exofs/ore.c new file mode 100644 index 000000000..7bd8ac8df --- /dev/null +++ b/kernel/fs/exofs/ore.c @@ -0,0 +1,1164 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include + +#include "ore_raid.h" + +MODULE_AUTHOR("Boaz Harrosh "); +MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); +MODULE_LICENSE("GPL"); + +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; + +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + + switch (layout->raid_algorithm) { + case PNFS_OSD_RAID_0: + layout->parity = 0; + break; + case PNFS_OSD_RAID_5: + layout->parity = 1; + break; + case PNFS_OSD_RAID_PQ: + layout->parity = 2; + break; + case PNFS_OSD_RAID_4: + default: + ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", + layout->raid_algorithm); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } + + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } + + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + (layout->group_width - layout->parity); + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; + + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } + ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); + + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); + +static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) +{ + return ios->oc->comps[index & ios->oc->single_comp].cred; +} + +static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) +{ + return &ios->oc->comps[index & ios->oc->single_comp].obj; +} + +static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) +{ + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + + return ore_comp_dev(ios->oc, index); +} + +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ? _aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } + + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; + } + + ios->layout = layout; + ios->oc = oc; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) + * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, + bool is_reading, u64 offset, u64 length, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; + int ret; + + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } + } + + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); + if (unlikely(ret)) + return ret; + + ios = *pios; + ios->reading = is_reading; + ios->offset = offset; + + if (length) { + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; + ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + + ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); + } + + return 0; +} +EXPORT_SYMBOL(ore_get_rw_state); + +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, + struct ore_io_state **pios) +{ + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); +} +EXPORT_SYMBOL(ore_get_io_state); + +void ore_put_io_state(struct ore_io_state *ios) +{ + if (ios) { + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + + if (per_dev->or) + osd_end_request(per_dev->or); + if (per_dev->bio) + bio_put(per_dev->bio); + } + + _ore_free_raid_stuff(ios); + kfree(ios); + } +} +EXPORT_SYMBOL(ore_put_io_state); + +static void _sync_done(struct ore_io_state *ios, void *p) +{ + struct completion *waiting = p; + + complete(waiting); +} + +static void _last_io(struct kref *kref) +{ + struct ore_io_state *ios = container_of( + kref, struct ore_io_state, kref); + + ios->done(ios, ios->private); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct ore_io_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +int ore_io_execute(struct ore_io_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + bool sync = (ios->done == NULL); + int i, ret; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL); + if (unlikely(ret)) { + ORE_DBGMSG("Failed to osd_finalize_request() => %d\n", + ret); + return ret; + } + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + ret = 0; + + if (sync) { + wait_for_completion(&wait); + ret = ore_check_io(ios, NULL); + } + return ret; +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + bio_for_each_segment_all(bv, bio, i) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) +{ + enum osd_err_priority acumulated_osd_err = 0; + int acumulated_lin_err = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; + int ret; + + if (unlikely(!or)) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && + per_dev->bio) { + /* start read offset passed endof file. + * Note: if we do not have bio it means read-attributes + * In this case we should return error to caller. + */ + _clear_bio(per_dev->bio); + ORE_DBGMSG("start read offset passed end of file " + "offset=0x%llx, length=0x%llx\n", + _LLU(per_dev->offset), + _LLU(per_dev->length)); + + continue; /* we recovered */ + } + + if (on_dev_error) { + u64 residual = ios->reading ? + or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + unsigned dev = per_dev->dev - ios->oc->first_dev; + struct ore_dev *od = ios->oc->ods[dev]; + + on_dev_error(ios, od, dev, osi.osd_err_pri, + offset, residual); + } + if (osi.osd_err_pri >= acumulated_osd_err) { + acumulated_osd_err = osi.osd_err_pri; + acumulated_lin_err = ret; + } + } + + return acumulated_lin_err; +} +EXPORT_SYMBOL(ore_check_io); + +/* + * L - logical offset into the file + * + * D - number of Data devices + * D = group_width - parity + * + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D + * + * T - The number of bytes striped within a group of component objects + * (before advancing to the next group) + * T = U * group_depth + * + * S - The number of bytes striped across all component objects + * before the pattern repeats + * S = T * group_count + * + * M - The "major" (i.e., across all components) cycle number + * M = L / S + * + * G - Counts the groups from the beginning of the major cycle + * G = (L - (M * S)) / T [or (L % S) / T] + * + * H - The byte offset within the group + * H = (L - (M * S)) % T [or (L % S) % T] + * + * N - The "minor" (i.e., across the group) stripe number + * N = H / U + * + * C - The component index coresponding to L + * + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] + * + * O - The component offset coresponding to L + * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) + */ +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + u64 length, struct ore_striping_info *si) +{ + u32 stripe_unit = layout->stripe_unit; + u32 group_width = layout->group_width; + u64 group_depth = layout->group_depth; + u32 parity = layout->parity; + + u32 D = group_width - parity; + u32 U = D * stripe_unit; + u64 T = U * group_depth; + u64 S = T * layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodS = file_offset - M * S; + u32 G = div64_u64(LmodS, T); + u64 H = LmodS - G * T; + + u32 N = div_u64(H, U); + u32 Nlast; + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; + u32 first_dev = C - C % group_width; + + div_u64_rem(file_offset, stripe_unit, &si->unit_off); + + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + si->cur_comp = C - first_dev; + si->cur_pg = si->unit_off / PAGE_SIZE; + + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + group_width + C - RxP) % + group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; + + Nlast = div_u64(H + si->length + U - 1, U); + si->maxdevUnits = Nlast - N; + + si->M = M; +} +EXPORT_SYMBOL(ore_calc_stripe_info); + +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) +{ + unsigned pg = *cur_pg; + struct request_queue *q = + osd_request_queue(_ios_od(ios, per_dev->dev)); + unsigned len = cur_len; + int ret; + + if (per_dev->bio == NULL) { + unsigned bio_size; + + if (!ios->reading) { + bio_size = ios->si.maxdevUnits; + } else { + bio_size = (ios->si.maxdevUnits + 1) * + (ios->layout->group_width - ios->layout->parity) / + ios->layout->group_width; + } + bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); + + per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + bio_size); + ret = -ENOMEM; + goto out; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], + pglen, pgbase); + if (unlikely(pglen != added_len)) { + /* If bi_vcnt == bi_max then this is a SW BUG */ + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " + "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", + per_dev->bio->bi_vcnt, + per_dev->bio->bi_max_vecs, + BIO_MAX_PAGES_KMALLOC, cur_len); + ret = -ENOMEM; + goto out; + } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + per_dev->length += len; + *cur_pg = pg; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the oposite is fatal. + */ + return ret; +} + +static int _add_parity_units(struct ore_io_state *ios, + struct ore_striping_info *si, + unsigned dev, unsigned first_dev, + unsigned mirrors_p1, unsigned devs_in_group, + unsigned cur_len) +{ + unsigned do_parity; + int ret = 0; + + for (do_parity = ios->layout->parity; do_parity; --do_parity) { + struct ore_per_dev_state *per_dev; + + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length && !per_dev->offset) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, + do_parity == 1); + if (unlikely(ret)) + break; + + if (do_parity != 1) { + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % + ios->layout->group_width; + } + } + + return ret; +} + +static int _prepare_for_striping(struct ore_io_state *ios) +{ + struct ore_striping_info *si = &ios->si; + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; + int ret = 0; + + if (!ios->pages) { + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->length); + + while (length) { + struct ore_per_dev_state *per_dev = + &ios->per_dev[dev - first_dev]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length && !per_dev->offset) { + /* First time initialize the per_dev info. */ + per_dev->dev = dev; + if (dev == si->dev) { + WARN_ON(dev == si->par_dev); + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && (page_off != ios->pgbase)); + } else { + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); + if (unlikely(ret)) + goto out; + + length -= cur_len; + + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + /* If last stripe operate on parity comp */ + si->cur_comp = group_width - ios->layout->parity; + } + + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + ret = _add_parity_units(ios, si, dev, first_dev, + mirrors_p1, devs_in_group, + ios->sp2d ? length : cur_len); + if (unlikely(ret)) + goto out; + + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + si->cur_pg = 0; + si->obj_offset += cur_len; + si->unit_off = 0; + } + } +out: + ios->numdevs = devs_in_group; + ios->pages_consumed = cur_pg; + return ret; +} + +int ore_create(struct ore_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->oc->numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_create_object(or, _ios_obj(ios, i)); + } + ret = ore_io_execute(ios); + +out: + return ret; +} +EXPORT_SYMBOL(ore_create); + +int ore_remove(struct ore_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->oc->numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_remove_object(or, _ios_obj(ios, i)); + } + ret = ore_io_execute(ios); + +out: + return ret; +} +EXPORT_SYMBOL(ore_remove); + +static int _write_mirror(struct ore_io_state *ios, int cur_comp) +{ + struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret = 0; + + if (ios->pages && !master_dev->length) + return 0; /* Just an empty slot */ + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + per_dev->or = or; + + if (ios->pages) { + struct bio *bio; + + if (per_dev != master_dev) { + bio = bio_clone_kmalloc(master_dev->bio, + GFP_KERNEL); + if (unlikely(!bio)) { + ORE_DBGMSG( + "Failed to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto out; + } + + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->offset = master_dev->offset; + per_dev->length = master_dev->length; + per_dev->bio = bio; + per_dev->dev = dev; + } else { + bio = master_dev->bio; + /* FIXME: bio_set_dir() */ + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, _ios_obj(ios, cur_comp), + per_dev->offset, bio, per_dev->length); + ORE_DBGMSG("write(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(_ios_obj(ios, cur_comp)->id), + _LLU(per_dev->offset), + _LLU(per_dev->length), dev); + } else if (ios->kern_buff) { + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp), + per_dev->offset, + ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; + ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(_ios_obj(ios, cur_comp)->id), + _LLU(per_dev->offset), + _LLU(ios->length), per_dev->dev); + } else { + osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); + ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", + _LLU(_ios_obj(ios, cur_comp)->id), + ios->out_attr_len, dev); + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + +out: + return ret; +} + +int ore_write(struct ore_io_state *ios) +{ + int i; + int ret; + + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. + */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _write_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios); + return ret; +} +EXPORT_SYMBOL(ore_write); + +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) +{ + struct osd_request *or; + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_obj_id *obj = _ios_obj(ios, cur_comp); + unsigned first_dev = (unsigned)obj->id; + + if (ios->pages && !per_dev->length) + return 0; /* Just an empty slot */ + + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; + or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + if (ios->pages) { + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d sg_len=%d\n", _LLU(obj->id), + _LLU(per_dev->offset), _LLU(per_dev->length), + first_dev, per_dev->cur_sg); + } else { + BUG_ON(ios->kern_buff); + + osd_req_get_attributes(or, obj); + ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(obj->id), + ios->in_attr_len, first_dev); + } + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); + + return 0; +} + +int ore_read(struct ore_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _ore_read_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios); + return ret; +} +EXPORT_SYMBOL(ore_read); + +int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(ios->per_dev[0].or, + &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} +EXPORT_SYMBOL(extract_attr_from_ios); + +static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, + struct osd_attr *attr) +{ + int last_comp = cur_comp + ios->layout->mirrors_p1; + + for (; cur_comp < last_comp; ++cur_comp) { + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); + osd_req_add_set_attr_list(or, attr, 1); + } + + return 0; +} + +struct _trunc_info { + struct ore_striping_info si; + u64 prev_group_obj_off; + u64 next_group_obj_off; + + unsigned first_group_dev; + unsigned nex_group_dev; +}; + +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) +{ + unsigned stripe_unit = layout->stripe_unit; + + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); + + ti->prev_group_obj_off = ti->si.M * stripe_unit; + ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; + + ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); + ti->nex_group_dev = ti->first_group_dev + layout->group_width; +} + +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, + u64 size) +{ + struct ore_io_state *ios; + struct exofs_trunc_attr { + struct osd_attr attr; + __be64 newsize; + } *size_attrs; + struct _trunc_info ti; + int i, ret; + + ret = ore_get_io_state(layout, oc, &ios); + if (unlikely(ret)) + return ret; + + _calc_trunk_info(ios->layout, size, &ti); + + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), + GFP_KERNEL); + if (unlikely(!size_attrs)) { + ret = -ENOMEM; + goto out; + } + + ios->numdevs = ios->oc->numdevs; + + for (i = 0; i < ios->numdevs; ++i) { + struct exofs_trunc_attr *size_attr = &size_attrs[i]; + u64 obj_size; + + if (i < ti.first_group_dev) + obj_size = ti.prev_group_obj_off; + else if (i >= ti.nex_group_dev) + obj_size = ti.next_group_obj_off; + else if (i < ti.si.dev) /* dev within this group */ + obj_size = ti.si.obj_offset + + ios->layout->stripe_unit - ti.si.unit_off; + else if (i == ti.si.dev) + obj_size = ti.si.obj_offset; + else /* i > ti.dev */ + obj_size = ti.si.obj_offset - ti.si.unit_off; + + size_attr->newsize = cpu_to_be64(obj_size); + size_attr->attr = g_attr_logical_length; + size_attr->attr.val_ptr = &size_attr->newsize; + + ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + _LLU(oc->comps->obj.id), _LLU(obj_size), i); + ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, + &size_attr->attr); + if (unlikely(ret)) + goto out; + } + ret = ore_io_execute(ios); + +out: + kfree(size_attrs); + ore_put_io_state(ios); + return ret; +} +EXPORT_SYMBOL(ore_truncate); + +const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); +EXPORT_SYMBOL(g_attr_logical_length); diff --git a/kernel/fs/exofs/ore_raid.c b/kernel/fs/exofs/ore_raid.c new file mode 100644 index 000000000..27cbdb697 --- /dev/null +++ b/kernel/fs/exofs/ore_raid.c @@ -0,0 +1,721 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation " + */ + +#include +#include + +#include "ore_raid.h" + +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + +static struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +static void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + int p, c; + + if (!sp2d->needed) + return; + + for (c = data_devs - 1; c >= 0; --c) + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + int p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + unsigned tx_flags = ASYNC_TX_ACK; + + if (sp2d->parity == 1) + tx_flags |= ASYNC_TX_XOR_ZERO_DST; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, tx_flags, + NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); + + if (sp2d->parity == 1) + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], + _1ps->pages, 0, sp2d->data_devs, + PAGE_SIZE, &_1ps->submit); + else /* parity == 2 */ + _1ps->tx = async_gen_syndrome(_1ps->pages, 0, + sp2d->data_devs + sp2d->parity, + PAGE_SIZE, &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? + */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, + struct page *page, unsigned pg_len) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, + si->obj_offset % PAGE_SIZE); + if (unlikely(added_len != pg_len)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += pg_len; + return 0; +} + +/* read the beginning of an unaligned first page */ +static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) +{ + struct ore_striping_info si; + unsigned pg_len; + + ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); + + pg_len = si.obj_offset % PAGE_SIZE; + si.obj_offset -= pg_len; + + ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", + _LLU(si.obj_offset), pg_len, page->index, si.dev); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +/* read the end of an incomplete last page */ +static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) +{ + struct ore_striping_info si; + struct page *page; + unsigned pg_len, p, c; + + ore_calc_stripe_info(ios->layout, *offset, 0, &si); + + p = si.cur_pg; + c = si.cur_comp; + page = ios->sp2d->_1p_stripes[p].pages[c]; + + pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); + *offset += pg_len; + + ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", + p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); + + BUG_ON(!page); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is + * @uptodate=true, so we don't need to read it, only unlock, after IO. + * + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then + * to-be-written count, we should consider the xor-in-place mode. + * need_to_read_pages_count is the actual number of pages not present in cache. + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough + * approximation? In this mode the read pages are put in the empty places of + * ios->sp2d[p][*], xor is calculated the same way. These pages are + * allocated/freed and don't go through cache + */ +static int _read_4_write_first_stripe(struct ore_io_state *ios) +{ + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; + + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + + ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", + offset, ios->offset, min_p, max_p); + + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; + offset += min_p * PAGE_SIZE; + for (p = min_p; p <= max_p; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + struct page **pp = &_1ps->pages[c]; + bool uptodate; + + if (*pp) { + if (ios->offset % PAGE_SIZE) + /* Read the remainder of the page */ + _add_to_r4w_first_page(ios, *pp); + /* to-be-written pages start here */ + goto read_last_stripe; + } + + *pp = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!*pp)) + return -ENOMEM; + + if (!uptodate) + _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); + + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + read_si.obj_offset += PAGE_SIZE; + offset += PAGE_SIZE; + } + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; + } + +read_last_stripe: + return 0; +} + +static int _read_4_write_last_stripe(struct ore_io_state *ios) +{ + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; + + offset = ios->offset + ios->length; + if (offset % PAGE_SIZE) + _add_to_r4w_last_page(ios, &offset); + /* offset will be aligned to next page */ + + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) + * bytes_in_stripe; + if (offset == last_stripe_end) /* Optimize for the aligned case */ + goto read_it; + + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + p = read_si.cur_pg; + c = read_si.cur_comp; + + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + + ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", + offset, last_stripe_end, min_p, max_p); + + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if ((min_p <= p) && (p <= max_p)) { + struct page *page; + bool uptodate; + + BUG_ON(_1ps->pages[c]); + page = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!page)) + return -ENOMEM; + + _1ps->pages[c] = page; + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + if (!uptodate) + _add_to_r4w(ios, &read_si, page, PAGE_SIZE); + } + + offset += PAGE_SIZE; + if (p == (sp2d->pages_in_unit - 1)) { + ++c; + p = 0; + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + } else { + read_si.obj_offset += PAGE_SIZE; + ++p; + } + } + +read_it: + return 0; +} + +static int _read_4_write_execute(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + unsigned i; + int ret; + + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; + + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change + * to check for per_dev->bio + */ + ios_read->pages = ios->pages; + + /* Now read these devices */ + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { + ret = _ore_read_mirror(ios_read, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios_read); /* Synchronus execution */ + if (unlikely(ret)) { + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); + return ret; + } + + _mark_read4write_pages_uptodate(ios_read, ret); + ore_put_io_state(ios_read); + ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ + return 0; +} + +/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len, bool do_xor) +{ + if (ios->reading) { + if (per_dev->cur_sg >= ios->sgs_per_dev) { + ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , + per_dev->cur_sg, ios->sgs_per_dev); + return -ENOMEM; + } + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct __stripe_pages_2d *sp2d = ios->sp2d; + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages; + unsigned array_start = 0; + unsigned i; + int ret; + + si->cur_pg = _sp2d_min_pg(sp2d); + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; + + if (!per_dev->length) { + per_dev->offset += si->cur_pg * PAGE_SIZE; + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. + */ + if (do_xor) + _read_4_write_first_stripe(ios); + } + if (!cur_len && do_xor) + /* If last stripe r4w pages of last stripe */ + _read_4_write_last_stripe(ios); + _read_4_write_execute(ios); + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + } + + BUG_ON(si->cur_comp < sp2d->data_devs); + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + + if (do_xor) { + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + if (ios->parity_pages) { + struct ore_layout *layout = ios->layout; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } + } + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->sp2d) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + /* If IO returned an error pages might need unlocking */ + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); + _sp2d_free(ios->sp2d); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } + if (ios->ios_read_4_write) + ore_put_io_state(ios->ios_read_4_write); +} diff --git a/kernel/fs/exofs/ore_raid.h b/kernel/fs/exofs/ore_raid.h new file mode 100644 index 000000000..a6e746775 --- /dev/null +++ b/kernel/fs/exofs/ore_raid.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation " + */ + +#include + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len, + bool do_xor); +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page); +static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + if (!sp2d) /* Inline the fast path */ + return; /* Hay no raid stuff */ + _ore_add_stripe_page(sp2d, si, page); +} + +/* ios.c stuff needed by ios_raid.c */ +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios); +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); +int ore_io_execute(struct ore_io_state *ios); diff --git a/kernel/fs/exofs/super.c b/kernel/fs/exofs/super.c new file mode 100644 index 000000000..b795c567b --- /dev/null +++ b/kernel/fs/exofs/super.c @@ -0,0 +1,1049 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "exofs.h" + +#define EXOFS_DBGMSG2(M...) do {} while (0) + +/****************************************************************************** + * MOUNT OPTIONS + *****************************************************************************/ + +/* + * struct to hold what we get from mount options + */ +struct exofs_mountopt { + bool is_osdname; + const char *dev_name; + uint64_t pid; + int timeout; +}; + +/* + * exofs-specific mount-time options. + */ +enum { Opt_name, Opt_pid, Opt_to, Opt_err }; + +/* + * Our mount-time options. These should ideally be 64-bit unsigned, but the + * kernel's parsing functions do not currently support that. 32-bit should be + * sufficient for most applications now. + */ +static match_table_t tokens = { + {Opt_name, "osdname=%s"}, + {Opt_pid, "pid=%u"}, + {Opt_to, "to=%u"}, + {Opt_err, NULL} +}; + +/* + * The main option parsing method. Also makes sure that all of the mandatory + * mount options were set. + */ +static int parse_options(char *options, struct exofs_mountopt *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + bool s_pid = false; + + EXOFS_DBGMSG("parse_options %s\n", options); + /* defaults */ + memset(opts, 0, sizeof(*opts)); + opts->timeout = BLK_DEFAULT_SG_TIMEOUT; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + char str[32]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_name: + opts->dev_name = match_strdup(&args[0]); + if (unlikely(!opts->dev_name)) { + EXOFS_ERR("Error allocating dev_name"); + return -ENOMEM; + } + opts->is_osdname = true; + break; + case Opt_pid: + if (0 == match_strlcpy(str, &args[0], sizeof(str))) + return -EINVAL; + opts->pid = simple_strtoull(str, NULL, 0); + if (opts->pid < EXOFS_MIN_PID) { + EXOFS_ERR("Partition ID must be >= %u", + EXOFS_MIN_PID); + return -EINVAL; + } + s_pid = 1; + break; + case Opt_to: + if (match_int(&args[0], &option)) + return -EINVAL; + if (option <= 0) { + EXOFS_ERR("Timout must be > 0"); + return -EINVAL; + } + opts->timeout = option * HZ; + break; + } + } + + if (!s_pid) { + EXOFS_ERR("Need to specify the following options:\n"); + EXOFS_ERR(" -o pid=pid_no_to_use\n"); + return -EINVAL; + } + + return 0; +} + +/****************************************************************************** + * INODE CACHE + *****************************************************************************/ + +/* + * Our inode cache. Isn't it pretty? + */ +static struct kmem_cache *exofs_inode_cachep; + +/* + * Allocate an inode in the cache + */ +static struct inode *exofs_alloc_inode(struct super_block *sb) +{ + struct exofs_i_info *oi; + + oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + + oi->vfs_inode.i_version = 1; + return &oi->vfs_inode; +} + +static void exofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); +} + +/* + * Remove an inode from the cache + */ +static void exofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, exofs_i_callback); +} + +/* + * Initialize the inode + */ +static void exofs_init_once(void *foo) +{ + struct exofs_i_info *oi = foo; + + inode_init_once(&oi->vfs_inode); +} + +/* + * Create and initialize the inode cache + */ +static int init_inodecache(void) +{ + exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", + sizeof(struct exofs_i_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + exofs_init_once); + if (exofs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +/* + * Destroy the inode cache + */ +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(exofs_inode_cachep); +} + +/****************************************************************************** + * Some osd helpers + *****************************************************************************/ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length) +{ + struct osd_request *or = osd_start_request(od, GFP_KERNEL); +/* struct osd_sense_info osi = {.key = 0};*/ + int ret; + + if (unlikely(!or)) { + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); + return -ENOMEM; + } + ret = osd_req_read_kern(or, obj, offset, p, length); + if (unlikely(ret)) { + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); + goto out; + } + + ret = osd_finalize_request(or, 0, cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); + goto out; + } + + ret = osd_execute_request(or); + if (unlikely(ret)) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + +out: + osd_end_request(or); + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%p ret=>%d\n", + _LLU(obj->id), _LLU(offset), _LLU(length), od, ret); + return ret; +} + +static const struct osd_attr g_attr_sb_stats = ATTR_DEF( + EXOFS_APAGE_SB_DATA, + EXOFS_ATTR_SB_STATS, + sizeof(struct exofs_sb_stats)); + +static int __sbi_read_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct ore_io_state *ios; + int ret; + + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); + return ret; + } + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = ore_read(ios); + if (unlikely(ret)) { + EXOFS_ERR("Error reading super_block stats => %d\n", ret); + goto out; + } + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (ret) { + EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__); + goto out; + } + if (attrs[0].len) { + struct exofs_sb_stats *ess; + + if (unlikely(attrs[0].len != sizeof(*ess))) { + EXOFS_ERR("%s: Wrong version of exofs_sb_stats " + "size(%d) != expected(%zd)\n", + __func__, attrs[0].len, sizeof(*ess)); + goto out; + } + + ess = attrs[0].val_ptr; + sbi->s_nextid = le64_to_cpu(ess->s_nextid); + sbi->s_numfiles = le32_to_cpu(ess->s_numfiles); + } + +out: + ore_put_io_state(ios); + return ret; +} + +static void stats_done(struct ore_io_state *ios, void *p) +{ + ore_put_io_state(ios); + /* Good thanks nothing to do anymore */ +} + +/* Asynchronously write the stats attribute */ +int exofs_sbi_write_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct ore_io_state *ios; + int ret; + + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); + return ret; + } + + sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid); + sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); + attrs[0].val_ptr = &sbi->s_ess; + + + ios->done = stats_done; + ios->private = sbi; + ios->out_attr = attrs; + ios->out_attr_len = ARRAY_SIZE(attrs); + + ret = ore_write(ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_write failed.\n", __func__); + ore_put_io_state(ios); + } + + return ret; +} + +/****************************************************************************** + * SUPERBLOCK FUNCTIONS + *****************************************************************************/ +static const struct super_operations exofs_sops; +static const struct export_operations exofs_export_ops; + +/* + * Write the superblock to the OSD + */ +static int exofs_sync_fs(struct super_block *sb, int wait) +{ + struct exofs_sb_info *sbi; + struct exofs_fscb *fscb; + struct ore_comp one_comp; + struct ore_components oc; + struct ore_io_state *ios; + int ret = -ENOMEM; + + fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); + if (unlikely(!fscb)) + return -ENOMEM; + + sbi = sb->s_fs_info; + + /* NOTE: We no longer dirty the super_block anywhere in exofs. The + * reason we write the fscb here on unmount is so we can stay backwards + * compatible with fscb->s_version == 1. (What we are not compatible + * with is if a new version FS crashed and then we try to mount an old + * version). Otherwise the exofs_fscb is read-only from mkfs time. All + * the writeable info is set in exofs_sbi_write_stats() above. + */ + + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); + + ret = ore_get_io_state(&sbi->layout, &oc, &ios); + if (unlikely(ret)) + goto out; + + ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); + memset(fscb, 0, ios->length); + fscb->s_nextid = cpu_to_le64(sbi->s_nextid); + fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles); + fscb->s_magic = cpu_to_le16(sb->s_magic); + fscb->s_newfs = 0; + fscb->s_version = EXOFS_FSCB_VER; + + ios->offset = 0; + ios->kern_buff = fscb; + + ret = ore_write(ios); + if (unlikely(ret)) + EXOFS_ERR("%s: ore_write failed.\n", __func__); + +out: + EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); + ore_put_io_state(ios); + kfree(fscb); + return ret; +} + +static void _exofs_print_device(const char *msg, const char *dev_path, + struct osd_dev *od, u64 pid) +{ + const struct osd_dev_info *odi = osduld_device_info(od); + + printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n", + msg, dev_path ?: "", odi->osdname, _LLU(pid)); +} + +static void exofs_free_sbi(struct exofs_sb_info *sbi) +{ + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); + + if (od) { + ore_comp_set_dev(&sbi->oc, i, NULL); + osduld_put_device(od); + } + } + kfree(sbi->oc.ods); + kfree(sbi); +} + +/* + * This function is called when the vfs is freeing the superblock. We just + * need to free our own part. + */ +static void exofs_put_super(struct super_block *sb) +{ + int num_pend; + struct exofs_sb_info *sbi = sb->s_fs_info; + + /* make sure there are no pending commands */ + for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; + num_pend = atomic_read(&sbi->s_curr_pending)) { + wait_queue_head_t wq; + + printk(KERN_NOTICE "%s: !!Pending operations in flight. " + "This is a BUG. please report to osd-dev@open-osd.org\n", + __func__); + init_waitqueue_head(&wq); + wait_event_timeout(wq, + (atomic_read(&sbi->s_curr_pending) == 0), + msecs_to_jiffies(100)); + } + + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), + sbi->one_comp.obj.partition); + + exofs_sysfs_sb_del(sbi); + bdi_destroy(&sbi->bdi); + exofs_free_sbi(sbi); + sb->s_fs_info = NULL; +} + +static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_device_table *dt) +{ + int ret; + + sbi->layout.stripe_unit = + le64_to_cpu(dt->dt_data_map.cb_stripe_unit); + sbi->layout.group_width = + le32_to_cpu(dt->dt_data_map.cb_group_width); + sbi->layout.group_depth = + le32_to_cpu(dt->dt_data_map.cb_group_depth); + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = + le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); + + ret = ore_verify_layout(numdevs, &sbi->layout); + + EXOFS_DBGMSG("exofs: layout: " + "num_comps=%u stripe_unit=0x%x group_width=%u " + "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n", + numdevs, + sbi->layout.stripe_unit, + sbi->layout.group_width, + _LLU(sbi->layout.group_depth), + sbi->layout.mirrors_p1, + sbi->layout.raid_algorithm); + return ret; +} + +static unsigned __ra_pages(struct ore_layout *layout) +{ + const unsigned _MIN_RA = 32; /* min 128K read-ahead */ + unsigned ra_pages = layout->group_width * layout->stripe_unit / + PAGE_SIZE; + unsigned max_io_pages = exofs_max_io_pages(layout, ~0); + + ra_pages *= 2; /* two stripes */ + if (ra_pages < _MIN_RA) + ra_pages = roundup(_MIN_RA, ra_pages / 2); + + if (ra_pages > max_io_pages) + ra_pages = max_io_pages; + + return ra_pages; +} + +/* @odi is valid only as long as @fscb_dev is valid */ +static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, + struct osd_dev_info *odi) +{ + odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); + if (likely(odi->systemid_len)) + memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN); + + odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); + odi->osdname = dt_dev->osdname; + + /* FIXME support long names. Will need a _put function */ + if (dt_dev->long_name_offset) + return -EINVAL; + + /* Make sure osdname is printable! + * mkexofs should give us space for a null-terminator else the + * device-table is invalid. + */ + if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname))) + odi->osdname_len = sizeof(dt_dev->osdname) - 1; + dt_dev->osdname[odi->osdname_len] = 0; + + /* If it's all zeros something is bad we read past end-of-obj */ + return !(odi->systemid_len || odi->osdname_len); +} + +static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", + numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + +static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, + struct osd_dev *fscb_od, + unsigned table_count) +{ + struct ore_comp comp; + struct exofs_device_table *dt; + struct exofs_dev *eds; + unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + + sizeof(*dt); + unsigned numdevs, i; + int ret; + + dt = kmalloc(table_bytes, GFP_KERNEL); + if (unlikely(!dt)) { + EXOFS_ERR("ERROR: allocating %x bytes for device table\n", + table_bytes); + return -ENOMEM; + } + + sbi->oc.numdevs = 0; + + comp.obj.partition = sbi->one_comp.obj.partition; + comp.obj.id = EXOFS_DEVTABLE_ID; + exofs_make_credential(comp.cred, &comp.obj); + + ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt, + table_bytes); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: reading device table\n"); + goto out; + } + + numdevs = le64_to_cpu(dt->dt_num_devices); + if (unlikely(!numdevs)) { + ret = -EINVAL; + goto out; + } + WARN_ON(table_count != numdevs); + + ret = _read_and_match_data_map(sbi, numdevs, dt); + if (unlikely(ret)) + goto out; + + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); + + /* create sysfs subdir under which we put the device table + * And cluster layout. A Superblock is identified by the string: + * "dev[0].osdname"_"pid" + */ + exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]); + + for (i = 0; i < numdevs; i++) { + struct exofs_fscb fscb; + struct osd_dev_info odi; + struct osd_dev *od; + + if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) { + EXOFS_ERR("ERROR: Read all-zeros device entry\n"); + ret = -EINVAL; + goto out; + } + + printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", + i, odi.osdname); + + /* the exofs id is currently the table index */ + eds[i].did = i; + + /* On all devices the device table is identical. The user can + * specify any one of the participating devices on the command + * line. We always keep them in device-table order. + */ + if (fscb_od && osduld_device_same(fscb_od, &odi)) { + eds[i].ored.od = fscb_od; + ++sbi->oc.numdevs; + fscb_od = NULL; + exofs_sysfs_odev_add(&eds[i], sbi); + continue; + } + + od = osduld_info_lookup(&odi); + if (IS_ERR(od)) { + ret = PTR_ERR(od); + EXOFS_ERR("ERROR: device requested is not found " + "osd_name-%s =>%d\n", odi.osdname, ret); + goto out; + } + + eds[i].ored.od = od; + ++sbi->oc.numdevs; + + /* Read the fscb of the other devices to make sure the FS + * partition is there. + */ + ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, + sizeof(fscb)); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: Malformed participating device " + "error reading fscb osd_name-%s\n", + odi.osdname); + goto out; + } + exofs_sysfs_odev_add(&eds[i], sbi); + + /* TODO: verify other information is correct and FS-uuid + * matches. Benny what did you say about device table + * generation and old devices? + */ + } + +out: + kfree(dt); + if (unlikely(fscb_od && !ret)) { + EXOFS_ERR("ERROR: Bad device-table container device not present\n"); + osduld_put_device(fscb_od); + return -EINVAL; + } + return ret; +} + +/* + * Read the superblock from the OSD and fill in the fields + */ +static int exofs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root; + struct exofs_mountopt *opts = data; + struct exofs_sb_info *sbi; /*extended info */ + struct osd_dev *od; /* Master device */ + struct exofs_fscb fscb; /*on-disk superblock info */ + struct ore_comp comp; + unsigned table_count; + int ret; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + /* use mount options to fill superblock */ + if (opts->is_osdname) { + struct osd_dev_info odi = {.systemid_len = 0}; + + odi.osdname_len = strlen(opts->dev_name); + odi.osdname = (u8 *)opts->dev_name; + od = osduld_info_lookup(&odi); + kfree(opts->dev_name); + opts->dev_name = NULL; + } else { + od = osduld_path_lookup(opts->dev_name); + } + if (IS_ERR(od)) { + ret = -EINVAL; + goto free_sbi; + } + + /* Default layout in case we do not have a device-table */ + sbi->layout.stripe_unit = PAGE_SIZE; + sbi->layout.mirrors_p1 = 1; + sbi->layout.group_width = 1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; + sbi->s_timeout = opts->timeout; + + sbi->one_comp.obj.partition = opts->pid; + sbi->one_comp.obj.id = 0; + exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; + + /* fill in some other data by hand */ + memset(sb->s_id, 0, sizeof(sb->s_id)); + strcpy(sb->s_id, "exofs"); + sb->s_blocksize = EXOFS_BLKSIZE; + sb->s_blocksize_bits = EXOFS_BLKSHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_max_links = EXOFS_LINK_MAX; + atomic_set(&sbi->s_curr_pending, 0); + sb->s_bdev = NULL; + sb->s_dev = 0; + + comp.obj.partition = sbi->one_comp.obj.partition; + comp.obj.id = EXOFS_SUPER_ID; + exofs_make_credential(comp.cred, &comp.obj); + + ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); + if (unlikely(ret)) + goto free_sbi; + + sb->s_magic = le16_to_cpu(fscb.s_magic); + /* NOTE: we read below to be backward compatible with old versions */ + sbi->s_nextid = le64_to_cpu(fscb.s_nextid); + sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); + + /* make sure what we read from the object store is correct */ + if (sb->s_magic != EXOFS_SUPER_MAGIC) { + if (!silent) + EXOFS_ERR("ERROR: Bad magic value\n"); + ret = -EINVAL; + goto free_sbi; + } + if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) { + EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", + EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); + ret = -EINVAL; + goto free_sbi; + } + + /* start generation numbers from a random point */ + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + table_count = le64_to_cpu(fscb.s_dev_table_count); + if (table_count) { + ret = exofs_read_lookup_dev_table(sbi, od, table_count); + if (unlikely(ret)) + goto free_sbi; + } else { + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); + if (unlikely(ret)) + goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); + sbi->oc.numdevs = 1; + } + + __sbi_read_stats(sbi); + + /* set up operation vectors */ + sbi->bdi.ra_pages = __ra_pages(&sbi->layout); + sb->s_bdi = &sbi->bdi; + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); + ret = PTR_ERR(root); + goto free_sbi; + } + sb->s_root = d_make_root(root); + if (!sb->s_root) { + EXOFS_ERR("ERROR: get root inode failed\n"); + ret = -ENOMEM; + goto free_sbi; + } + + if (!S_ISDIR(root->i_mode)) { + dput(sb->s_root); + sb->s_root = NULL; + EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n", + root->i_mode); + ret = -EINVAL; + goto free_sbi; + } + + ret = bdi_setup_and_register(&sbi->bdi, "exofs"); + if (ret) { + EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); + dput(sb->s_root); + sb->s_root = NULL; + goto free_sbi; + } + + exofs_sysfs_dbg_print(); + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), + sbi->one_comp.obj.partition); + return 0; + +free_sbi: + EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", + opts->dev_name, sbi->one_comp.obj.partition, ret); + exofs_free_sbi(sbi); + return ret; +} + +/* + * Set up the superblock (calls exofs_fill_super eventually) + */ +static struct dentry *exofs_mount(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + struct exofs_mountopt opts; + int ret; + + ret = parse_options(data, &opts); + if (ret) + return ERR_PTR(ret); + + if (!opts.dev_name) + opts.dev_name = dev_name; + return mount_nodev(type, flags, &opts, exofs_fill_super); +} + +/* + * Return information about the file system state in the buffer. This is used + * by the 'df' command, for example. + */ +static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct ore_io_state *ios; + struct osd_attr attrs[] = { + ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, + OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), + ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION, + OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)), + }; + uint64_t capacity = ULLONG_MAX; + uint64_t used = ULLONG_MAX; + int ret; + + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); + if (ret) { + EXOFS_DBGMSG("ore_get_io_state failed.\n"); + return ret; + } + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = ore_read(ios); + if (unlikely(ret)) + goto out; + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (likely(!ret)) { + capacity = get_unaligned_be64(attrs[0].val_ptr); + if (unlikely(!capacity)) + capacity = ULLONG_MAX; + } else + EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); + + ret = extract_attr_from_ios(ios, &attrs[1]); + if (likely(!ret)) + used = get_unaligned_be64(attrs[1].val_ptr); + else + EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n"); + + /* fill in the stats buffer */ + buf->f_type = EXOFS_SUPER_MAGIC; + buf->f_bsize = EXOFS_BLKSIZE; + buf->f_blocks = capacity >> 9; + buf->f_bfree = (capacity - used) >> 9; + buf->f_bavail = buf->f_bfree; + buf->f_files = sbi->s_numfiles; + buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; + buf->f_namelen = EXOFS_NAME_LEN; + +out: + ore_put_io_state(ios); + return ret; +} + +static const struct super_operations exofs_sops = { + .alloc_inode = exofs_alloc_inode, + .destroy_inode = exofs_destroy_inode, + .write_inode = exofs_write_inode, + .evict_inode = exofs_evict_inode, + .put_super = exofs_put_super, + .sync_fs = exofs_sync_fs, + .statfs = exofs_statfs, +}; + +/****************************************************************************** + * EXPORT OPERATIONS + *****************************************************************************/ + +static struct dentry *exofs_get_parent(struct dentry *child) +{ + unsigned long ino = exofs_parent_ino(child); + + if (!ino) + return ERR_PTR(-ESTALE); + + return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino)); +} + +static struct inode *exofs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + inode = exofs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *exofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static struct dentry *exofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static const struct export_operations exofs_export_ops = { + .fh_to_dentry = exofs_fh_to_dentry, + .fh_to_parent = exofs_fh_to_parent, + .get_parent = exofs_get_parent, +}; + +/****************************************************************************** + * INSMOD/RMMOD + *****************************************************************************/ + +/* + * struct that describes this file system + */ +static struct file_system_type exofs_type = { + .owner = THIS_MODULE, + .name = "exofs", + .mount = exofs_mount, + .kill_sb = generic_shutdown_super, +}; +MODULE_ALIAS_FS("exofs"); + +static int __init init_exofs(void) +{ + int err; + + err = init_inodecache(); + if (err) + goto out; + + err = register_filesystem(&exofs_type); + if (err) + goto out_d; + + /* We don't fail if sysfs creation failed */ + exofs_sysfs_init(); + + return 0; +out_d: + destroy_inodecache(); +out: + return err; +} + +static void __exit exit_exofs(void) +{ + exofs_sysfs_uninit(); + unregister_filesystem(&exofs_type); + destroy_inodecache(); +} + +MODULE_AUTHOR("Avishay Traeger "); +MODULE_DESCRIPTION("exofs"); +MODULE_LICENSE("GPL"); + +module_init(init_exofs) +module_exit(exit_exofs) diff --git a/kernel/fs/exofs/symlink.c b/kernel/fs/exofs/symlink.c new file mode 100644 index 000000000..6f6f3a4c1 --- /dev/null +++ b/kernel/fs/exofs/symlink.c @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "exofs.h" + +static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct exofs_i_info *oi = exofs_i(d_inode(dentry)); + + nd_set_link(nd, (char *)oi->i_data); + return NULL; +} + +const struct inode_operations exofs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; + +const struct inode_operations exofs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = exofs_follow_link, +}; diff --git a/kernel/fs/exofs/sys.c b/kernel/fs/exofs/sys.c new file mode 100644 index 000000000..5e6a2c0a1 --- /dev/null +++ b/kernel/fs/exofs/sys.c @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2012 + * Sachin Bhamare + * Boaz Harrosh + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License 2 as published by + * the Free Software Foundation. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the: + * Free Software Foundation + */ + +#include +#include + +#include "exofs.h" + +struct odev_attr { + struct attribute attr; + ssize_t (*show)(struct exofs_dev *, char *); + ssize_t (*store)(struct exofs_dev *, const char *, size_t); +}; + +static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->show ? a->show(edp, buf) : 0; +} + +static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->store ? a->store(edp, buf, len) : len; +} + +static const struct sysfs_ops odev_attr_ops = { + .show = odev_attr_show, + .store = odev_attr_store, +}; + + +static struct kset *exofs_kset; + +static ssize_t osdname_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname); +} + +static ssize_t systemid_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + memcpy(buf, odi->systemid, odi->systemid_len); + return odi->systemid_len; +} + +static ssize_t uri_show(struct exofs_dev *edp, char *buf) +{ + return snprintf(buf, edp->urilen, "%s", edp->uri); +} + +static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) +{ + uint8_t *new_uri; + + edp->urilen = strlen(buf) + 1; + new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); + if (new_uri == NULL) + return -ENOMEM; + edp->uri = new_uri; + strncpy(edp->uri, buf, edp->urilen); + return edp->urilen; +} + +#define OSD_ATTR(name, mode, show, store) \ + static struct odev_attr odev_attr_##name = \ + __ATTR(name, mode, show, store) + +OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL); +OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL); +OSD_ATTR(uri, S_IRWXU, uri_show, uri_store); + +static struct attribute *odev_attrs[] = { + &odev_attr_osdname.attr, + &odev_attr_systemid.attr, + &odev_attr_uri.attr, + NULL, +}; + +static struct kobj_type odev_ktype = { + .default_attrs = odev_attrs, + .sysfs_ops = &odev_attr_ops, +}; + +static struct kobj_type uuid_ktype = { +}; + +void exofs_sysfs_dbg_print(void) +{ +#ifdef CONFIG_EXOFS_DEBUG + struct kobject *k_name, *k_tmp; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + printk(KERN_INFO "%s: name %s ref %d\n", + __func__, kobject_name(k_name), + (int)atomic_read(&k_name->kref.refcount)); + } +#endif +} +/* + * This function removes all kobjects under exofs_kset + * At the end of it, exofs_kset kobject will have a refcount + * of 1 which gets decremented only on exofs module unload + */ +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi) +{ + struct kobject *k_name, *k_tmp; + struct kobject *s_kobj = &sbi->s_kobj; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + /* Remove all that are children of this SBI */ + if (k_name->parent == s_kobj) + kobject_put(k_name); + } + kobject_put(s_kobj); +} + +/* + * This function creates sysfs entries to hold the current exofs cluster + * instance (uniquely identified by osdname,pid tuple). + * This function gets called once per exofs mount instance. + */ +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev) +{ + struct kobject *s_kobj; + int retval = 0; + uint64_t pid = sbi->one_comp.obj.partition; + + /* allocate new uuid dirent */ + s_kobj = &sbi->s_kobj; + s_kobj->kset = exofs_kset; + retval = kobject_init_and_add(s_kobj, &uuid_ktype, + &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval); + return -ENOMEM; + } + return 0; +} + +int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi) +{ + struct kobject *d_kobj; + int retval = 0; + + /* create osd device group which contains following attributes + * osdname, systemid & uri + */ + d_kobj = &edev->ed_kobj; + d_kobj->kset = exofs_kset; + retval = kobject_init_and_add(d_kobj, &odev_ktype, + &sbi->s_kobj, "dev%u", edev->did); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "device dev%u\n", edev->did); + return retval; + } + return 0; +} + +int exofs_sysfs_init(void) +{ + exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj); + if (!exofs_kset) { + EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n"); + return -ENOMEM; + } + return 0; +} + +void exofs_sysfs_uninit(void) +{ + kset_unregister(exofs_kset); +} diff --git a/kernel/fs/exportfs/Makefile b/kernel/fs/exportfs/Makefile new file mode 100644 index 000000000..d7c5d4ddb --- /dev/null +++ b/kernel/fs/exportfs/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the filesystem export support routines. + +obj-$(CONFIG_EXPORTFS) += exportfs.o + +exportfs-objs := expfs.o diff --git a/kernel/fs/exportfs/expfs.c b/kernel/fs/exportfs/expfs.c new file mode 100644 index 000000000..714cd37a6 --- /dev/null +++ b/kernel/fs/exportfs/expfs.c @@ -0,0 +1,544 @@ +/* + * Copyright (C) Neil Brown 2002 + * Copyright (C) Christoph Hellwig 2007 + * + * This file contains the code mapping from inodes to NFS file handles, + * and for mapping back from file handles to dentries. + * + * For details on why we do all the strange and hairy things in here + * take a look at Documentation/filesystems/nfs/Exporting. + */ +#include +#include +#include +#include +#include +#include +#include + +#define dprintk(fmt, args...) do{}while(0) + + +static int get_name(const struct path *path, char *name, struct dentry *child); + + +static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir, + char *name, struct dentry *child) +{ + const struct export_operations *nop = dir->d_sb->s_export_op; + struct path path = {.mnt = mnt, .dentry = dir}; + + if (nop->get_name) + return nop->get_name(dir, name, child); + else + return get_name(&path, name, child); +} + +/* + * Check if the dentry or any of it's aliases is acceptable. + */ +static struct dentry * +find_acceptable_alias(struct dentry *result, + int (*acceptable)(void *context, struct dentry *dentry), + void *context) +{ + struct dentry *dentry, *toput = NULL; + struct inode *inode; + + if (acceptable(context, result)) + return result; + + inode = result->d_inode; + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + dget(dentry); + spin_unlock(&inode->i_lock); + if (toput) + dput(toput); + if (dentry != result && acceptable(context, dentry)) { + dput(result); + return dentry; + } + spin_lock(&inode->i_lock); + toput = dentry; + } + spin_unlock(&inode->i_lock); + + if (toput) + dput(toput); + return NULL; +} + +static bool dentry_connected(struct dentry *dentry) +{ + dget(dentry); + while (dentry->d_flags & DCACHE_DISCONNECTED) { + struct dentry *parent = dget_parent(dentry); + + dput(dentry); + if (IS_ROOT(dentry)) { + dput(parent); + return false; + } + dentry = parent; + } + dput(dentry); + return true; +} + +static void clear_disconnected(struct dentry *dentry) +{ + dget(dentry); + while (dentry->d_flags & DCACHE_DISCONNECTED) { + struct dentry *parent = dget_parent(dentry); + + WARN_ON_ONCE(IS_ROOT(dentry)); + + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&dentry->d_lock); + + dput(dentry); + dentry = parent; + } + dput(dentry); +} + +/* + * Reconnect a directory dentry with its parent. + * + * This can return a dentry, or NULL, or an error. + * + * In the first case the returned dentry is the parent of the given + * dentry, and may itself need to be reconnected to its parent. + * + * In the NULL case, a concurrent VFS operation has either renamed or + * removed this directory. The concurrent operation has reconnected our + * dentry, so we no longer need to. + */ +static struct dentry *reconnect_one(struct vfsmount *mnt, + struct dentry *dentry, char *nbuf) +{ + struct dentry *parent; + struct dentry *tmp; + int err; + + parent = ERR_PTR(-EACCES); + mutex_lock(&dentry->d_inode->i_mutex); + if (mnt->mnt_sb->s_export_op->get_parent) + parent = mnt->mnt_sb->s_export_op->get_parent(dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + + if (IS_ERR(parent)) { + dprintk("%s: get_parent of %ld failed, err %d\n", + __func__, dentry->d_inode->i_ino, PTR_ERR(parent)); + return parent; + } + + dprintk("%s: find name of %lu in %lu\n", __func__, + dentry->d_inode->i_ino, parent->d_inode->i_ino); + err = exportfs_get_name(mnt, parent, nbuf, dentry); + if (err == -ENOENT) + goto out_reconnected; + if (err) + goto out_err; + dprintk("%s: found name: %s\n", __func__, nbuf); + mutex_lock(&parent->d_inode->i_mutex); + tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); + mutex_unlock(&parent->d_inode->i_mutex); + if (IS_ERR(tmp)) { + dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); + goto out_err; + } + if (tmp != dentry) { + dput(tmp); + goto out_reconnected; + } + dput(tmp); + if (IS_ROOT(dentry)) { + err = -ESTALE; + goto out_err; + } + return parent; + +out_err: + dput(parent); + return ERR_PTR(err); +out_reconnected: + dput(parent); + /* + * Someone must have renamed our entry into another parent, in + * which case it has been reconnected by the rename. + * + * Or someone removed it entirely, in which case filehandle + * lookup will succeed but the directory is now IS_DEAD and + * subsequent operations on it will fail. + * + * Alternatively, maybe there was no race at all, and the + * filesystem is just corrupt and gave us a parent that doesn't + * actually contain any entry pointing to this inode. So, + * double check that this worked and return -ESTALE if not: + */ + if (!dentry_connected(dentry)) + return ERR_PTR(-ESTALE); + return NULL; +} + +/* + * Make sure target_dir is fully connected to the dentry tree. + * + * On successful return, DCACHE_DISCONNECTED will be cleared on + * target_dir, and target_dir->d_parent->...->d_parent will reach the + * root of the filesystem. + * + * Whenever DCACHE_DISCONNECTED is unset, target_dir is fully connected. + * But the converse is not true: target_dir may have DCACHE_DISCONNECTED + * set but already be connected. In that case we'll verify the + * connection to root and then clear the flag. + * + * Note that target_dir could be removed by a concurrent operation. In + * that case reconnect_path may still succeed with target_dir fully + * connected, but further operations using the filehandle will fail when + * necessary (due to S_DEAD being set on the directory). + */ +static int +reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) +{ + struct dentry *dentry, *parent; + + dentry = dget(target_dir); + + while (dentry->d_flags & DCACHE_DISCONNECTED) { + BUG_ON(dentry == mnt->mnt_sb->s_root); + + if (IS_ROOT(dentry)) + parent = reconnect_one(mnt, dentry, nbuf); + else + parent = dget_parent(dentry); + + if (!parent) + break; + dput(dentry); + if (IS_ERR(parent)) + return PTR_ERR(parent); + dentry = parent; + } + dput(dentry); + clear_disconnected(target_dir); + return 0; +} + +struct getdents_callback { + struct dir_context ctx; + char *name; /* name that was found. It already points to a + buffer NAME_MAX+1 is size */ + u64 ino; /* the inum we are looking for */ + int found; /* inode matched? */ + int sequence; /* sequence counter */ +}; + +/* + * A rather strange filldir function to capture + * the name matching the specified inode number. + */ +static int filldir_one(struct dir_context *ctx, const char *name, int len, + loff_t pos, u64 ino, unsigned int d_type) +{ + struct getdents_callback *buf = + container_of(ctx, struct getdents_callback, ctx); + int result = 0; + + buf->sequence++; + if (buf->ino == ino && len <= NAME_MAX) { + memcpy(buf->name, name, len); + buf->name[len] = '\0'; + buf->found = 1; + result = -1; + } + return result; +} + +/** + * get_name - default export_operations->get_name function + * @path: the directory in which to find a name + * @name: a pointer to a %NAME_MAX+1 char buffer to store the name + * @child: the dentry for the child directory. + * + * calls readdir on the parent until it finds an entry with + * the same inode number as the child, and returns that. + */ +static int get_name(const struct path *path, char *name, struct dentry *child) +{ + const struct cred *cred = current_cred(); + struct inode *dir = path->dentry->d_inode; + int error; + struct file *file; + struct kstat stat; + struct path child_path = { + .mnt = path->mnt, + .dentry = child, + }; + struct getdents_callback buffer = { + .ctx.actor = filldir_one, + .name = name, + }; + + error = -ENOTDIR; + if (!dir || !S_ISDIR(dir->i_mode)) + goto out; + error = -EINVAL; + if (!dir->i_fop) + goto out; + /* + * inode->i_ino is unsigned long, kstat->ino is u64, so the + * former would be insufficient on 32-bit hosts when the + * filesystem supports 64-bit inode numbers. So we need to + * actually call ->getattr, not just read i_ino: + */ + error = vfs_getattr_nosec(&child_path, &stat); + if (error) + return error; + buffer.ino = stat.ino; + /* + * Open the directory ... + */ + file = dentry_open(path, O_RDONLY, cred); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + error = -EINVAL; + if (!file->f_op->iterate) + goto out_close; + + buffer.sequence = 0; + while (1) { + int old_seq = buffer.sequence; + + error = iterate_dir(file, &buffer.ctx); + if (buffer.found) { + error = 0; + break; + } + + if (error < 0) + break; + + error = -ENOENT; + if (old_seq == buffer.sequence) + break; + } + +out_close: + fput(file); +out: + return error; +} + +/** + * export_encode_fh - default export_operations->encode_fh function + * @inode: the object to encode + * @fid: where to store the file handle fragment + * @max_len: maximum length to store there + * @parent: parent directory inode, if wanted + * + * This default encode_fh function assumes that the 32 inode number + * is suitable for locating an inode, and that the generation number + * can be used to check that it is still valid. It places them in the + * filehandle fragment where export_decode_fh expects to find them. + */ +static int export_encode_fh(struct inode *inode, struct fid *fid, + int *max_len, struct inode *parent) +{ + int len = *max_len; + int type = FILEID_INO32_GEN; + + if (parent && (len < 4)) { + *max_len = 4; + return FILEID_INVALID; + } else if (len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + len = 2; + fid->i32.ino = inode->i_ino; + fid->i32.gen = inode->i_generation; + if (parent) { + fid->i32.parent_ino = parent->i_ino; + fid->i32.parent_gen = parent->i_generation; + len = 4; + type = FILEID_INO32_GEN_PARENT; + } + *max_len = len; + return type; +} + +int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, + int *max_len, struct inode *parent) +{ + const struct export_operations *nop = inode->i_sb->s_export_op; + + if (nop && nop->encode_fh) + return nop->encode_fh(inode, fid->raw, max_len, parent); + + return export_encode_fh(inode, fid, max_len, parent); +} +EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh); + +int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, + int connectable) +{ + int error; + struct dentry *p = NULL; + struct inode *inode = dentry->d_inode, *parent = NULL; + + if (connectable && !S_ISDIR(inode->i_mode)) { + p = dget_parent(dentry); + /* + * note that while p might've ceased to be our parent already, + * it's still pinned by and still positive. + */ + parent = p->d_inode; + } + + error = exportfs_encode_inode_fh(inode, fid, max_len, parent); + dput(p); + + return error; +} +EXPORT_SYMBOL_GPL(exportfs_encode_fh); + +struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, + int fh_len, int fileid_type, + int (*acceptable)(void *, struct dentry *), void *context) +{ + const struct export_operations *nop = mnt->mnt_sb->s_export_op; + struct dentry *result, *alias; + char nbuf[NAME_MAX+1]; + int err; + + /* + * Try to get any dentry for the given file handle from the filesystem. + */ + if (!nop || !nop->fh_to_dentry) + return ERR_PTR(-ESTALE); + result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); + if (!result) + result = ERR_PTR(-ESTALE); + if (IS_ERR(result)) + return result; + + if (d_is_dir(result)) { + /* + * This request is for a directory. + * + * On the positive side there is only one dentry for each + * directory inode. On the negative side this implies that we + * to ensure our dentry is connected all the way up to the + * filesystem root. + */ + if (result->d_flags & DCACHE_DISCONNECTED) { + err = reconnect_path(mnt, result, nbuf); + if (err) + goto err_result; + } + + if (!acceptable(context, result)) { + err = -EACCES; + goto err_result; + } + + return result; + } else { + /* + * It's not a directory. Life is a little more complicated. + */ + struct dentry *target_dir, *nresult; + + /* + * See if either the dentry we just got from the filesystem + * or any alias for it is acceptable. This is always true + * if this filesystem is exported without the subtreecheck + * option. If the filesystem is exported with the subtree + * check option there's a fair chance we need to look at + * the parent directory in the file handle and make sure + * it's connected to the filesystem root. + */ + alias = find_acceptable_alias(result, acceptable, context); + if (alias) + return alias; + + /* + * Try to extract a dentry for the parent directory from the + * file handle. If this fails we'll have to give up. + */ + err = -ESTALE; + if (!nop->fh_to_parent) + goto err_result; + + target_dir = nop->fh_to_parent(mnt->mnt_sb, fid, + fh_len, fileid_type); + if (!target_dir) + goto err_result; + err = PTR_ERR(target_dir); + if (IS_ERR(target_dir)) + goto err_result; + + /* + * And as usual we need to make sure the parent directory is + * connected to the filesystem root. The VFS really doesn't + * like disconnected directories.. + */ + err = reconnect_path(mnt, target_dir, nbuf); + if (err) { + dput(target_dir); + goto err_result; + } + + /* + * Now that we've got both a well-connected parent and a + * dentry for the inode we're after, make sure that our + * inode is actually connected to the parent. + */ + err = exportfs_get_name(mnt, target_dir, nbuf, result); + if (!err) { + mutex_lock(&target_dir->d_inode->i_mutex); + nresult = lookup_one_len(nbuf, target_dir, + strlen(nbuf)); + mutex_unlock(&target_dir->d_inode->i_mutex); + if (!IS_ERR(nresult)) { + if (nresult->d_inode) { + dput(result); + result = nresult; + } else + dput(nresult); + } + } + + /* + * At this point we are done with the parent, but it's pinned + * by the child dentry anyway. + */ + dput(target_dir); + + /* + * And finally make sure the dentry is actually acceptable + * to NFSD. + */ + alias = find_acceptable_alias(result, acceptable, context); + if (!alias) { + err = -EACCES; + goto err_result; + } + + return alias; + } + + err_result: + dput(result); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(exportfs_decode_fh); + +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/ext2/Kconfig b/kernel/fs/ext2/Kconfig new file mode 100644 index 000000000..c634874e1 --- /dev/null +++ b/kernel/fs/ext2/Kconfig @@ -0,0 +1,44 @@ +config EXT2_FS + tristate "Second extended fs support" + help + Ext2 is a standard Linux file system for hard disks. + + To compile this file system support as a module, choose M here: the + module will be called ext2. + + If unsure, say Y. + +config EXT2_FS_XATTR + bool "Ext2 extended attributes" + depends on EXT2_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config EXT2_FS_POSIX_ACL + bool "Ext2 POSIX Access Control Lists" + depends on EXT2_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT2_FS_SECURITY + bool "Ext2 Security Labels" + depends on EXT2_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/kernel/fs/ext2/Makefile b/kernel/fs/ext2/Makefile new file mode 100644 index 000000000..445b0e996 --- /dev/null +++ b/kernel/fs/ext2/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux ext2-filesystem routines. +# + +obj-$(CONFIG_EXT2_FS) += ext2.o + +ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o + +ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o +ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o diff --git a/kernel/fs/ext2/acl.c b/kernel/fs/ext2/acl.c new file mode 100644 index 000000000..27695e6f4 --- /dev/null +++ b/kernel/fs/ext2/acl.c @@ -0,0 +1,257 @@ +/* + * linux/fs/ext2/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext2_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext2_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext2_acl_header *)value)->a_version != + cpu_to_le32(EXT2_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext2_acl_header); + count = ext2_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + ext2_acl_entry *entry = + (ext2_acl_entry *)value; + if ((char *)value + sizeof(ext2_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext2_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(ext2_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(ext2_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext2_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext2_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext2_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext2_acl_header) + acl->a_count * + sizeof(ext2_acl_entry), GFP_KERNEL); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext2_acl_header); + for (n=0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext2_acl_entry *entry = (ext2_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext2_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(ext2_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext2_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * inode->i_mutex: don't care + */ +struct posix_acl * +ext2_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + retval = ext2_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext2_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext2_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * inode->i_mutex: down + */ +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + switch(type) { + case ACL_TYPE_ACCESS: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return error; + else { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext2_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext2_xattr_set(inode, name_index, "", value, size, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + return error; +} + +/* + * Initialize the ACLs of a new inode. Called from ext2_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext2_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (default_acl) { + error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return error; +} diff --git a/kernel/fs/ext2/acl.h b/kernel/fs/ext2/acl.h new file mode 100644 index 000000000..44937f9fc --- /dev/null +++ b/kernel/fs/ext2/acl.h @@ -0,0 +1,71 @@ +/* + File: fs/ext2/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT2_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext2_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext2_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext2_acl_header; + +static inline size_t ext2_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext2_acl_header) + + count * sizeof(ext2_acl_entry_short); + } else { + return sizeof(ext2_acl_header) + + 4 * sizeof(ext2_acl_entry_short) + + (count - 4) * sizeof(ext2_acl_entry); + } +} + +static inline int ext2_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext2_acl_header); + s = size - 4 * sizeof(ext2_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext2_acl_entry_short)) + return -1; + return size / sizeof(ext2_acl_entry_short); + } else { + if (s % sizeof(ext2_acl_entry)) + return -1; + return s / sizeof(ext2_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT2_FS_POSIX_ACL + +/* acl.c */ +extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); +extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int ext2_init_acl (struct inode *, struct inode *); + +#else +#include +#define ext2_get_acl NULL +#define ext2_set_acl NULL + +static inline int ext2_init_acl (struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif + diff --git a/kernel/fs/ext2/balloc.c b/kernel/fs/ext2/balloc.c new file mode 100644 index 000000000..9f9992b37 --- /dev/null +++ b/kernel/fs/ext2/balloc.c @@ -0,0 +1,1536 @@ +/* + * linux/fs/ext2/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include "ext2.h" +#include +#include +#include +#include +#include + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext2_fill_super). + */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext2_group_desc * desc; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext2_error (sb, "ext2_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + + group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext2_error (sb, "ext2_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext2_group_desc *) sbi->s_group_desc[group_desc]->b_data; + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc + offset; +} + +static int ext2_valid_block_bitmap(struct super_block *sb, + struct ext2_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext2_grpblk_t offset; + ext2_grpblk_t next_zero_bit; + ext2_fsblk_t bitmap_blk; + ext2_fsblk_t group_first_block; + + group_first_block = ext2_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext2_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext2_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_table); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext2_find_next_zero_bit(bh->b_data, + offset + EXT2_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT2_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext2_error(sb, __func__, + "Invalid block bitmap - " + "block_group = %d, block = %lu", + block_group, bitmap_blk); + return 0; +} + +/* + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. + */ +static struct buffer_head * +read_block_bitmap(struct super_block *sb, unsigned int block_group) +{ + struct ext2_group_desc * desc; + struct buffer_head * bh = NULL; + ext2_fsblk_t bitmap_blk; + + desc = ext2_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext2_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + if (likely(bh_uptodate_or_lock(bh))) + return bh; + + if (bh_submit_read(bh) < 0) { + brelse(bh); + ext2_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + + ext2_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ + return bh; +} + +static void group_adjust_blocks(struct super_block *sb, int group_no, + struct ext2_group_desc *desc, struct buffer_head *bh, int count) +{ + if (count) { + struct ext2_sb_info *sbi = EXT2_SB(sb); + unsigned free_blocks; + + spin_lock(sb_bgl_lock(sbi, group_no)); + free_blocks = le16_to_cpu(desc->bg_free_blocks_count); + desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count); + spin_unlock(sb_bgl_lock(sbi, group_no)); + mark_buffer_dirty(bh); + } +} + +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use a red-black tree to represent per-filesystem reservation + * windows. + * + */ + +/** + * __rsv_window_dump() -- Dump the filesystem block allocation reservation map + * @rb_root: root of per-filesystem reservation rb tree + * @verbose: verbose mode + * @fn: function which wishes to dump the reservation map + * + * If verbose is turned on, it will print the whole block reservation + * windows(start, end). Otherwise, it will only print out the "bad" windows, + * those windows that overlap with their immediate neighbors. + */ +#if 1 +static void __rsv_window_dump(struct rb_root *root, int verbose, + const char *fn) +{ + struct rb_node *n; + struct ext2_reserve_window_node *rsv, *prev; + int bad; + +restart: + n = rb_first(root); + bad = 0; + prev = NULL; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %lu, end: %lu\n", + rsv, rsv->rsv_start, rsv->rsv_end); + if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { + printk("Bad reservation %p (start >= end)\n", + rsv); + bad = 1; + } + if (prev && prev->rsv_end >= rsv->rsv_start) { + printk("Bad reservation %p (prev->end >= start)\n", + rsv); + bad = 1; + } + if (bad) { + if (!verbose) { + printk("Restarting reservation walk in verbose mode\n"); + verbose = 1; + goto restart; + } + } + n = rb_next(n); + prev = rsv; + } + printk("Window map complete.\n"); + BUG_ON(bad); +} +#define rsv_window_dump(root, verbose) \ + __rsv_window_dump((root), (verbose), __func__) +#else +#define rsv_window_dump(root, verbose) do {} while (0) +#endif + +/** + * goal_in_my_reservation() + * @rsv: inode's reservation window + * @grp_goal: given goal block relative to the allocation block group + * @group: the current allocation block group + * @sb: filesystem super block + * + * Test if the given goal block (group relative) is within the file's + * own block reservation window range. + * + * If the reservation window is outside the goal allocation group, return 0; + * grp_goal (given goal block) could be -1, which means no specific + * goal block. In this case, always return 1. + * If the goal block is within the reservation window, return 1; + * otherwise, return 0; + */ +static int +goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal, + unsigned int group, struct super_block * sb) +{ + ext2_fsblk_t group_first_block, group_last_block; + + group_first_block = ext2_group_first_block_no(sb, group); + group_last_block = group_first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1; + + if ((rsv->_rsv_start > group_last_block) || + (rsv->_rsv_end < group_first_block)) + return 0; + if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) + || (grp_goal + group_first_block > rsv->_rsv_end))) + return 0; + return 1; +} + +/** + * search_reserve_window() + * @rb_root: root of reservation tree + * @goal: target allocation block + * + * Find the reserved window which includes the goal, or the previous one + * if the goal is not in any window. + * Returns NULL if there are no windows or if all windows start after the goal. + */ +static struct ext2_reserve_window_node * +search_reserve_window(struct rb_root *root, ext2_fsblk_t goal) +{ + struct rb_node *n = root->rb_node; + struct ext2_reserve_window_node *rsv; + + if (!n) + return NULL; + + do { + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; + else if (goal > rsv->rsv_end) + n = n->rb_right; + else + return rsv; + } while (n); + /* + * We've fallen off the end of the tree: the goal wasn't inside + * any particular node. OK, the previous node must be to one + * side of the interval containing the goal. If it's the RHS, + * we need to back up one. + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + } + return rsv; +} + +/* + * ext2_rsv_window_add() -- Insert a window to the block reservation rb tree. + * @sb: super block + * @rsv: reservation window to add + * + * Must be called with rsv_lock held. + */ +void ext2_rsv_window_add(struct super_block *sb, + struct ext2_reserve_window_node *rsv) +{ + struct rb_root *root = &EXT2_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; + ext2_fsblk_t start = rsv->rsv_start; + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct ext2_reserve_window_node *this; + + while (*p) + { + parent = *p; + this = rb_entry(parent, struct ext2_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; + else if (start > this->rsv_end) + p = &(*p)->rb_right; + else { + rsv_window_dump(root, 1); + BUG(); + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); +} + +/** + * rsv_window_remove() -- unlink a window from the reservation rb tree + * @sb: super block + * @rsv: reservation window to remove + * + * Mark the block reservation window as not allocated, and unlink it + * from the filesystem reservation window rb tree. Must be called with + * rsv_lock held. + */ +static void rsv_window_remove(struct super_block *sb, + struct ext2_reserve_window_node *rsv) +{ + rsv->rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_alloc_hit = 0; + rb_erase(&rsv->rsv_node, &EXT2_SB(sb)->s_rsv_window_root); +} + +/* + * rsv_is_empty() -- Check if the reservation window is allocated. + * @rsv: given reservation window to check + * + * returns 1 if the end block is EXT2_RESERVE_WINDOW_NOT_ALLOCATED. + */ +static inline int rsv_is_empty(struct ext2_reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return (rsv->_rsv_end == EXT2_RESERVE_WINDOW_NOT_ALLOCATED); +} + +/** + * ext2_init_block_alloc_info() + * @inode: file inode structure + * + * Allocate and initialize the reservation window structure, and + * link the window to the ext2 inode structure at last + * + * The reservation window structure is only dynamically allocated + * and linked to ext2 inode the first time the open file + * needs a new block. So, before every ext2_new_block(s) call, for + * regular files, we should check whether the reservation window + * structure exists or not. In the latter case, this function is called. + * Fail to do so will result in block reservation being turned off for that + * open file. + * + * This function is called from ext2_get_blocks_handle(), also called + * when setting the reservation window size through ioctl before the file + * is open for write (needs block allocation). + * + * Needs truncate_mutex protection prior to calling this function. + */ +void ext2_init_block_alloc_info(struct inode *inode) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct ext2_block_alloc_info *block_i; + struct super_block *sb = inode->i_sb; + + block_i = kmalloc(sizeof(*block_i), GFP_NOFS); + if (block_i) { + struct ext2_reserve_window_node *rsv = &block_i->rsv_window_node; + + rsv->rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + + /* + * if filesystem is mounted with NORESERVATION, the goal + * reservation window size is set to zero to indicate + * block reservation is off + */ + if (!test_opt(sb, RESERVATION)) + rsv->rsv_goal_size = 0; + else + rsv->rsv_goal_size = EXT2_DEFAULT_RESERVE_BLOCKS; + rsv->rsv_alloc_hit = 0; + block_i->last_alloc_logical_block = 0; + block_i->last_alloc_physical_block = 0; + } + ei->i_block_alloc_info = block_i; +} + +/** + * ext2_discard_reservation() + * @inode: inode + * + * Discard(free) block reservation window on last file close, or truncate + * or at last iput(). + * + * It is being called in three cases: + * ext2_release_file(): last writer closes the file + * ext2_clear_inode(): last iput(), when nobody links to this file. + * ext2_truncate(): when the block indirect map is about to change. + */ +void ext2_discard_reservation(struct inode *inode) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_reserve_window_node *rsv; + spinlock_t *rsv_lock = &EXT2_SB(inode->i_sb)->s_rsv_window_lock; + + if (!block_i) + return; + + rsv = &block_i->rsv_window_node; + if (!rsv_is_empty(&rsv->rsv_window)) { + spin_lock(rsv_lock); + if (!rsv_is_empty(&rsv->rsv_window)) + rsv_window_remove(inode->i_sb, rsv); + spin_unlock(rsv_lock); + } +} + +/** + * ext2_free_blocks() -- Free given blocks and update quota and i_blocks + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to free + */ +void ext2_free_blocks (struct inode * inode, unsigned long block, + unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head * bh2; + unsigned long block_group; + unsigned long bit; + unsigned long i; + unsigned long overflow; + struct super_block * sb = inode->i_sb; + struct ext2_sb_info * sbi = EXT2_SB(sb); + struct ext2_group_desc * desc; + struct ext2_super_block * es = sbi->s_es; + unsigned freed = 0, group_freed; + + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext2_error (sb, "ext2_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext2_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT2_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT2_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT2_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT2_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + desc = ext2_get_group_desc (sb, block_group, &bh2); + if (!desc) + goto error_return; + + if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group)) { + ext2_error (sb, "ext2_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + goto error_return; + } + + for (i = 0, group_freed = 0; i < count; i++) { + if (!ext2_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + ext2_error(sb, __func__, + "bit already cleared for block %lu", block + i); + } else { + group_freed++; + } + } + + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + group_adjust_blocks(sb, block_group, desc, bh2, group_freed); + freed += group_freed; + + if (overflow) { + block += count; + count = overflow; + goto do_more; + } +error_return: + brelse(bitmap_bh); + if (freed) { + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + dquot_free_block_nodirty(inode, freed); + mark_inode_dirty(inode); + } +} + +/** + * bitmap_search_next_usable_block() + * @start: the starting block (group relative) of the search + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) of the reservation + * + * The bitmap search --- search forward through the actual bitmap on disk until + * we find a bit free. + */ +static ext2_grpblk_t +bitmap_search_next_usable_block(ext2_grpblk_t start, struct buffer_head *bh, + ext2_grpblk_t maxblocks) +{ + ext2_grpblk_t next; + + next = ext2_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + return next; +} + +/** + * find_next_usable_block() + * @start: the starting block (group relative) to find next + * allocatable block in bitmap. + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) for the search + * + * Find an allocatable block in a bitmap. We perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; + * then for any free bit in the bitmap. + */ +static ext2_grpblk_t +find_next_usable_block(int start, struct buffer_head *bh, int maxblocks) +{ + ext2_grpblk_t here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT2_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. + */ + ext2_grpblk_t end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; + here = ext2_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal) + return here; + ext2_debug("Bit not found near goal\n"); + } + + here = start; + if (here < 0) + here = 0; + + p = ((char *)bh->b_data) + (here >> 3); + r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); + next = (r - ((char *)bh->b_data)) << 3; + + if (next < maxblocks && next >= here) + return next; + + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; +} + +/** + * ext2_try_to_allocate() + * @sb: superblock + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * Attempt to allocate blocks within a give range. Set the range of allocation + * first, then find the first free bit(s) from the bitmap (within the range), + * and at last, allocate the blocks by claiming the found free bit as allocated. + * + * To set the range of this allocation: + * if there is a reservation window, only try to allocate block(s) + * from the file's own reservation window; + * Otherwise, the allocation range starts from the give goal block, + * ends at the block group's last block. + * + * If we failed to allocate the desired block then we may end up crossing to a + * new bitmap. + */ +static int +ext2_try_to_allocate(struct super_block *sb, int group, + struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal, + unsigned long *count, + struct ext2_reserve_window *my_rsv) +{ + ext2_fsblk_t group_first_block; + ext2_grpblk_t start, end; + unsigned long num = 0; + + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = ext2_group_first_block_no(sb, group); + if (my_rsv->_rsv_start >= group_first_block) + start = my_rsv->_rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->_rsv_end - group_first_block + 1; + if (end > EXT2_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT2_BLOCKS_PER_GROUP(sb); + if ((start <= grp_goal) && (grp_goal < end)) + start = grp_goal; + else + grp_goal = -1; + } else { + if (grp_goal > 0) + start = grp_goal; + else + start = 0; + end = EXT2_BLOCKS_PER_GROUP(sb); + } + + BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb)); + +repeat: + if (grp_goal < 0) { + grp_goal = find_next_usable_block(start, bitmap_bh, end); + if (grp_goal < 0) + goto fail_access; + if (!my_rsv) { + int i; + + for (i = 0; i < 7 && grp_goal > start && + !ext2_test_bit(grp_goal - 1, + bitmap_bh->b_data); + i++, grp_goal--) + ; + } + } + start = grp_goal; + + if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal, + bitmap_bh->b_data)) { + /* + * The block was allocated by another thread, or it was + * allocated and then freed by another thread + */ + start++; + grp_goal++; + if (start >= end) + goto fail_access; + goto repeat; + } + num++; + grp_goal++; + while (num < *count && grp_goal < end + && !ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), + grp_goal, bitmap_bh->b_data)) { + num++; + grp_goal++; + } + *count = num; + return grp_goal - num; +fail_access: + *count = num; + return -1; +} + +/** + * find_next_reservable_window(): + * find a reservable space within the given range. + * It does not allocate the reservation window for now: + * alloc_new_reservation() will do the work later. + * + * @search_head: the head of the searching list; + * This is not necessarily the list head of the whole filesystem + * + * We have both head and start_block to assist the search + * for the reservable space. The list starts from head, + * but we will shift to the place where start_block is, + * then start from there, when looking for a reservable space. + * + * @size: the target new reservation window size + * + * @group_first_block: the first block we consider to start + * the real search from + * + * @last_block: + * the maximum block number that our goal reservable space + * could start from. This is normally the last block in this + * group. The search will end when we found the start of next + * possible reservable space is out of this boundary. + * This could handle the cross boundary reservation window + * request. + * + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. + * + */ +static int find_next_reservable_window( + struct ext2_reserve_window_node *search_head, + struct ext2_reserve_window_node *my_rsv, + struct super_block * sb, + ext2_fsblk_t start_block, + ext2_fsblk_t last_block) +{ + struct rb_node *next; + struct ext2_reserve_window_node *rsv, *prev; + ext2_fsblk_t cur; + int size = my_rsv->rsv_goal_size; + + /* TODO: make the start of the reservation window byte-aligned */ + /* cur = *start_block & ~7;*/ + cur = start_block; + rsv = search_head; + if (!rsv) + return -1; + + while (1) { + if (cur <= rsv->rsv_end) + cur = rsv->rsv_end + 1; + + /* TODO? + * in the case we could not find a reservable space + * that is what is expected, during the re-search, we could + * remember what's the largest reservable space we could have + * and return that one. + * + * For now it will fail if we could not find the reservable + * space with expected-size (or more)... + */ + if (cur > last_block) + return -1; /* fail */ + + prev = rsv; + next = rb_next(&rsv->rsv_node); + rsv = rb_entry(next,struct ext2_reserve_window_node,rsv_node); + + /* + * Reached the last reservation, we can just append to the + * previous one. + */ + if (!next) + break; + + if (cur + size <= rsv->rsv_start) { + /* + * Found a reserveable space big enough. We could + * have a reservation across the group boundary here + */ + break; + } + } + /* + * we come here either : + * when we reach the end of the whole list, + * and there is empty reservable space after last entry in the list. + * append it to the end of the list. + * + * or we found one reservable space in the middle of the list, + * return the reservation window that we could append to. + * succeed. + */ + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* + * Let's book the whole available window for now. We will check the + * disk bitmap later and then, if there are free blocks then we adjust + * the window size if it's larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window. + */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext2_rsv_window_add(sb, my_rsv); + + return 0; +} + +/** + * alloc_new_reservation()--allocate a new reservation window + * + * To make a new reservation, we search part of the filesystem + * reservation list (the list that inside the group). We try to + * allocate a new reservation window near the allocation goal, + * or the beginning of the group, if there is no goal. + * + * We first find a reservable space after the goal, then from + * there, we check the bitmap for the first free block after + * it. If there is no free block until the end of group, then the + * whole group is full, we failed. Otherwise, check if the free + * block is inside the expected reservable space, if so, we + * succeed. + * If the first free block is outside the reservable space, then + * start from the first free block, we search for next available + * space, and go on. + * + * on succeed, a new reservation will be found and inserted into the list + * It contains at least one free block, and it does not overlap with other + * reservation windows. + * + * failed: we failed to find a reservation window in this group + * + * @rsv: the reservation + * + * @grp_goal: The goal (group-relative). It is where the search for a + * free reservable space should start from. + * if we have a goal(goal >0 ), then start from there, + * no goal(goal = -1), we start from the first block + * of the group. + * + * @sb: the super block + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + * + */ +static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv, + ext2_grpblk_t grp_goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) +{ + struct ext2_reserve_window_node *search_head; + ext2_fsblk_t group_first_block, group_end_block, start_block; + ext2_grpblk_t first_free_block; + struct rb_root *fs_rsv_root = &EXT2_SB(sb)->s_rsv_window_root; + unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock; + + group_first_block = ext2_group_first_block_no(sb, group); + group_end_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + if (grp_goal < 0) + start_block = group_first_block; + else + start_block = grp_goal + group_first_block; + + size = my_rsv->rsv_goal_size; + + if (!rsv_is_empty(&my_rsv->rsv_window)) { + /* + * if the old reservation is cross group boundary + * and if the goal is inside the old reservation window, + * we will come here when we just failed to allocate from + * the first part of the window. We still have another part + * that belongs to the next group. In this case, there is no + * point to discard our window and try to allocate a new one + * in this group(which will fail). we should + * keep the reservation window, just simply move on. + * + * Maybe we could shift the start block of the reservation + * window to the first block of next group. + */ + + if ((my_rsv->rsv_start <= group_end_block) && + (my_rsv->rsv_end > group_end_block) && + (start_block >= my_rsv->rsv_start)) + return -1; + + if ((my_rsv->rsv_alloc_hit > + (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { + /* + * if the previously allocation hit ratio is + * greater than 1/2, then we double the size of + * the reservation window the next time, + * otherwise we keep the same size window + */ + size = size * 2; + if (size > EXT2_MAX_RESERVE_BLOCKS) + size = EXT2_MAX_RESERVE_BLOCKS; + my_rsv->rsv_goal_size= size; + } + } + + spin_lock(rsv_lock); + /* + * shift the search start to the window near the goal block + */ + search_head = search_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window + * inside the given range(start_block, group_end_block). + * + * To make sure the reservation window has a free bit inside it, we + * need to check the bitmap after we found a reservable window. + */ +retry: + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + + /* + * On success, find_next_reservable_window() returns the + * reservation window where there is a reservable space after it. + * Before we reserve this reservable space, we need + * to make sure there is at least a free block inside this region. + * + * Search the first free bit on the block bitmap. Search starts from + * the start block of the reservable space we just found. + */ + spin_unlock(rsv_lock); + first_free_block = bitmap_search_next_usable_block( + my_rsv->rsv_start - group_first_block, + bitmap_bh, group_end_block - group_first_block + 1); + + if (first_free_block < 0) { + /* + * no free block left on the bitmap, no point + * to reserve the space. return failed. + */ + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ + } + + start_block = first_free_block + group_first_block; + /* + * check if the first free block is within the + * free space we just reserved + */ + if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + return 0; /* success */ + /* + * if the first free bit we found is out of the reservable space + * continue search for next reservable space, + * start from where the free block is, + * we also shift the list head to where we stopped last time + */ + search_head = my_rsv; + spin_lock(rsv_lock); + goto retry; +} + +/** + * try_to_extend_reservation() + * @my_rsv: given reservation window + * @sb: super block + * @size: the delta to extend + * + * Attempt to expand the reservation window large enough to have + * required number of free blocks + * + * Since ext2_try_to_allocate() will always allocate blocks within + * the reservation window range, if the window size is too small, + * multiple blocks allocation has to stop at the end of the reservation + * window. To make this more efficient, given the total number of + * blocks needed and the current size of the window, we try to + * expand the reservation window size if necessary on a best-effort + * basis before ext2_new_blocks() tries to allocate blocks. + */ +static void try_to_extend_reservation(struct ext2_reserve_window_node *my_rsv, + struct super_block *sb, int size) +{ + struct ext2_reserve_window_node *next_rsv; + struct rb_node *next; + spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock; + + if (!spin_trylock(rsv_lock)) + return; + + next = rb_next(&my_rsv->rsv_node); + + if (!next) + my_rsv->rsv_end += size; + else { + next_rsv = rb_entry(next, struct ext2_reserve_window_node, rsv_node); + + if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) + my_rsv->rsv_end += size; + else + my_rsv->rsv_end = next_rsv->rsv_start - 1; + } + spin_unlock(rsv_lock); +} + +/** + * ext2_try_to_allocate_with_rsv() + * @sb: superblock + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * This is the main function used to allocate a new block and its reservation + * window. + * + * Each time when a new block allocation is need, first try to allocate from + * its own reservation. If it does not have a reservation window, instead of + * looking for a free bit on bitmap first, then look up the reservation list to + * see if it is inside somebody else's reservation window, we try to allocate a + * reservation window for it starting from the goal first. Then do the block + * allocation within the reservation window. + * + * This will avoid keeping on searching the reservation list again and + * again when somebody is looking for a free block (without + * reservation), and there are lots of free blocks, but they are all + * being reserved. + * + * We use a red-black tree for the per-filesystem reservation list. + */ +static ext2_grpblk_t +ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group, + struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal, + struct ext2_reserve_window_node * my_rsv, + unsigned long *count) +{ + ext2_fsblk_t group_first_block, group_last_block; + ext2_grpblk_t ret = 0; + unsigned long num = *count; + + /* + * we don't deal with reservation when + * filesystem is mounted without reservation + * or the file is not a regular file + * or last attempt to allocate a block with reservation turned on failed + */ + if (my_rsv == NULL) { + return ext2_try_to_allocate(sb, group, bitmap_bh, + grp_goal, count, NULL); + } + /* + * grp_goal is a group relative block number (if there is a goal) + * 0 <= grp_goal < EXT2_BLOCKS_PER_GROUP(sb) + * first block is a filesystem wide block number + * first block is the block number of the first block in this group + */ + group_first_block = ext2_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + /* + * Basically we will allocate a new block from inode's reservation + * window. + * + * We need to allocate a new reservation window, if: + * a) inode does not have a reservation window; or + * b) last attempt to allocate a block from existing reservation + * failed; or + * c) we come here with a goal and with a reservation window + * + * We do not need to allocate a new reservation window if we come here + * at the beginning with a goal and the goal is inside the window, or + * we don't have a goal but already have a reservation window. + * then we could go to allocate from the reservation window directly. + */ + while (1) { + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) { + if (my_rsv->rsv_goal_size < *count) + my_rsv->rsv_goal_size = *count; + ret = alloc_new_reservation(my_rsv, grp_goal, sb, + group, bitmap_bh); + if (ret < 0) + break; /* failed */ + + if (!goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) + grp_goal = -1; + } else if (grp_goal >= 0) { + int curr = my_rsv->rsv_end - + (grp_goal + group_first_block) + 1; + + if (curr < *count) + try_to_extend_reservation(my_rsv, sb, + *count - curr); + } + + if ((my_rsv->rsv_start > group_last_block) || + (my_rsv->rsv_end < group_first_block)) { + rsv_window_dump(&EXT2_SB(sb)->s_rsv_window_root, 1); + BUG(); + } + ret = ext2_try_to_allocate(sb, group, bitmap_bh, grp_goal, + &num, &my_rsv->rsv_window); + if (ret >= 0) { + my_rsv->rsv_alloc_hit += num; + *count = num; + break; /* succeed */ + } + num = *count; + } + return ret; +} + +/** + * ext2_has_free_blocks() + * @sbi: in-core super block structure. + * + * Check if filesystem has at least 1 free block available for allocation. + */ +static int ext2_has_free_blocks(struct ext2_sb_info *sbi) +{ + ext2_fsblk_t free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + !uid_eq(sbi->s_resuid, current_fsuid()) && + (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || + !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/* + * ext2_new_blocks() -- core block(s) allocation function + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: target number of blocks to allocate + * @errp: error code + * + * ext2_new_blocks uses a goal block to assist allocation. If the goal is + * free, or there is a free block within 32 blocks of the goal, that block + * is allocated. Otherwise a forward search is made for a free block; within + * each block group the search first looks for an entire free byte in the block + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, + unsigned long *count, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gdp_bh; + int group_no; + int goal_group; + ext2_grpblk_t grp_target_blk; /* blockgroup relative goal block */ + ext2_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ + ext2_fsblk_t ret_block; /* filesyetem-wide allocated block */ + int bgi; /* blockgroup iteration index */ + int performed_allocation = 0; + ext2_grpblk_t free_blocks; /* number of free blocks in a group */ + struct super_block *sb; + struct ext2_group_desc *gdp; + struct ext2_super_block *es; + struct ext2_sb_info *sbi; + struct ext2_reserve_window_node *my_rsv = NULL; + struct ext2_block_alloc_info *block_i; + unsigned short windowsz = 0; + unsigned long ngroups; + unsigned long num = *count; + int ret; + + *errp = -ENOSPC; + sb = inode->i_sb; + + /* + * Check quota for allocation of this block. + */ + ret = dquot_alloc_block(inode, num); + if (ret) { + *errp = ret; + return 0; + } + + sbi = EXT2_SB(sb); + es = EXT2_SB(sb)->s_es; + ext2_debug("goal=%lu.\n", goal); + /* + * Allocate a block from reservation only when + * filesystem is mounted with reservation(default,-o reservation), and + * it's a regular file, and + * the desired window size is greater than 0 (One could use ioctl + * command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn off + * reservation on that particular file) + */ + block_i = EXT2_I(inode)->i_block_alloc_info; + if (block_i) { + windowsz = block_i->rsv_window_node.rsv_goal_size; + if (windowsz > 0) + my_rsv = &block_i->rsv_window_node; + } + + if (!ext2_has_free_blocks(sbi)) { + *errp = -ENOSPC; + goto out; + } + + /* + * First, test whether the goal block is free. + */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT2_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: + gdp = ext2_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * if there is not enough free blocks to make a new resevation + * turn off reservation for this allocation + */ + if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) + && (rsv_is_empty(&my_rsv->rsv_window))) + my_rsv = NULL; + + if (free_blocks > 0) { + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT2_BLOCKS_PER_GROUP(sb)); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no, + bitmap_bh, grp_target_blk, + my_rsv, &num); + if (grp_alloc_blk >= 0) + goto allocated; + } + + ngroups = EXT2_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Now search the rest of the groups. We assume that + * group_no and gdp correctly point to the last group visited. + */ + for (bgi = 0; bgi < ngroups; bgi++) { + group_no++; + if (group_no >= ngroups) + group_no = 0; + gdp = ext2_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* + * skip this group if the number of + * free blocks is less than half of the reservation + * window size. + */ + if (my_rsv && (free_blocks <= (windowsz/2))) + continue; + + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + /* + * try to allocate block(s) from this group, without a goal(-1). + */ + grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no, + bitmap_bh, -1, my_rsv, &num); + if (grp_alloc_blk >= 0) + goto allocated; + } + /* + * We may end up a bogus earlier ENOSPC error due to + * filesystem is "full" of reservations, but + * there maybe indeed free blocks available on disk + * In this case, we just forget about the reservations + * just do block allocation as without reservations. + */ + if (my_rsv) { + my_rsv = NULL; + windowsz = 0; + group_no = goal_group; + goto retry_alloc; + } + /* No space left on the device */ + *errp = -ENOSPC; + goto out; + +allocated: + + ext2_debug("using block group %d(%d)\n", + group_no, gdp->bg_free_blocks_count); + + ret_block = grp_alloc_blk + ext2_group_first_block_no(sb, group_no); + + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || + in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), + EXT2_SB(sb)->s_itb_per_group) || + in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + EXT2_SB(sb)->s_itb_per_group)) { + ext2_error(sb, "ext2_new_blocks", + "Allocating block in system zone - " + "blocks from "E2FSBLK", length %lu", + ret_block, num); + /* + * ext2_try_to_allocate marked the blocks we allocated as in + * use. So we may want to selectively mark some of the blocks + * as free + */ + goto retry_alloc; + } + + performed_allocation = 1; + + if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + ext2_error(sb, "ext2_new_blocks", + "block("E2FSBLK") >= blocks count(%d) - " + "block_group = %d, es == %p ", ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); + goto out; + } + + group_adjust_blocks(sb, group_no, gdp, gdp_bh, -num); + percpu_counter_sub(&sbi->s_freeblocks_counter, num); + + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + *errp = 0; + brelse(bitmap_bh); + if (num < *count) { + dquot_free_block_nodirty(inode, *count-num); + mark_inode_dirty(inode); + *count = num; + } + return ret_block; + +io_error: + *errp = -EIO; +out: + /* + * Undo the block allocation + */ + if (!performed_allocation) { + dquot_free_block_nodirty(inode, *count); + mark_inode_dirty(inode); + } + brelse(bitmap_bh); + return 0; +} + +ext2_fsblk_t ext2_new_block(struct inode *inode, unsigned long goal, int *errp) +{ + unsigned long count = 1; + + return ext2_new_blocks(inode, goal, &count, errp); +} + +#ifdef EXT2FS_DEBUG + +unsigned long ext2_count_free(struct buffer_head *map, unsigned int numchars) +{ + return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars); +} + +#endif /* EXT2FS_DEBUG */ + +unsigned long ext2_count_free_blocks (struct super_block * sb) +{ + struct ext2_group_desc * desc; + unsigned long desc_count = 0; + int i; +#ifdef EXT2FS_DEBUG + unsigned long bitmap_count, x; + struct ext2_super_block *es; + + es = EXT2_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + desc = NULL; + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + struct buffer_head *bitmap_bh; + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); + bitmap_bh = read_block_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext2_count_free(bitmap_bh, sb->s_blocksize); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(desc->bg_free_blocks_count), x); + bitmap_count += x; + brelse(bitmap_bh); + } + printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n", + (long)le32_to_cpu(es->s_free_blocks_count), + desc_count, bitmap_count); + return bitmap_count; +#else + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); + } + return desc_count; +#endif +} + +static inline int test_root(int a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext2_group_sparse(int group) +{ + if (group <= 1) + return 1; + return (test_root(group, 3) || test_root(group, 5) || + test_root(group, 7)); +} + +/** + * ext2_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext2_bg_has_super(struct super_block *sb, int group) +{ + if (EXT2_HAS_RO_COMPAT_FEATURE(sb,EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext2_group_sparse(group)) + return 0; + return 1; +} + +/** + * ext2_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext2_bg_num_gdb(struct super_block *sb, int group) +{ + return ext2_bg_has_super(sb, group) ? EXT2_SB(sb)->s_gdb_count : 0; +} + diff --git a/kernel/fs/ext2/dir.c b/kernel/fs/ext2/dir.c new file mode 100644 index 000000000..796b491e6 --- /dev/null +++ b/kernel/fs/ext2/dir.c @@ -0,0 +1,730 @@ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include "ext2.h" +#include +#include +#include + +typedef struct ext2_dir_entry_2 ext2_dirent; + +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ +static inline unsigned ext2_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT2_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 ext2_rec_len_to_disk(unsigned len) +{ +#if (PAGE_CACHE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(EXT2_MAX_REC_LEN); + else + BUG_ON(len > (1 << 16)); +#endif + return cpu_to_le16(len); +} + +/* + * ext2 uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned ext2_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void ext2_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned +ext2_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + + if (IS_DIRSYNC(dir)) { + err = write_one_page(page, 1); + if (!err) + err = sync_inode_metadata(dir, 1); + } else { + unlock_page(page); + } + + return err; +} + +static void ext2_check_page(struct page *page, int quiet) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = ext2_chunk_size(dir); + char *kaddr = page_address(page); + u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + ext2_dirent *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) { + p = (ext2_dirent *)(kaddr + offs); + rec_len = ext2_rec_len_from_disk(p->rec_len); + + if (unlikely(rec_len < EXT2_DIR_REC_LEN(1))) + goto Eshort; + if (unlikely(rec_len & 3)) + goto Ealign; + if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len))) + goto Enamelen; + if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))) + goto Espan; + if (unlikely(le32_to_cpu(p->inode) > max_inumber)) + goto Einumber; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + if (!quiet) + ext2_error(sb, __func__, + "size of directory #%lu is not a multiple " + "of chunk size", dir->i_ino); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "inode out of bounds"; +bad_entry: + if (!quiet) + ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<inode), + rec_len, p->name_len); + goto fail; +Eend: + if (!quiet) { + p = (ext2_dirent *)(kaddr + offs); + ext2_error(sb, "ext2_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<inode)); + } +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page * ext2_get_page(struct inode *dir, unsigned long n, + int quiet) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + ext2_check_page(page, quiet); + if (PageError(page)) + goto fail; + } + return page; + +fail: + ext2_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, ext2_match returns 1 for success, 0 for failure. + * + * len <= EXT2_NAME_LEN and de != NULL are guaranteed by caller. + */ +static inline int ext2_match (int len, const char * const name, + struct ext2_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static inline ext2_dirent *ext2_next_entry(ext2_dirent *p) +{ + return (ext2_dirent *)((char *)p + + ext2_rec_len_from_disk(p->rec_len)); +} + +static inline unsigned +ext2_validate_entry(char *base, unsigned offset, unsigned mask) +{ + ext2_dirent *de = (ext2_dirent*)(base + offset); + ext2_dirent *p = (ext2_dirent*)(base + (offset&mask)); + while ((char*)p < (char*)de) { + if (p->rec_len == 0) + break; + p = ext2_next_entry(p); + } + return (char *)p - base; +} + +static unsigned char ext2_filetype_table[EXT2_FT_MAX] = { + [EXT2_FT_UNKNOWN] = DT_UNKNOWN, + [EXT2_FT_REG_FILE] = DT_REG, + [EXT2_FT_DIR] = DT_DIR, + [EXT2_FT_CHRDEV] = DT_CHR, + [EXT2_FT_BLKDEV] = DT_BLK, + [EXT2_FT_FIFO] = DT_FIFO, + [EXT2_FT_SOCK] = DT_SOCK, + [EXT2_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK, +}; + +static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) +{ + umode_t mode = inode->i_mode; + if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + else + de->file_type = 0; +} + +static int +ext2_readdir(struct file *file, struct dir_context *ctx) +{ + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); + unsigned char *types = NULL; + int need_revalidate = file->f_version != inode->i_version; + + if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) + return 0; + + if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) + types = ext2_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + ext2_dirent *de; + struct page *page = ext2_get_page(inode, n, 0); + + if (IS_ERR(page)) { + ext2_error(sb, __func__, + "bad page in #%lu", + inode->i_ino); + ctx->pos += PAGE_CACHE_SIZE - offset; + return PTR_ERR(page); + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ext2_validate_entry(kaddr, offset, chunk_mask); + ctx->pos = (n<f_version = inode->i_version; + need_revalidate = 0; + } + de = (ext2_dirent *)(kaddr+offset); + limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1); + for ( ;(char*)de <= limit; de = ext2_next_entry(de)) { + if (de->rec_len == 0) { + ext2_error(sb, __func__, + "zero-length directory entry"); + ext2_put_page(page); + return -EIO; + } + if (de->inode) { + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < EXT2_FT_MAX) + d_type = types[de->file_type]; + + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + d_type)) { + ext2_put_page(page); + return 0; + } + } + ctx->pos += ext2_rec_len_from_disk(de->rec_len); + } + ext2_put_page(page); + } + return 0; +} + +/* + * ext2_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, + struct qstr *child, struct page ** res_page) +{ + const char *name = child->name; + int namelen = child->len; + unsigned reclen = EXT2_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct ext2_inode_info *ei = EXT2_I(dir); + ext2_dirent * de; + int dir_has_error = 0; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = ext2_get_page(dir, n, dir_has_error); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (ext2_dirent *) kaddr; + kaddr += ext2_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + ext2_error(dir->i_sb, __func__, + "zero-length directory entry"); + ext2_put_page(page); + goto out; + } + if (ext2_match (namelen, name, de)) + goto found; + de = ext2_next_entry(de); + } + ext2_put_page(page); + } else + dir_has_error = 1; + + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + ext2_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block count %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = ext2_get_page(dir, 0, 0); + ext2_dirent *de = NULL; + + if (!IS_ERR(page)) { + de = ext2_next_entry((ext2_dirent *) page_address(page)); + *p = page; + } + return de; +} + +ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child) +{ + ino_t res = 0; + struct ext2_dir_entry_2 *de; + struct page *page; + + de = ext2_find_entry (dir, child, &page); + if (de) { + res = le32_to_cpu(de->inode); + ext2_put_page(page); + } + return res; +} + +static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, ext2_get_block); +} + +/* Releases the page */ +void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + struct page *page, struct inode *inode, int update_times) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = ext2_rec_len_from_disk(de->rec_len); + int err; + + lock_page(page); + err = ext2_prepare_chunk(page, pos, len); + BUG_ON(err); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type(de, inode); + err = ext2_commit_chunk(page, pos, len); + ext2_put_page(page); + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(dir); +} + +/* + * Parent is locked. + */ +int ext2_add_link (struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = ext2_chunk_size(dir); + unsigned reclen = EXT2_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + ext2_dirent * de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = ext2_get_page(dir, n, 0); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + ext2_last_byte(dir, n); + de = (ext2_dirent *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = ext2_rec_len_to_disk(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + ext2_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (ext2_match (namelen, name, de)) + goto out_unlock; + name_len = EXT2_DIR_REC_LEN(de->name_len); + rec_len = ext2_rec_len_from_disk(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (ext2_dirent *) ((char *) de + rec_len); + } + unlock_page(page); + ext2_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + err = ext2_prepare_chunk(page, pos, rec_len); + if (err) + goto out_unlock; + if (de->inode) { + ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len); + de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len); + de->rec_len = ext2_rec_len_to_disk(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type (de, inode); + err = ext2_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + ext2_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * ext2_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. + */ +int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) +{ + struct inode *inode = page->mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); + unsigned to = ((char *)dir - kaddr) + + ext2_rec_len_from_disk(dir->rec_len); + loff_t pos; + ext2_dirent * pde = NULL; + ext2_dirent * de = (ext2_dirent *) (kaddr + from); + int err; + + while ((char*)de < (char*)dir) { + if (de->rec_len == 0) { + ext2_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = ext2_next_entry(de); + } + if (pde) + from = (char*)pde - (char*)page_address(page); + pos = page_offset(page) + from; + lock_page(page); + err = ext2_prepare_chunk(page, pos, to - from); + BUG_ON(err); + if (pde) + pde->rec_len = ext2_rec_len_to_disk(to - from); + dir->inode = 0; + err = ext2_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(inode); +out: + ext2_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int ext2_make_empty(struct inode *inode, struct inode *parent) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + unsigned chunk_size = ext2_chunk_size(inode); + struct ext2_dir_entry_2 * de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = ext2_prepare_chunk(page, 0, chunk_size); + if (err) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page); + memset(kaddr, 0, chunk_size); + de = (struct ext2_dir_entry_2 *)kaddr; + de->name_len = 1; + de->rec_len = ext2_rec_len_to_disk(EXT2_DIR_REC_LEN(1)); + memcpy (de->name, ".\0\0", 4); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type (de, inode); + + de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = ext2_rec_len_to_disk(chunk_size - EXT2_DIR_REC_LEN(1)); + de->inode = cpu_to_le32(parent->i_ino); + memcpy (de->name, "..\0", 4); + ext2_set_de_type (de, inode); + kunmap_atomic(kaddr); + err = ext2_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ext2_empty_dir (struct inode * inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + int dir_has_error = 0; + + for (i = 0; i < npages; i++) { + char *kaddr; + ext2_dirent * de; + page = ext2_get_page(inode, i, dir_has_error); + + if (IS_ERR(page)) { + dir_has_error = 1; + continue; + } + + kaddr = page_address(page); + de = (ext2_dirent *)kaddr; + kaddr += ext2_last_byte(inode, i) - EXT2_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + ext2_error(inode->i_sb, __func__, + "zero-length directory entry"); + printk("kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le32(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = ext2_next_entry(de); + } + ext2_put_page(page); + } + return 1; + +not_empty: + ext2_put_page(page); + return 0; +} + +const struct file_operations ext2_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = ext2_readdir, + .unlocked_ioctl = ext2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext2_compat_ioctl, +#endif + .fsync = ext2_fsync, +}; diff --git a/kernel/fs/ext2/ext2.h b/kernel/fs/ext2/ext2.h new file mode 100644 index 000000000..8d15febd0 --- /dev/null +++ b/kernel/fs/ext2/ext2.h @@ -0,0 +1,820 @@ +/* + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ +#include +#include +#include +#include +#include + +/* XXX Here for now... not interested in restructing headers JUST now */ + +/* data type for block offset of block group */ +typedef int ext2_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long ext2_fsblk_t; + +#define E2FSBLK "%lu" + +struct ext2_reserve_window { + ext2_fsblk_t _rsv_start; /* First byte reserved */ + ext2_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext2_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext2_reserve_window rsv_window; +}; + +struct ext2_block_alloc_info { + /* information about reservation window */ + struct ext2_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext2_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + __u32 last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext2_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. + */ + ext2_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * second extended-fs super-block data in memory + */ +struct ext2_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext2_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + unsigned long s_sb_block; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + unsigned long s_dir_count; + u8 *s_debts; + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock *s_blockgroup_lock; + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext2_reserve_window_node s_rsv_window_head; + /* + * s_lock protects against concurrent modifications of s_mount_state, + * s_blocks_last, s_overhead_last and the content of superblock's + * buffer pointed to by sbi->s_es. + * + * Note: It is used in ext2_show_options() to provide a consistent view + * of the mount options. + */ + spinlock_t s_lock; +}; + +static inline spinlock_t * +sb_bgl_lock(struct ext2_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} + +/* + * Define EXT2FS_DEBUG to produce debug messages + */ +#undef EXT2FS_DEBUG + +/* + * Define EXT2_RESERVATION to reserve data blocks for expanding files + */ +#define EXT2_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT2_MAX_RESERVE_BLOCKS 1027 +#define EXT2_RESERVE_WINDOW_NOT_ALLOCATED 0 +/* + * The second extended file system version + */ +#define EXT2FS_DATE "95/08/09" +#define EXT2FS_VERSION "0.5b" + +/* + * Debug code + */ +#ifdef EXT2FS_DEBUG +# define ext2_debug(f, a...) { \ + printk ("EXT2-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk (f, ## a); \ + } +#else +# define ext2_debug(f, a...) /**/ +#endif + +/* + * Special inode numbers + */ +#define EXT2_BAD_INO 1 /* Bad blocks inode */ +#define EXT2_ROOT_INO 2 /* Root inode */ +#define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ + +/* First non-reserved inode for old ext2 filesystems */ +#define EXT2_GOOD_OLD_FIRST_INO 11 + +static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT2_MIN_BLOCK_SIZE 1024 +#define EXT2_MAX_BLOCK_SIZE 4096 +#define EXT2_MIN_BLOCK_LOG_SIZE 10 +#define EXT2_BLOCK_SIZE(s) ((s)->s_blocksize) +#define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) +#define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#define EXT2_ADDR_PER_BLOCK_BITS(s) (EXT2_SB(s)->s_addr_per_block_bits) +#define EXT2_INODE_SIZE(s) (EXT2_SB(s)->s_inode_size) +#define EXT2_FIRST_INO(s) (EXT2_SB(s)->s_first_ino) + +/* + * Macro-instructions used to manage fragments + */ +#define EXT2_MIN_FRAG_SIZE 1024 +#define EXT2_MAX_FRAG_SIZE 4096 +#define EXT2_MIN_FRAG_LOG_SIZE 10 +#define EXT2_FRAG_SIZE(s) (EXT2_SB(s)->s_frag_size) +#define EXT2_FRAGS_PER_BLOCK(s) (EXT2_SB(s)->s_frags_per_block) + +/* + * Structure of a blocks group descriptor + */ +struct ext2_group_desc +{ + __le32 bg_block_bitmap; /* Blocks bitmap block */ + __le32 bg_inode_bitmap; /* Inodes bitmap block */ + __le32 bg_inode_table; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __le16 bg_pad; + __le32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT2_BLOCKS_PER_GROUP(s) (EXT2_SB(s)->s_blocks_per_group) +#define EXT2_DESC_PER_BLOCK(s) (EXT2_SB(s)->s_desc_per_block) +#define EXT2_INODES_PER_GROUP(s) (EXT2_SB(s)->s_inodes_per_group) +#define EXT2_DESC_PER_BLOCK_BITS(s) (EXT2_SB(s)->s_desc_per_block_bits) + +/* + * Constants relative to the data blocks + */ +#define EXT2_NDIR_BLOCKS 12 +#define EXT2_IND_BLOCK EXT2_NDIR_BLOCKS +#define EXT2_DIND_BLOCK (EXT2_IND_BLOCK + 1) +#define EXT2_TIND_BLOCK (EXT2_DIND_BLOCK + 1) +#define EXT2_N_BLOCKS (EXT2_TIND_BLOCK + 1) + +/* + * Inode flags (GETFLAGS/SETFLAGS) + */ +#define EXT2_SECRM_FL FS_SECRM_FL /* Secure deletion */ +#define EXT2_UNRM_FL FS_UNRM_FL /* Undelete */ +#define EXT2_COMPR_FL FS_COMPR_FL /* Compress file */ +#define EXT2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ +#define EXT2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ +#define EXT2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ +#define EXT2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ +#define EXT2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT2_DIRTY_FL FS_DIRTY_FL +#define EXT2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ +#define EXT2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ +#define EXT2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT2_BTREE_FL FS_BTREE_FL /* btree format dir */ +#define EXT2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ +#define EXT2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ +#define EXT2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ +#define EXT2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ +#define EXT2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ +#define EXT2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ +#define EXT2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + +#define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ +#define EXT2_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\ + EXT2_SYNC_FL | EXT2_NODUMP_FL |\ + EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\ + EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\ + EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT2_REG_FLMASK (~(EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT2_OTHER_FLMASK (EXT2_NODUMP_FL | EXT2_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT2_REG_FLMASK; + else + return flags & EXT2_OTHER_FLMASK; +} + +/* + * ioctl commands + */ +#define EXT2_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT2_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT2_IOC_GETVERSION FS_IOC_GETVERSION +#define EXT2_IOC_SETVERSION FS_IOC_SETVERSION +#define EXT2_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT2_IOC_SETRSVSZ _IOW('f', 6, long) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT2_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT2_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT2_IOC32_GETVERSION FS_IOC32_GETVERSION +#define EXT2_IOC32_SETVERSION FS_IOC32_SETVERSION + +/* + * Structure of an inode on the disk + */ +struct ext2_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_reserved1; + } linux1; + struct { + __le32 h_i_translator; + } hurd1; + struct { + __le32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT2_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl; /* File ACL */ + __le32 i_dir_acl; /* Directory ACL */ + __le32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __le16 h_i_mode_high; + __le16 h_i_uid_high; + __le16 h_i_gid_high; + __le32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ +}; + +#define i_size_high i_dir_acl + +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_frag osd2.linux2.l_i_frag +#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +/* + * File system states + */ +#define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT2_ERROR_FS 0x0002 /* Errors detected */ + +/* + * Mount flags + */ +#define EXT2_MOUNT_CHECK 0x000001 /* Do mount-time checks */ +#define EXT2_MOUNT_OLDALLOC 0x000002 /* Don't use the new Orlov allocator */ +#define EXT2_MOUNT_GRPID 0x000004 /* Create files with directory's group */ +#define EXT2_MOUNT_DEBUG 0x000008 /* Some debugging messages */ +#define EXT2_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */ +#define EXT2_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */ +#define EXT2_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */ +#define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */ +#define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */ +#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ +#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ +#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ +#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */ +#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ +#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ +#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ +#ifdef CONFIG_FS_DAX +#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ +#else +#define EXT2_MOUNT_DAX 0 +#endif + + +#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt +#define set_opt(o, opt) o |= EXT2_MOUNT_##opt +#define test_opt(sb, opt) (EXT2_SB(sb)->s_mount_opt & \ + EXT2_MOUNT_##opt) +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT2_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT2_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT2_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT2_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT2_ERRORS_PANIC 3 /* Panic */ +#define EXT2_ERRORS_DEFAULT EXT2_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext2_super_block { + __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count; /* Blocks count */ + __le32 s_r_blocks_count; /* Reserved blocks count */ + __le32 s_free_blocks_count; /* Free blocks count */ + __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_frag_size; /* Fragment size */ + __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_frags_per_group; /* # Fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ + __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ + __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ + __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT2_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ + __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ + __u8 s_uuid[16]; /* 128-bit uuid for volume */ + char s_volume_name[16]; /* volume name */ + char s_last_mounted[64]; /* directory where last mounted */ + __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT2_COMPAT_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __u16 s_padding1; + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ + __u8 s_journal_uuid[16]; /* uuid of journal superblock */ + __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ + __u32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __u32 s_reserved[190]; /* Padding to the end of the block */ +}; + +/* + * Codes for operating systems + */ +#define EXT2_OS_LINUX 0 +#define EXT2_OS_HURD 1 +#define EXT2_OS_MASIX 2 +#define EXT2_OS_FREEBSD 3 +#define EXT2_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT2_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT2_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT2_CURRENT_REV EXT2_GOOD_OLD_REV +#define EXT2_MAX_SUPP_REV EXT2_DYNAMIC_REV + +#define EXT2_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT2_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT2_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT2_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT2_SET_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT2_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT2_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT2_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT2_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT2_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT2_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT2_FEATURE_COMPAT_RESIZE_INO 0x0010 +#define EXT2_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT2_FEATURE_COMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT2_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT2_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT2_FEATURE_RO_COMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 +#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 +#define EXT2_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \ + EXT2_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT2_FEATURE_RO_COMPAT_BTREE_DIR) +#define EXT2_FEATURE_RO_COMPAT_UNSUPPORTED ~EXT2_FEATURE_RO_COMPAT_SUPP +#define EXT2_FEATURE_INCOMPAT_UNSUPPORTED ~EXT2_FEATURE_INCOMPAT_SUPP + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT2_DEF_RESUID 0 +#define EXT2_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT2_DEFM_DEBUG 0x0001 +#define EXT2_DEFM_BSDGROUPS 0x0002 +#define EXT2_DEFM_XATTR_USER 0x0004 +#define EXT2_DEFM_ACL 0x0008 +#define EXT2_DEFM_UID16 0x0010 + /* Not used by ext2, but reserved for use by ext3 */ +#define EXT3_DEFM_JMODE 0x0060 +#define EXT3_DEFM_JMODE_DATA 0x0020 +#define EXT3_DEFM_JMODE_ORDERED 0x0040 +#define EXT3_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ + +struct ext2_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[]; /* File name, up to EXT2_NAME_LEN */ +}; + +/* + * The new version of the directory entry. Since EXT2 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext2_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[]; /* File name, up to EXT2_NAME_LEN */ +}; + +/* + * Ext2 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +enum { + EXT2_FT_UNKNOWN = 0, + EXT2_FT_REG_FILE = 1, + EXT2_FT_DIR = 2, + EXT2_FT_CHRDEV = 3, + EXT2_FT_BLKDEV = 4, + EXT2_FT_FIFO = 5, + EXT2_FT_SOCK = 6, + EXT2_FT_SYMLINK = 7, + EXT2_FT_MAX +}; + +/* + * EXT2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT2_DIR_PAD 4 +#define EXT2_DIR_ROUND (EXT2_DIR_PAD - 1) +#define EXT2_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT2_DIR_ROUND) & \ + ~EXT2_DIR_ROUND) +#define EXT2_MAX_REC_LEN ((1<<16)-1) + +static inline void verify_offsets(void) +{ +#define A(x,y) BUILD_BUG_ON(x != offsetof(struct ext2_super_block, y)); + A(EXT2_SB_MAGIC_OFFSET, s_magic); + A(EXT2_SB_BLOCKS_OFFSET, s_blocks_count); + A(EXT2_SB_BSIZE_OFFSET, s_log_block_size); +#undef A +} + +/* + * ext2 mount options + */ +struct ext2_mount_options { + unsigned long s_mount_opt; + kuid_t s_resuid; + kgid_t s_resgid; +}; + +/* + * second extended file system inode data in memory + */ +struct ext2_inode_info { + __le32 i_data[15]; + __u32 i_flags; + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; + __u16 i_state; + __u32 i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + __u32 i_block_group; + + /* block reservation info */ + struct ext2_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT2_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + rwlock_t i_meta_lock; + + /* + * truncate_mutex is for serialising ext2_truncate() against + * ext2_getblock(). It also protects the internals of the inode's + * reservation data structures: ext2_reserve_window and + * ext2_reserve_window_node. + */ + struct mutex truncate_mutex; + struct inode vfs_inode; + struct list_head i_orphan; /* unlinked but open inodes */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif +}; + +/* + * Inode dynamic state flags + */ +#define EXT2_STATE_NEW 0x00000001 /* inode is newly created */ + + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext2 source programs needs to include it so they are duplicated here. + */ + +static inline struct ext2_inode_info *EXT2_I(struct inode *inode) +{ + return container_of(inode, struct ext2_inode_info, vfs_inode); +} + +/* balloc.c */ +extern int ext2_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group); +extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *); +extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long, + unsigned long *, int *); +extern void ext2_free_blocks (struct inode *, unsigned long, + unsigned long); +extern unsigned long ext2_count_free_blocks (struct super_block *); +extern unsigned long ext2_count_dirs (struct super_block *); +extern void ext2_check_blocks_bitmap (struct super_block *); +extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); +extern void ext2_discard_reservation (struct inode *); +extern int ext2_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext2_init_block_alloc_info(struct inode *); +extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv); + +/* dir.c */ +extern int ext2_add_link (struct dentry *, struct inode *); +extern ino_t ext2_inode_by_name(struct inode *, struct qstr *); +extern int ext2_make_empty(struct inode *, struct inode *); +extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **); +extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); +extern int ext2_empty_dir (struct inode *); +extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); +extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); + +/* ialloc.c */ +extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *); +extern void ext2_free_inode (struct inode *); +extern unsigned long ext2_count_free_inodes (struct super_block *); +extern void ext2_check_inodes_bitmap (struct super_block *); +extern unsigned long ext2_count_free (struct buffer_head *, unsigned); + +/* inode.c */ +extern struct inode *ext2_iget (struct super_block *, unsigned long); +extern int ext2_write_inode (struct inode *, struct writeback_control *); +extern void ext2_evict_inode(struct inode *); +extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int ext2_setattr (struct dentry *, struct iattr *); +extern void ext2_set_inode_flags(struct inode *inode); +extern void ext2_get_inode_flags(struct ext2_inode_info *); +extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +/* ioctl.c */ +extern long ext2_ioctl(struct file *, unsigned int, unsigned long); +extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* namei.c */ +struct dentry *ext2_get_parent(struct dentry *child); + +/* super.c */ +extern __printf(3, 4) +void ext2_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext2_msg(struct super_block *, const char *, const char *, ...); +extern void ext2_update_dynamic_rev (struct super_block *sb); +extern void ext2_write_super (struct super_block *); + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext2_dir_operations; + +/* file.c */ +extern int ext2_fsync(struct file *file, loff_t start, loff_t end, + int datasync); +extern const struct inode_operations ext2_file_inode_operations; +extern const struct file_operations ext2_file_operations; + +/* inode.c */ +extern const struct address_space_operations ext2_aops; +extern const struct address_space_operations ext2_nobh_aops; + +/* namei.c */ +extern const struct inode_operations ext2_dir_inode_operations; +extern const struct inode_operations ext2_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext2_fast_symlink_inode_operations; +extern const struct inode_operations ext2_symlink_inode_operations; + +static inline ext2_fsblk_t +ext2_group_first_block_no(struct super_block *sb, unsigned long group_no) +{ + return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); +} + +#define ext2_set_bit __test_and_set_bit_le +#define ext2_clear_bit __test_and_clear_bit_le +#define ext2_test_bit test_bit_le +#define ext2_find_first_zero_bit find_first_zero_bit_le +#define ext2_find_next_zero_bit find_next_zero_bit_le diff --git a/kernel/fs/ext2/file.c b/kernel/fs/ext2/file.c new file mode 100644 index 000000000..3a0a6c640 --- /dev/null +++ b/kernel/fs/ext2/file.c @@ -0,0 +1,121 @@ +/* + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +#ifdef CONFIG_FS_DAX +static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext2_get_block); +} + +static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext2_get_block); +} + +static const struct vm_operations_struct ext2_dax_vm_ops = { + .fault = ext2_dax_fault, + .page_mkwrite = ext2_dax_mkwrite, + .pfn_mkwrite = dax_pfn_mkwrite, +}; + +static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_mmap(file, vma); + + file_accessed(file); + vma->vm_ops = &ext2_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + return 0; +} +#else +#define ext2_file_mmap generic_file_mmap +#endif + +/* + * Called when filp is released. This happens when all file descriptors + * for a single struct file are closed. Note that different open() calls + * for the same file yield different struct file structures. + */ +static int ext2_release_file (struct inode * inode, struct file * filp) +{ + if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&EXT2_I(inode)->truncate_mutex); + ext2_discard_reservation(inode); + mutex_unlock(&EXT2_I(inode)->truncate_mutex); + } + return 0; +} + +int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + int ret; + struct super_block *sb = file->f_mapping->host->i_sb; + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + + ret = generic_file_fsync(file, start, end, datasync); + if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { + /* We don't really know where the IO error happened... */ + ext2_error(sb, __func__, + "detected IO error when writing metadata buffers"); + ret = -EIO; + } + return ret; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the ext2 filesystem. + */ +const struct file_operations ext2_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .unlocked_ioctl = ext2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext2_compat_ioctl, +#endif + .mmap = ext2_file_mmap, + .open = dquot_file_open, + .release = ext2_release_file, + .fsync = ext2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +}; + +const struct inode_operations ext2_file_inode_operations = { +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, + .fiemap = ext2_fiemap, +}; diff --git a/kernel/fs/ext2/ialloc.c b/kernel/fs/ext2/ialloc.c new file mode 100644 index 000000000..5c04a0dde --- /dev/null +++ b/kernel/fs/ext2/ialloc.c @@ -0,0 +1,675 @@ +/* + * linux/fs/ext2/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +read_inode_bitmap(struct super_block * sb, unsigned long block_group) +{ + struct ext2_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext2_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + if (!bh) + ext2_error(sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %u", + block_group, le32_to_cpu(desc->bg_inode_bitmap)); +error_out: + return bh; +} + +static void ext2_release_inode(struct super_block *sb, int group, int dir) +{ + struct ext2_group_desc * desc; + struct buffer_head *bh; + + desc = ext2_get_group_desc(sb, group, &bh); + if (!desc) { + ext2_error(sb, "ext2_release_inode", + "can't get descriptor for group %d", group); + return; + } + + spin_lock(sb_bgl_lock(EXT2_SB(sb), group)); + le16_add_cpu(&desc->bg_free_inodes_count, 1); + if (dir) + le16_add_cpu(&desc->bg_used_dirs_count, -1); + spin_unlock(sb_bgl_lock(EXT2_SB(sb), group)); + if (dir) + percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter); + mark_buffer_dirty(bh); +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext2_free_inode (struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh; + unsigned long block_group; + unsigned long bit; + struct ext2_super_block * es; + + ino = inode->i_ino; + ext2_debug ("freeing inode %lu\n", ino); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + /* Quota is already initialized in iput() */ + dquot_free_inode(inode); + dquot_drop(inode); + + es = EXT2_SB(sb)->s_es; + is_directory = S_ISDIR(inode->i_mode); + + if (ino < EXT2_FIRST_INO(sb) || + ino > le32_to_cpu(es->s_inodes_count)) { + ext2_error (sb, "ext2_free_inode", + "reserved or nonexistent inode %lu", ino); + return; + } + block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group), + bit, (void *) bitmap_bh->b_data)) + ext2_error (sb, "ext2_free_inode", + "bit already cleared for inode %lu", ino); + else + ext2_release_inode(sb, block_group, is_directory); + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + brelse(bitmap_bh); +} + +/* + * We perform asynchronous prereading of the new inode's inode block when + * we create the inode, in the expectation that the inode will be written + * back soon. There are two reasons: + * + * - When creating a large number of files, the async prereads will be + * nicely merged into large reads + * - When writing out a large number of inodes, we don't need to keep on + * stalling the writes while we read the inode block. + * + * FIXME: ext2_get_group_desc() needs to be simplified. + */ +static void ext2_preread_inode(struct inode *inode) +{ + unsigned long block_group; + unsigned long offset; + unsigned long block; + struct ext2_group_desc * gdp; + struct backing_dev_info *bdi; + + bdi = inode_to_bdi(inode); + if (bdi_read_congested(bdi)) + return; + if (bdi_write_congested(bdi)) + return; + + block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); + if (gdp == NULL) + return; + + /* + * Figure out the offset within the block group inode table + */ + offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * + EXT2_INODE_SIZE(inode->i_sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb)); + sb_breadahead(inode->i_sb, block); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory\'s block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent) +{ + int ngroups = EXT2_SB(sb)->s_groups_count; + int avefreei = ext2_count_free_inodes(sb) / ngroups; + struct ext2_group_desc *desc, *best_desc = NULL; + int group, best_group = -1; + + for (group = 0; group < ngroups; group++) { + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + best_group = group; + best_desc = desc; + } + } + if (!best_desc) + return -1; + + return best_group; +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large debt (max_debt). + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. + */ + +#define INODE_COST 64 +#define BLOCK_COST 256 + +static int find_group_orlov(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT2_I(parent)->i_block_group; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + int ngroups = sbi->s_groups_count; + int inodes_per_group = EXT2_INODES_PER_GROUP(sb); + int freei; + int avefreei; + int free_blocks; + int avefreeb; + int blocks_per_dir; + int ndirs; + int max_debt, max_dirs, min_blocks, min_inodes; + int group = -1, i; + struct ext2_group_desc *desc; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = free_blocks / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == d_inode(sb->s_root)) || + (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) { + struct ext2_group_desc *best_desc = NULL; + int best_ndir = inodes_per_group; + int best_group = -1; + + group = prandom_u32(); + parent_group = (unsigned)group % ngroups; + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + best_group = group; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + best_desc = desc; + } + if (best_group >= 0) { + desc = best_desc; + group = best_group; + goto found; + } + goto fallback; + } + + if (ndirs == 0) + ndirs = 1; /* percpu_counters are approximate... */ + + blocks_per_dir = (le32_to_cpu(es->s_blocks_count)-free_blocks) / ndirs; + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT2_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT2_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; + + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (sbi->s_debts[group] >= max_debt) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + goto found; + } + +fallback: + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + goto found; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; + +found: + return group; +} + +static int find_group_other(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT2_I(parent)->i_block_group; + int ngroups = EXT2_SB(sb)->s_groups_count; + struct ext2_group_desc *desc; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + goto found; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some + * free blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + goto found; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + goto found; + } + + return -1; + +found: + return group; +} + +struct inode *ext2_new_inode(struct inode *dir, umode_t mode, + const struct qstr *qstr) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + int group, i; + ino_t ino = 0; + struct inode * inode; + struct ext2_group_desc *gdp; + struct ext2_super_block *es; + struct ext2_inode_info *ei; + struct ext2_sb_info *sbi; + int err; + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + ei = EXT2_I(inode); + sbi = EXT2_SB(sb); + es = sbi->s_es; + if (S_ISDIR(mode)) { + if (test_opt(sb, OLDALLOC)) + group = find_group_dir(sb, dir); + else + group = find_group_orlov(sb, dir); + } else + group = find_group_other(sb, dir); + + if (group == -1) { + err = -ENOSPC; + goto fail; + } + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext2_get_group_desc(sb, group, &bh2); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto fail; + } + ino = 0; + +repeat_in_this_group: + ino = ext2_find_next_zero_bit((unsigned long *)bitmap_bh->b_data, + EXT2_INODES_PER_GROUP(sb), ino); + if (ino >= EXT2_INODES_PER_GROUP(sb)) { + /* + * Rare race: find_group_xx() decided that there were + * free inodes in this group, but by the time we tried + * to allocate one, they're all gone. This can also + * occur because the counters which find_group_orlov() + * uses are approximate. So just go and search the + * next block group. + */ + if (++group == sbi->s_groups_count) + group = 0; + continue; + } + if (ext2_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we lost this inode */ + if (++ino >= EXT2_INODES_PER_GROUP(sb)) { + /* this group is exhausted, try next group */ + if (++group == sbi->s_groups_count) + group = 0; + continue; + } + /* try to find free inode in the same group */ + goto repeat_in_this_group; + } + goto got; + } + + /* + * Scanned all blockgroups. + */ + err = -ENOSPC; + goto fail; +got: + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + brelse(bitmap_bh); + + ino += group * EXT2_INODES_PER_GROUP(sb) + 1; + if (ino < EXT2_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext2_error (sb, "ext2_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d,inode=%lu", group, + (unsigned long) ino); + err = -EIO; + goto fail; + } + + percpu_counter_add(&sbi->s_freeinodes_counter, -1); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); + if (S_ISDIR(mode)) { + if (sbi->s_debts[group] < 255) + sbi->s_debts[group]++; + le16_add_cpu(&gdp->bg_used_dirs_count, 1); + } else { + if (sbi->s_debts[group]) + sbi->s_debts[group]--; + } + spin_unlock(sb_bgl_lock(sbi, group)); + + mark_buffer_dirty(bh2); + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_flags = + ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; + ei->i_file_acl = 0; + ei->i_dir_acl = 0; + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; + ei->i_dir_start_lookup = 0; + ei->i_state = EXT2_STATE_NEW; + ext2_set_inode_flags(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + if (insert_inode_locked(inode) < 0) { + ext2_error(sb, "ext2_new_inode", + "inode number already in use - inode=%lu", + (unsigned long) ino); + err = -EIO; + goto fail; + } + + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext2_init_acl(inode, dir); + if (err) + goto fail_free_drop; + + err = ext2_init_security(inode, dir, qstr); + if (err) + goto fail_free_drop; + + mark_inode_dirty(inode); + ext2_debug("allocating inode %lu\n", inode->i_ino); + ext2_preread_inode(inode); + return inode; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); + +fail: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +unsigned long ext2_count_free_inodes (struct super_block * sb) +{ + struct ext2_group_desc *desc; + unsigned long desc_count = 0; + int i; + +#ifdef EXT2FS_DEBUG + struct ext2_super_block *es; + unsigned long bitmap_count = 0; + struct buffer_head *bitmap_bh = NULL; + + es = EXT2_SB(sb)->s_es; + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + unsigned x; + + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext2_count_free(bitmap_bh, EXT2_INODES_PER_GROUP(sb) / 8); + printk("group %d: stored = %d, counted = %u\n", + i, le16_to_cpu(desc->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n", + (unsigned long) + percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter), + desc_count, bitmap_count); + return desc_count; +#else + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext2_count_dirs (struct super_block * sb) +{ + unsigned long count = 0; + int i; + + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + struct ext2_group_desc *gdp = ext2_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + diff --git a/kernel/fs/ext2/inode.c b/kernel/fs/ext2/inode.c new file mode 100644 index 000000000..f460ae36d --- /dev/null +++ b/kernel/fs/ext2/inode.c @@ -0,0 +1,1573 @@ +/* + * linux/fs/ext2/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@dcs.ed.ac.uk), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "acl.h" +#include "xattr.h" + +static int __ext2_write_inode(struct inode *inode, int do_sync); + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ext2_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT2_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && + inode->i_blocks - ea_blocks == 0); +} + +static void ext2_truncate_blocks(struct inode *inode, loff_t offset); + +static void ext2_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + ext2_truncate_blocks(inode, inode->i_size); + } +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext2_evict_inode(struct inode * inode) +{ + struct ext2_block_alloc_info *rsv; + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) { + want_delete = 1; + dquot_initialize(inode); + } else { + dquot_drop(inode); + } + + truncate_inode_pages_final(&inode->i_data); + + if (want_delete) { + sb_start_intwrite(inode->i_sb); + /* set dtime */ + EXT2_I(inode)->i_dtime = get_seconds(); + mark_inode_dirty(inode); + __ext2_write_inode(inode, inode_needs_sync(inode)); + /* truncate to 0 */ + inode->i_size = 0; + if (inode->i_blocks) + ext2_truncate_blocks(inode, 0); + ext2_xattr_delete_inode(inode); + } + + invalidate_inode_buffers(inode); + clear_inode(inode); + + ext2_discard_reservation(inode); + rsv = EXT2_I(inode)->i_block_alloc_info; + EXT2_I(inode)->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); + + if (want_delete) { + ext2_free_inode(inode); + sb_end_intwrite(inode->i_sb); + } +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext2_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * To store the locations of file's data ext2 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext2_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT2_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block < 0", __func__); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT2_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT2_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT2_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block is too big", __func__); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + + return n; +} + +/** + * ext2_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + */ +static Indirect *ext2_get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[4], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT2_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + read_lock(&EXT2_I(inode)->i_meta_lock); + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + read_unlock(&EXT2_I(inode)->i_meta_lock); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + read_unlock(&EXT2_I(inode)->i_meta_lock); + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext2_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ + +static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + ext2_fsblk_t bg_start; + ext2_fsblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) + if (*p) + return le32_to_cpu(*p); + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred from inode itself? OK, just put it into + * the same cylinder group then. + */ + bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group); + colour = (current->pid % 16) * + (EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext2_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Returns preferred place for a block (the goal). + */ + +static inline ext2_fsblk_t ext2_find_goal(struct inode *inode, long block, + Indirect *partial) +{ + struct ext2_block_alloc_info *block_i; + + block_i = EXT2_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; + } + + return ext2_find_near(inode, partial); +} + +/** + * ext2_blks_to_allocate: Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int +ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now don't hanel cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary + && le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext2_alloc_blocks: multiple allocate blocks needed for a branch + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @blks: on return it will store the total number of allocated + * direct blocks + */ +static int ext2_alloc_blocks(struct inode *inode, + ext2_fsblk_t goal, int indirect_blks, int blks, + ext2_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext2_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext2_new_blocks(inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i key). Upon the exit we have the same + * picture as after the successful ext2_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ + +static int ext2_alloc_branch(struct inode *inode, + int indirect_blks, int *blks, ext2_fsblk_t goal, + int *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext2_fsblk_t new_blocks[4]; + ext2_fsblk_t current_block; + + num = ext2_alloc_blocks(inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } + branch[n].bh = bh; + lock_buffer(bh); + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + /* We used to sync bh here if IS_SYNC(inode). + * But we now rely upon generic_write_sync() + * and b_inode_buffers. But not for directories. + */ + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + sync_dirty_buffer(bh); + } + *blks = num; + return err; + +failed: + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < indirect_blks; i++) + ext2_free_blocks(inode, new_blocks[i], 1); + ext2_free_blocks(inode, new_blocks[i], num); + return err; +} + +/** + * ext2_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static void ext2_splice_branch(struct inode *inode, + long block, Indirect *where, int num, int blks) +{ + int i; + struct ext2_block_alloc_info *block_i; + ext2_fsblk_t current_block; + + block_i = EXT2_I(inode)->i_block_alloc_info; + + /* XXX LOCKING probably should have i_meta_lock ?*/ + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + /* had we spliced it onto indirect block? */ + if (where->bh) + mark_buffer_dirty_inode(where->bh, inode); + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + */ +static int ext2_get_blocks(struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext2_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext2_inode_info *ei = EXT2_I(inode); + int count = 0; + ext2_fsblk_t first_block = 0; + + BUG_ON(maxblocks == 0); + + depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); + + if (depth == 0) + return (err); + + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + clear_buffer_new(bh_result); /* What's this do? */ + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext2_fsblk_t blk; + + if (!verify_chain(chain, chain + depth - 1)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now, go to reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + mutex_lock(&ei->truncate_mutex); + /* + * If the indirect block is missing while we are reading + * the chain(ext2_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext2_init_block_alloc_info(inode); + + goal = ext2_find_goal(inode, iblock, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext2_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + /* + * XXX ???? Block out ext2_truncate while we alter the tree + */ + err = ext2_alloc_branch(inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); + + if (err) { + mutex_unlock(&ei->truncate_mutex); + goto cleanup; + } + + if (IS_DAX(inode)) { + /* + * block must be initialised before we put it in the tree + * so that it's not found by another thread before it's + * initialised + */ + err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), + 1 << inode->i_blkbits); + if (err) { + mutex_unlock(&ei->truncate_mutex); + goto cleanup; + } + } + + ext2_splice_branch(inode, iblock, partial, indirect_blks, count); + mutex_unlock(&ei->truncate_mutex); + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + return err; +} + +int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int ret = ext2_get_blocks(inode, iblock, max_blocks, + bh_result, create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; + +} + +int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext2_get_block); +} + +static int ext2_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, ext2_get_block, wbc); +} + +static int ext2_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, ext2_get_block); +} + +static int +ext2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); +} + +static int +ext2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + ext2_get_block); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int ext2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int +ext2_nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, + ext2_get_block); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int ext2_nobh_writepage(struct page *page, + struct writeback_control *wbc) +{ + return nobh_writepage(page, ext2_get_block, wbc); +} + +static sector_t ext2_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,ext2_get_block); +} + +static ssize_t +ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (IS_DAX(inode)) + ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL, + DIO_LOCKING); + else + ret = blockdev_direct_IO(iocb, inode, iter, offset, + ext2_get_block); + if (ret < 0 && iov_iter_rw(iter) == WRITE) + ext2_write_failed(mapping, offset + count); + return ret; +} + +static int +ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, ext2_get_block); +} + +const struct address_space_operations ext2_aops = { + .readpage = ext2_readpage, + .readpages = ext2_readpages, + .writepage = ext2_writepage, + .write_begin = ext2_write_begin, + .write_end = ext2_write_end, + .bmap = ext2_bmap, + .direct_IO = ext2_direct_IO, + .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +const struct address_space_operations ext2_nobh_aops = { + .readpage = ext2_readpage, + .readpages = ext2_readpages, + .writepage = ext2_nobh_writepage, + .write_begin = ext2_nobh_write_begin, + .write_end = nobh_write_end, + .bmap = ext2_bmap, + .direct_IO = ext2_direct_IO, + .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, + .error_remove_page = generic_error_remove_page, +}; + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext2_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext2_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext2_truncate(). + * + * When we do truncate() we may have to clean the ends of several indirect + * blocks but leave the blocks themselves alive. Block is partially + * truncated if some data below the new i_size is referred from it (and + * it is on the path to the first completely truncated data block, indeed). + * We have to free the top of that path along with everything to the right + * of the path. Since no allocation past the truncation point is possible + * until ext2_truncate() finishes, we may safely do the latter, but top + * of branch may require special attention - pageout below the truncation + * point might try to populate it. + * + * We atomically detach the top of branch from the tree, store the block + * number of its root in *@top, pointers to buffer_heads of partially + * truncated blocks - in @chain[].bh and pointers to their last elements + * that should not be removed - in @chain[].p. Return value is the pointer + * to last filled element of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0].p + * (no partially truncated stuff there). + */ + +static Indirect *ext2_find_shared(struct inode *inode, + int depth, + int offsets[4], + Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext2_get_branch(inode, k, offsets, chain, &err); + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + write_lock(&EXT2_I(inode)->i_meta_lock); + if (!partial->key && *partial->p) { + write_unlock(&EXT2_I(inode)->i_meta_lock); + goto no_top; + } + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&EXT2_I(inode)->i_meta_lock); + + while(partial > p) + { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/** + * ext2_free_data - free a list of data blocks + * @inode: inode we are dealing with + * @p: array of block numbers + * @q: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) +{ + unsigned long block_to_free = 0, count = 0; + unsigned long nr; + + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); + if (nr) { + *p = 0; + /* accumulate blocks to free if they're contiguous */ + if (count == 0) + goto free_this; + else if (block_to_free == nr - count) + count++; + else { + ext2_free_blocks (inode, block_to_free, count); + mark_inode_dirty(inode); + free_this: + block_to_free = nr; + count = 1; + } + } + } + if (count > 0) { + ext2_free_blocks (inode, block_to_free, count); + mark_inode_dirty(inode); + } +} + +/** + * ext2_free_branches - free an array of branches + * @inode: inode we are dealing with + * @p: array of block numbers + * @q: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth) +{ + struct buffer_head * bh; + unsigned long nr; + + if (depth--) { + int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); + if (!nr) + continue; + *p = 0; + bh = sb_bread(inode->i_sb, nr); + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext2_error(inode->i_sb, "ext2_free_branches", + "Read failure, inode=%ld, block=%ld", + inode->i_ino, nr); + continue; + } + ext2_free_branches(inode, + (__le32*)bh->b_data, + (__le32*)bh->b_data + addr_per_block, + depth); + bforget(bh); + ext2_free_blocks(inode, nr, 1); + mark_inode_dirty(inode); + } + } else + ext2_free_data(inode, p, q); +} + +static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) +{ + __le32 *i_data = EXT2_I(inode)->i_data; + struct ext2_inode_info *ei = EXT2_I(inode); + int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + long iblock; + unsigned blocksize; + blocksize = inode->i_sb->s_blocksize; + iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); + + n = ext2_block_to_path(inode, iblock, offsets, NULL); + if (n == 0) + return; + + /* + * From here we block out all ext2_get_block() callers who want to + * modify the block allocation tree. + */ + mutex_lock(&ei->truncate_mutex); + + if (n == 1) { + ext2_free_data(inode, i_data+offsets[0], + i_data + EXT2_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext2_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (already detached) */ + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + mark_buffer_dirty_inode(partial->bh, inode); + ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext2_free_branches(inode, + partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + mark_buffer_dirty_inode(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT2_IND_BLOCK]; + if (nr) { + i_data[EXT2_IND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 1); + } + case EXT2_IND_BLOCK: + nr = i_data[EXT2_DIND_BLOCK]; + if (nr) { + i_data[EXT2_DIND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 2); + } + case EXT2_DIND_BLOCK: + nr = i_data[EXT2_TIND_BLOCK]; + if (nr) { + i_data[EXT2_TIND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 3); + } + case EXT2_TIND_BLOCK: + ; + } + + ext2_discard_reservation(inode); + + mutex_unlock(&ei->truncate_mutex); +} + +static void ext2_truncate_blocks(struct inode *inode, loff_t offset) +{ + /* + * XXX: it seems like a bug here that we don't allow + * IS_APPEND inode to have blocks-past-i_size trimmed off. + * review and fix this. + * + * Also would be nice to be able to handle IO errors and such, + * but that's probably too much to ask. + */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (ext2_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ext2_truncate_blocks(inode, offset); +} + +static int ext2_setsize(struct inode *inode, loff_t newsize) +{ + int error; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (ext2_inode_is_fast_symlink(inode)) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + inode_dio_wait(inode); + + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, ext2_get_block); + else if (test_opt(inode->i_sb, NOBH)) + error = nobh_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + else + error = block_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + if (error) + return error; + + truncate_setsize(inode, newsize); + __ext2_truncate_blocks(inode, newsize); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + if (inode_needs_sync(inode)) { + sync_mapping_buffers(inode->i_mapping); + sync_inode_metadata(inode, 1); + } else { + mark_inode_dirty(inode); + } + + return 0; +} + +static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, + struct buffer_head **p) +{ + struct buffer_head * bh; + unsigned long block_group; + unsigned long block; + unsigned long offset; + struct ext2_group_desc * gdp; + + *p = NULL; + if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) || + ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) + goto Einval; + + block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); + gdp = ext2_get_group_desc(sb, block_group, NULL); + if (!gdp) + goto Egdp; + /* + * Figure out the offset within the block group inode table + */ + offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT2_BLOCK_SIZE_BITS(sb)); + if (!(bh = sb_bread(sb, block))) + goto Eio; + + *p = bh; + offset &= (EXT2_BLOCK_SIZE(sb) - 1); + return (struct ext2_inode *) (bh->b_data + offset); + +Einval: + ext2_error(sb, "ext2_get_inode", "bad inode number: %lu", + (unsigned long) ino); + return ERR_PTR(-EINVAL); +Eio: + ext2_error(sb, "ext2_get_inode", + "unable to read inode block - inode=%lu, block=%lu", + (unsigned long) ino, block); +Egdp: + return ERR_PTR(-EIO); +} + +void ext2_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT2_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC | S_DAX); + if (flags & EXT2_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT2_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT2_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + inode->i_flags |= S_DAX; +} + +/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ +void ext2_get_inode_flags(struct ext2_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| + EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT2_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT2_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT2_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT2_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT2_DIRSYNC_FL; +} + +struct inode *ext2_iget (struct super_block *sb, unsigned long ino) +{ + struct ext2_inode_info *ei; + struct buffer_head * bh; + struct ext2_inode *raw_inode; + struct inode *inode; + long ret = -EIO; + int n; + uid_t i_uid; + gid_t i_gid; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT2_I(inode); + ei->i_block_alloc_info = NULL; + + raw_inode = ext2_get_inode(inode->i_sb, ino, &bh); + if (IS_ERR(raw_inode)) { + ret = PTR_ERR(raw_inode); + goto bad_inode; + } + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt (inode->i_sb, NO_UID32))) { + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) { + /* this inode is deleted */ + brelse (bh); + ret = -ESTALE; + goto bad_inode; + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ei->i_dir_acl = 0; + if (S_ISREG(inode->i_mode)) + inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + else + ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + ei->i_dtime = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_state = 0; + ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + ei->i_dir_start_lookup = 0; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (n = 0; n < EXT2_N_BLOCKS; n++) + ei->i_data[n] = raw_inode->i_block[n]; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext2_file_inode_operations; + if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext2_dir_inode_operations; + inode->i_fop = &ext2_dir_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (ext2_inode_is_fast_symlink(inode)) { + inode->i_op = &ext2_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext2_symlink_inode_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + } + } else { + inode->i_op = &ext2_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse (bh); + ext2_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +static int __ext2_write_inode(struct inode *inode, int do_sync) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + uid_t uid = i_uid_read(inode); + gid_t gid = i_gid_read(inode); + struct buffer_head * bh; + struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); + int n; + int err = 0; + + if (IS_ERR(raw_inode)) + return -EIO; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ei->i_state & EXT2_STATE_NEW) + memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); + + ext2_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid)); + raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->i_size); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + else { + raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32); + if (inode->i_size > 0x7fffffffULL) { + if (!EXT2_HAS_RO_COMPAT_FEATURE(sb, + EXT2_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT2_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT2_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + spin_lock(&EXT2_SB(sb)->s_lock); + ext2_update_dynamic_rev(sb); + EXT2_SET_RO_COMPAT_FEATURE(sb, + EXT2_FEATURE_RO_COMPAT_LARGE_FILE); + spin_unlock(&EXT2_SB(sb)->s_lock); + ext2_write_super(sb); + } + } + } + + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (n = 0; n < EXT2_N_BLOCKS; n++) + raw_inode->i_block[n] = ei->i_data[n]; + mark_buffer_dirty(bh); + if (do_sync) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing ext2 inode [%s:%08lx]\n", + sb->s_id, (unsigned long) ino); + err = -EIO; + } + } + ei->i_state &= ~EXT2_STATE_NEW; + brelse (bh); + return err; +} + +int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int ext2_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || + (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { + error = dquot_transfer(inode, iattr); + if (error) + return error; + } + if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { + error = ext2_setsize(inode, iattr->ia_size); + if (error) + return error; + } + setattr_copy(inode, iattr); + if (iattr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(inode, inode->i_mode); + mark_inode_dirty(inode); + + return error; +} diff --git a/kernel/fs/ext2/ioctl.c b/kernel/fs/ext2/ioctl.c new file mode 100644 index 000000000..5d46c0986 --- /dev/null +++ b/kernel/fs/ext2/ioctl.c @@ -0,0 +1,188 @@ +/* + * linux/fs/ext2/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include "ext2.h" +#include +#include +#include +#include +#include +#include +#include + + +long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct ext2_inode_info *ei = EXT2_I(inode); + unsigned int flags; + unsigned short rsv_window_size; + int ret; + + ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT2_IOC_GETFLAGS: + ext2_get_inode_flags(ei); + flags = ei->i_flags & EXT2_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT2_IOC_SETFLAGS: { + unsigned int oldflags; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto setflags_out; + } + + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto setflags_out; + } + + flags = ext2_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; + goto setflags_out; + } + oldflags = ei->i_flags; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; + goto setflags_out; + } + } + + flags = flags & EXT2_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; + ei->i_flags = flags; + + ext2_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mutex_unlock(&inode->i_mutex); + + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write_file(filp); + return ret; + } + case EXT2_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *) arg); + case EXT2_IOC_SETVERSION: { + __u32 generation; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (get_user(generation, (int __user *) arg)) { + ret = -EFAULT; + goto setversion_out; + } + + mutex_lock(&inode->i_mutex); + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + mutex_unlock(&inode->i_mutex); + + mark_inode_dirty(inode); +setversion_out: + mnt_drop_write_file(filp); + return ret; + } + case EXT2_IOC_GETRSVSZ: + if (test_opt(inode->i_sb, RESERVATION) + && S_ISREG(inode->i_mode) + && ei->i_block_alloc_info) { + rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; + return put_user(rsv_window_size, (int __user *)arg); + } + return -ENOTTY; + case EXT2_IOC_SETRSVSZ: { + + if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) + return -ENOTTY; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(rsv_window_size, (int __user *)arg)) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT2_MAX_RESERVE_BLOCKS; + + /* + * need to allocate reservation structure for this inode + * before set the window size + */ + /* + * XXX What lock should protect the rsv_goal_size? + * Accessed in ext2_get_block only. ext3 uses i_truncate. + */ + mutex_lock(&ei->truncate_mutex); + if (!ei->i_block_alloc_info) + ext2_init_block_alloc_info(inode); + + if (ei->i_block_alloc_info){ + struct ext2_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; + rsv->rsv_goal_size = rsv_window_size; + } + mutex_unlock(&ei->truncate_mutex); + mnt_drop_write_file(filp); + return 0; + } + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext2_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT2_IOC32_GETFLAGS: + cmd = EXT2_IOC_GETFLAGS; + break; + case EXT2_IOC32_SETFLAGS: + cmd = EXT2_IOC_SETFLAGS; + break; + case EXT2_IOC32_GETVERSION: + cmd = EXT2_IOC_GETVERSION; + break; + case EXT2_IOC32_SETVERSION: + cmd = EXT2_IOC_SETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + return ext2_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/kernel/fs/ext2/namei.c b/kernel/fs/ext2/namei.c new file mode 100644 index 000000000..3e074a9cc --- /dev/null +++ b/kernel/fs/ext2/namei.c @@ -0,0 +1,431 @@ +/* + * linux/fs/ext2/namei.c + * + * Rewrite to pagecache. Almost all code had been changed, so blame me + * if the things go wrong. Please, send bug reports to + * viro@parcelfarce.linux.theplanet.co.uk + * + * Stuff here is basically a glue between the VFS and generic UNIXish + * filesystem that keeps everything in pagecache. All knowledge of the + * directory layout is in fs/ext2/dir.c - it turned out to be easily separatable + * and it's easier to debug that way. In principle we might want to + * generalize that a bit and turn it into a library. Or not. + * + * The only non-static object here is ext2_dir_inode_operations. + * + * TODO: get rid of kmap() use, add readahead. + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = ext2_add_link(dentry, inode); + if (!err) { + unlock_new_inode(inode); + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. + */ + +static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) +{ + struct inode * inode; + ino_t ino; + + if (dentry->d_name.len > EXT2_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = ext2_inode_by_name(dir, &dentry->d_name); + inode = NULL; + if (ino) { + inode = ext2_iget(dir->i_sb, ino); + if (inode == ERR_PTR(-ESTALE)) { + ext2_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + (unsigned long) ino); + return ERR_PTR(-EIO); + } + } + return d_splice_alias(inode, dentry); +} + +struct dentry *ext2_get_parent(struct dentry *child) +{ + struct qstr dotdot = QSTR_INIT("..", 2); + unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino)); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) +{ + struct inode *inode; + + dquot_initialize(dir); + + inode = ext2_new_inode(dir, mode, &dentry->d_name); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + mark_inode_dirty(inode); + return ext2_add_nondir(dentry, inode); +} + +static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode = ext2_new_inode(dir, mode, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; +} + +static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct inode * inode; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + + inode = ext2_new_inode (dir, mode, &dentry->d_name); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT2_FS_XATTR + inode->i_op = &ext2_special_inode_operations; +#endif + mark_inode_dirty(inode); + err = ext2_add_nondir(dentry, inode); + } + return err; +} + +static int ext2_symlink (struct inode * dir, struct dentry * dentry, + const char * symname) +{ + struct super_block * sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode * inode; + + if (l > sb->s_blocksize) + goto out; + + dquot_initialize(dir); + + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + if (l > sizeof (EXT2_I(inode)->i_data)) { + /* slow symlink */ + inode->i_op = &ext2_symlink_inode_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &ext2_fast_symlink_inode_operations; + memcpy((char*)(EXT2_I(inode)->i_data),symname,l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = ext2_add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput (inode); + goto out; +} + +static int ext2_link (struct dentry * old_dentry, struct inode * dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + int err; + + dquot_initialize(dir); + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + err = ext2_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +{ + struct inode * inode; + int err; + + dquot_initialize(dir); + + inode_inc_link_count(dir); + + inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &ext2_dir_inode_operations; + inode->i_fop = &ext2_dir_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + + inode_inc_link_count(inode); + + err = ext2_make_empty(inode, dir); + if (err) + goto out_fail; + + err = ext2_add_link(dentry, inode); + if (err) + goto out_fail; + + unlock_new_inode(inode); + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int ext2_unlink(struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = d_inode(dentry); + struct ext2_dir_entry_2 * de; + struct page * page; + int err = -ENOENT; + + dquot_initialize(dir); + + de = ext2_find_entry (dir, &dentry->d_name, &page); + if (!de) + goto out; + + err = ext2_delete_entry (de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int ext2_rmdir (struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = d_inode(dentry); + int err = -ENOTEMPTY; + + if (ext2_empty_dir(inode)) { + err = ext2_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, + struct inode * new_dir, struct dentry * new_dentry ) +{ + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); + struct page * dir_page = NULL; + struct ext2_dir_entry_2 * dir_de = NULL; + struct page * old_page; + struct ext2_dir_entry_2 * old_de; + int err = -ENOENT; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = ext2_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct ext2_dir_entry_2 *new_de; + + err = -ENOTEMPTY; + if (dir_de && !ext2_empty_dir (new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + ext2_set_link(new_dir, new_de, new_page, old_inode, 1); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + err = ext2_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); + + ext2_delete_entry (old_de, old_page); + + if (dir_de) { + if (old_dir != new_dir) + ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } + inode_dec_link_count(old_dir); + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations ext2_dir_inode_operations = { + .create = ext2_create, + .lookup = ext2_lookup, + .link = ext2_link, + .unlink = ext2_unlink, + .symlink = ext2_symlink, + .mkdir = ext2_mkdir, + .rmdir = ext2_rmdir, + .mknod = ext2_mknod, + .rename = ext2_rename, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, + .tmpfile = ext2_tmpfile, +}; + +const struct inode_operations ext2_special_inode_operations = { +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, +}; diff --git a/kernel/fs/ext2/super.c b/kernel/fs/ext2/super.c new file mode 100644 index 000000000..d0e746e96 --- /dev/null +++ b/kernel/fs/ext2/super.c @@ -0,0 +1,1580 @@ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +static void ext2_sync_super(struct super_block *sb, + struct ext2_super_block *es, int wait); +static int ext2_remount (struct super_block * sb, int * flags, char * data); +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext2_sync_fs(struct super_block *sb, int wait); +static int ext2_freeze(struct super_block *sb); +static int ext2_unfreeze(struct super_block *sb); + +void ext2_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + + if (!(sb->s_flags & MS_RDONLY)) { + spin_lock(&sbi->s_lock); + sbi->s_mount_state |= EXT2_ERROR_FS; + es->s_state |= cpu_to_le16(EXT2_ERROR_FS); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT2-fs: panic from previous error\n"); + if (test_opt(sb, ERRORS_RO)) { + ext2_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } +} + +void ext2_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + + va_end(args); +} + +/* + * This must be called with sbi->s_lock held. + */ +void ext2_update_dynamic_rev(struct super_block *sb) +{ + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV) + return; + + ext2_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT2_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT2_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT2_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +static void ext2_put_super (struct super_block * sb) +{ + int db_count; + int i; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + ext2_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = sbi->s_es; + + spin_lock(&sbi->s_lock); + es->s_state = cpu_to_le16(sbi->s_mount_state); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); + } + db_count = sbi->s_gdb_count; + for (i = 0; i < db_count; i++) + if (sbi->s_group_desc[i]) + brelse (sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + kfree(sbi->s_debts); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + brelse (sbi->s_sbh); + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} + +static struct kmem_cache * ext2_inode_cachep; + +static struct inode *ext2_alloc_inode(struct super_block *sb) +{ + struct ext2_inode_info *ei; + ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; +#ifdef CONFIG_QUOTA + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + + return &ei->vfs_inode; +} + +static void ext2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); +} + +static void ext2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ext2_i_callback); +} + +static void init_once(void *foo) +{ + struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; + + rwlock_init(&ei->i_meta_lock); +#ifdef CONFIG_EXT2_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + mutex_init(&ei->truncate_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", + sizeof(struct ext2_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext2_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ext2_inode_cachep); +} + +static int ext2_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + unsigned long def_mount_opts; + + spin_lock(&sbi->s_lock); + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%lu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & EXT2_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT2_DEF_RESUID)) || + le16_to_cpu(es->s_def_resuid) != EXT2_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); + } + if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT2_DEF_RESGID)) || + le16_to_cpu(es->s_def_resgid) != EXT2_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); + } + if (test_opt(sb, ERRORS_RO)) { + int def_errors = le16_to_cpu(es->s_errors); + + if (def_errors == EXT2_ERRORS_PANIC || + def_errors == EXT2_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG)) + seq_puts(seq, ",debug"); + if (test_opt(sb, OLDALLOC)) + seq_puts(seq, ",oldalloc"); + +#ifdef CONFIG_EXT2_FS_XATTR + if (test_opt(sb, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER) && + (def_mount_opts & EXT2_DEFM_XATTR_USER)) { + seq_puts(seq, ",nouser_xattr"); + } +#endif + +#ifdef CONFIG_EXT2_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT2_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + + if (test_opt(sb, NOBH)) + seq_puts(seq, ",nobh"); + +#if defined(CONFIG_QUOTA) + if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif + +#ifdef CONFIG_FS_DAX + if (sbi->s_mount_opt & EXT2_MOUNT_XIP) + seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) + seq_puts(seq, ",dax"); +#endif + + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); + + spin_unlock(&sbi->s_lock); + return 0; +} + +#ifdef CONFIG_QUOTA +static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); +static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +static struct dquot **ext2_get_dquots(struct inode *inode) +{ + return EXT2_I(inode)->i_dquot; +} +#endif + +static const struct super_operations ext2_sops = { + .alloc_inode = ext2_alloc_inode, + .destroy_inode = ext2_destroy_inode, + .write_inode = ext2_write_inode, + .evict_inode = ext2_evict_inode, + .put_super = ext2_put_super, + .sync_fs = ext2_sync_fs, + .freeze_fs = ext2_freeze, + .unfreeze_fs = ext2_unfreeze, + .statfs = ext2_statfs, + .remount_fs = ext2_remount, + .show_options = ext2_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext2_quota_read, + .quota_write = ext2_quota_write, + .get_dquots = ext2_get_dquots, +#endif +}; + +static struct inode *ext2_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT2_FIRST_INO(sb) && ino != EXT2_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* + * ext2_iget isn't quite right if the inode is currently unallocated! + * However ext2_iget currently does appropriate checks to handle stale + * inodes so everything is OK. + */ + inode = ext2_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *ext2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext2_nfs_get_inode); +} + +static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext2_nfs_get_inode); +} + +static const struct export_operations ext2_export_ops = { + .fh_to_dentry = ext2_fh_to_dentry, + .fh_to_parent = ext2_fh_to_parent, + .get_parent = ext2_get_parent, +}; + +static unsigned long get_sb_block(void **data) +{ + unsigned long sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk("EXT2-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, + Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, + Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_nocheck, "check=none"}, + {Opt_nocheck, "nocheck"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_nobh, "nobh"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_xip, "xip"}, + {Opt_dax, "dax"}, + {Opt_grpquota, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb) +{ + char *p; + struct ext2_sb_info *sbi = EXT2_SB(sb); + substring_t args[MAX_OPT_ARGS]; + int option; + kuid_t uid; + kgid_t gid; + + if (!options) + return 1; + + while ((p = strsep (&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option); + return 0; + + } + sbi->s_resuid = uid; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option); + return 0; + } + sbi->s_resgid = gid; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; + case Opt_debug: + set_opt (sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + set_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_orlov: + clear_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_nobh: + set_opt (sbi->s_mount_opt, NOBH); + break; +#ifdef CONFIG_EXT2_FS_XATTR + case Opt_user_xattr: + set_opt (sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt (sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + ext2_msg(sb, KERN_INFO, "(no)user_xattr options" + "not supported"); + break; +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + ext2_msg(sb, KERN_INFO, + "(no)acl options not supported"); + break; +#endif + case Opt_xip: + ext2_msg(sb, KERN_INFO, "use dax instead of xip"); + set_opt(sbi->s_mount_opt, XIP); + /* Fall through */ + case Opt_dax: +#ifdef CONFIG_FS_DAX + set_opt(sbi->s_mount_opt, DAX); +#else + ext2_msg(sb, KERN_INFO, "dax option not supported"); +#endif + break; + +#if defined(CONFIG_QUOTA) + case Opt_quota: + case Opt_usrquota: + set_opt(sbi->s_mount_opt, USRQUOTA); + break; + + case Opt_grpquota: + set_opt(sbi->s_mount_opt, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext2_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif + + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + ext2_msg(sb, KERN_INFO, "reservations ON"); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + ext2_msg(sb, KERN_INFO, "reservations OFF"); + break; + case Opt_ignore: + break; + default: + return 0; + } + } + return 1; +} + +static int ext2_setup_super (struct super_block * sb, + struct ext2_super_block * es, + int read_only) +{ + int res = 0; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { + ext2_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT2_VALID_FS)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT2_ERROR_FS)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + ext2_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext2_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); + if (!le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + if (test_opt (sb, DEBUG)) + ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", + EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize, + sbi->s_frag_size, + sbi->s_groups_count, + EXT2_BLOCKS_PER_GROUP(sb), + EXT2_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + return res; +} + +static int ext2_check_descriptors(struct super_block *sb) +{ + int i; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + ext2_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL); + ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i); + ext2_fsblk_t last_block; + + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 > + last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + } + return 1; +} + +/* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. + */ +static loff_t ext2_max_size(int bits) +{ + loff_t res = EXT2_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + + /* This is calculated to be the largest file size for a + * dense, file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^32 -1 + * __u32 i_blocks representing the total number of + * 512 bytes blocks of the file + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* tripple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static unsigned long descriptor_loc(struct super_block *sb, + unsigned long logic_sb_block, + int nr) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + unsigned long bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return (logic_sb_block + nr + 1); + bg = sbi->s_desc_per_block * nr; + if (ext2_bg_has_super(sb, bg)) + has_super = 1; + + return ext2_group_first_block_no(sb, bg) + has_super; +} + +static int ext2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head * bh; + struct ext2_sb_info * sbi; + struct ext2_super_block * es; + struct inode *root; + unsigned long block; + unsigned long sb_block = get_sb_block(&data); + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long def_mount_opts; + long ret = -EINVAL; + int blocksize = BLOCK_SIZE; + int db_count; + int i, j; + __le32 features; + int err; + + err = -ENOMEM; + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto failed; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + goto failed; + } + sb->s_fs_info = sbi; + sbi->s_sb_block = sb_block; + + spin_lock_init(&sbi->s_lock); + + /* + * See what the current blocksize for the device is, and + * use that as the blocksize. Otherwise (or if the blocksize + * is smaller than the default) use the default. + * This is important for devices that have a hardware + * sectorsize that is larger than the default. + */ + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + ext2_msg(sb, KERN_ERR, "error: unable to set blocksize"); + goto failed_sbi; + } + + /* + * If the superblock doesn't start on a hardware sector boundary, + * calculate the offset. + */ + if (blocksize != BLOCK_SIZE) { + logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize; + offset = (sb_block*BLOCK_SIZE) % blocksize; + } else { + logic_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logic_sb_block))) { + ext2_msg(sb, KERN_ERR, "error: unable to read superblock"); + goto failed_sbi; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext2 macro-instructions depend on its value + */ + es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + + if (sb->s_magic != EXT2_SUPER_MAGIC) + goto cantfind_ext2; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT2_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT2_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT2_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT2_FS_XATTR + if (def_mount_opts & EXT2_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL + if (def_mount_opts & EXT2_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + + set_opt(sbi->s_mount_opt, RESERVATION); + + if (!parse_options((char *) data, sb)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && + (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext2_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP); + if (features) { + ext2_msg(sb, KERN_ERR, "error: couldn't mount because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); + goto failed_mount; + } + if (!(sb->s_flags & MS_RDONLY) && + (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ + ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); + goto failed_mount; + } + + blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { + ext2_msg(sb, KERN_ERR, + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext2_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } + } + + /* If the blocksize doesn't match, re-read the thing.. */ + if (sb->s_blocksize != blocksize) { + brelse(bh); + + if (!sb_set_blocksize(sb, blocksize)) { + ext2_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); + goto failed_sbi; + } + + logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize; + offset = (sb_block*BLOCK_SIZE) % blocksize; + bh = sb_bread(sb, logic_sb_block); + if(!bh) { + ext2_msg(sb, KERN_ERR, "error: couldn't read" + "superblock on 2nd try"); + goto failed_sbi; + } + es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) { + ext2_msg(sb, KERN_ERR, "error: magic mismatch"); + goto failed_mount; + } + } + + sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits); + sb->s_max_links = EXT2_LINK_MAX; + + if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) { + sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT2_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) || + !is_power_of_2(sbi->s_inode_size) || + (sbi->s_inode_size > blocksize)) { + ext2_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + } + + sbi->s_frag_size = EXT2_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (sbi->s_frag_size == 0) + goto cantfind_ext2; + sbi->s_frags_per_block = sb->s_blocksize / sbi->s_frag_size; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + + if (EXT2_INODE_SIZE(sb) == 0) + goto cantfind_ext2; + sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) + goto cantfind_ext2; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = sb->s_blocksize / + sizeof (struct ext2_group_desc); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = + ilog2 (EXT2_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = + ilog2 (EXT2_DESC_PER_BLOCK(sb)); + + if (sb->s_magic != EXT2_SUPER_MAGIC) + goto cantfind_ext2; + + if (sb->s_blocksize != bh->b_size) { + if (!silent) + ext2_msg(sb, KERN_ERR, "error: unsupported blocksize"); + goto failed_mount; + } + + if (sb->s_blocksize != sbi->s_frag_size) { + ext2_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %lu" + "(not supported yet)", + sbi->s_frag_size, sb->s_blocksize); + goto failed_mount; + } + + if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + if (EXT2_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext2; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / + EXT2_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext2_msg(sb, KERN_ERR, "error: not enough memory"); + goto failed_mount; + } + bgl_lock_init(sbi->s_blockgroup_lock); + sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); + if (!sbi->s_debts) { + ext2_msg(sb, KERN_ERR, "error: not enough memory"); + goto failed_mount_group_desc; + } + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + for (j = 0; j < i; j++) + brelse (sbi->s_group_desc[j]); + ext2_msg(sb, KERN_ERR, + "error: unable to read group descriptors"); + goto failed_mount_group_desc; + } + } + if (!ext2_check_descriptors (sb)) { + ext2_msg(sb, KERN_ERR, "group descriptors corrupted"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + sbi->s_rsv_window_root = RB_ROOT; + /* + * Add a single, static dummy reservation to the start of the + * reservation window list --- it gives us a placeholder for + * append-at-start-of-list which makes the allocation logic + * _much_ simpler. + */ + sbi->s_rsv_window_head.rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + sbi->s_rsv_window_head.rsv_goal_size = 0; + ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); + + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext2_count_free_blocks(sb), GFP_KERNEL); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext2_count_free_inodes(sb), GFP_KERNEL); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext2_count_dirs(sb), GFP_KERNEL); + } + if (err) { + ext2_msg(sb, KERN_ERR, "error: insufficient memory"); + goto failed_mount3; + } + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext2_sops; + sb->s_export_op = &ext2_export_ops; + sb->s_xattr = ext2_xattr_handlers; + +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + + root = ext2_iget(sb, EXT2_ROOT_INO); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto failed_mount3; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); + goto failed_mount3; + } + + sb->s_root = d_make_root(root); + if (!sb->s_root) { + ext2_msg(sb, KERN_ERR, "error: get root inode failed"); + ret = -ENOMEM; + goto failed_mount3; + } + if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting ext3 filesystem as ext2"); + if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + ext2_write_super(sb); + return 0; + +cantfind_ext2: + if (!silent) + ext2_msg(sb, KERN_ERR, + "error: can't find an ext2 filesystem on dev %s.", + sb->s_id); + goto failed_mount; +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +failed_mount_group_desc: + kfree(sbi->s_group_desc); + kfree(sbi->s_debts); +failed_mount: + brelse(bh); +failed_sbi: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +failed: + return ret; +} + +static void ext2_clear_super_error(struct super_block *sb) +{ + struct buffer_head *sbh = EXT2_SB(sb)->s_sbh; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext2_msg(sb, KERN_ERR, + "previous I/O error to superblock detected\n"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } +} + +static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait) +{ + ext2_clear_super_error(sb); + spin_lock(&EXT2_SB(sb)->s_lock); + es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); + es->s_wtime = cpu_to_le32(get_seconds()); + /* unlock before we do IO */ + spin_unlock(&EXT2_SB(sb)->s_lock); + mark_buffer_dirty(EXT2_SB(sb)->s_sbh); + if (wait) + sync_dirty_buffer(EXT2_SB(sb)->s_sbh); +} + +/* + * In the second extended file system, it is not necessary to + * write the super block since we use a mapping of the + * disk super block in a buffer. + * + * However, this function is still used to set the fs valid + * flags to 0. We need to set this flag to 0 since the fs + * may have been checked while mounted and e2fsck may have + * set s_state to EXT2_VALID_FS after some corrections. + */ +static int ext2_sync_fs(struct super_block *sb, int wait) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + + /* + * Write quota structures to quota file, sync_blockdev() will write + * them to disk later + */ + dquot_writeback_dquots(sb, -1); + + spin_lock(&sbi->s_lock); + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { + ext2_debug("setting valid to 0\n"); + es->s_state &= cpu_to_le16(~EXT2_VALID_FS); + } + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, wait); + return 0; +} + +static int ext2_freeze(struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + /* + * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared + * because we have unattached inodes and thus filesystem is not fully + * consistent. + */ + if (atomic_long_read(&sb->s_remove_count)) { + ext2_sync_fs(sb, 1); + return 0; + } + /* Set EXT2_FS_VALID flag */ + spin_lock(&sbi->s_lock); + sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, sbi->s_es, 1); + + return 0; +} + +static int ext2_unfreeze(struct super_block *sb) +{ + /* Just write sb to clear EXT2_VALID_FS flag */ + ext2_write_super(sb); + + return 0; +} + +void ext2_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + ext2_sync_fs(sb, 1); +} + +static int ext2_remount (struct super_block * sb, int * flags, char * data) +{ + struct ext2_sb_info * sbi = EXT2_SB(sb); + struct ext2_super_block * es; + struct ext2_mount_options old_opts; + unsigned long old_sb_flags; + int err; + + sync_filesystem(sb); + spin_lock(&sbi->s_lock); + + /* Store the old options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb)) { + err = -EINVAL; + goto restore_opts; + } + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) { + ext2_msg(sb, KERN_WARNING, "warning: refusing change of " + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT2_MOUNT_DAX; + } + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + spin_unlock(&sbi->s_lock); + return 0; + } + if (*flags & MS_RDONLY) { + if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || + !(sbi->s_mount_state & EXT2_VALID_FS)) { + spin_unlock(&sbi->s_lock); + return 0; + } + + /* + * OK, we are remounting a valid rw partition rdonly, so set + * the rdonly flag and then mark the partition as valid again. + */ + es->s_state = cpu_to_le16(sbi->s_mount_state); + es->s_mtime = cpu_to_le32(get_seconds()); + spin_unlock(&sbi->s_lock); + + err = dquot_suspend(sb, -1); + if (err < 0) { + spin_lock(&sbi->s_lock); + goto restore_opts; + } + + ext2_sync_super(sb, es, 1); + } else { + __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, + ~EXT2_FEATURE_RO_COMPAT_SUPP); + if (ret) { + ext2_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR because of " + "unsupported optional features (%x).", + le32_to_cpu(ret)); + err = -EROFS; + goto restore_opts; + } + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by e2fsck since we originally mounted the partition.) + */ + sbi->s_mount_state = le16_to_cpu(es->s_state); + if (!ext2_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + spin_unlock(&sbi->s_lock); + + ext2_write_super(sb); + + dquot_resume(sb, -1); + } + + return 0; +restore_opts: + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sb->s_flags = old_sb_flags; + spin_unlock(&sbi->s_lock); + return err; +} + +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + u64 fsid; + + spin_lock(&sbi->s_lock); + + if (test_opt (sb, MINIX_DF)) + sbi->s_overhead_last = 0; + else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long i, overhead = 0; + smp_rmb(); + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < sbi->s_groups_count; i++) + overhead += ext2_bg_has_super(sb, i) + + ext2_bg_num_gdb(sb, i); + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += (sbi->s_groups_count * + (2 + sbi->s_itb_per_group)); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); + } + + buf->f_type = EXT2_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; + buf->f_bfree = ext2_count_free_blocks(sb); + es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = ext2_count_free_inodes(sb); + es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); + buf->f_namelen = EXT2_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + spin_unlock(&sbi->s_lock); + return 0; +} + +static struct dentry *ext2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super); +} + +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT2_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = sb->s_blocksize; + err = ext2_get_block(inode, blk, &tmp_bh, 0); + if (err < 0) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t ext2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT2_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + tmp_bh.b_size = sb->s_blocksize; + err = ext2_get_block(inode, blk, &tmp_bh, 1); + if (err < 0) + goto out; + if (offset || tocopy != EXT2_BLOCK_SIZE(sb)) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (unlikely(!bh)) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) + return err; + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + return len - towrite; +} + +#endif + +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext2_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext2"); + +static int __init init_ext2_fs(void) +{ + int err = init_ext2_xattr(); + if (err) + return err; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext2_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + exit_ext2_xattr(); + return err; +} + +static void __exit exit_ext2_fs(void) +{ + unregister_filesystem(&ext2_fs_type); + destroy_inodecache(); + exit_ext2_xattr(); +} + +MODULE_AUTHOR("Remy Card and others"); +MODULE_DESCRIPTION("Second Extended Filesystem"); +MODULE_LICENSE("GPL"); +module_init(init_ext2_fs) +module_exit(exit_ext2_fs) diff --git a/kernel/fs/ext2/symlink.c b/kernel/fs/ext2/symlink.c new file mode 100644 index 000000000..20608f17c --- /dev/null +++ b/kernel/fs/ext2/symlink.c @@ -0,0 +1,54 @@ +/* + * linux/fs/ext2/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 symlink handling code + */ + +#include "ext2.h" +#include "xattr.h" +#include + +static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext2_inode_info *ei = EXT2_I(d_inode(dentry)); + nd_set_link(nd, (char *)ei->i_data); + return NULL; +} + +const struct inode_operations ext2_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext2_setattr, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext2_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext2_follow_link, + .setattr = ext2_setattr, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/kernel/fs/ext2/xattr.c b/kernel/fs/ext2/xattr.c new file mode 100644 index 000000000..0b6bfd3a3 --- /dev/null +++ b/kernel/fs/ext2/xattr.c @@ -0,0 +1,1029 @@ +/* + * linux/fs/ext2/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher + * + * Fix by Harrison Xing . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * + */ + +/* + * Extended attributes are stored on disk blocks allocated outside of + * any inode. The i_file_acl field is then made to point to this allocated + * block. If all extended attributes of an inode are identical, these + * inodes may share the same extended attribute block. Such situations + * are automatically detected by keeping a cache of recent attribute block + * numbers and hashes over the block's contents in memory. + * + * + * Extended attribute block layout: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The block header is followed by multiple entry descriptors. These entry + * descriptors are variable in size, and aligned to EXT2_XATTR_PAD + * byte boundaries. The entry descriptors are sorted by attribute name, + * so that two extended attribute blocks can be compared efficiently. + * + * Attribute values are aligned to the end of the block, stored in + * no specific order. They are also padded to EXT2_XATTR_PAD byte + * boundaries. No additional gaps are left between them. + * + * Locking strategy + * ---------------- + * EXT2_I(inode)->i_file_acl is protected by EXT2_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count will change. Multiple writers to an EA block are synchronized + * by the bh lock. No more than a single bh lock is held at any time + * to avoid deadlocks. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr)) +#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#ifdef EXT2_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%ld: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static int ext2_xattr_set2(struct inode *, struct buffer_head *, + struct ext2_xattr_header *); + +static int ext2_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext2_xattr_cache_find(struct inode *, + struct ext2_xattr_header *); +static void ext2_xattr_rehash(struct ext2_xattr_header *, + struct ext2_xattr_entry *); + +static struct mb_cache *ext2_xattr_cache; + +static const struct xattr_handler *ext2_xattr_handler_map[] = { + [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, +#ifdef CONFIG_EXT2_FS_POSIX_ACL + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, +#endif + [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, +#ifdef CONFIG_EXT2_FS_SECURITY + [EXT2_XATTR_INDEX_SECURITY] = &ext2_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext2_xattr_handlers[] = { + &ext2_xattr_user_handler, + &ext2_xattr_trusted_handler, +#ifdef CONFIG_EXT2_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif +#ifdef CONFIG_EXT2_FS_SECURITY + &ext2_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext2_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) + handler = ext2_xattr_handler_map[name_index]; + return handler; +} + +/* + * ext2_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext2_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext2_xattr_entry *entry; + size_t name_len, size; + char *end; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + if (name_len > 255) + return -ERANGE; + + down_read(&EXT2_I(inode)->xattr_sem); + error = -ENODATA; + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); + end = bh->b_data + bh->b_size; + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + + /* find named attribute */ + entry = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(entry)) { + struct ext2_xattr_entry *next = + EXT2_XATTR_NEXT(entry); + if ((char *)next >= end) + goto bad_block; + if (name_index == entry->e_name_index && + name_len == entry->e_name_len && + memcmp(name, entry->e_name, name_len) == 0) + goto found; + entry = next; + } + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + error = -ENODATA; + goto cleanup; +found: + /* check the buffer size */ + if (entry->e_value_block != 0) + goto bad_block; + size = le32_to_cpu(entry->e_value_size); + if (size > inode->i_sb->s_blocksize || + le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) + goto bad_block; + + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + /* return value of attribute */ + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + up_read(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * ext2_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +static int +ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct buffer_head *bh = NULL; + struct ext2_xattr_entry *entry; + char *end; + size_t rest = buffer_size; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + down_read(&EXT2_I(inode)->xattr_sem); + error = 0; + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); + end = bh->b_data + bh->b_size; + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + + /* check the on-disk data structure */ + entry = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(entry)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(entry); + + if ((char *)next >= end) + goto bad_block; + entry = next; + } + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + + /* list the attribute names */ + for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); + entry = EXT2_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext2_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + buffer += size; + } + rest -= size; + } + } + error = buffer_size - rest; /* total size */ + +cleanup: + brelse(bh); + up_read(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * Inode operation listxattr() + * + * d_inode(dentry)->i_mutex: don't care + */ +ssize_t +ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext2_xattr_list(dentry, buffer, size); +} + +/* + * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext2_xattr_update_super_block(struct super_block *sb) +{ + if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) + return; + + spin_lock(&EXT2_SB(sb)->s_lock); + EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); + spin_unlock(&EXT2_SB(sb)->s_lock); + mark_buffer_dirty(EXT2_SB(sb)->s_sbh); +} + +/* + * ext2_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext2_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; + struct ext2_xattr_header *header = NULL; + struct ext2_xattr_entry *here, *last; + size_t name_len, free, min_offs = sb->s_blocksize; + int not_found = 1, error; + char *end; + + /* + * header -- Points either into bh, or to a temporarily + * allocated buffer. + * here -- The named entry found, or the place for inserting, within + * the block pointed to by header. + * last -- Points right after the last named entry within the block + * pointed to by header. + * min_offs -- The offset of the first value (values are aligned + * towards the end of the block). + * end -- Points right after the block pointed to by header. + */ + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + name_index, name, value, (long)value_len); + + if (value == NULL) + value_len = 0; + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + if (name_len > 255 || value_len > sb->s_blocksize) + return -ERANGE; + down_write(&EXT2_I(inode)->xattr_sem); + if (EXT2_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bh = sb_bread(sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), + le32_to_cpu(HDR(bh)->h_refcount)); + header = HDR(bh); + end = bh->b_data + bh->b_size; + if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + header->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(sb, "ext2_xattr_set", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + here = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(here)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); + if ((char *)next >= end) + goto bad_block; + if (!here->e_value_block && here->e_value_size) { + size_t offs = le16_to_cpu(here->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + not_found = name_index - here->e_name_index; + if (!not_found) + not_found = name_len - here->e_name_len; + if (!not_found) + not_found = memcmp(name, here->e_name,name_len); + if (not_found <= 0) + break; + here = next; + } + last = here; + /* We still need to compute min_offs and last. */ + while (!IS_LAST_ENTRY(last)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); + if ((char *)next >= end) + goto bad_block; + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + last = next; + } + + /* Check whether we have enough space left. */ + free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); + } else { + /* We will use a new extended attribute block. */ + free = sb->s_blocksize - + sizeof(struct ext2_xattr_header) - sizeof(__u32); + here = last = NULL; /* avoid gcc uninitialized warning. */ + } + + if (not_found) { + /* Request to remove a nonexistent attribute? */ + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (value == NULL) + goto cleanup; + } else { + /* Request to create an existing attribute? */ + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + if (!here->e_value_block && here->e_value_size) { + size_t size = le32_to_cpu(here->e_value_size); + + if (le16_to_cpu(here->e_value_offs) + size > + sb->s_blocksize || size > sb->s_blocksize) + goto bad_block; + free += EXT2_XATTR_SIZE(size); + } + free += EXT2_XATTR_LEN(name_len); + } + error = -ENOSPC; + if (free < EXT2_XATTR_LEN(name_len) + EXT2_XATTR_SIZE(value_len)) + goto cleanup; + + /* Here we know that we can set the new attribute. */ + + if (header) { + struct mb_cache_entry *ce; + + /* assert(header == HDR(bh)); */ + ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, + bh->b_blocknr); + lock_buffer(bh); + if (header->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "modifying in-place"); + if (ce) + mb_cache_entry_free(ce); + /* keep the buffer locked while modifying it. */ + } else { + int offset; + + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); + ea_bdebug(bh, "cloning"); + header = kmalloc(bh->b_size, GFP_KERNEL); + error = -ENOMEM; + if (header == NULL) + goto cleanup; + memcpy(header, HDR(bh), bh->b_size); + header->h_refcount = cpu_to_le32(1); + + offset = (char *)here - bh->b_data; + here = ENTRY((char *)header + offset); + offset = (char *)last - bh->b_data; + last = ENTRY((char *)header + offset); + } + } else { + /* Allocate a buffer where we construct the new block. */ + header = kzalloc(sb->s_blocksize, GFP_KERNEL); + error = -ENOMEM; + if (header == NULL) + goto cleanup; + end = (char *)header + sb->s_blocksize; + header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); + header->h_blocks = header->h_refcount = cpu_to_le32(1); + last = here = ENTRY(header+1); + } + + /* Iff we are modifying the block in-place, bh is locked here. */ + + if (not_found) { + /* Insert the new name. */ + size_t size = EXT2_XATTR_LEN(name_len); + size_t rest = (char *)last - (char *)here; + memmove((char *)here + size, here, rest); + memset(here, 0, size); + here->e_name_index = name_index; + here->e_name_len = name_len; + memcpy(here->e_name, name, name_len); + } else { + if (!here->e_value_block && here->e_value_size) { + char *first_val = (char *)header + min_offs; + size_t offs = le16_to_cpu(here->e_value_offs); + char *val = (char *)header + offs; + size_t size = EXT2_XATTR_SIZE( + le32_to_cpu(here->e_value_size)); + + if (size == EXT2_XATTR_SIZE(value_len)) { + /* The old and the new value have the same + size. Just replace. */ + here->e_value_size = cpu_to_le32(value_len); + memset(val + size - EXT2_XATTR_PAD, 0, + EXT2_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, value, value_len); + goto skip_replace; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = ENTRY(header+1); + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT2_XATTR_NEXT(last); + } + } + if (value == NULL) { + /* Remove the old name. */ + size_t size = EXT2_XATTR_LEN(name_len); + last = ENTRY((char *)last - size); + memmove(here, (char*)here + size, + (char*)last - (char*)here); + memset(last, 0, size); + } + } + + if (value != NULL) { + /* Insert the new value. */ + here->e_value_size = cpu_to_le32(value_len); + if (value_len) { + size_t size = EXT2_XATTR_SIZE(value_len); + char *val = (char *)header + min_offs - size; + here->e_value_offs = + cpu_to_le16((char *)val - (char *)header); + memset(val + size - EXT2_XATTR_PAD, 0, + EXT2_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, value, value_len); + } + } + +skip_replace: + if (IS_LAST_ENTRY(ENTRY(header+1))) { + /* This block is now empty. */ + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ + error = ext2_xattr_set2(inode, bh, NULL); + } else { + ext2_xattr_rehash(header, here); + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ + error = ext2_xattr_set2(inode, bh, header); + } + +cleanup: + brelse(bh); + if (!(bh && header == HDR(bh))) + kfree(header); + up_write(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * Second half of ext2_xattr_set(): Update the file system. + */ +static int +ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + struct ext2_xattr_header *header) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + int error; + + if (header) { + new_bh = ext2_xattr_cache_find(inode, header); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == old_bh) { + ea_bdebug(new_bh, "keeping this block"); + } else { + /* The old block is released after updating + the inode. */ + ea_bdebug(new_bh, "reusing block"); + + error = dquot_alloc_block(inode, 1); + if (error) { + unlock_buffer(new_bh); + goto cleanup; + } + le32_add_cpu(&HDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "refcount now=%d", + le32_to_cpu(HDR(new_bh)->h_refcount)); + } + unlock_buffer(new_bh); + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. No need to lock the block as we + don't need to change the reference count. */ + new_bh = old_bh; + get_bh(new_bh); + ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + ext2_fsblk_t goal = ext2_group_first_block_no(sb, + EXT2_I(inode)->i_block_group); + int block = ext2_new_block(inode, goal, &error); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (unlikely(!new_bh)) { + ext2_free_blocks(inode, block, 1); + mark_inode_dirty(inode); + error = -ENOMEM; + goto cleanup; + } + lock_buffer(new_bh); + memcpy(new_bh->b_data, header, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } + mark_buffer_dirty(new_bh); + if (IS_SYNC(inode)) { + sync_dirty_buffer(new_bh); + error = -EIO; + if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) + goto cleanup; + } + } + + /* Update the inode. */ + EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) { + error = sync_inode_metadata(inode, 1); + /* In case sync failed due to ENOSPC the inode was actually + * written (only some dirty data were not) so we just proceed + * as if nothing happened and cleanup the unused block */ + if (error && error != -ENOSPC) { + if (new_bh && new_bh != old_bh) { + dquot_free_block_nodirty(inode, 1); + mark_inode_dirty(inode); + } + goto cleanup; + } + } else + mark_inode_dirty(inode); + + error = 0; + if (old_bh && old_bh != new_bh) { + struct mb_cache_entry *ce; + + /* + * If there was an old block and we are no longer using it, + * release the old block. + */ + ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, + old_bh->b_blocknr); + lock_buffer(old_bh); + if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { + /* Free the old block. */ + if (ce) + mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); + ext2_free_blocks(inode, old_bh->b_blocknr, 1); + mark_inode_dirty(inode); + /* We let our caller release old_bh, so we + * need to duplicate the buffer before. */ + get_bh(old_bh); + bforget(old_bh); + } else { + /* Decrement the refcount only. */ + le32_add_cpu(&HDR(old_bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + dquot_free_block_nodirty(inode, 1); + mark_inode_dirty(inode); + mark_buffer_dirty(old_bh); + ea_bdebug(old_bh, "refcount now=%d", + le32_to_cpu(HDR(old_bh)->h_refcount)); + } + unlock_buffer(old_bh); + } + +cleanup: + brelse(new_bh); + + return error; +} + +/* + * ext2_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. + */ +void +ext2_xattr_delete_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + struct mb_cache_entry *ce; + + down_write(&EXT2_I(inode)->xattr_sem); + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + if (!bh) { + ext2_error(inode->i_sb, "ext2_xattr_delete_inode", + "inode %ld: block %d read error", inode->i_ino, + EXT2_I(inode)->i_file_acl); + goto cleanup; + } + ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { + ext2_error(inode->i_sb, "ext2_xattr_delete_inode", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + goto cleanup; + } + ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); + lock_buffer(bh); + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { + if (ce) + mb_cache_entry_free(ce); + ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); + get_bh(bh); + bforget(bh); + unlock_buffer(bh); + } else { + le32_add_cpu(&HDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + unlock_buffer(bh); + mark_buffer_dirty(bh); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); + dquot_free_block_nodirty(inode, 1); + } + EXT2_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); + up_write(&EXT2_I(inode)->xattr_sem); +} + +/* + * ext2_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext2_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + + +/* + * ext2_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static int +ext2_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); + if (!ce) + return -ENOMEM; + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache (%d cache entries)", + atomic_read(&ext2_xattr_cache->c_entry_count)); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, + atomic_read(&ext2_xattr_cache->c_entry_count)); + mb_cache_entry_release(ce); + } + return error; +} + +/* + * ext2_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. + */ +static int +ext2_xattr_cmp(struct ext2_xattr_header *header1, + struct ext2_xattr_header *header2) +{ + struct ext2_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT2_XATTR_NEXT(entry1); + entry2 = EXT2_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext2_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a locked buffer head to the block found, or NULL if such + * a block was not found or an error occurred. + */ +static struct buffer_head * +ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext2_error(inode->i_sb, "ext2_xattr_cache_find", + "inode %ld: block %ld read error", + inode->i_ino, (unsigned long) ce->e_block); + } else { + lock_buffer(bh); + if (le32_to_cpu(HDR(bh)->h_refcount) > + EXT2_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %ld refcount %d>%d", + (unsigned long) ce->e_block, + le32_to_cpu(HDR(bh)->h_refcount), + EXT2_XATTR_REFCOUNT_MAX); + } else if (!ext2_xattr_cmp(header, HDR(bh))) { + ea_bdebug(bh, "b_count=%d", + atomic_read(&(bh->b_count))); + mb_cache_entry_release(ce); + return bh; + } + unlock_buffer(bh); + brelse(bh); + } + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext2_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, + struct ext2_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n=0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext2_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext2_xattr_rehash(struct ext2_xattr_header *header, + struct ext2_xattr_entry *entry) +{ + struct ext2_xattr_entry *here; + __u32 hash = 0; + + ext2_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT2_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext2_xattr(void) +{ + ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); + if (!ext2_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext2_xattr(void) +{ + mb_cache_destroy(ext2_xattr_cache); +} diff --git a/kernel/fs/ext2/xattr.h b/kernel/fs/ext2/xattr.h new file mode 100644 index 000000000..60edf2986 --- /dev/null +++ b/kernel/fs/ext2/xattr.h @@ -0,0 +1,125 @@ +/* + File: linux/ext2_xattr.h + + On-disk format of extended attributes for the ext2 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include +#include + +/* Magic value in attribute blocks */ +#define EXT2_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT2_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT2_XATTR_INDEX_USER 1 +#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT2_XATTR_INDEX_TRUSTED 4 +#define EXT2_XATTR_INDEX_LUSTRE 5 +#define EXT2_XATTR_INDEX_SECURITY 6 + +struct ext2_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext2_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT2_XATTR_PAD_BITS 2 +#define EXT2_XATTR_PAD (1<e_name_len)) ) +#define EXT2_XATTR_SIZE(size) \ + (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) + +# ifdef CONFIG_EXT2_FS_XATTR + +extern const struct xattr_handler ext2_xattr_user_handler; +extern const struct xattr_handler ext2_xattr_trusted_handler; +extern const struct xattr_handler ext2_xattr_security_handler; + +extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); + +extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); + +extern void ext2_xattr_delete_inode(struct inode *); +extern void ext2_xattr_put_super(struct super_block *); + +extern int init_ext2_xattr(void); +extern void exit_ext2_xattr(void); + +extern const struct xattr_handler *ext2_xattr_handlers[]; + +# else /* CONFIG_EXT2_FS_XATTR */ + +static inline int +ext2_xattr_get(struct inode *inode, int name_index, + const char *name, void *buffer, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline int +ext2_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext2_xattr_delete_inode(struct inode *inode) +{ +} + +static inline void +ext2_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext2_xattr(void) +{ + return 0; +} + +static inline void +exit_ext2_xattr(void) +{ +} + +#define ext2_xattr_handlers NULL + +# endif /* CONFIG_EXT2_FS_XATTR */ + +#ifdef CONFIG_EXT2_FS_SECURITY +extern int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); +#else +static inline int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/kernel/fs/ext2/xattr_security.c b/kernel/fs/ext2/xattr_security.c new file mode 100644 index 000000000..702fc6840 --- /dev/null +++ b/kernel/fs/ext2/xattr_security.c @@ -0,0 +1,74 @@ +/* + * linux/fs/ext2/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include "ext2.h" +#include +#include "xattr.h" + +static size_t +ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const int prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, + buffer, size); +} + +static int +ext2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, + value, size, flags); +} + +static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; + } + return err; +} + +int +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext2_initxattrs, NULL); +} + +const struct xattr_handler ext2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext2_xattr_security_list, + .get = ext2_xattr_security_get, + .set = ext2_xattr_security_set, +}; diff --git a/kernel/fs/ext2/xattr_trusted.c b/kernel/fs/ext2/xattr_trusted.c new file mode 100644 index 000000000..42b6e9874 --- /dev/null +++ b/kernel/fs/ext2/xattr_trusted.c @@ -0,0 +1,54 @@ +/* + * linux/fs/ext2/xattr_trusted.c + * Handler for trusted extended attributes. + * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include "ext2.h" +#include "xattr.h" + +static size_t +ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int +ext2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +const struct xattr_handler ext2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext2_xattr_trusted_list, + .get = ext2_xattr_trusted_get, + .set = ext2_xattr_trusted_set, +}; diff --git a/kernel/fs/ext2/xattr_user.c b/kernel/fs/ext2/xattr_user.c new file mode 100644 index 000000000..ecdc46051 --- /dev/null +++ b/kernel/fs/ext2/xattr_user.c @@ -0,0 +1,61 @@ +/* + * linux/fs/ext2/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include "ext2.h" +#include "xattr.h" + +static size_t +ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext2_xattr_user_list, + .get = ext2_xattr_user_get, + .set = ext2_xattr_user_set, +}; diff --git a/kernel/fs/ext3/Kconfig b/kernel/fs/ext3/Kconfig new file mode 100644 index 000000000..e8c6ba0e4 --- /dev/null +++ b/kernel/fs/ext3/Kconfig @@ -0,0 +1,89 @@ +config EXT3_FS + tristate "Ext3 journalling file system support" + select JBD + help + This is the journalling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journalling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the file system, the on-disk format + of ext3 is identical to ext2. It is possible to freely switch + between using the ext3 driver and the ext2 driver, as long as the + file system has been cleanly unmounted, or e2fsck is run on the file + system. + + To add a journal on an existing ext2 file system or change the + behavior of ext3 file systems, you can use the tune2fs utility ("man + tune2fs"). To modify attributes of files and directories on ext3 + file systems, use chattr ("man chattr"). You need to be using + e2fsprogs version 1.20 or later in order to create ext3 journals + (available at ). + + To compile this file system support as a module, choose M here: the + module will be called ext3. + +config EXT3_DEFAULTS_TO_ORDERED + bool "Default to 'data=ordered' in ext3" + depends on EXT3_FS + default y + help + The journal mode options for ext3 have different tradeoffs + between when data is guaranteed to be on disk and + performance. The use of "data=writeback" can cause + unwritten data to appear in files after an system crash or + power failure, which can be a security issue. However, + "data=ordered" mode can also result in major performance + problems, including seconds-long delays before an fsync() + call returns. For details, see: + + http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs + + If you have been historically happy with ext3's performance, + data=ordered mode will be a safe choice and you should + answer 'y' here. If you understand the reliability and data + privacy issues of data=writeback and are willing to make + that trade off, answer 'n'. + +config EXT3_FS_XATTR + bool "Ext3 extended attributes" + depends on EXT3_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext3. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext3 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/kernel/fs/ext3/Makefile b/kernel/fs/ext3/Makefile new file mode 100644 index 000000000..e77766a8b --- /dev/null +++ b/kernel/fs/ext3/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux ext3-filesystem routines. +# + +obj-$(CONFIG_EXT3_FS) += ext3.o + +ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o + +ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o diff --git a/kernel/fs/ext3/acl.c b/kernel/fs/ext3/acl.c new file mode 100644 index 000000000..8bbaf5bcf --- /dev/null +++ b/kernel/fs/ext3/acl.c @@ -0,0 +1,281 @@ +/* + * linux/fs/ext3/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include "ext3.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext3_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext3_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext3_acl_header *)value)->a_version != + cpu_to_le32(EXT3_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext3_acl_header); + count = ext3_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + ext3_acl_entry *entry = + (ext3_acl_entry *)value; + if ((char *)value + sizeof(ext3_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext3_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(ext3_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(ext3_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext3_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext3_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count * + sizeof(ext3_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext3_acl_header); + for (n=0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext3_acl_entry *entry = (ext3_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext3_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(ext3_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext3_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_mutex: don't care + */ +struct posix_acl * +ext3_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + + retval = ext3_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext3_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext3_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_mutex: down unless called from ext3_new_inode + */ +static int +__ext3_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + switch(type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return error; + else { + inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext3_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext3_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +int +ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = __ext3_set_acl(handle, inode, type, acl); + ext3_journal_stop(handle); + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return error; +} + +/* + * Initialize the ACLs of a new inode. Called from ext3_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (default_acl) { + error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } + return error; +} diff --git a/kernel/fs/ext3/acl.h b/kernel/fs/ext3/acl.h new file mode 100644 index 000000000..ea1c69eda --- /dev/null +++ b/kernel/fs/ext3/acl.h @@ -0,0 +1,72 @@ +/* + File: fs/ext3/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT3_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext3_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext3_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext3_acl_header; + +static inline size_t ext3_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext3_acl_header) + + count * sizeof(ext3_acl_entry_short); + } else { + return sizeof(ext3_acl_header) + + 4 * sizeof(ext3_acl_entry_short) + + (count - 4) * sizeof(ext3_acl_entry); + } +} + +static inline int ext3_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext3_acl_header); + s = size - 4 * sizeof(ext3_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext3_acl_entry_short)) + return -1; + return size / sizeof(ext3_acl_entry_short); + } else { + if (s % sizeof(ext3_acl_entry)) + return -1; + return s / sizeof(ext3_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT3_FS_POSIX_ACL + +/* acl.c */ +extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); +extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT3_FS_POSIX_ACL */ +#include +#define ext3_get_acl NULL +#define ext3_set_acl NULL + +static inline int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT3_FS_POSIX_ACL */ + diff --git a/kernel/fs/ext3/balloc.c b/kernel/fs/ext3/balloc.c new file mode 100644 index 000000000..158b5d4ce --- /dev/null +++ b/kernel/fs/ext3/balloc.c @@ -0,0 +1,2158 @@ +/* + * linux/fs/ext3/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include "ext3.h" + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext3_fill_super). + */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* + * Calculate the block group number and offset, given a block number + */ +static void ext3_get_group_no_and_offset(struct super_block *sb, + ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + if (offsetp) + *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); + if (blockgrpp) + *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); +} + +/** + * ext3_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext3_group_desc * desc; + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext3_error (sb, "ext3_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + smp_rmb(); + + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext3_error (sb, "ext3_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data; + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc + offset; +} + +static int ext3_valid_block_bitmap(struct super_block *sb, + struct ext3_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext3_grpblk_t offset; + ext3_grpblk_t next_zero_bit; + ext3_fsblk_t bitmap_blk; + ext3_fsblk_t group_first_block; + + group_first_block = ext3_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext3_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext3_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_table); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext3_find_next_zero_bit(bh->b_data, + offset + EXT3_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext3_error(sb, __func__, + "Invalid block bitmap - " + "block_group = %d, block = %lu", + block_group, bitmap_blk); + return 0; +} + +/** + * read_block_bitmap() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. + */ +static struct buffer_head * +read_block_bitmap(struct super_block *sb, unsigned int block_group) +{ + struct ext3_group_desc * desc; + struct buffer_head * bh = NULL; + ext3_fsblk_t bitmap_blk; + + desc = ext3_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + trace_ext3_read_block_bitmap(sb, block_group); + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext3_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + if (likely(bh_uptodate_or_lock(bh))) + return bh; + + if (bh_submit_read(bh) < 0) { + brelse(bh); + ext3_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + ext3_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ + return bh; +} +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use a red-black tree to represent per-filesystem reservation + * windows. + * + */ + +/** + * __rsv_window_dump() -- Dump the filesystem block allocation reservation map + * @rb_root: root of per-filesystem reservation rb tree + * @verbose: verbose mode + * @fn: function which wishes to dump the reservation map + * + * If verbose is turned on, it will print the whole block reservation + * windows(start, end). Otherwise, it will only print out the "bad" windows, + * those windows that overlap with their immediate neighbors. + */ +#if 1 +static void __rsv_window_dump(struct rb_root *root, int verbose, + const char *fn) +{ + struct rb_node *n; + struct ext3_reserve_window_node *rsv, *prev; + int bad; + +restart: + n = rb_first(root); + bad = 0; + prev = NULL; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %lu, end: %lu\n", + rsv, rsv->rsv_start, rsv->rsv_end); + if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { + printk("Bad reservation %p (start >= end)\n", + rsv); + bad = 1; + } + if (prev && prev->rsv_end >= rsv->rsv_start) { + printk("Bad reservation %p (prev->end >= start)\n", + rsv); + bad = 1; + } + if (bad) { + if (!verbose) { + printk("Restarting reservation walk in verbose mode\n"); + verbose = 1; + goto restart; + } + } + n = rb_next(n); + prev = rsv; + } + printk("Window map complete.\n"); + BUG_ON(bad); +} +#define rsv_window_dump(root, verbose) \ + __rsv_window_dump((root), (verbose), __func__) +#else +#define rsv_window_dump(root, verbose) do {} while (0) +#endif + +/** + * goal_in_my_reservation() + * @rsv: inode's reservation window + * @grp_goal: given goal block relative to the allocation block group + * @group: the current allocation block group + * @sb: filesystem super block + * + * Test if the given goal block (group relative) is within the file's + * own block reservation window range. + * + * If the reservation window is outside the goal allocation group, return 0; + * grp_goal (given goal block) could be -1, which means no specific + * goal block. In this case, always return 1. + * If the goal block is within the reservation window, return 1; + * otherwise, return 0; + */ +static int +goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, + unsigned int group, struct super_block * sb) +{ + ext3_fsblk_t group_first_block, group_last_block; + + group_first_block = ext3_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if ((rsv->_rsv_start > group_last_block) || + (rsv->_rsv_end < group_first_block)) + return 0; + if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) + || (grp_goal + group_first_block > rsv->_rsv_end))) + return 0; + return 1; +} + +/** + * search_reserve_window() + * @rb_root: root of reservation tree + * @goal: target allocation block + * + * Find the reserved window which includes the goal, or the previous one + * if the goal is not in any window. + * Returns NULL if there are no windows or if all windows start after the goal. + */ +static struct ext3_reserve_window_node * +search_reserve_window(struct rb_root *root, ext3_fsblk_t goal) +{ + struct rb_node *n = root->rb_node; + struct ext3_reserve_window_node *rsv; + + if (!n) + return NULL; + + do { + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; + else if (goal > rsv->rsv_end) + n = n->rb_right; + else + return rsv; + } while (n); + /* + * We've fallen off the end of the tree: the goal wasn't inside + * any particular node. OK, the previous node must be to one + * side of the interval containing the goal. If it's the RHS, + * we need to back up one. + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + } + return rsv; +} + +/** + * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree. + * @sb: super block + * @rsv: reservation window to add + * + * Must be called with rsv_lock hold. + */ +void ext3_rsv_window_add(struct super_block *sb, + struct ext3_reserve_window_node *rsv) +{ + struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; + ext3_fsblk_t start = rsv->rsv_start; + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct ext3_reserve_window_node *this; + + trace_ext3_rsv_window_add(sb, rsv); + while (*p) + { + parent = *p; + this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; + else if (start > this->rsv_end) + p = &(*p)->rb_right; + else { + rsv_window_dump(root, 1); + BUG(); + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); +} + +/** + * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree + * @sb: super block + * @rsv: reservation window to remove + * + * Mark the block reservation window as not allocated, and unlink it + * from the filesystem reservation window rb tree. Must be called with + * rsv_lock hold. + */ +static void rsv_window_remove(struct super_block *sb, + struct ext3_reserve_window_node *rsv) +{ + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_alloc_hit = 0; + rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); +} + +/* + * rsv_is_empty() -- Check if the reservation window is allocated. + * @rsv: given reservation window to check + * + * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED. + */ +static inline int rsv_is_empty(struct ext3_reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED; +} + +/** + * ext3_init_block_alloc_info() + * @inode: file inode structure + * + * Allocate and initialize the reservation window structure, and + * link the window to the ext3 inode structure at last + * + * The reservation window structure is only dynamically allocated + * and linked to ext3 inode the first time the open file + * needs a new block. So, before every ext3_new_block(s) call, for + * regular files, we should check whether the reservation window + * structure exists or not. In the latter case, this function is called. + * Fail to do so will result in block reservation being turned off for that + * open file. + * + * This function is called from ext3_get_blocks_handle(), also called + * when setting the reservation window size through ioctl before the file + * is open for write (needs block allocation). + * + * Needs truncate_mutex protection prior to call this function. + */ +void ext3_init_block_alloc_info(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i; + struct super_block *sb = inode->i_sb; + + block_i = kmalloc(sizeof(*block_i), GFP_NOFS); + if (block_i) { + struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node; + + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + + /* + * if filesystem is mounted with NORESERVATION, the goal + * reservation window size is set to zero to indicate + * block reservation is off + */ + if (!test_opt(sb, RESERVATION)) + rsv->rsv_goal_size = 0; + else + rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS; + rsv->rsv_alloc_hit = 0; + block_i->last_alloc_logical_block = 0; + block_i->last_alloc_physical_block = 0; + } + ei->i_block_alloc_info = block_i; +} + +/** + * ext3_discard_reservation() + * @inode: inode + * + * Discard(free) block reservation window on last file close, or truncate + * or at last iput(). + * + * It is being called in three cases: + * ext3_release_file(): last writer close the file + * ext3_clear_inode(): last iput(), when nobody link to this file. + * ext3_truncate(): when the block indirect map is about to change. + * + */ +void ext3_discard_reservation(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_reserve_window_node *rsv; + spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; + + if (!block_i) + return; + + rsv = &block_i->rsv_window_node; + if (!rsv_is_empty(&rsv->rsv_window)) { + spin_lock(rsv_lock); + if (!rsv_is_empty(&rsv->rsv_window)) { + trace_ext3_discard_reservation(inode, rsv); + rsv_window_remove(inode->i_sb, rsv); + } + spin_unlock(rsv_lock); + } +} + +/** + * ext3_free_blocks_sb() -- Free given blocks and update quota + * @handle: handle to this transaction + * @sb: super block + * @block: start physical block to free + * @count: number of blocks to free + * @pdquot_freed_blocks: pointer to quota + */ +void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + unsigned long block_group; + ext3_grpblk_t bit; + unsigned long i; + unsigned long overflow; + struct ext3_group_desc * desc; + struct ext3_super_block * es; + struct ext3_sb_info *sbi; + int err = 0, ret; + ext3_grpblk_t group_freed; + + *pdquot_freed_blocks = 0; + sbi = EXT3_SB(sb); + es = sbi->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = "E3FSBLK", count = %lu", block, count); + goto error_return; + } + + ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = "E3FSBLK", count = %lu", + block, count); + goto error_return; + } + + /* + * We are about to start releasing blocks in the bitmap, + * so we need undo access. + */ + /* @@@ check errors */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + jbd_lock_bh_state(bitmap_bh); + + for (i = 0, group_freed = 0; i < count; i++) { + /* + * An HJ special. This is expensive... + */ +#ifdef CONFIG_JBD_DEBUG + jbd_unlock_bh_state(bitmap_bh); + { + struct buffer_head *debug_bh; + debug_bh = sb_find_get_block(sb, block + i); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "Deleted!"); + if (!bh2jh(bitmap_bh)->b_committed_data) + BUFFER_TRACE(debug_bh, + "No committed data in bitmap"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); + __brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); +#endif + if (need_resched()) { + jbd_unlock_bh_state(bitmap_bh); + cond_resched(); + jbd_lock_bh_state(bitmap_bh); + } + /* @@@ This prevents newly-allocated data from being + * freed and then reallocated within the same + * transaction. + * + * Ideally we would want to allow that to happen, but to + * do so requires making journal_forget() capable of + * revoking the queued write of a data block, which + * implies blocking on the journal lock. *forget() + * cannot block due to truncate races. + * + * Eventually we can fix this by making journal_forget() + * return a status indicating whether or not it was able + * to revoke the buffer. On successful revoke, it is + * safe not to set the allocation bit in the committed + * bitmap, because we know that there is no outstanding + * activity on the buffer any more and so it is safe to + * reallocate it. + */ + BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); + J_ASSERT_BH(bitmap_bh, + bh2jh(bitmap_bh)->b_committed_data != NULL); + ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, + bh2jh(bitmap_bh)->b_committed_data); + + /* + * We clear the bit in the bitmap after setting the committed + * data bit, because this is the reverse order to that which + * the allocator uses. + */ + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + jbd_unlock_bh_state(bitmap_bh); + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + block + i); + jbd_lock_bh_state(bitmap_bh); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + group_freed++; + } + } + jbd_unlock_bh_state(bitmap_bh); + + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&desc->bg_free_blocks_count, group_freed); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_add(&sbi->s_freeblocks_counter, count); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + *pdquot_freed_blocks += group_freed; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, err); + return; +} + +/** + * ext3_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + */ +void ext3_free_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t block, unsigned long count) +{ + struct super_block *sb = inode->i_sb; + unsigned long dquot_freed_blocks; + + trace_ext3_free_blocks(inode, block, count); + ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); + if (dquot_freed_blocks) + dquot_free_block(inode, dquot_freed_blocks); + return; +} + +/** + * ext3_test_allocatable() + * @nr: given allocation block group + * @bh: bufferhead contains the bitmap of the given block group + * + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes. + */ +static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh) +{ + int ret; + struct journal_head *jh = bh2jh(bh); + + if (ext3_test_bit(nr, bh->b_data)) + return 0; + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) + ret = 1; + else + ret = !ext3_test_bit(nr, jh->b_committed_data); + jbd_unlock_bh_state(bh); + return ret; +} + +/** + * bitmap_search_next_usable_block() + * @start: the starting block (group relative) of the search + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) of the reservation + * + * The bitmap search --- search forward alternately through the actual + * bitmap on disk and the last-committed copy in journal, until we find a + * bit free in both bitmaps. + */ +static ext3_grpblk_t +bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, + ext3_grpblk_t maxblocks) +{ + ext3_grpblk_t next; + struct journal_head *jh = bh2jh(bh); + + while (start < maxblocks) { + next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + if (ext3_test_allocatable(next, bh)) + return next; + jbd_lock_bh_state(bh); + if (jh->b_committed_data) + start = ext3_find_next_zero_bit(jh->b_committed_data, + maxblocks, next); + jbd_unlock_bh_state(bh); + } + return -1; +} + +/** + * find_next_usable_block() + * @start: the starting block (group relative) to find next + * allocatable block in bitmap. + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) for the search + * + * Find an allocatable block in a bitmap. We honor both the bitmap and + * its last-committed copy (if that exists), and perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; then + * for any free bit in the bitmap. + */ +static ext3_grpblk_t +find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, + ext3_grpblk_t maxblocks) +{ + ext3_grpblk_t here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. + */ + ext3_grpblk_t end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; + here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal && ext3_test_allocatable(here, bh)) + return here; + ext3_debug("Bit not found near goal\n"); + } + + here = start; + if (here < 0) + here = 0; + + p = bh->b_data + (here >> 3); + r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); + next = (r - bh->b_data) << 3; + + if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) + return next; + + /* + * The bitmap search --- search forward alternately through the actual + * bitmap and the last-committed copy until we find a bit free in + * both + */ + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; +} + +/** + * claim_block() + * @lock: the spin lock for this block group + * @block: the free block (group relative) to allocate + * @bh: the buffer_head contains the block group bitmap + * + * We think we can allocate this block in this bitmap. Try to set the bit. + * If that succeeds then check that nobody has allocated and then freed the + * block since we saw that is was not marked in b_committed_data. If it _was_ + * allocated and freed then clear the bit in the bitmap again and return + * zero (failure). + */ +static inline int +claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + int ret; + + if (ext3_set_bit_atomic(lock, block, bh->b_data)) + return 0; + jbd_lock_bh_state(bh); + if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) { + ext3_clear_bit_atomic(lock, block, bh->b_data); + ret = 0; + } else { + ret = 1; + } + jbd_unlock_bh_state(bh); + return ret; +} + +/** + * ext3_try_to_allocate() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * Attempt to allocate blocks within a give range. Set the range of allocation + * first, then find the first free bit(s) from the bitmap (within the range), + * and at last, allocate the blocks by claiming the found free bit as allocated. + * + * To set the range of this allocation: + * if there is a reservation window, only try to allocate block(s) from the + * file's own reservation window; + * Otherwise, the allocation range starts from the give goal block, ends at + * the block group's last block. + * + * If we failed to allocate the desired block then we may end up crossing to a + * new bitmap. In that case we must release write access to the old one via + * ext3_journal_release_buffer(), else we'll run out of credits. + */ +static ext3_grpblk_t +ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, + struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal, + unsigned long *count, struct ext3_reserve_window *my_rsv) +{ + ext3_fsblk_t group_first_block; + ext3_grpblk_t start, end; + unsigned long num = 0; + + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = ext3_group_first_block_no(sb, group); + if (my_rsv->_rsv_start >= group_first_block) + start = my_rsv->_rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->_rsv_end - group_first_block + 1; + if (end > EXT3_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT3_BLOCKS_PER_GROUP(sb); + if ((start <= grp_goal) && (grp_goal < end)) + start = grp_goal; + else + grp_goal = -1; + } else { + if (grp_goal > 0) + start = grp_goal; + else + start = 0; + end = EXT3_BLOCKS_PER_GROUP(sb); + } + + BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb)); + +repeat: + if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) { + grp_goal = find_next_usable_block(start, bitmap_bh, end); + if (grp_goal < 0) + goto fail_access; + if (!my_rsv) { + int i; + + for (i = 0; i < 7 && grp_goal > start && + ext3_test_allocatable(grp_goal - 1, + bitmap_bh); + i++, grp_goal--) + ; + } + } + start = grp_goal; + + if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), + grp_goal, bitmap_bh)) { + /* + * The block was allocated by another thread, or it was + * allocated and then freed by another thread + */ + start++; + grp_goal++; + if (start >= end) + goto fail_access; + goto repeat; + } + num++; + grp_goal++; + while (num < *count && grp_goal < end + && ext3_test_allocatable(grp_goal, bitmap_bh) + && claim_block(sb_bgl_lock(EXT3_SB(sb), group), + grp_goal, bitmap_bh)) { + num++; + grp_goal++; + } + *count = num; + return grp_goal - num; +fail_access: + *count = num; + return -1; +} + +/** + * find_next_reservable_window(): + * find a reservable space within the given range. + * It does not allocate the reservation window for now: + * alloc_new_reservation() will do the work later. + * + * @search_head: the head of the searching list; + * This is not necessarily the list head of the whole filesystem + * + * We have both head and start_block to assist the search + * for the reservable space. The list starts from head, + * but we will shift to the place where start_block is, + * then start from there, when looking for a reservable space. + * + * @my_rsv: the reservation window + * + * @sb: the super block + * + * @start_block: the first block we consider to start + * the real search from + * + * @last_block: + * the maximum block number that our goal reservable space + * could start from. This is normally the last block in this + * group. The search will end when we found the start of next + * possible reservable space is out of this boundary. + * This could handle the cross boundary reservation window + * request. + * + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. + * + */ +static int find_next_reservable_window( + struct ext3_reserve_window_node *search_head, + struct ext3_reserve_window_node *my_rsv, + struct super_block * sb, + ext3_fsblk_t start_block, + ext3_fsblk_t last_block) +{ + struct rb_node *next; + struct ext3_reserve_window_node *rsv, *prev; + ext3_fsblk_t cur; + int size = my_rsv->rsv_goal_size; + + /* TODO: make the start of the reservation window byte-aligned */ + /* cur = *start_block & ~7;*/ + cur = start_block; + rsv = search_head; + if (!rsv) + return -1; + + while (1) { + if (cur <= rsv->rsv_end) + cur = rsv->rsv_end + 1; + + /* TODO? + * in the case we could not find a reservable space + * that is what is expected, during the re-search, we could + * remember what's the largest reservable space we could have + * and return that one. + * + * For now it will fail if we could not find the reservable + * space with expected-size (or more)... + */ + if (cur > last_block) + return -1; /* fail */ + + prev = rsv; + next = rb_next(&rsv->rsv_node); + rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node); + + /* + * Reached the last reservation, we can just append to the + * previous one. + */ + if (!next) + break; + + if (cur + size <= rsv->rsv_start) { + /* + * Found a reserveable space big enough. We could + * have a reservation across the group boundary here + */ + break; + } + } + /* + * we come here either : + * when we reach the end of the whole list, + * and there is empty reservable space after last entry in the list. + * append it to the end of the list. + * + * or we found one reservable space in the middle of the list, + * return the reservation window that we could append to. + * succeed. + */ + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* + * Let's book the whole available window for now. We will check the + * disk bitmap later and then, if there are free blocks then we adjust + * the window size if it's larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window. + */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext3_rsv_window_add(sb, my_rsv); + + return 0; +} + +/** + * alloc_new_reservation()--allocate a new reservation window + * + * To make a new reservation, we search part of the filesystem + * reservation list (the list that inside the group). We try to + * allocate a new reservation window near the allocation goal, + * or the beginning of the group, if there is no goal. + * + * We first find a reservable space after the goal, then from + * there, we check the bitmap for the first free block after + * it. If there is no free block until the end of group, then the + * whole group is full, we failed. Otherwise, check if the free + * block is inside the expected reservable space, if so, we + * succeed. + * If the first free block is outside the reservable space, then + * start from the first free block, we search for next available + * space, and go on. + * + * on succeed, a new reservation will be found and inserted into the list + * It contains at least one free block, and it does not overlap with other + * reservation windows. + * + * failed: we failed to find a reservation window in this group + * + * @my_rsv: the reservation window + * + * @grp_goal: The goal (group-relative). It is where the search for a + * free reservable space should start from. + * if we have a grp_goal(grp_goal >0 ), then start from there, + * no grp_goal(grp_goal = -1), we start from the first block + * of the group. + * + * @sb: the super block + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + * + */ +static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, + ext3_grpblk_t grp_goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) +{ + struct ext3_reserve_window_node *search_head; + ext3_fsblk_t group_first_block, group_end_block, start_block; + ext3_grpblk_t first_free_block; + struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; + unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; + + group_first_block = ext3_group_first_block_no(sb, group); + group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if (grp_goal < 0) + start_block = group_first_block; + else + start_block = grp_goal + group_first_block; + + trace_ext3_alloc_new_reservation(sb, start_block); + size = my_rsv->rsv_goal_size; + + if (!rsv_is_empty(&my_rsv->rsv_window)) { + /* + * if the old reservation is cross group boundary + * and if the goal is inside the old reservation window, + * we will come here when we just failed to allocate from + * the first part of the window. We still have another part + * that belongs to the next group. In this case, there is no + * point to discard our window and try to allocate a new one + * in this group(which will fail). we should + * keep the reservation window, just simply move on. + * + * Maybe we could shift the start block of the reservation + * window to the first block of next group. + */ + + if ((my_rsv->rsv_start <= group_end_block) && + (my_rsv->rsv_end > group_end_block) && + (start_block >= my_rsv->rsv_start)) + return -1; + + if ((my_rsv->rsv_alloc_hit > + (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { + /* + * if the previously allocation hit ratio is + * greater than 1/2, then we double the size of + * the reservation window the next time, + * otherwise we keep the same size window + */ + size = size * 2; + if (size > EXT3_MAX_RESERVE_BLOCKS) + size = EXT3_MAX_RESERVE_BLOCKS; + my_rsv->rsv_goal_size= size; + } + } + + spin_lock(rsv_lock); + /* + * shift the search start to the window near the goal block + */ + search_head = search_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window + * inside the given range(start_block, group_end_block). + * + * To make sure the reservation window has a free bit inside it, we + * need to check the bitmap after we found a reservable window. + */ +retry: + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + + /* + * On success, find_next_reservable_window() returns the + * reservation window where there is a reservable space after it. + * Before we reserve this reservable space, we need + * to make sure there is at least a free block inside this region. + * + * searching the first free bit on the block bitmap and copy of + * last committed bitmap alternatively, until we found a allocatable + * block. Search start from the start block of the reservable space + * we just found. + */ + spin_unlock(rsv_lock); + first_free_block = bitmap_search_next_usable_block( + my_rsv->rsv_start - group_first_block, + bitmap_bh, group_end_block - group_first_block + 1); + + if (first_free_block < 0) { + /* + * no free block left on the bitmap, no point + * to reserve the space. return failed. + */ + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ + } + + start_block = first_free_block + group_first_block; + /* + * check if the first free block is within the + * free space we just reserved + */ + if (start_block >= my_rsv->rsv_start && + start_block <= my_rsv->rsv_end) { + trace_ext3_reserved(sb, start_block, my_rsv); + return 0; /* success */ + } + /* + * if the first free bit we found is out of the reservable space + * continue search for next reservable space, + * start from where the free block is, + * we also shift the list head to where we stopped last time + */ + search_head = my_rsv; + spin_lock(rsv_lock); + goto retry; +} + +/** + * try_to_extend_reservation() + * @my_rsv: given reservation window + * @sb: super block + * @size: the delta to extend + * + * Attempt to expand the reservation window large enough to have + * required number of free blocks + * + * Since ext3_try_to_allocate() will always allocate blocks within + * the reservation window range, if the window size is too small, + * multiple blocks allocation has to stop at the end of the reservation + * window. To make this more efficient, given the total number of + * blocks needed and the current size of the window, we try to + * expand the reservation window size if necessary on a best-effort + * basis before ext3_new_blocks() tries to allocate blocks, + */ +static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, + struct super_block *sb, int size) +{ + struct ext3_reserve_window_node *next_rsv; + struct rb_node *next; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; + + if (!spin_trylock(rsv_lock)) + return; + + next = rb_next(&my_rsv->rsv_node); + + if (!next) + my_rsv->rsv_end += size; + else { + next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node); + + if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) + my_rsv->rsv_end += size; + else + my_rsv->rsv_end = next_rsv->rsv_start - 1; + } + spin_unlock(rsv_lock); +} + +/** + * ext3_try_to_allocate_with_rsv() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @my_rsv: reservation window + * @count: target number of blocks to allocate + * @errp: pointer to store the error code + * + * This is the main function used to allocate a new block and its reservation + * window. + * + * Each time when a new block allocation is need, first try to allocate from + * its own reservation. If it does not have a reservation window, instead of + * looking for a free bit on bitmap first, then look up the reservation list to + * see if it is inside somebody else's reservation window, we try to allocate a + * reservation window for it starting from the goal first. Then do the block + * allocation within the reservation window. + * + * This will avoid keeping on searching the reservation list again and + * again when somebody is looking for a free block (without + * reservation), and there are lots of free blocks, but they are all + * being reserved. + * + * We use a red-black tree for the per-filesystem reservation list. + * + */ +static ext3_grpblk_t +ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, + unsigned int group, struct buffer_head *bitmap_bh, + ext3_grpblk_t grp_goal, + struct ext3_reserve_window_node * my_rsv, + unsigned long *count, int *errp) +{ + ext3_fsblk_t group_first_block, group_last_block; + ext3_grpblk_t ret = 0; + int fatal; + unsigned long num = *count; + + *errp = 0; + + /* + * Make sure we use undo access for the bitmap, because it is critical + * that we do the frozen_data COW on bitmap buffers in all cases even + * if the buffer is in BJ_Forget state in the committing transaction. + */ + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + fatal = ext3_journal_get_undo_access(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + + /* + * we don't deal with reservation when + * filesystem is mounted without reservation + * or the file is not a regular file + * or last attempt to allocate a block with reservation turned on failed + */ + if (my_rsv == NULL ) { + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, count, NULL); + goto out; + } + /* + * grp_goal is a group relative block number (if there is a goal) + * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb) + * first block is a filesystem wide block number + * first block is the block number of the first block in this group + */ + group_first_block = ext3_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + /* + * Basically we will allocate a new block from inode's reservation + * window. + * + * We need to allocate a new reservation window, if: + * a) inode does not have a reservation window; or + * b) last attempt to allocate a block from existing reservation + * failed; or + * c) we come here with a goal and with a reservation window + * + * We do not need to allocate a new reservation window if we come here + * at the beginning with a goal and the goal is inside the window, or + * we don't have a goal but already have a reservation window. + * then we could go to allocate from the reservation window directly. + */ + while (1) { + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) { + if (my_rsv->rsv_goal_size < *count) + my_rsv->rsv_goal_size = *count; + ret = alloc_new_reservation(my_rsv, grp_goal, sb, + group, bitmap_bh); + if (ret < 0) + break; /* failed */ + + if (!goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) + grp_goal = -1; + } else if (grp_goal >= 0) { + int curr = my_rsv->rsv_end - + (grp_goal + group_first_block) + 1; + + if (curr < *count) + try_to_extend_reservation(my_rsv, sb, + *count - curr); + } + + if ((my_rsv->rsv_start > group_last_block) || + (my_rsv->rsv_end < group_first_block)) { + rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1); + BUG(); + } + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, &num, &my_rsv->rsv_window); + if (ret >= 0) { + my_rsv->rsv_alloc_hit += num; + *count = num; + break; /* succeed */ + } + num = *count; + } +out: + if (ret >= 0) { + BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " + "bitmap block"); + fatal = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + return ret; + } + + BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); + ext3_journal_release_buffer(handle, bitmap_bh); + return ret; +} + +/** + * ext3_has_free_blocks() + * @sbi: in-core super block structure. + * + * Check if filesystem has at least 1 free block available for allocation. + */ +static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) +{ + ext3_fsblk_t free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) && + (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || + !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/** + * ext3_should_retry_alloc() + * @sb: super block + * @retries number of attemps has been made + * + * ext3_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or committing transaction to complete, and then + * return TRUE. + * + * if the total number of retries exceed three times, return FALSE. + */ +int ext3_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return journal_force_commit_nested(EXT3_SB(sb)->s_journal); +} + +/** + * ext3_new_blocks() -- core block(s) allocation function + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: target number of blocks to allocate + * @errp: error code + * + * ext3_new_blocks uses a goal block to assist allocation. It tries to + * allocate block(s) from the block group contains the goal block first. If that + * fails, it will try to allocate block(s) from other block groups without + * any specific goal block. + * + */ +ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gdp_bh; + int group_no; + int goal_group; + ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */ + ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ + ext3_fsblk_t ret_block; /* filesyetem-wide allocated block */ + int bgi; /* blockgroup iteration index */ + int fatal = 0, err; + int performed_allocation = 0; + ext3_grpblk_t free_blocks; /* number of free blocks in a group */ + struct super_block *sb; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + struct ext3_sb_info *sbi; + struct ext3_reserve_window_node *my_rsv = NULL; + struct ext3_block_alloc_info *block_i; + unsigned short windowsz = 0; +#ifdef EXT3FS_DEBUG + static int goal_hits, goal_attempts; +#endif + unsigned long ngroups; + unsigned long num = *count; + + *errp = -ENOSPC; + sb = inode->i_sb; + + /* + * Check quota for allocation of this block. + */ + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; + return 0; + } + + trace_ext3_request_blocks(inode, goal, num); + + sbi = EXT3_SB(sb); + es = sbi->s_es; + ext3_debug("goal=%lu.\n", goal); + /* + * Allocate a block from reservation only when + * filesystem is mounted with reservation(default,-o reservation), and + * it's a regular file, and + * the desired window size is greater than 0 (One could use ioctl + * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off + * reservation on that particular file) + */ + block_i = EXT3_I(inode)->i_block_alloc_info; + if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) + my_rsv = &block_i->rsv_window_node; + + if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { + *errp = -ENOSPC; + goto out; + } + + /* + * First, test whether the goal block is free. + */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: + gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * if there is not enough free blocks to make a new resevation + * turn off reservation for this allocation + */ + if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) + && (rsv_is_empty(&my_rsv->rsv_window))) + my_rsv = NULL; + + if (free_blocks > 0) { + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, grp_target_blk, + my_rsv, &num, &fatal); + if (fatal) + goto out; + if (grp_alloc_blk >= 0) + goto allocated; + } + + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Now search the rest of the groups. We assume that + * group_no and gdp correctly point to the last group visited. + */ + for (bgi = 0; bgi < ngroups; bgi++) { + group_no++; + if (group_no >= ngroups) + group_no = 0; + gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* + * skip this group if the number of + * free blocks is less than half of the reservation + * window size. + */ + if (my_rsv && (free_blocks <= (windowsz/2))) + continue; + + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + /* + * try to allocate block(s) from this group, without a goal(-1). + */ + grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, -1, my_rsv, + &num, &fatal); + if (fatal) + goto out; + if (grp_alloc_blk >= 0) + goto allocated; + } + /* + * We may end up a bogus earlier ENOSPC error due to + * filesystem is "full" of reservations, but + * there maybe indeed free blocks available on disk + * In this case, we just forget about the reservations + * just do block allocation as without reservations. + */ + if (my_rsv) { + my_rsv = NULL; + windowsz = 0; + group_no = goal_group; + goto retry_alloc; + } + /* No space left on the device */ + *errp = -ENOSPC; + goto out; + +allocated: + + ext3_debug("using block group %d(%d)\n", + group_no, gdp->bg_free_blocks_count); + + BUFFER_TRACE(gdp_bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, gdp_bh); + if (fatal) + goto out; + + ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no); + + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || + in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group) || + in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) { + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "blocks from "E3FSBLK", length %lu", + ret_block, num); + /* + * claim_block() marked the blocks we allocated as in use. So we + * may want to selectively mark some of the blocks as free. + */ + goto retry_alloc; + } + + performed_allocation = 1; + +#ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + + /* Record bitmap buffer state in the newly allocated block */ + debug_bh = sb_find_get_block(sb, ret_block); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "state when allocated"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); + brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); + spin_lock(sb_bgl_lock(sbi, group_no)); + if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { + int i; + + for (i = 0; i < num; i++) { + if (ext3_test_bit(grp_alloc_blk+i, + bh2jh(bitmap_bh)->b_committed_data)) { + printk("%s: block was unexpectedly set in " + "b_committed_data\n", __func__); + } + } + } + ext3_debug("found bit %d\n", grp_alloc_blk); + spin_unlock(sb_bgl_lock(sbi, group_no)); + jbd_unlock_bh_state(bitmap_bh); +#endif + + if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + ext3_error(sb, "ext3_new_block", + "block("E3FSBLK") >= blocks count(%d) - " + "block_group = %d, es == %p ", ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); + goto out; + } + + /* + * It is up to the caller to add the new buffer to a journal + * list of some description. We don't know in advance whether + * the caller wants to use it as metadata or data. + */ + ext3_debug("allocating block %lu. Goal hits %d of %d.\n", + ret_block, goal_hits, goal_attempts); + + spin_lock(sb_bgl_lock(sbi, group_no)); + le16_add_cpu(&gdp->bg_free_blocks_count, -num); + spin_unlock(sb_bgl_lock(sbi, group_no)); + percpu_counter_sub(&sbi->s_freeblocks_counter, num); + + BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); + fatal = ext3_journal_dirty_metadata(handle, gdp_bh); + if (fatal) + goto out; + + *errp = 0; + brelse(bitmap_bh); + + if (num < *count) { + dquot_free_block(inode, *count-num); + *count = num; + } + + trace_ext3_allocate_blocks(inode, goal, num, + (unsigned long long)ret_block); + + return ret_block; + +io_error: + *errp = -EIO; +out: + if (fatal) { + *errp = fatal; + ext3_std_error(sb, fatal); + } + /* + * Undo the block allocation + */ + if (!performed_allocation) + dquot_free_block(inode, *count); + brelse(bitmap_bh); + return 0; +} + +ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) +{ + unsigned long count = 1; + + return ext3_new_blocks(handle, inode, goal, &count, errp); +} + +/** + * ext3_count_free_blocks() -- count filesystem free blocks + * @sb: superblock + * + * Adds up the number of free blocks from each block group. + */ +ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) +{ + ext3_fsblk_t desc_count; + struct ext3_group_desc *gdp; + int i; + unsigned long ngroups = EXT3_SB(sb)->s_groups_count; +#ifdef EXT3FS_DEBUG + struct ext3_super_block *es; + ext3_fsblk_t bitmap_count; + unsigned long x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext3_count_free(bitmap_bh, sb->s_blocksize); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3_count_free_blocks: stored = "E3FSBLK + ", computed = "E3FSBLK", "E3FSBLK"\n", + (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + } + + return desc_count; +#endif +} + +static inline int test_root(int a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext3_group_sparse(int group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext3_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext3_bg_has_super(struct super_block *sb, int group) +{ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext3_group_sparse(group)) + return 0; + return 1; +} + +static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group) +{ + unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); + unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb); + unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group) +{ + return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0; +} + +/** + * ext3_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) || + metagroup < first_meta_bg) + return ext3_bg_num_gdb_nometa(sb,group); + + return ext3_bg_num_gdb_meta(sb,group); + +} + +/** + * ext3_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: allocation group to trim + * @start: first group block to examine + * @max: last group block to examine + * @gdp: allocation group description structure + * @minblocks: minimum extent block count + * + * ext3_trim_all_free walks through group's block bitmap searching for free + * blocks. When the free block is found, it tries to allocate this block and + * consequent free block to get the biggest free extent possible, until it + * reaches any used block. Then issue a TRIM command on this extent and free + * the extent in the block bitmap. This is done until whole group is scanned. + */ +static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, + unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) +{ + handle_t *handle; + ext3_grpblk_t next, free_blocks, bit, freed, count = 0; + ext3_fsblk_t discard_block; + struct ext3_sb_info *sbi; + struct buffer_head *gdp_bh, *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + int err = 0, ret = 0; + + /* + * We will update one block bitmap, and one group descriptor + */ + handle = ext3_journal_start_sb(sb, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + bitmap_bh = read_block_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto err_out; + + gdp = ext3_get_group_desc(sb, group, &gdp_bh); + if (!gdp) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto err_out; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + sbi = EXT3_SB(sb); + + /* Walk through the whole group */ + while (start <= max) { + start = bitmap_search_next_usable_block(start, bitmap_bh, max); + if (start < 0) + break; + next = start; + + /* + * Allocate contiguous free extents by setting bits in the + * block bitmap + */ + while (next <= max + && claim_block(sb_bgl_lock(sbi, group), + next, bitmap_bh)) { + next++; + } + + /* We did not claim any blocks */ + if (next == start) + continue; + + discard_block = (ext3_fsblk_t)start + + ext3_group_first_block_no(sb, group); + + /* Update counters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, start - next); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + + free_blocks -= next - start; + /* Do not issue a TRIM on extents smaller than minblocks */ + if ((next - start) < minblocks) + goto free_extent; + + trace_ext3_discard_blocks(sb, discard_block, next - start); + /* Send the TRIM command down to the device */ + err = sb_issue_discard(sb, discard_block, next - start, + GFP_NOFS, 0); + count += (next - start); +free_extent: + freed = 0; + + /* + * Clear bits in the bitmap + */ + for (bit = start; bit < next; bit++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), + bit, bitmap_bh->b_data)) { + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + (unsigned long)bit); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + freed++; + } + } + + /* Update couters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, freed); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + + start = next; + if (err < 0) { + if (err != -EOPNOTSUPP) + ext3_warning(sb, __func__, "Discard command " + "returned error %d\n", err); + break; + } + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + cond_resched(); + + /* No more suitable extents */ + if (free_blocks < minblocks) + break; + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + ret = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = ret; + + /* And the group descriptor block */ + BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!err) + err = ret; + + ext3_debug("trimmed %d blocks in the group %d\n", + count, group); + +err_out: + if (err) + count = err; + ext3_journal_stop(handle); + brelse(bitmap_bh); + + return count; +} + +/** + * ext3_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @start: First Byte to trim + * @len: number of Bytes to trim from start + * @minlen: minimum extent length in Bytes + * + * ext3_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext3_trim_all_free function + * is invoked to trim all free space. + */ +int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + ext3_grpblk_t last_block, first_block; + unsigned long group, first_group, last_group; + struct ext3_group_desc *gdp; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + uint64_t start, minlen, end, trimmed = 0; + ext3_fsblk_t first_data_blk = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); + ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = range->minlen >> sb->s_blocksize_bits; + + if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) + return -EINVAL; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; + + smp_rmb(); + + /* Determine first and last group to examine based on start and len */ + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, + &first_group, &first_block); + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end, + &last_group, &last_block); + + /* end now represents the last block to discard in this group */ + end = EXT3_BLOCKS_PER_GROUP(sb) - 1; + + for (group = first_group; group <= last_group; group++) { + gdp = ext3_get_group_desc(sb, group, NULL); + if (!gdp) + break; + + /* + * For all the groups except the last one, last block will + * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_block is + * already computed earlier by ext3_get_group_no_and_offset() + */ + if (group == last_group) + end = last_block; + + if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) { + ret = ext3_trim_all_free(sb, group, first_block, + end, minlen); + if (ret < 0) + break; + trimmed += ret; + } + + /* + * For every group except the first one, we are sure + * that the first block to discard will be block #0. + */ + first_block = 0; + } + + if (ret > 0) + ret = 0; + +out: + range->len = trimmed * sb->s_blocksize; + return ret; +} diff --git a/kernel/fs/ext3/bitmap.c b/kernel/fs/ext3/bitmap.c new file mode 100644 index 000000000..ef9c643e8 --- /dev/null +++ b/kernel/fs/ext3/bitmap.c @@ -0,0 +1,20 @@ +/* + * linux/fs/ext3/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include "ext3.h" + +#ifdef EXT3FS_DEBUG + +unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) +{ + return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars); +} + +#endif /* EXT3FS_DEBUG */ + diff --git a/kernel/fs/ext3/dir.c b/kernel/fs/ext3/dir.c new file mode 100644 index 000000000..17742eed2 --- /dev/null +++ b/kernel/fs/ext3/dir.c @@ -0,0 +1,537 @@ +/* + * linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include +#include "ext3.h" + +static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ext3_dx_readdir(struct file *, struct dir_context *); + +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT3_FT_MAX)) + return DT_UNKNOWN; + + return (ext3_filetype_table[filetype]); +} + +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} + +int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char * error_msg = NULL; + const int rlen = ext3_rec_len_from_disk(de->rec_len); + + if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) + error_msg = "directory entry across blocks"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) + error_msg = "inode out of bounds"; + + if (unlikely(error_msg != NULL)) + ext3_error (dir->i_sb, function, + "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, offset, + (unsigned long) le32_to_cpu(de->inode), + rlen, de->name_len); + + return error_msg == NULL ? 1 : 0; +} + +static int ext3_readdir(struct file *file, struct dir_context *ctx) +{ + unsigned long offset; + int i; + struct ext3_dir_entry_2 *de; + int err; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + int dir_has_error = 0; + + if (is_dx_dir(inode)) { + err = ext3_dx_readdir(file, ctx); + if (err != ERR_BAD_DX_DIR) + return err; + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. + */ + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; + } + offset = ctx->pos & (sb->s_blocksize - 1); + + while (ctx->pos < inode->i_size) { + unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb); + struct buffer_head map_bh; + struct buffer_head *bh = NULL; + + map_bh.b_state = 0; + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); + if (err > 0) { + pgoff_t index = map_bh.b_blocknr >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&file->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_inode->i_mapping, + &file->f_ra, file, + index, 1); + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + bh = ext3_bread(NULL, inode, blk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ + if (!bh) { + if (!dir_has_error) { + ext3_error(sb, __func__, "directory #%lu " + "contains a hole at offset %lld", + inode->i_ino, ctx->pos); + dir_has_error = 1; + } + /* corrupt size? Maybe no more blocks to read */ + if (ctx->pos > inode->i_blocks << 9) + break; + ctx->pos += sb->s_blocksize - offset; + continue; + } + + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (offset && file->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext3_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext3_rec_len_from_disk(de->rec_len) < + EXT3_DIR_REC_LEN(1)) + break; + i += ext3_rec_len_from_disk(de->rec_len); + } + offset = i; + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) + | offset; + file->f_version = inode->i_version; + } + + while (ctx->pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); + if (!ext3_check_dir_entry ("ext3_readdir", inode, de, + bh, offset)) { + /* On error, skip the to the + next block. */ + ctx->pos = (ctx->pos | + (sb->s_blocksize - 1)) + 1; + break; + } + offset += ext3_rec_len_from_disk(de->rec_len); + if (le32_to_cpu(de->inode)) { + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } + } + ctx->pos += ext3_rec_len_from_disk(de->rec_len); + } + offset = 0; + brelse (bh); + if (ctx->pos < inode->i_size) + if (!dir_relax(inode)) + return 0; + } + return 0; +} + +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value for dx directories + * + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. + */ +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext3_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT3_HTREE_EOF_32BIT; + else + return EXT3_HTREE_EOF_64BIT; +} + + +/* + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond s_maxbytes, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory + */ +static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + loff_t htree_max = ext3_get_htree_eof(file); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, whence, + htree_max, htree_max); + else + return generic_file_llseek(file, offset, whence); +} + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + do { + struct fname *old = fname; + fname = fname->next; + kfree(old); + } while (fname); + + *root = RB_ROOT; +} + +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, + loff_t pos) +{ + struct dir_private_info *p; + + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); + return p; +} + +void ext3_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + */ +int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent) +{ + struct rb_node **p, *parent = NULL; + struct fname * fname, *new_fn; + struct dir_private_info *info; + int len; + + info = (struct dir_private_info *) dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, dirent->name, dirent->name_len); + new_fn->name[dirent->name_len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... + */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext3_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static bool call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) +{ + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + + if (!fname) { + printk("call_filldir: called with null fname?!?\n"); + return true; + } + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); + while (fname) { + if (!dir_emit(ctx, fname->name, fname->name_len, + fname->inode, + get_dtype(sb, fname->file_type))) { + info->extra_fname = fname; + return false; + } + fname = fname->next; + } + return true; +} + +static int ext3_dx_readdir(struct file *file, struct dir_context *ctx) +{ + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct fname *fname; + int ret; + + if (!info) { + info = ext3_htree_create_dir_info(file, ctx->pos); + if (!info) + return -ENOMEM; + file->private_data = info; + } + + if (ctx->pos == ext3_get_htree_eof(file)) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != ctx->pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (!call_filldir(file, ctx, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. + */ + if ((!info->curr_node) || + (file->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + file->f_version = inode->i_version; + ret = ext3_htree_fill_tree(file, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + ctx->pos = ext3_get_htree_eof(file); + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (!call_filldir(file, ctx, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + ctx->pos = ext3_get_htree_eof(file); + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = ctx->pos; + return 0; +} + +static int ext3_release_dir (struct inode * inode, struct file * filp) +{ + if (filp->private_data) + ext3_htree_free_dir_info(filp->private_data); + + return 0; +} + +const struct file_operations ext3_dir_operations = { + .llseek = ext3_dir_llseek, + .read = generic_read_dir, + .iterate = ext3_readdir, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, + .release = ext3_release_dir, +}; diff --git a/kernel/fs/ext3/ext3.h b/kernel/fs/ext3/ext3.h new file mode 100644 index 000000000..f483a80b3 --- /dev/null +++ b/kernel/fs/ext3/ext3.h @@ -0,0 +1,1332 @@ +/* + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT3FS_DEBUG to produce debug messages + */ +#undef EXT3FS_DEBUG + +/* + * Define EXT3_RESERVATION to reserve data blocks for expanding files + */ +#define EXT3_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT3_MAX_RESERVE_BLOCKS 1027 +#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0 + +/* + * Debug code + */ +#ifdef EXT3FS_DEBUG +#define ext3_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext3_debug(f, a...) do {} while (0) +#endif + +/* + * Special inodes numbers + */ +#define EXT3_BAD_INO 1 /* Bad blocks inode */ +#define EXT3_ROOT_INO 2 /* Root inode */ +#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT3_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext3 filesystems */ +#define EXT3_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT3_LINK_MAX 32000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT3_MIN_BLOCK_SIZE 1024 +#define EXT3_MAX_BLOCK_SIZE 65536 +#define EXT3_MIN_BLOCK_LOG_SIZE 10 +#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) +#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) +#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits) +#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size) +#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino) + +/* + * Macro-instructions used to manage fragments + */ +#define EXT3_MIN_FRAG_SIZE 1024 +#define EXT3_MAX_FRAG_SIZE 4096 +#define EXT3_MIN_FRAG_LOG_SIZE 10 +#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size) +#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block) + +/* + * Structure of a blocks group descriptor + */ +struct ext3_group_desc +{ + __le32 bg_block_bitmap; /* Blocks bitmap block */ + __le32 bg_inode_bitmap; /* Inodes bitmap block */ + __le32 bg_inode_table; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __le32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group) +#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block) +#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group) +#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits) + +/* + * Constants relative to the data blocks + */ +#define EXT3_NDIR_BLOCKS 12 +#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS +#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) +#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) +#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT3_UNRM_FL 0x00000002 /* Undelete */ +#define EXT3_COMPR_FL 0x00000004 /* Compress file */ +#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT3_DIRTY_FL 0x00000100 +#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ + EXT3_SYNC_FL | EXT3_NODUMP_FL |\ + EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ + EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ + EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT3_REG_FLMASK; + else + return flags & EXT3_OTHER_FLMASK; +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext3_new_group_input { + __u32 group; /* Group number for this data */ + __u32 block_bitmap; /* Absolute block number of block bitmap */ + __u32 inode_bitmap; /* Absolute block number of inode bitmap */ + __u32 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext3_new_group_input in kernel space, with free_blocks_count */ +struct ext3_new_group_data { + __u32 group; + __u32 block_bitmap; + __u32 inode_bitmap; + __u32 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + + +/* + * ioctl commands + */ +#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT3_IOC_GETVERSION _IOR('f', 3, long) +#define EXT3_IOC_SETVERSION _IOW('f', 4, long) +#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) +#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT3_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT3_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + +/* Number of supported quota types */ +#define EXT3_MAXQUOTAS 2 + +/* + * Mount options + */ +struct ext3_mount_options { + unsigned long s_mount_opt; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned long s_commit_interval; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[EXT3_MAXQUOTAS]; +#endif +}; + +/* + * Structure of an inode on the disk + */ +struct ext3_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __u32 l_i_reserved1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl; /* File ACL */ + __le32 i_dir_acl; /* Directory ACL */ + __le32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; +}; + +#define i_size_high i_dir_acl + +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_frag osd2.linux2.l_i_frag +#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +/* + * File system states + */ +#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT3_ERROR_FS 0x0002 /* Errors detected */ +#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +/* EXT3_MOUNT_OLDALLOC was there */ +#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write + * error in ordered mode */ + +/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt +#define set_opt(o, opt) o |= EXT3_MOUNT_##opt +#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ + EXT3_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT +#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS +#endif + +#define ext3_set_bit __set_bit_le +#define ext3_set_bit_atomic ext2_set_bit_atomic +#define ext3_clear_bit __clear_bit_le +#define ext3_clear_bit_atomic ext2_clear_bit_atomic +#define ext3_test_bit test_bit_le +#define ext3_find_next_zero_bit find_next_zero_bit_le + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT3_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT3_ERRORS_PANIC 3 /* Panic */ +#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext3_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count; /* Blocks count */ + __le32 s_r_blocks_count; /* Reserved blocks count */ + __le32 s_free_blocks_count; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_frag_size; /* Fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_frags_per_group; /* # Fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT3_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ +}; + +/* data type for block offset of block group */ +typedef int ext3_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long ext3_fsblk_t; + +#define E3FSBLK "%lu" + +struct ext3_reserve_window { + ext3_fsblk_t _rsv_start; /* First byte reserved */ + ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext3_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext3_reserve_window rsv_window; +}; + +struct ext3_block_alloc_info { + /* information about reservation window */ + struct ext3_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext3_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + __u32 last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext3_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. + */ + ext3_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * third extended file system inode data in memory + */ +struct ext3_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; +#ifdef EXT3_FRAGMENTS + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; +#endif + ext3_fsblk_t i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + __u32 i_block_group; + unsigned long i_state_flags; /* Dynamic state flags for ext3 */ + + /* block reservation info */ + struct ext3_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT3_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext3_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext3_get_block (growth) and ext3_truncate (shrinkth). + */ + loff_t i_disksize; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* + * truncate_mutex is for serialising ext3_truncate() against + * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext3 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have truncate_mutex. + */ + struct mutex truncate_mutex; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + atomic_t i_sync_tid; + atomic_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + struct inode vfs_inode; +}; + +/* + * third extended-fs super-block data in memory + */ +struct ext3_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + ext3_fsblk_t s_sb_block; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock *s_blockgroup_lock; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext3_reserve_window_node s_rsv_window_head; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif +}; + +static inline spinlock_t * +sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} + +static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext3_inode_info *EXT3_I(struct inode *inode) +{ + return container_of(inode, struct ext3_inode_info, vfs_inode); +} + +static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT3_ROOT_INO || + ino == EXT3_JOURNAL_INO || + ino == EXT3_RESIZE_INO || + (ino >= EXT3_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Inode dynamic state flags + */ +enum { + EXT3_STATE_JDATA, /* journaled data exists */ + EXT3_STATE_NEW, /* inode is newly created */ + EXT3_STATE_XATTR, /* has in-inode xattrs */ + EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ +}; + +static inline int ext3_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +static inline void ext3_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +static inline void ext3_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT3_OS_LINUX 0 +#define EXT3_OS_HURD 1 +#define EXT3_OS_MASIX 2 +#define EXT3_OS_FREEBSD 3 +#define EXT3_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV +#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV + +#define EXT3_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT3_SET_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 + +#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 + +#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ + EXT3_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT3_DEF_RESUID 0 +#define EXT3_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT3_DEFM_DEBUG 0x0001 +#define EXT3_DEFM_BSDGROUPS 0x0002 +#define EXT3_DEFM_XATTR_USER 0x0004 +#define EXT3_DEFM_ACL 0x0008 +#define EXT3_DEFM_UID16 0x0010 +#define EXT3_DEFM_JMODE 0x0060 +#define EXT3_DEFM_JMODE_DATA 0x0020 +#define EXT3_DEFM_JMODE_ORDERED 0x0040 +#define EXT3_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ +#define EXT3_NAME_LEN 255 + +struct ext3_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT3 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext3_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * Ext3 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT3_FT_UNKNOWN 0 +#define EXT3_FT_REG_FILE 1 +#define EXT3_FT_DIR 2 +#define EXT3_FT_CHRDEV 3 +#define EXT3_FT_BLKDEV 4 +#define EXT3_FT_FIFO 5 +#define EXT3_FT_SOCK 6 +#define EXT3_FT_SYMLINK 7 + +#define EXT3_FT_MAX 8 + +/* + * EXT3_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT3_DIR_PAD 4 +#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) +#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) +#define EXT3_MAX_REC_LEN ((1<<16)-1) + +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ +static inline unsigned ext3_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT3_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 ext3_rec_len_to_disk(unsigned len) +{ +#if (PAGE_CACHE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(EXT3_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); +#endif + return cpu_to_le16(len); +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT3_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) +#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) +#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext3_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext3_iloc +{ + struct buffer_head *bh; + unsigned long offset; + unsigned long block_group; +}; + +static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc) +{ + return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext3_fsblk_t +ext3_group_first_block_no(struct super_block *sb, unsigned long group_no) +{ + return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext3 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern int ext3_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp); +extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp); +extern void ext3_free_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t block, unsigned long count); +extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *); +extern void ext3_check_blocks_bitmap (struct super_block *); +extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); +extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext3_init_block_alloc_info(struct inode *); +extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); +extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range); + +/* dir.c */ +extern int ext3_check_dir_entry(const char *, struct inode *, + struct ext3_dir_entry_2 *, + struct buffer_head *, unsigned long); +extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent); +extern void ext3_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext3_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext3fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode * ext3_new_inode (handle_t *, struct inode *, + const struct qstr *, umode_t); +extern void ext3_free_inode (handle_t *, struct inode *); +extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); +extern unsigned long ext3_count_free_inodes (struct super_block *); +extern unsigned long ext3_count_dirs (struct super_block *); +extern void ext3_check_inodes_bitmap (struct super_block *); +extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + +/* inode.c */ +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr); +struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, + int create); + +extern struct inode *ext3_iget(struct super_block *, unsigned long); +extern int ext3_write_inode (struct inode *, struct writeback_control *); +extern int ext3_setattr (struct dentry *, struct iattr *); +extern void ext3_evict_inode (struct inode *); +extern int ext3_sync_inode (handle_t *, struct inode *); +extern void ext3_discard_reservation (struct inode *); +extern void ext3_dirty_inode(struct inode *, int); +extern int ext3_change_inode_journal_flag(struct inode *, int); +extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); +extern int ext3_can_truncate(struct inode *inode); +extern void ext3_truncate(struct inode *inode); +extern void ext3_set_inode_flags(struct inode *); +extern void ext3_get_inode_flags(struct ext3_inode_info *); +extern void ext3_set_aops(struct inode *inode); +extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +/* ioctl.c */ +extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* namei.c */ +extern int ext3_orphan_add(handle_t *, struct inode *); +extern int ext3_orphan_del(handle_t *, struct inode *); +extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext3_group_add(struct super_block *sb, + struct ext3_new_group_data *input); +extern int ext3_group_extend(struct super_block *sb, + struct ext3_super_block *es, + ext3_fsblk_t n_blocks_count); + +/* super.c */ +extern __printf(3, 4) +void ext3_error(struct super_block *, const char *, const char *, ...); +extern void __ext3_std_error (struct super_block *, const char *, int); +extern __printf(3, 4) +void ext3_abort(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_msg(struct super_block *, const char *, const char *, ...); +extern void ext3_update_dynamic_rev (struct super_block *sb); + +#define ext3_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext3_std_error((sb), __func__, (errno)); \ +} while (0) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext3_dir_operations; + +/* file.c */ +extern const struct inode_operations ext3_file_inode_operations; +extern const struct file_operations ext3_file_operations; + +/* namei.c */ +extern const struct inode_operations ext3_dir_inode_operations; +extern const struct inode_operations ext3_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext3_symlink_inode_operations; +extern const struct inode_operations ext3_fast_symlink_inode_operations; + +#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. */ + +#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT3_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ + EXT3_XATTR_TRANS_BLOCKS - 2 + \ + EXT3_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT3_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT3_RESERVE_TRANS_BLOCKS 12U + +#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only inode+data */ +#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) +#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) + +int +ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc); + +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext3 calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext3 can control + * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't + * been done yet. + */ + +static inline void ext3_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + journal_release_buffer(handle, bh); +} + +void ext3_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext3_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh); + +int __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh); + +int __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh); + +#define ext3_journal_get_undo_access(handle, bh) \ + __ext3_journal_get_undo_access(__func__, (handle), (bh)) +#define ext3_journal_get_write_access(handle, bh) \ + __ext3_journal_get_write_access(__func__, (handle), (bh)) +#define ext3_journal_revoke(handle, blocknr, bh) \ + __ext3_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext3_journal_get_create_access(handle, bh) \ + __ext3_journal_get_create_access(__func__, (handle), (bh)) +#define ext3_journal_dirty_metadata(handle, bh) \ + __ext3_journal_dirty_metadata(__func__, (handle), (bh)) +#define ext3_journal_forget(handle, bh) \ + __ext3_journal_forget(__func__, (handle), (bh)) + +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); +int __ext3_journal_stop(const char *where, handle_t *handle); + +static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) +{ + return ext3_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext3_journal_stop(handle) \ + __ext3_journal_stop(__func__, (handle)) + +static inline handle_t *ext3_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext3_journal_extend(handle_t *handle, int nblocks) +{ + return journal_extend(handle, nblocks); +} + +static inline int ext3_journal_restart(handle_t *handle, int nblocks) +{ + return journal_restart(handle, nblocks); +} + +static inline int ext3_journal_blocks_per_page(struct inode *inode) +{ + return journal_blocks_per_page(inode); +} + +static inline int ext3_journal_force_commit(journal_t *journal) +{ + return journal_force_commit(journal); +} + +/* super.c */ +int ext3_force_commit(struct super_block *sb); + +static inline int ext3_should_journal_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) + return 1; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 1; + return 0; +} + +static inline int ext3_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) + return 1; + return 0; +} + +static inline int ext3_should_writeback_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) + return 1; + return 0; +} + +#include diff --git a/kernel/fs/ext3/ext3_jbd.c b/kernel/fs/ext3/ext3_jbd.c new file mode 100644 index 000000000..785a3261a --- /dev/null +++ b/kernel/fs/ext3/ext3_jbd.c @@ -0,0 +1,59 @@ +/* + * Interface between ext3 and JBD + */ + +#include "ext3.h" + +int __ext3_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_get_undo_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_get_write_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_forget(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh) +{ + int err = journal_revoke(handle, blocknr, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = journal_get_create_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_metadata(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} diff --git a/kernel/fs/ext3/file.c b/kernel/fs/ext3/file.c new file mode 100644 index 000000000..3b8f650de --- /dev/null +++ b/kernel/fs/ext3/file.c @@ -0,0 +1,79 @@ +/* + * linux/fs/ext3/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include "ext3.h" +#include "xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext3_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext3_release_file (struct inode * inode, struct file * filp) +{ + if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { + filemap_flush(inode->i_mapping); + ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + { + mutex_lock(&EXT3_I(inode)->truncate_mutex); + ext3_discard_reservation(inode); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + } + if (is_dx(inode) && filp->private_data) + ext3_htree_free_dir_info(filp->private_data); + + return 0; +} + +const struct file_operations ext3_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = dquot_file_open, + .release = ext3_release_file, + .fsync = ext3_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +}; + +const struct inode_operations ext3_file_inode_operations = { + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, + .fiemap = ext3_fiemap, +}; + diff --git a/kernel/fs/ext3/fsync.c b/kernel/fs/ext3/fsync.c new file mode 100644 index 000000000..1cb9c7e10 --- /dev/null +++ b/kernel/fs/ext3/fsync.c @@ -0,0 +1,109 @@ +/* + * linux/fs/ext3/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include +#include +#include "ext3.h" + +/* + * akpm: A new design for ext3_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + */ + +int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + int ret, needs_barrier = 0; + tid_t commit_tid; + + trace_ext3_sync_file_enter(file, datasync); + + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated state */ + smp_rmb(); + if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS) + return -EROFS; + return 0; + } + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + goto out; + + J_ASSERT(ext3_journal_current_handle() == NULL); + + /* + * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. + * Metadata is in the journal, we wait for a proper transaction + * to commit here. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext3_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext3_should_journal_data(inode)) { + ret = ext3_force_commit(inode->i_sb); + goto out; + } + + if (datasync) + commit_tid = atomic_read(&ei->i_datasync_tid); + else + commit_tid = atomic_read(&ei->i_sync_tid); + + if (test_opt(inode->i_sb, BARRIER) && + !journal_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = 1; + log_start_commit(journal, commit_tid); + ret = log_wait_commit(journal, commit_tid); + + /* + * In case we didn't commit a transaction, we have to flush + * disk caches manually so that data really is on persistent + * storage + */ + if (needs_barrier) { + int err; + + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } +out: + trace_ext3_sync_file_exit(inode, ret); + return ret; +} diff --git a/kernel/fs/ext3/hash.c b/kernel/fs/ext3/hash.c new file mode 100644 index 000000000..ede315cdf --- /dev/null +++ b/kernel/fs/ext3/hash.c @@ -0,0 +1,206 @@ +/* + * linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. + */ + +#include "ext3.h" +#include + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while(--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. + */ +int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i=0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/kernel/fs/ext3/ialloc.c b/kernel/fs/ext3/ialloc.c new file mode 100644 index 000000000..3ad242e58 --- /dev/null +++ b/kernel/fs/ext3/ialloc.c @@ -0,0 +1,706 @@ +/* + * linux/fs/ext3/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include + +#include "ext3.h" +#include "xattr.h" +#include "acl.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +read_inode_bitmap(struct super_block * sb, unsigned long block_group) +{ + struct ext3_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext3_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + if (!bh) + ext3_error(sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %u", + block_group, le32_to_cpu(desc->bg_inode_bitmap)); +error_out: + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext3_free_inode (handle_t *handle, struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + unsigned long block_group; + unsigned long bit; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + struct ext3_sb_info *sbi; + int fatal = 0, err; + + if (atomic_read(&inode->i_count) > 1) { + printk ("ext3_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk ("ext3_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk("ext3_free_inode: inode on nonexistent device\n"); + return; + } + sbi = EXT3_SB(sb); + + ino = inode->i_ino; + ext3_debug ("freeing inode %lu\n", ino); + trace_ext3_free_inode(inode); + + is_directory = S_ISDIR(inode->i_mode); + + es = EXT3_SB(sb)->s_es; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_free_inode", + "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit, bitmap_bh->b_data)) + ext3_error (sb, "ext3_free_inode", + "bit already cleared for inode %lu", ino); + else { + gdp = ext3_get_group_desc (sb, block_group, &bh2); + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + + if (gdp) { + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&gdp->bg_free_inodes_count, 1); + if (is_directory) + le16_add_cpu(&gdp->bg_used_dirs_count, -1); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (is_directory) + percpu_counter_dec(&sbi->s_dirs_counter); + + } + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!fatal) + fatal = err; + +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, fatal); +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks). + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. + */ + +static int find_group_orlov(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3_I(parent)->i_block_group; + struct ext3_sb_info *sbi = EXT3_SB(sb); + int ngroups = sbi->s_groups_count; + int inodes_per_group = EXT3_INODES_PER_GROUP(sb); + unsigned int freei, avefreei; + ext3_fsblk_t freeb, avefreeb; + unsigned int ndirs; + int max_dirs, min_inodes; + ext3_grpblk_t min_blocks; + int group = -1, i; + struct ext3_group_desc *desc; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == d_inode(sb->s_root)) || + (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { + int best_ndir = inodes_per_group; + int best_group = -1; + + group = prandom_u32(); + parent_group = (unsigned)group % ngroups; + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + best_group = group; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + } + if (best_group >= 0) + return best_group; + goto fallback; + } + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; + + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + return group; + } + +fallback: + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + return group; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3_I(parent)->i_block_group; + int ngroups = EXT3_SB(sb)->s_groups_count; + struct ext3_group_desc *desc; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + return group; + } + + return -1; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, + const struct qstr *qstr, umode_t mode) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + int group; + unsigned long ino = 0; + struct inode * inode; + struct ext3_group_desc * gdp = NULL; + struct ext3_super_block * es; + struct ext3_inode_info *ei; + struct ext3_sb_info *sbi; + int err = 0; + struct inode *ret; + int i; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + trace_ext3_request_inode(dir, mode); + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT3_I(inode); + + sbi = EXT3_SB(sb); + es = sbi->s_es; + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else + group = find_group_other(sb, dir); + + err = -ENOSPC; + if (group == -1) + goto out; + + for (i = 0; i < sbi->s_groups_count; i++) { + err = -EIO; + + gdp = ext3_get_group_desc(sb, group, &bh2); + if (!gdp) + goto fail; + + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) + goto fail; + + ino = 0; + +repeat_in_this_group: + ino = ext3_find_next_zero_bit((unsigned long *) + bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino); + if (ino < EXT3_INODES_PER_GROUP(sb)) { + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) + goto fail; + + if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we won it */ + BUFFER_TRACE(bitmap_bh, + "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, + bitmap_bh); + if (err) + goto fail; + goto got; + } + /* we lost it */ + journal_release_buffer(handle, bitmap_bh); + + if (++ino < EXT3_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + + /* + * This case is possible in concurrent environment. It is very + * rare. We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. + */ + if (++group == sbi->s_groups_count) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + ino += group * EXT3_INODES_PER_GROUP(sb) + 1; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d, inode=%lu", group, ino); + err = -EIO; + goto fail; + } + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh2); + if (err) goto fail; + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); + if (S_ISDIR(mode)) { + le16_add_cpu(&gdp->bg_used_dirs_count, 1); + } + spin_unlock(sb_bgl_lock(sbi, group)); + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino; + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + ei->i_flags = + ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); +#ifdef EXT3_FRAGMENTS + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; +#endif + ei->i_file_acl = 0; + ei->i_dir_acl = 0; + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; + + ext3_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + handle->h_sync = 1; + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; + } + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ei->i_state_flags = 0; + ext3_set_inode_state(inode, EXT3_STATE_NEW); + + /* See comment in ext3_iget for explanation */ + if (ino >= EXT3_FIRST_INO(sb) + 1 && + EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = + sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE; + } else { + ei->i_extra_isize = 0; + } + + ret = inode; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext3_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext3_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); + goto fail_free_drop; + } + + ext3_debug("allocating inode %lu\n", inode->i_ino); + trace_ext3_allocate_inode(inode, dir, mode); + goto really_out; +fail: + ext3_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + brelse(bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); + unsigned long block_group; + int bit; + struct buffer_head *bitmap_bh; + struct inode *inode = NULL; + long err = -EIO; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext3_warning(sb, __func__, + "bad orphan ino %lu! e2fsck was run?", ino); + goto error; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext3_warning(sb, __func__, + "inode bitmap error for orphan %lu", ino); + goto error; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext3_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext3_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext3_can_truncate(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext3_warning(sb, __func__, + "bad orphan inode %lu! e2fsck was run?", ino); + printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext3_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); +error: + return ERR_PTR(err); +} + +unsigned long ext3_count_free_inodes (struct super_block * sb) +{ + unsigned long desc_count; + struct ext3_group_desc *gdp; + int i; +#ifdef EXT3FS_DEBUG + struct ext3_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext3_count_dirs (struct super_block * sb) +{ + unsigned long count = 0; + int i; + + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + diff --git a/kernel/fs/ext3/inode.c b/kernel/fs/ext3/inode.c new file mode 100644 index 000000000..2ee2dc435 --- /dev/null +++ b/kernel/fs/ext3/inode.c @@ -0,0 +1,3573 @@ +/* + * linux/fs/ext3/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include "ext3.h" +#include "xattr.h" +#include "acl.h" + +static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); + +/* + * Test whether an inode is a fast symlink. + */ +static int ext3_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT3_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + trace_ext3_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext3_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call journal_forget"); + return ext3_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext3_journal_revoke"); + err = ext3_journal_revoke(handle, blocknr, bh); + if (err) + ext3_abort(inode->i_sb, __func__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static unsigned long blocks_for_truncate(struct inode *inode) +{ + unsigned long needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext3 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + + return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext3_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext3_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) + return 0; + if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int truncate_restart_transaction(handle_t *handle, struct inode *inode) +{ + int ret; + + jbd_debug(2, "restarting handle %p\n", handle); + /* + * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle + * At this moment, get_block can be called only for blocks inside + * i_size since page cache has been already dropped and writes are + * blocked by i_mutex. So we can safely drop the truncate_mutex. + */ + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); + mutex_lock(&EXT3_I(inode)->truncate_mutex); + return ret; +} + +/* + * Called at inode eviction from icache + */ +void ext3_evict_inode (struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *rsv; + handle_t *handle; + int want_delete = 0; + + trace_ext3_evict_inode(inode); + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + want_delete = 1; + } + + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + * + * The s_journal check handles the case when ext3_get_journal() fails + * and puts the journal inode. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + EXT3_SB(inode->i_sb)->s_journal && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT3_JOURNAL_INO) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } + truncate_inode_pages_final(&inode->i_data); + + ext3_discard_reservation(inode); + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); + + if (!want_delete) + goto no_delete; + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext3_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext3_truncate(inode); + /* + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. + */ + ext3_orphan_del(handle, inode); + ei->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext3_mark_inode_dirty(handle, inode)) { + /* If that failed, just dquot_drop() and be done with that */ + dquot_drop(inode); + clear_inode(inode); + } else { + ext3_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + clear_inode(inode); + ext3_free_inode(handle, inode); + } + ext3_journal_stop(handle); + return; +no_delete: + clear_inode(inode); + dquot_drop(inode); +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext3_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext3 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext3_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT3_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT3_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT3_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT3_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big"); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext3_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + */ +static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext3_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *p; + ext3_fsblk_t bg_start; + ext3_grpblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext3_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + */ + +static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, + Indirect *partial) +{ + struct ext3_block_alloc_info *block_i; + + block_i = EXT3_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; + } + + return ext3_find_near(inode, partial); +} + +/** + * ext3_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext3_alloc_blocks - multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: owner + * @goal: preferred place for allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of blocks need to allocated for direct blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: here we store the error value + * + * return the number of direct blocks allocated + */ +static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int indirect_blks, int blks, + ext3_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext3_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext3_new_blocks(handle,inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i key). Upon the exit we have the same + * picture as after the successful ext3_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext3_alloc_branch(handle_t *handle, struct inode *inode, + int indirect_blks, int *blks, ext3_fsblk_t goal, + int *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext3_fsblk_t new_blocks[4]; + ext3_fsblk_t current_block; + + num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + for (i = 1; i <= n ; i++) { + BUFFER_TRACE(branch[i].bh, "call journal_forget"); + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < indirect_blks; i++) + ext3_free_blocks(handle, inode, new_blocks[i], 1); + + ext3_free_blocks(handle, inode, new_blocks[i], num); + + return err; +} + +/** + * ext3_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext3_splice_branch(handle_t *handle, struct inode *inode, + long block, Indirect *where, int num, int blks) +{ + int i; + int err = 0; + struct ext3_block_alloc_info *block_i; + ext3_fsblk_t current_block; + struct ext3_inode_info *ei = EXT3_I(inode); + struct timespec now; + + block_i = ei->i_block_alloc_info; + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + now = CURRENT_TIME_SEC; + if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { + inode->i_ctime = now; + ext3_mark_inode_dirty(handle, inode); + } + /* ext3_mark_inode_dirty already updated i_sync_tid */ + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + BUFFER_TRACE(where[i].bh, "call journal_forget"); + ext3_journal_forget(handle, where[i].bh); + ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); + } + ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); + + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * The BKL may not be held on entry here. Be sure to take it early. + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + */ +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext3_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext3_inode_info *ei = EXT3_I(inode); + int count = 0; + ext3_fsblk_t first_block = 0; + + + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); + J_ASSERT(handle != NULL || create == 0); + depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + clear_buffer_new(bh_result); + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext3_fsblk_t blk; + + if (!verify_chain(chain, chain + depth - 1)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now. Flag the err as EAGAIN, so it + * will reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + /* + * Block out ext3_truncate while we alter the tree + */ + mutex_lock(&ei->truncate_mutex); + + /* + * If the indirect block is missing while we are reading + * the chain(ext3_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext3_init_block_alloc_info(inode); + + goal = ext3_find_goal(inode, iblock, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext3_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext3_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext3_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); +out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); + return err; +} + +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 +/* + * Number of credits we need for writing DIO_MAX_BLOCKS: + * We need sb + group descriptor + bitmap + inode -> 4 + * For B blocks with A block pointers per block we need: + * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). + * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. + */ +#define DIO_CREDITS 25 + +static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = ext3_journal_current_handle(); + int ret = 0, started = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + if (create && !handle) { /* Direct IO write... */ + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + handle = ext3_journal_start(inode, DIO_CREDITS + + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; + } + + ret = ext3_get_blocks_handle(handle, inode, iblock, + max_blocks, bh_result, create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + if (started) + ext3_journal_stop(handle); +out: + return ret; +} + +int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext3_get_block); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, + long block, int create, int *errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + err = ext3_get_blocks_handle(handle, inode, block, 1, + &dummy, create); + /* + * ext3_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + WARN_ON(err > 1); + err = 0; + } + *errp = err; + if (!err && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (unlikely(!bh)) { + *errp = -ENOMEM; + goto err; + } + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext3_get_block instead, so it's not a + * problem. + */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext3_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data,0,inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } +err: + return NULL; +} + +struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, + int block, int create, int *err) +{ + struct buffer_head * bh; + + bh = ext3_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (bh_uptodate_or_lock(bh)) + return bh; + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext3_get_block() + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext3_writepage() -> + * block_write_full_page(). In that case, we *know* that ext3_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext3 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + int dirty = buffer_dirty(bh); + int ret; + + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext3_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext3_journal_dirty_metadata(handle, bh); + return ret; +} + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext3_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext3_truncate(inode); +} + +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + +static int ext3_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret; + handle_t *handle; + int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + /* Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason */ + int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + + trace_ext3_write_begin(inode, pos, len, flags); + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + handle = ext3_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + unlock_page(page); + page_cache_release(page); + ret = PTR_ERR(handle); + goto out; + } + ret = __block_write_begin(page, pos, len, ext3_get_block); + if (ret) + goto write_begin_failed; + + if (ext3_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } +write_begin_failed: + if (ret) { + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before truncate + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + } + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + + +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + ext3_journal_abort_handle(__func__, __func__, + bh, handle, err); + return err; +} + +/* For ordered writepage and write_end functions */ +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + /* + * Write could have mapped the buffer but it didn't copy the data in + * yet. So avoid filing such buffer into a transaction. + */ + if (buffer_mapped(bh) && buffer_uptodate(bh)) + return ext3_journal_dirty_data(handle, bh); + return 0; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext3_journal_dirty_metadata(handle, bh); +} + +/* + * This is nasty and subtle: ext3_write_begin() could have allocated blocks + * for the whole page but later we failed to copy the data in. Update inode + * size according to what we managed to copy. The rest is going to be + * truncated in write_end function. + */ +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) +{ + /* What matters to us is i_disksize. We don't write i_size anywhere */ + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + if (pos + copied > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = pos + copied; + mark_inode_dirty(inode); + } +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext3 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext3_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = file->f_mapping->host; + unsigned from, to; + int ret = 0, ret2; + + trace_ext3_ordered_write_end(inode, pos, len, copied); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + copied; + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, journal_dirty_data_fn); + + if (ret == 0) + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +static int ext3_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = file->f_mapping->host; + int ret; + + trace_ext3_writeback_write_end(inode, pos, len, copied); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ret = ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +static int ext3_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + + trace_ext3_journalled_write_end(inode, pos, len, copied); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from + copied, to); + to = from + copied; + } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; + ret2 = ext3_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext3 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext3_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT3_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. + */ + + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); + journal = EXT3_JOURNAL(inode); + journal_lock_updates(journal); + err = journal_flush(journal); + journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping,block,ext3_get_block); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +/* + * Note that whenever we need to map blocks we start a transaction even if + * we're not journalling data. This is to preserve ordering: any hole + * instantiation within __block_write_full_page -> ext3_get_block() should be + * journalled along with the data so we don't crash and then get metadata which + * refers to old data. + * + * In all journalling modes block_write_full_page() will start the I/O. + * + * We don't honour synchronous mounts for writepage(). That would be + * disastrous. Any write() or metadata operation will sync the fs for + * us. + */ +static int ext3_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + /* + * We give up here if we're reentered, because it might be for a + * different filesystem. + */ + if (ext3_journal_current_handle()) + goto out_fail; + + trace_ext3_ordered_writepage(page); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bget_one); + + ret = block_write_full_page(page, ext3_get_block, wbc); + + /* + * The page can become unlocked at any point now, and + * truncate can then come in and change things. So we + * can't touch *page from now on. But *page_bufs is + * safe due to elevated refcount. + */ + + /* + * And attach them to the current transaction. But only if + * block_write_full_page() succeeded. Otherwise they are unmapped, + * and generally junk. + */ + if (ret == 0) + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, journal_dirty_data_fn); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + if (ext3_journal_current_handle()) + goto out_fail; + + trace_ext3_writeback_writepage(page); + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + ret = block_write_full_page(page, ext3_get_block, wbc); + + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + trace_ext3_journalled_writepage(page); + if (!page_has_buffers(page) || PageChecked(page)) { + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, + ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); + if (ret != 0) { + ext3_journal_stop(handle); + goto out_unlock; + } + ret = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + + err = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); + unlock_page(page); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + } else { + /* + * It is a page full of checkpoint-mode buffers. Go and write + * them. They should have been already mapped when they went + * to the journal so provide NULL get_block function to catch + * errors. + */ + ret = block_write_full_page(page, NULL, wbc); + } +out: + return ret; + +no_write: + redirty_page_for_writepage(wbc, page); +out_unlock: + unlock_page(page); + goto out; +} + +static int ext3_readpage(struct file *file, struct page *page) +{ + trace_ext3_readpage(page); + return mpage_readpage(page, ext3_get_block); +} + +static int +ext3_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); +} + +static void ext3_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + + trace_ext3_invalidatepage(page, offset, length); + + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0 && length == PAGE_CACHE_SIZE) + ClearPageChecked(page); + + journal_invalidatepage(journal, page, offset, length); +} + +static int ext3_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + + trace_ext3_releasepage(page); + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + return journal_try_to_free_buffers(journal, page, wait); +} + +/* + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_iter_count(iter); + int retries = 0; + + trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + + if (iov_iter_rw(iter) == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext3_orphan_add(handle, inode); + if (ret) { + ext3_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext3_journal_stop(handle); + } + } + +retry: + ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block); + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if (end > isize) + ext3_truncate_failed_direct_write(inode); + } + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... */ + ext3_truncate_failed_direct_write(inode); + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + goto out; + } + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext3_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext3_mark_inode_dirty(handle, inode); + } + } + err = ext3_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext3's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. + */ +static int ext3_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext3_ordered_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_ordered_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_ordered_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .is_dirty_writeback = buffer_check_dirty_writeback, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext3_writeback_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_writeback_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_writeback_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext3_journalled_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_journalled_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_journalled_write_end, + .set_page_dirty = ext3_journalled_set_page_dirty, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void ext3_set_aops(struct inode *inode) +{ + if (ext3_should_order_data(inode)) + inode->i_mapping->a_ops = &ext3_ordered_aops; + else if (ext3_should_writeback_data(inode)) + inode->i_mapping->a_ops = &ext3_writeback_aops; + else + inode->i_mapping->a_ops = &ext3_journalled_aops; +} + +/* + * ext3_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +static int ext3_block_truncate_page(struct inode *inode, loff_t from) +{ + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE - 1); + unsigned blocksize, iblock, length, pos; + struct page *page; + handle_t *handle = NULL; + struct buffer_head *bh; + int err = 0; + + /* Truncated on block boundary - nothing to do */ + blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext3_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (err) + goto unlock; + } + + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + + if (ext3_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto stop; + } + + zero_user(page, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext3_should_journal_data(inode)) { + err = ext3_journal_dirty_metadata(handle, bh); + } else { + if (ext3_should_order_data(inode)) + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } +stop: + if (handle) + ext3_journal_stop(handle); + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext3_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext3_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext3_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext3_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext3_find_shared(struct inode *inode, int depth, + int offsets[4], Indirect chain[4], __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext3_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext3. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while(partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + */ +static void ext3_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t block_to_free, + unsigned long count, __le32 *first, __le32 *last) +{ + __le32 *p; + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + if (ext3_journal_dirty_metadata(handle, bh)) + return; + } + ext3_mark_inode_dirty(handle, inode); + truncate_restart_transaction(handle, inode); + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + if (ext3_journal_get_write_access(handle, bh)) + return; + } + } + + /* + * Any buffers which are on the journal will be in memory. We find + * them on the hash table so journal_revoke() will run journal_forget() + * on them. We've already detached each block from the file, so + * bforget() in journal_forget() should be safe. + * + * AKPM: turn on bforget in journal_forget()!!! + */ + for (p = first; p < last; p++) { + u32 nr = le32_to_cpu(*p); + if (nr) { + struct buffer_head *bh; + + *p = 0; + bh = sb_find_get_block(inode->i_sb, nr); + ext3_forget(handle, 0, inode, bh, nr); + } + } + + ext3_free_blocks(handle, inode, block_to_free, count); +} + +/** + * ext3_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext3_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext3_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + ext3_clear_blocks(handle, inode, this_bh, + block_to_free, + count, block_to_free_p, p); + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (count > 0) + ext3_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext3_journal_dirty_metadata(handle, this_bh); + else + ext3_error(inode->i_sb, "ext3_free_data", + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)this_bh->b_blocknr); + } +} + +/** + * ext3_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext3_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext3_fsblk_t nr; + __le32 *p; + + if (is_handle_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext3_error(inode->i_sb, "ext3_free_branches", + "Read failure, inode=%lu, block="E3FSBLK, + inode->i_ino, nr); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext3_free_branches(handle, inode, bh, + (__le32*)bh->b_data, + (__le32*)bh->b_data + addr_per_block, + depth); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (is_handle_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext3_mark_inode_dirty(handle, inode); + truncate_restart_transaction(handle, inode); + } + + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. Thus + * we don't allow a block to be reallocated until + * a transaction freeing it has fully committed. + * + * We also have to make sure journal replay after a + * crash does not overwrite non-journaled data blocks + * with old metadata when the block got reallocated for + * data. Thus we have to store a revoke record for a + * block in the same transaction in which we free the + * block. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + + ext3_free_blocks(handle, inode, nr, 1); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext3_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext3_free_data(handle, inode, parent_bh, first, last); + } +} + +int ext3_can_truncate(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext3_inode_is_fast_symlink(inode); + return 0; +} + +/* + * ext3_truncate() + * + * We block out ext3_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext3_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext3_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext3 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext3_truncate() run will find them and release them. + */ +void ext3_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + long last_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + trace_ext3_truncate_enter(inode); + + if (!ext3_can_truncate(inode)) + goto out_notrans; + + if (inode->i_size == 0 && ext3_should_writeback_data(inode)) + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); + + handle = start_transaction(inode); + if (IS_ERR(handle)) + goto out_notrans; + + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + n = ext3_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. + */ + mutex_lock(&ei->truncate_mutex); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT3_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext3_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + ext3_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext3_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; + } + + ext3_discard_reservation(inode); + + mutex_unlock(&ei->truncate_mutex); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + handle->h_sync = 1; +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_evict_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); + return; +out_notrans: + /* + * Delete the inode from orphan list so that it doesn't stay there + * forever and trigger assertion on umount. + */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); +} + +static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, + unsigned long ino, struct ext3_iloc *iloc) +{ + unsigned long block_group; + unsigned long offset; + ext3_fsblk_t block; + struct ext3_group_desc *gdp; + + if (!ext3_valid_inum(sb, ino)) { + /* + * This error is already checked for in namei.c unless we are + * looking at an NFS filehandle, in which case no error + * report is needed + */ + return 0; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + gdp = ext3_get_group_desc(sb, block_group, NULL); + if (!gdp) + return 0; + /* + * Figure out the offset within the block group inode table + */ + offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * + EXT3_INODE_SIZE(sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); + + iloc->block_group = block_group; + iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); + return block; +} + +/* + * ext3_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext3_get_inode_loc(struct inode *inode, + struct ext3_iloc *iloc, int in_mem) +{ + ext3_fsblk_t block; + struct buffer_head *bh; + + block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); + if (!block) + return -EIO; + + bh = sb_getblk(inode->i_sb, block); + if (unlikely(!bh)) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + return -ENOMEM; + } + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + struct ext3_group_desc *desc; + int inodes_per_buffer; + int inode_offset, i; + int block_group; + int start; + + block_group = (inode->i_ino - 1) / + EXT3_INODES_PER_GROUP(inode->i_sb); + inodes_per_buffer = bh->b_size / + EXT3_INODE_SIZE(inode->i_sb); + inode_offset = ((inode->i_ino - 1) % + EXT3_INODES_PER_GROUP(inode->i_sb)); + start = inode_offset & ~(inodes_per_buffer - 1); + + /* Is the inode bitmap in cache? */ + desc = ext3_get_group_desc(inode->i_sb, + block_group, NULL); + if (!desc) + goto make_io; + + bitmap_bh = sb_getblk(inode->i_sb, + le32_to_cpu(desc->bg_inode_bitmap)); + if (unlikely(!bitmap_bh)) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_buffer; i++) { + if (i == inode_offset) + continue; + if (ext3_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_buffer) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + trace_ext3_load_inode(inode); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext3_error(inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. */ + return __ext3_get_inode_loc(inode, iloc, + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); +} + +void ext3_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT3_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT3_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT3_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ +void ext3_get_inode_flags(struct ext3_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| + EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT3_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT3_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT3_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT3_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT3_DIRSYNC_FL; +} + +struct inode *ext3_iget(struct super_block *sb, unsigned long ino) +{ + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; + struct ext3_inode_info *ei; + struct buffer_head *bh; + struct inode *inode; + journal_t *journal = EXT3_SB(sb)->s_journal; + transaction_t *transaction; + long ret; + int block; + uid_t i_uid; + gid_t i_gid; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT3_I(inode); + ei->i_block_alloc_info = NULL; + + ret = __ext3_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; + bh = iloc.bh; + raw_inode = ext3_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; + + ei->i_state_flags = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + ei->i_flags = le32_to_cpu(raw_inode->i_flags); +#ifdef EXT3_FRAGMENTS + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; +#endif + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } + ei->i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT3_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + atomic_set(&ei->i_sync_tid, tid); + atomic_set(&ei->i_datasync_tid, tid); + } + + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && + EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { + /* + * When mke2fs creates big inodes it does not zero out + * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, + * so ignore those first few inodes. + */ + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT3_INODE_SIZE(inode->i_sb)) { + brelse (bh); + ret = -EIO; + goto bad_inode; + } + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. */ + ei->i_extra_isize = sizeof(struct ext3_inode) - + EXT3_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT3_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) + ext3_set_inode_state(inode, EXT3_STATE_XATTR); + } + } else + ei->i_extra_isize = 0; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext3_inode_is_fast_symlink(inode)) { + inode->i_op = &ext3_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext3_symlink_inode_operations; + ext3_set_aops(inode); + } + } else { + inode->i_op = &ext3_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse (iloc.bh); + ext3_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) +{ + struct ext3_inode *raw_inode = ext3_raw_inode(iloc); + struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + int need_datasync = 0; + __le32 disksize; + uid_t i_uid; + gid_t i_gid; + +again: + /* we can't allow multiple procs in here at once, its a bit racey */ + lock_buffer(bh); + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) + memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + + ext3_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + disksize = cpu_to_le32(ei->i_disksize); + if (disksize != raw_inode->i_size) { + need_datasync = 1; + raw_inode->i_size = disksize; + } + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); +#ifdef EXT3_FRAGMENTS + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; +#endif + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + } else { + disksize = cpu_to_le32(ei->i_disksize >> 32); + if (disksize != raw_inode->i_size_high) { + raw_inode->i_size_high = disksize; + need_datasync = 1; + } + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT3_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT3_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + unlock_buffer(bh); + err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh); + if (err) + goto out_brelse; + + ext3_update_dynamic_rev(sb); + EXT3_SET_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, + EXT3_SB(sb)->s_sbh); + /* get our lock and start over */ + goto again; + } + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + if (ei->i_extra_isize) + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + unlock_buffer(bh); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); + if (need_datasync) + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); +out_brelse: + brelse (bh); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * transaction to commit. + * + * - Within flush work (for sys_sync(), kupdate and such). + * We wait on commit, if told to. + * + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. + */ +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + return 0; + + if (ext3_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext3_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) + return 0; + + return ext3_force_commit(inode->i_sb); +} + +/* + * ext3_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Called with inode->sem down. + */ +int ext3_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error, rc = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = dquot_transfer(inode, attr); + if (error) { + ext3_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle); + } + + if (attr->ia_valid & ATTR_SIZE) + inode_dio_wait(inode); + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + handle_t *handle; + + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } + EXT3_I(inode)->i_disksize = attr->ia_size; + error = ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (ia_valid & ATTR_MODE) + rc = posix_acl_chmod(inode, inode->i_mode); + +err_out: + ext3_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + + +/* + * How many blocks doth make a writepage()? + * + * With N blocks per page, it may be: + * N data blocks + * 2 indirect block + * 2 dindirect + * 1 tindirect + * N+5 bitmap blocks (from the above) + * N+5 group descriptor summary blocks + * 1 inode block + * 1 superblock. + * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files + * + * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS + * + * With ordered or writeback data it's the same, less the N data blocks. + * + * If the inode's direct blocks can hold an integral number of pages then a + * page cannot straddle two indirect blocks, and we can only touch one indirect + * and dindirect block, and the "5" above becomes "3". + * + * This still overestimates under most circumstances. If we were to pass the + * start and end offsets in here as well we could do block_to_path() on each + * block and work out the exact number of indirects which are touched. Pah. + */ + +static int ext3_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else + ret = 2 * (bpp + indirects) + indirects + 2; + +#ifdef CONFIG_QUOTA + /* We know that structure was already allocated during dquot_initialize so + * we will be updating only the data blocks + inodes */ + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); +#endif + + return ret; +} + +/* + * The caller must have previously called ext3_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext3_iloc *iloc) +{ + int err = 0; + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext3_do_update_inode() does journal_dirty_metadata */ + err = ext3_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc) +{ + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + */ +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + int err; + + might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (!err) + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext3_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, dquot_alloc_space() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext3_dirty_inode(struct inode *inode, int flags) +{ + handle_t *current_handle = ext3_journal_current_handle(); + handle_t *handle; + + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG "%s: transactions do not match!\n", + __func__); + } else { + jbd_debug(5, "marking dirty. outer handle=%p\n", + current_handle); + ext3_mark_inode_dirty(handle, inode); + } + ext3_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext3_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static int ext3_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext3_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext3_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext3_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT3_JOURNAL(inode); + if (is_journal_aborted(journal)) + return -EROFS; + + journal_lock_updates(journal); + journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; + else + EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; + ext3_set_aops(inode); + + journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext3_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext3_journal_stop(handle); + ext3_std_error(inode->i_sb, err); + + return err; +} diff --git a/kernel/fs/ext3/ioctl.c b/kernel/fs/ext3/ioctl.c new file mode 100644 index 000000000..4d96e9a64 --- /dev/null +++ b/kernel/fs/ext3/ioctl.c @@ -0,0 +1,327 @@ +/* + * linux/fs/ext3/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include "ext3.h" + +long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned int flags; + unsigned short rsv_window_size; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: + ext3_get_inode_flags(ei); + flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err; + struct ext3_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext3_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + + /* Is it quota file? Do not allow user to mess with it */ + err = -EPERM; + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + ei->i_flags = flags; + + ext3_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME_SEC; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext3_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) + err = ext3_change_inode_journal_flag(inode, jflag); +flags_out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(filp); + return err; + } + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT3_IOC_SETVERSION: + case EXT3_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext3_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + err = mnt_want_write_file(filp); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + mutex_lock(&inode->i_mutex); + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto unlock_out; + } + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + } + ext3_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); +setversion_out: + mnt_drop_write_file(filp); + return err; + } + case EXT3_IOC_GETRSVSZ: + if (test_opt(inode->i_sb, RESERVATION) + && S_ISREG(inode->i_mode) + && ei->i_block_alloc_info) { + rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; + return put_user(rsv_window_size, (int __user *)arg); + } + return -ENOTTY; + case EXT3_IOC_SETRSVSZ: { + int err; + + if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) + return -ENOTTY; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto setrsvsz_out; + } + + if (get_user(rsv_window_size, (int __user *)arg)) { + err = -EFAULT; + goto setrsvsz_out; + } + + if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; + + /* + * need to allocate reservation structure for this inode + * before set the window size + */ + mutex_lock(&ei->truncate_mutex); + if (!ei->i_block_alloc_info) + ext3_init_block_alloc_info(inode); + + if (ei->i_block_alloc_info){ + struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; + rsv->rsv_goal_size = rsv_window_size; + } + mutex_unlock(&ei->truncate_mutex); +setrsvsz_out: + mnt_drop_write_file(filp); + return err; + } + case EXT3_IOC_GROUP_EXTEND: { + ext3_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err, err2; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } + err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); + journal_lock_updates(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err == 0) + err = err2; +group_extend_out: + mnt_drop_write_file(filp); + return err; + } + case EXT3_IOC_GROUP_ADD: { + struct ext3_new_group_data input; + struct super_block *sb = inode->i_sb; + int err, err2; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } + + err = ext3_group_add(sb, &input); + journal_lock_updates(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err == 0) + err = err2; +group_add_out: + mnt_drop_write_file(filp); + return err; + } + case FITRIM: { + + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext3_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT3_IOC32_GETFLAGS: + cmd = EXT3_IOC_GETFLAGS; + break; + case EXT3_IOC32_SETFLAGS: + cmd = EXT3_IOC_SETFLAGS; + break; + case EXT3_IOC32_GETVERSION: + cmd = EXT3_IOC_GETVERSION; + break; + case EXT3_IOC32_SETVERSION: + cmd = EXT3_IOC_SETVERSION; + break; + case EXT3_IOC32_GROUP_EXTEND: + cmd = EXT3_IOC_GROUP_EXTEND; + break; + case EXT3_IOC32_GETVERSION_OLD: + cmd = EXT3_IOC_GETVERSION_OLD; + break; + case EXT3_IOC32_SETVERSION_OLD: + cmd = EXT3_IOC_SETVERSION_OLD; + break; +#ifdef CONFIG_JBD_DEBUG + case EXT3_IOC32_WAIT_FOR_READONLY: + cmd = EXT3_IOC_WAIT_FOR_READONLY; + break; +#endif + case EXT3_IOC32_GETRSVSZ: + cmd = EXT3_IOC_GETRSVSZ; + break; + case EXT3_IOC32_SETRSVSZ: + cmd = EXT3_IOC_SETRSVSZ; + break; + case EXT3_IOC_GROUP_ADD: + break; + default: + return -ENOIOCTLCMD; + } + return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/kernel/fs/ext3/namei.c b/kernel/fs/ext3/namei.c new file mode 100644 index 000000000..4264b9bd0 --- /dev/null +++ b/kernel/fs/ext3/namei.c @@ -0,0 +1,2585 @@ +/* + * linux/fs/ext3/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include +#include "ext3.h" +#include "namei.h" +#include "xattr.h" +#include "acl.h" + +/* + * define how far ahead to read directories while searching them. + */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) { + inode->i_size += inode->i_sb->s_blocksize; + EXT3_I(inode)->i_disksize = inode->i_size; + *err = ext3_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u16 offs; + u16 size; +}; + +static inline unsigned dx_get_block (struct dx_entry *entry); +static void dx_set_block (struct dx_entry *entry, unsigned value); +static inline unsigned dx_get_hash (struct dx_entry *entry); +static void dx_set_hash (struct dx_entry *entry, unsigned value); +static unsigned dx_get_count (struct dx_entry *entries); +static unsigned dx_get_limit (struct dx_entry *entries); +static void dx_set_count (struct dx_entry *entries, unsigned value); +static void dx_set_limit (struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +static unsigned dx_node_limit (struct inode *dir); +static struct dx_frame *dx_probe(struct qstr *entry, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, + int *err); +static void dx_release (struct dx_frame *frames); +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize); +static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); +static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err); +static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext3_dir_entry_2 * +ext3_next_entry(struct ext3_dir_entry_2 *p) +{ + return (struct ext3_dir_entry_2 *)((char *)p + + ext3_rec_len_from_disk(p->rec_len)); +} + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline unsigned dx_get_block (struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block (struct dx_entry *entry, unsigned value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash (struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - + EXT3_DIR_REC_LEN(2) - infosize; + return entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit (struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); + return entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index (char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk("%s index ", label); + for (i = 0; i < n; i++) + { + printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext3fs_dirhash(de->name, de->name_len, &h); + printk(":%x.%u ", h.hash, + (unsigned) ((char *) de - base)); + } + space += EXT3_DIR_REC_LEN(de->name_len); + names++; + } + de = ext3_next_entry(de); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse (bh); + } + if (bcount) + printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", + names, space/bcount,(space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(struct qstr *entry, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + + frame->bh = NULL; + if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) { + *err = ERR_BAD_DX_DIR; + goto fail; + } + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __func__, + "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; + if (entry) + ext3fs_dirhash(entry->name, entry->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext3_warning(dir->i_sb, __func__, + "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext3_warning(dir->i_sb, __func__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, + root->info.info_length)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: limit != root limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + dxtrace (printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: no count or count > limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) { + *err = ERR_BAD_DX_DIR; + goto fail2; + } + at = entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: limit != node limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + frame++; + frame->bh = NULL; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + if (*err == ERR_BAD_DX_DIR) + ext3_warning(dir->i_sb, __func__, + "Corrupt dir inode %ld, running e2fsck is " + "recommended.", dir->i_ino); + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +static int htree_dirblock_to_tree(struct file *dir_file, + struct inode *dir, int block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash) +{ + struct buffer_head *bh; + struct ext3_dir_entry_2 *de, *top; + int err = 0, count = 0; + + dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); + + if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err))) + return err; + + de = (struct ext3_dir_entry_2 *) bh->b_data; + top = (struct ext3_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - + EXT3_DIR_REC_LEN(0)); + for (; de < top; de = ext3_next_entry(de)) { + if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, + (block<i_sb)) + +((char *)de - bh->b_data))) { + /* silently ignore the rest of the block */ + break; + } + ext3fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + if ((err = ext3_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de)) != 0) { + brelse(bh); + return err; + } + count++; + } + brelse(bh); + return count; +} + + +/* + * This function fills a red-black tree with information from a + * directory. We start scanning the directory in hash order, starting + * at start_hash and start_minor_hash. + * + * This function returns the number of entries inserted into the tree, + * or a negative error code. + */ +int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash) +{ + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; + struct dx_frame frames[2], *frame; + struct inode *dir; + int block, err; + int count = 0; + int ret; + __u32 hashval; + + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = file_inode(dir_file); + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); + *next_hash = ~0; + return count; + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; + frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err); + if (!frame) + return err; + + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; + de = ext3_next_entry(de); + if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. + */ +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + blocksize) + { + if (de->name_len && de->inode) { + ext3fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = (u16) ((char *) de - base); + map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = ext3_next_entry(de); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) + { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) + { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +static void ext3_update_dx_flag(struct inode *inode) +{ + if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX)) + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; +} + +/* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * + * `len <= EXT3_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. + */ +static inline int ext3_match (int len, const char * const name, + struct ext3_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int search_dirblock(struct buffer_head * bh, + struct inode *dir, + struct qstr *child, + unsigned long offset, + struct ext3_dir_entry_2 ** res_dir) +{ + struct ext3_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = child->name; + int namelen = child->len; + + de = (struct ext3_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext3_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = ext3_rec_len_from_disk(de->rec_len); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext3_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head *ext3_find_entry(struct inode *dir, + struct qstr *entry, + struct ext3_dir_entry_2 **res_dir) +{ + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; + struct buffer_head * bh, *ret = NULL; + unsigned long start, block, b; + const u8 *name = entry->name; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + int namelen; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = entry->len; + if (namelen > EXT3_NAME_LEN) + return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == 0)) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } + if (is_dx(dir)) { + bh = ext3_dx_find_entry(dir, entry, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); + } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext3_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh && !bh_uptodate_or_lock(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, + bh); + } + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + ext3_error(sb, __func__, "reading directory #%lu " + "offset %lu", dir->i_ino, block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err) +{ + struct super_block *sb = dir->i_sb; + struct dx_hash_info hinfo; + struct dx_frame frames[2], *frame; + struct buffer_head *bh; + unsigned long block; + int retval; + + if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) + return NULL; + do { + block = dx_get_block(frame->at); + if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err))) + goto errout; + + retval = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { + dx_release(frames); + return bh; + } + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + + /* Check to see if we should continue to search */ + retval = ext3_htree_next_block(dir, hinfo.hash, frame, + frames, NULL); + if (retval < 0) { + ext3_warning(sb, __func__, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk("%s not found\n", entry->name)); + dx_release (frames); + return NULL; +} + +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) +{ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext3_find_entry(dir, &dentry->d_name, &de); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse (bh); + if (!ext3_valid_inum(dir->i_sb, ino)) { + ext3_error(dir->i_sb, "ext3_lookup", + "bad inode number: %lu", ino); + return ERR_PTR(-EIO); + } + inode = ext3_iget(dir->i_sb, ino); + if (inode == ERR_PTR(-ESTALE)) { + ext3_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); + } + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext3_get_parent(struct dentry *child) +{ + unsigned long ino; + struct qstr dotdot = QSTR_INIT("..", 2); + struct ext3_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext3_find_entry(d_inode(child), &dotdot, &de); + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) { + ext3_error(d_inode(child)->i_sb, "ext3_get_parent", + "bad inode number: %lu", ino); + return ERR_PTR(-EIO); + } + + return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino)); +} + +#define S_SHIFT 12 +static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK, +}; + +static inline void ext3_set_de_type(struct super_block *sb, + struct ext3_dir_entry_2 *de, + umode_t mode) { + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext3_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = + ext3_rec_len_to_disk(rec_len); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext3_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. + */ +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize) +{ + struct ext3_dir_entry_2 *next, *to, *prev; + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base; + unsigned rec_len = 0; + + prev = to = de; + while ((char *)de < base + blocksize) { + next = ext3_next_entry(de); + if (de->inode && de->name_len) { + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext3_rec_len_to_disk(rec_len); + prev = to; + to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. + */ +static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + u32 newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext3_dir_entry_2 *de = NULL, *de2; + int err = 0, i; + + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, *bh); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map ((struct ext3_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + dx_sort_map (map, count); + /* Split the existing block in the middle, size-wise */ + size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* map index at which we will split */ + split = count - move; + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", + dx_get_block(frame->at), hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); + de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2); + dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block (frame, hash2 + continued, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; + err = ext3_journal_dirty_metadata (handle, frame->bh); + if (err) + goto journal_error; + brelse (bh2); + dxtrace(dx_show_index ("frame", frame->entries)); + return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext3_std_error(dir->i_sb, err); +errout: + *error = err; + return NULL; +} + + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + * + * NOTE! bh is NOT released in the case where ENOSPC is returned. In + * all other cases bh is released. + */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext3_dir_entry_2 *de, + struct buffer_head * bh) +{ + struct inode *dir = d_inode(dentry->d_parent); + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset = 0; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT3_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext3_dir_entry_2 *)bh->b_data; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; + while ((char *) de <= top) { + if (!ext3_check_dir_entry("ext3_add_entry", dir, de, + bh, offset)) { + brelse (bh); + return -EIO; + } + if (ext3_match (namelen, name, de)) { + brelse (bh); + return -EEXIST; + } + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = ext3_rec_len_from_disk(de->rec_len); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext3_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) { + ext3_std_error(dir->i_sb, err); + brelse(bh); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = ext3_rec_len_from_disk(de->rec_len); + if (de->inode) { + struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext3_rec_len_to_disk(rlen - nlen); + de->rec_len = ext3_rec_len_to_disk(nlen); + de = de1; + } + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext3_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext3_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + dir->i_version++; + ext3_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + ext3_std_error(dir->i_sb, err); + brelse(bh); + return 0; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = d_inode(dentry->d_parent); + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + retval = ext3_journal_get_write_access(handle, bh); + if (retval) { + ext3_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + + ext3_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext3_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + data1 = bh2->b_data; + + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2 = ext3_next_entry(de)) < top) + de = de2; + de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext3_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2)); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block (entries, 1); + dx_set_count (entries, 1); + dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); + return retval; + } + dx_release(frames); + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +/* + * ext3_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext3_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head * bh; + struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; + int dx_fallback=0; + unsigned blocksize; + u32 block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; + EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; + dx_fallback++; + ext3_mark_inode_dirty(handle, dir); + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0; block < blocks; block++) { + if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval))) + return retval; + + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) + return retval; + + if (blocks == 1 && !dx_fallback && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext3_rec_len_to_disk(blocksize); + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head * bh; + struct inode *dir = d_inode(dentry->d_parent); + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; + + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) { + bh = NULL; + goto cleanup; + } + + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + u32 newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __func__, + "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block (frames + 0, hash2, newblock); + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + err = ext3_journal_dirty_metadata(handle, frames[0].bh); + if (err) + goto journal_error; + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + bh = NULL; + goto cleanup; + +journal_error: + ext3_std_error(dir->i_sb, err); +cleanup: + if (bh) + brelse(bh); + dx_release(frames); + return err; +} + +/* + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) +{ + struct ext3_dir_entry_2 * de, * pde; + int i; + + i = 0; + pde = NULL; + de = (struct ext3_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) + return -EIO; + if (de == de_del) { + int err; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + if (pde) + pde->rec_len = ext3_rec_len_to_disk( + ext3_rec_len_from_disk(pde->rec_len) + + ext3_rec_len_from_disk(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) { +journal_error: + ext3_std_error(dir->i_sb, err); + return err; + } + return 0; + } + i += ext3_rec_len_from_disk(de->rec_len); + pde = de; + de = ext3_next_entry(de); + } + return -ENOENT; +} + +static int ext3_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + unlock_new_inode(inode); + d_instantiate(dentry, inode); + return 0; + } + drop_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode, + bool excl) +{ + handle_t *handle; + struct inode * inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_mknod (struct inode * dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT3_FS_XATTR + inode->i_op = &ext3_special_inode_operations; +#endif + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT3_XATTR_TRANS_BLOCKS); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + inode = ext3_new_inode (handle, dir, NULL, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + d_tmpfile(dentry, inode); + err = ext3_orphan_add(handle, inode); + if (err) + goto err_unlock_inode; + mark_inode_dirty(inode); + unlock_new_inode(inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_unlock_inode: + ext3_journal_stop(handle); + unlock_new_inode(inode); + return err; +} + +static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +{ + handle_t *handle; + struct inode * inode; + struct buffer_head * dir_block = NULL; + struct ext3_dir_entry_2 * de; + int err, retries = 0; + + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err))) + goto out_clear_inode; + + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext3_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + + de = (struct ext3_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len)); + strcpy (de->name, "."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + de = ext3_next_entry(de); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize - + EXT3_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy (de->name, ".."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + set_nlink(inode, 2); + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, dir_block); + if (err) + goto out_clear_inode; + + err = ext3_mark_inode_dirty(handle, inode); + if (!err) + err = ext3_add_entry (handle, dentry, inode); + + if (err) { +out_clear_inode: + clear_nlink(inode); + unlock_new_inode(inode); + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + inc_nlink(dir); + ext3_update_dx_flag(dir); + err = ext3_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + + unlock_new_inode(inode); + d_instantiate(dentry, inode); +out_stop: + brelse(dir_block); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static int empty_dir (struct inode * inode) +{ + unsigned long offset; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; + struct super_block * sb; + int err = 0; + + sb = inode->i_sb; + if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || + !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) { + if (err) + ext3_error(inode->i_sb, __func__, + "error %d reading directory #%lu offset 0", + err, inode->i_ino); + else + ext3_warning(inode->i_sb, __func__, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de1 = ext3_next_entry(de); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse (bh); + return 1; + } + offset = ext3_rec_len_from_disk(de->rec_len) + + ext3_rec_len_from_disk(de1->rec_len); + de = ext3_next_entry(de1); + while (offset < inode->i_size ) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + err = 0; + brelse (bh); + if (!(bh = ext3_dir_bread (NULL, inode, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) { + if (err) + ext3_error(sb, __func__, + "error %d reading directory" + " #%lu offset %lu", + err, inode->i_ino, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + } + if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) { + de = (struct ext3_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + if (le32_to_cpu(de->inode)) { + brelse (bh); + return 0; + } + offset += ext3_rec_len_from_disk(de->rec_len); + de = ext3_next_entry(de); + } + brelse (bh); + return 1; +} + +/* ext3_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext3_orphan_cleanup(). + */ +int ext3_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext3_iloc iloc; + int err = 0, rc; + + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); + if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on s_orphan_lock), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? + */ + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto out_unlock; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + rc = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out_unlock: + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext3_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + unsigned long ino_next; + struct ext3_iloc iloc; + int err = 0; + + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. */ + if (!handle) + goto out; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %lu\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %lu\n", + i_prev->i_ino, ino_next); + err = ext3_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + +out_err: + ext3_std_error(inode->i_sb, err); +out: + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext3_rmdir (struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + dquot_initialize(dir); + dquot_initialize(d_inode(dentry)); + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext3_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_rmdir; + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = d_inode(dentry); + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir (inode)) + goto end_rmdir; + + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + ext3_warning (inode->i_sb, "ext3_rmdir", + "empty directory has nlink!=2 (%d)", + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. */ + inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + drop_nlink(dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + +end_rmdir: + ext3_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3_unlink(struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + trace_ext3_unlink_enter(dir, dentry); + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + dquot_initialize(dir); + dquot_initialize(d_inode(dentry)); + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + retval = -ENOENT; + bh = ext3_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_unlink; + + inode = d_inode(dentry); + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + set_nlink(inode, 1); + } + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + drop_nlink(inode); + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime; + ext3_mark_inode_dirty(handle, inode); + retval = 0; + +end_unlink: + ext3_journal_stop(handle); + brelse (bh); + trace_ext3_unlink_exit(dentry, retval); + return retval; +} + +static int ext3_symlink (struct inode * dir, + struct dentry *dentry, const char * symname) +{ + handle_t *handle; + struct inode * inode; + int l, err, retries = 0; + int credits; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + dquot_initialize(dir); + + if (l > EXT3_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } +retry: + handle = ext3_journal_start(dir, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > EXT3_N_BLOCKS * 4) { + inode->i_op = &ext3_symlink_inode_operations; + ext3_set_aops(inode); + /* + * We cannot call page_symlink() with transaction started + * because it calls into ext3_write_begin() which acquires page + * lock which ranks below transaction start (and it can also + * wait for journal commit if we are running out of space). So + * we have to stop transaction now and restart it when symlink + * contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. + */ + drop_nlink(inode); + err = ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + if (err) + goto err_drop_inode; + err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS + * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext3_journal_start(dir, + EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + set_nlink(inode, 1); + err = ext3_orphan_del(handle, inode); + if (err) { + ext3_journal_stop(handle); + drop_nlink(inode); + goto err_drop_inode; + } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; + memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } + EXT3_I(inode)->i_disksize = inode->i_size; + err = ext3_add_nondir(handle, dentry, inode); +out_stop: + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; +} + +static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = d_inode(old_dentry); + int err, retries = 0; + + if (inode->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode->i_ctime = CURRENT_TIME_SEC; + inc_nlink(inode); + ihold(inode); + + err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext3_orphan_del(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +#define PARENT_INO(buffer) \ + (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode) + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir,struct dentry *new_dentry) +{ + handle_t *handle; + struct inode * old_inode, * new_inode; + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval, flush_file = 0; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_bh = new_bh = dir_bh = NULL; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (d_really_is_positive(new_dentry)) + dquot_initialize(d_inode(new_dentry)); + handle = ext3_journal_start(old_dir, 2 * + EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + handle->h_sync = 1; + + old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + old_inode = d_inode(old_dentry); + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = d_inode(new_dentry); + new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); + new_bh = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir (new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir!=old_dir && + new_dir->i_nlink >= EXT3_LINK_MAX) + goto end_rename; + } + if (!new_bh) { + retval = ext3_add_entry (handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + retval = ext3_journal_get_write_access(handle, new_bh); + if (retval) + goto journal_error; + new_de->inode = cpu_to_le32(old_inode->i_ino); + if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT3_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, new_dir); + BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); + retval = ext3_journal_dirty_metadata(handle, new_bh); + if (retval) + goto journal_error; + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + if (le32_to_cpu(old_de->inode) != old_inode->i_ino || + old_de->name_len != old_dentry->d_name.len || + strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || + (retval = ext3_delete_entry(handle, old_dir, + old_de, old_bh)) == -ENOENT) { + /* old_de could have moved from under us during htree split, so + * make sure that we are deleting the right entry. We might + * also be pointing to a stale entry in the unused part of + * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + + old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name, + &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); + brelse(old_bh2); + } + } + if (retval) { + ext3_warning(old_dir->i_sb, "ext3_rename", + "Deleting old file (%lu), %d, error=%d", + old_dir->i_ino, old_dir->i_nlink, retval); + } + + if (new_inode) { + drop_nlink(new_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + retval = ext3_journal_get_write_access(handle, dir_bh); + if (retval) + goto journal_error; + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + retval = ext3_journal_dirty_metadata(handle, dir_bh); + if (retval) { +journal_error: + ext3_std_error(new_dir->i_sb, retval); + goto end_rename; + } + drop_nlink(old_dir); + if (new_inode) { + drop_nlink(new_inode); + } else { + inc_nlink(new_dir); + ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } + ext3_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext3_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext3_orphan_add(handle, new_inode); + if (ext3_should_writeback_data(new_inode)) + flush_file = 1; + } + retval = 0; + +end_rename: + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); + ext3_journal_stop(handle); + if (retval == 0 && flush_file) + filemap_flush(old_inode->i_mapping); + return retval; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations ext3_dir_inode_operations = { + .create = ext3_create, + .lookup = ext3_lookup, + .link = ext3_link, + .unlink = ext3_unlink, + .symlink = ext3_symlink, + .mkdir = ext3_mkdir, + .rmdir = ext3_rmdir, + .mknod = ext3_mknod, + .tmpfile = ext3_tmpfile, + .rename = ext3_rename, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, +}; + +const struct inode_operations ext3_special_inode_operations = { + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, +}; diff --git a/kernel/fs/ext3/namei.h b/kernel/fs/ext3/namei.h new file mode 100644 index 000000000..46304d8c9 --- /dev/null +++ b/kernel/fs/ext3/namei.h @@ -0,0 +1,27 @@ +/* linux/fs/ext3/namei.h + * + * Copyright (C) 2005 Simtec Electronics + * Ben Dooks + * +*/ + +extern struct dentry *ext3_get_parent(struct dentry *child); + +static inline struct buffer_head *ext3_dir_bread(handle_t *handle, + struct inode *inode, + int block, int create, + int *err) +{ + struct buffer_head *bh; + + bh = ext3_bread(handle, inode, block, create, err); + + if (!bh && !(*err)) { + *err = -EIO; + ext3_error(inode->i_sb, __func__, + "Directory hole detected on inode %lu\n", + inode->i_ino); + return NULL; + } + return bh; +} diff --git a/kernel/fs/ext3/resize.c b/kernel/fs/ext3/resize.c new file mode 100644 index 000000000..271056555 --- /dev/null +++ b/kernel/fs/ext3/resize.c @@ -0,0 +1,1117 @@ +/* + * linux/fs/ext3/resize.c + * + * Support for resizing an ext3 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. + */ + + +#define EXT3FS_DEBUG + +#include "ext3.h" + + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count); + ext3_fsblk_t end = start + input->blocks_count; + unsigned group = input->group; + ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead = ext3_bg_has_super(sb, group) ? + (1 + ext3_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + ext3_fsblk_t metaend = start + overhead; + struct buffer_head *bh = NULL; + ext3_grpblk_t free_blocks_count; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext3_bg_has_super(sb, input->group) ? "normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + if (group != sbi->s_groups_count) + ext3_warning(sb, __func__, + "Cannot add at group %u (only %lu groups)", + input->group, sbi->s_groups_count); + else if ((start - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)) + ext3_warning(sb, __func__, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext3_warning(sb, __func__, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext3_warning(sb, __func__, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext3_warning(sb, __func__, + "Cannot read last block ("E3FSBLK")", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext3_warning(sb, __func__, + "Block bitmap not in group (block %u)", + input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext3_warning(sb, __func__, + "Inode bitmap not in group (block %u)", + input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext3_warning(sb, __func__, + "Inode table not in group (blocks %u-"E3FSBLK")", + input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext3_warning(sb, __func__, + "Block bitmap same as inode bitmap (%u)", + input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext3_warning(sb, __func__, + "Block bitmap (%u) in inode table (%u-"E3FSBLK")", + input->block_bitmap, input->inode_table, itend-1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext3_warning(sb, __func__, + "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", + input->inode_bitmap, input->inode_table, itend-1); + else if (inside(input->block_bitmap, start, metaend)) + ext3_warning(sb, __func__, + "Block bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", + input->block_bitmap, start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext3_warning(sb, __func__, + "Inode bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", + input->inode_bitmap, start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext3_warning(sb, __func__, + "Inode table (%u-"E3FSBLK") overlaps" + "GDT table ("E3FSBLK"-"E3FSBLK")", + input->inode_table, itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext3_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext3_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* + * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for block_bitmap modifications. + */ +static int extend_or_restart_transaction(handle_t *handle, int thresh, + struct buffer_head *bh) +{ + int err; + + if (handle->h_buffer_credits >= thresh) + return 0; + + err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA); + if (err < 0) + return err; + if (err) { + err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA); + if (err) + return err; + err = ext3_journal_get_write_access(handle, bh); + if (err) + return err; + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group); + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + ext3_fsblk_t block; + ext3_grpblk_t bit; + int i; + int err = 0, err2; + + /* This transaction may be extended/restarted along the way */ + handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext3_bg_has_super(sb, input->group)) { + ext3_debug("mark backup superblock %#04lx (+0)\n", start); + ext3_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("update backup group %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + gdb = sb_getblk(sb, block); + if (unlikely(!gdb)) { + err = -ENOMEM; + goto exit_bh; + } + if ((err = ext3_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(gdb); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + for (i = 0, bit = gdblocks + 1, block = start + bit; + i < reserved_gdb; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + if (IS_ERR(gdb = bclean(handle, sb, block))) { + err = PTR_ERR(gdb); + goto exit_bh; + } + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, + input->block_bitmap - start); + ext3_set_bit(input->block_bitmap - start, bh->b_data); + ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext3_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + for (i = 0, block = input->inode_table, bit = block - start; + i < sbi->s_itb_per_group; i++, bit++, block++) { + struct buffer_head *it; + + ext3_debug("clear inode block %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + if (IS_ERR(it = bclean(handle, sb, block))) { + err = PTR_ERR(it); + goto exit_bh; + } + err = ext3_journal_dirty_metadata(handle, it); + if (err) { + brelse(it); + goto exit_bh; + } + brelse(it); + ext3_set_bit(bit, bh->b_data); + } + + err = extend_or_restart_transaction(handle, 2, bh); + if (err) + goto exit_bh; + + mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto exit_bh; + brelse(bh); + + /* Mark unused entries in inode bitmap used */ + ext3_debug("clear inode bitmap %#04x (+%ld)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + err = ext3_journal_dirty_metadata(handle, bh); +exit_bh: + brelse(bh); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const ext3_fsblk_t blk = primary->b_blocknr; + const unsigned long end = EXT3_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ + ext3_warning(sb, __func__, + "reserved GDT "E3FSBLK + " missing grp %d ("E3FSBLK")", + blk, grp, + grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); + return -EINVAL; + } + if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext3_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT3_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { + ext3_warning(sb, __func__, + "won't resize using backup superblock at %llu", + (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext3_warning(sb, __func__, + "new group %u GDT block "E3FSBLK" not reserved", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) + goto exit_dind; + + if ((err = ext3_journal_get_write_access(handle, *primary))) + goto exit_sbh; + + if ((err = ext3_journal_get_write_access(handle, dind))) + goto exit_primary; + + /* ext3_reserve_inode_write() gets a reference on the iloc */ + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_dindj; + + n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext3_warning (sb, __func__, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). + */ + data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; + err = ext3_journal_dirty_metadata(handle, dind); + if (err) + goto exit_group_desc; + brelse(dind); + dind = NULL; + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto exit_group_desc; + memset((*primary)->b_data, 0, sb->s_blocksize); + err = ext3_journal_dirty_metadata(handle, *primary); + if (err) + goto exit_group_desc; + + o_group_desc = EXT3_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT3_SB(sb)->s_group_desc = n_group_desc; + EXT3_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto exit_inode; + + return 0; + +exit_group_desc: + kfree(n_group_desc); +exit_inode: + //ext3_journal_release_buffer(handle, iloc.bh); + brelse(iloc.bh); +exit_dindj: + //ext3_journal_release_buffer(handle, dind); +exit_primary: + //ext3_journal_release_buffer(handle, *primary); +exit_sbh: + //ext3_journal_release_buffer(handle, *primary); +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext3_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. + */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext3_iloc iloc; + ext3_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count % + EXT3_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext3_warning(sb, __func__, + "reserved block "E3FSBLK + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext3_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext3_journal_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext3_journal_dirty_metadata(handle, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext3_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext3 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. + */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + const unsigned long last = sbi->s_groups_count; + const int bpg = EXT3_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (handle->h_buffer_credits == 0 && + ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) && + (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + if (unlikely(!bh)) { + err = -ENOMEM; + break; + } + ext3_debug("update metadata backup %#04lx\n", + (unsigned long)bh->b_blocknr); + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); + break; + } + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + err = ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + if (err) + break; + } + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext3_warning(sb, __func__, + "can't update backup for group %d (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT3_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext3_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int err, err2; + + gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext3_warning(sb, __func__, + "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < + le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __func__, "blocks_count overflow\n"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext3_warning(sb, __func__, "inodes_count overflow\n"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT3_HAS_COMPAT_FEATURE(sb, + EXT3_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext3_warning(sb, __func__, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext3_iget(sb, EXT3_RESIZE_INO); + if (IS_ERR(inode)) { + ext3_warning(sb, __func__, + "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext3_journal_start_sb(sb, + ext3_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + ext3_warning(sb, __func__, + "multiple resizers run on filesystem!"); + err = -EBUSY; + goto exit_journal; + } + + if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext3_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * We do not lock all allocations via s_resize_lock + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ + + /* Update group descriptor block for new group */ + gdp = (struct ext3_group_desc *)primary->b_data + gdb_off; + + gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); + gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); + gdp->bg_inode_table = cpu_to_le32(input->inode_table); + gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); + gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + le32_add_cpu(&es->s_blocks_count, input->blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold s_resize_lock + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold s_resize_lock over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count++; + + err = ext3_journal_dirty_metadata(handle, primary); + if (err) + goto exit_journal; + + /* Update the reserved block counts only once the new group is + * active. */ + le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT3_INODES_PER_GROUP(sb)); + + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext3_group_add */ + +/* Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext3_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, + ext3_fsblk_t n_blocks_count) +{ + ext3_fsblk_t o_blocks_count; + ext3_grpblk_t last; + ext3_grpblk_t add; + struct buffer_head * bh; + handle_t *handle; + int err; + unsigned long freed_blocks; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking the s_resize_lock below. */ + o_blocks_count = le32_to_cpu(es->s_blocks_count); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK + " up to "E3FSBLK" blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT3-fs: filesystem on %s:" + " too large to resize to "E3FSBLK" blocks safely\n", + sb->s_id, n_blocks_count); + if (sizeof(sector_t) < 8) + ext3_warning(sb, __func__, + "CONFIG_LBDAF not enabled\n"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __func__, + "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. */ + last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + + if (last == 0) { + ext3_warning(sb, __func__, + "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT3_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext3_warning(sb, __func__, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext3_warning(sb, __func__, + "will only finish group ("E3FSBLK + " blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add -1); + if (!bh) { + ext3_warning(sb, __func__, + "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext3_free_blocks(). + */ + handle = ext3_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext3_warning(sb, __func__, "error %d on journal start",err); + goto exit_put; + } + + mutex_lock(&EXT3_SB(sb)->s_resize_lock); + if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __func__, + "multiple resizers run on filesystem!"); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + ext3_journal_stop(handle); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh))) { + ext3_warning(sb, __func__, + "error %d on journal write access", err); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + ext3_journal_stop(handle); + goto exit_put; + } + es->s_blocks_count = cpu_to_le32(o_blocks_count + add); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + if (err) { + ext3_warning(sb, __func__, + "error %d on journal dirty metadata", err); + ext3_journal_stop(handle); + goto exit_put; + } + ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); + ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); + if ((err = ext3_journal_stop(handle))) + goto exit_put; + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n", + le32_to_cpu(es->s_blocks_count)); + update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); +exit_put: + return err; +} /* ext3_group_extend */ diff --git a/kernel/fs/ext3/super.c b/kernel/fs/ext3/super.c new file mode 100644 index 000000000..a9312f0a5 --- /dev/null +++ b/kernel/fs/ext3/super.c @@ -0,0 +1,3165 @@ +/* + * linux/fs/ext3/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define CREATE_TRACE_POINTS + +#include "ext3.h" +#include "xattr.h" +#include "acl.h" +#include "namei.h" + +#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA +#else + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA +#endif + +static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); +static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + unsigned int); +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, + int sync); +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es); +static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es); +static int ext3_sync_fs(struct super_block *sb, int wait); +static const char *ext3_decode_error(struct super_block * sb, int errno, + char nbuf[16]); +static int ext3_remount (struct super_block * sb, int * flags, char * data); +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext3_unfreeze(struct super_block *sb); +static int ext3_freeze(struct super_block *sb); + +/* + * Wrappers for journal_start/end. + */ +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + /* Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. */ + journal = EXT3_SB(sb)->s_journal; + if (is_journal_aborted(journal)) { + ext3_abort(sb, __func__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + + return journal_start(journal, nblocks); +} + +int __ext3_journal_stop(const char *where, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext3_std_error(sb, where, err); + return err; +} + +void ext3_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext3_decode_error(NULL, err, nbuf); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); + + journal_abort_handle(handle); +} + +void ext3_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + + va_end(args); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext3, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it. + */ + +static void ext3_handle_error(struct super_block *sb) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt (sb, ERRORS_CONT)) { + journal_t *journal = EXT3_SB(sb)->s_journal; + + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + if (journal) + journal_abort(journal, -EIO); + } + if (test_opt (sb, ERRORS_RO)) { + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; + } + ext3_commit_super(sb, es, 1); + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3-fs (%s): panic forced after error\n", + sb->s_id); +} + +void ext3_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + ext3_handle_error(sb); +} + +static const char *ext3_decode_error(struct super_block * sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext3_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext3_std_error (struct super_block * sb, const char * function, + int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + errstr = ext3_decode_error(sb, errno, nbuf); + ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr); + + ext3_handle_error(sb); +} + +/* + * ext3_abort is a much stronger failure handler than ext3_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void ext3_abort(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3-fs: panic from previous error\n"); + + if (sb->s_flags & MS_RDONLY) + return; + + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; + + if (EXT3_SB(sb)->s_journal) + journal_abort(EXT3_SB(sb)->s_journal, -EIO); +} + +void ext3_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); +} + +void ext3_update_dynamic_rev(struct super_block *sb) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) + return; + + ext3_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT3_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + + return NULL; +} + +/* + * Release the journal device + */ +static void ext3_blkdev_put(struct block_device *bdev) +{ + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static void ext3_blkdev_remove(struct ext3_sb_info *sbi) +{ + struct block_device *bdev; + bdev = sbi->journal_bdev; + if (bdev) { + ext3_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) +{ + struct list_head *l; + + ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + ext3_msg(sb, KERN_ERR, "sb_info orphan list:"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + ext3_msg(sb, KERN_ERR, " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext3_put_super (struct super_block * sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int i, err; + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext3_xattr_put_super(sb); + err = journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext3_abort(sb, __func__, "Couldn't clean up the journal"); + + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + BUFFER_TRACE(sbi->s_sbh, "marking dirty"); + mark_buffer_dirty(sbi->s_sbh); + ext3_commit_super(sb, es, 1); + } + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < EXT3_MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev); + ext3_blkdev_remove(sbi); + } + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + mutex_destroy(&sbi->s_orphan_lock); + mutex_destroy(&sbi->s_resize_lock); + kfree(sbi); +} + +static struct kmem_cache *ext3_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext3_alloc_inode(struct super_block *sb) +{ + struct ext3_inode_info *ei; + + ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; + atomic_set(&ei->i_datasync_tid, 0); + atomic_set(&ei->i_sync_tid, 0); +#ifdef CONFIG_QUOTA + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + + return &ei->vfs_inode; +} + +static int ext3_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext3_drop_inode(inode, drop); + return drop; +} + +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + +static void ext3_destroy_inode(struct inode *inode) +{ + if (!list_empty(&(EXT3_I(inode)->i_orphan))) { + printk("EXT3 Inode %p: orphan list check failed!\n", + EXT3_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT3_I(inode), sizeof(struct ext3_inode_info), + false); + dump_stack(); + } + call_rcu(&inode->i_rcu, ext3_i_callback); +} + +static void init_once(void *foo) +{ + struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; + + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT3_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + mutex_init(&ei->truncate_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", + sizeof(struct ext3_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext3_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ext3_inode_cachep); +} + +static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +static char *data_mode_string(unsigned long mode) +{ + switch (mode) { + case EXT3_MOUNT_JOURNAL_DATA: + return "journal"; + case EXT3_MOUNT_ORDERED_DATA: + return "ordered"; + case EXT3_MOUNT_WRITEBACK_DATA: + return "writeback"; + } + return "unknown"; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int ext3_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + unsigned long def_mount_opts; + + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%lu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) || + le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); + } + if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) || + le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); + } + if (test_opt(sb, ERRORS_RO)) { + int def_errors = le16_to_cpu(es->s_errors); + + if (def_errors == EXT3_ERRORS_PANIC || + def_errors == EXT3_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG)) + seq_puts(seq, ",debug"); +#ifdef CONFIG_EXT3_FS_XATTR + if (test_opt(sb, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER) && + (def_mount_opts & EXT3_DEFM_XATTR_USER)) { + seq_puts(seq, ",nouser_xattr"); + } +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); + if (sbi->s_commit_interval) { + seq_printf(seq, ",commit=%u", + (unsigned) (sbi->s_commit_interval / HZ)); + } + + /* + * Always display barrier state so it's clear what the status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); + seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + + ext3_show_quota_options(seq, sb); + + return 0; +} + + +static struct inode *ext3_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext3_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = ext3_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext3_nfs_get_inode); +} + +static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext3_nfs_get_inode); +} + +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") +#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext3_write_dquot(struct dquot *dquot); +static int ext3_acquire_dquot(struct dquot *dquot); +static int ext3_release_dquot(struct dquot *dquot); +static int ext3_mark_dquot_dirty(struct dquot *dquot); +static int ext3_write_info(struct super_block *sb, int type); +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); +static int ext3_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext3_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); +static struct dquot **ext3_get_dquots(struct inode *inode) +{ + return EXT3_I(inode)->i_dquot; +} + +static const struct dquot_operations ext3_quota_operations = { + .write_dquot = ext3_write_dquot, + .acquire_dquot = ext3_acquire_dquot, + .release_dquot = ext3_release_dquot, + .mark_dirty = ext3_mark_dquot_dirty, + .write_info = ext3_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops ext3_qctl_operations = { + .quota_on = ext3_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +#endif + +static const struct super_operations ext3_sops = { + .alloc_inode = ext3_alloc_inode, + .destroy_inode = ext3_destroy_inode, + .write_inode = ext3_write_inode, + .dirty_inode = ext3_dirty_inode, + .drop_inode = ext3_drop_inode, + .evict_inode = ext3_evict_inode, + .put_super = ext3_put_super, + .sync_fs = ext3_sync_fs, + .freeze_fs = ext3_freeze, + .unfreeze_fs = ext3_unfreeze, + .statfs = ext3_statfs, + .remount_fs = ext3_remount, + .show_options = ext3_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext3_quota_read, + .quota_write = ext3_quota_write, + .get_dquots = ext3_get_dquots, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct export_operations ext3_export_ops = { + .fh_to_dentry = ext3_fh_to_dentry, + .fh_to_parent = ext3_fh_to_parent, + .get_parent = ext3_get_parent, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, + Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_path, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_nocheck, "nocheck"}, + {Opt_nocheck, "check=none"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, + {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_resize, "resize"}, + {Opt_err, NULL}, +}; + +static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) +{ + ext3_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + /*todo: use simple_strtoll with >32bit ext3 */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext3_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype]) { + int same = !strcmp(sbi->s_qf_names[qtype], qname); + + kfree(qname); + if (!same) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + } + return same; + } + if (strchr(qname, '/')) { + ext3_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) { + + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + if (sbi->s_qf_names[qtype]) { + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + } + return 1; +} +#endif + +static int parse_options (char *options, struct super_block *sb, + unsigned int *inum, unsigned long *journal_devnum, + ext3_fsblk_t *n_blocks_count, int is_remount) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; + substring_t args[MAX_OPT_ARGS]; + int data_opt = 0; + int option; + kuid_t uid; + kgid_t gid; + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + +#ifdef CONFIG_QUOTA + int qfmt; +#endif + + if (!options) + return 1; + + while ((p = strsep (&options, ",")) != NULL) { + int token; + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option); + return 0; + + } + sbi->s_resuid = uid; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option); + return 0; + } + sbi->s_resgid = gid; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; + case Opt_debug: + set_opt (sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); + break; + case Opt_orlov: + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); + break; +#ifdef CONFIG_EXT3_FS_XATTR + case Opt_user_xattr: + set_opt (sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt (sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + ext3_msg(sb, KERN_INFO, + "(no)user_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + ext3_msg(sb, KERN_INFO, + "(no)acl options not supported"); + break; +#endif + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_journal_update: + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. */ + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); + break; + case Opt_journal_inum: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *inum = option; + break; + case Opt_journal_dev: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; + case Opt_journal_path: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext3_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return 0; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext3_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return 0; + } + + journal_inode = d_inode(path.dentry); + if (!S_ISBLK(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return 0; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; + case Opt_commit: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = JBD_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * option; + break; + case Opt_data_journal: + data_opt = EXT3_MOUNT_JOURNAL_DATA; + goto datacheck; + case Opt_data_ordered: + data_opt = EXT3_MOUNT_ORDERED_DATA; + goto datacheck; + case Opt_data_writeback: + data_opt = EXT3_MOUNT_WRITEBACK_DATA; + datacheck: + if (is_remount) { + if (test_opt(sb, DATA_FLAGS) == data_opt) + break; + ext3_msg(sb, KERN_ERR, + "error: cannot change " + "data mode on remount. The filesystem " + "is mounted in data=%s mode and you " + "try to remount it in data=%s mode.", + data_mode_string(test_opt(sb, + DATA_FLAGS)), + data_mode_string(data_opt)); + return 0; + } else { + clear_opt(sbi->s_mount_opt, DATA_FLAGS); + sbi->s_mount_opt |= data_opt; + } + break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + if (!set_qf_name(sb, USRQUOTA, &args[0])) + return 0; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) + return 0; + break; + case Opt_offusrjquota: + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; + case Opt_offgrpjquota: + if (!clear_qf_name(sb, GRPQUOTA)) + return 0; + break; + case Opt_jqfmt_vfsold: + qfmt = QFMT_VFS_OLD; + goto set_qf_format; + case Opt_jqfmt_vfsv0: + qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; +set_qf_format: + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != qfmt) { + ext3_msg(sb, KERN_ERR, "error: cannot change " + "journaled quota options when " + "quota turned on."); + return 0; + } + sbi->s_jquota_fmt = qfmt; + break; + case Opt_quota: + case Opt_usrquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, GRPQUOTA); + break; + case Opt_noquota: + if (sb_any_quota_loaded(sb)) { + ext3_msg(sb, KERN_ERR, "error: cannot change " + "quota options when quota turned on."); + return 0; + } + clear_opt(sbi->s_mount_opt, QUOTA); + clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sbi->s_mount_opt, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext3_msg(sb, KERN_ERR, + "error: quota options not supported."); + break; + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + ext3_msg(sb, KERN_ERR, + "error: journaled quota options not " + "supported."); + break; + case Opt_noquota: + break; +#endif + case Opt_abort: + set_opt(sbi->s_mount_opt, ABORT); + break; + case Opt_nobarrier: + clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_barrier: + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_ignore: + break; + case Opt_resize: + if (!is_remount) { + ext3_msg(sb, KERN_ERR, + "error: resize option only available " + "for remount"); + return 0; + } + if (match_int(&args[0], &option) != 0) + return 0; + *n_blocks_count = option; + break; + case Opt_nobh: + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated nobh option"); + break; + case Opt_bh: + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated bh option"); + break; + default: + ext3_msg(sb, KERN_ERR, + "error: unrecognized mount option \"%s\" " + "or missing value", p); + return 0; + } + } +#ifdef CONFIG_QUOTA + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi->s_mount_opt, USRQUOTA); + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi->s_mount_opt, GRPQUOTA); + + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext3_msg(sb, KERN_ERR, "error: old and new quota " + "format mixing."); + return 0; + } + + if (!sbi->s_jquota_fmt) { + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "not specified."); + return 0; + } + } +#endif + return 1; +} + +static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, + int read_only) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { + ext3_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT3_VALID_FS)) + ext3_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT3_ERROR_FS)) + ext3_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + le16_to_cpu(es->s_mnt_count) >= + le16_to_cpu(es->s_max_mnt_count)) + ext3_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext3_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); +#if 0 + /* @@@ We _will_ want to clear the valid bit if we find + inconsistencies, to force a fsck at reboot. But for + a plain journaled filesystem we can keep it set as + valid forever! :) */ + es->s_state &= cpu_to_le16(~EXT3_VALID_FS); +#endif + if (!le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + + ext3_commit_super(sb, es, 1); + if (test_opt(sb, DEBUG)) + ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", + sb->s_blocksize, + sbi->s_groups_count, + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { + char b[BDEVNAME_SIZE]; + ext3_msg(sb, KERN_INFO, "using external journal on %s", + bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); + } else { + ext3_msg(sb, KERN_INFO, "using internal journal"); + } + cleancache_init_fs(sb); + return res; +} + +/* Called at mount-time, super-block is locked */ +static int ext3_check_descriptors(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int i; + + ext3_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL); + ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i); + ext3_fsblk_t last_block; + + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 > + last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + } + + sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); + sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb)); + return 1; +} + + +/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext3_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext3_orphan_cleanup (struct super_block * sb, + struct ext3_super_block * es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, skipping orphan cleanup."); + return; + } + + /* Check if feature set allows readwrite operations */ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { + ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + } + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < EXT3_MAXQUOTAS; i++) { + if (EXT3_SB(sb)->s_qf_names[i]) { + int ret = ext3_quota_on_mount(sb, i); + if (ret < 0) + ext3_msg(sb, KERN_ERR, + "error: cannot turn on journaled " + "quota: %d", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + dquot_initialize(inode); + if (inode->i_nlink) { + printk(KERN_DEBUG + "%s: truncating inode %lu to %Ld bytes\n", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %Ld bytes\n", + inode->i_ino, inode->i_size); + ext3_truncate(inode); + nr_truncates++; + } else { + printk(KERN_DEBUG + "%s: deleting unreferenced inode %lu\n", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x)==1) ? "" : "s" + + if (nr_orphans) + ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < EXT3_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +/* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. + */ +static loff_t ext3_max_size(int bits) +{ + loff_t res = EXT3_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + + /* This is calculated to be the largest file size for a + * dense, file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^32 -1 + * __u32 i_blocks representing the total number of + * 512 bytes blocks of the file + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* tripple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext3_fsblk_t descriptor_loc(struct super_block *sb, + ext3_fsblk_t logic_sb_block, + int nr) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return (logic_sb_block + nr + 1); + bg = sbi->s_desc_per_block * nr; + if (ext3_bg_has_super(sb, bg)) + has_super = 1; + return (has_super + ext3_group_first_block_no(sb, bg)); +} + + +static int ext3_fill_super (struct super_block *sb, void *data, int silent) +{ + struct buffer_head * bh; + struct ext3_super_block *es = NULL; + struct ext3_sb_info *sbi; + ext3_fsblk_t block; + ext3_fsblk_t sb_block = get_sb_block(&data, sb); + ext3_fsblk_t logic_sb_block; + unsigned long offset = 0; + unsigned int journal_inum = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; + int hblock; + int db_count; + int i; + int needs_recovery; + int ret = -EINVAL; + __le32 features; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } + sb->s_fs_info = sbi; + sbi->s_sb_block = sb_block; + + blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); + if (!blocksize) { + ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); + goto out_fail; + } + + /* + * The ext3 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT3_MIN_BLOCK_SIZE) { + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + } else { + logic_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logic_sb_block))) { + ext3_msg(sb, KERN_ERR, "error: unable to read superblock"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext3 macro-instructions depend on its value + */ + es = (struct ext3_super_block *) (bh->b_data + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT3_SUPER_MAGIC) + goto cantfind_ext3; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT3_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT3_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT3_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT3_FS_XATTR + if (def_mount_opts & EXT3_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + if (def_mount_opts & EXT3_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) + set_opt(sbi->s_mount_opt, ORDERED_DATA); + else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + + /* enable barriers by default */ + set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sbi->s_mount_opt, RESERVATION); + + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && + (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext3_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); + if (features) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "optional features (%x)", le32_to_cpu(features)); + goto failed_mount; + } + features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); + if (!(sb->s_flags & MS_RDONLY) && features) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount RDWR because of unsupported " + "optional features (%x)", le32_to_cpu(features)); + goto failed_mount; + } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT3_MIN_BLOCK_SIZE || + blocksize > EXT3_MAX_BLOCK_SIZE) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "filesystem blocksize %d", blocksize); + goto failed_mount; + } + + hblock = bdev_logical_block_size(sb->s_bdev); + if (sb->s_blocksize != blocksize) { + /* + * Make sure the blocksize for the filesystem is larger + * than the hardware sectorsize for the machine. + */ + if (blocksize < hblock) { + ext3_msg(sb, KERN_ERR, + "error: fsblocksize %d too small for " + "hardware sectorsize %d", blocksize, hblock); + goto failed_mount; + } + + brelse (bh); + if (!sb_set_blocksize(sb, blocksize)) { + ext3_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); + goto out_fail; + } + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + bh = sb_bread(sb, logic_sb_block); + if (!bh) { + ext3_msg(sb, KERN_ERR, + "error: can't read superblock on 2nd try"); + goto failed_mount; + } + es = (struct ext3_super_block *)(bh->b_data + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { + ext3_msg(sb, KERN_ERR, + "error: magic mismatch"); + goto failed_mount; + } + } + + sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { + sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext3_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + } + sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (blocksize != sbi->s_frag_size) { + ext3_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %u (unsupported)", + sbi->s_frag_size, blocksize); + goto failed_mount; + } + sbi->s_frags_per_block = 1; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext3; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb)); + for (i=0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + + if (sbi->s_blocks_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + err = generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count)); + if (err) { + ext3_msg(sb, KERN_ERR, + "error: filesystem is too large to mount safely"); + if (sizeof(sector_t) < 8) + ext3_msg(sb, KERN_ERR, + "error: CONFIG_LBDAF not enabled"); + ret = err; + goto failed_mount; + } + + if (EXT3_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT3_BLOCKS_PER_GROUP(sb)) + 1; + db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb)); + sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext3_msg(sb, KERN_ERR, + "error: not enough memory"); + ret = -ENOMEM; + goto failed_mount; + } + + bgl_lock_init(sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + ext3_msg(sb, KERN_ERR, + "error: can't read group descriptor %d", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext3_check_descriptors (sb)) { + ext3_msg(sb, KERN_ERR, + "error: group descriptors corrupted"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + sbi->s_rsv_window_root = RB_ROOT; + /* Add a single, static dummy reservation to the start of the + * reservation window list --- it gives us a placeholder for + * append-at-start-of-list which makes the allocation logic + * _much_ simpler. */ + sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + sbi->s_rsv_window_head.rsv_goal_size = 0; + ext3_rsv_window_add(sb, &sbi->s_rsv_window_head); + + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext3_sops; + sb->s_export_op = &ext3_export_ops; + sb->s_xattr = ext3_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext3_qctl_operations; + sb->dq_op = &ext3_quota_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_RECOVER)); + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) + goto failed_mount2; + } else { + if (!silent) + ext3_msg(sb, KERN_ERR, + "error: no journal found. " + "mounting ext3 over ext2?"); + goto failed_mount2; + } + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb), GFP_KERNEL); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb), GFP_KERNEL); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext3_count_dirs(sb), GFP_KERNEL); + } + if (err) { + ext3_msg(sb, KERN_ERR, "error: insufficient memory"); + ret = err; + goto failed_mount3; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + capabilities: ORDERED_DATA if the journal can + cope, else JOURNAL_DATA */ + if (journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) + set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); + else + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + break; + + case EXT3_MOUNT_ORDERED_DATA: + case EXT3_MOUNT_WRITEBACK_DATA: + if (!journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { + ext3_msg(sb, KERN_ERR, + "error: journal does not support " + "requested data journaling mode"); + goto failed_mount3; + } + default: + break; + } + + /* + * The journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. + */ + + root = ext3_iget(sb, EXT3_ROOT_INO); + if (IS_ERR(root)) { + ext3_msg(sb, KERN_ERR, "error: get root inode failed"); + ret = PTR_ERR(root); + goto failed_mount3; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); + goto failed_mount3; + } + sb->s_root = d_make_root(root); + if (!sb->s_root) { + ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); + ret = -ENOMEM; + goto failed_mount3; + } + + if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; + ext3_orphan_cleanup(sb, es); + EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; + if (needs_recovery) { + ext3_mark_recovery_complete(sb, es); + ext3_msg(sb, KERN_INFO, "recovery complete"); + } + ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + + return 0; + +cantfind_ext3: + if (!silent) + ext3_msg(sb, KERN_INFO, + "error: can't find ext3 filesystem on dev %s.", + sb->s_id); + goto failed_mount; + +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + journal_destroy(sbi->s_journal); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +failed_mount: +#ifdef CONFIG_QUOTA + for (i = 0; i < EXT3_MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext3_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); + return ret; +} + +/* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_commit_interval) + journal->j_commit_interval = sbi->s_commit_interval; + /* We could also set up an ext3-specific default for the commit + * interval here, but for now we'll just fall back to the jbd + * default. */ + + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; + spin_unlock(&journal->j_state_lock); +} + +static journal_t *ext3_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. */ + + journal_inode = ext3_iget(sb, journal_inum); + if (IS_ERR(journal_inode)) { + ext3_msg(sb, KERN_ERR, "error: no journal found"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + ext3_msg(sb, KERN_ERR, "error: journal inode is deleted"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + journal_inode, journal_inode->i_size); + if (!S_ISREG(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: invalid journal inode"); + iput(journal_inode); + return NULL; + } + + journal = journal_init_inode(journal_inode); + if (!journal) { + ext3_msg(sb, KERN_ERR, "error: could not load journal inode"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext3_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext3_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head * bh; + journal_t *journal; + ext3_fsblk_t start; + ext3_fsblk_t len; + int hblock, blocksize; + ext3_fsblk_t sb_block; + unsigned long offset; + struct ext3_super_block * es; + struct block_device *bdev; + + bdev = ext3_blkdev_get(j_dev, sb); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = bdev_logical_block_size(bdev); + if (blocksize < hblock) { + ext3_msg(sb, KERN_ERR, + "error: blocksize too small for journal device"); + goto out_bdev; + } + + sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; + offset = EXT3_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of " + "external journal"); + goto out_bdev; + } + + es = (struct ext3_super_block *) (bh->b_data + offset); + if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { + ext3_msg(sb, KERN_ERR, "error: external journal has " + "bad superblock"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + ext3_msg(sb, KERN_ERR, "error: journal UUID does not match"); + brelse(bh); + goto out_bdev; + } + + len = le32_to_cpu(es->s_blocks_count); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + ext3_msg(sb, KERN_ERR, + "error: failed to create device journal"); + goto out_bdev; + } + journal->j_private = sb; + if (!bh_uptodate_or_lock(journal->j_sb_buffer)) { + if (bh_submit_read(journal->j_sb_buffer)) { + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + ext3_msg(sb, KERN_ERR, + "error: external journal has more than one " + "user (unsupported) - %d", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT3_SB(sb)->journal_bdev = bdev; + ext3_init_journal_params(sb, journal); + return journal; +out_journal: + journal_destroy(journal); +out_bdev: + ext3_blkdev_put(bdev); + return NULL; +} + +static int ext3_load_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + ext3_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_INFO, + "recovery required on readonly filesystem"); + if (really_read_only) { + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, cannot proceed"); + return -EROFS; + } + ext3_msg(sb, KERN_INFO, + "write access will be enabled during recovery"); + } + } + + if (journal_inum && journal_dev) { + ext3_msg(sb, KERN_ERR, "error: filesystem has both journal " + "and inode journals"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext3_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!(journal->j_flags & JFS_BARRIER)) + printk(KERN_INFO "EXT3-fs: barriers not enabled\n"); + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = journal_update_format(journal); + if (err) { + ext3_msg(sb, KERN_ERR, "error updating journal"); + journal_destroy(journal); + return err; + } + } + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) + err = journal_wipe(journal, !really_read_only); + if (!err) + err = journal_load(journal); + + if (err) { + ext3_msg(sb, KERN_ERR, "error loading journal"); + journal_destroy(journal); + return err; + } + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); + + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + } + + return 0; +} + +static int ext3_create_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned int journal_inum) +{ + journal_t *journal; + int err; + + if (sb->s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_ERR, + "error: readonly filesystem when trying to " + "create journal"); + return -EROFS; + } + + journal = ext3_get_journal(sb, journal_inum); + if (!journal) + return -EINVAL; + + ext3_msg(sb, KERN_INFO, "creating new journal on inode %u", + journal_inum); + + err = journal_create(journal); + if (err) { + ext3_msg(sb, KERN_ERR, "error creating journal"); + journal_destroy(journal); + return -EIO; + } + + EXT3_SB(sb)->s_journal = journal; + + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); + + es->s_journal_inum = cpu_to_le32(journal_inum); + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + + return 0; +} + +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, + int sync) +{ + struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + int error = 0; + + if (!sbh) + return error; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext3_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); + es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (buffer_write_io_error(sbh)) { + ext3_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; +} + + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + journal_lock_updates(journal); + if (journal_flush(journal) < 0) + goto out; + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, es, 1); + } + +out: + journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext3_clear_journal_err(struct super_block *sb, + struct ext3_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + journal = EXT3_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext3_error() or ext3_abort() + */ + + j_errno = journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext3_decode_error(sb, j_errno, nbuf); + ext3_warning(sb, __func__, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext3_warning(sb, __func__, "Marking fs in need of " + "filesystem check."); + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + ext3_commit_super (sb, es, 1); + + journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext3_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT3_SB(sb)->s_journal; + ret = ext3_journal_force_commit(journal); + return ret; +} + +static int ext3_sync_fs(struct super_block *sb, int wait) +{ + tid_t target; + + trace_ext3_sync_fs(sb, wait); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); + if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3_SB(sb)->s_journal, target); + } + return 0; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + */ +static int ext3_freeze(struct super_block *sb) +{ + int error = 0; + journal_t *journal; + + if (!(sb->s_flags & MS_RDONLY)) { + journal = EXT3_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + journal_lock_updates(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + error = journal_flush(journal); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + if (error) + goto out; + } + return 0; + +out: + journal_unlock_updates(journal); + return error; +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static int ext3_unfreeze(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) { + /* Reser the needs_recovery flag before the fs is unlocked. */ + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + } + return 0; +} + +static int ext3_remount (struct super_block * sb, int * flags, char * data) +{ + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); + ext3_fsblk_t n_blocks_count = 0; + unsigned long old_sb_flags; + struct ext3_mount_options old_opts; + int enable_quota = 0; + int err; +#ifdef CONFIG_QUOTA + int i; +#endif + + sync_filesystem(sb); + + /* Store the original options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < EXT3_MAXQUOTAS; i++) + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + int j; + + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; +#endif + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + err = -EINVAL; + goto restore_opts; + } + + if (test_opt(sb, ABORT)) + ext3_abort(sb, __func__, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + ext3_init_journal_params(sb, sbi->s_journal); + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > le32_to_cpu(es->s_blocks_count)) { + if (test_opt(sb, ABORT)) { + err = -EROFS; + goto restore_opts; + } + + if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && + (sbi->s_mount_state & EXT3_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + ext3_mark_recovery_complete(sb, es); + } else { + __le32 ret; + if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, + ~EXT3_FEATURE_RO_COMPAT_SUPP))) { + ext3_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR " + "because of unsupported optional " + "features (%x)", le32_to_cpu(ret)); + err = -EROFS; + goto restore_opts; + } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount & mount for now. + */ + if (es->s_last_orphan) { + ext3_msg(sb, KERN_WARNING, "warning: couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount & mount instead."); + err = -EINVAL; + goto restore_opts; + } + + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + ext3_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((err = ext3_group_extend(sb, es, n_blocks_count))) + goto restore_opts; + if (!ext3_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + enable_quota = 1; + } + } +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < EXT3_MAXQUOTAS; i++) + kfree(old_opts.s_qf_names[i]); +#endif + if (enable_quota) + dquot_resume(sb, -1); + return 0; +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < EXT3_MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + return err; +} + +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + u64 fsid; + + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long ngroups = sbi->s_groups_count, i; + ext3_fsblk_t overhead = 0; + smp_rmb(); + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < ngroups; i++) { + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); + cond_resched(); + } + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += ngroups * (2 + sbi->s_itb_per_group); + + /* Add the internal journal blocks as well */ + if (sbi->s_journal && !sbi->journal_bdev) + overhead += sbi->s_journal->j_maxlen; + + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); + } + + buf->f_type = EXT3_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT3_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction before quota file + * is locked for write. Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext3_create() quota_sync() + * journal_start() write_dquot() + * dquot_initialize() down(dqio_mutex) + * down(dqio_mutex) journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; +} + +static int ext3_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext3_journal_start(inode, + EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3_journal_start(dquot_to_inode(dquot), + EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3_journal_start(dquot_to_inode(dquot), + EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? */ + if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext3_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext3_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext3_journal_start(d_inode(sb->s_root), 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext3_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], + EXT3_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->dentry->d_sb != sb) + return -EXDEV; + /* Journaling quota? */ + if (EXT3_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext3_msg(sb, KERN_WARNING, + "warning: Quota file not on filesystem root. " + "Journaled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext3_should_journal_data(d_inode(path->dentry))) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + journal_lock_updates(EXT3_SB(sb)->s_journal); + err = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err) + return err; + } + + return dquot_quota_on(sb, type, format_id, path); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + bh = ext3_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext3_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (!handle) { + ext3_msg(sb, KERN_WARNING, + "warning: quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started.", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); +out: + if (err) + return err; + if (inode->i_size < off + len) { + i_size_write(inode, off + len); + EXT3_I(inode)->i_disksize = inode->i_size; + } + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + return len; +} + +#endif + +static struct dentry *ext3_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super); +} + +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .mount = ext3_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext3"); + +static int __init init_ext3_fs(void) +{ + int err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext3_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + exit_ext3_xattr(); + return err; +} + +static void __exit exit_ext3_fs(void) +{ + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +MODULE_LICENSE("GPL"); +module_init(init_ext3_fs) +module_exit(exit_ext3_fs) diff --git a/kernel/fs/ext3/symlink.c b/kernel/fs/ext3/symlink.c new file mode 100644 index 000000000..ea96df3c5 --- /dev/null +++ b/kernel/fs/ext3/symlink.c @@ -0,0 +1,54 @@ +/* + * linux/fs/ext3/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 symlink handling code + */ + +#include +#include "ext3.h" +#include "xattr.h" + +static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext3_inode_info *ei = EXT3_I(d_inode(dentry)); + nd_set_link(nd, (char*)ei->i_data); + return NULL; +} + +const struct inode_operations ext3_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext3_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext3_follow_link, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/kernel/fs/ext3/xattr.c b/kernel/fs/ext3/xattr.c new file mode 100644 index 000000000..7cf36501c --- /dev/null +++ b/kernel/fs/ext3/xattr.c @@ -0,0 +1,1330 @@ +/* + * linux/fs/ext3/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . + * Ext3 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. + */ + +#include "ext3.h" +#include +#include +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define IHDR(inode, raw_inode) \ + ((struct ext3_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT3_GOOD_OLD_INODE_SIZE + \ + EXT3_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1)) + +#ifdef EXT3_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static void ext3_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext3_xattr_cache_find(struct inode *, + struct ext3_xattr_header *, + struct mb_cache_entry **); +static void ext3_xattr_rehash(struct ext3_xattr_header *, + struct ext3_xattr_entry *); +static int ext3_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); + +static struct mb_cache *ext3_xattr_cache; + +static const struct xattr_handler *ext3_xattr_handler_map[] = { + [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, +#ifdef CONFIG_EXT3_FS_POSIX_ACL + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, +#endif + [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, +#ifdef CONFIG_EXT3_FS_SECURITY + [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext3_xattr_handlers[] = { + &ext3_xattr_user_handler, + &ext3_xattr_trusted_handler, +#ifdef CONFIG_EXT3_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif +#ifdef CONFIG_EXT3_FS_SECURITY + &ext3_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext3_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) + handler = ext3_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * d_inode(dentry)->i_mutex: don't care + */ +ssize_t +ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext3_xattr_list(dentry, buffer, size); +} + +static int +ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext3_xattr_check_block(struct buffer_head *bh) +{ + int error; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + return error; +} + +static inline int +ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 0; +} + +static int +ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext3_xattr_entry *entry; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext3_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3_xattr_check_block(bh)) { +bad_block: ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_xattr_entry *entry; + struct ext3_inode *raw_inode; + struct ext3_iloc iloc; + size_t size; + void *end; + int error; + + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) + return -ENODATA; + error = ext3_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + error = ext3_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext3_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext3_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT3_I(inode)->xattr_sem); + error = ext3_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext3_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT3_I(inode)->xattr_sem); + return error; +} + +static int +ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext3_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3_xattr_check_block(bh)) { + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3_xattr_cache_insert(bh); + error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct ext3_xattr_ibody_header *header; + struct ext3_inode *raw_inode; + struct ext3_iloc iloc; + void *end; + int error; + + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) + return 0; + error = ext3_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + error = ext3_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext3_xattr_list_entries(dentry, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +static int +ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int i_error, b_error; + + down_read(&EXT3_I(d_inode(dentry))->xattr_sem); + i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); + if (i_error < 0) { + b_error = 0; + } else { + if (buffer) { + buffer += i_error; + buffer_size -= i_error; + } + b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); + if (b_error < 0) + i_error = 0; + } + up_read(&EXT3_I(d_inode(dentry))->xattr_sem); + return i_error + b_error; +} + +/* + * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext3_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) + return; + + if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. + */ +static void +ext3_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + int error = 0; + + ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr); + error = ext3_journal_get_write_access(handle, bh); + if (error) + goto out; + + lock_buffer(bh); + + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + ext3_free_blocks(handle, inode, bh->b_blocknr, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + error = ext3_journal_dirty_metadata(handle, bh); + if (IS_SYNC(inode)) + handle->h_sync = 1; + dquot_free_block(inode, 1); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + if (ce) + mb_cache_entry_release(ce); + } + unlock_buffer(bh); +out: + ext3_std_error(inode->i_sb, error); + return; +} + +struct ext3_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext3_xattr_search { + struct ext3_xattr_entry *first; + void *base; + void *end; + struct ext3_xattr_entry *here; + int not_found; +}; + +static int +ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s) +{ + struct ext3_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT3_XATTR_SIZE(size); + } + free += EXT3_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT3_XATTR_LEN(name_len) + + EXT3_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. */ + size_t size = EXT3_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT3_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT3_XATTR_PAD, 0, + EXT3_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT3_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT3_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT3_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT3_XATTR_PAD, 0, + EXT3_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext3_xattr_block_find { + struct ext3_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, + struct ext3_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT3_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext3_xattr_check_block(bs->bh)) { + ext3_error(sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext3_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext3_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext3_xattr_info *i, + struct ext3_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext3_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error = 0; + +#define header(x) ((struct ext3_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + error = ext3_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext3_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext3_xattr_rehash(header(s->base), + s->here); + ext3_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext3_journal_dirty_metadata(handle, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); + journal_release_buffer(handle, bs->bh); + + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext3_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext3_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. */ + error = dquot_alloc_block(inode, 1); + if (error) + goto cleanup; + error = ext3_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext3_journal_dirty_metadata(handle, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext3_fsblk_t goal = ext3_group_first_block_no(sb, + EXT3_I(inode)->i_block_group); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (unlikely(!new_bh)) { +getblk_failed: + ext3_free_blocks(handle, inode, block, 1); + error = -ENOMEM; + goto cleanup; + } + lock_buffer(new_bh); + error = ext3_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext3_xattr_cache_insert(new_bh); + error = ext3_journal_dirty_metadata(handle, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext3_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + dquot_free_block(inode, 1); + goto cleanup; + +bad_block: + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext3_xattr_ibody_find { + struct ext3_xattr_search s; + struct ext3_iloc iloc; +}; + +static int +ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, + struct ext3_xattr_ibody_find *is) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_inode *raw_inode; + int error; + + if (EXT3_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext3_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { + error = ext3_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. */ + error = ext3_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext3_xattr_info *i, + struct ext3_xattr_ibody_find *is) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_xattr_search *s = &is->s; + int error; + + if (EXT3_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext3_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext3_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); + ext3_set_inode_state(inode, EXT3_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext3_clear_inode_state(inode, EXT3_STATE_XATTR); + } + return 0; +} + +/* + * ext3_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext3_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext3_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext3_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT3_I(inode)->xattr_sem); + error = ext3_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; + + error = ext3_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { + struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + } + + error = ext3_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + if (!value) { + if (!is.s.not_found) + error = ext3_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext3_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT3_I(inode)->i_file_acl && !bs.s.base) { + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = ext3_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = ext3_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext3_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = CURRENT_TIME_SEC; + error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext3_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + handle->h_sync = 1; + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + up_write(&EXT3_I(inode)->xattr_sem); + return error; +} + +/* + * ext3_xattr_set() + * + * Like ext3_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext3_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * ext3_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + if (!bh) { + ext3_error(inode->i_sb, __func__, + "inode %lu: block "E3FSBLK" read error", inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + } + ext3_xattr_release_block(handle, inode, bh); + EXT3_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext3_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext3_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext3_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext3_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext3_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. + */ +static int +ext3_xattr_cmp(struct ext3_xattr_header *header1, + struct ext3_xattr_header *header2) +{ + struct ext3_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT3_XATTR_NEXT(entry1); + entry2 = EXT3_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext3_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext3_error(inode->i_sb, __func__, + "inode %lu: block %lu read error", + inode->i_ino, (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT3_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT3_XATTR_REFCOUNT_MAX); + } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext3_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, + struct ext3_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n=0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext3_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext3_xattr_rehash(struct ext3_xattr_header *header, + struct ext3_xattr_entry *entry) +{ + struct ext3_xattr_entry *here; + __u32 hash = 0; + + ext3_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT3_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext3_xattr(void) +{ + ext3_xattr_cache = mb_cache_create("ext3_xattr", 6); + if (!ext3_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext3_xattr(void) +{ + if (ext3_xattr_cache) + mb_cache_destroy(ext3_xattr_cache); + ext3_xattr_cache = NULL; +} diff --git a/kernel/fs/ext3/xattr.h b/kernel/fs/ext3/xattr.h new file mode 100644 index 000000000..32e93ebf8 --- /dev/null +++ b/kernel/fs/ext3/xattr.h @@ -0,0 +1,136 @@ +/* + File: fs/ext3/xattr.h + + On-disk format of extended attributes for the ext3 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +/* Magic value in attribute blocks */ +#define EXT3_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT3_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT3_XATTR_INDEX_USER 1 +#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT3_XATTR_INDEX_TRUSTED 4 +#define EXT3_XATTR_INDEX_LUSTRE 5 +#define EXT3_XATTR_INDEX_SECURITY 6 + +struct ext3_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext3_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext3_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT3_XATTR_PAD_BITS 2 +#define EXT3_XATTR_PAD (1<e_name_len)) ) +#define EXT3_XATTR_SIZE(size) \ + (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) + +# ifdef CONFIG_EXT3_FS_XATTR + +extern const struct xattr_handler ext3_xattr_user_handler; +extern const struct xattr_handler ext3_xattr_trusted_handler; +extern const struct xattr_handler ext3_xattr_security_handler; + +extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); + +extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext3_xattr_delete_inode(handle_t *, struct inode *); +extern void ext3_xattr_put_super(struct super_block *); + +extern int init_ext3_xattr(void); +extern void exit_ext3_xattr(void); + +extern const struct xattr_handler *ext3_xattr_handlers[]; + +# else /* CONFIG_EXT3_FS_XATTR */ + +static inline int +ext3_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext3_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext3_xattr(void) +{ + return 0; +} + +static inline void +exit_ext3_xattr(void) +{ +} + +#define ext3_xattr_handlers NULL + +# endif /* CONFIG_EXT3_FS_XATTR */ + +#ifdef CONFIG_EXT3_FS_SECURITY +extern int ext3_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr); +#else +static inline int ext3_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/kernel/fs/ext3/xattr_security.c b/kernel/fs/ext3/xattr_security.c new file mode 100644 index 000000000..c9506d5e3 --- /dev/null +++ b/kernel/fs/ext3/xattr_security.c @@ -0,0 +1,78 @@ +/* + * linux/fs/ext3/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include "ext3.h" +#include "xattr.h" + +static size_t +ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int +ext3_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +static int ext3_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext3_xattr_set_handle(handle, inode, + EXT3_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; + } + return err; +} + +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext3_initxattrs, handle); +} + +const struct xattr_handler ext3_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext3_xattr_security_list, + .get = ext3_xattr_security_get, + .set = ext3_xattr_security_set, +}; diff --git a/kernel/fs/ext3/xattr_trusted.c b/kernel/fs/ext3/xattr_trusted.c new file mode 100644 index 000000000..206cc66dc --- /dev/null +++ b/kernel/fs/ext3/xattr_trusted.c @@ -0,0 +1,54 @@ +/* + * linux/fs/ext3/xattr_trusted.c + * Handler for trusted extended attributes. + * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include "ext3.h" +#include "xattr.h" + +static size_t +ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int +ext3_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +const struct xattr_handler ext3_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext3_xattr_trusted_list, + .get = ext3_xattr_trusted_get, + .set = ext3_xattr_trusted_set, +}; diff --git a/kernel/fs/ext3/xattr_user.c b/kernel/fs/ext3/xattr_user.c new file mode 100644 index 000000000..021508ad1 --- /dev/null +++ b/kernel/fs/ext3/xattr_user.c @@ -0,0 +1,58 @@ +/* + * linux/fs/ext3/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include "ext3.h" +#include "xattr.h" + +static size_t +ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext3_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext3_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext3_xattr_user_list, + .get = ext3_xattr_user_get, + .set = ext3_xattr_user_set, +}; diff --git a/kernel/fs/ext4/Kconfig b/kernel/fs/ext4/Kconfig new file mode 100644 index 000000000..024f2284d --- /dev/null +++ b/kernel/fs/ext4/Kconfig @@ -0,0 +1,97 @@ +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" + select JBD2 + select CRC16 + select CRYPTO + select CRYPTO_CRC32C + help + This is the next generation of the ext3 filesystem. + + Unlike the change from ext2 filesystem to ext3 filesystem, + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. + + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formatting a new filesystem as an ext4 + filesystem initially. + + To compile this file system support as a module, choose M here. The + module will be called ext4. + + If unsure, say N. + +config EXT4_USE_FOR_EXT23 + bool "Use ext4 for ext2/ext3 file systems" + depends on EXT4_FS + depends on EXT3_FS=n || EXT2_FS=n + default y + help + Allow the ext4 file system driver code to be used for ext2 or + ext3 file system mounts. This allows users to reduce their + compiled kernel size by using one file system driver for + ext2, ext3, and ext4 file systems. + +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext4 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT4_ENCRYPTION + tristate "Ext4 Encryption" + depends on EXT4_FS + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of ext4 files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. + +config EXT4_FS_ENCRYPTION + bool + default y + depends on EXT4_ENCRYPTION + +config EXT4_DEBUG + bool "EXT4 debugging support" + depends on EXT4_FS + help + Enables run-time debugging support for the ext4 filesystem. + + If you select Y here, then you will be able to turn on debugging + with a command such as: + echo 1 > /sys/module/ext4/parameters/mballoc_debug diff --git a/kernel/fs/ext4/Makefile b/kernel/fs/ext4/Makefile new file mode 100644 index 000000000..75285ea9a --- /dev/null +++ b/kernel/fs/ext4/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for the linux ext4-filesystem routines. +# + +obj-$(CONFIG_EXT4_FS) += ext4.o + +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o readpage.o + +ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \ + crypto_key.o crypto_fname.o diff --git a/kernel/fs/ext4/acl.c b/kernel/fs/ext4/acl.c new file mode 100644 index 000000000..69b1e7302 --- /dev/null +++ b/kernel/fs/ext4/acl.c @@ -0,0 +1,283 @@ +/* + * linux/fs/ext4/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext4_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext4_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext4_acl_header *)value)->a_version != + cpu_to_le32(EXT4_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext4_acl_header); + count = ext4_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + ext4_acl_entry *entry = + (ext4_acl_entry *)value; + if ((char *)value + sizeof(ext4_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext4_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext4_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * + sizeof(ext4_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext4_acl_header); + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext4_acl_entry *entry = (ext4_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext4_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(ext4_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_mutex: don't care + */ +struct posix_acl * +ext4_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + retval = ext4_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext4_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext4_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_mutex: down unless called from ext4_new_inode + */ +static int +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return error; + else { + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext4_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext4_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +int +ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext4_journal_start(inode, EXT4_HT_XATTR, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + error = __ext4_set_acl(handle, inode, type, acl); + ext4_journal_stop(handle); + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return error; +} + +/* + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } + return error; +} diff --git a/kernel/fs/ext4/acl.h b/kernel/fs/ext4/acl.h new file mode 100644 index 000000000..da2c79577 --- /dev/null +++ b/kernel/fs/ext4/acl.h @@ -0,0 +1,72 @@ +/* + File: fs/ext4/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT4_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext4_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext4_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext4_acl_header; + +static inline size_t ext4_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext4_acl_header) + + count * sizeof(ext4_acl_entry_short); + } else { + return sizeof(ext4_acl_header) + + 4 * sizeof(ext4_acl_entry_short) + + (count - 4) * sizeof(ext4_acl_entry); + } +} + +static inline int ext4_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext4_acl_header); + s = size - 4 * sizeof(ext4_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext4_acl_entry_short)) + return -1; + return size / sizeof(ext4_acl_entry_short); + } else { + if (s % sizeof(ext4_acl_entry)) + return -1; + return s / sizeof(ext4_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT4_FS_POSIX_ACL + +/* acl.c */ +struct posix_acl *ext4_get_acl(struct inode *inode, int type); +int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT4_FS_POSIX_ACL */ +#include +#define ext4_get_acl NULL +#define ext4_set_acl NULL + +static inline int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ + diff --git a/kernel/fs/ext4/balloc.c b/kernel/fs/ext4/balloc.c new file mode 100644 index 000000000..955bf49a7 --- /dev/null +++ b/kernel/fs/ext4/balloc.c @@ -0,0 +1,880 @@ +/* + * linux/fs/ext4/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" +#include "mballoc.h" + +#include + +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (block - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + +/* + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number + */ +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_grpblk_t offset; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; + if (offsetp) + *offsetp = offset; + if (blockgrpp) + *blockgrpp = blocknr; + +} + +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + + actual_group = ext4_get_group_number(sb, block); + return (actual_group == block_group) ? 1 : 0; +} + +/* Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. + */ +static unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned num_clusters; + int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t itbl_blk; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + num_clusters = ext4_num_base_meta_clusters(sb, block_group); + + /* + * For the allocation bitmaps and inode table, we first need + * to check to see if the block is in the block group. If it + * is, then check to see if the cluster is already accounted + * for in the clusters used for the base metadata cluster, or + * if we can increment the base metadata cluster to include + * that block. Otherwise, we will have to track the cluster + * used for the allocation bitmap or inode table explicitly. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. + */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, + ext4_block_bitmap(sb, gdp) - start); + if (block_cluster < num_clusters) + block_cluster = -1; + else if (block_cluster == num_clusters) { + num_clusters++; + block_cluster = -1; + } + } + + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + ext4_inode_bitmap(sb, gdp) - start); + if (inode_cluster < num_clusters) + inode_cluster = -1; + else if (inode_cluster == num_clusters) { + num_clusters++; + inode_cluster = -1; + } + } + + itbl_blk = ext4_inode_table(sb, gdp); + for (i = 0; i < sbi->s_itb_per_group; i++) { + if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { + c = EXT4_B2C(sbi, itbl_blk + i - start); + if ((c < num_clusters) || (c == inode_cluster) || + (c == block_cluster) || (c == itbl_cluster)) + continue; + if (c == num_clusters) { + num_clusters++; + continue; + } + num_clusters++; + itbl_cluster = c; + } + } + + if (block_cluster != -1) + num_clusters++; + if (inode_cluster != -1) + num_clusters++; + + return num_clusters; +} + +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + unsigned int blocks; + + if (block_group == ext4_get_groups_count(sb) - 1) { + /* + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. + */ + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); +} + +/* Initializes an uninitialized block bitmap */ +static int ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned int bit, bit_max; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; + int flex_bg = 0; + struct ext4_group_info *grp; + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return -EIO; + } + memset(bh->b_data, 0, sb->s_blocksize); + + bit_max = ext4_num_base_meta_clusters(sb, block_group); + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); + + start = ext4_group_first_block_no(sb, block_group); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; + + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + } + + /* + * Also if the number of blocks within the group is less than + * the blocksize * 8 ( which is the size of bitmap ), set rest + * of the block bitmap to 1 + */ + ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), + sb->s_blocksize * 8, bh->b_data); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); + ext4_group_desc_csum_set(sb, block_group, gdp); + return 0; +} + +/* Return the number of free blocks in a block group. It is used when + * the block bitmap is uninitialized, so we can't just count the bits + * in the bitmap. */ +unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + return num_clusters_in_group(sb, block_group) - + ext4_num_overhead_clusters(sb, block_group, gdp); +} + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext4_fill_super). + */ + +/** + * ext4_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head **bh) +{ + unsigned int group_desc; + unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (block_group >= ngroups) { + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); + + return NULL; + } + + group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext4_error(sb, "Group descriptor not loaded - " + "block_group = %u, group_desc = %u, desc = %u", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext4_group_desc *)( + (__u8 *)sbi->s_group_desc[group_desc]->b_data + + offset * EXT4_DESC_SIZE(sb)); + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc; +} + +/* + * Return the block number which was discovered to be invalid, or 0 if + * the block bitmap is valid. + */ +static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t offset; + ext4_grpblk_t next_zero_bit; + ext4_fsblk_t blk; + ext4_fsblk_t group_first_block; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + /* with FLEX_BG, the inode/block bitmaps and itable + * blocks may not be in the group at all + * so the bitmap validation will be skipped for those groups + * or it has to also read the block group where the bitmaps + * are located to verify they are set. + */ + return 0; + } + group_first_block = ext4_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + blk = ext4_block_bitmap(sb, desc); + offset = blk - group_first_block; + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + /* bad block bitmap */ + return blk; + + /* check whether the inode bitmap block number is set */ + blk = ext4_inode_bitmap(sb, desc); + offset = blk - group_first_block; + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + /* bad block bitmap */ + return blk; + + /* check whether the inode table block number is set */ + blk = ext4_inode_table(sb, desc); + offset = blk - group_first_block; + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group), + EXT4_B2C(sbi, offset)); + if (next_zero_bit < + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group)) + /* bad bitmap for inode tables */ + return blk; + return 0; +} + +static void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (buffer_verified(bh)) + return; + + ext4_lock_group(sb, block_group); + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + return; + } + if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, + desc, bh))) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + return; + } + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); +} + +/** + * ext4_read_block_bitmap_nowait() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. + */ +struct buffer_head * +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh; + ext4_fsblk_t bitmap_blk; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = ext4_block_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + + if (bitmap_uptodate(bh)) + goto verify; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + goto verify; + } + ext4_lock_group(sb, block_group); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + int err; + + err = ext4_init_block_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + if (err) + ext4_error(sb, "Checksum bad for grp %u", block_group); + return bh; + } + ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + goto verify; + } + /* + * submit the buffer_head for reading + */ + set_buffer_new(bh); + trace_ext4_read_block_bitmap_load(sb, block_group); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); + return bh; +verify: + ext4_validate_block_bitmap(sb, desc, block_group, bh); + if (buffer_verified(bh)) + return bh; + put_bh(bh); + return NULL; +} + +/* Returns 0 on success, 1 on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_group_desc *desc; + + if (!buffer_new(bh)) + return 0; + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return 1; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext4_error(sb, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); + return 1; + } + clear_buffer_new(bh); + /* Panic or remount fs read-only if block bitmap is invalid */ + ext4_validate_block_bitmap(sb, desc, block_group, bh); + /* ...but check for error just in case errors=continue. */ + return !buffer_verified(bh); +} + +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct buffer_head *bh; + + bh = ext4_read_block_bitmap_nowait(sb, block_group); + if (!bh) + return NULL; + if (ext4_wait_block_bitmap(sb, block_group, bh)) { + put_bh(bh); + return NULL; + } + return bh; +} + +/** + * ext4_has_free_clusters() + * @sbi: in-core super block structure. + * @nclusters: number of needed blocks + * @flags: flags from ext4_mb_new_blocks() + * + * Check if filesystem has nclusters free & available for allocation. + * On success return 1, return 0 on failure. + */ +static int ext4_has_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) +{ + s64 free_clusters, dirty_clusters, rsv, resv_clusters; + struct percpu_counter *fcc = &sbi->s_freeclusters_counter; + struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; + + free_clusters = percpu_counter_read_positive(fcc); + dirty_clusters = percpu_counter_read_positive(dcc); + resv_clusters = atomic64_read(&sbi->s_resv_clusters); + + /* + * r_blocks_count should always be multiple of the cluster ratio so + * we are safe to do a plane bit shift only. + */ + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + + resv_clusters; + + if (free_clusters - (nclusters + rsv + dirty_clusters) < + EXT4_FREECLUSTERS_WATERMARK) { + free_clusters = percpu_counter_sum_positive(fcc); + dirty_clusters = percpu_counter_sum_positive(dcc); + } + /* Check whether we have space after accounting for current + * dirty clusters & root reserved clusters. + */ + if (free_clusters >= (rsv + nclusters + dirty_clusters)) + return 1; + + /* Hm, nope. Are (enough) root reserved clusters available? */ + if (uid_eq(sbi->s_resuid, current_fsuid()) || + (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || + capable(CAP_SYS_RESOURCE) || + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; + } + /* No free blocks. Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { + if (free_clusters >= (nclusters + dirty_clusters)) + return 1; + } + + return 0; +} + +int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) +{ + if (ext4_has_free_clusters(sbi, nclusters, flags)) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); + return 0; + } else + return -ENOSPC; +} + +/** + * ext4_should_retry_alloc() + * @sb: super block + * @retries number of attemps has been made + * + * ext4_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or committing transaction to complete, and then + * return TRUE. + * + * if the total number of retries exceed three times, return FALSE. + */ +int ext4_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || + (*retries)++ > 3 || + !EXT4_SB(sb)->s_journal) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); +} + +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: pointer to total number of clusters needed + * @errp: error code + * + * Return 1st allocated block number on success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned int flags, + unsigned long *count, int *errp) +{ + struct ext4_allocation_request ar; + ext4_fsblk_t ret; + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; + ar.goal = goal; + ar.len = count ? *count : 1; + ar.flags = flags; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) + *count = ar.len; + /* + * Account for the allocated meta blocks. We will never + * fail EDQUOT for metdata, but we do account for it. + */ + if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { + dquot_alloc_block_nofail(inode, + EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); + } + return ret; +} + +/** + * ext4_count_free_clusters() -- count filesystem free clusters + * @sb: superblock + * + * Adds up the number of free clusters from each block group. + */ +ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) +{ + ext4_fsblk_t desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_info *grp; +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + ext4_fsblk_t bitmap_count; + unsigned int x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext4_count_free(bitmap_bh->b_data, + EXT4_CLUSTERS_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_group_clusters(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" + ", computed = %llu, %llu\n", + EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); + } + + return desc_count; +#endif +} + +static inline int test_root(ext4_group_t a, int b) +{ + while (1) { + if (a < b) + return 0; + if (a == b) + return 1; + if ((a % b) != 0) + return 0; + a = a / b; + } +} + +/** + * ext4_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (group == 0) + return 1; + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) { + if (group == le32_to_cpu(es->s_backup_bgs[0]) || + group == le32_to_cpu(es->s_backup_bgs[1])) + return 1; + return 0; + } + if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) + return 1; + if (!(group & 1)) + return 0; + if (test_root(group, 3) || (test_root(group, 5)) || + test_root(group, 7)) + return 1; + + return 0; +} + +static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, + ext4_group_t group) +{ + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); + ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, + ext4_group_t group) +{ + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; +} + +/** + * ext4_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || + metagroup < first_meta_bg) + return ext4_bg_num_gdb_nometa(sb, group); + + return ext4_bg_num_gdb_meta(sb,group); + +} + +/* + * This function returns the number of file system metadata clusters at + * the beginning of a block group, including the reserved gdt blocks. + */ +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned num; + + /* Check for superblock and gdt backups in this group */ + num = ext4_bg_has_super(sb, block_group); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (num) { + num += ext4_bg_num_gdb(sb, block_group); + num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + num += ext4_bg_num_gdb(sb, block_group); + } + return EXT4_NUM_B2C(sbi, num); +} +/** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation + * + * Return the ideal location to start allocating blocks for a + * newly created inode. + */ +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_group_t block_group; + ext4_grpblk_t colour; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/kernel/fs/ext4/bitmap.c b/kernel/fs/ext4/bitmap.c new file mode 100644 index 000000000..4a606afb1 --- /dev/null +++ b/kernel/fs/ext4/bitmap.c @@ -0,0 +1,97 @@ +/* + * linux/fs/ext4/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include "ext4.h" + +unsigned int ext4_count_free(char *bitmap, unsigned int numchars) +{ + return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); +} + +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!ext4_has_metadata_csum(sb)) + return 1; + + provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + return provided == calculated; +} + +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!ext4_has_metadata_csum(sb)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) + gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} + +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + + if (!ext4_has_metadata_csum(sb)) + return 1; + + provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + if (provided == calculated) + return 1; + + return 0; +} + +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!ext4_has_metadata_csum(sb)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) + gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} diff --git a/kernel/fs/ext4/block_validity.c b/kernel/fs/ext4/block_validity.c new file mode 100644 index 000000000..3522340c7 --- /dev/null +++ b/kernel/fs/ext4/block_validity.c @@ -0,0 +1,242 @@ +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init ext4_init_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_system_zone(void) +{ + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. + */ +static int add_system_zone(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *new_entry = NULL, *entry; + struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node *parent = NULL, *new_node = NULL; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + if (start_blk + count > (entry->start_blk + + entry->count)) + entry->count = (start_blk + count - + entry->start_blk); + new_node = *n; + new_entry = rb_entry(new_node, struct ext4_system_zone, + node); + break; + } + } + + if (!new_entry) { + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); + } + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + int first = 1; + + printk(KERN_INFO "System zones: "); + node = rb_first(&sbi->system_blks); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk("%s%llu-%llu", first ? "" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + printk("\n"); +} + +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_group_t i; + int flex_size = ext4_flex_bg_size(sbi); + int ret; + + if (!test_opt(sb, BLOCK_VALIDITY)) { + if (EXT4_SB(sb)->system_blks.rb_node) + ext4_release_system_zone(sb); + return 0; + } + if (EXT4_SB(sb)->system_blks.rb_node) + return 0; + + for (i=0; i < ngroups; i++) { + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), + ext4_bg_num_gdb(sb, i) + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + sbi->s_itb_per_group); + if (ret) + return ret; + } + + if (test_opt(sb, DEBUG)) + debug_print_tree(EXT4_SB(sb)); + return 0; +} + +/* Called when the filesystem is unmounted */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct ext4_system_zone *entry, *n; + + rbtree_postorder_for_each_entry_safe(entry, n, + &EXT4_SB(sb)->system_blks, node) + kmem_cache_free(ext4_system_zone_cachep, entry); + + EXT4_SB(sb)->system_blks = RB_ROOT; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n = sbi->system_blks.rb_node; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + } + return 1; +} + +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} + diff --git a/kernel/fs/ext4/crypto.c b/kernel/fs/ext4/crypto.c new file mode 100644 index 000000000..8ff15273a --- /dev/null +++ b/kernel/fs/ext4/crypto.c @@ -0,0 +1,558 @@ +/* + * linux/fs/ext4/crypto.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption functions for ext4 + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_extents.h" +#include "xattr.h" + +/* Encryption added and removed here! (L: */ + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *ext4_bounce_page_pool; + +static LIST_HEAD(ext4_free_crypto_ctxs); +static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); + +/** + * ext4_release_crypto_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. + */ +void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) +{ + unsigned long flags; + + if (ctx->bounce_page) { + if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) + __free_page(ctx->bounce_page); + else + mempool_free(ctx->bounce_page, ext4_bounce_page_pool); + ctx->bounce_page = NULL; + } + ctx->control_page = NULL; + if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { + if (ctx->tfm) + crypto_free_tfm(ctx->tfm); + kfree(ctx); + } else { + spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); + list_add(&ctx->free_list, &ext4_free_crypto_ctxs); + spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); + } +} + +/** + * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context + * @mask: The allocation mask. + * + * Return: An allocated and initialized encryption context on success. An error + * value or NULL otherwise. + */ +static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask) +{ + struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx), + mask); + + if (!ctx) + return ERR_PTR(-ENOMEM); + return ctx; +} + +/** + * ext4_get_crypto_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) +{ + struct ext4_crypto_ctx *ctx = NULL; + int res = 0; + unsigned long flags; + struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key; + + if (!ext4_read_workqueue) + ext4_init_crypto(); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. + */ + spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); + ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs, + struct ext4_crypto_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); + if (!ctx) { + ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + + /* Allocate a new Crypto API context if we don't already have + * one or if it isn't the right mode. */ + BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID); + if (ctx->tfm && (ctx->mode != key->mode)) { + crypto_free_tfm(ctx->tfm); + ctx->tfm = NULL; + ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; + } + if (!ctx->tfm) { + switch (key->mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + ctx->tfm = crypto_ablkcipher_tfm( + crypto_alloc_ablkcipher("xts(aes)", 0, 0)); + break; + case EXT4_ENCRYPTION_MODE_AES_256_GCM: + /* TODO(mhalcrow): AEAD w/ gcm(aes); + * crypto_aead_setauthsize() */ + ctx->tfm = ERR_PTR(-ENOTSUPP); + break; + default: + BUG(); + } + if (IS_ERR_OR_NULL(ctx->tfm)) { + res = PTR_ERR(ctx->tfm); + ctx->tfm = NULL; + goto out; + } + ctx->mode = key->mode; + } + BUG_ON(key->size != ext4_encryption_key_size(key->mode)); + + /* There shouldn't be a bounce page attached to the crypto + * context at this point. */ + BUG_ON(ctx->bounce_page); + +out: + if (res) { + if (!IS_ERR_OR_NULL(ctx)) + ext4_release_crypto_ctx(ctx); + ctx = ERR_PTR(res); + } + return ctx; +} + +struct workqueue_struct *ext4_read_workqueue; +static DEFINE_MUTEX(crypto_init); + +/** + * ext4_exit_crypto() - Shutdown the ext4 encryption system + */ +void ext4_exit_crypto(void) +{ + struct ext4_crypto_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { + if (pos->bounce_page) { + if (pos->flags & + EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) { + __free_page(pos->bounce_page); + } else { + mempool_free(pos->bounce_page, + ext4_bounce_page_pool); + } + } + if (pos->tfm) + crypto_free_tfm(pos->tfm); + kfree(pos); + } + INIT_LIST_HEAD(&ext4_free_crypto_ctxs); + if (ext4_bounce_page_pool) + mempool_destroy(ext4_bounce_page_pool); + ext4_bounce_page_pool = NULL; + if (ext4_read_workqueue) + destroy_workqueue(ext4_read_workqueue); + ext4_read_workqueue = NULL; +} + +/** + * ext4_init_crypto() - Set up for ext4 encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int ext4_init_crypto(void) +{ + int i, res; + + mutex_lock(&crypto_init); + if (ext4_read_workqueue) + goto already_initialized; + ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); + if (!ext4_read_workqueue) { + res = -ENOMEM; + goto fail; + } + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct ext4_crypto_ctx *ctx; + + ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto fail; + } + list_add(&ctx->free_list, &ext4_free_crypto_ctxs); + } + + ext4_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!ext4_bounce_page_pool) { + res = -ENOMEM; + goto fail; + } +already_initialized: + mutex_unlock(&crypto_init); + return 0; +fail: + ext4_exit_crypto(); + mutex_unlock(&crypto_init); + return res; +} + +void ext4_restore_control_page(struct page *data_page) +{ + struct ext4_crypto_ctx *ctx = + (struct ext4_crypto_ctx *)page_private(data_page); + + set_page_private(data_page, (unsigned long)NULL); + ClearPagePrivate(data_page); + unlock_page(data_page); + ext4_release_crypto_ctx(ctx); +} + +/** + * ext4_crypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void ext4_crypt_complete(struct crypto_async_request *req, int res) +{ + struct ext4_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + EXT4_DECRYPT = 0, + EXT4_ENCRYPT, +} ext4_direction_t; + +static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, + struct inode *inode, + ext4_direction_t rw, + pgoff_t index, + struct page *src_page, + struct page *dest_page) + +{ + u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct ext4_inode_info *ei = EXT4_I(inode); + struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm); + int res = 0; + + BUG_ON(!ctx->tfm); + BUG_ON(ctx->mode != ei->i_encryption_key.mode); + + if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { + printk_ratelimited(KERN_ERR + "%s: unsupported crypto algorithm: %d\n", + __func__, ctx->mode); + return -ENOTSUPP; + } + + crypto_ablkcipher_clear_flags(atfm, ~0); + crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + + res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw, + ei->i_encryption_key.size); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_ablkcipher_setkey() failed\n", + __func__); + return res; + } + req = ablkcipher_request_alloc(atfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_crypt_complete, &ecr); + + BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + EXT4_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); + ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); + if (rw == EXT4_DECRYPT) + res = crypto_ablkcipher_decrypt(req); + else + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res) { + printk_ratelimited( + KERN_ERR + "%s: crypto_ablkcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +/** + * ext4_encrypt() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * ext4_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. + */ +struct page *ext4_encrypt(struct inode *inode, + struct page *plaintext_page) +{ + struct ext4_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return (struct page *) ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_page(GFP_NOFS); + if (!ciphertext_page) { + /* This is a potential bottleneck, but at least we'll have + * forward progress. */ + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS); + if (WARN_ON_ONCE(!ciphertext_page)) { + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS | __GFP_WAIT); + } + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->bounce_page = ciphertext_page; + ctx->control_page = plaintext_page; + err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page); + if (err) { + ext4_release_crypto_ctx(ctx); + return ERR_PTR(err); + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; +} + +/** + * ext4_decrypt() - Decrypts a page in-place + * @ctx: The encryption context. + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return ext4_page_crypto(ctx, page->mapping->host, + EXT4_DECRYPT, page->index, page, page); +} + +/* + * Convenience function which takes care of allocating and + * deallocating the encryption context + */ +int ext4_decrypt_one(struct inode *inode, struct page *page) +{ + int ret; + + struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); + + if (!ctx) + return -ENOMEM; + ret = ext4_decrypt(ctx, page); + ext4_release_crypto_ctx(ctx); + return ret; +} + +int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + struct ext4_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + ext4_lblk_t lblk = ex->ee_block; + ext4_fsblk_t pblk = ext4_ext_pblock(ex); + unsigned int len = ext4_ext_get_actual_len(ex); + int err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); + + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_page(GFP_NOFS); + if (!ciphertext_page) { + /* This is a potential bottleneck, but at least we'll have + * forward progress. */ + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS); + if (WARN_ON_ONCE(!ciphertext_page)) { + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS | __GFP_WAIT); + } + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->bounce_page = ciphertext_page; + + while (len--) { + err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page); + if (err) + goto errout; + + bio = bio_alloc(GFP_KERNEL, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = pblk; + err = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (err) { + bio_put(bio); + goto errout; + } + err = submit_bio_wait(WRITE, bio); + if (err) + goto errout; + } + err = 0; +errout: + ext4_release_crypto_ctx(ctx); + return err; +} + +bool ext4_valid_contents_enc_mode(uint32_t mode) +{ + return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS); +} + +/** + * ext4_validate_encryption_key_size() - Validate the encryption key size + * @mode: The key mode. + * @size: The key size to validate. + * + * Return: The validated key size for @mode. Zero if invalid. + */ +uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) +{ + if (size == ext4_encryption_key_size(mode)) + return size; + return 0; +} diff --git a/kernel/fs/ext4/crypto_fname.c b/kernel/fs/ext4/crypto_fname.c new file mode 100644 index 000000000..fded02f72 --- /dev/null +++ b/kernel/fs/ext4/crypto_fname.c @@ -0,0 +1,719 @@ +/* + * linux/fs/ext4/crypto_fname.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains functions for filename crypto management in ext4 + * + * Written by Uday Savagaonkar, 2014. + * + * This has not yet undergone a rigorous security audit. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_crypto.h" +#include "xattr.h" + +/** + * ext4_dir_crypt_complete() - + */ +static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res) +{ + struct ext4_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +bool ext4_valid_filenames_enc_mode(uint32_t mode) +{ + return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); +} + +/** + * ext4_fname_encrypt() - + * + * This function encrypts the input filename, and returns the length of the + * ciphertext. Errors are returned as negative numbers. We trust the caller to + * allocate sufficient memory to oname string. + */ +static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname) +{ + u32 ciphertext_len; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct crypto_ablkcipher *tfm = ctx->ctfm; + int res = 0; + char iv[EXT4_CRYPTO_BLOCK_SIZE]; + struct scatterlist sg[1]; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + char *workbuf; + + if (iname->len <= 0 || iname->len > ctx->lim) + return -EIO; + + ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? + EXT4_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (ciphertext_len > ctx->lim) + ? ctx->lim : ciphertext_len; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited( + KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_dir_crypt_complete, &ecr); + + /* Map the workpage */ + workbuf = kmap(ctx->workpage); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + if (iname->len < ciphertext_len) + memset(workbuf + iname->len, 0, ciphertext_len - iname->len); + + /* Initialize IV */ + memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_table(sg, 1); + sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); + ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + if (res >= 0) { + /* Copy the result to output */ + memcpy(oname->name, workbuf, ciphertext_len); + res = ciphertext_len; + } + kunmap(ctx->workpage); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited( + KERN_ERR "%s: Error (error code %d)\n", __func__, res); + } + oname->len = ciphertext_len; + return res; +} + +/* + * ext4_fname_decrypt() + * This function decrypts the input filename, and returns + * the length of the plaintext. + * Errors are returned as negative numbers. + * We trust the caller to allocate sufficient memory to oname string. + */ +static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + struct ext4_str tmp_in[2], tmp_out[1]; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist sg[1]; + struct crypto_ablkcipher *tfm = ctx->ctfm; + int res = 0; + char iv[EXT4_CRYPTO_BLOCK_SIZE]; + char *workbuf; + + if (iname->len <= 0 || iname->len > ctx->lim) + return -EIO; + + tmp_in[0].name = iname->name; + tmp_in[0].len = iname->len; + tmp_out[0].name = oname->name; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited( + KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_dir_crypt_complete, &ecr); + + /* Map the workpage */ + workbuf = kmap(ctx->workpage); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + + /* Initialize IV */ + memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_table(sg, 1); + sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); + ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + res = crypto_ablkcipher_decrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + if (res >= 0) { + /* Copy the result to output */ + memcpy(oname->name, workbuf, iname->len); + res = iname->len; + } + kunmap(ctx->workpage); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited( + KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", + __func__, res); + return res; + } + + oname->len = strnlen(oname->name, iname->len); + return oname->len; +} + +static const char *lookup_table = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +/** + * ext4_fname_encode_digest() - + * + * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. + * The encoded string is roughly 4/3 times the size of the input string. + */ +static int digest_encode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + char *cp = dst; + + while (i < len) { + ac += (((unsigned char) src[i]) << bits); + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); + i++; + } + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; +} + +static int digest_decode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + const char *p; + char *cp = dst; + + while (i < len) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + i++; + } + if (ac) + return -1; + return cp - dst; +} + +/** + * ext4_free_fname_crypto_ctx() - + * + * Frees up a crypto context. + */ +void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) +{ + if (ctx == NULL || IS_ERR(ctx)) + return; + + if (ctx->ctfm && !IS_ERR(ctx->ctfm)) + crypto_free_ablkcipher(ctx->ctfm); + if (ctx->htfm && !IS_ERR(ctx->htfm)) + crypto_free_hash(ctx->htfm); + if (ctx->workpage && !IS_ERR(ctx->workpage)) + __free_page(ctx->workpage); + kfree(ctx); +} + +/** + * ext4_put_fname_crypto_ctx() - + * + * Return: The crypto context onto free list. If the free list is above a + * threshold, completely frees up the context, and returns the memory. + * + * TODO: Currently we directly free the crypto context. Eventually we should + * add code it to return to free list. Such an approach will increase + * efficiency of directory lookup. + */ +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) +{ + if (*ctx == NULL || IS_ERR(*ctx)) + return; + ext4_free_fname_crypto_ctx(*ctx); + *ctx = NULL; +} + +/** + * ext4_search_fname_crypto_ctx() - + */ +static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx( + const struct ext4_encryption_key *key) +{ + return NULL; +} + +/** + * ext4_alloc_fname_crypto_ctx() - + */ +struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( + const struct ext4_encryption_key *key) +{ + struct ext4_fname_crypto_ctx *ctx; + + ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); + if (ctx == NULL) + return ERR_PTR(-ENOMEM); + if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) { + /* This will automatically set key mode to invalid + * As enum for ENCRYPTION_MODE_INVALID is zero */ + memset(&ctx->key, 0, sizeof(ctx->key)); + } else { + memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key)); + } + ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode) + ? 0 : 1; + ctx->ctfm_key_is_ready = 0; + ctx->ctfm = NULL; + ctx->htfm = NULL; + ctx->workpage = NULL; + return ctx; +} + +/** + * ext4_get_fname_crypto_ctx() - + * + * Allocates a free crypto context and initializes it to hold + * the crypto material for the inode. + * + * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise. + */ +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( + struct inode *inode, u32 max_ciphertext_len) +{ + struct ext4_fname_crypto_ctx *ctx; + struct ext4_inode_info *ei = EXT4_I(inode); + int res; + + /* Check if the crypto policy is set on the inode */ + res = ext4_encrypted_inode(inode); + if (res == 0) + return NULL; + + if (!ext4_has_encryption_key(inode)) + ext4_generate_encryption_key(inode); + + /* Get a crypto context based on the key. + * A new context is allocated if no context matches the requested key. + */ + ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key)); + if (ctx == NULL) + ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key)); + if (IS_ERR(ctx)) + return ctx; + + ctx->flags = ei->i_crypt_policy_flags; + if (ctx->has_valid_key) { + if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + printk_once(KERN_WARNING + "ext4: unsupported key mode %d\n", + ctx->key.mode); + return ERR_PTR(-ENOKEY); + } + + /* As a first cut, we will allocate new tfm in every call. + * later, we will keep the tfm around, in case the key gets + * re-used */ + if (ctx->ctfm == NULL) { + ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", + 0, 0); + } + if (IS_ERR(ctx->ctfm)) { + res = PTR_ERR(ctx->ctfm); + printk( + KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", + __func__, res); + ctx->ctfm = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->ctfm == NULL) { + printk( + KERN_DEBUG "%s: could not allocate crypto tfm\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + if (ctx->workpage == NULL) + ctx->workpage = alloc_page(GFP_NOFS); + if (IS_ERR(ctx->workpage)) { + res = PTR_ERR(ctx->workpage); + printk( + KERN_DEBUG "%s: error (%d) allocating work page\n", + __func__, res); + ctx->workpage = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->workpage == NULL) { + printk( + KERN_DEBUG "%s: could not allocate work page\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + ctx->lim = max_ciphertext_len; + crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + + /* If we are lucky, we will get a context that is already + * set up with the right key. Else, we will have to + * set the key */ + if (!ctx->ctfm_key_is_ready) { + /* Since our crypto objectives for filename encryption + * are pretty weak, + * we directly use the inode master key */ + res = crypto_ablkcipher_setkey(ctx->ctfm, + ctx->key.raw, ctx->key.size); + if (res) { + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-EIO); + } + ctx->ctfm_key_is_ready = 1; + } else { + /* In the current implementation, key should never be + * marked "ready" for a context that has just been + * allocated. So we should never reach here */ + BUG(); + } + } + if (ctx->htfm == NULL) + ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ctx->htfm)) { + res = PTR_ERR(ctx->htfm); + printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n", + __func__, res); + ctx->htfm = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->htfm == NULL) { + printk(KERN_DEBUG "%s: could not allocate hash tfm\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + + return ctx; +} + +/** + * ext4_fname_crypto_round_up() - + * + * Return: The next multiple of block size + */ +u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) +{ + return ((size+blksize-1)/blksize)*blksize; +} + +/** + * ext4_fname_crypto_namelen_on_disk() - + */ +int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, + u32 namelen) +{ + u32 ciphertext_len; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + + if (ctx == NULL) + return -EIO; + if (!(ctx->has_valid_key)) + return -EACCES; + ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? + EXT4_CRYPTO_BLOCK_SIZE : namelen; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (ciphertext_len > ctx->lim) + ? ctx->lim : ciphertext_len; + return (int) ciphertext_len; +} + +/** + * ext4_fname_crypto_alloc_obuff() - + * + * Allocates an output buffer that is sufficient for the crypto operation + * specified by the context and the direction. + */ +int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, + u32 ilen, struct ext4_str *crypto_str) +{ + unsigned int olen; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + + if (!ctx) + return -EIO; + if (padding < EXT4_CRYPTO_BLOCK_SIZE) + padding = EXT4_CRYPTO_BLOCK_SIZE; + olen = ext4_fname_crypto_round_up(ilen, padding); + crypto_str->len = olen; + if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) + olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; + /* Allocated buffer can hold one more character to null-terminate the + * string */ + crypto_str->name = kmalloc(olen+1, GFP_NOFS); + if (!(crypto_str->name)) + return -ENOMEM; + return 0; +} + +/** + * ext4_fname_crypto_free_buffer() - + * + * Frees the buffer allocated for crypto operation. + */ +void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) +{ + if (!crypto_str) + return; + kfree(crypto_str->name); + crypto_str->name = NULL; +} + +/** + * ext4_fname_disk_to_usr() - converts a filename from disk space to user space + */ +int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + char buf[24]; + int ret; + + if (ctx == NULL) + return -EIO; + if (iname->len < 3) { + /*Check for . and .. */ + if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') { + oname->name[0] = '.'; + oname->name[iname->len-1] = '.'; + oname->len = iname->len; + return oname->len; + } + } + if (ctx->has_valid_key) + return ext4_fname_decrypt(ctx, iname, oname); + + if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { + ret = digest_encode(iname->name, iname->len, oname->name); + oname->len = ret; + return ret; + } + if (hinfo) { + memcpy(buf, &hinfo->hash, 4); + memcpy(buf+4, &hinfo->minor_hash, 4); + } else + memset(buf, 0, 8); + memcpy(buf + 8, iname->name + iname->len - 16, 16); + oname->name[0] = '_'; + ret = digest_encode(buf, 24, oname->name+1); + oname->len = ret + 1; + return ret + 1; +} + +int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, + const struct ext4_dir_entry_2 *de, + struct ext4_str *oname) +{ + struct ext4_str iname = {.name = (unsigned char *) de->name, + .len = de->name_len }; + + return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); +} + + +/** + * ext4_fname_usr_to_disk() - converts a filename from user space to disk space + */ +int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname) +{ + int res; + + if (ctx == NULL) + return -EIO; + if (iname->len < 3) { + /*Check for . and .. */ + if (iname->name[0] == '.' && + iname->name[iname->len-1] == '.') { + oname->name[0] = '.'; + oname->name[iname->len-1] = '.'; + oname->len = iname->len; + return oname->len; + } + } + if (ctx->has_valid_key) { + res = ext4_fname_encrypt(ctx, iname, oname); + return res; + } + /* Without a proper key, a user is not allowed to modify the filenames + * in a directory. Consequently, a user space name cannot be mapped to + * a disk-space name */ + return -EACCES; +} + +/* + * Calculate the htree hash from a filename from user space + */ +int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct dx_hash_info *hinfo) +{ + struct ext4_str tmp; + int ret = 0; + char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; + + if (!ctx || + ((iname->name[0] == '.') && + ((iname->len == 1) || + ((iname->name[1] == '.') && (iname->len == 2))))) { + ext4fs_dirhash(iname->name, iname->len, hinfo); + return 0; + } + + if (!ctx->has_valid_key && iname->name[0] == '_') { + if (iname->len != 33) + return -ENOENT; + ret = digest_decode(iname->name+1, iname->len, buf); + if (ret != 24) + return -ENOENT; + memcpy(&hinfo->hash, buf, 4); + memcpy(&hinfo->minor_hash, buf + 4, 4); + return 0; + } + + if (!ctx->has_valid_key && iname->name[0] != '_') { + if (iname->len > 43) + return -ENOENT; + ret = digest_decode(iname->name, iname->len, buf); + ext4fs_dirhash(buf, ret, hinfo); + return 0; + } + + /* First encrypt the plaintext name */ + ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); + if (ret < 0) + return ret; + + ret = ext4_fname_encrypt(ctx, iname, &tmp); + if (ret >= 0) { + ext4fs_dirhash(tmp.name, tmp.len, hinfo); + ret = 0; + } + + ext4_fname_crypto_free_buffer(&tmp); + return ret; +} + +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de) +{ + int ret = -ENOENT; + int bigname = (*name == '_'); + + if (ctx->has_valid_key) { + if (cstr->name == NULL) { + struct qstr istr; + + ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); + if (ret < 0) + goto errout; + istr.name = name; + istr.len = len; + ret = ext4_fname_encrypt(ctx, &istr, cstr); + if (ret < 0) + goto errout; + } + } else { + if (cstr->name == NULL) { + cstr->name = kmalloc(32, GFP_KERNEL); + if (cstr->name == NULL) + return -ENOMEM; + if ((bigname && (len != 33)) || + (!bigname && (len > 43))) + goto errout; + ret = digest_decode(name+bigname, len-bigname, + cstr->name); + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + cstr->len = ret; + } + if (bigname) { + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + cstr->name + 8, 16); + return (ret == 0) ? 1 : 0; + } + } + if (de->name_len != cstr->len) + return 0; + ret = memcmp(de->name, cstr->name, cstr->len); + return (ret == 0) ? 1 : 0; +errout: + kfree(cstr->name); + cstr->name = NULL; + return ret; +} diff --git a/kernel/fs/ext4/crypto_key.c b/kernel/fs/ext4/crypto_key.c new file mode 100644 index 000000000..52170d0b7 --- /dev/null +++ b/kernel/fs/ext4/crypto_key.c @@ -0,0 +1,166 @@ +/* + * linux/fs/ext4/crypto_key.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions for ext4 + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "xattr.h" + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct ext4_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * ext4_derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivatio. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], + char source_key[EXT4_AES_256_XTS_KEY_SIZE], + char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, + 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_ablkcipher_setkey(tfm, deriving_key, + EXT4_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, + EXT4_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + +out: + if (req) + ablkcipher_request_free(req); + if (tfm) + crypto_free_ablkcipher(tfm); + return res; +} + +/** + * ext4_generate_encryption_key() - generates an encryption key + * @inode: The inode to generate the encryption key for. + */ +int ext4_generate_encryption_key(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; + struct key *keyring_key = NULL; + struct ext4_encryption_key *master_key; + struct ext4_encryption_context ctx; + struct user_key_payload *ukp; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + + if (res != sizeof(ctx)) { + if (res > 0) + res = -EINVAL; + goto out; + } + res = 0; + + ei->i_crypt_policy_flags = ctx.flags; + if (S_ISREG(inode->i_mode)) + crypt_key->mode = ctx.contents_encryption_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + crypt_key->mode = ctx.filenames_encryption_mode; + else { + printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); + BUG(); + } + crypt_key->size = ext4_encryption_key_size(crypt_key->mode); + BUG_ON(!crypt_key->size); + if (DUMMY_ENCRYPTION_ENABLED(sbi)) { + memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + goto out; + } + memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, + EXT4_KEY_DESC_PREFIX_SIZE); + sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, + "%*phN", EXT4_KEY_DESCRIPTOR_SIZE, + ctx.master_key_descriptor); + full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + + (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + if (IS_ERR(keyring_key)) { + res = PTR_ERR(keyring_key); + keyring_key = NULL; + goto out; + } + BUG_ON(keyring_key->type != &key_type_logon); + ukp = ((struct user_key_payload *)keyring_key->payload.data); + if (ukp->datalen != sizeof(struct ext4_encryption_key)) { + res = -EINVAL; + goto out; + } + master_key = (struct ext4_encryption_key *)ukp->data; + BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != + EXT4_KEY_DERIVATION_NONCE_SIZE); + BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw); +out: + if (keyring_key) + key_put(keyring_key); + if (res < 0) + crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID; + return res; +} + +int ext4_has_encryption_key(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + + return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID); +} diff --git a/kernel/fs/ext4/crypto_policy.c b/kernel/fs/ext4/crypto_policy.c new file mode 100644 index 000000000..a6d6291ae --- /dev/null +++ b/kernel/fs/ext4/crypto_policy.c @@ -0,0 +1,198 @@ +/* + * linux/fs/ext4/crypto_policy.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption policy functions for ext4 + * + * Written by Michael Halcrow, 2015. + */ + +#include +#include +#include + +#include "ext4.h" +#include "xattr.h" + +static int ext4_inode_has_encryption_context(struct inode *inode) +{ + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0); + return (res > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int ext4_is_encryption_context_consistent_with_policy( + struct inode *inode, const struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == + policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int ext4_create_encryption_context_from_policy( + struct inode *inode, const struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + int res = 0; + + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE); + if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return -EINVAL; + } + if (policy->flags & ~EXT4_POLICY_FLAGS_VALID) + return -EINVAL; + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), 0); + if (!res) + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + return res; +} + +int ext4_process_policy(const struct ext4_encryption_policy *policy, + struct inode *inode) +{ + if (policy->version != 0) + return -EINVAL; + + if (!ext4_inode_has_encryption_context(inode)) { + if (!ext4_empty_dir(inode)) + return -ENOTEMPTY; + return ext4_create_encryption_context_from_policy(inode, + policy); + } + + if (ext4_is_encryption_context_consistent_with_policy(inode, policy)) + return 0; + + printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", + __func__); + return -EINVAL; +} + +int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return -ENOENT; + if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE); + return 0; +} + +int ext4_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child) +{ + struct ext4_encryption_context parent_ctx, child_ctx; + int res; + + if ((parent == NULL) || (child == NULL)) { + pr_err("parent %p child %p\n", parent, child); + BUG_ON(1); + } + /* no restrictions if the parent directory is not encrypted */ + if (!ext4_encrypted_inode(parent)) + return 1; + res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) + return 0; + /* if the child directory is not encrypted, this is always a problem */ + if (!ext4_encrypted_inode(child)) + return 0; + res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + return (memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode)); +} + +/** + * ext4_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * + * Return: Zero on success, non-zero otherwise + */ +int ext4_inherit_context(struct inode *parent, struct inode *child) +{ + struct ext4_encryption_context ctx; + int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + + if (res != sizeof(ctx)) { + if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + ctx.contents_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, + EXT4_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + goto out; + } + } + get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), 0); +out: + if (!res) + ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); + return res; +} diff --git a/kernel/fs/ext4/dir.c b/kernel/fs/ext4/dir.c new file mode 100644 index 000000000..5665d82d2 --- /dev/null +++ b/kernel/fs/ext4/dir.c @@ -0,0 +1,644 @@ +/* + * linux/fs/ext4/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include +#include +#include +#include "ext4.h" +#include "xattr.h" + +static int ext4_dx_readdir(struct file *, struct dir_context *); + +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) + return 1; + + return 0; +} + +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + * + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. + */ +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, struct file *filp, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, char *buf, int size, + unsigned int offset) +{ + const char *error_msg = NULL; + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + + if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely(((char *) de - buf) + rlen > size)) + error_msg = "directory entry across range"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) + error_msg = "inode out of bounds"; + else + return 0; + + if (filp) + ext4_error_file(filp, function, line, bh->b_blocknr, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset % size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + else + ext4_error_inode(dir, function, line, bh->b_blocknr, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset % size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + + return 1; +} + +static int ext4_readdir(struct file *file, struct dir_context *ctx) +{ + unsigned int offset; + int i; + struct ext4_dir_entry_2 *de; + int err; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; + int dir_has_error = 0; + struct ext4_fname_crypto_ctx *enc_ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + + if (is_dx_dir(inode)) { + err = ext4_dx_readdir(file, ctx); + if (err != ERR_BAD_DX_DIR) { + return err; + } + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. + */ + ext4_clear_inode_flag(file_inode(file), + EXT4_INODE_INDEX); + } + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + err = ext4_read_inline_dir(file, ctx, + &has_inline_data); + if (has_inline_data) + return err; + } + + enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN); + if (IS_ERR(enc_ctx)) + return PTR_ERR(enc_ctx); + if (enc_ctx) { + err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) { + ext4_put_fname_crypto_ctx(&enc_ctx); + return err; + } + } + + offset = ctx->pos & (sb->s_blocksize - 1); + + while (ctx->pos < inode->i_size) { + struct ext4_map_blocks map; + + map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_len = 1; + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err > 0) { + pgoff_t index = map.m_pblk >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&file->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_inode->i_mapping, + &file->f_ra, file, + index, 1); + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + bh = ext4_bread(NULL, inode, map.m_lblk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + } + + if (!bh) { + if (!dir_has_error) { + EXT4_ERROR_FILE(file, 0, + "directory contains a " + "hole at offset %llu", + (unsigned long long) ctx->pos); + dir_has_error = 1; + } + /* corrupt size? Maybe no more blocks to read */ + if (ctx->pos > inode->i_blocks << 9) + break; + ctx->pos += sb->s_blocksize - offset; + continue; + } + + /* Check the checksum */ + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_FILE(file, 0, "directory fails checksum " + "at offset %llu", + (unsigned long long)ctx->pos); + ctx->pos += sb->s_blocksize - offset; + brelse(bh); + bh = NULL; + continue; + } + set_buffer_verified(bh); + + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (file->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext4_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + offset = i; + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) + | offset; + file->f_version = inode->i_version; + } + + while (ctx->pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); + if (ext4_check_dir_entry(inode, file, de, bh, + bh->b_data, bh->b_size, + offset)) { + /* + * On error, skip to the next block + */ + ctx->pos = (ctx->pos | + (sb->s_blocksize - 1)) + 1; + break; + } + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + if (le32_to_cpu(de->inode)) { + if (enc_ctx == NULL) { + /* Directory is not encrypted */ + if (!dir_emit(ctx, de->name, + de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } else { + /* Directory is encrypted */ + err = ext4_fname_disk_to_usr(enc_ctx, + NULL, de, &fname_crypto_str); + if (err < 0) + goto errout; + if (!dir_emit(ctx, + fname_crypto_str.name, err, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } + } + ctx->pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + if ((ctx->pos < inode->i_size) && !dir_relax(inode)) + goto done; + brelse(bh); + bh = NULL; + offset = 0; + } +done: + err = 0; +errout: +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_put_fname_crypto_ctx(&enc_ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); +#endif + brelse(bh); + return err; +} + +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value for dx directories + * + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. + */ +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext4_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT4_HTREE_EOF_32BIT; + else + return EXT4_HTREE_EOF_64BIT; +} + + +/* + * ext4_dir_llseek() calls generic_file_llseek_size to handle htree + * directories, where the "offset" is in terms of the filename hash + * value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond offset limits, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * For non-htree, ext4_llseek already chooses the proper max offset. + */ +static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + loff_t htree_max = ext4_get_htree_eof(file); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, whence, + htree_max, htree_max); + else + return ext4_llseek(file, offset, whence); +} + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + while (fname) { + struct fname *old = fname; + fname = fname->next; + kfree(old); + } + + *root = RB_ROOT; +} + + +static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, + loff_t pos) +{ + struct dir_private_info *p; + + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); + return p; +} + +void ext4_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + * + * When filename encryption is enabled, the dirent will hold the + * encrypted filename, while the htree will hold decrypted filename. + * The decrypted filename is passed in via ent_name. parameter. + */ +int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct ext4_str *ent_name) +{ + struct rb_node **p, *parent = NULL; + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + ent_name->len + 1; + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = ent_name->len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, ent_name->name, ent_name->len); + new_fn->name[ent_name->len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... + */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext4_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static int call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) +{ + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + + if (!fname) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + "called with null fname?!?", __func__, __LINE__, + inode->i_ino, current->comm); + return 0; + } + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); + while (fname) { + if (!dir_emit(ctx, fname->name, + fname->name_len, + fname->inode, + get_dtype(sb, fname->file_type))) { + info->extra_fname = fname; + return 1; + } + fname = fname->next; + } + return 0; +} + +static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) +{ + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct fname *fname; + int ret; + + if (!info) { + info = ext4_htree_create_dir_info(file, ctx->pos); + if (!info) + return -ENOMEM; + file->private_data = info; + } + + if (ctx->pos == ext4_get_htree_eof(file)) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != ctx->pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (call_filldir(file, ctx, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. + */ + if ((!info->curr_node) || + (file->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + file->f_version = inode->i_version; + ret = ext4_htree_fill_tree(file, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + ctx->pos = ext4_get_htree_eof(file); + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(file, ctx, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + ctx->pos = ext4_get_htree_eof(file); + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = ctx->pos; + return 0; +} + +static int ext4_release_dir(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, + int buf_size) +{ + struct ext4_dir_entry_2 *de; + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size; + while ((char *) de < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EIO; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -EIO; + + return 0; +} + +const struct file_operations ext4_dir_operations = { + .llseek = ext4_dir_llseek, + .read = generic_read_dir, + .iterate = ext4_readdir, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; diff --git a/kernel/fs/ext4/ext4.h b/kernel/fs/ext4/ext4.h new file mode 100644 index 000000000..9a83f149a --- /dev/null +++ b/kernel/fs/ext4/ext4.h @@ -0,0 +1,2998 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. + */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ + atomic_t count; /* reference counter */ +} ext4_io_end_t; + +struct ext4_io_submit { + int io_op; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. :-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Do not take i_data_sem locking in ext4_map_blocks */ +#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 + +/* Encryption algorithms */ +#define EXT4_ENCRYPTION_MODE_INVALID 0 +#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 +#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 +#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 +#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 + +#include "ext4_crypto.h" + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy) +#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, these refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* on-disk additional length */ + __u16 i_extra_isize; + char i_crypt_policy_flags; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + /* + * Completed IOs that need unwritten extents handling and don't have + * transaction reserved + */ + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + struct work_struct i_rsv_conversion_work; + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Encryption params */ + struct ext4_encryption_key i_encryption_key; +#endif +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. + */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_reserved[100]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 2 + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_resize_flags; /* Flags indicating if there + is a resizer */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_mb_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Encryption */ + uint32_t s_file_encryption_mode; + uint32_t s_dir_encryption_mode; +#endif +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_USR_QUOTA_INO || + ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_BOOT_LOADER_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) +{ + return inode->i_private; +} + +static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) +{ + inode->i_private = io; +} + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +/* + * Returns true if the inode is inode is encrypted + */ +static inline int ext4_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); +#else + return 0; +#endif +} + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. + */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_CACHE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); + +/* crypto_policy.c */ +int ext4_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child); +int ext4_inherit_context(struct inode *parent, struct inode *child); +void ext4_to_hex(char *dst, char *src, size_t src_size); +int ext4_process_policy(const struct ext4_encryption_policy *policy, + struct inode *inode); +int ext4_get_policy(struct inode *inode, + struct ext4_encryption_policy *policy); + +/* crypto.c */ +bool ext4_valid_contents_enc_mode(uint32_t mode); +uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); +extern struct workqueue_struct *ext4_read_workqueue; +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode); +void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); +void ext4_restore_control_page(struct page *data_page); +struct page *ext4_encrypt(struct inode *inode, + struct page *plaintext_page); +int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page); +int ext4_decrypt_one(struct inode *inode, struct page *page); +int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +int ext4_init_crypto(void); +void ext4_exit_crypto(void); +static inline int ext4_sb_has_crypto(struct super_block *sb) +{ + return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); +} +#else +static inline int ext4_init_crypto(void) { return 0; } +static inline void ext4_exit_crypto(void) { } +static inline int ext4_sb_has_crypto(struct super_block *sb) +{ + return 0; +} +#endif + +/* crypto_fname.c */ +bool ext4_valid_filenames_enc_mode(uint32_t mode); +u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); +int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, + u32 ilen, struct ext4_str *crypto_str); +int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, + const struct ext4_str *iname, + struct ext4_str *oname); +int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, + const struct ext4_dir_entry_2 *de, + struct ext4_str *oname); +int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname); +int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct dx_hash_info *hinfo); +int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, + u32 namelen); +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de); + + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, + u32 max_len); +void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); +#else +static inline +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { } +static inline +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, + u32 max_len) +{ + return NULL; +} +static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } +#endif + + +/* crypto_key.c */ +int ext4_generate_encryption_key(struct inode *inode); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +int ext4_has_encryption_key(struct inode *inode); +#else +static inline int ext4_has_encryption_key(struct inode *inode) +{ + return 0; +} +#endif + + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct ext4_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de); +int ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const struct qstr *iname, + const char *name, int namelen); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, int handle_type, + unsigned int line_no, int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + +/* inode.c */ +int ext4_inode_is_fast_symlink(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern void ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_dirent_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern int ext4_empty_dir(struct inode *inode); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern int ext4_calculate_overhead(struct super_block *sb); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); + +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern void ext4_update_dynamic_rev(struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + !EXT4_SB(sb)->s_chksum_driver); + + return (EXT4_SB(sb)->s_chksum_driver != NULL); +} +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +{ + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + else + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !mutex_is_locked(&inode->i_mutex)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline, __u64 start, __u64 len); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages); + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path **, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +#define EXT4_RESIZING 0 +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + +#endif /* __KERNEL__ */ + +#endif /* _EXT4_H */ diff --git a/kernel/fs/ext4/ext4_crypto.h b/kernel/fs/ext4/ext4_crypto.h new file mode 100644 index 000000000..d75159c10 --- /dev/null +++ b/kernel/fs/ext4/ext4_crypto.h @@ -0,0 +1,156 @@ +/* + * linux/fs/ext4/ext4_crypto.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption header content for ext4 + * + * Written by Michael Halcrow, 2015. + */ + +#ifndef _EXT4_CRYPTO_H +#define _EXT4_CRYPTO_H + +#include + +#define EXT4_KEY_DESCRIPTOR_SIZE 8 + +/* Policy provided via an ioctl on the topmost directory */ +struct ext4_encryption_policy { + char version; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; +} __attribute__((__packed__)); + +#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 +#define EXT4_KEY_DERIVATION_NONCE_SIZE 16 + +#define EXT4_POLICY_FLAGS_PAD_4 0x00 +#define EXT4_POLICY_FLAGS_PAD_8 0x01 +#define EXT4_POLICY_FLAGS_PAD_16 0x02 +#define EXT4_POLICY_FLAGS_PAD_32 0x03 +#define EXT4_POLICY_FLAGS_PAD_MASK 0x03 +#define EXT4_POLICY_FLAGS_VALID 0x03 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Reserved + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct ext4_encryption_context { + char format; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; + char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; +} __attribute__((__packed__)); + +/* Encryption parameters */ +#define EXT4_XTS_TWEAK_SIZE 16 +#define EXT4_AES_128_ECB_KEY_SIZE 16 +#define EXT4_AES_256_GCM_KEY_SIZE 32 +#define EXT4_AES_256_CBC_KEY_SIZE 32 +#define EXT4_AES_256_CTS_KEY_SIZE 32 +#define EXT4_AES_256_XTS_KEY_SIZE 64 +#define EXT4_MAX_KEY_SIZE 64 + +#define EXT4_KEY_DESC_PREFIX "ext4:" +#define EXT4_KEY_DESC_PREFIX_SIZE 5 + +struct ext4_encryption_key { + uint32_t mode; + char raw[EXT4_MAX_KEY_SIZE]; + uint32_t size; +}; + +#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 + +struct ext4_crypto_ctx { + struct crypto_tfm *tfm; /* Crypto API context */ + struct page *bounce_page; /* Ciphertext page on write path */ + struct page *control_page; /* Original page on write path */ + struct bio *bio; /* The bio for this context */ + struct work_struct work; /* Work queue for read complete path */ + struct list_head free_list; /* Free list */ + int flags; /* Flags */ + int mode; /* Encryption mode for tfm */ +}; + +struct ext4_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \ + struct ext4_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int ext4_encryption_key_size(int mode) +{ + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + return EXT4_AES_256_XTS_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_GCM: + return EXT4_AES_256_GCM_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_CBC: + return EXT4_AES_256_CBC_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + return EXT4_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4 +#define EXT4_CRYPTO_BLOCK_SIZE 16 +#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32 + +struct ext4_str { + unsigned char *name; + u32 len; +}; + +struct ext4_fname_crypto_ctx { + u32 lim; + char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE]; + struct crypto_ablkcipher *ctfm; + struct crypto_hash *htfm; + struct page *workpage; + struct ext4_encryption_key key; + unsigned flags : 8; + unsigned has_valid_key : 1; + unsigned ctfm_key_is_ready : 1; +}; + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct ext4_encrypted_symlink_data { + __le16 len; + char encrypted_path[1]; +} __attribute__((__packed__)); + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. + */ +static inline u32 encrypted_symlink_data_len(u32 l) +{ + if (l < EXT4_CRYPTO_BLOCK_SIZE) + l = EXT4_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct ext4_encrypted_symlink_data) - 1); +} + +#endif /* _EXT4_CRYPTO_H */ diff --git a/kernel/fs/ext4/ext4_extents.h b/kernel/fs/ext4/ext4_extents.h new file mode 100644 index 000000000..3c9381547 --- /dev/null +++ b/kernel/fs/ext4/ext4_extents.h @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +#ifndef _EXT4_EXTENTS +#define _EXT4_EXTENTS + +#include "ext4.h" + +/* + * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks + * becomes very small, so index split, in-depth growing and + * other hard changes happen much more often. + * This is for debug purposes only. + */ +#define AGGRESSIVE_TEST_ + +/* + * With EXTENTS_STATS defined, the number of blocks and extents + * are collected in the truncate path. They'll be shown at + * umount time. + */ +#define EXTENTS_STATS__ + +/* + * If CHECK_BINSEARCH is defined, then the results of the binary search + * will also be checked by linear search. + */ +#define CHECK_BINSEARCH__ + +/* + * If EXT_STATS is defined then stats numbers are collected. + * These number will be displayed at umount time. + */ +#define EXT_STATS_ + + +/* + * ext4_inode has i_block array (60 bytes total). + * The first 12 bytes store ext4_extent_header; + * the remainder stores an array of ext4_extent. + * For non-inode extent blocks, ext4_extent_tail + * follows the array. + */ + +/* + * This is the extent tail on-disk structure. + * All other extent structures are 12 bytes long. It turns out that + * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which + * covers all valid ext4 block sizes. Therefore, this tail structure can be + * crammed into the end of the block without having to rebalance the tree. + */ +struct ext4_extent_tail { + __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ +}; + +/* + * This is the extent on-disk structure. + * It's used at the bottom of the tree. + */ +struct ext4_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start_lo; /* low 32 bits of physical block */ +}; + +/* + * This is index on-disk structure. + * It's used at all the levels except the bottom. + */ +struct ext4_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf_lo; /* pointer to the physical block of the next * + * level. leaf or next index could be there */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * Each block (leaves and indexes), even inode-stored has header. + */ +struct ext4_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? */ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) + +#define EXT4_EXTENT_TAIL_OFFSET(hdr) \ + (sizeof(struct ext4_extent_header) + \ + (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) + +static inline struct ext4_extent_tail * +find_ext4_extent_tail(struct ext4_extent_header *eh) +{ + return (struct ext4_extent_tail *)(((void *)eh) + + EXT4_EXTENT_TAIL_OFFSET(eh)); +} + +/* + * Array of ext4_ext_path contains path to some extent. + * Creation/lookup routines use it for traversal/splitting/etc. + * Truncate uses it to simulate recursive walking. + */ +struct ext4_ext_path { + ext4_fsblk_t p_block; + __u16 p_depth; + __u16 p_maxdepth; + struct ext4_extent *p_ext; + struct ext4_extent_idx *p_idx; + struct ext4_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this + * particular extent is an initialized extent or an unwritten (i.e. + * preallocated). + * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an + * unwritten extent. + * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an + * unwritten one. In other words, if MSB of ee_len is set, it is an + * unwritten extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an unwritten extent of zero length and + * thus we make it as a special case of initialized extent with 0x8000 length. + * This way we get better extent-to-group alignment for initialized extents. + * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). + */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) +{ + /* We can not have an unwritten extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? + le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + +/* + * ext4_ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * ext4_idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +static inline void ext4_ext_store_pblock(struct ext4_extent *ex, + ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, + ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path); + +#endif /* _EXT4_EXTENTS */ + diff --git a/kernel/fs/ext4/ext4_jbd2.c b/kernel/fs/ext4/ext4_jbd2.c new file mode 100644 index 000000000..d41843181 --- /dev/null +++ b/kernel/fs/ext4/ext4_jbd2.c @@ -0,0 +1,327 @@ +/* + * Interface between ext4 and JBD + */ + +#include "ext4_jbd2.h" + +#include + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + */ +static int ext4_journal_check_start(struct super_block *sb) +{ + journal_t *journal; + + might_sleep(); + if (sb->s_flags & MS_RDONLY) + return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); + journal = EXT4_SB(sb)->s_journal; + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (journal && is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return -EROFS; + } + return 0; +} + +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks) +{ + journal_t *journal; + int err; + + trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) + return ERR_PTR(err); + + journal = EXT4_SB(sb)->s_journal; + if (!journal) + return ext4_get_nojournal(); + return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, + type, line); +} + +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + + if (!handle->h_transaction) { + err = jbd2_journal_stop(handle); + return handle->h_err ? handle->h_err : err; + } + + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, line, err); + return err; +} + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type) +{ + struct super_block *sb; + int err; + + if (!ext4_handle_valid(handle)) + return ext4_get_nojournal(); + + sb = handle->h_journal->j_private; + trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, + _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) { + jbd2_journal_free_reserved(handle); + return ERR_PTR(err); + } + + err = jbd2_journal_start_reserved(handle, type, line); + if (err < 0) + return ERR_PTR(err); + return handle; +} + +static void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + might_sleep(); + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + } + return err; +} + +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. + */ +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* In the no journal case, we can just do a bforget and return */ + if (!ext4_handle_valid(handle)) { + bforget(bh); + return 0; + } + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + return err; + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_abort(inode->i_sb, where, line, + "error %d when attempting revoke", err); + } + BUFFER_TRACE(bh, "exit"); + return err; +} + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } + return err; +} + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + int err = 0; + + might_sleep(); + + set_buffer_meta(bh); + set_buffer_prio(bh); + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + /* Errors can only happen due to aborted journal or a nasty bug */ + if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + return err; + } + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + } + } else { + if (inode) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + struct ext4_super_block *es; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_block = + cpu_to_le64(bh->b_blocknr); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "IO error syncing itable block"); + err = -EIO; + } + } + } + return err; +} + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb) +{ + struct buffer_head *bh = EXT4_SB(sb)->s_sbh; + int err = 0; + + ext4_superblock_csum_set(sb); + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else + mark_buffer_dirty(bh); + return err; +} diff --git a/kernel/fs/ext4/ext4_jbd2.h b/kernel/fs/ext4/ext4_jbd2.h new file mode 100644 index 000000000..9c5b49fb2 --- /dev/null +++ b/kernel/fs/ext4/ext4_jbd2.h @@ -0,0 +1,450 @@ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext4-specific journaling extensions. + */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include +#include +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree, data block (for each of these we need bitmap + group + * summaries), root which is stored in the inode, sb + */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 20U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only data block */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + 1 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ + +3+DQUOT_INIT_REWRITE) : 0) + +#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ + +3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) + +static inline int ext4_jbd2_credits_xattr(struct inode *inode) +{ + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + return credits; +} + + +/* + * Ext4 handle operation types -- for logging purposes + */ +#define EXT4_HT_MISC 0 +#define EXT4_HT_INODE 1 +#define EXT4_HT_WRITE_PAGE 2 +#define EXT4_HT_MAP_BLOCKS 3 +#define EXT4_HT_DIR 4 +#define EXT4_HT_TRUNCATE 5 +#define EXT4_HT_QUOTA 6 +#define EXT4_HT_RESIZE 7 +#define EXT4_HT_MIGRATE 8 +#define EXT4_HT_MOVE_EXTENTS 9 +#define EXT4_HT_XATTR 10 +#define EXT4_HT_EXT_CONVERT 11 +#define EXT4_HT_MAX 12 + +/** + * struct ext4_journal_cb_entry - Base structure for callback information. + * + * This struct is a 'seed' structure for a using with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as it's first element and pass it to ext4_journal_callback_add(). + */ +struct ext4_journal_cb_entry { + /* list information for other callbacks attached to the same handle */ + struct list_head jce_list; + + /* Function to call with this callback structure */ + void (*jce_func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int error); + + /* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed: + * @sb: superblock of current filesystem for transaction + * @jce: returned journal callback data + * @rc: journal state at commit (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. + * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. + */ +static inline void ext4_journal_callback_add(handle_t *handle, + void (*func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc), + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + /* Add the jce to transaction's private list */ + jce->jce_func = func; + spin_lock(&sbi->s_md_lock); + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + * Return true if object was successfully removed + */ +static inline bool ext4_journal_callback_try_del(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + bool deleted; + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + spin_lock(&sbi->s_md_lock); + deleted = !list_empty(&jce->jce_list); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + return deleted; +} + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext4 calls into JBD. + */ +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb); + +#define ext4_journal_get_write_access(handle, bh) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) +#define ext4_journal_get_create_access(handle, bh) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) +#define ext4_handle_dirty_super(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) + +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); + +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) + +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. */ +static inline int ext4_handle_valid(handle_t *handle) +{ + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +{ + if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) + return 0; + return 1; +} + +#define ext4_journal_start_sb(sb, type, nblocks) \ + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start(inode, type, nblocks) \ + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) + +static inline handle_t *__ext4_journal_start(struct inode *inode, + unsigned int line, int type, + int blocks, int rsv_blocks) +{ + return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + rsv_blocks); +} + +#define ext4_journal_stop(handle) \ + __ext4_journal_stop(__func__, __LINE__, (handle)) + +#define ext4_journal_start_reserved(handle, type) \ + __ext4_journal_start_reserved((handle), __LINE__, (type)) + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type); + +static inline void ext4_journal_free_reserved(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_free_reserved(handle); +} + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks); + return 0; +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_restart(handle, nblocks); + return 0; +} + +static inline int ext4_journal_blocks_per_page(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_page(inode); + return 0; +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + if (journal) + return jbd2_journal_force_commit(journal); + return 0; +} + +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return 0; +} + +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; +} + +static inline int ext4_should_order_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; +} + +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * i_mutex for direct I/O reads. This only works for extent-based + * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. + */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + return 1; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/kernel/fs/ext4/extents.c b/kernel/fs/ext4/extents.c new file mode 100644 index 000000000..e003a1e81 --- /dev/null +++ b/kernel/fs/ext4/extents.c @@ -0,0 +1,5707 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * Architecture independence: + * Copyright (c) 2005, Bull S.A. + * Written by Pierre Peiffer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +/* + * Extents support for EXT4 + * + * TODO: + * - ext4*_error() should be used in some situations + * - analyze all BUG()/BUG_ON(), use -EIO where appropriate + * - smart tree reduction + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "xattr.h" + +#include + +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ + +#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ +#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ + +static __le32 ext4_extent_block_csum(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + EXT4_EXTENT_TAIL_OFFSET(eh)); + return cpu_to_le32(csum); +} + +static int ext4_extent_block_csum_verify(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return 1; + + et = find_ext4_extent_tail(eh); + if (et->et_checksum != ext4_extent_block_csum(inode, eh)) + return 0; + return 1; +} + +static void ext4_extent_block_csum_set(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return; + + et = find_ext4_extent_tail(eh); + et->et_checksum = ext4_extent_block_csum(inode, eh); +} + +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path **ppath, + struct ext4_map_blocks *map, + int split_flag, + int flags); + +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path **ppath, + ext4_lblk_t split, + int split_flag, + int flags); + +static int ext4_find_delayed_extent(struct inode *inode, + struct extent_status *newes); + +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) +{ + int err; + + if (!ext4_handle_valid(handle)) + return 0; + if (handle->h_buffer_credits > needed) + return 0; + err = ext4_journal_extend(handle, needed); + if (err <= 0) + return err; + err = ext4_truncate_restart_trans(handle, inode, needed); + if (err == 0) + err = -EAGAIN; + + return err; +} + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext4_ext_get_access(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + BUFFER_TRACE(path->p_bh, "get_write_access"); + return ext4_journal_get_write_access(handle, path->p_bh); + } + /* path points to leaf/index in inode body */ + /* we use in-core data, no need to protect them */ + return 0; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - EIO + */ +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path) +{ + int err; + + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (path->p_bh) { + ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); + /* path points to block */ + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); + } else { + /* path points to leaf/index in inode body */ + err = ext4_mark_inode_dirty(handle, inode); + } + return err; +} + +static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t block) +{ + if (path) { + int depth = path->p_depth; + struct ext4_extent *ex; + + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object sections out-of-order but in a way + * the eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some hueristics or some way to allow + * userspace to pass a hint to file system, + * especially if the latter case turns out to be + * common. + */ + ex = path[depth].p_ext; + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } + + /* it looks like index is empty; + * try to find starting block from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. use inode's group */ + return ext4_inode_to_goal_block(inode); +} + +/* + * Allocation for a meta data block + */ +static ext4_fsblk_t +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex, int *err, unsigned int flags) +{ + ext4_fsblk_t goal, newblock; + + goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, err); + return newblock; +} + +static inline int ext4_ext_space_block(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent); +#ifdef AGGRESSIVE_TEST + if (!check && size > 6) + size = 6; +#endif + return size; +} + +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx); +#ifdef AGGRESSIVE_TEST + if (!check && size > 5) + size = 5; +#endif + return size; +} + +static inline int ext4_ext_space_root(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent); +#ifdef AGGRESSIVE_TEST + if (!check && size > 3) + size = 3; +#endif + return size; +} + +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent_idx); +#ifdef AGGRESSIVE_TEST + if (!check && size > 4) + size = 4; +#endif + return size; +} + +static inline int +ext4_force_split_extent_at(handle_t *handle, struct inode *inode, + struct ext4_ext_path **ppath, ext4_lblk_t lblk, + int nofail) +{ + struct ext4_ext_path *path = *ppath; + int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + + return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? + EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | + (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); +} + +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int idxs; + + idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx)); + + /* + * If the new delayed allocation block is contiguous with the + * previous da block, it can share index blocks with the + * previous block, so we only need to allocate a new index + * block every idxs leaf blocks. At ldxs**2 blocks, we need + * an additional index block, and at ldxs**3 blocks, yet + * another index blocks. + */ + if (ei->i_da_metadata_calc_len && + ei->i_da_metadata_calc_last_lblock+1 == lblock) { + int num = 0; + + if ((ei->i_da_metadata_calc_len % idxs) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { + num++; + ei->i_da_metadata_calc_len = 0; + } else + ei->i_da_metadata_calc_len++; + ei->i_da_metadata_calc_last_lblock++; + return num; + } + + /* + * In the worst case we need a new set of index blocks at + * every level of the inode's extent tree. + */ + ei->i_da_metadata_calc_len = 1; + ei->i_da_metadata_calc_last_lblock = lblock; + return ext_depth(inode) + 1; +} + +static int +ext4_ext_max_entries(struct inode *inode, int depth) +{ + int max; + + if (depth == ext_depth(inode)) { + if (depth == 0) + max = ext4_ext_space_root(inode, 1); + else + max = ext4_ext_space_root_idx(inode, 1); + } else { + if (depth == 0) + max = ext4_ext_space_block(inode, 1); + else + max = ext4_ext_space_block_idx(inode, 1); + } + + return max; +} + +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext4_ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; + + if (len == 0 || lblock > last) + return 0; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = ext4_idx_pblock(ext_idx); + + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } + ext++; + entries--; + prev = lblock + len - 1; + } + } else { + struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth, ext4_fsblk_t pblk) +{ + const char *error_msg; + int max = 0; + + if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { + error_msg = "unexpected eh_depth"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + max = ext4_ext_max_entries(inode, depth); + if (unlikely(le16_to_cpu(eh->eh_max) > max)) { + error_msg = "too large eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + if (!ext4_valid_extent_entries(inode, eh, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } + /* Verify checksum on non-root extent tree nodes */ + if (ext_depth(inode) != depth && + !ext4_extent_block_csum_verify(inode, eh)) { + error_msg = "extent tree corrupted"; + goto corrupted; + } + return 0; + +corrupted: + ext4_error_inode(inode, function, line, 0, + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + return -EIO; +} + +#define ext4_ext_check(inode, eh, depth, pblk) \ + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); +} + +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, + struct inode *inode, ext4_fsblk_t pblk, int depth, + int flags) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(inode->i_sb, pblk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); + err = bh_submit_read(bh); + if (err < 0) + goto errout; + } + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) + return bh; + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; + set_buffer_verified(bh); + /* + * If this is a leaf block, cache all of its entries + */ + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { + struct ext4_extent_header *eh = ext_block_hdr(bh); + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, + lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } + } + return bh; +errout: + put_bh(bh); + return ERR_PTR(err); + +} + +#define read_extent_tree_block(inode, pblk, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ + (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + /* Don't cache anything if there are no external extent blocks */ + if (depth == 0) + goto out; + path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx++), + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} + +#ifdef EXT_DEBUG +static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug("path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + } else if (path->p_ext) { + ext_debug(" %d:[%d]%d:%llu ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_unwritten(path->p_ext), + ext4_ext_get_actual_len(path->p_ext), + ext4_ext_pblock(path->p_ext)); + } else + ext_debug(" []"); + } + ext_debug("\n"); +} + +static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) +{ + int depth = ext_depth(inode); + struct ext4_extent_header *eh; + struct ext4_extent *ex; + int i; + + if (!path) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { + ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); + } + ext_debug("\n"); +} + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, + ext4_fsblk_t newblock, int level) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + + if (depth != level) { + struct ext4_extent_idx *idx; + idx = path[level].p_idx; + while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", level, + le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), + newblock); + idx++; + } + + return; + } + + ex = path[depth].p_ext; + while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(ex->ee_block), + ext4_ext_pblock(ex), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + newblock); + ex++; + } +} + +#else +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level) +#endif + +void ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth, i; + + if (!path) + return; + depth = path->p_depth; + for (i = 0; i <= depth; i++, path++) + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +/* + * ext4_ext_binsearch_idx: + * binary search for the closest index of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch_idx(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent_idx *r, *l, *m; + + + ext_debug("binsearch for %u(idx): ", block); + + l = EXT_FIRST_INDEX(eh) + 1; + r = EXT_LAST_INDEX(eh); + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ei_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), + m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); + } + + path->p_idx = l - 1; + ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent_idx *chix, *ix; + int k; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { + if (k != 0 && + le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", + le32_to_cpu(ix->ei_block), + le32_to_cpu(ix[-1].ei_block)); + } + BUG_ON(k && le32_to_cpu(ix->ei_block) + <= le32_to_cpu(ix[-1].ei_block)); + if (block < le32_to_cpu(ix->ei_block)) + break; + chix = ix; + } + BUG_ON(chix != path->p_idx); + } +#endif + +} + +/* + * ext4_ext_binsearch: + * binary search for closest extent of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent *r, *l, *m; + + if (eh->eh_entries == 0) { + /* + * this leaf is empty: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug("binsearch for %u: ", block); + + l = EXT_FIRST_EXTENT(eh) + 1; + r = EXT_LAST_EXTENT(eh); + + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ee_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), + m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); + } + + path->p_ext = l - 1; + ext_debug(" -> %d:%llu:[%d]%d ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_pblock(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), + ext4_ext_get_actual_len(path->p_ext)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent *chex, *ex; + int k; + + chex = ex = EXT_FIRST_EXTENT(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { + BUG_ON(k && le32_to_cpu(ex->ee_block) + <= le32_to_cpu(ex[-1].ee_block)); + if (block < le32_to_cpu(ex->ee_block)) + break; + chex = ex; + } + BUG_ON(chex != path->p_ext); + } +#endif + +} + +int ext4_ext_tree_init(handle_t *handle, struct inode *inode) +{ + struct ext4_extent_header *eh; + + eh = ext_inode_hdr(inode); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); + ext4_mark_inode_dirty(handle, inode); + return 0; +} + +struct ext4_ext_path * +ext4_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path **orig_path, int flags) +{ + struct ext4_extent_header *eh; + struct buffer_head *bh; + struct ext4_ext_path *path = orig_path ? *orig_path : NULL; + short int depth, i, ppos = 0; + int ret; + + eh = ext_inode_hdr(inode); + depth = ext_depth(inode); + + if (path) { + ext4_ext_drop_refs(path); + if (depth > path[0].p_maxdepth) { + kfree(path); + *orig_path = path = NULL; + } + } + if (!path) { + /* account possible depth increase */ + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), + GFP_NOFS); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + path[0].p_maxdepth = depth + 1; + } + path[0].p_hdr = eh; + path[0].p_bh = NULL; + + i = depth; + /* walk through the tree */ + while (i) { + ext_debug("depth %d: num %d, max %d\n", + ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + + ext4_ext_binsearch_idx(inode, path + ppos, block); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = read_extent_tree_block(inode, path[ppos].p_block, --i, + flags); + if (unlikely(IS_ERR(bh))) { + ret = PTR_ERR(bh); + goto err; + } + + eh = ext_block_hdr(bh); + ppos++; + if (unlikely(ppos > depth)) { + put_bh(bh); + EXT4_ERROR_INODE(inode, + "ppos %d > depth %d", ppos, depth); + ret = -EIO; + goto err; + } + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + } + + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + /* find extent */ + ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + + ext4_ext_show_path(inode, path); + + return path; + +err: + ext4_ext_drop_refs(path); + kfree(path); + if (orig_path) + *orig_path = NULL; + return ERR_PTR(ret); +} + +/* + * ext4_ext_insert_index: + * insert new index [@logical;@ptr] into the block at @curp; + * check where to insert: before @curp or after @curp + */ +static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) +{ + struct ext4_extent_idx *ix; + int len, err; + + err = ext4_ext_get_access(handle, inode, curp); + if (err) + return err; + + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EIO; + } + + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { + /* insert after */ + ext_debug("insert new index %d after: %llu\n", logical, ptr); + ix = curp->p_idx + 1; + } else { + /* insert before */ + ext_debug("insert new index %d before: %llu\n", logical, ptr); + ix = curp->p_idx; + } + + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; + BUG_ON(len < 0); + if (len > 0) { + ext_debug("insert new index %d: " + "move %d indices from 0x%p to 0x%p\n", + logical, len, ix, ix + 1); + memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); + } + + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EIO; + } + + ix->ei_block = cpu_to_le32(logical); + ext4_idx_store_pblock(ix, ptr); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); + + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EIO; + } + + err = ext4_ext_dirty(handle, inode, curp); + ext4_std_error(inode->i_sb, err); + + return err; +} + +/* + * ext4_ext_split: + * inserts new subtree into the path, using free index entry + * at depth @at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extents and index entries (right to the split point) + * into the newly allocated blocks + * - initializes subtree + */ +static int ext4_ext_split(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = ext_depth(inode); + struct ext4_extent_header *neh; + struct ext4_extent_idx *fidx; + int i = at, k, m, a; + ext4_fsblk_t newblock, oldblock; + __le32 border; + ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where to split? */ + /* FIXME: now decision is simplest: at current extent */ + + /* if current leaf will be split, then we should use + * border from split point */ + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EIO; + } + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug("leaf will be split." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } else { + border = newext->ee_block; + ext_debug("leaf will be added." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } + + /* + * If error occurs, then we break processing + * and mark filesystem read-only. index won't + * be inserted and tree will be in consistent + * state. Next mount will repair buffers too. + */ + + /* + * Get array to track all allocated blocks. + * We need this to handle errors and free blocks + * upon them. + */ + ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + + /* allocate all needed blocks */ + ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); + err = -EIO; + goto cleanup; + } + bh = sb_getblk(inode->i_sb, newblock); + if (unlikely(!bh)) { + err = -ENOMEM; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = 0; + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_depth = 0; + + /* move remainder of path[depth] to the new leaf */ + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { + EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EIO; + goto cleanup; + } + /* start copy from next extent */ + m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; + ext4_ext_show_move(inode, path, newblock, depth); + if (m) { + struct ext4_extent *ex; + ex = EXT_FIRST_EXTENT(neh); + memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); + le16_add_cpu(&neh->eh_entries, m); + } + + ext4_extent_block_csum_set(inode, neh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EIO; + goto cleanup; + } + if (k) + ext_debug("create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, newblock); + if (unlikely(!bh)) { + err = -ENOMEM; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = cpu_to_le16(1); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + neh->eh_depth = cpu_to_le16(depth - i); + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + ext4_idx_store_pblock(fidx, oldblock); + + ext_debug("int.index at %d (block %llu): %u -> %llu\n", + i, newblock, le32_to_cpu(border), oldblock); + + /* move remainder of path[i] to the new index block */ + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != + EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EIO; + goto cleanup; + } + /* start copy indexes */ + m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + ext4_ext_show_move(inode, path, newblock, i); + if (m) { + memmove(++fidx, path[i].p_idx, + sizeof(struct ext4_extent_idx) * m); + le16_add_cpu(&neh->eh_entries, m); + } + ext4_extent_block_csum_set(inode, neh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + i); + if (err) + goto cleanup; + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + err = ext4_ext_insert_index(handle, inode, path + at, + le32_to_cpu(border), newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); + } + } + kfree(ablocks); + + return err; +} + +/* + * ext4_ext_grow_indepth: + * implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initializes new top-level, creating index that points to the + * just created block + */ +static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, + unsigned int flags) +{ + struct ext4_extent_header *neh; + struct buffer_head *bh; + ext4_fsblk_t newblock, goal = 0; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + int err = 0; + + /* Try to prepend new index to old one */ + if (ext_depth(inode)) + goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); + if (goal > le32_to_cpu(es->s_first_data_block)) { + flags |= EXT4_MB_HINT_TRY_GOAL; + goal--; + } else + goal = ext4_inode_to_goal_block(inode); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, &err); + if (newblock == 0) + return err; + + bh = sb_getblk(inode->i_sb, newblock); + if (unlikely(!bh)) + return -ENOMEM; + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + goto out; + } + + /* move top-level index/leaf into new block */ + memmove(bh->b_data, EXT4_I(inode)->i_data, + sizeof(EXT4_I(inode)->i_data)); + + /* set size of new block */ + neh = ext_block_hdr(bh); + /* old root could have indexes or leaves + * so calculate e_max right way */ + if (ext_depth(inode)) + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + else + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + ext4_extent_block_csum_set(inode, neh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto out; + + /* Update top-level index: num,max,pointer */ + neh = ext_inode_hdr(inode); + neh->eh_entries = cpu_to_le16(1); + ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); + if (neh->eh_depth == 0) { + /* Root extent block becomes index block */ + neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + EXT_FIRST_INDEX(neh)->ei_block = + EXT_FIRST_EXTENT(neh)->ee_block; + } + ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", + le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + ext4_idx_pblock(EXT_FIRST_INDEX(neh))); + + le16_add_cpu(&neh->eh_depth, 1); + ext4_mark_inode_dirty(handle, inode); +out: + brelse(bh); + + return err; +} + +/* + * ext4_ext_create_new_leaf: + * finds empty index and adds new leaf. + * if no free index is found, then it requests in-depth growing. + */ +static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + unsigned int mb_flags, + unsigned int gb_flags, + struct ext4_ext_path **ppath, + struct ext4_extent *newext) +{ + struct ext4_ext_path *path = *ppath; + struct ext4_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = ext_depth(inode); + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block, + * so subsequent data blocks should be contiguous */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); + if (err) + goto out; + + /* refill path */ + path = ext4_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + ppath, gb_flags); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, mb_flags); + if (err) + goto out; + + /* refill path */ + path = ext4_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + ppath, gb_flags); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; + } + } + +out: + return err; +} + +/* + * search the closest allocated block to the left for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_left(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + int depth, ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EIO; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? le32_to_cpu(ix->ei_block) : 0, + EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, + depth); + return -EIO; + } + } + return 0; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } + + *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; + *phys = ext4_ext_pblock(ex) + ee_len - 1; + return 0; +} + +/* + * search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the largest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_right(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys, + struct ext4_extent **ret_ex) +{ + struct buffer_head *bh = NULL; + struct ext4_extent_header *eh; + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + ext4_fsblk_t block; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EIO; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EIO; + } + } + goto found_extent; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } + + if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { + /* next allocated block in this leaf */ + ex++; + goto found_extent; + } + + /* go up and search for index to the right */ + while (--depth >= 0) { + ix = path[depth].p_idx; + if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) + goto got_index; + } + + /* we've gone up to the root and found no index to the right */ + return 0; + +got_index: + /* we've found index to the right, let's + * follow it and find the closest allocated + * block to the right */ + ix++; + block = ext4_idx_pblock(ix); + while (++depth < path->p_depth) { + /* subtract from p_depth to get proper eh_depth */ + bh = read_extent_tree_block(inode, block, + path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + eh = ext_block_hdr(bh); + ix = EXT_FIRST_INDEX(eh); + block = ext4_idx_pblock(ix); + put_bh(bh); + } + + bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + eh = ext_block_hdr(bh); + ex = EXT_FIRST_EXTENT(eh); +found_extent: + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + *ret_ex = ex; + if (bh) + put_bh(bh); + return 0; +} + +/* + * ext4_ext_next_allocated_block: + * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. + * NOTE: it considers block number from index entry as + * allocated block. Thus, index entries have to be consistent + * with leaves. + */ +ext4_lblk_t +ext4_ext_next_allocated_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return EXT_MAX_BLOCKS; + + while (depth >= 0) { + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext && + path[depth].p_ext != + EXT_LAST_EXTENT(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_ext[1].ee_block); + } else { + /* index */ + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_idx[1].ei_block); + } + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_next_leaf_block: + * returns first allocated block from next leaf or EXT_MAX_BLOCKS + */ +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return EXT_MAX_BLOCKS; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return (ext4_lblk_t) + le32_to_cpu(path[depth].p_idx[1].ei_block); + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_correct_indexes: + * if leaf gets modified and modified extent is first in the leaf, + * then we have to correct all indexes above. + * TODO: do we need to correct tree in all cases? + */ +static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + int depth = ext_depth(inode); + struct ext4_extent *ex; + __le32 border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EIO; + } + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller than current one + */ + k = depth - 1; + border = path[depth].p_ext->ee_block; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + return err; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + break; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + break; + } + + return err; +} + +int +ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, + struct ext4_extent *ex2) +{ + unsigned short ext1_ee_len, ext2_ee_len; + + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) + return 0; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != + le32_to_cpu(ex2->ee_block)) + return 0; + + /* + * To allow future support for preallocated extents to be added + * as an RO_COMPAT feature, refuse to merge to extents if + * this can result in the top bit of ee_len being set. + */ + if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) + return 0; + if (ext4_ext_is_unwritten(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) + return 0; +#ifdef AGGRESSIVE_TEST + if (ext1_ee_len >= 4) + return 0; +#endif + + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) + return 1; + return 0; +} + +/* + * This function tries to merge the "ex" extent to the next extent in the tree. + * It always tries to merge towards right. If you want to merge towards + * left, pass "ex - 1" as argument instead of "ex". + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns + * 1 if they got merged. + */ +static int ext4_ext_try_to_merge_right(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth, len; + int merge_done = 0, unwritten; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + while (ex < EXT_LAST_EXTENT(eh)) { + if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) + break; + /* merge with next extent! */ + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(ex + 1)); + if (unwritten) + ext4_ext_mark_unwritten(ex); + + if (ex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - ex - 1) + * sizeof(struct ext4_extent); + memmove(ex + 1, ex + 2, len); + } + le16_add_cpu(&eh->eh_entries, -1); + merge_done = 1; + WARN_ON(eh->eh_entries == 0); + if (!eh->eh_entries) + EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); + } + + return merge_done; +} + +/* + * This function does a very simple check to see if we can collapse + * an extent tree with a single extent tree leaf block into the inode. + */ +static void ext4_ext_try_to_merge_up(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + size_t s; + unsigned max_root = ext4_ext_space_root(inode, 0); + ext4_fsblk_t blk; + + if ((path[0].p_depth != 1) || + (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || + (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) + return; + + /* + * We need to modify the block allocation bitmap and the block + * group descriptor to release the extent tree block. If we + * can't get the journal credits, give up. + */ + if (ext4_journal_extend(handle, 2)) + return; + + /* + * Copy the extent data up to the inode + */ + blk = ext4_idx_pblock(path[0].p_idx); + s = le16_to_cpu(path[1].p_hdr->eh_entries) * + sizeof(struct ext4_extent_idx); + s += sizeof(struct ext4_extent_header); + + path[1].p_maxdepth = path[0].p_maxdepth; + memcpy(path[0].p_hdr, path[1].p_hdr, s); + path[0].p_depth = 0; + path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); + path[0].p_hdr->eh_max = cpu_to_le16(max_root); + + brelse(path[1].p_bh); + ext4_free_blocks(handle, inode, NULL, blk, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); +} + +/* + * This function tries to merge the @ex extent to neighbours in the tree. + * return 1 if merge left else 0. + */ +static void ext4_ext_try_to_merge(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { + struct ext4_extent_header *eh; + unsigned int depth; + int merge_done = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + if (ex > EXT_FIRST_EXTENT(eh)) + merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + + if (!merge_done) + (void) ext4_ext_try_to_merge_right(inode, path, ex); + + ext4_ext_try_to_merge_up(handle, inode, path); +} + +/* + * check if a portion of the "newext" extent overlaps with an + * existing extent. + * + * If there is an overlap discovered, it updates the length of the newext + * such that there will be no overlap, and then returns 1. + * If there is no overlap found, it returns 0. + */ +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, + struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) +{ + ext4_lblk_t b1, b2; + unsigned int depth, len1; + unsigned int ret = 0; + + b1 = le32_to_cpu(newext->ee_block); + len1 = ext4_ext_get_actual_len(newext); + depth = ext_depth(inode); + if (!path[depth].p_ext) + goto out; + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); + + /* + * get the next allocated block if the extent in the path + * is before the requested block(s) + */ + if (b2 < b1) { + b2 = ext4_ext_next_allocated_block(path); + if (b2 == EXT_MAX_BLOCKS) + goto out; + b2 = EXT4_LBLK_CMASK(sbi, b2); + } + + /* check for wrap through zero on extent logical start block*/ + if (b1 + len1 < b1) { + len1 = EXT_MAX_BLOCKS - b1; + newext->ee_len = cpu_to_le16(len1); + ret = 1; + } + + /* check for overlap */ + if (b1 + len1 > b2) { + newext->ee_len = cpu_to_le16(b2 - b1); + ret = 1; + } +out: + return ret; +} + +/* + * ext4_ext_insert_extent: + * tries to merge requsted extent into the existing extent or + * inserts requested extent as new one into the tree, + * creating new leaf in the no-space case. + */ +int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path **ppath, + struct ext4_extent *newext, int gb_flags) +{ + struct ext4_ext_path *path = *ppath; + struct ext4_extent_header *eh; + struct ext4_extent *ex, *fex; + struct ext4_extent *nearex; /* nearest extent */ + struct ext4_ext_path *npath = NULL; + int depth, len, err; + ext4_lblk_t next; + int mb_flags = 0, unwritten; + + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + mb_flags |= EXT4_MB_DELALLOC_RESERVED; + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + eh = path[depth].p_hdr; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } + + /* try to insert block into found extent and return */ + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { + + /* + * Try to see whether we should rather test the extent on + * right from ex, or from the left of ex. This is because + * ext4_find_extent() can return either extent on the + * left, or on the right from the searched position. This + * will make merging more effective. + */ + if (ex < EXT_LAST_EXTENT(eh) && + (le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex) < + le32_to_cpu(newext->ee_block))) { + ex += 1; + goto prepend; + } else if ((ex > EXT_FIRST_EXTENT(eh)) && + (le32_to_cpu(newext->ee_block) + + ext4_ext_get_actual_len(newext) < + le32_to_cpu(ex->ee_block))) + ex -= 1; + + /* Try to append newex to the ex */ + if (ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %u:[%d]%d" + "(from %llu)\n", + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (unwritten) + ext4_ext_mark_unwritten(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +prepend: + /* Try to prepend newex to the ex */ + if (ext4_can_extents_be_merged(inode, newext, ex)) { + ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + "(from %llu)\n", + le32_to_cpu(newext->ee_block), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_block = newext->ee_block; + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (unwritten) + ext4_ext_mark_unwritten(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + } + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) + goto has_space; + + /* probably next leaf has space for us? */ + fex = EXT_LAST_EXTENT(eh); + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(path); + if (next != EXT_MAX_BLOCKS) { + ext_debug("next leaf block - %u\n", next); + BUG_ON(npath != NULL); + npath = ext4_find_extent(inode, next, NULL, 0); + if (IS_ERR(npath)) + return PTR_ERR(npath); + BUG_ON(npath->p_depth != path->p_depth); + eh = npath[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { + ext_debug("next leaf isn't full(%d)\n", + le16_to_cpu(eh->eh_entries)); + path = npath; + goto has_space; + } + ext_debug("next leaf has no free space(%d,%d)\n", + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + } + + /* + * There is no free space in the found leaf. + * We're gonna add a new leaf in the tree. + */ + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + mb_flags |= EXT4_MB_USE_RESERVED; + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + ppath, newext); + if (err) + goto cleanup; + depth = ext_depth(inode); + eh = path[depth].p_hdr; + +has_space: + nearex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext)); + nearex = EXT_FIRST_EXTENT(eh); + } else { + if (le32_to_cpu(newext->ee_block) + > le32_to_cpu(nearex->ee_block)) { + /* Insert after */ + ext_debug("insert %u:%llu:[%d]%d before: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + nearex); + nearex++; + } else { + /* Insert before */ + BUG_ON(newext->ee_block == nearex->ee_block); + ext_debug("insert %u:%llu:[%d]%d after: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + nearex); + } + len = EXT_LAST_EXTENT(eh) - nearex + 1; + if (len > 0) { + ext_debug("insert %u:%llu:[%d]%d: " + "move %d extents from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + len, nearex, nearex + 1); + memmove(nearex + 1, nearex, + len * sizeof(struct ext4_extent)); + } + } + + le16_add_cpu(&eh->eh_entries, 1); + path[depth].p_ext = nearex; + nearex->ee_block = newext->ee_block; + ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); + nearex->ee_len = newext->ee_len; + +merge: + /* try to merge extents */ + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(handle, inode, path, nearex); + + + /* time to correct all indexes above */ + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto cleanup; + + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + +cleanup: + ext4_ext_drop_refs(npath); + kfree(npath); + return err; +} + +static int ext4_fill_fiemap_extents(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + struct extent_status es; + ext4_lblk_t next, next_del, start = 0, end = 0; + ext4_lblk_t last = block + num; + int exists, depth = 0, err = 0; + unsigned int flags = 0; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + + while (block < last && block != EXT_MAX_BLOCKS) { + num = last - block; + /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); + + path = ext4_find_extent(inode, block, &path, 0); + if (IS_ERR(path)) { + up_read(&EXT4_I(inode)->i_data_sem); + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + if (unlikely(path[depth].p_hdr == NULL)) { + up_read(&EXT4_I(inode)->i_data_sem); + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EIO; + break; + } + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + flags = 0; + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + es.es_lblk = start; + es.es_len = end - start; + es.es_pblk = 0; + } else { + es.es_lblk = le32_to_cpu(ex->ee_block); + es.es_len = ext4_ext_get_actual_len(ex); + es.es_pblk = ext4_ext_pblock(ex); + if (ext4_ext_is_unwritten(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + } + + /* + * Find delayed extent and update es accordingly. We call + * it even in !exists case to find out whether es is the + * last existing extent or not. + */ + next_del = ext4_find_delayed_extent(inode, &es); + if (!exists && next_del) { + exists = 1; + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + } + up_read(&EXT4_I(inode)->i_data_sem); + + if (unlikely(es.es_len == 0)) { + EXT4_ERROR_INODE(inode, "es.es_len == 0"); + err = -EIO; + break; + } + + /* + * This is possible iff next == next_del == EXT_MAX_BLOCKS. + * we need to check next == EXT_MAX_BLOCKS because it is + * possible that an extent is with unwritten and delayed + * status due to when an extent is delayed allocated and + * is allocated by fallocate status tree will track both of + * them in a extent. + * + * So we could return a unwritten and delayed extent, and + * its block is equal to 'next'. + */ + if (next == next_del && next == EXT_MAX_BLOCKS) { + flags |= FIEMAP_EXTENT_LAST; + if (unlikely(next_del != EXT_MAX_BLOCKS || + next != EXT_MAX_BLOCKS)) { + EXT4_ERROR_INODE(inode, + "next extent == %u, next " + "delalloc extent = %u", + next, next_del); + err = -EIO; + break; + } + } + + if (exists) { + err = fiemap_fill_next_extent(fieinfo, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, + flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } + } + + block = es.es_lblk + es.es_len; + } + + ext4_ext_drop_refs(path); + kfree(path); + return err; +} + +/* + * ext4_ext_put_gap_in_cache: + * calculate boundaries of the gap that the requested block fits into + * and cache this gap + */ +static void +ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, + ext4_lblk_t block) +{ + int depth = ext_depth(inode); + ext4_lblk_t len; + ext4_lblk_t lblock; + struct ext4_extent *ex; + struct extent_status es; + + ex = path[depth].p_ext; + if (ex == NULL) { + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCKS; + ext_debug("cache gap(whole file):"); + } else if (block < le32_to_cpu(ex->ee_block)) { + lblock = block; + len = le32_to_cpu(ex->ee_block) - block; + ext_debug("cache gap(before): %u [%u:%u]", + block, + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + ext4_lblk_t next; + lblock = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + + next = ext4_ext_next_allocated_block(path); + ext_debug("cache gap(after): [%u:%u] %u", + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + block); + BUG_ON(next == lblock); + len = next - lblock; + } else { + BUG(); + } + + ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); + if (es.es_len) { + /* There's delayed extent containing lblock? */ + if (es.es_lblk <= lblock) + return; + len = min(es.es_lblk - lblock, len); + } + ext_debug(" -> %u:%u\n", lblock, len); + ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); +} + +/* + * ext4_ext_rm_idx: + * removes index from the index block. + */ +static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, int depth) +{ + int err; + ext4_fsblk_t leaf; + + /* free index block */ + depth--; + path = path + depth; + leaf = ext4_idx_pblock(path->p_idx); + if (unlikely(path->p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + return -EIO; + } + err = ext4_ext_get_access(handle, inode, path); + if (err) + return err; + + if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { + int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + len *= sizeof(struct ext4_extent_idx); + memmove(path->p_idx, path->p_idx + 1, len); + } + + le16_add_cpu(&path->p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path); + if (err) + return err; + ext_debug("index is empty, remove it, free block %llu\n", leaf); + trace_ext4_ext_rm_idx(inode, leaf); + + ext4_free_blocks(handle, inode, NULL, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + + while (--depth >= 0) { + if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) + break; + path--; + err = ext4_ext_get_access(handle, inode, path); + if (err) + break; + path->p_idx->ei_block = (path+1)->p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path); + if (err) + break; + } + return err; +} + +/* + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. + */ +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, + struct ext4_ext_path *path) +{ + if (path) { + int depth = ext_depth(inode); + int ret = 0; + + /* probably there is space in leaf? */ + if (le16_to_cpu(path[depth].p_hdr->eh_entries) + < le16_to_cpu(path[depth].p_hdr->eh_max)) { + + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadata blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; + } + } + + return ext4_chunk_trans_blocks(inode, nrblocks); +} + +/* + * How many index/leaf blocks need to change/allocate to add @extents extents? + * + * If we add a single extent, then in the worse case, each tree level + * index/leaf need to be changed in case of the tree split. + * + * If more extents are inserted, they could cause the whole tree split more + * than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int extents) +{ + int index; + int depth; + + /* If we are converting the inline data, only one is needed here. */ + if (ext4_has_inline_data(inode)) + return 1; + + depth = ext_depth(inode); + + if (extents <= 1) + index = depth * 2; + else + index = depth * 3; + + return index; +} + +static inline int get_default_free_blocks_flags(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + return EXT4_FREE_BLOCKS_FORGET; + return 0; +} + +static int ext4_remove_blocks(handle_t *handle, struct inode *inode, + struct ext4_extent *ex, + long long *partial_cluster, + ext4_lblk_t from, ext4_lblk_t to) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + ext4_fsblk_t pblk; + int flags = get_default_free_blocks_flags(inode); + + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we make a note + * that we tried freeing the cluster, and check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + + trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); + /* + * If we have a partial cluster, and it's different from the + * cluster of the last block, we need to explicitly free the + * partial cluster here. + */ + pblk = ext4_ext_pblock(ex) + ee_len - 1; + if (*partial_cluster > 0 && + *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + +#ifdef EXTENTS_STATS + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); + } +#endif + if (from >= le32_to_cpu(ex->ee_block) + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { + /* tail removal */ + ext4_lblk_t num; + long long first_cluster; + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + pblk = ext4_ext_pblock(ex) + ee_len - num; + /* + * Usually we want to free partial cluster at the end of the + * extent, except for the situation when the cluster is still + * used by any other extent (partial_cluster is negative). + */ + if (*partial_cluster < 0 && + *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + + ext_debug("free last %u blocks starting %llu partial %lld\n", + num, pblk, *partial_cluster); + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + /* + * If the block range to be freed didn't start at the + * beginning of a cluster, and we removed the entire + * extent and the cluster is not used by any other extent, + * save the partial cluster here, since we might need to + * delete if we determine that the truncate or punch hole + * operation has removed all of the blocks in the cluster. + * If that cluster is used by another extent, preserve its + * negative value so it isn't freed later on. + * + * If the whole extent wasn't freed, we've reached the + * start of the truncated/punched region and have finished + * removing blocks. If there's a partial cluster here it's + * shared with the remainder of the extent and is no longer + * a candidate for removal. + */ + if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { + first_cluster = (long long) EXT4_B2C(sbi, pblk); + if (first_cluster != -*partial_cluster) + *partial_cluster = first_cluster; + } else { + *partial_cluster = 0; + } + } else + ext4_error(sbi->s_sb, "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); + return 0; +} + + +/* + * ext4_ext_rm_leaf() Removes the extents associated with the + * blocks appearing between "start" and "end". Both "start" + * and "end" must appear in the same extent or EIO is returned. + * + * @handle: The journal handle + * @inode: The files inode + * @path: The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + * has been released from it. However, if this value is + * negative, it's a cluster just to the right of the + * punched region and it must not be freed. + * @start: The first block to remove + * @end: The last block to remove + */ +static int +ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + long long *partial_cluster, + ext4_lblk_t start, ext4_lblk_t end) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int err = 0, correct_index = 0; + int depth = ext_depth(inode), credits; + struct ext4_extent_header *eh; + ext4_lblk_t a, b; + unsigned num; + ext4_lblk_t ex_ee_block; + unsigned short ex_ee_len; + unsigned unwritten = 0; + struct ext4_extent *ex; + ext4_fsblk_t pblk; + + /* the header must be checked already in ext4_ext_remove_space() */ + ext_debug("truncate since %u in leaf to %u\n", start, end); + if (!path[depth].p_hdr) + path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); + eh = path[depth].p_hdr; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } + /* find where to start removing */ + ex = path[depth].p_ext; + if (!ex) + ex = EXT_LAST_EXTENT(eh); + + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + + while (ex >= EXT_FIRST_EXTENT(eh) && + ex_ee_block + ex_ee_len > start) { + + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; + else + unwritten = 0; + + ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + unwritten, ex_ee_len); + path[depth].p_ext = ex; + + a = ex_ee_block > start ? ex_ee_block : start; + b = ex_ee_block+ex_ee_len - 1 < end ? + ex_ee_block+ex_ee_len - 1 : end; + + ext_debug(" border %u:%u\n", a, b); + + /* If this extent is beyond the end of the hole, skip it */ + if (end < ex_ee_block) { + /* + * We're going to skip this extent and move to another, + * so note that its first cluster is in use to avoid + * freeing it when removing blocks. Eventually, the + * right edge of the truncated/punched region will + * be just to the left. + */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex); + *partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); + } + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + continue; + } else if (b != ex_ee_block + ex_ee_len - 1) { + EXT4_ERROR_INODE(inode, + "can not handle truncate %u:%u " + "on extent %u:%u", + start, end, ex_ee_block, + ex_ee_block + ex_ee_len - 1); + err = -EIO; + goto out; + } else if (a != ex_ee_block) { + /* remove tail of the extent */ + num = a - ex_ee_block; + } else { + /* remove whole extent: excellent! */ + num = 0; + } + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); + if (ex == EXT_FIRST_EXTENT(eh)) { + correct_index = 1; + credits += (ext_depth(inode)) + 1; + } + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + if (err) + goto out; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + err = ext4_remove_blocks(handle, inode, ex, partial_cluster, + a, b); + if (err) + goto out; + + if (num == 0) + /* this extent is removed; mark slot entirely unused */ + ext4_ext_store_pblock(ex, 0); + + ex->ee_len = cpu_to_le16(num); + /* + * Do not mark unwritten if all the blocks in the + * extent have been removed. + */ + if (unwritten && num) + ext4_ext_mark_unwritten(ex); + /* + * If the extent was completely released, + * we need to remove it from the leaf + */ + if (num == 0) { + if (end != EXT_MAX_BLOCKS - 1) { + /* + * For hole punching, we need to scoot all the + * extents up when an extent is removed so that + * we dont have blank extents in the middle + */ + memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * + sizeof(struct ext4_extent)); + + /* Now get rid of the one at the end */ + memset(EXT_LAST_EXTENT(eh), 0, + sizeof(struct ext4_extent)); + } + le16_add_cpu(&eh->eh_entries, -1); + } + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, + ext4_ext_pblock(ex)); + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + } + + if (correct_index && eh->eh_entries) + err = ext4_ext_correct_indexes(handle, inode, path); + + /* + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If it is shared with the current extent + * we zero partial_cluster because we've reached the start of the + * truncated/punched region and we're done removing blocks. + */ + if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); + } + *partial_cluster = 0; + } + + /* if this leaf is free, then we should + * remove it from index block above */ + if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) + err = ext4_ext_rm_idx(handle, inode, path, depth); + +out: + return err; +} + +/* + * ext4_ext_more_to_rm: + * returns 1 if current index has to be freed (even partial) + */ +static int +ext4_ext_more_to_rm(struct ext4_ext_path *path) +{ + BUG_ON(path->p_idx == NULL); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened, it wasn't partial, + * so we have to consider current index for truncation + */ + if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) + return 0; + return 1; +} + +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int depth = ext_depth(inode); + struct ext4_ext_path *path = NULL; + long long partial_cluster = 0; + handle_t *handle; + int i = 0, err = 0; + + ext_debug("truncate since %u to %u\n", start, end); + + /* probably first extent we're gonna free will be last in block */ + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +again: + trace_ext4_ext_remove_space(inode, start, end, depth); + + /* + * Check if we are removing extents inside the extent tree. If that + * is the case, we are going to punch a hole inside the extent tree + * so we have to check whether we need to split the extent covering + * the last block to remove so we can easily remove the part of it + * in ext4_ext_rm_leaf(). + */ + if (end < EXT_MAX_BLOCKS - 1) { + struct ext4_extent *ex; + ext4_lblk_t ee_block, ex_end, lblk; + ext4_fsblk_t pblk; + + /* find extent for or closest extent to this block */ + path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path)) { + ext4_journal_stop(handle); + return PTR_ERR(path); + } + depth = ext_depth(inode); + /* Leaf not may not exist only if inode has no blocks at all */ + ex = path[depth].p_ext; + if (!ex) { + if (depth) { + EXT4_ERROR_INODE(inode, + "path[%d].p_hdr == NULL", + depth); + err = -EIO; + } + goto out; + } + + ee_block = le32_to_cpu(ex->ee_block); + ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; + + /* + * See if the last block is inside the extent, if so split + * the extent at 'end' block so we can easily remove the + * tail of the first part of the split extent in + * ext4_ext_rm_leaf(). + */ + if (end >= ee_block && end < ex_end) { + + /* + * If we're going to split the extent, note that + * the cluster containing the block after 'end' is + * in use to avoid freeing it when removing blocks. + */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex) + end - ee_block + 2; + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); + } + + /* + * Split the extent in two so that 'end' is the last + * block in the first new extent. Also we should not + * fail removing space due to ENOSPC so try to use + * reserved block if that happens. + */ + err = ext4_force_split_extent_at(handle, inode, &path, + end + 1, 1); + if (err < 0) + goto out; + + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + /* + * If there's an extent to the right its first cluster + * contains the immediate right boundary of the + * truncated/punched region. Set partial_cluster to + * its negative value so it won't be freed if shared + * with the current extent. The end < ee_block case + * is handled in ext4_ext_rm_leaf(). + */ + lblk = ex_end + 1; + err = ext4_ext_search_right(inode, path, &lblk, &pblk, + &ex); + if (err) + goto out; + if (pblk) + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); + } + } + /* + * We start scanning from right side, freeing all the blocks + * after i_size and walking into the tree depth-wise. + */ + depth = ext_depth(inode); + if (path) { + int k = i = depth; + while (--k > 0) + path[k].p_block = + le16_to_cpu(path[k].p_hdr->eh_entries)+1; + } else { + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } + path[0].p_maxdepth = path[0].p_depth = depth; + path[0].p_hdr = ext_inode_hdr(inode); + i = 0; + + if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { + err = -EIO; + goto out; + } + } + err = 0; + + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext4_ext_rm_leaf(handle, inode, path, + &partial_cluster, start, + end); + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + ext_debug("initialize header\n"); + path[i].p_hdr = ext_block_hdr(path[i].p_bh); + } + + if (!path[i].p_idx) { + /* this level hasn't been touched yet */ + path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; + ext_debug("init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, + le16_to_cpu(path[i].p_hdr->eh_entries)); + } else { + /* we were already here, see at next index */ + path[i].p_idx--; + } + + ext_debug("level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext4_ext_more_to_rm(path + i)) { + struct buffer_head *bh; + /* go to the next level */ + ext_debug("move to level %d (block %llu)\n", + i + 1, ext4_idx_pblock(path[i].p_idx)); + memset(path + i + 1, 0, sizeof(*path)); + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx), depth - i - 1, + EXT4_EX_NOCACHE); + if (IS_ERR(bh)) { + /* should we reset i_size? */ + err = PTR_ERR(bh); + break; + } + /* Yield here to deal with large extent trees. + * Should be a no-op if we did IO above. */ + cond_resched(); + if (WARN_ON(i + 1 > depth)) { + err = -EIO; + break; + } + path[i + 1].p_bh = bh; + + /* save actual number of indexes since this + * number is changed at the next iteration */ + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); + i++; + } else { + /* we finished processing this index, go up */ + if (path[i].p_hdr->eh_entries == 0 && i > 0) { + /* index is empty, remove it; + * handle must be already prepared by the + * truncatei_leaf() */ + err = ext4_ext_rm_idx(handle, inode, path, i); + } + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + ext_debug("return to level %d\n", i); + } + } + + trace_ext4_ext_remove_space_done(inode, start, end, depth, + partial_cluster, path->p_hdr->eh_entries); + + /* + * If we still have something in the partial cluster and we have removed + * even the first extent, then we should free the blocks in the partial + * cluster as well. (This code will only run when there are no leaves + * to the immediate left of the truncated/punched region.) + */ + if (partial_cluster > 0 && err == 0) { + /* don't zero partial_cluster since it's not used afterwards */ + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->eh_entries == 0) { + /* + * truncate to zero freed all the tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root(inode, 0)); + err = ext4_ext_dirty(handle, inode, path); + } + } +out: + ext4_ext_drop_refs(path); + kfree(path); + path = NULL; + if (err == -EAGAIN) + goto again; + ext4_journal_stop(handle); + + return err; +} + +/* + * called at mount time + */ +void ext4_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled" +#ifdef AGGRESSIVE_TEST + ", aggressive tests" +#endif +#ifdef CHECK_BINSEARCH + ", check binsearch" +#endif +#ifdef EXTENTS_STATS + ", stats" +#endif + "\n"); +#endif +#ifdef EXTENTS_STATS + spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); + EXT4_SB(sb)->s_ext_min = 1 << 30; + EXT4_SB(sb)->s_ext_max = 0; +#endif + } +} + +/* + * called at umount time + */ +void ext4_ext_release(struct super_block *sb) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return; + +#ifdef EXTENTS_STATS + if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", + sbi->s_ext_blocks, sbi->s_ext_extents, + sbi->s_ext_blocks / sbi->s_ext_extents); + printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", + sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); + } +#endif +} + +static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + +/* FIXME!! we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + int ret; + + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, ex); + + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + +/* + * ext4_split_extent_at() splits an extent at given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is splitted. + * @split_flags: indicates if the extent could be zeroout if split fails, and + * the states(init or unwritten) of new extents. + * @flags: flags used to insert new extent to extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are deterimined by split_flag. + * + * There are two cases: + * a> the extent are splitted into two extent. + * b> split is not needed, and just mark the extent. + * + * return 0 on success. + */ +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path **ppath, + ext4_lblk_t split, + int split_flag, + int flags) +{ + struct ext4_ext_path *path = *ppath; + ext4_fsblk_t newblock; + ext4_lblk_t ee_block; + struct ext4_extent *ex, newex, orig_ex, zero_ex; + struct ext4_extent *ex2 = NULL; + unsigned int ee_len, depth; + int err = 0; + + BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == + (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); + + ext_debug("ext4_split_extents_at: inode %lu, logical" + "block %llu\n", inode->i_ino, (unsigned long long)split); + + ext4_ext_show_leaf(inode, path); + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + newblock = split - ee_block + ext4_ext_pblock(ex); + + BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_unwritten(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + if (split == ee_block) { + /* + * case b: block @split is the block that the extent begins with + * then we just change the state of the extent, and splitting + * is not needed. + */ + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); + else + ext4_ext_mark_initialized(ex); + + if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(handle, inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + goto out; + } + + /* case a */ + memcpy(&orig_ex, ex, sizeof(orig_ex)); + ex->ee_len = cpu_to_le16(split - ee_block); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); + + /* + * path may lead to new leaf, not to original leaf any more + * after ext4_ext_insert_extent() returns, + */ + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto fix_extent_len; + + ex2 = &newex; + ex2->ee_block = cpu_to_le32(split); + ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); + ext4_ext_store_pblock(ex2, newblock); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); + + err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); + if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { + if (split_flag & EXT4_EXT_DATA_VALID1) { + err = ext4_ext_zeroout(inode, ex2); + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex2)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { + err = ext4_ext_zeroout(inode, ex); + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { + err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(&orig_ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } + + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_zeroout_es(inode, &zero_ex); + + goto out; + } else if (err) + goto fix_extent_len; + +out: + ext4_ext_show_leaf(inode, path); + return err; + +fix_extent_len: + ex->ee_len = orig_ex.ee_len; + ext4_ext_dirty(handle, inode, path + path->p_depth); + return err; +} + +/* + * ext4_split_extents() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (up to three) + * There are three possibilities: + * a> There is no split required + * b> Splits in two extents: Split is happening at either end of the extent + * c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path **ppath, + struct ext4_map_blocks *map, + int split_flag, + int flags) +{ + struct ext4_ext_path *path = *ppath; + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len, depth; + int err = 0; + int unwritten; + int split_flag1, flags1; + int allocated = map->m_len; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + unwritten = ext4_ext_is_unwritten(ex); + + if (map->m_lblk + map->m_len < ee_block + ee_len) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; + flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + if (unwritten) + split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + if (split_flag & EXT4_EXT_DATA_VALID2) + split_flag1 |= EXT4_EXT_DATA_VALID1; + err = ext4_split_extent_at(handle, inode, ppath, + map->m_lblk + map->m_len, split_flag1, flags1); + if (err) + goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); + } + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + unwritten = ext4_ext_is_unwritten(ex); + split_flag1 = 0; + + if (map->m_lblk >= ee_block) { + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (unwritten) { + split_flag1 |= EXT4_EXT_MARK_UNWRIT1; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNWRIT2); + } + err = ext4_split_extent_at(handle, inode, ppath, + map->m_lblk, split_flag1, flags); + if (err) + goto out; + } + + ext4_ext_show_leaf(inode, path); +out: + return err ? err : allocated; +} + +/* + * This function is called by ext4_ext_map_blocks() if someone tries to write + * to an unwritten extent. It may result in splitting the unwritten + * extent into multiple extents (up to three - one initialized and two + * unwritten). + * There are three possibilities: + * a> There is no split required: Entire extent should be initialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * Pre-conditions: + * - The extent pointed to by 'path' is unwritten. + * - The extent pointed to by 'path' contains a superset + * of the logical span [map->m_lblk, map->m_lblk + map->m_len). + * + * Post-conditions on success: + * - the returned value is the number of blocks beyond map->l_lblk + * that are allocated and initialized. + * It is guaranteed to be >= map->m_len. + */ +static int ext4_ext_convert_to_initialized(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path **ppath, + int flags) +{ + struct ext4_ext_path *path = *ppath; + struct ext4_sb_info *sbi; + struct ext4_extent_header *eh; + struct ext4_map_blocks split_map; + struct ext4_extent zero_ex; + struct ext4_extent *ex, *abut_ex; + ext4_lblk_t ee_block, eof_block; + unsigned int ee_len, depth, map_len = map->m_len; + int allocated = 0, max_zeroout = 0; + int err = 0; + int split_flag = 0; + + ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map_len); + + sbi = EXT4_SB(inode->i_sb); + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map_len) + eof_block = map->m_lblk + map_len; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + zero_ex.ee_len = 0; + + trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); + + /* Pre-conditions */ + BUG_ON(!ext4_ext_is_unwritten(ex)); + BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); + + /* + * Attempt to transfer newly initialized blocks from the currently + * unwritten extent to its neighbor. This is much cheaper + * than an insertion followed by a merge as those involve costly + * memmove() calls. Transferring to the left is the common case in + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) + * followed by append writes. + * + * Limitations of the current logic: + * - L1: we do not deal with writes covering the whole extent. + * This would require removing the extent if the transfer + * is possible. + * - L2: we only attempt to merge with an extent stored in the + * same extent tree node. + */ + if ((map->m_lblk == ee_block) && + /* See if we can merge left */ + (map_len < ee_len) && /*L1*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ + ext4_lblk_t prev_lblk; + ext4_fsblk_t prev_pblk, ee_pblk; + unsigned int prev_len; + + abut_ex = ex - 1; + prev_lblk = le32_to_cpu(abut_ex->ee_block); + prev_len = ext4_ext_get_actual_len(abut_ex); + prev_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ + ((prev_lblk + prev_len) == ee_block) && /*C2*/ + ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of ex by 'map_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + map_len); + ext4_ext_store_pblock(ex, ee_pblk + map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_unwritten(ex); /* Restore the flag */ + + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; + } + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && + (map_len < ee_len) && /*L1*/ + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ + /* See if we can merge right */ + ext4_lblk_t next_lblk; + ext4_fsblk_t next_pblk, ee_pblk; + unsigned int next_len; + + abut_ex = ex + 1; + next_lblk = le32_to_cpu(abut_ex->ee_block); + next_len = ext4_ext_get_actual_len(abut_ex); + next_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of abut_ex by 'map_len' blocks */ + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_unwritten(ex); /* Restore the flag */ + + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(next_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; + } + } + if (allocated) { + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = abut_ex; + goto out; + } else + allocated = ee_len - (map->m_lblk - ee_block); + + WARN_ON(map->m_lblk < ee_block); + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully inside i_size or new_size. + */ + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + + if (EXT4_EXT_MAY_ZEROOUT & split_flag) + max_zeroout = sbi->s_extent_max_zeroout_kb >> + (inode->i_sb->s_blocksize_bits - 10); + + /* If extent is less than s_max_zeroout_kb, zeroout directly */ + if (max_zeroout && (ee_len <= max_zeroout)) { + err = ext4_ext_zeroout(inode, ex); + if (err) + goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + ext4_ext_mark_initialized(ex); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + goto out; + } + + /* + * four cases: + * 1. split the extent into three extents. + * 2. split the extent into two extents, zeroout the first half. + * 3. split the extent into two extents, zeroout the second half. + * 4. split the extent into two extents with out zeroout. + */ + split_map.m_lblk = map->m_lblk; + split_map.m_len = map->m_len; + + if (max_zeroout && (allocated > map->m_len)) { + if (allocated <= max_zeroout) { + /* case 3 */ + zero_ex.ee_block = + cpu_to_le32(map->m_lblk); + zero_ex.ee_len = cpu_to_le16(allocated); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex) + map->m_lblk - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + split_map.m_lblk = map->m_lblk; + split_map.m_len = allocated; + } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { + /* case 2 */ + if (map->m_lblk != ee_block) { + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(map->m_lblk - + ee_block); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + } + + split_map.m_lblk = ee_block; + split_map.m_len = map->m_lblk - ee_block + map->m_len; + allocated = map->m_len; + } + } + + err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, + flags); + if (err > 0) + err = 0; +out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_zeroout_es(inode, &zero_ex); + return err ? err : allocated; +} + +/* + * This function is called by ext4_ext_map_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an unwritten extent. + * + * Writing to an unwritten extent may result in splitting the unwritten + * extent into multiple initialized/unwritten extents (up to three) + * There are three possibilities: + * a> There is no split required: Entire extent should be unwritten + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * This works the same way in the case of initialized -> unwritten conversion. + * + * One of more index blocks maybe needed if the extent tree grow after + * the unwritten extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the unwritten extent before DIO submit + * the IO. The unwritten extent called at this time will be split + * into three unwritten extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). + * + * Returns the size of unwritten extent to be written on success. + */ +static int ext4_split_convert_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path **ppath, + int flags) +{ + struct ext4_ext_path *path = *ppath; + ext4_lblk_t eof_block; + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len; + int split_flag = 0, depth; + + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", + __func__, inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? + EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + } + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); +} + +static int ext4_convert_unwritten_extents_endio(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path **ppath) +{ + struct ext4_ext_path *path = *ppath; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)ee_block, ee_len); + + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ + if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif + err = ext4_split_convert_extents(handle, inode, map, ppath, + EXT4_GET_BLOCKS_CONVERT); + if (err < 0) + return err; + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static void unmap_underlying_metadata_blocks(struct block_device *bdev, + sector_t block, int count) +{ + int i; + for (i = 0; i < count; i++) + unmap_underlying_metadata(bdev, block + i); +} + +/* + * Handle EOFBLOCKS_FL flag, clearing it if necessary + */ +static int check_eofblocks_fl(handle_t *handle, struct inode *inode, + ext4_lblk_t lblk, + struct ext4_ext_path *path, + unsigned int len) +{ + int i, depth; + struct ext4_extent_header *eh; + struct ext4_extent *last_ex; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + return 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + + /* + * We're going to remove EOFBLOCKS_FL entirely in future so we + * do not care for this case anymore. Simply remove the flag + * if there are no extents. + */ + if (unlikely(!eh->eh_entries)) + goto out; + last_ex = EXT_LAST_EXTENT(eh); + /* + * We should clear the EOFBLOCKS_FL flag if we are writing the + * last block in the last extent in the file. We test this by + * first checking to see if the caller to + * ext4_ext_get_blocks() was interested in the last block (or + * a block beyond the last block) in the current extent. If + * this turns out to be false, we can bail out from this + * function immediately. + */ + if (lblk + len < le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + return 0; + /* + * If the caller does appear to be planning to write at or + * beyond the end of the current extent, we then test to see + * if the current extent is the last extent in the file, by + * checking to make sure it was reached via the rightmost node + * at each level of the tree. + */ + for (i = depth-1; i >= 0; i--) + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + return 0; +out: + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + return ext4_mark_inode_dirty(handle, inode); +} + +/** + * ext4_find_delalloc_range: find delayed allocated block in the given range. + * + * Return 1 if there is a delalloc block in the range, otherwise 0. + */ +int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end) +{ + struct extent_status es; + + ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); + if (es.es_len == 0) + return 0; /* there is no delay extent in this tree */ + else if (es.es_lblk <= lblk_start && + lblk_start < es.es_lblk + es.es_len) + return 1; + else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) + return 1; + else + return 0; +} + +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return ext4_find_delalloc_range(inode, lblk_start, lblk_end); +} + +/** + * Determines how many complete clusters (out of those specified by the 'map') + * are under delalloc and were reserved quota for. + * This function is called when we are writing out the blocks that were + * originally written with their allocation delayed, but then the space was + * allocated using fallocate() before the delayed allocation could be resolved. + * The cases to look for are: + * ('=' indicated delayed allocated blocks + * '-' indicates non-delayed allocated blocks) + * (a) partial clusters towards beginning and/or end outside of allocated range + * are not delalloc'ed. + * Ex: + * |----c---=|====c====|====c====|===-c----| + * |++++++ allocated ++++++| + * ==> 4 complete clusters in above example + * + * (b) partial cluster (outside of allocated range) towards either end is + * marked for delayed allocation. In this case, we will exclude that + * cluster. + * Ex: + * |----====c========|========c========| + * |++++++ allocated ++++++| + * ==> 1 complete clusters in above example + * + * Ex: + * |================c================| + * |++++++ allocated ++++++| + * ==> 0 complete clusters in above example + * + * The ext4_da_update_reserve_space will be called only if we + * determine here that there were some "entire" clusters that span + * this 'allocated' range. + * In the non-bigalloc case, this function will just end up returning num_blks + * without ever calling ext4_find_delalloc_range. + */ +static unsigned int +get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, + unsigned int num_blks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t alloc_cluster_start, alloc_cluster_end; + ext4_lblk_t lblk_from, lblk_to, c_offset; + unsigned int allocated_clusters = 0; + + alloc_cluster_start = EXT4_B2C(sbi, lblk_start); + alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); + + /* max possible clusters for this allocation */ + allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + + trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); + + /* Check towards left side */ + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); + if (c_offset) { + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); + lblk_to = lblk_from + c_offset - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + allocated_clusters--; + } + + /* Now check towards right. */ + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); + if (allocated_clusters && c_offset) { + lblk_from = lblk_start + num_blks; + lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + allocated_clusters--; + } + + return allocated_clusters; +} + +static int +convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path **ppath, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + struct ext4_ext_path *path = *ppath; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * unwritten extent + */ + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, ppath, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + return err; + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + return err; + ext4_ext_show_leaf(inode, path); + + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); + if (err) + return err; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + return allocated; +} + +static int +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path **ppath, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + struct ext4_ext_path *path = *ppath; + int ret = 0; + int err = 0; + ext4_io_end_t *io = ext4_inode_aio(inode); + + ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " + "block %llu, max_blocks %u, flags %x, allocated %u\n", + inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, + flags, allocated); + ext4_ext_show_leaf(inode, path); + + /* + * When writing into unwritten space, we should not fail to + * allocate metadata blocks for the new extent block if needed. + */ + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, + allocated, newblock); + + /* get_block() before submit the IO, split the extent */ + if (flags & EXT4_GET_BLOCKS_PRE_IO) { + ret = ext4_split_convert_extents(handle, inode, map, ppath, + flags | EXT4_GET_BLOCKS_CONVERT); + if (ret <= 0) + goto out; + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to conversion to written when IO is + * completed + */ + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out; + } + /* IO end_io complete, convert the filled extent to written */ + if (flags & EXT4_GET_BLOCKS_CONVERT) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, map, + ppath); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto map_out; + } + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out1; + } + + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + map->m_flags |= EXT4_MAP_NEW; + /* + * if we allocated more blocks than requested + * we need to make sure we unmap the extra block + * allocated. The actual needed block will get + * unmapped later when we find the buffer_head marked + * new. + */ + if (allocated > map->m_len) { + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + map->m_len, + allocated - map->m_len); + allocated = map->m_len; + } + map->m_len = allocated; + + /* + * If we have done fallocate with the offset that is already + * delayed allocated, we would have block reservation + * and quota reservation done in the delayed write path. + * But fallocate would have already updated quota and block + * count for this offset. So cancel these reservation + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, map->m_len); + if (reserved_clusters) + ext4_da_update_reserve_space(inode, + reserved_clusters, + 0); + } + +map_out: + map->m_flags |= EXT4_MAP_MAPPED; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } +out1: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_pblk = newblock; + map->m_len = allocated; +out2: + return err ? err : allocated; +} + +/* + * get_implied_cluster_alloc - check to see if the requested + * allocation (in the map structure) overlaps with a cluster already + * allocated in an extent. + * @sb The filesystem superblock structure + * @map The requested lblk->pblk mapping + * @ex The extent structure which might contain an implied + * cluster allocation + * + * This function is called by ext4_ext_map_blocks() after we failed to + * find blocks that were already in the inode's extent tree. Hence, + * we know that the beginning of the requested region cannot overlap + * the extent from the inode's extent tree. There are three cases we + * want to catch. The first is this case: + * + * |--- cluster # N--| + * |--- extent ---| |---- requested region ---| + * |==========| + * + * The second case that we need to test for is this one: + * + * |--------- cluster # N ----------------| + * |--- requested region --| |------- extent ----| + * |=======================| + * + * The third case is when the requested region lies between two extents + * within the same cluster: + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + * + * In each of the above cases, we need to set the map->m_pblk and + * map->m_len so it corresponds to the return the extent labelled as + * "|====|" from cluster #N, since it is already in use for data in + * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to + * signal to ext4_ext_map_blocks() that map->m_pblk should be treated + * as a new "allocated" block region. Otherwise, we will return 0 and + * ext4_ext_map_blocks() will then allocate one or more new clusters + * by calling ext4_mb_new_blocks(). + */ +static int get_implied_cluster_alloc(struct super_block *sb, + struct ext4_map_blocks *map, + struct ext4_extent *ex, + struct ext4_ext_path *path) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + ext4_lblk_t ex_cluster_start, ex_cluster_end; + ext4_lblk_t rr_cluster_start; + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + + /* The extent passed in that we are trying to match */ + ex_cluster_start = EXT4_B2C(sbi, ee_block); + ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + + /* The requested region passed into ext4_map_blocks() */ + rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + + if ((rr_cluster_start == ex_cluster_end) || + (rr_cluster_start == ex_cluster_start)) { + if (rr_cluster_start == ex_cluster_end) + ee_start += ee_len - 1; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; + map->m_len = min(map->m_len, + (unsigned) sbi->s_cluster_ratio - c_offset); + /* + * Check for and handle this case: + * + * |--------- cluster # N-------------| + * |------- extent ----| + * |--- requested region ---| + * |===========| + */ + + if (map->m_lblk < ee_block) + map->m_len = min(map->m_len, ee_block - map->m_lblk); + + /* + * Check for the case where there is already another allocated + * block to the right of 'ex' but before the end of the cluster. + * + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + */ + if (map->m_lblk > ee_block) { + ext4_lblk_t next = ext4_ext_next_allocated_block(path); + map->m_len = min(map->m_len, next - map->m_lblk); + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); + return 1; + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); + return 0; +} + + +/* + * Block allocation/map/preallocation routine for extents based files + * + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block + * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of of blocks already mapped/allocated + * if create == 0 and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. + */ +int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent newex, *ex, *ex2; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_fsblk_t newblock = 0; + int free_on_err = 0, err = 0, depth, ret; + unsigned int allocated = 0, offset = 0; + unsigned int allocated_clusters = 0; + struct ext4_allocation_request ar; + ext4_io_end_t *io = ext4_inode_aio(inode); + ext4_lblk_t cluster_offset; + int set_unwritten = 0; + bool map_from_cluster = false; + + ext_debug("blocks %u/%u requested for inode %lu\n", + map->m_lblk, map->m_len, inode->i_ino); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + + /* find extent for this block */ + path = ext4_find_extent(inode, map->m_lblk, NULL, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + + /* + * consistent leaf must not be empty; + * this situation is possible, though, _during_ tree modification; + * this is why assert can't be put in ext4_find_extent() + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + "lblock: %lu, depth: %d pblock %lld", + (unsigned long) map->m_lblk, depth, + path[depth].p_block); + err = -EIO; + goto out2; + } + + ex = path[depth].p_ext; + if (ex) { + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len; + + + /* + * unwritten extents are treated as holes, except that + * we split out initialized portions during a write. + */ + ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); + + /* if found extent covers block, simply return it */ + if (in_range(map->m_lblk, ee_block, ee_len)) { + newblock = map->m_lblk - ee_block + ee_start; + /* number of remaining blocks in the extent */ + allocated = ee_len - (map->m_lblk - ee_block); + ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, + ee_block, ee_len, newblock); + + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. + */ + if ((!ext4_ext_is_unwritten(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = convert_initialized_extent( + handle, inode, map, &path, + flags, allocated, newblock); + goto out2; + } else if (!ext4_ext_is_unwritten(ex)) + goto out; + + ret = ext4_ext_handle_unwritten_extents( + handle, inode, map, &path, flags, + allocated, newblock); + if (ret < 0) + err = ret; + else + allocated = ret; + goto out2; + } + } + + /* + * requested block isn't allocated yet; + * we couldn't try to create block if create flag is zero + */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * put just found gap into cache to speed up + * subsequent requests + */ + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + goto out2; + } + + /* + * Okay, we need to do block allocation. + */ + newex.ee_block = cpu_to_le32(map->m_lblk); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + + /* + * If we are doing bigalloc, check to see if the extent returned + * by ext4_find_extent() implies a cluster we can use. + */ + if (cluster_offset && ex && + get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map_from_cluster = true; + goto got_allocated_blocks; + } + + /* find neighbour allocated blocks */ + ar.lleft = map->m_lblk; + err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); + if (err) + goto out2; + ar.lright = map->m_lblk; + ex2 = NULL; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); + if (err) + goto out2; + + /* Check if the extent after searching to the right implies a + * cluster we can use. */ + if ((sbi->s_cluster_ratio > 1) && ex2 && + get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map_from_cluster = true; + goto got_allocated_blocks; + } + + /* + * See if request is beyond maximum number of blocks we can have in + * a single extent. For an initialized extent this limit is + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. + */ + if (map->m_len > EXT_INIT_MAX_LEN && + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_INIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; + + /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ + newex.ee_len = cpu_to_le16(map->m_len); + err = ext4_ext_check_overlap(sbi, inode, &newex, path); + if (err) + allocated = ext4_ext_get_actual_len(&newex); + else + allocated = map->m_len; + + /* allocate new block */ + ar.inode = inode; + ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); + ar.logical = map->m_lblk; + /* + * We calculate the offset from the beginning of the cluster + * for the logical block number, since when we allocate a + * physical cluster, the physical block should start at the + * same offset from the beginning of the cluster. This is + * needed so that future calls to get_implied_cluster_alloc() + * work correctly. + */ + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + ar.len = EXT4_NUM_B2C(sbi, offset+allocated); + ar.goal -= offset; + ar.logical -= offset; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; + newblock = ext4_mb_new_blocks(handle, &ar, &err); + if (!newblock) + goto out2; + ext_debug("allocate new block: goal %llu, found %llu/%u\n", + ar.goal, newblock, allocated); + free_on_err = 1; + allocated_clusters = ar.len; + ar.len = EXT4_C2B(sbi, ar.len) - offset; + if (ar.len > allocated) + ar.len = allocated; + +got_allocated_blocks: + /* try to insert new extent into found leaf and return */ + ext4_ext_store_pblock(&newex, newblock + offset); + newex.ee_len = cpu_to_le16(ar.len); + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + ext4_ext_mark_unwritten(&newex); + map->m_flags |= EXT4_MAP_UNWRITTEN; + /* + * io_end structure was created for every IO write to an + * unwritten extent. To avoid unnecessary conversion, + * here we flag the IO that really needs the conversion. + * For non asycn direct IO case, flag the inode state + * that we need to perform conversion when IO is done. + */ + if (flags & EXT4_GET_BLOCKS_PRE_IO) + set_unwritten = 1; + } + + err = 0; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, ar.len); + if (!err) + err = ext4_ext_insert_extent(handle, inode, &path, + &newex, flags); + + if (!err && set_unwritten) { + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + + if (err && free_on_err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; + /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, + * but otherwise we'd need to call it every free() */ + ext4_discard_preallocations(inode); + ext4_free_blocks(handle, inode, NULL, newblock, + EXT4_C2B(sbi, allocated_clusters), fb_flags); + goto out2; + } + + /* previous routine could use block we allocated */ + newblock = ext4_ext_pblock(&newex); + allocated = ext4_ext_get_actual_len(&newex); + if (allocated > map->m_len) + allocated = map->m_len; + map->m_flags |= EXT4_MAP_NEW; + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + /* + * Check how many clusters we had reserved this allocated range + */ + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, allocated); + if (!map_from_cluster) { + BUG_ON(allocated_clusters < reserved_clusters); + if (reserved_clusters < allocated_clusters) { + struct ext4_inode_info *ei = EXT4_I(inode); + int reservation = allocated_clusters - + reserved_clusters; + /* + * It seems we claimed few clusters outside of + * the range of this allocation. We should give + * it back to the reservation pool. This can + * happen in the following case: + * + * * Suppose s_cluster_ratio is 4 (i.e., each + * cluster has 4 blocks. Thus, the clusters + * are [0-3],[4-7],[8-11]... + * * First comes delayed allocation write for + * logical blocks 10 & 11. Since there were no + * previous delayed allocated blocks in the + * range [8-11], we would reserve 1 cluster + * for this write. + * * Next comes write for logical blocks 3 to 8. + * In this case, we will reserve 2 clusters + * (for [0-3] and [4-7]; and not for [8-11] as + * that range has a delayed allocated blocks. + * Thus total reserved clusters now becomes 3. + * * Now, during the delayed allocation writeout + * time, we will first write blocks [3-8] and + * allocate 3 clusters for writing these + * blocks. Also, we would claim all these + * three clusters above. + * * Now when we come here to writeout the + * blocks [10-11], we would expect to claim + * the reservation of 1 cluster we had made + * (and we would claim it since there are no + * more delayed allocated blocks in the range + * [8-11]. But our reserved cluster count had + * already gone to 0. + * + * Thus, at the step 4 above when we determine + * that there are still some unwritten delayed + * allocated blocks outside of our current + * block range, we should increment the + * reserved clusters count so that when the + * remaining blocks finally gets written, we + * could claim them. + */ + dquot_reserve_block(inode, + EXT4_C2B(sbi, reservation)); + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks += reservation; + spin_unlock(&ei->i_block_reservation_lock); + } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); + } + } + + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an unwritten extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) + ext4_update_inode_fsync_trans(handle, inode, 1); + else + ext4_update_inode_fsync_trans(handle, inode, 0); +out: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + map->m_len = allocated; +out2: + ext4_ext_drop_refs(path); + kfree(path); + + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? err : allocated); + return err ? err : allocated; +} + +void ext4_ext_truncate(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t last_block; + int err = 0; + + /* + * TODO: optimization is possible here. + * Probably we need not scan at all, + * because page truncation is enough. + */ + + /* we have to know where to truncate from in crash case */ + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + + last_block = (inode->i_size + sb->s_blocksize - 1) + >> EXT4_BLOCK_SIZE_BITS(sb); +retry: + err = ext4_es_remove_extent(inode, last_block, + EXT_MAX_BLOCKS - last_block); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (err) { + ext4_std_error(inode->i_sb, err); + return; + } + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + ext4_std_error(inode->i_sb, err); +} + +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, loff_t new_size, + int flags, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits; + loff_t epos; + + map.m_lblk = offset; + map.m_len = len; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNWRITTEN_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, len); + +retry: + while (ret >= 0 && len) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + map.m_lblk += ret; + map.m_len = len = len - ret; + epos = (loff_t)map.m_lblk << inode->i_blkbits; + inode->i_ctime = ext4_current_time(inode); + if (new_size) { + if (epos > new_size) + epos = new_size; + if (ext4_update_inode_size(inode, epos) & 0x1) + inode->i_mtime = inode->i_ctime; + } else { + if (epos > inode->i_size) + ext4_set_inode_flag(inode, + EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? ret2 : ret; +} + +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int credits; + int partial_begin, partial_end; + loff_t start, end; + ext4_lblk_t lblk; + struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; + + trace_ext4_zero_range(inode, offset, len, mode); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + len - 1); + if (ret) + return ret; + } + + /* + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. + */ + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial_begin = offset & ((1 << blkbits) - 1); + partial_end = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + mutex_lock(&inode->i_mutex); + + /* + * Indirect files do not support unwritten extnets + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + } + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + /* Preallocate the range including the unaligned edges */ + if (partial_begin || partial_end) { + ret = ext4_alloc_file_blocks(file, + round_down(offset, 1 << blkbits) >> blkbits, + (round_up((offset + len), 1 << blkbits) - + round_down(offset, 1 << blkbits)) >> blkbits, + new_size, flags, mode); + if (ret) + goto out_mutex; + + } + + /* Zero range excluding the unaligned edges */ + if (max_blocks > 0) { + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); + + /* Now release the pages and zero block aligned part of pages*/ + truncate_pagecache_range(inode, start, end - 1); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + if (ret) + goto out_dio; + } + if (!partial_begin && !partial_end) + goto out_dio; + + /* + * In worst case we have to writeout two nonadjacent unwritten + * blocks and update the inode + */ + credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; + if (ext4_should_journal_data(inode)) + credits += 2; + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + if (new_size) { + ext4_update_inode_size(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * preallocate space for a file. This implements ext4's fallocate file + * operation, which gets called from sys_fallocate system call. + * For block-mapped files, posix_fallocate should fall back to the method + * of writing zeroes to the required new blocks (the same behavior which is + * expected for file systems which do not support fallocate() system call). + */ +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + unsigned int max_blocks; + int ret = 0; + int flags; + ext4_lblk_t lblk; + unsigned int blkbits = inode->i_blkbits; + + /* + * Encrypted inodes can't handle collapse range or insert + * range since we would need to re-encrypt blocks with a + * different IV or XTS tweak (which are based on the logical + * block number). + * + * XXX It's not clear why zero range isn't working, but we'll + * leave it disabled for encrypted inodes for now. This is a + * bug we should fix.... + */ + if (ext4_encrypted_inode(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))) + return -EOPNOTSUPP; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return ext4_punch_hole(inode, offset, len); + + ret = ext4_convert_inline_data(inode); + if (ret) + return ret; + + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); + + trace_ext4_fallocate_enter(inode, offset, len, mode); + lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + + /* + * We only support preallocation for extent-based files only + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } +out: + mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; +} + +/* + * This function convert a range of blocks to written extents + * The caller of this function will pass the start offset and the size. + * all unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct ext4_map_blocks map; + unsigned int credits, blkbits = inode->i_blkbits; + + map.m_lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - + map.m_lblk); + /* + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. + */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + credits = 0; + } else { + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + } + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + } + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_IO_CONVERT_EXT); + if (ret <= 0) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + if (credits) + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2) + break; + } + if (!credits) + ret2 = ext4_journal_stop(handle); + return ret > 0 ? ret2 : ret; +} + +/* + * If newes is not existing extent (newes->ec_pblk equals zero) find + * delayed extent at start of newes and update newes accordingly and + * return start of the next delayed extent. + * + * If newes is existing extent (newes->ec_pblk is not equal zero) + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed + * extent found. Leave newes unmodified. + */ +static int ext4_find_delayed_extent(struct inode *inode, + struct extent_status *newes) +{ + struct extent_status es; + ext4_lblk_t block, next_del; + + if (newes->es_pblk == 0) { + ext4_es_find_delayed_extent_range(inode, newes->es_lblk, + newes->es_lblk + newes->es_len - 1, &es); + + /* + * No extent in extent-tree contains block @newes->es_pblk, + * then the block may stay in 1)a hole or 2)delayed-extent. + */ + if (es.es_len == 0) + /* A hole found. */ + return 0; + + if (es.es_lblk > newes->es_lblk) { + /* A hole found. */ + newes->es_len = min(es.es_lblk - newes->es_lblk, + newes->es_len); + return 0; + } + + newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; + } + + block = newes->es_lblk + newes->es_len; + ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); + if (es.es_len == 0) + next_del = EXT_MAX_BLOCKS; + else + next_del = es.es_lblk; + + return next_del; +} +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? */ + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = (__u64)iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + brelse(iloc.bh); + } else { /* external block */ + physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? error : 0); +} + +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + int error = 0; + + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline, + start, len); + + if (has_inline) + return error; + } + + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + } + + /* fallback to generic here if not in extents fmt */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + ext4_lblk_t len_blks; + __u64 last_blk; + + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information + * and pushing extents back to the user. + */ + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); + } + return error; +} + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = le32_to_cpu(ex_last->ee_block) + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) + goto out; + + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + goto out; + + /* + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. + */ + path = ext4_find_extent(inode, start - 1, &path, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; + + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_find_extent(inode, start, &path, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + if (start > le32_to_cpu(extent->ee_block)) { + /* Hole, move to the next extent */ + if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { + path[depth].p_ext++; + } else { + start = ext4_ext_next_allocated_block(path); + continue; + } + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + if (ret) + break; + } +out: + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size, ioffset; + int ret; + + /* + * We need to test this early because xfstests assumes that a + * collapse range of (0, 1) will return EOPNOTSUPP if the file + * system does not support collapse range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + ext4_discard_preallocations(inode); + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + +/** + * ext4_swap_extents - Swap extents between two inodes + * + * @inode1: First inode + * @inode2: Second inode + * @lblk1: Start block for first inode + * @lblk2: Start block for second inode + * @count: Number of blocks to swap + * @mark_unwritten: Mark second inode's extents as unwritten after swap + * @erp: Pointer to save error value + * + * This helper routine does exactly what is promise "swap extents". All other + * stuff such as page-cache locking consistency, bh mapping consistency or + * extent's data copying must be performed by caller. + * Locking: + * i_mutex is held for both inodes + * i_data_sem is locked for write for both inodes + * Assumptions: + * All pages from requested range are locked for both inodes + */ +int +ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, + ext4_lblk_t count, int unwritten, int *erp) +{ + struct ext4_ext_path *path1 = NULL; + struct ext4_ext_path *path2 = NULL; + int replaced_count = 0; + + BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); + BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + + *erp = ext4_es_remove_extent(inode1, lblk1, count); + if (unlikely(*erp)) + return 0; + *erp = ext4_es_remove_extent(inode2, lblk2, count); + if (unlikely(*erp)) + return 0; + + while (count) { + struct ext4_extent *ex1, *ex2, tmp_ex; + ext4_lblk_t e1_blk, e2_blk; + int e1_len, e2_len, len; + int split = 0; + + path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + if (unlikely(IS_ERR(path1))) { + *erp = PTR_ERR(path1); + path1 = NULL; + finish: + count = 0; + goto repeat; + } + path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + if (unlikely(IS_ERR(path2))) { + *erp = PTR_ERR(path2); + path2 = NULL; + goto finish; + } + ex1 = path1[path1->p_depth].p_ext; + ex2 = path2[path2->p_depth].p_ext; + /* Do we have somthing to swap ? */ + if (unlikely(!ex2 || !ex1)) + goto finish; + + e1_blk = le32_to_cpu(ex1->ee_block); + e2_blk = le32_to_cpu(ex2->ee_block); + e1_len = ext4_ext_get_actual_len(ex1); + e2_len = ext4_ext_get_actual_len(ex2); + + /* Hole handling */ + if (!in_range(lblk1, e1_blk, e1_len) || + !in_range(lblk2, e2_blk, e2_len)) { + ext4_lblk_t next1, next2; + + /* if hole after extent, then go to next extent */ + next1 = ext4_ext_next_allocated_block(path1); + next2 = ext4_ext_next_allocated_block(path2); + /* If hole before extent, then shift to that extent */ + if (e1_blk > lblk1) + next1 = e1_blk; + if (e2_blk > lblk2) + next2 = e1_blk; + /* Do we have something to swap */ + if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) + goto finish; + /* Move to the rightest boundary */ + len = next1 - lblk1; + if (len < next2 - lblk2) + len = next2 - lblk2; + if (len > count) + len = count; + lblk1 += len; + lblk2 += len; + count -= len; + goto repeat; + } + + /* Prepare left boundary */ + if (e1_blk < lblk1) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + &path1, lblk1, 0); + if (unlikely(*erp)) + goto finish; + } + if (e2_blk < lblk2) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + &path2, lblk2, 0); + if (unlikely(*erp)) + goto finish; + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must to be revalidated. */ + if (split) + goto repeat; + + /* Prepare right boundary */ + len = count; + if (len > e1_blk + e1_len - lblk1) + len = e1_blk + e1_len - lblk1; + if (len > e2_blk + e2_len - lblk2) + len = e2_blk + e2_len - lblk2; + + if (len != e1_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + &path1, lblk1 + len, 0); + if (unlikely(*erp)) + goto finish; + } + if (len != e2_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + &path2, lblk2 + len, 0); + if (*erp) + goto finish; + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must to be revalidated. */ + if (split) + goto repeat; + + BUG_ON(e2_len != e1_len); + *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); + if (unlikely(*erp)) + goto finish; + *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); + if (unlikely(*erp)) + goto finish; + + /* Both extents are fully inside boundaries. Swap it now */ + tmp_ex = *ex1; + ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); + ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); + ex1->ee_len = cpu_to_le16(e2_len); + ex2->ee_len = cpu_to_le16(e1_len); + if (unwritten) + ext4_ext_mark_unwritten(ex2); + if (ext4_ext_is_unwritten(&tmp_ex)) + ext4_ext_mark_unwritten(ex1); + + ext4_ext_try_to_merge(handle, inode2, path2, ex2); + ext4_ext_try_to_merge(handle, inode1, path1, ex1); + *erp = ext4_ext_dirty(handle, inode2, path2 + + path2->p_depth); + if (unlikely(*erp)) + goto finish; + *erp = ext4_ext_dirty(handle, inode1, path1 + + path1->p_depth); + /* + * Looks scarry ah..? second inode already points to new blocks, + * and it was successfully dirtied. But luckily error may happen + * only due to journal error, so full transaction will be + * aborted anyway. + */ + if (unlikely(*erp)) + goto finish; + lblk1 += len; + lblk2 += len; + replaced_count += len; + count -= len; + + repeat: + ext4_ext_drop_refs(path1); + kfree(path1); + ext4_ext_drop_refs(path2); + kfree(path2); + path1 = path2 = NULL; + } + return replaced_count; +} diff --git a/kernel/fs/ext4/extents_status.c b/kernel/fs/ext4/extents_status.c new file mode 100644 index 000000000..26724aeec --- /dev/null +++ b/kernel/fs/ext4/extents_status.c @@ -0,0 +1,1310 @@ +/* + * fs/ext4/extents_status.c + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Hugh Dickins + * Zheng Liu + * + * Ext4 extents status tree core functions. + */ +#include +#include +#include +#include "ext4.h" + +#include + +/* + * According to previous discussion in Ext4 Developer Workshop, we + * will introduce a new structure called io tree to track all extent + * status in order to solve some problems that we have met + * (e.g. Reservation space warning), and provide extent-level locking. + * Delay extent tree is the first step to achieve this goal. It is + * original built by Yongqiang Yang. At that time it is called delay + * extent tree, whose goal is only track delayed extents in memory to + * simplify the implementation of fiemap and bigalloc, and introduce + * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called + * delay extent tree at the first commit. But for better understand + * what it does, it has been rename to extent status tree. + * + * Step1: + * Currently the first step has been done. All delayed extents are + * tracked in the tree. It maintains the delayed extent when a delayed + * allocation is issued, and the delayed extent is written out or + * invalidated. Therefore the implementation of fiemap and bigalloc + * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. + * + * The following comment describes the implemenmtation of extent + * status tree and future works. + * + * Step2: + * In this step all extent status are tracked by extent status tree. + * Thus, we can first try to lookup a block mapping in this tree before + * finding it in extent tree. Hence, single extent cache can be removed + * because extent status tree can do a better job. Extents in status + * tree are loaded on-demand. Therefore, the extent status tree may not + * contain all of the extents in a file. Meanwhile we define a shrinker + * to reclaim memory from extent status tree because fragmented extent + * tree will make status tree cost too much memory. written/unwritten/- + * hole extents in the tree will be reclaimed by this shrinker when we + * are under high memory pressure. Delayed extents will not be + * reclimed because fiemap, bigalloc, and seek_data/hole need it. + */ + +/* + * Extent status tree implementation for ext4. + * + * + * ========================================================================== + * Extent status tree tracks all extent status. + * + * 1. Why we need to implement extent status tree? + * + * Without extent status tree, ext4 identifies a delayed extent by looking + * up page cache, this has several deficiencies - complicated, buggy, + * and inefficient code. + * + * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a + * block or a range of blocks are belonged to a delayed extent. + * + * Let us have a look at how they do without extent status tree. + * -- FIEMAP + * FIEMAP looks up page cache to identify delayed allocations from holes. + * + * -- SEEK_HOLE/DATA + * SEEK_HOLE/DATA has the same problem as FIEMAP. + * + * -- bigalloc + * bigalloc looks up page cache to figure out if a block is + * already under delayed allocation or not to determine whether + * quota reserving is needed for the cluster. + * + * -- writeout + * Writeout looks up whole page cache to see if a buffer is + * mapped, If there are not very many delayed buffers, then it is + * time comsuming. + * + * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, + * bigalloc and writeout can figure out if a block or a range of + * blocks is under delayed allocation(belonged to a delayed extent) or + * not by searching the extent tree. + * + * + * ========================================================================== + * 2. Ext4 extent status tree impelmentation + * + * -- extent + * A extent is a range of blocks which are contiguous logically and + * physically. Unlike extent in extent tree, this extent in ext4 is + * a in-memory struct, there is no corresponding on-disk data. There + * is no limit on length of extent, so an extent can contain as many + * blocks as they are contiguous logically and physically. + * + * -- extent status tree + * Every inode has an extent status tree and all allocation blocks + * are added to the tree with different status. The extent in the + * tree are ordered by logical block no. + * + * -- operations on a extent status tree + * There are three important operations on a delayed extent tree: find + * next extent, adding a extent(a range of blocks) and removing a extent. + * + * -- race on a extent status tree + * Extent status tree is protected by inode->i_es_lock. + * + * -- memory consumption + * Fragmented extent tree will make extent status tree cost too much + * memory. Hence, we will reclaim written/unwritten/hole extents from + * the tree under a heavy memory pressure. + * + * + * ========================================================================== + * 3. Performance analysis + * + * -- overhead + * 1. There is a cache extent for write access, so if writes are + * not very random, adding space operaions are in O(1) time. + * + * -- gain + * 2. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * + * ========================================================================== + * 4. TODO list + * + * -- Refactor delayed space reservation + * + * -- Extent-level locking + */ + +static struct kmem_cache *ext4_es_cachep; + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes); +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); + +int __init ext4_init_es(void) +{ + ext4_es_cachep = kmem_cache_create("ext4_extent_status", + sizeof(struct extent_status), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); + if (ext4_es_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_es(void) +{ + if (ext4_es_cachep) + kmem_cache_destroy(ext4_es_cachep); +} + +void ext4_es_init_tree(struct ext4_es_tree *tree) +{ + tree->root = RB_ROOT; + tree->cache_es = NULL; +} + +#ifdef ES_DEBUG__ +static void ext4_es_print_tree(struct inode *inode) +{ + struct ext4_es_tree *tree; + struct rb_node *node; + + printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_es_tree; + node = rb_first(&tree->root); + while (node) { + struct extent_status *es; + es = rb_entry(node, struct extent_status, rb_node); + printk(KERN_DEBUG " [%u/%u) %llu %x", + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_es_print_tree(inode) +#endif + +static inline ext4_lblk_t ext4_es_end(struct extent_status *es) +{ + BUG_ON(es->es_lblk + es->es_len < es->es_lblk); + return es->es_lblk + es->es_len - 1; +} + +/* + * search through the tree for an delayed extent with a given offset. If + * it can't be found, try to find next extent. + */ +static struct extent_status *__es_tree_search(struct rb_root *root, + ext4_lblk_t lblk) +{ + struct rb_node *node = root->rb_node; + struct extent_status *es = NULL; + + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + if (lblk < es->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es)) + node = node->rb_right; + else + return es; + } + + if (es && lblk < es->es_lblk) + return es; + + if (es && lblk > ext4_es_end(es)) { + node = rb_next(&es->rb_node); + return node ? rb_entry(node, struct extent_status, rb_node) : + NULL; + } + + return NULL; +} + +/* + * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering + * @es->lblk if it exists, otherwise, the next extent after @es->lblk. + * + * @inode: the inode which owns delayed extents + * @lblk: the offset where we start to search + * @end: the offset where we stop to search + * @es: delayed extent that we found + */ +void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + struct ext4_es_tree *tree = NULL; + struct extent_status *es1 = NULL; + struct rb_node *node; + + BUG_ON(es == NULL); + BUG_ON(end < lblk); + trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %x\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); + goto out; + } + } + + es1 = __es_tree_search(&tree->root, lblk); + +out: + if (es1 && !ext4_es_is_delayed(es1)) { + while ((node = rb_next(&es1->rb_node)) != NULL) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->es_lblk > end) { + es1 = NULL; + break; + } + if (ext4_es_is_delayed(es1)) + break; + } + } + + if (es1 && ext4_es_is_delayed(es1)) { + tree->cache_es = es1; + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_delayed_extent_range_exit(inode, es); +} + +static void ext4_es_list_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (!list_empty(&ei->i_es_list)) + return; + + spin_lock(&sbi->s_es_lock); + if (list_empty(&ei->i_es_list)) { + list_add_tail(&ei->i_es_list, &sbi->s_es_list); + sbi->s_es_nr_inode++; + } + spin_unlock(&sbi->s_es_lock); +} + +static void ext4_es_list_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lock); + if (!list_empty(&ei->i_es_list)) { + list_del_init(&ei->i_es_list); + sbi->s_es_nr_inode--; + WARN_ON_ONCE(sbi->s_es_nr_inode < 0); + } + spin_unlock(&sbi->s_es_lock); +} + +static struct extent_status * +ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + ext4_fsblk_t pblk) +{ + struct extent_status *es; + es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + if (es == NULL) + return NULL; + es->es_lblk = lblk; + es->es_len = len; + es->es_pblk = pblk; + + /* + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { + if (!EXT4_I(inode)->i_es_shk_nr++) + ext4_es_list_add(inode); + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); + } + + EXT4_I(inode)->i_es_all_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + + return es; +} + +static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) +{ + EXT4_I(inode)->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + + /* Decrease the shrink counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { + BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); + if (!--EXT4_I(inode)->i_es_shk_nr) + ext4_es_list_del(inode); + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); + } + + kmem_cache_free(ext4_es_cachep, es); +} + +/* + * Check whether or not two extents can be merged + * Condition: + * - logical block number is contiguous + * - physical block number is contiguous + * - status is equal + */ +static int ext4_es_can_be_merged(struct extent_status *es1, + struct extent_status *es2) +{ + if (ext4_es_type(es1) != ext4_es_type(es2)) + return 0; + + if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { + pr_warn("ES assertion failed when merging extents. " + "The sum of lengths of es1 (%d) and es2 (%d) " + "is bigger than allowed file size (%d)\n", + es1->es_len, es2->es_len, EXT_MAX_BLOCKS); + WARN_ON(1); + return 0; + } + + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; +} + +static struct extent_status * +ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es1; + struct rb_node *node; + + node = rb_prev(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es1, es)) { + es1->es_len += es->es_len; + if (ext4_es_is_referenced(es)) + ext4_es_set_referenced(es1); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + es = es1; + } + + return es; +} + +static struct extent_status * +ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es1; + struct rb_node *node; + + node = rb_next(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es, es1)) { + es->es_len += es1->es_len; + if (ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es); + rb_erase(node, &tree->root); + ext4_es_free_extent(inode, es1); + } + + return es; +} + +#ifdef ES_AGGRESSIVE_TEST +#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ + +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertion failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertion failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertion failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertion failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + ext4_ext_drop_refs(path); + kfree(path); +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertion failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%x]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertion failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertion failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. + */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertion failed for inode: %lu " + "We can't find the block but we want to add " + "a written extent [%d/%d/%llu/%x]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct extent_status *es; + + while (*p) { + parent = *p; + es = rb_entry(parent, struct extent_status, rb_node); + + if (newes->es_lblk < es->es_lblk) { + if (ext4_es_can_be_merged(newes, es)) { + /* + * Here we can modify es_lblk directly + * because it isn't overlapped. + */ + es->es_lblk = newes->es_lblk; + es->es_len += newes->es_len; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) + ext4_es_store_pblock(es, + newes->es_pblk); + es = ext4_es_try_to_merge_left(inode, es); + goto out; + } + p = &(*p)->rb_left; + } else if (newes->es_lblk > ext4_es_end(es)) { + if (ext4_es_can_be_merged(es, newes)) { + es->es_len += newes->es_len; + es = ext4_es_try_to_merge_right(inode, es); + goto out; + } + p = &(*p)->rb_right; + } else { + BUG_ON(1); + return -EINVAL; + } + } + + es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len, + newes->es_pblk); + if (!es) + return -ENOMEM; + rb_link_node(&es->rb_node, parent, p); + rb_insert_color(&es->rb_node, &tree->root); + +out: + tree->cache_es = es; + return 0; +} + +/* + * ext4_es_insert_extent() adds information to an inode's extent + * status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + int err = 0; + + es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", + lblk, len, pblk, status, inode->i_ino); + + if (!len) + return 0; + + BUG_ON(end < lblk); + + if ((status & EXTENT_STATUS_DELAYED) && + (status & EXTENT_STATUS_WRITTEN)) { + ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " + " delayed and written which can potentially " + " cause data loss.\n", lblk, len); + WARN_ON(1); + } + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, pblk, status); + trace_ext4_es_insert_extent(inode, &newes); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) + goto retry; + if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + err = 0; + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + + return err; +} + +/* + * ext4_es_cache_extent() inserts information into the extent status + * tree if and only if there isn't information about the range in + * question already. + */ +void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status *es; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, pblk, status); + trace_ext4_es_cache_extent(inode, &newes); + + if (!len) + return; + + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + + es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); + if (!es || es->es_lblk > end) + __es_insert_extent(inode, &newes); + write_unlock(&EXT4_I(inode)->i_es_lock); +} + +/* + * ext4_es_lookup_extent() looks up an extent in extent status tree. + * + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. + * + * Return: 1 on found, 0 on not + */ +int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) +{ + struct ext4_es_tree *tree; + struct ext4_es_stats *stats; + struct extent_status *es1 = NULL; + struct rb_node *node; + int found = 0; + + trace_ext4_es_lookup_extent_enter(inode, lblk); + es_debug("lookup extent in block %u\n", lblk); + + tree = &EXT4_I(inode)->i_es_tree; + read_lock(&EXT4_I(inode)->i_es_lock); + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; + } + } + + node = tree->root.rb_node; + while (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (lblk < es1->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es1)) + node = node->rb_right; + else { + found = 1; + break; + } + } + +out: + stats = &EXT4_SB(inode->i_sb)->s_es_stats; + if (found) { + BUG_ON(!es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + if (!ext4_es_is_referenced(es)) + ext4_es_set_referenced(es); + stats->es_stats_cache_hits++; + } else { + stats->es_stats_cache_misses++; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_lookup_extent_exit(inode, es, found); + return found; +} + +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node *node; + struct extent_status *es; + struct extent_status orig_es; + ext4_lblk_t len1, len2; + ext4_fsblk_t block; + int err; + +retry: + err = 0; + es = __es_tree_search(&tree->root, lblk); + if (!es) + goto out; + if (es->es_lblk > end) + goto out; + + /* Simply invalidate cache_es. */ + tree->cache_es = NULL; + + orig_es.es_lblk = es->es_lblk; + orig_es.es_len = es->es_len; + orig_es.es_pblk = es->es_pblk; + + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; + len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0; + if (len1 > 0) + es->es_len = len1; + if (len2 > 0) { + if (len1 > 0) { + struct extent_status newes; + + newes.es_lblk = end + 1; + newes.es_len = len2; + block = 0x7FDEADBEEFULL; + if (ext4_es_is_written(&orig_es) || + ext4_es_is_unwritten(&orig_es)) + block = ext4_es_pblock(&orig_es) + + orig_es.es_len - len2; + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); + err = __es_insert_extent(inode, &newes); + if (err) { + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; + if ((err == -ENOMEM) && + __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) + goto retry; + goto out; + } + } else { + es->es_lblk = end + 1; + es->es_len = len2; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) { + block = orig_es.es_pblk + orig_es.es_len - len2; + ext4_es_store_pblock(es, block); + } + } + goto out; + } + + if (len1 > 0) { + node = rb_next(&es->rb_node); + if (node) + es = rb_entry(node, struct extent_status, rb_node); + else + es = NULL; + } + + while (es && ext4_es_end(es) <= end) { + node = rb_next(&es->rb_node); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + if (!node) { + es = NULL; + break; + } + es = rb_entry(node, struct extent_status, rb_node); + } + + if (es && es->es_lblk < end + 1) { + ext4_lblk_t orig_len = es->es_len; + + len1 = ext4_es_end(es) - end; + es->es_lblk = end + 1; + es->es_len = len1; + if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { + block = es->es_pblk + orig_len - len1; + ext4_es_store_pblock(es, block); + } + } + +out: + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a extent status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + ext4_lblk_t end; + int err = 0; + + trace_ext4_es_remove_extent(inode, lblk, len); + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + + if (!len) + return err; + + end = lblk + len - 1; + BUG_ON(end < lblk); + + /* + * ext4_clear_inode() depends on us taking i_es_lock unconditionally + * so that we are sure __es_shrink() is done with the inode before it + * is reclaimed. + */ + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); + write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_print_tree(inode); + return err; +} + +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) +{ + struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; + ktime_t start_time; + u64 scan_time; + int nr_to_walk; + int nr_shrunk = 0; + int retried = 0, nr_skipped = 0; + + es_stats = &sbi->s_es_stats; + start_time = ktime_get(); + +retry: + spin_lock(&sbi->s_es_lock); + nr_to_walk = sbi->s_es_nr_inode; + while (nr_to_walk-- > 0) { + if (list_empty(&sbi->s_es_list)) { + spin_unlock(&sbi->s_es_lock); + goto out; + } + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, + i_es_list); + /* Move the inode to the tail */ + list_move_tail(&ei->i_es_list, &sbi->s_es_list); + + /* + * Normally we try hard to avoid shrinking precached inodes, + * but we will as a last resort. + */ + if (!retried && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) { + nr_skipped++; + continue; + } + + if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { + nr_skipped++; + continue; + } + /* + * Now we hold i_es_lock which protects us from inode reclaim + * freeing inode under us + */ + spin_unlock(&sbi->s_es_lock); + + nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); + write_unlock(&ei->i_es_lock); + + if (nr_to_scan <= 0) + goto out; + spin_lock(&sbi->s_es_lock); + } + spin_unlock(&sbi->s_es_lock); + + /* + * If we skipped any inodes, and we weren't able to make any + * forward progress, try again to scan precached inodes. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + goto retry; + } + + if (locked_ei && nr_shrunk == 0) + nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); + +out: + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) + es_stats->es_stats_scan_time = (scan_time + + es_stats->es_stats_scan_time*3) / 4; + else + es_stats->es_stats_scan_time = scan_time; + if (scan_time > es_stats->es_stats_max_scan_time) + es_stats->es_stats_max_scan_time = scan_time; + if (likely(es_stats->es_stats_shrunk)) + es_stats->es_stats_shrunk = (nr_shrunk + + es_stats->es_stats_shrunk*3) / 4; + else + es_stats->es_stats_shrunk = nr_shrunk; + + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, + nr_skipped, retried); + return nr_shrunk; +} + +static unsigned long ext4_es_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr; + struct ext4_sb_info *sbi; + + sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); + return nr; +} + +static unsigned long ext4_es_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = container_of(shrink, + struct ext4_sb_info, s_es_shrinker); + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) + return ret; + + nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); + + trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); + return nr_shrunk; +} + +static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void * +ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) +{ + struct ext4_sb_info *sbi = seq->private; + struct ext4_es_stats *es_stats = &sbi->s_es_stats; + struct ext4_inode_info *ei, *max = NULL; + unsigned int inode_cnt = 0; + + if (v != SEQ_START_TOKEN) + return 0; + + /* here we just find an inode that has the max nr. of objects */ + spin_lock(&sbi->s_es_lock); + list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { + inode_cnt++; + if (max && max->i_es_all_nr < ei->i_es_all_nr) + max = ei; + else if (!max) + max = ei; + } + spin_unlock(&sbi->s_es_lock); + + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), + percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); + seq_printf(seq, " %lu/%lu cache hits/misses\n", + es_stats->es_stats_cache_hits, + es_stats->es_stats_cache_misses); + if (inode_cnt) + seq_printf(seq, " %d inodes on list\n", inode_cnt); + + seq_printf(seq, "average:\n %llu us scan time\n", + div_u64(es_stats->es_stats_scan_time, 1000)); + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); + if (inode_cnt) + seq_printf(seq, + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + " %llu us max scan time\n", + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, + div_u64(es_stats->es_stats_max_scan_time, 1000)); + + return 0; +} + +static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_es_seq_shrinker_info_ops = { + .start = ext4_es_seq_shrinker_info_start, + .next = ext4_es_seq_shrinker_info_next, + .stop = ext4_es_seq_shrinker_info_stop, + .show = ext4_es_seq_shrinker_info_show, +}; + +static int +ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = PDE_DATA(inode); + } + + return ret; +} + +static int +ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +static const struct file_operations ext4_es_seq_shrinker_info_fops = { + .owner = THIS_MODULE, + .open = ext4_es_seq_shrinker_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ext4_es_seq_shrinker_info_release, +}; + +int ext4_es_register_shrinker(struct ext4_sb_info *sbi) +{ + int err; + + /* Make sure we have enough bits for physical block number */ + BUILD_BUG_ON(ES_SHIFT < 48); + INIT_LIST_HEAD(&sbi->s_es_list); + sbi->s_es_nr_inode = 0; + spin_lock_init(&sbi->s_es_lock); + sbi->s_es_stats.es_stats_shrunk = 0; + sbi->s_es_stats.es_stats_cache_hits = 0; + sbi->s_es_stats.es_stats_cache_misses = 0; + sbi->s_es_stats.es_stats_scan_time = 0; + sbi->s_es_stats.es_stats_max_scan_time = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); + if (err) + goto err1; + + sbi->s_es_shrinker.scan_objects = ext4_es_scan; + sbi->s_es_shrinker.count_objects = ext4_es_count; + sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; + err = register_shrinker(&sbi->s_es_shrinker); + if (err) + goto err2; + + if (sbi->s_proc) + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, + &ext4_es_seq_shrinker_info_fops, sbi); + + return 0; + +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + return err; +} + +void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) +{ + if (sbi->s_proc) + remove_proc_entry("es_shrinker_info", sbi->s_proc); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); + unregister_shrinker(&sbi->s_es_shrinker); +} + +/* + * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at + * most *nr_to_scan extents, update *nr_to_scan accordingly. + * + * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. + * Increment *nr_shrunk by the number of reclaimed extents. Also update + * ei->i_es_shrink_lblk to where we should continue scanning. + */ +static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, + int *nr_to_scan, int *nr_shrunk) +{ + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct extent_status *es; + struct rb_node *node; + + es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); + if (!es) + goto out_wrap; + node = &es->rb_node; + while (*nr_to_scan > 0) { + if (es->es_lblk > end) { + ei->i_es_shrink_lblk = end + 1; + return 0; + } + + (*nr_to_scan)--; + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ + if (ext4_es_is_delayed(es)) + goto next; + if (ext4_es_is_referenced(es)) { + ext4_es_clear_referenced(es); + goto next; + } + + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + (*nr_shrunk)++; +next: + if (!node) + goto out_wrap; + es = rb_entry(node, struct extent_status, rb_node); + } + ei->i_es_shrink_lblk = es->es_lblk; + return 1; +out_wrap: + ei->i_es_shrink_lblk = 0; + return 0; +} + +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) +{ + struct inode *inode = &ei->vfs_inode; + int nr_shrunk = 0; + ext4_lblk_t start = ei->i_es_shrink_lblk; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (ei->i_es_shk_nr == 0) + return 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + + if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && + start != 0) + es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); + + ei->i_es_tree.cache_es = NULL; + return nr_shrunk; +} diff --git a/kernel/fs/ext4/extents_status.h b/kernel/fs/ext4/extents_status.h new file mode 100644 index 000000000..691b52613 --- /dev/null +++ b/kernel/fs/ext4/extents_status.h @@ -0,0 +1,175 @@ +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/kernel/fs/ext4/file.c b/kernel/fs/ext4/file.c new file mode 100644 index 000000000..0613c256c --- /dev/null +++ b/kernel/fs/ext4/file.c @@ -0,0 +1,651 @@ +/* + * linux/fs/ext4/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext4_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext4_release_file(struct inode *inode, struct file *filp) +{ + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { + ext4_alloc_da_blocks(inode); + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) + { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } + if (is_dx(inode) && filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +static void ext4_unwritten_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + */ +static int +ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + int blockmask = sb->s_blocksize - 1; + + if (pos >= i_size_read(inode)) + return 0; + + if ((pos | iov_iter_alignment(from)) & blockmask) + return 1; + + return 0; +} + +static ssize_t +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(iocb->ki_filp); + struct mutex *aio_mutex = NULL; + struct blk_plug plug; + int o_direct = iocb->ki_flags & IOCB_DIRECT; + int overwrite = 0; + ssize_t ret; + + /* + * Unaligned direct AIO must be serialized; see comment above + * In the case of O_APPEND, assume that we must always serialize + */ + if (o_direct && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb) && + (iocb->ki_flags & IOCB_APPEND || + ext4_unaligned_aio(inode, from, iocb->ki_pos))) { + aio_mutex = ext4_aio_mutex(inode); + mutex_lock(aio_mutex); + ext4_unwritten_wait(inode); + } + + mutex_lock(&inode->i_mutex); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { + ret = -EFBIG; + goto out; + } + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); + } + + iocb->private = &overwrite; + if (o_direct) { + size_t length = iov_iter_count(from); + loff_t pos = iocb->ki_pos; + blk_start_plug(&plug); + + /* check whether we do a DIO overwrite or not */ + if (ext4_should_dioread_nolock(inode) && !aio_mutex && + !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, len; + + map.m_lblk = pos >> blkbits; + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) + - map.m_lblk; + len = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of blocks has + * been preallocated no matter they are + * initialized or not. For excluding + * unwritten extents, we need to check + * m_flags. There are two conditions that + * indicate for initialized extents. 1) If we + * hit extent cache, EXT4_MAP_MAPPED flag is + * returned; 2) If we do a real lookup, + * non-flags are returned. So we should check + * these two conditions. + */ + if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) + overwrite = 1; + } + } + + ret = __generic_file_write_iter(iocb, from); + mutex_unlock(&inode->i_mutex); + + if (ret > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + if (o_direct) + blk_finish_plug(&plug); + + if (aio_mutex) + mutex_unlock(aio_mutex); + return ret; + +out: + mutex_unlock(&inode->i_mutex); + if (aio_mutex) + mutex_unlock(aio_mutex); + return ret; +} + +#ifdef CONFIG_FS_DAX +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext4_get_block); + /* Is this the right get_block? */ +} + +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext4_get_block); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .page_mkwrite = ext4_dax_mkwrite, + .pfn_mkwrite = dax_pfn_mkwrite, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + +static const struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file->f_mapping->host; + + if (ext4_encrypted_inode(inode)) { + int err = ext4_generate_encryption_key(inode); + if (err) + return 0; + } + file_accessed(file); + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &ext4_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else { + vma->vm_ops = &ext4_file_vm_ops; + } + return 0; +} + +static int ext4_file_open(struct inode * inode, struct file * filp) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct vfsmount *mnt = filp->f_path.mnt; + struct path path; + char buf[64], *cp; + int ret; + + if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && + !(sb->s_flags & MS_RDONLY))) { + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + if (!IS_ERR(cp)) { + handle_t *handle; + int err; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) { + ext4_journal_stop(handle); + return err; + } + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_handle_dirty_super(handle, sb); + ext4_journal_stop(handle); + } + } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (filp->f_mode & FMODE_WRITE) { + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; + } + ret = dquot_file_open(inode, filp); + if (!ret && ext4_encrypted_inode(inode)) { + ret = ext4_generate_encryption_key(inode); + if (ret) + ret = -EACCES; + } + return ret; +} + +/* + * Here we use ext4_map_blocks() to get a block mapping for a extent-based + * file rather than ext4_ext_walk_space() because we can introduce + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same + * function. When extent status tree has been fully implemented, it will + * track all extent status for a file and we can directly use it to + * retrieve the offset for SEEK_DATA/SEEK_HOLE. + */ + +/* + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to + * lookup page cache to check whether or not there has some data between + * [startoff, endoff] because, if this range contains an unwritten extent, + * we determine this extent as a data or a hole according to whether the + * page cache has data or not. + */ +static int ext4_find_unwritten_pgoff(struct inode *inode, + int whence, + struct ext4_map_blocks *map, + loff_t *offset) +{ + struct pagevec pvec; + unsigned int blkbits; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff; + loff_t lastoff; + int found = 0; + + blkbits = inode->i_sb->s_blocksize_bits; + startoff = *offset; + lastoff = startoff; + endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + + index = startoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + do { + int i, num; + unsigned long nr_pages; + + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + (pgoff_t)num); + if (nr_pages == 0) { + if (whence == SEEK_DATA) + break; + + BUG_ON(whence != SEEK_HOLE); + /* + * If this is the first time to go into the loop and + * offset is not beyond the end offset, it will be a + * hole at this offset + */ + if (lastoff == startoff || lastoff < endoff) + found = 1; + break; + } + + /* + * If this is the first time to go into the loop and + * offset is smaller than the first page offset, it will be a + * hole at this offset. + */ + if (lastoff == startoff && whence == SEEK_HOLE && + lastoff < page_offset(pvec.pages[0])) { + found = 1; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + /* + * If the current offset is not beyond the end of given + * range, it will be a hole. + */ + if (lastoff < endoff && whence == SEEK_HOLE && + page->index > end) { + found = 1; + *offset = lastoff; + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + if (page_has_buffers(page)) { + lastoff = page_offset(page); + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) || + buffer_unwritten(bh)) { + if (whence == SEEK_DATA) + found = 1; + } else { + if (whence == SEEK_HOLE) + found = 1; + } + if (found) { + *offset = max_t(loff_t, + startoff, lastoff); + unlock_page(page); + goto out; + } + lastoff += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The no. of pages is less than our desired, that would be a + * hole in there. + */ + if (nr_pages < num && whence == SEEK_HOLE) { + found = 1; + *offset = lastoff; + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +/* + * ext4_seek_data() retrieves the offset for SEEK_DATA. + */ +static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t dataoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + dataoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + if (last != start) + dataoff = (loff_t)last << blkbits; + break; + } + + /* + * If there is a delay extent at this offset, + * it will be as a data. + */ + ext4_es_find_delayed_extent_range(inode, last, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + if (last != start) + dataoff = (loff_t)last << blkbits; + break; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, + &map, &dataoff); + if (unwritten) + break; + } + + last++; + dataoff = (loff_t)last << blkbits; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (dataoff > isize) + return -ENXIO; + + return vfs_setpos(file, dataoff, maxsize); +} + +/* + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. + */ +static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t holeoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + holeoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + last += ret; + holeoff = (loff_t)last << blkbits; + continue; + } + + /* + * If there is a delay extent at this offset, + * we will skip this extent. + */ + ext4_es_find_delayed_extent_range(inode, last, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + last = es.es_lblk + es.es_len; + holeoff = (loff_t)last << blkbits; + continue; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + &map, &holeoff); + if (!unwritten) { + last += ret; + holeoff = (loff_t)last << blkbits; + continue; + } + } + + /* find a hole */ + break; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (holeoff > isize) + holeoff = isize; + + return vfs_setpos(file, holeoff, maxsize); +} + +/* + * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values + * by calling generic_file_llseek_size() with the appropriate maxbytes + * value for each. + */ +loff_t ext4_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + else + maxbytes = inode->i_sb->s_maxbytes; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + return ext4_seek_data(file, offset, maxbytes); + case SEEK_HOLE: + return ext4_seek_hole(file, offset, maxbytes); + } + + return -EINVAL; +} + +const struct file_operations ext4_file_operations = { + .llseek = ext4_llseek, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = ext4_fallocate, +}; + +const struct inode_operations ext4_file_inode_operations = { + .setattr = ext4_setattr, + .getattr = ext4_getattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, + .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, + .fiemap = ext4_fiemap, +}; + diff --git a/kernel/fs/ext4/fsync.c b/kernel/fs/ext4/fsync.c new file mode 100644 index 000000000..885025413 --- /dev/null +++ b/kernel/fs/ext4/fsync.c @@ -0,0 +1,150 @@ +/* + * linux/fs/ext4/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" + +#include + +/* + * If we're not journaling and this is a just-created file, we have to + * sync our parent directory (if it was freshly created) since + * otherwise it will only be written by writeback, leaving a huge + * window during which a crash may lose the file. This may apply for + * the parent directory's parent as well, and so on recursively, if + * they are also freshly created. + */ +static int ext4_sync_parent(struct inode *inode) +{ + struct dentry *dentry = NULL; + struct inode *next; + int ret = 0; + + if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + return 0; + inode = igrab(inode); + while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); + dentry = d_find_any_alias(inode); + if (!dentry) + break; + next = igrab(d_inode(dentry->d_parent)); + dput(dentry); + if (!next) + break; + iput(inode); + inode = next; + ret = sync_mapping_buffers(inode->i_mapping); + if (ret) + break; + ret = sync_inode_metadata(inode, 1); + if (ret) + break; + } + iput(inode); + return ret; +} + +/* + * akpm: A new design for ext4_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + */ + +int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + int ret = 0, err; + tid_t commit_tid; + bool needs_barrier = false; + + J_ASSERT(ext4_journal_current_handle() == NULL); + + trace_ext4_sync_file_enter(file, datasync); + + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated s_mount_flags value */ + smp_rmb(); + if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + ret = -EROFS; + goto out; + } + + if (!journal) { + ret = generic_file_fsync(file, start, end, datasync); + if (!ret && !hlist_empty(&inode->i_dentry)) + ret = ext4_sync_parent(inode); + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + /* + * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. + * Metadata is in the journal, we wait for proper transaction to + * commit here. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext4_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } + + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + ret = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } +out: + trace_ext4_sync_file_exit(inode, ret); + return ret; +} diff --git a/kernel/fs/ext4/hash.c b/kernel/fs/ext4/hash.c new file mode 100644 index 000000000..e026aa941 --- /dev/null +++ b/kernel/fs/ext4/hash.c @@ -0,0 +1,207 @@ +/* + * linux/fs/ext4/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. + */ + +#include +#include +#include "ext4.h" + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. + */ +int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i = 0; i < 4; i++) { + if (hinfo->seed[i]) { + memcpy(buf, hinfo->seed, sizeof(buf)); + break; + } + } + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT4_HTREE_EOF_32BIT << 1)) + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/kernel/fs/ext4/ialloc.c b/kernel/fs/ext4/ialloc.c new file mode 100644 index 000000000..1eaa6cb96 --- /dev/null +++ b/kernel/fs/ext4/ialloc.c @@ -0,0 +1,1350 @@ +/* + * linux/fs/ext4/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +#include + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext4_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* Initializes an uninitialized inode bitmap */ +static unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks and inodes use to prevent + * allocation, essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return 0; + } + + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); + + return EXT4_INODES_PER_GROUP(sb); +} + +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + + bitmap_blk = ext4_inode_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + if (bitmap_uptodate(bh)) + goto verify; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + goto verify; + } + + ext4_lock_group(sb, block_group); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + goto verify; + } + /* + * submit the buffer_head for reading + */ + trace_ext4_load_inode_bitmap(sb, block_group); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + put_bh(bh); + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + +verify: + ext4_lock_group(sb, block_group); + if (!buffer_verified(bh) && + !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_INODES_PER_GROUP(sb) / 8)) { + ext4_unlock_group(sb, block_group); + put_bh(bh); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return NULL; + } + ext4_unlock_group(sb, block_group); + set_buffer_verified(bh); + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext4_free_inode(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + ext4_group_t block_group; + unsigned long bit; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; + struct ext4_sb_info *sbi; + int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; + + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); + return; + } + if (atomic_read(&inode->i_count) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); + return; + } + sbi = EXT4_SB(sb); + + ino = inode->i_ino; + ext4_debug("freeing inode %lu\n", ino); + trace_ext4_free_inode(inode); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + dquot_initialize(inode); + ext4_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + ext4_clear_inode(inode); + + es = EXT4_SB(sb)->s_es; + if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext4_error(sb, "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bh2); + } + ext4_lock_group(sb, block_group); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } + + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + percpu_counter_dec(&sbi->s_dirs_counter); + } + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); + ext4_unlock_group(sb, block_group); + + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, block_group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + if (is_directory) + atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + } else { + ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, fatal); +} + +struct orlov_stats { + __u64 free_clusters; + __u32 free_inodes; + __u32 used_dirs; +}; + +/* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. + */ +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; + + if (flex_size > 1) { + stats->free_inodes = atomic_read(&flex_group[g].free_inodes); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); + stats->used_dirs = atomic_read(&flex_group[g].used_dirs); + return; + } + + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_clusters = 0; + stats->used_dirs = 0; + } +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + */ + +static int find_group_orlov(struct super_block *sb, struct inode *parent, + ext4_group_t *group, umode_t mode, + const struct qstr *qstr) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t real_ngroups = ext4_get_groups_count(sb); + int inodes_per_group = EXT4_INODES_PER_GROUP(sb); + unsigned int freei, avefreei, grp_free; + ext4_fsblk_t freeb, avefreec; + unsigned int ndirs; + int max_dirs, min_inodes; + ext4_grpblk_t min_clusters; + ext4_group_t i, grp, g, ngroups; + struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; + + ngroups = real_ngroups; + if (flex_size > 1) { + ngroups = (real_ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + avefreec = freeb; + do_div(avefreec, ngroups); + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if (S_ISDIR(mode) && + ((parent == d_inode(sb->s_root)) || + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { + int best_ndir = inodes_per_group; + int ret = -1; + + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(qstr->name, qstr->len, &hinfo); + grp = hinfo.hash; + } else + grp = prandom_u32(); + parent_group = (unsigned)grp % ngroups; + for (i = 0; i < ngroups; i++) { + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) + continue; + if (stats.used_dirs >= best_ndir) + continue; + if (stats.free_inodes < avefreei) + continue; + if (stats.free_clusters < avefreec) + continue; + grp = g; + ret = 0; + best_ndir = stats.used_dirs; + } + if (ret) + goto fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). + */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= real_ngroups) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } + } + goto fallback; + } + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; + + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } + + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) + continue; + if (stats.free_inodes < min_inodes) + continue; + if (stats.free_clusters < min_clusters) + continue; + goto found_flex_bg; + } + +fallback: + ngroups = real_ngroups; + avefreei = freei / ngroups; +fallback_retry: + parent_group = EXT4_I(parent)->i_block_group; + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } + } + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback_retry; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent, + ext4_group_t *group, umode_t mode) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode is the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that use that flex + * group for future allocations. + */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. + */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode, NULL); + } + + /* + * Try to place the inode in its parent directory + */ + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) + return 0; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + *group = (*group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) + return 0; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + *group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) + return 0; + } + + return -1; +} + +/* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out. + */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, int handle_type, + unsigned int line_no, int nblocks) +{ + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; + ext4_group_t ngroups, group = 0; + unsigned long ino = 0; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + int ret2, err = 0; + struct inode *ret; + ext4_group_t i; + ext4_group_t flex_group; + struct ext4_group_info *grp; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); + trace_ext4_request_inode(dir, mode); + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + sbi = EXT4_SB(sb); + + /* + * Initalize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else + ret2 = find_group_other(sb, dir, &group, mode); + +got_group: + EXT4_I(dir)->i_last_alloc_group = group; + err = -ENOSPC; + if (ret2 == -1) + goto out; + + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ + for (i = 0; i < ngroups; i++, ino = 0) { + err = -EIO; + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * Check free inodes count before loading bitmap. + */ + if (ext4_free_inodes_count(sb, gdp) == 0) { + if (++group == ngroups) + group = 0; + continue; + } + + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } + + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } + +repeat_in_this_group: + ino = ext4_find_next_zero_bit((unsigned long *) + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + if (ino >= EXT4_INODES_PER_GROUP(sb)) + goto next_group; + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + continue; + } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } + if (!handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(dir->i_sb, line_no, + handle_type, nblocks, + 0); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_std_error(sb, err); + goto out; + } + } + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ +next_inode: + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; +next_group: + if (++group == ngroups) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + /* We may have to initialize the block bitmap if it isn't already */ + if (ext4_has_group_desc_csum(sb) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + if (!block_bitmap_bh) { + err = -EIO; + goto out; + } + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); + if (err) { + brelse(block_bitmap_bh); + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, group, gdp, + block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); + + if (err) { + ext4_std_error(sb, err); + goto out; + } + } + + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); + } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); + } + + inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + ext4_current_time(inode); + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + /* Don't inherit extent flag from directory, amongst others. */ + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_file_acl = 0; + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; + + /* If the directory encrypted, then we should encrypt the inode. */ + if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) && + (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(sbi))) + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + + ext4_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + ext4_handle_sync(handle); + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; + } + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + /* Precompute checksum seed for inode metadata */ + if (ext4_has_metadata_csum(sb)) { + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ext4_set_inode_state(inode, EXT4_STATE_NEW); + + ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) && + (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) { + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + } else { + /* Inline data and encryption are incompatible + * We turn off inline data since encryption is enabled */ + ei->i_inline_off = 1; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + } +#else + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +#endif + ret = inode; + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + ei->i_datasync_tid = handle->h_transaction->t_tid; + } + + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + + ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_ext4_allocate_inode(inode, dir, mode); + brelse(inode_bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); +fail_drop: + clear_nlink(inode); + unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + iput(inode); + brelse(inode_bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + ext4_group_t block_group; + int bit; + struct buffer_head *bitmap_bh; + struct inode *inode = NULL; + long err = -EIO; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); + goto error; + } + + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext4_warning(sb, "inode bitmap error for orphan %lu", ino); + goto error; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext4_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); + printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_WARNING "inode=%p\n", inode); + if (inode) { + printk(KERN_WARNING "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_WARNING "max_ino=%lu\n", max_ino); + printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); +error: + return ERR_PTR(err); +} + +unsigned long ext4_count_free_inodes(struct super_block *sb) +{ + unsigned long desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext4_count_dirs(struct super_block * sb) +{ + unsigned long count = 0; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + count += ext4_used_dirs_count(sb, gdp); + } + return count; +} + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_new_inode() until we are finished. + */ +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + + /* This should not happen, but just to be sure check this */ + if (sb->s_flags & MS_RDONLY) { + ret = 1; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. + */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. + */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); + + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto err_out; + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, + group_desc_bh); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. + */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} diff --git a/kernel/fs/ext4/indirect.c b/kernel/fs/ext4/indirect.c new file mode 100644 index 000000000..958824019 --- /dev/null +++ b/kernel/fs/ext4/indirect.c @@ -0,0 +1,1558 @@ +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + */ + +#include "ext4_jbd2.h" +#include "truncate.h" +#include + +#include + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + int ret = -EIO; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) { + ret = -ENOMEM; + goto failure; + } + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = ret; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + return ext4_inode_to_goal_block(inode); +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, + struct ext4_allocation_request *ar, + int indirect_blks, ext4_lblk_t *offsets, + Indirect *branch) +{ + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; + + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { + new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); + } else + ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, + ar->inode, ar->goal, + ar->flags & EXT4_MB_DELALLOC_RESERVED, + NULL, &err); + if (err) { + i--; + goto failed; + } + branch[i].key = cpu_to_le32(new_blocks[i]); + if (i == 0) + continue; + + bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, bh->b_size); + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; + b = new_blocks[i]; + + if (i == indirect_blks) + len = ar->len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, ar->inode, bh); + if (err) + goto failed; + } + return 0; +failed: + for (; i >= 0; i--) { + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) + ext4_forget(handle, 1, ar->inode, branch[i].bh, + branch[i].bh->b_blocknr); + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar->len : 1, 0); + } + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, + struct ext4_allocation_request *ar, + Indirect *where, int num) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && ar->len > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < ar->len; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, ar->inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), + ar->len, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. + */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + struct ext4_allocation_request ar; + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + EXT4_ERROR_INODE(inode, "Can't allocate blocks for " + "non-extent mapped inodes with bigalloc"); + return -ENOSPC; + } + + /* Set up for the direct block allocation */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.logical = map->m_lblk; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; + + ar.goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + ar.len = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, &ar, indirect_blks, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, &ar, partial, indirect_blks); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); + count = ar.len; +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, flags, map, err); + return err; +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_iter_count(iter); + int retries = 0; + + if (iov_iter_rw(iter) == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) { + /* + * Nolock dioread optimization may be dynamically disabled + * via ext4_inode_block_unlocked_dio(). Check inode's state + * while holding extra i_dio_count ref. + */ + inode_dio_begin(inode); + smp_mb(); + if (unlikely(ext4_test_inode_state(inode, + EXT4_STATE_DIOREAD_LOCK))) { + inode_dio_end(inode); + goto locked; + } + if (IS_DAX(inode)) + ret = dax_do_io(iocb, inode, iter, offset, + ext4_get_block, NULL, 0); + else + ret = __blockdev_direct_IO(iocb, inode, + inode->i_sb->s_bdev, iter, + offset, ext4_get_block, NULL, + NULL, 0); + inode_dio_end(inode); + } else { +locked: + if (IS_DAX(inode)) + ret = dax_do_io(iocb, inode, iter, offset, + ext4_get_block, NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(iocb, inode, iter, offset, + ext4_get_block); + + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +/* + * Calculate number of indirect blocks touched by mapping @nrblocks logically + * contiguous blocks + */ +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) +{ + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * Try to extend this transaction for the purposes of truncation. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. + */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(handle_t *handle, struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + return; + } + + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + return; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } +} + +/** + * ext4_ind_remove_space - remove space from the range + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @start: First block to remove + * @end: One block after the last block to remove (exclusive) + * + * Free the blocks in the defined range (end is exclusive endpoint of + * range). This is used by ext4_punch_hole(). + */ +int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + ext4_lblk_t offsets[4], offsets2[4]; + Indirect chain[4], chain2[4]; + Indirect *partial, *partial2; + ext4_lblk_t max_block; + __le32 nr = 0, nr2 = 0; + int n = 0, n2 = 0; + unsigned blocksize = inode->i_sb->s_blocksize; + + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + if (end >= max_block) + end = max_block; + if ((start >= end) || (start > max_block)) + return 0; + + n = ext4_block_to_path(inode, start, offsets, NULL); + n2 = ext4_block_to_path(inode, end, offsets2, NULL); + + BUG_ON(n > n2); + + if ((n == 1) && (n == n2)) { + /* We're punching only within direct block range */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + offsets2[0]); + return 0; + } else if (n2 > n) { + /* + * Start and end are on a different levels so we're going to + * free partial block at start, and partial block at end of + * the range. If there are some levels in between then + * do_indirects label will take care of that. + */ + + if (n == 1) { + /* + * Start is at the direct block level, free + * everything to the end of the level. + */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto end_range; + } + + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + + /* + * Clear the ends of indirect blocks on the shared branch + * at the start of the range + */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + +end_range: + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + if (nr2) { + if (partial2 == chain2) { + /* + * Remember, end is exclusive so here we're at + * the start of the next level we're not going + * to free. Everything was covered by the start + * of the range. + */ + goto do_indirects; + } + } else { + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; + } + + /* + * Clear the ends of indirect blocks on the shared branch + * at the end of the range + */ + while (partial2 > chain2) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n2-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; + } + goto do_indirects; + } + + /* Punch happened within the same level (n == n2) */ + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + + /* Free top, but only if partial2 isn't its subtree. */ + if (nr) { + int level = min(partial - chain, partial2 - chain2); + int i; + int subtree = 1; + + for (i = 0; i <= level; i++) { + if (offsets[i] != offsets2[i]) { + subtree = 0; + break; + } + } + + if (!subtree) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, + (chain+n-1) - partial); + *partial->p = 0; + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, + (chain+n-1) - partial); + } + } + } + + if (!nr2) { + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; + } + + while (partial > chain || partial2 > chain2) { + int depth = (chain+n-1) - partial; + int depth2 = (chain2+n2-1) - partial2; + + if (partial > chain && partial2 > chain2 && + partial->bh->b_blocknr == partial2->bh->b_blocknr) { + /* + * We've converged on the same block. Clear the range, + * then we're done. + */ + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + partial2->p, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + return 0; + } + + /* + * The start and end partial branches may not be at the same + * level even though the punch happened within one level. So, we + * give them a chance to arrive at the same level, then walk + * them in step with each other until we converge on the same + * block. + */ + if (partial > chain && depth <= depth2) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + if (partial2 > chain2 && depth2 <= depth) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n2-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; + } + } + return 0; + +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + if (++n >= n2) + return 0; + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + if (++n >= n2) + return 0; + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + if (++n >= n2) + return 0; + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + return 0; +} diff --git a/kernel/fs/ext4/inline.c b/kernel/fs/ext4/inline.c new file mode 100644 index 000000000..095c7a258 --- /dev/null +++ b/kernel/fs/ext4/inline.c @@ -0,0 +1,2024 @@ +/* + * Copyright (c) 2012 Taobao. + * Written by Tao Ma + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "truncate.h" + +#define EXT4_XATTR_SYSTEM_DATA "data" +#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) +#define EXT4_INLINE_DOTDOT_OFFSET 2 +#define EXT4_INLINE_DOTDOT_SIZE 4 + +static int ext4_get_inline_size(struct inode *inode) +{ + if (EXT4_I(inode)->i_inline_off) + return EXT4_I(inode)->i_inline_size; + + return 0; +} + +static int get_max_inline_xattr_value_size(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + int free, min_offs; + + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE - + EXT4_I(inode)->i_extra_isize - + sizeof(struct ext4_xattr_ibody_header); + + /* + * We need to subtract another sizeof(__u32) since an in-inode xattr + * needs an empty 4 bytes to indicate the gap between the xattr entry + * and the name/value pair. + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return EXT4_XATTR_SIZE(min_offs - + EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - + EXT4_XATTR_ROUND - sizeof(__u32)); + + raw_inode = ext4_raw_inode(iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* Compute min_offs. */ + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_block && entry->e_value_size) { + size_t offs = le16_to_cpu(entry->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - + ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); + + if (EXT4_I(inode)->i_inline_off) { + entry = (struct ext4_xattr_entry *) + ((void *)raw_inode + EXT4_I(inode)->i_inline_off); + + free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + goto out; + } + + free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); + + if (free > EXT4_XATTR_ROUND) + free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); + else + free = 0; + +out: + return free; +} + +/* + * Get the maximum size we now can store in an inode. + * If we can't find the space for a xattr entry, don't use the space + * of the extents since we have no space to indicate the inline data. + */ +int ext4_get_max_inline_size(struct inode *inode) +{ + int error, max_inline_size; + struct ext4_iloc iloc; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + ext4_error_inode(inode, __func__, __LINE__, 0, + "can't get inode location %lu", + inode->i_ino); + return 0; + } + + down_read(&EXT4_I(inode)->xattr_sem); + max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + + brelse(iloc.bh); + + if (!max_inline_size) + return 0; + + return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; +} + +/* + * this function does not take xattr_sem, which is OK because it is + * currently only used in a code path coming form ext4_iget, before + * the new inode has been unlocked + */ +int ext4_find_inline_data_nolock(struct inode *inode) +{ + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + if (!is.s.not_found) { + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + } +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_read_inline_data(struct inode *inode, void *buffer, + unsigned int len, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + int cp_len = 0; + struct ext4_inode *raw_inode; + + if (!len) + return 0; + + BUG_ON(len > EXT4_I(inode)->i_inline_size); + + cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? + len : EXT4_MIN_INLINE_DATA_SIZE; + + raw_inode = ext4_raw_inode(iloc); + memcpy(buffer, (void *)(raw_inode->i_block), cp_len); + + len -= cp_len; + buffer += cp_len; + + if (!len) + goto out; + + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + len = min_t(unsigned int, len, + (unsigned int)le32_to_cpu(entry->e_value_size)); + + memcpy(buffer, + (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); + cp_len += len; + +out: + return cp_len; +} + +/* + * write the buffer to the inline inode. + * If 'create' is set, we don't need to do the extra copy in the xattr + * value since it is already handled by ext4_xattr_ibody_inline_set. + * That saves us one memcpy. + */ +static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, + void *buffer, loff_t pos, unsigned int len) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int cp_len = 0; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); + + raw_inode = ext4_raw_inode(iloc); + buffer += pos; + + if (pos < EXT4_MIN_INLINE_DATA_SIZE) { + cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE - pos : len; + memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); + + len -= cp_len; + buffer += cp_len; + pos += cp_len; + } + + if (!len) + return; + + pos -= EXT4_MIN_INLINE_DATA_SIZE; + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + + memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, + buffer, len); +} + +static int ext4_create_inline_data(handle_t *handle, + struct inode *inode, unsigned len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + if (len > EXT4_MIN_INLINE_DATA_SIZE) { + value = EXT4_ZERO_XATTR_VALUE; + len -= EXT4_MIN_INLINE_DATA_SIZE; + } else { + value = ""; + len = 0; + } + + /* Insert the the xttr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(!is.s.not_found); + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) { + if (error == -ENOSPC) + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_update_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + /* If the old space is ok, write the data directly. */ + if (len <= EXT4_I(inode)->i_inline_size) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(is.s.not_found); + + len -= EXT4_MIN_INLINE_DATA_SIZE; + value = kzalloc(len, GFP_NOFS); + if (!value) + goto out; + + error = ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, len); + if (error == -ENODATA) + goto out; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + /* Update the xttr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) + goto out; + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + kfree(value); + brelse(is.iloc.bh); + return error; +} + +static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int ret, size; + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return -ENOSPC; + + size = ext4_get_max_inline_size(inode); + if (size < len) + return -ENOSPC; + + down_write(&EXT4_I(inode)->xattr_sem); + + if (ei->i_inline_off) + ret = ext4_update_inline_data(handle, inode, len); + else + ret = ext4_create_inline_data(handle, inode, len); + + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +static int ext4_destroy_inline_data_nolock(handle_t *handle, + struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_xattr_ibody_find is = { + .s = { .not_found = 0, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + .value = NULL, + .value_len = 0, + }; + int error; + + if (!ei->i_inline_off) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) + goto out; + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (S_ISDIR(inode->i_mode) || + S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + + EXT4_I(inode)->i_inline_off = 0; + EXT4_I(inode)->i_inline_size = 0; + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +out: + brelse(is.iloc.bh); + if (error == -ENODATA) + error = 0; + return error; +} + +static int ext4_read_inline_page(struct inode *inode, struct page *page) +{ + void *kaddr; + int ret = 0; + size_t len; + struct ext4_iloc iloc; + + BUG_ON(!PageLocked(page)); + BUG_ON(!ext4_has_inline_data(inode)); + BUG_ON(page->index); + + if (!EXT4_I(inode)->i_inline_off) { + ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", + inode->i_ino); + goto out; + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + + len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); + kaddr = kmap_atomic(page); + ret = ext4_read_inline_data(inode, kaddr, len, &iloc); + flush_dcache_page(page); + kunmap_atomic(kaddr); + zero_user_segment(page, len, PAGE_CACHE_SIZE); + SetPageUptodate(page); + brelse(iloc.bh); + +out: + return ret; +} + +int ext4_readpage_inline(struct inode *inode, struct page *page) +{ + int ret = 0; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + return -EAGAIN; + } + + /* + * Current inline data can only exist in the 1st page, + * So for all the other pages, just set them uptodate. + */ + if (!page->index) + ret = ext4_read_inline_page(inode, page); + else if (!PageUptodate(page)) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + up_read(&EXT4_I(inode)->xattr_sem); + + unlock_page(page); + return ret >= 0 ? 0 : ret; +} + +static int ext4_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags) +{ + int ret, needed_blocks; + handle_t *handle = NULL; + int retries = 0, sem_held = 0; + struct page *page = NULL; + unsigned from, to; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + /* + * clear the flag so that no new write + * will trap here again. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + +retry: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + sem_held = 1; + /* If some one has already done this for us, just exit. */ + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out; + } + + from = 0; + to = ext4_get_inline_size(inode); + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = ext4_destroy_inline_data_nolock(handle, inode); + if (ret) + goto out; + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, from, to, ext4_get_block_write); + else + ret = __block_write_begin(page, from, to, ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + page_cache_release(page); + page = NULL; + ext4_orphan_add(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + sem_held = 0; + ext4_journal_stop(handle); + handle = NULL; + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (page) + block_commit_write(page, from, to); +out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + if (sem_held) + up_write(&EXT4_I(inode)->xattr_sem); + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page the handle, move the data + * to the page make it update and let the later codes create extent for it. + */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep) +{ + int ret; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + + if (pos + len > ext4_get_max_inline_size(inode)) + goto convert; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + /* + * The possible write could happen in the inode, + * so try to reserve the space in inode first. + */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out; + + /* We don't have space in inline inode, so convert it to extent. */ + if (ret == -ENOSPC) { + ext4_journal_stop(handle); + brelse(iloc.bh); + goto convert; + } + + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + *pagep = page; + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + unlock_page(page); + page_cache_release(page); + goto out_up_read; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_up_read; + } + + ret = 1; + handle = NULL; +out_up_read: + up_read(&EXT4_I(inode)->xattr_sem); +out: + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +convert: + return ext4_convert_inline_data_to_extent(mapping, + inode, flags); +} + +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + if (unlikely(copied < len)) { + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + copied = 0; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + BUG_ON(!ext4_has_inline_data(inode)); + + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, len); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); + + up_write(&EXT4_I(inode)->xattr_sem); + brelse(iloc.bh); +out: + return copied; +} + +struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + return NULL; + } + + down_write(&EXT4_I(inode)->xattr_sem); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, 0, len); + kunmap_atomic(kaddr); + up_write(&EXT4_I(inode)->xattr_sem); + + return iloc.bh; +} + +/* + * Try to make the page cache and handle ready for the inline data case. + * We can call this function in 2 cases: + * 1. The inode is created and the first write exceeds inline size. We can + * clear the inode state safely. + * 2. The inode has inline data, then we need to read the data, make it + * update and dirty so that ext4_da_writepages can handle it. We don't + * need to start the journal since the file's metatdata isn't changed now. + */ +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags, + void **fsdata) +{ + int ret = 0, inline_size; + struct page *page; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) + return -ENOMEM; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + inline_size = ext4_get_inline_size(inode); + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = __block_write_begin(page, 0, inline_size, + ext4_da_get_block_prep); + if (ret) { + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); + ext4_truncate_failed_write(inode); + return ret; + } + + SetPageDirty(page); + SetPageUptodate(page); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + *fsdata = (void *)CONVERT_INLINE_DATA; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + if (page) { + unlock_page(page); + page_cache_release(page); + } + return ret; +} + +/* + * Prepare the write for the inline data. + * If the the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, makes it dirty so that it can be + * handle in writepages(the i_disksize update is left to the + * normal ext4_da_write_end). + */ +int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + int ret, inline_size; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + int retries; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + inline_size = ext4_get_max_inline_size(inode); + + ret = -ENOSPC; + if (inline_size >= pos + len) { + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out_journal; + } + + /* + * We cannot recurse into the filesystem as the transaction + * is already started. + */ + flags |= AOP_FLAG_NOFS; + + if (ret == -ENOSPC) { + ret = ext4_da_convert_inline_data_to_extent(mapping, + inode, + flags, + fsdata); + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + goto out; + } + + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out_journal; + } + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out_release_page; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_release_page; + } + + up_read(&EXT4_I(inode)->xattr_sem); + *pagep = page; + brelse(iloc.bh); + return 1; +out_release_page: + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); +out_journal: + ext4_journal_stop(handle); +out: + brelse(iloc.bh); + return ret; +} + +int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page) +{ + int i_size_changed = 0; + + copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} + +#ifdef INLINE_DIR_DEBUG +void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, + void *inline_start, int inline_size) +{ + int offset; + unsigned short de_len; + struct ext4_dir_entry_2 *de = inline_start; + void *dlimit = inline_start + inline_size; + + trace_printk("inode %lu\n", dir->i_ino); + offset = 0; + while ((void *)de < dlimit) { + de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); + trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", + offset, de_len, de->name_len, de->name, + de->name_len, le32_to_cpu(de->inode)); + if (ext4_check_dir_entry(dir, NULL, de, bh, + inline_start, inline_size, offset)) + BUG(); + + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } +} +#else +#define ext4_show_inline_dir(dir, bh, inline_start, inline_size) +#endif + +/* + * Add a new entry into a inline dir. + * It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + */ +static int ext4_add_dirent_to_inline(handle_t *handle, + struct dentry *dentry, + struct inode *inode, + struct ext4_iloc *iloc, + void *inline_start, int inline_size) +{ + struct inode *dir = d_inode(dentry->d_parent); + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + int err; + struct ext4_dir_entry_2 *de; + + err = ext4_find_dest_de(dir, inode, iloc->bh, + inline_start, inline_size, + name, namelen, &de); + if (err) + return err; + + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) + return err; + ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name, + name, namelen); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + return 1; +} + +static void *ext4_get_inline_xattr_pos(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + + header = IHDR(inode, ext4_raw_inode(iloc)); + entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + + EXT4_I(inode)->i_inline_off); + + return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); +} + +/* Set the final de to cover the whole block. */ +static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +{ + struct ext4_dir_entry_2 *de, *prev_de; + void *limit; + int de_len; + + de = (struct ext4_dir_entry_2 *)de_buf; + if (old_size) { + limit = de_buf + old_size; + do { + prev_de = de; + de_len = ext4_rec_len_from_disk(de->rec_len, old_size); + de_buf += de_len; + de = (struct ext4_dir_entry_2 *)de_buf; + } while (de_buf < limit); + + prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - + old_size, new_size); + } else { + /* this is just created, so create an empty entry. */ + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(new_size, new_size); + } +} + +static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, + struct ext4_iloc *iloc) +{ + int ret; + int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; + int new_size = get_max_inline_xattr_value_size(dir, iloc); + + if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + return -ENOSPC; + + ret = ext4_update_inline_data(handle, dir, + new_size + EXT4_MIN_INLINE_DATA_SIZE); + if (ret) + return ret; + + ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, + EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE); + dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; + return 0; +} + +static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc, + void *buf, int inline_size) +{ + ext4_create_inline_data(handle, inode, inline_size); + ext4_write_inline_data(inode, iloc, buf, 0, inline_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +} + +static int ext4_finish_convert_inline_dir(handle_t *handle, + struct inode *inode, + struct buffer_head *dir_block, + void *buf, + int inline_size) +{ + int err, csum_size = 0, header_size = 0; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + void *target = dir_block->b_data; + + /* + * First create "." and ".." and then copy the dir information + * back to the block. + */ + de = (struct ext4_dir_entry_2 *)target; + de = ext4_init_dot_dotdot(inode, de, + inode->i_sb->s_blocksize, csum_size, + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); + header_size = (void *)de - target; + + memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + inode->i_size = inode->i_sb->s_blocksize; + i_size_write(inode, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + ext4_update_final_de(dir_block->b_data, + inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, + inode->i_sb->s_blocksize - csum_size); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, + inode->i_sb->s_blocksize); + initialize_dirent_tail(t, inode->i_sb->s_blocksize); + } + set_buffer_uptodate(dir_block); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + return err; +} + +static int ext4_convert_inline_data_nolock(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + int error; + void *buf = NULL; + struct buffer_head *data_bh = NULL; + struct ext4_map_blocks map; + int inline_size; + + inline_size = ext4_get_inline_size(inode); + buf = kmalloc(inline_size, GFP_NOFS); + if (!buf) { + error = -ENOMEM; + goto out; + } + + error = ext4_read_inline_data(inode, buf, inline_size, iloc); + if (error < 0) + goto out; + + /* + * Make sure the inline directory entries pass checks before we try to + * convert them, so that we avoid touching stuff that needs fsck. + */ + if (S_ISDIR(inode->i_mode)) { + error = ext4_check_all_de(inode, iloc->bh, + buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + if (error) + goto out; + } + + error = ext4_destroy_inline_data_nolock(handle, inode); + if (error) + goto out; + + map.m_lblk = 0; + map.m_len = 1; + map.m_flags = 0; + error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); + if (error < 0) + goto out_restore; + if (!(map.m_flags & EXT4_MAP_MAPPED)) { + error = -EIO; + goto out_restore; + } + + data_bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!data_bh) { + error = -ENOMEM; + goto out_restore; + } + + lock_buffer(data_bh); + error = ext4_journal_get_create_access(handle, data_bh); + if (error) { + unlock_buffer(data_bh); + error = -EIO; + goto out_restore; + } + memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); + + if (!S_ISDIR(inode->i_mode)) { + memcpy(data_bh->b_data, buf, inline_size); + set_buffer_uptodate(data_bh); + error = ext4_handle_dirty_metadata(handle, + inode, data_bh); + } else { + error = ext4_finish_convert_inline_dir(handle, inode, data_bh, + buf, inline_size); + } + + unlock_buffer(data_bh); +out_restore: + if (error) + ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); + +out: + brelse(data_bh); + kfree(buf); + return error; +} + +/* + * Try to add the new entry to the inline data. + * If succeeds, return 0. If not, extended the inline dir and copied data to + * the new created block. + */ +int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + int ret, inline_size; + void *inline_start; + struct ext4_iloc iloc; + struct inode *dir = d_inode(dentry->d_parent); + + ret = ext4_get_inode_loc(dir, &iloc); + if (ret) + return ret; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) + goto out; + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + if (ret != -ENOSPC) + goto out; + + /* check whether it can be inserted to inline xattr space. */ + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + if (!inline_size) { + /* Try to use the xattr space.*/ + ret = ext4_update_inline_dir(handle, dir, &iloc); + if (ret && ret != -ENOSPC) + goto out; + + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_size) { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + + if (ret != -ENOSPC) + goto out; + } + + /* + * The inline space is filled up, so create a new block for it. + * As the extent tree will be created, we have to save the inline + * dir first. + */ + ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); + +out: + ext4_mark_inode_dirty(handle, dir); + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +/* + * This function fills a red-black tree with information from an + * inlined dir. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data) +{ + int err = 0, count = 0; + unsigned int parent_ino; + int pos; + struct ext4_dir_entry_2 *de; + struct inode *inode = file_inode(dir_file); + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + struct ext4_dir_entry_2 fake; + struct ext4_str tmp_str; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + pos = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + while (pos < inline_size) { + /* + * As inlined dir doesn't store any information about '.' and + * only the inode number of '..' is stored, we have to handle + * them differently. + */ + if (pos == 0) { + fake.inode = cpu_to_le32(inode->i_ino); + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_OFFSET; + } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { + fake.inode = cpu_to_le32(parent_ino); + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_SIZE; + } else { + de = (struct ext4_dir_entry_2 *)(dir_buf + pos); + pos += ext4_rec_len_from_disk(de->rec_len, inline_size); + if (ext4_check_dir_entry(inode, dir_file, de, + iloc.bh, dir_buf, + inline_size, pos)) { + ret = count; + goto out; + } + } + + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, hinfo->hash, + hinfo->minor_hash, de, &tmp_str); + if (err) { + count = err; + goto out; + } + count++; + } + ret = count; +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +/* + * So this function is called when the volume is mkfsed with + * dir_index disabled. In order to keep f_pos persistent + * after we convert from an inlined dir to a blocked based, + * we just pretend that we are a normal dir and return the + * offset as if '.' and '..' really take place. + * + */ +int ext4_read_inline_dir(struct file *file, + struct dir_context *ctx, + int *has_inline_data) +{ + unsigned int offset, parent_ino; + int i; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + struct inode *inode = file_inode(file); + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + int dotdot_offset, dotdot_size, extra_offset, extra_size; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + ret = 0; + sb = inode->i_sb; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + offset = ctx->pos; + + /* + * dotdot_offset and dotdot_size is the real offset and + * size for ".." and "." if the dir is block based while + * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. + * So we will use extra_offset and extra_size to indicate them + * during the inline dir iteration. + */ + dotdot_offset = EXT4_DIR_REC_LEN(1); + dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; + extra_size = extra_offset + inline_size; + + /* + * If the version has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the inline + * dir to make sure. + */ + if (file->f_version != inode->i_version) { + for (i = 0; i < extra_size && i < offset;) { + /* + * "." is with offset 0 and + * ".." is dotdot_offset. + */ + if (!i) { + i = dotdot_offset; + continue; + } else if (i == dotdot_offset) { + i = dotdot_size; + continue; + } + /* for other entry, the real offset in + * the buf has to be tuned accordingly. + */ + de = (struct ext4_dir_entry_2 *) + (dir_buf + i - extra_offset); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, extra_size) + < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + extra_size); + } + offset = i; + ctx->pos = offset; + file->f_version = inode->i_version; + } + + while (ctx->pos < extra_size) { + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_offset; + continue; + } + + if (ctx->pos == dotdot_offset) { + if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_size; + continue; + } + + de = (struct ext4_dir_entry_2 *) + (dir_buf + ctx->pos - extra_offset); + if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, + extra_size, ctx->pos)) + goto out; + if (le32_to_cpu(de->inode)) { + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto out; + } + ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); + } +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval) +{ + struct ext4_iloc iloc; + + *retval = ext4_get_inode_loc(inode, &iloc); + if (*retval) + return NULL; + + *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + + return iloc.bh; +} + +/* + * Try to create the inline data for the new dir. + * If it succeeds, return 0, otherwise return the error. + * In case of ENOSPC, the caller should create the normal disk layout dir. + */ +int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, + struct inode *inode) +{ + int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; + struct ext4_iloc iloc; + struct ext4_dir_entry_2 *de; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + ret = ext4_prepare_inline_data(handle, inode, inline_size); + if (ret) + goto out; + + /* + * For inline dir, we only save the inode information for the ".." + * and create a fake dentry to cover the left space. + */ + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + de->inode = cpu_to_le32(parent->i_ino); + de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk( + inline_size - EXT4_INLINE_DOTDOT_SIZE, + inline_size); + set_nlink(inode, 2); + inode->i_size = EXT4_I(inode)->i_disksize = inline_size; +out: + brelse(iloc.bh); + return ret; +} + +struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + int ret; + struct ext4_iloc iloc; + void *inline_start; + int inline_size; + + if (ext4_get_inode_loc(dir, &iloc)) + return NULL; + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + if (ret < 0) + goto out; + + if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) + goto out; + + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; + + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + +out: + brelse(iloc.bh); + iloc.bh = NULL; +out_find: + up_read(&EXT4_I(dir)->xattr_sem); + return iloc.bh; +} + +int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_start; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) + return err; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < + EXT4_MIN_INLINE_DATA_SIZE) { + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - + EXT4_INLINE_DOTDOT_SIZE; + } else { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, bh, + inline_start, inline_size, 0); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_mark_inode_dirty(handle, dir); + if (unlikely(err)) + goto out; + + ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); +out: + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + +/* + * Get the inline dentry at offset. + */ +static inline struct ext4_dir_entry_2 * +ext4_get_inline_entry(struct inode *inode, + struct ext4_iloc *iloc, + unsigned int offset, + void **inline_start, + int *inline_size) +{ + void *inline_pos; + + BUG_ON(offset > ext4_get_inline_size(inode)); + + if (offset < EXT4_MIN_INLINE_DATA_SIZE) { + inline_pos = (void *)ext4_raw_inode(iloc)->i_block; + *inline_size = EXT4_MIN_INLINE_DATA_SIZE; + } else { + inline_pos = ext4_get_inline_xattr_pos(inode, iloc); + offset -= EXT4_MIN_INLINE_DATA_SIZE; + *inline_size = ext4_get_inline_size(inode) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_start) + *inline_start = inline_pos; + return (struct ext4_dir_entry_2 *)(inline_pos + offset); +} + +int empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_pos; + unsigned int offset; + struct ext4_dir_entry_2 *de; + int ret = 1; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) { + EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", + err, dir->i_ino); + return 1; + } + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + if (!le32_to_cpu(de->inode)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - no `..'", + dir->i_ino); + ret = 1; + goto out; + } + + offset = EXT4_INLINE_DOTDOT_SIZE; + while (offset < dir->i_size) { + de = ext4_get_inline_entry(dir, &iloc, offset, + &inline_pos, &inline_size); + if (ext4_check_dir_entry(dir, NULL, de, + iloc.bh, inline_pos, + inline_size, offset)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - " + "inode %u, rec_len %u, name_len %d" + "inline size %d\n", + dir->i_ino, le32_to_cpu(de->inode), + le16_to_cpu(de->rec_len), de->name_len, + inline_size); + ret = 1; + goto out; + } + if (le32_to_cpu(de->inode)) { + ret = 0; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, inline_size); + } + +out: + up_read(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) +{ + int ret; + + down_write(&EXT4_I(inode)->xattr_sem); + ret = ext4_destroy_inline_data_nolock(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline, __u64 start, __u64 len) +{ + __u64 physical = 0; + __u64 inline_len; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; + int error = 0; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + goto out; + } + inline_len = min_t(size_t, ext4_get_inline_size(inode), + i_size_read(inode)); + if (start >= inline_len) + goto out; + if (start + len < inline_len) + inline_len = start + len; + inline_len -= start; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; + + physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + physical += offsetof(struct ext4_inode, i_block); + + if (physical) + error = fiemap_fill_next_extent(fieinfo, start, physical, + inline_len, flags); + brelse(iloc.bh); +out: + up_read(&EXT4_I(inode)->xattr_sem); + return (error < 0 ? error : 0); +} + +/* + * Called during xattr set, and if we can sparse space 'needed', + * just create the extent tree evict the data to the outer block. + * + * We use jbd2 instead of page cache to move data to the 1st block + * so that the whole transaction can be committed as a whole and + * the data isn't lost because of the delayed page cache write. + */ +int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed) +{ + int error; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + raw_inode = ext4_raw_inode(&iloc); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + if (EXT4_XATTR_LEN(entry->e_name_len) + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { + error = -ENOSPC; + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); +out: + brelse(iloc.bh); + return error; +} + +void ext4_inline_data_truncate(struct inode *inode, int *has_inline) +{ + handle_t *handle; + int inline_size, value_len, needed_blocks; + size_t i_size; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + + needed_blocks = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); + if (IS_ERR(handle)) + return; + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + ext4_journal_stop(handle); + return; + } + + if (ext4_orphan_add(handle, inode)) + goto out; + + if (ext4_get_inode_loc(inode, &is.iloc)) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = inode->i_size; + inline_size = ext4_get_inline_size(inode); + EXT4_I(inode)->i_disksize = i_size; + + if (i_size < inline_size) { + /* Clear the content in the xattr space. */ + if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { + if (ext4_xattr_ibody_find(inode, &i, &is)) + goto out_error; + + BUG_ON(is.s.not_found); + + value_len = le32_to_cpu(is.s.here->e_value_size); + value = kmalloc(value_len, GFP_NOFS); + if (!value) + goto out_error; + + if (ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, value_len)) + goto out_error; + + i.value = value; + i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? + i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; + if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) + goto out_error; + } + + /* Clear the content within i_blocks. */ + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } + + EXT4_I(inode)->i_inline_size = i_size < + EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE : i_size; + } + +out_error: + up_write(&EXT4_I(inode)->i_data_sem); +out: + brelse(is.iloc.bh); + up_write(&EXT4_I(inode)->xattr_sem); + kfree(value); + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); + return; +} + +int ext4_convert_inline_data(struct inode *inode) +{ + int error, needed_blocks; + handle_t *handle; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + iloc.bh = NULL; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_free; + } + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_write(&EXT4_I(inode)->xattr_sem); + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + up_write(&EXT4_I(inode)->xattr_sem); +out: + ext4_journal_stop(handle); +out_free: + brelse(iloc.bh); + return error; +} diff --git a/kernel/fs/ext4/inode.c b/kernel/fs/ext4/inode.c new file mode 100644 index 000000000..0554b0b59 --- /dev/null +++ b/kernel/fs/ext4/inode.c @@ -0,0 +1,5279 @@ +/* + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "truncate.h" + +#include + +#define MPAGE_DA_EXTENT_TAIL 0x01 + +static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u16 csum_lo; + __u16 csum_hi = 0; + __u32 csum; + + csum_lo = le16_to_cpu(raw->i_checksum_lo); + raw->i_checksum_lo = 0; + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum_hi = le16_to_cpu(raw->i_checksum_hi); + raw->i_checksum_hi = 0; + } + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, + EXT4_INODE_SIZE(inode->i_sb)); + + raw->i_checksum_lo = cpu_to_le16(csum_lo); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum_hi); + + return csum; +} + +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 provided, calculated; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !ext4_has_metadata_csum(inode->i_sb)) + return 1; + + provided = le16_to_cpu(raw->i_checksum_lo); + calculated = ext4_inode_csum(inode, raw, ei); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; + else + calculated &= 0xFFFF; + + return provided == calculated; +} + +static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 csum; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !ext4_has_metadata_csum(inode->i_sb)) + return; + + csum = ext4_inode_csum(inode, raw, ei); + raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum >> 16); +} + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + trace_ext4_begin_ordered_truncate(inode, new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. + */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); +static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); + +/* + * Test whether an inode is a fast symlink. + */ +int ext4_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) +{ + int ret; + + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + jbd_debug(2, "restarting handle %p\n", handle); + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, nblocks); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + return ret; +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext4_evict_inode(struct inode *inode) +{ + handle_t *handle; + int err; + + trace_ext4_evict_inode(inode); + + if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the + * journal. So although mm thinks everything is clean and + * ready for reaping the inode might still have some pages to + * write in the running transaction or waiting to be + * checkpointed. Thus calling jbd2_journal_invalidatepage() + * (via truncate_inode_pages()) to discard these buffers can + * cause data loss. Also even if we did not discard these + * buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to + * read them before the transaction is checkpointed. So be + * careful and force everything to disk here... We use + * ei->i_datasync_tid to store the newest transaction + * containing inode's data. + * + * Note that directories do not have this problem because they + * don't use page cache. + */ + if (ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; + + jbd2_complete_transaction(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } + truncate_inode_pages_final(&inode->i_data); + + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); + goto no_delete; + } + + if (is_bad_inode(inode)) + goto no_delete; + dquot_initialize(inode); + + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); + truncate_inode_pages_final(&inode->i_data); + + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); + + /* + * Protect us against freezing - iput() caller didn't have to have any + * protection against it + */ + sb_start_intwrite(inode->i_sb); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode)+3); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); + goto no_delete; + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } + if (inode->i_blocks) + ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ + if (!ext4_handle_has_enough_credits(handle, 3)) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); + goto no_delete; + } + } + + /* + * Kill off the orphan record which ext4_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext4_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext4_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext4_orphan_del(handle, inode); + EXT4_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext4_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + ext4_clear_inode(inode); + else + ext4_free_inode(handle, inode); + ext4_journal_stop(handle); + sb_end_intwrite(inode->i_sb); + return; +no_delete: + ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) +{ + return &EXT4_I(inode)->i_reserved_quota; +} +#endif + +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. + */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used, quota_claim); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_warning(inode->i_sb, "%s: ino %lu, used %d " + "with only %d reserved data blocks", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + /* Update quota subsystem for data blocks */ + if (quota_claim) + dquot_claim_block(inode, EXT4_C2B(sbi, used)); + else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not re-claim the quota for fallocated blocks. + */ + dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); + } + + /* + * If we have done all the pending block allocations and if + * there aren't any writers on the inode, we can discard the + * inode's preallocations. + */ + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) + ext4_discard_preallocations(inode); +} + +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, + struct ext4_map_blocks *map) +{ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, + map->m_len)) { + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock " + "(length %d)", (unsigned long) map->m_lblk, + map->m_len); + return -EIO; + } + return 0; +} + +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) + +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + + /* + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertion failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + +/* + * The ext4_map_blocks() function tries to look up the requested blocks, + * and returns if the blocks are already mapped. + * + * Otherwise it takes the write lock of the i_data_sem and allocate blocks + * and store the allocated blocks in the result buffer head and mark it + * mapped. + * + * If file type is extents based, it will call ext4_ext_map_blocks(), + * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping + * based files + * + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block, + * the result buffer head is unmapped. If the create ==1, it will make sure + * the buffer head is mapped. + * + * It returns 0 if plain look up failed (blocks have not been allocated), in + * that case, buffer head is unmapped + * + * It returns the error in case of allocation failure. + */ +int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + int retval; + int ret = 0; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif + + map->m_flags = 0; + ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, map->m_len, + (unsigned long) map->m_lblk); + + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; + map->m_flags |= ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + retval = 0; + } else { + BUG_ON(1); + } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif + goto found; + } + + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (retval > 0) { + unsigned int status; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, + map->m_len, map->m_pblk, status); + if (ret < 0) + retval = ret; + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + +found: + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + + /* If it is only a block(s) look up */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) + return retval; + + /* + * Returns if the blocks have already allocated + * + * Note that if blocks have been preallocated + * ext4_ext_get_block() returns the create = 0 + * with buffer head unmapped. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; + + /* + * Here we clear m_flags because after allocating an new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * New blocks allocate and/or writing to unwritten extent + * will possibly result in updating i_data, so we take + * the write lock of i_data_sem, and call get_block() + * with create == 1 flag. + */ + down_write(&EXT4_I(inode)->i_data_sem); + + /* + * We need to check for EXT4 here because migrate + * could have changed the inode type in between + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { + /* + * We allocated new blocks which will result in + * i_data's format changing. Force the migrate + * to fail by clearing migrate flags + */ + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. + */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } + + if (retval > 0) { + unsigned int status; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret < 0) + retval = ret; + } + +has_zeroout: + up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + return retval; +} + +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +static int _ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int flags) +{ + handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; + int ret = 0, started = 0; + int dio_credits; + + if (ext4_has_inline_data(inode)) + return -ERANGE; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { + /* Direct IO write... */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + started = 1; + } + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret > 0) { + ext4_io_end_t *io_end = ext4_inode_aio(inode); + + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + bh->b_assoc_map = inode->i_mapping; + bh->b_private = (void *)(unsigned long)iblock; + bh->b_end_io = ext4_end_io_unwritten; + } + if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) + set_buffer_defer_completion(bh); + bh->b_size = inode->i_sb->s_blocksize * map.m_len; + ret = 0; + } + if (started) + ext4_journal_stop(handle); + return ret; +} + +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create) +{ + struct ext4_map_blocks map; + struct buffer_head *bh; + int err; + + J_ASSERT(handle != NULL || create == 0); + + map.m_lblk = block; + map.m_len = 1; + err = ext4_map_blocks(handle, inode, &map, + create ? EXT4_GET_BLOCKS_CREATE : 0); + + if (err == 0) + return create ? ERR_PTR(-ENOSPC) : NULL; + if (err < 0) + return ERR_PTR(err); + + bh = sb_getblk(inode->i_sb, map.m_pblk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + if (map.m_flags & EXT4_MAP_NEW) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. + */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (unlikely(err)) { + unlock_buffer(bh); + goto errout; + } + if (!buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto errout; + } else + BUFFER_TRACE(bh, "not a new buffer"); + return bh; +errout: + brelse(bh); + return ERR_PTR(err); +} + +struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create) +{ + struct buffer_head *bh; + + bh = ext4_getblk(handle, inode, block, create); + if (IS_ERR(bh)) + return bh; + if (!bh || buffer_uptodate(bh)) + return bh; + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + return ERR_PTR(-EIO); +} + +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the commit_write(). So doing the jbd2_journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext4_writepage(). In that case, we + * *know* that ext4_writepage() has generated enough buffer credits to do the + * whole page. So we won't block on the journal in that case, which is good, + * because the caller may be PF_MEMALLOC. + * + * By accident, ext4 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that jbd2_journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + int dirty = buffer_dirty(bh); + int ret; + + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + /* + * __block_write_begin() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_write_begin() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + BUFFER_TRACE(bh, "get write access"); + ret = ext4_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + return ret; +} + +static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct inode *inode = page->mapping->host; + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + bool decrypt = false; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + bbits = ilog2(blocksize); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + continue; + } + if (block_end > to || block_start < from) + zero_user_segments(page, to, block_end, + block_start, from); + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++ = bh; + decrypt = ext4_encrypted_inode(inode) && + S_ISREG(inode->i_mode); + } + } + /* + * If we issued read requests, let them complete. + */ + while (wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + err = -EIO; + } + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + else if (decrypt) + err = ext4_decrypt_one(inode, page); + return err; +} +#endif + +static int ext4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret, needed_blocks; + handle_t *handle; + int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + + trace_ext4_write_begin(inode, pos, len, flags); + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason + */ + needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, + flags, pagep); + if (ret < 0) + return ret; + if (ret == 1) + return 0; + } + + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + unlock_page(page); + +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + page_cache_release(page); + return PTR_ERR(handle); + } + + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); + ext4_journal_stop(handle); + goto retry_grab; + } + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_should_dioread_nolock(inode)) + ret = ext4_block_write_begin(page, pos, len, + ext4_get_block_write); + else + ret = ext4_block_write_begin(page, pos, len, + ext4_get_block); +#else + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, pos, len, ext4_get_block_write); + else + ret = __block_write_begin(page, pos, len, ext4_get_block); +#endif + if (!ret && ext4_should_journal_data(inode)) { + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + /* + * __block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before + * truncate finishes + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + + ext4_journal_stop(handle); + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + page_cache_release(page); + return ret; + } + *pagep = page; + return ret; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) +{ + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + clear_buffer_meta(bh); + clear_buffer_prio(bh); + return ret; +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + int ret = 0, ret2; + int i_size_changed = 0; + + trace_ext4_write_end(inode, pos, len, copied); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + unlock_page(page); + page_cache_release(page); + goto errout; + } + } + + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) + goto errout; + copied = ret; + } else + copied = block_write_end(file, mapping, pos, + len, copied, page, fsdata); + /* + * it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + i_size_changed = ext4_update_inode_size(inode, pos + copied); + unlock_page(page); + page_cache_release(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + ext4_mark_inode_dirty(handle, inode); + + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); +errout: + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? ret : copied; +} + +static int ext4_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + int size_changed = 0; + + trace_ext4_journalled_write_end(inode, pos, len, copied); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + BUG_ON(!ext4_handle_valid(handle)); + + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else { + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } + + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + } + size_changed = ext4_update_inode_size(inode, pos + copied); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + unlock_page(page); + page_cache_release(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + + if (size_changed) { + ret2 = ext4_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? ret : copied; +} + +/* + * Reserve a single cluster located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + int ret; + + /* + * We will charge metadata quota at writeout time; this saves + * us from metadata over-estimation, though we may go over by + * a small amount in the end. Here we just reserve for data. + */ + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + if (ret) + return ret; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + md_needed = 0; + trace_ext4_da_reserve_space(inode, 0); + + if (ext4_claim_free_clusters(sbi, 1, 0)) { + spin_unlock(&ei->i_block_reservation_lock); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + return -ENOSPC; + } + ei->i_reserved_data_blocks++; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +static void ext4_da_release_space(struct inode *inode, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + trace_ext4_da_release_space(inode, to_free); + if (unlikely(to_free > ei->i_reserved_data_blocks)) { + /* + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. + */ + ext4_warning(inode->i_sb, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; + } + ei->i_reserved_data_blocks -= to_free; + + /* update fs dirty data blocks counter */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned int offset, + unsigned int length) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + struct inode *inode = page->mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int stop = offset + length; + int num_clusters; + ext4_fsblk_t lblk; + + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if (next_off > stop) + break; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + + if (to_release) { + lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, lblk, to_release); + } + + /* If we have released all the blocks belonging to a cluster, then we + * need to release the reserved space for that cluster. */ + num_clusters = EXT4_NUM_B2C(sbi, to_release); + while (num_clusters > 0) { + lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + + ((num_clusters - 1) << sbi->s_cluster_bits); + if (sbi->s_cluster_ratio == 1 || + !ext4_find_delalloc_cluster(inode, lblk)) + ext4_da_release_space(inode, 1); + + num_clusters--; + } +} + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct writeback_control *wbc; + + pgoff_t first_page; /* The first page to write */ + pgoff_t next_page; /* Current page to examine */ + pgoff_t last_page; /* Last page to examine */ + /* + * Extent to map - this can be after first_page because that can be + * fully mapped. We somewhat abuse m_flags to store whether the extent + * is delalloc or unwritten. + */ + struct ext4_map_blocks map; + struct ext4_io_submit io_submit; /* IO submission data */ +}; + +static void mpage_release_unused_pages(struct mpage_da_data *mpd, + bool invalidate) +{ + int nr_pages, i; + pgoff_t index, end; + struct pagevec pvec; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + /* This is necessary when next_page == 0. */ + if (mpd->first_page >= mpd->next_page) + return; + + index = mpd->first_page; + end = mpd->next_page - 1; + if (invalidate) { + ext4_lblk_t start, last; + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + } + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + if (page->index > end) + break; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + if (invalidate) { + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + ClearPageUptodate(page); + } + unlock_page(page); + } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); + } +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + + ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", + EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(sb))); + ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); + ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(sb), + percpu_counter_sum(&sbi->s_freeclusters_counter))); + ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(sb), + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); + ext4_msg(sb, KERN_CRIT, "Block reservation details"); + ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", + ei->i_reserved_data_blocks); + return; +} + +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) +{ + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); +} + +/* + * This function is grabs code from the very beginning of + * ext4_map_blocks, but assumes that the caller is from delayed write + * time. This function looks up the requested blocks and sets the + * buffer delay bit under the protection of i_data_sem. + */ +static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + struct ext4_map_blocks *map, + struct buffer_head *bh) +{ + struct extent_status es; + int retval; + sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + + map->m_flags = 0; + ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," + "logical block %lu\n", inode->i_ino, map->m_len, + (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read(&EXT4_I(inode)->i_data_sem); + goto add_delayed; + } + + /* + * Delayed extent could be allocated by fallocate. + * So we need to check it. + */ + if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + + map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; + retval = es.es_len - (iblock - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + if (ext4_es_is_written(&es)) + map->m_flags |= EXT4_MAP_MAPPED; + else if (ext4_es_is_unwritten(&es)) + map->m_flags |= EXT4_MAP_UNWRITTEN; + else + BUG_ON(1); + +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif + return retval; + } + + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_has_inline_data(inode)) + retval = 0; + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, 0); + else + retval = ext4_ind_map_blocks(NULL, inode, map, 0); + +add_delayed: + if (retval == 0) { + int ret; + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ + if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || + !ext4_find_delalloc_cluster(inode, map->m_lblk)) { + ret = ext4_da_reserve_space(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } + } + + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + ~0, EXTENT_STATUS_DELAYED); + if (ret) { + retval = ret; + goto out_unlock; + } + + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + } else if (retval > 0) { + int ret; + unsigned int status; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret != 0) + retval = ret; + } + +out_unlock: + up_read((&EXT4_I(inode)->i_data_sem)); + + return retval; +} + +/* + * This is a special get_block_t callback which is used by + * ext4_da_write_begin(). It will either return mapped block or + * reserve space for a single block. + * + * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. + * We also have b_blocknr = -1 and b_bdev initialized properly + * + * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. + * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev + * initialized properly. + */ +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct ext4_map_blocks map; + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + + map.m_lblk = iblock; + map.m_len = 1; + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_da_map_blocks(inode, iblock, &map, bh); + if (ret <= 0) + return ret; + + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + + if (buffer_unwritten(bh)) { + /* A delayed write to unwritten bh should be marked + * new and mapped. Mapped ensures that we don't do + * get_block multiple times when we write to the same + * offset and new ensures that we do proper zero out + * for partial write. + */ + set_buffer_new(bh); + set_buffer_mapped(bh); + } + return 0; +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + unsigned int len) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs = NULL; + handle_t *handle = NULL; + int ret = 0, err = 0; + int inline_data = ext4_has_inline_data(inode); + struct buffer_head *inode_bh = NULL; + + ClearPageChecked(page); + + if (inline_data) { + BUG_ON(page->index != 0); + BUG_ON(len > ext4_get_max_inline_size(inode)); + inode_bh = ext4_journalled_write_inline_data(inode, len, page); + if (inode_bh == NULL) + goto out; + } else { + page_bufs = page_buffers(page); + if (!page_bufs) { + BUG(); + goto out; + } + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bget_one); + } + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + BUG_ON(!ext4_handle_valid(handle)); + + if (inline_data) { + BUFFER_TRACE(inode_bh, "get write access"); + ret = ext4_journal_get_write_access(handle, inode_bh); + + err = ext4_handle_dirty_metadata(handle, inode, inode_bh); + + } else { + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + } + if (ret == 0) + ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + if (!ext4_has_inline_data(inode)) + ext4_walk_page_buffers(NULL, page_bufs, 0, len, + NULL, bput_one); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); +out: + brelse(inode_bh); + return ret; +} + +/* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), no one guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * + * This function can get called via... + * - ext4_writepages after taking page lock (have journal handle) + * - journal_submit_inode_data_buffers (no journal handle) + * - shrink_page_list via the kswapd/direct reclaim (no journal handle) + * - grab_page_cache when doing write_begin (have journal handle) + * + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other buffer_heads would be unmapped but dirty (dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. + * + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. + * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. + */ +static int ext4_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0; + loff_t size; + unsigned int len; + struct buffer_head *page_bufs = NULL; + struct inode *inode = page->mapping->host; + struct ext4_io_submit io_submit; + bool keep_towrite = false; + + trace_ext4_writepage(page); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + page_bufs = page_buffers(page); + /* + * We cannot do block allocation or other extent handling in this + * function. If there are buffers needing that, we have to redirty + * the page. But we may reach here when we do a journal commit via + * journal_submit_inode_data_buffers() and in that case we must write + * allocated buffers to achieve data=ordered mode guarantees. + */ + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { + redirty_page_for_writepage(wbc, page); + if (current->flags & PF_MEMALLOC) { + /* + * For memory cleaning there's no point in writing only + * some buffers. So just bail out. Warn if we came here + * from direct reclaim. + */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) + == PF_MEMALLOC); + unlock_page(page); + return 0; + } + keep_towrite = true; + } + + if (PageChecked(page) && ext4_should_journal_data(inode)) + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + return __ext4_journalled_writepage(page, len); + + ext4_io_submit_init(&io_submit, wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return -ENOMEM; + } + ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); + ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); + return ret; +} + +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) + +/* + * mballoc gives us at most this number of blocks... + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). + * The rest of mballoc seems to handle chunks up to full group size. + */ +#define MAX_WRITEPAGES_EXTENT_LEN 2048 + +/* + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map + * + * @mpd - extent of blocks + * @lblk - logical number of the block in the file + * @bh - buffer head we want to add to the extent + * + * The function is used to collect contig. blocks in the same state. If the + * buffer doesn't require mapping for writeback and we haven't started the + * extent of buffers to map yet, the function returns 'true' immediately - the + * caller can write the buffer right away. Otherwise the function returns true + * if the block has been added to the extent, false if the block couldn't be + * added. + */ +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + struct buffer_head *bh) +{ + struct ext4_map_blocks *map = &mpd->map; + + /* Buffer that doesn't need mapping for writeback? */ + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh))) { + /* So far no extent to map => we write the buffer right away */ + if (map->m_len == 0) + return true; + return false; + } + + /* First block in the extent? */ + if (map->m_len == 0) { + map->m_lblk = lblk; + map->m_len = 1; + map->m_flags = bh->b_state & BH_FLAGS; + return true; + } + + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return false; + + /* Can we merge the block to our big extent? */ + if (lblk == map->m_lblk + map->m_len && + (bh->b_state & BH_FLAGS) == map->m_flags) { + map->m_len++; + return true; + } + return false; +} + +/* + * mpage_process_page_bufs - submit page buffers for IO or add them to extent + * + * @mpd - extent of blocks for mapping + * @head - the first buffer in the page + * @bh - buffer we should start processing from + * @lblk - logical number of the block in the file corresponding to @bh + * + * Walk through page buffers from @bh upto @head (exclusive) and either submit + * the page for IO if all buffers in this page were mapped and there's no + * accumulated extent of buffers to map or add buffers in the page to the + * extent of buffers to map. The function returns 1 if the caller can continue + * by processing the next page, 0 if it should stop adding buffers to the + * extent to map because we cannot extend it anymore. It can also return value + * < 0 in case of error during IO submission. + */ +static int mpage_process_page_bufs(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) +{ + struct inode *inode = mpd->inode; + int err; + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + >> inode->i_blkbits; + + do { + BUG_ON(buffer_locked(bh)); + + if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { + /* Found extent to map? */ + if (mpd->map.m_len) + return 0; + /* Everything mapped so far and we hit EOF */ + break; + } + } while (lblk++, (bh = bh->b_this_page) != head); + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; + } + return lblk < blocks; +} + +/* + * mpage_map_buffers - update buffers corresponding to changed extent and + * submit fully mapped pages for IO + * + * @mpd - description of extent to map, on return next extent to map + * + * Scan buffers corresponding to changed extent (we expect corresponding pages + * to be already locked) and update buffer state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits, + * and mark buffers as uninit when we perform writes to unwritten extents + * and do extent conversion after IO is finished. If the last page is not fully + * mapped, we update @map to the next extent in the last page that needs + * mapping. Otherwise we submit the page for IO. + */ +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) +{ + struct pagevec pvec; + int nr_pages, i; + struct inode *inode = mpd->inode; + struct buffer_head *head, *bh; + int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; + pgoff_t start, end; + ext4_lblk_t lblk; + sector_t pblock; + int err; + + start = mpd->map.m_lblk >> bpp_bits; + end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + lblk = start << bpp_bits; + pblock = mpd->map.m_pblk; + + pagevec_init(&pvec, 0); + while (start <= end) { + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, + PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + /* Up to 'end' pages must be contiguous */ + BUG_ON(page->index != start); + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + /* + * FIXME: If dioread_nolock supports + * blocksize < pagesize, we need to make + * sure we add size mapped so far to + * io_end->size as the following call + * can submit the page for IO. + */ + err = mpage_process_page_bufs(mpd, head, + bh, lblk); + pagevec_release(&pvec); + if (err > 0) + err = 0; + return err; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + } while (lblk++, (bh = bh->b_this_page) != head); + + /* + * FIXME: This is going to break if dioread_nolock + * supports blocksize < pagesize as we will try to + * convert potentially unmapped parts of inode. + */ + mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; + /* Page fully mapped - let IO run! */ + err = mpage_submit_page(mpd, page); + if (err < 0) { + pagevec_release(&pvec); + return err; + } + start++; + } + pagevec_release(&pvec); + } + /* Extent fully mapped and matches with page boundary. We are done. */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + return 0; +} + +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int get_blocks_flags; + int err, dioread_nolock; + + trace_ext4_da_write_pages_extent(inode, map); + /* + * Call ext4_map_blocks() to allocate any delayed allocation blocks, or + * to convert an unwritten extent to be initialized (in the case + * where we have written into one or more preallocated blocks). It is + * possible that we're going to need more metadata blocks than + * previously reserved. However we must not fail because we're in + * writeback and there is nothing we can do about it so it might result + * in data loss. So use reserved blocks to allocate metadata if + * possible. + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if + * the blocks in question are delalloc blocks. This indicates + * that the blocks and quotas has already been checked when + * the data was copied into the page cache. + */ + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; + dioread_nolock = ext4_should_dioread_nolock(inode); + if (dioread_nolock) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (map->m_flags & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + err = ext4_map_blocks(handle, inode, map, get_blocks_flags); + if (err < 0) + return err; + if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { + if (!mpd->io_submit.io_end->handle && + ext4_handle_valid(handle)) { + mpd->io_submit.io_end->handle = handle->h_rsv_handle; + handle->h_rsv_handle = NULL; + } + ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + } + + BUG_ON(map->m_len == 0); + if (map->m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map->m_len; i++) + unmap_underlying_metadata(bdev, map->m_pblk + i); + } + return 0; +} + +/* + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length + * mpd->len and submit pages underlying it for IO + * + * @handle - handle for journal operations + * @mpd - extent to map + * @give_up_on_write - we set this to true iff there is a fatal error and there + * is no hope of writing the data. The caller should discard + * dirty pages to avoid infinite loops. + * + * The function maps extent starting at mpd->lblk of length mpd->len. If it is + * delayed, blocks are allocated, if it is unwritten, we may need to convert + * them to initialized or split the described range from larger unwritten + * extent. Note that we need not map all the described range since allocation + * can return less blocks or the range is covered by more unwritten extents. We + * cannot map more because we are limited by reserved transaction credits. On + * the other hand we always make sure that the last touched page is fully + * mapped so that it can be written out (and thus forward progress is + * guaranteed). After mapping we submit all mapped pages for IO. + */ +static int mpage_map_and_submit_extent(handle_t *handle, + struct mpage_da_data *mpd, + bool *give_up_on_write) +{ + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int err; + loff_t disksize; + int progress = 0; + + mpd->io_submit.io_end->offset = + ((loff_t)map->m_lblk) << inode->i_blkbits; + do { + err = mpage_map_one_extent(handle, mpd); + if (err < 0) { + struct super_block *sb = inode->i_sb; + + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + goto invalidate_dirty_pages; + /* + * Let the uper layers retry transient errors. + * In the case of ENOSPC, if ext4_count_free_blocks() + * is non-zero, a commit should free up blocks. + */ + if ((err == -ENOMEM) || + (err == -ENOSPC && ext4_count_free_clusters(sb))) { + if (progress) + goto update_disksize; + return err; + } + ext4_msg(sb, KERN_CRIT, + "Delayed block allocation failed for " + "inode %lu at logical offset %llu with" + " max blocks %u with error %d", + inode->i_ino, + (unsigned long long)map->m_lblk, + (unsigned)map->m_len, -err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will " + "be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(inode); + invalidate_dirty_pages: + *give_up_on_write = true; + return err; + } + progress = 1; + /* + * Update buffer state, submit mapped pages, and get us new + * extent to map + */ + err = mpage_map_and_submit_buffers(mpd); + if (err < 0) + goto update_disksize; + } while (map->m_len); + +update_disksize: + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ + disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; + if (disksize > EXT4_I(inode)->i_disksize) { + int err2; + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (err2) + ext4_error(inode->i_sb, + "Failed to mark inode %lu dirty", + inode->i_ino); + if (!err) + err = err2; + } + return err; +} + +/* + * Calculate the total number of credits to reserve for one writepages + * iteration. This is called from ext4_writepages(). We map an extent of + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + + * bpp - 1 blocks in bpp different extents. + */ +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + + return ext4_meta_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); +} + +/* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * and underlying extent to map + * + * @mpd - where to look for pages + * + * Walk dirty pages in the mapping. If they are fully mapped, submit them for + * IO immediately. When we find a page which isn't mapped we start accumulating + * extent of buffers underlying these pages that needs mapping (formed by + * either delayed or unwritten buffers). We also lock the pages containing + * these buffers. The extent found is returned in @mpd structure (starting at + * mpd->lblk with length mpd->len blocks). + * + * Note that this function can attach bios to one io_end structure which are + * neither logically nor physically contiguous. Although it may seem as an + * unnecessary complication, it is actually inevitable in blocksize < pagesize + * case as we need to track IO to all buffers underlying a page in one io_end. + */ +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct pagevec pvec; + unsigned int nr_pages; + long left = mpd->wbc->nr_to_write; + pgoff_t index = mpd->first_page; + pgoff_t end = mpd->last_page; + int tag; + int i, err = 0; + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + + pagevec_init(&pvec, 0); + mpd->map.m_len = 0; + mpd->next_page = index; + while (index <= end) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + goto out; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) + goto out; + + /* + * Accumulated enough dirty pages? This doesn't apply + * to WB_SYNC_ALL mode. For integrity sync we have to + * keep going because someone may be concurrently + * dirtying pages, and we might have synced a lot of + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. + */ + if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + goto out; + + /* If we can't merge this page, we are done. */ + if (mpd->map.m_len > 0 && mpd->next_page != page->index) + goto out; + + lock_page(page); + /* + * If the page is no longer dirty, or its mapping no + * longer corresponds to inode we are writing (which + * means it has been truncated or invalidated), or the + * page is already under writeback and we are not doing + * a data integrity writeback, skip the page + */ + if (!PageDirty(page) || + (PageWriteback(page) && + (mpd->wbc->sync_mode == WB_SYNC_NONE)) || + unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + wait_on_page_writeback(page); + BUG_ON(PageWriteback(page)); + + if (mpd->map.m_len == 0) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_CACHE_SHIFT - blkbits); + head = page_buffers(page); + err = mpage_process_page_bufs(mpd, head, head, lblk); + if (err <= 0) + goto out; + err = 0; + left--; + } + pagevec_release(&pvec); + cond_resched(); + } + return 0; +out: + pagevec_release(&pvec); + return err; +} + +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = ext4_writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t writeback_index = 0; + long nr_to_write = wbc->nr_to_write; + int range_whole = 0; + int cycled = 1; + handle_t *handle = NULL; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int needed_blocks, rsv_blocks = 0, ret = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + bool done; + struct blk_plug plug; + bool give_up_on_write = false; + + trace_ext4_writepages(inode, wbc); + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + goto out_writepages; + + if (ext4_should_journal_data(inode)) { + struct blk_plug plug; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + goto out_writepages; + } + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. + */ + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ret = -EROFS; + goto out_writepages; + } + + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. + */ + rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); + } + + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + /* Just inode will be modified... */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + ext4_journal_stop(handle); + } + + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; + if (writeback_index) + cycled = 0; + mpd.first_page = writeback_index; + mpd.last_page = -1; + } else { + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; + } + + mpd.inode = inode; + mpd.wbc = wbc; + ext4_io_submit_init(&mpd.io_submit, wbc); +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + done = false; + blk_start_plug(&plug); + while (!done && mpd.first_page <= mpd.last_page) { + /* For each extent of pages we use new io_end */ + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + break; + } + + /* + * We have two constraints: We find one extent to map and we + * must always write out whole page (makes a difference when + * blocksize < pagesize) so that we don't block on IO when we + * try to write out the rest of the page. Journalled mode is + * not supported by delalloc. + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + + /* start a new transaction */ + handle = ext4_journal_start_with_reserve(inode, + EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " + "%ld pages, ino %lu; err %d", __func__, + wbc->nr_to_write, inode->i_ino, ret); + /* Release allocated io_end */ + ext4_put_io_end(mpd.io_submit.io_end); + break; + } + + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); + ret = mpage_prepare_extent_to_map(&mpd); + if (!ret) { + if (mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, + &give_up_on_write); + else { + /* + * We scanned the whole range (or exhausted + * nr_to_write), submitted what was mapped and + * didn't find anything needing mapping. We are + * done. + */ + done = true; + } + } + ext4_journal_stop(handle); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, give_up_on_write); + /* Drop our io_end reference we got from init */ + ext4_put_io_end(mpd.io_submit.io_end); + + if (ret == -ENOSPC && sbi->s_journal) { + /* + * Commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + ret = 0; + continue; + } + /* Fatal error - ENOMEM, EIO... */ + if (ret) + break; + } + blk_finish_plug(&plug); + if (!ret && !cycled && wbc->nr_to_write > 0) { + cycled = 1; + mpd.last_page = writeback_index - 1; + mpd.first_page = 0; + goto retry; + } + + /* Update index */ + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * Set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = mpd.first_page; + +out_writepages: + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); + return ret; +} + +static int ext4_nonda_switch(struct super_block *sb) +{ + s64 free_clusters, dirty_clusters; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with percpu_counter_batch getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. So switch + * to non delalloc when we are near to error range. + */ + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + /* + * Start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) + try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); + + if (2 * free_clusters < 3 * dirty_clusters || + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { + /* + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark + */ + return 1; + } + return 0; +} + +/* We always reserve for an inode update; the superblock could be there too */ +static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) +{ + if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) + return 1; + + if (pos + len <= 0x7fffffffULL) + return 1; + + /* We might need to update the superblock to set LARGE_FILE */ + return 2; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; + trace_ext4_da_write_begin(inode, pos, len, flags); + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_da_write_inline_data_begin(mapping, inode, + pos, len, flags, + pagep, fsdata); + if (ret < 0) + return ret; + if (ret == 1) + return 0; + } + + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + unlock_page(page); + + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. + */ +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_da_write_credits(inode, pos, len)); + if (IS_ERR(handle)) { + page_cache_release(page); + return PTR_ERR(handle); + } + + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); + ext4_journal_stop(handle); + goto retry_grab; + } + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ret = ext4_block_write_begin(page, pos, len, + ext4_da_get_block_prep); +#else + ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); +#endif + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + ext4_truncate_failed_write(inode); + + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + + page_cache_release(page); + return ret; + } + + *pagep = page; + return ret; +} + +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) + return ext4_write_end(file, mapping, pos, + len, copied, page, fsdata); + + trace_ext4_da_write_end(inode, pos, len, copied); + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + new_i_size = pos + copied; + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_has_inline_data(inode) || + ext4_da_should_update_i_disksize(page, end)) { + ext4_update_i_disksize(inode, new_i_size); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + } + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, + page); + else + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset, length); + +out: + ext4_invalidatepage(page, offset, length); + + return; +} + +/* + * Force all delayed allocation blocks to be allocated for a given inode. + */ +int ext4_alloc_da_blocks(struct inode *inode) +{ + trace_ext4_alloc_da_blocks(inode); + + if (!EXT4_I(inode)->i_reserved_data_blocks) + return 0; + + /* + * We do something simple for now. The filemap_flush() will + * also start triggering a write of the data blocks, which is + * not strictly speaking necessary (and for users of + * laptop_mode, not even desirable). However, to do otherwise + * would require replicating code paths in: + * + * ext4_writepages() -> + * write_cache_pages() ---> (via passed in callback function) + * __mpage_da_writepage() --> + * mpage_add_bh_to_extent() + * mpage_da_map_blocks() + * + * The problem is that write_cache_pages(), located in + * mm/page-writeback.c, marks pages clean in preparation for + * doing I/O, which is not desirable if we're not planning on + * doing I/O at all. + * + * We could call write_cache_pages(), and then redirty all of + * the pages by calling redirty_page_for_writepage() but that + * would be ugly in the extreme. So instead we would need to + * replicate parts of the code in the above functions, + * simplifying them because we wouldn't actually intend to + * write out the pages, but rather only collect contiguous + * logical block extents, call the multi-block allocator, and + * then update the buffer heads with the block allocations. + * + * For now, though, we'll cheat by calling filemap_flush(), + * which will map the blocks, and start the I/O, but not + * actually wait for the I/O to complete. + */ + return filemap_flush(inode->i_mapping); +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext4 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + /* + * We can get here for an inline file via the FIBMAP ioctl + */ + if (ext4_has_inline_data(inode)) + return 0; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } + + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT4_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. + */ + + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); + journal = EXT4_JOURNAL(inode); + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping, block, ext4_get_block); +} + +static int ext4_readpage(struct file *file, struct page *page) +{ + int ret = -EAGAIN; + struct inode *inode = page->mapping->host; + + trace_ext4_readpage(page); + + if (ext4_has_inline_data(inode)) + ret = ext4_readpage_inline(inode, page); + + if (ret == -EAGAIN) + return ext4_mpage_readpages(page->mapping, NULL, page, 1); + + return ret; +} + +static int +ext4_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + + /* If the file has inline data, no need to do readpages. */ + if (ext4_has_inline_data(inode)) + return 0; + + return ext4_mpage_readpages(mapping, pages, NULL, nr_pages); +} + +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + trace_ext4_invalidatepage(page, offset, length); + + /* No journalling happens on data buffers when this function is used */ + WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); + + block_invalidatepage(page, offset, length); +} + +static int __ext4_journalled_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_journalled_invalidatepage(page, offset, length); + + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0 && length == PAGE_CACHE_SIZE) + ClearPageChecked(page); + + return jbd2_journal_invalidatepage(journal, page, offset, length); +} + +/* Wrapper for aops... */ +static void ext4_journalled_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) +{ + WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); +} + +static int ext4_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_releasepage(page); + + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, wait); + else + return try_to_free_buffers(page); +} + +/* + * ext4_get_block used when preparing for a DIO write or buffer write. + * We allocate an uinitialized extent if blocks haven't been allocated. + * The extent will be converted to initialized after the IO is complete. + */ +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_NO_LOCK); +} + +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private) +{ + ext4_io_end_t *io_end = iocb->private; + + /* if not async direct IO just return */ + if (!io_end) + return; + + ext_debug("ext4_end_io_dio(): io_end 0x%p " + "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", + iocb->private, io_end->inode->i_ino, iocb, offset, + size); + + iocb->private = NULL; + io_end->offset = offset; + io_end->size = size; + ext4_put_io_end(io_end); +} + +/* + * For ext4 extent files, ext4 will do direct-io write to holes, + * preallocated extents, and those write extend the file, no need to + * fall back to buffered IO. + * + * For holes, we fallocate those blocks, mark them as unwritten + * If those blocks were preallocated, we mark sure they are split, but + * still keep the range to write as unwritten. + * + * The unwritten extents will be converted to written when DIO is completed. + * For async direct IO, since the IO may still pending when return, we + * set up an end_io call back function, which will do the conversion + * when async direct IO completed. + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + */ +static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + size_t count = iov_iter_count(iter); + int overwrite = 0; + get_block_t *get_block_func = NULL; + int dio_flags = 0; + loff_t final_size = offset + count; + ext4_io_end_t *io_end = NULL; + + /* Use the old path for reads and writes beyond i_size. */ + if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) + return ext4_ind_direct_IO(iocb, iter, offset); + + BUG_ON(iocb->private == NULL); + + /* + * Make all waiters for direct IO properly wait also for extent + * conversion. This also disallows race between truncate() and + * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + */ + if (iov_iter_rw(iter) == WRITE) + inode_dio_begin(inode); + + /* If we do a overwrite dio, i_mutex locking can be released */ + overwrite = *((int *)iocb->private); + + if (overwrite) { + down_read(&EXT4_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); + } + + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as + * unwritten to prevent parallel buffered read to expose + * the stale data before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block will + * just simply mark the buffer mapped but still keep the + * extents unwritten. + * + * For non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * For async DIO, the conversion needs to be deferred when the + * IO is completed. The ext4 end_io callback function will be + * called to take care of the conversion work. Here for async + * case, we allocate an io_end structure to hook to the iocb. + */ + iocb->private = NULL; + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; + } + /* + * Grab reference for DIO. Will be dropped in ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); + /* + * we save the io structure for current async direct + * IO, so that later ext4_map_blocks() could flag the + * io structure whether there is a unwritten extents + * needs to be converted when IO is completed. + */ + ext4_inode_aio_set(inode, io_end); + } + + if (overwrite) { + get_block_func = ext4_get_block_write_nolock; + } else { + get_block_func = ext4_get_block_write; + dio_flags = DIO_LOCKING; + } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); +#endif + if (IS_DAX(inode)) + ret = dax_do_io(iocb, inode, iter, offset, get_block_func, + ext4_end_io_dio, dio_flags); + else + ret = __blockdev_direct_IO(iocb, inode, + inode->i_sb->s_bdev, iter, offset, + get_block_func, + ext4_end_io_dio, NULL, dio_flags); + + /* + * Put our reference to io_end. This can free the io_end structure e.g. + * in sync IO case or in case of error. It can even perform extent + * conversion if all bios we submitted finished before we got here. + * Note that in that case iocb->private can be already set to NULL + * here. + */ + if (io_end) { + ext4_inode_aio_set(inode, NULL); + ext4_put_io_end(io_end); + /* + * When no IO was submitted ext4_end_io_dio() was not + * called so we have to put iocb's reference. + */ + if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { + WARN_ON(iocb->private != io_end); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + ext4_put_io_end(io_end); + iocb->private = NULL; + } + } + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(NULL, inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } + +retake_lock: + if (iov_iter_rw(iter) == WRITE) + inode_dio_end(inode); + /* take i_mutex locking again if we do a ovewrite dio */ + if (overwrite) { + up_read(&EXT4_I(inode)->i_data_sem); + mutex_lock(&inode->i_mutex); + } + + return ret; +} + +static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; +#endif + + /* + * If we are doing data journalling we don't support O_DIRECT + */ + if (ext4_should_journal_data(inode)) + return 0; + + /* Let buffer I/O handle the inline data case. */ + if (ext4_has_inline_data(inode)) + return 0; + + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_direct_IO(iocb, iter, offset); + else + ret = ext4_ind_direct_IO(iocb, iter, offset); + trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext4's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. + */ +static int ext4_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext4_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_writepages, + .write_begin = ext4_write_begin, + .write_end = ext4_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_journalled_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_writepages, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_journalled_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_writepages, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void ext4_set_aops(struct inode *inode) +{ + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); + break; + case EXT4_INODE_JOURNAL_DATA_MODE: + inode->i_mapping->a_ops = &ext4_journalled_aops; + return; + default: + BUG(); + } + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_aops; +} + +static int __ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, pos; + ext4_lblk_t iblock; + struct inode *inode = mapping->host; + struct buffer_head *bh; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -ENOMEM; + + blocksize = inode->i_sb->s_blocksize; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + if (S_ISREG(inode->i_mode) && + ext4_encrypted_inode(inode)) { + /* We expect the key to be set. */ + BUG_ON(!ext4_has_encryption_key(inode)); + BUG_ON(blocksize != PAGE_CACHE_SIZE); + WARN_ON_ONCE(ext4_decrypt_one(inode, page)); + } + } + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + zero_user(page, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); + + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + err = 0; + mark_buffer_dirty(bh); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) + err = ext4_jbd2_file_inode(handle, inode); + } + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + if (IS_DAX(inode)) + return dax_zero_page_range(inode, from, length, ext4_get_block); + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +static int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + +int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t length) +{ + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned partial_start, partial_end; + ext4_fsblk_t start, end; + loff_t byte_end = (lstart + length - 1); + int err = 0; + + partial_start = lstart & (sb->s_blocksize - 1); + partial_end = byte_end & (sb->s_blocksize - 1); + + start = lstart >> sb->s_blocksize_bits; + end = byte_end >> sb->s_blocksize_bits; + + /* Handle partial zero within the single block */ + if (start == end && + (partial_start || (partial_end != sb->s_blocksize - 1))) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, length); + return err; + } + /* Handle partial zero out on the start of the range */ + if (partial_start) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, sb->s_blocksize); + if (err) + return err; + } + /* Handle partial zero out on the end of the range */ + if (partial_end != sb->s_blocksize - 1) + err = ext4_block_zero_page_range(handle, mapping, + byte_end - partial_end, + partial_end + 1); + return err; +} + +int ext4_can_truncate(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext4_inode_is_fast_symlink(inode); + return 0; +} + +/* + * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * associated with the given offset and length + * + * @inode: File inode + * @offset: The offset where the hole will begin + * @len: The length of the hole + * + * Returns: 0 on success or negative on failure + */ + +int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + loff_t first_block_offset, last_block_offset; + handle_t *handle; + unsigned int credits; + int ret = 0; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_punch_hole(inode, offset, length, 0); + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + length - 1); + if (ret) + return ret; + } + + mutex_lock(&inode->i_mutex); + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + + if (offset & (sb->s_blocksize - 1) || + (offset + length) & (sb->s_blocksize - 1)) { + /* + * Attach jinode to inode for jbd2 if we do any zeroing of + * partial block + */ + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + goto out_mutex; + + } + + first_block_offset = round_up(offset, sb->s_blocksize); + last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + + /* Now release the pages and zero block aligned part of pages*/ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(sb, ret); + goto out_dio; + } + + ret = ext4_zero_partial_blocks(handle, inode, offset, + length); + if (ret) + goto out_stop; + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* If there are no blocks to remove, return now */ + if (first_block >= stop_block) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_ind_remove_space(handle, inode, first_block, + stop_block); + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; +} + +/* + * ext4_truncate() + * + * We block out ext4_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext4_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext4_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext4 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext4_truncate() run will find them and release them. + */ +void ext4_truncate(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int credits; + handle_t *handle; + struct address_space *mapping = inode->i_mapping; + + /* + * There is a possibility that we're either freeing the inode + * or it's a completely new inode. In those cases we might not + * have i_mutex locked because it's not necessary. + */ + if (!(inode->i_state & (I_NEW|I_FREEING))) + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + trace_ext4_truncate_enter(inode); + + if (!ext4_can_truncate(inode)) + return; + + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + ext4_inline_data_truncate(inode, &has_inline); + if (has_inline) + return; + } + + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + if (ext4_inode_attach_jinode(inode) < 0) + return; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + return; + } + + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + /* + * We add the inode to the orphan list, so that if this + * truncate spans multiple transactions, and we crash, we will + * resume the truncate when the filesystem recovers. It also + * marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + + ext4_discard_preallocations(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_ext_truncate(handle, inode); + else + ext4_ind_truncate(handle, inode); + + up_write(&ei->i_data_sem); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_evict_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + trace_ext4_truncate_exit(inode); +} + +/* + * ext4_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext4_get_inode_loc(struct inode *inode, + struct ext4_iloc *iloc, int in_mem) +{ + struct ext4_group_desc *gdp; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + + iloc->bh = NULL; + if (!ext4_valid_inum(sb, inode->i_ino)) + return -EIO; + + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) + return -EIO; + + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + inode_offset = ((inode->i_ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + bh = sb_getblk(sb, block); + if (unlikely(!bh)) + return -ENOMEM; + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + int i, start; + + start = inode_offset & ~(inodes_per_block - 1); + + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (unlikely(!bitmap_bh)) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_block; i++) { + if (i == inode_offset) + continue; + if (ext4_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_block) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; + + table = ext4_inode_table(sb, gdp); + /* s_inode_readahead_blks is always a power of 2 */ + b = block & ~((ext4_fsblk_t) ra_blks - 1); + if (table > b) + b = table; + end = b + ra_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (ext4_has_group_desc_csum(sb)) + num -= ext4_itable_unused_count(sb, gdp); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + sb_breadahead(sb, b++); + } + + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + trace_ext4_load_inode(inode); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. */ + return __ext4_get_inode_loc(inode, iloc, + !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); +} + +void ext4_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; + + if (flags & EXT4_SYNC_FL) + new_fl |= S_SYNC; + if (flags & EXT4_APPEND_FL) + new_fl |= S_APPEND; + if (flags & EXT4_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & EXT4_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + new_fl |= S_DAX; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); +} + +/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ +void ext4_get_inode_flags(struct ext4_inode_info *ei) +{ + unsigned int vfs_fl; + unsigned long old_fl, new_fl; + + do { + vfs_fl = ei->vfs_inode.i_flags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| + EXT4_DIRSYNC_FL); + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) + new_fl |= EXT4_APPEND_FL; + if (vfs_fl & S_IMMUTABLE) + new_fl |= EXT4_IMMUTABLE_FL; + if (vfs_fl & S_NOATIME) + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); +} + +static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + blkcnt_t i_blocks ; + struct inode *inode = &(ei->vfs_inode); + struct super_block *sb = inode->i_sb; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + /* we are using combined 48 bit field */ + i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | + le32_to_cpu(raw_inode->i_blocks_lo); + if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { + /* i_blocks represent file system block size */ + return i_blocks << (inode->i_blkbits - 9); + } else { + return i_blocks; + } + } else { + return le32_to_cpu(raw_inode->i_blocks_lo); + } +} + +static inline void ext4_iget_extra_inode(struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_find_inline_data_nolock(inode); + } else + EXT4_I(inode)->i_inline_off = 0; +} + +struct inode *ext4_iget(struct super_block *sb, unsigned long ino) +{ + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + struct ext4_inode_info *ei; + struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; + long ret; + int block; + uid_t i_uid; + gid_t i_gid; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT4_I(inode); + iloc.bh = NULL; + + ret = __ext4_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { + EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + ret = -EIO; + goto bad_inode; + } + } else + ei->i_extra_isize = 0; + + /* Precompute checksum seed for inode metadata */ + if (ext4_has_metadata_csum(sb)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = raw_inode->i_generation; + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + + if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + EXT4_ERROR_INODE(inode, "checksum invalid"); + ret = -EIO; + goto bad_inode; + } + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt(inode->i_sb, NO_UID32))) { + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_inline_off = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if ((inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { + /* this inode is deleted */ + ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. + * OR it is the EXT4_BOOT_LOADER_INO which is + * not initialized on a new filesystem. */ + } + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + inode->i_blocks = ext4_inode_blocks(raw_inode, ei); + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; + inode->i_size = ext4_isize(raw_inode); + ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + ei->i_last_alloc_group = ~0; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT4_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. */ + ei->i_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + } else { + ext4_iget_extra_inode(inode, raw_inode, ei); + } + } + + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); + + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } + } + + ret = 0; + if (ei->i_file_acl && + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ei->i_file_acl); + ret = -EIO; + goto bad_inode; + } else if (!ext4_has_inline_data(inode)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode)))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_ind_check_inode(inode); + } + } + if (ret) + goto bad_inode; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext4_inode_is_fast_symlink(inode) && + !ext4_encrypted_inode(inode)) { + inode->i_op = &ext4_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext4_symlink_inode_operations; + ext4_set_aops(inode); + } + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &ext4_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else if (ino == EXT4_BOOT_LOADER_INO) { + make_bad_inode(inode); + } else { + ret = -EIO; + EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); + goto bad_inode; + } + brelse(iloc.bh); + ext4_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + brelse(iloc.bh); + iget_failed(inode); + return ERR_PTR(ret); +} + +struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) +{ + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-EIO); + return ext4_iget(sb, ino); +} + +static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + struct inode *inode = &(ei->vfs_inode); + u64 i_blocks = inode->i_blocks; + struct super_block *sb = inode->i_sb; + + if (i_blocks <= ~0U) { + /* + * i_blocks can be represented in a 32 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = 0; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { + /* + * i_blocks can be represented in a 48 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + } else { + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); + /* i_block is stored in file system block size */ + i_blocks = i_blocks >> (inode->i_blkbits - 9); + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + } + return 0; +} + +struct other_inode { + unsigned long orig_ino; + struct ext4_inode *raw_inode; +}; + +static int other_inode_match(struct inode * inode, unsigned long ino, + void *data) +{ + struct other_inode *oi = (struct other_inode *) data; + + if ((inode->i_ino != ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + ((inode->i_state & I_DIRTY_TIME) == 0)) + return 0; + spin_lock(&inode->i_lock); + if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && + (inode->i_state & I_DIRTY_TIME)) { + struct ext4_inode_info *ei = EXT4_I(inode); + + inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + spin_unlock(&inode->i_lock); + + spin_lock(&ei->i_raw_lock); + EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); + ext4_inode_csum_set(inode, oi->raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + trace_ext4_other_inode_update_time(inode, oi->orig_ino); + return -1; + } + spin_unlock(&inode->i_lock); + return -1; +} + +/* + * Opportunistically update the other time fields for other inodes in + * the same inode table block. + */ +static void ext4_update_other_inodes_time(struct super_block *sb, + unsigned long orig_ino, char *buf) +{ + struct other_inode oi; + unsigned long ino; + int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int inode_size = EXT4_INODE_SIZE(sb); + + oi.orig_ino = orig_ino; + ino = (orig_ino & ~(inodes_per_block - 1)) + 1; + for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { + if (ino == orig_ino) + continue; + oi.raw_inode = (struct ext4_inode *) buf; + (void) find_inode_nowait(sb, ino, other_inode_match, &oi); + } +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; + int err = 0, rc, block; + int need_datasync = 0, set_large_file = 0; + uid_t i_uid; + gid_t i_gid; + + spin_lock(&ei->i_raw_lock); + + /* For fields not tracked in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + ext4_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + err = ext4_inode_blocks_set(handle, raw_inode, ei); + if (err) { + spin_unlock(&ei->i_raw_lock); + goto out_brelse; + } + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + if (ei->i_disksize != ext4_isize(raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } + if (ei->i_disksize > 0x7fffffffULL) { + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT4_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else if (!ext4_has_inline_data(inode)) { + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + } + + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } + } + ext4_inode_csum_set(inode, raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + if (inode->i_sb->s_flags & MS_LAZYTIME) + ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, + bh->b_data); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); + if (!err) + err = rc; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + if (set_large_file) { + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_handle_sync(handle); + err = ext4_handle_dirty_super(handle, sb); + } + ext4_update_inode_fsync_trans(handle, inode, need_datasync); +out_brelse: + brelse(bh); + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * ext4_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * transaction to commit. + * + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. + * + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. + */ +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err; + + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + return 0; + + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) + return 0; + + err = ext4_force_commit(inode->i_sb); + } else { + struct ext4_iloc iloc; + + err = __ext4_get_inode_loc(inode, &iloc, 0); + if (err) + return err; + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, + "IO error syncing inode"); + err = -EIO; + } + brelse(iloc.bh); + } + return err; +} + +/* + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate + * buffers that are attached to a page stradding i_size and are undergoing + * commit. In that case we have to wait for commit to finish and try again. + */ +static void ext4_wait_for_tail_page_commit(struct inode *inode) +{ + struct page *page; + unsigned offset; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = 0; + int ret; + + offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* + * All buffers in the last page remain valid? Then there's nothing to + * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == + * blocksize case + */ + if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) + return; + while (1) { + page = find_lock_page(inode->i_mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + ret = __ext4_journalled_invalidatepage(page, offset, + PAGE_CACHE_SIZE - offset); + unlock_page(page); + page_cache_release(page); + if (ret != -EBUSY) + return; + commit_tid = 0; + read_lock(&journal->j_state_lock); + if (journal->j_committing_transaction) + commit_tid = journal->j_committing_transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (commit_tid) + jbd2_log_wait_commit(journal, commit_tid); + } +} + +/* + * ext4_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. + */ +int ext4_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error, rc = 0; + int orphan = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = dquot_transfer(inode, attr); + if (error) { + ext4_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + handle_t *handle; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (attr->ia_size > sbi->s_bitmap_maxbytes) + return -EFBIG; + } + + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); + + if (S_ISREG(inode->i_mode) && + (attr->ia_size < inode->i_size)) { + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) + goto err_out; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } + down_write(&EXT4_I(inode)->i_data_sem); + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + /* + * We have to update i_size under i_data_sem together + * with i_disksize to avoid races with writeback code + * running ext4_wb_update_i_disksize(). + */ + if (!error) + i_size_write(inode, attr->ia_size); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + if (error) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + } else { + loff_t oldsize = inode->i_size; + + i_size_write(inode, attr->ia_size); + pagecache_isize_extended(inode, oldsize, inode->i_size); + } + + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. + */ + if (orphan) { + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); + } + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, inode->i_size); + } + /* + * We want to call ext4_truncate() even if attr->ia_size == + * inode->i_size for cases like truncation of fallocated space + */ + if (attr->ia_valid & ATTR_SIZE) + ext4_truncate(inode); + + if (!rc) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + } + + /* + * If the call to ext4_truncate failed to get a transaction handle at + * all, we need to clean up the in-core orphan list manually. + */ + if (orphan && inode->i_nlink) + ext4_orphan_del(NULL, inode); + + if (!rc && (ia_valid & ATTR_MODE)) + rc = posix_acl_chmod(inode, inode->i_mode); + +err_out: + ext4_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long long delalloc_blocks; + + inode = d_inode(dentry); + generic_fillattr(inode, stat); + + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others doen't incorrectly think the file is completely sparse. + */ + if (unlikely(ext4_has_inline_data(inode))) + stat->blocks += (stat->size + 511) >> 9; + + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), + EXT4_I(inode)->i_reserved_data_blocks); + stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); + return 0; +} + +static int ext4_index_trans_blocks(struct inode *inode, int lblocks, + int pextents) +{ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return ext4_ind_trans_blocks(inode, lblocks); + return ext4_ext_index_trans_blocks(inode, pextents); +} + +/* + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups + * + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiguous, with flexbg, + * they could still across block group boundary. + * + * Also account for superblock, inode, quota and xattr blocks + */ +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents) +{ + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to map @lblocks logical blocks + * to @pextents physical extents? + */ + idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks + pextents; + gdpblocks = groups; + if (groups > ngroups) + groups = ngroups; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calculate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. + * + * This could be called via ext4_write_begin() + * + * We need to consider the worse case, when + * one new block per extent. + */ +int ext4_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + int ret; + + ret = ext4_meta_trans_blocks(inode, bpp, bpp); + + /* Account for data blocks for journalled mode */ + if (ext4_should_journal_data(inode)) + ret += bpp; + return ret; +} + +/* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* + * The caller must have previously called ext4_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext4_iloc *iloc) +{ + int err = 0; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ + err = ext4_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc) +{ + int err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * Expand an inode by new_extra_isize bytes. + * Returns 0 on success or negative error number on failure. + */ +static int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) +{ + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + return 0; + + raw_inode = ext4_raw_inode(&iloc); + + header = IHDR(inode, raw_inode); + + /* No extended attributes present */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, + new_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + return 0; + } + + /* try to expand with EAs present */ + return ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + */ +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; + int err, ret; + + might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + /* + * We need extra buffer credits since we may write into EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if ((jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + ret = ext4_expand_extra_isize(inode, + sbi->s_want_extra_isize, + iloc, handle); + if (ret) { + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); + if (mnt_count != + le16_to_cpu(sbi->s_es->s_mnt_count)) { + ext4_warning(inode->i_sb, + "Unable to expand inode %lu. Delete" + " some EAs or run e2fsck.", + inode->i_ino); + mnt_count = + le16_to_cpu(sbi->s_es->s_mnt_count); + } + } + } + } + if (!err) + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext4_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, dquot_alloc_block() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + * + * If only the I_DIRTY_TIME flag is set, we can skip everything. If + * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need + * to copy into the on-disk inode structure are the timestamp files. + */ +void ext4_dirty_inode(struct inode *inode, int flags) +{ + handle_t *handle; + + if (flags == I_DIRTY_TIME) + return; + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + goto out; + + ext4_mark_inode_dirty(handle, inode); + + ext4_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext4_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static int ext4_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + + int err = 0; + if (handle) { + err = ext4_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = jbd2_journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext4_handle_dirty_metadata(handle, + NULL, + iloc.bh); + brelse(iloc.bh); + } + } + ext4_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext4_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; + if (is_journal_aborted(journal)) + return -EROFS; + /* We have to allocate physical blocks for delalloc blocks + * before flushing journal. otherwise delalloc blocks can not + * be allocated any more. even more truncate on delalloc blocks + * could trigger BUG by flushing delalloc blocks in journal. + * There is no delalloc block in non-journal data mode. + */ + if (val && test_opt(inode->i_sb, DELALLOC)) { + err = ext4_alloc_da_blocks(inode); + if (err < 0) + return err; + } + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + jbd2_journal_lock_updates(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + else { + err = jbd2_journal_flush(journal); + if (err < 0) { + jbd2_journal_unlock_updates(journal); + ext4_inode_resume_unlocked_dio(inode); + return err; + } + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } + ext4_set_aops(inode); + + jbd2_journal_unlock_updates(journal); + ext4_inode_resume_unlocked_dio(inode); + + /* Finally we can mark the inode as dirty. */ + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_mark_inode_dirty(handle, inode); + ext4_handle_sync(handle); + ext4_journal_stop(handle); + ext4_std_error(inode->i_sb, err); + + return err; +} + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + loff_t size; + unsigned long len; + int ret; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + handle_t *handle; + get_block_t *get_block; + int retries = 0; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + /* Delalloc case is easy... */ + if (test_opt(inode->i_sb, DELALLOC) && + !ext4_should_journal_data(inode) && + !ext4_nonda_switch(inode->i_sb)) { + do { + ret = __block_page_mkwrite(vma, vmf, + ext4_da_get_block_prep); + } while (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)); + goto out_ret; + } + + lock_page(page); + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (page->mapping != mapping || page_offset(page) > size) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + /* + * Return if we have all the buffers mapped. This avoids the need to do + * journal_start/journal_stop which can block and take a long time + */ + if (page_has_buffers(page)) { + if (!ext4_walk_page_buffers(NULL, page_buffers(page), + 0, len, NULL, + ext4_bh_unmapped)) { + /* Wait so that we don't change page under IO */ + wait_for_stable_page(page); + ret = VM_FAULT_LOCKED; + goto out; + } + } + unlock_page(page); + /* OK, we need to fill the hole... */ + if (ext4_should_dioread_nolock(inode)) + get_block = ext4_get_block_write; + else + get_block = ext4_get_block; +retry_alloc: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + ret = __block_page_mkwrite(vma, vmf, get_block); + if (!ret && ext4_should_journal_data(inode)) { + if (ext4_walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + ext4_journal_stop(handle); + goto out; + } + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + } + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_alloc; +out_ret: + ret = block_page_mkwrite_return(ret); +out: + sb_end_pagefault(inode->i_sb); + return ret; +} diff --git a/kernel/fs/ext4/ioctl.c b/kernel/fs/ext4/ioctl.c new file mode 100644 index 000000000..2cb9e178d --- /dev/null +++ b/kernel/fs/ext4/ioctl.c @@ -0,0 +1,773 @@ +/* + * linux/fs/ext4/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + +/** + * Swap memory between @a and @b for @len bytes. + * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + unsigned char tmp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + tmp = *ap; + *ap = *bp; + *bp = tmp; + ap++; + bp++; + } +} + +/** + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure, that calling this method twice + * will revert all changes. + * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); + memswap(&inode1->i_version, &inode2->i_version, + sizeof(inode1->i_version)); + memswap(&inode1->i_blocks, &inode2->i_blocks, + sizeof(inode1->i_blocks)); + memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); + memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); + memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +/** + * Swap the information from the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. + * + * @sb: the super block of the filesystem + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei_bl; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + return -EINVAL; + + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + return -EPERM; + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); + ei_bl = EXT4_I(inode_bl); + + filemap_flush(inode->i_mapping); + filemap_flush(inode_bl->i_mapping); + + /* Protect orig inodes against a truncate and make sure, + * that only 1 swap_inode_boot_loader is running. */ + lock_two_nondirectories(inode, inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + ext4_inode_block_unlocked_dio(inode_bl); + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto journal_err_out; + } + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (inode_bl->i_nlink == 0) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_bl->i_version = 1; + i_size_write(inode_bl, 0); + inode_bl->i_mode = S_IFREG; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + swap_inode_data(inode, inode_bl); + + inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + inode_bl->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + } else { + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + } + } + ext4_journal_stop(handle); + ext4_double_up_write_data_sem(inode, inode_bl); + +journal_err_out: + ext4_inode_resume_unlocked_dio(inode); + ext4_inode_resume_unlocked_dio(inode_bl); + unlock_two_nondirectories(inode, inode_bl); + iput(inode_bl); + return err; +} + +static int uuid_is_zero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return 0; + return 1; +} + +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int flags; + + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT4_IOC_GETFLAGS: + ext4_get_inode_flags(ei); + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT4_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext4_mask_flags(inode->i_mode, flags); + + err = -EPERM; + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(filp); + return err; + } + case EXT4_IOC_GETVERSION: + case EXT4_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT4_IOC_SETVERSION: + case EXT4_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext4_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (ext4_has_metadata_csum(inode->i_sb)) { + ext4_warning(sb, "Setting inode version is not " + "supported with metadata_csum enabled."); + return -ENOTTY; + } + + err = mnt_want_write_file(filp); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + mutex_lock(&inode->i_mutex); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto unlock_out; + } + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = ext4_current_time(inode); + inode->i_generation = generation; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } + ext4_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); +setversion_out: + mnt_drop_write_file(filp); + return err; + } + case EXT4_IOC_GROUP_EXTEND: { + ext4_fsblk_t n_blocks_count; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_extend_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto group_extend_out; + + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); +group_extend_out: + ext4_resize_end(sb); + return err; + } + + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + struct fd donor; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; + me.moved_len = 0; + + donor = fdget(me.donor_fd); + if (!donor.file) + return -EBADF; + + if (!(donor.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + err = -EOPNOTSUPP; + goto mext_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto mext_out; + + err = ext4_move_extents(filp, donor.file, me.orig_start, + me.donor_start, me.len, &me.moved_len); + mnt_drop_write_file(filp); + + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) + err = -EFAULT; +mext_out: + fdput(donor); + return err; + } + + case EXT4_IOC_GROUP_ADD: { + struct ext4_new_group_data input; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto group_add_out; + + err = ext4_group_add(sb, &input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input.group); +group_add_out: + ext4_resize_end(sb); + return err; + } + + case EXT4_IOC_MIGRATE: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. + */ + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_SWAP_BOOT: + { + int err; + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + err = mnt_want_write_file(filp); + if (err) + return err; + err = swap_inode_boot_loader(sb, inode); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + int err = 0, err2 = 0; + ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write_file(filp); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); + if (!err && (o_group > EXT4_SB(sb)->s_groups_count) && + ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, o_group); + +resizefs_out: + ext4_resize_end(sb); + return err; + } + + case FITRIM: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + case EXT4_IOC_PRECACHE_EXTENTS: + return ext4_ext_precache(inode); + case EXT4_IOC_SET_ENCRYPTION_POLICY: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_encryption_policy policy; + int err = 0; + + if (copy_from_user(&policy, + (struct ext4_encryption_policy __user *)arg, + sizeof(policy))) { + err = -EFAULT; + goto encryption_policy_out; + } + + err = ext4_process_policy(&policy, inode); +encryption_policy_out: + return err; +#else + return -EOPNOTSUPP; +#endif + } + case EXT4_IOC_GET_ENCRYPTION_PWSALT: { + int err, err2; + struct ext4_sb_info *sbi = EXT4_SB(sb); + handle_t *handle; + + if (!ext4_sb_has_crypto(sb)) + return -EOPNOTSUPP; + if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { + err = mnt_want_write_file(filp); + if (err) + return err; + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto pwsalt_err_exit; + } + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto pwsalt_err_journal; + generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + err = ext4_handle_dirty_metadata(handle, NULL, + sbi->s_sbh); + pwsalt_err_journal: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + pwsalt_err_exit: + mnt_drop_write_file(filp); + if (err) + return err; + } + if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt, + 16)) + return -EFAULT; + return 0; + } + case EXT4_IOC_GET_ENCRYPTION_POLICY: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_encryption_policy policy; + int err = 0; + + if (!ext4_encrypted_inode(inode)) + return -ENOENT; + err = ext4_get_policy(inode, &policy); + if (err) + return err; + if (copy_to_user((void *)arg, &policy, sizeof(policy))) + return -EFAULT; + return 0; +#else + return -EOPNOTSUPP; +#endif + } + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT4_IOC32_GETFLAGS: + cmd = EXT4_IOC_GETFLAGS; + break; + case EXT4_IOC32_SETFLAGS: + cmd = EXT4_IOC_SETFLAGS; + break; + case EXT4_IOC32_GETVERSION: + cmd = EXT4_IOC_GETVERSION; + break; + case EXT4_IOC32_SETVERSION: + cmd = EXT4_IOC_SETVERSION; + break; + case EXT4_IOC32_GROUP_EXTEND: + cmd = EXT4_IOC_GROUP_EXTEND; + break; + case EXT4_IOC32_GETVERSION_OLD: + cmd = EXT4_IOC_GETVERSION_OLD; + break; + case EXT4_IOC32_SETVERSION_OLD: + cmd = EXT4_IOC_SETVERSION_OLD; + break; + case EXT4_IOC32_GETRSVSZ: + cmd = EXT4_IOC_GETRSVSZ; + break; + case EXT4_IOC32_SETRSVSZ: + cmd = EXT4_IOC_SETRSVSZ; + break; + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct ext4_new_group_input input; + mm_segment_t old_fs; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, + (unsigned long) &input); + set_fs(old_fs); + return err; + } + case EXT4_IOC_MOVE_EXT: + case FITRIM: + case EXT4_IOC_RESIZE_FS: + case EXT4_IOC_PRECACHE_EXTENTS: + case EXT4_IOC_SET_ENCRYPTION_POLICY: + case EXT4_IOC_GET_ENCRYPTION_PWSALT: + case EXT4_IOC_GET_ENCRYPTION_POLICY: + break; + default: + return -ENOIOCTLCMD; + } + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/kernel/fs/ext4/mballoc.c b/kernel/fs/ext4/mballoc.c new file mode 100644 index 000000000..8d1e60214 --- /dev/null +++ b/kernel/fs/ext4/mballoc.c @@ -0,0 +1,5238 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include "ext4_jbd2.h" +#include "mballoc.h" +#include +#include +#include +#include + +#ifdef CONFIG_EXT4_DEBUG +ushort ext4_mballoc_debug __read_mostly; + +module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); +MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); +#endif + +/* + * MUSTDO: + * - test ext4_ext_search_left() and ext4_ext_search_right() + * - search for metadata in few groups + * + * TODO v4: + * - normalization should take into account whether file is still open + * - discard preallocations if no free space left (policy?) + * - don't normalize tails + * - quota + * - reservation for superuser + * + * TODO v3: + * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection + * - mb_mark_used() may allocate chunk right after splitting buddy + * - tree of groups sorted by number of free blocks + * - error handling + */ + +/* + * The allocation request involve request for multiple number of blocks + * near to the goal(block) value specified. + * + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, which ever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select to use the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4//mb_stream_req. The value is represented in + * terms of number of blocks. + * + * The main motivation for having small file use group preallocation is to + * ensure that we have small files closer together on the disk. + * + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: + * + * pa_lstart -> the logical start block for this prealloc space + * pa_pstart -> the physical start block for this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) + * + * The inode preallocation space is used looking at the _logical_ start + * block. If only the logical file block falls within the range of prealloc + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks + * + * The important thing to be noted in case of inode prealloc space is that + * we don't modify the values associated to inode prealloc space except + * pa_free. + * + * If we are not able to find blocks in the inode prealloc space and if we + * have the group allocation flag set then we look at the locality group + * prealloc space. These are per CPU prealloc list represented as + * + * ext4_sb_info.s_locality_groups[smp_processor_id()] + * + * The reason for having a per cpu locality group is to reduce the contention + * between CPUs. It is possible to get scheduled at this point. + * + * The locality group prealloc space is used looking at whether we have + * enough free space (pa_free) within the prealloc space. + * + * If we can't allocate blocks via inode prealloc or/and locality group + * prealloc then we look at the buddy cache. The buddy cache is represented + * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets + * mapped to the buddy and bitmap information regarding different + * groups. The buddy information is attached to buddy cache inode so that + * we can access them through the page cache. The information regarding + * each group is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are stored in the + * inode as: + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. So for each group we + * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / + * blocksize) blocks. So it can have information regarding groups_per_page + * which is blocks_per_page/2 + * + * The buddy cache inode is not stored on disk. The inode is thrown + * away when the filesystem is unmounted. + * + * We look for count number of blocks in the buddy cache. If we were able + * to locate that many free blocks we return with additional information + * regarding rest of the contiguous physical block available + * + * Before allocating blocks via buddy cache we normalize the request + * blocks. This ensure we ask for more blocks that we needed. The extra + * blocks that we get after allocation is added to the respective prealloc + * list. In case of inode preallocation we follow a list of heuristics + * based on file size. This can be found in ext4_mb_normalize_request. If + * we are doing a group prealloc we try to normalize the request to + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is + * 512 blocks. This can be tuned via + * /sys/fs/ext4//mb_group_prealloc. The value is represented in + * terms of number of blocks. If we have mounted the file system with -O + * stripe= option the group prealloc request is normalized to the + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. + * + * The regular allocator (using the buddy cache) supports a few tunables. + * + * /sys/fs/ext4//mb_min_to_scan + * /sys/fs/ext4//mb_max_to_scan + * /sys/fs/ext4//mb_order2_req + * + * The regular allocator uses buddy scan only if the request len is power of + * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The + * value of s_mb_order2_reqs can be tuned via + * /sys/fs/ext4//mb_order2_req. If the request len is equal to + * stripe size (sbi->s_stripe), we try to search for contiguous block in + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. + * min_to_scan indicate how long the mballoc __must__ look for a best + * extent and max_to_scan indicates how long the mballoc __can__ look for a + * best extent in the found extents. Searching for the blocks starts with + * the group specified as the goal value in allocation context via + * ac_g_ex. Each group is first checked based on the criteria whether it + * can be used for allocation. ext4_mb_good_group explains how the groups are + * checked. + * + * Both the prealloc space are getting populated as above. So for the first + * request we will hit the buddy cache which will result in this prealloc + * space getting filled. The prealloc space is then later used for the + * subsequent request. + */ + +/* + * mballoc operates on the following data: + * - on-disk bitmap + * - in-core buddy (actually includes buddy and bitmap) + * - preallocation descriptors (PAs) + * + * there are two types of preallocations: + * - inode + * assiged to specific inode and can be used for this inode only. + * it describes part of inode's space preallocated to specific + * physical blocks. any block from that preallocated can be used + * independent. the descriptor just tracks number of blocks left + * unused. so, before taking some block from descriptor, one must + * make sure corresponded logical block isn't allocated yet. this + * also means that freeing any block within descriptor's range + * must discard all preallocated blocks. + * - locality group + * assigned to specific locality group which does not translate to + * permanent set of inodes: inode can join and leave group. space + * from this type of preallocation can be used for any inode. thus + * it's consumed from the beginning to the end. + * + * relation between them can be expressed as: + * in-core buddy = on-disk bitmap + preallocation descriptors + * + * this mean blocks mballoc considers used are: + * - allocated blocks (persistent) + * - preallocated blocks (non-persistent) + * + * consistency in mballoc world means that at any time a block is either + * free or used in ALL structures. notice: "any time" should not be read + * literally -- time is discrete and delimited by locks. + * + * to keep it simple, we don't use block numbers, instead we count number of + * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. + * + * all operations can be expressed as: + * - init buddy: buddy = on-disk + PAs + * - new PA: buddy += N; PA = N + * - use inode PA: on-disk += N; PA -= N + * - discard inode PA buddy -= on-disk - PA; PA = 0 + * - use locality group PA on-disk += N; PA -= N + * - discard locality group PA buddy -= PA; PA = 0 + * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap + * is used in real operation because we can't know actual used + * bits from PA, only from on-disk bitmap + * + * if we follow this strict logic, then all operations above should be atomic. + * given some of them can block, we'd have to use something like semaphores + * killing performance on high-end SMP hardware. let's try to relax it using + * the following knowledge: + * 1) if buddy is referenced, it's already initialized + * 2) while block is used in buddy and the buddy is referenced, + * nobody can re-allocate that block + * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has + * bit set and PA claims same block, it's OK. IOW, one can set bit in + * on-disk bitmap if buddy has same bit set or/and PA covers corresponded + * block + * + * so, now we're building a concurrency table: + * - init buddy vs. + * - new PA + * blocks for PA are allocated in the buddy, buddy must be referenced + * until PA is linked to allocation group to avoid concurrent buddy init + * - use inode PA + * we need to make sure that either on-disk bitmap or PA has uptodate data + * given (3) we care that PA-=N operation doesn't interfere with init + * - discard inode PA + * the simplest way would be to have buddy initialized by the discard + * - use locality group PA + * again PA-=N must be serialized with init + * - discard locality group PA + * the simplest way would be to have buddy initialized by the discard + * - new PA vs. + * - use inode PA + * i_data_sem serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * some mutex should serialize them + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * - use inode PA + * - use inode PA + * i_data_sem or another mutex should serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * nothing wrong here -- they're different PAs covering different blocks + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * + * now we're ready to make few consequences: + * - PA is referenced and while it is no discard is possible + * - PA is referenced until block isn't marked in on-disk bitmap + * - PA changes only after on-disk bitmap + * - discard must not compete with init. either init is done before + * any discard or they're serialized somehow + * - buddy init as sum of on-disk bitmap and PAs is done atomically + * + * a special case when we've used PA to emptiness. no need to modify buddy + * in this case, but we should care about concurrent init + * + */ + + /* + * Logic in few words: + * + * - allocation: + * load group + * find blocks + * mark bits in on-disk bitmap + * release group + * + * - use preallocation: + * find proper PA (per-inode or group) + * load group + * mark bits in on-disk bitmap + * release group + * release PA + * + * - free: + * load group + * mark bits in on-disk bitmap + * release group + * + * - discard preallocations in group: + * mark PAs deleted + * move them onto local list + * load on-disk bitmap + * load group + * remove PA from object (inode or locality group) + * mark free blocks in-core + * + * - discard inode's preallocations: + */ + +/* + * Locking rules + * + * Locks: + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) + * + * Paths: + * - new pa + * object + * group + * + * - find and use pa: + * pa + * + * - release consumed pa: + * pa + * group + * object + * + * - generate in-core bitmap: + * group + * pa + * + * - discard all for given object (inode, locality group): + * object + * pa + * group + * + * - discard all for given group: + * group + * pa + * group + * object + * + */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_data_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES 8 +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" +}; + +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); + +static inline void *mb_correct_addr_and_bit(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" +#endif + return addr; +} + +static inline int mb_test_bit(int bit, void *addr) +{ + /* + * ext4_test_bit on architecture like powerpc + * needs unsigned long aligned address + */ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_set_bit(bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_clear_bit(bit, addr); +} + +static inline int mb_test_and_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_and_clear_bit(bit, addr); +} + +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static inline int mb_find_next_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) +{ + char *bb; + + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); + BUG_ON(max == NULL); + + if (order > e4b->bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); + return e4b->bd_bitmap; + } + + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; + + return bb; +} + +#ifdef DOUBLE_CHECK +static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int i; + struct super_block *sb = e4b->bd_sb; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); + } + mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) +{ + int i; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); + mb_set_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { + unsigned char *b1, *b2; + int i; + b1 = (unsigned char *) e4b->bd_info->bb_bitmap; + b2 = (unsigned char *) bitmap; + for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { + if (b1[i] != b2[i]) { + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); + BUG(); + } + } + } +} + +#else +static inline void mb_free_blocks_double(struct inode *inode, + struct ext4_buddy *e4b, int first, int count) +{ + return; +} +static inline void mb_mark_used_double(struct ext4_buddy *e4b, + int first, int count) +{ + return; +} +static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + return; +} +#endif + +#ifdef AGGRESSIVE_CHECK + +#define MB_CHECK_ASSERT(assert) \ +do { \ + if (!(assert)) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + function, file, line, # assert); \ + BUG(); \ + } \ +} while (0) + +static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, + const char *function, int line) +{ + struct super_block *sb = e4b->bd_sb; + int order = e4b->bd_blkbits + 1; + int max; + int max2; + int i; + int j; + int k; + int count; + struct ext4_group_info *grp; + int fragments = 0; + int fstart; + struct list_head *cur; + void *buddy; + void *buddy2; + + { + static int mb_check_counter; + if (mb_check_counter++ % 100 != 0) + return 0; + } + + while (order > 1) { + buddy = mb_find_buddy(e4b, order, &max); + MB_CHECK_ASSERT(buddy); + buddy2 = mb_find_buddy(e4b, order - 1, &max2); + MB_CHECK_ASSERT(buddy2); + MB_CHECK_ASSERT(buddy != buddy2); + MB_CHECK_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ + if (!mb_test_bit(i << 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit((i<<1)+1, buddy2)); + } else if (!mb_test_bit((i << 1) + 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit(i << 1, buddy2)); + } + continue; + } + + /* both bits in buddy2 must be 1 */ + MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); + MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; + MB_CHECK_ASSERT( + !mb_test_bit(k, e4b->bd_bitmap)); + } + count++; + } + MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); + order--; + } + + fstart = -1; + buddy = mb_find_buddy(e4b, 0, &max); + for (i = 0; i < max; i++) { + if (!mb_test_bit(i, buddy)) { + MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); + if (fstart == -1) { + fragments++; + fstart = i; + } + continue; + } + fstart = -1; + /* check used bits only */ + for (j = 0; j < e4b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e4b, j, &max2); + k = i >> j; + MB_CHECK_ASSERT(k < max2); + MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); + } + } + MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); + MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); + + grp = ext4_get_group_info(sb, e4b->bd_group); + list_for_each(cur, &grp->bb_prealloc_list) { + ext4_group_t groupnr; + struct ext4_prealloc_space *pa; + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); + MB_CHECK_ASSERT(groupnr == e4b->bd_group); + for (i = 0; i < pa->pa_len; i++) + MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); + } + return 0; +} +#undef MB_CHECK_ASSERT +#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ + __FILE__, __func__, __LINE__) +#else +#define mb_check_buddy(e4b) +#endif + +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ +static void ext4_mb_mark_free_simple(struct super_block *sb, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; + unsigned short border; + + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); + + border = 2 << sb->s_blocksize_bits; + + while (len > 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fls(len) - 1; + + if (max < min) + min = max; + chunk = 1 << min; + + /* mark multiblock chunks only */ + grp->bb_counters[min]++; + if (min > 0) + mb_clear_bit(first >> min, + buddy + sbi->s_mb_offsets[min]); + + len -= chunk; + first += chunk; + } +} + +/* + * Cache the order of the largest free extent we have available in this block + * group. + */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit */ + + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} + +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; + unsigned free = 0; + unsigned fragments = 0; + unsigned long long period = get_cycles(); + + /* initialize buddy from bitmap which is aggregation + * of on-disk bitmap and preallocations */ + i = mb_find_next_zero_bit(bitmap, max, 0); + grp->bb_first_free = i; + while (i < max) { + fragments++; + first = i; + i = mb_find_next_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) + ext4_mb_mark_free_simple(sb, buddy, first, len, grp); + else + grp->bb_counters[0]++; + if (i < max) + i = mb_find_next_zero_bit(bitmap, max, i); + } + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { + ext4_grp_locked_error(sb, group, 0, 0, + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", + free, grp->bb_free); + /* + * If we intend to continue, we consider group descriptor + * corrupt and update bb_free using bitmap value + */ + grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + } + mb_set_largest_free_order(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + spin_lock(&EXT4_SB(sb)->s_bal_lock); + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); +} + +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + +/* The buddy information is attached the buddy cache inode + * for convenience. The information regarding each group + * is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are + * stored in the inode as + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. + * So for each group we take up 2 blocks. A page can + * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. + * So it can have information regarding groups_per_page which + * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! + */ + +static int ext4_mb_init_cache(struct page *page, char *incore) +{ + ext4_group_t ngroups; + int blocksize; + int blocks_per_page; + int groups_per_page; + int err = 0; + int i; + ext4_group_t first_group, group; + int first_block; + struct super_block *sb; + struct buffer_head *bhs; + struct buffer_head **bh = NULL; + struct inode *inode; + char *data; + char *bitmap; + struct ext4_group_info *grinfo; + + mb_debug(1, "init page %lu\n", page->index); + + inode = page->mapping->host; + sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + + /* allocate buffer_heads to read bitmaps */ + if (groups_per_page > 1) { + i = sizeof(struct buffer_head *) * groups_per_page; + bh = kzalloc(i, GFP_NOFS); + if (bh == NULL) { + err = -ENOMEM; + goto out; + } + } else + bh = &bhs; + + first_group = page->index * blocks_per_page / 2; + + /* read all groups the page covers into the cache */ + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) + break; + + grinfo = ext4_get_group_info(sb, group); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the page, + * which may be currently in use by an allocating task. + */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; + continue; + } + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; + goto out; + } + mb_debug(1, "read bitmap for group %u\n", group); + } + + /* wait for I/O completion */ + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; + goto out; + } + } + + first_block = page->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++) { + group = (first_block + i) >> 1; + if (group >= ngroups) + break; + + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + + /* + * data carry information regarding this + * particular group in the format specified + * above + * + */ + data = page_address(page) + (i * blocksize); + bitmap = bh[group - first_group]->b_data; + + /* + * We place the buddy block and bitmap block + * close together + */ + if ((first_block + i) & 1) { + /* this is block of buddy */ + BUG_ON(incore == NULL); + mb_debug(1, "put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); + grinfo = ext4_get_group_info(sb, group); + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * + (sb->s_blocksize_bits+2)); + /* + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); + ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { + /* this is block of bitmap */ + BUG_ON(incore != NULL); + mb_debug(1, "put bitmap for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); + + /* see comments in ext4_mb_put_pa() */ + ext4_lock_group(sb, group); + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ + ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + + /* set incore so that the buddy information can be + * generated using this + */ + incore = data; + } + } + SetPageUptodate(page); + +out: + if (bh) { + for (i = 0; i < groups_per_page; i++) + brelse(bh[i]); + if (bh != &bhs) + kfree(bh); + } + return err; +} + +/* + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + */ +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; + int blocks_per_page; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -ENOMEM; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; + } + + block++; + pnum = block / blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -ENOMEM; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; +} + +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); + } +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + struct ext4_group_info *this_grp; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; + + might_sleep(); + mb_debug(1, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. + */ + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + goto err; + } + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + if (e4b.bd_buddy_page == NULL) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + ret = 0; + goto err; + } + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } +err: + ext4_mb_put_buddy_page_lock(&e4b); + return ret; +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + int blocks_per_page; + int block; + int pnum; + int poff; + struct page *page; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + + might_sleep(); + mb_debug(1, "load group %u\n", group); + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); + + e4b->bd_blkbits = sb->s_blocksize_bits; + e4b->bd_info = grp; + e4b->bd_sb = sb; + e4b->bd_group = group; + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + } + + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + /* we could use find_or_create_page(), but it locks page + * what we'd like to avoid in fast path ... */ + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); + if (page == NULL || !PageUptodate(page)) { + if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + mb_cmp_bitmaps(e4b, page_address(page) + + (poff * sb->s_blocksize)); + } + unlock_page(page); + } + } + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + /* Pages marked accessed already */ + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } + unlock_page(page); + } + } + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + /* Pages marked accessed already */ + e4b->bd_buddy_page = page; + e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + return 0; + +err: + if (page) + page_cache_release(page); + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; + return ret; +} + +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); +} + + +static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) +{ + int order = 1; + void *bb; + + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); + BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); + + bb = e4b->bd_buddy; + while (order <= e4b->bd_blkbits + 1) { + block = block >> 1; + if (!mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + bb += 1 << (e4b->bd_blkbits - order); + order++; + } + return 0; +} + +static void mb_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + mb_clear_bit(cur, bm); + cur++; + } +} + +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + +void ext4_set_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: set whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + mb_set_bit(cur, bm); + cur++; + } +} + +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) +{ + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. + */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; + struct super_block *sb = e4b->bd_sb; + + if (WARN_ON(count == 0)) + return; + BUG_ON(last >= (sb->s_blocksize << 3)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + + mb_check_buddy(e4b); + mb_free_blocks_double(inode, e4b, first, count); + + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ + if (first != 0) + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); + + if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u); block bitmap corrupt.", + block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); + /* Mark the block group as corrupt. */ + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &e4b->bd_info->bb_state); + mb_regenerate_buddy(e4b); + goto done; + } + + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) + e4b->bd_info->bb_fragments--; + else if (!left_is_free && !right_is_free) + e4b->bd_info->bb_fragments++; + + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; + } + + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); + +done: + mb_set_largest_free_order(sb, e4b->bd_info); + mb_check_buddy(e4b); +} + +static int mb_find_extent(struct ext4_buddy *e4b, int block, + int needed, struct ext4_free_extent *ex) +{ + int next = block; + int max, order; + void *buddy; + + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + BUG_ON(ex == NULL); + + buddy = mb_find_buddy(e4b, 0, &max); + BUG_ON(buddy == NULL); + BUG_ON(block >= max); + if (mb_test_bit(block, buddy)) { + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + return 0; + } + + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; + + ex->fe_len = 1 << order; + ex->fe_start = block << order; + ex->fe_group = e4b->bd_group; + + /* calc difference from given start */ + next = next - ex->fe_start; + ex->fe_len -= next; + ex->fe_start += next; + + while (needed > ex->fe_len && + mb_find_buddy(e4b, order, &max)) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); + if (mb_test_bit(next, e4b->bd_bitmap)) + break; + + order = mb_find_order_for_block(e4b, next); + + block = next >> order; + ex->fe_len += 1 << order; + } + + BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); + return ex->fe_len; +} + +static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) +{ + int ord; + int mlen = 0; + int max = 0; + int cur; + int start = ex->fe_start; + int len = ex->fe_len; + unsigned ret = 0; + int len0 = len; + void *buddy; + + BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); + BUG_ON(e4b->bd_group != ex->fe_group); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_mark_used_double(e4b, start, len); + + e4b->bd_info->bb_free -= len; + if (e4b->bd_info->bb_first_free == start) + e4b->bd_info->bb_first_free += len; + + /* let's maintain fragments counter */ + if (start != 0) + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); + if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) + max = !mb_test_bit(start + len, e4b->bd_bitmap); + if (mlen && max) + e4b->bd_info->bb_fragments++; + else if (!mlen && !max) + e4b->bd_info->bb_fragments--; + + /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e4b, start); + + if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! */ + mlen = 1 << ord; + buddy = mb_find_buddy(e4b, ord, &max); + BUG_ON((start >> ord) >= max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + BUG_ON(len < 0); + continue; + } + + /* store for history */ + if (ret == 0) + ret = len | (ord << 16); + + /* we have to split large buddy */ + BUG_ON(ord <= 0); + buddy = mb_find_buddy(e4b, ord, &max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e4b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); + e4b->bd_info->bb_counters[ord]++; + e4b->bd_info->bb_counters[ord]++; + } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); + mb_check_buddy(e4b); + + return ret; +} + +/* + * Must be called under group lock! + */ +static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int ret; + + BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); + ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; + ret = mb_mark_used(e4b, &ac->ac_b_ex); + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; + + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. The reference is dropped + * in ext4_mb_release_context + */ + ac->ac_bitmap_page = e4b->bd_bitmap_page; + get_page(ac->ac_bitmap_page); + ac->ac_buddy_page = e4b->bd_buddy_page; + get_page(ac->ac_buddy_page); + /* store last allocated for subsequent stream allocation */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + spin_lock(&sbi->s_md_lock); + sbi->s_mb_last_group = ac->ac_f_ex.fe_group; + sbi->s_mb_last_start = ac->ac_f_ex.fe_start; + spin_unlock(&sbi->s_md_lock); + } +} + +/* + * regular allocator, for general purposes allocation + */ + +static void ext4_mb_check_limits(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b, + int finish_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + struct ext4_free_extent ex; + int max; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > sbi->s_mb_max_to_scan && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + ac->ac_status = AC_STATUS_BREAK; + return; + } + + /* + * Haven't found good chunk so far, let's continue + */ + if (bex->fe_len < gex->fe_len) + return; + + if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) + && bex->fe_group == e4b->bd_group) { + /* recheck chunk's availability - we don't know + * when it was found (within this lock-unlock + * period or not) */ + max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); + if (max >= gex->fe_len) { + ext4_mb_use_best_found(ac, e4b); + return; + } + } +} + +/* + * The routine checks whether found extent is good enough. If it is, + * then the extent gets marked used and flag is set to the context + * to stop scanning. Otherwise, the extent is compared with the + * previous found extent and if new one is better, then it's stored + * in the context. Later, the best found extent will be used, if + * mballoc can't find good enough extent. + * + * FIXME: real allocation policy is to be designed yet! + */ +static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, + struct ext4_free_extent *ex, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + + BUG_ON(ex->fe_len <= 0); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); + + ac->ac_found++; + + /* + * The special case - take what you catch first + */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * Let's check whether the chuck is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * If this is first found extent, just store it in the context + */ + if (bex->fe_len == 0) { + *bex = *ex; + return; + } + + /* + * If new found extent is better, store it in the context + */ + if (bex->fe_len < gex->fe_len) { + /* if the request isn't satisfied, any found extent + * larger than previous best one is better */ + if (ex->fe_len > bex->fe_len) + *bex = *ex; + } else if (ex->fe_len > gex->fe_len) { + /* if the request is satisfied, then we try to find + * an extent that still satisfy the request, but is + * smaller than previous one */ + if (ex->fe_len < bex->fe_len) + *bex = *ex; + } + + ext4_mb_check_limits(ac, e4b, 0); +} + +static noinline_for_stack +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex = ac->ac_b_ex; + ext4_group_t group = ex.fe_group; + int max; + int err; + + BUG_ON(ex.fe_len <= 0); + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); + + if (max > 0) { + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_unload_buddy(e4b); + + return 0; +} + +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + ext4_group_t group = ac->ac_g_ex.fe_group; + int max; + int err; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct ext4_free_extent ex; + + if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + return 0; + if (grp->bb_free == 0) + return 0; + + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { + ext4_mb_unload_buddy(e4b); + return 0; + } + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ + + if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + ext4_fsblk_t start; + + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; + /* use do_div to get remainder (would be 64-bit modulo) */ + if (do_div(start, sbi->s_stripe) == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + } else if (max >= ac->ac_g_ex.fe_len) { + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { + /* Sometimes, caller may want to merge even small + * number of blocks to an existing extent */ + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_unload_buddy(e4b); + + return 0; +} + +/* + * The routine scans buddy structures (not bitmap!) from given order + * to max order and tries to find big enough chunk to satisfy the req + */ +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_group_info *grp = e4b->bd_info; + void *buddy; + int i; + int k; + int max; + + BUG_ON(ac->ac_2order <= 0); + for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + + buddy = mb_find_buddy(e4b, i, &max); + BUG_ON(buddy == NULL); + + k = mb_find_next_zero_bit(buddy, max, 0); + BUG_ON(k >= max); + + ac->ac_found++; + + ac->ac_b_ex.fe_len = 1 << i; + ac->ac_b_ex.fe_start = k << i; + ac->ac_b_ex.fe_group = e4b->bd_group; + + ext4_mb_use_best_found(ac, e4b); + + BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); + + if (EXT4_SB(sb)->s_mb_stats) + atomic_inc(&EXT4_SB(sb)->s_bal_2orders); + + break; + } +} + +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of + * free blocks in the group, so the routine can know upper limit. + */ +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = e4b->bd_bitmap; + struct ext4_free_extent ex; + int i; + int free; + + free = e4b->bd_info->bb_free; + BUG_ON(free <= 0); + + i = e4b->bd_info->bb_first_free; + + while (free && ac->ac_status == AC_STATUS_CONTINUE) { + i = mb_find_next_zero_bit(bitmap, + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { + /* + * IF we have corrupt bitmap, we won't find any + * free blocks even though group info says we + * we have free blocks + */ + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But bitmap says 0", + free); + break; + } + + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); + BUG_ON(ex.fe_len <= 0); + if (free < ex.fe_len) { + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But got %d blocks", + free, ex.fe_len); + /* + * The number of free blocks differs. This mostly + * indicate that the bitmap is corrupt. So exit + * without claiming the space. + */ + break; + } + ex.fe_logical = 0xDEADC0DE; /* debug value */ + ext4_mb_measure_extent(ac, &ex, e4b); + + i += ex.fe_len; + free -= ex.fe_len; + } + + ext4_mb_check_limits(ac, e4b, 1); +} + +/* + * This is a special case for storages like raid5 + * we try to find stripe-aligned chunks for stripe-size-multiple requests + */ +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + void *bitmap = e4b->bd_bitmap; + struct ext4_free_extent ex; + ext4_fsblk_t first_group_block; + ext4_fsblk_t a; + ext4_grpblk_t i; + int max; + + BUG_ON(sbi->s_stripe == 0); + + /* find first stripe-aligned block in group */ + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + + a = first_group_block + sbi->s_stripe - 1; + do_div(a, sbi->s_stripe); + i = (a * sbi->s_stripe) - first_group_block; + + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { + if (!mb_test_bit(i, bitmap)) { + max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); + if (max >= sbi->s_stripe) { + ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + break; + } + } + i += sbi->s_stripe; + } +} + +/* This is now called BEFORE we load the buddy bitmap. */ +static int ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) +{ + unsigned free, fragments; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + + BUG_ON(cr < 0 || cr >= 4); + + free = grp->bb_free; + if (free == 0) + return 0; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + return 0; + + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return 0; + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } + + fragments = grp->bb_fragments; + if (fragments == 0) + return 0; + + switch (cr) { + case 0: + BUG_ON(ac->ac_2order == 0); + + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) + return 0; + + if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || + (free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + + return 1; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; + break; + case 3: + return 1; + default: + BUG(); + } + + return 0; +} + +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +{ + ext4_group_t ngroups, group, i; + int cr; + int err = 0; + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + /* first, try the goal */ + err = ext4_mb_find_by_goal(ac, &e4b); + if (err || ac->ac_status == AC_STATUS_FOUND) + goto out; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + goto out; + + /* + * ac->ac2_order is set only if the fe_len is a power of 2 + * if ac2_order is set we also set criteria to 0 so that we + * try exact allocation using buddy. + */ + i = fls(ac->ac_g_ex.fe_len); + ac->ac_2order = 0; + /* + * We search using buddy data only if the order of the request + * is greater than equal to the sbi_s_mb_order2_reqs + * You can tune it via /sys/fs/ext4//mb_order2_req + */ + if (i >= sbi->s_mb_order2_reqs) { + /* + * This should tell if fe_len is exactly power of 2 + */ + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + ac->ac_2order = i - 1; + } + + /* if stream allocation is enabled, use global goal */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + /* TBD: may be hot point */ + spin_lock(&sbi->s_md_lock); + ac->ac_g_ex.fe_group = sbi->s_mb_last_group; + ac->ac_g_ex.fe_start = sbi->s_mb_last_start; + spin_unlock(&sbi->s_md_lock); + } + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac->ac_2order ? 0 : 1; + /* + * cr == 0 try to get exact allocation, + * cr == 3 try to get anything + */ +repeat: + for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + + for (i = 0; i < ngroups; group++, i++) { + cond_resched(); + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + if (group >= ngroups) + group = 0; + + /* This now checks without needing the buddy page */ + if (!ext4_mb_good_group(ac, group, cr)) + continue; + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) + goto out; + + ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ + if (!ext4_mb_good_group(ac, group, cr)) { + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ac->ac_groups_scanned++; + if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) + ext4_mb_simple_scan_group(ac, &e4b); + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) + ext4_mb_scan_aligned(ac, &e4b); + else + ext4_mb_complex_scan_group(ac, &e4b); + + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + if (ac->ac_status != AC_STATUS_CONTINUE) + break; + } + } + + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far + */ + + ext4_mb_try_best_found(ac, &e4b); + if (ac->ac_status != AC_STATUS_FOUND) { + /* + * Someone more lucky has already allocated it. + * The only thing we can do is just take first + * found block(s) + printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); + */ + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_flags |= EXT4_MB_HINT_FIRST; + cr = 3; + atomic_inc(&sbi->s_mb_lost_chunks); + goto repeat; + } + } +out: + return err; +} + +static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = seq->private; + ext4_group_t group; + + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = seq->private; + ext4_group_t group; + + ++*pos; + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = seq->private; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err, buddy_loaded = 0; + struct ext4_buddy e4b; + struct ext4_group_info *grinfo; + struct sg { + struct ext4_group_info info; + ext4_grpblk_t counters[16]; + } sg; + + group--; + if (group == 0) + seq_printf(seq, "#%-5s: %-5s %-5s %-5s " + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", + "group", "free", "frags", "first", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext4_group_info); + grinfo = ext4_get_group_info(sb, group); + /* Load the group info in memory only if not already loaded. */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } + buddy_loaded = 1; + } + + memcpy(&sg, ext4_get_group_info(sb, group), i); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); + + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, + sg.info.bb_fragments, sg.info.bb_first_free); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); + seq_printf(seq, " ]\n"); + + return 0; +} + +static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_mb_seq_groups_ops = { + .start = ext4_mb_seq_groups_start, + .next = ext4_mb_seq_groups_next, + .stop = ext4_mb_seq_groups_stop, + .show = ext4_mb_seq_groups_show, +}; + +static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) +{ + struct super_block *sb = PDE_DATA(inode); + int rc; + + rc = seq_open(file, &ext4_mb_seq_groups_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = sb; + } + return rc; + +} + +static const struct file_operations ext4_mb_seq_groups_fops = { + .owner = THIS_MODULE, + .open = ext4_mb_seq_groups_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} + +/* + * Allocate the top-level s_group_info array for the specified number + * of groups + */ +int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; + struct ext4_group_info ***new_groupinfo; + + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + if (size <= sbi->s_group_info_size) + return 0; + + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); + new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groupinfo) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + if (sbi->s_group_info) { + memcpy(new_groupinfo, sbi->s_group_info, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); + kvfree(sbi->s_group_info); + } + sbi->s_group_info = new_groupinfo; + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; +} + +/* Create and initialize ext4_group_info data for the given group. */ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_NOFS); + if (meta_group_info == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate mem " + "for a buddy group"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); + if (meta_group_info[i] == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_clusters_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + ext4_free_group_clusters(sb, desc); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_NOFS); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +static int ext4_mb_init_backend(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + struct ext4_group_desc *desc; + struct kmem_cache *cachep; + + err = ext4_mb_alloc_groupinfo(sb, ngroups); + if (err) + return err; + + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + ext4_msg(sb, KERN_ERR, "can't get new inode"); + goto err_freesgi; + } + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); + goto err_freebuddy; + } + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; + } + + return 0; + +err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); + while (i-- > 0) + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + i = sbi->s_group_info_size; + while (i-- > 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); +err_freesgi: + kvfree(sbi->s_group_info); + return -ENOMEM; +} + +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + ext4_groupinfo_caches[cache_index] = cachep; + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + return 0; +} + +int ext4_mb_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned i, j; + unsigned offset; + unsigned max; + int ret; + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); + + sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_offsets == NULL) { + ret = -ENOMEM; + goto out; + } + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); + sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_maxs == NULL) { + ret = -ENOMEM; + goto out; + } + + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; + + /* order 0 is regular bitmap */ + sbi->s_mb_maxs[0] = sb->s_blocksize << 3; + sbi->s_mb_offsets[0] = 0; + + i = 1; + offset = 0; + max = sb->s_blocksize << 2; + do { + sbi->s_mb_offsets[i] = offset; + sbi->s_mb_maxs[i] = max; + offset += 1 << (sb->s_blocksize_bits - i); + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); + + spin_lock_init(&sbi->s_md_lock); + spin_lock_init(&sbi->s_bal_lock); + + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; + sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e, if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group pralloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default. + */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { + ret = -ENOMEM; + goto out; + } + for_each_possible_cpu(i) { + struct ext4_locality_group *lg; + lg = per_cpu_ptr(sbi->s_locality_groups, i); + mutex_init(&lg->lg_mutex); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); + spin_lock_init(&lg->lg_prealloc_lock); + } + + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) + goto out_free_locality_groups; + + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); + + return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out: + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; + return ret; +} + +/* need to called with the ext4 group lock held */ +static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) +{ + struct ext4_prealloc_space *pa; + struct list_head *cur, *tmp; + int count = 0; + + list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + list_del(&pa->pa_group_list); + count++; + kmem_cache_free(ext4_pspace_cachep, pa); + } + if (count) + mb_debug(1, "mballoc: %u PAs left\n", count); + +} + +int ext4_mb_release(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + int num_meta_group_infos; + struct ext4_group_info *grinfo; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + + if (sbi->s_group_info) { + for (i = 0; i < ngroups; i++) { + grinfo = ext4_get_group_info(sb, i); +#ifdef DOUBLE_CHECK + kfree(grinfo->bb_bitmap); +#endif + ext4_lock_group(sb, i); + ext4_mb_cleanup_pa(grinfo); + ext4_unlock_group(sb, i); + kmem_cache_free(cachep, grinfo); + } + num_meta_group_infos = (ngroups + + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); + kvfree(sbi->s_group_info); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + iput(sbi->s_buddy_cache); + if (sbi->s_mb_stats) { + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", + atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated, + sbi->s_mb_generation_time); + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", + atomic_read(&sbi->s_mb_preallocated), + atomic_read(&sbi->s_mb_discarded)); + } + + free_percpu(sbi->s_locality_groups); + + return 0; +} + +static inline int ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t cluster, int count) +{ + ext4_fsblk_t discard_block; + + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); +} + +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) +{ + struct ext4_free_data *entry = (struct ext4_free_data *)jce; + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); + + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } + + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); + + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); + + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); +} + +int __init ext4_init_mballoc(void) +{ + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); + if (ext4_pspace_cachep == NULL) + return -ENOMEM; + + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); + if (ext4_ac_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + return -ENOMEM; + } + + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } + return 0; +} + +void ext4_exit_mballoc(void) +{ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. + */ + rcu_barrier(); + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_data_cachep); + ext4_groupinfo_destroy_slabs(); +} + + +/* + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps + * Returns 0 if success or error code + */ +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + handle_t *handle, unsigned int reserv_clstrs) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block; + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + + err = -EIO; + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); + if (!bitmap_bh) + goto out_err; + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto out_err; + + err = -EIO; + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + if (!gdp) + goto out_err; + + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, + ext4_free_group_clusters(sb, gdp)); + + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (!ext4_data_block_valid(sbi, block, len)) { + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " + "fs metadata", block, block+len); + /* File system mounted not to panic on error + * Fix the bitmap and repeat the block allocation + * We leak some of the blocks here. + */ + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; + } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < ac->ac_b_ex.fe_len; i++) { + BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, + bitmap_bh->b_data)); + } + } +#endif + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); + } + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); + ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); + +out_err: + brelse(bitmap_bh); + return err; +} + +/* + * here we normalize request for locality group + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via + * /sys/fs/ext4//mb_group_prealloc + * + * XXX: should we try to preallocate more than the group has now? + */ +static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + + BUG_ON(lg == NULL); + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + mb_debug(1, "#%u: goal %u blocks for locality group\n", + current->pid, ac->ac_g_ex.fe_len); +} + +/* + * Normalization means making request better in terms of + * size and alignment + */ +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits, max; + ext4_lblk_t end; + loff_t size, start_off; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests + do not need preallocation */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + /* sometime caller may want exact blocks */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + /* caller may indicate that preallocation isn't + * required (it's a tail, for example) */ + if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) + return; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { + ext4_mb_normalize_group_request(ac); + return ; + } + + bsbits = ac->ac_sb->s_blocksize_bits; + + /* first, let's learn actual file size + * given current request is allocated */ + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); + orig_size = size; + + /* max size of free chunks */ + max = 2 << bsbits; + +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) + + /* first, try to predict filesize */ + /* XXX: should this table be tunable? */ + start_off = 0; + if (size <= 16 * 1024) { + size = 16 * 1024; + } else if (size <= 32 * 1024) { + size = 32 * 1024; + } else if (size <= 64 * 1024) { + size = 64 * 1024; + } else if (size <= 128 * 1024) { + size = 128 * 1024; + } else if (size <= 256 * 1024) { + size = 256 * 1024; + } else if (size <= 512 * 1024) { + size = 512 * 1024; + } else if (size <= 1024 * 1024) { + size = 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (22 - bsbits)) << 22; + size = 4 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, + (8<<20)>>bsbits, max, 8 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (23 - bsbits)) << 23; + size = 8 * 1024 * 1024; + } else { + start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; + size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), + ac->ac_o_ex.fe_len) << bsbits; + } + size = size >> bsbits; + start = start_off >> bsbits; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { + size -= ar->lleft + 1 - start; + start = ar->lleft + 1; + } + if (ar->pright && start + size - 1 >= ar->lright) + size -= start + size - ar->lright; + + end = start + size; + + /* check we don't cross already preallocated blocks */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + + if (pa->pa_deleted) + continue; + spin_lock(&pa->pa_lock); + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); + + /* PA must not overlap original request */ + BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || + ac->ac_o_ex.fe_logical < pa->pa_lstart)); + + /* skip PAs this normalized request doesn't overlap with */ + if (pa->pa_lstart >= end || pa_end <= start) { + spin_unlock(&pa->pa_lock); + continue; + } + BUG_ON(pa->pa_lstart <= start && pa_end >= end); + + /* adjust start or end to be adjacent to this pa */ + if (pa_end <= ac->ac_o_ex.fe_logical) { + BUG_ON(pa_end < start); + start = pa_end; + } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + BUG_ON(pa->pa_lstart > end); + end = pa->pa_lstart; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + size = end - start; + + /* XXX: extra loop to check we really don't overlap preallocations */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0) { + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); + BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + if (start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical) { + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); + } + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + + /* XXX: is it better to align blocks WRT to logical + * placement or satisfy big request as is */ + ac->ac_g_ex.fe_logical = start; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); + + /* define goal start in order to merge */ + if (ar->pright && (ar->lright == (start + size))) { + /* merge to the right */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + if (ar->pleft && (ar->lleft + 1 == start)) { + /* merge to the left */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + + mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, + (unsigned) orig_size, (unsigned) start); +} + +static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) + atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) + atomic_inc(&sbi->s_bal_goals); + if (ac->ac_found > sbi->s_mb_max_to_scan) + atomic_inc(&sbi->s_bal_breaks); + } + + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); +} + +/* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + struct ext4_buddy e4b; + int err; + + if (pa == NULL) { + if (ac->ac_f_ex.fe_len == 0) + return; + err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); + if (err) { + /* + * This should never happen since we pin the + * pages in the ext4_allocation_context so + * ext4_mb_load_buddy() should never fail. + */ + WARN(1, "mb_load_buddy failed (%d)", err); + return; + } + ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, + ac->ac_f_ex.fe_len); + ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + ext4_mb_unload_buddy(&e4b); + return; + } + if (pa->pa_type == MB_INODE_PA) + pa->pa_free += ac->ac_b_ex.fe_len; +} + +/* + * use blocks preallocated to inode + */ +static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_fsblk_t start; + ext4_fsblk_t end; + int len; + + /* found preallocated blocks, use them */ + start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); + ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + BUG_ON(start < pa->pa_pstart); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); + BUG_ON(pa->pa_free < len); + pa->pa_free -= len; + + mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); +} + +/* + * use blocks preallocated to locality group + */ +static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + unsigned int len = ac->ac_o_ex.fe_len; + + ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, + &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + /* we don't correct pa_pstart or pa_plen here to avoid + * possible race when the group is being loaded concurrently + * instead we correct pa later, after blocks are marked + * in on-disk bitmap -- see ext4_mb_release_context() + * Other CPUs are prevented from allocating from this pa by lg_mutex + */ + mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); +} + +/* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance <= new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* + * search goal blocks in preallocated space + */ +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int order, i; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; + + /* only data can be preallocated */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return 0; + + /* first, try per-file preallocation */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + + /* all fields in this condition don't change, + * so we can skip locking for them */ + if (ac->ac_o_ex.fe_logical < pa->pa_lstart || + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) + continue; + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) + continue; + + /* found preallocated blocks, use them */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && pa->pa_free) { + atomic_inc(&pa->pa_count); + ext4_mb_use_inode_pa(ac, pa); + spin_unlock(&pa->pa_lock); + ac->ac_criteria = 10; + rcu_read_unlock(); + return 1; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + /* can we use group allocation? */ + if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) + return 0; + + /* inode may have no locality group for some reason */ + lg = ac->ac_lg; + if (lg == NULL) + return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; + } + return 0; +} + +/* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. + * buddy must be generated from this bitmap + * Need to be called with the ext4 group lock held + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); + n = rb_next(n); + } + return; +} + +/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int len; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here + * notice we do NOT ignore preallocations with pa_deleted + * otherwise we could leave used blocks available for + * allocation in buddy when concurrent ext4_mb_put_pa() + * is dropping preallocation + */ + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); + if (unlikely(len == 0)) + continue; + BUG_ON(groupnr != group); + ext4_set_bits(bitmap, start, len); + preallocated += len; + } + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); +} + +static void ext4_mb_pa_callback(struct rcu_head *head) +{ + struct ext4_prealloc_space *pa; + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); + kmem_cache_free(ext4_pspace_cachep, pa); +} + +/* + * drops a reference to preallocated space descriptor + * if this was the last reference and the space is consumed + */ +static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + struct super_block *sb, struct ext4_prealloc_space *pa) +{ + ext4_group_t grp; + ext4_fsblk_t grp_blk; + + /* in this short window concurrent discard can set pa_deleted */ + spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + + if (pa->pa_deleted == 1) { + spin_unlock(&pa->pa_lock); + return; + } + + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + grp_blk = pa->pa_pstart; + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) + grp_blk--; + + grp = ext4_get_group_number(sb, grp_blk); + + /* + * possible race: + * + * P1 (buddy init) P2 (regular allocation) + * find block B in PA + * copy on-disk bitmap to buddy + * mark B in on-disk bitmap + * drop PA from group + * mark all PAs in buddy + * + * thus, P1 initializes buddy with B available. to prevent this + * we make "copy" and "mark all PAs" atomic and serialize "drop PA" + * against that pair + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); +} + +/* + * creates new preallocated space for given inode + */ +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + struct ext4_inode_info *ei; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + int winl; + int wins; + int win; + int offs; + + /* we can't allocate as much as normalizer wants. + * so, found space must get proper lstart + * to cover original request */ + BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); + BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); + + /* we're limited by original request in that + * logical block must be covered any way + * winl is window we can move our chunk within */ + winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; + + /* also, we should cover whole original request */ + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); + + /* the smallest one defines real window */ + win = min(winl, wins); + + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (offs && offs < win) + win = offs; + + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_NUM_B2C(sbi, win); + BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); + BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); + } + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_lstart = ac->ac_b_ex.fe_logical; + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; + + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); + + ext4_mb_use_inode_pa(ac, pa); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); + + ei = EXT4_I(ac->ac_inode); + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + + pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_inode = ac->ac_inode; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); + spin_unlock(pa->pa_obj_lock); + + return 0; +} + +/* + * creates new preallocated space for locality group inodes belongs to + */ +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_lstart = pa->pa_pstart; + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; + + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); + + ext4_mb_use_group_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + lg = ac->ac_lg; + BUG_ON(lg == NULL); + + pa->pa_obj_lock = &lg->lg_prealloc_lock; + pa->pa_inode = NULL; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ + return 0; +} + +static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +{ + int err; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + err = ext4_mb_new_group_pa(ac); + else + err = ext4_mb_new_inode_pa(ac); + return err; +} + +/* + * finds all unused blocks in on-disk bitmap, frees them in + * in-core bitmap and buddy. + * @pa must be unlinked from inode and group lists, so that + * nobody else can find/use it. + * the caller MUST hold group/inode locks. + * TODO: optimize the case when there are no in-core structures yet + */ +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int end; + unsigned int next; + ext4_group_t group; + ext4_grpblk_t bit; + unsigned long long grp_blk_start; + int err = 0; + int free = 0; + + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + end = bit + pa->pa_len; + + while (bit < end) { + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); + if (bit >= end) + break; + next = mb_find_next_bit(bitmap_bh->b_data, end, bit); + mb_debug(1, " free preallocated %u/%u in group %u\n", + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); + free += next - bit; + + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), + next - bit); + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } + if (free != pa->pa_free) { + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ + } + atomic_add(free, &sbi->s_mb_discarded); + + return err; +} + +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + ext4_group_t group; + ext4_grpblk_t bit; + + trace_ext4_mb_release_group_pa(sb, pa); + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); + atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); + + return 0; +} + +/* + * releases all preallocations in given group + * + * first, we need to decide discard policy: + * - when do we discard + * 1) ENOSPC + * - how many do we discard + * 1) how many requested + */ +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, + ext4_group_t group, int needed) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + struct list_head list; + struct ext4_buddy e4b; + int err; + int busy = 0; + int free = 0; + + mb_debug(1, "discard preallocation for group %u\n", group); + + if (list_empty(&grp->bb_prealloc_list)) + return 0; + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, "Error reading block bitmap for %u", group); + return 0; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, "Error loading buddy information for %u", group); + put_bh(bitmap_bh); + return 0; + } + + if (needed == 0) + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; + + INIT_LIST_HEAD(&list); +repeat: + ext4_lock_group(sb, group); + list_for_each_entry_safe(pa, tmp, + &grp->bb_prealloc_list, pa_group_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + spin_unlock(&pa->pa_lock); + busy = 1; + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + + /* we can trust pa_free ... */ + free += pa->pa_free; + + spin_unlock(&pa->pa_lock); + + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } + + /* if we still need more blocks and some PAs were used, try again */ + if (free < needed && busy) { + busy = 0; + ext4_unlock_group(sb, group); + cond_resched(); + goto repeat; + } + + /* found anything to free? */ + if (list_empty(&list)) { + BUG_ON(free != 0); + goto out; + } + + /* now free all selected PAs */ + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + + /* remove from object (inode or locality group) */ + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + if (pa->pa_type == MB_GROUP_PA) + ext4_mb_release_group_pa(&e4b, pa); + else + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + +out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + return free; +} + +/* + * releases all non-used preallocated blocks for given inode + * + * It's important to discard preallocations under i_data_sem + * We don't want another block to be served from the prealloc + * space when we are discarding the inode prealloc space. + * + * FIXME!! Make sure it is valid at all the call sites + */ +void ext4_discard_preallocations(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct super_block *sb = inode->i_sb; + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + ext4_group_t group = 0; + struct list_head list; + struct ext4_buddy e4b; + int err; + + if (!S_ISREG(inode->i_mode)) { + /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ + return; + } + + mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + trace_ext4_discard_preallocations(inode); + + INIT_LIST_HEAD(&list); + +repeat: + /* first, collect all pa's in the inode */ + spin_lock(&ei->i_prealloc_lock); + while (!list_empty(&ei->i_prealloc_list)) { + pa = list_entry(ei->i_prealloc_list.next, + struct ext4_prealloc_space, pa_inode_list); + BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* this shouldn't happen often - nobody should + * use preallocation while we're discarding it */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); + WARN_ON(1); + schedule_timeout_uninterruptible(HZ); + goto repeat; + + } + if (pa->pa_deleted == 0) { + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &list); + continue; + } + + /* someone is deleting pa right now */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + + /* we have to wait here because pa_deleted + * doesn't mean pa is already unlinked from + * the list. as we might be called from + * ->clear_inode() the inode will get freed + * and concurrent thread which is unlinking + * pa from inode's list may access already + * freed memory, bad-bad-bad */ + + /* XXX: if this happens too often, we can + * add a flag to force wait only in case + * of ->clear_inode(), but not in case of + * regular truncate */ + schedule_timeout_uninterruptible(HZ); + goto repeat; + } + spin_unlock(&ei->i_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + BUG_ON(pa->pa_type != MB_INODE_PA); + group = ext4_get_group_number(sb, pa->pa_pstart); + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, "Error reading block bitmap for %u", + group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +#ifdef CONFIG_EXT4_DEBUG +static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + ext4_group_t ngroups, i; + + if (!ext4_mballoc_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; + + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", + ac->ac_status, ac->ac_flags); + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, + (unsigned long)ac->ac_o_ex.fe_logical, + (unsigned long)ac->ac_g_ex.fe_group, + (unsigned long)ac->ac_g_ex.fe_start, + (unsigned long)ac->ac_g_ex.fe_len, + (unsigned long)ac->ac_g_ex.fe_logical, + (unsigned long)ac->ac_b_ex.fe_group, + (unsigned long)ac->ac_b_ex.fe_start, + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, + pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + NULL, &start); + spin_unlock(&pa->pa_lock); + printk(KERN_ERR "PA:%u:%d:%u \n", i, + start, pa->pa_len); + } + ext4_unlock_group(sb, i); + + if (grp->bb_free == 0) + continue; + printk(KERN_ERR "%u: %d/%d \n", + i, grp->bb_free, grp->bb_fragments); + } + printk(KERN_ERR "\n"); +} +#else +static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + return; +} +#endif + +/* + * We use locality group preallocation for small size file. The size of the + * file is determined by the current size or the resulting size after + * allocation which ever is larger + * + * One can tune this size via /sys/fs/ext4//mb_stream_req + */ +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + + /* don't use group allocation for large files */ + size = max(size, isize); + if (size > sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having + * per cpu locality group is to reduce the contention between block + * request from multiple CPUs. + */ + ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); + + /* we're going to use group allocation */ + ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; + + /* serialize all allocations in the group */ + mutex_lock(&ac->ac_lg->lg_mutex); +} + +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t group; + unsigned int len; + ext4_fsblk_t goal; + ext4_grpblk_t block; + + /* we can't allocate > group size */ + len = ar->len; + + /* just a dirty hack to filter too big requests */ + if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) + len = EXT4_CLUSTERS_PER_GROUP(sb); + + /* start searching from the goal */ + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + ext4_get_group_no_and_offset(sb, goal, &group, &block); + + /* set up allocation goals */ + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_sb = sb; + ac->ac_inode = ar->inode; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; + ac->ac_o_ex.fe_group = group; + ac->ac_o_ex.fe_start = block; + ac->ac_o_ex.fe_len = len; + ac->ac_g_ex = ac->ac_o_ex; + ac->ac_flags = ar->flags; + + /* we have to define context: we'll we work with a file or + * locality group. this is a policy, actually */ + ext4_mb_group_or_file(ac); + + mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + "left: %u/%u, right %u/%u to %swritable\n", + (unsigned) ar->len, (unsigned) ar->logical, + (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, + (unsigned) ar->lleft, (unsigned) ar->pleft, + (unsigned) ar->lright, (unsigned) ar->pright, + atomic_read(&ar->inode->i_writecount) ? "" : "non-"); + return 0; + +} + +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + + mb_debug(1, "discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(pa->pa_type != MB_GROUP_PA); + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. + */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + group = ext4_get_group_number(sb, pa->pa_pstart); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + spin_unlock(&lg->lg_prealloc_lock); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + +/* + * release all resource we used in allocation + */ +static int ext4_mb_release_context(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_type == MB_GROUP_PA) { + /* see comment in ext4_mb_use_group_pa() */ + spin_lock(&pa->pa_lock); + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } + } + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. + */ + if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } + if (ac->ac_bitmap_page) + page_cache_release(ac->ac_bitmap_page); + if (ac->ac_buddy_page) + page_cache_release(ac->ac_buddy_page); + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + mutex_unlock(&ac->ac_lg->lg_mutex); + ext4_mb_collect_stats(ac); + return 0; +} + +static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) +{ + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int ret; + int freed = 0; + + trace_ext4_mb_discard_preallocations(sb, needed); + for (i = 0; i < ngroups && needed > 0; i++) { + ret = ext4_mb_discard_group_preallocations(sb, i, needed); + freed += ret; + needed -= ret; + } + + return freed; +} + +/* + * Main entry point into mballoc to allocate blocks + * it tries to use preallocation first, then falls back + * to usual allocation + */ +ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) +{ + int freed; + struct ext4_allocation_context *ac = NULL; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; + + might_sleep(); + sb = ar->inode->i_sb; + sbi = EXT4_SB(sb); + + trace_ext4_request_blocks(ar); + + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { + /* Without delayed allocation we need to verify + * there is enough free blocks to do block allocation + * and verify allocation doesn't exceed the quota limits. + */ + while (ar->len && + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { + + /* let others to free the space */ + cond_resched(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_clstrs = ar->len; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, + EXT4_C2B(sbi, ar->len))) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } + } + inquota = ar->len; + if (ar->len == 0) { + *errp = -EDQUOT; + goto out; + } + } + + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); + if (!ac) { + ar->len = 0; + *errp = -ENOMEM; + goto out; + } + + *errp = ext4_mb_initialize_context(ac, ar); + if (*errp) { + ar->len = 0; + goto out; + } + + ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + if (!ext4_mb_use_preallocated(ac)) { + ac->ac_op = EXT4_MB_HISTORY_ALLOC; + ext4_mb_normalize_request(ac, ar); +repeat: + /* allocate space in core */ + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto discard_and_exit; + + /* as we've just preallocated more space than + * user requested originally, we store allocated + * space in a special descriptor */ + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + *errp = ext4_mb_new_preallocation(ac); + if (*errp) { + discard_and_exit: + ext4_discard_allocated_blocks(ac); + goto errout; + } + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); + if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) { + ext4_discard_allocated_blocks(ac); + goto errout; + } else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } + } else { + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) + goto repeat; + *errp = -ENOSPC; + } + +errout: + if (*errp) { + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } + ext4_mb_release_context(ac); +out: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + if (inquota && ar->len < inquota) + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); + if (!ar->len) { + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + } + + trace_ext4_allocate_blocks(ar, (unsigned long long)block); + + return block; +} + +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) + return 1; + return 0; +} + +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, + struct ext4_free_data *new_entry) +{ + ext4_group_t group = e4b->bd_group; + ext4_grpblk_t cluster; + struct ext4_free_data *entry; + struct ext4_group_info *db = e4b->bd_info; + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + + BUG_ON(!ext4_handle_valid(handle)); + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; + + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) + n = &(*n)->rb_left; + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) + n = &(*n)->rb_right; + else { + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), + "Block already on to-be-freed list"); + return 0; + } + } + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); + kmem_cache_free(ext4_free_data_cachep, entry); + } + } + + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); + kmem_cache_free(ext4_free_data_cachep, entry); + } + } + /* Add the extent to transaction's private list */ + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); + return 0; +} + +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @flags: flags used by ext4_free_blocks + */ +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) +{ + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = inode->i_sb; + struct ext4_group_desc *gdp; + unsigned int overflow; + ext4_grpblk_t bit; + struct buffer_head *gd_bh; + ext4_group_t block_group; + struct ext4_sb_info *sbi; + struct ext4_buddy e4b; + unsigned int count_clusters; + int err = 0; + int ret; + + might_sleep(); + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } + + sbi = EXT4_SB(sb); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); + goto error_return; + } + + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + cond_resched(); + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + if (!tbh) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = EXT4_PBLK_COFF(sbi, block); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = EXT4_LBLK_COFF(sbi, count); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; + } + +do_more: + overflow = 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( + ext4_get_group_info(sb, block_group)))) + return; + + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + count_clusters = EXT4_NUM_B2C(sbi, count); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + gdp = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!gdp) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, gdp), block, count) || + in_range(ext4_inode_bitmap(sb, gdp), block, count) || + in_range(block, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group)) { + + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < count_clusters; i++) + BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); + } +#endif + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + retry: + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); + if (!new_entry) { + /* + * We use a retry loop because + * ext4_free_blocks() is not allowed to fail. + */ + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; + + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + ext4_mb_free_metadata(handle, &e4b, new_entry); + } else { + /* need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } else + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); + } + + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, gdp); + ext4_unlock_group(sb, block_group); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); + } + + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + put_bh(bitmap_bh); + goto do_more; + } +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + +/** + * ext4_group_add_blocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physical block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. + */ +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + if (count == 0) + return 0; + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; + goto error_return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + err = -EINVAL; + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); + ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_NUM_B2C(sbi, blocks_freed)); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return err; +} + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: starting block of the free extent in the alloc. group + * @count: number of blocks to TRIM + * @group: alloc. group we are working with + * @e4b: ext4 buddy for the group + * + * Trim "count" blocks starting at "start" in the "group". To assure that no + * one will allocate those blocks, mark it as used in buddy bitmap. This must + * be called with under the group lock. + */ +static int ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) +{ + struct ext4_free_extent ex; + int ret = 0; + + trace_ext4_trim_extent(sb, group, start, count); + + assert_spin_locked(ext4_group_lock_ptr(sb, group)); + + ex.fe_start = start; + ex.fe_group = group; + ex.fe_len = count; + + /* + * Mark blocks used, so no one can reuse them while + * being trimmed. + */ + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, group); + ret = ext4_issue_discard(sb, group, start, count); + ext4_lock_group(sb, group); + mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: group to be trimmed + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's buddy bitmap searching for free + * extents. When the free block is found, ext4_trim_extent is called to TRIM + * the extent. + * + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. This is done until whole group is scanned. + */ +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) +{ + void *bitmap; + ext4_grpblk_t next, count = 0, free_count = 0; + struct ext4_buddy e4b; + int ret = 0; + + trace_ext4_trim_all_free(sb, group, start, max); + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + return ret; + } + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && + minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) + goto out; + + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; + + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) + break; + next = mb_find_next_bit(bitmap, max + 1, start); + + if ((next - start) >= minblocks) { + ret = ext4_trim_extent(sb, start, + next - start, group, &e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; + count += next - start; + } + free_count += next - start; + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, group); + cond_resched(); + ext4_lock_group(sb, group); + } + + if ((e4b.bd_info->bb_free - free_count) < minblocks) + break; + } + + if (!ret) { + ret = count; + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } +out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + ext4_debug("trimmed %d blocks in the group %d\n", + count, group); + + return ret; +} + +/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * ext4_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext4_trim_all_free function + * is invoked to trim all free space. + */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ext4_group_info *grp; + ext4_group_t group, first_group, last_group; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; + uint64_t start, end, minlen, trimmed = 0; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + range->minlen >> sb->s_blocksize_bits); + + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) + return -EINVAL; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_cluster); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, + &last_group, &last_cluster); + + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + + for (group = first_group; group <= last_group; group++) { + grp = ext4_get_group_info(sb, group); + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group); + if (ret) + break; + } + + /* + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() + */ + if (group == last_group) + end = last_cluster; + + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_cluster, + end, minlen); + if (cnt < 0) { + ret = cnt; + break; + } + trimmed += cnt; + } + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. + */ + first_cluster = 0; + } + + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + +out: + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; + return ret; +} diff --git a/kernel/fs/ext4/mballoc.h b/kernel/fs/ext4/mballoc.h new file mode 100644 index 000000000..d634e183b --- /dev/null +++ b/kernel/fs/ext4/mballoc.h @@ -0,0 +1,215 @@ +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#ifdef CONFIG_EXT4_DEBUG +extern ushort ext4_mballoc_debug; + +#define mb_debug(n, fmt, a...) \ + do { \ + if ((n) <= ext4_mballoc_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: ", \ + __FILE__, __LINE__, __func__); \ + printk(fmt, ## a); \ + } \ + } while (0) +#else +#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) +#endif + +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * with 'ext4_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 0 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, which purpose is to pack requests + * as close each to other as possible to produce smooth I/O traffic + * We use locality group prealloc space for stream request. + * We can tune the same via /proc/fs/ext4//stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + + +struct ext4_free_data { + /* MUST be the first member */ + struct ext4_journal_cb_entry efd_jce; + + /* ext4_free_data private data starts from here */ + + /* this links the free block information from group_info */ + struct rb_node efd_node; + + /* group which free block extent belongs */ + ext4_group_t efd_group; + + /* free block extent */ + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; + + /* transaction which freed this extent */ + tid_t efd_tid; +}; + +struct ext4_prealloc_space { + struct list_head pa_inode_list; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. block */ + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. inode or group */ + spinlock_t *pa_obj_lock; + struct inode *pa_inode; /* hack, for history only */ +}; + +enum { + MB_INODE_PA = 0, + MB_GROUP_PA = 1 +}; + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; /* In cluster units */ + ext4_group_t fe_group; + ext4_grpblk_t fe_len; /* In cluster units */ +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value.ie, fls(pa_free)-1; + */ +#define PREALLOC_TB_SIZE 10 +struct ext4_locality_group { + /* for allocator */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (normalized ac_o_ex) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + struct page *ac_bitmap_page; + struct page *ac_buddy_page; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_buddy { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; +}; + +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + return ext4_group_first_block_no(sb, fex->fe_group) + + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); +} +#endif diff --git a/kernel/fs/ext4/migrate.c b/kernel/fs/ext4/migrate.c new file mode 100644 index 000000000..b52374e42 --- /dev/null +++ b/kernel/fs/ext4/migrate.c @@ -0,0 +1,672 @@ +/* + * Copyright IBM Corporation, 2007 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" + +/* + * The contiguous blocks details which can be + * represented by a single extent + */ +struct migrate_struct { + ext4_lblk_t first_block, last_block, curr_block; + ext4_fsblk_t first_pblock, last_pblock; +}; + +static int finish_range(handle_t *handle, struct inode *inode, + struct migrate_struct *lb) + +{ + int retval = 0, needed; + struct ext4_extent newext; + struct ext4_ext_path *path; + if (lb->first_pblock == 0) + return 0; + + /* Add the extent to temp inode*/ + newext.ee_block = cpu_to_le32(lb->first_block); + newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); + ext4_ext_store_pblock(&newext, lb->first_pblock); + /* Locking only for convinience since we are operating on temp inode */ + down_write(&EXT4_I(inode)->i_data_sem); + path = ext4_find_extent(inode, lb->first_block, NULL, 0); + if (IS_ERR(path)) { + retval = PTR_ERR(path); + path = NULL; + goto err_out; + } + + /* + * Calculate the credit needed to inserting this extent + * Since we are doing this in loop we may accumalate extra + * credit. But below we try to not accumalate too much + * of them by restarting the journal. + */ + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); + + /* + * Make sure the credit we accumalated is not really high + */ + if (needed && ext4_handle_has_enough_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS)) { + up_write((&EXT4_I(inode)->i_data_sem)); + retval = ext4_journal_restart(handle, needed); + down_write((&EXT4_I(inode)->i_data_sem)); + if (retval) + goto err_out; + } else if (needed) { + retval = ext4_journal_extend(handle, needed); + if (retval) { + /* + * IF not able to extend the journal restart the journal + */ + up_write((&EXT4_I(inode)->i_data_sem)); + retval = ext4_journal_restart(handle, needed); + down_write((&EXT4_I(inode)->i_data_sem)); + if (retval) + goto err_out; + } + } + retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); +err_out: + up_write((&EXT4_I(inode)->i_data_sem)); + ext4_ext_drop_refs(path); + kfree(path); + lb->first_pblock = 0; + return retval; +} + +static int update_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, struct migrate_struct *lb) +{ + int retval; + /* + * See if we can add on to the existing range (if it exists) + */ + if (lb->first_pblock && + (lb->last_pblock+1 == pblock) && + (lb->last_block+1 == lb->curr_block)) { + lb->last_pblock = pblock; + lb->last_block = lb->curr_block; + lb->curr_block++; + return 0; + } + /* + * Start a new range. + */ + retval = finish_range(handle, inode, lb); + lb->first_pblock = lb->last_pblock = pblock; + lb->first_block = lb->last_block = lb->curr_block; + lb->curr_block++; + return retval; +} + +static int update_ind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + lb->curr_block++; + } + } + put_bh(bh); + return retval; + +} + +static int update_dind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_ind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + /* Only update the file block number */ + lb->curr_block += max_entries; + } + } + put_bh(bh); + return retval; + +} + +static int update_tind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_dind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + /* Only update the file block number */ + lb->curr_block += max_entries * max_entries; + } + } + put_bh(bh); + return retval; + +} + +static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) +{ + int retval = 0, needed; + + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + /* + * We are freeing a blocks. During this we touch + * superblock, group descriptor and block bitmap. + * So allocate a credit of 3. We may update + * quota (user and group). + */ + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + if (ext4_journal_extend(handle, needed) != 0) + retval = ext4_journal_restart(handle, needed); + + return retval; +} + +static int free_dind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_tind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i, retval = 0; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + retval = free_dind_blocks(handle, + inode, tmp_idata[i]); + if (retval) { + put_bh(bh); + return retval; + } + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) +{ + int retval; + + /* ei->i_data[EXT4_IND_BLOCK] */ + if (i_data[0]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + + /* ei->i_data[EXT4_DIND_BLOCK] */ + if (i_data[1]) { + retval = free_dind_blocks(handle, inode, i_data[1]); + if (retval) + return retval; + } + + /* ei->i_data[EXT4_TIND_BLOCK] */ + if (i_data[2]) { + retval = free_tind_blocks(handle, inode, i_data[2]); + if (retval) + return retval; + } + return 0; +} + +static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, + struct inode *tmp_inode) +{ + int retval; + __le32 i_data[3]; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); + + /* + * One credit accounted for writing the + * i_data field of the original inode + */ + retval = ext4_journal_extend(handle, 1); + if (retval) { + retval = ext4_journal_restart(handle, 1); + if (retval) + goto err_out; + } + + i_data[0] = ei->i_data[EXT4_IND_BLOCK]; + i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; + i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; + + down_write(&EXT4_I(inode)->i_data_sem); + /* + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + /* + * We have the extent map build with the tmp inode. + * Now copy the i_data across + */ + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); + + /* + * Update i_blocks with the new blocks that got + * allocated while adding extents for extent index + * blocks. + * + * While converting to extents we need not + * update the orignal inode i_blocks for extent blocks + * via quota APIs. The quota update happened via tmp_inode already. + */ + spin_lock(&inode->i_lock); + inode->i_blocks += tmp_inode->i_blocks; + spin_unlock(&inode->i_lock); + up_write(&EXT4_I(inode)->i_data_sem); + + /* + * We mark the inode dirty after, because we decrement the + * i_blocks when freeing the indirect meta-data blocks + */ + retval = free_ind_block(handle, inode, i_data); + ext4_mark_inode_dirty(handle, inode); + +err_out: + return retval; +} + +static int free_ext_idx(handle_t *handle, struct inode *inode, + struct ext4_extent_idx *ix) +{ + int i, retval = 0; + ext4_fsblk_t block; + struct buffer_head *bh; + struct ext4_extent_header *eh; + + block = ext4_idx_pblock(ix); + bh = sb_bread(inode->i_sb, block); + if (!bh) + return -EIO; + + eh = (struct ext4_extent_header *)bh->b_data; + if (eh->eh_depth != 0) { + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + break; + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + return retval; +} + +/* + * Free the extent meta data blocks only + */ +static int free_ext_block(handle_t *handle, struct inode *inode) +{ + int i, retval = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; + struct ext4_extent_idx *ix; + if (eh->eh_depth == 0) + /* + * No extra blocks allocated for extent meta data + */ + return 0; + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + return retval; + } + return retval; +} + +int ext4_ext_migrate(struct inode *inode) +{ + handle_t *handle; + int retval = 0, i; + __le32 *i_data; + struct ext4_inode_info *ei; + struct inode *tmp_inode = NULL; + struct migrate_struct lb; + unsigned long max_entries; + __u32 goal; + uid_t owner[2]; + + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) + /* + * don't migrate fast symlink + */ + return retval; + + /* + * Worst case we can touch the allocation bitmaps, a bgd + * block, and a block to link in the orphan list. We do need + * need to worry about credits for modifying the quota inode. + */ + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, + 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + return retval; + } + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + owner[0] = i_uid_read(inode); + owner[1] = i_gid_read(inode); + tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), + S_IFREG, NULL, goal, owner); + if (IS_ERR(tmp_inode)) { + retval = PTR_ERR(tmp_inode); + ext4_journal_stop(handle); + return retval; + } + i_size_write(tmp_inode, i_size_read(inode)); + /* + * Set the i_nlink to zero so it will be deleted later + * when we drop inode reference. + */ + clear_nlink(tmp_inode); + + ext4_ext_tree_init(handle, tmp_inode); + ext4_orphan_add(handle, tmp_inode); + ext4_journal_stop(handle); + + /* + * start with one credit accounted for + * superblock modification. + * + * For the tmp_inode we already have committed the + * transaction that created the inode. Later as and + * when we add extents we extent the journal + */ + /* + * Even though we take i_mutex we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. + */ + down_read(&EXT4_I(inode)->i_data_sem); + ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) { + /* + * It is impossible to update on-disk structures without + * a handle, so just rollback in-core changes and live other + * work to orphan_list_cleanup() + */ + ext4_orphan_del(NULL, tmp_inode); + retval = PTR_ERR(handle); + goto out; + } + + ei = EXT4_I(inode); + i_data = ei->i_data; + memset(&lb, 0, sizeof(lb)); + + /* 32 bit block address 4 bytes */ + max_entries = inode->i_sb->s_blocksize >> 2; + for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { + if (i_data[i]) { + retval = update_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[i]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block++; + } + if (i_data[EXT4_IND_BLOCK]) { + retval = update_ind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block += max_entries; + if (i_data[EXT4_DIND_BLOCK]) { + retval = update_dind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block += max_entries * max_entries; + if (i_data[EXT4_TIND_BLOCK]) { + retval = update_tind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); + if (retval) + goto err_out; + } + /* + * Build the last extent + */ + retval = finish_range(handle, tmp_inode, &lb); +err_out: + if (retval) + /* + * Failure case delete the extent information with the + * tmp_inode + */ + free_ext_block(handle, tmp_inode); + else { + retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); + if (retval) + /* + * if we fail to swap inode data free the extent + * details of the tmp inode + */ + free_ext_block(handle, tmp_inode); + } + + /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ + if (ext4_journal_extend(handle, 1) != 0) + ext4_journal_restart(handle, 1); + + /* + * Mark the tmp_inode as of size zero + */ + i_size_write(tmp_inode, 0); + + /* + * set the i_blocks count to zero + * so that the ext4_evict_inode() does the + * right job + * + * We don't need to take the i_lock because + * the inode is not visible to user space. + */ + tmp_inode->i_blocks = 0; + + /* Reset the extent details */ + ext4_ext_tree_init(handle, tmp_inode); + ext4_journal_stop(handle); +out: + unlock_new_inode(tmp_inode); + iput(tmp_inode); + + return retval; +} + +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct ext4_extent_header *eh; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_fsblk_t blk; + handle_t *handle; + int ret; + + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return -EOPNOTSUPP; + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + if (len > EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i=0; i < len; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ext4_mark_inode_dirty(handle, inode); +errout: + ext4_journal_stop(handle); + up_write(&EXT4_I(inode)->i_data_sem); + return ret; +} diff --git a/kernel/fs/ext4/mmp.c b/kernel/fs/ext4/mmp.c new file mode 100644 index 000000000..8313ca332 --- /dev/null +++ b/kernel/fs/ext4/mmp.c @@ -0,0 +1,393 @@ +#include +#include +#include +#include +#include + +#include "ext4.h" + +/* Checksumming functions */ +static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct mmp_struct, mmp_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + + return cpu_to_le32(csum); +} + +static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!ext4_has_metadata_csum(sb)) + return 1; + + return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); +} + +static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!ext4_has_metadata_csum(sb)) + return; + + mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); +} + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +{ + struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); + ext4_mmp_csum_set(sb, mmp); + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + sb_end_write(sb); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + ext4_fsblk_t mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. */ + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (!*bh) + return -ENOMEM; + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } + if (unlikely(!*bh)) { + ext4_warning(sb, "Error while reading MMP block %llu", + mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC || + !ext4_mmp_csum_verify(sb, mmp)) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, unsigned int line, const char *msg) +{ + __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, + "MMP failure info: last update time: %llu, last update " + "node: %s, last update device: %s\n", + (long long unsigned int) le64_to_cpu(mmp->mmp_time), + mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = ((struct mmpd_data *) data)->sb; + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct mmp_struct *mmp; + ext4_fsblk_t mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. + */ + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + memcpy(mmp->mmp_nodename, init_utsname()->nodename, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop()) { + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(sb, bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. + */ + if (retval) { + if ((failed_writes % 60) == 0) + ext4_error(sb, "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_MMP)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext4_warning(sb, "kmmpd being stopped since filesystem " + "has been remounted as readonly."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(mmp_update_interval * + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + ext4_error(sb, "error reading MMP data: %d", + retval); + + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_seq != mmp_check->mmp_seq || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) { + dump_mmp_msg(sb, mmp_check, + "Error while updating MMP info. " + "The filesystem seems to have been" + " multiply mounted."); + ext4_error(sb, "abort"); + goto failed; + } + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MAX_CHECK_INTERVAL), + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(sb, bh); + +failed: + kfree(data); + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + new_seq = prandom_u32(); + } while (new_seq > EXT4_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. + */ +int ext4_multi_mount_protect(struct super_block *sb, + ext4_fsblk_t mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + struct mmpd_data *mmpd_data; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. + */ + if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) + mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, "MMP interval %u higher than expected, please" + " wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + seq = mmp_new_seq(); + mmp->mmp_seq = cpu_to_le32(seq); + + retval = write_mmp_block(sb, bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + if (!mmpd_data) { + ext4_warning(sb, "not enough memory for mmpd_data"); + goto failed; + } + mmpd_data->sb = sb; + mmpd_data->bh = bh; + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + bdevname(bh->b_bdev, + mmp->mmp_bdevname)); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = NULL; + kfree(mmpd_data); + ext4_warning(sb, "Unable to create kmmpd thread for %s.", + sb->s_id); + goto failed; + } + + return 0; + +failed: + brelse(bh); + return 1; +} + + diff --git a/kernel/fs/ext4/move_extent.c b/kernel/fs/ext4/move_extent.c new file mode 100644 index 000000000..370420bfa --- /dev/null +++ b/kernel/fs/ext4/move_extent.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato + * Akira Fujita + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "ext4_extents.h" + +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_find_extent wrapper. Return 0 on success, or a negative error value + * on failure. + */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **ppath) +{ + struct ext4_ext_path *path; + + path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + if (path[ext_depth(inode)].p_ext == NULL) { + ext4_ext_drop_refs(path); + kfree(path); + *ppath = NULL; + return -ENODATA; + } + *ppath = path; + return 0; +} + +/** + * ext4_double_down_write_data_sem - Acquire two inodes' write lock + * of i_data_sem + * + * Acquire write lock of i_data_sem of the two inodes + */ +void +ext4_double_down_write_data_sem(struct inode *first, struct inode *second) +{ + if (first < second) { + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + } else { + down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); + + } +} + +/** + * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write lock of i_data_sem of two inodes (orig and donor). + */ +void +ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode) +{ + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_check_coverage - Check that all extents in range has the same type + * + * @inode: inode in question + * @from: block offset of inode + * @count: block count to be checked + * @unwritten: extents expected to be unwritten + * @err: pointer to save error value + * + * Return 1 if all extents in range has expected type, and zero otherwise. + */ +static int +mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, + int unwritten, int *err) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ext; + int ret = 0; + ext4_lblk_t last = from + count; + while (from < last) { + *err = get_ext_path(inode, from, &path); + if (*err) + goto out; + ext = path[ext_depth(inode)].p_ext; + if (unwritten != ext4_ext_is_unwritten(ext)) + goto out; + from += ext4_ext_get_actual_len(ext); + ext4_ext_drop_refs(path); + } + ret = 1; +out: + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} + +/** + * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * @index1: page index + * @index2: page index + * @page: result page vector + * + * Grab two locked pages for inode's by inode order + */ +static int +mext_page_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index1, pgoff_t index2, struct page *page[2]) +{ + struct address_space *mapping[2]; + unsigned fl = AOP_FLAG_NOFS; + + BUG_ON(!inode1 || !inode2); + if (inode1 < inode2) { + mapping[0] = inode1->i_mapping; + mapping[1] = inode2->i_mapping; + } else { + pgoff_t tmp = index1; + index1 = index2; + index2 = tmp; + mapping[0] = inode2->i_mapping; + mapping[1] = inode1->i_mapping; + } + + page[0] = grab_cache_page_write_begin(mapping[0], index1, fl); + if (!page[0]) + return -ENOMEM; + + page[1] = grab_cache_page_write_begin(mapping[1], index2, fl); + if (!page[1]) { + unlock_page(page[0]); + page_cache_release(page[0]); + return -ENOMEM; + } + /* + * grab_cache_page_write_begin() may not wait on page's writeback if + * BDI not demand that. But it is reasonable to be very conservative + * here and explicitly wait on page's writeback + */ + wait_on_page_writeback(page[0]); + wait_on_page_writeback(page[1]); + if (inode1 > inode2) { + struct page *tmp; + tmp = page[0]; + page[0] = page[1]; + page[1] = tmp; + } + return 0; +} + +/* Force page buffers uptodate w/o dropping page's lock */ +static int +mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + sector_t block; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, block_start, block_end; + int i, err, nr = 0, partial = 0; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (PageUptodate(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + head = page_buffers(page); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + continue; + } + if (buffer_uptodate(bh)) + continue; + if (!buffer_mapped(bh)) { + err = ext4_get_block(inode, block, bh, 0); + if (err) { + SetPageError(page); + return err; + } + if (!buffer_mapped(bh)) { + zero_user(page, block_start, blocksize); + set_buffer_uptodate(bh); + continue; + } + } + BUG_ON(nr >= MAX_BUF_PER_PAGE); + arr[nr++] = bh; + } + /* No io required */ + if (!nr) + goto out; + + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); + if (err) + return err; + } + } +out: + if (!partial) + SetPageUptodate(page); + return 0; +} + +/** + * move_extent_per_page - Move extent data per page + * + * @o_filp: file structure of original file + * @donor_inode: donor inode + * @orig_page_offset: page index on original file + * @donor_page_offset: page index on donor file + * @data_offset_in_page: block index where data swapping starts + * @block_len_in_page: the number of blocks to be swapped + * @unwritten: orig extent is unwritten or not + * @err: pointer to save return value + * + * Save the data in original inode blocks and replace original inode extents + * with donor inode extents by calling ext4_swap_extents(). + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. + */ +static int +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, pgoff_t donor_page_offset, + int data_offset_in_page, + int block_len_in_page, int unwritten, int *err) +{ + struct inode *orig_inode = file_inode(o_filp); + struct page *pagep[2] = {NULL, NULL}; + handle_t *handle; + ext4_lblk_t orig_blk_offset, donor_blk_offset; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; + unsigned int tmp_data_size, data_size, replaced_size; + int err2, jblocks, retries = 0; + int replaced_count = 0; + int from = data_offset_in_page << orig_inode->i_blkbits; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + struct super_block *sb = orig_inode->i_sb; + + /* + * It needs twice the amount of ordinary journal buffers because + * inode and donor_inode may change each different metadata blocks. + */ +again: + *err = 0; + jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); + if (IS_ERR(handle)) { + *err = PTR_ERR(handle); + return 0; + } + + orig_blk_offset = orig_page_offset * blocks_per_page + + data_offset_in_page; + + donor_blk_offset = donor_page_offset * blocks_per_page + + data_offset_in_page; + + /* Calculate data_size */ + if ((orig_blk_offset + block_len_in_page - 1) == + ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { + /* Replace the last block */ + tmp_data_size = orig_inode->i_size & (blocksize - 1); + /* + * If data_size equal zero, it shows data_size is multiples of + * blocksize. So we set appropriate value. + */ + if (tmp_data_size == 0) + tmp_data_size = blocksize; + + data_size = tmp_data_size + + ((block_len_in_page - 1) << orig_inode->i_blkbits); + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; + + *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, + donor_page_offset, pagep); + if (unlikely(*err < 0)) + goto stop_journal; + /* + * If orig extent was unwritten it can become initialized + * at any time after i_data_sem was dropped, in order to + * serialize with delalloc we have recheck extent while we + * hold page's lock, if it is still the case data copy is not + * necessary, just swap data blocks between orig and donor. + */ + if (unwritten) { + ext4_double_down_write_data_sem(orig_inode, donor_inode); + /* If any of extents in range became initialized we have to + * fallback to data copying */ + unwritten = mext_check_coverage(orig_inode, orig_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; + + unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; + + if (!unwritten) { + ext4_double_up_write_data_sem(orig_inode, donor_inode); + goto data_copy; + } + if ((page_has_private(pagep[0]) && + !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && + !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto drop_data_sem; + } + replaced_count = ext4_swap_extents(handle, orig_inode, + donor_inode, orig_blk_offset, + donor_blk_offset, + block_len_in_page, 1, err); + drop_data_sem: + ext4_double_up_write_data_sem(orig_inode, donor_inode); + goto unlock_pages; + } +data_copy: + *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + if (*err) + goto unlock_pages; + + /* At this point all buffers in range are uptodate, old mapping layout + * is no longer required, try to drop it now. */ + if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto unlock_pages; + } + ext4_double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 1, err); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + if (*err) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto unlock_pages; + } + /* Perform all necessary steps similar write_begin()/write_end() + * but keeping in mind that i_size will not change */ + *err = __block_write_begin(pagep[0], from, replaced_size, + ext4_get_block); + if (!*err) + *err = block_commit_write(pagep[0], from, from + replaced_size); + + if (unlikely(*err < 0)) + goto repair_branches; + + /* Even in case of data=writeback it is reasonable to pin + * inode to transaction, to prevent unexpected data loss */ + *err = ext4_jbd2_file_inode(handle, orig_inode); + +unlock_pages: + unlock_page(pagep[0]); + page_cache_release(pagep[0]); + unlock_page(pagep[1]); + page_cache_release(pagep[1]); +stop_journal: + ext4_journal_stop(handle); + if (*err == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto again; + /* Buffer was busy because probably is pinned to journal transaction, + * force transaction commit may help to free it. */ + if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) + goto again; + return replaced_count; + +repair_branches: + /* + * This should never ever happen! + * Extents are swapped already, but we are not able to copy data. + * Try to swap extents to it's original places + */ + ext4_double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 0, &err2); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + if (replaced_count != block_len_in_page) { + EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), + "Unable to copy data block," + " data will be lost."); + *err = -EIO; + } + replaced_count = 0; + goto unlock_pages; +} + +/** + * mext_check_arguments - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved + * + * Check the arguments of ext4_move_extents() whether the files can be + * exchanged with each other. + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_check_arguments(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) +{ + __u64 orig_eof, donor_eof; + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + + orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; + donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; + + + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + return -EPERM; + + /* Ext4 move extent does not support swapfile */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be swapfile [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EBUSY; + } + + /* Ext4 move extent supports only extent based file */ + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: orig file is not extents " + "based file [ino:orig %lu]\n", orig_inode->i_ino); + return -EOPNOTSUPP; + } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: donor file is not extents " + "based file [ino:donor %lu]\n", donor_inode->i_ino); + return -EOPNOTSUPP; + } + + if ((!orig_inode->i_size) || (!donor_inode->i_size)) { + ext4_debug("ext4 move extent: File size is 0 byte\n"); + return -EINVAL; + } + + /* Start offset should be same */ + if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != + (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { + ext4_debug("ext4 move extent: orig and donor's start " + "offset are not alligned [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || + (*len > EXT_MAX_BLOCKS) || + (donor_start + *len >= EXT_MAX_BLOCKS) || + (orig_start + *len >= EXT_MAX_BLOCKS)) { + ext4_debug("ext4 move extent: Can't handle over [%u] blocks " + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + if (orig_eof < orig_start + *len - 1) + *len = orig_eof - orig_start; + if (donor_eof < donor_start + *len - 1) + *len = donor_eof - donor_start; + if (!*len) { + ext4_debug("ext4 move extent: len should not be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + return 0; +} + +/** + * ext4_move_extents - Exchange the specified range of a file + * + * @o_filp: file structure of the original file + * @d_filp: file structure of the donor file + * @orig_blk: start offset in block for orig + * @donor_blk: start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * This function returns 0 and moved block length is set in moved_len + * if succeed, otherwise returns error value. + * + */ +int +ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, + __u64 donor_blk, __u64 len, __u64 *moved_len) +{ + struct inode *orig_inode = file_inode(o_filp); + struct inode *donor_inode = file_inode(d_filp); + struct ext4_ext_path *path = NULL; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + ext4_lblk_t o_end, o_start = orig_blk; + ext4_lblk_t d_start = donor_blk; + int ret; + + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* orig and donor should be different inodes */ + if (orig_inode == donor_inode) { + ext4_debug("ext4 move extent: The argument files should not " + "be same inode [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* TODO: This is non obvious task to swap blocks for inodes with full + jornaling enabled */ + if (ext4_should_journal_data(orig_inode) || + ext4_should_journal_data(donor_inode)) { + return -EINVAL; + } + /* Protect orig and donor inodes against a truncate */ + lock_two_nondirectories(orig_inode, donor_inode); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(orig_inode); + ext4_inode_block_unlocked_dio(donor_inode); + inode_dio_wait(orig_inode); + inode_dio_wait(donor_inode); + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ + ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, + donor_blk, &len); + if (ret) + goto out; + o_end = o_start + len; + + while (o_start < o_end) { + struct ext4_extent *ex; + ext4_lblk_t cur_blk, next_blk; + pgoff_t orig_page_index, donor_page_index; + int offset_in_page; + int unwritten, cur_len; + + ret = get_ext_path(orig_inode, o_start, &path); + if (ret) + goto out; + ex = path[path->p_depth].p_ext; + next_blk = ext4_ext_next_allocated_block(path); + cur_blk = le32_to_cpu(ex->ee_block); + cur_len = ext4_ext_get_actual_len(ex); + /* Check hole before the start pos */ + if (cur_blk + cur_len - 1 < o_start) { + if (next_blk == EXT_MAX_BLOCKS) { + o_start = o_end; + ret = -ENODATA; + goto out; + } + d_start += next_blk - o_start; + o_start = next_blk; + continue; + /* Check hole after the start pos */ + } else if (cur_blk > o_start) { + /* Skip hole */ + d_start += cur_blk - o_start; + o_start = cur_blk; + /* Extent inside requested range ?*/ + if (cur_blk >= o_end) + goto out; + } else { /* in_range(o_start, o_blk, o_len) */ + cur_len += cur_blk - o_start; + } + unwritten = ext4_ext_is_unwritten(ex); + if (o_end - o_start < cur_len) + cur_len = o_end - o_start; + + orig_page_index = o_start >> (PAGE_CACHE_SHIFT - + orig_inode->i_blkbits); + donor_page_index = d_start >> (PAGE_CACHE_SHIFT - + donor_inode->i_blkbits); + offset_in_page = o_start % blocks_per_page; + if (cur_len > blocks_per_page- offset_in_page) + cur_len = blocks_per_page - offset_in_page; + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + ext4_double_up_write_data_sem(orig_inode, donor_inode); + /* Swap original branches with new branches */ + move_extent_per_page(o_filp, donor_inode, + orig_page_index, donor_page_index, + offset_in_page, cur_len, + unwritten, &ret); + ext4_double_down_write_data_sem(orig_inode, donor_inode); + if (ret < 0) + break; + o_start += cur_len; + d_start += cur_len; + } + *moved_len = o_start - orig_blk; + if (*moved_len > len) + *moved_len = len; + +out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + + ext4_ext_drop_refs(path); + kfree(path); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + ext4_inode_resume_unlocked_dio(orig_inode); + ext4_inode_resume_unlocked_dio(donor_inode); + unlock_two_nondirectories(orig_inode, donor_inode); + + return ret; +} diff --git a/kernel/fs/ext4/namei.c b/kernel/fs/ext4/namei.c new file mode 100644 index 000000000..814f3beb4 --- /dev/null +++ b/kernel/fs/ext4/namei.c @@ -0,0 +1,3918 @@ +/* + * linux/fs/ext4/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" + +#include "xattr.h" +#include "acl.h" + +#include +/* + * define how far ahead to read directories while searching them. + */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +static struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block) +{ + struct buffer_head *bh; + int err; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && + ((inode->i_size >> 10) >= + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext4_bread(handle, inode, *block, 1); + if (IS_ERR(bh)) + return bh; + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); + return ERR_PTR(err); + } + return bh; +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); + +typedef enum { + EITHER, INDEX, DIRENT +} dirblock_type_t; + +#define ext4_read_dirblock(inode, block, type) \ + __ext4_read_dirblock((inode), (block), (type), __LINE__) + +static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + ext4_lblk_t block, + dirblock_type_t type, + unsigned int line) +{ + struct buffer_head *bh; + struct ext4_dir_entry *dirent; + int is_dx_block = 0; + + bh = ext4_bread(NULL, inode, block, 0); + if (IS_ERR(bh)) { + __ext4_warning(inode->i_sb, __func__, line, + "error %ld reading directory block " + "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, + (unsigned long) block); + + return bh; + } + if (!bh) { + ext4_error_inode(inode, __func__, line, block, "Directory hole found"); + return ERR_PTR(-EIO); + } + dirent = (struct ext4_dir_entry *) bh->b_data; + /* Determine whether or not we have an index block */ + if (is_dx(inode)) { + if (block == 0) + is_dx_block = 1; + else if (ext4_rec_len_from_disk(dirent->rec_len, + inode->i_sb->s_blocksize) == + inode->i_sb->s_blocksize) + is_dx_block = 1; + } + if (!is_dx_block && type == INDEX) { + ext4_error_inode(inode, __func__, line, block, + "directory leaf block found instead of index block"); + return ERR_PTR(-EIO); + } + if (!ext4_has_metadata_csum(inode->i_sb) || + buffer_verified(bh)) + return bh; + + /* + * An empty leaf block can get mistaken for a index block; for + * this reason, we can only check the index checksum when the + * caller is sure it should be an index block. + */ + if (is_dx_block && type == INDEX) { + if (ext4_dx_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory index failed checksum"); + brelse(bh); + return ERR_PTR(-EIO); + } + } + if (!is_dx_block) { + if (ext4_dirent_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory block failed checksum"); + brelse(bh); + return ERR_PTR(-EIO); + } + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u16 offs; + u16 size; +}; + +/* + * This goes at the end of each htree block. + */ +struct dx_tail { + u32 dt_reserved; + __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); +static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); +static inline unsigned dx_get_hash(struct dx_entry *entry); +static void dx_set_hash(struct dx_entry *entry, unsigned value); +static unsigned dx_get_count(struct dx_entry *entries); +static unsigned dx_get_limit(struct dx_entry *entries); +static void dx_set_count(struct dx_entry *entries, unsigned value); +static void dx_set_limit(struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +static unsigned dx_node_limit(struct inode *dir); +static struct dx_frame *dx_probe(const struct qstr *d_name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame); +static void dx_release(struct dx_frame *frames); +static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, + struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, + struct dx_map_entry *offsets, int count, unsigned blocksize); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); +static void dx_insert_block(struct dx_frame *frame, + u32 hash, ext4_lblk_t block); +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir); +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* checksumming functions */ +void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) +{ + memset(t, 0, sizeof(struct ext4_dir_entry_tail)); + t->det_rec_len = ext4_rec_len_to_disk( + sizeof(struct ext4_dir_entry_tail), blocksize); + t->det_reserved_ft = EXT4_FT_DIR_CSUM; +} + +/* Walk through a dirent block to find a checksum "dirent" at the tail */ +static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, + struct ext4_dir_entry *de) +{ + struct ext4_dir_entry_tail *t; + +#ifdef PARANOID + struct ext4_dir_entry *d, *top; + + d = de; + top = (struct ext4_dir_entry *)(((void *)de) + + (EXT4_BLOCK_SIZE(inode->i_sb) - + sizeof(struct ext4_dir_entry_tail))); + while (d < top && d->rec_len) + d = (struct ext4_dir_entry *)(((void *)d) + + le16_to_cpu(d->rec_len)); + + if (d != top) + return NULL; + + t = (struct ext4_dir_entry_tail *)d; +#else + t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb)); +#endif + + if (t->det_reserved_zero1 || + le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + t->det_reserved_zero2 || + t->det_reserved_ft != EXT4_FT_DIR_CSUM) + return NULL; + + return t; +} + +static __le32 ext4_dirent_csum(struct inode *inode, + struct ext4_dir_entry *dirent, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + return cpu_to_le32(csum); +} + +static void warn_no_space_for_csum(struct inode *inode) +{ + ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " + "checksum. Please run e2fsck -D.", inode->i_ino); +} + +int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return 1; + + t = get_dirent_tail(inode, dirent); + if (!t) { + warn_no_space_for_csum(inode); + return 0; + } + + if (t->det_checksum != ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent)) + return 0; + + return 1; +} + +static void ext4_dirent_csum_set(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return; + + t = get_dirent_tail(inode, dirent); + if (!t) { + warn_no_space_for_csum(inode); + return; + } + + t->det_checksum = ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent); +} + +int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + struct ext4_dir_entry *dirent, + int *offset) +{ + struct ext4_dir_entry *dp; + struct dx_root_info *root; + int count_offset; + + if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + count_offset = 8; + else if (le16_to_cpu(dirent->rec_len) == 12) { + dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); + if (le16_to_cpu(dp->rec_len) != + EXT4_BLOCK_SIZE(inode->i_sb) - 12) + return NULL; + root = (struct dx_root_info *)(((void *)dp + 12)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; + count_offset = 32; + } else + return NULL; + + if (offset) + *offset = count_offset; + return (struct dx_countlimit *)(((void *)dirent) + count_offset); +} + +static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, + int count_offset, int count, struct dx_tail *t) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + __le32 save_csum; + int size; + + size = count_offset + (count * sizeof(struct dx_entry)); + save_csum = t->dt_checksum; + t->dt_checksum = 0; + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); + t->dt_checksum = save_csum; + + return cpu_to_le32(csum); +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return 1; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return 1; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + warn_no_space_for_csum(inode); + return 1; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, + count, t)) + return 0; + return 1; +} + +static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + warn_no_space_for_csum(inode); + return; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); +} + +static inline int ext4_handle_dirty_dx_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len, blocksize)); +} + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash(struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash(struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - + EXT4_DIR_REC_LEN(2) - infosize; + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); + return entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit(struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); + return entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index(char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk(KERN_DEBUG "%s index ", label); + for (i = 0; i < n; i++) { + printk("%x->%lu ", i ? dx_get_hash(entries + i) : + 0, (unsigned long)dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct inode *dir, + struct dx_hash_info *hinfo, + struct ext4_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + int len; + char *name; + struct ext4_str fname_crypto_str + = {.name = NULL, .len = 0}; + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; + + name = de->name; + len = de->name_len; + ctx = ext4_get_fname_crypto_ctx(dir, + EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + printk(KERN_WARNING "Error acquiring" + " crypto ctxt--skipping crypto\n"); + ctx = NULL; + } + if (ctx == NULL) { + /* Directory is not encrypted */ + ext4fs_dirhash(de->name, + de->name_len, &h); + printk("%*.s:(U)%x.%u ", len, + name, h.hash, + (unsigned) ((char *) de + - base)); + } else { + /* Directory is encrypted */ + res = ext4_fname_crypto_alloc_buffer( + ctx, de->name_len, + &fname_crypto_str); + if (res < 0) { + printk(KERN_WARNING "Error " + "allocating crypto " + "buffer--skipping " + "crypto\n"); + ext4_put_fname_crypto_ctx(&ctx); + ctx = NULL; + } + res = ext4_fname_disk_to_usr(ctx, NULL, de, + &fname_crypto_str); + if (res < 0) { + printk(KERN_WARNING "Error " + "converting filename " + "from disk to usr" + "\n"); + name = "??"; + len = 2; + } else { + name = fname_crypto_str.name; + len = fname_crypto_str.len; + } + ext4fs_dirhash(de->name, de->name_len, + &h); + printk("%*.s:(E)%x.%u ", len, name, + h.hash, (unsigned) ((char *) de + - base)); + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer( + &fname_crypto_str); + } +#else + int len = de->name_len; + char *name = de->name; + ext4fs_dirhash(de->name, de->name_len, &h); + printk("%*.s:%x.%u ", len, name, h.hash, + (unsigned) ((char *) de - base)); +#endif + } + space += EXT4_DIR_REC_LEN(de->name_len); + names++; + } + de = ext4_next_entry(de, size); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count(entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + ext4_lblk_t block = dx_get_block(entries); + ext4_lblk_t hash = i ? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + bh = ext4_bread(NULL,dir, block, 0); + if (!bh || IS_ERR(bh)) + continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) + bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse(bh); + } + if (bcount) + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + levels ? "" : " ", names, space/bcount, + (space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(const struct qstr *d_name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); + u32 hash; + + frame->bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; + + root = (struct dx_root *) frame->bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", + root->info.hash_version); + goto fail; + } + hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (d_name) { + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; + + /* Check if the directory is encrypted */ + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + ret_err = ERR_PTR(PTR_ERR(ctx)); + goto fail; + } + res = ext4_fname_usr_to_hash(ctx, d_name, hinfo); + if (res < 0) { + ret_err = ERR_PTR(res); + goto fail; + } + ext4_put_fname_crypto_ctx(&ctx); + } +#else + if (d_name) + ext4fs_dirhash(d_name->name, d_name->len, hinfo); +#endif + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, + root->info.info_length)) { + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + goto fail; + } + + dxtrace(printk("Look up %x", hash)); + while (1) { + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { + ext4_warning(dir->i_sb, + "dx entry: no count or count > limit"); + goto fail; + } + + p = entries + 1; + q = entries + count - 1; + while (p <= q) { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) { // linear search cross check + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + frame->entries = entries; + frame->at = at; + if (!indirect--) + return frame; + frame++; + frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(frame->bh)) { + ret_err = (struct dx_frame *) frame->bh; + frame->bh = NULL; + goto fail; + } + entries = ((struct dx_node *) frame->bh->b_data)->entries; + + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext4_warning(dir->i_sb, + "dx entry: limit != node limit"); + goto fail; + } + } +fail: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } + + if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) + ext4_warning(dir->i_sb, + "Corrupt dir inode %lu, running e2fsck is " + "recommended.", dir->i_ino); + return ret_err; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); + p++; + brelse(p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +static int htree_dirblock_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash) +{ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *top; + int err = 0, count = 0; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; + + dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", + (unsigned long)block)); + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + de = (struct ext4_dir_entry_2 *) bh->b_data; + top = (struct ext4_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - + EXT4_DIR_REC_LEN(0)); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Check if the directory is encrypted */ + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + brelse(bh); + return err; + } + if (ctx != NULL) { + err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) { + ext4_put_fname_crypto_ctx(&ctx); + brelse(bh); + return err; + } + } +#endif + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, + (block<i_sb)) + + ((char *)de - bh->b_data))) { + /* silently ignore the rest of the block */ + break; + } + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + if (ctx == NULL) { + /* Directory is not encrypted */ + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &tmp_str); + } else { + /* Directory is encrypted */ + err = ext4_fname_disk_to_usr(ctx, hinfo, de, + &fname_crypto_str); + if (err < 0) { + count = err; + goto errout; + } + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &fname_crypto_str); + } + if (err != 0) { + count = err; + goto errout; + } + count++; + } +errout: + brelse(bh); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); +#endif + return count; +} + + +/* + * This function fills a red-black tree with information from a + * directory. We start scanning the directory in hash order, starting + * at start_hash and start_minor_hash. + * + * This function returns the number of entries inserted into the tree, + * or a negative error code. + */ +int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash) +{ + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; + struct dx_frame frames[2], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; + int ret, err; + __u32 hashval; + struct ext4_str tmp_str; + + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + start_hash, start_minor_hash)); + dir = file_inode(dir_file); + if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { + hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + count = htree_inlinedir_to_tree(dir_file, dir, 0, + &hinfo, start_hash, + start_minor_hash, + &has_inline_data); + if (has_inline_data) { + *next_hash = ~0; + return count; + } + } + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); + *next_hash = ~0; + return count; + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; + frame = dx_probe(NULL, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); + + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 0, 0, + de, &tmp_str); + if (err != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + de = ext4_next_entry(de, dir->i_sb->s_blocksize); + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 2, 0, + de, &tmp_str); + if (err != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) +{ + return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + d_name, offset, res_dir); +} + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. + */ +static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + blocksize) { + if (de->name_len && de->inode) { + ext4fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = ((char *) de - base)>>2; + map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = ext4_next_entry(de, blocksize); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +/* + * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. + * + * `len <= EXT4_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. + */ +static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, + struct ext4_str *fname_crypto_str, + int len, const char * const name, + struct ext4_dir_entry_2 *de) +{ + int res; + + if (!de->inode) + return 0; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ctx) + return ext4_fname_match(ctx, fname_crypto_str, len, name, de); +#endif + if (len != de->name_len) + return 0; + res = memcmp(name, de->name, len); + return (res == 0) ? 1 : 0; +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, const struct qstr *d_name, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) +{ + struct ext4_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = d_name->name; + int namelen = d_name->len; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -1; + + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = search_buf + buf_size; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + if ((char *) de + de->name_len <= dlimit) { + res = ext4_match(ctx, &fname_crypto_str, namelen, + name, de); + if (res < 0) { + res = -1; + goto return_result; + } + if (res > 0) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, + bh->b_size, offset)) { + res = -1; + goto return_result; + } + *res_dir = de; + res = 1; + goto return_result; + } + + } + /* prevent looping on a bad block */ + de_len = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + if (de_len <= 0) { + res = -1; + goto return_result; + } + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } + + res = 0; +return_result: + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; +} + +static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + struct ext4_dir_entry *de) +{ + struct super_block *sb = dir->i_sb; + + if (!is_dx(dir)) + return 0; + if (block == 0) + return 1; + if (de->inode == 0 && + ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == + sb->s_blocksize) + return 1; + return 0; +} + +/* + * ext4_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head * ext4_find_entry (struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *inlined) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + ext4_lblk_t start, block, b; + const u8 *name = d_name->name; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + ext4_lblk_t nblocks; + int i, namelen; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = d_name->len; + if (namelen > EXT4_NAME_LEN) + return NULL; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + ret = ext4_find_inline_entry(dir, d_name, res_dir, + &has_inline_data); + if (has_inline_data) { + if (inlined) + *inlined = 1; + return ret; + } + } + + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } + if (is_dx(dir)) { + bh = ext4_dx_find_entry(dir, d_name, res_dir); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext4_getblk(NULL, dir, b++, 0); + if (unlikely(IS_ERR(bh))) { + if (ra_max == 0) + return bh; + break; + } + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + EXT4_ERROR_INODE(dir, "reading directory lblock %lu", + (unsigned long) block); + brelse(bh); + goto next; + } + if (!buffer_verified(bh) && + !is_dx_internal_node(dir, block, + (struct ext4_dir_entry *)bh->b_data) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(dir, "checksumming directory " + "block %lu", (unsigned long)block); + brelse(bh); + goto next; + } + set_buffer_verified(bh); + i = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT4_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir) +{ + struct super_block * sb = dir->i_sb; + struct dx_hash_info hinfo; + struct dx_frame frames[2], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + *res_dir = NULL; +#endif + frame = dx_probe(d_name, dir, &hinfo, frames); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; + do { + block = dx_get_block(frame->at); + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + goto errout; + + retval = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) + goto success; + brelse(bh); + if (retval == -1) { + bh = ERR_PTR(ERR_BAD_DX_DIR); + goto errout; + } + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hinfo.hash, frame, + frames, NULL); + if (retval < 0) { + ext4_warning(sb, + "error %d reading index page in directory #%lu", + retval, dir->i_ino); + bh = ERR_PTR(retval); + goto errout; + } + } while (retval == 1); + + bh = NULL; +errout: + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); +success: + dx_release(frames); + return bh; +} + +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; + + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return (struct dentry *) bh; + inode = NULL; + if (bh) { + __u32 ino = le32_to_cpu(de->inode); + brelse(bh); + if (!ext4_valid_inum(dir->i_sb, ino)) { + EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); + return ERR_PTR(-EIO); + } + if (unlikely(ino == dir->i_ino)) { + EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", + dentry); + return ERR_PTR(-EIO); + } + inode = ext4_iget_normal(dir->i_sb, ino); + if (inode == ERR_PTR(-ESTALE)) { + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); + return ERR_PTR(-EIO); + } + if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) && + !ext4_is_child_context_consistent_with_parent(dir, + inode)) { + iput(inode); + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu\n", + (unsigned long) dir->i_ino, + (unsigned long) inode->i_ino); + return ERR_PTR(-EPERM); + } + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext4_get_parent(struct dentry *child) +{ + __u32 ino; + static const struct qstr dotdot = QSTR_INIT("..", 2); + struct ext4_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL); + if (IS_ERR(bh)) + return (struct dentry *) bh; + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) { + EXT4_ERROR_INODE(d_inode(child), + "bad parent inode number: %u", ino); + return ERR_PTR(-EIO); + } + + return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino)); +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext4_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, + unsigned blocksize) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); + rec_len = EXT4_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext4_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. + */ +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +{ + struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { + rec_len = EXT4_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + prev = to; + to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. + */ +static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + ext4_lblk_t newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext4_dir_entry_2 *de = NULL, *de2; + struct ext4_dir_entry_tail *t; + int csum_size = 0; + int err = 0, i; + + if (ext4_has_metadata_csum(dir->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + brelse(*bh); + *bh = NULL; + return (struct ext4_dir_entry_2 *) bh2; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, *bh); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + dx_sort_map(map, count); + /* Split the existing block in the middle, size-wise */ + size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* map index at which we will split */ + split = count - move; + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", + (unsigned long)dx_get_block(frame->at), + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split, + blocksize); + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, + blocksize); + de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - + (char *) de2, + blocksize); + if (csum_size) { + t = EXT4_DIRENT_TAIL(data2, blocksize); + initialize_dirent_tail(t, blocksize); + + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, + blocksize, 1)); + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) { + swap(*bh, bh2); + de = de2; + } + dx_insert_block(frame, hash2 + continued, newblock); + err = ext4_handle_dirty_dirent_node(handle, dir, bh2); + if (err) + goto journal_error; + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); + if (err) + goto journal_error; + brelse(bh2); + dxtrace(dx_show_index("frame", frame->entries)); + return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext4_std_error(dir->i_sb, err); + return ERR_PTR(err); +} + +int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de) +{ + struct ext4_dir_entry_2 *de; + unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + int nlen, rlen; + unsigned int offset = 0; + char *top; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -1; + + if (ctx != NULL) { + /* Calculate record length needed to store the entry */ + res = ext4_fname_crypto_namelen_on_disk(ctx, namelen); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return res; + } + reclen = EXT4_DIR_REC_LEN(res); + } + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) { + res = -EIO; + goto return_result; + } + /* Provide crypto context and crypto buffer to ext4 match */ + res = ext4_match(ctx, &fname_crypto_str, namelen, name, de); + if (res < 0) + goto return_result; + if (res > 0) { + res = -EEXIST; + goto return_result; + } + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? rlen - nlen : rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + + if ((char *) de > top) + res = -ENOSPC; + else { + *dest_de = de; + res = 0; + } +return_result: + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; +} + +int ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const struct qstr *iname, + const char *name, int namelen) +{ + + int nlen, rlen; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + struct ext4_str tmp_str; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -EIO; + /* By default, the input name would be written to the disk */ + tmp_str.name = (unsigned char *)name; + tmp_str.len = namelen; + if (ctx != NULL) { + /* Directory is encrypted */ + res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return -ENOMEM; + } + res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; + } + tmp_str.name = fname_crypto_str.name; + tmp_str.len = fname_crypto_str.len; + } + + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = + (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); + de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = tmp_str.len; + + memcpy(de->name, tmp_str.name, tmp_str.len); + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return 0; +} + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext4_dir_entry_2 *de, + struct buffer_head *bh) +{ + struct inode *dir = d_inode(dentry->d_parent); + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; + int err; + + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (!de) { + err = ext4_find_dest_de(dir, inode, + bh, bh->b_data, blocksize - csum_size, + name, namelen, &de); + if (err) + return err; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_std_error(dir->i_sb, err); + return err; + } + + /* By now the buffer is marked for journaling. Due to crypto operations, + * the following function call may fail */ + err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name, + name, namelen); + if (err < 0) + return err; + + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (err) + ext4_std_error(dir->i_sb, err); + return 0; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = d_inode(dentry->d_parent); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; +#else + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; +#endif + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2; + struct ext4_dir_entry_tail *t; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + ext4_lblk_t block; + struct fake_dirent *fde; + int csum_size = 0; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); +#endif + + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + BUFFER_TRACE(bh, "get_write_access"); + retval = ext4_journal_get_write_access(handle, bh); + if (retval) { + ext4_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len, blocksize)); + if ((char *) de >= (((char *) root) + blocksize)) { + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); + brelse(bh); + return -EIO; + } + len = ((char *) root) + (blocksize - csum_size) - (char *) de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block); + if (IS_ERR(bh2)) { + brelse(bh); + return PTR_ERR(bh2); + } + ext4_set_inode_flag(dir, EXT4_INODE_INDEX); + data1 = bh2->b_data; + + memcpy (data1, de, len); + de = (struct ext4_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) + de = de2; + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, + blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + + /* Initialize the root; the dot dirents already exist */ + de = (struct ext4_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), + blocksize); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + ext4_mark_inode_dirty(handle, dir); + brelse(bh); + return res; + } + ext4_put_fname_crypto_ctx(&ctx); +#else + ext4fs_dirhash(name, namelen, &hinfo); +#endif + memset(frames, 0, sizeof(frames)); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + + retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); + if (retval) + goto out_frames; + retval = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (retval) + goto out_frames; + + de = do_split(handle,dir, &bh, frame, &hinfo); + if (IS_ERR(de)) { + retval = PTR_ERR(de); + goto out_frames; + } + dx_release(frames); + + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; +out_frames: + /* + * Even if the block split failed, we have to properly write + * out all the changes we did so far. Otherwise we can end up + * with corrupted filesystem. + */ + ext4_mark_inode_dirty(handle, dir); + dx_release(frames); + return retval; +} + +/* + * ext4_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext4_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head *bh = NULL; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + struct super_block *sb; + int retval; + int dx_fallback=0; + unsigned blocksize; + ext4_lblk_t block, blocks; + int csum_size = 0; + + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + + if (ext4_has_inline_data(dir)) { + retval = ext4_try_add_inline_entry(handle, dentry, inode); + if (retval < 0) + return retval; + if (retval == 1) { + retval = 0; + goto out; + } + } + + if (is_dx(dir)) { + retval = ext4_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0; block < blocks; block++) { + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) + goto out; + + if (blocks == 1 && !dx_fallback && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + retval = make_indexed_dir(handle, dentry, inode, bh); + bh = NULL; /* make_indexed_dir releases bh */ + goto out; + } + brelse(bh); + } + bh = ext4_append(handle, dir, &block); + if (IS_ERR(bh)) + return PTR_ERR(bh); + de = (struct ext4_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); +out: + brelse(bh); + if (retval == 0) + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = d_inode(dentry->d_parent); + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; + int err; + + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); + entries = frame->entries; + at = frame->at; + bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto cleanup; + } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext4_warning(sb, "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); + goto cleanup; + } + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count(entries, icount1); + dx_set_count(entries2, icount2); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block(frames + 0, hash2, newblock); + dxtrace(dx_show_index("node", frames[1].entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext4_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; + } + } + de = do_split(handle, dir, &bh, frame, &hinfo); + if (IS_ERR(de)) { + err = PTR_ERR(de); + goto cleanup; + } + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup; + +journal_error: + ext4_std_error(dir->i_sb, err); +cleanup: + brelse(bh); + dx_release(frames); + return err; +} + +/* + * ext4_generic_delete_entry deletes a directory entry by merging it + * with the previous entry + */ +int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size) +{ + struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; + int i; + + i = 0; + pde = NULL; + de = (struct ext4_dir_entry_2 *)entry_buf; + while (i < buf_size - csum_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, i)) + return -EIO; + if (de == de_del) { + if (pde) + pde->rec_len = ext4_rec_len_to_disk( + ext4_rec_len_from_disk(pde->rec_len, + blocksize) + + ext4_rec_len_from_disk(de->rec_len, + blocksize), + blocksize); + else + de->inode = 0; + dir->i_version++; + return 0; + } + i += ext4_rec_len_from_disk(de->rec_len, blocksize); + pde = de; + de = ext4_next_entry(de, blocksize); + } + return -ENOENT; +} + +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + int err, csum_size = 0; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + err = ext4_delete_inline_entry(handle, dir, de_del, bh, + &has_inline_data); + if (has_inline_data) + return err; + } + + if (ext4_has_metadata_csum(dir->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, + bh, bh->b_data, + dir->i_sb->s_blocksize, csum_size); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (unlikely(err)) + goto out; + + return 0; +out: + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + +/* + * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, + * since this indicates that nlinks count was previously 1. + */ +static void ext4_inc_count(handle_t *handle, struct inode *inode) +{ + inc_nlink(inode); + if (is_dx(inode) && inode->i_nlink > 1) { + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { + set_nlink(inode, 1); + EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_DIR_NLINK); + } + } +} + +/* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +static void ext4_dec_count(handle_t *handle, struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); +} + + +static int ext4_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + unlock_new_inode(inode); + d_instantiate(dentry, inode); + return 0; + } + drop_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + handle_t *handle; + struct inode *inode; + int err, credits, retries = 0; + + dquot_initialize(dir); + + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + err = 0; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (!err && (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) { + err = ext4_inherit_context(dir, inode); + if (err) { + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + } + } +#endif + if (!err) + err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + } + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, credits, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + inode->i_op = &ext4_special_inode_operations; + err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + } + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + inode = ext4_new_inode_start_handle(dir, mode, + NULL, 0, NULL, + EXT4_HT_DIR, + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT4_XATTR_TRANS_BLOCKS); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + d_tmpfile(dentry, inode); + err = ext4_orphan_add(handle, inode); + if (err) + goto err_unlock_inode; + mark_inode_dirty(inode); + unlock_new_inode(inode); + } + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_unlock_inode: + ext4_journal_stop(handle); + unlock_new_inode(inode); + return err; +} + +struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) +{ + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), + blocksize); + else + de->rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(de->name_len), blocksize); + strcpy(de->name, ".."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); +} + +static int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode) +{ + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + ext4_lblk_t block = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; + int err; + + if (ext4_has_metadata_csum(dir->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + err = ext4_try_create_inline_dir(handle, dir, inode); + if (err < 0 && err != -ENOSPC) + goto out; + if (!err) + goto out; + } + + inode->i_size = 0; + dir_block = ext4_append(handle, inode, &block); + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + de = (struct ext4_dir_entry_2 *)dir_block->b_data; + ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); + set_nlink(inode, 2); + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + brelse(dir_block); + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, credits, retries = 0; + + if (EXT4_DIR_LINK_MAX(dir)) + return -EMLINK; + + dquot_initialize(dir); + + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, + &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + err = ext4_init_new_dir(handle, dir, inode); + if (err) + goto out_clear_inode; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) { + err = ext4_inherit_context(dir, inode); + if (err) + goto out_clear_inode; + } +#endif + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); + if (err) { +out_clear_inode: + clear_nlink(inode); + unlock_new_inode(inode); + ext4_mark_inode_dirty(handle, inode); + iput(inode); + goto out_stop; + } + ext4_inc_count(handle, dir); + ext4_update_dx_flag(dir); + err = ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + unlock_new_inode(inode); + d_instantiate(dentry, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + +out_stop: + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ext4_empty_dir(struct inode *inode) +{ + unsigned int offset; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *de1; + struct super_block *sb; + int err = 0; + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + + err = empty_inline_dir(inode, &has_inline_data); + if (has_inline_data) + return err; + } + + sb = inode->i_sb; + if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { + EXT4_ERROR_INODE(inode, "invalid size"); + return 1; + } + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) + return 1; + + de = (struct ext4_dir_entry_2 *) bh->b_data; + de1 = ext4_next_entry(de, sb->s_blocksize); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp(".", de->name) || + strcmp("..", de1->name)) { + ext4_warning(inode->i_sb, + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse(bh); + return 1; + } + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + + ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); + de = ext4_next_entry(de1, sb->s_blocksize); + while (offset < inode->i_size) { + if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + unsigned int lblock; + err = 0; + brelse(bh); + lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); + bh = ext4_read_dirblock(inode, lblock, EITHER); + if (IS_ERR(bh)) + return 1; + de = (struct ext4_dir_entry_2 *) bh->b_data; + } + if (ext4_check_dir_entry(inode, NULL, de, bh, + bh->b_data, bh->b_size, offset)) { + de = (struct ext4_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + if (le32_to_cpu(de->inode)) { + brelse(bh); + return 0; + } + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); + } + brelse(bh); + return 1; +} + +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it. + */ +int ext4_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_iloc iloc; + int err = 0, rc; + bool dirty = false; + + if (!sbi->s_journal || is_bad_inode(inode)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* + * Exit early if inode already is on orphan list. This is a big speedup + * since we don't have to contend on the global s_orphan_lock. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan)) + return 0; + + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race + */ + J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out; + + mutex_lock(&sbi->s_orphan_lock); + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. + */ + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_super(handle, sb); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. + */ + mutex_lock(&sbi->s_orphan_lock); + list_del(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out: + ext4_std_error(sb, err); + return err; +} + +/* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext4_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 ino_next; + struct ext4_iloc iloc; + int err = 0; + + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* Do this quick check before taking global s_orphan_lock. */ + if (list_empty(&ei->i_orphan)) + return 0; + + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + + mutex_lock(&sbi->s_orphan_lock); + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + prev = ei->i_orphan.prev; + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. */ + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_err; + } + + ino_next = NEXT_ORPHAN(inode); + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %u\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + mutex_unlock(&sbi->s_orphan_lock); + err = ext4_handle_dirty_super(handle, inode->i_sb); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %u\n", + i_prev->i_ino, ino_next); + err = ext4_reserve_inode_write(handle, i_prev, &iloc2); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + NEXT_ORPHAN(i_prev) = ino_next; + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out_err: + ext4_std_error(inode->i_sb, err); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext4_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle = NULL; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + dquot_initialize(dir); + dquot_initialize(d_inode(dentry)); + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) + goto end_rmdir; + + inode = d_inode(dentry); + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!ext4_empty_dir(inode)) + goto end_rmdir; + + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rmdir; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (!EXT4_DIR_LINK_EMPTY(inode)) + ext4_warning(inode->i_sb, + "empty directory has too many links (%d)", + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. */ + inode->i_size = 0; + ext4_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_dec_count(handle, dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + +end_rmdir: + brelse(bh); + if (handle) + ext4_journal_stop(handle); + return retval; +} + +static int ext4_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle = NULL; + + trace_ext4_unlink_enter(dir, dentry); + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + dquot_initialize(dir); + dquot_initialize(d_inode(dentry)); + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) + goto end_unlink; + + inode = d_inode(dentry); + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_unlink; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + if (!inode->i_nlink) { + ext4_warning(inode->i_sb, + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + set_nlink(inode, 1); + } + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + drop_nlink(inode); + if (!inode->i_nlink) + ext4_orphan_add(handle, inode); + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +end_unlink: + brelse(bh); + if (handle) + ext4_journal_stop(handle); + trace_ext4_unlink_exit(dentry, retval); + return retval; +} + +static int ext4_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) +{ + handle_t *handle; + struct inode *inode; + int err, len = strlen(symname); + int credits; + bool encryption_required; + struct ext4_str disk_link; + struct ext4_encrypted_symlink_data *sd = NULL; + + disk_link.len = len + 1; + disk_link.name = (char *) symname; + + encryption_required = (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); + if (encryption_required) + disk_link.len = encrypted_symlink_data_len(len) + 1; + if (disk_link.len > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + dquot_initialize(dir); + + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; + } + + inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + if (IS_ERR(inode)) { + if (handle) + ext4_journal_stop(handle); + return PTR_ERR(inode); + } + + if (encryption_required) { + struct ext4_fname_crypto_ctx *ctx = NULL; + struct qstr istr; + struct ext4_str ostr; + + sd = kzalloc(disk_link.len, GFP_NOFS); + if (!sd) { + err = -ENOMEM; + goto err_drop_inode; + } + err = ext4_inherit_context(dir, inode); + if (err) + goto err_drop_inode; + ctx = ext4_get_fname_crypto_ctx(inode, + inode->i_sb->s_blocksize); + if (IS_ERR_OR_NULL(ctx)) { + /* We just set the policy, so ctx should not be NULL */ + err = (ctx == NULL) ? -EIO : PTR_ERR(ctx); + goto err_drop_inode; + } + istr.name = (const unsigned char *) symname; + istr.len = len; + ostr.name = sd->encrypted_path; + err = ext4_fname_usr_to_disk(ctx, &istr, &ostr); + ext4_put_fname_crypto_ctx(&ctx); + if (err < 0) + goto err_drop_inode; + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *) sd; + } + + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { + inode->i_op = &ext4_symlink_inode_operations; + ext4_set_aops(inode); + /* + * We cannot call page_symlink() with transaction started + * because it calls into ext4_write_begin() which can wait + * for transaction commit if we are running out of space + * and thus we deadlock. So we have to stop transaction now + * and restart it when symlink contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. + */ + drop_nlink(inode); + err = ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); + handle = NULL; + if (err) + goto err_drop_inode; + err = __page_symlink(inode, disk_link.name, disk_link.len, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS + * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + goto err_drop_inode; + } + set_nlink(inode, 1); + err = ext4_orphan_del(handle, inode); + if (err) + goto err_drop_inode; + } else { + /* clear the extent format for fast symlink */ + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + inode->i_op = encryption_required ? + &ext4_symlink_inode_operations : + &ext4_fast_symlink_inode_operations; + memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, + disk_link.len); + inode->i_size = disk_link.len - 1; + } + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + if (handle) + ext4_journal_stop(handle); + kfree(sd); + return err; +err_drop_inode: + if (handle) + ext4_journal_stop(handle); + kfree(sd); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = d_inode(old_dentry); + int err, retries = 0; + + if (inode->i_nlink >= EXT4_LINK_MAX) + return -EMLINK; + if (ext4_encrypted_inode(dir) && + !ext4_is_child_context_consistent_with_parent(dir, inode)) + return -EPERM; + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_HT_DIR, + (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode->i_ctime = ext4_current_time(inode); + ext4_inc_count(handle, inode); + ihold(inode); + + err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext4_orphan_del(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + + +/* + * Try to find buffer head where contains the parent block. + * It should be the inode block if it is inlined or the 1st block + * if it is a normal dir. + */ +static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, + struct inode *inode, + int *retval, + struct ext4_dir_entry_2 **parent_de, + int *inlined) +{ + struct buffer_head *bh; + + if (!ext4_has_inline_data(inode)) { + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) { + *retval = PTR_ERR(bh); + return NULL; + } + *parent_de = ext4_next_entry( + (struct ext4_dir_entry_2 *)bh->b_data, + inode->i_sb->s_blocksize); + return bh; + } + + *inlined = 1; + return ext4_get_first_inline_block(inode, parent_de, retval); +} + +struct ext4_renament { + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool is_dir; + int dir_nlink_delta; + + /* entry for "dentry" */ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + int inlined; + + /* entry for ".." in inode if it's a directory */ + struct buffer_head *dir_bh; + struct ext4_dir_entry_2 *parent_de; + int dir_inlined; +}; + +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, + &retval, &ent->parent_de, + &ent->dir_inlined); + if (!ent->dir_bh) + return retval; + if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) + return -EIO; + BUFFER_TRACE(ent->dir_bh, "get_write_access"); + return ext4_journal_get_write_access(handle, ent->dir_bh); +} + +static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, + unsigned dir_ino) +{ + int retval; + + ent->parent_de->inode = cpu_to_le32(dir_ino); + BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); + if (!ent->dir_inlined) { + if (is_dx(ent->inode)) { + retval = ext4_handle_dirty_dx_node(handle, + ent->inode, + ent->dir_bh); + } else { + retval = ext4_handle_dirty_dirent_node(handle, + ent->inode, + ent->dir_bh); + } + } else { + retval = ext4_mark_inode_dirty(handle, ent->inode); + } + if (retval) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + return 0; +} + +static int ext4_setent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + int retval; + + BUFFER_TRACE(ent->bh, "get write access"); + retval = ext4_journal_get_write_access(handle, ent->bh); + if (retval) + return retval; + ent->de->inode = cpu_to_le32(ino); + if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb, + EXT4_FEATURE_INCOMPAT_FILETYPE)) + ent->de->file_type = file_type; + ent->dir->i_version++; + ent->dir->i_ctime = ent->dir->i_mtime = + ext4_current_time(ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); + if (!ent->inlined) { + retval = ext4_handle_dirty_dirent_node(handle, + ent->dir, ent->bh); + if (unlikely(retval)) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + } + brelse(ent->bh); + ent->bh = NULL; + + return 0; +} + +static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, + const struct qstr *d_name) +{ + int retval = -ENOENT; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + + bh = ext4_find_entry(dir, d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (bh) { + retval = ext4_delete_entry(handle, dir, de, bh); + brelse(bh); + } + return retval; +} + +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, + int force_reread) +{ + int retval; + /* + * ent->de could have moved from under us during htree split, so make + * sure that we are deleting the right entry. We might also be pointing + * to a stale entry in the unused part of ent->bh so just checking inum + * and the name isn't enough. + */ + if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || + ent->de->name_len != ent->dentry->d_name.len || + strncmp(ent->de->name, ent->dentry->d_name.name, + ent->de->name_len) || + force_reread) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } else { + retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); + if (retval == -ENOENT) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } + } + + if (retval) { + ext4_warning(ent->dir->i_sb, + "Deleting old file (%lu), %d, error=%d", + ent->dir->i_ino, ent->dir->i_nlink, retval); + } +} + +static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) +{ + if (ent->dir_nlink_delta) { + if (ent->dir_nlink_delta == -1) + ext4_dec_count(handle, ent->dir); + else + ext4_inc_count(handle, ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + } +} + +static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, + int credits, handle_t **h) +{ + struct inode *wh; + handle_t *handle; + int retries = 0; + + /* + * for inode block, sb block, group summaries, + * and inode bitmap + */ + credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS + 4); +retry: + wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE, + &ent->dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + + handle = ext4_journal_current_handle(); + if (IS_ERR(wh)) { + if (handle) + ext4_journal_stop(handle); + if (PTR_ERR(wh) == -ENOSPC && + ext4_should_retry_alloc(ent->dir->i_sb, &retries)) + goto retry; + } else { + *h = handle; + init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); + wh->i_op = &ext4_special_inode_operations; + } + return wh; +} + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + * + * n.b. old_{dentry,inode) refers to the source dentry/inode + * while new_{dentry,inode) refers to the destination dentry/inode + * This comes from rename(const char *oldpath, const char *newpath) + */ +static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = d_inode(old_dentry), + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = d_inode(new_dentry), + }; + int force_reread; + int retval; + struct inode *whiteout = NULL; + int credits; + u8 old_file_type; + + dquot_initialize(old.dir); + dquot_initialize(new.dir); + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (new.inode) + dquot_initialize(new.inode); + + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + retval = -ENOENT; + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) + goto end_rename; + + if ((old.dir != new.dir) && + ext4_encrypted_inode(new.dir) && + !ext4_is_child_context_consistent_with_parent(new.dir, + old.inode)) { + retval = -EPERM; + goto end_rename; + } + + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + new.bh = NULL; + goto end_rename; + } + if (new.bh) { + if (!new.inode) { + brelse(new.bh); + new.bh = NULL; + } + } + if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old.inode); + + credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (!(flags & RENAME_WHITEOUT)) { + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rename; + } + } else { + whiteout = ext4_whiteout_for_rename(&old, credits, &handle); + if (IS_ERR(whiteout)) { + retval = PTR_ERR(whiteout); + whiteout = NULL; + goto end_rename; + } + } + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + if (new.inode) { + retval = -ENOTEMPTY; + if (!ext4_empty_dir(new.inode)) + goto end_rename; + } else { + retval = -EMLINK; + if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) + goto end_rename; + } + retval = ext4_rename_dir_prepare(handle, &old); + if (retval) + goto end_rename; + } + /* + * If we're renaming a file within an inline_data dir and adding or + * setting the new dirent causes a conversion from inline_data to + * extents/blockmap, we need to force the dirent delete code to + * re-read the directory, or else we end up trying to delete a dirent + * from what is now the extent tree root (or a block map). + */ + force_reread = (new.dir->i_ino == old.dir->i_ino && + ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); + + old_file_type = old.de->file_type; + if (whiteout) { + /* + * Do this before adding a new entry, so the old entry is sure + * to be still pointing to the valid old entry. + */ + retval = ext4_setent(handle, &old, whiteout->i_ino, + EXT4_FT_CHRDEV); + if (retval) + goto end_rename; + ext4_mark_inode_dirty(handle, whiteout); + } + if (!new.bh) { + retval = ext4_add_entry(handle, new.dentry, old.inode); + if (retval) + goto end_rename; + } else { + retval = ext4_setent(handle, &new, + old.inode->i_ino, old_file_type); + if (retval) + goto end_rename; + } + if (force_reread) + force_reread = !ext4_test_inode_flag(new.dir, + EXT4_INODE_INLINE_DATA); + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old.inode->i_ctime = ext4_current_time(old.inode); + ext4_mark_inode_dirty(handle, old.inode); + + if (!whiteout) { + /* + * ok, that's it + */ + ext4_rename_delete(handle, &old, force_reread); + } + + if (new.inode) { + ext4_dec_count(handle, new.inode); + new.inode->i_ctime = ext4_current_time(new.inode); + } + old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + ext4_update_dx_flag(old.dir); + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + + ext4_dec_count(handle, old.dir); + if (new.inode) { + /* checked ext4_empty_dir above, can't have another + * parent, ext4_dec_count() won't work for many-linked + * dirs */ + clear_nlink(new.inode); + } else { + ext4_inc_count(handle, new.dir); + ext4_update_dx_flag(new.dir); + ext4_mark_inode_dirty(handle, new.dir); + } + } + ext4_mark_inode_dirty(handle, old.dir); + if (new.inode) { + ext4_mark_inode_dirty(handle, new.inode); + if (!new.inode->i_nlink) + ext4_orphan_add(handle, new.inode); + } + retval = 0; + +end_rename: + brelse(old.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (whiteout) { + if (retval) + drop_nlink(whiteout); + unlock_new_inode(whiteout); + iput(whiteout); + } + if (handle) + ext4_journal_stop(handle); + return retval; +} + +static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = d_inode(old_dentry), + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = d_inode(new_dentry), + }; + u8 new_file_type; + int retval; + + dquot_initialize(old.dir); + dquot_initialize(new.dir); + + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, + &old.de, &old.inlined); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + retval = -ENOENT; + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) + goto end_rename; + + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + new.bh = NULL; + goto end_rename; + } + + /* RENAME_EXCHANGE case: old *and* new must both exist */ + if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) + goto end_rename; + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rename; + } + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + old.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &old); + if (retval) + goto end_rename; + } + if (S_ISDIR(new.inode->i_mode)) { + new.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &new); + if (retval) + goto end_rename; + } + + /* + * Other than the special case of overwriting a directory, parents' + * nlink only needs to be modified if this is a cross directory rename. + */ + if (old.dir != new.dir && old.is_dir != new.is_dir) { + old.dir_nlink_delta = old.is_dir ? -1 : 1; + new.dir_nlink_delta = -old.dir_nlink_delta; + retval = -EMLINK; + if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || + (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) + goto end_rename; + } + + new_file_type = new.de->file_type; + retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); + if (retval) + goto end_rename; + + retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); + if (retval) + goto end_rename; + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old.inode->i_ctime = ext4_current_time(old.inode); + new.inode->i_ctime = ext4_current_time(new.inode); + ext4_mark_inode_dirty(handle, old.inode); + ext4_mark_inode_dirty(handle, new.inode); + + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + } + if (new.dir_bh) { + retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); + if (retval) + goto end_rename; + } + ext4_update_dir_count(handle, &old); + ext4_update_dir_count(handle, &new); + retval = 0; + +end_rename: + brelse(old.dir_bh); + brelse(new.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (handle) + ext4_journal_stop(handle); + return retval; +} + +static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return ext4_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags); +} + +/* + * directories can handle most operations... + */ +const struct inode_operations ext4_dir_inode_operations = { + .create = ext4_create, + .lookup = ext4_lookup, + .link = ext4_link, + .unlink = ext4_unlink, + .symlink = ext4_symlink, + .mkdir = ext4_mkdir, + .rmdir = ext4_rmdir, + .mknod = ext4_mknod, + .tmpfile = ext4_tmpfile, + .rename2 = ext4_rename2, + .setattr = ext4_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, + .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, + .fiemap = ext4_fiemap, +}; + +const struct inode_operations ext4_special_inode_operations = { + .setattr = ext4_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, + .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, +}; diff --git a/kernel/fs/ext4/page-io.c b/kernel/fs/ext4/page-io.c new file mode 100644 index 000000000..5765f88b3 --- /dev/null +++ b/kernel/fs/ext4/page-io.c @@ -0,0 +1,529 @@ +/* + * linux/fs/ext4/page-io.c + * + * This contains the new page_io functions for ext4 + * + * Written by Theodore Ts'o, 2010. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +static struct kmem_cache *io_end_cachep; + +int __init ext4_init_pageio(void) +{ + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); + if (io_end_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_pageio(void) +{ + kmem_cache_destroy(io_end_cachep); +} + +/* + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... + */ +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_finish_bio(struct bio *bio) +{ + int i; + int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct page *data_page = NULL; + struct ext4_crypto_ctx *ctx = NULL; +#endif + struct buffer_head *bh, *head; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (!page->mapping) { + /* The bounce data pages are unmapped. */ + data_page = page; + ctx = (struct ext4_crypto_ctx *)page_private(data_page); + page = ctx->control_page; + } +#endif + + if (error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + } + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ctx) + ext4_restore_control_page(data_page); +#endif + end_page_writeback(page); + } + } +} + +static void ext4_release_io_end(ext4_io_end_t *io_end) +{ + struct bio *bio, *next_bio; + + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->handle); + + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); + + for (bio = io_end->bio; bio; bio = next_bio) { + next_bio = bio->bi_private; + ext4_finish_bio(bio); + bio_put(bio); + } + kmem_cache_free(io_end_cachep, io_end); +} + +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); +} + +/* + * Check a range of space and convert unwritten extents to written. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_free_ioend()). + */ +static int ext4_end_io(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + ssize_t size = io->size; + handle_t *handle = io->handle; + int ret = 0; + + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + io->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_extents(handle, inode, offset, size); + if (ret < 0) { + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to written " + "extents -- potential data loss! " + "(inode %lu, offset %llu, size %zd, error %d)", + inode->i_ino, offset, size, ret); + } + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); + return ret; +} + +static void dump_completed_IO(struct inode *inode, struct list_head *head) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + + if (list_empty(head)) + return; + + ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + list_for_each_entry(io, head, list) { + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif +} + +/* Add the io_end to per-inode completed end_io list. */ +static void ext4_add_complete_io(ext4_io_end_t *io_end) +{ + struct ext4_inode_info *ei = EXT4_I(io_end->inode); + struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); + struct workqueue_struct *wq; + unsigned long flags; + + /* Only reserved conversions from writeback should enter here */ + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + wq = sbi->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +} + +static int ext4_do_flush_completed_IO(struct inode *inode, + struct list_head *head) +{ + ext4_io_end_t *io; + struct list_head unwritten; + unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, ret = 0; + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + dump_completed_IO(inode, head); + list_replace_init(head, &unwritten); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + while (!list_empty(&unwritten)) { + io = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io->list); + + err = ext4_end_io(io); + if (unlikely(!ret && err)) + ret = err; + } + return ret; +} + +/* + * work on completed IO, to convert unwritten extents to extents + */ +void ext4_end_io_rsv_work(struct work_struct *work) +{ + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_rsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); +} + +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) +{ + ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); + if (io) { + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; + INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); + } + return io; +} + +void ext4_put_io_end_defer(ext4_io_end_t *io_end) +{ + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->handle, + io_end->inode, io_end->offset, + io_end->size); + io_end->handle = NULL; + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; +} + +/* BIO completion function for page writeback */ +static void ext4_end_bio(struct bio *bio, int error) +{ + ext4_io_end_t *io_end = bio->bi_private; + sector_t bi_sector = bio->bi_iter.bi_sector; + + BUG_ON(!io_end); + bio->bi_end_io = NULL; + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = 0; + + if (error) { + struct inode *inode = io_end->inode; + + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + error, inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); + } + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + /* + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. + */ + bio->bi_private = xchg(&io_end->bio, bio); + ext4_put_io_end_defer(io_end); + } else { + /* + * Drop io_end reference early. Inode can get freed once + * we finish the bio. + */ + ext4_put_io_end_defer(io_end); + ext4_finish_bio(bio); + bio_put(bio); + } +} + +void ext4_io_submit(struct ext4_io_submit *io) +{ + struct bio *bio = io->io_bio; + + if (bio) { + bio_get(io->io_bio); + submit_bio(io->io_op, io->io_bio); + BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + bio_put(io->io_bio); + } + io->io_bio = NULL; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_bio = NULL; + io->io_end = NULL; +} + +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) +{ + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); + if (!bio) + return -ENOMEM; + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_end_io = ext4_end_bio; + bio->bi_private = ext4_get_io_end(io->io_end); + io->io_bio = bio; + io->io_next_block = bh->b_blocknr; + return 0; +} + +static int io_submit_add_bh(struct ext4_io_submit *io, + struct inode *inode, + struct page *page, + struct buffer_head *bh) +{ + int ret; + + if (io->io_bio && bh->b_blocknr != io->io_next_block) { +submit_and_retry: + ext4_io_submit(io); + } + if (io->io_bio == NULL) { + ret = io_submit_init_bio(io, bh); + if (ret) + return ret; + } + ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); + if (ret != bh->b_size) + goto submit_and_retry; + io->io_next_block++; + return 0; +} + +int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite) +{ + struct page *data_page = NULL; + struct inode *inode = page->mapping->host; + unsigned block_start, blocksize; + struct buffer_head *bh, *head; + int ret = 0; + int nr_submitted = 0; + + blocksize = 1 << inode->i_blkbits; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); + ClearPageError(page); + + /* + * Comments copied from block_write_full_page: + * + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the page before submitting so that + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * on the first buffer finishes and we are still working on submitting + * the second buffer. + */ + bh = head = page_buffers(page); + do { + block_start = bh_offset(bh); + if (block_start >= len) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + if (!buffer_dirty(bh) || buffer_delay(bh) || + !buffer_mapped(bh) || buffer_unwritten(bh)) { + /* A hole? We can safely clear the dirty bit */ + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + continue; + } + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); + } while ((bh = bh->b_this_page) != head); + + bh = head = page_buffers(page); + + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + data_page = ext4_encrypt(inode, page); + if (IS_ERR(data_page)) { + ret = PTR_ERR(data_page); + data_page = NULL; + goto out; + } + } + + /* Now submit buffers to write */ + do { + if (!buffer_async_write(bh)) + continue; + ret = io_submit_add_bh(io, inode, + data_page ? data_page : page, bh); + if (ret) { + /* + * We only get here on ENOMEM. Not much else + * we can do but mark the page as dirty, and + * better luck next time. + */ + break; + } + nr_submitted++; + clear_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + + /* Error stopped previous loop? Clean up buffers... */ + if (ret) { + out: + if (data_page) + ext4_restore_control_page(data_page); + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); + } + unlock_page(page); + /* Nothing submitted - we have to end page writeback */ + if (!nr_submitted) + end_page_writeback(page); + return ret; +} diff --git a/kernel/fs/ext4/readpage.c b/kernel/fs/ext4/readpage.c new file mode 100644 index 000000000..171b9ac4b --- /dev/null +++ b/kernel/fs/ext4/readpage.c @@ -0,0 +1,328 @@ +/* + * linux/fs/ext4/readpage.c + * + * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2015, Google, Inc. + * + * This was originally taken from fs/mpage.c + * + * The intent is the ext4_mpage_readpages() function here is intended + * to replace mpage_readpages() in the general case, not just for + * encrypted files. It has some limitations (see below), where it + * will fall back to read_block_full_page(), but these limitations + * should only be hit when page_size != block_size. + * + * This will allow us to attach a callback function to support ext4 + * encryption. + * + * If anything unusual happens, such as: + * + * - encountering a page which has buffers + * - encountering a page which has a non-hole after a hole + * - encountering a page with non-contiguous blocks + * + * then this code just gives up and calls the buffer_head-based read function. + * It does handle a page which has holes at the end - that is a common case: + * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4.h" + +/* + * Call ext4_decrypt on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_crypto_ctx *ctx = + container_of(work, struct ext4_crypto_ctx, work); + struct bio *bio = ctx->bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + int ret = ext4_decrypt(ctx, page); + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else + SetPageUptodate(page); + unlock_page(page); + } + ext4_release_crypto_ctx(ctx); + bio_put(bio); +#else + BUG(); +#endif +} + +static inline bool ext4_bio_encrypted(struct bio *bio) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + return unlikely(bio->bi_private != NULL); +#else + return false; +#endif +} + +/* + * I/O completion handler for multipage BIOs. + * + * The mpage code never puts partial pages into a BIO (except for end-of-file). + * If a page does not map to a contiguous run of blocks then it simply falls + * back to block_read_full_page(). + * + * Why is this? If a page's completion depends on a number of different BIOs + * which can complete in any order (or at the same time) then determining the + * status of that page is hard. See end_buffer_async_read() for the details. + * There is no point in duplicating all that complexity. + */ +static void mpage_end_io(struct bio *bio, int err) +{ + struct bio_vec *bv; + int i; + + if (ext4_bio_encrypted(bio)) { + struct ext4_crypto_ctx *ctx = bio->bi_private; + + if (err) { + ext4_release_crypto_ctx(ctx); + } else { + INIT_WORK(&ctx->work, completion_pages); + ctx->bio = bio; + queue_work(ext4_read_workqueue, &ctx->work); + return; + } + } + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + struct block_device *bdev = inode->i_sb->s_bdev; + int length; + unsigned relative_block = 0; + struct ext4_map_blocks map; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + + for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { + int fully_mapped = 1; + unsigned first_hole = blocks_per_page; + + prefetchw(&page->flags); + if (pages) { + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) + goto next_page; + } + + if (page_has_buffers(page)) + goto confused; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = block_in_file + nr_pages * blocks_per_page; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + page_block = 0; + + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & EXT4_MAP_MAPPED) && + block_in_file > map.m_lblk && + block_in_file < (map.m_lblk + map.m_len)) { + unsigned map_offset = block_in_file - map.m_lblk; + unsigned last = map.m_len - map_offset; + + for (relative_block = 0; ; relative_block++) { + if (relative_block == last) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } + if (page_block == blocks_per_page) + break; + blocks[page_block] = map.m_pblk + map_offset + + relative_block; + page_block++; + block_in_file++; + } + } + + /* + * Then do more ext4_map_blocks() calls until we are + * done with this page. + */ + while (page_block < blocks_per_page) { + if (block_in_file < last_block) { + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; + + if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { + set_error_page: + SetPageError(page); + zero_user_segment(page, 0, + PAGE_CACHE_SIZE); + unlock_page(page); + goto next_page; + } + } + if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { + fully_mapped = 0; + if (first_hole == blocks_per_page) + first_hole = page_block; + page_block++; + block_in_file++; + continue; + } + if (first_hole != blocks_per_page) + goto confused; /* hole -> non-hole */ + + /* Contiguous blocks? */ + if (page_block && blocks[page_block-1] != map.m_pblk-1) + goto confused; + for (relative_block = 0; ; relative_block++) { + if (relative_block == map.m_len) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } else if (page_block == blocks_per_page) + break; + blocks[page_block] = map.m_pblk+relative_block; + page_block++; + block_in_file++; + } + } + if (first_hole != blocks_per_page) { + zero_user_segment(page, first_hole << blkbits, + PAGE_CACHE_SIZE); + if (first_hole == 0) { + SetPageUptodate(page); + unlock_page(page); + goto next_page; + } + } else if (fully_mapped) { + SetPageMappedToDisk(page); + } + if (fully_mapped && blocks_per_page == 1 && + !PageUptodate(page) && cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (last_block_in_bio != blocks[0] - 1)) { + submit_and_realloc: + submit_bio(READ, bio); + bio = NULL; + } + if (bio == NULL) { + struct ext4_crypto_ctx *ctx = NULL; + + if (ext4_encrypted_inode(inode) && + S_ISREG(inode->i_mode)) { + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + goto set_error_page; + } + bio = bio_alloc(GFP_KERNEL, + min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + if (!bio) { + if (ctx) + ext4_release_crypto_ctx(ctx); + goto set_error_page; + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + bio->bi_end_io = mpage_end_io; + bio->bi_private = ctx; + } + + length = first_hole << blkbits; + if (bio_add_page(bio, page, length, 0) < length) + goto submit_and_realloc; + + if (((map.m_flags & EXT4_MAP_BOUNDARY) && + (relative_block == map.m_len)) || + (first_hole != blocks_per_page)) { + submit_bio(READ, bio); + bio = NULL; + } else + last_block_in_bio = blocks[blocks_per_page - 1]; + goto next_page; + confused: + if (bio) { + submit_bio(READ, bio); + bio = NULL; + } + if (!PageUptodate(page)) + block_read_full_page(page, ext4_get_block); + else + unlock_page(page); + next_page: + if (pages) + page_cache_release(page); + } + BUG_ON(pages && !list_empty(pages)); + if (bio) + submit_bio(READ, bio); + return 0; +} diff --git a/kernel/fs/ext4/resize.c b/kernel/fs/ext4/resize.c new file mode 100644 index 000000000..cf0c47204 --- /dev/null +++ b/kernel/fs/ext4/resize.c @@ -0,0 +1,2024 @@ +/* + * linux/fs/ext4/resize.c + * + * Support for resizing an ext4 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. + */ + + +#define EXT4FS_DEBUG + +#include +#include + +#include "ext4_jbd2.h" + +int ext4_resize_begin(struct super_block *sb) +{ + int ret = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT4_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_warning(sb, "won't resize using backup superblock at %llu", + (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + /* + * We are not allowed to do online-resizing on a filesystem mounted + * with error, because it can destroy the filesystem easily. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + ext4_warning(sb, "There are errors in the filesystem, " + "so online resizing is not allowed\n"); + return -EPERM; + } + + if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) + ret = -EBUSY; + + return ret; +} + +void ext4_resize_end(struct super_block *sb) +{ + clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); + smp_mb__after_atomic(); +} + +static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, + ext4_group_t group) { + return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << + EXT4_DESC_PER_BLOCK_BITS(sb); +} + +static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, + ext4_group_t group) { + group = ext4_meta_bg_first_group(sb, group); + return ext4_group_first_block_no(sb, group); +} + +static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, + ext4_group_t group) { + ext4_grpblk_t overhead; + overhead = ext4_bg_num_gdb(sb, group); + if (ext4_bg_has_super(sb, group)) + overhead += 1 + + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + return overhead; +} + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t start = ext4_blocks_count(es); + ext4_fsblk_t end = start + input->blocks_count; + ext4_group_t group = input->group; + ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead; + ext4_fsblk_t metaend; + struct buffer_head *bh = NULL; + ext4_grpblk_t free_blocks_count, offset; + int err = -EINVAL; + + if (group != sbi->s_groups_count) { + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + return -EINVAL; + } + + overhead = ext4_group_overhead_blocks(sb, group); + metaend = start + overhead; + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext4_bg_has_super(sb, input->group) ? "normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + ext4_get_group_no_and_offset(sb, start, NULL, &offset); + if (offset != 0) + ext4_warning(sb, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext4_warning(sb, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext4_warning(sb, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext4_warning(sb, "Cannot read last block (%llu)", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext4_warning(sb, "Block bitmap not in group (block %llu)", + (unsigned long long)input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext4_warning(sb, "Inode bitmap not in group (block %llu)", + (unsigned long long)input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", + (unsigned long long)input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", + (unsigned long long)input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->block_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->inode_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->block_bitmap, start, metaend)) + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->block_bitmap, + start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->inode_bitmap, + start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", + (unsigned long long)input->inode_table, + itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of + * @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. + */ +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +{ + struct ext4_new_flex_group_data *flex_gd; + + flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); + if (flex_gd == NULL) + goto out3; + + if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) + goto out2; + flex_gd->count = flexbg_size; + + flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * + flexbg_size, GFP_NOFS); + if (flex_gd->groups == NULL) + goto out2; + + flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); + if (flex_gd->bg_flags == NULL) + goto out1; + + return flex_gd; + +out1: + kfree(flex_gd->groups); +out2: + kfree(flex_gd); +out3: + return NULL; +} + +static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) +{ + kfree(flex_gd->bg_flags); + kfree(flex_gd->groups); + kfree(flex_gd); +} + +/* + * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps + * and inode tables for a flex group. + * + * This function is used by 64bit-resize. Note that this function allocates + * group tables from the 1st group of groups contained by @flexgd, which may + * be a partial of a flex group. + * + * @sb: super block of fs to which the groups belongs + * + * Returns 0 on a successful allocation of the metadata blocks in the + * block group. + */ +static int ext4_alloc_group_tables(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + int flexbg_size) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t start_blk; + ext4_fsblk_t last_blk; + ext4_group_t src_group; + ext4_group_t bb_index = 0; + ext4_group_t ib_index = 0; + ext4_group_t it_index = 0; + ext4_group_t group; + ext4_group_t last_group; + unsigned overhead; + __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + + src_group = group_data[0].group; + last_group = src_group + flex_gd->count - 1; + + BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != + (last_group & ~(flexbg_size - 1)))); +next_group: + group = group_data[0].group; + if (src_group >= group_data[0].group + flex_gd->count) + return -ENOSPC; + start_blk = ext4_group_first_block_no(sb, src_group); + last_blk = start_blk + group_data[src_group - group].blocks_count; + + overhead = ext4_group_overhead_blocks(sb, src_group); + + start_blk += overhead; + + /* We collect contiguous blocks as much as possible. */ + src_group++; + for (; src_group <= last_group; src_group++) { + overhead = ext4_group_overhead_blocks(sb, src_group); + if (overhead == 0) + last_blk += group_data[src_group - group].blocks_count; + else + break; + } + + /* Allocate block bitmaps */ + for (; bb_index < flex_gd->count; bb_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[bb_index].block_bitmap = start_blk++; + group = ext4_get_group_number(sb, start_blk - 1); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + flex_gd->bg_flags[group] &= uninit_mask; + } + + /* Allocate inode bitmaps */ + for (; ib_index < flex_gd->count; ib_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[ib_index].inode_bitmap = start_blk++; + group = ext4_get_group_number(sb, start_blk - 1); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + flex_gd->bg_flags[group] &= uninit_mask; + } + + /* Allocate inode tables */ + for (; it_index < flex_gd->count; it_index++) { + unsigned int itb = EXT4_SB(sb)->s_itb_per_group; + ext4_fsblk_t next_group_start; + + if (start_blk + itb > last_blk) + goto next_group; + group_data[it_index].inode_table = start_blk; + group = ext4_get_group_number(sb, start_blk); + next_group_start = ext4_group_first_block_no(sb, group + 1); + group -= group_data[0].group; + + if (start_blk + itb > next_group_start) { + flex_gd->bg_flags[group + 1] &= uninit_mask; + overhead = start_blk + itb - next_group_start; + group_data[group + 1].free_blocks_count -= overhead; + itb -= overhead; + } + + group_data[group].free_blocks_count -= itb; + flex_gd->bg_flags[group] &= uninit_mask; + start_blk += EXT4_SB(sb)->s_itb_per_group; + } + + if (test_opt(sb, DEBUG)) { + int i; + group = group_data[0].group; + + printk(KERN_DEBUG "EXT4-fs: adding a flex group with " + "%d groups, flexbg size is %d:\n", flex_gd->count, + flexbg_size); + + for (i = 0; i < flex_gd->count; i++) { + printk(KERN_DEBUG "adding %s group %u: %u " + "blocks (%d free)\n", + ext4_bg_has_super(sb, group + i) ? "normal" : + "no-super", group + i, + group_data[i].blocks_count, + group_data[i].free_blocks_count); + } + } + return 0; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext4_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + BUFFER_TRACE(bh, "get_write_access"); + if ((err = ext4_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + } + + return bh; +} + +/* + * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for block_bitmap modifications. + */ +static int extend_or_restart_transaction(handle_t *handle, int thresh) +{ + int err; + + if (ext4_handle_has_enough_credits(handle, thresh)) + return 0; + + err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); + if (err < 0) + return err; + if (err) { + err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); + if (err) + return err; + } + + return 0; +} + +/* + * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * + * Helper function for ext4_setup_new_group_blocks() which set . + * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t block, ext4_group_t count) +{ + ext4_group_t count2; + + ext4_debug("mark blocks [%llu/%u] used\n", block, count); + for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + group = ext4_get_group_number(sb, block); + start = ext4_group_first_block_no(sb, group); + group -= flex_gd->groups[0].group; + + count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = extend_or_restart_transaction(handle, 1); + if (err) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (unlikely(!bh)) + return -ENOMEM; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + return err; + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, + block - start, count2); + ext4_set_bits(bh->b_data, block - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + return err; + brelse(bh); + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new groups. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follow: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in blocks bitmaps for blocks taken by + * super block and GDT. + * 2. allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables. + */ +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; + handle_t *handle; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + int meta_bg; + + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; + ext4_grpblk_t overhead; + + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); + + if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) + goto handle_itb; + + if (meta_bg == 1) { + ext4_group_t first_group; + first_group = ext4_meta_bg_first_group(sb, group); + if (first_group != group + 1 && + first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) + goto handle_itb; + } + + block = start + ext4_bg_has_super(sb, group); + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0; j < gdblocks; j++, block++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx\n", block); + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + gdb = sb_getblk(sb, block); + if (unlikely(!gdb)) { + err = -ENOMEM; + goto out; + } + + BUFFER_TRACE(gdb, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, + gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; + } + +handle_itb: + /* Initialize group tables of the grop @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; + + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; + +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; + + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto out; + } + overhead = ext4_group_overhead_blocks(sb, group); + if (overhead != 0) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + ext4_set_bits(bh->b_data, 0, overhead); + } + ext4_mark_bitmap_end(group_data[i].blocks_count, + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); + +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; + + /* Initialize inode bitmap of the @group */ + block = group_data[i].inode_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto out; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); + } + bh = NULL; + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == (&group_data[i].block_bitmap)[j]) { + count += group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + count = group_table_count[j]; + start = (&group_data[i].block_bitmap)[j]; + block = start; + } + + if (count) { + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + } + } + +out: + brelse(bh); + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + ext4_group_t end, + struct buffer_head *primary) +{ + const ext4_fsblk_t blk = primary->b_blocknr; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != + grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ + ext4_warning(sb, "reserved GDT %llu" + " missing grp %d (%llu)", + blk, grp, + grp * + (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + blk); + return -EINVAL; + } + if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + ext4_group_t group) +{ + struct super_block *sb = inode->i_sb; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + struct buffer_head *gdb_bh; + int gdbackups; + struct ext4_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", + gdb_num); + + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) + return -EIO; + + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); + if (gdbackups < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext4_warning(sb, "new group %u GDT block %llu not reserved", + group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (unlikely(err)) + goto exit_dind; + + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (unlikely(err)) + goto exit_dind; + + BUFFER_TRACE(dind, "get_write_access"); + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); + + /* ext4_reserve_inode_write() gets a reference on the iloc */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) + goto exit_dind; + + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). + */ + data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + memset(gdb_bh->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = gdb_bh; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + kvfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext4_handle_dirty_super(handle, sb); + if (err) + ext4_std_error(sb, err); + + return err; + +exit_inode: + kvfree(n_group_desc); + brelse(iloc.bh); +exit_dind: + brelse(dind); +exit_bh: + brelse(gdb_bh); + + ext4_debug("leaving with error %d\n", err); + return err; +} + +/* + * add_new_gdb_meta_bg is the sister of add_new_gdb. + */ +static int add_new_gdb_meta_bg(struct super_block *sb, + handle_t *handle, ext4_group_t group) { + ext4_fsblk_t gdblock; + struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int err; + + gdblock = ext4_meta_bg_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) + return -EIO; + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + return err; + } + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = gdb_bh; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + kvfree(o_group_desc); + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (unlikely(err)) + brelse(gdb_bh); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. + */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + ext4_group_t group) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext4_iloc iloc; + ext4_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext4_warning(sb, "reserved block %llu" + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + BUFFER_TRACE(primary[i], "get_write_access"); + if ((err = ext4_journal_get_write_access(handle, primary[i]))) + goto exit_bh; + } + + if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = group * EXT4_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext4 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. + */ +static void update_backups(struct super_block *sb, int blk_off, char *data, + int size, int meta_bg) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t last; + const int bpg = EXT4_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + ext4_group_t group = 0; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + if (meta_bg == 0) { + group = ext4_list_backups(sb, &three, &five, &seven); + last = sbi->s_groups_count; + } else { + group = ext4_meta_bg_first_group(sb, group) + 1; + last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); + } + + while (group < sbi->s_groups_count) { + struct buffer_head *bh; + ext4_fsblk_t backup_block; + + /* Out of journal space, and can't get more - abort - so sad */ + if (ext4_handle_valid(handle) && + handle->h_buffer_credits == 0 && + ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && + (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + break; + + if (meta_bg == 0) + backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; + else + backup_block = (ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group)); + + bh = sb_getblk(sb, backup_block); + if (unlikely(!bh)) { + err = -ENOMEM; + break; + } + ext4_debug("update metadata backup %llu(+%llu)\n", + backup_block, backup_block - + ext4_group_first_block_no(sb, group)); + BUFFER_TRACE(bh, "get_write_access"); + if ((err = ext4_journal_get_write_access(handle, bh))) + break; + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); + brelse(bh); + + if (meta_bg == 0) + group = ext4_list_backups(sb, &three, &five, &seven); + else if (group == last) + break; + else + group = last; + } + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext4_warning(sb, "can't update backup for group %u (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT4_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* + * ext4_add_new_descs() adds @count group descriptor of groups + * starting at @group + * + * @handle: journal handle + * @sb: super block + * @group: the group no. of the first group desc to be added + * @resize_inode: the resize inode + * @count: number of group descriptors to be added + */ +static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, + ext4_group_t group, struct inode *resize_inode, + ext4_group_t count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *gdb_bh; + int i, gdb_off, gdb_num, err = 0; + int meta_bg; + + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + for (i = 0; i < count; i++, group++) { + int reserved_gdb = ext4_bg_has_super(sb, group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + gdb_bh = sbi->s_group_desc[gdb_num]; + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + + if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) + err = reserve_backup_gdb(handle, resize_inode, group); + } else if (meta_bg != 0) { + err = add_new_gdb_meta_bg(sb, handle, group); + } else { + err = add_new_gdb(handle, resize_inode, group); + } + if (err) + break; + } + return err; +} + +static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) +{ + struct buffer_head *bh = sb_getblk(sb, block); + if (unlikely(!bh)) + return NULL; + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + brelse(bh); + return NULL; + } + } + + return bh; +} + +static int ext4_set_bitmap_checksums(struct super_block *sb, + ext4_group_t group, + struct ext4_group_desc *gdp, + struct ext4_new_group_data *group_data) +{ + struct buffer_head *bh; + + if (!ext4_has_metadata_csum(sb)) + return 0; + + bh = ext4_get_bitmap(sb, group_data->inode_bitmap); + if (!bh) + return -EIO; + ext4_inode_bitmap_csum_set(sb, group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + brelse(bh); + + bh = ext4_get_bitmap(sb, group_data->block_bitmap); + if (!bh) + return -EIO; + ext4_block_bitmap_csum_set(sb, group, gdp, bh); + brelse(bh); + + return 0; +} + +/* + * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg + */ +static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *gdb_bh; + ext4_group_t group; + __u16 *bg_flags = flex_gd->bg_flags; + int i, gdb_off, gdb_num, err = 0; + + + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { + group = group_data->group; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). + */ + gdb_bh = sbi->s_group_desc[gdb_num]; + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)(gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + err = ext4_set_bitmap_checksums(sb, group, gdp, group_data); + if (err) { + ext4_std_error(sb, err); + break; + } + + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + EXT4_NUM_B2C(sbi, group_data->free_blocks_count)); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + if (ext4_has_group_desc_csum(sb)) + ext4_itable_unused_set(sb, gdp, + EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + ext4_group_desc_csum_set(sb, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} + +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. + * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += group_data[i].free_blocks_count; + } + + reserved_blocks = ext4_r_blocks_count(es) * 100; + reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count + * + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count += flex_gd->count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + + /* Update the reserved block counts only once the new group is + * active. */ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_NUM_B2C(sbi, free_blocks)); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + ext4_debug("free blocks count %llu", + percpu_counter_read(&sbi->s_freeclusters_counter)); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, group_data[0].group); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &sbi->s_flex_groups[flex_group].free_inodes); + } + + /* + * Update the fs overhead information + */ + ext4_calculate_overhead(sb); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. + */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * blocks. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + credit = 3; /* sb, resize inode, resize inode dindirect */ + /* GDT blocks */ + credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb)); + credit += reserved_gdb; /* Reserved GDT dindirect blocks */ + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != EXT4_SB(sb)->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_super(handle, sb); + +exit_journal: + err2 = ext4_journal_stop(handle); + if (!err) + err = err2; + + if (!err) { + int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int gdb_num_end = ((group + flex_gd->count - 1) / + EXT4_DESC_PER_BLOCK(sb)); + int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG); + sector_t old_gdb = 0; + + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block), 0); + for (; gdb_num <= gdb_num_end; gdb_num++) { + struct buffer_head *gdb_bh; + + gdb_bh = sbi->s_group_desc[gdb_num]; + if (old_gdb == gdb_bh->b_blocknr) + continue; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, + gdb_bh->b_size, meta_bg); + old_gdb = gdb_bh->b_blocknr; + } + } +exit: + return err; +} + +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count, + unsigned long flexbg_size) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t blocks_per_group; + unsigned long i; + + blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + last_group = group | (flexbg_size - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = blocks_per_group; + overhead = ext4_group_overhead_blocks(sb, group + i); + group_data[i].free_blocks_count = blocks_per_group - overhead; + if (ext4_has_group_desc_csum(sb)) { + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + if (!test_opt(sb, INIT_INODE_TABLE)) + flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; + } else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && ext4_has_group_desc_csum(sb)) + /* We need to initialize block bitmap of last group. */ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != blocks_per_group - 1)) { + group_data[i - 1].blocks_count = last + 1; + group_data[i - 1].free_blocks_count -= blocks_per_group- + last - 1; + } + + return 1; +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) +{ + struct ext4_new_flex_group_data flex_gd; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct inode *inode = NULL; + int gdb_off; + int err; + __u16 bg_flags = 0; + + gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext4_warning(sb, "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (ext4_blocks_count(es) + input->blocks_count < + ext4_blocks_count(es)) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext4_warning(sb, "inodes_count overflow"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT4_HAS_COMPAT_FEATURE(sb, + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext4_warning(sb, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + + err = verify_group_input(sb, input); + if (err) + goto out; + + err = ext4_alloc_flex_bg_array(sb, input->group + 1); + if (err) + goto out; + + err = ext4_mb_alloc_groupinfo(sb, input->group + 1); + if (err) + goto out; + + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: + iput(inode); + return err; +} /* ext4_group_add */ + +/* + * extend a group without checking assuming that checking has been done. + */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). + */ + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, "error %d on journal start", err); + return err; + } + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; + } + + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); + if (err) + goto errout; + ext4_handle_dirty_super(handle, sb); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, + (char *)es, sizeof(struct ext4_super_block), 0); + } + return err; +} + +/* + * Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext4_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count) +{ + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_grpblk_t add; + struct buffer_head *bh; + int err; + ext4_group_t group; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "extending last group from %llu to %llu blocks", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + ext4_msg(sb, KERN_ERR, + "filesystem too large to resize to %llu blocks safely", + n_blocks_count); + if (sizeof(sector_t) < 8) + ext4_warning(sb, "CONFIG_LBDAF not enabled"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + /* Handle the remaining blocks in the last group only. */ + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + + if (last == 0) { + ext4_warning(sb, "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT4_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + return err; +} /* ext4_group_extend */ + + +static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) +{ + return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); +} + +/* + * Release the resize inode and drop the resize_inode feature if there + * are no more reserved gdt blocks, and then convert the file system + * to enable meta_bg + */ +static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) +{ + handle_t *handle; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_fsblk_t nr; + int i, ret, err = 0; + int credits = 1; + + ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); + if (inode) { + if (es->s_reserved_gdt_blocks) { + ext4_error(sb, "Unexpected non-zero " + "s_reserved_gdt_blocks"); + return -EPERM; + } + + /* Do a quick sanity check of the resize inode */ + if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) + goto invalid_resize_inode; + for (i = 0; i < EXT4_N_BLOCKS; i++) { + if (i == EXT4_DIND_BLOCK) { + if (ei->i_data[i]) + continue; + else + goto invalid_resize_inode; + } + if (ei->i_data[i]) + goto invalid_resize_inode; + } + credits += 3; /* block bitmap, bg descriptor, resize inode */ + } + + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto errout; + + EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE); + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + sbi->s_es->s_first_meta_bg = + cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); + + err = ext4_handle_dirty_super(handle, sb); + if (err) { + ext4_std_error(sb, err); + goto errout; + } + + if (inode) { + nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + ei->i_data[EXT4_DIND_BLOCK] = 0; + inode->i_blocks = 0; + + err = ext4_mark_inode_dirty(handle, inode); + if (err) + ext4_std_error(sb, err); + } + +errout: + ret = ext4_journal_stop(handle); + if (!err) + err = ret; + return ret; + +invalid_resize_inode: + ext4_error(sb, "corrupted/inconsistent resize inode"); + return -EINVAL; +} + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks resides in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode = NULL; + ext4_grpblk_t add, offset; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_fsblk_t o_blocks_count; + ext4_fsblk_t n_blocks_count_retry = 0; + unsigned long last_update_time = 0; + int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; + int meta_bg; + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + +retry: + o_blocks_count = ext4_blocks_count(es); + + ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + if (n_blocks_count == o_blocks_count) + /* Nothing need to do */ + return 0; + + n_group = ext4_get_group_number(sb, n_blocks_count - 1); + if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + ext4_warning(sb, "resize would cause inodes_count overflow"); + return -EINVAL; + } + ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); + + n_desc_blocks = num_desc_blocks(sb, n_group + 1); + o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); + + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (meta_bg) { + ext4_error(sb, "resize_inode and meta_bg enabled " + "simultaneously"); + return -EINVAL; + } + if (n_desc_blocks > o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks)) { + n_blocks_count_retry = n_blocks_count; + n_desc_blocks = o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks); + n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); + n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_group--; /* set to last group number */ + } + + if (!resize_inode) + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + } + + if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { + err = ext4_convert_meta_bg(sb, resize_inode); + if (err) + goto out; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + if (n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + goto retry; + } + } + + /* extend the last group */ + if (n_group == o_group) + add = n_blocks_count - o_blocks_count; + else + add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + if (add > 0) { + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (ext4_blocks_count(es) == n_blocks_count) + goto out; + + err = ext4_alloc_flex_bg_array(sb, n_group + 1); + if (err) + return err; + + err = ext4_mb_alloc_groupinfo(sb, n_group + 1); + if (err) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. + */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, + flexbg_size)) { + if (jiffies - last_update_time > HZ * 10) { + if (last_update_time) + ext4_msg(sb, KERN_INFO, + "resized to %llu blocks", + ext4_blocks_count(es)); + last_update_time = jiffies; + } + if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) + break; + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + + if (!err && n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + free_flex_gd(flex_gd); + flex_gd = NULL; + goto retry; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + if (resize_inode != NULL) + iput(resize_inode); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); + return err; +} diff --git a/kernel/fs/ext4/super.c b/kernel/fs/ext4/super.c new file mode 100644 index 000000000..ca9d4a2fe --- /dev/null +++ b/kernel/fs/ext4/super.c @@ -0,0 +1,5668 @@ +/* + * linux/fs/ext4/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ext4.h" +#include "ext4_extents.h" /* Needed for trace points definition */ +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "mballoc.h" + +#define CREATE_TRACE_POINTS +#include + +static struct proc_dir_entry *ext4_proc_root; +static struct kset *ext4_kset; +static struct ext4_lazy_init *ext4_li_info; +static struct mutex ext4_li_mtx; +static struct ext4_features *ext4_feat; +static int ext4_mballoc_ready; + +static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); +static int ext4_show_options(struct seq_file *seq, struct dentry *root); +static int ext4_commit_super(struct super_block *sb, int sync); +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); +static int ext4_sync_fs(struct super_block *sb, int wait); +static int ext4_remount(struct super_block *sb, int *flags, char *data); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); +static int ext4_unfreeze(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); +static inline int ext2_feature_set_ok(struct super_block *sb); +static inline int ext3_feature_set_ok(struct super_block *sb); +static int ext4_feature_set_ok(struct super_block *sb, int readonly); +static void ext4_destroy_lazyinit_thread(void); +static void ext4_unregister_li_request(struct super_block *sb); +static void ext4_clear_request_list(void); +static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext2"); +MODULE_ALIAS("ext2"); +#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#else +#define IS_EXT2_SB(sb) (0) +#endif + + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext3"); +MODULE_ALIAS("ext3"); +#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#else +#define IS_EXT3_SB(sb) (0) +#endif + +static int ext4_verify_csum_type(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum_type == EXT4_CRC32C_CHKSUM; +} + +static __le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct ext4_super_block, s_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, ~0, (char *)es, offset); + + return cpu_to_le32(csum); +} + +static int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!ext4_has_metadata_csum(sb)) + return 1; + + return es->s_checksum == ext4_superblock_csum(sb, es); +} + +void ext4_superblock_csum_set(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (!ext4_has_metadata_csum(sb)) + return; + + es->s_checksum = ext4_superblock_csum(sb, es); +} + +void *ext4_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +void *ext4_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + +ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_block_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_table_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); +} + +__u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_inodes_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +} + +void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_table_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); +} + +void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); +} + + +static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + if (bdev_read_only(sb->s_bdev)) + return; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + es->s_last_error_time = cpu_to_le32(get_seconds()); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + le32_add_cpu(&es->s_error_count, 1); +} + +static void save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + __save_error_info(sb, func, line); + ext4_commit_super(sb, 1); +} + +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int error = is_journal_aborted(journal); + struct ext4_journal_cb_entry *jce; + + BUG_ON(txn->t_state == T_FINISHED); + spin_lock(&sbi->s_md_lock); + while (!list_empty(&txn->t_private_list)) { + jce = list_entry(txn->t_private_list.next, + struct ext4_journal_cb_entry, jce_list); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + jce->jce_func(sb, jce, error); + spin_lock(&sbi->s_md_lock); + } + spin_unlock(&sbi->s_md_lock); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext4, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the jbd2_journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it. + */ + +static void ext4_handle_error(struct super_block *sb) +{ + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt(sb, ERRORS_CONT)) { + journal_t *journal = EXT4_SB(sb)->s_journal; + + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (journal) + jbd2_journal_abort(journal, -EIO); + } + if (test_opt(sb, ERRORS_RO)) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs (device %s): panic forced after error\n", + sb->s_id); +} + +#define ext4_error_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ + "EXT4-fs error") + +void __ext4_error(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); + va_end(args); + } + save_error_info(sb, function, line); + ext4_handle_error(sb); +} + +void __ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) +{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + es->s_last_error_block = cpu_to_le64(block); + if (ext4_error_ratelimit(inode->i_sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); + va_end(args); + } + save_error_info(inode->i_sb, function, line); + ext4_handle_error(inode->i_sb); +} + +void __ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) +{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es; + struct inode *inode = file_inode(file); + char pathname[80], *path; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + if (ext4_error_ratelimit(inode->i_sb)) { + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); + va_end(args); + } + save_error_info(inode->i_sb, function, line); + ext4_handle_error(inode->i_sb); +} + +const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext4_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + if (ext4_error_ratelimit(sb)) { + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + } + + save_error_info(sb, function, line); + ext4_handle_error(sb); +} + +/* + * ext4_abort is a much stronger failure handler than ext4_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void __ext4_abort(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + va_list args; + + save_error_info(sb, function, line); + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, + function, line); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if ((sb->s_flags & MS_RDONLY) == 0) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs panic from previous error\n"); +} + +void __ext4_msg(struct super_block *sb, + const char *prefix, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} + +void __ext4_warning(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning")) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); + va_end(args); +} + +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) +__releases(bitlock) +__acquires(bitlock) +{ + struct va_format vaf; + va_list args; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + __save_error_info(sb, function, line); + + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", + sb->s_id, function, line, grp); + if (ino) + printk(KERN_CONT "inode %lu: ", ino); + if (block) + printk(KERN_CONT "block %llu:", + (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); + va_end(args); + } + + if (test_opt(sb, ERRORS_CONT)) { + ext4_commit_super(sb, 0); + return; + } + + ext4_unlock_group(sb, grp); + ext4_handle_error(sb); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read/only and the + * journal has been aborted. We return 1 as a hint to callers + * who might what to use the return value from + * ext4_grp_locked_error() to distinguish between the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code. + */ + ext4_lock_group(sb, grp); + return; +} + +void ext4_update_dynamic_rev(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) + return; + + ext4_warning(sb, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT4_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; +} + +/* + * Release the journal device + */ +static void ext4_blkdev_put(struct block_device *bdev) +{ + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static void ext4_blkdev_remove(struct ext4_sb_info *sbi) +{ + struct block_device *bdev; + bdev = sbi->journal_bdev; + if (bdev) { + ext4_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) +{ + struct list_head *l; + + ext4_msg(sb, KERN_ERR, "sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext4_put_super(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i, err; + + ext4_unregister_li_request(sb); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + flush_workqueue(sbi->rsv_conversion_wq); + destroy_workqueue(sbi->rsv_conversion_wq); + + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, "Couldn't clean up the journal"); + } + + ext4_es_unregister_shrinker(sbi); + del_timer_sync(&sbi->s_err_report); + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + } + if (!(sb->s_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } + kobject_del(&sbi->s_kobj); + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kvfree(sbi->s_group_desc); + kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject. + */ + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} + +static struct kmem_cache *ext4_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext4_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + ei->vfs_inode.i_version = 1; + spin_lock_init(&ei->i_raw_lock); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_list); + ei->i_es_all_nr = 0; + ei->i_es_shk_nr = 0; + ei->i_es_shrink_lblk = 0; + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + ei->i_da_metadata_calc_last_lblock = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + ei->jinode = NULL; + INIT_LIST_HEAD(&ei->i_rsv_conversion_list); + spin_lock_init(&ei->i_completed_io_lock); + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); + atomic_set(&ei->i_unwritten, 0); + INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID; +#endif + + return &ei->vfs_inode; +} + +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + +static void ext4_destroy_inode(struct inode *inode) +{ + if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): orphan list check failed!", + inode->i_ino, EXT4_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT4_I(inode), sizeof(struct ext4_inode_info), + true); + dump_stack(); + } + call_rcu(&inode->i_rcu, ext4_i_callback); +} + +static void init_once(void *foo) +{ + struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", + sizeof(struct ext4_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ext4_inode_cachep); +} + +void ext4_clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + clear_inode(inode); + dquot_drop(inode); + ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } +} + +static struct inode *ext4_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext4_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = ext4_iget_normal(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd2 layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext4_write_dquot(struct dquot *dquot); +static int ext4_acquire_dquot(struct dquot *dquot); +static int ext4_release_dquot(struct dquot *dquot); +static int ext4_mark_dquot_dirty(struct dquot *dquot); +static int ext4_write_info(struct super_block *sb, int type); +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); +static int ext4_quota_off(struct super_block *sb, int type); +static int ext4_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); +static int ext4_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags); +static int ext4_enable_quotas(struct super_block *sb); + +static struct dquot **ext4_get_dquots(struct inode *inode) +{ + return EXT4_I(inode)->i_dquot; +} + +static const struct dquot_operations ext4_quota_operations = { + .get_reserved_space = ext4_get_reserved_space, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops ext4_qctl_operations = { + .quota_on = ext4_quota_on, + .quota_off = ext4_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +#endif + +static const struct super_operations ext4_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, + .put_super = ext4_put_super, + .sync_fs = ext4_sync_fs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, + .get_dquots = ext4_get_dquots, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct export_operations ext4_export_ops = { + .fh_to_dentry = ext4_fh_to_dentry, + .fh_to_parent = ext4_fh_to_parent, + .get_parent = ext4_get_parent, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_debug, Opt_removed, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, + Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_lazytime, Opt_nolazytime, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, Opt_nojournal_checksum, +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_debug, "debug"}, + {Opt_removed, "oldalloc"}, + {Opt_removed, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_noload, "norecovery"}, + {Opt_noload, "noload"}, + {Opt_removed, "nobh"}, + {Opt_removed, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, + {Opt_journal_checksum, "journal_checksum"}, + {Opt_nojournal_checksum, "nojournal_checksum"}, + {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_i_version, "i_version"}, + {Opt_dax, "dax"}, + {Opt_stripe, "stripe=%u"}, + {Opt_delalloc, "delalloc"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, + {Opt_nodelalloc, "nodelalloc"}, + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, + {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ + {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ + {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ + {Opt_err, NULL}, +}; + +static ext4_fsblk_t get_sb_block(void **data) +{ + ext4_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + + options += 3; + /* TODO: use simple_strtoll with >32bit ext4 */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + + return sb_block; +} + +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + int ret = -1; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -1; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; + } + qname = match_strdup(args); + if (!qname) { + ext4_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -1; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 1; + else + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + ext4_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sb, QUOTA); + return 1; +errout: + kfree(qname); + return ret; +} + +static int clear_qf_name(struct super_block *sb, int qtype) +{ + + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -1; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + +#define MOPT_SET 0x0001 +#define MOPT_CLEAR 0x0002 +#define MOPT_NOSUPPORT 0x0004 +#define MOPT_EXPLICIT 0x0008 +#define MOPT_CLEAR_ERR 0x0010 +#define MOPT_GTE0 0x0020 +#ifdef CONFIG_QUOTA +#define MOPT_Q 0 +#define MOPT_QFMT 0x0040 +#else +#define MOPT_Q MOPT_NOSUPPORT +#define MOPT_QFMT MOPT_NOSUPPORT +#endif +#define MOPT_DATAJ 0x0080 +#define MOPT_NO_EXT2 0x0100 +#define MOPT_NO_EXT3 0x0200 +#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) +#define MOPT_STRING 0x0400 + +static const struct mount_opts { + int token; + int mount_opt; + int flags; +} ext4_mount_opts[] = { + {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, + {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, + {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, + {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, + {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, + {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, + {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, + {Opt_delalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | + EXT4_MOUNT_JOURNAL_CHECKSUM), + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, + {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_CLEAR}, + {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, + {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, + {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, + {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, + {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_GTE0}, + {Opt_max_batch_time, 0, MOPT_GTE0}, + {Opt_min_batch_time, 0, MOPT_GTE0}, + {Opt_inode_readahead_blks, 0, MOPT_GTE0}, + {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, + {Opt_stripe, 0, MOPT_GTE0}, + {Opt_resuid, 0, MOPT_GTE0}, + {Opt_resgid, 0, MOPT_GTE0}, + {Opt_journal_dev, 0, MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_STRING}, + {Opt_journal_ioprio, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, + MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, + {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, + {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, +#else + {Opt_acl, 0, MOPT_NOSUPPORT}, + {Opt_noacl, 0, MOPT_NOSUPPORT}, +#endif + {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, + {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, + {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, + {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_offusrjquota, 0, MOPT_Q}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, + {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_max_dir_size_kb, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_err, 0, 0} +}; + +static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, + unsigned int *journal_ioprio, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; + kuid_t uid; + kgid_t gid; + int arg = 0; + +#ifdef CONFIG_QUOTA + if (token == Opt_usrjquota) + return set_qf_name(sb, USRQUOTA, &args[0]); + else if (token == Opt_grpjquota) + return set_qf_name(sb, GRPQUOTA, &args[0]); + else if (token == Opt_offusrjquota) + return clear_qf_name(sb, USRQUOTA); + else if (token == Opt_offgrpjquota) + return clear_qf_name(sb, GRPQUOTA); +#endif + switch (token) { + case Opt_noacl: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); + break; + case Opt_sb: + return 1; /* handled by get_sb_block() */ + case Opt_removed: + ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; + return 1; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + return 1; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + return 1; + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) + if (token == m->token) + break; + + if (m->token == Opt_err) { + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; + } + + if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext2", opt); + return -1; + } + if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext3", opt); + return -1; + } + + if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) + return -1; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags & MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks must be " + "0 or a power of 2 smaller than 2^31"); + return -1; + } + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (token == Opt_resuid) { + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); + return -1; + } + sbi->s_resuid = uid; + } else if (token == Opt_resgid) { + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); + return -1; + } + sbi->s_resgid = gid; + } else if (token == Opt_journal_dev) { + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + *journal_devnum = arg; + } else if (token == Opt_journal_path) { + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext4_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return -1; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext4_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return -1; + } + + journal_inode = d_inode(path.dentry); + if (!S_ISBLK(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return -1; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + } else if (token == Opt_journal_ioprio) { + if (arg > 7) { + ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" + " (must be 0-7)"); + return -1; + } + *journal_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + } else if (token == Opt_test_dummy_encryption) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif + } else if (m->flags & MOPT_DATAJ) { + if (is_remount) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { + ext4_msg(sb, KERN_ERR, + "Cannot change data mode on remount"); + return -1; + } + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_mount_opt |= m->mount_opt; + } +#ifdef CONFIG_QUOTA + } else if (m->flags & MOPT_QFMT) { + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled " + "quota options when quota turned on"); + return -1; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, + "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; + } + sbi->s_jquota_fmt = m->mount_opt; +#endif +#ifndef CONFIG_FS_DAX + } else if (token == Opt_dax) { + ext4_msg(sb, KERN_INFO, "dax option not supported"); + return -1; +#endif + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; + } + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; + else + sbi->s_mount_opt &= ~m->mount_opt; + } + return 1; +} + +static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, + int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + if (handle_mount_opt(sb, p, token, args, journal_devnum, + journal_ioprio, is_remount) < 0) + return 0; + } +#ifdef CONFIG_QUOTA + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { + ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " + "feature is enabled"); + return 0; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sb, USRQUOTA); + + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sb, GRPQUOTA); + + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext4_msg(sb, KERN_ERR, "old and new quota " + "format mixing"); + return 0; + } + + if (!sbi->s_jquota_fmt) { + ext4_msg(sb, KERN_ERR, "journaled quota format " + "not specified"); + return 0; + } + } +#endif + if (test_opt(sb, DIOREAD_NOLOCK)) { + int blocksize = + BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + + if (blocksize < PAGE_CACHE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + return 0; + } + } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " + "in data=ordered mode"); + return 0; + } + return 1; +} + +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); +#endif +} + +static const char *token2str(int token) +{ + const struct match_token *t; + + for (t = tokens; t->token != Opt_err; t++) + if (t->token == token && !strchr(t->pattern, '=')) + break; + return t->pattern; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, + int nodefs) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + const struct mount_opts *m; + char sep = nodefs ? '\n' : ','; + +#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) +#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) + + if (sbi->s_sb_block != 1) + SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + int want_set = m->flags & MOPT_SET; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || + (m->flags & MOPT_CLEAR_ERR)) + continue; + if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + continue; /* skip if same as the default */ + if ((want_set && + (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (sbi->s_mount_opt & m->mount_opt))) + continue; /* select Opt_noFoo vs Opt_Foo */ + SEQ_OPTS_PRINT("%s", token2str(m->token)); + } + + if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + SEQ_OPTS_PRINT("resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); + if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + SEQ_OPTS_PRINT("resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); + def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); + if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) + SEQ_OPTS_PUTS("errors=remount-ro"); + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + SEQ_OPTS_PUTS("errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + SEQ_OPTS_PUTS("errors=panic"); + if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); + if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) + SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); + if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) + SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (sb->s_flags & MS_I_VERSION) + SEQ_OPTS_PUTS("i_version"); + if (nodefs || sbi->s_stripe) + SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); + if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + SEQ_OPTS_PUTS("data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + SEQ_OPTS_PUTS("data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + SEQ_OPTS_PUTS("data=writeback"); + } + if (nodefs || + sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + SEQ_OPTS_PRINT("inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) + SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + if (nodefs || sbi->s_max_dir_size_kb) + SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); + + ext4_show_quota_options(seq, sb); + return 0; +} + +static int ext4_show_options(struct seq_file *seq, struct dentry *root) +{ + return _ext4_show_options(seq, root->d_sb, 0); +} + +static int options_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + int rc; + + seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); + rc = _ext4_show_options(seq, sb, 1); + seq_puts(seq, "\n"); + return rc; +} + +static int options_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, options_seq_show, PDE_DATA(inode)); +} + +static const struct file_operations ext4_seq_options_fops = { + .owner = THIS_MODULE, + .open = options_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, + int read_only) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { + ext4_msg(sb, KERN_ERR, "revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + goto done; + if (!(sbi->s_mount_state & EXT4_VALID_FS)) + ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if (sbi->s_mount_state & EXT4_ERROR_FS) + ext4_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + ext4_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext4_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); + if (!sbi->s_journal) + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext4_update_dynamic_rev(sb); + if (sbi->s_journal) + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + + ext4_commit_super(sb, 1); +done: + if (test_opt(sb, DEBUG)) + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", + sb->s_blocksize, + sbi->s_groups_count, + EXT4_BLOCKS_PER_GROUP(sb), + EXT4_INODES_PER_GROUP(sb), + sbi->s_mount_opt, sbi->s_mount_opt2); + + cleancache_init_fs(sb); + return res; +} + +int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups *new_groups; + int size; + + if (!sbi->s_log_groups_per_flex) + return 0; + + size = ext4_flex_group(sbi, ngroup - 1) + 1; + if (size <= sbi->s_flex_groups_allocated) + return 0; + + size = roundup_pow_of_two(size * sizeof(struct flex_groups)); + new_groups = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groups) { + ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", + size / (int) sizeof(struct flex_groups)); + return -ENOMEM; + } + + if (sbi->s_flex_groups) { + memcpy(new_groups, sbi->s_flex_groups, + (sbi->s_flex_groups_allocated * + sizeof(struct flex_groups))); + kvfree(sbi->s_flex_groups); + } + sbi->s_flex_groups = new_groups; + sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); + return 0; +} + +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + ext4_group_t flex_group; + int i, err; + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + + err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); + if (err) + goto failed; + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + + flex_group = ext4_flex_group(sbi, i); + atomic_add(ext4_free_inodes_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_inodes); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(ext4_used_dirs_count(sb, gdp), + &sbi->s_flex_groups[flex_group].used_dirs); + } + + return 1; +failed: + return 0; +} + +static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + int offset; + __u16 crc = 0; + __le32 le_group = cpu_to_le32(block_group); + + if (ext4_has_metadata_csum(sbi->s_sb)) { + /* Use new metadata_csum algorithm */ + __le16 save_csum; + __u32 csum32; + + save_csum = gdp->bg_checksum; + gdp->bg_checksum = 0; + csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + sizeof(le_group)); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, + sbi->s_desc_size); + gdp->bg_checksum = save_csum; + + crc = csum32 & 0xFFFF; + goto out; + } + + /* old crc16 code */ + if (!(sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) + return 0; + + offset = offsetof(struct ext4_group_desc, bg_checksum); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + +out: + return cpu_to_le16(crc); +} + +int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), + block_group, gdp))) + return 0; + + return 1; +} + +void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if (!ext4_has_group_desc_csum(sb)) + return; + gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); +} + +/* Called at mount-time, super-block is locked */ +static int ext4_check_descriptors(struct super_block *sb, + ext4_group_t *first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext4_fsblk_t last_block; + ext4_fsblk_t block_bitmap; + ext4_fsblk_t inode_bitmap; + ext4_fsblk_t inode_table; + int flexbg_flag = 0; + ext4_group_t i, grp = sbi->s_groups_count; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flexbg_flag = 1; + + ext4_debug("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + + if (i == sbi->s_groups_count - 1 || flexbg_flag) + last_block = ext4_blocks_count(sbi->s_es) - 1; + else + last_block = first_block + + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + + if ((grp == sbi->s_groups_count) && + !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + grp = i; + + block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap < first_block || block_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u not in group " + "(block %llu)!", i, block_bitmap); + return 0; + } + inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap < first_block || inode_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u not in group " + "(block %llu)!", i, inode_bitmap); + return 0; + } + inode_table = ext4_inode_table(sb, gdp); + if (inode_table < first_block || + inode_table + sbi->s_itb_per_group - 1 > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u not in group " + "(block %llu)!", i, inode_table); + return 0; + } + ext4_lock_group(sb, i); + if (!ext4_group_desc_csum_verify(sb, i, gdp)) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Checksum for group %u failed (%u!=%u)", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); + if (!(sb->s_flags & MS_RDONLY)) { + ext4_unlock_group(sb, i); + return 0; + } + } + ext4_unlock_group(sb, i); + if (!flexbg_flag) + first_block += EXT4_BLOCKS_PER_GROUP(sb); + } + if (NULL != first_not_zeroed) + *first_not_zeroed = grp; + return 1; +} + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext4_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + } + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + if (ret < 0) + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + dquot_initialize(inode); + if (inode->i_nlink) { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + mutex_lock(&inode->i_mutex); + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); + mutex_unlock(&inode->i_mutex); + nr_truncates++; + } else { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x) == 1) ? "" : "s" + + if (nr_orphans) + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +/* + * Maximal extent format file size. + * Resulting logical blkno at s_maxbytes must fit in our on-disk + * extent format containers, within a sector_t, and within i_blocks + * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, + * so that won't be a limiting factor. + * + * However there is other limiting factor. We do store extents in the form + * of starting block and length, hence the resulting length of the extent + * covering maximum file size must fit into on-disk format containers as + * well. Given that length is always by 1 unit bigger than max unit (because + * we count 0 as well) we have to lower the s_maxbytes by one fs block. + * + * Note, this does *not* consider any metadata overhead for vfs i_blocks. + */ +static loff_t ext4_max_size(int blkbits, int has_huge_files) +{ + loff_t res; + loff_t upper_limit = MAX_LFS_FILESIZE; + + /* small i_blocks in vfs inode? */ + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * CONFIG_LBDAF is not enabled implies the inode + * i_block represent total blocks in 512 bytes + * 32 == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (blkbits - 9); + upper_limit <<= blkbits; + } + + /* + * 32-bit extent-start container, ee_block. We lower the maxbytes + * by one fs block, so ee_len can cover the extent of maximum file + * size + */ + res = (1LL << 32) - 1; + res <<= blkbits; + + /* Sanity check against vm- & vfs- imposed limits */ + if (res > upper_limit) + res = upper_limit; + + return res; +} + +/* + * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^48 sector limit. + */ +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) +{ + loff_t res = EXT4_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + /* This is calculated to be the largest file size for a dense, block + * mapped file such that the file's total number of 512-byte sectors, + * including data and all indirect blocks, does not exceed (2^48 - 1). + * + * __u32 i_blocks_lo and _u16 i_blocks_high represent the total + * number of 512-byte sectors of the file. + */ + + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * !has_huge_files or CONFIG_LBDAF not enabled implies that + * the inode i_block field represents total file blocks in + * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + } else { + /* + * We use 48 bit ext4_inode i_blocks + * With EXT4_HUGE_FILE_FL set the i_blocks + * represent total number of blocks in + * file system block size + */ + upper_limit = (1LL << 48) - 1; + + } + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* tripple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext4_fsblk_t descriptor_loc(struct super_block *sb, + ext4_fsblk_t logical_sb_block, int nr) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return logical_sb_block + nr + 1; + bg = sbi->s_desc_per_block * nr; + if (ext4_bg_has_super(sb, bg)) + has_super = 1; + + /* + * If we have a meta_bg fs with 1k blocks, group 0's GDT is at + * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled + * on modern mke2fs or blksize > 1k on older mke2fs) then we must + * compensate. + */ + if (sb->s_blocksize == 1024 && nr == 0 && + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) + has_super++; + + return (has_super + ext4_group_first_block_no(sb, bg)); +} + +/** + * ext4_get_stripe_size: Get the stripe size. + * @sbi: In memory super block info + * + * If we have specified it via mount option, then + * use the mount option value. If the value specified at mount time is + * greater than the blocks per group use the super block value. + * If the super block value is greater than blocks per group return 0. + * Allocator needs it be less than blocks per group. + * + */ +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) +{ + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); + unsigned long stripe_width = + le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; + + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) + ret = sbi->s_stripe; + else if (stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; + + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. + */ + if (ret <= 1) + ret = 0; + + return ret; +} + +/* sysfs supprt */ + +struct ext4_attr { + struct attribute attr; + ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + const char *, size_t); + union { + int offset; + int deprecated_val; + } u; +}; + +static int parse_strtoull(const char *buf, + unsigned long long max, unsigned long long *value) +{ + int ret; + + ret = kstrtoull(skip_spaces(buf), 0, value); + if (!ret && *value > max) + ret = -EINVAL; + return ret; +} + +static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); +} + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1))); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (t && (!is_power_of_2(t) || t > 0x40000000)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t sbi_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t sbi_ui_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + *ui = t; + return count; +} + +static ssize_t es_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + + unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + + a->u.offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t reserved_clusters_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + int ret; + + if (parse_strtoull(buf, -1ULL, &val)) + return -EINVAL; + ret = ext4_reserve_clusters(sbi, val); + + return ret ? ret : count; +} + +static ssize_t trigger_test_error(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + int len = count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (len && buf[len-1] == '\n') + len--; + + if (len) + ext4_error(sbi->s_sb, "%.*s", len, buf); + return count; +} + +static ssize_t sbi_deprecated_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); +} + +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .u = { \ + .offset = offsetof(struct ext4_sb_info, _elname),\ + }, \ +} + +#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .u = { \ + .offset = offsetof(struct ext4_super_block, _elname), \ + }, \ +} + +#define EXT4_ATTR(name, mode, show, store) \ +static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) + +#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) +#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) +#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) + +#define EXT4_RO_ATTR_ES_UI(name, elname) \ + EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) +#define EXT4_RW_ATTR_SBI_UI(name, elname) \ + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) + +#define ATTR_LIST(name) &ext4_attr_##name.attr +#define EXT4_DEPRECATED_ATTR(_name, _val) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = sbi_deprecated_show, \ + .u = { \ + .deprecated_val = _val, \ + }, \ +} + +EXT4_RO_ATTR(delayed_allocation_blocks); +EXT4_RO_ATTR(session_write_kbytes); +EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RW_ATTR(reserved_clusters); +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); +EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); +EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), + ATTR_LIST(trigger_fs_error), + ATTR_LIST(err_ratelimit_interval_ms), + ATTR_LIST(err_ratelimit_burst), + ATTR_LIST(warning_ratelimit_interval_ms), + ATTR_LIST(warning_ratelimit_burst), + ATTR_LIST(msg_ratelimit_interval_ms), + ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(errors_count), + ATTR_LIST(first_error_time), + ATTR_LIST(last_error_time), + NULL, +}; + +/* Features this copy of ext4 supports */ +EXT4_INFO_ATTR(lazy_itable_init); +EXT4_INFO_ATTR(batched_discard); +EXT4_INFO_ATTR(meta_bg_resize); +EXT4_INFO_ATTR(encryption); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), + ATTR_LIST(encryption), + NULL, +}; + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + +static void ext4_feat_release(struct kobject *kobj) +{ + complete(&ext4_feat->f_kobj_unregister); +} + +static ssize_t ext4_feat_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "supported\n"); +} + +/* + * We can not use ext4_attr_show/store because it relies on the kobject + * being embedded in the ext4_sb_info structure which is definitely not + * true in this case. + */ +static const struct sysfs_ops ext4_feat_ops = { + .show = ext4_feat_show, + .store = NULL, +}; + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_feat_ops, + .release = ext4_feat_release, +}; + +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be. + */ +static int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (readonly) + return 1; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) { + ext4_msg(sb, KERN_INFO, "filesystem is read-only"); + sb->s_flags |= MS_RDONLY; + return 1; + } + + /* Check that feature set is OK for a read-write mount */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + /* + * Large file size enabled file system can only be mounted + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (sizeof(blkcnt_t) < sizeof(u64)) { + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " + "cannot be mounted RDWR without " + "CONFIG_LBDAF"); + return 0; + } + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_msg(sb, KERN_ERR, + "Can't support bigalloc feature without " + "extents feature\n"); + return 0; + } + +#ifndef CONFIG_QUOTA + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } +#endif /* CONFIG_QUOTA */ + return 1; +} + +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(unsigned long arg) +{ + struct super_block *sb = (struct super_block *) arg; + struct ext4_sb_info *sbi; + struct ext4_super_block *es; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (es->s_error_count) + /* fsck newer than v1.41.13 is needed to clean this condition. */ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk("\n"); + } + if (es->s_last_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk("\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + +/* Find next suitable group and run ext4_init_inode_table */ +static int ext4_run_li_request(struct ext4_li_request *elr) +{ + struct ext4_group_desc *gdp = NULL; + ext4_group_t group, ngroups; + struct super_block *sb; + unsigned long timeout = 0; + int ret = 0; + + sb = elr->lr_super; + ngroups = EXT4_SB(sb)->s_groups_count; + + sb_start_write(sb); + for (group = elr->lr_next_group; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; + break; + } + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + if (group >= ngroups) + ret = 1; + + if (!ret) { + timeout = jiffies; + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 0 : 1); + if (elr->lr_timeout == 0) { + timeout = (jiffies - timeout) * + elr->lr_sbi->s_li_wait_mult; + elr->lr_timeout = timeout; + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; + } + sb_end_write(sb); + + return ret; +} + +/* + * Remove lr_request from the list_request and free the + * request structure. Should be called with li_list_mtx held + */ +static void ext4_remove_li_request(struct ext4_li_request *elr) +{ + struct ext4_sb_info *sbi; + + if (!elr) + return; + + sbi = elr->lr_sbi; + + list_del(&elr->lr_request); + sbi->s_li_request = NULL; + kfree(elr); +} + +static void ext4_unregister_li_request(struct super_block *sb) +{ + mutex_lock(&ext4_li_mtx); + if (!ext4_li_info) { + mutex_unlock(&ext4_li_mtx); + return; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + ext4_remove_li_request(EXT4_SB(sb)->s_li_request); + mutex_unlock(&ext4_li_info->li_list_mtx); + mutex_unlock(&ext4_li_mtx); +} + +static struct task_struct *ext4_lazyinit_task; + +/* + * This is the function where ext4lazyinit thread lives. It walks + * through the request list searching for next scheduled filesystem. + * When such a fs is found, run the lazy initialization request + * (ext4_rn_li_request) and keep track of the time spend in this + * function. Based on that time we compute next schedule time of + * the request. When walking through the list is complete, compute + * next waking time and put itself into sleep. + */ +static int ext4_lazyinit_thread(void *arg) +{ + struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct list_head *pos, *n; + struct ext4_li_request *elr; + unsigned long next_wakeup, cur; + + BUG_ON(NULL == eli); + +cont_thread: + while (true) { + next_wakeup = MAX_JIFFY_OFFSET; + + mutex_lock(&eli->li_list_mtx); + if (list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + goto exit_thread; + } + + list_for_each_safe(pos, n, &eli->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } + } + + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + } + mutex_unlock(&eli->li_list_mtx); + + try_to_freeze(); + + cur = jiffies; + if ((time_after_eq(cur, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { + cond_resched(); + continue; + } + + schedule_timeout_interruptible(next_wakeup - cur); + + if (kthread_should_stop()) { + ext4_clear_request_list(); + goto exit_thread; + } + } + +exit_thread: + /* + * It looks like the request list is empty, but we need + * to check it under the li_list_mtx lock, to prevent any + * additions into it, and of course we should lock ext4_li_mtx + * to atomically free the list and ext4_li_info, because at + * this point another ext4 filesystem could be registering + * new one. + */ + mutex_lock(&ext4_li_mtx); + mutex_lock(&eli->li_list_mtx); + if (!list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + mutex_unlock(&ext4_li_mtx); + goto cont_thread; + } + mutex_unlock(&eli->li_list_mtx); + kfree(ext4_li_info); + ext4_li_info = NULL; + mutex_unlock(&ext4_li_mtx); + + return 0; +} + +static void ext4_clear_request_list(void) +{ + struct list_head *pos, *n; + struct ext4_li_request *elr; + + mutex_lock(&ext4_li_info->li_list_mtx); + list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + ext4_remove_li_request(elr); + } + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +static int ext4_run_lazyinit_thread(void) +{ + ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, + ext4_li_info, "ext4lazyinit"); + if (IS_ERR(ext4_lazyinit_task)) { + int err = PTR_ERR(ext4_lazyinit_task); + ext4_clear_request_list(); + kfree(ext4_li_info); + ext4_li_info = NULL; + printk(KERN_CRIT "EXT4-fs: error %d creating inode table " + "initialization thread\n", + err); + return err; + } + ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + return 0; +} + +/* + * Check whether it make sense to run itable init. thread or not. + * If there is at least one uninitialized inode table, return + * corresponding group number, else the loop goes through all + * groups and return total number of groups. + */ +static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) +{ + ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *gdp = NULL; + + for (group = 0; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) + continue; + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + return group; +} + +static int ext4_li_info_new(void) +{ + struct ext4_lazy_init *eli = NULL; + + eli = kzalloc(sizeof(*eli), GFP_KERNEL); + if (!eli) + return -ENOMEM; + + INIT_LIST_HEAD(&eli->li_request_list); + mutex_init(&eli->li_list_mtx); + + eli->li_state |= EXT4_LAZYINIT_QUIT; + + ext4_li_info = eli; + + return 0; +} + +static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); + if (!elr) + return NULL; + + elr->lr_super = sb; + elr->lr_sbi = sbi; + elr->lr_next_group = start; + + /* + * Randomize first schedule time of the request to + * spread the inode table initialization requests + * better. + */ + elr->lr_next_sched = jiffies + (prandom_u32() % + (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + return elr; +} + +int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr = NULL; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int ret = 0; + + mutex_lock(&ext4_li_mtx); + if (sbi->s_li_request != NULL) { + /* + * Reset timeout so it can be computed again, because + * s_li_wait_mult might have changed. + */ + sbi->s_li_request->lr_timeout = 0; + goto out; + } + + if (first_not_zeroed == ngroups || + (sb->s_flags & MS_RDONLY) || + !test_opt(sb, INIT_INODE_TABLE)) + goto out; + + elr = ext4_li_request_new(sb, first_not_zeroed); + if (!elr) { + ret = -ENOMEM; + goto out; + } + + if (NULL == ext4_li_info) { + ret = ext4_li_info_new(); + if (ret) + goto out; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + list_add(&elr->lr_request, &ext4_li_info->li_request_list); + mutex_unlock(&ext4_li_info->li_list_mtx); + + sbi->s_li_request = elr; + /* + * set elr to NULL here since it has been inserted to + * the request_list and the removal and free of it is + * handled by ext4_clear_request_list from now on. + */ + elr = NULL; + + if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { + ret = ext4_run_lazyinit_thread(); + if (ret) + goto out; + } +out: + mutex_unlock(&ext4_li_mtx); + if (ret) + kfree(elr); + return ret; +} + +/* + * We do not need to lock anything since this is called on + * module unload. + */ +static void ext4_destroy_lazyinit_thread(void) +{ + /* + * If thread exited earlier + * there's nothing to be done. + */ + if (!ext4_li_info || !ext4_lazyinit_task) + return; + + kthread_stop(ext4_lazyinit_task); +} + +static int set_journal_csum_feature_set(struct super_block *sb) +{ + int ret = 1; + int compat, incompat; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (ext4_has_metadata_csum(sb)) { + /* journal checksum v3 */ + compat = 0; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; + } else { + /* journal checksum v1 */ + compat = JBD2_FEATURE_COMPAT_CHECKSUM; + incompat = 0; + } + + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_CSUM_V3 | + JBD2_FEATURE_INCOMPAT_CSUM_V2); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + incompat); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + incompat); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } + + return ret; +} + +/* + * Note: calculating the overhead so we can be compatible with + * historical BSD practice is quite difficult in the face of + * clusters/bigalloc. This is because multiple metadata blocks from + * different block group can end up in the same allocation cluster. + * Calculating the exact overhead in the face of clustered allocation + * requires either O(all block bitmaps) in memory or O(number of block + * groups**2) in time. We will still calculate the superblock for + * older file systems --- and if we come across with a bigalloc file + * system with zero in s_overhead_clusters the estimate will be close to + * correct especially for very large cluster sizes --- but for newer + * file systems, it's better to calculate this figure once at mkfs + * time, and store it in the superblock. If the superblock value is + * present (even for non-bigalloc file systems), we will use it. + */ +static int count_overhead(struct super_block *sb, ext4_group_t grp, + char *buf) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_fsblk_t first_block, last_block, b; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int s, j, count = 0; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + sbi->s_itb_per_group + 2); + + first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + + (grp * EXT4_BLOCKS_PER_GROUP(sb)); + last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + b = ext4_block_bitmap(sb, gdp); + if (b >= first_block && b <= last_block) { + ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); + count++; + } + b = ext4_inode_bitmap(sb, gdp); + if (b >= first_block && b <= last_block) { + ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); + count++; + } + b = ext4_inode_table(sb, gdp); + if (b >= first_block && b + sbi->s_itb_per_group <= last_block) + for (j = 0; j < sbi->s_itb_per_group; j++, b++) { + int c = EXT4_B2C(sbi, b - first_block); + ext4_set_bit(c, buf); + count++; + } + if (i != grp) + continue; + s = 0; + if (ext4_bg_has_super(sb, grp)) { + ext4_set_bit(s++, buf); + count++; + } + for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { + ext4_set_bit(EXT4_B2C(sbi, s++), buf); + count++; + } + } + if (!count) + return 0; + return EXT4_CLUSTERS_PER_GROUP(sb) - + ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); +} + +/* + * Compute the overhead and stash it in sbi->s_overhead + */ +int ext4_calculate_overhead(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + ext4_fsblk_t overhead = 0; + char *buf = (char *) get_zeroed_page(GFP_NOFS); + + if (!buf) + return -ENOMEM; + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are overhead + */ + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); + + /* + * Add the overhead found in each block group + */ + for (i = 0; i < ngroups; i++) { + int blks; + + blks = count_overhead(sb, i, buf); + overhead += blks; + if (blks) + memset(buf, 0, PAGE_SIZE); + cond_resched(); + } + /* Add the internal journal blocks as well */ + if (sbi->s_journal && !sbi->journal_bdev) + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + + sbi->s_overhead = overhead; + smp_wmb(); + free_page((unsigned long) buf); + return 0; +} + + +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) +{ + ext4_fsblk_t resv_clusters; + + /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return 0; + /* + * By default we reserve 2% or 4096 clusters, whichever is smaller. + * This should cover the situations where we can not afford to run + * out of space like for example punch hole, or converting + * unwritten extents in delalloc path. In most cases such + * allocation would require 1, or 2 blocks, higher numbers are + * very rare. + */ + resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> + EXT4_SB(sb)->s_cluster_bits; + + do_div(resv_clusters, 50); + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); + + return resv_clusters; +} + + +static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) +{ + ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits; + + if (count >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, count); + return 0; +} + +static int ext4_fill_super(struct super_block *sb, void *data, int silent) +{ + char *orig_data = kstrdup(data, GFP_KERNEL); + struct buffer_head *bh; + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi; + ext4_fsblk_t block; + ext4_fsblk_t sb_block = get_sb_block(&data); + ext4_fsblk_t logical_sb_block; + unsigned long offset = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + char *cp; + const char *descr; + int ret = -ENOMEM; + int blocksize, clustersize; + unsigned int db_count; + unsigned int i; + int needs_recovery, has_huge_files, has_bigalloc; + __u64 blocks_count; + int err = 0; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto out_free_orig; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + goto out_free_orig; + } + sb->s_fs_info = sbi; + sbi->s_sb = sb; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; + sbi->s_sb_block = sb_block; + if (sb->s_bdev->bd_part) + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev->bd_part, sectors[1]); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Modes of operations for file and directory encryption. */ + sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID; +#endif + + /* Cleanup superblock name */ + for (cp = sb->s_id; (cp = strchr(cp, '/'));) + *cp = '!'; + + /* -EINVAL is default */ + ret = -EINVAL; + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); + if (!blocksize) { + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } + + /* + * The ext4 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT4_MIN_BLOCK_SIZE) { + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + } else { + logical_sb_block = sb_block; + } + + if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { + ext4_msg(sb, KERN_ERR, "unable to read superblock"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext4 macro-instructions depend on its value + */ + es = (struct ext4_super_block *) (bh->b_data + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT4_SUPER_MAGIC) + goto cantfind_ext4; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + + /* Warn if metadata_csum and gdt_csum are both set. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + ext4_warning(sb, "metadata_csum and uninit_bg are " + "redundant flags; please run fsck."); + + /* Check for a known checksum algorithm */ + if (!ext4_verify_csum_type(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "unknown checksum algorithm."); + silent = 1; + goto cantfind_ext4; + } + + /* Load the checksum driver */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + ret = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto failed_mount; + } + } + + /* Check superblock checksum */ + if (!ext4_superblock_csum_verify(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "invalid superblock checksum. Run e2fsck?"); + silent = 1; + goto cantfind_ext4; + } + + /* Precompute checksum seed for all metadata */ + if (ext4_has_metadata_csum(sb)) + sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sizeof(es->s_uuid)); + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sb, INIT_INODE_TABLE); + if (def_mount_opts & EXT4_DEFM_DEBUG) + set_opt(sb, DEBUG); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) + set_opt(sb, GRPID); + if (def_mount_opts & EXT4_DEFM_UID16) + set_opt(sb, NO_UID32); + /* xattr user namespace & acls are now defaulted on */ + set_opt(sb, XATTR_USER); +#ifdef CONFIG_EXT4_FS_POSIX_ACL + set_opt(sb, POSIX_ACL); +#endif + /* don't forget to enable journal_csum when metadata_csum is enabled. */ + if (ext4_has_metadata_csum(sb)) + set_opt(sb, JOURNAL_CHECKSUM); + + if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) + set_opt(sb, JOURNAL_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) + set_opt(sb, ORDERED_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) + set_opt(sb, WRITEBACK_DATA); + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) + set_opt(sb, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) + set_opt(sb, ERRORS_CONT); + else + set_opt(sb, ERRORS_RO); + /* block_validity enabled by default; disable with noblock_validity */ + set_opt(sb, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sb, DISCARD); + + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sb, BARRIER); + + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) + set_opt(sb, DELALLOC); + + /* + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, + &journal_devnum, &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + sbi->s_es->s_mount_opts); + } + sbi->s_def_mount_opt = sbi->s_mount_opt; + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, 0)) + goto failed_mount; + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " + "with data=journal disables delayed " + "allocation and O_DIRECT support!\n"); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + goto failed_mount; + } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + goto failed_mount; + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && + (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext4_msg(sb, KERN_WARNING, + "feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { + set_opt2(sb, HURD_COMPAT); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "The Hurd can't support 64-bit file systems"); + goto failed_mount; + } + } + + if (IS_EXT2_SB(sb)) { + if (ext2_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext2 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + if (IS_EXT3_SB(sb)) { + if (ext3_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext3 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) + goto failed_mount; + + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (blocksize < EXT4_MIN_BLOCK_SIZE || + blocksize > EXT4_MAX_BLOCK_SIZE) { + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d", blocksize); + goto failed_mount; + } + + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext4_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) && + es->s_encryption_level) { + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", + es->s_encryption_level); + goto failed_mount; + } + + if (sb->s_blocksize != blocksize) { + /* Validate the filesystem blocksize */ + if (!sb_set_blocksize(sb, blocksize)) { + ext4_msg(sb, KERN_ERR, "bad block size %d", + blocksize); + goto failed_mount; + } + + brelse(bh); + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + bh = sb_bread_unmovable(sb, logical_sb_block); + if (!bh) { + ext4_msg(sb, KERN_ERR, + "Can't read superblock on 2nd try"); + goto failed_mount; + } + es = (struct ext4_super_block *)(bh->b_data + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { + ext4_msg(sb, KERN_ERR, + "Magic mismatch, very weird!"); + goto failed_mount; + } + } + + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) + sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); + } + + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", + sbi->s_desc_size); + goto failed_mount; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) + goto cantfind_ext4; + + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext4; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + } + + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC); + if (has_bigalloc) { + if (clustersize < blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%d)", clustersize, blocksize); + goto failed_mount; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + goto failed_mount; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + goto failed_mount; + } + } else { + if (clustersize != blocksize) { + ext4_warning(sb, "fragment/cluster size (%d) != " + "block size (%d)", clustersize, + blocksize); + clustersize = blocksize; + } + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; + } + sbi->s_cluster_ratio = clustersize / blocksize; + + if (sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + /* Do we have standard group size of clustersize * 8 blocks ? */ + if (sbi->s_blocks_per_group == clustersize << 3) + set_opt2(sb, STD_GROUP_SIZE); + + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); + if (sizeof(sector_t) < 8) + ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + goto failed_mount; + } + + if (EXT4_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext4; + + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); + goto failed_mount; + } + blocks_count = (ext4_blocks_count(es) - + le32_to_cpu(es->s_first_data_block) + + EXT4_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + ext4_msg(sb, KERN_WARNING, "groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } + sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + sbi->s_group_desc = ext4_kvmalloc(db_count * + sizeof(struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + ret = -ENOMEM; + goto failed_mount; + } + + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) + proc_create_data("options", S_IRUGO, sbi->s_proc, + &ext4_seq_options_fops, sb); + + bgl_lock_init(sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); + if (!sbi->s_group_desc[i]) { + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + goto failed_mount2; + } + + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + setup_timer(&sbi->s_err_report, print_daily_error_info, + (unsigned long) sb); + + /* Register extent status tree shrinker */ + if (ext4_es_register_shrinker(sbi)) + goto failed_mount3; + + sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_extent_max_zeroout_kb = 32; + + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext4_sops; + sb->s_export_op = &ext4_export_ops; + sb->s_xattr = ext4_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->dq_op = &ext4_quota_operations; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &ext4_qctl_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount3a; + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext4_load_journal(sb, es, journal_devnum)) + goto failed_mount3a; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + ext4_msg(sb, KERN_ERR, "required journal recovery " + "suppressed and not mounted read-only"); + goto failed_mount_wq; + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_journal = NULL; + needs_recovery = 0; + goto no_journal; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); + goto failed_mount_wq; + } + + if (!set_journal_csum_feature_set(sb)) { + ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " + "feature set"); + goto failed_mount_wq; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + * capabilities: ORDERED_DATA if the journal can + * cope, else JOURNAL_DATA + */ + if (jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) + set_opt(sb, ORDERED_DATA); + else + set_opt(sb, JOURNAL_DATA); + break; + + case EXT4_MOUNT_ORDERED_DATA: + case EXT4_MOUNT_WRITEBACK_DATA: + if (!jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); + goto failed_mount_wq; + } + default: + break; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + +no_journal: + if (ext4_mballoc_ready) { + sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; + } + } + + if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) && + !(sb->s_flags & MS_RDONLY) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); + ext4_commit_super(sb, 1); + } + + /* + * Get the # of file system overhead blocks from the + * superblock if present. + */ + if (es->s_overhead_clusters) + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + else { + err = ext4_calculate_overhead(sb); + if (err) + goto failed_mount_wq; + } + + /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ + EXT4_SB(sb)->rsv_conversion_wq = + alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->rsv_conversion_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + ret = -ENOMEM; + goto failed_mount4; + } + + /* + * The jbd2_journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. + */ + + root = ext4_iget(sb, EXT4_ROOT_INO); + if (IS_ERR(root)) { + ext4_msg(sb, KERN_ERR, "get root inode failed"); + ret = PTR_ERR(root); + root = NULL; + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + iput(root); + goto failed_mount4; + } + sb->s_root = d_make_root(root); + if (!sb->s_root) { + ext4_msg(sb, KERN_ERR, "get root dentry failed"); + ret = -ENOMEM; + goto failed_mount4; + } + + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + ext4_msg(sb, KERN_INFO, "required extra inode space not" + "available"); + } + + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " + "reserved pool", ext4_calculate_resv_clusters(sb)); + goto failed_mount4a; + } + + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4a; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount5; + } + + block = ext4_count_free_clusters(sb); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, block)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); + if (!err) { + unsigned long freei = ext4_count_free_inodes(sb); + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); + } + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb), GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount6; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + goto failed_mount6; + } + + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount6; + + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, + "%s", sb->s_id); + if (err) + goto failed_mount7; + +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + !(sb->s_flags & MS_RDONLY)) { + err = ext4_enable_quotas(sb); + if (err) + goto failed_mount8; + } +#endif /* CONFIG_QUOTA */ + + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; + ext4_orphan_cleanup(sb, es); + EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + if (needs_recovery) { + ext4_msg(sb, KERN_INFO, "recovery complete"); + ext4_mark_recovery_complete(sb, es); + } + if (EXT4_SB(sb)->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + if (test_opt(sb, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + + /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ + ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + + kfree(orig_data); + return 0; + +cantfind_ext4: + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto failed_mount; + +#ifdef CONFIG_QUOTA +failed_mount8: + kobject_del(&sbi->s_kobj); +#endif +failed_mount7: + ext4_unregister_li_request(sb); +failed_mount6: + ext4_mb_release(sb); + if (sbi->s_flex_groups) + kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); +failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: + dput(sb->s_root); + sb->s_root = NULL; +failed_mount4: + ext4_msg(sb, KERN_ERR, "mount failed"); + if (EXT4_SB(sb)->rsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); +failed_mount_wq: + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } +failed_mount3a: + ext4_es_unregister_shrinker(sbi); +failed_mount3: + del_timer_sync(&sbi->s_err_report); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kvfree(sbi->s_group_desc); +failed_mount: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } +#ifdef CONFIG_QUOTA + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext4_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +out_free_orig: + kfree(orig_data); + return err ? err : ret; +} + +/* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; + + write_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; + write_unlock(&journal->j_state_lock); +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. */ + + journal_inode = ext4_iget(sb, journal_inum); + if (IS_ERR(journal_inode)) { + ext4_msg(sb, KERN_ERR, "no journal found"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + ext4_msg(sb, KERN_ERR, "journal inode is deleted"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); + if (!S_ISREG(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "invalid journal inode"); + iput(journal_inode); + return NULL; + } + + journal = jbd2_journal_init_inode(journal_inode); + if (!journal) { + ext4_msg(sb, KERN_ERR, "Could not load journal inode"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext4_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext4_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head *bh; + journal_t *journal; + ext4_fsblk_t start; + ext4_fsblk_t len; + int hblock, blocksize; + ext4_fsblk_t sb_block; + unsigned long offset; + struct ext4_super_block *es; + struct block_device *bdev; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + bdev = ext4_blkdev_get(j_dev, sb); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = bdev_logical_block_size(bdev); + if (blocksize < hblock) { + ext4_msg(sb, KERN_ERR, + "blocksize too small for journal device"); + goto out_bdev; + } + + sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; + offset = EXT4_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + ext4_msg(sb, KERN_ERR, "couldn't read superblock of " + "external journal"); + goto out_bdev; + } + + es = (struct ext4_super_block *) (bh->b_data + offset); + if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { + ext4_msg(sb, KERN_ERR, "external journal has " + "bad superblock"); + brelse(bh); + goto out_bdev; + } + + if ((le32_to_cpu(es->s_feature_ro_compat) & + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + es->s_checksum != ext4_superblock_csum(sb, es)) { + ext4_msg(sb, KERN_ERR, "external journal has " + "corrupt superblock"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + ext4_msg(sb, KERN_ERR, "journal UUID does not match"); + brelse(bh); + goto out_bdev; + } + + len = ext4_blocks_count(es); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = jbd2_journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + ext4_msg(sb, KERN_ERR, "failed to create device journal"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + ext4_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + ext4_msg(sb, KERN_ERR, "External journal has more than one " + "user (unsupported) - %d", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT4_SB(sb)->journal_bdev = bdev; + ext4_init_journal_params(sb, journal); + return journal; + +out_journal: + jbd2_journal_destroy(journal); +out_bdev: + ext4_blkdev_put(bdev); + return NULL; +} + +static int ext4_load_journal(struct super_block *sb, + struct ext4_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + ext4_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + ext4_msg(sb, KERN_INFO, "INFO: recovery " + "required on readonly filesystem"); + if (really_read_only) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, cannot proceed"); + return -EROFS; + } + ext4_msg(sb, KERN_INFO, "write access will " + "be enabled during recovery"); + } + } + + if (journal_inum && journal_dev) { + ext4_msg(sb, KERN_ERR, "filesystem has both journal " + "and inode journals!"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext4_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext4_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!(journal->j_flags & JBD2_BARRIER)) + ext4_msg(sb, KERN_INFO, "barriers disabled"); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) + err = jbd2_journal_wipe(journal, !really_read_only); + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); + err = jbd2_journal_load(journal); + if (save) + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + kfree(save); + } + + if (err) { + ext4_msg(sb, KERN_ERR, "error loading journal"); + jbd2_journal_destroy(journal); + return err; + } + + EXT4_SB(sb)->s_journal = journal; + ext4_clear_journal_err(sb, es); + + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + + /* Make sure we flush the recovery flag to disk. */ + ext4_commit_super(sb, 1); + } + + return 0; +} + +static int ext4_commit_super(struct super_block *sb, int sync) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; + + if (!sbh) + return error; + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); + if (sb->s_bdev->bd_part) + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); + else + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) + ext4_free_blocks_count_set(es, + EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeclusters_counter))); + if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); + BUFFER_TRACE(sbh, "marking dirty"); + ext4_superblock_csum_set(sb); + mark_buffer_dirty(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; +} + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + BUG_ON(journal != NULL); + return; + } + jbd2_journal_lock_updates(journal); + if (jbd2_journal_flush(journal) < 0) + goto out; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + } + +out: + jbd2_journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + journal = EXT4_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext4_error() or ext4_abort() + */ + + j_errno = jbd2_journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext4_decode_error(sb, j_errno, nbuf); + ext4_warning(sb, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext4_warning(sb, "Marking fs in need of filesystem check."); + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, 1); + + jbd2_journal_clear_err(journal); + jbd2_journal_update_sb_errno(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext4_force_commit(struct super_block *sb) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + return ext4_journal_force_commit(journal); +} + +static int ext4_sync_fs(struct super_block *sb, int wait) +{ + int ret = 0; + tid_t target; + bool needs_barrier = false; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(sbi->rsv_conversion_wq); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); + /* + * Data writeback is possible w/o journal transaction, so barrier must + * being sent at the end of the function. But we can skip it if + * transaction_commit will do it for us. + */ + if (sbi->s_journal) { + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; + + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) + ret = jbd2_log_wait_commit(sbi->s_journal, + target); + } + } else if (wait && test_opt(sb, BARRIER)) + needs_barrier = true; + if (needs_barrier) { + int err; + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } + + return ret; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + * + * Note that only this function cannot bring a filesystem to be in a clean + * state independently. It relies on upper layer to stop all data & metadata + * modifications. + */ +static int ext4_freeze(struct super_block *sb) +{ + int error = 0; + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + + if (journal) { + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); + + /* + * Don't clear the needs_recovery flag if we failed to + * flush the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + } + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); +out: + if (journal) + /* we rely on upper layer to stop further updates */ + jbd2_journal_unlock_updates(journal); + return error; +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static int ext4_unfreeze(struct super_block *sb) +{ + if (sb->s_flags & MS_RDONLY) + return 0; + + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + return 0; +} + +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + unsigned long s_mount_opt2; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[EXT4_MAXQUOTAS]; +#endif +}; + +static int ext4_remount(struct super_block *sb, int *flags, char *data) +{ + struct ext4_super_block *es; + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long old_sb_flags; + struct ext4_mount_options old_opts; + int enable_quota = 0; + ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + int err = 0; +#ifdef CONFIG_QUOTA + int i, j; +#endif + char *orig_data = kstrdup(data, GFP_KERNEL); + + /* Store the original options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < EXT4_MAXQUOTAS; i++) + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + kfree(orig_data); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; +#endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { + err = -EINVAL; + goto restore_opts; + } + + if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ + test_opt(sb, JOURNAL_CHECKSUM)) { + ext4_msg(sb, KERN_ERR, "changing journal_checksum " + "during remount not supported; ignoring"); + sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; + } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + ext4_msg(sb, KERN_WARNING, "warning: refusing change of " + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT4_MOUNT_DAX; + } + + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_abort(sb, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + if (sbi->s_journal) { + ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { + err = -EROFS; + goto restore_opts; + } + + if (*flags & MS_RDONLY) { + err = sync_filesystem(sb); + if (err < 0) + goto restore_opts; + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && + (sbi->s_mount_state & EXT4_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + if (sbi->s_journal) + ext4_mark_recovery_complete(sb, es); + } else { + /* Make sure we can mount this feature set readwrite */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_READONLY) || + !ext4_feature_set_ok(sb, 0)) { + err = -EROFS; + goto restore_opts; + } + /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to remount r/w. + */ + for (g = 0; g < sbi->s_groups_count; g++) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, g, NULL); + + if (!ext4_group_desc_csum_verify(sb, g, gdp)) { + ext4_msg(sb, KERN_ERR, + "ext4_remount: Checksum for group %u failed (%u!=%u)", + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + le16_to_cpu(gdp->bg_checksum)); + err = -EINVAL; + goto restore_opts; + } + } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. + */ + if (es->s_last_orphan) { + ext4_msg(sb, KERN_WARNING, "Couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead"); + err = -EINVAL; + goto restore_opts; + } + + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + if (sbi->s_journal) + ext4_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if (!ext4_setup_super(sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } + enable_quota = 1; + } + } + + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + + ext4_setup_system_zone(sb); + if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(old_opts.s_qf_names[i]); + if (enable_quota) { + if (sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); + else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_QUOTA)) { + err = ext4_enable_quotas(sb); + if (err) + goto restore_opts; + } + } +#endif + + *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); + kfree(orig_data); + return 0; + +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + kfree(orig_data); + return err; +} + +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t overhead = 0, resv_blocks; + u64 fsid; + s64 bfree; + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); + + if (!test_opt(sb, MINIX_DF)) + overhead = sbi->s_overhead; + + buf->f_type = EXT4_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); + /* prevent underflow in case that few free space is available */ + buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); + buf->f_bavail = buf->f_bfree - + (ext4_r_blocks_count(es) + resv_blocks); + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT4_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction + * before quota file is locked for write. Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext4_create() quota_sync() + * jbd2_journal_start() write_dquot() + * dquot_initialize() down(dqio_mutex) + * down(dqio_mutex) jbd2_journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; +} + +static int ext4_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_mark_dquot_dirty(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* Are we journaling quotas? */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext4_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext4_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + EXT4_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->dentry->d_sb != sb) + return -EXDEV; + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not in fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext4_msg(sb, KERN_WARNING, + "Quota file not on filesystem root. " + "Journaled quota will not work"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (EXT4_SB(sb)->s_journal && + ext4_should_journal_data(d_inode(path->dentry))) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err) + return err; + } + + return dquot_quota_on(sb, type, format_id, path); +} + +static int ext4_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + int err; + struct inode *qf_inode; + unsigned long qf_inums[EXT4_MAXQUOTAS] = { + le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + }; + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)); + + if (!qf_inums[type]) + return -EPERM; + + qf_inode = ext4_iget(sb, qf_inums[type]); + if (IS_ERR(qf_inode)) { + ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + + return err; +} + +/* Enable usage tracking for all quota types. */ +static int ext4_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inums[EXT4_MAXQUOTAS] = { + le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + for (type = 0; type < EXT4_MAXQUOTAS; type++) { + if (qf_inums[type]) { + err = ext4_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED); + if (err) { + ext4_warning(sb, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "e2fsck to fix.", type, err); + return err; + } + } + } + return 0; +} + +static int ext4_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + handle_t *handle; + + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) + sync_filesystem(sb); + + if (!inode) + goto out; + + /* Update modification times of quota files when userspace can + * start looking at them */ + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); + if (IS_ERR(handle)) + goto out; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + +out: + return dquot_quota_off(sb, type); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + bh = ext4_bread(NULL, inode, blk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err, offset = off & (sb->s_blocksize - 1); + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (EXT4_SB(sb)->s_journal && !handle) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + bh = ext4_bread(handle, inode, blk, 1); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) + goto out; + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + return err; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); +out: + if (inode->i_size < off + len) { + i_size_write(inode, off + len); + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + } + return len; +} + +#endif + +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); +} + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) +{ + unregister_filesystem(&ext2_fs_type); +} + +static inline int ext2_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} + +static inline int ext3_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + return 0; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +#else +static inline void register_as_ext3(void) { } +static inline void unregister_as_ext3(void) { } +static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, + .name = "ext4", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext4"); + +static int __init ext4_init_feat_adverts(void) +{ + struct ext4_features *ef; + int ret = -ENOMEM; + + ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); + if (!ef) + goto out; + + ef->f_kobj.kset = ext4_kset; + init_completion(&ef->f_kobj_unregister); + ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, + "features"); + if (ret) { + kfree(ef); + goto out; + } + + ext4_feat = ef; + ret = 0; +out: + return ret; +} + +static void ext4_exit_feat_adverts(void) +{ + kobject_put(&ext4_feat->f_kobj); + wait_for_completion(&ext4_feat->f_kobj_unregister); + kfree(ext4_feat); +} + +/* Shared across all ext4 file systems */ +wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +static int __init ext4_init_fs(void) +{ + int i, err; + + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); + + /* Build-time check for flags consistency */ + ext4_check_flag_values(); + + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { + mutex_init(&ext4__aio_mutex[i]); + init_waitqueue_head(&ext4__ioend_wq[i]); + } + + err = ext4_init_es(); + if (err) + return err; + + err = ext4_init_pageio(); + if (err) + goto out7; + + err = ext4_init_system_zone(); + if (err) + goto out6; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) { + err = -ENOMEM; + goto out5; + } + ext4_proc_root = proc_mkdir("fs/ext4", NULL); + + err = ext4_init_feat_adverts(); + if (err) + goto out4; + + err = ext4_init_mballoc(); + if (err) + goto out2; + else + ext4_mballoc_ready = 1; + err = init_inodecache(); + if (err) + goto out1; + register_as_ext3(); + register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + return 0; +out: + unregister_as_ext2(); + unregister_as_ext3(); + destroy_inodecache(); +out1: + ext4_mballoc_ready = 0; + ext4_exit_mballoc(); +out2: + ext4_exit_feat_adverts(); +out4: + if (ext4_proc_root) + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); +out5: + ext4_exit_system_zone(); +out6: + ext4_exit_pageio(); +out7: + ext4_exit_es(); + + return err; +} + +static void __exit ext4_exit_fs(void) +{ + ext4_destroy_lazyinit_thread(); + unregister_as_ext2(); + unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + destroy_inodecache(); + ext4_exit_mballoc(); + ext4_exit_feat_adverts(); + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); + ext4_exit_system_zone(); + ext4_exit_pageio(); + ext4_exit_es(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); +MODULE_LICENSE("GPL"); +module_init(ext4_init_fs) +module_exit(ext4_exit_fs) diff --git a/kernel/fs/ext4/symlink.c b/kernel/fs/ext4/symlink.c new file mode 100644 index 000000000..187b78920 --- /dev/null +++ b/kernel/fs/ext4/symlink.c @@ -0,0 +1,145 @@ +/* + * linux/fs/ext4/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 symlink handling code + */ + +#include +#include +#include "ext4.h" +#include "xattr.h" + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct page *cpage = NULL; + char *caddr, *paddr = NULL; + struct ext4_str cstr, pstr; + struct inode *inode = d_inode(dentry); + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_encrypted_symlink_data *sd; + loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); + int res; + u32 plen, max_size = inode->i_sb->s_blocksize; + + if (!ext4_encrypted_inode(inode)) + return page_follow_link_light(dentry, nd); + + ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize); + if (IS_ERR(ctx)) + return ctx; + + if (ext4_inode_is_fast_symlink(inode)) { + caddr = (char *) EXT4_I(inode)->i_data; + max_size = sizeof(EXT4_I(inode)->i_data); + } else { + cpage = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(cpage)) { + ext4_put_fname_crypto_ctx(&ctx); + return cpage; + } + caddr = kmap(cpage); + caddr[size] = 0; + } + + /* Symlink is encrypted */ + sd = (struct ext4_encrypted_symlink_data *)caddr; + cstr.name = sd->encrypted_path; + cstr.len = le32_to_cpu(sd->len); + if ((cstr.len + + sizeof(struct ext4_encrypted_symlink_data) - 1) > + max_size) { + /* Symlink data on the disk is corrupted */ + res = -EIO; + goto errout; + } + plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ? + EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len; + paddr = kmalloc(plen + 1, GFP_NOFS); + if (!paddr) { + res = -ENOMEM; + goto errout; + } + pstr.name = paddr; + res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); + if (res < 0) + goto errout; + /* Null-terminate the name */ + if (res <= plen) + paddr[res] = '\0'; + nd_set_link(nd, paddr); + ext4_put_fname_crypto_ctx(&ctx); + if (cpage) { + kunmap(cpage); + page_cache_release(cpage); + } + return NULL; +errout: + ext4_put_fname_crypto_ctx(&ctx); + if (cpage) { + kunmap(cpage); + page_cache_release(cpage); + } + kfree(paddr); + return ERR_PTR(res); +} + +static void ext4_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct page *page = cookie; + + if (!page) { + kfree(nd_get_link(nd)); + } else { + kunmap(page); + page_cache_release(page); + } +} +#endif + +static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext4_inode_info *ei = EXT4_I(d_inode(dentry)); + nd_set_link(nd, (char *) ei->i_data); + return NULL; +} + +const struct inode_operations ext4_symlink_inode_operations = { + .readlink = generic_readlink, +#ifdef CONFIG_EXT4_FS_ENCRYPTION + .follow_link = ext4_follow_link, + .put_link = ext4_put_link, +#else + .follow_link = page_follow_link_light, + .put_link = page_put_link, +#endif + .setattr = ext4_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +}; + +const struct inode_operations ext4_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext4_follow_fast_link, + .setattr = ext4_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +}; diff --git a/kernel/fs/ext4/truncate.h b/kernel/fs/ext4/truncate.h new file mode 100644 index 000000000..011ba6670 --- /dev/null +++ b/kernel/fs/ext4/truncate.h @@ -0,0 +1,43 @@ +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + diff --git a/kernel/fs/ext4/xattr.c b/kernel/fs/ext4/xattr.c new file mode 100644 index 000000000..16e28c08d --- /dev/null +++ b/kernel/fs/ext4/xattr.c @@ -0,0 +1,1727 @@ +/* + * linux/fs/ext4/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . + * Ext4 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. + */ + +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +#ifdef EXT4_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); +static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, + struct mb_cache_entry **); +static void ext4_xattr_rehash(struct ext4_xattr_header *, + struct ext4_xattr_entry *); +static int ext4_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); + +static const struct xattr_handler *ext4_xattr_handler_map[] = { + [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, +#endif + [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_SECURITY + [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext4_xattr_handlers[] = { + &ext4_xattr_user_handler, + &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif +#ifdef CONFIG_EXT4_FS_SECURITY + &ext4_xattr_security_handler, +#endif + NULL +}; + +#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_mb_cache) + +static __le32 ext4_xattr_block_csum(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 save_csum; + __le64 dsk_block_nr = cpu_to_le64(block_nr); + + save_csum = hdr->h_checksum; + hdr->h_checksum = 0; + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, + EXT4_BLOCK_SIZE(inode->i_sb)); + + hdr->h_checksum = save_csum; + return cpu_to_le32(csum); +} + +static int ext4_xattr_block_csum_verify(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (ext4_has_metadata_csum(inode->i_sb) && + (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) + return 0; + return 1; +} + +static void ext4_xattr_block_csum_set(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (!ext4_has_metadata_csum(inode->i_sb)) + return; + + hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); +} + +static inline int ext4_handle_dirty_xattr_block(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh)); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +static inline const struct xattr_handler * +ext4_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) + handler = ext4_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * d_inode(dentry)->i_mutex: don't care + */ +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext4_xattr_list(dentry, buffer, size); +} + +static int +ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, + void *value_start) +{ + struct ext4_xattr_entry *e = entry; + + while (!IS_LAST_ENTRY(e)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); + if ((void *)next >= end) + return -EIO; + e = next; + } + + while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_size != 0 && + (value_start + le16_to_cpu(entry->e_value_offs) < + (void *)e + sizeof(__u32) || + value_start + le16_to_cpu(entry->e_value_offs) + + le32_to_cpu(entry->e_value_size) > end)) + return -EIO; + entry = EXT4_XATTR_NEXT(entry); + } + + return 0; +} + +static inline int +ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) +{ + int error; + + if (buffer_verified(bh)) + return 0; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) + return -EIO; + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data); + if (!error) + set_buffer_verified(bh); + return error; +} + +static inline int +ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 0; +} + +static int +ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext4_xattr_entry *entry; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext4_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext4_xattr_entry *entry; + size_t size; + int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(inode, bh)) { +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(ext4_mb_cache, bh); + entry = BFIRST(bh); + error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +int +ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + size_t size; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return -ENODATA; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(entry, end, entry); + if (error) + goto cleanup; + error = ext4_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + if (strlen(name) > 255) + return -ERANGE; + + down_read(&EXT4_I(inode)->xattr_sem); + error = ext4_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext4_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + +static int +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext4_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct buffer_head *bh = NULL; + int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(inode, bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(ext4_mb_cache, bh); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return 0; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); + if (error) + goto cleanup; + error = ext4_xattr_list_entries(dentry, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +static int +ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int ret, ret2; + + down_read(&EXT4_I(d_inode(dentry))->xattr_sem); + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; + } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: + up_read(&EXT4_I(d_inode(dentry))->xattr_sem); + return ret; +} + +/* + * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext4_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) + return; + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); + ext4_handle_dirty_super(handle, sb); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. + */ +static void +ext4_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); + BUFFER_TRACE(bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, bh); + if (error) + goto out; + + lock_buffer(bh); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + get_bh(bh); + unlock_buffer(bh); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); + unlock_buffer(bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + } +out: + ext4_std_error(inode->i_sb, error); + return; +} + +/* + * Find the available free space for EAs. This also returns the total number of + * bytes used by EA entries. + */ +static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + size_t *min_offs, void *base, int *total) +{ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; + } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); + } + return (*min_offs - ((void *)last - base) - sizeof(__u32)); +} + +static int +ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +{ + struct ext4_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT4_XATTR_SIZE(size); + } + free += EXT4_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT4_XATTR_LEN(name_len) + + EXT4_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. */ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT4_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT4_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT4_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear the pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } + } + } + return 0; +} + +struct ext4_xattr_block_find { + struct ext4_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT4_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext4_xattr_check_block(inode, bs->bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext4_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext4_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext4_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + +#define header(x) ((struct ext4_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + BUFFER_TRACE(bs->bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), + s->here); + ext4_xattr_cache_insert(ext4_mb_cache, + bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext4_handle_dirty_xattr_block(handle, + inode, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext4_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. */ + error = dquot_alloc_block(inode, + EXT4_C2B(EXT4_SB(sb), 1)); + if (error) + goto cleanup; + BUFFER_TRACE(new_bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext4_fsblk_t goal, block; + + goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + block = ext4_new_meta_blocks(handle, inode, goal, 0, + NULL, &error); + if (error) + goto cleanup; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + + ea_idebug(inode, "creating block %llu", + (unsigned long long)block); + + new_bh = sb_getblk(sb, block); + if (unlikely(!new_bh)) { + error = -ENOMEM; +getblk_failed: + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA); + goto cleanup; + } + lock_buffer(new_bh); + error = ext4_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + error = -EIO; + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext4_xattr_cache_insert(ext4_mb_cache, new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext4_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); + goto cleanup; + +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext4_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + error = ext4_xattr_check_names(IFIRST(header), is->s.end, + IFIRST(header)); + if (error) + return error; + /* Find the named attribute. */ + error = ext4_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) { + if (error == -ENOSPC && + ext4_has_inline_data(inode)) { + error = ext4_try_to_evict_inline_data(handle, inode, + EXT4_XATTR_LEN(strlen(i->name) + + EXT4_XATTR_SIZE(i->value_len))); + if (error) + return error; + error = ext4_xattr_ibody_find(inode, i, is); + if (error) + return error; + error = ext4_xattr_set_entry(i, s); + } + if (error) + return error; + } + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +/* + * ext4_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext4_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + unsigned long no_expand; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT4_I(inode)->xattr_sem); + no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + + error = ext4_reserve_inode_write(handle, inode, &is.iloc); + if (error) + goto cleanup; + + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { + struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + } + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + if (!value) { + if (!is.s.not_found) + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = ext4_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = ext4_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext4_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = ext4_current_time(inode); + if (!value) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext4_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + if (no_expand == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + +/* + * ext4_xattr_set() + * + * Like ext4_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + int credits = ext4_jbd2_credits_xattr(inode); + +retry: + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext4_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * Shift the EA entries in the inode to create space for the increased + * i_extra_isize. + */ +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, + int value_offs_shift, void *to, + void *from, size_t n, int blocksize) +{ + struct ext4_xattr_entry *last = entry; + int new_offs; + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + BUG_ON(new_offs + le32_to_cpu(last->e_value_size) + > blocksize); + last->e_value_offs = cpu_to_le16(new_offs); + } + } + /* Shift the entries by n bytes */ + memmove(to, from, n); +} + +/* + * Expand an inode by new_extra_isize bytes when EAs are present. + * Returns 0 on success or negative error number on failure. + */ +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry, *last, *first; + struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t min_offs, free; + int total_ino; + void *base, *start, *end; + int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + + down_write(&EXT4_I(inode)->xattr_sem); +retry: + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + } + + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* + * Check if enough free space is available in the inode to shift the + * entries ahead by new_extra_isize. + */ + + base = start = entry; + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + min_offs = end - base; + last = entry; + total_ino = sizeof(struct ext4_xattr_ibody_header); + + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + if (free >= new_extra_isize) { + entry = IFIRST(header); + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino, + inode->i_sb->s_blocksize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + error = 0; + goto cleanup; + } + + /* + * Enough free space isn't available in the inode, check if + * EA block can hold new_extra_isize bytes. + */ + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + if (ext4_xattr_check_block(inode, bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + base = BHDR(bh); + first = BFIRST(bh); + end = bh->b_data + bh->b_size; + min_offs = end - base; + free = ext4_xattr_free_space(first, &min_offs, base, NULL); + if (free < new_extra_isize) { + if (!tried_min_extra_isize && s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; + } + error = -1; + goto cleanup; + } + } else { + free = inode->i_sb->s_blocksize; + } + + while (new_extra_isize > 0) { + size_t offs, size, entry_size; + struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + }; + unsigned int total_size; /* EA entry size + value size */ + unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ + unsigned int min_total_size = ~0U; + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + if (!is || !bs) { + error = -ENOMEM; + goto cleanup; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + entry = NULL; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= free && total_size < min_total_size) { + if (total_size < new_extra_isize) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry) { + entry = small_entry; + } else { + if (!tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + kfree(is); is = NULL; + kfree(bs); bs = NULL; + brelse(bh); + goto retry; + } + error = -1; + goto cleanup; + } + } + offs = le16_to_cpu(entry->e_value_offs); + size = le32_to_cpu(entry->e_value_size); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + i.name_index = entry->e_name_index, + buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!buffer || !b_entry_name) { + error = -ENOMEM; + goto cleanup; + } + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + offs, + EXT4_XATTR_SIZE(size)); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto cleanup; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto cleanup; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto cleanup; + + entry = IFIRST(header); + if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) + shift_bytes = new_extra_isize; + else + shift_bytes = entry_size + size; + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - + shift_bytes, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, + (void *)header, total_ino - entry_size, + inode->i_sb->s_blocksize); + + extra_isize += shift_bytes; + new_extra_isize -= shift_bytes; + EXT4_I(inode)->i_extra_isize = extra_isize; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto cleanup; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto cleanup; + kfree(b_entry_name); + kfree(buffer); + b_entry_name = NULL; + buffer = NULL; + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + } + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + +cleanup: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + + + +/* + * ext4_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + ext4_xattr_release_block(handle, inode, bh); + EXT4_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext4_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext4_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext4_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext4_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. + */ +static int +ext4_xattr_cmp(struct ext4_xattr_header *header1, + struct ext4_xattr_header *header2) +{ + struct ext4_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT4_XATTR_NEXT(entry1); + entry2 = EXT4_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext4_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT4_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT4_XATTR_REFCOUNT_MAX); + } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext4_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n = 0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext4_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext4_xattr_rehash(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_entry *here; + __u32 hash = 0; + + ext4_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT4_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +#define HASH_BUCKET_BITS 10 + +struct mb_cache * +ext4_xattr_create_cache(char *name) +{ + return mb_cache_create(name, HASH_BUCKET_BITS); +} + +void ext4_xattr_destroy_cache(struct mb_cache *cache) +{ + if (cache) + mb_cache_destroy(cache); +} + diff --git a/kernel/fs/ext4/xattr.h b/kernel/fs/ext4/xattr.h new file mode 100644 index 000000000..ddc095776 --- /dev/null +++ b/kernel/fs/ext4/xattr.h @@ -0,0 +1,139 @@ +/* + File: fs/ext4/xattr.h + + On-disk format of extended attributes for the ext4 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +/* Magic value in attribute blocks */ +#define EXT4_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT4_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT4_XATTR_INDEX_USER 1 +#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT4_XATTR_INDEX_TRUSTED 4 +#define EXT4_XATTR_INDEX_LUSTRE 5 +#define EXT4_XATTR_INDEX_SECURITY 6 +#define EXT4_XATTR_INDEX_SYSTEM 7 +#define EXT4_XATTR_INDEX_RICHACL 8 +#define EXT4_XATTR_INDEX_ENCRYPTION 9 + +struct ext4_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ + /* id = inum if refcount=1, blknum otherwise */ + __u32 h_reserved[3]; /* zero right now */ +}; + +struct ext4_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT4_XATTR_PAD_BITS 2 +#define EXT4_XATTR_PAD (1<e_name_len))) +#define EXT4_XATTR_SIZE(size) \ + (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) + +#define IHDR(inode, raw_inode) \ + ((struct ext4_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT4_GOOD_OLD_INODE_SIZE + \ + EXT4_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define EXT4_ZERO_XATTR_VALUE ((void *)-1) + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_security_handler; + +#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" + +extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); + +extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern void ext4_xattr_put_super(struct super_block *); + +extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); + +extern const struct xattr_handler *ext4_xattr_handlers[]; + +extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, + const char *name, + void *buffer, size_t buffer_size); +extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); + +extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern void ext4_xattr_destroy_cache(struct mb_cache *); + +#ifdef CONFIG_EXT4_FS_SECURITY +extern int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr); +#else +static inline int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/kernel/fs/ext4/xattr_security.c b/kernel/fs/ext4/xattr_security.c new file mode 100644 index 000000000..95d90e056 --- /dev/null +++ b/kernel/fs/ext4/xattr_security.c @@ -0,0 +1,82 @@ +/* + * linux/fs/ext4/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int +ext4_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; + } + return err; +} + +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext4_initxattrs, handle); +} + +const struct xattr_handler ext4_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext4_xattr_security_list, + .get = ext4_xattr_security_get, + .set = ext4_xattr_security_set, +}; diff --git a/kernel/fs/ext4/xattr_trusted.c b/kernel/fs/ext4/xattr_trusted.c new file mode 100644 index 000000000..891ee2ddf --- /dev/null +++ b/kernel/fs/ext4/xattr_trusted.c @@ -0,0 +1,58 @@ +/* + * linux/fs/ext4/xattr_trusted.c + * Handler for trusted extended attributes. + * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int +ext4_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext4_xattr_trusted_list, + .get = ext4_xattr_trusted_get, + .set = ext4_xattr_trusted_set, +}; diff --git a/kernel/fs/ext4/xattr_user.c b/kernel/fs/ext4/xattr_user.c new file mode 100644 index 000000000..6ed932b3c --- /dev/null +++ b/kernel/fs/ext4/xattr_user.c @@ -0,0 +1,61 @@ +/* + * linux/fs/ext4/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext4_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext4_xattr_user_list, + .get = ext4_xattr_user_get, + .set = ext4_xattr_user_set, +}; diff --git a/kernel/fs/f2fs/Kconfig b/kernel/fs/f2fs/Kconfig new file mode 100644 index 000000000..05f0f663f --- /dev/null +++ b/kernel/fs/f2fs/Kconfig @@ -0,0 +1,83 @@ +config F2FS_FS + tristate "F2FS filesystem support" + depends on BLOCK + help + F2FS is based on Log-structured File System (LFS), which supports + versatile "flash-friendly" features. The design has been focused on + addressing the fundamental issues in LFS, which are snowball effect + of wandering tree and high cleaning overhead. + + Since flash-based storages show different characteristics according to + the internal geometry or flash memory management schemes aka FTL, F2FS + and tools support various parameters not only for configuring on-disk + layout, but also for selecting allocation and cleaning algorithms. + + If unsure, say N. + +config F2FS_STAT_FS + bool "F2FS Status Information" + depends on F2FS_FS && DEBUG_FS + default y + help + /sys/kernel/debug/f2fs/ contains information about all the partitions + mounted as f2fs. Each file shows the whole f2fs information. + + /sys/kernel/debug/f2fs/status includes: + - major filesystem information managed by f2fs currently + - average SIT information about whole segments + - current memory footprint consumed by f2fs. + +config F2FS_FS_XATTR + bool "F2FS extended attributes" + depends on F2FS_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config F2FS_FS_POSIX_ACL + bool "F2FS Access Control Lists" + depends on F2FS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + gourps beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config F2FS_FS_SECURITY + bool "F2FS Security Labels" + depends on F2FS_FS_XATTR + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the f2fs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. + +config F2FS_CHECK_FS + bool "F2FS consistency checking feature" + depends on F2FS_FS + help + Enables BUG_ONs which check the filesystem consistency in runtime. + + If you want to improve the performance, say N. + +config F2FS_IO_TRACE + bool "F2FS IO tracer" + depends on F2FS_FS + depends on FUNCTION_TRACER + help + F2FS IO trace is based on a function trace, which gathers process + information and block IO patterns in the filesystem level. + + If unsure, say N. diff --git a/kernel/fs/f2fs/Makefile b/kernel/fs/f2fs/Makefile new file mode 100644 index 000000000..d92397731 --- /dev/null +++ b/kernel/fs/f2fs/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_F2FS_FS) += f2fs.o + +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o +f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o +f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o +f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o +f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o diff --git a/kernel/fs/f2fs/acl.c b/kernel/fs/f2fs/acl.c new file mode 100644 index 000000000..4320ffab3 --- /dev/null +++ b/kernel/fs/f2fs/acl.c @@ -0,0 +1,410 @@ +/* + * fs/f2fs/acl.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include "f2fs.h" +#include "xattr.h" +#include "acl.h" + +static inline size_t f2fs_acl_size(int count) +{ + if (count <= 4) { + return sizeof(struct f2fs_acl_header) + + count * sizeof(struct f2fs_acl_entry_short); + } else { + return sizeof(struct f2fs_acl_header) + + 4 * sizeof(struct f2fs_acl_entry_short) + + (count - 4) * sizeof(struct f2fs_acl_entry); + } +} + +static inline int f2fs_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(struct f2fs_acl_header); + s = size - 4 * sizeof(struct f2fs_acl_entry_short); + if (s < 0) { + if (size % sizeof(struct f2fs_acl_entry_short)) + return -1; + return size / sizeof(struct f2fs_acl_entry_short); + } else { + if (s % sizeof(struct f2fs_acl_entry)) + return -1; + return s / sizeof(struct f2fs_acl_entry) + 4; + } +} + +static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) +{ + int i, count; + struct posix_acl *acl; + struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value; + struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); + const char *end = value + size; + + if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + + count = f2fs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + + if ((char *)entry > end) + goto fail; + + acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[i].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry_short)); + break; + + case ACL_USER: + acl->a_entries[i].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_GROUP: + acl->a_entries[i].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + default: + goto fail; + } + } + if ((char *)entry != end) + goto fail; + return acl; +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + struct f2fs_acl_header *f2fs_acl; + struct f2fs_acl_entry *entry; + int i; + + f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * + sizeof(struct f2fs_acl_entry), GFP_NOFS); + if (!f2fs_acl) + return ERR_PTR(-ENOMEM); + + f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION); + entry = (struct f2fs_acl_entry *)(f2fs_acl + 1); + + for (i = 0; i < acl->a_count; i++) { + + entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm); + + switch (acl->a_entries[i].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, + acl->a_entries[i].e_uid)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, + acl->a_entries[i].e_gid)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry_short)); + break; + default: + goto fail; + } + } + *size = f2fs_acl_size(acl->a_count); + return (void *)f2fs_acl; + +fail: + kfree(f2fs_acl); + return ERR_PTR(-EINVAL); +} + +static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, + struct page *dpage) +{ + int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; + void *value = NULL; + struct posix_acl *acl; + int retval; + + if (type == ACL_TYPE_ACCESS) + name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + + retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); + if (retval > 0) { + value = kmalloc(retval, GFP_F2FS_ZERO); + if (!value) + return ERR_PTR(-ENOMEM); + retval = f2fs_getxattr(inode, name_index, "", value, + retval, dpage); + } + + if (retval > 0) + acl = f2fs_acl_from_disk(value, retval); + else if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +{ + return __f2fs_get_acl(inode, type, NULL); +} + +static int __f2fs_set_acl(struct inode *inode, int type, + struct posix_acl *acl, struct page *ipage) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return error; + set_acl_inode(fi, inode->i_mode); + if (error == 0) + acl = NULL; + } + break; + + case ACL_TYPE_DEFAULT: + name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + value = f2fs_acl_to_disk(acl, &size); + if (IS_ERR(value)) { + clear_inode_flag(fi, FI_ACL_MODE); + return (int)PTR_ERR(value); + } + } + + error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + clear_inode_flag(fi, FI_ACL_MODE); + return error; +} + +int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + return __f2fs_set_acl(inode, type, acl, NULL); +} + +/* + * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create + * are copied from posix_acl.c + */ +static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, + gfp_t flags) +{ + struct posix_acl *clone = NULL; + + if (acl) { + int size = sizeof(struct posix_acl) + acl->a_count * + sizeof(struct posix_acl_entry); + clone = kmemdup(acl, size, flags); + if (clone) + atomic_set(&clone->a_refcount, 1); + } + return clone; +} + +static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) +{ + struct posix_acl_entry *pa, *pe; + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + umode_t mode = *mode_p; + int not_equiv = 0; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm &= (mode >> 6) | ~S_IRWXO; + mode &= (pa->e_perm << 6) | ~S_IRWXU; + break; + + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm &= mode | ~S_IRWXO; + mode &= pa->e_perm | ~S_IRWXO; + break; + + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (group_obj->e_perm << 3) | ~S_IRWXG; + } + + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +static int f2fs_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl, + struct page *dpage) +{ + struct posix_acl *p; + int ret; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + goto no_acl; + + p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage); + if (IS_ERR(p)) { + if (p == ERR_PTR(-EOPNOTSUPP)) + goto apply_umask; + return PTR_ERR(p); + } + + if (!p) + goto apply_umask; + + *acl = f2fs_acl_clone(p, GFP_NOFS); + if (!*acl) + goto no_mem; + + ret = f2fs_acl_create_masq(*acl, mode); + if (ret < 0) + goto no_mem_clone; + + if (ret == 0) { + posix_acl_release(*acl); + *acl = NULL; + } + + if (!S_ISDIR(*mode)) { + posix_acl_release(p); + *default_acl = NULL; + } else { + *default_acl = p; + } + return 0; + +apply_umask: + *mode &= ~current_umask(); +no_acl: + *default_acl = NULL; + *acl = NULL; + return 0; + +no_mem_clone: + posix_acl_release(*acl); +no_mem: + posix_acl_release(p); + return -ENOMEM; +} + +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, + struct page *dpage) +{ + struct posix_acl *default_acl = NULL, *acl = NULL; + int error = 0; + + error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); + if (error) + return error; + + if (default_acl) { + error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + ipage); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + ipage); + posix_acl_release(acl); + } + + return error; +} diff --git a/kernel/fs/f2fs/acl.h b/kernel/fs/f2fs/acl.h new file mode 100644 index 000000000..997ca8edb --- /dev/null +++ b/kernel/fs/f2fs/acl.h @@ -0,0 +1,54 @@ +/* + * fs/f2fs/acl.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/acl.h + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_ACL_H__ +#define __F2FS_ACL_H__ + +#include + +#define F2FS_ACL_VERSION 0x0001 + +struct f2fs_acl_entry { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +}; + +struct f2fs_acl_entry_short { + __le16 e_tag; + __le16 e_perm; +}; + +struct f2fs_acl_header { + __le32 a_version; +}; + +#ifdef CONFIG_F2FS_FS_POSIX_ACL + +extern struct posix_acl *f2fs_get_acl(struct inode *, int); +extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, + struct page *); +#else +#define f2fs_check_acl NULL +#define f2fs_get_acl NULL +#define f2fs_set_acl NULL + +static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, + struct page *ipage, struct page *dpage) +{ + return 0; +} +#endif +#endif /* __F2FS_ACL_H__ */ diff --git a/kernel/fs/f2fs/checkpoint.c b/kernel/fs/f2fs/checkpoint.c new file mode 100644 index 000000000..a5e17a2a0 --- /dev/null +++ b/kernel/fs/f2fs/checkpoint.c @@ -0,0 +1,1135 @@ +/* + * fs/f2fs/checkpoint.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "trace.h" +#include + +static struct kmem_cache *ino_entry_slab; +struct kmem_cache *inode_entry_slab; + +/* + * We guarantee no failure on the returned page. + */ +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct address_space *mapping = META_MAPPING(sbi); + struct page *page = NULL; +repeat: + page = grab_cache_page(mapping, index); + if (!page) { + cond_resched(); + goto repeat; + } + f2fs_wait_on_page_writeback(page, META); + SetPageUptodate(page); + return page; +} + +/* + * We guarantee no failure on the returned page. + */ +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct address_space *mapping = META_MAPPING(sbi); + struct page *page; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO, + .blk_addr = index, + }; +repeat: + page = grab_cache_page(mapping, index); + if (!page) { + cond_resched(); + goto repeat; + } + if (PageUptodate(page)) + goto out; + + if (f2fs_submit_page_bio(sbi, page, &fio)) + goto repeat; + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } +out: + return page; +} + +static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + switch (type) { + case META_NAT: + break; + case META_SIT: + if (unlikely(blkaddr >= SIT_BLK_CNT(sbi))) + return false; + break; + case META_SSA: + if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) || + blkaddr < SM_I(sbi)->ssa_blkaddr)) + return false; + break; + case META_CP: + if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr || + blkaddr < __start_cp_addr(sbi))) + return false; + break; + case META_POR: + if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || + blkaddr < MAIN_BLKADDR(sbi))) + return false; + break; + default: + BUG(); + } + + return true; +} + +/* + * Readahead CP/NAT/SIT/SSA pages + */ +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type) +{ + block_t prev_blk_addr = 0; + struct page *page; + block_t blkno = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; nrpages-- > 0; blkno++) { + + if (!is_valid_blkaddr(sbi, blkno, type)) + goto out; + + switch (type) { + case META_NAT: + if (unlikely(blkno >= + NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) + blkno = 0; + /* get nat block addr */ + fio.blk_addr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + /* get sit block addr */ + fio.blk_addr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + if (blkno != start && prev_blk_addr + 1 != fio.blk_addr) + goto out; + prev_blk_addr = fio.blk_addr; + break; + case META_SSA: + case META_CP: + case META_POR: + fio.blk_addr = blkno; + break; + default: + BUG(); + } + + page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); + if (!page) + continue; + if (PageUptodate(page)) { + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, &fio); + f2fs_put_page(page, 0); + } +out: + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct page *page; + bool readahead = false; + + page = find_get_page(META_MAPPING(sbi), index); + if (!page || (page && !PageUptodate(page))) + readahead = true; + f2fs_put_page(page, 0); + + if (readahead) + ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); +} + +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + + trace_f2fs_writepage(page, META); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + f2fs_wait_on_page_writeback(page, META); + write_meta_page(sbi, page); + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + + if (wbc->for_reclaim) + f2fs_submit_merged_bio(sbi, META, WRITE); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_meta_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + long diff, written; + + trace_f2fs_writepages(mapping->host, wbc, META); + + /* collect a number of dirty meta pages and write together */ + if (wbc->for_kupdate || + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + goto skip_write; + + /* if mounting is failed, skip writing node pages */ + mutex_lock(&sbi->cp_mutex); + diff = nr_pages_to_write(sbi, META, wbc); + written = sync_meta_pages(sbi, META, wbc->nr_to_write); + mutex_unlock(&sbi->cp_mutex); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + return 0; +} + +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write) +{ + struct address_space *mapping = META_MAPPING(sbi); + pgoff_t index = 0, end = LONG_MAX; + struct pagevec pvec; + long nwritten = 0; + struct writeback_control wbc = { + .for_reclaim = 0, + }; + + pagevec_init(&pvec, 0); + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (unlikely(nr_pages == 0)) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + if (mapping->a_ops->writepage(page, &wbc)) { + unlock_page(page); + break; + } + nwritten++; + if (unlikely(nwritten >= nr_to_write)) + break; + } + pagevec_release(&pvec); + cond_resched(); + } + + if (nwritten) + f2fs_submit_merged_bio(sbi, type, WRITE); + + return nwritten; +} + +static int f2fs_set_meta_page_dirty(struct page *page) +{ + trace_f2fs_set_page_dirty(page, META); + + SetPageUptodate(page); + if (!PageDirty(page)) { + __set_page_dirty_nobuffers(page); + inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); + SetPagePrivate(page); + f2fs_trace_pid(page); + return 1; + } + return 0; +} + +const struct address_space_operations f2fs_meta_aops = { + .writepage = f2fs_write_meta_page, + .writepages = f2fs_write_meta_pages, + .set_page_dirty = f2fs_set_meta_page_dirty, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, +}; + +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; +retry: + if (radix_tree_preload(GFP_NOFS)) { + cond_resched(); + goto retry; + } + + spin_lock(&im->ino_lock); + + e = radix_tree_lookup(&im->ino_root, ino); + if (!e) { + e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); + if (!e) { + spin_unlock(&im->ino_lock); + radix_tree_preload_end(); + goto retry; + } + if (radix_tree_insert(&im->ino_root, ino, e)) { + spin_unlock(&im->ino_lock); + kmem_cache_free(ino_entry_slab, e); + radix_tree_preload_end(); + goto retry; + } + memset(e, 0, sizeof(struct ino_entry)); + e->ino = ino; + + list_add_tail(&e->list, &im->ino_list); + if (type != ORPHAN_INO) + im->ino_num++; + } + spin_unlock(&im->ino_lock); + radix_tree_preload_end(); +} + +static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e) { + list_del(&e->list); + radix_tree_delete(&im->ino_root, ino); + im->ino_num--; + spin_unlock(&im->ino_lock); + kmem_cache_free(ino_entry_slab, e); + return; + } + spin_unlock(&im->ino_lock); +} + +void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* add new dirty ino entry into list */ + __add_ino_entry(sbi, ino, type); +} + +void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* remove dirty ino entry from list */ + __remove_ino_entry(sbi, ino, type); +} + +/* mode should be APPEND_INO or UPDATE_INO */ +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +{ + struct inode_management *im = &sbi->im[mode]; + struct ino_entry *e; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + spin_unlock(&im->ino_lock); + return e ? true : false; +} + +void release_dirty_inode(struct f2fs_sb_info *sbi) +{ + struct ino_entry *e, *tmp; + int i; + + for (i = APPEND_INO; i <= UPDATE_INO; i++) { + struct inode_management *im = &sbi->im[i]; + + spin_lock(&im->ino_lock); + list_for_each_entry_safe(e, tmp, &im->ino_list, list) { + list_del(&e->list); + radix_tree_delete(&im->ino_root, e->ino); + kmem_cache_free(ino_entry_slab, e); + im->ino_num--; + } + spin_unlock(&im->ino_lock); + } +} + +int acquire_orphan_inode(struct f2fs_sb_info *sbi) +{ + struct inode_management *im = &sbi->im[ORPHAN_INO]; + int err = 0; + + spin_lock(&im->ino_lock); + if (unlikely(im->ino_num >= sbi->max_orphans)) + err = -ENOSPC; + else + im->ino_num++; + spin_unlock(&im->ino_lock); + + return err; +} + +void release_orphan_inode(struct f2fs_sb_info *sbi) +{ + struct inode_management *im = &sbi->im[ORPHAN_INO]; + + spin_lock(&im->ino_lock); + f2fs_bug_on(sbi, im->ino_num == 0); + im->ino_num--; + spin_unlock(&im->ino_lock); +} + +void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + /* add new orphan ino entry into list */ + __add_ino_entry(sbi, ino, ORPHAN_INO); +} + +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + /* remove orphan entry from orphan list */ + __remove_ino_entry(sbi, ino, ORPHAN_INO); +} + +static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode = f2fs_iget(sbi->sb, ino); + f2fs_bug_on(sbi, IS_ERR(inode)); + clear_nlink(inode); + + /* truncate all the data during iput */ + iput(inode); +} + +void recover_orphan_inodes(struct f2fs_sb_info *sbi) +{ + block_t start_blk, orphan_blocks, i, j; + + if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + return; + + set_sbi_flag(sbi, SBI_POR_DOING); + + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); + orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); + + ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP); + + for (i = 0; i < orphan_blocks; i++) { + struct page *page = get_meta_page(sbi, start_blk + i); + struct f2fs_orphan_block *orphan_blk; + + orphan_blk = (struct f2fs_orphan_block *)page_address(page); + for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { + nid_t ino = le32_to_cpu(orphan_blk->ino[j]); + recover_orphan_inode(sbi, ino); + } + f2fs_put_page(page, 1); + } + /* clear Orphan Flag */ + clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); + clear_sbi_flag(sbi, SBI_POR_DOING); + return; +} + +static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) +{ + struct list_head *head; + struct f2fs_orphan_block *orphan_blk = NULL; + unsigned int nentries = 0; + unsigned short index; + unsigned short orphan_blocks; + struct page *page = NULL; + struct ino_entry *orphan = NULL; + struct inode_management *im = &sbi->im[ORPHAN_INO]; + + orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); + + for (index = 0; index < orphan_blocks; index++) + grab_meta_page(sbi, start_blk + index); + + index = 1; + spin_lock(&im->ino_lock); + head = &im->ino_list; + + /* loop for each orphan inode entry and write them in Jornal block */ + list_for_each_entry(orphan, head, list) { + if (!page) { + page = find_get_page(META_MAPPING(sbi), start_blk++); + f2fs_bug_on(sbi, !page); + orphan_blk = + (struct f2fs_orphan_block *)page_address(page); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + f2fs_put_page(page, 0); + } + + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + + if (nentries == F2FS_ORPHANS_PER_BLOCK) { + /* + * an orphan block is full of 1020 entries, + * then we need to flush current orphan blocks + * and bring another one in memory + */ + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); + index++; + nentries = 0; + page = NULL; + } + } + + if (page) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); + } + + spin_unlock(&im->ino_lock); +} + +static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, + block_t cp_addr, unsigned long long *version) +{ + struct page *cp_page_1, *cp_page_2 = NULL; + unsigned long blk_size = sbi->blocksize; + struct f2fs_checkpoint *cp_block; + unsigned long long cur_version = 0, pre_version = 0; + size_t crc_offset; + __u32 crc = 0; + + /* Read the 1st cp block in this CP pack */ + cp_page_1 = get_meta_page(sbi, cp_addr); + + /* get the version number */ + cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); + crc_offset = le32_to_cpu(cp_block->checksum_offset); + if (crc_offset >= blk_size) + goto invalid_cp1; + + crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); + if (!f2fs_crc_valid(crc, cp_block, crc_offset)) + goto invalid_cp1; + + pre_version = cur_cp_version(cp_block); + + /* Read the 2nd cp block in this CP pack */ + cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + cp_page_2 = get_meta_page(sbi, cp_addr); + + cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); + crc_offset = le32_to_cpu(cp_block->checksum_offset); + if (crc_offset >= blk_size) + goto invalid_cp2; + + crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); + if (!f2fs_crc_valid(crc, cp_block, crc_offset)) + goto invalid_cp2; + + cur_version = cur_cp_version(cp_block); + + if (cur_version == pre_version) { + *version = cur_version; + f2fs_put_page(cp_page_2, 1); + return cp_page_1; + } +invalid_cp2: + f2fs_put_page(cp_page_2, 1); +invalid_cp1: + f2fs_put_page(cp_page_1, 1); + return NULL; +} + +int get_valid_checkpoint(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *cp_block; + struct f2fs_super_block *fsb = sbi->raw_super; + struct page *cp1, *cp2, *cur_page; + unsigned long blk_size = sbi->blocksize; + unsigned long long cp1_version = 0, cp2_version = 0; + unsigned long long cp_start_blk_no; + unsigned int cp_blks = 1 + __cp_payload(sbi); + block_t cp_blk_no; + int i; + + sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); + if (!sbi->ckpt) + return -ENOMEM; + /* + * Finding out valid cp block involves read both + * sets( cp pack1 and cp pack 2) + */ + cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr); + cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); + + /* The second checkpoint pack should start at the next segment */ + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); + cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); + + if (cp1 && cp2) { + if (ver_after(cp2_version, cp1_version)) + cur_page = cp2; + else + cur_page = cp1; + } else if (cp1) { + cur_page = cp1; + } else if (cp2) { + cur_page = cp2; + } else { + goto fail_no_cp; + } + + cp_block = (struct f2fs_checkpoint *)page_address(cur_page); + memcpy(sbi->ckpt, cp_block, blk_size); + + if (cp_blks <= 1) + goto done; + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_page == cp2) + cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; + unsigned char *ckpt = (unsigned char *)sbi->ckpt; + + cur_page = get_meta_page(sbi, cp_blk_no + i); + sit_bitmap_ptr = page_address(cur_page); + memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); + f2fs_put_page(cur_page, 1); + } +done: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); + return 0; + +fail_no_cp: + kfree(sbi->ckpt); + return -EINVAL; +} + +static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) + return -EEXIST; + + set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); + F2FS_I(inode)->dirty_dir = new; + list_add_tail(&new->list, &sbi->dir_inode_list); + stat_inc_dirty_dir(sbi); + return 0; +} + +void update_dirty_page(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode_entry *new; + int ret = 0; + + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) + return; + + if (!S_ISDIR(inode->i_mode)) { + inode_inc_dirty_pages(inode); + goto out; + } + + new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + new->inode = inode; + INIT_LIST_HEAD(&new->list); + + spin_lock(&sbi->dir_inode_lock); + ret = __add_dirty_inode(inode, new); + inode_inc_dirty_pages(inode); + spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); +out: + SetPagePrivate(page); + f2fs_trace_pid(page); +} + +void add_dirty_dir_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode_entry *new = + f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + int ret = 0; + + new->inode = inode; + INIT_LIST_HEAD(&new->list); + + spin_lock(&sbi->dir_inode_lock); + ret = __add_dirty_inode(inode, new); + spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); +} + +void remove_dirty_dir_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode_entry *entry; + + if (!S_ISDIR(inode->i_mode)) + return; + + spin_lock(&sbi->dir_inode_lock); + if (get_dirty_pages(inode) || + !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { + spin_unlock(&sbi->dir_inode_lock); + return; + } + + entry = F2FS_I(inode)->dirty_dir; + list_del(&entry->list); + F2FS_I(inode)->dirty_dir = NULL; + clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); + stat_dec_dirty_dir(sbi); + spin_unlock(&sbi->dir_inode_lock); + kmem_cache_free(inode_entry_slab, entry); + + /* Only from the recovery routine */ + if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { + clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); + iput(inode); + } +} + +void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +{ + struct list_head *head; + struct inode_entry *entry; + struct inode *inode; +retry: + if (unlikely(f2fs_cp_error(sbi))) + return; + + spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; + if (list_empty(head)) { + spin_unlock(&sbi->dir_inode_lock); + return; + } + entry = list_entry(head->next, struct inode_entry, list); + inode = igrab(entry->inode); + spin_unlock(&sbi->dir_inode_lock); + if (inode) { + filemap_fdatawrite(inode->i_mapping); + iput(inode); + } else { + /* + * We should submit bio, since it exists several + * wribacking dentry pages in the freeing inode. + */ + f2fs_submit_merged_bio(sbi, DATA, WRITE); + cond_resched(); + } + goto retry; +} + +/* + * Freeze all the FS-operations for checkpoint. + */ +static int block_operations(struct f2fs_sb_info *sbi) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + struct blk_plug plug; + int err = 0; + + blk_start_plug(&plug); + +retry_flush_dents: + f2fs_lock_all(sbi); + /* write all the dirty dentry pages */ + if (get_pages(sbi, F2FS_DIRTY_DENTS)) { + f2fs_unlock_all(sbi); + sync_dirty_dir_inodes(sbi); + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto out; + } + goto retry_flush_dents; + } + + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. + */ +retry_flush_nodes: + down_write(&sbi->node_write); + + if (get_pages(sbi, F2FS_DIRTY_NODES)) { + up_write(&sbi->node_write); + sync_node_pages(sbi, 0, &wbc); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_unlock_all(sbi); + err = -EIO; + goto out; + } + goto retry_flush_nodes; + } +out: + blk_finish_plug(&plug); + return err; +} + +static void unblock_operations(struct f2fs_sb_info *sbi) +{ + up_write(&sbi->node_write); + f2fs_unlock_all(sbi); +} + +static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); + + if (!get_pages(sbi, F2FS_WRITEBACK)) + break; + + io_schedule(); + } + finish_wait(&sbi->cp_wait, &wait); +} + +static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + nid_t last_nid = nm_i->next_scan_nid; + block_t start_blk; + struct page *cp_page; + unsigned int data_sum_blocks, orphan_blocks; + __u32 crc32 = 0; + void *kaddr; + int i; + int cp_payload_blks = __cp_payload(sbi); + + /* + * This avoids to conduct wrong roll-forward operations and uses + * metapages, so should be called prior to sync_meta_pages below. + */ + discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); + + /* Flush all the NAT/SIT pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + sync_meta_pages(sbi, META, LONG_MAX); + if (unlikely(f2fs_cp_error(sbi))) + return; + } + + next_free_nid(sbi, &last_nid); + + /* + * modify checkpoint + * version number is already updated + */ + ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + ckpt->cur_node_segno[i] = + cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); + ckpt->cur_node_blkoff[i] = + cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE)); + ckpt->alloc_type[i + CURSEG_HOT_NODE] = + curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + ckpt->cur_data_segno[i] = + cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); + ckpt->cur_data_blkoff[i] = + cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA)); + ckpt->alloc_type[i + CURSEG_HOT_DATA] = + curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); + } + + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); + + /* 2 cp + n data seg summary + orphan inode blocks */ + data_sum_blocks = npages_for_summary_flush(sbi, false); + if (data_sum_blocks < NR_CURSEG_DATA_TYPE) + set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + else + clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + + orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); + ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + + orphan_blocks); + + if (__remain_node_summaries(cpc->reason)) + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ + cp_payload_blks + data_sum_blocks + + orphan_blocks + NR_CURSEG_NODE_TYPE); + else + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + + cp_payload_blks + data_sum_blocks + + orphan_blocks); + + if (cpc->reason == CP_UMOUNT) + set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* update SIT/NAT bitmap */ + get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); + get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); + + crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); + *((__le32 *)((unsigned char *)ckpt + + le32_to_cpu(ckpt->checksum_offset))) + = cpu_to_le32(crc32); + + start_blk = __start_cp_addr(sbi); + + /* write out checkpoint buffer at block 0 */ + cp_page = grab_meta_page(sbi, start_blk++); + kaddr = page_address(cp_page); + memcpy(kaddr, ckpt, F2FS_BLKSIZE); + set_page_dirty(cp_page); + f2fs_put_page(cp_page, 1); + + for (i = 1; i < 1 + cp_payload_blks; i++) { + cp_page = grab_meta_page(sbi, start_blk++); + kaddr = page_address(cp_page); + memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE); + set_page_dirty(cp_page); + f2fs_put_page(cp_page, 1); + } + + if (orphan_num) { + write_orphan_inodes(sbi, start_blk); + start_blk += orphan_blocks; + } + + write_data_summaries(sbi, start_blk); + start_blk += data_sum_blocks; + if (__remain_node_summaries(cpc->reason)) { + write_node_summaries(sbi, start_blk); + start_blk += NR_CURSEG_NODE_TYPE; + } + + /* writeout checkpoint block */ + cp_page = grab_meta_page(sbi, start_blk); + kaddr = page_address(cp_page); + memcpy(kaddr, ckpt, F2FS_BLKSIZE); + set_page_dirty(cp_page); + f2fs_put_page(cp_page, 1); + + /* wait for previous submitted node/meta pages writeback */ + wait_on_all_pages_writeback(sbi); + + if (unlikely(f2fs_cp_error(sbi))) + return; + + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); + + /* update user_block_counts */ + sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->alloc_valid_block_count = 0; + + /* Here, we only have one bio having CP pack */ + sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + + /* wait for previous submitted meta pages writeback */ + wait_on_all_pages_writeback(sbi); + + release_dirty_inode(sbi); + + if (unlikely(f2fs_cp_error(sbi))) + return; + + clear_prefree_segments(sbi); + clear_sbi_flag(sbi, SBI_IS_DIRTY); +} + +/* + * We guarantee that this checkpoint procedure will not fail. + */ +void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long long ckpt_ver; + + mutex_lock(&sbi->cp_mutex); + + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && + (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC)) + goto out; + if (unlikely(f2fs_cp_error(sbi))) + goto out; + if (f2fs_readonly(sbi->sb)) + goto out; + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); + + if (block_operations(sbi)) + goto out; + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); + + /* + * update checkpoint pack index + * Increase the version number so that + * SIT entries and seg summaries are written at correct place + */ + ckpt_ver = cur_cp_version(ckpt); + ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); + + /* write cached NAT/SIT entries to NAT/SIT area */ + flush_nat_entries(sbi); + flush_sit_entries(sbi, cpc); + + /* unlock all the fs_lock[] in do_checkpoint() */ + do_checkpoint(sbi, cpc); + + unblock_operations(sbi); + stat_inc_cp_count(sbi->stat_info); + + if (cpc->reason == CP_RECOVERY) + f2fs_msg(sbi->sb, KERN_NOTICE, + "checkpoint: version = %llx", ckpt_ver); +out: + mutex_unlock(&sbi->cp_mutex); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); +} + +void init_ino_entry_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) { + struct inode_management *im = &sbi->im[i]; + + INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC); + spin_lock_init(&im->ino_lock); + INIT_LIST_HEAD(&im->ino_list); + im->ino_num = 0; + } + + sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_TYPE - __cp_payload(sbi)) * + F2FS_ORPHANS_PER_BLOCK; +} + +int __init create_checkpoint_caches(void) +{ + ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", + sizeof(struct ino_entry)); + if (!ino_entry_slab) + return -ENOMEM; + inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", + sizeof(struct inode_entry)); + if (!inode_entry_slab) { + kmem_cache_destroy(ino_entry_slab); + return -ENOMEM; + } + return 0; +} + +void destroy_checkpoint_caches(void) +{ + kmem_cache_destroy(ino_entry_slab); + kmem_cache_destroy(inode_entry_slab); +} diff --git a/kernel/fs/f2fs/data.c b/kernel/fs/f2fs/data.c new file mode 100644 index 000000000..1e1aae669 --- /dev/null +++ b/kernel/fs/f2fs/data.c @@ -0,0 +1,1864 @@ +/* + * fs/f2fs/data.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "trace.h" +#include + +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + +static void f2fs_read_end_io(struct bio *bio, int err) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + bio_put(bio); +} + +static void f2fs_write_end_io(struct bio *bio, int err) +{ + struct f2fs_sb_info *sbi = bio->bi_private; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (unlikely(err)) { + set_page_dirty(page); + set_bit(AS_EIO, &page->mapping->flags); + f2fs_stop_checkpoint(sbi); + } + end_page_writeback(page); + dec_page_count(sbi, F2FS_WRITEBACK); + } + + if (!get_pages(sbi, F2FS_WRITEBACK) && + !list_empty(&sbi->cp_wait.task_list)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +/* + * Low-level block read/write IO operations. + */ +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + int npages, bool is_read) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + + bio->bi_bdev = sbi->sb->s_bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + bio->bi_private = sbi; + + return bio; +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + + if (!io->bio) + return; + + if (is_read_io(fio->rw)) + trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); + else + trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); + + submit_bio(fio->rw, io->bio); + io->bio = NULL; +} + +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + enum page_type type, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io; + + io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + + down_write(&io->io_rwsem); + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + if (test_opt(sbi, NOBARRIER)) + io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; + else + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + } + __submit_merged_bio(io); + up_write(&io->io_rwsem); +} + +/* + * Fill the locked page with data located in the block address. + * Return unlocked page. + */ +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, + struct f2fs_io_info *fio) +{ + struct bio *bio; + + trace_f2fs_submit_page_bio(page, fio); + f2fs_trace_ios(page, fio, 0); + + /* Allocate a new bio */ + bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw)); + + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio_put(bio); + f2fs_put_page(page, 1); + return -EFAULT; + } + + submit_bio(fio->rw, bio); + return 0; +} + +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, + struct f2fs_io_info *fio) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io; + bool is_read = is_read_io(fio->rw); + + io = is_read ? &sbi->read_io : &sbi->write_io[btype]; + + verify_block_addr(sbi, fio->blk_addr); + + down_write(&io->io_rwsem); + + if (!is_read) + inc_page_count(sbi, F2FS_WRITEBACK); + + if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || + io->fio.rw != fio->rw)) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + int bio_blocks = MAX_BIO_BLOCKS(sbi); + + io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); + io->fio = *fio; + } + + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + __submit_merged_bio(io); + goto alloc_new; + } + + io->last_block_in_bio = fio->blk_addr; + f2fs_trace_ios(page, fio, 0); + + up_write(&io->io_rwsem); + trace_f2fs_submit_page_mbio(page, fio); +} + +/* + * Lock ordering for the change of data block address: + * ->data_page + * ->node_page + * update block addresses in the node page + */ +void set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn; + __le32 *addr_array; + struct page *node_page = dn->node_page; + unsigned int ofs_in_node = dn->ofs_in_node; + + f2fs_wait_on_page_writeback(node_page, NODE); + + rn = F2FS_NODE(node_page); + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + set_page_dirty(node_page); +} + +int reserve_new_block(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return -EPERM; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + return -ENOSPC; + + trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + mark_inode_dirty(dn->inode); + sync_inode_page(dn); + return 0; +} + +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + err = get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + +static void f2fs_map_bh(struct super_block *sb, pgoff_t pgofs, + struct extent_info *ei, struct buffer_head *bh_result) +{ + unsigned int blkbits = sb->s_blocksize_bits; + size_t max_size = bh_result->b_size; + size_t mapped_size; + + clear_buffer_new(bh_result); + map_bh(bh_result, sb, ei->blk + pgofs - ei->fofs); + mapped_size = (ei->fofs + ei->len - pgofs) << blkbits; + bh_result->b_size = min(max_size, mapped_size); +} + +static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t start_fofs, end_fofs; + block_t start_blkaddr; + + read_lock(&fi->ext_lock); + if (fi->ext.len == 0) { + read_unlock(&fi->ext_lock); + return false; + } + + stat_inc_total_hit(inode->i_sb); + + start_fofs = fi->ext.fofs; + end_fofs = fi->ext.fofs + fi->ext.len - 1; + start_blkaddr = fi->ext.blk; + + if (pgofs >= start_fofs && pgofs <= end_fofs) { + *ei = fi->ext; + stat_inc_read_hit(inode->i_sb); + read_unlock(&fi->ext_lock); + return true; + } + read_unlock(&fi->ext_lock); + return false; +} + +static bool update_extent_info(struct inode *inode, pgoff_t fofs, + block_t blkaddr) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t start_fofs, end_fofs; + block_t start_blkaddr, end_blkaddr; + int need_update = true; + + write_lock(&fi->ext_lock); + + start_fofs = fi->ext.fofs; + end_fofs = fi->ext.fofs + fi->ext.len - 1; + start_blkaddr = fi->ext.blk; + end_blkaddr = fi->ext.blk + fi->ext.len - 1; + + /* Drop and initialize the matched extent */ + if (fi->ext.len == 1 && fofs == start_fofs) + fi->ext.len = 0; + + /* Initial extent */ + if (fi->ext.len == 0) { + if (blkaddr != NULL_ADDR) { + fi->ext.fofs = fofs; + fi->ext.blk = blkaddr; + fi->ext.len = 1; + } + goto end_update; + } + + /* Front merge */ + if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) { + fi->ext.fofs--; + fi->ext.blk--; + fi->ext.len++; + goto end_update; + } + + /* Back merge */ + if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) { + fi->ext.len++; + goto end_update; + } + + /* Split the existing extent */ + if (fi->ext.len > 1 && + fofs >= start_fofs && fofs <= end_fofs) { + if ((end_fofs - fofs) < (fi->ext.len >> 1)) { + fi->ext.len = fofs - start_fofs; + } else { + fi->ext.fofs = fofs + 1; + fi->ext.blk = start_blkaddr + fofs - start_fofs + 1; + fi->ext.len -= fofs - start_fofs + 1; + } + } else { + need_update = false; + } + + /* Finally, if the extent is very fragmented, let's drop the cache. */ + if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { + fi->ext.len = 0; + set_inode_flag(fi, FI_NO_EXTENT); + need_update = true; + } +end_update: + write_unlock(&fi->ext_lock); + return need_update; +} + +static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p) +{ + struct extent_node *en; + + en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + if (!en) + return NULL; + + en->ei = *ei; + INIT_LIST_HEAD(&en->list); + + rb_link_node(&en->rb_node, parent, p); + rb_insert_color(&en->rb_node, &et->root); + et->count++; + atomic_inc(&sbi->total_ext_node); + return en; +} + +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + rb_erase(&en->rb_node, &et->root); + et->count--; + atomic_dec(&sbi->total_ext_node); + + if (et->cached_en == en) + et->cached_en = NULL; +} + +static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi, + nid_t ino) +{ + struct extent_tree *et; + + down_read(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + up_read(&sbi->extent_tree_lock); + return NULL; + } + atomic_inc(&et->refcount); + up_read(&sbi->extent_tree_lock); + + return et; +} + +static struct extent_tree *__grab_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + nid_t ino = inode->i_ino; + + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->root = RB_ROOT; + et->cached_en = NULL; + rwlock_init(&et->lock); + atomic_set(&et->refcount, 0); + et->count = 0; + sbi->total_ext_tree++; + } + atomic_inc(&et->refcount); + up_write(&sbi->extent_tree_lock); + + return et; +} + +static struct extent_node *__lookup_extent_tree(struct extent_tree *et, + unsigned int fofs) +{ + struct rb_node *node = et->root.rb_node; + struct extent_node *en; + + if (et->cached_en) { + struct extent_info *cei = &et->cached_en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) + return et->cached_en; + } + + while (node) { + en = rb_entry(node, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) { + node = node->rb_left; + } else if (fofs >= en->ei.fofs + en->ei.len) { + node = node->rb_right; + } else { + et->cached_en = en; + return en; + } + } + return NULL; +} + +static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_node *prev; + struct rb_node *node; + + node = rb_prev(&en->rb_node); + if (!node) + return NULL; + + prev = rb_entry(node, struct extent_node, rb_node); + if (__is_back_mergeable(&en->ei, &prev->ei)) { + en->ei.fofs = prev->ei.fofs; + en->ei.blk = prev->ei.blk; + en->ei.len += prev->ei.len; + __detach_extent_node(sbi, et, prev); + return prev; + } + return NULL; +} + +static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_node *next; + struct rb_node *node; + + node = rb_next(&en->rb_node); + if (!node) + return NULL; + + next = rb_entry(node, struct extent_node, rb_node); + if (__is_front_mergeable(&en->ei, &next->ei)) { + en->ei.len += next->ei.len; + __detach_extent_node(sbi, et, next); + return next; + } + return NULL; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node **den) +{ + struct rb_node **p = &et->root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en; + + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) { + if (__is_front_mergeable(ei, &en->ei)) { + f2fs_bug_on(sbi, !den); + en->ei.fofs = ei->fofs; + en->ei.blk = ei->blk; + en->ei.len += ei->len; + *den = __try_back_merge(sbi, et, en); + return en; + } + p = &(*p)->rb_left; + } else if (ei->fofs >= en->ei.fofs + en->ei.len) { + if (__is_back_mergeable(ei, &en->ei)) { + f2fs_bug_on(sbi, !den); + en->ei.len += ei->len; + *den = __try_front_merge(sbi, et, en); + return en; + } + p = &(*p)->rb_right; + } else { + f2fs_bug_on(sbi, 1); + } + } + + return __attach_extent_node(sbi, et, ei, parent, p); +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, bool free_all) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count = et->count; + + node = rb_first(&et->root); + while (node) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + + if (free_all) { + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + } + + if (free_all || list_empty(&en->list)) { + __detach_extent_node(sbi, et, en); + kmem_cache_free(extent_node_slab, en); + } + node = next; + } + + return count - et->count; +} + +static void f2fs_init_extent_tree(struct inode *inode, + struct f2fs_extent *i_ext) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei; + + if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) + return; + + et = __grab_extent_tree(inode); + + write_lock(&et->lock); + if (et->count) + goto out; + + set_extent_info(&ei, le32_to_cpu(i_ext->fofs), + le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + + en = __insert_extent_tree(sbi, et, &ei, NULL); + if (en) { + et->cached_en = en; + + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + } +out: + write_unlock(&et->lock); + atomic_dec(&et->refcount); +} + +static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + + trace_f2fs_lookup_extent_tree_start(inode, pgofs); + + et = __find_extent_tree(sbi, inode->i_ino); + if (!et) + return false; + + read_lock(&et->lock); + en = __lookup_extent_tree(et, pgofs); + if (en) { + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_move_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + stat_inc_read_hit(sbi->sb); + } + stat_inc_total_hit(sbi->sb); + read_unlock(&et->lock); + + trace_f2fs_lookup_extent_tree_end(inode, pgofs, en); + + atomic_dec(&et->refcount); + return en ? true : false; +} + +static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, + block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *den = NULL; + struct extent_info ei, dei; + unsigned int endofs; + + trace_f2fs_update_extent_tree(inode, fofs, blkaddr); + + et = __grab_extent_tree(inode); + + write_lock(&et->lock); + + /* 1. lookup and remove existing extent info in cache */ + en = __lookup_extent_tree(et, fofs); + if (!en) + goto update_extent; + + dei = en->ei; + __detach_extent_node(sbi, et, en); + + /* 2. if extent can be split more, split and insert the left part */ + if (dei.len > 1) { + /* insert left part of split extent into cache */ + if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, dei.fofs, dei.blk, + fofs - dei.fofs); + en1 = __insert_extent_tree(sbi, et, &ei, NULL); + } + + /* insert right part of split extent into cache */ + endofs = dei.fofs + dei.len - 1; + if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, fofs + 1, + fofs - dei.fofs + dei.blk, endofs - fofs); + en2 = __insert_extent_tree(sbi, et, &ei, NULL); + } + } + +update_extent: + /* 3. update extent in extent cache */ + if (blkaddr) { + set_extent_info(&ei, fofs, blkaddr, 1); + en3 = __insert_extent_tree(sbi, et, &ei, &den); + } + + /* 4. update in global extent list */ + spin_lock(&sbi->extent_lock); + if (en && !list_empty(&en->list)) + list_del(&en->list); + /* + * en1 and en2 split from en, they will become more and more smaller + * fragments after splitting several times. So if the length is smaller + * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree. + */ + if (en1) + list_add_tail(&en1->list, &sbi->extent_list); + if (en2) + list_add_tail(&en2->list, &sbi->extent_list); + if (en3) { + if (list_empty(&en3->list)) + list_add_tail(&en3->list, &sbi->extent_list); + else + list_move_tail(&en3->list, &sbi->extent_list); + } + if (den && !list_empty(&den->list)) + list_del(&den->list); + spin_unlock(&sbi->extent_lock); + + /* 5. release extent node */ + if (en) + kmem_cache_free(extent_node_slab, en); + if (den) + kmem_cache_free(extent_node_slab, den); + + write_unlock(&et->lock); + atomic_dec(&et->refcount); +} + +void f2fs_preserve_extent_tree(struct inode *inode) +{ + struct extent_tree *et; + struct extent_info *ext = &F2FS_I(inode)->ext; + bool sync = false; + + if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + return; + + et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino); + if (!et) { + if (ext->len) { + ext->len = 0; + update_inode_page(inode); + } + return; + } + + read_lock(&et->lock); + if (et->count) { + struct extent_node *en; + + if (et->cached_en) { + en = et->cached_en; + } else { + struct rb_node *node = rb_first(&et->root); + + if (!node) + node = rb_last(&et->root); + en = rb_entry(node, struct extent_node, rb_node); + } + + if (__is_extent_same(ext, &en->ei)) + goto out; + + *ext = en->ei; + sync = true; + } else if (ext->len) { + ext->len = 0; + sync = true; + } +out: + read_unlock(&et->lock); + atomic_dec(&et->refcount); + + if (sync) + update_inode_page(inode); +} + +void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_node *en, *tmp; + unsigned long ino = F2FS_ROOT_INO(sbi); + struct radix_tree_iter iter; + void **slot; + unsigned int found; + unsigned int node_cnt = 0, tree_cnt = 0; + + if (!test_opt(sbi, EXTENT_CACHE)) + return; + + if (available_free_memory(sbi, EXTENT_CACHE)) + return; + + spin_lock(&sbi->extent_lock); + list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { + if (!nr_shrink--) + break; + list_del_init(&en->list); + } + spin_unlock(&sbi->extent_lock); + + down_read(&sbi->extent_tree_lock); + while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + atomic_inc(&et->refcount); + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + atomic_dec(&et->refcount); + } + } + up_read(&sbi->extent_tree_lock); + + down_write(&sbi->extent_tree_lock); + radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter, + F2FS_ROOT_INO(sbi)) { + struct extent_tree *et = (struct extent_tree *)*slot; + + if (!atomic_read(&et->refcount) && !et->count) { + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + tree_cnt++; + } + } + up_write(&sbi->extent_tree_lock); + + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + unsigned int node_cnt = 0; + + if (!test_opt(sbi, EXTENT_CACHE)) + return; + + et = __find_extent_tree(sbi, inode->i_ino); + if (!et) + goto out; + + /* free all extent info belong to this extent tree */ + write_lock(&et->lock); + node_cnt = __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + atomic_dec(&et->refcount); + + /* try to find and delete extent tree entry in radix tree */ + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino); + if (!et) { + up_write(&sbi->extent_tree_lock); + goto out; + } + f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + up_write(&sbi->extent_tree_lock); +out: + trace_f2fs_destroy_extent_tree(inode, node_cnt); + return; +} + +void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext) +{ + if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + f2fs_init_extent_tree(inode, i_ext); + + write_lock(&F2FS_I(inode)->ext_lock); + get_extent_info(&F2FS_I(inode)->ext, *i_ext); + write_unlock(&F2FS_I(inode)->ext_lock); +} + +static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + return false; + + if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + return f2fs_lookup_extent_tree(inode, pgofs, ei); + + return lookup_extent_info(inode, pgofs, ei); +} + +void f2fs_update_extent_cache(struct dnode_of_data *dn) +{ + struct f2fs_inode_info *fi = F2FS_I(dn->inode); + pgoff_t fofs; + + f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + + if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE)) + return f2fs_update_extent_tree(dn->inode, fofs, + dn->data_blkaddr); + + if (update_extent_info(dn->inode, fofs, dn->data_blkaddr)) + sync_inode_page(dn); +} + +struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) +{ + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei; + int err; + struct f2fs_io_info fio = { + .type = DATA, + .rw = sync ? READ_SYNC : READA, + }; + + /* + * If sync is false, it needs to check its block allocation. + * This is need and triggered by two flows: + * gc and truncate_partial_data_page. + */ + if (!sync) + goto search; + + page = find_get_page(mapping, index); + if (page && PageUptodate(page)) + return page; + f2fs_put_page(page, 0); +search: + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) + return ERR_PTR(err); + f2fs_put_dnode(&dn); + + if (dn.data_blkaddr == NULL_ADDR) + return ERR_PTR(-ENOENT); + + /* By fallocate(), there is no cached page, but with NEW_ADDR */ + if (unlikely(dn.data_blkaddr == NEW_ADDR)) + return ERR_PTR(-EINVAL); + +got_it: + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); + + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + fio.blk_addr = dn.data_blkaddr; + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); + if (err) + return ERR_PTR(err); + + if (sync) { + wait_on_page_locked(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } + } + return page; +} + +/* + * If it tries to access a hole, return an error. + * Because, the callers, functions in dir.c and GC, should be able to know + * whether this page exists or not. + */ +struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei; + int err; + struct f2fs_io_info fio = { + .type = DATA, + .rw = READ_SYNC, + }; +repeat: + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + f2fs_put_dnode(&dn); + + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + f2fs_put_page(page, 1); + return ERR_PTR(-ENOENT); + } + +got_it: + if (PageUptodate(page)) + return page; + + /* + * A new dentry page is allocated but not able to be written, since its + * new inode page couldn't be allocated due to -ENOSPC. + * In such the case, its blkaddr can be remained as NEW_ADDR. + * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. + */ + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return page; + } + + fio.blk_addr = dn.data_blkaddr; + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); + if (err) + return ERR_PTR(err); + + lock_page(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + return page; +} + +/* + * Caller ensures that this data page is never allocated. + * A new zero-filled data page is allocated in the page cache. + * + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ipage is set only by make_empty_dir. + */ +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, index); + if (err) + return ERR_PTR(err); +repeat: + page = grab_cache_page(mapping, index); + if (!page) { + err = -ENOMEM; + goto put_err; + } + + if (PageUptodate(page)) + return page; + + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else { + struct f2fs_io_info fio = { + .type = DATA, + .rw = READ_SYNC, + .blk_addr = dn.data_blkaddr, + }; + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); + if (err) + goto put_err; + + lock_page(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + err = -EIO; + goto put_err; + } + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + } + + if (new_i_size && + i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { + i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + /* Only the directory inode sets new_i_size */ + set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); + } + return page; + +put_err: + f2fs_put_dnode(&dn); + return ERR_PTR(err); +} + +static int __allocate_data_block(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_inode_info *fi = F2FS_I(dn->inode); + struct f2fs_summary sum; + struct node_info ni; + int seg = CURSEG_WARM_DATA; + pgoff_t fofs; + + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return -EPERM; + + dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + if (dn->data_blkaddr == NEW_ADDR) + goto alloc; + + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + return -ENOSPC; + +alloc: + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + + if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) + seg = CURSEG_DIRECT_IO; + + allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + &sum, seg); + + /* direct IO doesn't use extent cache to maximize the performance */ + set_data_blkaddr(dn); + + /* update i_size */ + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) + i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + + return 0; +} + +static void __allocate_data_blocks(struct inode *inode, loff_t offset, + size_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + u64 start = F2FS_BYTES_TO_BLK(offset); + u64 len = F2FS_BYTES_TO_BLK(count); + bool allocated; + u64 end_offset; + + while (len) { + f2fs_balance_fs(sbi); + f2fs_lock_op(sbi); + + /* When reading holes, we need its node page */ + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&dn, start, ALLOC_NODE)) + goto out; + + allocated = false; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + + while (dn.ofs_in_node < end_offset && len) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { + if (__allocate_data_block(&dn)) + goto sync_out; + allocated = true; + } + len--; + start++; + dn.ofs_in_node++; + } + + if (allocated) + sync_inode_page(&dn); + + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + } + return; + +sync_out: + if (allocated) + sync_inode_page(&dn); + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + return; +} + +/* + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * If original data blocks are allocated, then give them to blockdev. + * Otherwise, + * a. preallocate requested block addresses + * b. do not use extent cache for better performance + * c. give the block addresses to blockdev + */ +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, bool fiemap) +{ + unsigned int blkbits = inode->i_sb->s_blocksize_bits; + unsigned maxblocks = bh_result->b_size >> blkbits; + struct dnode_of_data dn; + int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; + pgoff_t pgofs, end_offset; + int err = 0, ofs = 1; + struct extent_info ei; + bool allocated = false; + + /* Get the page offset from the block offset(iblock) */ + pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); + + if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + f2fs_map_bh(inode->i_sb, pgofs, &ei, bh_result); + goto out; + } + + if (create) + f2fs_lock_op(F2FS_I_SB(inode)); + + /* When reading holes, we need its node page */ + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR && !fiemap) + goto put_out; + + if (dn.data_blkaddr != NULL_ADDR) { + clear_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else { + goto put_out; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + bh_result->b_size = (((size_t)1) << blkbits); + dn.ofs_in_node++; + pgofs++; + +get_next: + if (dn.ofs_in_node >= end_offset) { + if (allocated) + sync_inode_page(&dn); + allocated = false; + f2fs_put_dnode(&dn); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR && !fiemap) + goto put_out; + + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + } + + if (maxblocks > (bh_result->b_size >> blkbits)) { + block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR && create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + set_buffer_new(bh_result); + blkaddr = dn.data_blkaddr; + } + /* Give more consecutive addresses for the readahead */ + if (blkaddr == (bh_result->b_blocknr + ofs)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + bh_result->b_size += (((size_t)1) << blkbits); + goto get_next; + } + } +sync_out: + if (allocated) + sync_inode_page(&dn); +put_out: + f2fs_put_dnode(&dn); +unlock_out: + if (create) + f2fs_unlock_op(F2FS_I_SB(inode)); +out: + trace_f2fs_get_data_block(inode, iblock, bh_result, err); + return err; +} + +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, false); +} + +static int get_data_block_fiemap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, true); +} + +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, + start, len, get_data_block_fiemap); +} + +static int f2fs_read_data_page(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EAGAIN; + + trace_f2fs_readpage(page, DATA); + + /* If the file has inline data, try to read it directly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + if (ret == -EAGAIN) + ret = mpage_readpage(page, get_data_block); + + return ret; +} + +static int f2fs_read_data_pages(struct file *file, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = file->f_mapping->host; + + /* If the file has inline data, skip readpages */ + if (f2fs_has_inline_data(inode)) + return 0; + + return mpage_readpages(mapping, pages, nr_pages, get_data_block); +} + +int do_write_data_page(struct page *page, struct f2fs_io_info *fio) +{ + struct inode *inode = page->mapping->host; + struct dnode_of_data dn; + int err = 0; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) + return err; + + fio->blk_addr = dn.data_blkaddr; + + /* This page is already truncated */ + if (fio->blk_addr == NULL_ADDR) { + ClearPageUptodate(page); + goto out_writepage; + } + + set_page_writeback(page); + + /* + * If current allocation needs SSR, + * it had better in-place writes for updated data. + */ + if (unlikely(fio->blk_addr != NEW_ADDR && + !is_cold_data(page) && + need_inplace_update(inode))) { + rewrite_data_page(page, fio); + set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + trace_f2fs_do_write_data_page(page, IPU); + } else { + write_data_page(page, &dn, fio); + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + } +out_writepage: + f2fs_put_dnode(&dn); + return err; +} + +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = ((unsigned long long) i_size) + >> PAGE_CACHE_SHIFT; + unsigned offset = 0; + bool need_balance_fs = false; + int err = 0; + struct f2fs_io_info fio = { + .type = DATA, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; + + trace_f2fs_writepage(page, DATA); + + if (page->index < end_index) + goto write; + + /* + * If the offset is out-of-range of file size, + * this page does not have to be written to disk. + */ + offset = i_size & (PAGE_CACHE_SIZE - 1); + if ((page->index >= end_index + 1) || !offset) + goto out; + + zero_user_segment(page, offset, PAGE_CACHE_SIZE); +write: + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (f2fs_is_drop_cache(inode)) + goto out; + if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)) + goto redirty_out; + + /* Dentry blocks are controlled by checkpoint */ + if (S_ISDIR(inode->i_mode)) { + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + err = do_write_data_page(page, &fio); + goto done; + } + + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + SetPageError(page); + goto out; + } + + if (!wbc->for_reclaim) + need_balance_fs = true; + else if (has_not_enough_free_secs(sbi, 0)) + goto redirty_out; + + err = -EAGAIN; + f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode)) + err = f2fs_write_inline_data(inode, page); + if (err == -EAGAIN) + err = do_write_data_page(page, &fio); + f2fs_unlock_op(sbi); +done: + if (err && err != -ENOENT) + goto redirty_out; + + clear_cold_data(page); +out: + inode_dec_dirty_pages(inode); + if (err) + ClearPageUptodate(page); + unlock_page(page); + if (need_balance_fs) + f2fs_balance_fs(sbi); + if (wbc->for_reclaim) + f2fs_submit_merged_bio(sbi, DATA, WRITE); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool locked = false; + int ret; + long diff; + + trace_f2fs_writepages(mapping->host, wbc, DATA); + + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && + available_free_memory(sbi, DIRTY_DENTS)) + goto skip_write; + + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + diff = nr_pages_to_write(sbi, DATA, wbc); + + if (!S_ISDIR(inode->i_mode)) { + mutex_lock(&sbi->writepages); + locked = true; + } + ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + if (locked) + mutex_unlock(&sbi->writepages); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); + + remove_dirty_dir_inode(inode); + + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + return ret; + +skip_write: + wbc->pages_skipped += get_dirty_pages(inode); + return 0; +} + +static void f2fs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + truncate_blocks(inode, inode->i_size, true); + } +} + +static int f2fs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page, *ipage; + pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; + struct dnode_of_data dn; + int err = 0; + + trace_f2fs_write_begin(inode, pos, len, flags); + + f2fs_balance_fs(sbi); + + /* + * We should check this at this moment to avoid deadlock on inode page + * and #0 page. The locking rule for inline_data conversion should be: + * lock_page(page #0) -> lock_page(inode_page) + */ + if (index != 0) { + err = f2fs_convert_inline_inode(inode); + if (err) + goto fail; + } +repeat: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + err = -ENOMEM; + goto fail; + } + + *pagep = page; + + f2fs_lock_op(sbi); + + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_fail; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + sync_inode_page(&dn); + goto put_next; + } + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto put_fail; + } + err = f2fs_reserve_block(&dn, index); + if (err) + goto put_fail; +put_next: + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + + f2fs_wait_on_page_writeback(page, DATA); + + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + goto out; + } + + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + } else { + struct f2fs_io_info fio = { + .type = DATA, + .rw = READ_SYNC, + .blk_addr = dn.data_blkaddr, + }; + err = f2fs_submit_page_bio(sbi, page, &fio); + if (err) + goto fail; + + lock_page(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + err = -EIO; + goto fail; + } + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + } +out: + SetPageUptodate(page); + clear_cold_data(page); + return 0; + +put_fail: + f2fs_put_dnode(&dn); +unlock_fail: + f2fs_unlock_op(sbi); + f2fs_put_page(page, 1); +fail: + f2fs_write_failed(mapping, pos + len); + return err; +} + +static int f2fs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + trace_f2fs_write_end(inode, pos, len, copied); + + set_page_dirty(page); + + if (pos + copied > i_size_read(inode)) { + i_size_write(inode, pos + copied); + mark_inode_dirty(inode); + update_inode_page(inode); + } + + f2fs_put_page(page, 1); + return copied; +} + +static int check_direct_IO(struct inode *inode, struct iov_iter *iter, + loff_t offset) +{ + unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; + + if (iov_iter_rw(iter) == READ) + return 0; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + +static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + int err; + + /* we don't need to use inline_data strictly */ + if (f2fs_has_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + + if (check_direct_IO(inode, iter, offset)) + return 0; + + trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + + if (iov_iter_rw(iter) == WRITE) + __allocate_data_blocks(inode, offset, count); + + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block); + if (err < 0 && iov_iter_rw(iter) == WRITE) + f2fs_write_failed(mapping, offset + count); + + trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); + + return err; +} + +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino >= F2FS_ROOT_INO(sbi) && + (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)) + return; + + if (PageDirty(page)) { + if (inode->i_ino == F2FS_META_INO(sbi)) + dec_page_count(sbi, F2FS_DIRTY_META); + else if (inode->i_ino == F2FS_NODE_INO(sbi)) + dec_page_count(sbi, F2FS_DIRTY_NODES); + else + inode_dec_dirty_pages(inode); + } + ClearPagePrivate(page); +} + +int f2fs_release_page(struct page *page, gfp_t wait) +{ + /* If this is dirty page, keep PagePrivate */ + if (PageDirty(page)) + return 0; + + ClearPagePrivate(page); + return 1; +} + +static int f2fs_set_data_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + + trace_f2fs_set_page_dirty(page, DATA); + + SetPageUptodate(page); + + if (f2fs_is_atomic_file(inode)) { + register_inmem_page(inode, page); + return 1; + } + + mark_inode_dirty(inode); + + if (!PageDirty(page)) { + __set_page_dirty_nobuffers(page); + update_dirty_page(inode, page); + return 1; + } + return 0; +} + +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + + /* we don't need to use inline_data strictly */ + if (f2fs_has_inline_data(inode)) { + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + return generic_block_bmap(mapping, block, get_data_block); +} + +void init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); + init_rwsem(&sbi->extent_tree_lock); + INIT_LIST_HEAD(&sbi->extent_list); + spin_lock_init(&sbi->extent_lock); + sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_node, 0); +} + +int __init create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if (!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} + +const struct address_space_operations f2fs_dblock_aops = { + .readpage = f2fs_read_data_page, + .readpages = f2fs_read_data_pages, + .writepage = f2fs_write_data_page, + .writepages = f2fs_write_data_pages, + .write_begin = f2fs_write_begin, + .write_end = f2fs_write_end, + .set_page_dirty = f2fs_set_data_page_dirty, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, + .direct_IO = f2fs_direct_IO, + .bmap = f2fs_bmap, +}; diff --git a/kernel/fs/f2fs/debug.c b/kernel/fs/f2fs/debug.c new file mode 100644 index 000000000..f5388f372 --- /dev/null +++ b/kernel/fs/f2fs/debug.c @@ -0,0 +1,413 @@ +/* + * f2fs debugging statistics + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2012 Linux Foundation + * Copyright (c) 2012 Greg Kroah-Hartman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" + +static LIST_HEAD(f2fs_stat_list); +static struct dentry *f2fs_debugfs_root; +static DEFINE_MUTEX(f2fs_stat_mutex); + +static void update_general_status(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + int i; + + /* validation check of the segment numbers */ + si->hit_ext = sbi->read_hit_ext; + si->total_ext = sbi->total_hit_ext; + si->ext_tree = sbi->total_ext_tree; + si->ext_node = atomic_read(&sbi->total_ext_node); + si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); + si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); + si->ndirty_dirs = sbi->n_dirty_dirs; + si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; + si->rsvd_segs = reserved_segments(sbi); + si->overp_segs = overprovision_segments(sbi); + si->valid_count = valid_user_blocks(sbi); + si->valid_node_count = valid_node_count(sbi); + si->valid_inode_count = valid_inode_count(sbi); + si->inline_inode = atomic_read(&sbi->inline_inode); + si->inline_dir = atomic_read(&sbi->inline_dir); + si->utilization = utilization(sbi); + + si->free_segs = free_segments(sbi); + si->free_secs = free_sections(sbi); + si->prefree_count = prefree_segments(sbi); + si->dirty_count = dirty_segments(sbi); + si->node_pages = NODE_MAPPING(sbi)->nrpages; + si->meta_pages = META_MAPPING(sbi)->nrpages; + si->nats = NM_I(sbi)->nat_cnt; + si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; + si->sits = MAIN_SEGS(sbi); + si->dirty_sits = SIT_I(sbi)->dirty_sentries; + si->fnids = NM_I(sbi)->fcnt; + si->bg_gc = sbi->bg_gc; + si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) + * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) + / 2; + si->util_valid = (int)(written_block_count(sbi) >> + sbi->log_blocks_per_seg) + * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) + / 2; + si->util_invalid = 50 - si->util_free - si->util_valid; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + si->curseg[i] = curseg->segno; + si->cursec[i] = curseg->segno / sbi->segs_per_sec; + si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + } + + for (i = 0; i < 2; i++) { + si->segment_count[i] = sbi->segment_count[i]; + si->block_count[i] = sbi->block_count[i]; + } + + si->inplace_count = atomic_read(&sbi->inplace_count); +} + +/* + * This function calculates BDF of every segments + */ +static void update_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; + unsigned int segno, vblocks; + int ndirty = 0; + + bimodal = 0; + total_vblocks = 0; + blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + hblks_per_sec = blks_per_sec / 2; + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + dist = abs(vblocks - hblks_per_sec); + bimodal += dist * dist; + + if (vblocks > 0 && vblocks < blks_per_sec) { + total_vblocks += vblocks; + ndirty++; + } + } + dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; + si->bimodal = bimodal / dist; + if (si->dirty_count) + si->avg_vblocks = total_vblocks / ndirty; + else + si->avg_vblocks = 0; +} + +/* + * This function calculates memory footprint. + */ +static void update_mem_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned npages; + int i; + + if (si->base_mem) + goto get_cache; + + si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + si->base_mem += 2 * sizeof(struct f2fs_inode_info); + si->base_mem += sizeof(*sbi->ckpt); + + /* build sm */ + si->base_mem += sizeof(struct f2fs_sm_info); + + /* build sit */ + si->base_mem += sizeof(struct sit_info); + si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE; + if (sbi->segs_per_sec > 1) + si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); + si->base_mem += __bitmap_size(sbi, SIT_BITMAP); + + /* build free segmap */ + si->base_mem += sizeof(struct free_segmap_info); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); + + /* build curseg */ + si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; + si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + + /* build dirty segmap */ + si->base_mem += sizeof(struct dirty_seglist_info); + si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); + + /* build nm */ + si->base_mem += sizeof(struct f2fs_nm_info); + si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + +get_cache: + si->cache_mem = 0; + + /* build gc */ + if (sbi->gc_thread) + si->cache_mem += sizeof(struct f2fs_gc_kthread); + + /* build merge flush thread */ + if (SM_I(sbi)->cmd_control_info) + si->cache_mem += sizeof(struct flush_cmd_control); + + /* free nids */ + si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->dirty_nat_cnt * + sizeof(struct nat_entry_set); + si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); + si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); + for (i = 0; i <= UPDATE_INO; i++) + si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); + si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_node) * + sizeof(struct extent_node); + + si->page_mem = 0; + npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += npages << PAGE_CACHE_SHIFT; + npages = META_MAPPING(sbi)->nrpages; + si->page_mem += npages << PAGE_CACHE_SHIFT; +} + +static int stat_show(struct seq_file *s, void *v) +{ + struct f2fs_stat_info *si; + int i = 0; + int j; + + mutex_lock(&f2fs_stat_mutex); + list_for_each_entry(si, &f2fs_stat_list, stat_list) { + char devname[BDEVNAME_SIZE]; + + update_general_status(si->sbi); + + seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", + bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", + si->sit_area_segs, si->nat_area_segs); + seq_printf(s, "[SSA: %d] [MAIN: %d", + si->ssa_area_segs, si->main_area_segs); + seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", + si->overp_segs, si->rsvd_segs); + seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", + si->valid_node_count, si->valid_inode_count); + seq_printf(s, "Other: %u)\n - Data: %u\n", + si->valid_node_count - si->valid_inode_count, + si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); + seq_printf(s, " - Inline_dentry Inode: %u\n", + si->inline_dir); + seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", + si->main_area_segs, si->main_area_sections, + si->main_area_zones); + seq_printf(s, " - COLD data: %d, %d, %d\n", + si->curseg[CURSEG_COLD_DATA], + si->cursec[CURSEG_COLD_DATA], + si->curzone[CURSEG_COLD_DATA]); + seq_printf(s, " - WARM data: %d, %d, %d\n", + si->curseg[CURSEG_WARM_DATA], + si->cursec[CURSEG_WARM_DATA], + si->curzone[CURSEG_WARM_DATA]); + seq_printf(s, " - HOT data: %d, %d, %d\n", + si->curseg[CURSEG_HOT_DATA], + si->cursec[CURSEG_HOT_DATA], + si->curzone[CURSEG_HOT_DATA]); + seq_printf(s, " - Dir dnode: %d, %d, %d\n", + si->curseg[CURSEG_HOT_NODE], + si->cursec[CURSEG_HOT_NODE], + si->curzone[CURSEG_HOT_NODE]); + seq_printf(s, " - File dnode: %d, %d, %d\n", + si->curseg[CURSEG_WARM_NODE], + si->cursec[CURSEG_WARM_NODE], + si->curzone[CURSEG_WARM_NODE]); + seq_printf(s, " - Indir nodes: %d, %d, %d\n", + si->curseg[CURSEG_COLD_NODE], + si->cursec[CURSEG_COLD_NODE], + si->curzone[CURSEG_COLD_NODE]); + seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", + si->main_area_segs - si->dirty_count - + si->prefree_count - si->free_segs, + si->dirty_count); + seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", + si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "GC calls: %d (BG: %d)\n", + si->call_count, si->bg_gc); + seq_printf(s, " - data segments : %d (%d)\n", + si->data_segs, si->bg_data_segs); + seq_printf(s, " - node segments : %d (%d)\n", + si->node_segs, si->bg_node_segs); + seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, + si->bg_data_blks + si->bg_node_blks); + seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, + si->bg_data_blks); + seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, + si->bg_node_blks); + seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", + si->hit_ext, si->total_ext); + seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree); + seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node); + seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - inmem: %4d, wb: %4d\n", + si->inmem_pages, si->wb_pages); + seq_printf(s, " - nodes: %4d in %4d\n", + si->ndirty_node, si->node_pages); + seq_printf(s, " - dents: %4d in dirs:%4d\n", + si->ndirty_dent, si->ndirty_dirs); + seq_printf(s, " - meta: %4d in %4d\n", + si->ndirty_meta, si->meta_pages); + seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", + si->dirty_nats, si->nats, si->dirty_sits, si->sits); + seq_printf(s, " - free_nids: %9d\n", + si->fnids); + seq_puts(s, "\nDistribution of User Blocks:"); + seq_puts(s, " [ valid | invalid | free ]\n"); + seq_puts(s, " ["); + + for (j = 0; j < si->util_valid; j++) + seq_putc(s, '-'); + seq_putc(s, '|'); + + for (j = 0; j < si->util_invalid; j++) + seq_putc(s, '-'); + seq_putc(s, '|'); + + for (j = 0; j < si->util_free; j++) + seq_putc(s, '-'); + seq_puts(s, "]\n\n"); + seq_printf(s, "IPU: %u blocks\n", si->inplace_count); + seq_printf(s, "SSR: %u blocks in %u segments\n", + si->block_count[SSR], si->segment_count[SSR]); + seq_printf(s, "LFS: %u blocks in %u segments\n", + si->block_count[LFS], si->segment_count[LFS]); + + /* segment usage info */ + update_sit_info(si->sbi); + seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", + si->bimodal, si->avg_vblocks); + + /* memory footprint */ + update_mem_info(si->sbi); + seq_printf(s, "\nMemory: %u KB\n", + (si->base_mem + si->cache_mem + si->page_mem) >> 10); + seq_printf(s, " - static: %u KB\n", + si->base_mem >> 10); + seq_printf(s, " - cached: %u KB\n", + si->cache_mem >> 10); + seq_printf(s, " - paged : %u KB\n", + si->page_mem >> 10); + } + mutex_unlock(&f2fs_stat_mutex); + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, stat_show, inode->i_private); +} + +static const struct file_operations stat_fops = { + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int f2fs_build_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_stat_info *si; + + si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); + if (!si) + return -ENOMEM; + + si->all_area_segs = le32_to_cpu(raw_super->segment_count); + si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); + si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); + si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa); + si->main_area_segs = le32_to_cpu(raw_super->segment_count_main); + si->main_area_sections = le32_to_cpu(raw_super->section_count); + si->main_area_zones = si->main_area_sections / + le32_to_cpu(raw_super->secs_per_zone); + si->sbi = sbi; + sbi->stat_info = si; + + atomic_set(&sbi->inline_inode, 0); + atomic_set(&sbi->inline_dir, 0); + atomic_set(&sbi->inplace_count, 0); + + mutex_lock(&f2fs_stat_mutex); + list_add_tail(&si->stat_list, &f2fs_stat_list); + mutex_unlock(&f2fs_stat_mutex); + + return 0; +} + +void f2fs_destroy_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + mutex_lock(&f2fs_stat_mutex); + list_del(&si->stat_list); + mutex_unlock(&f2fs_stat_mutex); + + kfree(si); +} + +void __init f2fs_create_root_stats(void) +{ + struct dentry *file; + + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + if (!f2fs_debugfs_root) + return; + + file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, + NULL, &stat_fops); + if (!file) { + debugfs_remove(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; + } +} + +void f2fs_destroy_root_stats(void) +{ + if (!f2fs_debugfs_root) + return; + + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; +} diff --git a/kernel/fs/f2fs/dir.c b/kernel/fs/f2fs/dir.c new file mode 100644 index 000000000..3a3302ab7 --- /dev/null +++ b/kernel/fs/f2fs/dir.c @@ -0,0 +1,811 @@ +/* + * fs/f2fs/dir.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "f2fs.h" +#include "node.h" +#include "acl.h" +#include "xattr.h" + +static unsigned long dir_blocks(struct inode *inode) +{ + return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1)) + >> PAGE_CACHE_SHIFT; +} + +static unsigned int dir_buckets(unsigned int level, int dir_level) +{ + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) + return 1 << (level + dir_level); + else + return MAX_DIR_BUCKETS; +} + +static unsigned int bucket_blocks(unsigned int level) +{ + if (level < MAX_DIR_HASH_DEPTH / 2) + return 2; + else + return 4; +} + +unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { + [F2FS_FT_UNKNOWN] = DT_UNKNOWN, + [F2FS_FT_REG_FILE] = DT_REG, + [F2FS_FT_DIR] = DT_DIR, + [F2FS_FT_CHRDEV] = DT_CHR, + [F2FS_FT_BLKDEV] = DT_BLK, + [F2FS_FT_FIFO] = DT_FIFO, + [F2FS_FT_SOCK] = DT_SOCK, + [F2FS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, + [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, +}; + +void set_de_type(struct f2fs_dir_entry *de, umode_t mode) +{ + de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) +{ + unsigned long i; + unsigned long bidx = 0; + + for (i = 0; i < level; i++) + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); + bidx += idx * bucket_blocks(level); + return bidx; +} + +static bool early_match_name(size_t namelen, f2fs_hash_t namehash, + struct f2fs_dir_entry *de) +{ + if (le16_to_cpu(de->name_len) != namelen) + return false; + + if (de->hash_code != namehash) + return false; + + return true; +} + +static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, + struct qstr *name, int *max_slots, + struct page **res_page) +{ + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dir_entry *de; + struct f2fs_dentry_ptr d; + + dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); + + make_dentry_ptr(&d, (void *)dentry_blk, 1); + de = find_target_dentry(name, max_slots, &d); + + if (de) + *res_page = dentry_page; + else + kunmap(dentry_page); + + /* + * For the most part, it should be a bug when name_len is zero. + * We stop here for figuring out where the bugs has occurred. + */ + f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); + return de; +} + +struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots, + struct f2fs_dentry_ptr *d) +{ + struct f2fs_dir_entry *de; + unsigned long bit_pos = 0; + f2fs_hash_t namehash = f2fs_dentry_hash(name); + int max_len = 0; + + if (max_slots) + *max_slots = 0; + while (bit_pos < d->max) { + if (!test_bit_le(bit_pos, d->bitmap)) { + bit_pos++; + max_len++; + continue; + } + + de = &d->dentry[bit_pos]; + if (early_match_name(name->len, namehash, de) && + !memcmp(d->filename[bit_pos], name->name, name->len)) + goto found; + + if (max_slots && max_len > *max_slots) + *max_slots = max_len; + max_len = 0; + + /* remain bug on condition */ + if (unlikely(!de->name_len)) + d->max = -1; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + + de = NULL; +found: + if (max_slots && max_len > *max_slots) + *max_slots = max_len; + return de; +} + +static struct f2fs_dir_entry *find_in_level(struct inode *dir, + unsigned int level, struct qstr *name, + f2fs_hash_t namehash, struct page **res_page) +{ + int s = GET_DENTRY_SLOTS(name->len); + unsigned int nbucket, nblock; + unsigned int bidx, end_block; + struct page *dentry_page; + struct f2fs_dir_entry *de = NULL; + bool room = false; + int max_slots; + + f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); + + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); + nblock = bucket_blocks(level); + + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(namehash) % nbucket); + end_block = bidx + nblock; + + for (; bidx < end_block; bidx++) { + /* no need to allocate new dentry pages to all the indices */ + dentry_page = find_data_page(dir, bidx, true); + if (IS_ERR(dentry_page)) { + room = true; + continue; + } + + de = find_in_block(dentry_page, name, &max_slots, res_page); + if (de) + break; + + if (max_slots >= s) + room = true; + f2fs_put_page(dentry_page, 0); + } + + if (!de && room && F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; + } + + return de; +} + +/* + * Find an entry in the specified directory with the wanted name. + * It returns the page where the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + struct qstr *child, struct page **res_page) +{ + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + f2fs_hash_t name_hash; + unsigned int max_depth; + unsigned int level; + + *res_page = NULL; + + if (f2fs_has_inline_dentry(dir)) + return find_in_inline_dir(dir, child, res_page); + + if (npages == 0) + return NULL; + + name_hash = f2fs_dentry_hash(child); + max_depth = F2FS_I(dir)->i_current_depth; + + for (level = 0; level < max_depth; level++) { + de = find_in_level(dir, level, child, name_hash, res_page); + if (de) + break; + } + if (!de && F2FS_I(dir)->chash != name_hash) { + F2FS_I(dir)->chash = name_hash; + F2FS_I(dir)->clevel = level - 1; + } + return de; +} + +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) +{ + struct page *page; + struct f2fs_dir_entry *de; + struct f2fs_dentry_block *dentry_blk; + + if (f2fs_has_inline_dentry(dir)) + return f2fs_parent_inline_dir(dir, p); + + page = get_lock_data_page(dir, 0); + if (IS_ERR(page)) + return NULL; + + dentry_blk = kmap(page); + de = &dentry_blk->dentry[1]; + *p = page; + unlock_page(page); + return de; +} + +ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +{ + ino_t res = 0; + struct f2fs_dir_entry *de; + struct page *page; + + de = f2fs_find_entry(dir, qstr, &page); + if (de) { + res = le32_to_cpu(de->ino); + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + } + + return res; +} + +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode) +{ + enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; + lock_page(page); + f2fs_wait_on_page_writeback(page, type); + de->ino = cpu_to_le32(inode->i_ino); + set_de_type(de, inode->i_mode); + f2fs_dentry_kunmap(dir, page); + set_page_dirty(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + + f2fs_put_page(page, 1); +} + +static void init_dent_inode(const struct qstr *name, struct page *ipage) +{ + struct f2fs_inode *ri; + + f2fs_wait_on_page_writeback(ipage, NODE); + + /* copy name info. to this inode page */ + ri = F2FS_INODE(ipage); + ri->i_namelen = cpu_to_le32(name->len); + memcpy(ri->i_name, name->name, name->len); + set_page_dirty(ipage); +} + +int update_dent_inode(struct inode *inode, const struct qstr *name) +{ + struct page *page; + + page = get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + init_dent_inode(name, page); + f2fs_put_page(page, 1); + + return 0; +} + +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d) +{ + struct f2fs_dir_entry *de; + + de = &d->dentry[0]; + de->name_len = cpu_to_le16(1); + de->hash_code = 0; + de->ino = cpu_to_le32(inode->i_ino); + memcpy(d->filename[0], ".", 1); + set_de_type(de, inode->i_mode); + + de = &d->dentry[1]; + de->hash_code = 0; + de->name_len = cpu_to_le16(2); + de->ino = cpu_to_le32(parent->i_ino); + memcpy(d->filename[1], "..", 2); + set_de_type(de, parent->i_mode); + + test_and_set_bit_le(0, (void *)d->bitmap); + test_and_set_bit_le(1, (void *)d->bitmap); +} + +static int make_empty_dir(struct inode *inode, + struct inode *parent, struct page *page) +{ + struct page *dentry_page; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr d; + + if (f2fs_has_inline_dentry(inode)) + return make_empty_inline_dir(inode, parent, page); + + dentry_page = get_new_data_page(inode, page, 0, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + dentry_blk = kmap_atomic(dentry_page); + + make_dentry_ptr(&d, (void *)dentry_blk, 1); + do_make_empty_dir(inode, parent, &d); + + kunmap_atomic(dentry_blk); + + set_page_dirty(dentry_page); + f2fs_put_page(dentry_page, 1); + return 0; +} + +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *name, struct page *dpage) +{ + struct page *page; + int err; + + if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + page = new_inode_page(inode); + if (IS_ERR(page)) + return page; + + if (S_ISDIR(inode->i_mode)) { + err = make_empty_dir(inode, dir, page); + if (err) + goto error; + } + + err = f2fs_init_acl(inode, dir, page, dpage); + if (err) + goto put_error; + + err = f2fs_init_security(inode, dir, name, page); + if (err) + goto put_error; + } else { + page = get_node_page(F2FS_I_SB(dir), inode->i_ino); + if (IS_ERR(page)) + return page; + + set_cold_node(inode, page); + } + + if (name) + init_dent_inode(name, page); + + /* + * This file should be checkpointed during fsync. + * We lost i_pino from now on. + */ + if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + file_lost_pino(inode); + /* + * If link the tmpfile to alias through linkat path, + * we should remove this inode from orphan list. + */ + if (inode->i_nlink == 0) + remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); + inc_nlink(inode); + } + return page; + +put_error: + f2fs_put_page(page, 1); +error: + /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + truncate_inode_pages(&inode->i_data, 0); + truncate_blocks(inode, 0, false); + remove_dirty_dir_inode(inode); + remove_inode_page(inode); + return ERR_PTR(err); +} + +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth) +{ + if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) { + inc_nlink(dir); + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } + clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + } + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + + if (F2FS_I(dir)->i_current_depth != current_depth) { + F2FS_I(dir)->i_current_depth = current_depth; + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } + + if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) + clear_inode_flag(F2FS_I(inode), FI_INC_LINK); +} + +int room_for_filename(const void *bitmap, int slots, int max_slots) +{ + int bit_start = 0; + int zero_start, zero_end; +next: + zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start); + if (zero_start >= max_slots) + return max_slots; + + zero_end = find_next_bit_le(bitmap, max_slots, zero_start); + if (zero_end - zero_start >= slots) + return zero_start; + + bit_start = zero_end + 1; + + if (zero_end + 1 >= max_slots) + return max_slots; + goto next; +} + +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos) +{ + struct f2fs_dir_entry *de; + int slots = GET_DENTRY_SLOTS(name->len); + int i; + + de = &d->dentry[bit_pos]; + de->hash_code = name_hash; + de->name_len = cpu_to_le16(name->len); + memcpy(d->filename[bit_pos], name->name, name->len); + de->ino = cpu_to_le32(ino); + set_de_type(de, mode); + for (i = 0; i < slots; i++) + test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + unsigned int bit_pos; + unsigned int level; + unsigned int current_depth; + unsigned long bidx, block; + f2fs_hash_t dentry_hash; + unsigned int nbucket, nblock; + size_t namelen = name->len; + struct page *dentry_page = NULL; + struct f2fs_dentry_block *dentry_blk = NULL; + struct f2fs_dentry_ptr d; + int slots = GET_DENTRY_SLOTS(namelen); + struct page *page = NULL; + int err = 0; + + if (f2fs_has_inline_dentry(dir)) { + err = f2fs_add_inline_entry(dir, name, inode, ino, mode); + if (!err || err != -EAGAIN) + return err; + else + err = 0; + } + + dentry_hash = f2fs_dentry_hash(name); + level = 0; + current_depth = F2FS_I(dir)->i_current_depth; + if (F2FS_I(dir)->chash == dentry_hash) { + level = F2FS_I(dir)->clevel; + F2FS_I(dir)->chash = 0; + } + +start: + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; + + /* Increase the depth, if required */ + if (level == current_depth) + ++current_depth; + + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); + nblock = bucket_blocks(level); + + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(dentry_hash) % nbucket)); + + for (block = bidx; block <= (bidx + nblock - 1); block++) { + dentry_page = get_new_data_page(dir, NULL, block, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + dentry_blk = kmap(dentry_page); + bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + slots, NR_DENTRY_IN_BLOCK); + if (bit_pos < NR_DENTRY_IN_BLOCK) + goto add_dentry; + + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + } + + /* Move to next level to find the empty slot for new dentry */ + ++level; + goto start; +add_dentry: + f2fs_wait_on_page_writeback(dentry_page, DATA); + + if (inode) { + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, name, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + } + + make_dentry_ptr(&d, (void *)dentry_blk, 1); + f2fs_update_dentry(ino, mode, &d, name, dentry_hash, bit_pos); + + set_page_dirty(dentry_page); + + if (inode) { + /* we don't need to mark_inode_dirty now */ + F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + } + + update_parent_metadata(dir, inode, current_depth); +fail: + if (inode) + up_write(&F2FS_I(inode)->i_sem); + + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode_page(dir); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + return err; +} + +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) +{ + struct page *page; + int err = 0; + + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, NULL, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + /* we don't need to mark_inode_dirty now */ + update_inode(inode, page); + f2fs_put_page(page, 1); + + clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); +fail: + up_write(&F2FS_I(inode)->i_sem); + return err; +} + +void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + down_write(&F2FS_I(inode)->i_sem); + + if (S_ISDIR(inode->i_mode)) { + drop_nlink(dir); + if (page) + update_inode(dir, page); + else + update_inode_page(dir); + } + inode->i_ctime = CURRENT_TIME; + + drop_nlink(inode); + if (S_ISDIR(inode->i_mode)) { + drop_nlink(inode); + i_size_write(inode, 0); + } + up_write(&F2FS_I(inode)->i_sem); + update_inode_page(inode); + + if (inode->i_nlink == 0) + add_orphan_inode(sbi, inode->i_ino); + else + release_orphan_inode(sbi); +} + +/* + * It only removes the dentry from the dentry page, corresponding name + * entry in name page does not need to be touched during deletion. + */ +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode) +{ + struct f2fs_dentry_block *dentry_blk; + unsigned int bit_pos; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + int i; + + if (f2fs_has_inline_dentry(dir)) + return f2fs_delete_inline_entry(dentry, page, dir, inode); + + lock_page(page); + f2fs_wait_on_page_writeback(page, DATA); + + dentry_blk = page_address(page); + bit_pos = dentry - dentry_blk->dentry; + for (i = 0; i < slots; i++) + clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + + /* Let's check and deallocate this dentry page */ + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + 0); + kunmap(page); /* kunmap - pair of f2fs_find_entry */ + set_page_dirty(page); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + if (inode) + f2fs_drop_nlink(dir, inode, NULL); + + if (bit_pos == NR_DENTRY_IN_BLOCK) { + truncate_hole(dir, page->index, page->index + 1); + clear_page_dirty_for_io(page); + ClearPagePrivate(page); + ClearPageUptodate(page); + inode_dec_dirty_pages(dir); + } + f2fs_put_page(page, 1); +} + +bool f2fs_empty_dir(struct inode *dir) +{ + unsigned long bidx; + struct page *dentry_page; + unsigned int bit_pos; + struct f2fs_dentry_block *dentry_blk; + unsigned long nblock = dir_blocks(dir); + + if (f2fs_has_inline_dentry(dir)) + return f2fs_empty_inline_dir(dir); + + for (bidx = 0; bidx < nblock; bidx++) { + dentry_page = get_lock_data_page(dir, bidx); + if (IS_ERR(dentry_page)) { + if (PTR_ERR(dentry_page) == -ENOENT) + continue; + else + return false; + } + + dentry_blk = kmap_atomic(dentry_page); + if (bidx == 0) + bit_pos = 2; + else + bit_pos = 0; + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + bit_pos); + kunmap_atomic(dentry_blk); + + f2fs_put_page(dentry_page, 1); + + if (bit_pos < NR_DENTRY_IN_BLOCK) + return false; + } + return true; +} + +bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos) +{ + unsigned char d_type = DT_UNKNOWN; + unsigned int bit_pos; + struct f2fs_dir_entry *de = NULL; + + bit_pos = ((unsigned long)ctx->pos % d->max); + + while (bit_pos < d->max) { + bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); + if (bit_pos >= d->max) + break; + + de = &d->dentry[bit_pos]; + if (de->file_type < F2FS_FT_MAX) + d_type = f2fs_filetype_table[de->file_type]; + else + d_type = DT_UNKNOWN; + if (!dir_emit(ctx, d->filename[bit_pos], + le16_to_cpu(de->name_len), + le32_to_cpu(de->ino), d_type)) + return true; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + ctx->pos = start_pos + bit_pos; + } + return false; +} + +static int f2fs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + unsigned long npages = dir_blocks(inode); + struct f2fs_dentry_block *dentry_blk = NULL; + struct page *dentry_page = NULL; + struct file_ra_state *ra = &file->f_ra; + unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); + struct f2fs_dentry_ptr d; + + if (f2fs_has_inline_dentry(inode)) + return f2fs_read_inline_dir(file, ctx); + + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); + + for (; n < npages; n++) { + dentry_page = get_lock_data_page(inode, n); + if (IS_ERR(dentry_page)) + continue; + + dentry_blk = kmap(dentry_page); + + make_dentry_ptr(&d, (void *)dentry_blk, 1); + + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK)) + goto stop; + + ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + dentry_page = NULL; + } +stop: + if (dentry_page && !IS_ERR(dentry_page)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + } + + return 0; +} + +const struct file_operations f2fs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = f2fs_readdir, + .fsync = f2fs_sync_file, + .unlocked_ioctl = f2fs_ioctl, +}; diff --git a/kernel/fs/f2fs/f2fs.h b/kernel/fs/f2fs/f2fs.h new file mode 100644 index 000000000..8de34ab6d --- /dev/null +++ b/kernel/fs/f2fs/f2fs.h @@ -0,0 +1,1814 @@ +/* + * fs/f2fs/f2fs.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _LINUX_F2FS_H +#define _LINUX_F2FS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_F2FS_CHECK_FS +#define f2fs_bug_on(sbi, condition) BUG_ON(condition) +#define f2fs_down_write(x, y) down_write_nest_lock(x, y) +#else +#define f2fs_bug_on(sbi, condition) \ + do { \ + if (unlikely(condition)) { \ + WARN_ON(1); \ + set_sbi_flag(sbi, SBI_NEED_FSCK); \ + } \ + } while (0) +#define f2fs_down_write(x, y) down_write(x) +#endif + +/* + * For mount options + */ +#define F2FS_MOUNT_BG_GC 0x00000001 +#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 +#define F2FS_MOUNT_DISCARD 0x00000004 +#define F2FS_MOUNT_NOHEAP 0x00000008 +#define F2FS_MOUNT_XATTR_USER 0x00000010 +#define F2FS_MOUNT_POSIX_ACL 0x00000020 +#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 +#define F2FS_MOUNT_INLINE_XATTR 0x00000080 +#define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_INLINE_DENTRY 0x00000200 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000400 +#define F2FS_MOUNT_NOBARRIER 0x00000800 +#define F2FS_MOUNT_FASTBOOT 0x00001000 +#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 + +#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) + +#define ver_after(a, b) (typecheck(unsigned long long, a) && \ + typecheck(unsigned long long, b) && \ + ((long long)((a) - (b)) > 0)) + +typedef u32 block_t; /* + * should not change u32, since it is the on-disk block + * address format, __le32. + */ +typedef u32 nid_t; + +struct f2fs_mount_info { + unsigned int opt; +}; + +#define CRCPOLY_LE 0xedb88320 + +static inline __u32 f2fs_crc32(void *buf, size_t len) +{ + unsigned char *p = (unsigned char *)buf; + __u32 crc = F2FS_SUPER_MAGIC; + int i; + + while (len--) { + crc ^= *p++; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); + } + return crc; +} + +static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) +{ + return f2fs_crc32(buf, buf_size) == blk_crc; +} + +/* + * For checkpoint manager + */ +enum { + NAT_BITMAP, + SIT_BITMAP +}; + +enum { + CP_UMOUNT, + CP_FASTBOOT, + CP_SYNC, + CP_RECOVERY, + CP_DISCARD, +}; + +#define DEF_BATCHED_TRIM_SECTIONS 32 +#define BATCHED_TRIM_SEGMENTS(sbi) \ + (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + +struct cp_control { + int reason; + __u64 trim_start; + __u64 trim_end; + __u64 trim_minlen; + __u64 trimmed; +}; + +/* + * For CP/NAT/SIT/SSA readahead + */ +enum { + META_CP, + META_NAT, + META_SIT, + META_SSA, + META_POR, +}; + +/* for the list of ino */ +enum { + ORPHAN_INO, /* for orphan ino list */ + APPEND_INO, /* for append ino list */ + UPDATE_INO, /* for update ino list */ + MAX_INO_ENTRY, /* max. list */ +}; + +struct ino_entry { + struct list_head list; /* list head */ + nid_t ino; /* inode number */ +}; + +/* + * for the list of directory inodes or gc inodes. + * NOTE: there are two slab users for this structure, if we add/modify/delete + * fields in structure for one of slab users, it may affect fields or size of + * other one, in this condition, it's better to split both of slab and related + * data structure. + */ +struct inode_entry { + struct list_head list; /* list head */ + struct inode *inode; /* vfs inode pointer */ +}; + +/* for the list of blockaddresses to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t blkaddr; /* block address to be discarded */ + int len; /* # of consecutive blocks of the discard */ +}; + +/* for the list of fsync inodes, used only during recovery */ +struct fsync_inode_entry { + struct list_head list; /* list head */ + struct inode *inode; /* vfs inode pointer */ + block_t blkaddr; /* block address locating the last fsync */ + block_t last_dentry; /* block address locating the last dentry */ + block_t last_inode; /* block address locating the last inode */ +}; + +#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) +#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) + +#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) +#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) +#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) +#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) + +#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) +#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) + +static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +{ + int before = nats_in_cursum(rs); + rs->n_nats = cpu_to_le16(before + i); + return before; +} + +static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +{ + int before = sits_in_cursum(rs); + rs->n_sits = cpu_to_le16(before + i); + return before; +} + +static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, + int type) +{ + if (type == NAT_JOURNAL) + return size <= MAX_NAT_JENTRIES(sum); + return size <= MAX_SIT_JENTRIES(sum); +} + +/* + * ioctl commands + */ +#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS +#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS +#define F2FS_IOC_GETVERSION FS_IOC_GETVERSION + +#define F2FS_IOCTL_MAGIC 0xf5 +#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) +#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) +#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) +#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) +#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) + +/* + * should be same as XFS_IOC_GOINGDOWN. + * Flags for going down operation used by FS_IOC_GOINGDOWN + */ +#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */ +#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ +#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ +#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#endif + +/* + * For INODE and NODE manager + */ +/* for directory operations */ +struct f2fs_dentry_ptr { + const void *bitmap; + struct f2fs_dir_entry *dentry; + __u8 (*filename)[F2FS_SLOT_LEN]; + int max; +}; + +static inline void make_dentry_ptr(struct f2fs_dentry_ptr *d, + void *src, int type) +{ + if (type == 1) { + struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; + d->max = NR_DENTRY_IN_BLOCK; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; + } else { + struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; + d->max = NR_INLINE_DENTRY; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; + } +} + +/* + * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 + * as its node offset to distinguish from index node blocks. + * But some bits are used to mark the node block. + */ +#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ + >> OFFSET_BIT_SHIFT) +enum { + ALLOC_NODE, /* allocate a new node page if needed */ + LOOKUP_NODE, /* look up a node without readahead */ + LOOKUP_NODE_RA, /* + * look up a node with readahead called + * by get_data_block. + */ +}; + +#define F2FS_LINK_MAX 32000 /* maximum link count per file */ + +#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ + +/* vector size for gang look-up from extent cache that consists of radix tree */ +#define EXT_TREE_VEC_SIZE 64 + +/* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ + +/* number of extent info in extent cache we try to shrink */ +#define EXTENT_CACHE_SHRINK_NUMBER 128 + +struct extent_info { + unsigned int fofs; /* start offset in a file */ + u32 blk; /* start block address of the extent */ + unsigned int len; /* length of the extent */ +}; + +struct extent_node { + struct rb_node rb_node; /* rb node located in rb-tree */ + struct list_head list; /* node in global extent list of sbi */ + struct extent_info ei; /* extent info */ +}; + +struct extent_tree { + nid_t ino; /* inode number */ + struct rb_root root; /* root of extent info rb-tree */ + struct extent_node *cached_en; /* recently accessed extent node */ + rwlock_t lock; /* protect extent info rb-tree */ + atomic_t refcount; /* reference count of rb-tree */ + unsigned int count; /* # of extent node in rb-tree*/ +}; + +/* + * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. + */ +#define FADVISE_COLD_BIT 0x01 +#define FADVISE_LOST_PINO_BIT 0x02 + +#define DEF_DIR_LEVEL 0 + +struct f2fs_inode_info { + struct inode vfs_inode; /* serve a vfs inode */ + unsigned long i_flags; /* keep an inode flags for ioctl */ + unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ + unsigned int i_current_depth; /* use only in directory structure */ + unsigned int i_pino; /* parent inode number */ + umode_t i_acl_mode; /* keep file acl mode temporarily */ + + /* Use below internally in f2fs*/ + unsigned long flags; /* use to pass per-file flags */ + struct rw_semaphore i_sem; /* protect fi info */ + atomic_t dirty_pages; /* # of dirty pages */ + f2fs_hash_t chash; /* hash value of given file name */ + unsigned int clevel; /* maximum level of given file name */ + nid_t i_xattr_nid; /* node id that contains xattrs */ + unsigned long long xattr_ver; /* cp version of xattr modification */ + struct extent_info ext; /* in-memory extent cache entry */ + rwlock_t ext_lock; /* rwlock for single extent cache */ + struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + + struct radix_tree_root inmem_root; /* radix tree for inmem pages */ + struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct mutex inmem_lock; /* lock for inmemory pages */ +}; + +static inline void get_extent_info(struct extent_info *ext, + struct f2fs_extent i_ext) +{ + ext->fofs = le32_to_cpu(i_ext.fofs); + ext->blk = le32_to_cpu(i_ext.blk); + ext->len = le32_to_cpu(i_ext.len); +} + +static inline void set_raw_extent(struct extent_info *ext, + struct f2fs_extent *i_ext) +{ + i_ext->fofs = cpu_to_le32(ext->fofs); + i_ext->blk = cpu_to_le32(ext->blk); + i_ext->len = cpu_to_le32(ext->len); +} + +static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, + u32 blk, unsigned int len) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; +} + +static inline bool __is_extent_same(struct extent_info *ei1, + struct extent_info *ei2) +{ + return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && + ei1->len == ei2->len); +} + +static inline bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static inline bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static inline bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); +} + +struct f2fs_nm_info { + block_t nat_blkaddr; /* base disk address of NAT */ + nid_t max_nid; /* maximum possible node ids */ + nid_t available_nids; /* maximum available node ids */ + nid_t next_scan_nid; /* the next nid to be scanned */ + unsigned int ram_thresh; /* control the memory footprint */ + + /* NAT cache management */ + struct radix_tree_root nat_root;/* root of the nat entry cache */ + struct radix_tree_root nat_set_root;/* root of the nat set cache */ + struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct list_head nat_entries; /* cached nat entry list (clean) */ + unsigned int nat_cnt; /* the # of cached nat entries */ + unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + + /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ + struct list_head free_nid_list; /* a list for free nids */ + spinlock_t free_nid_list_lock; /* protect free nid list */ + unsigned int fcnt; /* the number of free node id */ + struct mutex build_lock; /* lock for build free nids */ + + /* for checkpoint */ + char *nat_bitmap; /* NAT bitmap pointer */ + int bitmap_size; /* bitmap size */ +}; + +/* + * this structure is used as one of function parameters. + * all the information are dedicated to a given direct node block determined + * by the data offset in a file. + */ +struct dnode_of_data { + struct inode *inode; /* vfs inode pointer */ + struct page *inode_page; /* its inode page, NULL is possible */ + struct page *node_page; /* cached direct node page */ + nid_t nid; /* node id of the direct node block */ + unsigned int ofs_in_node; /* data offset in the node page */ + bool inode_page_locked; /* inode page is locked or not */ + block_t data_blkaddr; /* block address of the node block */ +}; + +static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, + struct page *ipage, struct page *npage, nid_t nid) +{ + memset(dn, 0, sizeof(*dn)); + dn->inode = inode; + dn->inode_page = ipage; + dn->node_page = npage; + dn->nid = nid; +} + +/* + * For SIT manager + * + * By default, there are 6 active log areas across the whole main area. + * When considering hot and cold data separation to reduce cleaning overhead, + * we split 3 for data logs and 3 for node logs as hot, warm, and cold types, + * respectively. + * In the current design, you should not change the numbers intentionally. + * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6 + * logs individually according to the underlying devices. (default: 6) + * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for + * data and 8 for node logs. + */ +#define NR_CURSEG_DATA_TYPE (3) +#define NR_CURSEG_NODE_TYPE (3) +#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) + +enum { + CURSEG_HOT_DATA = 0, /* directory entry blocks */ + CURSEG_WARM_DATA, /* data blocks */ + CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ + CURSEG_HOT_NODE, /* direct node blocks of directory files */ + CURSEG_WARM_NODE, /* direct node blocks of normal files */ + CURSEG_COLD_NODE, /* indirect node blocks */ + NO_CHECK_TYPE, + CURSEG_DIRECT_IO, /* to use for the direct IO path */ +}; + +struct flush_cmd { + struct completion wait; + struct llist_node llnode; + int ret; +}; + +struct flush_cmd_control { + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + struct llist_head issue_list; /* list for command issue */ + struct llist_node *dispatch_list; /* list for command dispatch */ +}; + +struct f2fs_sm_info { + struct sit_info *sit_info; /* whole segment information */ + struct free_segmap_info *free_info; /* free segment information */ + struct dirty_seglist_info *dirty_info; /* dirty segment information */ + struct curseg_info *curseg_array; /* active segment information */ + + block_t seg0_blkaddr; /* block address of 0'th segment */ + block_t main_blkaddr; /* start block address of main area */ + block_t ssa_blkaddr; /* start block address of SSA area */ + + unsigned int segment_count; /* total # of segments */ + unsigned int main_segments; /* # of segments in main area */ + unsigned int reserved_segments; /* # of reserved segments */ + unsigned int ovp_segments; /* # of overprovision segments */ + + /* a threshold to reclaim prefree segments */ + unsigned int rec_prefree_segments; + + /* for small discard management */ + struct list_head discard_list; /* 4KB discard list */ + int nr_discards; /* # of discards in the list */ + int max_discards; /* max. discards to be issued */ + + /* for batched trimming */ + unsigned int trim_sections; /* # of sections to trim */ + + struct list_head sit_entry_set; /* sit entry set list */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ + unsigned int min_fsync_blocks; /* threshold for fsync */ + + /* for flush command control */ + struct flush_cmd_control *cmd_control_info; + +}; + +/* + * For superblock + */ +/* + * COUNT_TYPE for monitoring + * + * f2fs monitors the number of several block types such as on-writeback, + * dirty dentry blocks, dirty node blocks, and dirty meta blocks. + */ +enum count_type { + F2FS_WRITEBACK, + F2FS_DIRTY_DENTS, + F2FS_DIRTY_NODES, + F2FS_DIRTY_META, + F2FS_INMEM_PAGES, + NR_COUNT_TYPE, +}; + +/* + * The below are the page types of bios used in submit_bio(). + * The available types are: + * DATA User data pages. It operates as async mode. + * NODE Node pages. It operates as async mode. + * META FS metadata pages such as SIT, NAT, CP. + * NR_PAGE_TYPE The number of page types. + * META_FLUSH Make sure the previous pages are written + * with waiting the bio's completion + * ... Only can be used with META. + */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) +enum page_type { + DATA, + NODE, + META, + NR_PAGE_TYPE, + META_FLUSH, + INMEM, /* the below types are used by tracepoints only. */ + INMEM_DROP, + IPU, + OPU, +}; + +struct f2fs_io_info { + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + block_t blk_addr; /* block address to be written */ +}; + +#define is_read_io(rw) (((rw) & 1) == READ) +struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ + struct bio *bio; /* bios to merge */ + sector_t last_block_in_bio; /* last block number */ + struct f2fs_io_info fio; /* store buffered io info. */ + struct rw_semaphore io_rwsem; /* blocking op for bio */ +}; + +/* for inner inode cache management */ +struct inode_management { + struct radix_tree_root ino_root; /* ino entry array */ + spinlock_t ino_lock; /* for ino entry lock */ + struct list_head ino_list; /* inode list head */ + unsigned long ino_num; /* number of entries */ +}; + +/* For s_flag in struct f2fs_sb_info */ +enum { + SBI_IS_DIRTY, /* dirty flag for checkpoint */ + SBI_IS_CLOSE, /* specify unmounting */ + SBI_NEED_FSCK, /* need fsck.f2fs to fix */ + SBI_POR_DOING, /* recovery is doing or not */ +}; + +struct f2fs_sb_info { + struct super_block *sb; /* pointer to VFS super block */ + struct proc_dir_entry *s_proc; /* proc entry */ + struct buffer_head *raw_super_buf; /* buffer head of raw sb */ + struct f2fs_super_block *raw_super; /* raw super block pointer */ + int s_flag; /* flags for sbi */ + + /* for node-related operations */ + struct f2fs_nm_info *nm_info; /* node manager */ + struct inode *node_inode; /* cache node blocks */ + + /* for segment-related operations */ + struct f2fs_sm_info *sm_info; /* segment manager */ + + /* for bio operations */ + struct f2fs_bio_info read_io; /* for read bios */ + struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + + /* for checkpoint */ + struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + struct inode *meta_inode; /* cache meta blocks */ + struct mutex cp_mutex; /* checkpoint procedure lock */ + struct rw_semaphore cp_rwsem; /* blocking FS operations */ + struct rw_semaphore node_write; /* locking node writes */ + struct mutex writepages; /* mutex for writepages() */ + wait_queue_head_t cp_wait; + + struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + + /* for orphan inode, use 0'th array */ + unsigned int max_orphans; /* max orphan inodes */ + + /* for directory inode management */ + struct list_head dir_inode_list; /* dir inode list */ + spinlock_t dir_inode_lock; /* for dir inode list lock */ + + /* for extent tree cache */ + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + int total_ext_tree; /* extent tree count */ + atomic_t total_ext_node; /* extent info count */ + + /* basic filesystem units */ + unsigned int log_sectors_per_block; /* log2 sectors per block */ + unsigned int log_blocksize; /* log2 block size */ + unsigned int blocksize; /* block size */ + unsigned int root_ino_num; /* root inode number*/ + unsigned int node_ino_num; /* node inode number*/ + unsigned int meta_ino_num; /* meta inode number*/ + unsigned int log_blocks_per_seg; /* log2 blocks per segment */ + unsigned int blocks_per_seg; /* blocks per segment */ + unsigned int segs_per_sec; /* segments per section */ + unsigned int secs_per_zone; /* sections per zone */ + unsigned int total_sections; /* total section count */ + unsigned int total_node_count; /* total node block count */ + unsigned int total_valid_node_count; /* valid node block count */ + unsigned int total_valid_inode_count; /* valid inode count */ + int active_logs; /* # of active logs */ + int dir_level; /* directory level */ + + block_t user_block_count; /* # of user blocks */ + block_t total_valid_block_count; /* # of valid blocks */ + block_t alloc_valid_block_count; /* # of allocated blocks */ + block_t last_valid_block_count; /* for recovery */ + u32 s_next_generation; /* for NFS support */ + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + + struct f2fs_mount_info mount_opt; /* mount options */ + + /* for cleaning operations */ + struct mutex gc_mutex; /* mutex for GC */ + struct f2fs_gc_kthread *gc_thread; /* GC thread */ + unsigned int cur_victim_sec; /* current victim section num */ + + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; + + /* + * for stat information. + * one is for the LFS mode, and the other is for the SSR mode. + */ +#ifdef CONFIG_F2FS_STAT_FS + struct f2fs_stat_info *stat_info; /* FS status information */ + unsigned int segment_count[2]; /* # of allocated segments */ + unsigned int block_count[2]; /* # of allocated blocks */ + atomic_t inplace_count; /* # of inplace update */ + int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + atomic_t inline_inode; /* # of inline_data inodes */ + atomic_t inline_dir; /* # of inline_dentry inodes */ + int bg_gc; /* background gc calls */ + unsigned int n_dirty_dirs; /* # of dir inodes */ +#endif + unsigned int last_victim[2]; /* last victim segment # */ + spinlock_t stat_lock; /* lock for stat operations */ + + /* For sysfs suppport */ + struct kobject s_kobj; + struct completion s_kobj_unregister; +}; + +/* + * Inline functions + */ +static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) +{ + return container_of(inode, struct f2fs_inode_info, vfs_inode); +} + +static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode) +{ + return F2FS_SB(inode->i_sb); +} + +static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) +{ + return F2FS_I_SB(mapping->host); +} + +static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) +{ + return F2FS_M_SB(page->mapping); +} + +static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_super_block *)(sbi->raw_super); +} + +static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_checkpoint *)(sbi->ckpt); +} + +static inline struct f2fs_node *F2FS_NODE(struct page *page) +{ + return (struct f2fs_node *)page_address(page); +} + +static inline struct f2fs_inode *F2FS_INODE(struct page *page) +{ + return &((struct f2fs_node *)page_address(page))->i; +} + +static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_nm_info *)(sbi->nm_info); +} + +static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_sm_info *)(sbi->sm_info); +} + +static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi) +{ + return (struct sit_info *)(SM_I(sbi)->sit_info); +} + +static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi) +{ + return (struct free_segmap_info *)(SM_I(sbi)->free_info); +} + +static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) +{ + return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); +} + +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + +static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) +{ + return sbi->s_flag & (0x01 << type); +} + +static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) +{ + sbi->s_flag |= (0x01 << type); +} + +static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) +{ + sbi->s_flag &= ~(0x01 << type); +} + +static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) +{ + return le64_to_cpu(cp->checkpoint_ver); +} + +static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; +} + +static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + ckpt_flags |= f; + cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + ckpt_flags &= (~f); + cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) +{ + down_read(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) +{ + up_read(&sbi->cp_rwsem); +} + +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) +{ + f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); +} + +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) +{ + up_write(&sbi->cp_rwsem); +} + +static inline int __get_cp_reason(struct f2fs_sb_info *sbi) +{ + int reason = CP_SYNC; + + if (test_opt(sbi, FASTBOOT)) + reason = CP_FASTBOOT; + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) + reason = CP_UMOUNT; + return reason; +} + +static inline bool __remain_node_summaries(int reason) +{ + return (reason == CP_UMOUNT || reason == CP_FASTBOOT); +} + +static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) +{ + return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || + is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); +} + +/* + * Check whether the given nid is within node id range. + */ +static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ + if (unlikely(nid < F2FS_ROOT_INO(sbi))) + return -EINVAL; + if (unlikely(nid >= NM_I(sbi)->max_nid)) + return -EINVAL; + return 0; +} + +#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 + +/* + * Check whether the inode has blocks or not + */ +static inline int F2FS_HAS_BLOCKS(struct inode *inode) +{ + if (F2FS_I(inode)->i_xattr_nid) + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; + else + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; +} + +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; +} + +static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, blkcnt_t count) +{ + block_t valid_block_count; + + spin_lock(&sbi->stat_lock); + valid_block_count = + sbi->total_valid_block_count + (block_t)count; + if (unlikely(valid_block_count > sbi->user_block_count)) { + spin_unlock(&sbi->stat_lock); + return false; + } + inode->i_blocks += count; + sbi->total_valid_block_count = valid_block_count; + sbi->alloc_valid_block_count += (block_t)count; + spin_unlock(&sbi->stat_lock); + return true; +} + +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, + blkcnt_t count) +{ + spin_lock(&sbi->stat_lock); + f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); + f2fs_bug_on(sbi, inode->i_blocks < count); + inode->i_blocks -= count; + sbi->total_valid_block_count -= (block_t)count; + spin_unlock(&sbi->stat_lock); +} + +static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) +{ + atomic_inc(&sbi->nr_pages[count_type]); + set_sbi_flag(sbi, SBI_IS_DIRTY); +} + +static inline void inode_inc_dirty_pages(struct inode *inode) +{ + atomic_inc(&F2FS_I(inode)->dirty_pages); + if (S_ISDIR(inode->i_mode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); +} + +static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) +{ + atomic_dec(&sbi->nr_pages[count_type]); +} + +static inline void inode_dec_dirty_pages(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) + return; + + atomic_dec(&F2FS_I(inode)->dirty_pages); + + if (S_ISDIR(inode->i_mode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); +} + +static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +{ + return atomic_read(&sbi->nr_pages[count_type]); +} + +static inline int get_dirty_pages(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_pages); +} + +static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) +{ + unsigned int pages_per_sec = sbi->segs_per_sec * + (1 << sbi->log_blocks_per_seg); + return ((get_pages(sbi, block_type) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; +} + +static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->total_valid_block_count; +} + +static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + /* return NAT or SIT bitmap */ + if (flag == NAT_BITMAP) + return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + else if (flag == SIT_BITMAP) + return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + + return 0; +} + +static inline block_t __cp_payload(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); +} + +static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + int offset; + + if (__cp_payload(sbi) > 0) { + if (flag == NAT_BITMAP) + return &ckpt->sit_nat_version_bitmap; + else + return (unsigned char *)ckpt + F2FS_BLKSIZE; + } else { + offset = (flag == NAT_BITMAP) ? + le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; + return &ckpt->sit_nat_version_bitmap + offset; + } +} + +static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long long ckpt_version = cur_cp_version(ckpt); + + start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + /* + * odd numbered checkpoint should at cp segment 0 + * and even segment must be at cp segment 1 + */ + if (!(ckpt_version & 1)) + start_addr += sbi->blocks_per_seg; + + return start_addr; +} + +static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); +} + +static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode) +{ + block_t valid_block_count; + unsigned int valid_node_count; + + spin_lock(&sbi->stat_lock); + + valid_block_count = sbi->total_valid_block_count + 1; + if (unlikely(valid_block_count > sbi->user_block_count)) { + spin_unlock(&sbi->stat_lock); + return false; + } + + valid_node_count = sbi->total_valid_node_count + 1; + if (unlikely(valid_node_count > sbi->total_node_count)) { + spin_unlock(&sbi->stat_lock); + return false; + } + + if (inode) + inode->i_blocks++; + + sbi->alloc_valid_block_count++; + sbi->total_valid_node_count++; + sbi->total_valid_block_count++; + spin_unlock(&sbi->stat_lock); + + return true; +} + +static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode) +{ + spin_lock(&sbi->stat_lock); + + f2fs_bug_on(sbi, !sbi->total_valid_block_count); + f2fs_bug_on(sbi, !sbi->total_valid_node_count); + f2fs_bug_on(sbi, !inode->i_blocks); + + inode->i_blocks--; + sbi->total_valid_node_count--; + sbi->total_valid_block_count--; + + spin_unlock(&sbi->stat_lock); +} + +static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) +{ + return sbi->total_valid_node_count; +} + +static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) +{ + spin_lock(&sbi->stat_lock); + f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); + sbi->total_valid_inode_count++; + spin_unlock(&sbi->stat_lock); +} + +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) +{ + spin_lock(&sbi->stat_lock); + f2fs_bug_on(sbi, !sbi->total_valid_inode_count); + sbi->total_valid_inode_count--; + spin_unlock(&sbi->stat_lock); +} + +static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +{ + return sbi->total_valid_inode_count; +} + +static inline void f2fs_put_page(struct page *page, int unlock) +{ + if (!page) + return; + + if (unlock) { + f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); + unlock_page(page); + } + page_cache_release(page); +} + +static inline void f2fs_put_dnode(struct dnode_of_data *dn) +{ + if (dn->node_page) + f2fs_put_page(dn->node_page, 1); + if (dn->inode_page && dn->node_page != dn->inode_page) + f2fs_put_page(dn->inode_page, 0); + dn->node_page = NULL; + dn->inode_page = NULL; +} + +static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, + size_t size) +{ + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); +} + +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + void *entry; +retry: + entry = kmem_cache_alloc(cachep, flags); + if (!entry) { + cond_resched(); + goto retry; + } + + return entry; +} + +static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + while (radix_tree_insert(root, index, item)) + cond_resched(); +} + +#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) + +static inline bool IS_INODE(struct page *page) +{ + struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); +} + +static inline __le32 *blkaddr_in_node(struct f2fs_node *node) +{ + return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; +} + +static inline block_t datablock_addr(struct page *node_page, + unsigned int offset) +{ + struct f2fs_node *raw_node; + __le32 *addr_array; + raw_node = F2FS_NODE(node_page); + addr_array = blkaddr_in_node(raw_node); + return le32_to_cpu(addr_array[offset]); +} + +static inline int f2fs_test_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + return mask & *addr; +} + +static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) +{ + int mask; + int ret; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + ret = mask & *addr; + *addr |= mask; + return ret; +} + +static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) +{ + int mask; + int ret; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + ret = mask & *addr; + *addr &= ~mask; + return ret; +} + +static inline void f2fs_change_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr ^= mask; +} + +/* used for f2fs_inode_info->flags */ +enum { + FI_NEW_INODE, /* indicate newly allocated inode */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ + FI_INC_LINK, /* need to increment i_nlink */ + FI_ACL_MODE, /* indicate acl mode */ + FI_NO_ALLOC, /* should not allocate any blocks */ + FI_UPDATE_DIR, /* should update inode block for consistency */ + FI_DELAY_IPUT, /* used for the recovery */ + FI_NO_EXTENT, /* not to use the extent cache */ + FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ + FI_INLINE_DENTRY, /* used for inline dentry */ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_VOLATILE_FILE, /* indicate volatile file */ + FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ + FI_DROP_CACHE, /* drop dirty page cache */ + FI_DATA_EXIST, /* indicate data exists */ + FI_INLINE_DOTS, /* indicate inline dot dentries */ +}; + +static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +{ + if (!test_bit(flag, &fi->flags)) + set_bit(flag, &fi->flags); +} + +static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +{ + return test_bit(flag, &fi->flags); +} + +static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +{ + if (test_bit(flag, &fi->flags)) + clear_bit(flag, &fi->flags); +} + +static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +{ + fi->i_acl_mode = mode; + set_inode_flag(fi, FI_ACL_MODE); +} + +static inline void get_inline_info(struct f2fs_inode_info *fi, + struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_INLINE_XATTR) + set_inode_flag(fi, FI_INLINE_XATTR); + if (ri->i_inline & F2FS_INLINE_DATA) + set_inode_flag(fi, FI_INLINE_DATA); + if (ri->i_inline & F2FS_INLINE_DENTRY) + set_inode_flag(fi, FI_INLINE_DENTRY); + if (ri->i_inline & F2FS_DATA_EXIST) + set_inode_flag(fi, FI_DATA_EXIST); + if (ri->i_inline & F2FS_INLINE_DOTS) + set_inode_flag(fi, FI_INLINE_DOTS); +} + +static inline void set_raw_inline(struct f2fs_inode_info *fi, + struct f2fs_inode *ri) +{ + ri->i_inline = 0; + + if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(fi, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; + if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + ri->i_inline |= F2FS_INLINE_DENTRY; + if (is_inode_flag_set(fi, FI_DATA_EXIST)) + ri->i_inline |= F2FS_DATA_EXIST; + if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + ri->i_inline |= F2FS_INLINE_DOTS; +} + +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); +} + +static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +{ + if (f2fs_has_inline_xattr(&fi->vfs_inode)) + return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; + return DEF_ADDRS_PER_INODE; +} + +static inline void *inline_xattr_addr(struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - + F2FS_INLINE_XATTR_ADDRS]); +} + +static inline int inline_xattr_size(struct inode *inode) +{ + if (f2fs_has_inline_xattr(inode)) + return F2FS_INLINE_XATTR_ADDRS << 2; + else + return 0; +} + +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); +} + +static inline void f2fs_clear_inline_inode(struct inode *inode) +{ + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); +} + +static inline int f2fs_exist_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); +} + +static inline int f2fs_has_inline_dots(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); +} + +static inline bool f2fs_is_atomic_file(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); +} + +static inline bool f2fs_is_volatile_file(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); +} + +static inline bool f2fs_is_first_block_written(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); +} + +static inline bool f2fs_is_drop_cache(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); +} + +static inline void *inline_data_addr(struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[1]); +} + +static inline int f2fs_has_inline_dentry(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); +} + +static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) +{ + if (!f2fs_has_inline_dentry(dir)) + kunmap(page); +} + +static inline int f2fs_readonly(struct super_block *sb) +{ + return sb->s_flags & MS_RDONLY; +} + +static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) +{ + return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); +} + +static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; +} + +#define get_inode_mode(i) \ + ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + +/* get offset of first page in next direct node */ +#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ + ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ + (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) + +/* + * file.c + */ +int f2fs_sync_file(struct file *, loff_t, loff_t, int); +void truncate_data_blocks(struct dnode_of_data *); +int truncate_blocks(struct inode *, u64, bool); +void f2fs_truncate(struct inode *); +int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int f2fs_setattr(struct dentry *, struct iattr *); +int truncate_hole(struct inode *, pgoff_t, pgoff_t); +int truncate_data_blocks_range(struct dnode_of_data *, int); +long f2fs_ioctl(struct file *, unsigned int, unsigned long); +long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* + * inode.c + */ +void f2fs_set_inode_flags(struct inode *); +struct inode *f2fs_iget(struct super_block *, unsigned long); +int try_to_free_nats(struct f2fs_sb_info *, int); +void update_inode(struct inode *, struct page *); +void update_inode_page(struct inode *); +int f2fs_write_inode(struct inode *, struct writeback_control *); +void f2fs_evict_inode(struct inode *); +void handle_failed_inode(struct inode *); + +/* + * namei.c + */ +struct dentry *f2fs_get_parent(struct dentry *child); + +/* + * dir.c + */ +extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; +void set_de_type(struct f2fs_dir_entry *, umode_t); +struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *, + struct f2fs_dentry_ptr *); +bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, + unsigned int); +void do_make_empty_dir(struct inode *, struct inode *, + struct f2fs_dentry_ptr *); +struct page *init_inode_metadata(struct inode *, struct inode *, + const struct qstr *, struct page *); +void update_parent_metadata(struct inode *, struct inode *, unsigned int); +int room_for_filename(const void *, int, int); +void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, + struct page **); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); +ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, + struct page *, struct inode *); +int update_dent_inode(struct inode *, const struct qstr *); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, + const struct qstr *, f2fs_hash_t , unsigned int); +int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, + umode_t); +void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, + struct inode *); +int f2fs_do_tmpfile(struct inode *, struct inode *); +int f2fs_make_empty(struct inode *, struct inode *); +bool f2fs_empty_dir(struct inode *); + +static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) +{ + return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name, + inode, inode->i_ino, inode->i_mode); +} + +/* + * super.c + */ +int f2fs_sync_fs(struct super_block *, int); +extern __printf(3, 4) +void f2fs_msg(struct super_block *, const char *, const char *, ...); + +/* + * hash.c + */ +f2fs_hash_t f2fs_dentry_hash(const struct qstr *); + +/* + * node.c + */ +struct dnode_of_data; +struct node_info; + +bool available_free_memory(struct f2fs_sb_info *, int); +bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool has_fsynced_inode(struct f2fs_sb_info *, nid_t); +bool need_inode_block_update(struct f2fs_sb_info *, nid_t); +void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); +int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); +int truncate_inode_blocks(struct inode *, pgoff_t); +int truncate_xattr_node(struct inode *, struct page *); +int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); +void remove_inode_page(struct inode *); +struct page *new_inode_page(struct inode *); +struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); +void ra_node_page(struct f2fs_sb_info *, nid_t); +struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_node_page_ra(struct page *, int); +void sync_inode_page(struct dnode_of_data *); +int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +bool alloc_nid(struct f2fs_sb_info *, nid_t *); +void alloc_nid_done(struct f2fs_sb_info *, nid_t); +void alloc_nid_failed(struct f2fs_sb_info *, nid_t); +void recover_inline_xattr(struct inode *, struct page *); +void recover_xattr_data(struct inode *, struct page *, block_t); +int recover_inode_page(struct f2fs_sb_info *, struct page *); +int restore_node_summary(struct f2fs_sb_info *, unsigned int, + struct f2fs_summary_block *); +void flush_nat_entries(struct f2fs_sb_info *); +int build_node_manager(struct f2fs_sb_info *); +void destroy_node_manager(struct f2fs_sb_info *); +int __init create_node_manager_caches(void); +void destroy_node_manager_caches(void); + +/* + * segment.c + */ +void register_inmem_page(struct inode *, struct page *); +void commit_inmem_pages(struct inode *, bool); +void f2fs_balance_fs(struct f2fs_sb_info *); +void f2fs_balance_fs_bg(struct f2fs_sb_info *); +int f2fs_issue_flush(struct f2fs_sb_info *); +int create_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *); +void invalidate_blocks(struct f2fs_sb_info *, block_t); +void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); +void clear_prefree_segments(struct f2fs_sb_info *); +void release_discard_addrs(struct f2fs_sb_info *); +void discard_next_dnode(struct f2fs_sb_info *, block_t); +int npages_for_summary_flush(struct f2fs_sb_info *, bool); +void allocate_new_segments(struct f2fs_sb_info *); +int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); +struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); +void write_meta_page(struct f2fs_sb_info *, struct page *); +void write_node_page(struct f2fs_sb_info *, struct page *, + unsigned int, struct f2fs_io_info *); +void write_data_page(struct page *, struct dnode_of_data *, + struct f2fs_io_info *); +void rewrite_data_page(struct page *, struct f2fs_io_info *); +void recover_data_page(struct f2fs_sb_info *, struct page *, + struct f2fs_summary *, block_t, block_t); +void allocate_data_block(struct f2fs_sb_info *, struct page *, + block_t, block_t *, struct f2fs_summary *, int); +void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void write_data_summaries(struct f2fs_sb_info *, block_t); +void write_node_summaries(struct f2fs_sb_info *, block_t); +int lookup_journal_in_cursum(struct f2fs_summary_block *, + int, unsigned int, int); +void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); +int build_segment_manager(struct f2fs_sb_info *); +void destroy_segment_manager(struct f2fs_sb_info *); +int __init create_segment_manager_caches(void); +void destroy_segment_manager_caches(void); + +/* + * checkpoint.c + */ +struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); +void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); +long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); +void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +void release_dirty_inode(struct f2fs_sb_info *); +bool exist_written_data(struct f2fs_sb_info *, nid_t, int); +int acquire_orphan_inode(struct f2fs_sb_info *); +void release_orphan_inode(struct f2fs_sb_info *); +void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void remove_orphan_inode(struct f2fs_sb_info *, nid_t); +void recover_orphan_inodes(struct f2fs_sb_info *); +int get_valid_checkpoint(struct f2fs_sb_info *); +void update_dirty_page(struct inode *, struct page *); +void add_dirty_dir_inode(struct inode *); +void remove_dirty_dir_inode(struct inode *); +void sync_dirty_dir_inodes(struct f2fs_sb_info *); +void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void init_ino_entry_info(struct f2fs_sb_info *); +int __init create_checkpoint_caches(void); +void destroy_checkpoint_caches(void); + +/* + * data.c + */ +void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, + struct f2fs_io_info *); +void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, + struct f2fs_io_info *); +void set_data_blkaddr(struct dnode_of_data *); +int reserve_new_block(struct dnode_of_data *); +int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); +void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); +void f2fs_destroy_extent_tree(struct inode *); +void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *); +void f2fs_update_extent_cache(struct dnode_of_data *); +void f2fs_preserve_extent_tree(struct inode *); +struct page *find_data_page(struct inode *, pgoff_t, bool); +struct page *get_lock_data_page(struct inode *, pgoff_t); +struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); +int do_write_data_page(struct page *, struct f2fs_io_info *); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void init_extent_cache_info(struct f2fs_sb_info *); +int __init create_extent_cache(void); +void destroy_extent_cache(void); +void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); +int f2fs_release_page(struct page *, gfp_t); + +/* + * gc.c + */ +int start_gc_thread(struct f2fs_sb_info *); +void stop_gc_thread(struct f2fs_sb_info *); +block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); +int f2fs_gc(struct f2fs_sb_info *); +void build_gc_manager(struct f2fs_sb_info *); + +/* + * recovery.c + */ +int recover_fsync_data(struct f2fs_sb_info *); +bool space_for_roll_forward(struct f2fs_sb_info *); + +/* + * debug.c + */ +#ifdef CONFIG_F2FS_STAT_FS +struct f2fs_stat_info { + struct list_head stat_list; + struct f2fs_sb_info *sbi; + int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; + int main_area_segs, main_area_sections, main_area_zones; + int hit_ext, total_ext, ext_tree, ext_node; + int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int nats, dirty_nats, sits, dirty_sits, fnids; + int total_count, utilization; + int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages; + unsigned int valid_count, valid_node_count, valid_inode_count; + unsigned int bimodal, avg_vblocks; + int util_free, util_valid, util_invalid; + int rsvd_segs, overp_segs; + int dirty_count, node_pages, meta_pages; + int prefree_count, call_count, cp_count; + int tot_segs, node_segs, data_segs, free_segs, free_secs; + int bg_node_segs, bg_data_segs; + int tot_blks, data_blks, node_blks; + int bg_data_blks, bg_node_blks; + int curseg[NR_CURSEG_TYPE]; + int cursec[NR_CURSEG_TYPE]; + int curzone[NR_CURSEG_TYPE]; + + unsigned int segment_count[2]; + unsigned int block_count[2]; + unsigned int inplace_count; + unsigned base_mem, cache_mem, page_mem; +}; + +static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_stat_info *)sbi->stat_info; +} + +#define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_call_count(si) ((si)->call_count++) +#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) +#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) +#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_inc_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_dec_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_inc_seg_type(sbi, curseg) \ + ((sbi)->segment_count[(curseg)->alloc_type]++) +#define stat_inc_block_count(sbi, curseg) \ + ((sbi)->block_count[(curseg)->alloc_type]++) +#define stat_inc_inplace_blocks(sbi) \ + (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_seg_count(sbi, type, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + (si)->tot_segs++; \ + if (type == SUM_TYPE_DATA) { \ + si->data_segs++; \ + si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ + } else { \ + si->node_segs++; \ + si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \ + } \ + } while (0) + +#define stat_inc_tot_blk_count(si, blks) \ + (si->tot_blks += (blks)) + +#define stat_inc_data_blk_count(sbi, blks, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + stat_inc_tot_blk_count(si, blks); \ + si->data_blks += (blks); \ + si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + } while (0) + +#define stat_inc_node_blk_count(sbi, blks, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + stat_inc_tot_blk_count(si, blks); \ + si->node_blks += (blks); \ + si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + } while (0) + +int f2fs_build_stats(struct f2fs_sb_info *); +void f2fs_destroy_stats(struct f2fs_sb_info *); +void __init f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); +#else +#define stat_inc_cp_count(si) +#define stat_inc_call_count(si) +#define stat_inc_bggc_count(si) +#define stat_inc_dirty_dir(sbi) +#define stat_dec_dirty_dir(sbi) +#define stat_inc_total_hit(sb) +#define stat_inc_read_hit(sb) +#define stat_inc_inline_inode(inode) +#define stat_dec_inline_inode(inode) +#define stat_inc_inline_dir(inode) +#define stat_dec_inline_dir(inode) +#define stat_inc_seg_type(sbi, curseg) +#define stat_inc_block_count(sbi, curseg) +#define stat_inc_inplace_blocks(sbi) +#define stat_inc_seg_count(sbi, type, gc_type) +#define stat_inc_tot_blk_count(si, blks) +#define stat_inc_data_blk_count(sbi, blks, gc_type) +#define stat_inc_node_blk_count(sbi, blks, gc_type) + +static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } +static inline void __init f2fs_create_root_stats(void) { } +static inline void f2fs_destroy_root_stats(void) { } +#endif + +extern const struct file_operations f2fs_dir_operations; +extern const struct file_operations f2fs_file_operations; +extern const struct inode_operations f2fs_file_inode_operations; +extern const struct address_space_operations f2fs_dblock_aops; +extern const struct address_space_operations f2fs_node_aops; +extern const struct address_space_operations f2fs_meta_aops; +extern const struct inode_operations f2fs_dir_inode_operations; +extern const struct inode_operations f2fs_symlink_inode_operations; +extern const struct inode_operations f2fs_special_inode_operations; +extern struct kmem_cache *inode_entry_slab; + +/* + * inline.c + */ +bool f2fs_may_inline(struct inode *); +void read_inline_data(struct page *, struct page *); +bool truncate_inline_inode(struct page *, u64); +int f2fs_read_inline_data(struct inode *, struct page *); +int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); +int f2fs_convert_inline_inode(struct inode *); +int f2fs_write_inline_data(struct inode *, struct page *); +bool recover_inline_data(struct inode *, struct page *); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *, + struct page **); +struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); +int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, + nid_t, umode_t); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, + struct inode *, struct inode *); +bool f2fs_empty_inline_dir(struct inode *); +int f2fs_read_inline_dir(struct file *, struct dir_context *); +#endif diff --git a/kernel/fs/f2fs/file.c b/kernel/fs/f2fs/file.c new file mode 100644 index 000000000..2b52e48d7 --- /dev/null +++ b/kernel/fs/f2fs/file.c @@ -0,0 +1,1172 @@ +/* + * fs/f2fs/file.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "acl.h" +#include "trace.h" +#include + +static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int err; + + f2fs_balance_fs(sbi); + + sb_start_pagefault(inode->i_sb); + + f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + + /* block allocation */ + f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_reserve_block(&dn, page->index); + if (err) { + f2fs_unlock_op(sbi); + goto out; + } + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + + file_update_time(vma->vm_file); + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode) || + !PageUptodate(page))) { + unlock_page(page); + err = -EFAULT; + goto out; + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) + goto mapped; + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { + unsigned offset; + offset = i_size_read(inode) & ~PAGE_CACHE_MASK; + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + } + set_page_dirty(page); + SetPageUptodate(page); + + trace_f2fs_vm_page_mkwrite(page, DATA); +mapped: + /* fill the page */ + f2fs_wait_on_page_writeback(page, DATA); +out: + sb_end_pagefault(inode->i_sb); + return block_page_mkwrite_return(err); +} + +static const struct vm_operations_struct f2fs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = f2fs_vm_page_mkwrite, +}; + +static int get_parent_ino(struct inode *inode, nid_t *pino) +{ + struct dentry *dentry; + + inode = igrab(inode); + dentry = d_find_any_alias(inode); + iput(inode); + if (!dentry) + return 0; + + if (update_dent_inode(inode, &dentry->d_name)) { + dput(dentry); + return 0; + } + + *pino = parent_ino(dentry); + dput(dentry); + return 1; +} + +static inline bool need_do_checkpoint(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool need_cp = false; + + if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) + need_cp = true; + else if (file_wrong_pino(inode)) + need_cp = true; + else if (!space_for_roll_forward(sbi)) + need_cp = true; + else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + need_cp = true; + else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) + need_cp = true; + else if (test_opt(sbi, FASTBOOT)) + need_cp = true; + else if (sbi->active_logs == 2) + need_cp = true; + + return need_cp; +} + +static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct page *i = find_get_page(NODE_MAPPING(sbi), ino); + bool ret = false; + /* But we need to avoid that there are some inode updates */ + if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) + ret = true; + f2fs_put_page(i, 0); + return ret; +} + +static void try_to_fix_pino(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t pino; + + down_write(&fi->i_sem); + fi->xattr_ver = 0; + if (file_wrong_pino(inode) && inode->i_nlink == 1 && + get_parent_ino(inode, &pino)) { + fi->i_pino = pino; + file_got_pino(inode); + up_write(&fi->i_sem); + + mark_inode_dirty_sync(inode); + f2fs_write_inode(inode, NULL); + } else { + up_write(&fi->i_sem); + } +} + +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t ino = inode->i_ino; + int ret = 0; + bool need_cp = false; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + + if (unlikely(f2fs_readonly(inode->i_sb))) + return 0; + + trace_f2fs_sync_file_enter(inode); + + /* if fdatasync is triggered, let's do in-place-update */ + if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(fi, FI_NEED_IPU); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + clear_inode_flag(fi, FI_NEED_IPU); + + if (ret) { + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + return ret; + } + + /* if the inode is dirty, let's recover all the time */ + if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) { + update_inode_page(inode); + goto go_write; + } + + /* + * if there is no written data, don't waste time to write recovery info. + */ + if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + !exist_written_data(sbi, ino, APPEND_INO)) { + + /* it may call write_inode just prior to fsync */ + if (need_inode_page_update(sbi, ino)) + goto go_write; + + if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + exist_written_data(sbi, ino, UPDATE_INO)) + goto flush_out; + goto out; + } +go_write: + /* guarantee free sections for fsync */ + f2fs_balance_fs(sbi); + + /* + * Both of fdatasync() and fsync() are able to be recovered from + * sudden-power-off. + */ + down_read(&fi->i_sem); + need_cp = need_do_checkpoint(inode); + up_read(&fi->i_sem); + + if (need_cp) { + /* all the dirty node pages should be flushed for POR */ + ret = f2fs_sync_fs(inode->i_sb, 1); + + /* + * We've secured consistency through sync_fs. Following pino + * will be used only for fsynced inodes after checkpoint. + */ + try_to_fix_pino(inode); + clear_inode_flag(fi, FI_APPEND_WRITE); + clear_inode_flag(fi, FI_UPDATE_WRITE); + goto out; + } +sync_nodes: + sync_node_pages(sbi, ino, &wbc); + + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) + goto out; + + if (need_inode_block_update(sbi, ino)) { + mark_inode_dirty_sync(inode); + f2fs_write_inode(inode, NULL); + goto sync_nodes; + } + + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + + /* once recovery info is written, don't need to tack this */ + remove_dirty_inode(sbi, ino, APPEND_INO); + clear_inode_flag(fi, FI_APPEND_WRITE); +flush_out: + remove_dirty_inode(sbi, ino, UPDATE_INO); + clear_inode_flag(fi, FI_UPDATE_WRITE); + ret = f2fs_issue_flush(sbi); +out: + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + f2fs_trace_ios(NULL, NULL, 1); + return ret; +} + +static pgoff_t __get_first_dirty_index(struct address_space *mapping, + pgoff_t pgofs, int whence) +{ + struct pagevec pvec; + int nr_pages; + + if (whence != SEEK_DATA) + return 0; + + /* find first dirty page index */ + pagevec_init(&pvec, 0); + nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, + PAGECACHE_TAG_DIRTY, 1); + pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; + pagevec_release(&pvec); + return pgofs; +} + +static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, + int whence) +{ + switch (whence) { + case SEEK_DATA: + if ((blkaddr == NEW_ADDR && dirty == pgofs) || + (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) + return true; + break; + case SEEK_HOLE: + if (blkaddr == NULL_ADDR) + return true; + break; + } + return false; +} + +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + struct dnode_of_data dn; + pgoff_t pgofs, end_offset, dirty; + loff_t data_ofs = offset; + loff_t isize; + int err = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) + goto fail; + + /* handle inline data case */ + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { + if (whence == SEEK_HOLE) + data_ofs = isize; + goto found; + } + + pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + + dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); + + for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + if (err && err != -ENOENT) { + goto fail; + } else if (err == -ENOENT) { + /* direct node does not exists */ + if (whence == SEEK_DATA) { + pgofs = PGOFS_OF_NEXT_DNODE(pgofs, + F2FS_I(inode)); + continue; + } else { + goto found; + } + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + + /* find data/hole in dnode block */ + for (; dn.ofs_in_node < end_offset; + dn.ofs_in_node++, pgofs++, + data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + block_t blkaddr; + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (__found_offset(blkaddr, dirty, pgofs, whence)) { + f2fs_put_dnode(&dn); + goto found; + } + } + f2fs_put_dnode(&dn); + } + + if (whence == SEEK_DATA) + goto fail; +found: + if (whence == SEEK_HOLE && data_ofs > isize) + data_ofs = isize; + mutex_unlock(&inode->i_mutex); + return vfs_setpos(file, data_ofs, maxbytes); +fail: + mutex_unlock(&inode->i_mutex); + return -ENXIO; +} + +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + case SEEK_HOLE: + if (offset < 0) + return -ENXIO; + return f2fs_seek_block(file, offset, whence); + } + + return -EINVAL; +} + +static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + + /* we don't need to use inline_data strictly */ + if (f2fs_has_inline_data(inode)) { + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + + file_accessed(file); + vma->vm_ops = &f2fs_file_vm_ops; + return 0; +} + +int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +{ + int nr_free = 0, ofs = dn->ofs_in_node; + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_node *raw_node; + __le32 *addr; + + raw_node = F2FS_NODE(dn->node_page); + addr = blkaddr_in_node(raw_node) + ofs; + + for (; count > 0; count--, addr++, dn->ofs_in_node++) { + block_t blkaddr = le32_to_cpu(*addr); + if (blkaddr == NULL_ADDR) + continue; + + dn->data_blkaddr = NULL_ADDR; + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); + invalidate_blocks(sbi, blkaddr); + if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) + clear_inode_flag(F2FS_I(dn->inode), + FI_FIRST_BLOCK_WRITTEN); + nr_free++; + } + if (nr_free) { + dec_valid_block_count(sbi, dn->inode, nr_free); + set_page_dirty(dn->node_page); + sync_inode_page(dn); + } + dn->ofs_in_node = ofs; + + trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, + dn->ofs_in_node, nr_free); + return nr_free; +} + +void truncate_data_blocks(struct dnode_of_data *dn) +{ + truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); +} + +static int truncate_partial_data_page(struct inode *inode, u64 from, + bool force) +{ + unsigned offset = from & (PAGE_CACHE_SIZE - 1); + struct page *page; + + if (!offset && !force) + return 0; + + page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, force); + if (IS_ERR(page)) + return 0; + + lock_page(page); + if (unlikely(!PageUptodate(page) || + page->mapping != inode->i_mapping)) + goto out; + + f2fs_wait_on_page_writeback(page, DATA); + zero_user(page, offset, PAGE_CACHE_SIZE - offset); + if (!force) + set_page_dirty(page); +out: + f2fs_put_page(page, 1); + return 0; +} + +int truncate_blocks(struct inode *inode, u64 from, bool lock) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int blocksize = inode->i_sb->s_blocksize; + struct dnode_of_data dn; + pgoff_t free_from; + int count = 0, err = 0; + struct page *ipage; + bool truncate_page = false; + + trace_f2fs_truncate_blocks_enter(inode, from); + + free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + + if (lock) + f2fs_lock_op(sbi); + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + if (f2fs_has_inline_data(inode)) { + if (truncate_inline_inode(ipage, from)) + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + truncate_page = true; + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + if (err) { + if (err == -ENOENT) + goto free_next; + goto out; + } + + count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + + count -= dn.ofs_in_node; + f2fs_bug_on(sbi, count < 0); + + if (dn.ofs_in_node || IS_INODE(dn.node_page)) { + truncate_data_blocks_range(&dn, count); + free_from += count; + } + + f2fs_put_dnode(&dn); +free_next: + err = truncate_inode_blocks(inode, free_from); +out: + if (lock) + f2fs_unlock_op(sbi); + + /* lastly zero out the first data page */ + if (!err) + err = truncate_partial_data_page(inode, from, truncate_page); + + trace_f2fs_truncate_blocks_exit(inode, err); + return err; +} + +void f2fs_truncate(struct inode *inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + + trace_f2fs_truncate(inode); + + /* we should check inline_data size */ + if (f2fs_has_inline_data(inode) && !f2fs_may_inline(inode)) { + if (f2fs_convert_inline_inode(inode)) + return; + } + + if (!truncate_blocks(inode, i_size_read(inode), true)) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } +} + +int f2fs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + generic_fillattr(inode, stat); + stat->blocks <<= 3; + return 0; +} + +#ifdef CONFIG_F2FS_FS_POSIX_ACL +static void __setattr_copy(struct inode *inode, const struct iattr *attr) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + set_acl_inode(fi, mode); + } +} +#else +#define __setattr_copy setattr_copy +#endif + +int f2fs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct f2fs_inode_info *fi = F2FS_I(inode); + int err; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + f2fs_truncate(inode); + f2fs_balance_fs(F2FS_I_SB(inode)); + } else { + /* + * giving a chance to truncate blocks past EOF which + * are fallocated with FALLOC_FL_KEEP_SIZE. + */ + f2fs_truncate(inode); + } + } + + __setattr_copy(inode, attr); + + if (attr->ia_valid & ATTR_MODE) { + err = posix_acl_chmod(inode, get_inode_mode(inode)); + if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { + inode->i_mode = fi->i_acl_mode; + clear_inode_flag(fi, FI_ACL_MODE); + } + } + + mark_inode_dirty(inode); + return err; +} + +const struct inode_operations f2fs_file_inode_operations = { + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, +#ifdef CONFIG_F2FS_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = f2fs_listxattr, + .removexattr = generic_removexattr, +#endif + .fiemap = f2fs_fiemap, +}; + +static void fill_zero(struct inode *inode, pgoff_t index, + loff_t start, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page; + + if (!len) + return; + + f2fs_balance_fs(sbi); + + f2fs_lock_op(sbi); + page = get_new_data_page(inode, NULL, index, false); + f2fs_unlock_op(sbi); + + if (!IS_ERR(page)) { + f2fs_wait_on_page_writeback(page, DATA); + zero_user(page, start, len); + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +{ + pgoff_t index; + int err; + + for (index = pg_start; index < pg_end; index++) { + struct dnode_of_data dn; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + if (err == -ENOENT) + continue; + return err; + } + + if (dn.data_blkaddr != NULL_ADDR) + truncate_data_blocks_range(&dn, 1); + f2fs_put_dnode(&dn); + } + return 0; +} + +static int punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + pgoff_t pg_start, pg_end; + loff_t off_start, off_end; + int ret = 0; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + /* skip punching hole beyond i_size */ + if (offset >= inode->i_size) + return ret; + + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + + off_start = offset & (PAGE_CACHE_SIZE - 1); + off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + + if (pg_start == pg_end) { + fill_zero(inode, pg_start, off_start, + off_end - off_start); + } else { + if (off_start) + fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + if (off_end) + fill_zero(inode, pg_end, 0, off_end); + + if (pg_start < pg_end) { + struct address_space *mapping = inode->i_mapping; + loff_t blk_start, blk_end; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + f2fs_balance_fs(sbi); + + blk_start = pg_start << PAGE_CACHE_SHIFT; + blk_end = pg_end << PAGE_CACHE_SHIFT; + truncate_inode_pages_range(mapping, blk_start, + blk_end - 1); + + f2fs_lock_op(sbi); + ret = truncate_hole(inode, pg_start, pg_end); + f2fs_unlock_op(sbi); + } + } + + return ret; +} + +static int expand_inode_data(struct inode *inode, loff_t offset, + loff_t len, int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t index, pg_start, pg_end; + loff_t new_size = i_size_read(inode); + loff_t off_start, off_end; + int ret = 0; + + f2fs_balance_fs(sbi); + + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) + return ret; + + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + + off_start = offset & (PAGE_CACHE_SIZE - 1); + off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + + f2fs_lock_op(sbi); + + for (index = pg_start; index <= pg_end; index++) { + struct dnode_of_data dn; + + if (index == pg_end && !off_end) + goto noalloc; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_reserve_block(&dn, index); + if (ret) + break; +noalloc: + if (pg_start == pg_end) + new_size = offset + len; + else if (index == pg_start && off_start) + new_size = (index + 1) << PAGE_CACHE_SHIFT; + else if (index == pg_end) + new_size = (index << PAGE_CACHE_SHIFT) + off_end; + else + new_size += PAGE_CACHE_SIZE; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + i_size_read(inode) < new_size) { + i_size_write(inode, new_size); + mark_inode_dirty(inode); + update_inode_page(inode); + } + f2fs_unlock_op(sbi); + + return ret; +} + +static long f2fs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + long ret; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = punch_hole(inode, offset, len); + else + ret = expand_inode_data(inode, offset, len, mode); + + if (!ret) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } + + mutex_unlock(&inode->i_mutex); + + trace_f2fs_fallocate(inode, mode, offset, len, ret); + return ret; +} + +static int f2fs_release_file(struct inode *inode, struct file *filp) +{ + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode)) + commit_inmem_pages(inode, true); + if (f2fs_is_volatile_file(inode)) { + set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + filemap_fdatawrite(inode->i_mapping); + clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + } + return 0; +} + +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + +static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); +} + +static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int oldflags; + int ret; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto out; + } + + if (get_user(flags, (int __user *)arg)) { + ret = -EFAULT; + goto out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + + oldflags = fi->i_flags; + + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; + goto out; + } + } + + flags = flags & FS_FL_USER_MODIFIABLE; + flags |= oldflags & ~FS_FL_USER_MODIFIABLE; + fi->i_flags = flags; + mutex_unlock(&inode->i_mutex); + + f2fs_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + return put_user(inode->i_generation, (int __user *)arg); +} + +static int f2fs_ioc_start_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + f2fs_balance_fs(F2FS_I_SB(inode)); + + if (f2fs_is_atomic_file(inode)) + return 0; + + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + + return f2fs_convert_inline_inode(inode); +} + +static int f2fs_ioc_commit_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (f2fs_is_volatile_file(inode)) + return 0; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (f2fs_is_atomic_file(inode)) + commit_inmem_pages(inode, false); + + ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); + mnt_drop_write_file(filp); + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + return ret; +} + +static int f2fs_ioc_start_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (f2fs_is_volatile_file(inode)) + return 0; + + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + + return f2fs_convert_inline_inode(inode); +} + +static int f2fs_ioc_release_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (!f2fs_is_volatile_file(inode)) + return 0; + + if (!f2fs_is_first_block_written(inode)) + return truncate_partial_data_page(inode, 0, true); + + punch_hole(inode, 0, F2FS_BLKSIZE); + return 0; +} + +static int f2fs_ioc_abort_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(F2FS_I_SB(inode)); + + if (f2fs_is_atomic_file(inode)) { + commit_inmem_pages(inode, false); + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + } + + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + filemap_fdatawrite(inode->i_mapping); + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + } + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + __u32 in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__u32 __user *)arg)) + return -EFAULT; + + switch (in) { + case F2FS_GOING_DOWN_FULLSYNC: + sb = freeze_bdev(sb->s_bdev); + if (sb && !IS_ERR(sb)) { + f2fs_stop_checkpoint(sbi); + thaw_bdev(sb->s_bdev, sb); + } + break; + case F2FS_GOING_DOWN_METASYNC: + /* do checkpoint only */ + f2fs_sync_fs(sb, 1); + f2fs_stop_checkpoint(sbi); + break; + case F2FS_GOING_DOWN_NOSYNC: + f2fs_stop_checkpoint(sbi); + break; + default: + return -EINVAL; + } + return 0; +} + +static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = f2fs_trim_fs(F2FS_SB(sb), &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + return 0; +} + +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case F2FS_IOC_GETFLAGS: + return f2fs_ioc_getflags(filp, arg); + case F2FS_IOC_SETFLAGS: + return f2fs_ioc_setflags(filp, arg); + case F2FS_IOC_GETVERSION: + return f2fs_ioc_getversion(filp, arg); + case F2FS_IOC_START_ATOMIC_WRITE: + return f2fs_ioc_start_atomic_write(filp); + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_START_VOLATILE_WRITE: + return f2fs_ioc_start_volatile_write(filp); + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + return f2fs_ioc_release_volatile_write(filp); + case F2FS_IOC_ABORT_VOLATILE_WRITE: + return f2fs_ioc_abort_volatile_write(filp); + case F2FS_IOC_SHUTDOWN: + return f2fs_ioc_shutdown(filp, arg); + case FITRIM: + return f2fs_ioc_fitrim(filp, arg); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case F2FS_IOC32_GETFLAGS: + cmd = F2FS_IOC_GETFLAGS; + break; + case F2FS_IOC32_SETFLAGS: + cmd = F2FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +const struct file_operations f2fs_file_operations = { + .llseek = f2fs_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .open = generic_file_open, + .release = f2fs_release_file, + .mmap = f2fs_file_mmap, + .fsync = f2fs_sync_file, + .fallocate = f2fs_fallocate, + .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +}; diff --git a/kernel/fs/f2fs/gc.c b/kernel/fs/f2fs/gc.c new file mode 100644 index 000000000..ed58211fe --- /dev/null +++ b/kernel/fs/f2fs/gc.c @@ -0,0 +1,746 @@ +/* + * fs/f2fs/gc.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" +#include + +static int gc_thread_func(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + long wait_ms; + + wait_ms = gc_th->min_sleep_time; + + do { + if (try_to_freeze()) + continue; + else + wait_event_interruptible_timeout(*wq, + kthread_should_stop(), + msecs_to_jiffies(wait_ms)); + if (kthread_should_stop()) + break; + + if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { + increase_sleep_time(gc_th, &wait_ms); + continue; + } + + /* + * [GC triggering condition] + * 0. GC is not conducted currently. + * 1. There are enough dirty segments. + * 2. IO subsystem is idle by checking the # of writeback pages. + * 3. IO subsystem is idle by checking the # of requests in + * bdev's request list. + * + * Note) We have to avoid triggering GCs frequently. + * Because it is possible that some segments can be + * invalidated soon after by user update or deletion. + * So, I'd like to wait some time to collect dirty segments. + */ + if (!mutex_trylock(&sbi->gc_mutex)) + continue; + + if (!is_idle(sbi)) { + increase_sleep_time(gc_th, &wait_ms); + mutex_unlock(&sbi->gc_mutex); + continue; + } + + if (has_enough_invalid_blocks(sbi)) + decrease_sleep_time(gc_th, &wait_ms); + else + increase_sleep_time(gc_th, &wait_ms); + + stat_inc_bggc_count(sbi); + + /* if return value is not zero, no victim was selected */ + if (f2fs_gc(sbi)) + wait_ms = gc_th->no_gc_sleep_time; + + /* balancing f2fs's metadata periodically */ + f2fs_balance_fs_bg(sbi); + + } while (!kthread_should_stop()); + return 0; +} + +int start_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th; + dev_t dev = sbi->sb->s_bdev->bd_dev; + int err = 0; + + gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + if (!gc_th) { + err = -ENOMEM; + goto out; + } + + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; + + gc_th->gc_idle = 0; + + sbi->gc_thread = gc_th; + init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, + "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(gc_th->f2fs_gc_task)) { + err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); + sbi->gc_thread = NULL; + } +out: + return err; +} + +void stop_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) + return; + kthread_stop(gc_th->f2fs_gc_task); + kfree(gc_th); + sbi->gc_thread = NULL; +} + +static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) +{ + int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + + if (gc_th && gc_th->gc_idle) { + if (gc_th->gc_idle == 1) + gc_mode = GC_CB; + else if (gc_th->gc_idle == 2) + gc_mode = GC_GREEDY; + } + return gc_mode; +} + +static void select_policy(struct f2fs_sb_info *sbi, int gc_type, + int type, struct victim_sel_policy *p) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + if (p->alloc_mode == SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; + } else { + p->gc_mode = select_gc_type(sbi->gc_thread, gc_type); + p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; + p->ofs_unit = sbi->segs_per_sec; + } + + if (p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; + + p->offset = sbi->last_victim[p->gc_mode]; +} + +static unsigned int get_max_cost(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + /* SSR allocates in a segment unit */ + if (p->alloc_mode == SSR) + return 1 << sbi->log_blocks_per_seg; + if (p->gc_mode == GC_GREEDY) + return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + else if (p->gc_mode == GC_CB) + return UINT_MAX; + else /* No other gc_mode */ + return 0; +} + +static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno; + + /* + * If the gc_type is FG_GC, we can select victim segments + * selected by background GC before. + * Those segments guarantee they have small valid blocks. + */ + for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { + if (sec_usage_check(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); + return secno * sbi->segs_per_sec; + } + return NULL_SEGNO; +} + +static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SECNO(sbi, segno); + unsigned int start = secno * sbi->segs_per_sec; + unsigned long long mtime = 0; + unsigned int vblocks; + unsigned char age = 0; + unsigned char u; + unsigned int i; + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + + mtime = div_u64(mtime, sbi->segs_per_sec); + vblocks = div_u64(vblocks, sbi->segs_per_sec); + + u = (vblocks * 100) >> sbi->log_blocks_per_seg; + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (sit_i->max_mtime != sit_i->min_mtime) + age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime), + sit_i->max_mtime - sit_i->min_mtime); + + return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); +} + +static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, + unsigned int segno, struct victim_sel_policy *p) +{ + if (p->alloc_mode == SSR) + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + /* alloc_mode == LFS */ + if (p->gc_mode == GC_GREEDY) + return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + else + return get_cb_cost(sbi, segno); +} + +/* + * This function is called from two paths. + * One is garbage collection and the other is SSR segment selection. + * When it is called during GC, it just gets a victim segment + * and it does not remove it from dirty seglist. + * When it is called from SSR segment selection, it finds a segment + * which has minimum valid blocks and removes it from dirty seglist. + */ +static int get_victim_by_default(struct f2fs_sb_info *sbi, + unsigned int *result, int gc_type, int type, char alloc_mode) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct victim_sel_policy p; + unsigned int secno, max_cost; + int nsearched = 0; + + mutex_lock(&dirty_i->seglist_lock); + + p.alloc_mode = alloc_mode; + select_policy(sbi, gc_type, type, &p); + + p.min_segno = NULL_SEGNO; + p.min_cost = max_cost = get_max_cost(sbi, &p); + + if (p.alloc_mode == LFS && gc_type == FG_GC) { + p.min_segno = check_bg_victims(sbi); + if (p.min_segno != NULL_SEGNO) + goto got_it; + } + + while (1) { + unsigned long cost; + unsigned int segno; + + segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset); + if (segno >= MAIN_SEGS(sbi)) { + if (sbi->last_victim[p.gc_mode]) { + sbi->last_victim[p.gc_mode] = 0; + p.offset = 0; + continue; + } + break; + } + + p.offset = segno + p.ofs_unit; + if (p.ofs_unit > 1) + p.offset -= segno % p.ofs_unit; + + secno = GET_SECNO(sbi, segno); + + if (sec_usage_check(sbi, secno)) + continue; + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) + continue; + + cost = get_gc_cost(sbi, segno, &p); + + if (p.min_cost > cost) { + p.min_segno = segno; + p.min_cost = cost; + } else if (unlikely(cost == max_cost)) { + continue; + } + + if (nsearched++ >= p.max_search) { + sbi->last_victim[p.gc_mode] = segno; + break; + } + } + if (p.min_segno != NULL_SEGNO) { +got_it: + if (p.alloc_mode == LFS) { + secno = GET_SECNO(sbi, p.min_segno); + if (gc_type == FG_GC) + sbi->cur_victim_sec = secno; + else + set_bit(secno, dirty_i->victim_secmap); + } + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + + trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, + sbi->cur_victim_sec, + prefree_segments(sbi), free_segments(sbi)); + } + mutex_unlock(&dirty_i->seglist_lock); + + return (p.min_segno == NULL_SEGNO) ? 0 : 1; +} + +static const struct victim_selection default_v_ops = { + .get_victim = get_victim_by_default, +}; + +static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino) +{ + struct inode_entry *ie; + + ie = radix_tree_lookup(&gc_list->iroot, ino); + if (ie) + return ie->inode; + return NULL; +} + +static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) +{ + struct inode_entry *new_ie; + + if (inode == find_gc_inode(gc_list, inode->i_ino)) { + iput(inode); + return; + } + new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + new_ie->inode = inode; + + f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); + list_add_tail(&new_ie->list, &gc_list->ilist); +} + +static void put_gc_inode(struct gc_inode_list *gc_list) +{ + struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { + radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); + iput(ie->inode); + list_del(&ie->list); + kmem_cache_free(inode_entry_slab, ie); + } +} + +static int check_valid_map(struct f2fs_sb_info *sbi, + unsigned int segno, int offset) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct seg_entry *sentry; + int ret; + + mutex_lock(&sit_i->sentry_lock); + sentry = get_seg_entry(sbi, segno); + ret = f2fs_test_bit(offset, sentry->cur_valid_map); + mutex_unlock(&sit_i->sentry_lock); + return ret; +} + +/* + * This function compares node address got in summary with that in NAT. + * On validity, copy that node with cold status, otherwise (invalid node) + * ignore that. + */ +static void gc_node_segment(struct f2fs_sb_info *sbi, + struct f2fs_summary *sum, unsigned int segno, int gc_type) +{ + bool initial = true; + struct f2fs_summary *entry; + int off; + +next_step: + entry = sum; + + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + nid_t nid = le32_to_cpu(entry->nid); + struct page *node_page; + + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + return; + + if (check_valid_map(sbi, segno, off) == 0) + continue; + + if (initial) { + ra_node_page(sbi, nid); + continue; + } + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + continue; + + /* block may become invalid during get_node_page */ + if (check_valid_map(sbi, segno, off) == 0) { + f2fs_put_page(node_page, 1); + continue; + } + + /* set page dirty and write it */ + if (gc_type == FG_GC) { + f2fs_wait_on_page_writeback(node_page, NODE); + set_page_dirty(node_page); + } else { + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } + f2fs_put_page(node_page, 1); + stat_inc_node_blk_count(sbi, 1, gc_type); + } + + if (initial) { + initial = false; + goto next_step; + } + + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + sync_node_pages(sbi, 0, &wbc); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) + goto next_step; + } +} + +/* + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. + */ +block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) +{ + unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; + unsigned int bidx; + + if (node_ofs == 0) + return 0; + + if (node_ofs <= 2) { + bidx = node_ofs - 1; + } else if (node_ofs <= indirect_blks) { + int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; + } else { + int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; + } + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); +} + +static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct node_info *dni, block_t blkaddr, unsigned int *nofs) +{ + struct page *node_page; + nid_t nid; + unsigned int ofs_in_node; + block_t source_blkaddr; + + nid = le32_to_cpu(sum->nid); + ofs_in_node = le16_to_cpu(sum->ofs_in_node); + + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return 0; + + get_node_info(sbi, nid, dni); + + if (sum->version != dni->version) { + f2fs_put_page(node_page, 1); + return 0; + } + + *nofs = ofs_of_node(node_page); + source_blkaddr = datablock_addr(node_page, ofs_in_node); + f2fs_put_page(node_page, 1); + + if (source_blkaddr != blkaddr) + return 0; + return 1; +} + +static void move_data_page(struct inode *inode, struct page *page, int gc_type) +{ + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + }; + + if (gc_type == BG_GC) { + if (PageWriteback(page)) + goto out; + set_page_dirty(page); + set_cold_data(page); + } else { + f2fs_wait_on_page_writeback(page, DATA); + + if (clear_page_dirty_for_io(page)) + inode_dec_dirty_pages(inode); + set_cold_data(page); + do_write_data_page(page, &fio); + clear_cold_data(page); + } +out: + f2fs_put_page(page, 1); +} + +/* + * This function tries to get parent node of victim data block, and identifies + * data block validity. If the block is valid, copy that with cold status and + * modify parent node. + * If the parent node is not valid or the data block address is different, + * the victim data block is ignored. + */ +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct gc_inode_list *gc_list, unsigned int segno, int gc_type) +{ + struct super_block *sb = sbi->sb; + struct f2fs_summary *entry; + block_t start_addr; + int off; + int phase = 0; + + start_addr = START_BLOCK(sbi, segno); + +next_step: + entry = sum; + + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + struct page *data_page; + struct inode *inode; + struct node_info dni; /* dnode info for the data */ + unsigned int ofs_in_node, nofs; + block_t start_bidx; + + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + return; + + if (check_valid_map(sbi, segno, off) == 0) + continue; + + if (phase == 0) { + ra_node_page(sbi, le32_to_cpu(entry->nid)); + continue; + } + + /* Get an inode by ino with checking validity */ + if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) + continue; + + if (phase == 1) { + ra_node_page(sbi, dni.ino); + continue; + } + + ofs_in_node = le16_to_cpu(entry->ofs_in_node); + + if (phase == 2) { + inode = f2fs_iget(sb, dni.ino); + if (IS_ERR(inode) || is_bad_inode(inode)) + continue; + + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + + data_page = find_data_page(inode, + start_bidx + ofs_in_node, false); + if (IS_ERR(data_page)) { + iput(inode); + continue; + } + + f2fs_put_page(data_page, 0); + add_gc_inode(gc_list, inode); + continue; + } + + /* phase 3 */ + inode = find_gc_inode(gc_list, dni.ino); + if (inode) { + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + data_page = get_lock_data_page(inode, + start_bidx + ofs_in_node); + if (IS_ERR(data_page)) + continue; + move_data_page(inode, data_page, gc_type); + stat_inc_data_blk_count(sbi, 1, gc_type); + } + } + + if (++phase < 4) + goto next_step; + + if (gc_type == FG_GC) { + f2fs_submit_merged_bio(sbi, DATA, WRITE); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) { + phase = 2; + goto next_step; + } + } +} + +static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, + int gc_type) +{ + struct sit_info *sit_i = SIT_I(sbi); + int ret; + + mutex_lock(&sit_i->sentry_lock); + ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, + NO_CHECK_TYPE, LFS); + mutex_unlock(&sit_i->sentry_lock); + return ret; +} + +static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, + struct gc_inode_list *gc_list, int gc_type) +{ + struct page *sum_page; + struct f2fs_summary_block *sum; + struct blk_plug plug; + + /* read segment summary of victim */ + sum_page = get_sum_page(sbi, segno); + + blk_start_plug(&plug); + + sum = page_address(sum_page); + + switch (GET_SUM_TYPE((&sum->footer))) { + case SUM_TYPE_NODE: + gc_node_segment(sbi, sum->entries, segno, gc_type); + break; + case SUM_TYPE_DATA: + gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type); + break; + } + blk_finish_plug(&plug); + + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); + stat_inc_call_count(sbi->stat_info); + + f2fs_put_page(sum_page, 1); +} + +int f2fs_gc(struct f2fs_sb_info *sbi) +{ + unsigned int segno, i; + int gc_type = BG_GC; + int nfree = 0; + int ret = -1; + struct cp_control cpc; + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(GFP_NOFS), + }; + + cpc.reason = __get_cp_reason(sbi); +gc_more: + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + goto stop; + if (unlikely(f2fs_cp_error(sbi))) + goto stop; + + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { + gc_type = FG_GC; + write_checkpoint(sbi, &cpc); + } + + if (!__get_victim(sbi, &segno, gc_type)) + goto stop; + ret = 0; + + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, + META_SSA); + + for (i = 0; i < sbi->segs_per_sec; i++) + do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + + if (gc_type == FG_GC) { + sbi->cur_victim_sec = NULL_SEGNO; + nfree++; + WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); + } + + if (has_not_enough_free_secs(sbi, nfree)) + goto gc_more; + + if (gc_type == FG_GC) + write_checkpoint(sbi, &cpc); +stop: + mutex_unlock(&sbi->gc_mutex); + + put_gc_inode(&gc_list); + return ret; +} + +void build_gc_manager(struct f2fs_sb_info *sbi) +{ + DIRTY_I(sbi)->v_ops = &default_v_ops; +} diff --git a/kernel/fs/f2fs/gc.h b/kernel/fs/f2fs/gc.h new file mode 100644 index 000000000..b4a65be9f --- /dev/null +++ b/kernel/fs/f2fs/gc.h @@ -0,0 +1,110 @@ +/* + * fs/f2fs/gc.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define GC_THREAD_MIN_WB_PAGES 1 /* + * a threshold to determine + * whether IO subsystem is idle + * or not + */ +#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 +#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ +#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ +#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ + +/* Search max. number of dirty segments to select a victim segment */ +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ + +struct f2fs_gc_kthread { + struct task_struct *f2fs_gc_task; + wait_queue_head_t gc_wait_queue_head; + + /* for gc sleep time */ + unsigned int min_sleep_time; + unsigned int max_sleep_time; + unsigned int no_gc_sleep_time; + + /* for changing gc mode */ + unsigned int gc_idle; +}; + +struct gc_inode_list { + struct list_head ilist; + struct radix_tree_root iroot; +}; + +/* + * inline functions + */ +static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) +{ + if (free_segments(sbi) < overprovision_segments(sbi)) + return 0; + else + return (free_segments(sbi) - overprovision_segments(sbi)) + << sbi->log_blocks_per_seg; +} + +static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +{ + return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100; +} + +static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +{ + block_t reclaimable_user_blocks = sbi->user_block_count - + written_block_count(sbi); + return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; +} + +static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, + long *wait) +{ + if (*wait == gc_th->no_gc_sleep_time) + return; + + *wait += gc_th->min_sleep_time; + if (*wait > gc_th->max_sleep_time) + *wait = gc_th->max_sleep_time; +} + +static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, + long *wait) +{ + if (*wait == gc_th->no_gc_sleep_time) + *wait = gc_th->max_sleep_time; + + *wait -= gc_th->min_sleep_time; + if (*wait <= gc_th->min_sleep_time) + *wait = gc_th->min_sleep_time; +} + +static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) +{ + block_t invalid_user_blocks = sbi->user_block_count - + written_block_count(sbi); + /* + * Background GC is triggered with the following conditions. + * 1. There are a number of invalid blocks. + * 2. There is not enough free space. + */ + if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && + free_user_blocks(sbi) < limit_free_user_blocks(sbi)) + return true; + return false; +} + +static inline int is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); +} diff --git a/kernel/fs/f2fs/hash.c b/kernel/fs/f2fs/hash.c new file mode 100644 index 000000000..a844fcfb9 --- /dev/null +++ b/kernel/fs/f2fs/hash.c @@ -0,0 +1,104 @@ +/* + * fs/f2fs/hash.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include + +#include "f2fs.h" + +/* + * Hashing code copied from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(unsigned int buf[4], unsigned int const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const unsigned char *msg, size_t len, + unsigned int *buf, int num) +{ + unsigned pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num * 4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +{ + __u32 hash; + f2fs_hash_t f2fs_hash; + const unsigned char *p; + __u32 in[8], buf[4]; + const unsigned char *name = name_info->name; + size_t len = name_info->len; + + if ((len <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) + return 0; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + p = name; + while (1) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + p += 16; + if (len <= 16) + break; + len -= 16; + } + hash = buf[0]; + f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); + return f2fs_hash; +} diff --git a/kernel/fs/f2fs/inline.c b/kernel/fs/f2fs/inline.c new file mode 100644 index 000000000..8140e4f0e --- /dev/null +++ b/kernel/fs/f2fs/inline.c @@ -0,0 +1,532 @@ +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li + * Haicheng Li + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include "f2fs.h" + +bool f2fs_may_inline(struct inode *inode) +{ + if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) + return false; + + if (f2fs_is_atomic_file(inode)) + return false; + + if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) + return false; + + if (i_size_read(inode) > MAX_INLINE_DATA) + return false; + + return true; +} + +void read_inline_data(struct page *page, struct page *ipage) +{ + void *src_addr, *dst_addr; + + if (PageUptodate(page)) + return; + + f2fs_bug_on(F2FS_P_SB(page), page->index); + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap_atomic(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + flush_dcache_page(page); + kunmap_atomic(dst_addr); + SetPageUptodate(page); +} + +bool truncate_inline_inode(struct page *ipage, u64 from) +{ + void *addr; + + if (from >= MAX_INLINE_DATA) + return false; + + addr = inline_data_addr(ipage); + + f2fs_wait_on_page_writeback(ipage, NODE); + memset(addr + from, 0, MAX_INLINE_DATA - from); + + return true; +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct page *ipage; + + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) { + unlock_page(page); + return PTR_ERR(ipage); + } + + if (!f2fs_has_inline_data(inode)) { + f2fs_put_page(ipage, 1); + return -EAGAIN; + } + + if (page->index) + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + else + read_inline_data(page, ipage); + + SetPageUptodate(page); + f2fs_put_page(ipage, 1); + unlock_page(page); + return 0; +} + +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) +{ + void *src_addr, *dst_addr; + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC | REQ_PRIO, + }; + int dirty, err; + + f2fs_bug_on(F2FS_I_SB(dn->inode), page->index); + + if (!f2fs_exist_data(dn->inode)) + goto clear_out; + + err = f2fs_reserve_block(dn, 0); + if (err) + return err; + + f2fs_wait_on_page_writeback(page, DATA); + + if (PageUptodate(page)) + goto no_update; + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(dn->inode_page); + dst_addr = kmap_atomic(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + flush_dcache_page(page); + kunmap_atomic(dst_addr); + SetPageUptodate(page); +no_update: + /* clear dirty state */ + dirty = clear_page_dirty_for_io(page); + + /* write data page to try to make data consistent */ + set_page_writeback(page); + fio.blk_addr = dn->data_blkaddr; + write_data_page(page, dn, &fio); + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); + f2fs_wait_on_page_writeback(page, DATA); + if (dirty) + inode_dec_dirty_pages(dn->inode); + + /* this converted inline_data should be recovered. */ + set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); + + /* clear inline data and flag after data writeback */ + truncate_inline_inode(dn->inode_page, 0); +clear_out: + stat_dec_inline_inode(dn->inode); + f2fs_clear_inline_inode(dn->inode); + sync_inode_page(dn); + f2fs_put_dnode(dn); + return 0; +} + +int f2fs_convert_inline_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage, *page; + int err = 0; + + page = grab_cache_page(inode->i_mapping, 0); + if (!page) + return -ENOMEM; + + f2fs_lock_op(sbi); + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) + err = f2fs_convert_inline_page(&dn, page); + + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + + f2fs_put_page(page, 1); + return err; +} + +int f2fs_write_inline_data(struct inode *inode, struct page *page) +{ + void *src_addr, *dst_addr; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + + if (!f2fs_has_inline_data(inode)) { + f2fs_put_dnode(&dn); + return -EAGAIN; + } + + f2fs_bug_on(F2FS_I_SB(inode), page->index); + + f2fs_wait_on_page_writeback(dn.inode_page, NODE); + src_addr = kmap_atomic(page); + dst_addr = inline_data_addr(dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap_atomic(src_addr); + + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + return 0; +} + +bool recover_inline_data(struct inode *inode, struct page *npage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + struct page *ipage; + + /* + * The inline_data recovery policy is as follows. + * [prev.] [next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove inline_data, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(npage)) + ri = F2FS_INODE(npage); + + if (f2fs_has_inline_data(inode) && + ri && (ri->i_inline & F2FS_INLINE_DATA)) { +process_inline: + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(sbi, IS_ERR(ipage)); + + f2fs_wait_on_page_writeback(ipage, NODE); + + src_addr = inline_data_addr(npage); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + return true; + } + + if (f2fs_has_inline_data(inode)) { + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(sbi, IS_ERR(ipage)); + truncate_inline_inode(ipage, 0); + f2fs_clear_inline_inode(inode); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { + truncate_blocks(inode, 0, false); + goto process_inline; + } + return false; +} + +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct qstr *name, struct page **res_page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dir_entry *de; + struct f2fs_dentry_ptr d; + struct page *ipage; + + ipage = get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return NULL; + + inline_dentry = inline_data_addr(ipage); + + make_dentry_ptr(&d, (void *)inline_dentry, 2); + de = find_target_dentry(name, NULL, &d); + + unlock_page(ipage); + if (de) + *res_page = ipage; + else + f2fs_put_page(ipage, 0); + + /* + * For the most part, it should be a bug when name_len is zero. + * We stop here for figuring out where the bugs has occurred. + */ + f2fs_bug_on(sbi, d.max < 0); + return de; +} + +struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, + struct page **p) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + struct f2fs_dir_entry *de; + struct f2fs_inline_dentry *dentry_blk; + + ipage = get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return NULL; + + dentry_blk = inline_data_addr(ipage); + de = &dentry_blk->dentry[1]; + *p = ipage; + unlock_page(ipage); + return de; +} + +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage) +{ + struct f2fs_inline_dentry *dentry_blk; + struct f2fs_dentry_ptr d; + + dentry_blk = inline_data_addr(ipage); + + make_dentry_ptr(&d, (void *)dentry_blk, 2); + do_make_empty_dir(inode, parent, &d); + + set_page_dirty(ipage); + + /* update i_size to MAX_INLINE_DATA */ + if (i_size_read(inode) < MAX_INLINE_DATA) { + i_size_write(inode, MAX_INLINE_DATA); + set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); + } + return 0; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct page *page; + struct dnode_of_data dn; + struct f2fs_dentry_block *dentry_blk; + int err; + + page = grab_cache_page(dir->i_mapping, 0); + if (!page) + return -ENOMEM; + + set_new_dnode(&dn, dir, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) + goto out; + + f2fs_wait_on_page_writeback(page, DATA); + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + + dentry_blk = kmap_atomic(page); + + /* copy data from inline dentry block to new dentry block */ + memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, + INLINE_DENTRY_BITMAP_SIZE); + memcpy(dentry_blk->dentry, inline_dentry->dentry, + sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); + memcpy(dentry_blk->filename, inline_dentry->filename, + NR_INLINE_DENTRY * F2FS_SLOT_LEN); + + kunmap_atomic(dentry_blk); + SetPageUptodate(page); + set_page_dirty(page); + + /* clear inline dir and flag after data writeback */ + truncate_inline_inode(ipage, 0); + + stat_dec_inline_dir(dir); + clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + + if (i_size_read(dir) < PAGE_CACHE_SIZE) { + i_size_write(dir, PAGE_CACHE_SIZE); + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } + + sync_inode_page(&dn); +out: + f2fs_put_page(page, 1); + return err; +} + +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + unsigned int bit_pos; + f2fs_hash_t name_hash; + size_t namelen = name->len; + struct f2fs_inline_dentry *dentry_blk = NULL; + struct f2fs_dentry_ptr d; + int slots = GET_DENTRY_SLOTS(namelen); + struct page *page = NULL; + int err = 0; + + ipage = get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + dentry_blk = inline_data_addr(ipage); + bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + slots, NR_INLINE_DENTRY); + if (bit_pos >= NR_INLINE_DENTRY) { + err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + if (!err) + err = -EAGAIN; + goto out; + } + + if (inode) { + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, name, ipage); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + } + + f2fs_wait_on_page_writeback(ipage, NODE); + + name_hash = f2fs_dentry_hash(name); + make_dentry_ptr(&d, (void *)dentry_blk, 2); + f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + + set_page_dirty(ipage); + + /* we don't need to mark_inode_dirty now */ + if (inode) { + F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + } + + update_parent_metadata(dir, inode, 0); +fail: + if (inode) + up_write(&F2FS_I(inode)->i_sem); + + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode(dir, ipage); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } +out: + f2fs_put_page(ipage, 1); + return err; +} + +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode) +{ + struct f2fs_inline_dentry *inline_dentry; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + unsigned int bit_pos; + int i; + + lock_page(page); + f2fs_wait_on_page_writeback(page, NODE); + + inline_dentry = inline_data_addr(page); + bit_pos = dentry - inline_dentry->dentry; + for (i = 0; i < slots; i++) + test_and_clear_bit_le(bit_pos + i, + &inline_dentry->dentry_bitmap); + + set_page_dirty(page); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + if (inode) + f2fs_drop_nlink(dir, inode, page); + + f2fs_put_page(page, 1); +} + +bool f2fs_empty_inline_dir(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + unsigned int bit_pos = 2; + struct f2fs_inline_dentry *dentry_blk; + + ipage = get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return false; + + dentry_blk = inline_data_addr(ipage); + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_INLINE_DENTRY, + bit_pos); + + f2fs_put_page(ipage, 1); + + if (bit_pos < NR_INLINE_DENTRY) + return false; + + return true; +} + +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct f2fs_inline_dentry *inline_dentry = NULL; + struct page *ipage = NULL; + struct f2fs_dentry_ptr d; + + if (ctx->pos == NR_INLINE_DENTRY) + return 0; + + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + inline_dentry = inline_data_addr(ipage); + + make_dentry_ptr(&d, (void *)inline_dentry, 2); + + if (!f2fs_fill_dentries(ctx, &d, 0)) + ctx->pos = NR_INLINE_DENTRY; + + f2fs_put_page(ipage, 1); + return 0; +} diff --git a/kernel/fs/f2fs/inode.c b/kernel/fs/f2fs/inode.c new file mode 100644 index 000000000..e622ec954 --- /dev/null +++ b/kernel/fs/f2fs/inode.c @@ -0,0 +1,387 @@ +/* + * fs/f2fs/inode.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" + +#include + +void f2fs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = F2FS_I(inode)->i_flags; + unsigned int new_fl = 0; + + if (flags & FS_SYNC_FL) + new_fl |= S_SYNC; + if (flags & FS_APPEND_FL) + new_fl |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & FS_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + set_mask_bits(&inode->i_flags, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); +} + +static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + if (ri->i_addr[0]) + inode->i_rdev = + old_decode_dev(le32_to_cpu(ri->i_addr[0])); + else + inode->i_rdev = + new_decode_dev(le32_to_cpu(ri->i_addr[1])); + } +} + +static bool __written_first_block(struct f2fs_inode *ri) +{ + block_t addr = le32_to_cpu(ri->i_addr[0]); + + if (addr != NEW_ADDR && addr != NULL_ADDR) + return true; + return false; +} + +static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + ri->i_addr[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[1] = 0; + } else { + ri->i_addr[0] = 0; + ri->i_addr[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[2] = 0; + } + } +} + +static void __recover_inline_status(struct inode *inode, struct page *ipage) +{ + void *inline_data = inline_data_addr(ipage); + __le32 *start = inline_data; + __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + + while (start < end) { + if (*start++) { + f2fs_wait_on_page_writeback(ipage, NODE); + + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_page_dirty(ipage); + return; + } + } + return; +} + +static int do_read_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct page *node_page; + struct f2fs_inode *ri; + + /* Check if ino is within scope */ + if (check_nid_range(sbi, inode->i_ino)) { + f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", + (unsigned long) inode->i_ino); + WARN_ON(1); + return -EINVAL; + } + + node_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); + + ri = F2FS_INODE(node_page); + + inode->i_mode = le16_to_cpu(ri->i_mode); + i_uid_write(inode, le32_to_cpu(ri->i_uid)); + i_gid_write(inode, le32_to_cpu(ri->i_gid)); + set_nlink(inode, le32_to_cpu(ri->i_links)); + inode->i_size = le64_to_cpu(ri->i_size); + inode->i_blocks = le64_to_cpu(ri->i_blocks); + + inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); + inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + inode->i_generation = le32_to_cpu(ri->i_generation); + + fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); + fi->i_flags = le32_to_cpu(ri->i_flags); + fi->flags = 0; + fi->i_advise = ri->i_advise; + fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; + + f2fs_init_extent_cache(inode, &ri->i_ext); + + get_inline_info(fi, ri); + + /* check data exist */ + if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) + __recover_inline_status(inode, node_page); + + /* get rdev by using inline_info */ + __get_inode_rdev(inode, ri); + + if (__written_first_block(ri)) + set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + + f2fs_put_page(node_page, 1); + + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + + return 0; +} + +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + int ret = 0; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) { + trace_f2fs_iget(inode); + return inode; + } + if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) + goto make_now; + + ret = do_read_inode(inode); + if (ret) + goto bad_inode; +make_now: + if (ino == F2FS_NODE_INO(sbi)) { + inode->i_mapping->a_ops = &f2fs_node_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + } else if (ino == F2FS_META_INO(sbi)) { + inode->i_mapping->a_ops = &f2fs_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &f2fs_dir_inode_operations; + inode->i_fop = &f2fs_dir_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &f2fs_symlink_inode_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &f2fs_special_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + ret = -EIO; + goto bad_inode; + } + unlock_new_inode(inode); + trace_f2fs_iget(inode); + return inode; + +bad_inode: + iget_failed(inode); + trace_f2fs_iget_exit(inode, ret); + return ERR_PTR(ret); +} + +void update_inode(struct inode *inode, struct page *node_page) +{ + struct f2fs_inode *ri; + + f2fs_wait_on_page_writeback(node_page, NODE); + + ri = F2FS_INODE(node_page); + + ri->i_mode = cpu_to_le16(inode->i_mode); + ri->i_advise = F2FS_I(inode)->i_advise; + ri->i_uid = cpu_to_le32(i_uid_read(inode)); + ri->i_gid = cpu_to_le32(i_gid_read(inode)); + ri->i_links = cpu_to_le32(inode->i_nlink); + ri->i_size = cpu_to_le64(i_size_read(inode)); + ri->i_blocks = cpu_to_le64(inode->i_blocks); + + read_lock(&F2FS_I(inode)->ext_lock); + set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); + read_unlock(&F2FS_I(inode)->ext_lock); + + set_raw_inline(F2FS_I(inode), ri); + + ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); + ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); + ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); + ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); + ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; + + __set_inode_rdev(inode, ri); + set_cold_node(inode, node_page); + set_page_dirty(node_page); + + clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); +} + +void update_inode_page(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *node_page; +retry: + node_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + if (err == -ENOMEM) { + cond_resched(); + goto retry; + } else if (err != -ENOENT) { + f2fs_stop_checkpoint(sbi); + } + return; + } + update_inode(inode, node_page); + f2fs_put_page(node_page, 1); +} + +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return 0; + + if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + return 0; + + /* + * We need to lock here to prevent from producing dirty node pages + * during the urgent cleaning time when runing out of free sections. + */ + f2fs_lock_op(sbi); + update_inode_page(inode); + f2fs_unlock_op(sbi); + + if (wbc) + f2fs_balance_fs(sbi); + + return 0; +} + +/* + * Called at the last iput() if i_nlink is zero + */ +void f2fs_evict_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode)) + commit_inmem_pages(inode, true); + + trace_f2fs_evict_inode(inode); + truncate_inode_pages_final(&inode->i_data); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + goto out_clear; + + f2fs_bug_on(sbi, get_dirty_pages(inode)); + remove_dirty_dir_inode(inode); + + if (inode->i_nlink || is_bad_inode(inode)) + goto no_delete; + + sb_start_intwrite(inode->i_sb); + set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); + i_size_write(inode, 0); + + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + f2fs_lock_op(sbi); + remove_inode_page(inode); + f2fs_unlock_op(sbi); + + sb_end_intwrite(inode->i_sb); +no_delete: + stat_dec_inline_dir(inode); + stat_dec_inline_inode(inode); + + /* update extent info in inode */ + if (inode->i_nlink) + f2fs_preserve_extent_tree(inode); + f2fs_destroy_extent_tree(inode); + + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (xnid) + invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); + if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) + add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) + add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); +out_clear: + clear_inode(inode); +} + +/* caller should call f2fs_lock_op() */ +void handle_failed_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + clear_nlink(inode); + make_bad_inode(inode); + unlock_new_inode(inode); + + i_size_write(inode, 0); + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + remove_inode_page(inode); + + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + alloc_nid_failed(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + /* iput will drop the inode object */ + iput(inode); +} diff --git a/kernel/fs/f2fs/namei.c b/kernel/fs/f2fs/namei.c new file mode 100644 index 000000000..658e8079a --- /dev/null +++ b/kernel/fs/f2fs/namei.c @@ -0,0 +1,832 @@ +/* + * fs/f2fs/namei.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "xattr.h" +#include "acl.h" +#include + +static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + nid_t ino; + struct inode *inode; + bool nid_free = false; + int err; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + f2fs_lock_op(sbi); + if (!alloc_nid(sbi, &ino)) { + f2fs_unlock_op(sbi); + err = -ENOSPC; + goto fail; + } + f2fs_unlock_op(sbi); + + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_generation = sbi->s_next_generation++; + + err = insert_inode_locked(inode); + if (err) { + err = -EINVAL; + nid_free = true; + goto out; + } + + if (f2fs_may_inline(inode)) + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + if (test_opt(sbi, INLINE_DENTRY) && S_ISDIR(inode->i_mode)) + set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + + trace_f2fs_new_inode(inode, 0); + mark_inode_dirty(inode); + return inode; + +out: + clear_nlink(inode); + unlock_new_inode(inode); +fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); + iput(inode); + if (nid_free) + alloc_nid_failed(sbi, ino); + return ERR_PTR(err); +} + +static int is_multimedia_file(const unsigned char *s, const char *sub) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + + if (sublen > slen) + return 0; + + return !strncasecmp(s + slen - sublen, sub, sublen); +} + +/* + * Set multimedia files as cold files for hot/cold data separation + */ +static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + int i; + __u8 (*extlist)[8] = sbi->raw_super->extension_list; + + int count = le32_to_cpu(sbi->raw_super->extension_count); + for (i = 0; i < count; i++) { + if (is_multimedia_file(name, extlist[i])) { + file_set_cold(inode); + break; + } + } +} + +static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + nid_t ino = 0; + int err; + + f2fs_balance_fs(sbi); + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_cold_files(sbi, inode, dentry->d_name.name); + + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + ino = inode->i_ino; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, ino); + + stat_inc_inline_inode(inode); + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; +out: + handle_failed_inode(inode); + return err; +} + +static int f2fs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; + + f2fs_balance_fs(sbi); + + inode->i_ctime = CURRENT_TIME; + ihold(inode); + + set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + d_instantiate(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; +out: + clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + iput(inode); + f2fs_unlock_op(sbi); + return err; +} + +struct dentry *f2fs_get_parent(struct dentry *child) +{ + struct qstr dotdot = QSTR_INIT("..", 2); + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); +} + +static int __recover_dot_dentries(struct inode *dir, nid_t pino) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); + struct f2fs_dir_entry *de; + struct page *page; + int err = 0; + + f2fs_lock_op(sbi); + + de = f2fs_find_entry(dir, &dot, &page); + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + } else { + err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); + if (err) + goto out; + } + + de = f2fs_find_entry(dir, &dotdot, &page); + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + } else { + err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); + } +out: + if (!err) { + clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); + mark_inode_dirty(dir); + } + + f2fs_unlock_op(sbi); + return err; +} + +static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + struct f2fs_dir_entry *de; + struct page *page; + + if (dentry->d_name.len > F2FS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + de = f2fs_find_entry(dir, &dentry->d_name, &page); + if (de) { + nid_t ino = le32_to_cpu(de->ino); + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + + inode = f2fs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (f2fs_has_inline_dots(inode)) { + int err; + + err = __recover_dot_dentries(inode, dir->i_ino); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + } + } + + return d_splice_alias(inode, dentry); +} + +static int f2fs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode = d_inode(dentry); + struct f2fs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + trace_f2fs_unlink_enter(dir, dentry); + f2fs_balance_fs(sbi); + + de = f2fs_find_entry(dir, &dentry->d_name, &page); + if (!de) + goto fail; + + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); + if (err) { + f2fs_unlock_op(sbi); + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + goto fail; + } + f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + + /* In order to evict this inode, we set it dirty */ + mark_inode_dirty(inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); +fail: + trace_f2fs_unlink_exit(inode, err); + return err; +} + +static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = page_follow_link_light(dentry, nd); + + if (IS_ERR_OR_NULL(page)) + return page; + + /* this is broken symlink case */ + if (*nd_get_link(nd) == 0) { + page_put_link(dentry, nd, page); + return ERR_PTR(-ENOENT); + } + return page; +} + +static int f2fs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + size_t symlen = strlen(symname) + 1; + int err; + + f2fs_balance_fs(sbi); + + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_symlink_inode_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + err = page_symlink(inode, symname, symlen); + alloc_nid_done(sbi, inode->i_ino); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + /* + * Let's flush symlink data in order to avoid broken symlink as much as + * possible. Nevertheless, fsyncing is the best way, but there is no + * way to get a file descriptor in order to flush that. + * + * Note that, it needs to do dir->fsync to make this recoverable. + * If the symlink path is stored into inline_data, there is no + * performance regression. + */ + filemap_write_and_wait_range(inode->i_mapping, 0, symlen - 1); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + return err; +out: + handle_failed_inode(inode); + return err; +} + +static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + f2fs_balance_fs(sbi); + + inode = f2fs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_dir_inode_operations; + inode->i_fop = &f2fs_dir_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + + set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out_fail; + f2fs_unlock_op(sbi); + + stat_inc_inline_dir(inode); + alloc_nid_done(sbi, inode->i_ino); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; + +out_fail: + clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + handle_failed_inode(inode); + return err; +} + +static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + if (f2fs_empty_dir(inode)) + return f2fs_unlink(dir, dentry); + return -ENOTEMPTY; +} + +static int f2fs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + f2fs_balance_fs(sbi); + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_op = &f2fs_special_inode_operations; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, inode->i_ino); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; +out: + handle_failed_inode(inode); + return err; +} + +static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct page *old_dir_page; + struct page *old_page, *new_page; + struct f2fs_dir_entry *old_dir_entry = NULL; + struct f2fs_dir_entry *old_entry; + struct f2fs_dir_entry *new_entry; + int err = -ENOENT; + + f2fs_balance_fs(sbi); + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); + if (!old_dir_entry) + goto out_old; + } + + if (new_inode) { + + err = -ENOTEMPTY; + if (old_dir_entry && !f2fs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, + &new_page); + if (!new_entry) + goto out_dir; + + f2fs_lock_op(sbi); + + err = acquire_orphan_inode(sbi); + if (err) + goto put_out_dir; + + if (update_dent_inode(old_inode, &new_dentry->d_name)) { + release_orphan_inode(sbi); + goto put_out_dir; + } + + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + new_inode->i_ctime = CURRENT_TIME; + down_write(&F2FS_I(new_inode)->i_sem); + if (old_dir_entry) + drop_nlink(new_inode); + drop_nlink(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + + mark_inode_dirty(new_inode); + + if (!new_inode->i_nlink) + add_orphan_inode(sbi, new_inode->i_ino); + else + release_orphan_inode(sbi); + + update_inode_page(old_inode); + update_inode_page(new_inode); + } else { + f2fs_lock_op(sbi); + + err = f2fs_add_link(new_dentry, old_inode); + if (err) { + f2fs_unlock_op(sbi); + goto out_dir; + } + + if (old_dir_entry) { + inc_nlink(new_dir); + update_inode_page(new_dir); + } + } + + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + + old_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_inode); + + f2fs_delete_entry(old_entry, old_page, old_dir, NULL); + + if (old_dir_entry) { + if (old_dir != new_dir) { + f2fs_set_link(old_inode, old_dir_entry, + old_dir_page, new_dir); + update_inode_page(old_inode); + } else { + f2fs_dentry_kunmap(old_inode, old_dir_page); + f2fs_put_page(old_dir_page, 0); + } + drop_nlink(old_dir); + mark_inode_dirty(old_dir); + update_inode_page(old_dir); + } + + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; + +put_out_dir: + f2fs_unlock_op(sbi); + f2fs_dentry_kunmap(new_dir, new_page); + f2fs_put_page(new_page, 0); +out_dir: + if (old_dir_entry) { + f2fs_dentry_kunmap(old_inode, old_dir_page); + f2fs_put_page(old_dir_page, 0); + } +out_old: + f2fs_dentry_kunmap(old_dir, old_page); + f2fs_put_page(old_page, 0); +out: + return err; +} + +static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct page *old_dir_page, *new_dir_page; + struct page *old_page, *new_page; + struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; + struct f2fs_dir_entry *old_entry, *new_entry; + int old_nlink = 0, new_nlink = 0; + int err = -ENOENT; + + f2fs_balance_fs(sbi); + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) + goto out; + + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_entry) + goto out_old; + + /* prepare for updating ".." directory entry info later */ + if (old_dir != new_dir) { + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + old_dir_entry = f2fs_parent_dir(old_inode, + &old_dir_page); + if (!old_dir_entry) + goto out_new; + } + + if (S_ISDIR(new_inode->i_mode)) { + err = -EIO; + new_dir_entry = f2fs_parent_dir(new_inode, + &new_dir_page); + if (!new_dir_entry) + goto out_old_dir; + } + } + + /* + * If cross rename between file and directory those are not + * in the same directory, we will inc nlink of file's parent + * later, so we should check upper boundary of its nlink. + */ + if ((!old_dir_entry || !new_dir_entry) && + old_dir_entry != new_dir_entry) { + old_nlink = old_dir_entry ? -1 : 1; + new_nlink = -old_nlink; + err = -EMLINK; + if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + goto out_new_dir; + } + + f2fs_lock_op(sbi); + + err = update_dent_inode(old_inode, &new_dentry->d_name); + if (err) + goto out_unlock; + + err = update_dent_inode(new_inode, &old_dentry->d_name); + if (err) + goto out_undo; + + /* update ".." directory entry info of old dentry */ + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + + /* update ".." directory entry info of new dentry */ + if (new_dir_entry) + f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir); + + /* update directory entry info of old dir inode */ + f2fs_set_link(old_dir, old_entry, old_page, new_inode); + + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + + update_inode_page(old_inode); + + old_dir->i_ctime = CURRENT_TIME; + if (old_nlink) { + down_write(&F2FS_I(old_dir)->i_sem); + if (old_nlink < 0) + drop_nlink(old_dir); + else + inc_nlink(old_dir); + up_write(&F2FS_I(old_dir)->i_sem); + } + mark_inode_dirty(old_dir); + update_inode_page(old_dir); + + /* update directory entry info of new dir inode */ + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + down_write(&F2FS_I(new_inode)->i_sem); + file_lost_pino(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + + update_inode_page(new_inode); + + new_dir->i_ctime = CURRENT_TIME; + if (new_nlink) { + down_write(&F2FS_I(new_dir)->i_sem); + if (new_nlink < 0) + drop_nlink(new_dir); + else + inc_nlink(new_dir); + up_write(&F2FS_I(new_dir)->i_sem); + } + mark_inode_dirty(new_dir); + update_inode_page(new_dir); + + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; +out_undo: + /* Still we may fail to recover name info of f2fs_inode here */ + update_dent_inode(old_inode, &old_dentry->d_name); +out_unlock: + f2fs_unlock_op(sbi); +out_new_dir: + if (new_dir_entry) { + f2fs_dentry_kunmap(new_inode, new_dir_page); + f2fs_put_page(new_dir_page, 0); + } +out_old_dir: + if (old_dir_entry) { + f2fs_dentry_kunmap(old_inode, old_dir_page); + f2fs_put_page(old_dir_page, 0); + } +out_new: + f2fs_dentry_kunmap(new_dir, new_page); + f2fs_put_page(new_page, 0); +out_old: + f2fs_dentry_kunmap(old_dir, old_page); + f2fs_put_page(old_page, 0); +out: + return err; +} + +static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * VFS has already handled the new dentry existence case, + * here, we just deal with "RENAME_NOREPLACE" as regular rename. + */ + return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + +static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + add_orphan_inode(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, inode->i_ino); + + stat_inc_inline_inode(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; + +release_out: + release_orphan_inode(sbi); +out: + handle_failed_inode(inode); + return err; +} + +const struct inode_operations f2fs_dir_inode_operations = { + .create = f2fs_create, + .lookup = f2fs_lookup, + .link = f2fs_link, + .unlink = f2fs_unlink, + .symlink = f2fs_symlink, + .mkdir = f2fs_mkdir, + .rmdir = f2fs_rmdir, + .mknod = f2fs_mknod, + .rename2 = f2fs_rename2, + .tmpfile = f2fs_tmpfile, + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, +#ifdef CONFIG_F2FS_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = f2fs_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations f2fs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = f2fs_follow_link, + .put_link = page_put_link, + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = f2fs_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations f2fs_special_inode_operations = { + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, +#ifdef CONFIG_F2FS_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = f2fs_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/kernel/fs/f2fs/node.c b/kernel/fs/f2fs/node.c new file mode 100644 index 000000000..8ab0cf193 --- /dev/null +++ b/kernel/fs/f2fs/node.c @@ -0,0 +1,2084 @@ +/* + * fs/f2fs/node.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "trace.h" +#include + +#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) + +static struct kmem_cache *nat_entry_slab; +static struct kmem_cache *free_nid_slab; +static struct kmem_cache *nat_entry_set_slab; + +bool available_free_memory(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct sysinfo val; + unsigned long avail_ram; + unsigned long mem_size = 0; + bool res = false; + + si_meminfo(&val); + + /* only uses low memory */ + avail_ram = val.totalram - val.totalhigh; + + /* + * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + */ + if (type == FREE_NIDS) { + mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> + PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == NAT_ENTRIES) { + mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> + PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INO_ENTRIES) { + int i; + + for (i = 0; i <= UPDATE_INO; i++) + mem_size += (sbi->im[i].ino_num * + sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == EXTENT_CACHE) { + mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + atomic_read(&sbi->total_ext_node) * + sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; + } + return res; +} + +static void clear_node_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + unsigned int long flags; + + if (PageDirty(page)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + clear_page_dirty_for_io(page); + dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); + } + ClearPageUptodate(page); +} + +static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + pgoff_t index = current_nat_addr(sbi, nid); + return get_meta_page(sbi, index); +} + +static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct page *src_page; + struct page *dst_page; + pgoff_t src_off; + pgoff_t dst_off; + void *src_addr; + void *dst_addr; + struct f2fs_nm_info *nm_i = NM_I(sbi); + + src_off = current_nat_addr(sbi, nid); + dst_off = next_nat_addr(sbi, src_off); + + /* get current nat block page with lock */ + src_page = get_meta_page(sbi, src_off); + dst_page = grab_meta_page(sbi, dst_off); + f2fs_bug_on(sbi, PageDirty(src_page)); + + src_addr = page_address(src_page); + dst_addr = page_address(dst_page); + memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + set_page_dirty(dst_page); + f2fs_put_page(src_page, 1); + + set_to_next_nat(nm_i, nid); + + return dst_page; +} + +static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) +{ + return radix_tree_lookup(&nm_i->nat_root, n); +} + +static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr); +} + +static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) +{ + list_del(&e->list); + radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); + nm_i->nat_cnt--; + kmem_cache_free(nat_entry_slab, e); +} + +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); + struct nat_entry_set *head; + + if (get_nat_flag(ne, IS_DIRTY)) + return; + + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (!head) { + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + + INIT_LIST_HEAD(&head->entry_list); + INIT_LIST_HEAD(&head->set_list); + head->set = set; + head->entry_cnt = 0; + f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); + } + list_move_tail(&ne->list, &head->entry_list); + nm_i->dirty_nat_cnt++; + head->entry_cnt++; + set_nat_flag(ne, IS_DIRTY, true); +} + +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); + struct nat_entry_set *head; + + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (head) { + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + head->entry_cnt--; + nm_i->dirty_nat_cnt--; + } +} + +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry_set **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, + start, nr); +} + +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool is_cp = true; + + down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e && !get_nat_flag(e, IS_CHECKPOINTED)) + is_cp = false; + up_read(&nm_i->nat_tree_lock); + return is_cp; +} + +bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool fsynced = false; + + down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ino); + if (e && get_nat_flag(e, HAS_FSYNCED_INODE)) + fsynced = true; + up_read(&nm_i->nat_tree_lock); + return fsynced; +} + +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool need_update = true; + + down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ino); + if (e && get_nat_flag(e, HAS_LAST_FSYNC) && + (get_nat_flag(e, IS_CHECKPOINTED) || + get_nat_flag(e, HAS_FSYNCED_INODE))) + need_update = false; + up_read(&nm_i->nat_tree_lock); + return need_update; +} + +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +{ + struct nat_entry *new; + + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + memset(new, 0, sizeof(struct nat_entry)); + nat_set_nid(new, nid); + nat_reset_flag(new); + list_add_tail(&new->list, &nm_i->nat_entries); + nm_i->nat_cnt++; + return new; +} + +static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, + struct f2fs_nat_entry *ne) +{ + struct nat_entry *e; + + down_write(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (!e) { + e = grab_nat_entry(nm_i, nid); + node_info_from_raw_nat(&e->ni, ne); + } + up_write(&nm_i->nat_tree_lock); +} + +static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, + block_t new_blkaddr, bool fsync_done) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + + down_write(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ni->nid); + if (!e) { + e = grab_nat_entry(nm_i, ni->nid); + copy_node_info(&e->ni, ni); + f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); + } else if (new_blkaddr == NEW_ADDR) { + /* + * when nid is reallocated, + * previous nat entry can be remained in nat cache. + * So, reinitialize it with new information. + */ + copy_node_info(&e->ni, ni); + f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); + } + + /* sanity check */ + f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR && + new_blkaddr == NULL_ADDR); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && + new_blkaddr == NEW_ADDR); + f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR && + nat_get_blkaddr(e) != NULL_ADDR && + new_blkaddr == NEW_ADDR); + + /* increment version no as node is removed */ + if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { + unsigned char version = nat_get_version(e); + nat_set_version(e, inc_node_version(version)); + } + + /* change address */ + nat_set_blkaddr(e, new_blkaddr); + if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR) + set_nat_flag(e, IS_CHECKPOINTED, false); + __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) { + if (fsync_done && ni->nid == ni->ino) + set_nat_flag(e, HAS_FSYNCED_INODE, true); + set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); + } + up_write(&nm_i->nat_tree_lock); +} + +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + if (available_free_memory(sbi, NAT_ENTRIES)) + return 0; + + down_write(&nm_i->nat_tree_lock); + while (nr_shrink && !list_empty(&nm_i->nat_entries)) { + struct nat_entry *ne; + ne = list_first_entry(&nm_i->nat_entries, + struct nat_entry, list); + __del_from_nat_cache(nm_i, ne); + nr_shrink--; + } + up_write(&nm_i->nat_tree_lock); + return nr_shrink; +} + +/* + * This function always returns success + */ +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + nid_t start_nid = START_NID(nid); + struct f2fs_nat_block *nat_blk; + struct page *page = NULL; + struct f2fs_nat_entry ne; + struct nat_entry *e; + int i; + + ni->nid = nid; + + /* Check nat cache */ + down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) { + ni->ino = nat_get_ino(e); + ni->blk_addr = nat_get_blkaddr(e); + ni->version = nat_get_version(e); + } + up_read(&nm_i->nat_tree_lock); + if (e) + return; + + memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + + /* Check current segment summary */ + mutex_lock(&curseg->curseg_mutex); + i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); + if (i >= 0) { + ne = nat_in_journal(sum, i); + node_info_from_raw_nat(ni, &ne); + } + mutex_unlock(&curseg->curseg_mutex); + if (i >= 0) + goto cache; + + /* Fill node_info from nat page */ + page = get_current_nat_page(sbi, start_nid); + nat_blk = (struct f2fs_nat_block *)page_address(page); + ne = nat_blk->entries[nid - start_nid]; + node_info_from_raw_nat(ni, &ne); + f2fs_put_page(page, 1); +cache: + /* cache nat entry */ + cache_nat_entry(NM_I(sbi), nid, &ne); +} + +/* + * The maximum depth is four. + * Offset[0] will have raw inode offset. + */ +static int get_node_path(struct f2fs_inode_info *fi, long block, + int offset[4], unsigned int noffset[4]) +{ + const long direct_index = ADDRS_PER_INODE(fi); + const long direct_blks = ADDRS_PER_BLOCK; + const long dptrs_per_blk = NIDS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK; + int n = 0; + int level = 0; + + noffset[0] = 0; + + if (block < direct_index) { + offset[n] = block; + goto got; + } + block -= direct_index; + if (block < direct_blks) { + offset[n++] = NODE_DIR1_BLOCK; + noffset[n] = 1; + offset[n] = block; + level = 1; + goto got; + } + block -= direct_blks; + if (block < direct_blks) { + offset[n++] = NODE_DIR2_BLOCK; + noffset[n] = 2; + offset[n] = block; + level = 1; + goto got; + } + block -= direct_blks; + if (block < indirect_blks) { + offset[n++] = NODE_IND1_BLOCK; + noffset[n] = 3; + offset[n++] = block / direct_blks; + noffset[n] = 4 + offset[n - 1]; + offset[n] = block % direct_blks; + level = 2; + goto got; + } + block -= indirect_blks; + if (block < indirect_blks) { + offset[n++] = NODE_IND2_BLOCK; + noffset[n] = 4 + dptrs_per_blk; + offset[n++] = block / direct_blks; + noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; + offset[n] = block % direct_blks; + level = 2; + goto got; + } + block -= indirect_blks; + if (block < dindirect_blks) { + offset[n++] = NODE_DIND_BLOCK; + noffset[n] = 5 + (dptrs_per_blk * 2); + offset[n++] = block / indirect_blks; + noffset[n] = 6 + (dptrs_per_blk * 2) + + offset[n - 1] * (dptrs_per_blk + 1); + offset[n++] = (block / direct_blks) % dptrs_per_blk; + noffset[n] = 7 + (dptrs_per_blk * 2) + + offset[n - 2] * (dptrs_per_blk + 1) + + offset[n - 1]; + offset[n] = block % direct_blks; + level = 3; + goto got; + } else { + BUG(); + } +got: + return level; +} + +/* + * Caller should call f2fs_put_dnode(dn). + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if ro is not set RDONLY_NODE. + * In the case of RDONLY_NODE, we don't need to care about mutex. + */ +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct page *npage[4]; + struct page *parent = NULL; + int offset[4]; + unsigned int noffset[4]; + nid_t nids[4]; + int level, i; + int err = 0; + + level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); + + nids[0] = dn->inode->i_ino; + npage[0] = dn->inode_page; + + if (!npage[0]) { + npage[0] = get_node_page(sbi, nids[0]); + if (IS_ERR(npage[0])) + return PTR_ERR(npage[0]); + } + + /* if inline_data is set, should not report any block indices */ + if (f2fs_has_inline_data(dn->inode) && index) { + err = -ENOENT; + f2fs_put_page(npage[0], 1); + goto release_out; + } + + parent = npage[0]; + if (level != 0) + nids[1] = get_nid(parent, offset[0], true); + dn->inode_page = npage[0]; + dn->inode_page_locked = true; + + /* get indirect or direct nodes */ + for (i = 1; i <= level; i++) { + bool done = false; + + if (!nids[i] && mode == ALLOC_NODE) { + /* alloc new node */ + if (!alloc_nid(sbi, &(nids[i]))) { + err = -ENOSPC; + goto release_pages; + } + + dn->nid = nids[i]; + npage[i] = new_node_page(dn, noffset[i], NULL); + if (IS_ERR(npage[i])) { + alloc_nid_failed(sbi, nids[i]); + err = PTR_ERR(npage[i]); + goto release_pages; + } + + set_nid(parent, offset[i - 1], nids[i], i == 1); + alloc_nid_done(sbi, nids[i]); + done = true; + } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { + npage[i] = get_node_page_ra(parent, offset[i - 1]); + if (IS_ERR(npage[i])) { + err = PTR_ERR(npage[i]); + goto release_pages; + } + done = true; + } + if (i == 1) { + dn->inode_page_locked = false; + unlock_page(parent); + } else { + f2fs_put_page(parent, 1); + } + + if (!done) { + npage[i] = get_node_page(sbi, nids[i]); + if (IS_ERR(npage[i])) { + err = PTR_ERR(npage[i]); + f2fs_put_page(npage[0], 0); + goto release_out; + } + } + if (i < level) { + parent = npage[i]; + nids[i + 1] = get_nid(parent, offset[i], false); + } + } + dn->nid = nids[level]; + dn->ofs_in_node = offset[level]; + dn->node_page = npage[level]; + dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + return 0; + +release_pages: + f2fs_put_page(parent, 1); + if (i > 1) + f2fs_put_page(npage[0], 0); +release_out: + dn->inode_page = NULL; + dn->node_page = NULL; + return err; +} + +static void truncate_node(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct node_info ni; + + get_node_info(sbi, dn->nid, &ni); + if (dn->inode->i_blocks == 0) { + f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); + goto invalidate; + } + f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); + + /* Deallocate node address */ + invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, dn->inode); + set_node_addr(sbi, &ni, NULL_ADDR, false); + + if (dn->nid == dn->inode->i_ino) { + remove_orphan_inode(sbi, dn->nid); + dec_valid_inode_count(sbi); + } else { + sync_inode_page(dn); + } +invalidate: + clear_node_page_dirty(dn->node_page); + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_put_page(dn->node_page, 1); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + dn->node_page->index, dn->node_page->index); + + dn->node_page = NULL; + trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); +} + +static int truncate_dnode(struct dnode_of_data *dn) +{ + struct page *page; + + if (dn->nid == 0) + return 1; + + /* get direct node */ + page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); + if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) + return 1; + else if (IS_ERR(page)) + return PTR_ERR(page); + + /* Make dnode_of_data for parameter */ + dn->node_page = page; + dn->ofs_in_node = 0; + truncate_data_blocks(dn); + truncate_node(dn); + return 1; +} + +static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, + int ofs, int depth) +{ + struct dnode_of_data rdn = *dn; + struct page *page; + struct f2fs_node *rn; + nid_t child_nid; + unsigned int child_nofs; + int freed = 0; + int i, ret; + + if (dn->nid == 0) + return NIDS_PER_BLOCK + 1; + + trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); + + page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); + if (IS_ERR(page)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); + return PTR_ERR(page); + } + + rn = F2FS_NODE(page); + if (depth < 3) { + for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { + child_nid = le32_to_cpu(rn->in.nid[i]); + if (child_nid == 0) + continue; + rdn.nid = child_nid; + ret = truncate_dnode(&rdn); + if (ret < 0) + goto out_err; + set_nid(page, i, 0, false); + } + } else { + child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; + for (i = ofs; i < NIDS_PER_BLOCK; i++) { + child_nid = le32_to_cpu(rn->in.nid[i]); + if (child_nid == 0) { + child_nofs += NIDS_PER_BLOCK + 1; + continue; + } + rdn.nid = child_nid; + ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); + if (ret == (NIDS_PER_BLOCK + 1)) { + set_nid(page, i, 0, false); + child_nofs += ret; + } else if (ret < 0 && ret != -ENOENT) { + goto out_err; + } + } + freed = child_nofs; + } + + if (!ofs) { + /* remove current indirect node */ + dn->node_page = page; + truncate_node(dn); + freed++; + } else { + f2fs_put_page(page, 1); + } + trace_f2fs_truncate_nodes_exit(dn->inode, freed); + return freed; + +out_err: + f2fs_put_page(page, 1); + trace_f2fs_truncate_nodes_exit(dn->inode, ret); + return ret; +} + +static int truncate_partial_nodes(struct dnode_of_data *dn, + struct f2fs_inode *ri, int *offset, int depth) +{ + struct page *pages[2]; + nid_t nid[3]; + nid_t child_nid; + int err = 0; + int i; + int idx = depth - 2; + + nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + if (!nid[0]) + return 0; + + /* get indirect nodes in the path */ + for (i = 0; i < idx + 1; i++) { + /* reference count'll be increased */ + pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]); + if (IS_ERR(pages[i])) { + err = PTR_ERR(pages[i]); + idx = i - 1; + goto fail; + } + nid[i + 1] = get_nid(pages[i], offset[i + 1], false); + } + + /* free direct nodes linked to a partial indirect node */ + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { + child_nid = get_nid(pages[idx], i, false); + if (!child_nid) + continue; + dn->nid = child_nid; + err = truncate_dnode(dn); + if (err < 0) + goto fail; + set_nid(pages[idx], i, 0, false); + } + + if (offset[idx + 1] == 0) { + dn->node_page = pages[idx]; + dn->nid = nid[idx]; + truncate_node(dn); + } else { + f2fs_put_page(pages[idx], 1); + } + offset[idx]++; + offset[idx + 1] = 0; + idx--; +fail: + for (i = idx; i >= 0; i--) + f2fs_put_page(pages[i], 1); + + trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); + + return err; +} + +/* + * All the block addresses of data and nodes should be nullified. + */ +int truncate_inode_blocks(struct inode *inode, pgoff_t from) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err = 0, cont = 1; + int level, offset[4], noffset[4]; + unsigned int nofs = 0; + struct f2fs_inode *ri; + struct dnode_of_data dn; + struct page *page; + + trace_f2fs_truncate_inode_blocks_enter(inode, from); + + level = get_node_path(F2FS_I(inode), from, offset, noffset); +restart: + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); + return PTR_ERR(page); + } + + set_new_dnode(&dn, inode, page, NULL, 0); + unlock_page(page); + + ri = F2FS_INODE(page); + switch (level) { + case 0: + case 1: + nofs = noffset[1]; + break; + case 2: + nofs = noffset[1]; + if (!offset[level - 1]) + goto skip_partial; + err = truncate_partial_nodes(&dn, ri, offset, level); + if (err < 0 && err != -ENOENT) + goto fail; + nofs += 1 + NIDS_PER_BLOCK; + break; + case 3: + nofs = 5 + 2 * NIDS_PER_BLOCK; + if (!offset[level - 1]) + goto skip_partial; + err = truncate_partial_nodes(&dn, ri, offset, level); + if (err < 0 && err != -ENOENT) + goto fail; + break; + default: + BUG(); + } + +skip_partial: + while (cont) { + dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + switch (offset[0]) { + case NODE_DIR1_BLOCK: + case NODE_DIR2_BLOCK: + err = truncate_dnode(&dn); + break; + + case NODE_IND1_BLOCK: + case NODE_IND2_BLOCK: + err = truncate_nodes(&dn, nofs, offset[1], 2); + break; + + case NODE_DIND_BLOCK: + err = truncate_nodes(&dn, nofs, offset[1], 3); + cont = 0; + break; + + default: + BUG(); + } + if (err < 0 && err != -ENOENT) + goto fail; + if (offset[1] == 0 && + ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { + lock_page(page); + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + f2fs_put_page(page, 1); + goto restart; + } + f2fs_wait_on_page_writeback(page, NODE); + ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + set_page_dirty(page); + unlock_page(page); + } + offset[1] = 0; + offset[0]++; + nofs += err; + } +fail: + f2fs_put_page(page, 0); + trace_f2fs_truncate_inode_blocks_exit(inode, err); + return err > 0 ? 0 : err; +} + +int truncate_xattr_node(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t nid = F2FS_I(inode)->i_xattr_nid; + struct dnode_of_data dn; + struct page *npage; + + if (!nid) + return 0; + + npage = get_node_page(sbi, nid); + if (IS_ERR(npage)) + return PTR_ERR(npage); + + F2FS_I(inode)->i_xattr_nid = 0; + + /* need to do checkpoint during fsync */ + F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); + + set_new_dnode(&dn, inode, page, npage, nid); + + if (page) + dn.inode_page_locked = true; + truncate_node(&dn); + return 0; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +void remove_inode_page(struct inode *inode) +{ + struct dnode_of_data dn; + + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + if (get_dnode_of_data(&dn, 0, LOOKUP_NODE)) + return; + + if (truncate_xattr_node(inode, dn.inode_page)) { + f2fs_put_dnode(&dn); + return; + } + + /* remove potential inline_data blocks */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + truncate_data_blocks_range(&dn, 1); + + /* 0 is possible, after f2fs_new_inode() has failed */ + f2fs_bug_on(F2FS_I_SB(inode), + inode->i_blocks != 0 && inode->i_blocks != 1); + + /* will put inode & node pages */ + truncate_node(&dn); +} + +struct page *new_inode_page(struct inode *inode) +{ + struct dnode_of_data dn; + + /* allocate inode page for new inode */ + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + + /* caller should f2fs_put_page(page, 1); */ + return new_node_page(&dn, 0, NULL); +} + +struct page *new_node_page(struct dnode_of_data *dn, + unsigned int ofs, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct node_info old_ni, new_ni; + struct page *page; + int err; + + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return ERR_PTR(-EPERM); + + page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + if (!page) + return ERR_PTR(-ENOMEM); + + if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { + err = -ENOSPC; + goto fail; + } + + get_node_info(sbi, dn->nid, &old_ni); + + /* Reinitialize old_ni with new node page */ + f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); + new_ni = old_ni; + new_ni.ino = dn->inode->i_ino; + set_node_addr(sbi, &new_ni, NEW_ADDR, false); + + f2fs_wait_on_page_writeback(page, NODE); + fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); + set_cold_node(dn->inode, page); + SetPageUptodate(page); + set_page_dirty(page); + + if (f2fs_has_xattr_block(ofs)) + F2FS_I(dn->inode)->i_xattr_nid = dn->nid; + + dn->node_page = page; + if (ipage) + update_inode(dn->inode, ipage); + else + sync_inode_page(dn); + if (ofs == 0) + inc_valid_inode_count(sbi); + + return page; + +fail: + clear_node_page_dirty(page); + f2fs_put_page(page, 1); + return ERR_PTR(err); +} + +/* + * Caller should do after getting the following values. + * 0: f2fs_put_page(page, 0) + * LOCKED_PAGE: f2fs_put_page(page, 1) + * error: nothing + */ +static int read_node_page(struct page *page, int rw) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct node_info ni; + struct f2fs_io_info fio = { + .type = NODE, + .rw = rw, + }; + + get_node_info(sbi, page->index, &ni); + + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + f2fs_put_page(page, 1); + return -ENOENT; + } + + if (PageUptodate(page)) + return LOCKED_PAGE; + + fio.blk_addr = ni.blk_addr; + return f2fs_submit_page_bio(sbi, page, &fio); +} + +/* + * Readahead a node page + */ +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct page *apage; + int err; + + apage = find_get_page(NODE_MAPPING(sbi), nid); + if (apage && PageUptodate(apage)) { + f2fs_put_page(apage, 0); + return; + } + f2fs_put_page(apage, 0); + + apage = grab_cache_page(NODE_MAPPING(sbi), nid); + if (!apage) + return; + + err = read_node_page(apage, READA); + if (err == 0) + f2fs_put_page(apage, 0); + else if (err == LOCKED_PAGE) + f2fs_put_page(apage, 1); +} + +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +{ + struct page *page; + int err; +repeat: + page = grab_cache_page(NODE_MAPPING(sbi), nid); + if (!page) + return ERR_PTR(-ENOMEM); + + err = read_node_page(page, READ_SYNC); + if (err < 0) + return ERR_PTR(err); + else if (err != LOCKED_PAGE) + lock_page(page); + + if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { + ClearPageUptodate(page); + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + f2fs_put_page(page, 1); + goto repeat; + } + return page; +} + +/* + * Return a locked page for the desired node page. + * And, readahead MAX_RA_NODE number of node pages. + */ +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + struct page *page; + int err, i, end; + nid_t nid; + + /* First, try getting the desired direct node. */ + nid = get_nid(parent, start, false); + if (!nid) + return ERR_PTR(-ENOENT); +repeat: + page = grab_cache_page(NODE_MAPPING(sbi), nid); + if (!page) + return ERR_PTR(-ENOMEM); + + err = read_node_page(page, READ_SYNC); + if (err < 0) + return ERR_PTR(err); + else if (err == LOCKED_PAGE) + goto page_hit; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + MAX_RA_NODE; + end = min(end, NIDS_PER_BLOCK); + for (i = start + 1; i < end; i++) { + nid = get_nid(parent, i, false); + if (!nid) + continue; + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); + + lock_page(page); + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + f2fs_put_page(page, 1); + goto repeat; + } +page_hit: + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + return page; +} + +void sync_inode_page(struct dnode_of_data *dn) +{ + if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { + update_inode(dn->inode, dn->node_page); + } else if (dn->inode_page) { + if (!dn->inode_page_locked) + lock_page(dn->inode_page); + update_inode(dn->inode, dn->inode_page); + if (!dn->inode_page_locked) + unlock_page(dn->inode_page); + } else { + update_inode_page(dn->inode); + } +} + +int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, + struct writeback_control *wbc) +{ + pgoff_t index, end; + struct pagevec pvec; + int step = ino ? 2 : 0; + int nwritten = 0, wrote = 0; + + pagevec_init(&pvec, 0); + +next_step: + index = 0; + end = LONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * flushing sequence with step: + * 0. indirect nodes + * 1. dentry dnodes + * 2. file dnodes + */ + if (step == 0 && IS_DNODE(page)) + continue; + if (step == 1 && (!IS_DNODE(page) || + is_cold_node(page))) + continue; + if (step == 2 && (!IS_DNODE(page) || + !is_cold_node(page))) + continue; + + /* + * If an fsync mode, + * we should not skip writing node pages. + */ + if (ino && ino_of_node(page) == ino) + lock_page(page); + else if (!trylock_page(page)) + continue; + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino && ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + /* called by fsync() */ + if (ino && IS_DNODE(page)) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) { + if (!is_checkpointed_node(sbi, ino) && + !has_fsynced_inode(sbi, ino)) + set_dentry_mark(page, 1); + else + set_dentry_mark(page, 0); + } + nwritten++; + } else { + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + } + + if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + unlock_page(page); + else + wrote++; + + if (--wbc->nr_to_write == 0) + break; + } + pagevec_release(&pvec); + cond_resched(); + + if (wbc->nr_to_write == 0) { + step = 2; + break; + } + } + + if (step < 2) { + step++; + goto next_step; + } + + if (wrote) + f2fs_submit_merged_bio(sbi, NODE, WRITE); + return nwritten; +} + +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +{ + pgoff_t index = 0, end = LONG_MAX; + struct pagevec pvec; + int ret2 = 0, ret = 0; + + pagevec_init(&pvec, 0); + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (unlikely(page->index > end)) + continue; + + if (ino && ino_of_node(page) == ino) { + f2fs_wait_on_page_writeback(page, NODE); + if (TestClearPageError(page)) + ret = -EIO; + } + } + pagevec_release(&pvec); + cond_resched(); + } + + if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) + ret2 = -ENOSPC; + if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) + ret2 = -EIO; + if (!ret) + ret = ret2; + return ret; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .type = NODE, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + f2fs_wait_on_page_writeback(page, NODE); + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; + } + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + set_page_writeback(page); + fio.blk_addr = ni.blk_addr; + write_node_page(sbi, page, nid, &fio); + set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + + if (wbc->for_reclaim) + f2fs_submit_merged_bio(sbi, NODE, WRITE); + + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + long diff; + + trace_f2fs_writepages(mapping->host, wbc, NODE); + + /* balancing f2fs's metadata in background */ + f2fs_balance_fs_bg(sbi); + + /* collect a number of dirty node pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + goto skip_write; + + diff = nr_pages_to_write(sbi, NODE, wbc); + wbc->sync_mode = WB_SYNC_NONE; + sync_node_pages(sbi, 0, wbc); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + return 0; +} + +static int f2fs_set_node_page_dirty(struct page *page) +{ + trace_f2fs_set_page_dirty(page, NODE); + + SetPageUptodate(page); + if (!PageDirty(page)) { + __set_page_dirty_nobuffers(page); + inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); + SetPagePrivate(page); + f2fs_trace_pid(page); + return 1; + } + return 0; +} + +/* + * Structure of the f2fs node operations + */ +const struct address_space_operations f2fs_node_aops = { + .writepage = f2fs_write_node_page, + .writepages = f2fs_write_node_pages, + .set_page_dirty = f2fs_set_node_page_dirty, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, +}; + +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) +{ + return radix_tree_lookup(&nm_i->free_nid_root, n); +} + +static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, + struct free_nid *i) +{ + list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + struct nat_entry *ne; + bool allocated = false; + + if (!available_free_memory(sbi, FREE_NIDS)) + return -1; + + /* 0 nid should not be used */ + if (unlikely(nid == 0)) + return 0; + + if (build) { + /* do not add allocated nids */ + down_read(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); + if (ne && + (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + allocated = true; + up_read(&nm_i->nat_tree_lock); + if (allocated) + return 0; + } + + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); + i->nid = nid; + i->state = NID_NEW; + + if (radix_tree_preload(GFP_NOFS)) { + kmem_cache_free(free_nid_slab, i); + return 0; + } + + spin_lock(&nm_i->free_nid_list_lock); + if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { + spin_unlock(&nm_i->free_nid_list_lock); + radix_tree_preload_end(); + kmem_cache_free(free_nid_slab, i); + return 0; + } + list_add_tail(&i->list, &nm_i->free_nid_list); + nm_i->fcnt++; + spin_unlock(&nm_i->free_nid_list_lock); + radix_tree_preload_end(); + return 1; +} + +static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +{ + struct free_nid *i; + bool need_free = false; + + spin_lock(&nm_i->free_nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + if (i && i->state == NID_NEW) { + __del_from_free_nid_list(nm_i, i); + nm_i->fcnt--; + need_free = true; + } + spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); +} + +static void scan_nat_page(struct f2fs_sb_info *sbi, + struct page *nat_page, nid_t start_nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct f2fs_nat_block *nat_blk = page_address(nat_page); + block_t blk_addr; + int i; + + i = start_nid % NAT_ENTRY_PER_BLOCK; + + for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + + if (unlikely(start_nid >= nm_i->max_nid)) + break; + + blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); + f2fs_bug_on(sbi, blk_addr == NEW_ADDR); + if (blk_addr == NULL_ADDR) { + if (add_free_nid(sbi, start_nid, true) < 0) + break; + } + } +} + +static void build_free_nids(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + int i = 0; + nid_t nid = nm_i->next_scan_nid; + + /* Enough entries */ + if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + return; + + /* readahead nat pages to be scanned */ + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); + + while (1) { + struct page *page = get_current_nat_page(sbi, nid); + + scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + + nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + + if (i++ == FREE_NID_PAGES) + break; + } + + /* go to the next free nat pages to find free nids abundantly */ + nm_i->next_scan_nid = nid; + + /* find free nids from current sum_pages */ + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < nats_in_cursum(sum); i++) { + block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); + nid = le32_to_cpu(nid_in_journal(sum, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(nm_i, nid); + } + mutex_unlock(&curseg->curseg_mutex); +} + +/* + * If this function returns success, caller can obtain a new nid + * from second parameter of this function. + * The returned nid could be used ino as well as nid when inode is created. + */ +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i = NULL; +retry: + if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) + return false; + + spin_lock(&nm_i->free_nid_list_lock); + + /* We should not use stale free nids created by build_free_nids */ + if (nm_i->fcnt && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + list_for_each_entry(i, &nm_i->free_nid_list, list) + if (i->state == NID_NEW) + break; + + f2fs_bug_on(sbi, i->state != NID_NEW); + *nid = i->nid; + i->state = NID_ALLOC; + nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + return true; + } + spin_unlock(&nm_i->free_nid_list_lock); + + /* Let's scan nat pages and its caches to get free nids */ + mutex_lock(&nm_i->build_lock); + build_free_nids(sbi); + mutex_unlock(&nm_i->build_lock); + goto retry; +} + +/* + * alloc_nid() should be called prior to this function. + */ +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + + spin_lock(&nm_i->free_nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + __del_from_free_nid_list(nm_i, i); + spin_unlock(&nm_i->free_nid_list_lock); + + kmem_cache_free(free_nid_slab, i); +} + +/* + * alloc_nid() should be called prior to this function. + */ +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + bool need_free = false; + + if (!nid) + return; + + spin_lock(&nm_i->free_nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + if (!available_free_memory(sbi, FREE_NIDS)) { + __del_from_free_nid_list(nm_i, i); + need_free = true; + } else { + i->state = NID_NEW; + nm_i->fcnt++; + } + spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); +} + +void recover_inline_xattr(struct inode *inode, struct page *page) +{ + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); + + ri = F2FS_INODE(page); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) { + clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + goto update_inode; + } + + dst_addr = inline_xattr_addr(ipage); + src_addr = inline_xattr_addr(page); + inline_size = inline_xattr_size(inode); + + f2fs_wait_on_page_writeback(ipage, NODE); + memcpy(dst_addr, src_addr, inline_size); +update_inode: + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); +} + +void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid = nid_of_node(page); + struct node_info ni; + + /* 1: invalidate the previous xattr nid */ + if (!prev_xnid) + goto recover_xnid; + + /* Deallocate node address */ + get_node_info(sbi, prev_xnid, &ni); + f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); + invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode); + set_node_addr(sbi, &ni, NULL_ADDR, false); + +recover_xnid: + /* 2: allocate new xattr nid */ + if (unlikely(!inc_valid_node_count(sbi, inode))) + f2fs_bug_on(sbi, 1); + + remove_free_nid(NM_I(sbi), new_xnid); + get_node_info(sbi, new_xnid, &ni); + ni.ino = inode->i_ino; + set_node_addr(sbi, &ni, NEW_ADDR, false); + F2FS_I(inode)->i_xattr_nid = new_xnid; + + /* 3: update xattr blkaddr */ + refresh_sit_entry(sbi, NEW_ADDR, blkaddr); + set_node_addr(sbi, &ni, blkaddr, false); + + update_inode_page(inode); +} + +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *src, *dst; + nid_t ino = ino_of_node(page); + struct node_info old_ni, new_ni; + struct page *ipage; + + get_node_info(sbi, ino, &old_ni); + + if (unlikely(old_ni.blk_addr != NULL_ADDR)) + return -EINVAL; + + ipage = grab_cache_page(NODE_MAPPING(sbi), ino); + if (!ipage) + return -ENOMEM; + + /* Should not use this inode from free nid list */ + remove_free_nid(NM_I(sbi), ino); + + SetPageUptodate(ipage); + fill_node_footer(ipage, ino, ino, 0, true); + + src = F2FS_INODE(page); + dst = F2FS_INODE(ipage); + + memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 0; + dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + + new_ni = old_ni; + new_ni.ino = ino; + + if (unlikely(!inc_valid_node_count(sbi, NULL))) + WARN_ON(1); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); + inc_valid_inode_count(sbi); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + return 0; +} + +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum) +{ + struct f2fs_node *rn; + struct f2fs_summary *sum_entry; + block_t addr; + int bio_blocks = MAX_BIO_BLOCKS(sbi); + int i, idx, last_offset, nrpages; + + /* scan the node segment */ + last_offset = sbi->blocks_per_seg; + addr = START_BLOCK(sbi, segno); + sum_entry = &sum->entries[0]; + + for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + nrpages = min(last_offset - i, bio_blocks); + + /* readahead node pages */ + ra_meta_pages(sbi, addr, nrpages, META_POR); + + for (idx = addr; idx < addr + nrpages; idx++) { + struct page *page = get_meta_page(sbi, idx); + + rn = F2FS_NODE(page); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + f2fs_put_page(page, 1); + } + + invalidate_mapping_pages(META_MAPPING(sbi), addr, + addr + nrpages); + } + return 0; +} + +static void remove_nats_in_journal(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + int i; + + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < nats_in_cursum(sum); i++) { + struct nat_entry *ne; + struct f2fs_nat_entry raw_ne; + nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); + + raw_ne = nat_in_journal(sum, i); + + down_write(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); + if (!ne) { + ne = grab_nat_entry(nm_i, nid); + node_info_from_raw_nat(&ne->ni, &raw_ne); + } + __set_nat_cache_dirty(nm_i, ne); + up_write(&nm_i->nat_tree_lock); + } + update_nats_in_cursum(sum, -i); + mutex_unlock(&curseg->curseg_mutex); +} + +static void __adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head, int max) +{ + struct nat_entry_set *cur; + + if (nes->entry_cnt >= max) + goto add_out; + + list_for_each_entry(cur, head, set_list) { + if (cur->entry_cnt >= nes->entry_cnt) { + list_add(&nes->set_list, cur->set_list.prev); + return; + } + } +add_out: + list_add_tail(&nes->set_list, head); +} + +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, + struct nat_entry_set *set) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; + bool to_journal = true; + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct page *page = NULL; + struct f2fs_nm_info *nm_i = NM_I(sbi); + + /* + * there are two steps to flush nat entries: + * #1, flush nat entries to journal in current hot data summary block. + * #2, flush nat entries to nat page. + */ + if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + to_journal = false; + + if (to_journal) { + mutex_lock(&curseg->curseg_mutex); + } else { + page = get_next_nat_page(sbi, start_nid); + nat_blk = page_address(page); + f2fs_bug_on(sbi, !nat_blk); + } + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &set->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; + + if (nat_get_blkaddr(ne) == NEW_ADDR) + continue; + + if (to_journal) { + offset = lookup_journal_in_cursum(sum, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(sbi, offset < 0); + raw_ne = &nat_in_journal(sum, offset); + nid_in_journal(sum, offset) = cpu_to_le32(nid); + } else { + raw_ne = &nat_blk->entries[nid - start_nid]; + } + raw_nat_from_node_info(raw_ne, &ne->ni); + + down_write(&NM_I(sbi)->nat_tree_lock); + nat_reset_flag(ne); + __clear_nat_cache_dirty(NM_I(sbi), ne); + up_write(&NM_I(sbi)->nat_tree_lock); + + if (nat_get_blkaddr(ne) == NULL_ADDR) + add_free_nid(sbi, nid, false); + } + + if (to_journal) + mutex_unlock(&curseg->curseg_mutex); + else + f2fs_put_page(page, 1); + + f2fs_bug_on(sbi, set->entry_cnt); + + down_write(&nm_i->nat_tree_lock); + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + up_write(&nm_i->nat_tree_lock); + kmem_cache_free(nat_entry_set_slab, set); +} + +/* + * This function is called during the checkpointing process. + */ +void flush_nat_entries(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + struct nat_entry_set *setvec[SETVEC_SIZE]; + struct nat_entry_set *set, *tmp; + unsigned int found; + nid_t set_idx = 0; + LIST_HEAD(sets); + + if (!nm_i->dirty_nat_cnt) + return; + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. + */ + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + remove_nats_in_journal(sbi); + + down_write(&nm_i->nat_tree_lock); + while ((found = __gang_lookup_nat_set(nm_i, + set_idx, SETVEC_SIZE, setvec))) { + unsigned idx; + set_idx = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) + __adjust_nat_entry_set(setvec[idx], &sets, + MAX_NAT_JENTRIES(sum)); + } + up_write(&nm_i->nat_tree_lock); + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(set, tmp, &sets, set_list) + __flush_nat_entry_set(sbi, set); + + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); +} + +static int init_node_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned char *version_bitmap; + unsigned int nat_segs, nat_blocks; + + nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); + + /* segment_count_nat includes pair segment so divide to 2. */ + nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; + nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; + nm_i->fcnt = 0; + nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); + INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); + INIT_LIST_HEAD(&nm_i->nat_entries); + + mutex_init(&nm_i->build_lock); + spin_lock_init(&nm_i->free_nid_list_lock); + init_rwsem(&nm_i->nat_tree_lock); + + nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); + nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); + version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); + if (!version_bitmap) + return -EFAULT; + + nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap) + return -ENOMEM; + return 0; +} + +int build_node_manager(struct f2fs_sb_info *sbi) +{ + int err; + + sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL); + if (!sbi->nm_info) + return -ENOMEM; + + err = init_node_manager(sbi); + if (err) + return err; + + build_free_nids(sbi); + return 0; +} + +void destroy_node_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *next_i; + struct nat_entry *natvec[NATVEC_SIZE]; + struct nat_entry_set *setvec[SETVEC_SIZE]; + nid_t nid = 0; + unsigned int found; + + if (!nm_i) + return; + + /* destroy free nid list */ + spin_lock(&nm_i->free_nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + f2fs_bug_on(sbi, i->state == NID_ALLOC); + __del_from_free_nid_list(nm_i, i); + nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->free_nid_list_lock); + } + f2fs_bug_on(sbi, nm_i->fcnt); + spin_unlock(&nm_i->free_nid_list_lock); + + /* destroy nat cache */ + down_write(&nm_i->nat_tree_lock); + while ((found = __gang_lookup_nat_cache(nm_i, + nid, NATVEC_SIZE, natvec))) { + unsigned idx; + + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) + __del_from_nat_cache(nm_i, natvec[idx]); + } + f2fs_bug_on(sbi, nm_i->nat_cnt); + + /* destroy nat set cache */ + nid = 0; + while ((found = __gang_lookup_nat_set(nm_i, + nid, SETVEC_SIZE, setvec))) { + unsigned idx; + + nid = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) { + /* entry_cnt is not zero, when cp_error was occurred */ + f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list)); + radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set); + kmem_cache_free(nat_entry_set_slab, setvec[idx]); + } + } + up_write(&nm_i->nat_tree_lock); + + kfree(nm_i->nat_bitmap); + sbi->nm_info = NULL; + kfree(nm_i); +} + +int __init create_node_manager_caches(void) +{ + nat_entry_slab = f2fs_kmem_cache_create("nat_entry", + sizeof(struct nat_entry)); + if (!nat_entry_slab) + goto fail; + + free_nid_slab = f2fs_kmem_cache_create("free_nid", + sizeof(struct free_nid)); + if (!free_nid_slab) + goto destroy_nat_entry; + + nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + sizeof(struct nat_entry_set)); + if (!nat_entry_set_slab) + goto destroy_free_nid; + return 0; + +destroy_free_nid: + kmem_cache_destroy(free_nid_slab); +destroy_nat_entry: + kmem_cache_destroy(nat_entry_slab); +fail: + return -ENOMEM; +} + +void destroy_node_manager_caches(void) +{ + kmem_cache_destroy(nat_entry_set_slab); + kmem_cache_destroy(free_nid_slab); + kmem_cache_destroy(nat_entry_slab); +} diff --git a/kernel/fs/f2fs/node.h b/kernel/fs/f2fs/node.h new file mode 100644 index 000000000..c56026f17 --- /dev/null +++ b/kernel/fs/f2fs/node.h @@ -0,0 +1,416 @@ +/* + * fs/f2fs/node.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +/* start node id of a node block dedicated to the given node id */ +#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) + +/* node block offset on the NAT area dedicated to the given start node id */ +#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) + +/* # of pages to perform readahead before building free nids */ +#define FREE_NID_PAGES 4 + +/* maximum readahead size for node during getting data blocks */ +#define MAX_RA_NODE 128 + +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 10 + +/* vector size for gang look-up from nat cache that consists of radix tree */ +#define NATVEC_SIZE 64 +#define SETVEC_SIZE 32 + +/* return value for read_node_page */ +#define LOCKED_PAGE 1 + +/* For flag in struct node_info */ +enum { + IS_CHECKPOINTED, /* is it checkpointed before? */ + HAS_FSYNCED_INODE, /* is the inode fsynced before? */ + HAS_LAST_FSYNC, /* has the latest node fsync mark? */ + IS_DIRTY, /* this nat entry is dirty? */ +}; + +/* + * For node information + */ +struct node_info { + nid_t nid; /* node id */ + nid_t ino; /* inode number of the node's owner */ + block_t blk_addr; /* block address of the node */ + unsigned char version; /* version of the node */ + unsigned char flag; /* for node information bits */ +}; + +struct nat_entry { + struct list_head list; /* for clean or dirty nat list */ + struct node_info ni; /* in-memory node information */ +}; + +#define nat_get_nid(nat) (nat->ni.nid) +#define nat_set_nid(nat, n) (nat->ni.nid = n) +#define nat_get_blkaddr(nat) (nat->ni.blk_addr) +#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) +#define nat_get_ino(nat) (nat->ni.ino) +#define nat_set_ino(nat, i) (nat->ni.ino = i) +#define nat_get_version(nat) (nat->ni.version) +#define nat_set_version(nat, v) (nat->ni.version = v) + +#define inc_node_version(version) (++version) + +static inline void copy_node_info(struct node_info *dst, + struct node_info *src) +{ + dst->nid = src->nid; + dst->ino = src->ino; + dst->blk_addr = src->blk_addr; + dst->version = src->version; + /* should not copy flag here */ +} + +static inline void set_nat_flag(struct nat_entry *ne, + unsigned int type, bool set) +{ + unsigned char mask = 0x01 << type; + if (set) + ne->ni.flag |= mask; + else + ne->ni.flag &= ~mask; +} + +static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) +{ + unsigned char mask = 0x01 << type; + return ne->ni.flag & mask; +} + +static inline void nat_reset_flag(struct nat_entry *ne) +{ + /* these states can be set only after checkpoint was done */ + set_nat_flag(ne, IS_CHECKPOINTED, true); + set_nat_flag(ne, HAS_FSYNCED_INODE, false); + set_nat_flag(ne, HAS_LAST_FSYNC, true); +} + +static inline void node_info_from_raw_nat(struct node_info *ni, + struct f2fs_nat_entry *raw_ne) +{ + ni->ino = le32_to_cpu(raw_ne->ino); + ni->blk_addr = le32_to_cpu(raw_ne->block_addr); + ni->version = raw_ne->version; +} + +static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, + struct node_info *ni) +{ + raw_ne->ino = cpu_to_le32(ni->ino); + raw_ne->block_addr = cpu_to_le32(ni->blk_addr); + raw_ne->version = ni->version; +} + +enum mem_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS, /* indicates dirty dentry pages */ + INO_ENTRIES, /* indicates inode entries */ + EXTENT_CACHE, /* indicates extent cache */ + BASE_CHECK, /* check kernel status */ +}; + +struct nat_entry_set { + struct list_head set_list; /* link with other nat sets */ + struct list_head entry_list; /* link with dirty nat entries */ + nid_t set; /* set number*/ + unsigned int entry_cnt; /* the # of nat entries in set */ +}; + +/* + * For free nid mangement + */ +enum nid_state { + NID_NEW, /* newly added to free nid list */ + NID_ALLOC /* it is allocated */ +}; + +struct free_nid { + struct list_head list; /* for free node id list */ + nid_t nid; /* node id */ + int state; /* in use or not: NID_NEW or NID_ALLOC */ +}; + +static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *fnid; + + spin_lock(&nm_i->free_nid_list_lock); + if (nm_i->fcnt <= 0) { + spin_unlock(&nm_i->free_nid_list_lock); + return; + } + fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + *nid = fnid->nid; + spin_unlock(&nm_i->free_nid_list_lock); +} + +/* + * inline functions + */ +static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); +} + +static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + pgoff_t block_off; + pgoff_t block_addr; + int seg_off; + + block_off = NAT_BLOCK_OFFSET(start); + seg_off = block_off >> sbi->log_blocks_per_seg; + + block_addr = (pgoff_t)(nm_i->nat_blkaddr + + (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) + block_addr += sbi->blocks_per_seg; + + return block_addr; +} + +static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, + pgoff_t block_addr) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + block_addr -= nm_i->nat_blkaddr; + if ((block_addr >> sbi->log_blocks_per_seg) % 2) + block_addr -= sbi->blocks_per_seg; + else + block_addr += sbi->blocks_per_seg; + + return block_addr + nm_i->nat_blkaddr; +} + +static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) +{ + unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); + + f2fs_change_bit(block_off, nm_i->nat_bitmap); +} + +static inline void fill_node_footer(struct page *page, nid_t nid, + nid_t ino, unsigned int ofs, bool reset) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int old_flag = 0; + + if (reset) + memset(rn, 0, sizeof(*rn)); + else + old_flag = le32_to_cpu(rn->footer.flag); + + rn->footer.nid = cpu_to_le32(nid); + rn->footer.ino = cpu_to_le32(ino); + + /* should remain old flag bits such as COLD_BIT_SHIFT */ + rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) | + (old_flag & OFFSET_BIT_MASK)); +} + +static inline void copy_node_footer(struct page *dst, struct page *src) +{ + struct f2fs_node *src_rn = F2FS_NODE(src); + struct f2fs_node *dst_rn = F2FS_NODE(dst); + memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); +} + +static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + struct f2fs_node *rn = F2FS_NODE(page); + + rn->footer.cp_ver = ckpt->checkpoint_ver; + rn->footer.next_blkaddr = cpu_to_le32(blkaddr); +} + +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline unsigned long long cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + +/* + * f2fs assigns the following node offsets described as (num). + * N = NIDS_PER_BLOCK + * + * Inode block (0) + * |- direct node (1) + * |- direct node (2) + * |- indirect node (3) + * | `- direct node (4 => 4 + N - 1) + * |- indirect node (4 + N) + * | `- direct node (5 + N => 5 + 2N - 1) + * `- double indirect node (5 + 2N) + * `- indirect node (6 + 2N) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node + */ +static inline bool IS_DNODE(struct page *node_page) +{ + unsigned int ofs = ofs_of_node(node_page); + + if (f2fs_has_xattr_block(ofs)) + return false; + + if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || + ofs == 5 + 2 * NIDS_PER_BLOCK) + return false; + if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { + ofs -= 6 + 2 * NIDS_PER_BLOCK; + if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) + return false; + } + return true; +} + +static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +{ + struct f2fs_node *rn = F2FS_NODE(p); + + f2fs_wait_on_page_writeback(p, NODE); + + if (i) + rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); + else + rn->in.nid[off] = cpu_to_le32(nid); + set_page_dirty(p); +} + +static inline nid_t get_nid(struct page *p, int off, bool i) +{ + struct f2fs_node *rn = F2FS_NODE(p); + + if (i) + return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); + return le32_to_cpu(rn->in.nid[off]); +} + +/* + * Coldness identification: + * - Mark cold files in f2fs_inode_info + * - Mark cold node blocks in their node footer + * - Mark cold data pages in page cache + */ +static inline int is_file(struct inode *inode, int type) +{ + return F2FS_I(inode)->i_advise & type; +} + +static inline void set_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise |= type; +} + +static inline void clear_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise &= ~type; +} + +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + +static inline int is_cold_data(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_cold_data(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_cold_data(struct page *page) +{ + ClearPageChecked(page); +} + +static inline int is_node(struct page *page, int type) +{ + struct f2fs_node *rn = F2FS_NODE(page); + return le32_to_cpu(rn->footer.flag) & (1 << type); +} + +#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) +#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) +#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) + +static inline void set_cold_node(struct inode *inode, struct page *page) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int flag = le32_to_cpu(rn->footer.flag); + + if (S_ISDIR(inode->i_mode)) + flag &= ~(0x1 << COLD_BIT_SHIFT); + else + flag |= (0x1 << COLD_BIT_SHIFT); + rn->footer.flag = cpu_to_le32(flag); +} + +static inline void set_mark(struct page *page, int mark, int type) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int flag = le32_to_cpu(rn->footer.flag); + if (mark) + flag |= (0x1 << type); + else + flag &= ~(0x1 << type); + rn->footer.flag = cpu_to_le32(flag); +} +#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) diff --git a/kernel/fs/f2fs/recovery.c b/kernel/fs/f2fs/recovery.c new file mode 100644 index 000000000..8d8ea99f2 --- /dev/null +++ b/kernel/fs/f2fs/recovery.c @@ -0,0 +1,575 @@ +/* + * fs/f2fs/recovery.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +/* + * Roll forward recovery scenarios. + * + * [Term] F: fsync_mark, D: dentry_mark + * + * 1. inode(x) | CP | inode(x) | dnode(F) + * -> Update the latest inode(x). + * + * 2. inode(x) | CP | inode(F) | dnode(F) + * -> No problem. + * + * 3. inode(x) | CP | dnode(F) | inode(x) + * -> Recover to the latest dnode(F), and drop the last inode(x) + * + * 4. inode(x) | CP | dnode(F) | inode(F) + * -> No problem. + * + * 5. CP | inode(x) | dnode(F) + * -> The inode(DF) was missing. Should drop this dnode(F). + * + * 6. CP | inode(DF) | dnode(F) + * -> No problem. + * + * 7. CP | dnode(F) | inode(DF) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * + * 8. CP | dnode(F) | inode(x) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * But it will fail due to no inode(DF). + */ + +static struct kmem_cache *fsync_entry_slab; + +bool space_for_roll_forward(struct f2fs_sb_info *sbi) +{ + if (sbi->last_valid_block_count + sbi->alloc_valid_block_count + > sbi->user_block_count) + return false; + return true; +} + +static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, + nid_t ino) +{ + struct fsync_inode_entry *entry; + + list_for_each_entry(entry, head, list) + if (entry->inode->i_ino == ino) + return entry; + + return NULL; +} + +static int recover_dentry(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *raw_inode = F2FS_INODE(ipage); + nid_t pino = le32_to_cpu(raw_inode->i_pino); + struct f2fs_dir_entry *de; + struct qstr name; + struct page *page; + struct inode *dir, *einode; + int err = 0; + + dir = f2fs_iget(inode->i_sb, pino); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } + + name.len = le32_to_cpu(raw_inode->i_namelen); + name.name = raw_inode->i_name; + + if (unlikely(name.len > F2FS_NAME_LEN)) { + WARN_ON(1); + err = -ENAMETOOLONG; + goto out_err; + } +retry: + de = f2fs_find_entry(dir, &name, &page); + if (de && inode->i_ino == le32_to_cpu(de->ino)) + goto out_unmap_put; + + if (de) { + einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + if (IS_ERR(einode)) { + WARN_ON(1); + err = PTR_ERR(einode); + if (err == -ENOENT) + err = -EEXIST; + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); + if (err) { + iput(einode); + goto out_unmap_put; + } + f2fs_delete_entry(de, page, dir, einode); + iput(einode); + goto retry; + } + err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); + if (err) + goto out_err; + + if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { + iput(dir); + } else { + add_dirty_dir_inode(dir); + set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + } + + goto out; + +out_unmap_put: + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); +out_err: + iput(dir); +out: + f2fs_msg(inode->i_sb, KERN_NOTICE, + "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ipage), raw_inode->i_name, + IS_ERR(dir) ? 0 : dir->i_ino, err); + return err; +} + +static void recover_inode(struct inode *inode, struct page *page) +{ + struct f2fs_inode *raw = F2FS_INODE(page); + + inode->i_mode = le16_to_cpu(raw->i_mode); + i_size_write(inode, le64_to_cpu(raw->i_size)); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + + f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", + ino_of_node(page), F2FS_INODE(page)->i_name); +} + +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +{ + unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); + struct curseg_info *curseg; + struct page *page = NULL; + block_t blkaddr; + int err = 0; + + /* get node pages in the current segment */ + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + ra_meta_pages(sbi, blkaddr, 1, META_POR); + + while (1) { + struct fsync_inode_entry *entry; + + if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + return 0; + + page = get_meta_page(sbi, blkaddr); + + if (cp_ver != cpver_of_node(page)) + break; + + if (!is_fsync_dnode(page)) + goto next; + + entry = get_fsync_inode(head, ino_of_node(page)); + if (!entry) { + if (IS_INODE(page) && is_dent_dnode(page)) { + err = recover_inode_page(sbi, page); + if (err) + break; + } + + /* add this fsync inode to the list */ + entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + if (!entry) { + err = -ENOMEM; + break; + } + /* + * CP | dnode(F) | inode(DF) + * For this case, we should not give up now. + */ + entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); + if (IS_ERR(entry->inode)) { + err = PTR_ERR(entry->inode); + kmem_cache_free(fsync_entry_slab, entry); + if (err == -ENOENT) { + err = 0; + goto next; + } + break; + } + list_add_tail(&entry->list, head); + } + entry->blkaddr = blkaddr; + + if (IS_INODE(page)) { + entry->last_inode = blkaddr; + if (is_dent_dnode(page)) + entry->last_dentry = blkaddr; + } +next: + /* check next segment */ + blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); + + ra_meta_pages_cond(sbi, blkaddr); + } + f2fs_put_page(page, 1); + return err; +} + +static void destroy_fsync_dnodes(struct list_head *head) +{ + struct fsync_inode_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, list) { + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); + } +} + +static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, + block_t blkaddr, struct dnode_of_data *dn) +{ + struct seg_entry *sentry; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; + struct f2fs_summary sum; + struct page *sum_page, *node_page; + struct dnode_of_data tdn = *dn; + nid_t ino, nid; + struct inode *inode; + unsigned int offset; + block_t bidx; + int i; + + sentry = get_seg_entry(sbi, segno); + if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) + return 0; + + /* Get the previous summary */ + for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { + sum = curseg->sum_blk->entries[blkoff]; + goto got_it; + } + } + + sum_page = get_sum_page(sbi, segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: + /* Use the locked dnode page and inode */ + nid = le32_to_cpu(sum.nid); + if (dn->inode->i_ino == nid) { + tdn.nid = nid; + if (!dn->inode_page_locked) + lock_page(dn->inode_page); + tdn.node_page = dn->inode_page; + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + goto truncate_out; + } else if (dn->nid == nid) { + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + goto truncate_out; + } + + /* Get the node page */ + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); + + offset = ofs_of_node(node_page); + ino = ino_of_node(node_page); + f2fs_put_page(node_page, 1); + + if (ino != dn->inode->i_ino) { + /* Deallocate previous index in the node page */ + inode = f2fs_iget(sbi->sb, ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } else { + inode = dn->inode; + } + + bidx = start_bidx_of_node(offset, F2FS_I(inode)) + + le16_to_cpu(sum.ofs_in_node); + + /* + * if inode page is locked, unlock temporarily, but its reference + * count keeps alive. + */ + if (ino == dn->inode->i_ino && dn->inode_page_locked) + unlock_page(dn->inode_page); + + set_new_dnode(&tdn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + goto out; + + if (tdn.data_blkaddr == blkaddr) + truncate_data_blocks_range(&tdn, 1); + + f2fs_put_dnode(&tdn); +out: + if (ino != dn->inode->i_ino) + iput(inode); + else if (dn->inode_page_locked) + lock_page(dn->inode_page); + return 0; + +truncate_out: + if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + truncate_data_blocks_range(&tdn, 1); + if (dn->inode->i_ino == nid && !dn->inode_page_locked) + unlock_page(dn->inode_page); + return 0; +} + +static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, + struct page *page, block_t blkaddr) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int start, end; + struct dnode_of_data dn; + struct f2fs_summary sum; + struct node_info ni; + int err = 0, recovered = 0; + + /* step 1: recover xattr */ + if (IS_INODE(page)) { + recover_inline_xattr(inode, page); + } else if (f2fs_has_xattr_block(ofs_of_node(page))) { + /* + * Deprecated; xattr blocks should be found from cold log. + * But, we should remain this for backward compatibility. + */ + recover_xattr_data(inode, page, blkaddr); + goto out; + } + + /* step 2: recover inline data */ + if (recover_inline_data(inode, page)) + goto out; + + /* step 3: recover data indices */ + start = start_bidx_of_node(ofs_of_node(page), fi); + end = start + ADDRS_PER_PAGE(page, fi); + + f2fs_lock_op(sbi); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) { + f2fs_unlock_op(sbi); + goto out; + } + + f2fs_wait_on_page_writeback(dn.node_page, NODE); + + get_node_info(sbi, dn.nid, &ni); + f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); + f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); + + for (; start < end; start++) { + block_t src, dest; + + src = datablock_addr(dn.node_page, dn.ofs_in_node); + dest = datablock_addr(page, dn.ofs_in_node); + + if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR && + dest >= MAIN_BLKADDR(sbi) && dest < MAX_BLKADDR(sbi)) { + + if (src == NULL_ADDR) { + err = reserve_new_block(&dn); + /* We should not get -ENOSPC */ + f2fs_bug_on(sbi, err); + } + + /* Check the previous node page having this index */ + err = check_index_in_prev_nodes(sbi, dest, &dn); + if (err) + goto err; + + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* write dummy data page */ + recover_data_page(sbi, NULL, &sum, src, dest); + dn.data_blkaddr = dest; + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + recovered++; + } + dn.ofs_in_node++; + } + + if (IS_INODE(dn.node_page)) + sync_inode_page(&dn); + + copy_node_footer(dn.node_page, page); + fill_node_footer(dn.node_page, dn.nid, ni.ino, + ofs_of_node(page), false); + set_page_dirty(dn.node_page); +err: + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); +out: + f2fs_msg(sbi->sb, KERN_NOTICE, + "recover_data: ino = %lx, recovered = %d blocks, err = %d", + inode->i_ino, recovered, err); + return err; +} + +static int recover_data(struct f2fs_sb_info *sbi, + struct list_head *head, int type) +{ + unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); + struct curseg_info *curseg; + struct page *page = NULL; + int err = 0; + block_t blkaddr; + + /* get node pages in the current segment */ + curseg = CURSEG_I(sbi, type); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + while (1) { + struct fsync_inode_entry *entry; + + if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + break; + + ra_meta_pages_cond(sbi, blkaddr); + + page = get_meta_page(sbi, blkaddr); + + if (cp_ver != cpver_of_node(page)) { + f2fs_put_page(page, 1); + break; + } + + entry = get_fsync_inode(head, ino_of_node(page)); + if (!entry) + goto next; + /* + * inode(x) | CP | inode(x) | dnode(F) + * In this case, we can lose the latest inode(x). + * So, call recover_inode for the inode update. + */ + if (entry->last_inode == blkaddr) + recover_inode(entry->inode, page); + if (entry->last_dentry == blkaddr) { + err = recover_dentry(entry->inode, page); + if (err) { + f2fs_put_page(page, 1); + break; + } + } + err = do_recover_data(sbi, entry->inode, page, blkaddr); + if (err) { + f2fs_put_page(page, 1); + break; + } + + if (entry->blkaddr == blkaddr) { + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); + } +next: + /* check next segment */ + blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); + } + if (!err) + allocate_new_segments(sbi); + return err; +} + +int recover_fsync_data(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + struct list_head inode_list; + block_t blkaddr; + int err; + bool need_writecp = false; + + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) + return -ENOMEM; + + INIT_LIST_HEAD(&inode_list); + + /* step #1: find fsynced inode numbers */ + set_sbi_flag(sbi, SBI_POR_DOING); + + /* prevent checkpoint */ + mutex_lock(&sbi->cp_mutex); + + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + err = find_fsync_dnodes(sbi, &inode_list); + if (err) + goto out; + + if (list_empty(&inode_list)) + goto out; + + need_writecp = true; + + /* step #2: recover data */ + err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + if (!err) + f2fs_bug_on(sbi, !list_empty(&inode_list)); +out: + destroy_fsync_dnodes(&inode_list); + kmem_cache_destroy(fsync_entry_slab); + + /* truncate meta pages to be used by the recovery */ + truncate_inode_pages_range(META_MAPPING(sbi), + MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + + clear_sbi_flag(sbi, SBI_POR_DOING); + if (err) { + discard_next_dnode(sbi, blkaddr); + + /* Flush all the NAT/SIT pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) + sync_meta_pages(sbi, META, LONG_MAX); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); + } else if (need_writecp) { + struct cp_control cpc = { + .reason = CP_RECOVERY, + }; + mutex_unlock(&sbi->cp_mutex); + write_checkpoint(sbi, &cpc); + } else { + mutex_unlock(&sbi->cp_mutex); + } + return err; +} diff --git a/kernel/fs/f2fs/segment.c b/kernel/fs/f2fs/segment.c new file mode 100644 index 000000000..f93966094 --- /dev/null +++ b/kernel/fs/f2fs/segment.c @@ -0,0 +1,2307 @@ +/* + * fs/f2fs/segment.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "segment.h" +#include "node.h" +#include "trace.h" +#include + +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *sit_entry_set_slab; +static struct kmem_cache *inmem_entry_slab; + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. + */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } +#endif + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * Example: + * LSB <--> MSB + * f2fs_set_bit(0, bitmap) => 0000 0001 + * f2fs_set_bit(7, bitmap) => 1000 0000 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~0UL << quot; + submask = (unsigned char)(0xff << rest) >> rest; + submask <<= quot; + mask &= submask; + tmp &= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG-1)) { + tmp = *(p++); + if (tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~(~0UL << quot); + submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); + submask <<= quot; + mask += submask; + tmp |= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (~tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG - 1)) { + tmp = *(p++); + if (~tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp |= ~0UL << size; + if (tmp == ~0UL) /* Are any bits zero? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffz(tmp); +} + +void register_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inmem_pages *new; + int err; + + SetPagePrivate(page); + f2fs_trace_pid(page); + + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); + + /* add atomic page indices to the list */ + new->page = page; + INIT_LIST_HEAD(&new->list); +retry: + /* increase reference count with clean state */ + mutex_lock(&fi->inmem_lock); + err = radix_tree_insert(&fi->inmem_root, page->index, new); + if (err == -EEXIST) { + mutex_unlock(&fi->inmem_lock); + kmem_cache_free(inmem_entry_slab, new); + return; + } else if (err) { + mutex_unlock(&fi->inmem_lock); + goto retry; + } + get_page(page); + list_add_tail(&new->list, &fi->inmem_pages); + inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + mutex_unlock(&fi->inmem_lock); + + trace_f2fs_register_inmem_page(page, INMEM); +} + +void commit_inmem_pages(struct inode *inode, bool abort) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inmem_pages *cur, *tmp; + bool submit_bio = false; + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC | REQ_PRIO, + }; + + /* + * The abort is true only when f2fs_evict_inode is called. + * Basically, the f2fs_evict_inode doesn't produce any data writes, so + * that we don't need to call f2fs_balance_fs. + * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this + * inode becomes free by iget_locked in f2fs_iget. + */ + if (!abort) { + f2fs_balance_fs(sbi); + f2fs_lock_op(sbi); + } + + mutex_lock(&fi->inmem_lock); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + if (!abort) { + lock_page(cur->page); + if (cur->page->mapping == inode->i_mapping) { + f2fs_wait_on_page_writeback(cur->page, DATA); + if (clear_page_dirty_for_io(cur->page)) + inode_dec_dirty_pages(inode); + trace_f2fs_commit_inmem_page(cur->page, INMEM); + do_write_data_page(cur->page, &fio); + submit_bio = true; + } + f2fs_put_page(cur->page, 1); + } else { + trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); + put_page(cur->page); + } + radix_tree_delete(&fi->inmem_root, cur->page->index); + list_del(&cur->list); + kmem_cache_free(inmem_entry_slab, cur); + dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + } + mutex_unlock(&fi->inmem_lock); + + if (!abort) { + f2fs_unlock_op(sbi); + if (submit_bio) + f2fs_submit_merged_bio(sbi, DATA, WRITE); + } +} + +/* + * This function balances dirty node and dentry pages. + * In addition, it controls garbage collection. + */ +void f2fs_balance_fs(struct f2fs_sb_info *sbi) +{ + /* + * We should do GC or end up with checkpoint, if there are so many dirty + * dir/node pages without enough free segments. + */ + if (has_not_enough_free_secs(sbi, 0)) { + mutex_lock(&sbi->gc_mutex); + f2fs_gc(sbi); + } +} + +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) +{ + /* try to shrink extent cache when there is no enough memory */ + f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + + /* check the # of cached NAT entries and prefree segments */ + if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || + excess_prefree_segs(sbi) || + !available_free_memory(sbi, INO_ENTRIES)) + f2fs_sync_fs(sbi->sb, true); +} + +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + wait_queue_head_t *q = &fcc->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&fcc->issue_list)) { + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct flush_cmd *cmd, *next; + int ret; + + fcc->dispatch_list = llist_del_all(&fcc->issue_list); + fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + + llist_for_each_entry_safe(cmd, next, + fcc->dispatch_list, llnode) { + cmd->ret = ret; + complete(&cmd->wait); + } + bio_put(bio); + fcc->dispatch_list = NULL; + } + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&fcc->issue_list)); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd cmd; + + trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); + + if (test_opt(sbi, NOBARRIER)) + return 0; + + if (!test_opt(sbi, FLUSH_MERGE)) + return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + + init_completion(&cmd.wait); + + llist_add(&cmd.llnode, &fcc->issue_list); + + if (!fcc->dispatch_list) + wake_up(&fcc->flush_wait_queue); + + wait_for_completion(&cmd.wait); + + return cmd.ret; +} + +int create_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct flush_cmd_control *fcc; + int err = 0; + + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); + if (!fcc) + return -ENOMEM; + init_waitqueue_head(&fcc->flush_wait_queue); + init_llist_head(&fcc->issue_list); + SM_I(sbi)->cmd_control_info = fcc; + fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(fcc->f2fs_issue_flush)) { + err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + return err; + } + + return err; +} + +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + + if (fcc && fcc->f2fs_issue_flush) + kthread_stop(fcc->f2fs_issue_flush); + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; +} + +static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + /* need not be added */ + if (IS_CURSEG(sbi, segno)) + return; + + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) + dirty_i->nr_dirty[dirty_type]++; + + if (dirty_type == DIRTY) { + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; + + if (unlikely(t >= DIRTY)) { + f2fs_bug_on(sbi, 1); + return; + } + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]++; + } +} + +static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) + dirty_i->nr_dirty[dirty_type]--; + + if (dirty_type == DIRTY) { + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; + + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + + if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + clear_bit(GET_SECNO(sbi, segno), + dirty_i->victim_secmap); + } +} + +/* + * Should not occur error such as -ENOMEM. + * Adding dirty entry into seglist is not critical operation. + * If a given segment is one of current working segments, it won't be added. + */ +static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned short valid_blocks; + + if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) + return; + + mutex_lock(&dirty_i->seglist_lock); + + valid_blocks = get_valid_blocks(sbi, segno, 0); + + if (valid_blocks == 0) { + __locate_dirty_segment(sbi, segno, PRE); + __remove_dirty_segment(sbi, segno, DIRTY); + } else if (valid_blocks < sbi->blocks_per_seg) { + __locate_dirty_segment(sbi, segno, DIRTY); + } else { + /* Recovery routine with SSR needs this */ + __remove_dirty_segment(sbi, segno, DIRTY); + } + + mutex_unlock(&dirty_i->seglist_lock); +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = SECTOR_FROM_BLOCK(blkstart); + sector_t len = SECTOR_FROM_BLOCK(blklen); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); +} + +void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + if (f2fs_issue_discard(sbi, blkaddr, 1)) { + struct page *page = grab_meta_page(sbi, blkaddr); + /* zero-filled page */ + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +static void __add_discard_entry(struct f2fs_sb_info *sbi, + struct cp_control *cpc, unsigned int start, unsigned int end) +{ + struct list_head *head = &SM_I(sbi)->discard_list; + struct discard_entry *new, *last; + + if (!list_empty(head)) { + last = list_last_entry(head, struct discard_entry, list); + if (START_BLOCK(sbi, cpc->trim_start) + start == + last->blkaddr + last->len) { + last->len += end - start; + goto done; + } + } + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; + new->len = end - start; + list_add_tail(&new->list, head); +done: + SM_I(sbi)->nr_discards += end - start; + cpc->trimmed += end - start; +} + +static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + int max_blocks = sbi->blocks_per_seg; + struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *dmap = SIT_I(sbi)->tmp_map; + unsigned int start = 0, end = -1; + bool force = (cpc->reason == CP_DISCARD); + int i; + + if (!force && (!test_opt(sbi, DISCARD) || + SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)) + return; + + if (force && !se->valid_blocks) { + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + /* + * if this segment is registered in the prefree list, then + * we should skip adding a discard candidate, and let the + * checkpoint do that later. + */ + mutex_lock(&dirty_i->seglist_lock); + if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) { + mutex_unlock(&dirty_i->seglist_lock); + cpc->trimmed += sbi->blocks_per_seg; + return; + } + mutex_unlock(&dirty_i->seglist_lock); + + __add_discard_entry(sbi, cpc, 0, sbi->blocks_per_seg); + return; + } + + /* zero block will be discarded through the prefree list */ + if (!se->valid_blocks || se->valid_blocks == max_blocks) + return; + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = force ? ~ckpt_map[i] : + (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + start = __find_rev_next_bit(dmap, max_blocks, end + 1); + if (start >= max_blocks) + break; + + end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + + if (force && end - start < cpc->trim_minlen) + continue; + + __add_discard_entry(sbi, cpc, start, end); + } +} + +void release_discard_addrs(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &(SM_I(sbi)->discard_list); + struct discard_entry *entry, *this; + + /* drop caches */ + list_for_each_entry_safe(entry, this, head, list) { + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); + } +} + +/* + * Should call clear_prefree_segments after checkpoint is done. + */ +static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) + __set_test_and_free(sbi, segno); + mutex_unlock(&dirty_i->seglist_lock); +} + +void clear_prefree_segments(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &(SM_I(sbi)->discard_list); + struct discard_entry *entry, *this; + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; + unsigned int start = 0, end = -1; + + mutex_lock(&dirty_i->seglist_lock); + + while (1) { + int i; + start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); + if (start >= MAIN_SEGS(sbi)) + break; + end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), + start + 1); + + for (i = start; i < end; i++) + clear_bit(i, prefree_map); + + dirty_i->nr_dirty[PRE] -= end - start; + + if (!test_opt(sbi, DISCARD)) + continue; + + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + (end - start) << sbi->log_blocks_per_seg); + } + mutex_unlock(&dirty_i->seglist_lock); + + /* send small discards */ + list_for_each_entry_safe(entry, this, head, list) { + f2fs_issue_discard(sbi, entry->blkaddr, entry->len); + list_del(&entry->list); + SM_I(sbi)->nr_discards -= entry->len; + kmem_cache_free(discard_entry_slab, entry); + } +} + +static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + + if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { + sit_i->dirty_sentries++; + return false; + } + + return true; +} + +static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, + unsigned int segno, int modified) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; + if (modified) + __mark_sit_entry_dirty(sbi, segno); +} + +static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) +{ + struct seg_entry *se; + unsigned int segno, offset; + long int new_vblocks; + + segno = GET_SEGNO(sbi, blkaddr); + + se = get_seg_entry(sbi, segno); + new_vblocks = se->valid_blocks + del; + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) || + (new_vblocks > sbi->blocks_per_seg))); + + se->valid_blocks = new_vblocks; + se->mtime = get_mtime(sbi); + SIT_I(sbi)->max_mtime = se->mtime; + + /* Update valid block bitmap */ + if (del > 0) { + if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) + f2fs_bug_on(sbi, 1); + } else { + if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) + f2fs_bug_on(sbi, 1); + } + if (!f2fs_test_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks += del; + + __mark_sit_entry_dirty(sbi, segno); + + /* update total number of valid blocks to be written in ckpt area */ + SIT_I(sbi)->written_valid_blocks += del; + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, segno)->valid_blocks += del; +} + +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) +{ + update_sit_entry(sbi, new, 1); + if (GET_SEGNO(sbi, old) != NULL_SEGNO) + update_sit_entry(sbi, old, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); +} + +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +{ + unsigned int segno = GET_SEGNO(sbi, addr); + struct sit_info *sit_i = SIT_I(sbi); + + f2fs_bug_on(sbi, addr == NULL_ADDR); + if (addr == NEW_ADDR) + return; + + /* add it into sit main buffer */ + mutex_lock(&sit_i->sentry_lock); + + update_sit_entry(sbi, addr, -1); + + /* add it into dirty seglist */ + locate_dirty_segment(sbi, segno); + + mutex_unlock(&sit_i->sentry_lock); +} + +/* + * This function should be resided under the curseg_mutex lock + */ +static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, + struct f2fs_summary *sum) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + void *addr = curseg->sum_blk; + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); + memcpy(addr, sum, sizeof(struct f2fs_summary)); +} + +/* + * Calculate the number of current summary pages for writing + */ +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) +{ + int valid_sum_count = 0; + int i, sum_in_page; + + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + if (sbi->ckpt->alloc_type[i] == SSR) + valid_sum_count += sbi->blocks_per_seg; + else { + if (for_ra) + valid_sum_count += le16_to_cpu( + F2FS_CKPT(sbi)->cur_data_blkoff[i]); + else + valid_sum_count += curseg_blkoff(sbi, i); + } + } + + sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + SUM_FOOTER_SIZE) / SUMMARY_SIZE; + if (valid_sum_count <= sum_in_page) + return 1; + else if ((valid_sum_count - sum_in_page) <= + (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + return 2; + return 3; +} + +/* + * Caller should put this summary page + */ +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +{ + return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); +} + +static void write_sum_page(struct f2fs_sb_info *sbi, + struct f2fs_summary_block *sum_blk, block_t blk_addr) +{ + struct page *page = grab_meta_page(sbi, blk_addr); + void *kaddr = page_address(page); + memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE); + set_page_dirty(page); + f2fs_put_page(page, 1); +} + +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; +} + +/* + * Find a new segment from the free segments bitmap to right order + * This function should be returned with success, otherwise BUG + */ +static void get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, int dir) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int segno, secno, zoneno; + unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; + unsigned int hint = *newseg / sbi->segs_per_sec; + unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int left_start = hint; + bool init = true; + int go_left = 0; + int i; + + spin_lock(&free_i->segmap_lock); + + if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { + segno = find_next_zero_bit(free_i->free_segmap, + MAIN_SEGS(sbi), *newseg + 1); + if (segno - *newseg < sbi->segs_per_sec - + (*newseg % sbi->segs_per_sec)) + goto got_it; + } +find_other_zone: + secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); + if (secno >= MAIN_SECS(sbi)) { + if (dir == ALLOC_RIGHT) { + secno = find_next_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi), 0); + f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); + } else { + go_left = 1; + left_start = hint - 1; + } + } + if (go_left == 0) + goto skip_left; + + while (test_bit(left_start, free_i->free_secmap)) { + if (left_start > 0) { + left_start--; + continue; + } + left_start = find_next_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi), 0); + f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); + break; + } + secno = left_start; +skip_left: + hint = secno; + segno = secno * sbi->segs_per_sec; + zoneno = secno / sbi->secs_per_zone; + + /* give up on finding another zone */ + if (!init) + goto got_it; + if (sbi->secs_per_zone == 1) + goto got_it; + if (zoneno == old_zoneno) + goto got_it; + if (dir == ALLOC_LEFT) { + if (!go_left && zoneno + 1 >= total_zones) + goto got_it; + if (go_left && zoneno == 0) + goto got_it; + } + for (i = 0; i < NR_CURSEG_TYPE; i++) + if (CURSEG_I(sbi, i)->zone == zoneno) + break; + + if (i < NR_CURSEG_TYPE) { + /* zone is in user, try another */ + if (go_left) + hint = zoneno * sbi->secs_per_zone - 1; + else if (zoneno + 1 >= total_zones) + hint = 0; + else + hint = (zoneno + 1) * sbi->secs_per_zone; + init = false; + goto find_other_zone; + } +got_it: + /* set it as dirty segment in free segmap */ + f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); + __set_inuse(sbi, segno); + *newseg = segno; + spin_unlock(&free_i->segmap_lock); +} + +static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct summary_footer *sum_footer; + + curseg->segno = curseg->next_segno; + curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->next_blkoff = 0; + curseg->next_segno = NULL_SEGNO; + + sum_footer = &(curseg->sum_blk->footer); + memset(sum_footer, 0, sizeof(struct summary_footer)); + if (IS_DATASEG(type)) + SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); + if (IS_NODESEG(type)) + SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); + __set_sit_entry_type(sbi, type, curseg->segno, modified); +} + +/* + * Allocate a current working segment. + * This function always allocates a free segment in LFS manner. + */ +static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno; + int dir = ALLOC_LEFT; + + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, segno)); + if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) + dir = ALLOC_RIGHT; + + if (test_opt(sbi, NOHEAP)) + dir = ALLOC_RIGHT; + + get_new_segment(sbi, &segno, new_sec, dir); + curseg->next_segno = segno; + reset_curseg(sbi, type, 1); + curseg->alloc_type = LFS; +} + +static void __next_free_blkoff(struct f2fs_sb_info *sbi, + struct curseg_info *seg, block_t start) +{ + struct seg_entry *se = get_seg_entry(sbi, seg->segno); + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long *target_map = SIT_I(sbi)->tmp_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + + seg->next_blkoff = pos; +} + +/* + * If a segment is written by LFS manner, next block offset is just obtained + * by increasing the current block offset. However, if a segment is written by + * SSR manner, next block offset obtained by calling __next_free_blkoff + */ +static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, + struct curseg_info *seg) +{ + if (seg->alloc_type == SSR) + __next_free_blkoff(sbi, seg, seg->next_blkoff + 1); + else + seg->next_blkoff++; +} + +/* + * This function always allocates a used segment(from dirty seglist) by SSR + * manner, so it should recover the existing segment information of valid blocks + */ +static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int new_segno = curseg->next_segno; + struct f2fs_summary_block *sum_node; + struct page *sum_page; + + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + __set_test_and_inuse(sbi, new_segno); + + mutex_lock(&dirty_i->seglist_lock); + __remove_dirty_segment(sbi, new_segno, PRE); + __remove_dirty_segment(sbi, new_segno, DIRTY); + mutex_unlock(&dirty_i->seglist_lock); + + reset_curseg(sbi, type, 1); + curseg->alloc_type = SSR; + __next_free_blkoff(sbi, curseg, 0); + + if (reuse) { + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); + } +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + return v_ops->get_victim(sbi, + &(curseg)->next_segno, BG_GC, type, SSR); + + /* For data segments, let's do SSR more intensively */ + for (; type >= CURSEG_HOT_DATA; type--) + if (v_ops->get_victim(sbi, &(curseg)->next_segno, + BG_GC, type, SSR)) + return 1; + return 0; +} + +/* + * flush out current segment and replace it with new segment + * This function should be returned with success, otherwise BUG + */ +static void allocate_segment_by_default(struct f2fs_sb_info *sbi, + int type, bool force) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + if (force) + new_curseg(sbi, type, true); + else if (type == CURSEG_WARM_NODE) + new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); + else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) + change_curseg(sbi, type, true); + else + new_curseg(sbi, type, false); + + stat_inc_seg_type(sbi, curseg); +} + +static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int old_segno; + + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + locate_dirty_segment(sbi, old_segno); +} + +void allocate_new_segments(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) + __allocate_new_segments(sbi, i); +} + +static const struct segment_allocation default_salloc_ops = { + .allocate_segment = allocate_segment_by_default, +}; + +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) +{ + __u64 start = F2FS_BYTES_TO_BLK(range->start); + __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; + unsigned int start_segno, end_segno; + struct cp_control cpc; + + if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) || + range->len < sbi->blocksize) + return -EINVAL; + + cpc.trimmed = 0; + if (end <= MAIN_BLKADDR(sbi)) + goto out; + + /* start/end segment number in main_area */ + start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); + end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : + GET_SEGNO(sbi, end); + cpc.reason = CP_DISCARD; + cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen); + + /* do checkpoint to issue discard commands safely */ + for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { + cpc.trim_start = start_segno; + cpc.trim_end = min_t(unsigned int, rounddown(start_segno + + BATCHED_TRIM_SEGMENTS(sbi), + sbi->segs_per_sec) - 1, end_segno); + + mutex_lock(&sbi->gc_mutex); + write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + } +out: + range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); + return 0; +} + +static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + if (curseg->next_blkoff < sbi->blocks_per_seg) + return true; + return false; +} + +static int __get_segment_type_2(struct page *page, enum page_type p_type) +{ + if (p_type == DATA) + return CURSEG_HOT_DATA; + else + return CURSEG_HOT_NODE; +} + +static int __get_segment_type_4(struct page *page, enum page_type p_type) +{ + if (p_type == DATA) { + struct inode *inode = page->mapping->host; + + if (S_ISDIR(inode->i_mode)) + return CURSEG_HOT_DATA; + else + return CURSEG_COLD_DATA; + } else { + if (IS_DNODE(page) && is_cold_node(page)) + return CURSEG_WARM_NODE; + else + return CURSEG_COLD_NODE; + } +} + +static int __get_segment_type_6(struct page *page, enum page_type p_type) +{ + if (p_type == DATA) { + struct inode *inode = page->mapping->host; + + if (S_ISDIR(inode->i_mode)) + return CURSEG_HOT_DATA; + else if (is_cold_data(page) || file_is_cold(inode)) + return CURSEG_COLD_DATA; + else + return CURSEG_WARM_DATA; + } else { + if (IS_DNODE(page)) + return is_cold_node(page) ? CURSEG_WARM_NODE : + CURSEG_HOT_NODE; + else + return CURSEG_COLD_NODE; + } +} + +static int __get_segment_type(struct page *page, enum page_type p_type) +{ + switch (F2FS_P_SB(page)->active_logs) { + case 2: + return __get_segment_type_2(page, p_type); + case 4: + return __get_segment_type_4(page, p_type); + } + /* NR_CURSEG_TYPE(6) logs by default */ + f2fs_bug_on(F2FS_P_SB(page), + F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); + return __get_segment_type_6(page, p_type); +} + +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg; + bool direct_io = (type == CURSEG_DIRECT_IO); + + type = direct_io ? CURSEG_WARM_DATA : type; + + curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + mutex_lock(&sit_i->sentry_lock); + + /* direct_io'ed data is aligned to the segment for better performance */ + if (direct_io && curseg->next_blkoff) + __allocate_new_segments(sbi, type); + + *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + /* + * __add_sum_entry should be resided under the curseg_mutex + * because, this function updates a summary entry in the + * current summary block. + */ + __add_sum_entry(sbi, type, sum); + + __refresh_next_blkoff(sbi, curseg); + + stat_inc_block_count(sbi, curseg); + + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. + */ + refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + + mutex_unlock(&sit_i->sentry_lock); + + if (page && IS_NODESEG(type)) + fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + + mutex_unlock(&curseg->curseg_mutex); +} + +static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, + struct f2fs_summary *sum, + struct f2fs_io_info *fio) +{ + int type = __get_segment_type(page, fio->type); + + allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type); + + /* writeout dirty page into bdev */ + f2fs_submit_page_mbio(sbi, page, fio); +} + +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_io_info fio = { + .type = META, + .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .blk_addr = page->index, + }; + + set_page_writeback(page); + f2fs_submit_page_mbio(sbi, page, &fio); +} + +void write_node_page(struct f2fs_sb_info *sbi, struct page *page, + unsigned int nid, struct f2fs_io_info *fio) +{ + struct f2fs_summary sum; + set_summary(&sum, nid, 0, 0); + do_write_page(sbi, page, &sum, fio); +} + +void write_data_page(struct page *page, struct dnode_of_data *dn, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_summary sum; + struct node_info ni; + + f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + do_write_page(sbi, page, &sum, fio); + dn->data_blkaddr = fio->blk_addr; +} + +void rewrite_data_page(struct page *page, struct f2fs_io_info *fio) +{ + stat_inc_inplace_blocks(F2FS_P_SB(page)); + f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio); +} + +void recover_data_page(struct f2fs_sb_info *sbi, + struct page *page, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg; + unsigned int segno, old_cursegno; + struct seg_entry *se; + int type; + + segno = GET_SEGNO(sbi, new_blkaddr); + se = get_seg_entry(sbi, segno); + type = se->type; + + if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (old_blkaddr == NULL_ADDR) + type = CURSEG_COLD_DATA; + else + type = CURSEG_WARM_DATA; + } + curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + mutex_lock(&sit_i->sentry_lock); + + old_cursegno = curseg->segno; + + /* change the current segment */ + if (segno != curseg->segno) { + curseg->next_segno = segno; + change_curseg(sbi, type, true); + } + + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); + __add_sum_entry(sbi, type, sum); + + refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + locate_dirty_segment(sbi, old_cursegno); + + mutex_unlock(&sit_i->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); +} + +static inline bool is_merged_page(struct f2fs_sb_info *sbi, + struct page *page, enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct bio_vec *bvec; + int i; + + down_read(&io->io_rwsem); + if (!io->bio) + goto out; + + bio_for_each_segment_all(bvec, io->bio, i) { + if (page == bvec->bv_page) { + up_read(&io->io_rwsem); + return true; + } + } + +out: + up_read(&io->io_rwsem); + return false; +} + +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type) +{ + if (PageWriteback(page)) { + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + + if (is_merged_page(sbi, page, type)) + f2fs_submit_merged_bio(sbi, type, WRITE); + wait_on_page_writeback(page); + } +} + +static int read_compacted_summaries(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct curseg_info *seg_i; + unsigned char *kaddr; + struct page *page; + block_t start; + int i, j, offset; + + start = start_sum_block(sbi); + + page = get_meta_page(sbi, start++); + kaddr = (unsigned char *)page_address(page); + + /* Step 1: restore nat cache */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); + memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + + /* Step 2: restore sit cache */ + seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); + memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, + SUM_JOURNAL_SIZE); + offset = 2 * SUM_JOURNAL_SIZE; + + /* Step 3: restore summary entries */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + unsigned short blk_off; + unsigned int segno; + + seg_i = CURSEG_I(sbi, i); + segno = le32_to_cpu(ckpt->cur_data_segno[i]); + blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); + seg_i->next_segno = segno; + reset_curseg(sbi, i, 0); + seg_i->alloc_type = ckpt->alloc_type[i]; + seg_i->next_blkoff = blk_off; + + if (seg_i->alloc_type == SSR) + blk_off = sbi->blocks_per_seg; + + for (j = 0; j < blk_off; j++) { + struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); + seg_i->sum_blk->entries[j] = *s; + offset += SUMMARY_SIZE; + if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + SUM_FOOTER_SIZE) + continue; + + f2fs_put_page(page, 1); + page = NULL; + + page = get_meta_page(sbi, start++); + kaddr = (unsigned char *)page_address(page); + offset = 0; + } + } + f2fs_put_page(page, 1); + return 0; +} + +static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_summary_block *sum; + struct curseg_info *curseg; + struct page *new; + unsigned short blk_off; + unsigned int segno = 0; + block_t blk_addr = 0; + + /* get segment number and block addr */ + if (IS_DATASEG(type)) { + segno = le32_to_cpu(ckpt->cur_data_segno[type]); + blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - + CURSEG_HOT_DATA]); + if (__exist_node_summaries(sbi)) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); + else + blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); + } else { + segno = le32_to_cpu(ckpt->cur_node_segno[type - + CURSEG_HOT_NODE]); + blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - + CURSEG_HOT_NODE]); + if (__exist_node_summaries(sbi)) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, + type - CURSEG_HOT_NODE); + else + blk_addr = GET_SUM_BLOCK(sbi, segno); + } + + new = get_meta_page(sbi, blk_addr); + sum = (struct f2fs_summary_block *)page_address(new); + + if (IS_NODESEG(type)) { + if (__exist_node_summaries(sbi)) { + struct f2fs_summary *ns = &sum->entries[0]; + int i; + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { + ns->version = 0; + ns->ofs_in_node = 0; + } + } else { + int err; + + err = restore_node_summary(sbi, segno, sum); + if (err) { + f2fs_put_page(new, 1); + return err; + } + } + } + + /* set uncompleted segment to curseg */ + curseg = CURSEG_I(sbi, type); + mutex_lock(&curseg->curseg_mutex); + memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); + curseg->next_segno = segno; + reset_curseg(sbi, type, 0); + curseg->alloc_type = ckpt->alloc_type[type]; + curseg->next_blkoff = blk_off; + mutex_unlock(&curseg->curseg_mutex); + f2fs_put_page(new, 1); + return 0; +} + +static int restore_curseg_summaries(struct f2fs_sb_info *sbi) +{ + int type = CURSEG_HOT_DATA; + int err; + + if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + int npages = npages_for_summary_flush(sbi, true); + + if (npages >= 2) + ra_meta_pages(sbi, start_sum_block(sbi), npages, + META_CP); + + /* restore for compacted data summary */ + if (read_compacted_summaries(sbi)) + return -EINVAL; + type = CURSEG_HOT_NODE; + } + + if (__exist_node_summaries(sbi)) + ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), + NR_CURSEG_TYPE - type, META_CP); + + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + + return 0; +} + +static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct page *page; + unsigned char *kaddr; + struct f2fs_summary *summary; + struct curseg_info *seg_i; + int written_size = 0; + int i, j; + + page = grab_meta_page(sbi, blkaddr++); + kaddr = (unsigned char *)page_address(page); + + /* Step 1: write nat cache */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); + memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); + written_size += SUM_JOURNAL_SIZE; + + /* Step 2: write sit cache */ + seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); + memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, + SUM_JOURNAL_SIZE); + written_size += SUM_JOURNAL_SIZE; + + /* Step 3: write summary entries */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + unsigned short blkoff; + seg_i = CURSEG_I(sbi, i); + if (sbi->ckpt->alloc_type[i] == SSR) + blkoff = sbi->blocks_per_seg; + else + blkoff = curseg_blkoff(sbi, i); + + for (j = 0; j < blkoff; j++) { + if (!page) { + page = grab_meta_page(sbi, blkaddr++); + kaddr = (unsigned char *)page_address(page); + written_size = 0; + } + summary = (struct f2fs_summary *)(kaddr + written_size); + *summary = seg_i->sum_blk->entries[j]; + written_size += SUMMARY_SIZE; + + if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + SUM_FOOTER_SIZE) + continue; + + set_page_dirty(page); + f2fs_put_page(page, 1); + page = NULL; + } + } + if (page) { + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +static void write_normal_summaries(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + int i, end; + if (IS_DATASEG(type)) + end = type + NR_CURSEG_DATA_TYPE; + else + end = type + NR_CURSEG_NODE_TYPE; + + for (i = type; i < end; i++) { + struct curseg_info *sum = CURSEG_I(sbi, i); + mutex_lock(&sum->curseg_mutex); + write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); + mutex_unlock(&sum->curseg_mutex); + } +} + +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ + if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + write_compacted_summaries(sbi, start_blk); + else + write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); +} + +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ + write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); +} + +int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, + unsigned int val, int alloc) +{ + int i; + + if (type == NAT_JOURNAL) { + for (i = 0; i < nats_in_cursum(sum); i++) { + if (le32_to_cpu(nid_in_journal(sum, i)) == val) + return i; + } + if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) + return update_nats_in_cursum(sum, 1); + } else if (type == SIT_JOURNAL) { + for (i = 0; i < sits_in_cursum(sum); i++) + if (le32_to_cpu(segno_in_journal(sum, i)) == val) + return i; + if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) + return update_sits_in_cursum(sum, 1); + } + return -1; +} + +static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return get_meta_page(sbi, current_sit_addr(sbi, segno)); +} + +static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, + unsigned int start) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct page *src_page, *dst_page; + pgoff_t src_off, dst_off; + void *src_addr, *dst_addr; + + src_off = current_sit_addr(sbi, start); + dst_off = next_sit_addr(sbi, src_off); + + /* get current sit block page without lock */ + src_page = get_meta_page(sbi, src_off); + dst_page = grab_meta_page(sbi, dst_off); + f2fs_bug_on(sbi, PageDirty(src_page)); + + src_addr = page_address(src_page); + dst_addr = page_address(dst_page); + memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + + set_page_dirty(dst_page); + f2fs_put_page(src_page, 1); + + set_to_next_sit(sit_i, start); + + return dst_page; +} + +static struct sit_entry_set *grab_sit_entry_set(void) +{ + struct sit_entry_set *ses = + f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC); + + ses->entry_cnt = 0; + INIT_LIST_HEAD(&ses->set_list); + return ses; +} + +static void release_sit_entry_set(struct sit_entry_set *ses) +{ + list_del(&ses->set_list); + kmem_cache_free(sit_entry_set_slab, ses); +} + +static void adjust_sit_entry_set(struct sit_entry_set *ses, + struct list_head *head) +{ + struct sit_entry_set *next = ses; + + if (list_is_last(&ses->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (ses->entry_cnt <= next->entry_cnt) + break; + + list_move_tail(&ses->set_list, &next->set_list); +} + +static void add_sit_entry(unsigned int segno, struct list_head *head) +{ + struct sit_entry_set *ses; + unsigned int start_segno = START_SEGNO(segno); + + list_for_each_entry(ses, head, set_list) { + if (ses->start_segno == start_segno) { + ses->entry_cnt++; + adjust_sit_entry_set(ses, head); + return; + } + } + + ses = grab_sit_entry_set(); + + ses->start_segno = start_segno; + ses->entry_cnt++; + list_add(&ses->set_list, head); +} + +static void add_sits_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + struct list_head *set_list = &sm_info->sit_entry_set; + unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; + unsigned int segno; + + for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) + add_sit_entry(segno, set_list); +} + +static void remove_sits_in_journal(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + int i; + + for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + unsigned int segno; + bool dirtied; + + segno = le32_to_cpu(segno_in_journal(sum, i)); + dirtied = __mark_sit_entry_dirty(sbi, segno); + + if (!dirtied) + add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); + } + update_sits_in_cursum(sum, -sits_in_cursum(sum)); +} + +/* + * CP calls this function, which flushes SIT entries including sit_journal, + * and moves prefree segs to free segs. + */ +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned long *bitmap = sit_i->dirty_sentries_bitmap; + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + struct sit_entry_set *ses, *tmp; + struct list_head *head = &SM_I(sbi)->sit_entry_set; + bool to_journal = true; + struct seg_entry *se; + + mutex_lock(&curseg->curseg_mutex); + mutex_lock(&sit_i->sentry_lock); + + if (!sit_i->dirty_sentries) + goto out; + + /* + * add and account sit entries of dirty bitmap in sit entry + * set temporarily + */ + add_sits_in_set(sbi); + + /* + * if there are no enough space in journal to store dirty sit + * entries, remove all entries from journal and add and account + * them in sit entry set. + */ + if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) + remove_sits_in_journal(sbi); + + /* + * there are two steps to flush sit entries: + * #1, flush sit entries to journal in current cold data summary block. + * #2, flush sit entries to sit page. + */ + list_for_each_entry_safe(ses, tmp, head, set_list) { + struct page *page = NULL; + struct f2fs_sit_block *raw_sit = NULL; + unsigned int start_segno = ses->start_segno; + unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + unsigned int segno = start_segno; + + if (to_journal && + !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) + to_journal = false; + + if (!to_journal) { + page = get_next_sit_page(sbi, start_segno); + raw_sit = page_address(page); + } + + /* flush dirty sit entries in region of current sit set */ + for_each_set_bit_from(segno, bitmap, end) { + int offset, sit_offset; + + se = get_seg_entry(sbi, segno); + + /* add discard candidates */ + if (cpc->reason != CP_DISCARD) { + cpc->trim_start = segno; + add_discard_addrs(sbi, cpc); + } + + if (to_journal) { + offset = lookup_journal_in_cursum(sum, + SIT_JOURNAL, segno, 1); + f2fs_bug_on(sbi, offset < 0); + segno_in_journal(sum, offset) = + cpu_to_le32(segno); + seg_info_to_raw_sit(se, + &sit_in_journal(sum, offset)); + } else { + sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + seg_info_to_raw_sit(se, + &raw_sit->entries[sit_offset]); + } + + __clear_bit(segno, bitmap); + sit_i->dirty_sentries--; + ses->entry_cnt--; + } + + if (!to_journal) + f2fs_put_page(page, 1); + + f2fs_bug_on(sbi, ses->entry_cnt); + release_sit_entry_set(ses); + } + + f2fs_bug_on(sbi, !list_empty(head)); + f2fs_bug_on(sbi, sit_i->dirty_sentries); +out: + if (cpc->reason == CP_DISCARD) { + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) + add_discard_addrs(sbi, cpc); + } + mutex_unlock(&sit_i->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + + set_prefree_as_free_segments(sbi); +} + +static int build_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct sit_info *sit_i; + unsigned int sit_segs, start; + char *src_bitmap, *dst_bitmap; + unsigned int bitmap_size; + + /* allocate memory for SIT information */ + sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL); + if (!sit_i) + return -ENOMEM; + + SM_I(sbi)->sit_info = sit_i; + + sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry)); + if (!sit_i->sentries) + return -ENOMEM; + + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!sit_i->dirty_sentries_bitmap) + return -ENOMEM; + + for (start = 0; start < MAIN_SEGS(sbi); start++) { + sit_i->sentries[start].cur_valid_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + sit_i->sentries[start].ckpt_valid_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map + || !sit_i->sentries[start].ckpt_valid_map) + return -ENOMEM; + } + + sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->tmp_map) + return -ENOMEM; + + if (sbi->segs_per_sec > 1) { + sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * + sizeof(struct sec_entry)); + if (!sit_i->sec_entries) + return -ENOMEM; + } + + /* get information related with SIT */ + sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; + + /* setup SIT bitmap from ckeckpoint pack */ + bitmap_size = __bitmap_size(sbi, SIT_BITMAP); + src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); + + dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!dst_bitmap) + return -ENOMEM; + + /* init SIT information */ + sit_i->s_ops = &default_salloc_ops; + + sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); + sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; + sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->sit_bitmap = dst_bitmap; + sit_i->bitmap_size = bitmap_size; + sit_i->dirty_sentries = 0; + sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; + sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); + sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; + mutex_init(&sit_i->sentry_lock); + return 0; +} + +static int build_free_segmap(struct f2fs_sb_info *sbi) +{ + struct free_segmap_info *free_i; + unsigned int bitmap_size, sec_bitmap_size; + + /* allocate memory for free segmap information */ + free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL); + if (!free_i) + return -ENOMEM; + + SM_I(sbi)->free_info = free_i; + + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); + if (!free_i->free_segmap) + return -ENOMEM; + + sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); + if (!free_i->free_secmap) + return -ENOMEM; + + /* set all segments as dirty temporarily */ + memset(free_i->free_segmap, 0xff, bitmap_size); + memset(free_i->free_secmap, 0xff, sec_bitmap_size); + + /* init free segmap information */ + free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); + free_i->free_segments = 0; + free_i->free_sections = 0; + spin_lock_init(&free_i->segmap_lock); + return 0; +} + +static int build_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *array; + int i; + + array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL); + if (!array) + return -ENOMEM; + + SM_I(sbi)->curseg_array = array; + + for (i = 0; i < NR_CURSEG_TYPE; i++) { + mutex_init(&array[i].curseg_mutex); + array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (!array[i].sum_blk) + return -ENOMEM; + array[i].segno = NULL_SEGNO; + array[i].next_blkoff = 0; + } + return restore_curseg_summaries(sbi); +} + +static void build_sit_entries(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int nrpages = MAX_BIO_BLOCKS(sbi); + + do { + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < MAIN_SEGS(sbi); start++) { + struct seg_entry *se = &sit_i->sentries[start]; + struct f2fs_sit_block *sit_blk; + struct f2fs_sit_entry sit; + struct page *page; + + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < sits_in_cursum(sum); i++) { + if (le32_to_cpu(segno_in_journal(sum, i)) + == start) { + sit = sit_in_journal(sum, i); + mutex_unlock(&curseg->curseg_mutex); + goto got_it; + } + } + mutex_unlock(&curseg->curseg_mutex); + + page = get_current_sit_page(sbi, start); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); +got_it: + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + if (sbi->segs_per_sec > 1) { + struct sec_entry *e = get_sec_entry(sbi, start); + e->valid_blocks += se->valid_blocks; + } + } + start_blk += readed; + } while (start_blk < sit_blk_cnt); +} + +static void init_free_segmap(struct f2fs_sb_info *sbi) +{ + unsigned int start; + int type; + + for (start = 0; start < MAIN_SEGS(sbi); start++) { + struct seg_entry *sentry = get_seg_entry(sbi, start); + if (!sentry->valid_blocks) + __set_free(sbi, start); + } + + /* set use the current segments */ + for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { + struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); + } +} + +static void init_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int segno = 0, offset = 0; + unsigned short valid_blocks; + + while (1) { + /* find dirty segment based on free segmap */ + segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); + if (segno >= MAIN_SEGS(sbi)) + break; + offset = segno + 1; + valid_blocks = get_valid_blocks(sbi, segno, 0); + if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) + continue; + if (valid_blocks > sbi->blocks_per_seg) { + f2fs_bug_on(sbi, 1); + continue; + } + mutex_lock(&dirty_i->seglist_lock); + __locate_dirty_segment(sbi, segno, DIRTY); + mutex_unlock(&dirty_i->seglist_lock); + } +} + +static int init_victim_secmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dirty_i->victim_secmap) + return -ENOMEM; + return 0; +} + +static int build_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i; + unsigned int bitmap_size, i; + + /* allocate memory for dirty segments list information */ + dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL); + if (!dirty_i) + return -ENOMEM; + + SM_I(sbi)->dirty_info = dirty_i; + mutex_init(&dirty_i->seglist_lock); + + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + + for (i = 0; i < NR_DIRTY_TYPE; i++) { + dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); + if (!dirty_i->dirty_segmap[i]) + return -ENOMEM; + } + + init_dirty_segmap(sbi); + return init_victim_secmap(sbi); +} + +/* + * Update min, max modified time for cost-benefit GC algorithm + */ +static void init_min_max_mtime(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno; + + mutex_lock(&sit_i->sentry_lock); + + sit_i->min_mtime = LLONG_MAX; + + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + unsigned int i; + unsigned long long mtime = 0; + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, segno + i)->mtime; + + mtime = div_u64(mtime, sbi->segs_per_sec); + + if (sit_i->min_mtime > mtime) + sit_i->min_mtime = mtime; + } + sit_i->max_mtime = get_mtime(sbi); + mutex_unlock(&sit_i->sentry_lock); +} + +int build_segment_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_sm_info *sm_info; + int err; + + sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL); + if (!sm_info) + return -ENOMEM; + + /* init sm info */ + sbi->sm_info = sm_info; + sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + sm_info->segment_count = le32_to_cpu(raw_super->segment_count); + sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); + sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + + INIT_LIST_HEAD(&sm_info->discard_list); + sm_info->nr_discards = 0; + sm_info->max_discards = 0; + + sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; + + INIT_LIST_HEAD(&sm_info->sit_entry_set); + + if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + err = create_flush_cmd_control(sbi); + if (err) + return err; + } + + err = build_sit_info(sbi); + if (err) + return err; + err = build_free_segmap(sbi); + if (err) + return err; + err = build_curseg(sbi); + if (err) + return err; + + /* reinit free segmap based on SIT */ + build_sit_entries(sbi); + + init_free_segmap(sbi); + err = build_dirty_segmap(sbi); + if (err) + return err; + + init_min_max_mtime(sbi); + return 0; +} + +static void discard_dirty_segmap(struct f2fs_sb_info *sbi, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + mutex_lock(&dirty_i->seglist_lock); + kfree(dirty_i->dirty_segmap[dirty_type]); + dirty_i->nr_dirty[dirty_type] = 0; + mutex_unlock(&dirty_i->seglist_lock); +} + +static void destroy_victim_secmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kfree(dirty_i->victim_secmap); +} + +static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + if (!dirty_i) + return; + + /* discard pre-free/dirty segments list */ + for (i = 0; i < NR_DIRTY_TYPE; i++) + discard_dirty_segmap(sbi, i); + + destroy_victim_secmap(sbi); + SM_I(sbi)->dirty_info = NULL; + kfree(dirty_i); +} + +static void destroy_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *array = SM_I(sbi)->curseg_array; + int i; + + if (!array) + return; + SM_I(sbi)->curseg_array = NULL; + for (i = 0; i < NR_CURSEG_TYPE; i++) + kfree(array[i].sum_blk); + kfree(array); +} + +static void destroy_free_segmap(struct f2fs_sb_info *sbi) +{ + struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) + return; + SM_I(sbi)->free_info = NULL; + kfree(free_i->free_segmap); + kfree(free_i->free_secmap); + kfree(free_i); +} + +static void destroy_sit_info(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int start; + + if (!sit_i) + return; + + if (sit_i->sentries) { + for (start = 0; start < MAIN_SEGS(sbi); start++) { + kfree(sit_i->sentries[start].cur_valid_map); + kfree(sit_i->sentries[start].ckpt_valid_map); + } + } + kfree(sit_i->tmp_map); + + vfree(sit_i->sentries); + vfree(sit_i->sec_entries); + kfree(sit_i->dirty_sentries_bitmap); + + SM_I(sbi)->sit_info = NULL; + kfree(sit_i->sit_bitmap); + kfree(sit_i); +} + +void destroy_segment_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + + if (!sm_info) + return; + destroy_flush_cmd_control(sbi); + destroy_dirty_segmap(sbi); + destroy_curseg(sbi); + destroy_free_segmap(sbi); + destroy_sit_info(sbi); + sbi->sm_info = NULL; + kfree(sm_info); +} + +int __init create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + sizeof(struct discard_entry)); + if (!discard_entry_slab) + goto fail; + + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", + sizeof(struct sit_entry_set)); + if (!sit_entry_set_slab) + goto destory_discard_entry; + + inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", + sizeof(struct inmem_pages)); + if (!inmem_entry_slab) + goto destroy_sit_entry_set; + return 0; + +destroy_sit_entry_set: + kmem_cache_destroy(sit_entry_set_slab); +destory_discard_entry: + kmem_cache_destroy(discard_entry_slab); +fail: + return -ENOMEM; +} + +void destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(inmem_entry_slab); +} diff --git a/kernel/fs/f2fs/segment.h b/kernel/fs/f2fs/segment.h new file mode 100644 index 000000000..85d7fa751 --- /dev/null +++ b/kernel/fs/f2fs/segment.h @@ -0,0 +1,751 @@ +/* + * fs/f2fs/segment.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include + +/* constant macro */ +#define NULL_SEGNO ((unsigned int)(~0)) +#define NULL_SECNO ((unsigned int)(~0)) + +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ + +/* L: Logical segment # in volume, R: Relative segment # in main area */ +#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) +#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) + +#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) + +#define IS_CURSEG(sbi, seg) \ + ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + +#define IS_CURSEC(sbi, secno) \ + ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + sbi->segs_per_sec)) \ + +#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) +#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) + +#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) +#define MAIN_SECS(sbi) (sbi->total_sections) + +#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) + +#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) +#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ + sbi->log_blocks_per_seg)) + +#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ + (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + +#define NEXT_FREE_BLKADDR(sbi, curseg) \ + (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + +#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) +#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + +#define GET_SEGNO(sbi, blk_addr) \ + (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ + GET_SEGNO_FROM_SEG0(sbi, blk_addr))) +#define GET_SECNO(sbi, segno) \ + ((segno) / sbi->segs_per_sec) +#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ + ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) + +#define GET_SUM_BLOCK(sbi, segno) \ + ((sbi->sm_info->ssa_blkaddr) + segno) + +#define GET_SUM_TYPE(footer) ((footer)->entry_type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) + +#define SIT_ENTRY_OFFSET(sit_i, segno) \ + (segno % sit_i->sents_per_block) +#define SIT_BLOCK_OFFSET(segno) \ + (segno / SIT_ENTRY_PER_BLOCK) +#define START_SEGNO(segno) \ + (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) +#define f2fs_bitmap_size(nr) \ + (BITS_TO_LONGS(nr) * sizeof(unsigned long)) + +#define SECTOR_FROM_BLOCK(blk_addr) \ + (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) +#define SECTOR_TO_BLOCK(sectors) \ + (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) +#define MAX_BIO_BLOCKS(sbi) \ + ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) + +/* + * indicate a block allocation direction: RIGHT and LEFT. + * RIGHT means allocating new sections towards the end of volume. + * LEFT means the opposite direction. + */ +enum { + ALLOC_RIGHT = 0, + ALLOC_LEFT +}; + +/* + * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * LFS writes data sequentially with cleaning operations. + * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + */ +enum { + LFS = 0, + SSR +}; + +/* + * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * GC_CB is based on cost-benefit algorithm. + * GC_GREEDY is based on greedy algorithm. + */ +enum { + GC_CB = 0, + GC_GREEDY +}; + +/* + * BG_GC means the background cleaning job. + * FG_GC means the on-demand cleaning job. + */ +enum { + BG_GC = 0, + FG_GC +}; + +/* for a function parameter to select a victim segment */ +struct victim_sel_policy { + int alloc_mode; /* LFS or SSR */ + int gc_mode; /* GC_CB or GC_GREEDY */ + unsigned long *dirty_segmap; /* dirty segment bitmap */ + unsigned int max_search; /* maximum # of segments to search */ + unsigned int offset; /* last scanned bitmap offset */ + unsigned int ofs_unit; /* bitmap search unit */ + unsigned int min_cost; /* minimum cost */ + unsigned int min_segno; /* segment # having min. cost */ +}; + +struct seg_entry { + unsigned short valid_blocks; /* # of valid blocks */ + unsigned char *cur_valid_map; /* validity bitmap of blocks */ + /* + * # of valid blocks and the validity bitmap stored in the the last + * checkpoint pack. This information is used by the SSR mode. + */ + unsigned short ckpt_valid_blocks; + unsigned char *ckpt_valid_map; + unsigned char type; /* segment type like CURSEG_XXX_TYPE */ + unsigned long long mtime; /* modification time of the segment */ +}; + +struct sec_entry { + unsigned int valid_blocks; /* # of valid blocks in a section */ +}; + +struct segment_allocation { + void (*allocate_segment)(struct f2fs_sb_info *, int, bool); +}; + +struct inmem_pages { + struct list_head list; + struct page *page; +}; + +struct sit_info { + const struct segment_allocation *s_ops; + + block_t sit_base_addr; /* start block address of SIT area */ + block_t sit_blocks; /* # of blocks used by SIT area */ + block_t written_valid_blocks; /* # of valid blocks in main area */ + char *sit_bitmap; /* SIT bitmap pointer */ + unsigned int bitmap_size; /* SIT bitmap size */ + + unsigned long *tmp_map; /* bitmap for temporal use */ + unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ + unsigned int dirty_sentries; /* # of dirty sentries */ + unsigned int sents_per_block; /* # of SIT entries per block */ + struct mutex sentry_lock; /* to protect SIT cache */ + struct seg_entry *sentries; /* SIT segment-level cache */ + struct sec_entry *sec_entries; /* SIT section-level cache */ + + /* for cost-benefit algorithm in cleaning procedure */ + unsigned long long elapsed_time; /* elapsed time after mount */ + unsigned long long mounted_time; /* mount time */ + unsigned long long min_mtime; /* min. modification time */ + unsigned long long max_mtime; /* max. modification time */ +}; + +struct free_segmap_info { + unsigned int start_segno; /* start segment number logically */ + unsigned int free_segments; /* # of free segments */ + unsigned int free_sections; /* # of free sections */ + spinlock_t segmap_lock; /* free segmap lock */ + unsigned long *free_segmap; /* free segment bitmap */ + unsigned long *free_secmap; /* free section bitmap */ +}; + +/* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */ +enum dirty_type { + DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */ + DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */ + DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */ + DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */ + DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */ + DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */ + DIRTY, /* to count # of dirty segments */ + PRE, /* to count # of entirely obsolete segments */ + NR_DIRTY_TYPE +}; + +struct dirty_seglist_info { + const struct victim_selection *v_ops; /* victim selction operation */ + unsigned long *dirty_segmap[NR_DIRTY_TYPE]; + struct mutex seglist_lock; /* lock for segment bitmaps */ + int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ + unsigned long *victim_secmap; /* background GC victims */ +}; + +/* victim selection function for cleaning and SSR */ +struct victim_selection { + int (*get_victim)(struct f2fs_sb_info *, unsigned int *, + int, int, char); +}; + +/* for active log information */ +struct curseg_info { + struct mutex curseg_mutex; /* lock for consistency */ + struct f2fs_summary_block *sum_blk; /* cached summary block */ + unsigned char alloc_type; /* current allocation type */ + unsigned int segno; /* current segment number */ + unsigned short next_blkoff; /* next block offset to write */ + unsigned int zone; /* current zone number */ + unsigned int next_segno; /* preallocated segment */ +}; + +struct sit_entry_set { + struct list_head set_list; /* link with all sit sets */ + unsigned int start_segno; /* start segno of sits in set */ + unsigned int entry_cnt; /* the # of sit entries in set */ +}; + +/* + * inline functions + */ +static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) +{ + return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); +} + +static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + return &sit_i->sentries[segno]; +} + +static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; +} + +static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno, int section) +{ + /* + * In order to get # of valid blocks in a section instantly from many + * segments, f2fs manages two counting structures separately. + */ + if (section > 1) + return get_sec_entry(sbi, segno)->valid_blocks; + else + return get_seg_entry(sbi, segno)->valid_blocks; +} + +static inline void seg_info_from_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + se->valid_blocks = GET_SIT_VBLOCKS(rs); + se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); + memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); + se->type = GET_SIT_TYPE(rs); + se->mtime = le64_to_cpu(rs->mtime); +} + +static inline void seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | + se->valid_blocks; + rs->vblocks = cpu_to_le16(raw_vblocks); + memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); + se->ckpt_valid_blocks = se->valid_blocks; + rs->mtime = cpu_to_le64(se->mtime); +} + +static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, + unsigned int max, unsigned int segno) +{ + unsigned int ret; + spin_lock(&free_i->segmap_lock); + ret = find_next_bit(free_i->free_segmap, max, segno); + spin_unlock(&free_i->segmap_lock); + return ret; +} + +static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = segno / sbi->segs_per_sec; + unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int next; + + spin_lock(&free_i->segmap_lock); + clear_bit(segno, free_i->free_segmap); + free_i->free_segments++; + + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); + if (next >= start_segno + sbi->segs_per_sec) { + clear_bit(secno, free_i->free_secmap); + free_i->free_sections++; + } + spin_unlock(&free_i->segmap_lock); +} + +static inline void __set_inuse(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = segno / sbi->segs_per_sec; + set_bit(segno, free_i->free_segmap); + free_i->free_segments--; + if (!test_and_set_bit(secno, free_i->free_secmap)) + free_i->free_sections--; +} + +static inline void __set_test_and_free(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = segno / sbi->segs_per_sec; + unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int next; + + spin_lock(&free_i->segmap_lock); + if (test_and_clear_bit(segno, free_i->free_segmap)) { + free_i->free_segments++; + + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); + if (next >= start_segno + sbi->segs_per_sec) { + if (test_and_clear_bit(secno, free_i->free_secmap)) + free_i->free_sections++; + } + } + spin_unlock(&free_i->segmap_lock); +} + +static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = segno / sbi->segs_per_sec; + spin_lock(&free_i->segmap_lock); + if (!test_and_set_bit(segno, free_i->free_segmap)) { + free_i->free_segments--; + if (!test_and_set_bit(secno, free_i->free_secmap)) + free_i->free_sections--; + } + spin_unlock(&free_i->segmap_lock); +} + +static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, + void *dst_addr) +{ + struct sit_info *sit_i = SIT_I(sbi); + memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); +} + +static inline block_t written_block_count(struct f2fs_sb_info *sbi) +{ + return SIT_I(sbi)->written_valid_blocks; +} + +static inline unsigned int free_segments(struct f2fs_sb_info *sbi) +{ + return FREE_I(sbi)->free_segments; +} + +static inline int reserved_segments(struct f2fs_sb_info *sbi) +{ + return SM_I(sbi)->reserved_segments; +} + +static inline unsigned int free_sections(struct f2fs_sb_info *sbi) +{ + return FREE_I(sbi)->free_sections; +} + +static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) +{ + return DIRTY_I(sbi)->nr_dirty[PRE]; +} + +static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi) +{ + return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] + + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] + + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE]; +} + +static inline int overprovision_segments(struct f2fs_sb_info *sbi) +{ + return SM_I(sbi)->ovp_segments; +} + +static inline int overprovision_sections(struct f2fs_sb_info *sbi) +{ + return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; +} + +static inline int reserved_sections(struct f2fs_sb_info *sbi) +{ + return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; +} + +static inline bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + return free_sections(sbi) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi) + 1); +} + +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return false; + + return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi)); +} + +static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) +{ + return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; +} + +static inline int utilization(struct f2fs_sb_info *sbi) +{ + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); +} + +/* + * Sometimes f2fs may be better to drop out-of-place update policy. + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash + * storages. IPU will be triggered only if the # of dirty + * pages over min_fsync_blocks. + * F2FS_IPUT_DISABLE - disable IPU. (=default option) + */ +#define DEF_MIN_IPU_UTIL 70 +#define DEF_MIN_FSYNC_BLOCKS 8 + +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_FSYNC, +}; + +static inline bool need_inplace_update(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int policy = SM_I(sbi)->ipu_policy; + + /* IPU can be done only for the user data */ + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + + if (policy & (0x1 << F2FS_IPU_FORCE)) + return true; + if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) + return true; + if (policy & (0x1 << F2FS_IPU_UTIL) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + + /* this is only set during fdatasync */ + if (policy & (0x1 << F2FS_IPU_FSYNC) && + is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + return true; + + return false; +} + +static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, + int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->segno; +} + +static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, + int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->alloc_type; +} + +static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->next_blkoff; +} + +#ifdef CONFIG_F2FS_CHECK_FS +static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +{ + BUG_ON(segno > TOTAL_SEGS(sbi) - 1); +} + +static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +{ + BUG_ON(blk_addr < SEG0_BLKADDR(sbi)); + BUG_ON(blk_addr >= MAX_BLKADDR(sbi)); +} + +/* + * Summary block is always treated as an invalid block + */ +static inline void check_block_count(struct f2fs_sb_info *sbi, + int segno, struct f2fs_sit_entry *raw_sit) +{ + bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; + int valid_blocks = 0; + int cur_pos = 0, next_pos; + + /* check segment usage */ + BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); + + /* check boundary of a given segment number */ + BUG_ON(segno > TOTAL_SEGS(sbi) - 1); + + /* check bitmap with valid block count */ + do { + if (is_valid) { + next_pos = find_next_zero_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + valid_blocks += next_pos - cur_pos; + } else + next_pos = find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + cur_pos = next_pos; + is_valid = !is_valid; + } while (cur_pos < sbi->blocks_per_seg); + BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); +} +#else +static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +{ + if (segno > TOTAL_SEGS(sbi) - 1) + set_sbi_flag(sbi, SBI_NEED_FSCK); +} + +static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +{ + if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) + set_sbi_flag(sbi, SBI_NEED_FSCK); +} + +/* + * Summary block is always treated as an invalid block + */ +static inline void check_block_count(struct f2fs_sb_info *sbi, + int segno, struct f2fs_sit_entry *raw_sit) +{ + /* check segment usage */ + if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) + set_sbi_flag(sbi, SBI_NEED_FSCK); + + /* check boundary of a given segment number */ + if (segno > TOTAL_SEGS(sbi) - 1) + set_sbi_flag(sbi, SBI_NEED_FSCK); +} +#endif + +static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, + unsigned int start) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int offset = SIT_BLOCK_OFFSET(start); + block_t blk_addr = sit_i->sit_base_addr + offset; + + check_seg_range(sbi, start); + + /* calculate sit block address */ + if (f2fs_test_bit(offset, sit_i->sit_bitmap)) + blk_addr += sit_i->sit_blocks; + + return blk_addr; +} + +static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, + pgoff_t block_addr) +{ + struct sit_info *sit_i = SIT_I(sbi); + block_addr -= sit_i->sit_base_addr; + if (block_addr < sit_i->sit_blocks) + block_addr += sit_i->sit_blocks; + else + block_addr -= sit_i->sit_blocks; + + return block_addr + sit_i->sit_base_addr; +} + +static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) +{ + unsigned int block_off = SIT_BLOCK_OFFSET(start); + + f2fs_change_bit(block_off, sit_i->sit_bitmap); +} + +static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - + sit_i->mounted_time; +} + +static inline void set_summary(struct f2fs_summary *sum, nid_t nid, + unsigned int ofs_in_node, unsigned char version) +{ + sum->nid = cpu_to_le32(nid); + sum->ofs_in_node = cpu_to_le16(ofs_in_node); + sum->version = version; +} + +static inline block_t start_sum_block(struct f2fs_sb_info *sbi) +{ + return __start_cp_addr(sbi) + + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); +} + +static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) +{ + return __start_cp_addr(sbi) + + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) + - (base + 1) + type; +} + +static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) +{ + if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + return true; + return false; +} + +static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + return SECTOR_TO_BLOCK(queue_max_sectors(q)); +} + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 3 for three types of nodes, and + * max_bio_blocks for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (sbi->sb->s_bdi->dirty_exceeded) + return 0; + + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 3 * sbi->blocks_per_seg; + else if (type == META) + return MAX_BIO_BLOCKS(sbi); + else + return 0; +} + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + + if (type == DATA) + desired = 4096; + else if (type == NODE) + desired = 3 * max_hw_blocks(sbi); + else + desired = MAX_BIO_BLOCKS(sbi); + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} diff --git a/kernel/fs/f2fs/super.c b/kernel/fs/f2fs/super.c new file mode 100644 index 000000000..b2dd1b01f --- /dev/null +++ b/kernel/fs/f2fs/super.c @@ -0,0 +1,1350 @@ +/* + * fs/f2fs/super.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "gc.h" +#include "trace.h" + +#define CREATE_TRACE_POINTS +#include + +static struct proc_dir_entry *f2fs_proc_root; +static struct kmem_cache *f2fs_inode_cachep; +static struct kset *f2fs_kset; + +enum { + Opt_gc_background, + Opt_disable_roll_forward, + Opt_norecovery, + Opt_discard, + Opt_noheap, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_acl, + Opt_noacl, + Opt_active_logs, + Opt_disable_ext_identify, + Opt_inline_xattr, + Opt_inline_data, + Opt_inline_dentry, + Opt_flush_merge, + Opt_nobarrier, + Opt_fastboot, + Opt_extent_cache, + Opt_noinline_data, + Opt_err, +}; + +static match_table_t f2fs_tokens = { + {Opt_gc_background, "background_gc=%s"}, + {Opt_disable_roll_forward, "disable_roll_forward"}, + {Opt_norecovery, "norecovery"}, + {Opt_discard, "discard"}, + {Opt_noheap, "no_heap"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_active_logs, "active_logs=%u"}, + {Opt_disable_ext_identify, "disable_ext_identify"}, + {Opt_inline_xattr, "inline_xattr"}, + {Opt_inline_data, "inline_data"}, + {Opt_inline_dentry, "inline_dentry"}, + {Opt_flush_merge, "flush_merge"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_fastboot, "fastboot"}, + {Opt_extent_cache, "extent_cache"}, + {Opt_noinline_data, "noinline_data"}, + {Opt_err, NULL}, +}; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI) + return (unsigned char *)sbi; + return NULL; +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; + *ui = t; + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + +static void init_once(void *foo) +{ + struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; + + inode_init_once(&fi->vfs_inode); +} + +static int parse_options(struct super_block *sb, char *options) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + int arg = 0; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, f2fs_tokens, args); + + switch (token) { + case Opt_gc_background: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 2 && !strncmp(name, "on", 2)) + set_opt(sbi, BG_GC); + else if (strlen(name) == 3 && !strncmp(name, "off", 3)) + clear_opt(sbi, BG_GC); + else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_disable_roll_forward: + set_opt(sbi, DISABLE_ROLL_FORWARD); + break; + case Opt_norecovery: + /* this option mounts f2fs with ro */ + set_opt(sbi, DISABLE_ROLL_FORWARD); + if (!f2fs_readonly(sb)) + return -EINVAL; + break; + case Opt_discard: + set_opt(sbi, DISCARD); + break; + case Opt_noheap: + set_opt(sbi, NOHEAP); + break; +#ifdef CONFIG_F2FS_FS_XATTR + case Opt_user_xattr: + set_opt(sbi, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(sbi, XATTR_USER); + break; + case Opt_inline_xattr: + set_opt(sbi, INLINE_XATTR); + break; +#else + case Opt_user_xattr: + f2fs_msg(sb, KERN_INFO, + "user_xattr options not supported"); + break; + case Opt_nouser_xattr: + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); + break; + case Opt_inline_xattr: + f2fs_msg(sb, KERN_INFO, + "inline_xattr options not supported"); + break; +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi, POSIX_ACL); + break; +#else + case Opt_acl: + f2fs_msg(sb, KERN_INFO, "acl options not supported"); + break; + case Opt_noacl: + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); + break; +#endif + case Opt_active_logs: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + return -EINVAL; + sbi->active_logs = arg; + break; + case Opt_disable_ext_identify: + set_opt(sbi, DISABLE_EXT_IDENTIFY); + break; + case Opt_inline_data: + set_opt(sbi, INLINE_DATA); + break; + case Opt_inline_dentry: + set_opt(sbi, INLINE_DENTRY); + break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; + case Opt_nobarrier: + set_opt(sbi, NOBARRIER); + break; + case Opt_fastboot: + set_opt(sbi, FASTBOOT); + break; + case Opt_extent_cache: + set_opt(sbi, EXTENT_CACHE); + break; + case Opt_noinline_data: + clear_opt(sbi, INLINE_DATA); + break; + default: + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); + return -EINVAL; + } + } + return 0; +} + +static struct inode *f2fs_alloc_inode(struct super_block *sb) +{ + struct f2fs_inode_info *fi; + + fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); + if (!fi) + return NULL; + + init_once((void *) fi); + + /* Initialize f2fs-specific inode info */ + fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); + fi->i_current_depth = 1; + fi->i_advise = 0; + rwlock_init(&fi->ext_lock); + init_rwsem(&fi->i_sem); + INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); + INIT_LIST_HEAD(&fi->inmem_pages); + mutex_init(&fi->inmem_lock); + + set_inode_flag(fi, FI_NEW_INODE); + + if (test_opt(F2FS_SB(sb), INLINE_XATTR)) + set_inode_flag(fi, FI_INLINE_XATTR); + + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + + return &fi->vfs_inode; +} + +static int f2fs_drop_inode(struct inode *inode) +{ + /* + * This is to avoid a deadlock condition like below. + * writeback_single_inode(inode) + * - f2fs_write_data_page + * - f2fs_gc -> iput -> evict + * - inode_wait_for_writeback(inode) + */ + if (!inode_unhashed(inode) && inode->i_state & I_SYNC) + return 0; + return generic_drop_inode(inode); +} + +/* + * f2fs_dirty_inode() is called from __mark_inode_dirty() + * + * We should call set_dirty_inode to write the dirty inode through write_inode. + */ +static void f2fs_dirty_inode(struct inode *inode, int flags) +{ + set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); +} + +static void f2fs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode)); +} + +static void f2fs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, f2fs_i_callback); +} + +static void f2fs_put_super(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); + + f2fs_destroy_stats(sbi); + stop_gc_thread(sbi); + + /* + * We don't need to do checkpoint when superblock is clean. + * But, the previous checkpoint was not done by umount, it needs to do + * clean checkpoint again. + */ + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + write_checkpoint(sbi, &cpc); + } + + /* + * normally superblock is clean, so we need to release this. + * In addition, EIO will skip do checkpoint, we need this as well. + */ + release_dirty_inode(sbi); + release_discard_addrs(sbi); + + iput(sbi->node_inode); + iput(sbi->meta_inode); + + /* destroy f2fs internal modules */ + destroy_node_manager(sbi); + destroy_segment_manager(sbi); + + kfree(sbi->ckpt); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + + sb->s_fs_info = NULL; + brelse(sbi->raw_super_buf); + kfree(sbi); +} + +int f2fs_sync_fs(struct super_block *sb, int sync) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + trace_f2fs_sync_fs(sb, sync); + + if (sync) { + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + + mutex_lock(&sbi->gc_mutex); + write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + } else { + f2fs_balance_fs(sbi); + } + f2fs_trace_ios(NULL, NULL, 1); + + return 0; +} + +static int f2fs_freeze(struct super_block *sb) +{ + int err; + + if (f2fs_readonly(sb)) + return 0; + + err = f2fs_sync_fs(sb, 1); + return err; +} + +static int f2fs_unfreeze(struct super_block *sb) +{ + return 0; +} + +static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + block_t total_count, user_block_count, start_count, ovp_count; + + total_count = le64_to_cpu(sbi->raw_super->block_count); + user_block_count = sbi->user_block_count; + start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + buf->f_type = F2FS_SUPER_MAGIC; + buf->f_bsize = sbi->blocksize; + + buf->f_blocks = total_count - start_count; + buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; + buf->f_bavail = user_block_count - valid_user_blocks(sbi); + + buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + buf->f_ffree = buf->f_files - valid_inode_count(sbi); + + buf->f_namelen = F2FS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static int f2fs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); + + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) + seq_printf(seq, ",background_gc=%s", "on"); + else + seq_printf(seq, ",background_gc=%s", "off"); + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) + seq_puts(seq, ",disable_roll_forward"); + if (test_opt(sbi, DISCARD)) + seq_puts(seq, ",discard"); + if (test_opt(sbi, NOHEAP)) + seq_puts(seq, ",no_heap_alloc"); +#ifdef CONFIG_F2FS_FS_XATTR + if (test_opt(sbi, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); + if (test_opt(sbi, INLINE_XATTR)) + seq_puts(seq, ",inline_xattr"); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + if (test_opt(sbi, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif + if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) + seq_puts(seq, ",disable_ext_identify"); + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); + else + seq_puts(seq, ",noinline_data"); + if (test_opt(sbi, INLINE_DENTRY)) + seq_puts(seq, ",inline_dentry"); + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); + if (test_opt(sbi, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (test_opt(sbi, FASTBOOT)) + seq_puts(seq, ",fastboot"); + if (test_opt(sbi, EXTENT_CACHE)) + seq_puts(seq, ",extent_cache"); + seq_printf(seq, ",active_logs=%u", sbi->active_logs); + + return 0; +} + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-5d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, 1)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_info_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, segment_info_seq_show, PDE_DATA(inode)); +} + +static const struct file_operations f2fs_seq_segment_info_fops = { + .owner = THIS_MODULE, + .open = segment_info_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int f2fs_remount(struct super_block *sb, int *flags, char *data) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_mount_info org_mount_opt; + int err, active_logs; + bool need_restart_gc = false; + bool need_stop_gc = false; + + sync_filesystem(sb); + + /* + * Save the old mount options in case we + * need to restore them. + */ + org_mount_opt = sbi->mount_opt; + active_logs = sbi->active_logs; + + sbi->mount_opt.opt = 0; + sbi->active_logs = NR_CURSEG_TYPE; + + /* parse mount options */ + err = parse_options(sb, data); + if (err) + goto restore_opts; + + /* + * Previous and new state of filesystem is RO, + * so skip checking GC and FLUSH_MERGE conditions. + */ + if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) + goto skip; + + /* + * We stop the GC thread if FS is mounted as RO + * or if background_gc = off is passed in mount + * option. Also sync the filesystem. + */ + if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if (sbi->gc_thread) { + stop_gc_thread(sbi); + f2fs_sync_fs(sb, 1); + need_restart_gc = true; + } + } else if (!sbi->gc_thread) { + err = start_gc_thread(sbi); + if (err) + goto restore_opts; + need_stop_gc = true; + } + + /* + * We stop issue flush thread if FS is mounted as RO + * or if flush_merge is not passed in mount option. + */ + if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + destroy_flush_cmd_control(sbi); + } else if (!SM_I(sbi)->cmd_control_info) { + err = create_flush_cmd_control(sbi); + if (err) + goto restore_gc; + } +skip: + /* Update the POSIXACL Flag */ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; +restore_gc: + if (need_restart_gc) { + if (start_gc_thread(sbi)) + f2fs_msg(sbi->sb, KERN_WARNING, + "background gc thread has stopped"); + } else if (need_stop_gc) { + stop_gc_thread(sbi); + } +restore_opts: + sbi->mount_opt = org_mount_opt; + sbi->active_logs = active_logs; + return err; +} + +static struct super_operations f2fs_sops = { + .alloc_inode = f2fs_alloc_inode, + .drop_inode = f2fs_drop_inode, + .destroy_inode = f2fs_destroy_inode, + .write_inode = f2fs_write_inode, + .dirty_inode = f2fs_dirty_inode, + .show_options = f2fs_show_options, + .evict_inode = f2fs_evict_inode, + .put_super = f2fs_put_super, + .sync_fs = f2fs_sync_fs, + .freeze_fs = f2fs_freeze, + .unfreeze_fs = f2fs_unfreeze, + .statfs = f2fs_statfs, + .remount_fs = f2fs_remount, +}; + +static struct inode *f2fs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + + if (check_nid_range(sbi, ino)) + return ERR_PTR(-ESTALE); + + /* + * f2fs_iget isn't quite right if the inode is currently unallocated! + * However f2fs_iget currently does appropriate checks to handle stale + * inodes so everything is OK. + */ + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (unlikely(generation && inode->i_generation != generation)) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + f2fs_nfs_get_inode); +} + +static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + f2fs_nfs_get_inode); +} + +static const struct export_operations f2fs_export_ops = { + .fh_to_dentry = f2fs_fh_to_dentry, + .fh_to_parent = f2fs_fh_to_parent, + .get_parent = f2fs_get_parent, +}; + +static loff_t max_file_size(unsigned bits) +{ + loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t leaf_count = ADDRS_PER_BLOCK; + + /* two direct node blocks */ + result += (leaf_count * 2); + + /* two indirect node blocks */ + leaf_count *= NIDS_PER_BLOCK; + result += (leaf_count * 2); + + /* one double indirect node block */ + leaf_count *= NIDS_PER_BLOCK; + result += leaf_count; + + result <<= bits; + return result; +} + +static int sanity_check_raw_super(struct super_block *sb, + struct f2fs_super_block *raw_super) +{ + unsigned int blocksize; + + if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { + f2fs_msg(sb, KERN_INFO, + "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); + return 1; + } + + /* Currently, support only 4KB page cache size */ + if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid page_cache_size (%lu), supports only 4KB\n", + PAGE_CACHE_SIZE); + return 1; + } + + /* Currently, support only 4KB block size */ + blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); + if (blocksize != F2FS_BLKSIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid blocksize (%u), supports only 4KB\n", + blocksize); + return 1; + } + + /* Currently, support 512/1024/2048/4096 bytes sector size */ + if (le32_to_cpu(raw_super->log_sectorsize) > + F2FS_MAX_LOG_SECTOR_SIZE || + le32_to_cpu(raw_super->log_sectorsize) < + F2FS_MIN_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)", + le32_to_cpu(raw_super->log_sectorsize)); + return 1; + } + if (le32_to_cpu(raw_super->log_sectors_per_block) + + le32_to_cpu(raw_super->log_sectorsize) != + F2FS_MAX_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid log sectors per block(%u) log sectorsize(%u)", + le32_to_cpu(raw_super->log_sectors_per_block), + le32_to_cpu(raw_super->log_sectorsize)); + return 1; + } + return 0; +} + +static int sanity_check_ckpt(struct f2fs_sb_info *sbi) +{ + unsigned int total, fsmeta; + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + total = le32_to_cpu(raw_super->segment_count); + fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); + fsmeta += le32_to_cpu(raw_super->segment_count_sit); + fsmeta += le32_to_cpu(raw_super->segment_count_nat); + fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); + fsmeta += le32_to_cpu(raw_super->segment_count_ssa); + + if (unlikely(fsmeta >= total)) + return 1; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); + return 1; + } + return 0; +} + +static void init_sb_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = sbi->raw_super; + int i; + + sbi->log_sectors_per_block = + le32_to_cpu(raw_super->log_sectors_per_block); + sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); + sbi->blocksize = 1 << sbi->log_blocksize; + sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg; + sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + sbi->total_sections = le32_to_cpu(raw_super->section_count); + sbi->total_node_count = + (le32_to_cpu(raw_super->segment_count_nat) / 2) + * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; + sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); + sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); + sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + sbi->cur_victim_sec = NULL_SECNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; + + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + + sbi->dir_level = DEF_DIR_LEVEL; + clear_sbi_flag(sbi, SBI_NEED_FSCK); +} + +/* + * Read f2fs raw super block. + * Because we have two copies of super block, so read the first one at first, + * if the first one is invalid, move to read the second one. + */ +static int read_raw_super_block(struct super_block *sb, + struct f2fs_super_block **raw_super, + struct buffer_head **raw_super_buf) +{ + int block = 0; + +retry: + *raw_super_buf = sb_bread(sb, block); + if (!*raw_super_buf) { + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + block + 1); + if (block == 0) { + block++; + goto retry; + } else { + return -EIO; + } + } + + *raw_super = (struct f2fs_super_block *) + ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); + + /* sanity checking of raw super */ + if (sanity_check_raw_super(sb, *raw_super)) { + brelse(*raw_super_buf); + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); + if (block == 0) { + block++; + goto retry; + } else { + return -EINVAL; + } + } + + return 0; +} + +static int f2fs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct f2fs_sb_info *sbi; + struct f2fs_super_block *raw_super = NULL; + struct buffer_head *raw_super_buf; + struct inode *root; + long err = -EINVAL; + bool retry = true, need_fsck = false; + char *options = NULL; + int i; + +try_onemore: + /* allocate memory for f2fs-specific super block info */ + sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + /* set a block size */ + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { + f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); + goto free_sbi; + } + + err = read_raw_super_block(sb, &raw_super, &raw_super_buf); + if (err) + goto free_sbi; + + sb->s_fs_info = sbi; + /* init some FS parameters */ + sbi->active_logs = NR_CURSEG_TYPE; + + set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_DATA); + +#ifdef CONFIG_F2FS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif + /* parse mount options */ + options = kstrdup((const char *)data, GFP_KERNEL); + if (data && !options) { + err = -ENOMEM; + goto free_sb_buf; + } + + err = parse_options(sb, options); + if (err) + goto free_options; + + sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sb->s_max_links = F2FS_LINK_MAX; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + + sb->s_op = &f2fs_sops; + sb->s_xattr = f2fs_xattr_handlers; + sb->s_export_op = &f2fs_export_ops; + sb->s_magic = F2FS_SUPER_MAGIC; + sb->s_time_gran = 1; + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + + /* init f2fs-specific super block info */ + sbi->sb = sb; + sbi->raw_super = raw_super; + sbi->raw_super_buf = raw_super_buf; + mutex_init(&sbi->gc_mutex); + mutex_init(&sbi->writepages); + mutex_init(&sbi->cp_mutex); + init_rwsem(&sbi->node_write); + clear_sbi_flag(sbi, SBI_POR_DOING); + spin_lock_init(&sbi->stat_lock); + + init_rwsem(&sbi->read_io.io_rwsem); + sbi->read_io.sbi = sbi; + sbi->read_io.bio = NULL; + for (i = 0; i < NR_PAGE_TYPE; i++) { + init_rwsem(&sbi->write_io[i].io_rwsem); + sbi->write_io[i].sbi = sbi; + sbi->write_io[i].bio = NULL; + } + + init_rwsem(&sbi->cp_rwsem); + init_waitqueue_head(&sbi->cp_wait); + init_sb_info(sbi); + + /* get an inode for meta space */ + sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); + if (IS_ERR(sbi->meta_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); + err = PTR_ERR(sbi->meta_inode); + goto free_options; + } + + err = get_valid_checkpoint(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); + goto free_meta_inode; + } + + /* sanity checking of checkpoint */ + err = -EINVAL; + if (sanity_check_ckpt(sbi)) { + f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); + goto free_cp; + } + + sbi->total_valid_node_count = + le32_to_cpu(sbi->ckpt->valid_node_count); + sbi->total_valid_inode_count = + le32_to_cpu(sbi->ckpt->valid_inode_count); + sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); + sbi->total_valid_block_count = + le64_to_cpu(sbi->ckpt->valid_block_count); + sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->alloc_valid_block_count = 0; + INIT_LIST_HEAD(&sbi->dir_inode_list); + spin_lock_init(&sbi->dir_inode_lock); + + init_extent_cache_info(sbi); + + init_ino_entry_info(sbi); + + /* setup f2fs internal modules */ + err = build_segment_manager(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS segment manager"); + goto free_sm; + } + err = build_node_manager(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS node manager"); + goto free_nm; + } + + build_gc_manager(sbi); + + /* get an inode for node space */ + sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); + if (IS_ERR(sbi->node_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); + err = PTR_ERR(sbi->node_inode); + goto free_nm; + } + + /* if there are nt orphan nodes free them */ + recover_orphan_inodes(sbi); + + /* read root inode and dentry */ + root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); + if (IS_ERR(root)) { + f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); + err = PTR_ERR(root); + goto free_node_inode; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + err = -EINVAL; + goto free_node_inode; + } + + sb->s_root = d_make_root(root); /* allocate root dentry */ + if (!sb->s_root) { + err = -ENOMEM; + goto free_root_inode; + } + + err = f2fs_build_stats(sbi); + if (err) + goto free_root_inode; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + + if (test_opt(sbi, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + f2fs_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); + if (err) + goto free_proc; + + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + /* + * mount should be failed, when device has readonly mode, and + * previous checkpoint was not done by clean system shutdown. + */ + if (bdev_read_only(sb->s_bdev) && + !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + err = -EROFS; + goto free_kobj; + } + + if (need_fsck) + set_sbi_flag(sbi, SBI_NEED_FSCK); + + err = recover_fsync_data(sbi); + if (err) { + need_fsck = true; + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + goto free_kobj; + } + } + + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto free_kobj; + } + kfree(options); + return 0; + +free_kobj: + kobject_del(&sbi->s_kobj); +free_proc: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + f2fs_destroy_stats(sbi); +free_root_inode: + dput(sb->s_root); + sb->s_root = NULL; +free_node_inode: + iput(sbi->node_inode); +free_nm: + destroy_node_manager(sbi); +free_sm: + destroy_segment_manager(sbi); +free_cp: + kfree(sbi->ckpt); +free_meta_inode: + make_bad_inode(sbi->meta_inode); + iput(sbi->meta_inode); +free_options: + kfree(options); +free_sb_buf: + brelse(raw_super_buf); +free_sbi: + kfree(sbi); + + /* give only one another chance */ + if (retry) { + retry = false; + shrink_dcache_sb(sb); + goto try_onemore; + } + return err; +} + +static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); +} + +static void kill_f2fs_super(struct super_block *sb) +{ + if (sb->s_root) + set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + kill_block_super(sb); +} + +static struct file_system_type f2fs_fs_type = { + .owner = THIS_MODULE, + .name = "f2fs", + .mount = f2fs_mount, + .kill_sb = kill_f2fs_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("f2fs"); + +static int __init init_inodecache(void) +{ + f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info)); + if (!f2fs_inode_cachep) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(f2fs_inode_cachep); +} + +static int __init init_f2fs_fs(void) +{ + int err; + + f2fs_build_trace_ios(); + + err = init_inodecache(); + if (err) + goto fail; + err = create_node_manager_caches(); + if (err) + goto free_inodecache; + err = create_segment_manager_caches(); + if (err) + goto free_node_manager_caches; + err = create_checkpoint_caches(); + if (err) + goto free_segment_manager_caches; + err = create_extent_cache(); + if (err) + goto free_checkpoint_caches; + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) { + err = -ENOMEM; + goto free_extent_cache; + } + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_kset; + f2fs_create_root_stats(); + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return 0; + +free_kset: + kset_unregister(f2fs_kset); +free_extent_cache: + destroy_extent_cache(); +free_checkpoint_caches: + destroy_checkpoint_caches(); +free_segment_manager_caches: + destroy_segment_manager_caches(); +free_node_manager_caches: + destroy_node_manager_caches(); +free_inodecache: + destroy_inodecache(); +fail: + return err; +} + +static void __exit exit_f2fs_fs(void) +{ + remove_proc_entry("fs/f2fs", NULL); + f2fs_destroy_root_stats(); + unregister_filesystem(&f2fs_fs_type); + destroy_extent_cache(); + destroy_checkpoint_caches(); + destroy_segment_manager_caches(); + destroy_node_manager_caches(); + destroy_inodecache(); + kset_unregister(f2fs_kset); + f2fs_destroy_trace_ios(); +} + +module_init(init_f2fs_fs) +module_exit(exit_f2fs_fs) + +MODULE_AUTHOR("Samsung Electronics's Praesto Team"); +MODULE_DESCRIPTION("Flash Friendly File System"); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/f2fs/trace.c b/kernel/fs/f2fs/trace.c new file mode 100644 index 000000000..875aa8179 --- /dev/null +++ b/kernel/fs/f2fs/trace.c @@ -0,0 +1,159 @@ +/* + * f2fs IO tracer + * + * Copyright (c) 2014 Motorola Mobility + * Copyright (c) 2014 Jaegeuk Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include + +#include "f2fs.h" +#include "trace.h" + +static RADIX_TREE(pids, GFP_ATOMIC); +static spinlock_t pids_lock; +static struct last_io_info last_io; + +static inline void __print_last_io(void) +{ + if (!last_io.len) + return; + + trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + last_io.major, last_io.minor, + last_io.pid, "----------------", + last_io.type, + last_io.fio.rw, last_io.fio.blk_addr, + last_io.len); + memset(&last_io, 0, sizeof(last_io)); +} + +static int __file_type(struct inode *inode, pid_t pid) +{ + if (f2fs_is_atomic_file(inode)) + return __ATOMIC_FILE; + else if (f2fs_is_volatile_file(inode)) + return __VOLATILE_FILE; + else if (S_ISDIR(inode->i_mode)) + return __DIR_FILE; + else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) + return __NODE_FILE; + else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) + return __META_FILE; + else if (pid) + return __NORMAL_FILE; + else + return __MISC_FILE; +} + +void f2fs_trace_pid(struct page *page) +{ + struct inode *inode = page->mapping->host; + pid_t pid = task_pid_nr(current); + void *p; + + page->private = pid; + + if (radix_tree_preload(GFP_NOFS)) + return; + + spin_lock(&pids_lock); + p = radix_tree_lookup(&pids, pid); + if (p == current) + goto out; + if (p) + radix_tree_delete(&pids, pid); + + f2fs_radix_tree_insert(&pids, pid, current); + + trace_printk("%3x:%3x %4x %-16s\n", + MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), + pid, current->comm); +out: + spin_unlock(&pids_lock); + radix_tree_preload_end(); +} + +void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush) +{ + struct inode *inode; + pid_t pid; + int major, minor; + + if (flush) { + __print_last_io(); + return; + } + + inode = page->mapping->host; + pid = page_private(page); + + major = MAJOR(inode->i_sb->s_dev); + minor = MINOR(inode->i_sb->s_dev); + + if (last_io.major == major && last_io.minor == minor && + last_io.pid == pid && + last_io.type == __file_type(inode, pid) && + last_io.fio.rw == fio->rw && + last_io.fio.blk_addr + last_io.len == fio->blk_addr) { + last_io.len++; + return; + } + + __print_last_io(); + + last_io.major = major; + last_io.minor = minor; + last_io.pid = pid; + last_io.type = __file_type(inode, pid); + last_io.fio = *fio; + last_io.len = 1; + return; +} + +void f2fs_build_trace_ios(void) +{ + spin_lock_init(&pids_lock); +} + +#define PIDVEC_SIZE 128 +static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, + unsigned int max_items) +{ + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!max_items)) + return 0; + + radix_tree_for_each_slot(slot, &pids, &iter, first_index) { + results[ret] = iter.index; + if (++ret == PIDVEC_SIZE) + break; + } + return ret; +} + +void f2fs_destroy_trace_ios(void) +{ + pid_t pid[PIDVEC_SIZE]; + pid_t next_pid = 0; + unsigned int found; + + spin_lock(&pids_lock); + while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { + unsigned idx; + + next_pid = pid[found - 1] + 1; + for (idx = 0; idx < found; idx++) + radix_tree_delete(&pids, pid[idx]); + } + spin_unlock(&pids_lock); +} diff --git a/kernel/fs/f2fs/trace.h b/kernel/fs/f2fs/trace.h new file mode 100644 index 000000000..1041dbeb5 --- /dev/null +++ b/kernel/fs/f2fs/trace.h @@ -0,0 +1,46 @@ +/* + * f2fs IO tracer + * + * Copyright (c) 2014 Motorola Mobility + * Copyright (c) 2014 Jaegeuk Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_TRACE_H__ +#define __F2FS_TRACE_H__ + +#ifdef CONFIG_F2FS_IO_TRACE +#include + +enum file_type { + __NORMAL_FILE, + __DIR_FILE, + __NODE_FILE, + __META_FILE, + __ATOMIC_FILE, + __VOLATILE_FILE, + __MISC_FILE, +}; + +struct last_io_info { + int major, minor; + pid_t pid; + enum file_type type; + struct f2fs_io_info fio; + block_t len; +}; + +extern void f2fs_trace_pid(struct page *); +extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int); +extern void f2fs_build_trace_ios(void); +extern void f2fs_destroy_trace_ios(void); +#else +#define f2fs_trace_pid(p) +#define f2fs_trace_ios(p, i, n) +#define f2fs_build_trace_ios() +#define f2fs_destroy_trace_ios() + +#endif +#endif /* __F2FS_TRACE_H__ */ diff --git a/kernel/fs/f2fs/xattr.c b/kernel/fs/f2fs/xattr.c new file mode 100644 index 000000000..9757f65a0 --- /dev/null +++ b/kernel/fs/f2fs/xattr.c @@ -0,0 +1,618 @@ +/* + * fs/f2fs/xattr.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher + * + * Fix by Harrison Xing . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include "f2fs.h" +#include "xattr.h" + +static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t len, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + int total_len, prefix_len = 0; + const char *prefix = NULL; + + switch (type) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + prefix = XATTR_USER_PREFIX; + prefix_len = XATTR_USER_PREFIX_LEN; + break; + case F2FS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + prefix = XATTR_TRUSTED_PREFIX; + prefix_len = XATTR_TRUSTED_PREFIX_LEN; + break; + case F2FS_XATTR_INDEX_SECURITY: + prefix = XATTR_SECURITY_PREFIX; + prefix_len = XATTR_SECURITY_PREFIX_LEN; + break; + default: + return -EINVAL; + } + + total_len = prefix_len + len + 1; + if (list && total_len <= list_size) { + memcpy(list, prefix, prefix_len); + memcpy(list + prefix_len, name, len); + list[prefix_len + len] = '\0'; + } + return total_len; +} + +static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + + switch (type) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case F2FS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + case F2FS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + if (strcmp(name, "") == 0) + return -EINVAL; + return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL); +} + +static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + + switch (type) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case F2FS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + case F2FS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + if (strcmp(name, "") == 0) + return -EINVAL; + + return f2fs_setxattr(d_inode(dentry), type, name, + value, size, NULL, flags); +} + +static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t len, int type) +{ + const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; + size_t size; + + if (type != F2FS_XATTR_INDEX_ADVISE) + return 0; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct inode *inode = d_inode(dentry); + + if (strcmp(name, "") != 0) + return -EINVAL; + + if (buffer) + *((char *)buffer) = F2FS_I(inode)->i_advise; + return sizeof(char); +} + +static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = d_inode(dentry); + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (value == NULL) + return -EINVAL; + + F2FS_I(inode)->i_advise |= *(char *)value; + mark_inode_dirty(inode); + return 0; +} + +#ifdef CONFIG_F2FS_FS_SECURITY +static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *page) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, (struct page *)page, 0); + if (err < 0) + break; + } + return err; +} + +int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return security_inode_init_security(inode, dir, qstr, + &f2fs_initxattrs, ipage); +} +#endif + +const struct xattr_handler f2fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = F2FS_XATTR_INDEX_USER, + .list = f2fs_xattr_generic_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = F2FS_XATTR_INDEX_TRUSTED, + .list = f2fs_xattr_generic_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_advise_handler = { + .prefix = F2FS_SYSTEM_ADVISE_PREFIX, + .flags = F2FS_XATTR_INDEX_ADVISE, + .list = f2fs_xattr_advise_list, + .get = f2fs_xattr_advise_get, + .set = f2fs_xattr_advise_set, +}; + +const struct xattr_handler f2fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = F2FS_XATTR_INDEX_SECURITY, + .list = f2fs_xattr_generic_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +static const struct xattr_handler *f2fs_xattr_handler_map[] = { + [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, +#endif + [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler, +#endif + [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, +}; + +const struct xattr_handler *f2fs_xattr_handlers[] = { + &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + &f2fs_xattr_security_handler, +#endif + &f2fs_xattr_advise_handler, + NULL, +}; + +static inline const struct xattr_handler *f2fs_xattr_handler(int index) +{ + const struct xattr_handler *handler = NULL; + + if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) + handler = f2fs_xattr_handler_map[index]; + return handler; +} + +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + + list_for_each_xattr(entry, base_addr) { + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static void *read_all_xattrs(struct inode *inode, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_xattr_header *header; + size_t size = PAGE_SIZE, inline_size = 0; + void *txattr_addr; + + inline_size = inline_xattr_size(inode); + + txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + if (!txattr_addr) + return NULL; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + goto fail; + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + } + + /* read from xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) + goto fail; + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + f2fs_put_page(xpage, 1); + } + + header = XATTR_HDR(txattr_addr); + + /* never been allocated xattrs */ + if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { + header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); + header->h_refcount = cpu_to_le32(1); + } + return txattr_addr; +fail: + kzfree(txattr_addr); + return NULL; +} + +static inline int write_all_xattrs(struct inode *inode, __u32 hsize, + void *txattr_addr, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + size_t inline_size = 0; + void *xattr_addr; + struct page *xpage; + nid_t new_nid = 0; + int err; + + inline_size = inline_xattr_size(inode); + + if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) + if (!alloc_nid(sbi, &new_nid)) + return -ENOSPC; + + /* write to inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + f2fs_wait_on_page_writeback(ipage, NODE); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(page); + } + inline_addr = inline_xattr_addr(page); + f2fs_wait_on_page_writeback(page, NODE); + } + memcpy(inline_addr, txattr_addr, inline_size); + f2fs_put_page(page, 1); + + /* no need to use xattr node block */ + if (hsize <= inline_size) { + err = truncate_xattr_node(inode, ipage); + alloc_nid_failed(sbi, new_nid); + return err; + } + } + + /* write to xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(xpage); + } + f2fs_bug_on(sbi, new_nid); + f2fs_wait_on_page_writeback(xpage, NODE); + } else { + struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(xpage); + } + alloc_nid_done(sbi, new_nid); + } + + xattr_addr = page_address(xpage); + memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - + sizeof(struct node_footer)); + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + /* need to checkpoint during fsync */ + F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); + return 0; +} + +int f2fs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size, struct page *ipage) +{ + struct f2fs_xattr_entry *entry; + void *base_addr; + int error = 0; + size_t size, len; + + if (name == NULL) + return -EINVAL; + + len = strlen(name); + if (len > F2FS_NAME_LEN) + return -ERANGE; + + base_addr = read_all_xattrs(inode, ipage); + if (!base_addr) + return -ENOMEM; + + entry = __find_xattr(base_addr, index, len, name); + if (IS_XATTR_LAST_ENTRY(entry)) { + error = -ENODATA; + goto cleanup; + } + + size = le16_to_cpu(entry->e_value_size); + + if (buffer && size > buffer_size) { + error = -ERANGE; + goto cleanup; + } + + if (buffer) { + char *pval = entry->e_name + entry->e_name_len; + memcpy(buffer, pval, size); + } + error = size; + +cleanup: + kzfree(base_addr); + return error; +} + +ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct f2fs_xattr_entry *entry; + void *base_addr; + int error = 0; + size_t rest = buffer_size; + + base_addr = read_all_xattrs(inode, NULL); + if (!base_addr) + return -ENOMEM; + + list_for_each_xattr(entry, base_addr) { + const struct xattr_handler *handler = + f2fs_xattr_handler(entry->e_name_index); + size_t size; + + if (!handler) + continue; + + size = handler->list(dentry, buffer, rest, entry->e_name, + entry->e_name_len, handler->flags); + if (buffer && size > rest) { + error = -ERANGE; + goto cleanup; + } + + if (buffer) + buffer += size; + rest -= size; + } + error = buffer_size - rest; +cleanup: + kzfree(base_addr); + return error; +} + +static int __f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct page *ipage, int flags) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_xattr_entry *here, *last; + void *base_addr; + int found, newsize; + size_t len; + __u32 new_hsize; + int error = -ENOMEM; + + if (name == NULL) + return -EINVAL; + + if (value == NULL) + size = 0; + + len = strlen(name); + + if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode)) + return -ERANGE; + + base_addr = read_all_xattrs(inode, ipage); + if (!base_addr) + goto exit; + + /* find entry with wanted name. */ + here = __find_xattr(base_addr, index, len, name); + + found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; + + if ((flags & XATTR_REPLACE) && !found) { + error = -ENODATA; + goto exit; + } else if ((flags & XATTR_CREATE) && found) { + error = -EEXIST; + goto exit; + } + + last = here; + while (!IS_XATTR_LAST_ENTRY(last)) + last = XATTR_NEXT_ENTRY(last); + + newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); + + /* 1. Check space */ + if (value) { + int free; + /* + * If value is NULL, it is remove operation. + * In case of update operation, we calculate free. + */ + free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); + if (found) + free = free + ENTRY_SIZE(here); + + if (unlikely(free < newsize)) { + error = -ENOSPC; + goto exit; + } + } + + /* 2. Remove old entry */ + if (found) { + /* + * If entry is found, remove old entry. + * If not found, remove operation is not needed. + */ + struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); + int oldsize = ENTRY_SIZE(here); + + memmove(here, next, (char *)last - (char *)next); + last = (struct f2fs_xattr_entry *)((char *)last - oldsize); + memset(last, 0, oldsize); + } + + new_hsize = (char *)last - (char *)base_addr; + + /* 3. Write new entry */ + if (value) { + char *pval; + /* + * Before we come here, old entry is removed. + * We just write new entry. + */ + memset(last, 0, newsize); + last->e_name_index = index; + last->e_name_len = len; + memcpy(last->e_name, name, len); + pval = last->e_name + len; + memcpy(pval, value, size); + last->e_value_size = cpu_to_le16(size); + new_hsize += newsize; + } + + error = write_all_xattrs(inode, new_hsize, base_addr, ipage); + if (error) + goto exit; + + if (is_inode_flag_set(fi, FI_ACL_MODE)) { + inode->i_mode = fi->i_acl_mode; + inode->i_ctime = CURRENT_TIME; + clear_inode_flag(fi, FI_ACL_MODE); + } + + if (ipage) + update_inode(inode, ipage); + else + update_inode_page(inode); +exit: + kzfree(base_addr); + return error; +} + +int f2fs_setxattr(struct inode *inode, int index, const char *name, + const void *value, size_t size, + struct page *ipage, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err; + + /* this case is only from init_inode_metadata */ + if (ipage) + return __f2fs_setxattr(inode, index, name, value, + size, ipage, flags); + f2fs_balance_fs(sbi); + + f2fs_lock_op(sbi); + /* protect xattr_ver */ + down_write(&F2FS_I(inode)->i_sem); + err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_sem); + f2fs_unlock_op(sbi); + + return err; +} diff --git a/kernel/fs/f2fs/xattr.h b/kernel/fs/f2fs/xattr.h new file mode 100644 index 000000000..969d792ca --- /dev/null +++ b/kernel/fs/f2fs/xattr.h @@ -0,0 +1,152 @@ +/* + * fs/f2fs/xattr.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.h + * + * On-disk format of extended attributes for the ext2 filesystem. + * + * (C) 2001 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_XATTR_H__ +#define __F2FS_XATTR_H__ + +#include +#include + +/* Magic value in attribute blocks */ +#define F2FS_XATTR_MAGIC 0xF2F52011 + +/* Maximum number of references to one attribute block */ +#define F2FS_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise" +#define F2FS_XATTR_INDEX_USER 1 +#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define F2FS_XATTR_INDEX_TRUSTED 4 +#define F2FS_XATTR_INDEX_LUSTRE 5 +#define F2FS_XATTR_INDEX_SECURITY 6 +#define F2FS_XATTR_INDEX_ADVISE 7 + +struct f2fs_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct f2fs_xattr_entry { + __u8 e_name_index; + __u8 e_name_len; + __le16 e_value_size; /* size of attribute value */ + char e_name[0]; /* attribute name */ +}; + +#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) +#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) +#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) +#define XATTR_ROUND (3) + +#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) + +#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ + entry->e_name_len + le16_to_cpu(entry->e_value_size))) + +#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ + ENTRY_SIZE(entry))) + +#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define list_for_each_xattr(entry, addr) \ + for (entry = XATTR_FIRST_ENTRY(addr);\ + !IS_XATTR_LAST_ENTRY(entry);\ + entry = XATTR_NEXT_ENTRY(entry)) + +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ + sizeof(struct node_footer) - sizeof(__u32)) + +#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ + sizeof(struct f2fs_xattr_header) - \ + sizeof(struct f2fs_xattr_entry)) + +/* + * On-disk structure of f2fs_xattr + * We use inline xattrs space + 1 block for xattr. + * + * +--------------------+ + * | f2fs_xattr_header | + * | | + * +--------------------+ + * | f2fs_xattr_entry | + * | .e_name_index = 1 | + * | .e_name_len = 3 | + * | .e_value_size = 14 | + * | .e_name = "foo" | + * | "value_of_xattr" |<- value_offs = e_name + e_name_len + * +--------------------+ + * | f2fs_xattr_entry | + * | .e_name_index = 4 | + * | .e_name = "bar" | + * +--------------------+ + * | | + * | Free | + * | | + * +--------------------+<- MIN_OFFSET + * | node_footer | + * | (nid, ino, offset) | + * +--------------------+ + * + **/ + +#ifdef CONFIG_F2FS_FS_XATTR +extern const struct xattr_handler f2fs_xattr_user_handler; +extern const struct xattr_handler f2fs_xattr_trusted_handler; +extern const struct xattr_handler f2fs_xattr_advise_handler; +extern const struct xattr_handler f2fs_xattr_security_handler; + +extern const struct xattr_handler *f2fs_xattr_handlers[]; + +extern int f2fs_setxattr(struct inode *, int, const char *, + const void *, size_t, struct page *, int); +extern int f2fs_getxattr(struct inode *, int, const char *, void *, + size_t, struct page *); +extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +#else + +#define f2fs_xattr_handlers NULL +static inline int f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} +static inline int f2fs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size, struct page *dpage) +{ + return -EOPNOTSUPP; +} +static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size) +{ + return -EOPNOTSUPP; +} +#endif + +#ifdef CONFIG_F2FS_FS_SECURITY +extern int f2fs_init_security(struct inode *, struct inode *, + const struct qstr *, struct page *); +#else +static inline int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return 0; +} +#endif +#endif /* __F2FS_XATTR_H__ */ diff --git a/kernel/fs/fat/Kconfig b/kernel/fs/fat/Kconfig new file mode 100644 index 000000000..182f9ffe2 --- /dev/null +++ b/kernel/fs/fat/Kconfig @@ -0,0 +1,100 @@ +config FAT_FS + tristate + select NLS + help + If you want to use one of the FAT-based file systems (the MS-DOS and + VFAT (Windows 95) file systems), then you must say Y or M here + to include FAT support. You will then be able to mount partitions or + diskettes with FAT-based file systems and transparently access the + files on them, i.e. MSDOS files will look and behave just like all + other Unix files. + + This FAT support is not a file system in itself, it only provides + the foundation for the other file systems. You will have to say Y or + M to at least one of "MSDOS fs support" or "VFAT fs support" in + order to make use of it. + + Another way to read and write MSDOS floppies and hard drive + partitions from within Linux (but not transparently) is with the + mtools ("man mtools") program suite. You don't need to say Y here in + order to do that. + + If you need to move large files on floppies between a DOS and a + Linux box, say Y here, mount the floppy under Linux with an MSDOS + file system and use GNU tar's M option. GNU tar is a program + available for Unix and DOS ("man tar" or "info tar"). + + The FAT support will enlarge your kernel by about 37 KB. If unsure, + say Y. + + To compile this as a module, choose M here: the module will be called + fat. Note that if you compile the FAT support as a module, you + cannot compile any of the FAT-based file systems into the kernel + -- they will have to be modules as well. + +config MSDOS_FS + tristate "MSDOS fs support" + select FAT_FS + help + This allows you to mount MSDOS partitions of your hard drive (unless + they are compressed; to access compressed MSDOS partitions under + Linux, you can either use the DOS emulator DOSEMU, described in the + DOSEMU-HOWTO, available from + , or try dmsdosfs in + . If you + intend to use dosemu with a non-compressed MSDOS partition, say Y + here) and MSDOS floppies. This means that file access becomes + transparent, i.e. the MSDOS files look and behave just like all + other Unix files. + + If you have Windows 95 or Windows NT installed on your MSDOS + partitions, you should use the VFAT file system (say Y to "VFAT fs + support" below), or you will not be able to see the long filenames + generated by Windows 95 / Windows NT. + + This option will enlarge your kernel by about 7 KB. If unsure, + answer Y. This will only work if you said Y to "DOS FAT fs support" + as well. To compile this as a module, choose M here: the module will + be called msdos. + +config VFAT_FS + tristate "VFAT (Windows-95) fs support" + select FAT_FS + help + This option provides support for normal Windows file systems with + long filenames. That includes non-compressed FAT-based file systems + used by Windows 95, Windows 98, Windows NT 4.0, and the Unix + programs from the mtools package. + + The VFAT support enlarges your kernel by about 10 KB and it only + works if you said Y to the "DOS FAT fs support" above. Please read + the file for details. If + unsure, say Y. + + To compile this as a module, choose M here: the module will be called + vfat. + +config FAT_DEFAULT_CODEPAGE + int "Default codepage for FAT" + depends on MSDOS_FS || VFAT_FS + default 437 + help + This option should be set to the codepage of your FAT filesystems. + It can be overridden with the "codepage" mount option. + See for more information. + +config FAT_DEFAULT_IOCHARSET + string "Default iocharset for FAT" + depends on VFAT_FS + default "iso8859-1" + help + Set this to the default input/output character set you'd + like FAT to use. It should probably match the character set + that most of your FAT filesystems use, and can be overridden + with the "iocharset" mount option for FAT filesystems. + Note that "utf8" is not recommended for FAT filesystems. + If unsure, you shouldn't set "utf8" here. + See for more information. + + Enable any character sets you need in File Systems/Native Language + Support. diff --git a/kernel/fs/fat/Makefile b/kernel/fs/fat/Makefile new file mode 100644 index 000000000..964b634f6 --- /dev/null +++ b/kernel/fs/fat/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the Linux fat filesystem support. +# + +obj-$(CONFIG_FAT_FS) += fat.o +obj-$(CONFIG_VFAT_FS) += vfat.o +obj-$(CONFIG_MSDOS_FS) += msdos.o + +fat-y := cache.o dir.o fatent.o file.o inode.o misc.o nfs.o +vfat-y := namei_vfat.o +msdos-y := namei_msdos.o diff --git a/kernel/fs/fat/cache.c b/kernel/fs/fat/cache.c new file mode 100644 index 000000000..93fc62232 --- /dev/null +++ b/kernel/fs/fat/cache.c @@ -0,0 +1,351 @@ +/* + * linux/fs/fat/cache.c + * + * Written 1992,1993 by Werner Almesberger + * + * Mar 1999. AV. Changed cache, so that it uses the starting cluster instead + * of inode number. + * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers. + */ + +#include +#include "fat.h" + +/* this must be > 0. */ +#define FAT_MAX_CACHE 8 + +struct fat_cache { + struct list_head cache_list; + int nr_contig; /* number of contiguous clusters */ + int fcluster; /* cluster number in the file. */ + int dcluster; /* cluster number on disk. */ +}; + +struct fat_cache_id { + unsigned int id; + int nr_contig; + int fcluster; + int dcluster; +}; + +static inline int fat_max_cache(struct inode *inode) +{ + return FAT_MAX_CACHE; +} + +static struct kmem_cache *fat_cache_cachep; + +static void init_once(void *foo) +{ + struct fat_cache *cache = (struct fat_cache *)foo; + + INIT_LIST_HEAD(&cache->cache_list); +} + +int __init fat_cache_init(void) +{ + fat_cache_cachep = kmem_cache_create("fat_cache", + sizeof(struct fat_cache), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (fat_cache_cachep == NULL) + return -ENOMEM; + return 0; +} + +void fat_cache_destroy(void) +{ + kmem_cache_destroy(fat_cache_cachep); +} + +static inline struct fat_cache *fat_cache_alloc(struct inode *inode) +{ + return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); +} + +static inline void fat_cache_free(struct fat_cache *cache) +{ + BUG_ON(!list_empty(&cache->cache_list)); + kmem_cache_free(fat_cache_cachep, cache); +} + +static inline void fat_cache_update_lru(struct inode *inode, + struct fat_cache *cache) +{ + if (MSDOS_I(inode)->cache_lru.next != &cache->cache_list) + list_move(&cache->cache_list, &MSDOS_I(inode)->cache_lru); +} + +static int fat_cache_lookup(struct inode *inode, int fclus, + struct fat_cache_id *cid, + int *cached_fclus, int *cached_dclus) +{ + static struct fat_cache nohit = { .fcluster = 0, }; + + struct fat_cache *hit = &nohit, *p; + int offset = -1; + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) { + /* Find the cache of "fclus" or nearest cache. */ + if (p->fcluster <= fclus && hit->fcluster < p->fcluster) { + hit = p; + if ((hit->fcluster + hit->nr_contig) < fclus) { + offset = hit->nr_contig; + } else { + offset = fclus - hit->fcluster; + break; + } + } + } + if (hit != &nohit) { + fat_cache_update_lru(inode, hit); + + cid->id = MSDOS_I(inode)->cache_valid_id; + cid->nr_contig = hit->nr_contig; + cid->fcluster = hit->fcluster; + cid->dcluster = hit->dcluster; + *cached_fclus = cid->fcluster + offset; + *cached_dclus = cid->dcluster + offset; + } + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + + return offset; +} + +static struct fat_cache *fat_cache_merge(struct inode *inode, + struct fat_cache_id *new) +{ + struct fat_cache *p; + + list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) { + /* Find the same part as "new" in cluster-chain. */ + if (p->fcluster == new->fcluster) { + BUG_ON(p->dcluster != new->dcluster); + if (new->nr_contig > p->nr_contig) + p->nr_contig = new->nr_contig; + return p; + } + } + return NULL; +} + +static void fat_cache_add(struct inode *inode, struct fat_cache_id *new) +{ + struct fat_cache *cache, *tmp; + + if (new->fcluster == -1) /* dummy cache */ + return; + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + if (new->id != FAT_CACHE_VALID && + new->id != MSDOS_I(inode)->cache_valid_id) + goto out; /* this cache was invalidated */ + + cache = fat_cache_merge(inode, new); + if (cache == NULL) { + if (MSDOS_I(inode)->nr_caches < fat_max_cache(inode)) { + MSDOS_I(inode)->nr_caches++; + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + + tmp = fat_cache_alloc(inode); + if (!tmp) { + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + MSDOS_I(inode)->nr_caches--; + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + return; + } + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + cache = fat_cache_merge(inode, new); + if (cache != NULL) { + MSDOS_I(inode)->nr_caches--; + fat_cache_free(tmp); + goto out_update_lru; + } + cache = tmp; + } else { + struct list_head *p = MSDOS_I(inode)->cache_lru.prev; + cache = list_entry(p, struct fat_cache, cache_list); + } + cache->fcluster = new->fcluster; + cache->dcluster = new->dcluster; + cache->nr_contig = new->nr_contig; + } +out_update_lru: + fat_cache_update_lru(inode, cache); +out: + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); +} + +/* + * Cache invalidation occurs rarely, thus the LRU chain is not updated. It + * fixes itself after a while. + */ +static void __fat_cache_inval_inode(struct inode *inode) +{ + struct msdos_inode_info *i = MSDOS_I(inode); + struct fat_cache *cache; + + while (!list_empty(&i->cache_lru)) { + cache = list_entry(i->cache_lru.next, + struct fat_cache, cache_list); + list_del_init(&cache->cache_list); + i->nr_caches--; + fat_cache_free(cache); + } + /* Update. The copy of caches before this id is discarded. */ + i->cache_valid_id++; + if (i->cache_valid_id == FAT_CACHE_VALID) + i->cache_valid_id++; +} + +void fat_cache_inval_inode(struct inode *inode) +{ + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + __fat_cache_inval_inode(inode); + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); +} + +static inline int cache_contiguous(struct fat_cache_id *cid, int dclus) +{ + cid->nr_contig++; + return ((cid->dcluster + cid->nr_contig) == dclus); +} + +static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus) +{ + cid->id = FAT_CACHE_VALID; + cid->fcluster = fclus; + cid->dcluster = dclus; + cid->nr_contig = 0; +} + +int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) +{ + struct super_block *sb = inode->i_sb; + const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits; + struct fat_entry fatent; + struct fat_cache_id cid; + int nr; + + BUG_ON(MSDOS_I(inode)->i_start == 0); + + *fclus = 0; + *dclus = MSDOS_I(inode)->i_start; + if (cluster == 0) + return 0; + + if (fat_cache_lookup(inode, cluster, &cid, fclus, dclus) < 0) { + /* + * dummy, always not contiguous + * This is reinitialized by cache_init(), later. + */ + cache_init(&cid, -1, -1); + } + + fatent_init(&fatent); + while (*fclus < cluster) { + /* prevent the infinite loop of cluster chain */ + if (*fclus > limit) { + fat_fs_error_ratelimit(sb, + "%s: detected the cluster chain loop" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); + nr = -EIO; + goto out; + } + + nr = fat_ent_read(inode, &fatent, *dclus); + if (nr < 0) + goto out; + else if (nr == FAT_ENT_FREE) { + fat_fs_error_ratelimit(sb, + "%s: invalid cluster chain (i_pos %lld)", + __func__, + MSDOS_I(inode)->i_pos); + nr = -EIO; + goto out; + } else if (nr == FAT_ENT_EOF) { + fat_cache_add(inode, &cid); + goto out; + } + (*fclus)++; + *dclus = nr; + if (!cache_contiguous(&cid, *dclus)) + cache_init(&cid, *fclus, *dclus); + } + nr = 0; + fat_cache_add(inode, &cid); +out: + fatent_brelse(&fatent); + return nr; +} + +static int fat_bmap_cluster(struct inode *inode, int cluster) +{ + struct super_block *sb = inode->i_sb; + int ret, fclus, dclus; + + if (MSDOS_I(inode)->i_start == 0) + return 0; + + ret = fat_get_cluster(inode, cluster, &fclus, &dclus); + if (ret < 0) + return ret; + else if (ret == FAT_ENT_EOF) { + fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); + return -EIO; + } + return dclus; +} + +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + sector_t last_block; + int cluster, offset; + + *phys = 0; + *mapped_blocks = 0; + if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) { + if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) { + *phys = sector + sbi->dir_start; + *mapped_blocks = 1; + } + return 0; + } + + last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; + if (sector >= last_block) { + if (!create) + return 0; + + /* + * ->mmu_private can access on only allocation path. + * (caller must hold ->i_mutex) + */ + last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + if (sector >= last_block) + return 0; + } + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + if (cluster < 0) + return cluster; + else if (cluster) { + *phys = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + return 0; +} diff --git a/kernel/fs/fat/dir.c b/kernel/fs/fat/dir.c new file mode 100644 index 000000000..4afc4d9d2 --- /dev/null +++ b/kernel/fs/fat/dir.c @@ -0,0 +1,1402 @@ +/* + * linux/fs/fat/dir.c + * + * directory handling functions for fat-based filesystems + * + * Written 1992,1993 by Werner Almesberger + * + * Hidden files 1995 by Albert Cahalan + * + * VFAT extensions by Gordon Chaffee + * Merged with msdos fs by Henrik Storner + * Rewritten for constant inumbers. Plugged buffer overrun in readdir(). AV + * Short name translation 1999, 2001 by Wolfram Pienkoss + */ + +#include +#include +#include +#include "fat.h" + +/* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + +static inline unsigned char fat_tolower(unsigned char c) +{ + return ((c >= 'A') && (c <= 'Z')) ? c+32 : c; +} + +static inline loff_t fat_make_i_pos(struct super_block *sb, + struct buffer_head *bh, + struct msdos_dir_entry *de) +{ + return ((loff_t)bh->b_blocknr << MSDOS_SB(sb)->dir_per_block_bits) + | (de - (struct msdos_dir_entry *)bh->b_data); +} + +static inline void fat_dir_readahead(struct inode *dir, sector_t iblock, + sector_t phys) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + int sec; + + /* This is not a first sector of cluster, or sec_per_clus == 1 */ + if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1) + return; + /* root dir of FAT12/FAT16 */ + if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO)) + return; + + bh = sb_find_get_block(sb, phys); + if (bh == NULL || !buffer_uptodate(bh)) { + for (sec = 0; sec < sbi->sec_per_clus; sec++) + sb_breadahead(sb, phys + sec); + } + brelse(bh); +} + +/* Returns the inode number of the directory entry at offset pos. If bh is + non-NULL, it is brelse'd before. Pos is incremented. The buffer header is + returned in bh. + AV. Most often we do it item-by-item. Makes sense to optimize. + AV. OK, there we go: if both bh and de are non-NULL we assume that we just + AV. want the next entry (took one explicit de=NULL in vfat/namei.c). + AV. It's done in fat_get_entry() (inlined), here the slow case lives. + AV. Additionally, when we return -1 (i.e. reached the end of directory) + AV. we make bh NULL. + */ +static int fat__get_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, struct msdos_dir_entry **de) +{ + struct super_block *sb = dir->i_sb; + sector_t phys, iblock; + unsigned long mapped_blocks; + int err, offset; + +next: + if (*bh) + brelse(*bh); + + *bh = NULL; + iblock = *pos >> sb->s_blocksize_bits; + err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0); + if (err || !phys) + return -1; /* beyond EOF or error */ + + fat_dir_readahead(dir, iblock, phys); + + *bh = sb_bread(sb, phys); + if (*bh == NULL) { + fat_msg_ratelimit(sb, KERN_ERR, + "Directory bread(block %llu) failed", (llu)phys); + /* skip this block */ + *pos = (iblock + 1) << sb->s_blocksize_bits; + goto next; + } + + offset = *pos & (sb->s_blocksize - 1); + *pos += sizeof(struct msdos_dir_entry); + *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); + + return 0; +} + +static inline int fat_get_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + /* Fast stuff first */ + if (*bh && *de && + (*de - (struct msdos_dir_entry *)(*bh)->b_data) < + MSDOS_SB(dir->i_sb)->dir_per_block - 1) { + *pos += sizeof(struct msdos_dir_entry); + (*de)++; + return 0; + } + return fat__get_entry(dir, pos, bh, de); +} + +/* + * Convert Unicode 16 to UTF-8, translated Unicode, or ASCII. + * If uni_xlate is enabled and we can't get a 1:1 conversion, use a + * colon as an escape character since it is normally invalid on the vfat + * filesystem. The following four characters are the hexadecimal digits + * of Unicode value. This lets us do a full dump and restore of Unicode + * filenames. We could get into some trouble with long Unicode names, + * but ignore that right now. + * Ahem... Stack smashing in ring 0 isn't fun. Fixed. + */ +static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, + const wchar_t *uni, int len, struct nls_table *nls) +{ + int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate; + const wchar_t *ip; + wchar_t ec; + unsigned char *op; + int charlen; + + ip = uni; + op = ascii; + + while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { + ec = *ip++; + charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE); + if (charlen > 0) { + op += charlen; + len -= charlen; + } else { + if (uni_xlate == 1) { + *op++ = ':'; + op = hex_byte_pack(op, ec >> 8); + op = hex_byte_pack(op, ec); + len -= 5; + } else { + *op++ = '?'; + len--; + } + } + } + + if (unlikely(*ip)) { + fat_msg(sb, KERN_WARNING, + "filename was truncated while converting."); + } + + *op = 0; + return op - ascii; +} + +static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni, + unsigned char *buf, int size) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + if (sbi->options.utf8) + return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, + UTF16_HOST_ENDIAN, buf, size); + else + return uni16_to_x8(sb, buf, uni, size, sbi->nls_io); +} + +static inline int +fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) +{ + int charlen; + + charlen = t->char2uni(c, clen, uni); + if (charlen < 0) { + *uni = 0x003f; /* a question mark */ + charlen = 1; + } + return charlen; +} + +static inline int +fat_short2lower_uni(struct nls_table *t, unsigned char *c, + int clen, wchar_t *uni) +{ + int charlen; + wchar_t wc; + + charlen = t->char2uni(c, clen, &wc); + if (charlen < 0) { + *uni = 0x003f; /* a question mark */ + charlen = 1; + } else if (charlen <= 1) { + unsigned char nc = t->charset2lower[*c]; + + if (!nc) + nc = *c; + + charlen = t->char2uni(&nc, 1, uni); + if (charlen < 0) { + *uni = 0x003f; /* a question mark */ + charlen = 1; + } + } else + *uni = wc; + + return charlen; +} + +static inline int +fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size, + wchar_t *uni_buf, unsigned short opt, int lower) +{ + int len = 0; + + if (opt & VFAT_SFN_DISPLAY_LOWER) + len = fat_short2lower_uni(nls, buf, buf_size, uni_buf); + else if (opt & VFAT_SFN_DISPLAY_WIN95) + len = fat_short2uni(nls, buf, buf_size, uni_buf); + else if (opt & VFAT_SFN_DISPLAY_WINNT) { + if (lower) + len = fat_short2lower_uni(nls, buf, buf_size, uni_buf); + else + len = fat_short2uni(nls, buf, buf_size, uni_buf); + } else + len = fat_short2uni(nls, buf, buf_size, uni_buf); + + return len; +} + +static inline int fat_name_match(struct msdos_sb_info *sbi, + const unsigned char *a, int a_len, + const unsigned char *b, int b_len) +{ + if (a_len != b_len) + return 0; + + if (sbi->options.name_check != 's') + return !nls_strnicmp(sbi->nls_io, a, b, a_len); + else + return !memcmp(a, b, a_len); +} + +enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; + +/** + * fat_parse_long - Parse extended directory entry. + * + * This function returns zero on success, negative value on error, or one of + * the following: + * + * %PARSE_INVALID - Directory entry is invalid. + * %PARSE_NOT_LONGNAME - Directory entry does not contain longname. + * %PARSE_EOF - Directory has no more entries. + */ +static int fat_parse_long(struct inode *dir, loff_t *pos, + struct buffer_head **bh, struct msdos_dir_entry **de, + wchar_t **unicode, unsigned char *nr_slots) +{ + struct msdos_dir_slot *ds; + unsigned char id, slot, slots, alias_checksum; + + if (!*unicode) { + *unicode = __getname(); + if (!*unicode) { + brelse(*bh); + return -ENOMEM; + } + } +parse_long: + slots = 0; + ds = (struct msdos_dir_slot *)*de; + id = ds->id; + if (!(id & 0x40)) + return PARSE_INVALID; + slots = id & ~0x40; + if (slots > 20 || !slots) /* ceil(256 * 2 / 26) */ + return PARSE_INVALID; + *nr_slots = slots; + alias_checksum = ds->alias_checksum; + + slot = slots; + while (1) { + int offset; + + slot--; + offset = slot * 13; + fat16_towchar(*unicode + offset, ds->name0_4, 5); + fat16_towchar(*unicode + offset + 5, ds->name5_10, 6); + fat16_towchar(*unicode + offset + 11, ds->name11_12, 2); + + if (ds->id & 0x40) + (*unicode)[offset + 13] = 0; + if (fat_get_entry(dir, pos, bh, de) < 0) + return PARSE_EOF; + if (slot == 0) + break; + ds = (struct msdos_dir_slot *)*de; + if (ds->attr != ATTR_EXT) + return PARSE_NOT_LONGNAME; + if ((ds->id & ~0x40) != slot) + goto parse_long; + if (ds->alias_checksum != alias_checksum) + goto parse_long; + } + if ((*de)->name[0] == DELETED_FLAG) + return PARSE_INVALID; + if ((*de)->attr == ATTR_EXT) + goto parse_long; + if (IS_FREE((*de)->name) || ((*de)->attr & ATTR_VOLUME)) + return PARSE_INVALID; + if (fat_checksum((*de)->name) != alias_checksum) + *nr_slots = 0; + + return 0; +} + +/** + * fat_parse_short - Parse MS-DOS (short) directory entry. + * @sb: superblock + * @de: directory entry to parse + * @name: FAT_MAX_SHORT_SIZE array in which to place extracted name + * @dot_hidden: Nonzero == prepend '.' to names with ATTR_HIDDEN + * + * Returns the number of characters extracted into 'name'. + */ +static int fat_parse_short(struct super_block *sb, + const struct msdos_dir_entry *de, + unsigned char *name, int dot_hidden) +{ + const struct msdos_sb_info *sbi = MSDOS_SB(sb); + int isvfat = sbi->options.isvfat; + int nocase = sbi->options.nocase; + unsigned short opt_shortname = sbi->options.shortname; + struct nls_table *nls_disk = sbi->nls_disk; + wchar_t uni_name[14]; + unsigned char c, work[MSDOS_NAME]; + unsigned char *ptname = name; + int chi, chl, i, j, k; + int dotoffset = 0; + int name_len = 0, uni_len = 0; + + if (!isvfat && dot_hidden && (de->attr & ATTR_HIDDEN)) { + *ptname++ = '.'; + dotoffset = 1; + } + + memcpy(work, de->name, sizeof(work)); + /* see namei.c, msdos_format_name */ + if (work[0] == 0x05) + work[0] = 0xE5; + + /* Filename */ + for (i = 0, j = 0; i < 8;) { + c = work[i]; + if (!c) + break; + chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, + &uni_name[j++], opt_shortname, + de->lcase & CASE_LOWER_BASE); + if (chl <= 1) { + if (!isvfat) + ptname[i] = nocase ? c : fat_tolower(c); + i++; + if (c != ' ') { + name_len = i; + uni_len = j; + } + } else { + uni_len = j; + if (isvfat) + i += min(chl, 8-i); + else { + for (chi = 0; chi < chl && i < 8; chi++, i++) + ptname[i] = work[i]; + } + if (chl) + name_len = i; + } + } + + i = name_len; + j = uni_len; + fat_short2uni(nls_disk, ".", 1, &uni_name[j++]); + if (!isvfat) + ptname[i] = '.'; + i++; + + /* Extension */ + for (k = 8; k < MSDOS_NAME;) { + c = work[k]; + if (!c) + break; + chl = fat_shortname2uni(nls_disk, &work[k], MSDOS_NAME - k, + &uni_name[j++], opt_shortname, + de->lcase & CASE_LOWER_EXT); + if (chl <= 1) { + k++; + if (!isvfat) + ptname[i] = nocase ? c : fat_tolower(c); + i++; + if (c != ' ') { + name_len = i; + uni_len = j; + } + } else { + uni_len = j; + if (isvfat) { + int offset = min(chl, MSDOS_NAME-k); + k += offset; + i += offset; + } else { + for (chi = 0; chi < chl && k < MSDOS_NAME; + chi++, i++, k++) { + ptname[i] = work[k]; + } + } + if (chl) + name_len = i; + } + } + + if (name_len > 0) { + name_len += dotoffset; + + if (sbi->options.isvfat) { + uni_name[uni_len] = 0x0000; + name_len = fat_uni_to_x8(sb, uni_name, name, + FAT_MAX_SHORT_SIZE); + } + } + + return name_len; +} + +/* + * Return values: negative -> error/not found, 0 -> found. + */ +int fat_search_long(struct inode *inode, const unsigned char *name, + int name_len, struct fat_slot_info *sinfo) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de; + unsigned char nr_slots; + wchar_t *unicode = NULL; + unsigned char bufname[FAT_MAX_SHORT_SIZE]; + loff_t cpos = 0; + int err, len; + + err = -ENOENT; + while (1) { + if (fat_get_entry(inode, &cpos, &bh, &de) == -1) + goto end_of_dir; +parse_record: + nr_slots = 0; + if (de->name[0] == DELETED_FLAG) + continue; + if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) + continue; + if (de->attr != ATTR_EXT && IS_FREE(de->name)) + continue; + if (de->attr == ATTR_EXT) { + int status = fat_parse_long(inode, &cpos, &bh, &de, + &unicode, &nr_slots); + if (status < 0) { + err = status; + goto end_of_dir; + } else if (status == PARSE_INVALID) + continue; + else if (status == PARSE_NOT_LONGNAME) + goto parse_record; + else if (status == PARSE_EOF) + goto end_of_dir; + } + + /* Never prepend '.' to hidden files here. + * That is done only for msdos mounts (and only when + * 'dotsOK=yes'); if we are executing here, it is in the + * context of a vfat mount. + */ + len = fat_parse_short(sb, de, bufname, 0); + if (len == 0) + continue; + + /* Compare shortname */ + if (fat_name_match(sbi, name, name_len, bufname, len)) + goto found; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + + /* Compare longname */ + len = fat_uni_to_x8(sb, unicode, longname, size); + if (fat_name_match(sbi, name, name_len, longname, len)) + goto found; + } + } + +found: + nr_slots++; /* include the de */ + sinfo->slot_off = cpos - nr_slots * sizeof(*de); + sinfo->nr_slots = nr_slots; + sinfo->de = de; + sinfo->bh = bh; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + err = 0; +end_of_dir: + if (unicode) + __putname(unicode); + + return err; +} +EXPORT_SYMBOL_GPL(fat_search_long); + +struct fat_ioctl_filldir_callback { + struct dir_context ctx; + void __user *dirent; + int result; + /* for dir ioctl */ + const char *longname; + int long_len; + const char *shortname; + int short_len; +}; + +static int __fat_readdir(struct inode *inode, struct file *file, + struct dir_context *ctx, int short_only, + struct fat_ioctl_filldir_callback *both) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct msdos_dir_entry *de; + unsigned char nr_slots; + wchar_t *unicode = NULL; + unsigned char bufname[FAT_MAX_SHORT_SIZE]; + int isvfat = sbi->options.isvfat; + const char *fill_name = NULL; + int fake_offset = 0; + loff_t cpos; + int short_len = 0, fill_len = 0; + int ret = 0; + + mutex_lock(&sbi->s_lock); + + cpos = ctx->pos; + /* Fake . and .. for the root directory. */ + if (inode->i_ino == MSDOS_ROOT_INO) { + if (!dir_emit_dots(file, ctx)) + goto out; + if (ctx->pos == 2) { + fake_offset = 1; + cpos = 0; + } + } + if (cpos & (sizeof(struct msdos_dir_entry) - 1)) { + ret = -ENOENT; + goto out; + } + + bh = NULL; +get_new: + if (fat_get_entry(inode, &cpos, &bh, &de) == -1) + goto end_of_dir; +parse_record: + nr_slots = 0; + /* + * Check for long filename entry, but if short_only, we don't + * need to parse long filename. + */ + if (isvfat && !short_only) { + if (de->name[0] == DELETED_FLAG) + goto record_end; + if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) + goto record_end; + if (de->attr != ATTR_EXT && IS_FREE(de->name)) + goto record_end; + } else { + if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) + goto record_end; + } + + if (isvfat && de->attr == ATTR_EXT) { + int status = fat_parse_long(inode, &cpos, &bh, &de, + &unicode, &nr_slots); + if (status < 0) { + ctx->pos = cpos; + ret = status; + goto out; + } else if (status == PARSE_INVALID) + goto record_end; + else if (status == PARSE_NOT_LONGNAME) + goto parse_record; + else if (status == PARSE_EOF) + goto end_of_dir; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + int len = fat_uni_to_x8(sb, unicode, longname, size); + + fill_name = longname; + fill_len = len; + /* !both && !short_only, so we don't need shortname. */ + if (!both) + goto start_filldir; + + short_len = fat_parse_short(sb, de, bufname, + sbi->options.dotsOK); + if (short_len == 0) + goto record_end; + /* hack for fat_ioctl_filldir() */ + both->longname = fill_name; + both->long_len = fill_len; + both->shortname = bufname; + both->short_len = short_len; + fill_name = NULL; + fill_len = 0; + goto start_filldir; + } + } + + short_len = fat_parse_short(sb, de, bufname, sbi->options.dotsOK); + if (short_len == 0) + goto record_end; + + fill_name = bufname; + fill_len = short_len; + +start_filldir: + if (!fake_offset) + ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); + + if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) { + if (!dir_emit_dot(file, ctx)) + goto fill_failed; + } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { + if (!dir_emit_dotdot(file, ctx)) + goto fill_failed; + } else { + unsigned long inum; + loff_t i_pos = fat_make_i_pos(sb, bh, de); + struct inode *tmp = fat_iget(sb, i_pos); + if (tmp) { + inum = tmp->i_ino; + iput(tmp); + } else + inum = iunique(sb, MSDOS_ROOT_INO); + if (!dir_emit(ctx, fill_name, fill_len, inum, + (de->attr & ATTR_DIR) ? DT_DIR : DT_REG)) + goto fill_failed; + } + +record_end: + fake_offset = 0; + ctx->pos = cpos; + goto get_new; +end_of_dir: + ctx->pos = cpos; +fill_failed: + brelse(bh); + if (unicode) + __putname(unicode); +out: + mutex_unlock(&sbi->s_lock); + return ret; +} + +static int fat_readdir(struct file *file, struct dir_context *ctx) +{ + return __fat_readdir(file_inode(file), file, ctx, 0, NULL); +} + +#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ +static int func(struct dir_context *ctx, const char *name, int name_len, \ + loff_t offset, u64 ino, unsigned int d_type) \ +{ \ + struct fat_ioctl_filldir_callback *buf = \ + container_of(ctx, struct fat_ioctl_filldir_callback, ctx); \ + struct dirent_type __user *d1 = buf->dirent; \ + struct dirent_type __user *d2 = d1 + 1; \ + \ + if (buf->result) \ + return -EINVAL; \ + buf->result++; \ + \ + if (name != NULL) { \ + /* dirent has only short name */ \ + if (name_len >= sizeof(d1->d_name)) \ + name_len = sizeof(d1->d_name) - 1; \ + \ + if (put_user(0, d2->d_name) || \ + put_user(0, &d2->d_reclen) || \ + copy_to_user(d1->d_name, name, name_len) || \ + put_user(0, d1->d_name + name_len) || \ + put_user(name_len, &d1->d_reclen)) \ + goto efault; \ + } else { \ + /* dirent has short and long name */ \ + const char *longname = buf->longname; \ + int long_len = buf->long_len; \ + const char *shortname = buf->shortname; \ + int short_len = buf->short_len; \ + \ + if (long_len >= sizeof(d1->d_name)) \ + long_len = sizeof(d1->d_name) - 1; \ + if (short_len >= sizeof(d1->d_name)) \ + short_len = sizeof(d1->d_name) - 1; \ + \ + if (copy_to_user(d2->d_name, longname, long_len) || \ + put_user(0, d2->d_name + long_len) || \ + put_user(long_len, &d2->d_reclen) || \ + put_user(ino, &d2->d_ino) || \ + put_user(offset, &d2->d_off) || \ + copy_to_user(d1->d_name, shortname, short_len) || \ + put_user(0, d1->d_name + short_len) || \ + put_user(short_len, &d1->d_reclen)) \ + goto efault; \ + } \ + return 0; \ +efault: \ + buf->result = -EFAULT; \ + return -EFAULT; \ +} + +FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) + +static int fat_ioctl_readdir(struct inode *inode, struct file *file, + void __user *dirent, filldir_t filldir, + int short_only, int both) +{ + struct fat_ioctl_filldir_callback buf = { + .ctx.actor = filldir, + .dirent = dirent + }; + int ret; + + buf.dirent = dirent; + buf.result = 0; + mutex_lock(&inode->i_mutex); + buf.ctx.pos = file->f_pos; + ret = -ENOENT; + if (!IS_DEADDIR(inode)) { + ret = __fat_readdir(inode, file, &buf.ctx, + short_only, both ? &buf : NULL); + file->f_pos = buf.ctx.pos; + } + mutex_unlock(&inode->i_mutex); + if (ret >= 0) + ret = buf.result; + return ret; +} + +static long fat_dir_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; + int short_only, both; + + switch (cmd) { + case VFAT_IOCTL_READDIR_SHORT: + short_only = 1; + both = 0; + break; + case VFAT_IOCTL_READDIR_BOTH: + short_only = 0; + both = 1; + break; + default: + return fat_generic_ioctl(filp, cmd, arg); + } + + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) + return -EFAULT; + /* + * Yes, we don't need this put_user() absolutely. However old + * code didn't return the right value. So, app use this value, + * in order to check whether it is EOF. + */ + if (put_user(0, &d1->d_reclen)) + return -EFAULT; + + return fat_ioctl_readdir(inode, filp, d1, fat_ioctl_filldir, + short_only, both); +} + +#ifdef CONFIG_COMPAT +#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) +#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) + +FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent) + +static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct compat_dirent __user *d1 = compat_ptr(arg); + int short_only, both; + + switch (cmd) { + case VFAT_IOCTL_READDIR_SHORT32: + short_only = 1; + both = 0; + break; + case VFAT_IOCTL_READDIR_BOTH32: + short_only = 0; + both = 1; + break; + default: + return fat_generic_ioctl(filp, cmd, (unsigned long)arg); + } + + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) + return -EFAULT; + /* + * Yes, we don't need this put_user() absolutely. However old + * code didn't return the right value. So, app use this value, + * in order to check whether it is EOF. + */ + if (put_user(0, &d1->d_reclen)) + return -EFAULT; + + return fat_ioctl_readdir(inode, filp, d1, fat_compat_ioctl_filldir, + short_only, both); +} +#endif /* CONFIG_COMPAT */ + +const struct file_operations fat_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = fat_readdir, + .unlocked_ioctl = fat_dir_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_compat_dir_ioctl, +#endif + .fsync = fat_file_fsync, +}; + +static int fat_get_short_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + while (fat_get_entry(dir, pos, bh, de) >= 0) { + /* free entry or long name entry or volume label */ + if (!IS_FREE((*de)->name) && !((*de)->attr & ATTR_VOLUME)) + return 0; + } + return -ENOENT; +} + +/* + * The ".." entry can not provide the "struct fat_slot_info" information + * for inode, nor a usable i_pos. So, this function provides some information + * only. + * + * Since this function walks through the on-disk inodes within a directory, + * callers are responsible for taking any locks necessary to prevent the + * directory from changing. + */ +int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + loff_t offset = 0; + + *de = NULL; + while (fat_get_short_entry(dir, &offset, bh, de) >= 0) { + if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); + +/* See if directory is empty */ +int fat_dir_empty(struct inode *dir) +{ + struct buffer_head *bh; + struct msdos_dir_entry *de; + loff_t cpos; + int result = 0; + + bh = NULL; + cpos = 0; + while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) { + if (strncmp(de->name, MSDOS_DOT , MSDOS_NAME) && + strncmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { + result = -ENOTEMPTY; + break; + } + } + brelse(bh); + return result; +} +EXPORT_SYMBOL_GPL(fat_dir_empty); + +/* + * fat_subdirs counts the number of sub-directories of dir. It can be run + * on directories being created. + */ +int fat_subdirs(struct inode *dir) +{ + struct buffer_head *bh; + struct msdos_dir_entry *de; + loff_t cpos; + int count = 0; + + bh = NULL; + cpos = 0; + while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) { + if (de->attr & ATTR_DIR) + count++; + } + brelse(bh); + return count; +} + +/* + * Scans a directory for a given file (name points to its formatted name). + * Returns an error code or zero. + */ +int fat_scan(struct inode *dir, const unsigned char *name, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + + sinfo->slot_off = 0; + sinfo->bh = NULL; + while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, + &sinfo->de) >= 0) { + if (!strncmp(sinfo->de->name, name, MSDOS_NAME)) { + sinfo->slot_off -= sizeof(*sinfo->de); + sinfo->nr_slots = 1; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + return 0; + } + } + return -ENOENT; +} +EXPORT_SYMBOL_GPL(fat_scan); + +/* + * Scans a directory for a given logstart. + * Returns an error code or zero. + */ +int fat_scan_logstart(struct inode *dir, int i_logstart, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + + sinfo->slot_off = 0; + sinfo->bh = NULL; + while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, + &sinfo->de) >= 0) { + if (fat_get_start(MSDOS_SB(sb), sinfo->de) == i_logstart) { + sinfo->slot_off -= sizeof(*sinfo->de); + sinfo->nr_slots = 1; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + return 0; + } + } + return -ENOENT; +} + +static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct msdos_dir_entry *de, *endp; + int err = 0, orig_slots; + + while (nr_slots) { + bh = NULL; + if (fat_get_entry(dir, &pos, &bh, &de) < 0) { + err = -EIO; + break; + } + + orig_slots = nr_slots; + endp = (struct msdos_dir_entry *)(bh->b_data + sb->s_blocksize); + while (nr_slots && de < endp) { + de->name[0] = DELETED_FLAG; + de++; + nr_slots--; + } + mark_buffer_dirty_inode(bh, dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bh); + brelse(bh); + if (err) + break; + + /* pos is *next* de's position, so this does `- sizeof(de)' */ + pos += ((orig_slots - nr_slots) * sizeof(*de)) - sizeof(*de); + } + + return err; +} + +int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + struct msdos_dir_entry *de; + struct buffer_head *bh; + int err = 0, nr_slots; + + /* + * First stage: Remove the shortname. By this, the directory + * entry is removed. + */ + nr_slots = sinfo->nr_slots; + de = sinfo->de; + sinfo->de = NULL; + bh = sinfo->bh; + sinfo->bh = NULL; + while (nr_slots && de >= (struct msdos_dir_entry *)bh->b_data) { + de->name[0] = DELETED_FLAG; + de--; + nr_slots--; + } + mark_buffer_dirty_inode(bh, dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bh); + brelse(bh); + if (err) + return err; + dir->i_version++; + + if (nr_slots) { + /* + * Second stage: remove the remaining longname slots. + * (This directory entry is already removed, and so return + * the success) + */ + err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots); + if (err) { + fat_msg(sb, KERN_WARNING, + "Couldn't remove the long name slots"); + } + } + + dir->i_mtime = dir->i_atime = CURRENT_TIME_SEC; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); + + return 0; +} +EXPORT_SYMBOL_GPL(fat_remove_entries); + +static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, + struct buffer_head **bhs, int nr_bhs) +{ + struct super_block *sb = dir->i_sb; + sector_t last_blknr = blknr + MSDOS_SB(sb)->sec_per_clus; + int err, i, n; + + /* Zeroing the unused blocks on this cluster */ + blknr += nr_used; + n = nr_used; + while (blknr < last_blknr) { + bhs[n] = sb_getblk(sb, blknr); + if (!bhs[n]) { + err = -ENOMEM; + goto error; + } + memset(bhs[n]->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); + + n++; + blknr++; + if (n == nr_bhs) { + if (IS_DIRSYNC(dir)) { + err = fat_sync_bhs(bhs, n); + if (err) + goto error; + } + for (i = 0; i < n; i++) + brelse(bhs[i]); + n = 0; + } + } + if (IS_DIRSYNC(dir)) { + err = fat_sync_bhs(bhs, n); + if (err) + goto error; + } + for (i = 0; i < n; i++) + brelse(bhs[i]); + + return 0; + +error: + for (i = 0; i < n; i++) + bforget(bhs[i]); + return err; +} + +int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + struct msdos_dir_entry *de; + sector_t blknr; + __le16 date, time; + u8 time_cs; + int err, cluster; + + err = fat_alloc_clusters(dir, &cluster, 1); + if (err) + goto error; + + blknr = fat_clus_to_blknr(sbi, cluster); + bhs[0] = sb_getblk(sb, blknr); + if (!bhs[0]) { + err = -ENOMEM; + goto error_free; + } + + fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); + + de = (struct msdos_dir_entry *)bhs[0]->b_data; + /* filling the new directory slots ("." and ".." entries) */ + memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); + memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); + de->attr = de[1].attr = ATTR_DIR; + de[0].lcase = de[1].lcase = 0; + de[0].time = de[1].time = time; + de[0].date = de[1].date = date; + if (sbi->options.isvfat) { + /* extra timestamps */ + de[0].ctime = de[1].ctime = time; + de[0].ctime_cs = de[1].ctime_cs = time_cs; + de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date; + } else { + de[0].ctime = de[1].ctime = 0; + de[0].ctime_cs = de[1].ctime_cs = 0; + de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0; + } + fat_set_start(&de[0], cluster); + fat_set_start(&de[1], MSDOS_I(dir)->i_logstart); + de[0].size = de[1].size = 0; + memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); + set_buffer_uptodate(bhs[0]); + mark_buffer_dirty_inode(bhs[0], dir); + + err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); + if (err) + goto error_free; + + return cluster; + +error_free: + fat_free_clusters(dir, cluster); +error: + return err; +} +EXPORT_SYMBOL_GPL(fat_alloc_new_dir); + +static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, + int *nr_cluster, struct msdos_dir_entry **de, + struct buffer_head **bh, loff_t *i_pos) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + sector_t blknr, start_blknr, last_blknr; + unsigned long size, copy; + int err, i, n, offset, cluster[2]; + + /* + * The minimum cluster size is 512bytes, and maximum entry + * size is 32*slots (672bytes). So, iff the cluster size is + * 512bytes, we may need two clusters. + */ + size = nr_slots * sizeof(struct msdos_dir_entry); + *nr_cluster = (size + (sbi->cluster_size - 1)) >> sbi->cluster_bits; + BUG_ON(*nr_cluster > 2); + + err = fat_alloc_clusters(dir, cluster, *nr_cluster); + if (err) + goto error; + + /* + * First stage: Fill the directory entry. NOTE: This cluster + * is not referenced from any inode yet, so updates order is + * not important. + */ + i = n = copy = 0; + do { + start_blknr = blknr = fat_clus_to_blknr(sbi, cluster[i]); + last_blknr = start_blknr + sbi->sec_per_clus; + while (blknr < last_blknr) { + bhs[n] = sb_getblk(sb, blknr); + if (!bhs[n]) { + err = -ENOMEM; + goto error_nomem; + } + + /* fill the directory entry */ + copy = min(size, sb->s_blocksize); + memcpy(bhs[n]->b_data, slots, copy); + slots += copy; + size -= copy; + set_buffer_uptodate(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); + if (!size) + break; + n++; + blknr++; + } + } while (++i < *nr_cluster); + + memset(bhs[n]->b_data + copy, 0, sb->s_blocksize - copy); + offset = copy - sizeof(struct msdos_dir_entry); + get_bh(bhs[n]); + *bh = bhs[n]; + *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); + *i_pos = fat_make_i_pos(sb, *bh, *de); + + /* Second stage: clear the rest of cluster, and write outs */ + err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE); + if (err) + goto error_free; + + return cluster[0]; + +error_free: + brelse(*bh); + *bh = NULL; + n = 0; +error_nomem: + for (i = 0; i < n; i++) + bforget(bhs[i]); + fat_free_clusters(dir, cluster[0]); +error: + return err; +} + +int fat_add_entries(struct inode *dir, void *slots, int nr_slots, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ + struct msdos_dir_entry *uninitialized_var(de); + int err, free_slots, i, nr_bhs; + loff_t pos, i_pos; + + sinfo->nr_slots = nr_slots; + + /* First stage: search free directory entries */ + free_slots = nr_bhs = 0; + bh = prev = NULL; + pos = 0; + err = -ENOSPC; + while (fat_get_entry(dir, &pos, &bh, &de) > -1) { + /* check the maximum size of directory */ + if (pos >= FAT_MAX_DIR_SIZE) + goto error; + + if (IS_FREE(de->name)) { + if (prev != bh) { + get_bh(bh); + bhs[nr_bhs] = prev = bh; + nr_bhs++; + } + free_slots++; + if (free_slots == nr_slots) + goto found; + } else { + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + prev = NULL; + free_slots = nr_bhs = 0; + } + } + if (dir->i_ino == MSDOS_ROOT_INO) { + if (sbi->fat_bits != 32) + goto error; + } else if (MSDOS_I(dir)->i_start == 0) { + fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)", + MSDOS_I(dir)->i_pos); + err = -EIO; + goto error; + } + +found: + err = 0; + pos -= free_slots * sizeof(*de); + nr_slots -= free_slots; + if (free_slots) { + /* + * Second stage: filling the free entries with new entries. + * NOTE: If this slots has shortname, first, we write + * the long name slots, then write the short name. + */ + int size = free_slots * sizeof(*de); + int offset = pos & (sb->s_blocksize - 1); + int long_bhs = nr_bhs - (nr_slots == 0); + + /* Fill the long name slots. */ + for (i = 0; i < long_bhs; i++) { + int copy = min_t(int, sb->s_blocksize - offset, size); + memcpy(bhs[i]->b_data + offset, slots, copy); + mark_buffer_dirty_inode(bhs[i], dir); + offset = 0; + slots += copy; + size -= copy; + } + if (long_bhs && IS_DIRSYNC(dir)) + err = fat_sync_bhs(bhs, long_bhs); + if (!err && i < nr_bhs) { + /* Fill the short name slot. */ + int copy = min_t(int, sb->s_blocksize - offset, size); + memcpy(bhs[i]->b_data + offset, slots, copy); + mark_buffer_dirty_inode(bhs[i], dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bhs[i]); + } + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + if (err) + goto error_remove; + } + + if (nr_slots) { + int cluster, nr_cluster; + + /* + * Third stage: allocate the cluster for new entries. + * And initialize the cluster with new entries, then + * add the cluster to dir. + */ + cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster, + &de, &bh, &i_pos); + if (cluster < 0) { + err = cluster; + goto error_remove; + } + err = fat_chain_add(dir, cluster, nr_cluster); + if (err) { + fat_free_clusters(dir, cluster); + goto error_remove; + } + if (dir->i_size & (sbi->cluster_size - 1)) { + fat_fs_error(sb, "Odd directory size"); + dir->i_size = (dir->i_size + sbi->cluster_size - 1) + & ~((loff_t)sbi->cluster_size - 1); + } + dir->i_size += nr_cluster << sbi->cluster_bits; + MSDOS_I(dir)->mmu_private += nr_cluster << sbi->cluster_bits; + } + sinfo->slot_off = pos; + sinfo->de = de; + sinfo->bh = bh; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + + return 0; + +error: + brelse(bh); + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + return err; + +error_remove: + brelse(bh); + if (free_slots) + __fat_remove_entries(dir, pos, free_slots); + return err; +} +EXPORT_SYMBOL_GPL(fat_add_entries); diff --git a/kernel/fs/fat/fat.h b/kernel/fs/fat/fat.h new file mode 100644 index 000000000..be5e15323 --- /dev/null +++ b/kernel/fs/fat/fat.h @@ -0,0 +1,420 @@ +#ifndef _FAT_H +#define _FAT_H + +#include +#include +#include +#include +#include + +/* + * vfat shortname flags + */ +#define VFAT_SFN_DISPLAY_LOWER 0x0001 /* convert to lowercase for display */ +#define VFAT_SFN_DISPLAY_WIN95 0x0002 /* emulate win95 rule for display */ +#define VFAT_SFN_DISPLAY_WINNT 0x0004 /* emulate winnt rule for display */ +#define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */ +#define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */ + +#define FAT_ERRORS_CONT 1 /* ignore error and continue */ +#define FAT_ERRORS_PANIC 2 /* panic on error */ +#define FAT_ERRORS_RO 3 /* remount r/o on error */ + +#define FAT_NFS_STALE_RW 1 /* NFS RW support, can cause ESTALE */ +#define FAT_NFS_NOSTALE_RO 2 /* NFS RO support, no ESTALE issue */ + +struct fat_mount_options { + kuid_t fs_uid; + kgid_t fs_gid; + unsigned short fs_fmask; + unsigned short fs_dmask; + unsigned short codepage; /* Codepage for shortname conversions */ + int time_offset; /* Offset of timestamps from UTC (in minutes) */ + char *iocharset; /* Charset used for filename input/display */ + unsigned short shortname; /* flags for shortname display/create rule */ + unsigned char name_check; /* r = relaxed, n = normal, s = strict */ + unsigned char errors; /* On error: continue, panic, remount-ro */ + unsigned char nfs; /* NFS support: nostale_ro, stale_rw */ + unsigned short allow_utime;/* permission for setting the [am]time */ + unsigned quiet:1, /* set = fake successful chmods and chowns */ + showexec:1, /* set = only set x bit for com/exe/bat */ + sys_immutable:1, /* set = system files are immutable */ + dotsOK:1, /* set = hidden and system files are named '.filename' */ + isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ + utf8:1, /* Use of UTF-8 character set (Default) */ + unicode_xlate:1, /* create escape sequences for unhandled Unicode */ + numtail:1, /* Does first alias have a numeric '~1' type tail? */ + flush:1, /* write things quickly */ + nocase:1, /* Does this need case conversion? 0=need case conversion*/ + usefree:1, /* Use free_clusters for FAT32 */ + tz_set:1, /* Filesystem timestamps' offset set */ + rodir:1, /* allow ATTR_RO for directory */ + discard:1, /* Issue discard requests on deletions */ + dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */ +}; + +#define FAT_HASH_BITS 8 +#define FAT_HASH_SIZE (1UL << FAT_HASH_BITS) + +/* + * MS-DOS file system in-core superblock data + */ +struct msdos_sb_info { + unsigned short sec_per_clus; /* sectors/cluster */ + unsigned short cluster_bits; /* log2(cluster_size) */ + unsigned int cluster_size; /* cluster size */ + unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */ + unsigned short fat_start; + unsigned long fat_length; /* FAT start & length (sec.) */ + unsigned long dir_start; + unsigned short dir_entries; /* root dir start & entries */ + unsigned long data_start; /* first data sector */ + unsigned long max_cluster; /* maximum cluster number */ + unsigned long root_cluster; /* first cluster of the root directory */ + unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ + struct mutex fat_lock; + struct mutex nfs_build_inode_lock; + struct mutex s_lock; + unsigned int prev_free; /* previously allocated cluster number */ + unsigned int free_clusters; /* -1 if undefined */ + unsigned int free_clus_valid; /* is free_clusters valid? */ + struct fat_mount_options options; + struct nls_table *nls_disk; /* Codepage used on disk */ + struct nls_table *nls_io; /* Charset used for input and display */ + const void *dir_ops; /* Opaque; default directory operations */ + int dir_per_block; /* dir entries per block */ + int dir_per_block_bits; /* log2(dir_per_block) */ + unsigned int vol_id; /*volume ID*/ + + int fatent_shift; + struct fatent_operations *fatent_ops; + struct inode *fat_inode; + struct inode *fsinfo_inode; + + struct ratelimit_state ratelimit; + + spinlock_t inode_hash_lock; + struct hlist_head inode_hashtable[FAT_HASH_SIZE]; + + spinlock_t dir_hash_lock; + struct hlist_head dir_hashtable[FAT_HASH_SIZE]; + + unsigned int dirty; /* fs state before mount */ + struct rcu_head rcu; +}; + +#define FAT_CACHE_VALID 0 /* special case for valid cache */ + +/* + * MS-DOS file system inode data in memory + */ +struct msdos_inode_info { + spinlock_t cache_lru_lock; + struct list_head cache_lru; + int nr_caches; + /* for avoiding the race between fat_free() and fat_get_cluster() */ + unsigned int cache_valid_id; + + /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */ + loff_t mmu_private; /* physically allocated size */ + + int i_start; /* first cluster or 0 */ + int i_logstart; /* logical first cluster */ + int i_attrs; /* unused attribute bits */ + loff_t i_pos; /* on-disk position of directory entry or 0 */ + struct hlist_node i_fat_hash; /* hash by i_location */ + struct hlist_node i_dir_hash; /* hash by i_logstart */ + struct rw_semaphore truncate_lock; /* protect bmap against truncate */ + struct inode vfs_inode; +}; + +struct fat_slot_info { + loff_t i_pos; /* on-disk position of directory entry */ + loff_t slot_off; /* offset for slot or de start */ + int nr_slots; /* number of slots + 1(de) in filename */ + struct msdos_dir_entry *de; + struct buffer_head *bh; +}; + +static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) +{ + return container_of(inode, struct msdos_inode_info, vfs_inode); +} + +/* + * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to + * save ATTR_RO instead of ->i_mode. + * + * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only + * bit, it's just used as flag for app. + */ +static inline int fat_mode_can_hold_ro(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + umode_t mask; + + if (S_ISDIR(inode->i_mode)) { + if (!sbi->options.rodir) + return 0; + mask = ~sbi->options.fs_dmask; + } else + mask = ~sbi->options.fs_fmask; + + if (!(mask & S_IWUGO)) + return 0; + return 1; +} + +/* Convert attribute bits and a mask to the UNIX mode. */ +static inline umode_t fat_make_mode(struct msdos_sb_info *sbi, + u8 attrs, umode_t mode) +{ + if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir)) + mode &= ~S_IWUGO; + + if (attrs & ATTR_DIR) + return (mode & ~sbi->options.fs_dmask) | S_IFDIR; + else + return (mode & ~sbi->options.fs_fmask) | S_IFREG; +} + +/* Return the FAT attribute byte for this inode */ +static inline u8 fat_make_attrs(struct inode *inode) +{ + u8 attrs = MSDOS_I(inode)->i_attrs; + if (S_ISDIR(inode->i_mode)) + attrs |= ATTR_DIR; + if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO)) + attrs |= ATTR_RO; + return attrs; +} + +static inline void fat_save_attrs(struct inode *inode, u8 attrs) +{ + if (fat_mode_can_hold_ro(inode)) + MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED; + else + MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO); +} + +static inline unsigned char fat_checksum(const __u8 *name) +{ + unsigned char s = name[0]; + s = (s<<7) + (s>>1) + name[1]; s = (s<<7) + (s>>1) + name[2]; + s = (s<<7) + (s>>1) + name[3]; s = (s<<7) + (s>>1) + name[4]; + s = (s<<7) + (s>>1) + name[5]; s = (s<<7) + (s>>1) + name[6]; + s = (s<<7) + (s>>1) + name[7]; s = (s<<7) + (s>>1) + name[8]; + s = (s<<7) + (s>>1) + name[9]; s = (s<<7) + (s>>1) + name[10]; + return s; +} + +static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus) +{ + return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus + + sbi->data_start; +} + +static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi, + loff_t i_pos, sector_t *blknr, int *offset) +{ + *blknr = i_pos >> sbi->dir_per_block_bits; + *offset = i_pos & (sbi->dir_per_block - 1); +} + +static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, + struct inode *inode) +{ + loff_t i_pos; +#if BITS_PER_LONG == 32 + spin_lock(&sbi->inode_hash_lock); +#endif + i_pos = MSDOS_I(inode)->i_pos; +#if BITS_PER_LONG == 32 + spin_unlock(&sbi->inode_hash_lock); +#endif + return i_pos; +} + +static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) +{ +#ifdef __BIG_ENDIAN + while (len--) { + *dst++ = src[0] | (src[1] << 8); + src += 2; + } +#else + memcpy(dst, src, len * 2); +#endif +} + +static inline int fat_get_start(const struct msdos_sb_info *sbi, + const struct msdos_dir_entry *de) +{ + int cluster = le16_to_cpu(de->start); + if (sbi->fat_bits == 32) + cluster |= (le16_to_cpu(de->starthi) << 16); + return cluster; +} + +static inline void fat_set_start(struct msdos_dir_entry *de, int cluster) +{ + de->start = cpu_to_le16(cluster); + de->starthi = cpu_to_le16(cluster >> 16); +} + +static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) +{ +#ifdef __BIG_ENDIAN + while (len--) { + dst[0] = *src & 0x00FF; + dst[1] = (*src & 0xFF00) >> 8; + dst += 2; + src++; + } +#else + memcpy(dst, src, len * 2); +#endif +} + +/* fat/cache.c */ +extern void fat_cache_inval_inode(struct inode *inode); +extern int fat_get_cluster(struct inode *inode, int cluster, + int *fclus, int *dclus); +extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create); + +/* fat/dir.c */ +extern const struct file_operations fat_dir_operations; +extern int fat_search_long(struct inode *inode, const unsigned char *name, + int name_len, struct fat_slot_info *sinfo); +extern int fat_dir_empty(struct inode *dir); +extern int fat_subdirs(struct inode *dir); +extern int fat_scan(struct inode *dir, const unsigned char *name, + struct fat_slot_info *sinfo); +extern int fat_scan_logstart(struct inode *dir, int i_logstart, + struct fat_slot_info *sinfo); +extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, + struct msdos_dir_entry **de); +extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); +extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, + struct fat_slot_info *sinfo); +extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo); + +/* fat/fatent.c */ +struct fat_entry { + int entry; + union { + u8 *ent12_p[2]; + __le16 *ent16_p; + __le32 *ent32_p; + } u; + int nr_bhs; + struct buffer_head *bhs[2]; + struct inode *fat_inode; +}; + +static inline void fatent_init(struct fat_entry *fatent) +{ + fatent->nr_bhs = 0; + fatent->entry = 0; + fatent->u.ent32_p = NULL; + fatent->bhs[0] = fatent->bhs[1] = NULL; + fatent->fat_inode = NULL; +} + +static inline void fatent_set_entry(struct fat_entry *fatent, int entry) +{ + fatent->entry = entry; + fatent->u.ent32_p = NULL; +} + +static inline void fatent_brelse(struct fat_entry *fatent) +{ + int i; + fatent->u.ent32_p = NULL; + for (i = 0; i < fatent->nr_bhs; i++) + brelse(fatent->bhs[i]); + fatent->nr_bhs = 0; + fatent->bhs[0] = fatent->bhs[1] = NULL; + fatent->fat_inode = NULL; +} + +extern void fat_ent_access_init(struct super_block *sb); +extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent, + int entry); +extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent, + int new, int wait); +extern int fat_alloc_clusters(struct inode *inode, int *cluster, + int nr_cluster); +extern int fat_free_clusters(struct inode *inode, int cluster); +extern int fat_count_free_clusters(struct super_block *sb); + +/* fat/file.c */ +extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); +extern const struct file_operations fat_file_operations; +extern const struct inode_operations fat_file_inode_operations; +extern int fat_setattr(struct dentry *dentry, struct iattr *attr); +extern void fat_truncate_blocks(struct inode *inode, loff_t offset); +extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync); + +/* fat/inode.c */ +extern int fat_block_truncate_page(struct inode *inode, loff_t from); +extern void fat_attach(struct inode *inode, loff_t i_pos); +extern void fat_detach(struct inode *inode); +extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); +extern struct inode *fat_build_inode(struct super_block *sb, + struct msdos_dir_entry *de, loff_t i_pos); +extern int fat_sync_inode(struct inode *inode); +extern int fat_fill_super(struct super_block *sb, void *data, int silent, + int isvfat, void (*setup)(struct super_block *)); +extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de); + +extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, + struct inode *i2); +static inline unsigned long fat_dir_hash(int logstart) +{ + return hash_32(logstart, FAT_HASH_BITS); +} + +/* fat/misc.c */ +extern __printf(3, 4) __cold +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); +#define fat_fs_error(sb, fmt, args...) \ + __fat_fs_error(sb, 1, fmt , ## args) +#define fat_fs_error_ratelimit(sb, fmt, args...) \ + __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) +__printf(3, 4) __cold +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); +#define fat_msg_ratelimit(sb, level, fmt, args...) \ + do { \ + if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \ + fat_msg(sb, level, fmt, ## args); \ + } while (0) +extern int fat_clusters_flush(struct super_block *sb); +extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); +extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 __time, __le16 __date, u8 time_cs); +extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 *time, __le16 *date, u8 *time_cs); +extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); + +int fat_cache_init(void); +void fat_cache_destroy(void); + +/* fat/nfs.c */ +extern const struct export_operations fat_export_ops; +extern const struct export_operations fat_export_ops_nostale; + +/* helper for printk */ +typedef unsigned long long llu; + +#endif /* !_FAT_H */ diff --git a/kernel/fs/fat/fatent.c b/kernel/fs/fat/fatent.c new file mode 100644 index 000000000..822655713 --- /dev/null +++ b/kernel/fs/fat/fatent.c @@ -0,0 +1,692 @@ +/* + * Copyright (C) 2004, OGAWA Hirofumi + * Released under GPL v2. + */ + +#include +#include "fat.h" + +struct fatent_operations { + void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); + void (*ent_set_ptr)(struct fat_entry *, int); + int (*ent_bread)(struct super_block *, struct fat_entry *, + int, sector_t); + int (*ent_get)(struct fat_entry *); + void (*ent_put)(struct fat_entry *, int); + int (*ent_next)(struct fat_entry *); +}; + +static DEFINE_SPINLOCK(fat12_entry_lock); + +static void fat12_ent_blocknr(struct super_block *sb, int entry, + int *offset, sector_t *blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int bytes = entry + (entry >> 1); + WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + *offset = bytes & (sb->s_blocksize - 1); + *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); +} + +static void fat_ent_blocknr(struct super_block *sb, int entry, + int *offset, sector_t *blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int bytes = (entry << sbi->fatent_shift); + WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + *offset = bytes & (sb->s_blocksize - 1); + *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); +} + +static void fat12_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + struct buffer_head **bhs = fatent->bhs; + if (fatent->nr_bhs == 1) { + WARN_ON(offset >= (bhs[0]->b_size - 1)); + fatent->u.ent12_p[0] = bhs[0]->b_data + offset; + fatent->u.ent12_p[1] = bhs[0]->b_data + (offset + 1); + } else { + WARN_ON(offset != (bhs[0]->b_size - 1)); + fatent->u.ent12_p[0] = bhs[0]->b_data + offset; + fatent->u.ent12_p[1] = bhs[1]->b_data; + } +} + +static void fat16_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + WARN_ON(offset & (2 - 1)); + fatent->u.ent16_p = (__le16 *)(fatent->bhs[0]->b_data + offset); +} + +static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + WARN_ON(offset & (4 - 1)); + fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset); +} + +static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct buffer_head **bhs = fatent->bhs; + + WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); + fatent->fat_inode = MSDOS_SB(sb)->fat_inode; + + bhs[0] = sb_bread(sb, blocknr); + if (!bhs[0]) + goto err; + + if ((offset + 1) < sb->s_blocksize) + fatent->nr_bhs = 1; + else { + /* This entry is block boundary, it needs the next block */ + blocknr++; + bhs[1] = sb_bread(sb, blocknr); + if (!bhs[1]) + goto err_brelse; + fatent->nr_bhs = 2; + } + fat12_ent_set_ptr(fatent, offset); + return 0; + +err_brelse: + brelse(bhs[0]); +err: + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); + return -EIO; +} + +static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + + WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); + fatent->fat_inode = MSDOS_SB(sb)->fat_inode; + fatent->bhs[0] = sb_bread(sb, blocknr); + if (!fatent->bhs[0]) { + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); + return -EIO; + } + fatent->nr_bhs = 1; + ops->ent_set_ptr(fatent, offset); + return 0; +} + +static int fat12_ent_get(struct fat_entry *fatent) +{ + u8 **ent12_p = fatent->u.ent12_p; + int next; + + spin_lock(&fat12_entry_lock); + if (fatent->entry & 1) + next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); + else + next = (*ent12_p[1] << 8) | *ent12_p[0]; + spin_unlock(&fat12_entry_lock); + + next &= 0x0fff; + if (next >= BAD_FAT12) + next = FAT_ENT_EOF; + return next; +} + +static int fat16_ent_get(struct fat_entry *fatent) +{ + int next = le16_to_cpu(*fatent->u.ent16_p); + WARN_ON((unsigned long)fatent->u.ent16_p & (2 - 1)); + if (next >= BAD_FAT16) + next = FAT_ENT_EOF; + return next; +} + +static int fat32_ent_get(struct fat_entry *fatent) +{ + int next = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff; + WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1)); + if (next >= BAD_FAT32) + next = FAT_ENT_EOF; + return next; +} + +static void fat12_ent_put(struct fat_entry *fatent, int new) +{ + u8 **ent12_p = fatent->u.ent12_p; + + if (new == FAT_ENT_EOF) + new = EOF_FAT12; + + spin_lock(&fat12_entry_lock); + if (fatent->entry & 1) { + *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); + *ent12_p[1] = new >> 4; + } else { + *ent12_p[0] = new & 0xff; + *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); + } + spin_unlock(&fat12_entry_lock); + + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + if (fatent->nr_bhs == 2) + mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); +} + +static void fat16_ent_put(struct fat_entry *fatent, int new) +{ + if (new == FAT_ENT_EOF) + new = EOF_FAT16; + + *fatent->u.ent16_p = cpu_to_le16(new); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); +} + +static void fat32_ent_put(struct fat_entry *fatent, int new) +{ + WARN_ON(new & 0xf0000000); + new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; + *fatent->u.ent32_p = cpu_to_le32(new); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); +} + +static int fat12_ent_next(struct fat_entry *fatent) +{ + u8 **ent12_p = fatent->u.ent12_p; + struct buffer_head **bhs = fatent->bhs; + u8 *nextp = ent12_p[1] + 1 + (fatent->entry & 1); + + fatent->entry++; + if (fatent->nr_bhs == 1) { + WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + + (bhs[0]->b_size - 2))); + WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + + (bhs[0]->b_size - 1))); + if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) { + ent12_p[0] = nextp - 1; + ent12_p[1] = nextp; + return 1; + } + } else { + WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + + (bhs[0]->b_size - 1))); + WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data); + ent12_p[0] = nextp - 1; + ent12_p[1] = nextp; + brelse(bhs[0]); + bhs[0] = bhs[1]; + fatent->nr_bhs = 1; + return 1; + } + ent12_p[0] = NULL; + ent12_p[1] = NULL; + return 0; +} + +static int fat16_ent_next(struct fat_entry *fatent) +{ + const struct buffer_head *bh = fatent->bhs[0]; + fatent->entry++; + if (fatent->u.ent16_p < (__le16 *)(bh->b_data + (bh->b_size - 2))) { + fatent->u.ent16_p++; + return 1; + } + fatent->u.ent16_p = NULL; + return 0; +} + +static int fat32_ent_next(struct fat_entry *fatent) +{ + const struct buffer_head *bh = fatent->bhs[0]; + fatent->entry++; + if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) { + fatent->u.ent32_p++; + return 1; + } + fatent->u.ent32_p = NULL; + return 0; +} + +static struct fatent_operations fat12_ops = { + .ent_blocknr = fat12_ent_blocknr, + .ent_set_ptr = fat12_ent_set_ptr, + .ent_bread = fat12_ent_bread, + .ent_get = fat12_ent_get, + .ent_put = fat12_ent_put, + .ent_next = fat12_ent_next, +}; + +static struct fatent_operations fat16_ops = { + .ent_blocknr = fat_ent_blocknr, + .ent_set_ptr = fat16_ent_set_ptr, + .ent_bread = fat_ent_bread, + .ent_get = fat16_ent_get, + .ent_put = fat16_ent_put, + .ent_next = fat16_ent_next, +}; + +static struct fatent_operations fat32_ops = { + .ent_blocknr = fat_ent_blocknr, + .ent_set_ptr = fat32_ent_set_ptr, + .ent_bread = fat_ent_bread, + .ent_get = fat32_ent_get, + .ent_put = fat32_ent_put, + .ent_next = fat32_ent_next, +}; + +static inline void lock_fat(struct msdos_sb_info *sbi) +{ + mutex_lock(&sbi->fat_lock); +} + +static inline void unlock_fat(struct msdos_sb_info *sbi) +{ + mutex_unlock(&sbi->fat_lock); +} + +void fat_ent_access_init(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + mutex_init(&sbi->fat_lock); + + switch (sbi->fat_bits) { + case 32: + sbi->fatent_shift = 2; + sbi->fatent_ops = &fat32_ops; + break; + case 16: + sbi->fatent_shift = 1; + sbi->fatent_ops = &fat16_ops; + break; + case 12: + sbi->fatent_shift = -1; + sbi->fatent_ops = &fat12_ops; + break; + } +} + +static void mark_fsinfo_dirty(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + if (sb->s_flags & MS_RDONLY || sbi->fat_bits != 32) + return; + + __mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC); +} + +static inline int fat_ent_update_ptr(struct super_block *sb, + struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct buffer_head **bhs = fatent->bhs; + + /* Is this fatent's blocks including this entry? */ + if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr) + return 0; + if (sbi->fat_bits == 12) { + if ((offset + 1) < sb->s_blocksize) { + /* This entry is on bhs[0]. */ + if (fatent->nr_bhs == 2) { + brelse(bhs[1]); + fatent->nr_bhs = 1; + } + } else { + /* This entry needs the next block. */ + if (fatent->nr_bhs != 2) + return 0; + if (bhs[1]->b_blocknr != (blocknr + 1)) + return 0; + } + } + ops->ent_set_ptr(fatent, offset); + return 1; +} + +int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct fatent_operations *ops = sbi->fatent_ops; + int err, offset; + sector_t blocknr; + + if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { + fatent_brelse(fatent); + fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); + return -EIO; + } + + fatent_set_entry(fatent, entry); + ops->ent_blocknr(sb, entry, &offset, &blocknr); + + if (!fat_ent_update_ptr(sb, fatent, offset, blocknr)) { + fatent_brelse(fatent); + err = ops->ent_bread(sb, fatent, offset, blocknr); + if (err) + return err; + } + return ops->ent_get(fatent); +} + +/* FIXME: We can write the blocks as more big chunk. */ +static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, + int nr_bhs) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *c_bh; + int err, n, copy; + + err = 0; + for (copy = 1; copy < sbi->fats; copy++) { + sector_t backup_fat = sbi->fat_length * copy; + + for (n = 0; n < nr_bhs; n++) { + c_bh = sb_getblk(sb, backup_fat + bhs[n]->b_blocknr); + if (!c_bh) { + err = -ENOMEM; + goto error; + } + memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); + set_buffer_uptodate(c_bh); + mark_buffer_dirty_inode(c_bh, sbi->fat_inode); + if (sb->s_flags & MS_SYNCHRONOUS) + err = sync_dirty_buffer(c_bh); + brelse(c_bh); + if (err) + goto error; + } + } +error: + return err; +} + +int fat_ent_write(struct inode *inode, struct fat_entry *fatent, + int new, int wait) +{ + struct super_block *sb = inode->i_sb; + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + int err; + + ops->ent_put(fatent, new); + if (wait) { + err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs); + if (err) + return err; + } + return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs); +} + +static inline int fat_ent_next(struct msdos_sb_info *sbi, + struct fat_entry *fatent) +{ + if (sbi->fatent_ops->ent_next(fatent)) { + if (fatent->entry < sbi->max_cluster) + return 1; + } + return 0; +} + +static inline int fat_ent_read_block(struct super_block *sb, + struct fat_entry *fatent) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + sector_t blocknr; + int offset; + + fatent_brelse(fatent); + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + return ops->ent_bread(sb, fatent, offset, blocknr); +} + +static void fat_collect_bhs(struct buffer_head **bhs, int *nr_bhs, + struct fat_entry *fatent) +{ + int n, i; + + for (n = 0; n < fatent->nr_bhs; n++) { + for (i = 0; i < *nr_bhs; i++) { + if (fatent->bhs[n] == bhs[i]) + break; + } + if (i == *nr_bhs) { + get_bh(fatent->bhs[n]); + bhs[i] = fatent->bhs[n]; + (*nr_bhs)++; + } + } +} + +int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent, prev_ent; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + int i, count, err, nr_bhs, idx_clus; + + BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2)); /* fixed limit */ + + lock_fat(sbi); + if (sbi->free_clusters != -1 && sbi->free_clus_valid && + sbi->free_clusters < nr_cluster) { + unlock_fat(sbi); + return -ENOSPC; + } + + err = nr_bhs = idx_clus = 0; + count = FAT_START_ENT; + fatent_init(&prev_ent); + fatent_init(&fatent); + fatent_set_entry(&fatent, sbi->prev_free + 1); + while (count < sbi->max_cluster) { + if (fatent.entry >= sbi->max_cluster) + fatent.entry = FAT_START_ENT; + fatent_set_entry(&fatent, fatent.entry); + err = fat_ent_read_block(sb, &fatent); + if (err) + goto out; + + /* Find the free entries in a block */ + do { + if (ops->ent_get(&fatent) == FAT_ENT_FREE) { + int entry = fatent.entry; + + /* make the cluster chain */ + ops->ent_put(&fatent, FAT_ENT_EOF); + if (prev_ent.nr_bhs) + ops->ent_put(&prev_ent, entry); + + fat_collect_bhs(bhs, &nr_bhs, &fatent); + + sbi->prev_free = entry; + if (sbi->free_clusters != -1) + sbi->free_clusters--; + + cluster[idx_clus] = entry; + idx_clus++; + if (idx_clus == nr_cluster) + goto out; + + /* + * fat_collect_bhs() gets ref-count of bhs, + * so we can still use the prev_ent. + */ + prev_ent = fatent; + } + count++; + if (count == sbi->max_cluster) + break; + } while (fat_ent_next(sbi, &fatent)); + } + + /* Couldn't allocate the free entries */ + sbi->free_clusters = 0; + sbi->free_clus_valid = 1; + err = -ENOSPC; + +out: + unlock_fat(sbi); + mark_fsinfo_dirty(sb); + fatent_brelse(&fatent); + if (!err) { + if (inode_needs_sync(inode)) + err = fat_sync_bhs(bhs, nr_bhs); + if (!err) + err = fat_mirror_bhs(sb, bhs, nr_bhs); + } + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + + if (err && idx_clus) + fat_free_clusters(inode, cluster[0]); + + return err; +} + +int fat_free_clusters(struct inode *inode, int cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + int i, err, nr_bhs; + int first_cl = cluster, dirty_fsinfo = 0; + + nr_bhs = 0; + fatent_init(&fatent); + lock_fat(sbi); + do { + cluster = fat_ent_read(inode, &fatent, cluster); + if (cluster < 0) { + err = cluster; + goto error; + } else if (cluster == FAT_ENT_FREE) { + fat_fs_error(sb, "%s: deleting FAT entry beyond EOF", + __func__); + err = -EIO; + goto error; + } + + if (sbi->options.discard) { + /* + * Issue discard for the sectors we no longer + * care about, batching contiguous clusters + * into one request + */ + if (cluster != fatent.entry + 1) { + int nr_clus = fatent.entry - first_cl + 1; + + sb_issue_discard(sb, + fat_clus_to_blknr(sbi, first_cl), + nr_clus * sbi->sec_per_clus, + GFP_NOFS, 0); + + first_cl = cluster; + } + } + + ops->ent_put(&fatent, FAT_ENT_FREE); + if (sbi->free_clusters != -1) { + sbi->free_clusters++; + dirty_fsinfo = 1; + } + + if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { + if (sb->s_flags & MS_SYNCHRONOUS) { + err = fat_sync_bhs(bhs, nr_bhs); + if (err) + goto error; + } + err = fat_mirror_bhs(sb, bhs, nr_bhs); + if (err) + goto error; + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + nr_bhs = 0; + } + fat_collect_bhs(bhs, &nr_bhs, &fatent); + } while (cluster != FAT_ENT_EOF); + + if (sb->s_flags & MS_SYNCHRONOUS) { + err = fat_sync_bhs(bhs, nr_bhs); + if (err) + goto error; + } + err = fat_mirror_bhs(sb, bhs, nr_bhs); +error: + fatent_brelse(&fatent); + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + unlock_fat(sbi); + if (dirty_fsinfo) + mark_fsinfo_dirty(sb); + + return err; +} +EXPORT_SYMBOL_GPL(fat_free_clusters); + +/* 128kb is the whole sectors for FAT12 and FAT16 */ +#define FAT_READA_SIZE (128 * 1024) + +static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, + unsigned long reada_blocks) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + sector_t blocknr; + int i, offset; + + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + + for (i = 0; i < reada_blocks; i++) + sb_breadahead(sb, blocknr + i); +} + +int fat_count_free_clusters(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent; + unsigned long reada_blocks, reada_mask, cur_block; + int err = 0, free; + + lock_fat(sbi); + if (sbi->free_clusters != -1 && sbi->free_clus_valid) + goto out; + + reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; + reada_mask = reada_blocks - 1; + cur_block = 0; + + free = 0; + fatent_init(&fatent); + fatent_set_entry(&fatent, FAT_START_ENT); + while (fatent.entry < sbi->max_cluster) { + /* readahead of fat blocks */ + if ((cur_block & reada_mask) == 0) { + unsigned long rest = sbi->fat_length - cur_block; + fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); + } + cur_block++; + + err = fat_ent_read_block(sb, &fatent); + if (err) + goto out; + + do { + if (ops->ent_get(&fatent) == FAT_ENT_FREE) + free++; + } while (fat_ent_next(sbi, &fatent)); + } + sbi->free_clusters = free; + sbi->free_clus_valid = 1; + mark_fsinfo_dirty(sb); + fatent_brelse(&fatent); +out: + unlock_fat(sbi); + return err; +} diff --git a/kernel/fs/fat/file.c b/kernel/fs/fat/file.c new file mode 100644 index 000000000..442d50a0e --- /dev/null +++ b/kernel/fs/fat/file.c @@ -0,0 +1,459 @@ +/* + * linux/fs/fat/file.c + * + * Written 1992,1993 by Werner Almesberger + * + * regular file handling primitives for fat-based filesystems + */ + +#include +#include +#include +#include +#include +#include +#include +#include "fat.h" + +static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) +{ + u32 attr; + + mutex_lock(&inode->i_mutex); + attr = fat_make_attrs(inode); + mutex_unlock(&inode->i_mutex); + + return put_user(attr, user_attr); +} + +static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) +{ + struct inode *inode = file_inode(file); + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int is_dir = S_ISDIR(inode->i_mode); + u32 attr, oldattr; + struct iattr ia; + int err; + + err = get_user(attr, user_attr); + if (err) + goto out; + + err = mnt_want_write_file(file); + if (err) + goto out; + mutex_lock(&inode->i_mutex); + + /* + * ATTR_VOLUME and ATTR_DIR cannot be changed; this also + * prevents the user from turning us into a VFAT + * longname entry. Also, we obviously can't set + * any of the NTFS attributes in the high 24 bits. + */ + attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR); + /* Merge in ATTR_VOLUME and ATTR_DIR */ + attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) | + (is_dir ? ATTR_DIR : 0); + oldattr = fat_make_attrs(inode); + + /* Equivalent to a chmod() */ + ia.ia_valid = ATTR_MODE | ATTR_CTIME; + ia.ia_ctime = current_fs_time(inode->i_sb); + if (is_dir) + ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); + else { + ia.ia_mode = fat_make_mode(sbi, attr, + S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO)); + } + + /* The root directory has no attributes */ + if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { + err = -EINVAL; + goto out_unlock_inode; + } + + if (sbi->options.sys_immutable && + ((attr | oldattr) & ATTR_SYS) && + !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock_inode; + } + + /* + * The security check is questionable... We single + * out the RO attribute for checking by the security + * module, just because it maps to a file mode. + */ + err = security_inode_setattr(file->f_path.dentry, &ia); + if (err) + goto out_unlock_inode; + + /* This MUST be done before doing anything irreversible... */ + err = fat_setattr(file->f_path.dentry, &ia); + if (err) + goto out_unlock_inode; + + fsnotify_change(file->f_path.dentry, ia.ia_valid); + if (sbi->options.sys_immutable) { + if (attr & ATTR_SYS) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + } + + fat_save_attrs(inode, attr); + mark_inode_dirty(inode); +out_unlock_inode: + mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); +out: + return err; +} + +static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + return put_user(sbi->vol_id, user_attr); +} + +long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 __user *user_attr = (u32 __user *)arg; + + switch (cmd) { + case FAT_IOCTL_GET_ATTRIBUTES: + return fat_ioctl_get_attributes(inode, user_attr); + case FAT_IOCTL_SET_ATTRIBUTES: + return fat_ioctl_set_attributes(filp, user_attr); + case FAT_IOCTL_GET_VOLUME_ID: + return fat_ioctl_get_volume_id(inode, user_attr); + default: + return -ENOTTY; /* Inappropriate ioctl for device */ + } +} + +#ifdef CONFIG_COMPAT +static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) + +{ + return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static int fat_file_release(struct inode *inode, struct file *filp) +{ + if ((filp->f_mode & FMODE_WRITE) && + MSDOS_SB(inode->i_sb)->options.flush) { + fat_flush_inodes(inode->i_sb, inode, NULL); + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + return 0; +} + +int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int res, err; + + res = generic_file_fsync(filp, start, end, datasync); + err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + + return res ? res : err; +} + + +const struct file_operations fat_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .release = fat_file_release, + .unlocked_ioctl = fat_generic_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_generic_compat_ioctl, +#endif + .fsync = fat_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int fat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + loff_t start = inode->i_size, count = size - inode->i_size; + int err; + + err = generic_cont_expand_simple(inode, size); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (IS_SYNC(inode)) { + int err2; + + /* + * Opencode syncing since we don't have a file open to use + * standard fsync path. + */ + err = filemap_fdatawrite_range(mapping, start, + start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) { + err = filemap_fdatawait_range(mapping, start, + start + count - 1); + } + } +out: + return err; +} + +/* Free all clusters after the skip'th cluster. */ +static int fat_free(struct inode *inode, int skip) +{ + struct super_block *sb = inode->i_sb; + int err, wait, free_start, i_start, i_logstart; + + if (MSDOS_I(inode)->i_start == 0) + return 0; + + fat_cache_inval_inode(inode); + + wait = IS_DIRSYNC(inode); + i_start = free_start = MSDOS_I(inode)->i_start; + i_logstart = MSDOS_I(inode)->i_logstart; + + /* First, we write the new file size. */ + if (!skip) { + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + } + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + if (wait) { + err = fat_sync_inode(inode); + if (err) { + MSDOS_I(inode)->i_start = i_start; + MSDOS_I(inode)->i_logstart = i_logstart; + return err; + } + } else + mark_inode_dirty(inode); + + /* Write a new EOF, and get the remaining cluster chain for freeing. */ + if (skip) { + struct fat_entry fatent; + int ret, fclus, dclus; + + ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus); + if (ret < 0) + return ret; + else if (ret == FAT_ENT_EOF) + return 0; + + fatent_init(&fatent); + ret = fat_ent_read(inode, &fatent, dclus); + if (ret == FAT_ENT_EOF) { + fatent_brelse(&fatent); + return 0; + } else if (ret == FAT_ENT_FREE) { + fat_fs_error(sb, + "%s: invalid cluster chain (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); + ret = -EIO; + } else if (ret > 0) { + err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait); + if (err) + ret = err; + } + fatent_brelse(&fatent); + if (ret < 0) + return ret; + + free_start = ret; + } + inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); + + /* Freeing the remained cluster chain */ + return fat_free_clusters(inode, free_start); +} + +void fat_truncate_blocks(struct inode *inode, loff_t offset) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + const unsigned int cluster_size = sbi->cluster_size; + int nr_clusters; + + /* + * This protects against truncating a file bigger than it was then + * trying to write into the hole. + */ + if (MSDOS_I(inode)->mmu_private > offset) + MSDOS_I(inode)->mmu_private = offset; + + nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; + + fat_free(inode, nr_clusters); + fat_flush_inodes(inode->i_sb, inode, NULL); +} + +int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + generic_fillattr(inode, stat); + stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + + if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + /* Use i_pos for ino. This is used as fileid of nfs. */ + stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode); + } + return 0; +} +EXPORT_SYMBOL_GPL(fat_getattr); + +static int fat_sanitize_mode(const struct msdos_sb_info *sbi, + struct inode *inode, umode_t *mode_ptr) +{ + umode_t mask, perm; + + /* + * Note, the basic check is already done by a caller of + * (attr->ia_mode & ~FAT_VALID_MODE) + */ + + if (S_ISREG(inode->i_mode)) + mask = sbi->options.fs_fmask; + else + mask = sbi->options.fs_dmask; + + perm = *mode_ptr & ~(S_IFMT | mask); + + /* + * Of the r and x bits, all (subject to umask) must be present. Of the + * w bits, either all (subject to umask) or none must be present. + * + * If fat_mode_can_hold_ro(inode) is false, can't change w bits. + */ + if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) + return -EPERM; + if (fat_mode_can_hold_ro(inode)) { + if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) + return -EPERM; + } else { + if ((perm & S_IWUGO) != (S_IWUGO & ~mask)) + return -EPERM; + } + + *mode_ptr &= S_IFMT | perm; + + return 0; +} + +static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) +{ + umode_t allow_utime = sbi->options.allow_utime; + + if (!uid_eq(current_fsuid(), inode->i_uid)) { + if (in_group_p(inode->i_gid)) + allow_utime >>= 3; + if (allow_utime & MAY_WRITE) + return 1; + } + + /* use a default check */ + return 0; +} + +#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +/* valid file mode bits */ +#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) + +int fat_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); + struct inode *inode = d_inode(dentry); + unsigned int ia_valid; + int error; + + /* Check for setting the inode time. */ + ia_valid = attr->ia_valid; + if (ia_valid & TIMES_SET_FLAGS) { + if (fat_allow_set_time(sbi, inode)) + attr->ia_valid &= ~TIMES_SET_FLAGS; + } + + error = inode_change_ok(inode, attr); + attr->ia_valid = ia_valid; + if (error) { + if (sbi->options.quiet) + error = 0; + goto out; + } + + /* + * Expand the file. Since inode_setattr() updates ->i_size + * before calling the ->truncate(), but FAT needs to fill the + * hole before it. XXX: this is no longer true with new truncate + * sequence. + */ + if (attr->ia_valid & ATTR_SIZE) { + inode_dio_wait(inode); + + if (attr->ia_size > inode->i_size) { + error = fat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + goto out; + attr->ia_valid &= ~ATTR_SIZE; + } + } + + if (((attr->ia_valid & ATTR_UID) && + (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) || + ((attr->ia_valid & ATTR_GID) && + (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) || + ((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & ~FAT_VALID_MODE))) + error = -EPERM; + + if (error) { + if (sbi->options.quiet) + error = 0; + goto out; + } + + /* + * We don't return -EPERM here. Yes, strange, but this is too + * old behavior. + */ + if (attr->ia_valid & ATTR_MODE) { + if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) + attr->ia_valid &= ~ATTR_MODE; + } + + if (attr->ia_valid & ATTR_SIZE) { + error = fat_block_truncate_page(inode, attr->ia_size); + if (error) + goto out; + down_write(&MSDOS_I(inode)->truncate_lock); + truncate_setsize(inode, attr->ia_size); + fat_truncate_blocks(inode, attr->ia_size); + up_write(&MSDOS_I(inode)->truncate_lock); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); +out: + return error; +} +EXPORT_SYMBOL_GPL(fat_setattr); + +const struct inode_operations fat_file_inode_operations = { + .setattr = fat_setattr, + .getattr = fat_getattr, +}; diff --git a/kernel/fs/fat/inode.c b/kernel/fs/fat/inode.c new file mode 100644 index 000000000..c06774658 --- /dev/null +++ b/kernel/fs/fat/inode.c @@ -0,0 +1,1850 @@ +/* + * linux/fs/fat/inode.c + * + * Written 1992,1993 by Werner Almesberger + * VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner + * Rewritten for the constant inumbers support by Al Viro + * + * Fixes: + * + * Max Cohan: Fixed invalid FSINFO offset when info_sector is 0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fat.h" + +#ifndef CONFIG_FAT_DEFAULT_IOCHARSET +/* if user don't select VFAT, this is undefined. */ +#define CONFIG_FAT_DEFAULT_IOCHARSET "" +#endif + +#define KB_IN_SECTORS 2 + +/* + * A deserialized copy of the on-disk structure laid out in struct + * fat_boot_sector. + */ +struct fat_bios_param_block { + u16 fat_sector_size; + u8 fat_sec_per_clus; + u16 fat_reserved; + u8 fat_fats; + u16 fat_dir_entries; + u16 fat_sectors; + u16 fat_fat_length; + u32 fat_total_sect; + + u8 fat16_state; + u32 fat16_vol_id; + + u32 fat32_length; + u32 fat32_root_cluster; + u16 fat32_info_sector; + u8 fat32_state; + u32 fat32_vol_id; +}; + +static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE; +static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET; + +static struct fat_floppy_defaults { + unsigned nr_sectors; + unsigned sec_per_clus; + unsigned dir_entries; + unsigned media; + unsigned fat_length; +} floppy_defaults[] = { +{ + .nr_sectors = 160 * KB_IN_SECTORS, + .sec_per_clus = 1, + .dir_entries = 64, + .media = 0xFE, + .fat_length = 1, +}, +{ + .nr_sectors = 180 * KB_IN_SECTORS, + .sec_per_clus = 1, + .dir_entries = 64, + .media = 0xFC, + .fat_length = 2, +}, +{ + .nr_sectors = 320 * KB_IN_SECTORS, + .sec_per_clus = 2, + .dir_entries = 112, + .media = 0xFF, + .fat_length = 1, +}, +{ + .nr_sectors = 360 * KB_IN_SECTORS, + .sec_per_clus = 2, + .dir_entries = 112, + .media = 0xFD, + .fat_length = 2, +}, +}; + +static int fat_add_cluster(struct inode *inode) +{ + int err, cluster; + + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) + return err; + /* FIXME: this cluster should be added after data of this + * cluster is writed */ + err = fat_chain_add(inode, cluster, 1); + if (err) + fat_free_clusters(inode, cluster); + return err; +} + +static inline int __fat_get_block(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + unsigned long mapped_blocks; + sector_t phys; + int err, offset; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + if (err) + return err; + if (phys) { + map_bh(bh_result, sb, phys); + *max_blocks = min(mapped_blocks, *max_blocks); + return 0; + } + if (!create) + return 0; + + if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { + fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)", + MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); + return -EIO; + } + + offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); + if (!offset) { + /* TODO: multiple cluster allocation would be desirable. */ + err = fat_add_cluster(inode); + if (err) + return err; + } + /* available blocks on this cluster */ + mapped_blocks = sbi->sec_per_clus - offset; + + *max_blocks = min(mapped_blocks, *max_blocks); + MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + if (err) + return err; + + BUG_ON(!phys); + BUG_ON(*max_blocks != mapped_blocks); + set_buffer_new(bh_result); + map_bh(bh_result, sb, phys); + + return 0; +} + +static int fat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + + err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create); + if (err) + return err; + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + +static int fat_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, fat_get_block, wbc); +} + +static int fat_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, fat_get_block); +} + +static int fat_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, fat_get_block); +} + +static int fat_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, fat_get_block); +} + +static void fat_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + fat_truncate_blocks(inode, inode->i_size); + } +} + +static int fat_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int err; + + *pagep = NULL; + err = cont_write_begin(file, mapping, pos, len, flags, + pagep, fsdata, fat_get_block, + &MSDOS_I(mapping->host)->mmu_private); + if (err < 0) + fat_write_failed(mapping, pos + len); + return err; +} + +static int fat_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + int err; + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + fat_write_failed(mapping, pos + len); + if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + mark_inode_dirty(inode); + } + return err; +} + +static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (iov_iter_rw(iter) == WRITE) { + /* + * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), + * so we need to update the ->mmu_private to block boundary. + * + * But we must fill the remaining area or hole by nul for + * updating ->mmu_private. + * + * Return 0, and fallback to normal buffered write. + */ + loff_t size = offset + count; + if (MSDOS_I(inode)->mmu_private < size) + return 0; + } + + /* + * FAT need to use the DIO_LOCKING for avoiding the race + * condition of fat_get_block() and ->truncate(). + */ + ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block); + if (ret < 0 && iov_iter_rw(iter) == WRITE) + fat_write_failed(mapping, offset + count); + + return ret; +} + +static sector_t _fat_bmap(struct address_space *mapping, sector_t block) +{ + sector_t blocknr; + + /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ + down_read(&MSDOS_I(mapping->host)->truncate_lock); + blocknr = generic_block_bmap(mapping, block, fat_get_block); + up_read(&MSDOS_I(mapping->host)->truncate_lock); + + return blocknr; +} + +/* + * fat_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This is required during truncate to physically zeroout the tail end + * of that block so it doesn't yield old data if the file is later grown. + * Also, avoid causing failure from fsx for cases of "data past EOF" + */ +int fat_block_truncate_page(struct inode *inode, loff_t from) +{ + return block_truncate_page(inode->i_mapping, from, fat_get_block); +} + +static const struct address_space_operations fat_aops = { + .readpage = fat_readpage, + .readpages = fat_readpages, + .writepage = fat_writepage, + .writepages = fat_writepages, + .write_begin = fat_write_begin, + .write_end = fat_write_end, + .direct_IO = fat_direct_IO, + .bmap = _fat_bmap +}; + +/* + * New FAT inode stuff. We do the following: + * a) i_ino is constant and has nothing with on-disk location. + * b) FAT manages its own cache of directory entries. + * c) *This* cache is indexed by on-disk location. + * d) inode has an associated directory entry, all right, but + * it may be unhashed. + * e) currently entries are stored within struct inode. That should + * change. + * f) we deal with races in the following way: + * 1. readdir() and lookup() do FAT-dir-cache lookup. + * 2. rename() unhashes the F-d-c entry and rehashes it in + * a new place. + * 3. unlink() and rmdir() unhash F-d-c entry. + * 4. fat_write_inode() checks whether the thing is unhashed. + * If it is we silently return. If it isn't we do bread(), + * check if the location is still valid and retry if it + * isn't. Otherwise we do changes. + * 5. Spinlock is used to protect hash/unhash/location check/lookup + * 6. fat_evict_inode() unhashes the F-d-c entry. + * 7. lookup() and readdir() do igrab() if they find a F-d-c entry + * and consider negative result as cache miss. + */ + +static void fat_hash_init(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int i; + + spin_lock_init(&sbi->inode_hash_lock); + for (i = 0; i < FAT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); +} + +static inline unsigned long fat_hash(loff_t i_pos) +{ + return hash_32(i_pos, FAT_HASH_BITS); +} + +static void dir_hash_init(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int i; + + spin_lock_init(&sbi->dir_hash_lock); + for (i = 0; i < FAT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&sbi->dir_hashtable[i]); +} + +void fat_attach(struct inode *inode, loff_t i_pos) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + + if (inode->i_ino != MSDOS_ROOT_INO) { + struct hlist_head *head = sbi->inode_hashtable + + fat_hash(i_pos); + + spin_lock(&sbi->inode_hash_lock); + MSDOS_I(inode)->i_pos = i_pos; + hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head); + spin_unlock(&sbi->inode_hash_lock); + } + + /* If NFS support is enabled, cache the mapping of start cluster + * to directory inode. This is used during reconnection of + * dentries to the filesystem root. + */ + if (S_ISDIR(inode->i_mode) && sbi->options.nfs) { + struct hlist_head *d_head = sbi->dir_hashtable; + d_head += fat_dir_hash(MSDOS_I(inode)->i_logstart); + + spin_lock(&sbi->dir_hash_lock); + hlist_add_head(&MSDOS_I(inode)->i_dir_hash, d_head); + spin_unlock(&sbi->dir_hash_lock); + } +} +EXPORT_SYMBOL_GPL(fat_attach); + +void fat_detach(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + spin_lock(&sbi->inode_hash_lock); + MSDOS_I(inode)->i_pos = 0; + hlist_del_init(&MSDOS_I(inode)->i_fat_hash); + spin_unlock(&sbi->inode_hash_lock); + + if (S_ISDIR(inode->i_mode) && sbi->options.nfs) { + spin_lock(&sbi->dir_hash_lock); + hlist_del_init(&MSDOS_I(inode)->i_dir_hash); + spin_unlock(&sbi->dir_hash_lock); + } +} +EXPORT_SYMBOL_GPL(fat_detach); + +struct inode *fat_iget(struct super_block *sb, loff_t i_pos) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); + struct msdos_inode_info *i; + struct inode *inode = NULL; + + spin_lock(&sbi->inode_hash_lock); + hlist_for_each_entry(i, head, i_fat_hash) { + BUG_ON(i->vfs_inode.i_sb != sb); + if (i->i_pos != i_pos) + continue; + inode = igrab(&i->vfs_inode); + if (inode) + break; + } + spin_unlock(&sbi->inode_hash_lock); + return inode; +} + +static int is_exec(unsigned char *extension) +{ + unsigned char exe_extensions[] = "EXECOMBAT", *walk; + + for (walk = exe_extensions; *walk; walk += 3) + if (!strncmp(extension, walk, 3)) + return 1; + return 0; +} + +static int fat_calc_dir_size(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int ret, fclus, dclus; + + inode->i_size = 0; + if (MSDOS_I(inode)->i_start == 0) + return 0; + + ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); + if (ret < 0) + return ret; + inode->i_size = (fclus + 1) << sbi->cluster_bits; + + return 0; +} + +/* doesn't deal with root inode */ +int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int error; + + MSDOS_I(inode)->i_pos = 0; + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode->i_version++; + inode->i_generation = get_seconds(); + + if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { + inode->i_generation &= ~1; + inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO); + inode->i_op = sbi->dir_ops; + inode->i_fop = &fat_dir_operations; + + MSDOS_I(inode)->i_start = fat_get_start(sbi, de); + MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; + error = fat_calc_dir_size(inode); + if (error < 0) + return error; + MSDOS_I(inode)->mmu_private = inode->i_size; + + set_nlink(inode, fat_subdirs(inode)); + } else { /* not a directory */ + inode->i_generation |= 1; + inode->i_mode = fat_make_mode(sbi, de->attr, + ((sbi->options.showexec && !is_exec(de->name + 8)) + ? S_IRUGO|S_IWUGO : S_IRWXUGO)); + MSDOS_I(inode)->i_start = fat_get_start(sbi, de); + + MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; + inode->i_size = le32_to_cpu(de->size); + inode->i_op = &fat_file_inode_operations; + inode->i_fop = &fat_file_operations; + inode->i_mapping->a_ops = &fat_aops; + MSDOS_I(inode)->mmu_private = inode->i_size; + } + if (de->attr & ATTR_SYS) { + if (sbi->options.sys_immutable) + inode->i_flags |= S_IMMUTABLE; + } + fat_save_attrs(inode, de->attr); + + inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) + & ~((loff_t)sbi->cluster_size - 1)) >> 9; + + fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); + if (sbi->options.isvfat) { + fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, + de->cdate, de->ctime_cs); + fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + } else + inode->i_ctime = inode->i_atime = inode->i_mtime; + + return 0; +} + +static inline void fat_lock_build_inode(struct msdos_sb_info *sbi) +{ + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) + mutex_lock(&sbi->nfs_build_inode_lock); +} + +static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi) +{ + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) + mutex_unlock(&sbi->nfs_build_inode_lock); +} + +struct inode *fat_build_inode(struct super_block *sb, + struct msdos_dir_entry *de, loff_t i_pos) +{ + struct inode *inode; + int err; + + fat_lock_build_inode(MSDOS_SB(sb)); + inode = fat_iget(sb, i_pos); + if (inode) + goto out; + inode = new_inode(sb); + if (!inode) { + inode = ERR_PTR(-ENOMEM); + goto out; + } + inode->i_ino = iunique(sb, MSDOS_ROOT_INO); + inode->i_version = 1; + err = fat_fill_inode(inode, de); + if (err) { + iput(inode); + inode = ERR_PTR(err); + goto out; + } + fat_attach(inode, i_pos); + insert_inode_hash(inode); +out: + fat_unlock_build_inode(MSDOS_SB(sb)); + return inode; +} + +EXPORT_SYMBOL_GPL(fat_build_inode); + +static void fat_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + if (!inode->i_nlink) { + inode->i_size = 0; + fat_truncate_blocks(inode, 0); + } + invalidate_inode_buffers(inode); + clear_inode(inode); + fat_cache_inval_inode(inode); + fat_detach(inode); +} + +static void fat_set_state(struct super_block *sb, + unsigned int set, unsigned int force) +{ + struct buffer_head *bh; + struct fat_boot_sector *b; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + /* do not change any thing if mounted read only */ + if ((sb->s_flags & MS_RDONLY) && !force) + return; + + /* do not change state if fs was dirty */ + if (sbi->dirty) { + /* warn only on set (mount). */ + if (set) + fat_msg(sb, KERN_WARNING, "Volume was not properly " + "unmounted. Some data may be corrupt. " + "Please run fsck."); + return; + } + + bh = sb_bread(sb, 0); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector " + "to mark fs as dirty"); + return; + } + + b = (struct fat_boot_sector *) bh->b_data; + + if (sbi->fat_bits == 32) { + if (set) + b->fat32.state |= FAT_STATE_DIRTY; + else + b->fat32.state &= ~FAT_STATE_DIRTY; + } else /* fat 16 and 12 */ { + if (set) + b->fat16.state |= FAT_STATE_DIRTY; + else + b->fat16.state &= ~FAT_STATE_DIRTY; + } + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); +} + +static void delayed_free(struct rcu_head *p) +{ + struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu); + unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + if (sbi->options.iocharset != fat_default_iocharset) + kfree(sbi->options.iocharset); + kfree(sbi); +} + +static void fat_put_super(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + fat_set_state(sb, 0, 0); + + iput(sbi->fsinfo_inode); + iput(sbi->fat_inode); + + call_rcu(&sbi->rcu, delayed_free); +} + +static struct kmem_cache *fat_inode_cachep; + +static struct inode *fat_alloc_inode(struct super_block *sb) +{ + struct msdos_inode_info *ei; + ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + init_rwsem(&ei->truncate_lock); + return &ei->vfs_inode; +} + +static void fat_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); +} + +static void fat_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, fat_i_callback); +} + +static void init_once(void *foo) +{ + struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; + + spin_lock_init(&ei->cache_lru_lock); + ei->nr_caches = 0; + ei->cache_valid_id = FAT_CACHE_VALID + 1; + INIT_LIST_HEAD(&ei->cache_lru); + INIT_HLIST_NODE(&ei->i_fat_hash); + INIT_HLIST_NODE(&ei->i_dir_hash); + inode_init_once(&ei->vfs_inode); +} + +static int __init fat_init_inodecache(void) +{ + fat_inode_cachep = kmem_cache_create("fat_inode_cache", + sizeof(struct msdos_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (fat_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void __exit fat_destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(fat_inode_cachep); +} + +static int fat_remount(struct super_block *sb, int *flags, char *data) +{ + int new_rdonly; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + + sync_filesystem(sb); + + /* make sure we update state on remount. */ + new_rdonly = *flags & MS_RDONLY; + if (new_rdonly != (sb->s_flags & MS_RDONLY)) { + if (new_rdonly) + fat_set_state(sb, 0, 0); + else + fat_set_state(sb, 1, 1); + } + return 0; +} + +static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + /* If the count of free cluster is still unknown, counts it here. */ + if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { + int err = fat_count_free_clusters(dentry->d_sb); + if (err) + return err; + } + + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = sbi->cluster_size; + buf->f_blocks = sbi->max_cluster - FAT_START_ENT; + buf->f_bfree = sbi->free_clusters; + buf->f_bavail = sbi->free_clusters; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = + (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; + + return 0; +} + +static int __fat_write_inode(struct inode *inode, int wait) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct msdos_dir_entry *raw_entry; + loff_t i_pos; + sector_t blocknr; + int err, offset; + + if (inode->i_ino == MSDOS_ROOT_INO) + return 0; + +retry: + i_pos = fat_i_pos_read(sbi, inode); + if (!i_pos) + return 0; + + fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset); + bh = sb_bread(sb, blocknr); + if (!bh) { + fat_msg(sb, KERN_ERR, "unable to read inode block " + "for updating (i_pos %lld)", i_pos); + return -EIO; + } + spin_lock(&sbi->inode_hash_lock); + if (i_pos != MSDOS_I(inode)->i_pos) { + spin_unlock(&sbi->inode_hash_lock); + brelse(bh); + goto retry; + } + + raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset]; + if (S_ISDIR(inode->i_mode)) + raw_entry->size = 0; + else + raw_entry->size = cpu_to_le32(inode->i_size); + raw_entry->attr = fat_make_attrs(inode); + fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); + fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, + &raw_entry->date, NULL); + if (sbi->options.isvfat) { + __le16 atime; + fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, + &raw_entry->cdate, &raw_entry->ctime_cs); + fat_time_unix2fat(sbi, &inode->i_atime, &atime, + &raw_entry->adate, NULL); + } + spin_unlock(&sbi->inode_hash_lock); + mark_buffer_dirty(bh); + err = 0; + if (wait) + err = sync_dirty_buffer(bh); + brelse(bh); + return err; +} + +static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err; + + if (inode->i_ino == MSDOS_FSINFO_INO) { + struct super_block *sb = inode->i_sb; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + err = fat_clusters_flush(sb); + mutex_unlock(&MSDOS_SB(sb)->s_lock); + } else + err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); + + return err; +} + +int fat_sync_inode(struct inode *inode) +{ + return __fat_write_inode(inode, 1); +} + +EXPORT_SYMBOL_GPL(fat_sync_inode); + +static int fat_show_options(struct seq_file *m, struct dentry *root); +static const struct super_operations fat_sops = { + .alloc_inode = fat_alloc_inode, + .destroy_inode = fat_destroy_inode, + .write_inode = fat_write_inode, + .evict_inode = fat_evict_inode, + .put_super = fat_put_super, + .statfs = fat_statfs, + .remount_fs = fat_remount, + + .show_options = fat_show_options, +}; + +static int fat_show_options(struct seq_file *m, struct dentry *root) +{ + struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb); + struct fat_mount_options *opts = &sbi->options; + int isvfat = opts->isvfat; + + if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->fs_uid)); + if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->fs_gid)); + seq_printf(m, ",fmask=%04o", opts->fs_fmask); + seq_printf(m, ",dmask=%04o", opts->fs_dmask); + if (opts->allow_utime) + seq_printf(m, ",allow_utime=%04o", opts->allow_utime); + if (sbi->nls_disk) + /* strip "cp" prefix from displayed option */ + seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]); + if (isvfat) { + if (sbi->nls_io) + seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); + + switch (opts->shortname) { + case VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=win95"); + break; + case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT: + seq_puts(m, ",shortname=winnt"); + break; + case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=mixed"); + break; + case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=lower"); + break; + default: + seq_puts(m, ",shortname=unknown"); + break; + } + } + if (opts->name_check != 'n') + seq_printf(m, ",check=%c", opts->name_check); + if (opts->usefree) + seq_puts(m, ",usefree"); + if (opts->quiet) + seq_puts(m, ",quiet"); + if (opts->showexec) + seq_puts(m, ",showexec"); + if (opts->sys_immutable) + seq_puts(m, ",sys_immutable"); + if (!isvfat) { + if (opts->dotsOK) + seq_puts(m, ",dotsOK=yes"); + if (opts->nocase) + seq_puts(m, ",nocase"); + } else { + if (opts->utf8) + seq_puts(m, ",utf8"); + if (opts->unicode_xlate) + seq_puts(m, ",uni_xlate"); + if (!opts->numtail) + seq_puts(m, ",nonumtail"); + if (opts->rodir) + seq_puts(m, ",rodir"); + } + if (opts->flush) + seq_puts(m, ",flush"); + if (opts->tz_set) { + if (opts->time_offset) + seq_printf(m, ",time_offset=%d", opts->time_offset); + else + seq_puts(m, ",tz=UTC"); + } + if (opts->errors == FAT_ERRORS_CONT) + seq_puts(m, ",errors=continue"); + else if (opts->errors == FAT_ERRORS_PANIC) + seq_puts(m, ",errors=panic"); + else + seq_puts(m, ",errors=remount-ro"); + if (opts->nfs == FAT_NFS_NOSTALE_RO) + seq_puts(m, ",nfs=nostale_ro"); + else if (opts->nfs) + seq_puts(m, ",nfs=stale_rw"); + if (opts->discard) + seq_puts(m, ",discard"); + if (opts->dos1xfloppy) + seq_puts(m, ",dos1xfloppy"); + + return 0; +} + +enum { + Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid, + Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage, + Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug, + Opt_immutable, Opt_dots, Opt_nodots, + Opt_charset, Opt_shortname_lower, Opt_shortname_win95, + Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, + Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, + Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, + Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy, +}; + +static const match_table_t fat_tokens = { + {Opt_check_r, "check=relaxed"}, + {Opt_check_s, "check=strict"}, + {Opt_check_n, "check=normal"}, + {Opt_check_r, "check=r"}, + {Opt_check_s, "check=s"}, + {Opt_check_n, "check=n"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_dmask, "dmask=%o"}, + {Opt_fmask, "fmask=%o"}, + {Opt_allow_utime, "allow_utime=%o"}, + {Opt_codepage, "codepage=%u"}, + {Opt_usefree, "usefree"}, + {Opt_nocase, "nocase"}, + {Opt_quiet, "quiet"}, + {Opt_showexec, "showexec"}, + {Opt_debug, "debug"}, + {Opt_immutable, "sys_immutable"}, + {Opt_flush, "flush"}, + {Opt_tz_utc, "tz=UTC"}, + {Opt_time_offset, "time_offset=%d"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_discard, "discard"}, + {Opt_nfs_stale_rw, "nfs"}, + {Opt_nfs_stale_rw, "nfs=stale_rw"}, + {Opt_nfs_nostale_ro, "nfs=nostale_ro"}, + {Opt_dos1xfloppy, "dos1xfloppy"}, + {Opt_obsolete, "conv=binary"}, + {Opt_obsolete, "conv=text"}, + {Opt_obsolete, "conv=auto"}, + {Opt_obsolete, "conv=b"}, + {Opt_obsolete, "conv=t"}, + {Opt_obsolete, "conv=a"}, + {Opt_obsolete, "fat=%u"}, + {Opt_obsolete, "blocksize=%u"}, + {Opt_obsolete, "cvf_format=%20s"}, + {Opt_obsolete, "cvf_options=%100s"}, + {Opt_obsolete, "posix"}, + {Opt_err, NULL}, +}; +static const match_table_t msdos_tokens = { + {Opt_nodots, "nodots"}, + {Opt_nodots, "dotsOK=no"}, + {Opt_dots, "dots"}, + {Opt_dots, "dotsOK=yes"}, + {Opt_err, NULL} +}; +static const match_table_t vfat_tokens = { + {Opt_charset, "iocharset=%s"}, + {Opt_shortname_lower, "shortname=lower"}, + {Opt_shortname_win95, "shortname=win95"}, + {Opt_shortname_winnt, "shortname=winnt"}, + {Opt_shortname_mixed, "shortname=mixed"}, + {Opt_utf8_no, "utf8=0"}, /* 0 or no or false */ + {Opt_utf8_no, "utf8=no"}, + {Opt_utf8_no, "utf8=false"}, + {Opt_utf8_yes, "utf8=1"}, /* empty or 1 or yes or true */ + {Opt_utf8_yes, "utf8=yes"}, + {Opt_utf8_yes, "utf8=true"}, + {Opt_utf8_yes, "utf8"}, + {Opt_uni_xl_no, "uni_xlate=0"}, /* 0 or no or false */ + {Opt_uni_xl_no, "uni_xlate=no"}, + {Opt_uni_xl_no, "uni_xlate=false"}, + {Opt_uni_xl_yes, "uni_xlate=1"}, /* empty or 1 or yes or true */ + {Opt_uni_xl_yes, "uni_xlate=yes"}, + {Opt_uni_xl_yes, "uni_xlate=true"}, + {Opt_uni_xl_yes, "uni_xlate"}, + {Opt_nonumtail_no, "nonumtail=0"}, /* 0 or no or false */ + {Opt_nonumtail_no, "nonumtail=no"}, + {Opt_nonumtail_no, "nonumtail=false"}, + {Opt_nonumtail_yes, "nonumtail=1"}, /* empty or 1 or yes or true */ + {Opt_nonumtail_yes, "nonumtail=yes"}, + {Opt_nonumtail_yes, "nonumtail=true"}, + {Opt_nonumtail_yes, "nonumtail"}, + {Opt_rodir, "rodir"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options, int is_vfat, + int silent, int *debug, struct fat_mount_options *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *iocharset; + + opts->isvfat = is_vfat; + + opts->fs_uid = current_uid(); + opts->fs_gid = current_gid(); + opts->fs_fmask = opts->fs_dmask = current_umask(); + opts->allow_utime = -1; + opts->codepage = fat_default_codepage; + opts->iocharset = fat_default_iocharset; + if (is_vfat) { + opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; + opts->rodir = 0; + } else { + opts->shortname = 0; + opts->rodir = 1; + } + opts->name_check = 'n'; + opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; + opts->utf8 = opts->unicode_xlate = 0; + opts->numtail = 1; + opts->usefree = opts->nocase = 0; + opts->tz_set = 0; + opts->nfs = 0; + opts->errors = FAT_ERRORS_RO; + *debug = 0; + + if (!options) + goto out; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, fat_tokens, args); + if (token == Opt_err) { + if (is_vfat) + token = match_token(p, vfat_tokens, args); + else + token = match_token(p, msdos_tokens, args); + } + switch (token) { + case Opt_check_s: + opts->name_check = 's'; + break; + case Opt_check_r: + opts->name_check = 'r'; + break; + case Opt_check_n: + opts->name_check = 'n'; + break; + case Opt_usefree: + opts->usefree = 1; + break; + case Opt_nocase: + if (!is_vfat) + opts->nocase = 1; + else { + /* for backward compatibility */ + opts->shortname = VFAT_SFN_DISPLAY_WIN95 + | VFAT_SFN_CREATE_WIN95; + } + break; + case Opt_quiet: + opts->quiet = 1; + break; + case Opt_showexec: + opts->showexec = 1; + break; + case Opt_debug: + *debug = 1; + break; + case Opt_immutable: + opts->sys_immutable = 1; + break; + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + opts->fs_uid = make_kuid(current_user_ns(), option); + if (!uid_valid(opts->fs_uid)) + return -EINVAL; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return -EINVAL; + opts->fs_gid = make_kgid(current_user_ns(), option); + if (!gid_valid(opts->fs_gid)) + return -EINVAL; + break; + case Opt_umask: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->fs_fmask = opts->fs_dmask = option; + break; + case Opt_dmask: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->fs_dmask = option; + break; + case Opt_fmask: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->fs_fmask = option; + break; + case Opt_allow_utime: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->allow_utime = option & (S_IWGRP | S_IWOTH); + break; + case Opt_codepage: + if (match_int(&args[0], &option)) + return -EINVAL; + opts->codepage = option; + break; + case Opt_flush: + opts->flush = 1; + break; + case Opt_time_offset: + if (match_int(&args[0], &option)) + return -EINVAL; + if (option < -12 * 60 || option > 12 * 60) + return -EINVAL; + opts->tz_set = 1; + opts->time_offset = option; + break; + case Opt_tz_utc: + opts->tz_set = 1; + opts->time_offset = 0; + break; + case Opt_err_cont: + opts->errors = FAT_ERRORS_CONT; + break; + case Opt_err_panic: + opts->errors = FAT_ERRORS_PANIC; + break; + case Opt_err_ro: + opts->errors = FAT_ERRORS_RO; + break; + case Opt_nfs_stale_rw: + opts->nfs = FAT_NFS_STALE_RW; + break; + case Opt_nfs_nostale_ro: + opts->nfs = FAT_NFS_NOSTALE_RO; + break; + case Opt_dos1xfloppy: + opts->dos1xfloppy = 1; + break; + + /* msdos specific */ + case Opt_dots: + opts->dotsOK = 1; + break; + case Opt_nodots: + opts->dotsOK = 0; + break; + + /* vfat specific */ + case Opt_charset: + if (opts->iocharset != fat_default_iocharset) + kfree(opts->iocharset); + iocharset = match_strdup(&args[0]); + if (!iocharset) + return -ENOMEM; + opts->iocharset = iocharset; + break; + case Opt_shortname_lower: + opts->shortname = VFAT_SFN_DISPLAY_LOWER + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_shortname_win95: + opts->shortname = VFAT_SFN_DISPLAY_WIN95 + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_shortname_winnt: + opts->shortname = VFAT_SFN_DISPLAY_WINNT + | VFAT_SFN_CREATE_WINNT; + break; + case Opt_shortname_mixed: + opts->shortname = VFAT_SFN_DISPLAY_WINNT + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_utf8_no: /* 0 or no or false */ + opts->utf8 = 0; + break; + case Opt_utf8_yes: /* empty or 1 or yes or true */ + opts->utf8 = 1; + break; + case Opt_uni_xl_no: /* 0 or no or false */ + opts->unicode_xlate = 0; + break; + case Opt_uni_xl_yes: /* empty or 1 or yes or true */ + opts->unicode_xlate = 1; + break; + case Opt_nonumtail_no: /* 0 or no or false */ + opts->numtail = 1; /* negated option */ + break; + case Opt_nonumtail_yes: /* empty or 1 or yes or true */ + opts->numtail = 0; /* negated option */ + break; + case Opt_rodir: + opts->rodir = 1; + break; + case Opt_discard: + opts->discard = 1; + break; + + /* obsolete mount options */ + case Opt_obsolete: + fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " + "not supported now", p); + break; + /* unknown option */ + default: + if (!silent) { + fat_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); + } + return -EINVAL; + } + } + +out: + /* UTF-8 doesn't provide FAT semantics */ + if (!strcmp(opts->iocharset, "utf8")) { + fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset" + " for FAT filesystems, filesystem will be " + "case sensitive!"); + } + + /* If user doesn't specify allow_utime, it's initialized from dmask. */ + if (opts->allow_utime == (unsigned short)-1) + opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); + if (opts->unicode_xlate) + opts->utf8 = 0; + if (opts->nfs == FAT_NFS_NOSTALE_RO) { + sb->s_flags |= MS_RDONLY; + sb->s_export_op = &fat_export_ops_nostale; + } + + return 0; +} + +static int fat_read_root(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int error; + + MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO; + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode->i_version++; + inode->i_generation = 0; + inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO); + inode->i_op = sbi->dir_ops; + inode->i_fop = &fat_dir_operations; + if (sbi->fat_bits == 32) { + MSDOS_I(inode)->i_start = sbi->root_cluster; + error = fat_calc_dir_size(inode); + if (error < 0) + return error; + } else { + MSDOS_I(inode)->i_start = 0; + inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry); + } + inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) + & ~((loff_t)sbi->cluster_size - 1)) >> 9; + MSDOS_I(inode)->i_logstart = 0; + MSDOS_I(inode)->mmu_private = inode->i_size; + + fat_save_attrs(inode, ATTR_DIR); + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; + inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; + set_nlink(inode, fat_subdirs(inode)+2); + + return 0; +} + +static unsigned long calc_fat_clusters(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + /* Divide first to avoid overflow */ + if (sbi->fat_bits != 12) { + unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits; + return ent_per_sec * sbi->fat_length; + } + + return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; +} + +static bool fat_bpb_is_zero(struct fat_boot_sector *b) +{ + if (get_unaligned_le16(&b->sector_size)) + return false; + if (b->sec_per_clus) + return false; + if (b->reserved) + return false; + if (b->fats) + return false; + if (get_unaligned_le16(&b->dir_entries)) + return false; + if (get_unaligned_le16(&b->sectors)) + return false; + if (b->media) + return false; + if (b->fat_length) + return false; + if (b->secs_track) + return false; + if (b->heads) + return false; + return true; +} + +static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b, + int silent, struct fat_bios_param_block *bpb) +{ + int error = -EINVAL; + + /* Read in BPB ... */ + memset(bpb, 0, sizeof(*bpb)); + bpb->fat_sector_size = get_unaligned_le16(&b->sector_size); + bpb->fat_sec_per_clus = b->sec_per_clus; + bpb->fat_reserved = le16_to_cpu(b->reserved); + bpb->fat_fats = b->fats; + bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries); + bpb->fat_sectors = get_unaligned_le16(&b->sectors); + bpb->fat_fat_length = le16_to_cpu(b->fat_length); + bpb->fat_total_sect = le32_to_cpu(b->total_sect); + + bpb->fat16_state = b->fat16.state; + bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id); + + bpb->fat32_length = le32_to_cpu(b->fat32.length); + bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster); + bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector); + bpb->fat32_state = b->fat32.state; + bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id); + + /* Validate this looks like a FAT filesystem BPB */ + if (!bpb->fat_reserved) { + if (!silent) + fat_msg(sb, KERN_ERR, + "bogus number of reserved sectors"); + goto out; + } + if (!bpb->fat_fats) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); + goto out; + } + + /* + * Earlier we checked here that b->secs_track and b->head are nonzero, + * but it turns out valid FAT filesystems can have zero there. + */ + + if (!fat_valid_media(b->media)) { + if (!silent) + fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", + (unsigned)b->media); + goto out; + } + + if (!is_power_of_2(bpb->fat_sector_size) + || (bpb->fat_sector_size < 512) + || (bpb->fat_sector_size > 4096)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus logical sector size %u", + (unsigned)bpb->fat_sector_size); + goto out; + } + + if (!is_power_of_2(bpb->fat_sec_per_clus)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", + (unsigned)bpb->fat_sec_per_clus); + goto out; + } + + error = 0; + +out: + return error; +} + +static int fat_read_static_bpb(struct super_block *sb, + struct fat_boot_sector *b, int silent, + struct fat_bios_param_block *bpb) +{ + static const char *notdos1x = "This doesn't look like a DOS 1.x volume"; + + struct fat_floppy_defaults *fdefaults = NULL; + int error = -EINVAL; + sector_t bd_sects; + unsigned i; + + bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE; + + /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */ + if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) { + if (!silent) + fat_msg(sb, KERN_ERR, + "%s; no bootstrapping code", notdos1x); + goto out; + } + + /* + * If any value in this region is non-zero, it isn't archaic + * DOS. + */ + if (!fat_bpb_is_zero(b)) { + if (!silent) + fat_msg(sb, KERN_ERR, + "%s; DOS 2.x BPB is non-zero", notdos1x); + goto out; + } + + for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) { + if (floppy_defaults[i].nr_sectors == bd_sects) { + fdefaults = &floppy_defaults[i]; + break; + } + } + + if (fdefaults == NULL) { + if (!silent) + fat_msg(sb, KERN_WARNING, + "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)", + (u64)bd_sects); + goto out; + } + + if (!silent) + fat_msg(sb, KERN_INFO, + "This looks like a DOS 1.x volume; assuming default BPB values"); + + memset(bpb, 0, sizeof(*bpb)); + bpb->fat_sector_size = SECTOR_SIZE; + bpb->fat_sec_per_clus = fdefaults->sec_per_clus; + bpb->fat_reserved = 1; + bpb->fat_fats = 2; + bpb->fat_dir_entries = fdefaults->dir_entries; + bpb->fat_sectors = fdefaults->nr_sectors; + bpb->fat_fat_length = fdefaults->fat_length; + + error = 0; + +out: + return error; +} + +/* + * Read the super block of an MS-DOS FS. + */ +int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, + void (*setup)(struct super_block *)) +{ + struct inode *root_inode = NULL, *fat_inode = NULL; + struct inode *fsinfo_inode = NULL; + struct buffer_head *bh; + struct fat_bios_param_block bpb; + struct msdos_sb_info *sbi; + u16 logical_sector_size; + u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors; + int debug; + long error; + char buf[50]; + + /* + * GFP_KERNEL is ok here, because while we do hold the + * supeblock lock, memory pressure can't call back into + * the filesystem, since we're only just about to mount + * it and have no inodes etc active! + */ + sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + sb->s_flags |= MS_NODIRATIME; + sb->s_magic = MSDOS_SUPER_MAGIC; + sb->s_op = &fat_sops; + sb->s_export_op = &fat_export_ops; + mutex_init(&sbi->nfs_build_inode_lock); + ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options); + if (error) + goto out_fail; + + setup(sb); /* flavour-specific stuff that needs options */ + + error = -EIO; + sb_min_blocksize(sb, 512); + bh = sb_bread(sb, 0); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector"); + goto out_fail; + } + + error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent, + &bpb); + if (error == -EINVAL && sbi->options.dos1xfloppy) + error = fat_read_static_bpb(sb, + (struct fat_boot_sector *)bh->b_data, silent, &bpb); + brelse(bh); + + if (error == -EINVAL) + goto out_invalid; + else if (error) + goto out_fail; + + logical_sector_size = bpb.fat_sector_size; + sbi->sec_per_clus = bpb.fat_sec_per_clus; + + error = -EIO; + if (logical_sector_size < sb->s_blocksize) { + fat_msg(sb, KERN_ERR, "logical sector size too small for device" + " (logical sector size = %u)", logical_sector_size); + goto out_fail; + } + + if (logical_sector_size > sb->s_blocksize) { + struct buffer_head *bh_resize; + + if (!sb_set_blocksize(sb, logical_sector_size)) { + fat_msg(sb, KERN_ERR, "unable to set blocksize %u", + logical_sector_size); + goto out_fail; + } + + /* Verify that the larger boot sector is fully readable */ + bh_resize = sb_bread(sb, 0); + if (bh_resize == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector" + " (logical sector size = %lu)", + sb->s_blocksize); + goto out_fail; + } + brelse(bh_resize); + } + + mutex_init(&sbi->s_lock); + sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; + sbi->cluster_bits = ffs(sbi->cluster_size) - 1; + sbi->fats = bpb.fat_fats; + sbi->fat_bits = 0; /* Don't know yet */ + sbi->fat_start = bpb.fat_reserved; + sbi->fat_length = bpb.fat_fat_length; + sbi->root_cluster = 0; + sbi->free_clusters = -1; /* Don't know yet */ + sbi->free_clus_valid = 0; + sbi->prev_free = FAT_START_ENT; + sb->s_maxbytes = 0xffffffff; + + if (!sbi->fat_length && bpb.fat32_length) { + struct fat_boot_fsinfo *fsinfo; + struct buffer_head *fsinfo_bh; + + /* Must be FAT32 */ + sbi->fat_bits = 32; + sbi->fat_length = bpb.fat32_length; + sbi->root_cluster = bpb.fat32_root_cluster; + + /* MC - if info_sector is 0, don't multiply by 0 */ + sbi->fsinfo_sector = bpb.fat32_info_sector; + if (sbi->fsinfo_sector == 0) + sbi->fsinfo_sector = 1; + + fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); + if (fsinfo_bh == NULL) { + fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" + " (sector = %lu)", sbi->fsinfo_sector); + goto out_fail; + } + + fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; + if (!IS_FSINFO(fsinfo)) { + fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", + le32_to_cpu(fsinfo->signature1), + le32_to_cpu(fsinfo->signature2), + sbi->fsinfo_sector); + } else { + if (sbi->options.usefree) + sbi->free_clus_valid = 1; + sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters); + sbi->prev_free = le32_to_cpu(fsinfo->next_cluster); + } + + brelse(fsinfo_bh); + } + + /* interpret volume ID as a little endian 32 bit integer */ + if (sbi->fat_bits == 32) + sbi->vol_id = bpb.fat32_vol_id; + else /* fat 16 or 12 */ + sbi->vol_id = bpb.fat16_vol_id; + + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); + sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; + + sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; + sbi->dir_entries = bpb.fat_dir_entries; + if (sbi->dir_entries & (sbi->dir_per_block - 1)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus directory-entries per block" + " (%u)", sbi->dir_entries); + goto out_invalid; + } + + rootdir_sectors = sbi->dir_entries + * sizeof(struct msdos_dir_entry) / sb->s_blocksize; + sbi->data_start = sbi->dir_start + rootdir_sectors; + total_sectors = bpb.fat_sectors; + if (total_sectors == 0) + total_sectors = bpb.fat_total_sect; + + total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus; + + if (sbi->fat_bits != 32) + sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; + + /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */ + if (sbi->fat_bits == 32) + sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY; + else /* fat 16 or 12 */ + sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY; + + /* check that FAT table does not overflow */ + fat_clusters = calc_fat_clusters(sb); + total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); + if (total_clusters > MAX_FAT(sb)) { + if (!silent) + fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", + total_clusters); + goto out_invalid; + } + + sbi->max_cluster = total_clusters + FAT_START_ENT; + /* check the free_clusters, it's not necessarily correct */ + if (sbi->free_clusters != -1 && sbi->free_clusters > total_clusters) + sbi->free_clusters = -1; + /* check the prev_free, it's not necessarily correct */ + sbi->prev_free %= sbi->max_cluster; + if (sbi->prev_free < FAT_START_ENT) + sbi->prev_free = FAT_START_ENT; + + /* set up enough so that it can read an inode */ + fat_hash_init(sb); + dir_hash_init(sb); + fat_ent_access_init(sb); + + /* + * The low byte of FAT's first entry must have same value with + * media-field. But in real world, too many devices is + * writing wrong value. So, removed that validity check. + * + * if (FAT_FIRST_ENT(sb, media) != first) + */ + + error = -EINVAL; + sprintf(buf, "cp%d", sbi->options.codepage); + sbi->nls_disk = load_nls(buf); + if (!sbi->nls_disk) { + fat_msg(sb, KERN_ERR, "codepage %s not found", buf); + goto out_fail; + } + + /* FIXME: utf8 is using iocharset for upper/lower conversion */ + if (sbi->options.isvfat) { + sbi->nls_io = load_nls(sbi->options.iocharset); + if (!sbi->nls_io) { + fat_msg(sb, KERN_ERR, "IO charset %s not found", + sbi->options.iocharset); + goto out_fail; + } + } + + error = -ENOMEM; + fat_inode = new_inode(sb); + if (!fat_inode) + goto out_fail; + MSDOS_I(fat_inode)->i_pos = 0; + sbi->fat_inode = fat_inode; + + fsinfo_inode = new_inode(sb); + if (!fsinfo_inode) + goto out_fail; + fsinfo_inode->i_ino = MSDOS_FSINFO_INO; + sbi->fsinfo_inode = fsinfo_inode; + insert_inode_hash(fsinfo_inode); + + root_inode = new_inode(sb); + if (!root_inode) + goto out_fail; + root_inode->i_ino = MSDOS_ROOT_INO; + root_inode->i_version = 1; + error = fat_read_root(root_inode); + if (error < 0) { + iput(root_inode); + goto out_fail; + } + error = -ENOMEM; + insert_inode_hash(root_inode); + fat_attach(root_inode, 0); + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) { + fat_msg(sb, KERN_ERR, "get root inode failed"); + goto out_fail; + } + + if (sbi->options.discard) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + fat_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + + fat_set_state(sb, 1, 0); + return 0; + +out_invalid: + error = -EINVAL; + if (!silent) + fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); + +out_fail: + if (fsinfo_inode) + iput(fsinfo_inode); + if (fat_inode) + iput(fat_inode); + unload_nls(sbi->nls_io); + unload_nls(sbi->nls_disk); + if (sbi->options.iocharset != fat_default_iocharset) + kfree(sbi->options.iocharset); + sb->s_fs_info = NULL; + kfree(sbi); + return error; +} + +EXPORT_SYMBOL_GPL(fat_fill_super); + +/* + * helper function for fat_flush_inodes. This writes both the inode + * and the file data blocks, waiting for in flight data blocks before + * the start of the call. It does not wait for any io started + * during the call + */ +static int writeback_inode(struct inode *inode) +{ + + int ret; + + /* if we used wait=1, sync_inode_metadata waits for the io for the + * inode to finish. So wait=0 is sent down to sync_inode_metadata + * and filemap_fdatawrite is used for the data blocks + */ + ret = sync_inode_metadata(inode, 0); + if (!ret) + ret = filemap_fdatawrite(inode->i_mapping); + return ret; +} + +/* + * write data and metadata corresponding to i1 and i2. The io is + * started but we do not wait for any of it to finish. + * + * filemap_flush is used for the block device, so if there is a dirty + * page for a block already in flight, we will not wait and start the + * io over again + */ +int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) +{ + int ret = 0; + if (!MSDOS_SB(sb)->options.flush) + return 0; + if (i1) + ret = writeback_inode(i1); + if (!ret && i2) + ret = writeback_inode(i2); + if (!ret) { + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + ret = filemap_flush(mapping); + } + return ret; +} +EXPORT_SYMBOL_GPL(fat_flush_inodes); + +static int __init init_fat_fs(void) +{ + int err; + + err = fat_cache_init(); + if (err) + return err; + + err = fat_init_inodecache(); + if (err) + goto failed; + + return 0; + +failed: + fat_cache_destroy(); + return err; +} + +static void __exit exit_fat_fs(void) +{ + fat_cache_destroy(); + fat_destroy_inodecache(); +} + +module_init(init_fat_fs) +module_exit(exit_fat_fs) + +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/fat/misc.c b/kernel/fs/fat/misc.c new file mode 100644 index 000000000..c4589e981 --- /dev/null +++ b/kernel/fs/fat/misc.c @@ -0,0 +1,278 @@ +/* + * linux/fs/fat/misc.c + * + * Written 1992,1993 by Werner Almesberger + * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980 + * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) + */ + +#include "fat.h" + +/* + * fat_fs_error reports a file system problem that might indicate fa data + * corruption/inconsistency. Depending on 'errors' mount option the + * panic() is called, or error message is printed FAT and nothing is done, + * or filesystem is remounted read-only (default behavior). + * In case the file system is remounted read-only, it can be made writable + * again by remounting it. + */ +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) +{ + struct fat_mount_options *opts = &MSDOS_SB(sb)->options; + va_list args; + struct va_format vaf; + + if (report) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + fat_msg(sb, KERN_ERR, "error, %pV", &vaf); + va_end(args); + } + + if (opts->errors == FAT_ERRORS_PANIC) + panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); + else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { + sb->s_flags |= MS_RDONLY; + fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); + } +} +EXPORT_SYMBOL_GPL(__fat_fs_error); + +/** + * fat_msg() - print preformated FAT specific messages. Every thing what is + * not fat_fs_error() should be fat_msg(). + */ +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + +/* Flushes the number of free clusters on FAT32 */ +/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ +int fat_clusters_flush(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct fat_boot_fsinfo *fsinfo; + + if (sbi->fat_bits != 32) + return 0; + + bh = sb_bread(sb, sbi->fsinfo_sector); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush"); + return -EIO; + } + + fsinfo = (struct fat_boot_fsinfo *)bh->b_data; + /* Sanity check */ + if (!IS_FSINFO(fsinfo)) { + fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", + le32_to_cpu(fsinfo->signature1), + le32_to_cpu(fsinfo->signature2), + sbi->fsinfo_sector); + } else { + if (sbi->free_clusters != -1) + fsinfo->free_clusters = cpu_to_le32(sbi->free_clusters); + if (sbi->prev_free != -1) + fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); + mark_buffer_dirty(bh); + } + brelse(bh); + + return 0; +} + +/* + * fat_chain_add() adds a new cluster to the chain of clusters represented + * by inode. + */ +int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int ret, new_fclus, last; + + /* + * We must locate the last cluster of the file to add this new + * one (new_dclus) to the end of the link list (the FAT). + */ + last = new_fclus = 0; + if (MSDOS_I(inode)->i_start) { + int fclus, dclus; + + ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); + if (ret < 0) + return ret; + new_fclus = fclus + 1; + last = dclus; + } + + /* add new one to the last of the cluster chain */ + if (last) { + struct fat_entry fatent; + + fatent_init(&fatent); + ret = fat_ent_read(inode, &fatent, last); + if (ret >= 0) { + int wait = inode_needs_sync(inode); + ret = fat_ent_write(inode, &fatent, new_dclus, wait); + fatent_brelse(&fatent); + } + if (ret < 0) + return ret; + /* + * FIXME:Although we can add this cache, fat_cache_add() is + * assuming to be called after linear search with fat_cache_id. + */ +// fat_cache_add(inode, new_fclus, new_dclus); + } else { + MSDOS_I(inode)->i_start = new_dclus; + MSDOS_I(inode)->i_logstart = new_dclus; + /* + * Since generic_write_sync() synchronizes regular files later, + * we sync here only directories. + */ + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { + ret = fat_sync_inode(inode); + if (ret) + return ret; + } else + mark_inode_dirty(inode); + } + if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { + fat_fs_error(sb, "clusters badly computed (%d != %llu)", + new_fclus, + (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); + fat_cache_inval_inode(inode); + } + inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9); + + return 0; +} + +/* + * The epoch of FAT timestamp is 1980. + * : bits : value + * date: 0 - 4: day (1 - 31) + * date: 5 - 8: month (1 - 12) + * date: 9 - 15: year (0 - 127) from 1980 + * time: 0 - 4: sec (0 - 29) 2sec counts + * time: 5 - 10: min (0 - 59) + * time: 11 - 15: hour (0 - 23) + */ +#define SECS_PER_MIN 60 +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) +/* days between 1.1.70 and 1.1.80 (2 leap days) */ +#define DAYS_DELTA (365 * 10 + 2) +/* 120 (2100 - 1980) isn't leap year */ +#define YEAR_2100 120 +#define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100) + +/* Linear day numbers of the respective 1sts in non-leap years. */ +static time_t days_in_year[] = { + /* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ + 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, +}; + +/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */ +void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 __time, __le16 __date, u8 time_cs) +{ + u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date); + time_t second, day, leap_day, month, year; + + year = date >> 9; + month = max(1, (date >> 5) & 0xf); + day = max(1, date & 0x1f) - 1; + + leap_day = (year + 3) / 4; + if (year > YEAR_2100) /* 2100 isn't leap year */ + leap_day--; + if (IS_LEAP_YEAR(year) && month > 2) + leap_day++; + + second = (time & 0x1f) << 1; + second += ((time >> 5) & 0x3f) * SECS_PER_MIN; + second += (time >> 11) * SECS_PER_HOUR; + second += (year * 365 + leap_day + + days_in_year[month] + day + + DAYS_DELTA) * SECS_PER_DAY; + + if (!sbi->options.tz_set) + second += sys_tz.tz_minuteswest * SECS_PER_MIN; + else + second -= sbi->options.time_offset * SECS_PER_MIN; + + if (time_cs) { + ts->tv_sec = second + (time_cs / 100); + ts->tv_nsec = (time_cs % 100) * 10000000; + } else { + ts->tv_sec = second; + ts->tv_nsec = 0; + } +} + +/* Convert linear UNIX date to a FAT time/date pair. */ +void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 *time, __le16 *date, u8 *time_cs) +{ + struct tm tm; + time_to_tm(ts->tv_sec, + (sbi->options.tz_set ? sbi->options.time_offset : + -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm); + + /* FAT can only support year between 1980 to 2107 */ + if (tm.tm_year < 1980 - 1900) { + *time = 0; + *date = cpu_to_le16((0 << 9) | (1 << 5) | 1); + if (time_cs) + *time_cs = 0; + return; + } + if (tm.tm_year > 2107 - 1900) { + *time = cpu_to_le16((23 << 11) | (59 << 5) | 29); + *date = cpu_to_le16((127 << 9) | (12 << 5) | 31); + if (time_cs) + *time_cs = 199; + return; + } + + /* from 1900 -> from 1980 */ + tm.tm_year -= 80; + /* 0~11 -> 1~12 */ + tm.tm_mon++; + /* 0~59 -> 0~29(2sec counts) */ + tm.tm_sec >>= 1; + + *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec); + *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday); + if (time_cs) + *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000; +} +EXPORT_SYMBOL_GPL(fat_time_unix2fat); + +int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) +{ + int i, err = 0; + + for (i = 0; i < nr_bhs; i++) + write_dirty_buffer(bhs[i], WRITE); + + for (i = 0; i < nr_bhs; i++) { + wait_on_buffer(bhs[i]); + if (!err && !buffer_uptodate(bhs[i])) + err = -EIO; + } + return err; +} diff --git a/kernel/fs/fat/namei_msdos.c b/kernel/fs/fat/namei_msdos.c new file mode 100644 index 000000000..b7e2b33aa --- /dev/null +++ b/kernel/fs/fat/namei_msdos.c @@ -0,0 +1,684 @@ +/* + * linux/fs/msdos/namei.c + * + * Written 1992,1993 by Werner Almesberger + * Hidden files 1995 by Albert Cahalan + * Rewritten for constant inumbers 1999 by Al Viro + */ + +#include +#include "fat.h" + +/* Characters that are undesirable in an MS-DOS file name */ +static unsigned char bad_chars[] = "*?<>|\""; +static unsigned char bad_if_strict[] = "+=,; "; + +/***** Formats an MS-DOS file name. Rejects invalid names. */ +static int msdos_format_name(const unsigned char *name, int len, + unsigned char *res, struct fat_mount_options *opts) + /* + * name is the proposed name, len is its length, res is + * the resulting name, opts->name_check is either (r)elaxed, + * (n)ormal or (s)trict, opts->dotsOK allows dots at the + * beginning of name (for hidden files) + */ +{ + unsigned char *walk; + unsigned char c; + int space; + + if (name[0] == '.') { /* dotfile because . and .. already done */ + if (opts->dotsOK) { + /* Get rid of dot - test for it elsewhere */ + name++; + len--; + } else + return -EINVAL; + } + /* + * disallow names that _really_ start with a dot + */ + space = 1; + c = 0; + for (walk = res; len && walk - res < 8; walk++) { + c = *name++; + len--; + if (opts->name_check != 'r' && strchr(bad_chars, c)) + return -EINVAL; + if (opts->name_check == 's' && strchr(bad_if_strict, c)) + return -EINVAL; + if (c >= 'A' && c <= 'Z' && opts->name_check == 's') + return -EINVAL; + if (c < ' ' || c == ':' || c == '\\') + return -EINVAL; + /* + * 0xE5 is legal as a first character, but we must substitute + * 0x05 because 0xE5 marks deleted files. Yes, DOS really + * does this. + * It seems that Microsoft hacked DOS to support non-US + * characters after the 0xE5 character was already in use to + * mark deleted files. + */ + if ((res == walk) && (c == 0xE5)) + c = 0x05; + if (c == '.') + break; + space = (c == ' '); + *walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c; + } + if (space) + return -EINVAL; + if (opts->name_check == 's' && len && c != '.') { + c = *name++; + len--; + if (c != '.') + return -EINVAL; + } + while (c != '.' && len--) + c = *name++; + if (c == '.') { + while (walk - res < 8) + *walk++ = ' '; + while (len > 0 && walk - res < MSDOS_NAME) { + c = *name++; + len--; + if (opts->name_check != 'r' && strchr(bad_chars, c)) + return -EINVAL; + if (opts->name_check == 's' && + strchr(bad_if_strict, c)) + return -EINVAL; + if (c < ' ' || c == ':' || c == '\\') + return -EINVAL; + if (c == '.') { + if (opts->name_check == 's') + return -EINVAL; + break; + } + if (c >= 'A' && c <= 'Z' && opts->name_check == 's') + return -EINVAL; + space = c == ' '; + if (!opts->nocase && c >= 'a' && c <= 'z') + *walk++ = c - 32; + else + *walk++ = c; + } + if (space) + return -EINVAL; + if (opts->name_check == 's' && len) + return -EINVAL; + } + while (walk - res < MSDOS_NAME) + *walk++ = ' '; + + return 0; +} + +/***** Locates a directory entry. Uses unformatted name. */ +static int msdos_find(struct inode *dir, const unsigned char *name, int len, + struct fat_slot_info *sinfo) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + unsigned char msdos_name[MSDOS_NAME]; + int err; + + err = msdos_format_name(name, len, msdos_name, &sbi->options); + if (err) + return -ENOENT; + + err = fat_scan(dir, msdos_name, sinfo); + if (!err && sbi->options.dotsOK) { + if (name[0] == '.') { + if (!(sinfo->de->attr & ATTR_HIDDEN)) + err = -ENOENT; + } else { + if (sinfo->de->attr & ATTR_HIDDEN) + err = -ENOENT; + } + if (err) + brelse(sinfo->bh); + } + return err; +} + +/* + * Compute the hash for the msdos name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The msdos fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int msdos_hash(const struct dentry *dentry, struct qstr *qstr) +{ + struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; + unsigned char msdos_name[MSDOS_NAME]; + int error; + + error = msdos_format_name(qstr->name, qstr->len, msdos_name, options); + if (!error) + qstr->hash = full_name_hash(msdos_name, MSDOS_NAME); + return 0; +} + +/* + * Compare two msdos names. If either of the names are invalid, + * we fall back to doing the standard name comparison. + */ +static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; + unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; + int error; + + error = msdos_format_name(name->name, name->len, a_msdos_name, options); + if (error) + goto old_compare; + error = msdos_format_name(str, len, b_msdos_name, options); + if (error) + goto old_compare; + error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); +out: + return error; + +old_compare: + error = 1; + if (name->len == len) + error = memcmp(name->name, str, len); + goto out; +} + +static const struct dentry_operations msdos_dentry_operations = { + .d_hash = msdos_hash, + .d_compare = msdos_cmp, +}; + +/* + * AV. Wrappers for FAT sb operations. Is it wise? + */ + +/***** Get inode using directory and name */ +static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + int err; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + switch (err) { + case -ENOENT: + inode = NULL; + break; + case 0: + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + break; + default: + inode = ERR_PTR(err); + } + mutex_unlock(&MSDOS_SB(sb)->s_lock); + return d_splice_alias(inode, dentry); +} + +/***** Creates a directory entry (name is already formatted). */ +static int msdos_add_entry(struct inode *dir, const unsigned char *name, + int is_dir, int is_hid, int cluster, + struct timespec *ts, struct fat_slot_info *sinfo) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + struct msdos_dir_entry de; + __le16 time, date; + int err; + + memcpy(de.name, name, MSDOS_NAME); + de.attr = is_dir ? ATTR_DIR : ATTR_ARCH; + if (is_hid) + de.attr |= ATTR_HIDDEN; + de.lcase = 0; + fat_time_unix2fat(sbi, ts, &time, &date, NULL); + de.cdate = de.adate = 0; + de.ctime = 0; + de.ctime_cs = 0; + de.time = time; + de.date = date; + fat_set_start(&de, cluster); + de.size = 0; + + err = fat_add_entries(dir, &de, 1, sinfo); + if (err) + return err; + + dir->i_ctime = dir->i_mtime = *ts; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); + + return 0; +} + +/***** Create a file */ +static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = NULL; + struct fat_slot_info sinfo; + struct timespec ts; + unsigned char msdos_name[MSDOS_NAME]; + int err, is_hid; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + + err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, + msdos_name, &MSDOS_SB(sb)->options); + if (err) + goto out; + is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.'); + /* Have to do it due to foo vs. .foo conflicts */ + if (!fat_scan(dir, msdos_name, &sinfo)) { + brelse(sinfo.bh); + err = -EINVAL; + goto out; + } + + ts = CURRENT_TIME_SEC; + err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo); + if (err) + goto out; + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + d_instantiate(dentry, inode); +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + return err; +} + +/***** Remove a directory */ +static int msdos_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = d_inode(dentry); + struct fat_slot_info sinfo; + int err; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + /* + * Check whether the directory is not in use, then check + * whether it is empty. + */ + err = fat_dir_empty(inode); + if (err) + goto out; + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + drop_nlink(dir); + + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + + return err; +} + +/***** Make a directory */ +static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + unsigned char msdos_name[MSDOS_NAME]; + struct timespec ts; + int err, is_hid, cluster; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + + err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, + msdos_name, &MSDOS_SB(sb)->options); + if (err) + goto out; + is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.'); + /* foo vs .foo situation */ + if (!fat_scan(dir, msdos_name, &sinfo)) { + brelse(sinfo.bh); + err = -EINVAL; + goto out; + } + + ts = CURRENT_TIME_SEC; + cluster = fat_alloc_new_dir(dir, &ts); + if (cluster < 0) { + err = cluster; + goto out; + } + err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo); + if (err) + goto out_free; + inc_nlink(dir); + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + /* the directory was completed, just return a error */ + goto out; + } + set_nlink(inode, 2); + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + d_instantiate(dentry, inode); + + mutex_unlock(&MSDOS_SB(sb)->s_lock); + fat_flush_inodes(sb, dir, inode); + return 0; + +out_free: + fat_free_clusters(dir, cluster); +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + return err; +} + +/***** Unlink a file */ +static int msdos_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct super_block *sb = inode->i_sb; + struct fat_slot_info sinfo; + int err; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + + return err; +} + +static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, + struct dentry *old_dentry, + struct inode *new_dir, unsigned char *new_name, + struct dentry *new_dentry, int is_hid) +{ + struct buffer_head *dotdot_bh; + struct msdos_dir_entry *dotdot_de; + struct inode *old_inode, *new_inode; + struct fat_slot_info old_sinfo, sinfo; + struct timespec ts; + loff_t new_i_pos; + int err, old_attrs, is_dir, update_dotdot, corrupt = 0; + + old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); + + err = fat_scan(old_dir, old_name, &old_sinfo); + if (err) { + err = -EIO; + goto out; + } + + is_dir = S_ISDIR(old_inode->i_mode); + update_dotdot = (is_dir && old_dir != new_dir); + if (update_dotdot) { + if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) { + err = -EIO; + goto out; + } + } + + old_attrs = MSDOS_I(old_inode)->i_attrs; + err = fat_scan(new_dir, new_name, &sinfo); + if (!err) { + if (!new_inode) { + /* "foo" -> ".foo" case. just change the ATTR_HIDDEN */ + if (sinfo.de != old_sinfo.de) { + err = -EINVAL; + goto out; + } + if (is_hid) + MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN; + else + MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN; + if (IS_DIRSYNC(old_dir)) { + err = fat_sync_inode(old_inode); + if (err) { + MSDOS_I(old_inode)->i_attrs = old_attrs; + goto out; + } + } else + mark_inode_dirty(old_inode); + + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + goto out; + } + } + + ts = CURRENT_TIME_SEC; + if (new_inode) { + if (err) + goto out; + if (is_dir) { + err = fat_dir_empty(new_inode); + if (err) + goto out; + } + new_i_pos = MSDOS_I(new_inode)->i_pos; + fat_detach(new_inode); + } else { + err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0, + &ts, &sinfo); + if (err) + goto out; + new_i_pos = sinfo.i_pos; + } + new_dir->i_version++; + + fat_detach(old_inode); + fat_attach(old_inode, new_i_pos); + if (is_hid) + MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN; + else + MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN; + if (IS_DIRSYNC(new_dir)) { + err = fat_sync_inode(old_inode); + if (err) + goto error_inode; + } else + mark_inode_dirty(old_inode); + + if (update_dotdot) { + fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + if (IS_DIRSYNC(new_dir)) { + err = sync_dirty_buffer(dotdot_bh); + if (err) + goto error_dotdot; + } + drop_nlink(old_dir); + if (!new_inode) + inc_nlink(new_dir); + } + + err = fat_remove_entries(old_dir, &old_sinfo); /* and releases bh */ + old_sinfo.bh = NULL; + if (err) + goto error_dotdot; + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = ts; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + + if (new_inode) { + drop_nlink(new_inode); + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = ts; + } +out: + brelse(sinfo.bh); + brelse(dotdot_bh); + brelse(old_sinfo.bh); + return err; + +error_dotdot: + /* data cluster is shared, serious corruption */ + corrupt = 1; + + if (update_dotdot) { + fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + corrupt |= sync_dirty_buffer(dotdot_bh); + } +error_inode: + fat_detach(old_inode); + fat_attach(old_inode, old_sinfo.i_pos); + MSDOS_I(old_inode)->i_attrs = old_attrs; + if (new_inode) { + fat_attach(new_inode, new_i_pos); + if (corrupt) + corrupt |= fat_sync_inode(new_inode); + } else { + /* + * If new entry was not sharing the data cluster, it + * shouldn't be serious corruption. + */ + int err2 = fat_remove_entries(new_dir, &sinfo); + if (corrupt) + corrupt |= err2; + sinfo.bh = NULL; + } + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld)", + __func__, sinfo.i_pos); + } + goto out; +} + +/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ +static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; + int err, is_hid; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + + err = msdos_format_name(old_dentry->d_name.name, + old_dentry->d_name.len, old_msdos_name, + &MSDOS_SB(old_dir->i_sb)->options); + if (err) + goto out; + err = msdos_format_name(new_dentry->d_name.name, + new_dentry->d_name.len, new_msdos_name, + &MSDOS_SB(new_dir->i_sb)->options); + if (err) + goto out; + + is_hid = + (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.'); + + err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, + new_dir, new_msdos_name, new_dentry, is_hid); +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + if (!err) + err = fat_flush_inodes(sb, old_dir, new_dir); + return err; +} + +static const struct inode_operations msdos_dir_inode_operations = { + .create = msdos_create, + .lookup = msdos_lookup, + .unlink = msdos_unlink, + .mkdir = msdos_mkdir, + .rmdir = msdos_rmdir, + .rename = msdos_rename, + .setattr = fat_setattr, + .getattr = fat_getattr, +}; + +static void setup(struct super_block *sb) +{ + MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; + sb->s_d_op = &msdos_dentry_operations; + sb->s_flags |= MS_NOATIME; +} + +static int msdos_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, 0, setup); +} + +static struct dentry *msdos_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super); +} + +static struct file_system_type msdos_fs_type = { + .owner = THIS_MODULE, + .name = "msdos", + .mount = msdos_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("msdos"); + +static int __init init_msdos_fs(void) +{ + return register_filesystem(&msdos_fs_type); +} + +static void __exit exit_msdos_fs(void) +{ + unregister_filesystem(&msdos_fs_type); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Werner Almesberger"); +MODULE_DESCRIPTION("MS-DOS filesystem support"); + +module_init(init_msdos_fs) +module_exit(exit_msdos_fs) diff --git a/kernel/fs/fat/namei_vfat.c b/kernel/fs/fat/namei_vfat.c new file mode 100644 index 000000000..7092584f4 --- /dev/null +++ b/kernel/fs/fat/namei_vfat.c @@ -0,0 +1,1089 @@ +/* + * linux/fs/vfat/namei.c + * + * Written 1992,1993 by Werner Almesberger + * + * Windows95/Windows NT compatible extended MSDOS filesystem + * by Gordon Chaffee Copyright (C) 1995. Send bug reports for the + * VFAT filesystem to . Specify + * what file operation caused you trouble and if you can duplicate + * the problem, send a script that demonstrates it. + * + * Short name translation 1999, 2001 by Wolfram Pienkoss + * + * Support Multibyte characters and cleanup by + * OGAWA Hirofumi + */ + +#include +#include +#include +#include +#include "fat.h" + +/* + * If new entry was created in the parent, it could create the 8.3 + * alias (the shortname of logname). So, the parent may have the + * negative-dentry which matches the created 8.3 alias. + * + * If it happened, the negative dentry isn't actually negative + * anymore. So, drop it. + */ +static int vfat_revalidate_shortname(struct dentry *dentry) +{ + int ret = 1; + spin_lock(&dentry->d_lock); + if (dentry->d_time != d_inode(dentry->d_parent)->i_version) + ret = 0; + spin_unlock(&dentry->d_lock); + return ret; +} + +static int vfat_revalidate(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* This is not negative dentry. Always valid. */ + if (d_really_is_positive(dentry)) + return 1; + return vfat_revalidate_shortname(dentry); +} + +static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, + * and will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (d_really_is_positive(dentry)) + return 1; + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!flags) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + + return vfat_revalidate_shortname(dentry); +} + +/* returns the length of a struct qstr, ignoring trailing dots */ +static unsigned int __vfat_striptail_len(unsigned int len, const char *name) +{ + while (len && name[len - 1] == '.') + len--; + return len; +} + +static unsigned int vfat_striptail_len(const struct qstr *qstr) +{ + return __vfat_striptail_len(qstr->len, qstr->name); +} + +/* + * Compute the hash for the vfat name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The vfat fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int vfat_hash(const struct dentry *dentry, struct qstr *qstr) +{ + qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); + return 0; +} + +/* + * Compute the hash for the vfat name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The vfat fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr) +{ + struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; + const unsigned char *name; + unsigned int len; + unsigned long hash; + + name = qstr->name; + len = vfat_striptail_len(qstr); + + hash = init_name_hash(); + while (len--) + hash = partial_name_hash(nls_tolower(t, *name++), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Case insensitive compare of two vfat names. + */ +static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; + unsigned int alen, blen; + + /* A filename cannot end in '.' or we treat it like it has none */ + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); + if (alen == blen) { + if (nls_strnicmp(t, name->name, str, alen) == 0) + return 0; + } + return 1; +} + +/* + * Case sensitive compare of two vfat names. + */ +static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + unsigned int alen, blen; + + /* A filename cannot end in '.' or we treat it like it has none */ + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); + if (alen == blen) { + if (strncmp(name->name, str, alen) == 0) + return 0; + } + return 1; +} + +static const struct dentry_operations vfat_ci_dentry_ops = { + .d_revalidate = vfat_revalidate_ci, + .d_hash = vfat_hashi, + .d_compare = vfat_cmpi, +}; + +static const struct dentry_operations vfat_dentry_ops = { + .d_revalidate = vfat_revalidate, + .d_hash = vfat_hash, + .d_compare = vfat_cmp, +}; + +/* Characters that are undesirable in an MS-DOS file name */ + +static inline wchar_t vfat_bad_char(wchar_t w) +{ + return (w < 0x0020) + || (w == '*') || (w == '?') || (w == '<') || (w == '>') + || (w == '|') || (w == '"') || (w == ':') || (w == '/') + || (w == '\\'); +} + +static inline wchar_t vfat_replace_char(wchar_t w) +{ + return (w == '[') || (w == ']') || (w == ';') || (w == ',') + || (w == '+') || (w == '='); +} + +static wchar_t vfat_skip_char(wchar_t w) +{ + return (w == '.') || (w == ' '); +} + +static inline int vfat_is_used_badchars(const wchar_t *s, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (vfat_bad_char(s[i])) + return -EINVAL; + + if (s[i - 1] == ' ') /* last character cannot be space */ + return -EINVAL; + + return 0; +} + +static int vfat_find_form(struct inode *dir, unsigned char *name) +{ + struct fat_slot_info sinfo; + int err = fat_scan(dir, name, &sinfo); + if (err) + return -ENOENT; + brelse(sinfo.bh); + return 0; +} + +/* + * 1) Valid characters for the 8.3 format alias are any combination of + * letters, uppercase alphabets, digits, any of the + * following special characters: + * $ % ' ` - @ { } ~ ! # ( ) & _ ^ + * In this case Longfilename is not stored in disk. + * + * WinNT's Extension: + * File name and extension name is contain uppercase/lowercase + * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT. + * + * 2) File name is 8.3 format, but it contain the uppercase and + * lowercase char, muliti bytes char, etc. In this case numtail is not + * added, but Longfilename is stored. + * + * 3) When the one except for the above, or the following special + * character are contained: + * . [ ] ; , + = + * numtail is added, and Longfilename must be stored in disk . + */ +struct shortname_info { + unsigned char lower:1, + upper:1, + valid:1; +}; +#define INIT_SHORTNAME_INFO(x) do { \ + (x)->lower = 1; \ + (x)->upper = 1; \ + (x)->valid = 1; \ +} while (0) + +static inline int to_shortname_char(struct nls_table *nls, + unsigned char *buf, int buf_size, + wchar_t *src, struct shortname_info *info) +{ + int len; + + if (vfat_skip_char(*src)) { + info->valid = 0; + return 0; + } + if (vfat_replace_char(*src)) { + info->valid = 0; + buf[0] = '_'; + return 1; + } + + len = nls->uni2char(*src, buf, buf_size); + if (len <= 0) { + info->valid = 0; + buf[0] = '_'; + len = 1; + } else if (len == 1) { + unsigned char prev = buf[0]; + + if (buf[0] >= 0x7F) { + info->lower = 0; + info->upper = 0; + } + + buf[0] = nls_toupper(nls, buf[0]); + if (isalpha(buf[0])) { + if (buf[0] == prev) + info->lower = 0; + else + info->upper = 0; + } + } else { + info->lower = 0; + info->upper = 0; + } + + return len; +} + +/* + * Given a valid longname, create a unique shortname. Make sure the + * shortname does not exist + * Returns negative number on error, 0 for a normal + * return, and 1 for valid shortname + */ +static int vfat_create_shortname(struct inode *dir, struct nls_table *nls, + wchar_t *uname, int ulen, + unsigned char *name_res, unsigned char *lcase) +{ + struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; + wchar_t *ip, *ext_start, *end, *name_start; + unsigned char base[9], ext[4], buf[5], *p; + unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; + int chl, chi; + int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; + int is_shortname; + struct shortname_info base_info, ext_info; + + is_shortname = 1; + INIT_SHORTNAME_INFO(&base_info); + INIT_SHORTNAME_INFO(&ext_info); + + /* Now, we need to create a shortname from the long name */ + ext_start = end = &uname[ulen]; + while (--ext_start >= uname) { + if (*ext_start == 0x002E) { /* is `.' */ + if (ext_start == end - 1) { + sz = ulen; + ext_start = NULL; + } + break; + } + } + + if (ext_start == uname - 1) { + sz = ulen; + ext_start = NULL; + } else if (ext_start) { + /* + * Names which start with a dot could be just + * an extension eg. "...test". In this case Win95 + * uses the extension as the name and sets no extension. + */ + name_start = &uname[0]; + while (name_start < ext_start) { + if (!vfat_skip_char(*name_start)) + break; + name_start++; + } + if (name_start != ext_start) { + sz = ext_start - uname; + ext_start++; + } else { + sz = ulen; + ext_start = NULL; + } + } + + numtail_baselen = 6; + numtail2_baselen = 2; + for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) { + chl = to_shortname_char(nls, charbuf, sizeof(charbuf), + ip, &base_info); + if (chl == 0) + continue; + + if (baselen < 2 && (baselen + chl) > 2) + numtail2_baselen = baselen; + if (baselen < 6 && (baselen + chl) > 6) + numtail_baselen = baselen; + for (chi = 0; chi < chl; chi++) { + *p++ = charbuf[chi]; + baselen++; + if (baselen >= 8) + break; + } + if (baselen >= 8) { + if ((chi < chl - 1) || (ip + 1) - uname < sz) + is_shortname = 0; + break; + } + } + if (baselen == 0) { + return -EINVAL; + } + + extlen = 0; + if (ext_start) { + for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) { + chl = to_shortname_char(nls, charbuf, sizeof(charbuf), + ip, &ext_info); + if (chl == 0) + continue; + + if ((extlen + chl) > 3) { + is_shortname = 0; + break; + } + for (chi = 0; chi < chl; chi++) { + *p++ = charbuf[chi]; + extlen++; + } + if (extlen >= 3) { + if (ip + 1 != end) + is_shortname = 0; + break; + } + } + } + ext[extlen] = '\0'; + base[baselen] = '\0'; + + /* Yes, it can happen. ".\xe5" would do it. */ + if (base[0] == DELETED_FLAG) + base[0] = 0x05; + + /* OK, at this point we know that base is not longer than 8 symbols, + * ext is not longer than 3, base is nonempty, both don't contain + * any bad symbols (lowercase transformed to uppercase). + */ + + memset(name_res, ' ', MSDOS_NAME); + memcpy(name_res, base, baselen); + memcpy(name_res + 8, ext, extlen); + *lcase = 0; + if (is_shortname && base_info.valid && ext_info.valid) { + if (vfat_find_form(dir, name_res) == 0) + return -EEXIST; + + if (opts->shortname & VFAT_SFN_CREATE_WIN95) { + return (base_info.upper && ext_info.upper); + } else if (opts->shortname & VFAT_SFN_CREATE_WINNT) { + if ((base_info.upper || base_info.lower) && + (ext_info.upper || ext_info.lower)) { + if (!base_info.upper && base_info.lower) + *lcase |= CASE_LOWER_BASE; + if (!ext_info.upper && ext_info.lower) + *lcase |= CASE_LOWER_EXT; + return 1; + } + return 0; + } else { + BUG(); + } + } + + if (opts->numtail == 0) + if (vfat_find_form(dir, name_res) < 0) + return 0; + + /* + * Try to find a unique extension. This used to + * iterate through all possibilities sequentially, + * but that gave extremely bad performance. Windows + * only tries a few cases before using random + * values for part of the base. + */ + + if (baselen > 6) { + baselen = numtail_baselen; + name_res[7] = ' '; + } + name_res[baselen] = '~'; + for (i = 1; i < 10; i++) { + name_res[baselen + 1] = i + '0'; + if (vfat_find_form(dir, name_res) < 0) + return 0; + } + + i = jiffies; + sz = (jiffies >> 16) & 0x7; + if (baselen > 2) { + baselen = numtail2_baselen; + name_res[7] = ' '; + } + name_res[baselen + 4] = '~'; + name_res[baselen + 5] = '1' + sz; + while (1) { + snprintf(buf, sizeof(buf), "%04X", i & 0xffff); + memcpy(&name_res[baselen], buf, 4); + if (vfat_find_form(dir, name_res) < 0) + break; + i -= 11; + } + return 0; +} + +/* Translate a string, including coded sequences into Unicode */ +static int +xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, + int *longlen, int *outlen, int escape, int utf8, + struct nls_table *nls) +{ + const unsigned char *ip; + unsigned char nc; + unsigned char *op; + unsigned int ec; + int i, k, fill; + int charlen; + + if (utf8) { + *outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN, + (wchar_t *) outname, FAT_LFN_LEN + 2); + if (*outlen < 0) + return *outlen; + else if (*outlen > FAT_LFN_LEN) + return -ENAMETOOLONG; + + op = &outname[*outlen * sizeof(wchar_t)]; + } else { + for (i = 0, ip = name, op = outname, *outlen = 0; + i < len && *outlen < FAT_LFN_LEN; + *outlen += 1) { + if (escape && (*ip == ':')) { + if (i > len - 5) + return -EINVAL; + ec = 0; + for (k = 1; k < 5; k++) { + nc = ip[k]; + ec <<= 4; + if (nc >= '0' && nc <= '9') { + ec |= nc - '0'; + continue; + } + if (nc >= 'a' && nc <= 'f') { + ec |= nc - ('a' - 10); + continue; + } + if (nc >= 'A' && nc <= 'F') { + ec |= nc - ('A' - 10); + continue; + } + return -EINVAL; + } + *op++ = ec & 0xFF; + *op++ = ec >> 8; + ip += 5; + i += 5; + } else { + charlen = nls->char2uni(ip, len - i, + (wchar_t *)op); + if (charlen < 0) + return -EINVAL; + ip += charlen; + i += charlen; + op += 2; + } + } + if (i < len) + return -ENAMETOOLONG; + } + + *longlen = *outlen; + if (*outlen % 13) { + *op++ = 0; + *op++ = 0; + *outlen += 1; + if (*outlen % 13) { + fill = 13 - (*outlen % 13); + for (i = 0; i < fill; i++) { + *op++ = 0xff; + *op++ = 0xff; + } + *outlen += fill; + } + } + + return 0; +} + +static int vfat_build_slots(struct inode *dir, const unsigned char *name, + int len, int is_dir, int cluster, + struct timespec *ts, + struct msdos_dir_slot *slots, int *nr_slots) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + struct fat_mount_options *opts = &sbi->options; + struct msdos_dir_slot *ps; + struct msdos_dir_entry *de; + unsigned char cksum, lcase; + unsigned char msdos_name[MSDOS_NAME]; + wchar_t *uname; + __le16 time, date; + u8 time_cs; + int err, ulen, usize, i; + loff_t offset; + + *nr_slots = 0; + + uname = __getname(); + if (!uname) + return -ENOMEM; + + err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize, + opts->unicode_xlate, opts->utf8, sbi->nls_io); + if (err) + goto out_free; + + err = vfat_is_used_badchars(uname, ulen); + if (err) + goto out_free; + + err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen, + msdos_name, &lcase); + if (err < 0) + goto out_free; + else if (err == 1) { + de = (struct msdos_dir_entry *)slots; + err = 0; + goto shortname; + } + + /* build the entry of long file name */ + cksum = fat_checksum(msdos_name); + + *nr_slots = usize / 13; + for (ps = slots, i = *nr_slots; i > 0; i--, ps++) { + ps->id = i; + ps->attr = ATTR_EXT; + ps->reserved = 0; + ps->alias_checksum = cksum; + ps->start = 0; + offset = (i - 1) * 13; + fatwchar_to16(ps->name0_4, uname + offset, 5); + fatwchar_to16(ps->name5_10, uname + offset + 5, 6); + fatwchar_to16(ps->name11_12, uname + offset + 11, 2); + } + slots[0].id |= 0x40; + de = (struct msdos_dir_entry *)ps; + +shortname: + /* build the entry of 8.3 alias name */ + (*nr_slots)++; + memcpy(de->name, msdos_name, MSDOS_NAME); + de->attr = is_dir ? ATTR_DIR : ATTR_ARCH; + de->lcase = lcase; + fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); + de->time = de->ctime = time; + de->date = de->cdate = de->adate = date; + de->ctime_cs = time_cs; + fat_set_start(de, cluster); + de->size = 0; +out_free: + __putname(uname); + return err; +} + +static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir, + int cluster, struct timespec *ts, + struct fat_slot_info *sinfo) +{ + struct msdos_dir_slot *slots; + unsigned int len; + int err, nr_slots; + + len = vfat_striptail_len(qname); + if (len == 0) + return -ENOENT; + + slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS); + if (slots == NULL) + return -ENOMEM; + + err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts, + slots, &nr_slots); + if (err) + goto cleanup; + + err = fat_add_entries(dir, slots, nr_slots, sinfo); + if (err) + goto cleanup; + + /* update timestamp */ + dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); +cleanup: + kfree(slots); + return err; +} + +static int vfat_find(struct inode *dir, struct qstr *qname, + struct fat_slot_info *sinfo) +{ + unsigned int len = vfat_striptail_len(qname); + if (len == 0) + return -ENOENT; + return fat_search_long(dir, qname->name, len, sinfo); +} + +/* + * (nfsd's) anonymous disconnected dentry? + * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job). + */ +static int vfat_d_anon_disconn(struct dentry *dentry) +{ + return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED); +} + +static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + struct dentry *alias; + int err; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) { + if (err == -ENOENT) { + inode = NULL; + goto out; + } + goto error; + } + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + + alias = d_find_alias(inode); + /* + * Checking "alias->d_parent == dentry->d_parent" to make sure + * FS is not corrupted (especially double linked dir). + */ + if (alias && alias->d_parent == dentry->d_parent && + !vfat_d_anon_disconn(alias)) { + /* + * This inode has non anonymous-DCACHE_DISCONNECTED + * dentry. This means, the user did ->lookup() by an + * another name (longname vs 8.3 alias of it) in past. + * + * Switch to new one for reason of locality if possible. + */ + BUG_ON(d_unhashed(alias)); + if (!S_ISDIR(inode->i_mode)) + d_move(alias, dentry); + iput(inode); + mutex_unlock(&MSDOS_SB(sb)->s_lock); + return alias; + } else + dput(alias); + +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + if (!inode) + dentry->d_time = dir->i_version; + return d_splice_alias(inode, dentry); +error: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + return ERR_PTR(err); +} + +static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct fat_slot_info sinfo; + struct timespec ts; + int err; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + + ts = CURRENT_TIME_SEC; + err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); + if (err) + goto out; + dir->i_version++; + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + inode->i_version++; + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + d_instantiate(dentry, inode); +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + return err; +} + +static int vfat_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + int err; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + + err = fat_dir_empty(inode); + if (err) + goto out; + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + drop_nlink(dir); + + clear_nlink(inode); + inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; + fat_detach(inode); + dentry->d_time = dir->i_version; +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + + return err; +} + +static int vfat_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + int err; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + clear_nlink(inode); + inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; + fat_detach(inode); + dentry->d_time = dir->i_version; +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + + return err; +} + +static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct fat_slot_info sinfo; + struct timespec ts; + int err, cluster; + + mutex_lock(&MSDOS_SB(sb)->s_lock); + + ts = CURRENT_TIME_SEC; + cluster = fat_alloc_new_dir(dir, &ts); + if (cluster < 0) { + err = cluster; + goto out; + } + err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo); + if (err) + goto out_free; + dir->i_version++; + inc_nlink(dir); + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + /* the directory was completed, just return a error */ + goto out; + } + inode->i_version++; + set_nlink(inode, 2); + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + d_instantiate(dentry, inode); + + mutex_unlock(&MSDOS_SB(sb)->s_lock); + return 0; + +out_free: + fat_free_clusters(dir, cluster); +out: + mutex_unlock(&MSDOS_SB(sb)->s_lock); + return err; +} + +static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct buffer_head *dotdot_bh; + struct msdos_dir_entry *dotdot_de; + struct inode *old_inode, *new_inode; + struct fat_slot_info old_sinfo, sinfo; + struct timespec ts; + loff_t new_i_pos; + int err, is_dir, update_dotdot, corrupt = 0; + struct super_block *sb = old_dir->i_sb; + + old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); + mutex_lock(&MSDOS_SB(sb)->s_lock); + err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); + if (err) + goto out; + + is_dir = S_ISDIR(old_inode->i_mode); + update_dotdot = (is_dir && old_dir != new_dir); + if (update_dotdot) { + if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) { + err = -EIO; + goto out; + } + } + + ts = CURRENT_TIME_SEC; + if (new_inode) { + if (is_dir) { + err = fat_dir_empty(new_inode); + if (err) + goto out; + } + new_i_pos = MSDOS_I(new_inode)->i_pos; + fat_detach(new_inode); + } else { + err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0, + &ts, &sinfo); + if (err) + goto out; + new_i_pos = sinfo.i_pos; + } + new_dir->i_version++; + + fat_detach(old_inode); + fat_attach(old_inode, new_i_pos); + if (IS_DIRSYNC(new_dir)) { + err = fat_sync_inode(old_inode); + if (err) + goto error_inode; + } else + mark_inode_dirty(old_inode); + + if (update_dotdot) { + fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + if (IS_DIRSYNC(new_dir)) { + err = sync_dirty_buffer(dotdot_bh); + if (err) + goto error_dotdot; + } + drop_nlink(old_dir); + if (!new_inode) + inc_nlink(new_dir); + } + + err = fat_remove_entries(old_dir, &old_sinfo); /* and releases bh */ + old_sinfo.bh = NULL; + if (err) + goto error_dotdot; + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = ts; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + + if (new_inode) { + drop_nlink(new_inode); + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = ts; + } +out: + brelse(sinfo.bh); + brelse(dotdot_bh); + brelse(old_sinfo.bh); + mutex_unlock(&MSDOS_SB(sb)->s_lock); + + return err; + +error_dotdot: + /* data cluster is shared, serious corruption */ + corrupt = 1; + + if (update_dotdot) { + fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + corrupt |= sync_dirty_buffer(dotdot_bh); + } +error_inode: + fat_detach(old_inode); + fat_attach(old_inode, old_sinfo.i_pos); + if (new_inode) { + fat_attach(new_inode, new_i_pos); + if (corrupt) + corrupt |= fat_sync_inode(new_inode); + } else { + /* + * If new entry was not sharing the data cluster, it + * shouldn't be serious corruption. + */ + int err2 = fat_remove_entries(new_dir, &sinfo); + if (corrupt) + corrupt |= err2; + sinfo.bh = NULL; + } + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld)", + __func__, sinfo.i_pos); + } + goto out; +} + +static const struct inode_operations vfat_dir_inode_operations = { + .create = vfat_create, + .lookup = vfat_lookup, + .unlink = vfat_unlink, + .mkdir = vfat_mkdir, + .rmdir = vfat_rmdir, + .rename = vfat_rename, + .setattr = fat_setattr, + .getattr = fat_getattr, +}; + +static void setup(struct super_block *sb) +{ + MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations; + if (MSDOS_SB(sb)->options.name_check != 's') + sb->s_d_op = &vfat_ci_dentry_ops; + else + sb->s_d_op = &vfat_dentry_ops; +} + +static int vfat_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, 1, setup); +} + +static struct dentry *vfat_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super); +} + +static struct file_system_type vfat_fs_type = { + .owner = THIS_MODULE, + .name = "vfat", + .mount = vfat_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("vfat"); + +static int __init init_vfat_fs(void) +{ + return register_filesystem(&vfat_fs_type); +} + +static void __exit exit_vfat_fs(void) +{ + unregister_filesystem(&vfat_fs_type); +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("VFAT filesystem support"); +MODULE_AUTHOR("Gordon Chaffee"); + +module_init(init_vfat_fs) +module_exit(exit_vfat_fs) diff --git a/kernel/fs/fat/nfs.c b/kernel/fs/fat/nfs.c new file mode 100644 index 000000000..eb192656f --- /dev/null +++ b/kernel/fs/fat/nfs.c @@ -0,0 +1,301 @@ +/* fs/fat/nfs.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include "fat.h" + +struct fat_fid { + u32 i_gen; + u32 i_pos_low; + u16 i_pos_hi; + u16 parent_i_pos_hi; + u32 parent_i_pos_low; + u32 parent_i_gen; +}; + +#define FAT_FID_SIZE_WITHOUT_PARENT 3 +#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) + +/** + * Look up a directory inode given its starting cluster. + */ +static struct inode *fat_dget(struct super_block *sb, int i_logstart) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct hlist_head *head; + struct msdos_inode_info *i; + struct inode *inode = NULL; + + head = sbi->dir_hashtable + fat_dir_hash(i_logstart); + spin_lock(&sbi->dir_hash_lock); + hlist_for_each_entry(i, head, i_dir_hash) { + BUG_ON(i->vfs_inode.i_sb != sb); + if (i->i_logstart != i_logstart) + continue; + inode = igrab(&i->vfs_inode); + if (inode) + break; + } + spin_unlock(&sbi->dir_hash_lock); + return inode; +} + +static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos) +{ + if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) + return fat_iget(sb, i_pos); + + else { + if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) + return NULL; + return ilookup(sb, ino); + } +} + +static struct inode *__fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation, loff_t i_pos) +{ + struct inode *inode = fat_ilookup(sb, ino, i_pos); + + if (inode && generation && (inode->i_generation != generation)) { + iput(inode); + inode = NULL; + } + if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de ; + sector_t blocknr; + int offset; + fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset); + bh = sb_bread(sb, blocknr); + if (!bh) { + fat_msg(sb, KERN_ERR, + "unable to read block(%llu) for building NFS inode", + (llu)blocknr); + return inode; + } + de = (struct msdos_dir_entry *)bh->b_data; + /* If a file is deleted on server and client is not updated + * yet, we must not build the inode upon a lookup call. + */ + if (IS_FREE(de[offset].name)) + inode = NULL; + else + inode = fat_build_inode(sb, &de[offset], i_pos); + brelse(bh); + } + + return inode; +} + +static struct inode *fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + + return __fat_nfs_get_inode(sb, ino, generation, 0); +} + +static int +fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, + struct inode *parent) +{ + int len = *lenp; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct fat_fid *fid = (struct fat_fid *) fh; + loff_t i_pos; + int type = FILEID_FAT_WITHOUT_PARENT; + + if (parent) { + if (len < FAT_FID_SIZE_WITH_PARENT) { + *lenp = FAT_FID_SIZE_WITH_PARENT; + return FILEID_INVALID; + } + } else { + if (len < FAT_FID_SIZE_WITHOUT_PARENT) { + *lenp = FAT_FID_SIZE_WITHOUT_PARENT; + return FILEID_INVALID; + } + } + + i_pos = fat_i_pos_read(sbi, inode); + *lenp = FAT_FID_SIZE_WITHOUT_PARENT; + fid->i_gen = inode->i_generation; + fid->i_pos_low = i_pos & 0xFFFFFFFF; + fid->i_pos_hi = (i_pos >> 32) & 0xFFFF; + if (parent) { + i_pos = fat_i_pos_read(sbi, parent); + fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF; + fid->parent_i_pos_low = i_pos & 0xFFFFFFFF; + fid->parent_i_gen = parent->i_generation; + type = FILEID_FAT_WITH_PARENT; + *lenp = FAT_FID_SIZE_WITH_PARENT; + } + + return type; +} + +/** + * Map a NFS file handle to a corresponding dentry. + * The dentry may or may not be connected to the filesystem root. + */ +static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + fat_nfs_get_inode); +} + +static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb, + struct fid *fh, int fh_len, + int fh_type) +{ + struct inode *inode = NULL; + struct fat_fid *fid = (struct fat_fid *)fh; + loff_t i_pos; + + switch (fh_type) { + case FILEID_FAT_WITHOUT_PARENT: + if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT) + return NULL; + break; + case FILEID_FAT_WITH_PARENT: + if (fh_len < FAT_FID_SIZE_WITH_PARENT) + return NULL; + break; + default: + return NULL; + } + i_pos = fid->i_pos_hi; + i_pos = (i_pos << 32) | (fid->i_pos_low); + inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos); + + return d_obtain_alias(inode); +} + +/* + * Find the parent for a file specified by NFS handle. + * This requires that the handle contain the i_ino of the parent. + */ +static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + fat_nfs_get_inode); +} + +static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb, + struct fid *fh, int fh_len, + int fh_type) +{ + struct inode *inode = NULL; + struct fat_fid *fid = (struct fat_fid *)fh; + loff_t i_pos; + + if (fh_len < FAT_FID_SIZE_WITH_PARENT) + return NULL; + + switch (fh_type) { + case FILEID_FAT_WITH_PARENT: + i_pos = fid->parent_i_pos_hi; + i_pos = (i_pos << 32) | (fid->parent_i_pos_low); + inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos); + break; + } + + return d_obtain_alias(inode); +} + +/* + * Rebuild the parent for a directory that is not connected + * to the filesystem root + */ +static +struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) +{ + int search_clus, clus_to_match; + struct msdos_dir_entry *de; + struct inode *parent = NULL; + struct inode *dummy_grand_parent = NULL; + struct fat_slot_info sinfo; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart); + struct buffer_head *parent_bh = sb_bread(sb, blknr); + if (!parent_bh) { + fat_msg(sb, KERN_ERR, + "unable to read cluster of parent directory"); + return NULL; + } + + de = (struct msdos_dir_entry *) parent_bh->b_data; + clus_to_match = fat_get_start(sbi, &de[0]); + search_clus = fat_get_start(sbi, &de[1]); + + dummy_grand_parent = fat_dget(sb, search_clus); + if (!dummy_grand_parent) { + dummy_grand_parent = new_inode(sb); + if (!dummy_grand_parent) { + brelse(parent_bh); + return parent; + } + + dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO); + fat_fill_inode(dummy_grand_parent, &de[1]); + MSDOS_I(dummy_grand_parent)->i_pos = -1; + } + + if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo)) + parent = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + + brelse(parent_bh); + iput(dummy_grand_parent); + + return parent; +} + +/* + * Find the parent for a directory that is not currently connected to + * the filesystem root. + * + * On entry, the caller holds d_inode(child_dir)->i_mutex. + */ +static struct dentry *fat_get_parent(struct dentry *child_dir) +{ + struct super_block *sb = child_dir->d_sb; + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de; + struct inode *parent_inode = NULL; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + if (!fat_get_dotdot_entry(d_inode(child_dir), &bh, &de)) { + int parent_logstart = fat_get_start(sbi, de); + parent_inode = fat_dget(sb, parent_logstart); + if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) + parent_inode = fat_rebuild_parent(sb, parent_logstart); + } + brelse(bh); + + return d_obtain_alias(parent_inode); +} + +const struct export_operations fat_export_ops = { + .fh_to_dentry = fat_fh_to_dentry, + .fh_to_parent = fat_fh_to_parent, + .get_parent = fat_get_parent, +}; + +const struct export_operations fat_export_ops_nostale = { + .encode_fh = fat_encode_fh_nostale, + .fh_to_dentry = fat_fh_to_dentry_nostale, + .fh_to_parent = fat_fh_to_parent_nostale, + .get_parent = fat_get_parent, +}; diff --git a/kernel/fs/fcntl.c b/kernel/fs/fcntl.c new file mode 100644 index 000000000..ee85cd4e1 --- /dev/null +++ b/kernel/fs/fcntl.c @@ -0,0 +1,759 @@ +/* + * linux/fs/fcntl.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) + +static int setfl(int fd, struct file * filp, unsigned long arg) +{ + struct inode * inode = file_inode(filp); + int error = 0; + + /* + * O_APPEND cannot be cleared if the file is marked as append-only + * and the file is open for write. + */ + if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) + return -EPERM; + + /* O_NOATIME can only be set by the owner or superuser */ + if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) + if (!inode_owner_or_capable(inode)) + return -EPERM; + + /* required for strict SunOS emulation */ + if (O_NONBLOCK != O_NDELAY) + if (arg & O_NDELAY) + arg |= O_NONBLOCK; + + if (arg & O_DIRECT) { + if (!filp->f_mapping || !filp->f_mapping->a_ops || + !filp->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + + if (filp->f_op->check_flags) + error = filp->f_op->check_flags(arg); + if (error) + return error; + + /* + * ->fasync() is responsible for setting the FASYNC bit. + */ + if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) { + error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); + if (error < 0) + goto out; + if (error > 0) + error = 0; + } + spin_lock(&filp->f_lock); + filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + spin_unlock(&filp->f_lock); + + out: + return error; +} + +static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, + int force) +{ + write_lock_irq(&filp->f_owner.lock); + if (force || !filp->f_owner.pid) { + put_pid(filp->f_owner.pid); + filp->f_owner.pid = get_pid(pid); + filp->f_owner.pid_type = type; + + if (pid) { + const struct cred *cred = current_cred(); + filp->f_owner.uid = cred->uid; + filp->f_owner.euid = cred->euid; + } + } + write_unlock_irq(&filp->f_owner.lock); +} + +void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, + int force) +{ + security_file_set_fowner(filp); + f_modown(filp, pid, type, force); +} +EXPORT_SYMBOL(__f_setown); + +void f_setown(struct file *filp, unsigned long arg, int force) +{ + enum pid_type type; + struct pid *pid; + int who = arg; + type = PIDTYPE_PID; + if (who < 0) { + type = PIDTYPE_PGID; + who = -who; + } + rcu_read_lock(); + pid = find_vpid(who); + __f_setown(filp, pid, type, force); + rcu_read_unlock(); +} +EXPORT_SYMBOL(f_setown); + +void f_delown(struct file *filp) +{ + f_modown(filp, NULL, PIDTYPE_PID, 1); +} + +pid_t f_getown(struct file *filp) +{ + pid_t pid; + read_lock(&filp->f_owner.lock); + pid = pid_vnr(filp->f_owner.pid); + if (filp->f_owner.pid_type == PIDTYPE_PGID) + pid = -pid; + read_unlock(&filp->f_owner.lock); + return pid; +} + +static int f_setown_ex(struct file *filp, unsigned long arg) +{ + struct f_owner_ex __user *owner_p = (void __user *)arg; + struct f_owner_ex owner; + struct pid *pid; + int type; + int ret; + + ret = copy_from_user(&owner, owner_p, sizeof(owner)); + if (ret) + return -EFAULT; + + switch (owner.type) { + case F_OWNER_TID: + type = PIDTYPE_MAX; + break; + + case F_OWNER_PID: + type = PIDTYPE_PID; + break; + + case F_OWNER_PGRP: + type = PIDTYPE_PGID; + break; + + default: + return -EINVAL; + } + + rcu_read_lock(); + pid = find_vpid(owner.pid); + if (owner.pid && !pid) + ret = -ESRCH; + else + __f_setown(filp, pid, type, 1); + rcu_read_unlock(); + + return ret; +} + +static int f_getown_ex(struct file *filp, unsigned long arg) +{ + struct f_owner_ex __user *owner_p = (void __user *)arg; + struct f_owner_ex owner; + int ret = 0; + + read_lock(&filp->f_owner.lock); + owner.pid = pid_vnr(filp->f_owner.pid); + switch (filp->f_owner.pid_type) { + case PIDTYPE_MAX: + owner.type = F_OWNER_TID; + break; + + case PIDTYPE_PID: + owner.type = F_OWNER_PID; + break; + + case PIDTYPE_PGID: + owner.type = F_OWNER_PGRP; + break; + + default: + WARN_ON(1); + ret = -EINVAL; + break; + } + read_unlock(&filp->f_owner.lock); + + if (!ret) { + ret = copy_to_user(owner_p, &owner, sizeof(owner)); + if (ret) + ret = -EFAULT; + } + return ret; +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +static int f_getowner_uids(struct file *filp, unsigned long arg) +{ + struct user_namespace *user_ns = current_user_ns(); + uid_t __user *dst = (void __user *)arg; + uid_t src[2]; + int err; + + read_lock(&filp->f_owner.lock); + src[0] = from_kuid(user_ns, filp->f_owner.uid); + src[1] = from_kuid(user_ns, filp->f_owner.euid); + read_unlock(&filp->f_owner.lock); + + err = put_user(src[0], &dst[0]); + err |= put_user(src[1], &dst[1]); + + return err; +} +#else +static int f_getowner_uids(struct file *filp, unsigned long arg) +{ + return -EINVAL; +} +#endif + +static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, + struct file *filp) +{ + long err = -EINVAL; + + switch (cmd) { + case F_DUPFD: + err = f_dupfd(arg, filp, 0); + break; + case F_DUPFD_CLOEXEC: + err = f_dupfd(arg, filp, O_CLOEXEC); + break; + case F_GETFD: + err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; + break; + case F_SETFD: + err = 0; + set_close_on_exec(fd, arg & FD_CLOEXEC); + break; + case F_GETFL: + err = filp->f_flags; + break; + case F_SETFL: + err = setfl(fd, filp, arg); + break; +#if BITS_PER_LONG != 32 + /* 32-bit arches must use fcntl64() */ + case F_OFD_GETLK: +#endif + case F_GETLK: + err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); + break; +#if BITS_PER_LONG != 32 + /* 32-bit arches must use fcntl64() */ + case F_OFD_SETLK: + case F_OFD_SETLKW: +#endif + /* Fallthrough */ + case F_SETLK: + case F_SETLKW: + err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); + break; + case F_GETOWN: + /* + * XXX If f_owner is a process group, the + * negative return value will get converted + * into an error. Oops. If we keep the + * current syscall conventions, the only way + * to fix this will be in libc. + */ + err = f_getown(filp); + force_successful_syscall_return(); + break; + case F_SETOWN: + f_setown(filp, arg, 1); + err = 0; + break; + case F_GETOWN_EX: + err = f_getown_ex(filp, arg); + break; + case F_SETOWN_EX: + err = f_setown_ex(filp, arg); + break; + case F_GETOWNER_UIDS: + err = f_getowner_uids(filp, arg); + break; + case F_GETSIG: + err = filp->f_owner.signum; + break; + case F_SETSIG: + /* arg == 0 restores default behaviour. */ + if (!valid_signal(arg)) { + break; + } + err = 0; + filp->f_owner.signum = arg; + break; + case F_GETLEASE: + err = fcntl_getlease(filp); + break; + case F_SETLEASE: + err = fcntl_setlease(fd, filp, arg); + break; + case F_NOTIFY: + err = fcntl_dirnotify(fd, filp, arg); + break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; + case F_ADD_SEALS: + case F_GET_SEALS: + err = shmem_fcntl(filp, cmd, arg); + break; + default: + break; + } + return err; +} + +static int check_fcntl_cmd(unsigned cmd) +{ + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + case F_GETFD: + case F_SETFD: + case F_GETFL: + return 1; + } + return 0; +} + +SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +{ + struct fd f = fdget_raw(fd); + long err = -EBADF; + + if (!f.file) + goto out; + + if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) + goto out1; + } + + err = security_file_fcntl(f.file, cmd, arg); + if (!err) + err = do_fcntl(fd, cmd, arg, f.file); + +out1: + fdput(f); +out: + return err; +} + +#if BITS_PER_LONG == 32 +SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + unsigned long, arg) +{ + struct fd f = fdget_raw(fd); + long err = -EBADF; + + if (!f.file) + goto out; + + if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) + goto out1; + } + + err = security_file_fcntl(f.file, cmd, arg); + if (err) + goto out1; + + switch (cmd) { + case F_GETLK64: + case F_OFD_GETLK: + err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); + break; + case F_SETLK64: + case F_SETLKW64: + case F_OFD_SETLK: + case F_OFD_SETLKW: + err = fcntl_setlk64(fd, f.file, cmd, + (struct flock64 __user *) arg); + break; + default: + err = do_fcntl(fd, cmd, arg, f.file); + break; + } +out1: + fdput(f); +out: + return err; +} +#endif + +/* Table to convert sigio signal codes into poll band bitmaps */ + +static const long band_table[NSIGPOLL] = { + POLLIN | POLLRDNORM, /* POLL_IN */ + POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ + POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */ + POLLERR, /* POLL_ERR */ + POLLPRI | POLLRDBAND, /* POLL_PRI */ + POLLHUP | POLLERR /* POLL_HUP */ +}; + +static inline int sigio_perm(struct task_struct *p, + struct fown_struct *fown, int sig) +{ + const struct cred *cred; + int ret; + + rcu_read_lock(); + cred = __task_cred(p); + ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) || + uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) || + uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) && + !security_file_send_sigiotask(p, fown, sig)); + rcu_read_unlock(); + return ret; +} + +static void send_sigio_to_task(struct task_struct *p, + struct fown_struct *fown, + int fd, int reason, int group) +{ + /* + * F_SETSIG can change ->signum lockless in parallel, make + * sure we read it once and use the same value throughout. + */ + int signum = ACCESS_ONCE(fown->signum); + + if (!sigio_perm(p, fown, signum)) + return; + + switch (signum) { + siginfo_t si; + default: + /* Queue a rt signal with the appropriate fd as its + value. We use SI_SIGIO as the source, not + SI_KERNEL, since kernel signals always get + delivered even if we can't queue. Failure to + queue in this case _should_ be reported; we fall + back to SIGIO in that case. --sct */ + si.si_signo = signum; + si.si_errno = 0; + si.si_code = reason; + /* Make sure we are called with one of the POLL_* + reasons, otherwise we could leak kernel stack into + userspace. */ + BUG_ON((reason & __SI_MASK) != __SI_POLL); + if (reason - POLL_IN >= NSIGPOLL) + si.si_band = ~0L; + else + si.si_band = band_table[reason - POLL_IN]; + si.si_fd = fd; + if (!do_send_sig_info(signum, &si, p, group)) + break; + /* fall-through: fall back on the old plain SIGIO signal */ + case 0: + do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group); + } +} + +void send_sigio(struct fown_struct *fown, int fd, int band) +{ + struct task_struct *p; + enum pid_type type; + struct pid *pid; + int group = 1; + + read_lock(&fown->lock); + + type = fown->pid_type; + if (type == PIDTYPE_MAX) { + group = 0; + type = PIDTYPE_PID; + } + + pid = fown->pid; + if (!pid) + goto out_unlock_fown; + + read_lock(&tasklist_lock); + do_each_pid_task(pid, type, p) { + send_sigio_to_task(p, fown, fd, band, group); + } while_each_pid_task(pid, type, p); + read_unlock(&tasklist_lock); + out_unlock_fown: + read_unlock(&fown->lock); +} + +static void send_sigurg_to_task(struct task_struct *p, + struct fown_struct *fown, int group) +{ + if (sigio_perm(p, fown, SIGURG)) + do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group); +} + +int send_sigurg(struct fown_struct *fown) +{ + struct task_struct *p; + enum pid_type type; + struct pid *pid; + int group = 1; + int ret = 0; + + read_lock(&fown->lock); + + type = fown->pid_type; + if (type == PIDTYPE_MAX) { + group = 0; + type = PIDTYPE_PID; + } + + pid = fown->pid; + if (!pid) + goto out_unlock_fown; + + ret = 1; + + read_lock(&tasklist_lock); + do_each_pid_task(pid, type, p) { + send_sigurg_to_task(p, fown, group); + } while_each_pid_task(pid, type, p); + read_unlock(&tasklist_lock); + out_unlock_fown: + read_unlock(&fown->lock); + return ret; +} + +static DEFINE_SPINLOCK(fasync_lock); +static struct kmem_cache *fasync_cache __read_mostly; + +static void fasync_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(fasync_cache, + container_of(head, struct fasync_struct, fa_rcu)); +} + +/* + * Remove a fasync entry. If successfully removed, return + * positive and clear the FASYNC flag. If no entry exists, + * do nothing and return 0. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + * + */ +int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *fa, **fp; + int result = 0; + + spin_lock(&filp->f_lock); + spin_lock(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + + spin_lock_irq(&fa->fa_lock); + fa->fa_file = NULL; + spin_unlock_irq(&fa->fa_lock); + + *fp = fa->fa_next; + call_rcu(&fa->fa_rcu, fasync_free_rcu); + filp->f_flags &= ~FASYNC; + result = 1; + break; + } + spin_unlock(&fasync_lock); + spin_unlock(&filp->f_lock); + return result; +} + +struct fasync_struct *fasync_alloc(void) +{ + return kmem_cache_alloc(fasync_cache, GFP_KERNEL); +} + +/* + * NOTE! This can be used only for unused fasync entries: + * entries that actually got inserted on the fasync list + * need to be released by rcu - see fasync_remove_entry. + */ +void fasync_free(struct fasync_struct *new) +{ + kmem_cache_free(fasync_cache, new); +} + +/* + * Insert a new entry into the fasync list. Return the pointer to the + * old one if we didn't use the new one. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + */ +struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) +{ + struct fasync_struct *fa, **fp; + + spin_lock(&filp->f_lock); + spin_lock(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + + spin_lock_irq(&fa->fa_lock); + fa->fa_fd = fd; + spin_unlock_irq(&fa->fa_lock); + goto out; + } + + spin_lock_init(&new->fa_lock); + new->magic = FASYNC_MAGIC; + new->fa_file = filp; + new->fa_fd = fd; + new->fa_next = *fapp; + rcu_assign_pointer(*fapp, new); + filp->f_flags |= FASYNC; + +out: + spin_unlock(&fasync_lock); + spin_unlock(&filp->f_lock); + return fa; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new; + + new = fasync_alloc(); + if (!new) + return -ENOMEM; + + /* + * fasync_insert_entry() returns the old (update) entry if + * it existed. + * + * So free the (unused) new entry and return 0 to let the + * caller know that we didn't add any new fasync entries. + */ + if (fasync_insert_entry(fd, filp, fapp, new)) { + fasync_free(new); + return 0; + } + + return 1; +} + +/* + * fasync_helper() is used by almost all character device drivers + * to set up the fasync queue, and for regular files by the file + * lease code. It returns negative on error, 0 if it did no changes + * and positive if it added/deleted the entry. + */ +int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +{ + if (!on) + return fasync_remove_entry(filp, fapp); + return fasync_add_entry(fd, filp, fapp); +} + +EXPORT_SYMBOL(fasync_helper); + +/* + * rcu_read_lock() is held + */ +static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) +{ + while (fa) { + struct fown_struct *fown; + unsigned long flags; + + if (fa->magic != FASYNC_MAGIC) { + printk(KERN_ERR "kill_fasync: bad magic number in " + "fasync_struct!\n"); + return; + } + spin_lock_irqsave(&fa->fa_lock, flags); + if (fa->fa_file) { + fown = &fa->fa_file->f_owner; + /* Don't send SIGURG to processes which have not set a + queued signum: SIGURG has its own default signalling + mechanism. */ + if (!(sig == SIGURG && fown->signum == 0)) + send_sigio(fown, fa->fa_fd, band); + } + spin_unlock_irqrestore(&fa->fa_lock, flags); + fa = rcu_dereference(fa->fa_next); + } +} + +void kill_fasync(struct fasync_struct **fp, int sig, int band) +{ + /* First a quick test without locking: usually + * the list is empty. + */ + if (*fp) { + rcu_read_lock(); + kill_fasync_rcu(rcu_dereference(*fp), sig, band); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(kill_fasync); + +static int __init fcntl_init(void) +{ + /* + * Please add new bits here to ensure allocation uniqueness. + * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY + * is defined as O_NONBLOCK on some platforms and not on others. + */ + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + O_RDONLY | O_WRONLY | O_RDWR | + O_CREAT | O_EXCL | O_NOCTTY | + O_TRUNC | O_APPEND | /* O_NONBLOCK | */ + __O_SYNC | O_DSYNC | FASYNC | + O_DIRECT | O_LARGEFILE | O_DIRECTORY | + O_NOFOLLOW | O_NOATIME | O_CLOEXEC | + __FMODE_EXEC | O_PATH | __O_TMPFILE | + __FMODE_NONOTIFY + )); + + fasync_cache = kmem_cache_create("fasync_cache", + sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + return 0; +} + +module_init(fcntl_init) diff --git a/kernel/fs/fhandle.c b/kernel/fs/fhandle.c new file mode 100644 index 000000000..d59712dfa --- /dev/null +++ b/kernel/fs/fhandle.c @@ -0,0 +1,266 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "mount.h" + +static long do_sys_name_to_handle(struct path *path, + struct file_handle __user *ufh, + int __user *mnt_id) +{ + long retval; + struct file_handle f_handle; + int handle_dwords, handle_bytes; + struct file_handle *handle = NULL; + + /* + * We need to make sure whether the file system + * support decoding of the file handle + */ + if (!path->dentry->d_sb->s_export_op || + !path->dentry->d_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if (f_handle.handle_bytes > MAX_HANDLE_SZ) + return -EINVAL; + + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) + return -ENOMEM; + + /* convert handle size to multiple of sizeof(u32) */ + handle_dwords = f_handle.handle_bytes >> 2; + + /* we ask for a non connected handle */ + retval = exportfs_encode_fh(path->dentry, + (struct fid *)handle->f_handle, + &handle_dwords, 0); + handle->handle_type = retval; + /* convert handle size to bytes */ + handle_bytes = handle_dwords * sizeof(u32); + handle->handle_bytes = handle_bytes; + if ((handle->handle_bytes > f_handle.handle_bytes) || + (retval == FILEID_INVALID) || (retval == -ENOSPC)) { + /* As per old exportfs_encode_fh documentation + * we could return ENOSPC to indicate overflow + * But file system returned 255 always. So handle + * both the values + */ + /* + * set the handle size to zero so we copy only + * non variable part of the file_handle + */ + handle_bytes = 0; + retval = -EOVERFLOW; + } else + retval = 0; + /* copy the mount id */ + if (copy_to_user(mnt_id, &real_mount(path->mnt)->mnt_id, + sizeof(*mnt_id)) || + copy_to_user(ufh, handle, + sizeof(struct file_handle) + handle_bytes)) + retval = -EFAULT; + kfree(handle); + return retval; +} + +/** + * sys_name_to_handle_at: convert name to handle + * @dfd: directory relative to which name is interpreted if not absolute + * @name: name that should be converted to handle. + * @handle: resulting file handle + * @mnt_id: mount id of the file system containing the file + * @flag: flag value to indicate whether to follow symlink or not + * + * @handle->handle_size indicate the space available to store the + * variable part of the file handle in bytes. If there is not + * enough space, the field is updated to return the minimum + * value required. + */ +SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, + struct file_handle __user *, handle, int __user *, mnt_id, + int, flag) +{ + struct path path; + int lookup_flags; + int err; + + if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + err = user_path_at(dfd, name, lookup_flags, &path); + if (!err) { + err = do_sys_name_to_handle(&path, handle, mnt_id); + path_put(&path); + } + return err; +} + +static struct vfsmount *get_vfsmount_from_fd(int fd) +{ + struct vfsmount *mnt; + + if (fd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + mnt = mntget(fs->pwd.mnt); + spin_unlock(&fs->lock); + } else { + struct fd f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + mnt = mntget(f.file->f_path.mnt); + fdput(f); + } + return mnt; +} + +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) +{ + return 1; +} + +static int do_handle_to_path(int mountdirfd, struct file_handle *handle, + struct path *path) +{ + int retval = 0; + int handle_dwords; + + path->mnt = get_vfsmount_from_fd(mountdirfd); + if (IS_ERR(path->mnt)) { + retval = PTR_ERR(path->mnt); + goto out_err; + } + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh(path->mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + vfs_dentry_acceptable, NULL); + if (IS_ERR(path->dentry)) { + retval = PTR_ERR(path->dentry); + goto out_mnt; + } + return 0; +out_mnt: + mntput(path->mnt); +out_err: + return retval; +} + +static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, + struct path *path) +{ + int retval = 0; + struct file_handle f_handle; + struct file_handle *handle = NULL; + + /* + * With handle we don't look at the execute bit on the + * the directory. Ideally we would like CAP_DAC_SEARCH. + * But we don't have that + */ + if (!capable(CAP_DAC_READ_SEARCH)) { + retval = -EPERM; + goto out_err; + } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { + retval = -EFAULT; + goto out_err; + } + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) { + retval = -EINVAL; + goto out_err; + } + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) { + retval = -ENOMEM; + goto out_err; + } + /* copy the full handle */ + *handle = f_handle; + if (copy_from_user(&handle->f_handle, + &ufh->f_handle, + f_handle.handle_bytes)) { + retval = -EFAULT; + goto out_handle; + } + + retval = do_handle_to_path(mountdirfd, handle, path); + +out_handle: + kfree(handle); +out_err: + return retval; +} + +long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag) +{ + long retval = 0; + struct path path; + struct file *file; + int fd; + + retval = handle_to_path(mountdirfd, ufh, &path); + if (retval) + return retval; + + fd = get_unused_fd_flags(open_flag); + if (fd < 0) { + path_put(&path); + return fd; + } + file = file_open_root(path.dentry, path.mnt, "", open_flag); + if (IS_ERR(file)) { + put_unused_fd(fd); + retval = PTR_ERR(file); + } else { + retval = fd; + fsnotify_open(file); + fd_install(fd, file); + } + path_put(&path); + return retval; +} + +/** + * sys_open_by_handle_at: Open the file handle + * @mountdirfd: directory file descriptor + * @handle: file handle to be opened + * @flag: open flags. + * + * @mountdirfd indicate the directory file descriptor + * of the mount point. file handle is decoded relative + * to the vfsmount pointed by the @mountdirfd. @flags + * value is same as the open(2) flags. + */ +SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, + int, flags) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_handle_open(mountdirfd, handle, flags); + return ret; +} diff --git a/kernel/fs/file.c b/kernel/fs/file.c new file mode 100644 index 000000000..93c5f89c2 --- /dev/null +++ b/kernel/fs/file.c @@ -0,0 +1,914 @@ +/* + * linux/fs/file.c + * + * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes + * + * Manage the dynamic fd arrays in the process files_struct. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int sysctl_nr_open __read_mostly = 1024*1024; +int sysctl_nr_open_min = BITS_PER_LONG; +/* our max() is unusable in constant expressions ;-/ */ +#define __const_max(x, y) ((x) < (y) ? (x) : (y)) +int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; + +static void *alloc_fdmem(size_t size) +{ + /* + * Very large allocations can stress page reclaim, so fall back to + * vmalloc() if the allocation size will be considered "large" by the VM. + */ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); + if (data != NULL) + return data; + } + return vmalloc(size); +} + +static void __free_fdtable(struct fdtable *fdt) +{ + kvfree(fdt->fd); + kvfree(fdt->open_fds); + kfree(fdt); +} + +static void free_fdtable_rcu(struct rcu_head *rcu) +{ + __free_fdtable(container_of(rcu, struct fdtable, rcu)); +} + +/* + * Expand the fdset in the files_struct. Called with the files spinlock + * held for write. + */ +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) +{ + unsigned int cpy, set; + + BUG_ON(nfdt->max_fds < ofdt->max_fds); + + cpy = ofdt->max_fds * sizeof(struct file *); + set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); + memcpy(nfdt->fd, ofdt->fd, cpy); + memset((char *)(nfdt->fd) + cpy, 0, set); + + cpy = ofdt->max_fds / BITS_PER_BYTE; + set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; + memcpy(nfdt->open_fds, ofdt->open_fds, cpy); + memset((char *)(nfdt->open_fds) + cpy, 0, set); + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); + memset((char *)(nfdt->close_on_exec) + cpy, 0, set); +} + +static struct fdtable * alloc_fdtable(unsigned int nr) +{ + struct fdtable *fdt; + void *data; + + /* + * Figure out how many fds we actually want to support in this fdtable. + * Allocation steps are keyed to the size of the fdarray, since it + * grows far faster than any of the other dynamic data. We try to fit + * the fdarray into comfortable page-tuned chunks: starting at 1024B + * and growing in powers of two from there on. + */ + nr /= (1024 / sizeof(struct file *)); + nr = roundup_pow_of_two(nr + 1); + nr *= (1024 / sizeof(struct file *)); + /* + * Note that this can drive nr *below* what we had passed if sysctl_nr_open + * had been set lower between the check in expand_files() and here. Deal + * with that in caller, it's cheaper that way. + * + * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise + * bitmaps handling below becomes unpleasant, to put it mildly... + */ + if (unlikely(nr > sysctl_nr_open)) + nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + if (!fdt) + goto out; + fdt->max_fds = nr; + data = alloc_fdmem(nr * sizeof(struct file *)); + if (!data) + goto out_fdt; + fdt->fd = data; + + data = alloc_fdmem(max_t(size_t, + 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + if (!data) + goto out_arr; + fdt->open_fds = data; + data += nr / BITS_PER_BYTE; + fdt->close_on_exec = data; + + return fdt; + +out_arr: + kvfree(fdt->fd); +out_fdt: + kfree(fdt); +out: + return NULL; +} + +/* + * Expand the file descriptor table. + * This function will allocate a new fdtable and both fd array and fdset, of + * the given size. + * Return <0 error code on error; 1 on successful completion. + * The files->file_lock should be held on entry, and will be held on exit. + */ +static int expand_fdtable(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) +{ + struct fdtable *new_fdt, *cur_fdt; + + spin_unlock(&files->file_lock); + new_fdt = alloc_fdtable(nr); + spin_lock(&files->file_lock); + if (!new_fdt) + return -ENOMEM; + /* + * extremely unlikely race - sysctl_nr_open decreased between the check in + * caller and alloc_fdtable(). Cheaper to catch it here... + */ + if (unlikely(new_fdt->max_fds <= nr)) { + __free_fdtable(new_fdt); + return -EMFILE; + } + /* + * Check again since another task may have expanded the fd table while + * we dropped the lock + */ + cur_fdt = files_fdtable(files); + if (nr >= cur_fdt->max_fds) { + /* Continue as planned */ + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt != &files->fdtab) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + } else { + /* Somebody else expanded, so undo our attempt */ + __free_fdtable(new_fdt); + } + return 1; +} + +/* + * Expand files. + * This function will expand the file structures, if the requested size exceeds + * the current capacity and there is room for expansion. + * Return <0 error code on error; 0 when nothing done; 1 when files were + * expanded and execution may have blocked. + * The files->file_lock should be held on entry, and will be held on exit. + */ +static int expand_files(struct files_struct *files, int nr) +{ + struct fdtable *fdt; + + fdt = files_fdtable(files); + + /* Do we need to expand? */ + if (nr < fdt->max_fds) + return 0; + + /* Can we expand? */ + if (nr >= sysctl_nr_open) + return -EMFILE; + + /* All good, so we try */ + return expand_fdtable(files, nr); +} + +static inline void __set_close_on_exec(int fd, struct fdtable *fdt) +{ + __set_bit(fd, fdt->close_on_exec); +} + +static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) +{ + __clear_bit(fd, fdt->close_on_exec); +} + +static inline void __set_open_fd(int fd, struct fdtable *fdt) +{ + __set_bit(fd, fdt->open_fds); +} + +static inline void __clear_open_fd(int fd, struct fdtable *fdt) +{ + __clear_bit(fd, fdt->open_fds); +} + +static int count_open_files(struct fdtable *fdt) +{ + int size = fdt->max_fds; + int i; + + /* Find the last open fd */ + for (i = size / BITS_PER_LONG; i > 0; ) { + if (fdt->open_fds[--i]) + break; + } + i = (i + 1) * BITS_PER_LONG; + return i; +} + +/* + * Allocate a new files structure and copy contents from the + * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. + */ +struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +{ + struct files_struct *newf; + struct file **old_fds, **new_fds; + int open_files, size, i; + struct fdtable *old_fdt, *new_fdt; + + *errorp = -ENOMEM; + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + newf->next_fd = 0; + new_fdt = &newf->fdtab; + new_fdt->max_fds = NR_OPEN_DEFAULT; + new_fdt->close_on_exec = newf->close_on_exec_init; + new_fdt->open_fds = newf->open_fds_init; + new_fdt->fd = &newf->fd_array[0]; + + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); + + /* + * Check whether we need to allocate a larger fd array and fd set. + */ + while (unlikely(open_files > new_fdt->max_fds)) { + spin_unlock(&oldf->file_lock); + + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); + + new_fdt = alloc_fdtable(open_files - 1); + if (!new_fdt) { + *errorp = -ENOMEM; + goto out_release; + } + + /* beyond sysctl_nr_open; nothing to do */ + if (unlikely(new_fdt->max_fds < open_files)) { + __free_fdtable(new_fdt); + *errorp = -EMFILE; + goto out_release; + } + + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); + } + + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; + + memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); + memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); + + for (i = open_files; i != 0; i--) { + struct file *f = *old_fds++; + if (f) { + get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. + */ + __clear_open_fd(open_files - i, new_fdt); + } + rcu_assign_pointer(*new_fds++, f); + } + spin_unlock(&oldf->file_lock); + + /* compute the remainder to be cleared */ + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); + + /* This is long word aligned thus could use a optimized version */ + memset(new_fds, 0, size); + + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds - open_files) / 8; + int start = open_files / BITS_PER_LONG; + + memset(&new_fdt->open_fds[start], 0, left); + memset(&new_fdt->close_on_exec[start], 0, left); + } + + rcu_assign_pointer(newf->fdt, new_fdt); + + return newf; + +out_release: + kmem_cache_free(files_cachep, newf); +out: + return NULL; +} + +static struct fdtable *close_files(struct files_struct * files) +{ + /* + * It is safe to dereference the fd table without RCU or + * ->file_lock because this is the last reference to the + * files structure. + */ + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + int i, j = 0; + + for (;;) { + unsigned long set; + i = j * BITS_PER_LONG; + if (i >= fdt->max_fds) + break; + set = fdt->open_fds[j++]; + while (set) { + if (set & 1) { + struct file * file = xchg(&fdt->fd[i], NULL); + if (file) { + filp_close(file, files); + cond_resched_rcu_qs(); + } + } + i++; + set >>= 1; + } + } + + return fdt; +} + +struct files_struct *get_files_struct(struct task_struct *task) +{ + struct files_struct *files; + + task_lock(task); + files = task->files; + if (files) + atomic_inc(&files->count); + task_unlock(task); + + return files; +} + +void put_files_struct(struct files_struct *files) +{ + if (atomic_dec_and_test(&files->count)) { + struct fdtable *fdt = close_files(files); + + /* free the arrays if they are not embedded */ + if (fdt != &files->fdtab) + __free_fdtable(fdt); + kmem_cache_free(files_cachep, files); + } +} + +void reset_files_struct(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} + +void exit_files(struct task_struct *tsk) +{ + struct files_struct * files = tsk->files; + + if (files) { + task_lock(tsk); + tsk->files = NULL; + task_unlock(tsk); + put_files_struct(files); + } +} + +struct files_struct init_files = { + .count = ATOMIC_INIT(1), + .fdt = &init_files.fdtab, + .fdtab = { + .max_fds = NR_OPEN_DEFAULT, + .fd = &init_files.fd_array[0], + .close_on_exec = init_files.close_on_exec_init, + .open_fds = init_files.open_fds_init, + }, + .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), +}; + +/* + * allocate a file descriptor, mark it busy. + */ +int __alloc_fd(struct files_struct *files, + unsigned start, unsigned end, unsigned flags) +{ + unsigned int fd; + int error; + struct fdtable *fdt; + + spin_lock(&files->file_lock); +repeat: + fdt = files_fdtable(files); + fd = start; + if (fd < files->next_fd) + fd = files->next_fd; + + if (fd < fdt->max_fds) + fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); + + /* + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. + */ + error = -EMFILE; + if (fd >= end) + goto out; + + error = expand_files(files, fd); + if (error < 0) + goto out; + + /* + * If we needed to expand the fs array we + * might have blocked - try again. + */ + if (error) + goto repeat; + + if (start <= files->next_fd) + files->next_fd = fd + 1; + + __set_open_fd(fd, fdt); + if (flags & O_CLOEXEC) + __set_close_on_exec(fd, fdt); + else + __clear_close_on_exec(fd, fdt); + error = fd; +#if 1 + /* Sanity check */ + if (rcu_access_pointer(fdt->fd[fd]) != NULL) { + printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); + rcu_assign_pointer(fdt->fd[fd], NULL); + } +#endif + +out: + spin_unlock(&files->file_lock); + return error; +} + +static int alloc_fd(unsigned start, unsigned flags) +{ + return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); +} + +int get_unused_fd_flags(unsigned flags) +{ + return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); +} +EXPORT_SYMBOL(get_unused_fd_flags); + +static void __put_unused_fd(struct files_struct *files, unsigned int fd) +{ + struct fdtable *fdt = files_fdtable(files); + __clear_open_fd(fd, fdt); + if (fd < files->next_fd) + files->next_fd = fd; +} + +void put_unused_fd(unsigned int fd) +{ + struct files_struct *files = current->files; + spin_lock(&files->file_lock); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); +} + +EXPORT_SYMBOL(put_unused_fd); + +/* + * Install a file pointer in the fd array. + * + * The VFS is full of places where we drop the files lock between + * setting the open_fds bitmap and installing the file in the file + * array. At any such point, we are vulnerable to a dup2() race + * installing a file in the array before us. We need to detect this and + * fput() the struct file we are about to overwrite in this case. + * + * It should never happen - if we allow dup2() do it, _really_ bad things + * will follow. + * + * NOTE: __fd_install() variant is really, really low-level; don't + * use it unless you are forced to by truly lousy API shoved down + * your throat. 'files' *MUST* be either current->files or obtained + * by get_files_struct(current) done by whoever had given it to you, + * or really bad things will happen. Normally you want to use + * fd_install() instead. + */ + +void __fd_install(struct files_struct *files, unsigned int fd, + struct file *file) +{ + struct fdtable *fdt; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + +void fd_install(unsigned int fd, struct file *file) +{ + __fd_install(current->files, fd, file); +} + +EXPORT_SYMBOL(fd_install); + +/* + * The same warnings as for __alloc_fd()/__fd_install() apply here... + */ +int __close_fd(struct files_struct *files, unsigned fd) +{ + struct file *file; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (fd >= fdt->max_fds) + goto out_unlock; + file = fdt->fd[fd]; + if (!file) + goto out_unlock; + rcu_assign_pointer(fdt->fd[fd], NULL); + __clear_close_on_exec(fd, fdt); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); + return filp_close(file, files); + +out_unlock: + spin_unlock(&files->file_lock); + return -EBADF; +} + +void do_close_on_exec(struct files_struct *files) +{ + unsigned i; + struct fdtable *fdt; + + /* exec unshares first */ + spin_lock(&files->file_lock); + for (i = 0; ; i++) { + unsigned long set; + unsigned fd = i * BITS_PER_LONG; + fdt = files_fdtable(files); + if (fd >= fdt->max_fds) + break; + set = fdt->close_on_exec[i]; + if (!set) + continue; + fdt->close_on_exec[i] = 0; + for ( ; set ; fd++, set >>= 1) { + struct file *file; + if (!(set & 1)) + continue; + file = fdt->fd[fd]; + if (!file) + continue; + rcu_assign_pointer(fdt->fd[fd], NULL); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); + filp_close(file, files); + cond_resched(); + spin_lock(&files->file_lock); + } + + } + spin_unlock(&files->file_lock); +} + +static struct file *__fget(unsigned int fd, fmode_t mask) +{ + struct files_struct *files = current->files; + struct file *file; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if ((file->f_mode & mask) || !get_file_rcu(file)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +struct file *fget(unsigned int fd) +{ + return __fget(fd, FMODE_PATH); +} +EXPORT_SYMBOL(fget); + +struct file *fget_raw(unsigned int fd) +{ + return __fget(fd, 0); +} +EXPORT_SYMBOL(fget_raw); + +/* + * Lightweight file lookup - no refcnt increment if fd table isn't shared. + * + * You can use this instead of fget if you satisfy all of the following + * conditions: + * 1) You must call fput_light before exiting the syscall and returning control + * to userspace (i.e. you cannot remember the returned struct file * after + * returning to userspace). + * 2) You must not call filp_close on the returned struct file * in between + * calls to fget_light and fput_light. + * 3) You must not clone the current task in between the calls to fget_light + * and fput_light. + * + * The fput_needed flag returned by fget_light should be passed to the + * corresponding fput_light. + */ +static unsigned long __fget_light(unsigned int fd, fmode_t mask) +{ + struct files_struct *files = current->files; + struct file *file; + + if (atomic_read(&files->count) == 1) { + file = __fcheck_files(files, fd); + if (!file || unlikely(file->f_mode & mask)) + return 0; + return (unsigned long)file; + } else { + file = __fget(fd, mask); + if (!file) + return 0; + return FDPUT_FPUT | (unsigned long)file; + } +} +unsigned long __fdget(unsigned int fd) +{ + return __fget_light(fd, FMODE_PATH); +} +EXPORT_SYMBOL(__fdget); + +unsigned long __fdget_raw(unsigned int fd) +{ + return __fget_light(fd, 0); +} + +unsigned long __fdget_pos(unsigned int fd) +{ + unsigned long v = __fdget(fd); + struct file *file = (struct file *)(v & ~3); + + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { + if (file_count(file) > 1) { + v |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); + } + } + return v; +} + +/* + * We only lock f_pos if we have threads or if the file might be + * shared with another process. In both cases we'll have an elevated + * file count (done either by fdget() or by fork()). + */ + +void set_close_on_exec(unsigned int fd, int flag) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (flag) + __set_close_on_exec(fd, fdt); + else + __clear_close_on_exec(fd, fdt); + spin_unlock(&files->file_lock); +} + +bool get_close_on_exec(unsigned int fd) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + bool res; + rcu_read_lock(); + fdt = files_fdtable(files); + res = close_on_exec(fd, fdt); + rcu_read_unlock(); + return res; +} + +static int do_dup2(struct files_struct *files, + struct file *file, unsigned fd, unsigned flags) +__releases(&files->file_lock) +{ + struct file *tofree; + struct fdtable *fdt; + + /* + * We need to detect attempts to do dup2() over allocated but still + * not finished descriptor. NB: OpenBSD avoids that at the price of + * extra work in their equivalent of fget() - they insert struct + * file immediately after grabbing descriptor, mark it larval if + * more work (e.g. actual opening) is needed and make sure that + * fget() treats larval files as absent. Potentially interesting, + * but while extra work in fget() is trivial, locking implications + * and amount of surgery on open()-related paths in VFS are not. + * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" + * deadlocks in rather amusing ways, AFAICS. All of that is out of + * scope of POSIX or SUS, since neither considers shared descriptor + * tables and this condition does not arise without those. + */ + fdt = files_fdtable(files); + tofree = fdt->fd[fd]; + if (!tofree && fd_is_open(fd, fdt)) + goto Ebusy; + get_file(file); + rcu_assign_pointer(fdt->fd[fd], file); + __set_open_fd(fd, fdt); + if (flags & O_CLOEXEC) + __set_close_on_exec(fd, fdt); + else + __clear_close_on_exec(fd, fdt); + spin_unlock(&files->file_lock); + + if (tofree) + filp_close(tofree, files); + + return fd; + +Ebusy: + spin_unlock(&files->file_lock); + return -EBUSY; +} + +int replace_fd(unsigned fd, struct file *file, unsigned flags) +{ + int err; + struct files_struct *files = current->files; + + if (!file) + return __close_fd(files, fd); + + if (fd >= rlimit(RLIMIT_NOFILE)) + return -EBADF; + + spin_lock(&files->file_lock); + err = expand_files(files, fd); + if (unlikely(err < 0)) + goto out_unlock; + return do_dup2(files, file, fd, flags); + +out_unlock: + spin_unlock(&files->file_lock); + return err; +} + +SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +{ + int err = -EBADF; + struct file *file; + struct files_struct *files = current->files; + + if ((flags & ~O_CLOEXEC) != 0) + return -EINVAL; + + if (unlikely(oldfd == newfd)) + return -EINVAL; + + if (newfd >= rlimit(RLIMIT_NOFILE)) + return -EBADF; + + spin_lock(&files->file_lock); + err = expand_files(files, newfd); + file = fcheck(oldfd); + if (unlikely(!file)) + goto Ebadf; + if (unlikely(err < 0)) { + if (err == -EMFILE) + goto Ebadf; + goto out_unlock; + } + return do_dup2(files, file, newfd, flags); + +Ebadf: + err = -EBADF; +out_unlock: + spin_unlock(&files->file_lock); + return err; +} + +SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) +{ + if (unlikely(newfd == oldfd)) { /* corner case */ + struct files_struct *files = current->files; + int retval = oldfd; + + rcu_read_lock(); + if (!fcheck_files(files, oldfd)) + retval = -EBADF; + rcu_read_unlock(); + return retval; + } + return sys_dup3(oldfd, newfd, 0); +} + +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ + int ret = -EBADF; + struct file *file = fget_raw(fildes); + + if (file) { + ret = get_unused_fd_flags(0); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + } + return ret; +} + +int f_dupfd(unsigned int from, struct file *file, unsigned flags) +{ + int err; + if (from >= rlimit(RLIMIT_NOFILE)) + return -EINVAL; + err = alloc_fd(from, flags); + if (err >= 0) { + get_file(file); + fd_install(err, file); + } + return err; +} + +int iterate_fd(struct files_struct *files, unsigned n, + int (*f)(const void *, struct file *, unsigned), + const void *p) +{ + struct fdtable *fdt; + int res = 0; + if (!files) + return 0; + spin_lock(&files->file_lock); + for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { + struct file *file; + file = rcu_dereference_check_fdtable(files, fdt->fd[n]); + if (!file) + continue; + res = f(p, file, n); + if (res) + break; + } + spin_unlock(&files->file_lock); + return res; +} +EXPORT_SYMBOL(iterate_fd); diff --git a/kernel/fs/file_table.c b/kernel/fs/file_table.c new file mode 100644 index 000000000..294174dcc --- /dev/null +++ b/kernel/fs/file_table.c @@ -0,0 +1,327 @@ +/* + * linux/fs/file_table.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +/* sysctl tunables... */ +struct files_stat_struct files_stat = { + .max_files = NR_FILE +}; + +/* SLAB cache for file structures */ +static struct kmem_cache *filp_cachep __read_mostly; + +static struct percpu_counter nr_files __cacheline_aligned_in_smp; + +static void file_free_rcu(struct rcu_head *head) +{ + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + + put_cred(f->f_cred); + kmem_cache_free(filp_cachep, f); +} + +static inline void file_free(struct file *f) +{ + percpu_counter_dec(&nr_files); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); +} + +/* + * Return the total number of open files in the system + */ +static long get_nr_files(void) +{ + return percpu_counter_read_positive(&nr_files); +} + +/* + * Return the maximum number of open files in the system + */ +unsigned long get_max_files(void) +{ + return files_stat.max_files; +} +EXPORT_SYMBOL_GPL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat.nr_files = get_nr_files(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#else +int proc_nr_files(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + +/* Find an unused file structure and return a pointer to it. + * Returns an error pointer if some error happend e.g. we over file + * structures limit, run out of memory or operation is not permitted. + * + * Be very careful using this. You are responsible for + * getting write access to any mount that you might assign + * to this filp, if it is opened for write. If this is not + * done, you will imbalance int the mount's writer count + * and a warning at __fput() time. + */ +struct file *get_empty_filp(void) +{ + const struct cred *cred = current_cred(); + static long old_max; + struct file *f; + int error; + + /* + * Privileged users can go above max_files + */ + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) + goto over; + } + + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + percpu_counter_inc(&nr_files); + f->f_cred = get_cred(cred); + error = security_file_alloc(f); + if (unlikely(error)) { + file_free(f); + return ERR_PTR(error); + } + + atomic_long_set(&f->f_count, 1); + rwlock_init(&f->f_owner.lock); + spin_lock_init(&f->f_lock); + mutex_init(&f->f_pos_lock); + eventpoll_init_file(f); + /* f->f_version: 0 */ + return f; + +over: + /* Ran out of filps - report that */ + if (get_nr_files() > old_max) { + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); + old_max = get_nr_files(); + } + return ERR_PTR(-ENFILE); +} + +/** + * alloc_file - allocate and initialize a 'struct file' + * + * @path: the (dentry, vfsmount) pair for the new file + * @mode: the mode with which the new file will be opened + * @fop: the 'struct file_operations' for the new file + */ +struct file *alloc_file(struct path *path, fmode_t mode, + const struct file_operations *fop) +{ + struct file *file; + + file = get_empty_filp(); + if (IS_ERR(file)) + return file; + + file->f_path = *path; + file->f_inode = path->dentry->d_inode; + file->f_mapping = path->dentry->d_inode->i_mapping; + if ((mode & FMODE_READ) && + likely(fop->read || fop->read_iter)) + mode |= FMODE_CAN_READ; + if ((mode & FMODE_WRITE) && + likely(fop->write || fop->write_iter)) + mode |= FMODE_CAN_WRITE; + file->f_mode = mode; + file->f_op = fop; + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(path->dentry->d_inode); + return file; +} +EXPORT_SYMBOL(alloc_file); + +/* the real guts of fput() - releasing the last reference to file + */ +static void __fput(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct vfsmount *mnt = file->f_path.mnt; + struct inode *inode = file->f_inode; + + might_sleep(); + + fsnotify_close(file); + /* + * The function eventpoll_release() should be the first called + * in the file cleanup chain. + */ + eventpoll_release(file); + locks_remove_file(file); + + if (unlikely(file->f_flags & FASYNC)) { + if (file->f_op->fasync) + file->f_op->fasync(-1, file, 0); + } + ima_file_free(file); + if (file->f_op->release) + file->f_op->release(inode, file); + security_file_free(file); + if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && + !(file->f_mode & FMODE_PATH))) { + cdev_put(inode->i_cdev); + } + fops_put(file->f_op); + put_pid(file->f_owner.pid); + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_dec(inode); + if (file->f_mode & FMODE_WRITER) { + put_write_access(inode); + __mnt_drop_write(mnt); + } + file->f_path.dentry = NULL; + file->f_path.mnt = NULL; + file->f_inode = NULL; + file_free(file); + dput(dentry); + mntput(mnt); +} + +static LLIST_HEAD(delayed_fput_list); +static void delayed_fput(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&delayed_fput_list); + struct llist_node *next; + + for (; node; node = next) { + next = llist_next(node); + __fput(llist_entry(node, struct file, f_u.fu_llist)); + } +} + +static void ____fput(struct callback_head *work) +{ + __fput(container_of(work, struct file, f_u.fu_rcuhead)); +} + +/* + * If kernel thread really needs to have the final fput() it has done + * to complete, call this. The only user right now is the boot - we + * *do* need to make sure our writes to binaries on initramfs has + * not left us with opened struct file waiting for __fput() - execve() + * won't work without that. Please, don't add more callers without + * very good reasons; in particular, never call that with locks + * held and never call that from a thread that might need to do + * some work on any kind of umount. + */ +void flush_delayed_fput(void) +{ + delayed_fput(NULL); +} + +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); + +void fput(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) { + struct task_struct *task = current; + + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_u.fu_rcuhead, ____fput); + if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) + return; + /* + * After this task has run exit_task_work(), + * task_work_add() will fail. Fall through to delayed + * fput to avoid leaking *file. + */ + } + + if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) + schedule_delayed_work(&delayed_fput_work, 1); + } +} + +/* + * synchronous analog of fput(); for kernel threads that might be needed + * in some umount() (and thus can't use flush_delayed_fput() without + * risking deadlocks), need to wait for completion of __fput() and know + * for this specific struct file it won't involve anything that would + * need them. Use only if you really need it - at the very least, + * don't blindly convert fput() by kernel thread to that. + */ +void __fput_sync(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) { + struct task_struct *task = current; + BUG_ON(!(task->flags & PF_KTHREAD)); + __fput(file); + } +} + +EXPORT_SYMBOL(fput); + +void put_filp(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) { + security_file_free(file); + file_free(file); + } +} + +void __init files_init(unsigned long mempages) +{ + unsigned long n; + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + /* + * One file with associated inode and dcache is very roughly 1K. + * Per default don't use more than 10% of our memory for files. + */ + + n = (mempages * (PAGE_SIZE / 1024)) / 10; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); +} diff --git a/kernel/fs/filesystems.c b/kernel/fs/filesystems.c new file mode 100644 index 000000000..5797d45a7 --- /dev/null +++ b/kernel/fs/filesystems.c @@ -0,0 +1,288 @@ +/* + * linux/fs/filesystems.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * table of configured filesystems + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Handling of filesystem drivers list. + * Rules: + * Inclusion to/removals from/scanning of list are protected by spinlock. + * During the unload module must call unregister_filesystem(). + * We can access the fields of list element if: + * 1) spinlock is held or + * 2) we hold the reference to the module. + * The latter can be guaranteed by call of try_module_get(); if it + * returned 0 we must skip the element, otherwise we got the reference. + * Once the reference is obtained we can drop the spinlock. + */ + +static struct file_system_type *file_systems; +static DEFINE_RWLOCK(file_systems_lock); + +/* WARNING: This can be used only if we _already_ own a reference */ +void get_filesystem(struct file_system_type *fs) +{ + __module_get(fs->owner); +} + +void put_filesystem(struct file_system_type *fs) +{ + module_put(fs->owner); +} + +static struct file_system_type **find_filesystem(const char *name, unsigned len) +{ + struct file_system_type **p; + for (p=&file_systems; *p; p=&(*p)->next) + if (strlen((*p)->name) == len && + strncmp((*p)->name, name, len) == 0) + break; + return p; +} + +/** + * register_filesystem - register a new filesystem + * @fs: the file system structure + * + * Adds the file system passed to the list of file systems the kernel + * is aware of for mount and other syscalls. Returns 0 on success, + * or a negative errno code on an error. + * + * The &struct file_system_type that is passed is linked into the kernel + * structures and must not be freed until the file system has been + * unregistered. + */ + +int register_filesystem(struct file_system_type * fs) +{ + int res = 0; + struct file_system_type ** p; + + BUG_ON(strchr(fs->name, '.')); + if (fs->next) + return -EBUSY; + write_lock(&file_systems_lock); + p = find_filesystem(fs->name, strlen(fs->name)); + if (*p) + res = -EBUSY; + else + *p = fs; + write_unlock(&file_systems_lock); + return res; +} + +EXPORT_SYMBOL(register_filesystem); + +/** + * unregister_filesystem - unregister a file system + * @fs: filesystem to unregister + * + * Remove a file system that was previously successfully registered + * with the kernel. An error is returned if the file system is not found. + * Zero is returned on a success. + * + * Once this function has returned the &struct file_system_type structure + * may be freed or reused. + */ + +int unregister_filesystem(struct file_system_type * fs) +{ + struct file_system_type ** tmp; + + write_lock(&file_systems_lock); + tmp = &file_systems; + while (*tmp) { + if (fs == *tmp) { + *tmp = fs->next; + fs->next = NULL; + write_unlock(&file_systems_lock); + synchronize_rcu(); + return 0; + } + tmp = &(*tmp)->next; + } + write_unlock(&file_systems_lock); + + return -EINVAL; +} + +EXPORT_SYMBOL(unregister_filesystem); + +#ifdef CONFIG_SYSFS_SYSCALL +static int fs_index(const char __user * __name) +{ + struct file_system_type * tmp; + struct filename *name; + int err, index; + + name = getname(__name); + err = PTR_ERR(name); + if (IS_ERR(name)) + return err; + + err = -EINVAL; + read_lock(&file_systems_lock); + for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { + if (strcmp(tmp->name, name->name) == 0) { + err = index; + break; + } + } + read_unlock(&file_systems_lock); + putname(name); + return err; +} + +static int fs_name(unsigned int index, char __user * buf) +{ + struct file_system_type * tmp; + int len, res; + + read_lock(&file_systems_lock); + for (tmp = file_systems; tmp; tmp = tmp->next, index--) + if (index <= 0 && try_module_get(tmp->owner)) + break; + read_unlock(&file_systems_lock); + if (!tmp) + return -EINVAL; + + /* OK, we got the reference, so we can safely block */ + len = strlen(tmp->name) + 1; + res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; + put_filesystem(tmp); + return res; +} + +static int fs_maxindex(void) +{ + struct file_system_type * tmp; + int index; + + read_lock(&file_systems_lock); + for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) + ; + read_unlock(&file_systems_lock); + return index; +} + +/* + * Whee.. Weird sysv syscall. + */ +SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) +{ + int retval = -EINVAL; + + switch (option) { + case 1: + retval = fs_index((const char __user *) arg1); + break; + + case 2: + retval = fs_name(arg1, (char __user *) arg2); + break; + + case 3: + retval = fs_maxindex(); + break; + } + return retval; +} +#endif + +int __init get_filesystem_list(char *buf) +{ + int len = 0; + struct file_system_type * tmp; + + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp && len < PAGE_SIZE - 80) { + len += sprintf(buf+len, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); + return len; +} + +#ifdef CONFIG_PROC_FS +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_system_type * tmp; + + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp) { + seq_printf(m, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); + return 0; +} + +static int filesystems_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, filesystems_proc_show, NULL); +} + +static const struct file_operations filesystems_proc_fops = { + .open = filesystems_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_filesystems_init(void) +{ + proc_create("filesystems", 0, NULL, &filesystems_proc_fops); + return 0; +} +module_init(proc_filesystems_init); +#endif + +static struct file_system_type *__get_fs_type(const char *name, int len) +{ + struct file_system_type *fs; + + read_lock(&file_systems_lock); + fs = *(find_filesystem(name, len)); + if (fs && !try_module_get(fs->owner)) + fs = NULL; + read_unlock(&file_systems_lock); + return fs; +} + +struct file_system_type *get_fs_type(const char *name) +{ + struct file_system_type *fs; + const char *dot = strchr(name, '.'); + int len = dot ? dot - name : strlen(name); + + fs = __get_fs_type(name, len); + if (!fs && (request_module("fs-%.*s", len, name) == 0)) + fs = __get_fs_type(name, len); + + if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { + put_filesystem(fs); + fs = NULL; + } + return fs; +} + +EXPORT_SYMBOL(get_fs_type); diff --git a/kernel/fs/freevxfs/Kconfig b/kernel/fs/freevxfs/Kconfig new file mode 100644 index 000000000..8dc1cd5c1 --- /dev/null +++ b/kernel/fs/freevxfs/Kconfig @@ -0,0 +1,16 @@ +config VXFS_FS + tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" + depends on BLOCK + help + FreeVxFS is a file system driver that support the VERITAS VxFS(TM) + file system format. VERITAS VxFS(TM) is the standard file system + of SCO UnixWare (and possibly others) and optionally available + for Sunsoft Solaris, HP-UX and many other operating systems. + Currently only readonly access is supported. + + NOTE: the file system type as used by mount(1), mount(2) and + fstab(5) is 'vxfs' as it describes the file system format, not + the actual driver. + + To compile this as a module, choose M here: the module will be + called freevxfs. If unsure, say N. diff --git a/kernel/fs/freevxfs/Makefile b/kernel/fs/freevxfs/Makefile new file mode 100644 index 000000000..87ad09744 --- /dev/null +++ b/kernel/fs/freevxfs/Makefile @@ -0,0 +1,8 @@ +# +# VxFS Makefile +# + +obj-$(CONFIG_VXFS_FS) += freevxfs.o + +freevxfs-objs := vxfs_bmap.o vxfs_fshead.o vxfs_immed.o vxfs_inode.o \ + vxfs_lookup.o vxfs_olt.o vxfs_subr.o vxfs_super.o diff --git a/kernel/fs/freevxfs/vxfs.h b/kernel/fs/freevxfs/vxfs.h new file mode 100644 index 000000000..c8a926526 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_SUPER_H_ +#define _VXFS_SUPER_H_ + +/* + * Veritas filesystem driver - superblock structure. + * + * This file contains the definition of the disk and core + * superblocks of the Veritas Filesystem. + */ +#include + + +/* + * Data types for use with the VxFS ondisk format. + */ +typedef int32_t vx_daddr_t; +typedef int32_t vx_ino_t; + +/* + * Superblock magic number (vxfs_super->vs_magic). + */ +#define VXFS_SUPER_MAGIC 0xa501FCF5 + +/* + * The root inode. + */ +#define VXFS_ROOT_INO 2 + +/* + * Num of entries in free extent array + */ +#define VXFS_NEFREE 32 + + +/* + * VxFS superblock (disk). + */ +struct vxfs_sb { + /* + * Readonly fields for the version 1 superblock. + * + * Lots of this fields are no more used by version 2 + * and never filesystems. + */ + u_int32_t vs_magic; /* Magic number */ + int32_t vs_version; /* VxFS version */ + u_int32_t vs_ctime; /* create time - secs */ + u_int32_t vs_cutime; /* create time - usecs */ + int32_t __unused1; /* unused */ + int32_t __unused2; /* unused */ + vx_daddr_t vs_old_logstart; /* obsolete */ + vx_daddr_t vs_old_logend; /* obsolete */ + int32_t vs_bsize; /* block size */ + int32_t vs_size; /* number of blocks */ + int32_t vs_dsize; /* number of data blocks */ + u_int32_t vs_old_ninode; /* obsolete */ + int32_t vs_old_nau; /* obsolete */ + int32_t __unused3; /* unused */ + int32_t vs_old_defiextsize; /* obsolete */ + int32_t vs_old_ilbsize; /* obsolete */ + int32_t vs_immedlen; /* size of immediate data area */ + int32_t vs_ndaddr; /* number of direct extentes */ + vx_daddr_t vs_firstau; /* address of first AU */ + vx_daddr_t vs_emap; /* offset of extent map in AU */ + vx_daddr_t vs_imap; /* offset of inode map in AU */ + vx_daddr_t vs_iextop; /* offset of ExtOp. map in AU */ + vx_daddr_t vs_istart; /* offset of inode list in AU */ + vx_daddr_t vs_bstart; /* offset of fdblock in AU */ + vx_daddr_t vs_femap; /* aufirst + emap */ + vx_daddr_t vs_fimap; /* aufirst + imap */ + vx_daddr_t vs_fiextop; /* aufirst + iextop */ + vx_daddr_t vs_fistart; /* aufirst + istart */ + vx_daddr_t vs_fbstart; /* aufirst + bstart */ + int32_t vs_nindir; /* number of entries in indir */ + int32_t vs_aulen; /* length of AU in blocks */ + int32_t vs_auimlen; /* length of imap in blocks */ + int32_t vs_auemlen; /* length of emap in blocks */ + int32_t vs_auilen; /* length of ilist in blocks */ + int32_t vs_aupad; /* length of pad in blocks */ + int32_t vs_aublocks; /* data blocks in AU */ + int32_t vs_maxtier; /* log base 2 of aublocks */ + int32_t vs_inopb; /* number of inodes per blk */ + int32_t vs_old_inopau; /* obsolete */ + int32_t vs_old_inopilb; /* obsolete */ + int32_t vs_old_ndiripau; /* obsolete */ + int32_t vs_iaddrlen; /* size of indirect addr ext. */ + int32_t vs_bshift; /* log base 2 of bsize */ + int32_t vs_inoshift; /* log base 2 of inobp */ + int32_t vs_bmask; /* ~( bsize - 1 ) */ + int32_t vs_boffmask; /* bsize - 1 */ + int32_t vs_old_inomask; /* old_inopilb - 1 */ + int32_t vs_checksum; /* checksum of V1 data */ + + /* + * Version 1, writable + */ + int32_t vs_free; /* number of free blocks */ + int32_t vs_ifree; /* number of free inodes */ + int32_t vs_efree[VXFS_NEFREE]; /* number of free extents by size */ + int32_t vs_flags; /* flags ?!? */ + u_int8_t vs_mod; /* filesystem has been changed */ + u_int8_t vs_clean; /* clean FS */ + u_int16_t __unused4; /* unused */ + u_int32_t vs_firstlogid; /* mount time log ID */ + u_int32_t vs_wtime; /* last time written - sec */ + u_int32_t vs_wutime; /* last time written - usec */ + u_int8_t vs_fname[6]; /* FS name */ + u_int8_t vs_fpack[6]; /* FS pack name */ + int32_t vs_logversion; /* log format version */ + int32_t __unused5; /* unused */ + + /* + * Version 2, Read-only + */ + vx_daddr_t vs_oltext[2]; /* OLT extent and replica */ + int32_t vs_oltsize; /* OLT extent size */ + int32_t vs_iauimlen; /* size of inode map */ + int32_t vs_iausize; /* size of IAU in blocks */ + int32_t vs_dinosize; /* size of inode in bytes */ + int32_t vs_old_dniaddr; /* indir levels per inode */ + int32_t vs_checksum2; /* checksum of V2 RO */ + + /* + * Actually much more... + */ +}; + + +/* + * In core superblock filesystem private data for VxFS. + */ +struct vxfs_sb_info { + struct vxfs_sb *vsi_raw; /* raw (on disk) superblock */ + struct buffer_head *vsi_bp; /* buffer for raw superblock*/ + struct inode *vsi_fship; /* fileset header inode */ + struct inode *vsi_ilist; /* inode list inode */ + struct inode *vsi_stilist; /* structural inode list inode */ + u_long vsi_iext; /* initial inode list */ + ino_t vsi_fshino; /* fileset header inode */ + daddr_t vsi_oltext; /* OLT extent */ + daddr_t vsi_oltsize; /* OLT size */ +}; + + +/* + * File modes. File types above 0xf000 are vxfs internal only, they should + * not be passed back to higher levels of the system. vxfs file types must + * never have one of the regular file type bits set. + */ +enum vxfs_mode { + VXFS_ISUID = 0x00000800, /* setuid */ + VXFS_ISGID = 0x00000400, /* setgid */ + VXFS_ISVTX = 0x00000200, /* sticky bit */ + VXFS_IREAD = 0x00000100, /* read */ + VXFS_IWRITE = 0x00000080, /* write */ + VXFS_IEXEC = 0x00000040, /* exec */ + + VXFS_IFIFO = 0x00001000, /* Named pipe */ + VXFS_IFCHR = 0x00002000, /* Character device */ + VXFS_IFDIR = 0x00004000, /* Directory */ + VXFS_IFNAM = 0x00005000, /* Xenix device ?? */ + VXFS_IFBLK = 0x00006000, /* Block device */ + VXFS_IFREG = 0x00008000, /* Regular file */ + VXFS_IFCMP = 0x00009000, /* Compressed file ?!? */ + VXFS_IFLNK = 0x0000a000, /* Symlink */ + VXFS_IFSOC = 0x0000c000, /* Socket */ + + /* VxFS internal */ + VXFS_IFFSH = 0x10000000, /* Fileset header */ + VXFS_IFILT = 0x20000000, /* Inode list */ + VXFS_IFIAU = 0x30000000, /* Inode allocation unit */ + VXFS_IFCUT = 0x40000000, /* Current usage table */ + VXFS_IFATT = 0x50000000, /* Attr. inode */ + VXFS_IFLCT = 0x60000000, /* Link count table */ + VXFS_IFIAT = 0x70000000, /* Indirect attribute file */ + VXFS_IFEMR = 0x80000000, /* Extent map reorg file */ + VXFS_IFQUO = 0x90000000, /* BSD quota file */ + VXFS_IFPTI = 0xa0000000, /* "Pass through" inode */ + VXFS_IFLAB = 0x11000000, /* Device label file */ + VXFS_IFOLT = 0x12000000, /* OLT file */ + VXFS_IFLOG = 0x13000000, /* Log file */ + VXFS_IFEMP = 0x14000000, /* Extent map file */ + VXFS_IFEAU = 0x15000000, /* Extent AU file */ + VXFS_IFAUS = 0x16000000, /* Extent AU summary file */ + VXFS_IFDEV = 0x17000000, /* Device config file */ + +}; + +#define VXFS_TYPE_MASK 0xfffff000 + +#define VXFS_IS_TYPE(ip,type) (((ip)->vii_mode & VXFS_TYPE_MASK) == (type)) +#define VXFS_ISFIFO(x) VXFS_IS_TYPE((x),VXFS_IFIFO) +#define VXFS_ISCHR(x) VXFS_IS_TYPE((x),VXFS_IFCHR) +#define VXFS_ISDIR(x) VXFS_IS_TYPE((x),VXFS_IFDIR) +#define VXFS_ISNAM(x) VXFS_IS_TYPE((x),VXFS_IFNAM) +#define VXFS_ISBLK(x) VXFS_IS_TYPE((x),VXFS_IFBLK) +#define VXFS_ISLNK(x) VXFS_IS_TYPE((x),VXFS_IFLNK) +#define VXFS_ISREG(x) VXFS_IS_TYPE((x),VXFS_IFREG) +#define VXFS_ISCMP(x) VXFS_IS_TYPE((x),VXFS_IFCMP) +#define VXFS_ISSOC(x) VXFS_IS_TYPE((x),VXFS_IFSOC) + +#define VXFS_ISFSH(x) VXFS_IS_TYPE((x),VXFS_IFFSH) +#define VXFS_ISILT(x) VXFS_IS_TYPE((x),VXFS_IFILT) + +/* + * Inmode organisation types. + */ +enum { + VXFS_ORG_NONE = 0, /* Inode has *no* format ?!? */ + VXFS_ORG_EXT4 = 1, /* Ext4 */ + VXFS_ORG_IMMED = 2, /* All data stored in inode */ + VXFS_ORG_TYPED = 3, /* Typed extents */ +}; + +#define VXFS_IS_ORG(ip,org) ((ip)->vii_orgtype == (org)) +#define VXFS_ISNONE(ip) VXFS_IS_ORG((ip), VXFS_ORG_NONE) +#define VXFS_ISEXT4(ip) VXFS_IS_ORG((ip), VXFS_ORG_EXT4) +#define VXFS_ISIMMED(ip) VXFS_IS_ORG((ip), VXFS_ORG_IMMED) +#define VXFS_ISTYPED(ip) VXFS_IS_ORG((ip), VXFS_ORG_TYPED) + + +/* + * Get filesystem private data from VFS inode. + */ +#define VXFS_INO(ip) \ + ((struct vxfs_inode_info *)(ip)->i_private) + +/* + * Get filesystem private data from VFS superblock. + */ +#define VXFS_SBI(sbp) \ + ((struct vxfs_sb_info *)(sbp)->s_fs_info) + +#endif /* _VXFS_SUPER_H_ */ diff --git a/kernel/fs/freevxfs/vxfs_bmap.c b/kernel/fs/freevxfs/vxfs_bmap.c new file mode 100644 index 000000000..f86fd3cac --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_bmap.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - filesystem to disk block mapping. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + + +#ifdef DIAGNOSTIC +static void +vxfs_typdump(struct vxfs_typed *typ) +{ + printk(KERN_DEBUG "type=%Lu ", typ->vt_hdr >> VXFS_TYPED_TYPESHIFT); + printk("offset=%Lx ", typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + printk("block=%x ", typ->vt_block); + printk("size=%x\n", typ->vt_size); +} +#endif + +/** + * vxfs_bmap_ext4 - do bmap for ext4 extents + * @ip: pointer to the inode we do bmap for + * @iblock: logical block. + * + * Description: + * vxfs_bmap_ext4 performs the bmap operation for inodes with + * ext4-style extents (which are much like the traditional UNIX + * inode organisation). + * + * Returns: + * The physical block number on success, else Zero. + */ +static daddr_t +vxfs_bmap_ext4(struct inode *ip, long bn) +{ + struct super_block *sb = ip->i_sb; + struct vxfs_inode_info *vip = VXFS_INO(ip); + unsigned long bsize = sb->s_blocksize; + u32 indsize = vip->vii_ext4.ve4_indsize; + int i; + + if (indsize > sb->s_blocksize) + goto fail_size; + + for (i = 0; i < VXFS_NDADDR; i++) { + struct direct *d = vip->vii_ext4.ve4_direct + i; + if (bn >= 0 && bn < d->size) + return (bn + d->extent); + bn -= d->size; + } + + if ((bn / (indsize * indsize * bsize / 4)) == 0) { + struct buffer_head *buf; + daddr_t bno; + u32 *indir; + + buf = sb_bread(sb, vip->vii_ext4.ve4_indir[0]); + if (!buf || !buffer_mapped(buf)) + goto fail_buf; + + indir = (u32 *)buf->b_data; + bno = indir[(bn/indsize) % (indsize*bn)] + (bn%indsize); + + brelse(buf); + return bno; + } else + printk(KERN_WARNING "no matching indir?"); + + return 0; + +fail_size: + printk("vxfs: indirect extent too big!\n"); +fail_buf: + return 0; +} + +/** + * vxfs_bmap_indir - recursion for vxfs_bmap_typed + * @ip: pointer to the inode we do bmap for + * @indir: indirect block we start reading at + * @size: size of the typed area to search + * @block: partially result from further searches + * + * Description: + * vxfs_bmap_indir reads a &struct vxfs_typed at @indir + * and performs the type-defined action. + * + * Return Value: + * The physical block number on success, else Zero. + * + * Note: + * Kernelstack is rare. Unrecurse? + */ +static daddr_t +vxfs_bmap_indir(struct inode *ip, long indir, int size, long block) +{ + struct buffer_head *bp = NULL; + daddr_t pblock = 0; + int i; + + for (i = 0; i < size * VXFS_TYPED_PER_BLOCK(ip->i_sb); i++) { + struct vxfs_typed *typ; + int64_t off; + + bp = sb_bread(ip->i_sb, + indir + (i / VXFS_TYPED_PER_BLOCK(ip->i_sb))); + if (!bp || !buffer_mapped(bp)) + return 0; + + typ = ((struct vxfs_typed *)bp->b_data) + + (i % VXFS_TYPED_PER_BLOCK(ip->i_sb)); + off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + + if (block < off) { + brelse(bp); + continue; + } + + switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) { + case VXFS_TYPED_INDIRECT: + pblock = vxfs_bmap_indir(ip, typ->vt_block, + typ->vt_size, block - off); + if (pblock == -2) + break; + goto out; + case VXFS_TYPED_DATA: + if ((block - off) >= typ->vt_size) + break; + pblock = (typ->vt_block + block - off); + goto out; + case VXFS_TYPED_INDIRECT_DEV4: + case VXFS_TYPED_DATA_DEV4: { + struct vxfs_typed_dev4 *typ4 = + (struct vxfs_typed_dev4 *)typ; + + printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n"); + printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n", + (unsigned long long) typ4->vd4_block, + (unsigned long long) typ4->vd4_size, + typ4->vd4_dev); + goto fail; + } + default: + BUG(); + } + brelse(bp); + } + +fail: + pblock = 0; +out: + brelse(bp); + return (pblock); +} + +/** + * vxfs_bmap_typed - bmap for typed extents + * @ip: pointer to the inode we do bmap for + * @iblock: logical block + * + * Description: + * Performs the bmap operation for typed extents. + * + * Return Value: + * The physical block number on success, else Zero. + */ +static daddr_t +vxfs_bmap_typed(struct inode *ip, long iblock) +{ + struct vxfs_inode_info *vip = VXFS_INO(ip); + daddr_t pblock = 0; + int i; + + for (i = 0; i < VXFS_NTYPED; i++) { + struct vxfs_typed *typ = vip->vii_org.typed + i; + int64_t off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + +#ifdef DIAGNOSTIC + vxfs_typdump(typ); +#endif + if (iblock < off) + continue; + switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) { + case VXFS_TYPED_INDIRECT: + pblock = vxfs_bmap_indir(ip, typ->vt_block, + typ->vt_size, iblock - off); + if (pblock == -2) + break; + return (pblock); + case VXFS_TYPED_DATA: + if ((iblock - off) < typ->vt_size) + return (typ->vt_block + iblock - off); + break; + case VXFS_TYPED_INDIRECT_DEV4: + case VXFS_TYPED_DATA_DEV4: { + struct vxfs_typed_dev4 *typ4 = + (struct vxfs_typed_dev4 *)typ; + + printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n"); + printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n", + (unsigned long long) typ4->vd4_block, + (unsigned long long) typ4->vd4_size, + typ4->vd4_dev); + return 0; + } + default: + BUG(); + } + } + + return 0; +} + +/** + * vxfs_bmap1 - vxfs-internal bmap operation + * @ip: pointer to the inode we do bmap for + * @iblock: logical block + * + * Description: + * vxfs_bmap1 perfoms a logical to physical block mapping + * for vxfs-internal purposes. + * + * Return Value: + * The physical block number on success, else Zero. + */ +daddr_t +vxfs_bmap1(struct inode *ip, long iblock) +{ + struct vxfs_inode_info *vip = VXFS_INO(ip); + + if (VXFS_ISEXT4(vip)) + return vxfs_bmap_ext4(ip, iblock); + if (VXFS_ISTYPED(vip)) + return vxfs_bmap_typed(ip, iblock); + if (VXFS_ISNONE(vip)) + goto unsupp; + if (VXFS_ISIMMED(vip)) + goto unsupp; + + printk(KERN_WARNING "vxfs: inode %ld has no valid orgtype (%x)\n", + ip->i_ino, vip->vii_orgtype); + BUG(); + +unsupp: + printk(KERN_WARNING "vxfs: inode %ld has an unsupported orgtype (%x)\n", + ip->i_ino, vip->vii_orgtype); + return 0; +} diff --git a/kernel/fs/freevxfs/vxfs_dir.h b/kernel/fs/freevxfs/vxfs_dir.h new file mode 100644 index 000000000..aaf1fb098 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_dir.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_DIR_H_ +#define _VXFS_DIR_H_ + +/* + * Veritas filesystem driver - directory structure. + * + * This file contains the definition of the vxfs directory format. + */ + + +/* + * VxFS directory block header. + * + * This entry is the head of every filesystem block in a directory. + * It is used for free space management and additionally includes + * a hash for speeding up directory search (lookup). + * + * The hash may be empty and in fact we do not use it all in the + * Linux driver for now. + */ +struct vxfs_dirblk { + u_int16_t d_free; /* free space in dirblock */ + u_int16_t d_nhash; /* no of hash chains */ + u_int16_t d_hash[1]; /* hash chain */ +}; + +/* + * VXFS_NAMELEN is the maximum length of the d_name field + * of an VxFS directory entry. + */ +#define VXFS_NAMELEN 256 + +/* + * VxFS directory entry. + */ +struct vxfs_direct { + vx_ino_t d_ino; /* inode number */ + u_int16_t d_reclen; /* record length */ + u_int16_t d_namelen; /* d_name length */ + u_int16_t d_hashnext; /* next hash entry */ + char d_name[VXFS_NAMELEN]; /* name */ +}; + +/* + * VXFS_DIRPAD defines the directory entry boundaries, is _must_ be + * a multiple of four. + * VXFS_NAMEMIN is the length of a directory entry with a NULL d_name. + * VXFS_DIRROUND is an internal macros that rounds a length to a value + * usable for directory sizes. + * VXFS_DIRLEN calculates the directory entry size for an entry with + * a d_name with size len. + */ +#define VXFS_DIRPAD 4 +#define VXFS_NAMEMIN offsetof(struct vxfs_direct, d_name) +#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1)) +#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len))) + +/* + * VXFS_DIRBLKOV is the overhead of a specific dirblock. + */ +#define VXFS_DIRBLKOV(dbp) ((sizeof(short) * dbp->d_nhash) + 4) + +#endif /* _VXFS_DIR_H_ */ diff --git a/kernel/fs/freevxfs/vxfs_extern.h b/kernel/fs/freevxfs/vxfs_extern.h new file mode 100644 index 000000000..881aa3d21 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_extern.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_EXTERN_H_ +#define _VXFS_EXTERN_H_ + +/* + * Veritas filesystem driver - external prototypes. + * + * This file contains prototypes for all vxfs functions used + * outside their respective source files. + */ + + +struct kmem_cache; +struct super_block; +struct vxfs_inode_info; +struct inode; + + +/* vxfs_bmap.c */ +extern daddr_t vxfs_bmap1(struct inode *, long); + +/* vxfs_fshead.c */ +extern int vxfs_read_fshead(struct super_block *); + +/* vxfs_immed.c */ +extern const struct inode_operations vxfs_immed_symlink_iops; + +/* vxfs_inode.c */ +extern const struct address_space_operations vxfs_immed_aops; +extern struct kmem_cache *vxfs_inode_cachep; +extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t); +extern struct inode * vxfs_get_fake_inode(struct super_block *, + struct vxfs_inode_info *); +extern void vxfs_put_fake_inode(struct inode *); +extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t); +extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t); +extern struct inode * vxfs_iget(struct super_block *, ino_t); +extern void vxfs_evict_inode(struct inode *); + +/* vxfs_lookup.c */ +extern const struct inode_operations vxfs_dir_inode_ops; +extern const struct file_operations vxfs_dir_operations; + +/* vxfs_olt.c */ +extern int vxfs_read_olt(struct super_block *, u_long); + +/* vxfs_subr.c */ +extern const struct address_space_operations vxfs_aops; +extern struct page * vxfs_get_page(struct address_space *, u_long); +extern void vxfs_put_page(struct page *); +extern struct buffer_head * vxfs_bread(struct inode *, int); + +#endif /* _VXFS_EXTERN_H_ */ diff --git a/kernel/fs/freevxfs/vxfs_fshead.c b/kernel/fs/freevxfs/vxfs_fshead.c new file mode 100644 index 000000000..c9a6a94e5 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_fshead.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - fileset header routines. + */ +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" +#include "vxfs_fshead.h" + + +#ifdef DIAGNOSTIC +static void +vxfs_dumpfsh(struct vxfs_fsh *fhp) +{ + printk("\n\ndumping fileset header:\n"); + printk("----------------------------\n"); + printk("version: %u\n", fhp->fsh_version); + printk("fsindex: %u\n", fhp->fsh_fsindex); + printk("iauino: %u\tninodes:%u\n", + fhp->fsh_iauino, fhp->fsh_ninodes); + printk("maxinode: %u\tlctino: %u\n", + fhp->fsh_maxinode, fhp->fsh_lctino); + printk("nau: %u\n", fhp->fsh_nau); + printk("ilistino[0]: %u\tilistino[1]: %u\n", + fhp->fsh_ilistino[0], fhp->fsh_ilistino[1]); +} +#endif + +/** + * vxfs_getfsh - read fileset header into memory + * @ip: the (fake) fileset header inode + * @which: 0 for the structural, 1 for the primary fsh. + * + * Description: + * vxfs_getfsh reads either the structural or primary fileset header + * described by @ip into memory. + * + * Returns: + * The fileset header structure on success, else Zero. + */ +static struct vxfs_fsh * +vxfs_getfsh(struct inode *ip, int which) +{ + struct buffer_head *bp; + + bp = vxfs_bread(ip, which); + if (bp) { + struct vxfs_fsh *fhp; + + if (!(fhp = kmalloc(sizeof(*fhp), GFP_KERNEL))) + goto out; + memcpy(fhp, bp->b_data, sizeof(*fhp)); + + put_bh(bp); + return (fhp); + } +out: + brelse(bp); + return NULL; +} + +/** + * vxfs_read_fshead - read the fileset headers + * @sbp: superblock to which the fileset belongs + * + * Description: + * vxfs_read_fshead will fill the inode and structural inode list in @sb. + * + * Returns: + * Zero on success, else a negative error code (-EINVAL). + */ +int +vxfs_read_fshead(struct super_block *sbp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + struct vxfs_fsh *pfp, *sfp; + struct vxfs_inode_info *vip, *tip; + + vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino); + if (!vip) { + printk(KERN_ERR "vxfs: unable to read fsh inode\n"); + return -EINVAL; + } + if (!VXFS_ISFSH(vip)) { + printk(KERN_ERR "vxfs: fsh list inode is of wrong type (%x)\n", + vip->vii_mode & VXFS_TYPE_MASK); + goto out_free_fship; + } + + +#ifdef DIAGNOSTIC + printk("vxfs: fsh inode dump:\n"); + vxfs_dumpi(vip, infp->vsi_fshino); +#endif + + infp->vsi_fship = vxfs_get_fake_inode(sbp, vip); + if (!infp->vsi_fship) { + printk(KERN_ERR "vxfs: unable to get fsh inode\n"); + goto out_free_fship; + } + + sfp = vxfs_getfsh(infp->vsi_fship, 0); + if (!sfp) { + printk(KERN_ERR "vxfs: unable to get structural fsh\n"); + goto out_iput_fship; + } + +#ifdef DIAGNOSTIC + vxfs_dumpfsh(sfp); +#endif + + pfp = vxfs_getfsh(infp->vsi_fship, 1); + if (!pfp) { + printk(KERN_ERR "vxfs: unable to get primary fsh\n"); + goto out_free_sfp; + } + +#ifdef DIAGNOSTIC + vxfs_dumpfsh(pfp); +#endif + + tip = vxfs_blkiget(sbp, infp->vsi_iext, sfp->fsh_ilistino[0]); + if (!tip) + goto out_free_pfp; + + infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip); + if (!infp->vsi_stilist) { + printk(KERN_ERR "vxfs: unable to get structural list inode\n"); + kfree(tip); + goto out_free_pfp; + } + if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) { + printk(KERN_ERR "vxfs: structural list inode is of wrong type (%x)\n", + VXFS_INO(infp->vsi_stilist)->vii_mode & VXFS_TYPE_MASK); + goto out_iput_stilist; + } + + tip = vxfs_stiget(sbp, pfp->fsh_ilistino[0]); + if (!tip) + goto out_iput_stilist; + infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip); + if (!infp->vsi_ilist) { + printk(KERN_ERR "vxfs: unable to get inode list inode\n"); + kfree(tip); + goto out_iput_stilist; + } + if (!VXFS_ISILT(VXFS_INO(infp->vsi_ilist))) { + printk(KERN_ERR "vxfs: inode list inode is of wrong type (%x)\n", + VXFS_INO(infp->vsi_ilist)->vii_mode & VXFS_TYPE_MASK); + goto out_iput_ilist; + } + + return 0; + + out_iput_ilist: + iput(infp->vsi_ilist); + out_iput_stilist: + iput(infp->vsi_stilist); + out_free_pfp: + kfree(pfp); + out_free_sfp: + kfree(sfp); + out_iput_fship: + iput(infp->vsi_fship); + return -EINVAL; + out_free_fship: + kfree(vip); + return -EINVAL; +} diff --git a/kernel/fs/freevxfs/vxfs_fshead.h b/kernel/fs/freevxfs/vxfs_fshead.h new file mode 100644 index 000000000..ead0d640c --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_fshead.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_FSHEAD_H_ +#define _VXFS_FSHEAD_H_ + +/* + * Veritas filesystem driver - fileset header structures. + * + * This file contains the physical structure of the VxFS + * fileset header. + */ + + +/* + * Fileset header + */ +struct vxfs_fsh { + u_int32_t fsh_version; /* fileset header version */ + u_int32_t fsh_fsindex; /* fileset index */ + u_int32_t fsh_time; /* modification time - sec */ + u_int32_t fsh_utime; /* modification time - usec */ + u_int32_t fsh_extop; /* extop flags */ + vx_ino_t fsh_ninodes; /* allocated inodes */ + u_int32_t fsh_nau; /* number of IAUs */ + u_int32_t fsh_old_ilesize; /* old size of ilist */ + u_int32_t fsh_dflags; /* flags */ + u_int32_t fsh_quota; /* quota limit */ + vx_ino_t fsh_maxinode; /* maximum inode number */ + vx_ino_t fsh_iauino; /* IAU inode */ + vx_ino_t fsh_ilistino[2]; /* ilist inodes */ + vx_ino_t fsh_lctino; /* link count table inode */ + + /* + * Slightly more fields follow, but they + * a) are not of any interest for us, and + * b) differ a lot in different vxfs versions/ports + */ +}; + +#endif /* _VXFS_FSHEAD_H_ */ diff --git a/kernel/fs/freevxfs/vxfs_immed.c b/kernel/fs/freevxfs/vxfs_immed.c new file mode 100644 index 000000000..8b9229e2c --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_immed.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - support for 'immed' inodes. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_extern.h" +#include "vxfs_inode.h" + + +static void * vxfs_immed_follow_link(struct dentry *, struct nameidata *); + +static int vxfs_immed_readpage(struct file *, struct page *); + +/* + * Inode operations for immed symlinks. + * + * Unliked all other operations we do not go through the pagecache, + * but do all work directly on the inode. + */ +const struct inode_operations vxfs_immed_symlink_iops = { + .readlink = generic_readlink, + .follow_link = vxfs_immed_follow_link, +}; + +/* + * Address space operations for immed files and directories. + */ +const struct address_space_operations vxfs_immed_aops = { + .readpage = vxfs_immed_readpage, +}; + +/** + * vxfs_immed_follow_link - follow immed symlink + * @dp: dentry for the link + * @np: pathname lookup data for the current path walk + * + * Description: + * vxfs_immed_follow_link restarts the pathname lookup with + * the data obtained from @dp. + * + * Returns: + * Zero on success, else a negative error code. + */ +static void * +vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np) +{ + struct vxfs_inode_info *vip = VXFS_INO(d_inode(dp)); + nd_set_link(np, vip->vii_immed.vi_immed); + return NULL; +} + +/** + * vxfs_immed_readpage - read part of an immed inode into pagecache + * @file: file context (unused) + * @page: page frame to fill in. + * + * Description: + * vxfs_immed_readpage reads a part of the immed area of the + * file that hosts @pp into the pagecache. + * + * Returns: + * Zero on success, else a negative error code. + * + * Locking status: + * @page is locked and will be unlocked. + */ +static int +vxfs_immed_readpage(struct file *fp, struct page *pp) +{ + struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); + u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT; + caddr_t kaddr; + + kaddr = kmap(pp); + memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE); + kunmap(pp); + + flush_dcache_page(pp); + SetPageUptodate(pp); + unlock_page(pp); + + return 0; +} diff --git a/kernel/fs/freevxfs/vxfs_inode.c b/kernel/fs/freevxfs/vxfs_inode.c new file mode 100644 index 000000000..363e3ae25 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_inode.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - inode routines. + */ +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + + +struct kmem_cache *vxfs_inode_cachep; + + +#ifdef DIAGNOSTIC +/* + * Dump inode contents (partially). + */ +void +vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino) +{ + printk(KERN_DEBUG "\n\n"); + if (ino) + printk(KERN_DEBUG "dumping vxfs inode %ld\n", ino); + else + printk(KERN_DEBUG "dumping unknown vxfs inode\n"); + + printk(KERN_DEBUG "---------------------------\n"); + printk(KERN_DEBUG "mode is %x\n", vip->vii_mode); + printk(KERN_DEBUG "nlink:%u, uid:%u, gid:%u\n", + vip->vii_nlink, vip->vii_uid, vip->vii_gid); + printk(KERN_DEBUG "size:%Lx, blocks:%u\n", + vip->vii_size, vip->vii_blocks); + printk(KERN_DEBUG "orgtype:%u\n", vip->vii_orgtype); +} +#endif + + +/** + * vxfs_blkiget - find inode based on extent # + * @sbp: superblock of the filesystem we search in + * @extent: number of the extent to search + * @ino: inode number to search + * + * Description: + * vxfs_blkiget searches inode @ino in the filesystem described by + * @sbp in the extent @extent. + * Returns the matching VxFS inode on success, else a NULL pointer. + * + * NOTE: + * While __vxfs_iget uses the pagecache vxfs_blkiget uses the + * buffercache. This function should not be used outside the + * read_super() method, otherwise the data may be incoherent. + */ +struct vxfs_inode_info * +vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino) +{ + struct buffer_head *bp; + u_long block, offset; + + block = extent + ((ino * VXFS_ISIZE) / sbp->s_blocksize); + offset = ((ino % (sbp->s_blocksize / VXFS_ISIZE)) * VXFS_ISIZE); + bp = sb_bread(sbp, block); + + if (bp && buffer_mapped(bp)) { + struct vxfs_inode_info *vip; + struct vxfs_dinode *dip; + + if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) + goto fail; + dip = (struct vxfs_dinode *)(bp->b_data + offset); + memcpy(vip, dip, sizeof(*vip)); +#ifdef DIAGNOSTIC + vxfs_dumpi(vip, ino); +#endif + brelse(bp); + return (vip); + } + +fail: + printk(KERN_WARNING "vxfs: unable to read block %ld\n", block); + brelse(bp); + return NULL; +} + +/** + * __vxfs_iget - generic find inode facility + * @sbp: VFS superblock + * @ino: inode number + * @ilistp: inode list + * + * Description: + * Search the for inode number @ino in the filesystem + * described by @sbp. Use the specified inode table (@ilistp). + * Returns the matching VxFS inode on success, else an error code. + */ +static struct vxfs_inode_info * +__vxfs_iget(ino_t ino, struct inode *ilistp) +{ + struct page *pp; + u_long offset; + + offset = (ino % (PAGE_SIZE / VXFS_ISIZE)) * VXFS_ISIZE; + pp = vxfs_get_page(ilistp->i_mapping, ino * VXFS_ISIZE / PAGE_SIZE); + + if (!IS_ERR(pp)) { + struct vxfs_inode_info *vip; + struct vxfs_dinode *dip; + caddr_t kaddr = (char *)page_address(pp); + + if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) + goto fail; + dip = (struct vxfs_dinode *)(kaddr + offset); + memcpy(vip, dip, sizeof(*vip)); +#ifdef DIAGNOSTIC + vxfs_dumpi(vip, ino); +#endif + vxfs_put_page(pp); + return (vip); + } + + printk(KERN_WARNING "vxfs: error on page %p\n", pp); + return ERR_CAST(pp); + +fail: + printk(KERN_WARNING "vxfs: unable to read inode %ld\n", (unsigned long)ino); + vxfs_put_page(pp); + return ERR_PTR(-ENOMEM); +} + +/** + * vxfs_stiget - find inode using the structural inode list + * @sbp: VFS superblock + * @ino: inode # + * + * Description: + * Find inode @ino in the filesystem described by @sbp using + * the structural inode list. + * Returns the matching VxFS inode on success, else a NULL pointer. + */ +struct vxfs_inode_info * +vxfs_stiget(struct super_block *sbp, ino_t ino) +{ + struct vxfs_inode_info *vip; + + vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_stilist); + return IS_ERR(vip) ? NULL : vip; +} + +/** + * vxfs_transmod - mode for a VxFS inode + * @vip: VxFS inode + * + * Description: + * vxfs_transmod returns a Linux mode_t for a given + * VxFS inode structure. + */ +static __inline__ umode_t +vxfs_transmod(struct vxfs_inode_info *vip) +{ + umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; + + if (VXFS_ISFIFO(vip)) + ret |= S_IFIFO; + if (VXFS_ISCHR(vip)) + ret |= S_IFCHR; + if (VXFS_ISDIR(vip)) + ret |= S_IFDIR; + if (VXFS_ISBLK(vip)) + ret |= S_IFBLK; + if (VXFS_ISLNK(vip)) + ret |= S_IFLNK; + if (VXFS_ISREG(vip)) + ret |= S_IFREG; + if (VXFS_ISSOC(vip)) + ret |= S_IFSOCK; + + return (ret); +} + +/** + * vxfs_iinit- helper to fill inode fields + * @ip: VFS inode + * @vip: VxFS inode + * + * Description: + * vxfs_instino is a helper function to fill in all relevant + * fields in @ip from @vip. + */ +static void +vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) +{ + + ip->i_mode = vxfs_transmod(vip); + i_uid_write(ip, (uid_t)vip->vii_uid); + i_gid_write(ip, (gid_t)vip->vii_gid); + + set_nlink(ip, vip->vii_nlink); + ip->i_size = vip->vii_size; + + ip->i_atime.tv_sec = vip->vii_atime; + ip->i_ctime.tv_sec = vip->vii_ctime; + ip->i_mtime.tv_sec = vip->vii_mtime; + ip->i_atime.tv_nsec = 0; + ip->i_ctime.tv_nsec = 0; + ip->i_mtime.tv_nsec = 0; + + ip->i_blocks = vip->vii_blocks; + ip->i_generation = vip->vii_gen; + + ip->i_private = vip; + +} + +/** + * vxfs_get_fake_inode - get fake inode structure + * @sbp: filesystem superblock + * @vip: fspriv inode + * + * Description: + * vxfs_fake_inode gets a fake inode (not in the inode hash) for a + * superblock, vxfs_inode pair. + * Returns the filled VFS inode. + */ +struct inode * +vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) +{ + struct inode *ip = NULL; + + if ((ip = new_inode(sbp))) { + ip->i_ino = get_next_ino(); + vxfs_iinit(ip, vip); + ip->i_mapping->a_ops = &vxfs_aops; + } + return (ip); +} + +/** + * vxfs_put_fake_inode - free faked inode + * *ip: VFS inode + * + * Description: + * vxfs_put_fake_inode frees all data associated with @ip. + */ +void +vxfs_put_fake_inode(struct inode *ip) +{ + iput(ip); +} + +/** + * vxfs_iget - get an inode + * @sbp: the superblock to get the inode for + * @ino: the number of the inode to get + * + * Description: + * vxfs_read_inode creates an inode, reads the disk inode for @ino and fills + * in all relevant fields in the new inode. + */ +struct inode * +vxfs_iget(struct super_block *sbp, ino_t ino) +{ + struct vxfs_inode_info *vip; + const struct address_space_operations *aops; + struct inode *ip; + + ip = iget_locked(sbp, ino); + if (!ip) + return ERR_PTR(-ENOMEM); + if (!(ip->i_state & I_NEW)) + return ip; + + vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist); + if (IS_ERR(vip)) { + iget_failed(ip); + return ERR_CAST(vip); + } + + vxfs_iinit(ip, vip); + + if (VXFS_ISIMMED(vip)) + aops = &vxfs_immed_aops; + else + aops = &vxfs_aops; + + if (S_ISREG(ip->i_mode)) { + ip->i_fop = &generic_ro_fops; + ip->i_mapping->a_ops = aops; + } else if (S_ISDIR(ip->i_mode)) { + ip->i_op = &vxfs_dir_inode_ops; + ip->i_fop = &vxfs_dir_operations; + ip->i_mapping->a_ops = aops; + } else if (S_ISLNK(ip->i_mode)) { + if (!VXFS_ISIMMED(vip)) { + ip->i_op = &page_symlink_inode_operations; + ip->i_mapping->a_ops = &vxfs_aops; + } else { + ip->i_op = &vxfs_immed_symlink_iops; + vip->vii_immed.vi_immed[ip->i_size] = '\0'; + } + } else + init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); + + unlock_new_inode(ip); + return ip; +} + +static void vxfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(vxfs_inode_cachep, inode->i_private); +} + +/** + * vxfs_evict_inode - remove inode from main memory + * @ip: inode to discard. + * + * Description: + * vxfs_evict_inode() is called on the final iput and frees the private + * inode area. + */ +void +vxfs_evict_inode(struct inode *ip) +{ + truncate_inode_pages_final(&ip->i_data); + clear_inode(ip); + call_rcu(&ip->i_rcu, vxfs_i_callback); +} diff --git a/kernel/fs/freevxfs/vxfs_inode.h b/kernel/fs/freevxfs/vxfs_inode.h new file mode 100644 index 000000000..240aeb112 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_inode.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_INODE_H_ +#define _VXFS_INODE_H_ + +/* + * Veritas filesystem driver - inode structure. + * + * This file contains the definition of the disk and core + * inodes of the Veritas Filesystem. + */ + + +#define VXFS_ISIZE 0x100 /* Inode size */ + +#define VXFS_NDADDR 10 /* Number of direct addrs in inode */ +#define VXFS_NIADDR 2 /* Number of indirect addrs in inode */ +#define VXFS_NIMMED 96 /* Size of immediate data in inode */ +#define VXFS_NTYPED 6 /* Num of typed extents */ + +#define VXFS_TYPED_OFFSETMASK (0x00FFFFFFFFFFFFFFULL) +#define VXFS_TYPED_TYPEMASK (0xFF00000000000000ULL) +#define VXFS_TYPED_TYPESHIFT 56 + +#define VXFS_TYPED_PER_BLOCK(sbp) \ + ((sbp)->s_blocksize / sizeof(struct vxfs_typed)) + +/* + * Possible extent descriptor types for %VXFS_ORG_TYPED extents. + */ +enum { + VXFS_TYPED_INDIRECT = 1, + VXFS_TYPED_DATA = 2, + VXFS_TYPED_INDIRECT_DEV4 = 3, + VXFS_TYPED_DATA_DEV4 = 4, +}; + +/* + * Data stored immediately in the inode. + */ +struct vxfs_immed { + u_int8_t vi_immed[VXFS_NIMMED]; +}; + +struct vxfs_ext4 { + u_int32_t ve4_spare; /* ?? */ + u_int32_t ve4_indsize; /* Indirect extent size */ + vx_daddr_t ve4_indir[VXFS_NIADDR]; /* Indirect extents */ + struct direct { /* Direct extents */ + vx_daddr_t extent; /* Extent number */ + int32_t size; /* Size of extent */ + } ve4_direct[VXFS_NDADDR]; +}; + +struct vxfs_typed { + u_int64_t vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ + vx_daddr_t vt_block; /* Extent block */ + int32_t vt_size; /* Size in blocks */ +}; + +struct vxfs_typed_dev4 { + u_int64_t vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ + u_int64_t vd4_block; /* Extent block */ + u_int64_t vd4_size; /* Size in blocks */ + int32_t vd4_dev; /* Device ID */ + u_int32_t __pad1; +}; + +/* + * The inode as contained on the physical device. + */ +struct vxfs_dinode { + int32_t vdi_mode; + u_int32_t vdi_nlink; /* Link count */ + u_int32_t vdi_uid; /* UID */ + u_int32_t vdi_gid; /* GID */ + u_int64_t vdi_size; /* Inode size in bytes */ + u_int32_t vdi_atime; /* Last time accessed - sec */ + u_int32_t vdi_autime; /* Last time accessed - usec */ + u_int32_t vdi_mtime; /* Last modify time - sec */ + u_int32_t vdi_mutime; /* Last modify time - usec */ + u_int32_t vdi_ctime; /* Create time - sec */ + u_int32_t vdi_cutime; /* Create time - usec */ + u_int8_t vdi_aflags; /* Allocation flags */ + u_int8_t vdi_orgtype; /* Organisation type */ + u_int16_t vdi_eopflags; + u_int32_t vdi_eopdata; + union { + u_int32_t rdev; + u_int32_t dotdot; + struct { + u_int32_t reserved; + u_int32_t fixextsize; + } i_regular; + struct { + u_int32_t matchino; + u_int32_t fsetindex; + } i_vxspec; + u_int64_t align; + } vdi_ftarea; + u_int32_t vdi_blocks; /* How much blocks does inode occupy */ + u_int32_t vdi_gen; /* Inode generation */ + u_int64_t vdi_version; /* Version */ + union { + struct vxfs_immed immed; + struct vxfs_ext4 ext4; + struct vxfs_typed typed[VXFS_NTYPED]; + } vdi_org; + u_int32_t vdi_iattrino; +}; + +#define vdi_rdev vdi_ftarea.rdev +#define vdi_dotdot vdi_ftarea.dotdot +#define vdi_fixextsize vdi_ftarea.regular.fixextsize +#define vdi_matchino vdi_ftarea.vxspec.matchino +#define vdi_fsetindex vdi_ftarea.vxspec.fsetindex + +#define vdi_immed vdi_org.immed +#define vdi_ext4 vdi_org.ext4 +#define vdi_typed vdi_org.typed + + +/* + * The inode as represented in the main memory. + * + * TBD: This should become a separate structure... + */ +#define vxfs_inode_info vxfs_dinode + +#define vii_mode vdi_mode +#define vii_uid vdi_uid +#define vii_gid vdi_gid +#define vii_nlink vdi_nlink +#define vii_size vdi_size +#define vii_atime vdi_atime +#define vii_ctime vdi_ctime +#define vii_mtime vdi_mtime +#define vii_blocks vdi_blocks +#define vii_org vdi_org +#define vii_orgtype vdi_orgtype +#define vii_gen vdi_gen + +#define vii_rdev vdi_ftarea.rdev +#define vii_dotdot vdi_ftarea.dotdot +#define vii_fixextsize vdi_ftarea.regular.fixextsize +#define vii_matchino vdi_ftarea.vxspec.matchino +#define vii_fsetindex vdi_ftarea.vxspec.fsetindex + +#define vii_immed vdi_org.immed +#define vii_ext4 vdi_org.ext4 +#define vii_typed vdi_org.typed + +#endif /* _VXFS_INODE_H_ */ diff --git a/kernel/fs/freevxfs/vxfs_lookup.c b/kernel/fs/freevxfs/vxfs_lookup.c new file mode 100644 index 000000000..99c7f0a37 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_lookup.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - lookup and other directory related code. + */ +#include +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_dir.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + +/* + * Number of VxFS blocks per page. + */ +#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize)) + + +static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); +static int vxfs_readdir(struct file *, struct dir_context *); + +const struct inode_operations vxfs_dir_inode_ops = { + .lookup = vxfs_lookup, +}; + +const struct file_operations vxfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = vxfs_readdir, +}; + + +static inline u_long +dir_pages(struct inode *inode) +{ + return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static inline u_long +dir_blocks(struct inode *ip) +{ + u_long bsize = ip->i_sb->s_blocksize; + return (ip->i_size + bsize - 1) & ~(bsize - 1); +} + +/* + * NOTE! unlike strncmp, vxfs_match returns 1 for success, 0 for failure. + * + * len <= VXFS_NAMELEN and de != NULL are guaranteed by caller. + */ +static inline int +vxfs_match(int len, const char * const name, struct vxfs_direct *de) +{ + if (len != de->d_namelen) + return 0; + if (!de->d_ino) + return 0; + return !memcmp(name, de->d_name, len); +} + +static inline struct vxfs_direct * +vxfs_next_entry(struct vxfs_direct *de) +{ + return ((struct vxfs_direct *)((char*)de + de->d_reclen)); +} + +/** + * vxfs_find_entry - find a mathing directory entry for a dentry + * @ip: directory inode + * @dp: dentry for which we want to find a direct + * @ppp: gets filled with the page the return value sits in + * + * Description: + * vxfs_find_entry finds a &struct vxfs_direct for the VFS directory + * cache entry @dp. @ppp will be filled with the page the return + * value resides in. + * + * Returns: + * The wanted direct on success, else a NULL pointer. + */ +static struct vxfs_direct * +vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp) +{ + u_long npages, page, nblocks, pblocks, block; + u_long bsize = ip->i_sb->s_blocksize; + const char *name = dp->d_name.name; + int namelen = dp->d_name.len; + + npages = dir_pages(ip); + nblocks = dir_blocks(ip); + pblocks = VXFS_BLOCK_PER_PAGE(ip->i_sb); + + for (page = 0; page < npages; page++) { + caddr_t kaddr; + struct page *pp; + + pp = vxfs_get_page(ip->i_mapping, page); + if (IS_ERR(pp)) + continue; + kaddr = (caddr_t)page_address(pp); + + for (block = 0; block <= nblocks && block <= pblocks; block++) { + caddr_t baddr, limit; + struct vxfs_dirblk *dbp; + struct vxfs_direct *de; + + baddr = kaddr + (block * bsize); + limit = baddr + bsize - VXFS_DIRLEN(1); + + dbp = (struct vxfs_dirblk *)baddr; + de = (struct vxfs_direct *)(baddr + VXFS_DIRBLKOV(dbp)); + + for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { + if (!de->d_reclen) + break; + if (!de->d_ino) + continue; + if (vxfs_match(namelen, name, de)) { + *ppp = pp; + return (de); + } + } + } + vxfs_put_page(pp); + } + + return NULL; +} + +/** + * vxfs_inode_by_name - find inode number for dentry + * @dip: directory to search in + * @dp: dentry we search for + * + * Description: + * vxfs_inode_by_name finds out the inode number of + * the path component described by @dp in @dip. + * + * Returns: + * The wanted inode number on success, else Zero. + */ +static ino_t +vxfs_inode_by_name(struct inode *dip, struct dentry *dp) +{ + struct vxfs_direct *de; + struct page *pp; + ino_t ino = 0; + + de = vxfs_find_entry(dip, dp, &pp); + if (de) { + ino = de->d_ino; + kunmap(pp); + page_cache_release(pp); + } + + return (ino); +} + +/** + * vxfs_lookup - lookup pathname component + * @dip: dir in which we lookup + * @dp: dentry we lookup + * @flags: lookup flags + * + * Description: + * vxfs_lookup tries to lookup the pathname component described + * by @dp in @dip. + * + * Returns: + * A NULL-pointer on success, else an negative error code encoded + * in the return pointer. + */ +static struct dentry * +vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) +{ + struct inode *ip = NULL; + ino_t ino; + + if (dp->d_name.len > VXFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = vxfs_inode_by_name(dip, dp); + if (ino) { + ip = vxfs_iget(dip->i_sb, ino); + if (IS_ERR(ip)) + return ERR_CAST(ip); + } + d_add(dp, ip); + return NULL; +} + +/** + * vxfs_readdir - read a directory + * @fp: the directory to read + * @retp: return buffer + * @filler: filldir callback + * + * Description: + * vxfs_readdir fills @retp with directory entries from @fp + * using the VFS supplied callback @filler. + * + * Returns: + * Zero. + */ +static int +vxfs_readdir(struct file *fp, struct dir_context *ctx) +{ + struct inode *ip = file_inode(fp); + struct super_block *sbp = ip->i_sb; + u_long bsize = sbp->s_blocksize; + u_long page, npages, block, pblocks, nblocks, offset; + loff_t pos; + + if (ctx->pos == 0) { + if (!dir_emit_dot(fp, ctx)) + return 0; + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR)) + return 0; + ctx->pos = 2; + } + pos = ctx->pos - 2; + + if (pos > VXFS_DIRROUND(ip->i_size)) + return 0; + + npages = dir_pages(ip); + nblocks = dir_blocks(ip); + pblocks = VXFS_BLOCK_PER_PAGE(sbp); + + page = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; + + for (; page < npages; page++, block = 0) { + char *kaddr; + struct page *pp; + + pp = vxfs_get_page(ip->i_mapping, page); + if (IS_ERR(pp)) + continue; + kaddr = (char *)page_address(pp); + + for (; block <= nblocks && block <= pblocks; block++) { + char *baddr, *limit; + struct vxfs_dirblk *dbp; + struct vxfs_direct *de; + + baddr = kaddr + (block * bsize); + limit = baddr + bsize - VXFS_DIRLEN(1); + + dbp = (struct vxfs_dirblk *)baddr; + de = (struct vxfs_direct *) + (offset ? + (kaddr + offset) : + (baddr + VXFS_DIRBLKOV(dbp))); + + for (; (char *)de <= limit; de = vxfs_next_entry(de)) { + if (!de->d_reclen) + break; + if (!de->d_ino) + continue; + + offset = (char *)de - kaddr; + ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; + if (!dir_emit(ctx, de->d_name, de->d_namelen, + de->d_ino, DT_UNKNOWN)) { + vxfs_put_page(pp); + return 0; + } + } + offset = 0; + } + vxfs_put_page(pp); + offset = 0; + } + ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; + return 0; +} diff --git a/kernel/fs/freevxfs/vxfs_olt.c b/kernel/fs/freevxfs/vxfs_olt.c new file mode 100644 index 000000000..049500847 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_olt.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - object location table support. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_olt.h" +#include "vxfs_extern.h" + + +static inline void +vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) +{ + BUG_ON(infp->vsi_fshino); + infp->vsi_fshino = fshp->olt_fsino[0]; +} + +static inline void +vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) +{ + BUG_ON(infp->vsi_iext); + infp->vsi_iext = ilistp->olt_iext[0]; +} + +static inline u_long +vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize) +{ + BUG_ON(sbp->s_blocksize % bsize); + return (block * (sbp->s_blocksize / bsize)); +} + + +/** + * vxfs_read_olt - read olt + * @sbp: superblock of the filesystem + * @bsize: blocksize of the filesystem + * + * Description: + * vxfs_read_olt reads the olt of the filesystem described by @sbp + * into main memory and does some basic setup. + * + * Returns: + * Zero on success, else a negative error code. + */ +int +vxfs_read_olt(struct super_block *sbp, u_long bsize) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + struct buffer_head *bp; + struct vxfs_olt *op; + char *oaddr, *eaddr; + + + bp = sb_bread(sbp, vxfs_oblock(sbp, infp->vsi_oltext, bsize)); + if (!bp || !bp->b_data) + goto fail; + + op = (struct vxfs_olt *)bp->b_data; + if (op->olt_magic != VXFS_OLT_MAGIC) { + printk(KERN_NOTICE "vxfs: ivalid olt magic number\n"); + goto fail; + } + + /* + * It is in theory possible that vsi_oltsize is > 1. + * I've not seen any such filesystem yet and I'm lazy.. --hch + */ + if (infp->vsi_oltsize > 1) { + printk(KERN_NOTICE "vxfs: oltsize > 1 detected.\n"); + printk(KERN_NOTICE "vxfs: please notify hch@infradead.org\n"); + goto fail; + } + + oaddr = bp->b_data + op->olt_size; + eaddr = bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize); + + while (oaddr < eaddr) { + struct vxfs_oltcommon *ocp = + (struct vxfs_oltcommon *)oaddr; + + switch (ocp->olt_type) { + case VXFS_OLT_FSHEAD: + vxfs_get_fshead((struct vxfs_oltfshead *)oaddr, infp); + break; + case VXFS_OLT_ILIST: + vxfs_get_ilist((struct vxfs_oltilist *)oaddr, infp); + break; + } + + oaddr += ocp->olt_size; + } + + brelse(bp); + return 0; + +fail: + brelse(bp); + return -EINVAL; +} diff --git a/kernel/fs/freevxfs/vxfs_olt.h b/kernel/fs/freevxfs/vxfs_olt.h new file mode 100644 index 000000000..b7b3af502 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_olt.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_OLT_H_ +#define _VXFS_OLT_H_ + +/* + * Veritas filesystem driver - Object Location Table data structures. + * + * This file contains definitions for the Object Location Table used + * by the Veritas Filesystem version 2 and newer. + */ + + +/* + * OLT magic number (vxfs_olt->olt_magic). + */ +#define VXFS_OLT_MAGIC 0xa504FCF5 + +/* + * VxFS OLT entry types. + */ +enum { + VXFS_OLT_FREE = 1, + VXFS_OLT_FSHEAD = 2, + VXFS_OLT_CUT = 3, + VXFS_OLT_ILIST = 4, + VXFS_OLT_DEV = 5, + VXFS_OLT_SB = 6 +}; + +/* + * VxFS OLT header. + * + * The Object Location Table header is placed at the beginning of each + * OLT extent. It is used to fing certain filesystem-wide metadata, e.g. + * the initial inode list, the fileset header or the device configuration. + */ +struct vxfs_olt { + u_int32_t olt_magic; /* magic number */ + u_int32_t olt_size; /* size of this entry */ + u_int32_t olt_checksum; /* checksum of extent */ + u_int32_t __unused1; /* ??? */ + u_int32_t olt_mtime; /* time of last mod. (sec) */ + u_int32_t olt_mutime; /* time of last mod. (usec) */ + u_int32_t olt_totfree; /* free space in OLT extent */ + vx_daddr_t olt_extents[2]; /* addr of this extent, replica */ + u_int32_t olt_esize; /* size of this extent */ + vx_daddr_t olt_next[2]; /* addr of next extent, replica */ + u_int32_t olt_nsize; /* size of next extent */ + u_int32_t __unused2; /* align to 8 byte boundary */ +}; + +/* + * VxFS common OLT entry (on disk). + */ +struct vxfs_oltcommon { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ +}; + +/* + * VxFS free OLT entry (on disk). + */ +struct vxfs_oltfree { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_fsize; /* size of this free record */ +}; + +/* + * VxFS initial-inode list (on disk). + */ +struct vxfs_oltilist { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_iext[2]; /* initial inode list, replica */ +}; + +/* + * Current Usage Table + */ +struct vxfs_oltcut { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_cutino; /* inode of current usage table */ + u_int32_t __pad; /* unused, 8 byte align */ +}; + +/* + * Inodes containing Superblock, Intent log and OLTs + */ +struct vxfs_oltsb { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_sbino; /* inode of superblock file */ + u_int32_t __unused1; /* ??? */ + vx_ino_t olt_logino[2]; /* inode of log file,replica */ + vx_ino_t olt_oltino[2]; /* inode of OLT, replica */ +}; + +/* + * Inode containing device configuration + it's replica + */ +struct vxfs_oltdev { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_devino[2]; /* inode of device config files */ +}; + +/* + * Fileset header + */ +struct vxfs_oltfshead { + u_int32_t olt_type; /* type number */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_fsino[2]; /* inodes of fileset header */ +}; + +#endif /* _VXFS_OLT_H_ */ diff --git a/kernel/fs/freevxfs/vxfs_subr.c b/kernel/fs/freevxfs/vxfs_subr.c new file mode 100644 index 000000000..5d318c44f --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_subr.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - shared subroutines. + */ +#include +#include +#include +#include + +#include "vxfs_extern.h" + + +static int vxfs_readpage(struct file *, struct page *); +static sector_t vxfs_bmap(struct address_space *, sector_t); + +const struct address_space_operations vxfs_aops = { + .readpage = vxfs_readpage, + .bmap = vxfs_bmap, +}; + +inline void +vxfs_put_page(struct page *pp) +{ + kunmap(pp); + page_cache_release(pp); +} + +/** + * vxfs_get_page - read a page into memory. + * @ip: inode to read from + * @n: page number + * + * Description: + * vxfs_get_page reads the @n th page of @ip into the pagecache. + * + * Returns: + * The wanted page on success, else a NULL pointer. + */ +struct page * +vxfs_get_page(struct address_space *mapping, u_long n) +{ + struct page * pp; + + pp = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(pp)) { + kmap(pp); + /** if (!PageChecked(pp)) **/ + /** vxfs_check_page(pp); **/ + if (PageError(pp)) + goto fail; + } + + return (pp); + +fail: + vxfs_put_page(pp); + return ERR_PTR(-EIO); +} + +/** + * vxfs_bread - read buffer for a give inode,block tuple + * @ip: inode + * @block: logical block + * + * Description: + * The vxfs_bread function reads block no @block of + * @ip into the buffercache. + * + * Returns: + * The resulting &struct buffer_head. + */ +struct buffer_head * +vxfs_bread(struct inode *ip, int block) +{ + struct buffer_head *bp; + daddr_t pblock; + + pblock = vxfs_bmap1(ip, block); + bp = sb_bread(ip->i_sb, pblock); + + return (bp); +} + +/** + * vxfs_get_block - locate buffer for given inode,block tuple + * @ip: inode + * @iblock: logical block + * @bp: buffer skeleton + * @create: %TRUE if blocks may be newly allocated. + * + * Description: + * The vxfs_get_block function fills @bp with the right physical + * block and device number to perform a lowlevel read/write on + * it. + * + * Returns: + * Zero on success, else a negativ error code (-EIO). + */ +static int +vxfs_getblk(struct inode *ip, sector_t iblock, + struct buffer_head *bp, int create) +{ + daddr_t pblock; + + pblock = vxfs_bmap1(ip, iblock); + if (pblock != 0) { + map_bh(bp, ip->i_sb, pblock); + return 0; + } + + return -EIO; +} + +/** + * vxfs_readpage - read one page synchronously into the pagecache + * @file: file context (unused) + * @page: page frame to fill in. + * + * Description: + * The vxfs_readpage routine reads @page synchronously into the + * pagecache. + * + * Returns: + * Zero on success, else a negative error code. + * + * Locking status: + * @page is locked and will be unlocked. + */ +static int +vxfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, vxfs_getblk); +} + +/** + * vxfs_bmap - perform logical to physical block mapping + * @mapping: logical to physical mapping to use + * @block: logical block (relative to @mapping). + * + * Description: + * Vxfs_bmap find out the corresponding phsical block to the + * @mapping, @block pair. + * + * Returns: + * Physical block number on success, else Zero. + * + * Locking status: + * We are under the bkl. + */ +static sector_t +vxfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, vxfs_getblk); +} diff --git a/kernel/fs/freevxfs/vxfs_super.c b/kernel/fs/freevxfs/vxfs_super.c new file mode 100644 index 000000000..7ca8c75d5 --- /dev/null +++ b/kernel/fs/freevxfs/vxfs_super.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - superblock related routines. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_extern.h" +#include "vxfs_dir.h" +#include "vxfs_inode.h" + + +MODULE_AUTHOR("Christoph Hellwig"); +MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); +MODULE_LICENSE("Dual BSD/GPL"); + + + +static void vxfs_put_super(struct super_block *); +static int vxfs_statfs(struct dentry *, struct kstatfs *); +static int vxfs_remount(struct super_block *, int *, char *); + +static const struct super_operations vxfs_super_ops = { + .evict_inode = vxfs_evict_inode, + .put_super = vxfs_put_super, + .statfs = vxfs_statfs, + .remount_fs = vxfs_remount, +}; + +/** + * vxfs_put_super - free superblock resources + * @sbp: VFS superblock. + * + * Description: + * vxfs_put_super frees all resources allocated for @sbp + * after the last instance of the filesystem is unmounted. + */ + +static void +vxfs_put_super(struct super_block *sbp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + + vxfs_put_fake_inode(infp->vsi_fship); + vxfs_put_fake_inode(infp->vsi_ilist); + vxfs_put_fake_inode(infp->vsi_stilist); + + brelse(infp->vsi_bp); + kfree(infp); +} + +/** + * vxfs_statfs - get filesystem information + * @dentry: VFS dentry to locate superblock + * @bufp: output buffer + * + * Description: + * vxfs_statfs fills the statfs buffer @bufp with information + * about the filesystem described by @dentry. + * + * Returns: + * Zero. + * + * Locking: + * No locks held. + * + * Notes: + * This is everything but complete... + */ +static int +vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); + + bufp->f_type = VXFS_SUPER_MAGIC; + bufp->f_bsize = dentry->d_sb->s_blocksize; + bufp->f_blocks = infp->vsi_raw->vs_dsize; + bufp->f_bfree = infp->vsi_raw->vs_free; + bufp->f_bavail = 0; + bufp->f_files = 0; + bufp->f_ffree = infp->vsi_raw->vs_ifree; + bufp->f_namelen = VXFS_NAMELEN; + + return 0; +} + +static int vxfs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_RDONLY; + return 0; +} + +/** + * vxfs_read_super - read superblock into memory and initialize filesystem + * @sbp: VFS superblock (to fill) + * @dp: fs private mount data + * @silent: do not complain loudly when sth is wrong + * + * Description: + * We are called on the first mount of a filesystem to read the + * superblock into memory and do some basic setup. + * + * Returns: + * The superblock on success, else %NULL. + * + * Locking: + * We are under @sbp->s_lock. + */ +static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) +{ + struct vxfs_sb_info *infp; + struct vxfs_sb *rsbp; + struct buffer_head *bp = NULL; + u_long bsize; + struct inode *root; + int ret = -EINVAL; + + sbp->s_flags |= MS_RDONLY; + + infp = kzalloc(sizeof(*infp), GFP_KERNEL); + if (!infp) { + printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n"); + return -ENOMEM; + } + + bsize = sb_min_blocksize(sbp, BLOCK_SIZE); + if (!bsize) { + printk(KERN_WARNING "vxfs: unable to set blocksize\n"); + goto out; + } + + bp = sb_bread(sbp, 1); + if (!bp || !buffer_mapped(bp)) { + if (!silent) { + printk(KERN_WARNING + "vxfs: unable to read disk superblock\n"); + } + goto out; + } + + rsbp = (struct vxfs_sb *)bp->b_data; + if (rsbp->vs_magic != VXFS_SUPER_MAGIC) { + if (!silent) + printk(KERN_NOTICE "vxfs: WRONG superblock magic\n"); + goto out; + } + + if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) { + printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", + rsbp->vs_version); + goto out; + } + +#ifdef DIAGNOSTIC + printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version); + printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize); +#endif + + sbp->s_magic = rsbp->vs_magic; + sbp->s_fs_info = infp; + + infp->vsi_raw = rsbp; + infp->vsi_bp = bp; + infp->vsi_oltext = rsbp->vs_oltext[0]; + infp->vsi_oltsize = rsbp->vs_oltsize; + + if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) { + printk(KERN_WARNING "vxfs: unable to set final block size\n"); + goto out; + } + + if (vxfs_read_olt(sbp, bsize)) { + printk(KERN_WARNING "vxfs: unable to read olt\n"); + goto out; + } + + if (vxfs_read_fshead(sbp)) { + printk(KERN_WARNING "vxfs: unable to read fshead\n"); + goto out; + } + + sbp->s_op = &vxfs_super_ops; + root = vxfs_iget(sbp, VXFS_ROOT_INO); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + sbp->s_root = d_make_root(root); + if (!sbp->s_root) { + printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); + goto out_free_ilist; + } + + return 0; + +out_free_ilist: + vxfs_put_fake_inode(infp->vsi_fship); + vxfs_put_fake_inode(infp->vsi_ilist); + vxfs_put_fake_inode(infp->vsi_stilist); +out: + brelse(bp); + kfree(infp); + return ret; +} + +/* + * The usual module blurb. + */ +static struct dentry *vxfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); +} + +static struct file_system_type vxfs_fs_type = { + .owner = THIS_MODULE, + .name = "vxfs", + .mount = vxfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */ +MODULE_ALIAS("vxfs"); + +static int __init +vxfs_init(void) +{ + int rv; + + vxfs_inode_cachep = kmem_cache_create("vxfs_inode", + sizeof(struct vxfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!vxfs_inode_cachep) + return -ENOMEM; + rv = register_filesystem(&vxfs_fs_type); + if (rv < 0) + kmem_cache_destroy(vxfs_inode_cachep); + return rv; +} + +static void __exit +vxfs_cleanup(void) +{ + unregister_filesystem(&vxfs_fs_type); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(vxfs_inode_cachep); +} + +module_init(vxfs_init); +module_exit(vxfs_cleanup); diff --git a/kernel/fs/fs-writeback.c b/kernel/fs/fs-writeback.c new file mode 100644 index 000000000..32a8bbd7a --- /dev/null +++ b/kernel/fs/fs-writeback.c @@ -0,0 +1,1595 @@ +/* + * fs/fs-writeback.c + * + * Copyright (C) 2002, Linus Torvalds. + * + * Contains all the functions related to writing back and waiting + * upon dirty inodes against superblocks, and writing back dirty + * pages against inodes. ie: data writeback. Writeout of the + * inode itself is not handled here. + * + * 10Apr2002 Andrew Morton + * Split out of fs/inode.c + * Additions for address_space-based writeback + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + +/* + * Passed into wb_writeback(), essentially a subset of writeback_control + */ +struct wb_writeback_work { + long nr_pages; + struct super_block *sb; + unsigned long *older_than_this; + enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; + unsigned int for_kupdate:1; + unsigned int range_cyclic:1; + unsigned int for_background:1; + unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + enum wb_reason reason; /* why was writeback initiated? */ + + struct list_head list; /* pending work list */ + struct completion *done; /* set if the caller waits */ +}; + +/* + * If an inode is constantly having its pages dirtied, but then the + * updates stop dirtytime_expire_interval seconds in the past, it's + * possible for the worst case time between when an inode has its + * timestamps updated and when they finally get written out to be two + * dirtytime_expire_intervals. We set the default to 12 hours (in + * seconds), which means most of the time inodes will have their + * timestamps written to disk after 12 hours, but in the worst case a + * few inodes might not their timestamps updated for 24 hours. + */ +unsigned int dirtytime_expire_interval = 12 * 60 * 60; + +/** + * writeback_in_progress - determine whether there is writeback in progress + * @bdi: the device's backing_dev_info structure. + * + * Determine whether there is writeback waiting to be handled against a + * backing device. + */ +int writeback_in_progress(struct backing_dev_info *bdi) +{ + return test_bit(BDI_writeback_running, &bdi->state); +} +EXPORT_SYMBOL(writeback_in_progress); + +struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb; + + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif + return sb->s_bdi; +} +EXPORT_SYMBOL_GPL(inode_to_bdi); + +static inline struct inode *wb_inode(struct list_head *head) +{ + return list_entry(head, struct inode, i_wb_list); +} + +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure and inline functions so that the definition + * remains local to this file. + */ +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); + +static void bdi_wakeup_thread(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + spin_unlock_bh(&bdi->wb_lock); +} + +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + trace_writeback_queue(bdi, work); + + spin_lock_bh(&bdi->wb_lock); + if (!test_bit(BDI_registered, &bdi->state)) { + if (work->done) + complete(work->done); + goto out_unlock; + } + list_add_tail(&work->list, &bdi->work_list); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); +out_unlock: + spin_unlock_bh(&bdi->wb_lock); +} + +static void +__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + bool range_cyclic, enum wb_reason reason) +{ + struct wb_writeback_work *work; + + /* + * This is WB_SYNC_NONE writeback, so if allocation fails just + * wakeup the thread for old dirty data writeback + */ + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + trace_writeback_nowork(bdi); + bdi_wakeup_thread(bdi); + return; + } + + work->sync_mode = WB_SYNC_NONE; + work->nr_pages = nr_pages; + work->range_cyclic = range_cyclic; + work->reason = reason; + + bdi_queue_work(bdi, work); +} + +/** + * bdi_start_writeback - start writeback + * @bdi: the backing device to write from + * @nr_pages: the number of pages to write + * @reason: reason why some writeback work was initiated + * + * Description: + * This does WB_SYNC_NONE opportunistic writeback. The IO is only + * started when this function returns, we make no guarantees on + * completion. Caller need not hold sb s_umount semaphore. + * + */ +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason) +{ + __bdi_start_writeback(bdi, nr_pages, true, reason); +} + +/** + * bdi_start_background_writeback - start background writeback + * @bdi: the backing device to write from + * + * Description: + * This makes sure WB_SYNC_NONE background writeback happens. When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. + */ +void bdi_start_background_writeback(struct backing_dev_info *bdi) +{ + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. + */ + trace_writeback_wake_background(bdi); + bdi_wakeup_thread(bdi); +} + +/* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&bdi->wb.list_lock); +} + +/* + * Redirty an inode: set its when-it-was dirtied timestamp and move it to the + * furthest end of its superblock's dirty-inode list. + * + * Before stamping the inode's ->dirtied_when, we check to see whether it is + * already the most-recently-dirtied inode on the b_dirty list. If that is + * the case then the inode must have been redirtied while it was being written + * out and we don't reset its dirtied_when. + */ +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; + + tail = wb_inode(wb->b_dirty.next); + if (time_before(inode->dirtied_when, tail->dirtied_when)) + inode->dirtied_when = jiffies; + } + list_move(&inode->i_wb_list, &wb->b_dirty); +} + +/* + * requeue inode for re-scanning after bdi->b_io list is exhausted. + */ +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + list_move(&inode->i_wb_list, &wb->b_more_io); +} + +static void inode_sync_complete(struct inode *inode) +{ + inode->i_state &= ~I_SYNC; + /* If inode is clean an unused, put it into LRU now... */ + inode_add_lru(inode); + /* Waiters must see I_SYNC cleared before being woken up */ + smp_mb(); + wake_up_bit(&inode->i_state, __I_SYNC); +} + +static bool inode_dirtied_after(struct inode *inode, unsigned long t) +{ + bool ret = time_after(inode->dirtied_when, t); +#ifndef CONFIG_64BIT + /* + * For inodes being constantly redirtied, dirtied_when can get stuck. + * It _appears_ to be in the future, but is actually in distant past. + * This test is necessary to prevent such wrapped-around relative times + * from permanently stopping the whole bdi writeback. + */ + ret = ret && time_before_eq(inode->dirtied_when, jiffies); +#endif + return ret; +} + +#define EXPIRE_DIRTY_ATIME 0x0001 + +/* + * Move expired (dirtied before work->older_than_this) dirty inodes from + * @delaying_queue to @dispatch_queue. + */ +static int move_expired_inodes(struct list_head *delaying_queue, + struct list_head *dispatch_queue, + int flags, + struct wb_writeback_work *work) +{ + unsigned long *older_than_this = NULL; + unsigned long expire_time; + LIST_HEAD(tmp); + struct list_head *pos, *node; + struct super_block *sb = NULL; + struct inode *inode; + int do_sb_sort = 0; + int moved = 0; + + if ((flags & EXPIRE_DIRTY_ATIME) == 0) + older_than_this = work->older_than_this; + else if (!work->for_sync) { + expire_time = jiffies - (dirtytime_expire_interval * HZ); + older_than_this = &expire_time; + } + while (!list_empty(delaying_queue)) { + inode = wb_inode(delaying_queue->prev); + if (older_than_this && + inode_dirtied_after(inode, *older_than_this)) + break; + list_move(&inode->i_wb_list, &tmp); + moved++; + if (flags & EXPIRE_DIRTY_ATIME) + set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); + if (sb_is_blkdev_sb(inode->i_sb)) + continue; + if (sb && sb != inode->i_sb) + do_sb_sort = 1; + sb = inode->i_sb; + } + + /* just one sb in list, splice to dispatch_queue and we're done */ + if (!do_sb_sort) { + list_splice(&tmp, dispatch_queue); + goto out; + } + + /* Move inodes from one superblock together */ + while (!list_empty(&tmp)) { + sb = wb_inode(tmp.prev)->i_sb; + list_for_each_prev_safe(pos, node, &tmp) { + inode = wb_inode(pos); + if (inode->i_sb == sb) + list_move(&inode->i_wb_list, dispatch_queue); + } + } +out: + return moved; +} + +/* + * Queue all expired dirty inodes for io, eldest first. + * Before + * newly dirtied b_dirty b_io b_more_io + * =============> gf edc BA + * After + * newly dirtied b_dirty b_io b_more_io + * =============> g fBAedc + * | + * +--> dequeue for IO + */ +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) +{ + int moved; + + assert_spin_locked(&wb->list_lock); + list_splice_init(&wb->b_more_io, &wb->b_io); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, + EXPIRE_DIRTY_ATIME, work); + trace_writeback_queue_io(wb, work, moved); +} + +static int write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { + trace_writeback_write_inode_start(inode, wbc); + ret = inode->i_sb->s_op->write_inode(inode, wbc); + trace_writeback_write_inode(inode, wbc); + return ret; + } + return 0; +} + +/* + * Wait for writeback on an inode to complete. Called with i_lock held. + * Caller must make sure inode cannot go away when we drop i_lock. + */ +static void __inode_wait_for_writeback(struct inode *inode) + __releases(inode->i_lock) + __acquires(inode->i_lock) +{ + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&inode->i_state, __I_SYNC); + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); + __wait_on_bit(wqh, &wq, bit_wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&inode->i_lock); + } +} + +/* + * Wait for writeback on an inode to complete. Caller must have inode pinned. + */ +void inode_wait_for_writeback(struct inode *inode) +{ + spin_lock(&inode->i_lock); + __inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); +} + +/* + * Sleep until I_SYNC is cleared. This function must be called with i_lock + * held and drops it. It is aimed for callers not holding any inode reference + * so once i_lock is dropped, inode can go away. + */ +static void inode_sleep_on_writeback(struct inode *inode) + __releases(inode->i_lock) +{ + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); + int sleep; + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + sleep = inode->i_state & I_SYNC; + spin_unlock(&inode->i_lock); + if (sleep) + schedule(); + finish_wait(wqh, &wait); +} + +/* + * Find proper writeback list for the inode depending on its current state and + * possibly also change of its state while we were doing writeback. Here we + * handle things such as livelock prevention or fairness of writeback among + * inodes. This function can be called only by flusher thread - noone else + * processes all inodes in writeback lists and requeueing inodes behind flusher + * thread's back can have unexpected consequences. + */ +static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + if (inode->i_state & I_FREEING) + return; + + /* + * Sync livelock prevention. Each inode is tagged and synced in one + * shot. If still dirty, it will be redirty_tail()'ed below. Update + * the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + + if (wbc->pages_skipped) { + /* + * writeback is not making progress due to locked + * buffers. Skip this inode for now. + */ + redirty_tail(inode, wb); + return; + } + + if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + /* + * We didn't write back all the pages. nfs_writepages() + * sometimes bales out without doing anything. + */ + if (wbc->nr_to_write <= 0) { + /* Slice used up. Queue for next turn. */ + requeue_io(inode, wb); + } else { + /* + * Writeback blocked by something other than + * congestion. Delay the inode for some time to + * avoid spinning on the CPU (100% iowait) + * retrying writeback of the dirty page/inode + * that cannot be performed immediately. + */ + redirty_tail(inode, wb); + } + } else if (inode->i_state & I_DIRTY) { + /* + * Filesystems can dirty the inode during writeback operations, + * such as delayed allocation during submission or metadata + * updates after data IO completion. + */ + redirty_tail(inode, wb); + } else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + list_move(&inode->i_wb_list, &wb->b_dirty_time); + } else { + /* The inode is clean. Remove from writeback lists. */ + list_del_init(&inode->i_wb_list); + } +} + +/* + * Write out an inode and its dirty pages. Do not update the writeback list + * linkage. That is left to the caller. The caller is also responsible for + * setting I_SYNC flag and calling inode_sync_complete() to clear it. + */ +static int +__writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct address_space *mapping = inode->i_mapping; + long nr_to_write = wbc->nr_to_write; + unsigned dirty; + int ret; + + WARN_ON(!(inode->i_state & I_SYNC)); + + trace_writeback_single_inode_start(inode, wbc, nr_to_write); + + ret = do_writepages(mapping, wbc); + + /* + * Make sure to wait on the data before writing out the metadata. + * This is important for filesystems that modify metadata on data + * I/O completion. We don't do it for sync(2) writeback because it has a + * separate, external IO completion path and ->sync_fs for guaranteeing + * inode metadata is written back correctly. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { + int err = filemap_fdatawait(mapping); + if (ret == 0) + ret = err; + } + + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode->i_lock); + + dirty = inode->i_state & I_DIRTY; + if (inode->i_state & I_DIRTY_TIME) { + if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || + unlikely(time_after(jiffies, + (inode->dirtied_time_when + + dirtytime_expire_interval * HZ)))) { + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; + trace_writeback_lazytime(inode); + } + } else + inode->i_state &= ~I_DIRTY_TIME_EXPIRED; + inode->i_state &= ~dirty; + + /* + * Paired with smp_mb() in __mark_inode_dirty(). This allows + * __mark_inode_dirty() to test i_state without grabbing i_lock - + * either they see the I_DIRTY bits cleared or we see the dirtied + * inode. + * + * I_DIRTY_PAGES is always cleared together above even if @mapping + * still has dirty pages. The flag is reinstated after smp_mb() if + * necessary. This guarantees that either __mark_inode_dirty() + * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. + */ + smp_mb(); + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + inode->i_state |= I_DIRTY_PAGES; + + spin_unlock(&inode->i_lock); + + if (dirty & I_DIRTY_TIME) + mark_inode_dirty_sync(inode); + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & ~I_DIRTY_PAGES) { + int err = write_inode(inode, wbc); + if (ret == 0) + ret = err; + } + trace_writeback_single_inode(inode, wbc, nr_to_write); + return ret; +} + +/* + * Write out an inode's dirty pages. Either the caller has an active reference + * on the inode or the inode has I_WILL_FREE set. + * + * This function is designed to be called for writing back one inode which + * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() + * and does more profound writeback list handling in writeback_sb_inodes(). + */ +static int +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + int ret = 0; + + spin_lock(&inode->i_lock); + if (!atomic_read(&inode->i_count)) + WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); + else + WARN_ON(inode->i_state & I_WILL_FREE); + + if (inode->i_state & I_SYNC) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out; + /* + * It's a data-integrity sync. We must wait. Since callers hold + * inode reference or inode has I_WILL_FREE set, it cannot go + * away under us. + */ + __inode_wait_for_writeback(inode); + } + WARN_ON(inode->i_state & I_SYNC); + /* + * Skip inode if it is clean and we have no outstanding writeback in + * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this + * function since flusher thread may be doing for example sync in + * parallel and if we move the inode, it could get skipped. So here we + * make sure inode is on some writeback list and leave it there unless + * we have completely cleaned the inode. + */ + if (!(inode->i_state & I_DIRTY_ALL) && + (wbc->sync_mode != WB_SYNC_ALL || + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) + goto out; + inode->i_state |= I_SYNC; + spin_unlock(&inode->i_lock); + + ret = __writeback_single_inode(inode, wbc); + + spin_lock(&wb->list_lock); + spin_lock(&inode->i_lock); + /* + * If inode is clean, remove it from writeback lists. Otherwise don't + * touch it. See comment above for explanation. + */ + if (!(inode->i_state & I_DIRTY_ALL)) + list_del_init(&inode->i_wb_list); + spin_unlock(&wb->list_lock); + inode_sync_complete(inode); +out: + spin_unlock(&inode->i_lock); + return ret; +} + +static long writeback_chunk_size(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * writeback_sb_inodes() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) + pages = LONG_MAX; + else { + pages = min(bdi->avg_write_bandwidth / 2, + global_dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + pages = round_down(pages + MIN_WRITEBACK_PAGES, + MIN_WRITEBACK_PAGES); + } + + return pages; +} + +/* + * Write a portion of b_io inodes which belong to @sb. + * + * Return the number of pages and/or inodes written. + */ +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .for_sync = work->for_sync, + .range_cyclic = work->range_cyclic, + .range_start = 0, + .range_end = LLONG_MAX, + }; + unsigned long start_time = jiffies; + long write_chunk; + long wrote = 0; /* count both pages and inodes */ + + while (!list_empty(&wb->b_io)) { + struct inode *inode = wb_inode(wb->b_io.prev); + + if (inode->i_sb != sb) { + if (work->sb) { + /* + * We only want to write back data for this + * superblock, move all inodes not belonging + * to it back onto the dirty list. + */ + redirty_tail(inode, wb); + continue; + } + + /* + * The inode belongs to a different superblock. + * Bounce back to the caller to unpin this and + * pin the next superblock. + */ + break; + } + + /* + * Don't bother with new inodes or inodes being freed, first + * kind does not need periodic writeout yet, and for the latter + * kind writeout is handled by the freer. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + redirty_tail(inode, wb); + continue; + } + if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { + /* + * If this inode is locked for writeback and we are not + * doing writeback-for-data-integrity, move it to + * b_more_io so that writeback can proceed with the + * other inodes on s_io. + * + * We'll have another go at writing back this inode + * when we completed a full scan of b_io. + */ + spin_unlock(&inode->i_lock); + requeue_io(inode, wb); + trace_writeback_sb_inodes_requeue(inode); + continue; + } + spin_unlock(&wb->list_lock); + + /* + * We already requeued the inode if it had I_SYNC set and we + * are doing WB_SYNC_NONE writeback. So this catches only the + * WB_SYNC_ALL case. + */ + if (inode->i_state & I_SYNC) { + /* Wait for I_SYNC. This function drops i_lock... */ + inode_sleep_on_writeback(inode); + /* Inode may be gone, start again */ + spin_lock(&wb->list_lock); + continue; + } + inode->i_state |= I_SYNC; + spin_unlock(&inode->i_lock); + + write_chunk = writeback_chunk_size(wb->bdi, work); + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + /* + * We use I_SYNC to pin the inode in memory. While it is set + * evict_inode() will wait so the inode cannot be freed. + */ + __writeback_single_inode(inode, &wbc); + + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + spin_lock(&wb->list_lock); + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_DIRTY_ALL)) + wrote++; + requeue_inode(inode, wb, &wbc); + inode_sync_complete(inode); + spin_unlock(&inode->i_lock); + cond_resched_lock(&wb->list_lock); + /* + * bail out to wb_writeback() often enough to check + * background threshold and other termination conditions. + */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } + } + return wrote; +} + +static long __writeback_inodes_wb(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + unsigned long start_time = jiffies; + long wrote = 0; + + while (!list_empty(&wb->b_io)) { + struct inode *inode = wb_inode(wb->b_io.prev); + struct super_block *sb = inode->i_sb; + + if (!trylock_super(sb)) { + /* + * trylock_super() may fail consistently due to + * s_umount being grabbed by someone else. Don't use + * requeue_io() to avoid busy retrying the inode/sb. + */ + redirty_tail(inode, wb); + continue; + } + wrote += writeback_sb_inodes(sb, wb, work); + up_read(&sb->s_umount); + + /* refer to the same tests at the end of writeback_sb_inodes */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } + } + /* Leave any unwritten inodes on b_io */ + return wrote; +} + +static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) +{ + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = reason, + }; + + spin_lock(&wb->list_lock); + if (list_empty(&wb->b_io)) + queue_io(wb, &work); + __writeback_inodes_wb(wb, &work); + spin_unlock(&wb->list_lock); + + return nr_pages - work.nr_pages; +} + +static bool over_bground_thresh(struct backing_dev_info *bdi) +{ + unsigned long background_thresh, dirty_thresh; + + global_dirty_limits(&background_thresh, &dirty_thresh); + + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) + return true; + + if (bdi_stat(bdi, BDI_RECLAIMABLE) > + bdi_dirty_limit(bdi, background_thresh)) + return true; + + return false; +} + +/* + * Called under wb->list_lock. If there are multiple wb per bdi, + * only the flusher working on the first wb should do it. + */ +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long start_time) +{ + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); +} + +/* + * Explicit flushing or periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static long wb_writeback(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + unsigned long wb_start = jiffies; + long nr_pages = work->nr_pages; + unsigned long oldest_jif; + struct inode *inode; + long progress; + + oldest_jif = jiffies; + work->older_than_this = &oldest_jif; + + spin_lock(&wb->list_lock); + for (;;) { + /* + * Stop writeback when nr_pages has been consumed + */ + if (work->nr_pages <= 0) + break; + + /* + * Background writeout and kupdate-style writeback may + * run forever. Stop them if there is other work to do + * so that e.g. sync can proceed. They'll be restarted + * after the other works are all done. + */ + if ((work->for_background || work->for_kupdate) && + !list_empty(&wb->bdi->work_list)) + break; + + /* + * For background writeout, stop when we are below the + * background dirty threshold + */ + if (work->for_background && !over_bground_thresh(wb->bdi)) + break; + + /* + * Kupdate and background works are special and we want to + * include all inodes that need writing. Livelock avoidance is + * handled by these works yielding to any other work so we are + * safe. + */ + if (work->for_kupdate) { + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } else if (work->for_background) + oldest_jif = jiffies; + + trace_writeback_start(wb->bdi, work); + if (list_empty(&wb->b_io)) + queue_io(wb, work); + if (work->sb) + progress = writeback_sb_inodes(work->sb, wb, work); + else + progress = __writeback_inodes_wb(wb, work); + trace_writeback_written(wb->bdi, work); + + wb_update_bandwidth(wb, wb_start); + + /* + * Did we write something? Try for more + * + * Dirty inodes are moved to b_io for writeback in batches. + * The completion of the current batch does not necessarily + * mean the overall work is done. So we keep looping as long + * as made some progress on cleaning pages or inodes. + */ + if (progress) + continue; + /* + * No more inodes for IO, bail + */ + if (list_empty(&wb->b_more_io)) + break; + /* + * Nothing written. Wait for some inode to + * become available for writeback. Otherwise + * we'll just busyloop. + */ + if (!list_empty(&wb->b_more_io)) { + trace_writeback_wait(wb->bdi, work); + inode = wb_inode(wb->b_more_io.prev); + spin_lock(&inode->i_lock); + spin_unlock(&wb->list_lock); + /* This function drops i_lock... */ + inode_sleep_on_writeback(inode); + spin_lock(&wb->list_lock); + } + } + spin_unlock(&wb->list_lock); + + return nr_pages - work->nr_pages; +} + +/* + * Return the next wb_writeback_work struct that hasn't been processed yet. + */ +static struct wb_writeback_work * +get_next_work_item(struct backing_dev_info *bdi) +{ + struct wb_writeback_work *work = NULL; + + spin_lock_bh(&bdi->wb_lock); + if (!list_empty(&bdi->work_list)) { + work = list_entry(bdi->work_list.next, + struct wb_writeback_work, list); + list_del_init(&work->list); + } + spin_unlock_bh(&bdi->wb_lock); + return work; +} + +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) +{ + return global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} + +static long wb_check_background_flush(struct bdi_writeback *wb) +{ + if (over_bground_thresh(wb->bdi)) { + + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + .reason = WB_REASON_BACKGROUND, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + +static long wb_check_old_data_flush(struct bdi_writeback *wb) +{ + unsigned long expired; + long nr_pages; + + /* + * When set to zero, disable periodic writeback + */ + if (!dirty_writeback_interval) + return 0; + + expired = wb->last_old_flush + + msecs_to_jiffies(dirty_writeback_interval * 10); + if (time_before(jiffies, expired)) + return 0; + + wb->last_old_flush = jiffies; + nr_pages = get_nr_dirty_pages(); + + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .for_kupdate = 1, + .range_cyclic = 1, + .reason = WB_REASON_PERIODIC, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + +/* + * Retrieve work items and do the writeback they describe + */ +static long wb_do_writeback(struct bdi_writeback *wb) +{ + struct backing_dev_info *bdi = wb->bdi; + struct wb_writeback_work *work; + long wrote = 0; + + set_bit(BDI_writeback_running, &wb->bdi->state); + while ((work = get_next_work_item(bdi)) != NULL) { + + trace_writeback_exec(bdi, work); + + wrote += wb_writeback(wb, work); + + /* + * Notify the caller of completion if this is a synchronous + * work item, otherwise just free it. + */ + if (work->done) + complete(work->done); + else + kfree(work); + } + + /* + * Check for periodic writeback, kupdated() style + */ + wrote += wb_check_old_data_flush(wb); + wrote += wb_check_background_flush(wb); + clear_bit(BDI_writeback_running, &wb->bdi->state); + + return wrote; +} + +/* + * Handle writeback of dirty data for the device backed by this bdi. Also + * reschedules periodically and does kupdated style flushing. + */ +void bdi_writeback_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); + struct backing_dev_info *bdi = wb->bdi; + long pages_written; + + set_worker_desc("flush-%s", dev_name(bdi->dev)); + current->flags |= PF_SWAPWRITE; + + if (likely(!current_is_workqueue_rescuer() || + !test_bit(BDI_registered, &bdi->state))) { + /* + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. + */ + do { + pages_written = wb_do_writeback(wb); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); + trace_writeback_pages_written(pages_written); + } + + if (!list_empty(&bdi->work_list)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + else if (wb_has_dirty_io(wb) && dirty_writeback_interval) + bdi_wakeup_thread_delayed(bdi); + + current->flags &= ~PF_SWAPWRITE; +} + +/* + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. + */ +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) +{ + struct backing_dev_info *bdi; + + if (!nr_pages) + nr_pages = get_nr_dirty_pages(); + + rcu_read_lock(); + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + if (!bdi_has_dirty_io(bdi)) + continue; + __bdi_start_writeback(bdi, nr_pages, false, reason); + } + rcu_read_unlock(); +} + +/* + * Wake up bdi's periodically to make sure dirtytime inodes gets + * written back periodically. We deliberately do *not* check the + * b_dirtytime list in wb_has_dirty_io(), since this would cause the + * kernel to be constantly waking up once there are any dirtytime + * inodes on the system. So instead we define a separate delayed work + * function which gets called much more rarely. (By default, only + * once every 12 hours.) + * + * If there is any other write activity going on in the file system, + * this function won't be necessary. But if the only thing that has + * happened on the file system is a dirtytime inode caused by an atime + * update, we need this infrastructure below to make sure that inode + * eventually gets pushed out to disk. + */ +static void wakeup_dirtytime_writeback(struct work_struct *w); +static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); + +static void wakeup_dirtytime_writeback(struct work_struct *w) +{ + struct backing_dev_info *bdi; + + rcu_read_lock(); + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + if (list_empty(&bdi->wb.b_dirty_time)) + continue; + bdi_wakeup_thread(bdi); + } + rcu_read_unlock(); + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); +} + +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + return 0; +} +__initcall(start_dirtytime_writeback); + +int dirtytime_interval_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + mod_delayed_work(system_wq, &dirtytime_work, 0); + return ret; +} + +static noinline void block_dump___mark_inode_dirty(struct inode *inode) +{ + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + printk(KERN_DEBUG + "%s(%d): dirtied inode %lu (%s) on %s\n", + current->comm, task_pid_nr(current), inode->i_ino, + name, inode->i_sb->s_id); + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } +} + +/** + * __mark_inode_dirty - internal function + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. + * + * Put the inode on the super block's dirty list. + * + * CAREFUL! We mark it dirty unconditionally, but move it onto the + * dirty list only if it is hashed or if it refers to a blockdev. + * If it was not hashed, it will never be added to the dirty list + * even if it is later hashed, as it will have been marked dirty already. + * + * In short, make sure you hash any inodes _before_ you start marking + * them dirty. + * + * Note that for blockdevs, inode->dirtied_when represents the dirtying time of + * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of + * the kernel-internal blockdev inode represents the dirtying time of the + * blockdev's pages. This is why for I_DIRTY_PAGES we always use + * page->mapping->host, so the page-dirtying time is recorded in the internal + * blockdev inode. + */ +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) +void __mark_inode_dirty(struct inode *inode, int flags) +{ + struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = NULL; + int dirtytime; + + trace_writeback_mark_inode_dirty(inode, flags); + + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { + trace_writeback_dirty_inode_start(inode, flags); + + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode, flags); + + trace_writeback_dirty_inode(inode, flags); + } + if (flags & I_DIRTY_INODE) + flags &= ~I_DIRTY_TIME; + dirtytime = flags & I_DIRTY_TIME; + + /* + * Paired with smp_mb() in __writeback_single_inode() for the + * following lockless i_state test. See there for details. + */ + smp_mb(); + + if (((inode->i_state & flags) == flags) || + (dirtytime && (inode->i_state & I_DIRTY_INODE))) + return; + + if (unlikely(block_dump)) + block_dump___mark_inode_dirty(inode); + + spin_lock(&inode->i_lock); + if (dirtytime && (inode->i_state & I_DIRTY_INODE)) + goto out_unlock_inode; + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + if (flags & I_DIRTY_INODE) + inode->i_state &= ~I_DIRTY_TIME; + inode->i_state |= flags; + + /* + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_SYNC) + goto out_unlock_inode; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (inode_unhashed(inode)) + goto out_unlock_inode; + } + if (inode->i_state & I_FREEING) + goto out_unlock_inode; + + /* + * If the inode was already on b_dirty/b_io/b_more_io, don't + * reposition it (that would break b_dirty time-ordering). + */ + if (!was_dirty) { + bool wakeup_bdi = false; + bdi = inode_to_bdi(inode); + + spin_unlock(&inode->i_lock); + spin_lock(&bdi->wb.list_lock); + if (bdi_cap_writeback_dirty(bdi)) { + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi-%s not registered\n", bdi->name); + + /* + * If this is the first dirty inode for this + * bdi, we have to wake-up the corresponding + * bdi thread to make sure background + * write-back happens later. + */ + if (!wb_has_dirty_io(&bdi->wb)) + wakeup_bdi = true; + } + + inode->dirtied_when = jiffies; + if (dirtytime) + inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + else + list_move(&inode->i_wb_list, + &bdi->wb.b_dirty_time); + spin_unlock(&bdi->wb.list_lock); + trace_writeback_dirty_inode_enqueue(inode); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; + } + } +out_unlock_inode: + spin_unlock(&inode->i_lock); + +} +EXPORT_SYMBOL(__mark_inode_dirty); + +static void wait_sb_inodes(struct super_block *sb) +{ + struct inode *inode, *old_inode = NULL; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + spin_lock(&inode_sb_list_lock); + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + struct address_space *mapping = inode->i_mapping; + + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(old_inode); +} + +/** + * writeback_inodes_sb_nr - writeback dirty inodes from given super_block + * @sb: the superblock + * @nr: the number of pages to write + * @reason: reason why some writeback work initiated + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. + */ +void writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, + .reason = reason, + }; + + if (sb->s_bdi == &noop_backing_dev_info) + return; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); +} +EXPORT_SYMBOL(writeback_inodes_sb_nr); + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * @reason: reason why some writeback work was initiated + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. + */ +void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) +{ + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); +} +EXPORT_SYMBOL(writeback_inodes_sb); + +/** + * try_to_writeback_inodes_sb_nr - try to start writeback if none underway + * @sb: the superblock + * @nr: the number of pages to write + * @reason: the reason of writeback + * + * Invoke writeback_inodes_sb_nr if no writeback is currently underway. + * Returns 1 if writeback was started, 0 if not. + */ +int try_to_writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) +{ + if (writeback_in_progress(sb->s_bdi)) + return 1; + + if (!down_read_trylock(&sb->s_umount)) + return 0; + + writeback_inodes_sb_nr(sb, nr, reason); + up_read(&sb->s_umount); + return 1; +} +EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); + +/** + * try_to_writeback_inodes_sb - try to start writeback if none underway + * @sb: the superblock + * @reason: reason why some writeback work was initiated + * + * Implement by try_to_writeback_inodes_sb_nr() + * Returns 1 if writeback was started, 0 if not. + */ +int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) +{ + return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); +} +EXPORT_SYMBOL(try_to_writeback_inodes_sb); + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. + */ +void sync_inodes_sb(struct super_block *sb) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + .done = &done, + .reason = WB_REASON_SYNC, + .for_sync = 1, + }; + + /* Nothing to do? */ + if (sb->s_bdi == &noop_backing_dev_info) + return; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); + + wait_sb_inodes(sb); +} +EXPORT_SYMBOL(sync_inodes_sb); + +/** + * write_inode_now - write an inode to disk + * @inode: inode to write to disk + * @sync: whether the write should be synchronous or not + * + * This function commits an inode to disk immediately if it is dirty. This is + * primarily needed by knfsd. + * + * The caller must either have a ref on the inode or must have set I_WILL_FREE. + */ +int write_inode_now(struct inode *inode, int sync) +{ + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + if (!mapping_cap_writeback_dirty(inode->i_mapping)) + wbc.nr_to_write = 0; + + might_sleep(); + return writeback_single_inode(inode, wb, &wbc); +} +EXPORT_SYMBOL(write_inode_now); + +/** + * sync_inode - write an inode and its pages to disk. + * @inode: the inode to sync + * @wbc: controls the writeback mode + * + * sync_inode() will write an inode and its pages to disk. It will also + * correctly update the inode on its superblock's dirty inode lists and will + * update inode->i_state. + * + * The caller must have a ref on the inode. + */ +int sync_inode(struct inode *inode, struct writeback_control *wbc) +{ + return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); +} +EXPORT_SYMBOL(sync_inode); + +/** + * sync_inode_metadata - write an inode to disk + * @inode: the inode to sync + * @wait: wait for I/O to complete. + * + * Write an inode to disk and adjust its dirty state after completion. + * + * Note: only writes the actual inode, no associated data or other metadata. + */ +int sync_inode_metadata(struct inode *inode, int wait) +{ + struct writeback_control wbc = { + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .nr_to_write = 0, /* metadata-only */ + }; + + return sync_inode(inode, &wbc); +} +EXPORT_SYMBOL(sync_inode_metadata); diff --git a/kernel/fs/fs_pin.c b/kernel/fs/fs_pin.c new file mode 100644 index 000000000..611b5408f --- /dev/null +++ b/kernel/fs/fs_pin.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include "internal.h" +#include "mount.h" + +static DEFINE_SPINLOCK(pin_lock); + +void pin_remove(struct fs_pin *pin) +{ + spin_lock(&pin_lock); + hlist_del_init(&pin->m_list); + hlist_del_init(&pin->s_list); + spin_unlock(&pin_lock); + spin_lock_irq(&pin->wait.lock); + pin->done = 1; + wake_up_locked(&pin->wait); + spin_unlock_irq(&pin->wait.lock); +} + +void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p) +{ + spin_lock(&pin_lock); + if (p) + hlist_add_head(&pin->s_list, p); + hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); + spin_unlock(&pin_lock); +} + +void pin_insert(struct fs_pin *pin, struct vfsmount *m) +{ + pin_insert_group(pin, m, &m->mnt_sb->s_pins); +} + +void pin_kill(struct fs_pin *p) +{ + wait_queue_t wait; + + if (!p) { + rcu_read_unlock(); + return; + } + init_wait(&wait); + spin_lock_irq(&p->wait.lock); + if (likely(!p->done)) { + p->done = -1; + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + p->kill(p); + return; + } + if (p->done > 0) { + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + return; + } + __add_wait_queue(&p->wait, &wait); + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + if (likely(list_empty(&wait.task_list))) + break; + /* OK, we know p couldn't have been freed yet */ + spin_lock_irq(&p->wait.lock); + if (p->done > 0) { + spin_unlock_irq(&p->wait.lock); + break; + } + } + rcu_read_unlock(); +} + +void mnt_pin_kill(struct mount *m) +{ + while (1) { + struct hlist_node *p; + rcu_read_lock(); + p = ACCESS_ONCE(m->mnt_pins.first); + if (!p) { + rcu_read_unlock(); + break; + } + pin_kill(hlist_entry(p, struct fs_pin, m_list)); + } +} + +void group_pin_kill(struct hlist_head *p) +{ + while (1) { + struct hlist_node *q; + rcu_read_lock(); + q = ACCESS_ONCE(p->first); + if (!q) { + rcu_read_unlock(); + break; + } + pin_kill(hlist_entry(q, struct fs_pin, s_list)); + } +} diff --git a/kernel/fs/fs_struct.c b/kernel/fs/fs_struct.c new file mode 100644 index 000000000..7dca743b2 --- /dev/null +++ b/kernel/fs/fs_struct.c @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, const struct path *path) +{ + struct path old_root; + + path_get(path); + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + old_root = fs->root; + fs->root = *path; + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + if (old_root.dentry) + path_put(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_pwd(struct fs_struct *fs, const struct path *path) +{ + struct path old_pwd; + + path_get(path); + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + old_pwd = fs->pwd; + fs->pwd = *path; + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +static inline int replace_path(struct path *p, const struct path *old, const struct path *new) +{ + if (likely(p->dentry != old->dentry || p->mnt != old->mnt)) + return 0; + *p = *new; + return 1; +} + +void chroot_fs_refs(const struct path *old_root, const struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + int hits = 0; + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + hits += replace_path(&fs->root, old_root, new_root); + hits += replace_path(&fs->pwd, old_root, new_root); + write_seqcount_end(&fs->seq); + while (hits--) { + count++; + path_get(new_root); + } + spin_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put(old_root); +} + +void free_fs_struct(struct fs_struct *fs) +{ + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct *fs = tsk->fs; + + if (fs) { + int kill; + task_lock(tsk); + spin_lock(&fs->lock); + tsk->fs = NULL; + kill = !--fs->users; + spin_unlock(&fs->lock); + task_unlock(tsk); + if (kill) + free_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + fs->users = 1; + fs->in_exec = 0; + spin_lock_init(&fs->lock); + seqcount_init(&fs->seq); + fs->umask = old->umask; + + spin_lock(&old->lock); + fs->root = old->root; + path_get(&fs->root); + fs->pwd = old->pwd; + path_get(&fs->pwd); + spin_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) + return -ENOMEM; + + task_lock(current); + spin_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + spin_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +int current_umask(void) +{ + return current->fs->umask; +} +EXPORT_SYMBOL(current_umask); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .users = 1, + .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), + .seq = SEQCNT_ZERO(init_fs.seq), + .umask = 0022, +}; diff --git a/kernel/fs/fscache/Kconfig b/kernel/fs/fscache/Kconfig new file mode 100644 index 000000000..3f6dfa989 --- /dev/null +++ b/kernel/fs/fscache/Kconfig @@ -0,0 +1,61 @@ + +config FSCACHE + tristate "General filesystem local caching manager" + help + This option enables a generic filesystem caching manager that can be + used by various network and other filesystems to cache data locally. + Different sorts of caches can be plugged in, depending on the + resources available. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_HISTOGRAM + bool "Gather latency information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes latency information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_DEBUG + bool "Debug FS-Cache" + depends on FSCACHE + help + This permits debugging to be dynamically enabled in the local caching + management module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/fscache/parameter/debug. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_OBJECT_LIST + bool "Maintain global object list for debugging purposes" + depends on FSCACHE && PROC_FS + help + Maintain a global list of active fscache objects that can be + retrieved through /proc/fs/fscache/objects for debugging purposes diff --git a/kernel/fs/fscache/Makefile b/kernel/fs/fscache/Makefile new file mode 100644 index 000000000..6d561531c --- /dev/null +++ b/kernel/fs/fscache/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for general filesystem caching code +# + +fscache-y := \ + cache.o \ + cookie.o \ + fsdef.o \ + main.o \ + netfs.o \ + object.o \ + operation.o \ + page.o + +fscache-$(CONFIG_PROC_FS) += proc.o +fscache-$(CONFIG_FSCACHE_STATS) += stats.o +fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o +fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o + +obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/kernel/fs/fscache/cache.c b/kernel/fs/fscache/cache.c new file mode 100644 index 000000000..56cce7fdd --- /dev/null +++ b/kernel/fs/fscache/cache.c @@ -0,0 +1,421 @@ +/* FS-Cache cache handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include +#include "internal.h" + +LIST_HEAD(fscache_cache_list); +DECLARE_RWSEM(fscache_addremove_sem); +DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq); +EXPORT_SYMBOL(fscache_cache_cleared_wq); + +static LIST_HEAD(fscache_cache_tag_list); + +/* + * look up a cache tag + */ +struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +{ + struct fscache_cache_tag *tag, *xtag; + + /* firstly check for the existence of the tag under read lock */ + down_read(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_read(&fscache_addremove_sem); + return tag; + } + } + + up_read(&fscache_addremove_sem); + + /* the tag does not exist - create a candidate */ + xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); + if (!xtag) + /* return a dummy tag if out of memory */ + return ERR_PTR(-ENOMEM); + + atomic_set(&xtag->usage, 1); + strcpy(xtag->name, name); + + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_write(&fscache_addremove_sem); + kfree(xtag); + return tag; + } + } + + list_add_tail(&xtag->link, &fscache_cache_tag_list); + up_write(&fscache_addremove_sem); + return xtag; +} + +/* + * release a reference to a cache tag + */ +void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ + if (tag != ERR_PTR(-ENOMEM)) { + down_write(&fscache_addremove_sem); + + if (atomic_dec_and_test(&tag->usage)) + list_del_init(&tag->link); + else + tag = NULL; + + up_write(&fscache_addremove_sem); + + kfree(tag); + } +} + +/* + * select a cache in which to store an object + * - the cache addremove semaphore must be at least read-locked by the caller + * - the object will never be an index + */ +struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *cookie) +{ + struct fscache_cache_tag *tag; + struct fscache_object *object; + struct fscache_cache *cache; + + _enter(""); + + if (list_empty(&fscache_cache_list)) { + _leave(" = NULL [no cache]"); + return NULL; + } + + /* we check the parent to determine the cache to use */ + spin_lock(&cookie->lock); + + /* the first in the parent's backing list should be the preferred + * cache */ + if (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + cache = object->cache; + if (fscache_object_is_dying(object) || + test_bit(FSCACHE_IOERROR, &cache->flags)) + cache = NULL; + + spin_unlock(&cookie->lock); + _leave(" = %p [parent]", cache); + return cache; + } + + /* the parent is unbacked */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* cookie not an index and is unbacked */ + spin_unlock(&cookie->lock); + _leave(" = NULL [cookie ub,ni]"); + return NULL; + } + + spin_unlock(&cookie->lock); + + if (!cookie->def->select_cache) + goto no_preference; + + /* ask the netfs for its preference */ + tag = cookie->def->select_cache(cookie->parent->netfs_data, + cookie->netfs_data); + if (!tag) + goto no_preference; + + if (tag == ERR_PTR(-ENOMEM)) { + _leave(" = NULL [nomem tag]"); + return NULL; + } + + if (!tag->cache) { + _leave(" = NULL [unbacked tag]"); + return NULL; + } + + if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) + return NULL; + + _leave(" = %p [specific]", tag->cache); + return tag->cache; + +no_preference: + /* netfs has no preference - just select first cache */ + cache = list_entry(fscache_cache_list.next, + struct fscache_cache, link); + _leave(" = %p [first]", cache); + return cache; +} + +/** + * fscache_init_cache - Initialise a cache record + * @cache: The cache record to be initialised + * @ops: The cache operations to be installed in that record + * @idfmt: Format string to define identifier + * @...: sprintf-style arguments + * + * Initialise a record of a cache and fill in the name. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, + ...) +{ + va_list va; + + memset(cache, 0, sizeof(*cache)); + + cache->ops = ops; + + va_start(va, idfmt); + vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va); + va_end(va); + + INIT_WORK(&cache->op_gc, fscache_operation_gc); + INIT_LIST_HEAD(&cache->link); + INIT_LIST_HEAD(&cache->object_list); + INIT_LIST_HEAD(&cache->op_gc_list); + spin_lock_init(&cache->object_list_lock); + spin_lock_init(&cache->op_gc_list_lock); +} +EXPORT_SYMBOL(fscache_init_cache); + +/** + * fscache_add_cache - Declare a cache as being open for business + * @cache: The record describing the cache + * @ifsdef: The record of the cache object describing the top-level index + * @tagname: The tag describing this cache + * + * Add a cache to the system, making it available for netfs's to use. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *ifsdef, + const char *tagname) +{ + struct fscache_cache_tag *tag; + + BUG_ON(!cache->ops); + BUG_ON(!ifsdef); + + cache->flags = 0; + ifsdef->event_mask = + ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) & + ~(1 << FSCACHE_OBJECT_EV_CLEARED); + __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags); + + if (!tagname) + tagname = cache->identifier; + + BUG_ON(!tagname[0]); + + _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname); + + /* we use the cache tag to uniquely identify caches */ + tag = __fscache_lookup_cache_tag(tagname); + if (IS_ERR(tag)) + goto nomem; + + if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags)) + goto tag_in_use; + + cache->kobj = kobject_create_and_add(tagname, fscache_root); + if (!cache->kobj) + goto error; + + ifsdef->cookie = &fscache_fsdef_index; + ifsdef->cache = cache; + cache->fsdef = ifsdef; + + down_write(&fscache_addremove_sem); + + tag->cache = cache; + cache->tag = tag; + + /* add the cache to the list */ + list_add(&cache->link, &fscache_cache_list); + + /* add the cache's netfs definition index object to the cache's + * list */ + spin_lock(&cache->object_list_lock); + list_add_tail(&ifsdef->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + fscache_objlist_add(ifsdef); + + /* add the cache's netfs definition index object to the top level index + * cookie as a known backing object */ + spin_lock(&fscache_fsdef_index.lock); + + hlist_add_head(&ifsdef->cookie_link, + &fscache_fsdef_index.backing_objects); + + atomic_inc(&fscache_fsdef_index.usage); + + /* done */ + spin_unlock(&fscache_fsdef_index.lock); + up_write(&fscache_addremove_sem); + + pr_notice("Cache \"%s\" added (type %s)\n", + cache->tag->name, cache->ops->name); + kobject_uevent(cache->kobj, KOBJ_ADD); + + _leave(" = 0 [%s]", cache->identifier); + return 0; + +tag_in_use: + pr_err("Cache tag '%s' already in use\n", tagname); + __fscache_release_cache_tag(tag); + _leave(" = -EXIST"); + return -EEXIST; + +error: + __fscache_release_cache_tag(tag); + _leave(" = -EINVAL"); + return -EINVAL; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(fscache_add_cache); + +/** + * fscache_io_error - Note a cache I/O error + * @cache: The record describing the cache + * + * Note that an I/O error occurred in a cache and that it should no longer be + * used for anything. This also reports the error into the kernel log. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_io_error(struct fscache_cache *cache) +{ + if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) + pr_err("Cache '%s' stopped due to I/O error\n", + cache->ops->name); +} +EXPORT_SYMBOL(fscache_io_error); + +/* + * request withdrawal of all the objects in a cache + * - all the objects being withdrawn are moved onto the supplied list + */ +static void fscache_withdraw_all_objects(struct fscache_cache *cache, + struct list_head *dying_objects) +{ + struct fscache_object *object; + + while (!list_empty(&cache->object_list)) { + spin_lock(&cache->object_list_lock); + + if (!list_empty(&cache->object_list)) { + object = list_entry(cache->object_list.next, + struct fscache_object, cache_link); + list_move_tail(&object->cache_link, dying_objects); + + _debug("withdraw %p", object->cookie); + + /* This must be done under object_list_lock to prevent + * a race with fscache_drop_object(). + */ + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + } + + spin_unlock(&cache->object_list_lock); + cond_resched(); + } +} + +/** + * fscache_withdraw_cache - Withdraw a cache from the active service + * @cache: The record describing the cache + * + * Withdraw a cache from service, unbinding all its cache objects from the + * netfs cookies they're currently representing. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_withdraw_cache(struct fscache_cache *cache) +{ + LIST_HEAD(dying_objects); + + _enter(""); + + pr_notice("Withdrawing cache \"%s\"\n", + cache->tag->name); + + /* make the cache unavailable for cookie acquisition */ + if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) + BUG(); + + down_write(&fscache_addremove_sem); + list_del_init(&cache->link); + cache->tag->cache = NULL; + up_write(&fscache_addremove_sem); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disk */ + fscache_stat(&fscache_n_cop_sync_cache); + cache->ops->sync_cache(cache); + fscache_stat_d(&fscache_n_cop_sync_cache); + + /* dissociate all the netfs pages backed by this cache from the block + * mappings in the cache */ + fscache_stat(&fscache_n_cop_dissociate_pages); + cache->ops->dissociate_pages(cache); + fscache_stat_d(&fscache_n_cop_dissociate_pages); + + /* we now have to destroy all the active objects pertaining to this + * cache - which we do by passing them off to thread pool to be + * disposed of */ + _debug("destroy"); + + fscache_withdraw_all_objects(cache, &dying_objects); + + /* wait for all extant objects to finish their outstanding operations + * and go away */ + _debug("wait for finish"); + wait_event(fscache_cache_cleared_wq, + atomic_read(&cache->object_count) == 0); + _debug("wait for clearance"); + wait_event(fscache_cache_cleared_wq, + list_empty(&cache->object_list)); + _debug("cleared"); + ASSERT(list_empty(&dying_objects)); + + kobject_put(cache->kobj); + + clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags); + fscache_release_cache_tag(cache->tag); + cache->tag = NULL; + + _leave(""); +} +EXPORT_SYMBOL(fscache_withdraw_cache); diff --git a/kernel/fs/fscache/cookie.c b/kernel/fs/fscache/cookie.c new file mode 100644 index 000000000..89acec742 --- /dev/null +++ b/kernel/fs/fscache/cookie.c @@ -0,0 +1,722 @@ +/* netfs cookie management + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/netfs-api.txt for more information on + * the netfs API. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include "internal.h" + +struct kmem_cache *fscache_cookie_jar; + +static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); + +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie); +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie); +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object); + +/* + * initialise an cookie jar slab element prior to any use + */ +void fscache_cookie_init_once(void *_cookie) +{ + struct fscache_cookie *cookie = _cookie; + + memset(cookie, 0, sizeof(*cookie)); + spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); + INIT_HLIST_HEAD(&cookie->backing_objects); +} + +/* + * request a cookie to represent an object (index, datafile, xattr, etc) + * - parent specifies the parent object + * - the top level index cookie for each netfs is stored in the fscache_netfs + * struct upon registration + * - def points to the definition + * - the netfs_data will be passed to the functions pointed to in *def + * - all attached caches will be searched to see if they contain this object + * - index objects aren't stored on disk until there's a dependent file that + * needs storing + * - other objects are stored in a selected cache immediately, and all the + * indices forming the path to it are instantiated if necessary + * - we never let on to the netfs about errors + * - we may set a negative cookie pointer, but that's okay + */ +struct fscache_cookie *__fscache_acquire_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + void *netfs_data, + bool enable) +{ + struct fscache_cookie *cookie; + + BUG_ON(!def); + + _enter("{%s},{%s},%p,%u", + parent ? (char *) parent->def->name : "", + def->name, netfs_data, enable); + + fscache_stat(&fscache_n_acquires); + + /* if there's no parent cookie, then we don't create one here either */ + if (!parent) { + fscache_stat(&fscache_n_acquires_null); + _leave(" [no parent]"); + return NULL; + } + + /* validate the definition */ + BUG_ON(!def->get_key); + BUG_ON(!def->name[0]); + + BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && + parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) { + fscache_stat(&fscache_n_acquires_oom); + _leave(" [ENOMEM]"); + return NULL; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + /* We keep the active count elevated until relinquishment to prevent an + * attempt to wake up every time the object operations queue quiesces. + */ + atomic_set(&cookie->n_active, 1); + + atomic_inc(&parent->usage); + atomic_inc(&parent->n_children); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); + + /* radix tree insertion won't use the preallocation pool unless it's + * told it may not wait */ + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); + + switch (cookie->def->type) { + case FSCACHE_COOKIE_TYPE_INDEX: + fscache_stat(&fscache_n_cookie_index); + break; + case FSCACHE_COOKIE_TYPE_DATAFILE: + fscache_stat(&fscache_n_cookie_data); + break; + default: + fscache_stat(&fscache_n_cookie_special); + break; + } + + if (enable) { + /* if the object is an index then we need do nothing more here + * - we create indices on disk when we need them as an index + * may exist in multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) == 0) { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } else { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } else { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } + } + + fscache_stat(&fscache_n_acquires_ok); + _leave(" = %p", cookie); + return cookie; +} +EXPORT_SYMBOL(__fscache_acquire_cookie); + +/* + * Enable a cookie to permit it to accept new operations. + */ +void __fscache_enable_cookie(struct fscache_cookie *cookie, + bool (*can_enable)(void *data), + void *data) +{ + _enter("%p", cookie); + + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, + TASK_UNINTERRUPTIBLE); + + if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) + goto out_unlock; + + if (can_enable && !can_enable(data)) { + /* The netfs decided it didn't want to enable after all */ + } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* Wait for outstanding disablement to complete */ + __fscache_wait_on_invalidate(cookie); + + if (fscache_acquire_non_index_cookie(cookie) == 0) + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } else { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } + +out_unlock: + clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); +} +EXPORT_SYMBOL(__fscache_enable_cookie); + +/* + * acquire a non-index cookie + * - this must make sure the index chain is instantiated and instantiate the + * object representation too + */ +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct fscache_cache *cache; + uint64_t i_size; + int ret; + + _enter(""); + + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + + /* now we need to see whether the backing objects for this cookie yet + * exist, if not there'll be nothing to search */ + down_read(&fscache_addremove_sem); + + if (list_empty(&fscache_cache_list)) { + up_read(&fscache_addremove_sem); + _leave(" = 0 [no caches]"); + return 0; + } + + /* select a cache in which to store the object */ + cache = fscache_select_cache_for_object(cookie->parent); + if (!cache) { + up_read(&fscache_addremove_sem); + fscache_stat(&fscache_n_acquires_no_cache); + _leave(" = -ENOMEDIUM [no cache]"); + return -ENOMEDIUM; + } + + _debug("cache %s", cache->tag->name); + + set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + + /* ask the cache to allocate objects for this cookie and its parent + * chain */ + ret = fscache_alloc_object(cache, cookie); + if (ret < 0) { + up_read(&fscache_addremove_sem); + _leave(" = %d", ret); + return ret; + } + + /* pass on how big the object we're caching is supposed to be */ + cookie->def->get_attr(cookie->netfs_data, &i_size); + + spin_lock(&cookie->lock); + if (hlist_empty(&cookie->backing_objects)) { + spin_unlock(&cookie->lock); + goto unavailable; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + fscache_set_store_limit(object, i_size); + + /* initiate the process of looking up all the objects in the chain + * (done by fscache_initialise_object()) */ + fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD); + + spin_unlock(&cookie->lock); + + /* we may be required to wait for lookup to complete at this point */ + if (!fscache_defer_lookup) { + _debug("non-deferred lookup %p", &cookie->flags); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + TASK_UNINTERRUPTIBLE); + _debug("complete"); + if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) + goto unavailable; + } + + up_read(&fscache_addremove_sem); + _leave(" = 0 [deferred]"); + return 0; + +unavailable: + up_read(&fscache_addremove_sem); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} + +/* + * recursively allocate cache object records for a cookie/cache combination + * - caller must be holding the addremove sem + */ +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie) +{ + struct fscache_object *object; + int ret; + + _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + + spin_lock(&cookie->lock); + hlist_for_each_entry(object, &cookie->backing_objects, + cookie_link) { + if (object->cache == cache) + goto object_already_extant; + } + spin_unlock(&cookie->lock); + + /* ask the cache to allocate an object (we may end up with duplicate + * objects at this stage, but we sort that out later) */ + fscache_stat(&fscache_n_cop_alloc_object); + object = cache->ops->alloc_object(cache, cookie); + fscache_stat_d(&fscache_n_cop_alloc_object); + if (IS_ERR(object)) { + fscache_stat(&fscache_n_object_no_alloc); + ret = PTR_ERR(object); + goto error; + } + + fscache_stat(&fscache_n_object_alloc); + + object->debug_id = atomic_inc_return(&fscache_object_debug_id); + + _debug("ALLOC OBJ%x: %s {%lx}", + object->debug_id, cookie->def->name, object->events); + + ret = fscache_alloc_object(cache, cookie->parent); + if (ret < 0) + goto error_put; + + /* only attach if we managed to allocate all we needed, otherwise + * discard the object we just allocated and instead use the one + * attached to the cookie */ + if (fscache_attach_object(cookie, object) < 0) { + fscache_stat(&fscache_n_cop_put_object); + cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); + } + + _leave(" = 0"); + return 0; + +object_already_extant: + ret = -ENOBUFS; + if (fscache_object_is_dead(object)) { + spin_unlock(&cookie->lock); + goto error; + } + spin_unlock(&cookie->lock); + _leave(" = 0 [found]"); + return 0; + +error_put: + fscache_stat(&fscache_n_cop_put_object); + cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * attach a cache object to a cookie + */ +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object) +{ + struct fscache_object *p; + struct fscache_cache *cache = object->cache; + int ret; + + _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + + spin_lock(&cookie->lock); + + /* there may be multiple initial creations of this object, but we only + * want one */ + ret = -EEXIST; + hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { + if (p->cache == object->cache) { + if (fscache_object_is_dying(p)) + ret = -ENOBUFS; + goto cant_attach_object; + } + } + + /* pin the parent object */ + spin_lock_nested(&cookie->parent->lock, 1); + hlist_for_each_entry(p, &cookie->parent->backing_objects, + cookie_link) { + if (p->cache == object->cache) { + if (fscache_object_is_dying(p)) { + ret = -ENOBUFS; + spin_unlock(&cookie->parent->lock); + goto cant_attach_object; + } + object->parent = p; + spin_lock(&p->lock); + p->n_children++; + spin_unlock(&p->lock); + break; + } + } + spin_unlock(&cookie->parent->lock); + + /* attach to the cache's object list */ + if (list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + } + + /* attach to the cookie */ + object->cookie = cookie; + atomic_inc(&cookie->usage); + hlist_add_head(&object->cookie_link, &cookie->backing_objects); + + fscache_objlist_add(object); + ret = 0; + +cant_attach_object: + spin_unlock(&cookie->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * Invalidate an object. Callable with spinlocks held. + */ +void __fscache_invalidate(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + + _enter("{%s}", cookie->def->name); + + fscache_stat(&fscache_n_invalidates); + + /* Only permit invalidation of data files. Invalidating an index will + * require the caller to release all its attachments to the tree rooted + * there, and if it's doing that, it may as well just retire the + * cookie. + */ + ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + + /* We will be updating the cookie too. */ + BUG_ON(!cookie->def->get_aux); + + /* If there's an object, we tell the object state machine to handle the + * invalidation on our behalf, otherwise there's nothing to do. + */ + if (!hlist_empty(&cookie->backing_objects)) { + spin_lock(&cookie->lock); + + if (fscache_cookie_enabled(cookie) && + !hlist_empty(&cookie->backing_objects) && + !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, + &cookie->flags)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, + cookie_link); + if (fscache_object_is_live(object)) + fscache_raise_event( + object, FSCACHE_OBJECT_EV_INVALIDATE); + } + + spin_unlock(&cookie->lock); + } + + _leave(""); +} +EXPORT_SYMBOL(__fscache_invalidate); + +/* + * Wait for object invalidation to complete. + */ +void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) +{ + _enter("%p", cookie); + + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, + TASK_UNINTERRUPTIBLE); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_wait_on_invalidate); + +/* + * update the index entries backing a cookie + */ +void __fscache_update_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + + fscache_stat(&fscache_n_updates); + + if (!cookie) { + fscache_stat(&fscache_n_updates_null); + _leave(" [no cookie]"); + return; + } + + _enter("{%s}", cookie->def->name); + + BUG_ON(!cookie->def->get_aux); + + spin_lock(&cookie->lock); + + if (fscache_cookie_enabled(cookie)) { + /* update the index entry on disk in each cache backing this + * cookie. + */ + hlist_for_each_entry(object, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } + } + + spin_unlock(&cookie->lock); + _leave(""); +} +EXPORT_SYMBOL(__fscache_update_cookie); + +/* + * Disable a cookie to stop it from accepting new requests from the netfs. + */ +void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) +{ + struct fscache_object *object; + bool awaken = false; + + _enter("%p,%u", cookie, invalidate); + + ASSERTCMP(atomic_read(&cookie->n_active), >, 0); + + if (atomic_read(&cookie->n_children) != 0) { + pr_err("Cookie '%s' still has children\n", + cookie->def->name); + BUG(); + } + + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, + TASK_UNINTERRUPTIBLE); + if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) + goto out_unlock_enable; + + /* If the cookie is being invalidated, wait for that to complete first + * so that we can reuse the flag. + */ + __fscache_wait_on_invalidate(cookie); + + /* Dispose of the backing objects */ + set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags); + + spin_lock(&cookie->lock); + if (!hlist_empty(&cookie->backing_objects)) { + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { + if (invalidate) + set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + } + } else { + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + awaken = true; + } + spin_unlock(&cookie->lock); + if (awaken) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + + /* Wait for cessation of activity requiring access to the netfs (when + * n_active reaches 0). This makes sure outstanding reads and writes + * have completed. + */ + if (!atomic_dec_and_test(&cookie->n_active)) + wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, + TASK_UNINTERRUPTIBLE); + + /* Reset the cookie state if it wasn't relinquished */ + if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { + atomic_inc(&cookie->n_active); + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + } + +out_unlock_enable: + clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); + _leave(""); +} +EXPORT_SYMBOL(__fscache_disable_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) +{ + fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p,%d},%d", + cookie, cookie->def->name, cookie->netfs_data, + atomic_read(&cookie->n_active), retire); + + /* No further netfs-accessing operations on this cookie permitted */ + set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); + + __fscache_disable_cookie(cookie, retire); + + /* Clear pointers back to the netfs */ + cookie->netfs_data = NULL; + cookie->def = NULL; + BUG_ON(cookie->stores.rnode); + + if (cookie->parent) { + ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); + atomic_dec(&cookie->parent->n_children); + } + + /* Dispose of the netfs's link to the cookie */ + ASSERTCMP(atomic_read(&cookie->usage), >, 0); + fscache_cookie_put(cookie); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); + +/* + * destroy a cookie + */ +void __fscache_cookie_put(struct fscache_cookie *cookie) +{ + struct fscache_cookie *parent; + + _enter("%p", cookie); + + for (;;) { + _debug("FREE COOKIE %p", cookie); + parent = cookie->parent; + BUG_ON(!hlist_empty(&cookie->backing_objects)); + kmem_cache_free(fscache_cookie_jar, cookie); + + if (!parent) + break; + + cookie = parent; + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (!atomic_dec_and_test(&cookie->usage)) + break; + } + + _leave(""); +} + +/* + * check the consistency between the netfs inode and the backing cache + * + * NOTE: it only serves no-index type + */ +int __fscache_check_consistency(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + bool wake_cookie = false; + int ret; + + _enter("%p,", cookie); + + ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + if (hlist_empty(&cookie->backing_objects)) + return 0; + + op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!op) + return -ENOMEM; + + fscache_operation_init(op, NULL, NULL); + op->flags = FSCACHE_OP_MYTHREAD | + (1 << FSCACHE_OP_WAITING) | + (1 << FSCACHE_OP_UNUSE_COOKIE); + + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto inconsistent; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto inconsistent; + + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + + __fscache_use_cookie(cookie); + if (fscache_submit_op(object, op) < 0) + goto submit_failed; + + /* the work queue now carries its own ref on the object */ + spin_unlock(&cookie->lock); + + ret = fscache_wait_for_operation_activation(object, op, + NULL, NULL, NULL); + if (ret == 0) { + /* ask the cache to honour the operation */ + ret = object->cache->ops->check_consistency(op); + fscache_op_complete(op, false); + } else if (ret == -ENOBUFS) { + ret = 0; + } + + fscache_put_operation(op); + _leave(" = %d", ret); + return ret; + +submit_failed: + wake_cookie = __fscache_unuse_cookie(cookie); +inconsistent: + spin_unlock(&cookie->lock); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); + kfree(op); + _leave(" = -ESTALE"); + return -ESTALE; +} +EXPORT_SYMBOL(__fscache_check_consistency); diff --git a/kernel/fs/fscache/fsdef.c b/kernel/fs/fscache/fsdef.c new file mode 100644 index 000000000..5a117df2a --- /dev/null +++ b/kernel/fs/fscache/fsdef.c @@ -0,0 +1,146 @@ +/* Filesystem index definition + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include "internal.h" + +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static +enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen); + +/* + * The root index is owned by FS-Cache itself. + * + * When a netfs requests caching facilities, FS-Cache will, if one doesn't + * already exist, create an entry in the root index with the key being the name + * of the netfs ("AFS" for example), and the auxiliary data holding the index + * structure version supplied by the netfs: + * + * FSDEF + * | + * +-----------+ + * | | + * NFS AFS + * [v=1] [v=1] + * + * If an entry with the appropriate name does already exist, the version is + * compared. If the version is different, the entire subtree from that entry + * will be discarded and a new entry created. + * + * The new entry will be an index, and a cookie referring to it will be passed + * to the netfs. This is then the root handle by which the netfs accesses the + * cache. It can create whatever objects it likes in that index, including + * further indices. + */ +static struct fscache_cookie_def fscache_fsdef_index_def = { + .name = ".FS-Cache", + .type = FSCACHE_COOKIE_TYPE_INDEX, +}; + +struct fscache_cookie fscache_fsdef_index = { + .usage = ATOMIC_INIT(1), + .n_active = ATOMIC_INIT(1), + .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), + .backing_objects = HLIST_HEAD_INIT, + .def = &fscache_fsdef_index_def, + .flags = 1 << FSCACHE_COOKIE_ENABLED, +}; +EXPORT_SYMBOL(fscache_fsdef_index); + +/* + * Definition of an entry in the root index. Each entry is an index, keyed to + * a specific netfs and only applicable to a particular version of the index + * structure used by that netfs. + */ +struct fscache_cookie_def fscache_fsdef_netfs_def = { + .name = "FSDEF.netfs", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = fscache_fsdef_netfs_get_key, + .get_aux = fscache_fsdef_netfs_get_aux, + .check_aux = fscache_fsdef_netfs_check_aux, +}; + +/* + * get the key data for an FSDEF index record - this is the name of the netfs + * for which this entry is created + */ +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned klen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + klen = strlen(netfs->name); + if (klen > bufmax) + return 0; + + memcpy(buffer, netfs->name, klen); + return klen; +} + +/* + * get the auxiliary data for an FSDEF index record - this is the index + * structure version number of the netfs for which this version is created + */ +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned dlen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + dlen = sizeof(uint32_t); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &netfs->version, dlen); + return dlen; +} + +/* + * check that the index structure version number stored in the auxiliary data + * matches the one the netfs gave us + */ +static enum fscache_checkaux fscache_fsdef_netfs_check_aux( + void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct fscache_netfs *netfs = cookie_netfs_data; + uint32_t version; + + _enter("{%s},,%hu", netfs->name, datalen); + + if (datalen != sizeof(version)) { + _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version)); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + memcpy(&version, data, sizeof(version)); + if (version != netfs->version) { + _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} diff --git a/kernel/fs/fscache/histogram.c b/kernel/fs/fscache/histogram.c new file mode 100644 index 000000000..7d637e233 --- /dev/null +++ b/kernel/fs/fscache/histogram.c @@ -0,0 +1,107 @@ +/* FS-Cache latency histogram + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +atomic_t fscache_obj_instantiate_histogram[HZ]; +atomic_t fscache_objs_histogram[HZ]; +atomic_t fscache_ops_histogram[HZ]; +atomic_t fscache_retrieval_delay_histogram[HZ]; +atomic_t fscache_retrieval_histogram[HZ]; + +/* + * display the time-taken histogram + */ +static int fscache_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned n[5], t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); + n[1] = atomic_read(&fscache_ops_histogram[index]); + n[2] = atomic_read(&fscache_objs_histogram[index]); + n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); + n[4] = atomic_read(&fscache_retrieval_histogram[index]); + if (!(n[0] | n[1] | n[2] | n[3] | n[4])) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", + index, t, n[0], n[1], n[2], n[3], n[4]); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void fscache_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations fscache_histogram_ops = { + .start = fscache_histogram_start, + .stop = fscache_histogram_stop, + .next = fscache_histogram_next, + .show = fscache_histogram_show, +}; + +/* + * open "/proc/fs/fscache/histogram" to provide latency data + */ +static int fscache_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fscache_histogram_ops); +} + +const struct file_operations fscache_histogram_fops = { + .owner = THIS_MODULE, + .open = fscache_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/fs/fscache/internal.h b/kernel/fs/fscache/internal.h new file mode 100644 index 000000000..7872a62ef --- /dev/null +++ b/kernel/fs/fscache/internal.h @@ -0,0 +1,464 @@ +/* Internal definitions for FS-Cache + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Lock order, in the order in which multiple locks should be obtained: + * - fscache_addremove_sem + * - cookie->lock + * - cookie->parent->lock + * - cache->object_list_lock + * - object->lock + * - object->parent->lock + * - cookie->stores_lock + * - fscache_thread_lock + * + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "FS-Cache: " fmt + +#include +#include + +#define FSCACHE_MIN_THREADS 4 +#define FSCACHE_MAX_THREADS 32 + +/* + * cache.c + */ +extern struct list_head fscache_cache_list; +extern struct rw_semaphore fscache_addremove_sem; + +extern struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *); + +/* + * cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; + +extern void fscache_cookie_init_once(void *); +extern void __fscache_cookie_put(struct fscache_cookie *); + +/* + * fsdef.c + */ +extern struct fscache_cookie fscache_fsdef_index; +extern struct fscache_cookie_def fscache_fsdef_netfs_def; + +/* + * histogram.c + */ +#ifdef CONFIG_FSCACHE_HISTOGRAM +extern atomic_t fscache_obj_instantiate_histogram[HZ]; +extern atomic_t fscache_objs_histogram[HZ]; +extern atomic_t fscache_ops_histogram[HZ]; +extern atomic_t fscache_retrieval_delay_histogram[HZ]; +extern atomic_t fscache_retrieval_histogram[HZ]; + +static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +extern const struct file_operations fscache_histogram_fops; + +#else +#define fscache_hist(hist, start_jif) do {} while (0) +#endif + +/* + * main.c + */ +extern unsigned fscache_defer_lookup; +extern unsigned fscache_defer_create; +extern unsigned fscache_debug; +extern struct kobject *fscache_root; +extern struct workqueue_struct *fscache_object_wq; +extern struct workqueue_struct *fscache_op_wq; +DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +static inline bool fscache_object_congested(void) +{ + return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); +} + +extern int fscache_wait_atomic_t(atomic_t *); + +/* + * object.c + */ +extern void fscache_enqueue_object(struct fscache_object *); + +/* + * object-list.c + */ +#ifdef CONFIG_FSCACHE_OBJECT_LIST +extern const struct file_operations fscache_objlist_fops; + +extern void fscache_objlist_add(struct fscache_object *); +extern void fscache_objlist_remove(struct fscache_object *); +#else +#define fscache_objlist_add(object) do {} while(0) +#define fscache_objlist_remove(object) do {} while(0) +#endif + +/* + * operation.c + */ +extern int fscache_submit_exclusive_op(struct fscache_object *, + struct fscache_operation *); +extern int fscache_submit_op(struct fscache_object *, + struct fscache_operation *); +extern int fscache_cancel_op(struct fscache_operation *, + void (*)(struct fscache_operation *)); +extern void fscache_cancel_all_ops(struct fscache_object *); +extern void fscache_abort_object(struct fscache_object *); +extern void fscache_start_operations(struct fscache_object *); +extern void fscache_operation_gc(struct work_struct *); + +/* + * page.c + */ +extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); +extern int fscache_wait_for_operation_activation(struct fscache_object *, + struct fscache_operation *, + atomic_t *, + atomic_t *, + void (*)(struct fscache_operation *)); +extern void fscache_invalidate_writes(struct fscache_cookie *); + +/* + * proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; +extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; + +extern atomic_t fscache_n_op_pend; +extern atomic_t fscache_n_op_run; +extern atomic_t fscache_n_op_enqueue; +extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_release; +extern atomic_t fscache_n_op_gc; +extern atomic_t fscache_n_op_cancelled; +extern atomic_t fscache_n_op_rejected; + +extern atomic_t fscache_n_attr_changed; +extern atomic_t fscache_n_attr_changed_ok; +extern atomic_t fscache_n_attr_changed_nobufs; +extern atomic_t fscache_n_attr_changed_nomem; +extern atomic_t fscache_n_attr_changed_calls; + +extern atomic_t fscache_n_allocs; +extern atomic_t fscache_n_allocs_ok; +extern atomic_t fscache_n_allocs_wait; +extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_allocs_intr; +extern atomic_t fscache_n_allocs_object_dead; +extern atomic_t fscache_n_alloc_ops; +extern atomic_t fscache_n_alloc_op_waits; + +extern atomic_t fscache_n_retrievals; +extern atomic_t fscache_n_retrievals_ok; +extern atomic_t fscache_n_retrievals_wait; +extern atomic_t fscache_n_retrievals_nodata; +extern atomic_t fscache_n_retrievals_nobufs; +extern atomic_t fscache_n_retrievals_intr; +extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrievals_object_dead; +extern atomic_t fscache_n_retrieval_ops; +extern atomic_t fscache_n_retrieval_op_waits; + +extern atomic_t fscache_n_stores; +extern atomic_t fscache_n_stores_ok; +extern atomic_t fscache_n_stores_again; +extern atomic_t fscache_n_stores_nobufs; +extern atomic_t fscache_n_stores_oom; +extern atomic_t fscache_n_store_ops; +extern atomic_t fscache_n_store_calls; +extern atomic_t fscache_n_store_pages; +extern atomic_t fscache_n_store_radix_deletes; +extern atomic_t fscache_n_store_pages_over_limit; + +extern atomic_t fscache_n_store_vmscan_not_storing; +extern atomic_t fscache_n_store_vmscan_gone; +extern atomic_t fscache_n_store_vmscan_busy; +extern atomic_t fscache_n_store_vmscan_cancelled; +extern atomic_t fscache_n_store_vmscan_wait; + +extern atomic_t fscache_n_marks; +extern atomic_t fscache_n_uncaches; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_null; +extern atomic_t fscache_n_acquires_no_cache; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_nobufs; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_invalidates; +extern atomic_t fscache_n_invalidates_run; + +extern atomic_t fscache_n_updates; +extern atomic_t fscache_n_updates_null; +extern atomic_t fscache_n_updates_run; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_null; +extern atomic_t fscache_n_relinquishes_waitcrt; +extern atomic_t fscache_n_relinquishes_retire; + +extern atomic_t fscache_n_cookie_index; +extern atomic_t fscache_n_cookie_data; +extern atomic_t fscache_n_cookie_special; + +extern atomic_t fscache_n_object_alloc; +extern atomic_t fscache_n_object_no_alloc; +extern atomic_t fscache_n_object_lookups; +extern atomic_t fscache_n_object_lookups_negative; +extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_lookups_timed_out; +extern atomic_t fscache_n_object_created; +extern atomic_t fscache_n_object_avail; +extern atomic_t fscache_n_object_dead; + +extern atomic_t fscache_n_checkaux_none; +extern atomic_t fscache_n_checkaux_okay; +extern atomic_t fscache_n_checkaux_update; +extern atomic_t fscache_n_checkaux_obsolete; + +extern atomic_t fscache_n_cop_alloc_object; +extern atomic_t fscache_n_cop_lookup_object; +extern atomic_t fscache_n_cop_lookup_complete; +extern atomic_t fscache_n_cop_grab_object; +extern atomic_t fscache_n_cop_invalidate_object; +extern atomic_t fscache_n_cop_update_object; +extern atomic_t fscache_n_cop_drop_object; +extern atomic_t fscache_n_cop_put_object; +extern atomic_t fscache_n_cop_sync_cache; +extern atomic_t fscache_n_cop_attr_changed; +extern atomic_t fscache_n_cop_read_or_alloc_page; +extern atomic_t fscache_n_cop_read_or_alloc_pages; +extern atomic_t fscache_n_cop_allocate_page; +extern atomic_t fscache_n_cop_allocate_pages; +extern atomic_t fscache_n_cop_write_page; +extern atomic_t fscache_n_cop_uncache_page; +extern atomic_t fscache_n_cop_dissociate_pages; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + +#define __fscache_stat(stat) (stat) + +extern const struct file_operations fscache_stats_fops; +#else + +#define __fscache_stat(stat) (NULL) +#define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) +#endif + +/* + * raise an event on an object + * - if the event is not masked for that object, then the object is + * queued for attention by the thread pool. + */ +static inline void fscache_raise_event(struct fscache_object *object, + unsigned event) +{ + BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); +#if 0 + printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n", + object->debug_id, object->event_mask, (1 << event)); +#endif + if (!test_and_set_bit(event, &object->events) && + test_bit(event, &object->event_mask)) + fscache_enqueue_object(object); +} + +/* + * drop a reference to a cookie + */ +static inline void fscache_cookie_put(struct fscache_cookie *cookie) +{ + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (atomic_dec_and_test(&cookie->usage)) + __fscache_cookie_put(cookie); +} + +/* + * get an extra reference to a netfs retrieval context + */ +static inline +void *fscache_get_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->get_context) + cookie->def->get_context(cookie->netfs_data, context); + return context; +} + +/* + * release a reference to a netfs retrieval context + */ +static inline +void fscache_put_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->put_context) + cookie->def->put_context(cookie->netfs_data, context); +} + +/*****************************************************************************/ +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + +#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__) + +#ifdef __KDEBUG +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_FSCACHE_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (__do_kdebug(ENTER)) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (__do_kdebug(LEAVE)) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (__do_kdebug(DEBUG)) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) +#endif + +/* + * determine whether a particular optional debugging point should be logged + * - we need to go through three steps to persuade cpp to correctly join the + * shorthand in FSCACHE_DEBUG_LEVEL with its prefix + */ +#define ____do_kdebug(LEVEL, POINT) \ + unlikely((fscache_debug & \ + (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) +#define ___do_kdebug(LEVEL, POINT) \ + ____do_kdebug(LEVEL, POINT) +#define __do_kdebug(POINT) \ + ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) + +#define FSCACHE_DEBUG_CACHE 0 +#define FSCACHE_DEBUG_COOKIE 1 +#define FSCACHE_DEBUG_PAGE 2 +#define FSCACHE_DEBUG_OPERATION 3 + +#define FSCACHE_POINT_ENTER 1 +#define FSCACHE_POINT_LEAVE 2 +#define FSCACHE_POINT_DEBUG 4 + +#ifndef FSCACHE_DEBUG_LEVEL +#define FSCACHE_DEBUG_LEVEL CACHE +#endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/kernel/fs/fscache/main.c b/kernel/fs/fscache/main.c new file mode 100644 index 000000000..b39d487cc --- /dev/null +++ b/kernel/fs/fscache/main.c @@ -0,0 +1,206 @@ +/* General filesystem local caching manager + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("FS Cache Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned fscache_defer_lookup = 1; +module_param_named(defer_lookup, fscache_defer_lookup, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_lookup, + "Defer cookie lookup to background thread"); + +unsigned fscache_defer_create = 1; +module_param_named(defer_create, fscache_defer_create, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_create, + "Defer cookie creation to background thread"); + +unsigned fscache_debug; +module_param_named(debug, fscache_debug, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_debug, + "FS-Cache debugging mask"); + +struct kobject *fscache_root; +struct workqueue_struct *fscache_object_wq; +struct workqueue_struct *fscache_op_wq; + +DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +/* these values serve as lower bounds, will be adjusted in fscache_init() */ +static unsigned fscache_object_max_active = 4; +static unsigned fscache_op_max_active = 2; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fscache_sysctl_header; + +static int fscache_max_active_sysctl(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct workqueue_struct **wqp = table->extra1; + unsigned int *datap = table->data; + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0) + workqueue_set_max_active(*wqp, *datap); + return ret; +} + +static struct ctl_table fscache_sysctls[] = { + { + .procname = "object_max_active", + .data = &fscache_object_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_object_wq, + }, + { + .procname = "operation_max_active", + .data = &fscache_op_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_op_wq, + }, + {} +}; + +static struct ctl_table fscache_sysctls_root[] = { + { + .procname = "fscache", + .mode = 0555, + .child = fscache_sysctls, + }, + {} +}; +#endif + +/* + * initialise the fs caching module + */ +static int __init fscache_init(void) +{ + unsigned int nr_cpus = num_possible_cpus(); + unsigned int cpu; + int ret; + + fscache_object_max_active = + clamp_val(nr_cpus, + fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, + fscache_object_max_active); + if (!fscache_object_wq) + goto error_object_wq; + + fscache_op_max_active = + clamp_val(fscache_object_max_active / 2, + fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, + fscache_op_max_active); + if (!fscache_op_wq) + goto error_op_wq; + + for_each_possible_cpu(cpu) + init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); + + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; + +#ifdef CONFIG_SYSCTL + ret = -ENOMEM; + fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); + if (!fscache_sysctl_header) + goto error_sysctl; +#endif + + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", + sizeof(struct fscache_cookie), + 0, + 0, + fscache_cookie_init_once); + if (!fscache_cookie_jar) { + pr_notice("Failed to allocate a cookie jar\n"); + ret = -ENOMEM; + goto error_cookie_jar; + } + + fscache_root = kobject_create_and_add("fscache", kernel_kobj); + if (!fscache_root) + goto error_kobj; + + pr_notice("Loaded\n"); + return 0; + +error_kobj: + kmem_cache_destroy(fscache_cookie_jar); +error_cookie_jar: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +error_sysctl: +#endif + fscache_proc_cleanup(); +error_proc: + destroy_workqueue(fscache_op_wq); +error_op_wq: + destroy_workqueue(fscache_object_wq); +error_object_wq: + return ret; +} + +fs_initcall(fscache_init); + +/* + * clean up on module removal + */ +static void __exit fscache_exit(void) +{ + _enter(""); + + kobject_put(fscache_root); + kmem_cache_destroy(fscache_cookie_jar); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +#endif + fscache_proc_cleanup(); + destroy_workqueue(fscache_op_wq); + destroy_workqueue(fscache_object_wq); + pr_notice("Unloaded\n"); +} + +module_exit(fscache_exit); + +/* + * wait_on_atomic_t() sleep function for uninterruptible waiting + */ +int fscache_wait_atomic_t(atomic_t *p) +{ + schedule(); + return 0; +} diff --git a/kernel/fs/fscache/netfs.c b/kernel/fs/fscache/netfs.c new file mode 100644 index 000000000..6d941f56f --- /dev/null +++ b/kernel/fs/fscache/netfs.c @@ -0,0 +1,104 @@ +/* FS-Cache netfs (client) registration + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include "internal.h" + +static LIST_HEAD(fscache_netfs_list); + +/* + * register a network filesystem for caching + */ +int __fscache_register_netfs(struct fscache_netfs *netfs) +{ + struct fscache_netfs *ptr; + int ret; + + _enter("{%s}", netfs->name); + + INIT_LIST_HEAD(&netfs->link); + + /* allocate a cookie for the primary index */ + netfs->primary_index = + kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); + + if (!netfs->primary_index) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* initialise the primary index cookie */ + atomic_set(&netfs->primary_index->usage, 1); + atomic_set(&netfs->primary_index->n_children, 0); + atomic_set(&netfs->primary_index->n_active, 1); + + netfs->primary_index->def = &fscache_fsdef_netfs_def; + netfs->primary_index->parent = &fscache_fsdef_index; + netfs->primary_index->netfs_data = netfs; + netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED; + + atomic_inc(&netfs->primary_index->parent->usage); + atomic_inc(&netfs->primary_index->parent->n_children); + + spin_lock_init(&netfs->primary_index->lock); + INIT_HLIST_HEAD(&netfs->primary_index->backing_objects); + + /* check the netfs type is not already present */ + down_write(&fscache_addremove_sem); + + ret = -EEXIST; + list_for_each_entry(ptr, &fscache_netfs_list, link) { + if (strcmp(ptr->name, netfs->name) == 0) + goto already_registered; + } + + list_add(&netfs->link, &fscache_netfs_list); + ret = 0; + + pr_notice("Netfs '%s' registered for caching\n", netfs->name); + +already_registered: + up_write(&fscache_addremove_sem); + + if (ret < 0) { + netfs->primary_index->parent = NULL; + __fscache_cookie_put(netfs->primary_index); + netfs->primary_index = NULL; + } + + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(__fscache_register_netfs); + +/* + * unregister a network filesystem from the cache + * - all cookies must have been released first + */ +void __fscache_unregister_netfs(struct fscache_netfs *netfs) +{ + _enter("{%s.%u}", netfs->name, netfs->version); + + down_write(&fscache_addremove_sem); + + list_del(&netfs->link); + fscache_relinquish_cookie(netfs->primary_index, 0); + + up_write(&fscache_addremove_sem); + + pr_notice("Netfs '%s' unregistered from caching\n", + netfs->name); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/kernel/fs/fscache/object-list.c b/kernel/fs/fscache/object-list.c new file mode 100644 index 000000000..51dde817e --- /dev/null +++ b/kernel/fs/fscache/object-list.c @@ -0,0 +1,412 @@ +/* Global fscache object list maintainer and viewer + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include +#include +#include +#include "internal.h" + +static struct rb_root fscache_object_list; +static DEFINE_RWLOCK(fscache_object_list_lock); + +struct fscache_objlist_data { + unsigned long config; /* display configuration */ +#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */ +#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */ +#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */ +#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */ +#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */ +#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */ +#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */ +#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */ +#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */ +#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ +#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ +#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ + + u8 buf[512]; /* key and aux data buffer */ +}; + +/* + * Add an object to the object list + * - we use the address of the fscache_object structure as the key into the + * tree + */ +void fscache_objlist_add(struct fscache_object *obj) +{ + struct fscache_object *xobj; + struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; + + ASSERT(RB_EMPTY_NODE(&obj->objlist_link)); + + write_lock(&fscache_object_list_lock); + + while (*p) { + parent = *p; + xobj = rb_entry(parent, struct fscache_object, objlist_link); + + if (obj < xobj) + p = &(*p)->rb_left; + else if (obj > xobj) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&obj->objlist_link, parent, p); + rb_insert_color(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} + +/* + * Remove an object from the object list. + */ +void fscache_objlist_remove(struct fscache_object *obj) +{ + if (RB_EMPTY_NODE(&obj->objlist_link)) + return; + + write_lock(&fscache_object_list_lock); + + BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); + rb_erase(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} + +/* + * find the object in the tree on or after the specified index + */ +static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) +{ + struct fscache_object *pobj, *obj = NULL, *minobj = NULL; + struct rb_node *p; + unsigned long pos; + + if (*_pos >= (unsigned long) ERR_PTR(-ENOENT)) + return NULL; + pos = *_pos; + + /* banners (can't represent line 0 by pos 0 as that would involve + * returning a NULL pointer) */ + if (pos == 0) + return (struct fscache_object *)(long)++(*_pos); + if (pos < 3) + return (struct fscache_object *)pos; + + pobj = (struct fscache_object *)pos; + p = fscache_object_list.rb_node; + while (p) { + obj = rb_entry(p, struct fscache_object, objlist_link); + if (pobj < obj) { + if (!minobj || minobj > obj) + minobj = obj; + p = p->rb_left; + } else if (pobj > obj) { + p = p->rb_right; + } else { + minobj = obj; + break; + } + obj = NULL; + } + + if (!minobj) + *_pos = (unsigned long) ERR_PTR(-ENOENT); + else if (minobj != obj) + *_pos = (unsigned long) minobj; + return minobj; +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos) + __acquires(&fscache_object_list_lock) +{ + read_lock(&fscache_object_list_lock); + return fscache_objlist_lookup(_pos); +} + +/* + * move to the next line + */ +static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos) +{ + (*_pos)++; + return fscache_objlist_lookup(_pos); +} + +/* + * clean up after reading + */ +static void fscache_objlist_stop(struct seq_file *m, void *v) + __releases(&fscache_object_list_lock) +{ + read_unlock(&fscache_object_list_lock); +} + +/* + * display an object + */ +static int fscache_objlist_show(struct seq_file *m, void *v) +{ + struct fscache_objlist_data *data = m->private; + struct fscache_object *obj = v; + struct fscache_cookie *cookie; + unsigned long config = data->config; + char _type[3], *type; + u8 *buf = data->buf, *p; + + if ((unsigned long) v == 1) { + seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" + " EM EV FL S" + " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " "); + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_puts(m, "OBJECT_KEY"); + if ((config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) == + (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, ", "); + if (config & FSCACHE_OBJLIST_CONFIG_AUX) + seq_puts(m, "AUX_DATA"); + seq_puts(m, "\n"); + return 0; + } + + if ((unsigned long) v == 2) { + seq_puts(m, "======== ======== ==== ===== === === === == =====" + " == == == =" + " | ================ == == ================"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " ================"); + seq_puts(m, "\n"); + return 0; + } + + /* filter out any unwanted objects */ +#define FILTER(criterion, _yes, _no) \ + do { \ + unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \ + unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \ + if (criterion) { \ + if (!(config & yes)) \ + return 0; \ + } else { \ + if (!(config & no)) \ + return 0; \ + } \ + } while(0) + + cookie = obj->cookie; + if (~config) { + FILTER(cookie->def, + COOKIE, NOCOOKIE); + FILTER(fscache_object_is_active(obj) || + obj->n_ops != 0 || + obj->n_obj_ops != 0 || + obj->flags || + !list_empty(&obj->dependents), + BUSY, IDLE); + FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags), + PENDWR, NOPENDWR); + FILTER(atomic_read(&obj->n_reads), + READS, NOREADS); + FILTER(obj->events & obj->event_mask, + EVENTS, NOEVENTS); + FILTER(work_busy(&obj->work), WORK, NOWORK); + } + + seq_printf(m, + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ", + obj->debug_id, + obj->parent ? obj->parent->debug_id : -1, + obj->state->short_name, + obj->n_children, + obj->n_ops, + obj->n_obj_ops, + obj->n_in_progress, + obj->n_exclusive, + atomic_read(&obj->n_reads), + obj->event_mask, + obj->events, + obj->flags, + work_busy(&obj->work)); + + if (fscache_use_cookie(obj)) { + uint16_t keylen = 0, auxlen = 0; + + switch (cookie->def->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + sprintf(_type, "%02u", cookie->def->type); + type = _type; + break; + } + + seq_printf(m, "%-16s %s %2lx %16p", + cookie->def->name, + type, + cookie->flags, + cookie->netfs_data); + + if (cookie->def->get_key && + config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = cookie->def->get_key(cookie->netfs_data, + buf, 400); + + if (cookie->def->get_aux && + config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = cookie->def->get_aux(cookie->netfs_data, + buf + keylen, 512 - keylen); + fscache_unuse_cookie(obj); + + if (keylen > 0 || auxlen > 0) { + seq_puts(m, " "); + for (p = buf; keylen > 0; keylen--) + seq_printf(m, "%02x", *p++); + if (auxlen > 0) { + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_puts(m, ", "); + for (; auxlen > 0; auxlen--) + seq_printf(m, "%02x", *p++); + } + } + + seq_puts(m, "\n"); + } else { + seq_puts(m, "\n"); + } + return 0; +} + +static const struct seq_operations fscache_objlist_ops = { + .start = fscache_objlist_start, + .stop = fscache_objlist_stop, + .next = fscache_objlist_next, + .show = fscache_objlist_show, +}; + +/* + * get the configuration for filtering the list + */ +static void fscache_objlist_config(struct fscache_objlist_data *data) +{ +#ifdef CONFIG_KEYS + struct user_key_payload *confkey; + unsigned long config; + struct key *key; + const char *buf; + int len; + + key = request_key(&key_type_user, "fscache:objlist", NULL); + if (IS_ERR(key)) + goto no_config; + + config = 0; + rcu_read_lock(); + + confkey = key->payload.data; + buf = confkey->data; + + for (len = confkey->datalen - 1; len >= 0; len--) { + switch (buf[len]) { + case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break; + case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break; + case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break; + case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break; + case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break; + case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break; + case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break; + case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break; + case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break; + case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break; + case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break; + case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break; + } + } + + rcu_read_unlock(); + key_put(key); + + if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE))) + config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE))) + config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR))) + config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR; + if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS))) + config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS))) + config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK))) + config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK; + + data->config = config; + return; + +no_config: +#endif + data->config = ULONG_MAX; +} + +/* + * open "/proc/fs/fscache/objects" to provide a list of active objects + * - can be configured by a user-defined key added to the caller's keyrings + */ +static int fscache_objlist_open(struct inode *inode, struct file *file) +{ + struct fscache_objlist_data *data; + + data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data)); + if (!data) + return -ENOMEM; + + /* get the configuration key */ + fscache_objlist_config(data); + + return 0; +} + +/* + * clean up on close + */ +static int fscache_objlist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + kfree(m->private); + m->private = NULL; + return seq_release(inode, file); +} + +const struct file_operations fscache_objlist_fops = { + .owner = THIS_MODULE, + .open = fscache_objlist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fscache_objlist_release, +}; diff --git a/kernel/fs/fscache/object.c b/kernel/fs/fscache/object.c new file mode 100644 index 000000000..da032daf0 --- /dev/null +++ b/kernel/fs/fscache/object.c @@ -0,0 +1,1018 @@ +/* FS-Cache object state machine handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/object.txt for a description of the + * object state machine and the in-kernel representations. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include +#include "internal.h" + +static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int); +static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int); +static const struct fscache_state *fscache_drop_object(struct fscache_object *, int); +static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int); +static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int); +static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int); +static const struct fscache_state *fscache_kill_object(struct fscache_object *, int); +static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int); +static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int); +static const struct fscache_state *fscache_object_available(struct fscache_object *, int); +static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); +static const struct fscache_state *fscache_update_object(struct fscache_object *, int); + +#define __STATE_NAME(n) fscache_osm_##n +#define STATE(n) (&__STATE_NAME(n)) + +/* + * Define a work state. Work states are execution states. No event processing + * is performed by them. The function attached to a work state returns a + * pointer indicating the next state to which the state machine should + * transition. Returning NO_TRANSIT repeats the current state, but goes back + * to the scheduler first. + */ +#define WORK_STATE(n, sn, f) \ + const struct fscache_state __STATE_NAME(n) = { \ + .name = #n, \ + .short_name = sn, \ + .work = f \ + } + +/* + * Returns from work states. + */ +#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); }) + +#define NO_TRANSIT ((struct fscache_state *)NULL) + +/* + * Define a wait state. Wait states are event processing states. No execution + * is performed by them. Wait states are just tables of "if event X occurs, + * clear it and transition to state Y". The dispatcher returns to the + * scheduler if none of the events in which the wait state has an interest are + * currently pending. + */ +#define WAIT_STATE(n, sn, ...) \ + const struct fscache_state __STATE_NAME(n) = { \ + .name = #n, \ + .short_name = sn, \ + .work = NULL, \ + .transitions = { __VA_ARGS__, { 0, NULL } } \ + } + +#define TRANSIT_TO(state, emask) \ + { .events = (emask), .transit_to = STATE(state) } + +/* + * The object state machine. + */ +static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); +static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); +static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); +static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); +static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object); +static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); +static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); + +static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object); +static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object); + +static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); +static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); +static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); +static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); +static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); + +static WAIT_STATE(WAIT_FOR_INIT, "?INI", + TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); + +static WAIT_STATE(WAIT_FOR_PARENT, "?PRN", + TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY)); + +static WAIT_STATE(WAIT_FOR_CMD, "?CMD", + TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE), + TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE), + TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); + +static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR", + TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED)); + +/* + * Out-of-band event transition tables. These are for handling unexpected + * events, such as an I/O error. If an OOB event occurs, the state machine + * clears and disables the event and forces a transition to the nominated work + * state (acurrently executing work states will complete first). + * + * In such a situation, object->state remembers the state the machine should + * have been in/gone to and returning NO_TRANSIT returns to that. + */ +static const struct fscache_transition fscache_osm_init_oob[] = { + TRANSIT_TO(ABORT_INIT, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } +}; + +static const struct fscache_transition fscache_osm_lookup_oob[] = { + TRANSIT_TO(LOOKUP_FAILURE, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } +}; + +static const struct fscache_transition fscache_osm_run_oob[] = { + TRANSIT_TO(KILL_OBJECT, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } +}; + +static int fscache_get_object(struct fscache_object *); +static void fscache_put_object(struct fscache_object *); +static bool fscache_enqueue_dependents(struct fscache_object *, int); +static void fscache_dequeue_object(struct fscache_object *); + +/* + * we need to notify the parent when an op completes that we had outstanding + * upon it + */ +static inline void fscache_done_parent_op(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + + _enter("OBJ%x {OBJ%x,%x}", + object->debug_id, parent->debug_id, parent->n_ops); + + spin_lock_nested(&parent->lock, 1); + parent->n_obj_ops--; + parent->n_ops--; + if (parent->n_ops == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); +} + +/* + * Object state machine dispatcher. + */ +static void fscache_object_sm_dispatcher(struct fscache_object *object) +{ + const struct fscache_transition *t; + const struct fscache_state *state, *new_state; + unsigned long events, event_mask; + int event = -1; + + ASSERT(object != NULL); + + _enter("{OBJ%x,%s,%lx}", + object->debug_id, object->state->name, object->events); + + event_mask = object->event_mask; +restart: + object->event_mask = 0; /* Mask normal event handling */ + state = object->state; +restart_masked: + events = object->events; + + /* Handle any out-of-band events (typically an error) */ + if (events & object->oob_event_mask) { + _debug("{OBJ%x} oob %lx", + object->debug_id, events & object->oob_event_mask); + for (t = object->oob_table; t->events; t++) { + if (events & t->events) { + state = t->transit_to; + ASSERT(state->work != NULL); + event = fls(events & t->events) - 1; + __clear_bit(event, &object->oob_event_mask); + clear_bit(event, &object->events); + goto execute_work_state; + } + } + } + + /* Wait states are just transition tables */ + if (!state->work) { + if (events & event_mask) { + for (t = state->transitions; t->events; t++) { + if (events & t->events) { + new_state = t->transit_to; + event = fls(events & t->events) - 1; + clear_bit(event, &object->events); + _debug("{OBJ%x} ev %d: %s -> %s", + object->debug_id, event, + state->name, new_state->name); + object->state = state = new_state; + goto execute_work_state; + } + } + + /* The event mask didn't include all the tabled bits */ + BUG(); + } + /* Randomly woke up */ + goto unmask_events; + } + +execute_work_state: + _debug("{OBJ%x} exec %s", object->debug_id, state->name); + + new_state = state->work(object, event); + event = -1; + if (new_state == NO_TRANSIT) { + _debug("{OBJ%x} %s notrans", object->debug_id, state->name); + fscache_enqueue_object(object); + event_mask = object->oob_event_mask; + goto unmask_events; + } + + _debug("{OBJ%x} %s -> %s", + object->debug_id, state->name, new_state->name); + object->state = state = new_state; + + if (state->work) { + if (unlikely(state->work == ((void *)2UL))) { + _leave(" [dead]"); + return; + } + goto restart_masked; + } + + /* Transited to wait state */ + event_mask = object->oob_event_mask; + for (t = state->transitions; t->events; t++) + event_mask |= t->events; + +unmask_events: + object->event_mask = event_mask; + smp_mb(); + events = object->events; + if (events & event_mask) + goto restart; + _leave(" [msk %lx]", event_mask); +} + +/* + * execute an object + */ +static void fscache_object_work_func(struct work_struct *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + unsigned long start; + + _enter("{OBJ%x}", object->debug_id); + + start = jiffies; + fscache_object_sm_dispatcher(object); + fscache_hist(fscache_objs_histogram, start); + fscache_put_object(object); +} + +/** + * fscache_object_init - Initialise a cache object description + * @object: Object description + * @cookie: Cookie object will be attached to + * @cache: Cache in which backing object will be found + * + * Initialise a cache object description to its basic values. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_object_init(struct fscache_object *object, + struct fscache_cookie *cookie, + struct fscache_cache *cache) +{ + const struct fscache_transition *t; + + atomic_inc(&cache->object_count); + + object->state = STATE(WAIT_FOR_INIT); + object->oob_table = fscache_osm_init_oob; + object->flags = 1 << FSCACHE_OBJECT_IS_LIVE; + spin_lock_init(&object->lock); + INIT_LIST_HEAD(&object->cache_link); + INIT_HLIST_NODE(&object->cookie_link); + INIT_WORK(&object->work, fscache_object_work_func); + INIT_LIST_HEAD(&object->dependents); + INIT_LIST_HEAD(&object->dep_link); + INIT_LIST_HEAD(&object->pending_ops); + object->n_children = 0; + object->n_ops = object->n_in_progress = object->n_exclusive = 0; + object->events = 0; + object->store_limit = 0; + object->store_limit_l = 0; + object->cache = cache; + object->cookie = cookie; + object->parent = NULL; +#ifdef CONFIG_FSCACHE_OBJECT_LIST + RB_CLEAR_NODE(&object->objlist_link); +#endif + + object->oob_event_mask = 0; + for (t = object->oob_table; t->events; t++) + object->oob_event_mask |= t->events; + object->event_mask = object->oob_event_mask; + for (t = object->state->transitions; t->events; t++) + object->event_mask |= t->events; +} +EXPORT_SYMBOL(fscache_object_init); + +/* + * Abort object initialisation before we start it. + */ +static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + object->oob_event_mask = 0; + fscache_dequeue_object(object); + return transit_to(KILL_OBJECT); +} + +/* + * initialise an object + * - check the specified object's parent to see if we can make use of it + * immediately to do a creation + * - we may need to start the process of creating a parent and we need to wait + * for the parent's lookup and creation to complete if it's not there yet + */ +static const struct fscache_state *fscache_initialise_object(struct fscache_object *object, + int event) +{ + struct fscache_object *parent; + bool success; + + _enter("{OBJ%x},%d", object->debug_id, event); + + ASSERT(list_empty(&object->dep_link)); + + parent = object->parent; + if (!parent) { + _leave(" [no parent]"); + return transit_to(DROP_OBJECT); + } + + _debug("parent: %s of:%lx", parent->state->name, parent->flags); + + if (fscache_object_is_dying(parent)) { + _leave(" [bad parent]"); + return transit_to(DROP_OBJECT); + } + + if (fscache_object_is_available(parent)) { + _leave(" [ready]"); + return transit_to(PARENT_READY); + } + + _debug("wait"); + + spin_lock(&parent->lock); + fscache_stat(&fscache_n_cop_grab_object); + success = false; + if (fscache_object_is_live(parent) && + object->cache->ops->grab_object(object)) { + list_add(&object->dep_link, &parent->dependents); + success = true; + } + fscache_stat_d(&fscache_n_cop_grab_object); + spin_unlock(&parent->lock); + if (!success) { + _leave(" [grab failed]"); + return transit_to(DROP_OBJECT); + } + + /* fscache_acquire_non_index_cookie() uses this + * to wake the chain up */ + fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD); + _leave(" [wait]"); + return transit_to(WAIT_FOR_PARENT); +} + +/* + * Once the parent object is ready, we should kick off our lookup op. + */ +static const struct fscache_state *fscache_parent_ready(struct fscache_object *object, + int event) +{ + struct fscache_object *parent = object->parent; + + _enter("{OBJ%x},%d", object->debug_id, event); + + ASSERT(parent != NULL); + + spin_lock(&parent->lock); + parent->n_ops++; + parent->n_obj_ops++; + object->lookup_jif = jiffies; + spin_unlock(&parent->lock); + + _leave(""); + return transit_to(LOOK_UP_OBJECT); +} + +/* + * look an object up in the cache from which it was allocated + * - we hold an "access lock" on the parent object, so the parent object cannot + * be withdrawn by either party till we've finished + */ +static const struct fscache_state *fscache_look_up_object(struct fscache_object *object, + int event) +{ + struct fscache_cookie *cookie = object->cookie; + struct fscache_object *parent = object->parent; + int ret; + + _enter("{OBJ%x},%d", object->debug_id, event); + + object->oob_table = fscache_osm_lookup_oob; + + ASSERT(parent != NULL); + ASSERTCMP(parent->n_ops, >, 0); + ASSERTCMP(parent->n_obj_ops, >, 0); + + /* make sure the parent is still available */ + ASSERT(fscache_object_is_available(parent)); + + if (fscache_object_is_dying(parent) || + test_bit(FSCACHE_IOERROR, &object->cache->flags) || + !fscache_use_cookie(object)) { + _leave(" [unavailable]"); + return transit_to(LOOKUP_FAILURE); + } + + _debug("LOOKUP \"%s\" in \"%s\"", + cookie->def->name, object->cache->tag->name); + + fscache_stat(&fscache_n_object_lookups); + fscache_stat(&fscache_n_cop_lookup_object); + ret = object->cache->ops->lookup_object(object); + fscache_stat_d(&fscache_n_cop_lookup_object); + + fscache_unuse_cookie(object); + + if (ret == -ETIMEDOUT) { + /* probably stuck behind another object, so move this one to + * the back of the queue */ + fscache_stat(&fscache_n_object_lookups_timed_out); + _leave(" [timeout]"); + return NO_TRANSIT; + } + + if (ret < 0) { + _leave(" [error]"); + return transit_to(LOOKUP_FAILURE); + } + + _leave(" [ok]"); + return transit_to(OBJECT_AVAILABLE); +} + +/** + * fscache_object_lookup_negative - Note negative cookie lookup + * @object: Object pointing to cookie to mark + * + * Note negative lookup, permitting those waiting to read data from an already + * existing backing object to continue as there's no data for them to read. + */ +void fscache_object_lookup_negative(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", object->debug_id, object->state->name); + + if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + fscache_stat(&fscache_n_object_lookups_negative); + + /* Allow write requests to begin stacking up and read requests to begin + * returning ENODATA. + */ + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + + _debug("wake up lookup %p", &cookie->flags); + clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + } + _leave(""); +} +EXPORT_SYMBOL(fscache_object_lookup_negative); + +/** + * fscache_obtained_object - Note successful object lookup or creation + * @object: Object pointing to cookie to mark + * + * Note successful lookup and/or creation, permitting those waiting to write + * data to a backing object to continue. + * + * Note that after calling this, an object's cookie may be relinquished by the + * netfs, and so must be accessed with object lock held. + */ +void fscache_obtained_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", object->debug_id, object->state->name); + + /* if we were still looking up, then we must have a positive lookup + * result, in which case there may be data available */ + if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + fscache_stat(&fscache_n_object_lookups_positive); + + /* We do (presumably) have data */ + clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + + /* Allow write requests to begin stacking up and read requests + * to begin shovelling data. + */ + clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + } else { + fscache_stat(&fscache_n_object_created); + } + + set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); + _leave(""); +} +EXPORT_SYMBOL(fscache_obtained_object); + +/* + * handle an object that has just become available + */ +static const struct fscache_state *fscache_object_available(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + object->oob_table = fscache_osm_run_oob; + + spin_lock(&object->lock); + + fscache_done_parent_op(object); + if (object->n_in_progress == 0) { + if (object->n_ops > 0) { + ASSERTCMP(object->n_ops, >=, object->n_obj_ops); + fscache_start_operations(object); + } else { + ASSERT(list_empty(&object->pending_ops)); + } + } + spin_unlock(&object->lock); + + fscache_stat(&fscache_n_cop_lookup_complete); + object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); + + fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); + fscache_stat(&fscache_n_object_avail); + + _leave(""); + return transit_to(JUMPSTART_DEPS); +} + +/* + * Wake up this object's dependent objects now that we've become available. + */ +static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY)) + return NO_TRANSIT; /* Not finished; requeue */ + return transit_to(WAIT_FOR_CMD); +} + +/* + * Handle lookup or creation failute. + */ +static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object, + int event) +{ + struct fscache_cookie *cookie; + + _enter("{OBJ%x},%d", object->debug_id, event); + + object->oob_event_mask = 0; + + fscache_stat(&fscache_n_cop_lookup_complete); + object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); + + cookie = object->cookie; + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + + fscache_done_parent_op(object); + return transit_to(KILL_OBJECT); +} + +/* + * Wait for completion of all active operations on this object and the death of + * all child objects of this object. + */ +static const struct fscache_state *fscache_kill_object(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x,%d,%d},%d", + object->debug_id, object->n_ops, object->n_children, event); + + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + object->oob_event_mask = 0; + + if (list_empty(&object->dependents) && + object->n_ops == 0 && + object->n_children == 0) + return transit_to(DROP_OBJECT); + + if (object->n_in_progress == 0) { + spin_lock(&object->lock); + if (object->n_ops > 0 && object->n_in_progress == 0) + fscache_start_operations(object); + spin_unlock(&object->lock); + } + + if (!list_empty(&object->dependents)) + return transit_to(KILL_DEPENDENTS); + + return transit_to(WAIT_FOR_CLEARANCE); +} + +/* + * Kill dependent objects. + */ +static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL)) + return NO_TRANSIT; /* Not finished */ + return transit_to(WAIT_FOR_CLEARANCE); +} + +/* + * Drop an object's attachments + */ +static const struct fscache_state *fscache_drop_object(struct fscache_object *object, + int event) +{ + struct fscache_object *parent = object->parent; + struct fscache_cookie *cookie = object->cookie; + struct fscache_cache *cache = object->cache; + bool awaken = false; + + _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event); + + ASSERT(cookie != NULL); + ASSERT(!hlist_unhashed(&object->cookie_link)); + + /* Make sure the cookie no longer points here and that the netfs isn't + * waiting for us. + */ + spin_lock(&cookie->lock); + hlist_del_init(&object->cookie_link); + if (hlist_empty(&cookie->backing_objects) && + test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + awaken = true; + spin_unlock(&cookie->lock); + + if (awaken) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + + /* Prevent a race with our last child, which has to signal EV_CLEARED + * before dropping our spinlock. + */ + spin_lock(&object->lock); + spin_unlock(&object->lock); + + /* Discard from the cache's collection of objects */ + spin_lock(&cache->object_list_lock); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); + + fscache_stat(&fscache_n_cop_drop_object); + cache->ops->drop_object(object); + fscache_stat_d(&fscache_n_cop_drop_object); + + /* The parent object wants to know when all it dependents have gone */ + if (parent) { + _debug("release parent OBJ%x {%d}", + parent->debug_id, parent->n_children); + + spin_lock(&parent->lock); + parent->n_children--; + if (parent->n_children == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); + object->parent = NULL; + } + + /* this just shifts the object release to the work processor */ + fscache_put_object(object); + fscache_stat(&fscache_n_object_dead); + + _leave(""); + return transit_to(OBJECT_DEAD); +} + +/* + * get a ref on an object + */ +static int fscache_get_object(struct fscache_object *object) +{ + int ret; + + fscache_stat(&fscache_n_cop_grab_object); + ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN; + fscache_stat_d(&fscache_n_cop_grab_object); + return ret; +} + +/* + * Discard a ref on an object + */ +static void fscache_put_object(struct fscache_object *object) +{ + fscache_stat(&fscache_n_cop_put_object); + object->cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); +} + +/** + * fscache_object_destroy - Note that a cache object is about to be destroyed + * @object: The object to be destroyed + * + * Note the imminent destruction and deallocation of a cache object record. + */ +void fscache_object_destroy(struct fscache_object *object) +{ + fscache_objlist_remove(object); + + /* We can get rid of the cookie now */ + fscache_cookie_put(object->cookie); + object->cookie = NULL; +} +EXPORT_SYMBOL(fscache_object_destroy); + +/* + * enqueue an object for metadata-type processing + */ +void fscache_enqueue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + if (fscache_get_object(object) >= 0) { + wait_queue_head_t *cong_wq = + &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) + wake_up(cong_wq); + } else + fscache_put_object(object); + + put_cpu_var(fscache_object_cong_wait); + } +} + +/** + * fscache_object_sleep_till_congested - Sleep until object wq is congested + * @timeoutp: Scheduler sleep timeout + * + * Allow an object handler to sleep until the object workqueue is congested. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * %true is returned if the object wq is congested, %false otherwise. + */ +bool fscache_object_sleep_till_congested(signed long *timeoutp) +{ + wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + + add_wait_queue_exclusive(cong_wq, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); + finish_wait(cong_wq, &wait); + + return fscache_object_congested(); +} +EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); + +/* + * Enqueue the dependents of an object for metadata-type processing. + * + * If we don't manage to finish the list before the scheduler wants to run + * again then return false immediately. We return true if the list was + * cleared. + */ +static bool fscache_enqueue_dependents(struct fscache_object *object, int event) +{ + struct fscache_object *dep; + bool ret = true; + + _enter("{OBJ%x}", object->debug_id); + + if (list_empty(&object->dependents)) + return true; + + spin_lock(&object->lock); + + while (!list_empty(&object->dependents)) { + dep = list_entry(object->dependents.next, + struct fscache_object, dep_link); + list_del_init(&dep->dep_link); + + fscache_raise_event(dep, event); + fscache_put_object(dep); + + if (!list_empty(&object->dependents) && need_resched()) { + ret = false; + break; + } + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * remove an object from whatever queue it's waiting on + */ +static void fscache_dequeue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + if (!list_empty(&object->dep_link)) { + spin_lock(&object->parent->lock); + list_del_init(&object->dep_link); + spin_unlock(&object->parent->lock); + } + + _leave(""); +} + +/** + * fscache_check_aux - Ask the netfs whether an object on disk is still valid + * @object: The object to ask about + * @data: The auxiliary data for the object + * @datalen: The size of the auxiliary data + * + * This function consults the netfs about the coherency state of an object. + * The caller must be holding a ref on cookie->n_active (held by + * fscache_look_up_object() on behalf of the cache backend during object lookup + * and creation). + */ +enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, uint16_t datalen) +{ + enum fscache_checkaux result; + + if (!object->cookie->def->check_aux) { + fscache_stat(&fscache_n_checkaux_none); + return FSCACHE_CHECKAUX_OKAY; + } + + result = object->cookie->def->check_aux(object->cookie->netfs_data, + data, datalen); + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + fscache_stat(&fscache_n_checkaux_okay); + break; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + fscache_stat(&fscache_n_checkaux_update); + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + fscache_stat(&fscache_n_checkaux_obsolete); + break; + + default: + BUG(); + } + + return result; +} +EXPORT_SYMBOL(fscache_check_aux); + +/* + * Asynchronously invalidate an object. + */ +static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object, + int event) +{ + struct fscache_operation *op; + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x},%d", object->debug_id, event); + + /* We're going to need the cookie. If the cookie is not available then + * retire the object instead. + */ + if (!fscache_use_cookie(object)) { + ASSERT(object->cookie->stores.rnode == NULL); + set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + _leave(" [no cookie]"); + return transit_to(KILL_OBJECT); + } + + /* Reject any new read/write ops and abort any that are pending. */ + fscache_invalidate_writes(cookie); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + fscache_cancel_all_ops(object); + + /* Now we have to wait for in-progress reads and writes */ + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + goto nomem; + + fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); + op->flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_EXCLUSIVE) | + (1 << FSCACHE_OP_UNUSE_COOKIE); + + spin_lock(&cookie->lock); + if (fscache_submit_exclusive_op(object, op) < 0) + goto submit_op_failed; + spin_unlock(&cookie->lock); + fscache_put_operation(op); + + /* Once we've completed the invalidation, we know there will be no data + * stored in the cache and thus we can reinstate the data-check-skip + * optimisation. + */ + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* We can allow read and write requests to come in once again. They'll + * queue up behind our exclusive invalidation operation. + */ + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + _leave(" [ok]"); + return transit_to(UPDATE_OBJECT); + +nomem: + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_unuse_cookie(object); + _leave(" [ENOMEM]"); + return transit_to(KILL_OBJECT); + +submit_op_failed: + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + spin_unlock(&cookie->lock); + fscache_unuse_cookie(object); + kfree(op); + _leave(" [EIO]"); + return transit_to(KILL_OBJECT); +} + +static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object, + int event) +{ + const struct fscache_state *s; + + fscache_stat(&fscache_n_invalidates_run); + fscache_stat(&fscache_n_cop_invalidate_object); + s = _fscache_invalidate_object(object, event); + fscache_stat_d(&fscache_n_cop_invalidate_object); + return s; +} + +/* + * Asynchronously update an object. + */ +static const struct fscache_state *fscache_update_object(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); + object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); + + _leave(""); + return transit_to(WAIT_FOR_CMD); +} diff --git a/kernel/fs/fscache/operation.c b/kernel/fs/fscache/operation.c new file mode 100644 index 000000000..e7b87a0e5 --- /dev/null +++ b/kernel/fs/fscache/operation.c @@ -0,0 +1,529 @@ +/* FS-Cache worker operation management routines + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/operations.txt + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include +#include +#include +#include "internal.h" + +atomic_t fscache_op_debug_id; +EXPORT_SYMBOL(fscache_op_debug_id); + +/** + * fscache_enqueue_operation - Enqueue an operation for processing + * @op: The operation to enqueue + * + * Enqueue an operation for processing by the FS-Cache thread pool. + * + * This will get its own ref on the object. + */ +void fscache_enqueue_operation(struct fscache_operation *op) +{ + _enter("{OBJ%x OP%x,%u}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(list_empty(&op->pend_link)); + ASSERT(op->processor != NULL); + ASSERT(fscache_object_is_available(op->object)); + ASSERTCMP(atomic_read(&op->usage), >, 0); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); + + fscache_stat(&fscache_n_op_enqueue); + switch (op->flags & FSCACHE_OP_TYPE) { + case FSCACHE_OP_ASYNC: + _debug("queue async"); + atomic_inc(&op->usage); + if (!queue_work(fscache_op_wq, &op->work)) + fscache_put_operation(op); + break; + case FSCACHE_OP_MYTHREAD: + _debug("queue for caller's attention"); + break; + default: + pr_err("Unexpected op type %lx", op->flags); + BUG(); + break; + } +} +EXPORT_SYMBOL(fscache_enqueue_operation); + +/* + * start an op running + */ +static void fscache_run_op(struct fscache_object *object, + struct fscache_operation *op) +{ + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + + op->state = FSCACHE_OP_ST_IN_PROGRESS; + object->n_in_progress++; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + if (op->processor) + fscache_enqueue_operation(op); + fscache_stat(&fscache_n_op_run); +} + +/* + * submit an exclusive operation for an object + * - other ops are excluded from running simultaneously with this one + * - this gets any extra refs it needs on an op + */ +int fscache_submit_exclusive_op(struct fscache_object *object, + struct fscache_operation *op) +{ + int ret; + + _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); + ASSERTCMP(atomic_read(&op->usage), >, 0); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); + + op->state = FSCACHE_OP_ST_PENDING; + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + + if (object->n_in_progress > 0) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_in_progress, ==, 0); + fscache_run_op(object, op); + } + + /* need to issue a new write op after this */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + ret = 0; + } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else { + /* If we're in any other state, there must have been an I/O + * error of some nature. + */ + ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); + ret = -EIO; + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + const struct fscache_state *ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) + return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, object->state->name); + kdebug("objstate=%s [%s]", object->state->name, ostate->name); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* + * submit an operation for an object + * - objects may be submitted only in the following states: + * - during object creation (write ops may be submitted) + * - whilst the object is active + * - after an I/O error incurred in one of the two above states (op rejected) + * - this gets any extra refs it needs on an op + */ +int fscache_submit_op(struct fscache_object *object, + struct fscache_operation *op) +{ + const struct fscache_state *ostate; + int ret; + + _enter("{OBJ%x OP%x},{%u}", + object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); + ASSERTCMP(atomic_read(&op->usage), >, 0); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); + + ostate = object->state; + smp_rmb(); + + op->state = FSCACHE_OP_ST_PENDING; + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + + if (object->n_exclusive > 0) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_exclusive, ==, 0); + fscache_run_op(object, op); + } + ret = 0; + } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + op->object = object; + object->n_ops++; + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else if (fscache_object_is_dying(object)) { + fscache_stat(&fscache_n_op_rejected); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + fscache_report_unexpected_submission(object, op, ostate); + ASSERT(!fscache_object_is_active(object)); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else { + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * queue an object for withdrawal on error, aborting all following asynchronous + * operations + */ +void fscache_abort_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); +} + +/* + * Jump start the operation processing on an object. The caller must hold + * object->lock. + */ +void fscache_start_operations(struct fscache_object *object) +{ + struct fscache_operation *op; + bool stop = false; + + while (!list_empty(&object->pending_ops) && !stop) { + op = list_entry(object->pending_ops.next, + struct fscache_operation, pend_link); + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + if (object->n_in_progress > 0) + break; + stop = true; + } + list_del_init(&op->pend_link); + fscache_run_op(object, op); + + /* the pending queue was holding a ref on the object */ + fscache_put_operation(op); + } + + ASSERTCMP(object->n_in_progress, <=, object->n_ops); + + _debug("woke %d ops on OBJ%x", + object->n_in_progress, object->debug_id); +} + +/* + * cancel an operation that's pending on an object + */ +int fscache_cancel_op(struct fscache_operation *op, + void (*do_cancel)(struct fscache_operation *)) +{ + struct fscache_object *object = op->object; + int ret; + + _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); + + ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING); + ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED); + ASSERTCMP(atomic_read(&op->usage), >, 0); + + spin_lock(&object->lock); + + ret = -EBUSY; + if (op->state == FSCACHE_OP_ST_PENDING) { + ASSERT(!list_empty(&op->pend_link)); + fscache_stat(&fscache_n_op_cancelled); + list_del_init(&op->pend_link); + if (do_cancel) + do_cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + fscache_put_operation(op); + ret = 0; + } + + spin_unlock(&object->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * Cancel all pending operations on an object + */ +void fscache_cancel_all_ops(struct fscache_object *object) +{ + struct fscache_operation *op; + + _enter("OBJ%x", object->debug_id); + + spin_lock(&object->lock); + + while (!list_empty(&object->pending_ops)) { + op = list_entry(object->pending_ops.next, + struct fscache_operation, pend_link); + fscache_stat(&fscache_n_op_cancelled); + list_del_init(&op->pend_link); + + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + op->state = FSCACHE_OP_ST_CANCELLED; + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + fscache_put_operation(op); + cond_resched_lock(&object->lock); + } + + spin_unlock(&object->lock); + _leave(""); +} + +/* + * Record the completion or cancellation of an in-progress operation. + */ +void fscache_op_complete(struct fscache_operation *op, bool cancelled) +{ + struct fscache_object *object = op->object; + + _enter("OBJ%x", object->debug_id); + + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); + ASSERTCMP(object->n_in_progress, >, 0); + ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), + object->n_exclusive, >, 0); + ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), + object->n_in_progress, ==, 1); + + spin_lock(&object->lock); + + op->state = cancelled ? + FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + spin_unlock(&object->lock); + _leave(""); +} +EXPORT_SYMBOL(fscache_op_complete); + +/* + * release an operation + * - queues pending ops if this is the last in-progress op + */ +void fscache_put_operation(struct fscache_operation *op) +{ + struct fscache_object *object; + struct fscache_cache *cache; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + if (!atomic_dec_and_test(&op->usage)) + return; + + _debug("PUT OP"); + ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE, + op->state, ==, FSCACHE_OP_ST_CANCELLED); + op->state = FSCACHE_OP_ST_DEAD; + + fscache_stat(&fscache_n_op_release); + + if (op->release) { + op->release(op); + op->release = NULL; + } + + object = op->object; + + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) + fscache_unuse_cookie(object); + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + kfree(op); + _leave(" [done]"); +} +EXPORT_SYMBOL(fscache_put_operation); + +/* + * garbage collect operations that have had their release deferred + */ +void fscache_operation_gc(struct work_struct *work) +{ + struct fscache_operation *op; + struct fscache_object *object; + struct fscache_cache *cache = + container_of(work, struct fscache_cache, op_gc); + int count = 0; + + _enter(""); + + do { + spin_lock(&cache->op_gc_list_lock); + if (list_empty(&cache->op_gc_list)) { + spin_unlock(&cache->op_gc_list_lock); + break; + } + + op = list_entry(cache->op_gc_list.next, + struct fscache_operation, pend_link); + list_del(&op->pend_link); + spin_unlock(&cache->op_gc_list_lock); + + object = op->object; + spin_lock(&object->lock); + + _debug("GC DEFERRED REL OBJ%x OP%x", + object->debug_id, op->debug_id); + fscache_stat(&fscache_n_op_gc); + + ASSERTCMP(atomic_read(&op->usage), ==, 0); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + kfree(op); + + } while (count++ < 20); + + if (!list_empty(&cache->op_gc_list)) + schedule_work(&cache->op_gc); + + _leave(""); +} + +/* + * execute an operation using fs_op_wq to provide processing context - + * the caller holds a ref to this object, so we don't need to hold one + */ +void fscache_op_work_func(struct work_struct *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, work); + unsigned long start; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(op->processor != NULL); + start = jiffies; + op->processor(op); + fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(op); + + _leave(""); +} diff --git a/kernel/fs/fscache/page.c b/kernel/fs/fscache/page.c new file mode 100644 index 000000000..de33b3fcc --- /dev/null +++ b/kernel/fs/fscache/page.c @@ -0,0 +1,1195 @@ +/* Cache page management and data I/O routines + * + * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL PAGE +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * check to see if a page is being written to the cache + */ +bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) +{ + void *val; + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + rcu_read_unlock(); + + return val != NULL; +} +EXPORT_SYMBOL(__fscache_check_page_write); + +/* + * wait for a page to finish being written to the cache + */ +void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + wait_event(*wq, !__fscache_check_page_write(cookie, page)); +} +EXPORT_SYMBOL(__fscache_wait_on_page_write); + +/* + * wait for a page to finish being written to the cache. Put a timeout here + * since we might be called recursively via parent fs. + */ +static +bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page), + HZ); +} + +/* + * decide whether a page can be released, possibly by cancelling a store to it + * - we're allowed to sleep if __GFP_WAIT is flagged + */ +bool __fscache_maybe_release_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct page *xpage; + void *val; + + _enter("%p,%p,%x", cookie, page, gfp); + +try_again: + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + if (!val) { + rcu_read_unlock(); + fscache_stat(&fscache_n_store_vmscan_not_storing); + __fscache_uncache_page(cookie, page); + return true; + } + + /* see if the page is actually undergoing storage - if so we can't get + * rid of it till the cache has finished with it */ + if (radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG)) { + rcu_read_unlock(); + goto page_busy; + } + + /* the page is pending storage, so we attempt to cancel the store and + * discard the store request so that the page can be reclaimed */ + spin_lock(&cookie->stores_lock); + rcu_read_unlock(); + + if (radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG)) { + /* the page started to undergo storage whilst we were looking, + * so now we can only wait or return */ + spin_unlock(&cookie->stores_lock); + goto page_busy; + } + + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + + if (xpage) { + fscache_stat(&fscache_n_store_vmscan_cancelled); + fscache_stat(&fscache_n_store_radix_deletes); + ASSERTCMP(xpage, ==, page); + } else { + fscache_stat(&fscache_n_store_vmscan_gone); + } + + wake_up_bit(&cookie->flags, 0); + if (xpage) + page_cache_release(xpage); + __fscache_uncache_page(cookie, page); + return true; + +page_busy: + /* We will wait here if we're allowed to, but that could deadlock the + * allocator as the work threads writing to the cache may all end up + * sleeping on memory allocation, so we may need to impose a timeout + * too. */ + if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) { + fscache_stat(&fscache_n_store_vmscan_busy); + return false; + } + + fscache_stat(&fscache_n_store_vmscan_wait); + if (!release_page_wait_timeout(cookie, page)) + _debug("fscache writeout timeout page: %p{%lx}", + page, page->index); + + gfp &= ~__GFP_WAIT; + goto try_again; +} +EXPORT_SYMBOL(__fscache_maybe_release_page); + +/* + * note that a page has finished being written to the cache + */ +static void fscache_end_page_write(struct fscache_object *object, + struct page *page) +{ + struct fscache_cookie *cookie; + struct page *xpage = NULL; + + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* delete the page from the tree if it is now no longer + * pending */ + spin_lock(&cookie->stores_lock); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + if (!radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG)) { + fscache_stat(&fscache_n_store_radix_deletes); + xpage = radix_tree_delete(&cookie->stores, page->index); + } + spin_unlock(&cookie->stores_lock); + wake_up_bit(&cookie->flags, 0); + } + spin_unlock(&object->lock); + if (xpage) + page_cache_release(xpage); +} + +/* + * actually apply the changed attributes to a cache object + */ +static void fscache_attr_changed_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + int ret; + + _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); + + fscache_stat(&fscache_n_attr_changed_calls); + + if (fscache_object_is_active(object)) { + fscache_stat(&fscache_n_cop_attr_changed); + ret = object->cache->ops->attr_changed(object); + fscache_stat_d(&fscache_n_cop_attr_changed); + if (ret < 0) + fscache_abort_object(object); + } + + fscache_op_complete(op, true); + _leave(""); +} + +/* + * notification that the attributes on an object have changed + */ +int __fscache_attr_changed(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + bool wake_cookie = false; + + _enter("%p", cookie); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + fscache_stat(&fscache_n_attr_changed); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { + fscache_stat(&fscache_n_attr_changed_nomem); + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + fscache_operation_init(op, fscache_attr_changed_op, NULL); + op->flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_EXCLUSIVE) | + (1 << FSCACHE_OP_UNUSE_COOKIE); + + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + __fscache_use_cookie(cookie); + if (fscache_submit_exclusive_op(object, op) < 0) + goto nobufs_dec; + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_attr_changed_ok); + fscache_put_operation(op); + _leave(" = 0"); + return 0; + +nobufs_dec: + wake_cookie = __fscache_unuse_cookie(cookie); +nobufs: + spin_unlock(&cookie->lock); + kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); + fscache_stat(&fscache_n_attr_changed_nobufs); + _leave(" = %d", -ENOBUFS); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_attr_changed); + +/* + * release a retrieval op reference + */ +static void fscache_release_retrieval_op(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + _enter("{OP%x}", op->op.debug_id); + + ASSERTCMP(atomic_read(&op->n_pages), ==, 0); + + fscache_hist(fscache_retrieval_histogram, op->start_time); + if (op->context) + fscache_put_context(op->op.object->cookie, op->context); + + _leave(""); +} + +/* + * allocate a retrieval op + */ +static struct fscache_retrieval *fscache_alloc_retrieval( + struct fscache_cookie *cookie, + struct address_space *mapping, + fscache_rw_complete_t end_io_func, + void *context) +{ + struct fscache_retrieval *op; + + /* allocate a retrieval operation and attempt to submit it */ + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) { + fscache_stat(&fscache_n_retrievals_nomem); + return NULL; + } + + fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + op->op.flags = FSCACHE_OP_MYTHREAD | + (1UL << FSCACHE_OP_WAITING) | + (1UL << FSCACHE_OP_UNUSE_COOKIE); + op->mapping = mapping; + op->end_io_func = end_io_func; + op->context = context; + op->start_time = jiffies; + INIT_LIST_HEAD(&op->to_do); + return op; +} + +/* + * wait for a deferred lookup to complete + */ +int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +{ + unsigned long jif; + + _enter(""); + + if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { + _leave(" = 0 [imm]"); + return 0; + } + + fscache_stat(&fscache_n_retrievals_wait); + + jif = jiffies; + if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + TASK_INTERRUPTIBLE) != 0) { + fscache_stat(&fscache_n_retrievals_intr); + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; + } + + ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); + + smp_rmb(); + fscache_hist(fscache_retrieval_delay_histogram, jif); + _leave(" = 0 [dly]"); + return 0; +} + +/* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + atomic_set(&op->n_pages, 0); +} + +/* + * wait for an object to become active (or dead) + */ +int fscache_wait_for_operation_activation(struct fscache_object *object, + struct fscache_operation *op, + atomic_t *stat_op_waits, + atomic_t *stat_object_dead, + void (*do_cancel)(struct fscache_operation *)) +{ + int ret; + + if (!test_bit(FSCACHE_OP_WAITING, &op->flags)) + goto check_if_dead; + + _debug(">>> WT"); + if (stat_op_waits) + fscache_stat(stat_op_waits); + if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, + TASK_INTERRUPTIBLE) != 0) { + ret = fscache_cancel_op(op, do_cancel); + if (ret == 0) + return -ERESTARTSYS; + + /* it's been removed from the pending queue by another party, + * so we should get to run shortly */ + wait_on_bit(&op->flags, FSCACHE_OP_WAITING, + TASK_UNINTERRUPTIBLE); + } + _debug("<<< GO"); + +check_if_dead: + if (op->state == FSCACHE_OP_ST_CANCELLED) { + if (stat_object_dead) + fscache_stat(stat_object_dead); + _leave(" = -ENOBUFS [cancelled]"); + return -ENOBUFS; + } + if (unlikely(fscache_object_is_dead(object))) { + pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); + fscache_cancel_op(op, do_cancel); + if (stat_object_dead) + fscache_stat(stat_object_dead); + return -ENOBUFS; + } + return 0; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * -ENODATA - no data available in the backing object for this block + * 0 - dispatched a read - it'll call end_io_func() when finished + */ +int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + bool wake_cookie = false; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(cookie, page->mapping, + end_io_func, context); + if (!op) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + atomic_set(&op->n_pages, 1); + + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); + + __fscache_use_cookie(cookie); + atomic_inc(&object->n_reads); + __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock_dec; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + ret = fscache_wait_for_operation_activation( + object, &op->op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead), + fscache_do_cancel_retrieval); + if (ret < 0) + goto error; + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_page); + ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); + if (ret == 0) + ret = -ENODATA; + } else { + fscache_stat(&fscache_n_cop_read_or_alloc_page); + ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_page); + } + +error: + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock_dec: + atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); +nobufs_unlock: + spin_unlock(&cookie->lock); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_page); + +/* + * read a list of page from the cache or allocate a block in which to store + * them + * - we return: + * -ENOMEM - out of memory, some pages may be being read + * -ERESTARTSYS - interrupted, some pages may be being read + * -ENOBUFS - no backing object or space available in which to cache any + * pages not being read + * -ENODATA - no data available in the backing object for some or all of + * the pages + * 0 - dispatched a read on all pages + * + * end_io_func() will be called for each page read from the cache as it is + * finishes being read + * + * any pages for which a read is dispatched will be removed from pages and + * nr_pages + */ +int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + bool wake_cookie = false; + int ret; + + _enter("%p,,%d,,,", cookie, *nr_pages); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(*nr_pages, >, 0); + ASSERT(!list_empty(pages)); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context); + if (!op) + return -ENOMEM; + atomic_set(&op->n_pages, *nr_pages); + + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + __fscache_use_cookie(cookie); + atomic_inc(&object->n_reads); + __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock_dec; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + ret = fscache_wait_for_operation_activation( + object, &op->op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead), + fscache_do_cancel_retrieval); + if (ret < 0) + goto error; + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_pages); + ret = object->cache->ops->allocate_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_allocate_pages); + } else { + fscache_stat(&fscache_n_cop_read_or_alloc_pages); + ret = object->cache->ops->read_or_alloc_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); + } + +error: + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock_dec: + atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_pages); + +/* + * allocate a block in the cache on which to store a page + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * 0 - block allocated + */ +int __fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + bool wake_cookie = false; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_allocs); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL); + if (!op) + return -ENOMEM; + atomic_set(&op->n_pages, 1); + + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + __fscache_use_cookie(cookie); + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock_dec; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_alloc_ops); + + ret = fscache_wait_for_operation_activation( + object, &op->op, + __fscache_stat(&fscache_n_alloc_op_waits), + __fscache_stat(&fscache_n_allocs_object_dead), + fscache_do_cancel_retrieval); + if (ret < 0) + goto error; + + /* ask the cache to honour the operation */ + fscache_stat(&fscache_n_cop_allocate_page); + ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); + +error: + if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_allocs_intr); + else if (ret < 0) + fscache_stat(&fscache_n_allocs_nobufs); + else + fscache_stat(&fscache_n_allocs_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock_dec: + wake_cookie = __fscache_unuse_cookie(cookie); +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); +nobufs: + fscache_stat(&fscache_n_allocs_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_alloc_page); + +/* + * Unmark pages allocate in the readahead code path (via: + * fscache_readpages_or_alloc) after delegating to the base filesystem + */ +void __fscache_readpages_cancel(struct fscache_cookie *cookie, + struct list_head *pages) +{ + struct page *page; + + list_for_each_entry(page, pages, lru) { + if (PageFsCache(page)) + __fscache_uncache_page(cookie, page); + } +} +EXPORT_SYMBOL(__fscache_readpages_cancel); + +/* + * release a write op reference + */ +static void fscache_release_write_op(struct fscache_operation *_op) +{ + _enter("{OP%x}", _op->debug_id); +} + +/* + * perform the background storage of a page into the cache + */ +static void fscache_write_op(struct fscache_operation *_op) +{ + struct fscache_storage *op = + container_of(_op, struct fscache_storage, op); + struct fscache_object *object = op->op.object; + struct fscache_cookie *cookie; + struct page *page; + unsigned n; + void *results[1]; + int ret; + + _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + + spin_lock(&object->lock); + cookie = object->cookie; + + if (!fscache_object_is_active(object)) { + /* If we get here, then the on-disk cache object likely longer + * exists, so we should just cancel this write operation. + */ + spin_unlock(&object->lock); + fscache_op_complete(&op->op, false); + _leave(" [inactive]"); + return; + } + + if (!cookie) { + /* If we get here, then the cookie belonging to the object was + * detached, probably by the cookie being withdrawn due to + * memory pressure, which means that the pages we might write + * to the cache from no longer exist - therefore, we can just + * cancel this write operation. + */ + spin_unlock(&object->lock); + fscache_op_complete(&op->op, false); + _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}", + _op->flags, _op->state, object->state->short_name, + object->flags); + return; + } + + spin_lock(&cookie->stores_lock); + + fscache_stat(&fscache_n_store_calls); + + /* find a page to store */ + page = NULL; + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, + FSCACHE_COOKIE_PENDING_TAG); + if (n != 1) + goto superseded; + page = results[0]; + _debug("gang %d [%lx]", n, page->index); + if (page->index > op->store_limit) { + fscache_stat(&fscache_n_store_pages_over_limit); + goto superseded; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + + fscache_stat(&fscache_n_store_pages); + fscache_stat(&fscache_n_cop_write_page); + ret = object->cache->ops->write_page(op, page); + fscache_stat_d(&fscache_n_cop_write_page); + fscache_end_page_write(object, page); + if (ret < 0) { + fscache_abort_object(object); + fscache_op_complete(&op->op, true); + } else { + fscache_enqueue_operation(&op->op); + } + + _leave(""); + return; + +superseded: + /* this writer is going away and there aren't any more things to + * write */ + _debug("cease"); + spin_unlock(&cookie->stores_lock); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + spin_unlock(&object->lock); + fscache_op_complete(&op->op, true); + _leave(""); +} + +/* + * Clear the pages pending writing for invalidation + */ +void fscache_invalidate_writes(struct fscache_cookie *cookie) +{ + struct page *page; + void *results[16]; + int n, i; + + _enter(""); + + for (;;) { + spin_lock(&cookie->stores_lock); + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, + ARRAY_SIZE(results), + FSCACHE_COOKIE_PENDING_TAG); + if (n == 0) { + spin_unlock(&cookie->stores_lock); + break; + } + + for (i = n - 1; i >= 0; i--) { + page = results[i]; + radix_tree_delete(&cookie->stores, page->index); + } + + spin_unlock(&cookie->stores_lock); + + for (i = n - 1; i >= 0; i--) + page_cache_release(results[i]); + } + + _leave(""); +} + +/* + * request a page be stored in the cache + * - returns: + * -ENOMEM - out of memory, nothing done + * -ENOBUFS - no backing object available in which to cache the page + * 0 - dispatched a write - it'll call end_io_func() when finished + * + * if the cookie still has a backing object at this point, that object can be + * in one of a few states with respect to storage processing: + * + * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is + * set) + * + * (a) no writes yet + * + * (b) writes deferred till post-creation (mark page for writing and + * return immediately) + * + * (2) negative lookup, object created, initial fill being made from netfs + * + * (a) fill point not yet reached this page (mark page for writing and + * return) + * + * (b) fill point passed this page (queue op to store this page) + * + * (3) object extant (queue op to store this page) + * + * any other state is invalid + */ +int __fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_storage *op; + struct fscache_object *object; + bool wake_cookie = false; + int ret; + + _enter("%p,%x,", cookie, (u32) page->flags); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERT(PageFsCache(page)); + + fscache_stat(&fscache_n_stores); + + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + + op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!op) + goto nomem; + + fscache_operation_init(&op->op, fscache_write_op, + fscache_release_write_op); + op->op.flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_WAITING) | + (1 << FSCACHE_OP_UNUSE_COOKIE); + + ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM); + if (ret < 0) + goto nomem_free; + + ret = -ENOBUFS; + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto nobufs; + + /* add the page to the pending-storage radix tree on the backing + * object */ + spin_lock(&object->lock); + spin_lock(&cookie->stores_lock); + + _debug("store limit %llx", (unsigned long long) object->store_limit); + + ret = radix_tree_insert(&cookie->stores, page->index, page); + if (ret < 0) { + if (ret == -EEXIST) + goto already_queued; + _debug("insert failed %d", ret); + goto nobufs_unlock_obj; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + page_cache_get(page); + + /* we only want one writer at a time, but we do need to queue new + * writers after exclusive ops */ + if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) + goto already_pending; + + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + + op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); + op->store_limit = object->store_limit; + + __fscache_use_cookie(cookie); + if (fscache_submit_op(object, &op->op) < 0) + goto submit_failed; + + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + fscache_stat(&fscache_n_store_ops); + fscache_stat(&fscache_n_stores_ok); + + /* the work queue now carries its own ref on the object */ + fscache_put_operation(&op->op); + _leave(" = 0"); + return 0; + +already_queued: + fscache_stat(&fscache_n_stores_again); +already_pending: + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_ok); + _leave(" = 0"); + return 0; + +submit_failed: + spin_lock(&cookie->stores_lock); + radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + wake_cookie = __fscache_unuse_cookie(cookie); + page_cache_release(page); + ret = -ENOBUFS; + goto nobufs; + +nobufs_unlock_obj: + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); +nobufs: + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); + fscache_stat(&fscache_n_stores_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; + +nomem_free: + kfree(op); +nomem: + fscache_stat(&fscache_n_stores_oom); + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(__fscache_write_page); + +/* + * remove a page from the cache + */ +void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) +{ + struct fscache_object *object; + + _enter(",%p", page); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + fscache_stat(&fscache_n_uncaches); + + /* cache withdrawal may beat us to it */ + if (!PageFsCache(page)) + goto done; + + /* get the object */ + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) { + ClearPageFsCache(page); + goto done_unlock; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + /* there might now be stuff on disk we could read */ + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* only invoke the cache backend if we managed to mark the page + * uncached here; this deals with synchronisation vs withdrawal */ + if (TestClearPageFsCache(page) && + object->cache->ops->uncache_page) { + /* the cache backend releases the cookie lock */ + fscache_stat(&fscache_n_cop_uncache_page); + object->cache->ops->uncache_page(object, page); + fscache_stat_d(&fscache_n_cop_uncache_page); + goto done; + } + +done_unlock: + spin_unlock(&cookie->lock); +done: + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_page); + +/** + * fscache_mark_page_cached - Mark a page as being cached + * @op: The retrieval op pages are being marked for + * @page: The page to be marked + * + * Mark a netfs page as being cached. After this is called, the netfs + * must call fscache_uncache_page() to remove the mark. + */ +void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) +{ + struct fscache_cookie *cookie = op->op.object->cookie; + +#ifdef CONFIG_FSCACHE_STATS + atomic_inc(&fscache_n_marks); +#endif + + _debug("- mark %p{%lx}", page, page->index); + if (TestSetPageFsCache(page)) { + static bool once_only; + if (!once_only) { + once_only = true; + pr_warn("Cookie type %s marked page %lx multiple times\n", + cookie->def->name, page->index); + } + } + + if (cookie->def->mark_page_cached) + cookie->def->mark_page_cached(cookie->netfs_data, + op->mapping, page); +} +EXPORT_SYMBOL(fscache_mark_page_cached); + +/** + * fscache_mark_pages_cached - Mark pages as being cached + * @op: The retrieval op pages are being marked for + * @pagevec: The pages to be marked + * + * Mark a bunch of netfs pages as being cached. After this is called, + * the netfs must call fscache_uncache_page() to remove the mark. + */ +void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec) +{ + unsigned long loop; + + for (loop = 0; loop < pagevec->nr; loop++) + fscache_mark_page_cached(op, pagevec->pages[loop]); + + pagevec_reinit(pagevec); +} +EXPORT_SYMBOL(fscache_mark_pages_cached); + +/* + * Uncache all the pages in an inode that are marked PG_fscache, assuming them + * to be associated with the given cookie. + */ +void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t next; + int i; + + _enter("%p,%p", cookie, inode); + + if (!mapping || mapping->nrpages == 0) { + _leave(" [no pages]"); + return; + } + + pagevec_init(&pvec, 0); + next = 0; + do { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + break; + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + next = page->index; + if (PageFsCache(page)) { + __fscache_wait_on_page_write(cookie, page); + __fscache_uncache_page(cookie, page); + } + } + pagevec_release(&pvec); + cond_resched(); + } while (++next); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_all_inode_pages); diff --git a/kernel/fs/fscache/proc.c b/kernel/fs/fscache/proc.c new file mode 100644 index 000000000..1d9e4951a --- /dev/null +++ b/kernel/fs/fscache/proc.c @@ -0,0 +1,81 @@ +/* FS-Cache statistics viewing interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include +#include +#include +#include "internal.h" + +/* + * initialise the /proc/fs/fscache/ directory + */ +int __init fscache_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/fscache", NULL)) + goto error_dir; + +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, + &fscache_stats_fops)) + goto error_stats; +#endif + +#ifdef CONFIG_FSCACHE_HISTOGRAM + if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_fops)) + goto error_histogram; +#endif + +#ifdef CONFIG_FSCACHE_OBJECT_LIST + if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, + &fscache_objlist_fops)) + goto error_objects; +#endif + + _leave(" = 0"); + return 0; + +#ifdef CONFIG_FSCACHE_OBJECT_LIST +error_objects: +#endif +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +error_histogram: +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +error_stats: +#endif + remove_proc_entry("fs/fscache", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/fscache/ directory + */ +void fscache_proc_cleanup(void) +{ +#ifdef CONFIG_FSCACHE_OBJECT_LIST + remove_proc_entry("fs/fscache/objects", NULL); +#endif +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +#endif + remove_proc_entry("fs/fscache", NULL); +} diff --git a/kernel/fs/fscache/stats.c b/kernel/fs/fscache/stats.c new file mode 100644 index 000000000..40d13c70e --- /dev/null +++ b/kernel/fs/fscache/stats.c @@ -0,0 +1,291 @@ +/* FS-Cache statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +/* + * operation counters + */ +atomic_t fscache_n_op_pend; +atomic_t fscache_n_op_run; +atomic_t fscache_n_op_enqueue; +atomic_t fscache_n_op_requeue; +atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_release; +atomic_t fscache_n_op_gc; +atomic_t fscache_n_op_cancelled; +atomic_t fscache_n_op_rejected; + +atomic_t fscache_n_attr_changed; +atomic_t fscache_n_attr_changed_ok; +atomic_t fscache_n_attr_changed_nobufs; +atomic_t fscache_n_attr_changed_nomem; +atomic_t fscache_n_attr_changed_calls; + +atomic_t fscache_n_allocs; +atomic_t fscache_n_allocs_ok; +atomic_t fscache_n_allocs_wait; +atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_allocs_intr; +atomic_t fscache_n_allocs_object_dead; +atomic_t fscache_n_alloc_ops; +atomic_t fscache_n_alloc_op_waits; + +atomic_t fscache_n_retrievals; +atomic_t fscache_n_retrievals_ok; +atomic_t fscache_n_retrievals_wait; +atomic_t fscache_n_retrievals_nodata; +atomic_t fscache_n_retrievals_nobufs; +atomic_t fscache_n_retrievals_intr; +atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrievals_object_dead; +atomic_t fscache_n_retrieval_ops; +atomic_t fscache_n_retrieval_op_waits; + +atomic_t fscache_n_stores; +atomic_t fscache_n_stores_ok; +atomic_t fscache_n_stores_again; +atomic_t fscache_n_stores_nobufs; +atomic_t fscache_n_stores_oom; +atomic_t fscache_n_store_ops; +atomic_t fscache_n_store_calls; +atomic_t fscache_n_store_pages; +atomic_t fscache_n_store_radix_deletes; +atomic_t fscache_n_store_pages_over_limit; + +atomic_t fscache_n_store_vmscan_not_storing; +atomic_t fscache_n_store_vmscan_gone; +atomic_t fscache_n_store_vmscan_busy; +atomic_t fscache_n_store_vmscan_cancelled; +atomic_t fscache_n_store_vmscan_wait; + +atomic_t fscache_n_marks; +atomic_t fscache_n_uncaches; + +atomic_t fscache_n_acquires; +atomic_t fscache_n_acquires_null; +atomic_t fscache_n_acquires_no_cache; +atomic_t fscache_n_acquires_ok; +atomic_t fscache_n_acquires_nobufs; +atomic_t fscache_n_acquires_oom; + +atomic_t fscache_n_invalidates; +atomic_t fscache_n_invalidates_run; + +atomic_t fscache_n_updates; +atomic_t fscache_n_updates_null; +atomic_t fscache_n_updates_run; + +atomic_t fscache_n_relinquishes; +atomic_t fscache_n_relinquishes_null; +atomic_t fscache_n_relinquishes_waitcrt; +atomic_t fscache_n_relinquishes_retire; + +atomic_t fscache_n_cookie_index; +atomic_t fscache_n_cookie_data; +atomic_t fscache_n_cookie_special; + +atomic_t fscache_n_object_alloc; +atomic_t fscache_n_object_no_alloc; +atomic_t fscache_n_object_lookups; +atomic_t fscache_n_object_lookups_negative; +atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_lookups_timed_out; +atomic_t fscache_n_object_created; +atomic_t fscache_n_object_avail; +atomic_t fscache_n_object_dead; + +atomic_t fscache_n_checkaux_none; +atomic_t fscache_n_checkaux_okay; +atomic_t fscache_n_checkaux_update; +atomic_t fscache_n_checkaux_obsolete; + +atomic_t fscache_n_cop_alloc_object; +atomic_t fscache_n_cop_lookup_object; +atomic_t fscache_n_cop_lookup_complete; +atomic_t fscache_n_cop_grab_object; +atomic_t fscache_n_cop_invalidate_object; +atomic_t fscache_n_cop_update_object; +atomic_t fscache_n_cop_drop_object; +atomic_t fscache_n_cop_put_object; +atomic_t fscache_n_cop_sync_cache; +atomic_t fscache_n_cop_attr_changed; +atomic_t fscache_n_cop_read_or_alloc_page; +atomic_t fscache_n_cop_read_or_alloc_pages; +atomic_t fscache_n_cop_allocate_page; +atomic_t fscache_n_cop_allocate_pages; +atomic_t fscache_n_cop_write_page; +atomic_t fscache_n_cop_uncache_page; +atomic_t fscache_n_cop_dissociate_pages; + +/* + * display the general statistics + */ +static int fscache_stats_show(struct seq_file *m, void *v) +{ + seq_puts(m, "FS-Cache statistics\n"); + + seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", + atomic_read(&fscache_n_cookie_index), + atomic_read(&fscache_n_cookie_data), + atomic_read(&fscache_n_cookie_special)); + + seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", + atomic_read(&fscache_n_object_alloc), + atomic_read(&fscache_n_object_no_alloc), + atomic_read(&fscache_n_object_avail), + atomic_read(&fscache_n_object_dead)); + seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", + atomic_read(&fscache_n_checkaux_none), + atomic_read(&fscache_n_checkaux_okay), + atomic_read(&fscache_n_checkaux_update), + atomic_read(&fscache_n_checkaux_obsolete)); + + seq_printf(m, "Pages : mrk=%u unc=%u\n", + atomic_read(&fscache_n_marks), + atomic_read(&fscache_n_uncaches)); + + seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" + " oom=%u\n", + atomic_read(&fscache_n_acquires), + atomic_read(&fscache_n_acquires_null), + atomic_read(&fscache_n_acquires_no_cache), + atomic_read(&fscache_n_acquires_ok), + atomic_read(&fscache_n_acquires_nobufs), + atomic_read(&fscache_n_acquires_oom)); + + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n", + atomic_read(&fscache_n_object_lookups), + atomic_read(&fscache_n_object_lookups_negative), + atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_created), + atomic_read(&fscache_n_object_lookups_timed_out)); + + seq_printf(m, "Invals : n=%u run=%u\n", + atomic_read(&fscache_n_invalidates), + atomic_read(&fscache_n_invalidates_run)); + + seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + atomic_read(&fscache_n_updates), + atomic_read(&fscache_n_updates_null), + atomic_read(&fscache_n_updates_run)); + + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n", + atomic_read(&fscache_n_relinquishes), + atomic_read(&fscache_n_relinquishes_null), + atomic_read(&fscache_n_relinquishes_waitcrt), + atomic_read(&fscache_n_relinquishes_retire)); + + seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", + atomic_read(&fscache_n_attr_changed), + atomic_read(&fscache_n_attr_changed_ok), + atomic_read(&fscache_n_attr_changed_nobufs), + atomic_read(&fscache_n_attr_changed_nomem), + atomic_read(&fscache_n_attr_changed_calls)); + + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n", + atomic_read(&fscache_n_allocs), + atomic_read(&fscache_n_allocs_ok), + atomic_read(&fscache_n_allocs_wait), + atomic_read(&fscache_n_allocs_nobufs), + atomic_read(&fscache_n_allocs_intr)); + seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n", + atomic_read(&fscache_n_alloc_ops), + atomic_read(&fscache_n_alloc_op_waits), + atomic_read(&fscache_n_allocs_object_dead)); + + seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" + " int=%u oom=%u\n", + atomic_read(&fscache_n_retrievals), + atomic_read(&fscache_n_retrievals_ok), + atomic_read(&fscache_n_retrievals_wait), + atomic_read(&fscache_n_retrievals_nodata), + atomic_read(&fscache_n_retrievals_nobufs), + atomic_read(&fscache_n_retrievals_intr), + atomic_read(&fscache_n_retrievals_nomem)); + seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n", + atomic_read(&fscache_n_retrieval_ops), + atomic_read(&fscache_n_retrieval_op_waits), + atomic_read(&fscache_n_retrievals_object_dead)); + + seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", + atomic_read(&fscache_n_stores), + atomic_read(&fscache_n_stores_ok), + atomic_read(&fscache_n_stores_again), + atomic_read(&fscache_n_stores_nobufs), + atomic_read(&fscache_n_stores_oom)); + seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n", + atomic_read(&fscache_n_store_ops), + atomic_read(&fscache_n_store_calls), + atomic_read(&fscache_n_store_pages), + atomic_read(&fscache_n_store_radix_deletes), + atomic_read(&fscache_n_store_pages_over_limit)); + + seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n", + atomic_read(&fscache_n_store_vmscan_not_storing), + atomic_read(&fscache_n_store_vmscan_gone), + atomic_read(&fscache_n_store_vmscan_busy), + atomic_read(&fscache_n_store_vmscan_cancelled), + atomic_read(&fscache_n_store_vmscan_wait)); + + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", + atomic_read(&fscache_n_op_pend), + atomic_read(&fscache_n_op_run), + atomic_read(&fscache_n_op_enqueue), + atomic_read(&fscache_n_op_cancelled), + atomic_read(&fscache_n_op_rejected)); + seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_deferred_release), + atomic_read(&fscache_n_op_release), + atomic_read(&fscache_n_op_gc)); + + seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n", + atomic_read(&fscache_n_cop_alloc_object), + atomic_read(&fscache_n_cop_lookup_object), + atomic_read(&fscache_n_cop_lookup_complete), + atomic_read(&fscache_n_cop_grab_object)); + seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n", + atomic_read(&fscache_n_cop_invalidate_object), + atomic_read(&fscache_n_cop_update_object), + atomic_read(&fscache_n_cop_drop_object), + atomic_read(&fscache_n_cop_put_object), + atomic_read(&fscache_n_cop_attr_changed), + atomic_read(&fscache_n_cop_sync_cache)); + seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n", + atomic_read(&fscache_n_cop_read_or_alloc_page), + atomic_read(&fscache_n_cop_read_or_alloc_pages), + atomic_read(&fscache_n_cop_allocate_page), + atomic_read(&fscache_n_cop_allocate_pages), + atomic_read(&fscache_n_cop_write_page), + atomic_read(&fscache_n_cop_uncache_page), + atomic_read(&fscache_n_cop_dissociate_pages)); + return 0; +} + +/* + * open "/proc/fs/fscache/stats" allowing provision of a statistical summary + */ +static int fscache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, fscache_stats_show, NULL); +} + +const struct file_operations fscache_stats_fops = { + .owner = THIS_MODULE, + .open = fscache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; diff --git a/kernel/fs/fuse/Kconfig b/kernel/fs/fuse/Kconfig new file mode 100644 index 000000000..1b2f6c2c3 --- /dev/null +++ b/kernel/fs/fuse/Kconfig @@ -0,0 +1,27 @@ +config FUSE_FS + tristate "FUSE (Filesystem in Userspace) support" + help + With FUSE it is possible to implement a fully functional filesystem + in a userspace program. + + There's also a companion library: libfuse2. This library is available + from the FUSE homepage: + + although chances are your distribution already has that library + installed if you've installed the "fuse" package itself. + + See for more information. + See for needed library/utility version. + + If you want to develop a userspace FS, or if you want to use + a filesystem based on FUSE, answer Y or M. + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. diff --git a/kernel/fs/fuse/Makefile b/kernel/fs/fuse/Makefile new file mode 100644 index 000000000..e95eeb445 --- /dev/null +++ b/kernel/fs/fuse/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the FUSE filesystem. +# + +obj-$(CONFIG_FUSE_FS) += fuse.o +obj-$(CONFIG_CUSE) += cuse.o + +fuse-objs := dev.o dir.o file.o inode.o control.o diff --git a/kernel/fs/fuse/control.c b/kernel/fs/fuse/control.c new file mode 100644 index 000000000..f863ac664 --- /dev/null +++ b/kernel/fs/fuse/control.c @@ -0,0 +1,354 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include + +#define FUSE_CTL_SUPER_MAGIC 0x65735543 + +/* + * This is non-NULL when the single instance of the control filesystem + * exists. Protected by fuse_mutex + */ +static struct super_block *fuse_control_sb; + +static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) +{ + struct fuse_conn *fc; + mutex_lock(&fuse_mutex); + fc = file_inode(file)->i_private; + if (fc) + fc = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + return fc; +} + +static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fuse_abort_conn(fc); + fuse_conn_put(fc); + } + return count; +} + +static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[32]; + size_t size; + + if (!*ppos) { + long value; + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + value = atomic_read(&fc->num_waiting); + file->private_data = (void *)value; + fuse_conn_put(fc); + } + size = sprintf(tmp, "%ld\n", (long)file->private_data); + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos, unsigned val) +{ + char tmp[32]; + size_t size = sprintf(tmp, "%u\n", val); + + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, unsigned *val, + unsigned global_limit) +{ + unsigned long t; + unsigned limit = (1 << 16) - 1; + int err; + + if (*ppos) + return -EINVAL; + + err = kstrtoul_from_user(buf, count, 0, &t); + if (err) + return err; + + if (!capable(CAP_SYS_ADMIN)) + limit = min(limit, global_limit); + + if (t > limit) + return -EINVAL; + + *val = t; + + return count; +} + +static ssize_t fuse_conn_max_background_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->max_background; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_max_background_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned uninitialized_var(val); + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_bgreq); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->max_background = val; + fuse_conn_put(fc); + } + } + + return ret; +} + +static ssize_t fuse_conn_congestion_threshold_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->congestion_threshold; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_congestion_threshold_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned uninitialized_var(val); + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_congthresh); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->congestion_threshold = val; + fuse_conn_put(fc); + } + } + + return ret; +} + +static const struct file_operations fuse_ctl_abort_ops = { + .open = nonseekable_open, + .write = fuse_conn_abort_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_ctl_waiting_ops = { + .open = nonseekable_open, + .read = fuse_conn_waiting_read, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_max_background_ops = { + .open = nonseekable_open, + .read = fuse_conn_max_background_read, + .write = fuse_conn_max_background_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_congestion_threshold_ops = { + .open = nonseekable_open, + .read = fuse_conn_congestion_threshold_read, + .write = fuse_conn_congestion_threshold_write, + .llseek = no_llseek, +}; + +static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, + struct fuse_conn *fc, + const char *name, + int mode, int nlink, + const struct inode_operations *iop, + const struct file_operations *fop) +{ + struct dentry *dentry; + struct inode *inode; + + BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + inode = new_inode(fuse_control_sb); + if (!inode) + return NULL; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = fc->user_id; + inode->i_gid = fc->group_id; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* setting ->i_op to NULL is not allowed */ + if (iop) + inode->i_op = iop; + inode->i_fop = fop; + set_nlink(inode, nlink); + inode->i_private = fc; + d_add(dentry, inode); + return dentry; +} + +/* + * Add a connection to the control filesystem (if it exists). Caller + * must hold fuse_mutex + */ +int fuse_ctl_add_conn(struct fuse_conn *fc) +{ + struct dentry *parent; + char name[32]; + + if (!fuse_control_sb) + return 0; + + parent = fuse_control_sb->s_root; + inc_nlink(d_inode(parent)); + sprintf(name, "%u", fc->dev); + parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, + &simple_dir_inode_operations, + &simple_dir_operations); + if (!parent) + goto err; + + if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, + NULL, &fuse_ctl_waiting_ops) || + !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, + NULL, &fuse_ctl_abort_ops) || + !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, + 1, NULL, &fuse_conn_max_background_ops) || + !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", + S_IFREG | 0600, 1, NULL, + &fuse_conn_congestion_threshold_ops)) + goto err; + + return 0; + + err: + fuse_ctl_remove_conn(fc); + return -ENOMEM; +} + +/* + * Remove a connection from the control filesystem (if it exists). + * Caller must hold fuse_mutex + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc) +{ + int i; + + if (!fuse_control_sb) + return; + + for (i = fc->ctl_ndents - 1; i >= 0; i--) { + struct dentry *dentry = fc->ctl_dentry[i]; + d_inode(dentry)->i_private = NULL; + d_drop(dentry); + dput(dentry); + } + drop_nlink(d_inode(fuse_control_sb->s_root)); +} + +static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct tree_descr empty_descr = {""}; + struct fuse_conn *fc; + int err; + + err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr); + if (err) + return err; + + mutex_lock(&fuse_mutex); + BUG_ON(fuse_control_sb); + fuse_control_sb = sb; + list_for_each_entry(fc, &fuse_conn_list, entry) { + err = fuse_ctl_add_conn(fc); + if (err) { + fuse_control_sb = NULL; + mutex_unlock(&fuse_mutex); + return err; + } + } + mutex_unlock(&fuse_mutex); + + return 0; +} + +static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super); +} + +static void fuse_ctl_kill_sb(struct super_block *sb) +{ + struct fuse_conn *fc; + + mutex_lock(&fuse_mutex); + fuse_control_sb = NULL; + list_for_each_entry(fc, &fuse_conn_list, entry) + fc->ctl_ndents = 0; + mutex_unlock(&fuse_mutex); + + kill_litter_super(sb); +} + +static struct file_system_type fuse_ctl_fs_type = { + .owner = THIS_MODULE, + .name = "fusectl", + .mount = fuse_ctl_mount, + .kill_sb = fuse_ctl_kill_sb, +}; +MODULE_ALIAS_FS("fusectl"); + +int __init fuse_ctl_init(void) +{ + return register_filesystem(&fuse_ctl_fs_type); +} + +void __exit fuse_ctl_cleanup(void) +{ + unregister_filesystem(&fuse_ctl_fs_type); +} diff --git a/kernel/fs/fuse/cuse.c b/kernel/fs/fuse/cuse.c new file mode 100644 index 000000000..e5bbf748b --- /dev/null +++ b/kernel/fs/fuse/cuse.c @@ -0,0 +1,636 @@ +/* + * CUSE: Character device in Userspace + * + * Copyright (C) 2008-2009 SUSE Linux Products GmbH + * Copyright (C) 2008-2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * CUSE enables character devices to be implemented from userland much + * like FUSE allows filesystems. On initialization /dev/cuse is + * created. By opening the file and replying to the CUSE_INIT request + * userland CUSE server can create a character device. After that the + * operation is very similar to FUSE. + * + * A CUSE instance involves the following objects. + * + * cuse_conn : contains fuse_conn and serves as bonding structure + * channel : file handle connected to the userland CUSE server + * cdev : the implemented character device + * dev : generic device for cdev + * + * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with + * devices, it's called 'channel' to reduce confusion. + * + * channel determines when the character device dies. When channel is + * closed, everything begins to destruct. The cuse_conn is taken off + * the lookup table preventing further access from cdev, cdev and + * generic device are removed and the base reference of cuse_conn is + * put. + * + * On each open, the matching cuse_conn is looked up and if found an + * additional reference is taken which is released when the file is + * closed. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fuse_i.h" + +#define CUSE_CONNTBL_LEN 64 + +struct cuse_conn { + struct list_head list; /* linked on cuse_conntbl */ + struct fuse_conn fc; /* fuse connection */ + struct cdev *cdev; /* associated character device */ + struct device *dev; /* device representing @cdev */ + + /* init parameters, set once during initialization */ + bool unrestricted_ioctl; +}; + +static DEFINE_MUTEX(cuse_lock); /* protects registration */ +static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; +static struct class *cuse_class; + +static struct cuse_conn *fc_to_cc(struct fuse_conn *fc) +{ + return container_of(fc, struct cuse_conn, fc); +} + +static struct list_head *cuse_conntbl_head(dev_t devt) +{ + return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN]; +} + + +/************************************************************************** + * CUSE frontend operations + * + * These are file operations for the character device. + * + * On open, CUSE opens a file from the FUSE mnt and stores it to + * private_data of the open file. All other ops call FUSE ops on the + * FUSE file. + */ + +static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) +{ + struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; + loff_t pos = 0; + + return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); +} + +static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) +{ + struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; + loff_t pos = 0; + /* + * No locking or generic_write_checks(), the server is + * responsible for locking and sanity checks. + */ + return fuse_direct_io(&io, from, &pos, + FUSE_DIO_WRITE | FUSE_DIO_CUSE); +} + +static int cuse_open(struct inode *inode, struct file *file) +{ + dev_t devt = inode->i_cdev->dev; + struct cuse_conn *cc = NULL, *pos; + int rc; + + /* look up and get the connection */ + mutex_lock(&cuse_lock); + list_for_each_entry(pos, cuse_conntbl_head(devt), list) + if (pos->dev->devt == devt) { + fuse_conn_get(&pos->fc); + cc = pos; + break; + } + mutex_unlock(&cuse_lock); + + /* dead? */ + if (!cc) + return -ENODEV; + + /* + * Generic permission check is already done against the chrdev + * file, proceed to open. + */ + rc = fuse_do_open(&cc->fc, 0, file, 0); + if (rc) + fuse_conn_put(&cc->fc); + return rc; +} + +static int cuse_release(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_sync_release(ff, file->f_flags); + fuse_conn_put(fc); + + return 0; +} + +static long cuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = 0; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = FUSE_IOCTL_COMPAT; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static const struct file_operations cuse_frontend_fops = { + .owner = THIS_MODULE, + .read_iter = cuse_read_iter, + .write_iter = cuse_write_iter, + .open = cuse_open, + .release = cuse_release, + .unlocked_ioctl = cuse_file_ioctl, + .compat_ioctl = cuse_file_compat_ioctl, + .poll = fuse_file_poll, + .llseek = noop_llseek, +}; + + +/************************************************************************** + * CUSE channel initialization and destruction + */ + +struct cuse_devinfo { + const char *name; +}; + +/** + * cuse_parse_one - parse one key=value pair + * @pp: i/o parameter for the current position + * @end: points to one past the end of the packed string + * @keyp: out parameter for key + * @valp: out parameter for value + * + * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends + * at @end - 1. This function parses one pair and set *@keyp to the + * start of the key and *@valp to the start of the value. Note that + * the original string is modified such that the key string is + * terminated with '\0'. *@pp is updated to point to the next string. + * + * RETURNS: + * 1 on successful parse, 0 on EOF, -errno on failure. + */ +static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) +{ + char *p = *pp; + char *key, *val; + + while (p < end && *p == '\0') + p++; + if (p == end) + return 0; + + if (end[-1] != '\0') { + printk(KERN_ERR "CUSE: info not properly terminated\n"); + return -EINVAL; + } + + key = val = p; + p += strlen(p); + + if (valp) { + strsep(&val, "="); + if (!val) + val = key + strlen(key); + key = strstrip(key); + val = strstrip(val); + } else + key = strstrip(key); + + if (!strlen(key)) { + printk(KERN_ERR "CUSE: zero length info key specified\n"); + return -EINVAL; + } + + *pp = p; + *keyp = key; + if (valp) + *valp = val; + + return 1; +} + +/** + * cuse_parse_dev_info - parse device info + * @p: device info string + * @len: length of device info string + * @devinfo: out parameter for parsed device info + * + * Parse @p to extract device info and store it into @devinfo. String + * pointed to by @p is modified by parsing and @devinfo points into + * them, so @p shouldn't be freed while @devinfo is in use. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) +{ + char *end = p + len; + char *uninitialized_var(key), *uninitialized_var(val); + int rc; + + while (true) { + rc = cuse_parse_one(&p, end, &key, &val); + if (rc < 0) + return rc; + if (!rc) + break; + if (strcmp(key, "DEVNAME") == 0) + devinfo->name = val; + else + printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n", + key); + } + + if (!devinfo->name || !strlen(devinfo->name)) { + printk(KERN_ERR "CUSE: DEVNAME unspecified\n"); + return -EINVAL; + } + + return 0; +} + +static void cuse_gendev_release(struct device *dev) +{ + kfree(dev); +} + +/** + * cuse_process_init_reply - finish initializing CUSE channel + * + * This function creates the character device and sets up all the + * required data structures for it. Please read the comment at the + * top of this file for high level overview. + */ +static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct cuse_conn *cc = fc_to_cc(fc), *pos; + struct cuse_init_out *arg = req->out.args[0].value; + struct page *page = req->pages[0]; + struct cuse_devinfo devinfo = { }; + struct device *dev; + struct cdev *cdev; + dev_t devt; + int rc, i; + + if (req->out.h.error || + arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + goto err; + } + + fc->minor = arg->minor; + fc->max_read = max_t(unsigned, arg->max_read, 4096); + fc->max_write = max_t(unsigned, arg->max_write, 4096); + + /* parse init reply */ + cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; + + rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + &devinfo); + if (rc) + goto err; + + /* determine and reserve devt */ + devt = MKDEV(arg->dev_major, arg->dev_minor); + if (!MAJOR(devt)) + rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name); + else + rc = register_chrdev_region(devt, 1, devinfo.name); + if (rc) { + printk(KERN_ERR "CUSE: failed to register chrdev region\n"); + goto err; + } + + /* devt determined, create device */ + rc = -ENOMEM; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + goto err_region; + + device_initialize(dev); + dev_set_uevent_suppress(dev, 1); + dev->class = cuse_class; + dev->devt = devt; + dev->release = cuse_gendev_release; + dev_set_drvdata(dev, cc); + dev_set_name(dev, "%s", devinfo.name); + + mutex_lock(&cuse_lock); + + /* make sure the device-name is unique */ + for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { + list_for_each_entry(pos, &cuse_conntbl[i], list) + if (!strcmp(dev_name(pos->dev), dev_name(dev))) + goto err_unlock; + } + + rc = device_add(dev); + if (rc) + goto err_unlock; + + /* register cdev */ + rc = -ENOMEM; + cdev = cdev_alloc(); + if (!cdev) + goto err_unlock; + + cdev->owner = THIS_MODULE; + cdev->ops = &cuse_frontend_fops; + + rc = cdev_add(cdev, devt, 1); + if (rc) + goto err_cdev; + + cc->dev = dev; + cc->cdev = cdev; + + /* make the device available */ + list_add(&cc->list, cuse_conntbl_head(devt)); + mutex_unlock(&cuse_lock); + + /* announce device availability */ + dev_set_uevent_suppress(dev, 0); + kobject_uevent(&dev->kobj, KOBJ_ADD); +out: + kfree(arg); + __free_page(page); + return; + +err_cdev: + cdev_del(cdev); +err_unlock: + mutex_unlock(&cuse_lock); + put_device(dev); +err_region: + unregister_chrdev_region(devt, 1); +err: + fuse_abort_conn(fc); + goto out; +} + +static int cuse_send_init(struct cuse_conn *cc) +{ + int rc; + struct fuse_req *req; + struct page *page; + struct fuse_conn *fc = &cc->fc; + struct cuse_init_in *arg; + void *outarg; + + BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); + + req = fuse_get_req_for_background(fc, 1); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto err; + } + + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto err_put_req; + + outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); + if (!outarg) + goto err_free_page; + + arg = &req->misc.cuse_init_in; + arg->major = FUSE_KERNEL_VERSION; + arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->flags |= CUSE_UNRESTRICTED_IOCTL; + req->in.h.opcode = CUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct cuse_init_in); + req->in.args[0].value = arg; + req->out.numargs = 2; + req->out.args[0].size = sizeof(struct cuse_init_out); + req->out.args[0].value = outarg; + req->out.args[1].size = CUSE_INIT_INFO_MAX; + req->out.argvar = 1; + req->out.argpages = 1; + req->pages[0] = page; + req->page_descs[0].length = req->out.args[1].size; + req->num_pages = 1; + req->end = cuse_process_init_reply; + fuse_request_send_background(fc, req); + + return 0; + +err_free_page: + __free_page(page); +err_put_req: + fuse_put_request(fc, req); +err: + return rc; +} + +static void cuse_fc_release(struct fuse_conn *fc) +{ + struct cuse_conn *cc = fc_to_cc(fc); + kfree_rcu(cc, fc.rcu); +} + +/** + * cuse_channel_open - open method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being opened + * + * Userland CUSE server can create a CUSE device by opening /dev/cuse + * and replying to the initialization request kernel sends. This + * function is responsible for handling CUSE device initialization. + * Because the fd opened by this function is used during + * initialization, this function only creates cuse_conn and sends + * init. The rest is delegated to a kthread. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_open(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc; + int rc; + + /* set up cuse_conn */ + cc = kzalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + + fuse_conn_init(&cc->fc); + + INIT_LIST_HEAD(&cc->list); + cc->fc.release = cuse_fc_release; + + cc->fc.connected = 1; + cc->fc.initialized = 1; + rc = cuse_send_init(cc); + if (rc) { + fuse_conn_put(&cc->fc); + return rc; + } + file->private_data = &cc->fc; /* channel owns base reference to cc */ + + return 0; +} + +/** + * cuse_channel_release - release method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being closed + * + * Disconnect the channel, deregister CUSE device and initiate + * destruction by putting the default reference. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_release(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc = fc_to_cc(file->private_data); + int rc; + + /* remove from the conntbl, no more access from this point on */ + mutex_lock(&cuse_lock); + list_del_init(&cc->list); + mutex_unlock(&cuse_lock); + + /* remove device */ + if (cc->dev) + device_unregister(cc->dev); + if (cc->cdev) { + unregister_chrdev_region(cc->cdev->dev, 1); + cdev_del(cc->cdev); + } + + rc = fuse_dev_release(inode, file); /* puts the base reference */ + + return rc; +} + +static struct file_operations cuse_channel_fops; /* initialized during init */ + + +/************************************************************************** + * Misc stuff and module initializatiion + * + * CUSE exports the same set of attributes to sysfs as fusectl. + */ + +static ssize_t cuse_class_waiting_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); +} +static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL); + +static ssize_t cuse_class_abort_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + fuse_abort_conn(&cc->fc); + return count; +} +static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); + +static struct attribute *cuse_class_dev_attrs[] = { + &dev_attr_waiting.attr, + &dev_attr_abort.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cuse_class_dev); + +static struct miscdevice cuse_miscdev = { + .minor = CUSE_MINOR, + .name = "cuse", + .fops = &cuse_channel_fops, +}; + +MODULE_ALIAS_MISCDEV(CUSE_MINOR); +MODULE_ALIAS("devname:cuse"); + +static int __init cuse_init(void) +{ + int i, rc; + + /* init conntbl */ + for (i = 0; i < CUSE_CONNTBL_LEN; i++) + INIT_LIST_HEAD(&cuse_conntbl[i]); + + /* inherit and extend fuse_dev_operations */ + cuse_channel_fops = fuse_dev_operations; + cuse_channel_fops.owner = THIS_MODULE; + cuse_channel_fops.open = cuse_channel_open; + cuse_channel_fops.release = cuse_channel_release; + + cuse_class = class_create(THIS_MODULE, "cuse"); + if (IS_ERR(cuse_class)) + return PTR_ERR(cuse_class); + + cuse_class->dev_groups = cuse_class_dev_groups; + + rc = misc_register(&cuse_miscdev); + if (rc) { + class_destroy(cuse_class); + return rc; + } + + return 0; +} + +static void __exit cuse_exit(void) +{ + misc_deregister(&cuse_miscdev); + class_destroy(cuse_class); +} + +module_init(cuse_init); +module_exit(cuse_exit); + +MODULE_AUTHOR("Tejun Heo "); +MODULE_DESCRIPTION("Character device in Userspace"); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/fuse/dev.c b/kernel/fs/fuse/dev.c new file mode 100644 index 000000000..c8b68ab2e --- /dev/null +++ b/kernel/fs/fuse/dev.c @@ -0,0 +1,2273 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); + +static struct kmem_cache *fuse_req_cachep; + +static struct fuse_conn *fuse_get_conn(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return file->private_data; +} + +static void fuse_request_init(struct fuse_req *req, struct page **pages, + struct fuse_page_desc *page_descs, + unsigned npages) +{ + memset(req, 0, sizeof(*req)); + memset(pages, 0, sizeof(*pages) * npages); + memset(page_descs, 0, sizeof(*page_descs) * npages); + INIT_LIST_HEAD(&req->list); + INIT_LIST_HEAD(&req->intr_entry); + init_waitqueue_head(&req->waitq); + atomic_set(&req->count, 1); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; +} + +static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + if (req) { + struct page **pages; + struct fuse_page_desc *page_descs; + + if (npages <= FUSE_REQ_INLINE_PAGES) { + pages = req->inline_pages; + page_descs = req->inline_page_descs; + } else { + pages = kmalloc(sizeof(struct page *) * npages, flags); + page_descs = kmalloc(sizeof(struct fuse_page_desc) * + npages, flags); + } + + if (!pages || !page_descs) { + kfree(pages); + kfree(page_descs); + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + + fuse_request_init(req, pages, page_descs, npages); + } + return req; +} + +struct fuse_req *fuse_request_alloc(unsigned npages) +{ + return __fuse_request_alloc(npages, GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(fuse_request_alloc); + +struct fuse_req *fuse_request_alloc_nofs(unsigned npages) +{ + return __fuse_request_alloc(npages, GFP_NOFS); +} + +void fuse_request_free(struct fuse_req *req) +{ + if (req->pages != req->inline_pages) { + kfree(req->pages); + kfree(req->page_descs); + } + kmem_cache_free(fuse_req_cachep, req); +} + +static void block_sigs(sigset_t *oldset) +{ + sigset_t mask; + + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} + +void __fuse_get_request(struct fuse_req *req) +{ + atomic_inc(&req->count); +} + +/* Must be called with > 1 refcount */ +static void __fuse_put_request(struct fuse_req *req) +{ + BUG_ON(atomic_read(&req->count) < 2); + atomic_dec(&req->count); +} + +static void fuse_req_init_context(struct fuse_req *req) +{ + req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); + req->in.h.pid = current->pid; +} + +void fuse_set_initialized(struct fuse_conn *fc) +{ + /* Make sure stores before this are seen on another CPU */ + smp_wmb(); + fc->initialized = 1; +} + +static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) +{ + return !fc->initialized || (for_background && fc->blocked); +} + +static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, + bool for_background) +{ + struct fuse_req *req; + int err; + atomic_inc(&fc->num_waiting); + + if (fuse_block_alloc(fc, for_background)) { + sigset_t oldset; + int intr; + + block_sigs(&oldset); + intr = wait_event_interruptible_exclusive(fc->blocked_waitq, + !fuse_block_alloc(fc, for_background)); + restore_sigs(&oldset); + err = -EINTR; + if (intr) + goto out; + } + /* Matches smp_wmb() in fuse_set_initialized() */ + smp_rmb(); + + err = -ENOTCONN; + if (!fc->connected) + goto out; + + req = fuse_request_alloc(npages); + err = -ENOMEM; + if (!req) { + if (for_background) + wake_up(&fc->blocked_waitq); + goto out; + } + + fuse_req_init_context(req); + req->waiting = 1; + req->background = for_background; + return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); +} + +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) +{ + return __fuse_get_req(fc, npages, false); +} +EXPORT_SYMBOL_GPL(fuse_get_req); + +struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, + unsigned npages) +{ + return __fuse_get_req(fc, npages, true); +} +EXPORT_SYMBOL_GPL(fuse_get_req_for_background); + +/* + * Return request in fuse_file->reserved_req. However that may + * currently be in use. If that is the case, wait for it to become + * available. + */ +static struct fuse_req *get_reserved_req(struct fuse_conn *fc, + struct file *file) +{ + struct fuse_req *req = NULL; + struct fuse_file *ff = file->private_data; + + do { + wait_event(fc->reserved_req_waitq, ff->reserved_req); + spin_lock(&fc->lock); + if (ff->reserved_req) { + req = ff->reserved_req; + ff->reserved_req = NULL; + req->stolen_file = get_file(file); + } + spin_unlock(&fc->lock); + } while (!req); + + return req; +} + +/* + * Put stolen request back into fuse_file->reserved_req + */ +static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) +{ + struct file *file = req->stolen_file; + struct fuse_file *ff = file->private_data; + + spin_lock(&fc->lock); + fuse_request_init(req, req->pages, req->page_descs, req->max_pages); + BUG_ON(ff->reserved_req); + ff->reserved_req = req; + wake_up_all(&fc->reserved_req_waitq); + spin_unlock(&fc->lock); + fput(file); +} + +/* + * Gets a requests for a file operation, always succeeds + * + * This is used for sending the FLUSH request, which must get to + * userspace, due to POSIX locks which may need to be unlocked. + * + * If allocation fails due to OOM, use the reserved request in + * fuse_file. + * + * This is very unlikely to deadlock accidentally, since the + * filesystem should not have it's own file open. If deadlock is + * intentional, it can still be broken by "aborting" the filesystem. + */ +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file) +{ + struct fuse_req *req; + + atomic_inc(&fc->num_waiting); + wait_event(fc->blocked_waitq, fc->initialized); + /* Matches smp_wmb() in fuse_set_initialized() */ + smp_rmb(); + req = fuse_request_alloc(0); + if (!req) + req = get_reserved_req(fc, file); + + fuse_req_init_context(req); + req->waiting = 1; + req->background = 0; + return req; +} + +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (atomic_dec_and_test(&req->count)) { + if (unlikely(req->background)) { + /* + * We get here in the unlikely case that a background + * request was allocated but not sent + */ + spin_lock(&fc->lock); + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->lock); + } + + if (req->waiting) + atomic_dec(&fc->num_waiting); + + if (req->stolen_file) + put_reserved_req(fc, req); + else + fuse_request_free(req); + } +} +EXPORT_SYMBOL_GPL(fuse_put_request); + +static unsigned len_args(unsigned numargs, struct fuse_arg *args) +{ + unsigned nbytes = 0; + unsigned i; + + for (i = 0; i < numargs; i++) + nbytes += args[i].size; + + return nbytes; +} + +static u64 fuse_get_unique(struct fuse_conn *fc) +{ + fc->reqctr++; + /* zero is special */ + if (fc->reqctr == 0) + fc->reqctr = 1; + + return fc->reqctr; +} + +static void queue_request(struct fuse_conn *fc, struct fuse_req *req) +{ + req->in.h.len = sizeof(struct fuse_in_header) + + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); + list_add_tail(&req->list, &fc->pending); + req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup) +{ + forget->forget_one.nodeid = nodeid; + forget->forget_one.nlookup = nlookup; + + spin_lock(&fc->lock); + if (fc->connected) { + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } else { + kfree(forget); + } + spin_unlock(&fc->lock); +} + +static void flush_bg_queue(struct fuse_conn *fc) +{ + while (fc->active_background < fc->max_background && + !list_empty(&fc->bg_queue)) { + struct fuse_req *req; + + req = list_entry(fc->bg_queue.next, struct fuse_req, list); + list_del(&req->list); + fc->active_background++; + req->in.h.unique = fuse_get_unique(fc); + queue_request(fc, req); + } +} + +/* + * This function is called when a request is finished. Either a reply + * has arrived or it was aborted (and not yet sent) or some error + * occurred during communication with userspace, or the device file + * was closed. The requester thread is woken up (if still waiting), + * the 'end' callback is called if given, else the reference to the + * request is released + * + * Called with fc->lock, unlocks it + */ +static void request_end(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +{ + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + req->end = NULL; + list_del(&req->list); + list_del(&req->intr_entry); + req->state = FUSE_REQ_FINISHED; + if (req->background) { + req->background = 0; + + if (fc->num_background == fc->max_background) + fc->blocked = 0; + + /* Wake up next waiter, if any */ + if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + + if (fc->num_background == fc->congestion_threshold && + fc->connected && fc->bdi_initialized) { + clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); + clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + } + fc->num_background--; + fc->active_background--; + flush_bg_queue(fc); + } + spin_unlock(&fc->lock); + wake_up(&req->waitq); + if (end) + end(fc, req); + fuse_put_request(fc, req); +} + +static void wait_answer_interruptible(struct fuse_conn *fc, + struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + if (signal_pending(current)) + return; + + spin_unlock(&fc->lock); + wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); +} + +static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +{ + list_add_tail(&req->intr_entry, &fc->interrupts); + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + if (!fc->no_interrupt) { + /* Any signal may interrupt this */ + wait_answer_interruptible(fc, req); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + req->interrupted = 1; + if (req->state == FUSE_REQ_SENT) + queue_interrupt(fc, req); + } + + if (!req->force) { + sigset_t oldset; + + /* Only fatal signals may interrupt this */ + block_sigs(&oldset); + wait_answer_interruptible(fc, req); + restore_sigs(&oldset); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + /* Request is not yet in userspace, bail out */ + if (req->state == FUSE_REQ_PENDING) { + list_del(&req->list); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return; + } + } + + /* + * Either request is already in userspace, or it was forced. + * Wait it out. + */ + spin_unlock(&fc->lock); + wait_event(req->waitq, req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); + + if (!req->aborted) + return; + + aborted: + BUG_ON(req->state != FUSE_REQ_FINISHED); + if (req->locked) { + /* This is uninterruptible sleep, because data is + being copied to/from the buffers of req. During + locked state, there mustn't be any filesystem + operation (e.g. page fault), since that could lead + to deadlock */ + spin_unlock(&fc->lock); + wait_event(req->waitq, !req->locked); + spin_lock(&fc->lock); + } +} + +static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + BUG_ON(req->background); + spin_lock(&fc->lock); + if (!fc->connected) + req->out.h.error = -ENOTCONN; + else if (fc->conn_error) + req->out.h.error = -ECONNREFUSED; + else { + req->in.h.unique = fuse_get_unique(fc); + queue_request(fc, req); + /* acquire extra reference, since request is still needed + after request_end() */ + __fuse_get_request(req); + + request_wait_answer(fc, req); + } + spin_unlock(&fc->lock); +} + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + __fuse_request_send(fc, req); +} +EXPORT_SYMBOL_GPL(fuse_request_send); + +static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) +{ + if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS) + args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE; + + if (fc->minor < 9) { + switch (args->in.h.opcode) { + case FUSE_LOOKUP: + case FUSE_CREATE: + case FUSE_MKNOD: + case FUSE_MKDIR: + case FUSE_SYMLINK: + case FUSE_LINK: + args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + break; + case FUSE_GETATTR: + case FUSE_SETATTR: + args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + break; + } + } + if (fc->minor < 12) { + switch (args->in.h.opcode) { + case FUSE_CREATE: + args->in.args[0].size = sizeof(struct fuse_open_in); + break; + case FUSE_MKNOD: + args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; + break; + } + } +} + +ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) +{ + struct fuse_req *req; + ssize_t ret; + + req = fuse_get_req(fc, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* Needs to be done after fuse_get_req() so that fc->minor is valid */ + fuse_adjust_compat(fc, args); + + req->in.h.opcode = args->in.h.opcode; + req->in.h.nodeid = args->in.h.nodeid; + req->in.numargs = args->in.numargs; + memcpy(req->in.args, args->in.args, + args->in.numargs * sizeof(struct fuse_in_arg)); + req->out.argvar = args->out.argvar; + req->out.numargs = args->out.numargs; + memcpy(req->out.args, args->out.args, + args->out.numargs * sizeof(struct fuse_arg)); + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret && args->out.argvar) { + BUG_ON(args->out.numargs != 1); + ret = req->out.args[0].size; + } + fuse_put_request(fc, req); + + return ret; +} + +static void fuse_request_send_nowait_locked(struct fuse_conn *fc, + struct fuse_req *req) +{ + BUG_ON(!req->background); + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + if (fc->num_background == fc->congestion_threshold && + fc->bdi_initialized) { + set_bdi_congested(&fc->bdi, BLK_RW_SYNC); + set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + } + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); +} + +static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +{ + spin_lock(&fc->lock); + if (fc->connected) { + fuse_request_send_nowait_locked(fc, req); + spin_unlock(&fc->lock); + } else { + req->out.h.error = -ENOTCONN; + request_end(fc, req); + } +} + +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + fuse_request_send_nowait(fc, req); +} +EXPORT_SYMBOL_GPL(fuse_request_send_background); + +static int fuse_request_send_notify_reply(struct fuse_conn *fc, + struct fuse_req *req, u64 unique) +{ + int err = -ENODEV; + + req->isreply = 0; + req->in.h.unique = unique; + spin_lock(&fc->lock); + if (fc->connected) { + queue_request(fc, req); + err = 0; + } + spin_unlock(&fc->lock); + + return err; +} + +/* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) +{ + req->isreply = 1; + fuse_request_send_nowait_locked(fc, req); +} + +void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_forget_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + req = fuse_get_req_nofail_nopages(fc, file); + req->in.h.opcode = FUSE_FORGET; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->isreply = 0; + __fuse_request_send(fc, req); + /* ignore errors */ + fuse_put_request(fc, req); +} + +/* + * Lock the request. Up to the next unlock_request() there mustn't be + * anything that could cause a page-fault. If the request was already + * aborted bail out. + */ +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) +{ + int err = 0; + if (req) { + spin_lock(&fc->lock); + if (req->aborted) + err = -ENOENT; + else + req->locked = 1; + spin_unlock(&fc->lock); + } + return err; +} + +/* + * Unlock request. If it was aborted during being locked, the + * requester thread is currently waiting for it to be unlocked, so + * wake it up. + */ +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (req) { + spin_lock(&fc->lock); + req->locked = 0; + if (req->aborted) + wake_up(&req->waitq); + spin_unlock(&fc->lock); + } +} + +struct fuse_copy_state { + struct fuse_conn *fc; + int write; + struct fuse_req *req; + struct iov_iter *iter; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + struct page *pg; + unsigned len; + unsigned offset; + unsigned move_pages:1; +}; + +static void fuse_copy_init(struct fuse_copy_state *cs, + struct fuse_conn *fc, + int write, + struct iov_iter *iter) +{ + memset(cs, 0, sizeof(*cs)); + cs->fc = fc; + cs->write = write; + cs->iter = iter; +} + +/* Unmap and put previous page of userspace buffer */ +static void fuse_copy_finish(struct fuse_copy_state *cs) +{ + if (cs->currbuf) { + struct pipe_buffer *buf = cs->currbuf; + + if (cs->write) + buf->len = PAGE_SIZE - cs->len; + cs->currbuf = NULL; + } else if (cs->pg) { + if (cs->write) { + flush_dcache_page(cs->pg); + set_page_dirty_lock(cs->pg); + } + put_page(cs->pg); + } + cs->pg = NULL; +} + +/* + * Get another pagefull of userspace buffer, and map it to kernel + * address space, and lock request + */ +static int fuse_copy_fill(struct fuse_copy_state *cs) +{ + struct page *page; + int err; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + if (cs->pipebufs) { + struct pipe_buffer *buf = cs->pipebufs; + + if (!cs->write) { + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->pg = buf->page; + cs->offset = buf->offset; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + } else { + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + buf->page = page; + buf->offset = 0; + buf->len = 0; + + cs->currbuf = buf; + cs->pg = page; + cs->offset = 0; + cs->len = PAGE_SIZE; + cs->pipebufs++; + cs->nr_segs++; + } + } else { + size_t off; + err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off); + if (err < 0) + return err; + BUG_ON(!err); + cs->len = err; + cs->offset = off; + cs->pg = page; + cs->offset = off; + iov_iter_advance(cs->iter, err); + } + + return lock_request(cs->fc, cs->req); +} + +/* Do as much copy to/from userspace buffer as we can */ +static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) +{ + unsigned ncpy = min(*size, cs->len); + if (val) { + void *pgaddr = kmap_atomic(cs->pg); + void *buf = pgaddr + cs->offset; + + if (cs->write) + memcpy(buf, *val, ncpy); + else + memcpy(*val, buf, ncpy); + + kunmap_atomic(pgaddr); + *val += ncpy; + } + *size -= ncpy; + cs->len -= ncpy; + cs->offset += ncpy; + return ncpy; +} + +static int fuse_check_page(struct page *page) +{ + if (page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 1 || + (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + ~(1 << PG_locked | + 1 << PG_referenced | + 1 << PG_uptodate | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_reclaim))) { + printk(KERN_WARNING "fuse: trying to steal weird page\n"); + printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + return 1; + } + return 0; +} + +static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +{ + int err; + struct page *oldpage = *pagep; + struct page *newpage; + struct pipe_buffer *buf = cs->pipebufs; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + + if (cs->len != PAGE_SIZE) + goto out_fallback; + + if (buf->ops->steal(cs->pipe, buf) != 0) + goto out_fallback; + + newpage = buf->page; + + if (!PageUptodate(newpage)) + SetPageUptodate(newpage); + + ClearPageMappedToDisk(newpage); + + if (fuse_check_page(newpage) != 0) + goto out_fallback_unlock; + + /* + * This is a new and locked page, it shouldn't be mapped or + * have any special flags on it + */ + if (WARN_ON(page_mapped(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(page_has_private(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageMlocked(oldpage))) + goto out_fallback_unlock; + + err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); + if (err) { + unlock_page(newpage); + return err; + } + + page_cache_get(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + + err = 0; + spin_lock(&cs->fc->lock); + if (cs->req->aborted) + err = -ENOENT; + else + *pagep = newpage; + spin_unlock(&cs->fc->lock); + + if (err) { + unlock_page(newpage); + page_cache_release(newpage); + return err; + } + + unlock_page(oldpage); + page_cache_release(oldpage); + cs->len = 0; + + return 0; + +out_fallback_unlock: + unlock_page(newpage); +out_fallback: + cs->pg = buf->page; + cs->offset = buf->offset; + + err = lock_request(cs->fc, cs->req); + if (err) + return err; + + return 1; +} + +static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, + unsigned offset, unsigned count) +{ + struct pipe_buffer *buf; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + buf = cs->pipebufs; + page_cache_get(page); + buf->page = page; + buf->offset = offset; + buf->len = count; + + cs->pipebufs++; + cs->nr_segs++; + cs->len = 0; + + return 0; +} + +/* + * Copy a page in the request to/from the userspace buffer. Must be + * done atomically + */ +static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, + unsigned offset, unsigned count, int zeroing) +{ + int err; + struct page *page = *pagep; + + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + + while (count) { + if (cs->write && cs->pipebufs && page) { + return fuse_ref_page(cs, page, offset, count); + } else if (!cs->len) { + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { + err = fuse_try_move_page(cs, pagep); + if (err <= 0) + return err; + } else { + err = fuse_copy_fill(cs); + if (err) + return err; + } + } + if (page) { + void *mapaddr = kmap_atomic(page); + void *buf = mapaddr + offset; + offset += fuse_copy_do(cs, &buf, &count); + kunmap_atomic(mapaddr); + } else + offset += fuse_copy_do(cs, NULL, &count); + } + if (page && !cs->write) + flush_dcache_page(page); + return 0; +} + +/* Copy pages in the request to/from userspace buffer */ +static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) +{ + unsigned i; + struct fuse_req *req = cs->req; + + for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { + int err; + unsigned offset = req->page_descs[i].offset; + unsigned count = min(nbytes, req->page_descs[i].length); + + err = fuse_copy_page(cs, &req->pages[i], offset, count, + zeroing); + if (err) + return err; + + nbytes -= count; + } + return 0; +} + +/* Copy a single argument in the request to/from userspace buffer */ +static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) +{ + while (size) { + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } + fuse_copy_do(cs, &val, &size); + } + return 0; +} + +/* Copy request arguments to/from userspace buffer */ +static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) +{ + int err = 0; + unsigned i; + + for (i = 0; !err && i < numargs; i++) { + struct fuse_arg *arg = &args[i]; + if (i == numargs - 1 && argpages) + err = fuse_copy_pages(cs, arg->size, zeroing); + else + err = fuse_copy_one(cs, arg->value, arg->size); + } + return err; +} + +static int forget_pending(struct fuse_conn *fc) +{ + return fc->forget_list_head.next != NULL; +} + +static int request_pending(struct fuse_conn *fc) +{ + return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || + forget_pending(fc); +} + +/* Wait until a request is available on the pending list */ +static void request_wait(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&fc->waitq, &wait); + while (fc->connected && !request_pending(fc)) { + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + + spin_unlock(&fc->lock); + schedule(); + spin_lock(&fc->lock); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&fc->waitq, &wait); +} + +/* + * Transfer an interrupt request to userspace + * + * Unlike other requests this is assembled on demand, without a need + * to allocate a separate fuse_req structure. + * + * Called with fc->lock held, releases it + */ +static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes, struct fuse_req *req) +__releases(fc->lock) +{ + struct fuse_in_header ih; + struct fuse_interrupt_in arg; + unsigned reqsize = sizeof(ih) + sizeof(arg); + int err; + + list_del_init(&req->intr_entry); + req->intr_unique = fuse_get_unique(fc); + memset(&ih, 0, sizeof(ih)); + memset(&arg, 0, sizeof(arg)); + ih.len = reqsize; + ih.opcode = FUSE_INTERRUPT; + ih.unique = req->intr_unique; + arg.unique = req->in.h.unique; + + spin_unlock(&fc->lock); + if (nbytes < reqsize) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + return err ? err : reqsize; +} + +static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, + unsigned max, + unsigned *countp) +{ + struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link **newhead = &head; + unsigned count; + + for (count = 0; *newhead != NULL && count < max; count++) + newhead = &(*newhead)->next; + + fc->forget_list_head.next = *newhead; + *newhead = NULL; + if (fc->forget_list_head.next == NULL) + fc->forget_list_tail = &fc->forget_list_head; + + if (countp != NULL) + *countp = count; + + return head; +} + +static int fuse_read_single_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + int err; + struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_in arg = { + .nlookup = forget->forget_one.nlookup, + }; + struct fuse_in_header ih = { + .opcode = FUSE_FORGET, + .nodeid = forget->forget_one.nodeid, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + spin_unlock(&fc->lock); + kfree(forget); + if (nbytes < ih.len) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_batch_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +__releases(fc->lock) +{ + int err; + unsigned max_forgets; + unsigned count; + struct fuse_forget_link *head; + struct fuse_batch_forget_in arg = { .count = 0 }; + struct fuse_in_header ih = { + .opcode = FUSE_BATCH_FORGET, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + if (nbytes < ih.len) { + spin_unlock(&fc->lock); + return -EINVAL; + } + + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); + head = dequeue_forget(fc, max_forgets, &count); + spin_unlock(&fc->lock); + + arg.count = count; + ih.len += count * sizeof(struct fuse_forget_one); + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + + while (head) { + struct fuse_forget_link *forget = head; + + if (!err) { + err = fuse_copy_one(cs, &forget->forget_one, + sizeof(forget->forget_one)); + } + head = forget->next; + kfree(forget); + } + + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fc, cs, nbytes); + else + return fuse_read_batch_forget(fc, cs, nbytes); +} + +/* + * Read a single request into the userspace filesystem's buffer. This + * function waits until a request is available, then removes it from + * the pending list and copies request data to userspace buffer. If + * no reply is needed (FORGET) or request has been aborted or there + * was an error during the copying then it's finished by calling + * request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. + */ +static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_req *req; + struct fuse_in *in; + unsigned reqsize; + + restart: + spin_lock(&fc->lock); + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + !request_pending(fc)) + goto err_unlock; + + request_wait(fc); + err = -ENODEV; + if (!fc->connected) + goto err_unlock; + err = -ERESTARTSYS; + if (!request_pending(fc)) + goto err_unlock; + + if (!list_empty(&fc->interrupts)) { + req = list_entry(fc->interrupts.next, struct fuse_req, + intr_entry); + return fuse_read_interrupt(fc, cs, nbytes, req); + } + + if (forget_pending(fc)) { + if (list_empty(&fc->pending) || fc->forget_batch-- > 0) + return fuse_read_forget(fc, cs, nbytes); + + if (fc->forget_batch <= -8) + fc->forget_batch = 16; + } + + req = list_entry(fc->pending.next, struct fuse_req, list); + req->state = FUSE_REQ_READING; + list_move(&req->list, &fc->io); + + in = &req->in; + reqsize = in->h.len; + /* If request is too large, reply with an error and restart the read */ + if (nbytes < reqsize) { + req->out.h.error = -EIO; + /* SETXATTR is special, since it may contain too large data */ + if (in->h.opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; + request_end(fc, req); + goto restart; + } + spin_unlock(&fc->lock); + cs->req = req; + err = fuse_copy_one(cs, &in->h, sizeof(in->h)); + if (!err) + err = fuse_copy_args(cs, in->numargs, in->argpages, + (struct fuse_arg *) in->args, 0); + fuse_copy_finish(cs); + spin_lock(&fc->lock); + req->locked = 0; + if (req->aborted) { + request_end(fc, req); + return -ENODEV; + } + if (err) { + req->out.h.error = -EIO; + request_end(fc, req); + return err; + } + if (!req->isreply) + request_end(fc, req); + else { + req->state = FUSE_REQ_SENT; + list_move_tail(&req->list, &fc->processing); + if (req->interrupted) + queue_interrupt(fc, req); + spin_unlock(&fc->lock); + } + return reqsize; + + err_unlock: + spin_unlock(&fc->lock); + return err; +} + +static int fuse_dev_open(struct inode *inode, struct file *file) +{ + /* + * The fuse device's file's private_data is used to hold + * the fuse_conn(ection) when it is mounted, and is used to + * keep track of whether the file has been mounted already. + */ + file->private_data = NULL; + return 0; +} + +static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct fuse_copy_state cs; + struct file *file = iocb->ki_filp; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + if (!iter_is_iovec(to)) + return -EINVAL; + + fuse_copy_init(&cs, fc, 1, to); + + return fuse_dev_do_read(fc, file, &cs, iov_iter_count(to)); +} + +static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + int ret; + int page_nr = 0; + int do_wakeup = 0; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(in); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + fuse_copy_init(&cs, fc, 1, NULL); + cs.pipebufs = bufs; + cs.pipe = pipe; + ret = fuse_dev_do_read(fc, in, &cs, len); + if (ret < 0) + goto out; + + ret = 0; + pipe_lock(pipe); + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + goto out_unlock; + } + + if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + ret = -EIO; + goto out_unlock; + } + + while (page_nr < cs.nr_segs) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; + /* + * Need to be careful about this. Having buf->ops in module + * code can Oops if the buffer persists after module unload. + */ + buf->ops = &nosteal_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->files) + do_wakeup = 1; + } + +out_unlock: + pipe_unlock(pipe); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + +out: + for (; page_nr < cs.nr_segs; page_nr++) + page_cache_release(bufs[page_nr].page); + + kfree(bufs); + return ret; +} + +static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_poll_wakeup_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + fuse_copy_finish(cs); + return fuse_notify_poll_wakeup(fc, &outarg); + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_inode_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + err = fuse_reverse_inval_inode(fc->sb, outarg.ino, + outarg.off, outarg.len); + } + up_read(&fc->killsb); + return err; + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_entry_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_delete_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, + outarg.child, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size - sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + if (!fc->sb) + goto out_up_killsb; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_CACHE_SHIFT; + offset = outarg.offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_size(inode, file_size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && + (this_num == PAGE_CACHE_SIZE || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +{ + release_pages(req->pages, req->num_pages, false); +} + +static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + struct fuse_req *req; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len = 0; + int num_pages; + + offset = outarg->offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + + req = fuse_get_req(fc, num_pages); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_NOTIFY_REPLY; + req->in.h.nodeid = outarg->nodeid; + req->in.numargs = 2; + req->in.argpages = 1; + req->page_descs[0].offset = offset; + req->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_CACHE_SHIFT; + + while (num && req->num_pages < num_pages) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = this_num; + req->num_pages++; + + offset = 0; + num -= this_num; + total_len += this_num; + index++; + } + req->misc.retrieve_in.offset = outarg->offset; + req->misc.retrieve_in.size = total_len; + req->in.args[0].size = sizeof(req->misc.retrieve_in); + req->in.args[0].value = &req->misc.retrieve_in; + req->in.args[1].size = total_len; + + err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, req); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct inode *inode; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + u64 nodeid = outarg.nodeid; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + err = fuse_retrieve(fc, inode, &outarg); + iput(inode); + } + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, + unsigned int size, struct fuse_copy_state *cs) +{ + /* Don't try to move pages (yet) */ + cs->move_pages = 0; + + switch (code) { + case FUSE_NOTIFY_POLL: + return fuse_notify_poll(fc, size, cs); + + case FUSE_NOTIFY_INVAL_INODE: + return fuse_notify_inval_inode(fc, size, cs); + + case FUSE_NOTIFY_INVAL_ENTRY: + return fuse_notify_inval_entry(fc, size, cs); + + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + + case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + + case FUSE_NOTIFY_DELETE: + return fuse_notify_delete(fc, size, cs); + + default: + fuse_copy_finish(cs); + return -EINVAL; + } +} + +/* Look up request on processing list by unique ID */ +static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) +{ + struct fuse_req *req; + + list_for_each_entry(req, &fc->processing, list) { + if (req->in.h.unique == unique || req->intr_unique == unique) + return req; + } + return NULL; +} + +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, + unsigned nbytes) +{ + unsigned reqsize = sizeof(struct fuse_out_header); + + if (out->h.error) + return nbytes != reqsize ? -EINVAL : 0; + + reqsize += len_args(out->numargs, out->args); + + if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + return -EINVAL; + else if (reqsize > nbytes) { + struct fuse_arg *lastarg = &out->args[out->numargs-1]; + unsigned diffsize = reqsize - nbytes; + if (diffsize > lastarg->size) + return -EINVAL; + lastarg->size -= diffsize; + } + return fuse_copy_args(cs, out->numargs, out->argpages, out->args, + out->page_zeroing); +} + +/* + * Write a single reply to a request. First the header is copied from + * the write buffer. The request is then searched on the processing + * list by the unique ID found in the header. If found, then remove + * it from the list and copy the rest of the buffer to the request. + * The request is finished by calling request_end() + */ +static ssize_t fuse_dev_do_write(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_req *req; + struct fuse_out_header oh; + + if (nbytes < sizeof(struct fuse_out_header)) + return -EINVAL; + + err = fuse_copy_one(cs, &oh, sizeof(oh)); + if (err) + goto err_finish; + + err = -EINVAL; + if (oh.len != nbytes) + goto err_finish; + + /* + * Zero oh.unique indicates unsolicited notification message + * and error contains notification code. + */ + if (!oh.unique) { + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); + return err ? err : nbytes; + } + + err = -EINVAL; + if (oh.error <= -1000 || oh.error > 0) + goto err_finish; + + spin_lock(&fc->lock); + err = -ENOENT; + if (!fc->connected) + goto err_unlock; + + req = request_find(fc, oh.unique); + if (!req) + goto err_unlock; + + if (req->aborted) { + spin_unlock(&fc->lock); + fuse_copy_finish(cs); + spin_lock(&fc->lock); + request_end(fc, req); + return -ENOENT; + } + /* Is it an interrupt reply? */ + if (req->intr_unique == oh.unique) { + err = -EINVAL; + if (nbytes != sizeof(struct fuse_out_header)) + goto err_unlock; + + if (oh.error == -ENOSYS) + fc->no_interrupt = 1; + else if (oh.error == -EAGAIN) + queue_interrupt(fc, req); + + spin_unlock(&fc->lock); + fuse_copy_finish(cs); + return nbytes; + } + + req->state = FUSE_REQ_WRITING; + list_move(&req->list, &fc->io); + req->out.h = oh; + req->locked = 1; + cs->req = req; + if (!req->out.page_replace) + cs->move_pages = 0; + spin_unlock(&fc->lock); + + err = copy_out_args(cs, &req->out, nbytes); + fuse_copy_finish(cs); + + spin_lock(&fc->lock); + req->locked = 0; + if (!err) { + if (req->aborted) + err = -ENOENT; + } else if (!req->aborted) + req->out.h.error = -EIO; + request_end(fc, req); + + return err ? err : nbytes; + + err_unlock: + spin_unlock(&fc->lock); + err_finish: + fuse_copy_finish(cs); + return err; +} + +static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); + if (!fc) + return -EPERM; + + if (!iter_is_iovec(from)) + return -EINVAL; + + fuse_copy_init(&cs, fc, 0, from); + + return fuse_dev_do_write(fc, &cs, iov_iter_count(from)); +} + +static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + unsigned nbuf; + unsigned idx; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc; + size_t rem; + ssize_t ret; + + fc = fuse_get_conn(out); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + pipe_lock(pipe); + nbuf = 0; + rem = 0; + for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) + rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + + ret = -EINVAL; + if (rem < len) { + pipe_unlock(pipe); + goto out; + } + + rem = len; + while (rem) { + struct pipe_buffer *ibuf; + struct pipe_buffer *obuf; + + BUG_ON(nbuf >= pipe->buffers); + BUG_ON(!pipe->nrbufs); + ibuf = &pipe->bufs[pipe->curbuf]; + obuf = &bufs[nbuf]; + + if (rem >= ibuf->len) { + *obuf = *ibuf; + ibuf->ops = NULL; + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + } else { + ibuf->ops->get(pipe, ibuf); + *obuf = *ibuf; + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + obuf->len = rem; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + nbuf++; + rem -= obuf->len; + } + pipe_unlock(pipe); + + fuse_copy_init(&cs, fc, 0, NULL); + cs.pipebufs = bufs; + cs.nr_segs = nbuf; + cs.pipe = pipe; + + if (flags & SPLICE_F_MOVE) + cs.move_pages = 1; + + ret = fuse_dev_do_write(fc, &cs, len); + + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + buf->ops->release(pipe, buf); + } +out: + kfree(bufs); + return ret; +} + +static unsigned fuse_dev_poll(struct file *file, poll_table *wait) +{ + unsigned mask = POLLOUT | POLLWRNORM; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return POLLERR; + + poll_wait(file, &fc->waitq, wait); + + spin_lock(&fc->lock); + if (!fc->connected) + mask = POLLERR; + else if (request_pending(fc)) + mask |= POLLIN | POLLRDNORM; + spin_unlock(&fc->lock); + + return mask; +} + +/* + * Abort all requests on the given list (pending or processing) + * + * This function releases and reacquires fc->lock + */ +static void end_requests(struct fuse_conn *fc, struct list_head *head) +__releases(fc->lock) +__acquires(fc->lock) +{ + while (!list_empty(head)) { + struct fuse_req *req; + req = list_entry(head->next, struct fuse_req, list); + req->out.h.error = -ECONNABORTED; + request_end(fc, req); + spin_lock(&fc->lock); + } +} + +/* + * Abort requests under I/O + * + * The requests are set to aborted and finished, and the request + * waiter is woken up. This will make request_wait_answer() wait + * until the request is unlocked and then return. + * + * If the request is asynchronous, then the end function needs to be + * called after waiting for the request to be unlocked (if it was + * locked). + */ +static void end_io_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + while (!list_empty(&fc->io)) { + struct fuse_req *req = + list_entry(fc->io.next, struct fuse_req, list); + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + + req->aborted = 1; + req->out.h.error = -ECONNABORTED; + req->state = FUSE_REQ_FINISHED; + list_del_init(&req->list); + wake_up(&req->waitq); + if (end) { + req->end = NULL; + __fuse_get_request(req); + spin_unlock(&fc->lock); + wait_event(req->waitq, !req->locked); + end(fc, req); + fuse_put_request(fc, req); + spin_lock(&fc->lock); + } + } +} + +static void end_queued_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); + while (forget_pending(fc)) + kfree(dequeue_forget(fc, 1, NULL)); +} + +static void end_polls(struct fuse_conn *fc) +{ + struct rb_node *p; + + p = rb_first(&fc->polled_files); + + while (p) { + struct fuse_file *ff; + ff = rb_entry(p, struct fuse_file, polled_node); + wake_up_interruptible_all(&ff->poll_wait); + + p = rb_next(p); + } +} + +/* + * Abort all requests. + * + * Emergency exit in case of a malicious or accidental deadlock, or + * just a hung filesystem. + * + * The same effect is usually achievable through killing the + * filesystem daemon and all users of the filesystem. The exception + * is the combination of an asynchronous request and the tricky + * deadlock (see Documentation/filesystems/fuse.txt). + * + * During the aborting, progression of requests from the pending and + * processing lists onto the io list, and progression of new requests + * onto the pending list is prevented by req->connected being false. + * + * Progression of requests under I/O to the processing list is + * prevented by the req->aborted flag being true for these requests. + * For this reason requests on the io list must be aborted first. + */ +void fuse_abort_conn(struct fuse_conn *fc) +{ + spin_lock(&fc->lock); + if (fc->connected) { + fc->connected = 0; + fc->blocked = 0; + fuse_set_initialized(fc); + end_io_requests(fc); + end_queued_requests(fc); + end_polls(fc); + wake_up_all(&fc->waitq); + wake_up_all(&fc->blocked_waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_abort_conn); + +int fuse_dev_release(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (fc) { + spin_lock(&fc->lock); + fc->connected = 0; + fc->blocked = 0; + fuse_set_initialized(fc); + end_queued_requests(fc); + end_polls(fc); + wake_up_all(&fc->blocked_waitq); + spin_unlock(&fc->lock); + fuse_conn_put(fc); + } + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_dev_release); + +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + +const struct file_operations fuse_dev_operations = { + .owner = THIS_MODULE, + .open = fuse_dev_open, + .llseek = no_llseek, + .read_iter = fuse_dev_read, + .splice_read = fuse_dev_splice_read, + .write_iter = fuse_dev_write, + .splice_write = fuse_dev_splice_write, + .poll = fuse_dev_poll, + .release = fuse_dev_release, + .fasync = fuse_dev_fasync, +}; +EXPORT_SYMBOL_GPL(fuse_dev_operations); + +static struct miscdevice fuse_miscdevice = { + .minor = FUSE_MINOR, + .name = "fuse", + .fops = &fuse_dev_operations, +}; + +int __init fuse_dev_init(void) +{ + int err = -ENOMEM; + fuse_req_cachep = kmem_cache_create("fuse_request", + sizeof(struct fuse_req), + 0, 0, NULL); + if (!fuse_req_cachep) + goto out; + + err = misc_register(&fuse_miscdevice); + if (err) + goto out_cache_clean; + + return 0; + + out_cache_clean: + kmem_cache_destroy(fuse_req_cachep); + out: + return err; +} + +void fuse_dev_cleanup(void) +{ + misc_deregister(&fuse_miscdevice); + kmem_cache_destroy(fuse_req_cachep); +} diff --git a/kernel/fs/fuse/dir.c b/kernel/fs/fuse/dir.c new file mode 100644 index 000000000..0572bca49 --- /dev/null +++ b/kernel/fs/fuse/dir.c @@ -0,0 +1,1952 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include + +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (ctx->pos == 0) + return true; + return false; +} + +static void fuse_advise_use_readdirplus(struct inode *dir) +{ + struct fuse_inode *fi = get_fuse_inode(dir); + + set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); +} + +#if BITS_PER_LONG >= 64 +static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; +} + +static inline u64 fuse_dentry_time(struct dentry *entry) +{ + return entry->d_time; +} +#else +/* + * On 32 bit archs store the high 32 bits of time in d_fsdata + */ +static void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; + entry->d_fsdata = (void *) (unsigned long) (time >> 32); +} + +static u64 fuse_dentry_time(struct dentry *entry) +{ + return (u64) entry->d_time + + ((u64) (unsigned long) entry->d_fsdata << 32); +} +#endif + +/* + * FUSE caches dentries and attributes with separate timeout. The + * time in jiffies until the dentry/attributes are valid is stored in + * dentry->d_time and fuse_inode->i_time respectively. + */ + +/* + * Calculate the time in jiffies until a dentry/attributes are valid + */ +static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) +{ + if (sec || nsec) { + struct timespec ts = {sec, nsec}; + return get_jiffies_64() + timespec_to_jiffies(&ts); + } else + return 0; +} + +/* + * Set dentry and possibly attribute timeouts from the lookup/mk* + * replies + */ +static void fuse_change_entry_timeout(struct dentry *entry, + struct fuse_entry_out *o) +{ + fuse_dentry_settime(entry, + time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); +} + +static u64 attr_timeout(struct fuse_attr_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +static u64 entry_attr_timeout(struct fuse_entry_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +/* + * Mark the attributes as stale, so that at the next call to + * ->getattr() they will be fetched from userspace + */ +void fuse_invalidate_attr(struct inode *inode) +{ + get_fuse_inode(inode)->i_time = 0; +} + +/** + * Mark the attributes as stale due to an atime change. Avoid the invalidate if + * atime is not used. + */ +void fuse_invalidate_atime(struct inode *inode) +{ + if (!IS_RDONLY(inode)) + fuse_invalidate_attr(inode); +} + +/* + * Just mark the entry as stale, so that a next attempt to look it up + * will result in a new lookup call to userspace + * + * This is called when a dentry is about to become negative and the + * timeout is unknown (unlink, rmdir, rename and in some cases + * lookup) + */ +void fuse_invalidate_entry_cache(struct dentry *entry) +{ + fuse_dentry_settime(entry, 0); +} + +/* + * Same as fuse_invalidate_entry_cache(), but also try to remove the + * dentry from the hash + */ +static void fuse_invalidate_entry(struct dentry *entry) +{ + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); +} + +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, + u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg) +{ + memset(outarg, 0, sizeof(struct fuse_entry_out)); + args->in.h.opcode = FUSE_LOOKUP; + args->in.h.nodeid = nodeid; + args->in.numargs = 1; + args->in.args[0].size = name->len + 1; + args->in.args[0].value = name->name; + args->out.numargs = 1; + args->out.args[0].size = sizeof(struct fuse_entry_out); + args->out.args[0].value = outarg; +} + +u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + u64 curr_version; + + /* + * The spin lock isn't actually needed on 64bit archs, but we + * don't yet care too much about such optimizations. + */ + spin_lock(&fc->lock); + curr_version = fc->attr_version; + spin_unlock(&fc->lock); + + return curr_version; +} + +/* + * Check whether the dentry is still valid + * + * If the entry validity timeout has expired and the dentry is + * positive, try to redo the lookup. If the lookup results in a + * different inode, then let the VFS invalidate the dentry and redo + * the lookup once more. If the lookup results in the same inode, + * then refresh the attributes, timeouts and mark the dentry valid. + */ +static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) +{ + struct inode *inode; + struct dentry *parent; + struct fuse_conn *fc; + struct fuse_inode *fi; + int ret; + + inode = d_inode_rcu(entry); + if (inode && is_bad_inode(inode)) + goto invalid; + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + (flags & LOOKUP_REVAL)) { + struct fuse_entry_out outarg; + FUSE_ARGS(args); + struct fuse_forget_link *forget; + u64 attr_version; + + /* For negative dentries, always do a fresh lookup */ + if (!inode) + goto invalid; + + ret = -ECHILD; + if (flags & LOOKUP_RCU) + goto out; + + fc = get_fuse_conn(inode); + + forget = fuse_alloc_forget(); + ret = -ENOMEM; + if (!forget) + goto out; + + attr_version = fuse_get_attr_version(fc); + + parent = dget_parent(entry); + fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), + &entry->d_name, &outarg); + ret = fuse_simple_request(fc, &args); + dput(parent); + /* Zero nodeid is same as -ENOENT */ + if (!ret && !outarg.nodeid) + ret = -ENOENT; + if (!ret) { + fi = get_fuse_inode(inode); + if (outarg.nodeid != get_node_id(inode)) { + fuse_queue_forget(fc, forget, outarg.nodeid, 1); + goto invalid; + } + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + } + kfree(forget); + if (ret == -ENOMEM) + goto out; + if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + goto invalid; + + fuse_change_attributes(inode, &outarg.attr, + entry_attr_timeout(&outarg), + attr_version); + fuse_change_entry_timeout(entry, &outarg); + } else if (inode) { + fi = get_fuse_inode(inode); + if (flags & LOOKUP_RCU) { + if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) + return -ECHILD; + } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { + parent = dget_parent(entry); + fuse_advise_use_readdirplus(d_inode(parent)); + dput(parent); + } + } + ret = 1; +out: + return ret; + +invalid: + ret = 0; + goto out; +} + +static int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + +const struct dentry_operations fuse_dentry_operations = { + .d_revalidate = fuse_dentry_revalidate, +}; + +int fuse_valid_type(int m) +{ + return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || + S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); +} + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + FUSE_ARGS(args); + struct fuse_forget_link *forget; + u64 attr_version; + int err; + + *inode = NULL; + err = -ENAMETOOLONG; + if (name->len > FUSE_NAME_MAX) + goto out; + + + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) + goto out; + + attr_version = fuse_get_attr_version(fc); + + fuse_lookup_init(fc, &args, nodeid, name, outarg); + err = fuse_simple_request(fc, &args); + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (!fuse_valid_type(outarg->attr.mode)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + attr_version); + err = -ENOMEM; + if (!*inode) { + fuse_queue_forget(fc, forget, outarg->nodeid, 1); + goto out; + } + err = 0; + + out_put_forget: + kfree(forget); + out: + return err; +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + unsigned int flags) +{ + int err; + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *newent; + bool outarg_valid = true; + + err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, + &outarg, &inode); + if (err == -ENOENT) { + outarg_valid = false; + err = 0; + } + if (err) + goto out_err; + + err = -EIO; + if (inode && get_node_id(inode) == FUSE_ROOT_ID) + goto out_iput; + + newent = d_splice_alias(inode, entry); + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_err; + + entry = newent ? newent : entry; + if (outarg_valid) + fuse_change_entry_timeout(entry, &outarg); + else + fuse_invalidate_entry_cache(entry); + + fuse_advise_use_readdirplus(dir); + return newent; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +/* + * Atomic create+open operation + * + * If the filesystem doesn't support this, then fall back to separate + * 'mknod' + 'open' requests. + */ +static int fuse_create_open(struct inode *dir, struct dentry *entry, + struct file *file, unsigned flags, + umode_t mode, int *opened) +{ + int err; + struct inode *inode; + struct fuse_conn *fc = get_fuse_conn(dir); + FUSE_ARGS(args); + struct fuse_forget_link *forget; + struct fuse_create_in inarg; + struct fuse_open_out outopen; + struct fuse_entry_out outentry; + struct fuse_file *ff; + + /* Userspace expects S_IFREG in create mode */ + BUG_ON((mode & S_IFMT) != S_IFREG); + + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) + goto out_err; + + err = -ENOMEM; + ff = fuse_file_alloc(fc); + if (!ff) + goto out_put_forget_req; + + if (!fc->dont_mask) + mode &= ~current_umask(); + + flags &= ~O_NOCTTY; + memset(&inarg, 0, sizeof(inarg)); + memset(&outentry, 0, sizeof(outentry)); + inarg.flags = flags; + inarg.mode = mode; + inarg.umask = current_umask(); + args.in.h.opcode = FUSE_CREATE; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + args.out.numargs = 2; + args.out.args[0].size = sizeof(outentry); + args.out.args[0].value = &outentry; + args.out.args[1].size = sizeof(outopen); + args.out.args[1].value = &outopen; + err = fuse_simple_request(fc, &args); + if (err) + goto out_free_ff; + + err = -EIO; + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) + goto out_free_ff; + + ff->fh = outopen.fh; + ff->nodeid = outentry.nodeid; + ff->open_flags = outopen.open_flags; + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, + &outentry.attr, entry_attr_timeout(&outentry), 0); + if (!inode) { + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); + fuse_sync_release(ff, flags); + fuse_queue_forget(fc, forget, outentry.nodeid, 1); + err = -ENOMEM; + goto out_err; + } + kfree(forget); + d_instantiate(entry, inode); + fuse_change_entry_timeout(entry, &outentry); + fuse_invalidate_attr(dir); + err = finish_open(file, entry, generic_file_open, opened); + if (err) { + fuse_sync_release(ff, flags); + } else { + file->private_data = fuse_file_get(ff); + fuse_finish_open(inode, file); + } + return err; + +out_free_ff: + fuse_file_free(ff); +out_put_forget_req: + kfree(forget); +out_err: + return err; +} + +static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t); +static int fuse_atomic_open(struct inode *dir, struct dentry *entry, + struct file *file, unsigned flags, + umode_t mode, int *opened) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct dentry *res = NULL; + + if (d_unhashed(entry)) { + res = fuse_lookup(dir, entry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + entry = res; + } + + if (!(flags & O_CREAT) || d_really_is_positive(entry)) + goto no_open; + + /* Only creates */ + *opened |= FILE_CREATED; + + if (fc->no_create) + goto mknod; + + err = fuse_create_open(dir, entry, file, flags, mode, opened); + if (err == -ENOSYS) { + fc->no_create = 1; + goto mknod; + } +out_dput: + dput(res); + return err; + +mknod: + err = fuse_mknod(dir, entry, mode, 0); + if (err) + goto out_dput; +no_open: + return finish_no_open(file, res); +} + +/* + * Code shared between mknod, mkdir, symlink and link + */ +static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, + struct inode *dir, struct dentry *entry, + umode_t mode) +{ + struct fuse_entry_out outarg; + struct inode *inode; + int err; + struct fuse_forget_link *forget; + + forget = fuse_alloc_forget(); + if (!forget) + return -ENOMEM; + + memset(&outarg, 0, sizeof(outarg)); + args->in.h.nodeid = get_node_id(dir); + args->out.numargs = 1; + args->out.args[0].size = sizeof(outarg); + args->out.args[0].value = &outarg; + err = fuse_simple_request(fc, args); + if (err) + goto out_put_forget_req; + + err = -EIO; + if (invalid_nodeid(outarg.nodeid)) + goto out_put_forget_req; + + if ((outarg.attr.mode ^ mode) & S_IFMT) + goto out_put_forget_req; + + inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, + &outarg.attr, entry_attr_timeout(&outarg), 0); + if (!inode) { + fuse_queue_forget(fc, forget, outarg.nodeid, 1); + return -ENOMEM; + } + kfree(forget); + + err = d_instantiate_no_diralias(entry, inode); + if (err) + return err; + + fuse_change_entry_timeout(entry, &outarg); + fuse_invalidate_attr(dir); + return 0; + + out_put_forget_req: + kfree(forget); + return err; +} + +static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, + dev_t rdev) +{ + struct fuse_mknod_in inarg; + struct fuse_conn *fc = get_fuse_conn(dir); + FUSE_ARGS(args); + + if (!fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.rdev = new_encode_dev(rdev); + inarg.umask = current_umask(); + args.in.h.opcode = FUSE_MKNOD; + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + return create_new_entry(fc, &args, dir, entry, mode); +} + +static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, + bool excl) +{ + return fuse_mknod(dir, entry, mode, 0); +} + +static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) +{ + struct fuse_mkdir_in inarg; + struct fuse_conn *fc = get_fuse_conn(dir); + FUSE_ARGS(args); + + if (!fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.umask = current_umask(); + args.in.h.opcode = FUSE_MKDIR; + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + return create_new_entry(fc, &args, dir, entry, S_IFDIR); +} + +static int fuse_symlink(struct inode *dir, struct dentry *entry, + const char *link) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + unsigned len = strlen(link) + 1; + FUSE_ARGS(args); + + args.in.h.opcode = FUSE_SYMLINK; + args.in.numargs = 2; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + args.in.args[1].size = len; + args.in.args[1].value = link; + return create_new_entry(fc, &args, dir, entry, S_IFLNK); +} + +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + +static int fuse_unlink(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + FUSE_ARGS(args); + + args.in.h.opcode = FUSE_UNLINK; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 1; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + err = fuse_simple_request(fc, &args); + if (!err) { + struct inode *inode = d_inode(entry); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rmdir(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + FUSE_ARGS(args); + + args.in.h.opcode = FUSE_RMDIR; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 1; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + err = fuse_simple_request(fc, &args); + if (!err) { + clear_nlink(d_inode(entry)); + fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) +{ + int err; + struct fuse_rename2_in inarg; + struct fuse_conn *fc = get_fuse_conn(olddir); + FUSE_ARGS(args); + + memset(&inarg, 0, argsize); + inarg.newdir = get_node_id(newdir); + inarg.flags = flags; + args.in.h.opcode = opcode; + args.in.h.nodeid = get_node_id(olddir); + args.in.numargs = 3; + args.in.args[0].size = argsize; + args.in.args[0].value = &inarg; + args.in.args[1].size = oldent->d_name.len + 1; + args.in.args[1].value = oldent->d_name.name; + args.in.args[2].size = newent->d_name.len + 1; + args.in.args[2].value = newent->d_name.name; + err = fuse_simple_request(fc, &args); + if (!err) { + /* ctime changes */ + fuse_invalidate_attr(d_inode(oldent)); + fuse_update_ctime(d_inode(oldent)); + + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(d_inode(newent)); + fuse_update_ctime(d_inode(newent)); + } + + fuse_invalidate_attr(olddir); + if (olddir != newdir) + fuse_invalidate_attr(newdir); + + /* newent will end up negative */ + if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { + fuse_invalidate_attr(d_inode(newent)); + fuse_invalidate_entry_cache(newent); + fuse_update_ctime(d_inode(newent)); + } + } else if (err == -EINTR) { + /* If request was interrupted, DEITY only knows if the + rename actually took place. If the invalidation + fails (e.g. some process has CWD under the renamed + directory), then there can be inconsistency between + the dcache and the real filesystem. Tough luck. */ + fuse_invalidate_entry(oldent); + if (d_really_is_positive(newent)) + fuse_invalidate_entry(newent); + } + + return err; +} + +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); + } + + return err; +} + +static int fuse_link(struct dentry *entry, struct inode *newdir, + struct dentry *newent) +{ + int err; + struct fuse_link_in inarg; + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.oldnodeid = get_node_id(inode); + args.in.h.opcode = FUSE_LINK; + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = newent->d_name.len + 1; + args.in.args[1].value = newent->d_name.name; + err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); + /* Contrary to "normal" filesystems it can happen that link + makes two "logical" inodes point to the same "physical" + inode. We invalidate the attributes of the old one, so it + will reflect changes in the backing inode (link count, + etc.) + */ + if (!err) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + inc_nlink(inode); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } else if (err == -EINTR) { + fuse_invalidate_attr(inode); + } + return err; +} + +static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, + struct kstat *stat) +{ + unsigned int blkbits; + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see the comment in fuse_change_attributes() */ + if (fc->writeback_cache && S_ISREG(inode->i_mode)) { + attr->size = i_size_read(inode); + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; + } + + stat->dev = inode->i_sb->s_dev; + stat->ino = attr->ino; + stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + stat->nlink = attr->nlink; + stat->uid = make_kuid(&init_user_ns, attr->uid); + stat->gid = make_kgid(&init_user_ns, attr->gid); + stat->rdev = inode->i_rdev; + stat->atime.tv_sec = attr->atime; + stat->atime.tv_nsec = attr->atimensec; + stat->mtime.tv_sec = attr->mtime; + stat->mtime.tv_nsec = attr->mtimensec; + stat->ctime.tv_sec = attr->ctime; + stat->ctime.tv_nsec = attr->ctimensec; + stat->size = attr->size; + stat->blocks = attr->blocks; + + if (attr->blksize != 0) + blkbits = ilog2(attr->blksize); + else + blkbits = inode->i_sb->s_blocksize_bits; + + stat->blksize = 1 << blkbits; +} + +static int fuse_do_getattr(struct inode *inode, struct kstat *stat, + struct file *file) +{ + int err; + struct fuse_getattr_in inarg; + struct fuse_attr_out outarg; + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + u64 attr_version; + + attr_version = fuse_get_attr_version(fc); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + /* Directories have separate file-handle space */ + if (file && S_ISREG(inode->i_mode)) { + struct fuse_file *ff = file->private_data; + + inarg.getattr_flags |= FUSE_GETATTR_FH; + inarg.fh = ff->fh; + } + args.in.h.opcode = FUSE_GETATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (!err) { + if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + make_bad_inode(inode); + err = -EIO; + } else { + fuse_change_attributes(inode, &outarg.attr, + attr_timeout(&outarg), + attr_version); + if (stat) + fuse_fillattr(inode, &outarg.attr, stat); + } + } + return err; +} + +int fuse_update_attributes(struct inode *inode, struct kstat *stat, + struct file *file, bool *refreshed) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + bool r; + + if (time_before64(fi->i_time, get_jiffies_64())) { + r = true; + err = fuse_do_getattr(inode, stat, file); + } else { + r = false; + err = 0; + if (stat) { + generic_fillattr(inode, stat); + stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; + } + } + + if (refreshed != NULL) + *refreshed = r; + + return err; +} + +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + u64 child_nodeid, struct qstr *name) +{ + int err = -ENOTDIR; + struct inode *parent; + struct dentry *dir; + struct dentry *entry; + + parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + if (!parent) + return -ENOENT; + + mutex_lock(&parent->i_mutex); + if (!S_ISDIR(parent->i_mode)) + goto unlock; + + err = -ENOENT; + dir = d_find_alias(parent); + if (!dir) + goto unlock; + + entry = d_lookup(dir, name); + dput(dir); + if (!entry) + goto unlock; + + fuse_invalidate_attr(parent); + fuse_invalidate_entry(entry); + + if (child_nodeid != 0 && d_really_is_positive(entry)) { + mutex_lock(&d_inode(entry)->i_mutex); + if (get_node_id(d_inode(entry)) != child_nodeid) { + err = -ENOENT; + goto badentry; + } + if (d_mountpoint(entry)) { + err = -EBUSY; + goto badentry; + } + if (d_is_dir(entry)) { + shrink_dcache_parent(entry); + if (!simple_empty(entry)) { + err = -ENOTEMPTY; + goto badentry; + } + d_inode(entry)->i_flags |= S_DEAD; + } + dont_mount(entry); + clear_nlink(d_inode(entry)); + err = 0; + badentry: + mutex_unlock(&d_inode(entry)->i_mutex); + if (!err) + d_delete(entry); + } else { + err = 0; + } + dput(entry); + + unlock: + mutex_unlock(&parent->i_mutex); + iput(parent); + return err; +} + +/* + * Calling into a user-controlled filesystem gives the filesystem + * daemon ptrace-like capabilities over the current process. This + * means, that the filesystem daemon is able to record the exact + * filesystem operations performed, and can also control the behavior + * of the requester process in otherwise impossible ways. For example + * it can delay the operation for arbitrary length of time allowing + * DoS against the requester. + * + * For this reason only those processes can call into the filesystem, + * for which the owner of the mount has ptrace privilege. This + * excludes processes started by other users, suid or sgid processes. + */ +int fuse_allow_current_process(struct fuse_conn *fc) +{ + const struct cred *cred; + + if (fc->flags & FUSE_ALLOW_OTHER) + return 1; + + cred = current_cred(); + if (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)) + return 1; + + return 0; +} + +static int fuse_access(struct inode *inode, int mask) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_access_in inarg; + int err; + + BUG_ON(mask & MAY_NOT_BLOCK); + + if (fc->no_access) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); + args.in.h.opcode = FUSE_ACCESS; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_access = 1; + err = 0; + } + return err; +} + +static int fuse_perm_getattr(struct inode *inode, int mask) +{ + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + return fuse_do_getattr(inode, NULL, NULL); +} + +/* + * Check permission. The two basic access models of FUSE are: + * + * 1) Local access checking ('default_permissions' mount option) based + * on file mode. This is the plain old disk filesystem permission + * modell. + * + * 2) "Remote" access checking, where server is responsible for + * checking permission in each inode operation. An exception to this + * is if ->permission() was invoked from sys_access() in which case an + * access request is sent. Execute permission is still checked + * locally based on file mode. + */ +static int fuse_permission(struct inode *inode, int mask) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + bool refreshed = false; + int err = 0; + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + /* + * If attributes are needed, refresh them before proceeding + */ + if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || + ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { + struct fuse_inode *fi = get_fuse_inode(inode); + + if (time_before64(fi->i_time, get_jiffies_64())) { + refreshed = true; + + err = fuse_perm_getattr(inode, mask); + if (err) + return err; + } + } + + if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { + err = generic_permission(inode, mask); + + /* If permission is denied, try to refresh file + attributes. This is also needed, because the root + node will at first have no permissions */ + if (err == -EACCES && !refreshed) { + err = fuse_perm_getattr(inode, mask); + if (!err) + err = generic_permission(inode, mask); + } + + /* Note: the opposite of the above test does not + exist. So if permissions are revoked this won't be + noticed immediately, only after the attribute + timeout has expired */ + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { + err = fuse_access(inode, mask); + } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { + if (!(inode->i_mode & S_IXUGO)) { + if (refreshed) + return -EACCES; + + err = fuse_perm_getattr(inode, mask); + if (!err && !(inode->i_mode & S_IXUGO)) + return -EACCES; + } + } + return err; +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) + break; + + buf += reclen; + nbytes -= reclen; + ctx->pos = dirent->off; + } + + return 0; +} + +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) +{ + int err; + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + + if (invalid_nodeid(o->nodeid)) + return -EIO; + if (!fuse_valid_type(o->attr.mode)) + return -EIO; + + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(name.name, name.len); + dentry = d_lookup(parent, &name); + if (dentry) { + inode = d_inode(dentry); + if (!inode) { + d_drop(dentry); + } else if (get_node_id(inode) != o->nodeid || + ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + d_invalidate(dentry); + } else if (is_bad_inode(inode)) { + err = -EIO; + goto out; + } else { + struct fuse_inode *fi; + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + + /* + * The other branch to 'found' comes via fuse_iget() + * which bumps nlookup inside + */ + goto found; + } + dput(dentry); + } + + dentry = d_alloc(parent, &name); + err = -ENOMEM; + if (!dentry) + goto out; + + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), attr_version); + if (!inode) + goto out; + + alias = d_splice_alias(inode, dentry); + err = PTR_ERR(alias); + if (IS_ERR(alias)) + goto out; + + if (alias) { + dput(dentry); + dentry = alias; + } + +found: + if (fc->readdirplus_auto) + set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + fuse_change_entry_timeout(dentry, o); + + err = 0; +out: + dput(dentry); + return err; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = !dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type); + ctx->pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir(struct file *file, struct dir_context *ctx) +{ + int plus, err; + size_t nbytes; + struct page *page; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + u64 attr_version = 0; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_req(fc, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + page = alloc_page(GFP_KERNEL); + if (!page) { + fuse_put_request(fc, req); + return -ENOMEM; + } + + plus = fuse_use_readdirplus(inode, ctx); + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); + } + fuse_request_send(fc, req); + nbytes = req->out.args[0].size; + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, ctx, + attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + ctx); + } + } + + __free_page(page); + fuse_invalidate_atime(inode); + return err; +} + +static char *read_link(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + char *link; + ssize_t ret; + + link = (char *) __get_free_page(GFP_KERNEL); + if (!link) + return ERR_PTR(-ENOMEM); + + args.in.h.opcode = FUSE_READLINK; + args.in.h.nodeid = get_node_id(inode); + args.out.argvar = 1; + args.out.numargs = 1; + args.out.args[0].size = PAGE_SIZE - 1; + args.out.args[0].value = link; + ret = fuse_simple_request(fc, &args); + if (ret < 0) { + free_page((unsigned long) link); + link = ERR_PTR(ret); + } else { + link[ret] = '\0'; + } + fuse_invalidate_atime(inode); + return link; +} + +static void free_link(char *link) +{ + if (!IS_ERR(link)) + free_page((unsigned long) link); +} + +static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, read_link(dentry)); + return NULL; +} + +static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + free_link(nd_get_link(nd)); +} + +static int fuse_dir_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, true); +} + +static int fuse_dir_release(struct inode *inode, struct file *file) +{ + fuse_release_common(file, FUSE_RELEASEDIR); + + return 0; +} + +static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + return fuse_fsync_common(file, start, end, datasync, 1); +} + +static long fuse_dir_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); +} + +static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, + FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); +} + +static bool update_mtime(unsigned ivalid, bool trust_local_mtime) +{ + /* Always update if mtime is explicitly set */ + if (ivalid & ATTR_MTIME_SET) + return true; + + /* Or if kernel i_mtime is the official one */ + if (trust_local_mtime) + return true; + + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ + if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) + return false; + + /* In all other cases update */ + return true; +} + +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_cmtime) +{ + unsigned ivalid = iattr->ia_valid; + + if (ivalid & ATTR_MODE) + arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; + if (ivalid & ATTR_UID) + arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); + if (ivalid & ATTR_GID) + arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); + if (ivalid & ATTR_SIZE) + arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; + if (ivalid & ATTR_ATIME) { + arg->valid |= FATTR_ATIME; + arg->atime = iattr->ia_atime.tv_sec; + arg->atimensec = iattr->ia_atime.tv_nsec; + if (!(ivalid & ATTR_ATIME_SET)) + arg->valid |= FATTR_ATIME_NOW; + } + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { + arg->valid |= FATTR_MTIME; + arg->mtime = iattr->ia_mtime.tv_sec; + arg->mtimensec = iattr->ia_mtime.tv_nsec; + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) + arg->valid |= FATTR_MTIME_NOW; + } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } +} + +/* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(!mutex_is_locked(&inode->i_mutex)); + + spin_lock(&fc->lock); + BUG_ON(fi->writectr < 0); + fi->writectr += FUSE_NOWRITE; + spin_unlock(&fc->lock); + wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); +} + +/* + * Allow writepages on inode + * + * Remove the bias from the writecounter and send any queued + * writepages. + */ +static void __fuse_release_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(fi->writectr != FUSE_NOWRITE); + fi->writectr = 0; + fuse_flush_writepages(inode); +} + +void fuse_release_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + spin_lock(&fc->lock); + __fuse_release_nowrite(inode); + spin_unlock(&fc->lock); +} + +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, + struct inode *inode, + struct fuse_setattr_in *inarg_p, + struct fuse_attr_out *outarg_p) +{ + args->in.h.opcode = FUSE_SETATTR; + args->in.h.nodeid = get_node_id(inode); + args->in.numargs = 1; + args->in.args[0].size = sizeof(*inarg_p); + args->in.args[0].value = inarg_p; + args->out.numargs = 1; + args->out.args[0].size = sizeof(*outarg_p); + args->out.args[0].value = outarg_p; +} + +/* + * Flush inode->i_mtime to the server + */ +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + + inarg.valid = FATTR_MTIME; + inarg.mtime = inode->i_mtime.tv_sec; + inarg.mtimensec = inode->i_mtime.tv_nsec; + if (fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + + return fuse_simple_request(fc, &args); +} + +/* + * Set attributes, and at the same time refresh them. + * + * Truncation is slightly complicated, because the 'truncate' request + * may fail, in which case we don't want to touch the mapping. + * vmtruncate() doesn't allow for this case, so do the rlimit checking + * and the actual truncation by hand. + */ +int fuse_do_setattr(struct inode *inode, struct iattr *attr, + struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + FUSE_ARGS(args); + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + bool is_truncate = false; + bool is_wb = fc->writeback_cache; + loff_t oldsize; + int err; + bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); + + if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + attr->ia_valid |= ATTR_FORCE; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_OPEN) { + if (fc->atomic_o_trunc) + return 0; + file = NULL; + } + + if (attr->ia_valid & ATTR_SIZE) + is_truncate = true; + + if (is_truncate) { + fuse_set_nowrite(inode); + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + iattr_to_fattr(attr, &inarg, trust_local_cmtime); + if (file) { + struct fuse_file *ff = file->private_data; + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + if (attr->ia_valid & ATTR_SIZE) { + /* For mandatory locking in truncate */ + inarg.valid |= FATTR_LOCKOWNER; + inarg.lock_owner = fuse_lock_owner_id(fc, current->files); + } + fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + err = fuse_simple_request(fc, &args); + if (err) { + if (err == -EINTR) + fuse_invalidate_attr(inode); + goto error; + } + + if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + make_bad_inode(inode); + err = -EIO; + goto error; + } + + spin_lock(&fc->lock); + /* the kernel maintains i_mtime locally */ + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + /* FIXME: clear I_DIRTY_SYNC? */ + } + + fuse_change_attributes_common(inode, &outarg.attr, + attr_timeout(&outarg)); + oldsize = inode->i_size; + /* see the comment in fuse_change_attributes() */ + if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) + i_size_write(inode, outarg.attr.size); + + if (is_truncate) { + /* NOTE: this may release/reacquire fc->lock */ + __fuse_release_nowrite(inode); + } + spin_unlock(&fc->lock); + + /* + * Only call invalidate_inode_pages2() after removing + * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. + */ + if ((is_truncate || !is_wb) && + S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + truncate_pagecache(inode, outarg.attr.size); + invalidate_inode_pages2(inode->i_mapping); + } + + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + return 0; + +error: + if (is_truncate) + fuse_release_nowrite(inode); + + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + return err; +} + +static int fuse_setattr(struct dentry *entry, struct iattr *attr) +{ + struct inode *inode = d_inode(entry); + + if (!fuse_allow_current_process(get_fuse_conn(inode))) + return -EACCES; + + if (attr->ia_valid & ATTR_FILE) + return fuse_do_setattr(inode, attr, attr->ia_file); + else + return fuse_do_setattr(inode, attr, NULL); +} + +static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, + struct kstat *stat) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + return fuse_update_attributes(inode, stat, NULL, NULL); +} + +static int fuse_setxattr(struct dentry *entry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_setxattr_in inarg; + int err; + + if (fc->no_setxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + inarg.flags = flags; + args.in.h.opcode = FUSE_SETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 3; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + args.in.args[2].size = size; + args.in.args[2].value = value; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_setxattr = 1; + err = -EOPNOTSUPP; + } + if (!err) { + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } + return err; +} + +static ssize_t fuse_getxattr(struct dentry *entry, const char *name, + void *value, size_t size) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fc->no_getxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.in.h.opcode = FUSE_GETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + /* This is really two different operations rolled into one */ + args.out.numargs = 1; + if (size) { + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = value; + } else { + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = outarg.size; + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + if (fc->no_listxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.in.h.opcode = FUSE_LISTXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + /* This is really two different operations rolled into one */ + args.out.numargs = 1; + if (size) { + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = list; + } else { + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = outarg.size; + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +static int fuse_removexattr(struct dentry *entry, const char *name) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + int err; + + if (fc->no_removexattr) + return -EOPNOTSUPP; + + args.in.h.opcode = FUSE_REMOVEXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = strlen(name) + 1; + args.in.args[0].value = name; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_removexattr = 1; + err = -EOPNOTSUPP; + } + if (!err) { + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } + return err; +} + +static const struct inode_operations fuse_dir_inode_operations = { + .lookup = fuse_lookup, + .mkdir = fuse_mkdir, + .symlink = fuse_symlink, + .unlink = fuse_unlink, + .rmdir = fuse_rmdir, + .rename2 = fuse_rename2, + .link = fuse_link, + .setattr = fuse_setattr, + .create = fuse_create, + .atomic_open = fuse_atomic_open, + .mknod = fuse_mknod, + .permission = fuse_permission, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +static const struct file_operations fuse_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = fuse_readdir, + .open = fuse_dir_open, + .release = fuse_dir_release, + .fsync = fuse_dir_fsync, + .unlocked_ioctl = fuse_dir_ioctl, + .compat_ioctl = fuse_dir_compat_ioctl, +}; + +static const struct inode_operations fuse_common_inode_operations = { + .setattr = fuse_setattr, + .permission = fuse_permission, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +static const struct inode_operations fuse_symlink_inode_operations = { + .setattr = fuse_setattr, + .follow_link = fuse_follow_link, + .put_link = fuse_put_link, + .readlink = generic_readlink, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +void fuse_init_common(struct inode *inode) +{ + inode->i_op = &fuse_common_inode_operations; +} + +void fuse_init_dir(struct inode *inode) +{ + inode->i_op = &fuse_dir_inode_operations; + inode->i_fop = &fuse_dir_operations; +} + +void fuse_init_symlink(struct inode *inode) +{ + inode->i_op = &fuse_symlink_inode_operations; +} diff --git a/kernel/fs/fuse/file.c b/kernel/fs/fuse/file.c new file mode 100644 index 000000000..5ef05b5c4 --- /dev/null +++ b/kernel/fs/fuse/file.c @@ -0,0 +1,2994 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct file_operations fuse_direct_io_file_operations; + +static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + int opcode, struct fuse_open_out *outargp) +{ + struct fuse_open_in inarg; + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fc->atomic_o_trunc) + inarg.flags &= ~O_TRUNC; + args.in.h.opcode = opcode; + args.in.h.nodeid = nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(*outargp); + args.out.args[0].value = outargp; + + return fuse_simple_request(fc, &args); +} + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +{ + struct fuse_file *ff; + + ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + if (unlikely(!ff)) + return NULL; + + ff->fc = fc; + ff->reserved_req = fuse_request_alloc(0); + if (unlikely(!ff->reserved_req)) { + kfree(ff); + return NULL; + } + + INIT_LIST_HEAD(&ff->write_entry); + atomic_set(&ff->count, 0); + RB_CLEAR_NODE(&ff->polled_node); + init_waitqueue_head(&ff->poll_wait); + + spin_lock(&fc->lock); + ff->kh = ++fc->khctr; + spin_unlock(&fc->lock); + + return ff; +} + +void fuse_file_free(struct fuse_file *ff) +{ + fuse_request_free(ff->reserved_req); + kfree(ff); +} + +struct fuse_file *fuse_file_get(struct fuse_file *ff) +{ + atomic_inc(&ff->count); + return ff; +} + +static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +{ + iput(req->misc.release.inode); +} + +static void fuse_file_put(struct fuse_file *ff, bool sync) +{ + if (atomic_dec_and_test(&ff->count)) { + struct fuse_req *req = ff->reserved_req; + + if (ff->fc->no_open) { + /* + * Drop the release request when client does not + * implement 'open' + */ + req->background = 0; + iput(req->misc.release.inode); + fuse_put_request(ff->fc, req); + } else if (sync) { + req->background = 0; + fuse_request_send(ff->fc, req); + iput(req->misc.release.inode); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + req->background = 1; + fuse_request_send_background(ff->fc, req); + } + kfree(ff); + } +} + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir) +{ + struct fuse_file *ff; + int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + + ff = fuse_file_alloc(fc); + if (!ff) + return -ENOMEM; + + ff->fh = 0; + ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ + if (!fc->no_open || isdir) { + struct fuse_open_out outarg; + int err; + + err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + if (!err) { + ff->fh = outarg.fh; + ff->open_flags = outarg.open_flags; + + } else if (err != -ENOSYS || isdir) { + fuse_file_free(ff); + return err; + } else { + fc->no_open = 1; + } + } + + if (isdir) + ff->open_flags &= ~FOPEN_DIRECT_IO; + + ff->nodeid = nodeid; + file->private_data = fuse_file_get(ff); + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_do_open); + +static void fuse_link_write_file(struct file *file) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); +} + +void fuse_finish_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (ff->open_flags & FOPEN_DIRECT_IO) + file->f_op = &fuse_direct_io_file_operations; + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + if (ff->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); + } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) + fuse_link_write_file(file); +} + +int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; + + err = generic_file_open(inode, file); + if (err) + return err; + + if (lock_inode) + mutex_lock(&inode->i_mutex); + + err = fuse_do_open(fc, get_node_id(inode), file, isdir); + + if (!err) + fuse_finish_open(inode, file); + + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; +} + +static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +{ + struct fuse_conn *fc = ff->fc; + struct fuse_req *req = ff->reserved_req; + struct fuse_release_in *inarg = &req->misc.release.in; + + spin_lock(&fc->lock); + list_del(&ff->write_entry); + if (!RB_EMPTY_NODE(&ff->polled_node)) + rb_erase(&ff->polled_node, &fc->polled_files); + spin_unlock(&fc->lock); + + wake_up_interruptible_all(&ff->poll_wait); + + inarg->fh = ff->fh; + inarg->flags = flags; + req->in.h.opcode = opcode; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct fuse_release_in); + req->in.args[0].value = inarg; +} + +void fuse_release_common(struct file *file, int opcode) +{ + struct fuse_file *ff; + struct fuse_req *req; + + ff = file->private_data; + if (unlikely(!ff)) + return; + + req = ff->reserved_req; + fuse_prepare_release(ff, file->f_flags, opcode); + + if (ff->flock) { + struct fuse_release_in *inarg = &req->misc.release.in; + inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + inarg->lock_owner = fuse_lock_owner_id(ff->fc, + (fl_owner_t) file); + } + /* Hold inode until release is finished */ + req->misc.release.inode = igrab(file_inode(file)); + + /* + * Normally this will send the RELEASE request, however if + * some asynchronous READ or WRITE requests are outstanding, + * the sending will be delayed. + * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. + */ + fuse_file_put(ff, ff->fc->destroy_req != NULL); +} + +static int fuse_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, false); +} + +static int fuse_release(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see fuse_vma_close() for !writeback_cache case */ + if (fc->writeback_cache) + write_inode_now(inode, 1); + + fuse_release_common(file, FUSE_RELEASE); + + /* return value is ignored by VFS */ + return 0; +} + +void fuse_sync_release(struct fuse_file *ff, int flags) +{ + WARN_ON(atomic_read(&ff->count) > 1); + fuse_prepare_release(ff, flags, FUSE_RELEASE); + ff->reserved_req->force = 1; + ff->reserved_req->background = 0; + fuse_request_send(ff->fc, ff->reserved_req); + fuse_put_request(ff->fc, ff->reserved_req); + kfree(ff); +} +EXPORT_SYMBOL_GPL(fuse_sync_release); + +/* + * Scramble the ID space with XTEA, so that the value of the files_struct + * pointer is not exposed to userspace. + */ +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) +{ + u32 *k = fc->scramble_key; + u64 v = (unsigned long) id; + u32 v0 = v; + u32 v1 = v >> 32; + u32 sum = 0; + int i; + + for (i = 0; i < 32; i++) { + v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); + sum += 0x9E3779B9; + v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); + } + + return (u64) v0 + ((u64) v1 << 32); +} + +/* + * Check if any page in a range is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + bool found = false; + + spin_lock(&fc->lock); + list_for_each_entry(req, &fi->writepages, writepages_entry) { + pgoff_t curr_index; + + BUG_ON(req->inode != inode); + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (idx_from < curr_index + req->num_pages && + curr_index <= idx_to) { + found = true; + break; + } + } + spin_unlock(&fc->lock); + + return found; +} + +static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + return fuse_range_is_writeback(inode, index, index); +} + +/* + * Wait for page writeback to be completed. + * + * Since fuse doesn't rely on the VM writeback tracking, this has to + * use some other means. + */ +static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); + return 0; +} + +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + +static int fuse_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_req *req; + struct fuse_flush_in inarg; + int err; + + if (is_bad_inode(inode)) + return -EIO; + + if (fc->no_flush) + return 0; + + err = write_inode_now(inode, 1); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + fuse_sync_writes(inode); + mutex_unlock(&inode->i_mutex); + + req = fuse_get_req_nofail_nopages(fc, file); + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fc, id); + req->in.h.opcode = FUSE_FLUSH; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->force = 1; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_flush = 1; + err = 0; + } + return err; +} + +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int isdir) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + FUSE_ARGS(args); + struct fuse_fsync_in inarg; + int err; + + if (is_bad_inode(inode)) + return -EIO; + + mutex_lock(&inode->i_mutex); + + /* + * Start writeback against all dirty pages of the inode, then + * wait for all outstanding writes, before sending the FSYNC + * request. + */ + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + goto out; + + fuse_sync_writes(inode); + err = sync_inode_metadata(inode, 1); + if (err) + goto out; + + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + goto out; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.fsync_flags = datasync ? 1 : 0; + args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + if (isdir) + fc->no_fsyncdir = 1; + else + fc->no_fsync = 1; + err = 0; + } +out: + mutex_unlock(&inode->i_mutex); + return err; +} + +static int fuse_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + return fuse_fsync_common(file, start, end, datasync, 0); +} + +void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, + size_t count, int opcode) +{ + struct fuse_read_in *inarg = &req->misc.read.in; + struct fuse_file *ff = file->private_data; + + inarg->fh = ff->fh; + inarg->offset = pos; + inarg->size = count; + inarg->flags = file->f_flags; + req->in.h.opcode = opcode; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct fuse_read_in); + req->in.args[0].value = inarg; + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = count; +} + +static void fuse_release_user_pages(struct fuse_req *req, int write) +{ + unsigned i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (write) + set_page_dirty_lock(page); + put_page(page); + } +} + +static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) +{ + if (io->err) + return io->err; + + if (io->bytes >= 0 && io->write) + return -EIO; + + return io->bytes < 0 ? io->size : io->bytes; +} + +/** + * In case of short read, the caller sets 'pos' to the position of + * actual end of fuse request in IO request. Otherwise, if bytes_requested + * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. + * + * An example: + * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * both submitted asynchronously. The first of them was ACKed by userspace as + * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The + * second request was ACKed as short, e.g. only 1K was read, resulting in + * pos == 33K. + * + * Thus, when all fuse requests are completed, the minimal non-negative 'pos' + * will be equal to the length of the longest contiguous fragment of + * transferred data starting from the beginning of IO request. + */ +static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) +{ + bool is_sync = is_sync_kiocb(io->iocb); + int left; + + spin_lock(&io->lock); + if (err) + io->err = io->err ? : err; + else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) + io->bytes = pos; + + left = --io->reqs; + if (!left && is_sync) + complete(io->done); + spin_unlock(&io->lock); + + if (!left && !is_sync) { + ssize_t res = fuse_get_res_by_io(io); + + if (res >= 0) { + struct inode *inode = file_inode(io->iocb->ki_filp); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + spin_unlock(&fc->lock); + } + + io->iocb->ki_complete(io->iocb, res, 0); + kfree(io); + } +} + +static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_io_priv *io = req->io; + ssize_t pos = -1; + + fuse_release_user_pages(req, !io->write); + + if (io->write) { + if (req->misc.write.in.size != req->misc.write.out.size) + pos = req->misc.write.in.offset - io->offset + + req->misc.write.out.size; + } else { + if (req->misc.read.in.size != req->out.args[0].size) + pos = req->misc.read.in.offset - io->offset + + req->out.args[0].size; + } + + fuse_aio_complete(io, req->out.h.error, pos); +} + +static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, + size_t num_bytes, struct fuse_io_priv *io) +{ + spin_lock(&io->lock); + io->size += num_bytes; + io->reqs++; + spin_unlock(&io->lock); + + req->io = io; + req->end = fuse_aio_complete_req; + + __fuse_get_request(req); + fuse_request_send_background(fc, req); + + return num_bytes; +} + +static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, + loff_t pos, size_t count, fl_owner_t owner) +{ + struct file *file = io->file; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_read_fill(req, file, pos, count, FUSE_READ); + if (owner != NULL) { + struct fuse_read_in *inarg = &req->misc.read.in; + + inarg->read_flags |= FUSE_READ_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fc, owner); + } + + if (io->async) + return fuse_async_req_send(fc, req, count, io); + + fuse_request_send(fc, req); + return req->out.args[0].size; +} + +static void fuse_read_update_size(struct inode *inode, loff_t size, + u64 attr_ver) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + if (attr_ver == fi->attr_version && size < inode->i_size && + !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { + fi->attr_version = ++fc->attr_version; + i_size_write(inode, size); + } + spin_unlock(&fc->lock); +} + +static void fuse_short_read(struct fuse_req *req, struct inode *inode, + u64 attr_ver) +{ + size_t num_read = req->out.args[0].size; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fc->writeback_cache) { + /* + * A hole in a file. Some data after the hole are in page cache, + * but have not reached the client fs yet. So, the hole is not + * present there. + */ + int i; + int start_idx = num_read >> PAGE_CACHE_SHIFT; + size_t off = num_read & (PAGE_CACHE_SIZE - 1); + + for (i = start_idx; i < req->num_pages; i++) { + zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE); + off = 0; + } + } else { + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, attr_ver); + } +} + +static int fuse_do_readpage(struct file *file, struct page *page) +{ + struct fuse_io_priv io = { .async = 0, .file = file }; + struct inode *inode = page->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + size_t num_read; + loff_t pos = page_offset(page); + size_t count = PAGE_CACHE_SIZE; + u64 attr_ver; + int err; + + /* + * Page writeback can extend beyond the lifetime of the + * page-cache page, so make sure we read a properly synced + * page. + */ + fuse_wait_on_page_writeback(inode, page->index); + + req = fuse_get_req(fc, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + attr_ver = fuse_get_attr_version(fc); + + req->out.page_zeroing = 1; + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_descs[0].length = count; + num_read = fuse_send_read(req, &io, pos, count, NULL); + err = req->out.h.error; + + if (!err) { + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (num_read < count) + fuse_short_read(req, inode, attr_ver); + + SetPageUptodate(page); + } + + fuse_put_request(fc, req); + + return err; +} + +static int fuse_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + err = fuse_do_readpage(file, page); + fuse_invalidate_atime(inode); + out: + unlock_page(page); + return err; +} + +static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + size_t count = req->misc.read.in.size; + size_t num_read = req->out.args[0].size; + struct address_space *mapping = NULL; + + for (i = 0; mapping == NULL && i < req->num_pages; i++) + mapping = req->pages[i]->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) + fuse_short_read(req, inode, req->misc.read.attr_ver); + + fuse_invalidate_atime(inode); + } + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (!req->out.h.error) + SetPageUptodate(page); + else + SetPageError(page); + unlock_page(page); + page_cache_release(page); + } + if (req->ff) + fuse_file_put(req->ff, false); +} + +static void fuse_send_readpages(struct fuse_req *req, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + loff_t pos = page_offset(req->pages[0]); + size_t count = req->num_pages << PAGE_CACHE_SHIFT; + + req->out.argpages = 1; + req->out.page_zeroing = 1; + req->out.page_replace = 1; + fuse_read_fill(req, file, pos, count, FUSE_READ); + req->misc.read.attr_ver = fuse_get_attr_version(fc); + if (fc->async_read) { + req->ff = fuse_file_get(ff); + req->end = fuse_readpages_end; + fuse_request_send_background(fc, req); + } else { + fuse_request_send(fc, req); + fuse_readpages_end(fc, req); + fuse_put_request(fc, req); + } +} + +struct fuse_fill_data { + struct fuse_req *req; + struct file *file; + struct inode *inode; + unsigned nr_pages; +}; + +static int fuse_readpages_fill(void *_data, struct page *page) +{ + struct fuse_fill_data *data = _data; + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + fuse_wait_on_page_writeback(inode, page->index); + + if (req->num_pages && + (req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || + req->pages[req->num_pages - 1]->index + 1 != page->index)) { + int nr_alloc = min_t(unsigned, data->nr_pages, + FUSE_MAX_PAGES_PER_REQ); + fuse_send_readpages(req, data->file); + if (fc->async_read) + req = fuse_get_req_for_background(fc, nr_alloc); + else + req = fuse_get_req(fc, nr_alloc); + + data->req = req; + if (IS_ERR(req)) { + unlock_page(page); + return PTR_ERR(req); + } + } + + if (WARN_ON(req->num_pages >= req->max_pages)) { + fuse_put_request(fc, req); + return -EIO; + } + + page_cache_get(page); + req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = PAGE_SIZE; + req->num_pages++; + data->nr_pages--; + return 0; +} + +static int fuse_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_fill_data data; + int err; + int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + data.file = file; + data.inode = inode; + if (fc->async_read) + data.req = fuse_get_req_for_background(fc, nr_alloc); + else + data.req = fuse_get_req(fc, nr_alloc); + data.nr_pages = nr_pages; + err = PTR_ERR(data.req); + if (IS_ERR(data.req)) + goto out; + + err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file); + else + fuse_put_request(fc, data.req); + } +out: + return err; +} + +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * In auto invalidate mode, always update attributes on read. + * Otherwise, only update if we attempt to read past EOF (to ensure + * i_size is up to date). + */ + if (fc->auto_inval_data || + (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { + int err; + err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); + if (err) + return err; + } + + return generic_file_read_iter(iocb, to); +} + +static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, + loff_t pos, size_t count) +{ + struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_write_out *outarg = &req->misc.write.out; + + inarg->fh = ff->fh; + inarg->offset = pos; + inarg->size = count; + req->in.h.opcode = FUSE_WRITE; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 2; + if (ff->fc->minor < 9) + req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + else + req->in.args[0].size = sizeof(struct fuse_write_in); + req->in.args[0].value = inarg; + req->in.args[1].size = count; + req->out.numargs = 1; + req->out.args[0].size = sizeof(struct fuse_write_out); + req->out.args[0].value = outarg; +} + +static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, + loff_t pos, size_t count, fl_owner_t owner) +{ + struct file *file = io->file; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_write_in *inarg = &req->misc.write.in; + + fuse_write_fill(req, ff, pos, count); + inarg->flags = file->f_flags; + if (owner != NULL) { + inarg->write_flags |= FUSE_WRITE_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fc, owner); + } + + if (io->async) + return fuse_async_req_send(fc, req, count, io); + + fuse_request_send(fc, req); + return req->misc.write.out.size; +} + +bool fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + bool ret = false; + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + if (pos > inode->i_size) { + i_size_write(inode, pos); + ret = true; + } + spin_unlock(&fc->lock); + + return ret; +} + +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + struct fuse_io_priv io = { .async = 0, .file = file }; + + for (i = 0; i < req->num_pages; i++) + fuse_wait_on_page_writeback(inode, req->pages[i]->index); + + res = fuse_send_write(req, &io, pos, count, NULL); + + offset = req->page_descs[0].offset; + count = res; + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + + if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (count > PAGE_CACHE_SIZE - offset) + count -= PAGE_CACHE_SIZE - offset; + else + count = 0; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req->in.argpages = 1; + req->page_descs[0].offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + break; + + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = tmp; + req->num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + if (!fc->big_writes) + break; + } while (iov_iter_count(ii) && count < fc->max_write && + req->num_pages < req->max_pages && offset == 0); + + return count > 0 ? count : err; +} + +static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +{ + return min_t(unsigned, + ((pos + len - 1) >> PAGE_CACHE_SHIFT) - + (pos >> PAGE_CACHE_SHIFT) + 1, + FUSE_MAX_PAGES_PER_REQ); +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + if (inode->i_size < pos + iov_iter_count(ii)) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + do { + struct fuse_req *req; + ssize_t count; + unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); + + req = fuse_get_req(fc, nr_pages); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count <= 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req->out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count) + err = -EIO; + } + } + fuse_put_request(fc, req); + } while (!err && iov_iter_count(ii)); + + if (res > 0) + fuse_write_update_size(inode, pos); + + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + fuse_invalidate_attr(inode); + + return res > 0 ? res : err; +} + +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + ssize_t written = 0; + ssize_t written_buffered = 0; + struct inode *inode = mapping->host; + ssize_t err; + loff_t endbyte = 0; + + if (get_fuse_conn(inode)->writeback_cache) { + /* Update size (EOF optimization) and mode (SUID clearing) */ + err = fuse_update_attributes(mapping->host, NULL, file, NULL); + if (err) + return err; + + return generic_file_write_iter(iocb, from); + } + + mutex_lock(&inode->i_mutex); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + + err = generic_write_checks(iocb, from); + if (err <= 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + + if (iocb->ki_flags & IOCB_DIRECT) { + loff_t pos = iocb->ki_pos; + written = generic_file_direct_write(iocb, from, pos); + if (written < 0 || !iov_iter_count(from)) + goto out; + + pos += written; + + written_buffered = fuse_perform_write(file, mapping, from, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + endbyte = pos + written_buffered - 1; + + err = filemap_write_and_wait_range(file->f_mapping, pos, + endbyte); + if (err) + goto out; + + invalidate_mapping_pages(file->f_mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + + written += written_buffered; + iocb->ki_pos = pos + written_buffered; + } else { + written = fuse_perform_write(file, mapping, from, iocb->ki_pos); + if (written >= 0) + iocb->ki_pos += written; + } +out: + current->backing_dev_info = NULL; + mutex_unlock(&inode->i_mutex); + + return written ? written : err; +} + +static inline void fuse_page_descs_length_init(struct fuse_req *req, + unsigned index, unsigned nr_pages) +{ + int i; + + for (i = index; i < index + nr_pages; i++) + req->page_descs[i].length = PAGE_SIZE - + req->page_descs[i].offset; +} + +static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) +{ + return (unsigned long)ii->iov->iov_base + ii->iov_offset; +} + +static inline size_t fuse_get_frag_size(const struct iov_iter *ii, + size_t max_size) +{ + return min(iov_iter_single_seg_count(ii), max_size); +} + +static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, + size_t *nbytesp, int write) +{ + size_t nbytes = 0; /* # bytes already packed in req */ + + /* Special case for kernel I/O: can copy directly into the buffer */ + if (ii->type & ITER_KVEC) { + unsigned long user_addr = fuse_get_user_addr(ii); + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + + if (write) + req->in.args[1].value = (void *) user_addr; + else + req->out.args[0].value = (void *) user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + unsigned npages; + size_t start; + ssize_t ret = iov_iter_get_pages(ii, + &req->pages[req->num_pages], + *nbytesp - nbytes, + req->max_pages - req->num_pages, + &start); + if (ret < 0) + return ret; + + iov_iter_advance(ii, ret); + nbytes += ret; + + ret += start; + npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + + req->page_descs[req->num_pages].offset = start; + fuse_page_descs_length_init(req, req->num_pages, npages); + + req->num_pages += npages; + req->page_descs[req->num_pages - 1].length -= + (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + } + + if (write) + req->in.argpages = 1; + else + req->out.argpages = 1; + + *nbytesp = nbytes; + + return 0; +} + +static inline int fuse_iter_npages(const struct iov_iter *ii_p) +{ + return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); +} + +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags) +{ + int write = flags & FUSE_DIO_WRITE; + int cuse = flags & FUSE_DIO_CUSE; + struct file *file = io->file; + struct inode *inode = file->f_mapping->host; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + size_t nmax = write ? fc->max_write : fc->max_read; + loff_t pos = *ppos; + size_t count = iov_iter_count(iter); + pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT; + pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; + ssize_t res = 0; + struct fuse_req *req; + + if (io->async) + req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); + else + req = fuse_get_req(fc, fuse_iter_npages(iter)); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!write) + mutex_lock(&inode->i_mutex); + fuse_sync_writes(inode); + if (!write) + mutex_unlock(&inode->i_mutex); + } + + while (count) { + size_t nres; + fl_owner_t owner = current->files; + size_t nbytes = min(count, nmax); + int err = fuse_get_user_pages(req, iter, &nbytes, write); + if (err) { + res = err; + break; + } + + if (write) + nres = fuse_send_write(req, io, pos, nbytes, owner); + else + nres = fuse_send_read(req, io, pos, nbytes, owner); + + if (!io->async) + fuse_release_user_pages(req, !write); + if (req->out.h.error) { + if (!res) + res = req->out.h.error; + break; + } else if (nres > nbytes) { + res = -EIO; + break; + } + count -= nres; + res += nres; + pos += nres; + if (nres != nbytes) + break; + if (count) { + fuse_put_request(fc, req); + if (io->async) + req = fuse_get_req_for_background(fc, + fuse_iter_npages(iter)); + else + req = fuse_get_req(fc, fuse_iter_npages(iter)); + if (IS_ERR(req)) + break; + } + } + if (!IS_ERR(req)) + fuse_put_request(fc, req); + if (res > 0) + *ppos = pos; + + return res; +} +EXPORT_SYMBOL_GPL(fuse_direct_io); + +static ssize_t __fuse_direct_read(struct fuse_io_priv *io, + struct iov_iter *iter, + loff_t *ppos) +{ + ssize_t res; + struct file *file = io->file; + struct inode *inode = file_inode(file); + + if (is_bad_inode(inode)) + return -EIO; + + res = fuse_direct_io(io, iter, ppos, 0); + + fuse_invalidate_attr(inode); + + return res; +} + +static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp }; + return __fuse_direct_read(&io, to, &iocb->ki_pos); +} + +static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fuse_io_priv io = { .async = 0, .file = file }; + ssize_t res; + + if (is_bad_inode(inode)) + return -EIO; + + /* Don't allow parallel writes to the same file */ + mutex_lock(&inode->i_mutex); + res = generic_write_checks(iocb, from); + if (res > 0) + res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + fuse_invalidate_attr(inode); + if (res > 0) + fuse_write_update_size(inode, iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + + return res; +} + +static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + + for (i = 0; i < req->num_pages; i++) + __free_page(req->pages[i]); + + if (req->ff) + fuse_file_put(req->ff, false); +} + +static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); + int i; + + list_del(&req->writepages_entry); + for (i = 0; i < req->num_pages; i++) { + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + } + wake_up(&fi->page_waitq); +} + +/* Called under fc->lock, may release and reacquire it */ +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, + loff_t size) +__releases(fc->lock) +__acquires(fc->lock) +{ + struct fuse_inode *fi = get_fuse_inode(req->inode); + struct fuse_write_in *inarg = &req->misc.write.in; + __u64 data_size = req->num_pages * PAGE_CACHE_SIZE; + + if (!fc->connected) + goto out_free; + + if (inarg->offset + data_size <= size) { + inarg->size = data_size; + } else if (inarg->offset < size) { + inarg->size = size - inarg->offset; + } else { + /* Got truncated off completely */ + goto out_free; + } + + req->in.args[1].size = inarg->size; + fi->writectr++; + fuse_request_send_background_locked(fc, req); + return; + + out_free: + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); + fuse_put_request(fc, req); + spin_lock(&fc->lock); +} + +/* + * If fi->writectr is positive (no truncate or fsync going on) send + * all queued writepage requests. + * + * Called with fc->lock + */ +void fuse_flush_writepages(struct inode *inode) +__releases(fc->lock) +__acquires(fc->lock) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + size_t crop = i_size_read(inode); + struct fuse_req *req; + + while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { + req = list_entry(fi->queued_writes.next, struct fuse_req, list); + list_del_init(&req->list); + fuse_send_writepage(fc, req, crop); + } +} + +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + + mapping_set_error(inode->i_mapping, req->out.h.error); + spin_lock(&fc->lock); + while (req->misc.write.next) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_req *next = req->misc.write.next; + req->misc.write.next = next->misc.write.next; + next->misc.write.next = NULL; + next->ff = fuse_file_get(req->ff); + list_add(&next->writepages_entry, &fi->writepages); + + /* + * Skip fuse_flush_writepages() to make it easy to crop requests + * based on primary request size. + * + * 1st case (trivial): there are no concurrent activities using + * fuse_set/release_nowrite. Then we're on safe side because + * fuse_flush_writepages() would call fuse_send_writepage() + * anyway. + * + * 2nd case: someone called fuse_set_nowrite and it is waiting + * now for completion of all in-flight requests. This happens + * rarely and no more than once per page, so this should be + * okay. + * + * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle + * of fuse_set_nowrite..fuse_release_nowrite section. The fact + * that fuse_set_nowrite returned implies that all in-flight + * requests were completed along with all of their secondary + * requests. Further primary requests are blocked by negative + * writectr. Hence there cannot be any in-flight requests and + * no invocations of fuse_writepage_end() while we're in + * fuse_set_nowrite..fuse_release_nowrite section. + */ + fuse_send_writepage(fc, next, inarg->offset + inarg->size); + } + fi->writectr--; + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); +} + +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = NULL; + + spin_lock(&fc->lock); + if (!list_empty(&fi->write_files)) { + ff = list_entry(fi->write_files.next, struct fuse_file, + write_entry); + fuse_file_get(ff); + } + spin_unlock(&fc->lock); + + return ff; +} + +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fc, fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + ff = __fuse_write_file_get(fc, fi); + err = fuse_flush_times(inode, ff); + if (ff) + fuse_file_put(ff, 0); + + return err; +} + +static int fuse_writepage_locked(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + struct page *tmp_page; + int error = -ENOMEM; + + set_page_writeback(page); + + req = fuse_request_alloc_nofs(1); + if (!req) + goto err; + + req->background = 1; /* writeback always goes to bg_queue */ + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto err_free; + + error = -EIO; + req->ff = fuse_write_file_get(fc, fi); + if (!req->ff) + goto err_nofile; + + fuse_write_fill(req, req->ff, page_offset(page), 0); + + copy_highpage(tmp_page, page); + req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->misc.write.next = NULL; + req->in.argpages = 1; + req->num_pages = 1; + req->pages[0] = tmp_page; + req->page_descs[0].offset = 0; + req->page_descs[0].length = PAGE_SIZE; + req->end = fuse_writepage_end; + req->inode = inode; + + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + end_page_writeback(page); + + return 0; + +err_nofile: + __free_page(tmp_page); +err_free: + fuse_request_free(req); +err: + end_page_writeback(page); + return error; +} + +static int fuse_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + + if (fuse_page_is_writeback(page->mapping->host, page->index)) { + /* + * ->writepages() should be called for sync() and friends. We + * should only get here on direct reclaim and then we are + * allowed to skip a page which is already in flight + */ + WARN_ON(wbc->sync_mode == WB_SYNC_ALL); + + redirty_page_for_writepage(wbc, page); + return 0; + } + + err = fuse_writepage_locked(page); + unlock_page(page); + + return err; +} + +struct fuse_fill_wb_data { + struct fuse_req *req; + struct fuse_file *ff; + struct inode *inode; + struct page **orig_pages; +}; + +static void fuse_writepages_send(struct fuse_fill_wb_data *data) +{ + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + int num_pages = req->num_pages; + int i; + + req->ff = fuse_file_get(data->ff); + spin_lock(&fc->lock); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + for (i = 0; i < num_pages; i++) + end_page_writeback(data->orig_pages[i]); +} + +static bool fuse_writepage_in_flight(struct fuse_req *new_req, + struct page *page) +{ + struct fuse_conn *fc = get_fuse_conn(new_req->inode); + struct fuse_inode *fi = get_fuse_inode(new_req->inode); + struct fuse_req *tmp; + struct fuse_req *old_req; + bool found = false; + pgoff_t curr_index; + + BUG_ON(new_req->num_pages != 0); + + spin_lock(&fc->lock); + list_del(&new_req->writepages_entry); + list_for_each_entry(old_req, &fi->writepages, writepages_entry) { + BUG_ON(old_req->inode != new_req->inode); + curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (curr_index <= page->index && + page->index < curr_index + old_req->num_pages) { + found = true; + break; + } + } + if (!found) { + list_add(&new_req->writepages_entry, &fi->writepages); + goto out_unlock; + } + + new_req->num_pages = 1; + for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { + BUG_ON(tmp->inode != new_req->inode); + curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (tmp->num_pages == 1 && + curr_index == page->index) { + old_req = tmp; + } + } + + if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || + old_req->state == FUSE_REQ_PENDING)) { + struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); + + copy_highpage(old_req->pages[0], page); + spin_unlock(&fc->lock); + + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(page, NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + fuse_writepage_free(fc, new_req); + fuse_request_free(new_req); + goto out; + } else { + new_req->misc.write.next = old_req->misc.write.next; + old_req->misc.write.next = new_req; + } +out_unlock: + spin_unlock(&fc->lock); +out: + return found; +} + +static int fuse_writepages_fill(struct page *page, + struct writeback_control *wbc, void *_data) +{ + struct fuse_fill_wb_data *data = _data; + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct page *tmp_page; + bool is_writeback; + int err; + + if (!data->ff) { + err = -EIO; + data->ff = fuse_write_file_get(fc, get_fuse_inode(inode)); + if (!data->ff) + goto out_unlock; + } + + /* + * Being under writeback is unlikely but possible. For example direct + * read to an mmaped fuse file will set the page dirty twice; once when + * the pages are faulted with get_user_pages(), and then after the read + * completed. + */ + is_writeback = fuse_page_is_writeback(inode, page->index); + + if (req && req->num_pages && + (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write || + data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { + fuse_writepages_send(data); + data->req = NULL; + } + err = -ENOMEM; + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto out_unlock; + + /* + * The page must not be redirtied until the writeout is completed + * (i.e. userspace has sent a reply to the write request). Otherwise + * there could be more than one temporary page instance for each real + * page. + * + * This is ensured by holding the page lock in page_mkwrite() while + * checking fuse_page_is_writeback(). We already hold the page lock + * since clear_page_dirty_for_io() and keep it held until we add the + * request to the fi->writepages list and increment req->num_pages. + * After this fuse_page_is_writeback() will indicate that the page is + * under writeback, so we can release the page lock. + */ + if (data->req == NULL) { + struct fuse_inode *fi = get_fuse_inode(inode); + + err = -ENOMEM; + req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); + if (!req) { + __free_page(tmp_page); + goto out_unlock; + } + + fuse_write_fill(req, data->ff, page_offset(page), 0); + req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->misc.write.next = NULL; + req->in.argpages = 1; + req->background = 1; + req->num_pages = 0; + req->end = fuse_writepage_end; + req->inode = inode; + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + spin_unlock(&fc->lock); + + data->req = req; + } + set_page_writeback(page); + + copy_highpage(tmp_page, page); + req->pages[req->num_pages] = tmp_page; + req->page_descs[req->num_pages].offset = 0; + req->page_descs[req->num_pages].length = PAGE_SIZE; + + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + + err = 0; + if (is_writeback && fuse_writepage_in_flight(req, page)) { + end_page_writeback(page); + data->req = NULL; + goto out_unlock; + } + data->orig_pages[req->num_pages] = page; + + /* + * Protected by fc->lock against concurrent access by + * fuse_page_is_writeback(). + */ + spin_lock(&fc->lock); + req->num_pages++; + spin_unlock(&fc->lock); + +out_unlock: + unlock_page(page); + + return err; +} + +static int fuse_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct fuse_fill_wb_data data; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + data.inode = inode; + data.req = NULL; + data.ff = NULL; + + err = -ENOMEM; + data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + sizeof(struct page *), + GFP_NOFS); + if (!data.orig_pages) + goto out; + + err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); + if (data.req) { + /* Ignore errors if we can write at least one page */ + BUG_ON(!data.req->num_pages); + fuse_writepages_send(&data); + err = 0; + } + if (data.ff) + fuse_file_put(data.ff, false); + + kfree(data.orig_pages); +out: + return err; +} + +/* + * It's worthy to make sure that space is reserved on disk for the write, + * but how to implement it without killing performance need more thinking. + */ +static int fuse_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct fuse_conn *fc = get_fuse_conn(file_inode(file)); + struct page *page; + loff_t fsize; + int err = -ENOMEM; + + WARN_ON(!fc->writeback_cache); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + goto error; + + fuse_wait_on_page_writeback(mapping->host, page->index); + + if (PageUptodate(page) || len == PAGE_CACHE_SIZE) + goto success; + /* + * Check if the start this page comes after the end of file, in which + * case the readpage can be optimized away. + */ + fsize = i_size_read(mapping->host); + if (fsize <= (pos & PAGE_CACHE_MASK)) { + size_t off = pos & ~PAGE_CACHE_MASK; + if (off) + zero_user_segment(page, 0, off); + goto success; + } + err = fuse_do_readpage(file, page); + if (err) + goto cleanup; +success: + *pagep = page; + return 0; + +cleanup: + unlock_page(page); + page_cache_release(page); +error: + return err; +} + +static int fuse_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + if (!PageUptodate(page)) { + /* Zero any unwritten bytes at the end of the page */ + size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK; + if (endoff) + zero_user_segment(page, endoff, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + fuse_write_update_size(inode, pos + copied); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + +static int fuse_launder_page(struct page *page) +{ + int err = 0; + if (clear_page_dirty_for_io(page)) { + struct inode *inode = page->mapping->host; + err = fuse_writepage_locked(page); + if (!err) + fuse_wait_on_page_writeback(inode, page->index); + } + return err; +} + +/* + * Write back dirty pages now, because there may not be any suitable + * open files later + */ +static void fuse_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +/* + * Wait for writeback against this page to complete before allowing it + * to be marked dirty again, and hence written back again, possibly + * before the previous writepage completed. + * + * Block here, instead of in ->writepage(), so that the userspace fs + * can only block processes actually operating on the filesystem. + * + * Otherwise unprivileged userspace fs would be able to block + * unrelated: + * + * - page migration + * - sync(2) + * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER + */ +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + + file_update_time(vma->vm_file); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + return VM_FAULT_NOPAGE; + } + + fuse_wait_on_page_writeback(inode, page->index); + return VM_FAULT_LOCKED; +} + +static const struct vm_operations_struct fuse_file_vm_ops = { + .close = fuse_vma_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = fuse_page_mkwrite, +}; + +static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + fuse_link_write_file(file); + + file_accessed(file); + vma->vm_ops = &fuse_file_vm_ops; + return 0; +} + +static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + return generic_file_mmap(file, vma); +} + +static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, + struct file_lock *fl) +{ + switch (ffl->type) { + case F_UNLCK: + break; + + case F_RDLCK: + case F_WRLCK: + if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || + ffl->end < ffl->start) + return -EIO; + + fl->fl_start = ffl->start; + fl->fl_end = ffl->end; + fl->fl_pid = ffl->pid; + break; + + default: + return -EIO; + } + fl->fl_type = ffl->type; + return 0; +} + +static void fuse_lk_fill(struct fuse_args *args, struct file *file, + const struct file_lock *fl, int opcode, pid_t pid, + int flock, struct fuse_lk_in *inarg) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + + memset(inarg, 0, sizeof(*inarg)); + inarg->fh = ff->fh; + inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->lk.start = fl->fl_start; + inarg->lk.end = fl->fl_end; + inarg->lk.type = fl->fl_type; + inarg->lk.pid = pid; + if (flock) + inarg->lk_flags |= FUSE_LK_FLOCK; + args->in.h.opcode = opcode; + args->in.h.nodeid = get_node_id(inode); + args->in.numargs = 1; + args->in.args[0].size = sizeof(*inarg); + args->in.args[0].value = inarg; +} + +static int fuse_getlk(struct file *file, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_lk_in inarg; + struct fuse_lk_out outarg; + int err; + + fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (!err) + err = convert_fuse_file_lock(&outarg.lk, fl); + + return err; +} + +static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_lk_in inarg; + int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; + int err; + + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { + /* NLM needs asynchronous locks, which we don't support yet */ + return -ENOLCK; + } + + /* Unlock on close is handled by the flush method */ + if (fl->fl_flags & FL_CLOSE) + return 0; + + fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg); + err = fuse_simple_request(fc, &args); + + /* locking is restartable */ + if (err == -EINTR) + err = -ERESTARTSYS; + + return err; +} + +static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (cmd == F_CANCELLK) { + err = 0; + } else if (cmd == F_GETLK) { + if (fc->no_lock) { + posix_test_lock(file, fl); + err = 0; + } else + err = fuse_getlk(file, fl); + } else { + if (fc->no_lock) + err = posix_lock_file(file, fl, NULL); + else + err = fuse_setlk(file, fl, 0); + } + return err; +} + +static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (fc->no_flock) { + err = flock_lock_file_wait(file, fl); + } else { + struct fuse_file *ff = file->private_data; + + /* emulate flock with POSIX locks */ + ff->flock = true; + err = fuse_setlk(file, fl, 1); + } + + return err; +} + +static sector_t fuse_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_bmap_in inarg; + struct fuse_bmap_out outarg; + int err; + + if (!inode->i_sb->s_bdev || fc->no_bmap) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + inarg.block = block; + inarg.blocksize = inode->i_sb->s_blocksize; + args.in.h.opcode = FUSE_BMAP; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) + fc->no_bmap = 1; + + return err ? 0 : outarg.block; +} + +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t retval; + struct inode *inode = file_inode(file); + + /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ + if (whence == SEEK_CUR || whence == SEEK_SET) + return generic_file_llseek(file, offset, whence); + + mutex_lock(&inode->i_mutex); + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (!retval) + retval = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return retval; +} + +static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, + unsigned int nr_segs, size_t bytes, bool to_user) +{ + struct iov_iter ii; + int page_idx = 0; + + if (!bytes) + return 0; + + iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes); + + while (iov_iter_count(&ii)) { + struct page *page = pages[page_idx++]; + size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); + void *kaddr; + + kaddr = kmap(page); + + while (todo) { + char __user *uaddr = ii.iov->iov_base + ii.iov_offset; + size_t iov_len = ii.iov->iov_len - ii.iov_offset; + size_t copy = min(todo, iov_len); + size_t left; + + if (!to_user) + left = copy_from_user(kaddr, uaddr, copy); + else + left = copy_to_user(uaddr, kaddr, copy); + + if (unlikely(left)) + return -EFAULT; + + iov_iter_advance(&ii, copy); + todo -= copy; + kaddr += copy; + } + + kunmap(page); + } + + return 0; +} + +/* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + + for (n = 0; n < count; n++, iov++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? */ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. + * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct fuse_req *req = NULL; + struct page **pages = NULL; + struct iovec *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; + size_t in_size, out_size, transferred; + int err; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + err = -ENOMEM; + pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); + if (!pages || !iov_page) + goto out; + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. + */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = iov_page; + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. + */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > FUSE_MAX_PAGES_PER_REQ) + goto out; + while (num_pages < max_pages) { + pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!pages[num_pages]) + goto out; + num_pages++; + } + + req = fuse_get_req(fc, num_pages); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); + req->num_pages = num_pages; + fuse_page_descs_length_init(req, 0, req->num_pages); + + /* okay, let's send it to the client */ + req->in.h.opcode = FUSE_IOCTL; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + if (in_size) { + req->in.numargs++; + req->in.args[1].size = in_size; + req->in.argpages = 1; + + err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, + false); + if (err) + goto out; + } + + req->out.numargs = 2; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + req->out.args[1].size = out_size; + req->out.argpages = 1; + req->out.argvar = 1; + + fuse_request_send(fc, req); + err = req->out.h.error; + transferred = req->out.args[1].size; + fuse_put_request(fc, req); + req = NULL; + if (err) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + void *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + vaddr = kmap_atomic(pages[0]); + err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); + kunmap_atomic(vaddr); + if (err) + goto out; + + in_iov = iov_page; + out_iov = in_iov + in_iovs; + + err = fuse_verify_ioctl_iov(in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(out_iov, out_iovs); + if (err) + goto out; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + out: + if (req) + fuse_put_request(fc, req); + free_page((unsigned long) iov_page); + while (num_pages) + __free_page(pages[--num_pages]); + kfree(pages); + + return err ? err : outarg.result; +} +EXPORT_SYMBOL_GPL(fuse_do_ioctl); + +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + if (is_bad_inode(inode)) + return -EIO; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long fuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, 0); +} + +static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +/* + * All files which have been polled are linked to RB tree + * fuse_conn->polled_files which is indexed by kh. Walk the tree and + * find the matching one. + */ +static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, + struct rb_node **parent_out) +{ + struct rb_node **link = &fc->polled_files.rb_node; + struct rb_node *last = NULL; + + while (*link) { + struct fuse_file *ff; + + last = *link; + ff = rb_entry(last, struct fuse_file, polled_node); + + if (kh < ff->kh) + link = &last->rb_left; + else if (kh > ff->kh) + link = &last->rb_right; + else + return link; + } + + if (parent_out) + *parent_out = last; + return link; +} + +/* + * The file is about to be polled. Make sure it's on the polled_files + * RB tree. Note that files once added to the polled_files tree are + * not removed before the file is released. This is because a file + * polled once is likely to be polled again. + */ +static void fuse_register_polled_file(struct fuse_conn *fc, + struct fuse_file *ff) +{ + spin_lock(&fc->lock); + if (RB_EMPTY_NODE(&ff->polled_node)) { + struct rb_node **link, *uninitialized_var(parent); + + link = fuse_find_polled_node(fc, ff->kh, &parent); + BUG_ON(*link); + rb_link_node(&ff->polled_node, parent, link); + rb_insert_color(&ff->polled_node, &fc->polled_files); + } + spin_unlock(&fc->lock); +} + +unsigned fuse_file_poll(struct file *file, poll_table *wait) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; + struct fuse_poll_out outarg; + FUSE_ARGS(args); + int err; + + if (fc->no_poll) + return DEFAULT_POLLMASK; + + poll_wait(file, &ff->poll_wait, wait); + inarg.events = (__u32)poll_requested_events(wait); + + /* + * Ask for notification iff there's someone waiting for it. + * The client may ignore the flag and always notify. + */ + if (waitqueue_active(&ff->poll_wait)) { + inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; + fuse_register_polled_file(fc, ff); + } + + args.in.h.opcode = FUSE_POLL; + args.in.h.nodeid = ff->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + + if (!err) + return outarg.revents; + if (err == -ENOSYS) { + fc->no_poll = 1; + return DEFAULT_POLLMASK; + } + return POLLERR; +} +EXPORT_SYMBOL_GPL(fuse_file_poll); + +/* + * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and + * wakes up the poll waiters. + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg) +{ + u64 kh = outarg->kh; + struct rb_node **link; + + spin_lock(&fc->lock); + + link = fuse_find_polled_node(fc, kh, NULL); + if (*link) { + struct fuse_file *ff; + + ff = rb_entry(*link, struct fuse_file, polled_node); + wake_up_interruptible_sync(&ff->poll_wait); + } + + spin_unlock(&fc->lock); + return 0; +} + +static void fuse_do_truncate(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + struct iattr attr; + + attr.ia_valid = ATTR_SIZE; + attr.ia_size = i_size_read(inode); + + attr.ia_file = file; + attr.ia_valid |= ATTR_FILE; + + fuse_do_setattr(inode, &attr, file); +} + +static inline loff_t fuse_round_up(loff_t off) +{ + return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); +} + +static ssize_t +fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +{ + DECLARE_COMPLETION_ONSTACK(wait); + ssize_t ret = 0; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + bool async_dio = ff->fc->async_dio; + loff_t pos = 0; + struct inode *inode; + loff_t i_size; + size_t count = iov_iter_count(iter); + struct fuse_io_priv *io; + + pos = offset; + inode = file->f_mapping->host; + i_size = i_size_read(inode); + + if ((iov_iter_rw(iter) == READ) && (offset > i_size)) + return 0; + + /* optimization for short read */ + if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { + if (offset >= i_size) + return 0; + iov_iter_truncate(iter, fuse_round_up(i_size - offset)); + count = iov_iter_count(iter); + } + + io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); + if (!io) + return -ENOMEM; + spin_lock_init(&io->lock); + io->reqs = 1; + io->bytes = -1; + io->size = 0; + io->offset = offset; + io->write = (iov_iter_rw(iter) == WRITE); + io->err = 0; + io->file = file; + /* + * By default, we want to optimize all I/Os with async request + * submission to the client filesystem if supported. + */ + io->async = async_dio; + io->iocb = iocb; + + /* + * We cannot asynchronously extend the size of a file. We have no method + * to wait on real async I/O requests, so we must submit this request + * synchronously. + */ + if (!is_sync_kiocb(iocb) && (offset + count > i_size) && + iov_iter_rw(iter) == WRITE) + io->async = false; + + if (io->async && is_sync_kiocb(iocb)) + io->done = &wait; + + if (iov_iter_rw(iter) == WRITE) { + ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); + fuse_invalidate_attr(inode); + } else { + ret = __fuse_direct_read(io, iter, &pos); + } + + if (io->async) { + fuse_aio_complete(io, ret < 0 ? ret : 0, -1); + + /* we have a non-extending, async request, so return */ + if (!is_sync_kiocb(iocb)) + return -EIOCBQUEUED; + + wait_for_completion(&wait); + ret = fuse_get_res_by_io(io); + } + + kfree(io); + + if (iov_iter_rw(iter) == WRITE) { + if (ret > 0) + fuse_write_update_size(inode, pos); + else if (ret < 0 && offset + count > i_size) + fuse_do_truncate(file); + } + + return ret; +} + +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = ff->fc; + FUSE_ARGS(args); + struct fuse_fallocate_in inarg = { + .fh = ff->fh, + .offset = offset, + .length = length, + .mode = mode + }; + int err; + bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_PUNCH_HOLE); + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (fc->no_fallocate) + return -EOPNOTSUPP; + + if (lock_inode) { + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) { + loff_t endbyte = offset + length - 1; + err = filemap_write_and_wait_range(inode->i_mapping, + offset, endbyte); + if (err) + goto out; + + fuse_sync_writes(inode); + } + } + + if (!(mode & FALLOC_FL_KEEP_SIZE)) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + args.in.h.opcode = FUSE_FALLOCATE; + args.in.h.nodeid = ff->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_fallocate = 1; + err = -EOPNOTSUPP; + } + if (err) + goto out; + + /* we could have extended the file */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + bool changed = fuse_write_update_size(inode, offset + length); + + if (changed && fc->writeback_cache) + file_update_time(file); + } + + if (mode & FALLOC_FL_PUNCH_HOLE) + truncate_pagecache_range(inode, offset, offset + length - 1); + + fuse_invalidate_attr(inode); + +out: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; +} + +static const struct file_operations fuse_file_operations = { + .llseek = fuse_file_llseek, + .read_iter = fuse_file_read_iter, + .write_iter = fuse_file_write_iter, + .mmap = fuse_file_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .flock = fuse_file_flock, + .splice_read = generic_file_splice_read, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, +}; + +static const struct file_operations fuse_direct_io_file_operations = { + .llseek = fuse_file_llseek, + .read_iter = fuse_direct_read_iter, + .write_iter = fuse_direct_write_iter, + .mmap = fuse_direct_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .flock = fuse_file_flock, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, + /* no splice_read */ +}; + +static const struct address_space_operations fuse_file_aops = { + .readpage = fuse_readpage, + .writepage = fuse_writepage, + .writepages = fuse_writepages, + .launder_page = fuse_launder_page, + .readpages = fuse_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .bmap = fuse_bmap, + .direct_IO = fuse_direct_IO, + .write_begin = fuse_write_begin, + .write_end = fuse_write_end, +}; + +void fuse_init_file_inode(struct inode *inode) +{ + inode->i_fop = &fuse_file_operations; + inode->i_data.a_ops = &fuse_file_aops; +} diff --git a/kernel/fs/fuse/fuse_i.h b/kernel/fs/fuse/fuse_i.h new file mode 100644 index 000000000..7354dc142 --- /dev/null +++ b/kernel/fs/fuse/fuse_i.h @@ -0,0 +1,912 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#ifndef _FS_FUSE_I_H +#define _FS_FUSE_I_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** Max number of pages that can be used in a single read request */ +#define FUSE_MAX_PAGES_PER_REQ 32 + +/** Bias for fi->writectr, meaning new writepages must not be sent */ +#define FUSE_NOWRITE INT_MIN + +/** It could be as large as PATH_MAX, but would that have any uses? */ +#define FUSE_NAME_MAX 1024 + +/** Number of dentries for each connection in the control filesystem */ +#define FUSE_CTL_NUM_DENTRIES 5 + +/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem + module will check permissions based on the file mode. Otherwise no + permission checking is done in the kernel */ +#define FUSE_DEFAULT_PERMISSIONS (1 << 0) + +/** If the FUSE_ALLOW_OTHER flag is given, then not only the user + doing the mount will be allowed to access the filesystem */ +#define FUSE_ALLOW_OTHER (1 << 1) + +/** Number of page pointers embedded in fuse_req */ +#define FUSE_REQ_INLINE_PAGES 1 + +/** List of active connections */ +extern struct list_head fuse_conn_list; + +/** Global mutex protecting fuse_conn_list and the control filesystem */ +extern struct mutex fuse_mutex; + +/** Module parameters */ +extern unsigned max_user_bgreq; +extern unsigned max_user_congthresh; + +/* One forget request */ +struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; +}; + +/** FUSE inode */ +struct fuse_inode { + /** Inode data */ + struct inode inode; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** Number of lookups on this inode */ + u64 nlookup; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; + + /** Time in jiffies until the file attributes are valid */ + u64 i_time; + + /** The sticky bit in inode->i_mode may have been removed, so + preserve the original mode */ + umode_t orig_i_mode; + + /** 64 bit inode number */ + u64 orig_ino; + + /** Version of last attribute change */ + u64 attr_version; + + /** Files usable in writepage. Protected by fc->lock */ + struct list_head write_files; + + /** Writepages pending on truncate or fsync */ + struct list_head queued_writes; + + /** Number of sent writes, a negative bias (FUSE_NOWRITE) + * means more writes are blocked */ + int writectr; + + /** Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /** List of writepage requestst (pending or sent) */ + struct list_head writepages; + + /** Miscellaneous bits describing inode state */ + unsigned long state; +}; + +/** FUSE inode state bits */ +enum { + /** Advise readdirplus */ + FUSE_I_ADVISE_RDPLUS, + /** Initialized with readdirplus */ + FUSE_I_INIT_RDPLUS, + /** An operation changing file size is in progress */ + FUSE_I_SIZE_UNSTABLE, +}; + +struct fuse_conn; + +/** FUSE specific file data */ +struct fuse_file { + /** Fuse connection for this file */ + struct fuse_conn *fc; + + /** Request reserved for flush and release */ + struct fuse_req *reserved_req; + + /** Kernel file handle guaranteed to be unique */ + u64 kh; + + /** File handle used by userspace */ + u64 fh; + + /** Node id of this file */ + u64 nodeid; + + /** Refcount */ + atomic_t count; + + /** FOPEN_* flags returned by open */ + u32 open_flags; + + /** Entry on inode's write_files list */ + struct list_head write_entry; + + /** RB node to be linked on fuse_conn->polled_files */ + struct rb_node polled_node; + + /** Wait queue head for poll */ + wait_queue_head_t poll_wait; + + /** Has flock been performed on this file? */ + bool flock:1; +}; + +/** One input argument of a request */ +struct fuse_in_arg { + unsigned size; + const void *value; +}; + +/** The request input */ +struct fuse_in { + /** The request header */ + struct fuse_in_header h; + + /** True if the data for the last argument is in req->pages */ + unsigned argpages:1; + + /** Number of arguments */ + unsigned numargs; + + /** Array of arguments */ + struct fuse_in_arg args[3]; +}; + +/** One output argument of a request */ +struct fuse_arg { + unsigned size; + void *value; +}; + +/** The request output */ +struct fuse_out { + /** Header returned from userspace */ + struct fuse_out_header h; + + /* + * The following bitfields are not changed during the request + * processing + */ + + /** Last argument is variable length (can be shorter than + arg->size) */ + unsigned argvar:1; + + /** Last argument is a list of pages to copy data to */ + unsigned argpages:1; + + /** Zero partially or not copied pages */ + unsigned page_zeroing:1; + + /** Pages may be replaced with new ones */ + unsigned page_replace:1; + + /** Number or arguments */ + unsigned numargs; + + /** Array of arguments */ + struct fuse_arg args[2]; +}; + +/** FUSE page descriptor */ +struct fuse_page_desc { + unsigned int length; + unsigned int offset; +}; + +struct fuse_args { + struct { + struct { + uint32_t opcode; + uint64_t nodeid; + } h; + unsigned numargs; + struct fuse_in_arg args[3]; + + } in; + struct { + unsigned argvar:1; + unsigned numargs; + struct fuse_arg args[2]; + } out; +}; + +#define FUSE_ARGS(args) struct fuse_args args = {} + +/** The request state */ +enum fuse_req_state { + FUSE_REQ_INIT = 0, + FUSE_REQ_PENDING, + FUSE_REQ_READING, + FUSE_REQ_SENT, + FUSE_REQ_WRITING, + FUSE_REQ_FINISHED +}; + +/** The request IO state (for asynchronous processing) */ +struct fuse_io_priv { + int async; + spinlock_t lock; + unsigned reqs; + ssize_t bytes; + size_t size; + __u64 offset; + bool write; + int err; + struct kiocb *iocb; + struct file *file; + struct completion *done; +}; + +/** + * A request to the client + */ +struct fuse_req { + /** This can be on either pending processing or io lists in + fuse_conn */ + struct list_head list; + + /** Entry on the interrupts list */ + struct list_head intr_entry; + + /** refcount */ + atomic_t count; + + /** Unique ID for the interrupt request */ + u64 intr_unique; + + /* + * The following bitfields are either set once before the + * request is queued or setting/clearing them is protected by + * fuse_conn->lock + */ + + /** True if the request has reply */ + unsigned isreply:1; + + /** Force sending of the request even if interrupted */ + unsigned force:1; + + /** The request was aborted */ + unsigned aborted:1; + + /** Request is sent in the background */ + unsigned background:1; + + /** The request has been interrupted */ + unsigned interrupted:1; + + /** Data is being copied to/from the request */ + unsigned locked:1; + + /** Request is counted as "waiting" */ + unsigned waiting:1; + + /** State of the request */ + enum fuse_req_state state; + + /** The request input */ + struct fuse_in in; + + /** The request output */ + struct fuse_out out; + + /** Used to wake up the task waiting for completion of request*/ + wait_queue_head_t waitq; + + /** Data for asynchronous requests */ + union { + struct { + struct fuse_release_in in; + struct inode *inode; + } release; + struct fuse_init_in init_in; + struct fuse_init_out init_out; + struct cuse_init_in cuse_init_in; + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + struct fuse_req *next; + } write; + struct fuse_notify_retrieve_in retrieve_in; + } misc; + + /** page vector */ + struct page **pages; + + /** page-descriptor vector */ + struct fuse_page_desc *page_descs; + + /** size of the 'pages' array */ + unsigned max_pages; + + /** inline page vector */ + struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; + + /** inline page-descriptor vector */ + struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; + + /** number of pages in vector */ + unsigned num_pages; + + /** File used in the request (or NULL) */ + struct fuse_file *ff; + + /** Inode used in the request or NULL */ + struct inode *inode; + + /** AIO control block */ + struct fuse_io_priv *io; + + /** Link on fi->writepages */ + struct list_head writepages_entry; + + /** Request completion callback */ + void (*end)(struct fuse_conn *, struct fuse_req *); + + /** Request is stolen from fuse_file->reserved_req */ + struct file *stolen_file; +}; + +/** + * A Fuse connection. + * + * This structure is created, when the filesystem is mounted, and is + * destroyed, when the client device is closed and the filesystem is + * unmounted. + */ +struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** Refcount */ + atomic_t count; + + struct rcu_head rcu; + + /** The user id for this mount */ + kuid_t user_id; + + /** The group id for this mount */ + kgid_t group_id; + + /** The fuse mount flags for this mount */ + unsigned flags; + + /** Maximum read size */ + unsigned max_read; + + /** Maximum write size */ + unsigned max_write; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The list of pending requests */ + struct list_head pending; + + /** The list of requests being processed */ + struct list_head processing; + + /** The list of requests under I/O */ + struct list_head io; + + /** The next unique kernel file handle */ + u64 khctr; + + /** rbtree of fuse_files waiting for poll events indexed by ph */ + struct rb_root polled_files; + + /** Maximum number of outstanding background requests */ + unsigned max_background; + + /** Number of background requests at which congestion starts */ + unsigned congestion_threshold; + + /** Number of requests currently in the background */ + unsigned num_background; + + /** Number of background requests currently queued for userspace */ + unsigned active_background; + + /** The list of background requests set aside for later queuing */ + struct list_head bg_queue; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** Flag indicating that INIT reply has been received. Allocating + * any fuse request will be suspended until the flag is set */ + int initialized; + + /** Flag indicating if connection is blocked. This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + + /** waitq for reserved requests */ + wait_queue_head_t reserved_req_waitq; + + /** The next unique request id */ + u64 reqctr; + + /** Connection established, cleared on umount, connection + abort and device release */ + unsigned connected; + + /** Connection failed (version mismatch). Cannot race with + setting other bitfields since it is only set once in INIT + reply, before any other request, and never cleared */ + unsigned conn_error:1; + + /** Connection successful. Only set in INIT */ + unsigned conn_init:1; + + /** Do readpages asynchronously? Only set in INIT */ + unsigned async_read:1; + + /** Do not send separate SETATTR request before open(O_TRUNC) */ + unsigned atomic_o_trunc:1; + + /** Filesystem supports NFS exporting. Only set in INIT */ + unsigned export_support:1; + + /** Set if bdi is valid */ + unsigned bdi_initialized:1; + + /** write-back cache policy (default is write-through) */ + unsigned writeback_cache:1; + + /* + * The following bitfields are only for optimization purposes + * and hence races in setting them will not cause malfunction + */ + + /** Is open/release not implemented by fs? */ + unsigned no_open:1; + + /** Is fsync not implemented by fs? */ + unsigned no_fsync:1; + + /** Is fsyncdir not implemented by fs? */ + unsigned no_fsyncdir:1; + + /** Is flush not implemented by fs? */ + unsigned no_flush:1; + + /** Is setxattr not implemented by fs? */ + unsigned no_setxattr:1; + + /** Is getxattr not implemented by fs? */ + unsigned no_getxattr:1; + + /** Is listxattr not implemented by fs? */ + unsigned no_listxattr:1; + + /** Is removexattr not implemented by fs? */ + unsigned no_removexattr:1; + + /** Are posix file locking primitives not implemented by fs? */ + unsigned no_lock:1; + + /** Is access not implemented by fs? */ + unsigned no_access:1; + + /** Is create not implemented by fs? */ + unsigned no_create:1; + + /** Is interrupt not implemented by fs? */ + unsigned no_interrupt:1; + + /** Is bmap not implemented by fs? */ + unsigned no_bmap:1; + + /** Is poll not implemented by fs? */ + unsigned no_poll:1; + + /** Do multi-page cached writes */ + unsigned big_writes:1; + + /** Don't apply umask to creation modes */ + unsigned dont_mask:1; + + /** Are BSD file locking primitives not implemented by fs? */ + unsigned no_flock:1; + + /** Is fallocate not implemented by fs? */ + unsigned no_fallocate:1; + + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + + /** Use enhanced/automatic page cache invalidation. */ + unsigned auto_inval_data:1; + + /** Does the filesystem support readdirplus? */ + unsigned do_readdirplus:1; + + /** Does the filesystem want adaptive readdirplus? */ + unsigned readdirplus_auto:1; + + /** Does the filesystem support asynchronous direct-IO submission? */ + unsigned async_dio:1; + + /** The number of requests waiting for completion */ + atomic_t num_waiting; + + /** Negotiated minor version */ + unsigned minor; + + /** Backing dev info */ + struct backing_dev_info bdi; + + /** Entry on the fuse_conn_list */ + struct list_head entry; + + /** Device ID from super block */ + dev_t dev; + + /** Dentries in the control filesystem */ + struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; + + /** number of dentries used in the above array */ + int ctl_ndents; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; + + /** Key for lock owner ID scrambling */ + u32 scramble_key[4]; + + /** Reserved request for the DESTROY message */ + struct fuse_req *destroy_req; + + /** Version counter for attribute changes */ + u64 attr_version; + + /** Called on final put */ + void (*release)(struct fuse_conn *); + + /** Super block for this connection. */ + struct super_block *sb; + + /** Read/write semaphore to hold when accessing sb. */ + struct rw_semaphore killsb; +}; + +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct fuse_conn *get_fuse_conn(struct inode *inode) +{ + return get_fuse_conn_super(inode->i_sb); +} + +static inline struct fuse_inode *get_fuse_inode(struct inode *inode) +{ + return container_of(inode, struct fuse_inode, inode); +} + +static inline u64 get_node_id(struct inode *inode) +{ + return get_fuse_inode(inode)->nodeid; +} + +/** Device operations */ +extern const struct file_operations fuse_dev_operations; + +extern const struct dentry_operations fuse_dentry_operations; + +/** + * Inode to nodeid comparison. + */ +int fuse_inode_eq(struct inode *inode, void *_nodeidp); + +/** + * Get a filled in inode + */ +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + +/** + * Send FORGET command + */ +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); + +/* Used by READDIRPLUS */ +void fuse_force_forget(struct file *file, u64 nodeid); + +/** + * Initialize READ or READDIR request + */ +void fuse_read_fill(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, int opcode); + +/** + * Send OPEN or OPENDIR request + */ +int fuse_open_common(struct inode *inode, struct file *file, bool isdir); + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +struct fuse_file *fuse_file_get(struct fuse_file *ff); +void fuse_file_free(struct fuse_file *ff); +void fuse_finish_open(struct inode *inode, struct file *file); + +void fuse_sync_release(struct fuse_file *ff, int flags); + +/** + * Send RELEASE or RELEASEDIR request + */ +void fuse_release_common(struct file *file, int opcode); + +/** + * Send FSYNC or FSYNCDIR request + */ +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int isdir); + +/** + * Notify poll wakeup + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg); + +/** + * Initialize file operations on a regular file + */ +void fuse_init_file_inode(struct inode *inode); + +/** + * Initialize inode operations on regular files and special files + */ +void fuse_init_common(struct inode *inode); + +/** + * Initialize inode and file operations on a directory + */ +void fuse_init_dir(struct inode *inode); + +/** + * Initialize inode operations on a symlink + */ +void fuse_init_symlink(struct inode *inode); + +/** + * Change attributes of an inode + */ +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid); + +/** + * Initialize the client device + */ +int fuse_dev_init(void); + +/** + * Cleanup the client device + */ +void fuse_dev_cleanup(void); + +int fuse_ctl_init(void); +void __exit fuse_ctl_cleanup(void); + +/** + * Allocate a request + */ +struct fuse_req *fuse_request_alloc(unsigned npages); + +struct fuse_req *fuse_request_alloc_nofs(unsigned npages); + +/** + * Free a request + */ +void fuse_request_free(struct fuse_req *req); + +/** + * Get a request, may fail with -ENOMEM, + * caller should specify # elements in req->pages[] explicitly + */ +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); +struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, + unsigned npages); + +/* + * Increment reference count on request + */ +void __fuse_get_request(struct fuse_req *req); + +/** + * Gets a requests for a file operation, always succeeds + */ +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file); + +/** + * Decrement reference count of a request. If count goes to zero free + * the request. + */ +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); + +/** + * Send a request (synchronous) + */ +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); + +/** + * Simple request sending that does request allocation and freeing + */ +ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); + +/** + * Send a request in the background + */ +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); + +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req); + +/* Abort all requests */ +void fuse_abort_conn(struct fuse_conn *fc); + +/** + * Invalidate inode attributes + */ +void fuse_invalidate_attr(struct inode *inode); + +void fuse_invalidate_entry_cache(struct dentry *entry); + +void fuse_invalidate_atime(struct inode *inode); + +/** + * Acquire reference to fuse_conn + */ +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); + +/** + * Initialize fuse_conn + */ +void fuse_conn_init(struct fuse_conn *fc); + +/** + * Release reference to fuse_conn + */ +void fuse_conn_put(struct fuse_conn *fc); + +/** + * Add connection to control filesystem + */ +int fuse_ctl_add_conn(struct fuse_conn *fc); + +/** + * Remove connection from control filesystem + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc); + +/** + * Is file type valid? + */ +int fuse_valid_type(int m); + +/** + * Is current process allowed to perform filesystem operation? + */ +int fuse_allow_current_process(struct fuse_conn *fc); + +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); + +int fuse_update_attributes(struct inode *inode, struct kstat *stat, + struct file *file, bool *refreshed); + +void fuse_flush_writepages(struct inode *inode); + +void fuse_set_nowrite(struct inode *inode); +void fuse_release_nowrite(struct inode *inode); + +u64 fuse_get_attr_version(struct fuse_conn *fc); + +/** + * File-system tells the kernel to invalidate cache for the given node id. + */ +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len); + +/** + * File-system tells the kernel to invalidate parent attributes and + * the dentry matching parent/name. + * + * If the child_nodeid is non-zero and: + * - matches the inode number for the dentry matching parent/name, + * - is not a mount point + * - is a file or oan empty directory + * then the dentry is unhashed (d_delete()). + */ +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + u64 child_nodeid, struct qstr *name); + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir); + +/** + * fuse_direct_io() flags + */ + +/** If set, it is WRITE; otherwise - READ */ +#define FUSE_DIO_WRITE (1 << 0) + +/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */ +#define FUSE_DIO_CUSE (1 << 1) + +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags); +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags); +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags); +unsigned fuse_file_poll(struct file *file, poll_table *wait); +int fuse_dev_release(struct inode *inode, struct file *file); + +bool fuse_write_update_size(struct inode *inode, loff_t pos); + +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); + +int fuse_do_setattr(struct inode *inode, struct iattr *attr, + struct file *file); + +void fuse_set_initialized(struct fuse_conn *fc); + +#endif /* _FS_FUSE_I_H */ diff --git a/kernel/fs/fuse/inode.c b/kernel/fs/fuse/inode.c new file mode 100644 index 000000000..18dacf9ed --- /dev/null +++ b/kernel/fs/fuse/inode.c @@ -0,0 +1,1320 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Miklos Szeredi "); +MODULE_DESCRIPTION("Filesystem in Userspace"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *fuse_inode_cachep; +struct list_head fuse_conn_list; +DEFINE_MUTEX(fuse_mutex); + +static int set_global_limit(const char *val, struct kernel_param *kp); + +unsigned max_user_bgreq; +module_param_call(max_user_bgreq, set_global_limit, param_get_uint, + &max_user_bgreq, 0644); +__MODULE_PARM_TYPE(max_user_bgreq, "uint"); +MODULE_PARM_DESC(max_user_bgreq, + "Global limit for the maximum number of backgrounded requests an " + "unprivileged user can set"); + +unsigned max_user_congthresh; +module_param_call(max_user_congthresh, set_global_limit, param_get_uint, + &max_user_congthresh, 0644); +__MODULE_PARM_TYPE(max_user_congthresh, "uint"); +MODULE_PARM_DESC(max_user_congthresh, + "Global limit for the maximum congestion threshold an " + "unprivileged user can set"); + +#define FUSE_SUPER_MAGIC 0x65735546 + +#define FUSE_DEFAULT_BLKSIZE 512 + +/** Maximum number of outstanding background requests */ +#define FUSE_DEFAULT_MAX_BACKGROUND 12 + +/** Congestion starts at 75% of maximum */ +#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + +struct fuse_mount_data { + int fd; + unsigned rootmode; + kuid_t user_id; + kgid_t group_id; + unsigned fd_present:1; + unsigned rootmode_present:1; + unsigned user_id_present:1; + unsigned group_id_present:1; + unsigned flags; + unsigned max_read; + unsigned blksize; +}; + +struct fuse_forget_link *fuse_alloc_forget(void) +{ + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +} + +static struct inode *fuse_alloc_inode(struct super_block *sb) +{ + struct inode *inode; + struct fuse_inode *fi; + + inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); + if (!inode) + return NULL; + + fi = get_fuse_inode(inode); + fi->i_time = 0; + fi->nodeid = 0; + fi->nlookup = 0; + fi->attr_version = 0; + fi->writectr = 0; + fi->orig_ino = 0; + fi->state = 0; + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + INIT_LIST_HEAD(&fi->writepages); + init_waitqueue_head(&fi->page_waitq); + fi->forget = fuse_alloc_forget(); + if (!fi->forget) { + kmem_cache_free(fuse_inode_cachep, inode); + return NULL; + } + + return inode; +} + +static void fuse_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(fuse_inode_cachep, inode); +} + +static void fuse_destroy_inode(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + BUG_ON(!list_empty(&fi->write_files)); + BUG_ON(!list_empty(&fi->queued_writes)); + kfree(fi->forget); + call_rcu(&inode->i_rcu, fuse_i_callback); +} + +static void fuse_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (inode->i_sb->s_flags & MS_ACTIVE) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); + fi->forget = NULL; + } +} + +static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + if (*flags & MS_MANDLOCK) + return -EINVAL; + + return 0; +} + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. + */ +static ino_t fuse_squash_ino(u64 ino64) +{ + ino_t ino = (ino_t) ino64; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; + return ino; +} + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->attr_version = ++fc->attr_version; + fi->i_time = attr_valid; + + inode->i_ino = fuse_squash_ino(attr->ino); + inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + set_nlink(inode, attr->nlink); + inode->i_uid = make_kuid(&init_user_ns, attr->uid); + inode->i_gid = make_kgid(&init_user_ns, attr->gid); + inode->i_blocks = attr->blocks; + inode->i_atime.tv_sec = attr->atime; + inode->i_atime.tv_nsec = attr->atimensec; + /* mtime from server may be stale due to local buffered write */ + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + } + + if (attr->blksize != 0) + inode->i_blkbits = ilog2(attr->blksize); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + + /* + * Don't set the sticky bit in i_mode, unless we want the VFS + * to check permissions. This prevents failures due to the + * check in may_delete(). + */ + fi->orig_i_mode = inode->i_mode; + if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + inode->i_mode &= ~S_ISVTX; + + fi->orig_ino = attr->ino; +} + +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + bool is_wb = fc->writeback_cache; + loff_t oldsize; + struct timespec old_mtime; + + spin_lock(&fc->lock); + if ((attr_version != 0 && fi->attr_version > attr_version) || + test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { + spin_unlock(&fc->lock); + return; + } + + old_mtime = inode->i_mtime; + fuse_change_attributes_common(inode, attr, attr_valid); + + oldsize = inode->i_size; + /* + * In case of writeback_cache enabled, the cached writes beyond EOF + * extend local i_size without keeping userspace server in sync. So, + * attr->size coming from server can be stale. We cannot trust it. + */ + if (!is_wb || !S_ISREG(inode->i_mode)) + i_size_write(inode, attr->size); + spin_unlock(&fc->lock); + + if (!is_wb && S_ISREG(inode->i_mode)) { + bool inval = false; + + if (oldsize != attr->size) { + truncate_pagecache(inode, attr->size); + inval = true; + } else if (fc->auto_inval_data) { + struct timespec new_mtime = { + .tv_sec = attr->mtime, + .tv_nsec = attr->mtimensec, + }; + + /* + * Auto inval mode also checks and invalidates if mtime + * has changed. + */ + if (!timespec_equal(&old_mtime, &new_mtime)) + inval = true; + } + + if (inval) + invalidate_inode_pages2(inode->i_mapping); + } +} + +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +{ + inode->i_mode = attr->mode & S_IFMT; + inode->i_size = attr->size; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + if (S_ISREG(inode->i_mode)) { + fuse_init_common(inode); + fuse_init_file_inode(inode); + } else if (S_ISDIR(inode->i_mode)) + fuse_init_dir(inode); + else if (S_ISLNK(inode->i_mode)) + fuse_init_symlink(inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + fuse_init_common(inode); + init_special_inode(inode, inode->i_mode, + new_decode_dev(attr->rdev)); + } else + BUG(); +} + +int fuse_inode_eq(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + if (get_node_id(inode) == nodeid) + return 1; + else + return 0; +} + +static int fuse_inode_set(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + get_fuse_inode(inode)->nodeid = nodeid; + return 0; +} + +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct inode *inode; + struct fuse_inode *fi; + struct fuse_conn *fc = get_fuse_conn_super(sb); + + retry: + inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); + if (!inode) + return NULL; + + if ((inode->i_state & I_NEW)) { + inode->i_flags |= S_NOATIME; + if (!fc->writeback_cache || !S_ISREG(attr->mode)) + inode->i_flags |= S_NOCMTIME; + inode->i_generation = generation; + fuse_init_inode(inode, attr); + unlock_new_inode(inode); + } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { + /* Inode has changed type, any I/O on the old should fail */ + make_bad_inode(inode); + iput(inode); + goto retry; + } + + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + fuse_change_attributes(inode, attr, attr_valid, attr_version); + + return inode; +} + +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len) +{ + struct inode *inode; + pgoff_t pg_start; + pgoff_t pg_end; + + inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + return -ENOENT; + + fuse_invalidate_attr(inode); + if (offset >= 0) { + pg_start = offset >> PAGE_CACHE_SHIFT; + if (len <= 0) + pg_end = -1; + else + pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + } + iput(inode); + return 0; +} + +static void fuse_umount_begin(struct super_block *sb) +{ + fuse_abort_conn(get_fuse_conn_super(sb)); +} + +static void fuse_send_destroy(struct fuse_conn *fc) +{ + struct fuse_req *req = fc->destroy_req; + if (req && fc->conn_init) { + fc->destroy_req = NULL; + req->in.h.opcode = FUSE_DESTROY; + req->force = 1; + req->background = 0; + fuse_request_send(fc, req); + fuse_put_request(fc, req); + } +} + +static void fuse_bdi_destroy(struct fuse_conn *fc) +{ + if (fc->bdi_initialized) + bdi_destroy(&fc->bdi); +} + +static void fuse_put_super(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fuse_send_destroy(fc); + + fuse_abort_conn(fc); + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); + fuse_bdi_destroy(fc); + + fuse_conn_put(fc); +} + +static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) +{ + stbuf->f_type = FUSE_SUPER_MAGIC; + stbuf->f_bsize = attr->bsize; + stbuf->f_frsize = attr->frsize; + stbuf->f_blocks = attr->blocks; + stbuf->f_bfree = attr->bfree; + stbuf->f_bavail = attr->bavail; + stbuf->f_files = attr->files; + stbuf->f_ffree = attr->ffree; + stbuf->f_namelen = attr->namelen; + /* fsid is left zero */ +} + +static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); + FUSE_ARGS(args); + struct fuse_statfs_out outarg; + int err; + + if (!fuse_allow_current_process(fc)) { + buf->f_type = FUSE_SUPER_MAGIC; + return 0; + } + + memset(&outarg, 0, sizeof(outarg)); + args.in.numargs = 0; + args.in.h.opcode = FUSE_STATFS; + args.in.h.nodeid = get_node_id(d_inode(dentry)); + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (!err) + convert_fuse_statfs(buf, &outarg.st); + return err; +} + +enum { + OPT_FD, + OPT_ROOTMODE, + OPT_USER_ID, + OPT_GROUP_ID, + OPT_DEFAULT_PERMISSIONS, + OPT_ALLOW_OTHER, + OPT_MAX_READ, + OPT_BLKSIZE, + OPT_ERR +}; + +static const match_table_t tokens = { + {OPT_FD, "fd=%u"}, + {OPT_ROOTMODE, "rootmode=%o"}, + {OPT_USER_ID, "user_id=%u"}, + {OPT_GROUP_ID, "group_id=%u"}, + {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, + {OPT_ALLOW_OTHER, "allow_other"}, + {OPT_MAX_READ, "max_read=%u"}, + {OPT_BLKSIZE, "blksize=%u"}, + {OPT_ERR, NULL} +}; + +static int fuse_match_uint(substring_t *s, unsigned int *res) +{ + int err = -ENOMEM; + char *buf = match_strdup(s); + if (buf) { + err = kstrtouint(buf, 10, res); + kfree(buf); + } + return err; +} + +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) +{ + char *p; + memset(d, 0, sizeof(struct fuse_mount_data)); + d->max_read = ~0; + d->blksize = FUSE_DEFAULT_BLKSIZE; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + int value; + unsigned uv; + substring_t args[MAX_OPT_ARGS]; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case OPT_FD: + if (match_int(&args[0], &value)) + return 0; + d->fd = value; + d->fd_present = 1; + break; + + case OPT_ROOTMODE: + if (match_octal(&args[0], &value)) + return 0; + if (!fuse_valid_type(value)) + return 0; + d->rootmode = value; + d->rootmode_present = 1; + break; + + case OPT_USER_ID: + if (fuse_match_uint(&args[0], &uv)) + return 0; + d->user_id = make_kuid(current_user_ns(), uv); + if (!uid_valid(d->user_id)) + return 0; + d->user_id_present = 1; + break; + + case OPT_GROUP_ID: + if (fuse_match_uint(&args[0], &uv)) + return 0; + d->group_id = make_kgid(current_user_ns(), uv); + if (!gid_valid(d->group_id)) + return 0; + d->group_id_present = 1; + break; + + case OPT_DEFAULT_PERMISSIONS: + d->flags |= FUSE_DEFAULT_PERMISSIONS; + break; + + case OPT_ALLOW_OTHER: + d->flags |= FUSE_ALLOW_OTHER; + break; + + case OPT_MAX_READ: + if (match_int(&args[0], &value)) + return 0; + d->max_read = value; + break; + + case OPT_BLKSIZE: + if (!is_bdev || match_int(&args[0], &value)) + return 0; + d->blksize = value; + break; + + default: + return 0; + } + } + + if (!d->fd_present || !d->rootmode_present || + !d->user_id_present || !d->group_id_present) + return 0; + + return 1; +} + +static int fuse_show_options(struct seq_file *m, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); + + seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); + if (fc->flags & FUSE_DEFAULT_PERMISSIONS) + seq_puts(m, ",default_permissions"); + if (fc->flags & FUSE_ALLOW_OTHER) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); + return 0; +} + +void fuse_conn_init(struct fuse_conn *fc) +{ + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); + init_rwsem(&fc->killsb); + atomic_set(&fc->count, 1); + init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->reserved_req_waitq); + INIT_LIST_HEAD(&fc->pending); + INIT_LIST_HEAD(&fc->processing); + INIT_LIST_HEAD(&fc->io); + INIT_LIST_HEAD(&fc->interrupts); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + fc->forget_list_tail = &fc->forget_list_head; + atomic_set(&fc->num_waiting, 0); + fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; + fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; + fc->khctr = 0; + fc->polled_files = RB_ROOT; + fc->reqctr = 0; + fc->blocked = 0; + fc->initialized = 0; + fc->attr_version = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); +} +EXPORT_SYMBOL_GPL(fuse_conn_init); + +void fuse_conn_put(struct fuse_conn *fc) +{ + if (atomic_dec_and_test(&fc->count)) { + if (fc->destroy_req) + fuse_request_free(fc->destroy_req); + fc->release(fc); + } +} +EXPORT_SYMBOL_GPL(fuse_conn_put); + +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) +{ + atomic_inc(&fc->count); + return fc; +} +EXPORT_SYMBOL_GPL(fuse_conn_get); + +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +{ + struct fuse_attr attr; + memset(&attr, 0, sizeof(attr)); + + attr.mode = mode; + attr.ino = FUSE_ROOT_ID; + attr.nlink = 1; + return fuse_iget(sb, 1, 0, &attr, 0, 0); +} + +struct fuse_inode_handle { + u64 nodeid; + u32 generation; +}; + +static struct dentry *fuse_get_dentry(struct super_block *sb, + struct fuse_inode_handle *handle) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct inode *inode; + struct dentry *entry; + int err = -ESTALE; + + if (handle->nodeid == 0) + goto out_err; + + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + if (!inode) { + struct fuse_entry_out outarg; + struct qstr name; + + if (!fc->export_support) + goto out_err; + + name.len = 1; + name.name = "."; + err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, + &inode); + if (err && err != -ENOENT) + goto out_err; + if (err || !inode) { + err = -ESTALE; + goto out_err; + } + err = -EIO; + if (get_node_id(inode) != handle->nodeid) + goto out_iput; + } + err = -ESTALE; + if (inode->i_generation != handle->generation) + goto out_iput; + + entry = d_obtain_alias(inode); + if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(entry); + + return entry; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + int len = parent ? 6 : 3; + u64 nodeid; + u32 generation; + + if (*max_len < len) { + *max_len = len; + return FILEID_INVALID; + } + + nodeid = get_fuse_inode(inode)->nodeid; + generation = inode->i_generation; + + fh[0] = (u32)(nodeid >> 32); + fh[1] = (u32)(nodeid & 0xffffffff); + fh[2] = generation; + + if (parent) { + nodeid = get_fuse_inode(parent)->nodeid; + generation = parent->i_generation; + + fh[3] = (u32)(nodeid >> 32); + fh[4] = (u32)(nodeid & 0xffffffff); + fh[5] = generation; + } + + *max_len = len; + return parent ? 0x82 : 0x81; +} + +static struct dentry *fuse_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle handle; + + if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + return NULL; + + handle.nodeid = (u64) fid->raw[0] << 32; + handle.nodeid |= (u64) fid->raw[1]; + handle.generation = fid->raw[2]; + return fuse_get_dentry(sb, &handle); +} + +static struct dentry *fuse_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle parent; + + if (fh_type != 0x82 || fh_len < 6) + return NULL; + + parent.nodeid = (u64) fid->raw[3] << 32; + parent.nodeid |= (u64) fid->raw[4]; + parent.generation = fid->raw[5]; + return fuse_get_dentry(sb, &parent); +} + +static struct dentry *fuse_get_parent(struct dentry *child) +{ + struct inode *child_inode = d_inode(child); + struct fuse_conn *fc = get_fuse_conn(child_inode); + struct inode *inode; + struct dentry *parent; + struct fuse_entry_out outarg; + struct qstr name; + int err; + + if (!fc->export_support) + return ERR_PTR(-ESTALE); + + name.len = 2; + name.name = ".."; + err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), + &name, &outarg, &inode); + if (err) { + if (err == -ENOENT) + return ERR_PTR(-ESTALE); + return ERR_PTR(err); + } + + parent = d_obtain_alias(inode); + if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(parent); + + return parent; +} + +static const struct export_operations fuse_export_operations = { + .fh_to_dentry = fuse_fh_to_dentry, + .fh_to_parent = fuse_fh_to_parent, + .encode_fh = fuse_encode_fh, + .get_parent = fuse_get_parent, +}; + +static const struct super_operations fuse_super_operations = { + .alloc_inode = fuse_alloc_inode, + .destroy_inode = fuse_destroy_inode, + .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, + .drop_inode = generic_delete_inode, + .remount_fs = fuse_remount_fs, + .put_super = fuse_put_super, + .umount_begin = fuse_umount_begin, + .statfs = fuse_statfs, + .show_options = fuse_show_options, +}; + +static void sanitize_global_limit(unsigned *limit) +{ + if (*limit == 0) + *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / + sizeof(struct fuse_req); + + if (*limit >= 1 << 16) + *limit = (1 << 16) - 1; +} + +static int set_global_limit(const char *val, struct kernel_param *kp) +{ + int rv; + + rv = param_set_uint(val, kp); + if (rv) + return rv; + + sanitize_global_limit((unsigned *)kp->arg); + + return 0; +} + +static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) +{ + int cap_sys_admin = capable(CAP_SYS_ADMIN); + + if (arg->minor < 13) + return; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + if (arg->max_background) { + fc->max_background = arg->max_background; + + if (!cap_sys_admin && fc->max_background > max_user_bgreq) + fc->max_background = max_user_bgreq; + } + if (arg->congestion_threshold) { + fc->congestion_threshold = arg->congestion_threshold; + + if (!cap_sys_admin && + fc->congestion_threshold > max_user_congthresh) + fc->congestion_threshold = max_user_congthresh; + } +} + +static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_init_out *arg = &req->misc.init_out; + + if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) + fc->conn_error = 1; + else { + unsigned long ra_pages; + + process_init_limits(fc, arg); + + if (arg->minor >= 6) { + ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; + if (arg->flags & FUSE_ASYNC_READ) + fc->async_read = 1; + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_lock = 1; + if (arg->minor >= 17) { + if (!(arg->flags & FUSE_FLOCK_LOCKS)) + fc->no_flock = 1; + } else { + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_flock = 1; + } + if (arg->flags & FUSE_ATOMIC_O_TRUNC) + fc->atomic_o_trunc = 1; + if (arg->minor >= 9) { + /* LOOKUP has dependency on proto version */ + if (arg->flags & FUSE_EXPORT_SUPPORT) + fc->export_support = 1; + } + if (arg->flags & FUSE_BIG_WRITES) + fc->big_writes = 1; + if (arg->flags & FUSE_DONT_MASK) + fc->dont_mask = 1; + if (arg->flags & FUSE_AUTO_INVAL_DATA) + fc->auto_inval_data = 1; + if (arg->flags & FUSE_DO_READDIRPLUS) { + fc->do_readdirplus = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; + } + if (arg->flags & FUSE_ASYNC_DIO) + fc->async_dio = 1; + if (arg->flags & FUSE_WRITEBACK_CACHE) + fc->writeback_cache = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fc->sb->s_time_gran = arg->time_gran; + } else { + ra_pages = fc->max_read / PAGE_CACHE_SIZE; + fc->no_lock = 1; + fc->no_flock = 1; + } + + fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); + fc->minor = arg->minor; + fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + fc->max_write = max_t(unsigned, 4096, fc->max_write); + fc->conn_init = 1; + } + fuse_set_initialized(fc); + wake_up_all(&fc->blocked_waitq); +} + +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_init_in *arg = &req->misc.init_in; + + arg->major = FUSE_KERNEL_VERSION; + arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; + arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | + FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | + FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | + FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; + req->in.h.opcode = FUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(*arg); + req->in.args[0].value = arg; + req->out.numargs = 1; + /* Variable length argument used for backward compatibility + with interface version < 7.5. Rest of init_out is zeroed + by do_get_request(), so a short reply is not a problem */ + req->out.argvar = 1; + req->out.args[0].size = sizeof(struct fuse_init_out); + req->out.args[0].value = &req->misc.init_out; + req->end = process_init_reply; + fuse_request_send_background(fc, req); +} + +static void fuse_free_conn(struct fuse_conn *fc) +{ + kfree_rcu(fc, rcu); +} + +static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) +{ + int err; + + fc->bdi.name = "fuse"; + fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + /* fuse does it's own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; + + err = bdi_init(&fc->bdi); + if (err) + return err; + + fc->bdi_initialized = 1; + + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); + } + + if (err) + return err; + + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. + * + * Privileged users can raise it by writing to + * + * /sys/class/bdi//max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); + + return 0; +} + +static int fuse_fill_super(struct super_block *sb, void *data, int silent) +{ + struct fuse_conn *fc; + struct inode *root; + struct fuse_mount_data d; + struct file *file; + struct dentry *root_dentry; + struct fuse_req *init_req; + int err; + int is_bdev = sb->s_bdev != NULL; + + err = -EINVAL; + if (sb->s_flags & MS_MANDLOCK) + goto err; + + sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); + + if (!parse_fuse_opt(data, &d, is_bdev)) + goto err; + + if (is_bdev) { +#ifdef CONFIG_BLOCK + err = -EINVAL; + if (!sb_set_blocksize(sb, d.blksize)) + goto err; +#endif + } else { + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + } + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + sb->s_export_op = &fuse_export_operations; + + file = fget(d.fd); + err = -EINVAL; + if (!file) + goto err; + + if ((file->f_op != &fuse_dev_operations) || + (file->f_cred->user_ns != &init_user_ns)) + goto err_fput; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) + goto err_fput; + + fuse_conn_init(fc); + + fc->dev = sb->s_dev; + fc->sb = sb; + err = fuse_bdi_init(fc, sb); + if (err) + goto err_put_conn; + + sb->s_bdi = &fc->bdi; + + /* Handle umasking inside the fuse code */ + if (sb->s_flags & MS_POSIXACL) + fc->dont_mask = 1; + sb->s_flags |= MS_POSIXACL; + + fc->release = fuse_free_conn; + fc->flags = d.flags; + fc->user_id = d.user_id; + fc->group_id = d.group_id; + fc->max_read = max_t(unsigned, 4096, d.max_read); + + /* Used by get_root_inode() */ + sb->s_fs_info = fc; + + err = -ENOMEM; + root = fuse_get_root_inode(sb, d.rootmode); + root_dentry = d_make_root(root); + if (!root_dentry) + goto err_put_conn; + /* only now - we want root dentry with NULL ->d_op */ + sb->s_d_op = &fuse_dentry_operations; + + init_req = fuse_request_alloc(0); + if (!init_req) + goto err_put_root; + init_req->background = 1; + + if (is_bdev) { + fc->destroy_req = fuse_request_alloc(0); + if (!fc->destroy_req) + goto err_free_init_req; + } + + mutex_lock(&fuse_mutex); + err = -EINVAL; + if (file->private_data) + goto err_unlock; + + err = fuse_ctl_add_conn(fc); + if (err) + goto err_unlock; + + list_add_tail(&fc->entry, &fuse_conn_list); + sb->s_root = root_dentry; + fc->connected = 1; + file->private_data = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); + + fuse_send_init(fc, init_req); + + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); + err_free_init_req: + fuse_request_free(init_req); + err_put_root: + dput(root_dentry); + err_put_conn: + fuse_bdi_destroy(fc); + fuse_conn_put(fc); + err_fput: + fput(file); + err: + return err; +} + +static struct dentry *fuse_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); +} + +static void fuse_kill_sb_anon(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_anon_super(sb); +} + +static struct file_system_type fuse_fs_type = { + .owner = THIS_MODULE, + .name = "fuse", + .fs_flags = FS_HAS_SUBTYPE, + .mount = fuse_mount, + .kill_sb = fuse_kill_sb_anon, +}; +MODULE_ALIAS_FS("fuse"); + +#ifdef CONFIG_BLOCK +static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data) +{ + return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); +} + +static void fuse_kill_sb_blk(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_block_super(sb); +} + +static struct file_system_type fuseblk_fs_type = { + .owner = THIS_MODULE, + .name = "fuseblk", + .mount = fuse_mount_blk, + .kill_sb = fuse_kill_sb_blk, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, +}; +MODULE_ALIAS_FS("fuseblk"); + +static inline int register_fuseblk(void) +{ + return register_filesystem(&fuseblk_fs_type); +} + +static inline void unregister_fuseblk(void) +{ + unregister_filesystem(&fuseblk_fs_type); +} +#else +static inline int register_fuseblk(void) +{ + return 0; +} + +static inline void unregister_fuseblk(void) +{ +} +#endif + +static void fuse_inode_init_once(void *foo) +{ + struct inode *inode = foo; + + inode_init_once(inode); +} + +static int __init fuse_fs_init(void) +{ + int err; + + fuse_inode_cachep = kmem_cache_create("fuse_inode", + sizeof(struct fuse_inode), + 0, SLAB_HWCACHE_ALIGN, + fuse_inode_init_once); + err = -ENOMEM; + if (!fuse_inode_cachep) + goto out; + + err = register_fuseblk(); + if (err) + goto out2; + + err = register_filesystem(&fuse_fs_type); + if (err) + goto out3; + + return 0; + + out3: + unregister_fuseblk(); + out2: + kmem_cache_destroy(fuse_inode_cachep); + out: + return err; +} + +static void fuse_fs_cleanup(void) +{ + unregister_filesystem(&fuse_fs_type); + unregister_fuseblk(); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(fuse_inode_cachep); +} + +static struct kobject *fuse_kobj; + +static int fuse_sysfs_init(void) +{ + int err; + + fuse_kobj = kobject_create_and_add("fuse", fs_kobj); + if (!fuse_kobj) { + err = -ENOMEM; + goto out_err; + } + + err = sysfs_create_mount_point(fuse_kobj, "connections"); + if (err) + goto out_fuse_unregister; + + return 0; + + out_fuse_unregister: + kobject_put(fuse_kobj); + out_err: + return err; +} + +static void fuse_sysfs_cleanup(void) +{ + sysfs_remove_mount_point(fuse_kobj, "connections"); + kobject_put(fuse_kobj); +} + +static int __init fuse_init(void) +{ + int res; + + printk(KERN_INFO "fuse init (API version %i.%i)\n", + FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); + + INIT_LIST_HEAD(&fuse_conn_list); + res = fuse_fs_init(); + if (res) + goto err; + + res = fuse_dev_init(); + if (res) + goto err_fs_cleanup; + + res = fuse_sysfs_init(); + if (res) + goto err_dev_cleanup; + + res = fuse_ctl_init(); + if (res) + goto err_sysfs_cleanup; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + return 0; + + err_sysfs_cleanup: + fuse_sysfs_cleanup(); + err_dev_cleanup: + fuse_dev_cleanup(); + err_fs_cleanup: + fuse_fs_cleanup(); + err: + return res; +} + +static void __exit fuse_exit(void) +{ + printk(KERN_DEBUG "fuse exit\n"); + + fuse_ctl_cleanup(); + fuse_sysfs_cleanup(); + fuse_fs_cleanup(); + fuse_dev_cleanup(); +} + +module_init(fuse_init); +module_exit(fuse_exit); diff --git a/kernel/fs/gfs2/Kconfig b/kernel/fs/gfs2/Kconfig new file mode 100644 index 000000000..90c6a8faa --- /dev/null +++ b/kernel/fs/gfs2/Kconfig @@ -0,0 +1,34 @@ +config GFS2_FS + tristate "GFS2 file system support" + depends on (64BIT || LBDAF) + select FS_POSIX_ACL + select CRC32 + select QUOTACTL + help + A cluster filesystem. + + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers coordinate their I/O so + filesystem consistency is maintained. One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. + + To use the GFS2 filesystem in a cluster, you will need to enable + the locking module below. Documentation and utilities for GFS2 can + be found here: http://sources.redhat.com/cluster + + The "nolock" lock module is now built in to GFS2 by default. If + you want to use the DLM, be sure to enable IPv4/6 networking. + +config GFS2_FS_LOCKING_DLM + bool "GFS2 DLM locking" + depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ + CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) + help + Multiple node locking module for GFS2 + + Most users of GFS2 will require this. It provides the locking + interface between GFS2 and the DLM, which is required to use GFS2 + in a cluster environment. diff --git a/kernel/fs/gfs2/Makefile b/kernel/fs/gfs2/Makefile new file mode 100644 index 000000000..861282023 --- /dev/null +++ b/kernel/fs/gfs2/Makefile @@ -0,0 +1,10 @@ +ccflags-y := -I$(src) +obj-$(CONFIG_GFS2_FS) += gfs2.o +gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ + glops.o log.o lops.o main.o meta_io.o \ + aops.o dentry.o export.o file.o \ + ops_fstype.o inode.o quota.o \ + recovery.o rgrp.o super.o sys.o trans.o util.o + +gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o + diff --git a/kernel/fs/gfs2/acl.c b/kernel/fs/gfs2/acl.c new file mode 100644 index 000000000..1be3b061c --- /dev/null +++ b/kernel/fs/gfs2/acl.c @@ -0,0 +1,117 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" + +static const char *gfs2_acl_name(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return GFS2_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return GFS2_POSIX_ACL_DEFAULT; + } + return NULL; +} + +struct posix_acl *gfs2_get_acl(struct inode *inode, int type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct posix_acl *acl; + const char *name; + char *data; + int len; + + if (!ip->i_eattr) + return NULL; + + name = gfs2_acl_name(type); + if (name == NULL) + return ERR_PTR(-EINVAL); + + len = gfs2_xattr_acl_get(ip, name, &data); + if (len < 0) + return ERR_PTR(len); + if (len == 0) + return NULL; + + acl = posix_acl_from_xattr(&init_user_ns, data, len); + kfree(data); + return acl; +} + +int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + int len; + char *data; + const char *name = gfs2_acl_name(type); + + BUG_ON(name == NULL); + + if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; + + if (type == ACL_TYPE_ACCESS) { + umode_t mode = inode->i_mode; + + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + + if (error == 0) + acl = NULL; + + if (mode != inode->i_mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } + } + + if (acl) { + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); + if (error < 0) + goto out; + } else { + data = NULL; + len = 0; + } + + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); + if (error) + goto out; + set_cached_acl(inode, type, acl); +out: + kfree(data); + return error; +} diff --git a/kernel/fs/gfs2/acl.h b/kernel/fs/gfs2/acl.h new file mode 100644 index 000000000..2d65ec4cd --- /dev/null +++ b/kernel/fs/gfs2/acl.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __ACL_DOT_H__ +#define __ACL_DOT_H__ + +#include "incore.h" + +#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" +#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" +#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) + +extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); +extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); + +#endif /* __ACL_DOT_H__ */ diff --git a/kernel/fs/gfs2/aops.c b/kernel/fs/gfs2/aops.c new file mode 100644 index 000000000..5551fea0a --- /dev/null +++ b/kernel/fs/gfs2/aops.c @@ -0,0 +1,1229 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "trans.h" +#include "rgrp.h" +#include "super.h" +#include "util.h" +#include "glops.h" + + +static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) +{ + struct buffer_head *head = page_buffers(page); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + unsigned int start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= from || start >= to) + continue; + if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); + gfs2_trans_add_data(ip->i_gl, bh); + } +} + +/** + * gfs2_get_block_noalloc - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int error; + + error = gfs2_block_map(inode, lblock, bh_result, 0); + if (error) + return error; + if (!buffer_mapped(bh_result)) + return -EIO; + return 0; +} + +static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + return gfs2_block_map(inode, lblock, bh_result, 0); +} + +/** + * gfs2_writepage_common - Common bits of writepage + * @page: The page to be written + * @wbc: The writeback control + * + * Returns: 1 if writepage is ok, otherwise an error code or zero if no error. + */ + +static int gfs2_writepage_common(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + if (current->journal_info) + goto redirty; + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + goto out; + } + return 1; +redirty: + redirty_page_for_writepage(wbc, page); +out: + unlock_page(page); + return 0; +} + +/** + * gfs2_writepage - Write page for writeback mappings + * @page: The page + * @wbc: The writeback control + * + */ + +static int gfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + return nobh_writepage(page, gfs2_get_block_noalloc, wbc); +} + +/** + * __gfs2_jdata_writepage - The core of jdata writepage + * @page: The page to write + * @wbc: The writeback control + * + * This is shared between writepage and writepages and implements the + * core of the writepage operation. If a transaction is required then + * PageChecked will have been set and the transaction will have + * already been started before this is called. + */ + +static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (PageChecked(page)) { + ClearPageChecked(page); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + } + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * gfs2_jdata_writepage - Write complete page + * @page: Page to write + * + * Returns: errno + * + */ + +static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + int ret; + int done_trans = 0; + + if (PageChecked(page)) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out_ignore; + ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (ret) + goto out_ignore; + done_trans = 1; + } + ret = gfs2_writepage_common(page, wbc); + if (ret > 0) + ret = __gfs2_jdata_writepage(page, wbc); + if (done_trans) + gfs2_trans_end(sdp); + return ret; + +out_ignore: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +/** + * gfs2_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: Write-back control + * + * Used for both ordered and writeback modes. + */ +static int gfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); +} + +/** + * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * @mapping: The mapping + * @wbc: The writeback control + * @writepage: The writepage function to call for each page + * @pvec: The vector of pages + * @nr_pages: The number of pages to write + * + * Returns: non-zero if loop should terminate, zero otherwise + */ + +static int gfs2_write_jdata_pagevec(struct address_space *mapping, + struct writeback_control *wbc, + struct pagevec *pvec, + int nr_pages, pgoff_t end, + pgoff_t *done_index) +{ + struct inode *inode = mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); + int i; + int ret; + + ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); + if (ret < 0) + return ret; + + for(i = 0; i < nr_pages; i++) { + struct page *page = pvec->pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + ret = 1; + break; + } + + *done_index = page->index; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + trace_wbc_writepage(wbc, inode_to_bdi(inode)); + + ret = __gfs2_jdata_writepage(page, wbc); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + *done_index = page->index + 1; + ret = 1; + break; + } + } + + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { + ret = 1; + break; + } + + } + gfs2_trans_end(sdp); + return ret; +} + +/** + * gfs2_write_cache_jdata - Like write_cache_pages but different + * @mapping: The mapping to write + * @wbc: The writeback control + * @writepage: The writepage function to call + * @data: The data to pass to writepage + * + * The reason that we use our own function here is that we need to + * start transactions before we grab page locks. This allows us + * to get the ordering right. + */ + +static int gfs2_write_cache_jdata(struct address_space *mapping, + struct writeback_control *wbc) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; + pgoff_t done_index; + int cycled; + int range_whole = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index); + if (ret) + done = 1; + if (ret > 0) + ret = 0; + pagevec_release(&pvec); + cond_resched(); + } + + if (!cycled && !done) { + /* + * range_cyclic: + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + return ret; +} + + +/** + * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: The writeback control + * + */ + +static int gfs2_jdata_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + int ret; + + ret = gfs2_write_cache_jdata(mapping, wbc); + if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + ret = gfs2_write_cache_jdata(mapping, wbc); + } + return ret; +} + +/** + * stuffed_readpage - Fill in a Linux page with stuffed file data + * @ip: the inode + * @page: the page + * + * Returns: errno + */ + +static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *dibh; + u64 dsize = i_size_read(&ip->i_inode); + void *kaddr; + int error; + + /* + * Due to the order of unstuffing files and ->fault(), we can be + * asked for a zero page in the case of a stuffed file being extended, + * so we need to supply one here. It doesn't happen often. + */ + if (unlikely(page->index)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + kaddr = kmap_atomic(page); + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + kunmap_atomic(kaddr); + flush_dcache_page(page); + brelse(dibh); + SetPageUptodate(page); + + return 0; +} + + +/** + * __gfs2_readpage - readpage + * @file: The file to read a page for + * @page: The page to read + * + * This is the core of gfs2's readpage. Its used by the internal file + * reading code as in that case we already hold the glock. Also its + * called by gfs2_readpage() once the required lock has been granted. + * + */ + +static int __gfs2_readpage(void *file, struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + int error; + + if (gfs2_is_stuffed(ip)) { + error = stuffed_readpage(ip, page); + unlock_page(page); + } else { + error = mpage_readpage(page, gfs2_block_map); + } + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return error; +} + +/** + * gfs2_readpage - read a page of a file + * @file: The file to read + * @page: The page of the file + * + * This deals with the locking required. We have to unlock and + * relock the page in order to get the locking in the right + * order. + */ + +static int gfs2_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder gh; + int error; + + unlock_page(page); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (unlikely(error)) + goto out; + error = AOP_TRUNCATED_PAGE; + lock_page(page); + if (page->mapping == mapping && !PageUptodate(page)) + error = __gfs2_readpage(file, page); + else + unlock_page(page); + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (error && error != AOP_TRUNCATED_PAGE) + lock_page(page); + return error; +} + +/** + * gfs2_internal_read - read an internal file + * @ip: The gfs2 inode + * @buf: The buffer to fill + * @pos: The file position + * @size: The amount to read + * + */ + +int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, + unsigned size) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + unsigned long index = *pos / PAGE_CACHE_SIZE; + unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); + unsigned copied = 0; + unsigned amt; + struct page *page; + void *p; + + do { + amt = size - copied; + if (offset + size > PAGE_CACHE_SIZE) + amt = PAGE_CACHE_SIZE - offset; + page = read_cache_page(mapping, index, __gfs2_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + p = kmap_atomic(page); + memcpy(buf + copied, p + offset, amt); + kunmap_atomic(p); + page_cache_release(page); + copied += amt; + index++; + offset = 0; + } while(copied < size); + (*pos) += size; + return size; +} + +/** + * gfs2_readpages - Read a bunch of pages at once + * + * Some notes: + * 1. This is only for readahead, so we can simply ignore any things + * which are slightly inconvenient (such as locking conflicts between + * the page lock and the glock) and return having done no I/O. Its + * obviously not something we'd want to do on too regular a basis. + * Any I/O we ignore at this time will be done via readpage later. + * 2. We don't handle stuffed files here we let readpage do the honours. + * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. + */ + +static int gfs2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (unlikely(ret)) + goto out_uninit; + if (!gfs2_is_stuffed(ip)) + ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = -EIO; + return ret; +} + +/** + * gfs2_write_begin - Begin to write to a file + * @file: The file to write to + * @mapping: The mapping in which to write + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused by GFS2) + * + * Returns: errno + */ + +static int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + unsigned requested = 0; + int alloc_required; + int error = 0; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + struct page *page; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + if (&ip->i_inode == sdp->sd_rindex) { + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (unlikely(error)) { + gfs2_glock_dq(&ip->i_gh); + goto out_uninit; + } + } + + alloc_required = gfs2_write_alloc_required(ip, pos, len); + + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); + + if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; + requested = data_blocks + ind_blocks; + ap.target = requested; + error = gfs2_quota_lock_check(ip, &ap); + if (error) + goto out_unlock; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + if (&ip->i_inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; + if (alloc_required) + rblocks += gfs2_rg_blocks(ip, requested); + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = -ENOMEM; + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); + *pagep = page; + if (unlikely(!page)) + goto out_endtrans; + + if (gfs2_is_stuffed(ip)) { + error = 0; + if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, page); + if (error == 0) + goto prepare_write; + } else if (!PageUptodate(page)) { + error = stuffed_readpage(ip, page); + } + goto out; + } + +prepare_write: + error = __block_write_begin(page, from, len, gfs2_block_map); +out: + if (error == 0) + return 0; + + unlock_page(page); + page_cache_release(page); + + gfs2_trans_end(sdp); + if (pos + len > ip->i_inode.i_size) + gfs2_trim_blocks(&ip->i_inode); + goto out_trans_fail; + +out_endtrans: + gfs2_trans_end(sdp); +out_trans_fail: + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); + } +out_unlock: + if (&ip->i_inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +/** + * adjust_fs_space - Adjusts the free space available due to gfs2_grow + * @inode: the rindex inode + */ +static void adjust_fs_space(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; + u64 fs_total, new_free; + + /* Total up the file system space, according to the latest rindex. */ + fs_total = gfs2_ri_total(sdp); + if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + return; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (fs_total > (m_sc->sc_total + l_sc->sc_total)) + new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); + else + new_free = 0; + spin_unlock(&sdp->sd_statfs_spin); + fs_warn(sdp, "File system extended by %llu blocks.\n", + (unsigned long long)new_free); + gfs2_statfs_change(sdp, new_free, new_free, 0); + + if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) + goto out; + update_statfs(sdp, m_bh, l_bh); + brelse(l_bh); +out: + brelse(m_bh); +} + +/** + * gfs2_stuffed_write_end - Write end for stuffed files + * @inode: The inode + * @dibh: The buffer_head containing the on-disk inode + * @pos: The file position + * @len: The length of the write + * @copied: How much was actually copied by the VFS + * @page: The page + * + * This copies the data from the page into the inode block after + * the inode data structure itself. + * + * Returns: errno + */ +static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + u64 to = pos + copied; + void *kaddr; + unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); + + BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + kaddr = kmap_atomic(page); + memcpy(buf + pos, kaddr + pos, copied); + memset(kaddr + pos + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr); + + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (copied) { + if (inode->i_size < to) + i_size_write(inode, to); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) { + adjust_fs_space(inode); + sdp->sd_rindex_uptodate = 0; + } + + brelse(dibh); + gfs2_trans_end(sdp); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return copied; +} + +/** + * gfs2_write_end + * @file: The file to write to + * @mapping: The address space to write to + * @pos: The file position + * @len: The length of the data + * @copied: + * @page: The page that has been written + * @fsdata: The fsdata (unused in GFS2) + * + * The main write_end function for GFS2. We have a separate one for + * stuffed files as they are slightly different, otherwise we just + * put our locking around the VFS provided functions. + * + * Returns: errno + */ + +static int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct buffer_head *dibh; + unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int to = from + len; + int ret; + struct gfs2_trans *tr = current->journal_info; + BUG_ON(!tr); + + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(ret)) { + unlock_page(page); + page_cache_release(page); + goto failed; + } + + if (gfs2_is_stuffed(ip)) + return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + else + gfs2_trans_add_meta(ip->i_gl, dibh); + + + if (inode == sdp->sd_rindex) { + adjust_fs_space(inode); + sdp->sd_rindex_uptodate = 0; + } + + brelse(dibh); +failed: + gfs2_trans_end(sdp); + gfs2_inplace_release(ip); + if (ip->i_res->rs_qa_qd_num) + gfs2_quota_unlock(ip); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return ret; +} + +/** + * gfs2_set_page_dirty - Page dirtying function + * @page: The page to dirty + * + * Returns: 1 if it dirtyed the page, or 0 otherwise + */ + +static int gfs2_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_buffers(page); +} + +/** + * gfs2_bmap - Block map function + * @mapping: Address space info + * @lblock: The block to map + * + * Returns: The disk address for the block or 0 on hole or error + */ + +static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder i_gh; + sector_t dblock = 0; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return 0; + + if (!gfs2_is_stuffed(ip)) + dblock = generic_block_bmap(mapping, lblock, gfs2_block_map); + + gfs2_glock_dq_uninit(&i_gh); + + return dblock; +} + +static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + lock_buffer(bh); + gfs2_log_lock(sdp); + clear_buffer_dirty(bh); + bd = bh->b_private; + if (bd) { + if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) + list_del_init(&bd->bd_list); + else + gfs2_remove_from_journal(bh, current->journal_info, 0); + } + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void gfs2_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + unsigned int stop = offset + length; + int partial_page = (offset || length < PAGE_CACHE_SIZE); + struct buffer_head *bh, *head; + unsigned long pos = 0; + + BUG_ON(!PageLocked(page)); + if (!partial_page) + ClearPageChecked(page); + if (!page_has_buffers(page)) + goto out; + + bh = head = page_buffers(page); + do { + if (pos + bh->b_size > stop) + return; + + if (offset <= pos) + gfs2_discard(sdp, bh); + pos += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +out: + if (!partial_page) + try_to_release_page(page, 0); +} + +/** + * gfs2_ok_for_dio - check that dio is valid on this file + * @ip: The inode + * @offset: The offset at which we are reading or writing + * + * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) + * 1 (to accept the i/o request) + */ +static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset) +{ + /* + * Should we return an error here? I can't see that O_DIRECT for + * a stuffed file makes any sense. For now we'll silently fall + * back to buffered I/O + */ + if (gfs2_is_stuffed(ip)) + return 0; + + if (offset >= i_size_read(&ip->i_inode)) + return 0; + return 1; +} + + + +static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int rv; + + /* + * Deferred lock, even if its a write, since we do no allocation + * on this path. All we need change is atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately have the option of only flushing a range like + * the VFS does. + */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + rv = gfs2_glock_nq(&gh); + if (rv) + return rv; + rv = gfs2_ok_for_dio(ip, offset); + if (rv != 1) + goto out; /* dio not valid, fall back to buffered i/o */ + + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t len = iov_iter_count(iter); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + rv = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + rv = filemap_write_and_wait_range(mapping, lstart, end); + if (rv) + goto out; + if (iov_iter_rw(iter) == WRITE) + truncate_inode_pages_range(mapping, lstart, end); + } + + rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + offset, gfs2_get_block_direct, NULL, NULL, 0); +out: + gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); + return rv; +} + +/** + * gfs2_releasepage - free the metadata associated with a page + * @page: the page that's being released + * @gfp_mask: passed from Linux VFS, ignored by us + * + * Call try_to_free_buffers() if the buffers in this page can be + * released. + * + * Returns: 0 + */ + +int gfs2_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct address_space *mapping = page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct buffer_head *bh, *head; + struct gfs2_bufdata *bd; + + if (!page_has_buffers(page)) + return 0; + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + head = bh = page_buffers(page); + do { + if (atomic_read(&bh->b_count)) + goto cannot_release; + bd = bh->b_private; + if (bd && bd->bd_tr) + goto cannot_release; + if (buffer_pinned(bh) || buffer_dirty(bh)) + goto not_possible; + bh = bh->b_this_page; + } while(bh != head); + spin_unlock(&sdp->sd_ail_lock); + + head = bh = page_buffers(page); + do { + bd = bh->b_private; + if (bd) { + gfs2_assert_warn(sdp, bd->bd_bh == bh); + if (!list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + bd->bd_bh = NULL; + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); + } + + bh = bh->b_this_page; + } while (bh != head); + gfs2_log_unlock(sdp); + + return try_to_free_buffers(page); + +not_possible: /* Should never happen */ + WARN_ON(buffer_dirty(bh)); + WARN_ON(buffer_pinned(bh)); +cannot_release: + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + return 0; +} + +static const struct address_space_operations gfs2_writeback_aops = { + .writepage = gfs2_writepage, + .writepages = gfs2_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations gfs2_ordered_aops = { + .writepage = gfs2_writepage, + .writepages = gfs2_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations gfs2_jdata_aops = { + .writepage = gfs2_jdata_writepage, + .writepages = gfs2_jdata_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void gfs2_set_aops(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (gfs2_is_writeback(ip)) + inode->i_mapping->a_ops = &gfs2_writeback_aops; + else if (gfs2_is_ordered(ip)) + inode->i_mapping->a_ops = &gfs2_ordered_aops; + else if (gfs2_is_jdata(ip)) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else + BUG(); +} + diff --git a/kernel/fs/gfs2/bmap.c b/kernel/fs/gfs2/bmap.c new file mode 100644 index 000000000..61296ecbd --- /dev/null +++ b/kernel/fs/gfs2/bmap.c @@ -0,0 +1,1495 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "log.h" +#include "super.h" +#include "trans.h" +#include "dir.h" +#include "util.h" +#include "trace_gfs2.h" + +/* This doesn't need to be that large as max 64 bit pointers in a 4k + * block is 512, so __u16 is fine for that. It saves stack space to + * keep it small. + */ +struct metapath { + struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; + __u16 mp_list[GFS2_MAX_META_HEIGHT]; +}; + +struct strip_mine { + int sm_first; + unsigned int sm_height; +}; + +/** + * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * @ip: the inode + * @dibh: the dinode buffer + * @block: the block number that was allocated + * @page: The (optional) page. This is looked up if @page is NULL + * + * Returns: errno + */ + +static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, + u64 block, struct page *page) +{ + struct inode *inode = &ip->i_inode; + struct buffer_head *bh; + int release = 0; + + if (!page || page->index) { + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + if (!page) + return -ENOMEM; + release = 1; + } + + if (!PageUptodate(page)) { + void *kaddr = kmap(page); + u64 dsize = i_size_read(inode); + + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = dibh->b_size - sizeof(struct gfs2_dinode); + + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + kunmap(page); + + SetPageUptodate(page); + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, + (1 << BH_Uptodate)); + + bh = page_buffers(page); + + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); + + set_buffer_uptodate(bh); + if (!gfs2_is_jdata(ip)) + mark_buffer_dirty(bh); + if (!gfs2_is_writeback(ip)) + gfs2_trans_add_data(ip->i_gl, bh); + + if (release) { + unlock_page(page); + page_cache_release(page); + } + + return 0; +} + +/** + * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big + * @ip: The GFS2 inode to unstuff + * @page: The (optional) page. This is looked up if the @page is NULL + * + * This routine unstuffs a dinode and returns it to a "normal" state such + * that the height can be grown in the traditional way. + * + * Returns: errno + */ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *bh, *dibh; + struct gfs2_dinode *di; + u64 block = 0; + int isdir = gfs2_is_dir(ip); + int error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (i_size_read(&ip->i_inode)) { + /* Get a free block, fill it with the stuffed data, + and write it out to disk */ + + unsigned int n = 1; + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + if (error) + goto out_brelse; + if (isdir) { + gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); + error = gfs2_dir_get_new_buffer(ip, block, &bh); + if (error) + goto out_brelse; + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + brelse(bh); + } else { + error = gfs2_unstuffer_page(ip, dibh, block, page); + if (error) + goto out_brelse; + } + } + + /* Set up the pointer to the new block */ + + gfs2_trans_add_meta(ip->i_gl, dibh); + di = (struct gfs2_dinode *)dibh->b_data; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + if (i_size_read(&ip->i_inode)) { + *(__be64 *)(di + 1) = cpu_to_be64(block); + gfs2_add_inode_blocks(&ip->i_inode, 1); + di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + } + + ip->i_height = 1; + di->di_height = cpu_to_be16(1); + +out_brelse: + brelse(dibh); +out: + up_write(&ip->i_rw_mutex); + return error; +} + + +/** + * find_metapath - Find path through the metadata tree + * @sdp: The superblock + * @mp: The metapath to return the result in + * @block: The disk block to look up + * @height: The pre-calculated height of the metadata tree + * + * This routine returns a struct metapath structure that defines a path + * through the metadata of inode "ip" to get to block "block". + * + * Example: + * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a + * filesystem with a blocksize of 4096. + * + * find_metapath() would return a struct metapath structure set to: + * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, + * and mp_list[2] = 165. + * + * That means that in order to get to the block containing the byte at + * offset 101342453, we would load the indirect block pointed to by pointer + * 0 in the dinode. We would then load the indirect block pointed to by + * pointer 48 in that indirect block. We would then load the data block + * pointed to by pointer 165 in that indirect block. + * + * ---------------------------------------- + * | Dinode | | + * | | 4| + * | |0 1 2 3 4 5 9| + * | | 6| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 5| + * | 4 4 4 4 4 5 5 1| + * |0 5 6 7 8 9 0 1 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 1 1 1 1 1 5| + * | 6 6 6 6 6 1| + * |0 3 4 5 6 7 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Data block containing offset | + * | 101342453 | + * | | + * | | + * ---------------------------------------- + * + */ + +static void find_metapath(const struct gfs2_sbd *sdp, u64 block, + struct metapath *mp, unsigned int height) +{ + unsigned int i; + + for (i = height; i--;) + mp->mp_list[i] = do_div(block, sdp->sd_inptrs); + +} + +static inline unsigned int metapath_branch_start(const struct metapath *mp) +{ + if (mp->mp_list[0] == 0) + return 2; + return 1; +} + +/** + * metapointer - Return pointer to start of metadata in a buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + * + * Return a pointer to the block number of the next height of the metadata + * tree given a buffer containing the pointer to the current height of the + * metadata tree. + */ + +static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) +{ + struct buffer_head *bh = mp->mp_bh[height]; + unsigned int head_size = (height > 0) ? + sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); + return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; +} + +static void gfs2_metapath_ra(struct gfs2_glock *gl, + const struct buffer_head *bh, const __be64 *pos) +{ + struct buffer_head *rabh; + const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size); + const __be64 *t; + + for (t = pos; t < endp; t++) { + if (!*t) + continue; + + rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); + if (trylock_buffer(rabh)) { + if (!buffer_uptodate(rabh)) { + rabh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, rabh); + continue; + } + unlock_buffer(rabh); + } + brelse(rabh); + } +} + +/** + * lookup_metapath - Walk the metadata tree to a specific point + * @ip: The inode + * @mp: The metapath + * + * Assumes that the inode's buffer has already been looked up and + * hooked onto mp->mp_bh[0] and that the metapath has been initialised + * by find_metapath(). + * + * If this function encounters part of the tree which has not been + * allocated, it returns the current height of the tree at the point + * at which it found the unallocated block. Blocks which are found are + * added to the mp->mp_bh[] list. + * + * Returns: error or height of metadata tree + */ + +static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) +{ + unsigned int end_of_metadata = ip->i_height - 1; + unsigned int x; + __be64 *ptr; + u64 dblock; + int ret; + + for (x = 0; x < end_of_metadata; x++) { + ptr = metapointer(x, mp); + dblock = be64_to_cpu(*ptr); + if (!dblock) + return x + 1; + + ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]); + if (ret) + return ret; + } + + return ip->i_height; +} + +static inline void release_metapath(struct metapath *mp) +{ + int i; + + for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + } +} + +/** + * gfs2_extent_length - Returns length of an extent of blocks + * @start: Start of the buffer + * @len: Length of the buffer in bytes + * @ptr: Current position in the buffer + * @limit: Max extent length to return (0 = unlimited) + * @eob: Set to 1 if we hit "end of block" + * + * If the first block is zero (unallocated) it will return the number of + * unallocated blocks in the extent, otherwise it will return the number + * of contiguous blocks in the extent. + * + * Returns: The length of the extent (minimum of one block) + */ + +static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob) +{ + const __be64 *end = (start + len); + const __be64 *first = ptr; + u64 d = be64_to_cpu(*ptr); + + *eob = 0; + do { + ptr++; + if (ptr >= end) + break; + if (limit && --limit == 0) + break; + if (d) + d++; + } while(be64_to_cpu(*ptr) == d); + if (ptr >= end) + *eob = 1; + return (ptr - first); +} + +static inline void bmap_lock(struct gfs2_inode *ip, int create) +{ + if (create) + down_write(&ip->i_rw_mutex); + else + down_read(&ip->i_rw_mutex); +} + +static inline void bmap_unlock(struct gfs2_inode *ip, int create) +{ + if (create) + up_write(&ip->i_rw_mutex); + else + up_read(&ip->i_rw_mutex); +} + +static inline __be64 *gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) +{ + __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + + ((i > 1) ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_dinode))); + BUG_ON(i < 1); + BUG_ON(mp->mp_bh[i] != NULL); + mp->mp_bh[i] = gfs2_meta_new(gl, bn); + gfs2_trans_add_meta(gl, mp->mp_bh[i]); + gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); + ptr += offset; + *ptr = cpu_to_be64(bn); + return ptr; +} + +enum alloc_state { + ALLOC_DATA = 0, + ALLOC_GROW_DEPTH = 1, + ALLOC_GROW_HEIGHT = 2, + /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ +}; + +/** + * gfs2_bmap_alloc - Build a metadata tree of the requested height + * @inode: The GFS2 inode + * @lblock: The logical starting block of the extent + * @bh_map: This is used to return the mapping details + * @mp: The metapath + * @sheight: The starting height (i.e. whats already mapped) + * @height: The height to build to + * @maxlen: The max number of data blocks to alloc + * + * In this routine we may have to alloc: + * i) Indirect blocks to grow the metadata tree height + * ii) Indirect blocks to fill in lower part of the metadata tree + * iii) Data blocks + * + * The function is in two parts. The first part works out the total + * number of blocks which we need. The second part does the actual + * allocation asking for an extent at a time (if enough contiguous free + * blocks are available, there will only be one request per bmap call) + * and uses the state machine to initialise the blocks in order. + * + * Returns: errno on error + */ + +static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, + struct buffer_head *bh_map, struct metapath *mp, + const unsigned int sheight, + const unsigned int height, + const size_t maxlen) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct super_block *sb = sdp->sd_vfs; + struct buffer_head *dibh = mp->mp_bh[0]; + u64 bn, dblock = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; + unsigned dblks = 0; + unsigned ptrs_per_blk; + const unsigned end_of_metadata = height - 1; + int ret; + int eob = 0; + enum alloc_state state; + __be64 *ptr; + __be64 zero_bn = 0; + + BUG_ON(sheight < 1); + BUG_ON(dibh == NULL); + + gfs2_trans_add_meta(ip->i_gl, dibh); + + if (height == sheight) { + struct buffer_head *bh; + /* Bottom indirect block exists, find unalloced extent size */ + ptr = metapointer(end_of_metadata, mp); + bh = mp->mp_bh[end_of_metadata]; + dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, + &eob); + BUG_ON(dblks < 1); + state = ALLOC_DATA; + } else { + /* Need to allocate indirect blocks */ + ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; + dblks = min(maxlen, (size_t)(ptrs_per_blk - + mp->mp_list[end_of_metadata])); + if (height == ip->i_height) { + /* Writing into existing tree, extend tree down */ + iblks = height - sheight; + state = ALLOC_GROW_DEPTH; + } else { + /* Building up tree height */ + state = ALLOC_GROW_HEIGHT; + iblks = height - ip->i_height; + branch_start = metapath_branch_start(mp); + iblks += (height - branch_start); + } + } + + /* start of the second part of the function (state machine) */ + + blks = dblks + iblks; + i = sheight; + do { + int error; + n = blks - alloced; + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + if (error) + return error; + alloced += n; + if (state != ALLOC_DATA || gfs2_is_jdata(ip)) + gfs2_trans_add_unrevoke(sdp, bn, n); + switch (state) { + /* Growing height of tree */ + case ALLOC_GROW_HEIGHT: + if (i == 1) { + ptr = (__be64 *)(dibh->b_data + + sizeof(struct gfs2_dinode)); + zero_bn = *ptr; + } + for (; i - 1 < height - ip->i_height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); + if (i - 1 == height - ip->i_height) { + i--; + gfs2_buffer_copy_tail(mp->mp_bh[i], + sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + gfs2_buffer_clear_tail(dibh, + sizeof(struct gfs2_dinode) + + sizeof(__be64)); + ptr = (__be64 *)(mp->mp_bh[i]->b_data + + sizeof(struct gfs2_meta_header)); + *ptr = zero_bn; + state = ALLOC_GROW_DEPTH; + for(i = branch_start; i < height; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + mp->mp_bh[i] = NULL; + } + i = branch_start; + } + if (n == 0) + break; + /* Branching from existing tree */ + case ALLOC_GROW_DEPTH: + if (i > 1 && i < height) + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); + for (; i < height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, + mp->mp_list[i-1], bn++); + if (i == height) + state = ALLOC_DATA; + if (n == 0) + break; + /* Tree complete, adding data blocks */ + case ALLOC_DATA: + BUG_ON(n > dblks); + BUG_ON(mp->mp_bh[end_of_metadata] == NULL); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); + dblks = n; + ptr = metapointer(end_of_metadata, mp); + dblock = bn; + while (n-- > 0) + *ptr++ = cpu_to_be64(bn++); + if (buffer_zeronew(bh_map)) { + ret = sb_issue_zeroout(sb, dblock, dblks, + GFP_NOFS); + if (ret) { + fs_err(sdp, + "Failed to zero data buffers\n"); + clear_buffer_zeronew(bh_map); + } + } + break; + } + } while ((state != ALLOC_DATA) || !dblock); + + ip->i_height = height; + gfs2_add_inode_blocks(&ip->i_inode, alloced); + gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); + map_bh(bh_map, inode->i_sb, dblock); + bh_map->b_size = dblks << inode->i_blkbits; + set_buffer_new(bh_map); + return 0; +} + +/** + * gfs2_block_map - Map a block from an inode to a disk block + * @inode: The inode + * @lblock: The logical block number + * @bh_map: The bh to be mapped + * @create: True if its ok to alloc blocks to satify the request + * + * Sets buffer_mapped() if successful, sets buffer_boundary() if a + * read of metadata will be required before the next block can be + * mapped. Sets buffer_new() if new blocks were allocated. + * + * Returns: errno + */ + +int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh_map, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int bsize = sdp->sd_sb.sb_bsize; + const size_t maxlen = bh_map->b_size >> inode->i_blkbits; + const u64 *arr = sdp->sd_heightsize; + __be64 *ptr; + u64 size; + struct metapath mp; + int ret; + int eob; + unsigned int len; + struct buffer_head *bh; + u8 height; + + BUG_ON(maxlen == 0); + + memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); + bmap_lock(ip, create); + clear_buffer_mapped(bh_map); + clear_buffer_new(bh_map); + clear_buffer_boundary(bh_map); + trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + if (gfs2_is_dir(ip)) { + bsize = sdp->sd_jbsize; + arr = sdp->sd_jheightsize; + } + + ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); + if (ret) + goto out; + + height = ip->i_height; + size = (lblock + 1) * bsize; + while (size > arr[height]) + height++; + find_metapath(sdp, lblock, &mp, height); + ret = 1; + if (height > ip->i_height || gfs2_is_stuffed(ip)) + goto do_alloc; + ret = lookup_metapath(ip, &mp); + if (ret < 0) + goto out; + if (ret != ip->i_height) + goto do_alloc; + ptr = metapointer(ip->i_height - 1, &mp); + if (*ptr == 0) + goto do_alloc; + map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); + bh = mp.mp_bh[ip->i_height - 1]; + len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); + bh_map->b_size = (len << inode->i_blkbits); + if (eob) + set_buffer_boundary(bh_map); + ret = 0; +out: + release_metapath(&mp); + trace_gfs2_bmap(ip, bh_map, lblock, create, ret); + bmap_unlock(ip, create); + return ret; + +do_alloc: + /* All allocations are done here, firstly check create flag */ + if (!create) { + BUG_ON(gfs2_is_stuffed(ip)); + ret = 0; + goto out; + } + + /* At this point ret is the tree depth of already allocated blocks */ + ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen); + goto out; +} + +/* + * Deprecated: do not use in new code + */ +int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) +{ + struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; + int ret; + int create = *new; + + BUG_ON(!extlen); + BUG_ON(!dblock); + BUG_ON(!new); + + bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); + ret = gfs2_block_map(inode, lblock, &bh, create); + *extlen = bh.b_size >> inode->i_blkbits; + *dblock = bh.b_blocknr; + if (buffer_new(&bh)) + *new = 1; + else + *new = 0; + return ret; +} + +/** + * do_strip - Look for a layer a particular layer of the file and strip it off + * @ip: the inode + * @dibh: the dinode buffer + * @bh: A buffer of pointers + * @top: The first pointer in the buffer + * @bottom: One more than the last pointer + * @height: the height this buffer is at + * @sm: a pointer to a struct strip_mine + * + * Returns: errno + */ + +static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, __be64 *top, __be64 *bottom, + unsigned int height, struct strip_mine *sm) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + u64 bn, bstart; + u32 blen, btotal; + __be64 *p; + unsigned int rg_blocks = 0; + int metadata; + unsigned int revokes = 0; + int x; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + if (!*top) + sm->sm_first = 0; + + if (height != sm->sm_height) + return 0; + + if (sm->sm_first) { + top++; + sm->sm_first = 0; + } + + metadata = (height != ip->i_height - 1); + if (metadata) + revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; + else if (ip->i_depth) + revokes = sdp->sd_inptrs; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + bstart = 0; + blen = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(ip, &rlist, bstart); + + bstart = bn; + blen = 1; + } + } + + if (bstart) + gfs2_rlist_add(ip, &rlist, bstart); + else + goto out; /* Nothing to do */ + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ + gfs2_rs_deltree(ip->i_res); + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + + RES_INDIRECT + RES_STATFS + RES_QUOTA, + revokes); + if (error) + goto out_rg_gunlock; + + down_write(&ip->i_rw_mutex); + + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_trans_add_meta(ip->i_gl, bh); + + bstart = 0; + blen = 0; + btotal = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) { + __gfs2_free_blocks(ip, bstart, blen, metadata); + btotal += blen; + } + + bstart = bn; + blen = 1; + } + + *p = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) { + __gfs2_free_blocks(ip, bstart, blen, metadata); + btotal += blen; + } + + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + + gfs2_dinode_out(ip, dibh->b_data); + + up_write(&ip->i_rw_mutex); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); +out: + return error; +} + +/** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @sm: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. + * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, struct strip_mine *sm) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + __be64 *top, *bottom; + u64 bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, &bh); + if (error) + return error; + + top = (__be64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); + + bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = do_strip(ip, dibh, bh, top, bottom, height, sm); + if (error) + goto out; + + if (height < ip->i_height - 1) { + + gfs2_metapath_ra(ip->i_gl, bh, top); + + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, sm); + if (error) + break; + } + } +out: + brelse(bh); + return error; +} + + +/** + * gfs2_block_truncate_page - Deal with zeroing out data for truncate + * + * This is partly borrowed from ext3. + */ +static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct buffer_head *bh; + struct page *page; + int err; + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) + return 0; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + err = 0; + } + + if (!gfs2_is_writeback(ip)) + gfs2_trans_add_data(ip->i_gl, bh); + + zero_user(page, offset, length); + mark_buffer_dirty(bh); +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +#define GFS2_JTRUNC_REVOKES 8192 + +/** + * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files + * @inode: The inode being truncated + * @oldsize: The original (larger) size + * @newsize: The new smaller size + * + * With jdata files, we have to journal a revoke for each block which is + * truncated. As a result, we need to split this into separate transactions + * if the number of pages being truncated gets too large. + */ + +static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; + u64 chunk; + int error; + + while (oldsize != newsize) { + chunk = oldsize - newsize; + if (chunk > max_chunk) + chunk = max_chunk; + truncate_pagecache(inode, oldsize - chunk); + oldsize -= chunk; + gfs2_trans_end(sdp); + error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); + if (error) + return error; + } + + return 0; +} + +static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct buffer_head *dibh; + int journaled = gfs2_is_jdata(ip); + int error; + + if (journaled) + error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); + else + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + gfs2_trans_add_meta(ip->i_gl, dibh); + + if (gfs2_is_stuffed(ip)) { + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); + } else { + if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { + error = gfs2_block_truncate_page(mapping, newsize); + if (error) + goto out_brelse; + } + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; + } + + i_size_write(inode, newsize); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + + if (journaled) + error = gfs2_journaled_truncate(inode, oldsize, newsize); + else + truncate_pagecache(inode, newsize); + + if (error) { + brelse(dibh); + return error; + } + +out_brelse: + brelse(dibh); +out: + gfs2_trans_end(sdp); + return error; +} + +static int trunc_dealloc(struct gfs2_inode *ip, u64 size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int height = ip->i_height; + u64 lblock; + struct metapath mp; + int error; + + if (!size) + lblock = 0; + else + lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; + + find_metapath(sdp, lblock, &mp, ip->i_height); + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + return error; + + while (height--) { + struct strip_mine sm; + sm.sm_first = !!size; + sm.sm_height = height; + + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); + if (error) + break; + } + + gfs2_quota_unhold(ip); + + return error; +} + +static int trunc_end(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (!i_size_read(&ip->i_inode)) { + ip->i_height = 0; + ip->i_goal = ip->i_no_addr; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + gfs2_ordered_del_inode(ip); + } + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; + + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + +out: + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + return error; +} + +/** + * do_shrink - make a file smaller + * @inode: the inode + * @oldsize: the current inode size + * @newsize: the size to make the file + * + * Called with an exclusive lock on @inode. The @size must + * be equal to or smaller than the current inode size. + * + * Returns: errno + */ + +static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = trunc_start(inode, oldsize, newsize); + if (error < 0) + return error; + if (gfs2_is_stuffed(ip)) + return 0; + + error = trunc_dealloc(ip, newsize); + if (error == 0) + error = trunc_end(ip); + + return error; +} + +void gfs2_trim_blocks(struct inode *inode) +{ + u64 size = inode->i_size; + int ret; + + ret = do_shrink(inode, size, size); + WARN_ON(ret != 0); +} + +/** + * do_grow - Touch and update inode size + * @inode: The inode + * @size: The new size + * + * This function updates the timestamps on the inode and + * may also increase the size of the inode. This function + * must not be called with @size any smaller than the current + * inode size. + * + * Although it is not strictly required to unstuff files here, + * earlier versions of GFS2 have a bug in the stuffed file reading + * code which will result in a buffer overrun if the size is larger + * than the max stuffed file size. In order to prevent this from + * occurring, such files are unstuffed, but in other cases we can + * just update the inode size directly. + * + * Returns: 0 on success, or -ve on error + */ + +static int do_grow(struct inode *inode, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .target = 1, }; + struct buffer_head *dibh; + int error; + int unstuff = 0; + + if (gfs2_is_stuffed(ip) && + (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { + error = gfs2_quota_lock_check(ip, &ap); + if (error) + return error; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto do_grow_qunlock; + unstuff = 1; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? + 0 : RES_QUOTA), 0); + if (error) + goto do_grow_release; + + if (unstuff) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto do_end_trans; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto do_end_trans; + + i_size_write(inode, size); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + +do_end_trans: + gfs2_trans_end(sdp); +do_grow_release: + if (unstuff) { + gfs2_inplace_release(ip); +do_grow_qunlock: + gfs2_quota_unlock(ip); + } + return error; +} + +/** + * gfs2_setattr_size - make a file a given size + * @inode: the inode + * @newsize: the size to make the file + * + * The file size can grow, shrink, or stay the same size. This + * is called holding i_mutex and an exclusive glock on the inode + * in question. + * + * Returns: errno + */ + +int gfs2_setattr_size(struct inode *inode, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + u64 oldsize; + + BUG_ON(!S_ISREG(inode->i_mode)); + + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; + + ret = get_write_access(inode); + if (ret) + return ret; + + inode_dio_wait(inode); + + ret = gfs2_rs_alloc(ip); + if (ret) + goto out; + + oldsize = inode->i_size; + if (newsize >= oldsize) { + ret = do_grow(inode, newsize); + goto out; + } + + gfs2_rs_deltree(ip->i_res); + ret = do_shrink(inode, oldsize, newsize); +out: + put_write_access(inode); + return ret; +} + +int gfs2_truncatei_resume(struct gfs2_inode *ip) +{ + int error; + error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); + if (!error) + error = trunc_end(ip); + return error; +} + +int gfs2_file_dealloc(struct gfs2_inode *ip) +{ + return trunc_dealloc(ip, 0); +} + +/** + * gfs2_free_journal_extents - Free cached journal bmap info + * @jd: The journal + * + */ + +void gfs2_free_journal_extents(struct gfs2_jdesc *jd) +{ + struct gfs2_journal_extent *jext; + + while(!list_empty(&jd->extent_list)) { + jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list); + list_del(&jext->list); + kfree(jext); + } +} + +/** + * gfs2_add_jextent - Add or merge a new extent to extent cache + * @jd: The journal descriptor + * @lblock: The logical block at start of new extent + * @dblock: The physical block at start of new extent + * @blocks: Size of extent in fs blocks + * + * Returns: 0 on success or -ENOMEM + */ + +static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks) +{ + struct gfs2_journal_extent *jext; + + if (!list_empty(&jd->extent_list)) { + jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list); + if ((jext->dblock + jext->blocks) == dblock) { + jext->blocks += blocks; + return 0; + } + } + + jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS); + if (jext == NULL) + return -ENOMEM; + jext->dblock = dblock; + jext->lblock = lblock; + jext->blocks = blocks; + list_add_tail(&jext->list, &jd->extent_list); + jd->nr_extents++; + return 0; +} + +/** + * gfs2_map_journal_extents - Cache journal bmap info + * @sdp: The super block + * @jd: The journal to map + * + * Create a reusable "extent" mapping from all logical + * blocks to all physical blocks for the given journal. This will save + * us time when writing journal blocks. Most journals will have only one + * extent that maps all their logical blocks. That's because gfs2.mkfs + * arranges the journal blocks sequentially to maximize performance. + * So the extent would map the first block for the entire file length. + * However, gfs2_jadd can happen while file activity is happening, so + * those journals may not be sequential. Less likely is the case where + * the users created their own journals by mounting the metafs and + * laying it out. But it's still possible. These journals might have + * several extents. + * + * Returns: 0 on success, or error on failure + */ + +int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) +{ + u64 lblock = 0; + u64 lblock_stop; + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct buffer_head bh; + unsigned int shift = sdp->sd_sb.sb_bsize_shift; + u64 size; + int rc; + + lblock_stop = i_size_read(jd->jd_inode) >> shift; + size = (lblock_stop - lblock) << shift; + jd->nr_extents = 0; + WARN_ON(!list_empty(&jd->extent_list)); + + do { + bh.b_state = 0; + bh.b_blocknr = 0; + bh.b_size = size; + rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0); + if (rc || !buffer_mapped(&bh)) + goto fail; + rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift); + if (rc) + goto fail; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, + jd->nr_extents); + return 0; + +fail: + fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n", + rc, jd->jd_jid, + (unsigned long long)(i_size_read(jd->jd_inode) - size), + jd->nr_extents); + fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n", + rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr, + bh.b_state, (unsigned long long)bh.b_size); + gfs2_free_journal_extents(jd); + return rc; +} + +/** + * gfs2_write_alloc_required - figure out if a write will require an allocation + * @ip: the file being written to + * @offset: the offset to write to + * @len: the number of bytes being written + * + * Returns: 1 if an alloc is required, 0 otherwise + */ + +int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head bh; + unsigned int shift; + u64 lblock, lblock_stop, size; + u64 end_of_file; + + if (!len) + return 0; + + if (gfs2_is_stuffed(ip)) { + if (offset + len > + sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + return 1; + return 0; + } + + shift = sdp->sd_sb.sb_bsize_shift; + BUG_ON(gfs2_is_dir(ip)); + end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + if (lblock_stop > end_of_file) + return 1; + + size = (lblock_stop - lblock) << shift; + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(&ip->i_inode, lblock, &bh, 0); + if (!buffer_mapped(&bh)) + return 1; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + return 0; +} + diff --git a/kernel/fs/gfs2/bmap.h b/kernel/fs/gfs2/bmap.h new file mode 100644 index 000000000..81ded5e2a --- /dev/null +++ b/kernel/fs/gfs2/bmap.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __BMAP_DOT_H__ +#define __BMAP_DOT_H__ + +#include "inode.h" + +struct inode; +struct gfs2_inode; +struct page; + + +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, + unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + BUG_ON(gfs2_is_dir(ip)); + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} + +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +extern int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, + u64 *dblock, unsigned *extlen); +extern int gfs2_setattr_size(struct inode *inode, u64 size); +extern void gfs2_trim_blocks(struct inode *inode); +extern int gfs2_truncatei_resume(struct gfs2_inode *ip); +extern int gfs2_file_dealloc(struct gfs2_inode *ip); +extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); +extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); +extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); + +#endif /* __BMAP_DOT_H__ */ diff --git a/kernel/fs/gfs2/dentry.c b/kernel/fs/gfs2/dentry.c new file mode 100644 index 000000000..30822b148 --- /dev/null +++ b/kernel/fs/gfs2/dentry.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "super.h" +#include "util.h" +#include "inode.h" + +/** + * gfs2_drevalidate - Check directory lookup consistency + * @dentry: the mapping to check + * @flags: lookup flags + * + * Check to make sure the lookup necessary to arrive at this inode from its + * parent is still good. + * + * Returns: 1 if the dentry is ok, 0 if it isn't + */ + +static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *parent; + struct gfs2_sbd *sdp; + struct gfs2_inode *dip; + struct inode *inode; + struct gfs2_holder d_gh; + struct gfs2_inode *ip = NULL; + int error; + int had_lock = 0; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); + inode = d_inode(dentry); + + if (inode) { + if (is_bad_inode(inode)) + goto invalid; + ip = GFS2_I(inode); + } + + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto valid; + + had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); + if (!had_lock) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + } + + error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); + switch (error) { + case 0: + if (!inode) + goto invalid_gunlock; + break; + case -ENOENT: + if (!inode) + goto valid_gunlock; + goto invalid_gunlock; + default: + goto fail_gunlock; + } + +valid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +valid: + dput(parent); + return 1; + +invalid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +invalid: + dput(parent); + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&d_gh); +fail: + dput(parent); + return 0; +} + +static int gfs2_dhash(const struct dentry *dentry, struct qstr *str) +{ + str->hash = gfs2_disk_hash(str->name, str->len); + return 0; +} + +static int gfs2_dentry_delete(const struct dentry *dentry) +{ + struct gfs2_inode *ginode; + + if (d_really_is_negative(dentry)) + return 0; + + ginode = GFS2_I(d_inode(dentry)); + if (!ginode->i_iopen_gh.gh_gl) + return 0; + + if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) + return 1; + + return 0; +} + +const struct dentry_operations gfs2_dops = { + .d_revalidate = gfs2_drevalidate, + .d_hash = gfs2_dhash, + .d_delete = gfs2_dentry_delete, +}; + diff --git a/kernel/fs/gfs2/dir.c b/kernel/fs/gfs2/dir.c new file mode 100644 index 000000000..487527b42 --- /dev/null +++ b/kernel/fs/gfs2/dir.c @@ -0,0 +1,2090 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +/* + * Implements Extendible Hashing as described in: + * "Extendible Hashing" by Fagin, et al in + * __ACM Trans. on Database Systems__, Sept 1979. + * + * + * Here's the layout of dirents which is essentially the same as that of ext2 + * within a single block. The field de_name_len is the number of bytes + * actually required for the name (no null terminator). The field de_rec_len + * is the number of bytes allocated to the dirent. The offset of the next + * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is + * deleted, the preceding dirent inherits its allocated space, ie + * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained + * by adding de_rec_len to the current dirent, this essentially causes the + * deleted dirent to get jumped over when iterating through all the dirents. + * + * When deleting the first dirent in a block, there is no previous dirent so + * the field de_ino is set to zero to designate it as deleted. When allocating + * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the + * first dirent has (de_ino == 0) and de_rec_len is large enough, this first + * dirent is allocated. Otherwise it must go through all the 'used' dirents + * searching for one in which the amount of total space minus the amount of + * used space will provide enough space for the new dirent. + * + * There are two types of blocks in which dirents reside. In a stuffed dinode, + * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of + * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the + * beginning of the leaf block. The dirents reside in leaves when + * + * dip->i_diskflags & GFS2_DIF_EXHASH is true + * + * Otherwise, the dirents are "linear", within a single stuffed dinode block. + * + * When the dirents are in leaves, the actual contents of the directory file are + * used as an array of 64-bit block pointers pointing to the leaf blocks. The + * dirents are NOT in the directory file itself. There can be more than one + * block pointer in the array that points to the same leaf. In fact, when a + * directory is first converted from linear to exhash, all of the pointers + * point to the same leaf. + * + * When a leaf is completely full, the size of the hash table can be + * doubled unless it is already at the maximum size which is hard coded into + * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list, + * but never before the maximum hash table size has been reached. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "bmap.h" +#include "util.h" + +#define IS_LEAF 1 /* Hashed (leaf) directory */ +#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ + +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ + +#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) +#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) + +struct qstr gfs2_qdot __read_mostly; +struct qstr gfs2_qdotdot __read_mostly; + +typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, + const struct qstr *name, void *opaque); + +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + *bhp = bh; + return 0; +} + +static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + int error; + + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh); + if (error) + return error; + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { + brelse(bh); + return -EIO; + } + *bhp = bh; + return 0; +} + +static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, + unsigned int offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + gfs2_trans_add_meta(ip->i_gl, dibh); + memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); + if (ip->i_inode.i_size < offset + size) + i_size_write(&ip->i_inode, offset + size); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + + brelse(dibh); + + return size; +} + + + +/** + * gfs2_dir_write_data - Write directory information to the inode + * @ip: The GFS2 inode + * @buf: The buffer containing information to be written + * @offset: The file offset to start writing at + * @size: The amount of data to write + * + * Returns: The number of bytes correctly written or error code + */ +static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, + u64 offset, unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + int new = 0; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip) && + offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset, + size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + return error; + } + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 1; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error) + goto fail; + error = -EIO; + if (gfs2_assert_withdraw(sdp, dblock)) + goto fail; + } + + if (amount == sdp->sd_jbsize || new) + error = gfs2_dir_get_new_buffer(ip, dblock, &bh); + else + error = gfs2_dir_get_existing_buffer(ip, dblock, &bh); + + if (error) + goto fail; + + gfs2_trans_add_meta(ip->i_gl, bh); + memcpy(bh->b_data + o, buf, amount); + brelse(bh); + + buf += amount; + copied += amount; + lblock++; + dblock++; + extlen--; + + o = sizeof(struct gfs2_meta_header); + } + +out: + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (ip->i_inode.i_size < offset + copied) + i_size_write(&ip->i_inode, offset + copied); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + + return copied; +fail: + if (copied) + goto out; + return error; +} + +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); + brelse(dibh); + } + + return (error) ? error : size; +} + + +/** + * gfs2_dir_read_data - Read a data from a directory inode + * @ip: The GFS2 Inode + * @buf: The buffer to place result into + * @size: Amount of data to transfer + * + * Returns: The amount of data actually copied or the error + */ +static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + + if (gfs2_is_stuffed(ip)) + return gfs2_dir_read_stuffed(ip, buf, size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + lblock = 0; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + int new; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 0; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error || !dblock) + goto fail; + BUG_ON(extlen < 1); + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + } else { + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); + if (error) + goto fail; + } + error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD); + if (error) { + brelse(bh); + goto fail; + } + dblock++; + extlen--; + memcpy(buf, bh->b_data + o, amount); + brelse(bh); + buf += (amount/sizeof(__be64)); + copied += amount; + lblock++; + o = sizeof(struct gfs2_meta_header); + } + + return copied; +fail: + return (copied) ? copied : error; +} + +/** + * gfs2_dir_get_hash_table - Get pointer to the dir hash table + * @ip: The inode in question + * + * Returns: The hash table or an error + */ + +static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + int ret; + u32 hsize; + __be64 *hc; + + BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH)); + + hc = ip->i_hash_cache; + if (hc) + return hc; + + hsize = 1 << ip->i_depth; + hsize *= sizeof(__be64); + if (hsize != i_size_read(&ip->i_inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); + if (hc == NULL) + hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + + if (hc == NULL) + return ERR_PTR(-ENOMEM); + + ret = gfs2_dir_read_data(ip, hc, hsize); + if (ret < 0) { + kvfree(hc); + return ERR_PTR(ret); + } + + spin_lock(&inode->i_lock); + if (likely(!ip->i_hash_cache)) { + ip->i_hash_cache = hc; + hc = NULL; + } + spin_unlock(&inode->i_lock); + kvfree(hc); + + return ip->i_hash_cache; +} + +/** + * gfs2_dir_hash_inval - Invalidate dir hash + * @ip: The directory inode + * + * Must be called with an exclusive glock, or during glock invalidation. + */ +void gfs2_dir_hash_inval(struct gfs2_inode *ip) +{ + __be64 *hc = ip->i_hash_cache; + ip->i_hash_cache = NULL; + kvfree(hc); +} + +static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) +{ + return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0; +} + +static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, int ret) +{ + if (!gfs2_dirent_sentinel(dent) && + be32_to_cpu(dent->de_hash) == name->hash && + be16_to_cpu(dent->de_name_len) == name->len && + memcmp(dent+1, name->name, name->len) == 0) + return ret; + return 0; +} + +static int gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 1); +} + +static int gfs2_dirent_prev(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 2); +} + +/* + * name->name holds ptr to start of block. + * name->len holds size of block. + */ +static int gfs2_dirent_last(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + const char *start = name->name; + const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len); + if (name->len == (end - start)) + return 1; + return 0; +} + +static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (gfs2_dirent_sentinel(dent)) + actual = 0; + if (totlen - actual >= required) + return 1; + return 0; +} + +struct dirent_gather { + const struct gfs2_dirent **pdent; + unsigned offset; +}; + +static int gfs2_dirent_gather(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + struct dirent_gather *g = opaque; + if (!gfs2_dirent_sentinel(dent)) { + g->pdent[g->offset++] = dent; + } + return 0; +} + +/* + * Other possible things to check: + * - Inode located within filesystem size (and on valid block) + * - Valid directory entry type + * Not sure how heavy-weight we want to make this... could also check + * hash is correct for example, but that would take a lot of extra time. + * For now the most important thing is to check that the various sizes + * are correct. + */ +static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, + unsigned int size, unsigned int len, int first) +{ + const char *msg = "gfs2_dirent too small"; + if (unlikely(size < sizeof(struct gfs2_dirent))) + goto error; + msg = "gfs2_dirent misaligned"; + if (unlikely(offset & 0x7)) + goto error; + msg = "gfs2_dirent points beyond end of block"; + if (unlikely(offset + size > len)) + goto error; + msg = "zero inode number"; + if (unlikely(!first && gfs2_dirent_sentinel(dent))) + goto error; + msg = "name length is greater than space in dirent"; + if (!gfs2_dirent_sentinel(dent) && + unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) > + size)) + goto error; + return 0; +error: + pr_warn("%s: %s (%s)\n", + __func__, msg, first ? "first in block" : "not first in block"); + return -EIO; +} + +static int gfs2_dirent_offset(const void *buf) +{ + const struct gfs2_meta_header *h = buf; + int offset; + + BUG_ON(buf == NULL); + + switch(be32_to_cpu(h->mh_type)) { + case GFS2_METATYPE_LF: + offset = sizeof(struct gfs2_leaf); + break; + case GFS2_METATYPE_DI: + offset = sizeof(struct gfs2_dinode); + break; + default: + goto wrong_type; + } + return offset; +wrong_type: + pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type)); + return -1; +} + +static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf, + unsigned int len, gfs2_dscan_t scan, + const struct qstr *name, + void *opaque) +{ + struct gfs2_dirent *dent, *prev; + unsigned offset; + unsigned size; + int ret = 0; + + ret = gfs2_dirent_offset(buf); + if (ret < 0) + goto consist_inode; + + offset = ret; + prev = NULL; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 1)) + goto consist_inode; + do { + ret = scan(dent, name, opaque); + if (ret) + break; + offset += size; + if (offset == len) + break; + prev = dent; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 0)) + goto consist_inode; + } while(1); + + switch(ret) { + case 0: + return NULL; + case 1: + return dent; + case 2: + return prev ? prev : dent; + default: + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + +consist_inode: + gfs2_consist_inode(GFS2_I(inode)); + return ERR_PTR(-EIO); +} + +static int dirent_check_reclen(struct gfs2_inode *dip, + const struct gfs2_dirent *d, const void *end_p) +{ + const void *ptr = d; + u16 rec_len = be16_to_cpu(d->de_rec_len); + + if (unlikely(rec_len < sizeof(struct gfs2_dirent))) + goto broken; + ptr += rec_len; + if (ptr < end_p) + return rec_len; + if (ptr == end_p) + return -ENOENT; +broken: + gfs2_consist_inode(dip); + return -EIO; +} + +/** + * dirent_next - Next dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * Returns: 0 on success, error code otherwise + */ + +static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent **dent) +{ + struct gfs2_dirent *cur = *dent, *tmp; + char *bh_end = bh->b_data + bh->b_size; + int ret; + + ret = dirent_check_reclen(dip, cur, bh_end); + if (ret < 0) + return ret; + + tmp = (void *)cur + ret; + ret = dirent_check_reclen(dip, tmp, bh_end); + if (ret == -EIO) + return ret; + + /* Only the first dent could ever have de_inum.no_addr == 0 */ + if (gfs2_dirent_sentinel(tmp)) { + gfs2_consist_inode(dip); + return -EIO; + } + + *dent = tmp; + return 0; +} + +/** + * dirent_del - Delete a dirent + * @dip: The GFS2 inode + * @bh: The buffer + * @prev: The previous dirent + * @cur: The current dirent + * + */ + +static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent *prev, struct gfs2_dirent *cur) +{ + u16 cur_rec_len, prev_rec_len; + + if (gfs2_dirent_sentinel(cur)) { + gfs2_consist_inode(dip); + return; + } + + gfs2_trans_add_meta(dip->i_gl, bh); + + /* If there is no prev entry, this is the first entry in the block. + The de_rec_len is already as big as it needs to be. Just zero + out the inode number and return. */ + + if (!prev) { + cur->de_inum.no_addr = 0; + cur->de_inum.no_formal_ino = 0; + return; + } + + /* Combine this dentry with the previous one. */ + + prev_rec_len = be16_to_cpu(prev->de_rec_len); + cur_rec_len = be16_to_cpu(cur->de_rec_len); + + if ((char *)prev + prev_rec_len != (char *)cur) + gfs2_consist_inode(dip); + if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size) + gfs2_consist_inode(dip); + + prev_rec_len += cur_rec_len; + prev->de_rec_len = cpu_to_be16(prev_rec_len); +} + +/* + * Takes a dent from which to grab space as an argument. Returns the + * newly created dent. + */ +static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_dirent *ndent; + unsigned offset = 0, totlen; + + if (!gfs2_dirent_sentinel(dent)) + offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + totlen = be16_to_cpu(dent->de_rec_len); + BUG_ON(offset + name->len > totlen); + gfs2_trans_add_meta(ip->i_gl, bh); + ndent = (struct gfs2_dirent *)((char *)dent + offset); + dent->de_rec_len = cpu_to_be16(offset); + gfs2_qstr2dirent(name, totlen - offset, ndent); + return ndent; +} + +static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode, + struct buffer_head *bh, + const struct qstr *name) +{ + struct gfs2_dirent *dent; + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_find_space, name, NULL); + if (!dent || IS_ERR(dent)) + return dent; + return gfs2_init_dirent(inode, dent, name, bh); +} + +static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, + struct buffer_head **bhp) +{ + int error; + + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); + if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { + /* pr_info("block num=%llu\n", leaf_no); */ + error = -EIO; + } + + return error; +} + +/** + * get_leaf_nr - Get a leaf number associated with the index + * @dip: The GFS2 inode + * @index: + * @leaf_out: + * + * Returns: 0 on success, error code otherwise + */ + +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, + u64 *leaf_out) +{ + __be64 *hash; + + hash = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hash)) + return PTR_ERR(hash); + *leaf_out = be64_to_cpu(*(hash + index)); + return 0; +} + +static int get_first_leaf(struct gfs2_inode *dip, u32 index, + struct buffer_head **bh_out) +{ + u64 leaf_no; + int error; + + error = get_leaf_nr(dip, index, &leaf_no); + if (!error) + error = get_leaf(dip, leaf_no, bh_out); + + return error; +} + +static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, + const struct qstr *name, + gfs2_dscan_t scan, + struct buffer_head **pbh) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + if (ip->i_diskflags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf; + unsigned hsize = 1 << ip->i_depth; + unsigned index; + u64 ln; + if (hsize * sizeof(u64) != i_size_read(inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + index = name->hash >> (32 - ip->i_depth); + error = get_first_leaf(ip, index, &bh); + if (error) + return ERR_PTR(error); + do { + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + scan, name, NULL); + if (dent) + goto got_dent; + leaf = (struct gfs2_leaf *)bh->b_data; + ln = be64_to_cpu(leaf->lf_next); + brelse(bh); + if (!ln) + break; + + error = get_leaf(ip, ln, &bh); + } while(!error); + + return error ? ERR_PTR(error) : NULL; + } + + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return ERR_PTR(error); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL); +got_dent: + if (unlikely(dent == NULL || IS_ERR(dent))) { + brelse(bh); + bh = NULL; + } + *pbh = bh; + return dent; +} + +static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int n = 1; + u64 bn; + int error; + struct buffer_head *bh; + struct gfs2_leaf *leaf; + struct gfs2_dirent *dent; + struct qstr name = { .name = "" }; + struct timespec tv = CURRENT_TIME; + + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + if (error) + return NULL; + bh = gfs2_meta_new(ip->i_gl, bn); + if (!bh) + return NULL; + + gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); + leaf = (struct gfs2_leaf *)bh->b_data; + leaf->lf_depth = cpu_to_be16(depth); + leaf->lf_entries = 0; + leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); + leaf->lf_next = 0; + leaf->lf_inode = cpu_to_be64(ip->i_no_addr); + leaf->lf_dist = cpu_to_be32(1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2)); + dent = (struct gfs2_dirent *)(leaf+1); + gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); + *pbh = bh; + return leaf; +} + +/** + * dir_make_exhash - Convert a stuffed directory into an ExHash directory + * @dip: The GFS2 inode + * + * Returns: 0 on success, error code otherwise + */ + +static int dir_make_exhash(struct inode *inode) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_dirent *dent; + struct qstr args; + struct buffer_head *bh, *dibh; + struct gfs2_leaf *leaf; + int y; + u32 x; + __be64 *lp; + u64 bn; + int error; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + /* Turn over a new leaf */ + + leaf = new_leaf(inode, &bh, 0); + if (!leaf) + return -ENOSPC; + bn = bh->b_blocknr; + + gfs2_assert(sdp, dip->i_entries < (1 << 16)); + leaf->lf_entries = cpu_to_be16(dip->i_entries); + + /* Copy dirents */ + + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh, + sizeof(struct gfs2_dinode)); + + /* Find last entry */ + + x = 0; + args.len = bh->b_size - sizeof(struct gfs2_dinode) + + sizeof(struct gfs2_leaf); + args.name = bh->b_data; + dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size, + gfs2_dirent_last, &args, NULL); + if (!dent) { + brelse(bh); + brelse(dibh); + return -EIO; + } + if (IS_ERR(dent)) { + brelse(bh); + brelse(dibh); + return PTR_ERR(dent); + } + + /* Adjust the last dirent's record length + (Remember that dent still points to the last entry.) */ + + dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) + + sizeof(struct gfs2_dinode) - + sizeof(struct gfs2_leaf)); + + brelse(bh); + + /* We're done with the new leaf block, now setup the new + hash table. */ + + gfs2_trans_add_meta(dip->i_gl, dibh); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); + + for (x = sdp->sd_hash_ptrs; x--; lp++) + *lp = cpu_to_be64(bn); + + i_size_write(inode, sdp->sd_sb.sb_bsize / 2); + gfs2_add_inode_blocks(&dip->i_inode, 1); + dip->i_diskflags |= GFS2_DIF_EXHASH; + + for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; + dip->i_depth = y; + + gfs2_dinode_out(dip, dibh->b_data); + + brelse(dibh); + + return 0; +} + +/** + * dir_split_leaf - Split a leaf block into two + * @dip: The GFS2 inode + * @index: + * @leaf_no: + * + * Returns: 0 on success, error code on failure + */ + +static int dir_split_leaf(struct inode *inode, const struct qstr *name) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct buffer_head *nbh, *obh, *dibh; + struct gfs2_leaf *nleaf, *oleaf; + struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new; + u32 start, len, half_len, divider; + u64 bn, leaf_no; + __be64 *lp; + u32 index; + int x, moved = 0; + int error; + + index = name->hash >> (32 - dip->i_depth); + error = get_leaf_nr(dip, index, &leaf_no); + if (error) + return error; + + /* Get the old leaf block */ + error = get_leaf(dip, leaf_no, &obh); + if (error) + return error; + + oleaf = (struct gfs2_leaf *)obh->b_data; + if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) { + brelse(obh); + return 1; /* can't split */ + } + + gfs2_trans_add_meta(dip->i_gl, obh); + + nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); + if (!nleaf) { + brelse(obh); + return -ENOSPC; + } + bn = nbh->b_blocknr; + + /* Compute the start and len of leaf pointers in the hash table. */ + len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); + half_len = len >> 1; + if (!half_len) { + pr_warn("i_depth %u lf_depth %u index %u\n", + dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); + gfs2_consist_inode(dip); + error = -EIO; + goto fail_brelse; + } + + start = (index & ~(len - 1)); + + /* Change the pointers. + Don't bother distinguishing stuffed from non-stuffed. + This code is complicated enough already. */ + lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS); + if (!lp) { + error = -ENOMEM; + goto fail_brelse; + } + + /* Change the pointers */ + for (x = 0; x < half_len; x++) + lp[x] = cpu_to_be64(bn); + + gfs2_dir_hash_inval(dip); + + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64), + half_len * sizeof(u64)); + if (error != half_len * sizeof(u64)) { + if (error >= 0) + error = -EIO; + goto fail_lpfree; + } + + kfree(lp); + + /* Compute the divider */ + divider = (start + half_len) << (32 - dip->i_depth); + + /* Copy the entries */ + dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf)); + + do { + next = dent; + if (dirent_next(dip, obh, &next)) + next = NULL; + + if (!gfs2_dirent_sentinel(dent) && + be32_to_cpu(dent->de_hash) < divider) { + struct qstr str; + str.name = (char*)(dent+1); + str.len = be16_to_cpu(dent->de_name_len); + str.hash = be32_to_cpu(dent->de_hash); + new = gfs2_dirent_alloc(inode, nbh, &str); + if (IS_ERR(new)) { + error = PTR_ERR(new); + break; + } + + new->de_inum = dent->de_inum; /* No endian worries */ + new->de_type = dent->de_type; /* No endian worries */ + be16_add_cpu(&nleaf->lf_entries, 1); + + dirent_del(dip, obh, prev, dent); + + if (!oleaf->lf_entries) + gfs2_consist_inode(dip); + be16_add_cpu(&oleaf->lf_entries, -1); + + if (!prev) + prev = dent; + + moved = 1; + } else { + prev = dent; + } + dent = next; + } while (dent); + + oleaf->lf_depth = nleaf->lf_depth; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { + gfs2_trans_add_meta(dip->i_gl, dibh); + gfs2_add_inode_blocks(&dip->i_inode, 1); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + } + + brelse(obh); + brelse(nbh); + + return error; + +fail_lpfree: + kfree(lp); + +fail_brelse: + brelse(obh); + brelse(nbh); + return error; +} + +/** + * dir_double_exhash - Double size of ExHash table + * @dip: The GFS2 dinode + * + * Returns: 0 on success, error code on failure + */ + +static int dir_double_exhash(struct gfs2_inode *dip) +{ + struct buffer_head *dibh; + u32 hsize; + u32 hsize_bytes; + __be64 *hc; + __be64 *hc2, *h; + int x; + int error = 0; + + hsize = 1 << dip->i_depth; + hsize_bytes = hsize * sizeof(__be64); + + hc = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hc)) + return PTR_ERR(hc); + + hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); + if (hc2 == NULL) + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + + if (!hc2) + return -ENOMEM; + + h = hc2; + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_kfree; + + for (x = 0; x < hsize; x++) { + *h++ = *hc; + *h++ = *hc; + hc++; + } + + error = gfs2_dir_write_data(dip, (char *)hc2, 0, hsize_bytes * 2); + if (error != (hsize_bytes * 2)) + goto fail; + + gfs2_dir_hash_inval(dip); + dip->i_hash_cache = hc2; + dip->i_depth++; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + return 0; + +fail: + /* Replace original hash table & size */ + gfs2_dir_write_data(dip, (char *)hc, 0, hsize_bytes); + i_size_write(&dip->i_inode, hsize_bytes); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); +out_kfree: + kvfree(hc2); + return error; +} + +/** + * compare_dents - compare directory entries by hash value + * @a: first dent + * @b: second dent + * + * When comparing the hash entries of @a to @b: + * gt: returns 1 + * lt: returns -1 + * eq: returns 0 + */ + +static int compare_dents(const void *a, const void *b) +{ + const struct gfs2_dirent *dent_a, *dent_b; + u32 hash_a, hash_b; + int ret = 0; + + dent_a = *(const struct gfs2_dirent **)a; + hash_a = be32_to_cpu(dent_a->de_hash); + + dent_b = *(const struct gfs2_dirent **)b; + hash_b = be32_to_cpu(dent_b->de_hash); + + if (hash_a > hash_b) + ret = 1; + else if (hash_a < hash_b) + ret = -1; + else { + unsigned int len_a = be16_to_cpu(dent_a->de_name_len); + unsigned int len_b = be16_to_cpu(dent_b->de_name_len); + + if (len_a > len_b) + ret = 1; + else if (len_a < len_b) + ret = -1; + else + ret = memcmp(dent_a + 1, dent_b + 1, len_a); + } + + return ret; +} + +/** + * do_filldir_main - read out directory entries + * @dip: The GFS2 inode + * @ctx: what to feed the entries to + * @darr: an array of struct gfs2_dirent pointers to read + * @entries: the number of entries in darr + * @copied: pointer to int that's non-zero if a entry has been copied out + * + * Jump through some hoops to make sure that if there are hash collsions, + * they are read out at the beginning of a buffer. We want to minimize + * the possibility that they will fall into different readdir buffers or + * that someone will want to seek to that location. + * + * Returns: errno, >0 if the actor tells you to stop + */ + +static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, + const struct gfs2_dirent **darr, u32 entries, + int *copied) +{ + const struct gfs2_dirent *dent, *dent_next; + u64 off, off_next; + unsigned int x, y; + int run = 0; + + sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); + + dent_next = darr[0]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + for (x = 0, y = 1; x < entries; x++, y++) { + dent = dent_next; + off = off_next; + + if (y < entries) { + dent_next = darr[y]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + if (off < ctx->pos) + continue; + ctx->pos = off; + + if (off_next == off) { + if (*copied && !run) + return 1; + run = 1; + } else + run = 0; + } else { + if (off < ctx->pos) + continue; + ctx->pos = off; + } + + if (!dir_emit(ctx, (const char *)(dent + 1), + be16_to_cpu(dent->de_name_len), + be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type))) + return 1; + + *copied = 1; + } + + /* Increment the ctx->pos by one, so the next time we come into the + do_filldir fxn, we get the next entry instead of the last one in the + current leaf */ + + ctx->pos++; + + return 0; +} + +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, + int *copied, unsigned *depth, + u64 leaf_no) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_leaf *lf; + unsigned entries = 0, entries2 = 0; + unsigned leaves = 0; + const struct gfs2_dirent **darr, *dent; + struct dirent_gather g; + struct buffer_head **larr; + int leaf = 0; + int error, i; + u64 lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out; + lf = (struct gfs2_leaf *)bh->b_data; + if (leaves == 0) + *depth = be16_to_cpu(lf->lf_depth); + entries += be16_to_cpu(lf->lf_entries); + leaves++; + lfn = be64_to_cpu(lf->lf_next); + brelse(bh); + } while(lfn); + + if (!entries) + return 0; + + error = -ENOMEM; + /* + * The extra 99 entries are not normally used, but are a buffer + * zone in case the number of entries in the leaf is corrupt. + * 99 is the maximum number of entries that can fit in a single + * leaf block. + */ + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); + if (!larr) + goto out; + darr = (const struct gfs2_dirent **)(larr + leaves); + g.pdent = darr; + g.offset = 0; + lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out_free; + lf = (struct gfs2_leaf *)bh->b_data; + lfn = be64_to_cpu(lf->lf_next); + if (lf->lf_entries) { + entries2 += be16_to_cpu(lf->lf_entries); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_gather, NULL, &g); + error = PTR_ERR(dent); + if (IS_ERR(dent)) + goto out_free; + if (entries2 != g.offset) { + fs_warn(sdp, "Number of entries corrupt in dir " + "leaf %llu, entries2 (%u) != " + "g.offset (%u)\n", + (unsigned long long)bh->b_blocknr, + entries2, g.offset); + + error = -EIO; + goto out_free; + } + error = 0; + larr[leaf++] = bh; + } else { + brelse(bh); + } + } while(lfn); + + BUG_ON(entries2 != entries); + error = do_filldir_main(ip, ctx, darr, entries, copied); +out_free: + for(i = 0; i < leaf; i++) + brelse(larr[i]); + kvfree(larr); +out: + return error; +} + +/** + * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks. + * + * Note: we can't calculate each index like dir_e_read can because we don't + * have the leaf, and therefore we don't have the depth, and therefore we + * don't have the length. So we have to just read enough ahead to make up + * for the loss of information. + */ +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + u64 blocknr = 0, last; + unsigned count; + + /* First check if we've already read-ahead for the whole range. */ + if (index + MAX_RA_BLOCKS < f_ra->start) + return; + + f_ra->start = max((pgoff_t)index, f_ra->start); + for (count = 0; count < MAX_RA_BLOCKS; count++) { + if (f_ra->start >= hsize) /* if exceeded the hash table */ + break; + + last = blocknr; + blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]); + f_ra->start++; + if (blocknr == last) + continue; + + bh = gfs2_getbuf(gl, blocknr, 1); + if (trylock_buffer(bh)) { + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + continue; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, bh); + continue; + } + brelse(bh); + } +} + +/** + * dir_e_read - Reads the entries from a directory into a filldir buffer + * @dip: dinode pointer + * @ctx: actor to feed the entries to + * + * Returns: errno + */ + +static int dir_e_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *dip = GFS2_I(inode); + u32 hsize, len = 0; + u32 hash, index; + __be64 *lp; + int copied = 0; + int error = 0; + unsigned depth = 0; + + hsize = 1 << dip->i_depth; + hash = gfs2_dir_offset2hash(ctx->pos); + index = hash >> (32 - dip->i_depth); + + if (dip->i_hash_cache == NULL) + f_ra->start = 0; + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); + + gfs2_dir_readahead(inode, hsize, index, f_ra); + + while (index < hsize) { + error = gfs2_dir_read_leaf(inode, ctx, + &copied, &depth, + be64_to_cpu(lp[index])); + if (error) + break; + + len = 1 << (dip->i_depth - depth); + index = (index & ~(len - 1)) + len; + } + + if (error > 0) + error = 0; + return error; +} + +int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct dirent_gather g; + const struct gfs2_dirent **darr, *dent; + struct buffer_head *dibh; + int copied = 0; + int error; + + if (!dip->i_entries) + return 0; + + if (dip->i_diskflags & GFS2_DIF_EXHASH) + return dir_e_read(inode, ctx, f_ra); + + if (!gfs2_is_stuffed(dip)) { + gfs2_consist_inode(dip); + return -EIO; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + error = -ENOMEM; + /* 96 is max number of dirents which can be stuffed into an inode */ + darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); + if (darr) { + g.pdent = darr; + g.offset = 0; + dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, + gfs2_dirent_gather, NULL, &g); + if (IS_ERR(dent)) { + error = PTR_ERR(dent); + goto out; + } + if (dip->i_entries != g.offset) { + fs_warn(sdp, "Number of entries corrupt in dir %llu, " + "ip->i_entries (%u) != g.offset (%u)\n", + (unsigned long long)dip->i_no_addr, + dip->i_entries, + g.offset); + error = -EIO; + goto out; + } + error = do_filldir_main(dip, ctx, darr, + dip->i_entries, &copied); +out: + kfree(darr); + } + + if (error > 0) + error = 0; + + brelse(dibh); + + return error; +} + +/** + * gfs2_dir_search - Search a directory + * @dip: The GFS2 dir inode + * @name: The name we are looking up + * @fail_on_exist: Fail if the name exists rather than looking it up + * + * This routine searches a directory for a file or another directory. + * Assumes a glock is held on dip. + * + * Returns: errno + */ + +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, + bool fail_on_exist) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + u64 addr, formal_ino; + u16 dtype; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return ERR_CAST(dent); + dtype = be16_to_cpu(dent->de_type); + addr = be64_to_cpu(dent->de_inum.no_addr); + formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); + brelse(bh); + if (fail_on_exist) + return ERR_PTR(-EEXIST); + return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + } + return ERR_PTR(-ENOENT); +} + +int gfs2_dir_check(struct inode *dir, const struct qstr *name, + const struct gfs2_inode *ip) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int ret = -ENOENT; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + if (ip) { + if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr) + goto out; + if (be64_to_cpu(dent->de_inum.no_formal_ino) != + ip->i_no_formal_ino) + goto out; + if (unlikely(IF2DT(ip->i_inode.i_mode) != + be16_to_cpu(dent->de_type))) { + gfs2_consist_inode(GFS2_I(dir)); + ret = -EIO; + goto out; + } + } + ret = 0; +out: + brelse(bh); + } + return ret; +} + +/** + * dir_new_leaf - Add a new leaf onto hash chain + * @inode: The directory + * @name: The name we are adding + * + * This adds a new dir leaf onto an existing leaf when there is not + * enough space to add a new dir entry. This is a last resort after + * we've expanded the hash table to max size and also split existing + * leaf blocks, so it will only occur for very large directories. + * + * The dist parameter is set to 1 for leaf blocks directly attached + * to the hash table, 2 for one layer of indirection, 3 for two layers + * etc. We are thus able to tell the difference between an old leaf + * with dist set to zero (i.e. "don't know") and a new one where we + * set this information for debug/fsck purposes. + * + * Returns: 0 on success, or -ve on error + */ + +static int dir_new_leaf(struct inode *inode, const struct qstr *name) +{ + struct buffer_head *bh, *obh; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_leaf *leaf, *oleaf; + u32 dist = 1; + int error; + u32 index; + u64 bn; + + index = name->hash >> (32 - ip->i_depth); + error = get_first_leaf(ip, index, &obh); + if (error) + return error; + do { + dist++; + oleaf = (struct gfs2_leaf *)obh->b_data; + bn = be64_to_cpu(oleaf->lf_next); + if (!bn) + break; + brelse(obh); + error = get_leaf(ip, bn, &obh); + if (error) + return error; + } while(1); + + gfs2_trans_add_meta(ip->i_gl, obh); + + leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); + if (!leaf) { + brelse(obh); + return -ENOSPC; + } + leaf->lf_dist = cpu_to_be32(dist); + oleaf->lf_next = cpu_to_be64(bh->b_blocknr); + brelse(bh); + brelse(obh); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_add_inode_blocks(&ip->i_inode, 1); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + return 0; +} + +static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip) +{ + u64 where = ip->i_no_addr + 1; + if (ip->i_eattr == where) + return 1; + return 0; +} + +/** + * gfs2_dir_add - Add new filename into directory + * @inode: The directory inode + * @name: The new name + * @nip: The GFS2 inode to be linked in to the directory + * @da: The directory addition info + * + * If the call to gfs2_diradd_alloc_required resulted in there being + * no need to allocate any new directory blocks, then it will contain + * a pointer to the directory entry and the bh in which it resides. We + * can use that without having to repeat the search. If there was no + * free space, then we must now create more space. + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_add(struct inode *inode, const struct qstr *name, + const struct gfs2_inode *nip, struct gfs2_diradd *da) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh = da->bh; + struct gfs2_dirent *dent = da->dent; + struct timespec tv; + struct gfs2_leaf *leaf; + int error; + + while(1) { + if (da->bh == NULL) { + dent = gfs2_dirent_search(inode, name, + gfs2_dirent_find_space, &bh); + } + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + dent = gfs2_init_dirent(inode, dent, name, bh); + gfs2_inum_out(nip, dent); + dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); + tv = CURRENT_TIME; + if (ip->i_diskflags & GFS2_DIF_EXHASH) { + leaf = (struct gfs2_leaf *)bh->b_data; + be16_add_cpu(&leaf->lf_entries, 1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + } + da->dent = NULL; + da->bh = NULL; + brelse(bh); + ip->i_entries++; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; + if (S_ISDIR(nip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); + mark_inode_dirty(inode); + error = 0; + break; + } + if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = dir_make_exhash(inode); + if (error) + break; + continue; + } + error = dir_split_leaf(inode, name); + if (error == 0) + continue; + if (error < 0) + break; + if (ip->i_depth < GFS2_DIR_MAX_DEPTH) { + error = dir_double_exhash(ip); + if (error) + break; + error = dir_split_leaf(inode, name); + if (error < 0) + break; + if (error == 0) + continue; + } + error = dir_new_leaf(inode, name); + if (!error) + continue; + error = -ENOSPC; + break; + } + return error; +} + + +/** + * gfs2_dir_del - Delete a directory entry + * @dip: The GFS2 inode + * @filename: The filename + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) +{ + const struct qstr *name = &dentry->d_name; + struct gfs2_dirent *dent, *prev = NULL; + struct buffer_head *bh; + struct timespec tv = CURRENT_TIME; + + /* Returns _either_ the entry (if its first in block) or the + previous entry otherwise */ + dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) { + gfs2_consist_inode(dip); + return PTR_ERR(dent); + } + /* If not first in block, adjust pointers accordingly */ + if (gfs2_dirent_find(dent, name, NULL) == 0) { + prev = dent; + dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len)); + } + + dirent_del(dip, bh, prev, dent); + if (dip->i_diskflags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; + u16 entries = be16_to_cpu(leaf->lf_entries); + if (!entries) + gfs2_consist_inode(dip); + leaf->lf_entries = cpu_to_be16(--entries); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + } + brelse(bh); + + if (!dip->i_entries) + gfs2_consist_inode(dip); + dip->i_entries--; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; + if (d_is_dir(dentry)) + drop_nlink(&dip->i_inode); + mark_inode_dirty(&dip->i_inode); + + return 0; +} + +/** + * gfs2_dir_mvino - Change inode number of directory entry + * @dip: The GFS2 inode + * @filename: + * @new_inode: + * + * This routine changes the inode number of a directory entry. It's used + * by rename to change ".." when a directory is moved. + * Assumes a glock is held on dvp. + * + * Returns: errno + */ + +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int error; + + dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + + gfs2_trans_add_meta(dip->i_gl, bh); + gfs2_inum_out(nip, dent); + dent->de_type = cpu_to_be16(new_type); + + if (dip->i_diskflags & GFS2_DIF_EXHASH) { + brelse(bh); + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + gfs2_trans_add_meta(dip->i_gl, bh); + } + + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(dip, bh->b_data); + brelse(bh); + return 0; +} + +/** + * leaf_dealloc - Deallocate a directory leaf + * @dip: the directory + * @index: the hash table offset in the directory + * @len: the number of pointers to this leaf + * @leaf_no: the leaf number + * @leaf_bh: buffer_head for the starting leaf + * last_dealloc: 1 if this is the final dealloc for the leaf, else 0 + * + * Returns: errno + */ + +static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, + u64 leaf_no, struct buffer_head *leaf_bh, + int last_dealloc) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_leaf *tmp_leaf; + struct gfs2_rgrp_list rlist; + struct buffer_head *bh, *dibh; + u64 blk, nblk; + unsigned int rg_blocks = 0, l_blocks = 0; + char *ht; + unsigned int x, size = len * sizeof(u64); + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); + if (ht == NULL) + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, + PAGE_KERNEL); + if (!ht) + return -ENOMEM; + + error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + goto out; + + /* Count the number of leaves */ + bh = leaf_bh; + + for (blk = leaf_no; blk; blk = nblk) { + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_rlist; + } + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + if (blk != leaf_no) + brelse(bh); + + gfs2_rlist_add(dip, &rlist, blk); + l_blocks++; + } + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, + rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) + + RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks); + if (error) + goto out_rg_gunlock; + + bh = leaf_bh; + + for (blk = leaf_no; blk; blk = nblk) { + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_end_trans; + } + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + if (blk != leaf_no) + brelse(bh); + + gfs2_free_meta(dip, blk, 1); + gfs2_add_inode_blocks(&dip->i_inode, -1); + } + + error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); + if (error != size) { + if (error >= 0) + error = -EIO; + goto out_end_trans; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_end_trans; + + gfs2_trans_add_meta(dip->i_gl, dibh); + /* On the last dealloc, make this a regular file in case we crash. + (We don't want to free these blocks a second time.) */ + if (last_dealloc) + dip->i_inode.i_mode = S_IFREG; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + +out_end_trans: + gfs2_trans_end(sdp); +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); + gfs2_quota_unhold(dip); +out: + kvfree(ht); + return error; +} + +/** + * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory + * @dip: the directory + * + * Dealloc all on-disk directory leaves to FREEMETA state + * Change on-disk inode type to "regular file" + * + * Returns: errno + */ + +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) +{ + struct buffer_head *bh; + struct gfs2_leaf *leaf; + u32 hsize, len; + u32 index = 0, next_index; + __be64 *lp; + u64 leaf_no; + int error = 0, last; + + hsize = 1 << dip->i_depth; + + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); + + while (index < hsize) { + leaf_no = be64_to_cpu(lp[index]); + if (leaf_no) { + error = get_leaf(dip, leaf_no, &bh); + if (error) + goto out; + leaf = (struct gfs2_leaf *)bh->b_data; + len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); + + next_index = (index & ~(len - 1)) + len; + last = ((next_index >= hsize) ? 1 : 0); + error = leaf_dealloc(dip, index, len, leaf_no, bh, + last); + brelse(bh); + if (error) + goto out; + index = next_index; + } else + index++; + } + + if (index != hsize) { + gfs2_consist_inode(dip); + error = -EIO; + } + +out: + + return error; +} + +/** + * gfs2_diradd_alloc_required - find if adding entry will require an allocation + * @ip: the file being written to + * @filname: the filename that's going to be added + * @da: The structure to return dir alloc info + * + * Returns: 0 if ok, -ve on error + */ + +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, + struct gfs2_diradd *da) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf); + struct gfs2_dirent *dent; + struct buffer_head *bh; + + da->nr_blocks = 0; + da->bh = NULL; + da->dent = NULL; + + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); + if (!dent) { + da->nr_blocks = sdp->sd_max_dirres; + if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && + (GFS2_DIRENT_SIZE(name->len) < extra)) + da->nr_blocks = 1; + return 0; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + + if (da->save_loc) { + da->bh = bh; + da->dent = dent; + } else { + brelse(bh); + } + return 0; +} + diff --git a/kernel/fs/gfs2/dir.h b/kernel/fs/gfs2/dir.h new file mode 100644 index 000000000..e1b309c24 --- /dev/null +++ b/kernel/fs/gfs2/dir.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +#include +#include + +struct inode; +struct gfs2_inode; +struct gfs2_inum; +struct buffer_head; +struct gfs2_dirent; + +struct gfs2_diradd { + unsigned nr_blocks; + struct gfs2_dirent *dent; + struct buffer_head *bh; + int save_loc; +}; + +extern struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename, + bool fail_on_exist); +extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip, struct gfs2_diradd *da); +static inline void gfs2_dir_no_add(struct gfs2_diradd *da) +{ + if (da->bh) + brelse(da->bh); + da->bh = NULL; +} +extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); +extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra); +extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); + +extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); + +extern int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename, + struct gfs2_diradd *da); +extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); +extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); + +static inline u32 gfs2_disk_hash(const char *data, int len) +{ + return crc32_le((u32)~0, data, len) ^ (u32)~0; +} + + +static inline void gfs2_str2qstr(struct qstr *name, const char *fname) +{ + name->name = fname; + name->len = strlen(fname); + name->hash = gfs2_disk_hash(name->name, name->len); +} + +/* N.B. This probably ought to take inum & type as args as well */ +static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent) +{ + dent->de_inum.no_addr = cpu_to_be64(0); + dent->de_inum.no_formal_ino = cpu_to_be64(0); + dent->de_hash = cpu_to_be32(name->hash); + dent->de_rec_len = cpu_to_be16(reclen); + dent->de_name_len = cpu_to_be16(name->len); + dent->de_type = cpu_to_be16(0); + memset(dent->__pad, 0, sizeof(dent->__pad)); + memcpy(dent + 1, name->name, name->len); +} + +extern struct qstr gfs2_qdot; +extern struct qstr gfs2_qdotdot; + +#endif /* __DIR_DOT_H__ */ diff --git a/kernel/fs/gfs2/export.c b/kernel/fs/gfs2/export.c new file mode 100644 index 000000000..5d15e9498 --- /dev/null +++ b/kernel/fs/gfs2/export.c @@ -0,0 +1,208 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "super.h" +#include "rgrp.h" +#include "util.h" + +#define GFS2_SMALL_FH_SIZE 4 +#define GFS2_LARGE_FH_SIZE 8 +#define GFS2_OLD_FH_SIZE 10 + +static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, + struct inode *parent) +{ + __be32 *fh = (__force __be32 *)p; + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + + if (parent && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; + return FILEID_INVALID; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return FILEID_INVALID; + } + + fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[2] = cpu_to_be32(ip->i_no_addr >> 32); + fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_SMALL_FH_SIZE; + + if (!parent || inode == d_inode(sb->s_root)) + return *len; + + ip = GFS2_I(parent); + + fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[6] = cpu_to_be32(ip->i_no_addr >> 32); + fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_LARGE_FH_SIZE; + + return *len; +} + +struct get_name_filldir { + struct dir_context ctx; + struct gfs2_inum_host inum; + char *name; +}; + +static int get_name_filldir(struct dir_context *ctx, const char *name, + int length, loff_t offset, u64 inum, + unsigned int type) +{ + struct get_name_filldir *gnfd = + container_of(ctx, struct get_name_filldir, ctx); + + if (inum != gnfd->inum.no_addr) + return 0; + + memcpy(gnfd->name, name, length); + gnfd->name[length] = 0; + + return 1; +} + +static int gfs2_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *dir = d_inode(parent); + struct inode *inode = d_inode(child); + struct gfs2_inode *dip, *ip; + struct get_name_filldir gnfd = { + .ctx.actor = get_name_filldir, + .name = name + }; + struct gfs2_holder gh; + int error; + struct file_ra_state f_ra = { .start = 0 }; + + if (!dir) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode) || !inode) + return -EINVAL; + + dip = GFS2_I(dir); + ip = GFS2_I(inode); + + *name = 0; + gnfd.inum.no_addr = ip->i_no_addr; + gnfd.inum.no_formal_ino = ip->i_no_formal_ino; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); + if (error) + return error; + + error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra); + + gfs2_glock_dq_uninit(&gh); + + if (!error && !*name) + error = -ENOENT; + + return error; +} + +static struct dentry *gfs2_get_parent(struct dentry *child) +{ + return d_obtain_alias(gfs2_lookupi(d_inode(child), &gfs2_qdotdot, 1)); +} + +static struct dentry *gfs2_get_dentry(struct super_block *sb, + struct gfs2_inum_host *inum) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct inode *inode; + + inode = gfs2_ilookup(sb, inum->no_addr, 0); + if (inode) { + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { + iput(inode); + return ERR_PTR(-ESTALE); + } + goto out_inode; + } + + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); + +out_inode: + return d_obtain_alias(inode); +} + +static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host this; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_SMALL_FH_SIZE: + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + if (fh_len < GFS2_SMALL_FH_SIZE) + return NULL; + this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + this.no_formal_ino |= be32_to_cpu(fh[1]); + this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; + this.no_addr |= be32_to_cpu(fh[3]); + return gfs2_get_dentry(sb, &this); + default: + return NULL; + } +} + +static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host parent; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + if (fh_len < GFS2_LARGE_FH_SIZE) + return NULL; + parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; + parent.no_addr |= be32_to_cpu(fh[7]); + return gfs2_get_dentry(sb, &parent); + default: + return NULL; + } +} + +const struct export_operations gfs2_export_ops = { + .encode_fh = gfs2_encode_fh, + .fh_to_dentry = gfs2_fh_to_dentry, + .fh_to_parent = gfs2_fh_to_parent, + .get_name = gfs2_get_name, + .get_parent = gfs2_get_parent, +}; + diff --git a/kernel/fs/gfs2/file.c b/kernel/fs/gfs2/file.c new file mode 100644 index 000000000..31892871e --- /dev/null +++ b/kernel/fs/gfs2/file.c @@ -0,0 +1,1159 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. + * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + switch (whence) { + case SEEK_END: /* These reference inode->i_size */ + case SEEK_DATA: + case SEEK_HOLE: + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = generic_file_llseek(file, offset, whence); + gfs2_glock_dq_uninit(&i_gh); + } + break; + case SEEK_CUR: + case SEEK_SET: + error = generic_file_llseek(file, offset, whence); + break; + default: + error = -EINVAL; + } + + return error; +} + +/** + * gfs2_readdir - Iterator for a directory + * @file: The directory to read from + * @ctx: What to feed directory entries to + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + int error; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return error; + + error = gfs2_dir_read(dir, ctx, &file->f_ra); + + gfs2_glock_dq_uninit(&d_gh); + + return error; +} + +/** + * fsflags_cvt + * @table: A table of 32 u32 flags + * @val: a 32 bit value to convert + * + * This function can be used to convert between fsflags values and + * GFS2's own flags values. + * + * Returns: the converted flags + */ +static u32 fsflags_cvt(const u32 *table, u32 val) +{ + u32 res = 0; + while(val) { + if (val & 1) + res |= *table; + table++; + val >>= 1; + } + return res; +} + +static const u32 fsflags_to_gfs2[32] = { + [3] = GFS2_DIF_SYNC, + [4] = GFS2_DIF_IMMUTABLE, + [5] = GFS2_DIF_APPENDONLY, + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_INHERIT_JDATA, + [17] = GFS2_DIF_TOPDIR, +}; + +static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_Sync] = FS_SYNC_FL, + [gfs2fl_Immutable] = FS_IMMUTABLE_FL, + [gfs2fl_AppendOnly] = FS_APPEND_FL, + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_TopLevel] = FS_TOPDIR_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +}; + +static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = file_inode(filp); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 fsflags; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (error) + return error; + + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); + if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) + fsflags |= FS_JOURNAL_DATA_FL; + if (put_user(fsflags, ptr)) + error = -EFAULT; + + gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); + return error; +} + +void gfs2_set_inode_flags(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; + + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); + if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) + inode->i_flags |= S_NOSEC; + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) + flags |= S_NOATIME; + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_SYSTEM| \ + GFS2_DIF_TOPDIR| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * do_gfs2_set_flags - set flags on an inode + * @filp: file pointer + * @reqflags: The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) +{ + struct inode *inode = file_inode(filp); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = mnt_want_write_file(filp); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_drop_write; + + error = -EACCES; + if (!inode_owner_or_capable(inode)) + goto out; + + error = 0; + flags = ip->i_diskflags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ flags) == 0) + goto out; + + error = -EINVAL; + if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) + goto out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) + goto out; + if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) + goto out; + if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + if (!IS_IMMUTABLE(inode)) { + error = gfs2_permission(inode, MAY_WRITE); + if (error) + goto out; + } + if ((flags ^ new_flags) & GFS2_DIF_JDATA) { + if (flags & GFS2_DIF_JDATA) + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + error = filemap_fdatawrite(inode->i_mapping); + if (error) + goto out; + error = filemap_fdatawait(inode->i_mapping); + if (error) + goto out; + } + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + gfs2_trans_add_meta(ip->i_gl, bh); + ip->i_diskflags = new_flags; + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + gfs2_set_inode_flags(inode); + gfs2_set_aops(inode); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); +out_drop_write: + mnt_drop_write_file(filp); + return error; +} + +static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = file_inode(filp); + u32 fsflags, gfsflags; + + if (get_user(fsflags, ptr)) + return -EFAULT; + + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); + if (!S_ISDIR(inode->i_mode)) { + gfsflags &= ~GFS2_DIF_TOPDIR; + if (gfsflags & GFS2_DIF_INHERIT_JDATA) + gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~0); + } + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case FS_IOC_GETFLAGS: + return gfs2_get_flags(filp, (u32 __user *)arg); + case FS_IOC_SETFLAGS: + return gfs2_set_flags(filp, (u32 __user *)arg); + case FITRIM: + return gfs2_fitrim(filp, (void __user *)arg); + } + return -ENOTTY; +} + +/** + * gfs2_size_hint - Give a hint to the size of a write request + * @filep: The struct file + * @offset: The file offset of the write + * @size: The length of the write + * + * When we are about to do a write, this function records the total + * write size in order to provide a suitable hint to the lower layers + * about how many blocks will be required. + * + */ + +static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) +{ + struct inode *inode = file_inode(filep); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; + int hint = min_t(size_t, INT_MAX, blks); + + if (hint > atomic_read(&ip->i_res->rs_sizehint)) + atomic_set(&ip->i_res->rs_sizehint, hint); +} + +/** + * gfs2_allocate_page_backing - Use bmap to allocate blocks + * @page: The (locked) page to allocate backing for + * + * We try to allocate all the blocks required for the page in + * one go. This might fail for various reasons, so we keep + * trying until all the blocks to back this page are allocated. + * If some of the blocks are already allocated, thats ok too. + */ + +static int gfs2_allocate_page_backing(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head bh; + unsigned long size = PAGE_CACHE_SIZE; + u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(inode, lblock, &bh, 1); + if (!buffer_mapped(&bh)) + return -EIO; + size -= bh.b_size; + lblock += (bh.b_size >> inode->i_blkbits); + } while(size > 0); + return 0; +} + +/** + * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable + * @vma: The virtual memory area + * @vmf: The virtual memory fault containing the page to become writable + * + * When the page becomes writable, we need to ensure that we have + * blocks allocated on disk to back that page. + */ + +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; + unsigned long last_index; + u64 pos = page->index << PAGE_CACHE_SHIFT; + unsigned int data_blocks, ind_blocks, rblocks; + struct gfs2_holder gh; + loff_t size; + int ret; + + sb_start_pagefault(inode->i_sb); + + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); + + ret = get_write_access(inode); + if (ret) + goto out; + + ret = gfs2_rs_alloc(ip); + if (ret) + goto out_write_access; + + gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { + lock_page(page); + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(page); + } + goto out_unlock; + } + + ret = gfs2_rindex_update(sdp); + if (ret) + goto out_unlock; + + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + ap.target = data_blocks + ind_blocks; + ret = gfs2_quota_lock_check(ip, &ap); + if (ret) + goto out_unlock; + ret = gfs2_inplace_reserve(ip, &ap); + if (ret) + goto out_quota_unlock; + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) { + rblocks += RES_STATFS + RES_QUOTA; + rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); + } + ret = gfs2_trans_begin(sdp, rblocks, 0); + if (ret) + goto out_trans_fail; + + lock_page(page); + ret = -EINVAL; + size = i_size_read(inode); + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + /* Check page index against inode size */ + if (size == 0 || (page->index > last_index)) + goto out_trans_end; + + ret = -EAGAIN; + /* If truncated, we must retry the operation, we may have raced + * with the glock demotion code. + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) + goto out_trans_end; + + /* Unstuff, if required, and allocate backing blocks for page */ + ret = 0; + if (gfs2_is_stuffed(ip)) + ret = gfs2_unstuff_dinode(ip, page); + if (ret == 0) + ret = gfs2_allocate_page_backing(page); + +out_trans_end: + if (ret) + unlock_page(page); + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_quota_unlock: + gfs2_quota_unlock(ip); +out_unlock: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + if (ret == 0) { + set_page_dirty(page); + wait_for_stable_page(page); + } +out_write_access: + put_write_access(inode); +out: + sb_end_pagefault(inode->i_sb); + return block_page_mkwrite_return(ret); +} + +static const struct vm_operations_struct gfs2_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = gfs2_page_mkwrite, +}; + +/** + * gfs2_mmap - + * @file: The file to map + * @vma: The VMA which described the mapping + * + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). + * + * Returns: 0 + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(file->f_flags & O_NOATIME) && + !IS_NOATIME(&ip->i_inode)) { + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + return error; + /* grab lock to update inode */ + gfs2_glock_dq_uninit(&i_gh); + file_accessed(file); + } + vma->vm_ops = &gfs2_vm_ops; + + return 0; +} + +/** + * gfs2_open_common - This is common to open and atomic_open + * @inode: The inode being opened + * @file: The file being opened + * + * This maybe called under a glock or not depending upon how it has + * been called. We must always be called under a glock for regular + * files, however. For other file types, it does not matter whether + * we hold the glock or not. + * + * Returns: Error code or 0 for success + */ + +int gfs2_open_common(struct inode *inode, struct file *file) +{ + struct gfs2_file *fp; + int ret; + + if (S_ISREG(inode->i_mode)) { + ret = generic_file_open(inode, file); + if (ret) + return ret; + } + + fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + return 0; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * After atomic_open, this function is only used for opening files + * which are already cached. We must still get the glock for regular + * files to ensure that we have the file size uptodate for the large + * file check which is in the common code. That is only an issue for + * regular files though. + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + bool need_unlock = false; + + if (S_ISREG(ip->i_inode.i_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + return error; + need_unlock = true; + } + + error = gfs2_open_common(inode, file); + + if (need_unlock) + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * gfs2_release - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_release(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + kfree(file->private_data); + file->private_data = NULL; + + if (!(file->f_mode & FMODE_WRITE)) + return 0; + + gfs2_rs_delete(ip, &inode->i_writecount); + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry + * @start: the start position in the file to sync + * @end: the end position in the file to sync + * @datasync: set if we can ignore timestamp changes + * + * We split the data flushing here so that we don't wait for the data + * until after we've also sent the metadata to disk. Note that for + * data=ordered, we will write & wait for the data at the log flush + * stage anyway, so this is unlikely to make much of a difference + * except in the data=writeback case. + * + * If the fdatawrite fails due to any reason except -EIO, we will + * continue the remainder of the fsync, although we'll still report + * the error at the end. This is to match filemap_write_and_wait_range() + * behaviour. + * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + int sync_state = inode->i_state & I_DIRTY_ALL; + struct gfs2_inode *ip = GFS2_I(inode); + int ret = 0, ret1 = 0; + + if (mapping->nrpages) { + ret1 = filemap_fdatawrite_range(mapping, start, end); + if (ret1 == -EIO) + return ret1; + } + + if (!gfs2_is_jdata(ip)) + sync_state &= ~I_DIRTY_PAGES; + if (datasync) + sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME); + + if (sync_state) { + ret = sync_inode_metadata(inode, 1); + if (ret) + return ret; + if (gfs2_is_jdata(ip)) + filemap_write_and_wait(mapping); + gfs2_ail_flush(ip->i_gl, 1); + } + + if (mapping->nrpages) + ret = filemap_fdatawait_range(mapping, start, end); + + return ret ? ret : ret1; +} + +/** + * gfs2_file_write_iter - Perform a write to a file + * @iocb: The io context + * @iov: The data to write + * @nr_segs: Number of @iov segments + * @pos: The file position + * + * We have to do a lock/unlock here to refresh the inode size for + * O_APPEND writes, otherwise we can land up writing at the wrong + * offset. There is still a race, but provided the app is using its + * own file locking, this will make O_APPEND work as expected. + * + */ + +static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct gfs2_inode *ip = GFS2_I(file_inode(file)); + int ret; + + ret = gfs2_rs_alloc(ip); + if (ret) + return ret; + + gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from)); + + if (iocb->ki_flags & IOCB_APPEND) { + struct gfs2_holder gh; + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + return ret; + gfs2_glock_dq_uninit(&gh); + } + + return generic_file_write_iter(iocb, from); +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + unsigned int nr_blks; + sector_t lblock = offset >> inode->i_blkbits; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + return error; + + gfs2_trans_add_meta(ip->i_gl, dibh); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + while (len) { + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + bh_map.b_size = len; + set_buffer_zeronew(&bh_map); + + error = gfs2_block_map(inode, lblock, &bh_map, 1); + if (unlikely(error)) + goto out; + len -= bh_map.b_size; + nr_blks = bh_map.b_size >> inode->i_blkbits; + lblock += nr_blks; + if (!buffer_new(&bh_map)) + continue; + if (unlikely(!buffer_zeronew(&bh_map))) { + error = -EIO; + goto out; + } + } +out: + brelse(dibh); + return error; +} +/** + * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of + * blocks, determine how many bytes can be written. + * @ip: The inode in question. + * @len: Max cap of bytes. What we return in *len must be <= this. + * @data_blocks: Compute and return the number of data blocks needed + * @ind_blocks: Compute and return the number of indirect blocks needed + * @max_blocks: The total blocks available to work with. + * + * Returns: void, but @len, @data_blocks and @ind_blocks are filled in. + */ +static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks, + unsigned int max_blocks) +{ + loff_t max = *len; + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes, max_blks = UINT_MAX; + int error; + const loff_t pos = offset; + const loff_t count = len; + loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + loff_t max_chunk_size = UINT_MAX & bsize_mask; + + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + offset &= bsize_mask; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; + + gfs2_size_hint(file, offset, len); + + gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + ap.min_target = data_blocks + ind_blocks; + + while (len > 0) { + if (len < bytes) + bytes = len; + if (!gfs2_write_alloc_required(ip, offset, bytes)) { + len -= bytes; + offset += bytes; + continue; + } + + /* We need to determine how many bytes we can actually + * fallocate without exceeding quota or going over the + * end of the fs. We start off optimistically by assuming + * we can write max_bytes */ + max_bytes = (len > max_chunk_size) ? max_chunk_size : len; + + /* Since max_bytes is most likely a theoretical max, we + * calculate a more realistic 'bytes' to serve as a good + * starting point for the number of bytes we may be able + * to write */ + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + ap.target = data_blocks + ind_blocks; + + error = gfs2_quota_lock_check(ip, &ap); + if (error) + return error; + /* ap.allowed tells us how many blocks quota will allow + * us to write. Check if this reduces max_blks */ + if (ap.allowed && ap.allowed < max_blks) + max_blks = ap.allowed; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_qunlock; + + /* check if the selected rgrp limits our max_blks further */ + if (ap.allowed && ap.allowed < max_blks) + max_blks = ap.allowed; + + /* Almost done. Calculate bytes that can be written using + * max_blks. We also recompute max_bytes, data_blocks and + * ind_blocks */ + calc_max_reserv(ip, &max_bytes, &data_blocks, + &ind_blocks, max_blks); + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) { + i_size_write(inode, pos + count); + /* Marks the inode as dirty */ + file_update_time(file); + } + + return generic_write_sync(file, pos, count); + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); + return error; +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (offset + len) > inode->i_size) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out_unlock; + } + + ret = get_write_access(inode); + if (ret) + goto out_unlock; + + ret = gfs2_rs_alloc(ip); + if (ret) + goto out_putw; + + ret = __gfs2_fallocate(file, mode, offset, len); + if (ret) + gfs2_rs_deltree(ip->i_res); +out_putw: + put_write_access(inode); +out_unlock: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + mutex_unlock(&inode->i_mutex); + return ret; +} + +static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + int error; + struct gfs2_inode *ip = GFS2_I(out->f_mapping->host); + + error = gfs2_rs_alloc(ip); + if (error) + return (ssize_t)error; + + gfs2_size_hint(out, *ppos, len); + + return iter_file_splice_write(pipe, out, ppos, len, flags); +} + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + if (cmd == F_CANCELLK) { + /* Hack: */ + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + if (fl->fl_type == F_UNLCK) + posix_lock_file_wait(file, fl); + return -EIO; + } + if (IS_GETLK(cmd)) + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file_inode(file)); + struct gfs2_glock *gl; + unsigned int state; + int flags; + int error = 0; + int sleeptime; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT; + + mutex_lock(&fp->f_fl_mutex); + + gl = fl_gh->gh_gl; + if (gl) { + if (fl_gh->gh_state == state) + goto out; + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + gfs2_glock_dq(fl_gh); + gfs2_holder_reinit(state, flags, fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); + if (error) + goto out; + gfs2_holder_init(gl, state, flags, fl_gh); + gfs2_glock_put(gl); + } + for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { + error = gfs2_glock_nq(fl_gh); + if (error != GLR_TRYFAILED) + break; + fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; + fl_gh->gh_error = 0; + msleep(sleeptime); + } + if (error) { + gfs2_holder_uninit(fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = flock_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + flock_lock_file_wait(file, fl); + if (fl_gh->gh_gl) { + gfs2_glock_dq(fl_gh); + gfs2_holder_uninit(fl_gh); + } + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else { + return do_flock(file, cmd, fl); + } +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read_iter = generic_file_read_iter, + .write_iter = gfs2_file_write_iter, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .splice_read = generic_file_splice_read, + .splice_write = gfs2_file_splice_write, + .setlease = simple_nosetlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops = { + .iterate = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .llseek = default_llseek, +}; + +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations gfs2_file_fops_nolock = { + .llseek = gfs2_llseek, + .read_iter = generic_file_read_iter, + .write_iter = gfs2_file_write_iter, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = gfs2_file_splice_write, + .setlease = generic_setlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops_nolock = { + .iterate = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .llseek = default_llseek, +}; + diff --git a/kernel/fs/gfs2/gfs2.h b/kernel/fs/gfs2/gfs2.h new file mode 100644 index 000000000..ef606e3a5 --- /dev/null +++ b/kernel/fs/gfs2/gfs2.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GFS2_DOT_H__ +#define __GFS2_DOT_H__ + +enum { + NO_CREATE = 0, + CREATE = 1, +}; + +enum { + NO_FORCE = 0, + FORCE = 1, +}; + +#define GFS2_FAST_NAME_SIZE 8 + +#endif /* __GFS2_DOT_H__ */ + diff --git a/kernel/fs/gfs2/glock.c b/kernel/fs/gfs2/glock.c new file mode 100644 index 000000000..0fa8062f8 --- /dev/null +++ b/kernel/fs/gfs2/glock.c @@ -0,0 +1,2119 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "super.h" +#include "util.h" +#include "bmap.h" +#define CREATE_TRACE_POINTS +#include "trace_gfs2.h" + +struct gfs2_glock_iter { + int hash; /* hash bucket index */ + unsigned nhash; /* Index within current bucket */ + struct gfs2_sbd *sdp; /* incore superblock */ + struct gfs2_glock *gl; /* current glock struct */ + loff_t last_pos; /* last position */ +}; + +typedef void (*glock_examiner) (struct gfs2_glock * gl); + +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); + +static struct dentry *gfs2_root; +static struct workqueue_struct *glock_workqueue; +struct workqueue_struct *gfs2_delete_workqueue; +static LIST_HEAD(lru_list); +static atomic_t lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(lru_lock); + +#define GFS2_GL_HASH_SHIFT 15 +#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) + +static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE]; +static struct dentry *gfs2_root; + +/** + * gl_hash() - Turn glock number into hash bucket number + * @lock: The glock number + * + * Returns: The number of the corresponding hash bucket + */ + +static unsigned int gl_hash(const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + unsigned int h; + + h = jhash(&name->ln_number, sizeof(u64), 0); + h = jhash(&name->ln_type, sizeof(unsigned int), h); + h = jhash(&sdp, sizeof(struct gfs2_sbd *), h); + h &= GFS2_GL_HASH_MASK; + + return h; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&gl_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&gl_hash_table[hash]); +} + +static void gfs2_glock_dealloc(struct rcu_head *rcu) +{ + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); + + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + kmem_cache_free(gfs2_glock_aspace_cachep, gl); + } else { + kfree(gl->gl_lksb.sb_lvbptr); + kmem_cache_free(gfs2_glock_cachep, gl); + } +} + +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); +} + +/** + * gfs2_glock_hold() - increment reference count on glock + * @gl: The glock to hold + * + */ + +static void gfs2_glock_hold(struct gfs2_glock *gl) +{ + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + lockref_get(&gl->gl_lockref); +} + +/** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; +} + + +void gfs2_glock_add_to_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + + if (!list_empty(&gl->gl_lru)) + list_del_init(&gl->gl_lru); + else + atomic_inc(&lru_count); + + list_add_tail(&gl->gl_lru, &lru_list); + set_bit(GLF_LRU, &gl->gl_flags); + spin_unlock(&lru_lock); +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + clear_bit(GLF_LRU, &gl->gl_flags); + } + spin_unlock(&lru_lock); +} + +/** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +void gfs2_glock_put(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = gfs2_glock2aspace(gl); + + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + lockref_mark_dead(&gl->gl_lockref); + + gfs2_glock_remove_from_lru(gl); + spin_unlock(&gl->gl_lockref.lock); + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); +} + +/** + * search_bucket() - Find struct gfs2_glock by lock number + * @bucket: the bucket to search + * @name: The lock name + * + * Returns: NULL, or the struct gfs2_glock with the requested number + */ + +static struct gfs2_glock *search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + struct gfs2_glock *gl; + struct hlist_bl_node *h; + + hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) { + if (!lm_name_equal(&gl->gl_name, name)) + continue; + if (gl->gl_sbd != sdp) + continue; + if (lockref_get_not_dead(&gl->gl_lockref)) + return gl; + } + + return NULL; +} + +/** + * may_grant - check if its ok to grant a new lock + * @gl: The glock + * @gh: The lock request which we wish to grant + * + * Returns: true if its ok to grant the lock + */ + +static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) +{ + const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list); + if ((gh->gh_state == LM_ST_EXCLUSIVE || + gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) + return 0; + if (gl->gl_state == gh->gh_state) + return 1; + if (gh->gh_flags & GL_EXACT) + return 0; + if (gl->gl_state == LM_ST_EXCLUSIVE) { + if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) + return 1; + if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) + return 1; + } + if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) + return 1; + return 0; +} + +static void gfs2_holder_wake(struct gfs2_holder *gh) +{ + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb__after_atomic(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); +} + +/** + * do_error - Something unexpected has happened during a lock request + * + */ + +static inline void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + } +} + +/** + * do_promote - promote as many requests as possible on the current queue + * @gl: The glock + * + * Returns: 1 if there is a blocked holder at the head of the list, or 2 + * if a type specific operation is underway. + */ + +static int do_promote(struct gfs2_glock *gl) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh, *tmp; + int ret; + +restart: + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (may_grant(gl, gh)) { + if (gh->gh_list.prev == &gl->gl_holders && + glops->go_lock) { + spin_unlock(&gl->gl_spin); + /* FIXME: eliminate this eventually */ + ret = glops->go_lock(gh); + spin_lock(&gl->gl_spin); + if (ret) { + if (ret == 1) + return 2; + gh->gh_error = ret; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 1); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 0); + gfs2_holder_wake(gh); + continue; + } + if (gh->gh_list.prev == &gl->gl_holders) + return 1; + do_error(gl, 0); + break; + } + return 0; +} + +/** + * find_first_waiter - find the first gh that's waiting for the glock + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state the new state + * + */ + +static void state_change(struct gfs2_glock *gl, unsigned int new_state) +{ + int held1, held2; + + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); + + if (held1 != held2) { + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + if (held2) + gl->gl_lockref.count++; + else + gl->gl_lockref.count--; + } + if (held1 && held2 && list_empty(&gl->gl_holders)) + clear_bit(GLF_QUEUED, &gl->gl_flags); + + if (new_state != gl->gl_target) + /* shorten our minimum hold time */ + gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, + GL_GLOCK_MIN_HOLD); + gl->gl_state = new_state; + gl->gl_tchange = jiffies; +} + +static void gfs2_demote_wake(struct gfs2_glock *gl) +{ + gl->gl_demote_state = LM_ST_EXCLUSIVE; + clear_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb__after_atomic(); + wake_up_bit(&gl->gl_flags, GLF_DEMOTE); +} + +/** + * finish_xmote - The DLM has replied to one of our lock requests + * @gl: The glock + * @ret: The status from the DLM + * + */ + +static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh; + unsigned state = ret & LM_OUT_ST_MASK; + int rv; + + spin_lock(&gl->gl_spin); + trace_gfs2_glock_state_change(gl, state); + state_change(gl, state); + gh = find_first_waiter(gl); + + /* Demote to UN request arrived during demote to SH or DF */ + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) + gl->gl_target = LM_ST_UNLOCKED; + + /* Check for state != intended state */ + if (unlikely(state != gl->gl_target)) { + if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + /* move to back of queue and try next entry */ + if (ret & LM_OUT_CANCELED) { + if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + goto retry; + } + /* Some error or failed "try lock" - report it */ + if ((ret & LM_OUT_ERROR) || + (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { + gl->gl_target = gl->gl_state; + do_error(gl, ret); + goto out; + } + } + switch(state) { + /* Unlocked due to conversion deadlock, try again */ + case LM_ST_UNLOCKED: +retry: + do_xmote(gl, gh, gl->gl_target); + break; + /* Conversion fails, unlock and try again */ + case LM_ST_SHARED: + case LM_ST_DEFERRED: + do_xmote(gl, gh, LM_ST_UNLOCKED); + break; + default: /* Everything else */ + pr_err("wanted %u got %u\n", gl->gl_target, state); + GLOCK_BUG_ON(gl, 1); + } + spin_unlock(&gl->gl_spin); + return; + } + + /* Fast path - we got what we asked for */ + if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (state != LM_ST_UNLOCKED) { + if (glops->go_xmote_bh) { + spin_unlock(&gl->gl_spin); + rv = glops->go_xmote_bh(gl, gh); + spin_lock(&gl->gl_spin); + if (rv) { + do_error(gl, rv); + goto out; + } + } + rv = do_promote(gl); + if (rv == 2) + goto out_locked; + } +out: + clear_bit(GLF_LOCK, &gl->gl_flags); +out_locked: + spin_unlock(&gl->gl_spin); +} + +/** + * do_xmote - Calls the DLM to change the state of a lock + * @gl: The lock state + * @gh: The holder (only for promotes) + * @target: The target lock state + * + */ + +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int lck_flags = gh ? gh->gh_flags : 0; + int ret; + + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | + LM_FLAG_PRIORITY); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); + if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && + glops->go_inval) { + set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + do_error(gl, 0); /* Fail queued try locks */ + } + gl->gl_req = target; + set_bit(GLF_BLOCKING, &gl->gl_flags); + if ((gl->gl_req == LM_ST_UNLOCKED) || + (gl->gl_state == LM_ST_EXCLUSIVE) || + (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) + clear_bit(GLF_BLOCKING, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + if (glops->go_sync) + glops->go_sync(gl); + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + + gfs2_glock_hold(gl); + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + if (ret) { + pr_err("lm_lock ret %d\n", ret); + GLOCK_BUG_ON(gl, 1); + } + } else { /* lock_nolock */ + finish_xmote(gl, target); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } + + spin_lock(&gl->gl_spin); +} + +/** + * find_first_holder - find the first "holder" gh + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * run_queue - do all outstanding tasks related to a glock + * @gl: The glock in question + * @nonblock: True if we must not block in run_queue + * + */ + +static void run_queue(struct gfs2_glock *gl, const int nonblock) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + struct gfs2_holder *gh = NULL; + int ret; + + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + return; + + GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); + + if (test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_demote_state != gl->gl_state) { + if (find_first_holder(gl)) + goto out_unlock; + if (nonblock) + goto out_sched; + set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); + gl->gl_target = gl->gl_demote_state; + } else { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + gfs2_demote_wake(gl); + ret = do_promote(gl); + if (ret == 0) + goto out_unlock; + if (ret == 2) + goto out; + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + } + do_xmote(gl, gh, gl->gl_target); +out: + return; + +out_sched: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_atomic(); + gl->gl_lockref.count++; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gl->gl_lockref.count--; + return; + +out_unlock: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_atomic(); + return; +} + +static void delete_work_func(struct work_struct *work) +{ + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip; + struct inode *inode; + u64 no_addr = gl->gl_name.ln_number; + + ip = gl->gl_object; + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ + + if (ip) + inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1); + else + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (inode && !IS_ERR(inode)) { + d_prune_aliases(inode); + iput(inode); + } + gfs2_glock_put(gl); +} + +static void glock_work_func(struct work_struct *work) +{ + unsigned long delay = 0; + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + int drop_ref = 0; + + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { + finish_xmote(gl, gl->gl_reply); + drop_ref = 1; + } + spin_lock(&gl->gl_spin); + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + gl->gl_state != LM_ST_UNLOCKED && + gl->gl_demote_state != LM_ST_EXCLUSIVE) { + unsigned long holdtime, now = jiffies; + + holdtime = gl->gl_tchange + gl->gl_hold_time; + if (time_before(now, holdtime)) + delay = holdtime - now; + + if (!delay) { + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + set_bit(GLF_DEMOTE, &gl->gl_flags); + } + } + run_queue(gl, 0); + spin_unlock(&gl->gl_spin); + if (!delay) + gfs2_glock_put(gl); + else { + if (gl->gl_name.ln_type != LM_TYPE_INODE) + delay = 0; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); + } + if (drop_ref) + gfs2_glock_put(gl); +} + +/** + * gfs2_glock_get() - Get a glock, or create one if one doesn't exist + * @sdp: The GFS2 superblock + * @number: the lock number + * @glops: The glock_operations to use + * @create: If 0, don't create the glock if it doesn't exist + * @glp: the glock is returned here + * + * This does not lock a glock, just finds/creates structures for one. + * + * Returns: errno + */ + +int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, int create, + struct gfs2_glock **glp) +{ + struct super_block *s = sdp->sd_vfs; + struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; + struct gfs2_glock *gl, *tmp; + unsigned int hash = gl_hash(sdp, &name); + struct address_space *mapping; + struct kmem_cache *cachep; + + rcu_read_lock(); + gl = search_bucket(hash, sdp, &name); + rcu_read_unlock(); + + *glp = gl; + if (gl) + return 0; + if (!create) + return -ENOENT; + + if (glops->go_flags & GLOF_ASPACE) + cachep = gfs2_glock_aspace_cachep; + else + cachep = gfs2_glock_cachep; + gl = kmem_cache_alloc(cachep, GFP_NOFS); + if (!gl) + return -ENOMEM; + + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + + if (glops->go_flags & GLOF_LVB) { + gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); + if (!gl->gl_lksb.sb_lvbptr) { + kmem_cache_free(cachep, gl); + return -ENOMEM; + } + } + + atomic_inc(&sdp->sd_glock_disposal); + gl->gl_sbd = sdp; + gl->gl_flags = 0; + gl->gl_name = name; + gl->gl_lockref.count = 1; + gl->gl_state = LM_ST_UNLOCKED; + gl->gl_target = LM_ST_UNLOCKED; + gl->gl_demote_state = LM_ST_EXCLUSIVE; + gl->gl_hash = hash; + gl->gl_ops = glops; + gl->gl_dstamp = ktime_set(0, 0); + preempt_disable(); + /* We use the global stats to estimate the initial per-glock stats */ + gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; + preempt_enable(); + gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; + gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; + gl->gl_tchange = jiffies; + gl->gl_object = NULL; + gl->gl_hold_time = GL_GLOCK_DFT_HOLD; + INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); + INIT_WORK(&gl->gl_delete, delete_work_func); + + mapping = gfs2_glock2aspace(gl); + if (mapping) { + mapping->a_ops = &gfs2_meta_aops; + mapping->host = s->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->writeback_index = 0; + } + + spin_lock_bucket(hash); + tmp = search_bucket(hash, sdp, &name); + if (tmp) { + spin_unlock_bucket(hash); + kfree(gl->gl_lksb.sb_lvbptr); + kmem_cache_free(cachep, gl); + atomic_dec(&sdp->sd_glock_disposal); + gl = tmp; + } else { + hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]); + spin_unlock_bucket(hash); + } + + *glp = gl; + + return 0; +} + +/** + * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + */ + +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh) +{ + INIT_LIST_HEAD(&gh->gh_list); + gh->gh_gl = gl; + gh->gh_ip = _RET_IP_; + gh->gh_owner_pid = get_pid(task_pid(current)); + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_error = 0; + gh->gh_iflags = 0; + gfs2_glock_hold(gl); +} + +/** + * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Don't mess with the glock. + * + */ + +void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) +{ + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_iflags = 0; + gh->gh_ip = _RET_IP_; + put_pid(gh->gh_owner_pid); + gh->gh_owner_pid = get_pid(task_pid(current)); +} + +/** + * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) + * @gh: the holder structure + * + */ + +void gfs2_holder_uninit(struct gfs2_holder *gh) +{ + put_pid(gh->gh_owner_pid); + gfs2_glock_put(gh->gh_gl); + gh->gh_gl = NULL; + gh->gh_ip = 0; +} + +/** + * gfs2_glock_wait - wait on a glock acquisition + * @gh: the glock holder + * + * Returns: 0 on success + */ + +int gfs2_glock_wait(struct gfs2_holder *gh) +{ + unsigned long time1 = jiffies; + + might_sleep(); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); + if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ + /* Lengthen the minimum hold time. */ + gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + + GL_GLOCK_HOLD_INCR, + GL_GLOCK_MAX_HOLD); + return gh->gh_error; +} + +/** + * handle_callback - process a demote request + * @gl: the glock + * @state: the state the caller wants us to change to + * + * There are only two requests that we are going to see in actual + * practise: LM_ST_SHARED and LM_ST_UNLOCKED + */ + +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay, bool remote) +{ + int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; + + set_bit(bit, &gl->gl_flags); + if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { + gl->gl_demote_state = state; + gl->gl_demote_time = jiffies; + } else if (gl->gl_demote_state != LM_ST_UNLOCKED && + gl->gl_demote_state != state) { + gl->gl_demote_state = LM_ST_UNLOCKED; + } + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl, remote); + trace_gfs2_demote_rq(gl, remote); +} + +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + if (seq) { + seq_vprintf(seq, fmt, args); + } else { + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("%pV", &vaf); + } + + va_end(args); +} + +/** + * add_to_queue - Add a holder to the wait queue (but look for recursion) + * @gh: the holder structure to add + * + * Eventually we should move the recursive locking trap to a + * debugging option or something like that. This is the fast + * path and needs to have the minimum number of distractions. + * + */ + +static inline void add_to_queue(struct gfs2_holder *gh) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *insert_pt = NULL; + struct gfs2_holder *gh2; + int try_futile = 0; + + BUG_ON(gh->gh_owner_pid == NULL); + if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) + BUG(); + + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) + try_futile = !may_grant(gl, gh); + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + goto fail; + } + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && + (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) + goto trap_recursive; + if (try_futile && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { +fail: + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; + } + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) + insert_pt = &gh2->gh_list; + } + set_bit(GLF_QUEUED, &gl->gl_flags); + trace_gfs2_glock_queue(gh, 1); + gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); + if (likely(insert_pt == NULL)) { + list_add_tail(&gh->gh_list, &gl->gl_holders); + if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) + goto do_cancel; + return; + } + list_add_tail(&gh->gh_list, insert_pt); +do_cancel: + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { + spin_unlock(&gl->gl_spin); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); + spin_lock(&gl->gl_spin); + } + return; + +trap_recursive: + pr_err("original: %pSR\n", (void *)gh2->gh_ip); + pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid)); + pr_err("lock type: %d req lock state : %d\n", + gh2->gh_gl->gl_name.ln_type, gh2->gh_state); + pr_err("new: %pSR\n", (void *)gh->gh_ip); + pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid)); + pr_err("lock type: %d req lock state : %d\n", + gh->gh_gl->gl_name.ln_type, gh->gh_state); + gfs2_dump_glock(NULL, gl); + BUG(); +} + +/** + * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) + * @gh: the holder structure + * + * if (gh->gh_flags & GL_ASYNC), this never returns an error + * + * Returns: 0, GLR_TRYFAILED, or errno on failure + */ + +int gfs2_glock_nq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + int error = 0; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + if (test_bit(GLF_LRU, &gl->gl_flags)) + gfs2_glock_remove_from_lru(gl); + + spin_lock(&gl->gl_spin); + add_to_queue(gh); + if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gl->gl_lockref.count++; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gl->gl_lockref.count--; + } + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); + + if (!(gh->gh_flags & GL_ASYNC)) + error = gfs2_glock_wait(gh); + + return error; +} + +/** + * gfs2_glock_poll - poll to see if an async request has been completed + * @gh: the holder + * + * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on + */ + +int gfs2_glock_poll(struct gfs2_holder *gh) +{ + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; +} + +/** + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) + * @gh: the glock holder + * + */ + +void gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned delay = 0; + int fast_path = 0; + + spin_lock(&gl->gl_spin); + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + + list_del_init(&gh->gh_list); + if (find_first_holder(gl) == NULL) { + if (glops->go_unlock) { + GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); + spin_unlock(&gl->gl_spin); + glops->go_unlock(gh); + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + } + if (list_empty(&gl->gl_holders) && + !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + fast_path = 1; + } + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) + gfs2_glock_add_to_lru(gl); + + trace_gfs2_glock_queue(gh, 0); + spin_unlock(&gl->gl_spin); + if (likely(fast_path)) + return; + + gfs2_glock_hold(gl); + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) + delay = gl->gl_hold_time; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); +} + +void gfs2_glock_dq_wait(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + gfs2_glock_dq(gh); + might_sleep(); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); +} + +/** + * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it + * @gh: the holder structure + * + */ + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh) +{ + gfs2_glock_dq(gh); + gfs2_holder_uninit(gh); +} + +/** + * gfs2_glock_nq_num - acquire a glock based on lock number + * @sdp: the filesystem + * @number: the lock number + * @glops: the glock operations for the type of glock + * @state: the state to acquire the glock in + * @flags: modifier flags for the acquisition + * @gh: the struct gfs2_holder + * + * Returns: errno + */ + +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh) +{ + struct gfs2_glock *gl; + int error; + + error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); + if (!error) { + error = gfs2_glock_nq_init(gl, state, flags, gh); + gfs2_glock_put(gl); + } + + return error; +} + +/** + * glock_compare - Compare two struct gfs2_glock structures for sorting + * @arg_a: the first structure + * @arg_b: the second structure + * + */ + +static int glock_compare(const void *arg_a, const void *arg_b) +{ + const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; + const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; + const struct lm_lockname *a = &gh_a->gh_gl->gl_name; + const struct lm_lockname *b = &gh_b->gh_gl->gl_name; + + if (a->ln_number > b->ln_number) + return 1; + if (a->ln_number < b->ln_number) + return -1; + BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); + return 0; +} + +/** + * nq_m_sync - synchonously acquire more than one glock in deadlock free order + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, + struct gfs2_holder **p) +{ + unsigned int x; + int error = 0; + + for (x = 0; x < num_gh; x++) + p[x] = &ghs[x]; + + sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); + + for (x = 0; x < num_gh; x++) { + p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + + error = gfs2_glock_nq(p[x]); + if (error) { + while (x--) + gfs2_glock_dq(p[x]); + break; + } + } + + return error; +} + +/** + * gfs2_glock_nq_m - acquire multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_holder *tmp[4]; + struct gfs2_holder **pph = tmp; + int error = 0; + + switch(num_gh) { + case 0: + return 0; + case 1: + ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + return gfs2_glock_nq(ghs); + default: + if (num_gh <= 4) + break; + pph = kmalloc(num_gh * sizeof(struct gfs2_holder *), GFP_NOFS); + if (!pph) + return -ENOMEM; + } + + error = nq_m_sync(num_gh, ghs, pph); + + if (pph != tmp) + kfree(pph); + + return error; +} + +/** + * gfs2_glock_dq_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + while (num_gh--) + gfs2_glock_dq(&ghs[num_gh]); +} + +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) +{ + unsigned long delay = 0; + unsigned long holdtime; + unsigned long now = jiffies; + + gfs2_glock_hold(gl); + holdtime = gl->gl_tchange + gl->gl_hold_time; + if (test_bit(GLF_QUEUED, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) { + if (time_before(now, holdtime)) + delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_hold_time; + } + + spin_lock(&gl->gl_spin); + handle_callback(gl, state, delay, true); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); +} + +/** + * gfs2_should_freeze - Figure out if glock should be frozen + * @gl: The glock in question + * + * Glocks are not frozen if (a) the result of the dlm operation is + * an error, (b) the locking operation was an unlock operation or + * (c) if there is a "noexp" flagged request anywhere in the queue + * + * Returns: 1 if freezing should occur, 0 otherwise + */ + +static int gfs2_should_freeze(const struct gfs2_glock *gl) +{ + const struct gfs2_holder *gh; + + if (gl->gl_reply & ~LM_OUT_ST_MASK) + return 0; + if (gl->gl_target == LM_ST_UNLOCKED) + return 0; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (LM_FLAG_NOEXP & gh->gh_flags) + return 0; + } + + return 1; +} + +/** + * gfs2_glock_complete - Callback used by locking + * @gl: Pointer to the glock + * @ret: The return value from the dlm + * + * The gl_reply field is under the gl_spin lock so that it is ok + * to use a bitfield shared with other glock state fields. + */ + +void gfs2_glock_complete(struct gfs2_glock *gl, int ret) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + + spin_lock(&gl->gl_spin); + gl->gl_reply = ret; + + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { + if (gfs2_should_freeze(gl)) { + set_bit(GLF_FROZEN, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + return; + } + } + + gl->gl_lockref.count++; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +} + +static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_glock *gla, *glb; + + gla = list_entry(a, struct gfs2_glock, gl_lru); + glb = list_entry(b, struct gfs2_glock, gl_lru); + + if (gla->gl_name.ln_number > glb->gl_name.ln_number) + return 1; + if (gla->gl_name.ln_number < glb->gl_name.ln_number) + return -1; + + return 0; +} + +/** + * gfs2_dispose_glock_lru - Demote a list of glocks + * @list: The list to dispose of + * + * Disposing of glocks may involve disk accesses, so that here we sort + * the glocks by number (i.e. disk location of the inodes) so that if + * there are any such accesses, they'll be sent in order (mostly). + * + * Must be called under the lru_lock, but may drop and retake this + * lock. While the lru_lock is dropped, entries may vanish from the + * list, but no new entries will appear on the list (since it is + * private) + */ + +static void gfs2_dispose_glock_lru(struct list_head *list) +__releases(&lru_lock) +__acquires(&lru_lock) +{ + struct gfs2_glock *gl; + + list_sort(NULL, list, glock_cmp); + + while(!list_empty(list)) { + gl = list_entry(list->next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + if (!spin_trylock(&gl->gl_spin)) { +add_back_to_lru: + list_add(&gl->gl_lru, &lru_list); + atomic_inc(&lru_count); + continue; + } + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + spin_unlock(&gl->gl_spin); + goto add_back_to_lru; + } + clear_bit(GLF_LRU, &gl->gl_flags); + gl->gl_lockref.count++; + if (demote_ok(gl)) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gl->gl_lockref.count--; + spin_unlock(&gl->gl_spin); + cond_resched_lock(&lru_lock); + } +} + +/** + * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote + * @nr: The number of entries to scan + * + * This function selects the entries on the LRU which are able to + * be demoted, and then kicks off the process by calling + * gfs2_dispose_glock_lru() above. + */ + +static long gfs2_scan_glock_lru(int nr) +{ + struct gfs2_glock *gl; + LIST_HEAD(skipped); + LIST_HEAD(dispose); + long freed = 0; + + spin_lock(&lru_lock); + while ((nr-- >= 0) && !list_empty(&lru_list)) { + gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); + + /* Test for being demotable */ + if (!test_bit(GLF_LOCK, &gl->gl_flags)) { + list_move(&gl->gl_lru, &dispose); + atomic_dec(&lru_count); + freed++; + continue; + } + + list_move(&gl->gl_lru, &skipped); + } + list_splice(&skipped, &lru_list); + if (!list_empty(&dispose)) + gfs2_dispose_glock_lru(&dispose); + spin_unlock(&lru_lock); + + return freed; +} + +static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + return gfs2_scan_glock_lru(sc->nr_to_scan); +} + +static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_read(&lru_count)); +} + +static struct shrinker glock_shrinker = { + .seeks = DEFAULT_SEEKS, + .count_objects = gfs2_glock_shrink_count, + .scan_objects = gfs2_glock_shrink_scan, +}; + +/** + * examine_bucket - Call a function for glock in a hash bucket + * @examiner: the function + * @sdp: the filesystem + * @bucket: the bucket + * + */ + +static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, + unsigned int hash) +{ + struct gfs2_glock *gl; + struct hlist_bl_head *head = &gl_hash_table[hash]; + struct hlist_bl_node *pos; + + rcu_read_lock(); + hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { + if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) + examiner(gl); + } + rcu_read_unlock(); + cond_resched(); +} + +static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) +{ + unsigned x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(examiner, sdp, x); +} + + +/** + * thaw_glock - thaw out a glock which has an unprocessed reply waiting + * @gl: The glock to thaw + * + */ + +static void thaw_glock(struct gfs2_glock *gl) +{ + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + goto out; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) { +out: + gfs2_glock_put(gl); + } +} + +/** + * clear_glock - look at a glock and see if we can free it from glock cache + * @gl: the glock to look at + * + */ + +static void clear_glock(struct gfs2_glock *gl) +{ + gfs2_glock_remove_from_lru(gl); + + spin_lock(&gl->gl_spin); + if (gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +} + +/** + * gfs2_glock_thaw - Thaw any frozen glocks + * @sdp: The super block + * + */ + +void gfs2_glock_thaw(struct gfs2_sbd *sdp) +{ + glock_hash_walk(thaw_glock, sdp); +} + +static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_spin); + gfs2_dump_glock(seq, gl); + spin_unlock(&gl->gl_spin); +} + +static void dump_glock_func(struct gfs2_glock *gl) +{ + dump_glock(NULL, gl); +} + +/** + * gfs2_gl_hash_clear - Empty out the glock hash table + * @sdp: the filesystem + * @wait: wait until it's all gone + * + * Called when unmounting the filesystem. + */ + +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) +{ + set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); + flush_workqueue(glock_workqueue); + glock_hash_walk(clear_glock, sdp); + flush_workqueue(glock_workqueue); + wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); + glock_hash_walk(dump_glock_func, sdp); +} + +void gfs2_glock_finish_truncate(struct gfs2_inode *ip) +{ + struct gfs2_glock *gl = ip->i_gl; + int ret; + + ret = gfs2_truncatei_resume(ip); + gfs2_assert_withdraw(gl->gl_sbd, ret == 0); + + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); +} + +static const char *state2str(unsigned state) +{ + switch(state) { + case LM_ST_UNLOCKED: + return "UN"; + case LM_ST_SHARED: + return "SH"; + case LM_ST_DEFERRED: + return "DF"; + case LM_ST_EXCLUSIVE: + return "EX"; + } + return "??"; +} + +static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) +{ + char *p = buf; + if (flags & LM_FLAG_TRY) + *p++ = 't'; + if (flags & LM_FLAG_TRY_1CB) + *p++ = 'T'; + if (flags & LM_FLAG_NOEXP) + *p++ = 'e'; + if (flags & LM_FLAG_ANY) + *p++ = 'A'; + if (flags & LM_FLAG_PRIORITY) + *p++ = 'p'; + if (flags & GL_ASYNC) + *p++ = 'a'; + if (flags & GL_EXACT) + *p++ = 'E'; + if (flags & GL_NOCACHE) + *p++ = 'c'; + if (test_bit(HIF_HOLDER, &iflags)) + *p++ = 'H'; + if (test_bit(HIF_WAIT, &iflags)) + *p++ = 'W'; + if (test_bit(HIF_FIRST, &iflags)) + *p++ = 'F'; + *p = 0; + return buf; +} + +/** + * dump_holder - print information about a glock holder + * @seq: the seq_file struct + * @gh: the glock holder + * + */ + +static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) +{ + struct task_struct *gh_owner = NULL; + char flags_buf[32]; + + rcu_read_lock(); + if (gh->gh_owner_pid) + gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? gh_owner->comm : "(ended)", + (void *)gh->gh_ip); + rcu_read_unlock(); +} + +static const char *gflags2str(char *buf, const struct gfs2_glock *gl) +{ + const unsigned long *gflags = &gl->gl_flags; + char *p = buf; + + if (test_bit(GLF_LOCK, gflags)) + *p++ = 'l'; + if (test_bit(GLF_DEMOTE, gflags)) + *p++ = 'D'; + if (test_bit(GLF_PENDING_DEMOTE, gflags)) + *p++ = 'd'; + if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) + *p++ = 'p'; + if (test_bit(GLF_DIRTY, gflags)) + *p++ = 'y'; + if (test_bit(GLF_LFLUSH, gflags)) + *p++ = 'f'; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) + *p++ = 'i'; + if (test_bit(GLF_REPLY_PENDING, gflags)) + *p++ = 'r'; + if (test_bit(GLF_INITIAL, gflags)) + *p++ = 'I'; + if (test_bit(GLF_FROZEN, gflags)) + *p++ = 'F'; + if (test_bit(GLF_QUEUED, gflags)) + *p++ = 'q'; + if (test_bit(GLF_LRU, gflags)) + *p++ = 'L'; + if (gl->gl_object) + *p++ = 'o'; + if (test_bit(GLF_BLOCKING, gflags)) + *p++ = 'b'; + *p = 0; + return buf; +} + +/** + * gfs2_dump_glock - print information about a glock + * @seq: The seq_file struct + * @gl: the glock + * + * The file format is as follows: + * One line per object, capital letters are used to indicate objects + * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, + * other objects are indented by a single space and follow the glock to + * which they are related. Fields are indicated by lower case letters + * followed by a colon and the field value, except for strings which are in + * [] so that its possible to see if they are composed of spaces for + * example. The field's are n = number (id of the object), f = flags, + * t = type, s = state, r = refcount, e = error, p = pid. + * + */ + +void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned long long dtime; + const struct gfs2_holder *gh; + char gflags_buf[32]; + + dtime = jiffies - gl->gl_demote_time; + dtime *= 1000000/HZ; /* demote time in uSec */ + if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) + dtime = 0; + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n", + state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, gl), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_revokes), + (int)gl->gl_lockref.count, gl->gl_hold_time); + + list_for_each_entry(gh, &gl->gl_holders, gh_list) + dump_holder(seq, gh); + + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) + glops->go_dump(seq, gl); +} + +static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock *gl = iter_ptr; + + seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + (long long)gl->gl_stats.stats[GFS2_LKS_SRTT], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], + (long long)gl->gl_stats.stats[GFS2_LKS_SIRT], + (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], + (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], + (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); + return 0; +} + +static const char *gfs2_gltype[] = { + "type", + "reserved", + "nondisk", + "inode", + "rgrp", + "meta", + "iopen", + "flock", + "plock", + "quota", + "journal", +}; + +static const char *gfs2_stype[] = { + [GFS2_LKS_SRTT] = "srtt", + [GFS2_LKS_SRTTVAR] = "srttvar", + [GFS2_LKS_SRTTB] = "srttb", + [GFS2_LKS_SRTTVARB] = "srttvarb", + [GFS2_LKS_SIRT] = "sirt", + [GFS2_LKS_SIRTVAR] = "sirtvar", + [GFS2_LKS_DCOUNT] = "dlm", + [GFS2_LKS_QCOUNT] = "queue", +}; + +#define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype)) + +static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock_iter *gi = seq->private; + struct gfs2_sbd *sdp = gi->sdp; + unsigned index = gi->hash >> 3; + unsigned subindex = gi->hash & 0x07; + s64 value; + int i; + + if (index == 0 && subindex != 0) + return 0; + + seq_printf(seq, "%-10s %8s:", gfs2_gltype[index], + (index == 0) ? "cpu": gfs2_stype[subindex]); + + for_each_possible_cpu(i) { + const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); + if (index == 0) { + value = i; + } else { + value = lkstats->lkstats[index - 1].stats[subindex]; + } + seq_printf(seq, " %15lld", (long long)value); + } + seq_putc(seq, '\n'); + return 0; +} + +int __init gfs2_glock_init(void) +{ + unsigned i; + for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { + INIT_HLIST_BL_HEAD(&gl_hash_table[i]); + } + + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZABLE, 0); + if (!glock_workqueue) + return -ENOMEM; + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", + WQ_MEM_RECLAIM | WQ_FREEZABLE, + 0); + if (!gfs2_delete_workqueue) { + destroy_workqueue(glock_workqueue); + return -ENOMEM; + } + + register_shrinker(&glock_shrinker); + + return 0; +} + +void gfs2_glock_exit(void) +{ + unregister_shrinker(&glock_shrinker); + destroy_workqueue(glock_workqueue); + destroy_workqueue(gfs2_delete_workqueue); +} + +static inline struct gfs2_glock *glock_hash_chain(unsigned hash) +{ + return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]), + struct gfs2_glock, gl_list); +} + +static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl) +{ + return hlist_bl_entry(rcu_dereference(gl->gl_list.next), + struct gfs2_glock, gl_list); +} + +static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) +{ + struct gfs2_glock *gl; + + do { + gl = gi->gl; + if (gl) { + gi->gl = glock_hash_next(gl); + gi->nhash++; + } else { + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } + gi->gl = glock_hash_chain(gi->hash); + gi->nhash = 0; + } + while (gi->gl == NULL) { + gi->hash++; + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } + gi->gl = glock_hash_chain(gi->hash); + gi->nhash = 0; + } + /* Skip entries for other sb and dead entries */ + } while (gi->sdp != gi->gl->gl_sbd || + __lockref_is_dead(&gi->gl->gl_lockref)); + + return 0; +} + +static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + loff_t n = *pos; + + if (gi->last_pos <= *pos) + n = gi->nhash + (*pos - gi->last_pos); + else + gi->hash = 0; + + gi->nhash = 0; + rcu_read_lock(); + + do { + if (gfs2_glock_iter_next(gi)) + return NULL; + } while (n--); + + gi->last_pos = *pos; + return gi->gl; +} + +static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + + (*pos)++; + gi->last_pos = *pos; + if (gfs2_glock_iter_next(gi)) + return NULL; + + return gi->gl; +} + +static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock_iter *gi = seq->private; + + if (gi->gl) + rcu_read_unlock(); + gi->gl = NULL; +} + +static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) +{ + dump_glock(seq, iter_ptr); + return 0; +} + +static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + + gi->hash = *pos; + if (*pos >= GFS2_NR_SBSTATS) + return NULL; + preempt_disable(); + return SEQ_START_TOKEN; +} + +static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + (*pos)++; + gi->hash++; + if (gi->hash >= GFS2_NR_SBSTATS) { + preempt_enable(); + return NULL; + } + return SEQ_START_TOKEN; +} + +static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + preempt_enable(); +} + +static const struct seq_operations gfs2_glock_seq_ops = { + .start = gfs2_glock_seq_start, + .next = gfs2_glock_seq_next, + .stop = gfs2_glock_seq_stop, + .show = gfs2_glock_seq_show, +}; + +static const struct seq_operations gfs2_glstats_seq_ops = { + .start = gfs2_glock_seq_start, + .next = gfs2_glock_seq_next, + .stop = gfs2_glock_seq_stop, + .show = gfs2_glstats_seq_show, +}; + +static const struct seq_operations gfs2_sbstats_seq_ops = { + .start = gfs2_sbstats_seq_start, + .next = gfs2_sbstats_seq_next, + .stop = gfs2_sbstats_seq_stop, + .show = gfs2_sbstats_seq_show, +}; + +#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) + +static int gfs2_glocks_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_glock_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); + if (seq->buf) + seq->size = GFS2_SEQ_GOODSIZE; + } + return ret; +} + +static int gfs2_glstats_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_glstats_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); + if (seq->buf) + seq->size = GFS2_SEQ_GOODSIZE; + } + return ret; +} + +static int gfs2_sbstats_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_sbstats_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + } + return ret; +} + +static const struct file_operations gfs2_glocks_fops = { + .owner = THIS_MODULE, + .open = gfs2_glocks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations gfs2_glstats_fops = { + .owner = THIS_MODULE, + .open = gfs2_glstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations gfs2_sbstats_fops = { + .owner = THIS_MODULE, + .open = gfs2_sbstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) +{ + struct dentry *dent; + + dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root); + if (IS_ERR_OR_NULL(dent)) + goto fail; + sdp->debugfs_dir = dent; + + dent = debugfs_create_file("glocks", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_glocks_fops); + if (IS_ERR_OR_NULL(dent)) + goto fail; + sdp->debugfs_dentry_glocks = dent; + + dent = debugfs_create_file("glstats", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_glstats_fops); + if (IS_ERR_OR_NULL(dent)) + goto fail; + sdp->debugfs_dentry_glstats = dent; + + dent = debugfs_create_file("sbstats", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_sbstats_fops); + if (IS_ERR_OR_NULL(dent)) + goto fail; + sdp->debugfs_dentry_sbstats = dent; + + return 0; +fail: + gfs2_delete_debugfs_file(sdp); + return dent ? PTR_ERR(dent) : -ENOMEM; +} + +void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) +{ + if (sdp->debugfs_dir) { + if (sdp->debugfs_dentry_glocks) { + debugfs_remove(sdp->debugfs_dentry_glocks); + sdp->debugfs_dentry_glocks = NULL; + } + if (sdp->debugfs_dentry_glstats) { + debugfs_remove(sdp->debugfs_dentry_glstats); + sdp->debugfs_dentry_glstats = NULL; + } + if (sdp->debugfs_dentry_sbstats) { + debugfs_remove(sdp->debugfs_dentry_sbstats); + sdp->debugfs_dentry_sbstats = NULL; + } + debugfs_remove(sdp->debugfs_dir); + sdp->debugfs_dir = NULL; + } +} + +int gfs2_register_debugfs(void) +{ + gfs2_root = debugfs_create_dir("gfs2", NULL); + if (IS_ERR(gfs2_root)) + return PTR_ERR(gfs2_root); + return gfs2_root ? 0 : -ENOMEM; +} + +void gfs2_unregister_debugfs(void) +{ + debugfs_remove(gfs2_root); + gfs2_root = NULL; +} diff --git a/kernel/fs/gfs2/glock.h b/kernel/fs/gfs2/glock.h new file mode 100644 index 000000000..32572f71f --- /dev/null +++ b/kernel/fs/gfs2/glock.h @@ -0,0 +1,250 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GLOCK_DOT_H__ +#define __GLOCK_DOT_H__ + +#include +#include +#include "incore.h" + +/* Options for hostdata parser */ + +enum { + Opt_jid, + Opt_id, + Opt_first, + Opt_nodir, + Opt_err, +}; + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK 0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL 0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE 1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. + * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. + * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_PRIORITY + * Override fairness considerations. Suppose a lock is held in a shared state + * and there is a pending request for the deferred state. A shared lock + * request with the priority flag would be allowed to bypass the deferred + * request and directly join the other shared lock. A shared lock request + * without the priority flag might be forced to wait until the deferred + * requested had acquired and released the lock. + */ + +#define LM_FLAG_TRY 0x00000001 +#define LM_FLAG_TRY_1CB 0x00000002 +#define LM_FLAG_NOEXP 0x00000004 +#define LM_FLAG_ANY 0x00000008 +#define LM_FLAG_PRIORITY 0x00000010 +#define GL_ASYNC 0x00000040 +#define GL_EXACT 0x00000080 +#define GL_SKIP 0x00000100 +#define GL_NOCACHE 0x00000400 + +/* + * lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CANCELED + * The lock request was canceled. + * + */ + +#define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_CANCELED 0x00000008 +#define LM_OUT_ERROR 0x00000004 + +/* + * lm_recovery_done() messages + */ + +#define LM_RD_GAVEUP 308 +#define LM_RD_SUCCESS 309 + +#define GLR_TRYFAILED 13 + +#define GL_GLOCK_MAX_HOLD (long)(HZ / 5) +#define GL_GLOCK_DFT_HOLD (long)(HZ / 5) +#define GL_GLOCK_MIN_HOLD (long)(10) +#define GL_GLOCK_HOLD_INCR (long)(HZ / 20) +#define GL_GLOCK_HOLD_DECR (long)(HZ / 40) + +struct lm_lockops { + const char *lm_proto_name; + int (*lm_mount) (struct gfs2_sbd *sdp, const char *table); + void (*lm_first_done) (struct gfs2_sbd *sdp); + void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result); + void (*lm_unmount) (struct gfs2_sbd *sdp); + void (*lm_withdraw) (struct gfs2_sbd *sdp); + void (*lm_put_lock) (struct gfs2_glock *gl); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); + void (*lm_cancel) (struct gfs2_glock *gl); + const match_table_t *lm_tokens; +}; + +extern struct workqueue_struct *gfs2_delete_workqueue; +static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + struct pid *pid; + + /* Look in glock's list of holders for one with current task as owner */ + spin_lock(&gl->gl_spin); + pid = task_pid(current); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + break; + if (gh->gh_owner_pid == pid) + goto out; + } + gh = NULL; +out: + spin_unlock(&gl->gl_spin); + + return gh; +} + +static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_EXCLUSIVE; +} + +static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_DEFERRED; +} + +static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_SHARED; +} + +static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) +{ + if (gl->gl_ops->go_flags & GLOF_ASPACE) + return (struct address_space *)(gl + 1); + return NULL; +} + +extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + unsigned flags, struct gfs2_holder *gh); +extern void gfs2_holder_reinit(unsigned int state, unsigned flags, + struct gfs2_holder *gh); +extern void gfs2_holder_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq(struct gfs2_holder *gh); +extern int gfs2_glock_poll(struct gfs2_holder *gh); +extern int gfs2_glock_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq(struct gfs2_holder *gh); +extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, + struct gfs2_holder *gh); +extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) +extern __printf(2, 3) +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); + +/** + * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Returns: 0, GLR_*, or errno + */ + +static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, + unsigned int state, int flags, + struct gfs2_holder *gh) +{ + int error; + + gfs2_holder_init(gl, state, flags, gh); + + error = gfs2_glock_nq(gh); + if (error) + gfs2_holder_uninit(gh); + + return error; +} + +extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); +extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); +extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); +extern void gfs2_glock_free(struct gfs2_glock *gl); + +extern int __init gfs2_glock_init(void); +extern void gfs2_glock_exit(void); + +extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +extern int gfs2_register_debugfs(void); +extern void gfs2_unregister_debugfs(void); + +extern const struct lm_lockops gfs2_dlm_ops; + +#endif /* __GLOCK_DOT_H__ */ diff --git a/kernel/fs/gfs2/glops.c b/kernel/fs/gfs2/glops.c new file mode 100644 index 000000000..fe91951c3 --- /dev/null +++ b/kernel/fs/gfs2/glops.c @@ -0,0 +1,616 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "util.h" +#include "trans.h" +#include "dir.h" + +struct workqueue_struct *gfs2_freeze_wq; + +static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) +{ + fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", + bh, (unsigned long long)bh->b_blocknr, bh->b_state, + bh->b_page->mapping, bh->b_page->flags); + fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n", + gl->gl_name.ln_type, gl->gl_name.ln_number, + gfs2_glock2aspace(gl)); + gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n"); +} + +/** + * __gfs2_ail_flush - remove all buffers for a given lock from the AIL + * @gl: the glock + * @fsync: set when called from fsync (not all buffers will be clean) + * + * None of the buffers should be dirty, locked, or pinned. + */ + +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, + unsigned int nr_revokes) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *head = &gl->gl_ail_list; + struct gfs2_bufdata *bd, *tmp; + struct buffer_head *bh; + const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) { + if (nr_revokes == 0) + break; + bh = bd->bd_bh; + if (bh->b_state & b_state) { + if (fsync) + continue; + gfs2_ail_error(gl, bh); + } + gfs2_trans_add_revoke(sdp, bd); + nr_revokes--; + } + GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); +} + + +static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_trans tr; + + memset(&tr, 0, sizeof(tr)); + INIT_LIST_HEAD(&tr.tr_buf); + INIT_LIST_HEAD(&tr.tr_databuf); + tr.tr_revokes = atomic_read(&gl->gl_ail_count); + + if (!tr.tr_revokes) + return; + + /* A shortened, inline version of gfs2_trans_begin() + * tr->alloced is not set since the transaction structure is + * on the stack */ + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_ip = _RET_IP_; + if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) + return; + WARN_ON_ONCE(current->journal_info); + current->journal_info = &tr; + + __gfs2_ail_flush(gl, 0, tr.tr_revokes); + + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); +} + +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int revokes = atomic_read(&gl->gl_ail_count); + unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + int ret; + + if (!revokes) + return; + + while (revokes > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + + ret = gfs2_trans_begin(sdp, 0, max_revokes); + if (ret) + return; + __gfs2_ail_flush(gl, fsync, max_revokes); + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); +} + +/** + * rgrp_go_sync - sync out the metadata for this glock + * @gl: the glock + * + * Called when demoting or unlocking an EX glock. We must flush + * to disk all dirty buffers/pages relating to this glock, and must not + * not return to caller to demote/unlock the glock until I/O is complete. + */ + +static void rgrp_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; + struct gfs2_rgrpd *rgd; + int error; + + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(sdp, gl, NORMAL_FLUSH); + filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + mapping_set_error(mapping, error); + gfs2_ail_empty_gl(gl); + + spin_lock(&gl->gl_spin); + rgd = gl->gl_object; + if (rgd) + gfs2_free_clones(rgd); + spin_unlock(&gl->gl_spin); +} + +/** + * rgrp_go_inval - invalidate the metadata for this glock + * @gl: the glock + * @flags: + * + * We never used LM_ST_DEFERRED with resource groups, so that we + * should always see the metadata flag set here. + * + */ + +static void rgrp_go_inval(struct gfs2_glock *gl, int flags) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; + + WARN_ON_ONCE(!(flags & DIO_METADATA)); + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + + if (gl->gl_object) { + struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + } +} + +/** + * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * @gl: the glock protecting the inode + * + */ + +static void inode_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error; + + if (ip && !S_ISREG(ip->i_inode.i_mode)) + ip = NULL; + if (ip) { + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + inode_dio_wait(&ip->i_inode); + } + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; + + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH); + filemap_fdatawrite(metamapping); + if (ip) { + struct address_space *mapping = ip->i_inode.i_mapping; + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + mapping_set_error(mapping, error); + } + error = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, error); + gfs2_ail_empty_gl(gl); + /* + * Writeback of the data mapping may cause the dirty flag to be set + * so we have to clear it again here. + */ + smp_mb__before_atomic(); + clear_bit(GLF_DIRTY, &gl->gl_flags); +} + +/** + * inode_go_inval - prepare a inode glock to be released + * @gl: the glock + * @flags: + * + * Normally we invalidate everything, but if we are moving into + * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we + * can keep hold of the metadata, since it won't have changed. + * + */ + +static void inode_go_inval(struct gfs2_glock *gl, int flags) +{ + struct gfs2_inode *ip = gl->gl_object; + + gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + + if (flags & DIO_METADATA) { + struct address_space *mapping = gfs2_glock2aspace(gl); + truncate_inode_pages(mapping, 0); + if (ip) { + set_bit(GIF_INVALID, &ip->i_flags); + forget_all_cached_acls(&ip->i_inode); + gfs2_dir_hash_inval(ip); + } + } + + if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH); + gl->gl_sbd->sd_rindex_uptodate = 0; + } + if (ip && S_ISREG(ip->i_inode.i_mode)) + truncate_inode_pages(ip->i_inode.i_mapping, 0); +} + +/** + * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int inode_go_demote_ok(const struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_holder *gh; + + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) + return 0; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (gh->gh_list.next != &gl->gl_holders) + return 0; + } + + return 1; +} + +/** + * gfs2_set_nlink - Set the inode's link count based on on-disk info + * @inode: The inode in question + * @nlink: The link count + * + * If the link count has hit zero, it must never be raised, whatever the + * on-disk inode might say. When new struct inodes are created the link + * count is set to 1, so that we can safely use this test even when reading + * in on disk information for the first time. + */ + +static void gfs2_set_nlink(struct inode *inode, u32 nlink) +{ + /* + * We will need to review setting the nlink count here in the + * light of the forthcoming ro bind mount work. This is a reminder + * to do that. + */ + if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { + if (nlink == 0) + clear_nlink(inode); + else + set_nlink(inode, nlink); + } +} + +static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) +{ + const struct gfs2_dinode *str = buf; + struct timespec atime; + u16 height, depth; + + if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) + goto corrupt; + ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); + ip->i_inode.i_mode = be32_to_cpu(str->di_mode); + ip->i_inode.i_rdev = 0; + switch (ip->i_inode.i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); + break; + }; + + i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); + i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); + gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); + gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); + atime.tv_sec = be64_to_cpu(str->di_atime); + atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + ip->i_inode.i_atime = atime; + ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); + ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); + ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + + ip->i_goal = be64_to_cpu(str->di_goal_meta); + ip->i_generation = be64_to_cpu(str->di_generation); + + ip->i_diskflags = be32_to_cpu(str->di_flags); + ip->i_eattr = be64_to_cpu(str->di_eattr); + /* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */ + gfs2_set_inode_flags(&ip->i_inode); + height = be16_to_cpu(str->di_height); + if (unlikely(height > GFS2_MAX_META_HEIGHT)) + goto corrupt; + ip->i_height = (u8)height; + + depth = be16_to_cpu(str->di_depth); + if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) + goto corrupt; + ip->i_depth = (u8)depth; + ip->i_entries = be32_to_cpu(str->di_entries); + + if (S_ISREG(ip->i_inode.i_mode)) + gfs2_set_aops(&ip->i_inode); + + return 0; +corrupt: + gfs2_consist_inode(ip); + return -EIO; +} + +/** + * gfs2_inode_refresh - Refresh the incore copy of the dinode + * @ip: The GFS2 inode + * + * Returns: errno + */ + +int gfs2_inode_refresh(struct gfs2_inode *ip) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + error = gfs2_dinode_in(ip, dibh->b_data); + brelse(dibh); + clear_bit(GIF_INVALID, &ip->i_flags); + + return error; +} + +/** + * inode_go_lock - operation done after an inode lock is locked by a process + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int inode_go_lock(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = gl->gl_object; + int error = 0; + + if (!ip || (gh->gh_flags & GL_SKIP)) + return 0; + + if (test_bit(GIF_INVALID, &ip->i_flags)) { + error = gfs2_inode_refresh(ip); + if (error) + return error; + } + + if (gh->gh_state != LM_ST_DEFERRED) + inode_dio_wait(&ip->i_inode); + + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && + (gl->gl_state == LM_ST_EXCLUSIVE) && + (gh->gh_state == LM_ST_EXCLUSIVE)) { + spin_lock(&sdp->sd_trunc_lock); + if (list_empty(&ip->i_trunc_list)) + list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + wake_up(&sdp->sd_quota_wait); + return 1; + } + + return error; +} + +/** + * inode_go_dump - print information about an inode + * @seq: The iterator + * @ip: the inode + * + */ + +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_inode *ip = gl->gl_object; + if (ip == NULL) + return; + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + IF2DT(ip->i_inode.i_mode), ip->i_flags, + (unsigned int)ip->i_diskflags, + (unsigned long long)i_size_read(&ip->i_inode)); +} + +/** + * freeze_go_sync - promote/demote the freeze glock + * @gl: the glock + * @state: the requested state + * @flags: + * + */ + +static void freeze_go_sync(struct gfs2_glock *gl) +{ + int error = 0; + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (gl->gl_state == LM_ST_SHARED && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE); + error = freeze_super(sdp->sd_vfs); + if (error) { + printk(KERN_INFO "GFS2: couldn't freeze filesystem: %d\n", error); + gfs2_assert_withdraw(sdp, 0); + } + queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work); + gfs2_log_flush(sdp, NULL, FREEZE_FLUSH); + } +} + +/** + * freeze_go_xmote_bh - After promoting/demoting the freeze glock + * @gl: the glock + * + */ + +static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_log_header_host head; + int error; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + gfs2_consist(sdp); + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) + gfs2_consist(sdp); + + /* Initialize some head of the log stuff */ + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + } + } + return 0; +} + +/** + * trans_go_demote_ok + * @gl: the glock + * + * Always returns 0 + */ + +static int freeze_go_demote_ok(const struct gfs2_glock *gl) +{ + return 0; +} + +/** + * iopen_go_callback - schedule the dcache entry for the inode to be deleted + * @gl: the glock + * + * gl_spin lock is held while calling this + */ +static void iopen_go_callback(struct gfs2_glock *gl, bool remote) +{ + struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) + return; + + if (gl->gl_demote_state == LM_ST_UNLOCKED && + gl->gl_state == LM_ST_SHARED && ip) { + gl->gl_lockref.count++; + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gl->gl_lockref.count--; + } +} + +const struct gfs2_glock_operations gfs2_meta_glops = { + .go_type = LM_TYPE_META, +}; + +const struct gfs2_glock_operations gfs2_inode_glops = { + .go_sync = inode_go_sync, + .go_inval = inode_go_inval, + .go_demote_ok = inode_go_demote_ok, + .go_lock = inode_go_lock, + .go_dump = inode_go_dump, + .go_type = LM_TYPE_INODE, + .go_flags = GLOF_ASPACE, +}; + +const struct gfs2_glock_operations gfs2_rgrp_glops = { + .go_sync = rgrp_go_sync, + .go_inval = rgrp_go_inval, + .go_lock = gfs2_rgrp_go_lock, + .go_unlock = gfs2_rgrp_go_unlock, + .go_dump = gfs2_rgrp_dump, + .go_type = LM_TYPE_RGRP, + .go_flags = GLOF_LVB, +}; + +const struct gfs2_glock_operations gfs2_freeze_glops = { + .go_sync = freeze_go_sync, + .go_xmote_bh = freeze_go_xmote_bh, + .go_demote_ok = freeze_go_demote_ok, + .go_type = LM_TYPE_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_iopen_glops = { + .go_type = LM_TYPE_IOPEN, + .go_callback = iopen_go_callback, +}; + +const struct gfs2_glock_operations gfs2_flock_glops = { + .go_type = LM_TYPE_FLOCK, +}; + +const struct gfs2_glock_operations gfs2_nondisk_glops = { + .go_type = LM_TYPE_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_quota_glops = { + .go_type = LM_TYPE_QUOTA, + .go_flags = GLOF_LVB, +}; + +const struct gfs2_glock_operations gfs2_journal_glops = { + .go_type = LM_TYPE_JOURNAL, +}; + +const struct gfs2_glock_operations *gfs2_glops_list[] = { + [LM_TYPE_META] = &gfs2_meta_glops, + [LM_TYPE_INODE] = &gfs2_inode_glops, + [LM_TYPE_RGRP] = &gfs2_rgrp_glops, + [LM_TYPE_IOPEN] = &gfs2_iopen_glops, + [LM_TYPE_FLOCK] = &gfs2_flock_glops, + [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, + [LM_TYPE_QUOTA] = &gfs2_quota_glops, + [LM_TYPE_JOURNAL] = &gfs2_journal_glops, +}; + diff --git a/kernel/fs/gfs2/glops.h b/kernel/fs/gfs2/glops.h new file mode 100644 index 000000000..8ed1857c1 --- /dev/null +++ b/kernel/fs/gfs2/glops.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GLOPS_DOT_H__ +#define __GLOPS_DOT_H__ + +#include "incore.h" + +extern struct workqueue_struct *gfs2_freeze_wq; + +extern const struct gfs2_glock_operations gfs2_meta_glops; +extern const struct gfs2_glock_operations gfs2_inode_glops; +extern const struct gfs2_glock_operations gfs2_rgrp_glops; +extern const struct gfs2_glock_operations gfs2_freeze_glops; +extern const struct gfs2_glock_operations gfs2_iopen_glops; +extern const struct gfs2_glock_operations gfs2_flock_glops; +extern const struct gfs2_glock_operations gfs2_nondisk_glops; +extern const struct gfs2_glock_operations gfs2_quota_glops; +extern const struct gfs2_glock_operations gfs2_journal_glops; +extern const struct gfs2_glock_operations *gfs2_glops_list[]; + +extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); + +#endif /* __GLOPS_DOT_H__ */ diff --git a/kernel/fs/gfs2/incore.h b/kernel/fs/gfs2/incore.h new file mode 100644 index 000000000..58b75abf6 --- /dev/null +++ b/kernel/fs/gfs2/incore.h @@ -0,0 +1,843 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __INCORE_DOT_H__ +#define __INCORE_DOT_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DIO_WAIT 0x00000010 +#define DIO_METADATA 0x00000020 + +struct gfs2_log_operations; +struct gfs2_bufdata; +struct gfs2_holder; +struct gfs2_glock; +struct gfs2_quota_data; +struct gfs2_trans; +struct gfs2_jdesc; +struct gfs2_sbd; +struct lm_lockops; + +typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); + +struct gfs2_log_header_host { + u64 lh_sequence; /* Sequence number of this transaction */ + u32 lh_flags; /* GFS2_LOG_HEAD_... */ + u32 lh_tail; /* Block number of log tail */ + u32 lh_blkno; + u32 lh_hash; +}; + +/* + * Structure of operations that are associated with each + * type of element in the log. + */ + +struct gfs2_log_operations { + void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); + void (*lo_before_scan) (struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass); + int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass); + void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass); + const char *lo_name; +}; + +#define GBF_FULL 1 + +struct gfs2_bitmap { + struct buffer_head *bi_bh; + char *bi_clone; + unsigned long bi_flags; + u32 bi_offset; + u32 bi_start; + u32 bi_len; + u32 bi_blocks; +}; + +struct gfs2_rgrpd { + struct rb_node rd_node; /* Link with superblock */ + struct gfs2_glock *rd_gl; /* Glock for this rgrp */ + u64 rd_addr; /* grp block disk address */ + u64 rd_data0; /* first data location */ + u32 rd_length; /* length of rgrp header in fs blocks */ + u32 rd_data; /* num of data blocks in rgrp */ + u32 rd_bitbytes; /* number of bytes in data bitmaps */ + u32 rd_free; + u32 rd_reserved; /* number of blocks reserved */ + u32 rd_free_clone; + u32 rd_dinodes; + u64 rd_igeneration; + struct gfs2_bitmap *rd_bits; + struct gfs2_sbd *rd_sbd; + struct gfs2_rgrp_lvb *rd_rgl; + u32 rd_last_alloc; + u32 rd_flags; + u32 rd_extfail_pt; /* extent failure point */ +#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ +#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ +#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ +#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ + spinlock_t rd_rsspin; /* protects reservation related vars */ + struct rb_root rd_rstree; /* multi-block reservation tree */ +}; + +struct gfs2_rbm { + struct gfs2_rgrpd *rgd; + u32 offset; /* The offset is bitmap relative */ + int bii; /* Bitmap index */ +}; + +static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_bits + rbm->bii; +} + +static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + + rbm->offset; +} + +static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1, + const struct gfs2_rbm *rbm2) +{ + return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) && + (rbm1->offset == rbm2->offset); +} + +enum gfs2_state_bits { + BH_Pinned = BH_PrivateStart, + BH_Escaped = BH_PrivateStart + 1, + BH_Zeronew = BH_PrivateStart + 2, +}; + +BUFFER_FNS(Pinned, pinned) +TAS_BUFFER_FNS(Pinned, pinned) +BUFFER_FNS(Escaped, escaped) +TAS_BUFFER_FNS(Escaped, escaped) +BUFFER_FNS(Zeronew, zeronew) +TAS_BUFFER_FNS(Zeronew, zeronew) + +struct gfs2_bufdata { + struct buffer_head *bd_bh; + struct gfs2_glock *bd_gl; + u64 bd_blkno; + + struct list_head bd_list; + const struct gfs2_log_operations *bd_ops; + + struct gfs2_trans *bd_tr; + struct list_head bd_ail_st_list; + struct list_head bd_ail_gl_list; +}; + +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. + */ + +#define GDLM_STRNAME_BYTES 25 +#define GDLM_LVB_SIZE 32 + +/* + * ls_recover_flags: + * + * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been + * held by failed nodes whose journals need recovery. Those locks should + * only be used for journal recovery until the journal recovery is done. + * This is set by the dlm recover_prep callback and cleared by the + * gfs2_control thread when journal recovery is complete. To avoid + * races between recover_prep setting and gfs2_control clearing, recover_spin + * is held while changing this bit and reading/writing recover_block + * and recover_start. + * + * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. + * + * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing + * recovery of all journals before allowing other nodes to mount the fs. + * This is cleared when FIRST_MOUNT_DONE is set. + * + * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished + * recovery of all journals, and now allows other nodes to mount the fs. + * + * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared + * BLOCK_LOCKS for the first time. The gfs2_control thread should now + * control clearing BLOCK_LOCKS for further recoveries. + * + * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. + * + * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() + * and recover_done(), i.e. set while recover_block == recover_start. + */ + +enum { + DFL_BLOCK_LOCKS = 0, + DFL_NO_DLM_OPS = 1, + DFL_FIRST_MOUNT = 2, + DFL_FIRST_MOUNT_DONE = 3, + DFL_MOUNT_DONE = 4, + DFL_UNMOUNT = 5, + DFL_DLM_RECOVERY = 6, +}; + +struct lm_lockname { + u64 ln_number; + unsigned int ln_type; +}; + +#define lm_name_equal(name1, name2) \ + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type)) + + +struct gfs2_glock_operations { + void (*go_sync) (struct gfs2_glock *gl); + int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); + void (*go_inval) (struct gfs2_glock *gl, int flags); + int (*go_demote_ok) (const struct gfs2_glock *gl); + int (*go_lock) (struct gfs2_holder *gh); + void (*go_unlock) (struct gfs2_holder *gh); + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_callback)(struct gfs2_glock *gl, bool remote); + const int go_type; + const unsigned long go_flags; +#define GLOF_ASPACE 1 +#define GLOF_LVB 2 +}; + +enum { + GFS2_LKS_SRTT = 0, /* Non blocking smoothed round trip time */ + GFS2_LKS_SRTTVAR = 1, /* Non blocking smoothed variance */ + GFS2_LKS_SRTTB = 2, /* Blocking smoothed round trip time */ + GFS2_LKS_SRTTVARB = 3, /* Blocking smoothed variance */ + GFS2_LKS_SIRT = 4, /* Smoothed Inter-request time */ + GFS2_LKS_SIRTVAR = 5, /* Smoothed Inter-request variance */ + GFS2_LKS_DCOUNT = 6, /* Count of dlm requests */ + GFS2_LKS_QCOUNT = 7, /* Count of gfs2_holder queues */ + GFS2_NR_LKSTATS +}; + +struct gfs2_lkstats { + s64 stats[GFS2_NR_LKSTATS]; +}; + +enum { + /* States */ + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ + HIF_FIRST = 7, + HIF_WAIT = 10, +}; + +struct gfs2_holder { + struct list_head gh_list; + + struct gfs2_glock *gh_gl; + struct pid *gh_owner_pid; + unsigned int gh_state; + unsigned gh_flags; + + int gh_error; + unsigned long gh_iflags; /* HIF_... */ + unsigned long gh_ip; +}; + +/* Number of quota types we support */ +#define GFS2_MAXQUOTAS 2 + +/* Resource group multi-block reservation, in order of appearance: + + Step 1. Function prepares to write, allocates a mb, sets the size hint. + Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info + Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use + Step 4. Bits are assigned from the rgrp based on either the reservation + or wherever it can. +*/ + +struct gfs2_blkreserv { + /* components used during write (step 1): */ + atomic_t rs_sizehint; /* hint of the write size */ + + struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */ + struct rb_node rs_node; /* link to other block reservations */ + struct gfs2_rbm rs_rbm; /* Start of reservation */ + u32 rs_free; /* how many blocks are still free */ + u64 rs_inum; /* Inode number for reservation */ + + /* ancillary quota stuff */ + struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS]; + struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS]; + unsigned int rs_qa_qd_num; +}; + +/* + * Allocation parameters + * @target: The number of blocks we'd ideally like to allocate + * @aflags: The flags (e.g. Orlov flag) + * + * The intent is to gradually expand this structure over time in + * order to give more information, e.g. alignment, min extent size + * to the allocation code. + */ +struct gfs2_alloc_parms { + u64 target; + u32 min_target; + u32 aflags; + u64 allowed; +}; + +enum { + GLF_LOCK = 1, + GLF_DEMOTE = 3, + GLF_PENDING_DEMOTE = 4, + GLF_DEMOTE_IN_PROGRESS = 5, + GLF_DIRTY = 6, + GLF_LFLUSH = 7, + GLF_INVALIDATE_IN_PROGRESS = 8, + GLF_REPLY_PENDING = 9, + GLF_INITIAL = 10, + GLF_FROZEN = 11, + GLF_QUEUED = 12, + GLF_LRU = 13, + GLF_OBJECT = 14, /* Used only for tracing */ + GLF_BLOCKING = 15, +}; + +struct gfs2_glock { + struct hlist_bl_node gl_list; + struct gfs2_sbd *gl_sbd; + unsigned long gl_flags; /* GLF_... */ + struct lm_lockname gl_name; + + struct lockref gl_lockref; +#define gl_spin gl_lockref.lock + + /* State fields protected by gl_spin */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ + + unsigned int gl_hash; + unsigned long gl_demote_time; /* time of first demote request */ + long gl_hold_time; + struct list_head gl_holders; + + const struct gfs2_glock_operations *gl_ops; + ktime_t gl_dstamp; + struct gfs2_lkstats gl_stats; + struct dlm_lksb gl_lksb; + unsigned long gl_tchange; + void *gl_object; + + struct list_head gl_lru; + struct list_head gl_ail_list; + atomic_t gl_ail_count; + atomic_t gl_revokes; + struct delayed_work gl_work; + union { + /* For inode and iopen glocks only */ + struct work_struct gl_delete; + /* For rgrp glocks only */ + struct { + loff_t start; + loff_t end; + } gl_vm; + }; + struct rcu_head gl_rcu; +}; + +#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ + +enum { + GIF_INVALID = 0, + GIF_QD_LOCKED = 1, + GIF_ALLOC_FAILED = 2, + GIF_SW_PAGED = 3, + GIF_ORDERED = 4, + GIF_FREE_VFS_INODE = 5, +}; + +struct gfs2_inode { + struct inode i_inode; + u64 i_no_addr; + u64 i_no_formal_ino; + u64 i_generation; + u64 i_eattr; + unsigned long i_flags; /* GIF_... */ + struct gfs2_glock *i_gl; /* Move into i_gh? */ + struct gfs2_holder i_iopen_gh; + struct gfs2_holder i_gh; /* for prepare/commit_write only */ + struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */ + struct gfs2_rgrpd *i_rgd; + u64 i_goal; /* goal block for allocations */ + struct rw_semaphore i_rw_mutex; + struct list_head i_ordered; + struct list_head i_trunc_list; + __be64 *i_hash_cache; + u32 i_entries; + u32 i_diskflags; + u8 i_height; + u8 i_depth; +}; + +/* + * Since i_inode is the first element of struct gfs2_inode, + * this is effectively a cast. + */ +static inline struct gfs2_inode *GFS2_I(struct inode *inode) +{ + return container_of(inode, struct gfs2_inode, i_inode); +} + +static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode) +{ + return inode->i_sb->s_fs_info; +} + +struct gfs2_file { + struct mutex f_fl_mutex; + struct gfs2_holder f_fl_gh; +}; + +struct gfs2_revoke_replay { + struct list_head rr_list; + u64 rr_blkno; + unsigned int rr_where; +}; + +enum { + QDF_CHANGE = 1, + QDF_LOCKED = 2, + QDF_REFRESH = 3, +}; + +struct gfs2_quota_data { + struct hlist_bl_node qd_hlist; + struct list_head qd_list; + struct kqid qd_id; + struct gfs2_sbd *qd_sbd; + struct lockref qd_lockref; + struct list_head qd_lru; + unsigned qd_hash; + + unsigned long qd_flags; /* QDF_... */ + + s64 qd_change; + s64 qd_change_sync; + + unsigned int qd_slot; + unsigned int qd_slot_count; + + struct buffer_head *qd_bh; + struct gfs2_quota_change *qd_bh_qc; + unsigned int qd_bh_count; + + struct gfs2_glock *qd_gl; + struct gfs2_quota_lvb qd_qb; + + u64 qd_sync_gen; + unsigned long qd_last_warn; + struct rcu_head qd_rcu; +}; + +struct gfs2_trans { + unsigned long tr_ip; + + unsigned int tr_blocks; + unsigned int tr_revokes; + unsigned int tr_reserved; + unsigned int tr_touched:1; + unsigned int tr_attached:1; + unsigned int tr_alloced:1; + + unsigned int tr_num_buf_new; + unsigned int tr_num_databuf_new; + unsigned int tr_num_buf_rm; + unsigned int tr_num_databuf_rm; + unsigned int tr_num_revoke; + unsigned int tr_num_revoke_rm; + + struct list_head tr_list; + struct list_head tr_databuf; + struct list_head tr_buf; + + unsigned int tr_first; + struct list_head tr_ail1_list; + struct list_head tr_ail2_list; +}; + +struct gfs2_journal_extent { + struct list_head list; + + unsigned int lblock; /* First logical block */ + u64 dblock; /* First disk block */ + u64 blocks; +}; + +struct gfs2_jdesc { + struct list_head jd_list; + struct list_head extent_list; + unsigned int nr_extents; + struct work_struct jd_work; + struct inode *jd_inode; + unsigned long jd_flags; +#define JDF_RECOVERY 1 + unsigned int jd_jid; + unsigned int jd_blocks; + int jd_recover_error; + /* Replay stuff */ + + unsigned int jd_found_blocks; + unsigned int jd_found_revokes; + unsigned int jd_replayed_blocks; + + struct list_head jd_revoke_list; + unsigned int jd_replay_tail; + +}; + +struct gfs2_statfs_change_host { + s64 sc_total; + s64 sc_free; + s64 sc_dinodes; +}; + +#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF +#define GFS2_QUOTA_OFF 0 +#define GFS2_QUOTA_ACCOUNT 1 +#define GFS2_QUOTA_ON 2 + +#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED +#define GFS2_DATA_WRITEBACK 1 +#define GFS2_DATA_ORDERED 2 + +#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW +#define GFS2_ERRORS_WITHDRAW 0 +#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ +#define GFS2_ERRORS_RO 2 /* place holder for future feature */ +#define GFS2_ERRORS_PANIC 3 + +struct gfs2_args { + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + unsigned int ar_spectator:1; /* Don't get a journal */ + unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ + unsigned int ar_debug:1; /* Oops on errors */ + unsigned int ar_posix_acl:1; /* Enable posix acls */ + unsigned int ar_quota:2; /* off/account/on */ + unsigned int ar_suiddir:1; /* suiddir support */ + unsigned int ar_data:2; /* ordered/writeback */ + unsigned int ar_meta:1; /* mount metafs */ + unsigned int ar_discard:1; /* discard requests */ + unsigned int ar_errors:2; /* errors=withdraw | panic */ + unsigned int ar_nobarrier:1; /* do not send barriers */ + unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ + int ar_commit; /* Commit interval */ + int ar_statfs_quantum; /* The fast statfs interval */ + int ar_quota_quantum; /* The quota interval */ + int ar_statfs_percent; /* The % change to force sync */ +}; + +struct gfs2_tune { + spinlock_t gt_spin; + + unsigned int gt_logd_secs; + + unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ + unsigned int gt_quota_scale_num; /* Numerator */ + unsigned int gt_quota_scale_den; /* Denominator */ + unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ + unsigned int gt_new_files_jdata; + unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ + unsigned int gt_complain_secs; + unsigned int gt_statfs_quantum; + unsigned int gt_statfs_slow; +}; + +enum { + SDF_JOURNAL_CHECKED = 0, + SDF_JOURNAL_LIVE = 1, + SDF_SHUTDOWN = 2, + SDF_NOBARRIERS = 3, + SDF_NORECOVERY = 4, + SDF_DEMOTE = 5, + SDF_NOJOURNALID = 6, + SDF_RORECOVERY = 7, /* read only recovery */ + SDF_SKIP_DLM_UNLOCK = 8, +}; + +enum gfs2_freeze_state { + SFS_UNFROZEN = 0, + SFS_STARTING_FREEZE = 1, + SFS_FROZEN = 2, +}; + +#define GFS2_FSNAME_LEN 256 + +struct gfs2_inum_host { + u64 no_formal_ino; + u64 no_addr; +}; + +struct gfs2_sb_host { + u32 sb_magic; + u32 sb_type; + u32 sb_format; + + u32 sb_fs_format; + u32 sb_multihost_format; + u32 sb_bsize; + u32 sb_bsize_shift; + + struct gfs2_inum_host sb_master_dir; + struct gfs2_inum_host sb_root_dir; + + char sb_lockproto[GFS2_LOCKNAME_LEN]; + char sb_locktable[GFS2_LOCKNAME_LEN]; +}; + +/* + * lm_mount() return values + * + * ls_jid - the journal ID this node should use + * ls_first - this node is the first to mount the file system + * ls_lockspace - lock module's context for this file system + * ls_ops - lock module's functions + */ + +struct lm_lockstruct { + int ls_jid; + unsigned int ls_first; + const struct lm_lockops *ls_ops; + dlm_lockspace_t *ls_dlm; + + int ls_recover_jid_done; /* These two are deprecated, */ + int ls_recover_jid_status; /* used previously by gfs_controld */ + + struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ + struct dlm_lksb ls_control_lksb; /* control_lock */ + char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + char *ls_lvb_bits; + + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ + uint32_t ls_recover_start; /* gen in last recover_done cb */ + uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ + uint32_t ls_recover_size; /* size of recover_submit, recover_result */ + uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ + uint32_t *ls_recover_result; /* result of last jid recovery */ +}; + +struct gfs2_pcpu_lkstats { + /* One struct for each glock type */ + struct gfs2_lkstats lkstats[10]; +}; + +struct gfs2_sbd { + struct super_block *sd_vfs; + struct gfs2_pcpu_lkstats __percpu *sd_lkstats; + struct kobject sd_kobj; + unsigned long sd_flags; /* SDF_... */ + struct gfs2_sb_host sd_sb; + + /* Constants computed on mount */ + + u32 sd_fsb2bb; + u32 sd_fsb2bb_shift; + u32 sd_diptrs; /* Number of pointers in a dinode */ + u32 sd_inptrs; /* Number of pointers in a indirect block */ + u32 sd_jbsize; /* Size of a journaled data block */ + u32 sd_hash_bsize; /* sizeof(exhash block) */ + u32 sd_hash_bsize_shift; + u32 sd_hash_ptrs; /* Number of pointers in a hash block */ + u32 sd_qc_per_block; + u32 sd_blocks_per_bitmap; + u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ + u32 sd_max_height; /* Max height of a file's metadata tree */ + u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; + u32 sd_max_jheight; /* Max height of journaled file's meta tree */ + u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; + + struct gfs2_args sd_args; /* Mount arguments */ + struct gfs2_tune sd_tune; /* Filesystem tuning structure */ + + /* Lock Stuff */ + + struct lm_lockstruct sd_lockstruct; + struct gfs2_holder sd_live_gh; + struct gfs2_glock *sd_rename_gl; + struct gfs2_glock *sd_freeze_gl; + struct work_struct sd_freeze_work; + wait_queue_head_t sd_glock_wait; + atomic_t sd_glock_disposal; + struct completion sd_locking_init; + struct completion sd_wdack; + struct delayed_work sd_control_work; + + /* Inode Stuff */ + + struct dentry *sd_master_dir; + struct dentry *sd_root_dir; + + struct inode *sd_jindex; + struct inode *sd_statfs_inode; + struct inode *sd_sc_inode; + struct inode *sd_qc_inode; + struct inode *sd_rindex; + struct inode *sd_quota_inode; + + /* StatFS stuff */ + + spinlock_t sd_statfs_spin; + struct gfs2_statfs_change_host sd_statfs_master; + struct gfs2_statfs_change_host sd_statfs_local; + int sd_statfs_force_sync; + + /* Resource group stuff */ + + int sd_rindex_uptodate; + spinlock_t sd_rindex_spin; + struct rb_root sd_rindex_tree; + unsigned int sd_rgrps; + unsigned int sd_max_rg_data; + + /* Journal index stuff */ + + struct list_head sd_jindex_list; + spinlock_t sd_jindex_spin; + struct mutex sd_jindex_mutex; + unsigned int sd_journals; + + struct gfs2_jdesc *sd_jdesc; + struct gfs2_holder sd_journal_gh; + struct gfs2_holder sd_jinode_gh; + + struct gfs2_holder sd_sc_gh; + struct gfs2_holder sd_qc_gh; + + struct completion sd_journal_ready; + + /* Daemon stuff */ + + struct task_struct *sd_logd_process; + struct task_struct *sd_quotad_process; + + /* Quota stuff */ + + struct list_head sd_quota_list; + atomic_t sd_quota_count; + struct mutex sd_quota_mutex; + struct mutex sd_quota_sync_mutex; + wait_queue_head_t sd_quota_wait; + struct list_head sd_trunc_list; + spinlock_t sd_trunc_lock; + + unsigned int sd_quota_slots; + unsigned long *sd_quota_bitmap; + spinlock_t sd_bitmap_lock; + + u64 sd_quota_sync_gen; + + /* Log stuff */ + + struct address_space sd_aspace; + + spinlock_t sd_log_lock; + + struct gfs2_trans *sd_log_tr; + unsigned int sd_log_blks_reserved; + int sd_log_commited_revoke; + + atomic_t sd_log_pinned; + unsigned int sd_log_num_revoke; + + struct list_head sd_log_le_revoke; + struct list_head sd_log_le_ordered; + spinlock_t sd_ordered_lock; + + atomic_t sd_log_thresh1; + atomic_t sd_log_thresh2; + atomic_t sd_log_blks_free; + wait_queue_head_t sd_log_waitq; + wait_queue_head_t sd_logd_waitq; + + u64 sd_log_sequence; + unsigned int sd_log_head; + unsigned int sd_log_tail; + int sd_log_idle; + + struct rw_semaphore sd_log_flush_lock; + atomic_t sd_log_in_flight; + struct bio *sd_log_bio; + wait_queue_head_t sd_log_flush_wait; + int sd_log_error; + + atomic_t sd_reserving_log; + wait_queue_head_t sd_reserving_log_wait; + + unsigned int sd_log_flush_head; + u64 sd_log_flush_wrapped; + + spinlock_t sd_ail_lock; + struct list_head sd_ail1_list; + struct list_head sd_ail2_list; + + /* For quiescing the filesystem */ + struct gfs2_holder sd_freeze_gh; + atomic_t sd_freeze_state; + struct mutex sd_freeze_mutex; + + char sd_fsname[GFS2_FSNAME_LEN]; + char sd_table_name[GFS2_FSNAME_LEN]; + char sd_proto_name[GFS2_FSNAME_LEN]; + + /* Debugging crud */ + + unsigned long sd_last_warning; + struct dentry *debugfs_dir; /* debugfs directory */ + struct dentry *debugfs_dentry_glocks; + struct dentry *debugfs_dentry_glstats; + struct dentry *debugfs_dentry_sbstats; +}; + +static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) +{ + gl->gl_stats.stats[which]++; +} + +static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) +{ + const struct gfs2_sbd *sdp = gl->gl_sbd; + preempt_disable(); + this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++; + preempt_enable(); +} + +#endif /* __INCORE_DOT_H__ */ + diff --git a/kernel/fs/gfs2/inode.c b/kernel/fs/gfs2/inode.c new file mode 100644 index 000000000..1b3ca7a2e --- /dev/null +++ b/kernel/fs/gfs2/inode.c @@ -0,0 +1,1973 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "bmap.h" +#include "dir.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "super.h" +#include "glops.h" + +struct gfs2_skip_data { + u64 no_addr; + int skipped; + int non_block; +}; + +static int iget_test(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_skip_data *data = opaque; + + if (ip->i_no_addr == data->no_addr) { + if (data->non_block && + inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { + data->skipped = 1; + return 0; + } + return 1; + } + return 0; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_skip_data *data = opaque; + + if (data->skipped) + return -ENOENT; + inode->i_ino = (unsigned long)(data->no_addr); + ip->i_no_addr = data->no_addr; + return 0; +} + +struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block) +{ + unsigned long hash = (unsigned long)no_addr; + struct gfs2_skip_data data; + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = non_block; + return ilookup5(sb, hash, iget_test, &data); +} + +static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr, + int non_block) +{ + struct gfs2_skip_data data; + unsigned long hash = (unsigned long)no_addr; + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = non_block; + return iget5_locked(sb, hash, iget_test, iget_set, &data); +} + +/** + * gfs2_set_iop - Sets inode operations + * @inode: The inode with correct i_mode filled in + * + * GFS2 lookup code fills in vfs inode contents based on info obtained + * from directory entry inside gfs2_inode_lookup(). + */ + +static void gfs2_set_iop(struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + umode_t mode = inode->i_mode; + + if (S_ISREG(mode)) { + inode->i_op = &gfs2_file_iops; + if (gfs2_localflocks(sdp)) + inode->i_fop = &gfs2_file_fops_nolock; + else + inode->i_fop = &gfs2_file_fops; + } else if (S_ISDIR(mode)) { + inode->i_op = &gfs2_dir_iops; + if (gfs2_localflocks(sdp)) + inode->i_fop = &gfs2_dir_fops_nolock; + else + inode->i_fop = &gfs2_dir_fops; + } else if (S_ISLNK(mode)) { + inode->i_op = &gfs2_symlink_iops; + } else { + inode->i_op = &gfs2_file_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } +} + +/** + * gfs2_inode_lookup - Lookup an inode + * @sb: The super block + * @no_addr: The inode number + * @type: The type of the inode + * non_block: Can we block on inodes that are being freed? + * + * Returns: A VFS inode, or an error + */ + +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, + u64 no_addr, u64 no_formal_ino, int non_block) +{ + struct inode *inode; + struct gfs2_inode *ip; + struct gfs2_glock *io_gl = NULL; + int error; + + inode = gfs2_iget(sb, no_addr, non_block); + ip = GFS2_I(inode); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + struct gfs2_sbd *sdp = GFS2_SB(inode); + ip->i_no_formal_ino = no_formal_ino; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (unlikely(error)) + goto fail; + ip->i_gl->gl_object = ip; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (unlikely(error)) + goto fail_put; + + set_bit(GIF_INVALID, &ip->i_flags); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + if (unlikely(error)) + goto fail_iopen; + + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + io_gl = NULL; + + if (type == DT_UNKNOWN) { + /* Inode glock must be locked already */ + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_refresh; + } else { + inode->i_mode = DT2IF(type); + } + + gfs2_set_iop(inode); + unlock_new_inode(inode); + } + + return inode; + +fail_refresh: + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); +fail_iopen: + if (io_gl) + gfs2_glock_put(io_gl); +fail_put: + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); +fail: + iget_failed(inode); + return ERR_PTR(error); +} + +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, unsigned int blktype) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder i_gh; + struct inode *inode = NULL; + int error; + + /* Must not read in block until block type is verified */ + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, + LM_ST_EXCLUSIVE, GL_SKIP, &i_gh); + if (error) + return ERR_PTR(error); + + error = gfs2_check_blk_type(sdp, no_addr, blktype); + if (error) + goto fail; + + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1); + if (IS_ERR(inode)) + goto fail; + + /* Two extra checks for NFS only */ + if (no_formal_ino) { + error = -ESTALE; + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + goto fail_iput; + + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; + + error = 0; + } + +fail: + gfs2_glock_dq_uninit(&i_gh); + return error ? ERR_PTR(error) : inode; +fail_iput: + iput(inode); + goto fail; +} + + +struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) +{ + struct qstr qstr; + struct inode *inode; + gfs2_str2qstr(&qstr, name); + inode = gfs2_lookupi(dip, &qstr, 1); + /* gfs2_lookupi has inconsistent callers: vfs + * related routines expect NULL for no entry found, + * gfs2_lookup_simple callers expect ENOENT + * and do not check for NULL. + */ + if (inode == NULL) + return ERR_PTR(-ENOENT); + else + return inode; +} + + +/** + * gfs2_lookupi - Look up a filename in a directory and return its inode + * @d_gh: An initialized holder for the directory glock + * @name: The name of the inode to look for + * @is_root: If 1, ignore the caller's permissions + * @i_gh: An uninitialized holder for the new inode glock + * + * This can be called via the VFS filldir function when NFS is doing + * a readdirplus and the inode which its intending to stat isn't + * already in cache. In this case we must not take the directory glock + * again, since the readdir call will have already taken that lock. + * + * Returns: errno + */ + +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root) +{ + struct super_block *sb = dir->i_sb; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + int error = 0; + struct inode *inode = NULL; + int unlock = 0; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return ERR_PTR(-ENAMETOOLONG); + + if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || + (name->len == 2 && memcmp(name->name, "..", 2) == 0 && + dir == d_inode(sb->s_root))) { + igrab(dir); + return dir; + } + + if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return ERR_PTR(error); + unlock = 1; + } + + if (!is_root) { + error = gfs2_permission(dir, MAY_EXEC); + if (error) + goto out; + } + + inode = gfs2_dir_search(dir, name, false); + if (IS_ERR(inode)) + error = PTR_ERR(inode); +out: + if (unlock) + gfs2_glock_dq_uninit(&d_gh); + if (error == -ENOENT) + return NULL; + return inode ? inode : ERR_PTR(error); +} + +/** + * create_ok - OK to create a new on-disk inode here? + * @dip: Directory in which dinode is to be created + * @name: Name of new dinode + * @mode: + * + * Returns: errno + */ + +static int create_ok(struct gfs2_inode *dip, const struct qstr *name, + umode_t mode) +{ + int error; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + if (error) + return error; + + /* Don't create entries in an unlinked directory */ + if (!dip->i_inode.i_nlink) + return -ENOENT; + + if (dip->i_entries == (u32)-1) + return -EFBIG; + if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) + return -EMLINK; + + return 0; +} + +static void munge_mode_uid_gid(const struct gfs2_inode *dip, + struct inode *inode) +{ + if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && + (dip->i_inode.i_mode & S_ISUID) && + !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) { + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISUID; + else if (!uid_eq(dip->i_inode.i_uid, current_fsuid())) + inode->i_mode &= ~07111; + inode->i_uid = dip->i_inode.i_uid; + } else + inode->i_uid = current_fsuid(); + + if (dip->i_inode.i_mode & S_ISGID) { + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISGID; + inode->i_gid = dip->i_inode.i_gid; + } else + inode->i_gid = current_fsgid(); +} + +static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; + int error; + + error = gfs2_quota_lock_check(ip, &ap); + if (error) + goto out; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipreserv; + + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); + ip->i_no_formal_ino = ip->i_generation; + ip->i_inode.i_ino = ip->i_no_addr; + ip->i_goal = ip->i_no_addr; + + gfs2_trans_end(sdp); + +out_ipreserv: + gfs2_inplace_release(ip); +out_quota: + gfs2_quota_unlock(ip); +out: + return error; +} + +static void gfs2_init_dir(struct buffer_head *dibh, + const struct gfs2_inode *parent) +{ + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); + + gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); + dent->de_inum = di->di_num; /* already GFS2 endian */ + dent->de_type = cpu_to_be16(DT_DIR); + + dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); + gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + gfs2_inum_out(parent, dent); + dent->de_type = cpu_to_be16(DT_DIR); + +} + +/** + * gfs2_init_xattr - Initialise an xattr block for a new inode + * @ip: The inode in question + * + * This sets up an empty xattr block for a new inode, ready to + * take any ACLs, LSM xattrs, etc. + */ + +static void gfs2_init_xattr(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + struct gfs2_ea_header *ea; + + bh = gfs2_meta_new(ip->i_gl, ip->i_eattr); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(bh); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + + brelse(bh); +} + +/** + * init_dinode - Fill in a new dinode structure + * @dip: The directory this inode is being created in + * @ip: The inode + * @symname: The symlink destination (if a symlink) + * @bhp: The buffer head (returned to caller) + * + */ + +static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, + const char *symname) +{ + struct gfs2_dinode *di; + struct buffer_head *dibh; + + dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); + gfs2_trans_add_meta(ip->i_gl, dibh); + di = (struct gfs2_dinode *)dibh->b_data; + gfs2_dinode_out(ip, di); + + di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); + di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); + di->__pad1 = 0; + di->__pad2 = 0; + di->__pad3 = 0; + memset(&di->__pad4, 0, sizeof(di->__pad4)); + memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + switch(ip->i_inode.i_mode & S_IFMT) { + case S_IFDIR: + gfs2_init_dir(dibh, dip); + break; + case S_IFLNK: + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size); + break; + } + + set_buffer_uptodate(dibh); + brelse(dibh); +} + +/** + * gfs2_trans_da_blocks - Calculate number of blocks to link inode + * @dip: The directory we are linking into + * @da: The dir add information + * @nr_inodes: The number of inodes involved + * + * This calculate the number of blocks we need to reserve in a + * transaction to link @nr_inodes into a directory. In most cases + * @nr_inodes will be 2 (the directory plus the inode being linked in) + * but in case of rename, 4 may be required. + * + * Returns: Number of blocks + */ + +static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip, + const struct gfs2_diradd *da, + unsigned nr_inodes) +{ + return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) + + (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS; +} + +static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip, struct gfs2_diradd *da) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc_parms ap = { .target = da->nr_blocks, }; + int error; + + if (da->nr_blocks) { + error = gfs2_quota_lock_check(dip, &ap); + if (error) + goto fail_quota_locks; + + error = gfs2_inplace_reserve(dip, &ap); + if (error) + goto fail_quota_locks; + + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0); + if (error) + goto fail_ipreserv; + } else { + error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0); + if (error) + goto fail_quota_locks; + } + + error = gfs2_dir_add(&dip->i_inode, name, ip, da); + + gfs2_trans_end(sdp); +fail_ipreserv: + gfs2_inplace_release(dip); +fail_quota_locks: + gfs2_quota_unlock(dip); + return error; +} + +static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = __gfs2_xattr_set(inode, xattr->name, xattr->value, + xattr->value_len, 0, + GFS2_EATYPE_SECURITY); + if (err < 0) + break; + } + return err; +} + +/** + * gfs2_create_inode - Create a new inode + * @dir: The parent directory + * @dentry: The new dentry + * @file: If non-NULL, the file which is being opened + * @mode: The permissions on the new inode + * @dev: For device nodes, this is the device number + * @symname: For symlinks, this is the link destination + * @size: The initial size of the inode (ignored for directories) + * + * Returns: 0 on success, or error code + */ + +static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, + struct file *file, + umode_t mode, dev_t dev, const char *symname, + unsigned int size, int excl, int *opened) +{ + const struct qstr *name = &dentry->d_name; + struct posix_acl *default_acl, *acl; + struct gfs2_holder ghs[2]; + struct inode *inode = NULL; + struct gfs2_inode *dip = GFS2_I(dir), *ip; + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_glock *io_gl; + int error, free_vfs_inode = 0; + u32 aflags = 0; + unsigned blocks = 1; + struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return -ENAMETOOLONG; + + error = gfs2_rs_alloc(dip); + if (error) + return error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (error) + goto fail; + + error = create_ok(dip, name, mode); + if (error) + goto fail_gunlock; + + inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl); + error = PTR_ERR(inode); + if (!IS_ERR(inode)) { + if (S_ISDIR(inode->i_mode)) { + iput(inode); + inode = ERR_PTR(-EISDIR); + goto fail_gunlock; + } + d_instantiate(dentry, inode); + error = 0; + if (file) { + if (S_ISREG(inode->i_mode)) + error = finish_open(file, dentry, gfs2_open_common, opened); + else + error = finish_no_open(file, NULL); + } + gfs2_glock_dq_uninit(ghs); + return error; + } else if (error != -ENOENT) { + goto fail_gunlock; + } + + error = gfs2_diradd_alloc_required(dir, name, &da); + if (error < 0) + goto fail_gunlock; + + inode = new_inode(sdp->sd_vfs); + error = -ENOMEM; + if (!inode) + goto fail_gunlock; + + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto fail_free_vfs_inode; + + ip = GFS2_I(inode); + error = gfs2_rs_alloc(ip); + if (error) + goto fail_free_acls; + + inode->i_mode = mode; + set_nlink(inode, S_ISDIR(mode) ? 2 : 1); + inode->i_rdev = dev; + inode->i_size = size; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + gfs2_set_inode_blocks(inode, 1); + munge_mode_uid_gid(dip, inode); + check_and_update_goal(dip); + ip->i_goal = dip->i_goal; + ip->i_diskflags = 0; + ip->i_eattr = 0; + ip->i_height = 0; + ip->i_depth = 0; + ip->i_entries = 0; + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + ip->i_diskflags |= GFS2_DIF_JDATA; + gfs2_set_aops(inode); + break; + case S_IFDIR: + ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA); + ip->i_diskflags |= GFS2_DIF_JDATA; + ip->i_entries = 2; + break; + } + gfs2_set_inode_flags(inode); + + if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || + (dip->i_diskflags & GFS2_DIF_TOPDIR)) + aflags |= GFS2_AF_ORLOV; + + if (default_acl || acl) + blocks++; + + error = alloc_dinode(ip, aflags, &blocks); + if (error) + goto fail_free_inode; + + gfs2_set_inode_blocks(inode, blocks); + + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (error) + goto fail_free_inode; + + ip->i_gl->gl_object = ip; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_free_inode; + + error = gfs2_trans_begin(sdp, blocks, 0); + if (error) + goto fail_gunlock2; + + if (blocks > 1) { + ip->i_eattr = ip->i_no_addr + 1; + gfs2_init_xattr(ip); + } + init_dinode(dip, ip, symname); + gfs2_trans_end(sdp); + + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (error) + goto fail_gunlock2; + + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + if (error) + goto fail_gunlock2; + + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + gfs2_set_iop(inode); + insert_inode_hash(inode); + + if (default_acl) { + error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + + if (error) + goto fail_gunlock3; + + error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, + &gfs2_initxattrs, NULL); + if (error) + goto fail_gunlock3; + + error = link_dinode(dip, name, ip, &da); + if (error) + goto fail_gunlock3; + + mark_inode_dirty(inode); + d_instantiate(dentry, inode); + if (file) { + *opened |= FILE_CREATED; + error = finish_open(file, dentry, gfs2_open_common, opened); + } + gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(ghs + 1); + return error; + +fail_gunlock3: + gfs2_glock_dq_uninit(ghs + 1); + if (ip->i_gl) + gfs2_glock_put(ip->i_gl); + goto fail_gunlock; + +fail_gunlock2: + gfs2_glock_dq_uninit(ghs + 1); +fail_free_inode: + if (ip->i_gl) + gfs2_glock_put(ip->i_gl); + gfs2_rs_delete(ip, NULL); +fail_free_acls: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); +fail_free_vfs_inode: + free_vfs_inode = 1; +fail_gunlock: + gfs2_dir_no_add(&da); + gfs2_glock_dq_uninit(ghs); + if (inode && !IS_ERR(inode)) { + clear_nlink(inode); + if (!free_vfs_inode) + mark_inode_dirty(inode); + set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED, + &GFS2_I(inode)->i_flags); + iput(inode); + } +fail: + return error; +} + +/** + * gfs2_create - Create a file + * @dir: The directory in which to create the file + * @dentry: The dentry of the new file + * @mode: The mode of the new file + * + * Returns: errno + */ + +static int gfs2_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL); +} + +/** + * __gfs2_lookup - Look up a filename in a directory and return its inode + * @dir: The directory inode + * @dentry: The dentry of the new inode + * @file: File to be opened + * @opened: atomic_open flags + * + * + * Returns: errno + */ + +static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct file *file, int *opened) +{ + struct inode *inode; + struct dentry *d; + struct gfs2_holder gh; + struct gfs2_glock *gl; + int error; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode == NULL) { + d_add(dentry, NULL); + return NULL; + } + if (IS_ERR(inode)) + return ERR_CAST(inode); + + gl = GFS2_I(inode)->i_gl; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); + } + + d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + gfs2_glock_dq_uninit(&gh); + return d; + } + if (file && S_ISREG(inode->i_mode)) + error = finish_open(file, dentry, gfs2_open_common, opened); + + gfs2_glock_dq_uninit(&gh); + if (error) { + dput(d); + return ERR_PTR(error); + } + return d; +} + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + unsigned flags) +{ + return __gfs2_lookup(dir, dentry, NULL, NULL); +} + +/** + * gfs2_link - Link to a file + * @old_dentry: The inode to link + * @dir: Add link to this directory + * @dentry: The name of the link + * + * Link the inode in "old_dentry" into the directory "dir" with the + * name in "dentry". + * + * Returns: errno + */ + +static int gfs2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = d_inode(old_dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[2]; + struct buffer_head *dibh; + struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; + int error; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + error = gfs2_rs_alloc(dip); + if (error) + return error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_gunlock; + + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(dir, &dentry->d_name, NULL); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + } + + error = -EINVAL; + if (!dip->i_inode.i_nlink) + goto out_gunlock; + error = -EFBIG; + if (dip->i_entries == (u32)-1) + goto out_gunlock; + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out_gunlock; + error = -EINVAL; + if (!ip->i_inode.i_nlink) + goto out_gunlock; + error = -EMLINK; + if (ip->i_inode.i_nlink == (u32)-1) + goto out_gunlock; + + error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da); + if (error < 0) + goto out_gunlock; + + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; + error = gfs2_quota_lock_check(dip, &ap); + if (error) + goto out_gunlock; + + error = gfs2_inplace_reserve(dip, &ap); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); + if (error) + goto out_ipres; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(dir, &dentry->d_name, ip, &da); + if (error) + goto out_brelse; + + gfs2_trans_add_meta(ip->i_gl, dibh); + inc_nlink(&ip->i_inode); + ip->i_inode.i_ctime = CURRENT_TIME; + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + +out_brelse: + brelse(dibh); +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + if (da.nr_blocks) + gfs2_inplace_release(dip); +out_gunlock_q: + if (da.nr_blocks) + gfs2_quota_unlock(dip); +out_gunlock: + gfs2_dir_no_add(&da); + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + return error; +} + +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. + * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) +{ + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_inode.i_mode & S_ISVTX) && + !uid_eq(dip->i_inode.i_uid, current_fsuid()) && + !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + if (error) + return error; + + return gfs2_dir_check(&dip->i_inode, name, ip); +} + +/** + * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it + * @dip: The parent directory + * @name: The name of the entry in the parent directory + * @inode: The inode to be removed + * + * Called with all the locks and in a transaction. This will only be + * called for a directory after it has been checked to ensure it is empty. + * + * Returns: 0 on success, or an error + */ + +static int gfs2_unlink_inode(struct gfs2_inode *dip, + const struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = gfs2_dir_del(dip, dentry); + if (error) + return error; + + ip->i_entries = 0; + inode->i_ctime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else + drop_nlink(inode); + mark_inode_dirty(inode); + if (inode->i_nlink == 0) + gfs2_unlink_di(inode); + return 0; +} + + +/** + * gfs2_unlink - Unlink an inode (this does rmdir as well) + * @dir: The inode of the directory containing the inode to unlink + * @dentry: The file itself + * + * This routine uses the type of the inode as a flag to figure out + * whether this is an unlink or an rmdir. + * + * Returns: errno + */ + +static int gfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = -EROFS; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); + if (!rgd) + goto out_inodes; + + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_rgrp; + + if (S_ISDIR(inode->i_mode)) { + error = -ENOTEMPTY; + if (ip->i_entries > 2 || inode->i_nlink > 2) + goto out_rgrp; + } + + error = gfs2_glock_nq(ghs + 2); /* rgrp */ + if (error) + goto out_rgrp; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_end_trans; + + error = gfs2_unlink_inode(dip, dentry); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + gfs2_glock_dq(ghs + 2); +out_rgrp: + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs + 2); +out_inodes: + gfs2_holder_uninit(ghs + 1); + gfs2_holder_uninit(ghs); + return error; +} + +/** + * gfs2_symlink - Create a symlink + * @dir: The directory to create the symlink in + * @dentry: The dentry to put the symlink in + * @symname: The thing which the link points to + * + * Returns: errno + */ + +static int gfs2_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned int size; + + size = strlen(symname); + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) + return -ENAMETOOLONG; + + return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL); +} + +/** + * gfs2_mkdir - Make a directory + * @dir: The parent directory of the new one + * @dentry: The dentry of the new directory + * @mode: The mode of the new directory + * + * Returns: errno + */ + +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL); +} + +/** + * gfs2_mknod - Make a special file + * @dir: The directory in which the special file will reside + * @dentry: The dentry of the special file + * @mode: The mode of the special file + * @dev: The device specification of the special file + * + */ + +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t dev) +{ + return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL); +} + +/** + * gfs2_atomic_open - Atomically open a file + * @dir: The directory + * @dentry: The proposed new entry + * @file: The proposed new struct file + * @flags: open flags + * @mode: File mode + * @opened: Flag to say whether the file has been opened or not + * + * Returns: error code or 0 for success + */ + +static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, + umode_t mode, int *opened) +{ + struct dentry *d; + bool excl = !!(flags & O_EXCL); + + if (!d_unhashed(dentry)) + goto skip_lookup; + + d = __gfs2_lookup(dir, dentry, file, opened); + if (IS_ERR(d)) + return PTR_ERR(d); + if (d != NULL) + dentry = d; + if (d_really_is_positive(dentry)) { + if (!(*opened & FILE_OPENED)) + return finish_no_open(file, d); + dput(d); + return 0; + } + + BUG_ON(d != NULL); + +skip_lookup: + if (!(flags & O_CREAT)) + return -ENOENT; + + return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened); +} + +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. + * + * Returns: errno + */ + +static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + int error = 0; + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == d_inode(sb->s_root)) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); + if (!tmp) { + error = -ENOENT; + break; + } + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + +/** + * gfs2_rename - Rename a file + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * + * Returns: errno + */ + +static int gfs2_rename(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *ip = GFS2_I(d_inode(odentry)); + struct gfs2_inode *nip = NULL; + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + struct gfs2_rgrpd *nrgd; + unsigned int num_gh; + int dir_rename = 0; + struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, }; + unsigned int x; + int error; + + if (d_really_is_positive(ndentry)) { + nip = GFS2_I(d_inode(ndentry)); + if (ip == nip) + return 0; + } + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = gfs2_rs_alloc(ndip); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(ip->i_inode.i_mode)) { + dir_rename = 1; + /* don't move a dirctory into it's subdir */ + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + if (nip) { + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + /* grab the resource lock for unlink flag twiddling + * this is the case of the target file already existing + * so we unlink before doing the rename + */ + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1); + if (nrgd) + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); + } + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + + error = -ENOENT; + if (ip->i_inode.i_nlink == 0) + goto out_gunlock; + + /* Check out the old directory */ + + error = gfs2_unlink_ok(odip, &odentry->d_name, ip); + if (error) + goto out_gunlock; + + /* Check out the new directory */ + + if (nip) { + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (nip->i_inode.i_nlink == 0) { + error = -EAGAIN; + goto out_gunlock; + } + + if (S_ISDIR(nip->i_inode.i_mode)) { + if (nip->i_entries < 2) { + gfs2_consist_inode(nip); + error = -EIO; + goto out_gunlock; + } + if (nip->i_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + } + } else { + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + }; + + if (odip != ndip) { + if (!ndip->i_inode.i_nlink) { + error = -ENOENT; + goto out_gunlock; + } + if (ndip->i_entries == (u32)-1) { + error = -EFBIG; + goto out_gunlock; + } + if (S_ISDIR(ip->i_inode.i_mode) && + ndip->i_inode.i_nlink == (u32)-1) { + error = -EMLINK; + goto out_gunlock; + } + } + } + + /* Check out the dir to be renamed */ + + if (dir_rename) { + error = gfs2_permission(d_inode(odentry), MAY_WRITE); + if (error) + goto out_gunlock; + } + + if (nip == NULL) { + error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da); + if (error) + goto out_gunlock; + } + + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; + error = gfs2_quota_lock_check(ndip, &ap); + if (error) + goto out_gunlock; + + error = gfs2_inplace_reserve(ndip, &ap); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) + + 4 * RES_LEAF + 4, 0); + if (error) + goto out_ipreserv; + } else { + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + + 5 * RES_LEAF + 4, 0); + if (error) + goto out_gunlock; + } + + /* Remove the target file, if it exists */ + + if (nip) + error = gfs2_unlink_inode(ndip, ndentry); + + if (dir_rename) { + error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); + if (error) + goto out_end_trans; + } else { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + error = gfs2_dir_del(odip, odentry); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da); + if (error) + goto out_end_trans; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipreserv: + if (da.nr_blocks) + gfs2_inplace_release(ndip); +out_gunlock_q: + if (da.nr_blocks) + gfs2_quota_unlock(ndip); +out_gunlock: + gfs2_dir_no_add(&da); + while (x--) { + gfs2_glock_dq(ghs + x); + gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (r_gh.gh_gl) + gfs2_glock_dq_uninit(&r_gh); +out: + return error; +} + +/** + * gfs2_follow_link - Follow a symbolic link + * @dentry: The dentry of the link + * @nd: Data that we pass to vfs_follow_link() + * + * This can handle symlinks of any size. + * + * Returns: 0 on success or error code + */ + +static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int size; + char *buf; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + nd_set_link(nd, ERR_PTR(error)); + return NULL; + } + + size = (unsigned int)i_size_read(&ip->i_inode); + if (size == 0) { + gfs2_consist_inode(ip); + buf = ERR_PTR(-EIO); + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) { + buf = ERR_PTR(error); + goto out; + } + + buf = kzalloc(size + 1, GFP_NOFS); + if (!buf) + buf = ERR_PTR(-ENOMEM); + else + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + nd_set_link(nd, buf); + return NULL; +} + +/** + * gfs2_permission - + * @inode: The inode + * @mask: The mask to be tested + * @flags: Indicates whether this is an RCU path walk or not + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. + * + * Returns: errno + */ + +int gfs2_permission(struct inode *inode, int mask) +{ + struct gfs2_inode *ip; + struct gfs2_holder i_gh; + int error; + int unlock = 0; + + + ip = GFS2_I(inode); + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + unlock = 1; + } + + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + error = -EACCES; + else + error = generic_permission(inode, mask); + if (unlock) + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) +{ + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +/** + * gfs2_setattr_simple - + * @ip: + * @attr: + * + * Returns: errno + */ + +int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) +{ + int error; + + if (current->journal_info) + return __gfs2_setattr_simple(inode, attr); + + error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0); + if (error) + return error; + + error = __gfs2_setattr_simple(inode, attr); + gfs2_trans_end(GFS2_SB(inode)); + return error; +} + +static int setattr_chown(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + kuid_t ouid, nuid; + kgid_t ogid, ngid; + int error; + struct gfs2_alloc_parms ap; + + ouid = inode->i_uid; + ogid = inode->i_gid; + nuid = attr->ia_uid; + ngid = attr->ia_gid; + + if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid)) + ouid = nuid = NO_UID_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) + ogid = ngid = NO_GID_QUOTA_CHANGE; + + error = get_write_access(inode); + if (error) + return error; + + error = gfs2_rs_alloc(ip); + if (error) + goto out; + + error = gfs2_rindex_update(sdp); + if (error) + goto out; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; + + ap.target = gfs2_get_inode_blocks(&ip->i_inode); + + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { + error = gfs2_quota_check(ip, nuid, ngid, &ap); + if (error) + goto out_gunlock_q; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); + if (error) + goto out_gunlock_q; + + error = gfs2_setattr_simple(inode, attr); + if (error) + goto out_end_trans; + + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { + gfs2_quota_change(ip, -ap.target, ouid, ogid); + gfs2_quota_change(ip, ap.target, nuid, ngid); + } + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock_q: + gfs2_quota_unlock(ip); +out: + put_write_access(inode); + return error; +} + +/** + * gfs2_setattr - Change attributes on an inode + * @dentry: The dentry which is changing + * @attr: The structure describing the change + * + * The VFS layer wants to change one or more of an inodes attributes. Write + * that change out to disk. + * + * Returns: errno + */ + +static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_rs_alloc(ip); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + + error = inode_change_ok(inode, attr); + if (error) + goto out; + + if (attr->ia_valid & ATTR_SIZE) + error = gfs2_setattr_size(inode, attr->ia_size); + else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) + error = setattr_chown(inode, attr); + else { + error = gfs2_setattr_simple(inode, attr); + if (!error && attr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(inode, inode->i_mode); + } + +out: + if (!error) + mark_inode_dirty(inode); + gfs2_glock_dq_uninit(&i_gh); + return error; +} + +/** + * gfs2_getattr - Read out an inode's attributes + * @mnt: The vfsmount the inode is being accessed from + * @dentry: The dentry to stat + * @stat: The inode's stats + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. Note that its the NFS + * readdirplus operation which causes this to be called (from filldir) + * with the glock already held. + * + * Returns: errno + */ + +static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + int unlock = 0; + + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + unlock = 1; + } + + generic_fillattr(inode, stat); + if (unlock) + gfs2_glock_dq_uninit(&gh); + + return 0; +} + +static int gfs2_setxattr(struct dentry *dentry, const char *name, + const void *data, size_t size, int flags) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = gfs2_rs_alloc(ip); + if (ret == 0) + ret = generic_setxattr(dentry, name, data, size, flags); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, + void *data, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + /* For selinux during lookup */ + if (gfs2_glock_is_locked_by_me(ip->i_gl)) + return generic_getxattr(dentry, name, data, size); + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_getxattr(dentry, name, data, size); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = gfs2_rs_alloc(ip); + if (ret == 0) + ret = generic_removexattr(dentry, name); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + goto out; + + if (gfs2_is_stuffed(ip)) { + u64 phys = ip->i_no_addr << inode->i_blkbits; + u64 size = i_size_read(inode); + u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE; + phys += sizeof(struct gfs2_dinode); + phys += start; + if (start + len > size) + len = size - start; + if (start < size) + ret = fiemap_fill_next_extent(fieinfo, start, phys, + len, flags); + if (ret == 1) + ret = 0; + } else { + ret = __generic_block_fiemap(inode, fieinfo, start, len, + gfs2_block_map); + } + + gfs2_glock_dq_uninit(&gh); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +const struct inode_operations gfs2_file_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, + .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, +}; + +const struct inode_operations gfs2_dir_iops = { + .create = gfs2_create, + .lookup = gfs2_lookup, + .link = gfs2_link, + .unlink = gfs2_unlink, + .symlink = gfs2_symlink, + .mkdir = gfs2_mkdir, + .rmdir = gfs2_unlink, + .mknod = gfs2_mknod, + .rename = gfs2_rename, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, + .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, + .atomic_open = gfs2_atomic_open, +}; + +const struct inode_operations gfs2_symlink_iops = { + .readlink = generic_readlink, + .follow_link = gfs2_follow_link, + .put_link = kfree_put_link, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + diff --git a/kernel/fs/gfs2/inode.h b/kernel/fs/gfs2/inode.h new file mode 100644 index 000000000..ba4d9492d --- /dev/null +++ b/kernel/fs/gfs2/inode.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __INODE_DOT_H__ +#define __INODE_DOT_H__ + +#include +#include +#include +#include "util.h" + +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_set_aops(struct inode *inode); + +static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) +{ + return !ip->i_height; +} + +static inline int gfs2_is_jdata(const struct gfs2_inode *ip) +{ + return ip->i_diskflags & GFS2_DIF_JDATA; +} + +static inline int gfs2_is_writeback(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip); +} + +static inline int gfs2_is_ordered(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip); +} + +static inline int gfs2_is_dir(const struct gfs2_inode *ip) +{ + return S_ISDIR(ip->i_inode.i_mode); +} + +static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) +{ + inode->i_blocks = blocks << + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline u64 gfs2_get_inode_blocks(const struct inode *inode) +{ + return inode->i_blocks >> + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) +{ + gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); + change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); + inode->i_blocks += change; +} + +static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, + u64 no_formal_ino) +{ + return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino; +} + +static inline void gfs2_inum_out(const struct gfs2_inode *ip, + struct gfs2_dirent *dent) +{ + dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); +} + +static inline int gfs2_check_internal_file_size(struct inode *inode, + u64 minsize, u64 maxsize) +{ + u64 size = i_size_read(inode); + if (size < minsize || size > maxsize) + goto err; + if (size & ((1 << inode->i_blkbits) - 1)) + goto err; + return 0; +err: + gfs2_consist_inode(GFS2_I(inode)); + return -EIO; +} + +extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino, + int non_block); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, + unsigned int blktype); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock); + +extern int gfs2_inode_refresh(struct gfs2_inode *ip); + +extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); +extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +extern int gfs2_open_common(struct inode *inode, struct file *file); + +extern const struct inode_operations gfs2_file_iops; +extern const struct inode_operations gfs2_dir_iops; +extern const struct inode_operations gfs2_symlink_iops; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; + +extern void gfs2_set_inode_flags(struct inode *inode); + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return sdp->sd_args.ar_localflocks; +} +#else /* Single node only */ +#define gfs2_file_fops gfs2_file_fops_nolock +#define gfs2_dir_fops gfs2_dir_fops_nolock + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return 1; +} +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +#endif /* __INODE_DOT_H__ */ + diff --git a/kernel/fs/gfs2/lock_dlm.c b/kernel/fs/gfs2/lock_dlm.c new file mode 100644 index 000000000..641383a9c --- /dev/null +++ b/kernel/fs/gfs2/lock_dlm.c @@ -0,0 +1,1336 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright 2004-2011 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "incore.h" +#include "glock.h" +#include "util.h" +#include "sys.h" +#include "trace_gfs2.h" + +extern struct workqueue_struct *gfs2_control_wq; + +/** + * gfs2_update_stats - Update time based stats + * @mv: Pointer to mean/variance structure to update + * @sample: New data to include + * + * @delta is the difference between the current rtt sample and the + * running average srtt. We add 1/8 of that to the srtt in order to + * update the current srtt estimate. The varience estimate is a bit + * more complicated. We subtract the abs value of the @delta from + * the current variance estimate and add 1/4 of that to the running + * total. + * + * Note that the index points at the array entry containing the smoothed + * mean value, and the variance is always in the following entry + * + * Reference: TCP/IP Illustrated, vol 2, p. 831,832 + * All times are in units of integer nanoseconds. Unlike the TCP/IP case, + * they are not scaled fixed point. + */ + +static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, + s64 sample) +{ + s64 delta = sample - s->stats[index]; + s->stats[index] += (delta >> 3); + index++; + s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2); +} + +/** + * gfs2_update_reply_times - Update locking statistics + * @gl: The glock to update + * + * This assumes that gl->gl_dstamp has been set earlier. + * + * The rtt (lock round trip time) is an estimate of the time + * taken to perform a dlm lock request. We update it on each + * reply from the dlm. + * + * The blocking flag is set on the glock for all dlm requests + * which may potentially block due to lock requests from other nodes. + * DLM requests where the current lock state is exclusive, the + * requested state is null (or unlocked) or where the TRY or + * TRY_1CB flags are set are classified as non-blocking. All + * other DLM requests are counted as (potentially) blocking. + */ +static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +{ + struct gfs2_pcpu_lkstats *lks; + const unsigned gltype = gl->gl_name.ln_type; + unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? + GFS2_LKS_SRTTB : GFS2_LKS_SRTT; + s64 rtt; + + preempt_disable(); + rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp)); + lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); + gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */ + gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */ + preempt_enable(); + + trace_gfs2_glock_lock_time(gl, rtt); +} + +/** + * gfs2_update_request_times - Update locking statistics + * @gl: The glock to update + * + * The irt (lock inter-request times) measures the average time + * between requests to the dlm. It is updated immediately before + * each dlm call. + */ + +static inline void gfs2_update_request_times(struct gfs2_glock *gl) +{ + struct gfs2_pcpu_lkstats *lks; + const unsigned gltype = gl->gl_name.ln_type; + ktime_t dstamp; + s64 irt; + + preempt_disable(); + dstamp = gl->gl_dstamp; + gl->gl_dstamp = ktime_get_real(); + irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp)); + lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); + gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */ + gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */ + preempt_enable(); +} + +static void gdlm_ast(void *arg) +{ + struct gfs2_glock *gl = arg; + unsigned ret = gl->gl_state; + + gfs2_update_reply_times(gl); + BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); + + if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) + memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); + + switch (gl->gl_lksb.sb_status) { + case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ + gfs2_glock_free(gl); + return; + case -DLM_ECANCEL: /* Cancel while getting lock */ + ret |= LM_OUT_CANCELED; + goto out; + case -EAGAIN: /* Try lock fails */ + case -EDEADLK: /* Deadlock detected */ + goto out; + case -ETIMEDOUT: /* Canceled due to timeout */ + ret |= LM_OUT_ERROR; + goto out; + case 0: /* Success */ + break; + default: /* Something unexpected */ + BUG(); + } + + ret = gl->gl_req; + if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { + if (gl->gl_req == LM_ST_SHARED) + ret = LM_ST_DEFERRED; + else if (gl->gl_req == LM_ST_DEFERRED) + ret = LM_ST_SHARED; + else + BUG(); + } + + set_bit(GLF_INITIAL, &gl->gl_flags); + gfs2_glock_complete(gl, ret); + return; +out: + if (!test_bit(GLF_INITIAL, &gl->gl_flags)) + gl->gl_lksb.sb_lkid = 0; + gfs2_glock_complete(gl, ret); +} + +static void gdlm_bast(void *arg, int mode) +{ + struct gfs2_glock *gl = arg; + + switch (mode) { + case DLM_LOCK_EX: + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + break; + case DLM_LOCK_CW: + gfs2_glock_cb(gl, LM_ST_DEFERRED); + break; + case DLM_LOCK_PR: + gfs2_glock_cb(gl, LM_ST_SHARED); + break; + default: + pr_err("unknown bast mode %d\n", mode); + BUG(); + } +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int make_mode(const unsigned int lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + pr_err("unknown LM state %d\n", lmstate); + BUG(); + return -1; +} + +static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, + const int req) +{ + u32 lkf = 0; + + if (gl->gl_lksb.sb_lvbptr) + lkf |= DLM_LKF_VALBLK; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags & LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + else + BUG(); + } + + if (gl->gl_lksb.sb_lkid != 0) { + lkf |= DLM_LKF_CONVERT; + if (test_bit(GLF_BLOCKING, &gl->gl_flags)) + lkf |= DLM_LKF_QUECVT; + } + + return lkf; +} + +static void gfs2_reverse_hex(char *c, u64 value) +{ + *c = '0'; + while (value) { + *c-- = hex_asc[value & 0x0f]; + value >>= 4; + } +} + +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + int req; + u32 lkf; + char strname[GDLM_STRNAME_BYTES] = ""; + + req = make_mode(req_state); + lkf = make_flags(gl, flags, req); + gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); + if (gl->gl_lksb.sb_lkid) { + gfs2_update_request_times(gl); + } else { + memset(strname, ' ', GDLM_STRNAME_BYTES - 1); + strname[GDLM_STRNAME_BYTES - 1] = '\0'; + gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type); + gfs2_reverse_hex(strname + 23, gl->gl_name.ln_number); + gl->gl_dstamp = ktime_get_real(); + } + /* + * Submit the actual lock request. + */ + + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); +} + +static void gdlm_put_lock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int lvb_needs_unlock = 0; + int error; + + if (gl->gl_lksb.sb_lkid == 0) { + gfs2_glock_free(gl); + return; + } + + clear_bit(GLF_BLOCKING, &gl->gl_flags); + gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_update_request_times(gl); + + /* don't want to skip dlm_unlock writing the lvb when lock is ex */ + + if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) + lvb_needs_unlock = 1; + + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && + !lvb_needs_unlock) { + gfs2_glock_free(gl); + return; + } + + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + NULL, gl); + if (error) { + pr_err("gdlm_unlock %x,%llx err=%d\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, error); + return; + } +} + +static void gdlm_cancel(struct gfs2_glock *gl) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); +} + +/* + * dlm/gfs2 recovery coordination using dlm_recover callbacks + * + * 1. dlm_controld sees lockspace members change + * 2. dlm_controld blocks dlm-kernel locking activity + * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep) + * 4. dlm_controld starts and finishes its own user level recovery + * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery + * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot) + * 7. dlm_recoverd does its own lock recovery + * 8. dlm_recoverd unblocks dlm-kernel locking activity + * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation) + * 10. gfs2_control updates control_lock lvb with new generation and jid bits + * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none) + * 12. gfs2_recover dequeues and recovers journals of failed nodes + * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result) + * 14. gfs2_control updates control_lock lvb jid bits for recovered journals + * 15. gfs2_control unblocks normal locking when all journals are recovered + * + * - failures during recovery + * + * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control + * clears BLOCK_LOCKS (step 15), e.g. another node fails while still + * recovering for a prior failure. gfs2_control needs a way to detect + * this so it can leave BLOCK_LOCKS set in step 15. This is managed using + * the recover_block and recover_start values. + * + * recover_done() provides a new lockspace generation number each time it + * is called (step 9). This generation number is saved as recover_start. + * When recover_prep() is called, it sets BLOCK_LOCKS and sets + * recover_block = recover_start. So, while recover_block is equal to + * recover_start, BLOCK_LOCKS should remain set. (recover_spin must + * be held around the BLOCK_LOCKS/recover_block/recover_start logic.) + * + * - more specific gfs2 steps in sequence above + * + * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start + * 6. recover_slot records any failed jids (maybe none) + * 9. recover_done sets recover_start = new generation number + * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids + * 12. gfs2_recover does journal recoveries for failed jids identified above + * 14. gfs2_control clears control_lock lvb bits for recovered jids + * 15. gfs2_control checks if recover_block == recover_start (step 3 occured + * again) then do nothing, otherwise if recover_start > recover_block + * then clear BLOCK_LOCKS. + * + * - parallel recovery steps across all nodes + * + * All nodes attempt to update the control_lock lvb with the new generation + * number and jid bits, but only the first to get the control_lock EX will + * do so; others will see that it's already done (lvb already contains new + * generation number.) + * + * . All nodes get the same recover_prep/recover_slot/recover_done callbacks + * . All nodes attempt to set control_lock lvb gen + bits for the new gen + * . One node gets control_lock first and writes the lvb, others see it's done + * . All nodes attempt to recover jids for which they see control_lock bits set + * . One node succeeds for a jid, and that one clears the jid bit in the lvb + * . All nodes will eventually see all lvb bits clear and unblock locks + * + * - is there a problem with clearing an lvb bit that should be set + * and missing a journal recovery? + * + * 1. jid fails + * 2. lvb bit set for step 1 + * 3. jid recovered for step 1 + * 4. jid taken again (new mount) + * 5. jid fails (for step 4) + * 6. lvb bit set for step 5 (will already be set) + * 7. lvb bit cleared for step 3 + * + * This is not a problem because the failure in step 5 does not + * require recovery, because the mount in step 4 could not have + * progressed far enough to unblock locks and access the fs. The + * control_mount() function waits for all recoveries to be complete + * for the latest lockspace generation before ever unblocking locks + * and returning. The mount in step 4 waits until the recovery in + * step 1 is done. + * + * - special case of first mounter: first node to mount the fs + * + * The first node to mount a gfs2 fs needs to check all the journals + * and recover any that need recovery before other nodes are allowed + * to mount the fs. (Others may begin mounting, but they must wait + * for the first mounter to be done before taking locks on the fs + * or accessing the fs.) This has two parts: + * + * 1. The mounted_lock tells a node it's the first to mount the fs. + * Each node holds the mounted_lock in PR while it's mounted. + * Each node tries to acquire the mounted_lock in EX when it mounts. + * If a node is granted the mounted_lock EX it means there are no + * other mounted nodes (no PR locks exist), and it is the first mounter. + * The mounted_lock is demoted to PR when first recovery is done, so + * others will fail to get an EX lock, but will get a PR lock. + * + * 2. The control_lock blocks others in control_mount() while the first + * mounter is doing first mount recovery of all journals. + * A mounting node needs to acquire control_lock in EX mode before + * it can proceed. The first mounter holds control_lock in EX while doing + * the first mount recovery, blocking mounts from other nodes, then demotes + * control_lock to NL when it's done (others_may_mount/first_done), + * allowing other nodes to continue mounting. + * + * first mounter: + * control_lock EX/NOQUEUE success + * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters) + * set first=1 + * do first mounter recovery + * mounted_lock EX->PR + * control_lock EX->NL, write lvb generation + * + * other mounter: + * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry) + * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR) + * mounted_lock PR/NOQUEUE success + * read lvb generation + * control_lock EX->NL + * set first=0 + * + * - mount during recovery + * + * If a node mounts while others are doing recovery (not first mounter), + * the mounting node will get its initial recover_done() callback without + * having seen any previous failures/callbacks. + * + * It must wait for all recoveries preceding its mount to be finished + * before it unblocks locks. It does this by repeating the "other mounter" + * steps above until the lvb generation number is >= its mount generation + * number (from initial recover_done) and all lvb bits are clear. + * + * - control_lock lvb format + * + * 4 bytes generation number: the latest dlm lockspace generation number + * from recover_done callback. Indicates the jid bitmap has been updated + * to reflect all slot failures through that generation. + * 4 bytes unused. + * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates + * that jid N needs recovery. + */ + +#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */ + +static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, + char *lvb_bits) +{ + __le32 gen; + memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); + memcpy(&gen, lvb_bits, sizeof(__le32)); + *lvb_gen = le32_to_cpu(gen); +} + +static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, + char *lvb_bits) +{ + __le32 gen; + memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); + gen = cpu_to_le32(lvb_gen); + memcpy(ls->ls_control_lvb, &gen, sizeof(__le32)); +} + +static int all_jid_bits_clear(char *lvb) +{ + return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0, + GDLM_LVB_SIZE - JID_BITMAP_OFFSET); +} + +static void sync_wait_cb(void *arg) +{ + struct lm_lockstruct *ls = arg; + complete(&ls->ls_sync_wait); +} + +static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + if (error) { + fs_err(sdp, "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + fs_err(sdp, "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, + unsigned int num, struct dlm_lksb *lksb, char *name) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char strname[GDLM_STRNAME_BYTES]; + int error, status; + + memset(strname, 0, GDLM_STRNAME_BYTES); + snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); + + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + if (error) { + fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + +static int mounted_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK, + &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int control_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock"); +} + +static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK, + &ls->ls_control_lksb, "control_lock"); +} + +static void gfs2_control_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t block_gen, start_gen, lvb_gen, flags; + int recover_set = 0; + int write_lvb = 0; + int recover_size; + int i, error; + + spin_lock(&ls->ls_recover_spin); + /* + * No MOUNT_DONE means we're still mounting; control_mount() + * will set this flag, after which this thread will take over + * all further clearing of BLOCK_LOCKS. + * + * FIRST_MOUNT means this node is doing first mounter recovery, + * for which recovery control is handled by + * control_mount()/control_first_done(), not this thread. + */ + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + spin_unlock(&ls->ls_recover_spin); + + /* + * Equal block_gen and start_gen implies we are between + * recover_prep and recover_done callbacks, which means + * dlm recovery is in progress and dlm locking is blocked. + * There's no point trying to do any work until recover_done. + */ + + if (block_gen == start_gen) + return; + + /* + * Propagate recover_submit[] and recover_result[] to lvb: + * dlm_recoverd adds to recover_submit[] jids needing recovery + * gfs2_recover adds to recover_result[] journal recovery results + * + * set lvb bit for jids in recover_submit[] if the lvb has not + * yet been updated for the generation of the failure + * + * clear lvb bit for jids in recover_result[] if the result of + * the journal recovery is SUCCESS + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control lock EX error %d\n", error); + return; + } + + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); + + spin_lock(&ls->ls_recover_spin); + if (block_gen != ls->ls_recover_block || + start_gen != ls->ls_recover_start) { + fs_info(sdp, "recover generation %u block1 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + return; + } + + recover_size = ls->ls_recover_size; + + if (lvb_gen <= start_gen) { + /* + * Clear lvb bits for jids we've successfully recovered. + * Because all nodes attempt to recover failed journals, + * a journal can be recovered multiple times successfully + * in succession. Only the first will really do recovery, + * the others find it clean, but still report a successful + * recovery. So, another node may have already recovered + * the jid and cleared the lvb bit for it. + */ + for (i = 0; i < recover_size; i++) { + if (ls->ls_recover_result[i] != LM_RD_SUCCESS) + continue; + + ls->ls_recover_result[i] = 0; + + if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) + continue; + + __clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); + write_lvb = 1; + } + } + + if (lvb_gen == start_gen) { + /* + * Failed slots before start_gen are already set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < lvb_gen) + ls->ls_recover_submit[i] = 0; + } + } else if (lvb_gen < start_gen) { + /* + * Failed slots before start_gen are not yet set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < start_gen) { + ls->ls_recover_submit[i] = 0; + __set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); + } + } + /* even if there are no bits to set, we need to write the + latest generation to the lvb */ + write_lvb = 1; + } else { + /* + * we should be getting a recover_done() for lvb_gen soon + */ + } + spin_unlock(&ls->ls_recover_spin); + + if (write_lvb) { + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); + flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; + } else { + flags = DLM_LKF_CONVERT; + } + + error = control_lock(sdp, DLM_LOCK_NL, flags); + if (error) { + fs_err(sdp, "control lock NL error %d\n", error); + return; + } + + /* + * Everyone will see jid bits set in the lvb, run gfs2_recover_set(), + * and clear a jid bit in the lvb if the recovery is a success. + * Eventually all journals will be recovered, all jid bits will + * be cleared in the lvb, and everyone will clear BLOCK_LOCKS. + */ + + for (i = 0; i < recover_size; i++) { + if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) { + fs_info(sdp, "recover generation %u jid %d\n", + start_gen, i); + gfs2_recover_set(sdp, i); + recover_set++; + } + } + if (recover_set) + return; + + /* + * No more jid bits set in lvb, all recovery is done, unblock locks + * (unless a new recover_prep callback has occured blocking locks + * again while working above) + */ + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_block == block_gen && + ls->ls_recover_start == start_gen) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "recover generation %u done\n", start_gen); + gfs2_glock_thaw(sdp); + } else { + fs_info(sdp, "recover generation %u block2 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + } +} + +static int control_mount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t start_gen, block_gen, mount_gen, lvb_gen; + int mounted_mode; + int retries = 0; + int error; + + memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE); + ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb; + init_completion(&ls->ls_sync_wait); + + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control_mount control_lock NL error %d\n", error); + return error; + } + + error = mounted_lock(sdp, DLM_LOCK_NL, 0); + if (error) { + fs_err(sdp, "control_mount mounted_lock NL error %d\n", error); + control_unlock(sdp); + return error; + } + mounted_mode = DLM_LOCK_NL; + +restart: + if (retries++ && signal_pending(current)) { + error = -EINTR; + goto fail; + } + + /* + * We always start with both locks in NL. control_lock is + * demoted to NL below so we don't need to do it here. + */ + + if (mounted_mode != DLM_LOCK_NL) { + error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + mounted_mode = DLM_LOCK_NL; + } + + /* + * Other nodes need to do some work in dlm recovery and gfs2_control + * before the recover_done and control_lock will be ready for us below. + * A delay here is not required but often avoids having to retry. + */ + + msleep_interruptible(500); + + /* + * Acquire control_lock in EX and mounted_lock in either EX or PR. + * control_lock lvb keeps track of any pending journal recoveries. + * mounted_lock indicates if any other nodes have the fs mounted. + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK); + if (error == -EAGAIN) { + goto restart; + } else if (error) { + fs_err(sdp, "control_mount control_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_EX; + goto locks_done; + } else if (error != -EAGAIN) { + fs_err(sdp, "control_mount mounted_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_PR; + goto locks_done; + } else { + /* not even -EAGAIN should happen here */ + fs_err(sdp, "control_mount mounted_lock PR error %d\n", error); + goto fail; + } + +locks_done: + /* + * If we got both locks above in EX, then we're the first mounter. + * If not, then we need to wait for the control_lock lvb to be + * updated by other mounted nodes to reflect our mount generation. + * + * In simple first mounter cases, first mounter will see zero lvb_gen, + * but in cases where all existing nodes leave/fail before mounting + * nodes finish control_mount, then all nodes will be mounting and + * lvb_gen will be non-zero. + */ + + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); + + if (lvb_gen == 0xFFFFFFFF) { + /* special value to force mount attempts to fail */ + fs_err(sdp, "control_mount control_lock disabled\n"); + error = -EINVAL; + goto fail; + } + + if (mounted_mode == DLM_LOCK_EX) { + /* first mounter, keep both EX while doing first recovery */ + spin_lock(&ls->ls_recover_spin); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "first mounter control generation %u\n", lvb_gen); + return 0; + } + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + + /* + * We are not first mounter, now we need to wait for the control_lock + * lvb generation to be >= the generation from our first recover_done + * and all lvb bits to be clear (no pending journal recoveries.) + */ + + if (!all_jid_bits_clear(ls->ls_lvb_bits)) { + /* journals need recovery, wait until all are clear */ + fs_info(sdp, "control_mount wait for journal recovery\n"); + goto restart; + } + + spin_lock(&ls->ls_recover_spin); + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + mount_gen = ls->ls_recover_mount; + + if (lvb_gen < mount_gen) { + /* wait for mounted nodes to update control_lock lvb to our + generation, which might include new recovery bits set */ + fs_info(sdp, "control_mount wait1 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (lvb_gen != start_gen) { + /* wait for mounted nodes to update control_lock lvb to the + latest recovery generation */ + fs_info(sdp, "control_mount wait2 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (block_gen == start_gen) { + /* dlm recovery in progress, wait for it to finish */ + fs_info(sdp, "control_mount wait3 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + return 0; + +fail: + mounted_unlock(sdp); + control_unlock(sdp); + return error; +} + +static int control_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t start_gen, block_gen; + int error; + +restart: + spin_lock(&ls->ls_recover_spin); + start_gen = ls->ls_recover_start; + block_gen = ls->ls_recover_block; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) || + !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + /* sanity check, should not happen */ + fs_err(sdp, "control_first_done start %u block %u flags %lx\n", + start_gen, block_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + control_unlock(sdp); + return -1; + } + + if (start_gen == block_gen) { + /* + * Wait for the end of a dlm recovery cycle to switch from + * first mounter recovery. We can ignore any recover_slot + * callbacks between the recover_prep and next recover_done + * because we are still the first mounter and any failed nodes + * have not fully mounted, so they don't need recovery. + */ + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "control_first_done wait gen %u\n", start_gen); + + wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, + TASK_UNINTERRUPTIBLE); + goto restart; + } + + clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + + memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); + if (error) + fs_err(sdp, "control_first_done mounted PR error %d\n", error); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) + fs_err(sdp, "control_first_done control NL error %d\n", error); + + return error; +} + +/* + * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC) + * to accomodate the largest slot number. (NB dlm slot numbers start at 1, + * gfs2 jids start at 0, so jid = slot - 1) + */ + +#define RECOVER_SIZE_INC 16 + +static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, + int num_slots) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t *submit = NULL; + uint32_t *result = NULL; + uint32_t old_size, new_size; + int i, max_jid; + + if (!ls->ls_lvb_bits) { + ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); + if (!ls->ls_lvb_bits) + return -ENOMEM; + } + + max_jid = 0; + for (i = 0; i < num_slots; i++) { + if (max_jid < slots[i].slot - 1) + max_jid = slots[i].slot - 1; + } + + old_size = ls->ls_recover_size; + + if (old_size >= max_jid + 1) + return 0; + + new_size = old_size + RECOVER_SIZE_INC; + + submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + if (!submit || !result) { + kfree(submit); + kfree(result); + return -ENOMEM; + } + + spin_lock(&ls->ls_recover_spin); + memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t)); + memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t)); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = submit; + ls->ls_recover_result = result; + ls->ls_recover_size = new_size; + spin_unlock(&ls->ls_recover_spin); + return 0; +} + +static void free_recover_size(struct lm_lockstruct *ls) +{ + kfree(ls->ls_lvb_bits); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_recover_size = 0; +} + +/* dlm calls before it does lock recovery */ + +static void gdlm_recover_prep(void *arg) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_block = ls->ls_recover_start; + set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_prep has been completed on all lockspace members; + identifies slot/jid of failed member */ + +static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int jid = slot->slot - 1; + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recover_slot jid %d gen %u short size %d", + jid, ls->ls_recover_block, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + if (ls->ls_recover_submit[jid]) { + fs_info(sdp, "recover_slot jid %d gen %u prev %u\n", + jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); + } + ls->ls_recover_submit[jid] = ls->ls_recover_block; + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_slot and after it completes lock recovery */ + +static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, + int our_slot, uint32_t generation) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + /* ensure the ls jid arrays are large enough */ + set_recover_size(sdp, slots, num_slots); + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_start = generation; + + if (!ls->ls_recover_mount) { + ls->ls_recover_mount = generation; + ls->ls_jid = our_slot - 1; + } + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); + + clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + smp_mb__after_atomic(); + wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); + spin_unlock(&ls->ls_recover_spin); +} + +/* gfs2_recover thread has a journal recovery result */ + +static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + /* don't care about the recovery of own journal during mount */ + if (jid == ls->ls_jid) + return; + + spin_lock(&ls->ls_recover_spin); + if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recovery_result jid %d short size %d", + jid, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + fs_info(sdp, "recover jid %d result %s\n", jid, + result == LM_RD_GAVEUP ? "busy" : "success"); + + ls->ls_recover_result[jid] = result; + + /* GAVEUP means another node is recovering the journal; delay our + next attempt to recover it, to give the other node a chance to + finish before trying again */ + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, + result == LM_RD_GAVEUP ? HZ : 0); + spin_unlock(&ls->ls_recover_spin); +} + +const struct dlm_lockspace_ops gdlm_lockspace_ops = { + .recover_prep = gdlm_recover_prep, + .recover_slot = gdlm_recover_slot, + .recover_done = gdlm_recover_done, +}; + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char cluster[GFS2_LOCKNAME_LEN]; + const char *fsname; + uint32_t flags; + int error, ops_result; + + /* + * initialize everything + */ + + INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + spin_lock_init(&ls->ls_recover_spin); + ls->ls_recover_flags = 0; + ls->ls_recover_mount = 0; + ls->ls_recover_start = 0; + ls->ls_recover_block = 0; + ls->ls_recover_size = 0; + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_lvb_bits = NULL; + + error = set_recover_size(sdp, NULL, 0); + if (error) + goto fail; + + /* + * prepare dlm_new_lockspace args + */ + + fsname = strchr(table, ':'); + if (!fsname) { + fs_info(sdp, "no fsname found\n"); + error = -EINVAL; + goto fail_free; + } + memset(cluster, 0, sizeof(cluster)); + memcpy(cluster, table, strlen(table) - strlen(fsname)); + fsname++; + + flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + + /* + * create/join lockspace + */ + + error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, + &gdlm_lockspace_ops, sdp, &ops_result, + &ls->ls_dlm); + if (error) { + fs_err(sdp, "dlm_new_lockspace error %d\n", error); + goto fail_free; + } + + if (ops_result < 0) { + /* + * dlm does not support ops callbacks, + * old dlm_controld/gfs_controld are used, try without ops. + */ + fs_info(sdp, "dlm lockspace ops not used\n"); + free_recover_size(ls); + set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags); + return 0; + } + + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) { + fs_err(sdp, "dlm lockspace ops disallow jid preset\n"); + error = -EINVAL; + goto fail_release; + } + + /* + * control_mount() uses control_lock to determine first mounter, + * and for later mounts, waits for any recoveries to be cleared. + */ + + error = control_mount(sdp); + if (error) { + fs_err(sdp, "mount control error %d\n", error); + goto fail_release; + } + + ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_atomic(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + return 0; + +fail_release: + dlm_release_lockspace(ls->ls_dlm, 2); +fail_free: + free_recover_size(ls); +fail: + return error; +} + +static void gdlm_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + error = control_first_done(sdp); + if (error) + fs_err(sdp, "mount first_done error %d\n", error); +} + +static void gdlm_unmount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + goto release; + + /* wait for gfs2_control_wq to be done with this mount */ + + spin_lock(&ls->ls_recover_spin); + set_bit(DFL_UNMOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + flush_delayed_work(&sdp->sd_control_work); + + /* mounted_lock and control_lock will be purged in dlm recovery */ +release: + if (ls->ls_dlm) { + dlm_release_lockspace(ls->ls_dlm, 2); + ls->ls_dlm = NULL; + } + + free_recover_size(ls); +} + +static const match_table_t dlm_tokens = { + { Opt_jid, "jid=%d"}, + { Opt_id, "id=%d"}, + { Opt_first, "first=%d"}, + { Opt_nodir, "nodir=%d"}, + { Opt_err, NULL }, +}; + +const struct lm_lockops gfs2_dlm_ops = { + .lm_proto_name = "lock_dlm", + .lm_mount = gdlm_mount, + .lm_first_done = gdlm_first_done, + .lm_recovery_result = gdlm_recovery_result, + .lm_unmount = gdlm_unmount, + .lm_put_lock = gdlm_put_lock, + .lm_lock = gdlm_lock, + .lm_cancel = gdlm_cancel, + .lm_tokens = &dlm_tokens, +}; + diff --git a/kernel/fs/gfs2/log.c b/kernel/fs/gfs2/log.c new file mode 100644 index 000000000..536e7a625 --- /dev/null +++ b/kernel/fs/gfs2/log.c @@ -0,0 +1,950 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "util.h" +#include "dir.h" +#include "trace_gfs2.h" + +/** + * gfs2_struct2blk - compute stuff + * @sdp: the filesystem + * @nstruct: the number of structures + * @ssize: the size of the structures + * + * Compute the number of log descriptor blocks needed to hold a certain number + * of structures of a certain size. + * + * Returns: the number of blocks needed (minimum is always 1) + */ + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize) +{ + unsigned int blks; + unsigned int first, second; + + blks = 1; + first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize; + + if (nstruct > first) { + second = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / ssize; + blks += DIV_ROUND_UP(nstruct - first, second); + } + + return blks; +} + +/** + * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters + * @mapping: The associated mapping (maybe NULL) + * @bd: The gfs2_bufdata to remove + * + * The ail lock _must_ be held when calling this function + * + */ + +void gfs2_remove_from_ail(struct gfs2_bufdata *bd) +{ + bd->bd_tr = NULL; + list_del_init(&bd->bd_ail_st_list); + list_del_init(&bd->bd_ail_gl_list); + atomic_dec(&bd->bd_gl->gl_ail_count); + brelse(bd->bd_bh); +} + +/** + * gfs2_ail1_start_one - Start I/O on a part of the AIL + * @sdp: the filesystem + * @wbc: The writeback control structure + * @ai: The ail structure + * + */ + +static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, + struct writeback_control *wbc, + struct gfs2_trans *tr) +__releases(&sdp->sd_ail_lock) +__acquires(&sdp->sd_ail_lock) +{ + struct gfs2_glock *gl = NULL; + struct address_space *mapping; + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + + gfs2_assert(sdp, bd->bd_tr == tr); + + if (!buffer_busy(bh)) { + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); + continue; + } + + if (!buffer_dirty(bh)) + continue; + if (gl == bd->bd_gl) + continue; + gl = bd->bd_gl; + list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); + mapping = bh->b_page->mapping; + if (!mapping) + continue; + spin_unlock(&sdp->sd_ail_lock); + generic_writepages(mapping, wbc); + spin_lock(&sdp->sd_ail_lock); + if (wbc->nr_to_write <= 0) + break; + return 1; + } + + return 0; +} + + +/** + * gfs2_ail1_flush - start writeback of some ail1 entries + * @sdp: The super block + * @wbc: The writeback control structure + * + * Writes back some ail1 entries, according to the limits in the + * writeback control structure + */ + +void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) +{ + struct list_head *head = &sdp->sd_ail1_list; + struct gfs2_trans *tr; + struct blk_plug plug; + + trace_gfs2_ail_flush(sdp, wbc, 1); + blk_start_plug(&plug); + spin_lock(&sdp->sd_ail_lock); +restart: + list_for_each_entry_reverse(tr, head, tr_list) { + if (wbc->nr_to_write <= 0) + break; + if (gfs2_ail1_start_one(sdp, wbc, tr)) + goto restart; + } + spin_unlock(&sdp->sd_ail_lock); + blk_finish_plug(&plug); + trace_gfs2_ail_flush(sdp, wbc, 0); +} + +/** + * gfs2_ail1_start - start writeback of all ail1 entries + * @sdp: The superblock + */ + +static void gfs2_ail1_start(struct gfs2_sbd *sdp) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + return gfs2_ail1_flush(sdp, &wbc); +} + +/** + * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + gfs2_assert(sdp, bd->bd_tr == tr); + if (buffer_busy(bh)) + continue; + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); + } + +} + +/** + * gfs2_ail1_empty - Try to empty the ail1 lists + * @sdp: The superblock + * + * Tries to empty the ail1 lists, starting with the oldest first + */ + +static int gfs2_ail1_empty(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr, *s; + int oldest_tr = 1; + int ret; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { + gfs2_ail1_empty_one(sdp, tr); + if (list_empty(&tr->tr_ail1_list) && oldest_tr) + list_move(&tr->tr_list, &sdp->sd_ail2_list); + else + oldest_tr = 0; + } + ret = list_empty(&sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + + return ret; +} + +static void gfs2_ail1_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + if (!buffer_locked(bh)) + continue; + get_bh(bh); + spin_unlock(&sdp->sd_ail_lock); + wait_on_buffer(bh); + brelse(bh); + return; + } + } + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head = &tr->tr_ail2_list; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->prev, struct gfs2_bufdata, + bd_ail_st_list); + gfs2_assert(sdp, bd->bd_tr == tr); + gfs2_remove_from_ail(bd); + } +} + +static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + struct gfs2_trans *tr, *safe; + unsigned int old_tail = sdp->sd_log_tail; + int wrap = (new_tail < old_tail); + int a, b, rm; + + spin_lock(&sdp->sd_ail_lock); + + list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) { + a = (old_tail <= tr->tr_first); + b = (tr->tr_first < new_tail); + rm = (wrap) ? (a || b) : (a && b); + if (!rm) + continue; + + gfs2_ail2_empty_one(sdp, tr); + list_del(&tr->tr_list); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + kfree(tr); + } + + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); +} + +/** + * gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * + * Note that we never give out the last few blocks of the journal. Thats + * due to the fact that there is a small number of header blocks + * associated with each log flush. The exact number can't be known until + * flush time, so we ensure that we have just enough free blocks at all + * times to avoid running out during a log flush. + * + * We no longer flush the log here, instead we wake up logd to do that + * for us. To avoid the thundering herd and to ensure that we deal fairly + * with queued waiters, we use an exclusive wait. This means that when we + * get woken with enough journal space to get our reservation, we need to + * wake the next waiter on the list. + * + * Returns: errno + */ + +int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) +{ + int ret = 0; + unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned wanted = blks + reserved_blks; + DEFINE_WAIT(wait); + int did_wait = 0; + unsigned int free_blocks; + + if (gfs2_assert_warn(sdp, blks) || + gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) + return -EINVAL; +retry: + free_blocks = atomic_read(&sdp->sd_log_blks_free); + if (unlikely(free_blocks <= wanted)) { + do { + prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sdp->sd_logd_waitq); + did_wait = 1; + if (atomic_read(&sdp->sd_log_blks_free) <= wanted) + io_schedule(); + free_blocks = atomic_read(&sdp->sd_log_blks_free); + } while(free_blocks <= wanted); + finish_wait(&sdp->sd_log_waitq, &wait); + } + atomic_inc(&sdp->sd_reserving_log); + if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, + free_blocks - blks) != free_blocks) { + if (atomic_dec_and_test(&sdp->sd_reserving_log)) + wake_up(&sdp->sd_reserving_log_wait); + goto retry; + } + trace_gfs2_log_blocks(sdp, -blks); + + /* + * If we waited, then so might others, wake them up _after_ we get + * our share of the log. + */ + if (unlikely(did_wait)) + wake_up(&sdp->sd_log_waitq); + + down_read(&sdp->sd_log_flush_lock); + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + gfs2_log_release(sdp, blks); + ret = -EROFS; + } + if (atomic_dec_and_test(&sdp->sd_reserving_log)) + wake_up(&sdp->sd_reserving_log_wait); + return ret; +} + +/** + * log_distance - Compute distance between two journal blocks + * @sdp: The GFS2 superblock + * @newer: The most recent journal block of the pair + * @older: The older journal block of the pair + * + * Compute the distance (in the journal direction) between two + * blocks in the journal + * + * Returns: the distance in blocks + */ + +static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer, + unsigned int older) +{ + int dist; + + dist = newer - older; + if (dist < 0) + dist += sdp->sd_jdesc->jd_blocks; + + return dist; +} + +/** + * calc_reserved - Calculate the number of blocks to reserve when + * refunding a transaction's unused buffers. + * @sdp: The GFS2 superblock + * + * This is complex. We need to reserve room for all our currently used + * metadata buffers (e.g. normal file I/O rewriting file time stamps) and + * all our journaled data buffers for journaled files (e.g. files in the + * meta_fs like rindex, or files for which chattr +j was done.) + * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush + * will count it as free space (sd_log_blks_free) and corruption will follow. + * + * We can have metadata bufs and jdata bufs in the same journal. So each + * type gets its own log header, for which we need to reserve a block. + * In fact, each type has the potential for needing more than one header + * in cases where we have more buffers than will fit on a journal page. + * Metadata journal entries take up half the space of journaled buffer entries. + * Thus, metadata entries have buf_limit (502) and journaled buffers have + * databuf_limit (251) before they cause a wrap around. + * + * Also, we need to reserve blocks for revoke journal entries and one for an + * overall header for the lot. + * + * Returns: the number of blocks reserved + */ +static unsigned int calc_reserved(struct gfs2_sbd *sdp) +{ + unsigned int reserved = 0; + unsigned int mbuf; + unsigned int dbuf; + struct gfs2_trans *tr = sdp->sd_log_tr; + + if (tr) { + mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + reserved = mbuf + dbuf; + /* Account for header blocks */ + reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp)); + reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); + } + + if (sdp->sd_log_commited_revoke > 0) + reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + sizeof(u64)); + /* One for the overall header */ + if (reserved) + reserved++; + return reserved; +} + +static unsigned int current_tail(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + unsigned int tail; + + spin_lock(&sdp->sd_ail_lock); + + if (list_empty(&sdp->sd_ail1_list)) { + tail = sdp->sd_log_head; + } else { + tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans, + tr_list); + tail = tr->tr_first; + } + + spin_unlock(&sdp->sd_ail_lock); + + return tail; +} + +static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + + ail2_empty(sdp, new_tail); + + atomic_add(dist, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, dist); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + + sdp->sd_log_tail = new_tail; +} + + +static void log_flush_wait(struct gfs2_sbd *sdp) +{ + DEFINE_WAIT(wait); + + if (atomic_read(&sdp->sd_log_in_flight)) { + do { + prepare_to_wait(&sdp->sd_log_flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&sdp->sd_log_in_flight)) + io_schedule(); + } while(atomic_read(&sdp->sd_log_in_flight)); + finish_wait(&sdp->sd_log_flush_wait, &wait); + } +} + +static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_inode *ipa, *ipb; + + ipa = list_entry(a, struct gfs2_inode, i_ordered); + ipb = list_entry(b, struct gfs2_inode, i_ordered); + + if (ipa->i_no_addr < ipb->i_no_addr) + return -1; + if (ipa->i_no_addr > ipb->i_no_addr) + return 1; + return 0; +} + +static void gfs2_ordered_write(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + LIST_HEAD(written); + + spin_lock(&sdp->sd_ordered_lock); + list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); + while (!list_empty(&sdp->sd_log_le_ordered)) { + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_move(&ip->i_ordered, &written); + if (ip->i_inode.i_mapping->nrpages == 0) + continue; + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawrite(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); + } + list_splice(&written, &sdp->sd_log_le_ordered); + spin_unlock(&sdp->sd_ordered_lock); +} + +static void gfs2_ordered_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + spin_lock(&sdp->sd_ordered_lock); + while (!list_empty(&sdp->sd_log_le_ordered)) { + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_del(&ip->i_ordered); + WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); + if (ip->i_inode.i_mapping->nrpages == 0) + continue; + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawait(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); + } + spin_unlock(&sdp->sd_ordered_lock); +} + +void gfs2_ordered_del_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + spin_lock(&sdp->sd_ordered_lock); + if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) + list_del(&ip->i_ordered); + spin_unlock(&sdp->sd_ordered_lock); +} + +void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + struct buffer_head *bh = bd->bd_bh; + struct gfs2_glock *gl = bd->bd_gl; + + bh->b_private = NULL; + bd->bd_blkno = bh->b_blocknr; + gfs2_remove_from_ail(bd); /* drops ref on bh */ + bd->bd_bh = NULL; + bd->bd_ops = &gfs2_revoke_lops; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&bd->bd_list, &sdp->sd_log_le_revoke); +} + +void gfs2_write_revokes(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd, *tmp; + int have_revokes = 0; + int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + + gfs2_ail1_empty(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) { + if (list_empty(&bd->bd_list)) { + have_revokes = 1; + goto done; + } + } + } +done: + spin_unlock(&sdp->sd_ail_lock); + if (have_revokes == 0) + return; + while (sdp->sd_log_num_revoke > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + max_revokes -= sdp->sd_log_num_revoke; + if (!sdp->sd_log_num_revoke) { + atomic_dec(&sdp->sd_log_blks_free); + /* If no blocks have been reserved, we need to also + * reserve a block for the header */ + if (!sdp->sd_log_blks_reserved) + atomic_dec(&sdp->sd_log_blks_free); + } + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) { + if (max_revokes == 0) + goto out_of_blocks; + if (!list_empty(&bd->bd_list)) + continue; + gfs2_add_revoke(sdp, bd); + max_revokes--; + } + } +out_of_blocks: + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + + if (!sdp->sd_log_num_revoke) { + atomic_inc(&sdp->sd_log_blks_free); + if (!sdp->sd_log_blks_reserved) + atomic_inc(&sdp->sd_log_blks_free); + } +} + +/** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, u32 flags) +{ + struct gfs2_log_header *lh; + unsigned int tail; + u32 hash; + int rw = WRITE_FLUSH_FUA | REQ_META; + struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); + lh = page_address(page); + clear_page(lh); + + gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); + + tail = current_tail(sdp); + + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); + lh->lh_flags = cpu_to_be32(flags); + lh->lh_tail = cpu_to_be32(tail); + lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); + hash = gfs2_disk_hash(page_address(page), sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { + gfs2_ordered_wait(sdp); + log_flush_wait(sdp); + rw = WRITE_SYNC | REQ_META | REQ_PRIO; + } + + sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); + gfs2_log_write_page(sdp, page); + gfs2_log_flush_bio(sdp, rw); + log_flush_wait(sdp); + + if (sdp->sd_log_tail != tail) + log_pull_tail(sdp, tail); +} + +/** + * gfs2_log_flush - flush incore transaction(s) + * @sdp: the filesystem + * @gl: The glock structure to flush. If NULL, flush the whole incore log + * + */ + +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + enum gfs2_flush_type type) +{ + struct gfs2_trans *tr; + enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); + + down_write(&sdp->sd_log_flush_lock); + + /* Log might have been flushed while we waited for the flush lock */ + if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) { + up_write(&sdp->sd_log_flush_lock); + return; + } + trace_gfs2_log_flush(sdp, 1); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + tr = sdp->sd_log_tr; + if (tr) { + sdp->sd_log_tr = NULL; + INIT_LIST_HEAD(&tr->tr_ail1_list); + INIT_LIST_HEAD(&tr->tr_ail2_list); + tr->tr_first = sdp->sd_log_flush_head; + if (unlikely (state == SFS_FROZEN)) + gfs2_assert_withdraw(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new); + } + + if (unlikely(state == SFS_FROZEN)) + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + gfs2_assert_withdraw(sdp, + sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); + + gfs2_ordered_write(sdp); + lops_before_commit(sdp, tr); + gfs2_log_flush_bio(sdp, WRITE); + + if (sdp->sd_log_head != sdp->sd_log_flush_head) { + log_flush_wait(sdp); + log_write_header(sdp, 0); + } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ + atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ + trace_gfs2_log_blocks(sdp, -1); + log_write_header(sdp, 0); + } + lops_after_commit(sdp, tr); + + gfs2_log_lock(sdp); + sdp->sd_log_head = sdp->sd_log_flush_head; + sdp->sd_log_blks_reserved = 0; + sdp->sd_log_commited_revoke = 0; + + spin_lock(&sdp->sd_ail_lock); + if (tr && !list_empty(&tr->tr_ail1_list)) { + list_add(&tr->tr_list, &sdp->sd_ail1_list); + tr = NULL; + } + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + + if (type != NORMAL_FLUSH) { + if (!sdp->sd_log_idle) { + for (;;) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp)) + break; + } + atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ + trace_gfs2_log_blocks(sdp, -1); + sdp->sd_log_flush_wrapped = 0; + log_write_header(sdp, 0); + sdp->sd_log_head = sdp->sd_log_flush_head; + } + if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH) + gfs2_log_shutdown(sdp); + if (type == FREEZE_FLUSH) + atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); + } + + trace_gfs2_log_flush(sdp, 0); + up_write(&sdp->sd_log_flush_lock); + + kfree(tr); +} + +/** + * gfs2_merge_trans - Merge a new transaction into a cached transaction + * @old: Original transaction to be expanded + * @new: New transaction to be merged + */ + +static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) +{ + WARN_ON_ONCE(old->tr_attached != 1); + + old->tr_num_buf_new += new->tr_num_buf_new; + old->tr_num_databuf_new += new->tr_num_databuf_new; + old->tr_num_buf_rm += new->tr_num_buf_rm; + old->tr_num_databuf_rm += new->tr_num_databuf_rm; + old->tr_num_revoke += new->tr_num_revoke; + old->tr_num_revoke_rm += new->tr_num_revoke_rm; + + list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); + list_splice_tail_init(&new->tr_buf, &old->tr_buf); +} + +static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int reserved; + unsigned int unused; + unsigned int maxres; + + gfs2_log_lock(sdp); + + if (sdp->sd_log_tr) { + gfs2_merge_trans(sdp->sd_log_tr, tr); + } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { + gfs2_assert_withdraw(sdp, tr->tr_alloced); + sdp->sd_log_tr = tr; + tr->tr_attached = 1; + } + + sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; + reserved = calc_reserved(sdp); + maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; + gfs2_assert_withdraw(sdp, maxres >= reserved); + unused = maxres - reserved; + atomic_add(unused, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, unused); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + sdp->sd_log_blks_reserved = reserved; + + gfs2_log_unlock(sdp); +} + +/** + * gfs2_log_commit - Commit a transaction to the log + * @sdp: the filesystem + * @tr: the transaction + * + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1 + * or the total number of used blocks (pinned blocks plus AIL blocks) + * is greater than thresh2. + * + * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of + * journal size. + * + * Returns: errno + */ + +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + log_refund(sdp, tr); + + if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || + ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > + atomic_read(&sdp->sd_log_thresh2))) + wake_up(&sdp->sd_logd_waitq); +} + +/** + * gfs2_log_shutdown - write a shutdown header into a journal + * @sdp: the filesystem + * + */ + +void gfs2_log_shutdown(struct gfs2_sbd *sdp) +{ + gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); + + gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); + gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); + + sdp->sd_log_head = sdp->sd_log_flush_head; + sdp->sd_log_tail = sdp->sd_log_head; +} + +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + return used_blocks >= atomic_read(&sdp->sd_log_thresh2); +} + +/** + * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks + * @sdp: Pointer to GFS2 superblock + * + * Also, periodically check to make sure that we're using the most recent + * journal index. + */ + +int gfs2_logd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t = 1; + DEFINE_WAIT(wait); + + while (!kthread_should_stop()) { + + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { + gfs2_ail1_empty(sdp); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + } + + if (gfs2_ail_flush_reqd(sdp)) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + gfs2_ail1_empty(sdp); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + } + + if (!gfs2_ail_flush_reqd(sdp)) + wake_up(&sdp->sd_log_waitq); + + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; + + try_to_freeze(); + + do { + prepare_to_wait(&sdp->sd_logd_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()) + t = schedule_timeout(t); + } while(t && !gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()); + finish_wait(&sdp->sd_logd_waitq, &wait); + } + + return 0; +} + diff --git a/kernel/fs/gfs2/log.h b/kernel/fs/gfs2/log.h new file mode 100644 index 000000000..9499a6049 --- /dev/null +++ b/kernel/fs/gfs2/log.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LOG_DOT_H__ +#define __LOG_DOT_H__ + +#include +#include +#include +#include "incore.h" + +/** + * gfs2_log_lock - acquire the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +__acquires(&sdp->sd_log_lock) +{ + spin_lock(&sdp->sd_log_lock); +} + +/** + * gfs2_log_unlock - release the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +__releases(&sdp->sd_log_lock) +{ + spin_unlock(&sdp->sd_log_lock); +} + +static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, + unsigned int value) +{ + if (++value == sdp->sd_jdesc->jd_blocks) { + value = 0; + } + sdp->sd_log_head = sdp->sd_log_tail = value; +} + +static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if (!test_bit(GIF_ORDERED, &ip->i_flags)) { + spin_lock(&sdp->sd_ordered_lock); + if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) + list_add(&ip->i_ordered, &sdp->sd_log_le_ordered); + spin_unlock(&sdp->sd_ordered_lock); + } +} +extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize); + +extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); +extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +enum gfs2_flush_type { + NORMAL_FLUSH = 0, + SYNC_FLUSH, + SHUTDOWN_FLUSH, + FREEZE_FLUSH +}; +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + enum gfs2_flush_type type); +extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); + +extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); +extern int gfs2_logd(void *data); +extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_write_revokes(struct gfs2_sbd *sdp); + +#endif /* __LOG_DOT_H__ */ diff --git a/kernel/fs/gfs2/lops.c b/kernel/fs/gfs2/lops.c new file mode 100644 index 000000000..2c1ae861d --- /dev/null +++ b/kernel/fs/gfs2/lops.c @@ -0,0 +1,886 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "inode.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +/** + * gfs2_pin - Pin a buffer in memory + * @sdp: The superblock + * @bh: The buffer to be pinned + * + * The log lock must be held when calling this function + */ +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + BUG_ON(!current->journal_info); + + clear_buffer_dirty(bh); + if (test_set_buffer_pinned(bh)) + gfs2_assert_withdraw(sdp, 0); + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + bd = bh->b_private; + /* If this buffer is in the AIL and it has already been written + * to in-place disk block, remove it from the AIL. + */ + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_tr) + list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list); + spin_unlock(&sdp->sd_ail_lock); + get_bh(bh); + atomic_inc(&sdp->sd_log_pinned); + trace_gfs2_pin(bd, 1); +} + +static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) +{ + return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP; +} + +static void maybe_release_space(struct gfs2_bufdata *bd) +{ + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_rgrpd *rgd = gl->gl_object; + unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; + struct gfs2_bitmap *bi = rgd->rd_bits + index; + + if (bi->bi_clone == NULL) + return; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); + memcpy(bi->bi_clone + bi->bi_offset, + bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); + clear_bit(GBF_FULL, &bi->bi_flags); + rgd->rd_free_clone = rgd->rd_free; + rgd->rd_extfail_pt = rgd->rd_free; +} + +/** + * gfs2_unpin - Unpin a buffer + * @sdp: the filesystem the buffer belongs to + * @bh: The buffer to unpin + * @ai: + * @flags: The inode dirty flags + * + */ + +static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct gfs2_trans *tr) +{ + struct gfs2_bufdata *bd = bh->b_private; + + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!buffer_pinned(bh)); + + lock_buffer(bh); + mark_buffer_dirty(bh); + clear_buffer_pinned(bh); + + if (buffer_is_rgrp(bd)) + maybe_release_space(bd); + + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_tr) { + list_del(&bd->bd_ail_st_list); + brelse(bh); + } else { + struct gfs2_glock *gl = bd->bd_gl; + list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); + atomic_inc(&gl->gl_ail_count); + } + bd->bd_tr = tr; + list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + + clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + trace_gfs2_pin(bd, 0); + unlock_buffer(bh); + atomic_dec(&sdp->sd_log_pinned); +} + +static void gfs2_log_incr_head(struct gfs2_sbd *sdp) +{ + BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && + (sdp->sd_log_flush_head != sdp->sd_log_head)); + + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + sdp->sd_log_flush_head = 0; + sdp->sd_log_flush_wrapped = 1; + } +} + +static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) +{ + unsigned int lbn = sdp->sd_log_flush_head; + struct gfs2_journal_extent *je; + u64 block; + + list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) { + if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) { + block = je->dblock + lbn - je->lblock; + gfs2_log_incr_head(sdp); + return block; + } + } + + return -1; +} + +/** + * gfs2_end_log_write_bh - end log write of pagecache data with buffers + * @sdp: The superblock + * @bvec: The bio_vec + * @error: The i/o status + * + * This finds the relavent buffers and unlocks then and sets the + * error flag according to the status of the i/o request. This is + * used when the log is writing data which has an in-place version + * that is pinned in the pagecache. + */ + +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, + int error) +{ + struct buffer_head *bh, *next; + struct page *page = bvec->bv_page; + unsigned size; + + bh = page_buffers(page); + size = bvec->bv_len; + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + if (error) + set_buffer_write_io_error(bh); + unlock_buffer(bh); + next = bh->b_this_page; + size -= bh->b_size; + brelse(bh); + bh = next; + } while(bh && size); +} + +/** + * gfs2_end_log_write - end of i/o to the log + * @bio: The bio + * @error: Status of i/o request + * + * Each bio_vec contains either data from the pagecache or data + * relating to the log itself. Here we iterate over the bio_vec + * array, processing both kinds of data. + * + */ + +static void gfs2_end_log_write(struct bio *bio, int error) +{ + struct gfs2_sbd *sdp = bio->bi_private; + struct bio_vec *bvec; + struct page *page; + int i; + + if (error) { + sdp->sd_log_error = error; + fs_err(sdp, "Error %d writing to log\n", error); + } + + bio_for_each_segment_all(bvec, bio, i) { + page = bvec->bv_page; + if (page_has_buffers(page)) + gfs2_end_log_write_bh(sdp, bvec, error); + else + mempool_free(page, gfs2_page_pool); + } + + bio_put(bio); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_flush_bio - Submit any pending log bio + * @sdp: The superblock + * @rw: The rw flags + * + * Submit any pending part-built or full bio to the block device. If + * there is no pending bio, then this is a no-op. + */ + +void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) +{ + if (sdp->sd_log_bio) { + atomic_inc(&sdp->sd_log_in_flight); + submit_bio(rw, sdp->sd_log_bio); + sdp->sd_log_bio = NULL; + } +} + +/** + * gfs2_log_alloc_bio - Allocate a new bio for log writing + * @sdp: The superblock + * @blkno: The next device block number we want to write to + * + * This should never be called when there is a cached bio in the + * super block. When it returns, there will be a cached bio in the + * super block which will have as many bio_vecs as the device is + * happy to handle. + * + * Returns: Newly allocated bio + */ + +static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) +{ + struct super_block *sb = sdp->sd_vfs; + unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev); + struct bio *bio; + + BUG_ON(sdp->sd_log_bio); + + while (1) { + bio = bio_alloc(GFP_NOIO, nrvecs); + if (likely(bio)) + break; + nrvecs = max(nrvecs/2, 1U); + } + + bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio->bi_end_io = gfs2_end_log_write; + bio->bi_private = sdp; + + sdp->sd_log_bio = bio; + + return bio; +} + +/** + * gfs2_log_get_bio - Get cached log bio, or allocate a new one + * @sdp: The superblock + * @blkno: The device block number we want to write to + * + * If there is a cached bio, then if the next block number is sequential + * with the previous one, return it, otherwise flush the bio to the + * device. If there is not a cached bio, or we just flushed it, then + * allocate a new one. + * + * Returns: The bio to use for log writes + */ + +static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) +{ + struct bio *bio = sdp->sd_log_bio; + u64 nblk; + + if (bio) { + nblk = bio_end_sector(bio); + nblk >>= sdp->sd_fsb2bb_shift; + if (blkno == nblk) + return bio; + gfs2_log_flush_bio(sdp, WRITE); + } + + return gfs2_log_alloc_bio(sdp, blkno); +} + + +/** + * gfs2_log_write - write to log + * @sdp: the filesystem + * @page: the page to write + * @size: the size of the data to write + * @offset: the offset within the page + * + * Try and add the page segment to the current bio. If that fails, + * submit the current bio to the device and create a new one, and + * then add the page segment to that. + */ + +static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, + unsigned size, unsigned offset) +{ + u64 blkno = gfs2_log_bmap(sdp); + struct bio *bio; + int ret; + + bio = gfs2_log_get_bio(sdp, blkno); + ret = bio_add_page(bio, page, size, offset); + if (ret == 0) { + gfs2_log_flush_bio(sdp, WRITE); + bio = gfs2_log_alloc_bio(sdp, blkno); + ret = bio_add_page(bio, page, size, offset); + WARN_ON(ret == 0); + } +} + +/** + * gfs2_log_write_bh - write a buffer's content to the log + * @sdp: The super block + * @bh: The buffer pointing to the in-place location + * + * This writes the content of the buffer to the next available location + * in the log. The buffer will be unlocked once the i/o to the log has + * completed. + */ + +static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh)); +} + +/** + * gfs2_log_write_page - write one block stored in a page, into the log + * @sdp: The superblock + * @page: The struct page + * + * This writes the first block-sized part of the page into the log. Note + * that the page must have been allocated from the gfs2_page_pool mempool + * and that after this has been called, ownership has been transferred and + * the page may be freed at any time. + */ + +void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) +{ + struct super_block *sb = sdp->sd_vfs; + gfs2_log_write(sdp, page, sb->s_blocksize, 0); +} + +static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, + u32 ld_length, u32 ld_data1) +{ + struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + struct gfs2_log_descriptor *ld = page_address(page); + clear_page(ld); + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(ld_type); + ld->ld_length = cpu_to_be32(ld_length); + ld->ld_data1 = cpu_to_be32(ld_data1); + ld->ld_data2 = 0; + return page; +} + +static void gfs2_check_magic(struct buffer_head *bh) +{ + void *kaddr; + __be32 *ptr; + + clear_buffer_escaped(bh); + kaddr = kmap_atomic(bh->b_page); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + set_buffer_escaped(bh); + kunmap_atomic(kaddr); +} + +static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_bufdata *bda, *bdb; + + bda = list_entry(a, struct gfs2_bufdata, bd_list); + bdb = list_entry(b, struct gfs2_bufdata, bd_list); + + if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + return -1; + if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + return 1; + return 0; +} + +static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, + unsigned int total, struct list_head *blist, + bool is_databuf) +{ + struct gfs2_log_descriptor *ld; + struct gfs2_bufdata *bd1 = NULL, *bd2; + struct page *page; + unsigned int num; + unsigned n; + __be64 *ptr; + + gfs2_log_lock(sdp); + list_sort(NULL, blist, blocknr_cmp); + bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); + while(total) { + num = total; + if (total > limit) + num = limit; + gfs2_log_unlock(sdp); + page = gfs2_get_log_desc(sdp, + is_databuf ? GFS2_LOG_DESC_JDATA : + GFS2_LOG_DESC_METADATA, num + 1, num); + ld = page_address(page); + gfs2_log_lock(sdp); + ptr = (__be64 *)(ld + 1); + + n = 0; + list_for_each_entry_continue(bd1, blist, bd_list) { + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + if (is_databuf) { + gfs2_check_magic(bd1->bd_bh); + *ptr++ = cpu_to_be64(buffer_escaped(bd1->bd_bh) ? 1 : 0); + } + if (++n >= num) + break; + } + + gfs2_log_unlock(sdp); + gfs2_log_write_page(sdp, page); + gfs2_log_lock(sdp); + + n = 0; + list_for_each_entry_continue(bd2, blist, bd_list) { + get_bh(bd2->bd_bh); + gfs2_log_unlock(sdp); + lock_buffer(bd2->bd_bh); + + if (buffer_escaped(bd2->bd_bh)) { + void *kaddr; + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + ptr = page_address(page); + kaddr = kmap_atomic(bd2->bd_bh->b_page); + memcpy(ptr, kaddr + bh_offset(bd2->bd_bh), + bd2->bd_bh->b_size); + kunmap_atomic(kaddr); + *(__be32 *)ptr = 0; + clear_buffer_escaped(bd2->bd_bh); + unlock_buffer(bd2->bd_bh); + brelse(bd2->bd_bh); + gfs2_log_write_page(sdp, page); + } else { + gfs2_log_write_bh(sdp, bd2->bd_bh); + } + gfs2_log_lock(sdp); + if (++n >= num) + break; + } + + BUG_ON(total < num); + total -= num; + } + gfs2_log_unlock(sdp); +} + +static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0); +} + +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head; + struct gfs2_bufdata *bd; + + if (tr == NULL) + return; + + head = &tr->tr_buf; + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gfs2_unpin(sdp, bd->bd_bh, tr); + } +} + +static void buf_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass) +{ + if (pass != 0) + return; + + jd->jd_found_blocks = 0; + jd->jd_replayed_blocks = 0; +} + +static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + + jd->jd_found_blocks++; + + if (gfs2_revoke_check(jd, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + if (gfs2_meta_check(sdp, bh_ip)) + error = -EIO; + else + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + + if (error) + break; + + jd->jd_replayed_blocks++; + } + + return error; +} + +/** + * gfs2_meta_sync - Sync all buffers associated with a glock + * @gl: The glock + * + */ + +static void gfs2_meta_sync(struct gfs2_glock *gl) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + int error; + + if (mapping == NULL) + mapping = &sdp->sd_aspace; + + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + + if (error) + gfs2_io_error(gl->gl_sbd); +} + +static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl); + return; + } + if (pass != 1) + return; + + gfs2_meta_sync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); +} + +static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct gfs2_meta_header *mh; + unsigned int offset; + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_bufdata *bd; + struct page *page; + unsigned int length; + + gfs2_write_revokes(sdp); + if (!sdp->sd_log_num_revoke) + return; + + length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); + page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); + offset = sizeof(struct gfs2_log_descriptor); + + list_for_each_entry(bd, head, bd_list) { + sdp->sd_log_num_revoke--; + + if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { + + gfs2_log_write_page(sdp, page); + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + mh = page_address(page); + clear_page(mh); + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); + mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB); + mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB); + offset = sizeof(struct gfs2_meta_header); + } + + *(__be64 *)(page_address(page) + offset) = cpu_to_be64(bd->bd_blkno); + offset += sizeof(u64); + } + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + + gfs2_log_write_page(sdp, page); +} + +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_bufdata *bd; + struct gfs2_glock *gl; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gl = bd->bd_gl; + atomic_dec(&gl->gl_revokes); + clear_bit(GLF_LFLUSH, &gl->gl_flags); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } +} + +static void revoke_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass) +{ + if (pass != 0) + return; + + jd->jd_found_revokes = 0; + jd->jd_replay_tail = head->lh_tail; +} + +static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int blks = be32_to_cpu(ld->ld_length); + unsigned int revokes = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh; + unsigned int offset; + u64 blkno; + int first = 1; + int error; + + if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE) + return 0; + + offset = sizeof(struct gfs2_log_descriptor); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + + if (!first) + gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB); + + while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { + blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); + + error = gfs2_revoke_add(jd, blkno, start); + if (error < 0) { + brelse(bh); + return error; + } + else if (error) + jd->jd_found_revokes++; + + if (!--revokes) + break; + offset += sizeof(u64); + } + + brelse(bh); + offset = sizeof(struct gfs2_meta_header); + first = 0; + } + + return 0; +} + +static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_revoke_clean(jd); + return; + } + if (pass != 1) + return; + + fs_info(sdp, "jid=%u: Found %u revoke tags\n", + jd->jd_jid, jd->jd_found_revokes); + + gfs2_revoke_clean(jd); +} + +/** + * databuf_lo_before_commit - Scan the data buffers, writing as we go + * + */ + +static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int limit = databuf_limit(sdp); + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1); +} + +static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + u64 esc; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + esc = be64_to_cpu(*ptr++); + + jd->jd_found_blocks++; + + if (gfs2_revoke_check(jd, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + /* Unescape */ + if (esc) { + __be32 *eptr = (__be32 *)bh_ip->b_data; + *eptr = cpu_to_be32(GFS2_MAGIC); + } + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + + jd->jd_replayed_blocks++; + } + + return error; +} + +/* FIXME: sort out accounting for log blocks etc. */ + +static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl); + return; + } + if (pass != 1) + return; + + /* data sync? */ + gfs2_meta_sync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); +} + +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head; + struct gfs2_bufdata *bd; + + if (tr == NULL) + return; + + head = &tr->tr_databuf; + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gfs2_unpin(sdp, bd->bd_bh, tr); + } +} + + +const struct gfs2_log_operations gfs2_buf_lops = { + .lo_before_commit = buf_lo_before_commit, + .lo_after_commit = buf_lo_after_commit, + .lo_before_scan = buf_lo_before_scan, + .lo_scan_elements = buf_lo_scan_elements, + .lo_after_scan = buf_lo_after_scan, + .lo_name = "buf", +}; + +const struct gfs2_log_operations gfs2_revoke_lops = { + .lo_before_commit = revoke_lo_before_commit, + .lo_after_commit = revoke_lo_after_commit, + .lo_before_scan = revoke_lo_before_scan, + .lo_scan_elements = revoke_lo_scan_elements, + .lo_after_scan = revoke_lo_after_scan, + .lo_name = "revoke", +}; + +const struct gfs2_log_operations gfs2_databuf_lops = { + .lo_before_commit = databuf_lo_before_commit, + .lo_after_commit = databuf_lo_after_commit, + .lo_scan_elements = databuf_lo_scan_elements, + .lo_after_scan = databuf_lo_after_scan, + .lo_name = "databuf", +}; + +const struct gfs2_log_operations *gfs2_log_ops[] = { + &gfs2_databuf_lops, + &gfs2_buf_lops, + &gfs2_revoke_lops, + NULL, +}; + diff --git a/kernel/fs/gfs2/lops.h b/kernel/fs/gfs2/lops.h new file mode 100644 index 000000000..a65a7ba32 --- /dev/null +++ b/kernel/fs/gfs2/lops.h @@ -0,0 +1,104 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LOPS_DOT_H__ +#define __LOPS_DOT_H__ + +#include +#include "incore.h" + +#define BUF_OFFSET \ + ((sizeof(struct gfs2_log_descriptor) + sizeof(__be64) - 1) & \ + ~(sizeof(__be64) - 1)) +#define DATABUF_OFFSET \ + ((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \ + ~(2 * sizeof(__be64) - 1)) + +extern const struct gfs2_log_operations gfs2_glock_lops; +extern const struct gfs2_log_operations gfs2_buf_lops; +extern const struct gfs2_log_operations gfs2_revoke_lops; +extern const struct gfs2_log_operations gfs2_databuf_lops; + +extern const struct gfs2_log_operations *gfs2_log_ops[]; +extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); +extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); +extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); + +static inline unsigned int buf_limit(struct gfs2_sbd *sdp) +{ + unsigned int limit; + + limit = (sdp->sd_sb.sb_bsize - BUF_OFFSET) / sizeof(__be64); + return limit; +} + +static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) +{ + unsigned int limit; + + limit = (sdp->sd_sb.sb_bsize - DATABUF_OFFSET) / (2 * sizeof(__be64)); + return limit; +} + +static inline void lops_before_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_commit) + gfs2_log_ops[x]->lo_before_commit(sdp, tr); +} + +static inline void lops_after_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_after_commit) + gfs2_log_ops[x]->lo_after_commit(sdp, tr); +} + +static inline void lops_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_before_scan(jd, head, pass); +} + +static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, + unsigned int pass) +{ + int x, error; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_scan_elements) { + error = gfs2_log_ops[x]->lo_scan_elements(jd, start, + ld, ptr, pass); + if (error) + return error; + } + + return 0; +} + +static inline void lops_after_scan(struct gfs2_jdesc *jd, int error, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_after_scan(jd, error, pass); +} + +#endif /* __LOPS_DOT_H__ */ + diff --git a/kernel/fs/gfs2/main.c b/kernel/fs/gfs2/main.c new file mode 100644 index 000000000..241a399bf --- /dev/null +++ b/kernel/fs/gfs2/main.c @@ -0,0 +1,258 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "glock.h" +#include "quota.h" +#include "recovery.h" +#include "dir.h" +#include "glops.h" + +struct workqueue_struct *gfs2_control_wq; + +static void gfs2_init_inode_once(void *foo) +{ + struct gfs2_inode *ip = foo; + + inode_init_once(&ip->i_inode); + init_rwsem(&ip->i_rw_mutex); + INIT_LIST_HEAD(&ip->i_trunc_list); + ip->i_res = NULL; + ip->i_hash_cache = NULL; +} + +static void gfs2_init_glock_once(void *foo) +{ + struct gfs2_glock *gl = foo; + + INIT_HLIST_BL_NODE(&gl->gl_list); + spin_lock_init(&gl->gl_spin); + INIT_LIST_HEAD(&gl->gl_holders); + INIT_LIST_HEAD(&gl->gl_lru); + INIT_LIST_HEAD(&gl->gl_ail_list); + atomic_set(&gl->gl_ail_count, 0); + atomic_set(&gl->gl_revokes, 0); +} + +static void gfs2_init_gl_aspace_once(void *foo) +{ + struct gfs2_glock *gl = foo; + struct address_space *mapping = (struct address_space *)(gl + 1); + + gfs2_init_glock_once(gl); + address_space_init_once(mapping); +} + +/** + * init_gfs2_fs - Register GFS2 as a filesystem + * + * Returns: 0 on success, error code on failure + */ + +static int __init init_gfs2_fs(void) +{ + int error; + + gfs2_str2qstr(&gfs2_qdot, "."); + gfs2_str2qstr(&gfs2_qdotdot, ".."); + gfs2_quota_hash_init(); + + error = gfs2_sys_init(); + if (error) + return error; + + error = list_lru_init(&gfs2_qd_lru); + if (error) + goto fail_lru; + + error = gfs2_glock_init(); + if (error) + goto fail; + + error = -ENOMEM; + gfs2_glock_cachep = kmem_cache_create("gfs2_glock", + sizeof(struct gfs2_glock), + 0, 0, + gfs2_init_glock_once); + if (!gfs2_glock_cachep) + goto fail; + + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", + sizeof(struct gfs2_glock) + + sizeof(struct address_space), + 0, 0, gfs2_init_gl_aspace_once); + + if (!gfs2_glock_aspace_cachep) + goto fail; + + gfs2_inode_cachep = kmem_cache_create("gfs2_inode", + sizeof(struct gfs2_inode), + 0, SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD, + gfs2_init_inode_once); + if (!gfs2_inode_cachep) + goto fail; + + gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata", + sizeof(struct gfs2_bufdata), + 0, 0, NULL); + if (!gfs2_bufdata_cachep) + goto fail; + + gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd", + sizeof(struct gfs2_rgrpd), + 0, 0, NULL); + if (!gfs2_rgrpd_cachep) + goto fail; + + gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", + sizeof(struct gfs2_quota_data), + 0, 0, NULL); + if (!gfs2_quotad_cachep) + goto fail; + + gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk", + sizeof(struct gfs2_blkreserv), + 0, 0, NULL); + if (!gfs2_rsrv_cachep) + goto fail; + + register_shrinker(&gfs2_qd_shrinker); + + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_unregister; + + error = -ENOMEM; + gfs_recovery_wq = alloc_workqueue("gfs_recovery", + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + if (!gfs_recovery_wq) + goto fail_wq; + + gfs2_control_wq = alloc_workqueue("gfs2_control", + WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!gfs2_control_wq) + goto fail_recovery; + + gfs2_freeze_wq = alloc_workqueue("freeze_workqueue", 0, 0); + + if (!gfs2_freeze_wq) + goto fail_control; + + gfs2_page_pool = mempool_create_page_pool(64, 0); + if (!gfs2_page_pool) + goto fail_freeze; + + gfs2_register_debugfs(); + + pr_info("GFS2 installed\n"); + + return 0; + +fail_freeze: + destroy_workqueue(gfs2_freeze_wq); +fail_control: + destroy_workqueue(gfs2_control_wq); +fail_recovery: + destroy_workqueue(gfs_recovery_wq); +fail_wq: + unregister_filesystem(&gfs2meta_fs_type); +fail_unregister: + unregister_filesystem(&gfs2_fs_type); +fail: + list_lru_destroy(&gfs2_qd_lru); +fail_lru: + unregister_shrinker(&gfs2_qd_shrinker); + gfs2_glock_exit(); + + if (gfs2_rsrv_cachep) + kmem_cache_destroy(gfs2_rsrv_cachep); + + if (gfs2_quotad_cachep) + kmem_cache_destroy(gfs2_quotad_cachep); + + if (gfs2_rgrpd_cachep) + kmem_cache_destroy(gfs2_rgrpd_cachep); + + if (gfs2_bufdata_cachep) + kmem_cache_destroy(gfs2_bufdata_cachep); + + if (gfs2_inode_cachep) + kmem_cache_destroy(gfs2_inode_cachep); + + if (gfs2_glock_aspace_cachep) + kmem_cache_destroy(gfs2_glock_aspace_cachep); + + if (gfs2_glock_cachep) + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); + return error; +} + +/** + * exit_gfs2_fs - Unregister the file system + * + */ + +static void __exit exit_gfs2_fs(void) +{ + unregister_shrinker(&gfs2_qd_shrinker); + gfs2_glock_exit(); + gfs2_unregister_debugfs(); + unregister_filesystem(&gfs2_fs_type); + unregister_filesystem(&gfs2meta_fs_type); + destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_control_wq); + destroy_workqueue(gfs2_freeze_wq); + list_lru_destroy(&gfs2_qd_lru); + + rcu_barrier(); + + mempool_destroy(gfs2_page_pool); + kmem_cache_destroy(gfs2_rsrv_cachep); + kmem_cache_destroy(gfs2_quotad_cachep); + kmem_cache_destroy(gfs2_rgrpd_cachep); + kmem_cache_destroy(gfs2_bufdata_cachep); + kmem_cache_destroy(gfs2_inode_cachep); + kmem_cache_destroy(gfs2_glock_aspace_cachep); + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); +} + +MODULE_DESCRIPTION("Global File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +module_init(init_gfs2_fs); +module_exit(exit_gfs2_fs); + diff --git a/kernel/fs/gfs2/meta_io.c b/kernel/fs/gfs2/meta_io.c new file mode 100644 index 000000000..b984a6e19 --- /dev/null +++ b/kernel/fs/gfs2/meta_io.c @@ -0,0 +1,403 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) +{ + struct buffer_head *bh, *head; + int nr_underway = 0; + int write_op = REQ_META | REQ_PRIO | + (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + + BUG_ON(!PageLocked(page)); + BUG_ON(!page_has_buffers(page)); + + head = page_buffers(page); + bh = head; + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from flusher thread and kswapd + * activity, but those code paths have their own higher-level + * throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + if (nr_underway == 0) + end_page_writeback(page); + + return 0; +} + +const struct address_space_operations gfs2_meta_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; + +const struct address_space_operations gfs2_rgrp_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; + +/** + * gfs2_getbuf - Get a buffer with a given address space + * @gl: the glock + * @blkno: the block number (filesystem scope) + * @create: 1 if the buffer should be created + * + * Returns: the buffer + */ + +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct page *page; + struct buffer_head *bh; + unsigned int shift; + unsigned long index; + unsigned int bufnum; + + if (mapping == NULL) + mapping = &sdp->sd_aspace; + + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; + index = blkno >> shift; /* convert block to page */ + bufnum = blkno - (index << shift); /* block buf index within page */ + + if (create) { + for (;;) { + page = grab_cache_page(mapping, index); + if (page) + break; + yield(); + } + } else { + page = find_get_page_flags(mapping, index, + FGP_LOCK|FGP_ACCESSED); + if (!page) + return NULL; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + + if (!buffer_mapped(bh)) + map_bh(bh, sdp->sd_vfs, blkno); + + unlock_page(page); + page_cache_release(page); + + return bh; +} + +static void meta_prep_new(struct buffer_head *bh) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); +} + +/** + * gfs2_meta_new - Get a block + * @gl: The glock associated with this block + * @blkno: The block number + * + * Returns: The buffer + */ + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) +{ + struct buffer_head *bh; + bh = gfs2_getbuf(gl, blkno, CREATE); + meta_prep_new(bh); + return bh; +} + +/** + * gfs2_meta_read - Read a block from disk + * @gl: The glock covering the block + * @blkno: The block number + * @flags: flags + * @bhp: the place where the buffer is returned (NULL on failure) + * + * Returns: errno + */ + +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *bh; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + *bhp = NULL; + return -EIO; + } + + *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); + if (!(flags & DIO_WAIT)) + return 0; + + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + *bhp = NULL; + return -EIO; + } + + return 0; +} + +/** + * gfs2_meta_wait - Reread a block from disk + * @sdp: the filesystem + * @bh: The block to wait for + * + * Returns: errno + */ + +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + return -EIO; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return 0; +} + +void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) +{ + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct gfs2_bufdata *bd = bh->b_private; + int was_pinned = 0; + + if (test_clear_buffer_pinned(bh)) { + trace_gfs2_pin(bd, 0); + atomic_dec(&sdp->sd_log_pinned); + list_del_init(&bd->bd_list); + if (meta) + tr->tr_num_buf_rm++; + else + tr->tr_num_databuf_rm++; + tr->tr_touched = 1; + was_pinned = 1; + brelse(bh); + } + if (bd) { + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_tr) { + gfs2_trans_add_revoke(sdp, bd); + } else if (was_pinned) { + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); + } + spin_unlock(&sdp->sd_ail_lock); + } + clear_buffer_dirty(bh); + clear_buffer_uptodate(bh); +} + +/** + * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore + * @ip: the inode who owns the buffers + * @bstart: the first buffer in the run + * @blen: the number of buffers in the run + * + */ + +void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + + while (blen) { + bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); + if (bh) { + lock_buffer(bh); + gfs2_log_lock(sdp); + gfs2_remove_from_journal(bh, current->journal_info, 1); + gfs2_log_unlock(sdp); + unlock_buffer(bh); + brelse(bh); + } + + bstart++; + blen--; + } +} + +/** + * gfs2_meta_indirect_buffer - Get a metadata buffer + * @ip: The GFS2 inode + * @height: The level of this buf in the metadata (indir addr) tree (if any) + * @num: The block number (device relative) of the buffer + * @bhp: the buffer is returned here + * + * Returns: errno + */ + +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + int ret = 0; + u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; + + ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { + brelse(bh); + ret = -EIO; + } + *bhp = bh; + return ret; +} + +/** + * gfs2_meta_ra - start readahead on an extent of a file + * @gl: the glock the blocks belong to + * @dblock: the starting disk block + * @extlen: the number of blocks in the extent + * + * returns: the first buffer in the extent + */ + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *first_bh, *bh; + u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >> + sdp->sd_sb.sb_bsize_shift; + + BUG_ON(!extlen); + + if (max_ra < 1) + max_ra = 1; + if (extlen > max_ra) + extlen = max_ra; + + first_bh = gfs2_getbuf(gl, dblock, CREATE); + + if (buffer_uptodate(first_bh)) + goto out; + if (!buffer_locked(first_bh)) + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); + + dblock++; + extlen--; + + while (extlen) { + bh = gfs2_getbuf(gl, dblock, CREATE); + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) + ll_rw_block(READA | REQ_META, 1, &bh); + brelse(bh); + dblock++; + extlen--; + if (!buffer_locked(first_bh) && buffer_uptodate(first_bh)) + goto out; + } + + wait_on_buffer(first_bh); +out: + return first_bh; +} + diff --git a/kernel/fs/gfs2/meta_io.h b/kernel/fs/gfs2/meta_io.h new file mode 100644 index 000000000..ac5d8027d --- /dev/null +++ b/kernel/fs/gfs2/meta_io.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __DIO_DOT_H__ +#define __DIO_DOT_H__ + +#include +#include +#include "incore.h" + +static inline void gfs2_buffer_clear(struct buffer_head *bh) +{ + memset(bh->b_data, 0, bh->b_size); +} + +static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head) +{ + BUG_ON(head > bh->b_size); + memset(bh->b_data + head, 0, bh->b_size - head); +} + +static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, + int to_head, + struct buffer_head *from_bh, + int from_head) +{ + BUG_ON(from_head < to_head); + memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head, + from_bh->b_size - from_head); + memset(to_bh->b_data + to_bh->b_size + to_head - from_head, + 0, from_head - to_head); +} + +extern const struct address_space_operations gfs2_meta_aops; +extern const struct address_space_operations gfs2_rgrp_aops; + +static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + if (mapping->a_ops == &gfs2_meta_aops) + return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; + else if (mapping->a_ops == &gfs2_rgrp_aops) + return container_of(mapping, struct gfs2_sbd, sd_aspace); + else + return inode->i_sb->s_fs_info; +} + +extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + struct buffer_head **bhp); +extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, + int create); +extern void gfs2_remove_from_journal(struct buffer_head *bh, + struct gfs2_trans *tr, int meta); +extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + struct buffer_head **bhp); + +static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, + struct buffer_head **bhp) +{ + return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, bhp); +} + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); + +#define buffer_busy(bh) \ +((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) + +#endif /* __DIO_DOT_H__ */ + diff --git a/kernel/fs/gfs2/ops_fstype.c b/kernel/fs/gfs2/ops_fstype.c new file mode 100644 index 000000000..35b49f44c --- /dev/null +++ b/kernel/fs/gfs2/ops_fstype.c @@ -0,0 +1,1409 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "log.h" +#include "quota.h" +#include "dir.h" +#include "meta_io.h" +#include "trace_gfs2.h" + +#define DO 0 +#define UNDO 1 + +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +static void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_new_files_jdata = 0; + gt->gt_max_readahead = 1 << 18; + gt->gt_complain_secs = 10; +} + +static struct gfs2_sbd *init_sbd(struct super_block *sb) +{ + struct gfs2_sbd *sdp; + struct address_space *mapping; + + sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); + if (!sdp) + return NULL; + + sb->s_fs_info = sdp; + sdp->sd_vfs = sb; + sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats); + if (!sdp->sd_lkstats) { + kfree(sdp); + return NULL; + } + + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); + gfs2_tune_init(&sdp->sd_tune); + + init_waitqueue_head(&sdp->sd_glock_wait); + atomic_set(&sdp->sd_glock_disposal, 0); + init_completion(&sdp->sd_locking_init); + init_completion(&sdp->sd_wdack); + spin_lock_init(&sdp->sd_statfs_spin); + + spin_lock_init(&sdp->sd_rindex_spin); + sdp->sd_rindex_tree.rb_node = NULL; + + INIT_LIST_HEAD(&sdp->sd_jindex_list); + spin_lock_init(&sdp->sd_jindex_spin); + mutex_init(&sdp->sd_jindex_mutex); + init_completion(&sdp->sd_journal_ready); + + INIT_LIST_HEAD(&sdp->sd_quota_list); + mutex_init(&sdp->sd_quota_mutex); + mutex_init(&sdp->sd_quota_sync_mutex); + init_waitqueue_head(&sdp->sd_quota_wait); + INIT_LIST_HEAD(&sdp->sd_trunc_list); + spin_lock_init(&sdp->sd_trunc_lock); + spin_lock_init(&sdp->sd_bitmap_lock); + + mapping = &sdp->sd_aspace; + + address_space_init_once(mapping); + mapping->a_ops = &gfs2_rgrp_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->writeback_index = 0; + + spin_lock_init(&sdp->sd_log_lock); + atomic_set(&sdp->sd_log_pinned, 0); + INIT_LIST_HEAD(&sdp->sd_log_le_revoke); + INIT_LIST_HEAD(&sdp->sd_log_le_ordered); + spin_lock_init(&sdp->sd_ordered_lock); + + init_waitqueue_head(&sdp->sd_log_waitq); + init_waitqueue_head(&sdp->sd_logd_waitq); + spin_lock_init(&sdp->sd_ail_lock); + INIT_LIST_HEAD(&sdp->sd_ail1_list); + INIT_LIST_HEAD(&sdp->sd_ail2_list); + + init_rwsem(&sdp->sd_log_flush_lock); + atomic_set(&sdp->sd_log_in_flight, 0); + atomic_set(&sdp->sd_reserving_log, 0); + init_waitqueue_head(&sdp->sd_reserving_log_wait); + init_waitqueue_head(&sdp->sd_log_flush_wait); + atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); + mutex_init(&sdp->sd_freeze_mutex); + + return sdp; +} + + +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @sb: The superblock + * @silent: Don't print a message if the check fails + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. + */ + +static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) +{ + struct gfs2_sb_host *sb = &sdp->sd_sb; + + if (sb->sb_magic != GFS2_MAGIC || + sb->sb_type != GFS2_METATYPE_SB) { + if (!silent) + pr_warn("not a GFS2 filesystem\n"); + return -EINVAL; + } + + /* If format numbers match exactly, we're done. */ + + if (sb->sb_fs_format == GFS2_FORMAT_FS && + sb->sb_multihost_format == GFS2_FORMAT_MULTI) + return 0; + + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + + return -EINVAL; +} + +static void end_bio_io_page(struct bio *bio, int error) +{ + struct page *page = bio->bi_private; + + if (!error) + SetPageUptodate(page); + else + pr_warn("error %d reading superblock\n", error); + unlock_page(page); +} + +static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) +{ + struct gfs2_sb_host *sb = &sdp->sd_sb; + struct super_block *s = sdp->sd_vfs; + const struct gfs2_sb *str = buf; + + sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); + sb->sb_type = be32_to_cpu(str->sb_header.mh_type); + sb->sb_format = be32_to_cpu(str->sb_header.mh_format); + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); + sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); + sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); + sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); + memcpy(s->s_uuid, str->sb_uuid, 16); +} + +/** + * gfs2_read_super - Read the gfs2 super block from disk + * @sdp: The GFS2 super block + * @sector: The location of the super block + * @error: The error code to return + * + * This uses the bio functions to read the super block from disk + * because we want to be 100% sure that we never read cached data. + * A super block is read twice only during each GFS2 mount and is + * never written to by the filesystem. The first time its read no + * locks are held, and the only details which are looked at are those + * relating to the locking protocol. Once locking is up and working, + * the sb is read again under the lock to establish the location of + * the master directory (contains pointers to journals etc) and the + * root directory. + * + * Returns: 0 on success or error + */ + +static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_sb *p; + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) + return -ENOMEM; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(READ_SYNC | REQ_META, bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return -EIO; + } + p = kmap(page); + gfs2_sb_in(sdp, p); + kunmap(page); + __free_page(page); + return gfs2_check_sb(sdp, silent); +} + +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @silent: Don't print message if mount fails + * + */ + +static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) +{ + u32 hash_blocks, ind_blocks, leaf_blocks; + u32 tmp_blocks; + unsigned int x; + int error; + + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); + if (error) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return error; + } + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(u64); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */ + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_heightsize[x - 1] || m) + break; + sdp->sd_heightsize[x] = space; + } + sdp->sd_max_height = x; + sdp->sd_heightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_jheightsize[x - 1] || m) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + sdp->sd_jheightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + + return 0; +} + +static int init_names(struct gfs2_sbd *sdp, int silent) +{ + char *proto, *table; + int error = 0; + + proto = sdp->sd_args.ar_lockproto; + table = sdp->sd_args.ar_locktable; + + /* Try to autodetect */ + + if (!proto[0] || !table[0]) { + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); + if (error) + return error; + + if (!proto[0]) + proto = sdp->sd_sb.sb_lockproto; + if (!table[0]) + table = sdp->sd_sb.sb_locktable; + } + + if (!table[0]) + table = sdp->sd_vfs->s_id; + + strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); + strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); + + table = sdp->sd_table_name; + while ((table = strchr(table, '/'))) + *table = '_'; + + return error; +} + +static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, + int undo) +{ + int error = 0; + + if (undo) + goto fail_trans; + + error = gfs2_glock_nq_num(sdp, + GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + mount_gh); + if (error) { + fs_err(sdp, "can't acquire mount glock: %d\n", error); + goto fail; + } + + error = gfs2_glock_nq_num(sdp, + GFS2_LIVE_LOCK, &gfs2_nondisk_glops, + LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_live_gh); + if (error) { + fs_err(sdp, "can't acquire live glock: %d\n", error); + goto fail_mount; + } + + error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops, + CREATE, &sdp->sd_rename_gl); + if (error) { + fs_err(sdp, "can't create rename glock: %d\n", error); + goto fail_live; + } + + error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops, + CREATE, &sdp->sd_freeze_gl); + if (error) { + fs_err(sdp, "can't create transaction glock: %d\n", error); + goto fail_rename; + } + + return 0; + +fail_trans: + gfs2_glock_put(sdp->sd_freeze_gl); +fail_rename: + gfs2_glock_put(sdp->sd_rename_gl); +fail_live: + gfs2_glock_dq_uninit(&sdp->sd_live_gh); +fail_mount: + gfs2_glock_dq_uninit(mount_gh); +fail: + return error; +} + +static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, + u64 no_addr, const char *name) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct dentry *dentry; + struct inode *inode; + + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + if (IS_ERR(inode)) { + fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); + return PTR_ERR(inode); + } + dentry = d_make_root(inode); + if (!dentry) { + fs_err(sdp, "can't alloc %s dentry\n", name); + return -ENOMEM; + } + *dptr = dentry; + return 0; +} + +static int init_sb(struct gfs2_sbd *sdp, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder sb_gh; + u64 no_addr; + int ret; + + ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (ret) { + fs_err(sdp, "can't acquire superblock glock: %d\n", ret); + return ret; + } + + ret = gfs2_read_sb(sdp, silent); + if (ret) { + fs_err(sdp, "can't read superblock: %d\n", ret); + goto out; + } + + /* Set up the buffer cache and SB for real */ + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { + ret = -EINVAL; + fs_err(sdp, "FS block size (%u) is too small for device " + "block size (%u)\n", + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); + goto out; + } + if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { + ret = -EINVAL; + fs_err(sdp, "FS block size (%u) is too big for machine " + "page size (%u)\n", + sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); + goto out; + } + sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); + + /* Get the root inode */ + no_addr = sdp->sd_sb.sb_root_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root"); + if (ret) + goto out; + + /* Get the master inode */ + no_addr = sdp->sd_sb.sb_master_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master"); + if (ret) { + dput(sdp->sd_root_dir); + goto out; + } + sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir); +out: + gfs2_glock_dq_uninit(&sb_gh); + return ret; +} + +static void gfs2_others_may_mount(struct gfs2_sbd *sdp) +{ + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; + + fs_info(sdp, "first mount done, others may mount\n"); + + if (sdp->sd_lockstruct.ls_ops->lm_first_done) + sdp->sd_lockstruct.ls_ops->lm_first_done(sdp); + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); +} + +/** + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock + * + * Returns: errno + */ + +static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) +{ + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); + if (error == -ENOENT) { + error = 0; + break; + } + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + INIT_LIST_HEAD(&jd->extent_list); + INIT_LIST_HEAD(&jd->jd_revoke_list); + + INIT_WORK(&jd->jd_work, gfs2_recover_func); + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); + if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; + } + + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + +/** + * check_journal_clean - Make sure a journal is clean for a spectator mount + * @sdp: The GFS2 superblock + * @jd: The journal descriptor + * + * Returns: 0 if the journal is clean or locked, else an error + */ +static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) +{ + int error; + struct gfs2_holder j_gh; + struct gfs2_log_header_host head; + struct gfs2_inode *ip; + + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | + GL_EXACT | GL_NOCACHE, &j_gh); + if (error) { + fs_err(sdp, "Error locking journal for spectator mount.\n"); + return -EPERM; + } + error = gfs2_jdesc_check(jd); + if (error) { + fs_err(sdp, "Error checking journal for spectator mount.\n"); + goto out_unlock; + } + error = gfs2_find_jhead(jd, &head); + if (error) { + fs_err(sdp, "Error parsing journal for spectator mount.\n"); + goto out_unlock; + } + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EPERM; + fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter " + "must not be a spectator.\n", jd->jd_jid); + } + +out_unlock: + gfs2_glock_dq_uninit(&j_gh); + return error; +} + +static int init_journal(struct gfs2_sbd *sdp, int undo) +{ + struct inode *master = d_inode(sdp->sd_master_dir); + struct gfs2_holder ji_gh; + struct gfs2_inode *ip; + int jindex = 1; + int error = 0; + + if (undo) { + jindex = 0; + goto fail_jinode_gh; + } + + sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); + if (IS_ERR(sdp->sd_jindex)) { + fs_err(sdp, "can't lookup journal index: %d\n", error); + return PTR_ERR(sdp->sd_jindex); + } + + /* Load in the journal index special file */ + + error = gfs2_jindex_hold(sdp, &ji_gh); + if (error) { + fs_err(sdp, "can't read journal index: %d\n", error); + goto fail; + } + + error = -EUSERS; + if (!gfs2_jindex_size(sdp)) { + fs_err(sdp, "no journals!\n"); + goto fail_jindex; + } + + if (sdp->sd_args.ar_spectator) { + sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); + } else { + if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { + fs_err(sdp, "can't mount journal #%u\n", + sdp->sd_lockstruct.ls_jid); + fs_err(sdp, "there are only %u journals (0 - %u)\n", + gfs2_jindex_size(sdp), + gfs2_jindex_size(sdp) - 1); + goto fail_jindex; + } + sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid); + + error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, + &gfs2_journal_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP, + &sdp->sd_journal_gh); + if (error) { + fs_err(sdp, "can't acquire journal glock: %d\n", error); + goto fail_jindex; + } + + ip = GFS2_I(sdp->sd_jdesc->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE, + &sdp->sd_jinode_gh); + if (error) { + fs_err(sdp, "can't acquire journal inode glock: %d\n", + error); + goto fail_journal_gh; + } + + error = gfs2_jdesc_check(sdp->sd_jdesc); + if (error) { + fs_err(sdp, "my journal (%u) is bad: %d\n", + sdp->sd_jdesc->jd_jid, error); + goto fail_jinode_gh; + } + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); + + /* Map the extents for this journal's blocks */ + gfs2_map_journal_extents(sdp, sdp->sd_jdesc); + } + trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); + + if (sdp->sd_lockstruct.ls_first) { + unsigned int x; + for (x = 0; x < sdp->sd_journals; x++) { + struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x); + + if (sdp->sd_args.ar_spectator) { + error = check_journal_clean(sdp, jd); + if (error) + goto fail_jinode_gh; + continue; + } + error = gfs2_recover_journal(jd, true); + if (error) { + fs_err(sdp, "error recovering journal %u: %d\n", + x, error); + goto fail_jinode_gh; + } + } + + gfs2_others_may_mount(sdp); + } else if (!sdp->sd_args.ar_spectator) { + error = gfs2_recover_journal(sdp->sd_jdesc, true); + if (error) { + fs_err(sdp, "error recovering my journal: %d\n", error); + goto fail_jinode_gh; + } + } + + set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); + gfs2_glock_dq_uninit(&ji_gh); + jindex = 0; + INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func); + return 0; + +fail_jinode_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); +fail_journal_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); +fail_jindex: + gfs2_jindex_free(sdp); + if (jindex) + gfs2_glock_dq_uninit(&ji_gh); +fail: + iput(sdp->sd_jindex); + return error; +} + +static struct lock_class_key gfs2_quota_imutex_key; + +static int init_inodes(struct gfs2_sbd *sdp, int undo) +{ + int error = 0; + struct inode *master = d_inode(sdp->sd_master_dir); + + if (undo) + goto fail_qinode; + + error = init_journal(sdp, undo); + complete_all(&sdp->sd_journal_ready); + if (error) + goto fail; + + /* Read in the master statfs inode */ + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail_journal; + } + + /* Read in the resource index inode */ + sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); + if (IS_ERR(sdp->sd_rindex)) { + error = PTR_ERR(sdp->sd_rindex); + fs_err(sdp, "can't get resource index inode: %d\n", error); + goto fail_statfs; + } + sdp->sd_rindex_uptodate = 0; + + /* Read in the quota inode */ + sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); + if (IS_ERR(sdp->sd_quota_inode)) { + error = PTR_ERR(sdp->sd_quota_inode); + fs_err(sdp, "can't get quota file inode: %d\n", error); + goto fail_rindex; + } + /* + * i_mutex on quota files is special. Since this inode is hidden system + * file, we are safe to define locking ourselves. + */ + lockdep_set_class(&sdp->sd_quota_inode->i_mutex, + &gfs2_quota_imutex_key); + + error = gfs2_rindex_update(sdp); + if (error) + goto fail_qinode; + + return 0; + +fail_qinode: + iput(sdp->sd_quota_inode); +fail_rindex: + gfs2_clear_rgrpd(sdp); + iput(sdp->sd_rindex); +fail_statfs: + iput(sdp->sd_statfs_inode); +fail_journal: + init_journal(sdp, UNDO); +fail: + return error; +} + +static int init_per_node(struct gfs2_sbd *sdp, int undo) +{ + struct inode *pn = NULL; + char buf[30]; + int error = 0; + struct gfs2_inode *ip; + struct inode *master = d_inode(sdp->sd_master_dir); + + if (sdp->sd_args.ar_spectator) + return 0; + + if (undo) + goto fail_qc_gh; + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + return error; + } + + sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_sc_inode)) { + error = PTR_ERR(sdp->sd_sc_inode); + fs_err(sdp, "can't find local \"sc\" file: %d\n", error); + goto fail; + } + + sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_qc_inode)) { + error = PTR_ERR(sdp->sd_qc_inode); + fs_err(sdp, "can't find local \"qc\" file: %d\n", error); + goto fail_ut_i; + } + + iput(pn); + pn = NULL; + + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto fail_qc_i; + } + + ip = GFS2_I(sdp->sd_qc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_qc_gh); + if (error) { + fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); + goto fail_ut_gh; + } + + return 0; + +fail_qc_gh: + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); +fail_ut_gh: + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); +fail_qc_i: + iput(sdp->sd_qc_inode); +fail_ut_i: + iput(sdp->sd_sc_inode); +fail: + if (pn) + iput(pn); + return error; +} + +static const match_table_t nolock_tokens = { + { Opt_jid, "jid=%d\n", }, + { Opt_err, NULL }, +}; + +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_put_lock = gfs2_glock_free, + .lm_tokens = &nolock_tokens, +}; + +/** + * gfs2_lm_mount - mount a locking protocol + * @sdp: the filesystem + * @args: mount arguments + * @silent: if 1, don't complain if the FS isn't a GFS2 fs + * + * Returns: errno + */ + +static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) +{ + const struct lm_lockops *lm; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + struct gfs2_args *args = &sdp->sd_args; + const char *proto = sdp->sd_proto_name; + const char *table = sdp->sd_table_name; + char *o, *options; + int ret; + + if (!strcmp("lock_nolock", proto)) { + lm = &nolock_ops; + sdp->sd_args.ar_localflocks = 1; +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + } else if (!strcmp("lock_dlm", proto)) { + lm = &gfs2_dlm_ops; +#endif + } else { + pr_info("can't find protocol %s\n", proto); + return -ENOENT; + } + + fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); + + ls->ls_ops = lm; + ls->ls_first = 1; + + for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { + substring_t tmp[MAX_OPT_ARGS]; + int token, option; + + if (!o || !*o) + continue; + + token = match_token(o, *lm->lm_tokens, tmp); + switch (token) { + case Opt_jid: + ret = match_int(&tmp[0], &option); + if (ret || option < 0) + goto hostdata_error; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + ls->ls_jid = option; + break; + case Opt_id: + case Opt_nodir: + /* Obsolete, but left for backward compat purposes */ + break; + case Opt_first: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_first = option; + break; + case Opt_err: + default: +hostdata_error: + fs_info(sdp, "unknown hostdata (%s)\n", o); + return -EINVAL; + } + } + + if (lm->lm_mount == NULL) { + fs_info(sdp, "Now mounting FS...\n"); + complete_all(&sdp->sd_locking_init); + return 0; + } + ret = lm->lm_mount(sdp, table); + if (ret == 0) + fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + complete_all(&sdp->sd_locking_init); + return ret; +} + +void gfs2_lm_unmount(struct gfs2_sbd *sdp) +{ + const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + lm->lm_unmount) + lm->lm_unmount(sdp); +} + +static int wait_on_journal(struct gfs2_sbd *sdp) +{ + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 0; + + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE) + ? -EINTR : 0; +} + +void gfs2_online_uevent(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); +} + +/** + * fill_super - Read in superblock + * @sb: The VFS superblock + * @data: Mount options + * @silent: Don't complain if it's not a GFS2 filesystem + * + * Returns: errno + */ + +static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) +{ + struct gfs2_sbd *sdp; + struct gfs2_holder mount_gh; + int error; + + sdp = init_sbd(sb); + if (!sdp) { + pr_warn("can't alloc struct gfs2_sbd\n"); + return -ENOMEM; + } + sdp->sd_args = *args; + + if (sdp->sd_args.ar_spectator) { + sb->s_flags |= MS_RDONLY; + set_bit(SDF_RORECOVERY, &sdp->sd_flags); + } + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + + sb->s_flags |= MS_NOSEC; + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; + sb->s_export_op = &gfs2_export_ops; + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + /* Set up the buffer cache and fill in some fake block size values + to allow us to read-in the on-disk superblock. */ + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK); + sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + + sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; + if (sdp->sd_args.ar_statfs_quantum) { + sdp->sd_tune.gt_statfs_slow = 0; + sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum; + } else { + sdp->sd_tune.gt_statfs_slow = 1; + sdp->sd_tune.gt_statfs_quantum = 30; + } + + error = init_names(sdp, silent); + if (error) { + /* In this case, we haven't initialized sysfs, so we have to + manually free the sdp. */ + free_percpu(sdp->sd_lkstats); + kfree(sdp); + sb->s_fs_info = NULL; + return error; + } + + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + + error = gfs2_sys_fs_add(sdp); + /* + * If we hit an error here, gfs2_sys_fs_add will have called function + * kobject_put which causes the sysfs usage count to go to zero, which + * causes sysfs to call function gfs2_sbd_release, which frees sdp. + * Subsequent error paths here will call gfs2_sys_fs_del, which also + * kobject_put to free sdp. + */ + if (error) + return error; + + gfs2_create_debugfs_file(sdp); + + error = gfs2_lm_mount(sdp, silent); + if (error) + goto fail_debug; + + error = init_locking(sdp, &mount_gh, DO); + if (error) + goto fail_lm; + + error = init_sb(sdp, silent); + if (error) + goto fail_locking; + + error = wait_on_journal(sdp); + if (error) + goto fail_sb; + + /* + * If user space has failed to join the cluster or some similar + * failure has occurred, then the journal id will contain a + * negative (error) number. This will then be returned to the + * caller (of the mount syscall). We do this even for spectator + * mounts (which just write a jid of 0 to indicate "ok" even though + * the jid is unused in the spectator case) + */ + if (sdp->sd_lockstruct.ls_jid < 0) { + error = sdp->sd_lockstruct.ls_jid; + sdp->sd_lockstruct.ls_jid = 0; + goto fail_sb; + } + + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + sdp->sd_table_name); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); + + error = init_inodes(sdp, DO); + if (error) + goto fail_sb; + + error = init_per_node(sdp, DO); + if (error) + goto fail_inodes; + + error = gfs2_statfs_init(sdp); + if (error) { + fs_err(sdp, "can't initialize statfs subsystem: %d\n", error); + goto fail_per_node; + } + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_rw(sdp); + if (error) { + fs_err(sdp, "can't make FS RW: %d\n", error); + goto fail_per_node; + } + } + + gfs2_glock_dq_uninit(&mount_gh); + gfs2_online_uevent(sdp); + return 0; + +fail_per_node: + init_per_node(sdp, UNDO); +fail_inodes: + init_inodes(sdp, UNDO); +fail_sb: + if (sdp->sd_root_dir) + dput(sdp->sd_root_dir); + if (sdp->sd_master_dir) + dput(sdp->sd_master_dir); + if (sb->s_root) + dput(sb->s_root); + sb->s_root = NULL; +fail_locking: + init_locking(sdp, &mount_gh, UNDO); +fail_lm: + complete_all(&sdp->sd_journal_ready); + gfs2_gl_hash_clear(sdp); + gfs2_lm_unmount(sdp); +fail_debug: + gfs2_delete_debugfs_file(sdp); + free_percpu(sdp->sd_lkstats); + /* gfs2_sys_fs_del must be the last thing we do, since it causes + * sysfs to call function gfs2_sbd_release, which frees sdp. */ + gfs2_sys_fs_del(sdp); + sb->s_fs_info = NULL; + return error; +} + +static int set_gfs2_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + return 0; +} + +static int test_gfs2_super(struct super_block *s, void *ptr) +{ + struct block_device *bdev = ptr; + return (bdev == s->s_bdev); +} + +/** + * gfs2_mount - Get the GFS2 superblock + * @fs_type: The GFS2 filesystem type + * @flags: Mount flags + * @dev_name: The name of the device + * @data: The mount arguments + * + * Q. Why not use get_sb_bdev() ? + * A. We need to select one of two root directories to mount, independent + * of whether this is the initial, or subsequent, mount of this sb + * + * Returns: 0 or -ve on error + */ + +static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + int error; + struct gfs2_args args; + struct gfs2_sbd *sdp; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } + s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = PTR_ERR(s); + if (IS_ERR(s)) + goto error_bdev; + + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); + blkdev_put(bdev, mode); + down_write(&s->s_umount); + } + + memset(&args, 0, sizeof(args)); + args.ar_quota = GFS2_QUOTA_DEFAULT; + args.ar_data = GFS2_DATA_DEFAULT; + args.ar_commit = 30; + args.ar_statfs_quantum = 30; + args.ar_quota_quantum = 60; + args.ar_errors = GFS2_ERRORS_DEFAULT; + + error = gfs2_mount_args(&args, data); + if (error) { + pr_warn("can't parse mount arguments\n"); + goto error_super; + } + + if (s->s_root) { + error = -EBUSY; + if ((flags ^ s->s_flags) & MS_RDONLY) + goto error_super; + } else { + char b[BDEVNAME_SIZE]; + + s->s_mode = mode; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); + if (error) + goto error_super; + s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; + } + + sdp = s->s_fs_info; + if (args.ar_meta) + return dget(sdp->sd_master_dir); + else + return dget(sdp->sd_root_dir); + +error_super: + deactivate_locked_super(s); + return ERR_PTR(error); +error_bdev: + blkdev_put(bdev, mode); + return ERR_PTR(error); +} + +static int set_meta_super(struct super_block *s, void *ptr) +{ + return -EINVAL; +} + +static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct super_block *s; + struct gfs2_sbd *sdp; + struct path path; + int error; + + error = kern_path(dev_name, LOOKUP_FOLLOW, &path); + if (error) { + pr_warn("path_lookup on %s returned error %d\n", + dev_name, error); + return ERR_PTR(error); + } + s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, + d_inode(path.dentry)->i_sb->s_bdev); + path_put(&path); + if (IS_ERR(s)) { + pr_warn("gfs2 mount does not exist\n"); + return ERR_CAST(s); + } + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + return ERR_PTR(-EBUSY); + } + sdp = s->s_fs_info; + return dget(sdp->sd_master_dir); +} + +static void gfs2_kill_sb(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + if (sdp == NULL) { + kill_block_super(sb); + return; + } + + gfs2_log_flush(sdp, NULL, SYNC_FLUSH); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; + shrink_dcache_sb(sb); + gfs2_delete_debugfs_file(sdp); + free_percpu(sdp->sd_lkstats); + kill_block_super(sb); +} + +struct file_system_type gfs2_fs_type = { + .name = "gfs2", + .fs_flags = FS_REQUIRES_DEV, + .mount = gfs2_mount, + .kill_sb = gfs2_kill_sb, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_FS("gfs2"); + +struct file_system_type gfs2meta_fs_type = { + .name = "gfs2meta", + .fs_flags = FS_REQUIRES_DEV, + .mount = gfs2_mount_meta, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_FS("gfs2meta"); diff --git a/kernel/fs/gfs2/quota.c b/kernel/fs/gfs2/quota.c new file mode 100644 index 000000000..e3065cb9a --- /dev/null +++ b/kernel/fs/gfs2/quota.c @@ -0,0 +1,1680 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +/* + * Quota change tags are associated with each transaction that allocates or + * deallocates space. Those changes are accumulated locally to each node (in a + * per-node file) and then are periodically synced to the quota file. This + * avoids the bottleneck of constantly touching the quota file, but introduces + * fuzziness in the current usage value of IDs that are being used on different + * nodes in the cluster simultaneously. So, it is possible for a user on + * multiple nodes to overrun their quota, but that overrun is controlable. + * Since quota tags are part of transactions, there is no need for a quota check + * program to be run on node crashes or anything like that. + * + * There are couple of knobs that let the administrator manage the quota + * fuzziness. "quota_quantum" sets the maximum time a quota change can be + * sitting on one node before being synced to the quota file. (The default is + * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency + * of quota file syncs increases as the user moves closer to their limit. The + * more frequent the syncs, the more accurate the quota enforcement, but that + * means that there is more contention between the nodes for the quota file. + * The default value is one. This sets the maximum theoretical quota overrun + * (with infinite node with infinite bandwidth) to twice the user's limit. (In + * practice, the maximum overrun you see should be much less.) A "quota_scale" + * number greater than one makes quota syncs more frequent and reduces the + * maximum overrun. Numbers less than one (but greater than zero) make quota + * syncs less frequent. + * + * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of + * the quota file, so it is not being constantly read. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "inode.h" +#include "util.h" + +#define GFS2_QD_HASH_SHIFT 12 +#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) + +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ +/* -> sd_bitmap_lock */ +static DEFINE_SPINLOCK(qd_lock); +struct list_lru gfs2_qd_lru; + +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; + +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, + const struct kqid qid) +{ + unsigned int h; + + h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); + h = jhash(&qid, sizeof(struct kqid), h); + + return h & GFS2_QD_HASH_MASK; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&qd_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&qd_hash_table[hash]); +} + +static void gfs2_qd_dealloc(struct rcu_head *rcu) +{ + struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + kmem_cache_free(gfs2_quotad_cachep, qd); +} + +static void gfs2_qd_dispose(struct list_head *list) +{ + struct gfs2_quota_data *qd; + struct gfs2_sbd *sdp; + + while (!list_empty(list)) { + qd = list_entry(list->next, struct gfs2_quota_data, qd_lru); + sdp = qd->qd_gl->gl_sbd; + + list_del(&qd->qd_lru); + + /* Free from the filesystem-specific list */ + spin_lock(&qd_lock); + list_del(&qd->qd_list); + spin_unlock(&qd_lock); + + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_glock_put(qd->qd_gl); + atomic_dec(&sdp->sd_quota_count); + + /* Delete it from the common reclaim list */ + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); + } +} + + +static enum lru_status gfs2_qd_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *dispose = arg; + struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); + + if (!spin_trylock(&qd->qd_lockref.lock)) + return LRU_SKIP; + + if (qd->qd_lockref.count == 0) { + lockref_mark_dead(&qd->qd_lockref); + list_lru_isolate_move(lru, &qd->qd_lru, dispose); + } + + spin_unlock(&qd->qd_lockref.lock); + return LRU_REMOVED; +} + +static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, + gfs2_qd_isolate, &dispose); + + gfs2_qd_dispose(&dispose); + + return freed; +} + +static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); +} + +struct shrinker gfs2_qd_shrinker = { + .count_objects = gfs2_qd_shrink_count, + .scan_objects = gfs2_qd_shrink_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + + +static u64 qd2index(struct gfs2_quota_data *qd) +{ + struct kqid qid = qd->qd_id; + return (2 * (u64)from_kqid(&init_user_ns, qid)) + + ((qid.type == USRQUOTA) ? 0 : 1); +} + +static u64 qd2offset(struct gfs2_quota_data *qd) +{ + u64 offset; + + offset = qd2index(qd); + offset *= sizeof(struct gfs2_quota); + + return offset; +} + +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) +{ + struct gfs2_quota_data *qd; + int error; + + qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); + if (!qd) + return NULL; + + qd->qd_sbd = sdp; + qd->qd_lockref.count = 1; + spin_lock_init(&qd->qd_lockref.lock); + qd->qd_id = qid; + qd->qd_slot = -1; + INIT_LIST_HEAD(&qd->qd_lru); + qd->qd_hash = hash; + + error = gfs2_glock_get(sdp, qd2index(qd), + &gfs2_quota_glops, CREATE, &qd->qd_gl); + if (error) + goto fail; + + return qd; + +fail: + kmem_cache_free(gfs2_quotad_cachep, qd); + return NULL; +} + +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + struct kqid qid) +{ + struct gfs2_quota_data *qd; + struct hlist_bl_node *h; + + hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { + if (!qid_eq(qd->qd_id, qid)) + continue; + if (qd->qd_sbd != sdp) + continue; + if (lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + return qd; + } + } + + return NULL; +} + + +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd, *new_qd; + unsigned int hash = gfs2_qd_hash(sdp, qid); + + rcu_read_lock(); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + rcu_read_unlock(); + + if (qd) + return 0; + + new_qd = qd_alloc(hash, sdp, qid); + if (!new_qd) + return -ENOMEM; + + spin_lock(&qd_lock); + spin_lock_bucket(hash); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + if (qd == NULL) { + *qdp = new_qd; + list_add(&new_qd->qd_list, &sdp->sd_quota_list); + hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); + atomic_inc(&sdp->sd_quota_count); + } + spin_unlock_bucket(hash); + spin_unlock(&qd_lock); + + if (qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); + } + + return 0; +} + + +static void qd_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref)); + lockref_get(&qd->qd_lockref); +} + +static void qd_put(struct gfs2_quota_data *qd) +{ + if (lockref_put_or_lock(&qd->qd_lockref)) + return; + + qd->qd_lockref.count = 0; + list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + spin_unlock(&qd->qd_lockref.lock); + +} + +static int slot_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + unsigned int bit; + int error = 0; + + spin_lock(&sdp->sd_bitmap_lock); + if (qd->qd_slot_count != 0) + goto out; + + error = -ENOSPC; + bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); + if (bit < sdp->sd_quota_slots) { + set_bit(bit, sdp->sd_quota_bitmap); + qd->qd_slot = bit; + error = 0; +out: + qd->qd_slot_count++; + } + spin_unlock(&sdp->sd_bitmap_lock); + + return error; +} + +static void slot_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + + spin_lock(&sdp->sd_bitmap_lock); + gfs2_assert(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + spin_unlock(&sdp->sd_bitmap_lock); +} + +static void slot_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + + spin_lock(&sdp->sd_bitmap_lock); + gfs2_assert(sdp, qd->qd_slot_count); + if (!--qd->qd_slot_count) { + BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); + qd->qd_slot = -1; + } + spin_unlock(&sdp->sd_bitmap_lock); +} + +static int bh_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + unsigned int block, offset; + struct buffer_head *bh; + int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + + mutex_lock(&sdp->sd_quota_mutex); + + if (qd->qd_bh_count++) { + mutex_unlock(&sdp->sd_quota_mutex); + return 0; + } + + block = qd->qd_slot / sdp->sd_qc_per_block; + offset = qd->qd_slot % sdp->sd_qc_per_block; + + bh_map.b_size = 1 << ip->i_inode.i_blkbits; + error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); + if (error) + goto fail; + error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); + if (error) + goto fail; + error = -EIO; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) + goto fail_brelse; + + qd->qd_bh = bh; + qd->qd_bh_qc = (struct gfs2_quota_change *) + (bh->b_data + sizeof(struct gfs2_meta_header) + + offset * sizeof(struct gfs2_quota_change)); + + mutex_unlock(&sdp->sd_quota_mutex); + + return 0; + +fail_brelse: + brelse(bh); +fail: + qd->qd_bh_count--; + mutex_unlock(&sdp->sd_quota_mutex); + return error; +} + +static void bh_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_assert(sdp, qd->qd_bh_count); + if (!--qd->qd_bh_count) { + brelse(qd->qd_bh); + qd->qd_bh = NULL; + qd->qd_bh_qc = NULL; + } + mutex_unlock(&sdp->sd_quota_mutex); +} + +static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, + u64 *sync_gen) +{ + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + (sync_gen && (qd->qd_sync_gen >= *sync_gen))) + return 0; + + if (!lockref_get_not_dead(&qd->qd_lockref)) + return 0; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + set_bit(QDF_LOCKED, &qd->qd_flags); + qd->qd_change_sync = qd->qd_change; + slot_hold(qd); + return 1; +} + +static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL; + int error; + int found = 0; + + *qdp = NULL; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&qd_lock); + + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen); + if (found) + break; + } + + if (!found) + qd = NULL; + + spin_unlock(&qd_lock); + + if (qd) { + gfs2_assert_warn(sdp, qd->qd_change_sync); + error = bh_get(qd); + if (error) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return error; + } + } + + *qdp = qd; + + return 0; +} + +static void qd_unlock(struct gfs2_quota_data *qd) +{ + gfs2_assert_warn(qd->qd_gl->gl_sbd, + test_bit(QDF_LOCKED, &qd->qd_flags)); + clear_bit(QDF_LOCKED, &qd->qd_flags); + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + int error; + + error = qd_get(sdp, qid, qdp); + if (error) + return error; + + error = slot_get(*qdp); + if (error) + goto fail; + + error = bh_get(*qdp); + if (error) + goto fail_slot; + + return 0; + +fail_slot: + slot_put(*qdp); +fail: + qd_put(*qdp); + return error; +} + +static void qdsb_put(struct gfs2_quota_data *qd) +{ + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_quota_data **qd; + int error; + + if (ip->i_res == NULL) { + error = gfs2_rs_alloc(ip); + if (error) + return error; + } + + qd = ip->i_res->rs_qa_qd; + + if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) || + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) + return -EIO; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); + if (error) + goto out; + ip->i_res->rs_qa_qd_num++; + qd++; + + error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); + if (error) + goto out; + ip->i_res->rs_qa_qd_num++; + qd++; + + if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && + !uid_eq(uid, ip->i_inode.i_uid)) { + error = qdsb_get(sdp, make_kqid_uid(uid), qd); + if (error) + goto out; + ip->i_res->rs_qa_qd_num++; + qd++; + } + + if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) && + !gid_eq(gid, ip->i_inode.i_gid)) { + error = qdsb_get(sdp, make_kqid_gid(gid), qd); + if (error) + goto out; + ip->i_res->rs_qa_qd_num++; + qd++; + } + +out: + if (error) + gfs2_quota_unhold(ip); + return error; +} + +void gfs2_quota_unhold(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int x; + + if (ip->i_res == NULL) + return; + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); + + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qdsb_put(ip->i_res->rs_qa_qd[x]); + ip->i_res->rs_qa_qd[x] = NULL; + } + ip->i_res->rs_qa_qd_num = 0; +} + +static int sort_qd(const void *a, const void *b) +{ + const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; + const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; + + if (qid_lt(qd_a->qd_id, qd_b->qd_id)) + return -1; + if (qid_lt(qd_b->qd_id, qd_a->qd_id)) + return 1; + return 0; +} + +static void do_qc(struct gfs2_quota_data *qd, s64 change) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + struct gfs2_quota_change *qc = qd->qd_bh_qc; + s64 x; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); + + if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { + qc->qc_change = 0; + qc->qc_flags = 0; + if (qd->qd_id.type == USRQUOTA) + qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); + qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id)); + } + + x = be64_to_cpu(qc->qc_change) + change; + qc->qc_change = cpu_to_be64(x); + + spin_lock(&qd_lock); + qd->qd_change = x; + spin_unlock(&qd_lock); + + if (!x) { + gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); + clear_bit(QDF_CHANGE, &qd->qd_flags); + qc->qc_flags = 0; + qc->qc_id = 0; + slot_put(qd); + qd_put(qd); + } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) { + qd_hold(qd); + slot_hold(qd); + } + + mutex_unlock(&sdp->sd_quota_mutex); +} + +/** + * gfs2_adjust_quota - adjust record of current block usage + * @ip: The quota inode + * @loc: Offset of the entry in the quota file + * @change: The amount of usage change to record + * @qd: The quota data + * @fdq: The updated limits to record + * + * This function was mostly borrowed from gfs2_block_truncate_page which was + * in turn mostly borrowed from ext3 + * + * Returns: 0 or -ve on error + */ + +static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, + s64 change, struct gfs2_quota_data *qd, + struct qc_dqblk *fdq) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long index = loc >> PAGE_CACHE_SHIFT; + unsigned offset = loc & (PAGE_CACHE_SIZE - 1); + unsigned blocksize, iblock, pos; + struct buffer_head *bh; + struct page *page; + void *kaddr, *ptr; + struct gfs2_quota q; + int err, nbytes; + u64 size; + + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip, NULL); + if (err) + return err; + } + + memset(&q, 0, sizeof(struct gfs2_quota)); + err = gfs2_internal_read(ip, (char *)&q, &loc, sizeof(q)); + if (err < 0) + return err; + + err = -EIO; + be64_add_cpu(&q.qu_value, change); + qd->qd_qb.qb_value = q.qu_value; + if (fdq) { + if (fdq->d_fieldmask & QC_SPC_SOFT) { + q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift); + qd->qd_qb.qb_warn = q.qu_warn; + } + if (fdq->d_fieldmask & QC_SPC_HARD) { + q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift); + qd->qd_qb.qb_limit = q.qu_limit; + } + if (fdq->d_fieldmask & QC_SPACE) { + q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift); + qd->qd_qb.qb_value = q.qu_value; + } + } + + /* Write the quota into the quota file on disk */ + ptr = &q; + nbytes = sizeof(struct gfs2_quota); +get_a_page: + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) + return -ENOMEM; + + blocksize = inode->i_sb->s_blocksize; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, iblock, bh, 1); + if (!buffer_mapped(bh)) + goto unlock_out; + /* If it's a newly allocated disk block for quota, zero it */ + if (buffer_new(bh)) + zero_user(page, pos - blocksize, bh->b_size); + } + + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + ll_rw_block(READ | REQ_META, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock_out; + } + + gfs2_trans_add_data(ip->i_gl, bh); + + kaddr = kmap_atomic(page); + if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) + nbytes = PAGE_CACHE_SIZE - offset; + memcpy(kaddr + offset, ptr, nbytes); + flush_dcache_page(page); + kunmap_atomic(kaddr); + unlock_page(page); + page_cache_release(page); + + /* If quota straddles page boundary, we need to update the rest of the + * quota at the beginning of the next page */ + if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { + ptr = ptr + nbytes; + nbytes = sizeof(struct gfs2_quota) - nbytes; + offset = 0; + index++; + goto get_a_page; + } + + size = loc + sizeof(struct gfs2_quota); + if (size > inode->i_size) + i_size_write(inode, size); + inode->i_mtime = inode->i_atime = CURRENT_TIME; + mark_inode_dirty(inode); + set_bit(QDF_REFRESH, &qd->qd_flags); + return 0; + +unlock_out: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) +{ + struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; + unsigned int data_blocks, ind_blocks; + struct gfs2_holder *ghs, i_gh; + unsigned int qx, x; + struct gfs2_quota_data *qd; + unsigned reserved; + loff_t offset; + unsigned int nalloc = 0, blocks; + int error; + + error = gfs2_rs_alloc(ip); + if (error) + return error; + + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + + ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); + if (!ghs) + return -ENOMEM; + + sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); + mutex_lock(&ip->i_inode.i_mutex); + for (qx = 0; qx < num_qd; qx++) { + error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &ghs[qx]); + if (error) + goto out; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out; + + for (x = 0; x < num_qd; x++) { + offset = qd2offset(qda[x]); + if (gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota))) + nalloc++; + } + + /* + * 1 blk for unstuffing inode if stuffed. We add this extra + * block to the reservation unconditionally. If the inode + * doesn't need unstuffing, the block will be released to the + * rgrp since it won't be allocated during the transaction + */ + /* +3 in the end for unstuffing block, inode size update block + * and another block in case quota straddles page boundary and + * two blocks need to be updated instead of 1 */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; + + reserved = 1 + (nalloc * (data_blocks + ind_blocks)); + ap.target = reserved; + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_alloc; + + if (nalloc) + blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS; + + error = gfs2_trans_begin(sdp, blocks, 0); + if (error) + goto out_ipres; + + for (x = 0; x < num_qd; x++) { + qd = qda[x]; + offset = qd2offset(qd); + error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL); + if (error) + goto out_end_trans; + + do_qc(qd, -qd->qd_change_sync); + set_bit(QDF_REFRESH, &qd->qd_flags); + } + + error = 0; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + gfs2_inplace_release(ip); +out_alloc: + gfs2_glock_dq_uninit(&i_gh); +out: + while (qx--) + gfs2_glock_dq_uninit(&ghs[qx]); + mutex_unlock(&ip->i_inode.i_mutex); + kfree(ghs); + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH); + return error; +} + +static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota q; + struct gfs2_quota_lvb *qlvb; + loff_t pos; + int error; + + memset(&q, 0, sizeof(struct gfs2_quota)); + pos = qd2offset(qd); + error = gfs2_internal_read(ip, (char *)&q, &pos, sizeof(q)); + if (error < 0) + return error; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; + qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); + qlvb->__pad = 0; + qlvb->qb_limit = q.qu_limit; + qlvb->qb_warn = q.qu_warn; + qlvb->qb_value = q.qu_value; + qd->qd_qb = *qlvb; + + return 0; +} + +static int do_glock(struct gfs2_quota_data *qd, int force_refresh, + struct gfs2_holder *q_gh) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_holder i_gh; + int error; + +restart: + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); + if (error) + return error; + + if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) + force_refresh = FORCE; + + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; + + if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { + gfs2_glock_dq_uninit(q_gh); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, q_gh); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + if (error) + goto fail; + + error = update_qd(sdp, qd); + if (error) + goto fail_gunlock; + + gfs2_glock_dq_uninit(&i_gh); + gfs2_glock_dq_uninit(q_gh); + force_refresh = 0; + goto restart; + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + gfs2_glock_dq_uninit(q_gh); + return error; +} + +int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_quota_data *qd; + unsigned int x; + int error = 0; + + error = gfs2_quota_hold(ip, uid, gid); + if (error) + return error; + + if (capable(CAP_SYS_RESOURCE) || + sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + + sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num, + sizeof(struct gfs2_quota_data *), sort_qd, NULL); + + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qd = ip->i_res->rs_qa_qd[x]; + error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]); + if (error) + break; + } + + if (!error) + set_bit(GIF_QD_LOCKED, &ip->i_flags); + else { + while (x--) + gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + gfs2_quota_unhold(ip); + } + + return error; +} + +static int need_sync(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_tune *gt = &sdp->sd_tune; + s64 value; + unsigned int num, den; + int do_sync = 1; + + if (!qd->qd_qb.qb_limit) + return 0; + + spin_lock(&qd_lock); + value = qd->qd_change; + spin_unlock(&qd_lock); + + spin_lock(>->gt_spin); + num = gt->gt_quota_scale_num; + den = gt->gt_quota_scale_den; + spin_unlock(>->gt_spin); + + if (value < 0) + do_sync = 0; + else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >= + (s64)be64_to_cpu(qd->qd_qb.qb_limit)) + do_sync = 0; + else { + value *= gfs2_jindex_size(sdp) * num; + value = div_s64(value, den); + value += (s64)be64_to_cpu(qd->qd_qb.qb_value); + if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) + do_sync = 0; + } + + return do_sync; +} + +void gfs2_quota_unlock(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_quota_data *qda[4]; + unsigned int count = 0; + unsigned int x; + int found; + + if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) + goto out; + + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + struct gfs2_quota_data *qd; + int sync; + + qd = ip->i_res->rs_qa_qd[x]; + sync = need_sync(qd); + + gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + if (!sync) + continue; + + spin_lock(&qd_lock); + found = qd_check_sync(sdp, qd, NULL); + spin_unlock(&qd_lock); + + if (!found) + continue; + + gfs2_assert_warn(sdp, qd->qd_change_sync); + if (bh_get(qd)) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + continue; + } + + qda[count++] = qd; + } + + if (count) { + do_sync(count, qda); + for (x = 0; x < count; x++) + qd_unlock(qda[x]); + } + +out: + gfs2_quota_unhold(ip); +} + +#define MAX_LINE 256 + +static int print_message(struct gfs2_quota_data *qd, char *type) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + fs_info(sdp, "quota %s for %s %u\n", + type, + (qd->qd_id.type == USRQUOTA) ? "user" : "group", + from_kqid(&init_user_ns, qd->qd_id)); + + return 0; +} + +/** + * gfs2_quota_check - check if allocating new blocks will exceed quota + * @ip: The inode for which this check is being performed + * @uid: The uid to check against + * @gid: The gid to check against + * @ap: The allocation parameters. ap->target contains the requested + * blocks. ap->min_target, if set, contains the minimum blks + * requested. + * + * Returns: 0 on success. + * min_req = ap->min_target ? ap->min_target : ap->target; + * quota must allow atleast min_req blks for success and + * ap->allowed is set to the number of blocks allowed + * + * -EDQUOT otherwise, quota violation. ap->allowed is set to number + * of blocks available. + */ +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + struct gfs2_alloc_parms *ap) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_quota_data *qd; + s64 value, warn, limit; + unsigned int x; + int error = 0; + + ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ + if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) + return 0; + + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qd = ip->i_res->rs_qa_qd[x]; + + if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid)))) + continue; + + warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn); + limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); + value = (s64)be64_to_cpu(qd->qd_qb.qb_value); + spin_lock(&qd_lock); + value += qd->qd_change; + spin_unlock(&qd_lock); + + if (limit > 0 && (limit - value) < ap->allowed) + ap->allowed = limit - value; + /* If we can't meet the target */ + if (limit && limit < (value + (s64)ap->target)) { + /* If no min_target specified or we don't meet + * min_target, return -EDQUOT */ + if (!ap->min_target || ap->min_target > ap->allowed) { + print_message(qd, "exceeded"); + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, + QUOTA_NL_BHARDWARN); + error = -EDQUOT; + break; + } + } else if (warn && warn < value && + time_after_eq(jiffies, qd->qd_last_warn + + gfs2_tune_get(sdp, gt_quota_warn_period) + * HZ)) { + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); + error = print_message(qd, "warning"); + qd->qd_last_warn = jiffies; + } + } + return error; +} + +void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + kuid_t uid, kgid_t gid) +{ + struct gfs2_quota_data *qd; + unsigned int x; + + if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) + return; + if (ip->i_diskflags & GFS2_DIF_SYSTEM) + return; + + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qd = ip->i_res->rs_qa_qd[x]; + + if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid))) { + do_qc(qd, change); + } + } +} + +int gfs2_quota_sync(struct super_block *sb, int type) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_quota_data **qda; + unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder); + unsigned int num_qd; + unsigned int x; + int error = 0; + + qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); + if (!qda) + return -ENOMEM; + + mutex_lock(&sdp->sd_quota_sync_mutex); + sdp->sd_quota_sync_gen++; + + do { + num_qd = 0; + + for (;;) { + error = qd_fish(sdp, qda + num_qd); + if (error || !qda[num_qd]) + break; + if (++num_qd == max_qd) + break; + } + + if (num_qd) { + if (!error) + error = do_sync(num_qd, qda); + if (!error) + for (x = 0; x < num_qd; x++) + qda[x]->qd_sync_gen = + sdp->sd_quota_sync_gen; + + for (x = 0; x < num_qd; x++) + qd_unlock(qda[x]); + } + } while (!error && num_qd == max_qd); + + mutex_unlock(&sdp->sd_quota_sync_mutex); + kfree(qda); + + return error; +} + +int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) +{ + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + error = qd_get(sdp, qid, &qd); + if (error) + return error; + + error = do_glock(qd, FORCE, &q_gh); + if (!error) + gfs2_glock_dq_uninit(&q_gh); + + qd_put(qd); + return error; +} + +int gfs2_quota_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + u64 size = i_size_read(sdp->sd_qc_inode); + unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; + unsigned int x, slot = 0; + unsigned int found = 0; + unsigned int hash; + unsigned int bm_size; + u64 dblock; + u32 extlen = 0; + int error; + + if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) + return -EIO; + + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; + bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); + bm_size *= sizeof(unsigned long); + error = -ENOMEM; + sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); + if (sdp->sd_quota_bitmap == NULL) + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | + __GFP_ZERO, PAGE_KERNEL); + if (!sdp->sd_quota_bitmap) + return error; + + for (x = 0; x < blocks; x++) { + struct buffer_head *bh; + const struct gfs2_quota_change *qc; + unsigned int y; + + if (!extlen) { + int new = 0; + error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen); + if (error) + goto fail; + } + error = -EIO; + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + if (!bh) + goto fail; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) { + brelse(bh); + goto fail; + } + + qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); + for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; + y++, slot++) { + struct gfs2_quota_data *qd; + s64 qc_change = be64_to_cpu(qc->qc_change); + u32 qc_flags = be32_to_cpu(qc->qc_flags); + enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? + USRQUOTA : GRPQUOTA; + struct kqid qc_id = make_kqid(&init_user_ns, qtype, + be32_to_cpu(qc->qc_id)); + qc++; + if (!qc_change) + continue; + + hash = gfs2_qd_hash(sdp, qc_id); + qd = qd_alloc(hash, sdp, qc_id); + if (qd == NULL) { + brelse(bh); + goto fail; + } + + set_bit(QDF_CHANGE, &qd->qd_flags); + qd->qd_change = qc_change; + qd->qd_slot = slot; + qd->qd_slot_count = 1; + + spin_lock(&qd_lock); + BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + spin_unlock(&qd_lock); + + spin_lock_bucket(hash); + hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); + spin_unlock_bucket(hash); + + found++; + } + + brelse(bh); + dblock++; + extlen--; + } + + if (found) + fs_info(sdp, "found %u quota changes\n", found); + + return 0; + +fail: + gfs2_quota_cleanup(sdp); + return error; +} + +void gfs2_quota_cleanup(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_quota_list; + struct gfs2_quota_data *qd; + + spin_lock(&qd_lock); + while (!list_empty(head)) { + qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); + + list_del(&qd->qd_list); + + /* Also remove if this qd exists in the reclaim list */ + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + atomic_dec(&sdp->sd_quota_count); + spin_unlock(&qd_lock); + + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_glock_put(qd->qd_gl); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); + + spin_lock(&qd_lock); + } + spin_unlock(&qd_lock); + + gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + + kvfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; +} + +static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) +{ + if (error == 0 || error == -EROFS) + return; + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); +} + +static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, + int (*fxn)(struct super_block *sb, int type), + unsigned long t, unsigned long *timeo, + unsigned int *new_timeo) +{ + if (t >= *timeo) { + int error = fxn(sdp->sd_vfs, 0); + quotad_error(sdp, msg, error); + *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; + } else { + *timeo -= t; + } +} + +static void quotad_check_trunc_list(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + while(1) { + ip = NULL; + spin_lock(&sdp->sd_trunc_lock); + if (!list_empty(&sdp->sd_trunc_list)) { + ip = list_entry(sdp->sd_trunc_list.next, + struct gfs2_inode, i_trunc_list); + list_del_init(&ip->i_trunc_list); + } + spin_unlock(&sdp->sd_trunc_lock); + if (ip == NULL) + return; + gfs2_glock_finish_truncate(ip); + } +} + +void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) { + if (!sdp->sd_statfs_force_sync) { + sdp->sd_statfs_force_sync = 1; + wake_up(&sdp->sd_quota_wait); + } +} + + +/** + * gfs2_quotad - Write cached quota changes into the quota file + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_quotad(void *data) +{ + struct gfs2_sbd *sdp = data; + struct gfs2_tune *tune = &sdp->sd_tune; + unsigned long statfs_timeo = 0; + unsigned long quotad_timeo = 0; + unsigned long t = 0; + DEFINE_WAIT(wait); + int empty; + + while (!kthread_should_stop()) { + + /* Update the master statfs file */ + if (sdp->sd_statfs_force_sync) { + int error = gfs2_statfs_sync(sdp->sd_vfs, 0); + quotad_error(sdp, "statfs", error); + statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; + } + else + quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, + &statfs_timeo, + &tune->gt_statfs_quantum); + + /* Update quota file */ + quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, + "ad_timeo, &tune->gt_quota_quantum); + + /* Check for & recover partially truncated inodes */ + quotad_check_trunc_list(sdp); + + try_to_freeze(); + + t = min(quotad_timeo, statfs_timeo); + + prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); + spin_lock(&sdp->sd_trunc_lock); + empty = list_empty(&sdp->sd_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + if (empty && !sdp->sd_statfs_force_sync) + t -= schedule_timeout(t); + else + t = 0; + finish_wait(&sdp->sd_quota_wait, &wait); + } + + return 0; +} + +static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + memset(state, 0, sizeof(*state)); + + switch (sdp->sd_args.ar_quota) { + case GFS2_QUOTA_ON: + state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; + state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; + /*FALLTHRU*/ + case GFS2_QUOTA_ACCOUNT: + state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED | + QCI_SYSFILE; + state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED | + QCI_SYSFILE; + break; + case GFS2_QUOTA_OFF: + break; + } + if (sdp->sd_quota_inode) { + state->s_state[USRQUOTA].ino = + GFS2_I(sdp->sd_quota_inode)->i_no_addr; + state->s_state[USRQUOTA].blocks = sdp->sd_quota_inode->i_blocks; + } + state->s_state[USRQUOTA].nextents = 1; /* unsupported */ + state->s_state[GRPQUOTA] = state->s_state[USRQUOTA]; + state->s_incoredqs = list_lru_count(&gfs2_qd_lru); + return 0; +} + +static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, + struct qc_dqblk *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_quota_lvb *qlvb; + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + memset(fdq, 0, sizeof(*fdq)); + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) + return -EINVAL; + + error = qd_get(sdp, qid, &qd); + if (error) + return error; + error = do_glock(qd, FORCE, &q_gh); + if (error) + goto out; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; + fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift; + fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift; + fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift; + + gfs2_glock_dq_uninit(&q_gh); +out: + qd_put(qd); + return error; +} + +/* GFS2 only supports a subset of the XFS fields */ +#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE) + +static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, + struct qc_dqblk *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh, i_gh; + unsigned int data_blocks, ind_blocks; + unsigned int blocks = 0; + int alloc_required; + loff_t offset; + int error; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) + return -EINVAL; + + if (fdq->d_fieldmask & ~GFS2_FIELDMASK) + return -EINVAL; + + error = qd_get(sdp, qid, &qd); + if (error) + return error; + + error = gfs2_rs_alloc(ip); + if (error) + goto out_put; + + mutex_lock(&ip->i_inode.i_mutex); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); + if (error) + goto out_unlockput; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out_q; + + /* Check for existing entry, if none then alloc new blocks */ + error = update_qd(sdp, qd); + if (error) + goto out_i; + + /* If nothing has changed, this is a no-op */ + if ((fdq->d_fieldmask & QC_SPC_SOFT) && + ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= QC_SPC_SOFT; + + if ((fdq->d_fieldmask & QC_SPC_HARD) && + ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= QC_SPC_HARD; + + if ((fdq->d_fieldmask & QC_SPACE) && + ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= QC_SPACE; + + if (fdq->d_fieldmask == 0) + goto out_i; + + offset = qd2offset(qd); + alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); + if (gfs2_is_stuffed(ip)) + alloc_required = 1; + if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + blocks = 1 + data_blocks + ind_blocks; + ap.target = blocks; + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_i; + blocks += gfs2_rg_blocks(ip, blocks); + } + + /* Some quotas span block boundaries and can update two blocks, + adding an extra block to the transaction to handle such quotas */ + error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); + if (error) + goto out_release; + + /* Apply changes */ + error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + + gfs2_trans_end(sdp); +out_release: + if (alloc_required) + gfs2_inplace_release(ip); +out_i: + gfs2_glock_dq_uninit(&i_gh); +out_q: + gfs2_glock_dq_uninit(&q_gh); +out_unlockput: + mutex_unlock(&ip->i_inode.i_mutex); +out_put: + qd_put(qd); + return error; +} + +const struct quotactl_ops gfs2_quotactl_ops = { + .quota_sync = gfs2_quota_sync, + .get_state = gfs2_quota_get_state, + .get_dqblk = gfs2_get_dqblk, + .set_dqblk = gfs2_set_dqblk, +}; + +void __init gfs2_quota_hash_init(void) +{ + unsigned i; + + for(i = 0; i < GFS2_QD_HASH_SIZE; i++) + INIT_HLIST_BL_HEAD(&qd_hash_table[i]); +} diff --git a/kernel/fs/gfs2/quota.h b/kernel/fs/gfs2/quota.h new file mode 100644 index 000000000..ad04b3aca --- /dev/null +++ b/kernel/fs/gfs2/quota.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __QUOTA_DOT_H__ +#define __QUOTA_DOT_H__ + +#include + +struct gfs2_inode; +struct gfs2_sbd; + +#define NO_UID_QUOTA_CHANGE INVALID_UID +#define NO_GID_QUOTA_CHANGE INVALID_GID + +extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +extern void gfs2_quota_unhold(struct gfs2_inode *ip); + +extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +extern void gfs2_quota_unlock(struct gfs2_inode *ip); + +extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + struct gfs2_alloc_parms *ap); +extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + kuid_t uid, kgid_t gid); + +extern int gfs2_quota_sync(struct super_block *sb, int type); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); + +extern int gfs2_quota_init(struct gfs2_sbd *sdp); +extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quotad(void *data); + +extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); + +static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, + struct gfs2_alloc_parms *ap) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int ret; + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (ret) + return ret; + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); + if (ret) + gfs2_quota_unlock(ip); + return ret; +} + +extern const struct quotactl_ops gfs2_quotactl_ops; +extern struct shrinker gfs2_qd_shrinker; +extern struct list_lru gfs2_qd_lru; +extern void __init gfs2_quota_hash_init(void); + +#endif /* __QUOTA_DOT_H__ */ diff --git a/kernel/fs/gfs2/recovery.c b/kernel/fs/gfs2/recovery.c new file mode 100644 index 000000000..1b645773c --- /dev/null +++ b/kernel/fs/gfs2/recovery.c @@ -0,0 +1,611 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "super.h" +#include "util.h" +#include "dir.h" + +struct workqueue_struct *gfs_recovery_wq; + +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + int new = 0; + u64 dblock; + u32 extlen; + int error; + + error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen); + if (error) + return error; + if (!dblock) { + gfs2_consist_inode(ip); + return -EIO; + } + + *bh = gfs2_meta_ra(gl, dblock, extlen); + + return error; +} + +int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) +{ + struct list_head *head = &jd->jd_revoke_list; + struct gfs2_revoke_replay *rr; + int found = 0; + + list_for_each_entry(rr, head, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (found) { + rr->rr_where = where; + return 0; + } + + rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); + if (!rr) + return -ENOMEM; + + rr->rr_blkno = blkno; + rr->rr_where = where; + list_add(&rr->rr_list, head); + + return 1; +} + +int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) +{ + struct gfs2_revoke_replay *rr; + int wrap, a, b, revoke; + int found = 0; + + list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (!found) + return 0; + + wrap = (rr->rr_where < jd->jd_replay_tail); + a = (jd->jd_replay_tail < where); + b = (where < rr->rr_where); + revoke = (wrap) ? (a || b) : (a && b); + + return revoke; +} + +void gfs2_revoke_clean(struct gfs2_jdesc *jd) +{ + struct list_head *head = &jd->jd_revoke_list; + struct gfs2_revoke_replay *rr; + + while (!list_empty(head)) { + rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list); + list_del(&rr->rr_list); + kfree(rr); + } +} + +static int gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf) +{ + const struct gfs2_log_header *str = buf; + + if (str->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + str->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH)) + return 1; + + lh->lh_sequence = be64_to_cpu(str->lh_sequence); + lh->lh_flags = be32_to_cpu(str->lh_flags); + lh->lh_tail = be32_to_cpu(str->lh_tail); + lh->lh_blkno = be32_to_cpu(str->lh_blkno); + lh->lh_hash = be32_to_cpu(str->lh_hash); + return 0; +} + +/** + * get_log_header - read the log header for a given segment + * @jd: the journal + * @blk: the block to look at + * @lh: the log header to return + * + * Read the log header for a given segement in a given journal. Do a few + * sanity checks on it. + * + * Returns: 0 on success, + * 1 if the header was invalid or incomplete, + * errno on error + */ + +static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, + struct gfs2_log_header_host *head) +{ + struct buffer_head *bh; + struct gfs2_log_header_host uninitialized_var(lh); + const u32 nothing = 0; + u32 hash; + int error; + + error = gfs2_replay_read_block(jd, blk, &bh); + if (error) + return error; + + hash = crc32_le((u32)~0, bh->b_data, sizeof(struct gfs2_log_header) - + sizeof(u32)); + hash = crc32_le(hash, (unsigned char const *)¬hing, sizeof(nothing)); + hash ^= (u32)~0; + error = gfs2_log_header_in(&lh, bh->b_data); + brelse(bh); + + if (error || lh.lh_blkno != blk || lh.lh_hash != hash) + return 1; + + *head = lh; + + return 0; +} + +/** + * find_good_lh - find a good log header + * @jd: the journal + * @blk: the segment to start searching from + * @lh: the log header to fill in + * @forward: if true search forward in the log, else search backward + * + * Call get_log_header() to get a log header for a segment, but if the + * segment is bad, either scan forward or backward until we find a good one. + * + * Returns: errno + */ + +static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk, + struct gfs2_log_header_host *head) +{ + unsigned int orig_blk = *blk; + int error; + + for (;;) { + error = get_log_header(jd, *blk, head); + if (error <= 0) + return error; + + if (++*blk == jd->jd_blocks) + *blk = 0; + + if (*blk == orig_blk) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + } +} + +/** + * jhead_scan - make sure we've found the head of the log + * @jd: the journal + * @head: this is filled in with the log descriptor of the head + * + * At this point, seg and lh should be either the head of the log or just + * before. Scan forward until we find the head. + * + * Returns: errno + */ + +static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + unsigned int blk = head->lh_blkno; + struct gfs2_log_header_host lh; + int error; + + for (;;) { + if (++blk == jd->jd_blocks) + blk = 0; + + error = get_log_header(jd, blk, &lh); + if (error < 0) + return error; + if (error == 1) + continue; + + if (lh.lh_sequence == head->lh_sequence) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + if (lh.lh_sequence < head->lh_sequence) + break; + + *head = lh; + } + + return 0; +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: the journal + * @head: the log descriptor for the head of the log is returned here + * + * Do a binary search of a journal and find the valid log entry with the + * highest sequence number. (i.e. the log head) + * + * Returns: errno + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_log_header_host lh_1, lh_m; + u32 blk_1, blk_2, blk_m; + int error; + + blk_1 = 0; + blk_2 = jd->jd_blocks - 1; + + for (;;) { + blk_m = (blk_1 + blk_2) / 2; + + error = find_good_lh(jd, &blk_1, &lh_1); + if (error) + return error; + + error = find_good_lh(jd, &blk_m, &lh_m); + if (error) + return error; + + if (blk_1 == blk_m || blk_m == blk_2) + break; + + if (lh_1.lh_sequence <= lh_m.lh_sequence) + blk_1 = blk_m; + else + blk_2 = blk_m; + } + + error = jhead_scan(jd, &lh_1); + if (error) + return error; + + *head = lh_1; + + return error; +} + +/** + * foreach_descriptor - go through the active part of the log + * @jd: the journal + * @start: the first log header in the active region + * @end: the last log header (don't process the contents of this entry)) + * + * Call a given function once for every log descriptor in the active + * portion of the log. + * + * Returns: errno + */ + +static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, + unsigned int end, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + int error = 0; + u32 length; + __be64 *ptr; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + offset += sizeof(__be64) - 1; + offset &= ~(sizeof(__be64) - 1); + + while (start != end) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + if (gfs2_meta_check(sdp, bh)) { + brelse(bh); + return -EIO; + } + ld = (struct gfs2_log_descriptor *)bh->b_data; + length = be32_to_cpu(ld->ld_length); + + if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) { + struct gfs2_log_header_host lh; + error = get_log_header(jd, start, &lh); + if (!error) { + gfs2_replay_incr_blk(sdp, &start); + brelse(bh); + continue; + } + if (error == 1) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + error = -EIO; + } + brelse(bh); + return error; + } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) { + brelse(bh); + return -EIO; + } + ptr = (__be64 *)(bh->b_data + offset); + error = lops_scan_elements(jd, start, ld, ptr, pass); + if (error) { + brelse(bh); + return error; + } + + while (length--) + gfs2_replay_incr_blk(sdp, &start); + + brelse(bh); + } + + return 0; +} + +/** + * clean_journal - mark a dirty journal as being clean + * @sdp: the filesystem + * @jd: the journal + * @gl: the journal's glock + * @head: the head journal to start from + * + * Returns: errno + */ + +static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int lblock; + struct gfs2_log_header *lh; + u32 hash; + struct buffer_head *bh; + int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + + lblock = head->lh_blkno; + gfs2_replay_incr_blk(sdp, &lblock); + bh_map.b_size = 1 << ip->i_inode.i_blkbits; + error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0); + if (error) + return error; + if (!bh_map.b_blocknr) { + gfs2_consist_inode(ip); + return -EIO; + } + + bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); + lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); + lh->lh_blkno = cpu_to_be32(lblock); + hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + set_buffer_dirty(bh); + if (sync_dirty_buffer(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + return error; +} + + +static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) +{ + char env_jid[20]; + char env_status[20]; + char *envp[] = { env_jid, env_status, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + ls->ls_recover_jid_done = jid; + ls->ls_recover_jid_status = message; + sprintf(env_jid, "JID=%u", jid); + sprintf(env_status, "RECOVERY=%s", + message == LM_RD_SUCCESS ? "Done" : "Failed"); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); + + if (sdp->sd_lockstruct.ls_ops->lm_recovery_result) + sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); +} + +void gfs2_recover_func(struct work_struct *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host head; + struct gfs2_holder j_gh, ji_gh, thaw_gh; + unsigned long t; + int ro = 0; + unsigned int pass; + int error; + int jlocked = 0; + + if (sdp->sd_args.ar_spectator || + (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { + fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", + jd->jd_jid); + jlocked = 1; + /* Acquire the journal lock so we can do recovery */ + + error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE, + &j_gh); + switch (error) { + case 0: + break; + + case GLR_TRYFAILED: + fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); + error = 0; + + default: + goto fail; + }; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); + if (error) + goto fail_gunlock_j; + } else { + fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); + } + + fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); + + error = gfs2_jdesc_check(jd); + if (error) + goto fail_gunlock_ji; + + error = gfs2_find_jhead(jd, &head); + if (error) + goto fail_gunlock_ji; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", + jd->jd_jid); + + t = jiffies; + + /* Acquire a shared hold on the freeze lock */ + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | LM_FLAG_PRIORITY, + &thaw_gh); + if (error) + goto fail_gunlock_ji; + + if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { + ro = 1; + } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + ro = 1; + } else { + if (sdp->sd_vfs->s_flags & MS_RDONLY) { + /* check if device itself is read-only */ + ro = bdev_read_only(sdp->sd_vfs->s_bdev); + if (!ro) { + fs_info(sdp, "recovery required on " + "read-only filesystem.\n"); + fs_info(sdp, "write access will be " + "enabled during recovery.\n"); + } + } + } + + if (ro) { + fs_warn(sdp, "jid=%u: Can't replay: read-only block " + "device\n", jd->jd_jid); + error = -EROFS; + goto fail_gunlock_thaw; + } + + fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); + + for (pass = 0; pass < 2; pass++) { + lops_before_scan(jd, &head, pass); + error = foreach_descriptor(jd, head.lh_tail, + head.lh_blkno, pass); + lops_after_scan(jd, error, pass); + if (error) + goto fail_gunlock_thaw; + } + + error = clean_journal(jd, &head); + if (error) + goto fail_gunlock_thaw; + + gfs2_glock_dq_uninit(&thaw_gh); + t = DIV_ROUND_UP(jiffies - t, HZ); + fs_info(sdp, "jid=%u: Journal replayed in %lus\n", + jd->jd_jid, t); + } + + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); + + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); + goto done; + +fail_gunlock_thaw: + gfs2_glock_dq_uninit(&thaw_gh); +fail_gunlock_ji: + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); +fail_gunlock_j: + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); +fail: + jd->jd_recover_error = error; + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); +done: + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_atomic(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); +} + +int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) +{ + int rv; + + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + + /* we have JDF_RECOVERY, queue should always succeed */ + rv = queue_work(gfs_recovery_wq, &jd->jd_work); + BUG_ON(!rv); + + if (wait) + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + TASK_UNINTERRUPTIBLE); + + return wait ? jd->jd_recover_error : 0; +} + diff --git a/kernel/fs/gfs2/recovery.h b/kernel/fs/gfs2/recovery.h new file mode 100644 index 000000000..6142836cc --- /dev/null +++ b/kernel/fs/gfs2/recovery.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __RECOVERY_DOT_H__ +#define __RECOVERY_DOT_H__ + +#include "incore.h" + +extern struct workqueue_struct *gfs_recovery_wq; + +static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) +{ + if (++*blk == sdp->sd_jdesc->jd_blocks) + *blk = 0; +} + +extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh); + +extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); + +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head); +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +extern void gfs2_recover_func(struct work_struct *work); + +#endif /* __RECOVERY_DOT_H__ */ + diff --git a/kernel/fs/gfs2/rgrp.c b/kernel/fs/gfs2/rgrp.c new file mode 100644 index 000000000..6af2396a3 --- /dev/null +++ b/kernel/fs/gfs2/rgrp.c @@ -0,0 +1,2623 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" +#include "log.h" +#include "inode.h" +#include "trace_gfs2.h" + +#define BFITNOENT ((u32)~0) +#define NO_BLOCK ((u64)~0) + +#if BITS_PER_LONG == 32 +#define LBITMASK (0x55555555UL) +#define LBITSKIP55 (0x55555555UL) +#define LBITSKIP00 (0x00000000UL) +#else +#define LBITMASK (0x5555555555555555UL) +#define LBITSKIP55 (0x5555555555555555UL) +#define LBITSKIP00 (0x0000000000000000UL) +#endif + +/* + * These routines are used by the resource group routines (rgrp.c) + * to keep track of block allocation. Each block is represented by two + * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks. + * + * 0 = Free + * 1 = Used (not metadata) + * 2 = Unlinked (still in use) inode + * 3 = Used (metadata) + */ + +struct gfs2_extent { + struct gfs2_rbm rbm; + u32 len; +}; + +static const char valid_change[16] = { + /* current */ + /* n */ 0, 1, 1, 1, + /* e */ 1, 0, 0, 0, + /* w */ 0, 0, 0, 1, + 1, 0, 0, 0 +}; + +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap); + + +/** + * gfs2_setbit - Set a bit in the bitmaps + * @rbm: The position of the bit to set + * @do_clone: Also set the clone bitmap, if it exists + * @new_state: the new state of the block + * + */ + +static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, + unsigned char new_state) +{ + unsigned char *byte1, *byte2, *end, cur_state; + struct gfs2_bitmap *bi = rbm_bi(rbm); + unsigned int buflen = bi->bi_len; + const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; + + byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY); + end = bi->bi_bh->b_data + bi->bi_offset + buflen; + + BUG_ON(byte1 >= end); + + cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; + + if (unlikely(!valid_change[new_state * 4 + cur_state])) { + pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n", + rbm->offset, cur_state, new_state); + pr_warn("rgrp=0x%llx bi_start=0x%x\n", + (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); + pr_warn("bi_offset=0x%x bi_len=0x%x\n", + bi->bi_offset, bi->bi_len); + dump_stack(); + gfs2_consist_rgrpd(rbm->rgd); + return; + } + *byte1 ^= (cur_state ^ new_state) << bit; + + if (do_clone && bi->bi_clone) { + byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY); + cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; + *byte2 ^= (cur_state ^ new_state) << bit; + } +} + +/** + * gfs2_testbit - test a bit in the bitmaps + * @rbm: The bit to test + * + * Returns: The two bit block state of the requested bit + */ + +static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm) +{ + struct gfs2_bitmap *bi = rbm_bi(rbm); + const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset; + const u8 *byte; + unsigned int bit; + + byte = buffer + (rbm->offset / GFS2_NBBY); + bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; + + return (*byte >> bit) & GFS2_BIT_MASK; +} + +/** + * gfs2_bit_search + * @ptr: Pointer to bitmap data + * @mask: Mask to use (normally 0x55555.... but adjusted for search start) + * @state: The state we are searching for + * + * We xor the bitmap data with a patter which is the bitwise opposite + * of what we are looking for, this gives rise to a pattern of ones + * wherever there is a match. Since we have two bits per entry, we + * take this pattern, shift it down by one place and then and it with + * the original. All the even bit positions (0,2,4, etc) then represent + * successful matches, so we mask with 0x55555..... to remove the unwanted + * odd bit positions. + * + * This allows searching of a whole u64 at once (32 blocks) with a + * single test (on 64 bit arches). + */ + +static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) +{ + u64 tmp; + static const u64 search[] = { + [0] = 0xffffffffffffffffULL, + [1] = 0xaaaaaaaaaaaaaaaaULL, + [2] = 0x5555555555555555ULL, + [3] = 0x0000000000000000ULL, + }; + tmp = le64_to_cpu(*ptr) ^ search[state]; + tmp &= (tmp >> 1); + tmp &= mask; + return tmp; +} + +/** + * rs_cmp - multi-block reservation range compare + * @blk: absolute file system block number of the new reservation + * @len: number of blocks in the new reservation + * @rs: existing reservation to compare against + * + * returns: 1 if the block range is beyond the reach of the reservation + * -1 if the block range is before the start of the reservation + * 0 if the block range overlaps with the reservation + */ +static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs) +{ + u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm); + + if (blk >= startblk + rs->rs_free) + return 1; + if (blk + len - 1 < startblk) + return -1; + return 0; +} + +/** + * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing + * a block in a given allocation state. + * @buf: the buffer that holds the bitmaps + * @len: the length (in bytes) of the buffer + * @goal: start search at this block's bit-pair (within @buffer) + * @state: GFS2_BLKST_XXX the state of the block we're looking for. + * + * Scope of @goal and returned block number is only within this bitmap buffer, + * not entire rgrp or filesystem. @buffer will be offset from the actual + * beginning of a bitmap block buffer, skipping any header structures, but + * headers are always a multiple of 64 bits long so that the buffer is + * always aligned to a 64 bit boundary. + * + * The size of the buffer is in bytes, but is it assumed that it is + * always ok to read a complete multiple of 64 bits at the end + * of the block in case the end is no aligned to a natural boundary. + * + * Return: the block number (bitmap buffer scope) that was found + */ + +static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, + u32 goal, u8 state) +{ + u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1); + const __le64 *ptr = ((__le64 *)buf) + (goal >> 5); + const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64))); + u64 tmp; + u64 mask = 0x5555555555555555ULL; + u32 bit; + + /* Mask off bits we don't care about at the start of the search */ + mask <<= spoint; + tmp = gfs2_bit_search(ptr, mask, state); + ptr++; + while(tmp == 0 && ptr < end) { + tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state); + ptr++; + } + /* Mask off any bits which are more than len bytes from the start */ + if (ptr == end && (len & (sizeof(u64) - 1))) + tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1)))); + /* Didn't find anything, so return */ + if (tmp == 0) + return BFITNOENT; + ptr--; + bit = __ffs64(tmp); + bit /= 2; /* two bits per entry in the bitmap */ + return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; +} + +/** + * gfs2_rbm_from_block - Set the rbm based upon rgd and block number + * @rbm: The rbm with rgd already set correctly + * @block: The block number (filesystem relative) + * + * This sets the bi and offset members of an rbm based on a + * resource group and a filesystem relative block number. The + * resource group must be set in the rbm on entry, the bi and + * offset members will be set by this function. + * + * Returns: 0 on success, or an error code + */ + +static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) +{ + u64 rblock = block - rbm->rgd->rd_data0; + + if (WARN_ON_ONCE(rblock > UINT_MAX)) + return -EINVAL; + if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) + return -E2BIG; + + rbm->bii = 0; + rbm->offset = (u32)(rblock); + /* Check if the block is within the first block */ + if (rbm->offset < rbm_bi(rbm)->bi_blocks) + return 0; + + /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */ + rbm->offset += (sizeof(struct gfs2_rgrp) - + sizeof(struct gfs2_meta_header)) * GFS2_NBBY; + rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + return 0; +} + +/** + * gfs2_rbm_incr - increment an rbm structure + * @rbm: The rbm with rgd already set correctly + * + * This function takes an existing rbm structure and increments it to the next + * viable block offset. + * + * Returns: If incrementing the offset would cause the rbm to go past the + * end of the rgrp, true is returned, otherwise false. + * + */ + +static bool gfs2_rbm_incr(struct gfs2_rbm *rbm) +{ + if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */ + rbm->offset++; + return false; + } + if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */ + return true; + + rbm->offset = 0; + rbm->bii++; + return false; +} + +/** + * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned + * @rbm: Position to search (value/result) + * @n_unaligned: Number of unaligned blocks to check + * @len: Decremented for each block found (terminate on zero) + * + * Returns: true if a non-free block is encountered + */ + +static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len) +{ + u32 n; + u8 res; + + for (n = 0; n < n_unaligned; n++) { + res = gfs2_testbit(rbm); + if (res != GFS2_BLKST_FREE) + return true; + (*len)--; + if (*len == 0) + return true; + if (gfs2_rbm_incr(rbm)) + return true; + } + + return false; +} + +/** + * gfs2_free_extlen - Return extent length of free blocks + * @rrbm: Starting position + * @len: Max length to check + * + * Starting at the block specified by the rbm, see how many free blocks + * there are, not reading more than len blocks ahead. This can be done + * using memchr_inv when the blocks are byte aligned, but has to be done + * on a block by block basis in case of unaligned blocks. Also this + * function can cope with bitmap boundaries (although it must stop on + * a resource group boundary) + * + * Returns: Number of free blocks in the extent + */ + +static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) +{ + struct gfs2_rbm rbm = *rrbm; + u32 n_unaligned = rbm.offset & 3; + u32 size = len; + u32 bytes; + u32 chunk_size; + u8 *ptr, *start, *end; + u64 block; + struct gfs2_bitmap *bi; + + if (n_unaligned && + gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len)) + goto out; + + n_unaligned = len & 3; + /* Start is now byte aligned */ + while (len > 3) { + bi = rbm_bi(&rbm); + start = bi->bi_bh->b_data; + if (bi->bi_clone) + start = bi->bi_clone; + end = start + bi->bi_bh->b_size; + start += bi->bi_offset; + BUG_ON(rbm.offset & 3); + start += (rbm.offset / GFS2_NBBY); + bytes = min_t(u32, len / GFS2_NBBY, (end - start)); + ptr = memchr_inv(start, 0, bytes); + chunk_size = ((ptr == NULL) ? bytes : (ptr - start)); + chunk_size *= GFS2_NBBY; + BUG_ON(len < chunk_size); + len -= chunk_size; + block = gfs2_rbm_to_block(&rbm); + if (gfs2_rbm_from_block(&rbm, block + chunk_size)) { + n_unaligned = 0; + break; + } + if (ptr) { + n_unaligned = 3; + break; + } + n_unaligned = len & 3; + } + + /* Deal with any bits left over at the end */ + if (n_unaligned) + gfs2_unaligned_extlen(&rbm, n_unaligned, &len); +out: + return size - len; +} + +/** + * gfs2_bitcount - count the number of bits in a certain state + * @rgd: the resource group descriptor + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @state: the state of the block we're looking for + * + * Returns: The number of bits + */ + +static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer, + unsigned int buflen, u8 state) +{ + const u8 *byte = buffer; + const u8 *end = buffer + buflen; + const u8 state1 = state << 2; + const u8 state2 = state << 4; + const u8 state3 = state << 6; + u32 count = 0; + + for (; byte < end; byte++) { + if (((*byte) & 0x03) == state) + count++; + if (((*byte) & 0x0C) == state1) + count++; + if (((*byte) & 0x30) == state2) + count++; + if (((*byte) & 0xC0) == state3) + count++; + } + + return count; +} + +/** + * gfs2_rgrp_verify - Verify that a resource group is consistent + * @rgd: the rgrp + * + */ + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi = NULL; + u32 length = rgd->rd_length; + u32 count[4], tmp; + int buf, x; + + memset(count, 0, 4 * sizeof(u32)); + + /* Count # blocks in each of 4 possible allocation states */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + for (x = 0; x < 4; x++) + count[x] += gfs2_bitcount(rgd, + bi->bi_bh->b_data + + bi->bi_offset, + bi->bi_len, x); + } + + if (count[0] != rgd->rd_free) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "free data mismatch: %u != %u\n", + count[0], rgd->rd_free); + return; + } + + tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; + if (count[1] != tmp) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used data mismatch: %u != %u\n", + count[1], tmp); + return; + } + + if (count[2] + count[3] != rgd->rd_dinodes) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used metadata mismatch: %u != %u\n", + count[2] + count[3], rgd->rd_dinodes); + return; + } +} + +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) +{ + u64 first = rgd->rd_data0; + u64 last = first + rgd->rd_data; + return first <= block && block < last; +} + +/** + * gfs2_blk2rgrpd - Find resource group for a given data/meta block number + * @sdp: The GFS2 superblock + * @blk: The data block number + * @exact: True if this needs to be an exact match + * + * Returns: The resource group, or NULL if not found + */ + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact) +{ + struct rb_node *n, *next; + struct gfs2_rgrpd *cur; + + spin_lock(&sdp->sd_rindex_spin); + n = sdp->sd_rindex_tree.rb_node; + while (n) { + cur = rb_entry(n, struct gfs2_rgrpd, rd_node); + next = NULL; + if (blk < cur->rd_addr) + next = n->rb_left; + else if (blk >= cur->rd_data0 + cur->rd_data) + next = n->rb_right; + if (next == NULL) { + spin_unlock(&sdp->sd_rindex_spin); + if (exact) { + if (blk < cur->rd_addr) + return NULL; + if (blk >= cur->rd_data0 + cur->rd_data) + return NULL; + } + return cur; + } + n = next; + } + spin_unlock(&sdp->sd_rindex_spin); + + return NULL; +} + +/** + * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem + * @sdp: The GFS2 superblock + * + * Returns: The first rgrp in the filesystem + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) +{ + const struct rb_node *n; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_first(&sdp->sd_rindex_tree); + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; +} + +/** + * gfs2_rgrpd_get_next - get the next RG + * @rgd: the resource group descriptor + * + * Returns: The next rgrp + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + const struct rb_node *n; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_next(&rgd->rd_node); + if (n == NULL) + n = rb_first(&sdp->sd_rindex_tree); + + if (unlikely(&rgd->rd_node == n)) { + spin_unlock(&sdp->sd_rindex_spin); + return NULL; + } + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +void check_and_update_goal(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL) + ip->i_goal = ip->i_no_addr; +} + +void gfs2_free_clones(struct gfs2_rgrpd *rgd) +{ + int x; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } +} + +/** + * gfs2_rs_alloc - make sure we have a reservation assigned to the inode + * @ip: the inode for this reservation + */ +int gfs2_rs_alloc(struct gfs2_inode *ip) +{ + int error = 0; + + down_write(&ip->i_rw_mutex); + if (ip->i_res) + goto out; + + ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); + if (!ip->i_res) { + error = -ENOMEM; + goto out; + } + + RB_CLEAR_NODE(&ip->i_res->rs_node); +out: + up_write(&ip->i_rw_mutex); + return error; +} + +static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) +{ + gfs2_print_dbg(seq, " B: n:%llu s:%llu b:%u f:%u\n", + (unsigned long long)rs->rs_inum, + (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm), + rs->rs_rbm.offset, rs->rs_free); +} + +/** + * __rs_deltree - remove a multi-block reservation from the rgd tree + * @rs: The reservation to remove + * + */ +static void __rs_deltree(struct gfs2_blkreserv *rs) +{ + struct gfs2_rgrpd *rgd; + + if (!gfs2_rs_active(rs)) + return; + + rgd = rs->rs_rbm.rgd; + trace_gfs2_rs(rs, TRACE_RS_TREEDEL); + rb_erase(&rs->rs_node, &rgd->rd_rstree); + RB_CLEAR_NODE(&rs->rs_node); + + if (rs->rs_free) { + struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm); + + /* return reserved blocks to the rgrp */ + BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); + rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; + /* The rgrp extent failure point is likely not to increase; + it will only do so if the freed blocks are somehow + contiguous with a span of free blocks that follows. Still, + it will force the number to be recalculated later. */ + rgd->rd_extfail_pt += rs->rs_free; + rs->rs_free = 0; + clear_bit(GBF_FULL, &bi->bi_flags); + } +} + +/** + * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree + * @rs: The reservation to remove + * + */ +void gfs2_rs_deltree(struct gfs2_blkreserv *rs) +{ + struct gfs2_rgrpd *rgd; + + rgd = rs->rs_rbm.rgd; + if (rgd) { + spin_lock(&rgd->rd_rsspin); + __rs_deltree(rs); + spin_unlock(&rgd->rd_rsspin); + } +} + +/** + * gfs2_rs_delete - delete a multi-block reservation + * @ip: The inode for this reservation + * @wcount: The inode's write count, or NULL + * + */ +void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +{ + down_write(&ip->i_rw_mutex); + if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { + gfs2_rs_deltree(ip->i_res); + BUG_ON(ip->i_res->rs_free); + kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); + ip->i_res = NULL; + } + up_write(&ip->i_rw_mutex); +} + +/** + * return_all_reservations - return all reserved blocks back to the rgrp. + * @rgd: the rgrp that needs its space back + * + * We previously reserved a bunch of blocks for allocation. Now we need to + * give them back. This leave the reservation structures in tact, but removes + * all of their corresponding "no-fly zones". + */ +static void return_all_reservations(struct gfs2_rgrpd *rgd) +{ + struct rb_node *n; + struct gfs2_blkreserv *rs; + + spin_lock(&rgd->rd_rsspin); + while ((n = rb_first(&rgd->rd_rstree))) { + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + __rs_deltree(rs); + } + spin_unlock(&rgd->rd_rsspin); +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) +{ + struct rb_node *n; + struct gfs2_rgrpd *rgd; + struct gfs2_glock *gl; + + while ((n = rb_first(&sdp->sd_rindex_tree))) { + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + gl = rgd->rd_gl; + + rb_erase(n, &sdp->sd_rindex_tree); + + if (gl) { + spin_lock(&gl->gl_spin); + gl->gl_object = NULL; + spin_unlock(&gl->gl_spin); + gfs2_glock_add_to_lru(gl); + gfs2_glock_put(gl); + } + + gfs2_free_clones(rgd); + kfree(rgd->rd_bits); + return_all_reservations(rgd); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); + } +} + +static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) +{ + pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); + pr_info("ri_length = %u\n", rgd->rd_length); + pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); + pr_info("ri_data = %u\n", rgd->rd_data); + pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes); +} + +/** + * gfs2_compute_bitstructs - Compute the bitmap sizes + * @rgd: The resource group descriptor + * + * Calculates bitmap descriptors, one for each block that contains bitmap data + * + * Returns: errno + */ + +static int compute_bitstructs(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi; + u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */ + u32 bytes_left, bytes; + int x; + + if (!length) + return -EINVAL; + + rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS); + if (!rgd->rd_bits) + return -ENOMEM; + + bytes_left = rgd->rd_bitbytes; + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + + bi->bi_flags = 0; + /* small rgrp; bitmap stored completely in header block */ + if (length == 1) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; + /* header block */ + } else if (x == 0) { + bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; + /* last block */ + } else if (x + 1 == length) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_bitbytes - bytes_left; + bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; + /* other blocks */ + } else { + bytes = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header); + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_bitbytes - bytes_left; + bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; + } + + bytes_left -= bytes; + } + + if (bytes_left) { + gfs2_consist_rgrpd(rgd); + return -EIO; + } + bi = rgd->rd_bits + (length - 1); + if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) { + if (gfs2_consist_rgrpd(rgd)) { + gfs2_rindex_print(rgd); + fs_err(sdp, "start=%u len=%u offset=%u\n", + bi->bi_start, bi->bi_len, bi->bi_offset); + } + return -EIO; + } + + return 0; +} + +/** + * gfs2_ri_total - Total up the file system space, according to the rindex. + * @sdp: the filesystem + * + */ +u64 gfs2_ri_total(struct gfs2_sbd *sdp) +{ + u64 total_data = 0; + struct inode *inode = sdp->sd_rindex; + struct gfs2_inode *ip = GFS2_I(inode); + char buf[sizeof(struct gfs2_rindex)]; + int error, rgrps; + + for (rgrps = 0;; rgrps++) { + loff_t pos = rgrps * sizeof(struct gfs2_rindex); + + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) + break; + error = gfs2_internal_read(ip, buf, &pos, + sizeof(struct gfs2_rindex)); + if (error != sizeof(struct gfs2_rindex)) + break; + total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data); + } + return total_data; +} + +static int rgd_insert(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (rgd->rd_addr < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (rgd->rd_addr > cur->rd_addr) + newn = &((*newn)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&rgd->rd_node, parent, newn); + rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); + sdp->sd_rgrps++; + return 0; +} + +/** + * read_rindex_entry - Pull in a new resource index entry from the disk + * @ip: Pointer to the rindex inode + * + * Returns: 0 on success, > 0 on EOF, error code otherwise + */ + +static int read_rindex_entry(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); + struct gfs2_rindex buf; + int error; + struct gfs2_rgrpd *rgd; + + if (pos >= i_size_read(&ip->i_inode)) + return 1; + + error = gfs2_internal_read(ip, (char *)&buf, &pos, + sizeof(struct gfs2_rindex)); + + if (error != sizeof(struct gfs2_rindex)) + return (error == 0) ? 1 : error; + + rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); + error = -ENOMEM; + if (!rgd) + return error; + + rgd->rd_sbd = sdp; + rgd->rd_addr = be64_to_cpu(buf.ri_addr); + rgd->rd_length = be32_to_cpu(buf.ri_length); + rgd->rd_data0 = be64_to_cpu(buf.ri_data0); + rgd->rd_data = be32_to_cpu(buf.ri_data); + rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); + spin_lock_init(&rgd->rd_rsspin); + + error = compute_bitstructs(rgd); + if (error) + goto fail; + + error = gfs2_glock_get(sdp, rgd->rd_addr, + &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); + if (error) + goto fail; + + rgd->rd_gl->gl_object = rgd; + rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; + rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; + rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); + if (rgd->rd_data > sdp->sd_max_rg_data) + sdp->sd_max_rg_data = rgd->rd_data; + spin_lock(&sdp->sd_rindex_spin); + error = rgd_insert(rgd); + spin_unlock(&sdp->sd_rindex_spin); + if (!error) + return 0; + + error = 0; /* someone else read in the rgrp; free it and ignore it */ + gfs2_glock_put(rgd->rd_gl); + +fail: + kfree(rgd->rd_bits); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); + return error; +} + +/** + * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use + * @sdp: the GFS2 superblock + * + * The purpose of this function is to select a subset of the resource groups + * and mark them as PREFERRED. We do it in such a way that each node prefers + * to use a unique set of rgrps to minimize glock contention. + */ +static void set_rgrp_preferences(struct gfs2_sbd *sdp) +{ + struct gfs2_rgrpd *rgd, *first; + int i; + + /* Skip an initial number of rgrps, based on this node's journal ID. + That should start each node out on its own set. */ + rgd = gfs2_rgrpd_get_first(sdp); + for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++) + rgd = gfs2_rgrpd_get_next(rgd); + first = rgd; + + do { + rgd->rd_flags |= GFS2_RDF_PREFERRED; + for (i = 0; i < sdp->sd_journals; i++) { + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == first) + break; + } + } while (rgd != first); +} + +/** + * gfs2_ri_update - Pull in a new resource index from the disk + * @ip: pointer to the rindex inode + * + * Returns: 0 on successful update, error code otherwise + */ + +static int gfs2_ri_update(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; + + do { + error = read_rindex_entry(ip); + } while (error == 0); + + if (error < 0) + return error; + + set_rgrp_preferences(sdp); + + sdp->sd_rindex_uptodate = 1; + return 0; +} + +/** + * gfs2_rindex_update - Update the rindex if required + * @sdp: The GFS2 superblock + * + * We grab a lock on the rindex inode to make sure that it doesn't + * change whilst we are performing an operation. We keep this lock + * for quite long periods of time compared to other locks. This + * doesn't matter, since it is shared and it is very, very rarely + * accessed in the exclusive mode (i.e. only when expanding the filesystem). + * + * This makes sure that we're using the latest copy of the resource index + * special file, which might have been updated if someone expanded the + * filesystem (via gfs2_grow utility), which adds new resource groups. + * + * Returns: 0 on succeess, error code otherwise + */ + +int gfs2_rindex_update(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); + struct gfs2_glock *gl = ip->i_gl; + struct gfs2_holder ri_gh; + int error = 0; + int unlock_required = 0; + + /* Read new copy from disk if we don't have the latest */ + if (!sdp->sd_rindex_uptodate) { + if (!gfs2_glock_is_locked_by_me(gl)) { + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); + if (error) + return error; + unlock_required = 1; + } + if (!sdp->sd_rindex_uptodate) + error = gfs2_ri_update(ip); + if (unlock_required) + gfs2_glock_dq_uninit(&ri_gh); + } + + return error; +} + +static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + u32 rg_flags; + + rg_flags = be32_to_cpu(str->rg_flags); + rg_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= rg_flags; + rgd->rd_free = be32_to_cpu(str->rg_free); + rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); + rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); +} + +static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) +{ + struct gfs2_rgrp *str = buf; + + str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); + str->rg_free = cpu_to_be32(rgd->rd_free); + str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); + str->__pad = cpu_to_be32(0); + str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration); + memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); +} + +static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd) +{ + struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; + struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data; + + if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free || + rgl->rl_dinodes != str->rg_dinodes || + rgl->rl_igeneration != str->rg_igeneration) + return 0; + return 1; +} + +static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + + rgl->rl_magic = cpu_to_be32(GFS2_MAGIC); + rgl->rl_flags = str->rg_flags; + rgl->rl_free = str->rg_free; + rgl->rl_dinodes = str->rg_dinodes; + rgl->rl_igeneration = str->rg_igeneration; + rgl->__pad = 0UL; +} + +static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change) +{ + struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; + u32 unlinked = be32_to_cpu(rgl->rl_unlinked) + change; + rgl->rl_unlinked = cpu_to_be32(unlinked); +} + +static u32 count_unlinked(struct gfs2_rgrpd *rgd) +{ + struct gfs2_bitmap *bi; + const u32 length = rgd->rd_length; + const u8 *buffer = NULL; + u32 i, goal, count = 0; + + for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) { + goal = 0; + buffer = bi->bi_bh->b_data + bi->bi_offset; + WARN_ON(!buffer_uptodate(bi->bi_bh)); + while (goal < bi->bi_len * GFS2_NBBY) { + goal = gfs2_bitfit(buffer, bi->bi_len, goal, + GFS2_BLKST_UNLINKED); + if (goal == BFITNOENT) + break; + count++; + goal++; + } + } + + return count; +} + + +/** + * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + * Read in all of a Resource Group's header and bitmap blocks. + * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. + * + * Returns: errno + */ + +static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_glock *gl = rgd->rd_gl; + unsigned int length = rgd->rd_length; + struct gfs2_bitmap *bi; + unsigned int x, y; + int error; + + if (rgd->rd_bits[0].bi_bh != NULL) + return 0; + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); + if (error) + goto fail; + } + + for (y = length; y--;) { + bi = rgd->rd_bits + y; + error = gfs2_meta_wait(sdp, bi->bi_bh); + if (error) + goto fail; + if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB : + GFS2_METATYPE_RG)) { + error = -EIO; + goto fail; + } + } + + if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { + for (x = 0; x < length; x++) + clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); + gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); + rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + rgd->rd_free_clone = rgd->rd_free; + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; + } + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { + rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, + rgd->rd_bits[0].bi_bh->b_data); + } + else if (sdp->sd_args.ar_rgrplvb) { + if (!gfs2_rgrp_lvb_valid(rgd)){ + gfs2_consist_rgrpd(rgd); + error = -EIO; + goto fail; + } + if (rgd->rd_rgl->rl_unlinked == 0) + rgd->rd_flags &= ~GFS2_RDF_CHECK; + } + return 0; + +fail: + while (x--) { + bi = rgd->rd_bits + x; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + gfs2_assert_warn(sdp, !bi->bi_clone); + } + + return error; +} + +static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) +{ + u32 rl_flags; + + if (rgd->rd_flags & GFS2_RDF_UPTODATE) + return 0; + + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) + return gfs2_rgrp_bh_get(rgd); + + rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); + rl_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + if (rgd->rd_rgl->rl_unlinked == 0) + rgd->rd_flags &= ~GFS2_RDF_CHECK; + rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free); + rgd->rd_free_clone = rgd->rd_free; + rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes); + rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration); + return 0; +} + +int gfs2_rgrp_go_lock(struct gfs2_holder *gh) +{ + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; + struct gfs2_sbd *sdp = rgd->rd_sbd; + + if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) + return 0; + return gfs2_rgrp_bh_get(rgd); +} + +/** + * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * @gh: The glock holder for the resource group + * + */ + +void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) +{ + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; + int x, length = rgd->rd_length; + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + if (bi->bi_bh) { + brelse(bi->bi_bh); + bi->bi_bh = NULL; + } + } + +} + +int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) +{ + struct super_block *sb = sdp->sd_vfs; + u64 blk; + sector_t start = 0; + sector_t nr_blks = 0; + int rv; + unsigned int x; + u32 trimmed = 0; + u8 diff; + + for (x = 0; x < bi->bi_len; x++) { + const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data; + clone += bi->bi_offset; + clone += x; + if (bh) { + const u8 *orig = bh->b_data + bi->bi_offset + x; + diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); + } else { + diff = ~(*clone | (*clone >> 1)); + } + diff &= 0x55; + if (diff == 0) + continue; + blk = offset + ((bi->bi_start + x) * GFS2_NBBY); + while(diff) { + if (diff & 1) { + if (nr_blks == 0) + goto start_new_extent; + if ((start + nr_blks) != blk) { + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, + start, nr_blks, + GFP_NOFS, 0); + if (rv) + goto fail; + trimmed += nr_blks; + } + nr_blks = 0; +start_new_extent: + start = blk; + } + nr_blks++; + } + diff >>= 2; + blk++; + } + } + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0); + if (rv) + goto fail; + trimmed += nr_blks; + } + if (ptrimmed) + *ptrimmed = trimmed; + return 0; + +fail: + if (sdp->sd_args.ar_discard) + fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); + sdp->sd_args.ar_discard = 0; + return -EIO; +} + +/** + * gfs2_fitrim - Generate discard requests for unused bits of the filesystem + * @filp: Any file on the filesystem + * @argp: Pointer to the arguments (also used to pass result) + * + * Returns: 0 on success, otherwise error code + */ + +int gfs2_fitrim(struct file *filp, void __user *argp) +{ + struct inode *inode = file_inode(filp); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); + struct buffer_head *bh; + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd *rgd_end; + struct gfs2_holder gh; + struct fstrim_range r; + int ret = 0; + u64 amt; + u64 trimmed = 0; + u64 start, end, minlen; + unsigned int x; + unsigned bs_shift = sdp->sd_sb.sb_bsize_shift; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&r, argp, sizeof(r))) + return -EFAULT; + + ret = gfs2_rindex_update(sdp); + if (ret) + return ret; + + start = r.start >> bs_shift; + end = start + (r.len >> bs_shift); + minlen = max_t(u64, r.minlen, + q->limits.discard_granularity) >> bs_shift; + + if (end <= start || minlen > sdp->sd_max_rg_data) + return -EINVAL; + + rgd = gfs2_blk2rgrpd(sdp, start, 0); + rgd_end = gfs2_blk2rgrpd(sdp, end, 0); + + if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end)) + && (start > rgd_end->rd_data0 + rgd_end->rd_data)) + return -EINVAL; /* start is beyond the end of the fs */ + + while (1) { + + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto out; + + if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) { + /* Trim each bitmap in the rgrp */ + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + ret = gfs2_rgrp_send_discards(sdp, + rgd->rd_data0, NULL, bi, minlen, + &amt); + if (ret) { + gfs2_glock_dq_uninit(&gh); + goto out; + } + trimmed += amt; + } + + /* Mark rgrp as having been trimmed */ + ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); + if (ret == 0) { + bh = rgd->rd_bits[0].bi_bh; + rgd->rd_flags |= GFS2_RGF_TRIMMED; + gfs2_trans_add_meta(rgd->rd_gl, bh); + gfs2_rgrp_out(rgd, bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); + gfs2_trans_end(sdp); + } + } + gfs2_glock_dq_uninit(&gh); + + if (rgd == rgd_end) + break; + + rgd = gfs2_rgrpd_get_next(rgd); + } + +out: + r.len = trimmed << bs_shift; + if (copy_to_user(argp, &r, sizeof(r))) + return -EFAULT; + + return ret; +} + +/** + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree + * @ip: the inode structure + * + */ +static void rs_insert(struct gfs2_inode *ip) +{ + struct rb_node **newn, *parent = NULL; + int rc; + struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd; + u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm); + + BUG_ON(gfs2_rs_active(rs)); + + spin_lock(&rgd->rd_rsspin); + newn = &rgd->rd_rstree.rb_node; + while (*newn) { + struct gfs2_blkreserv *cur = + rb_entry(*newn, struct gfs2_blkreserv, rs_node); + + parent = *newn; + rc = rs_cmp(fsblock, rs->rs_free, cur); + if (rc > 0) + newn = &((*newn)->rb_right); + else if (rc < 0) + newn = &((*newn)->rb_left); + else { + spin_unlock(&rgd->rd_rsspin); + WARN_ON(1); + return; + } + } + + rb_link_node(&rs->rs_node, parent, newn); + rb_insert_color(&rs->rs_node, &rgd->rd_rstree); + + /* Do our rgrp accounting for the reservation */ + rgd->rd_reserved += rs->rs_free; /* blocks reserved */ + spin_unlock(&rgd->rd_rsspin); + trace_gfs2_rs(rs, TRACE_RS_INSERT); +} + +/** + * rg_mblk_search - find a group of multiple free blocks to form a reservation + * @rgd: the resource group descriptor + * @ip: pointer to the inode for which we're reserving blocks + * @ap: the allocation parameters + * + */ + +static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, + const struct gfs2_alloc_parms *ap) +{ + struct gfs2_rbm rbm = { .rgd = rgd, }; + u64 goal; + struct gfs2_blkreserv *rs = ip->i_res; + u32 extlen; + u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; + int ret; + struct inode *inode = &ip->i_inode; + + if (S_ISDIR(inode->i_mode)) + extlen = 1; + else { + extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target); + extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + } + if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) + return; + + /* Find bitmap block that contains bits for goal block */ + if (rgrp_contains_block(rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rgd->rd_last_alloc + rgd->rd_data0; + + if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) + return; + + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); + if (ret == 0) { + rs->rs_rbm = rbm; + rs->rs_free = extlen; + rs->rs_inum = ip->i_no_addr; + rs_insert(ip); + } else { + if (goal == rgd->rd_last_alloc + rgd->rd_data0) + rgd->rd_last_alloc = 0; + } +} + +/** + * gfs2_next_unreserved_block - Return next block that is not reserved + * @rgd: The resource group + * @block: The starting block + * @length: The required length + * @ip: Ignore any reservations for this inode + * + * If the block does not appear in any reservation, then return the + * block number unchanged. If it does appear in the reservation, then + * keep looking through the tree of reservations in order to find the + * first block number which is not reserved. + */ + +static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, + u32 length, + const struct gfs2_inode *ip) +{ + struct gfs2_blkreserv *rs; + struct rb_node *n; + int rc; + + spin_lock(&rgd->rd_rsspin); + n = rgd->rd_rstree.rb_node; + while (n) { + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + rc = rs_cmp(block, length, rs); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else + break; + } + + if (n) { + while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) { + block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free; + n = n->rb_right; + if (n == NULL) + break; + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + } + } + + spin_unlock(&rgd->rd_rsspin); + return block; +} + +/** + * gfs2_reservation_check_and_update - Check for reservations during block alloc + * @rbm: The current position in the resource group + * @ip: The inode for which we are searching for blocks + * @minext: The minimum extent length + * @maxext: A pointer to the maximum extent structure + * + * This checks the current position in the rgrp to see whether there is + * a reservation covering this block. If not then this function is a + * no-op. If there is, then the position is moved to the end of the + * contiguous reservation(s) so that we are pointing at the first + * non-reserved block. + * + * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error + */ + +static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, + const struct gfs2_inode *ip, + u32 minext, + struct gfs2_extent *maxext) +{ + u64 block = gfs2_rbm_to_block(rbm); + u32 extlen = 1; + u64 nblock; + int ret; + + /* + * If we have a minimum extent length, then skip over any extent + * which is less than the min extent length in size. + */ + if (minext) { + extlen = gfs2_free_extlen(rbm, minext); + if (extlen <= maxext->len) + goto fail; + } + + /* + * Check the extent which has been found against the reservations + * and skip if parts of it are already reserved + */ + nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); + if (nblock == block) { + if (!minext || extlen >= minext) + return 0; + + if (extlen > maxext->len) { + maxext->len = extlen; + maxext->rbm = *rbm; + } +fail: + nblock = block + extlen; + } + ret = gfs2_rbm_from_block(rbm, nblock); + if (ret < 0) + return ret; + return 1; +} + +/** + * gfs2_rbm_find - Look for blocks of a particular state + * @rbm: Value/result starting position and final position + * @state: The state which we want to find + * @minext: Pointer to the requested extent length (NULL for a single block) + * This is updated to be the actual reservation size. + * @ip: If set, check for reservations + * @nowrap: Stop looking at the end of the rgrp, rather than wrapping + * around until we've reached the starting point. + * @ap: the allocation parameters + * + * Side effects: + * - If looking for free blocks, we set GBF_FULL on each bitmap which + * has no free blocks in it. + * - If looking for free blocks, we set rd_extfail_pt on each rgrp which + * has come up short on a free block search. + * + * Returns: 0 on success, -ENOSPC if there is no block of the requested state + */ + +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap) +{ + struct buffer_head *bh; + int initial_bii; + u32 initial_offset; + int first_bii = rbm->bii; + u32 first_offset = rbm->offset; + u32 offset; + u8 *buffer; + int n = 0; + int iters = rbm->rgd->rd_length; + int ret; + struct gfs2_bitmap *bi; + struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, }; + + /* If we are not starting at the beginning of a bitmap, then we + * need to add one to the bitmap count to ensure that we search + * the starting bitmap twice. + */ + if (rbm->offset != 0) + iters++; + + while(1) { + bi = rbm_bi(rbm); + if (test_bit(GBF_FULL, &bi->bi_flags) && + (state == GFS2_BLKST_FREE)) + goto next_bitmap; + + bh = bi->bi_bh; + buffer = bh->b_data + bi->bi_offset; + WARN_ON(!buffer_uptodate(bh)); + if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) + buffer = bi->bi_clone + bi->bi_offset; + initial_offset = rbm->offset; + offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state); + if (offset == BFITNOENT) + goto bitmap_full; + rbm->offset = offset; + if (ip == NULL) + return 0; + + initial_bii = rbm->bii; + ret = gfs2_reservation_check_and_update(rbm, ip, + minext ? *minext : 0, + &maxext); + if (ret == 0) + return 0; + if (ret > 0) { + n += (rbm->bii - initial_bii); + goto next_iter; + } + if (ret == -E2BIG) { + rbm->bii = 0; + rbm->offset = 0; + n += (rbm->bii - initial_bii); + goto res_covered_end_of_rgrp; + } + return ret; + +bitmap_full: /* Mark bitmap as full and fall through */ + if ((state == GFS2_BLKST_FREE) && initial_offset == 0) { + struct gfs2_bitmap *bi = rbm_bi(rbm); + set_bit(GBF_FULL, &bi->bi_flags); + } + +next_bitmap: /* Find next bitmap in the rgrp */ + rbm->offset = 0; + rbm->bii++; + if (rbm->bii == rbm->rgd->rd_length) + rbm->bii = 0; +res_covered_end_of_rgrp: + if ((rbm->bii == 0) && nowrap) + break; + n++; +next_iter: + if (n >= iters) + break; + } + + if (minext == NULL || state != GFS2_BLKST_FREE) + return -ENOSPC; + + /* If the extent was too small, and it's smaller than the smallest + to have failed before, remember for future reference that it's + useless to search this rgrp again for this amount or more. */ + if ((first_offset == 0) && (first_bii == 0) && + (*minext < rbm->rgd->rd_extfail_pt)) + rbm->rgd->rd_extfail_pt = *minext; + + /* If the maximum extent we found is big enough to fulfill the + minimum requirements, use it anyway. */ + if (maxext.len) { + *rbm = maxext.rbm; + *minext = maxext.len; + return 0; + } + + return -ENOSPC; +} + +/** + * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes + * @rgd: The rgrp + * @last_unlinked: block address of the last dinode we unlinked + * @skip: block address we should explicitly not unlink + * + * Returns: 0 if no error + * The inode, if one has been found, in inode. + */ + +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) +{ + u64 block; + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_glock *gl; + struct gfs2_inode *ip; + int error; + int found = 0; + struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 }; + + while (1) { + down_write(&sdp->sd_log_flush_lock); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, + true, NULL); + up_write(&sdp->sd_log_flush_lock); + if (error == -ENOSPC) + break; + if (WARN_ON_ONCE(error)) + break; + + block = gfs2_rbm_to_block(&rbm); + if (gfs2_rbm_from_block(&rbm, block + 1)) + break; + if (*last_unlinked != NO_BLOCK && block <= *last_unlinked) + continue; + if (block == skip) + continue; + *last_unlinked = block; + + error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl); + if (error) + continue; + + /* If the inode is already in cache, we can ignore it here + * because the existing inode disposal code will deal with + * it when all refs have gone away. Accessing gl_object like + * this is not safe in general. Here it is ok because we do + * not dereference the pointer, and we only need an approx + * answer to whether it is NULL or not. + */ + ip = gl->gl_object; + + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put(gl); + else + found++; + + /* Limit reclaim to sensible number of tasks */ + if (found > NR_CPUS) + return; + } + + rgd->rd_flags &= ~GFS2_RDF_CHECK; + return; +} + +/** + * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested + * @rgd: The rgrp in question + * @loops: An indication of how picky we can be (0=very, 1=less so) + * + * This function uses the recently added glock statistics in order to + * figure out whether a parciular resource group is suffering from + * contention from multiple nodes. This is done purely on the basis + * of timings, since this is the only data we have to work with and + * our aim here is to reject a resource group which is highly contended + * but (very important) not to do this too often in order to ensure that + * we do not land up introducing fragmentation by changing resource + * groups when not actually required. + * + * The calculation is fairly simple, we want to know whether the SRTTB + * (i.e. smoothed round trip time for blocking operations) to acquire + * the lock for this rgrp's glock is significantly greater than the + * time taken for resource groups on average. We introduce a margin in + * the form of the variable @var which is computed as the sum of the two + * respective variences, and multiplied by a factor depending on @loops + * and whether we have a lot of data to base the decision on. This is + * then tested against the square difference of the means in order to + * decide whether the result is statistically significant or not. + * + * Returns: A boolean verdict on the congestion status + */ + +static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) +{ + const struct gfs2_glock *gl = rgd->rd_gl; + const struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_lkstats *st; + s64 r_dcount, l_dcount; + s64 r_srttb, l_srttb; + s64 srttb_diff; + s64 sqr_diff; + s64 var; + + preempt_disable(); + st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP]; + r_srttb = st->stats[GFS2_LKS_SRTTB]; + r_dcount = st->stats[GFS2_LKS_DCOUNT]; + var = st->stats[GFS2_LKS_SRTTVARB] + + gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + preempt_enable(); + + l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + + if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0)) + return false; + + srttb_diff = r_srttb - l_srttb; + sqr_diff = srttb_diff * srttb_diff; + + var *= 2; + if (l_dcount < 8 || r_dcount < 8) + var *= 2; + if (loops == 1) + var *= 2; + + return ((srttb_diff < 0) && (sqr_diff > var)); +} + +/** + * gfs2_rgrp_used_recently + * @rs: The block reservation with the rgrp to test + * @msecs: The time limit in milliseconds + * + * Returns: True if the rgrp glock has been used within the time limit + */ +static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, + u64 msecs) +{ + u64 tdiff; + + tdiff = ktime_to_ns(ktime_sub(ktime_get_real(), + rs->rs_rbm.rgd->rd_gl->gl_dstamp)); + + return tdiff > (msecs * 1000 * 1000); +} + +static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u32 skip; + + get_random_bytes(&skip, sizeof(skip)); + return skip % sdp->sd_rgrps; +} + +static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) +{ + struct gfs2_rgrpd *rgd = *pos; + struct gfs2_sbd *sdp = rgd->rd_sbd; + + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == NULL) + rgd = gfs2_rgrpd_get_first(sdp); + *pos = rgd; + if (rgd != begin) /* If we didn't wrap */ + return true; + return false; +} + +/** + * fast_to_acquire - determine if a resource group will be fast to acquire + * + * If this is one of our preferred rgrps, it should be quicker to acquire, + * because we tried to set ourselves up as dlm lock master. + */ +static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) +{ + struct gfs2_glock *gl = rgd->rd_gl; + + if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) && + !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + return 1; + if (rgd->rd_flags & GFS2_RDF_PREFERRED) + return 1; + return 0; +} + +/** + * gfs2_inplace_reserve - Reserve space in the filesystem + * @ip: the inode to reserve space for + * @ap: the allocation parameters + * + * We try our best to find an rgrp that has at least ap->target blocks + * available. After a couple of passes (loops == 2), the prospects of finding + * such an rgrp diminish. At this stage, we return the first rgrp that has + * atleast ap->min_target blocks available. Either way, we set ap->allowed to + * the number of blocks available in the chosen rgrp. + * + * Returns: 0 on success, + * -ENOMEM if a suitable rgrp can't be found + * errno otherwise + */ + +int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *begin = NULL; + struct gfs2_blkreserv *rs = ip->i_res; + int error = 0, rg_locked, flags = 0; + u64 last_unlinked = NO_BLOCK; + int loops = 0; + u32 skip = 0; + + if (sdp->sd_args.ar_rgrplvb) + flags |= GL_SKIP; + if (gfs2_assert_warn(sdp, ap->target)) + return -EINVAL; + if (gfs2_rs_active(rs)) { + begin = rs->rs_rbm.rgd; + } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { + rs->rs_rbm.rgd = begin = ip->i_rgd; + } else { + check_and_update_goal(ip); + rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); + } + if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) + skip = gfs2_orlov_skip(ip); + if (rs->rs_rbm.rgd == NULL) + return -EBADSLT; + + while (loops < 3) { + rg_locked = 1; + + if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { + rg_locked = 0; + if (skip && skip--) + goto next_rgrp; + if (!gfs2_rs_active(rs)) { + if (loops == 0 && + !fast_to_acquire(rs->rs_rbm.rgd)) + goto next_rgrp; + if ((loops < 2) && + gfs2_rgrp_used_recently(rs, 1000) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto next_rgrp; + } + error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, + LM_ST_EXCLUSIVE, flags, + &rs->rs_rgd_gh); + if (unlikely(error)) + return error; + if (!gfs2_rs_active(rs) && (loops < 2) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto skip_rgrp; + if (sdp->sd_args.ar_rgrplvb) { + error = update_rgrp_lvb(rs->rs_rbm.rgd); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + return error; + } + } + } + + /* Skip unuseable resource groups */ + if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + GFS2_RDF_ERROR)) || + (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) + goto skip_rgrp; + + if (sdp->sd_args.ar_rgrplvb) + gfs2_rgrp_bh_get(rs->rs_rbm.rgd); + + /* Get a reservation if we don't already have one */ + if (!gfs2_rs_active(rs)) + rg_mblk_search(rs->rs_rbm.rgd, ip, ap); + + /* Skip rgrps when we can't get a reservation on first pass */ + if (!gfs2_rs_active(rs) && (loops < 1)) + goto check_rgrp; + + /* If rgrp has enough free space, use it */ + if (rs->rs_rbm.rgd->rd_free_clone >= ap->target || + (loops == 2 && ap->min_target && + rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) { + ip->i_rgd = rs->rs_rbm.rgd; + ap->allowed = ip->i_rgd->rd_free_clone; + return 0; + } +check_rgrp: + /* Check for unlinked inodes which can be reclaimed */ + if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, + ip->i_no_addr); +skip_rgrp: + /* Drop reservation, if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(rs); + + /* Unlock rgrp if required */ + if (!rg_locked) + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); +next_rgrp: + /* Find the next rgrp, and continue looking */ + if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) + continue; + if (skip) + continue; + + /* If we've scanned all the rgrps, but found no free blocks + * then this checks for some less likely conditions before + * trying again. + */ + loops++; + /* Check that fs hasn't grown if writing to rindex */ + if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; + } + /* Flushing the log may release space */ + if (loops == 2) + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + } + + return -ENOSPC; +} + +/** + * gfs2_inplace_release - release an inplace reservation + * @ip: the inode the reservation was taken out on + * + * Release a reservation made by gfs2_inplace_reserve(). + */ + +void gfs2_inplace_release(struct gfs2_inode *ip) +{ + struct gfs2_blkreserv *rs = ip->i_res; + + if (rs->rs_rgd_gh.gh_gl) + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); +} + +/** + * gfs2_get_block_type - Check a block in a RG is of given type + * @rgd: the resource group holding the block + * @block: the block number + * + * Returns: The block type (GFS2_BLKST_*) + */ + +static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) +{ + struct gfs2_rbm rbm = { .rgd = rgd, }; + int ret; + + ret = gfs2_rbm_from_block(&rbm, block); + WARN_ON_ONCE(ret != 0); + + return gfs2_testbit(&rbm); +} + + +/** + * gfs2_alloc_extent - allocate an extent from a given bitmap + * @rbm: the resource group information + * @dinode: TRUE if the first block we allocate is for a dinode + * @n: The extent length (value/result) + * + * Add the bitmap buffer to the transaction. + * Set the found bits to @new_state to change block's allocation state. + */ +static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, + unsigned int *n) +{ + struct gfs2_rbm pos = { .rgd = rbm->rgd, }; + const unsigned int elen = *n; + u64 block; + int ret; + + *n = 1; + block = gfs2_rbm_to_block(rbm); + gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh); + gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + block++; + while (*n < elen) { + ret = gfs2_rbm_from_block(&pos, block); + if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) + break; + gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh); + gfs2_setbit(&pos, true, GFS2_BLKST_USED); + (*n)++; + block++; + } +} + +/** + * rgblk_free - Change alloc state of given block(s) + * @sdp: the filesystem + * @bstart: the start of a run of blocks to free + * @blen: the length of the block run (all must lie within ONE RG!) + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * + * Returns: Resource group containing the block(s) + */ + +static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, + u32 blen, unsigned char new_state) +{ + struct gfs2_rbm rbm; + struct gfs2_bitmap *bi, *bi_prev = NULL; + + rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); + if (!rbm.rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); + return NULL; + } + + gfs2_rbm_from_block(&rbm, bstart); + while (blen--) { + bi = rbm_bi(&rbm); + if (bi != bi_prev) { + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len); + } + gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); + bi_prev = bi; + } + gfs2_setbit(&rbm, false, new_state); + gfs2_rbm_incr(&rbm); + } + + return rbm.rgd; +} + +/** + * gfs2_rgrp_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question + * + */ + +void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_blkreserv *trs; + const struct rb_node *n; + + if (rgd == NULL) + return; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, + rgd->rd_reserved, rgd->rd_extfail_pt); + spin_lock(&rgd->rd_rsspin); + for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { + trs = rb_entry(n, struct gfs2_blkreserv, rs_node); + dump_rs(seq, trs); + } + spin_unlock(&rgd->rd_rsspin); +} + +static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; +} + +/** + * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation + * @ip: The inode we have just allocated blocks for + * @rbm: The start of the allocated blocks + * @len: The extent length + * + * Adjusts a reservation after an allocation has taken place. If the + * reservation does not match the allocation, or if it is now empty + * then it is removed. + */ + +static void gfs2_adjust_reservation(struct gfs2_inode *ip, + const struct gfs2_rbm *rbm, unsigned len) +{ + struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_rgrpd *rgd = rbm->rgd; + unsigned rlen; + u64 block; + int ret; + + spin_lock(&rgd->rd_rsspin); + if (gfs2_rs_active(rs)) { + if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) { + block = gfs2_rbm_to_block(rbm); + ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len); + rlen = min(rs->rs_free, len); + rs->rs_free -= rlen; + rgd->rd_reserved -= rlen; + trace_gfs2_rs(rs, TRACE_RS_CLAIM); + if (rs->rs_free && !ret) + goto out; + /* We used up our block reservation, so we should + reserve more blocks next time. */ + atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint); + } + __rs_deltree(rs); + } +out: + spin_unlock(&rgd->rd_rsspin); +} + +/** + * gfs2_set_alloc_start - Set starting point for block allocation + * @rbm: The rbm which will be set to the required location + * @ip: The gfs2 inode + * @dinode: Flag to say if allocation includes a new inode + * + * This sets the starting point from the reservation if one is active + * otherwise it falls back to guessing a start point based on the + * inode's goal block or the last allocation point in the rgrp. + */ + +static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, + const struct gfs2_inode *ip, bool dinode) +{ + u64 goal; + + if (gfs2_rs_active(ip->i_res)) { + *rbm = ip->i_res->rs_rbm; + return; + } + + if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; + + gfs2_rbm_from_block(rbm, goal); +} + +/** + * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode + * @ip: the inode to allocate the block for + * @bn: Used to return the starting block number + * @nblocks: requested number of blocks/extent length (value/result) + * @dinode: 1 if we're allocating a dinode block, else 0 + * @generation: the generation number of the inode + * + * Returns: 0 or error + */ + +int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, + bool dinode, u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + struct gfs2_rbm rbm = { .rgd = ip->i_rgd, }; + unsigned int ndata; + u64 block; /* block, within the file system scope */ + int error; + + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); + + if (error == -ENOSPC) { + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, + NULL); + } + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (error) { + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n", + (unsigned long long)ip->i_no_addr, error, *nblocks, + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags), + rbm.rgd->rd_extfail_pt); + goto rgrp_error; + } + + gfs2_alloc_extent(&rbm, dinode, nblocks); + block = gfs2_rbm_to_block(&rbm); + rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; + if (gfs2_rs_active(ip->i_res)) + gfs2_adjust_reservation(ip, &rbm, *nblocks); + ndata = *nblocks; + if (dinode) + ndata--; + + if (!dinode) { + ip->i_goal = block + ndata - 1; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = + (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_meta(ip->i_gl, dibh); + di->di_goal_meta = di->di_goal_data = + cpu_to_be64(ip->i_goal); + brelse(dibh); + } + } + if (rbm.rgd->rd_free < *nblocks) { + pr_warn("nblocks=%u\n", *nblocks); + goto rgrp_error; + } + + rbm.rgd->rd_free -= *nblocks; + if (dinode) { + rbm.rgd->rd_dinodes++; + *generation = rbm.rgd->rd_igeneration++; + if (*generation == 0) + *generation = rbm.rgd->rd_igeneration++; + } + + gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); + gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); + + gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); + if (dinode) + gfs2_trans_add_unrevoke(sdp, block, *nblocks); + + gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); + + rbm.rgd->rd_free_clone -= *nblocks; + trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, + dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rbm.rgd); + return -EIO; +} + +/** + * __gfs2_free_blocks - free a contiguous run of block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * @meta: 1 if the blocks represent metadata + * + */ + +void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); + rgd->rd_free += blen; + rgd->rd_flags &= ~GFS2_RGF_TRIMMED; + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + + /* Directories keep their data in the metadata address space */ + if (meta || ip->i_depth) + gfs2_meta_wipe(ip, bstart, blen); +} + +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_blocks(ip, bstart, blen, 1); + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); +} + +void gfs2_unlink_di(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_rgrpd *rgd; + u64 blkno = ip->i_no_addr; + + rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); + if (!rgd) + return; + trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + update_rgrp_lvb_unlinked(rgd, 1); +} + +static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *tmp_rgd; + + tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE); + if (!tmp_rgd) + return; + gfs2_assert_withdraw(sdp, rgd == tmp_rgd); + + if (!rgd->rd_dinodes) + gfs2_consist_rgrpd(rgd); + rgd->rd_dinodes--; + rgd->rd_free++; + + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + update_rgrp_lvb_unlinked(rgd, -1); + + gfs2_statfs_change(sdp, 0, +1, -1); +} + + +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) +{ + gfs2_free_uninit_di(rgd, ip->i_no_addr); + trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); + gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); + gfs2_meta_wipe(ip, ip->i_no_addr, 1); +} + +/** + * gfs2_check_blk_type - Check the type of a block + * @sdp: The superblock + * @no_addr: The block number to check + * @type: The block type we are looking for + * + * Returns: 0 if the block type matches the expected type + * -ESTALE if it doesn't match + * or -ve errno if something went wrong while checking + */ + +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_holder rgd_gh; + int error = -EINVAL; + + rgd = gfs2_blk2rgrpd(sdp, no_addr, 1); + if (!rgd) + goto fail; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail; + + if (gfs2_get_block_type(rgd, no_addr) != type) + error = -ESTALE; + + gfs2_glock_dq_uninit(&rgd_gh); +fail: + return error; +} + +/** + * gfs2_rlist_add - add a RG to a list of RGs + * @ip: the inode + * @rlist: the list of resource groups + * @block: the block + * + * Figure out what RG a block belongs to and add that RG to the list + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, + u64 block) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd **tmp; + unsigned int new_space; + unsigned int x; + + if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) + return; + + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) + rgd = ip->i_rgd; + else + rgd = gfs2_blk2rgrpd(sdp, block, 1); + if (!rgd) { + fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); + return; + } + ip->i_rgd = rgd; + + for (x = 0; x < rlist->rl_rgrps; x++) + if (rlist->rl_rgd[x] == rgd) + return; + + if (rlist->rl_rgrps == rlist->rl_space) { + new_space = rlist->rl_space + 10; + + tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *), + GFP_NOFS | __GFP_NOFAIL); + + if (rlist->rl_rgd) { + memcpy(tmp, rlist->rl_rgd, + rlist->rl_space * sizeof(struct gfs2_rgrpd *)); + kfree(rlist->rl_rgd); + } + + rlist->rl_space = new_space; + rlist->rl_rgd = tmp; + } + + rlist->rl_rgd[rlist->rl_rgrps++] = rgd; +} + +/** + * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate + * and initialize an array of glock holders for them + * @rlist: the list of resource groups + * @state: the lock state to acquire the RG lock in + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) +{ + unsigned int x; + + rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), + GFP_NOFS | __GFP_NOFAIL); + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, + state, 0, + &rlist->rl_ghs[x]); +} + +/** + * gfs2_rlist_free - free a resource group list + * @rlist: the list of resource groups + * + */ + +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) +{ + unsigned int x; + + kfree(rlist->rl_rgd); + + if (rlist->rl_ghs) { + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_uninit(&rlist->rl_ghs[x]); + kfree(rlist->rl_ghs); + rlist->rl_ghs = NULL; + } +} + diff --git a/kernel/fs/gfs2/rgrp.h b/kernel/fs/gfs2/rgrp.h new file mode 100644 index 000000000..68972ecfb --- /dev/null +++ b/kernel/fs/gfs2/rgrp.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __RGRP_DOT_H__ +#define __RGRP_DOT_H__ + +#include +#include + +/* Since each block in the file system is represented by two bits in the + * bitmap, one 64-bit word in the bitmap will represent 32 blocks. + * By reserving 32 blocks at a time, we can optimize / shortcut how we search + * through the bitmaps by looking a word at a time. + */ +#define RGRP_RSRV_MINBYTES 8 +#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY)) +#define RGRP_RSRV_ADDBLKS 64 + +struct gfs2_rgrpd; +struct gfs2_sbd; +struct gfs2_holder; + +extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); + +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); + +extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +extern int gfs2_rindex_update(struct gfs2_sbd *sdp); +extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); + +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); + +#define GFS2_AF_ORLOV 1 +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, + struct gfs2_alloc_parms *ap); +extern void gfs2_inplace_release(struct gfs2_inode *ip); + +extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, + bool dinode, u64 *generation); + +extern int gfs2_rs_alloc(struct gfs2_inode *ip); +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); +extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); +extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +extern void gfs2_unlink_di(struct inode *inode); +extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); + +struct gfs2_rgrp_list { + unsigned int rl_rgrps; + unsigned int rl_space; + struct gfs2_rgrpd **rl_rgd; + struct gfs2_holder *rl_ghs; +}; + +extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, + u64 block); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); +extern int gfs2_fitrim(struct file *filp, void __user *argp); + +/* This is how to tell if a reservation is in the rgrp tree: */ +static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) +{ + return rs && !RB_EMPTY_NODE(&rs->rs_node); +} + +extern void check_and_update_goal(struct gfs2_inode *ip); +#endif /* __RGRP_DOT_H__ */ diff --git a/kernel/fs/gfs2/super.c b/kernel/fs/gfs2/super.c new file mode 100644 index 000000000..859c6edbf --- /dev/null +++ b/kernel/fs/gfs2/super.c @@ -0,0 +1,1666 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" +#include "sys.h" +#include "xattr.h" + +#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) + +enum { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_nodebug, + Opt_upgrade, + Opt_acl, + Opt_noacl, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, + Opt_quota, + Opt_noquota, + Opt_suiddir, + Opt_nosuiddir, + Opt_data_writeback, + Opt_data_ordered, + Opt_meta, + Opt_discard, + Opt_nodiscard, + Opt_commit, + Opt_err_withdraw, + Opt_err_panic, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_nobarrier, + Opt_rgrplvb, + Opt_norgrplvb, + Opt_error, +}; + +static const match_table_t tokens = { + {Opt_lockproto, "lockproto=%s"}, + {Opt_locktable, "locktable=%s"}, + {Opt_hostdata, "hostdata=%s"}, + {Opt_spectator, "spectator"}, + {Opt_spectator, "norecovery"}, + {Opt_ignore_local_fs, "ignore_local_fs"}, + {Opt_localflocks, "localflocks"}, + {Opt_localcaching, "localcaching"}, + {Opt_debug, "debug"}, + {Opt_nodebug, "nodebug"}, + {Opt_upgrade, "upgrade"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_quota_off, "quota=off"}, + {Opt_quota_account, "quota=account"}, + {Opt_quota_on, "quota=on"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_suiddir, "suiddir"}, + {Opt_nosuiddir, "nosuiddir"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, + {Opt_err_withdraw, "errors=withdraw"}, + {Opt_err_panic, "errors=panic"}, + {Opt_statfs_quantum, "statfs_quantum=%d"}, + {Opt_statfs_percent, "statfs_percent=%d"}, + {Opt_quota_quantum, "quota_quantum=%d"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_rgrplvb, "rgrplvb"}, + {Opt_norgrplvb, "norgrplvb"}, + {Opt_error, NULL} +}; + +/** + * gfs2_mount_args - Parse mount options + * @args: The structure into which the parsed options will be written + * @options: The options to parse + * + * Return: errno + */ + +int gfs2_mount_args(struct gfs2_args *args, char *options) +{ + char *o; + int token; + substring_t tmp[MAX_OPT_ARGS]; + int rv; + + /* Split the options into tokens with the "," character and + process them */ + + while (1) { + o = strsep(&options, ","); + if (o == NULL) + break; + if (*o == '\0') + continue; + + token = match_token(o, tokens, tmp); + switch (token) { + case Opt_lockproto: + match_strlcpy(args->ar_lockproto, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + match_strlcpy(args->ar_locktable, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + match_strlcpy(args->ar_hostdata, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (args->ar_errors == GFS2_ERRORS_PANIC) { + pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); + return -EINVAL; + } + args->ar_debug = 1; + break; + case Opt_nodebug: + args->ar_debug = 0; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = 1; + break; + case Opt_noacl: + args->ar_posix_acl = 0; + break; + case Opt_quota_off: + case Opt_noquota: + args->ar_quota = GFS2_QUOTA_OFF; + break; + case Opt_quota_account: + args->ar_quota = GFS2_QUOTA_ACCOUNT; + break; + case Opt_quota_on: + case Opt_quota: + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = 1; + break; + case Opt_nosuiddir: + args->ar_suiddir = 0; + break; + case Opt_data_writeback: + args->ar_data = GFS2_DATA_WRITEBACK; + break; + case Opt_data_ordered: + args->ar_data = GFS2_DATA_ORDERED; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = 1; + break; + case Opt_nodiscard: + args->ar_discard = 0; + break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + pr_warn("commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_quantum: + rv = match_int(&tmp[0], &args->ar_statfs_quantum); + if (rv || args->ar_statfs_quantum < 0) { + pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_quota_quantum: + rv = match_int(&tmp[0], &args->ar_quota_quantum); + if (rv || args->ar_quota_quantum <= 0) { + pr_warn("quota_quantum mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_percent: + rv = match_int(&tmp[0], &args->ar_statfs_percent); + if (rv || args->ar_statfs_percent < 0 || + args->ar_statfs_percent > 100) { + pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_err_withdraw: + args->ar_errors = GFS2_ERRORS_WITHDRAW; + break; + case Opt_err_panic: + if (args->ar_debug) { + pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); + return -EINVAL; + } + args->ar_errors = GFS2_ERRORS_PANIC; + break; + case Opt_barrier: + args->ar_nobarrier = 0; + break; + case Opt_nobarrier: + args->ar_nobarrier = 1; + break; + case Opt_rgrplvb: + args->ar_rgrplvb = 1; + break; + case Opt_norgrplvb: + args->ar_rgrplvb = 0; + break; + case Opt_error: + default: + pr_warn("invalid mount option: %s\n", o); + return -EINVAL; + } + } + + return 0; +} + +/** + * gfs2_jindex_free - Clear all the journal index information + * @sdp: The GFS2 superblock + * + */ + +void gfs2_jindex_free(struct gfs2_sbd *sdp) +{ + struct list_head list; + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + list_add(&list, &sdp->sd_jindex_list); + list_del_init(&sdp->sd_jindex_list); + sdp->sd_journals = 0; + spin_unlock(&sdp->sd_jindex_spin); + + while (!list_empty(&list)) { + jd = list_entry(list.next, struct gfs2_jdesc, jd_list); + gfs2_free_journal_extents(jd); + list_del(&jd->jd_list); + iput(jd->jd_inode); + kfree(jd); + } +} + +static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) +{ + struct gfs2_jdesc *jd; + int found = 0; + + list_for_each_entry(jd, head, jd_list) { + if (jd->jd_jid == jid) { + found = 1; + break; + } + } + + if (!found) + jd = NULL; + + return jd; +} + +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + jd = jdesc_find_i(&sdp->sd_jindex_list, jid); + spin_unlock(&sdp->sd_jindex_spin); + + return jd; +} + +int gfs2_jdesc_check(struct gfs2_jdesc *jd) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + u64 size = i_size_read(jd->jd_inode); + + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) + return -EIO; + + jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; + + if (gfs2_write_alloc_required(ip, 0, size)) { + gfs2_consist_inode(ip); + return -EIO; + } + + return 0; +} + +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + return 0; + +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + +/** + * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one + * @sdp: the filesystem + * + * Returns: errno + */ + +int gfs2_make_fs_rw(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_holder freeze_gh; + struct gfs2_log_header_host head; + int error; + + error = init_threads(sdp); + if (error) + return error; + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &freeze_gh); + if (error) + goto fail_threads; + + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + goto fail; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + gfs2_consist(sdp); + error = -EIO; + goto fail; + } + + /* Initialize some head of the log stuff */ + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + + error = gfs2_quota_init(sdp); + if (error) + goto fail; + + set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + gfs2_glock_dq_uninit(&freeze_gh); + + return 0; + +fail: + freeze_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&freeze_gh); +fail_threads: + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + return error; +} + +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) +{ + const struct gfs2_statfs_change *str = buf; + + sc->sc_total = be64_to_cpu(str->sc_total); + sc->sc_free = be64_to_cpu(str->sc_free); + sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); +} + +static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +{ + struct gfs2_statfs_change *str = buf; + + str->sc_total = cpu_to_be64(sc->sc_total); + str->sc_free = cpu_to_be64(sc->sc_free); + str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); +} + +int gfs2_statfs_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + if (sdp->sd_args.ar_spectator) { + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + } else { + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_m_bh; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + gfs2_statfs_change_in(l_sc, l_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); + } + +out_m_bh: + brelse(m_bh); +out: + gfs2_glock_dq_uninit(&gh); + return 0; +} + +void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes) +{ + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct buffer_head *l_bh; + s64 x, y; + int need_sync = 0; + int error; + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + return; + + gfs2_trans_add_meta(l_ip->i_gl, l_bh); + + spin_lock(&sdp->sd_statfs_spin); + l_sc->sc_total += total; + l_sc->sc_free += free; + l_sc->sc_dinodes += dinodes; + gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); + if (sdp->sd_args.ar_statfs_percent) { + x = 100 * l_sc->sc_free; + y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; + if (x >= y || x <= -y) + need_sync = 1; + } + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); + if (need_sync) + gfs2_wake_up_statfs(sdp); +} + +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + gfs2_trans_add_meta(l_ip->i_gl, l_bh); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_meta(m_ip->i_gl, m_bh); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); +} + +int gfs2_statfs_sync(struct super_block *sb, int type) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_holder gh; + struct buffer_head *m_bh, *l_bh; + int error; + + sb_start_write(sb); + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + goto out; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out_unlock; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) { + spin_unlock(&sdp->sd_statfs_spin); + goto out_bh; + } + spin_unlock(&sdp->sd_statfs_spin); + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_bh; + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); + if (error) + goto out_bh2; + + update_statfs(sdp, m_bh, l_bh); + sdp->sd_statfs_force_sync = 0; + + gfs2_trans_end(sdp); + +out_bh2: + brelse(l_bh); +out_bh: + brelse(m_bh); +out_unlock: + gfs2_glock_dq_uninit(&gh); +out: + sb_end_write(sb); + return error; +} + +struct lfcc { + struct list_head list; + struct gfs2_holder gh; +}; + +/** + * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all + * journals are clean + * @sdp: the file system + * @state: the state to put the transaction lock into + * @t_gh: the hold on the transaction lock + * + * Returns: errno + */ + +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, + struct gfs2_holder *freeze_gh) +{ + struct gfs2_inode *ip; + struct gfs2_jdesc *jd; + struct lfcc *lfcc; + LIST_HEAD(list); + struct gfs2_log_header_host lh; + int error; + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); + if (!lfcc) { + error = -ENOMEM; + goto out; + } + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); + if (error) { + kfree(lfcc); + goto out; + } + list_add(&lfcc->list, &list); + } + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, freeze_gh); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + error = gfs2_jdesc_check(jd); + if (error) + break; + error = gfs2_find_jhead(jd, &lh); + if (error) + break; + if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EBUSY; + break; + } + } + + if (error) + gfs2_glock_dq_uninit(freeze_gh); + +out: + while (!list_empty(&list)) { + lfcc = list_entry(list.next, struct lfcc, list); + list_del(&lfcc->list); + gfs2_glock_dq_uninit(&lfcc->gh); + kfree(lfcc); + } + return error; +} + +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) +{ + struct gfs2_dinode *str = buf; + + str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); + str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); + str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + str->di_mode = cpu_to_be32(ip->i_inode.i_mode); + str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); + str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); + str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); + str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); + str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); + + str->di_goal_meta = cpu_to_be64(ip->i_goal); + str->di_goal_data = cpu_to_be64(ip->i_goal); + str->di_generation = cpu_to_be64(ip->i_generation); + + str->di_flags = cpu_to_be32(ip->i_diskflags); + str->di_height = cpu_to_be16(ip->i_height); + str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && + !(ip->i_diskflags & GFS2_DIF_EXHASH) ? + GFS2_FORMAT_DE : 0); + str->di_depth = cpu_to_be16(ip->i_depth); + str->di_entries = cpu_to_be32(ip->i_entries); + + str->di_eattr = cpu_to_be64(ip->i_eattr); + str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); +} + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @wbc: The writeback control structure + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); + int ret = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); + if (bdi->dirty_exceeded) + gfs2_ail1_flush(sdp, wbc); + else + filemap_fdatawrite(metamapping); + if (wbc->sync_mode == WB_SYNC_ALL) + ret = filemap_fdatawait(metamapping); + if (ret) + mark_inode_dirty_sync(inode); + return ret; +} + +/** + * gfs2_dirty_inode - check for atime updates + * @inode: The inode in question + * @flags: The type of dirty + * + * Unfortunately it can be called under any combination of inode + * glock and transaction lock, so we have to check carefully. + * + * At the moment this deals only with atime - it should be possible + * to expand that role in future, once a review of the locking has + * been carried out. + */ + +static void gfs2_dirty_inode(struct inode *inode, int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int need_unlock = 0; + int need_endtrans = 0; + int ret; + + if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) + return; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) { + fs_err(sdp, "dirty_inode: glock %d\n", ret); + return; + } + need_unlock = 1; + } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) + return; + + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) { + fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret); + goto out; + } + need_endtrans = 1; + } + + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + } + + if (need_endtrans) + gfs2_trans_end(sdp); +out: + if (need_unlock) + gfs2_glock_dq_uninit(&gh); +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder freeze_gh; + int error; + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, + &freeze_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + + flush_workqueue(gfs2_delete_workqueue); + gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_statfs_sync(sdp->sd_vfs, 0); + + down_write(&sdp->sd_log_flush_lock); + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + up_write(&sdp->sd_log_flush_lock); + + gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH); + wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0); + gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); + + if (freeze_gh.gh_gl) + gfs2_glock_dq_uninit(&freeze_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + struct gfs2_jdesc *jd; + + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_jindex); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_freeze_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); +} + +/** + * gfs2_sync_fs - sync the filesystem + * @sb: the superblock + * + * Flushes the log to disk. + */ + +static int gfs2_sync_fs(struct super_block *sb, int wait) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + gfs2_quota_sync(sb, -1); + if (wait && sdp) + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + return 0; +} + +void gfs2_freeze_func(struct work_struct *work) +{ + int error; + struct gfs2_holder freeze_gh; + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work); + struct super_block *sb = sdp->sd_vfs; + + atomic_inc(&sb->s_active); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &freeze_gh); + if (error) { + printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error); + gfs2_assert_withdraw(sdp, 0); + } + else { + atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); + error = thaw_super(sb); + if (error) { + printk(KERN_INFO "GFS2: couldn't thaw filesystem: %d\n", + error); + gfs2_assert_withdraw(sdp, 0); + } + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + freeze_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&freeze_gh); + } + deactivate_super(sb); + return; +} + +/** + * gfs2_freeze - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_freeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error = 0; + + mutex_lock(&sdp->sd_freeze_mutex); + if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) + goto out; + + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + error = -EINVAL; + goto out; + } + + for (;;) { + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } + error = 0; +out: + mutex_unlock(&sdp->sd_freeze_mutex); + return error; +} + +/** + * gfs2_unfreeze - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_unfreeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + mutex_lock(&sdp->sd_freeze_mutex); + if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN || + sdp->sd_freeze_gh.gh_gl == NULL) { + mutex_unlock(&sdp->sd_freeze_mutex); + return 0; + } + + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + mutex_unlock(&sdp->sd_freeze_mutex); + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change_host *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_data; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = d_inode(dentry)->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change_host sc; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) + * + * Returns: errno + */ + +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; + int error; + + sync_filesystem(sb); + + spin_lock(>->gt_spin); + args.ar_commit = gt->gt_logd_secs; + args.ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + args.ar_statfs_quantum = 0; + else + args.ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(>->gt_spin); + error = gfs2_mount_args(&args, data); + if (error) + return error; + + /* Not allowed to change locking details */ + if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || + strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || + strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) + return -EINVAL; + + /* Some flags must not be changed */ + if (args_neq(&args, &sdp->sd_args, spectator) || + args_neq(&args, &sdp->sd_args, localflocks) || + args_neq(&args, &sdp->sd_args, meta)) + return -EINVAL; + + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + + if ((sb->s_flags ^ *flags) & MS_RDONLY) { + if (*flags & MS_RDONLY) + error = gfs2_make_fs_ro(sdp); + else + error = gfs2_make_fs_rw(sdp); + if (error) + return error; + } + + sdp->sd_args = args; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags &= ~MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(>->gt_spin); + gt->gt_logd_secs = args.ar_commit; + gt->gt_quota_quantum = args.ar_quota_quantum; + if (args.ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = args.ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } + spin_unlock(>->gt_spin); + + gfs2_online_uevent(sdp); + return 0; +} + +/** + * gfs2_drop_inode - Drop an inode (test for remote unlink) + * @inode: The inode to drop + * + * If we've received a callback on an iopen lock then its because a + * remote node tried to deallocate the inode but failed due to this node + * still having the inode open. Here we mark the link count zero + * since we know that it must have reached zero if the GLF_DEMOTE flag + * is set on the iopen glock. If we didn't do a disk read since the + * remote node removed the final link then we might otherwise miss + * this event. This check ensures that this node will deallocate the + * inode's blocks, or alternatively pass the baton on to another + * node for later deallocation. + */ + +static int gfs2_drop_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + clear_nlink(inode); + } + return generic_drop_inode(inode); +} + +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @root: root of this (sub)tree + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct dentry *root) +{ + struct gfs2_sbd *sdp = root->d_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + int val; + + if (is_ancestor(root, sdp->sd_master_dir)) + seq_puts(s, ",meta"); + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_puts(s, ",spectator"); + if (args->ar_localflocks) + seq_puts(s, ",localflocks"); + if (args->ar_debug) + seq_puts(s, ",debug"); + if (args->ar_posix_acl) + seq_puts(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_puts(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + if (args->ar_discard) + seq_puts(s, ",discard"); + val = sdp->sd_tune.gt_logd_secs; + if (val != 30) + seq_printf(s, ",commit=%d", val); + val = sdp->sd_tune.gt_statfs_quantum; + if (val != 30) + seq_printf(s, ",statfs_quantum=%d", val); + else if (sdp->sd_tune.gt_statfs_slow) + seq_puts(s, ",statfs_quantum=0"); + val = sdp->sd_tune.gt_quota_quantum; + if (val != 60) + seq_printf(s, ",quota_quantum=%d", val); + if (args->ar_statfs_percent) + seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent); + if (args->ar_errors != GFS2_ERRORS_DEFAULT) { + const char *state; + + switch (args->ar_errors) { + case GFS2_ERRORS_WITHDRAW: + state = "withdraw"; + break; + case GFS2_ERRORS_PANIC: + state = "panic"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",errors=%s", state); + } + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + seq_puts(s, ",nobarrier"); + if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) + seq_puts(s, ",demote_interface_used"); + if (args->ar_rgrplvb) + seq_puts(s, ",rgrplvb"); + return 0; +} + +static void gfs2_final_release_pages(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_glock *gl = ip->i_gl; + + truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0); + truncate_inode_pages(&inode->i_data, 0); + + if (atomic_read(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } +} + +static int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder gh; + int error; + + if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + return error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_qs; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_qs; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, + sdp->sd_jdesc->jd_blocks); + if (error) + goto out_rg_gunlock; + + gfs2_free_di(rgd, ip); + + gfs2_final_release_pages(ip); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&gh); +out_qs: + gfs2_quota_unhold(ip); + return error; +} + +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. + */ + +static void gfs2_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { + clear_inode(inode); + return; + } + + if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) + goto out; + + /* Must not read inode block until block type has been verified */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); + if (unlikely(error)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + } + + if (test_bit(GIF_INVALID, &ip->i_flags)) { + error = gfs2_inode_refresh(ip); + if (error) + goto out_truncate; + } + + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_truncate; + + /* Case 1 starts here */ + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + goto out_unlock; + +out_truncate: + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + filemap_fdatawrite(metamapping); + filemap_fdatawait(metamapping); + } + write_inode_now(inode, 1); + gfs2_ail_flush(ip->i_gl, 0); + + /* Case 2 starts here */ + error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (error) + goto out_unlock; + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + gfs2_trans_end(sdp); + +out_unlock: + /* Error path for case 1 */ + if (gfs2_rs_active(ip->i_res)) + gfs2_rs_deltree(ip->i_res); + + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq(&ip->i_iopen_gh); + } + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error && error != GLR_TRYFAILED && error != -EROFS) + fs_warn(sdp, "gfs2_evict_inode: %d\n", error); +out: + /* Case 3 starts here */ + truncate_inode_pages_final(&inode->i_data); + gfs2_rs_delete(ip, NULL); + gfs2_ordered_del_inode(ip); + clear_inode(inode); + gfs2_dir_hash_inval(ip); + ip->i_gl->gl_object = NULL; + flush_delayed_work(&ip->i_gl->gl_work); + gfs2_glock_add_to_lru(ip->i_gl); + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } +} + +static struct inode *gfs2_alloc_inode(struct super_block *sb) +{ + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + ip->i_rgd = NULL; + ip->i_res = NULL; + } + return &ip->i_inode; +} + +static void gfs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(gfs2_inode_cachep, inode); +} + +static void gfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, gfs2_i_callback); +} + +const struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .dirty_inode = gfs2_dirty_inode, + .evict_inode = gfs2_evict_inode, + .put_super = gfs2_put_super, + .sync_fs = gfs2_sync_fs, + .freeze_super = gfs2_freeze, + .thaw_super = gfs2_unfreeze, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .drop_inode = gfs2_drop_inode, + .show_options = gfs2_show_options, +}; + diff --git a/kernel/fs/gfs2/super.h b/kernel/fs/gfs2/super.h new file mode 100644 index 000000000..73c97dcca --- /dev/null +++ b/kernel/fs/gfs2/super.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __SUPER_DOT_H__ +#define __SUPER_DOT_H__ + +#include +#include +#include "incore.h" + +extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); + +static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) +{ + unsigned int x; + spin_lock(&sdp->sd_jindex_spin); + x = sdp->sd_journals; + spin_unlock(&sdp->sd_jindex_spin); + return x; +} + +extern void gfs2_jindex_free(struct gfs2_sbd *sdp); + +extern int gfs2_mount_args(struct gfs2_args *args, char *data); + +extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); + +extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); + +extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern void gfs2_online_uevent(struct gfs2_sbd *sdp); +extern int gfs2_statfs_init(struct gfs2_sbd *sdp); +extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh); +extern int gfs2_statfs_sync(struct super_block *sb, int type); +extern void gfs2_freeze_func(struct work_struct *work); + +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; +extern const struct export_operations gfs2_export_ops; +extern const struct super_operations gfs2_super_ops; +extern const struct dentry_operations gfs2_dops; +extern const struct xattr_handler *gfs2_xattr_handlers[]; + +#endif /* __SUPER_DOT_H__ */ + diff --git a/kernel/fs/gfs2/sys.c b/kernel/fs/gfs2/sys.c new file mode 100644 index 000000000..ae8e8811f --- /dev/null +++ b/kernel/fs/gfs2/sys.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "sys.h" +#include "super.h" +#include "glock.h" +#include "quota.h" +#include "util.h" +#include "glops.h" +#include "recovery.h" + +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? a->store(sdp, buf, len) : len; +} + +static const struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + + +static struct kset *gfs2_kset; + +static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u:%u\n", + MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev)); +} + +static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); +} + +static int gfs2_uuid_valid(const u8 *uuid) +{ + int i; + + for (i = 0; i < 16; i++) { + if (uuid[i]) + return 1; + } + return 0; +} + +static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) +{ + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; + buf[0] = '\0'; + if (!gfs2_uuid_valid(uuid)) + return 0; + return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); +} + +static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) +{ + struct super_block *sb = sdp->sd_vfs; + int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; + + return snprintf(buf, PAGE_SIZE, "%d\n", frozen); +} + +static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int error; + int n = simple_strtol(buf, NULL, 0); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (n) { + case 0: + error = thaw_super(sdp->sd_vfs); + break; + case 1: + error = freeze_super(sdp->sd_vfs); + break; + default: + return -EINVAL; + } + + if (error) { + fs_warn(sdp, "freeze %d error %d", n, error); + return error; + } + + return len; +} + +static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags); + return snprintf(buf, PAGE_SIZE, "%u\n", b); +} + +static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n"); + + return len; +} + +static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_statfs_sync(sdp->sd_vfs, 0); + return len; +} + +static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_quota_sync(sdp->sd_vfs, 0); + return len; +} + +static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct kqid qid; + int error; + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + id = simple_strtoul(buf, NULL, 0); + + qid = make_kqid(current_user_ns(), USRQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); + return error ? error : len; +} + +static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct kqid qid; + int error; + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + id = simple_strtoul(buf, NULL, 0); + + qid = make_kqid(current_user_ns(), GRPQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); + return error ? error : len; +} + +static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct gfs2_glock *gl; + const struct gfs2_glock_operations *glops; + unsigned int glmode; + unsigned int gltype; + unsigned long long glnum; + char mode[16]; + int rv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum, + mode); + if (rv != 3) + return -EINVAL; + + if (strcmp(mode, "EX") == 0) + glmode = LM_ST_UNLOCKED; + else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0)) + glmode = LM_ST_DEFERRED; + else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0)) + glmode = LM_ST_SHARED; + else + return -EINVAL; + + if (gltype > LM_TYPE_JOURNAL) + return -EINVAL; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_FREEZE_LOCK) + glops = &gfs2_freeze_glops; + else + glops = gfs2_glops_list[gltype]; + if (glops == NULL) + return -EINVAL; + if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + fs_info(sdp, "demote interface used\n"); + rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); + if (rv) + return rv; + gfs2_glock_cb(gl, glmode); + gfs2_glock_put(gl); + return len; +} + + +#define GFS2_ATTR(name, mode, show, store) \ +static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) + +GFS2_ATTR(id, 0444, id_show, NULL); +GFS2_ATTR(fsname, 0444, fsname_show, NULL); +GFS2_ATTR(uuid, 0444, uuid_show, NULL); +GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); +GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); +GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); +GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); +GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); +GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store); + +static struct attribute *gfs2_attrs[] = { + &gfs2_attr_id.attr, + &gfs2_attr_fsname.attr, + &gfs2_attr_uuid.attr, + &gfs2_attr_freeze.attr, + &gfs2_attr_withdraw.attr, + &gfs2_attr_statfs_sync.attr, + &gfs2_attr_quota_sync.attr, + &gfs2_attr_quota_refresh_user.attr, + &gfs2_attr_quota_refresh_group.attr, + &gfs2_attr_demote_rq.attr, + NULL, +}; + +static void gfs2_sbd_release(struct kobject *kobj) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + + kfree(sdp); +} + +static struct kobj_type gfs2_ktype = { + .release = gfs2_sbd_release, + .default_attrs = gfs2_attrs, + .sysfs_ops = &gfs2_attr_ops, +}; + + +/* + * lock_module. Originally from lock_dlm + */ + +static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf) +{ + const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops; + return sprintf(buf, "%s\n", ops->lm_proto_name); +} + +static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret; + int val = 0; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + smp_mb__after_atomic(); + gfs2_glock_thaw(sdp); + } else { + ret = -EINVAL; + } + return ret; +} + +static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) +{ + int val = completion_done(&sdp->sd_wdack) ? 1 : 0; + + return sprintf(buf, "%d\n", val); +} + +static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if ((val == 1) && + !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + complete(&sdp->sd_wdack); + else + ret = -EINVAL; + return ret; +} + +static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first); +} + +static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned first; + int rv; + + rv = sscanf(buf, "%u", &first); + if (rv != 1 || first > 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + sdp->sd_lockstruct.ls_first = first; + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags)); +} + +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) +{ + struct gfs2_jdesc *jd; + int rv; + + /* Wait for our primary journal to be initialized */ + wait_for_completion(&sdp->sd_journal_ready); + + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (sdp->sd_jdesc->jd_jid == jid) + goto out; + rv = -ENOENT; + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid) + continue; + rv = gfs2_recover_journal(jd, false); + break; + } +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv; +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) { + rv = -ESHUTDOWN; + goto out; + } + + rv = gfs2_recover_set(sdp, jid); +out: + return rv ? rv : len; +} + +static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_done); +} + +static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_status); +} + +static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid); +} + +static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int jid; + int rv; + + rv = sscanf(buf, "%d", &jid); + if (rv != 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EINVAL; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = 0; + if (sdp->sd_args.ar_spectator && jid > 0) + rv = jid = -EINVAL; + sdp->sd_lockstruct.ls_jid = jid; + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_atomic(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +#define GDLM_ATTR(_name,_mode,_show,_store) \ +static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) + +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store); +GDLM_ATTR(jid, 0644, jid_show, jid_store); +GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0600, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); + +static struct attribute *lock_module_attrs[] = { + &gdlm_attr_proto_name.attr, + &gdlm_attr_block.attr, + &gdlm_attr_withdraw.attr, + &gdlm_attr_jid.attr, + &gdlm_attr_first.attr, + &gdlm_attr_first_done.attr, + &gdlm_attr_recover.attr, + &gdlm_attr_recover_done.attr, + &gdlm_attr_recover_status.attr, + NULL, +}; + +/* + * get and set struct gfs2_tune fields + */ + +static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u %u\n", + sdp->sd_tune.gt_quota_scale_num, + sdp->sd_tune.gt_quota_scale_den); +} + +static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x, y; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sscanf(buf, "%u %u", &x, &y) != 2 || !y) + return -EINVAL; + + spin_lock(>->gt_spin); + gt->gt_quota_scale_num = x; + gt->gt_quota_scale_den = y; + spin_unlock(>->gt_spin); + return len; +} + +static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, + int check_zero, const char *buf, size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + spin_lock(>->gt_spin); + *field = x; + spin_unlock(>->gt_spin); + return len; +} + +#define TUNE_ATTR_3(name, show, store) \ +static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store) + +#define TUNE_ATTR_2(name, store) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \ +} \ +TUNE_ATTR_3(name, name##_show, store) + +#define TUNE_ATTR(name, check_zero) \ +static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ +{ \ + return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \ +} \ +TUNE_ATTR_2(name, name##_store) + +TUNE_ATTR(quota_warn_period, 0); +TUNE_ATTR(quota_quantum, 0); +TUNE_ATTR(max_readahead, 0); +TUNE_ATTR(complain_secs, 0); +TUNE_ATTR(statfs_slow, 0); +TUNE_ATTR(new_files_jdata, 0); +TUNE_ATTR(statfs_quantum, 1); +TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); + +static struct attribute *tune_attrs[] = { + &tune_attr_quota_warn_period.attr, + &tune_attr_quota_quantum.attr, + &tune_attr_max_readahead.attr, + &tune_attr_complain_secs.attr, + &tune_attr_statfs_slow.attr, + &tune_attr_statfs_quantum.attr, + &tune_attr_quota_scale.attr, + &tune_attr_new_files_jdata.attr, + NULL, +}; + +static struct attribute_group tune_group = { + .name = "tune", + .attrs = tune_attrs, +}; + +static struct attribute_group lock_module_group = { + .name = "lock_module", + .attrs = lock_module_attrs, +}; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + int error; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + int sysfs_frees_sdp = 0; + + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + + sdp->sd_kobj.kset = gfs2_kset; + error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, + "%s", sdp->sd_table_name); + if (error) + goto fail_reg; + + sysfs_frees_sdp = 1; /* Freeing sdp is now done by sysfs calling + function gfs2_sbd_release. */ + error = sysfs_create_group(&sdp->sd_kobj, &tune_group); + if (error) + goto fail_reg; + + error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); + if (error) + goto fail_tune; + + error = sysfs_create_link(&sdp->sd_kobj, + &disk_to_dev(sb->s_bdev->bd_disk)->kobj, + "device"); + if (error) + goto fail_lock_module; + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); + return 0; + +fail_lock_module: + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); +fail_tune: + sysfs_remove_group(&sdp->sd_kobj, &tune_group); +fail_reg: + free_percpu(sdp->sd_lkstats); + fs_err(sdp, "error %d adding sysfs files", error); + if (sysfs_frees_sdp) + kobject_put(&sdp->sd_kobj); + else + kfree(sdp); + sb->s_fs_info = NULL; + return error; +} + +void gfs2_sys_fs_del(struct gfs2_sbd *sdp) +{ + sysfs_remove_link(&sdp->sd_kobj, "device"); + sysfs_remove_group(&sdp->sd_kobj, &tune_group); + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); + kobject_put(&sdp->sd_kobj); +} + +static int gfs2_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; + + add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); + add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); + if (gfs2_uuid_valid(uuid)) + add_uevent_var(env, "UUID=%pUB", uuid); + return 0; +} + +static const struct kset_uevent_ops gfs2_uevent_ops = { + .uevent = gfs2_uevent, +}; + +int gfs2_sys_init(void) +{ + gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); + if (!gfs2_kset) + return -ENOMEM; + return 0; +} + +void gfs2_sys_uninit(void) +{ + kset_unregister(gfs2_kset); +} + diff --git a/kernel/fs/gfs2/sys.h b/kernel/fs/gfs2/sys.h new file mode 100644 index 000000000..79182d6ad --- /dev/null +++ b/kernel/fs/gfs2/sys.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __SYS_DOT_H__ +#define __SYS_DOT_H__ + +#include +struct gfs2_sbd; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp); +void gfs2_sys_fs_del(struct gfs2_sbd *sdp); + +int gfs2_sys_init(void); +void gfs2_sys_uninit(void); + +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid); + +#endif /* __SYS_DOT_H__ */ + diff --git a/kernel/fs/gfs2/trace_gfs2.h b/kernel/fs/gfs2/trace_gfs2.h new file mode 100644 index 000000000..20c007d74 --- /dev/null +++ b/kernel/fs/gfs2/trace_gfs2.h @@ -0,0 +1,558 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 + +#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_GFS2_H + +#include + +#include +#include +#include +#include +#include +#include +#include "incore.h" +#include "glock.h" +#include "rgrp.h" + +#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn } +#define glock_trace_name(x) __print_symbolic(x, \ + dlm_state_name(IV), \ + dlm_state_name(NL), \ + dlm_state_name(CR), \ + dlm_state_name(CW), \ + dlm_state_name(PR), \ + dlm_state_name(PW), \ + dlm_state_name(EX)) + +#define block_state_name(x) __print_symbolic(x, \ + { GFS2_BLKST_FREE, "free" }, \ + { GFS2_BLKST_USED, "used" }, \ + { GFS2_BLKST_DINODE, "dinode" }, \ + { GFS2_BLKST_UNLINKED, "unlinked" }) + +#define TRACE_RS_DELETE 0 +#define TRACE_RS_TREEDEL 1 +#define TRACE_RS_INSERT 2 +#define TRACE_RS_CLAIM 3 + +#define rs_func_name(x) __print_symbolic(x, \ + { 0, "del " }, \ + { 1, "tdel" }, \ + { 2, "ins " }, \ + { 3, "clm " }) + +#define show_glock_flags(flags) __print_flags(flags, "", \ + {(1UL << GLF_LOCK), "l" }, \ + {(1UL << GLF_DEMOTE), "D" }, \ + {(1UL << GLF_PENDING_DEMOTE), "d" }, \ + {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ + {(1UL << GLF_DIRTY), "y" }, \ + {(1UL << GLF_LFLUSH), "f" }, \ + {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_REPLY_PENDING), "r" }, \ + {(1UL << GLF_INITIAL), "I" }, \ + {(1UL << GLF_FROZEN), "F" }, \ + {(1UL << GLF_QUEUED), "q" }, \ + {(1UL << GLF_LRU), "L" }, \ + {(1UL << GLF_OBJECT), "o" }, \ + {(1UL << GLF_BLOCKING), "b" }) + +#ifndef NUMPTY +#define NUMPTY +static inline u8 glock_trace_state(unsigned int state) +{ + switch(state) { + case LM_ST_SHARED: + return DLM_LOCK_PR; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + } + return DLM_LOCK_NL; +} +#endif + +/* Section 1 - Locking + * + * Objectives: + * Latency: Remote demote request to state change + * Latency: Local lock request to state change + * Latency: State change to lock grant + * Correctness: Ordering of local lock state vs. I/O requests + * Correctness: Responses to remote demote requests + */ + +/* General glock state change (DLM lock request completes) */ +TRACE_EVENT(gfs2_glock_state_change, + + TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state), + + TP_ARGS(gl, new_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, new_state ) + __field( u8, dmt_state ) + __field( u8, tgt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->new_state = glock_trace_state(new_state); + __entry->tgt_state = glock_trace_state(gl->gl_target); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->new_state), + glock_trace_name(__entry->tgt_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) +); + +/* State change -> unlocked, glock is being deallocated */ +TRACE_EVENT(gfs2_glock_put, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL< %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->gltype, (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(DLM_LOCK_IV), + show_glock_flags(__entry->flags)) + +); + +/* Callback (local or remote) requesting lock demotion */ +TRACE_EVENT(gfs2_demote_rq, + + TP_PROTO(const struct gfs2_glock *gl, bool remote), + + TP_ARGS(gl, remote), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, dmt_state ) + __field( unsigned long, flags ) + __field( bool, remote ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<remote = remote; + ), + + TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags), + __entry->remote ? "remote" : "local") + +); + +/* Promotion/grant of a glock */ +TRACE_EVENT(gfs2_promote, + + TP_PROTO(const struct gfs2_holder *gh, int first), + + TP_ARGS(gh, first), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, first ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->first = first; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu promote %s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->first ? "first": "other", + glock_trace_name(__entry->state)) +); + +/* Queue/dequeue a lock request */ +TRACE_EVENT(gfs2_glock_queue, + + TP_PROTO(const struct gfs2_holder *gh, int queue), + + TP_ARGS(gh, queue), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, queue ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->queue = queue; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu %squeue %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->queue ? "" : "de", + glock_trace_name(__entry->state)) +); + +/* DLM sends a reply to GFS2 */ +TRACE_EVENT(gfs2_glock_lock_time, + + TP_PROTO(const struct gfs2_glock *gl, s64 tdiff), + + TP_ARGS(gl, tdiff), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, status ) + __field( char, flags ) + __field( s64, tdiff ) + __field( s64, srtt ) + __field( s64, srttvar ) + __field( s64, srttb ) + __field( s64, srttvarb ) + __field( s64, sirt ) + __field( s64, sirtvar ) + __field( s64, dcount ) + __field( s64, qcount ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->status = gl->gl_lksb.sb_status; + __entry->flags = gl->gl_lksb.sb_flags; + __entry->tdiff = tdiff; + __entry->srtt = gl->gl_stats.stats[GFS2_LKS_SRTT]; + __entry->srttvar = gl->gl_stats.stats[GFS2_LKS_SRTTVAR]; + __entry->srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + __entry->srttvarb = gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + __entry->sirt = gl->gl_stats.stats[GFS2_LKS_SIRT]; + __entry->sirtvar = gl->gl_stats.stats[GFS2_LKS_SIRTVAR]; + __entry->dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + __entry->qcount = gl->gl_stats.stats[GFS2_LKS_QCOUNT]; + ), + + TP_printk("%u,%u glock %d:%lld status:%d flags:%02x tdiff:%lld srtt:%lld/%lld srttb:%lld/%lld sirt:%lld/%lld dcnt:%lld qcnt:%lld", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->status, __entry->flags, + (long long)__entry->tdiff, + (long long)__entry->srtt, + (long long)__entry->srttvar, + (long long)__entry->srttb, + (long long)__entry->srttvarb, + (long long)__entry->sirt, + (long long)__entry->sirtvar, + (long long)__entry->dcount, + (long long)__entry->qcount) +); + +/* Section 2 - Log/journal + * + * Objectives: + * Latency: Log flush time + * Correctness: pin/unpin vs. disk I/O ordering + * Performance: Log usage stats + */ + +/* Pin/unpin a block in the log */ +TRACE_EVENT(gfs2_pin, + + TP_PROTO(const struct gfs2_bufdata *bd, int pin), + + TP_ARGS(bd, pin), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, pin ) + __field( u32, len ) + __field( sector_t, block ) + __field( u64, ino ) + ), + + TP_fast_assign( + __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev; + __entry->pin = pin; + __entry->len = bd->bd_bh->b_size; + __entry->block = bd->bd_bh->b_blocknr; + __entry->ino = bd->bd_gl->gl_name.ln_number; + ), + + TP_printk("%u,%u log %s %llu/%lu inode %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->pin ? "pin" : "unpin", + (unsigned long long)__entry->block, + (unsigned long)__entry->len, + (unsigned long long)__entry->ino) +); + +/* Flushing the log */ +TRACE_EVENT(gfs2_log_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, int start), + + TP_ARGS(sdp, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( u64, log_seq ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->log_seq = sdp->sd_log_sequence; + ), + + TP_printk("%u,%u log flush %s %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->start ? "start" : "end", + (unsigned long long)__entry->log_seq) +); + +/* Reserving/releasing blocks in the log */ +TRACE_EVENT(gfs2_log_blocks, + + TP_PROTO(const struct gfs2_sbd *sdp, int blocks), + + TP_ARGS(sdp, blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, blocks ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->blocks = blocks; + ), + + TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks) +); + +/* Writing back the AIL */ +TRACE_EVENT(gfs2_ail_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start), + + TP_ARGS(sdp, wbc, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( int, sync_mode ) + __field( long, nr_to_write ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->sync_mode = wbc->sync_mode; + __entry->nr_to_write = wbc->nr_to_write; + ), + + TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->start ? "start" : "end", + __entry->sync_mode == WB_SYNC_ALL ? "all" : "none", + __entry->nr_to_write) +); + +/* Section 3 - bmap + * + * Objectives: + * Latency: Bmap request time + * Performance: Block allocator tracing + * Correctness: Test of disard generation vs. blocks allocated + */ + +/* Map an extent of blocks, possibly a new allocation */ +TRACE_EVENT(gfs2_bmap, + + TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh, + sector_t lblock, int create, int errno), + + TP_ARGS(ip, bh, lblock, create, errno), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, lblock ) + __field( sector_t, pblock ) + __field( u64, inum ) + __field( unsigned long, state ) + __field( u32, len ) + __field( int, create ) + __field( int, errno ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->lblock = lblock; + __entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0; + __entry->inum = ip->i_no_addr; + __entry->state = bh->b_state; + __entry->len = bh->b_size; + __entry->create = create; + __entry->errno = errno; + ), + + TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->lblock, + (unsigned long)__entry->len, + (unsigned long long)__entry->pblock, + __entry->state, __entry->create ? "create " : "nocreate", + __entry->errno) +); + +/* Keep track of blocks as they are allocated/freed */ +TRACE_EVENT(gfs2_block_alloc, + + TP_PROTO(const struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 block, unsigned len, u8 block_state), + + TP_ARGS(ip, rgd, block, len, block_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, start ) + __field( u64, inum ) + __field( u32, len ) + __field( u8, block_state ) + __field( u64, rd_addr ) + __field( u32, rd_free_clone ) + __field( u32, rd_reserved ) + ), + + TP_fast_assign( + __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev; + __entry->start = block; + __entry->inum = ip->i_no_addr; + __entry->len = len; + __entry->block_state = block_state; + __entry->rd_addr = rgd->rd_addr; + __entry->rd_free_clone = rgd->rd_free_clone; + __entry->rd_reserved = rgd->rd_reserved; + ), + + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long)__entry->len, + block_state_name(__entry->block_state), + (unsigned long long)__entry->rd_addr, + __entry->rd_free_clone, (unsigned long)__entry->rd_reserved) +); + +/* Keep track of multi-block reservations as they are allocated/freed */ +TRACE_EVENT(gfs2_rs, + + TP_PROTO(const struct gfs2_blkreserv *rs, u8 func), + + TP_ARGS(rs, func), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, rd_addr ) + __field( u32, rd_free_clone ) + __field( u32, rd_reserved ) + __field( u64, inum ) + __field( u64, start ) + __field( u32, free ) + __field( u8, func ) + ), + + TP_fast_assign( + __entry->dev = rs->rs_rbm.rgd->rd_sbd->sd_vfs->s_dev; + __entry->rd_addr = rs->rs_rbm.rgd->rd_addr; + __entry->rd_free_clone = rs->rs_rbm.rgd->rd_free_clone; + __entry->rd_reserved = rs->rs_rbm.rgd->rd_reserved; + __entry->inum = rs->rs_inum; + __entry->start = gfs2_rbm_to_block(&rs->rs_rbm); + __entry->free = rs->rs_free; + __entry->func = func; + ), + + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long long)__entry->rd_addr, + (unsigned long)__entry->rd_free_clone, + (unsigned long)__entry->rd_reserved, + rs_func_name(__entry->func), (unsigned long)__entry->free) +); + +#endif /* _TRACE_GFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_gfs2 +#include + diff --git a/kernel/fs/gfs2/trans.c b/kernel/fs/gfs2/trans.c new file mode 100644 index 000000000..88bff2430 --- /dev/null +++ b/kernel/fs/gfs2/trans.c @@ -0,0 +1,279 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes) +{ + struct gfs2_trans *tr; + int error; + + BUG_ON(current->journal_info); + BUG_ON(blocks == 0 && revokes == 0); + + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); + if (!tr) + return -ENOMEM; + + tr->tr_ip = _RET_IP_; + tr->tr_blocks = blocks; + tr->tr_revokes = revokes; + tr->tr_reserved = 1; + tr->tr_alloced = 1; + if (blocks) + tr->tr_reserved += 6 + blocks; + if (revokes) + tr->tr_reserved += gfs2_struct2blk(sdp, revokes, + sizeof(u64)); + INIT_LIST_HEAD(&tr->tr_databuf); + INIT_LIST_HEAD(&tr->tr_buf); + + sb_start_intwrite(sdp->sd_vfs); + + error = gfs2_log_reserve(sdp, tr->tr_reserved); + if (error) + goto fail; + + current->journal_info = tr; + + return 0; + +fail: + sb_end_intwrite(sdp->sd_vfs); + kfree(tr); + + return error; +} + +static void gfs2_print_trans(const struct gfs2_trans *tr) +{ + pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip); + pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n", + tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); + pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n", + tr->tr_num_buf_new, tr->tr_num_buf_rm, + tr->tr_num_databuf_new, tr->tr_num_databuf_rm, + tr->tr_num_revoke, tr->tr_num_revoke_rm); +} + +void gfs2_trans_end(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr = current->journal_info; + s64 nbuf; + int alloced = tr->tr_alloced; + + BUG_ON(!tr); + current->journal_info = NULL; + + if (!tr->tr_touched) { + gfs2_log_release(sdp, tr->tr_reserved); + if (alloced) { + kfree(tr); + sb_end_intwrite(sdp->sd_vfs); + } + return; + } + + nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new; + nbuf -= tr->tr_num_buf_rm; + nbuf -= tr->tr_num_databuf_rm; + + if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) && + (tr->tr_num_revoke <= tr->tr_revokes))) + gfs2_print_trans(tr); + + gfs2_log_commit(sdp, tr); + if (alloced && !tr->tr_attached) + kfree(tr); + up_read(&sdp->sd_log_flush_lock); + + if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + if (alloced) + sb_end_intwrite(sdp->sd_vfs); +} + +static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, + struct buffer_head *bh, + const struct gfs2_log_operations *lops) +{ + struct gfs2_bufdata *bd; + + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); + bd->bd_bh = bh; + bd->bd_gl = gl; + bd->bd_ops = lops; + INIT_LIST_HEAD(&bd->bd_list); + bh->b_private = bd; + return bd; +} + +/** + * gfs2_trans_add_data - Add a databuf to the transaction. + * @gl: The inode glock associated with the buffer + * @bh: The buffer to add + * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. + */ +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) +{ + struct gfs2_trans *tr = current->journal_info; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_bufdata *bd; + + if (!gfs2_is_jdata(ip)) { + gfs2_ordered_add_inode(ip); + return; + } + + lock_buffer(bh); + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); + lock_buffer(bh); + gfs2_log_lock(sdp); + } + gfs2_assert(sdp, bd->bd_gl == gl); + tr->tr_touched = 1; + if (list_empty(&bd->bd_list)) { + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; + list_add_tail(&bd->bd_list, &tr->tr_databuf); + } + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + struct gfs2_meta_header *mh; + struct gfs2_trans *tr; + enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); + + tr = current->journal_info; + tr->tr_touched = 1; + if (!list_empty(&bd->bd_list)) + return; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { + pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n", + (unsigned long long)bd->bd_bh->b_blocknr); + BUG(); + } + if (unlikely(state == SFS_FROZEN)) { + printk(KERN_INFO "GFS2:adding buf while frozen\n"); + gfs2_assert_withdraw(sdp, 0); + } + gfs2_pin(sdp, bd->bd_bh); + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + list_add(&bd->bd_list, &tr->tr_buf); + tr->tr_num_buf_new++; +} + +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) +{ + + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_bufdata *bd; + + lock_buffer(bh); + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + lock_page(bh->b_page); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); + unlock_page(bh->b_page); + lock_buffer(bh); + gfs2_log_lock(sdp); + } + gfs2_assert(sdp, bd->bd_gl == gl); + meta_lo_add(sdp, bd); + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + struct gfs2_trans *tr = current->journal_info; + + BUG_ON(!list_empty(&bd->bd_list)); + gfs2_add_revoke(sdp, bd); + tr->tr_touched = 1; + tr->tr_num_revoke++; +} + +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) +{ + struct gfs2_bufdata *bd, *tmp; + struct gfs2_trans *tr = current->journal_info; + unsigned int n = len; + + gfs2_log_lock(sdp); + list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_list) { + if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { + list_del_init(&bd->bd_list); + gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); + sdp->sd_log_num_revoke--; + kmem_cache_free(gfs2_bufdata_cachep, bd); + tr->tr_num_revoke_rm++; + if (--n == 0) + break; + } + } + gfs2_log_unlock(sdp); +} + diff --git a/kernel/fs/gfs2/trans.h b/kernel/fs/gfs2/trans.h new file mode 100644 index 000000000..1e6e7da25 --- /dev/null +++ b/kernel/fs/gfs2/trans.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __TRANS_DOT_H__ +#define __TRANS_DOT_H__ + +#include +struct gfs2_sbd; +struct gfs2_rgrpd; +struct gfs2_glock; + +#define RES_DINODE 1 +#define RES_INDIRECT 1 +#define RES_JDATA 1 +#define RES_DATA 1 +#define RES_LEAF 1 +#define RES_RG_HDR 1 +#define RES_RG_BIT 2 +#define RES_EATTR 1 +#define RES_STATFS 1 +#define RES_QUOTA 2 + +/* reserve either the number of blocks to be allocated plus the rg header + * block, or all of the blocks in the rg, whichever is smaller */ +static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested) +{ + if (requested < ip->i_rgd->rd_length) + return requested + 1; + return ip->i_rgd->rd_length; +} + +extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); + +extern void gfs2_trans_end(struct gfs2_sbd *sdp); +extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); + +#endif /* __TRANS_DOT_H__ */ diff --git a/kernel/fs/gfs2/util.c b/kernel/fs/gfs2/util.c new file mode 100644 index 000000000..86d2035ac --- /dev/null +++ b/kernel/fs/gfs2/util.c @@ -0,0 +1,265 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "util.h" + +struct kmem_cache *gfs2_glock_cachep __read_mostly; +struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly; +struct kmem_cache *gfs2_inode_cachep __read_mostly; +struct kmem_cache *gfs2_bufdata_cachep __read_mostly; +struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; +struct kmem_cache *gfs2_quotad_cachep __read_mostly; +struct kmem_cache *gfs2_rsrv_cachep __read_mostly; +mempool_t *gfs2_page_pool __read_mostly; + +void gfs2_assert_i(struct gfs2_sbd *sdp) +{ + fs_emerg(sdp, "fatal assertion failed\n"); +} + +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + const struct lm_lockops *lm = ls->ls_ops; + va_list args; + struct va_format vaf; + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && + test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return 0; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + fs_err(sdp, "%pV", &vaf); + + va_end(args); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + fs_err(sdp, "about to withdraw this file system\n"); + BUG_ON(sdp->sd_args.ar_debug); + + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + + if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + wait_for_completion(&sdp->sd_wdack); + + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } + fs_err(sdp, "withdrawn\n"); + dump_stack(); + } + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname); + + return -1; +} + +/** + * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "fatal: assertion \"%s\" failed\n" + " function = %s, file = %s, line = %u\n", + assertion, function, file, line); + dump_stack(); + return (me) ? -1 : -2; +} + +/** + * gfs2_assert_warn_i - Print a message to the console if @assertion is false + * Returns: -1 if we printed something + * -2 if we didn't + */ + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + if (time_before(jiffies, + sdp->sd_last_warning + + gfs2_tune_get(sdp, gt_complain_secs) * HZ)) + return -2; + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) + fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n", + assertion, function, file, line); + + if (sdp->sd_args.ar_debug) + BUG(); + else + dump_stack(); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + + sdp->sd_last_warning = jiffies; + + return -1; +} + +/** + * gfs2_consist_i - Flag a filesystem consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, + char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n", + function, file, line); + return rv; +} + +/** + * gfs2_consist_inode_i - Flag an inode consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int rv; + rv = gfs2_lm_withdraw(sdp, + "fatal: filesystem consistency error\n" + " inode = %llu %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + function, file, line); + return rv; +} + +/** + * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int rv; + rv = gfs2_lm_withdraw(sdp, + "fatal: filesystem consistency error\n" + " RG = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)rgd->rd_addr, + function, file, line); + return rv; +} + +/** + * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, char *file, + unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "fatal: invalid metadata block\n" + " bh = %llu (%s)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, + function, file, line); + return (me) ? -1 : -2; +} + +/** + * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, const char *function, + char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "fatal: invalid metadata block\n" + " bh = %llu (type: exp=%u, found=%u)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, t, + function, file, line); + return (me) ? -1 : -2; +} + +/** + * gfs2_io_error_i - Flag an I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, + unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "fatal: I/O error\n" + " function = %s, file = %s, line = %u\n", + function, file, line); + return rv; +} + +/** + * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "fatal: I/O error\n" + " block = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, + function, file, line); + return rv; +} + diff --git a/kernel/fs/gfs2/util.h b/kernel/fs/gfs2/util.h new file mode 100644 index 000000000..cbdcbdf39 --- /dev/null +++ b/kernel/fs/gfs2/util.h @@ -0,0 +1,171 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#endif + +#include + +#include "incore.h" + +#define fs_emerg(fs, fmt, ...) \ + pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_warn(fs, fmt, ...) \ + pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_err(fs, fmt, ...) \ + pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_info(fs, fmt, ...) \ + pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) + +void gfs2_assert_i(struct gfs2_sbd *sdp); + +#define gfs2_assert(sdp, assertion) \ +do { \ + if (unlikely(!(assertion))) { \ + gfs2_assert_i(sdp); \ + BUG(); \ + } \ +} while (0) + + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_withdraw(sdp, assertion) \ +((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__)) + + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_warn(sdp, assertion) \ +((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__)) + + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist(sdp) \ +gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__) + + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_inode(ip) \ +gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__) + + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_rgrpd(rgd) \ +gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__) + + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, + char *file, unsigned int line); + +static inline int gfs2_meta_check(struct gfs2_sbd *sdp, + struct buffer_head *bh) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + u32 magic = be32_to_cpu(mh->mh_magic); + if (unlikely(magic != GFS2_MAGIC)) { + pr_err("Magic number missing at %llu\n", + (unsigned long long)bh->b_blocknr); + return -EIO; + } + return 0; +} + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, + const char *function, + char *file, unsigned int line); + +static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + u16 type, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + u32 magic = be32_to_cpu(mh->mh_magic); + u16 t = be32_to_cpu(mh->mh_type); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + if (unlikely(t != type)) + return gfs2_metatype_check_ii(sdp, bh, type, t, function, + file, line); + return 0; +} + +#define gfs2_metatype_check(sdp, bh, type) \ +gfs2_metatype_check_i((sdp), (bh), (type), __func__, __FILE__, __LINE__) + +static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, + u16 format) +{ + struct gfs2_meta_header *mh; + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_type = cpu_to_be32(type); + mh->mh_format = cpu_to_be32(format); +} + + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line); + +#define gfs2_io_error(sdp) \ +gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__); + + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line); + +#define gfs2_io_error_bh(sdp, bh) \ +gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__); + + +extern struct kmem_cache *gfs2_glock_cachep; +extern struct kmem_cache *gfs2_glock_aspace_cachep; +extern struct kmem_cache *gfs2_inode_cachep; +extern struct kmem_cache *gfs2_bufdata_cachep; +extern struct kmem_cache *gfs2_rgrpd_cachep; +extern struct kmem_cache *gfs2_quotad_cachep; +extern struct kmem_cache *gfs2_rsrv_cachep; +extern mempool_t *gfs2_page_pool; + +static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, + unsigned int *p) +{ + unsigned int x; + spin_lock(>->gt_spin); + x = *p; + spin_unlock(>->gt_spin); + return x; +} + +#define gfs2_tune_get(sdp, field) \ +gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) + +__printf(2, 3) +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...); + +#endif /* __UTIL_DOT_H__ */ diff --git a/kernel/fs/gfs2/xattr.c b/kernel/fs/gfs2/xattr.c new file mode 100644 index 000000000..4c096fa9e --- /dev/null +++ b/kernel/fs/gfs2/xattr.c @@ -0,0 +1,1508 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * ea_calc_size - returns the acutal number of bytes the request will take up + * (not counting any unstuffed data blocks) + * @sdp: + * @er: + * @size: + * + * Returns: 1 if the EA should be stuffed + */ + +static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, + unsigned int *size) +{ + unsigned int jbsize = sdp->sd_jbsize; + + /* Stuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); + + if (*size <= jbsize) + return 1; + + /* Unstuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + + (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); + + return 0; +} + +static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) +{ + unsigned int size; + + if (dsize > GFS2_EA_MAX_DATA_LEN) + return -ERANGE; + + ea_calc_size(sdp, nsize, dsize, &size); + + /* This can only happen with 512 byte blocks */ + if (size > sdp->sd_jbsize) + return -ERANGE; + + return 0; +} + +typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private); + +static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, + ea_call_t ea_call, void *data) +{ + struct gfs2_ea_header *ea, *prev = NULL; + int error = 0; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA)) + return -EIO; + + for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) { + if (!GFS2_EA_REC_LEN(ea)) + goto fail; + if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <= + bh->b_data + bh->b_size)) + goto fail; + if (!GFS2_EATYPE_VALID(ea->ea_type)) + goto fail; + + error = ea_call(ip, bh, ea, prev, data); + if (error) + return error; + + if (GFS2_EA_IS_LAST(ea)) { + if ((char *)GFS2_EA2NEXT(ea) != + bh->b_data + bh->b_size) + goto fail; + break; + } + } + + return error; + +fail: + gfs2_consist_inode(ip); + return -EIO; +} + +static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) +{ + struct buffer_head *bh, *eabh; + __be64 *eablk, *end; + int error; + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); + if (error) + return error; + + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) { + error = ea_foreach_i(ip, bh, ea_call, data); + goto out; + } + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh); + if (error) + break; + error = ea_foreach_i(ip, eabh, ea_call, data); + brelse(eabh); + if (error) + break; + } +out: + brelse(bh); + return error; +} + +struct ea_find { + int type; + const char *name; + size_t namel; + struct gfs2_ea_location *ef_el; +}; + +static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_find *ef = private; + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (ea->ea_type == ef->type) { + if (ea->ea_name_len == ef->namel && + !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { + struct gfs2_ea_location *el = ef->ef_el; + get_bh(bh); + el->el_bh = bh; + el->el_ea = ea; + el->el_prev = prev; + return 1; + } + } + + return 0; +} + +static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, + struct gfs2_ea_location *el) +{ + struct ea_find ef; + int error; + + ef.type = type; + ef.name = name; + ef.namel = strlen(name); + ef.ef_el = el; + + memset(el, 0, sizeof(struct gfs2_ea_location)); + + error = ea_foreach(ip, ea_find_i, &ef); + if (error > 0) + return 0; + + return error; +} + +/** + * ea_dealloc_unstuffed - + * @ip: + * @bh: + * @ea: + * @prev: + * @private: + * + * Take advantage of the fact that all unstuffed blocks are + * allocated from the same RG. But watch, this may not always + * be true. + * + * Returns: errno + */ + +static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private) +{ + int *leave = private; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder rg_gh; + struct buffer_head *dibh; + __be64 *dataptrs; + u64 bn = 0; + u64 bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + if (GFS2_EA_IS_STUFFED(ea)) + return 0; + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (*dataptrs) { + blks++; + bn = be64_to_cpu(*dataptrs); + } + } + if (!blks) + return 0; + + rgd = gfs2_blk2rgrpd(sdp, bn, 1); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE + + RES_EATTR + RES_STATFS + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_meta(ip->i_gl, bh); + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (!*dataptrs) + break; + bn = be64_to_cpu(*dataptrs); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *dataptrs = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + if (prev && !leave) { + u32 len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else { + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_num_ptrs = 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_uninit(&rg_gh); + return error; +} + +static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, int leave) +{ + int error; + + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); + if (error) + return error; + + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); + + gfs2_quota_unhold(ip); +out_alloc: + return error; +} + +struct ea_list { + struct gfs2_ea_request *ei_er; + unsigned int ei_size; +}; + +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return 5 + ea->ea_name_len + 1; + case GFS2_EATYPE_SYS: + return 7 + ea->ea_name_len + 1; + case GFS2_EATYPE_SECURITY: + return 9 + ea->ea_name_len + 1; + default: + return 0; + } +} + +static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_list *ei = private; + struct gfs2_ea_request *er = ei->ei_er; + unsigned int ea_size = gfs2_ea_strlen(ea); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (er->er_data_len) { + char *prefix = NULL; + unsigned int l = 0; + char c = 0; + + if (ei->ei_size + ea_size > er->er_data_len) + return -ERANGE; + + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + prefix = "user."; + l = 5; + break; + case GFS2_EATYPE_SYS: + prefix = "system."; + l = 7; + break; + case GFS2_EATYPE_SECURITY: + prefix = "security."; + l = 9; + break; + } + + BUG_ON(l == 0); + + memcpy(er->er_data + ei->ei_size, prefix, l); + memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea), + ea->ea_name_len); + memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1); + } + + ei->ei_size += ea_size; + + return 0; +} + +/** + * gfs2_listxattr - List gfs2 extended attributes + * @dentry: The dentry whose inode we are interested in + * @buffer: The buffer to write the results + * @size: The size of the buffer + * + * Returns: actual size of data on success, -errno on error + */ + +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); + struct gfs2_ea_request er; + struct gfs2_holder i_gh; + int error; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (size) { + er.er_data = buffer; + er.er_data_len = size; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + + if (ip->i_eattr) { + struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; + + error = ea_foreach(ip, ea_list_i, &ei); + if (!error) + error = ei.ei_size; + } + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * ea_iter_unstuffed - copies the unstuffed xattr data to/from the + * request buffer + * @ip: The GFS2 inode + * @ea: The extended attribute header structure + * @din: The data to be copied in + * @dout: The data to be copied out (one of din,dout will be NULL) + * + * Returns: errno + */ + +static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + const char *din, char *dout) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error = 0; + unsigned char *pos; + unsigned cp_size; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); + if (!bh) + return -ENOMEM; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto out; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_wait(sdp, bh[x]); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto out; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto out; + } + + pos = bh[x]->b_data + sizeof(struct gfs2_meta_header); + cp_size = (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize; + + if (dout) { + memcpy(dout, pos, cp_size); + dout += sdp->sd_jbsize; + } + + if (din) { + gfs2_trans_add_meta(ip->i_gl, bh[x]); + memcpy(pos, din, cp_size); + din += sdp->sd_jbsize; + } + + amount -= sdp->sd_jbsize; + brelse(bh[x]); + } + +out: + kfree(bh); + return error; +} + +static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data, size_t size) +{ + int ret; + size_t len = GFS2_EA_DATA_LEN(el->el_ea); + if (len > size) + return -ERANGE; + + if (GFS2_EA_IS_STUFFED(el->el_ea)) { + memcpy(data, GFS2_EA2DATA(el->el_ea), len); + return len; + } + ret = gfs2_iter_unstuffed(ip, el->el_ea, NULL, data); + if (ret < 0) + return ret; + return len; +} + +int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata) +{ + struct gfs2_ea_location el; + int error; + int len; + char *data; + + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el); + if (error) + return error; + if (!el.el_ea) + goto out; + if (!GFS2_EA_DATA_LEN(el.el_ea)) + goto out; + + len = GFS2_EA_DATA_LEN(el.el_ea); + data = kmalloc(len, GFP_NOFS); + error = -ENOMEM; + if (data == NULL) + goto out; + + error = gfs2_ea_get_copy(ip, &el, data, len); + if (error < 0) + kfree(data); + else + *ppdata = data; +out: + brelse(el.el_bh); + return error; +} + +/** + * gfs2_xattr_get - Get a GFS2 extended attribute + * @inode: The inode + * @name: The name of the extended attribute + * @buffer: The buffer to write the result into + * @size: The size of the buffer + * @type: The type of extended attribute + * + * Returns: actual size of data on success, -errno on error + */ +static int gfs2_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); + struct gfs2_ea_location el; + int error; + + if (!ip->i_eattr) + return -ENODATA; + if (strlen(name) > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + if (size) + error = gfs2_ea_get_copy(ip, &el, buffer, size); + else + error = GFS2_EA_DATA_LEN(el.el_ea); + brelse(el.el_bh); + + return error; +} + +/** + * ea_alloc_blk - allocates a new block for extended attributes. + * @ip: A pointer to the inode that's getting extended attributes + * @bhp: Pointer to pointer to a struct buffer_head + * + * Returns: errno + */ + +static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_ea_header *ea; + unsigned int n = 1; + u64 block; + int error; + + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, block, 1); + *bhp = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_meta(ip->i_gl, *bhp); + gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(*bhp); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + ea->ea_num_ptrs = 0; + + gfs2_add_inode_blocks(&ip->i_inode, 1); + + return 0; +} + +/** + * ea_write - writes the request info to an ea, creating new blocks if + * necessary + * @ip: inode that is being modified + * @ea: the location of the new ea in a block + * @er: the write request + * + * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags + * + * returns : errno + */ + +static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + struct gfs2_ea_request *er) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; + + ea->ea_data_len = cpu_to_be32(er->er_data_len); + ea->ea_name_len = er->er_name_len; + ea->ea_type = er->er_type; + ea->__pad = 0; + + memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len); + + if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) { + ea->ea_num_ptrs = 0; + memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len); + } else { + __be64 *dataptr = GFS2_EA2DATAPTRS(ea); + const char *data = er->er_data; + unsigned int data_len = er->er_data_len; + unsigned int copy; + unsigned int x; + + ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize); + for (x = 0; x < ea->ea_num_ptrs; x++) { + struct buffer_head *bh; + u64 block; + int mh_size = sizeof(struct gfs2_meta_header); + unsigned int n = 1; + + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, block, 1); + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); + + gfs2_add_inode_blocks(&ip->i_inode, 1); + + copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize : + data_len; + memcpy(bh->b_data + mh_size, data, copy); + if (copy < sdp->sd_jbsize) + memset(bh->b_data + mh_size + copy, 0, + sdp->sd_jbsize - copy); + + *dataptr++ = cpu_to_be64(bh->b_blocknr); + data += copy; + data_len -= copy; + + brelse(bh); + } + + gfs2_assert_withdraw(sdp, !data_len); + } + + return 0; +} + +typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private); + +static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, + unsigned int blks, + ea_skeleton_call_t skeleton_call, void *private) +{ + struct gfs2_alloc_parms ap = { .target = blks }; + struct buffer_head *dibh; + int error; + + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); + if (error) + return error; + + error = gfs2_quota_lock_check(ip, &ap); + if (error) + return error; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), + blks + gfs2_rg_blocks(ip, blks) + + RES_DINODE + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + error = skeleton_call(ip, er, private); + if (error) + goto out_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + +out_end_trans: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); +out_ipres: + gfs2_inplace_release(ip); +out_gunlock_q: + gfs2_quota_unlock(ip); + return error; +} + +static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct buffer_head *bh; + int error; + + error = ea_alloc_blk(ip, &bh); + if (error) + return error; + + ip->i_eattr = bh->b_blocknr; + error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); + + brelse(bh); + + return error; +} + +/** + * ea_init - initializes a new eattr block + * @ip: + * @er: + * + * Returns: errno + */ + +static int ea_init(struct gfs2_inode *ip, int type, const char *name, + const void *data, size_t size) +{ + struct gfs2_ea_request er; + unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; + unsigned int blks = 1; + + er.er_type = type; + er.er_name = name; + er.er_name_len = strlen(name); + er.er_data = (void *)data; + er.er_data_len = size; + + if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) + blks += DIV_ROUND_UP(er.er_data_len, jbsize); + + return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); +} + +static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) +{ + u32 ea_size = GFS2_EA_SIZE(ea); + struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea + + ea_size); + u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size; + int last = ea->ea_flags & GFS2_EAFLAG_LAST; + + ea->ea_rec_len = cpu_to_be32(ea_size); + ea->ea_flags ^= last; + + new->ea_rec_len = cpu_to_be32(new_size); + new->ea_flags = last; + + return new; +} + +static void ea_set_remove_stuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + u32 len; + + gfs2_trans_add_meta(ip->i_gl, el->el_bh); + + if (!prev || !GFS2_EA_IS_STUFFED(ea)) { + ea->ea_type = GFS2_EATYPE_UNUSED; + return; + } else if (GFS2_EA2NEXT(prev) != ea) { + prev = GFS2_EA2NEXT(prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea); + } + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; +} + +struct ea_set { + int ea_split; + + struct gfs2_ea_request *es_er; + struct gfs2_ea_location *es_el; + + struct buffer_head *es_bh; + struct gfs2_ea_header *es_ea; +}; + +static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct ea_set *es) +{ + struct gfs2_ea_request *er = es->es_er; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_meta(ip->i_gl, bh); + + if (es->ea_split) + ea = ea_split_ea(ea); + + ea_write(ip, ea, er); + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); +out: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + return error; +} + +static int ea_set_simple_alloc(struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private) +{ + struct ea_set *es = private; + struct gfs2_ea_header *ea = es->es_ea; + int error; + + gfs2_trans_add_meta(ip->i_gl, es->es_bh); + + if (es->ea_split) + ea = ea_split_ea(ea); + + error = ea_write(ip, ea, er); + if (error) + return error; + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + return 0; +} + +static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_set *es = private; + unsigned int size; + int stuffed; + int error; + + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, + es->es_er->er_data_len, &size); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) { + if (GFS2_EA_REC_LEN(ea) < size) + return 0; + if (!GFS2_EA_IS_STUFFED(ea)) { + error = ea_remove_unstuffed(ip, bh, ea, prev, 1); + if (error) + return error; + } + es->ea_split = 0; + } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size) + es->ea_split = 1; + else + return 0; + + if (stuffed) { + error = ea_set_simple_noalloc(ip, bh, ea, es); + if (error) + return error; + } else { + unsigned int blks; + + es->es_bh = bh; + es->es_ea = ea; + blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len, + GFS2_SB(&ip->i_inode)->sd_jbsize); + + error = ea_alloc_skeleton(ip, es->es_er, blks, + ea_set_simple_alloc, es); + if (error) + return error; + } + + return 1; +} + +static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *indbh, *newbh; + __be64 *eablk; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { + __be64 *end; + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, + &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64 *)(indbh->b_data + mh_size); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) + if (!*eablk) + break; + + if (eablk == end) { + error = -ENOSPC; + goto out; + } + + gfs2_trans_add_meta(ip->i_gl, indbh); + } else { + u64 blk; + unsigned int n = 1; + error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, blk, 1); + indbh = gfs2_meta_new(ip->i_gl, blk); + gfs2_trans_add_meta(ip->i_gl, indbh); + gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(indbh, mh_size); + + eablk = (__be64 *)(indbh->b_data + mh_size); + *eablk = cpu_to_be64(ip->i_eattr); + ip->i_eattr = blk; + ip->i_diskflags |= GFS2_DIF_EA_INDIRECT; + gfs2_add_inode_blocks(&ip->i_inode, 1); + + eablk++; + } + + error = ea_alloc_blk(ip, &newbh); + if (error) + goto out; + + *eablk = cpu_to_be64((u64)newbh->b_blocknr); + error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er); + brelse(newbh); + if (error) + goto out; + + if (private) + ea_set_remove_stuffed(ip, private); + +out: + brelse(indbh); + return error; +} + +static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, + const void *value, size_t size, struct gfs2_ea_location *el) +{ + struct gfs2_ea_request er; + struct ea_set es; + unsigned int blks = 2; + int error; + + er.er_type = type; + er.er_name = name; + er.er_data = (void *)value; + er.er_name_len = strlen(name); + er.er_data_len = size; + + memset(&es, 0, sizeof(struct ea_set)); + es.es_er = &er; + es.es_el = el; + + error = ea_foreach(ip, ea_set_simple, &es); + if (error > 0) + return 0; + if (error) + return error; + + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) + blks++; + if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + + return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); +} + +static int ea_set_remove_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) { + el->el_prev = GFS2_EA2NEXT(el->el_prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + GFS2_EA2NEXT(el->el_prev) == el->el_ea); + } + + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); +} + +static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_meta(ip->i_gl, el->el_bh); + + if (prev) { + u32 len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else { + ea->ea_type = GFS2_EATYPE_UNUSED; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + return error; +} + +/** + * gfs2_xattr_remove - Remove a GFS2 extended attribute + * @ip: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * + * This is not called directly by the VFS since we use the (common) + * scheme of making a "set with NULL data" mean a remove request. Note + * that this is different from a set with zero length data. + * + * Returns: 0, or errno on failure + */ + +static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_eattr) + return -ENODATA; + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) + error = ea_remove_stuffed(ip, &el); + else + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); + + brelse(el.el_bh); + + return error; +} + +/** + * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @ip: The inode + * @name: The name of the extended attribute + * @value: The value of the extended attribute (NULL for remove) + * @size: The size of the @value argument + * @flags: Create or Replace + * @type: The type of the extended attribute + * + * See gfs2_xattr_remove() for details of the removal of xattrs. + * + * Returns: 0 or errno on failure + */ + +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_ea_location el; + unsigned int namel = strlen(name); + int error; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (namel > GFS2_EA_MAX_NAME_LEN) + return -ERANGE; + + if (value == NULL) + return gfs2_xattr_remove(ip, type, name); + + if (ea_check_size(sdp, namel, size)) + return -ERANGE; + + if (!ip->i_eattr) { + if (flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, type, name, value, size); + } + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + + if (el.el_ea) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } + + error = -EEXIST; + if (!(flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, type, name, value, size, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + return error; + } + + error = -ENODATA; + if (!(flags & XATTR_REPLACE)) + error = ea_set_i(ip, type, name, value, size, NULL); + + return error; +} + +static int gfs2_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + return __gfs2_xattr_set(d_inode(dentry), name, value, + size, flags, type); +} + + +static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_header *ea, char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + int ret; + + ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); + if (ret) + return ret; + + ret = gfs2_iter_unstuffed(ip, ea, data, NULL); + gfs2_trans_end(sdp); + + return ret; +} + +int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_ea_location el; + int error; + + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); + if (error) + return error; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) { + error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); + if (error == 0) { + gfs2_trans_add_meta(ip->i_gl, el.el_bh); + memcpy(GFS2_EA2DATA(el.el_ea), data, + GFS2_EA_DATA_LEN(el.el_ea)); + } + } else { + error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); + } + + brelse(el.el_bh); + if (error) + return error; + + error = gfs2_setattr_simple(inode, attr); + gfs2_trans_end(sdp); + return error; +} + +static int ea_dealloc_indirect(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + struct buffer_head *indbh, *dibh; + __be64 *eablk, *end; + unsigned int rg_blocks = 0; + u64 bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(ip, &rlist, bstart); + bstart = bn; + blen = 1; + } + blks++; + } + if (bstart) + gfs2_rlist_add(ip, &rlist, bstart); + else + goto out; + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist_free; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + + RES_STATFS + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_meta(ip->i_gl, indbh); + + eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + bstart = 0; + blen = 0; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *eablk = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist_free: + gfs2_rlist_free(&rlist); +out: + brelse(indbh); + return error; +} + +static int ea_dealloc_block(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct buffer_head *dibh; + struct gfs2_holder gh; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS + + RES_QUOTA, 1); + if (error) + goto out_gunlock; + + gfs2_free_meta(ip, ip->i_eattr, 1); + + ip->i_eattr = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_uninit(&gh); + return error; +} + +/** + * gfs2_ea_dealloc - deallocate the extended attribute fork + * @ip: the inode + * + * Returns: errno + */ + +int gfs2_ea_dealloc(struct gfs2_inode *ip) +{ + int error; + + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); + if (error) + return error; + + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + return error; + + error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); + if (error) + goto out_quota; + + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { + error = ea_dealloc_indirect(ip); + if (error) + goto out_quota; + } + + error = ea_dealloc_block(ip); + +out_quota: + gfs2_quota_unhold(ip); + return error; +} + +static const struct xattr_handler gfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = GFS2_EATYPE_USR, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +static const struct xattr_handler gfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = GFS2_EATYPE_SECURITY, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +const struct xattr_handler *gfs2_xattr_handlers[] = { + &gfs2_xattr_user_handler, + &gfs2_xattr_security_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + NULL, +}; + diff --git a/kernel/fs/gfs2/xattr.h b/kernel/fs/gfs2/xattr.h new file mode 100644 index 000000000..d392f8358 --- /dev/null +++ b/kernel/fs/gfs2/xattr.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __EATTR_DOT_H__ +#define __EATTR_DOT_H__ + +struct gfs2_inode; +struct iattr; + +#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len) +#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len) + +#define GFS2_EA_SIZE(ea) \ +ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ + ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ + (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) + +#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) +#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) + +#define GFS2_EAREQ_SIZE_STUFFED(er) \ +ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) + +#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) +#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) + +#define GFS2_EA2DATAPTRS(ea) \ +((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8))) + +#define GFS2_EA2NEXT(ea) \ +((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea))) + +#define GFS2_EA_BH2FIRST(bh) \ +((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) + +struct gfs2_ea_request { + const char *er_name; + char *er_data; + unsigned int er_name_len; + unsigned int er_data_len; + unsigned int er_type; /* GFS2_EATYPE_... */ +}; + +struct gfs2_ea_location { + struct buffer_head *el_bh; + struct gfs2_ea_header *el_ea; + struct gfs2_ea_header *el_prev; +}; + +extern int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); +extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern int gfs2_ea_dealloc(struct gfs2_inode *ip); + +/* Exported to acl.c */ + +extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); +extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data); + +#endif /* __EATTR_DOT_H__ */ diff --git a/kernel/fs/hfs/Kconfig b/kernel/fs/hfs/Kconfig new file mode 100644 index 000000000..998e3a6de --- /dev/null +++ b/kernel/fs/hfs/Kconfig @@ -0,0 +1,12 @@ +config HFS_FS + tristate "Apple Macintosh file system support" + depends on BLOCK + select NLS + help + If you say Y here, you will be able to mount Macintosh-formatted + floppy disks and hard drive partitions with full read-write access. + Please read to learn about + the available mount options. + + To compile this file system support as a module, choose M here: the + module will be called hfs. diff --git a/kernel/fs/hfs/Makefile b/kernel/fs/hfs/Makefile new file mode 100644 index 000000000..c41f5a85f --- /dev/null +++ b/kernel/fs/hfs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux hfs filesystem routines. +# + +obj-$(CONFIG_HFS_FS) += hfs.o + +hfs-objs := bitmap.o bfind.o bnode.o brec.o btree.o \ + catalog.o dir.o extent.o inode.o attr.o mdb.o \ + part_tbl.o string.o super.o sysdep.o trans.o + diff --git a/kernel/fs/hfs/attr.c b/kernel/fs/hfs/attr.c new file mode 100644 index 000000000..8d931b157 --- /dev/null +++ b/kernel/fs/hfs/attr.c @@ -0,0 +1,121 @@ +/* + * linux/fs/hfs/attr.c + * + * (C) 2003 Ardis Technologies + * + * Export hfs data via xattr + */ + + +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +int hfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = d_inode(dentry); + struct hfs_find_data fd; + hfs_cat_rec rec; + struct hfs_cat_file *file; + int res; + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + res = hfs_find_init(HFS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + fd.search_key->cat = HFS_I(inode)->cat_key; + res = hfs_brec_find(&fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + file = &rec.file; + + if (!strcmp(name, "hfs.type")) { + if (size == 4) + memcpy(&file->UsrWds.fdType, value, 4); + else + res = -ERANGE; + } else if (!strcmp(name, "hfs.creator")) { + if (size == 4) + memcpy(&file->UsrWds.fdCreator, value, 4); + else + res = -ERANGE; + } else + res = -EOPNOTSUPP; + if (!res) + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); +out: + hfs_find_exit(&fd); + return res; +} + +ssize_t hfs_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct hfs_find_data fd; + hfs_cat_rec rec; + struct hfs_cat_file *file; + ssize_t res = 0; + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (size) { + res = hfs_find_init(HFS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + fd.search_key->cat = HFS_I(inode)->cat_key; + res = hfs_brec_find(&fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } + file = &rec.file; + + if (!strcmp(name, "hfs.type")) { + if (size >= 4) { + memcpy(value, &file->UsrWds.fdType, 4); + res = 4; + } else + res = size ? -ERANGE : 4; + } else if (!strcmp(name, "hfs.creator")) { + if (size >= 4) { + memcpy(value, &file->UsrWds.fdCreator, 4); + res = 4; + } else + res = size ? -ERANGE : 4; + } else + res = -ENODATA; +out: + if (size) + hfs_find_exit(&fd); + return res; +} + +#define HFS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) + +ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (!buffer || !size) + return HFS_ATTRLIST_SIZE; + if (size < HFS_ATTRLIST_SIZE) + return -ERANGE; + strcpy(buffer, "hfs.type"); + strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); + + return HFS_ATTRLIST_SIZE; +} diff --git a/kernel/fs/hfs/bfind.c b/kernel/fs/hfs/bfind.c new file mode 100644 index 000000000..de69d8a24 --- /dev/null +++ b/kernel/fs/hfs/bfind.c @@ -0,0 +1,224 @@ +/* + * linux/fs/hfs/bfind.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Search routines for btrees + */ + +#include +#include "btree.h" + +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) +{ + void *ptr; + + fd->tree = tree; + fd->bnode = NULL; + ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + fd->search_key = ptr; + fd->key = ptr + tree->max_key_len + 2; + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + tree->cnid, __builtin_return_address(0)); + mutex_lock(&tree->tree_lock); + return 0; +} + +void hfs_find_exit(struct hfs_find_data *fd) +{ + hfs_bnode_put(fd->bnode); + kfree(fd->search_key); + hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + fd->tree->cnid, __builtin_return_address(0)); + mutex_unlock(&fd->tree->tree_lock); + fd->tree = NULL; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) +{ + int cmpval; + u16 off, len, keylen; + int rec; + int b, e; + int res; + + b = 0; + e = bnode->num_recs - 1; + res = -ENOENT; + do { + rec = (e + b) / 2; + len = hfs_brec_lenoff(bnode, rec, &off); + keylen = hfs_brec_keylen(bnode, rec); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + cmpval = bnode->tree->keycmp(fd->key, fd->search_key); + if (!cmpval) { + e = rec; + res = 0; + goto done; + } + if (cmpval < 0) + b = rec + 1; + else + e = rec - 1; + } while (b <= e); + if (rec != e && e >= 0) { + len = hfs_brec_lenoff(bnode, e, &off); + keylen = hfs_brec_keylen(bnode, e); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + } +done: + fd->record = e; + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; +fail: + return res; +} + +/* Traverse a B*Tree from the root to a leaf finding best fit to key */ +/* Return allocated copy of node found, set recnum to best record */ +int hfs_brec_find(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + u32 nidx, parent; + __be32 data; + int height, res; + + tree = fd->tree; + if (fd->bnode) + hfs_bnode_put(fd->bnode); + fd->bnode = NULL; + nidx = tree->root; + if (!nidx) + return -ENOENT; + height = tree->depth; + res = 0; + parent = 0; + for (;;) { + bnode = hfs_bnode_find(tree, nidx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + break; + } + if (bnode->height != height) + goto invalid; + if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF)) + goto invalid; + bnode->parent = parent; + + res = __hfs_brec_find(bnode, fd); + if (!height) + break; + if (fd->record < 0) + goto release; + + parent = nidx; + hfs_bnode_read(bnode, &data, fd->entryoffset, 4); + nidx = be32_to_cpu(data); + hfs_bnode_put(bnode); + } + fd->bnode = bnode; + return res; + +invalid: + pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); + res = -EIO; +release: + hfs_bnode_put(bnode); + return res; +} + +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) +{ + int res; + + res = hfs_brec_find(fd); + if (res) + return res; + if (fd->entrylength > rec_len) + return -EINVAL; + hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); + return 0; +} + +int hfs_brec_goto(struct hfs_find_data *fd, int cnt) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int idx, res = 0; + u16 off, len, keylen; + + bnode = fd->bnode; + tree = bnode->tree; + + if (cnt < 0) { + cnt = -cnt; + while (cnt > fd->record) { + cnt -= fd->record + 1; + fd->record = bnode->num_recs - 1; + idx = bnode->prev; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record -= cnt; + } else { + while (cnt >= bnode->num_recs - fd->record) { + cnt -= bnode->num_recs - fd->record; + fd->record = 0; + idx = bnode->next; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record += cnt; + } + + len = hfs_brec_lenoff(bnode, fd->record, &off); + keylen = hfs_brec_keylen(bnode, fd->record); + if (keylen == 0) { + res = -EINVAL; + goto out; + } + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; + hfs_bnode_read(bnode, fd->key, off, keylen); +out: + fd->bnode = bnode; + return res; +} diff --git a/kernel/fs/hfs/bitmap.c b/kernel/fs/hfs/bitmap.c new file mode 100644 index 000000000..28307bc9e --- /dev/null +++ b/kernel/fs/hfs/bitmap.c @@ -0,0 +1,243 @@ +/* + * linux/fs/hfs/bitmap.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * Based on GPLed code Copyright (C) 1995 Michael Dreher + * + * This file contains the code to modify the volume bitmap: + * search/set/clear bits. + */ + +#include "hfs_fs.h" + +/* + * hfs_find_zero_bit() + * + * Description: + * Given a block of memory, its length in bits, and a starting bit number, + * determine the number of the first zero bits (in left-to-right ordering) + * in that range. + * + * Returns >= 'size' if no zero bits are found in the range. + * + * Accesses memory in 32-bit aligned chunks of 32-bits and thus + * may read beyond the 'size'th bit. + */ +static u32 hfs_find_set_zero_bits(__be32 *bitmap, u32 size, u32 offset, u32 *max) +{ + __be32 *curr, *end; + u32 mask, start, len, n; + __be32 val; + int i; + + len = *max; + if (!len) + return size; + + curr = bitmap + (offset / 32); + end = bitmap + ((size + 31) / 32); + + /* scan the first partial u32 for zero bits */ + val = *curr; + if (~val) { + n = be32_to_cpu(val); + i = offset % 32; + mask = (1U << 31) >> i; + for (; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + + /* scan complete u32s for the first zero bit */ + while (++curr < end) { + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = 1 << 31; + for (i = 0; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + } + return size; + +found: + start = (curr - bitmap) * 32 + i; + if (start >= size) + return start; + /* do any partial u32 at the start */ + len = min(size - start, len); + while (1) { + n |= mask; + if (++i >= 32) + break; + mask >>= 1; + if (!--len || n & mask) + goto done; + } + if (!--len) + goto done; + *curr++ = cpu_to_be32(n); + /* do full u32s */ + while (1) { + n = be32_to_cpu(*curr); + if (len < 32) + break; + if (n) { + len = 32; + break; + } + *curr++ = cpu_to_be32(0xffffffff); + len -= 32; + } + /* do any partial u32 at end */ + mask = 1U << 31; + for (i = 0; i < len; i++) { + if (n & mask) + break; + n |= mask; + mask >>= 1; + } +done: + *curr = cpu_to_be32(n); + *max = (curr - bitmap) * 32 + i - start; + return start; +} + +/* + * hfs_vbm_search_free() + * + * Description: + * Search for 'num_bits' consecutive cleared bits in the bitmap blocks of + * the hfs MDB. 'mdb' had better be locked or the returned range + * may be no longer free, when this functions returns! + * XXX Currently the search starts from bit 0, but it should start with + * the bit number stored in 's_alloc_ptr' of the MDB. + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * u16 *num_bits: Pointer to the number of cleared bits + * to search for + * Output Variable(s): + * u16 *num_bits: The number of consecutive clear bits of the + * returned range. If the bitmap is fragmented, this will be less than + * requested and it will be zero, when the disk is full. + * Returns: + * The number of the first bit of the range of cleared bits which has been + * found. When 'num_bits' is zero, this is invalid! + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * 'num_bits' points to a variable of type (u16), which contains + * the number of cleared bits to find. + * Postconditions: + * 'num_bits' is set to the length of the found sequence. + */ +u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) +{ + void *bitmap; + u32 pos; + + /* make sure we have actual work to perform */ + if (!*num_bits) + return 0; + + mutex_lock(&HFS_SB(sb)->bitmap_lock); + bitmap = HFS_SB(sb)->bitmap; + + pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); + if (pos >= HFS_SB(sb)->fs_ablocks) { + if (goal) + pos = hfs_find_set_zero_bits(bitmap, goal, 0, num_bits); + if (pos >= HFS_SB(sb)->fs_ablocks) { + *num_bits = pos = 0; + goto out; + } + } + + hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + HFS_SB(sb)->free_ablocks -= *num_bits; + hfs_bitmap_dirty(sb); +out: + mutex_unlock(&HFS_SB(sb)->bitmap_lock); + return pos; +} + + +/* + * hfs_clear_vbm_bits() + * + * Description: + * Clear the requested bits in the volume bitmap of the hfs filesystem + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * u16 start: The offset of the first bit + * u16 count: The number of bits + * Output Variable(s): + * None + * Returns: + * 0: no error + * -1: One of the bits was already clear. This is a strange + * error and when it happens, the filesystem must be repaired! + * -2: One or more of the bits are out of range of the bitmap. + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * Postconditions: + * Starting with bit number 'start', 'count' bits in the volume bitmap + * are cleared. The affected bitmap blocks are marked "dirty", the free + * block count of the MDB is updated and the MDB is marked dirty. + */ +int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) +{ + __be32 *curr; + u32 mask; + int i, len; + + /* is there any actual work to be done? */ + if (!count) + return 0; + + hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); + /* are all of the bits in range? */ + if ((start + count) > HFS_SB(sb)->fs_ablocks) + return -2; + + mutex_lock(&HFS_SB(sb)->bitmap_lock); + /* bitmap is always on a 32-bit boundary */ + curr = HFS_SB(sb)->bitmap + (start / 32); + len = count; + + /* do any partial u32 at the start */ + i = start % 32; + if (i) { + int j = 32 - i; + mask = 0xffffffffU << j; + if (j > count) { + mask |= 0xffffffffU >> (i + count); + *curr &= cpu_to_be32(mask); + goto out; + } + *curr++ &= cpu_to_be32(mask); + count -= j; + } + + /* do full u32s */ + while (count >= 32) { + *curr++ = 0; + count -= 32; + } + /* do any partial u32 at end */ + if (count) { + mask = 0xffffffffU >> count; + *curr &= cpu_to_be32(mask); + } +out: + HFS_SB(sb)->free_ablocks += len; + mutex_unlock(&HFS_SB(sb)->bitmap_lock); + hfs_bitmap_dirty(sb); + + return 0; +} diff --git a/kernel/fs/hfs/bnode.c b/kernel/fs/hfs/bnode.c new file mode 100644 index 000000000..d3fa6bd95 --- /dev/null +++ b/kernel/fs/hfs/bnode.c @@ -0,0 +1,485 @@ +/* + * linux/fs/hfs/bnode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle basic btree node operations + */ + +#include +#include +#include + +#include "btree.h" + +void hfs_bnode_read(struct hfs_bnode *node, void *buf, + int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memcpy(buf, kmap(page) + off, len); + kunmap(page); +} + +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) +{ + __be16 data; + // optimize later... + hfs_bnode_read(node, &data, off, 2); + return be16_to_cpu(data); +} + +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) +{ + u8 data; + // optimize later... + hfs_bnode_read(node, &data, off, 1); + return data; +} + +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) +{ + struct hfs_btree *tree; + int key_len; + + tree = node->tree; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = hfs_bnode_read_u8(node, off) + 1; + else + key_len = tree->max_key_len + 1; + + hfs_bnode_read(node, key, off, key_len); +} + +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memcpy(kmap(page) + off, buf, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) +{ + __be16 v = cpu_to_be16(data); + // optimize later... + hfs_bnode_write(node, &v, off, 2); +} + +void hfs_bnode_write_u8(struct hfs_bnode *node, int off, u8 data) +{ + // optimize later... + hfs_bnode_write(node, &data, off, 1); +} + +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memset(kmap(page) + off, 0, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len) +{ + struct hfs_btree *tree; + struct page *src_page, *dst_page; + + hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + tree = src_node->tree; + src += src_node->page_offset; + dst += dst_node->page_offset; + src_page = src_node->page[0]; + dst_page = dst_node->page[0]; + + memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); + kunmap(src_page); + kunmap(dst_page); + set_page_dirty(dst_page); +} + +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) +{ + struct page *page; + void *ptr; + + hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + src += node->page_offset; + dst += node->page_offset; + page = node->page[0]; + ptr = kmap(page); + memmove(ptr + dst, ptr + src, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_dump(struct hfs_bnode *node) +{ + struct hfs_bnode_desc desc; + __be32 cnid; + int i, off, key_off; + + hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_bnode_read(node, &desc, 0, sizeof(desc)); + hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + be32_to_cpu(desc.next), be32_to_cpu(desc.prev), + desc.type, desc.height, be16_to_cpu(desc.num_recs)); + + off = node->tree->node_size - 2; + for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { + key_off = hfs_bnode_read_u16(node, off); + hfs_dbg_cont(BNODE_MOD, " %d", key_off); + if (i && node->type == HFS_NODE_INDEX) { + int tmp; + + if (node->tree->attributes & HFS_TREE_VARIDXKEYS) + tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; + else + tmp = node->tree->max_key_len + 1; + hfs_dbg_cont(BNODE_MOD, " (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); + hfs_bnode_read(node, &cnid, key_off + tmp, 4); + hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + } else if (i && node->type == HFS_NODE_LEAF) { + int tmp; + + tmp = hfs_bnode_read_u8(node, key_off); + hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + } + } + hfs_dbg_cont(BNODE_MOD, "\n"); +} + +void hfs_bnode_unlink(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct hfs_bnode *tmp; + __be32 cnid; + + tree = node->tree; + if (node->prev) { + tmp = hfs_bnode_find(tree, node->prev); + if (IS_ERR(tmp)) + return; + tmp->next = node->next; + cnid = cpu_to_be32(tmp->next); + hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_head = node->next; + + if (node->next) { + tmp = hfs_bnode_find(tree, node->next); + if (IS_ERR(tmp)) + return; + tmp->prev = node->prev; + cnid = cpu_to_be32(tmp->prev); + hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_tail = node->prev; + + // move down? + if (!node->prev && !node->next) { + printk(KERN_DEBUG "hfs_btree_del_level\n"); + } + if (!node->parent) { + tree->root = 0; + tree->depth = 0; + } + set_bit(HFS_BNODE_DELETED, &node->flags); +} + +static inline int hfs_bnode_hash(u32 num) +{ + num = (num >> 16) + num; + num += num >> 8; + return num & (NODE_HASH_SIZE - 1); +} + +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) +{ + struct hfs_bnode *node; + + if (cnid >= tree->node_count) { + pr_err("request for non-existent node %d in B*Tree\n", cnid); + return NULL; + } + + for (node = tree->node_hash[hfs_bnode_hash(cnid)]; + node; node = node->next_hash) { + if (node->this == cnid) { + return node; + } + } + return NULL; +} + +static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) +{ + struct super_block *sb; + struct hfs_bnode *node, *node2; + struct address_space *mapping; + struct page *page; + int size, block, i, hash; + loff_t off; + + if (cnid >= tree->node_count) { + pr_err("request for non-existent node %d in B*Tree\n", cnid); + return NULL; + } + + sb = tree->inode->i_sb; + size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * + sizeof(struct page *); + node = kzalloc(size, GFP_KERNEL); + if (!node) + return NULL; + node->tree = tree; + node->this = cnid; + set_bit(HFS_BNODE_NEW, &node->flags); + atomic_set(&node->refcnt, 1); + hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); + init_waitqueue_head(&node->lock_wq); + spin_lock(&tree->hash_lock); + node2 = hfs_bnode_findhash(tree, cnid); + if (!node2) { + hash = hfs_bnode_hash(cnid); + node->next_hash = tree->node_hash[hash]; + tree->node_hash[hash] = node; + tree->node_hash_cnt++; + } else { + spin_unlock(&tree->hash_lock); + kfree(node); + wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); + return node2; + } + spin_unlock(&tree->hash_lock); + + mapping = tree->inode->i_mapping; + off = (loff_t)cnid * tree->node_size; + block = off >> PAGE_CACHE_SHIFT; + node->page_offset = off & ~PAGE_CACHE_MASK; + for (i = 0; i < tree->pages_per_bnode; i++) { + page = read_mapping_page(mapping, block++, NULL); + if (IS_ERR(page)) + goto fail; + if (PageError(page)) { + page_cache_release(page); + goto fail; + } + page_cache_release(page); + node->page[i] = page; + } + + return node; +fail: + set_bit(HFS_BNODE_ERROR, &node->flags); + return node; +} + +void hfs_bnode_unhash(struct hfs_bnode *node) +{ + struct hfs_bnode **p; + + hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; + *p && *p != node; p = &(*p)->next_hash) + ; + BUG_ON(!*p); + *p = node->next_hash; + node->tree->node_hash_cnt--; +} + +/* Load a particular node out of a tree */ +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct hfs_bnode_desc *desc; + int i, rec_off, off, next_off; + int entry_size, key_size; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + if (node) { + hfs_bnode_get(node); + spin_unlock(&tree->hash_lock); + wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags)); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + return node; + } + spin_unlock(&tree->hash_lock); + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + if (!test_bit(HFS_BNODE_NEW, &node->flags)) + return node; + + desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + node->prev = be32_to_cpu(desc->prev); + node->next = be32_to_cpu(desc->next); + node->num_recs = be16_to_cpu(desc->num_recs); + node->type = desc->type; + node->height = desc->height; + kunmap(node->page[0]); + + switch (node->type) { + case HFS_NODE_HEADER: + case HFS_NODE_MAP: + if (node->height != 0) + goto node_error; + break; + case HFS_NODE_LEAF: + if (node->height != 1) + goto node_error; + break; + case HFS_NODE_INDEX: + if (node->height <= 1 || node->height > tree->depth) + goto node_error; + break; + default: + goto node_error; + } + + rec_off = tree->node_size - 2; + off = hfs_bnode_read_u16(node, rec_off); + if (off != sizeof(struct hfs_bnode_desc)) + goto node_error; + for (i = 1; i <= node->num_recs; off = next_off, i++) { + rec_off -= 2; + next_off = hfs_bnode_read_u16(node, rec_off); + if (next_off <= off || + next_off > tree->node_size || + next_off & 1) + goto node_error; + entry_size = next_off - off; + if (node->type != HFS_NODE_INDEX && + node->type != HFS_NODE_LEAF) + continue; + key_size = hfs_bnode_read_u8(node, off) + 1; + if (key_size >= entry_size /*|| key_size & 1*/) + goto node_error; + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + return node; + +node_error: + set_bit(HFS_BNODE_ERROR, &node->flags); + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + hfs_bnode_put(node); + return ERR_PTR(-EIO); +} + +void hfs_bnode_free(struct hfs_bnode *node) +{ + //int i; + + //for (i = 0; i < node->tree->pages_per_bnode; i++) + // if (node->page[i]) + // page_cache_release(node->page[i]); + kfree(node); +} + +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct page **pagep; + int i; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + spin_unlock(&tree->hash_lock); + if (node) { + pr_crit("new node %u already hashed?\n", num); + WARN_ON(1); + return node; + } + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + + pagep = node->page; + memset(kmap(*pagep) + node->page_offset, 0, + min((int)PAGE_CACHE_SIZE, (int)tree->node_size)); + set_page_dirty(*pagep); + kunmap(*pagep); + for (i = 1; i < tree->pages_per_bnode; i++) { + memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE); + set_page_dirty(*pagep); + kunmap(*pagep); + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + + return node; +} + +void hfs_bnode_get(struct hfs_bnode *node) +{ + if (node) { + atomic_inc(&node->refcnt); + hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + } +} + +/* Dispose of resources used by a node */ +void hfs_bnode_put(struct hfs_bnode *node) +{ + if (node) { + struct hfs_btree *tree = node->tree; + int i; + + hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + BUG_ON(!atomic_read(&node->refcnt)); + if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) + return; + for (i = 0; i < tree->pages_per_bnode; i++) { + if (!node->page[i]) + continue; + mark_page_accessed(node->page[i]); + } + + if (test_bit(HFS_BNODE_DELETED, &node->flags)) { + hfs_bnode_unhash(node); + spin_unlock(&tree->hash_lock); + hfs_bmap_free(node); + hfs_bnode_free(node); + return; + } + spin_unlock(&tree->hash_lock); + } +} diff --git a/kernel/fs/hfs/brec.c b/kernel/fs/hfs/brec.c new file mode 100644 index 000000000..9f4ee7f52 --- /dev/null +++ b/kernel/fs/hfs/brec.c @@ -0,0 +1,520 @@ +/* + * linux/fs/hfs/brec.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle individual btree records + */ + +#include "btree.h" + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd); +static int hfs_brec_update_parent(struct hfs_find_data *fd); +static int hfs_btree_inc_height(struct hfs_btree *tree); + +/* Get the length and offset of the given record in the given node */ +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off) +{ + __be16 retval[2]; + u16 dataoff; + + dataoff = node->tree->node_size - (rec + 2) * 2; + hfs_bnode_read(node, retval, dataoff, 4); + *off = be16_to_cpu(retval[1]); + return be16_to_cpu(retval[0]) - *off; +} + +/* Get the length of the key from a keyed record */ +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) +{ + u16 retval, recoff; + + if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) + return 0; + + if ((node->type == HFS_NODE_INDEX) && + !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { + if (node->tree->attributes & HFS_TREE_BIGKEYS) + retval = node->tree->max_key_len + 2; + else + retval = node->tree->max_key_len + 1; + } else { + recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); + if (!recoff) + return 0; + if (node->tree->attributes & HFS_TREE_BIGKEYS) { + retval = hfs_bnode_read_u16(node, recoff) + 2; + if (retval > node->tree->max_key_len + 2) { + pr_err("keylen %d too large\n", retval); + retval = 0; + } + } else { + retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; + if (retval > node->tree->max_key_len + 1) { + pr_err("keylen %d too large\n", retval); + retval = 0; + } + } + } + return retval; +} + +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node; + int size, key_len, rec; + int data_off, end_off; + int idx_rec_off, data_rec_off, end_rec_off; + __be32 cnid; + + tree = fd->tree; + if (!fd->bnode) { + if (!tree->root) + hfs_btree_inc_height(tree); + fd->bnode = hfs_bnode_find(tree, tree->leaf_head); + if (IS_ERR(fd->bnode)) + return PTR_ERR(fd->bnode); + fd->record = -1; + } + new_node = NULL; + key_len = (fd->search_key->key_len | 1) + 1; +again: + /* new record idx and complete record size */ + rec = fd->record + 1; + size = key_len + entry_len; + + node = fd->bnode; + hfs_bnode_dump(node); + /* get last offset */ + end_rec_off = tree->node_size - (node->num_recs + 1) * 2; + end_off = hfs_bnode_read_u16(node, end_rec_off); + end_rec_off -= 2; + hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + rec, size, end_off, end_rec_off); + if (size > end_rec_off - end_off) { + if (new_node) + panic("not enough room!\n"); + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + goto again; + } + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count++; + mark_inode_dirty(tree->inode); + } + node->num_recs++; + /* write new last offset */ + hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); + hfs_bnode_write_u16(node, end_rec_off, end_off + size); + data_off = end_off; + data_rec_off = end_rec_off + 2; + idx_rec_off = tree->node_size - (rec + 1) * 2; + if (idx_rec_off == data_rec_off) + goto skip; + /* move all following entries */ + do { + data_off = hfs_bnode_read_u16(node, data_rec_off + 2); + hfs_bnode_write_u16(node, data_rec_off, data_off + size); + data_rec_off += 2; + } while (data_rec_off < idx_rec_off); + + /* move data away */ + hfs_bnode_move(node, data_off + size, data_off, + end_off - data_off); + +skip: + hfs_bnode_write(node, fd->search_key, data_off, key_len); + hfs_bnode_write(node, entry, data_off + key_len, entry_len); + hfs_bnode_dump(node); + + if (new_node) { + /* update parent key if we inserted a key + * at the start of the first node + */ + if (!rec && new_node != node) + hfs_brec_update_parent(fd); + + hfs_bnode_put(fd->bnode); + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } + fd->bnode = hfs_bnode_find(tree, new_node->parent); + + /* create index data entry */ + cnid = cpu_to_be32(new_node->this); + entry = &cnid; + entry_len = sizeof(cnid); + + /* get index key */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + __hfs_brec_find(fd->bnode, fd); + + hfs_bnode_put(new_node); + new_node = NULL; + + if (tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = fd->search_key->key_len + 1; + else { + fd->search_key->key_len = tree->max_key_len; + key_len = tree->max_key_len + 1; + } + goto again; + } + + if (!rec) + hfs_brec_update_parent(fd); + + return 0; +} + +int hfs_brec_remove(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *parent; + int end_off, rec_off, data_off, size; + + tree = fd->tree; + node = fd->bnode; +again: + rec_off = tree->node_size - (fd->record + 2) * 2; + end_off = tree->node_size - (node->num_recs + 1) * 2; + + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count--; + mark_inode_dirty(tree->inode); + } + hfs_bnode_dump(node); + hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + fd->record, fd->keylength + fd->entrylength); + if (!--node->num_recs) { + hfs_bnode_unlink(node); + if (!node->parent) + return 0; + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + hfs_bnode_put(node); + node = fd->bnode = parent; + + __hfs_brec_find(node, fd); + goto again; + } + hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); + + if (rec_off == end_off) + goto skip; + size = fd->keylength + fd->entrylength; + + do { + data_off = hfs_bnode_read_u16(node, rec_off); + hfs_bnode_write_u16(node, rec_off + 2, data_off - size); + rec_off -= 2; + } while (rec_off >= end_off); + + /* fill hole */ + hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, + data_off - fd->keyoffset - size); +skip: + hfs_bnode_dump(node); + if (!fd->record) + hfs_brec_update_parent(fd); + return 0; +} + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *next_node; + struct hfs_bnode_desc node_desc; + int num_recs, new_rec_off, new_off, old_rec_off; + int data_start, data_end, size; + + tree = fd->tree; + node = fd->bnode; + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) + return new_node; + hfs_bnode_get(node); + hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + node->this, new_node->this, node->next); + new_node->next = node->next; + new_node->prev = node->this; + new_node->parent = node->parent; + new_node->type = node->type; + new_node->height = node->height; + + if (node->next) + next_node = hfs_bnode_find(tree, node->next); + else + next_node = NULL; + + if (IS_ERR(next_node)) { + hfs_bnode_put(node); + hfs_bnode_put(new_node); + return next_node; + } + + size = tree->node_size / 2 - node->num_recs * 2 - 14; + old_rec_off = tree->node_size - 4; + num_recs = 1; + for (;;) { + data_start = hfs_bnode_read_u16(node, old_rec_off); + if (data_start > size) + break; + old_rec_off -= 2; + if (++num_recs < node->num_recs) + continue; + /* panic? */ + hfs_bnode_put(node); + hfs_bnode_put(new_node); + if (next_node) + hfs_bnode_put(next_node); + return ERR_PTR(-ENOSPC); + } + + if (fd->record + 1 < num_recs) { + /* new record is in the lower half, + * so leave some more space there + */ + old_rec_off += 2; + num_recs--; + data_start = hfs_bnode_read_u16(node, old_rec_off); + } else { + hfs_bnode_put(node); + hfs_bnode_get(new_node); + fd->bnode = new_node; + fd->record -= num_recs; + fd->keyoffset -= data_start - 14; + fd->entryoffset -= data_start - 14; + } + new_node->num_recs = node->num_recs - num_recs; + node->num_recs = num_recs; + + new_rec_off = tree->node_size - 2; + new_off = 14; + size = data_start - new_off; + num_recs = new_node->num_recs; + data_end = data_start; + while (num_recs) { + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + old_rec_off -= 2; + new_rec_off -= 2; + data_end = hfs_bnode_read_u16(node, old_rec_off); + new_off = data_end - size; + num_recs--; + } + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start); + + /* update new bnode header */ + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + /* update previous bnode header */ + node->next = new_node->this; + hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc)); + node_desc.next = cpu_to_be32(node->next); + node_desc.num_recs = cpu_to_be16(node->num_recs); + hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); + + /* update next bnode header */ + if (next_node) { + next_node->prev = new_node->this; + hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); + node_desc.prev = cpu_to_be32(next_node->prev); + hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc)); + hfs_bnode_put(next_node); + } else if (node->this == tree->leaf_tail) { + /* if there is no next node, this might be the new tail */ + tree->leaf_tail = new_node->this; + mark_inode_dirty(tree->inode); + } + + hfs_bnode_dump(node); + hfs_bnode_dump(new_node); + hfs_bnode_put(node); + + return new_node; +} + +static int hfs_brec_update_parent(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *parent; + int newkeylen, diff; + int rec, rec_off, end_rec_off; + int start_off, end_off; + + tree = fd->tree; + node = fd->bnode; + new_node = NULL; + if (!node->parent) + return 0; + +again: + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + __hfs_brec_find(parent, fd); + hfs_bnode_dump(parent); + rec = fd->record; + + /* size difference between old and new key */ + if (tree->attributes & HFS_TREE_VARIDXKEYS) + newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; + else + fd->keylength = newkeylen = tree->max_key_len + 1; + hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + rec, fd->keylength, newkeylen); + + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + diff = newkeylen - fd->keylength; + if (!diff) + goto skip; + if (diff > 0) { + end_off = hfs_bnode_read_u16(parent, end_rec_off); + if (end_rec_off - end_off < diff) { + + printk(KERN_DEBUG "splitting index node...\n"); + fd->bnode = parent; + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + parent = fd->bnode; + rec = fd->record; + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + } + } + + end_off = start_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, start_off + diff); + start_off -= 4; /* move previous cnid too */ + + while (rec_off > end_rec_off) { + rec_off -= 2; + end_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, end_off + diff); + } + hfs_bnode_move(parent, start_off + diff, start_off, + end_off - start_off); +skip: + hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen); + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) + hfs_bnode_write_u8(parent, fd->keyoffset, newkeylen - 1); + hfs_bnode_dump(parent); + + hfs_bnode_put(node); + node = parent; + + if (new_node) { + __be32 cnid; + + fd->bnode = hfs_bnode_find(tree, new_node->parent); + /* create index key and entry */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + cnid = cpu_to_be32(new_node->this); + + __hfs_brec_find(fd->bnode, fd); + hfs_brec_insert(fd, &cnid, sizeof(cnid)); + hfs_bnode_put(fd->bnode); + hfs_bnode_put(new_node); + + if (!rec) { + if (new_node == node) + goto out; + /* restore search_key */ + hfs_bnode_read_key(node, fd->search_key, 14); + } + } + + if (!rec && node->parent) + goto again; +out: + fd->bnode = node; + return 0; +} + +static int hfs_btree_inc_height(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *new_node; + struct hfs_bnode_desc node_desc; + int key_size, rec; + __be32 cnid; + + node = NULL; + if (tree->root) { + node = hfs_bnode_find(tree, tree->root); + if (IS_ERR(node)) + return PTR_ERR(node); + } + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) { + hfs_bnode_put(node); + return PTR_ERR(new_node); + } + + tree->root = new_node->this; + if (!tree->depth) { + tree->leaf_head = tree->leaf_tail = new_node->this; + new_node->type = HFS_NODE_LEAF; + new_node->num_recs = 0; + } else { + new_node->type = HFS_NODE_INDEX; + new_node->num_recs = 1; + } + new_node->parent = 0; + new_node->next = 0; + new_node->prev = 0; + new_node->height = ++tree->depth; + + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + rec = tree->node_size - 2; + hfs_bnode_write_u16(new_node, rec, 14); + + if (node) { + /* insert old root idx into new root */ + node->parent = tree->root; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_size = hfs_bnode_read_u8(node, 14) + 1; + else + key_size = tree->max_key_len + 1; + hfs_bnode_copy(new_node, 14, node, 14, key_size); + + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + key_size = tree->max_key_len + 1; + hfs_bnode_write_u8(new_node, 14, tree->max_key_len); + } + key_size = (key_size + 1) & -2; + cnid = cpu_to_be32(node->this); + hfs_bnode_write(new_node, &cnid, 14 + key_size, 4); + + rec -= 2; + hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4); + + hfs_bnode_put(node); + } + hfs_bnode_put(new_node); + mark_inode_dirty(tree->inode); + + return 0; +} diff --git a/kernel/fs/hfs/btree.c b/kernel/fs/hfs/btree.c new file mode 100644 index 000000000..1ab19e660 --- /dev/null +++ b/kernel/fs/hfs/btree.c @@ -0,0 +1,369 @@ +/* + * linux/fs/hfs/btree.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle opening/closing btree + */ + +#include +#include +#include + +#include "btree.h" + +/* Get a reference to a B*Tree and do some initial checks */ +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp keycmp) +{ + struct hfs_btree *tree; + struct hfs_btree_header_rec *head; + struct address_space *mapping; + struct page *page; + unsigned int size; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) + return NULL; + + mutex_init(&tree->tree_lock); + spin_lock_init(&tree->hash_lock); + /* Set the correct compare function */ + tree->sb = sb; + tree->cnid = id; + tree->keycmp = keycmp; + + tree->inode = iget_locked(sb, id); + if (!tree->inode) + goto free_tree; + BUG_ON(!(tree->inode->i_state & I_NEW)); + { + struct hfs_mdb *mdb = HFS_SB(sb)->mdb; + HFS_I(tree->inode)->flags = 0; + mutex_init(&HFS_I(tree->inode)->extents_lock); + switch (id) { + case HFS_EXT_CNID: + hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, + mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + pr_err("invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + + tree->inode->i_mapping->a_ops = &hfs_btree_aops; + break; + case HFS_CAT_CNID: + hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, + mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + pr_err("invalid btree extent records (0 size)\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + + tree->inode->i_mapping->a_ops = &hfs_btree_aops; + break; + default: + BUG(); + } + } + unlock_new_inode(tree->inode); + + mapping = tree->inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + goto free_inode; + + /* Load the header */ + head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + tree->root = be32_to_cpu(head->root); + tree->leaf_count = be32_to_cpu(head->leaf_count); + tree->leaf_head = be32_to_cpu(head->leaf_head); + tree->leaf_tail = be32_to_cpu(head->leaf_tail); + tree->node_count = be32_to_cpu(head->node_count); + tree->free_nodes = be32_to_cpu(head->free_nodes); + tree->attributes = be32_to_cpu(head->attributes); + tree->node_size = be16_to_cpu(head->node_size); + tree->max_key_len = be16_to_cpu(head->max_key_len); + tree->depth = be16_to_cpu(head->depth); + + size = tree->node_size; + if (!is_power_of_2(size)) + goto fail_page; + if (!tree->node_count) + goto fail_page; + switch (id) { + case HFS_EXT_CNID: + if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { + pr_err("invalid extent max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + break; + case HFS_CAT_CNID: + if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { + pr_err("invalid catalog max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + break; + default: + BUG(); + } + + tree->node_size_shift = ffs(size) - 1; + tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + kunmap(page); + page_cache_release(page); + return tree; + +fail_page: + page_cache_release(page); +free_inode: + tree->inode->i_mapping->a_ops = &hfs_aops; + iput(tree->inode); +free_tree: + kfree(tree); + return NULL; +} + +/* Release resources used by a btree */ +void hfs_btree_close(struct hfs_btree *tree) +{ + struct hfs_bnode *node; + int i; + + if (!tree) + return; + + for (i = 0; i < NODE_HASH_SIZE; i++) { + while ((node = tree->node_hash[i])) { + tree->node_hash[i] = node->next_hash; + if (atomic_read(&node->refcnt)) + pr_err("node %d:%d still has %d user(s)!\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + hfs_bnode_free(node); + tree->node_hash_cnt--; + } + } + iput(tree->inode); + kfree(tree); +} + +void hfs_btree_write(struct hfs_btree *tree) +{ + struct hfs_btree_header_rec *head; + struct hfs_bnode *node; + struct page *page; + + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + /* panic? */ + return; + /* Load the header */ + page = node->page[0]; + head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + + head->root = cpu_to_be32(tree->root); + head->leaf_count = cpu_to_be32(tree->leaf_count); + head->leaf_head = cpu_to_be32(tree->leaf_head); + head->leaf_tail = cpu_to_be32(tree->leaf_tail); + head->node_count = cpu_to_be32(tree->node_count); + head->free_nodes = cpu_to_be32(tree->free_nodes); + head->attributes = cpu_to_be32(tree->attributes); + head->depth = cpu_to_be16(tree->depth); + + kunmap(page); + set_page_dirty(page); + hfs_bnode_put(node); +} + +static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) +{ + struct hfs_btree *tree = prev->tree; + struct hfs_bnode *node; + struct hfs_bnode_desc desc; + __be32 cnid; + + node = hfs_bnode_create(tree, idx); + if (IS_ERR(node)) + return node; + + if (!tree->free_nodes) + panic("FIXME!!!"); + tree->free_nodes--; + prev->next = idx; + cnid = cpu_to_be32(idx); + hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); + + node->type = HFS_NODE_MAP; + node->num_recs = 1; + hfs_bnode_clear(node, 0, tree->node_size); + desc.next = 0; + desc.prev = 0; + desc.type = HFS_NODE_MAP; + desc.height = 0; + desc.num_recs = cpu_to_be16(1); + desc.reserved = 0; + hfs_bnode_write(node, &desc, 0, sizeof(desc)); + hfs_bnode_write_u16(node, 14, 0x8000); + hfs_bnode_write_u16(node, tree->node_size - 2, 14); + hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); + + return node; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i; + + while (!tree->free_nodes) { + struct inode *inode = tree->inode; + u32 count; + int res; + + res = hfs_extend_file(inode); + if (res) + return ERR_PTR(res); + HFS_I(inode)->phys_size = inode->i_size = + (loff_t)HFS_I(inode)->alloc_blocks * + HFS_SB(tree->sb)->alloc_blksz; + HFS_I(inode)->fs_blocks = inode->i_size >> + tree->sb->s_blocksize_bits; + inode_set_bytes(inode, inode->i_size); + count = inode->i_size >> tree->node_size_shift; + tree->free_nodes = count - tree->node_count; + tree->node_count = count; + } + + nidx = 0; + node = hfs_bnode_find(tree, nidx); + if (IS_ERR(node)) + return node; + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + idx = 0; + + for (;;) { + while (len) { + byte = data[off]; + if (byte != 0xff) { + for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { + if (!(byte & m)) { + idx += i; + data[off] |= m; + set_page_dirty(*pagep); + kunmap(*pagep); + tree->free_nodes--; + mark_inode_dirty(tree->inode); + hfs_bnode_put(node); + return hfs_bnode_create(tree, idx); + } + } + } + if (++off >= PAGE_CACHE_SIZE) { + kunmap(*pagep); + data = kmap(*++pagep); + off = 0; + } + idx += 8; + len--; + } + kunmap(*pagep); + nidx = node->next; + if (!nidx) { + printk(KERN_DEBUG "create new bmap node...\n"); + next_node = hfs_bmap_new_bmap(node, idx); + } else + next_node = hfs_bnode_find(tree, nidx); + hfs_bnode_put(node); + if (IS_ERR(next_node)) + return next_node; + node = next_node; + + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + } +} + +void hfs_bmap_free(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct page *page; + u16 off, len; + u32 nidx; + u8 *data, byte, m; + + hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + tree = node->tree; + nidx = node->this; + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + return; + len = hfs_brec_lenoff(node, 2, &off); + while (nidx >= len * 8) { + u32 i; + + nidx -= len * 8; + i = node->next; + hfs_bnode_put(node); + if (!i) { + /* panic */; + pr_crit("unable to free bnode %u. bmap not found!\n", + node->this); + return; + } + node = hfs_bnode_find(tree, i); + if (IS_ERR(node)) + return; + if (node->type != HFS_NODE_MAP) { + /* panic */; + pr_crit("invalid bmap found! (%u,%d)\n", + node->this, node->type); + hfs_bnode_put(node); + return; + } + len = hfs_brec_lenoff(node, 0, &off); + } + off += node->page_offset + nidx / 8; + page = node->page[off >> PAGE_CACHE_SHIFT]; + data = kmap(page); + off &= ~PAGE_CACHE_MASK; + m = 1 << (~nidx & 7); + byte = data[off]; + if (!(byte & m)) { + pr_crit("trying to free free bnode %u(%d)\n", + node->this, node->type); + kunmap(page); + hfs_bnode_put(node); + return; + } + data[off] = byte & ~m; + set_page_dirty(page); + kunmap(page); + hfs_bnode_put(node); + tree->free_nodes++; + mark_inode_dirty(tree->inode); +} diff --git a/kernel/fs/hfs/btree.h b/kernel/fs/hfs/btree.h new file mode 100644 index 000000000..f6bd266d7 --- /dev/null +++ b/kernel/fs/hfs/btree.h @@ -0,0 +1,163 @@ +/* + * linux/fs/hfs/btree.h + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + */ + +#include "hfs_fs.h" + +typedef int (*btree_keycmp)(const btree_key *, const btree_key *); + +#define NODE_HASH_SIZE 256 + +/* A HFS BTree held in memory */ +struct hfs_btree { + struct super_block *sb; + struct inode *inode; + btree_keycmp keycmp; + + u32 cnid; + u32 root; + u32 leaf_count; + u32 leaf_head; + u32 leaf_tail; + u32 node_count; + u32 free_nodes; + u32 attributes; + + unsigned int node_size; + unsigned int node_size_shift; + unsigned int max_key_len; + unsigned int depth; + + //unsigned int map1_size, map_size; + struct mutex tree_lock; + + unsigned int pages_per_bnode; + spinlock_t hash_lock; + struct hfs_bnode *node_hash[NODE_HASH_SIZE]; + int node_hash_cnt; +}; + +/* A HFS BTree node in memory */ +struct hfs_bnode { + struct hfs_btree *tree; + + u32 prev; + u32 this; + u32 next; + u32 parent; + + u16 num_recs; + u8 type; + u8 height; + + struct hfs_bnode *next_hash; + unsigned long flags; + wait_queue_head_t lock_wq; + atomic_t refcnt; + unsigned int page_offset; + struct page *page[0]; +}; + +#define HFS_BNODE_ERROR 0 +#define HFS_BNODE_NEW 1 +#define HFS_BNODE_DELETED 2 + +struct hfs_find_data { + btree_key *key; + btree_key *search_key; + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int record; + int keyoffset, keylength; + int entryoffset, entrylength; +}; + + +/* btree.c */ +extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp); +extern void hfs_btree_close(struct hfs_btree *); +extern void hfs_btree_write(struct hfs_btree *); +extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *); +extern void hfs_bmap_free(struct hfs_bnode *node); + +/* bnode.c */ +extern void hfs_bnode_read(struct hfs_bnode *, void *, int, int); +extern u16 hfs_bnode_read_u16(struct hfs_bnode *, int); +extern u8 hfs_bnode_read_u8(struct hfs_bnode *, int); +extern void hfs_bnode_read_key(struct hfs_bnode *, void *, int); +extern void hfs_bnode_write(struct hfs_bnode *, void *, int, int); +extern void hfs_bnode_write_u16(struct hfs_bnode *, int, u16); +extern void hfs_bnode_write_u8(struct hfs_bnode *, int, u8); +extern void hfs_bnode_clear(struct hfs_bnode *, int, int); +extern void hfs_bnode_copy(struct hfs_bnode *, int, + struct hfs_bnode *, int, int); +extern void hfs_bnode_move(struct hfs_bnode *, int, int, int); +extern void hfs_bnode_dump(struct hfs_bnode *); +extern void hfs_bnode_unlink(struct hfs_bnode *); +extern struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32); +extern struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32); +extern void hfs_bnode_unhash(struct hfs_bnode *); +extern void hfs_bnode_free(struct hfs_bnode *); +extern struct hfs_bnode *hfs_bnode_create(struct hfs_btree *, u32); +extern void hfs_bnode_get(struct hfs_bnode *); +extern void hfs_bnode_put(struct hfs_bnode *); + +/* brec.c */ +extern u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *); +extern u16 hfs_brec_keylen(struct hfs_bnode *, u16); +extern int hfs_brec_insert(struct hfs_find_data *, void *, int); +extern int hfs_brec_remove(struct hfs_find_data *); + +/* bfind.c */ +extern int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); +extern void hfs_find_exit(struct hfs_find_data *); +extern int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); +extern int hfs_brec_find(struct hfs_find_data *); +extern int hfs_brec_read(struct hfs_find_data *, void *, int); +extern int hfs_brec_goto(struct hfs_find_data *, int); + + +struct hfs_bnode_desc { + __be32 next; /* (V) Number of the next node at this level */ + __be32 prev; /* (V) Number of the prev node at this level */ + u8 type; /* (F) The type of node */ + u8 height; /* (F) The level of this node (leaves=1) */ + __be16 num_recs; /* (V) The number of records in this node */ + u16 reserved; +} __packed; + +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ + +struct hfs_btree_header_rec { + __be16 depth; /* (V) The number of levels in this B-tree */ + __be32 root; /* (V) The node number of the root node */ + __be32 leaf_count; /* (V) The number of leaf records */ + __be32 leaf_head; /* (V) The number of the first leaf node */ + __be32 leaf_tail; /* (V) The number of the last leaf node */ + __be16 node_size; /* (F) The number of bytes in a node (=512) */ + __be16 max_key_len; /* (F) The length of a key in an index node */ + __be32 node_count; /* (V) The total number of nodes */ + __be32 free_nodes; /* (V) The number of unused nodes */ + u16 reserved1; + __be32 clump_size; /* (F) clump size. not usually used. */ + u8 btree_type; /* (F) BTree type */ + u8 reserved2; + __be32 attributes; /* (F) attributes */ + u32 reserved3[16]; +} __packed; + +#define BTREE_ATTR_BADCLOSE 0x00000001 /* b-tree not closed properly. not + used by hfsplus. */ +#define HFS_TREE_BIGKEYS 0x00000002 /* key length is u16 instead of u8. + used by hfsplus. */ +#define HFS_TREE_VARIDXKEYS 0x00000004 /* variable key length instead of + max key length. use din catalog + b-tree but not in extents + b-tree (hfsplus). */ diff --git a/kernel/fs/hfs/catalog.c b/kernel/fs/hfs/catalog.c new file mode 100644 index 000000000..db458ee3a --- /dev/null +++ b/kernel/fs/hfs/catalog.c @@ -0,0 +1,366 @@ +/* + * linux/fs/hfs/catalog.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the functions related to the catalog B-tree. + * + * Cache code shamelessly stolen from + * linux/fs/inode.c Copyright (C) 1991, 1992 Linus Torvalds + * re-shamelessly stolen Copyright (C) 1997 Linus Torvalds + */ + +#include "hfs_fs.h" +#include "btree.h" + +/* + * hfs_cat_build_key() + * + * Given the ID of the parent and the name build a search key. + */ +void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, struct qstr *name) +{ + key->cat.reserved = 0; + key->cat.ParID = cpu_to_be32(parent); + if (name) { + hfs_asc2mac(sb, &key->cat.CName, name); + key->key_len = 6 + key->cat.CName.len; + } else { + memset(&key->cat.CName, 0, sizeof(struct hfs_name)); + key->key_len = 6; + } +} + +static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode) +{ + __be32 mtime = hfs_mtime(); + + memset(rec, 0, sizeof(*rec)); + if (S_ISDIR(inode->i_mode)) { + rec->type = HFS_CDR_DIR; + rec->dir.DirID = cpu_to_be32(cnid); + rec->dir.CrDat = mtime; + rec->dir.MdDat = mtime; + rec->dir.BkDat = 0; + rec->dir.UsrInfo.frView = cpu_to_be16(0xff); + return sizeof(struct hfs_cat_dir); + } else { + /* init some fields for the file record */ + rec->type = HFS_CDR_FIL; + rec->file.Flags = HFS_FIL_USED | HFS_FIL_THD; + if (!(inode->i_mode & S_IWUSR)) + rec->file.Flags |= HFS_FIL_LOCK; + rec->file.FlNum = cpu_to_be32(cnid); + rec->file.CrDat = mtime; + rec->file.MdDat = mtime; + rec->file.BkDat = 0; + rec->file.UsrWds.fdType = HFS_SB(inode->i_sb)->s_type; + rec->file.UsrWds.fdCreator = HFS_SB(inode->i_sb)->s_creator; + return sizeof(struct hfs_cat_file); + } +} + +static int hfs_cat_build_thread(struct super_block *sb, + hfs_cat_rec *rec, int type, + u32 parentid, struct qstr *name) +{ + rec->type = type; + memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved)); + rec->thread.ParID = cpu_to_be32(parentid); + hfs_asc2mac(sb, &rec->thread.CName, name); + return sizeof(struct hfs_cat_thread); +} + +/* + * create_entry() + * + * Add a new file or directory to the catalog B-tree and + * return a (struct hfs_cat_entry) for it in '*result'. + */ +int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) +{ + struct hfs_find_data fd; + struct super_block *sb; + union hfs_cat_rec entry; + int entry_size; + int err; + + hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + str->name, cnid, inode->i_nlink); + if (dir->i_size >= HFS_MAX_VALENCE) + return -ENOSPC; + + sb = dir->i_sb; + err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (err) + return err; + + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? + HFS_CDR_THD : HFS_CDR_FTH, + dir->i_ino, str); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto err2; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err2; + + hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); + entry_size = hfs_cat_build_record(&entry, cnid, inode); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + /* panic? */ + if (!err) + err = -EEXIST; + goto err1; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err1; + + dir->i_size++; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + hfs_find_exit(&fd); + return 0; + +err1: + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + if (!hfs_brec_find(&fd)) + hfs_brec_remove(&fd); +err2: + hfs_find_exit(&fd); + return err; +} + +/* + * hfs_cat_compare() + * + * Description: + * This is the comparison function used for the catalog B-tree. In + * comparing catalog B-tree entries, the parent id is the most + * significant field (compared as unsigned ints). The name field is + * the least significant (compared in "Macintosh lexical order", + * see hfs_strcmp() in string.c) + * Input Variable(s): + * struct hfs_cat_key *key1: pointer to the first key to compare + * struct hfs_cat_key *key2: pointer to the second key to compare + * Output Variable(s): + * NONE + * Returns: + * int: negative if key1key2, and 0 if key1==key2 + * Preconditions: + * key1 and key2 point to "valid" (struct hfs_cat_key)s. + * Postconditions: + * This function has no side-effects + */ +int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) +{ + __be32 k1p, k2p; + + k1p = key1->cat.ParID; + k2p = key2->cat.ParID; + + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; + + return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, + key2->cat.CName.name, key2->cat.CName.len); +} + +/* Try to get a catalog entry for given catalog id */ +// move to read_super??? +int hfs_cat_find_brec(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd) +{ + hfs_cat_rec rec; + int res, len, type; + + hfs_cat_build_key(sb, fd->search_key, cnid, NULL); + res = hfs_brec_read(fd, &rec, sizeof(rec)); + if (res) + return res; + + type = rec.type; + if (type != HFS_CDR_THD && type != HFS_CDR_FTH) { + pr_err("found bad thread record in catalog\n"); + return -EIO; + } + + fd->search_key->cat.ParID = rec.thread.ParID; + len = fd->search_key->cat.CName.len = rec.thread.CName.len; + if (len > HFS_NAMELEN) { + pr_err("bad catalog namelength\n"); + return -EIO; + } + memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); + return hfs_brec_find(fd); +} + + +/* + * hfs_cat_delete() + * + * Delete the indicated file or directory. + * The associated thread is also removed unless ('with_thread'==0). + */ +int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) +{ + struct super_block *sb; + struct hfs_find_data fd; + struct list_head *pos; + int res, type; + + hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + sb = dir->i_sb; + res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (res) + return res; + + hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); + res = hfs_brec_find(&fd); + if (res) + goto out; + + type = hfs_bnode_read_u8(fd.bnode, fd.entryoffset); + if (type == HFS_CDR_FIL) { + struct hfs_cat_file file; + hfs_bnode_read(fd.bnode, &file, fd.entryoffset, sizeof(file)); + if (be32_to_cpu(file.FlNum) == cnid) { +#if 0 + hfs_free_fork(sb, &file, HFS_FK_DATA); +#endif + hfs_free_fork(sb, &file, HFS_FK_RSRC); + } + } + + list_for_each(pos, &HFS_I(dir)->open_dir_list) { + struct hfs_readdir_data *rd = + list_entry(pos, struct hfs_readdir_data, list); + if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) + rd->file->f_pos--; + } + + res = hfs_brec_remove(&fd); + if (res) + goto out; + + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + res = hfs_brec_find(&fd); + if (!res) { + res = hfs_brec_remove(&fd); + if (res) + goto out; + } + + dir->i_size--; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + res = 0; +out: + hfs_find_exit(&fd); + + return res; +} + +/* + * hfs_cat_move() + * + * Rename a file or directory, possibly to a new directory. + * If the destination exists it is removed and a + * (struct hfs_cat_entry) for it is returned in '*result'. + */ +int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name) +{ + struct super_block *sb; + struct hfs_find_data src_fd, dst_fd; + union hfs_cat_rec entry; + int entry_size, type; + int err; + + hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + cnid, src_dir->i_ino, src_name->name, + dst_dir->i_ino, dst_name->name); + sb = src_dir->i_sb; + err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + if (err) + return err; + dst_fd = src_fd; + + /* find the old dir entry and read the data */ + hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, + src_fd.entrylength); + + /* create new dir entry with the data from the old entry */ + hfs_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + + err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength); + if (err) + goto out; + dst_dir->i_size++; + dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dst_dir); + + /* finally remove the old entry */ + hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + src_dir->i_size--; + src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(src_dir); + + type = entry.type; + if (type == HFS_CDR_FIL && !(entry.file.Flags & HFS_FIL_THD)) + goto out; + + /* remove old thread entry */ + hfs_cat_build_key(sb, src_fd.search_key, cnid, NULL); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + + /* create new thread entry */ + hfs_cat_build_key(sb, dst_fd.search_key, cnid, NULL); + entry_size = hfs_cat_build_thread(sb, &entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD, + dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + err = hfs_brec_insert(&dst_fd, &entry, entry_size); +out: + hfs_bnode_put(dst_fd.bnode); + hfs_find_exit(&src_fd); + return err; +} diff --git a/kernel/fs/hfs/dir.c b/kernel/fs/hfs/dir.c new file mode 100644 index 000000000..70788e038 --- /dev/null +++ b/kernel/fs/hfs/dir.c @@ -0,0 +1,319 @@ +/* + * linux/fs/hfs/dir.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains directory-related functions independent of which + * scheme is being used to represent forks. + * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include "hfs_fs.h" +#include "btree.h" + +/* + * hfs_lookup() + */ +static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + hfs_cat_rec rec; + struct hfs_find_data fd; + struct inode *inode = NULL; + int res; + + res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + if (res) + return ERR_PTR(res); + hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); + res = hfs_brec_read(&fd, &rec, sizeof(rec)); + if (res) { + hfs_find_exit(&fd); + if (res == -ENOENT) { + /* No such entry */ + inode = NULL; + goto done; + } + return ERR_PTR(res); + } + inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); + hfs_find_exit(&fd); + if (!inode) + return ERR_PTR(-EACCES); +done: + d_add(dentry, inode); + return NULL; +} + +/* + * hfs_readdir + */ +static int hfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + int len, err; + char strbuf[HFS_MAX_NAMELEN]; + union hfs_cat_rec entry; + struct hfs_find_data fd; + struct hfs_readdir_data *rd; + u16 type; + + if (ctx->pos >= inode->i_size) + return 0; + + err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (err) + return err; + hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + if (ctx->pos == 0) { + /* This is completely artificial... */ + if (!dir_emit_dot(file, ctx)) + goto out; + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); + if (entry.type != HFS_CDR_THD) { + pr_err("bad catalog folder thread\n"); + err = -EIO; + goto out; + } + //if (fd.entrylength < HFS_MIN_THREAD_SZ) { + // pr_err("truncated catalog thread\n"); + // err = -EIO; + // goto out; + //} + if (!dir_emit(ctx, "..", 2, + be32_to_cpu(entry.thread.ParID), DT_DIR)) + goto out; + ctx->pos = 2; + } + if (ctx->pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, ctx->pos - 1); + if (err) + goto out; + + for (;;) { + if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { + pr_err("walked past end of dir\n"); + err = -EIO; + goto out; + } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); + type = entry.type; + len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); + if (type == HFS_CDR_DIR) { + if (fd.entrylength < sizeof(struct hfs_cat_dir)) { + pr_err("small dir entry\n"); + err = -EIO; + goto out; + } + if (!dir_emit(ctx, strbuf, len, + be32_to_cpu(entry.dir.DirID), DT_DIR)) + break; + } else if (type == HFS_CDR_FIL) { + if (fd.entrylength < sizeof(struct hfs_cat_file)) { + pr_err("small file entry\n"); + err = -EIO; + goto out; + } + if (!dir_emit(ctx, strbuf, len, + be32_to_cpu(entry.file.FlNum), DT_REG)) + break; + } else { + pr_err("bad catalog entry type %d\n", type); + err = -EIO; + goto out; + } + ctx->pos++; + if (ctx->pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, 1); + if (err) + goto out; + } + rd = file->private_data; + if (!rd) { + rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); + if (!rd) { + err = -ENOMEM; + goto out; + } + file->private_data = rd; + rd->file = file; + list_add(&rd->list, &HFS_I(inode)->open_dir_list); + } + memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); +out: + hfs_find_exit(&fd); + return err; +} + +static int hfs_dir_release(struct inode *inode, struct file *file) +{ + struct hfs_readdir_data *rd = file->private_data; + if (rd) { + mutex_lock(&inode->i_mutex); + list_del(&rd->list); + mutex_unlock(&inode->i_mutex); + kfree(rd); + } + return 0; +} + +/* + * hfs_create() + * + * This is the create() entry in the inode_operations structure for + * regular HFS directories. The purpose is to create a new file in + * a directory and return a corresponding inode, given the inode for + * the directory and the name (and its length) of the new file. + */ +static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + int res; + + inode = hfs_new_inode(dir, &dentry->d_name, mode); + if (!inode) + return -ENOMEM; + + res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); + if (res) { + clear_nlink(inode); + hfs_delete_inode(inode); + iput(inode); + return res; + } + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_mkdir() + * + * This is the mkdir() entry in the inode_operations structure for + * regular HFS directories. The purpose is to create a new directory + * in a directory, given the inode for the parent directory and the + * name (and its length) of the new directory. + */ +static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + int res; + + inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); + if (!inode) + return -ENOMEM; + + res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); + if (res) { + clear_nlink(inode); + hfs_delete_inode(inode); + iput(inode); + return res; + } + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_remove() + * + * This serves as both unlink() and rmdir() in the inode_operations + * structure for regular HFS directories. The purpose is to delete + * an existing child, given the inode for the parent directory and + * the name (and its length) of the existing directory. + * + * HFS does not have hardlinks, so both rmdir and unlink set the + * link count to 0. The only difference is the emptiness check. + */ +static int hfs_remove(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int res; + + if (S_ISDIR(inode->i_mode) && inode->i_size != 2) + return -ENOTEMPTY; + res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); + if (res) + return res; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + hfs_delete_inode(inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_rename() + * + * This is the rename() entry in the inode_operations structure for + * regular HFS directories. The purpose is to rename an existing + * file or directory, given the inode for the current directory and + * the name (and its length) of the existing file/directory and the + * inode for the new directory and the name (and its length) of the + * new file/directory. + * XXX: how do you handle must_be dir? + */ +static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int res; + + /* Unlink destination if it already exists */ + if (d_really_is_positive(new_dentry)) { + res = hfs_remove(new_dir, new_dentry); + if (res) + return res; + } + + res = hfs_cat_move(d_inode(old_dentry)->i_ino, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + if (!res) + hfs_cat_build_key(old_dir->i_sb, + (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key, + new_dir->i_ino, &new_dentry->d_name); + return res; +} + +const struct file_operations hfs_dir_operations = { + .read = generic_read_dir, + .iterate = hfs_readdir, + .llseek = generic_file_llseek, + .release = hfs_dir_release, +}; + +const struct inode_operations hfs_dir_inode_operations = { + .create = hfs_create, + .lookup = hfs_lookup, + .unlink = hfs_remove, + .mkdir = hfs_mkdir, + .rmdir = hfs_remove, + .rename = hfs_rename, + .setattr = hfs_inode_setattr, +}; diff --git a/kernel/fs/hfs/extent.c b/kernel/fs/hfs/extent.c new file mode 100644 index 000000000..e33a0d36a --- /dev/null +++ b/kernel/fs/hfs/extent.c @@ -0,0 +1,545 @@ +/* + * linux/fs/hfs/extent.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the functions related to the extents B-tree. + */ + +#include + +#include "hfs_fs.h" +#include "btree.h" + +/*================ File-local functions ================*/ + +/* + * build_key + */ +static void hfs_ext_build_key(hfs_btree_key *key, u32 cnid, u16 block, u8 type) +{ + key->key_len = 7; + key->ext.FkType = type; + key->ext.FNum = cpu_to_be32(cnid); + key->ext.FABN = cpu_to_be16(block); +} + +/* + * hfs_ext_compare() + * + * Description: + * This is the comparison function used for the extents B-tree. In + * comparing extent B-tree entries, the file id is the most + * significant field (compared as unsigned ints); the fork type is + * the second most significant field (compared as unsigned chars); + * and the allocation block number field is the least significant + * (compared as unsigned ints). + * Input Variable(s): + * struct hfs_ext_key *key1: pointer to the first key to compare + * struct hfs_ext_key *key2: pointer to the second key to compare + * Output Variable(s): + * NONE + * Returns: + * int: negative if key1key2, and 0 if key1==key2 + * Preconditions: + * key1 and key2 point to "valid" (struct hfs_ext_key)s. + * Postconditions: + * This function has no side-effects */ +int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2) +{ + __be32 fnum1, fnum2; + __be16 block1, block2; + + fnum1 = key1->ext.FNum; + fnum2 = key2->ext.FNum; + if (fnum1 != fnum2) + return be32_to_cpu(fnum1) < be32_to_cpu(fnum2) ? -1 : 1; + if (key1->ext.FkType != key2->ext.FkType) + return key1->ext.FkType < key2->ext.FkType ? -1 : 1; + + block1 = key1->ext.FABN; + block2 = key2->ext.FABN; + if (block1 == block2) + return 0; + return be16_to_cpu(block1) < be16_to_cpu(block2) ? -1 : 1; +} + +/* + * hfs_ext_find_block + * + * Find a block within an extent record + */ +static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off) +{ + int i; + u16 count; + + for (i = 0; i < 3; ext++, i++) { + count = be16_to_cpu(ext->count); + if (off < count) + return be16_to_cpu(ext->block) + off; + off -= count; + } + /* panic? */ + return 0; +} + +static int hfs_ext_block_count(struct hfs_extent *ext) +{ + int i; + u16 count = 0; + + for (i = 0; i < 3; ext++, i++) + count += be16_to_cpu(ext->count); + return count; +} + +static u16 hfs_ext_lastblock(struct hfs_extent *ext) +{ + int i; + + ext += 2; + for (i = 0; i < 2; ext--, i++) + if (ext->count) + break; + return be16_to_cpu(ext->block) + be16_to_cpu(ext->count); +} + +static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) +{ + int res; + + hfs_ext_build_key(fd->search_key, inode->i_ino, HFS_I(inode)->cached_start, + HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); + res = hfs_brec_find(fd); + if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { + if (res != -ENOENT) + return res; + hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + } else { + if (res) + return res; + hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); + HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; + } + return 0; +} + +int hfs_ext_write_extent(struct inode *inode) +{ + struct hfs_find_data fd; + int res = 0; + + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { + res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + if (res) + return res; + res = __hfs_ext_write_extent(inode, &fd); + hfs_find_exit(&fd); + } + return res; +} + +static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, + u32 cnid, u32 block, u8 type) +{ + int res; + + hfs_ext_build_key(fd->search_key, cnid, block, type); + fd->key->ext.FNum = 0; + res = hfs_brec_find(fd); + if (res && res != -ENOENT) + return res; + if (fd->key->ext.FNum != fd->search_key->ext.FNum || + fd->key->ext.FkType != fd->search_key->ext.FkType) + return -ENOENT; + if (fd->entrylength != sizeof(hfs_extent_rec)) + return -EIO; + hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfs_extent_rec)); + return 0; +} + +static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) +{ + int res; + + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { + res = __hfs_ext_write_extent(inode, fd); + if (res) + return res; + } + + res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, + block, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); + if (!res) { + HFS_I(inode)->cached_start = be16_to_cpu(fd->key->ext.FABN); + HFS_I(inode)->cached_blocks = hfs_ext_block_count(HFS_I(inode)->cached_extents); + } else { + HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + } + return res; +} + +static int hfs_ext_read_extent(struct inode *inode, u16 block) +{ + struct hfs_find_data fd; + int res; + + if (block >= HFS_I(inode)->cached_start && + block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) + return 0; + + res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + if (!res) { + res = __hfs_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + } + return res; +} + +static void hfs_dump_extent(struct hfs_extent *extent) +{ + int i; + + hfs_dbg(EXTENT, " "); + for (i = 0; i < 3; i++) + hfs_dbg_cont(EXTENT, " %u:%u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg_cont(EXTENT, "\n"); +} + +static int hfs_add_extent(struct hfs_extent *extent, u16 offset, + u16 alloc_block, u16 block_count) +{ + u16 count, start; + int i; + + hfs_dump_extent(extent); + for (i = 0; i < 3; extent++, i++) { + count = be16_to_cpu(extent->count); + if (offset == count) { + start = be16_to_cpu(extent->block); + if (alloc_block != start + count) { + if (++i >= 3) + return -ENOSPC; + extent++; + extent->block = cpu_to_be16(alloc_block); + } else + block_count += count; + extent->count = cpu_to_be16(block_count); + return 0; + } else if (offset < count) + break; + offset -= count; + } + /* panic? */ + return -EIO; +} + +static int hfs_free_extents(struct super_block *sb, struct hfs_extent *extent, + u16 offset, u16 block_nr) +{ + u16 count, start; + int i; + + hfs_dump_extent(extent); + for (i = 0; i < 3; extent++, i++) { + count = be16_to_cpu(extent->count); + if (offset == count) + goto found; + else if (offset < count) + break; + offset -= count; + } + /* panic? */ + return -EIO; +found: + for (;;) { + start = be16_to_cpu(extent->block); + if (count <= block_nr) { + hfs_clear_vbm_bits(sb, start, count); + extent->block = 0; + extent->count = 0; + block_nr -= count; + } else { + count -= block_nr; + hfs_clear_vbm_bits(sb, start + count, block_nr); + extent->count = cpu_to_be16(count); + block_nr = 0; + } + if (!block_nr || !i) + return 0; + i--; + extent--; + count = be16_to_cpu(extent->count); + } +} + +int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) +{ + struct hfs_find_data fd; + u32 total_blocks, blocks, start; + u32 cnid = be32_to_cpu(file->FlNum); + struct hfs_extent *extent; + int res, i; + + if (type == HFS_FK_DATA) { + total_blocks = be32_to_cpu(file->PyLen); + extent = file->ExtRec; + } else { + total_blocks = be32_to_cpu(file->RPyLen); + extent = file->RExtRec; + } + total_blocks /= HFS_SB(sb)->alloc_blksz; + if (!total_blocks) + return 0; + + blocks = 0; + for (i = 0; i < 3; extent++, i++) + blocks += be16_to_cpu(extent[i].count); + + res = hfs_free_extents(sb, extent, blocks, blocks); + if (res) + return res; + if (total_blocks == blocks) + return 0; + + res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + if (res) + return res; + do { + res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); + if (res) + break; + start = be16_to_cpu(fd.key->ext.FABN); + hfs_free_extents(sb, extent, total_blocks - start, total_blocks); + hfs_brec_remove(&fd); + total_blocks = start; + } while (total_blocks > blocks); + hfs_find_exit(&fd); + + return res; +} + +/* + * hfs_get_block + */ +int hfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb; + u16 dblock, ablock; + int res; + + sb = inode->i_sb; + /* Convert inode block to disk allocation block */ + ablock = (u32)block / HFS_SB(sb)->fs_div; + + if (block >= HFS_I(inode)->fs_blocks) { + if (block > HFS_I(inode)->fs_blocks || !create) + return -EIO; + if (ablock >= HFS_I(inode)->alloc_blocks) { + res = hfs_extend_file(inode); + if (res) + return res; + } + } else + create = 0; + + if (ablock < HFS_I(inode)->first_blocks) { + dblock = hfs_ext_find_block(HFS_I(inode)->first_extents, ablock); + goto done; + } + + mutex_lock(&HFS_I(inode)->extents_lock); + res = hfs_ext_read_extent(inode, ablock); + if (!res) + dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, + ablock - HFS_I(inode)->cached_start); + else { + mutex_unlock(&HFS_I(inode)->extents_lock); + return -EIO; + } + mutex_unlock(&HFS_I(inode)->extents_lock); + +done: + map_bh(bh_result, sb, HFS_SB(sb)->fs_start + + dblock * HFS_SB(sb)->fs_div + + (u32)block % HFS_SB(sb)->fs_div); + + if (create) { + set_buffer_new(bh_result); + HFS_I(inode)->phys_size += sb->s_blocksize; + HFS_I(inode)->fs_blocks++; + inode_add_bytes(inode, sb->s_blocksize); + mark_inode_dirty(inode); + } + return 0; +} + +int hfs_extend_file(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + u32 start, len, goal; + int res; + + mutex_lock(&HFS_I(inode)->extents_lock); + if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) + goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); + else { + res = hfs_ext_read_extent(inode, HFS_I(inode)->alloc_blocks); + if (res) + goto out; + goal = hfs_ext_lastblock(HFS_I(inode)->cached_extents); + } + + len = HFS_I(inode)->clump_blocks; + start = hfs_vbm_search_free(sb, goal, &len); + if (!len) { + res = -ENOSPC; + goto out; + } + + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { + if (!HFS_I(inode)->first_blocks) { + hfs_dbg(EXTENT, "first extents\n"); + /* no extents yet */ + HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); + HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); + res = 0; + } else { + /* try to append to extents in inode */ + res = hfs_add_extent(HFS_I(inode)->first_extents, + HFS_I(inode)->alloc_blocks, + start, len); + if (res == -ENOSPC) + goto insert_extent; + } + if (!res) { + hfs_dump_extent(HFS_I(inode)->first_extents); + HFS_I(inode)->first_blocks += len; + } + } else { + res = hfs_add_extent(HFS_I(inode)->cached_extents, + HFS_I(inode)->alloc_blocks - + HFS_I(inode)->cached_start, + start, len); + if (!res) { + hfs_dump_extent(HFS_I(inode)->cached_extents); + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; + HFS_I(inode)->cached_blocks += len; + } else if (res == -ENOSPC) + goto insert_extent; + } +out: + mutex_unlock(&HFS_I(inode)->extents_lock); + if (!res) { + HFS_I(inode)->alloc_blocks += len; + mark_inode_dirty(inode); + if (inode->i_ino < HFS_FIRSTUSER_CNID) + set_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags); + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + hfs_mark_mdb_dirty(sb); + } + return res; + +insert_extent: + hfs_dbg(EXTENT, "insert new extent\n"); + res = hfs_ext_write_extent(inode); + if (res) + goto out; + + memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); + HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); + HFS_I(inode)->cached_extents[0].count = cpu_to_be16(len); + hfs_dump_extent(HFS_I(inode)->cached_extents); + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW; + HFS_I(inode)->cached_start = HFS_I(inode)->alloc_blocks; + HFS_I(inode)->cached_blocks = len; + + res = 0; + goto out; +} + +void hfs_file_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + u16 blk_cnt, alloc_cnt, start; + u32 size; + int res; + + hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + inode->i_ino, (long long)HFS_I(inode)->phys_size, + inode->i_size); + if (inode->i_size > HFS_I(inode)->phys_size) { + struct address_space *mapping = inode->i_mapping; + void *fsdata; + struct page *page; + + /* XXX: Can use generic_cont_expand? */ + size = inode->i_size - 1; + res = pagecache_write_begin(NULL, mapping, size+1, 0, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (!res) { + res = pagecache_write_end(NULL, mapping, size+1, 0, 0, + page, fsdata); + } + if (res) + inode->i_size = HFS_I(inode)->phys_size; + return; + } else if (inode->i_size == HFS_I(inode)->phys_size) + return; + size = inode->i_size + HFS_SB(sb)->alloc_blksz - 1; + blk_cnt = size / HFS_SB(sb)->alloc_blksz; + alloc_cnt = HFS_I(inode)->alloc_blocks; + if (blk_cnt == alloc_cnt) + goto out; + + mutex_lock(&HFS_I(inode)->extents_lock); + res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + if (res) { + mutex_unlock(&HFS_I(inode)->extents_lock); + /* XXX: We lack error handling of hfs_file_truncate() */ + return; + } + while (1) { + if (alloc_cnt == HFS_I(inode)->first_blocks) { + hfs_free_extents(sb, HFS_I(inode)->first_extents, + alloc_cnt, alloc_cnt - blk_cnt); + hfs_dump_extent(HFS_I(inode)->first_extents); + HFS_I(inode)->first_blocks = blk_cnt; + break; + } + res = __hfs_ext_cache_extent(&fd, inode, alloc_cnt); + if (res) + break; + start = HFS_I(inode)->cached_start; + hfs_free_extents(sb, HFS_I(inode)->cached_extents, + alloc_cnt - start, alloc_cnt - blk_cnt); + hfs_dump_extent(HFS_I(inode)->cached_extents); + if (blk_cnt > start) { + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; + break; + } + alloc_cnt = start; + HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + hfs_brec_remove(&fd); + } + hfs_find_exit(&fd); + mutex_unlock(&HFS_I(inode)->extents_lock); + + HFS_I(inode)->alloc_blocks = blk_cnt; +out: + HFS_I(inode)->phys_size = inode->i_size; + HFS_I(inode)->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); + mark_inode_dirty(inode); +} diff --git a/kernel/fs/hfs/hfs.h b/kernel/fs/hfs/hfs.h new file mode 100644 index 000000000..6f194d076 --- /dev/null +++ b/kernel/fs/hfs/hfs.h @@ -0,0 +1,289 @@ +/* + * linux/fs/hfs/hfs.h + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + */ + +#ifndef _HFS_H +#define _HFS_H + +/* offsets to various blocks */ +#define HFS_DD_BLK 0 /* Driver Descriptor block */ +#define HFS_PMAP_BLK 1 /* First block of partition map */ +#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ + +/* magic numbers for various disk blocks */ +#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ +#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ +#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ +#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ +#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ + +/* various FIXED size parameters */ +#define HFS_SECTOR_SIZE 512 /* size of an HFS sector */ +#define HFS_SECTOR_SIZE_BITS 9 /* log_2(HFS_SECTOR_SIZE) */ +#define HFS_NAMELEN 31 /* maximum length of an HFS filename */ +#define HFS_MAX_NAMELEN 128 +#define HFS_MAX_VALENCE 32767U + +/* Meanings of the drAtrb field of the MDB, + * Reference: _Inside Macintosh: Files_ p. 2-61 + */ +#define HFS_SB_ATTRIB_HLOCK (1 << 7) +#define HFS_SB_ATTRIB_UNMNT (1 << 8) +#define HFS_SB_ATTRIB_SPARED (1 << 9) +#define HFS_SB_ATTRIB_INCNSTNT (1 << 11) +#define HFS_SB_ATTRIB_SLOCK (1 << 15) + +/* Some special File ID numbers */ +#define HFS_POR_CNID 1 /* Parent Of the Root */ +#define HFS_ROOT_CNID 2 /* ROOT directory */ +#define HFS_EXT_CNID 3 /* EXTents B-tree */ +#define HFS_CAT_CNID 4 /* CATalog B-tree */ +#define HFS_BAD_CNID 5 /* BAD blocks file */ +#define HFS_ALLOC_CNID 6 /* ALLOCation file (HFS+) */ +#define HFS_START_CNID 7 /* STARTup file (HFS+) */ +#define HFS_ATTR_CNID 8 /* ATTRibutes file (HFS+) */ +#define HFS_EXCH_CNID 15 /* ExchangeFiles temp id */ +#define HFS_FIRSTUSER_CNID 16 + +/* values for hfs_cat_rec.cdrType */ +#define HFS_CDR_DIR 0x01 /* folder (directory) */ +#define HFS_CDR_FIL 0x02 /* file */ +#define HFS_CDR_THD 0x03 /* folder (directory) thread */ +#define HFS_CDR_FTH 0x04 /* file thread */ + +/* legal values for hfs_ext_key.FkType and hfs_file.fork */ +#define HFS_FK_DATA 0x00 +#define HFS_FK_RSRC 0xFF + +/* bits in hfs_fil_entry.Flags */ +#define HFS_FIL_LOCK 0x01 /* locked */ +#define HFS_FIL_THD 0x02 /* file thread */ +#define HFS_FIL_DOPEN 0x04 /* data fork open */ +#define HFS_FIL_ROPEN 0x08 /* resource fork open */ +#define HFS_FIL_DIR 0x10 /* directory (always clear) */ +#define HFS_FIL_NOCOPY 0x40 /* copy-protected file */ +#define HFS_FIL_USED 0x80 /* open */ + +/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */ +#define HFS_DIR_LOCK 0x01 /* locked */ +#define HFS_DIR_THD 0x02 /* directory thread */ +#define HFS_DIR_INEXPFOLDER 0x04 /* in a shared area */ +#define HFS_DIR_MOUNTED 0x08 /* mounted */ +#define HFS_DIR_DIR 0x10 /* directory (always set) */ +#define HFS_DIR_EXPFOLDER 0x20 /* share point */ + +/* bits hfs_finfo.fdFlags */ +#define HFS_FLG_INITED 0x0100 +#define HFS_FLG_LOCKED 0x1000 +#define HFS_FLG_INVISIBLE 0x4000 + +/*======== HFS structures as they appear on the disk ========*/ + +/* Pascal-style string of up to 31 characters */ +struct hfs_name { + u8 len; + u8 name[HFS_NAMELEN]; +} __packed; + +struct hfs_point { + __be16 v; + __be16 h; +} __packed; + +struct hfs_rect { + __be16 top; + __be16 left; + __be16 bottom; + __be16 right; +} __packed; + +struct hfs_finfo { + __be32 fdType; + __be32 fdCreator; + __be16 fdFlags; + struct hfs_point fdLocation; + __be16 fdFldr; +} __packed; + +struct hfs_fxinfo { + __be16 fdIconID; + u8 fdUnused[8]; + __be16 fdComment; + __be32 fdPutAway; +} __packed; + +struct hfs_dinfo { + struct hfs_rect frRect; + __be16 frFlags; + struct hfs_point frLocation; + __be16 frView; +} __packed; + +struct hfs_dxinfo { + struct hfs_point frScroll; + __be32 frOpenChain; + __be16 frUnused; + __be16 frComment; + __be32 frPutAway; +} __packed; + +union hfs_finder_info { + struct { + struct hfs_finfo finfo; + struct hfs_fxinfo fxinfo; + } file; + struct { + struct hfs_dinfo dinfo; + struct hfs_dxinfo dxinfo; + } dir; +} __packed; + +/* Cast to a pointer to a generic bkey */ +#define HFS_BKEY(X) (((void)((X)->KeyLen)), ((struct hfs_bkey *)(X))) + +/* The key used in the catalog b-tree: */ +struct hfs_cat_key { + u8 key_len; /* number of bytes in the key */ + u8 reserved; /* padding */ + __be32 ParID; /* CNID of the parent dir */ + struct hfs_name CName; /* The filename of the entry */ +} __packed; + +/* The key used in the extents b-tree: */ +struct hfs_ext_key { + u8 key_len; /* number of bytes in the key */ + u8 FkType; /* HFS_FK_{DATA,RSRC} */ + __be32 FNum; /* The File ID of the file */ + __be16 FABN; /* allocation blocks number*/ +} __packed; + +typedef union hfs_btree_key { + u8 key_len; /* number of bytes in the key */ + struct hfs_cat_key cat; + struct hfs_ext_key ext; +} hfs_btree_key; + +#define HFS_MAX_CAT_KEYLEN (sizeof(struct hfs_cat_key) - sizeof(u8)) +#define HFS_MAX_EXT_KEYLEN (sizeof(struct hfs_ext_key) - sizeof(u8)) + +typedef union hfs_btree_key btree_key; + +struct hfs_extent { + __be16 block; + __be16 count; +}; +typedef struct hfs_extent hfs_extent_rec[3]; + +/* The catalog record for a file */ +struct hfs_cat_file { + s8 type; /* The type of entry */ + u8 reserved; + u8 Flags; /* Flags such as read-only */ + s8 Typ; /* file version number = 0 */ + struct hfs_finfo UsrWds; /* data used by the Finder */ + __be32 FlNum; /* The CNID */ + __be16 StBlk; /* obsolete */ + __be32 LgLen; /* The logical EOF of the data fork*/ + __be32 PyLen; /* The physical EOF of the data fork */ + __be16 RStBlk; /* obsolete */ + __be32 RLgLen; /* The logical EOF of the rsrc fork */ + __be32 RPyLen; /* The physical EOF of the rsrc fork */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modified date */ + __be32 BkDat; /* The last backup date */ + struct hfs_fxinfo FndrInfo; /* more data for the Finder */ + __be16 ClpSize; /* number of bytes to allocate + when extending files */ + hfs_extent_rec ExtRec; /* first extent record + for the data fork */ + hfs_extent_rec RExtRec; /* first extent record + for the resource fork */ + u32 Resrv; /* reserved by Apple */ +} __packed; + +/* the catalog record for a directory */ +struct hfs_cat_dir { + s8 type; /* The type of entry */ + u8 reserved; + __be16 Flags; /* flags */ + __be16 Val; /* Valence: number of files and + dirs in the directory */ + __be32 DirID; /* The CNID */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modification date */ + __be32 BkDat; /* The last backup date */ + struct hfs_dinfo UsrInfo; /* data used by the Finder */ + struct hfs_dxinfo FndrInfo; /* more data used by Finder */ + u8 Resrv[16]; /* reserved by Apple */ +} __packed; + +/* the catalog record for a thread */ +struct hfs_cat_thread { + s8 type; /* The type of entry */ + u8 reserved[9]; /* reserved by Apple */ + __be32 ParID; /* CNID of parent directory */ + struct hfs_name CName; /* The name of this entry */ +} __packed; + +/* A catalog tree record */ +typedef union hfs_cat_rec { + s8 type; /* The type of entry */ + struct hfs_cat_file file; + struct hfs_cat_dir dir; + struct hfs_cat_thread thread; +} hfs_cat_rec; + +struct hfs_mdb { + __be16 drSigWord; /* Signature word indicating fs type */ + __be32 drCrDate; /* fs creation date/time */ + __be32 drLsMod; /* fs modification date/time */ + __be16 drAtrb; /* fs attributes */ + __be16 drNmFls; /* number of files in root directory */ + __be16 drVBMSt; /* location (in 512-byte blocks) + of the volume bitmap */ + __be16 drAllocPtr; /* location (in allocation blocks) + to begin next allocation search */ + __be16 drNmAlBlks; /* number of allocation blocks */ + __be32 drAlBlkSiz; /* bytes in an allocation block */ + __be32 drClpSiz; /* clumpsize, the number of bytes to + allocate when extending a file */ + __be16 drAlBlSt; /* location (in 512-byte blocks) + of the first allocation block */ + __be32 drNxtCNID; /* CNID to assign to the next + file or directory created */ + __be16 drFreeBks; /* number of free allocation blocks */ + u8 drVN[28]; /* the volume label */ + __be32 drVolBkUp; /* fs backup date/time */ + __be16 drVSeqNum; /* backup sequence number */ + __be32 drWrCnt; /* fs write count */ + __be32 drXTClpSiz; /* clumpsize for the extents B-tree */ + __be32 drCTClpSiz; /* clumpsize for the catalog B-tree */ + __be16 drNmRtDirs; /* number of directories in + the root directory */ + __be32 drFilCnt; /* number of files in the fs */ + __be32 drDirCnt; /* number of directories in the fs */ + u8 drFndrInfo[32]; /* data used by the Finder */ + __be16 drEmbedSigWord; /* embedded volume signature */ + __be32 drEmbedExtent; /* starting block number (xdrStABN) + and number of allocation blocks + (xdrNumABlks) occupied by embedded + volume */ + __be32 drXTFlSize; /* bytes in the extents B-tree */ + hfs_extent_rec drXTExtRec; /* extents B-tree's first 3 extents */ + __be32 drCTFlSize; /* bytes in the catalog B-tree */ + hfs_extent_rec drCTExtRec; /* catalog B-tree's first 3 extents */ +} __packed; + +/*======== Data structures kept in memory ========*/ + +struct hfs_readdir_data { + struct list_head list; + struct file *file; + struct hfs_cat_key key; +}; + +#endif diff --git a/kernel/fs/hfs/hfs_fs.h b/kernel/fs/hfs/hfs_fs.h new file mode 100644 index 000000000..95d255219 --- /dev/null +++ b/kernel/fs/hfs/hfs_fs.h @@ -0,0 +1,290 @@ +/* + * linux/fs/hfs/hfs_fs.h + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + */ + +#ifndef _LINUX_HFS_FS_H +#define _LINUX_HFS_FS_H + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "hfs.h" + +#define DBG_BNODE_REFS 0x00000001 +#define DBG_BNODE_MOD 0x00000002 +#define DBG_CAT_MOD 0x00000004 +#define DBG_INODE 0x00000008 +#define DBG_SUPER 0x00000010 +#define DBG_EXTENT 0x00000020 +#define DBG_BITMAP 0x00000040 + +//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) +//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) +//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) +#define DBG_MASK (0) + +#define hfs_dbg(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +} while (0) + +#define hfs_dbg_cont(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + pr_cont(fmt, ##__VA_ARGS__); \ +} while (0) + + +/* + * struct hfs_inode_info + * + * The HFS-specific part of a Linux (struct inode) + */ +struct hfs_inode_info { + atomic_t opencnt; + + unsigned int flags; + + /* to deal with localtime ugliness */ + int tz_secondswest; + + struct hfs_cat_key cat_key; + + struct list_head open_dir_list; + struct inode *rsrc_inode; + + struct mutex extents_lock; + + u16 alloc_blocks, clump_blocks; + sector_t fs_blocks; + /* Allocation extents from catlog record or volume header */ + hfs_extent_rec first_extents; + u16 first_blocks; + hfs_extent_rec cached_extents; + u16 cached_start, cached_blocks; + + loff_t phys_size; + struct inode vfs_inode; +}; + +#define HFS_FLG_RSRC 0x0001 +#define HFS_FLG_EXT_DIRTY 0x0002 +#define HFS_FLG_EXT_NEW 0x0004 + +#define HFS_IS_RSRC(inode) (HFS_I(inode)->flags & HFS_FLG_RSRC) + +/* + * struct hfs_sb_info + * + * The HFS-specific part of a Linux (struct super_block) + */ +struct hfs_sb_info { + struct buffer_head *mdb_bh; /* The hfs_buffer + holding the real + superblock (aka VIB + or MDB) */ + struct hfs_mdb *mdb; + struct buffer_head *alt_mdb_bh; /* The hfs_buffer holding + the alternate superblock */ + struct hfs_mdb *alt_mdb; + __be32 *bitmap; /* The page holding the + allocation bitmap */ + struct hfs_btree *ext_tree; /* Information about + the extents b-tree */ + struct hfs_btree *cat_tree; /* Information about + the catalog b-tree */ + u32 file_count; /* The number of + regular files in + the filesystem */ + u32 folder_count; /* The number of + directories in the + filesystem */ + u32 next_id; /* The next available + file id number */ + u32 clumpablks; /* The number of allocation + blocks to try to add when + extending a file */ + u32 fs_start; /* The first 512-byte + block represented + in the bitmap */ + u32 part_start; + u16 root_files; /* The number of + regular + (non-directory) + files in the root + directory */ + u16 root_dirs; /* The number of + directories in the + root directory */ + u16 fs_ablocks; /* The number of + allocation blocks + in the filesystem */ + u16 free_ablocks; /* the number of unused + allocation blocks + in the filesystem */ + u32 alloc_blksz; /* The size of an + "allocation block" */ + int s_quiet; /* Silent failure when + changing owner or mode? */ + __be32 s_type; /* Type for new files */ + __be32 s_creator; /* Creator for new files */ + umode_t s_file_umask; /* The umask applied to the + permissions on all files */ + umode_t s_dir_umask; /* The umask applied to the + permissions on all dirs */ + kuid_t s_uid; /* The uid of all files */ + kgid_t s_gid; /* The gid of all files */ + + int session, part; + struct nls_table *nls_io, *nls_disk; + struct mutex bitmap_lock; + unsigned long flags; + u16 blockoffset; + int fs_div; + struct super_block *sb; + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work mdb_work; /* MDB flush delayed work */ + spinlock_t work_lock; /* protects mdb_work and work_queued */ +}; + +#define HFS_FLG_BITMAP_DIRTY 0 +#define HFS_FLG_MDB_DIRTY 1 +#define HFS_FLG_ALT_MDB_DIRTY 2 + +/* bitmap.c */ +extern u32 hfs_vbm_search_free(struct super_block *, u32, u32 *); +extern int hfs_clear_vbm_bits(struct super_block *, u16, u16); + +/* catalog.c */ +extern int hfs_cat_keycmp(const btree_key *, const btree_key *); +struct hfs_find_data; +extern int hfs_cat_find_brec(struct super_block *, u32, struct hfs_find_data *); +extern int hfs_cat_create(u32, struct inode *, struct qstr *, struct inode *); +extern int hfs_cat_delete(u32, struct inode *, struct qstr *); +extern int hfs_cat_move(u32, struct inode *, struct qstr *, + struct inode *, struct qstr *); +extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *); + +/* dir.c */ +extern const struct file_operations hfs_dir_operations; +extern const struct inode_operations hfs_dir_inode_operations; + +/* extent.c */ +extern int hfs_ext_keycmp(const btree_key *, const btree_key *); +extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); +extern int hfs_ext_write_extent(struct inode *); +extern int hfs_extend_file(struct inode *); +extern void hfs_file_truncate(struct inode *); + +extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); + +/* inode.c */ +extern const struct address_space_operations hfs_aops; +extern const struct address_space_operations hfs_btree_aops; + +extern struct inode *hfs_new_inode(struct inode *, struct qstr *, umode_t); +extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); +extern int hfs_write_inode(struct inode *, struct writeback_control *); +extern int hfs_inode_setattr(struct dentry *, struct iattr *); +extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, + __be32 log_size, __be32 phys_size, u32 clump_size); +extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *); +extern void hfs_evict_inode(struct inode *); +extern void hfs_delete_inode(struct inode *); + +/* attr.c */ +extern int hfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern ssize_t hfs_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* mdb.c */ +extern int hfs_mdb_get(struct super_block *); +extern void hfs_mdb_commit(struct super_block *); +extern void hfs_mdb_close(struct super_block *); +extern void hfs_mdb_put(struct super_block *); + +/* part_tbl.c */ +extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); + +/* string.c */ +extern const struct dentry_operations hfs_dentry_operations; + +extern int hfs_hash_dentry(const struct dentry *, struct qstr *); +extern int hfs_strcmp(const unsigned char *, unsigned int, + const unsigned char *, unsigned int); +extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name); + +/* trans.c */ +extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); +extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *); + +/* super.c */ +extern void hfs_mark_mdb_dirty(struct super_block *sb); + +/* + * There are two time systems. Both are based on seconds since + * a particular time/date. + * Unix: unsigned lil-endian since 00:00 GMT, Jan. 1, 1970 + * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 + * + */ +#define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) +#define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) + +#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode)) +#define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) + +#define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } +#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) +#define hfs_mtime() __hfs_u_to_mtime(get_seconds()) + +static inline const char *hfs_mdb_name(struct super_block *sb) +{ + return sb->s_id; +} + +static inline void hfs_bitmap_dirty(struct super_block *sb) +{ + set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags); + hfs_mark_mdb_dirty(sb); +} + +#define sb_bread512(sb, sec, data) ({ \ + struct buffer_head *__bh; \ + sector_t __block; \ + loff_t __start; \ + int __offset; \ + \ + __start = (loff_t)(sec) << HFS_SECTOR_SIZE_BITS;\ + __block = __start >> (sb)->s_blocksize_bits; \ + __offset = __start & ((sb)->s_blocksize - 1); \ + __bh = sb_bread((sb), __block); \ + if (likely(__bh != NULL)) \ + data = (void *)(__bh->b_data + __offset);\ + else \ + data = NULL; \ + __bh; \ +}) + +#endif diff --git a/kernel/fs/hfs/inode.c b/kernel/fs/hfs/inode.c new file mode 100644 index 000000000..b99ebddb1 --- /dev/null +++ b/kernel/fs/hfs/inode.c @@ -0,0 +1,692 @@ +/* + * linux/fs/hfs/inode.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains inode-related functions which do not depend on + * which scheme is being used to represent forks. + * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +static const struct file_operations hfs_file_operations; +static const struct inode_operations hfs_file_inode_operations; + +/*================ Variable-like macros ================*/ + +#define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO) + +static int hfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, hfs_get_block, wbc); +} + +static int hfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, hfs_get_block); +} + +static void hfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + hfs_file_truncate(inode); + } +} + +static int hfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hfs_get_block, + &HFS_I(mapping->host)->phys_size); + if (unlikely(ret)) + hfs_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t hfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, hfs_get_block); +} + +static int hfs_releasepage(struct page *page, gfp_t mask) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct hfs_btree *tree; + struct hfs_bnode *node; + u32 nidx; + int i, res = 1; + + switch (inode->i_ino) { + case HFS_EXT_CNID: + tree = HFS_SB(sb)->ext_tree; + break; + case HFS_CAT_CNID: + tree = HFS_SB(sb)->cat_tree; + break; + default: + BUG(); + return 0; + } + + if (!tree) + return 0; + + if (tree->node_size >= PAGE_CACHE_SIZE) { + nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, nidx); + if (!node) + ; + else if (atomic_read(&node->refcnt)) + res = 0; + if (res && node) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } + spin_unlock(&tree->hash_lock); + } else { + nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); + i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); + spin_lock(&tree->hash_lock); + do { + node = hfs_bnode_findhash(tree, nidx++); + if (!node) + continue; + if (atomic_read(&node->refcnt)) { + res = 0; + break; + } + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } while (--i && nidx < tree->node_count); + spin_unlock(&tree->hash_lock); + } + return res ? try_to_free_buffers(page) : 0; +} + +static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = file_inode(file)->i_mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if (end > isize) + hfs_write_failed(mapping, end); + } + + return ret; +} + +static int hfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, hfs_get_block); +} + +const struct address_space_operations hfs_btree_aops = { + .readpage = hfs_readpage, + .writepage = hfs_writepage, + .write_begin = hfs_write_begin, + .write_end = generic_write_end, + .bmap = hfs_bmap, + .releasepage = hfs_releasepage, +}; + +const struct address_space_operations hfs_aops = { + .readpage = hfs_readpage, + .writepage = hfs_writepage, + .write_begin = hfs_write_begin, + .write_end = generic_write_end, + .bmap = hfs_bmap, + .direct_IO = hfs_direct_IO, + .writepages = hfs_writepages, +}; + +/* + * hfs_new_inode + */ +struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = new_inode(sb); + if (!inode) + return NULL; + + mutex_init(&HFS_I(inode)->extents_lock); + INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); + hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); + inode->i_ino = HFS_SB(sb)->next_id++; + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + set_nlink(inode, 1); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + HFS_I(inode)->flags = 0; + HFS_I(inode)->rsrc_inode = NULL; + HFS_I(inode)->fs_blocks = 0; + if (S_ISDIR(mode)) { + inode->i_size = 2; + HFS_SB(sb)->folder_count++; + if (dir->i_ino == HFS_ROOT_CNID) + HFS_SB(sb)->root_dirs++; + inode->i_op = &hfs_dir_inode_operations; + inode->i_fop = &hfs_dir_operations; + inode->i_mode |= S_IRWXUGO; + inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; + } else if (S_ISREG(mode)) { + HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; + HFS_SB(sb)->file_count++; + if (dir->i_ino == HFS_ROOT_CNID) + HFS_SB(sb)->root_files++; + inode->i_op = &hfs_file_inode_operations; + inode->i_fop = &hfs_file_operations; + inode->i_mapping->a_ops = &hfs_aops; + inode->i_mode |= S_IRUGO|S_IXUGO; + if (mode & S_IWUSR) + inode->i_mode |= S_IWUGO; + inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask; + HFS_I(inode)->phys_size = 0; + HFS_I(inode)->alloc_blocks = 0; + HFS_I(inode)->first_blocks = 0; + HFS_I(inode)->cached_start = 0; + HFS_I(inode)->cached_blocks = 0; + memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec)); + memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); + } + insert_inode_hash(inode); + mark_inode_dirty(inode); + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + hfs_mark_mdb_dirty(sb); + + return inode; +} + +void hfs_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); + if (S_ISDIR(inode->i_mode)) { + HFS_SB(sb)->folder_count--; + if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) + HFS_SB(sb)->root_dirs--; + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + hfs_mark_mdb_dirty(sb); + return; + } + HFS_SB(sb)->file_count--; + if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) + HFS_SB(sb)->root_files--; + if (S_ISREG(inode->i_mode)) { + if (!inode->i_nlink) { + inode->i_size = 0; + hfs_file_truncate(inode); + } + } + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + hfs_mark_mdb_dirty(sb); +} + +void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, + __be32 __log_size, __be32 phys_size, u32 clump_size) +{ + struct super_block *sb = inode->i_sb; + u32 log_size = be32_to_cpu(__log_size); + u16 count; + int i; + + memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec)); + for (count = 0, i = 0; i < 3; i++) + count += be16_to_cpu(ext[i].count); + HFS_I(inode)->first_blocks = count; + + inode->i_size = HFS_I(inode)->phys_size = log_size; + HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); + HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) / + HFS_SB(sb)->alloc_blksz; + HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz; + if (!HFS_I(inode)->clump_blocks) + HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; +} + +struct hfs_iget_data { + struct hfs_cat_key *key; + hfs_cat_rec *rec; +}; + +static int hfs_test_inode(struct inode *inode, void *data) +{ + struct hfs_iget_data *idata = data; + hfs_cat_rec *rec; + + rec = idata->rec; + switch (rec->type) { + case HFS_CDR_DIR: + return inode->i_ino == be32_to_cpu(rec->dir.DirID); + case HFS_CDR_FIL: + return inode->i_ino == be32_to_cpu(rec->file.FlNum); + default: + BUG(); + return 1; + } +} + +/* + * hfs_read_inode + */ +static int hfs_read_inode(struct inode *inode, void *data) +{ + struct hfs_iget_data *idata = data; + struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); + hfs_cat_rec *rec; + + HFS_I(inode)->flags = 0; + HFS_I(inode)->rsrc_inode = NULL; + mutex_init(&HFS_I(inode)->extents_lock); + INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); + + /* Initialize the inode */ + inode->i_uid = hsb->s_uid; + inode->i_gid = hsb->s_gid; + set_nlink(inode, 1); + + if (idata->key) + HFS_I(inode)->cat_key = *idata->key; + else + HFS_I(inode)->flags |= HFS_FLG_RSRC; + HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; + + rec = idata->rec; + switch (rec->type) { + case HFS_CDR_FIL: + if (!HFS_IS_RSRC(inode)) { + hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen, + rec->file.PyLen, be16_to_cpu(rec->file.ClpSize)); + } else { + hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen, + rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize)); + } + + inode->i_ino = be32_to_cpu(rec->file.FlNum); + inode->i_mode = S_IRUGO | S_IXUGO; + if (!(rec->file.Flags & HFS_FIL_LOCK)) + inode->i_mode |= S_IWUGO; + inode->i_mode &= ~hsb->s_file_umask; + inode->i_mode |= S_IFREG; + inode->i_ctime = inode->i_atime = inode->i_mtime = + hfs_m_to_utime(rec->file.MdDat); + inode->i_op = &hfs_file_inode_operations; + inode->i_fop = &hfs_file_operations; + inode->i_mapping->a_ops = &hfs_aops; + break; + case HFS_CDR_DIR: + inode->i_ino = be32_to_cpu(rec->dir.DirID); + inode->i_size = be16_to_cpu(rec->dir.Val) + 2; + HFS_I(inode)->fs_blocks = 0; + inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); + inode->i_ctime = inode->i_atime = inode->i_mtime = + hfs_m_to_utime(rec->dir.MdDat); + inode->i_op = &hfs_dir_inode_operations; + inode->i_fop = &hfs_dir_operations; + break; + default: + make_bad_inode(inode); + } + return 0; +} + +/* + * __hfs_iget() + * + * Given the MDB for a HFS filesystem, a 'key' and an 'entry' in + * the catalog B-tree and the 'type' of the desired file return the + * inode for that file/directory or NULL. Note that 'type' indicates + * whether we want the actual file or directory, or the corresponding + * metadata (AppleDouble header file or CAP metadata file). + */ +struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec) +{ + struct hfs_iget_data data = { key, rec }; + struct inode *inode; + u32 cnid; + + switch (rec->type) { + case HFS_CDR_DIR: + cnid = be32_to_cpu(rec->dir.DirID); + break; + case HFS_CDR_FIL: + cnid = be32_to_cpu(rec->file.FlNum); + break; + default: + return NULL; + } + inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); + if (inode && (inode->i_state & I_NEW)) + unlock_new_inode(inode); + return inode; +} + +void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, + __be32 *log_size, __be32 *phys_size) +{ + memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec)); + + if (log_size) + *log_size = cpu_to_be32(inode->i_size); + if (phys_size) + *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks * + HFS_SB(inode->i_sb)->alloc_blksz); +} + +int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct inode *main_inode = inode; + struct hfs_find_data fd; + hfs_cat_rec rec; + int res; + + hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + res = hfs_ext_write_extent(inode); + if (res) + return res; + + if (inode->i_ino < HFS_FIRSTUSER_CNID) { + switch (inode->i_ino) { + case HFS_ROOT_CNID: + break; + case HFS_EXT_CNID: + hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree); + return 0; + case HFS_CAT_CNID: + hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree); + return 0; + default: + BUG(); + return -EIO; + } + } + + if (HFS_IS_RSRC(inode)) + main_inode = HFS_I(inode)->rsrc_inode; + + if (!main_inode->i_nlink) + return 0; + + if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd)) + /* panic? */ + return -EIO; + + fd.search_key->cat = HFS_I(main_inode)->cat_key; + if (hfs_brec_find(&fd)) + /* panic? */ + goto out; + + if (S_ISDIR(main_inode->i_mode)) { + if (fd.entrylength < sizeof(struct hfs_cat_dir)) + /* panic? */; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_dir)); + if (rec.type != HFS_CDR_DIR || + be32_to_cpu(rec.dir.DirID) != inode->i_ino) { + } + + rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime); + rec.dir.Val = cpu_to_be16(inode->i_size - 2); + + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_dir)); + } else if (HFS_IS_RSRC(inode)) { + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + hfs_inode_write_fork(inode, rec.file.RExtRec, + &rec.file.RLgLen, &rec.file.RPyLen); + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } else { + if (fd.entrylength < sizeof(struct hfs_cat_file)) + /* panic? */; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + if (rec.type != HFS_CDR_FIL || + be32_to_cpu(rec.file.FlNum) != inode->i_ino) { + } + + if (inode->i_mode & S_IWUSR) + rec.file.Flags &= ~HFS_FIL_LOCK; + else + rec.file.Flags |= HFS_FIL_LOCK; + hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); + rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime); + + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } +out: + hfs_find_exit(&fd); + return 0; +} + +static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + hfs_cat_rec rec; + struct hfs_find_data fd; + int res; + + if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) + goto out; + + inode = HFS_I(dir)->rsrc_inode; + if (inode) + goto out; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + if (res) { + iput(inode); + return ERR_PTR(res); + } + fd.search_key->cat = HFS_I(dir)->cat_key; + res = hfs_brec_read(&fd, &rec, sizeof(rec)); + if (!res) { + struct hfs_iget_data idata = { NULL, &rec }; + hfs_read_inode(inode, &idata); + } + hfs_find_exit(&fd); + if (res) { + iput(inode); + return ERR_PTR(res); + } + HFS_I(inode)->rsrc_inode = dir; + HFS_I(dir)->rsrc_inode = inode; + igrab(dir); + hlist_add_fake(&inode->i_hash); + mark_inode_dirty(inode); +out: + d_add(dentry, inode); + return NULL; +} + +void hfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { + HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; + iput(HFS_I(inode)->rsrc_inode); + } +} + +static int hfs_file_open(struct inode *inode, struct file *file) +{ + if (HFS_IS_RSRC(inode)) + inode = HFS_I(inode)->rsrc_inode; + atomic_inc(&HFS_I(inode)->opencnt); + return 0; +} + +static int hfs_file_release(struct inode *inode, struct file *file) +{ + //struct super_block *sb = inode->i_sb; + + if (HFS_IS_RSRC(inode)) + inode = HFS_I(inode)->rsrc_inode; + if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { + mutex_lock(&inode->i_mutex); + hfs_file_truncate(inode); + //if (inode->i_flags & S_DEAD) { + // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); + // hfs_delete_inode(inode); + //} + mutex_unlock(&inode->i_mutex); + } + return 0; +} + +/* + * hfs_notify_change() + * + * Based very closely on fs/msdos/inode.c by Werner Almesberger + * + * This is the notify_change() field in the super_operations structure + * for HFS file systems. The purpose is to take that changes made to + * an inode and apply then in a filesystem-dependent manner. In this + * case the process has a few of tasks to do: + * 1) prevent changes to the i_uid and i_gid fields. + * 2) map file permissions to the closest allowable permissions + * 3) Since multiple Linux files can share the same on-disk inode under + * HFS (for instance the data and resource forks of a file) a change + * to permissions must be applied to all other in-core inodes which + * correspond to the same HFS file. + */ + +int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) +{ + struct inode *inode = d_inode(dentry); + struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); + int error; + + error = inode_change_ok(inode, attr); /* basic permission checks */ + if (error) + return error; + + /* no uig/gid changes and limit which mode bits can be set */ + if (((attr->ia_valid & ATTR_UID) && + (!uid_eq(attr->ia_uid, hsb->s_uid))) || + ((attr->ia_valid & ATTR_GID) && + (!gid_eq(attr->ia_gid, hsb->s_gid))) || + ((attr->ia_valid & ATTR_MODE) && + ((S_ISDIR(inode->i_mode) && + (attr->ia_mode != inode->i_mode)) || + (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) { + return hsb->s_quiet ? 0 : error; + } + + if (attr->ia_valid & ATTR_MODE) { + /* Only the 'w' bits can ever change and only all together. */ + if (attr->ia_mode & S_IWUSR) + attr->ia_mode = inode->i_mode | S_IWUGO; + else + attr->ia_mode = inode->i_mode & ~S_IWUGO; + attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask; + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + + truncate_setsize(inode, attr->ia_size); + hfs_file_truncate(inode); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = filp->f_mapping->host; + struct super_block * sb; + int ret, err; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + + /* sync the inode to buffers */ + ret = write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + flush_delayed_work(&HFS_SB(sb)->mdb_work); + /* .. finally sync the buffers to disk */ + err = sync_blockdev(sb->s_bdev); + if (!ret) + ret = err; + mutex_unlock(&inode->i_mutex); + return ret; +} + +static const struct file_operations hfs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .fsync = hfs_file_fsync, + .open = hfs_file_open, + .release = hfs_file_release, +}; + +static const struct inode_operations hfs_file_inode_operations = { + .lookup = hfs_file_lookup, + .setattr = hfs_inode_setattr, + .setxattr = hfs_setxattr, + .getxattr = hfs_getxattr, + .listxattr = hfs_listxattr, +}; diff --git a/kernel/fs/hfs/mdb.c b/kernel/fs/hfs/mdb.c new file mode 100644 index 000000000..aa3f0d6d0 --- /dev/null +++ b/kernel/fs/hfs/mdb.c @@ -0,0 +1,366 @@ +/* + * linux/fs/hfs/mdb.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains functions for reading/writing the MDB. + */ + +#include +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +/*================ File-local data types ================*/ + +/* + * The HFS Master Directory Block (MDB). + * + * Also known as the Volume Information Block (VIB), this structure is + * the HFS equivalent of a superblock. + * + * Reference: _Inside Macintosh: Files_ pages 2-59 through 2-62 + * + * modified for HFS Extended + */ + +static int hfs_get_last_session(struct super_block *sb, + sector_t *start, sector_t *size) +{ + struct cdrom_multisession ms_info; + struct cdrom_tocentry te; + int res; + + /* default values */ + *start = 0; + *size = sb->s_bdev->bd_inode->i_size >> 9; + + if (HFS_SB(sb)->session >= 0) { + te.cdte_track = HFS_SB(sb)->session; + te.cdte_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); + if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { + *start = (sector_t)te.cdte_addr.lba << 2; + return 0; + } + pr_err("invalid session number or type of track\n"); + return -EINVAL; + } + ms_info.addr_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info); + if (!res && ms_info.xa_flag) + *start = (sector_t)ms_info.addr.lba << 2; + return 0; +} + +/* + * hfs_mdb_get() + * + * Build the in-core MDB for a filesystem, including + * the B-trees and the volume bitmap. + */ +int hfs_mdb_get(struct super_block *sb) +{ + struct buffer_head *bh; + struct hfs_mdb *mdb, *mdb2; + unsigned int block; + char *ptr; + int off2, len, size, sect; + sector_t part_start, part_size; + loff_t off; + __be16 attrib; + + /* set the device driver to 512-byte blocks */ + size = sb_min_blocksize(sb, HFS_SECTOR_SIZE); + if (!size) + return -EINVAL; + + if (hfs_get_last_session(sb, &part_start, &part_size)) + return -EINVAL; + while (1) { + /* See if this is an HFS filesystem */ + bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); + if (!bh) + goto out; + + if (mdb->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) + break; + brelse(bh); + + /* check for a partition block + * (should do this only for cdrom/loop though) + */ + if (hfs_part_find(sb, &part_start, &part_size)) + goto out; + } + + HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); + if (!size || (size & (HFS_SECTOR_SIZE - 1))) { + pr_err("bad allocation block size %d\n", size); + goto out_bh; + } + + size = min(HFS_SB(sb)->alloc_blksz, (u32)PAGE_SIZE); + /* size must be a multiple of 512 */ + while (size & (size - 1)) + size -= HFS_SECTOR_SIZE; + sect = be16_to_cpu(mdb->drAlBlSt) + part_start; + /* align block size to first sector */ + while (sect & ((size - 1) >> HFS_SECTOR_SIZE_BITS)) + size >>= 1; + /* align block size to weird alloc size */ + while (HFS_SB(sb)->alloc_blksz & (size - 1)) + size >>= 1; + brelse(bh); + if (!sb_set_blocksize(sb, size)) { + pr_err("unable to set blocksize to %u\n", size); + goto out; + } + + bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); + if (!bh) + goto out; + if (mdb->drSigWord != cpu_to_be16(HFS_SUPER_MAGIC)) + goto out_bh; + + HFS_SB(sb)->mdb_bh = bh; + HFS_SB(sb)->mdb = mdb; + + /* These parameters are read from the MDB, and never written */ + HFS_SB(sb)->part_start = part_start; + HFS_SB(sb)->fs_ablocks = be16_to_cpu(mdb->drNmAlBlks); + HFS_SB(sb)->fs_div = HFS_SB(sb)->alloc_blksz >> sb->s_blocksize_bits; + HFS_SB(sb)->clumpablks = be32_to_cpu(mdb->drClpSiz) / + HFS_SB(sb)->alloc_blksz; + if (!HFS_SB(sb)->clumpablks) + HFS_SB(sb)->clumpablks = 1; + HFS_SB(sb)->fs_start = (be16_to_cpu(mdb->drAlBlSt) + part_start) >> + (sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS); + + /* These parameters are read from and written to the MDB */ + HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); + HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID); + HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls); + HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); + HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt); + HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt); + + /* TRY to get the alternate (backup) MDB. */ + sect = part_start + part_size - 2; + bh = sb_bread512(sb, sect, mdb2); + if (bh) { + if (mdb2->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) { + HFS_SB(sb)->alt_mdb_bh = bh; + HFS_SB(sb)->alt_mdb = mdb2; + } else + brelse(bh); + } + + if (!HFS_SB(sb)->alt_mdb) { + pr_warn("unable to locate alternate MDB\n"); + pr_warn("continuing without an alternate MDB\n"); + } + + HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0); + if (!HFS_SB(sb)->bitmap) + goto out; + + /* read in the bitmap */ + block = be16_to_cpu(mdb->drVBMSt) + part_start; + off = (loff_t)block << HFS_SECTOR_SIZE_BITS; + size = (HFS_SB(sb)->fs_ablocks + 8) / 8; + ptr = (u8 *)HFS_SB(sb)->bitmap; + while (size) { + bh = sb_bread(sb, off >> sb->s_blocksize_bits); + if (!bh) { + pr_err("unable to read volume bitmap\n"); + goto out; + } + off2 = off & (sb->s_blocksize - 1); + len = min((int)sb->s_blocksize - off2, size); + memcpy(ptr, bh->b_data + off2, len); + brelse(bh); + ptr += len; + off += len; + size -= len; + } + + HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); + if (!HFS_SB(sb)->ext_tree) { + pr_err("unable to open extent tree\n"); + goto out; + } + HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); + if (!HFS_SB(sb)->cat_tree) { + pr_err("unable to open catalog tree\n"); + goto out; + } + + attrib = mdb->drAtrb; + if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { + pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { + pr_warn("filesystem is marked locked, mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + if (!(sb->s_flags & MS_RDONLY)) { + /* Mark the volume uncleanly unmounted in case we crash */ + attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); + attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); + mdb->drAtrb = attrib; + be32_add_cpu(&mdb->drWrCnt, 1); + mdb->drLsMod = hfs_mtime(); + + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->mdb_bh); + } + + return 0; + +out_bh: + brelse(bh); +out: + hfs_mdb_put(sb); + return -EIO; +} + +/* + * hfs_mdb_commit() + * + * Description: + * This updates the MDB on disk. + * It does not check, if the superblock has been modified, or + * if the filesystem has been mounted read-only. It is mainly + * called by hfs_sync_fs() and flush_mdb(). + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * int backup; + * Output Variable(s): + * NONE + * Returns: + * void + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * Postconditions: + * The HFS MDB and on disk will be updated, by copying the possibly + * modified fields from the in memory MDB (in native byte order) to + * the disk block buffer. + * If 'backup' is non-zero then the alternate MDB is also written + * and the function doesn't return until it is actually on disk. + */ +void hfs_mdb_commit(struct super_block *sb) +{ + struct hfs_mdb *mdb = HFS_SB(sb)->mdb; + + if (sb->s_flags & MS_RDONLY) + return; + + lock_buffer(HFS_SB(sb)->mdb_bh); + if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) { + /* These parameters may have been modified, so write them back */ + mdb->drLsMod = hfs_mtime(); + mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); + mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id); + mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); + mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); + mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count); + mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count); + + /* write MDB to disk */ + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); + } + + /* write the backup MDB, not returning until it is written. + * we only do this when either the catalog or extents overflow + * files grow. */ + if (test_and_clear_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags) && + HFS_SB(sb)->alt_mdb) { + hfs_inode_write_fork(HFS_SB(sb)->ext_tree->inode, mdb->drXTExtRec, + &mdb->drXTFlSize, NULL); + hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec, + &mdb->drCTFlSize, NULL); + + lock_buffer(HFS_SB(sb)->alt_mdb_bh); + memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE); + HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); + HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); + unlock_buffer(HFS_SB(sb)->alt_mdb_bh); + + mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); + } + + if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { + struct buffer_head *bh; + sector_t block; + char *ptr; + int off, size, len; + + block = be16_to_cpu(HFS_SB(sb)->mdb->drVBMSt) + HFS_SB(sb)->part_start; + off = (block << HFS_SECTOR_SIZE_BITS) & (sb->s_blocksize - 1); + block >>= sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS; + size = (HFS_SB(sb)->fs_ablocks + 7) / 8; + ptr = (u8 *)HFS_SB(sb)->bitmap; + while (size) { + bh = sb_bread(sb, block); + if (!bh) { + pr_err("unable to read volume bitmap\n"); + break; + } + len = min((int)sb->s_blocksize - off, size); + + lock_buffer(bh); + memcpy(bh->b_data + off, ptr, len); + unlock_buffer(bh); + + mark_buffer_dirty(bh); + brelse(bh); + block++; + off = 0; + ptr += len; + size -= len; + } + } + unlock_buffer(HFS_SB(sb)->mdb_bh); +} + +void hfs_mdb_close(struct super_block *sb) +{ + /* update volume attributes */ + if (sb->s_flags & MS_RDONLY) + return; + HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); + HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); +} + +/* + * hfs_mdb_put() + * + * Release the resources associated with the in-core MDB. */ +void hfs_mdb_put(struct super_block *sb) +{ + if (!HFS_SB(sb)) + return; + /* free the B-trees */ + hfs_btree_close(HFS_SB(sb)->ext_tree); + hfs_btree_close(HFS_SB(sb)->cat_tree); + + /* free the buffers holding the primary and alternate MDBs */ + brelse(HFS_SB(sb)->mdb_bh); + brelse(HFS_SB(sb)->alt_mdb_bh); + + unload_nls(HFS_SB(sb)->nls_io); + unload_nls(HFS_SB(sb)->nls_disk); + + free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); + kfree(HFS_SB(sb)); + sb->s_fs_info = NULL; +} diff --git a/kernel/fs/hfs/part_tbl.c b/kernel/fs/hfs/part_tbl.c new file mode 100644 index 000000000..36add537d --- /dev/null +++ b/kernel/fs/hfs/part_tbl.c @@ -0,0 +1,117 @@ +/* + * linux/fs/hfs/part_tbl.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * Original code to handle the new style Mac partition table based on + * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). + */ + +#include "hfs_fs.h" + +/* + * The new style Mac partition map + * + * For each partition on the media there is a physical block (512-byte + * block) containing one of these structures. These blocks are + * contiguous starting at block 1. + */ +struct new_pmap { + __be16 pmSig; /* signature */ + __be16 reSigPad; /* padding */ + __be32 pmMapBlkCnt; /* partition blocks count */ + __be32 pmPyPartStart; /* physical block start of partition */ + __be32 pmPartBlkCnt; /* physical block count of partition */ + u8 pmPartName[32]; /* (null terminated?) string + giving the name of this + partition */ + u8 pmPartType[32]; /* (null terminated?) string + giving the type of this + partition */ + /* a bunch more stuff we don't need */ +} __packed; + +/* + * The old style Mac partition map + * + * The partition map consists for a 2-byte signature followed by an + * array of these structures. The map is terminated with an all-zero + * one of these. + */ +struct old_pmap { + __be16 pdSig; /* Signature bytes */ + struct old_pmap_entry { + __be32 pdStart; + __be32 pdSize; + __be32 pdFSID; + } pdEntry[42]; +} __packed; + +/* + * hfs_part_find() + * + * Parse the partition map looking for the + * start and length of the 'part'th HFS partition. + */ +int hfs_part_find(struct super_block *sb, + sector_t *part_start, sector_t *part_size) +{ + struct buffer_head *bh; + __be16 *data; + int i, size, res; + + res = -ENOENT; + bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data); + if (!bh) + return -EIO; + + switch (be16_to_cpu(*data)) { + case HFS_OLD_PMAP_MAGIC: + { + struct old_pmap *pm; + struct old_pmap_entry *p; + + pm = (struct old_pmap *)bh->b_data; + p = pm->pdEntry; + size = 42; + for (i = 0; i < size; p++, i++) { + if (p->pdStart && p->pdSize && + p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && + (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) { + *part_start += be32_to_cpu(p->pdStart); + *part_size = be32_to_cpu(p->pdSize); + res = 0; + } + } + break; + } + case HFS_NEW_PMAP_MAGIC: + { + struct new_pmap *pm; + + pm = (struct new_pmap *)bh->b_data; + size = be32_to_cpu(pm->pmMapBlkCnt); + for (i = 0; i < size;) { + if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && + (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) { + *part_start += be32_to_cpu(pm->pmPyPartStart); + *part_size = be32_to_cpu(pm->pmPartBlkCnt); + res = 0; + break; + } + brelse(bh); + bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm); + if (!bh) + return -EIO; + if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC)) + break; + } + break; + } + } + brelse(bh); + + return res; +} diff --git a/kernel/fs/hfs/string.c b/kernel/fs/hfs/string.c new file mode 100644 index 000000000..85b610c39 --- /dev/null +++ b/kernel/fs/hfs/string.c @@ -0,0 +1,114 @@ +/* + * linux/fs/hfs/string.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the string comparison function for the + * Macintosh character set. + * + * The code in this file is derived from code which is copyright + * 1986, 1989, 1990 by Abacus Research and Development, Inc. (ARDI) + * It is used here by the permission of ARDI's president Cliff Matthews. + */ + +#include "hfs_fs.h" +#include + +/*================ File-local variables ================*/ + +/* + * unsigned char caseorder[] + * + * Defines the lexical ordering of characters on the Macintosh + * + * Composition of the 'casefold' and 'order' tables from ARDI's code + * with the entry for 0x20 changed to match that for 0xCA to remove + * special case for those two characters. + */ +static unsigned char caseorder[256] = { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, + 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, + 0x20,0x22,0x23,0x28,0x29,0x2A,0x2B,0x2C,0x2F,0x30,0x31,0x32,0x33,0x34,0x35,0x36, + 0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,0x40,0x41,0x42,0x43,0x44,0x45,0x46, + 0x47,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, + 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xA9,0xAA,0xAB,0xAC,0xAD, + 0x4E,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, + 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xAF,0xB0,0xB1,0xB2,0xB3, + 0x4A,0x4C,0x5A,0x60,0x7B,0x7F,0x98,0x4F,0x49,0x51,0x4A,0x4B,0x4C,0x5A,0x60,0x63, + 0x64,0x65,0x6E,0x6F,0x70,0x71,0x7B,0x84,0x85,0x86,0x7F,0x80,0x9A,0x9B,0x9C,0x98, + 0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0x94,0xBB,0xBC,0xBD,0xBE,0xBF,0xC0,0x4D,0x81, + 0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0x55,0x8A,0xCC,0x4D,0x81, + 0xCD,0xCE,0xCF,0xD0,0xD1,0xD2,0xD3,0x26,0x27,0xD4,0x20,0x49,0x4B,0x80,0x82,0x82, + 0xD5,0xD6,0x24,0x25,0x2D,0x2E,0xD7,0xD8,0xA6,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, + 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, + 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF +}; + +/*================ Global functions ================*/ + +/* + * Hash a string to an integer in a case-independent way + */ +int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this) +{ + const unsigned char *name = this->name; + unsigned int hash, len = this->len; + + if (len > HFS_NAMELEN) + len = HFS_NAMELEN; + + hash = init_name_hash(); + for (; len; len--) + hash = partial_name_hash(caseorder[*name++], hash); + this->hash = end_name_hash(hash); + return 0; +} + +/* + * Compare two strings in the HFS filename character ordering + * Returns positive, negative, or zero, not just 0 or (+/-)1 + * + * Equivalent to ARDI's call: + * ROMlib_RelString(s1+1, s2+1, true, false, (s1[0]<<16) | s2[0]) + */ +int hfs_strcmp(const unsigned char *s1, unsigned int len1, + const unsigned char *s2, unsigned int len2) +{ + int len, tmp; + + len = (len1 > len2) ? len2 : len1; + + while (len--) { + tmp = (int)caseorder[*(s1++)] - (int)caseorder[*(s2++)]; + if (tmp) + return tmp; + } + return len1 - len2; +} + +/* + * Test for equality of two strings in the HFS filename character ordering. + * return 1 on failure and 0 on success + */ +int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + const unsigned char *n1, *n2; + + if (len >= HFS_NAMELEN) { + if (name->len < HFS_NAMELEN) + return 1; + len = HFS_NAMELEN; + } else if (len != name->len) + return 1; + + n1 = str; + n2 = name->name; + while (len--) { + if (caseorder[*n1++] != caseorder[*n2++]) + return 1; + } + return 0; +} diff --git a/kernel/fs/hfs/super.c b/kernel/fs/hfs/super.c new file mode 100644 index 000000000..eee7206c3 --- /dev/null +++ b/kernel/fs/hfs/super.c @@ -0,0 +1,508 @@ +/* + * linux/fs/hfs/super.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains hfs_read_super(), some of the super_ops and + * init_hfs_fs() and exit_hfs_fs(). The remaining super_ops are in + * inode.c since they deal with inodes. + * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +static struct kmem_cache *hfs_inode_cachep; + +MODULE_LICENSE("GPL"); + +static int hfs_sync_fs(struct super_block *sb, int wait) +{ + hfs_mdb_commit(sb); + return 0; +} + +/* + * hfs_put_super() + * + * This is the put_super() entry in the super_operations structure for + * HFS filesystems. The purpose is to release the resources + * associated with the superblock sb. + */ +static void hfs_put_super(struct super_block *sb) +{ + cancel_delayed_work_sync(&HFS_SB(sb)->mdb_work); + hfs_mdb_close(sb); + /* release the MDB's resources */ + hfs_mdb_put(sb); +} + +static void flush_mdb(struct work_struct *work) +{ + struct hfs_sb_info *sbi; + struct super_block *sb; + + sbi = container_of(work, struct hfs_sb_info, mdb_work.work); + sb = sbi->sb; + + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + hfs_mdb_commit(sb); +} + +void hfs_mark_mdb_dirty(struct super_block *sb) +{ + struct hfs_sb_info *sbi = HFS_SB(sb); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->mdb_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); +} + +/* + * hfs_statfs() + * + * This is the statfs() entry in the super_operations structure for + * HFS filesystems. The purpose is to return various data about the + * filesystem. + * + * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks. + */ +static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = HFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div; + buf->f_bfree = (u32)HFS_SB(sb)->free_ablocks * HFS_SB(sb)->fs_div; + buf->f_bavail = buf->f_bfree; + buf->f_files = HFS_SB(sb)->fs_ablocks; + buf->f_ffree = HFS_SB(sb)->free_ablocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = HFS_NAMELEN; + + return 0; +} + +static int hfs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_NODIRATIME; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (!(*flags & MS_RDONLY)) { + if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { + pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { + pr_warn("filesystem is marked locked, leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } + } + return 0; +} + +static int hfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct hfs_sb_info *sbi = HFS_SB(root->d_sb); + + if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) + seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) + seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_printf(seq, ",uid=%u,gid=%u", + from_kuid_munged(&init_user_ns, sbi->s_uid), + from_kgid_munged(&init_user_ns, sbi->s_gid)); + if (sbi->s_file_umask != 0133) + seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); + if (sbi->s_dir_umask != 0022) + seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask); + if (sbi->part >= 0) + seq_printf(seq, ",part=%u", sbi->part); + if (sbi->session >= 0) + seq_printf(seq, ",session=%u", sbi->session); + if (sbi->nls_disk) + seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset); + if (sbi->nls_io) + seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset); + if (sbi->s_quiet) + seq_printf(seq, ",quiet"); + return 0; +} + +static struct inode *hfs_alloc_inode(struct super_block *sb) +{ + struct hfs_inode_info *i; + + i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL); + return i ? &i->vfs_inode : NULL; +} + +static void hfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); +} + +static void hfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfs_i_callback); +} + +static const struct super_operations hfs_super_operations = { + .alloc_inode = hfs_alloc_inode, + .destroy_inode = hfs_destroy_inode, + .write_inode = hfs_write_inode, + .evict_inode = hfs_evict_inode, + .put_super = hfs_put_super, + .sync_fs = hfs_sync_fs, + .statfs = hfs_statfs, + .remount_fs = hfs_remount, + .show_options = hfs_show_options, +}; + +enum { + opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, + opt_part, opt_session, opt_type, opt_creator, opt_quiet, + opt_codepage, opt_iocharset, + opt_err +}; + +static const match_table_t tokens = { + { opt_uid, "uid=%u" }, + { opt_gid, "gid=%u" }, + { opt_umask, "umask=%o" }, + { opt_file_umask, "file_umask=%o" }, + { opt_dir_umask, "dir_umask=%o" }, + { opt_part, "part=%u" }, + { opt_session, "session=%u" }, + { opt_type, "type=%s" }, + { opt_creator, "creator=%s" }, + { opt_quiet, "quiet" }, + { opt_codepage, "codepage=%s" }, + { opt_iocharset, "iocharset=%s" }, + { opt_err, NULL } +}; + +static inline int match_fourchar(substring_t *arg, u32 *result) +{ + if (arg->to - arg->from != 4) + return -EINVAL; + memcpy(result, arg->from, 4); + return 0; +} + +/* + * parse_options() + * + * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger + * This function is called by hfs_read_super() to parse the mount options. + */ +static int parse_options(char *options, struct hfs_sb_info *hsb) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int tmp, token; + + /* initialize the sb with defaults */ + hsb->s_uid = current_uid(); + hsb->s_gid = current_gid(); + hsb->s_file_umask = 0133; + hsb->s_dir_umask = 0022; + hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ + hsb->s_quiet = 0; + hsb->part = -1; + hsb->session = -1; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_uid: + if (match_int(&args[0], &tmp)) { + pr_err("uid requires an argument\n"); + return 0; + } + hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); + if (!uid_valid(hsb->s_uid)) { + pr_err("invalid uid %d\n", tmp); + return 0; + } + break; + case opt_gid: + if (match_int(&args[0], &tmp)) { + pr_err("gid requires an argument\n"); + return 0; + } + hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); + if (!gid_valid(hsb->s_gid)) { + pr_err("invalid gid %d\n", tmp); + return 0; + } + break; + case opt_umask: + if (match_octal(&args[0], &tmp)) { + pr_err("umask requires a value\n"); + return 0; + } + hsb->s_file_umask = (umode_t)tmp; + hsb->s_dir_umask = (umode_t)tmp; + break; + case opt_file_umask: + if (match_octal(&args[0], &tmp)) { + pr_err("file_umask requires a value\n"); + return 0; + } + hsb->s_file_umask = (umode_t)tmp; + break; + case opt_dir_umask: + if (match_octal(&args[0], &tmp)) { + pr_err("dir_umask requires a value\n"); + return 0; + } + hsb->s_dir_umask = (umode_t)tmp; + break; + case opt_part: + if (match_int(&args[0], &hsb->part)) { + pr_err("part requires an argument\n"); + return 0; + } + break; + case opt_session: + if (match_int(&args[0], &hsb->session)) { + pr_err("session requires an argument\n"); + return 0; + } + break; + case opt_type: + if (match_fourchar(&args[0], &hsb->s_type)) { + pr_err("type requires a 4 character value\n"); + return 0; + } + break; + case opt_creator: + if (match_fourchar(&args[0], &hsb->s_creator)) { + pr_err("creator requires a 4 character value\n"); + return 0; + } + break; + case opt_quiet: + hsb->s_quiet = 1; + break; + case opt_codepage: + if (hsb->nls_disk) { + pr_err("unable to change codepage\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + hsb->nls_disk = load_nls(p); + if (!hsb->nls_disk) { + pr_err("unable to load codepage \"%s\"\n", p); + kfree(p); + return 0; + } + kfree(p); + break; + case opt_iocharset: + if (hsb->nls_io) { + pr_err("unable to change iocharset\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + hsb->nls_io = load_nls(p); + if (!hsb->nls_io) { + pr_err("unable to load iocharset \"%s\"\n", p); + kfree(p); + return 0; + } + kfree(p); + break; + default: + return 0; + } + } + + if (hsb->nls_disk && !hsb->nls_io) { + hsb->nls_io = load_nls_default(); + if (!hsb->nls_io) { + pr_err("unable to load default iocharset\n"); + return 0; + } + } + hsb->s_dir_umask &= 0777; + hsb->s_file_umask &= 0577; + + return 1; +} + +/* + * hfs_read_super() + * + * This is the function that is responsible for mounting an HFS + * filesystem. It performs all the tasks necessary to get enough data + * from the disk to read the root inode. This includes parsing the + * mount options, dealing with Macintosh partitions, reading the + * superblock and the allocation bitmap blocks, calling + * hfs_btree_init() to get the necessary data about the extents and + * catalog B-trees and, finally, reading the root inode into memory. + */ +static int hfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct hfs_sb_info *sbi; + struct hfs_find_data fd; + hfs_cat_rec rec; + struct inode *root_inode; + int res; + + sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->sb = sb; + sb->s_fs_info = sbi; + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb); + + res = -EINVAL; + if (!parse_options((char *)data, sbi)) { + pr_err("unable to parse mount options\n"); + goto bail; + } + + sb->s_op = &hfs_super_operations; + sb->s_flags |= MS_NODIRATIME; + mutex_init(&sbi->bitmap_lock); + + res = hfs_mdb_get(sb); + if (res) { + if (!silent) + pr_warn("can't find a HFS filesystem on dev %s\n", + hfs_mdb_name(sb)); + res = -EINVAL; + goto bail; + } + + /* try to get the root inode */ + res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (res) + goto bail_no_root; + res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); + if (!res) { + if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + res = -EIO; + goto bail; + } + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + } + if (res) { + hfs_find_exit(&fd); + goto bail_no_root; + } + res = -EINVAL; + root_inode = hfs_iget(sb, &fd.search_key->cat, &rec); + hfs_find_exit(&fd); + if (!root_inode) + goto bail_no_root; + + sb->s_d_op = &hfs_dentry_operations; + res = -ENOMEM; + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) + goto bail_no_root; + + /* everything's okay */ + return 0; + +bail_no_root: + pr_err("get root inode failed\n"); +bail: + hfs_mdb_put(sb); + return res; +} + +static struct dentry *hfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); +} + +static struct file_system_type hfs_fs_type = { + .owner = THIS_MODULE, + .name = "hfs", + .mount = hfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("hfs"); + +static void hfs_init_once(void *p) +{ + struct hfs_inode_info *i = p; + + inode_init_once(&i->vfs_inode); +} + +static int __init init_hfs_fs(void) +{ + int err; + + hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", + sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, + hfs_init_once); + if (!hfs_inode_cachep) + return -ENOMEM; + err = register_filesystem(&hfs_fs_type); + if (err) + kmem_cache_destroy(hfs_inode_cachep); + return err; +} + +static void __exit exit_hfs_fs(void) +{ + unregister_filesystem(&hfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(hfs_inode_cachep); +} + +module_init(init_hfs_fs) +module_exit(exit_hfs_fs) diff --git a/kernel/fs/hfs/sysdep.c b/kernel/fs/hfs/sysdep.c new file mode 100644 index 000000000..2875961fd --- /dev/null +++ b/kernel/fs/hfs/sysdep.c @@ -0,0 +1,45 @@ +/* + * linux/fs/hfs/sysdep.c + * + * Copyright (C) 1996 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the code to do various system dependent things. + */ + +#include +#include "hfs_fs.h" + +/* dentry case-handling: just lowercase everything */ + +static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) +{ + struct inode *inode; + int diff; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = d_inode(dentry); + if(!inode) + return 1; + + /* fix up inode on a timezone change */ + diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest; + if (diff) { + inode->i_ctime.tv_sec += diff; + inode->i_atime.tv_sec += diff; + inode->i_mtime.tv_sec += diff; + HFS_I(inode)->tz_secondswest += diff; + } + return 1; +} + +const struct dentry_operations hfs_dentry_operations = +{ + .d_revalidate = hfs_revalidate_dentry, + .d_hash = hfs_hash_dentry, + .d_compare = hfs_compare_dentry, +}; + diff --git a/kernel/fs/hfs/trans.c b/kernel/fs/hfs/trans.c new file mode 100644 index 000000000..b1ce4c7ad --- /dev/null +++ b/kernel/fs/hfs/trans.c @@ -0,0 +1,150 @@ +/* + * linux/fs/hfs/trans.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains routines for converting between the Macintosh + * character set and various other encodings. This includes dealing + * with ':' vs. '/' as the path-element separator. + */ + +#include +#include + +#include "hfs_fs.h" + +/*================ Global functions ================*/ + +/* + * hfs_mac2asc() + * + * Given a 'Pascal String' (a string preceded by a length byte) in + * the Macintosh character set produce the corresponding filename using + * the 'trivial' name-mangling scheme, returning the length of the + * mangled filename. Note that the output string is not NULL + * terminated. + * + * The name-mangling works as follows: + * The character '/', which is illegal in Linux filenames is replaced + * by ':' which never appears in HFS filenames. All other characters + * are passed unchanged from input to output. + */ +int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) +{ + struct nls_table *nls_disk = HFS_SB(sb)->nls_disk; + struct nls_table *nls_io = HFS_SB(sb)->nls_io; + const char *src; + char *dst; + int srclen, dstlen, size; + + src = in->name; + srclen = in->len; + if (srclen > HFS_NAMELEN) + srclen = HFS_NAMELEN; + dst = out; + dstlen = HFS_MAX_NAMELEN; + if (nls_io) { + wchar_t ch; + + while (srclen > 0) { + if (nls_disk) { + size = nls_disk->char2uni(src, srclen, &ch); + if (size <= 0) { + ch = '?'; + size = 1; + } + src += size; + srclen -= size; + } else { + ch = *src++; + srclen--; + } + if (ch == '/') + ch = ':'; + size = nls_io->uni2char(ch, dst, dstlen); + if (size < 0) { + if (size == -ENAMETOOLONG) + goto out; + *dst = '?'; + size = 1; + } + dst += size; + dstlen -= size; + } + } else { + char ch; + + while (--srclen >= 0) + *dst++ = (ch = *src++) == '/' ? ':' : ch; + } +out: + return dst - out; +} + +/* + * hfs_asc2mac() + * + * Given an ASCII string (not null-terminated) and its length, + * generate the corresponding filename in the Macintosh character set + * using the 'trivial' name-mangling scheme, returning the length of + * the mangled filename. Note that the output string is not NULL + * terminated. + * + * This routine is a inverse to hfs_mac2triv(). + * A ':' is replaced by a '/'. + */ +void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, struct qstr *in) +{ + struct nls_table *nls_disk = HFS_SB(sb)->nls_disk; + struct nls_table *nls_io = HFS_SB(sb)->nls_io; + const char *src; + char *dst; + int srclen, dstlen, size; + + src = in->name; + srclen = in->len; + dst = out->name; + dstlen = HFS_NAMELEN; + if (nls_io) { + wchar_t ch; + + while (srclen > 0) { + size = nls_io->char2uni(src, srclen, &ch); + if (size < 0) { + ch = '?'; + size = 1; + } + src += size; + srclen -= size; + if (ch == ':') + ch = '/'; + if (nls_disk) { + size = nls_disk->uni2char(ch, dst, dstlen); + if (size < 0) { + if (size == -ENAMETOOLONG) + goto out; + *dst = '?'; + size = 1; + } + dst += size; + dstlen -= size; + } else { + *dst++ = ch > 0xff ? '?' : ch; + dstlen--; + } + } + } else { + char ch; + + if (dstlen > srclen) + dstlen = srclen; + while (--dstlen >= 0) + *dst++ = (ch = *src++) == ':' ? '/' : ch; + } +out: + out->len = dst - (char *)out->name; + dstlen = HFS_NAMELEN - out->len; + while (--dstlen >= 0) + *dst++ = 0; +} diff --git a/kernel/fs/hfsplus/Kconfig b/kernel/fs/hfsplus/Kconfig new file mode 100644 index 000000000..24bc20fd4 --- /dev/null +++ b/kernel/fs/hfsplus/Kconfig @@ -0,0 +1,31 @@ +config HFSPLUS_FS + tristate "Apple Extended HFS file system support" + depends on BLOCK + select NLS + select NLS_UTF8 + help + If you say Y here, you will be able to mount extended format + Macintosh-formatted hard drive partitions with full read-write access. + + This file system is often called HFS+ and was introduced with + MacOS 8. It includes all Mac specific filesystem data such as + data forks and creator codes, but it also has several UNIX + style features such as file ownership and permissions. + +config HFSPLUS_FS_POSIX_ACL + bool "HFS+ POSIX Access Control Lists" + depends on HFSPLUS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + It needs to understand that POSIX ACLs are treated only under + Linux. POSIX ACLs doesn't mean something under Mac OS X. + Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, + which are part of the NFSv4 standard. + + If you don't know what Access Control Lists are, say N diff --git a/kernel/fs/hfsplus/Makefile b/kernel/fs/hfsplus/Makefile new file mode 100644 index 000000000..683fca2e5 --- /dev/null +++ b/kernel/fs/hfsplus/Makefile @@ -0,0 +1,11 @@ +# +## Makefile for the linux hfsplus filesystem routines. +# + +obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o + +hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ + bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ + attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o + +hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL) += posix_acl.o diff --git a/kernel/fs/hfsplus/acl.h b/kernel/fs/hfsplus/acl.h new file mode 100644 index 000000000..95c8ed9ec --- /dev/null +++ b/kernel/fs/hfsplus/acl.h @@ -0,0 +1,27 @@ +/* + * linux/fs/hfsplus/acl.h + * + * Vyacheslav Dubeyko + * + * Handler for Posix Access Control Lists (ACLs) support. + */ + +#include + +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + +/* posix_acl.c */ +struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type); +extern int hfsplus_init_posix_acl(struct inode *, struct inode *); + +#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ +#define hfsplus_get_posix_acl NULL +#define hfsplus_set_posix_acl NULL + +static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ diff --git a/kernel/fs/hfsplus/attributes.c b/kernel/fs/hfsplus/attributes.c new file mode 100644 index 000000000..e5b221de7 --- /dev/null +++ b/kernel/fs/hfsplus/attributes.c @@ -0,0 +1,371 @@ +/* + * linux/fs/hfsplus/attributes.c + * + * Vyacheslav Dubeyko + * + * Handling of records in attributes tree + */ + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static struct kmem_cache *hfsplus_attr_tree_cachep; + +int __init hfsplus_create_attr_tree_cache(void) +{ + if (hfsplus_attr_tree_cachep) + return -EEXIST; + + hfsplus_attr_tree_cachep = + kmem_cache_create("hfsplus_attr_cache", + sizeof(hfsplus_attr_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!hfsplus_attr_tree_cachep) + return -ENOMEM; + + return 0; +} + +void hfsplus_destroy_attr_tree_cache(void) +{ + kmem_cache_destroy(hfsplus_attr_tree_cachep); +} + +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1_cnid, k2_cnid; + + k1_cnid = k1->attr.cnid; + k2_cnid = k2->attr.cnid; + if (k1_cnid != k2_cnid) + return be32_to_cpu(k1_cnid) < be32_to_cpu(k2_cnid) ? -1 : 1; + + return hfsplus_strcmp( + (const struct hfsplus_unistr *)&k1->attr.key_name, + (const struct hfsplus_unistr *)&k2->attr.key_name); +} + +int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 cnid, const char *name) +{ + int len; + + memset(key, 0, sizeof(struct hfsplus_attr_key)); + key->attr.cnid = cpu_to_be32(cnid); + if (name) { + int res = hfsplus_asc2uni(sb, + (struct hfsplus_unistr *)&key->attr.key_name, + HFSPLUS_ATTR_MAX_STRLEN, name, strlen(name)); + if (res) + return res; + len = be16_to_cpu(key->attr.key_name.length); + } else { + key->attr.key_name.length = 0; + len = 0; + } + + /* The length of the key, as stored in key_len field, does not include + * the size of the key_len field itself. + * So, offsetof(hfsplus_attr_key, key_name) is a trick because + * it takes into consideration key_len field (__be16) of + * hfsplus_attr_key structure instead of length field (__be16) of + * hfsplus_attr_unistr structure. + */ + key->key_len = + cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + + 2 * len); + + return 0; +} + +hfsplus_attr_entry *hfsplus_alloc_attr_entry(void) +{ + return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL); +} + +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry) +{ + if (entry) + kmem_cache_free(hfsplus_attr_tree_cachep, entry); +} + +#define HFSPLUS_INVALID_ATTR_RECORD -1 + +static int hfsplus_attr_build_record(hfsplus_attr_entry *entry, int record_type, + u32 cnid, const void *value, size_t size) +{ + if (record_type == HFSPLUS_ATTR_FORK_DATA) { + /* + * Mac OS X supports only inline data attributes. + * Do nothing + */ + memset(entry, 0, sizeof(*entry)); + return sizeof(struct hfsplus_attr_fork_data); + } else if (record_type == HFSPLUS_ATTR_EXTENTS) { + /* + * Mac OS X supports only inline data attributes. + * Do nothing. + */ + memset(entry, 0, sizeof(*entry)); + return sizeof(struct hfsplus_attr_extents); + } else if (record_type == HFSPLUS_ATTR_INLINE_DATA) { + u16 len; + + memset(entry, 0, sizeof(struct hfsplus_attr_inline_data)); + entry->inline_data.record_type = cpu_to_be32(record_type); + if (size <= HFSPLUS_MAX_INLINE_DATA_SIZE) + len = size; + else + return HFSPLUS_INVALID_ATTR_RECORD; + entry->inline_data.length = cpu_to_be16(len); + memcpy(entry->inline_data.raw_bytes, value, len); + /* + * Align len on two-byte boundary. + * It needs to add pad byte if we have odd len. + */ + len = round_up(len, 2); + return offsetof(struct hfsplus_attr_inline_data, raw_bytes) + + len; + } else /* invalid input */ + memset(entry, 0, sizeof(*entry)); + + return HFSPLUS_INVALID_ATTR_RECORD; +} + +int hfsplus_find_attr(struct super_block *sb, u32 cnid, + const char *name, struct hfs_find_data *fd) +{ + int err = 0; + + hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + + if (!HFSPLUS_SB(sb)->attr_tree) { + pr_err("attributes file doesn't exist\n"); + return -EINVAL; + } + + if (name) { + err = hfsplus_attr_build_key(sb, fd->search_key, cnid, name); + if (err) + goto failed_find_attr; + err = hfs_brec_find(fd, hfs_find_rec_by_key); + if (err) + goto failed_find_attr; + } else { + err = hfsplus_attr_build_key(sb, fd->search_key, cnid, NULL); + if (err) + goto failed_find_attr; + err = hfs_brec_find(fd, hfs_find_1st_rec_by_cnid); + if (err) + goto failed_find_attr; + } + +failed_find_attr: + return err; +} + +int hfsplus_attr_exists(struct inode *inode, const char *name) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + + if (!HFSPLUS_SB(sb)->attr_tree) + return 0; + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + return 0; + + err = hfsplus_find_attr(sb, inode->i_ino, name, &fd); + if (err) + goto attr_not_found; + + hfs_find_exit(&fd); + return 1; + +attr_not_found: + hfs_find_exit(&fd); + return 0; +} + +int hfsplus_create_attr(struct inode *inode, + const char *name, + const void *value, size_t size) +{ + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + hfsplus_attr_entry *entry_ptr; + int entry_size; + int err; + + hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", + name ? name : NULL, inode->i_ino); + + if (!HFSPLUS_SB(sb)->attr_tree) { + pr_err("attributes file doesn't exist\n"); + return -EINVAL; + } + + entry_ptr = hfsplus_alloc_attr_entry(); + if (!entry_ptr) + return -ENOMEM; + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + goto failed_init_create_attr; + + if (name) { + err = hfsplus_attr_build_key(sb, fd.search_key, + inode->i_ino, name); + if (err) + goto failed_create_attr; + } else { + err = -EINVAL; + goto failed_create_attr; + } + + /* Mac OS X supports only inline data attributes. */ + entry_size = hfsplus_attr_build_record(entry_ptr, + HFSPLUS_ATTR_INLINE_DATA, + inode->i_ino, + value, size); + if (entry_size == HFSPLUS_INVALID_ATTR_RECORD) { + err = -EINVAL; + goto failed_create_attr; + } + + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto failed_create_attr; + } + + err = hfs_brec_insert(&fd, entry_ptr, entry_size); + if (err) + goto failed_create_attr; + + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); + +failed_create_attr: + hfs_find_exit(&fd); + +failed_init_create_attr: + hfsplus_destroy_attr_entry(entry_ptr); + return err; +} + +static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, + struct hfs_find_data *fd) +{ + int err = 0; + __be32 found_cnid, record_type; + + hfs_bnode_read(fd->bnode, &found_cnid, + fd->keyoffset + + offsetof(struct hfsplus_attr_key, cnid), + sizeof(__be32)); + if (cnid != be32_to_cpu(found_cnid)) + return -ENOENT; + + hfs_bnode_read(fd->bnode, &record_type, + fd->entryoffset, sizeof(record_type)); + + switch (be32_to_cpu(record_type)) { + case HFSPLUS_ATTR_INLINE_DATA: + /* All is OK. Do nothing. */ + break; + case HFSPLUS_ATTR_FORK_DATA: + case HFSPLUS_ATTR_EXTENTS: + pr_err("only inline data xattr are supported\n"); + return -EOPNOTSUPP; + default: + pr_err("invalid extended attribute record\n"); + return -ENOENT; + } + + err = hfs_brec_remove(fd); + if (err) + return err; + + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); + return err; +} + +int hfsplus_delete_attr(struct inode *inode, const char *name) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + + hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", + name ? name : NULL, inode->i_ino); + + if (!HFSPLUS_SB(sb)->attr_tree) { + pr_err("attributes file doesn't exist\n"); + return -EINVAL; + } + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + return err; + + if (name) { + err = hfsplus_attr_build_key(sb, fd.search_key, + inode->i_ino, name); + if (err) + goto out; + } else { + pr_err("invalid extended attribute name\n"); + err = -EINVAL; + goto out; + } + + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err) + goto out; + + err = __hfsplus_delete_attr(inode, inode->i_ino, &fd); + if (err) + goto out; + +out: + hfs_find_exit(&fd); + return err; +} + +int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) +{ + int err = 0; + struct hfs_find_data fd; + + hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); + + if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { + pr_err("attributes file doesn't exist\n"); + return -EINVAL; + } + + err = hfs_find_init(HFSPLUS_SB(dir->i_sb)->attr_tree, &fd); + if (err) + return err; + + for (;;) { + err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd); + if (err) { + if (err != -ENOENT) + pr_err("xattr search failed\n"); + goto end_delete_all; + } + + err = __hfsplus_delete_attr(dir, cnid, &fd); + if (err) + goto end_delete_all; + } + +end_delete_all: + hfs_find_exit(&fd); + return err; +} diff --git a/kernel/fs/hfsplus/bfind.c b/kernel/fs/hfsplus/bfind.c new file mode 100644 index 000000000..528e38b5a --- /dev/null +++ b/kernel/fs/hfsplus/bfind.c @@ -0,0 +1,293 @@ +/* + * linux/fs/hfsplus/bfind.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Search routines for btrees + */ + +#include +#include "hfsplus_fs.h" + +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) +{ + void *ptr; + + fd->tree = tree; + fd->bnode = NULL; + ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + fd->search_key = ptr; + fd->key = ptr + tree->max_key_len + 2; + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + tree->cnid, __builtin_return_address(0)); + switch (tree->cnid) { + case HFSPLUS_CAT_CNID: + mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); + break; + case HFSPLUS_EXT_CNID: + mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); + break; + case HFSPLUS_ATTR_CNID: + mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); + break; + default: + BUG(); + } + return 0; +} + +void hfs_find_exit(struct hfs_find_data *fd) +{ + hfs_bnode_put(fd->bnode); + kfree(fd->search_key); + hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + fd->tree->cnid, __builtin_return_address(0)); + mutex_unlock(&fd->tree->tree_lock); + fd->tree = NULL; +} + +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, + struct hfs_find_data *fd, + int *begin, + int *end, + int *cur_rec) +{ + __be32 cur_cnid; + __be32 search_cnid; + + if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { + cur_cnid = fd->key->ext.cnid; + search_cnid = fd->search_key->ext.cnid; + } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { + cur_cnid = fd->key->cat.parent; + search_cnid = fd->search_key->cat.parent; + } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { + cur_cnid = fd->key->attr.cnid; + search_cnid = fd->search_key->attr.cnid; + } else { + cur_cnid = 0; /* used-uninitialized warning */ + search_cnid = 0; + BUG(); + } + + if (cur_cnid == search_cnid) { + (*end) = (*cur_rec); + if ((*begin) == (*end)) + return 1; + } else { + if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) + (*begin) = (*cur_rec) + 1; + else + (*end) = (*cur_rec) - 1; + } + + return 0; +} + +int hfs_find_rec_by_key(struct hfs_bnode *bnode, + struct hfs_find_data *fd, + int *begin, + int *end, + int *cur_rec) +{ + int cmpval; + + cmpval = bnode->tree->keycmp(fd->key, fd->search_key); + if (!cmpval) { + (*end) = (*cur_rec); + return 1; + } + if (cmpval < 0) + (*begin) = (*cur_rec) + 1; + else + *(end) = (*cur_rec) - 1; + + return 0; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, + search_strategy_t rec_found) +{ + u16 off, len, keylen; + int rec; + int b, e; + int res; + + BUG_ON(!rec_found); + b = 0; + e = bnode->num_recs - 1; + res = -ENOENT; + do { + rec = (e + b) / 2; + len = hfs_brec_lenoff(bnode, rec, &off); + keylen = hfs_brec_keylen(bnode, rec); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + if (rec_found(bnode, fd, &b, &e, &rec)) { + res = 0; + goto done; + } + } while (b <= e); + + if (rec != e && e >= 0) { + len = hfs_brec_lenoff(bnode, e, &off); + keylen = hfs_brec_keylen(bnode, e); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + } + +done: + fd->record = e; + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; + +fail: + return res; +} + +/* Traverse a B*Tree from the root to a leaf finding best fit to key */ +/* Return allocated copy of node found, set recnum to best record */ +int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + u32 nidx, parent; + __be32 data; + int height, res; + + tree = fd->tree; + if (fd->bnode) + hfs_bnode_put(fd->bnode); + fd->bnode = NULL; + nidx = tree->root; + if (!nidx) + return -ENOENT; + height = tree->depth; + res = 0; + parent = 0; + for (;;) { + bnode = hfs_bnode_find(tree, nidx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + break; + } + if (bnode->height != height) + goto invalid; + if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF)) + goto invalid; + bnode->parent = parent; + + res = __hfs_brec_find(bnode, fd, do_key_compare); + if (!height) + break; + if (fd->record < 0) + goto release; + + parent = nidx; + hfs_bnode_read(bnode, &data, fd->entryoffset, 4); + nidx = be32_to_cpu(data); + hfs_bnode_put(bnode); + } + fd->bnode = bnode; + return res; + +invalid: + pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); + res = -EIO; +release: + hfs_bnode_put(bnode); + return res; +} + +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) +{ + int res; + + res = hfs_brec_find(fd, hfs_find_rec_by_key); + if (res) + return res; + if (fd->entrylength > rec_len) + return -EINVAL; + hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); + return 0; +} + +int hfs_brec_goto(struct hfs_find_data *fd, int cnt) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int idx, res = 0; + u16 off, len, keylen; + + bnode = fd->bnode; + tree = bnode->tree; + + if (cnt < 0) { + cnt = -cnt; + while (cnt > fd->record) { + cnt -= fd->record + 1; + fd->record = bnode->num_recs - 1; + idx = bnode->prev; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record -= cnt; + } else { + while (cnt >= bnode->num_recs - fd->record) { + cnt -= bnode->num_recs - fd->record; + fd->record = 0; + idx = bnode->next; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record += cnt; + } + + len = hfs_brec_lenoff(bnode, fd->record, &off); + keylen = hfs_brec_keylen(bnode, fd->record); + if (keylen == 0) { + res = -EINVAL; + goto out; + } + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; + hfs_bnode_read(bnode, fd->key, off, keylen); +out: + fd->bnode = bnode; + return res; +} diff --git a/kernel/fs/hfsplus/bitmap.c b/kernel/fs/hfsplus/bitmap.c new file mode 100644 index 000000000..d29544515 --- /dev/null +++ b/kernel/fs/hfsplus/bitmap.c @@ -0,0 +1,245 @@ +/* + * linux/fs/hfsplus/bitmap.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of allocation file + */ + +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +#define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8) + +int hfsplus_block_allocate(struct super_block *sb, u32 size, + u32 offset, u32 *max) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct page *page; + struct address_space *mapping; + __be32 *pptr, *curr, *end; + u32 mask, start, len, n; + __be32 val; + int i; + + len = *max; + if (!len) + return size; + + hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + pptr = kmap(page); + curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; + i = offset % 32; + offset &= ~(PAGE_CACHE_BITS - 1); + if ((size ^ offset) / PAGE_CACHE_BITS) + end = pptr + PAGE_CACHE_BITS / 32; + else + end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; + + /* scan the first partial u32 for zero bits */ + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = (1U << 31) >> i; + for (; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + curr++; + + /* scan complete u32s for the first zero bit */ + while (1) { + while (curr < end) { + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = 1 << 31; + for (i = 0; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + curr++; + } + kunmap(page); + offset += PAGE_CACHE_BITS; + if (offset >= size) + break; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, + NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + curr = pptr = kmap(page); + if ((size ^ offset) / PAGE_CACHE_BITS) + end = pptr + PAGE_CACHE_BITS / 32; + else + end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; + } + hfs_dbg(BITMAP, "bitmap full\n"); + start = size; + goto out; + +found: + start = offset + (curr - pptr) * 32 + i; + if (start >= size) { + hfs_dbg(BITMAP, "bitmap full\n"); + goto out; + } + /* do any partial u32 at the start */ + len = min(size - start, len); + while (1) { + n |= mask; + if (++i >= 32) + break; + mask >>= 1; + if (!--len || n & mask) + goto done; + } + if (!--len) + goto done; + *curr++ = cpu_to_be32(n); + /* do full u32s */ + while (1) { + while (curr < end) { + n = be32_to_cpu(*curr); + if (len < 32) + goto last; + if (n) { + len = 32; + goto last; + } + *curr++ = cpu_to_be32(0xffffffff); + len -= 32; + } + set_page_dirty(page); + kunmap(page); + offset += PAGE_CACHE_BITS; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, + NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + pptr = kmap(page); + curr = pptr; + end = pptr + PAGE_CACHE_BITS / 32; + } +last: + /* do any partial u32 at end */ + mask = 1U << 31; + for (i = 0; i < len; i++) { + if (n & mask) + break; + n |= mask; + mask >>= 1; + } +done: + *curr = cpu_to_be32(n); + set_page_dirty(page); + kunmap(page); + *max = offset + (curr - pptr) * 32 + i - start; + sbi->free_blocks -= *max; + hfsplus_mark_mdb_dirty(sb); + hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); +out: + mutex_unlock(&sbi->alloc_mutex); + return start; +} + +int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct page *page; + struct address_space *mapping; + __be32 *pptr, *curr, *end; + u32 mask, len, pnr; + int i; + + /* is there any actual work to be done? */ + if (!count) + return 0; + + hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); + /* are all of the bits in range? */ + if ((offset + count) > sbi->total_blocks) + return -ENOENT; + + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; + pnr = offset / PAGE_CACHE_BITS; + page = read_mapping_page(mapping, pnr, NULL); + if (IS_ERR(page)) + goto kaboom; + pptr = kmap(page); + curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; + end = pptr + PAGE_CACHE_BITS / 32; + len = count; + + /* do any partial u32 at the start */ + i = offset % 32; + if (i) { + int j = 32 - i; + mask = 0xffffffffU << j; + if (j > count) { + mask |= 0xffffffffU >> (i + count); + *curr++ &= cpu_to_be32(mask); + goto out; + } + *curr++ &= cpu_to_be32(mask); + count -= j; + } + + /* do full u32s */ + while (1) { + while (curr < end) { + if (count < 32) + goto done; + *curr++ = 0; + count -= 32; + } + if (!count) + break; + set_page_dirty(page); + kunmap(page); + page = read_mapping_page(mapping, ++pnr, NULL); + if (IS_ERR(page)) + goto kaboom; + pptr = kmap(page); + curr = pptr; + end = pptr + PAGE_CACHE_BITS / 32; + } +done: + /* do any partial u32 at end */ + if (count) { + mask = 0xffffffffU >> count; + *curr &= cpu_to_be32(mask); + } +out: + set_page_dirty(page); + kunmap(page); + sbi->free_blocks += len; + hfsplus_mark_mdb_dirty(sb); + mutex_unlock(&sbi->alloc_mutex); + + return 0; + +kaboom: + pr_crit("unable to mark blocks free: error %ld\n", PTR_ERR(page)); + mutex_unlock(&sbi->alloc_mutex); + + return -EIO; +} diff --git a/kernel/fs/hfsplus/bnode.c b/kernel/fs/hfsplus/bnode.c new file mode 100644 index 000000000..759708fd9 --- /dev/null +++ b/kernel/fs/hfsplus/bnode.c @@ -0,0 +1,671 @@ +/* + * linux/fs/hfsplus/bnode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle basic btree node operations + */ + +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Copy a specified range of bytes from the raw data of a node */ +void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min_t(int, len, PAGE_CACHE_SIZE - off); + memcpy(buf, kmap(*pagep) + off, l); + kunmap(*pagep); + + while ((len -= l) != 0) { + buf += l; + l = min_t(int, len, PAGE_CACHE_SIZE); + memcpy(buf, kmap(*++pagep), l); + kunmap(*pagep); + } +} + +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) +{ + __be16 data; + /* TODO: optimize later... */ + hfs_bnode_read(node, &data, off, 2); + return be16_to_cpu(data); +} + +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) +{ + u8 data; + /* TODO: optimize later... */ + hfs_bnode_read(node, &data, off, 1); + return data; +} + +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) +{ + struct hfs_btree *tree; + int key_len; + + tree = node->tree; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS || + node->tree->cnid == HFSPLUS_ATTR_CNID) + key_len = hfs_bnode_read_u16(node, off) + 2; + else + key_len = tree->max_key_len + 2; + + hfs_bnode_read(node, key, off, key_len); +} + +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min_t(int, len, PAGE_CACHE_SIZE - off); + memcpy(kmap(*pagep) + off, buf, l); + set_page_dirty(*pagep); + kunmap(*pagep); + + while ((len -= l) != 0) { + buf += l; + l = min_t(int, len, PAGE_CACHE_SIZE); + memcpy(kmap(*++pagep), buf, l); + set_page_dirty(*pagep); + kunmap(*pagep); + } +} + +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) +{ + __be16 v = cpu_to_be16(data); + /* TODO: optimize later... */ + hfs_bnode_write(node, &v, off, 2); +} + +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min_t(int, len, PAGE_CACHE_SIZE - off); + memset(kmap(*pagep) + off, 0, l); + set_page_dirty(*pagep); + kunmap(*pagep); + + while ((len -= l) != 0) { + l = min_t(int, len, PAGE_CACHE_SIZE); + memset(kmap(*++pagep), 0, l); + set_page_dirty(*pagep); + kunmap(*pagep); + } +} + +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len) +{ + struct hfs_btree *tree; + struct page **src_page, **dst_page; + int l; + + hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + tree = src_node->tree; + src += src_node->page_offset; + dst += dst_node->page_offset; + src_page = src_node->page + (src >> PAGE_CACHE_SHIFT); + src &= ~PAGE_CACHE_MASK; + dst_page = dst_node->page + (dst >> PAGE_CACHE_SHIFT); + dst &= ~PAGE_CACHE_MASK; + + if (src == dst) { + l = min_t(int, len, PAGE_CACHE_SIZE - src); + memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + + while ((len -= l) != 0) { + l = min_t(int, len, PAGE_CACHE_SIZE); + memcpy(kmap(*++dst_page), kmap(*++src_page), l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) { + l = PAGE_CACHE_SIZE - src; + src = 0; + dst += l; + } else { + l = PAGE_CACHE_SIZE - dst; + src += l; + dst = 0; + } + l = min(len, l); + memcpy(dst_ptr, src_ptr, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (!dst) + dst_page++; + else + src_page++; + } while ((len -= l)); + } +} + +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) +{ + struct page **src_page, **dst_page; + int l; + + hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + src += node->page_offset; + dst += node->page_offset; + if (dst > src) { + src += len - 1; + src_page = node->page + (src >> PAGE_CACHE_SHIFT); + src = (src & ~PAGE_CACHE_MASK) + 1; + dst += len - 1; + dst_page = node->page + (dst >> PAGE_CACHE_SHIFT); + dst = (dst & ~PAGE_CACHE_MASK) + 1; + + if (src == dst) { + while (src < len) { + memmove(kmap(*dst_page), kmap(*src_page), src); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + len -= src; + src = PAGE_CACHE_SIZE; + src_page--; + dst_page--; + } + src -= len; + memmove(kmap(*dst_page) + src, + kmap(*src_page) + src, len); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (src < dst) { + l = src; + src = PAGE_CACHE_SIZE; + dst -= l; + } else { + l = dst; + src -= l; + dst = PAGE_CACHE_SIZE; + } + l = min(len, l); + memmove(dst_ptr - l, src_ptr - l, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (dst == PAGE_CACHE_SIZE) + dst_page--; + else + src_page--; + } while ((len -= l)); + } + } else { + src_page = node->page + (src >> PAGE_CACHE_SHIFT); + src &= ~PAGE_CACHE_MASK; + dst_page = node->page + (dst >> PAGE_CACHE_SHIFT); + dst &= ~PAGE_CACHE_MASK; + + if (src == dst) { + l = min_t(int, len, PAGE_CACHE_SIZE - src); + memmove(kmap(*dst_page) + src, + kmap(*src_page) + src, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + + while ((len -= l) != 0) { + l = min_t(int, len, PAGE_CACHE_SIZE); + memmove(kmap(*++dst_page), + kmap(*++src_page), l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (PAGE_CACHE_SIZE - src < + PAGE_CACHE_SIZE - dst) { + l = PAGE_CACHE_SIZE - src; + src = 0; + dst += l; + } else { + l = PAGE_CACHE_SIZE - dst; + src += l; + dst = 0; + } + l = min(len, l); + memmove(dst_ptr, src_ptr, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (!dst) + dst_page++; + else + src_page++; + } while ((len -= l)); + } + } +} + +void hfs_bnode_dump(struct hfs_bnode *node) +{ + struct hfs_bnode_desc desc; + __be32 cnid; + int i, off, key_off; + + hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_bnode_read(node, &desc, 0, sizeof(desc)); + hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + be32_to_cpu(desc.next), be32_to_cpu(desc.prev), + desc.type, desc.height, be16_to_cpu(desc.num_recs)); + + off = node->tree->node_size - 2; + for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { + key_off = hfs_bnode_read_u16(node, off); + hfs_dbg(BNODE_MOD, " %d", key_off); + if (i && node->type == HFS_NODE_INDEX) { + int tmp; + + if (node->tree->attributes & HFS_TREE_VARIDXKEYS || + node->tree->cnid == HFSPLUS_ATTR_CNID) + tmp = hfs_bnode_read_u16(node, key_off) + 2; + else + tmp = node->tree->max_key_len + 2; + hfs_dbg_cont(BNODE_MOD, " (%d", tmp); + hfs_bnode_read(node, &cnid, key_off + tmp, 4); + hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + } else if (i && node->type == HFS_NODE_LEAF) { + int tmp; + + tmp = hfs_bnode_read_u16(node, key_off); + hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + } + } + hfs_dbg_cont(BNODE_MOD, "\n"); +} + +void hfs_bnode_unlink(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct hfs_bnode *tmp; + __be32 cnid; + + tree = node->tree; + if (node->prev) { + tmp = hfs_bnode_find(tree, node->prev); + if (IS_ERR(tmp)) + return; + tmp->next = node->next; + cnid = cpu_to_be32(tmp->next); + hfs_bnode_write(tmp, &cnid, + offsetof(struct hfs_bnode_desc, next), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_head = node->next; + + if (node->next) { + tmp = hfs_bnode_find(tree, node->next); + if (IS_ERR(tmp)) + return; + tmp->prev = node->prev; + cnid = cpu_to_be32(tmp->prev); + hfs_bnode_write(tmp, &cnid, + offsetof(struct hfs_bnode_desc, prev), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_tail = node->prev; + + /* move down? */ + if (!node->prev && !node->next) + hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); + if (!node->parent) { + tree->root = 0; + tree->depth = 0; + } + set_bit(HFS_BNODE_DELETED, &node->flags); +} + +static inline int hfs_bnode_hash(u32 num) +{ + num = (num >> 16) + num; + num += num >> 8; + return num & (NODE_HASH_SIZE - 1); +} + +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) +{ + struct hfs_bnode *node; + + if (cnid >= tree->node_count) { + pr_err("request for non-existent node %d in B*Tree\n", + cnid); + return NULL; + } + + for (node = tree->node_hash[hfs_bnode_hash(cnid)]; + node; node = node->next_hash) + if (node->this == cnid) + return node; + return NULL; +} + +static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) +{ + struct super_block *sb; + struct hfs_bnode *node, *node2; + struct address_space *mapping; + struct page *page; + int size, block, i, hash; + loff_t off; + + if (cnid >= tree->node_count) { + pr_err("request for non-existent node %d in B*Tree\n", + cnid); + return NULL; + } + + sb = tree->inode->i_sb; + size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * + sizeof(struct page *); + node = kzalloc(size, GFP_KERNEL); + if (!node) + return NULL; + node->tree = tree; + node->this = cnid; + set_bit(HFS_BNODE_NEW, &node->flags); + atomic_set(&node->refcnt, 1); + hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); + init_waitqueue_head(&node->lock_wq); + spin_lock(&tree->hash_lock); + node2 = hfs_bnode_findhash(tree, cnid); + if (!node2) { + hash = hfs_bnode_hash(cnid); + node->next_hash = tree->node_hash[hash]; + tree->node_hash[hash] = node; + tree->node_hash_cnt++; + } else { + spin_unlock(&tree->hash_lock); + kfree(node); + wait_event(node2->lock_wq, + !test_bit(HFS_BNODE_NEW, &node2->flags)); + return node2; + } + spin_unlock(&tree->hash_lock); + + mapping = tree->inode->i_mapping; + off = (loff_t)cnid << tree->node_size_shift; + block = off >> PAGE_CACHE_SHIFT; + node->page_offset = off & ~PAGE_CACHE_MASK; + for (i = 0; i < tree->pages_per_bnode; block++, i++) { + page = read_mapping_page(mapping, block, NULL); + if (IS_ERR(page)) + goto fail; + if (PageError(page)) { + page_cache_release(page); + goto fail; + } + page_cache_release(page); + node->page[i] = page; + } + + return node; +fail: + set_bit(HFS_BNODE_ERROR, &node->flags); + return node; +} + +void hfs_bnode_unhash(struct hfs_bnode *node) +{ + struct hfs_bnode **p; + + hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; + *p && *p != node; p = &(*p)->next_hash) + ; + BUG_ON(!*p); + *p = node->next_hash; + node->tree->node_hash_cnt--; +} + +/* Load a particular node out of a tree */ +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct hfs_bnode_desc *desc; + int i, rec_off, off, next_off; + int entry_size, key_size; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + if (node) { + hfs_bnode_get(node); + spin_unlock(&tree->hash_lock); + wait_event(node->lock_wq, + !test_bit(HFS_BNODE_NEW, &node->flags)); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + return node; + } + spin_unlock(&tree->hash_lock); + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + if (!test_bit(HFS_BNODE_NEW, &node->flags)) + return node; + + desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + + node->page_offset); + node->prev = be32_to_cpu(desc->prev); + node->next = be32_to_cpu(desc->next); + node->num_recs = be16_to_cpu(desc->num_recs); + node->type = desc->type; + node->height = desc->height; + kunmap(node->page[0]); + + switch (node->type) { + case HFS_NODE_HEADER: + case HFS_NODE_MAP: + if (node->height != 0) + goto node_error; + break; + case HFS_NODE_LEAF: + if (node->height != 1) + goto node_error; + break; + case HFS_NODE_INDEX: + if (node->height <= 1 || node->height > tree->depth) + goto node_error; + break; + default: + goto node_error; + } + + rec_off = tree->node_size - 2; + off = hfs_bnode_read_u16(node, rec_off); + if (off != sizeof(struct hfs_bnode_desc)) + goto node_error; + for (i = 1; i <= node->num_recs; off = next_off, i++) { + rec_off -= 2; + next_off = hfs_bnode_read_u16(node, rec_off); + if (next_off <= off || + next_off > tree->node_size || + next_off & 1) + goto node_error; + entry_size = next_off - off; + if (node->type != HFS_NODE_INDEX && + node->type != HFS_NODE_LEAF) + continue; + key_size = hfs_bnode_read_u16(node, off) + 2; + if (key_size >= entry_size || key_size & 1) + goto node_error; + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + return node; + +node_error: + set_bit(HFS_BNODE_ERROR, &node->flags); + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + hfs_bnode_put(node); + return ERR_PTR(-EIO); +} + +void hfs_bnode_free(struct hfs_bnode *node) +{ +#if 0 + int i; + + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); +#endif + kfree(node); +} + +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct page **pagep; + int i; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + spin_unlock(&tree->hash_lock); + if (node) { + pr_crit("new node %u already hashed?\n", num); + WARN_ON(1); + return node; + } + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + + pagep = node->page; + memset(kmap(*pagep) + node->page_offset, 0, + min_t(int, PAGE_CACHE_SIZE, tree->node_size)); + set_page_dirty(*pagep); + kunmap(*pagep); + for (i = 1; i < tree->pages_per_bnode; i++) { + memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE); + set_page_dirty(*pagep); + kunmap(*pagep); + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + + return node; +} + +void hfs_bnode_get(struct hfs_bnode *node) +{ + if (node) { + atomic_inc(&node->refcnt); + hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + } +} + +/* Dispose of resources used by a node */ +void hfs_bnode_put(struct hfs_bnode *node) +{ + if (node) { + struct hfs_btree *tree = node->tree; + int i; + + hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + BUG_ON(!atomic_read(&node->refcnt)); + if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) + return; + for (i = 0; i < tree->pages_per_bnode; i++) { + if (!node->page[i]) + continue; + mark_page_accessed(node->page[i]); + } + + if (test_bit(HFS_BNODE_DELETED, &node->flags)) { + hfs_bnode_unhash(node); + spin_unlock(&tree->hash_lock); + if (hfs_bnode_need_zeroout(tree)) + hfs_bnode_clear(node, 0, tree->node_size); + hfs_bmap_free(node); + hfs_bnode_free(node); + return; + } + spin_unlock(&tree->hash_lock); + } +} + +/* + * Unused nodes have to be zeroed if this is the catalog tree and + * a corresponding flag in the volume header is set. + */ +bool hfs_bnode_need_zeroout(struct hfs_btree *tree) +{ + struct super_block *sb = tree->inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes); + + return tree->cnid == HFSPLUS_CAT_CNID && + volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX; +} diff --git a/kernel/fs/hfsplus/brec.c b/kernel/fs/hfsplus/brec.c new file mode 100644 index 000000000..754fdf8c6 --- /dev/null +++ b/kernel/fs/hfsplus/brec.c @@ -0,0 +1,527 @@ +/* + * linux/fs/hfsplus/brec.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle individual btree records + */ + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd); +static int hfs_brec_update_parent(struct hfs_find_data *fd); +static int hfs_btree_inc_height(struct hfs_btree *); + +/* Get the length and offset of the given record in the given node */ +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off) +{ + __be16 retval[2]; + u16 dataoff; + + dataoff = node->tree->node_size - (rec + 2) * 2; + hfs_bnode_read(node, retval, dataoff, 4); + *off = be16_to_cpu(retval[1]); + return be16_to_cpu(retval[0]) - *off; +} + +/* Get the length of the key from a keyed record */ +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) +{ + u16 retval, recoff; + + if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) + return 0; + + if ((node->type == HFS_NODE_INDEX) && + !(node->tree->attributes & HFS_TREE_VARIDXKEYS) && + (node->tree->cnid != HFSPLUS_ATTR_CNID)) { + retval = node->tree->max_key_len + 2; + } else { + recoff = hfs_bnode_read_u16(node, + node->tree->node_size - (rec + 1) * 2); + if (!recoff) + return 0; + if (recoff > node->tree->node_size - 2) { + pr_err("recoff %d too large\n", recoff); + return 0; + } + + retval = hfs_bnode_read_u16(node, recoff) + 2; + if (retval > node->tree->max_key_len + 2) { + pr_err("keylen %d too large\n", + retval); + retval = 0; + } + } + return retval; +} + +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node; + int size, key_len, rec; + int data_off, end_off; + int idx_rec_off, data_rec_off, end_rec_off; + __be32 cnid; + + tree = fd->tree; + if (!fd->bnode) { + if (!tree->root) + hfs_btree_inc_height(tree); + fd->bnode = hfs_bnode_find(tree, tree->leaf_head); + if (IS_ERR(fd->bnode)) + return PTR_ERR(fd->bnode); + fd->record = -1; + } + new_node = NULL; + key_len = be16_to_cpu(fd->search_key->key_len) + 2; +again: + /* new record idx and complete record size */ + rec = fd->record + 1; + size = key_len + entry_len; + + node = fd->bnode; + hfs_bnode_dump(node); + /* get last offset */ + end_rec_off = tree->node_size - (node->num_recs + 1) * 2; + end_off = hfs_bnode_read_u16(node, end_rec_off); + end_rec_off -= 2; + hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + rec, size, end_off, end_rec_off); + if (size > end_rec_off - end_off) { + if (new_node) + panic("not enough room!\n"); + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + goto again; + } + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count++; + mark_inode_dirty(tree->inode); + } + node->num_recs++; + /* write new last offset */ + hfs_bnode_write_u16(node, + offsetof(struct hfs_bnode_desc, num_recs), + node->num_recs); + hfs_bnode_write_u16(node, end_rec_off, end_off + size); + data_off = end_off; + data_rec_off = end_rec_off + 2; + idx_rec_off = tree->node_size - (rec + 1) * 2; + if (idx_rec_off == data_rec_off) + goto skip; + /* move all following entries */ + do { + data_off = hfs_bnode_read_u16(node, data_rec_off + 2); + hfs_bnode_write_u16(node, data_rec_off, data_off + size); + data_rec_off += 2; + } while (data_rec_off < idx_rec_off); + + /* move data away */ + hfs_bnode_move(node, data_off + size, data_off, + end_off - data_off); + +skip: + hfs_bnode_write(node, fd->search_key, data_off, key_len); + hfs_bnode_write(node, entry, data_off + key_len, entry_len); + hfs_bnode_dump(node); + + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + + if (new_node) { + hfs_bnode_put(fd->bnode); + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } + fd->bnode = hfs_bnode_find(tree, new_node->parent); + + /* create index data entry */ + cnid = cpu_to_be32(new_node->this); + entry = &cnid; + entry_len = sizeof(cnid); + + /* get index key */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); + + hfs_bnode_put(new_node); + new_node = NULL; + + if ((tree->attributes & HFS_TREE_VARIDXKEYS) || + (tree->cnid == HFSPLUS_ATTR_CNID)) + key_len = be16_to_cpu(fd->search_key->key_len) + 2; + else { + fd->search_key->key_len = + cpu_to_be16(tree->max_key_len); + key_len = tree->max_key_len + 2; + } + goto again; + } + + return 0; +} + +int hfs_brec_remove(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *parent; + int end_off, rec_off, data_off, size; + + tree = fd->tree; + node = fd->bnode; +again: + rec_off = tree->node_size - (fd->record + 2) * 2; + end_off = tree->node_size - (node->num_recs + 1) * 2; + + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count--; + mark_inode_dirty(tree->inode); + } + hfs_bnode_dump(node); + hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + fd->record, fd->keylength + fd->entrylength); + if (!--node->num_recs) { + hfs_bnode_unlink(node); + if (!node->parent) + return 0; + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + hfs_bnode_put(node); + node = fd->bnode = parent; + + __hfs_brec_find(node, fd, hfs_find_rec_by_key); + goto again; + } + hfs_bnode_write_u16(node, + offsetof(struct hfs_bnode_desc, num_recs), + node->num_recs); + + if (rec_off == end_off) + goto skip; + size = fd->keylength + fd->entrylength; + + do { + data_off = hfs_bnode_read_u16(node, rec_off); + hfs_bnode_write_u16(node, rec_off + 2, data_off - size); + rec_off -= 2; + } while (rec_off >= end_off); + + /* fill hole */ + hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, + data_off - fd->keyoffset - size); +skip: + hfs_bnode_dump(node); + if (!fd->record) + hfs_brec_update_parent(fd); + return 0; +} + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *next_node; + struct hfs_bnode_desc node_desc; + int num_recs, new_rec_off, new_off, old_rec_off; + int data_start, data_end, size; + + tree = fd->tree; + node = fd->bnode; + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) + return new_node; + hfs_bnode_get(node); + hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + node->this, new_node->this, node->next); + new_node->next = node->next; + new_node->prev = node->this; + new_node->parent = node->parent; + new_node->type = node->type; + new_node->height = node->height; + + if (node->next) + next_node = hfs_bnode_find(tree, node->next); + else + next_node = NULL; + + if (IS_ERR(next_node)) { + hfs_bnode_put(node); + hfs_bnode_put(new_node); + return next_node; + } + + size = tree->node_size / 2 - node->num_recs * 2 - 14; + old_rec_off = tree->node_size - 4; + num_recs = 1; + for (;;) { + data_start = hfs_bnode_read_u16(node, old_rec_off); + if (data_start > size) + break; + old_rec_off -= 2; + if (++num_recs < node->num_recs) + continue; + /* panic? */ + hfs_bnode_put(node); + hfs_bnode_put(new_node); + if (next_node) + hfs_bnode_put(next_node); + return ERR_PTR(-ENOSPC); + } + + if (fd->record + 1 < num_recs) { + /* new record is in the lower half, + * so leave some more space there + */ + old_rec_off += 2; + num_recs--; + data_start = hfs_bnode_read_u16(node, old_rec_off); + } else { + hfs_bnode_put(node); + hfs_bnode_get(new_node); + fd->bnode = new_node; + fd->record -= num_recs; + fd->keyoffset -= data_start - 14; + fd->entryoffset -= data_start - 14; + } + new_node->num_recs = node->num_recs - num_recs; + node->num_recs = num_recs; + + new_rec_off = tree->node_size - 2; + new_off = 14; + size = data_start - new_off; + num_recs = new_node->num_recs; + data_end = data_start; + while (num_recs) { + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + old_rec_off -= 2; + new_rec_off -= 2; + data_end = hfs_bnode_read_u16(node, old_rec_off); + new_off = data_end - size; + num_recs--; + } + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start); + + /* update new bnode header */ + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + /* update previous bnode header */ + node->next = new_node->this; + hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc)); + node_desc.next = cpu_to_be32(node->next); + node_desc.num_recs = cpu_to_be16(node->num_recs); + hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); + + /* update next bnode header */ + if (next_node) { + next_node->prev = new_node->this; + hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); + node_desc.prev = cpu_to_be32(next_node->prev); + hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc)); + hfs_bnode_put(next_node); + } else if (node->this == tree->leaf_tail) { + /* if there is no next node, this might be the new tail */ + tree->leaf_tail = new_node->this; + mark_inode_dirty(tree->inode); + } + + hfs_bnode_dump(node); + hfs_bnode_dump(new_node); + hfs_bnode_put(node); + + return new_node; +} + +static int hfs_brec_update_parent(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *parent; + int newkeylen, diff; + int rec, rec_off, end_rec_off; + int start_off, end_off; + + tree = fd->tree; + node = fd->bnode; + new_node = NULL; + if (!node->parent) + return 0; + +again: + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + __hfs_brec_find(parent, fd, hfs_find_rec_by_key); + if (fd->record < 0) + return -ENOENT; + hfs_bnode_dump(parent); + rec = fd->record; + + /* size difference between old and new key */ + if ((tree->attributes & HFS_TREE_VARIDXKEYS) || + (tree->cnid == HFSPLUS_ATTR_CNID)) + newkeylen = hfs_bnode_read_u16(node, 14) + 2; + else + fd->keylength = newkeylen = tree->max_key_len + 2; + hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + rec, fd->keylength, newkeylen); + + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + diff = newkeylen - fd->keylength; + if (!diff) + goto skip; + if (diff > 0) { + end_off = hfs_bnode_read_u16(parent, end_rec_off); + if (end_rec_off - end_off < diff) { + + hfs_dbg(BNODE_MOD, "splitting index node\n"); + fd->bnode = parent; + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + parent = fd->bnode; + rec = fd->record; + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - + (parent->num_recs + 1) * 2; + } + } + + end_off = start_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, start_off + diff); + start_off -= 4; /* move previous cnid too */ + + while (rec_off > end_rec_off) { + rec_off -= 2; + end_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, end_off + diff); + } + hfs_bnode_move(parent, start_off + diff, start_off, + end_off - start_off); +skip: + hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen); + hfs_bnode_dump(parent); + + hfs_bnode_put(node); + node = parent; + + if (new_node) { + __be32 cnid; + + fd->bnode = hfs_bnode_find(tree, new_node->parent); + /* create index key and entry */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + cnid = cpu_to_be32(new_node->this); + + __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); + hfs_brec_insert(fd, &cnid, sizeof(cnid)); + hfs_bnode_put(fd->bnode); + hfs_bnode_put(new_node); + + if (!rec) { + if (new_node == node) + goto out; + /* restore search_key */ + hfs_bnode_read_key(node, fd->search_key, 14); + } + } + + if (!rec && node->parent) + goto again; +out: + fd->bnode = node; + return 0; +} + +static int hfs_btree_inc_height(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *new_node; + struct hfs_bnode_desc node_desc; + int key_size, rec; + __be32 cnid; + + node = NULL; + if (tree->root) { + node = hfs_bnode_find(tree, tree->root); + if (IS_ERR(node)) + return PTR_ERR(node); + } + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) { + hfs_bnode_put(node); + return PTR_ERR(new_node); + } + + tree->root = new_node->this; + if (!tree->depth) { + tree->leaf_head = tree->leaf_tail = new_node->this; + new_node->type = HFS_NODE_LEAF; + new_node->num_recs = 0; + } else { + new_node->type = HFS_NODE_INDEX; + new_node->num_recs = 1; + } + new_node->parent = 0; + new_node->next = 0; + new_node->prev = 0; + new_node->height = ++tree->depth; + + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + rec = tree->node_size - 2; + hfs_bnode_write_u16(new_node, rec, 14); + + if (node) { + /* insert old root idx into new root */ + node->parent = tree->root; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS || + tree->cnid == HFSPLUS_ATTR_CNID) + key_size = hfs_bnode_read_u16(node, 14) + 2; + else + key_size = tree->max_key_len + 2; + hfs_bnode_copy(new_node, 14, node, 14, key_size); + + if (!(tree->attributes & HFS_TREE_VARIDXKEYS) && + (tree->cnid != HFSPLUS_ATTR_CNID)) { + key_size = tree->max_key_len + 2; + hfs_bnode_write_u16(new_node, 14, tree->max_key_len); + } + cnid = cpu_to_be32(node->this); + hfs_bnode_write(new_node, &cnid, 14 + key_size, 4); + + rec -= 2; + hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4); + + hfs_bnode_put(node); + } + hfs_bnode_put(new_node); + mark_inode_dirty(tree->inode); + + return 0; +} diff --git a/kernel/fs/hfsplus/btree.c b/kernel/fs/hfsplus/btree.c new file mode 100644 index 000000000..3345c7553 --- /dev/null +++ b/kernel/fs/hfsplus/btree.c @@ -0,0 +1,497 @@ +/* + * linux/fs/hfsplus/btree.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle opening/closing btree + */ + +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* + * Initial source code of clump size calculation is gotten + * from http://opensource.apple.com/tarballs/diskdev_cmds/ + */ +#define CLUMP_ENTRIES 15 + +static short clumptbl[CLUMP_ENTRIES * 3] = { +/* + * Volume Attributes Catalog Extents + * Size Clump (MB) Clump (MB) Clump (MB) + */ + /* 1GB */ 4, 4, 4, + /* 2GB */ 6, 6, 4, + /* 4GB */ 8, 8, 4, + /* 8GB */ 11, 11, 5, + /* + * For volumes 16GB and larger, we want to make sure that a full OS + * install won't require fragmentation of the Catalog or Attributes + * B-trees. We do this by making the clump sizes sufficiently large, + * and by leaving a gap after the B-trees for them to grow into. + * + * For SnowLeopard 10A298, a FullNetInstall with all packages selected + * results in: + * Catalog B-tree Header + * nodeSize: 8192 + * totalNodes: 31616 + * freeNodes: 1978 + * (used = 231.55 MB) + * Attributes B-tree Header + * nodeSize: 8192 + * totalNodes: 63232 + * freeNodes: 958 + * (used = 486.52 MB) + * + * We also want Time Machine backup volumes to have a sufficiently + * large clump size to reduce fragmentation. + * + * The series of numbers for Catalog and Attribute form a geometric + * series. For Catalog (16GB to 512GB), each term is 8**(1/5) times + * the previous term. For Attributes (16GB to 512GB), each term is + * 4**(1/5) times the previous term. For 1TB to 16TB, each term is + * 2**(1/5) times the previous term. + */ + /* 16GB */ 64, 32, 5, + /* 32GB */ 84, 49, 6, + /* 64GB */ 111, 74, 7, + /* 128GB */ 147, 111, 8, + /* 256GB */ 194, 169, 9, + /* 512GB */ 256, 256, 11, + /* 1TB */ 294, 294, 14, + /* 2TB */ 338, 338, 16, + /* 4TB */ 388, 388, 20, + /* 8TB */ 446, 446, 25, + /* 16TB */ 512, 512, 32 +}; + +u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, + u64 sectors, int file_id) +{ + u32 mod = max(node_size, block_size); + u32 clump_size; + int column; + int i; + + /* Figure out which column of the above table to use for this file. */ + switch (file_id) { + case HFSPLUS_ATTR_CNID: + column = 0; + break; + case HFSPLUS_CAT_CNID: + column = 1; + break; + default: + column = 2; + break; + } + + /* + * The default clump size is 0.8% of the volume size. And + * it must also be a multiple of the node and block size. + */ + if (sectors < 0x200000) { + clump_size = sectors << 2; /* 0.8 % */ + if (clump_size < (8 * node_size)) + clump_size = 8 * node_size; + } else { + /* turn exponent into table index... */ + for (i = 0, sectors = sectors >> 22; + sectors && (i < CLUMP_ENTRIES - 1); + ++i, sectors = sectors >> 1) { + /* empty body */ + } + + clump_size = clumptbl[column + (i) * 3] * 1024 * 1024; + } + + /* + * Round the clump size to a multiple of node and block size. + * NOTE: This rounds down. + */ + clump_size /= mod; + clump_size *= mod; + + /* + * Rounding down could have rounded down to 0 if the block size was + * greater than the clump size. If so, just use one block or node. + */ + if (clump_size == 0) + clump_size = mod; + + return clump_size; +} + +/* Get a reference to a B*Tree and do some initial checks */ +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) +{ + struct hfs_btree *tree; + struct hfs_btree_header_rec *head; + struct address_space *mapping; + struct inode *inode; + struct page *page; + unsigned int size; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) + return NULL; + + mutex_init(&tree->tree_lock); + spin_lock_init(&tree->hash_lock); + tree->sb = sb; + tree->cnid = id; + inode = hfsplus_iget(sb, id); + if (IS_ERR(inode)) + goto free_tree; + tree->inode = inode; + + if (!HFSPLUS_I(tree->inode)->first_blocks) { + pr_err("invalid btree extent records (0 size)\n"); + goto free_inode; + } + + mapping = tree->inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + goto free_inode; + + /* Load the header */ + head = (struct hfs_btree_header_rec *)(kmap(page) + + sizeof(struct hfs_bnode_desc)); + tree->root = be32_to_cpu(head->root); + tree->leaf_count = be32_to_cpu(head->leaf_count); + tree->leaf_head = be32_to_cpu(head->leaf_head); + tree->leaf_tail = be32_to_cpu(head->leaf_tail); + tree->node_count = be32_to_cpu(head->node_count); + tree->free_nodes = be32_to_cpu(head->free_nodes); + tree->attributes = be32_to_cpu(head->attributes); + tree->node_size = be16_to_cpu(head->node_size); + tree->max_key_len = be16_to_cpu(head->max_key_len); + tree->depth = be16_to_cpu(head->depth); + + /* Verify the tree and set the correct compare function */ + switch (id) { + case HFSPLUS_EXT_CNID: + if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { + pr_err("invalid extent max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (tree->attributes & HFS_TREE_VARIDXKEYS) { + pr_err("invalid extent btree flag\n"); + goto fail_page; + } + + tree->keycmp = hfsplus_ext_cmp_key; + break; + case HFSPLUS_CAT_CNID: + if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { + pr_err("invalid catalog max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + pr_err("invalid catalog btree flag\n"); + goto fail_page; + } + + if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) && + (head->key_type == HFSPLUS_KEY_BINARY)) + tree->keycmp = hfsplus_cat_bin_cmp_key; + else { + tree->keycmp = hfsplus_cat_case_cmp_key; + set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + } + break; + case HFSPLUS_ATTR_CNID: + if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { + pr_err("invalid attributes max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + tree->keycmp = hfsplus_attr_bin_cmp_key; + break; + default: + pr_err("unknown B*Tree requested\n"); + goto fail_page; + } + + if (!(tree->attributes & HFS_TREE_BIGKEYS)) { + pr_err("invalid btree flag\n"); + goto fail_page; + } + + size = tree->node_size; + if (!is_power_of_2(size)) + goto fail_page; + if (!tree->node_count) + goto fail_page; + + tree->node_size_shift = ffs(size) - 1; + + tree->pages_per_bnode = + (tree->node_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + kunmap(page); + page_cache_release(page); + return tree; + + fail_page: + page_cache_release(page); + free_inode: + tree->inode->i_mapping->a_ops = &hfsplus_aops; + iput(tree->inode); + free_tree: + kfree(tree); + return NULL; +} + +/* Release resources used by a btree */ +void hfs_btree_close(struct hfs_btree *tree) +{ + struct hfs_bnode *node; + int i; + + if (!tree) + return; + + for (i = 0; i < NODE_HASH_SIZE; i++) { + while ((node = tree->node_hash[i])) { + tree->node_hash[i] = node->next_hash; + if (atomic_read(&node->refcnt)) + pr_crit("node %d:%d " + "still has %d user(s)!\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + hfs_bnode_free(node); + tree->node_hash_cnt--; + } + } + iput(tree->inode); + kfree(tree); +} + +int hfs_btree_write(struct hfs_btree *tree) +{ + struct hfs_btree_header_rec *head; + struct hfs_bnode *node; + struct page *page; + + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + /* panic? */ + return -EIO; + /* Load the header */ + page = node->page[0]; + head = (struct hfs_btree_header_rec *)(kmap(page) + + sizeof(struct hfs_bnode_desc)); + + head->root = cpu_to_be32(tree->root); + head->leaf_count = cpu_to_be32(tree->leaf_count); + head->leaf_head = cpu_to_be32(tree->leaf_head); + head->leaf_tail = cpu_to_be32(tree->leaf_tail); + head->node_count = cpu_to_be32(tree->node_count); + head->free_nodes = cpu_to_be32(tree->free_nodes); + head->attributes = cpu_to_be32(tree->attributes); + head->depth = cpu_to_be16(tree->depth); + + kunmap(page); + set_page_dirty(page); + hfs_bnode_put(node); + return 0; +} + +static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) +{ + struct hfs_btree *tree = prev->tree; + struct hfs_bnode *node; + struct hfs_bnode_desc desc; + __be32 cnid; + + node = hfs_bnode_create(tree, idx); + if (IS_ERR(node)) + return node; + + tree->free_nodes--; + prev->next = idx; + cnid = cpu_to_be32(idx); + hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); + + node->type = HFS_NODE_MAP; + node->num_recs = 1; + hfs_bnode_clear(node, 0, tree->node_size); + desc.next = 0; + desc.prev = 0; + desc.type = HFS_NODE_MAP; + desc.height = 0; + desc.num_recs = cpu_to_be16(1); + desc.reserved = 0; + hfs_bnode_write(node, &desc, 0, sizeof(desc)); + hfs_bnode_write_u16(node, 14, 0x8000); + hfs_bnode_write_u16(node, tree->node_size - 2, 14); + hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); + + return node; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i; + + while (!tree->free_nodes) { + struct inode *inode = tree->inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 count; + int res; + + res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree)); + if (res) + return ERR_PTR(res); + hip->phys_size = inode->i_size = + (loff_t)hip->alloc_blocks << + HFSPLUS_SB(tree->sb)->alloc_blksz_shift; + hip->fs_blocks = + hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; + inode_set_bytes(inode, inode->i_size); + count = inode->i_size >> tree->node_size_shift; + tree->free_nodes = count - tree->node_count; + tree->node_count = count; + } + + nidx = 0; + node = hfs_bnode_find(tree, nidx); + if (IS_ERR(node)) + return node; + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + idx = 0; + + for (;;) { + while (len) { + byte = data[off]; + if (byte != 0xff) { + for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { + if (!(byte & m)) { + idx += i; + data[off] |= m; + set_page_dirty(*pagep); + kunmap(*pagep); + tree->free_nodes--; + mark_inode_dirty(tree->inode); + hfs_bnode_put(node); + return hfs_bnode_create(tree, + idx); + } + } + } + if (++off >= PAGE_CACHE_SIZE) { + kunmap(*pagep); + data = kmap(*++pagep); + off = 0; + } + idx += 8; + len--; + } + kunmap(*pagep); + nidx = node->next; + if (!nidx) { + hfs_dbg(BNODE_MOD, "create new bmap node\n"); + next_node = hfs_bmap_new_bmap(node, idx); + } else + next_node = hfs_bnode_find(tree, nidx); + hfs_bnode_put(node); + if (IS_ERR(next_node)) + return next_node; + node = next_node; + + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + } +} + +void hfs_bmap_free(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct page *page; + u16 off, len; + u32 nidx; + u8 *data, byte, m; + + hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + BUG_ON(!node->this); + tree = node->tree; + nidx = node->this; + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + return; + len = hfs_brec_lenoff(node, 2, &off); + while (nidx >= len * 8) { + u32 i; + + nidx -= len * 8; + i = node->next; + hfs_bnode_put(node); + if (!i) { + /* panic */; + pr_crit("unable to free bnode %u. " + "bmap not found!\n", + node->this); + return; + } + node = hfs_bnode_find(tree, i); + if (IS_ERR(node)) + return; + if (node->type != HFS_NODE_MAP) { + /* panic */; + pr_crit("invalid bmap found! " + "(%u,%d)\n", + node->this, node->type); + hfs_bnode_put(node); + return; + } + len = hfs_brec_lenoff(node, 0, &off); + } + off += node->page_offset + nidx / 8; + page = node->page[off >> PAGE_CACHE_SHIFT]; + data = kmap(page); + off &= ~PAGE_CACHE_MASK; + m = 1 << (~nidx & 7); + byte = data[off]; + if (!(byte & m)) { + pr_crit("trying to free free bnode " + "%u(%d)\n", + node->this, node->type); + kunmap(page); + hfs_bnode_put(node); + return; + } + data[off] = byte & ~m; + set_page_dirty(page); + kunmap(page); + hfs_bnode_put(node); + tree->free_nodes++; + mark_inode_dirty(tree->inode); +} diff --git a/kernel/fs/hfsplus/catalog.c b/kernel/fs/hfsplus/catalog.c new file mode 100644 index 000000000..022974ab6 --- /dev/null +++ b/kernel/fs/hfsplus/catalog.c @@ -0,0 +1,521 @@ +/* + * linux/fs/hfsplus/catalog.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of catalog records + */ + + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1p, k2p; + + k1p = k1->cat.parent; + k2p = k2->cat.parent; + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; + + return hfsplus_strcasecmp(&k1->cat.name, &k2->cat.name); +} + +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1p, k2p; + + k1p = k1->cat.parent; + k2p = k2->cat.parent; + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; + + return hfsplus_strcmp(&k1->cat.name, &k2->cat.name); +} + +/* Generates key for catalog file/folders record. */ +int hfsplus_cat_build_key(struct super_block *sb, + hfsplus_btree_key *key, u32 parent, struct qstr *str) +{ + int len, err; + + key->cat.parent = cpu_to_be32(parent); + err = hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, + str->name, str->len); + if (unlikely(err < 0)) + return err; + + len = be16_to_cpu(key->cat.name.length); + key->key_len = cpu_to_be16(6 + 2 * len); + return 0; +} + +/* Generates key for catalog thread record. */ +void hfsplus_cat_build_key_with_cnid(struct super_block *sb, + hfsplus_btree_key *key, u32 parent) +{ + key->cat.parent = cpu_to_be32(parent); + key->cat.name.length = 0; + key->key_len = cpu_to_be16(6); +} + +static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, + struct hfsplus_unistr *name) +{ + int ustrlen; + + ustrlen = be16_to_cpu(name->length); + key->cat.parent = cpu_to_be32(parent); + key->cat.name.length = cpu_to_be16(ustrlen); + ustrlen *= 2; + memcpy(key->cat.name.unicode, name->unicode, ustrlen); + key->key_len = cpu_to_be16(6 + ustrlen); +} + +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms) +{ + if (inode->i_flags & S_IMMUTABLE) + perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; + else + perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE; + if (inode->i_flags & S_APPEND) + perms->rootflags |= HFSPLUS_FLG_APPEND; + else + perms->rootflags &= ~HFSPLUS_FLG_APPEND; + + perms->userflags = HFSPLUS_I(inode)->userflags; + perms->mode = cpu_to_be16(inode->i_mode); + perms->owner = cpu_to_be32(i_uid_read(inode)); + perms->group = cpu_to_be32(i_gid_read(inode)); + + if (S_ISREG(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_nlink); + else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_rdev); + else + perms->dev = 0; +} + +static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, + u32 cnid, struct inode *inode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + + if (S_ISDIR(inode->i_mode)) { + struct hfsplus_cat_folder *folder; + + folder = &entry->folder; + memset(folder, 0, sizeof(*folder)); + folder->type = cpu_to_be16(HFSPLUS_FOLDER); + if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) + folder->flags |= cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT); + folder->id = cpu_to_be32(inode->i_ino); + HFSPLUS_I(inode)->create_date = + folder->create_date = + folder->content_mod_date = + folder->attribute_mod_date = + folder->access_date = hfsp_now2mt(); + hfsplus_cat_set_perms(inode, &folder->permissions); + if (inode == sbi->hidden_dir) + /* invisible and namelocked */ + folder->user_info.frFlags = cpu_to_be16(0x5000); + return sizeof(*folder); + } else { + struct hfsplus_cat_file *file; + + file = &entry->file; + memset(file, 0, sizeof(*file)); + file->type = cpu_to_be16(HFSPLUS_FILE); + file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); + file->id = cpu_to_be32(cnid); + HFSPLUS_I(inode)->create_date = + file->create_date = + file->content_mod_date = + file->attribute_mod_date = + file->access_date = hfsp_now2mt(); + if (cnid == inode->i_ino) { + hfsplus_cat_set_perms(inode, &file->permissions); + if (S_ISLNK(inode->i_mode)) { + file->user_info.fdType = + cpu_to_be32(HFSP_SYMLINK_TYPE); + file->user_info.fdCreator = + cpu_to_be32(HFSP_SYMLINK_CREATOR); + } else { + file->user_info.fdType = + cpu_to_be32(sbi->type); + file->user_info.fdCreator = + cpu_to_be32(sbi->creator); + } + if (HFSPLUS_FLG_IMMUTABLE & + (file->permissions.rootflags | + file->permissions.userflags)) + file->flags |= + cpu_to_be16(HFSPLUS_FILE_LOCKED); + } else { + file->user_info.fdType = + cpu_to_be32(HFSP_HARDLINK_TYPE); + file->user_info.fdCreator = + cpu_to_be32(HFSP_HFSPLUS_CREATOR); + file->user_info.fdFlags = + cpu_to_be16(0x100); + file->create_date = + HFSPLUS_I(sbi->hidden_dir)->create_date; + file->permissions.dev = + cpu_to_be32(HFSPLUS_I(inode)->linkid); + } + return sizeof(*file); + } +} + +static int hfsplus_fill_cat_thread(struct super_block *sb, + hfsplus_cat_entry *entry, int type, + u32 parentid, struct qstr *str) +{ + int err; + + entry->type = cpu_to_be16(type); + entry->thread.reserved = 0; + entry->thread.parentID = cpu_to_be32(parentid); + err = hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, + str->name, str->len); + if (unlikely(err < 0)) + return err; + + return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2; +} + +/* Try to get a catalog entry for given catalog id */ +int hfsplus_find_cat(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd) +{ + hfsplus_cat_entry tmp; + int err; + u16 type; + + hfsplus_cat_build_key_with_cnid(sb, fd->search_key, cnid); + err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry)); + if (err) + return err; + + type = be16_to_cpu(tmp.type); + if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) { + pr_err("found bad thread record in catalog\n"); + return -EIO; + } + + if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { + pr_err("catalog name length corrupted\n"); + return -EIO; + } + + hfsplus_cat_build_key_uni(fd->search_key, + be32_to_cpu(tmp.thread.parentID), + &tmp.thread.nodeName); + return hfs_brec_find(fd, hfs_find_rec_by_key); +} + +static void hfsplus_subfolders_inc(struct inode *dir) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + + if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) { + /* + * Increment subfolder count. Note, the value is only meaningful + * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set. + */ + HFSPLUS_I(dir)->subfolders++; + } +} + +static void hfsplus_subfolders_dec(struct inode *dir) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + + if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) { + /* + * Decrement subfolder count. Note, the value is only meaningful + * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set. + * + * Check for zero. Some subfolders may have been created + * by an implementation ignorant of this counter. + */ + if (HFSPLUS_I(dir)->subfolders) + HFSPLUS_I(dir)->subfolders--; + } +} + +int hfsplus_create_cat(u32 cnid, struct inode *dir, + struct qstr *str, struct inode *inode) +{ + struct super_block *sb = dir->i_sb; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + int entry_size; + int err; + + hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + str->name, cnid, inode->i_nlink); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; + + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); + entry_size = hfsplus_fill_cat_thread(sb, &entry, + S_ISDIR(inode->i_mode) ? + HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, + dir->i_ino, str); + if (unlikely(entry_size < 0)) { + err = entry_size; + goto err2; + } + + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto err2; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err2; + + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + if (unlikely(err)) + goto err1; + + entry_size = hfsplus_cat_build_record(&entry, cnid, inode); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err != -ENOENT) { + /* panic? */ + if (!err) + err = -EEXIST; + goto err1; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err1; + + dir->i_size++; + if (S_ISDIR(inode->i_mode)) + hfsplus_subfolders_inc(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); + + hfs_find_exit(&fd); + return 0; + +err1: + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); + if (!hfs_brec_find(&fd, hfs_find_rec_by_key)) + hfs_brec_remove(&fd); +err2: + hfs_find_exit(&fd); + return err; +} + +int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) +{ + struct super_block *sb = dir->i_sb; + struct hfs_find_data fd; + struct hfsplus_fork_raw fork; + struct list_head *pos; + int err, off; + u16 type; + + hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; + + if (!str) { + int len; + + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err) + goto out; + + off = fd.entryoffset + + offsetof(struct hfsplus_cat_thread, nodeName); + fd.search_key->cat.parent = cpu_to_be32(dir->i_ino); + hfs_bnode_read(fd.bnode, + &fd.search_key->cat.name.length, off, 2); + len = be16_to_cpu(fd.search_key->cat.name.length) * 2; + hfs_bnode_read(fd.bnode, + &fd.search_key->cat.name.unicode, + off + 2, len); + fd.search_key->key_len = cpu_to_be16(6 + len); + } else { + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + if (unlikely(err)) + goto out; + } + + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err) + goto out; + + type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + if (type == HFSPLUS_FILE) { +#if 0 + off = fd.entryoffset + offsetof(hfsplus_cat_file, data_fork); + hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); + hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA); +#endif + + off = fd.entryoffset + + offsetof(struct hfsplus_cat_file, rsrc_fork); + hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); + hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); + } + + list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) { + struct hfsplus_readdir_data *rd = + list_entry(pos, struct hfsplus_readdir_data, list); + if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) + rd->file->f_pos--; + } + + err = hfs_brec_remove(&fd); + if (err) + goto out; + + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err) + goto out; + + err = hfs_brec_remove(&fd); + if (err) + goto out; + + dir->i_size--; + if (type == HFSPLUS_FOLDER) + hfsplus_subfolders_dec(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); + + if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { + if (HFSPLUS_SB(sb)->attr_tree) + hfsplus_delete_all_attrs(dir, cnid); + } + +out: + hfs_find_exit(&fd); + + return err; +} + +int hfsplus_rename_cat(u32 cnid, + struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name) +{ + struct super_block *sb = src_dir->i_sb; + struct hfs_find_data src_fd, dst_fd; + hfsplus_cat_entry entry; + int entry_size, type; + int err; + + hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + cnid, src_dir->i_ino, src_name->name, + dst_dir->i_ino, dst_name->name); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); + if (err) + return err; + dst_fd = src_fd; + + /* find the old dir entry and read the data */ + err = hfsplus_cat_build_key(sb, src_fd.search_key, + src_dir->i_ino, src_name); + if (unlikely(err)) + goto out; + + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); + if (err) + goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, + src_fd.entrylength); + type = be16_to_cpu(entry.type); + + /* create new dir entry with the data from the old entry */ + err = hfsplus_cat_build_key(sb, dst_fd.search_key, + dst_dir->i_ino, dst_name); + if (unlikely(err)) + goto out; + + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + + err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength); + if (err) + goto out; + dst_dir->i_size++; + if (type == HFSPLUS_FOLDER) + hfsplus_subfolders_inc(dst_dir); + dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; + + /* finally remove the old entry */ + err = hfsplus_cat_build_key(sb, src_fd.search_key, + src_dir->i_ino, src_name); + if (unlikely(err)) + goto out; + + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + src_dir->i_size--; + if (type == HFSPLUS_FOLDER) + hfsplus_subfolders_dec(src_dir); + src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; + + /* remove old thread entry */ + hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); + if (err) + goto out; + type = hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset); + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + + /* create new thread entry */ + hfsplus_cat_build_key_with_cnid(sb, dst_fd.search_key, cnid); + entry_size = hfsplus_fill_cat_thread(sb, &entry, type, + dst_dir->i_ino, dst_name); + if (unlikely(entry_size < 0)) { + err = entry_size; + goto out; + } + + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + err = hfs_brec_insert(&dst_fd, &entry, entry_size); + + hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY); + hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY); +out: + hfs_bnode_put(dst_fd.bnode); + hfs_find_exit(&src_fd); + return err; +} diff --git a/kernel/fs/hfsplus/dir.c b/kernel/fs/hfsplus/dir.c new file mode 100644 index 000000000..d0f39dcbb --- /dev/null +++ b/kernel/fs/hfsplus/dir.c @@ -0,0 +1,576 @@ +/* + * linux/fs/hfsplus/dir.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of directories + */ + +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" +#include "xattr.h" +#include "acl.h" + +static inline void hfsplus_instantiate(struct dentry *dentry, + struct inode *inode, u32 cnid) +{ + dentry->d_fsdata = (void *)(unsigned long)cnid; + d_instantiate(dentry, inode); +} + +/* Find the entry inside dir named dentry->d_name */ +static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + struct hfs_find_data fd; + struct super_block *sb; + hfsplus_cat_entry entry; + int err; + u32 cnid, linkid = 0; + u16 type; + + sb = dir->i_sb; + + dentry->d_fsdata = NULL; + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return ERR_PTR(err); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, + &dentry->d_name); + if (unlikely(err < 0)) + goto fail; +again: + err = hfs_brec_read(&fd, &entry, sizeof(entry)); + if (err) { + if (err == -ENOENT) { + hfs_find_exit(&fd); + /* No such entry */ + inode = NULL; + goto out; + } + goto fail; + } + type = be16_to_cpu(entry.type); + if (type == HFSPLUS_FOLDER) { + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { + err = -EIO; + goto fail; + } + cnid = be32_to_cpu(entry.folder.id); + dentry->d_fsdata = (void *)(unsigned long)cnid; + } else if (type == HFSPLUS_FILE) { + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { + err = -EIO; + goto fail; + } + cnid = be32_to_cpu(entry.file.id); + if (entry.file.user_info.fdType == + cpu_to_be32(HFSP_HARDLINK_TYPE) && + entry.file.user_info.fdCreator == + cpu_to_be32(HFSP_HFSPLUS_CREATOR) && + (entry.file.create_date == + HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> + create_date || + entry.file.create_date == + HFSPLUS_I(d_inode(sb->s_root))-> + create_date) && + HFSPLUS_SB(sb)->hidden_dir) { + struct qstr str; + char name[32]; + + if (dentry->d_fsdata) { + /* + * We found a link pointing to another link, + * so ignore it and treat it as regular file. + */ + cnid = (unsigned long)dentry->d_fsdata; + linkid = 0; + } else { + dentry->d_fsdata = (void *)(unsigned long)cnid; + linkid = + be32_to_cpu(entry.file.permissions.dev); + str.len = sprintf(name, "iNode%d", linkid); + str.name = name; + err = hfsplus_cat_build_key(sb, fd.search_key, + HFSPLUS_SB(sb)->hidden_dir->i_ino, + &str); + if (unlikely(err < 0)) + goto fail; + goto again; + } + } else if (!dentry->d_fsdata) + dentry->d_fsdata = (void *)(unsigned long)cnid; + } else { + pr_err("invalid catalog entry type in lookup\n"); + err = -EIO; + goto fail; + } + hfs_find_exit(&fd); + inode = hfsplus_iget(dir->i_sb, cnid); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (S_ISREG(inode->i_mode)) + HFSPLUS_I(inode)->linkid = linkid; +out: + d_add(dentry, inode); + return NULL; +fail: + hfs_find_exit(&fd); + return ERR_PTR(err); +} + +static int hfsplus_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + int len, err; + char *strbuf; + hfsplus_cat_entry entry; + struct hfs_find_data fd; + struct hfsplus_readdir_data *rd; + u16 type; + + if (file->f_pos >= inode->i_size) + return 0; + + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; + strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN + 1, GFP_KERNEL); + if (!strbuf) { + err = -ENOMEM; + goto out; + } + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err) + goto out; + + if (ctx->pos == 0) { + /* This is completely artificial... */ + if (!dir_emit_dot(file, ctx)) + goto out; + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + fd.entrylength); + if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { + pr_err("bad catalog folder thread\n"); + err = -EIO; + goto out; + } + if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) { + pr_err("truncated catalog thread\n"); + err = -EIO; + goto out; + } + if (!dir_emit(ctx, "..", 2, + be32_to_cpu(entry.thread.parentID), DT_DIR)) + goto out; + ctx->pos = 2; + } + if (ctx->pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, ctx->pos - 1); + if (err) + goto out; + for (;;) { + if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { + pr_err("walked past end of dir\n"); + err = -EIO; + goto out; + } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + fd.entrylength); + type = be16_to_cpu(entry.type); + len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN; + err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); + if (err) + goto out; + if (type == HFSPLUS_FOLDER) { + if (fd.entrylength < + sizeof(struct hfsplus_cat_folder)) { + pr_err("small dir entry\n"); + err = -EIO; + goto out; + } + if (HFSPLUS_SB(sb)->hidden_dir && + HFSPLUS_SB(sb)->hidden_dir->i_ino == + be32_to_cpu(entry.folder.id)) + goto next; + if (!dir_emit(ctx, strbuf, len, + be32_to_cpu(entry.folder.id), DT_DIR)) + break; + } else if (type == HFSPLUS_FILE) { + u16 mode; + unsigned type = DT_UNKNOWN; + + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { + pr_err("small file entry\n"); + err = -EIO; + goto out; + } + + mode = be16_to_cpu(entry.file.permissions.mode); + if (S_ISREG(mode)) + type = DT_REG; + else if (S_ISLNK(mode)) + type = DT_LNK; + else if (S_ISFIFO(mode)) + type = DT_FIFO; + else if (S_ISCHR(mode)) + type = DT_CHR; + else if (S_ISBLK(mode)) + type = DT_BLK; + else if (S_ISSOCK(mode)) + type = DT_SOCK; + + if (!dir_emit(ctx, strbuf, len, + be32_to_cpu(entry.file.id), type)) + break; + } else { + pr_err("bad catalog entry type\n"); + err = -EIO; + goto out; + } +next: + ctx->pos++; + if (ctx->pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, 1); + if (err) + goto out; + } + rd = file->private_data; + if (!rd) { + rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); + if (!rd) { + err = -ENOMEM; + goto out; + } + file->private_data = rd; + rd->file = file; + list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); + } + memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); +out: + kfree(strbuf); + hfs_find_exit(&fd); + return err; +} + +static int hfsplus_dir_release(struct inode *inode, struct file *file) +{ + struct hfsplus_readdir_data *rd = file->private_data; + if (rd) { + mutex_lock(&inode->i_mutex); + list_del(&rd->list); + mutex_unlock(&inode->i_mutex); + kfree(rd); + } + return 0; +} + +static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, + struct dentry *dst_dentry) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); + struct inode *inode = d_inode(src_dentry); + struct inode *src_dir = d_inode(src_dentry->d_parent); + struct qstr str; + char name[32]; + u32 cnid, id; + int res; + + if (HFSPLUS_IS_RSRC(inode)) + return -EPERM; + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + mutex_lock(&sbi->vh_mutex); + if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { + for (;;) { + get_random_bytes(&id, sizeof(cnid)); + id &= 0x3fffffff; + str.name = name; + str.len = sprintf(name, "iNode%d", id); + res = hfsplus_rename_cat(inode->i_ino, + src_dir, &src_dentry->d_name, + sbi->hidden_dir, &str); + if (!res) + break; + if (res != -EEXIST) + goto out; + } + HFSPLUS_I(inode)->linkid = id; + cnid = sbi->next_cnid++; + src_dentry->d_fsdata = (void *)(unsigned long)cnid; + res = hfsplus_create_cat(cnid, src_dir, + &src_dentry->d_name, inode); + if (res) + /* panic? */ + goto out; + sbi->file_count++; + } + cnid = sbi->next_cnid++; + res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); + if (res) + goto out; + + inc_nlink(inode); + hfsplus_instantiate(dst_dentry, inode, cnid); + ihold(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + sbi->file_count++; + hfsplus_mark_mdb_dirty(dst_dir->i_sb); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode = d_inode(dentry); + struct qstr str; + char name[32]; + u32 cnid; + int res; + + if (HFSPLUS_IS_RSRC(inode)) + return -EPERM; + + mutex_lock(&sbi->vh_mutex); + cnid = (u32)(unsigned long)dentry->d_fsdata; + if (inode->i_ino == cnid && + atomic_read(&HFSPLUS_I(inode)->opencnt)) { + str.name = name; + str.len = sprintf(name, "temp%lu", inode->i_ino); + res = hfsplus_rename_cat(inode->i_ino, + dir, &dentry->d_name, + sbi->hidden_dir, &str); + if (!res) { + inode->i_flags |= S_DEAD; + drop_nlink(inode); + } + goto out; + } + res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); + if (res) + goto out; + + if (inode->i_nlink > 0) + drop_nlink(inode); + if (inode->i_ino == cnid) + clear_nlink(inode); + if (!inode->i_nlink) { + if (inode->i_ino != cnid) { + sbi->file_count--; + if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) { + res = hfsplus_delete_cat(inode->i_ino, + sbi->hidden_dir, + NULL); + if (!res) + hfsplus_delete_inode(inode); + } else + inode->i_flags |= S_DEAD; + } else + hfsplus_delete_inode(inode); + } else + sbi->file_count--; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode = d_inode(dentry); + int res; + + if (inode->i_size != 2) + return -ENOTEMPTY; + + mutex_lock(&sbi->vh_mutex); + res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + if (res) + goto out; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + hfsplus_delete_inode(inode); + mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode; + int res = -ENOMEM; + + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); + if (!inode) + goto out; + + res = page_symlink(inode, symname, strlen(symname) + 1); + if (res) + goto out_err; + + res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); + if (res) + goto out_err; + + res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + if (res == -EOPNOTSUPP) + res = 0; /* Operation is not supported. */ + else if (res) { + /* Try to delete anyway without error analysis. */ + hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + goto out_err; + } + + hfsplus_instantiate(dentry, inode, inode->i_ino); + mark_inode_dirty(inode); + goto out; + +out_err: + clear_nlink(inode); + hfsplus_delete_inode(inode); + iput(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode; + int res = -ENOMEM; + + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, mode); + if (!inode) + goto out; + + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) + init_special_inode(inode, mode, rdev); + + res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); + if (res) + goto failed_mknod; + + res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + if (res == -EOPNOTSUPP) + res = 0; /* Operation is not supported. */ + else if (res) { + /* Try to delete anyway without error analysis. */ + hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + goto failed_mknod; + } + + hfsplus_instantiate(dentry, inode, inode->i_ino); + mark_inode_dirty(inode); + goto out; + +failed_mknod: + clear_nlink(inode); + hfsplus_delete_inode(inode); + iput(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return hfsplus_mknod(dir, dentry, mode, 0); +} + +static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); +} + +static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int res; + + /* Unlink destination if it already exists */ + if (d_really_is_positive(new_dentry)) { + if (d_is_dir(new_dentry)) + res = hfsplus_rmdir(new_dir, new_dentry); + else + res = hfsplus_unlink(new_dir, new_dentry); + if (res) + return res; + } + + res = hfsplus_rename_cat((u32)(unsigned long)old_dentry->d_fsdata, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + if (!res) + new_dentry->d_fsdata = old_dentry->d_fsdata; + return res; +} + +const struct inode_operations hfsplus_dir_inode_operations = { + .lookup = hfsplus_lookup, + .create = hfsplus_create, + .link = hfsplus_link, + .unlink = hfsplus_unlink, + .mkdir = hfsplus_mkdir, + .rmdir = hfsplus_rmdir, + .symlink = hfsplus_symlink, + .mknod = hfsplus_mknod, + .rename = hfsplus_rename, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = hfsplus_listxattr, + .removexattr = generic_removexattr, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, +#endif +}; + +const struct file_operations hfsplus_dir_operations = { + .fsync = hfsplus_file_fsync, + .read = generic_read_dir, + .iterate = hfsplus_readdir, + .unlocked_ioctl = hfsplus_ioctl, + .llseek = generic_file_llseek, + .release = hfsplus_dir_release, +}; diff --git a/kernel/fs/hfsplus/extents.c b/kernel/fs/hfsplus/extents.c new file mode 100644 index 000000000..feca524ce --- /dev/null +++ b/kernel/fs/hfsplus/extents.c @@ -0,0 +1,611 @@ +/* + * linux/fs/hfsplus/extents.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of Extents both in catalog and extents overflow trees + */ + +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Compare two extents keys, returns 0 on same, pos/neg for difference */ +int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1id, k2id; + __be32 k1s, k2s; + + k1id = k1->ext.cnid; + k2id = k2->ext.cnid; + if (k1id != k2id) + return be32_to_cpu(k1id) < be32_to_cpu(k2id) ? -1 : 1; + + if (k1->ext.fork_type != k2->ext.fork_type) + return k1->ext.fork_type < k2->ext.fork_type ? -1 : 1; + + k1s = k1->ext.start_block; + k2s = k2->ext.start_block; + if (k1s == k2s) + return 0; + return be32_to_cpu(k1s) < be32_to_cpu(k2s) ? -1 : 1; +} + +static void hfsplus_ext_build_key(hfsplus_btree_key *key, u32 cnid, + u32 block, u8 type) +{ + key->key_len = cpu_to_be16(HFSPLUS_EXT_KEYLEN - 2); + key->ext.cnid = cpu_to_be32(cnid); + key->ext.start_block = cpu_to_be32(block); + key->ext.fork_type = type; + key->ext.pad = 0; +} + +static u32 hfsplus_ext_find_block(struct hfsplus_extent *ext, u32 off) +{ + int i; + u32 count; + + for (i = 0; i < 8; ext++, i++) { + count = be32_to_cpu(ext->block_count); + if (off < count) + return be32_to_cpu(ext->start_block) + off; + off -= count; + } + /* panic? */ + return 0; +} + +static int hfsplus_ext_block_count(struct hfsplus_extent *ext) +{ + int i; + u32 count = 0; + + for (i = 0; i < 8; ext++, i++) + count += be32_to_cpu(ext->block_count); + return count; +} + +static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) +{ + int i; + + ext += 7; + for (i = 0; i < 7; ext--, i++) + if (ext->block_count) + break; + return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); +} + +static int __hfsplus_ext_write_extent(struct inode *inode, + struct hfs_find_data *fd) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res; + + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start, + HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + + res = hfs_brec_find(fd, hfs_find_rec_by_key); + if (hip->extent_state & HFSPLUS_EXT_NEW) { + if (res != -ENOENT) + return res; + hfs_brec_insert(fd, hip->cached_extents, + sizeof(hfsplus_extent_rec)); + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + } else { + if (res) + return res; + hfs_bnode_write(fd->bnode, hip->cached_extents, + fd->entryoffset, fd->entrylength); + hip->extent_state &= ~HFSPLUS_EXT_DIRTY; + } + + /* + * We can't just use hfsplus_mark_inode_dirty here, because we + * also get called from hfsplus_write_inode, which should not + * redirty the inode. Instead the callers have to be careful + * to explicily mark the inode dirty, too. + */ + set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); + + return 0; +} + +static int hfsplus_ext_write_extent_locked(struct inode *inode) +{ + int res = 0; + + if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { + struct hfs_find_data fd; + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + if (res) + return res; + res = __hfsplus_ext_write_extent(inode, &fd); + hfs_find_exit(&fd); + } + return res; +} + +int hfsplus_ext_write_extent(struct inode *inode) +{ + int res; + + mutex_lock(&HFSPLUS_I(inode)->extents_lock); + res = hfsplus_ext_write_extent_locked(inode); + mutex_unlock(&HFSPLUS_I(inode)->extents_lock); + + return res; +} + +static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, + struct hfsplus_extent *extent, + u32 cnid, u32 block, u8 type) +{ + int res; + + hfsplus_ext_build_key(fd->search_key, cnid, block, type); + fd->key->ext.cnid = 0; + res = hfs_brec_find(fd, hfs_find_rec_by_key); + if (res && res != -ENOENT) + return res; + if (fd->key->ext.cnid != fd->search_key->ext.cnid || + fd->key->ext.fork_type != fd->search_key->ext.fork_type) + return -ENOENT; + if (fd->entrylength != sizeof(hfsplus_extent_rec)) + return -EIO; + hfs_bnode_read(fd->bnode, extent, fd->entryoffset, + sizeof(hfsplus_extent_rec)); + return 0; +} + +static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, + struct inode *inode, u32 block) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res; + + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + if (hip->extent_state & HFSPLUS_EXT_DIRTY) { + res = __hfsplus_ext_write_extent(inode, fd); + if (res) + return res; + } + + res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, + block, HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : + HFSPLUS_TYPE_DATA); + if (!res) { + hip->cached_start = be32_to_cpu(fd->key->ext.start_block); + hip->cached_blocks = + hfsplus_ext_block_count(hip->cached_extents); + } else { + hip->cached_start = hip->cached_blocks = 0; + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + } + return res; +} + +static int hfsplus_ext_read_extent(struct inode *inode, u32 block) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfs_find_data fd; + int res; + + if (block >= hip->cached_start && + block < hip->cached_start + hip->cached_blocks) + return 0; + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + if (!res) { + res = __hfsplus_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + } + return res; +} + +/* Get a block at iblock for inode, possibly allocating if create */ +int hfsplus_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res = -EIO; + u32 ablock, dblock, mask; + sector_t sector; + int was_dirty = 0; + + /* Convert inode block to disk allocation block */ + ablock = iblock >> sbi->fs_shift; + + if (iblock >= hip->fs_blocks) { + if (iblock > hip->fs_blocks || !create) + return -EIO; + if (ablock >= hip->alloc_blocks) { + res = hfsplus_file_extend(inode, false); + if (res) + return res; + } + } else + create = 0; + + if (ablock < hip->first_blocks) { + dblock = hfsplus_ext_find_block(hip->first_extents, ablock); + goto done; + } + + if (inode->i_ino == HFSPLUS_EXT_CNID) + return -EIO; + + mutex_lock(&hip->extents_lock); + + /* + * hfsplus_ext_read_extent will write out a cached extent into + * the extents btree. In that case we may have to mark the inode + * dirty even for a pure read of an extent here. + */ + was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY); + res = hfsplus_ext_read_extent(inode, ablock); + if (res) { + mutex_unlock(&hip->extents_lock); + return -EIO; + } + dblock = hfsplus_ext_find_block(hip->cached_extents, + ablock - hip->cached_start); + mutex_unlock(&hip->extents_lock); + +done: + hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", + inode->i_ino, (long long)iblock, dblock); + + mask = (1 << sbi->fs_shift) - 1; + sector = ((sector_t)dblock << sbi->fs_shift) + + sbi->blockoffset + (iblock & mask); + map_bh(bh_result, sb, sector); + + if (create) { + set_buffer_new(bh_result); + hip->phys_size += sb->s_blocksize; + hip->fs_blocks++; + inode_add_bytes(inode, sb->s_blocksize); + } + if (create || was_dirty) + mark_inode_dirty(inode); + return 0; +} + +static void hfsplus_dump_extent(struct hfsplus_extent *extent) +{ + int i; + + hfs_dbg(EXTENT, " "); + for (i = 0; i < 8; i++) + hfs_dbg_cont(EXTENT, " %u:%u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg_cont(EXTENT, "\n"); +} + +static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, + u32 alloc_block, u32 block_count) +{ + u32 count, start; + int i; + + hfsplus_dump_extent(extent); + for (i = 0; i < 8; extent++, i++) { + count = be32_to_cpu(extent->block_count); + if (offset == count) { + start = be32_to_cpu(extent->start_block); + if (alloc_block != start + count) { + if (++i >= 8) + return -ENOSPC; + extent++; + extent->start_block = cpu_to_be32(alloc_block); + } else + block_count += count; + extent->block_count = cpu_to_be32(block_count); + return 0; + } else if (offset < count) + break; + offset -= count; + } + /* panic? */ + return -EIO; +} + +static int hfsplus_free_extents(struct super_block *sb, + struct hfsplus_extent *extent, + u32 offset, u32 block_nr) +{ + u32 count, start; + int i; + int err = 0; + + hfsplus_dump_extent(extent); + for (i = 0; i < 8; extent++, i++) { + count = be32_to_cpu(extent->block_count); + if (offset == count) + goto found; + else if (offset < count) + break; + offset -= count; + } + /* panic? */ + return -EIO; +found: + for (;;) { + start = be32_to_cpu(extent->start_block); + if (count <= block_nr) { + err = hfsplus_block_free(sb, start, count); + if (err) { + pr_err("can't free extent\n"); + hfs_dbg(EXTENT, " start: %u count: %u\n", + start, count); + } + extent->block_count = 0; + extent->start_block = 0; + block_nr -= count; + } else { + count -= block_nr; + err = hfsplus_block_free(sb, start + count, block_nr); + if (err) { + pr_err("can't free extent\n"); + hfs_dbg(EXTENT, " start: %u count: %u\n", + start, count); + } + extent->block_count = cpu_to_be32(count); + block_nr = 0; + } + if (!block_nr || !i) { + /* + * Try to free all extents and + * return only last error + */ + return err; + } + i--; + extent--; + count = be32_to_cpu(extent->block_count); + } +} + +int hfsplus_free_fork(struct super_block *sb, u32 cnid, + struct hfsplus_fork_raw *fork, int type) +{ + struct hfs_find_data fd; + hfsplus_extent_rec ext_entry; + u32 total_blocks, blocks, start; + int res, i; + + total_blocks = be32_to_cpu(fork->total_blocks); + if (!total_blocks) + return 0; + + blocks = 0; + for (i = 0; i < 8; i++) + blocks += be32_to_cpu(fork->extents[i].block_count); + + res = hfsplus_free_extents(sb, fork->extents, blocks, blocks); + if (res) + return res; + if (total_blocks == blocks) + return 0; + + res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + if (res) + return res; + do { + res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, + total_blocks, type); + if (res) + break; + start = be32_to_cpu(fd.key->ext.start_block); + hfsplus_free_extents(sb, ext_entry, + total_blocks - start, + total_blocks); + hfs_brec_remove(&fd); + total_blocks = start; + } while (total_blocks > blocks); + hfs_find_exit(&fd); + + return res; +} + +int hfsplus_file_extend(struct inode *inode, bool zeroout) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 start, len, goal; + int res; + + if (sbi->alloc_file->i_size * 8 < + sbi->total_blocks - sbi->free_blocks + 8) { + /* extend alloc file */ + pr_err("extend alloc file! (%llu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); + return -ENOSPC; + } + + mutex_lock(&hip->extents_lock); + if (hip->alloc_blocks == hip->first_blocks) + goal = hfsplus_ext_lastblock(hip->first_extents); + else { + res = hfsplus_ext_read_extent(inode, hip->alloc_blocks); + if (res) + goto out; + goal = hfsplus_ext_lastblock(hip->cached_extents); + } + + len = hip->clump_blocks; + start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len); + if (start >= sbi->total_blocks) { + start = hfsplus_block_allocate(sb, goal, 0, &len); + if (start >= goal) { + res = -ENOSPC; + goto out; + } + } + + if (zeroout) { + res = sb_issue_zeroout(sb, start, len, GFP_NOFS); + if (res) + goto out; + } + + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + + if (hip->alloc_blocks <= hip->first_blocks) { + if (!hip->first_blocks) { + hfs_dbg(EXTENT, "first extents\n"); + /* no extents yet */ + hip->first_extents[0].start_block = cpu_to_be32(start); + hip->first_extents[0].block_count = cpu_to_be32(len); + res = 0; + } else { + /* try to append to extents in inode */ + res = hfsplus_add_extent(hip->first_extents, + hip->alloc_blocks, + start, len); + if (res == -ENOSPC) + goto insert_extent; + } + if (!res) { + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks += len; + } + } else { + res = hfsplus_add_extent(hip->cached_extents, + hip->alloc_blocks - hip->cached_start, + start, len); + if (!res) { + hfsplus_dump_extent(hip->cached_extents); + hip->extent_state |= HFSPLUS_EXT_DIRTY; + hip->cached_blocks += len; + } else if (res == -ENOSPC) + goto insert_extent; + } +out: + if (!res) { + hip->alloc_blocks += len; + mutex_unlock(&hip->extents_lock); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); + return 0; + } + mutex_unlock(&hip->extents_lock); + return res; + +insert_extent: + hfs_dbg(EXTENT, "insert new extent\n"); + res = hfsplus_ext_write_extent_locked(inode); + if (res) + goto out; + + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_extents[0].start_block = cpu_to_be32(start); + hip->cached_extents[0].block_count = cpu_to_be32(len); + hfsplus_dump_extent(hip->cached_extents); + hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW; + hip->cached_start = hip->alloc_blocks; + hip->cached_blocks = len; + + res = 0; + goto out; +} + +void hfsplus_file_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfs_find_data fd; + u32 alloc_cnt, blk_cnt, start; + int res; + + hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + inode->i_ino, (long long)hip->phys_size, inode->i_size); + + if (inode->i_size > hip->phys_size) { + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + loff_t size = inode->i_size; + + res = pagecache_write_begin(NULL, mapping, size, 0, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (res) + return; + res = pagecache_write_end(NULL, mapping, size, + 0, 0, page, fsdata); + if (res < 0) + return; + mark_inode_dirty(inode); + return; + } else if (inode->i_size == hip->phys_size) + return; + + blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> + HFSPLUS_SB(sb)->alloc_blksz_shift; + + mutex_lock(&hip->extents_lock); + + alloc_cnt = hip->alloc_blocks; + if (blk_cnt == alloc_cnt) + goto out_unlock; + + res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + if (res) { + mutex_unlock(&hip->extents_lock); + /* XXX: We lack error handling of hfsplus_file_truncate() */ + return; + } + while (1) { + if (alloc_cnt == hip->first_blocks) { + hfsplus_free_extents(sb, hip->first_extents, + alloc_cnt, alloc_cnt - blk_cnt); + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks = blk_cnt; + break; + } + res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); + if (res) + break; + start = hip->cached_start; + hfsplus_free_extents(sb, hip->cached_extents, + alloc_cnt - start, alloc_cnt - blk_cnt); + hfsplus_dump_extent(hip->cached_extents); + if (blk_cnt > start) { + hip->extent_state |= HFSPLUS_EXT_DIRTY; + break; + } + alloc_cnt = start; + hip->cached_start = hip->cached_blocks = 0; + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + hfs_brec_remove(&fd); + } + hfs_find_exit(&fd); + + hip->alloc_blocks = blk_cnt; +out_unlock: + mutex_unlock(&hip->extents_lock); + hip->phys_size = inode->i_size; + hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); +} diff --git a/kernel/fs/hfsplus/hfsplus_fs.h b/kernel/fs/hfsplus/hfsplus_fs.h new file mode 100644 index 000000000..b0441d65f --- /dev/null +++ b/kernel/fs/hfsplus/hfsplus_fs.h @@ -0,0 +1,540 @@ +/* + * linux/include/linux/hfsplus_fs.h + * + * Copyright (C) 1999 + * Brad Boyer (flar@pants.nu) + * (C) 2003 Ardis Technologies + * + */ + +#ifndef _LINUX_HFSPLUS_FS_H +#define _LINUX_HFSPLUS_FS_H + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "hfsplus_raw.h" + +#define DBG_BNODE_REFS 0x00000001 +#define DBG_BNODE_MOD 0x00000002 +#define DBG_CAT_MOD 0x00000004 +#define DBG_INODE 0x00000008 +#define DBG_SUPER 0x00000010 +#define DBG_EXTENT 0x00000020 +#define DBG_BITMAP 0x00000040 +#define DBG_ATTR_MOD 0x00000080 +#define DBG_ACL_MOD 0x00000100 + +#if 0 +#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) +#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) +#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) +#endif +#define DBG_MASK (0) + +#define hfs_dbg(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +} while (0) + +#define hfs_dbg_cont(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + pr_cont(fmt, ##__VA_ARGS__); \ +} while (0) + +/* Runtime config options */ +#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ + +#define HFSPLUS_TYPE_DATA 0x00 +#define HFSPLUS_TYPE_RSRC 0xFF + +typedef int (*btree_keycmp)(const hfsplus_btree_key *, + const hfsplus_btree_key *); + +#define NODE_HASH_SIZE 256 + +/* B-tree mutex nested subclasses */ +enum hfsplus_btree_mutex_classes { + CATALOG_BTREE_MUTEX, + EXTENTS_BTREE_MUTEX, + ATTR_BTREE_MUTEX, +}; + +/* An HFS+ BTree held in memory */ +struct hfs_btree { + struct super_block *sb; + struct inode *inode; + btree_keycmp keycmp; + + u32 cnid; + u32 root; + u32 leaf_count; + u32 leaf_head; + u32 leaf_tail; + u32 node_count; + u32 free_nodes; + u32 attributes; + + unsigned int node_size; + unsigned int node_size_shift; + unsigned int max_key_len; + unsigned int depth; + + struct mutex tree_lock; + + unsigned int pages_per_bnode; + spinlock_t hash_lock; + struct hfs_bnode *node_hash[NODE_HASH_SIZE]; + int node_hash_cnt; +}; + +struct page; + +/* An HFS+ BTree node in memory */ +struct hfs_bnode { + struct hfs_btree *tree; + + u32 prev; + u32 this; + u32 next; + u32 parent; + + u16 num_recs; + u8 type; + u8 height; + + struct hfs_bnode *next_hash; + unsigned long flags; + wait_queue_head_t lock_wq; + atomic_t refcnt; + unsigned int page_offset; + struct page *page[0]; +}; + +#define HFS_BNODE_LOCK 0 +#define HFS_BNODE_ERROR 1 +#define HFS_BNODE_NEW 2 +#define HFS_BNODE_DIRTY 3 +#define HFS_BNODE_DELETED 4 + +/* + * Attributes file states + */ +#define HFSPLUS_EMPTY_ATTR_TREE 0 +#define HFSPLUS_CREATING_ATTR_TREE 1 +#define HFSPLUS_VALID_ATTR_TREE 2 +#define HFSPLUS_FAILED_ATTR_TREE 3 + +/* + * HFS+ superblock info (built from Volume Header on disk) + */ + +struct hfsplus_vh; +struct hfs_btree; + +struct hfsplus_sb_info { + void *s_vhdr_buf; + struct hfsplus_vh *s_vhdr; + void *s_backup_vhdr_buf; + struct hfsplus_vh *s_backup_vhdr; + struct hfs_btree *ext_tree; + struct hfs_btree *cat_tree; + struct hfs_btree *attr_tree; + atomic_t attr_tree_state; + struct inode *alloc_file; + struct inode *hidden_dir; + struct nls_table *nls; + + /* Runtime variables */ + u32 blockoffset; + sector_t part_start; + sector_t sect_count; + int fs_shift; + + /* immutable data from the volume header */ + u32 alloc_blksz; + int alloc_blksz_shift; + u32 total_blocks; + u32 data_clump_blocks, rsrc_clump_blocks; + + /* mutable data from the volume header, protected by alloc_mutex */ + u32 free_blocks; + struct mutex alloc_mutex; + + /* mutable data from the volume header, protected by vh_mutex */ + u32 next_cnid; + u32 file_count; + u32 folder_count; + struct mutex vh_mutex; + + /* Config options */ + u32 creator; + u32 type; + + umode_t umask; + kuid_t uid; + kgid_t gid; + + int part, session; + unsigned long flags; + + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work sync_work; /* FS sync delayed work */ + spinlock_t work_lock; /* protects sync_work and work_queued */ +}; + +#define HFSPLUS_SB_WRITEBACKUP 0 +#define HFSPLUS_SB_NODECOMPOSE 1 +#define HFSPLUS_SB_FORCE 2 +#define HFSPLUS_SB_HFSX 3 +#define HFSPLUS_SB_CASEFOLD 4 +#define HFSPLUS_SB_NOBARRIER 5 + +static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + + +struct hfsplus_inode_info { + atomic_t opencnt; + + /* + * Extent allocation information, protected by extents_lock. + */ + u32 first_blocks; + u32 clump_blocks; + u32 alloc_blocks; + u32 cached_start; + u32 cached_blocks; + hfsplus_extent_rec first_extents; + hfsplus_extent_rec cached_extents; + unsigned int extent_state; + struct mutex extents_lock; + + /* + * Immutable data. + */ + struct inode *rsrc_inode; + __be32 create_date; + + /* + * Protected by sbi->vh_mutex. + */ + u32 linkid; + + /* + * Accessed using atomic bitops. + */ + unsigned long flags; + + /* + * Protected by i_mutex. + */ + sector_t fs_blocks; + u8 userflags; /* BSD user file flags */ + u32 subfolders; /* Subfolder count (HFSX only) */ + struct list_head open_dir_list; + loff_t phys_size; + + struct inode vfs_inode; +}; + +#define HFSPLUS_EXT_DIRTY 0x0001 +#define HFSPLUS_EXT_NEW 0x0002 + +#define HFSPLUS_I_RSRC 0 /* represents a resource fork */ +#define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */ +#define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */ +#define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */ +#define HFSPLUS_I_ATTR_DIRTY 4 /* has changes in the attributes tree */ + +#define HFSPLUS_IS_RSRC(inode) \ + test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags) + +static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) +{ + return list_entry(inode, struct hfsplus_inode_info, vfs_inode); +} + +/* + * Mark an inode dirty, and also mark the btree in which the + * specific type of metadata is stored. + * For data or metadata that gets written back by into the catalog btree + * by hfsplus_write_inode a plain mark_inode_dirty call is enough. + */ +static inline void hfsplus_mark_inode_dirty(struct inode *inode, + unsigned int flag) +{ + set_bit(flag, &HFSPLUS_I(inode)->flags); + mark_inode_dirty(inode); +} + +struct hfs_find_data { + /* filled by caller */ + hfsplus_btree_key *search_key; + hfsplus_btree_key *key; + /* filled by find */ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + /* filled by findrec */ + int record; + int keyoffset, keylength; + int entryoffset, entrylength; +}; + +struct hfsplus_readdir_data { + struct list_head list; + struct file *file; + struct hfsplus_cat_key key; +}; + +/* + * Find minimum acceptible I/O size for an hfsplus sb. + */ +static inline unsigned short hfsplus_min_io_size(struct super_block *sb) +{ + return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + HFSPLUS_SECTOR_SIZE); +} + +#define hfs_btree_open hfsplus_btree_open +#define hfs_btree_close hfsplus_btree_close +#define hfs_btree_write hfsplus_btree_write +#define hfs_bmap_alloc hfsplus_bmap_alloc +#define hfs_bmap_free hfsplus_bmap_free +#define hfs_bnode_read hfsplus_bnode_read +#define hfs_bnode_read_u16 hfsplus_bnode_read_u16 +#define hfs_bnode_read_u8 hfsplus_bnode_read_u8 +#define hfs_bnode_read_key hfsplus_bnode_read_key +#define hfs_bnode_write hfsplus_bnode_write +#define hfs_bnode_write_u16 hfsplus_bnode_write_u16 +#define hfs_bnode_clear hfsplus_bnode_clear +#define hfs_bnode_copy hfsplus_bnode_copy +#define hfs_bnode_move hfsplus_bnode_move +#define hfs_bnode_dump hfsplus_bnode_dump +#define hfs_bnode_unlink hfsplus_bnode_unlink +#define hfs_bnode_findhash hfsplus_bnode_findhash +#define hfs_bnode_find hfsplus_bnode_find +#define hfs_bnode_unhash hfsplus_bnode_unhash +#define hfs_bnode_free hfsplus_bnode_free +#define hfs_bnode_create hfsplus_bnode_create +#define hfs_bnode_get hfsplus_bnode_get +#define hfs_bnode_put hfsplus_bnode_put +#define hfs_brec_lenoff hfsplus_brec_lenoff +#define hfs_brec_keylen hfsplus_brec_keylen +#define hfs_brec_insert hfsplus_brec_insert +#define hfs_brec_remove hfsplus_brec_remove +#define hfs_find_init hfsplus_find_init +#define hfs_find_exit hfsplus_find_exit +#define __hfs_brec_find __hfsplus_brec_find +#define hfs_brec_find hfsplus_brec_find +#define hfs_brec_read hfsplus_brec_read +#define hfs_brec_goto hfsplus_brec_goto +#define hfs_part_find hfsplus_part_find + +/* + * definitions for ext2 flag ioctls (linux really needs a generic + * interface for this). + */ + +/* ext2 ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) to support + * chattr/lsattr */ +#define HFSPLUS_IOC_EXT2_GETFLAGS FS_IOC_GETFLAGS +#define HFSPLUS_IOC_EXT2_SETFLAGS FS_IOC_SETFLAGS + + +/* + * hfs+-specific ioctl for making the filesystem bootable + */ +#define HFSPLUS_IOC_BLESS _IO('h', 0x80) + +typedef int (*search_strategy_t)(struct hfs_bnode *, + struct hfs_find_data *, + int *, int *, int *); + +/* + * Functions in any *.c used in other files + */ + +/* attributes.c */ +int __init hfsplus_create_attr_tree_cache(void); +void hfsplus_destroy_attr_tree_cache(void); +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 cnid, const char *name); +hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry); +int hfsplus_find_attr(struct super_block *sb, u32 cnid, const char *name, + struct hfs_find_data *fd); +int hfsplus_attr_exists(struct inode *inode, const char *name); +int hfsplus_create_attr(struct inode *inode, const char *name, + const void *value, size_t size); +int hfsplus_delete_attr(struct inode *inode, const char *name); +int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid); + +/* bitmap.c */ +int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, + u32 *max); +int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count); + +/* btree.c */ +u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, + int file_id); +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id); +void hfs_btree_close(struct hfs_btree *tree); +int hfs_btree_write(struct hfs_btree *tree); +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree); +void hfs_bmap_free(struct hfs_bnode *node); + +/* bnode.c */ +void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len); +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off); +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off); +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off); +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len); +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data); +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len); +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len); +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len); +void hfs_bnode_dump(struct hfs_bnode *node); +void hfs_bnode_unlink(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid); +void hfs_bnode_unhash(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num); +void hfs_bnode_free(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num); +void hfs_bnode_get(struct hfs_bnode *node); +void hfs_bnode_put(struct hfs_bnode *node); +bool hfs_bnode_need_zeroout(struct hfs_btree *tree); + +/* brec.c */ +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off); +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec); +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len); +int hfs_brec_remove(struct hfs_find_data *fd); + +/* bfind.c */ +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd); +void hfs_find_exit(struct hfs_find_data *fd); +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, + int *begin, int *end, int *cur_rec); +int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, + int *begin, int *end, int *cur_rec); +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, + search_strategy_t rec_found); +int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare); +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len); +int hfs_brec_goto(struct hfs_find_data *fd, int cnt); + +/* catalog.c */ +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 parent, struct qstr *str); +void hfsplus_cat_build_key_with_cnid(struct super_block *sb, + hfsplus_btree_key *key, u32 parent); +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); +int hfsplus_find_cat(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd); +int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, + struct inode *inode); +int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str); +int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name); + +/* dir.c */ +extern const struct inode_operations hfsplus_dir_inode_operations; +extern const struct file_operations hfsplus_dir_operations; + +/* extents.c */ +int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_ext_write_extent(struct inode *inode); +int hfsplus_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int hfsplus_free_fork(struct super_block *sb, u32 cnid, + struct hfsplus_fork_raw *fork, int type); +int hfsplus_file_extend(struct inode *inode, bool zeroout); +void hfsplus_file_truncate(struct inode *inode); + +/* inode.c */ +extern const struct address_space_operations hfsplus_aops; +extern const struct address_space_operations hfsplus_btree_aops; +extern const struct dentry_operations hfsplus_dentry_operations; + +struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode); +void hfsplus_delete_inode(struct inode *inode); +void hfsplus_inode_read_fork(struct inode *inode, + struct hfsplus_fork_raw *fork); +void hfsplus_inode_write_fork(struct inode *inode, + struct hfsplus_fork_raw *fork); +int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); +int hfsplus_cat_write_inode(struct inode *inode); +int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync); + +/* ioctl.c */ +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); + +/* options.c */ +void hfsplus_fill_defaults(struct hfsplus_sb_info *opts); +int hfsplus_parse_options_remount(char *input, int *force); +int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi); +int hfsplus_show_options(struct seq_file *seq, struct dentry *root); + +/* part_tbl.c */ +int hfs_part_find(struct super_block *sb, sector_t *part_start, + sector_t *part_size); + +/* super.c */ +struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino); +void hfsplus_mark_mdb_dirty(struct super_block *sb); + +/* tables.c */ +extern u16 hfsplus_case_fold_table[]; +extern u16 hfsplus_decompose_table[]; +extern u16 hfsplus_compose_table[]; + +/* unicode.c */ +int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2); +int hfsplus_strcmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2); +int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, + char *astr, int *len_p); +int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, + int max_unistr_len, const char *astr, int len); +int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); +int hfsplus_compare_dentry(const struct dentry *parent, + const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name); + +/* wrapper.c */ +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, + void **data, int rw); +int hfsplus_read_wrapper(struct super_block *sb); + +/* time macros */ +#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) +#define __hfsp_ut2mt(t) (cpu_to_be32(t + 2082844800U)) + +/* compatibility */ +#define hfsp_mt2ut(t) (struct timespec){ .tv_sec = __hfsp_mt2ut(t) } +#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) +#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) + +#endif diff --git a/kernel/fs/hfsplus/hfsplus_raw.h b/kernel/fs/hfsplus/hfsplus_raw.h new file mode 100644 index 000000000..8298d0985 --- /dev/null +++ b/kernel/fs/hfsplus/hfsplus_raw.h @@ -0,0 +1,407 @@ +/* + * linux/include/linux/hfsplus_raw.h + * + * Copyright (C) 1999 + * Brad Boyer (flar@pants.nu) + * (C) 2003 Ardis Technologies + * + * Format of structures on disk + * Information taken from Apple Technote #1150 (HFS Plus Volume Format) + * + */ + +#ifndef _LINUX_HFSPLUS_RAW_H +#define _LINUX_HFSPLUS_RAW_H + +#include + +/* Some constants */ +#define HFSPLUS_SECTOR_SIZE 512 +#define HFSPLUS_SECTOR_SHIFT 9 +#define HFSPLUS_VOLHEAD_SECTOR 2 +#define HFSPLUS_VOLHEAD_SIG 0x482b +#define HFSPLUS_VOLHEAD_SIGX 0x4858 +#define HFSPLUS_SUPER_MAGIC 0x482b +#define HFSPLUS_MIN_VERSION 4 +#define HFSPLUS_CURRENT_VERSION 5 + +#define HFSP_WRAP_MAGIC 0x4244 +#define HFSP_WRAP_ATTRIB_SLOCK 0x8000 +#define HFSP_WRAP_ATTRIB_SPARED 0x0200 + +#define HFSP_WRAPOFF_SIG 0x00 +#define HFSP_WRAPOFF_ATTRIB 0x0A +#define HFSP_WRAPOFF_ABLKSIZE 0x14 +#define HFSP_WRAPOFF_ABLKSTART 0x1C +#define HFSP_WRAPOFF_EMBEDSIG 0x7C +#define HFSP_WRAPOFF_EMBEDEXT 0x7E + +#define HFSP_HIDDENDIR_NAME \ + "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" + +#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ +#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ + +#define HFSP_SYMLINK_TYPE 0x736c6e6b /* 'slnk' */ +#define HFSP_SYMLINK_CREATOR 0x72686170 /* 'rhap' */ + +#define HFSP_MOUNT_VERSION 0x482b4c78 /* 'H+Lx' */ + +/* Structures used on disk */ + +typedef __be32 hfsplus_cnid; +typedef __be16 hfsplus_unichr; + +#define HFSPLUS_MAX_STRLEN 255 +#define HFSPLUS_ATTR_MAX_STRLEN 127 + +/* A "string" as used in filenames, etc. */ +struct hfsplus_unistr { + __be16 length; + hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN]; +} __packed; + +/* + * A "string" is used in attributes file + * for name of extended attribute + */ +struct hfsplus_attr_unistr { + __be16 length; + hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN]; +} __packed; + +/* POSIX permissions */ +struct hfsplus_perm { + __be32 owner; + __be32 group; + u8 rootflags; + u8 userflags; + __be16 mode; + __be32 dev; +} __packed; + +#define HFSPLUS_FLG_NODUMP 0x01 +#define HFSPLUS_FLG_IMMUTABLE 0x02 +#define HFSPLUS_FLG_APPEND 0x04 + +/* A single contiguous area of a file */ +struct hfsplus_extent { + __be32 start_block; + __be32 block_count; +} __packed; +typedef struct hfsplus_extent hfsplus_extent_rec[8]; + +/* Information for a "Fork" in a file */ +struct hfsplus_fork_raw { + __be64 total_size; + __be32 clump_size; + __be32 total_blocks; + hfsplus_extent_rec extents; +} __packed; + +/* HFS+ Volume Header */ +struct hfsplus_vh { + __be16 signature; + __be16 version; + __be32 attributes; + __be32 last_mount_vers; + u32 reserved; + + __be32 create_date; + __be32 modify_date; + __be32 backup_date; + __be32 checked_date; + + __be32 file_count; + __be32 folder_count; + + __be32 blocksize; + __be32 total_blocks; + __be32 free_blocks; + + __be32 next_alloc; + __be32 rsrc_clump_sz; + __be32 data_clump_sz; + hfsplus_cnid next_cnid; + + __be32 write_count; + __be64 encodings_bmp; + + u32 finder_info[8]; + + struct hfsplus_fork_raw alloc_file; + struct hfsplus_fork_raw ext_file; + struct hfsplus_fork_raw cat_file; + struct hfsplus_fork_raw attr_file; + struct hfsplus_fork_raw start_file; +} __packed; + +/* HFS+ volume attributes */ +#define HFSPLUS_VOL_UNMNT (1 << 8) +#define HFSPLUS_VOL_SPARE_BLK (1 << 9) +#define HFSPLUS_VOL_NOCACHE (1 << 10) +#define HFSPLUS_VOL_INCNSTNT (1 << 11) +#define HFSPLUS_VOL_NODEID_REUSED (1 << 12) +#define HFSPLUS_VOL_JOURNALED (1 << 13) +#define HFSPLUS_VOL_SOFTLOCK (1 << 15) +#define HFSPLUS_VOL_UNUSED_NODE_FIX (1 << 31) + +/* HFS+ BTree node descriptor */ +struct hfs_bnode_desc { + __be32 next; + __be32 prev; + s8 type; + u8 height; + __be16 num_recs; + u16 reserved; +} __packed; + +/* HFS+ BTree node types */ +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ + +/* HFS+ BTree header */ +struct hfs_btree_header_rec { + __be16 depth; + __be32 root; + __be32 leaf_count; + __be32 leaf_head; + __be32 leaf_tail; + __be16 node_size; + __be16 max_key_len; + __be32 node_count; + __be32 free_nodes; + u16 reserved1; + __be32 clump_size; + u8 btree_type; + u8 key_type; + __be32 attributes; + u32 reserved3[16]; +} __packed; + +/* BTree attributes */ +#define HFS_TREE_BIGKEYS 2 +#define HFS_TREE_VARIDXKEYS 4 + +/* HFS+ BTree misc info */ +#define HFSPLUS_TREE_HEAD 0 +#define HFSPLUS_NODE_MXSZ 32768 +#define HFSPLUS_ATTR_TREE_NODE_SIZE 8192 +#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3 +#define HFSPLUS_BTREE_HDR_USER_BYTES 128 + +/* Some special File ID numbers (stolen from hfs.h) */ +#define HFSPLUS_POR_CNID 1 /* Parent Of the Root */ +#define HFSPLUS_ROOT_CNID 2 /* ROOT directory */ +#define HFSPLUS_EXT_CNID 3 /* EXTents B-tree */ +#define HFSPLUS_CAT_CNID 4 /* CATalog B-tree */ +#define HFSPLUS_BAD_CNID 5 /* BAD blocks file */ +#define HFSPLUS_ALLOC_CNID 6 /* ALLOCation file */ +#define HFSPLUS_START_CNID 7 /* STARTup file */ +#define HFSPLUS_ATTR_CNID 8 /* ATTRibutes file */ +#define HFSPLUS_EXCH_CNID 15 /* ExchangeFiles temp id */ +#define HFSPLUS_FIRSTUSER_CNID 16 /* first available user id */ + +/* btree key type */ +#define HFSPLUS_KEY_CASEFOLDING 0xCF /* case-insensitive */ +#define HFSPLUS_KEY_BINARY 0xBC /* case-sensitive */ + +/* HFS+ catalog entry key */ +struct hfsplus_cat_key { + __be16 key_len; + hfsplus_cnid parent; + struct hfsplus_unistr name; +} __packed; + +#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) + +/* Structs from hfs.h */ +struct hfsp_point { + __be16 v; + __be16 h; +} __packed; + +struct hfsp_rect { + __be16 top; + __be16 left; + __be16 bottom; + __be16 right; +} __packed; + + +/* HFS directory info (stolen from hfs.h */ +struct DInfo { + struct hfsp_rect frRect; + __be16 frFlags; + struct hfsp_point frLocation; + __be16 frView; +} __packed; + +struct DXInfo { + struct hfsp_point frScroll; + __be32 frOpenChain; + __be16 frUnused; + __be16 frComment; + __be32 frPutAway; +} __packed; + +/* HFS+ folder data (part of an hfsplus_cat_entry) */ +struct hfsplus_cat_folder { + __be16 type; + __be16 flags; + __be32 valence; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct DInfo user_info; + struct DXInfo finder_info; + __be32 text_encoding; + __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */ +} __packed; + +/* HFS file info (stolen from hfs.h) */ +struct FInfo { + __be32 fdType; + __be32 fdCreator; + __be16 fdFlags; + struct hfsp_point fdLocation; + __be16 fdFldr; +} __packed; + +struct FXInfo { + __be16 fdIconID; + u8 fdUnused[8]; + __be16 fdComment; + __be32 fdPutAway; +} __packed; + +/* HFS+ file data (part of a cat_entry) */ +struct hfsplus_cat_file { + __be16 type; + __be16 flags; + u32 reserved1; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct FInfo user_info; + struct FXInfo finder_info; + __be32 text_encoding; + u32 reserved2; + + struct hfsplus_fork_raw data_fork; + struct hfsplus_fork_raw rsrc_fork; +} __packed; + +/* File and folder flag bits */ +#define HFSPLUS_FILE_LOCKED 0x0001 +#define HFSPLUS_FILE_THREAD_EXISTS 0x0002 +#define HFSPLUS_XATTR_EXISTS 0x0004 +#define HFSPLUS_ACL_EXISTS 0x0008 +#define HFSPLUS_HAS_FOLDER_COUNT 0x0010 /* Folder has subfolder count + * (HFSX only) */ + +/* HFS+ catalog thread (part of a cat_entry) */ +struct hfsplus_cat_thread { + __be16 type; + s16 reserved; + hfsplus_cnid parentID; + struct hfsplus_unistr nodeName; +} __packed; + +#define HFSPLUS_MIN_THREAD_SZ 10 + +/* A data record in the catalog tree */ +typedef union { + __be16 type; + struct hfsplus_cat_folder folder; + struct hfsplus_cat_file file; + struct hfsplus_cat_thread thread; +} __packed hfsplus_cat_entry; + +/* HFS+ catalog entry type */ +#define HFSPLUS_FOLDER 0x0001 +#define HFSPLUS_FILE 0x0002 +#define HFSPLUS_FOLDER_THREAD 0x0003 +#define HFSPLUS_FILE_THREAD 0x0004 + +/* HFS+ extents tree key */ +struct hfsplus_ext_key { + __be16 key_len; + u8 fork_type; + u8 pad; + hfsplus_cnid cnid; + __be32 start_block; +} __packed; + +#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) + +#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo" +#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security" + +#define HFSPLUS_ATTR_INLINE_DATA 0x10 +#define HFSPLUS_ATTR_FORK_DATA 0x20 +#define HFSPLUS_ATTR_EXTENTS 0x30 + +/* HFS+ attributes tree key */ +struct hfsplus_attr_key { + __be16 key_len; + __be16 pad; + hfsplus_cnid cnid; + __be32 start_block; + struct hfsplus_attr_unistr key_name; +} __packed; + +#define HFSPLUS_ATTR_KEYLEN sizeof(struct hfsplus_attr_key) + +/* HFS+ fork data attribute */ +struct hfsplus_attr_fork_data { + __be32 record_type; + __be32 reserved; + struct hfsplus_fork_raw the_fork; +} __packed; + +/* HFS+ extension attribute */ +struct hfsplus_attr_extents { + __be32 record_type; + __be32 reserved; + struct hfsplus_extent extents; +} __packed; + +#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802 + +/* HFS+ attribute inline data */ +struct hfsplus_attr_inline_data { + __be32 record_type; + __be32 reserved1; + u8 reserved2[6]; + __be16 length; + u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE]; +} __packed; + +/* A data record in the attributes tree */ +typedef union { + __be32 record_type; + struct hfsplus_attr_fork_data fork_data; + struct hfsplus_attr_extents extents; + struct hfsplus_attr_inline_data inline_data; +} __packed hfsplus_attr_entry; + +/* HFS+ generic BTree key */ +typedef union { + __be16 key_len; + struct hfsplus_cat_key cat; + struct hfsplus_ext_key ext; + struct hfsplus_attr_key attr; +} __packed hfsplus_btree_key; + +#endif diff --git a/kernel/fs/hfsplus/inode.c b/kernel/fs/hfsplus/inode.c new file mode 100644 index 000000000..6dd107d74 --- /dev/null +++ b/kernel/fs/hfsplus/inode.c @@ -0,0 +1,617 @@ +/* + * linux/fs/hfsplus/inode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Inode handling routines + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" +#include "xattr.h" +#include "acl.h" + +static int hfsplus_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, hfsplus_get_block); +} + +static int hfsplus_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, hfsplus_get_block, wbc); +} + +static void hfsplus_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + hfsplus_file_truncate(inode); + } +} + +static int hfsplus_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hfsplus_get_block, + &HFSPLUS_I(mapping->host)->phys_size); + if (unlikely(ret)) + hfsplus_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, hfsplus_get_block); +} + +static int hfsplus_releasepage(struct page *page, gfp_t mask) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct hfs_btree *tree; + struct hfs_bnode *node; + u32 nidx; + int i, res = 1; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + tree = HFSPLUS_SB(sb)->ext_tree; + break; + case HFSPLUS_CAT_CNID: + tree = HFSPLUS_SB(sb)->cat_tree; + break; + case HFSPLUS_ATTR_CNID: + tree = HFSPLUS_SB(sb)->attr_tree; + break; + default: + BUG(); + return 0; + } + if (!tree) + return 0; + if (tree->node_size >= PAGE_CACHE_SIZE) { + nidx = page->index >> + (tree->node_size_shift - PAGE_CACHE_SHIFT); + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, nidx); + if (!node) + ; + else if (atomic_read(&node->refcnt)) + res = 0; + if (res && node) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } + spin_unlock(&tree->hash_lock); + } else { + nidx = page->index << + (PAGE_CACHE_SHIFT - tree->node_size_shift); + i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); + spin_lock(&tree->hash_lock); + do { + node = hfs_bnode_findhash(tree, nidx++); + if (!node) + continue; + if (atomic_read(&node->refcnt)) { + res = 0; + break; + } + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } while (--i && nidx < tree->node_count); + spin_unlock(&tree->hash_lock); + } + return res ? try_to_free_buffers(page) : 0; +} + +static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = file_inode(file)->i_mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if (end > isize) + hfsplus_write_failed(mapping, end); + } + + return ret; +} + +static int hfsplus_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, hfsplus_get_block); +} + +const struct address_space_operations hfsplus_btree_aops = { + .readpage = hfsplus_readpage, + .writepage = hfsplus_writepage, + .write_begin = hfsplus_write_begin, + .write_end = generic_write_end, + .bmap = hfsplus_bmap, + .releasepage = hfsplus_releasepage, +}; + +const struct address_space_operations hfsplus_aops = { + .readpage = hfsplus_readpage, + .writepage = hfsplus_writepage, + .write_begin = hfsplus_write_begin, + .write_end = generic_write_end, + .bmap = hfsplus_bmap, + .direct_IO = hfsplus_direct_IO, + .writepages = hfsplus_writepages, +}; + +const struct dentry_operations hfsplus_dentry_operations = { + .d_hash = hfsplus_hash_dentry, + .d_compare = hfsplus_compare_dentry, +}; + +static void hfsplus_get_perms(struct inode *inode, + struct hfsplus_perm *perms, int dir) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + u16 mode; + + mode = be16_to_cpu(perms->mode); + + i_uid_write(inode, be32_to_cpu(perms->owner)); + if (!i_uid_read(inode) && !mode) + inode->i_uid = sbi->uid; + + i_gid_write(inode, be32_to_cpu(perms->group)); + if (!i_gid_read(inode) && !mode) + inode->i_gid = sbi->gid; + + if (dir) { + mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); + mode |= S_IFDIR; + } else if (!mode) + mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); + inode->i_mode = mode; + + HFSPLUS_I(inode)->userflags = perms->userflags; + if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (perms->rootflags & HFSPLUS_FLG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; +} + +static int hfsplus_file_open(struct inode *inode, struct file *file) +{ + if (HFSPLUS_IS_RSRC(inode)) + inode = HFSPLUS_I(inode)->rsrc_inode; + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; + atomic_inc(&HFSPLUS_I(inode)->opencnt); + return 0; +} + +static int hfsplus_file_release(struct inode *inode, struct file *file) +{ + struct super_block *sb = inode->i_sb; + + if (HFSPLUS_IS_RSRC(inode)) + inode = HFSPLUS_I(inode)->rsrc_inode; + if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { + mutex_lock(&inode->i_mutex); + hfsplus_file_truncate(inode); + if (inode->i_flags & S_DEAD) { + hfsplus_delete_cat(inode->i_ino, + HFSPLUS_SB(sb)->hidden_dir, NULL); + hfsplus_delete_inode(inode); + } + mutex_unlock(&inode->i_mutex); + } + return 0; +} + +static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand_simple(inode, + attr->ia_size); + if (error) + return error; + } + truncate_setsize(inode, attr->ia_size); + hfsplus_file_truncate(inode); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) { + error = posix_acl_chmod(inode, inode->i_mode); + if (unlikely(error)) + return error; + } + + return 0; +} + +int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + int error = 0, error2; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + mutex_lock(&inode->i_mutex); + + /* + * Sync inode metadata into the catalog and extent trees. + */ + sync_inode_metadata(inode, 1); + + /* + * And explicitly write out the btrees. + */ + if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags)) + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + + if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) { + error2 = + filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; + } + + if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) { + if (sbi->attr_tree) { + error2 = + filemap_write_and_wait( + sbi->attr_tree->inode->i_mapping); + if (!error) + error = error2; + } else { + pr_err("sync non-existent attributes tree\n"); + } + } + + if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + } + + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + + mutex_unlock(&inode->i_mutex); + + return error; +} + +static const struct inode_operations hfsplus_file_inode_operations = { + .setattr = hfsplus_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = hfsplus_listxattr, + .removexattr = generic_removexattr, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, +#endif +}; + +static const struct file_operations hfsplus_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .fsync = hfsplus_file_fsync, + .open = hfsplus_file_open, + .release = hfsplus_file_release, + .unlocked_ioctl = hfsplus_ioctl, +}; + +struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct inode *inode = new_inode(sb); + struct hfsplus_inode_info *hip; + + if (!inode) + return NULL; + + inode->i_ino = sbi->next_cnid++; + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + set_nlink(inode, 1); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + hip = HFSPLUS_I(inode); + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + atomic_set(&hip->opencnt, 0); + hip->extent_state = 0; + hip->flags = 0; + hip->userflags = 0; + hip->subfolders = 0; + memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->alloc_blocks = 0; + hip->first_blocks = 0; + hip->cached_start = 0; + hip->cached_blocks = 0; + hip->phys_size = 0; + hip->fs_blocks = 0; + hip->rsrc_inode = NULL; + if (S_ISDIR(inode->i_mode)) { + inode->i_size = 2; + sbi->folder_count++; + inode->i_op = &hfsplus_dir_inode_operations; + inode->i_fop = &hfsplus_dir_operations; + } else if (S_ISREG(inode->i_mode)) { + sbi->file_count++; + inode->i_op = &hfsplus_file_inode_operations; + inode->i_fop = &hfsplus_file_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + hip->clump_blocks = sbi->data_clump_blocks; + } else if (S_ISLNK(inode->i_mode)) { + sbi->file_count++; + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + hip->clump_blocks = 1; + } else + sbi->file_count++; + insert_inode_hash(inode); + mark_inode_dirty(inode); + hfsplus_mark_mdb_dirty(sb); + + return inode; +} + +void hfsplus_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (S_ISDIR(inode->i_mode)) { + HFSPLUS_SB(sb)->folder_count--; + hfsplus_mark_mdb_dirty(sb); + return; + } + HFSPLUS_SB(sb)->file_count--; + if (S_ISREG(inode->i_mode)) { + if (!inode->i_nlink) { + inode->i_size = 0; + hfsplus_file_truncate(inode); + } + } else if (S_ISLNK(inode->i_mode)) { + inode->i_size = 0; + hfsplus_file_truncate(inode); + } + hfsplus_mark_mdb_dirty(sb); +} + +void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 count; + int i; + + memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); + for (count = 0, i = 0; i < 8; i++) + count += be32_to_cpu(fork->extents[i].block_count); + hip->first_blocks = count; + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_start = 0; + hip->cached_blocks = 0; + + hip->alloc_blocks = be32_to_cpu(fork->total_blocks); + hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); + hip->fs_blocks = + (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); + hip->clump_blocks = + be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; + if (!hip->clump_blocks) { + hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? + sbi->rsrc_clump_blocks : + sbi->data_clump_blocks; + } +} + +void hfsplus_inode_write_fork(struct inode *inode, + struct hfsplus_fork_raw *fork) +{ + memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, + sizeof(hfsplus_extent_rec)); + fork->total_size = cpu_to_be64(inode->i_size); + fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); +} + +int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) +{ + hfsplus_cat_entry entry; + int res = 0; + u16 type; + + type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); + + HFSPLUS_I(inode)->linkid = 0; + if (type == HFSPLUS_FOLDER) { + struct hfsplus_cat_folder *folder = &entry.folder; + + if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) + /* panic? */; + hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, + sizeof(struct hfsplus_cat_folder)); + hfsplus_get_perms(inode, &folder->permissions, 1); + set_nlink(inode, 1); + inode->i_size = 2 + be32_to_cpu(folder->valence); + inode->i_atime = hfsp_mt2ut(folder->access_date); + inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); + inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); + HFSPLUS_I(inode)->create_date = folder->create_date; + HFSPLUS_I(inode)->fs_blocks = 0; + if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { + HFSPLUS_I(inode)->subfolders = + be32_to_cpu(folder->subfolders); + } + inode->i_op = &hfsplus_dir_inode_operations; + inode->i_fop = &hfsplus_dir_operations; + } else if (type == HFSPLUS_FILE) { + struct hfsplus_cat_file *file = &entry.file; + + if (fd->entrylength < sizeof(struct hfsplus_cat_file)) + /* panic? */; + hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, + sizeof(struct hfsplus_cat_file)); + + hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? + &file->rsrc_fork : &file->data_fork); + hfsplus_get_perms(inode, &file->permissions, 0); + set_nlink(inode, 1); + if (S_ISREG(inode->i_mode)) { + if (file->permissions.dev) + set_nlink(inode, + be32_to_cpu(file->permissions.dev)); + inode->i_op = &hfsplus_file_inode_operations; + inode->i_fop = &hfsplus_file_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + } else { + init_special_inode(inode, inode->i_mode, + be32_to_cpu(file->permissions.dev)); + } + inode->i_atime = hfsp_mt2ut(file->access_date); + inode->i_mtime = hfsp_mt2ut(file->content_mod_date); + inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); + HFSPLUS_I(inode)->create_date = file->create_date; + } else { + pr_err("bad catalog entry used to create inode\n"); + res = -EIO; + } + return res; +} + +int hfsplus_cat_write_inode(struct inode *inode) +{ + struct inode *main_inode = inode; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + + if (HFSPLUS_IS_RSRC(inode)) + main_inode = HFSPLUS_I(inode)->rsrc_inode; + + if (!main_inode->i_nlink) + return 0; + + if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) + /* panic? */ + return -EIO; + + if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd)) + /* panic? */ + goto out; + + if (S_ISDIR(main_inode->i_mode)) { + struct hfsplus_cat_folder *folder = &entry.folder; + + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) + /* panic? */; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + /* simple node checks? */ + hfsplus_cat_set_perms(inode, &folder->permissions); + folder->access_date = hfsp_ut2mt(inode->i_atime); + folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); + folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + folder->valence = cpu_to_be32(inode->i_size - 2); + if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { + folder->subfolders = + cpu_to_be32(HFSPLUS_I(inode)->subfolders); + } + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + } else if (HFSPLUS_IS_RSRC(inode)) { + struct hfsplus_cat_file *file = &entry.file; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_inode_write_fork(inode, &file->rsrc_fork); + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + } else { + struct hfsplus_cat_file *file = &entry.file; + + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) + /* panic? */; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_inode_write_fork(inode, &file->data_fork); + hfsplus_cat_set_perms(inode, &file->permissions); + if (HFSPLUS_FLG_IMMUTABLE & + (file->permissions.rootflags | + file->permissions.userflags)) + file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); + else + file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); + file->access_date = hfsp_ut2mt(inode->i_atime); + file->content_mod_date = hfsp_ut2mt(inode->i_mtime); + file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + } + + set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); +out: + hfs_find_exit(&fd); + return 0; +} diff --git a/kernel/fs/hfsplus/ioctl.c b/kernel/fs/hfsplus/ioctl.c new file mode 100644 index 000000000..0624ce4e0 --- /dev/null +++ b/kernel/fs/hfsplus/ioctl.c @@ -0,0 +1,150 @@ +/* + * linux/fs/hfsplus/ioctl.c + * + * Copyright (C) 2003 + * Ethan Benson + * partially derived from linux/fs/ext2/ioctl.c + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * hfsplus ioctls + */ + +#include +#include +#include +#include +#include +#include "hfsplus_fs.h" + +/* + * "Blessing" an HFS+ filesystem writes metadata to the superblock informing + * the platform firmware which file to boot from + */ +static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + struct hfsplus_vh *vh = sbi->s_vhdr; + struct hfsplus_vh *bvh = sbi->s_backup_vhdr; + u32 cnid = (unsigned long)dentry->d_fsdata; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sbi->vh_mutex); + + /* Directory containing the bootable system */ + vh->finder_info[0] = bvh->finder_info[0] = + cpu_to_be32(parent_ino(dentry)); + + /* + * Bootloader. Just using the inode here breaks in the case of + * hard links - the firmware wants the ID of the hard link file, + * but the inode points at the indirect inode + */ + vh->finder_info[1] = bvh->finder_info[1] = cpu_to_be32(cnid); + + /* Per spec, the OS X system folder - same as finder_info[0] here */ + vh->finder_info[5] = bvh->finder_info[5] = + cpu_to_be32(parent_ino(dentry)); + + mutex_unlock(&sbi->vh_mutex); + return 0; +} + +static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) +{ + struct inode *inode = file_inode(file); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags = 0; + + if (inode->i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (inode->i_flags & S_APPEND) + flags |= FS_APPEND_FL; + if (hip->userflags & HFSPLUS_FLG_NODUMP) + flags |= FS_NODUMP_FL; + + return put_user(flags, user_flags); +} + +static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) +{ + struct inode *inode = file_inode(file); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags, new_fl = 0; + int err = 0; + + err = mnt_want_write_file(file); + if (err) + goto out; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto out_drop_write; + } + + if (get_user(flags, user_flags)) { + err = -EFAULT; + goto out_drop_write; + } + + mutex_lock(&inode->i_mutex); + + if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || + inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock_inode; + } + } + + /* don't silently ignore unsupported ext2 flags */ + if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { + err = -EOPNOTSUPP; + goto out_unlock_inode; + } + + if (flags & FS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + + if (flags & FS_APPEND_FL) + new_fl |= S_APPEND; + + inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); + + if (flags & FS_NODUMP_FL) + hip->userflags |= HFSPLUS_FLG_NODUMP; + else + hip->userflags &= ~HFSPLUS_FLG_NODUMP; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + +out_unlock_inode: + mutex_unlock(&inode->i_mutex); +out_drop_write: + mnt_drop_write_file(file); +out: + return err; +} + +long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + + switch (cmd) { + case HFSPLUS_IOC_EXT2_GETFLAGS: + return hfsplus_ioctl_getflags(file, argp); + case HFSPLUS_IOC_EXT2_SETFLAGS: + return hfsplus_ioctl_setflags(file, argp); + case HFSPLUS_IOC_BLESS: + return hfsplus_ioctl_bless(file, argp); + default: + return -ENOTTY; + } +} diff --git a/kernel/fs/hfsplus/options.c b/kernel/fs/hfsplus/options.c new file mode 100644 index 000000000..c90b72ee6 --- /dev/null +++ b/kernel/fs/hfsplus/options.c @@ -0,0 +1,238 @@ +/* + * linux/fs/hfsplus/options.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Option parsing + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hfsplus_fs.h" + +enum { + opt_creator, opt_type, + opt_umask, opt_uid, opt_gid, + opt_part, opt_session, opt_nls, + opt_nodecompose, opt_decompose, + opt_barrier, opt_nobarrier, + opt_force, opt_err +}; + +static const match_table_t tokens = { + { opt_creator, "creator=%s" }, + { opt_type, "type=%s" }, + { opt_umask, "umask=%o" }, + { opt_uid, "uid=%u" }, + { opt_gid, "gid=%u" }, + { opt_part, "part=%u" }, + { opt_session, "session=%u" }, + { opt_nls, "nls=%s" }, + { opt_decompose, "decompose" }, + { opt_nodecompose, "nodecompose" }, + { opt_barrier, "barrier" }, + { opt_nobarrier, "nobarrier" }, + { opt_force, "force" }, + { opt_err, NULL } +}; + +/* Initialize an options object to reasonable defaults */ +void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) +{ + if (!opts) + return; + + opts->creator = HFSPLUS_DEF_CR_TYPE; + opts->type = HFSPLUS_DEF_CR_TYPE; + opts->umask = current_umask(); + opts->uid = current_uid(); + opts->gid = current_gid(); + opts->part = -1; + opts->session = -1; +} + +/* convert a "four byte character" to a 32 bit int with error checks */ +static inline int match_fourchar(substring_t *arg, u32 *result) +{ + if (arg->to - arg->from != 4) + return -EINVAL; + memcpy(result, arg->from, 4); + return 0; +} + +int hfsplus_parse_options_remount(char *input, int *force) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!input) + return 1; + + while ((p = strsep(&input, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_force: + *force = 1; + break; + default: + break; + } + } + + return 1; +} + +/* Parse options from mount. Returns 0 on failure */ +/* input is the options passed to mount() as a string */ +int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int tmp, token; + + if (!input) + goto done; + + while ((p = strsep(&input, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_creator: + if (match_fourchar(&args[0], &sbi->creator)) { + pr_err("creator requires a 4 character value\n"); + return 0; + } + break; + case opt_type: + if (match_fourchar(&args[0], &sbi->type)) { + pr_err("type requires a 4 character value\n"); + return 0; + } + break; + case opt_umask: + if (match_octal(&args[0], &tmp)) { + pr_err("umask requires a value\n"); + return 0; + } + sbi->umask = (umode_t)tmp; + break; + case opt_uid: + if (match_int(&args[0], &tmp)) { + pr_err("uid requires an argument\n"); + return 0; + } + sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp); + if (!uid_valid(sbi->uid)) { + pr_err("invalid uid specified\n"); + return 0; + } + break; + case opt_gid: + if (match_int(&args[0], &tmp)) { + pr_err("gid requires an argument\n"); + return 0; + } + sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp); + if (!gid_valid(sbi->gid)) { + pr_err("invalid gid specified\n"); + return 0; + } + break; + case opt_part: + if (match_int(&args[0], &sbi->part)) { + pr_err("part requires an argument\n"); + return 0; + } + break; + case opt_session: + if (match_int(&args[0], &sbi->session)) { + pr_err("session requires an argument\n"); + return 0; + } + break; + case opt_nls: + if (sbi->nls) { + pr_err("unable to change nls mapping\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + sbi->nls = load_nls(p); + if (!sbi->nls) { + pr_err("unable to load nls mapping \"%s\"\n", + p); + kfree(p); + return 0; + } + kfree(p); + break; + case opt_decompose: + clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_nodecompose: + set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_barrier: + clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_nobarrier: + set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_force: + set_bit(HFSPLUS_SB_FORCE, &sbi->flags); + break; + default: + return 0; + } + } + +done: + if (!sbi->nls) { + /* try utf8 first, as this is the old default behaviour */ + sbi->nls = load_nls("utf8"); + if (!sbi->nls) + sbi->nls = load_nls_default(); + if (!sbi->nls) + return 0; + } + + return 1; +} + +int hfsplus_show_options(struct seq_file *seq, struct dentry *root) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); + + if (sbi->creator != HFSPLUS_DEF_CR_TYPE) + seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + if (sbi->type != HFSPLUS_DEF_CR_TYPE) + seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, + from_kuid_munged(&init_user_ns, sbi->uid), + from_kgid_munged(&init_user_ns, sbi->gid)); + if (sbi->part >= 0) + seq_printf(seq, ",part=%u", sbi->part); + if (sbi->session >= 0) + seq_printf(seq, ",session=%u", sbi->session); + if (sbi->nls) + seq_printf(seq, ",nls=%s", sbi->nls->charset); + if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) + seq_puts(seq, ",nodecompose"); + if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + seq_puts(seq, ",nobarrier"); + return 0; +} diff --git a/kernel/fs/hfsplus/part_tbl.c b/kernel/fs/hfsplus/part_tbl.c new file mode 100644 index 000000000..eb355d81e --- /dev/null +++ b/kernel/fs/hfsplus/part_tbl.c @@ -0,0 +1,157 @@ +/* + * linux/fs/hfsplus/part_tbl.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * This file may be distributed under the terms of + * the GNU General Public License. + * + * Original code to handle the new style Mac partition table based on + * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). + * + * In function preconditions the term "valid" applied to a pointer to + * a structure means that the pointer is non-NULL and the structure it + * points to has all fields initialized to consistent values. + * + */ + +#include +#include "hfsplus_fs.h" + +/* offsets to various blocks */ +#define HFS_DD_BLK 0 /* Driver Descriptor block */ +#define HFS_PMAP_BLK 1 /* First block of partition map */ +#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ + +/* magic numbers for various disk blocks */ +#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ +#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ +#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ +#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ +#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ + +/* + * The new style Mac partition map + * + * For each partition on the media there is a physical block (512-byte + * block) containing one of these structures. These blocks are + * contiguous starting at block 1. + */ +struct new_pmap { + __be16 pmSig; /* signature */ + __be16 reSigPad; /* padding */ + __be32 pmMapBlkCnt; /* partition blocks count */ + __be32 pmPyPartStart; /* physical block start of partition */ + __be32 pmPartBlkCnt; /* physical block count of partition */ + u8 pmPartName[32]; /* (null terminated?) string + giving the name of this + partition */ + u8 pmPartType[32]; /* (null terminated?) string + giving the type of this + partition */ + /* a bunch more stuff we don't need */ +} __packed; + +/* + * The old style Mac partition map + * + * The partition map consists for a 2-byte signature followed by an + * array of these structures. The map is terminated with an all-zero + * one of these. + */ +struct old_pmap { + __be16 pdSig; /* Signature bytes */ + struct old_pmap_entry { + __be32 pdStart; + __be32 pdSize; + __be32 pdFSID; + } pdEntry[42]; +} __packed; + +static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, + sector_t *part_start, sector_t *part_size) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int i; + + for (i = 0; i < 42; i++) { + struct old_pmap_entry *p = &pm->pdEntry[i]; + + if (p->pdStart && p->pdSize && + p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && + (sbi->part < 0 || sbi->part == i)) { + *part_start += be32_to_cpu(p->pdStart); + *part_size = be32_to_cpu(p->pdSize); + return 0; + } + } + + return -ENOENT; +} + +static int hfs_parse_new_pmap(struct super_block *sb, void *buf, + struct new_pmap *pm, sector_t *part_start, sector_t *part_size) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int size = be32_to_cpu(pm->pmMapBlkCnt); + int buf_size = hfsplus_min_io_size(sb); + int res; + int i = 0; + + do { + if (!memcmp(pm->pmPartType, "Apple_HFS", 9) && + (sbi->part < 0 || sbi->part == i)) { + *part_start += be32_to_cpu(pm->pmPyPartStart); + *part_size = be32_to_cpu(pm->pmPartBlkCnt); + return 0; + } + + if (++i >= size) + return -ENOENT; + + pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE); + if ((u8 *)pm - (u8 *)buf >= buf_size) { + res = hfsplus_submit_bio(sb, + *part_start + HFS_PMAP_BLK + i, + buf, (void **)&pm, READ); + if (res) + return res; + } + } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); + + return -ENOENT; +} + +/* + * Parse the partition map looking for the start and length of a + * HFS/HFS+ partition. + */ +int hfs_part_find(struct super_block *sb, + sector_t *part_start, sector_t *part_size) +{ + void *buf, *data; + int res; + + buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, + buf, &data, READ); + if (res) + goto out; + + switch (be16_to_cpu(*((__be16 *)data))) { + case HFS_OLD_PMAP_MAGIC: + res = hfs_parse_old_pmap(sb, data, part_start, part_size); + break; + case HFS_NEW_PMAP_MAGIC: + res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size); + break; + default: + res = -ENOENT; + break; + } +out: + kfree(buf); + return res; +} diff --git a/kernel/fs/hfsplus/posix_acl.c b/kernel/fs/hfsplus/posix_acl.c new file mode 100644 index 000000000..df0c9af68 --- /dev/null +++ b/kernel/fs/hfsplus/posix_acl.c @@ -0,0 +1,140 @@ +/* + * linux/fs/hfsplus/posix_acl.c + * + * Vyacheslav Dubeyko + * + * Handler for Posix Access Control Lists (ACLs) support. + */ + +#include "hfsplus_fs.h" +#include "xattr.h" +#include "acl.h" + +struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *xattr_name; + char *value = NULL; + ssize_t size; + + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); + + switch (type) { + case ACL_TYPE_ACCESS: + xattr_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xattr_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = __hfsplus_getxattr(inode, xattr_name, NULL, 0); + + if (size > 0) { + value = (char *)hfsplus_alloc_attr_entry(); + if (unlikely(!value)) + return ERR_PTR(-ENOMEM); + size = __hfsplus_getxattr(inode, xattr_name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(size); + + hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type) +{ + int err; + char *xattr_name; + size_t size = 0; + char *value = NULL; + + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); + + switch (type) { + case ACL_TYPE_ACCESS: + xattr_name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + err = posix_acl_equiv_mode(acl, &inode->i_mode); + if (err < 0) + return err; + } + err = 0; + break; + + case ACL_TYPE_DEFAULT: + xattr_name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE)) + return -ENOMEM; + value = (char *)hfsplus_alloc_attr_entry(); + if (unlikely(!value)) + return -ENOMEM; + err = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (unlikely(err < 0)) + goto end_set_acl; + } + + err = __hfsplus_setxattr(inode, xattr_name, value, size, 0); + +end_set_acl: + hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); + + if (!err) + set_cached_acl(inode, type, acl); + + return err; +} + +int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) +{ + int err = 0; + struct posix_acl *default_acl, *acl; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, dir->ino %lu\n", + __func__, inode->i_ino, dir->i_ino); + + if (S_ISLNK(inode->i_mode)) + return 0; + + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + + if (default_acl) { + err = hfsplus_set_posix_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + + if (acl) { + if (!err) + err = hfsplus_set_posix_acl(inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return err; +} diff --git a/kernel/fs/hfsplus/super.c b/kernel/fs/hfsplus/super.c new file mode 100644 index 000000000..593af2fdc --- /dev/null +++ b/kernel/fs/hfsplus/super.c @@ -0,0 +1,700 @@ +/* + * linux/fs/hfsplus/super.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct inode *hfsplus_alloc_inode(struct super_block *sb); +static void hfsplus_destroy_inode(struct inode *inode); + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_system_read_inode(struct inode *inode) +{ + struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + hfsplus_inode_read_fork(inode, &vhdr->ext_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + case HFSPLUS_CAT_CNID: + hfsplus_inode_read_fork(inode, &vhdr->cat_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + case HFSPLUS_ALLOC_CNID: + hfsplus_inode_read_fork(inode, &vhdr->alloc_file); + inode->i_mapping->a_ops = &hfsplus_aops; + break; + case HFSPLUS_START_CNID: + hfsplus_inode_read_fork(inode, &vhdr->start_file); + break; + case HFSPLUS_ATTR_CNID: + hfsplus_inode_read_fork(inode, &vhdr->attr_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + default: + return -EIO; + } + + return 0; +} + +struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) +{ + struct hfs_find_data fd; + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + mutex_init(&HFSPLUS_I(inode)->extents_lock); + HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->extent_state = 0; + HFSPLUS_I(inode)->rsrc_inode = NULL; + atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) { + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (!err) { + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } + } else { + err = hfsplus_system_read_inode(inode); + } + + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + return inode; +} + +static int hfsplus_system_write_inode(struct inode *inode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; + struct hfsplus_fork_raw *fork; + struct hfs_btree *tree = NULL; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + fork = &vhdr->ext_file; + tree = sbi->ext_tree; + break; + case HFSPLUS_CAT_CNID: + fork = &vhdr->cat_file; + tree = sbi->cat_tree; + break; + case HFSPLUS_ALLOC_CNID: + fork = &vhdr->alloc_file; + break; + case HFSPLUS_START_CNID: + fork = &vhdr->start_file; + break; + case HFSPLUS_ATTR_CNID: + fork = &vhdr->attr_file; + tree = sbi->attr_tree; + break; + default: + return -EIO; + } + + if (fork->total_size != cpu_to_be64(inode->i_size)) { + set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); + hfsplus_mark_mdb_dirty(inode->i_sb); + } + hfsplus_inode_write_fork(inode, fork); + if (tree) { + int err = hfs_btree_write(tree); + + if (err) { + pr_err("b-tree write err: %d, ino %lu\n", + err, inode->i_ino); + return err; + } + } + return 0; +} + +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + int err; + + hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + + err = hfsplus_ext_write_extent(inode); + if (err) + return err; + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) + return hfsplus_cat_write_inode(inode); + else + return hfsplus_system_write_inode(inode); +} + +static void hfsplus_evict_inode(struct inode *inode) +{ + hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (HFSPLUS_IS_RSRC(inode)) { + HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; + iput(HFSPLUS_I(inode)->rsrc_inode); + } +} + +static int hfsplus_sync_fs(struct super_block *sb, int wait) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; + int write_backup = 0; + int error, error2; + + if (!wait) + return 0; + + hfs_dbg(SUPER, "hfsplus_sync_fs\n"); + + /* + * Explicitly write out the special metadata inodes. + * + * While these special inodes are marked as hashed and written + * out peridocically by the flusher threads we redirty them + * during writeout of normal inodes, and thus the life lock + * prevents us from getting the latest state to disk. + */ + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; + if (sbi->attr_tree) { + error2 = + filemap_write_and_wait(sbi->attr_tree->inode->i_mapping); + if (!error) + error = error2; + } + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + + mutex_lock(&sbi->vh_mutex); + mutex_lock(&sbi->alloc_mutex); + vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); + vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); + vhdr->folder_count = cpu_to_be32(sbi->folder_count); + vhdr->file_count = cpu_to_be32(sbi->file_count); + + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { + memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); + write_backup = 1; + } + + error2 = hfsplus_submit_bio(sb, + sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, NULL, WRITE_SYNC); + if (!error) + error = error2; + if (!write_backup) + goto out; + + error2 = hfsplus_submit_bio(sb, + sbi->part_start + sbi->sect_count - 2, + sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC); + if (!error) + error2 = error; +out: + mutex_unlock(&sbi->alloc_mutex); + mutex_unlock(&sbi->vh_mutex); + + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + + return error; +} + +static void delayed_sync_fs(struct work_struct *work) +{ + int err; + struct hfsplus_sb_info *sbi; + + sbi = container_of(work, struct hfsplus_sb_info, sync_work.work); + + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); + if (err) + pr_err("delayed sync fs err %d\n", err); +} + +void hfsplus_mark_mdb_dirty(struct super_block *sb) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->sync_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); +} + +static void hfsplus_put_super(struct super_block *sb) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + + hfs_dbg(SUPER, "hfsplus_put_super\n"); + + cancel_delayed_work_sync(&sbi->sync_work); + + if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { + struct hfsplus_vh *vhdr = sbi->s_vhdr; + + vhdr->modify_date = hfsp_now2mt(); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); + + hfsplus_sync_fs(sb, 1); + } + + hfs_btree_close(sbi->attr_tree); + hfs_btree_close(sbi->cat_tree); + hfs_btree_close(sbi->ext_tree); + iput(sbi->alloc_file); + iput(sbi->hidden_dir); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); + unload_nls(sbi->nls); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = HFSPLUS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->total_blocks << sbi->fs_shift; + buf->f_bfree = sbi->free_blocks << sbi->fs_shift; + buf->f_bavail = buf->f_bfree; + buf->f_files = 0xFFFFFFFF; + buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = HFSPLUS_MAX_STRLEN; + + return 0; +} + +static int hfsplus_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (!(*flags & MS_RDONLY)) { + struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; + int force = 0; + + if (!hfsplus_parse_options_remount(data, &force)) + return -EINVAL; + + if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { + pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (force) { + /* nothing */ + } else if (vhdr->attributes & + cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { + pr_warn("filesystem is marked locked, leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (vhdr->attributes & + cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + pr_warn("filesystem is marked journaled, leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } + } + return 0; +} + +static const struct super_operations hfsplus_sops = { + .alloc_inode = hfsplus_alloc_inode, + .destroy_inode = hfsplus_destroy_inode, + .write_inode = hfsplus_write_inode, + .evict_inode = hfsplus_evict_inode, + .put_super = hfsplus_put_super, + .sync_fs = hfsplus_sync_fs, + .statfs = hfsplus_statfs, + .remount_fs = hfsplus_remount, + .show_options = hfsplus_show_options, +}; + +static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) +{ + struct hfsplus_vh *vhdr; + struct hfsplus_sb_info *sbi; + hfsplus_cat_entry entry; + struct hfs_find_data fd; + struct inode *root, *inode; + struct qstr str; + struct nls_table *nls = NULL; + u64 last_fs_block, last_fs_page; + int err; + + err = -ENOMEM; + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto out; + + sb->s_fs_info = sbi; + mutex_init(&sbi->alloc_mutex); + mutex_init(&sbi->vh_mutex); + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); + hfsplus_fill_defaults(sbi); + + err = -EINVAL; + if (!hfsplus_parse_options(data, sbi)) { + pr_err("unable to parse mount options\n"); + goto out_unload_nls; + } + + /* temporarily use utf8 to correctly find the hidden dir below */ + nls = sbi->nls; + sbi->nls = load_nls("utf8"); + if (!sbi->nls) { + pr_err("unable to load nls for utf8\n"); + goto out_unload_nls; + } + + /* Grab the volume header */ + if (hfsplus_read_wrapper(sb)) { + if (!silent) + pr_warn("unable to find HFS+ superblock\n"); + goto out_unload_nls; + } + vhdr = sbi->s_vhdr; + + /* Copy parts of the volume header into the superblock */ + sb->s_magic = HFSPLUS_VOLHEAD_SIG; + if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || + be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { + pr_err("wrong filesystem version\n"); + goto out_free_vhdr; + } + sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); + sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); + sbi->next_cnid = be32_to_cpu(vhdr->next_cnid); + sbi->file_count = be32_to_cpu(vhdr->file_count); + sbi->folder_count = be32_to_cpu(vhdr->folder_count); + sbi->data_clump_blocks = + be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->data_clump_blocks) + sbi->data_clump_blocks = 1; + sbi->rsrc_clump_blocks = + be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->rsrc_clump_blocks) + sbi->rsrc_clump_blocks = 1; + + err = -EFBIG; + last_fs_block = sbi->total_blocks - 1; + last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >> + PAGE_CACHE_SHIFT; + + if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) || + (last_fs_page > (pgoff_t)(~0ULL))) { + pr_err("filesystem size too large\n"); + goto out_free_vhdr; + } + + /* Set up operations so we can load metadata */ + sb->s_op = &hfsplus_sops; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { + pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { + /* nothing */ + } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { + pr_warn("Filesystem is marked locked, mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && + !(sb->s_flags & MS_RDONLY)) { + pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + + err = -EINVAL; + + /* Load metadata objects (B*Trees) */ + sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); + if (!sbi->ext_tree) { + pr_err("failed to load extents file\n"); + goto out_free_vhdr; + } + sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); + if (!sbi->cat_tree) { + pr_err("failed to load catalog file\n"); + goto out_close_ext_tree; + } + atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE); + if (vhdr->attr_file.total_blocks != 0) { + sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); + if (!sbi->attr_tree) { + pr_err("failed to load attributes file\n"); + goto out_close_cat_tree; + } + atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE); + } + sb->s_xattr = hfsplus_xattr_handlers; + + inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); + if (IS_ERR(inode)) { + pr_err("failed to load allocation file\n"); + err = PTR_ERR(inode); + goto out_close_attr_tree; + } + sbi->alloc_file = inode; + + /* Load the root directory */ + root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); + if (IS_ERR(root)) { + pr_err("failed to load root directory\n"); + err = PTR_ERR(root); + goto out_put_alloc_file; + } + + sb->s_d_op = &hfsplus_dentry_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) { + err = -ENOMEM; + goto out_put_alloc_file; + } + + str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; + str.name = HFSP_HIDDENDIR_NAME; + err = hfs_find_init(sbi->cat_tree, &fd); + if (err) + goto out_put_root; + err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + if (unlikely(err < 0)) + goto out_put_root; + if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { + hfs_find_exit(&fd); + if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) + goto out_put_root; + inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_put_root; + } + sbi->hidden_dir = inode; + } else + hfs_find_exit(&fd); + + if (!(sb->s_flags & MS_RDONLY)) { + /* + * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused + * all three are registered with Apple for our use + */ + vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); + vhdr->modify_date = hfsp_now2mt(); + be32_add_cpu(&vhdr->write_count, 1); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); + hfsplus_sync_fs(sb, 1); + + if (!sbi->hidden_dir) { + mutex_lock(&sbi->vh_mutex); + sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + if (!sbi->hidden_dir) { + mutex_unlock(&sbi->vh_mutex); + err = -ENOMEM; + goto out_put_root; + } + err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root, + &str, sbi->hidden_dir); + if (err) { + mutex_unlock(&sbi->vh_mutex); + goto out_put_hidden_dir; + } + + err = hfsplus_init_inode_security(sbi->hidden_dir, + root, &str); + if (err == -EOPNOTSUPP) + err = 0; /* Operation is not supported. */ + else if (err) { + /* + * Try to delete anyway without + * error analysis. + */ + hfsplus_delete_cat(sbi->hidden_dir->i_ino, + root, &str); + mutex_unlock(&sbi->vh_mutex); + goto out_put_hidden_dir; + } + + mutex_unlock(&sbi->vh_mutex); + hfsplus_mark_inode_dirty(sbi->hidden_dir, + HFSPLUS_I_CAT_DIRTY); + } + } + + unload_nls(sbi->nls); + sbi->nls = nls; + return 0; + +out_put_hidden_dir: + iput(sbi->hidden_dir); +out_put_root: + dput(sb->s_root); + sb->s_root = NULL; +out_put_alloc_file: + iput(sbi->alloc_file); +out_close_attr_tree: + hfs_btree_close(sbi->attr_tree); +out_close_cat_tree: + hfs_btree_close(sbi->cat_tree); +out_close_ext_tree: + hfs_btree_close(sbi->ext_tree); +out_free_vhdr: + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); +out_unload_nls: + unload_nls(sbi->nls); + unload_nls(nls); + kfree(sbi); +out: + return err; +} + +MODULE_AUTHOR("Brad Boyer"); +MODULE_DESCRIPTION("Extended Macintosh Filesystem"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *hfsplus_inode_cachep; + +static struct inode *hfsplus_alloc_inode(struct super_block *sb) +{ + struct hfsplus_inode_info *i; + + i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL); + return i ? &i->vfs_inode : NULL; +} + +static void hfsplus_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); +} + +static void hfsplus_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfsplus_i_callback); +} + +#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) + +static struct dentry *hfsplus_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); +} + +static struct file_system_type hfsplus_fs_type = { + .owner = THIS_MODULE, + .name = "hfsplus", + .mount = hfsplus_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("hfsplus"); + +static void hfsplus_init_once(void *p) +{ + struct hfsplus_inode_info *i = p; + + inode_init_once(&i->vfs_inode); +} + +static int __init init_hfsplus_fs(void) +{ + int err; + + hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + hfsplus_init_once); + if (!hfsplus_inode_cachep) + return -ENOMEM; + err = hfsplus_create_attr_tree_cache(); + if (err) + goto destroy_inode_cache; + err = register_filesystem(&hfsplus_fs_type); + if (err) + goto destroy_attr_tree_cache; + return 0; + +destroy_attr_tree_cache: + hfsplus_destroy_attr_tree_cache(); + +destroy_inode_cache: + kmem_cache_destroy(hfsplus_inode_cachep); + + return err; +} + +static void __exit exit_hfsplus_fs(void) +{ + unregister_filesystem(&hfsplus_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + hfsplus_destroy_attr_tree_cache(); + kmem_cache_destroy(hfsplus_inode_cachep); +} + +module_init(init_hfsplus_fs) +module_exit(exit_hfsplus_fs) diff --git a/kernel/fs/hfsplus/tables.c b/kernel/fs/hfsplus/tables.c new file mode 100644 index 000000000..1b911730a --- /dev/null +++ b/kernel/fs/hfsplus/tables.c @@ -0,0 +1,3245 @@ +/* + * linux/fs/hfsplus/tables.c + * + * Various data tables + */ + +#include "hfsplus_fs.h" + +/* + * Unicode case folding table taken from Apple Technote #1150 + * (HFS Plus Volume Format) + */ + +u16 hfsplus_case_fold_table[] = { +/* + * The lower case table consists of a 256-entry high-byte table followed by + * some number of 256-entry subtables. The high-byte table contains either an + * offset to the subtable for characters with that high byte or zero, which + * means that there are no case mappings or ignored characters in that block. + * Ignored characters are mapped to zero. + */ + + // High-byte indices ( == 0 iff no case mapping and no ignorables ) + + + /* 0 */ 0x0100, 0x0200, 0x0000, 0x0300, 0x0400, 0x0500, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 1 */ 0x0600, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 2 */ 0x0700, 0x0800, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 3 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 4 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 5 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 6 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 7 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 9 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* A */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* B */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* C */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* D */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* E */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* F */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0900, 0x0A00, + + // Table 1 (for high byte 0x00) + + /* 0 */ 0xFFFF, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, + /* 1 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, + /* 2 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, + /* 3 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, + /* 4 */ 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* 5 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, + /* 6 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* 7 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, + /* 8 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, + /* 9 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, + /* A */ 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, + 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, + /* B */ 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, + 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, + /* C */ 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00E6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + /* D */ 0x00F0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, + 0x00F8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00FE, 0x00DF, + /* E */ 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + /* F */ 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, + + // Table 2 (for high byte 0x01) + + /* 0 */ 0x0100, 0x0101, 0x0102, 0x0103, 0x0104, 0x0105, 0x0106, 0x0107, + 0x0108, 0x0109, 0x010A, 0x010B, 0x010C, 0x010D, 0x010E, 0x010F, + /* 1 */ 0x0111, 0x0111, 0x0112, 0x0113, 0x0114, 0x0115, 0x0116, 0x0117, + 0x0118, 0x0119, 0x011A, 0x011B, 0x011C, 0x011D, 0x011E, 0x011F, + /* 2 */ 0x0120, 0x0121, 0x0122, 0x0123, 0x0124, 0x0125, 0x0127, 0x0127, + 0x0128, 0x0129, 0x012A, 0x012B, 0x012C, 0x012D, 0x012E, 0x012F, + /* 3 */ 0x0130, 0x0131, 0x0133, 0x0133, 0x0134, 0x0135, 0x0136, 0x0137, + 0x0138, 0x0139, 0x013A, 0x013B, 0x013C, 0x013D, 0x013E, 0x0140, + /* 4 */ 0x0140, 0x0142, 0x0142, 0x0143, 0x0144, 0x0145, 0x0146, 0x0147, + 0x0148, 0x0149, 0x014B, 0x014B, 0x014C, 0x014D, 0x014E, 0x014F, + /* 5 */ 0x0150, 0x0151, 0x0153, 0x0153, 0x0154, 0x0155, 0x0156, 0x0157, + 0x0158, 0x0159, 0x015A, 0x015B, 0x015C, 0x015D, 0x015E, 0x015F, + /* 6 */ 0x0160, 0x0161, 0x0162, 0x0163, 0x0164, 0x0165, 0x0167, 0x0167, + 0x0168, 0x0169, 0x016A, 0x016B, 0x016C, 0x016D, 0x016E, 0x016F, + /* 7 */ 0x0170, 0x0171, 0x0172, 0x0173, 0x0174, 0x0175, 0x0176, 0x0177, + 0x0178, 0x0179, 0x017A, 0x017B, 0x017C, 0x017D, 0x017E, 0x017F, + /* 8 */ 0x0180, 0x0253, 0x0183, 0x0183, 0x0185, 0x0185, 0x0254, 0x0188, + 0x0188, 0x0256, 0x0257, 0x018C, 0x018C, 0x018D, 0x01DD, 0x0259, + /* 9 */ 0x025B, 0x0192, 0x0192, 0x0260, 0x0263, 0x0195, 0x0269, 0x0268, + 0x0199, 0x0199, 0x019A, 0x019B, 0x026F, 0x0272, 0x019E, 0x0275, + /* A */ 0x01A0, 0x01A1, 0x01A3, 0x01A3, 0x01A5, 0x01A5, 0x01A6, 0x01A8, + 0x01A8, 0x0283, 0x01AA, 0x01AB, 0x01AD, 0x01AD, 0x0288, 0x01AF, + /* B */ 0x01B0, 0x028A, 0x028B, 0x01B4, 0x01B4, 0x01B6, 0x01B6, 0x0292, + 0x01B9, 0x01B9, 0x01BA, 0x01BB, 0x01BD, 0x01BD, 0x01BE, 0x01BF, + /* C */ 0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x01C6, 0x01C6, 0x01C6, 0x01C9, + 0x01C9, 0x01C9, 0x01CC, 0x01CC, 0x01CC, 0x01CD, 0x01CE, 0x01CF, + /* D */ 0x01D0, 0x01D1, 0x01D2, 0x01D3, 0x01D4, 0x01D5, 0x01D6, 0x01D7, + 0x01D8, 0x01D9, 0x01DA, 0x01DB, 0x01DC, 0x01DD, 0x01DE, 0x01DF, + /* E */ 0x01E0, 0x01E1, 0x01E2, 0x01E3, 0x01E5, 0x01E5, 0x01E6, 0x01E7, + 0x01E8, 0x01E9, 0x01EA, 0x01EB, 0x01EC, 0x01ED, 0x01EE, 0x01EF, + /* F */ 0x01F0, 0x01F3, 0x01F3, 0x01F3, 0x01F4, 0x01F5, 0x01F6, 0x01F7, + 0x01F8, 0x01F9, 0x01FA, 0x01FB, 0x01FC, 0x01FD, 0x01FE, 0x01FF, + + // Table 3 (for high byte 0x03) + + /* 0 */ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, + 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F, + /* 1 */ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, + 0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F, + /* 2 */ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, + 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F, + /* 3 */ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, + 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F, + /* 4 */ 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, + 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F, + /* 5 */ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, + 0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F, + /* 6 */ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F, + /* 7 */ 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, + 0x0378, 0x0379, 0x037A, 0x037B, 0x037C, 0x037D, 0x037E, 0x037F, + /* 8 */ 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387, + 0x0388, 0x0389, 0x038A, 0x038B, 0x038C, 0x038D, 0x038E, 0x038F, + /* 9 */ 0x0390, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* A */ 0x03C0, 0x03C1, 0x03A2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + 0x03C8, 0x03C9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF, + /* B */ 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* C */ 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0x03CF, + /* D */ 0x03D0, 0x03D1, 0x03D2, 0x03D3, 0x03D4, 0x03D5, 0x03D6, 0x03D7, + 0x03D8, 0x03D9, 0x03DA, 0x03DB, 0x03DC, 0x03DD, 0x03DE, 0x03DF, + /* E */ 0x03E0, 0x03E1, 0x03E3, 0x03E3, 0x03E5, 0x03E5, 0x03E7, 0x03E7, + 0x03E9, 0x03E9, 0x03EB, 0x03EB, 0x03ED, 0x03ED, 0x03EF, 0x03EF, + /* F */ 0x03F0, 0x03F1, 0x03F2, 0x03F3, 0x03F4, 0x03F5, 0x03F6, 0x03F7, + 0x03F8, 0x03F9, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF, + + // Table 4 (for high byte 0x04) + + /* 0 */ 0x0400, 0x0401, 0x0452, 0x0403, 0x0454, 0x0455, 0x0456, 0x0407, + 0x0458, 0x0459, 0x045A, 0x045B, 0x040C, 0x040D, 0x040E, 0x045F, + /* 1 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0419, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + /* 2 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + /* 3 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + /* 4 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + /* 5 */ 0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, + 0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E, 0x045F, + /* 6 */ 0x0461, 0x0461, 0x0463, 0x0463, 0x0465, 0x0465, 0x0467, 0x0467, + 0x0469, 0x0469, 0x046B, 0x046B, 0x046D, 0x046D, 0x046F, 0x046F, + /* 7 */ 0x0471, 0x0471, 0x0473, 0x0473, 0x0475, 0x0475, 0x0476, 0x0477, + 0x0479, 0x0479, 0x047B, 0x047B, 0x047D, 0x047D, 0x047F, 0x047F, + /* 8 */ 0x0481, 0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + 0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048D, 0x048E, 0x048F, + /* 9 */ 0x0491, 0x0491, 0x0493, 0x0493, 0x0495, 0x0495, 0x0497, 0x0497, + 0x0499, 0x0499, 0x049B, 0x049B, 0x049D, 0x049D, 0x049F, 0x049F, + /* A */ 0x04A1, 0x04A1, 0x04A3, 0x04A3, 0x04A5, 0x04A5, 0x04A7, 0x04A7, + 0x04A9, 0x04A9, 0x04AB, 0x04AB, 0x04AD, 0x04AD, 0x04AF, 0x04AF, + /* B */ 0x04B1, 0x04B1, 0x04B3, 0x04B3, 0x04B5, 0x04B5, 0x04B7, 0x04B7, + 0x04B9, 0x04B9, 0x04BB, 0x04BB, 0x04BD, 0x04BD, 0x04BF, 0x04BF, + /* C */ 0x04C0, 0x04C1, 0x04C2, 0x04C4, 0x04C4, 0x04C5, 0x04C6, 0x04C8, + 0x04C8, 0x04C9, 0x04CA, 0x04CC, 0x04CC, 0x04CD, 0x04CE, 0x04CF, + /* D */ 0x04D0, 0x04D1, 0x04D2, 0x04D3, 0x04D4, 0x04D5, 0x04D6, 0x04D7, + 0x04D8, 0x04D9, 0x04DA, 0x04DB, 0x04DC, 0x04DD, 0x04DE, 0x04DF, + /* E */ 0x04E0, 0x04E1, 0x04E2, 0x04E3, 0x04E4, 0x04E5, 0x04E6, 0x04E7, + 0x04E8, 0x04E9, 0x04EA, 0x04EB, 0x04EC, 0x04ED, 0x04EE, 0x04EF, + /* F */ 0x04F0, 0x04F1, 0x04F2, 0x04F3, 0x04F4, 0x04F5, 0x04F6, 0x04F7, + 0x04F8, 0x04F9, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF, + + // Table 5 (for high byte 0x05) + + /* 0 */ 0x0500, 0x0501, 0x0502, 0x0503, 0x0504, 0x0505, 0x0506, 0x0507, + 0x0508, 0x0509, 0x050A, 0x050B, 0x050C, 0x050D, 0x050E, 0x050F, + /* 1 */ 0x0510, 0x0511, 0x0512, 0x0513, 0x0514, 0x0515, 0x0516, 0x0517, + 0x0518, 0x0519, 0x051A, 0x051B, 0x051C, 0x051D, 0x051E, 0x051F, + /* 2 */ 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, + 0x0528, 0x0529, 0x052A, 0x052B, 0x052C, 0x052D, 0x052E, 0x052F, + /* 3 */ 0x0530, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, + 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, + /* 4 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, + 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, + /* 5 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0557, + 0x0558, 0x0559, 0x055A, 0x055B, 0x055C, 0x055D, 0x055E, 0x055F, + /* 6 */ 0x0560, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, + 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, + /* 7 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, + 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, + /* 8 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0587, + 0x0588, 0x0589, 0x058A, 0x058B, 0x058C, 0x058D, 0x058E, 0x058F, + /* 9 */ 0x0590, 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597, + 0x0598, 0x0599, 0x059A, 0x059B, 0x059C, 0x059D, 0x059E, 0x059F, + /* A */ 0x05A0, 0x05A1, 0x05A2, 0x05A3, 0x05A4, 0x05A5, 0x05A6, 0x05A7, + 0x05A8, 0x05A9, 0x05AA, 0x05AB, 0x05AC, 0x05AD, 0x05AE, 0x05AF, + /* B */ 0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, + 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF, + /* C */ 0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05C4, 0x05C5, 0x05C6, 0x05C7, + 0x05C8, 0x05C9, 0x05CA, 0x05CB, 0x05CC, 0x05CD, 0x05CE, 0x05CF, + /* D */ 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, + 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF, + /* E */ 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, + 0x05E8, 0x05E9, 0x05EA, 0x05EB, 0x05EC, 0x05ED, 0x05EE, 0x05EF, + /* F */ 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0x05F5, 0x05F6, 0x05F7, + 0x05F8, 0x05F9, 0x05FA, 0x05FB, 0x05FC, 0x05FD, 0x05FE, 0x05FF, + + // Table 6 (for high byte 0x10) + + /* 0 */ 0x1000, 0x1001, 0x1002, 0x1003, 0x1004, 0x1005, 0x1006, 0x1007, + 0x1008, 0x1009, 0x100A, 0x100B, 0x100C, 0x100D, 0x100E, 0x100F, + /* 1 */ 0x1010, 0x1011, 0x1012, 0x1013, 0x1014, 0x1015, 0x1016, 0x1017, + 0x1018, 0x1019, 0x101A, 0x101B, 0x101C, 0x101D, 0x101E, 0x101F, + /* 2 */ 0x1020, 0x1021, 0x1022, 0x1023, 0x1024, 0x1025, 0x1026, 0x1027, + 0x1028, 0x1029, 0x102A, 0x102B, 0x102C, 0x102D, 0x102E, 0x102F, + /* 3 */ 0x1030, 0x1031, 0x1032, 0x1033, 0x1034, 0x1035, 0x1036, 0x1037, + 0x1038, 0x1039, 0x103A, 0x103B, 0x103C, 0x103D, 0x103E, 0x103F, + /* 4 */ 0x1040, 0x1041, 0x1042, 0x1043, 0x1044, 0x1045, 0x1046, 0x1047, + 0x1048, 0x1049, 0x104A, 0x104B, 0x104C, 0x104D, 0x104E, 0x104F, + /* 5 */ 0x1050, 0x1051, 0x1052, 0x1053, 0x1054, 0x1055, 0x1056, 0x1057, + 0x1058, 0x1059, 0x105A, 0x105B, 0x105C, 0x105D, 0x105E, 0x105F, + /* 6 */ 0x1060, 0x1061, 0x1062, 0x1063, 0x1064, 0x1065, 0x1066, 0x1067, + 0x1068, 0x1069, 0x106A, 0x106B, 0x106C, 0x106D, 0x106E, 0x106F, + /* 7 */ 0x1070, 0x1071, 0x1072, 0x1073, 0x1074, 0x1075, 0x1076, 0x1077, + 0x1078, 0x1079, 0x107A, 0x107B, 0x107C, 0x107D, 0x107E, 0x107F, + /* 8 */ 0x1080, 0x1081, 0x1082, 0x1083, 0x1084, 0x1085, 0x1086, 0x1087, + 0x1088, 0x1089, 0x108A, 0x108B, 0x108C, 0x108D, 0x108E, 0x108F, + /* 9 */ 0x1090, 0x1091, 0x1092, 0x1093, 0x1094, 0x1095, 0x1096, 0x1097, + 0x1098, 0x1099, 0x109A, 0x109B, 0x109C, 0x109D, 0x109E, 0x109F, + /* A */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, + 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, + /* B */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, + 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, + /* C */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10C6, 0x10C7, + 0x10C8, 0x10C9, 0x10CA, 0x10CB, 0x10CC, 0x10CD, 0x10CE, 0x10CF, + /* D */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, + 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, + /* E */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, + 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, + /* F */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10F6, 0x10F7, + 0x10F8, 0x10F9, 0x10FA, 0x10FB, 0x10FC, 0x10FD, 0x10FE, 0x10FF, + + // Table 7 (for high byte 0x20) + + /* 0 */ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, + 0x2008, 0x2009, 0x200A, 0x200B, 0x0000, 0x0000, 0x0000, 0x0000, + /* 1 */ 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x2017, + 0x2018, 0x2019, 0x201A, 0x201B, 0x201C, 0x201D, 0x201E, 0x201F, + /* 2 */ 0x2020, 0x2021, 0x2022, 0x2023, 0x2024, 0x2025, 0x2026, 0x2027, + 0x2028, 0x2029, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x202F, + /* 3 */ 0x2030, 0x2031, 0x2032, 0x2033, 0x2034, 0x2035, 0x2036, 0x2037, + 0x2038, 0x2039, 0x203A, 0x203B, 0x203C, 0x203D, 0x203E, 0x203F, + /* 4 */ 0x2040, 0x2041, 0x2042, 0x2043, 0x2044, 0x2045, 0x2046, 0x2047, + 0x2048, 0x2049, 0x204A, 0x204B, 0x204C, 0x204D, 0x204E, 0x204F, + /* 5 */ 0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2057, + 0x2058, 0x2059, 0x205A, 0x205B, 0x205C, 0x205D, 0x205E, 0x205F, + /* 6 */ 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x2065, 0x2066, 0x2067, + 0x2068, 0x2069, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 7 */ 0x2070, 0x2071, 0x2072, 0x2073, 0x2074, 0x2075, 0x2076, 0x2077, + 0x2078, 0x2079, 0x207A, 0x207B, 0x207C, 0x207D, 0x207E, 0x207F, + /* 8 */ 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, + 0x2088, 0x2089, 0x208A, 0x208B, 0x208C, 0x208D, 0x208E, 0x208F, + /* 9 */ 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, + 0x2098, 0x2099, 0x209A, 0x209B, 0x209C, 0x209D, 0x209E, 0x209F, + /* A */ 0x20A0, 0x20A1, 0x20A2, 0x20A3, 0x20A4, 0x20A5, 0x20A6, 0x20A7, + 0x20A8, 0x20A9, 0x20AA, 0x20AB, 0x20AC, 0x20AD, 0x20AE, 0x20AF, + /* B */ 0x20B0, 0x20B1, 0x20B2, 0x20B3, 0x20B4, 0x20B5, 0x20B6, 0x20B7, + 0x20B8, 0x20B9, 0x20BA, 0x20BB, 0x20BC, 0x20BD, 0x20BE, 0x20BF, + /* C */ 0x20C0, 0x20C1, 0x20C2, 0x20C3, 0x20C4, 0x20C5, 0x20C6, 0x20C7, + 0x20C8, 0x20C9, 0x20CA, 0x20CB, 0x20CC, 0x20CD, 0x20CE, 0x20CF, + /* D */ 0x20D0, 0x20D1, 0x20D2, 0x20D3, 0x20D4, 0x20D5, 0x20D6, 0x20D7, + 0x20D8, 0x20D9, 0x20DA, 0x20DB, 0x20DC, 0x20DD, 0x20DE, 0x20DF, + /* E */ 0x20E0, 0x20E1, 0x20E2, 0x20E3, 0x20E4, 0x20E5, 0x20E6, 0x20E7, + 0x20E8, 0x20E9, 0x20EA, 0x20EB, 0x20EC, 0x20ED, 0x20EE, 0x20EF, + /* F */ 0x20F0, 0x20F1, 0x20F2, 0x20F3, 0x20F4, 0x20F5, 0x20F6, 0x20F7, + 0x20F8, 0x20F9, 0x20FA, 0x20FB, 0x20FC, 0x20FD, 0x20FE, 0x20FF, + + // Table 8 (for high byte 0x21) + + /* 0 */ 0x2100, 0x2101, 0x2102, 0x2103, 0x2104, 0x2105, 0x2106, 0x2107, + 0x2108, 0x2109, 0x210A, 0x210B, 0x210C, 0x210D, 0x210E, 0x210F, + /* 1 */ 0x2110, 0x2111, 0x2112, 0x2113, 0x2114, 0x2115, 0x2116, 0x2117, + 0x2118, 0x2119, 0x211A, 0x211B, 0x211C, 0x211D, 0x211E, 0x211F, + /* 2 */ 0x2120, 0x2121, 0x2122, 0x2123, 0x2124, 0x2125, 0x2126, 0x2127, + 0x2128, 0x2129, 0x212A, 0x212B, 0x212C, 0x212D, 0x212E, 0x212F, + /* 3 */ 0x2130, 0x2131, 0x2132, 0x2133, 0x2134, 0x2135, 0x2136, 0x2137, + 0x2138, 0x2139, 0x213A, 0x213B, 0x213C, 0x213D, 0x213E, 0x213F, + /* 4 */ 0x2140, 0x2141, 0x2142, 0x2143, 0x2144, 0x2145, 0x2146, 0x2147, + 0x2148, 0x2149, 0x214A, 0x214B, 0x214C, 0x214D, 0x214E, 0x214F, + /* 5 */ 0x2150, 0x2151, 0x2152, 0x2153, 0x2154, 0x2155, 0x2156, 0x2157, + 0x2158, 0x2159, 0x215A, 0x215B, 0x215C, 0x215D, 0x215E, 0x215F, + /* 6 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, + 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, + /* 7 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, + 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, + /* 8 */ 0x2180, 0x2181, 0x2182, 0x2183, 0x2184, 0x2185, 0x2186, 0x2187, + 0x2188, 0x2189, 0x218A, 0x218B, 0x218C, 0x218D, 0x218E, 0x218F, + /* 9 */ 0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x2196, 0x2197, + 0x2198, 0x2199, 0x219A, 0x219B, 0x219C, 0x219D, 0x219E, 0x219F, + /* A */ 0x21A0, 0x21A1, 0x21A2, 0x21A3, 0x21A4, 0x21A5, 0x21A6, 0x21A7, + 0x21A8, 0x21A9, 0x21AA, 0x21AB, 0x21AC, 0x21AD, 0x21AE, 0x21AF, + /* B */ 0x21B0, 0x21B1, 0x21B2, 0x21B3, 0x21B4, 0x21B5, 0x21B6, 0x21B7, + 0x21B8, 0x21B9, 0x21BA, 0x21BB, 0x21BC, 0x21BD, 0x21BE, 0x21BF, + /* C */ 0x21C0, 0x21C1, 0x21C2, 0x21C3, 0x21C4, 0x21C5, 0x21C6, 0x21C7, + 0x21C8, 0x21C9, 0x21CA, 0x21CB, 0x21CC, 0x21CD, 0x21CE, 0x21CF, + /* D */ 0x21D0, 0x21D1, 0x21D2, 0x21D3, 0x21D4, 0x21D5, 0x21D6, 0x21D7, + 0x21D8, 0x21D9, 0x21DA, 0x21DB, 0x21DC, 0x21DD, 0x21DE, 0x21DF, + /* E */ 0x21E0, 0x21E1, 0x21E2, 0x21E3, 0x21E4, 0x21E5, 0x21E6, 0x21E7, + 0x21E8, 0x21E9, 0x21EA, 0x21EB, 0x21EC, 0x21ED, 0x21EE, 0x21EF, + /* F */ 0x21F0, 0x21F1, 0x21F2, 0x21F3, 0x21F4, 0x21F5, 0x21F6, 0x21F7, + 0x21F8, 0x21F9, 0x21FA, 0x21FB, 0x21FC, 0x21FD, 0x21FE, 0x21FF, + + // Table 9 (for high byte 0xFE) + + /* 0 */ 0xFE00, 0xFE01, 0xFE02, 0xFE03, 0xFE04, 0xFE05, 0xFE06, 0xFE07, + 0xFE08, 0xFE09, 0xFE0A, 0xFE0B, 0xFE0C, 0xFE0D, 0xFE0E, 0xFE0F, + /* 1 */ 0xFE10, 0xFE11, 0xFE12, 0xFE13, 0xFE14, 0xFE15, 0xFE16, 0xFE17, + 0xFE18, 0xFE19, 0xFE1A, 0xFE1B, 0xFE1C, 0xFE1D, 0xFE1E, 0xFE1F, + /* 2 */ 0xFE20, 0xFE21, 0xFE22, 0xFE23, 0xFE24, 0xFE25, 0xFE26, 0xFE27, + 0xFE28, 0xFE29, 0xFE2A, 0xFE2B, 0xFE2C, 0xFE2D, 0xFE2E, 0xFE2F, + /* 3 */ 0xFE30, 0xFE31, 0xFE32, 0xFE33, 0xFE34, 0xFE35, 0xFE36, 0xFE37, + 0xFE38, 0xFE39, 0xFE3A, 0xFE3B, 0xFE3C, 0xFE3D, 0xFE3E, 0xFE3F, + /* 4 */ 0xFE40, 0xFE41, 0xFE42, 0xFE43, 0xFE44, 0xFE45, 0xFE46, 0xFE47, + 0xFE48, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C, 0xFE4D, 0xFE4E, 0xFE4F, + /* 5 */ 0xFE50, 0xFE51, 0xFE52, 0xFE53, 0xFE54, 0xFE55, 0xFE56, 0xFE57, + 0xFE58, 0xFE59, 0xFE5A, 0xFE5B, 0xFE5C, 0xFE5D, 0xFE5E, 0xFE5F, + /* 6 */ 0xFE60, 0xFE61, 0xFE62, 0xFE63, 0xFE64, 0xFE65, 0xFE66, 0xFE67, + 0xFE68, 0xFE69, 0xFE6A, 0xFE6B, 0xFE6C, 0xFE6D, 0xFE6E, 0xFE6F, + /* 7 */ 0xFE70, 0xFE71, 0xFE72, 0xFE73, 0xFE74, 0xFE75, 0xFE76, 0xFE77, + 0xFE78, 0xFE79, 0xFE7A, 0xFE7B, 0xFE7C, 0xFE7D, 0xFE7E, 0xFE7F, + /* 8 */ 0xFE80, 0xFE81, 0xFE82, 0xFE83, 0xFE84, 0xFE85, 0xFE86, 0xFE87, + 0xFE88, 0xFE89, 0xFE8A, 0xFE8B, 0xFE8C, 0xFE8D, 0xFE8E, 0xFE8F, + /* 9 */ 0xFE90, 0xFE91, 0xFE92, 0xFE93, 0xFE94, 0xFE95, 0xFE96, 0xFE97, + 0xFE98, 0xFE99, 0xFE9A, 0xFE9B, 0xFE9C, 0xFE9D, 0xFE9E, 0xFE9F, + /* A */ 0xFEA0, 0xFEA1, 0xFEA2, 0xFEA3, 0xFEA4, 0xFEA5, 0xFEA6, 0xFEA7, + 0xFEA8, 0xFEA9, 0xFEAA, 0xFEAB, 0xFEAC, 0xFEAD, 0xFEAE, 0xFEAF, + /* B */ 0xFEB0, 0xFEB1, 0xFEB2, 0xFEB3, 0xFEB4, 0xFEB5, 0xFEB6, 0xFEB7, + 0xFEB8, 0xFEB9, 0xFEBA, 0xFEBB, 0xFEBC, 0xFEBD, 0xFEBE, 0xFEBF, + /* C */ 0xFEC0, 0xFEC1, 0xFEC2, 0xFEC3, 0xFEC4, 0xFEC5, 0xFEC6, 0xFEC7, + 0xFEC8, 0xFEC9, 0xFECA, 0xFECB, 0xFECC, 0xFECD, 0xFECE, 0xFECF, + /* D */ 0xFED0, 0xFED1, 0xFED2, 0xFED3, 0xFED4, 0xFED5, 0xFED6, 0xFED7, + 0xFED8, 0xFED9, 0xFEDA, 0xFEDB, 0xFEDC, 0xFEDD, 0xFEDE, 0xFEDF, + /* E */ 0xFEE0, 0xFEE1, 0xFEE2, 0xFEE3, 0xFEE4, 0xFEE5, 0xFEE6, 0xFEE7, + 0xFEE8, 0xFEE9, 0xFEEA, 0xFEEB, 0xFEEC, 0xFEED, 0xFEEE, 0xFEEF, + /* F */ 0xFEF0, 0xFEF1, 0xFEF2, 0xFEF3, 0xFEF4, 0xFEF5, 0xFEF6, 0xFEF7, + 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC, 0xFEFD, 0xFEFE, 0x0000, + + // Table 10 (for high byte 0xFF) + + /* 0 */ 0xFF00, 0xFF01, 0xFF02, 0xFF03, 0xFF04, 0xFF05, 0xFF06, 0xFF07, + 0xFF08, 0xFF09, 0xFF0A, 0xFF0B, 0xFF0C, 0xFF0D, 0xFF0E, 0xFF0F, + /* 1 */ 0xFF10, 0xFF11, 0xFF12, 0xFF13, 0xFF14, 0xFF15, 0xFF16, 0xFF17, + 0xFF18, 0xFF19, 0xFF1A, 0xFF1B, 0xFF1C, 0xFF1D, 0xFF1E, 0xFF1F, + /* 2 */ 0xFF20, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, + 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, + /* 3 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, + 0xFF58, 0xFF59, 0xFF5A, 0xFF3B, 0xFF3C, 0xFF3D, 0xFF3E, 0xFF3F, + /* 4 */ 0xFF40, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, + 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, + /* 5 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, + 0xFF58, 0xFF59, 0xFF5A, 0xFF5B, 0xFF5C, 0xFF5D, 0xFF5E, 0xFF5F, + /* 6 */ 0xFF60, 0xFF61, 0xFF62, 0xFF63, 0xFF64, 0xFF65, 0xFF66, 0xFF67, + 0xFF68, 0xFF69, 0xFF6A, 0xFF6B, 0xFF6C, 0xFF6D, 0xFF6E, 0xFF6F, + /* 7 */ 0xFF70, 0xFF71, 0xFF72, 0xFF73, 0xFF74, 0xFF75, 0xFF76, 0xFF77, + 0xFF78, 0xFF79, 0xFF7A, 0xFF7B, 0xFF7C, 0xFF7D, 0xFF7E, 0xFF7F, + /* 8 */ 0xFF80, 0xFF81, 0xFF82, 0xFF83, 0xFF84, 0xFF85, 0xFF86, 0xFF87, + 0xFF88, 0xFF89, 0xFF8A, 0xFF8B, 0xFF8C, 0xFF8D, 0xFF8E, 0xFF8F, + /* 9 */ 0xFF90, 0xFF91, 0xFF92, 0xFF93, 0xFF94, 0xFF95, 0xFF96, 0xFF97, + 0xFF98, 0xFF99, 0xFF9A, 0xFF9B, 0xFF9C, 0xFF9D, 0xFF9E, 0xFF9F, + /* A */ 0xFFA0, 0xFFA1, 0xFFA2, 0xFFA3, 0xFFA4, 0xFFA5, 0xFFA6, 0xFFA7, + 0xFFA8, 0xFFA9, 0xFFAA, 0xFFAB, 0xFFAC, 0xFFAD, 0xFFAE, 0xFFAF, + /* B */ 0xFFB0, 0xFFB1, 0xFFB2, 0xFFB3, 0xFFB4, 0xFFB5, 0xFFB6, 0xFFB7, + 0xFFB8, 0xFFB9, 0xFFBA, 0xFFBB, 0xFFBC, 0xFFBD, 0xFFBE, 0xFFBF, + /* C */ 0xFFC0, 0xFFC1, 0xFFC2, 0xFFC3, 0xFFC4, 0xFFC5, 0xFFC6, 0xFFC7, + 0xFFC8, 0xFFC9, 0xFFCA, 0xFFCB, 0xFFCC, 0xFFCD, 0xFFCE, 0xFFCF, + /* D */ 0xFFD0, 0xFFD1, 0xFFD2, 0xFFD3, 0xFFD4, 0xFFD5, 0xFFD6, 0xFFD7, + 0xFFD8, 0xFFD9, 0xFFDA, 0xFFDB, 0xFFDC, 0xFFDD, 0xFFDE, 0xFFDF, + /* E */ 0xFFE0, 0xFFE1, 0xFFE2, 0xFFE3, 0xFFE4, 0xFFE5, 0xFFE6, 0xFFE7, + 0xFFE8, 0xFFE9, 0xFFEA, 0xFFEB, 0xFFEC, 0xFFED, 0xFFEE, 0xFFEF, + /* F */ 0xFFF0, 0xFFF1, 0xFFF2, 0xFFF3, 0xFFF4, 0xFFF5, 0xFFF6, 0xFFF7, + 0xFFF8, 0xFFF9, 0xFFFA, 0xFFFB, 0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF, +}; + +u16 hfsplus_decompose_table[] = { + /* base table */ + 0x0010, 0x04c0, 0x0000, 0x06f0, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x07b0, + /* char table 0x0___ */ + 0x0020, 0x0070, 0x0160, 0x0190, 0x0230, 0x0000, 0x0000, 0x0000, + 0x0000, 0x02d0, 0x0340, 0x0360, 0x03b0, 0x03e0, 0x0400, 0x0430, + /* char table 0x00__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0030, 0x0040, 0x0050, 0x0060, + /* char values 0x00c_ */ + 0x2042, 0x204a, 0x2052, 0x205a, 0x2062, 0x206a, 0x0000, 0x2072, + 0x207a, 0x2082, 0x208a, 0x2092, 0x209a, 0x20a2, 0x20aa, 0x20b2, + /* char values 0x00d_ */ + 0x0000, 0x20ba, 0x20c2, 0x20ca, 0x20d2, 0x20da, 0x20e2, 0x0000, + 0x0000, 0x20ea, 0x20f2, 0x20fa, 0x2102, 0x210a, 0x0000, 0x0000, + /* char values 0x00e_ */ + 0x2112, 0x211a, 0x2122, 0x212a, 0x2132, 0x213a, 0x0000, 0x2142, + 0x214a, 0x2152, 0x215a, 0x2162, 0x216a, 0x2172, 0x217a, 0x2182, + /* char values 0x00f_ */ + 0x0000, 0x218a, 0x2192, 0x219a, 0x21a2, 0x21aa, 0x21b2, 0x0000, + 0x0000, 0x21ba, 0x21c2, 0x21ca, 0x21d2, 0x21da, 0x0000, 0x21e2, + /* char table 0x01__ */ + 0x0080, 0x0090, 0x00a0, 0x00b0, 0x00c0, 0x00d0, 0x00e0, 0x00f0, + 0x0000, 0x0000, 0x0100, 0x0110, 0x0120, 0x0130, 0x0140, 0x0150, + /* char values 0x010_ */ + 0x21ea, 0x21f2, 0x21fa, 0x2202, 0x220a, 0x2212, 0x221a, 0x2222, + 0x222a, 0x2232, 0x223a, 0x2242, 0x224a, 0x2252, 0x225a, 0x2262, + /* char values 0x011_ */ + 0x0000, 0x0000, 0x226a, 0x2272, 0x227a, 0x2282, 0x228a, 0x2292, + 0x229a, 0x22a2, 0x22aa, 0x22b2, 0x22ba, 0x22c2, 0x22ca, 0x22d2, + /* char values 0x012_ */ + 0x22da, 0x22e2, 0x22ea, 0x22f2, 0x22fa, 0x2302, 0x0000, 0x0000, + 0x230a, 0x2312, 0x231a, 0x2322, 0x232a, 0x2332, 0x233a, 0x2342, + /* char values 0x013_ */ + 0x234a, 0x0000, 0x0000, 0x0000, 0x2352, 0x235a, 0x2362, 0x236a, + 0x0000, 0x2372, 0x237a, 0x2382, 0x238a, 0x2392, 0x239a, 0x0000, + /* char values 0x014_ */ + 0x0000, 0x0000, 0x0000, 0x23a2, 0x23aa, 0x23b2, 0x23ba, 0x23c2, + 0x23ca, 0x0000, 0x0000, 0x0000, 0x23d2, 0x23da, 0x23e2, 0x23ea, + /* char values 0x015_ */ + 0x23f2, 0x23fa, 0x0000, 0x0000, 0x2402, 0x240a, 0x2412, 0x241a, + 0x2422, 0x242a, 0x2432, 0x243a, 0x2442, 0x244a, 0x2452, 0x245a, + /* char values 0x016_ */ + 0x2462, 0x246a, 0x2472, 0x247a, 0x2482, 0x248a, 0x0000, 0x0000, + 0x2492, 0x249a, 0x24a2, 0x24aa, 0x24b2, 0x24ba, 0x24c2, 0x24ca, + /* char values 0x017_ */ + 0x24d2, 0x24da, 0x24e2, 0x24ea, 0x24f2, 0x24fa, 0x2502, 0x250a, + 0x2512, 0x251a, 0x2522, 0x252a, 0x2532, 0x253a, 0x2542, 0x0000, + /* char values 0x01a_ */ + 0x254a, 0x2552, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x255a, + /* char values 0x01b_ */ + 0x2562, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x01c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x256a, 0x2572, 0x257a, + /* char values 0x01d_ */ + 0x2582, 0x258a, 0x2592, 0x259a, 0x25a2, 0x25ab, 0x25b7, 0x25c3, + 0x25cf, 0x25db, 0x25e7, 0x25f3, 0x25ff, 0x0000, 0x260b, 0x2617, + /* char values 0x01e_ */ + 0x2623, 0x262f, 0x263a, 0x2642, 0x0000, 0x0000, 0x264a, 0x2652, + 0x265a, 0x2662, 0x266a, 0x2672, 0x267b, 0x2687, 0x2692, 0x269a, + /* char values 0x01f_ */ + 0x26a2, 0x0000, 0x0000, 0x0000, 0x26aa, 0x26b2, 0x0000, 0x0000, + 0x0000, 0x0000, 0x26bb, 0x26c7, 0x26d2, 0x26da, 0x26e2, 0x26ea, + /* char table 0x02__ */ + 0x0170, 0x0180, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x020_ */ + 0x26f2, 0x26fa, 0x2702, 0x270a, 0x2712, 0x271a, 0x2722, 0x272a, + 0x2732, 0x273a, 0x2742, 0x274a, 0x2752, 0x275a, 0x2762, 0x276a, + /* char values 0x021_ */ + 0x2772, 0x277a, 0x2782, 0x278a, 0x2792, 0x279a, 0x27a2, 0x27aa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x03__ */ + 0x0000, 0x01a0, 0x0000, 0x0000, 0x01b0, 0x0000, 0x0000, 0x01c0, + 0x01d0, 0x01e0, 0x01f0, 0x0200, 0x0210, 0x0220, 0x0000, 0x0000, + /* char values 0x031_ */ + 0x27b2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x034_ */ + 0x27b9, 0x27bd, 0x0000, 0x27c1, 0x27c6, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x037_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x27cd, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x27d1, 0x0000, + /* char values 0x038_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x27d6, 0x27de, 0x27e5, + 0x27ea, 0x27f2, 0x27fa, 0x0000, 0x2802, 0x0000, 0x280a, 0x2812, + /* char values 0x039_ */ + 0x281b, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x03a_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2826, 0x282e, 0x2836, 0x283e, 0x2846, 0x284e, + /* char values 0x03b_ */ + 0x2857, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x03c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2862, 0x286a, 0x2872, 0x287a, 0x2882, 0x0000, + /* char values 0x03d_ */ + 0x0000, 0x0000, 0x0000, 0x288a, 0x2892, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x04__ */ + 0x0240, 0x0250, 0x0000, 0x0260, 0x0000, 0x0270, 0x0000, 0x0280, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0290, 0x02a0, 0x02b0, 0x02c0, + /* char values 0x040_ */ + 0x0000, 0x289a, 0x0000, 0x28a2, 0x0000, 0x0000, 0x0000, 0x28aa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x28b2, 0x0000, 0x28ba, 0x0000, + /* char values 0x041_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x28c2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x043_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x28ca, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x045_ */ + 0x0000, 0x28d2, 0x0000, 0x28da, 0x0000, 0x0000, 0x0000, 0x28e2, + 0x0000, 0x0000, 0x0000, 0x0000, 0x28ea, 0x0000, 0x28f2, 0x0000, + /* char values 0x047_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x28fa, 0x2902, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x04c_ */ + 0x0000, 0x290a, 0x2912, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x04d_ */ + 0x291a, 0x2922, 0x292a, 0x2932, 0x2939, 0x293d, 0x2942, 0x294a, + 0x2951, 0x2955, 0x295a, 0x2962, 0x296a, 0x2972, 0x297a, 0x2982, + /* char values 0x04e_ */ + 0x2989, 0x298d, 0x2992, 0x299a, 0x29a2, 0x29aa, 0x29b2, 0x29ba, + 0x29c1, 0x29c5, 0x29ca, 0x29d2, 0x0000, 0x0000, 0x29da, 0x29e2, + /* char values 0x04f_ */ + 0x29ea, 0x29f2, 0x29fa, 0x2a02, 0x2a0a, 0x2a12, 0x0000, 0x0000, + 0x2a1a, 0x2a22, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x09__ */ + 0x0000, 0x0000, 0x02e0, 0x02f0, 0x0000, 0x0300, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0310, 0x0320, 0x0330, 0x0000, 0x0000, + /* char values 0x092_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2a2a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x093_ */ + 0x0000, 0x2a32, 0x0000, 0x0000, 0x2a3a, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x095_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2a42, 0x2a4a, 0x2a52, 0x2a5a, 0x2a62, 0x2a6a, 0x2a72, 0x2a7a, + /* char values 0x09b_ */ + 0x2a82, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x09c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2a8a, 0x2a92, 0x0000, 0x0000, 0x0000, + /* char values 0x09d_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2a9a, 0x2aa2, 0x0000, 0x2aaa, + /* char table 0x0a__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0350, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0a5_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2ab2, 0x2aba, 0x2ac2, 0x2aca, 0x0000, 0x2ad2, 0x0000, + /* char table 0x0b__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0370, 0x0380, 0x0000, 0x0000, + 0x0000, 0x0390, 0x0000, 0x0000, 0x03a0, 0x0000, 0x0000, 0x0000, + /* char values 0x0b4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2ada, 0x0000, 0x0000, 0x2ae2, 0x2aea, 0x0000, 0x0000, 0x0000, + /* char values 0x0b5_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2af2, 0x2afa, 0x0000, 0x2b02, + /* char values 0x0b9_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x2b0a, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0bc_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2b12, 0x2b1a, 0x2b22, 0x0000, 0x0000, 0x0000, + /* char table 0x0c__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x03c0, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x03d0, 0x0000, 0x0000, 0x0000, + /* char values 0x0c4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2b2a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0cc_ */ + 0x2b32, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b3a, + 0x2b42, 0x0000, 0x2b4a, 0x2b53, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x0d__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x03f0, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0d4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2b5e, 0x2b66, 0x2b6e, 0x0000, 0x0000, 0x0000, + /* char table 0x0e__ */ + 0x0000, 0x0000, 0x0000, 0x0410, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0420, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0e3_ */ + 0x0000, 0x0000, 0x0000, 0x2b76, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0eb_ */ + 0x0000, 0x0000, 0x0000, 0x2b7e, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x0f__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0440, 0x0450, 0x0460, 0x0470, + 0x0480, 0x0490, 0x04a0, 0x04b0, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f4_ */ + 0x0000, 0x0000, 0x0000, 0x2b86, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b8e, 0x0000, 0x0000, + /* char values 0x0f5_ */ + 0x0000, 0x0000, 0x2b96, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b9e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2ba6, 0x0000, 0x0000, 0x0000, + /* char values 0x0f6_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2bae, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f7_ */ + 0x0000, 0x0000, 0x0000, 0x2bb6, 0x0000, 0x2bbe, 0x2bc6, 0x2bcf, + 0x2bda, 0x2be3, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f8_ */ + 0x0000, 0x2bee, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f9_ */ + 0x0000, 0x0000, 0x0000, 0x2bf6, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2bfe, 0x0000, 0x0000, + /* char values 0x0fa_ */ + 0x0000, 0x0000, 0x2c06, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c0e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2c16, 0x0000, 0x0000, 0x0000, + /* char values 0x0fb_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2c1e, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x1___ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x04d0, 0x05e0, + /* char table 0x1e__ */ + 0x04e0, 0x04f0, 0x0500, 0x0510, 0x0520, 0x0530, 0x0540, 0x0550, + 0x0560, 0x0570, 0x0580, 0x0590, 0x05a0, 0x05b0, 0x05c0, 0x05d0, + /* char values 0x1e0_ */ + 0x2c26, 0x2c2e, 0x2c36, 0x2c3e, 0x2c46, 0x2c4e, 0x2c56, 0x2c5e, + 0x2c67, 0x2c73, 0x2c7e, 0x2c86, 0x2c8e, 0x2c96, 0x2c9e, 0x2ca6, + /* char values 0x1e1_ */ + 0x2cae, 0x2cb6, 0x2cbe, 0x2cc6, 0x2ccf, 0x2cdb, 0x2ce7, 0x2cf3, + 0x2cfe, 0x2d06, 0x2d0e, 0x2d16, 0x2d1f, 0x2d2b, 0x2d36, 0x2d3e, + /* char values 0x1e2_ */ + 0x2d46, 0x2d4e, 0x2d56, 0x2d5e, 0x2d66, 0x2d6e, 0x2d76, 0x2d7e, + 0x2d86, 0x2d8e, 0x2d96, 0x2d9e, 0x2da6, 0x2dae, 0x2db7, 0x2dc3, + /* char values 0x1e3_ */ + 0x2dce, 0x2dd6, 0x2dde, 0x2de6, 0x2dee, 0x2df6, 0x2dfe, 0x2e06, + 0x2e0f, 0x2e1b, 0x2e26, 0x2e2e, 0x2e36, 0x2e3e, 0x2e46, 0x2e4e, + /* char values 0x1e4_ */ + 0x2e56, 0x2e5e, 0x2e66, 0x2e6e, 0x2e76, 0x2e7e, 0x2e86, 0x2e8e, + 0x2e96, 0x2e9e, 0x2ea6, 0x2eae, 0x2eb7, 0x2ec3, 0x2ecf, 0x2edb, + /* char values 0x1e5_ */ + 0x2ee7, 0x2ef3, 0x2eff, 0x2f0b, 0x2f16, 0x2f1e, 0x2f26, 0x2f2e, + 0x2f36, 0x2f3e, 0x2f46, 0x2f4e, 0x2f57, 0x2f63, 0x2f6e, 0x2f76, + /* char values 0x1e6_ */ + 0x2f7e, 0x2f86, 0x2f8e, 0x2f96, 0x2f9f, 0x2fab, 0x2fb7, 0x2fc3, + 0x2fcf, 0x2fdb, 0x2fe6, 0x2fee, 0x2ff6, 0x2ffe, 0x3006, 0x300e, + /* char values 0x1e7_ */ + 0x3016, 0x301e, 0x3026, 0x302e, 0x3036, 0x303e, 0x3046, 0x304e, + 0x3057, 0x3063, 0x306f, 0x307b, 0x3086, 0x308e, 0x3096, 0x309e, + /* char values 0x1e8_ */ + 0x30a6, 0x30ae, 0x30b6, 0x30be, 0x30c6, 0x30ce, 0x30d6, 0x30de, + 0x30e6, 0x30ee, 0x30f6, 0x30fe, 0x3106, 0x310e, 0x3116, 0x311e, + /* char values 0x1e9_ */ + 0x3126, 0x312e, 0x3136, 0x313e, 0x3146, 0x314e, 0x3156, 0x315e, + 0x3166, 0x316e, 0x0000, 0x3176, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x1ea_ */ + 0x317e, 0x3186, 0x318e, 0x3196, 0x319f, 0x31ab, 0x31b7, 0x31c3, + 0x31cf, 0x31db, 0x31e7, 0x31f3, 0x31ff, 0x320b, 0x3217, 0x3223, + /* char values 0x1eb_ */ + 0x322f, 0x323b, 0x3247, 0x3253, 0x325f, 0x326b, 0x3277, 0x3283, + 0x328e, 0x3296, 0x329e, 0x32a6, 0x32ae, 0x32b6, 0x32bf, 0x32cb, + /* char values 0x1ec_ */ + 0x32d7, 0x32e3, 0x32ef, 0x32fb, 0x3307, 0x3313, 0x331f, 0x332b, + 0x3336, 0x333e, 0x3346, 0x334e, 0x3356, 0x335e, 0x3366, 0x336e, + /* char values 0x1ed_ */ + 0x3377, 0x3383, 0x338f, 0x339b, 0x33a7, 0x33b3, 0x33bf, 0x33cb, + 0x33d7, 0x33e3, 0x33ef, 0x33fb, 0x3407, 0x3413, 0x341f, 0x342b, + /* char values 0x1ee_ */ + 0x3437, 0x3443, 0x344f, 0x345b, 0x3466, 0x346e, 0x3476, 0x347e, + 0x3487, 0x3493, 0x349f, 0x34ab, 0x34b7, 0x34c3, 0x34cf, 0x34db, + /* char values 0x1ef_ */ + 0x34e7, 0x34f3, 0x34fe, 0x3506, 0x350e, 0x3516, 0x351e, 0x3526, + 0x352e, 0x3536, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x1f__ */ + 0x05f0, 0x0600, 0x0610, 0x0620, 0x0630, 0x0640, 0x0650, 0x0660, + 0x0670, 0x0680, 0x0690, 0x06a0, 0x06b0, 0x06c0, 0x06d0, 0x06e0, + /* char values 0x1f0_ */ + 0x353e, 0x3546, 0x354f, 0x355b, 0x3567, 0x3573, 0x357f, 0x358b, + 0x3596, 0x359e, 0x35a7, 0x35b3, 0x35bf, 0x35cb, 0x35d7, 0x35e3, + /* char values 0x1f1_ */ + 0x35ee, 0x35f6, 0x35ff, 0x360b, 0x3617, 0x3623, 0x0000, 0x0000, + 0x362e, 0x3636, 0x363f, 0x364b, 0x3657, 0x3663, 0x0000, 0x0000, + /* char values 0x1f2_ */ + 0x366e, 0x3676, 0x367f, 0x368b, 0x3697, 0x36a3, 0x36af, 0x36bb, + 0x36c6, 0x36ce, 0x36d7, 0x36e3, 0x36ef, 0x36fb, 0x3707, 0x3713, + /* char values 0x1f3_ */ + 0x371e, 0x3726, 0x372f, 0x373b, 0x3747, 0x3753, 0x375f, 0x376b, + 0x3776, 0x377e, 0x3787, 0x3793, 0x379f, 0x37ab, 0x37b7, 0x37c3, + /* char values 0x1f4_ */ + 0x37ce, 0x37d6, 0x37df, 0x37eb, 0x37f7, 0x3803, 0x0000, 0x0000, + 0x380e, 0x3816, 0x381f, 0x382b, 0x3837, 0x3843, 0x0000, 0x0000, + /* char values 0x1f5_ */ + 0x384e, 0x3856, 0x385f, 0x386b, 0x3877, 0x3883, 0x388f, 0x389b, + 0x0000, 0x38a6, 0x0000, 0x38af, 0x0000, 0x38bb, 0x0000, 0x38c7, + /* char values 0x1f6_ */ + 0x38d2, 0x38da, 0x38e3, 0x38ef, 0x38fb, 0x3907, 0x3913, 0x391f, + 0x392a, 0x3932, 0x393b, 0x3947, 0x3953, 0x395f, 0x396b, 0x3977, + /* char values 0x1f7_ */ + 0x3982, 0x398a, 0x3992, 0x399a, 0x39a2, 0x39aa, 0x39b2, 0x39ba, + 0x39c2, 0x39ca, 0x39d2, 0x39da, 0x39e2, 0x39ea, 0x0000, 0x0000, + /* char values 0x1f8_ */ + 0x39f3, 0x39ff, 0x3a0c, 0x3a1c, 0x3a2c, 0x3a3c, 0x3a4c, 0x3a5c, + 0x3a6b, 0x3a77, 0x3a84, 0x3a94, 0x3aa4, 0x3ab4, 0x3ac4, 0x3ad4, + /* char values 0x1f9_ */ + 0x3ae3, 0x3aef, 0x3afc, 0x3b0c, 0x3b1c, 0x3b2c, 0x3b3c, 0x3b4c, + 0x3b5b, 0x3b67, 0x3b74, 0x3b84, 0x3b94, 0x3ba4, 0x3bb4, 0x3bc4, + /* char values 0x1fa_ */ + 0x3bd3, 0x3bdf, 0x3bec, 0x3bfc, 0x3c0c, 0x3c1c, 0x3c2c, 0x3c3c, + 0x3c4b, 0x3c57, 0x3c64, 0x3c74, 0x3c84, 0x3c94, 0x3ca4, 0x3cb4, + /* char values 0x1fb_ */ + 0x3cc2, 0x3cca, 0x3cd3, 0x3cde, 0x3ce7, 0x0000, 0x3cf2, 0x3cfb, + 0x3d06, 0x3d0e, 0x3d16, 0x3d1e, 0x3d26, 0x0000, 0x3d2d, 0x0000, + /* char values 0x1fc_ */ + 0x0000, 0x3d32, 0x3d3b, 0x3d46, 0x3d4f, 0x0000, 0x3d5a, 0x3d63, + 0x3d6e, 0x3d76, 0x3d7e, 0x3d86, 0x3d8e, 0x3d96, 0x3d9e, 0x3da6, + /* char values 0x1fd_ */ + 0x3dae, 0x3db6, 0x3dbf, 0x3dcb, 0x0000, 0x0000, 0x3dd6, 0x3ddf, + 0x3dea, 0x3df2, 0x3dfa, 0x3e02, 0x0000, 0x3e0a, 0x3e12, 0x3e1a, + /* char values 0x1fe_ */ + 0x3e22, 0x3e2a, 0x3e33, 0x3e3f, 0x3e4a, 0x3e52, 0x3e5a, 0x3e63, + 0x3e6e, 0x3e76, 0x3e7e, 0x3e86, 0x3e8e, 0x3e96, 0x3e9e, 0x3ea5, + /* char values 0x1ff_ */ + 0x0000, 0x0000, 0x3eab, 0x3eb6, 0x3ebf, 0x0000, 0x3eca, 0x3ed3, + 0x3ede, 0x3ee6, 0x3eee, 0x3ef6, 0x3efe, 0x3f05, 0x0000, 0x0000, + /* char table 0x3___ */ + 0x0700, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x30__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0710, 0x0720, 0x0730, 0x0740, + 0x0000, 0x0750, 0x0760, 0x0770, 0x0780, 0x0790, 0x0000, 0x07a0, + /* char values 0x304_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x3f0a, 0x0000, 0x3f12, 0x0000, + /* char values 0x305_ */ + 0x3f1a, 0x0000, 0x3f22, 0x0000, 0x3f2a, 0x0000, 0x3f32, 0x0000, + 0x3f3a, 0x0000, 0x3f42, 0x0000, 0x3f4a, 0x0000, 0x3f52, 0x0000, + /* char values 0x306_ */ + 0x3f5a, 0x0000, 0x3f62, 0x0000, 0x0000, 0x3f6a, 0x0000, 0x3f72, + 0x0000, 0x3f7a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x307_ */ + 0x3f82, 0x3f8a, 0x0000, 0x3f92, 0x3f9a, 0x0000, 0x3fa2, 0x3faa, + 0x0000, 0x3fb2, 0x3fba, 0x0000, 0x3fc2, 0x3fca, 0x0000, 0x0000, + /* char values 0x309_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x3fd2, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x3fda, 0x0000, + /* char values 0x30a_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x3fe2, 0x0000, 0x3fea, 0x0000, + /* char values 0x30b_ */ + 0x3ff2, 0x0000, 0x3ffa, 0x0000, 0x4002, 0x0000, 0x400a, 0x0000, + 0x4012, 0x0000, 0x401a, 0x0000, 0x4022, 0x0000, 0x402a, 0x0000, + /* char values 0x30c_ */ + 0x4032, 0x0000, 0x403a, 0x0000, 0x0000, 0x4042, 0x0000, 0x404a, + 0x0000, 0x4052, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x30d_ */ + 0x405a, 0x4062, 0x0000, 0x406a, 0x4072, 0x0000, 0x407a, 0x4082, + 0x0000, 0x408a, 0x4092, 0x0000, 0x409a, 0x40a2, 0x0000, 0x0000, + /* char values 0x30f_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x40aa, 0x0000, 0x0000, 0x40b2, + 0x40ba, 0x40c2, 0x40ca, 0x0000, 0x0000, 0x0000, 0x40d2, 0x0000, + /* char table 0xf___ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x07c0, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0xfb__ */ + 0x0000, 0x07d0, 0x07e0, 0x07f0, 0x0800, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0xfb1_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x40da, + /* char values 0xfb2_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x40e2, 0x40ea, 0x40f3, 0x40ff, 0x410a, 0x4112, + /* char values 0xfb3_ */ + 0x411a, 0x4122, 0x412a, 0x4132, 0x413a, 0x4142, 0x414a, 0x0000, + 0x4152, 0x415a, 0x4162, 0x416a, 0x4172, 0x0000, 0x417a, 0x0000, + /* char values 0xfb4_ */ + 0x4182, 0x418a, 0x0000, 0x4192, 0x419a, 0x0000, 0x41a2, 0x41aa, + 0x41b2, 0x41ba, 0x41c2, 0x41ca, 0x41d2, 0x41da, 0x41e2, 0x0000, + /* decomposed characters */ + 0x0041, 0x0300, 0x0041, 0x0301, 0x0041, 0x0302, 0x0041, 0x0303, + 0x0041, 0x0308, 0x0041, 0x030a, 0x0043, 0x0327, 0x0045, 0x0300, + 0x0045, 0x0301, 0x0045, 0x0302, 0x0045, 0x0308, 0x0049, 0x0300, + 0x0049, 0x0301, 0x0049, 0x0302, 0x0049, 0x0308, 0x004e, 0x0303, + 0x004f, 0x0300, 0x004f, 0x0301, 0x004f, 0x0302, 0x004f, 0x0303, + 0x004f, 0x0308, 0x0055, 0x0300, 0x0055, 0x0301, 0x0055, 0x0302, + 0x0055, 0x0308, 0x0059, 0x0301, 0x0061, 0x0300, 0x0061, 0x0301, + 0x0061, 0x0302, 0x0061, 0x0303, 0x0061, 0x0308, 0x0061, 0x030a, + 0x0063, 0x0327, 0x0065, 0x0300, 0x0065, 0x0301, 0x0065, 0x0302, + 0x0065, 0x0308, 0x0069, 0x0300, 0x0069, 0x0301, 0x0069, 0x0302, + 0x0069, 0x0308, 0x006e, 0x0303, 0x006f, 0x0300, 0x006f, 0x0301, + 0x006f, 0x0302, 0x006f, 0x0303, 0x006f, 0x0308, 0x0075, 0x0300, + 0x0075, 0x0301, 0x0075, 0x0302, 0x0075, 0x0308, 0x0079, 0x0301, + 0x0079, 0x0308, 0x0041, 0x0304, 0x0061, 0x0304, 0x0041, 0x0306, + 0x0061, 0x0306, 0x0041, 0x0328, 0x0061, 0x0328, 0x0043, 0x0301, + 0x0063, 0x0301, 0x0043, 0x0302, 0x0063, 0x0302, 0x0043, 0x0307, + 0x0063, 0x0307, 0x0043, 0x030c, 0x0063, 0x030c, 0x0044, 0x030c, + 0x0064, 0x030c, 0x0045, 0x0304, 0x0065, 0x0304, 0x0045, 0x0306, + 0x0065, 0x0306, 0x0045, 0x0307, 0x0065, 0x0307, 0x0045, 0x0328, + 0x0065, 0x0328, 0x0045, 0x030c, 0x0065, 0x030c, 0x0047, 0x0302, + 0x0067, 0x0302, 0x0047, 0x0306, 0x0067, 0x0306, 0x0047, 0x0307, + 0x0067, 0x0307, 0x0047, 0x0327, 0x0067, 0x0327, 0x0048, 0x0302, + 0x0068, 0x0302, 0x0049, 0x0303, 0x0069, 0x0303, 0x0049, 0x0304, + 0x0069, 0x0304, 0x0049, 0x0306, 0x0069, 0x0306, 0x0049, 0x0328, + 0x0069, 0x0328, 0x0049, 0x0307, 0x004a, 0x0302, 0x006a, 0x0302, + 0x004b, 0x0327, 0x006b, 0x0327, 0x004c, 0x0301, 0x006c, 0x0301, + 0x004c, 0x0327, 0x006c, 0x0327, 0x004c, 0x030c, 0x006c, 0x030c, + 0x004e, 0x0301, 0x006e, 0x0301, 0x004e, 0x0327, 0x006e, 0x0327, + 0x004e, 0x030c, 0x006e, 0x030c, 0x004f, 0x0304, 0x006f, 0x0304, + 0x004f, 0x0306, 0x006f, 0x0306, 0x004f, 0x030b, 0x006f, 0x030b, + 0x0052, 0x0301, 0x0072, 0x0301, 0x0052, 0x0327, 0x0072, 0x0327, + 0x0052, 0x030c, 0x0072, 0x030c, 0x0053, 0x0301, 0x0073, 0x0301, + 0x0053, 0x0302, 0x0073, 0x0302, 0x0053, 0x0327, 0x0073, 0x0327, + 0x0053, 0x030c, 0x0073, 0x030c, 0x0054, 0x0327, 0x0074, 0x0327, + 0x0054, 0x030c, 0x0074, 0x030c, 0x0055, 0x0303, 0x0075, 0x0303, + 0x0055, 0x0304, 0x0075, 0x0304, 0x0055, 0x0306, 0x0075, 0x0306, + 0x0055, 0x030a, 0x0075, 0x030a, 0x0055, 0x030b, 0x0075, 0x030b, + 0x0055, 0x0328, 0x0075, 0x0328, 0x0057, 0x0302, 0x0077, 0x0302, + 0x0059, 0x0302, 0x0079, 0x0302, 0x0059, 0x0308, 0x005a, 0x0301, + 0x007a, 0x0301, 0x005a, 0x0307, 0x007a, 0x0307, 0x005a, 0x030c, + 0x007a, 0x030c, 0x004f, 0x031b, 0x006f, 0x031b, 0x0055, 0x031b, + 0x0075, 0x031b, 0x0041, 0x030c, 0x0061, 0x030c, 0x0049, 0x030c, + 0x0069, 0x030c, 0x004f, 0x030c, 0x006f, 0x030c, 0x0055, 0x030c, + 0x0075, 0x030c, 0x0055, 0x0308, 0x0304, 0x0075, 0x0308, 0x0304, + 0x0055, 0x0308, 0x0301, 0x0075, 0x0308, 0x0301, 0x0055, 0x0308, + 0x030c, 0x0075, 0x0308, 0x030c, 0x0055, 0x0308, 0x0300, 0x0075, + 0x0308, 0x0300, 0x0041, 0x0308, 0x0304, 0x0061, 0x0308, 0x0304, + 0x0041, 0x0307, 0x0304, 0x0061, 0x0307, 0x0304, 0x00c6, 0x0304, + 0x00e6, 0x0304, 0x0047, 0x030c, 0x0067, 0x030c, 0x004b, 0x030c, + 0x006b, 0x030c, 0x004f, 0x0328, 0x006f, 0x0328, 0x004f, 0x0328, + 0x0304, 0x006f, 0x0328, 0x0304, 0x01b7, 0x030c, 0x0292, 0x030c, + 0x006a, 0x030c, 0x0047, 0x0301, 0x0067, 0x0301, 0x0041, 0x030a, + 0x0301, 0x0061, 0x030a, 0x0301, 0x00c6, 0x0301, 0x00e6, 0x0301, + 0x00d8, 0x0301, 0x00f8, 0x0301, 0x0041, 0x030f, 0x0061, 0x030f, + 0x0041, 0x0311, 0x0061, 0x0311, 0x0045, 0x030f, 0x0065, 0x030f, + 0x0045, 0x0311, 0x0065, 0x0311, 0x0049, 0x030f, 0x0069, 0x030f, + 0x0049, 0x0311, 0x0069, 0x0311, 0x004f, 0x030f, 0x006f, 0x030f, + 0x004f, 0x0311, 0x006f, 0x0311, 0x0052, 0x030f, 0x0072, 0x030f, + 0x0052, 0x0311, 0x0072, 0x0311, 0x0055, 0x030f, 0x0075, 0x030f, + 0x0055, 0x0311, 0x0075, 0x0311, 0x0306, 0x0307, 0x0300, 0x0301, + 0x0313, 0x0308, 0x030d, 0x02b9, 0x003b, 0x00a8, 0x030d, 0x0391, + 0x030d, 0x00b7, 0x0395, 0x030d, 0x0397, 0x030d, 0x0399, 0x030d, + 0x039f, 0x030d, 0x03a5, 0x030d, 0x03a9, 0x030d, 0x03b9, 0x0308, + 0x030d, 0x0399, 0x0308, 0x03a5, 0x0308, 0x03b1, 0x030d, 0x03b5, + 0x030d, 0x03b7, 0x030d, 0x03b9, 0x030d, 0x03c5, 0x0308, 0x030d, + 0x03b9, 0x0308, 0x03c5, 0x0308, 0x03bf, 0x030d, 0x03c5, 0x030d, + 0x03c9, 0x030d, 0x03d2, 0x030d, 0x03d2, 0x0308, 0x0415, 0x0308, + 0x0413, 0x0301, 0x0406, 0x0308, 0x041a, 0x0301, 0x0423, 0x0306, + 0x0418, 0x0306, 0x0438, 0x0306, 0x0435, 0x0308, 0x0433, 0x0301, + 0x0456, 0x0308, 0x043a, 0x0301, 0x0443, 0x0306, 0x0474, 0x030f, + 0x0475, 0x030f, 0x0416, 0x0306, 0x0436, 0x0306, 0x0410, 0x0306, + 0x0430, 0x0306, 0x0410, 0x0308, 0x0430, 0x0308, 0x00c6, 0x00e6, + 0x0415, 0x0306, 0x0435, 0x0306, 0x018f, 0x0259, 0x018f, 0x0308, + 0x0259, 0x0308, 0x0416, 0x0308, 0x0436, 0x0308, 0x0417, 0x0308, + 0x0437, 0x0308, 0x01b7, 0x0292, 0x0418, 0x0304, 0x0438, 0x0304, + 0x0418, 0x0308, 0x0438, 0x0308, 0x041e, 0x0308, 0x043e, 0x0308, + 0x019f, 0x0275, 0x019f, 0x0308, 0x0275, 0x0308, 0x0423, 0x0304, + 0x0443, 0x0304, 0x0423, 0x0308, 0x0443, 0x0308, 0x0423, 0x030b, + 0x0443, 0x030b, 0x0427, 0x0308, 0x0447, 0x0308, 0x042b, 0x0308, + 0x044b, 0x0308, 0x0928, 0x093c, 0x0930, 0x093c, 0x0933, 0x093c, + 0x0915, 0x093c, 0x0916, 0x093c, 0x0917, 0x093c, 0x091c, 0x093c, + 0x0921, 0x093c, 0x0922, 0x093c, 0x092b, 0x093c, 0x092f, 0x093c, + 0x09ac, 0x09bc, 0x09c7, 0x09be, 0x09c7, 0x09d7, 0x09a1, 0x09bc, + 0x09a2, 0x09bc, 0x09af, 0x09bc, 0x0a16, 0x0a3c, 0x0a17, 0x0a3c, + 0x0a1c, 0x0a3c, 0x0a21, 0x0a3c, 0x0a2b, 0x0a3c, 0x0b47, 0x0b56, + 0x0b47, 0x0b3e, 0x0b47, 0x0b57, 0x0b21, 0x0b3c, 0x0b22, 0x0b3c, + 0x0b2f, 0x0b3c, 0x0b92, 0x0bd7, 0x0bc6, 0x0bbe, 0x0bc7, 0x0bbe, + 0x0bc6, 0x0bd7, 0x0c46, 0x0c56, 0x0cbf, 0x0cd5, 0x0cc6, 0x0cd5, + 0x0cc6, 0x0cd6, 0x0cc6, 0x0cc2, 0x0cc6, 0x0cc2, 0x0cd5, 0x0d46, + 0x0d3e, 0x0d47, 0x0d3e, 0x0d46, 0x0d57, 0x0e4d, 0x0e32, 0x0ecd, + 0x0eb2, 0x0f42, 0x0fb7, 0x0f4c, 0x0fb7, 0x0f51, 0x0fb7, 0x0f56, + 0x0fb7, 0x0f5b, 0x0fb7, 0x0f40, 0x0fb5, 0x0f72, 0x0f71, 0x0f74, + 0x0f71, 0x0fb2, 0x0f80, 0x0fb2, 0x0f80, 0x0f71, 0x0fb3, 0x0f80, + 0x0fb3, 0x0f80, 0x0f71, 0x0f80, 0x0f71, 0x0f92, 0x0fb7, 0x0f9c, + 0x0fb7, 0x0fa1, 0x0fb7, 0x0fa6, 0x0fb7, 0x0fab, 0x0fb7, 0x0f90, + 0x0fb5, 0x0041, 0x0325, 0x0061, 0x0325, 0x0042, 0x0307, 0x0062, + 0x0307, 0x0042, 0x0323, 0x0062, 0x0323, 0x0042, 0x0331, 0x0062, + 0x0331, 0x0043, 0x0327, 0x0301, 0x0063, 0x0327, 0x0301, 0x0044, + 0x0307, 0x0064, 0x0307, 0x0044, 0x0323, 0x0064, 0x0323, 0x0044, + 0x0331, 0x0064, 0x0331, 0x0044, 0x0327, 0x0064, 0x0327, 0x0044, + 0x032d, 0x0064, 0x032d, 0x0045, 0x0304, 0x0300, 0x0065, 0x0304, + 0x0300, 0x0045, 0x0304, 0x0301, 0x0065, 0x0304, 0x0301, 0x0045, + 0x032d, 0x0065, 0x032d, 0x0045, 0x0330, 0x0065, 0x0330, 0x0045, + 0x0327, 0x0306, 0x0065, 0x0327, 0x0306, 0x0046, 0x0307, 0x0066, + 0x0307, 0x0047, 0x0304, 0x0067, 0x0304, 0x0048, 0x0307, 0x0068, + 0x0307, 0x0048, 0x0323, 0x0068, 0x0323, 0x0048, 0x0308, 0x0068, + 0x0308, 0x0048, 0x0327, 0x0068, 0x0327, 0x0048, 0x032e, 0x0068, + 0x032e, 0x0049, 0x0330, 0x0069, 0x0330, 0x0049, 0x0308, 0x0301, + 0x0069, 0x0308, 0x0301, 0x004b, 0x0301, 0x006b, 0x0301, 0x004b, + 0x0323, 0x006b, 0x0323, 0x004b, 0x0331, 0x006b, 0x0331, 0x004c, + 0x0323, 0x006c, 0x0323, 0x004c, 0x0323, 0x0304, 0x006c, 0x0323, + 0x0304, 0x004c, 0x0331, 0x006c, 0x0331, 0x004c, 0x032d, 0x006c, + 0x032d, 0x004d, 0x0301, 0x006d, 0x0301, 0x004d, 0x0307, 0x006d, + 0x0307, 0x004d, 0x0323, 0x006d, 0x0323, 0x004e, 0x0307, 0x006e, + 0x0307, 0x004e, 0x0323, 0x006e, 0x0323, 0x004e, 0x0331, 0x006e, + 0x0331, 0x004e, 0x032d, 0x006e, 0x032d, 0x004f, 0x0303, 0x0301, + 0x006f, 0x0303, 0x0301, 0x004f, 0x0303, 0x0308, 0x006f, 0x0303, + 0x0308, 0x004f, 0x0304, 0x0300, 0x006f, 0x0304, 0x0300, 0x004f, + 0x0304, 0x0301, 0x006f, 0x0304, 0x0301, 0x0050, 0x0301, 0x0070, + 0x0301, 0x0050, 0x0307, 0x0070, 0x0307, 0x0052, 0x0307, 0x0072, + 0x0307, 0x0052, 0x0323, 0x0072, 0x0323, 0x0052, 0x0323, 0x0304, + 0x0072, 0x0323, 0x0304, 0x0052, 0x0331, 0x0072, 0x0331, 0x0053, + 0x0307, 0x0073, 0x0307, 0x0053, 0x0323, 0x0073, 0x0323, 0x0053, + 0x0301, 0x0307, 0x0073, 0x0301, 0x0307, 0x0053, 0x030c, 0x0307, + 0x0073, 0x030c, 0x0307, 0x0053, 0x0323, 0x0307, 0x0073, 0x0323, + 0x0307, 0x0054, 0x0307, 0x0074, 0x0307, 0x0054, 0x0323, 0x0074, + 0x0323, 0x0054, 0x0331, 0x0074, 0x0331, 0x0054, 0x032d, 0x0074, + 0x032d, 0x0055, 0x0324, 0x0075, 0x0324, 0x0055, 0x0330, 0x0075, + 0x0330, 0x0055, 0x032d, 0x0075, 0x032d, 0x0055, 0x0303, 0x0301, + 0x0075, 0x0303, 0x0301, 0x0055, 0x0304, 0x0308, 0x0075, 0x0304, + 0x0308, 0x0056, 0x0303, 0x0076, 0x0303, 0x0056, 0x0323, 0x0076, + 0x0323, 0x0057, 0x0300, 0x0077, 0x0300, 0x0057, 0x0301, 0x0077, + 0x0301, 0x0057, 0x0308, 0x0077, 0x0308, 0x0057, 0x0307, 0x0077, + 0x0307, 0x0057, 0x0323, 0x0077, 0x0323, 0x0058, 0x0307, 0x0078, + 0x0307, 0x0058, 0x0308, 0x0078, 0x0308, 0x0059, 0x0307, 0x0079, + 0x0307, 0x005a, 0x0302, 0x007a, 0x0302, 0x005a, 0x0323, 0x007a, + 0x0323, 0x005a, 0x0331, 0x007a, 0x0331, 0x0068, 0x0331, 0x0074, + 0x0308, 0x0077, 0x030a, 0x0079, 0x030a, 0x017f, 0x0307, 0x0041, + 0x0323, 0x0061, 0x0323, 0x0041, 0x0309, 0x0061, 0x0309, 0x0041, + 0x0302, 0x0301, 0x0061, 0x0302, 0x0301, 0x0041, 0x0302, 0x0300, + 0x0061, 0x0302, 0x0300, 0x0041, 0x0302, 0x0309, 0x0061, 0x0302, + 0x0309, 0x0041, 0x0302, 0x0303, 0x0061, 0x0302, 0x0303, 0x0041, + 0x0323, 0x0302, 0x0061, 0x0323, 0x0302, 0x0041, 0x0306, 0x0301, + 0x0061, 0x0306, 0x0301, 0x0041, 0x0306, 0x0300, 0x0061, 0x0306, + 0x0300, 0x0041, 0x0306, 0x0309, 0x0061, 0x0306, 0x0309, 0x0041, + 0x0306, 0x0303, 0x0061, 0x0306, 0x0303, 0x0041, 0x0323, 0x0306, + 0x0061, 0x0323, 0x0306, 0x0045, 0x0323, 0x0065, 0x0323, 0x0045, + 0x0309, 0x0065, 0x0309, 0x0045, 0x0303, 0x0065, 0x0303, 0x0045, + 0x0302, 0x0301, 0x0065, 0x0302, 0x0301, 0x0045, 0x0302, 0x0300, + 0x0065, 0x0302, 0x0300, 0x0045, 0x0302, 0x0309, 0x0065, 0x0302, + 0x0309, 0x0045, 0x0302, 0x0303, 0x0065, 0x0302, 0x0303, 0x0045, + 0x0323, 0x0302, 0x0065, 0x0323, 0x0302, 0x0049, 0x0309, 0x0069, + 0x0309, 0x0049, 0x0323, 0x0069, 0x0323, 0x004f, 0x0323, 0x006f, + 0x0323, 0x004f, 0x0309, 0x006f, 0x0309, 0x004f, 0x0302, 0x0301, + 0x006f, 0x0302, 0x0301, 0x004f, 0x0302, 0x0300, 0x006f, 0x0302, + 0x0300, 0x004f, 0x0302, 0x0309, 0x006f, 0x0302, 0x0309, 0x004f, + 0x0302, 0x0303, 0x006f, 0x0302, 0x0303, 0x004f, 0x0323, 0x0302, + 0x006f, 0x0323, 0x0302, 0x004f, 0x031b, 0x0301, 0x006f, 0x031b, + 0x0301, 0x004f, 0x031b, 0x0300, 0x006f, 0x031b, 0x0300, 0x004f, + 0x031b, 0x0309, 0x006f, 0x031b, 0x0309, 0x004f, 0x031b, 0x0303, + 0x006f, 0x031b, 0x0303, 0x004f, 0x031b, 0x0323, 0x006f, 0x031b, + 0x0323, 0x0055, 0x0323, 0x0075, 0x0323, 0x0055, 0x0309, 0x0075, + 0x0309, 0x0055, 0x031b, 0x0301, 0x0075, 0x031b, 0x0301, 0x0055, + 0x031b, 0x0300, 0x0075, 0x031b, 0x0300, 0x0055, 0x031b, 0x0309, + 0x0075, 0x031b, 0x0309, 0x0055, 0x031b, 0x0303, 0x0075, 0x031b, + 0x0303, 0x0055, 0x031b, 0x0323, 0x0075, 0x031b, 0x0323, 0x0059, + 0x0300, 0x0079, 0x0300, 0x0059, 0x0323, 0x0079, 0x0323, 0x0059, + 0x0309, 0x0079, 0x0309, 0x0059, 0x0303, 0x0079, 0x0303, 0x03b1, + 0x0313, 0x03b1, 0x0314, 0x03b1, 0x0313, 0x0300, 0x03b1, 0x0314, + 0x0300, 0x03b1, 0x0313, 0x0301, 0x03b1, 0x0314, 0x0301, 0x03b1, + 0x0313, 0x0342, 0x03b1, 0x0314, 0x0342, 0x0391, 0x0313, 0x0391, + 0x0314, 0x0391, 0x0313, 0x0300, 0x0391, 0x0314, 0x0300, 0x0391, + 0x0313, 0x0301, 0x0391, 0x0314, 0x0301, 0x0391, 0x0313, 0x0342, + 0x0391, 0x0314, 0x0342, 0x03b5, 0x0313, 0x03b5, 0x0314, 0x03b5, + 0x0313, 0x0300, 0x03b5, 0x0314, 0x0300, 0x03b5, 0x0313, 0x0301, + 0x03b5, 0x0314, 0x0301, 0x0395, 0x0313, 0x0395, 0x0314, 0x0395, + 0x0313, 0x0300, 0x0395, 0x0314, 0x0300, 0x0395, 0x0313, 0x0301, + 0x0395, 0x0314, 0x0301, 0x03b7, 0x0313, 0x03b7, 0x0314, 0x03b7, + 0x0313, 0x0300, 0x03b7, 0x0314, 0x0300, 0x03b7, 0x0313, 0x0301, + 0x03b7, 0x0314, 0x0301, 0x03b7, 0x0313, 0x0342, 0x03b7, 0x0314, + 0x0342, 0x0397, 0x0313, 0x0397, 0x0314, 0x0397, 0x0313, 0x0300, + 0x0397, 0x0314, 0x0300, 0x0397, 0x0313, 0x0301, 0x0397, 0x0314, + 0x0301, 0x0397, 0x0313, 0x0342, 0x0397, 0x0314, 0x0342, 0x03b9, + 0x0313, 0x03b9, 0x0314, 0x03b9, 0x0313, 0x0300, 0x03b9, 0x0314, + 0x0300, 0x03b9, 0x0313, 0x0301, 0x03b9, 0x0314, 0x0301, 0x03b9, + 0x0313, 0x0342, 0x03b9, 0x0314, 0x0342, 0x0399, 0x0313, 0x0399, + 0x0314, 0x0399, 0x0313, 0x0300, 0x0399, 0x0314, 0x0300, 0x0399, + 0x0313, 0x0301, 0x0399, 0x0314, 0x0301, 0x0399, 0x0313, 0x0342, + 0x0399, 0x0314, 0x0342, 0x03bf, 0x0313, 0x03bf, 0x0314, 0x03bf, + 0x0313, 0x0300, 0x03bf, 0x0314, 0x0300, 0x03bf, 0x0313, 0x0301, + 0x03bf, 0x0314, 0x0301, 0x039f, 0x0313, 0x039f, 0x0314, 0x039f, + 0x0313, 0x0300, 0x039f, 0x0314, 0x0300, 0x039f, 0x0313, 0x0301, + 0x039f, 0x0314, 0x0301, 0x03c5, 0x0313, 0x03c5, 0x0314, 0x03c5, + 0x0313, 0x0300, 0x03c5, 0x0314, 0x0300, 0x03c5, 0x0313, 0x0301, + 0x03c5, 0x0314, 0x0301, 0x03c5, 0x0313, 0x0342, 0x03c5, 0x0314, + 0x0342, 0x03a5, 0x0314, 0x03a5, 0x0314, 0x0300, 0x03a5, 0x0314, + 0x0301, 0x03a5, 0x0314, 0x0342, 0x03c9, 0x0313, 0x03c9, 0x0314, + 0x03c9, 0x0313, 0x0300, 0x03c9, 0x0314, 0x0300, 0x03c9, 0x0313, + 0x0301, 0x03c9, 0x0314, 0x0301, 0x03c9, 0x0313, 0x0342, 0x03c9, + 0x0314, 0x0342, 0x03a9, 0x0313, 0x03a9, 0x0314, 0x03a9, 0x0313, + 0x0300, 0x03a9, 0x0314, 0x0300, 0x03a9, 0x0313, 0x0301, 0x03a9, + 0x0314, 0x0301, 0x03a9, 0x0313, 0x0342, 0x03a9, 0x0314, 0x0342, + 0x03b1, 0x0300, 0x03b1, 0x0301, 0x03b5, 0x0300, 0x03b5, 0x0301, + 0x03b7, 0x0300, 0x03b7, 0x0301, 0x03b9, 0x0300, 0x03b9, 0x0301, + 0x03bf, 0x0300, 0x03bf, 0x0301, 0x03c5, 0x0300, 0x03c5, 0x0301, + 0x03c9, 0x0300, 0x03c9, 0x0301, 0x03b1, 0x0345, 0x0313, 0x03b1, + 0x0345, 0x0314, 0x03b1, 0x0345, 0x0313, 0x0300, 0x03b1, 0x0345, + 0x0314, 0x0300, 0x03b1, 0x0345, 0x0313, 0x0301, 0x03b1, 0x0345, + 0x0314, 0x0301, 0x03b1, 0x0345, 0x0313, 0x0342, 0x03b1, 0x0345, + 0x0314, 0x0342, 0x0391, 0x0345, 0x0313, 0x0391, 0x0345, 0x0314, + 0x0391, 0x0345, 0x0313, 0x0300, 0x0391, 0x0345, 0x0314, 0x0300, + 0x0391, 0x0345, 0x0313, 0x0301, 0x0391, 0x0345, 0x0314, 0x0301, + 0x0391, 0x0345, 0x0313, 0x0342, 0x0391, 0x0345, 0x0314, 0x0342, + 0x03b7, 0x0345, 0x0313, 0x03b7, 0x0345, 0x0314, 0x03b7, 0x0345, + 0x0313, 0x0300, 0x03b7, 0x0345, 0x0314, 0x0300, 0x03b7, 0x0345, + 0x0313, 0x0301, 0x03b7, 0x0345, 0x0314, 0x0301, 0x03b7, 0x0345, + 0x0313, 0x0342, 0x03b7, 0x0345, 0x0314, 0x0342, 0x0397, 0x0345, + 0x0313, 0x0397, 0x0345, 0x0314, 0x0397, 0x0345, 0x0313, 0x0300, + 0x0397, 0x0345, 0x0314, 0x0300, 0x0397, 0x0345, 0x0313, 0x0301, + 0x0397, 0x0345, 0x0314, 0x0301, 0x0397, 0x0345, 0x0313, 0x0342, + 0x0397, 0x0345, 0x0314, 0x0342, 0x03c9, 0x0345, 0x0313, 0x03c9, + 0x0345, 0x0314, 0x03c9, 0x0345, 0x0313, 0x0300, 0x03c9, 0x0345, + 0x0314, 0x0300, 0x03c9, 0x0345, 0x0313, 0x0301, 0x03c9, 0x0345, + 0x0314, 0x0301, 0x03c9, 0x0345, 0x0313, 0x0342, 0x03c9, 0x0345, + 0x0314, 0x0342, 0x03a9, 0x0345, 0x0313, 0x03a9, 0x0345, 0x0314, + 0x03a9, 0x0345, 0x0313, 0x0300, 0x03a9, 0x0345, 0x0314, 0x0300, + 0x03a9, 0x0345, 0x0313, 0x0301, 0x03a9, 0x0345, 0x0314, 0x0301, + 0x03a9, 0x0345, 0x0313, 0x0342, 0x03a9, 0x0345, 0x0314, 0x0342, + 0x03b1, 0x0306, 0x03b1, 0x0304, 0x03b1, 0x0345, 0x0300, 0x03b1, + 0x0345, 0x03b1, 0x0345, 0x0301, 0x03b1, 0x0342, 0x03b1, 0x0345, + 0x0342, 0x0391, 0x0306, 0x0391, 0x0304, 0x0391, 0x0300, 0x0391, + 0x0301, 0x0391, 0x0345, 0x03b9, 0x00a8, 0x0342, 0x03b7, 0x0345, + 0x0300, 0x03b7, 0x0345, 0x03b7, 0x0345, 0x0301, 0x03b7, 0x0342, + 0x03b7, 0x0345, 0x0342, 0x0395, 0x0300, 0x0395, 0x0301, 0x0397, + 0x0300, 0x0397, 0x0301, 0x0397, 0x0345, 0x1fbf, 0x0300, 0x1fbf, + 0x0301, 0x1fbf, 0x0342, 0x03b9, 0x0306, 0x03b9, 0x0304, 0x03b9, + 0x0308, 0x0300, 0x03b9, 0x0308, 0x0301, 0x03b9, 0x0342, 0x03b9, + 0x0308, 0x0342, 0x0399, 0x0306, 0x0399, 0x0304, 0x0399, 0x0300, + 0x0399, 0x0301, 0x1ffe, 0x0300, 0x1ffe, 0x0301, 0x1ffe, 0x0342, + 0x03c5, 0x0306, 0x03c5, 0x0304, 0x03c5, 0x0308, 0x0300, 0x03c5, + 0x0308, 0x0301, 0x03c1, 0x0313, 0x03c1, 0x0314, 0x03c5, 0x0342, + 0x03c5, 0x0308, 0x0342, 0x03a5, 0x0306, 0x03a5, 0x0304, 0x03a5, + 0x0300, 0x03a5, 0x0301, 0x03a1, 0x0314, 0x00a8, 0x0300, 0x00a8, + 0x0301, 0x0060, 0x03c9, 0x0345, 0x0300, 0x03c9, 0x0345, 0x03bf, + 0x0345, 0x0301, 0x03c9, 0x0342, 0x03c9, 0x0345, 0x0342, 0x039f, + 0x0300, 0x039f, 0x0301, 0x03a9, 0x0300, 0x03a9, 0x0301, 0x03a9, + 0x0345, 0x00b4, 0x304b, 0x3099, 0x304d, 0x3099, 0x304f, 0x3099, + 0x3051, 0x3099, 0x3053, 0x3099, 0x3055, 0x3099, 0x3057, 0x3099, + 0x3059, 0x3099, 0x305b, 0x3099, 0x305d, 0x3099, 0x305f, 0x3099, + 0x3061, 0x3099, 0x3064, 0x3099, 0x3066, 0x3099, 0x3068, 0x3099, + 0x306f, 0x3099, 0x306f, 0x309a, 0x3072, 0x3099, 0x3072, 0x309a, + 0x3075, 0x3099, 0x3075, 0x309a, 0x3078, 0x3099, 0x3078, 0x309a, + 0x307b, 0x3099, 0x307b, 0x309a, 0x3046, 0x3099, 0x309d, 0x3099, + 0x30ab, 0x3099, 0x30ad, 0x3099, 0x30af, 0x3099, 0x30b1, 0x3099, + 0x30b3, 0x3099, 0x30b5, 0x3099, 0x30b7, 0x3099, 0x30b9, 0x3099, + 0x30bb, 0x3099, 0x30bd, 0x3099, 0x30bf, 0x3099, 0x30c1, 0x3099, + 0x30c4, 0x3099, 0x30c6, 0x3099, 0x30c8, 0x3099, 0x30cf, 0x3099, + 0x30cf, 0x309a, 0x30d2, 0x3099, 0x30d2, 0x309a, 0x30d5, 0x3099, + 0x30d5, 0x309a, 0x30d8, 0x3099, 0x30d8, 0x309a, 0x30db, 0x3099, + 0x30db, 0x309a, 0x30a6, 0x3099, 0x30ef, 0x3099, 0x30f0, 0x3099, + 0x30f1, 0x3099, 0x30f2, 0x3099, 0x30fd, 0x3099, 0x05f2, 0x05b7, + 0x05e9, 0x05c1, 0x05e9, 0x05c2, 0x05e9, 0x05bc, 0x05c1, 0x05e9, + 0x05bc, 0x05c2, 0x05d0, 0x05b7, 0x05d0, 0x05b8, 0x05d0, 0x05bc, + 0x05d1, 0x05bc, 0x05d2, 0x05bc, 0x05d3, 0x05bc, 0x05d4, 0x05bc, + 0x05d5, 0x05bc, 0x05d6, 0x05bc, 0x05d8, 0x05bc, 0x05d9, 0x05bc, + 0x05da, 0x05bc, 0x05db, 0x05bc, 0x05dc, 0x05bc, 0x05de, 0x05bc, + 0x05e0, 0x05bc, 0x05e1, 0x05bc, 0x05e3, 0x05bc, 0x05e4, 0x05bc, + 0x05e6, 0x05bc, 0x05e7, 0x05bc, 0x05e8, 0x05bc, 0x05e9, 0x05bc, + 0x05ea, 0x05bc, 0x05d5, 0x05b9, 0x05d1, 0x05bf, 0x05db, 0x05bf, + 0x05e4, 0x05bf +}; + +u16 hfsplus_compose_table[] = { + /* base */ + 0x0000, 0x0050, 0x0300, 0x00a4, 0x0301, 0x00e4, 0x0302, 0x015c, + 0x0303, 0x0192, 0x0304, 0x01b4, 0x0306, 0x01e6, 0x0307, 0x0220, + 0x0308, 0x0270, 0x0309, 0x02d2, 0x030a, 0x02ec, 0x030b, 0x02fa, + 0x030c, 0x0308, 0x030d, 0x034c, 0x030f, 0x0370, 0x0311, 0x038e, + 0x0313, 0x03a8, 0x0314, 0x03c6, 0x031b, 0x03e8, 0x0323, 0x03f2, + 0x0324, 0x0440, 0x0325, 0x0446, 0x0327, 0x044c, 0x0328, 0x047a, + 0x032d, 0x0490, 0x032e, 0x04aa, 0x0330, 0x04b0, 0x0331, 0x04be, + 0x0342, 0x04e2, 0x0345, 0x04f4, 0x05b7, 0x0504, 0x05b8, 0x050a, + 0x05b9, 0x050e, 0x05bc, 0x0512, 0x05bf, 0x0540, 0x05c1, 0x0548, + 0x05c2, 0x054c, 0x093c, 0x0550, 0x09bc, 0x0568, 0x09be, 0x0572, + 0x09d7, 0x0576, 0x0a3c, 0x057a, 0x0b3c, 0x0586, 0x0b3e, 0x058e, + 0x0b56, 0x0592, 0x0b57, 0x0596, 0x0bbe, 0x059a, 0x0bd7, 0x05a0, + 0x0c56, 0x05a6, 0x0cc2, 0x05aa, 0x0cd5, 0x05ae, 0x0cd6, 0x05b4, + 0x0d3e, 0x05b8, 0x0d57, 0x05be, 0x0e32, 0x05c2, 0x0eb2, 0x05c6, + 0x0f71, 0x05ca, 0x0f80, 0x05d2, 0x0fb5, 0x05d8, 0x0fb7, 0x05de, + 0x1100, 0x00a2, 0x1101, 0x00a2, 0x1102, 0x00a2, 0x1103, 0x00a2, + 0x1104, 0x00a2, 0x1105, 0x00a2, 0x1106, 0x00a2, 0x1107, 0x00a2, + 0x1108, 0x00a2, 0x1109, 0x00a2, 0x110a, 0x00a2, 0x110b, 0x00a2, + 0x110c, 0x00a2, 0x110d, 0x00a2, 0x110e, 0x00a2, 0x110f, 0x00a2, + 0x1110, 0x00a2, 0x1111, 0x00a2, 0x1112, 0x00a2, 0x3099, 0x05f4, + 0x309a, 0x0656, + /* hangul marker */ + 0xffff, 0x0000, + /* 0x0300 */ + 0x0340, 0x001f, 0x0041, 0x066c, 0x0045, 0x066e, 0x0049, 0x0670, + 0x004f, 0x0672, 0x0055, 0x0674, 0x0057, 0x0676, 0x0059, 0x0678, + 0x0061, 0x067a, 0x0065, 0x067c, 0x0069, 0x067e, 0x006f, 0x0680, + 0x0075, 0x0682, 0x0077, 0x0684, 0x0079, 0x0686, 0x00a8, 0x0688, + 0x0391, 0x068a, 0x0395, 0x068c, 0x0397, 0x068e, 0x0399, 0x0690, + 0x039f, 0x0692, 0x03a5, 0x0694, 0x03a9, 0x0696, 0x03b1, 0x0698, + 0x03b5, 0x069a, 0x03b7, 0x069c, 0x03b9, 0x069e, 0x03bf, 0x06a0, + 0x03c5, 0x06a2, 0x03c9, 0x06a4, 0x1fbf, 0x06a6, 0x1ffe, 0x06a8, + /* 0x0301 */ + 0x0341, 0x003b, 0x0041, 0x06aa, 0x0043, 0x06ac, 0x0045, 0x06ae, + 0x0047, 0x06b0, 0x0049, 0x06b2, 0x004b, 0x06b4, 0x004c, 0x06b6, + 0x004d, 0x06b8, 0x004e, 0x06ba, 0x004f, 0x06bc, 0x0050, 0x06be, + 0x0052, 0x06c0, 0x0053, 0x06c2, 0x0055, 0x06c6, 0x0057, 0x06c8, + 0x0059, 0x06ca, 0x005a, 0x06cc, 0x0061, 0x06ce, 0x0063, 0x06d0, + 0x0065, 0x06d2, 0x0067, 0x06d4, 0x0069, 0x06d6, 0x006b, 0x06d8, + 0x006c, 0x06da, 0x006d, 0x06dc, 0x006e, 0x06de, 0x006f, 0x06e0, + 0x0070, 0x06e2, 0x0072, 0x06e4, 0x0073, 0x06e6, 0x0075, 0x06ea, + 0x0077, 0x06ec, 0x0079, 0x06ee, 0x007a, 0x06f0, 0x00a8, 0x06f2, + 0x00c6, 0x06f4, 0x00d8, 0x06f6, 0x00e6, 0x06f8, 0x00f8, 0x06fa, + 0x0391, 0x06fc, 0x0395, 0x06fe, 0x0397, 0x0700, 0x0399, 0x0702, + 0x039f, 0x0704, 0x03a5, 0x0706, 0x03a9, 0x0708, 0x03b1, 0x070a, + 0x03b5, 0x070c, 0x03b7, 0x070e, 0x03b9, 0x0710, 0x03bf, 0x0712, + 0x03c5, 0x0714, 0x03c9, 0x0716, 0x0413, 0x0718, 0x041a, 0x071a, + 0x0433, 0x071c, 0x043a, 0x071e, 0x1fbf, 0x0720, 0x1ffe, 0x0722, + /* 0x0302 */ + 0x0000, 0x001a, 0x0041, 0x0724, 0x0043, 0x072e, 0x0045, 0x0730, + 0x0047, 0x073a, 0x0048, 0x073c, 0x0049, 0x073e, 0x004a, 0x0740, + 0x004f, 0x0742, 0x0053, 0x074c, 0x0055, 0x074e, 0x0057, 0x0750, + 0x0059, 0x0752, 0x005a, 0x0754, 0x0061, 0x0756, 0x0063, 0x0760, + 0x0065, 0x0762, 0x0067, 0x076c, 0x0068, 0x076e, 0x0069, 0x0770, + 0x006a, 0x0772, 0x006f, 0x0774, 0x0073, 0x077e, 0x0075, 0x0780, + 0x0077, 0x0782, 0x0079, 0x0784, 0x007a, 0x0786, + /* 0x0303 */ + 0x0000, 0x0010, 0x0041, 0x0788, 0x0045, 0x078a, 0x0049, 0x078c, + 0x004e, 0x078e, 0x004f, 0x0790, 0x0055, 0x0796, 0x0056, 0x079a, + 0x0059, 0x079c, 0x0061, 0x079e, 0x0065, 0x07a0, 0x0069, 0x07a2, + 0x006e, 0x07a4, 0x006f, 0x07a6, 0x0075, 0x07ac, 0x0076, 0x07b0, + 0x0079, 0x07b2, + /* 0x0304 */ + 0x0000, 0x0018, 0x0041, 0x07b4, 0x0045, 0x07b6, 0x0047, 0x07bc, + 0x0049, 0x07be, 0x004f, 0x07c0, 0x0055, 0x07c6, 0x0061, 0x07ca, + 0x0065, 0x07cc, 0x0067, 0x07d2, 0x0069, 0x07d4, 0x006f, 0x07d6, + 0x0075, 0x07dc, 0x00c6, 0x07e0, 0x00e6, 0x07e2, 0x0391, 0x07e4, + 0x0399, 0x07e6, 0x03a5, 0x07e8, 0x03b1, 0x07ea, 0x03b9, 0x07ec, + 0x03c5, 0x07ee, 0x0418, 0x07f0, 0x0423, 0x07f2, 0x0438, 0x07f4, + 0x0443, 0x07f6, + /* 0x0306 */ + 0x0000, 0x001c, 0x0041, 0x07f8, 0x0045, 0x0802, 0x0047, 0x0804, + 0x0049, 0x0806, 0x004f, 0x0808, 0x0055, 0x080a, 0x0061, 0x080c, + 0x0065, 0x0816, 0x0067, 0x0818, 0x0069, 0x081a, 0x006f, 0x081c, + 0x0075, 0x081e, 0x0391, 0x0820, 0x0399, 0x0822, 0x03a5, 0x0824, + 0x03b1, 0x0826, 0x03b9, 0x0828, 0x03c5, 0x082a, 0x0410, 0x082c, + 0x0415, 0x082e, 0x0416, 0x0830, 0x0418, 0x0832, 0x0423, 0x0834, + 0x0430, 0x0836, 0x0435, 0x0838, 0x0436, 0x083a, 0x0438, 0x083c, + 0x0443, 0x083e, + /* 0x0307 */ + 0x0000, 0x0027, 0x0041, 0x0840, 0x0042, 0x0844, 0x0043, 0x0846, + 0x0044, 0x0848, 0x0045, 0x084a, 0x0046, 0x084c, 0x0047, 0x084e, + 0x0048, 0x0850, 0x0049, 0x0852, 0x004d, 0x0854, 0x004e, 0x0856, + 0x0050, 0x0858, 0x0052, 0x085a, 0x0053, 0x085c, 0x0054, 0x085e, + 0x0057, 0x0860, 0x0058, 0x0862, 0x0059, 0x0864, 0x005a, 0x0866, + 0x0061, 0x0868, 0x0062, 0x086c, 0x0063, 0x086e, 0x0064, 0x0870, + 0x0065, 0x0872, 0x0066, 0x0874, 0x0067, 0x0876, 0x0068, 0x0878, + 0x006d, 0x087a, 0x006e, 0x087c, 0x0070, 0x087e, 0x0072, 0x0880, + 0x0073, 0x0882, 0x0074, 0x0884, 0x0077, 0x0886, 0x0078, 0x0888, + 0x0079, 0x088a, 0x007a, 0x088c, 0x017f, 0x088e, 0x0306, 0x0890, + /* 0x0308 */ + 0x0000, 0x0030, 0x0041, 0x0892, 0x0045, 0x0896, 0x0048, 0x0898, + 0x0049, 0x089a, 0x004f, 0x089e, 0x0055, 0x08a0, 0x0057, 0x08aa, + 0x0058, 0x08ac, 0x0059, 0x08ae, 0x0061, 0x08b0, 0x0065, 0x08b4, + 0x0068, 0x08b6, 0x0069, 0x08b8, 0x006f, 0x08bc, 0x0074, 0x08be, + 0x0075, 0x08c0, 0x0077, 0x08ca, 0x0078, 0x08cc, 0x0079, 0x08ce, + 0x018f, 0x08d0, 0x019f, 0x08d2, 0x0259, 0x08d4, 0x0275, 0x08d6, + 0x0399, 0x08d8, 0x03a5, 0x08da, 0x03b9, 0x08dc, 0x03c5, 0x08e6, + 0x03d2, 0x08f0, 0x0406, 0x08f2, 0x0410, 0x08f4, 0x0415, 0x08f6, + 0x0416, 0x08f8, 0x0417, 0x08fa, 0x0418, 0x08fc, 0x041e, 0x08fe, + 0x0423, 0x0900, 0x0427, 0x0902, 0x042b, 0x0904, 0x0430, 0x0906, + 0x0435, 0x0908, 0x0436, 0x090a, 0x0437, 0x090c, 0x0438, 0x090e, + 0x043e, 0x0910, 0x0443, 0x0912, 0x0447, 0x0914, 0x044b, 0x0916, + 0x0456, 0x0918, + /* 0x0309 */ + 0x0000, 0x000c, 0x0041, 0x091a, 0x0045, 0x091c, 0x0049, 0x091e, + 0x004f, 0x0920, 0x0055, 0x0922, 0x0059, 0x0924, 0x0061, 0x0926, + 0x0065, 0x0928, 0x0069, 0x092a, 0x006f, 0x092c, 0x0075, 0x092e, + 0x0079, 0x0930, + /* 0x030a */ + 0x0000, 0x0006, 0x0041, 0x0932, 0x0055, 0x0936, 0x0061, 0x0938, + 0x0075, 0x093c, 0x0077, 0x093e, 0x0079, 0x0940, + /* 0x030b */ + 0x0000, 0x0006, 0x004f, 0x0942, 0x0055, 0x0944, 0x006f, 0x0946, + 0x0075, 0x0948, 0x0423, 0x094a, 0x0443, 0x094c, + /* 0x030c */ + 0x0000, 0x0021, 0x0041, 0x094e, 0x0043, 0x0950, 0x0044, 0x0952, + 0x0045, 0x0954, 0x0047, 0x0956, 0x0049, 0x0958, 0x004b, 0x095a, + 0x004c, 0x095c, 0x004e, 0x095e, 0x004f, 0x0960, 0x0052, 0x0962, + 0x0053, 0x0964, 0x0054, 0x0968, 0x0055, 0x096a, 0x005a, 0x096c, + 0x0061, 0x096e, 0x0063, 0x0970, 0x0064, 0x0972, 0x0065, 0x0974, + 0x0067, 0x0976, 0x0069, 0x0978, 0x006a, 0x097a, 0x006b, 0x097c, + 0x006c, 0x097e, 0x006e, 0x0980, 0x006f, 0x0982, 0x0072, 0x0984, + 0x0073, 0x0986, 0x0074, 0x098a, 0x0075, 0x098c, 0x007a, 0x098e, + 0x01b7, 0x0990, 0x0292, 0x0992, + /* 0x030d */ + 0x0000, 0x0011, 0x00a8, 0x0994, 0x0308, 0x0996, 0x0391, 0x0998, + 0x0395, 0x099a, 0x0397, 0x099c, 0x0399, 0x099e, 0x039f, 0x09a0, + 0x03a5, 0x09a2, 0x03a9, 0x09a4, 0x03b1, 0x09a6, 0x03b5, 0x09a8, + 0x03b7, 0x09aa, 0x03b9, 0x09ac, 0x03bf, 0x09ae, 0x03c5, 0x09b0, + 0x03c9, 0x09b2, 0x03d2, 0x09b4, + /* 0x030f */ + 0x0000, 0x000e, 0x0041, 0x09b6, 0x0045, 0x09b8, 0x0049, 0x09ba, + 0x004f, 0x09bc, 0x0052, 0x09be, 0x0055, 0x09c0, 0x0061, 0x09c2, + 0x0065, 0x09c4, 0x0069, 0x09c6, 0x006f, 0x09c8, 0x0072, 0x09ca, + 0x0075, 0x09cc, 0x0474, 0x09ce, 0x0475, 0x09d0, + /* 0x0311 */ + 0x0000, 0x000c, 0x0041, 0x09d2, 0x0045, 0x09d4, 0x0049, 0x09d6, + 0x004f, 0x09d8, 0x0052, 0x09da, 0x0055, 0x09dc, 0x0061, 0x09de, + 0x0065, 0x09e0, 0x0069, 0x09e2, 0x006f, 0x09e4, 0x0072, 0x09e6, + 0x0075, 0x09e8, + /* 0x0313 */ + 0x0343, 0x000e, 0x0391, 0x09ea, 0x0395, 0x09f2, 0x0397, 0x09f8, + 0x0399, 0x0a00, 0x039f, 0x0a08, 0x03a9, 0x0a0e, 0x03b1, 0x0a16, + 0x03b5, 0x0a1e, 0x03b7, 0x0a24, 0x03b9, 0x0a2c, 0x03bf, 0x0a34, + 0x03c1, 0x0a3a, 0x03c5, 0x0a3c, 0x03c9, 0x0a44, + /* 0x0314 */ + 0x0000, 0x0010, 0x0391, 0x0a4c, 0x0395, 0x0a54, 0x0397, 0x0a5a, + 0x0399, 0x0a62, 0x039f, 0x0a6a, 0x03a1, 0x0a70, 0x03a5, 0x0a72, + 0x03a9, 0x0a7a, 0x03b1, 0x0a82, 0x03b5, 0x0a8a, 0x03b7, 0x0a90, + 0x03b9, 0x0a98, 0x03bf, 0x0aa0, 0x03c1, 0x0aa6, 0x03c5, 0x0aa8, + 0x03c9, 0x0ab0, + /* 0x031b */ + 0x0000, 0x0004, 0x004f, 0x0ab8, 0x0055, 0x0ac4, 0x006f, 0x0ad0, + 0x0075, 0x0adc, + /* 0x0323 */ + 0x0000, 0x0026, 0x0041, 0x0ae8, 0x0042, 0x0aee, 0x0044, 0x0af0, + 0x0045, 0x0af2, 0x0048, 0x0af6, 0x0049, 0x0af8, 0x004b, 0x0afa, + 0x004c, 0x0afc, 0x004d, 0x0b00, 0x004e, 0x0b02, 0x004f, 0x0b04, + 0x0052, 0x0b08, 0x0053, 0x0b0c, 0x0054, 0x0b10, 0x0055, 0x0b12, + 0x0056, 0x0b14, 0x0057, 0x0b16, 0x0059, 0x0b18, 0x005a, 0x0b1a, + 0x0061, 0x0b1c, 0x0062, 0x0b22, 0x0064, 0x0b24, 0x0065, 0x0b26, + 0x0068, 0x0b2a, 0x0069, 0x0b2c, 0x006b, 0x0b2e, 0x006c, 0x0b30, + 0x006d, 0x0b34, 0x006e, 0x0b36, 0x006f, 0x0b38, 0x0072, 0x0b3c, + 0x0073, 0x0b40, 0x0074, 0x0b44, 0x0075, 0x0b46, 0x0076, 0x0b48, + 0x0077, 0x0b4a, 0x0079, 0x0b4c, 0x007a, 0x0b4e, + /* 0x0324 */ + 0x0000, 0x0002, 0x0055, 0x0b50, 0x0075, 0x0b52, + /* 0x0325 */ + 0x0000, 0x0002, 0x0041, 0x0b54, 0x0061, 0x0b56, + /* 0x0327 */ + 0x0000, 0x0016, 0x0043, 0x0b58, 0x0044, 0x0b5c, 0x0045, 0x0b5e, + 0x0047, 0x0b62, 0x0048, 0x0b64, 0x004b, 0x0b66, 0x004c, 0x0b68, + 0x004e, 0x0b6a, 0x0052, 0x0b6c, 0x0053, 0x0b6e, 0x0054, 0x0b70, + 0x0063, 0x0b72, 0x0064, 0x0b76, 0x0065, 0x0b78, 0x0067, 0x0b7c, + 0x0068, 0x0b7e, 0x006b, 0x0b80, 0x006c, 0x0b82, 0x006e, 0x0b84, + 0x0072, 0x0b86, 0x0073, 0x0b88, 0x0074, 0x0b8a, + /* 0x0328 */ + 0x0000, 0x000a, 0x0041, 0x0b8c, 0x0045, 0x0b8e, 0x0049, 0x0b90, + 0x004f, 0x0b92, 0x0055, 0x0b96, 0x0061, 0x0b98, 0x0065, 0x0b9a, + 0x0069, 0x0b9c, 0x006f, 0x0b9e, 0x0075, 0x0ba2, + /* 0x032d */ + 0x0000, 0x000c, 0x0044, 0x0ba4, 0x0045, 0x0ba6, 0x004c, 0x0ba8, + 0x004e, 0x0baa, 0x0054, 0x0bac, 0x0055, 0x0bae, 0x0064, 0x0bb0, + 0x0065, 0x0bb2, 0x006c, 0x0bb4, 0x006e, 0x0bb6, 0x0074, 0x0bb8, + 0x0075, 0x0bba, + /* 0x032e */ + 0x0000, 0x0002, 0x0048, 0x0bbc, 0x0068, 0x0bbe, + /* 0x0330 */ + 0x0000, 0x0006, 0x0045, 0x0bc0, 0x0049, 0x0bc2, 0x0055, 0x0bc4, + 0x0065, 0x0bc6, 0x0069, 0x0bc8, 0x0075, 0x0bca, + /* 0x0331 */ + 0x0000, 0x0011, 0x0042, 0x0bcc, 0x0044, 0x0bce, 0x004b, 0x0bd0, + 0x004c, 0x0bd2, 0x004e, 0x0bd4, 0x0052, 0x0bd6, 0x0054, 0x0bd8, + 0x005a, 0x0bda, 0x0062, 0x0bdc, 0x0064, 0x0bde, 0x0068, 0x0be0, + 0x006b, 0x0be2, 0x006c, 0x0be4, 0x006e, 0x0be6, 0x0072, 0x0be8, + 0x0074, 0x0bea, 0x007a, 0x0bec, + /* 0x0342 */ + 0x0000, 0x0008, 0x00a8, 0x0bee, 0x03b1, 0x0bf0, 0x03b7, 0x0bf2, + 0x03b9, 0x0bf4, 0x03c5, 0x0bf6, 0x03c9, 0x0bf8, 0x1fbf, 0x0bfa, + 0x1ffe, 0x0bfc, + /* 0x0345 */ + 0x0000, 0x0007, 0x0391, 0x0bfe, 0x0397, 0x0c04, 0x03a9, 0x0c0a, + 0x03b1, 0x0c10, 0x03b7, 0x0c1c, 0x03bf, 0x0c28, 0x03c9, 0x0c2c, + /* 0x05b7 */ + 0x0000, 0x0002, 0x05d0, 0x0c36, 0x05f2, 0x0c38, + /* 0x05b8 */ + 0x0000, 0x0001, 0x05d0, 0x0c3a, + /* 0x05b9 */ + 0x0000, 0x0001, 0x05d5, 0x0c3c, + /* 0x05bc */ + 0x0000, 0x0016, 0x05d0, 0x0c3e, 0x05d1, 0x0c40, 0x05d2, 0x0c42, + 0x05d3, 0x0c44, 0x05d4, 0x0c46, 0x05d5, 0x0c48, 0x05d6, 0x0c4a, + 0x05d8, 0x0c4c, 0x05d9, 0x0c4e, 0x05da, 0x0c50, 0x05db, 0x0c52, + 0x05dc, 0x0c54, 0x05de, 0x0c56, 0x05e0, 0x0c58, 0x05e1, 0x0c5a, + 0x05e3, 0x0c5c, 0x05e4, 0x0c5e, 0x05e6, 0x0c60, 0x05e7, 0x0c62, + 0x05e8, 0x0c64, 0x05e9, 0x0c66, 0x05ea, 0x0c6c, + /* 0x05bf */ + 0x0000, 0x0003, 0x05d1, 0x0c6e, 0x05db, 0x0c70, 0x05e4, 0x0c72, + /* 0x05c1 */ + 0x0000, 0x0001, 0x05e9, 0x0c74, + /* 0x05c2 */ + 0x0000, 0x0001, 0x05e9, 0x0c76, + /* 0x093c */ + 0x0000, 0x000b, 0x0915, 0x0c78, 0x0916, 0x0c7a, 0x0917, 0x0c7c, + 0x091c, 0x0c7e, 0x0921, 0x0c80, 0x0922, 0x0c82, 0x0928, 0x0c84, + 0x092b, 0x0c86, 0x092f, 0x0c88, 0x0930, 0x0c8a, 0x0933, 0x0c8c, + /* 0x09bc */ + 0x0000, 0x0004, 0x09a1, 0x0c8e, 0x09a2, 0x0c90, 0x09ac, 0x0c92, + 0x09af, 0x0c94, + /* 0x09be */ + 0x0000, 0x0001, 0x09c7, 0x0c96, + /* 0x09d7 */ + 0x0000, 0x0001, 0x09c7, 0x0c98, + /* 0x0a3c */ + 0x0000, 0x0005, 0x0a16, 0x0c9a, 0x0a17, 0x0c9c, 0x0a1c, 0x0c9e, + 0x0a21, 0x0ca0, 0x0a2b, 0x0ca2, + /* 0x0b3c */ + 0x0000, 0x0003, 0x0b21, 0x0ca4, 0x0b22, 0x0ca6, 0x0b2f, 0x0ca8, + /* 0x0b3e */ + 0x0000, 0x0001, 0x0b47, 0x0caa, + /* 0x0b56 */ + 0x0000, 0x0001, 0x0b47, 0x0cac, + /* 0x0b57 */ + 0x0000, 0x0001, 0x0b47, 0x0cae, + /* 0x0bbe */ + 0x0000, 0x0002, 0x0bc6, 0x0cb0, 0x0bc7, 0x0cb2, + /* 0x0bd7 */ + 0x0000, 0x0002, 0x0b92, 0x0cb4, 0x0bc6, 0x0cb6, + /* 0x0c56 */ + 0x0000, 0x0001, 0x0c46, 0x0cb8, + /* 0x0cc2 */ + 0x0000, 0x0001, 0x0cc6, 0x0cba, + /* 0x0cd5 */ + 0x0000, 0x0002, 0x0cbf, 0x0cbe, 0x0cc6, 0x0cc0, + /* 0x0cd6 */ + 0x0000, 0x0001, 0x0cc6, 0x0cc2, + /* 0x0d3e */ + 0x0000, 0x0002, 0x0d46, 0x0cc4, 0x0d47, 0x0cc6, + /* 0x0d57 */ + 0x0000, 0x0001, 0x0d46, 0x0cc8, + /* 0x0e32 */ + 0x0000, 0x0001, 0x0e4d, 0x0cca, + /* 0x0eb2 */ + 0x0000, 0x0001, 0x0ecd, 0x0ccc, + /* 0x0f71 */ + 0x0000, 0x0003, 0x0f72, 0x0cce, 0x0f74, 0x0cd0, 0x0f80, 0x0cd2, + /* 0x0f80 */ + 0x0000, 0x0002, 0x0fb2, 0x0cd4, 0x0fb3, 0x0cd8, + /* 0x0fb5 */ + 0x0000, 0x0002, 0x0f40, 0x0cdc, 0x0f90, 0x0cde, + /* 0x0fb7 */ + 0x0000, 0x000a, 0x0f42, 0x0ce0, 0x0f4c, 0x0ce2, 0x0f51, 0x0ce4, + 0x0f56, 0x0ce6, 0x0f5b, 0x0ce8, 0x0f92, 0x0cea, 0x0f9c, 0x0cec, + 0x0fa1, 0x0cee, 0x0fa6, 0x0cf0, 0x0fab, 0x0cf2, + /* 0x3099 */ + 0x0000, 0x0030, 0x3046, 0x0cf4, 0x304b, 0x0cf6, 0x304d, 0x0cf8, + 0x304f, 0x0cfa, 0x3051, 0x0cfc, 0x3053, 0x0cfe, 0x3055, 0x0d00, + 0x3057, 0x0d02, 0x3059, 0x0d04, 0x305b, 0x0d06, 0x305d, 0x0d08, + 0x305f, 0x0d0a, 0x3061, 0x0d0c, 0x3064, 0x0d0e, 0x3066, 0x0d10, + 0x3068, 0x0d12, 0x306f, 0x0d14, 0x3072, 0x0d16, 0x3075, 0x0d18, + 0x3078, 0x0d1a, 0x307b, 0x0d1c, 0x309d, 0x0d1e, 0x30a6, 0x0d20, + 0x30ab, 0x0d22, 0x30ad, 0x0d24, 0x30af, 0x0d26, 0x30b1, 0x0d28, + 0x30b3, 0x0d2a, 0x30b5, 0x0d2c, 0x30b7, 0x0d2e, 0x30b9, 0x0d30, + 0x30bb, 0x0d32, 0x30bd, 0x0d34, 0x30bf, 0x0d36, 0x30c1, 0x0d38, + 0x30c4, 0x0d3a, 0x30c6, 0x0d3c, 0x30c8, 0x0d3e, 0x30cf, 0x0d40, + 0x30d2, 0x0d42, 0x30d5, 0x0d44, 0x30d8, 0x0d46, 0x30db, 0x0d48, + 0x30ef, 0x0d4a, 0x30f0, 0x0d4c, 0x30f1, 0x0d4e, 0x30f2, 0x0d50, + 0x30fd, 0x0d52, + /* 0x309a */ + 0x0000, 0x000a, 0x306f, 0x0d54, 0x3072, 0x0d56, 0x3075, 0x0d58, + 0x3078, 0x0d5a, 0x307b, 0x0d5c, 0x30cf, 0x0d5e, 0x30d2, 0x0d60, + 0x30d5, 0x0d62, 0x30d8, 0x0d64, 0x30db, 0x0d66, + /* 0x0041 0x0300 */ + 0x00c0, 0x0000, + /* 0x0045 0x0300 */ + 0x00c8, 0x0000, + /* 0x0049 0x0300 */ + 0x00cc, 0x0000, + /* 0x004f 0x0300 */ + 0x00d2, 0x0000, + /* 0x0055 0x0300 */ + 0x00d9, 0x0000, + /* 0x0057 0x0300 */ + 0x1e80, 0x0000, + /* 0x0059 0x0300 */ + 0x1ef2, 0x0000, + /* 0x0061 0x0300 */ + 0x00e0, 0x0000, + /* 0x0065 0x0300 */ + 0x00e8, 0x0000, + /* 0x0069 0x0300 */ + 0x00ec, 0x0000, + /* 0x006f 0x0300 */ + 0x00f2, 0x0000, + /* 0x0075 0x0300 */ + 0x00f9, 0x0000, + /* 0x0077 0x0300 */ + 0x1e81, 0x0000, + /* 0x0079 0x0300 */ + 0x1ef3, 0x0000, + /* 0x00a8 0x0300 */ + 0x1fed, 0x0000, + /* 0x0391 0x0300 */ + 0x1fba, 0x0000, + /* 0x0395 0x0300 */ + 0x1fc8, 0x0000, + /* 0x0397 0x0300 */ + 0x1fca, 0x0000, + /* 0x0399 0x0300 */ + 0x1fda, 0x0000, + /* 0x039f 0x0300 */ + 0x1ff8, 0x0000, + /* 0x03a5 0x0300 */ + 0x1fea, 0x0000, + /* 0x03a9 0x0300 */ + 0x1ffa, 0x0000, + /* 0x03b1 0x0300 */ + 0x1f70, 0x0000, + /* 0x03b5 0x0300 */ + 0x1f72, 0x0000, + /* 0x03b7 0x0300 */ + 0x1f74, 0x0000, + /* 0x03b9 0x0300 */ + 0x1f76, 0x0000, + /* 0x03bf 0x0300 */ + 0x1f78, 0x0000, + /* 0x03c5 0x0300 */ + 0x1f7a, 0x0000, + /* 0x03c9 0x0300 */ + 0x1f7c, 0x0000, + /* 0x1fbf 0x0300 */ + 0x1fcd, 0x0000, + /* 0x1ffe 0x0300 */ + 0x1fdd, 0x0000, + /* 0x0041 0x0301 */ + 0x00c1, 0x0000, + /* 0x0043 0x0301 */ + 0x0106, 0x0000, + /* 0x0045 0x0301 */ + 0x00c9, 0x0000, + /* 0x0047 0x0301 */ + 0x01f4, 0x0000, + /* 0x0049 0x0301 */ + 0x00cd, 0x0000, + /* 0x004b 0x0301 */ + 0x1e30, 0x0000, + /* 0x004c 0x0301 */ + 0x0139, 0x0000, + /* 0x004d 0x0301 */ + 0x1e3e, 0x0000, + /* 0x004e 0x0301 */ + 0x0143, 0x0000, + /* 0x004f 0x0301 */ + 0x00d3, 0x0000, + /* 0x0050 0x0301 */ + 0x1e54, 0x0000, + /* 0x0052 0x0301 */ + 0x0154, 0x0000, + /* 0x0053 0x0301 */ + 0x015a, 0x0001, 0x0307, 0x0d68, + /* 0x0055 0x0301 */ + 0x00da, 0x0000, + /* 0x0057 0x0301 */ + 0x1e82, 0x0000, + /* 0x0059 0x0301 */ + 0x00dd, 0x0000, + /* 0x005a 0x0301 */ + 0x0179, 0x0000, + /* 0x0061 0x0301 */ + 0x00e1, 0x0000, + /* 0x0063 0x0301 */ + 0x0107, 0x0000, + /* 0x0065 0x0301 */ + 0x00e9, 0x0000, + /* 0x0067 0x0301 */ + 0x01f5, 0x0000, + /* 0x0069 0x0301 */ + 0x00ed, 0x0000, + /* 0x006b 0x0301 */ + 0x1e31, 0x0000, + /* 0x006c 0x0301 */ + 0x013a, 0x0000, + /* 0x006d 0x0301 */ + 0x1e3f, 0x0000, + /* 0x006e 0x0301 */ + 0x0144, 0x0000, + /* 0x006f 0x0301 */ + 0x00f3, 0x0000, + /* 0x0070 0x0301 */ + 0x1e55, 0x0000, + /* 0x0072 0x0301 */ + 0x0155, 0x0000, + /* 0x0073 0x0301 */ + 0x015b, 0x0001, 0x0307, 0x0d6a, + /* 0x0075 0x0301 */ + 0x00fa, 0x0000, + /* 0x0077 0x0301 */ + 0x1e83, 0x0000, + /* 0x0079 0x0301 */ + 0x00fd, 0x0000, + /* 0x007a 0x0301 */ + 0x017a, 0x0000, + /* 0x00a8 0x0301 */ + 0x1fee, 0x0000, + /* 0x00c6 0x0301 */ + 0x01fc, 0x0000, + /* 0x00d8 0x0301 */ + 0x01fe, 0x0000, + /* 0x00e6 0x0301 */ + 0x01fd, 0x0000, + /* 0x00f8 0x0301 */ + 0x01ff, 0x0000, + /* 0x0391 0x0301 */ + 0x1fbb, 0x0000, + /* 0x0395 0x0301 */ + 0x1fc9, 0x0000, + /* 0x0397 0x0301 */ + 0x1fcb, 0x0000, + /* 0x0399 0x0301 */ + 0x1fdb, 0x0000, + /* 0x039f 0x0301 */ + 0x1ff9, 0x0000, + /* 0x03a5 0x0301 */ + 0x1feb, 0x0000, + /* 0x03a9 0x0301 */ + 0x1ffb, 0x0000, + /* 0x03b1 0x0301 */ + 0x1f71, 0x0000, + /* 0x03b5 0x0301 */ + 0x1f73, 0x0000, + /* 0x03b7 0x0301 */ + 0x1f75, 0x0000, + /* 0x03b9 0x0301 */ + 0x1f77, 0x0000, + /* 0x03bf 0x0301 */ + 0x1f79, 0x0000, + /* 0x03c5 0x0301 */ + 0x1f7b, 0x0000, + /* 0x03c9 0x0301 */ + 0x1f7d, 0x0000, + /* 0x0413 0x0301 */ + 0x0403, 0x0000, + /* 0x041a 0x0301 */ + 0x040c, 0x0000, + /* 0x0433 0x0301 */ + 0x0453, 0x0000, + /* 0x043a 0x0301 */ + 0x045c, 0x0000, + /* 0x1fbf 0x0301 */ + 0x1fce, 0x0000, + /* 0x1ffe 0x0301 */ + 0x1fde, 0x0000, + /* 0x0041 0x0302 */ + 0x00c2, 0x0004, 0x0300, 0x0d6c, 0x0301, 0x0d6e, 0x0303, 0x0d70, + 0x0309, 0x0d72, + /* 0x0043 0x0302 */ + 0x0108, 0x0000, + /* 0x0045 0x0302 */ + 0x00ca, 0x0004, 0x0300, 0x0d74, 0x0301, 0x0d76, 0x0303, 0x0d78, + 0x0309, 0x0d7a, + /* 0x0047 0x0302 */ + 0x011c, 0x0000, + /* 0x0048 0x0302 */ + 0x0124, 0x0000, + /* 0x0049 0x0302 */ + 0x00ce, 0x0000, + /* 0x004a 0x0302 */ + 0x0134, 0x0000, + /* 0x004f 0x0302 */ + 0x00d4, 0x0004, 0x0300, 0x0d7c, 0x0301, 0x0d7e, 0x0303, 0x0d80, + 0x0309, 0x0d82, + /* 0x0053 0x0302 */ + 0x015c, 0x0000, + /* 0x0055 0x0302 */ + 0x00db, 0x0000, + /* 0x0057 0x0302 */ + 0x0174, 0x0000, + /* 0x0059 0x0302 */ + 0x0176, 0x0000, + /* 0x005a 0x0302 */ + 0x1e90, 0x0000, + /* 0x0061 0x0302 */ + 0x00e2, 0x0004, 0x0300, 0x0d84, 0x0301, 0x0d86, 0x0303, 0x0d88, + 0x0309, 0x0d8a, + /* 0x0063 0x0302 */ + 0x0109, 0x0000, + /* 0x0065 0x0302 */ + 0x00ea, 0x0004, 0x0300, 0x0d8c, 0x0301, 0x0d8e, 0x0303, 0x0d90, + 0x0309, 0x0d92, + /* 0x0067 0x0302 */ + 0x011d, 0x0000, + /* 0x0068 0x0302 */ + 0x0125, 0x0000, + /* 0x0069 0x0302 */ + 0x00ee, 0x0000, + /* 0x006a 0x0302 */ + 0x0135, 0x0000, + /* 0x006f 0x0302 */ + 0x00f4, 0x0004, 0x0300, 0x0d94, 0x0301, 0x0d96, 0x0303, 0x0d98, + 0x0309, 0x0d9a, + /* 0x0073 0x0302 */ + 0x015d, 0x0000, + /* 0x0075 0x0302 */ + 0x00fb, 0x0000, + /* 0x0077 0x0302 */ + 0x0175, 0x0000, + /* 0x0079 0x0302 */ + 0x0177, 0x0000, + /* 0x007a 0x0302 */ + 0x1e91, 0x0000, + /* 0x0041 0x0303 */ + 0x00c3, 0x0000, + /* 0x0045 0x0303 */ + 0x1ebc, 0x0000, + /* 0x0049 0x0303 */ + 0x0128, 0x0000, + /* 0x004e 0x0303 */ + 0x00d1, 0x0000, + /* 0x004f 0x0303 */ + 0x00d5, 0x0002, 0x0301, 0x0d9c, 0x0308, 0x0d9e, + /* 0x0055 0x0303 */ + 0x0168, 0x0001, 0x0301, 0x0da0, + /* 0x0056 0x0303 */ + 0x1e7c, 0x0000, + /* 0x0059 0x0303 */ + 0x1ef8, 0x0000, + /* 0x0061 0x0303 */ + 0x00e3, 0x0000, + /* 0x0065 0x0303 */ + 0x1ebd, 0x0000, + /* 0x0069 0x0303 */ + 0x0129, 0x0000, + /* 0x006e 0x0303 */ + 0x00f1, 0x0000, + /* 0x006f 0x0303 */ + 0x00f5, 0x0002, 0x0301, 0x0da2, 0x0308, 0x0da4, + /* 0x0075 0x0303 */ + 0x0169, 0x0001, 0x0301, 0x0da6, + /* 0x0076 0x0303 */ + 0x1e7d, 0x0000, + /* 0x0079 0x0303 */ + 0x1ef9, 0x0000, + /* 0x0041 0x0304 */ + 0x0100, 0x0000, + /* 0x0045 0x0304 */ + 0x0112, 0x0002, 0x0300, 0x0da8, 0x0301, 0x0daa, + /* 0x0047 0x0304 */ + 0x1e20, 0x0000, + /* 0x0049 0x0304 */ + 0x012a, 0x0000, + /* 0x004f 0x0304 */ + 0x014c, 0x0002, 0x0300, 0x0dac, 0x0301, 0x0dae, + /* 0x0055 0x0304 */ + 0x016a, 0x0001, 0x0308, 0x0db0, + /* 0x0061 0x0304 */ + 0x0101, 0x0000, + /* 0x0065 0x0304 */ + 0x0113, 0x0002, 0x0300, 0x0db2, 0x0301, 0x0db4, + /* 0x0067 0x0304 */ + 0x1e21, 0x0000, + /* 0x0069 0x0304 */ + 0x012b, 0x0000, + /* 0x006f 0x0304 */ + 0x014d, 0x0002, 0x0300, 0x0db6, 0x0301, 0x0db8, + /* 0x0075 0x0304 */ + 0x016b, 0x0001, 0x0308, 0x0dba, + /* 0x00c6 0x0304 */ + 0x01e2, 0x0000, + /* 0x00e6 0x0304 */ + 0x01e3, 0x0000, + /* 0x0391 0x0304 */ + 0x1fb9, 0x0000, + /* 0x0399 0x0304 */ + 0x1fd9, 0x0000, + /* 0x03a5 0x0304 */ + 0x1fe9, 0x0000, + /* 0x03b1 0x0304 */ + 0x1fb1, 0x0000, + /* 0x03b9 0x0304 */ + 0x1fd1, 0x0000, + /* 0x03c5 0x0304 */ + 0x1fe1, 0x0000, + /* 0x0418 0x0304 */ + 0x04e2, 0x0000, + /* 0x0423 0x0304 */ + 0x04ee, 0x0000, + /* 0x0438 0x0304 */ + 0x04e3, 0x0000, + /* 0x0443 0x0304 */ + 0x04ef, 0x0000, + /* 0x0041 0x0306 */ + 0x0102, 0x0004, 0x0300, 0x0dbc, 0x0301, 0x0dbe, 0x0303, 0x0dc0, + 0x0309, 0x0dc2, + /* 0x0045 0x0306 */ + 0x0114, 0x0000, + /* 0x0047 0x0306 */ + 0x011e, 0x0000, + /* 0x0049 0x0306 */ + 0x012c, 0x0000, + /* 0x004f 0x0306 */ + 0x014e, 0x0000, + /* 0x0055 0x0306 */ + 0x016c, 0x0000, + /* 0x0061 0x0306 */ + 0x0103, 0x0004, 0x0300, 0x0dc4, 0x0301, 0x0dc6, 0x0303, 0x0dc8, + 0x0309, 0x0dca, + /* 0x0065 0x0306 */ + 0x0115, 0x0000, + /* 0x0067 0x0306 */ + 0x011f, 0x0000, + /* 0x0069 0x0306 */ + 0x012d, 0x0000, + /* 0x006f 0x0306 */ + 0x014f, 0x0000, + /* 0x0075 0x0306 */ + 0x016d, 0x0000, + /* 0x0391 0x0306 */ + 0x1fb8, 0x0000, + /* 0x0399 0x0306 */ + 0x1fd8, 0x0000, + /* 0x03a5 0x0306 */ + 0x1fe8, 0x0000, + /* 0x03b1 0x0306 */ + 0x1fb0, 0x0000, + /* 0x03b9 0x0306 */ + 0x1fd0, 0x0000, + /* 0x03c5 0x0306 */ + 0x1fe0, 0x0000, + /* 0x0410 0x0306 */ + 0x04d0, 0x0000, + /* 0x0415 0x0306 */ + 0x04d6, 0x0000, + /* 0x0416 0x0306 */ + 0x04c1, 0x0000, + /* 0x0418 0x0306 */ + 0x0419, 0x0000, + /* 0x0423 0x0306 */ + 0x040e, 0x0000, + /* 0x0430 0x0306 */ + 0x04d1, 0x0000, + /* 0x0435 0x0306 */ + 0x04d7, 0x0000, + /* 0x0436 0x0306 */ + 0x04c2, 0x0000, + /* 0x0438 0x0306 */ + 0x0439, 0x0000, + /* 0x0443 0x0306 */ + 0x045e, 0x0000, + /* 0x0041 0x0307 */ + 0x0000, 0x0001, 0x0304, 0x0dcc, + /* 0x0042 0x0307 */ + 0x1e02, 0x0000, + /* 0x0043 0x0307 */ + 0x010a, 0x0000, + /* 0x0044 0x0307 */ + 0x1e0a, 0x0000, + /* 0x0045 0x0307 */ + 0x0116, 0x0000, + /* 0x0046 0x0307 */ + 0x1e1e, 0x0000, + /* 0x0047 0x0307 */ + 0x0120, 0x0000, + /* 0x0048 0x0307 */ + 0x1e22, 0x0000, + /* 0x0049 0x0307 */ + 0x0130, 0x0000, + /* 0x004d 0x0307 */ + 0x1e40, 0x0000, + /* 0x004e 0x0307 */ + 0x1e44, 0x0000, + /* 0x0050 0x0307 */ + 0x1e56, 0x0000, + /* 0x0052 0x0307 */ + 0x1e58, 0x0000, + /* 0x0053 0x0307 */ + 0x1e60, 0x0000, + /* 0x0054 0x0307 */ + 0x1e6a, 0x0000, + /* 0x0057 0x0307 */ + 0x1e86, 0x0000, + /* 0x0058 0x0307 */ + 0x1e8a, 0x0000, + /* 0x0059 0x0307 */ + 0x1e8e, 0x0000, + /* 0x005a 0x0307 */ + 0x017b, 0x0000, + /* 0x0061 0x0307 */ + 0x0000, 0x0001, 0x0304, 0x0dce, + /* 0x0062 0x0307 */ + 0x1e03, 0x0000, + /* 0x0063 0x0307 */ + 0x010b, 0x0000, + /* 0x0064 0x0307 */ + 0x1e0b, 0x0000, + /* 0x0065 0x0307 */ + 0x0117, 0x0000, + /* 0x0066 0x0307 */ + 0x1e1f, 0x0000, + /* 0x0067 0x0307 */ + 0x0121, 0x0000, + /* 0x0068 0x0307 */ + 0x1e23, 0x0000, + /* 0x006d 0x0307 */ + 0x1e41, 0x0000, + /* 0x006e 0x0307 */ + 0x1e45, 0x0000, + /* 0x0070 0x0307 */ + 0x1e57, 0x0000, + /* 0x0072 0x0307 */ + 0x1e59, 0x0000, + /* 0x0073 0x0307 */ + 0x1e61, 0x0000, + /* 0x0074 0x0307 */ + 0x1e6b, 0x0000, + /* 0x0077 0x0307 */ + 0x1e87, 0x0000, + /* 0x0078 0x0307 */ + 0x1e8b, 0x0000, + /* 0x0079 0x0307 */ + 0x1e8f, 0x0000, + /* 0x007a 0x0307 */ + 0x017c, 0x0000, + /* 0x017f 0x0307 */ + 0x1e9b, 0x0000, + /* 0x0306 0x0307 */ + 0x0310, 0x0000, + /* 0x0041 0x0308 */ + 0x00c4, 0x0001, 0x0304, 0x0dd0, + /* 0x0045 0x0308 */ + 0x00cb, 0x0000, + /* 0x0048 0x0308 */ + 0x1e26, 0x0000, + /* 0x0049 0x0308 */ + 0x00cf, 0x0001, 0x0301, 0x0dd2, + /* 0x004f 0x0308 */ + 0x00d6, 0x0000, + /* 0x0055 0x0308 */ + 0x00dc, 0x0004, 0x0300, 0x0dd4, 0x0301, 0x0dd6, 0x0304, 0x0dd8, + 0x030c, 0x0dda, + /* 0x0057 0x0308 */ + 0x1e84, 0x0000, + /* 0x0058 0x0308 */ + 0x1e8c, 0x0000, + /* 0x0059 0x0308 */ + 0x0178, 0x0000, + /* 0x0061 0x0308 */ + 0x00e4, 0x0001, 0x0304, 0x0ddc, + /* 0x0065 0x0308 */ + 0x00eb, 0x0000, + /* 0x0068 0x0308 */ + 0x1e27, 0x0000, + /* 0x0069 0x0308 */ + 0x00ef, 0x0001, 0x0301, 0x0dde, + /* 0x006f 0x0308 */ + 0x00f6, 0x0000, + /* 0x0074 0x0308 */ + 0x1e97, 0x0000, + /* 0x0075 0x0308 */ + 0x00fc, 0x0004, 0x0300, 0x0de0, 0x0301, 0x0de2, 0x0304, 0x0de4, + 0x030c, 0x0de6, + /* 0x0077 0x0308 */ + 0x1e85, 0x0000, + /* 0x0078 0x0308 */ + 0x1e8d, 0x0000, + /* 0x0079 0x0308 */ + 0x00ff, 0x0000, + /* 0x018f 0x0308 */ + 0x04da, 0x0000, + /* 0x019f 0x0308 */ + 0x04ea, 0x0000, + /* 0x0259 0x0308 */ + 0x04db, 0x0000, + /* 0x0275 0x0308 */ + 0x04eb, 0x0000, + /* 0x0399 0x0308 */ + 0x03aa, 0x0000, + /* 0x03a5 0x0308 */ + 0x03ab, 0x0000, + /* 0x03b9 0x0308 */ + 0x03ca, 0x0004, 0x0300, 0x0de8, 0x0301, 0x0dea, 0x030d, 0x0dec, + 0x0342, 0x0dee, + /* 0x03c5 0x0308 */ + 0x03cb, 0x0004, 0x0300, 0x0df0, 0x0301, 0x0df2, 0x030d, 0x0df4, + 0x0342, 0x0df6, + /* 0x03d2 0x0308 */ + 0x03d4, 0x0000, + /* 0x0406 0x0308 */ + 0x0407, 0x0000, + /* 0x0410 0x0308 */ + 0x04d2, 0x0000, + /* 0x0415 0x0308 */ + 0x0401, 0x0000, + /* 0x0416 0x0308 */ + 0x04dc, 0x0000, + /* 0x0417 0x0308 */ + 0x04de, 0x0000, + /* 0x0418 0x0308 */ + 0x04e4, 0x0000, + /* 0x041e 0x0308 */ + 0x04e6, 0x0000, + /* 0x0423 0x0308 */ + 0x04f0, 0x0000, + /* 0x0427 0x0308 */ + 0x04f4, 0x0000, + /* 0x042b 0x0308 */ + 0x04f8, 0x0000, + /* 0x0430 0x0308 */ + 0x04d3, 0x0000, + /* 0x0435 0x0308 */ + 0x0451, 0x0000, + /* 0x0436 0x0308 */ + 0x04dd, 0x0000, + /* 0x0437 0x0308 */ + 0x04df, 0x0000, + /* 0x0438 0x0308 */ + 0x04e5, 0x0000, + /* 0x043e 0x0308 */ + 0x04e7, 0x0000, + /* 0x0443 0x0308 */ + 0x04f1, 0x0000, + /* 0x0447 0x0308 */ + 0x04f5, 0x0000, + /* 0x044b 0x0308 */ + 0x04f9, 0x0000, + /* 0x0456 0x0308 */ + 0x0457, 0x0000, + /* 0x0041 0x0309 */ + 0x1ea2, 0x0000, + /* 0x0045 0x0309 */ + 0x1eba, 0x0000, + /* 0x0049 0x0309 */ + 0x1ec8, 0x0000, + /* 0x004f 0x0309 */ + 0x1ece, 0x0000, + /* 0x0055 0x0309 */ + 0x1ee6, 0x0000, + /* 0x0059 0x0309 */ + 0x1ef6, 0x0000, + /* 0x0061 0x0309 */ + 0x1ea3, 0x0000, + /* 0x0065 0x0309 */ + 0x1ebb, 0x0000, + /* 0x0069 0x0309 */ + 0x1ec9, 0x0000, + /* 0x006f 0x0309 */ + 0x1ecf, 0x0000, + /* 0x0075 0x0309 */ + 0x1ee7, 0x0000, + /* 0x0079 0x0309 */ + 0x1ef7, 0x0000, + /* 0x0041 0x030a */ + 0x00c5, 0x0001, 0x0301, 0x0df8, + /* 0x0055 0x030a */ + 0x016e, 0x0000, + /* 0x0061 0x030a */ + 0x00e5, 0x0001, 0x0301, 0x0dfa, + /* 0x0075 0x030a */ + 0x016f, 0x0000, + /* 0x0077 0x030a */ + 0x1e98, 0x0000, + /* 0x0079 0x030a */ + 0x1e99, 0x0000, + /* 0x004f 0x030b */ + 0x0150, 0x0000, + /* 0x0055 0x030b */ + 0x0170, 0x0000, + /* 0x006f 0x030b */ + 0x0151, 0x0000, + /* 0x0075 0x030b */ + 0x0171, 0x0000, + /* 0x0423 0x030b */ + 0x04f2, 0x0000, + /* 0x0443 0x030b */ + 0x04f3, 0x0000, + /* 0x0041 0x030c */ + 0x01cd, 0x0000, + /* 0x0043 0x030c */ + 0x010c, 0x0000, + /* 0x0044 0x030c */ + 0x010e, 0x0000, + /* 0x0045 0x030c */ + 0x011a, 0x0000, + /* 0x0047 0x030c */ + 0x01e6, 0x0000, + /* 0x0049 0x030c */ + 0x01cf, 0x0000, + /* 0x004b 0x030c */ + 0x01e8, 0x0000, + /* 0x004c 0x030c */ + 0x013d, 0x0000, + /* 0x004e 0x030c */ + 0x0147, 0x0000, + /* 0x004f 0x030c */ + 0x01d1, 0x0000, + /* 0x0052 0x030c */ + 0x0158, 0x0000, + /* 0x0053 0x030c */ + 0x0160, 0x0001, 0x0307, 0x0dfc, + /* 0x0054 0x030c */ + 0x0164, 0x0000, + /* 0x0055 0x030c */ + 0x01d3, 0x0000, + /* 0x005a 0x030c */ + 0x017d, 0x0000, + /* 0x0061 0x030c */ + 0x01ce, 0x0000, + /* 0x0063 0x030c */ + 0x010d, 0x0000, + /* 0x0064 0x030c */ + 0x010f, 0x0000, + /* 0x0065 0x030c */ + 0x011b, 0x0000, + /* 0x0067 0x030c */ + 0x01e7, 0x0000, + /* 0x0069 0x030c */ + 0x01d0, 0x0000, + /* 0x006a 0x030c */ + 0x01f0, 0x0000, + /* 0x006b 0x030c */ + 0x01e9, 0x0000, + /* 0x006c 0x030c */ + 0x013e, 0x0000, + /* 0x006e 0x030c */ + 0x0148, 0x0000, + /* 0x006f 0x030c */ + 0x01d2, 0x0000, + /* 0x0072 0x030c */ + 0x0159, 0x0000, + /* 0x0073 0x030c */ + 0x0161, 0x0001, 0x0307, 0x0dfe, + /* 0x0074 0x030c */ + 0x0165, 0x0000, + /* 0x0075 0x030c */ + 0x01d4, 0x0000, + /* 0x007a 0x030c */ + 0x017e, 0x0000, + /* 0x01b7 0x030c */ + 0x01ee, 0x0000, + /* 0x0292 0x030c */ + 0x01ef, 0x0000, + /* 0x00a8 0x030d */ + 0x0385, 0x0000, + /* 0x0308 0x030d */ + 0x0344, 0x0000, + /* 0x0391 0x030d */ + 0x0386, 0x0000, + /* 0x0395 0x030d */ + 0x0388, 0x0000, + /* 0x0397 0x030d */ + 0x0389, 0x0000, + /* 0x0399 0x030d */ + 0x038a, 0x0000, + /* 0x039f 0x030d */ + 0x038c, 0x0000, + /* 0x03a5 0x030d */ + 0x038e, 0x0000, + /* 0x03a9 0x030d */ + 0x038f, 0x0000, + /* 0x03b1 0x030d */ + 0x03ac, 0x0000, + /* 0x03b5 0x030d */ + 0x03ad, 0x0000, + /* 0x03b7 0x030d */ + 0x03ae, 0x0000, + /* 0x03b9 0x030d */ + 0x03af, 0x0000, + /* 0x03bf 0x030d */ + 0x03cc, 0x0000, + /* 0x03c5 0x030d */ + 0x03cd, 0x0000, + /* 0x03c9 0x030d */ + 0x03ce, 0x0000, + /* 0x03d2 0x030d */ + 0x03d3, 0x0000, + /* 0x0041 0x030f */ + 0x0200, 0x0000, + /* 0x0045 0x030f */ + 0x0204, 0x0000, + /* 0x0049 0x030f */ + 0x0208, 0x0000, + /* 0x004f 0x030f */ + 0x020c, 0x0000, + /* 0x0052 0x030f */ + 0x0210, 0x0000, + /* 0x0055 0x030f */ + 0x0214, 0x0000, + /* 0x0061 0x030f */ + 0x0201, 0x0000, + /* 0x0065 0x030f */ + 0x0205, 0x0000, + /* 0x0069 0x030f */ + 0x0209, 0x0000, + /* 0x006f 0x030f */ + 0x020d, 0x0000, + /* 0x0072 0x030f */ + 0x0211, 0x0000, + /* 0x0075 0x030f */ + 0x0215, 0x0000, + /* 0x0474 0x030f */ + 0x0476, 0x0000, + /* 0x0475 0x030f */ + 0x0477, 0x0000, + /* 0x0041 0x0311 */ + 0x0202, 0x0000, + /* 0x0045 0x0311 */ + 0x0206, 0x0000, + /* 0x0049 0x0311 */ + 0x020a, 0x0000, + /* 0x004f 0x0311 */ + 0x020e, 0x0000, + /* 0x0052 0x0311 */ + 0x0212, 0x0000, + /* 0x0055 0x0311 */ + 0x0216, 0x0000, + /* 0x0061 0x0311 */ + 0x0203, 0x0000, + /* 0x0065 0x0311 */ + 0x0207, 0x0000, + /* 0x0069 0x0311 */ + 0x020b, 0x0000, + /* 0x006f 0x0311 */ + 0x020f, 0x0000, + /* 0x0072 0x0311 */ + 0x0213, 0x0000, + /* 0x0075 0x0311 */ + 0x0217, 0x0000, + /* 0x0391 0x0313 */ + 0x1f08, 0x0003, 0x0300, 0x0e00, 0x0301, 0x0e02, 0x0342, 0x0e04, + /* 0x0395 0x0313 */ + 0x1f18, 0x0002, 0x0300, 0x0e06, 0x0301, 0x0e08, + /* 0x0397 0x0313 */ + 0x1f28, 0x0003, 0x0300, 0x0e0a, 0x0301, 0x0e0c, 0x0342, 0x0e0e, + /* 0x0399 0x0313 */ + 0x1f38, 0x0003, 0x0300, 0x0e10, 0x0301, 0x0e12, 0x0342, 0x0e14, + /* 0x039f 0x0313 */ + 0x1f48, 0x0002, 0x0300, 0x0e16, 0x0301, 0x0e18, + /* 0x03a9 0x0313 */ + 0x1f68, 0x0003, 0x0300, 0x0e1a, 0x0301, 0x0e1c, 0x0342, 0x0e1e, + /* 0x03b1 0x0313 */ + 0x1f00, 0x0003, 0x0300, 0x0e20, 0x0301, 0x0e22, 0x0342, 0x0e24, + /* 0x03b5 0x0313 */ + 0x1f10, 0x0002, 0x0300, 0x0e26, 0x0301, 0x0e28, + /* 0x03b7 0x0313 */ + 0x1f20, 0x0003, 0x0300, 0x0e2a, 0x0301, 0x0e2c, 0x0342, 0x0e2e, + /* 0x03b9 0x0313 */ + 0x1f30, 0x0003, 0x0300, 0x0e30, 0x0301, 0x0e32, 0x0342, 0x0e34, + /* 0x03bf 0x0313 */ + 0x1f40, 0x0002, 0x0300, 0x0e36, 0x0301, 0x0e38, + /* 0x03c1 0x0313 */ + 0x1fe4, 0x0000, + /* 0x03c5 0x0313 */ + 0x1f50, 0x0003, 0x0300, 0x0e3a, 0x0301, 0x0e3c, 0x0342, 0x0e3e, + /* 0x03c9 0x0313 */ + 0x1f60, 0x0003, 0x0300, 0x0e40, 0x0301, 0x0e42, 0x0342, 0x0e44, + /* 0x0391 0x0314 */ + 0x1f09, 0x0003, 0x0300, 0x0e46, 0x0301, 0x0e48, 0x0342, 0x0e4a, + /* 0x0395 0x0314 */ + 0x1f19, 0x0002, 0x0300, 0x0e4c, 0x0301, 0x0e4e, + /* 0x0397 0x0314 */ + 0x1f29, 0x0003, 0x0300, 0x0e50, 0x0301, 0x0e52, 0x0342, 0x0e54, + /* 0x0399 0x0314 */ + 0x1f39, 0x0003, 0x0300, 0x0e56, 0x0301, 0x0e58, 0x0342, 0x0e5a, + /* 0x039f 0x0314 */ + 0x1f49, 0x0002, 0x0300, 0x0e5c, 0x0301, 0x0e5e, + /* 0x03a1 0x0314 */ + 0x1fec, 0x0000, + /* 0x03a5 0x0314 */ + 0x1f59, 0x0003, 0x0300, 0x0e60, 0x0301, 0x0e62, 0x0342, 0x0e64, + /* 0x03a9 0x0314 */ + 0x1f69, 0x0003, 0x0300, 0x0e66, 0x0301, 0x0e68, 0x0342, 0x0e6a, + /* 0x03b1 0x0314 */ + 0x1f01, 0x0003, 0x0300, 0x0e6c, 0x0301, 0x0e6e, 0x0342, 0x0e70, + /* 0x03b5 0x0314 */ + 0x1f11, 0x0002, 0x0300, 0x0e72, 0x0301, 0x0e74, + /* 0x03b7 0x0314 */ + 0x1f21, 0x0003, 0x0300, 0x0e76, 0x0301, 0x0e78, 0x0342, 0x0e7a, + /* 0x03b9 0x0314 */ + 0x1f31, 0x0003, 0x0300, 0x0e7c, 0x0301, 0x0e7e, 0x0342, 0x0e80, + /* 0x03bf 0x0314 */ + 0x1f41, 0x0002, 0x0300, 0x0e82, 0x0301, 0x0e84, + /* 0x03c1 0x0314 */ + 0x1fe5, 0x0000, + /* 0x03c5 0x0314 */ + 0x1f51, 0x0003, 0x0300, 0x0e86, 0x0301, 0x0e88, 0x0342, 0x0e8a, + /* 0x03c9 0x0314 */ + 0x1f61, 0x0003, 0x0300, 0x0e8c, 0x0301, 0x0e8e, 0x0342, 0x0e90, + /* 0x004f 0x031b */ + 0x01a0, 0x0005, 0x0300, 0x0e92, 0x0301, 0x0e94, 0x0303, 0x0e96, + 0x0309, 0x0e98, 0x0323, 0x0e9a, + /* 0x0055 0x031b */ + 0x01af, 0x0005, 0x0300, 0x0e9c, 0x0301, 0x0e9e, 0x0303, 0x0ea0, + 0x0309, 0x0ea2, 0x0323, 0x0ea4, + /* 0x006f 0x031b */ + 0x01a1, 0x0005, 0x0300, 0x0ea6, 0x0301, 0x0ea8, 0x0303, 0x0eaa, + 0x0309, 0x0eac, 0x0323, 0x0eae, + /* 0x0075 0x031b */ + 0x01b0, 0x0005, 0x0300, 0x0eb0, 0x0301, 0x0eb2, 0x0303, 0x0eb4, + 0x0309, 0x0eb6, 0x0323, 0x0eb8, + /* 0x0041 0x0323 */ + 0x1ea0, 0x0002, 0x0302, 0x0eba, 0x0306, 0x0ebc, + /* 0x0042 0x0323 */ + 0x1e04, 0x0000, + /* 0x0044 0x0323 */ + 0x1e0c, 0x0000, + /* 0x0045 0x0323 */ + 0x1eb8, 0x0001, 0x0302, 0x0ebe, + /* 0x0048 0x0323 */ + 0x1e24, 0x0000, + /* 0x0049 0x0323 */ + 0x1eca, 0x0000, + /* 0x004b 0x0323 */ + 0x1e32, 0x0000, + /* 0x004c 0x0323 */ + 0x1e36, 0x0001, 0x0304, 0x0ec0, + /* 0x004d 0x0323 */ + 0x1e42, 0x0000, + /* 0x004e 0x0323 */ + 0x1e46, 0x0000, + /* 0x004f 0x0323 */ + 0x1ecc, 0x0001, 0x0302, 0x0ec2, + /* 0x0052 0x0323 */ + 0x1e5a, 0x0001, 0x0304, 0x0ec4, + /* 0x0053 0x0323 */ + 0x1e62, 0x0001, 0x0307, 0x0ec6, + /* 0x0054 0x0323 */ + 0x1e6c, 0x0000, + /* 0x0055 0x0323 */ + 0x1ee4, 0x0000, + /* 0x0056 0x0323 */ + 0x1e7e, 0x0000, + /* 0x0057 0x0323 */ + 0x1e88, 0x0000, + /* 0x0059 0x0323 */ + 0x1ef4, 0x0000, + /* 0x005a 0x0323 */ + 0x1e92, 0x0000, + /* 0x0061 0x0323 */ + 0x1ea1, 0x0002, 0x0302, 0x0ec8, 0x0306, 0x0eca, + /* 0x0062 0x0323 */ + 0x1e05, 0x0000, + /* 0x0064 0x0323 */ + 0x1e0d, 0x0000, + /* 0x0065 0x0323 */ + 0x1eb9, 0x0001, 0x0302, 0x0ecc, + /* 0x0068 0x0323 */ + 0x1e25, 0x0000, + /* 0x0069 0x0323 */ + 0x1ecb, 0x0000, + /* 0x006b 0x0323 */ + 0x1e33, 0x0000, + /* 0x006c 0x0323 */ + 0x1e37, 0x0001, 0x0304, 0x0ece, + /* 0x006d 0x0323 */ + 0x1e43, 0x0000, + /* 0x006e 0x0323 */ + 0x1e47, 0x0000, + /* 0x006f 0x0323 */ + 0x1ecd, 0x0001, 0x0302, 0x0ed0, + /* 0x0072 0x0323 */ + 0x1e5b, 0x0001, 0x0304, 0x0ed2, + /* 0x0073 0x0323 */ + 0x1e63, 0x0001, 0x0307, 0x0ed4, + /* 0x0074 0x0323 */ + 0x1e6d, 0x0000, + /* 0x0075 0x0323 */ + 0x1ee5, 0x0000, + /* 0x0076 0x0323 */ + 0x1e7f, 0x0000, + /* 0x0077 0x0323 */ + 0x1e89, 0x0000, + /* 0x0079 0x0323 */ + 0x1ef5, 0x0000, + /* 0x007a 0x0323 */ + 0x1e93, 0x0000, + /* 0x0055 0x0324 */ + 0x1e72, 0x0000, + /* 0x0075 0x0324 */ + 0x1e73, 0x0000, + /* 0x0041 0x0325 */ + 0x1e00, 0x0000, + /* 0x0061 0x0325 */ + 0x1e01, 0x0000, + /* 0x0043 0x0327 */ + 0x00c7, 0x0001, 0x0301, 0x0ed6, + /* 0x0044 0x0327 */ + 0x1e10, 0x0000, + /* 0x0045 0x0327 */ + 0x0000, 0x0001, 0x0306, 0x0ed8, + /* 0x0047 0x0327 */ + 0x0122, 0x0000, + /* 0x0048 0x0327 */ + 0x1e28, 0x0000, + /* 0x004b 0x0327 */ + 0x0136, 0x0000, + /* 0x004c 0x0327 */ + 0x013b, 0x0000, + /* 0x004e 0x0327 */ + 0x0145, 0x0000, + /* 0x0052 0x0327 */ + 0x0156, 0x0000, + /* 0x0053 0x0327 */ + 0x015e, 0x0000, + /* 0x0054 0x0327 */ + 0x0162, 0x0000, + /* 0x0063 0x0327 */ + 0x00e7, 0x0001, 0x0301, 0x0eda, + /* 0x0064 0x0327 */ + 0x1e11, 0x0000, + /* 0x0065 0x0327 */ + 0x0000, 0x0001, 0x0306, 0x0edc, + /* 0x0067 0x0327 */ + 0x0123, 0x0000, + /* 0x0068 0x0327 */ + 0x1e29, 0x0000, + /* 0x006b 0x0327 */ + 0x0137, 0x0000, + /* 0x006c 0x0327 */ + 0x013c, 0x0000, + /* 0x006e 0x0327 */ + 0x0146, 0x0000, + /* 0x0072 0x0327 */ + 0x0157, 0x0000, + /* 0x0073 0x0327 */ + 0x015f, 0x0000, + /* 0x0074 0x0327 */ + 0x0163, 0x0000, + /* 0x0041 0x0328 */ + 0x0104, 0x0000, + /* 0x0045 0x0328 */ + 0x0118, 0x0000, + /* 0x0049 0x0328 */ + 0x012e, 0x0000, + /* 0x004f 0x0328 */ + 0x01ea, 0x0001, 0x0304, 0x0ede, + /* 0x0055 0x0328 */ + 0x0172, 0x0000, + /* 0x0061 0x0328 */ + 0x0105, 0x0000, + /* 0x0065 0x0328 */ + 0x0119, 0x0000, + /* 0x0069 0x0328 */ + 0x012f, 0x0000, + /* 0x006f 0x0328 */ + 0x01eb, 0x0001, 0x0304, 0x0ee0, + /* 0x0075 0x0328 */ + 0x0173, 0x0000, + /* 0x0044 0x032d */ + 0x1e12, 0x0000, + /* 0x0045 0x032d */ + 0x1e18, 0x0000, + /* 0x004c 0x032d */ + 0x1e3c, 0x0000, + /* 0x004e 0x032d */ + 0x1e4a, 0x0000, + /* 0x0054 0x032d */ + 0x1e70, 0x0000, + /* 0x0055 0x032d */ + 0x1e76, 0x0000, + /* 0x0064 0x032d */ + 0x1e13, 0x0000, + /* 0x0065 0x032d */ + 0x1e19, 0x0000, + /* 0x006c 0x032d */ + 0x1e3d, 0x0000, + /* 0x006e 0x032d */ + 0x1e4b, 0x0000, + /* 0x0074 0x032d */ + 0x1e71, 0x0000, + /* 0x0075 0x032d */ + 0x1e77, 0x0000, + /* 0x0048 0x032e */ + 0x1e2a, 0x0000, + /* 0x0068 0x032e */ + 0x1e2b, 0x0000, + /* 0x0045 0x0330 */ + 0x1e1a, 0x0000, + /* 0x0049 0x0330 */ + 0x1e2c, 0x0000, + /* 0x0055 0x0330 */ + 0x1e74, 0x0000, + /* 0x0065 0x0330 */ + 0x1e1b, 0x0000, + /* 0x0069 0x0330 */ + 0x1e2d, 0x0000, + /* 0x0075 0x0330 */ + 0x1e75, 0x0000, + /* 0x0042 0x0331 */ + 0x1e06, 0x0000, + /* 0x0044 0x0331 */ + 0x1e0e, 0x0000, + /* 0x004b 0x0331 */ + 0x1e34, 0x0000, + /* 0x004c 0x0331 */ + 0x1e3a, 0x0000, + /* 0x004e 0x0331 */ + 0x1e48, 0x0000, + /* 0x0052 0x0331 */ + 0x1e5e, 0x0000, + /* 0x0054 0x0331 */ + 0x1e6e, 0x0000, + /* 0x005a 0x0331 */ + 0x1e94, 0x0000, + /* 0x0062 0x0331 */ + 0x1e07, 0x0000, + /* 0x0064 0x0331 */ + 0x1e0f, 0x0000, + /* 0x0068 0x0331 */ + 0x1e96, 0x0000, + /* 0x006b 0x0331 */ + 0x1e35, 0x0000, + /* 0x006c 0x0331 */ + 0x1e3b, 0x0000, + /* 0x006e 0x0331 */ + 0x1e49, 0x0000, + /* 0x0072 0x0331 */ + 0x1e5f, 0x0000, + /* 0x0074 0x0331 */ + 0x1e6f, 0x0000, + /* 0x007a 0x0331 */ + 0x1e95, 0x0000, + /* 0x00a8 0x0342 */ + 0x1fc1, 0x0000, + /* 0x03b1 0x0342 */ + 0x1fb6, 0x0000, + /* 0x03b7 0x0342 */ + 0x1fc6, 0x0000, + /* 0x03b9 0x0342 */ + 0x1fd6, 0x0000, + /* 0x03c5 0x0342 */ + 0x1fe6, 0x0000, + /* 0x03c9 0x0342 */ + 0x1ff6, 0x0000, + /* 0x1fbf 0x0342 */ + 0x1fcf, 0x0000, + /* 0x1ffe 0x0342 */ + 0x1fdf, 0x0000, + /* 0x0391 0x0345 */ + 0x1fbc, 0x0002, 0x0313, 0x0ee2, 0x0314, 0x0eea, + /* 0x0397 0x0345 */ + 0x1fcc, 0x0002, 0x0313, 0x0ef2, 0x0314, 0x0efa, + /* 0x03a9 0x0345 */ + 0x1ffc, 0x0002, 0x0313, 0x0f02, 0x0314, 0x0f0a, + /* 0x03b1 0x0345 */ + 0x1fb3, 0x0005, 0x0300, 0x0f12, 0x0301, 0x0f14, 0x0313, 0x0f16, + 0x0314, 0x0f1e, 0x0342, 0x0f26, + /* 0x03b7 0x0345 */ + 0x1fc3, 0x0005, 0x0300, 0x0f28, 0x0301, 0x0f2a, 0x0313, 0x0f2c, + 0x0314, 0x0f34, 0x0342, 0x0f3c, + /* 0x03bf 0x0345 */ + 0x0000, 0x0001, 0x0301, 0x0f3e, + /* 0x03c9 0x0345 */ + 0x1ff3, 0x0004, 0x0300, 0x0f40, 0x0313, 0x0f42, 0x0314, 0x0f4a, + 0x0342, 0x0f52, + /* 0x05d0 0x05b7 */ + 0xfb2e, 0x0000, + /* 0x05f2 0x05b7 */ + 0xfb1f, 0x0000, + /* 0x05d0 0x05b8 */ + 0xfb2f, 0x0000, + /* 0x05d5 0x05b9 */ + 0xfb4b, 0x0000, + /* 0x05d0 0x05bc */ + 0xfb30, 0x0000, + /* 0x05d1 0x05bc */ + 0xfb31, 0x0000, + /* 0x05d2 0x05bc */ + 0xfb32, 0x0000, + /* 0x05d3 0x05bc */ + 0xfb33, 0x0000, + /* 0x05d4 0x05bc */ + 0xfb34, 0x0000, + /* 0x05d5 0x05bc */ + 0xfb35, 0x0000, + /* 0x05d6 0x05bc */ + 0xfb36, 0x0000, + /* 0x05d8 0x05bc */ + 0xfb38, 0x0000, + /* 0x05d9 0x05bc */ + 0xfb39, 0x0000, + /* 0x05da 0x05bc */ + 0xfb3a, 0x0000, + /* 0x05db 0x05bc */ + 0xfb3b, 0x0000, + /* 0x05dc 0x05bc */ + 0xfb3c, 0x0000, + /* 0x05de 0x05bc */ + 0xfb3e, 0x0000, + /* 0x05e0 0x05bc */ + 0xfb40, 0x0000, + /* 0x05e1 0x05bc */ + 0xfb41, 0x0000, + /* 0x05e3 0x05bc */ + 0xfb43, 0x0000, + /* 0x05e4 0x05bc */ + 0xfb44, 0x0000, + /* 0x05e6 0x05bc */ + 0xfb46, 0x0000, + /* 0x05e7 0x05bc */ + 0xfb47, 0x0000, + /* 0x05e8 0x05bc */ + 0xfb48, 0x0000, + /* 0x05e9 0x05bc */ + 0xfb49, 0x0002, 0x05c1, 0x0f54, 0x05c2, 0x0f56, + /* 0x05ea 0x05bc */ + 0xfb4a, 0x0000, + /* 0x05d1 0x05bf */ + 0xfb4c, 0x0000, + /* 0x05db 0x05bf */ + 0xfb4d, 0x0000, + /* 0x05e4 0x05bf */ + 0xfb4e, 0x0000, + /* 0x05e9 0x05c1 */ + 0xfb2a, 0x0000, + /* 0x05e9 0x05c2 */ + 0xfb2b, 0x0000, + /* 0x0915 0x093c */ + 0x0958, 0x0000, + /* 0x0916 0x093c */ + 0x0959, 0x0000, + /* 0x0917 0x093c */ + 0x095a, 0x0000, + /* 0x091c 0x093c */ + 0x095b, 0x0000, + /* 0x0921 0x093c */ + 0x095c, 0x0000, + /* 0x0922 0x093c */ + 0x095d, 0x0000, + /* 0x0928 0x093c */ + 0x0929, 0x0000, + /* 0x092b 0x093c */ + 0x095e, 0x0000, + /* 0x092f 0x093c */ + 0x095f, 0x0000, + /* 0x0930 0x093c */ + 0x0931, 0x0000, + /* 0x0933 0x093c */ + 0x0934, 0x0000, + /* 0x09a1 0x09bc */ + 0x09dc, 0x0000, + /* 0x09a2 0x09bc */ + 0x09dd, 0x0000, + /* 0x09ac 0x09bc */ + 0x09b0, 0x0000, + /* 0x09af 0x09bc */ + 0x09df, 0x0000, + /* 0x09c7 0x09be */ + 0x09cb, 0x0000, + /* 0x09c7 0x09d7 */ + 0x09cc, 0x0000, + /* 0x0a16 0x0a3c */ + 0x0a59, 0x0000, + /* 0x0a17 0x0a3c */ + 0x0a5a, 0x0000, + /* 0x0a1c 0x0a3c */ + 0x0a5b, 0x0000, + /* 0x0a21 0x0a3c */ + 0x0a5c, 0x0000, + /* 0x0a2b 0x0a3c */ + 0x0a5e, 0x0000, + /* 0x0b21 0x0b3c */ + 0x0b5c, 0x0000, + /* 0x0b22 0x0b3c */ + 0x0b5d, 0x0000, + /* 0x0b2f 0x0b3c */ + 0x0b5f, 0x0000, + /* 0x0b47 0x0b3e */ + 0x0b4b, 0x0000, + /* 0x0b47 0x0b56 */ + 0x0b48, 0x0000, + /* 0x0b47 0x0b57 */ + 0x0b4c, 0x0000, + /* 0x0bc6 0x0bbe */ + 0x0bca, 0x0000, + /* 0x0bc7 0x0bbe */ + 0x0bcb, 0x0000, + /* 0x0b92 0x0bd7 */ + 0x0b94, 0x0000, + /* 0x0bc6 0x0bd7 */ + 0x0bcc, 0x0000, + /* 0x0c46 0x0c56 */ + 0x0c48, 0x0000, + /* 0x0cc6 0x0cc2 */ + 0x0cca, 0x0001, 0x0cd5, 0x0f58, + /* 0x0cbf 0x0cd5 */ + 0x0cc0, 0x0000, + /* 0x0cc6 0x0cd5 */ + 0x0cc7, 0x0000, + /* 0x0cc6 0x0cd6 */ + 0x0cc8, 0x0000, + /* 0x0d46 0x0d3e */ + 0x0d4a, 0x0000, + /* 0x0d47 0x0d3e */ + 0x0d4b, 0x0000, + /* 0x0d46 0x0d57 */ + 0x0d4c, 0x0000, + /* 0x0e4d 0x0e32 */ + 0x0e33, 0x0000, + /* 0x0ecd 0x0eb2 */ + 0x0eb3, 0x0000, + /* 0x0f72 0x0f71 */ + 0x0f73, 0x0000, + /* 0x0f74 0x0f71 */ + 0x0f75, 0x0000, + /* 0x0f80 0x0f71 */ + 0x0f81, 0x0000, + /* 0x0fb2 0x0f80 */ + 0x0f76, 0x0001, 0x0f71, 0x0f5a, + /* 0x0fb3 0x0f80 */ + 0x0f78, 0x0001, 0x0f71, 0x0f5c, + /* 0x0f40 0x0fb5 */ + 0x0f69, 0x0000, + /* 0x0f90 0x0fb5 */ + 0x0fb9, 0x0000, + /* 0x0f42 0x0fb7 */ + 0x0f43, 0x0000, + /* 0x0f4c 0x0fb7 */ + 0x0f4d, 0x0000, + /* 0x0f51 0x0fb7 */ + 0x0f52, 0x0000, + /* 0x0f56 0x0fb7 */ + 0x0f57, 0x0000, + /* 0x0f5b 0x0fb7 */ + 0x0f5c, 0x0000, + /* 0x0f92 0x0fb7 */ + 0x0f93, 0x0000, + /* 0x0f9c 0x0fb7 */ + 0x0f9d, 0x0000, + /* 0x0fa1 0x0fb7 */ + 0x0fa2, 0x0000, + /* 0x0fa6 0x0fb7 */ + 0x0fa7, 0x0000, + /* 0x0fab 0x0fb7 */ + 0x0fac, 0x0000, + /* 0x3046 0x3099 */ + 0x3094, 0x0000, + /* 0x304b 0x3099 */ + 0x304c, 0x0000, + /* 0x304d 0x3099 */ + 0x304e, 0x0000, + /* 0x304f 0x3099 */ + 0x3050, 0x0000, + /* 0x3051 0x3099 */ + 0x3052, 0x0000, + /* 0x3053 0x3099 */ + 0x3054, 0x0000, + /* 0x3055 0x3099 */ + 0x3056, 0x0000, + /* 0x3057 0x3099 */ + 0x3058, 0x0000, + /* 0x3059 0x3099 */ + 0x305a, 0x0000, + /* 0x305b 0x3099 */ + 0x305c, 0x0000, + /* 0x305d 0x3099 */ + 0x305e, 0x0000, + /* 0x305f 0x3099 */ + 0x3060, 0x0000, + /* 0x3061 0x3099 */ + 0x3062, 0x0000, + /* 0x3064 0x3099 */ + 0x3065, 0x0000, + /* 0x3066 0x3099 */ + 0x3067, 0x0000, + /* 0x3068 0x3099 */ + 0x3069, 0x0000, + /* 0x306f 0x3099 */ + 0x3070, 0x0000, + /* 0x3072 0x3099 */ + 0x3073, 0x0000, + /* 0x3075 0x3099 */ + 0x3076, 0x0000, + /* 0x3078 0x3099 */ + 0x3079, 0x0000, + /* 0x307b 0x3099 */ + 0x307c, 0x0000, + /* 0x309d 0x3099 */ + 0x309e, 0x0000, + /* 0x30a6 0x3099 */ + 0x30f4, 0x0000, + /* 0x30ab 0x3099 */ + 0x30ac, 0x0000, + /* 0x30ad 0x3099 */ + 0x30ae, 0x0000, + /* 0x30af 0x3099 */ + 0x30b0, 0x0000, + /* 0x30b1 0x3099 */ + 0x30b2, 0x0000, + /* 0x30b3 0x3099 */ + 0x30b4, 0x0000, + /* 0x30b5 0x3099 */ + 0x30b6, 0x0000, + /* 0x30b7 0x3099 */ + 0x30b8, 0x0000, + /* 0x30b9 0x3099 */ + 0x30ba, 0x0000, + /* 0x30bb 0x3099 */ + 0x30bc, 0x0000, + /* 0x30bd 0x3099 */ + 0x30be, 0x0000, + /* 0x30bf 0x3099 */ + 0x30c0, 0x0000, + /* 0x30c1 0x3099 */ + 0x30c2, 0x0000, + /* 0x30c4 0x3099 */ + 0x30c5, 0x0000, + /* 0x30c6 0x3099 */ + 0x30c7, 0x0000, + /* 0x30c8 0x3099 */ + 0x30c9, 0x0000, + /* 0x30cf 0x3099 */ + 0x30d0, 0x0000, + /* 0x30d2 0x3099 */ + 0x30d3, 0x0000, + /* 0x30d5 0x3099 */ + 0x30d6, 0x0000, + /* 0x30d8 0x3099 */ + 0x30d9, 0x0000, + /* 0x30db 0x3099 */ + 0x30dc, 0x0000, + /* 0x30ef 0x3099 */ + 0x30f7, 0x0000, + /* 0x30f0 0x3099 */ + 0x30f8, 0x0000, + /* 0x30f1 0x3099 */ + 0x30f9, 0x0000, + /* 0x30f2 0x3099 */ + 0x30fa, 0x0000, + /* 0x30fd 0x3099 */ + 0x30fe, 0x0000, + /* 0x306f 0x309a */ + 0x3071, 0x0000, + /* 0x3072 0x309a */ + 0x3074, 0x0000, + /* 0x3075 0x309a */ + 0x3077, 0x0000, + /* 0x3078 0x309a */ + 0x307a, 0x0000, + /* 0x307b 0x309a */ + 0x307d, 0x0000, + /* 0x30cf 0x309a */ + 0x30d1, 0x0000, + /* 0x30d2 0x309a */ + 0x30d4, 0x0000, + /* 0x30d5 0x309a */ + 0x30d7, 0x0000, + /* 0x30d8 0x309a */ + 0x30da, 0x0000, + /* 0x30db 0x309a */ + 0x30dd, 0x0000, + /* 0x0307 0x0053 0x0301 */ + 0x1e64, 0x0000, + /* 0x0307 0x0073 0x0301 */ + 0x1e65, 0x0000, + /* 0x0300 0x0041 0x0302 */ + 0x1ea6, 0x0000, + /* 0x0301 0x0041 0x0302 */ + 0x1ea4, 0x0000, + /* 0x0303 0x0041 0x0302 */ + 0x1eaa, 0x0000, + /* 0x0309 0x0041 0x0302 */ + 0x1ea8, 0x0000, + /* 0x0300 0x0045 0x0302 */ + 0x1ec0, 0x0000, + /* 0x0301 0x0045 0x0302 */ + 0x1ebe, 0x0000, + /* 0x0303 0x0045 0x0302 */ + 0x1ec4, 0x0000, + /* 0x0309 0x0045 0x0302 */ + 0x1ec2, 0x0000, + /* 0x0300 0x004f 0x0302 */ + 0x1ed2, 0x0000, + /* 0x0301 0x004f 0x0302 */ + 0x1ed0, 0x0000, + /* 0x0303 0x004f 0x0302 */ + 0x1ed6, 0x0000, + /* 0x0309 0x004f 0x0302 */ + 0x1ed4, 0x0000, + /* 0x0300 0x0061 0x0302 */ + 0x1ea7, 0x0000, + /* 0x0301 0x0061 0x0302 */ + 0x1ea5, 0x0000, + /* 0x0303 0x0061 0x0302 */ + 0x1eab, 0x0000, + /* 0x0309 0x0061 0x0302 */ + 0x1ea9, 0x0000, + /* 0x0300 0x0065 0x0302 */ + 0x1ec1, 0x0000, + /* 0x0301 0x0065 0x0302 */ + 0x1ebf, 0x0000, + /* 0x0303 0x0065 0x0302 */ + 0x1ec5, 0x0000, + /* 0x0309 0x0065 0x0302 */ + 0x1ec3, 0x0000, + /* 0x0300 0x006f 0x0302 */ + 0x1ed3, 0x0000, + /* 0x0301 0x006f 0x0302 */ + 0x1ed1, 0x0000, + /* 0x0303 0x006f 0x0302 */ + 0x1ed7, 0x0000, + /* 0x0309 0x006f 0x0302 */ + 0x1ed5, 0x0000, + /* 0x0301 0x004f 0x0303 */ + 0x1e4c, 0x0000, + /* 0x0308 0x004f 0x0303 */ + 0x1e4e, 0x0000, + /* 0x0301 0x0055 0x0303 */ + 0x1e78, 0x0000, + /* 0x0301 0x006f 0x0303 */ + 0x1e4d, 0x0000, + /* 0x0308 0x006f 0x0303 */ + 0x1e4f, 0x0000, + /* 0x0301 0x0075 0x0303 */ + 0x1e79, 0x0000, + /* 0x0300 0x0045 0x0304 */ + 0x1e14, 0x0000, + /* 0x0301 0x0045 0x0304 */ + 0x1e16, 0x0000, + /* 0x0300 0x004f 0x0304 */ + 0x1e50, 0x0000, + /* 0x0301 0x004f 0x0304 */ + 0x1e52, 0x0000, + /* 0x0308 0x0055 0x0304 */ + 0x1e7a, 0x0000, + /* 0x0300 0x0065 0x0304 */ + 0x1e15, 0x0000, + /* 0x0301 0x0065 0x0304 */ + 0x1e17, 0x0000, + /* 0x0300 0x006f 0x0304 */ + 0x1e51, 0x0000, + /* 0x0301 0x006f 0x0304 */ + 0x1e53, 0x0000, + /* 0x0308 0x0075 0x0304 */ + 0x1e7b, 0x0000, + /* 0x0300 0x0041 0x0306 */ + 0x1eb0, 0x0000, + /* 0x0301 0x0041 0x0306 */ + 0x1eae, 0x0000, + /* 0x0303 0x0041 0x0306 */ + 0x1eb4, 0x0000, + /* 0x0309 0x0041 0x0306 */ + 0x1eb2, 0x0000, + /* 0x0300 0x0061 0x0306 */ + 0x1eb1, 0x0000, + /* 0x0301 0x0061 0x0306 */ + 0x1eaf, 0x0000, + /* 0x0303 0x0061 0x0306 */ + 0x1eb5, 0x0000, + /* 0x0309 0x0061 0x0306 */ + 0x1eb3, 0x0000, + /* 0x0304 0x0041 0x0307 */ + 0x01e0, 0x0000, + /* 0x0304 0x0061 0x0307 */ + 0x01e1, 0x0000, + /* 0x0304 0x0041 0x0308 */ + 0x01de, 0x0000, + /* 0x0301 0x0049 0x0308 */ + 0x1e2e, 0x0000, + /* 0x0300 0x0055 0x0308 */ + 0x01db, 0x0000, + /* 0x0301 0x0055 0x0308 */ + 0x01d7, 0x0000, + /* 0x0304 0x0055 0x0308 */ + 0x01d5, 0x0000, + /* 0x030c 0x0055 0x0308 */ + 0x01d9, 0x0000, + /* 0x0304 0x0061 0x0308 */ + 0x01df, 0x0000, + /* 0x0301 0x0069 0x0308 */ + 0x1e2f, 0x0000, + /* 0x0300 0x0075 0x0308 */ + 0x01dc, 0x0000, + /* 0x0301 0x0075 0x0308 */ + 0x01d8, 0x0000, + /* 0x0304 0x0075 0x0308 */ + 0x01d6, 0x0000, + /* 0x030c 0x0075 0x0308 */ + 0x01da, 0x0000, + /* 0x0300 0x03b9 0x0308 */ + 0x1fd2, 0x0000, + /* 0x0301 0x03b9 0x0308 */ + 0x1fd3, 0x0000, + /* 0x030d 0x03b9 0x0308 */ + 0x0390, 0x0000, + /* 0x0342 0x03b9 0x0308 */ + 0x1fd7, 0x0000, + /* 0x0300 0x03c5 0x0308 */ + 0x1fe2, 0x0000, + /* 0x0301 0x03c5 0x0308 */ + 0x1fe3, 0x0000, + /* 0x030d 0x03c5 0x0308 */ + 0x03b0, 0x0000, + /* 0x0342 0x03c5 0x0308 */ + 0x1fe7, 0x0000, + /* 0x0301 0x0041 0x030a */ + 0x01fa, 0x0000, + /* 0x0301 0x0061 0x030a */ + 0x01fb, 0x0000, + /* 0x0307 0x0053 0x030c */ + 0x1e66, 0x0000, + /* 0x0307 0x0073 0x030c */ + 0x1e67, 0x0000, + /* 0x0300 0x0391 0x0313 */ + 0x1f0a, 0x0000, + /* 0x0301 0x0391 0x0313 */ + 0x1f0c, 0x0000, + /* 0x0342 0x0391 0x0313 */ + 0x1f0e, 0x0000, + /* 0x0300 0x0395 0x0313 */ + 0x1f1a, 0x0000, + /* 0x0301 0x0395 0x0313 */ + 0x1f1c, 0x0000, + /* 0x0300 0x0397 0x0313 */ + 0x1f2a, 0x0000, + /* 0x0301 0x0397 0x0313 */ + 0x1f2c, 0x0000, + /* 0x0342 0x0397 0x0313 */ + 0x1f2e, 0x0000, + /* 0x0300 0x0399 0x0313 */ + 0x1f3a, 0x0000, + /* 0x0301 0x0399 0x0313 */ + 0x1f3c, 0x0000, + /* 0x0342 0x0399 0x0313 */ + 0x1f3e, 0x0000, + /* 0x0300 0x039f 0x0313 */ + 0x1f4a, 0x0000, + /* 0x0301 0x039f 0x0313 */ + 0x1f4c, 0x0000, + /* 0x0300 0x03a9 0x0313 */ + 0x1f6a, 0x0000, + /* 0x0301 0x03a9 0x0313 */ + 0x1f6c, 0x0000, + /* 0x0342 0x03a9 0x0313 */ + 0x1f6e, 0x0000, + /* 0x0300 0x03b1 0x0313 */ + 0x1f02, 0x0000, + /* 0x0301 0x03b1 0x0313 */ + 0x1f04, 0x0000, + /* 0x0342 0x03b1 0x0313 */ + 0x1f06, 0x0000, + /* 0x0300 0x03b5 0x0313 */ + 0x1f12, 0x0000, + /* 0x0301 0x03b5 0x0313 */ + 0x1f14, 0x0000, + /* 0x0300 0x03b7 0x0313 */ + 0x1f22, 0x0000, + /* 0x0301 0x03b7 0x0313 */ + 0x1f24, 0x0000, + /* 0x0342 0x03b7 0x0313 */ + 0x1f26, 0x0000, + /* 0x0300 0x03b9 0x0313 */ + 0x1f32, 0x0000, + /* 0x0301 0x03b9 0x0313 */ + 0x1f34, 0x0000, + /* 0x0342 0x03b9 0x0313 */ + 0x1f36, 0x0000, + /* 0x0300 0x03bf 0x0313 */ + 0x1f42, 0x0000, + /* 0x0301 0x03bf 0x0313 */ + 0x1f44, 0x0000, + /* 0x0300 0x03c5 0x0313 */ + 0x1f52, 0x0000, + /* 0x0301 0x03c5 0x0313 */ + 0x1f54, 0x0000, + /* 0x0342 0x03c5 0x0313 */ + 0x1f56, 0x0000, + /* 0x0300 0x03c9 0x0313 */ + 0x1f62, 0x0000, + /* 0x0301 0x03c9 0x0313 */ + 0x1f64, 0x0000, + /* 0x0342 0x03c9 0x0313 */ + 0x1f66, 0x0000, + /* 0x0300 0x0391 0x0314 */ + 0x1f0b, 0x0000, + /* 0x0301 0x0391 0x0314 */ + 0x1f0d, 0x0000, + /* 0x0342 0x0391 0x0314 */ + 0x1f0f, 0x0000, + /* 0x0300 0x0395 0x0314 */ + 0x1f1b, 0x0000, + /* 0x0301 0x0395 0x0314 */ + 0x1f1d, 0x0000, + /* 0x0300 0x0397 0x0314 */ + 0x1f2b, 0x0000, + /* 0x0301 0x0397 0x0314 */ + 0x1f2d, 0x0000, + /* 0x0342 0x0397 0x0314 */ + 0x1f2f, 0x0000, + /* 0x0300 0x0399 0x0314 */ + 0x1f3b, 0x0000, + /* 0x0301 0x0399 0x0314 */ + 0x1f3d, 0x0000, + /* 0x0342 0x0399 0x0314 */ + 0x1f3f, 0x0000, + /* 0x0300 0x039f 0x0314 */ + 0x1f4b, 0x0000, + /* 0x0301 0x039f 0x0314 */ + 0x1f4d, 0x0000, + /* 0x0300 0x03a5 0x0314 */ + 0x1f5b, 0x0000, + /* 0x0301 0x03a5 0x0314 */ + 0x1f5d, 0x0000, + /* 0x0342 0x03a5 0x0314 */ + 0x1f5f, 0x0000, + /* 0x0300 0x03a9 0x0314 */ + 0x1f6b, 0x0000, + /* 0x0301 0x03a9 0x0314 */ + 0x1f6d, 0x0000, + /* 0x0342 0x03a9 0x0314 */ + 0x1f6f, 0x0000, + /* 0x0300 0x03b1 0x0314 */ + 0x1f03, 0x0000, + /* 0x0301 0x03b1 0x0314 */ + 0x1f05, 0x0000, + /* 0x0342 0x03b1 0x0314 */ + 0x1f07, 0x0000, + /* 0x0300 0x03b5 0x0314 */ + 0x1f13, 0x0000, + /* 0x0301 0x03b5 0x0314 */ + 0x1f15, 0x0000, + /* 0x0300 0x03b7 0x0314 */ + 0x1f23, 0x0000, + /* 0x0301 0x03b7 0x0314 */ + 0x1f25, 0x0000, + /* 0x0342 0x03b7 0x0314 */ + 0x1f27, 0x0000, + /* 0x0300 0x03b9 0x0314 */ + 0x1f33, 0x0000, + /* 0x0301 0x03b9 0x0314 */ + 0x1f35, 0x0000, + /* 0x0342 0x03b9 0x0314 */ + 0x1f37, 0x0000, + /* 0x0300 0x03bf 0x0314 */ + 0x1f43, 0x0000, + /* 0x0301 0x03bf 0x0314 */ + 0x1f45, 0x0000, + /* 0x0300 0x03c5 0x0314 */ + 0x1f53, 0x0000, + /* 0x0301 0x03c5 0x0314 */ + 0x1f55, 0x0000, + /* 0x0342 0x03c5 0x0314 */ + 0x1f57, 0x0000, + /* 0x0300 0x03c9 0x0314 */ + 0x1f63, 0x0000, + /* 0x0301 0x03c9 0x0314 */ + 0x1f65, 0x0000, + /* 0x0342 0x03c9 0x0314 */ + 0x1f67, 0x0000, + /* 0x0300 0x004f 0x031b */ + 0x1edc, 0x0000, + /* 0x0301 0x004f 0x031b */ + 0x1eda, 0x0000, + /* 0x0303 0x004f 0x031b */ + 0x1ee0, 0x0000, + /* 0x0309 0x004f 0x031b */ + 0x1ede, 0x0000, + /* 0x0323 0x004f 0x031b */ + 0x1ee2, 0x0000, + /* 0x0300 0x0055 0x031b */ + 0x1eea, 0x0000, + /* 0x0301 0x0055 0x031b */ + 0x1ee8, 0x0000, + /* 0x0303 0x0055 0x031b */ + 0x1eee, 0x0000, + /* 0x0309 0x0055 0x031b */ + 0x1eec, 0x0000, + /* 0x0323 0x0055 0x031b */ + 0x1ef0, 0x0000, + /* 0x0300 0x006f 0x031b */ + 0x1edd, 0x0000, + /* 0x0301 0x006f 0x031b */ + 0x1edb, 0x0000, + /* 0x0303 0x006f 0x031b */ + 0x1ee1, 0x0000, + /* 0x0309 0x006f 0x031b */ + 0x1edf, 0x0000, + /* 0x0323 0x006f 0x031b */ + 0x1ee3, 0x0000, + /* 0x0300 0x0075 0x031b */ + 0x1eeb, 0x0000, + /* 0x0301 0x0075 0x031b */ + 0x1ee9, 0x0000, + /* 0x0303 0x0075 0x031b */ + 0x1eef, 0x0000, + /* 0x0309 0x0075 0x031b */ + 0x1eed, 0x0000, + /* 0x0323 0x0075 0x031b */ + 0x1ef1, 0x0000, + /* 0x0302 0x0041 0x0323 */ + 0x1eac, 0x0000, + /* 0x0306 0x0041 0x0323 */ + 0x1eb6, 0x0000, + /* 0x0302 0x0045 0x0323 */ + 0x1ec6, 0x0000, + /* 0x0304 0x004c 0x0323 */ + 0x1e38, 0x0000, + /* 0x0302 0x004f 0x0323 */ + 0x1ed8, 0x0000, + /* 0x0304 0x0052 0x0323 */ + 0x1e5c, 0x0000, + /* 0x0307 0x0053 0x0323 */ + 0x1e68, 0x0000, + /* 0x0302 0x0061 0x0323 */ + 0x1ead, 0x0000, + /* 0x0306 0x0061 0x0323 */ + 0x1eb7, 0x0000, + /* 0x0302 0x0065 0x0323 */ + 0x1ec7, 0x0000, + /* 0x0304 0x006c 0x0323 */ + 0x1e39, 0x0000, + /* 0x0302 0x006f 0x0323 */ + 0x1ed9, 0x0000, + /* 0x0304 0x0072 0x0323 */ + 0x1e5d, 0x0000, + /* 0x0307 0x0073 0x0323 */ + 0x1e69, 0x0000, + /* 0x0301 0x0043 0x0327 */ + 0x1e08, 0x0000, + /* 0x0306 0x0045 0x0327 */ + 0x1e1c, 0x0000, + /* 0x0301 0x0063 0x0327 */ + 0x1e09, 0x0000, + /* 0x0306 0x0065 0x0327 */ + 0x1e1d, 0x0000, + /* 0x0304 0x004f 0x0328 */ + 0x01ec, 0x0000, + /* 0x0304 0x006f 0x0328 */ + 0x01ed, 0x0000, + /* 0x0313 0x0391 0x0345 */ + 0x1f88, 0x0003, 0x0300, 0x0f5e, 0x0301, 0x0f60, 0x0342, 0x0f62, + /* 0x0314 0x0391 0x0345 */ + 0x1f89, 0x0003, 0x0300, 0x0f64, 0x0301, 0x0f66, 0x0342, 0x0f68, + /* 0x0313 0x0397 0x0345 */ + 0x1f98, 0x0003, 0x0300, 0x0f6a, 0x0301, 0x0f6c, 0x0342, 0x0f6e, + /* 0x0314 0x0397 0x0345 */ + 0x1f99, 0x0003, 0x0300, 0x0f70, 0x0301, 0x0f72, 0x0342, 0x0f74, + /* 0x0313 0x03a9 0x0345 */ + 0x1fa8, 0x0003, 0x0300, 0x0f76, 0x0301, 0x0f78, 0x0342, 0x0f7a, + /* 0x0314 0x03a9 0x0345 */ + 0x1fa9, 0x0003, 0x0300, 0x0f7c, 0x0301, 0x0f7e, 0x0342, 0x0f80, + /* 0x0300 0x03b1 0x0345 */ + 0x1fb2, 0x0000, + /* 0x0301 0x03b1 0x0345 */ + 0x1fb4, 0x0000, + /* 0x0313 0x03b1 0x0345 */ + 0x1f80, 0x0003, 0x0300, 0x0f82, 0x0301, 0x0f84, 0x0342, 0x0f86, + /* 0x0314 0x03b1 0x0345 */ + 0x1f81, 0x0003, 0x0300, 0x0f88, 0x0301, 0x0f8a, 0x0342, 0x0f8c, + /* 0x0342 0x03b1 0x0345 */ + 0x1fb7, 0x0000, + /* 0x0300 0x03b7 0x0345 */ + 0x1fc2, 0x0000, + /* 0x0301 0x03b7 0x0345 */ + 0x1fc4, 0x0000, + /* 0x0313 0x03b7 0x0345 */ + 0x1f90, 0x0003, 0x0300, 0x0f8e, 0x0301, 0x0f90, 0x0342, 0x0f92, + /* 0x0314 0x03b7 0x0345 */ + 0x1f91, 0x0003, 0x0300, 0x0f94, 0x0301, 0x0f96, 0x0342, 0x0f98, + /* 0x0342 0x03b7 0x0345 */ + 0x1fc7, 0x0000, + /* 0x0301 0x03bf 0x0345 */ + 0x1ff4, 0x0000, + /* 0x0300 0x03c9 0x0345 */ + 0x1ff2, 0x0000, + /* 0x0313 0x03c9 0x0345 */ + 0x1fa0, 0x0003, 0x0300, 0x0f9a, 0x0301, 0x0f9c, 0x0342, 0x0f9e, + /* 0x0314 0x03c9 0x0345 */ + 0x1fa1, 0x0003, 0x0300, 0x0fa0, 0x0301, 0x0fa2, 0x0342, 0x0fa4, + /* 0x0342 0x03c9 0x0345 */ + 0x1ff7, 0x0000, + /* 0x05c1 0x05e9 0x05bc */ + 0xfb2c, 0x0000, + /* 0x05c2 0x05e9 0x05bc */ + 0xfb2d, 0x0000, + /* 0x0cd5 0x0cc6 0x0cc2 */ + 0x0ccb, 0x0000, + /* 0x0f71 0x0fb2 0x0f80 */ + 0x0f77, 0x0000, + /* 0x0f71 0x0fb3 0x0f80 */ + 0x0f79, 0x0000, + /* 0x0300 0x0313 0x0391 0x0345 */ + 0x1f8a, 0x0000, + /* 0x0301 0x0313 0x0391 0x0345 */ + 0x1f8c, 0x0000, + /* 0x0342 0x0313 0x0391 0x0345 */ + 0x1f8e, 0x0000, + /* 0x0300 0x0314 0x0391 0x0345 */ + 0x1f8b, 0x0000, + /* 0x0301 0x0314 0x0391 0x0345 */ + 0x1f8d, 0x0000, + /* 0x0342 0x0314 0x0391 0x0345 */ + 0x1f8f, 0x0000, + /* 0x0300 0x0313 0x0397 0x0345 */ + 0x1f9a, 0x0000, + /* 0x0301 0x0313 0x0397 0x0345 */ + 0x1f9c, 0x0000, + /* 0x0342 0x0313 0x0397 0x0345 */ + 0x1f9e, 0x0000, + /* 0x0300 0x0314 0x0397 0x0345 */ + 0x1f9b, 0x0000, + /* 0x0301 0x0314 0x0397 0x0345 */ + 0x1f9d, 0x0000, + /* 0x0342 0x0314 0x0397 0x0345 */ + 0x1f9f, 0x0000, + /* 0x0300 0x0313 0x03a9 0x0345 */ + 0x1faa, 0x0000, + /* 0x0301 0x0313 0x03a9 0x0345 */ + 0x1fac, 0x0000, + /* 0x0342 0x0313 0x03a9 0x0345 */ + 0x1fae, 0x0000, + /* 0x0300 0x0314 0x03a9 0x0345 */ + 0x1fab, 0x0000, + /* 0x0301 0x0314 0x03a9 0x0345 */ + 0x1fad, 0x0000, + /* 0x0342 0x0314 0x03a9 0x0345 */ + 0x1faf, 0x0000, + /* 0x0300 0x0313 0x03b1 0x0345 */ + 0x1f82, 0x0000, + /* 0x0301 0x0313 0x03b1 0x0345 */ + 0x1f84, 0x0000, + /* 0x0342 0x0313 0x03b1 0x0345 */ + 0x1f86, 0x0000, + /* 0x0300 0x0314 0x03b1 0x0345 */ + 0x1f83, 0x0000, + /* 0x0301 0x0314 0x03b1 0x0345 */ + 0x1f85, 0x0000, + /* 0x0342 0x0314 0x03b1 0x0345 */ + 0x1f87, 0x0000, + /* 0x0300 0x0313 0x03b7 0x0345 */ + 0x1f92, 0x0000, + /* 0x0301 0x0313 0x03b7 0x0345 */ + 0x1f94, 0x0000, + /* 0x0342 0x0313 0x03b7 0x0345 */ + 0x1f96, 0x0000, + /* 0x0300 0x0314 0x03b7 0x0345 */ + 0x1f93, 0x0000, + /* 0x0301 0x0314 0x03b7 0x0345 */ + 0x1f95, 0x0000, + /* 0x0342 0x0314 0x03b7 0x0345 */ + 0x1f97, 0x0000, + /* 0x0300 0x0313 0x03c9 0x0345 */ + 0x1fa2, 0x0000, + /* 0x0301 0x0313 0x03c9 0x0345 */ + 0x1fa4, 0x0000, + /* 0x0342 0x0313 0x03c9 0x0345 */ + 0x1fa6, 0x0000, + /* 0x0300 0x0314 0x03c9 0x0345 */ + 0x1fa3, 0x0000, + /* 0x0301 0x0314 0x03c9 0x0345 */ + 0x1fa5, 0x0000, + /* 0x0342 0x0314 0x03c9 0x0345 */ + 0x1fa7, 0x0000, +}; diff --git a/kernel/fs/hfsplus/unicode.c b/kernel/fs/hfsplus/unicode.c new file mode 100644 index 000000000..e8ef121a4 --- /dev/null +++ b/kernel/fs/hfsplus/unicode.c @@ -0,0 +1,469 @@ +/* + * linux/fs/hfsplus/unicode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handler routines for unicode strings + */ + +#include +#include +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Fold the case of a unicode char, given the 16 bit value */ +/* Returns folded char, or 0 if ignorable */ +static inline u16 case_fold(u16 c) +{ + u16 tmp; + + tmp = hfsplus_case_fold_table[c >> 8]; + if (tmp) + tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; + else + tmp = c; + return tmp; +} + +/* Compare unicode strings, return values like normal strcmp */ +int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2) +{ + u16 len1, len2, c1, c2; + const hfsplus_unichr *p1, *p2; + + len1 = be16_to_cpu(s1->length); + len2 = be16_to_cpu(s2->length); + p1 = s1->unicode; + p2 = s2->unicode; + + while (1) { + c1 = c2 = 0; + + while (len1 && !c1) { + c1 = case_fold(be16_to_cpu(*p1)); + p1++; + len1--; + } + while (len2 && !c2) { + c2 = case_fold(be16_to_cpu(*p2)); + p2++; + len2--; + } + + if (c1 != c2) + return (c1 < c2) ? -1 : 1; + if (!c1 && !c2) + return 0; + } +} + +/* Compare names as a sequence of 16-bit unsigned integers */ +int hfsplus_strcmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2) +{ + u16 len1, len2, c1, c2; + const hfsplus_unichr *p1, *p2; + int len; + + len1 = be16_to_cpu(s1->length); + len2 = be16_to_cpu(s2->length); + p1 = s1->unicode; + p2 = s2->unicode; + + for (len = min(len1, len2); len > 0; len--) { + c1 = be16_to_cpu(*p1); + c2 = be16_to_cpu(*p2); + if (c1 != c2) + return c1 < c2 ? -1 : 1; + p1++; + p2++; + } + + return len1 < len2 ? -1 : + len1 > len2 ? 1 : 0; +} + + +#define Hangul_SBase 0xac00 +#define Hangul_LBase 0x1100 +#define Hangul_VBase 0x1161 +#define Hangul_TBase 0x11a7 +#define Hangul_SCount 11172 +#define Hangul_LCount 19 +#define Hangul_VCount 21 +#define Hangul_TCount 28 +#define Hangul_NCount (Hangul_VCount * Hangul_TCount) + + +static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) +{ + int i, s, e; + + s = 1; + e = p[1]; + if (!e || cc < p[s * 2] || cc > p[e * 2]) + return NULL; + do { + i = (s + e) / 2; + if (cc > p[i * 2]) + s = i + 1; + else if (cc < p[i * 2]) + e = i - 1; + else + return hfsplus_compose_table + p[i * 2 + 1]; + } while (s <= e); + return NULL; +} + +int hfsplus_uni2asc(struct super_block *sb, + const struct hfsplus_unistr *ustr, + char *astr, int *len_p) +{ + const hfsplus_unichr *ip; + struct nls_table *nls = HFSPLUS_SB(sb)->nls; + u8 *op; + u16 cc, c0, c1; + u16 *ce1, *ce2; + int i, len, ustrlen, res, compose; + + op = astr; + ip = ustr->unicode; + ustrlen = be16_to_cpu(ustr->length); + len = *len_p; + ce1 = NULL; + compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + + while (ustrlen > 0) { + c0 = be16_to_cpu(*ip++); + ustrlen--; + /* search for single decomposed char */ + if (likely(compose)) + ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0); + if (ce1) + cc = ce1[0]; + else + cc = 0; + if (cc) { + /* start of a possibly decomposed Hangul char */ + if (cc != 0xffff) + goto done; + if (!ustrlen) + goto same; + c1 = be16_to_cpu(*ip) - Hangul_VBase; + if (c1 < Hangul_VCount) { + /* compose the Hangul char */ + cc = (c0 - Hangul_LBase) * Hangul_VCount; + cc = (cc + c1) * Hangul_TCount; + cc += Hangul_SBase; + ip++; + ustrlen--; + if (!ustrlen) + goto done; + c1 = be16_to_cpu(*ip) - Hangul_TBase; + if (c1 > 0 && c1 < Hangul_TCount) { + cc += c1; + ip++; + ustrlen--; + } + goto done; + } + } + while (1) { + /* main loop for common case of not composed chars */ + if (!ustrlen) + goto same; + c1 = be16_to_cpu(*ip); + if (likely(compose)) + ce1 = hfsplus_compose_lookup( + hfsplus_compose_table, c1); + if (ce1) + break; + switch (c0) { + case 0: + c0 = 0x2400; + break; + case '/': + c0 = ':'; + break; + } + res = nls->uni2char(c0, op, len); + if (res < 0) { + if (res == -ENAMETOOLONG) + goto out; + *op = '?'; + res = 1; + } + op += res; + len -= res; + c0 = c1; + ip++; + ustrlen--; + } + ce2 = hfsplus_compose_lookup(ce1, c0); + if (ce2) { + i = 1; + while (i < ustrlen) { + ce1 = hfsplus_compose_lookup(ce2, + be16_to_cpu(ip[i])); + if (!ce1) + break; + i++; + ce2 = ce1; + } + cc = ce2[0]; + if (cc) { + ip += i; + ustrlen -= i; + goto done; + } + } +same: + switch (c0) { + case 0: + cc = 0x2400; + break; + case '/': + cc = ':'; + break; + default: + cc = c0; + } +done: + res = nls->uni2char(cc, op, len); + if (res < 0) { + if (res == -ENAMETOOLONG) + goto out; + *op = '?'; + res = 1; + } + op += res; + len -= res; + } + res = 0; +out: + *len_p = (char *)op - astr; + return res; +} + +/* + * Convert one or more ASCII characters into a single unicode character. + * Returns the number of ASCII characters corresponding to the unicode char. + */ +static inline int asc2unichar(struct super_block *sb, const char *astr, int len, + wchar_t *uc) +{ + int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc); + if (size <= 0) { + *uc = '?'; + size = 1; + } + switch (*uc) { + case 0x2400: + *uc = 0; + break; + case ':': + *uc = '/'; + break; + } + return size; +} + +/* Decomposes a single unicode character. */ +static inline u16 *decompose_unichar(wchar_t uc, int *size) +{ + int off; + + off = hfsplus_decompose_table[(uc >> 12) & 0xf]; + if (off == 0 || off == 0xffff) + return NULL; + + off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)]; + if (!off) + return NULL; + + off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)]; + if (!off) + return NULL; + + off = hfsplus_decompose_table[off + (uc & 0xf)]; + *size = off & 3; + if (*size == 0) + return NULL; + return hfsplus_decompose_table + (off / 4); +} + +int hfsplus_asc2uni(struct super_block *sb, + struct hfsplus_unistr *ustr, int max_unistr_len, + const char *astr, int len) +{ + int size, dsize, decompose; + u16 *dstr, outlen = 0; + wchar_t c; + + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + while (outlen < max_unistr_len && len > 0) { + size = asc2unichar(sb, astr, len, &c); + + if (decompose) + dstr = decompose_unichar(c, &dsize); + else + dstr = NULL; + if (dstr) { + if (outlen + dsize > max_unistr_len) + break; + do { + ustr->unicode[outlen++] = cpu_to_be16(*dstr++); + } while (--dsize > 0); + } else + ustr->unicode[outlen++] = cpu_to_be16(c); + + astr += size; + len -= size; + } + ustr->length = cpu_to_be16(outlen); + if (len > 0) + return -ENAMETOOLONG; + return 0; +} + +/* + * Hash a string to an integer as appropriate for the HFS+ filesystem. + * Composed unicode characters are decomposed and case-folding is performed + * if the appropriate bits are (un)set on the superblock. + */ +int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) +{ + struct super_block *sb = dentry->d_sb; + const char *astr; + const u16 *dstr; + int casefold, decompose, size, len; + unsigned long hash; + wchar_t c; + u16 c2; + + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + hash = init_name_hash(); + astr = str->name; + len = str->len; + while (len > 0) { + int uninitialized_var(dsize); + size = asc2unichar(sb, astr, len, &c); + astr += size; + len -= size; + + if (decompose) + dstr = decompose_unichar(c, &dsize); + else + dstr = NULL; + if (dstr) { + do { + c2 = *dstr++; + if (casefold) + c2 = case_fold(c2); + if (!casefold || c2) + hash = partial_name_hash(c2, hash); + } while (--dsize > 0); + } else { + c2 = c; + if (casefold) + c2 = case_fold(c2); + if (!casefold || c2) + hash = partial_name_hash(c2, hash); + } + } + str->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare strings with HFS+ filename ordering. + * Composed unicode characters are decomposed and case-folding is performed + * if the appropriate bits are (un)set on the superblock. + */ +int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + struct super_block *sb = parent->d_sb; + int casefold, decompose, size; + int dsize1, dsize2, len1, len2; + const u16 *dstr1, *dstr2; + const char *astr1, *astr2; + u16 c1, c2; + wchar_t c; + + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + astr1 = str; + len1 = len; + astr2 = name->name; + len2 = name->len; + dsize1 = dsize2 = 0; + dstr1 = dstr2 = NULL; + + while (len1 > 0 && len2 > 0) { + if (!dsize1) { + size = asc2unichar(sb, astr1, len1, &c); + astr1 += size; + len1 -= size; + + if (decompose) + dstr1 = decompose_unichar(c, &dsize1); + if (!decompose || !dstr1) { + c1 = c; + dstr1 = &c1; + dsize1 = 1; + } + } + + if (!dsize2) { + size = asc2unichar(sb, astr2, len2, &c); + astr2 += size; + len2 -= size; + + if (decompose) + dstr2 = decompose_unichar(c, &dsize2); + if (!decompose || !dstr2) { + c2 = c; + dstr2 = &c2; + dsize2 = 1; + } + } + + c1 = *dstr1; + c2 = *dstr2; + if (casefold) { + c1 = case_fold(c1); + if (!c1) { + dstr1++; + dsize1--; + continue; + } + c2 = case_fold(c2); + if (!c2) { + dstr2++; + dsize2--; + continue; + } + } + if (c1 < c2) + return -1; + else if (c1 > c2) + return 1; + + dstr1++; + dsize1--; + dstr2++; + dsize2--; + } + + if (len1 < len2) + return -1; + if (len1 > len2) + return 1; + return 0; +} diff --git a/kernel/fs/hfsplus/wrapper.c b/kernel/fs/hfsplus/wrapper.c new file mode 100644 index 000000000..cc6235671 --- /dev/null +++ b/kernel/fs/hfsplus/wrapper.c @@ -0,0 +1,261 @@ +/* + * linux/fs/hfsplus/wrapper.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of HFS wrappers around HFS+ volumes + */ + +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +struct hfsplus_wd { + u32 ablk_size; + u16 ablk_start; + u16 embed_start; + u16 embed_count; +}; + +/** + * hfsplus_submit_bio - Perform block I/O + * @sb: super block of volume for I/O + * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes + * @buf: buffer for I/O + * @data: output pointer for location of requested data + * @rw: direction of I/O + * + * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than + * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads + * @data will return a pointer to the start of the requested sector, + * which may not be the same location as @buf. + * + * If @sector is not aligned to the bdev logical block size it will + * be rounded down. For writes this means that @buf should contain data + * that starts at the rounded-down address. As long as the data was + * read using hfsplus_submit_bio() and the same buffer is used things + * will work correctly. + */ +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw) +{ + struct bio *bio; + int ret = 0; + u64 io_size; + loff_t start; + int offset; + + /* + * Align sector to hardware sector size and find offset. We + * assume that io_size is a power of two, which _should_ + * be true. + */ + io_size = hfsplus_min_io_size(sb); + start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; + offset = start & (io_size - 1); + sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); + + bio = bio_alloc(GFP_NOIO, 1); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = sb->s_bdev; + + if (!(rw & WRITE) && data) + *data = (u8 *)buf + offset; + + while (io_size > 0) { + unsigned int page_offset = offset_in_page(buf); + unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset, + io_size); + + ret = bio_add_page(bio, virt_to_page(buf), len, page_offset); + if (ret != len) { + ret = -EIO; + goto out; + } + io_size -= len; + buf = (u8 *)buf + len; + } + + ret = submit_bio_wait(rw, bio); +out: + bio_put(bio); + return ret < 0 ? ret : 0; +} + +static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) +{ + u32 extent; + u16 attrib; + __be16 sig; + + sig = *(__be16 *)(bufptr + HFSP_WRAPOFF_EMBEDSIG); + if (sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIG) && + sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) + return 0; + + attrib = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ATTRIB)); + if (!(attrib & HFSP_WRAP_ATTRIB_SLOCK) || + !(attrib & HFSP_WRAP_ATTRIB_SPARED)) + return 0; + + wd->ablk_size = + be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); + if (wd->ablk_size < HFSPLUS_SECTOR_SIZE) + return 0; + if (wd->ablk_size % HFSPLUS_SECTOR_SIZE) + return 0; + wd->ablk_start = + be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); + + extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); + wd->embed_start = (extent >> 16) & 0xFFFF; + wd->embed_count = extent & 0xFFFF; + + return 1; +} + +static int hfsplus_get_last_session(struct super_block *sb, + sector_t *start, sector_t *size) +{ + struct cdrom_multisession ms_info; + struct cdrom_tocentry te; + int res; + + /* default values */ + *start = 0; + *size = sb->s_bdev->bd_inode->i_size >> 9; + + if (HFSPLUS_SB(sb)->session >= 0) { + te.cdte_track = HFSPLUS_SB(sb)->session; + te.cdte_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, + CDROMREADTOCENTRY, (unsigned long)&te); + if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { + *start = (sector_t)te.cdte_addr.lba << 2; + return 0; + } + pr_err("invalid session number or type of track\n"); + return -EINVAL; + } + ms_info.addr_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, + (unsigned long)&ms_info); + if (!res && ms_info.xa_flag) + *start = (sector_t)ms_info.addr.lba << 2; + return 0; +} + +/* Find the volume header and fill in some minimum bits in superblock */ +/* Takes in super block, returns true if good data read */ +int hfsplus_read_wrapper(struct super_block *sb) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_wd wd; + sector_t part_start, part_size; + u32 blocksize; + int error = 0; + + error = -EINVAL; + blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE); + if (!blocksize) + goto out; + + if (hfsplus_get_last_session(sb, &part_start, &part_size)) + goto out; + + error = -ENOMEM; + sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_vhdr_buf) + goto out; + sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_backup_vhdr_buf) + goto out_free_vhdr; + +reread: + error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, + READ); + if (error) + goto out_free_backup_vhdr; + + error = -EINVAL; + switch (sbi->s_vhdr->signature) { + case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX): + set_bit(HFSPLUS_SB_HFSX, &sbi->flags); + /*FALLTHRU*/ + case cpu_to_be16(HFSPLUS_VOLHEAD_SIG): + break; + case cpu_to_be16(HFSP_WRAP_MAGIC): + if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) + goto out_free_backup_vhdr; + wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; + part_start += (sector_t)wd.ablk_start + + (sector_t)wd.embed_start * wd.ablk_size; + part_size = (sector_t)wd.embed_count * wd.ablk_size; + goto reread; + default: + /* + * Check for a partition block. + * + * (should do this only for cdrom/loop though) + */ + if (hfs_part_find(sb, &part_start, &part_size)) + goto out_free_backup_vhdr; + goto reread; + } + + error = hfsplus_submit_bio(sb, part_start + part_size - 2, + sbi->s_backup_vhdr_buf, + (void **)&sbi->s_backup_vhdr, READ); + if (error) + goto out_free_backup_vhdr; + + error = -EINVAL; + if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) { + pr_warn("invalid secondary volume header\n"); + goto out_free_backup_vhdr; + } + + blocksize = be32_to_cpu(sbi->s_vhdr->blocksize); + + /* + * Block size must be at least as large as a sector and a multiple of 2. + */ + if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) + goto out_free_backup_vhdr; + sbi->alloc_blksz = blocksize; + sbi->alloc_blksz_shift = ilog2(blocksize); + blocksize = min_t(u32, sbi->alloc_blksz, PAGE_SIZE); + + /* + * Align block size to block offset. + */ + while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) + blocksize >>= 1; + + if (sb_set_blocksize(sb, blocksize) != blocksize) { + pr_err("unable to set blocksize to %u!\n", blocksize); + goto out_free_backup_vhdr; + } + + sbi->blockoffset = + part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); + sbi->part_start = part_start; + sbi->sect_count = part_size; + sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; + return 0; + +out_free_backup_vhdr: + kfree(sbi->s_backup_vhdr_buf); +out_free_vhdr: + kfree(sbi->s_vhdr_buf); +out: + return error; +} diff --git a/kernel/fs/hfsplus/xattr.c b/kernel/fs/hfsplus/xattr.c new file mode 100644 index 000000000..416b1dbaf --- /dev/null +++ b/kernel/fs/hfsplus/xattr.c @@ -0,0 +1,911 @@ +/* + * linux/fs/hfsplus/xattr.c + * + * Vyacheslav Dubeyko + * + * Logic of processing extended attributes + */ + +#include "hfsplus_fs.h" +#include +#include +#include "xattr.h" +#include "acl.h" + +static int hfsplus_removexattr(struct inode *inode, const char *name); + +const struct xattr_handler *hfsplus_xattr_handlers[] = { + &hfsplus_xattr_osx_handler, + &hfsplus_xattr_user_handler, + &hfsplus_xattr_trusted_handler, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &hfsplus_xattr_security_handler, + NULL +}; + +static int strcmp_xattr_finder_info(const char *name) +{ + if (name) { + return strncmp(name, HFSPLUS_XATTR_FINDER_INFO_NAME, + sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME)); + } + return -1; +} + +static int strcmp_xattr_acl(const char *name) +{ + if (name) { + return strncmp(name, HFSPLUS_XATTR_ACL_NAME, + sizeof(HFSPLUS_XATTR_ACL_NAME)); + } + return -1; +} + +static bool is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + +static void hfsplus_init_header_node(struct inode *attr_file, + u32 clump_size, + char *buf, u16 node_size) +{ + struct hfs_bnode_desc *desc; + struct hfs_btree_header_rec *head; + u16 offset; + __be16 *rec_offsets; + u32 hdr_node_map_rec_bits; + char *bmp; + u32 used_nodes; + u32 used_bmp_bytes; + u64 tmp; + + hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n", + clump_size, node_size); + + /* The end of the node contains list of record offsets */ + rec_offsets = (__be16 *)(buf + node_size); + + desc = (struct hfs_bnode_desc *)buf; + desc->type = HFS_NODE_HEADER; + desc->num_recs = cpu_to_be16(HFSPLUS_BTREE_HDR_NODE_RECS_COUNT); + offset = sizeof(struct hfs_bnode_desc); + *--rec_offsets = cpu_to_be16(offset); + + head = (struct hfs_btree_header_rec *)(buf + offset); + head->node_size = cpu_to_be16(node_size); + tmp = i_size_read(attr_file); + do_div(tmp, node_size); + head->node_count = cpu_to_be32(tmp); + head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1); + head->clump_size = cpu_to_be32(clump_size); + head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS); + head->max_key_len = cpu_to_be16(HFSPLUS_ATTR_KEYLEN - sizeof(u16)); + offset += sizeof(struct hfs_btree_header_rec); + *--rec_offsets = cpu_to_be16(offset); + offset += HFSPLUS_BTREE_HDR_USER_BYTES; + *--rec_offsets = cpu_to_be16(offset); + + hdr_node_map_rec_bits = 8 * (node_size - offset - (4 * sizeof(u16))); + if (be32_to_cpu(head->node_count) > hdr_node_map_rec_bits) { + u32 map_node_bits; + u32 map_nodes; + + desc->next = cpu_to_be32(be32_to_cpu(head->leaf_tail) + 1); + map_node_bits = 8 * (node_size - sizeof(struct hfs_bnode_desc) - + (2 * sizeof(u16)) - 2); + map_nodes = (be32_to_cpu(head->node_count) - + hdr_node_map_rec_bits + + (map_node_bits - 1)) / map_node_bits; + be32_add_cpu(&head->free_nodes, 0 - map_nodes); + } + + bmp = buf + offset; + used_nodes = + be32_to_cpu(head->node_count) - be32_to_cpu(head->free_nodes); + used_bmp_bytes = used_nodes / 8; + if (used_bmp_bytes) { + memset(bmp, 0xFF, used_bmp_bytes); + bmp += used_bmp_bytes; + used_nodes %= 8; + } + *bmp = ~(0xFF >> used_nodes); + offset += hdr_node_map_rec_bits / 8; + *--rec_offsets = cpu_to_be16(offset); +} + +static int hfsplus_create_attributes_file(struct super_block *sb) +{ + int err = 0; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct inode *attr_file; + struct hfsplus_inode_info *hip; + u32 clump_size; + u16 node_size = HFSPLUS_ATTR_TREE_NODE_SIZE; + char *buf; + int index, written; + struct address_space *mapping; + struct page *page; + int old_state = HFSPLUS_EMPTY_ATTR_TREE; + + hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID); + +check_attr_tree_state_again: + switch (atomic_read(&sbi->attr_tree_state)) { + case HFSPLUS_EMPTY_ATTR_TREE: + if (old_state != atomic_cmpxchg(&sbi->attr_tree_state, + old_state, + HFSPLUS_CREATING_ATTR_TREE)) + goto check_attr_tree_state_again; + break; + case HFSPLUS_CREATING_ATTR_TREE: + /* + * This state means that another thread is in process + * of AttributesFile creation. Theoretically, it is + * possible to be here. But really __setxattr() method + * first of all calls hfs_find_init() for lookup in + * B-tree of CatalogFile. This method locks mutex of + * CatalogFile's B-tree. As a result, if some thread + * is inside AttributedFile creation operation then + * another threads will be waiting unlocking of + * CatalogFile's B-tree's mutex. However, if code will + * change then we will return error code (-EAGAIN) from + * here. Really, it means that first try to set of xattr + * fails with error but second attempt will have success. + */ + return -EAGAIN; + case HFSPLUS_VALID_ATTR_TREE: + return 0; + case HFSPLUS_FAILED_ATTR_TREE: + return -EOPNOTSUPP; + default: + BUG(); + } + + attr_file = hfsplus_iget(sb, HFSPLUS_ATTR_CNID); + if (IS_ERR(attr_file)) { + pr_err("failed to load attributes file\n"); + return PTR_ERR(attr_file); + } + + BUG_ON(i_size_read(attr_file) != 0); + + hip = HFSPLUS_I(attr_file); + + clump_size = hfsplus_calc_btree_clump_size(sb->s_blocksize, + node_size, + sbi->sect_count, + HFSPLUS_ATTR_CNID); + + mutex_lock(&hip->extents_lock); + hip->clump_blocks = clump_size >> sbi->alloc_blksz_shift; + mutex_unlock(&hip->extents_lock); + + if (sbi->free_blocks <= (hip->clump_blocks << 1)) { + err = -ENOSPC; + goto end_attr_file_creation; + } + + while (hip->alloc_blocks < hip->clump_blocks) { + err = hfsplus_file_extend(attr_file, false); + if (unlikely(err)) { + pr_err("failed to extend attributes file\n"); + goto end_attr_file_creation; + } + hip->phys_size = attr_file->i_size = + (loff_t)hip->alloc_blocks << sbi->alloc_blksz_shift; + hip->fs_blocks = hip->alloc_blocks << sbi->fs_shift; + inode_set_bytes(attr_file, attr_file->i_size); + } + + buf = kzalloc(node_size, GFP_NOFS); + if (!buf) { + pr_err("failed to allocate memory for header node\n"); + err = -ENOMEM; + goto end_attr_file_creation; + } + + hfsplus_init_header_node(attr_file, clump_size, buf, node_size); + + mapping = attr_file->i_mapping; + + index = 0; + written = 0; + for (; written < node_size; index++, written += PAGE_CACHE_SIZE) { + void *kaddr; + + page = read_mapping_page(mapping, index, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto failed_header_node_init; + } + + kaddr = kmap_atomic(page); + memcpy(kaddr, buf + written, + min_t(size_t, PAGE_CACHE_SIZE, node_size - written)); + kunmap_atomic(kaddr); + + set_page_dirty(page); + page_cache_release(page); + } + + hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY); + + sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); + if (!sbi->attr_tree) + pr_err("failed to load attributes file\n"); + +failed_header_node_init: + kfree(buf); + +end_attr_file_creation: + iput(attr_file); + + if (!err) + atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE); + else if (err == -ENOSPC) + atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE); + else + atomic_set(&sbi->attr_tree_state, HFSPLUS_FAILED_ATTR_TREE); + + return err; +} + +int __hfsplus_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int err = 0; + struct hfs_find_data cat_fd; + hfsplus_cat_entry entry; + u16 cat_entry_flags, cat_entry_type; + u16 folder_finderinfo_len = sizeof(struct DInfo) + + sizeof(struct DXInfo); + u16 file_finderinfo_len = sizeof(struct FInfo) + + sizeof(struct FXInfo); + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (value == NULL) + return hfsplus_removexattr(inode, name); + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); + if (err) { + pr_err("can't init xattr find struct\n"); + return err; + } + + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); + if (err) { + pr_err("catalog searching failed\n"); + goto end_setxattr; + } + + if (!strcmp_xattr_finder_info(name)) { + if (flags & XATTR_CREATE) { + pr_err("xattr exists yet\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + hfs_bnode_read(cat_fd.bnode, &entry, cat_fd.entryoffset, + sizeof(hfsplus_cat_entry)); + if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { + if (size == folder_finderinfo_len) { + memcpy(&entry.folder.user_info, value, + folder_finderinfo_len); + hfs_bnode_write(cat_fd.bnode, &entry, + cat_fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + hfsplus_mark_inode_dirty(inode, + HFSPLUS_I_CAT_DIRTY); + } else { + err = -ERANGE; + goto end_setxattr; + } + } else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { + if (size == file_finderinfo_len) { + memcpy(&entry.file.user_info, value, + file_finderinfo_len); + hfs_bnode_write(cat_fd.bnode, &entry, + cat_fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_mark_inode_dirty(inode, + HFSPLUS_I_CAT_DIRTY); + } else { + err = -ERANGE; + goto end_setxattr; + } + } else { + err = -EOPNOTSUPP; + goto end_setxattr; + } + goto end_setxattr; + } + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) { + err = hfsplus_create_attributes_file(inode->i_sb); + if (unlikely(err)) + goto end_setxattr; + } + + if (hfsplus_attr_exists(inode, name)) { + if (flags & XATTR_CREATE) { + pr_err("xattr exists yet\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + err = hfsplus_delete_attr(inode, name); + if (err) + goto end_setxattr; + err = hfsplus_create_attr(inode, name, value, size); + if (err) + goto end_setxattr; + } else { + if (flags & XATTR_REPLACE) { + pr_err("cannot replace xattr\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + err = hfsplus_create_attr(inode, name, value, size); + if (err) + goto end_setxattr; + } + + cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); + if (cat_entry_type == HFSPLUS_FOLDER) { + cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, + cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags)); + cat_entry_flags |= HFSPLUS_XATTR_EXISTS; + if (!strcmp_xattr_acl(name)) + cat_entry_flags |= HFSPLUS_ACL_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags), + cat_entry_flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else if (cat_entry_type == HFSPLUS_FILE) { + cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, + cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags)); + cat_entry_flags |= HFSPLUS_XATTR_EXISTS; + if (!strcmp_xattr_acl(name)) + cat_entry_flags |= HFSPLUS_ACL_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags), + cat_entry_flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else { + pr_err("invalid catalog entry type\n"); + err = -EIO; + goto end_setxattr; + } + +end_setxattr: + hfs_find_exit(&cat_fd); + return err; +} + +static int name_len(const char *xattr_name, int xattr_name_len) +{ + int len = xattr_name_len + 1; + + if (!is_known_namespace(xattr_name)) + len += XATTR_MAC_OSX_PREFIX_LEN; + + return len; +} + +static int copy_name(char *buffer, const char *xattr_name, int name_len) +{ + int len = name_len; + int offset = 0; + + if (!is_known_namespace(xattr_name)) { + strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); + offset += XATTR_MAC_OSX_PREFIX_LEN; + len += XATTR_MAC_OSX_PREFIX_LEN; + } + + strncpy(buffer + offset, xattr_name, name_len); + memset(buffer + offset + name_len, 0, 1); + len += 1; + + return len; +} + +int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + const char *prefix, size_t prefixlen) +{ + char *xattr_name; + int res; + + if (!strcmp(name, "")) + return -EINVAL; + + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; + strcpy(xattr_name, prefix); + strcpy(xattr_name + prefixlen, name); + res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, + flags); + kfree(xattr_name); + return res; +} + +static ssize_t hfsplus_getxattr_finder_info(struct inode *inode, + void *value, size_t size) +{ + ssize_t res = 0; + struct hfs_find_data fd; + u16 entry_type; + u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); + u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo); + u16 record_len = max(folder_rec_len, file_rec_len); + u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; + u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + + if (size >= record_len) { + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) { + pr_err("can't init xattr find struct\n"); + return res; + } + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto end_getxattr_finder_info; + entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + + if (entry_type == HFSPLUS_FOLDER) { + hfs_bnode_read(fd.bnode, folder_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_folder, user_info), + folder_rec_len); + memcpy(value, folder_finder_info, folder_rec_len); + res = folder_rec_len; + } else if (entry_type == HFSPLUS_FILE) { + hfs_bnode_read(fd.bnode, file_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_file, user_info), + file_rec_len); + memcpy(value, file_finder_info, file_rec_len); + res = file_rec_len; + } else { + res = -EOPNOTSUPP; + goto end_getxattr_finder_info; + } + } else + res = size ? -ERANGE : record_len; + +end_getxattr_finder_info: + if (size >= record_len) + hfs_find_exit(&fd); + return res; +} + +ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, + void *value, size_t size) +{ + struct hfs_find_data fd; + hfsplus_attr_entry *entry; + __be32 xattr_record_type; + u32 record_type; + u16 record_length = 0; + ssize_t res = 0; + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (!strcmp_xattr_finder_info(name)) + return hfsplus_getxattr_finder_info(inode, value, size); + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return -EOPNOTSUPP; + + entry = hfsplus_alloc_attr_entry(); + if (!entry) { + pr_err("can't allocate xattr entry\n"); + return -ENOMEM; + } + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); + if (res) { + pr_err("can't init xattr find struct\n"); + goto failed_getxattr_init; + } + + res = hfsplus_find_attr(inode->i_sb, inode->i_ino, name, &fd); + if (res) { + if (res == -ENOENT) + res = -ENODATA; + else + pr_err("xattr searching failed\n"); + goto out; + } + + hfs_bnode_read(fd.bnode, &xattr_record_type, + fd.entryoffset, sizeof(xattr_record_type)); + record_type = be32_to_cpu(xattr_record_type); + if (record_type == HFSPLUS_ATTR_INLINE_DATA) { + record_length = hfs_bnode_read_u16(fd.bnode, + fd.entryoffset + + offsetof(struct hfsplus_attr_inline_data, + length)); + if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) { + pr_err("invalid xattr record size\n"); + res = -EIO; + goto out; + } + } else if (record_type == HFSPLUS_ATTR_FORK_DATA || + record_type == HFSPLUS_ATTR_EXTENTS) { + pr_err("only inline data xattr are supported\n"); + res = -EOPNOTSUPP; + goto out; + } else { + pr_err("invalid xattr record\n"); + res = -EIO; + goto out; + } + + if (size) { + hfs_bnode_read(fd.bnode, entry, fd.entryoffset, + offsetof(struct hfsplus_attr_inline_data, + raw_bytes) + record_length); + } + + if (size >= record_length) { + memcpy(value, entry->inline_data.raw_bytes, record_length); + res = record_length; + } else + res = size ? -ERANGE : record_length; + +out: + hfs_find_exit(&fd); + +failed_getxattr_init: + hfsplus_destroy_attr_entry(entry); + return res; +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size, + const char *prefix, size_t prefixlen) +{ + int res; + char *xattr_name; + + if (!strcmp(name, "")) + return -EINVAL; + + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; + + strcpy(xattr_name, prefix); + strcpy(xattr_name + prefixlen, name); + + res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size); + kfree(xattr_name); + return res; + +} + +static inline int can_list(const char *xattr_name) +{ + if (!xattr_name) + return 0; + + return strncmp(xattr_name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) || + capable(CAP_SYS_ADMIN); +} + +static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, + char *buffer, size_t size) +{ + ssize_t res = 0; + struct inode *inode = d_inode(dentry); + struct hfs_find_data fd; + u16 entry_type; + u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; + u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + unsigned long len, found_bit; + int xattr_name_len, symbols_count; + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) { + pr_err("can't init xattr find struct\n"); + return res; + } + + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto end_listxattr_finder_info; + + entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + if (entry_type == HFSPLUS_FOLDER) { + len = sizeof(struct DInfo) + sizeof(struct DXInfo); + hfs_bnode_read(fd.bnode, folder_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_folder, user_info), + len); + found_bit = find_first_bit((void *)folder_finder_info, len*8); + } else if (entry_type == HFSPLUS_FILE) { + len = sizeof(struct FInfo) + sizeof(struct FXInfo); + hfs_bnode_read(fd.bnode, file_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_file, user_info), + len); + found_bit = find_first_bit((void *)file_finder_info, len*8); + } else { + res = -EOPNOTSUPP; + goto end_listxattr_finder_info; + } + + if (found_bit >= (len*8)) + res = 0; + else { + symbols_count = sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME) - 1; + xattr_name_len = + name_len(HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count); + if (!buffer || !size) { + if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) + res = xattr_name_len; + } else if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) { + if (size < xattr_name_len) + res = -ERANGE; + else { + res = copy_name(buffer, + HFSPLUS_XATTR_FINDER_INFO_NAME, + symbols_count); + } + } + } + +end_listxattr_finder_info: + hfs_find_exit(&fd); + + return res; +} + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + ssize_t err; + ssize_t res = 0; + struct inode *inode = d_inode(dentry); + struct hfs_find_data fd; + u16 key_len = 0; + struct hfsplus_attr_key attr_key; + char *strbuf; + int xattr_name_len; + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + res = hfsplus_listxattr_finder_info(dentry, buffer, size); + if (res < 0) + return res; + else if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return (res == 0) ? -EOPNOTSUPP : res; + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); + if (err) { + pr_err("can't init xattr find struct\n"); + return err; + } + + strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); + if (!strbuf) { + res = -ENOMEM; + goto out; + } + + err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd); + if (err) { + if (err == -ENOENT) { + if (res == 0) + res = -ENODATA; + goto end_listxattr; + } else { + res = err; + goto end_listxattr; + } + } + + for (;;) { + key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); + if (key_len == 0 || key_len > fd.tree->max_key_len) { + pr_err("invalid xattr key length: %d\n", key_len); + res = -EIO; + goto end_listxattr; + } + + hfs_bnode_read(fd.bnode, &attr_key, + fd.keyoffset, key_len + sizeof(key_len)); + + if (be32_to_cpu(attr_key.cnid) != inode->i_ino) + goto end_listxattr; + + xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; + if (hfsplus_uni2asc(inode->i_sb, + (const struct hfsplus_unistr *)&fd.key->attr.key_name, + strbuf, &xattr_name_len)) { + pr_err("unicode conversion failed\n"); + res = -EIO; + goto end_listxattr; + } + + if (!buffer || !size) { + if (can_list(strbuf)) + res += name_len(strbuf, xattr_name_len); + } else if (can_list(strbuf)) { + if (size < (res + name_len(strbuf, xattr_name_len))) { + res = -ERANGE; + goto end_listxattr; + } else + res += copy_name(buffer + res, + strbuf, xattr_name_len); + } + + if (hfs_brec_goto(&fd, 1)) + goto end_listxattr; + } + +end_listxattr: + kfree(strbuf); +out: + hfs_find_exit(&fd); + return res; +} + +static int hfsplus_removexattr(struct inode *inode, const char *name) +{ + int err = 0; + struct hfs_find_data cat_fd; + u16 flags; + u16 cat_entry_type; + int is_xattr_acl_deleted = 0; + int is_all_xattrs_deleted = 0; + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return -EOPNOTSUPP; + + if (!strcmp_xattr_finder_info(name)) + return -EOPNOTSUPP; + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); + if (err) { + pr_err("can't init xattr find struct\n"); + return err; + } + + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); + if (err) { + pr_err("catalog searching failed\n"); + goto end_removexattr; + } + + err = hfsplus_delete_attr(inode, name); + if (err) + goto end_removexattr; + + is_xattr_acl_deleted = !strcmp_xattr_acl(name); + is_all_xattrs_deleted = !hfsplus_attr_exists(inode, NULL); + + if (!is_xattr_acl_deleted && !is_all_xattrs_deleted) + goto end_removexattr; + + cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); + + if (cat_entry_type == HFSPLUS_FOLDER) { + flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags)); + if (is_xattr_acl_deleted) + flags &= ~HFSPLUS_ACL_EXISTS; + if (is_all_xattrs_deleted) + flags &= ~HFSPLUS_XATTR_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags), + flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else if (cat_entry_type == HFSPLUS_FILE) { + flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags)); + if (is_xattr_acl_deleted) + flags &= ~HFSPLUS_ACL_EXISTS; + if (is_all_xattrs_deleted) + flags &= ~HFSPLUS_XATTR_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags), + flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else { + pr_err("invalid catalog entry type\n"); + err = -EIO; + goto end_removexattr; + } + +end_removexattr: + hfs_find_exit(&cat_fd); + return err; +} + +static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "osx." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + + /* + * osx is the namespace we use to indicate an unprefixed + * attribute on the filesystem (like the ones that OS X + * creates), so we pass the name through unmodified (after + * ensuring it doesn't conflict with another namespace). + */ + return __hfsplus_getxattr(d_inode(dentry), name, buffer, size); +} + +static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + + /* + * Don't allow setting properly prefixed attributes + * by prepending them with "osx." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + + /* + * osx is the namespace we use to indicate an unprefixed + * attribute on the filesystem (like the ones that OS X + * creates), so we pass the name through unmodified (after + * ensuring it doesn't conflict with another namespace). + */ + return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); +} + +static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_osx_handler = { + .prefix = XATTR_MAC_OSX_PREFIX, + .list = hfsplus_osx_listxattr, + .get = hfsplus_osx_getxattr, + .set = hfsplus_osx_setxattr, +}; diff --git a/kernel/fs/hfsplus/xattr.h b/kernel/fs/hfsplus/xattr.h new file mode 100644 index 000000000..f9b0955b3 --- /dev/null +++ b/kernel/fs/hfsplus/xattr.h @@ -0,0 +1,43 @@ +/* + * linux/fs/hfsplus/xattr.h + * + * Vyacheslav Dubeyko + * + * Logic of processing extended attributes + */ + +#ifndef _LINUX_HFSPLUS_XATTR_H +#define _LINUX_HFSPLUS_XATTR_H + +#include + +extern const struct xattr_handler hfsplus_xattr_osx_handler; +extern const struct xattr_handler hfsplus_xattr_user_handler; +extern const struct xattr_handler hfsplus_xattr_trusted_handler; +extern const struct xattr_handler hfsplus_xattr_security_handler; + +extern const struct xattr_handler *hfsplus_xattr_handlers[]; + +int __hfsplus_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags); + +int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + const char *prefix, size_t prefixlen); + +ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, + void *value, size_t size); + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size, + const char *prefix, size_t prefixlen); + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); + +int hfsplus_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); + +int hfsplus_init_inode_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); + +#endif diff --git a/kernel/fs/hfsplus/xattr_security.c b/kernel/fs/hfsplus/xattr_security.c new file mode 100644 index 000000000..aacff00a9 --- /dev/null +++ b/kernel/fs/hfsplus/xattr_security.c @@ -0,0 +1,98 @@ +/* + * linux/fs/hfsplus/xattr_trusted.c + * + * Vyacheslav Dubeyko + * + * Handler for storing security labels as extended attributes. + */ + +#include +#include + +#include "hfsplus_fs.h" +#include "xattr.h" +#include "acl.h" + +static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + return hfsplus_getxattr(dentry, name, buffer, size, + XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); +} + +static int hfsplus_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + return hfsplus_setxattr(dentry, name, buffer, size, flags, + XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); +} + +static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +static int hfsplus_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + char *xattr_name; + int err = 0; + + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + + if (!strcmp(xattr->name, "")) + continue; + + strcpy(xattr_name, XATTR_SECURITY_PREFIX); + strcpy(xattr_name + + XATTR_SECURITY_PREFIX_LEN, xattr->name); + memset(xattr_name + + XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name), 0, 1); + + err = __hfsplus_setxattr(inode, xattr_name, + xattr->value, xattr->value_len, 0); + if (err) + break; + } + kfree(xattr_name); + return err; +} + +int hfsplus_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &hfsplus_initxattrs, NULL); +} + +int hfsplus_init_inode_security(struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = hfsplus_init_posix_acl(inode, dir); + if (!err) + err = hfsplus_init_security(inode, dir, qstr); + return err; +} + +const struct xattr_handler hfsplus_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = hfsplus_security_listxattr, + .get = hfsplus_security_getxattr, + .set = hfsplus_security_setxattr, +}; diff --git a/kernel/fs/hfsplus/xattr_trusted.c b/kernel/fs/hfsplus/xattr_trusted.c new file mode 100644 index 000000000..bcf65089b --- /dev/null +++ b/kernel/fs/hfsplus/xattr_trusted.c @@ -0,0 +1,44 @@ +/* + * linux/fs/hfsplus/xattr_trusted.c + * + * Vyacheslav Dubeyko + * + * Handler for trusted extended attributes. + */ + +#include + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + return hfsplus_getxattr(dentry, name, buffer, size, + XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN); +} + +static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + return hfsplus_setxattr(dentry, name, buffer, size, flags, + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); +} + +static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = hfsplus_trusted_listxattr, + .get = hfsplus_trusted_getxattr, + .set = hfsplus_trusted_setxattr, +}; diff --git a/kernel/fs/hfsplus/xattr_user.c b/kernel/fs/hfsplus/xattr_user.c new file mode 100644 index 000000000..5aa0e6dc4 --- /dev/null +++ b/kernel/fs/hfsplus/xattr_user.c @@ -0,0 +1,44 @@ +/* + * linux/fs/hfsplus/xattr_user.c + * + * Vyacheslav Dubeyko + * + * Handler for user extended attributes. + */ + +#include + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + + return hfsplus_getxattr(dentry, name, buffer, size, + XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +static int hfsplus_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + return hfsplus_setxattr(dentry, name, buffer, size, flags, + XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = hfsplus_user_listxattr, + .get = hfsplus_user_getxattr, + .set = hfsplus_user_setxattr, +}; diff --git a/kernel/fs/hostfs/Makefile b/kernel/fs/hostfs/Makefile new file mode 100644 index 000000000..d5beaffad --- /dev/null +++ b/kernel/fs/hostfs/Makefile @@ -0,0 +1,11 @@ +# +# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) +# Licensed under the GPL +# + +hostfs-objs := hostfs_kern.o hostfs_user.o + +obj-y := +obj-$(CONFIG_HOSTFS) += hostfs.o + +include arch/um/scripts/Makefile.rules diff --git a/kernel/fs/hostfs/hostfs.h b/kernel/fs/hostfs/hostfs.h new file mode 100644 index 000000000..91e19f9df --- /dev/null +++ b/kernel/fs/hostfs/hostfs.h @@ -0,0 +1,98 @@ +#ifndef __UM_FS_HOSTFS +#define __UM_FS_HOSTFS + +#include + +/* + * These are exactly the same definitions as in fs.h, but the names are + * changed so that this file can be included in both kernel and user files. + */ + +#define HOSTFS_ATTR_MODE 1 +#define HOSTFS_ATTR_UID 2 +#define HOSTFS_ATTR_GID 4 +#define HOSTFS_ATTR_SIZE 8 +#define HOSTFS_ATTR_ATIME 16 +#define HOSTFS_ATTR_MTIME 32 +#define HOSTFS_ATTR_CTIME 64 +#define HOSTFS_ATTR_ATIME_SET 128 +#define HOSTFS_ATTR_MTIME_SET 256 + +/* These two are unused by hostfs. */ +#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ +#define HOSTFS_ATTR_ATTR_FLAG 1024 + +/* + * If you are very careful, you'll notice that these two are missing: + * + * #define ATTR_KILL_SUID 2048 + * #define ATTR_KILL_SGID 4096 + * + * and this is because they were added in 2.5 development. + * Actually, they are not needed by most ->setattr() methods - they are set by + * callers of notify_change() to notify that the setuid/setgid bits must be + * dropped. + * notify_change() will delete those flags, make sure attr->ia_valid & ATTR_MODE + * is on, and remove the appropriate bits from attr->ia_mode (attr is a + * "struct iattr *"). -BlaisorBlade + */ + +struct hostfs_iattr { + unsigned int ia_valid; + unsigned short ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; + struct timespec ia_ctime; +}; + +struct hostfs_stat { + unsigned long long ino; + unsigned int mode; + unsigned int nlink; + unsigned int uid; + unsigned int gid; + unsigned long long size; + struct timespec atime, mtime, ctime; + unsigned int blksize; + unsigned long long blocks; + unsigned int maj; + unsigned int min; +}; + +extern int stat_file(const char *path, struct hostfs_stat *p, int fd); +extern int access_file(char *path, int r, int w, int x); +extern int open_file(char *path, int r, int w, int append); +extern void *open_dir(char *path, int *err_out); +extern void seek_dir(void *stream, unsigned long long pos); +extern char *read_dir(void *stream, unsigned long long *pos_out, + unsigned long long *ino_out, int *len_out, + unsigned int *type_out); +extern void close_file(void *stream); +extern int replace_file(int oldfd, int fd); +extern void close_dir(void *stream); +extern int read_file(int fd, unsigned long long *offset, char *buf, int len); +extern int write_file(int fd, unsigned long long *offset, const char *buf, + int len); +extern int lseek_file(int fd, long long offset, int whence); +extern int fsync_file(int fd, int datasync); +extern int file_create(char *name, int mode); +extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd); +extern int make_symlink(const char *from, const char *to); +extern int unlink_file(const char *file); +extern int do_mkdir(const char *file, int mode); +extern int do_rmdir(const char *file); +extern int do_mknod(const char *file, int mode, unsigned int major, + unsigned int minor); +extern int link_file(const char *from, const char *to); +extern int hostfs_do_readlink(char *file, char *buf, int size); +extern int rename_file(char *from, char *to); +extern int rename2_file(char *from, char *to, unsigned int flags); +extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, + long long *bfree_out, long long *bavail_out, + long long *files_out, long long *ffree_out, + void *fsid_out, int fsid_size, long *namelen_out); + +#endif diff --git a/kernel/fs/hostfs/hostfs_kern.c b/kernel/fs/hostfs/hostfs_kern.c new file mode 100644 index 000000000..07d8d8f52 --- /dev/null +++ b/kernel/fs/hostfs/hostfs_kern.c @@ -0,0 +1,1023 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + * + * Ported the filesystem routines to 2.5. + * 2003-02-10 Petr Baudis + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hostfs.h" +#include +#include + +struct hostfs_inode_info { + int fd; + fmode_t mode; + struct inode vfs_inode; + struct mutex open_mutex; +}; + +static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) +{ + return list_entry(inode, struct hostfs_inode_info, vfs_inode); +} + +#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file)) + +/* Changed in hostfs_args before the kernel starts running */ +static char *root_ino = ""; +static int append = 0; + +static const struct inode_operations hostfs_iops; +static const struct inode_operations hostfs_dir_iops; +static const struct inode_operations hostfs_link_iops; + +#ifndef MODULE +static int __init hostfs_args(char *options, int *add) +{ + char *ptr; + + ptr = strchr(options, ','); + if (ptr != NULL) + *ptr++ = '\0'; + if (*options != '\0') + root_ino = options; + + options = ptr; + while (options) { + ptr = strchr(options, ','); + if (ptr != NULL) + *ptr++ = '\0'; + if (*options != '\0') { + if (!strcmp(options, "append")) + append = 1; + else printf("hostfs_args - unsupported option - %s\n", + options); + } + options = ptr; + } + return 0; +} + +__uml_setup("hostfs=", hostfs_args, +"hostfs=,,...\n" +" This is used to set hostfs parameters. The root directory argument\n" +" is used to confine all hostfs mounts to within the specified directory\n" +" tree on the host. If this isn't specified, then a user inside UML can\n" +" mount anything on the host that's accessible to the user that's running\n" +" it.\n" +" The only flag currently supported is 'append', which specifies that all\n" +" files opened by hostfs will be opened in append mode.\n\n" +); +#endif + +static char *__dentry_name(struct dentry *dentry, char *name) +{ + char *p = dentry_path_raw(dentry, name, PATH_MAX); + char *root; + size_t len; + + root = dentry->d_sb->s_fs_info; + len = strlen(root); + if (IS_ERR(p)) { + __putname(name); + return NULL; + } + + /* + * This function relies on the fact that dentry_path_raw() will place + * the path name at the end of the provided buffer. + */ + BUG_ON(p + strlen(p) + 1 != name + PATH_MAX); + + strlcpy(name, root, PATH_MAX); + if (len > p - name) { + __putname(name); + return NULL; + } + + if (p > name + len) + strcpy(name + len, p); + + return name; +} + +static char *dentry_name(struct dentry *dentry) +{ + char *name = __getname(); + if (!name) + return NULL; + + return __dentry_name(dentry, name); +} + +static char *inode_name(struct inode *ino) +{ + struct dentry *dentry; + char *name; + + dentry = d_find_alias(ino); + if (!dentry) + return NULL; + + name = dentry_name(dentry); + + dput(dentry); + + return name; +} + +static char *follow_link(char *link) +{ + int len, n; + char *name, *resolved, *end; + + name = __getname(); + if (!name) { + n = -ENOMEM; + goto out_free; + } + + n = hostfs_do_readlink(link, name, PATH_MAX); + if (n < 0) + goto out_free; + else if (n == PATH_MAX) { + n = -E2BIG; + goto out_free; + } + + if (*name == '/') + return name; + + end = strrchr(link, '/'); + if (end == NULL) + return name; + + *(end + 1) = '\0'; + len = strlen(link) + strlen(name) + 1; + + resolved = kmalloc(len, GFP_KERNEL); + if (resolved == NULL) { + n = -ENOMEM; + goto out_free; + } + + sprintf(resolved, "%s%s", link, name); + __putname(name); + kfree(link); + return resolved; + + out_free: + __putname(name); + return ERR_PTR(n); +} + +static struct inode *hostfs_iget(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + return inode; +} + +static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) +{ + /* + * do_statfs uses struct statfs64 internally, but the linux kernel + * struct statfs still has 32-bit versions for most of these fields, + * so we convert them here + */ + int err; + long long f_blocks; + long long f_bfree; + long long f_bavail; + long long f_files; + long long f_ffree; + + err = do_statfs(dentry->d_sb->s_fs_info, + &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, + &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), + &sf->f_namelen); + if (err) + return err; + sf->f_blocks = f_blocks; + sf->f_bfree = f_bfree; + sf->f_bavail = f_bavail; + sf->f_files = f_files; + sf->f_ffree = f_ffree; + sf->f_type = HOSTFS_SUPER_MAGIC; + return 0; +} + +static struct inode *hostfs_alloc_inode(struct super_block *sb) +{ + struct hostfs_inode_info *hi; + + hi = kmalloc(sizeof(*hi), GFP_KERNEL); + if (hi == NULL) + return NULL; + hi->fd = -1; + hi->mode = 0; + inode_init_once(&hi->vfs_inode); + mutex_init(&hi->open_mutex); + return &hi->vfs_inode; +} + +static void hostfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (HOSTFS_I(inode)->fd != -1) { + close_file(&HOSTFS_I(inode)->fd); + HOSTFS_I(inode)->fd = -1; + } +} + +static void hostfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kfree(HOSTFS_I(inode)); +} + +static void hostfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hostfs_i_callback); +} + +static int hostfs_show_options(struct seq_file *seq, struct dentry *root) +{ + const char *root_path = root->d_sb->s_fs_info; + size_t offset = strlen(root_ino) + 1; + + if (strlen(root_path) > offset) + seq_printf(seq, ",%s", root_path + offset); + + if (append) + seq_puts(seq, ",append"); + + return 0; +} + +static const struct super_operations hostfs_sbops = { + .alloc_inode = hostfs_alloc_inode, + .destroy_inode = hostfs_destroy_inode, + .evict_inode = hostfs_evict_inode, + .statfs = hostfs_statfs, + .show_options = hostfs_show_options, +}; + +static int hostfs_readdir(struct file *file, struct dir_context *ctx) +{ + void *dir; + char *name; + unsigned long long next, ino; + int error, len; + unsigned int type; + + name = dentry_name(file->f_path.dentry); + if (name == NULL) + return -ENOMEM; + dir = open_dir(name, &error); + __putname(name); + if (dir == NULL) + return -error; + next = ctx->pos; + seek_dir(dir, next); + while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { + if (!dir_emit(ctx, name, len, ino, type)) + break; + ctx->pos = next; + } + close_dir(dir); + return 0; +} + +static int hostfs_open(struct inode *ino, struct file *file) +{ + char *name; + fmode_t mode; + int err; + int r, w, fd; + + mode = file->f_mode & (FMODE_READ | FMODE_WRITE); + if ((mode & HOSTFS_I(ino)->mode) == mode) + return 0; + + mode |= HOSTFS_I(ino)->mode; + +retry: + r = w = 0; + + if (mode & FMODE_READ) + r = 1; + if (mode & FMODE_WRITE) + r = w = 1; + + name = dentry_name(file->f_path.dentry); + if (name == NULL) + return -ENOMEM; + + fd = open_file(name, r, w, append); + __putname(name); + if (fd < 0) + return fd; + + mutex_lock(&HOSTFS_I(ino)->open_mutex); + /* somebody else had handled it first? */ + if ((mode & HOSTFS_I(ino)->mode) == mode) { + mutex_unlock(&HOSTFS_I(ino)->open_mutex); + close_file(&fd); + return 0; + } + if ((mode | HOSTFS_I(ino)->mode) != mode) { + mode |= HOSTFS_I(ino)->mode; + mutex_unlock(&HOSTFS_I(ino)->open_mutex); + close_file(&fd); + goto retry; + } + if (HOSTFS_I(ino)->fd == -1) { + HOSTFS_I(ino)->fd = fd; + } else { + err = replace_file(fd, HOSTFS_I(ino)->fd); + close_file(&fd); + if (err < 0) { + mutex_unlock(&HOSTFS_I(ino)->open_mutex); + return err; + } + } + HOSTFS_I(ino)->mode = mode; + mutex_unlock(&HOSTFS_I(ino)->open_mutex); + + return 0; +} + +static int hostfs_file_release(struct inode *inode, struct file *file) +{ + filemap_write_and_wait(inode->i_mapping); + + return 0; +} + +static int hostfs_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + ret = fsync_file(HOSTFS_I(inode)->fd, datasync); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +static const struct file_operations hostfs_file_fops = { + .llseek = generic_file_llseek, + .splice_read = generic_file_splice_read, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .open = hostfs_open, + .release = hostfs_file_release, + .fsync = hostfs_fsync, +}; + +static const struct file_operations hostfs_dir_fops = { + .llseek = generic_file_llseek, + .iterate = hostfs_readdir, + .read = generic_read_dir, + .open = hostfs_open, + .fsync = hostfs_fsync, +}; + +static int hostfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *buffer; + loff_t base = page_offset(page); + int count = PAGE_CACHE_SIZE; + int end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int err; + + if (page->index >= end_index) + count = inode->i_size & (PAGE_CACHE_SIZE-1); + + buffer = kmap(page); + + err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); + if (err != count) { + ClearPageUptodate(page); + goto out; + } + + if (base > inode->i_size) + inode->i_size = base; + + if (PageError(page)) + ClearPageError(page); + err = 0; + + out: + kunmap(page); + + unlock_page(page); + return err; +} + +static int hostfs_readpage(struct file *file, struct page *page) +{ + char *buffer; + loff_t start = page_offset(page); + int bytes_read, ret = 0; + + buffer = kmap(page); + bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, + PAGE_CACHE_SIZE); + if (bytes_read < 0) { + ClearPageUptodate(page); + SetPageError(page); + ret = bytes_read; + goto out; + } + + memset(buffer + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read); + + ClearPageError(page); + SetPageUptodate(page); + + out: + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + return ret; +} + +static int hostfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + *pagep = grab_cache_page_write_begin(mapping, index, flags); + if (!*pagep) + return -ENOMEM; + return 0; +} + +static int hostfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + void *buffer; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int err; + + buffer = kmap(page); + err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied); + kunmap(page); + + if (!PageUptodate(page) && err == PAGE_CACHE_SIZE) + SetPageUptodate(page); + + /* + * If err > 0, write_file has added err to pos, so we are comparing + * i_size against the last byte written. + */ + if (err > 0 && (pos > inode->i_size)) + inode->i_size = pos; + unlock_page(page); + page_cache_release(page); + + return err; +} + +static const struct address_space_operations hostfs_aops = { + .writepage = hostfs_writepage, + .readpage = hostfs_readpage, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = hostfs_write_begin, + .write_end = hostfs_write_end, +}; + +static int read_name(struct inode *ino, char *name) +{ + dev_t rdev; + struct hostfs_stat st; + int err = stat_file(name, &st, -1); + if (err) + return err; + + /* Reencode maj and min with the kernel encoding.*/ + rdev = MKDEV(st.maj, st.min); + + switch (st.mode & S_IFMT) { + case S_IFLNK: + ino->i_op = &hostfs_link_iops; + break; + case S_IFDIR: + ino->i_op = &hostfs_dir_iops; + ino->i_fop = &hostfs_dir_fops; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + init_special_inode(ino, st.mode & S_IFMT, rdev); + ino->i_op = &hostfs_iops; + break; + case S_IFREG: + ino->i_op = &hostfs_iops; + ino->i_fop = &hostfs_file_fops; + ino->i_mapping->a_ops = &hostfs_aops; + break; + default: + return -EIO; + } + + ino->i_ino = st.ino; + ino->i_mode = st.mode; + set_nlink(ino, st.nlink); + i_uid_write(ino, st.uid); + i_gid_write(ino, st.gid); + ino->i_atime = st.atime; + ino->i_mtime = st.mtime; + ino->i_ctime = st.ctime; + ino->i_size = st.size; + ino->i_blocks = st.blocks; + return 0; +} + +static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + char *name; + int error, fd; + + inode = hostfs_iget(dir->i_sb); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto out; + } + + error = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + fd = file_create(name, mode & 0777); + if (fd < 0) + error = fd; + else + error = read_name(inode, name); + + __putname(name); + if (error) + goto out_put; + + HOSTFS_I(inode)->fd = fd; + HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE; + d_instantiate(dentry, inode); + return 0; + + out_put: + iput(inode); + out: + return error; +} + +static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode; + char *name; + int err; + + inode = hostfs_iget(ino->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + err = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + err = read_name(inode, name); + + __putname(name); + if (err == -ENOENT) { + iput(inode); + inode = NULL; + } + else if (err) + goto out_put; + + d_add(dentry, inode); + return NULL; + + out_put: + iput(inode); + out: + return ERR_PTR(err); +} + +static int hostfs_link(struct dentry *to, struct inode *ino, + struct dentry *from) +{ + char *from_name, *to_name; + int err; + + if ((from_name = dentry_name(from)) == NULL) + return -ENOMEM; + to_name = dentry_name(to); + if (to_name == NULL) { + __putname(from_name); + return -ENOMEM; + } + err = link_file(to_name, from_name); + __putname(from_name); + __putname(to_name); + return err; +} + +static int hostfs_unlink(struct inode *ino, struct dentry *dentry) +{ + char *file; + int err; + + if (append) + return -EPERM; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + + err = unlink_file(file); + __putname(file); + return err; +} + +static int hostfs_symlink(struct inode *ino, struct dentry *dentry, + const char *to) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + err = make_symlink(file, to); + __putname(file); + return err; +} + +static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + err = do_mkdir(file, mode); + __putname(file); + return err; +} + +static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + err = do_rmdir(file); + __putname(file); + return err; +} + +static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +{ + struct inode *inode; + char *name; + int err; + + inode = hostfs_iget(dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + err = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + init_special_inode(inode, mode, dev); + err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); + if (!err) + goto out_free; + + err = read_name(inode, name); + __putname(name); + if (err) + goto out_put; + if (err) + goto out_put; + + d_instantiate(dentry, inode); + return 0; + + out_free: + __putname(name); + out_put: + iput(inode); + out: + return err; +} + +static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + char *old_name, *new_name; + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + old_name = dentry_name(old_dentry); + if (old_name == NULL) + return -ENOMEM; + new_name = dentry_name(new_dentry); + if (new_name == NULL) { + __putname(old_name); + return -ENOMEM; + } + if (!flags) + err = rename_file(old_name, new_name); + else + err = rename2_file(old_name, new_name, flags); + + __putname(old_name); + __putname(new_name); + return err; +} + +static int hostfs_permission(struct inode *ino, int desired) +{ + char *name; + int r = 0, w = 0, x = 0, err; + + if (desired & MAY_NOT_BLOCK) + return -ECHILD; + + if (desired & MAY_READ) r = 1; + if (desired & MAY_WRITE) w = 1; + if (desired & MAY_EXEC) x = 1; + name = inode_name(ino); + if (name == NULL) + return -ENOMEM; + + if (S_ISCHR(ino->i_mode) || S_ISBLK(ino->i_mode) || + S_ISFIFO(ino->i_mode) || S_ISSOCK(ino->i_mode)) + err = 0; + else + err = access_file(name, r, w, x); + __putname(name); + if (!err) + err = generic_permission(ino, desired); + return err; +} + +static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct hostfs_iattr attrs; + char *name; + int err; + + int fd = HOSTFS_I(inode)->fd; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (append) + attr->ia_valid &= ~ATTR_SIZE; + + attrs.ia_valid = 0; + if (attr->ia_valid & ATTR_MODE) { + attrs.ia_valid |= HOSTFS_ATTR_MODE; + attrs.ia_mode = attr->ia_mode; + } + if (attr->ia_valid & ATTR_UID) { + attrs.ia_valid |= HOSTFS_ATTR_UID; + attrs.ia_uid = from_kuid(&init_user_ns, attr->ia_uid); + } + if (attr->ia_valid & ATTR_GID) { + attrs.ia_valid |= HOSTFS_ATTR_GID; + attrs.ia_gid = from_kgid(&init_user_ns, attr->ia_gid); + } + if (attr->ia_valid & ATTR_SIZE) { + attrs.ia_valid |= HOSTFS_ATTR_SIZE; + attrs.ia_size = attr->ia_size; + } + if (attr->ia_valid & ATTR_ATIME) { + attrs.ia_valid |= HOSTFS_ATTR_ATIME; + attrs.ia_atime = attr->ia_atime; + } + if (attr->ia_valid & ATTR_MTIME) { + attrs.ia_valid |= HOSTFS_ATTR_MTIME; + attrs.ia_mtime = attr->ia_mtime; + } + if (attr->ia_valid & ATTR_CTIME) { + attrs.ia_valid |= HOSTFS_ATTR_CTIME; + attrs.ia_ctime = attr->ia_ctime; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; + } + if (attr->ia_valid & ATTR_MTIME_SET) { + attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; + } + name = dentry_name(dentry); + if (name == NULL) + return -ENOMEM; + err = set_attr(name, &attrs, fd); + __putname(name); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) + truncate_setsize(inode, attr->ia_size); + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static const struct inode_operations hostfs_iops = { + .permission = hostfs_permission, + .setattr = hostfs_setattr, +}; + +static const struct inode_operations hostfs_dir_iops = { + .create = hostfs_create, + .lookup = hostfs_lookup, + .link = hostfs_link, + .unlink = hostfs_unlink, + .symlink = hostfs_symlink, + .mkdir = hostfs_mkdir, + .rmdir = hostfs_rmdir, + .mknod = hostfs_mknod, + .rename2 = hostfs_rename2, + .permission = hostfs_permission, + .setattr = hostfs_setattr, +}; + +static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *link = __getname(); + if (link) { + char *path = dentry_name(dentry); + int err = -ENOMEM; + if (path) { + err = hostfs_do_readlink(path, link, PATH_MAX); + if (err == PATH_MAX) + err = -E2BIG; + __putname(path); + } + if (err < 0) { + __putname(link); + link = ERR_PTR(err); + } + } else { + link = ERR_PTR(-ENOMEM); + } + + nd_set_link(nd, link); + return NULL; +} + +static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); +} + +static const struct inode_operations hostfs_link_iops = { + .readlink = generic_readlink, + .follow_link = hostfs_follow_link, + .put_link = hostfs_put_link, +}; + +static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) +{ + struct inode *root_inode; + char *host_root_path, *req_root = d; + int err; + + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = HOSTFS_SUPER_MAGIC; + sb->s_op = &hostfs_sbops; + sb->s_d_op = &simple_dentry_operations; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + /* NULL is printed as by sprintf: avoid that. */ + if (req_root == NULL) + req_root = ""; + + err = -ENOMEM; + sb->s_fs_info = host_root_path = + kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL); + if (host_root_path == NULL) + goto out; + + sprintf(host_root_path, "%s/%s", root_ino, req_root); + + root_inode = new_inode(sb); + if (!root_inode) + goto out; + + err = read_name(root_inode, host_root_path); + if (err) + goto out_put; + + if (S_ISLNK(root_inode->i_mode)) { + char *name = follow_link(host_root_path); + if (IS_ERR(name)) + err = PTR_ERR(name); + else + err = read_name(root_inode, name); + kfree(name); + if (err) + goto out_put; + } + + err = -ENOMEM; + sb->s_root = d_make_root(root_inode); + if (sb->s_root == NULL) + goto out; + + return 0; + +out_put: + iput(root_inode); +out: + return err; +} + +static struct dentry *hostfs_read_sb(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + return mount_nodev(type, flags, data, hostfs_fill_sb_common); +} + +static void hostfs_kill_sb(struct super_block *s) +{ + kill_anon_super(s); + kfree(s->s_fs_info); +} + +static struct file_system_type hostfs_type = { + .owner = THIS_MODULE, + .name = "hostfs", + .mount = hostfs_read_sb, + .kill_sb = hostfs_kill_sb, + .fs_flags = 0, +}; +MODULE_ALIAS_FS("hostfs"); + +static int __init init_hostfs(void) +{ + return register_filesystem(&hostfs_type); +} + +static void __exit exit_hostfs(void) +{ + unregister_filesystem(&hostfs_type); +} + +module_init(init_hostfs) +module_exit(exit_hostfs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/hostfs/hostfs_user.c b/kernel/fs/hostfs/hostfs_user.c new file mode 100644 index 000000000..9c1e0f019 --- /dev/null +++ b/kernel/fs/hostfs/hostfs_user.c @@ -0,0 +1,410 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hostfs.h" +#include + +static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) +{ + p->ino = buf->st_ino; + p->mode = buf->st_mode; + p->nlink = buf->st_nlink; + p->uid = buf->st_uid; + p->gid = buf->st_gid; + p->size = buf->st_size; + p->atime.tv_sec = buf->st_atime; + p->atime.tv_nsec = 0; + p->ctime.tv_sec = buf->st_ctime; + p->ctime.tv_nsec = 0; + p->mtime.tv_sec = buf->st_mtime; + p->mtime.tv_nsec = 0; + p->blksize = buf->st_blksize; + p->blocks = buf->st_blocks; + p->maj = os_major(buf->st_rdev); + p->min = os_minor(buf->st_rdev); +} + +int stat_file(const char *path, struct hostfs_stat *p, int fd) +{ + struct stat64 buf; + + if (fd >= 0) { + if (fstat64(fd, &buf) < 0) + return -errno; + } else if (lstat64(path, &buf) < 0) { + return -errno; + } + stat64_to_hostfs(&buf, p); + return 0; +} + +int access_file(char *path, int r, int w, int x) +{ + int mode = 0; + + if (r) + mode = R_OK; + if (w) + mode |= W_OK; + if (x) + mode |= X_OK; + if (access(path, mode) != 0) + return -errno; + else return 0; +} + +int open_file(char *path, int r, int w, int append) +{ + int mode = 0, fd; + + if (r && !w) + mode = O_RDONLY; + else if (!r && w) + mode = O_WRONLY; + else if (r && w) + mode = O_RDWR; + else panic("Impossible mode in open_file"); + + if (append) + mode |= O_APPEND; + fd = open64(path, mode); + if (fd < 0) + return -errno; + else return fd; +} + +void *open_dir(char *path, int *err_out) +{ + DIR *dir; + + dir = opendir(path); + *err_out = errno; + + return dir; +} + +void seek_dir(void *stream, unsigned long long pos) +{ + DIR *dir = stream; + + seekdir(dir, pos); +} + +char *read_dir(void *stream, unsigned long long *pos_out, + unsigned long long *ino_out, int *len_out, + unsigned int *type_out) +{ + DIR *dir = stream; + struct dirent *ent; + + ent = readdir(dir); + if (ent == NULL) + return NULL; + *len_out = strlen(ent->d_name); + *ino_out = ent->d_ino; + *type_out = ent->d_type; + *pos_out = ent->d_off; + return ent->d_name; +} + +int read_file(int fd, unsigned long long *offset, char *buf, int len) +{ + int n; + + n = pread64(fd, buf, len, *offset); + if (n < 0) + return -errno; + *offset += n; + return n; +} + +int write_file(int fd, unsigned long long *offset, const char *buf, int len) +{ + int n; + + n = pwrite64(fd, buf, len, *offset); + if (n < 0) + return -errno; + *offset += n; + return n; +} + +int lseek_file(int fd, long long offset, int whence) +{ + int ret; + + ret = lseek64(fd, offset, whence); + if (ret < 0) + return -errno; + return 0; +} + +int fsync_file(int fd, int datasync) +{ + int ret; + if (datasync) + ret = fdatasync(fd); + else + ret = fsync(fd); + + if (ret < 0) + return -errno; + return 0; +} + +int replace_file(int oldfd, int fd) +{ + return dup2(oldfd, fd); +} + +void close_file(void *stream) +{ + close(*((int *) stream)); +} + +void close_dir(void *stream) +{ + closedir(stream); +} + +int file_create(char *name, int mode) +{ + int fd; + + fd = open64(name, O_CREAT | O_RDWR, mode); + if (fd < 0) + return -errno; + return fd; +} + +int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) +{ + struct hostfs_stat st; + struct timeval times[2]; + int err, ma; + + if (attrs->ia_valid & HOSTFS_ATTR_MODE) { + if (fd >= 0) { + if (fchmod(fd, attrs->ia_mode) != 0) + return -errno; + } else if (chmod(file, attrs->ia_mode) != 0) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_UID) { + if (fd >= 0) { + if (fchown(fd, attrs->ia_uid, -1)) + return -errno; + } else if (chown(file, attrs->ia_uid, -1)) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_GID) { + if (fd >= 0) { + if (fchown(fd, -1, attrs->ia_gid)) + return -errno; + } else if (chown(file, -1, attrs->ia_gid)) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_SIZE) { + if (fd >= 0) { + if (ftruncate(fd, attrs->ia_size)) + return -errno; + } else if (truncate(file, attrs->ia_size)) { + return -errno; + } + } + + /* + * Update accessed and/or modified time, in two parts: first set + * times according to the changes to perform, and then call futimes() + * or utimes() to apply them. + */ + ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET); + if (attrs->ia_valid & ma) { + err = stat_file(file, &st, fd); + if (err != 0) + return err; + + times[0].tv_sec = st.atime.tv_sec; + times[0].tv_usec = st.atime.tv_nsec / 1000; + times[1].tv_sec = st.mtime.tv_sec; + times[1].tv_usec = st.mtime.tv_nsec / 1000; + + if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) { + times[0].tv_sec = attrs->ia_atime.tv_sec; + times[0].tv_usec = attrs->ia_atime.tv_nsec / 1000; + } + if (attrs->ia_valid & HOSTFS_ATTR_MTIME_SET) { + times[1].tv_sec = attrs->ia_mtime.tv_sec; + times[1].tv_usec = attrs->ia_mtime.tv_nsec / 1000; + } + + if (fd >= 0) { + if (futimes(fd, times) != 0) + return -errno; + } else if (utimes(file, times) != 0) { + return -errno; + } + } + + /* Note: ctime is not handled */ + if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) { + err = stat_file(file, &st, fd); + attrs->ia_atime = st.atime; + attrs->ia_mtime = st.mtime; + if (err != 0) + return err; + } + return 0; +} + +int make_symlink(const char *from, const char *to) +{ + int err; + + err = symlink(to, from); + if (err) + return -errno; + return 0; +} + +int unlink_file(const char *file) +{ + int err; + + err = unlink(file); + if (err) + return -errno; + return 0; +} + +int do_mkdir(const char *file, int mode) +{ + int err; + + err = mkdir(file, mode); + if (err) + return -errno; + return 0; +} + +int do_rmdir(const char *file) +{ + int err; + + err = rmdir(file); + if (err) + return -errno; + return 0; +} + +int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor) +{ + int err; + + err = mknod(file, mode, os_makedev(major, minor)); + if (err) + return -errno; + return 0; +} + +int link_file(const char *to, const char *from) +{ + int err; + + err = link(to, from); + if (err) + return -errno; + return 0; +} + +int hostfs_do_readlink(char *file, char *buf, int size) +{ + int n; + + n = readlink(file, buf, size); + if (n < 0) + return -errno; + if (n < size) + buf[n] = '\0'; + return n; +} + +int rename_file(char *from, char *to) +{ + int err; + + err = rename(from, to); + if (err < 0) + return -errno; + return 0; +} + +int rename2_file(char *from, char *to, unsigned int flags) +{ + int err; + +#ifndef SYS_renameat2 +# ifdef __x86_64__ +# define SYS_renameat2 316 +# endif +# ifdef __i386__ +# define SYS_renameat2 353 +# endif +#endif + +#ifdef SYS_renameat2 + err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags); + if (err < 0) { + if (errno != ENOSYS) + return -errno; + else + return -EINVAL; + } + return 0; +#else + return -EINVAL; +#endif +} + +int do_statfs(char *root, long *bsize_out, long long *blocks_out, + long long *bfree_out, long long *bavail_out, + long long *files_out, long long *ffree_out, + void *fsid_out, int fsid_size, long *namelen_out) +{ + struct statfs64 buf; + int err; + + err = statfs64(root, &buf); + if (err < 0) + return -errno; + + *bsize_out = buf.f_bsize; + *blocks_out = buf.f_blocks; + *bfree_out = buf.f_bfree; + *bavail_out = buf.f_bavail; + *files_out = buf.f_files; + *ffree_out = buf.f_ffree; + memcpy(fsid_out, &buf.f_fsid, + sizeof(buf.f_fsid) > fsid_size ? fsid_size : + sizeof(buf.f_fsid)); + *namelen_out = buf.f_namelen; + + return 0; +} diff --git a/kernel/fs/hpfs/Kconfig b/kernel/fs/hpfs/Kconfig new file mode 100644 index 000000000..56bd15c5b --- /dev/null +++ b/kernel/fs/hpfs/Kconfig @@ -0,0 +1,14 @@ +config HPFS_FS + tristate "OS/2 HPFS file system support" + depends on BLOCK + help + OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS + is the file system used for organizing files on OS/2 hard disk + partitions. Say Y if you want to be able to read files from and + write files to an OS/2 HPFS partition on your hard drive. OS/2 + floppies however are in regular MSDOS format, so you don't need this + option in order to be able to read them. Read + . + + To compile this file system support as a module, choose M here: the + module will be called hpfs. If unsure, say N. diff --git a/kernel/fs/hpfs/Makefile b/kernel/fs/hpfs/Makefile new file mode 100644 index 000000000..57b786fb9 --- /dev/null +++ b/kernel/fs/hpfs/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux hpfs filesystem routines. +# + +obj-$(CONFIG_HPFS_FS) += hpfs.o + +hpfs-objs := alloc.o anode.o buffer.o dentry.o dir.o dnode.o ea.o file.o \ + inode.o map.o name.o namei.o super.o diff --git a/kernel/fs/hpfs/alloc.c b/kernel/fs/hpfs/alloc.c new file mode 100644 index 000000000..f005046e1 --- /dev/null +++ b/kernel/fs/hpfs/alloc.c @@ -0,0 +1,486 @@ +/* + * linux/fs/hpfs/alloc.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * HPFS bitmap operations + */ + +#include "hpfs_fn.h" + +static void hpfs_claim_alloc(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free != (unsigned)-1) { + if (unlikely(!sbi->sb_n_free)) { + hpfs_error(s, "free count underflow, allocating sector %08x", sec); + sbi->sb_n_free = -1; + return; + } + sbi->sb_n_free--; + } +} + +static void hpfs_claim_free(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free != (unsigned)-1) { + if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) { + hpfs_error(s, "free count overflow, freeing sector %08x", sec); + sbi->sb_n_free = -1; + return; + } + sbi->sb_n_free++; + } +} + +static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes != (unsigned)-1) { + if (unlikely(!sbi->sb_n_free_dnodes)) { + hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec); + sbi->sb_n_free_dnodes = -1; + return; + } + sbi->sb_n_free_dnodes--; + } +} + +static void hpfs_claim_dirband_free(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes != (unsigned)-1) { + if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) { + hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec); + sbi->sb_n_free_dnodes = -1; + return; + } + sbi->sb_n_free_dnodes++; + } +} + +/* + * Check if a sector is allocated in bitmap + * This is really slow. Turned on only if chk==2 + */ + +static int chk_if_allocated(struct super_block *s, secno sec, char *msg) +{ + struct quad_buffer_head qbh; + __le32 *bmp; + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail; + if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) { + hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec); + goto fail1; + } + hpfs_brelse4(&qbh); + if (sec >= hpfs_sb(s)->sb_dirband_start && sec < hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { + unsigned ssec = (sec - hpfs_sb(s)->sb_dirband_start) / 4; + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto fail; + if ((le32_to_cpu(bmp[ssec >> 5]) >> (ssec & 0x1f)) & 1) { + hpfs_error(s, "sector '%s' - %08x not allocated in directory bitmap", msg, sec); + goto fail1; + } + hpfs_brelse4(&qbh); + } + return 0; + fail1: + hpfs_brelse4(&qbh); + fail: + return 1; +} + +/* + * Check if sector(s) have proper number and additionally check if they're + * allocated in bitmap. + */ + +int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg) +{ + if (start + len < start || start < 0x12 || + start + len > hpfs_sb(s)->sb_fs_size) { + hpfs_error(s, "sector(s) '%s' badly placed at %08x", msg, start); + return 1; + } + if (hpfs_sb(s)->sb_chk>=2) { + int i; + for (i = 0; i < len; i++) + if (chk_if_allocated(s, start + i, msg)) return 1; + } + return 0; +} + +static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward) +{ + struct quad_buffer_head qbh; + __le32 *bmp; + unsigned bs = near & ~0x3fff; + unsigned nr = (near & 0x3fff) & ~(n - 1); + /*unsigned mnr;*/ + unsigned i, q; + int a, b; + secno ret = 0; + if (n != 1 && n != 4) { + hpfs_error(s, "Bad allocation size: %d", n); + return 0; + } + if (bs != ~0x3fff) { + if (!(bmp = hpfs_map_bitmap(s, near >> 14, &qbh, "aib"))) goto uls; + } else { + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto uls; + } + if (!tstbits(bmp, nr, n + forward)) { + ret = bs + nr; + goto rt; + } + q = nr + n; b = 0; + while ((a = tstbits(bmp, q, n + forward)) != 0) { + q += a; + if (n != 1) q = ((q-1)&~(n-1))+n; + if (!b) { + if (q>>5 != nr>>5) { + b = 1; + q = nr & 0x1f; + } + } else if (q > nr) break; + } + if (!a) { + ret = bs + q; + goto rt; + } + nr >>= 5; + /*for (i = nr + 1; i != nr; i++, i &= 0x1ff) */ + i = nr; + do { + if (!le32_to_cpu(bmp[i])) goto cont; + if (n + forward >= 0x3f && le32_to_cpu(bmp[i]) != 0xffffffff) goto cont; + q = i<<5; + if (i > 0) { + unsigned k = le32_to_cpu(bmp[i-1]); + while (k & 0x80000000) { + q--; k <<= 1; + } + } + if (n != 1) q = ((q-1)&~(n-1))+n; + while ((a = tstbits(bmp, q, n + forward)) != 0) { + q += a; + if (n != 1) q = ((q-1)&~(n-1))+n; + if (q>>5 > i) break; + } + if (!a) { + ret = bs + q; + goto rt; + } + cont: + i++, i &= 0x1ff; + } while (i != nr); + rt: + if (ret) { + if (hpfs_sb(s)->sb_chk && ((ret >> 14) != (bs >> 14) || (le32_to_cpu(bmp[(ret & 0x3fff) >> 5]) | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) { + hpfs_error(s, "Allocation doesn't work! Wanted %d, allocated at %08x", n, ret); + ret = 0; + goto b; + } + bmp[(ret & 0x3fff) >> 5] &= cpu_to_le32(~(((1 << n) - 1) << (ret & 0x1f))); + hpfs_mark_4buffers_dirty(&qbh); + } + b: + hpfs_brelse4(&qbh); + uls: + return ret; +} + +/* + * Allocation strategy: 1) search place near the sector specified + * 2) search bitmap where free sectors last found + * 3) search all bitmaps + * 4) search all bitmaps ignoring number of pre-allocated + * sectors + */ + +secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forward) +{ + secno sec; + int i; + unsigned n_bmps; + struct hpfs_sb_info *sbi = hpfs_sb(s); + int f_p = 0; + int near_bmp; + if (forward < 0) { + forward = -forward; + f_p = 1; + } + n_bmps = (sbi->sb_fs_size + 0x4000 - 1) >> 14; + if (near && near < sbi->sb_fs_size) { + if ((sec = alloc_in_bmp(s, near, n, f_p ? forward : forward/4))) goto ret; + near_bmp = near >> 14; + } else near_bmp = n_bmps / 2; + /* + if (b != -1) { + if ((sec = alloc_in_bmp(s, b<<14, n, f_p ? forward : forward/2))) { + b &= 0x0fffffff; + goto ret; + } + if (b > 0x10000000) if ((sec = alloc_in_bmp(s, (b&0xfffffff)<<14, n, f_p ? forward : 0))) goto ret; + */ + if (!f_p) if (forward > sbi->sb_max_fwd_alloc) forward = sbi->sb_max_fwd_alloc; + less_fwd: + for (i = 0; i < n_bmps; i++) { + if (near_bmp+i < n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp+i; + goto ret; + } + if (!forward) { + if (near_bmp-i-1 >= 0 && ((sec = alloc_in_bmp(s, (near_bmp-i-1) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp-i-1; + goto ret; + } + } else { + if (near_bmp+i >= n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i-n_bmps) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp+i-n_bmps; + goto ret; + } + } + if (i == 1 && sbi->sb_c_bitmap != -1 && ((sec = alloc_in_bmp(s, (sbi->sb_c_bitmap) << 14, n, forward)))) { + goto ret; + } + } + if (!f_p) { + if (forward) { + sbi->sb_max_fwd_alloc = forward * 3 / 4; + forward /= 2; + goto less_fwd; + } + } + sec = 0; + ret: + if (sec) { + i = 0; + do + hpfs_claim_alloc(s, sec + i); + while (unlikely(++i < n)); + } + if (sec && f_p) { + for (i = 0; i < forward; i++) { + if (!hpfs_alloc_if_possible(s, sec + n + i)) { + hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); + sec = 0; + break; + } + } + } + return sec; +} + +static secno alloc_in_dirband(struct super_block *s, secno near) +{ + unsigned nr = near; + secno sec; + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (nr < sbi->sb_dirband_start) + nr = sbi->sb_dirband_start; + if (nr >= sbi->sb_dirband_start + sbi->sb_dirband_size) + nr = sbi->sb_dirband_start + sbi->sb_dirband_size - 4; + nr -= sbi->sb_dirband_start; + nr >>= 2; + sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); + if (!sec) return 0; + hpfs_claim_dirband_alloc(s, sec); + return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; +} + +/* Alloc sector if it's free */ + +int hpfs_alloc_if_possible(struct super_block *s, secno sec) +{ + struct quad_buffer_head qbh; + __le32 *bmp; + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end; + if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) { + bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + hpfs_claim_alloc(s, sec); + return 1; + } + hpfs_brelse4(&qbh); + end: + return 0; +} + +/* Free sectors in bitmaps */ + +void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) +{ + struct quad_buffer_head qbh; + __le32 *bmp; + struct hpfs_sb_info *sbi = hpfs_sb(s); + /*pr_info("2 - ");*/ + if (!n) return; + if (sec < 0x12) { + hpfs_error(s, "Trying to free reserved sector %08x", sec); + return; + } + sbi->sb_max_fwd_alloc += n > 0xffff ? 0xffff : n; + if (sbi->sb_max_fwd_alloc > 0xffffff) sbi->sb_max_fwd_alloc = 0xffffff; + new_map: + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "free"))) { + return; + } + new_tst: + if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f) & 1)) { + hpfs_error(s, "sector %08x not allocated", sec); + hpfs_brelse4(&qbh); + return; + } + bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); + hpfs_claim_free(s, sec); + if (!--n) { + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + return; + } + if (!(++sec & 0x3fff)) { + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + goto new_map; + } + goto new_tst; +} + +/* + * Check if there are at least n free dnodes on the filesystem. + * Called before adding to dnode. If we run out of space while + * splitting dnodes, it would corrupt dnode tree. + */ + +int hpfs_check_free_dnodes(struct super_block *s, int n) +{ + int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14; + int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff; + int i, j; + __le32 *bmp; + struct quad_buffer_head qbh; + if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + for (j = 0; j < 512; j++) { + unsigned k; + if (!le32_to_cpu(bmp[j])) continue; + for (k = le32_to_cpu(bmp[j]); k; k >>= 1) if (k & 1) if (!--n) { + hpfs_brelse4(&qbh); + return 0; + } + } + } + hpfs_brelse4(&qbh); + i = 0; + if (hpfs_sb(s)->sb_c_bitmap != -1) { + bmp = hpfs_map_bitmap(s, b, &qbh, "chkdn1"); + goto chk_bmp; + } + chk_next: + if (i == b) i++; + if (i >= n_bmps) return 1; + bmp = hpfs_map_bitmap(s, i, &qbh, "chkdn2"); + chk_bmp: + if (bmp) { + for (j = 0; j < 512; j++) { + u32 k; + if (!le32_to_cpu(bmp[j])) continue; + for (k = 0xf; k; k <<= 4) + if ((le32_to_cpu(bmp[j]) & k) == k) { + if (!--n) { + hpfs_brelse4(&qbh); + return 0; + } + } + } + hpfs_brelse4(&qbh); + } + i++; + goto chk_next; +} + +void hpfs_free_dnode(struct super_block *s, dnode_secno dno) +{ + if (hpfs_sb(s)->sb_chk) if (dno & 3) { + hpfs_error(s, "hpfs_free_dnode: dnode %08x not aligned", dno); + return; + } + if (dno < hpfs_sb(s)->sb_dirband_start || + dno >= hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { + hpfs_free_sectors(s, dno, 4); + } else { + struct quad_buffer_head qbh; + __le32 *bmp; + unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + return; + } + bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + hpfs_claim_dirband_free(s, dno); + } +} + +struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, + dnode_secno *dno, struct quad_buffer_head *qbh) +{ + struct dnode *d; + if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) { + if (!(*dno = alloc_in_dirband(s, near))) + if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; + } else { + if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) + if (!(*dno = alloc_in_dirband(s, near))) return NULL; + } + if (!(d = hpfs_get_4sectors(s, *dno, qbh))) { + hpfs_free_dnode(s, *dno); + return NULL; + } + memset(d, 0, 2048); + d->magic = cpu_to_le32(DNODE_MAGIC); + d->first_free = cpu_to_le32(52); + d->dirent[0] = 32; + d->dirent[2] = 8; + d->dirent[30] = 1; + d->dirent[31] = 255; + d->self = cpu_to_le32(*dno); + return d; +} + +struct fnode *hpfs_alloc_fnode(struct super_block *s, secno near, fnode_secno *fno, + struct buffer_head **bh) +{ + struct fnode *f; + if (!(*fno = hpfs_alloc_sector(s, near, 1, FNODE_ALLOC_FWD))) return NULL; + if (!(f = hpfs_get_sector(s, *fno, bh))) { + hpfs_free_sectors(s, *fno, 1); + return NULL; + } + memset(f, 0, 512); + f->magic = cpu_to_le32(FNODE_MAGIC); + f->ea_offs = cpu_to_le16(0xc4); + f->btree.n_free_nodes = 8; + f->btree.first_free = cpu_to_le16(8); + return f; +} + +struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *ano, + struct buffer_head **bh) +{ + struct anode *a; + if (!(*ano = hpfs_alloc_sector(s, near, 1, ANODE_ALLOC_FWD))) return NULL; + if (!(a = hpfs_get_sector(s, *ano, bh))) { + hpfs_free_sectors(s, *ano, 1); + return NULL; + } + memset(a, 0, 512); + a->magic = cpu_to_le32(ANODE_MAGIC); + a->self = cpu_to_le32(*ano); + a->btree.n_free_nodes = 40; + a->btree.n_used_nodes = 0; + a->btree.first_free = cpu_to_le16(8); + return a; +} diff --git a/kernel/fs/hpfs/anode.c b/kernel/fs/hpfs/anode.c new file mode 100644 index 000000000..2d5b254ad --- /dev/null +++ b/kernel/fs/hpfs/anode.c @@ -0,0 +1,496 @@ +/* + * linux/fs/hpfs/anode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling HPFS anode tree that contains file allocation info + */ + +#include "hpfs_fn.h" + +/* Find a sector in allocation tree */ + +secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, + struct bplus_header *btree, unsigned sec, + struct buffer_head *bh) +{ + anode_secno a = -1; + struct anode *anode; + int i; + int c1, c2 = 0; + go_down: + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1; + if (bp_internal(btree)) { + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) { + a = le32_to_cpu(btree->u.internal[i].down); + brelse(bh); + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + btree = &anode->btree; + goto go_down; + } + hpfs_error(s, "sector %08x not found in internal anode %08x", sec, a); + brelse(bh); + return -1; + } + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.external[i].file_secno) <= sec && + le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > sec) { + a = le32_to_cpu(btree->u.external[i].disk_secno) + sec - le32_to_cpu(btree->u.external[i].file_secno); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, a, 1, "data")) { + brelse(bh); + return -1; + } + if (inode) { + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + hpfs_inode->i_file_sec = le32_to_cpu(btree->u.external[i].file_secno); + hpfs_inode->i_disk_sec = le32_to_cpu(btree->u.external[i].disk_secno); + hpfs_inode->i_n_secs = le32_to_cpu(btree->u.external[i].length); + } + brelse(bh); + return a; + } + hpfs_error(s, "sector %08x not found in external anode %08x", sec, a); + brelse(bh); + return -1; +} + +/* Add a sector to tree */ + +secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsigned fsecno) +{ + struct bplus_header *btree; + struct anode *anode = NULL, *ranode = NULL; + struct fnode *fnode; + anode_secno a, na = -1, ra, up = -1; + secno se; + struct buffer_head *bh, *bh1, *bh2; + int n; + unsigned fs; + int c1, c2 = 0; + if (fnod) { + if (!(fnode = hpfs_map_fnode(s, node, &bh))) return -1; + btree = &fnode->btree; + } else { + if (!(anode = hpfs_map_anode(s, node, &bh))) return -1; + btree = &anode->btree; + } + a = node; + go_down: + if ((n = btree->n_used_nodes - 1) < -!!fnod) { + hpfs_error(s, "anode %08x has no entries", a); + brelse(bh); + return -1; + } + if (bp_internal(btree)) { + a = le32_to_cpu(btree->u.internal[n].down); + btree->u.internal[n].file_secno = cpu_to_le32(-1); + mark_buffer_dirty(bh); + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_add_sector_to_btree #1")) return -1; + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + btree = &anode->btree; + goto go_down; + } + if (n >= 0) { + if (le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length) != fsecno) { + hpfs_error(s, "allocated size %08x, trying to add sector %08x, %cnode %08x", + le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length), fsecno, + fnod?'f':'a', node); + brelse(bh); + return -1; + } + if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) { + le32_add_cpu(&btree->u.external[n].length, 1); + mark_buffer_dirty(bh); + brelse(bh); + return se; + } + } else { + if (fsecno) { + hpfs_error(s, "empty file %08x, trying to add sector %08x", node, fsecno); + brelse(bh); + return -1; + } + se = !fnod ? node : (node + 16384) & ~16383; + } + if (!(se = hpfs_alloc_sector(s, se, 1, fsecno*ALLOC_M>ALLOC_FWD_MAX ? ALLOC_FWD_MAX : fsecno*ALLOC_Mu.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length); + if (!btree->n_free_nodes) { + up = a != node ? le32_to_cpu(anode->up) : -1; + if (!(anode = hpfs_alloc_anode(s, a, &na, &bh1))) { + brelse(bh); + hpfs_free_sectors(s, se, 1); + return -1; + } + if (a == node && fnod) { + anode->up = cpu_to_le32(node); + anode->btree.flags |= BP_fnode_parent; + anode->btree.n_used_nodes = btree->n_used_nodes; + anode->btree.first_free = btree->first_free; + anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes; + memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12); + btree->flags |= BP_internal; + btree->n_free_nodes = 11; + btree->n_used_nodes = 1; + btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree); + btree->u.internal[0].file_secno = cpu_to_le32(-1); + btree->u.internal[0].down = cpu_to_le32(na); + mark_buffer_dirty(bh); + } else if (!(ranode = hpfs_alloc_anode(s, /*a*/0, &ra, &bh2))) { + brelse(bh); + brelse(bh1); + hpfs_free_sectors(s, se, 1); + hpfs_free_sectors(s, na, 1); + return -1; + } + brelse(bh); + bh = bh1; + btree = &anode->btree; + } + btree->n_free_nodes--; n = btree->n_used_nodes++; + le16_add_cpu(&btree->first_free, 12); + btree->u.external[n].disk_secno = cpu_to_le32(se); + btree->u.external[n].file_secno = cpu_to_le32(fs); + btree->u.external[n].length = cpu_to_le32(1); + mark_buffer_dirty(bh); + brelse(bh); + if ((a == node && fnod) || na == -1) return se; + c2 = 0; + while (up != (anode_secno)-1) { + struct anode *new_anode; + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, up, &c1, &c2, "hpfs_add_sector_to_btree #2")) return -1; + if (up != node || !fnod) { + if (!(anode = hpfs_map_anode(s, up, &bh))) return -1; + btree = &anode->btree; + } else { + if (!(fnode = hpfs_map_fnode(s, up, &bh))) return -1; + btree = &fnode->btree; + } + if (btree->n_free_nodes) { + btree->n_free_nodes--; n = btree->n_used_nodes++; + le16_add_cpu(&btree->first_free, 8); + btree->u.internal[n].file_secno = cpu_to_le32(-1); + btree->u.internal[n].down = cpu_to_le32(na); + btree->u.internal[n-1].file_secno = cpu_to_le32(fs); + mark_buffer_dirty(bh); + brelse(bh); + brelse(bh2); + hpfs_free_sectors(s, ra, 1); + if ((anode = hpfs_map_anode(s, na, &bh))) { + anode->up = cpu_to_le32(up); + if (up == node && fnod) + anode->btree.flags |= BP_fnode_parent; + else + anode->btree.flags &= ~BP_fnode_parent; + mark_buffer_dirty(bh); + brelse(bh); + } + return se; + } + up = up != node ? le32_to_cpu(anode->up) : -1; + btree->u.internal[btree->n_used_nodes - 1].file_secno = cpu_to_le32(/*fs*/-1); + mark_buffer_dirty(bh); + brelse(bh); + a = na; + if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) { + anode = new_anode; + /*anode->up = cpu_to_le32(up != -1 ? up : ra);*/ + anode->btree.flags |= BP_internal; + anode->btree.n_used_nodes = 1; + anode->btree.n_free_nodes = 59; + anode->btree.first_free = cpu_to_le16(16); + anode->btree.u.internal[0].down = cpu_to_le32(a); + anode->btree.u.internal[0].file_secno = cpu_to_le32(-1); + mark_buffer_dirty(bh); + brelse(bh); + if ((anode = hpfs_map_anode(s, a, &bh))) { + anode->up = cpu_to_le32(na); + mark_buffer_dirty(bh); + brelse(bh); + } + } else na = a; + } + if ((anode = hpfs_map_anode(s, na, &bh))) { + anode->up = cpu_to_le32(node); + if (fnod) + anode->btree.flags |= BP_fnode_parent; + mark_buffer_dirty(bh); + brelse(bh); + } + if (!fnod) { + if (!(anode = hpfs_map_anode(s, node, &bh))) { + brelse(bh2); + return -1; + } + btree = &anode->btree; + } else { + if (!(fnode = hpfs_map_fnode(s, node, &bh))) { + brelse(bh2); + return -1; + } + btree = &fnode->btree; + } + ranode->up = cpu_to_le32(node); + memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free)); + if (fnod) + ranode->btree.flags |= BP_fnode_parent; + ranode->btree.n_free_nodes = (bp_internal(&ranode->btree) ? 60 : 40) - ranode->btree.n_used_nodes; + if (bp_internal(&ranode->btree)) for (n = 0; n < ranode->btree.n_used_nodes; n++) { + struct anode *unode; + if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) { + unode->up = cpu_to_le32(ra); + unode->btree.flags &= ~BP_fnode_parent; + mark_buffer_dirty(bh1); + brelse(bh1); + } + } + btree->flags |= BP_internal; + btree->n_free_nodes = fnod ? 10 : 58; + btree->n_used_nodes = 2; + btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree); + btree->u.internal[0].file_secno = cpu_to_le32(fs); + btree->u.internal[0].down = cpu_to_le32(ra); + btree->u.internal[1].file_secno = cpu_to_le32(-1); + btree->u.internal[1].down = cpu_to_le32(na); + mark_buffer_dirty(bh); + brelse(bh); + mark_buffer_dirty(bh2); + brelse(bh2); + return se; +} + +/* + * Remove allocation tree. Recursion would look much nicer but + * I want to avoid it because it can cause stack overflow. + */ + +void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) +{ + struct bplus_header *btree1 = btree; + struct anode *anode = NULL; + anode_secno ano = 0, oano; + struct buffer_head *bh; + int level = 0; + int pos = 0; + int i; + int c1, c2 = 0; + int d1, d2; + go_down: + d2 = 0; + while (bp_internal(btree1)) { + ano = le32_to_cpu(btree1->u.internal[pos].down); + if (level) brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ano, &d1, &d2, "hpfs_remove_btree #1")) + return; + if (!(anode = hpfs_map_anode(s, ano, &bh))) return; + btree1 = &anode->btree; + level++; + pos = 0; + } + for (i = 0; i < btree1->n_used_nodes; i++) + hpfs_free_sectors(s, le32_to_cpu(btree1->u.external[i].disk_secno), le32_to_cpu(btree1->u.external[i].length)); + go_up: + if (!level) return; + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ano, &c1, &c2, "hpfs_remove_btree #2")) return; + hpfs_free_sectors(s, ano, 1); + oano = ano; + ano = le32_to_cpu(anode->up); + if (--level) { + if (!(anode = hpfs_map_anode(s, ano, &bh))) return; + btree1 = &anode->btree; + } else btree1 = btree; + for (i = 0; i < btree1->n_used_nodes; i++) { + if (le32_to_cpu(btree1->u.internal[i].down) == oano) { + if ((pos = i + 1) < btree1->n_used_nodes) + goto go_down; + else + goto go_up; + } + } + hpfs_error(s, + "reference to anode %08x not found in anode %08x " + "(probably bad up pointer)", + oano, level ? ano : -1); + if (level) + brelse(bh); +} + +/* Just a wrapper around hpfs_bplus_lookup .. used for reading eas */ + +static secno anode_lookup(struct super_block *s, anode_secno a, unsigned sec) +{ + struct anode *anode; + struct buffer_head *bh; + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + return hpfs_bplus_lookup(s, NULL, &anode->btree, sec, bh); +} + +int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos, + unsigned len, char *buf) +{ + struct buffer_head *bh; + char *data; + secno sec; + unsigned l; + while (len) { + if (ano) { + if ((sec = anode_lookup(s, a, pos >> 9)) == -1) + return -1; + } else sec = a + (pos >> 9); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #1")) return -1; + if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9))) + return -1; + l = 0x200 - (pos & 0x1ff); if (l > len) l = len; + memcpy(buf, data + (pos & 0x1ff), l); + brelse(bh); + buf += l; pos += l; len -= l; + } + return 0; +} + +int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, + unsigned len, const char *buf) +{ + struct buffer_head *bh; + char *data; + secno sec; + unsigned l; + while (len) { + if (ano) { + if ((sec = anode_lookup(s, a, pos >> 9)) == -1) + return -1; + } else sec = a + (pos >> 9); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #2")) return -1; + if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9))) + return -1; + l = 0x200 - (pos & 0x1ff); if (l > len) l = len; + memcpy(data + (pos & 0x1ff), buf, l); + mark_buffer_dirty(bh); + brelse(bh); + buf += l; pos += l; len -= l; + } + return 0; +} + +void hpfs_ea_remove(struct super_block *s, secno a, int ano, unsigned len) +{ + struct anode *anode; + struct buffer_head *bh; + if (ano) { + if (!(anode = hpfs_map_anode(s, a, &bh))) return; + hpfs_remove_btree(s, &anode->btree); + brelse(bh); + hpfs_free_sectors(s, a, 1); + } else hpfs_free_sectors(s, a, (len + 511) >> 9); +} + +/* Truncate allocation tree. Doesn't join anodes - I hope it doesn't matter */ + +void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) +{ + struct fnode *fnode; + struct anode *anode; + struct buffer_head *bh; + struct bplus_header *btree; + anode_secno node = f; + int i, j, nodes; + int c1, c2 = 0; + if (fno) { + if (!(fnode = hpfs_map_fnode(s, f, &bh))) return; + btree = &fnode->btree; + } else { + if (!(anode = hpfs_map_anode(s, f, &bh))) return; + btree = &anode->btree; + } + if (!secs) { + hpfs_remove_btree(s, btree); + if (fno) { + btree->n_free_nodes = 8; + btree->n_used_nodes = 0; + btree->first_free = cpu_to_le16(8); + btree->flags &= ~BP_internal; + mark_buffer_dirty(bh); + } else hpfs_free_sectors(s, f, 1); + brelse(bh); + return; + } + while (bp_internal(btree)) { + nodes = btree->n_used_nodes + btree->n_free_nodes; + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f; + brelse(bh); + hpfs_error(s, "internal btree %08x doesn't end with -1", node); + return; + f: + for (j = i + 1; j < btree->n_used_nodes; j++) + hpfs_ea_remove(s, le32_to_cpu(btree->u.internal[j].down), 1, 0); + btree->n_used_nodes = i + 1; + btree->n_free_nodes = nodes - btree->n_used_nodes; + btree->first_free = cpu_to_le16(8 + 8 * btree->n_used_nodes); + mark_buffer_dirty(bh); + if (btree->u.internal[i].file_secno == cpu_to_le32(secs)) { + brelse(bh); + return; + } + node = le32_to_cpu(btree->u.internal[i].down); + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, node, &c1, &c2, "hpfs_truncate_btree")) + return; + if (!(anode = hpfs_map_anode(s, node, &bh))) return; + btree = &anode->btree; + } + nodes = btree->n_used_nodes + btree->n_free_nodes; + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) >= secs) goto ff; + brelse(bh); + return; + ff: + if (secs <= le32_to_cpu(btree->u.external[i].file_secno)) { + hpfs_error(s, "there is an allocation error in file %08x, sector %08x", f, secs); + if (i) i--; + } + else if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > secs) { + hpfs_free_sectors(s, le32_to_cpu(btree->u.external[i].disk_secno) + secs - + le32_to_cpu(btree->u.external[i].file_secno), le32_to_cpu(btree->u.external[i].length) + - secs + le32_to_cpu(btree->u.external[i].file_secno)); /* I hope gcc optimizes this :-) */ + btree->u.external[i].length = cpu_to_le32(secs - le32_to_cpu(btree->u.external[i].file_secno)); + } + for (j = i + 1; j < btree->n_used_nodes; j++) + hpfs_free_sectors(s, le32_to_cpu(btree->u.external[j].disk_secno), le32_to_cpu(btree->u.external[j].length)); + btree->n_used_nodes = i + 1; + btree->n_free_nodes = nodes - btree->n_used_nodes; + btree->first_free = cpu_to_le16(8 + 12 * btree->n_used_nodes); + mark_buffer_dirty(bh); + brelse(bh); +} + +/* Remove file or directory and it's eas - note that directory must + be empty when this is called. */ + +void hpfs_remove_fnode(struct super_block *s, fnode_secno fno) +{ + struct buffer_head *bh; + struct fnode *fnode; + struct extended_attribute *ea; + struct extended_attribute *ea_end; + if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return; + if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, &fnode->btree); + else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno)); + ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (ea_indirect(ea)) + hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea)); + hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l)); + brelse(bh); + hpfs_free_sectors(s, fno, 1); +} diff --git a/kernel/fs/hpfs/buffer.c b/kernel/fs/hpfs/buffer.c new file mode 100644 index 000000000..8057fe4e6 --- /dev/null +++ b/kernel/fs/hpfs/buffer.c @@ -0,0 +1,204 @@ +/* + * linux/fs/hpfs/buffer.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * general buffer i/o + */ +#include +#include +#include +#include "hpfs_fn.h" + +void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) +{ + struct buffer_head *bh; + struct blk_plug plug; + + if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size)) + return; + + bh = sb_find_get_block(s, secno); + if (bh) { + if (buffer_uptodate(bh)) { + brelse(bh); + return; + } + brelse(bh); + }; + + blk_start_plug(&plug); + while (n > 0) { + if (unlikely(secno >= hpfs_sb(s)->sb_fs_size)) + break; + sb_breadahead(s, secno); + secno++; + n--; + } + blk_finish_plug(&plug); +} + +/* Map a sector into a buffer and return pointers to it and to the buffer. */ + +void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, + int ahead) +{ + struct buffer_head *bh; + + hpfs_lock_assert(s); + + hpfs_prefetch_sectors(s, secno, ahead); + + cond_resched(); + + *bhp = bh = sb_bread(s, secno); + if (bh != NULL) + return bh->b_data; + else { + pr_err("%s(): read error\n", __func__); + return NULL; + } +} + +/* Like hpfs_map_sector but don't read anything */ + +void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp) +{ + struct buffer_head *bh; + /*return hpfs_map_sector(s, secno, bhp, 0);*/ + + hpfs_lock_assert(s); + + cond_resched(); + + if ((*bhp = bh = sb_getblk(s, secno)) != NULL) { + if (!buffer_uptodate(bh)) wait_on_buffer(bh); + set_buffer_uptodate(bh); + return bh->b_data; + } else { + pr_err("%s(): getblk failed\n", __func__); + return NULL; + } +} + +/* Map 4 sectors into a 4buffer and return pointers to it and to the buffer. */ + +void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, + int ahead) +{ + char *data; + + hpfs_lock_assert(s); + + cond_resched(); + + if (secno & 3) { + pr_err("%s(): unaligned read\n", __func__); + return NULL; + } + + hpfs_prefetch_sectors(s, secno, 4 + ahead); + + if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0; + if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1; + if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2; + if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3; + + if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && + likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && + likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { + return qbh->data = qbh->bh[0]->b_data; + } + + qbh->data = data = kmalloc(2048, GFP_NOFS); + if (!data) { + pr_err("%s(): out of memory\n", __func__); + goto bail4; + } + + memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512); + memcpy(data + 1 * 512, qbh->bh[1]->b_data, 512); + memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512); + memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512); + + return data; + + bail4: + brelse(qbh->bh[3]); + bail3: + brelse(qbh->bh[2]); + bail2: + brelse(qbh->bh[1]); + bail1: + brelse(qbh->bh[0]); + bail0: + return NULL; +} + +/* Don't read sectors */ + +void *hpfs_get_4sectors(struct super_block *s, unsigned secno, + struct quad_buffer_head *qbh) +{ + cond_resched(); + + hpfs_lock_assert(s); + + if (secno & 3) { + pr_err("%s(): unaligned read\n", __func__); + return NULL; + } + + if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0; + if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1; + if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2; + if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3; + + if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && + likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && + likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { + return qbh->data = qbh->bh[0]->b_data; + } + + if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { + pr_err("%s(): out of memory\n", __func__); + goto bail4; + } + return qbh->data; + +bail4: + brelse(qbh->bh[3]); +bail3: + brelse(qbh->bh[2]); +bail2: + brelse(qbh->bh[1]); +bail1: + brelse(qbh->bh[0]); +bail0: + return NULL; +} + + +void hpfs_brelse4(struct quad_buffer_head *qbh) +{ + if (unlikely(qbh->data != qbh->bh[0]->b_data)) + kfree(qbh->data); + brelse(qbh->bh[0]); + brelse(qbh->bh[1]); + brelse(qbh->bh[2]); + brelse(qbh->bh[3]); +} + +void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) +{ + if (unlikely(qbh->data != qbh->bh[0]->b_data)) { + memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512); + memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512); + memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); + memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + } + mark_buffer_dirty(qbh->bh[0]); + mark_buffer_dirty(qbh->bh[1]); + mark_buffer_dirty(qbh->bh[2]); + mark_buffer_dirty(qbh->bh[3]); +} diff --git a/kernel/fs/hpfs/dentry.c b/kernel/fs/hpfs/dentry.c new file mode 100644 index 000000000..fa27980f2 --- /dev/null +++ b/kernel/fs/hpfs/dentry.c @@ -0,0 +1,61 @@ +/* + * linux/fs/hpfs/dentry.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * dcache operations + */ + +#include "hpfs_fn.h" + +/* + * Note: the dentry argument is the parent dentry. + */ + +static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) +{ + unsigned long hash; + int i; + unsigned l = qstr->len; + + if (l == 1) if (qstr->name[0]=='.') goto x; + if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x; + hpfs_adjust_length(qstr->name, &l); + /*if (hpfs_chk_name(qstr->name,&l))*/ + /*return -ENAMETOOLONG;*/ + /*return -ENOENT;*/ + x: + + hash = init_name_hash(); + for (i = 0; i < l; i++) + hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + unsigned al = len; + unsigned bl = name->len; + + hpfs_adjust_length(str, &al); + /*hpfs_adjust_length(b->name, &bl);*/ + + /* + * 'str' is the nane of an already existing dentry, so the name + * must be valid. 'name' must be validated first. + */ + + if (hpfs_chk_name(name->name, &bl)) + return 1; + if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0)) + return 1; + return 0; +} + +const struct dentry_operations hpfs_dentry_operations = { + .d_hash = hpfs_hash_dentry, + .d_compare = hpfs_compare_dentry, +}; diff --git a/kernel/fs/hpfs/dir.c b/kernel/fs/hpfs/dir.c new file mode 100644 index 000000000..2a8e07425 --- /dev/null +++ b/kernel/fs/hpfs/dir.c @@ -0,0 +1,330 @@ +/* + * linux/fs/hpfs/dir.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * directory VFS functions + */ + +#include +#include "hpfs_fn.h" + +static int hpfs_dir_release(struct inode *inode, struct file *filp) +{ + hpfs_lock(inode->i_sb); + hpfs_del_pos(inode, &filp->f_pos); + /*hpfs_write_if_changed(inode);*/ + hpfs_unlock(inode->i_sb); + return 0; +} + +/* This is slow, but it's not used often */ + +static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) +{ + loff_t new_off = off + (whence == 1 ? filp->f_pos : 0); + loff_t pos; + struct quad_buffer_head qbh; + struct inode *i = file_inode(filp); + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct super_block *s = i->i_sb; + + /* Somebody else will have to figure out what to do here */ + if (whence == SEEK_DATA || whence == SEEK_HOLE) + return -EINVAL; + + mutex_lock(&i->i_mutex); + hpfs_lock(s); + + /*pr_info("dir lseek\n");*/ + if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; + pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; + while (pos != new_off) { + if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); + else goto fail; + if (pos == 12) goto fail; + } + hpfs_add_pos(i, &filp->f_pos); +ok: + filp->f_pos = new_off; + hpfs_unlock(s); + mutex_unlock(&i->i_mutex); + return new_off; +fail: + /*pr_warn("illegal lseek: %016llx\n", new_off);*/ + hpfs_unlock(s); + mutex_unlock(&i->i_mutex); + return -ESPIPE; +} + +static int hpfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + int lc; + loff_t next_pos; + unsigned char *tempname; + int c1, c2 = 0; + int ret = 0; + + hpfs_lock(inode->i_sb); + + if (hpfs_sb(inode->i_sb)->sb_chk) { + if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { + ret = -EFSERROR; + goto out; + } + if (hpfs_chk_sectors(inode->i_sb, hpfs_inode->i_dno, 4, "dir_dnode")) { + ret = -EFSERROR; + goto out; + } + } + if (hpfs_sb(inode->i_sb)->sb_chk >= 2) { + struct buffer_head *bh; + struct fnode *fno; + int e = 0; + if (!(fno = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) { + ret = -EIOERROR; + goto out; + } + if (!fnode_is_dir(fno)) { + e = 1; + hpfs_error(inode->i_sb, "not a directory, fnode %08lx", + (unsigned long)inode->i_ino); + } + if (hpfs_inode->i_dno != le32_to_cpu(fno->u.external[0].disk_secno)) { + e = 1; + hpfs_error(inode->i_sb, "corrupted inode: i_dno == %08x, fnode -> dnode == %08x", hpfs_inode->i_dno, le32_to_cpu(fno->u.external[0].disk_secno)); + } + brelse(bh); + if (e) { + ret = -EFSERROR; + goto out; + } + } + lc = hpfs_sb(inode->i_sb)->sb_lowercase; + if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */ + ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */ + goto out; + } + if (ctx->pos == 13) { + ret = -ENOENT; + goto out; + } + + while (1) { + again: + /* This won't work when cycle is longer than number of dirents + accepted by filldir, but what can I do? + maybe killall -9 ls helps */ + if (hpfs_sb(inode->i_sb)->sb_chk) + if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) { + ret = -EFSERROR; + goto out; + } + if (ctx->pos == 12) + goto out; + if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) { + pr_err("pos==%d\n", (int)ctx->pos); + goto out; + } + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + goto out; + ctx->pos = 11; + } + if (ctx->pos == 11) { + if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR)) + goto out; + ctx->pos = 1; + } + if (ctx->pos == 1) { + ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; + hpfs_add_pos(inode, &file->f_pos); + file->f_version = inode->i_version; + } + next_pos = ctx->pos; + if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) { + ctx->pos = next_pos; + ret = -EIOERROR; + goto out; + } + if (de->first || de->last) { + if (hpfs_sb(inode->i_sb)->sb_chk) { + if (de->first && !de->last && (de->namelen != 2 + || de ->name[0] != 1 || de->name[1] != 1)) + hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos); + if (de->last && (de->namelen != 1 || de ->name[0] != 255)) + hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos); + } + hpfs_brelse4(&qbh); + ctx->pos = next_pos; + goto again; + } + tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); + if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) { + if (tempname != de->name) kfree(tempname); + hpfs_brelse4(&qbh); + goto out; + } + ctx->pos = next_pos; + if (tempname != de->name) kfree(tempname); + hpfs_brelse4(&qbh); + } +out: + hpfs_unlock(inode->i_sb); + return ret; +} + +/* + * lookup. Search the specified directory for the specified name, set + * *result to the corresponding inode. + * + * lookup uses the inode number to tell read_inode whether it is reading + * the inode of a directory or a file -- file ino's are odd, directory + * ino's are even. read_inode avoids i/o for file inodes; everything + * needed is up here in the directory. (And file fnodes are out in + * the boondocks.) + * + * - M.P.: this is over, sometimes we've got to read file's fnode for eas + * inode numbers are just fnode sector numbers; iget lock is used + * to tell read_inode to read fnode or not. + */ + +struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + ino_t ino; + int err; + struct inode *result = NULL; + struct hpfs_inode_info *hpfs_result; + + hpfs_lock(dir->i_sb); + if ((err = hpfs_chk_name(name, &len))) { + if (err == -ENAMETOOLONG) { + hpfs_unlock(dir->i_sb); + return ERR_PTR(-ENAMETOOLONG); + } + goto end_add; + } + + /* + * '.' and '..' will never be passed here. + */ + + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh); + + /* + * This is not really a bailout, just means file not found. + */ + + if (!de) goto end; + + /* + * Get inode number, what we're after. + */ + + ino = le32_to_cpu(de->fnode); + + /* + * Go find or make an inode. + */ + + result = iget_locked(dir->i_sb, ino); + if (!result) { + hpfs_error(dir->i_sb, "hpfs_lookup: can't get inode"); + goto bail1; + } + if (result->i_state & I_NEW) { + hpfs_init_inode(result); + if (de->directory) + hpfs_read_inode(result); + else if (le32_to_cpu(de->ea_size) && hpfs_sb(dir->i_sb)->sb_eas) + hpfs_read_inode(result); + else { + result->i_mode |= S_IFREG; + result->i_mode &= ~0111; + result->i_op = &hpfs_file_iops; + result->i_fop = &hpfs_file_ops; + set_nlink(result, 1); + } + unlock_new_inode(result); + } + hpfs_result = hpfs_i(result); + if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; + + if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { + hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); + goto bail1; + } + + /* + * Fill in the info from the directory if this is a newly created + * inode. + */ + + if (!result->i_ctime.tv_sec) { + if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)))) + result->i_ctime.tv_sec = 1; + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)); + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)); + result->i_atime.tv_nsec = 0; + hpfs_result->i_ea_size = le32_to_cpu(de->ea_size); + if (!hpfs_result->i_ea_mode && de->read_only) + result->i_mode &= ~0222; + if (!de->directory) { + if (result->i_size == -1) { + result->i_size = le32_to_cpu(de->file_size); + result->i_data.a_ops = &hpfs_aops; + hpfs_i(result)->mmu_private = result->i_size; + /* + * i_blocks should count the fnode and any anodes. + * We count 1 for the fnode and don't bother about + * anodes -- the disk heads are on the directory band + * and we want them to stay there. + */ + result->i_blocks = 1 + ((result->i_size + 511) >> 9); + } + } + } + + hpfs_brelse4(&qbh); + + /* + * Made it. + */ + + end: + end_add: + hpfs_unlock(dir->i_sb); + d_add(dentry, result); + return NULL; + + /* + * Didn't. + */ + bail1: + + hpfs_brelse4(&qbh); + + /*bail:*/ + + hpfs_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); +} + +const struct file_operations hpfs_dir_ops = +{ + .llseek = hpfs_dir_lseek, + .read = generic_read_dir, + .iterate = hpfs_readdir, + .release = hpfs_dir_release, + .fsync = hpfs_file_fsync, +}; diff --git a/kernel/fs/hpfs/dnode.c b/kernel/fs/hpfs/dnode.c new file mode 100644 index 000000000..2923a7bd8 --- /dev/null +++ b/kernel/fs/hpfs/dnode.c @@ -0,0 +1,1093 @@ +/* + * linux/fs/hpfs/dnode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling directory dnode tree - adding, deleteing & searching for dirents + */ + +#include "hpfs_fn.h" + +static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + int i = 1; + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i; + i++; + } + pr_info("%s(): not_found\n", __func__); + return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1; +} + +void hpfs_add_pos(struct inode *inode, loff_t *pos) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + int i = 0; + loff_t **ppos; + + if (hpfs_inode->i_rddir_off) + for (; hpfs_inode->i_rddir_off[i]; i++) + if (hpfs_inode->i_rddir_off[i] == pos) return; + if (!(i&0x0f)) { + if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) { + pr_err("out of memory for position list\n"); + return; + } + if (hpfs_inode->i_rddir_off) { + memcpy(ppos, hpfs_inode->i_rddir_off, i * sizeof(loff_t)); + kfree(hpfs_inode->i_rddir_off); + } + hpfs_inode->i_rddir_off = ppos; + } + hpfs_inode->i_rddir_off[i] = pos; + hpfs_inode->i_rddir_off[i + 1] = NULL; +} + +void hpfs_del_pos(struct inode *inode, loff_t *pos) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + loff_t **i, **j; + + if (!hpfs_inode->i_rddir_off) goto not_f; + for (i = hpfs_inode->i_rddir_off; *i; i++) if (*i == pos) goto fnd; + goto not_f; + fnd: + for (j = i + 1; *j; j++) ; + *i = *(j - 1); + *(j - 1) = NULL; + if (j - 1 == hpfs_inode->i_rddir_off) { + kfree(hpfs_inode->i_rddir_off); + hpfs_inode->i_rddir_off = NULL; + } + return; + not_f: + /*pr_warn("position pointer %p->%08x not found\n", + pos, (int)*pos);*/ + return; +} + +static void for_all_poss(struct inode *inode, void (*f)(loff_t *, loff_t, loff_t), + loff_t p1, loff_t p2) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + loff_t **i; + + if (!hpfs_inode->i_rddir_off) return; + for (i = hpfs_inode->i_rddir_off; *i; i++) (*f)(*i, p1, p2); + return; +} + +static void hpfs_pos_subst(loff_t *p, loff_t f, loff_t t) +{ + if (*p == f) *p = t; +} + +/*void hpfs_hpfs_pos_substd(loff_t *p, loff_t f, loff_t t) +{ + if ((*p & ~0x3f) == (f & ~0x3f)) *p = (t & ~0x3f) | (*p & 0x3f); +}*/ + +static void hpfs_pos_ins(loff_t *p, loff_t d, loff_t c) +{ + if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { + int n = (*p & 0x3f) + c; + if (n > 0x3f) + pr_err("%s(): %08x + %d\n", + __func__, (int)*p, (int)c >> 8); + else + *p = (*p & ~0x3f) | n; + } +} + +static void hpfs_pos_del(loff_t *p, loff_t d, loff_t c) +{ + if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { + int n = (*p & 0x3f) - c; + if (n < 1) + pr_err("%s(): %08x - %d\n", + __func__, (int)*p, (int)c >> 8); + else + *p = (*p & ~0x3f) | n; + } +} + +static struct hpfs_dirent *dnode_pre_last_de(struct dnode *d) +{ + struct hpfs_dirent *de, *de_end, *dee = NULL, *deee = NULL; + de_end = dnode_end_de(d); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + deee = dee; dee = de; + } + return deee; +} + +static struct hpfs_dirent *dnode_last_de(struct dnode *d) +{ + struct hpfs_dirent *de, *de_end, *dee = NULL; + de_end = dnode_end_de(d); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + dee = de; + } + return dee; +} + +static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno ptr) +{ + struct hpfs_dirent *de; + if (!(de = dnode_last_de(d))) { + hpfs_error(s, "set_last_pointer: empty dnode %08x", le32_to_cpu(d->self)); + return; + } + if (hpfs_sb(s)->sb_chk) { + if (de->down) { + hpfs_error(s, "set_last_pointer: dnode %08x has already last pointer %08x", + le32_to_cpu(d->self), de_down_pointer(de)); + return; + } + if (le16_to_cpu(de->length) != 32) { + hpfs_error(s, "set_last_pointer: bad last dirent in dnode %08x", le32_to_cpu(d->self)); + return; + } + } + if (ptr) { + le32_add_cpu(&d->first_free, 4); + if (le32_to_cpu(d->first_free) > 2048) { + hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self)); + le32_add_cpu(&d->first_free, -4); + return; + } + de->length = cpu_to_le16(36); + de->down = 1; + *(__le32 *)((char *)de + 32) = cpu_to_le32(ptr); + } +} + +/* Add an entry to dnode and don't care if it grows over 2048 bytes */ + +struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, + const unsigned char *name, + unsigned namelen, secno down_ptr) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + unsigned d_size = de_size(namelen, down_ptr); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + int c = hpfs_compare_names(s, name, namelen, de->name, de->namelen, de->last); + if (!c) { + hpfs_error(s, "name (%c,%d) already exists in dnode %08x", *name, namelen, le32_to_cpu(d->self)); + return NULL; + } + if (c < 0) break; + } + memmove((char *)de + d_size, de, (char *)de_end - (char *)de); + memset(de, 0, d_size); + if (down_ptr) { + *(__le32 *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr); + de->down = 1; + } + de->length = cpu_to_le16(d_size); + de->not_8x3 = hpfs_is_name_long(name, namelen); + de->namelen = namelen; + memcpy(de->name, name, namelen); + le32_add_cpu(&d->first_free, d_size); + return de; +} + +/* Delete dirent and don't care about its subtree */ + +static void hpfs_delete_de(struct super_block *s, struct dnode *d, + struct hpfs_dirent *de) +{ + if (de->last) { + hpfs_error(s, "attempt to delete last dirent in dnode %08x", le32_to_cpu(d->self)); + return; + } + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - le16_to_cpu(de->length)); + memmove(de, de_next_de(de), le32_to_cpu(d->first_free) + (char *)d - (char *)de); +} + +static void fix_up_ptrs(struct super_block *s, struct dnode *d) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + dnode_secno dno = le32_to_cpu(d->self); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) + if (de->down) { + struct quad_buffer_head qbh; + struct dnode *dd; + if ((dd = hpfs_map_dnode(s, de_down_pointer(de), &qbh))) { + if (le32_to_cpu(dd->up) != dno || dd->root_dnode) { + dd->up = cpu_to_le32(dno); + dd->root_dnode = 0; + hpfs_mark_4buffers_dirty(&qbh); + } + hpfs_brelse4(&qbh); + } + } +} + +/* Add an entry to dnode and do dnode splitting if required */ + +static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, + const unsigned char *name, unsigned namelen, + struct hpfs_dirent *new_de, dnode_secno down_ptr) +{ + struct quad_buffer_head qbh, qbh1, qbh2; + struct dnode *d, *ad, *rd, *nd = NULL; + dnode_secno adno, rdno; + struct hpfs_dirent *de; + struct hpfs_dirent nde; + unsigned char *nname; + int h; + int pos; + struct buffer_head *bh; + struct fnode *fnode; + int c1, c2 = 0; + if (!(nname = kmalloc(256, GFP_NOFS))) { + pr_err("out of memory, can't add to dnode\n"); + return 1; + } + go_up: + if (namelen >= 256) { + hpfs_error(i->i_sb, "%s(): namelen == %d", __func__, namelen); + kfree(nd); + kfree(nname); + return 1; + } + if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) { + kfree(nd); + kfree(nname); + return 1; + } + go_up_a: + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_to_dnode")) { + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 1; + } + if (le32_to_cpu(d->first_free) + de_size(namelen, down_ptr) <= 2048) { + loff_t t; + copy_de(de=hpfs_add_de(i->i_sb, d, name, namelen, down_ptr), new_de); + t = get_pos(d, de); + for_all_poss(i, hpfs_pos_ins, t, 1); + for_all_poss(i, hpfs_pos_subst, 4, t); + for_all_poss(i, hpfs_pos_subst, 5, t + 1); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 0; + } + if (!nd) if (!(nd = kmalloc(0x924, GFP_NOFS))) { + /* 0x924 is a max size of dnode after adding a dirent with + max name length. We alloc this only once. There must + not be any error while splitting dnodes, otherwise the + whole directory, not only file we're adding, would + be lost. */ + pr_err("out of memory for dnode splitting\n"); + hpfs_brelse4(&qbh); + kfree(nname); + return 1; + } + memcpy(nd, d, le32_to_cpu(d->first_free)); + copy_de(de = hpfs_add_de(i->i_sb, nd, name, namelen, down_ptr), new_de); + for_all_poss(i, hpfs_pos_ins, get_pos(nd, de), 1); + h = ((char *)dnode_last_de(nd) - (char *)nd) / 2 + 10; + if (!(ad = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &adno, &qbh1))) { + hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 1; + } + i->i_size += 2048; + i->i_blocks += 4; + pos = 1; + for (de = dnode_first_de(nd); (char *)de_next_de(de) - (char *)nd < h; de = de_next_de(de)) { + copy_de(hpfs_add_de(i->i_sb, ad, de->name, de->namelen, de->down ? de_down_pointer(de) : 0), de); + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, ((loff_t)adno << 4) | pos); + pos++; + } + copy_de(new_de = &nde, de); + memcpy(nname, de->name, de->namelen); + name = nname; + namelen = de->namelen; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); + down_ptr = adno; + set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); + de = de_next_de(de); + memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de); + le32_add_cpu(&nd->first_free, -((char *)de - (char *)nd - 20)); + memcpy(d, nd, le32_to_cpu(nd->first_free)); + for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos); + fix_up_ptrs(i->i_sb, ad); + if (!d->root_dnode) { + ad->up = d->up; + dno = le32_to_cpu(ad->up); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + goto go_up; + } + if (!(rd = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &rdno, &qbh2))) { + hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); + hpfs_brelse4(&qbh); + hpfs_brelse4(&qbh1); + kfree(nd); + kfree(nname); + return 1; + } + i->i_size += 2048; + i->i_blocks += 4; + rd->root_dnode = 1; + rd->up = d->up; + if (!(fnode = hpfs_map_fnode(i->i_sb, le32_to_cpu(d->up), &bh))) { + hpfs_free_dnode(i->i_sb, rdno); + hpfs_brelse4(&qbh); + hpfs_brelse4(&qbh1); + hpfs_brelse4(&qbh2); + kfree(nd); + kfree(nname); + return 1; + } + fnode->u.external[0].disk_secno = cpu_to_le32(rdno); + mark_buffer_dirty(bh); + brelse(bh); + hpfs_i(i)->i_dno = rdno; + d->up = ad->up = cpu_to_le32(rdno); + d->root_dnode = ad->root_dnode = 0; + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + qbh = qbh2; + set_last_pointer(i->i_sb, rd, dno); + dno = rdno; + d = rd; + goto go_up_a; +} + +/* + * Add an entry to directory btree. + * I hate such crazy directory structure. + * It's easy to read but terrible to write. + * I wrote this directory code 4 times. + * I hope, now it's finally bug-free. + */ + +int hpfs_add_dirent(struct inode *i, + const unsigned char *name, unsigned namelen, + struct hpfs_dirent *new_de) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct dnode *d; + struct hpfs_dirent *de, *de_end; + struct quad_buffer_head qbh; + dnode_secno dno; + int c; + int c1, c2 = 0; + dno = hpfs_inode->i_dno; + down: + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_dirent")) return 1; + if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 1; + de_end = dnode_end_de(d); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + if (!(c = hpfs_compare_names(i->i_sb, name, namelen, de->name, de->namelen, de->last))) { + hpfs_brelse4(&qbh); + return -1; + } + if (c < 0) { + if (de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto down; + } + break; + } + } + hpfs_brelse4(&qbh); + if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_ADD)) { + c = 1; + goto ret; + } + i->i_version++; + c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0); + ret: + return c; +} + +/* + * Find dirent with higher name in 'from' subtree and move it to 'to' dnode. + * Return the dnode we moved from (to be checked later if it's empty) + */ + +static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) +{ + dnode_secno dno, ddno; + dnode_secno chk_up = to; + struct dnode *dnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de, *nde; + int a; + loff_t t; + int c1, c2 = 0; + dno = from; + while (1) { + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "move_to_top")) + return 0; + if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 0; + if (hpfs_sb(i->i_sb)->sb_chk) { + if (le32_to_cpu(dnode->up) != chk_up) { + hpfs_error(i->i_sb, "move_to_top: up pointer from %08x should be %08x, is %08x", + dno, chk_up, le32_to_cpu(dnode->up)); + hpfs_brelse4(&qbh); + return 0; + } + chk_up = dno; + } + if (!(de = dnode_last_de(dnode))) { + hpfs_error(i->i_sb, "move_to_top: dnode %08x has no last de", dno); + hpfs_brelse4(&qbh); + return 0; + } + if (!de->down) break; + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + } + while (!(de = dnode_pre_last_de(dnode))) { + dnode_secno up = le32_to_cpu(dnode->up); + hpfs_brelse4(&qbh); + hpfs_free_dnode(i->i_sb, dno); + i->i_size -= 2048; + i->i_blocks -= 4; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, 5); + if (up == to) return to; + if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return 0; + if (dnode->root_dnode) { + hpfs_error(i->i_sb, "move_to_top: got to root_dnode while moving from %08x to %08x", from, to); + hpfs_brelse4(&qbh); + return 0; + } + de = dnode_last_de(dnode); + if (!de || !de->down) { + hpfs_error(i->i_sb, "move_to_top: dnode %08x doesn't point down to %08x", up, dno); + hpfs_brelse4(&qbh); + return 0; + } + le32_add_cpu(&dnode->first_free, -4); + le16_add_cpu(&de->length, -4); + de->down = 0; + hpfs_mark_4buffers_dirty(&qbh); + dno = up; + } + t = get_pos(dnode, de); + for_all_poss(i, hpfs_pos_subst, t, 4); + for_all_poss(i, hpfs_pos_subst, t + 1, 5); + if (!(nde = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { + hpfs_error(i->i_sb, "out of memory for dirent - directory will be corrupted"); + hpfs_brelse4(&qbh); + return 0; + } + memcpy(nde, de, le16_to_cpu(de->length)); + ddno = de->down ? de_down_pointer(de) : 0; + hpfs_delete_de(i->i_sb, dnode, de); + set_last_pointer(i->i_sb, dnode, ddno); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + a = hpfs_add_to_dnode(i, to, nde->name, nde->namelen, nde, from); + kfree(nde); + if (a) return 0; + return dno; +} + +/* + * Check if a dnode is empty and delete it from the tree + * (chkdsk doesn't like empty dnodes) + */ + +static void delete_empty_dnode(struct inode *i, dnode_secno dno) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct quad_buffer_head qbh; + struct dnode *dnode; + dnode_secno down, up, ndown; + int p; + struct hpfs_dirent *de; + int c1, c2 = 0; + try_it_again: + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "delete_empty_dnode")) return; + if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return; + if (le32_to_cpu(dnode->first_free) > 56) goto end; + if (le32_to_cpu(dnode->first_free) == 52 || le32_to_cpu(dnode->first_free) == 56) { + struct hpfs_dirent *de_end; + int root = dnode->root_dnode; + up = le32_to_cpu(dnode->up); + de = dnode_first_de(dnode); + down = de->down ? de_down_pointer(de) : 0; + if (hpfs_sb(i->i_sb)->sb_chk) if (root && !down) { + hpfs_error(i->i_sb, "delete_empty_dnode: root dnode %08x is empty", dno); + goto end; + } + hpfs_brelse4(&qbh); + hpfs_free_dnode(i->i_sb, dno); + i->i_size -= 2048; + i->i_blocks -= 4; + if (root) { + struct fnode *fnode; + struct buffer_head *bh; + struct dnode *d1; + struct quad_buffer_head qbh1; + if (hpfs_sb(i->i_sb)->sb_chk) + if (up != i->i_ino) { + hpfs_error(i->i_sb, + "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", + dno, up, + (unsigned long)i->i_ino); + return; + } + if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { + d1->up = cpu_to_le32(up); + d1->root_dnode = 1; + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + if ((fnode = hpfs_map_fnode(i->i_sb, up, &bh))) { + fnode->u.external[0].disk_secno = cpu_to_le32(down); + mark_buffer_dirty(bh); + brelse(bh); + } + hpfs_inode->i_dno = down; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, (loff_t) 12); + return; + } + if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return; + p = 1; + de_end = dnode_end_de(dnode); + for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de), p++) + if (de->down) if (de_down_pointer(de) == dno) goto fnd; + hpfs_error(i->i_sb, "delete_empty_dnode: pointer to dnode %08x not found in dnode %08x", dno, up); + goto end; + fnd: + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p); + if (!down) { + de->down = 0; + le16_add_cpu(&de->length, -4); + le32_add_cpu(&dnode->first_free, -4); + memmove(de_next_de(de), (char *)de_next_de(de) + 4, + (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de)); + } else { + struct dnode *d1; + struct quad_buffer_head qbh1; + *(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4) = down; + if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { + d1->up = cpu_to_le32(up); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + } + } else { + hpfs_error(i->i_sb, "delete_empty_dnode: dnode %08x, first_free == %03x", dno, le32_to_cpu(dnode->first_free)); + goto end; + } + + if (!de->last) { + struct hpfs_dirent *de_next = de_next_de(de); + struct hpfs_dirent *de_cp; + struct dnode *d1; + struct quad_buffer_head qbh1; + if (!de_next->down) goto endm; + ndown = de_down_pointer(de_next); + if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { + pr_err("out of memory for dtree balancing\n"); + goto endm; + } + memcpy(de_cp, de, le16_to_cpu(de->length)); + hpfs_delete_de(i->i_sb, dnode, de); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, 4); + for_all_poss(i, hpfs_pos_del, ((loff_t)up << 4) | p, 1); + if (de_cp->down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de_cp), &qbh1))) { + d1->up = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, de_cp->down ? de_down_pointer(de_cp) : 0); + /*pr_info("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", + up, ndown, down, dno);*/ + dno = up; + kfree(de_cp); + goto try_it_again; + } else { + struct hpfs_dirent *de_prev = dnode_pre_last_de(dnode); + struct hpfs_dirent *de_cp; + struct dnode *d1; + struct quad_buffer_head qbh1; + dnode_secno dlp; + if (!de_prev) { + hpfs_error(i->i_sb, "delete_empty_dnode: empty dnode %08x", up); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + dno = up; + goto try_it_again; + } + if (!de_prev->down) goto endm; + ndown = de_down_pointer(de_prev); + if ((d1 = hpfs_map_dnode(i->i_sb, ndown, &qbh1))) { + struct hpfs_dirent *del = dnode_last_de(d1); + dlp = del->down ? de_down_pointer(del) : 0; + if (!dlp && down) { + if (le32_to_cpu(d1->first_free) > 2044) { + if (hpfs_sb(i->i_sb)->sb_chk >= 2) { + pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n"); + pr_err("terminating balancing operation\n"); + } + hpfs_brelse4(&qbh1); + goto endm; + } + if (hpfs_sb(i->i_sb)->sb_chk >= 2) { + pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n"); + pr_err("goin'on\n"); + } + le16_add_cpu(&del->length, 4); + del->down = 1; + le32_add_cpu(&d1->first_free, 4); + } + if (dlp && !down) { + le16_add_cpu(&del->length, -4); + del->down = 0; + le32_add_cpu(&d1->first_free, -4); + } else if (down) + *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); + } else goto endm; + if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { + pr_err("out of memory for dtree balancing\n"); + hpfs_brelse4(&qbh1); + goto endm; + } + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length)); + hpfs_delete_de(i->i_sb, dnode, de_prev); + if (!de_prev->down) { + le16_add_cpu(&de_prev->length, 4); + de_prev->down = 1; + le32_add_cpu(&dnode->first_free, 4); + } + *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, ((loff_t)up << 4) | (p - 1)); + if (down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de), &qbh1))) { + d1->up = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, dlp); + dno = up; + kfree(de_cp); + goto try_it_again; + } + endm: + hpfs_mark_4buffers_dirty(&qbh); + end: + hpfs_brelse4(&qbh); +} + + +/* Delete dirent from directory */ + +int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, + struct quad_buffer_head *qbh, int depth) +{ + struct dnode *dnode = qbh->data; + dnode_secno down = 0; + loff_t t; + if (de->first || de->last) { + hpfs_error(i->i_sb, "hpfs_remove_dirent: attempt to delete first or last dirent in dnode %08x", dno); + hpfs_brelse4(qbh); + return 1; + } + if (de->down) down = de_down_pointer(de); + if (depth && (de->down || (de == dnode_first_de(dnode) && de_next_de(de)->last))) { + if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_DEL)) { + hpfs_brelse4(qbh); + return 2; + } + } + i->i_version++; + for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1); + hpfs_delete_de(i->i_sb, dnode, de); + hpfs_mark_4buffers_dirty(qbh); + hpfs_brelse4(qbh); + if (down) { + dnode_secno a = move_to_top(i, down, dno); + for_all_poss(i, hpfs_pos_subst, 5, t); + if (a) delete_empty_dnode(i, a); + return !a; + } + delete_empty_dnode(i, dno); + return 0; +} + +void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes, + int *n_subdirs, int *n_items) +{ + struct dnode *dnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + dnode_secno ptr, odno = 0; + int c1, c2 = 0; + int d1, d2 = 0; + go_down: + if (n_dnodes) (*n_dnodes)++; + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, dno, &c1, &c2, "hpfs_count_dnodes #1")) return; + ptr = 0; + go_up: + if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; + if (hpfs_sb(s)->sb_chk) if (odno && odno != -1 && le32_to_cpu(dnode->up) != odno) + hpfs_error(s, "hpfs_count_dnodes: bad up pointer; dnode %08x, down %08x points to %08x", odno, dno, le32_to_cpu(dnode->up)); + de = dnode_first_de(dnode); + if (ptr) while(1) { + if (de->down) if (de_down_pointer(de) == ptr) goto process_de; + if (de->last) { + hpfs_brelse4(&qbh); + hpfs_error(s, "hpfs_count_dnodes: pointer to dnode %08x not found in dnode %08x, got here from %08x", + ptr, dno, odno); + return; + } + de = de_next_de(de); + } + next_de: + if (de->down) { + odno = dno; + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto go_down; + } + process_de: + if (!de->first && !de->last && de->directory && n_subdirs) (*n_subdirs)++; + if (!de->first && !de->last && n_items) (*n_items)++; + if ((de = de_next_de(de)) < dnode_end_de(dnode)) goto next_de; + ptr = dno; + dno = le32_to_cpu(dnode->up); + if (dnode->root_dnode) { + hpfs_brelse4(&qbh); + return; + } + hpfs_brelse4(&qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ptr, &d1, &d2, "hpfs_count_dnodes #2")) return; + odno = -1; + goto go_up; +} + +static struct hpfs_dirent *map_nth_dirent(struct super_block *s, dnode_secno dno, int n, + struct quad_buffer_head *qbh, struct dnode **dn) +{ + int i; + struct hpfs_dirent *de, *de_end; + struct dnode *dnode; + dnode = hpfs_map_dnode(s, dno, qbh); + if (!dnode) return NULL; + if (dn) *dn=dnode; + de = dnode_first_de(dnode); + de_end = dnode_end_de(dnode); + for (i = 1; de < de_end; i++, de = de_next_de(de)) { + if (i == n) { + return de; + } + if (de->last) break; + } + hpfs_brelse4(qbh); + hpfs_error(s, "map_nth_dirent: n too high; dnode = %08x, requested %08x", dno, n); + return NULL; +} + +dnode_secno hpfs_de_as_down_as_possible(struct super_block *s, dnode_secno dno) +{ + struct quad_buffer_head qbh; + dnode_secno d = dno; + dnode_secno up = 0; + struct hpfs_dirent *de; + int c1, c2 = 0; + + again: + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, d, &c1, &c2, "hpfs_de_as_down_as_possible")) + return d; + if (!(de = map_nth_dirent(s, d, 1, &qbh, NULL))) return dno; + if (hpfs_sb(s)->sb_chk) + if (up && le32_to_cpu(((struct dnode *)qbh.data)->up) != up) + hpfs_error(s, "hpfs_de_as_down_as_possible: bad up pointer; dnode %08x, down %08x points to %08x", up, d, le32_to_cpu(((struct dnode *)qbh.data)->up)); + if (!de->down) { + hpfs_brelse4(&qbh); + return d; + } + up = d; + d = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto again; +} + +struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp, + struct quad_buffer_head *qbh) +{ + loff_t pos; + unsigned c; + dnode_secno dno; + struct hpfs_dirent *de, *d; + struct hpfs_dirent *up_de; + struct hpfs_dirent *end_up_de; + struct dnode *dnode; + struct dnode *up_dnode; + struct quad_buffer_head qbh0; + + pos = *posp; + dno = pos >> 6 << 2; + pos &= 077; + if (!(de = map_nth_dirent(inode->i_sb, dno, pos, qbh, &dnode))) + goto bail; + + /* Going to the next dirent */ + if ((d = de_next_de(de)) < dnode_end_de(dnode)) { + if (!(++*posp & 077)) { + hpfs_error(inode->i_sb, + "map_pos_dirent: pos crossed dnode boundary; pos = %08llx", + (unsigned long long)*posp); + goto bail; + } + /* We're going down the tree */ + if (d->down) { + *posp = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, de_down_pointer(d)) << 4) + 1; + } + + return de; + } + + /* Going up */ + if (dnode->root_dnode) goto bail; + + if (!(up_dnode = hpfs_map_dnode(inode->i_sb, le32_to_cpu(dnode->up), &qbh0))) + goto bail; + + end_up_de = dnode_end_de(up_dnode); + c = 0; + for (up_de = dnode_first_de(up_dnode); up_de < end_up_de; + up_de = de_next_de(up_de)) { + if (!(++c & 077)) hpfs_error(inode->i_sb, + "map_pos_dirent: pos crossed dnode boundary; dnode = %08x", le32_to_cpu(dnode->up)); + if (up_de->down && de_down_pointer(up_de) == dno) { + *posp = ((loff_t) le32_to_cpu(dnode->up) << 4) + c; + hpfs_brelse4(&qbh0); + return de; + } + } + + hpfs_error(inode->i_sb, "map_pos_dirent: pointer to dnode %08x not found in parent dnode %08x", + dno, le32_to_cpu(dnode->up)); + hpfs_brelse4(&qbh0); + + bail: + *posp = 12; + return de; +} + +/* Find a dirent in tree */ + +struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, + const unsigned char *name, unsigned len, + dnode_secno *dd, struct quad_buffer_head *qbh) +{ + struct dnode *dnode; + struct hpfs_dirent *de; + struct hpfs_dirent *de_end; + int c1, c2 = 0; + + if (!S_ISDIR(inode->i_mode)) hpfs_error(inode->i_sb, "map_dirent: not a directory\n"); + again: + if (hpfs_sb(inode->i_sb)->sb_chk) + if (hpfs_stop_cycles(inode->i_sb, dno, &c1, &c2, "map_dirent")) return NULL; + if (!(dnode = hpfs_map_dnode(inode->i_sb, dno, qbh))) return NULL; + + de_end = dnode_end_de(dnode); + for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de)) { + int t = hpfs_compare_names(inode->i_sb, name, len, de->name, de->namelen, de->last); + if (!t) { + if (dd) *dd = dno; + return de; + } + if (t < 0) { + if (de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(qbh); + goto again; + } + break; + } + } + hpfs_brelse4(qbh); + return NULL; +} + +/* + * Remove empty directory. In normal cases it is only one dnode with two + * entries, but we must handle also such obscure cases when it's a tree + * of empty dnodes. + */ + +void hpfs_remove_dtree(struct super_block *s, dnode_secno dno) +{ + struct quad_buffer_head qbh; + struct dnode *dnode; + struct hpfs_dirent *de; + dnode_secno d1, d2, rdno = dno; + while (1) { + if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; + de = dnode_first_de(dnode); + if (de->last) { + if (de->down) d1 = de_down_pointer(de); + else goto error; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + dno = d1; + } else break; + } + if (!de->first) goto error; + d1 = de->down ? de_down_pointer(de) : 0; + de = de_next_de(de); + if (!de->last) goto error; + d2 = de->down ? de_down_pointer(de) : 0; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + do { + while (d1) { + if (!(dnode = hpfs_map_dnode(s, dno = d1, &qbh))) return; + de = dnode_first_de(dnode); + if (!de->last) goto error; + d1 = de->down ? de_down_pointer(de) : 0; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + } + d1 = d2; + d2 = 0; + } while (d1); + return; + error: + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + hpfs_error(s, "directory %08x is corrupted or not empty", rdno); +} + +/* + * Find dirent for specified fnode. Use truncated 15-char name in fnode as + * a help for searching. + */ + +struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, + struct fnode *f, struct quad_buffer_head *qbh) +{ + unsigned char *name1; + unsigned char *name2; + int name1len, name2len; + struct dnode *d; + dnode_secno dno, downd; + struct fnode *upf; + struct buffer_head *bh; + struct hpfs_dirent *de, *de_end; + int c; + int c1, c2 = 0; + int d1, d2 = 0; + name1 = f->name; + if (!(name2 = kmalloc(256, GFP_NOFS))) { + pr_err("out of memory, can't map dirent\n"); + return NULL; + } + if (f->len <= 15) + memcpy(name2, name1, name1len = name2len = f->len); + else { + memcpy(name2, name1, 15); + memset(name2 + 15, 0xff, 256 - 15); + /*name2[15] = 0xff;*/ + name1len = 15; name2len = 256; + } + if (!(upf = hpfs_map_fnode(s, le32_to_cpu(f->up), &bh))) { + kfree(name2); + return NULL; + } + if (!fnode_is_dir(upf)) { + brelse(bh); + hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up)); + kfree(name2); + return NULL; + } + dno = le32_to_cpu(upf->u.external[0].disk_secno); + brelse(bh); + go_down: + downd = 0; + go_up: + if (!(d = hpfs_map_dnode(s, dno, qbh))) { + kfree(name2); + return NULL; + } + de_end = dnode_end_de(d); + de = dnode_first_de(d); + if (downd) { + while (de < de_end) { + if (de->down) if (de_down_pointer(de) == downd) goto f; + de = de_next_de(de); + } + hpfs_error(s, "pointer to dnode %08x not found in dnode %08x", downd, dno); + hpfs_brelse4(qbh); + kfree(name2); + return NULL; + } + next_de: + if (le32_to_cpu(de->fnode) == fno) { + kfree(name2); + return de; + } + c = hpfs_compare_names(s, name1, name1len, de->name, de->namelen, de->last); + if (c < 0 && de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { + kfree(name2); + return NULL; + } + goto go_down; + } + f: + if (le32_to_cpu(de->fnode) == fno) { + kfree(name2); + return de; + } + c = hpfs_compare_names(s, name2, name2len, de->name, de->namelen, de->last); + if (c < 0 && !de->last) goto not_found; + if ((de = de_next_de(de)) < de_end) goto next_de; + if (d->root_dnode) goto not_found; + downd = dno; + dno = le32_to_cpu(d->up); + hpfs_brelse4(qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, downd, &d1, &d2, "map_fnode_dirent #2")) { + kfree(name2); + return NULL; + } + goto go_up; + not_found: + hpfs_brelse4(qbh); + hpfs_error(s, "dirent for fnode %08x not found", fno); + kfree(name2); + return NULL; +} diff --git a/kernel/fs/hpfs/ea.c b/kernel/fs/hpfs/ea.c new file mode 100644 index 000000000..ce3f98ba9 --- /dev/null +++ b/kernel/fs/hpfs/ea.c @@ -0,0 +1,367 @@ +/* + * linux/fs/hpfs/ea.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling extended attributes + */ + +#include "hpfs_fn.h" + +/* Remove external extended attributes. ano specifies whether a is a + direct sector where eas starts or an anode */ + +void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) +{ + unsigned pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + struct extended_attribute *ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; + if (ea_indirect(ea)) { + if (ea_valuelen(ea) != 8) { + hpfs_error(s, "ea_indirect(ea) set while ea->valuelen!=8, %s %08x, pos %08x", + ano ? "anode" : "sectors", a, pos); + return; + } + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4)) + return; + hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea)); + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + if (!ano) hpfs_free_sectors(s, a, (len+511) >> 9); + else { + struct buffer_head *bh; + struct anode *anode; + if ((anode = hpfs_map_anode(s, a, &bh))) { + hpfs_remove_btree(s, &anode->btree); + brelse(bh); + hpfs_free_sectors(s, a, 1); + } + } +} + +static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size) +{ + char *ret; + if (!(ret = kmalloc(size + 1, GFP_NOFS))) { + pr_err("out of memory for EA\n"); + return NULL; + } + if (hpfs_ea_read(s, a, ano, 0, size, ret)) { + kfree(ret); + return NULL; + } + ret[size] = 0; + return ret; +} + +static void set_indirect_ea(struct super_block *s, int ano, secno a, + const char *data, int size) +{ + hpfs_ea_write(s, a, ano, 0, size, data); +} + +/* Read an extended attribute named 'key' into the provided buffer */ + +int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, + char *buf, int size) +{ + unsigned pos; + int ano, len; + secno a; + char ex[4 + 255 + 1 + 8]; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea_indirect(ea)) + goto indirect; + if (ea_valuelen(ea) >= size) + return -EINVAL; + memcpy(buf, ea_data(ea), ea_valuelen(ea)); + buf[ea_valuelen(ea)] = 0; + return 0; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode_in_anode(fnode); + pos = 0; + while (pos < len) { + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return -EIO; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4)) + return -EIO; + if (!strcmp(ea->name, key)) { + if (ea_indirect(ea)) + goto indirect; + if (ea_valuelen(ea) >= size) + return -EINVAL; + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), buf)) + return -EIO; + buf[ea_valuelen(ea)] = 0; + return 0; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + return -ENOENT; +indirect: + if (ea_len(ea) >= size) + return -EINVAL; + if (hpfs_ea_read(s, ea_sec(ea), ea_in_anode(ea), 0, ea_len(ea), buf)) + return -EIO; + buf[ea_len(ea)] = 0; + return 0; +} + +/* Read an extended attribute named 'key' */ +char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *size) +{ + char *ret; + unsigned pos; + int ano, len; + secno a; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea_indirect(ea)) + return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea)); + if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { + pr_err("out of memory for EA\n"); + return NULL; + } + memcpy(ret, ea_data(ea), ea_valuelen(ea)); + ret[ea_valuelen(ea)] = 0; + return ret; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode_in_anode(fnode); + pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return NULL; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4)) + return NULL; + if (!strcmp(ea->name, key)) { + if (ea_indirect(ea)) + return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea)); + if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { + pr_err("out of memory for EA\n"); + return NULL; + } + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), ret)) { + kfree(ret); + return NULL; + } + ret[ea_valuelen(ea)] = 0; + return ret; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + return NULL; +} + +/* + * Update or create extended attribute 'key' with value 'data'. Note that + * when this ea exists, it MUST have the same size as size of data. + * This driver can't change sizes of eas ('cause I just don't need it). + */ + +void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, + const char *data, int size) +{ + fnode_secno fno = inode->i_ino; + struct super_block *s = inode->i_sb; + unsigned pos; + int ano, len; + secno a; + unsigned char h[4]; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea_indirect(ea)) { + if (ea_len(ea) == size) + set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size); + } else if (ea_valuelen(ea) == size) { + memcpy(ea_data(ea), data, size); + } + return; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode_in_anode(fnode); + pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4)) + return; + if (!strcmp(ea->name, key)) { + if (ea_indirect(ea)) { + if (ea_len(ea) == size) + set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size); + } + else { + if (ea_valuelen(ea) == size) + hpfs_ea_write(s, a, ano, pos + 4 + ea->namelen + 1, size, data); + } + return; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + if (!le16_to_cpu(fnode->ea_offs)) { + /*if (le16_to_cpu(fnode->ea_size_s)) { + hpfs_error(s, "fnode %08x: ea_size_s == %03x, ea_offs == 0", + inode->i_ino, le16_to_cpu(fnode->ea_size_s)); + return; + }*/ + fnode->ea_offs = cpu_to_le16(0xc4); + } + if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) { + hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x", + (unsigned long)inode->i_ino, + le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); + return; + } + if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) && + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5 <= 0x200) { + ea = fnode_end_ea(fnode); + *(char *)ea = 0; + ea->namelen = strlen(key); + ea->valuelen_lo = size; + ea->valuelen_hi = size >> 8; + strcpy(ea->name, key); + memcpy(ea_data(ea), data, size); + fnode->ea_size_s = cpu_to_le16(le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5); + goto ret; + } + /* Most the code here is 99.9993422% unused. I hope there are no bugs. + But what .. HPFS.IFS has also bugs in ea management. */ + if (le16_to_cpu(fnode->ea_size_s) && !le32_to_cpu(fnode->ea_size_l)) { + secno n; + struct buffer_head *bh; + char *data; + if (!(n = hpfs_alloc_sector(s, fno, 1, 0))) return; + if (!(data = hpfs_get_sector(s, n, &bh))) { + hpfs_free_sectors(s, n, 1); + return; + } + memcpy(data, fnode_ea(fnode), le16_to_cpu(fnode->ea_size_s)); + fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s)); + fnode->ea_size_s = cpu_to_le16(0); + fnode->ea_secno = cpu_to_le32(n); + fnode->flags &= ~FNODE_anode; + mark_buffer_dirty(bh); + brelse(bh); + } + pos = le32_to_cpu(fnode->ea_size_l) + 5 + strlen(key) + size; + len = (le32_to_cpu(fnode->ea_size_l) + 511) >> 9; + if (pos >= 30000) goto bail; + while (((pos + 511) >> 9) > len) { + if (!len) { + secno q = hpfs_alloc_sector(s, fno, 1, 0); + if (!q) goto bail; + fnode->ea_secno = cpu_to_le32(q); + fnode->flags &= ~FNODE_anode; + len++; + } else if (!fnode_in_anode(fnode)) { + if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) { + len++; + } else { + /* Aargh... don't know how to create ea anodes :-( */ + /*struct buffer_head *bh; + struct anode *anode; + anode_secno a_s; + if (!(anode = hpfs_alloc_anode(s, fno, &a_s, &bh))) + goto bail; + anode->up = cpu_to_le32(fno); + anode->btree.fnode_parent = 1; + anode->btree.n_free_nodes--; + anode->btree.n_used_nodes++; + anode->btree.first_free = cpu_to_le16(le16_to_cpu(anode->btree.first_free) + 12); + anode->u.external[0].disk_secno = cpu_to_le32(le32_to_cpu(fnode->ea_secno)); + anode->u.external[0].file_secno = cpu_to_le32(0); + anode->u.external[0].length = cpu_to_le32(len); + mark_buffer_dirty(bh); + brelse(bh); + fnode->flags |= FNODE_anode; + fnode->ea_secno = cpu_to_le32(a_s);*/ + secno new_sec; + int i; + if (!(new_sec = hpfs_alloc_sector(s, fno, 1, 1 - ((pos + 511) >> 9)))) + goto bail; + for (i = 0; i < len; i++) { + struct buffer_head *bh1, *bh2; + void *b1, *b2; + if (!(b1 = hpfs_map_sector(s, le32_to_cpu(fnode->ea_secno) + i, &bh1, len - i - 1))) { + hpfs_free_sectors(s, new_sec, (pos + 511) >> 9); + goto bail; + } + if (!(b2 = hpfs_get_sector(s, new_sec + i, &bh2))) { + brelse(bh1); + hpfs_free_sectors(s, new_sec, (pos + 511) >> 9); + goto bail; + } + memcpy(b2, b1, 512); + brelse(bh1); + mark_buffer_dirty(bh2); + brelse(bh2); + } + hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno), len); + fnode->ea_secno = cpu_to_le32(new_sec); + len = (pos + 511) >> 9; + } + } + if (fnode_in_anode(fnode)) { + if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno), + 0, len) != -1) { + len++; + } else { + goto bail; + } + } + } + h[0] = 0; + h[1] = strlen(key); + h[2] = size & 0xff; + h[3] = size >> 8; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail; + fnode->ea_size_l = cpu_to_le32(pos); + ret: + hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size; + return; + bail: + if (le32_to_cpu(fnode->ea_secno)) + if (fnode_in_anode(fnode)) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9); + else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9)); + else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0); +} + diff --git a/kernel/fs/hpfs/file.c b/kernel/fs/hpfs/file.c new file mode 100644 index 000000000..6d8cfe9b5 --- /dev/null +++ b/kernel/fs/hpfs/file.c @@ -0,0 +1,211 @@ +/* + * linux/fs/hpfs/file.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * file VFS functions + */ + +#include "hpfs_fn.h" +#include + +#define BLOCKS(size) (((size) + 511) >> 9) + +static int hpfs_file_release(struct inode *inode, struct file *file) +{ + hpfs_lock(inode->i_sb); + hpfs_write_if_changed(inode); + hpfs_unlock(inode->i_sb); + return 0; +} + +int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + ret = filemap_write_and_wait_range(file->f_mapping, start, end); + if (ret) + return ret; + return sync_blockdev(inode->i_sb->s_bdev); +} + +/* + * generic_file_read often calls bmap with non-existing sector, + * so we must ignore such errors. + */ + +static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + unsigned n, disk_secno; + struct fnode *fnode; + struct buffer_head *bh; + if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0; + n = file_secno - hpfs_inode->i_file_sec; + if (n < hpfs_inode->i_n_secs) { + *n_secs = hpfs_inode->i_n_secs - n; + return hpfs_inode->i_disk_sec + n; + } + if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; + disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); + if (disk_secno == -1) return 0; + if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; + n = file_secno - hpfs_inode->i_file_sec; + if (n < hpfs_inode->i_n_secs) { + *n_secs = hpfs_inode->i_n_secs - n; + return hpfs_inode->i_disk_sec + n; + } + *n_secs = 1; + return disk_secno; +} + +void hpfs_truncate(struct inode *i) +{ + if (IS_IMMUTABLE(i)) return /*-EPERM*/; + hpfs_lock_assert(i->i_sb); + + hpfs_i(i)->i_n_secs = 0; + i->i_blocks = 1 + ((i->i_size + 511) >> 9); + hpfs_i(i)->mmu_private = i->i_size; + hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); + hpfs_write_inode(i); + hpfs_i(i)->i_n_secs = 0; +} + +static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + int r; + secno s; + unsigned n_secs; + hpfs_lock(inode->i_sb); + s = hpfs_bmap(inode, iblock, &n_secs); + if (s) { + if (bh_result->b_size >> 9 < n_secs) + n_secs = bh_result->b_size >> 9; + map_bh(bh_result, inode->i_sb, s); + bh_result->b_size = n_secs << 9; + goto ret_0; + } + if (!create) goto ret_0; + if (iblock<<9 != hpfs_i(inode)->mmu_private) { + BUG(); + r = -EIO; + goto ret_r; + } + if ((s = hpfs_add_sector_to_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1)) == -1) { + hpfs_truncate_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1); + r = -ENOSPC; + goto ret_r; + } + inode->i_blocks++; + hpfs_i(inode)->mmu_private += 512; + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, s); + ret_0: + r = 0; + ret_r: + hpfs_unlock(inode->i_sb); + return r; +} + +static int hpfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, hpfs_get_block); +} + +static int hpfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, hpfs_get_block, wbc); +} + +static int hpfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); +} + +static int hpfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, hpfs_get_block); +} + +static void hpfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + hpfs_lock(inode->i_sb); + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + hpfs_truncate(inode); + } + + hpfs_unlock(inode->i_sb); +} + +static int hpfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hpfs_get_block, + &hpfs_i(mapping->host)->mmu_private); + if (unlikely(ret)) + hpfs_write_failed(mapping, pos + len); + + return ret; +} + +static int hpfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + int err; + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + hpfs_write_failed(mapping, pos + len); + if (!(err < 0)) { + /* make sure we write it on close, if not earlier */ + hpfs_lock(inode->i_sb); + hpfs_i(inode)->i_dirty = 1; + hpfs_unlock(inode->i_sb); + } + return err; +} + +static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,hpfs_get_block); +} + +const struct address_space_operations hpfs_aops = { + .readpage = hpfs_readpage, + .writepage = hpfs_writepage, + .readpages = hpfs_readpages, + .writepages = hpfs_writepages, + .write_begin = hpfs_write_begin, + .write_end = hpfs_write_end, + .bmap = _hpfs_bmap +}; + +const struct file_operations hpfs_file_ops = +{ + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .release = hpfs_file_release, + .fsync = hpfs_file_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations hpfs_file_iops = +{ + .setattr = hpfs_setattr, +}; diff --git a/kernel/fs/hpfs/hpfs.h b/kernel/fs/hpfs/hpfs.h new file mode 100644 index 000000000..cce025aff --- /dev/null +++ b/kernel/fs/hpfs/hpfs.h @@ -0,0 +1,559 @@ +/* + * linux/fs/hpfs/hpfs.h + * + * HPFS structures by Chris Smith, 1993 + * + * a little bit modified by Mikulas Patocka, 1998-1999 + */ + +/* The paper + + Duncan, Roy + Design goals and implementation of the new High Performance File System + Microsoft Systems Journal Sept 1989 v4 n5 p1(13) + + describes what HPFS looked like when it was new, and it is the source + of most of the information given here. The rest is conjecture. + + For definitive information on the Duncan paper, see it, not this file. + For definitive information on HPFS, ask somebody else -- this is guesswork. + There are certain to be many mistakes. */ + +#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) +#error unknown endian +#endif + +/* Notation */ + +typedef u32 secno; /* sector number, partition relative */ + +typedef secno dnode_secno; /* sector number of a dnode */ +typedef secno fnode_secno; /* sector number of an fnode */ +typedef secno anode_secno; /* sector number of an anode */ + +typedef u32 time32_t; /* 32-bit time_t type */ + +/* sector 0 */ + +/* The boot block is very like a FAT boot block, except that the + 29h signature byte is 28h instead, and the ID string is "HPFS". */ + +#define BB_MAGIC 0xaa55 + +struct hpfs_boot_block +{ + u8 jmp[3]; + u8 oem_id[8]; + u8 bytes_per_sector[2]; /* 512 */ + u8 sectors_per_cluster; + u8 n_reserved_sectors[2]; + u8 n_fats; + u8 n_rootdir_entries[2]; + u8 n_sectors_s[2]; + u8 media_byte; + __le16 sectors_per_fat; + __le16 sectors_per_track; + __le16 heads_per_cyl; + __le32 n_hidden_sectors; + __le32 n_sectors_l; /* size of partition */ + u8 drive_number; + u8 mbz; + u8 sig_28h; /* 28h */ + u8 vol_serno[4]; + u8 vol_label[11]; + u8 sig_hpfs[8]; /* "HPFS " */ + u8 pad[448]; + __le16 magic; /* aa55 */ +}; + + +/* sector 16 */ + +/* The super block has the pointer to the root directory. */ + +#define SB_MAGIC 0xf995e849 + +struct hpfs_super_block +{ + __le32 magic; /* f995 e849 */ + __le32 magic1; /* fa53 e9c5, more magic? */ + u8 version; /* version of a filesystem usually 2 */ + u8 funcversion; /* functional version - oldest version + of filesystem that can understand + this disk */ + __le16 zero; /* 0 */ + __le32 root; /* fnode of root directory */ + __le32 n_sectors; /* size of filesystem */ + __le32 n_badblocks; /* number of bad blocks */ + __le32 bitmaps; /* pointers to free space bit maps */ + __le32 zero1; /* 0 */ + __le32 badblocks; /* bad block list */ + __le32 zero3; /* 0 */ + __le32 last_chkdsk; /* date last checked, 0 if never */ + __le32 last_optimize; /* date last optimized, 0 if never */ + __le32 n_dir_band; /* number of sectors in dir band */ + __le32 dir_band_start; /* first sector in dir band */ + __le32 dir_band_end; /* last sector in dir band */ + __le32 dir_band_bitmap; /* free space map, 1 dnode per bit */ + u8 volume_name[32]; /* not used */ + __le32 user_id_table; /* 8 preallocated sectors - user id */ + u32 zero6[103]; /* 0 */ +}; + + +/* sector 17 */ + +/* The spare block has pointers to spare sectors. */ + +#define SP_MAGIC 0xf9911849 + +struct hpfs_spare_block +{ + __le32 magic; /* f991 1849 */ + __le32 magic1; /* fa52 29c5, more magic? */ + +#ifdef __LITTLE_ENDIAN + u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ + u8 sparedir_used: 1; /* spare dirblks used */ + u8 hotfixes_used: 1; /* hotfixes used */ + u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ + u8 bad_bitmap: 1; /* bad bitmap */ + u8 fast: 1; /* partition was fast formatted */ + u8 old_wrote: 1; /* old version wrote to partion */ + u8 old_wrote_1: 1; /* old version wrote to partion (?) */ +#else + u8 old_wrote_1: 1; /* old version wrote to partion (?) */ + u8 old_wrote: 1; /* old version wrote to partion */ + u8 fast: 1; /* partition was fast formatted */ + u8 bad_bitmap: 1; /* bad bitmap */ + u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ + u8 hotfixes_used: 1; /* hotfixes used */ + u8 sparedir_used: 1; /* spare dirblks used */ + u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ +#endif + +#ifdef __LITTLE_ENDIAN + u8 install_dasd_limits: 1; /* HPFS386 flags */ + u8 resynch_dasd_limits: 1; + u8 dasd_limits_operational: 1; + u8 multimedia_active: 1; + u8 dce_acls_active: 1; + u8 dasd_limits_dirty: 1; + u8 flag67: 2; +#else + u8 flag67: 2; + u8 dasd_limits_dirty: 1; + u8 dce_acls_active: 1; + u8 multimedia_active: 1; + u8 dasd_limits_operational: 1; + u8 resynch_dasd_limits: 1; + u8 install_dasd_limits: 1; /* HPFS386 flags */ +#endif + + u8 mm_contlgulty; + u8 unused; + + __le32 hotfix_map; /* info about remapped bad sectors */ + __le32 n_spares_used; /* number of hotfixes */ + __le32 n_spares; /* number of spares in hotfix map */ + __le32 n_dnode_spares_free; /* spare dnodes unused */ + __le32 n_dnode_spares; /* length of spare_dnodes[] list, + follows in this block*/ + __le32 code_page_dir; /* code page directory block */ + __le32 n_code_pages; /* number of code pages */ + __le32 super_crc; /* on HPFS386 and LAN Server this is + checksum of superblock, on normal + OS/2 unused */ + __le32 spare_crc; /* on HPFS386 checksum of spareblock */ + __le32 zero1[15]; /* unused */ + __le32 spare_dnodes[100]; /* emergency free dnode list */ + __le32 zero2[1]; /* room for more? */ +}; + +/* The bad block list is 4 sectors long. The first word must be zero, + the remaining words give n_badblocks bad block numbers. + I bet you can see it coming... */ + +#define BAD_MAGIC 0 + +/* The hotfix map is 4 sectors long. It looks like + + secno from[n_spares]; + secno to[n_spares]; + + The to[] list is initialized to point to n_spares preallocated empty + sectors. The from[] list contains the sector numbers of bad blocks + which have been remapped to corresponding sectors in the to[] list. + n_spares_used gives the length of the from[] list. */ + + +/* Sectors 18 and 19 are preallocated and unused. + Maybe they're spares for 16 and 17, but simple substitution fails. */ + + +/* The code page info pointed to by the spare block consists of an index + block and blocks containing uppercasing tables. I don't know what + these are for (CHKDSK, maybe?) -- OS/2 does not seem to use them + itself. Linux doesn't use them either. */ + +/* block pointed to by spareblock->code_page_dir */ + +#define CP_DIR_MAGIC 0x494521f7 + +struct code_page_directory +{ + __le32 magic; /* 4945 21f7 */ + __le32 n_code_pages; /* number of pointers following */ + __le32 zero1[2]; + struct { + __le16 ix; /* index */ + __le16 code_page_number; /* code page number */ + __le32 bounds; /* matches corresponding word + in data block */ + __le32 code_page_data; /* sector number of a code_page_data + containing c.p. array */ + __le16 index; /* index in c.p. array in that sector*/ + __le16 unknown; /* some unknown value; usually 0; + 2 in Japanese version */ + } array[31]; /* unknown length */ +}; + +/* blocks pointed to by code_page_directory */ + +#define CP_DATA_MAGIC 0x894521f7 + +struct code_page_data +{ + __le32 magic; /* 8945 21f7 */ + __le32 n_used; /* # elements used in c_p_data[] */ + __le32 bounds[3]; /* looks a bit like + (beg1,end1), (beg2,end2) + one byte each */ + __le16 offs[3]; /* offsets from start of sector + to start of c_p_data[ix] */ + struct { + __le16 ix; /* index */ + __le16 code_page_number; /* code page number */ + __le16 unknown; /* the same as in cp directory */ + u8 map[128]; /* upcase table for chars 80..ff */ + __le16 zero2; + } code_page[3]; + u8 incognita[78]; +}; + + +/* Free space bitmaps are 4 sectors long, which is 16384 bits. + 16384 sectors is 8 meg, and each 8 meg band has a 4-sector bitmap. + Bit order in the maps is little-endian. 0 means taken, 1 means free. + + Bit map sectors are marked allocated in the bit maps, and so are sectors + off the end of the partition. + + Band 0 is sectors 0-3fff, its map is in sectors 18-1b. + Band 1 is 4000-7fff, its map is in 7ffc-7fff. + Band 2 is 8000-ffff, its map is in 8000-8003. + The remaining bands have maps in their first (even) or last (odd) 4 sectors + -- if the last, partial, band is odd its map is in its last 4 sectors. + + The bitmap locations are given in a table pointed to by the super block. + No doubt they aren't constrained to be at 18, 7ffc, 8000, ...; that is + just where they usually are. + + The "directory band" is a bunch of sectors preallocated for dnodes. + It has a 4-sector free space bitmap of its own. Each bit in the map + corresponds to one 4-sector dnode, bit 0 of the map corresponding to + the first 4 sectors of the directory band. The entire band is marked + allocated in the main bitmap. The super block gives the locations + of the directory band and its bitmap. ("band" doesn't mean it is + 8 meg long; it isn't.) */ + + +/* dnode: directory. 4 sectors long */ + +/* A directory is a tree of dnodes. The fnode for a directory + contains one pointer, to the root dnode of the tree. The fnode + never moves, the dnodes do the B-tree thing, splitting and merging + as files are added and removed. */ + +#define DNODE_MAGIC 0x77e40aae + +struct dnode { + __le32 magic; /* 77e4 0aae */ + __le32 first_free; /* offset from start of dnode to + first free dir entry */ +#ifdef __LITTLE_ENDIAN + u8 root_dnode: 1; /* Is it root dnode? */ + u8 increment_me: 7; /* some kind of activity counter? */ + /* Neither HPFS.IFS nor CHKDSK cares + if you change this word */ +#else + u8 increment_me: 7; /* some kind of activity counter? */ + /* Neither HPFS.IFS nor CHKDSK cares + if you change this word */ + u8 root_dnode: 1; /* Is it root dnode? */ +#endif + u8 increment_me2[3]; + __le32 up; /* (root dnode) directory's fnode + (nonroot) parent dnode */ + __le32 self; /* pointer to this dnode */ + u8 dirent[2028]; /* one or more dirents */ +}; + +struct hpfs_dirent { + __le16 length; /* offset to next dirent */ + +#ifdef __LITTLE_ENDIAN + u8 first: 1; /* set on phony ^A^A (".") entry */ + u8 has_acl: 1; + u8 down: 1; /* down pointer present (after name) */ + u8 last: 1; /* set on phony \377 entry */ + u8 has_ea: 1; /* entry has EA */ + u8 has_xtd_perm: 1; /* has extended perm list (???) */ + u8 has_explicit_acl: 1; + u8 has_needea: 1; /* ?? some EA has NEEDEA set + I have no idea why this is + interesting in a dir entry */ +#else + u8 has_needea: 1; /* ?? some EA has NEEDEA set + I have no idea why this is + interesting in a dir entry */ + u8 has_explicit_acl: 1; + u8 has_xtd_perm: 1; /* has extended perm list (???) */ + u8 has_ea: 1; /* entry has EA */ + u8 last: 1; /* set on phony \377 entry */ + u8 down: 1; /* down pointer present (after name) */ + u8 has_acl: 1; + u8 first: 1; /* set on phony ^A^A (".") entry */ +#endif + +#ifdef __LITTLE_ENDIAN + u8 read_only: 1; /* dos attrib */ + u8 hidden: 1; /* dos attrib */ + u8 system: 1; /* dos attrib */ + u8 flag11: 1; /* would be volume label dos attrib */ + u8 directory: 1; /* dos attrib */ + u8 archive: 1; /* dos attrib */ + u8 not_8x3: 1; /* name is not 8.3 */ + u8 flag15: 1; +#else + u8 flag15: 1; + u8 not_8x3: 1; /* name is not 8.3 */ + u8 archive: 1; /* dos attrib */ + u8 directory: 1; /* dos attrib */ + u8 flag11: 1; /* would be volume label dos attrib */ + u8 system: 1; /* dos attrib */ + u8 hidden: 1; /* dos attrib */ + u8 read_only: 1; /* dos attrib */ +#endif + + __le32 fnode; /* fnode giving allocation info */ + __le32 write_date; /* mtime */ + __le32 file_size; /* file length, bytes */ + __le32 read_date; /* atime */ + __le32 creation_date; /* ctime */ + __le32 ea_size; /* total EA length, bytes */ + u8 no_of_acls; /* number of ACL's (low 3 bits) */ + u8 ix; /* code page index (of filename), see + struct code_page_data */ + u8 namelen, name[1]; /* file name */ + /* dnode_secno down; btree down pointer, if present, + follows name on next word boundary, or maybe it + precedes next dirent, which is on a word boundary. */ +}; + + +/* B+ tree: allocation info in fnodes and anodes */ + +/* dnodes point to fnodes which are responsible for listing the sectors + assigned to the file. This is done with trees of (length,address) + pairs. (Actually triples, of (length, file-address, disk-address) + which can represent holes. Find out if HPFS does that.) + At any rate, fnodes contain a small tree; if subtrees are needed + they occupy essentially a full block in anodes. A leaf-level tree node + has 3-word entries giving sector runs, a non-leaf node has 2-word + entries giving subtree pointers. A flag in the header says which. */ + +struct bplus_leaf_node +{ + __le32 file_secno; /* first file sector in extent */ + __le32 length; /* length, sectors */ + __le32 disk_secno; /* first corresponding disk sector */ +}; + +struct bplus_internal_node +{ + __le32 file_secno; /* subtree maps sectors < this */ + __le32 down; /* pointer to subtree */ +}; + +enum { + BP_hbff = 1, + BP_fnode_parent = 0x20, + BP_binary_search = 0x40, + BP_internal = 0x80 +}; +struct bplus_header +{ + u8 flags; /* bit 0 - high bit of first free entry offset + bit 5 - we're pointed to by an fnode, + the data btree or some ea or the + main ea bootage pointer ea_secno + bit 6 - suggest binary search (unused) + bit 7 - 1 -> (internal) tree of anodes + 0 -> (leaf) list of extents */ + u8 fill[3]; + u8 n_free_nodes; /* free nodes in following array */ + u8 n_used_nodes; /* used nodes in following array */ + __le16 first_free; /* offset from start of header to + first free node in array */ + union { + struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving + subtree pointers */ + struct bplus_leaf_node external[0]; /* (external) 3-word entries giving + sector runs */ + } u; +}; + +static inline bool bp_internal(struct bplus_header *bp) +{ + return bp->flags & BP_internal; +} + +static inline bool bp_fnode_parent(struct bplus_header *bp) +{ + return bp->flags & BP_fnode_parent; +} + +/* fnode: root of allocation b+ tree, and EA's */ + +/* Every file and every directory has one fnode, pointed to by the directory + entry and pointing to the file's sectors or directory's root dnode. EA's + are also stored here, and there are said to be ACL's somewhere here too. */ + +#define FNODE_MAGIC 0xf7e40aae + +enum {FNODE_anode = cpu_to_le16(2), FNODE_dir = cpu_to_le16(256)}; +struct fnode +{ + __le32 magic; /* f7e4 0aae */ + __le32 zero1[2]; /* read history */ + u8 len, name[15]; /* true length, truncated name */ + __le32 up; /* pointer to file's directory fnode */ + __le32 acl_size_l; + __le32 acl_secno; + __le16 acl_size_s; + u8 acl_anode; + u8 zero2; /* history bit count */ + __le32 ea_size_l; /* length of disk-resident ea's */ + __le32 ea_secno; /* first sector of disk-resident ea's*/ + __le16 ea_size_s; /* length of fnode-resident ea's */ + + __le16 flags; /* bit 1 set -> ea_secno is an anode */ + /* bit 8 set -> directory. first & only extent + points to dnode. */ + struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */ + union { + struct bplus_leaf_node external[8]; + struct bplus_internal_node internal[12]; + } u; + + __le32 file_size; /* file length, bytes */ + __le32 n_needea; /* number of EA's with NEEDEA set */ + u8 user_id[16]; /* unused */ + __le16 ea_offs; /* offset from start of fnode + to first fnode-resident ea */ + u8 dasd_limit_treshhold; + u8 dasd_limit_delta; + __le32 dasd_limit; + __le32 dasd_usage; + u8 ea[316]; /* zero or more EA's, packed together + with no alignment padding. + (Do not use this name, get here + via fnode + ea_offs. I think.) */ +}; + +static inline bool fnode_in_anode(struct fnode *p) +{ + return (p->flags & FNODE_anode) != 0; +} + +static inline bool fnode_is_dir(struct fnode *p) +{ + return (p->flags & FNODE_dir) != 0; +} + + +/* anode: 99.44% pure allocation tree */ + +#define ANODE_MAGIC 0x37e40aae + +struct anode +{ + __le32 magic; /* 37e4 0aae */ + __le32 self; /* pointer to this anode */ + __le32 up; /* parent anode or fnode */ + + struct bplus_header btree; /* b+tree, 40 extents or 60 subtrees */ + union { + struct bplus_leaf_node external[40]; + struct bplus_internal_node internal[60]; + } u; + + __le32 fill[3]; /* unused */ +}; + + +/* extended attributes. + + A file's EA info is stored as a list of (name,value) pairs. It is + usually in the fnode, but (if it's large) it is moved to a single + sector run outside the fnode, or to multiple runs with an anode tree + that points to them. + + The value of a single EA is stored along with the name, or (if large) + it is moved to a single sector run, or multiple runs pointed to by an + anode tree, pointed to by the value field of the (name,value) pair. + + Flags in the EA tell whether the value is immediate, in a single sector + run, or in multiple runs. Flags in the fnode tell whether the EA list + is immediate, in a single run, or in multiple runs. */ + +enum {EA_indirect = 1, EA_anode = 2, EA_needea = 128 }; +struct extended_attribute +{ + u8 flags; /* bit 0 set -> value gives sector number + where real value starts */ + /* bit 1 set -> sector is an anode + that points to fragmented value */ + /* bit 7 set -> required ea */ + u8 namelen; /* length of name, bytes */ + u8 valuelen_lo; /* length of value, bytes */ + u8 valuelen_hi; /* length of value, bytes */ + u8 name[]; + /* + u8 name[namelen]; ascii attrib name + u8 nul; terminating '\0', not counted + u8 value[valuelen]; value, arbitrary + if this.flags & 1, valuelen is 8 and the value is + u32 length; real length of value, bytes + secno secno; sector address where it starts + if this.anode, the above sector number is the root of an anode tree + which points to the value. + */ +}; + +static inline bool ea_indirect(struct extended_attribute *ea) +{ + return ea->flags & EA_indirect; +} + +static inline bool ea_in_anode(struct extended_attribute *ea) +{ + return ea->flags & EA_anode; +} + +/* + Local Variables: + comment-column: 40 + End: +*/ diff --git a/kernel/fs/hpfs/hpfs_fn.h b/kernel/fs/hpfs/hpfs_fn.h new file mode 100644 index 000000000..b63b75fa0 --- /dev/null +++ b/kernel/fs/hpfs/hpfs_fn.h @@ -0,0 +1,363 @@ +/* + * linux/fs/hpfs/hpfs_fn.h + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * function headers + */ + +//#define DBG +//#define DEBUG_LOCKS +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +#include "hpfs.h" + +#define EIOERROR EIO +#define EFSERROR EPERM +#define EMEMERROR ENOMEM + +#define ANODE_ALLOC_FWD 512 +#define FNODE_ALLOC_FWD 0 +#define ALLOC_FWD_MIN 16 +#define ALLOC_FWD_MAX 128 +#define ALLOC_M 1 +#define FNODE_RD_AHEAD 16 +#define ANODE_RD_AHEAD 0 +#define DNODE_RD_AHEAD 72 +#define COUNT_RD_AHEAD 62 + +#define FREE_DNODES_ADD 58 +#define FREE_DNODES_DEL 29 + +#define CHKCOND(x,y) if (!(x)) printk y + +struct hpfs_inode_info { + loff_t mmu_private; + ino_t i_parent_dir; /* (directories) gives fnode of parent dir */ + unsigned i_dno; /* (directories) root dnode */ + unsigned i_dpos; /* (directories) temp for readdir */ + unsigned i_dsubdno; /* (directories) temp for readdir */ + unsigned i_file_sec; /* (files) minimalist cache of alloc info */ + unsigned i_disk_sec; /* (files) minimalist cache of alloc info */ + unsigned i_n_secs; /* (files) minimalist cache of alloc info */ + unsigned i_ea_size; /* size of extended attributes */ + unsigned i_ea_mode : 1; /* file's permission is stored in ea */ + unsigned i_ea_uid : 1; /* file's uid is stored in ea */ + unsigned i_ea_gid : 1; /* file's gid is stored in ea */ + unsigned i_dirty : 1; + loff_t **i_rddir_off; + struct inode vfs_inode; +}; + +struct hpfs_sb_info { + struct mutex hpfs_mutex; /* global hpfs lock */ + ino_t sb_root; /* inode number of root dir */ + unsigned sb_fs_size; /* file system size, sectors */ + unsigned sb_bitmaps; /* sector number of bitmap list */ + unsigned sb_dirband_start; /* directory band start sector */ + unsigned sb_dirband_size; /* directory band size, dnodes */ + unsigned sb_dmap; /* sector number of dnode bit map */ + unsigned sb_n_free; /* free blocks for statfs, or -1 */ + unsigned sb_n_free_dnodes; /* free dnodes for statfs, or -1 */ + kuid_t sb_uid; /* uid from mount options */ + kgid_t sb_gid; /* gid from mount options */ + umode_t sb_mode; /* mode from mount options */ + unsigned sb_eas : 2; /* eas: 0-ignore, 1-ro, 2-rw */ + unsigned sb_err : 2; /* on errs: 0-cont, 1-ro, 2-panic */ + unsigned sb_chk : 2; /* checks: 0-no, 1-normal, 2-strict */ + unsigned sb_lowercase : 1; /* downcase filenames hackery */ + unsigned sb_was_error : 1; /* there was an error, set dirty flag */ + unsigned sb_chkdsk : 2; /* chkdsk: 0-no, 1-on errs, 2-allways */ + unsigned char *sb_cp_table; /* code page tables: */ + /* 128 bytes uppercasing table & */ + /* 128 bytes lowercasing table */ + __le32 *sb_bmp_dir; /* main bitmap directory */ + unsigned sb_c_bitmap; /* current bitmap */ + unsigned sb_max_fwd_alloc; /* max forwad allocation */ + int sb_timeshift; + struct rcu_head rcu; +}; + +/* Four 512-byte buffers and the 2k block obtained by concatenating them */ + +struct quad_buffer_head { + struct buffer_head *bh[4]; + void *data; +}; + +/* The b-tree down pointer from a dir entry */ + +static inline dnode_secno de_down_pointer (struct hpfs_dirent *de) +{ + CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n")); + return le32_to_cpu(*(__le32 *) ((void *) de + le16_to_cpu(de->length) - 4)); +} + +/* The first dir entry in a dnode */ + +static inline struct hpfs_dirent *dnode_first_de (struct dnode *dnode) +{ + return (void *) dnode->dirent; +} + +/* The end+1 of the dir entries */ + +static inline struct hpfs_dirent *dnode_end_de (struct dnode *dnode) +{ + CHKCOND(le32_to_cpu(dnode->first_free)>=0x14 && le32_to_cpu(dnode->first_free)<=0xa00,("HPFS: dnode_end_de: dnode->first_free = %x\n",(unsigned)le32_to_cpu(dnode->first_free))); + return (void *) dnode + le32_to_cpu(dnode->first_free); +} + +/* The dir entry after dir entry de */ + +static inline struct hpfs_dirent *de_next_de (struct hpfs_dirent *de) +{ + CHKCOND(le16_to_cpu(de->length)>=0x20 && le16_to_cpu(de->length)<0x800,("HPFS: de_next_de: de->length = %x\n",(unsigned)le16_to_cpu(de->length))); + return (void *) de + le16_to_cpu(de->length); +} + +static inline struct extended_attribute *fnode_ea(struct fnode *fnode) +{ + return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s)); +} + +static inline struct extended_attribute *fnode_end_ea(struct fnode *fnode) +{ + return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s)); +} + +static unsigned ea_valuelen(struct extended_attribute *ea) +{ + return ea->valuelen_lo + 256 * ea->valuelen_hi; +} + +static inline struct extended_attribute *next_ea(struct extended_attribute *ea) +{ + return (struct extended_attribute *)((char *)ea + 5 + ea->namelen + ea_valuelen(ea)); +} + +static inline secno ea_sec(struct extended_attribute *ea) +{ + return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 9 + ea->namelen))); +} + +static inline secno ea_len(struct extended_attribute *ea) +{ + return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 5 + ea->namelen))); +} + +static inline char *ea_data(struct extended_attribute *ea) +{ + return (char *)((char *)ea + 5 + ea->namelen); +} + +static inline unsigned de_size(int namelen, secno down_ptr) +{ + return ((0x1f + namelen + 3) & ~3) + (down_ptr ? 4 : 0); +} + +static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src) +{ + int a; + int n; + if (!dst || !src) return; + a = dst->down; + n = dst->not_8x3; + memcpy((char *)dst + 2, (char *)src + 2, 28); + dst->down = a; + dst->not_8x3 = n; +} + +static inline unsigned tstbits(__le32 *bmp, unsigned b, unsigned n) +{ + int i; + if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n; + if (!((le32_to_cpu(bmp[(b & 0x3fff) >> 5]) >> (b & 0x1f)) & 1)) return 1; + for (i = 1; i < n; i++) + if (!((le32_to_cpu(bmp[((b+i) & 0x3fff) >> 5]) >> ((b+i) & 0x1f)) & 1)) + return i + 1; + return 0; +} + +/* alloc.c */ + +int hpfs_chk_sectors(struct super_block *, secno, int, char *); +secno hpfs_alloc_sector(struct super_block *, secno, unsigned, int); +int hpfs_alloc_if_possible(struct super_block *, secno); +void hpfs_free_sectors(struct super_block *, secno, unsigned); +int hpfs_check_free_dnodes(struct super_block *, int); +void hpfs_free_dnode(struct super_block *, secno); +struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *); +struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **); +struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **); + +/* anode.c */ + +secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_header *, unsigned, struct buffer_head *); +secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); +void hpfs_remove_btree(struct super_block *, struct bplus_header *); +int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); +int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *); +void hpfs_ea_remove(struct super_block *, secno, int, unsigned); +void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); +void hpfs_remove_fnode(struct super_block *, fnode_secno fno); + +/* buffer.c */ + +void hpfs_prefetch_sectors(struct super_block *, unsigned, int); +void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); +void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); +void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); +void *hpfs_get_4sectors(struct super_block *, unsigned, struct quad_buffer_head *); +void hpfs_brelse4(struct quad_buffer_head *); +void hpfs_mark_4buffers_dirty(struct quad_buffer_head *); + +/* dentry.c */ + +extern const struct dentry_operations hpfs_dentry_operations; + +/* dir.c */ + +struct dentry *hpfs_lookup(struct inode *, struct dentry *, unsigned int); +extern const struct file_operations hpfs_dir_ops; + +/* dnode.c */ + +void hpfs_add_pos(struct inode *, loff_t *); +void hpfs_del_pos(struct inode *, loff_t *); +struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, + const unsigned char *, unsigned, secno); +int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned, + struct hpfs_dirent *); +int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); +void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); +dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); +struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); +struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, + const unsigned char *, unsigned, dnode_secno *, + struct quad_buffer_head *); +void hpfs_remove_dtree(struct super_block *, dnode_secno); +struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); + +/* ea.c */ + +void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); +int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); +char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); +void hpfs_set_ea(struct inode *, struct fnode *, const char *, + const char *, int); + +/* file.c */ + +int hpfs_file_fsync(struct file *, loff_t, loff_t, int); +void hpfs_truncate(struct inode *); +extern const struct file_operations hpfs_file_ops; +extern const struct inode_operations hpfs_file_iops; +extern const struct address_space_operations hpfs_aops; + +/* inode.c */ + +void hpfs_init_inode(struct inode *); +void hpfs_read_inode(struct inode *); +void hpfs_write_inode(struct inode *); +void hpfs_write_inode_nolock(struct inode *); +int hpfs_setattr(struct dentry *, struct iattr *); +void hpfs_write_if_changed(struct inode *); +void hpfs_evict_inode(struct inode *); + +/* map.c */ + +__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); +__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); +void hpfs_prefetch_bitmap(struct super_block *, unsigned); +unsigned char *hpfs_load_code_page(struct super_block *, secno); +__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); +struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); +struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); +struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *); +dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino); + +/* name.c */ + +unsigned char hpfs_upcase(unsigned char *, unsigned char); +int hpfs_chk_name(const unsigned char *, unsigned *); +unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); +int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned, + const unsigned char *, unsigned, int); +int hpfs_is_name_long(const unsigned char *, unsigned); +void hpfs_adjust_length(const unsigned char *, unsigned *); + +/* namei.c */ + +extern const struct inode_operations hpfs_dir_iops; +extern const struct address_space_operations hpfs_symlink_aops; + +static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) +{ + return list_entry(inode, struct hpfs_inode_info, vfs_inode); +} + +static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* super.c */ + +__printf(2, 3) +void hpfs_error(struct super_block *, const char *, ...); +int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); +unsigned hpfs_get_free_dnodes(struct super_block *); + +/* + * local time (HPFS) to GMT (Unix) + */ + +static inline time_t local_to_gmt(struct super_block *s, time32_t t) +{ + extern struct timezone sys_tz; + return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift; +} + +static inline time32_t gmt_to_local(struct super_block *s, time_t t) +{ + extern struct timezone sys_tz; + return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; +} + +/* + * Locking: + * + * hpfs_lock() locks the whole filesystem. It must be taken + * on any method called by the VFS. + * + * We don't do any per-file locking anymore, it is hard to + * review and HPFS is not performance-sensitive anyway. + */ +static inline void hpfs_lock(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + mutex_lock(&sbi->hpfs_mutex); +} + +static inline void hpfs_unlock(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + mutex_unlock(&sbi->hpfs_mutex); +} + +static inline void hpfs_lock_assert(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + WARN_ON(!mutex_is_locked(&sbi->hpfs_mutex)); +} diff --git a/kernel/fs/hpfs/inode.c b/kernel/fs/hpfs/inode.c new file mode 100644 index 000000000..933c73780 --- /dev/null +++ b/kernel/fs/hpfs/inode.c @@ -0,0 +1,315 @@ +/* + * linux/fs/hpfs/inode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * inode VFS functions + */ + +#include +#include +#include "hpfs_fn.h" + +void hpfs_init_inode(struct inode *i) +{ + struct super_block *sb = i->i_sb; + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + + i->i_uid = hpfs_sb(sb)->sb_uid; + i->i_gid = hpfs_sb(sb)->sb_gid; + i->i_mode = hpfs_sb(sb)->sb_mode; + i->i_size = -1; + i->i_blocks = -1; + + hpfs_inode->i_dno = 0; + hpfs_inode->i_n_secs = 0; + hpfs_inode->i_file_sec = 0; + hpfs_inode->i_disk_sec = 0; + hpfs_inode->i_dpos = 0; + hpfs_inode->i_dsubdno = 0; + hpfs_inode->i_ea_mode = 0; + hpfs_inode->i_ea_uid = 0; + hpfs_inode->i_ea_gid = 0; + hpfs_inode->i_ea_size = 0; + + hpfs_inode->i_rddir_off = NULL; + hpfs_inode->i_dirty = 0; + + i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0; + i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0; + i->i_atime.tv_sec = i->i_atime.tv_nsec = 0; +} + +void hpfs_read_inode(struct inode *i) +{ + struct buffer_head *bh; + struct fnode *fnode; + struct super_block *sb = i->i_sb; + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + void *ea; + int ea_size; + + if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { + /*i->i_mode |= S_IFREG; + i->i_mode &= ~0111; + i->i_op = &hpfs_file_iops; + i->i_fop = &hpfs_file_ops; + clear_nlink(i);*/ + make_bad_inode(i); + return; + } + if (hpfs_sb(i->i_sb)->sb_eas) { + if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) { + if (ea_size == 2) { + i_uid_write(i, le16_to_cpu(*(__le16*)ea)); + hpfs_inode->i_ea_uid = 1; + } + kfree(ea); + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) { + if (ea_size == 2) { + i_gid_write(i, le16_to_cpu(*(__le16*)ea)); + hpfs_inode->i_ea_gid = 1; + } + kfree(ea); + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "SYMLINK", &ea_size))) { + kfree(ea); + i->i_mode = S_IFLNK | 0777; + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &hpfs_symlink_aops; + set_nlink(i, 1); + i->i_size = ea_size; + i->i_blocks = 1; + brelse(bh); + return; + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "MODE", &ea_size))) { + int rdev = 0; + umode_t mode = hpfs_sb(sb)->sb_mode; + if (ea_size == 2) { + mode = le16_to_cpu(*(__le16*)ea); + hpfs_inode->i_ea_mode = 1; + } + kfree(ea); + i->i_mode = mode; + if (S_ISBLK(mode) || S_ISCHR(mode)) { + if ((ea = hpfs_get_ea(i->i_sb, fnode, "DEV", &ea_size))) { + if (ea_size == 4) + rdev = le32_to_cpu(*(__le32*)ea); + kfree(ea); + } + } + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { + brelse(bh); + set_nlink(i, 1); + i->i_size = 0; + i->i_blocks = 1; + init_special_inode(i, mode, + new_decode_dev(rdev)); + return; + } + } + } + if (fnode_is_dir(fnode)) { + int n_dnodes, n_subdirs; + i->i_mode |= S_IFDIR; + i->i_op = &hpfs_dir_iops; + i->i_fop = &hpfs_dir_ops; + hpfs_inode->i_parent_dir = le32_to_cpu(fnode->up); + hpfs_inode->i_dno = le32_to_cpu(fnode->u.external[0].disk_secno); + if (hpfs_sb(sb)->sb_chk >= 2) { + struct buffer_head *bh0; + if (hpfs_map_fnode(sb, hpfs_inode->i_parent_dir, &bh0)) brelse(bh0); + } + n_dnodes = 0; n_subdirs = 0; + hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); + i->i_blocks = 4 * n_dnodes; + i->i_size = 2048 * n_dnodes; + set_nlink(i, 2 + n_subdirs); + } else { + i->i_mode |= S_IFREG; + if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; + i->i_op = &hpfs_file_iops; + i->i_fop = &hpfs_file_ops; + set_nlink(i, 1); + i->i_size = le32_to_cpu(fnode->file_size); + i->i_blocks = ((i->i_size + 511) >> 9) + 1; + i->i_data.a_ops = &hpfs_aops; + hpfs_i(i)->mmu_private = i->i_size; + } + brelse(bh); +} + +static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + /*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) { + Some unknown structures like ACL may be in fnode, + we'd better not overwrite them + hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 structures", i->i_ino); + } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { + __le32 ea; + if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { + ea = cpu_to_le32(i_uid_read(i)); + hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2); + hpfs_inode->i_ea_uid = 1; + } + if (!gid_eq(i->i_gid, hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) { + ea = cpu_to_le32(i_gid_read(i)); + hpfs_set_ea(i, fnode, "GID", (char *)&ea, 2); + hpfs_inode->i_ea_gid = 1; + } + if (!S_ISLNK(i->i_mode)) + if ((i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0 : 0111)) + | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG)) + && i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333)) + | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) { + ea = cpu_to_le32(i->i_mode); + /* sick, but legal */ + hpfs_set_ea(i, fnode, "MODE", (char *)&ea, 2); + hpfs_inode->i_ea_mode = 1; + } + if (S_ISBLK(i->i_mode) || S_ISCHR(i->i_mode)) { + ea = cpu_to_le32(new_encode_dev(i->i_rdev)); + hpfs_set_ea(i, fnode, "DEV", (char *)&ea, 4); + } + } +} + +void hpfs_write_inode(struct inode *i) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct inode *parent; + if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; + if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (*hpfs_inode->i_rddir_off) + pr_err("write_inode: some position still there\n"); + kfree(hpfs_inode->i_rddir_off); + hpfs_inode->i_rddir_off = NULL; + } + if (!i->i_nlink) { + return; + } + parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir); + if (parent) { + hpfs_inode->i_dirty = 0; + if (parent->i_state & I_NEW) { + hpfs_init_inode(parent); + hpfs_read_inode(parent); + unlock_new_inode(parent); + } + hpfs_write_inode_nolock(i); + iput(parent); + } +} + +void hpfs_write_inode_nolock(struct inode *i) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct buffer_head *bh; + struct fnode *fnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; + if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) return; + if (i->i_ino != hpfs_sb(i->i_sb)->sb_root && i->i_nlink) { + if (!(de = map_fnode_dirent(i->i_sb, i->i_ino, fnode, &qbh))) { + brelse(bh); + return; + } + } else de = NULL; + if (S_ISREG(i->i_mode)) { + fnode->file_size = cpu_to_le32(i->i_size); + if (de) de->file_size = cpu_to_le32(i->i_size); + } else if (S_ISDIR(i->i_mode)) { + fnode->file_size = cpu_to_le32(0); + if (de) de->file_size = cpu_to_le32(0); + } + hpfs_write_inode_ea(i, fnode); + if (de) { + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->read_only = !(i->i_mode & 0222); + de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + } + if (S_ISDIR(i->i_mode)) { + if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) { + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->read_only = !(i->i_mode & 0222); + de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0); + de->file_size = cpu_to_le32(0); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + } else + hpfs_error(i->i_sb, + "directory %08lx doesn't have '.' entry", + (unsigned long)i->i_ino); + } + mark_buffer_dirty(bh); + brelse(bh); +} + +int hpfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error = -EINVAL; + + hpfs_lock(inode->i_sb); + if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) + goto out_unlock; + if ((attr->ia_valid & ATTR_UID) && + from_kuid(&init_user_ns, attr->ia_uid) >= 0x10000) + goto out_unlock; + if ((attr->ia_valid & ATTR_GID) && + from_kgid(&init_user_ns, attr->ia_gid) >= 0x10000) + goto out_unlock; + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) + goto out_unlock; + + error = inode_change_ok(inode, attr); + if (error) + goto out_unlock; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + goto out_unlock; + + truncate_setsize(inode, attr->ia_size); + hpfs_truncate(inode); + } + + setattr_copy(inode, attr); + + hpfs_write_inode(inode); + + out_unlock: + hpfs_unlock(inode->i_sb); + return error; +} + +void hpfs_write_if_changed(struct inode *inode) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + + if (hpfs_inode->i_dirty) + hpfs_write_inode(inode); +} + +void hpfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (!inode->i_nlink) { + hpfs_lock(inode->i_sb); + hpfs_remove_fnode(inode->i_sb, inode->i_ino); + hpfs_unlock(inode->i_sb); + } +} diff --git a/kernel/fs/hpfs/map.c b/kernel/fs/hpfs/map.c new file mode 100644 index 000000000..442770edc --- /dev/null +++ b/kernel/fs/hpfs/map.c @@ -0,0 +1,308 @@ +/* + * linux/fs/hpfs/map.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * mapping structures to memory with some minimal checks + */ + +#include "hpfs_fn.h" + +__le32 *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh) +{ + return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0); +} + +__le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, + struct quad_buffer_head *qbh, char *id) +{ + secno sec; + __le32 *ret; + unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) { + hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); + return NULL; + } + sec = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); + if (!sec || sec > hpfs_sb(s)->sb_fs_size-4) { + hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); + return NULL; + } + ret = hpfs_map_4sectors(s, sec, qbh, 4); + if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1); + return ret; +} + +void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block) +{ + unsigned to_prefetch, next_prefetch; + unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + if (unlikely(bmp_block >= n_bands)) + return; + to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); + if (unlikely(bmp_block + 1 >= n_bands)) + next_prefetch = 0; + else + next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]); + hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch)); +} + +/* + * Load first code page into kernel memory, return pointer to 256-byte array, + * first 128 bytes are uppercasing table for chars 128-255, next 128 bytes are + * lowercasing table + */ + +unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) +{ + struct buffer_head *bh; + secno cpds; + unsigned cpi; + unsigned char *ptr; + unsigned char *cp_table; + int i; + struct code_page_data *cpd; + struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0); + if (!cp) return NULL; + if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) { + pr_err("Code page directory magic doesn't match (magic = %08x)\n", + le32_to_cpu(cp->magic)); + brelse(bh); + return NULL; + } + if (!le32_to_cpu(cp->n_code_pages)) { + pr_err("n_code_pages == 0\n"); + brelse(bh); + return NULL; + } + cpds = le32_to_cpu(cp->array[0].code_page_data); + cpi = le16_to_cpu(cp->array[0].index); + brelse(bh); + + if (cpi >= 3) { + pr_err("Code page index out of array\n"); + return NULL; + } + + if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL; + if (le16_to_cpu(cpd->offs[cpi]) > 0x178) { + pr_err("Code page index out of sector\n"); + brelse(bh); + return NULL; + } + ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6; + if (!(cp_table = kmalloc(256, GFP_KERNEL))) { + pr_err("out of memory for code page table\n"); + brelse(bh); + return NULL; + } + memcpy(cp_table, ptr, 128); + brelse(bh); + + /* Try to build lowercasing table from uppercasing one */ + + for (i=128; i<256; i++) cp_table[i]=i; + for (i=128; i<256; i++) if (cp_table[i-128]!=i && cp_table[i-128]>=128) + cp_table[cp_table[i-128]] = i; + + return cp_table; +} + +__le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) +{ + struct buffer_head *bh; + int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21; + int i; + __le32 *b; + if (!(b = kmalloc(n * 512, GFP_KERNEL))) { + pr_err("can't allocate memory for bitmap directory\n"); + return NULL; + } + for (i=0;isb_chk) if (hpfs_chk_sectors(s, ino, 1, "fnode")) { + return NULL; + } + if ((fnode = hpfs_map_sector(s, ino, bhp, FNODE_RD_AHEAD))) { + if (hpfs_sb(s)->sb_chk) { + struct extended_attribute *ea; + struct extended_attribute *ea_end; + if (le32_to_cpu(fnode->magic) != FNODE_MAGIC) { + hpfs_error(s, "bad magic on fnode %08lx", + (unsigned long)ino); + goto bail; + } + if (!fnode_is_dir(fnode)) { + if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes != + (bp_internal(&fnode->btree) ? 12 : 8)) { + hpfs_error(s, + "bad number of nodes in fnode %08lx", + (unsigned long)ino); + goto bail; + } + if (le16_to_cpu(fnode->btree.first_free) != + 8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) { + hpfs_error(s, + "bad first_free pointer in fnode %08lx", + (unsigned long)ino); + goto bail; + } + } + if (le16_to_cpu(fnode->ea_size_s) && (le16_to_cpu(fnode->ea_offs) < 0xc4 || + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200)) { + hpfs_error(s, + "bad EA info in fnode %08lx: ea_offs == %04x ea_size_s == %04x", + (unsigned long)ino, + le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); + goto bail; + } + ea = fnode_ea(fnode); + ea_end = fnode_end_ea(fnode); + while (ea != ea_end) { + if (ea > ea_end) { + hpfs_error(s, "bad EA in fnode %08lx", + (unsigned long)ino); + goto bail; + } + ea = next_ea(ea); + } + } + } + return fnode; + bail: + brelse(*bhp); + return NULL; +} + +struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buffer_head **bhp) +{ + struct anode *anode; + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ano, 1, "anode")) return NULL; + if ((anode = hpfs_map_sector(s, ano, bhp, ANODE_RD_AHEAD))) + if (hpfs_sb(s)->sb_chk) { + if (le32_to_cpu(anode->magic) != ANODE_MAGIC) { + hpfs_error(s, "bad magic on anode %08x", ano); + goto bail; + } + if (le32_to_cpu(anode->self) != ano) { + hpfs_error(s, "self pointer invalid on anode %08x", ano); + goto bail; + } + if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes != + (bp_internal(&anode->btree) ? 60 : 40)) { + hpfs_error(s, "bad number of nodes in anode %08x", ano); + goto bail; + } + if (le16_to_cpu(anode->btree.first_free) != + 8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) { + hpfs_error(s, "bad first_free pointer in anode %08x", ano); + goto bail; + } + } + return anode; + bail: + brelse(*bhp); + return NULL; +} + +/* + * Load dnode to memory and do some checks + */ + +struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, + struct quad_buffer_head *qbh) +{ + struct dnode *dnode; + if (hpfs_sb(s)->sb_chk) { + if (hpfs_chk_sectors(s, secno, 4, "dnode")) return NULL; + if (secno & 3) { + hpfs_error(s, "dnode %08x not byte-aligned", secno); + return NULL; + } + } + if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) + if (hpfs_sb(s)->sb_chk) { + unsigned p, pp = 0; + unsigned char *d = (unsigned char *)dnode; + int b = 0; + if (le32_to_cpu(dnode->magic) != DNODE_MAGIC) { + hpfs_error(s, "bad magic on dnode %08x", secno); + goto bail; + } + if (le32_to_cpu(dnode->self) != secno) + hpfs_error(s, "bad self pointer on dnode %08x self = %08x", secno, le32_to_cpu(dnode->self)); + /* Check dirents - bad dirents would cause infinite + loops or shooting to memory */ + if (le32_to_cpu(dnode->first_free) > 2048) { + hpfs_error(s, "dnode %08x has first_free == %08x", secno, le32_to_cpu(dnode->first_free)); + goto bail; + } + for (p = 20; p < le32_to_cpu(dnode->first_free); p += d[p] + (d[p+1] << 8)) { + struct hpfs_dirent *de = (struct hpfs_dirent *)((char *)dnode + p); + if (le16_to_cpu(de->length) > 292 || (le16_to_cpu(de->length) < 32) || (le16_to_cpu(de->length) & 3) || p + le16_to_cpu(de->length) > 2048) { + hpfs_error(s, "bad dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { + if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; + hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + ok: + if (hpfs_sb(s)->sb_chk >= 2) b |= 1 << de->down; + if (de->down) if (de_down_pointer(de) < 0x10) { + hpfs_error(s, "bad down pointer in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + pp = p; + + } + if (p != le32_to_cpu(dnode->first_free)) { + hpfs_error(s, "size on last dirent does not match first_free; dnode %08x", secno); + goto bail; + } + if (d[pp + 30] != 1 || d[pp + 31] != 255) { + hpfs_error(s, "dnode %08x does not end with \\377 entry", secno); + goto bail; + } + if (b == 3) + pr_err("unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", + secno); + } + return dnode; + bail: + hpfs_brelse4(qbh); + return NULL; +} + +dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino) +{ + struct buffer_head *bh; + struct fnode *fnode; + dnode_secno dno; + + fnode = hpfs_map_fnode(s, ino, &bh); + if (!fnode) + return 0; + + dno = le32_to_cpu(fnode->u.external[0].disk_secno); + brelse(bh); + return dno; +} diff --git a/kernel/fs/hpfs/name.c b/kernel/fs/hpfs/name.c new file mode 100644 index 000000000..b00d396d2 --- /dev/null +++ b/kernel/fs/hpfs/name.c @@ -0,0 +1,113 @@ +/* + * linux/fs/hpfs/name.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * operations with filenames + */ + +#include "hpfs_fn.h" + +static inline int not_allowed_char(unsigned char c) +{ + return c<' ' || c=='"' || c=='*' || c=='/' || c==':' || c=='<' || + c=='>' || c=='?' || c=='\\' || c=='|'; +} + +static inline int no_dos_char(unsigned char c) +{ /* Characters that are allowed in HPFS but not in DOS */ + return c=='+' || c==',' || c==';' || c=='=' || c=='[' || c==']'; +} + +static inline unsigned char upcase(unsigned char *dir, unsigned char a) +{ + if (a<128 || a==255) return a>='a' && a<='z' ? a - 0x20 : a; + if (!dir) return a; + return dir[a-128]; +} + +unsigned char hpfs_upcase(unsigned char *dir, unsigned char a) +{ + return upcase(dir, a); +} + +static inline unsigned char locase(unsigned char *dir, unsigned char a) +{ + if (a<128 || a==255) return a>='A' && a<='Z' ? a + 0x20 : a; + if (!dir) return a; + return dir[a]; +} + +int hpfs_chk_name(const unsigned char *name, unsigned *len) +{ + int i; + if (*len > 254) return -ENAMETOOLONG; + hpfs_adjust_length(name, len); + if (!*len) return -EINVAL; + for (i = 0; i < *len; i++) if (not_allowed_char(name[i])) return -EINVAL; + if (*len == 1) if (name[0] == '.') return -EINVAL; + if (*len == 2) if (name[0] == '.' && name[1] == '.') return -EINVAL; + return 0; +} + +unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from, + unsigned len, int lc, int lng) +{ + unsigned char *to; + int i; + if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { + pr_err("Long name flag mismatch - name "); + for (i = 0; i < len; i++) + pr_cont("%c", from[i]); + pr_cont(" misidentified as %s.\n", lng ? "short" : "long"); + pr_err("It's nothing serious. It could happen because of bug in OS/2.\nSet checks=normal to disable this message.\n"); + } + if (!lc) return from; + if (!(to = kmalloc(len, GFP_KERNEL))) { + pr_err("can't allocate memory for name conversion buffer\n"); + return from; + } + for (i = 0; i < len; i++) to[i] = locase(hpfs_sb(s)->sb_cp_table,from[i]); + return to; +} + +int hpfs_compare_names(struct super_block *s, + const unsigned char *n1, unsigned l1, + const unsigned char *n2, unsigned l2, int last) +{ + unsigned l = l1 < l2 ? l1 : l2; + unsigned i; + if (last) return -1; + for (i = 0; i < l; i++) { + unsigned char c1 = upcase(hpfs_sb(s)->sb_cp_table,n1[i]); + unsigned char c2 = upcase(hpfs_sb(s)->sb_cp_table,n2[i]); + if (c1 < c2) return -1; + if (c1 > c2) return 1; + } + if (l1 < l2) return -1; + if (l1 > l2) return 1; + return 0; +} + +int hpfs_is_name_long(const unsigned char *name, unsigned len) +{ + int i,j; + for (i = 0; i < len && name[i] != '.'; i++) + if (no_dos_char(name[i])) return 1; + if (!i || i > 8) return 1; + if (i == len) return 0; + for (j = i + 1; j < len; j++) + if (name[j] == '.' || no_dos_char(name[i])) return 1; + return j - i > 4; +} + +/* OS/2 clears dots and spaces at the end of file name, so we have to */ + +void hpfs_adjust_length(const unsigned char *name, unsigned *len) +{ + if (!*len) return; + if (*len == 1 && name[0] == '.') return; + if (*len == 2 && name[0] == '.' && name[1] == '.') return; + while (*len && (name[*len - 1] == '.' || name[*len - 1] == ' ')) + (*len)--; +} diff --git a/kernel/fs/hpfs/namei.c b/kernel/fs/hpfs/namei.c new file mode 100644 index 000000000..a0872f239 --- /dev/null +++ b/kernel/fs/hpfs/namei.c @@ -0,0 +1,628 @@ +/* + * linux/fs/hpfs/namei.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * adding & removing files & directories + */ +#include +#include "hpfs_fn.h" + +static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh0; + struct buffer_head *bh; + struct hpfs_dirent *de; + struct fnode *fnode; + struct dnode *dnode; + struct inode *result; + fnode_secno fno; + dnode_secno dno; + int r; + struct hpfs_dirent dee; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + dnode = hpfs_alloc_dnode(dir->i_sb, fno, &dno, &qbh0); + if (!dnode) + goto bail1; + memset(&dee, 0, sizeof dee); + dee.directory = 1; + if (!(mode & 0222)) dee.read_only = 1; + /*dee.archive = 0;*/ + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + result = new_inode(dir->i_sb); + if (!result) + goto bail2; + hpfs_init_inode(result); + result->i_ino = fno; + hpfs_i(result)->i_parent_dir = dir->i_ino; + hpfs_i(result)->i_dno = dno; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_mode |= S_IFDIR; + result->i_op = &hpfs_dir_iops; + result->i_fop = &hpfs_dir_ops; + result->i_blocks = 4; + result->i_size = 2048; + set_nlink(result, 2); + if (dee.read_only) + result->i_mode &= ~0222; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail3; + if (r == -1) { + err = -EEXIST; + goto bail3; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + fnode->flags |= FNODE_dir; + fnode->btree.n_free_nodes = 7; + fnode->btree.n_used_nodes = 1; + fnode->btree.first_free = cpu_to_le16(0x14); + fnode->u.external[0].disk_secno = cpu_to_le32(dno); + fnode->u.external[0].file_secno = cpu_to_le32(-1); + dnode->root_dnode = 1; + dnode->up = cpu_to_le32(fno); + de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0); + de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + if (!(mode & 0222)) de->read_only = 1; + de->first = de->directory = 1; + /*de->hidden = de->system = 0;*/ + de->fnode = cpu_to_le32(fno); + mark_buffer_dirty(bh); + brelse(bh); + hpfs_mark_4buffers_dirty(&qbh0); + hpfs_brelse4(&qbh0); + inc_nlink(dir); + insert_inode_hash(result); + + if (!uid_eq(result->i_uid, current_fsuid()) || + !gid_eq(result->i_gid, current_fsgid()) || + result->i_mode != (mode | S_IFDIR)) { + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_mode = mode | S_IFDIR; + hpfs_write_inode_nolock(result); + } + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; +bail3: + iput(result); +bail2: + hpfs_brelse4(&qbh0); + hpfs_free_dnode(dir->i_sb, dno); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct inode *result = NULL; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + int err; + if ((err = hpfs_chk_name(name, &len))) + return err==-ENOENT ? -EINVAL : err; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + if (!(mode & 0222)) dee.read_only = 1; + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + + hpfs_init_inode(result); + result->i_ino = fno; + result->i_mode |= S_IFREG; + result->i_mode &= ~0111; + result->i_op = &hpfs_file_iops; + result->i_fop = &hpfs_file_ops; + set_nlink(result, 1); + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + if (dee.read_only) + result->i_mode &= ~0222; + result->i_blocks = 1; + result->i_size = 0; + result->i_data.a_ops = &hpfs_aops; + hpfs_i(result)->mmu_private = 0; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + insert_inode_hash(result); + + if (!uid_eq(result->i_uid, current_fsuid()) || + !gid_eq(result->i_gid, current_fsgid()) || + result->i_mode != (mode | S_IFREG)) { + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_mode = mode | S_IFREG; + hpfs_write_inode_nolock(result); + } + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; + +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + struct inode *result = NULL; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; + if (!new_valid_dev(rdev)) + return -EINVAL; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + if (!(mode & 0222)) dee.read_only = 1; + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + + hpfs_init_inode(result); + result->i_ino = fno; + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + set_nlink(result, 1); + result->i_size = 0; + result->i_blocks = 1; + init_special_inode(result, mode, rdev); + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + mark_buffer_dirty(bh); + + insert_inode_hash(result); + + hpfs_write_inode_nolock(result); + d_instantiate(dentry, result); + brelse(bh); + hpfs_unlock(dir->i_sb); + return 0; +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + struct inode *result; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + hpfs_lock(dir->i_sb); + if (hpfs_sb(dir->i_sb)->sb_eas < 2) { + hpfs_unlock(dir->i_sb); + return -EPERM; + } + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + result->i_ino = fno; + hpfs_init_inode(result); + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_mode = S_IFLNK | 0777; + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_blocks = 1; + set_nlink(result, 1); + result->i_size = strlen(symlink); + result->i_op = &page_symlink_inode_operations; + result->i_data.a_ops = &hpfs_symlink_aops; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink)); + mark_buffer_dirty(bh); + brelse(bh); + + insert_inode_hash(result); + + hpfs_write_inode_nolock(result); + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_unlink(struct inode *dir, struct dentry *dentry) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + struct inode *inode = d_inode(dentry); + dnode_secno dno; + int r; + int rep = 0; + int err; + + hpfs_lock(dir->i_sb); + hpfs_adjust_length(name, &len); +again: + err = -ENOENT; + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); + if (!de) + goto out; + + err = -EPERM; + if (de->first) + goto out1; + + err = -EISDIR; + if (de->directory) + goto out1; + + r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); + switch (r) { + case 1: + hpfs_error(dir->i_sb, "there was error when removing dirent"); + err = -EFSERROR; + break; + case 2: /* no space for deleting, try to truncate file */ + + err = -ENOSPC; + if (rep++) + break; + + dentry_unhash(dentry); + if (!d_unhashed(dentry)) { + hpfs_unlock(dir->i_sb); + return -ENOSPC; + } + if (generic_permission(inode, MAY_WRITE) || + !S_ISREG(inode->i_mode) || + get_write_access(inode)) { + d_rehash(dentry); + } else { + struct iattr newattrs; + /*pr_info("truncating file before delete.\n");*/ + newattrs.ia_size = 0; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + err = notify_change(dentry, &newattrs, NULL); + put_write_access(inode); + if (!err) + goto again; + } + hpfs_unlock(dir->i_sb); + return -ENOSPC; + default: + drop_nlink(inode); + err = 0; + } + goto out; + +out1: + hpfs_brelse4(&qbh); +out: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + struct inode *inode = d_inode(dentry); + dnode_secno dno; + int n_items = 0; + int err; + int r; + + hpfs_adjust_length(name, &len); + hpfs_lock(dir->i_sb); + err = -ENOENT; + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); + if (!de) + goto out; + + err = -EPERM; + if (de->first) + goto out1; + + err = -ENOTDIR; + if (!de->directory) + goto out1; + + hpfs_count_dnodes(dir->i_sb, hpfs_i(inode)->i_dno, NULL, NULL, &n_items); + err = -ENOTEMPTY; + if (n_items) + goto out1; + + r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); + switch (r) { + case 1: + hpfs_error(dir->i_sb, "there was error when removing dirent"); + err = -EFSERROR; + break; + case 2: + err = -ENOSPC; + break; + default: + drop_nlink(dir); + clear_nlink(inode); + err = 0; + } + goto out; +out1: + hpfs_brelse4(&qbh); +out: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_symlink_readpage(struct file *file, struct page *page) +{ + char *link = kmap(page); + struct inode *i = page->mapping->host; + struct fnode *fnode; + struct buffer_head *bh; + int err; + + err = -EIO; + hpfs_lock(i->i_sb); + if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) + goto fail; + err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); + brelse(bh); + if (err) + goto fail; + hpfs_unlock(i->i_sb); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +fail: + hpfs_unlock(i->i_sb); + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations hpfs_symlink_aops = { + .readpage = hpfs_symlink_readpage +}; + +static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + const unsigned char *old_name = old_dentry->d_name.name; + unsigned old_len = old_dentry->d_name.len; + const unsigned char *new_name = new_dentry->d_name.name; + unsigned new_len = new_dentry->d_name.len; + struct inode *i = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct quad_buffer_head qbh, qbh1; + struct hpfs_dirent *dep, *nde; + struct hpfs_dirent de; + dnode_secno dno; + int r; + struct buffer_head *bh; + struct fnode *fnode; + int err; + + if ((err = hpfs_chk_name(new_name, &new_len))) return err; + err = 0; + hpfs_adjust_length(old_name, &old_len); + + hpfs_lock(i->i_sb); + /* order doesn't matter, due to VFS exclusion */ + + /* Erm? Moving over the empty non-busy directory is perfectly legal */ + if (new_inode && S_ISDIR(new_inode->i_mode)) { + err = -EINVAL; + goto end1; + } + + if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { + hpfs_error(i->i_sb, "lookup succeeded but map dirent failed"); + err = -ENOENT; + goto end1; + } + copy_de(&de, dep); + de.hidden = new_name[0] == '.'; + + if (new_inode) { + int r; + if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) { + if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) { + clear_nlink(new_inode); + copy_de(nde, &de); + memcpy(nde->name, new_name, new_len); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + goto end; + } + hpfs_error(new_dir->i_sb, "hpfs_rename: could not find dirent"); + err = -EFSERROR; + goto end1; + } + err = r == 2 ? -ENOSPC : r == 1 ? -EFSERROR : 0; + goto end1; + } + + if (new_dir == old_dir) hpfs_brelse4(&qbh); + + if ((r = hpfs_add_dirent(new_dir, new_name, new_len, &de))) { + if (r == -1) hpfs_error(new_dir->i_sb, "hpfs_rename: dirent already exists!"); + err = r == 1 ? -ENOSPC : -EFSERROR; + if (new_dir != old_dir) hpfs_brelse4(&qbh); + goto end1; + } + + if (new_dir == old_dir) + if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { + hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2"); + err = -ENOENT; + goto end1; + } + + if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 0))) { + hpfs_error(i->i_sb, "hpfs_rename: could not remove dirent"); + err = r == 2 ? -ENOSPC : -EFSERROR; + goto end1; + } + + end: + hpfs_i(i)->i_parent_dir = new_dir->i_ino; + if (S_ISDIR(i->i_mode)) { + inc_nlink(new_dir); + drop_nlink(old_dir); + } + if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) { + fnode->up = cpu_to_le32(new_dir->i_ino); + fnode->len = new_len; + memcpy(fnode->name, new_name, new_len>15?15:new_len); + if (new_len < 15) memset(&fnode->name[new_len], 0, 15 - new_len); + mark_buffer_dirty(bh); + brelse(bh); + } +end1: + hpfs_unlock(i->i_sb); + return err; +} + +const struct inode_operations hpfs_dir_iops = +{ + .create = hpfs_create, + .lookup = hpfs_lookup, + .unlink = hpfs_unlink, + .symlink = hpfs_symlink, + .mkdir = hpfs_mkdir, + .rmdir = hpfs_rmdir, + .mknod = hpfs_mknod, + .rename = hpfs_rename, + .setattr = hpfs_setattr, +}; diff --git a/kernel/fs/hpfs/super.c b/kernel/fs/hpfs/super.c new file mode 100644 index 000000000..7cd00d3a7 --- /dev/null +++ b/kernel/fs/hpfs/super.c @@ -0,0 +1,753 @@ +/* + * linux/fs/hpfs/super.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * mounting, unmounting, error handling + */ + +#include "hpfs_fn.h" +#include +#include +#include +#include +#include +#include +#include +#include + +/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ + +static void mark_dirty(struct super_block *s, int remount) +{ + if (hpfs_sb(s)->sb_chkdsk && (remount || !(s->s_flags & MS_RDONLY))) { + struct buffer_head *bh; + struct hpfs_spare_block *sb; + if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { + sb->dirty = 1; + sb->old_wrote = 0; + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +} + +/* Mark the filesystem clean (mark it dirty for chkdsk if chkdsk==2 or if there + were errors) */ + +static void unmark_dirty(struct super_block *s) +{ + struct buffer_head *bh; + struct hpfs_spare_block *sb; + if (s->s_flags & MS_RDONLY) return; + sync_blockdev(s->s_bdev); + if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { + sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; + sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error; + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } +} + +/* Filesystem error... */ +static char err_buf[1024]; + +void hpfs_error(struct super_block *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + + pr_err("filesystem error: %s", err_buf); + if (!hpfs_sb(s)->sb_was_error) { + if (hpfs_sb(s)->sb_err == 2) { + pr_cont("; crashing the system because you wanted it\n"); + mark_dirty(s, 0); + panic("HPFS panic"); + } else if (hpfs_sb(s)->sb_err == 1) { + if (s->s_flags & MS_RDONLY) + pr_cont("; already mounted read-only\n"); + else { + pr_cont("; remounting read-only\n"); + mark_dirty(s, 0); + s->s_flags |= MS_RDONLY; + } + } else if (s->s_flags & MS_RDONLY) + pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); + else + pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); + } else + pr_cont("\n"); + hpfs_sb(s)->sb_was_error = 1; +} + +/* + * A little trick to detect cycles in many hpfs structures and don't let the + * kernel crash on corrupted filesystem. When first called, set c2 to 0. + * + * BTW. chkdsk doesn't detect cycles correctly. When I had 2 lost directories + * nested each in other, chkdsk locked up happilly. + */ + +int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, + char *msg) +{ + if (*c2 && *c1 == key) { + hpfs_error(s, "cycle detected on key %08x in %s", key, msg); + return 1; + } + (*c2)++; + if (!((*c2 - 1) & *c2)) *c1 = key; + return 0; +} + +static void free_sbi(struct hpfs_sb_info *sbi) +{ + kfree(sbi->sb_cp_table); + kfree(sbi->sb_bmp_dir); + kfree(sbi); +} + +static void lazy_free_sbi(struct rcu_head *rcu) +{ + free_sbi(container_of(rcu, struct hpfs_sb_info, rcu)); +} + +static void hpfs_put_super(struct super_block *s) +{ + hpfs_lock(s); + unmark_dirty(s); + hpfs_unlock(s); + call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); +} + +static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) +{ + struct quad_buffer_head qbh; + unsigned long *bits; + unsigned count; + + bits = hpfs_map_4sectors(s, secno, &qbh, 0); + if (!bits) + return (unsigned)-1; + count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); + hpfs_brelse4(&qbh); + return count; +} + +static unsigned count_bitmaps(struct super_block *s) +{ + unsigned n, count, n_bands; + n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + count = 0; + for (n = 0; n < COUNT_RD_AHEAD; n++) { + hpfs_prefetch_bitmap(s, n); + } + for (n = 0; n < n_bands; n++) { + unsigned c; + hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); + c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + if (c != (unsigned)-1) + count += c; + } + return count; +} + +unsigned hpfs_get_free_dnodes(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes == (unsigned)-1) { + unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap); + if (c == (unsigned)-1) + return 0; + sbi->sb_n_free_dnodes = c; + } + return sbi->sb_n_free_dnodes; +} + +static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct hpfs_sb_info *sbi = hpfs_sb(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + + hpfs_lock(s); + + if (sbi->sb_n_free == (unsigned)-1) + sbi->sb_n_free = count_bitmaps(s); + + buf->f_type = s->s_magic; + buf->f_bsize = 512; + buf->f_blocks = sbi->sb_fs_size; + buf->f_bfree = sbi->sb_n_free; + buf->f_bavail = sbi->sb_n_free; + buf->f_files = sbi->sb_dirband_size / 4; + buf->f_ffree = hpfs_get_free_dnodes(s); + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = 254; + + hpfs_unlock(s); + + return 0; +} + +static struct kmem_cache * hpfs_inode_cachep; + +static struct inode *hpfs_alloc_inode(struct super_block *sb) +{ + struct hpfs_inode_info *ei; + ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void hpfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); +} + +static void hpfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hpfs_i_callback); +} + +static void init_once(void *foo) +{ + struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", + sizeof(struct hpfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (hpfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(hpfs_inode_cachep); +} + +/* + * A tiny parser for option strings, stolen from dosfs. + * Stolen again from read-only hpfs. + * And updated for table-driven option parsing. + */ + +enum { + Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis, + Opt_check_none, Opt_check_normal, Opt_check_strict, + Opt_err_cont, Opt_err_ro, Opt_err_panic, + Opt_eas_no, Opt_eas_ro, Opt_eas_rw, + Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always, + Opt_timeshift, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_help, "help"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_case_lower, "case=lower"}, + {Opt_case_asis, "case=asis"}, + {Opt_check_none, "check=none"}, + {Opt_check_normal, "check=normal"}, + {Opt_check_strict, "check=strict"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_err_panic, "errors=panic"}, + {Opt_eas_no, "eas=no"}, + {Opt_eas_ro, "eas=ro"}, + {Opt_eas_rw, "eas=rw"}, + {Opt_chkdsk_no, "chkdsk=no"}, + {Opt_chkdsk_errors, "chkdsk=errors"}, + {Opt_chkdsk_always, "chkdsk=always"}, + {Opt_timeshift, "timeshift=%d"}, + {Opt_err, NULL}, +}; + +static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, + int *lowercase, int *eas, int *chk, int *errs, + int *chkdsk, int *timeshift) +{ + char *p; + int option; + + if (!opts) + return 1; + + /*pr_info("Parsing opts: '%s'\n",opts);*/ + + while ((p = strsep(&opts, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_help: + return 2; + case Opt_uid: + if (match_int(args, &option)) + return 0; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 0; + break; + case Opt_gid: + if (match_int(args, &option)) + return 0; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 0; + break; + case Opt_umask: + if (match_octal(args, &option)) + return 0; + *umask = option; + break; + case Opt_case_lower: + *lowercase = 1; + break; + case Opt_case_asis: + *lowercase = 0; + break; + case Opt_check_none: + *chk = 0; + break; + case Opt_check_normal: + *chk = 1; + break; + case Opt_check_strict: + *chk = 2; + break; + case Opt_err_cont: + *errs = 0; + break; + case Opt_err_ro: + *errs = 1; + break; + case Opt_err_panic: + *errs = 2; + break; + case Opt_eas_no: + *eas = 0; + break; + case Opt_eas_ro: + *eas = 1; + break; + case Opt_eas_rw: + *eas = 2; + break; + case Opt_chkdsk_no: + *chkdsk = 0; + break; + case Opt_chkdsk_errors: + *chkdsk = 1; + break; + case Opt_chkdsk_always: + *chkdsk = 2; + break; + case Opt_timeshift: + { + int m = 1; + char *rhs = args[0].from; + if (!rhs || !*rhs) + return 0; + if (*rhs == '-') m = -1; + if (*rhs == '+' || *rhs == '-') rhs++; + *timeshift = simple_strtoul(rhs, &rhs, 0) * m; + if (*rhs) + return 0; + break; + } + default: + return 0; + } + } + return 1; +} + +static inline void hpfs_help(void) +{ + pr_info("\n\ +HPFS filesystem options:\n\ + help do not mount and display this text\n\ + uid=xxx set uid of files that don't have uid specified in eas\n\ + gid=xxx set gid of files that don't have gid specified in eas\n\ + umask=xxx set mode of files that don't have mode specified in eas\n\ + case=lower lowercase all files\n\ + case=asis do not lowercase files (default)\n\ + check=none no fs checks - kernel may crash on corrupted filesystem\n\ + check=normal do some checks - it should not crash (default)\n\ + check=strict do extra time-consuming checks, used for debugging\n\ + errors=continue continue on errors\n\ + errors=remount-ro remount read-only if errors found (default)\n\ + errors=panic panic on errors\n\ + chkdsk=no do not mark fs for chkdsking even if there were errors\n\ + chkdsk=errors mark fs dirty if errors found (default)\n\ + chkdsk=always always mark fs dirty - used for debugging\n\ + eas=no ignore extended attributes\n\ + eas=ro read but do not write extended attributes\n\ + eas=rw r/w eas => enables chmod, chown, mknod, ln -s (default)\n\ + timeshift=nnn add nnn seconds to file times\n\ +\n"); +} + +static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) +{ + kuid_t uid; + kgid_t gid; + umode_t umask; + int lowercase, eas, chk, errs, chkdsk, timeshift; + int o; + struct hpfs_sb_info *sbi = hpfs_sb(s); + char *new_opts = kstrdup(data, GFP_KERNEL); + + sync_filesystem(s); + + *flags |= MS_NOATIME; + + hpfs_lock(s); + uid = sbi->sb_uid; gid = sbi->sb_gid; + umask = 0777 & ~sbi->sb_mode; + lowercase = sbi->sb_lowercase; + eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk; + errs = sbi->sb_err; timeshift = sbi->sb_timeshift; + + if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, + &eas, &chk, &errs, &chkdsk, ×hift))) { + pr_err("bad mount options.\n"); + goto out_err; + } + if (o == 2) { + hpfs_help(); + goto out_err; + } + if (timeshift != sbi->sb_timeshift) { + pr_err("timeshift can't be changed using remount.\n"); + goto out_err; + } + + unmark_dirty(s); + + sbi->sb_uid = uid; sbi->sb_gid = gid; + sbi->sb_mode = 0777 & ~umask; + sbi->sb_lowercase = lowercase; + sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; + sbi->sb_err = errs; sbi->sb_timeshift = timeshift; + + if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); + + replace_mount_options(s, new_opts); + + hpfs_unlock(s); + return 0; + +out_err: + hpfs_unlock(s); + kfree(new_opts); + return -EINVAL; +} + +/* Super operations */ + +static const struct super_operations hpfs_sops = +{ + .alloc_inode = hpfs_alloc_inode, + .destroy_inode = hpfs_destroy_inode, + .evict_inode = hpfs_evict_inode, + .put_super = hpfs_put_super, + .statfs = hpfs_statfs, + .remount_fs = hpfs_remount_fs, + .show_options = generic_show_options, +}; + +static int hpfs_fill_super(struct super_block *s, void *options, int silent) +{ + struct buffer_head *bh0, *bh1, *bh2; + struct hpfs_boot_block *bootblock; + struct hpfs_super_block *superblock; + struct hpfs_spare_block *spareblock; + struct hpfs_sb_info *sbi; + struct inode *root; + + kuid_t uid; + kgid_t gid; + umode_t umask; + int lowercase, eas, chk, errs, chkdsk, timeshift; + + dnode_secno root_dno; + struct hpfs_dirent *de = NULL; + struct quad_buffer_head qbh; + + int o; + + save_mount_options(s, options); + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) { + return -ENOMEM; + } + s->s_fs_info = sbi; + + mutex_init(&sbi->hpfs_mutex); + hpfs_lock(s); + + uid = current_uid(); + gid = current_gid(); + umask = current_umask(); + lowercase = 0; + eas = 2; + chk = 1; + errs = 1; + chkdsk = 1; + timeshift = 0; + + if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, + &eas, &chk, &errs, &chkdsk, ×hift))) { + pr_err("bad mount options.\n"); + goto bail0; + } + if (o==2) { + hpfs_help(); + goto bail0; + } + + /*sbi->sb_mounting = 1;*/ + sb_set_blocksize(s, 512); + sbi->sb_fs_size = -1; + if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1; + if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2; + if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3; + + /* Check magics */ + if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC + ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC + || le32_to_cpu(spareblock->magic) != SP_MAGIC) { + if (!silent) + pr_err("Bad magic ... probably not HPFS\n"); + goto bail4; + } + + /* Check version */ + if (!(s->s_flags & MS_RDONLY) && + superblock->funcversion != 2 && superblock->funcversion != 3) { + pr_err("Bad version %d,%d. Mount readonly to go around\n", + (int)superblock->version, (int)superblock->funcversion); + pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); + goto bail4; + } + + s->s_flags |= MS_NOATIME; + + /* Fill superblock stuff */ + s->s_magic = HPFS_SUPER_MAGIC; + s->s_op = &hpfs_sops; + s->s_d_op = &hpfs_dentry_operations; + + sbi->sb_root = le32_to_cpu(superblock->root); + sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors); + sbi->sb_bitmaps = le32_to_cpu(superblock->bitmaps); + sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); + sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); + sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); + sbi->sb_uid = uid; + sbi->sb_gid = gid; + sbi->sb_mode = 0777 & ~umask; + sbi->sb_n_free = -1; + sbi->sb_n_free_dnodes = -1; + sbi->sb_lowercase = lowercase; + sbi->sb_eas = eas; + sbi->sb_chk = chk; + sbi->sb_chkdsk = chkdsk; + sbi->sb_err = errs; + sbi->sb_timeshift = timeshift; + sbi->sb_was_error = 0; + sbi->sb_cp_table = NULL; + sbi->sb_c_bitmap = -1; + sbi->sb_max_fwd_alloc = 0xffffff; + + if (sbi->sb_fs_size >= 0x80000000) { + hpfs_error(s, "invalid size in superblock: %08x", + (unsigned)sbi->sb_fs_size); + goto bail4; + } + + /* Load bitmap directory */ + if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) + goto bail4; + + /* Check for general fs errors*/ + if (spareblock->dirty && !spareblock->old_wrote) { + if (errs == 2) { + pr_err("Improperly stopped, not mounted\n"); + goto bail4; + } + hpfs_error(s, "improperly stopped"); + } + + if (!(s->s_flags & MS_RDONLY)) { + spareblock->dirty = 1; + spareblock->old_wrote = 0; + mark_buffer_dirty(bh2); + } + + if (spareblock->hotfixes_used || spareblock->n_spares_used) { + if (errs >= 2) { + pr_err("Hotfixes not supported here, try chkdsk\n"); + mark_dirty(s, 0); + goto bail4; + } + hpfs_error(s, "hotfixes not supported here, try chkdsk"); + if (errs == 0) + pr_err("Proceeding, but your filesystem will be probably corrupted by this driver...\n"); + else + pr_err("This driver may read bad files or crash when operating on disk with hotfixes.\n"); + } + if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { + if (errs >= 2) { + pr_err("Spare dnodes used, try chkdsk\n"); + mark_dirty(s, 0); + goto bail4; + } + hpfs_error(s, "warning: spare dnodes used, try chkdsk"); + if (errs == 0) + pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); + } + if (chk) { + unsigned a; + if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || + le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { + hpfs_error(s, "dir band size mismatch: dir_band_start==%08x, dir_band_end==%08x, n_dir_band==%08x", + le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->dir_band_end), le32_to_cpu(superblock->n_dir_band)); + goto bail4; + } + a = sbi->sb_dirband_size; + sbi->sb_dirband_size = 0; + if (hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->n_dir_band), "dir_band") || + hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_bitmap), 4, "dir_band_bitmap") || + hpfs_chk_sectors(s, le32_to_cpu(superblock->bitmaps), 4, "bitmaps")) { + mark_dirty(s, 0); + goto bail4; + } + sbi->sb_dirband_size = a; + } else + pr_err("You really don't want any checks? You are crazy...\n"); + + /* Load code page table */ + if (le32_to_cpu(spareblock->n_code_pages)) + if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) + pr_err("code page support is disabled\n"); + + brelse(bh2); + brelse(bh1); + brelse(bh0); + + root = iget_locked(s, sbi->sb_root); + if (!root) + goto bail0; + hpfs_init_inode(root); + hpfs_read_inode(root); + unlock_new_inode(root); + s->s_root = d_make_root(root); + if (!s->s_root) + goto bail0; + + /* + * find the root directory's . pointer & finish filling in the inode + */ + + root_dno = hpfs_fnode_dno(s, sbi->sb_root); + if (root_dno) + de = map_dirent(root, root_dno, "\001\001", 2, NULL, &qbh); + if (!de) + hpfs_error(s, "unable to find root dir"); + else { + root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date)); + root->i_atime.tv_nsec = 0; + root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date)); + root->i_mtime.tv_nsec = 0; + root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); + root->i_ctime.tv_nsec = 0; + hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size); + hpfs_i(root)->i_parent_dir = root->i_ino; + if (root->i_size == -1) + root->i_size = 2048; + if (root->i_blocks == -1) + root->i_blocks = 5; + hpfs_brelse4(&qbh); + } + hpfs_unlock(s); + return 0; + +bail4: brelse(bh2); +bail3: brelse(bh1); +bail2: brelse(bh0); +bail1: +bail0: + hpfs_unlock(s); + free_sbi(sbi); + return -EINVAL; +} + +static struct dentry *hpfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); +} + +static struct file_system_type hpfs_fs_type = { + .owner = THIS_MODULE, + .name = "hpfs", + .mount = hpfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("hpfs"); + +static int __init init_hpfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&hpfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_hpfs_fs(void) +{ + unregister_filesystem(&hpfs_fs_type); + destroy_inodecache(); +} + +module_init(init_hpfs_fs) +module_exit(exit_hpfs_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/hppfs/Makefile b/kernel/fs/hppfs/Makefile new file mode 100644 index 000000000..3a982bd97 --- /dev/null +++ b/kernel/fs/hppfs/Makefile @@ -0,0 +1,6 @@ +# +# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) +# Licensed under the GPL +# + +obj-$(CONFIG_HPPFS) += hppfs.o diff --git a/kernel/fs/hppfs/hppfs.c b/kernel/fs/hppfs/hppfs.c new file mode 100644 index 000000000..fa2bd5366 --- /dev/null +++ b/kernel/fs/hppfs/hppfs.c @@ -0,0 +1,766 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct inode *get_inode(struct super_block *, struct dentry *); + +struct hppfs_data { + struct list_head list; + char contents[PAGE_SIZE - sizeof(struct list_head)]; +}; + +struct hppfs_private { + struct file *proc_file; + int host_fd; + loff_t len; + struct hppfs_data *contents; +}; + +struct hppfs_inode_info { + struct dentry *proc_dentry; + struct inode vfs_inode; +}; + +static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) +{ + return container_of(inode, struct hppfs_inode_info, vfs_inode); +} + +#define HPPFS_SUPER_MAGIC 0xb00000ee + +static const struct super_operations hppfs_sbops; + +static int is_pid(struct dentry *dentry) +{ + struct super_block *sb; + int i; + + sb = dentry->d_sb; + if (dentry->d_parent != sb->s_root) + return 0; + + for (i = 0; i < dentry->d_name.len; i++) { + if (!isdigit(dentry->d_name.name[i])) + return 0; + } + return 1; +} + +static char *dentry_name(struct dentry *dentry, int extra) +{ + struct dentry *parent; + char *root, *name; + const char *seg_name; + int len, seg_len, root_len; + + len = 0; + parent = dentry; + while (parent->d_parent != parent) { + if (is_pid(parent)) + len += strlen("pid") + 1; + else len += parent->d_name.len + 1; + parent = parent->d_parent; + } + + root = "proc"; + root_len = strlen(root); + len += root_len; + name = kmalloc(len + extra + 1, GFP_KERNEL); + if (name == NULL) + return NULL; + + name[len] = '\0'; + parent = dentry; + while (parent->d_parent != parent) { + if (is_pid(parent)) { + seg_name = "pid"; + seg_len = strlen(seg_name); + } + else { + seg_name = parent->d_name.name; + seg_len = parent->d_name.len; + } + + len -= seg_len + 1; + name[len] = '/'; + memcpy(&name[len + 1], seg_name, seg_len); + parent = parent->d_parent; + } + memcpy(name, root, root_len); + return name; +} + +static int file_removed(struct dentry *dentry, const char *file) +{ + char *host_file; + int extra, fd; + + extra = 0; + if (file != NULL) + extra += strlen(file) + 1; + + host_file = dentry_name(dentry, extra + strlen("/remove")); + if (host_file == NULL) { + printk(KERN_ERR "file_removed : allocation failed\n"); + return -ENOMEM; + } + + if (file != NULL) { + strcat(host_file, "/"); + strcat(host_file, file); + } + strcat(host_file, "/remove"); + + fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); + kfree(host_file); + if (fd > 0) { + os_close_file(fd); + return 1; + } + return 0; +} + +static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, + unsigned int flags) +{ + struct dentry *proc_dentry, *parent; + struct qstr *name = &dentry->d_name; + struct inode *inode; + int err, deleted; + + deleted = file_removed(dentry, NULL); + if (deleted < 0) + return ERR_PTR(deleted); + else if (deleted) + return ERR_PTR(-ENOENT); + + parent = HPPFS_I(ino)->proc_dentry; + mutex_lock(&d_inode(parent)->i_mutex); + proc_dentry = lookup_one_len(name->name, parent, name->len); + mutex_unlock(&d_inode(parent)->i_mutex); + + if (IS_ERR(proc_dentry)) + return proc_dentry; + + err = -ENOMEM; + inode = get_inode(ino->i_sb, proc_dentry); + if (!inode) + goto out; + + d_add(dentry, inode); + return NULL; + + out: + return ERR_PTR(err); +} + +static const struct inode_operations hppfs_file_iops = { +}; + +static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, + loff_t *ppos, int is_user) +{ + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + ssize_t n; + + read = file_inode(file)->i_fop->read; + + if (!is_user) + set_fs(KERNEL_DS); + + n = (*read)(file, buf, count, &file->f_pos); + + if (!is_user) + set_fs(USER_DS); + + if (ppos) + *ppos = file->f_pos; + return n; +} + +static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) +{ + ssize_t n; + int cur, err; + char *new_buf; + + n = -ENOMEM; + new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (new_buf == NULL) { + printk(KERN_ERR "hppfs_read_file : kmalloc failed\n"); + goto out; + } + n = 0; + while (count > 0) { + cur = min_t(ssize_t, count, PAGE_SIZE); + err = os_read_file(fd, new_buf, cur); + if (err < 0) { + printk(KERN_ERR "hppfs_read : read failed, " + "errno = %d\n", err); + n = err; + goto out_free; + } else if (err == 0) + break; + + if (copy_to_user(buf, new_buf, err)) { + n = -EFAULT; + goto out_free; + } + n += err; + count -= err; + } + out_free: + kfree(new_buf); + out: + return n; +} + +static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct hppfs_private *hppfs = file->private_data; + struct hppfs_data *data; + loff_t off; + int err; + + if (hppfs->contents != NULL) { + int rem; + + if (*ppos >= hppfs->len) + return 0; + + data = hppfs->contents; + off = *ppos; + while (off >= sizeof(data->contents)) { + data = list_entry(data->list.next, struct hppfs_data, + list); + off -= sizeof(data->contents); + } + + if (off + count > hppfs->len) + count = hppfs->len - off; + rem = copy_to_user(buf, &data->contents[off], count); + *ppos += count - rem; + if (rem > 0) + return -EFAULT; + } else if (hppfs->host_fd != -1) { + err = os_seek_file(hppfs->host_fd, *ppos); + if (err) { + printk(KERN_ERR "hppfs_read : seek failed, " + "errno = %d\n", err); + return err; + } + err = hppfs_read_file(hppfs->host_fd, buf, count); + if (err < 0) { + printk(KERN_ERR "hppfs_read: read failed: %d\n", err); + return err; + } + count = err; + if (count > 0) + *ppos += count; + } + else count = read_proc(hppfs->proc_file, buf, count, ppos, 1); + + return count; +} + +static ssize_t hppfs_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + + write = file_inode(proc_file)->i_fop->write; + return (*write)(proc_file, buf, len, ppos); +} + +static int open_host_sock(char *host_file, int *filter_out) +{ + char *end; + int fd; + + end = &host_file[strlen(host_file)]; + strcpy(end, "/rw"); + *filter_out = 1; + fd = os_connect_socket(host_file); + if (fd > 0) + return fd; + + strcpy(end, "/r"); + *filter_out = 0; + fd = os_connect_socket(host_file); + return fd; +} + +static void free_contents(struct hppfs_data *head) +{ + struct hppfs_data *data; + struct list_head *ele, *next; + + if (head == NULL) + return; + + list_for_each_safe(ele, next, &head->list) { + data = list_entry(ele, struct hppfs_data, list); + kfree(data); + } + kfree(head); +} + +static struct hppfs_data *hppfs_get_data(int fd, int filter, + struct file *proc_file, + struct file *hppfs_file, + loff_t *size_out) +{ + struct hppfs_data *data, *new, *head; + int n, err; + + err = -ENOMEM; + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) { + printk(KERN_ERR "hppfs_get_data : head allocation failed\n"); + goto failed; + } + + INIT_LIST_HEAD(&data->list); + + head = data; + *size_out = 0; + + if (filter) { + while ((n = read_proc(proc_file, data->contents, + sizeof(data->contents), NULL, 0)) > 0) + os_write_file(fd, data->contents, n); + err = os_shutdown_socket(fd, 0, 1); + if (err) { + printk(KERN_ERR "hppfs_get_data : failed to shut down " + "socket\n"); + goto failed_free; + } + } + while (1) { + n = os_read_file(fd, data->contents, sizeof(data->contents)); + if (n < 0) { + err = n; + printk(KERN_ERR "hppfs_get_data : read failed, " + "errno = %d\n", err); + goto failed_free; + } else if (n == 0) + break; + + *size_out += n; + + if (n < sizeof(data->contents)) + break; + + new = kmalloc(sizeof(*data), GFP_KERNEL); + if (new == 0) { + printk(KERN_ERR "hppfs_get_data : data allocation " + "failed\n"); + err = -ENOMEM; + goto failed_free; + } + + INIT_LIST_HEAD(&new->list); + list_add(&new->list, &data->list); + data = new; + } + return head; + + failed_free: + free_contents(head); + failed: + return ERR_PTR(err); +} + +static struct hppfs_private *hppfs_data(void) +{ + struct hppfs_private *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return data; + + *data = ((struct hppfs_private ) { .host_fd = -1, + .len = -1, + .contents = NULL } ); + return data; +} + +static int file_mode(int fmode) +{ + if (fmode == (FMODE_READ | FMODE_WRITE)) + return O_RDWR; + if (fmode == FMODE_READ) + return O_RDONLY; + if (fmode == FMODE_WRITE) + return O_WRONLY; + return 0; +} + +static int hppfs_open(struct inode *inode, struct file *file) +{ + const struct cred *cred = file->f_cred; + struct hppfs_private *data; + struct path path; + char *host_file; + int err, fd, type, filter; + + err = -ENOMEM; + data = hppfs_data(); + if (data == NULL) + goto out; + + host_file = dentry_name(file->f_path.dentry, strlen("/rw")); + if (host_file == NULL) + goto out_free2; + + path.mnt = inode->i_sb->s_fs_info; + path.dentry = HPPFS_I(inode)->proc_dentry; + + data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); + err = PTR_ERR(data->proc_file); + if (IS_ERR(data->proc_file)) + goto out_free1; + + type = os_file_type(host_file); + if (type == OS_TYPE_FILE) { + fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); + if (fd >= 0) + data->host_fd = fd; + else + printk(KERN_ERR "hppfs_open : failed to open '%s', " + "errno = %d\n", host_file, -fd); + + data->contents = NULL; + } else if (type == OS_TYPE_DIR) { + fd = open_host_sock(host_file, &filter); + if (fd > 0) { + data->contents = hppfs_get_data(fd, filter, + data->proc_file, + file, &data->len); + if (!IS_ERR(data->contents)) + data->host_fd = fd; + } else + printk(KERN_ERR "hppfs_open : failed to open a socket " + "in '%s', errno = %d\n", host_file, -fd); + } + kfree(host_file); + + file->private_data = data; + return 0; + + out_free1: + kfree(host_file); + out_free2: + free_contents(data->contents); + kfree(data); + out: + return err; +} + +static int hppfs_dir_open(struct inode *inode, struct file *file) +{ + const struct cred *cred = file->f_cred; + struct hppfs_private *data; + struct path path; + int err; + + err = -ENOMEM; + data = hppfs_data(); + if (data == NULL) + goto out; + + path.mnt = inode->i_sb->s_fs_info; + path.dentry = HPPFS_I(inode)->proc_dentry; + data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); + err = PTR_ERR(data->proc_file); + if (IS_ERR(data->proc_file)) + goto out_free; + + file->private_data = data; + return 0; + + out_free: + kfree(data); + out: + return err; +} + +static loff_t hppfs_llseek(struct file *file, loff_t off, int where) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + loff_t (*llseek)(struct file *, loff_t, int); + loff_t ret; + + llseek = file_inode(proc_file)->i_fop->llseek; + if (llseek != NULL) { + ret = (*llseek)(proc_file, off, where); + if (ret < 0) + return ret; + } + + return default_llseek(file, off, where); +} + +static int hppfs_release(struct inode *inode, struct file *file) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + if (proc_file) + fput(proc_file); + kfree(data); + return 0; +} + +static const struct file_operations hppfs_file_fops = { + .owner = NULL, + .llseek = hppfs_llseek, + .read = hppfs_read, + .write = hppfs_write, + .open = hppfs_open, + .release = hppfs_release, +}; + +struct hppfs_dirent { + struct dir_context ctx; + struct dir_context *caller; + struct dentry *dentry; +}; + +static int hppfs_filldir(struct dir_context *ctx, const char *name, int size, + loff_t offset, u64 inode, unsigned int type) +{ + struct hppfs_dirent *dirent = + container_of(ctx, struct hppfs_dirent, ctx); + + if (file_removed(dirent->dentry, name)) + return 0; + + dirent->caller->pos = dirent->ctx.pos; + return !dir_emit(dirent->caller, name, size, inode, type); +} + +static int hppfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + struct hppfs_dirent d = { + .ctx.actor = hppfs_filldir, + .caller = ctx, + .dentry = file->f_path.dentry + }; + int err; + proc_file->f_pos = ctx->pos; + err = iterate_dir(proc_file, &d.ctx); + ctx->pos = d.ctx.pos; + return err; +} + +static const struct file_operations hppfs_dir_fops = { + .owner = NULL, + .iterate = hppfs_readdir, + .open = hppfs_dir_open, + .llseek = default_llseek, + .release = hppfs_release, +}; + +static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) +{ + sf->f_blocks = 0; + sf->f_bfree = 0; + sf->f_bavail = 0; + sf->f_files = 0; + sf->f_ffree = 0; + sf->f_type = HPPFS_SUPER_MAGIC; + return 0; +} + +static struct inode *hppfs_alloc_inode(struct super_block *sb) +{ + struct hppfs_inode_info *hi; + + hi = kmalloc(sizeof(*hi), GFP_KERNEL); + if (!hi) + return NULL; + + hi->proc_dentry = NULL; + inode_init_once(&hi->vfs_inode); + return &hi->vfs_inode; +} + +void hppfs_evict_inode(struct inode *ino) +{ + clear_inode(ino); + dput(HPPFS_I(ino)->proc_dentry); + mntput(ino->i_sb->s_fs_info); +} + +static void hppfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kfree(HPPFS_I(inode)); +} + +static void hppfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hppfs_i_callback); +} + +static const struct super_operations hppfs_sbops = { + .alloc_inode = hppfs_alloc_inode, + .destroy_inode = hppfs_destroy_inode, + .evict_inode = hppfs_evict_inode, + .statfs = hppfs_statfs, +}; + +static int hppfs_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; + return d_inode(proc_dentry)->i_op->readlink(proc_dentry, buffer, + buflen); +} + +static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; + + return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd); +} + +static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; + + if (d_inode(proc_dentry)->i_op->put_link) + d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie); +} + +static const struct inode_operations hppfs_dir_iops = { + .lookup = hppfs_lookup, +}; + +static const struct inode_operations hppfs_link_iops = { + .readlink = hppfs_readlink, + .follow_link = hppfs_follow_link, + .put_link = hppfs_put_link, +}; + +static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) +{ + struct inode *proc_ino = d_inode(dentry); + struct inode *inode = new_inode(sb); + + if (!inode) { + dput(dentry); + return NULL; + } + + if (d_is_dir(dentry)) { + inode->i_op = &hppfs_dir_iops; + inode->i_fop = &hppfs_dir_fops; + } else if (d_is_symlink(dentry)) { + inode->i_op = &hppfs_link_iops; + inode->i_fop = &hppfs_file_fops; + } else { + inode->i_op = &hppfs_file_iops; + inode->i_fop = &hppfs_file_fops; + } + + HPPFS_I(inode)->proc_dentry = dentry; + + inode->i_uid = proc_ino->i_uid; + inode->i_gid = proc_ino->i_gid; + inode->i_atime = proc_ino->i_atime; + inode->i_mtime = proc_ino->i_mtime; + inode->i_ctime = proc_ino->i_ctime; + inode->i_ino = proc_ino->i_ino; + inode->i_mode = proc_ino->i_mode; + set_nlink(inode, proc_ino->i_nlink); + inode->i_size = proc_ino->i_size; + inode->i_blocks = proc_ino->i_blocks; + + return inode; +} + +static int hppfs_fill_super(struct super_block *sb, void *d, int silent) +{ + struct inode *root_inode; + struct vfsmount *proc_mnt; + int err = -ENOENT; + + proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); + if (IS_ERR(proc_mnt)) + goto out; + + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = HPPFS_SUPER_MAGIC; + sb->s_op = &hppfs_sbops; + sb->s_fs_info = proc_mnt; + + err = -ENOMEM; + root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) + goto out_mntput; + + return 0; + + out_mntput: + mntput(proc_mnt); + out: + return(err); +} + +static struct dentry *hppfs_read_super(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + return mount_nodev(type, flags, data, hppfs_fill_super); +} + +static struct file_system_type hppfs_type = { + .owner = THIS_MODULE, + .name = "hppfs", + .mount = hppfs_read_super, + .kill_sb = kill_anon_super, + .fs_flags = 0, +}; +MODULE_ALIAS_FS("hppfs"); + +static int __init init_hppfs(void) +{ + return register_filesystem(&hppfs_type); +} + +static void __exit exit_hppfs(void) +{ + unregister_filesystem(&hppfs_type); +} + +module_init(init_hppfs) +module_exit(exit_hppfs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/hugetlbfs/Makefile b/kernel/fs/hugetlbfs/Makefile new file mode 100644 index 000000000..6adf870c6 --- /dev/null +++ b/kernel/fs/hugetlbfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux ramfs routines. +# + +obj-$(CONFIG_HUGETLBFS) += hugetlbfs.o + +hugetlbfs-objs := inode.o diff --git a/kernel/fs/hugetlbfs/inode.c b/kernel/fs/hugetlbfs/inode.c new file mode 100644 index 000000000..87724c1d7 --- /dev/null +++ b/kernel/fs/hugetlbfs/inode.c @@ -0,0 +1,1114 @@ +/* + * hugetlbpage-backed filesystem. Based on ramfs. + * + * Nadia Yvette Chambers, 2002 + * + * Copyright (C) 2002 Linus Torvalds. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include /* remove ASAP */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const struct super_operations hugetlbfs_ops; +static const struct address_space_operations hugetlbfs_aops; +const struct file_operations hugetlbfs_file_operations; +static const struct inode_operations hugetlbfs_dir_inode_operations; +static const struct inode_operations hugetlbfs_inode_operations; + +struct hugetlbfs_config { + kuid_t uid; + kgid_t gid; + umode_t mode; + long max_hpages; + long nr_inodes; + struct hstate *hstate; + long min_hpages; +}; + +struct hugetlbfs_inode_info { + struct shared_policy policy; + struct inode vfs_inode; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} + +int sysctl_hugetlb_shm_group; + +enum { + Opt_size, Opt_nr_inodes, + Opt_mode, Opt_uid, Opt_gid, + Opt_pagesize, Opt_min_size, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_size, "size=%s"}, + {Opt_nr_inodes, "nr_inodes=%s"}, + {Opt_mode, "mode=%o"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_pagesize, "pagesize=%s"}, + {Opt_min_size, "min_size=%s"}, + {Opt_err, NULL}, +}; + +static void huge_pagevec_release(struct pagevec *pvec) +{ + int i; + + for (i = 0; i < pagevec_count(pvec); ++i) + put_page(pvec->pages[i]); + + pagevec_reinit(pvec); +} + +static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + loff_t len, vma_len; + int ret; + struct hstate *h = hstate_file(file); + + /* + * vma address alignment (but not the pgoff alignment) has + * already been checked by prepare_hugepage_range. If you add + * any error returns here, do so after setting VM_HUGETLB, so + * is_vm_hugetlb_page tests below unmap_region go the right + * way when do_mmap_pgoff unwinds (may be important on powerpc + * and ia64). + */ + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vma->vm_ops = &hugetlb_vm_ops; + + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + return -EINVAL; + + vma_len = (loff_t)(vma->vm_end - vma->vm_start); + + mutex_lock(&inode->i_mutex); + file_accessed(file); + + ret = -ENOMEM; + len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + if (hugetlb_reserve_pages(inode, + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma, + vma->vm_flags)) + goto out; + + ret = 0; + hugetlb_prefault_arch_hook(vma->vm_mm); + if (vma->vm_flags & VM_WRITE && inode->i_size < len) + inode->i_size = len; +out: + mutex_unlock(&inode->i_mutex); + + return ret; +} + +/* + * Called under down_write(mmap_sem). + */ + +#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +static unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} +#endif + +static size_t +hugetlbfs_read_actor(struct page *page, unsigned long offset, + struct iov_iter *to, unsigned long size) +{ + size_t copied = 0; + int i, chunksize; + + /* Find which 4k chunk and offset with in that chunk */ + i = offset >> PAGE_CACHE_SHIFT; + offset = offset & ~PAGE_CACHE_MASK; + + while (size) { + size_t n; + chunksize = PAGE_CACHE_SIZE; + if (offset) + chunksize -= offset; + if (chunksize > size) + chunksize = size; + n = copy_page_to_iter(&page[i], offset, chunksize, to); + copied += n; + if (n != chunksize) + return copied; + offset = 0; + size -= chunksize; + i++; + } + return copied; +} + +/* + * Support for read() - Find the page attached to f_mapping and copy out the + * data. Its *very* similar to do_generic_mapping_read(), we can't use that + * since it has PAGE_CACHE_SIZE assumptions. + */ +static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct hstate *h = hstate_file(file); + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long index = iocb->ki_pos >> huge_page_shift(h); + unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); + unsigned long end_index; + loff_t isize; + ssize_t retval = 0; + + while (iov_iter_count(to)) { + struct page *page; + size_t nr, copied; + + /* nr is the maximum number of bytes to copy from this page */ + nr = huge_page_size(h); + isize = i_size_read(inode); + if (!isize) + break; + end_index = (isize - 1) >> huge_page_shift(h); + if (index > end_index) + break; + if (index == end_index) { + nr = ((isize - 1) & ~huge_page_mask(h)) + 1; + if (nr <= offset) + break; + } + nr = nr - offset; + + /* Find the page */ + page = find_lock_page(mapping, index); + if (unlikely(page == NULL)) { + /* + * We have a HOLE, zero out the user-buffer for the + * length of the hole or request. + */ + copied = iov_iter_zero(nr, to); + } else { + unlock_page(page); + + /* + * We have the page, copy it to user space buffer. + */ + copied = hugetlbfs_read_actor(page, offset, to, nr); + page_cache_release(page); + } + offset += copied; + retval += copied; + if (copied != nr && iov_iter_count(to)) { + if (!retval) + retval = -EFAULT; + break; + } + index += offset >> huge_page_shift(h); + offset &= ~huge_page_mask(h); + } + iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; + return retval; +} + +static int hugetlbfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + return -EINVAL; +} + +static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + BUG(); + return -EINVAL; +} + +static void truncate_huge_page(struct page *page) +{ + ClearPageDirty(page); + ClearPageUptodate(page); + delete_from_page_cache(page); +} + +static void truncate_hugepages(struct inode *inode, loff_t lstart) +{ + struct hstate *h = hstate_inode(inode); + struct address_space *mapping = &inode->i_data; + const pgoff_t start = lstart >> huge_page_shift(h); + struct pagevec pvec; + pgoff_t next; + int i, freed = 0; + + pagevec_init(&pvec, 0); + next = start; + while (1) { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + if (next == start) + break; + next = start; + continue; + } + + for (i = 0; i < pagevec_count(&pvec); ++i) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (page->index > next) + next = page->index; + ++next; + truncate_huge_page(page); + unlock_page(page); + freed++; + } + huge_pagevec_release(&pvec); + } + BUG_ON(!lstart && mapping->nrpages); + hugetlb_unreserve_pages(inode, start, freed); +} + +static void hugetlbfs_evict_inode(struct inode *inode) +{ + struct resv_map *resv_map; + + truncate_hugepages(inode, 0); + resv_map = (struct resv_map *)inode->i_mapping->private_data; + /* root inode doesn't have the resv_map, so we should check it */ + if (resv_map) + resv_map_release(&resv_map->refs); + clear_inode(inode); +} + +static inline void +hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) +{ + struct vm_area_struct *vma; + + vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { + unsigned long v_offset; + + /* + * Can the expression below overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ + if (vma->vm_pgoff < pgoff) + v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + else + v_offset = 0; + + unmap_hugepage_range(vma, vma->vm_start + v_offset, + vma->vm_end, NULL); + } +} + +static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) +{ + pgoff_t pgoff; + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); + + BUG_ON(offset & ~huge_page_mask(h)); + pgoff = offset >> PAGE_SHIFT; + + i_size_write(inode, offset); + i_mmap_lock_write(mapping); + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); + i_mmap_unlock_write(mapping); + truncate_hugepages(inode, offset); + return 0; +} + +static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct hstate *h = hstate_inode(inode); + int error; + unsigned int ia_valid = attr->ia_valid; + + BUG_ON(!inode); + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE) { + error = -EINVAL; + if (attr->ia_size & ~huge_page_mask(h)) + return -EINVAL; + error = hugetlb_vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static struct inode *hugetlbfs_get_root(struct super_block *sb, + struct hugetlbfs_config *config) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) { + struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); + inode->i_mode = S_IFDIR | config->mode; + inode->i_uid = config->uid; + inode->i_gid = config->gid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = HUGETLBFS_I(inode); + mpol_shared_policy_init(&info->policy, NULL); + inode->i_op = &hugetlbfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + lockdep_annotate_inode_mutex_key(inode); + } + return inode; +} + +/* + * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never + * be taken from reclaim -- unlike regular filesystems. This needs an + * annotation because huge_pmd_share() does an allocation under + * i_mmap_rwsem. + */ +static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; + +static struct inode *hugetlbfs_get_inode(struct super_block *sb, + struct inode *dir, + umode_t mode, dev_t dev) +{ + struct inode *inode; + struct resv_map *resv_map; + + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; + + inode = new_inode(sb); + if (inode) { + struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); + inode_init_owner(inode, dir, mode); + lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, + &hugetlbfs_i_mmap_rwsem_key); + inode->i_mapping->a_ops = &hugetlbfs_aops; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->private_data = resv_map; + info = HUGETLBFS_I(inode); + /* + * The policy is initialized here even if we are creating a + * private inode because initialization simply creates an + * an empty rb tree and calls spin_lock_init(), later when we + * call mpol_free_shared_policy() it will just return because + * the rb tree will still be empty. + */ + mpol_shared_policy_init(&info->policy, NULL); + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &hugetlbfs_inode_operations; + inode->i_fop = &hugetlbfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &hugetlbfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + break; + } + lockdep_annotate_inode_mutex_key(inode); + } else + kref_put(&resv_map->refs, resv_map_release); + + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. + */ +static int hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); + if (inode) { + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + } + return error; +} + +static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); + if (!retval) + inc_nlink(dir); + return retval; +} + +static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +{ + return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int hugetlbfs_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); + if (inode) { + int l = strlen(symname)+1; + error = page_symlink(inode, symname, l); + if (!error) { + d_instantiate(dentry, inode); + dget(dentry); + } else + iput(inode); + } + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + return error; +} + +/* + * mark the head page dirty + */ +static int hugetlbfs_set_page_dirty(struct page *page) +{ + struct page *head = compound_head(page); + + SetPageDirty(head); + return 0; +} + +static int hugetlbfs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + int rc; + + rc = migrate_huge_page_move_mapping(mapping, newpage, page); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + migrate_page_copy(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} + +static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); + struct hstate *h = hstate_inode(d_inode(dentry)); + + buf->f_type = HUGETLBFS_MAGIC; + buf->f_bsize = huge_page_size(h); + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + /* If no limits set, just report 0 for max/free/used + * blocks, like simple_statfs() */ + if (sbinfo->spool) { + long free_pages; + + spin_lock(&sbinfo->spool->lock); + buf->f_blocks = sbinfo->spool->max_hpages; + free_pages = sbinfo->spool->max_hpages + - sbinfo->spool->used_hpages; + buf->f_bavail = buf->f_bfree = free_pages; + spin_unlock(&sbinfo->spool->lock); + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + } + spin_unlock(&sbinfo->stat_lock); + } + buf->f_namelen = NAME_MAX; + return 0; +} + +static void hugetlbfs_put_super(struct super_block *sb) +{ + struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); + + if (sbi) { + sb->s_fs_info = NULL; + + if (sbi->spool) + hugepage_put_subpool(sbi->spool); + + kfree(sbi); + } +} + +static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) +{ + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + if (unlikely(!sbinfo->free_inodes)) { + spin_unlock(&sbinfo->stat_lock); + return 0; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + + return 1; +} + +static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) +{ + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } +} + + +static struct kmem_cache *hugetlbfs_inode_cachep; + +static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); + struct hugetlbfs_inode_info *p; + + if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) + return NULL; + p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); + if (unlikely(!p)) { + hugetlbfs_inc_free_inodes(sbinfo); + return NULL; + } + return &p->vfs_inode; +} + +static void hugetlbfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + +static void hugetlbfs_destroy_inode(struct inode *inode) +{ + hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); + mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + call_rcu(&inode->i_rcu, hugetlbfs_i_callback); +} + +static const struct address_space_operations hugetlbfs_aops = { + .write_begin = hugetlbfs_write_begin, + .write_end = hugetlbfs_write_end, + .set_page_dirty = hugetlbfs_set_page_dirty, + .migratepage = hugetlbfs_migrate_page, +}; + + +static void init_once(void *foo) +{ + struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + + inode_init_once(&ei->vfs_inode); +} + +const struct file_operations hugetlbfs_file_operations = { + .read_iter = hugetlbfs_read_iter, + .mmap = hugetlbfs_file_mmap, + .fsync = noop_fsync, + .get_unmapped_area = hugetlb_get_unmapped_area, + .llseek = default_llseek, +}; + +static const struct inode_operations hugetlbfs_dir_inode_operations = { + .create = hugetlbfs_create, + .lookup = simple_lookup, + .link = simple_link, + .unlink = simple_unlink, + .symlink = hugetlbfs_symlink, + .mkdir = hugetlbfs_mkdir, + .rmdir = simple_rmdir, + .mknod = hugetlbfs_mknod, + .rename = simple_rename, + .setattr = hugetlbfs_setattr, +}; + +static const struct inode_operations hugetlbfs_inode_operations = { + .setattr = hugetlbfs_setattr, +}; + +static const struct super_operations hugetlbfs_ops = { + .alloc_inode = hugetlbfs_alloc_inode, + .destroy_inode = hugetlbfs_destroy_inode, + .evict_inode = hugetlbfs_evict_inode, + .statfs = hugetlbfs_statfs, + .put_super = hugetlbfs_put_super, + .show_options = generic_show_options, +}; + +enum { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +/* + * Convert size option passed from command line to number of huge pages + * in the pool specified by hstate. Size option could be in bytes + * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). + */ +static long long +hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, + int val_type) +{ + if (val_type == NO_SIZE) + return -1; + + if (val_type == SIZE_PERCENT) { + size_opt <<= huge_page_shift(h); + size_opt *= h->max_huge_pages; + do_div(size_opt, 100); + } + + size_opt >>= huge_page_shift(h); + return size_opt; +} + +static int +hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) +{ + char *p, *rest; + substring_t args[MAX_OPT_ARGS]; + int option; + unsigned long long max_size_opt = 0, min_size_opt = 0; + int max_val_type = NO_SIZE, min_val_type = NO_SIZE; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + goto bad_val; + pconfig->uid = make_kuid(current_user_ns(), option); + if (!uid_valid(pconfig->uid)) + goto bad_val; + break; + + case Opt_gid: + if (match_int(&args[0], &option)) + goto bad_val; + pconfig->gid = make_kgid(current_user_ns(), option); + if (!gid_valid(pconfig->gid)) + goto bad_val; + break; + + case Opt_mode: + if (match_octal(&args[0], &option)) + goto bad_val; + pconfig->mode = option & 01777U; + break; + + case Opt_size: { + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + max_size_opt = memparse(args[0].from, &rest); + max_val_type = SIZE_STD; + if (*rest == '%') + max_val_type = SIZE_PERCENT; + break; + } + + case Opt_nr_inodes: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + pconfig->nr_inodes = memparse(args[0].from, &rest); + break; + + case Opt_pagesize: { + unsigned long ps; + ps = memparse(args[0].from, &rest); + pconfig->hstate = size_to_hstate(ps); + if (!pconfig->hstate) { + pr_err("Unsupported page size %lu MB\n", + ps >> 20); + return -EINVAL; + } + break; + } + + case Opt_min_size: { + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + min_size_opt = memparse(args[0].from, &rest); + min_val_type = SIZE_STD; + if (*rest == '%') + min_val_type = SIZE_PERCENT; + break; + } + + default: + pr_err("Bad mount option: \"%s\"\n", p); + return -EINVAL; + break; + } + } + + /* + * Use huge page pool size (in hstate) to convert the size + * options to number of huge pages. If NO_SIZE, -1 is returned. + */ + pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + max_size_opt, max_val_type); + pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + min_size_opt, min_val_type); + + /* + * If max_size was specified, then min_size must be smaller + */ + if (max_val_type > NO_SIZE && + pconfig->min_hpages > pconfig->max_hpages) { + pr_err("minimum size can not be greater than maximum size\n"); + return -EINVAL; + } + + return 0; + +bad_val: + pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); + return -EINVAL; +} + +static int +hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) +{ + int ret; + struct hugetlbfs_config config; + struct hugetlbfs_sb_info *sbinfo; + + save_mount_options(sb, data); + + config.max_hpages = -1; /* No limit on size by default */ + config.nr_inodes = -1; /* No limit on number of inodes by default */ + config.uid = current_fsuid(); + config.gid = current_fsgid(); + config.mode = 0755; + config.hstate = &default_hstate; + config.min_hpages = -1; /* No default minimum size */ + ret = hugetlbfs_parse_options(data, &config); + if (ret) + return ret; + + sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + sb->s_fs_info = sbinfo; + sbinfo->hstate = config.hstate; + spin_lock_init(&sbinfo->stat_lock); + sbinfo->max_inodes = config.nr_inodes; + sbinfo->free_inodes = config.nr_inodes; + sbinfo->spool = NULL; + /* + * Allocate and initialize subpool if maximum or minimum size is + * specified. Any needed reservations (for minimim size) are taken + * taken when the subpool is created. + */ + if (config.max_hpages != -1 || config.min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(config.hstate, + config.max_hpages, + config.min_hpages); + if (!sbinfo->spool) + goto out_free; + } + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = huge_page_size(config.hstate); + sb->s_blocksize_bits = huge_page_shift(config.hstate); + sb->s_magic = HUGETLBFS_MAGIC; + sb->s_op = &hugetlbfs_ops; + sb->s_time_gran = 1; + sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); + if (!sb->s_root) + goto out_free; + return 0; +out_free: + kfree(sbinfo->spool); + kfree(sbinfo); + return -ENOMEM; +} + +static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); +} + +static struct file_system_type hugetlbfs_fs_type = { + .name = "hugetlbfs", + .mount = hugetlbfs_mount, + .kill_sb = kill_litter_super, +}; +MODULE_ALIAS_FS("hugetlbfs"); + +static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; + +static int can_do_hugetlb_shm(void) +{ + kgid_t shm_group; + shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); + return capable(CAP_IPC_LOCK) || in_group_p(shm_group); +} + +static int get_hstate_idx(int page_size_log) +{ + struct hstate *h = hstate_sizelog(page_size_log); + + if (!h) + return -1; + return h - hstates; +} + +static const struct dentry_operations anon_ops = { + .d_dname = simple_dname +}; + +/* + * Note that size should be aligned to proper hugepage size in caller side, + * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. + */ +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, struct user_struct **user, + int creat_flags, int page_size_log) +{ + struct file *file = ERR_PTR(-ENOMEM); + struct inode *inode; + struct path path; + struct super_block *sb; + struct qstr quick_string; + int hstate_idx; + + hstate_idx = get_hstate_idx(page_size_log); + if (hstate_idx < 0) + return ERR_PTR(-ENODEV); + + *user = NULL; + if (!hugetlbfs_vfsmount[hstate_idx]) + return ERR_PTR(-ENOENT); + + if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { + *user = current_user(); + if (user_shm_lock(size, *user)) { + task_lock(current); + pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", + current->comm, current->pid); + task_unlock(current); + } else { + *user = NULL; + return ERR_PTR(-EPERM); + } + } + + sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; + quick_string.name = name; + quick_string.len = strlen(quick_string.name); + quick_string.hash = 0; + path.dentry = d_alloc_pseudo(sb, &quick_string); + if (!path.dentry) + goto out_shm_unlock; + + d_set_d_op(path.dentry, &anon_ops); + path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); + file = ERR_PTR(-ENOSPC); + inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); + if (!inode) + goto out_dentry; + + file = ERR_PTR(-ENOMEM); + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) + goto out_inode; + + d_instantiate(path.dentry, inode); + inode->i_size = size; + clear_nlink(inode); + + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, + &hugetlbfs_file_operations); + if (IS_ERR(file)) + goto out_dentry; /* inode is already attached */ + + return file; + +out_inode: + iput(inode); +out_dentry: + path_put(&path); +out_shm_unlock: + if (*user) { + user_shm_unlock(size, *user); + *user = NULL; + } + return file; +} + +static int __init init_hugetlbfs_fs(void) +{ + struct hstate *h; + int error; + int i; + + if (!hugepages_supported()) { + pr_info("disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + + error = -ENOMEM; + hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", + sizeof(struct hugetlbfs_inode_info), + 0, 0, init_once); + if (hugetlbfs_inode_cachep == NULL) + goto out2; + + error = register_filesystem(&hugetlbfs_fs_type); + if (error) + goto out; + + i = 0; + for_each_hstate(h) { + char buf[50]; + unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); + + snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); + hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, + buf); + + if (IS_ERR(hugetlbfs_vfsmount[i])) { + pr_err("Cannot mount internal hugetlbfs for " + "page size %uK", ps_kb); + error = PTR_ERR(hugetlbfs_vfsmount[i]); + hugetlbfs_vfsmount[i] = NULL; + } + i++; + } + /* Non default hstates are optional */ + if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) + return 0; + + out: + kmem_cache_destroy(hugetlbfs_inode_cachep); + out2: + return error; +} + +static void __exit exit_hugetlbfs_fs(void) +{ + struct hstate *h; + int i; + + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(hugetlbfs_inode_cachep); + i = 0; + for_each_hstate(h) + kern_unmount(hugetlbfs_vfsmount[i++]); + unregister_filesystem(&hugetlbfs_fs_type); +} + +module_init(init_hugetlbfs_fs) +module_exit(exit_hugetlbfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/inode.c b/kernel/fs/inode.c new file mode 100644 index 000000000..6e342cade --- /dev/null +++ b/kernel/fs/inode.c @@ -0,0 +1,1977 @@ +/* + * (C) 1997 Linus Torvalds + * (C) 1999 Andrea Arcangeli (dynamic inode allocation) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for inode_has_buffers */ +#include +#include +#include +#include "internal.h" + +/* + * Inode locking rules: + * + * inode->i_lock protects: + * inode->i_state, inode->i_hash, __iget() + * Inode LRU list locks protect: + * inode->i_sb->s_inode_lru, inode->i_lru + * inode_sb_list_lock protects: + * sb->s_inodes, inode->i_sb_list + * bdi->wb.list_lock protects: + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list + * inode_hash_lock protects: + * inode_hashtable, inode->i_hash + * + * Lock ordering: + * + * inode_sb_list_lock + * inode->i_lock + * Inode LRU list locks + * + * bdi->wb.list_lock + * inode->i_lock + * + * inode_hash_lock + * inode_sb_list_lock + * inode->i_lock + * + * iunique_lock + * inode_hash_lock + */ + +static unsigned int i_hash_mask __read_mostly; +static unsigned int i_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); + +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); + +/* + * Empty aops. Can be used for the cases where the user does not + * define any of the address_space operations. + */ +const struct address_space_operations empty_aops = { +}; +EXPORT_SYMBOL(empty_aops); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); + +static struct kmem_cache *inode_cachep __read_mostly; + +static long get_nr_inodes(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_inodes, i); + return sum < 0 ? 0 : sum; +} + +static inline long get_nr_inodes_unused(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_unused, i); + return sum < 0 ? 0 : sum; +} + +long get_nr_dirty_inodes(void) +{ + /* not actually dirty inodes, but a wild approximation */ + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + return nr_dirty > 0 ? nr_dirty : 0; +} + +/* + * Handle nr_inode sysctl + */ +#ifdef CONFIG_SYSCTL +int proc_nr_inodes(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + inodes_stat.nr_inodes = get_nr_inodes(); + inodes_stat.nr_unused = get_nr_inodes_unused(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +static int no_open(struct inode *inode, struct file *file) +{ + return -ENXIO; +} + +/** + * inode_init_always - perform inode structure intialisation + * @sb: superblock inode belongs to + * @inode: inode to initialise + * + * These are initializations that need to be done on every inode + * allocation as the fields are not initialised by slab allocation. + */ +int inode_init_always(struct super_block *sb, struct inode *inode) +{ + static const struct inode_operations empty_iops; + static const struct file_operations no_open_fops = {.open = no_open}; + struct address_space *const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_op = &empty_iops; + inode->i_fop = &no_open_fops; + inode->__i_nlink = 1; + inode->i_opflags = 0; + i_uid_write(inode, 0); + i_gid_write(inode, 0); + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; + inode->dirtied_when = 0; + + if (security_inode_alloc(inode)) + goto out; + spin_lock_init(&inode->i_lock); + lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); + + mutex_init(&inode->i_mutex); + lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + + atomic_set(&inode->i_dio_count, 0); + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->flags = 0; + atomic_set(&mapping->i_mmap_writable, 0); + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); + mapping->private_data = NULL; + mapping->writeback_index = 0; + inode->i_private = NULL; + inode->i_mapping = mapping; + INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; +#endif + +#ifdef CONFIG_FSNOTIFY + inode->i_fsnotify_mask = 0; +#endif + inode->i_flctx = NULL; + this_cpu_inc(nr_inodes); + + return 0; +out: + return -ENOMEM; +} +EXPORT_SYMBOL(inode_init_always); + +static struct inode *alloc_inode(struct super_block *sb) +{ + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, inode); + return NULL; + } + + return inode; +} + +void free_inode_nonrcu(struct inode *inode) +{ + kmem_cache_free(inode_cachep, inode); +} +EXPORT_SYMBOL(free_inode_nonrcu); + +void __destroy_inode(struct inode *inode) +{ + BUG_ON(inode_has_buffers(inode)); + security_inode_free(inode); + fsnotify_inode_delete(inode); + locks_free_lock_context(inode->i_flctx); + if (!inode->i_nlink) { + WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); + atomic_long_dec(&inode->i_sb->s_remove_count); + } + +#ifdef CONFIG_FS_POSIX_ACL + if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) + posix_acl_release(inode->i_acl); + if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) + posix_acl_release(inode->i_default_acl); +#endif + this_cpu_dec(nr_inodes); +} +EXPORT_SYMBOL(__destroy_inode); + +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(inode_cachep, inode); +} + +static void destroy_inode(struct inode *inode) +{ + BUG_ON(!list_empty(&inode->i_lru)); + __destroy_inode(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + call_rcu(&inode->i_rcu, i_callback); +} + +/** + * drop_nlink - directly drop an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. In cases + * where we are attempting to track writes to the + * filesystem, a decrement to zero means an imminent + * write when the file is truncated and actually unlinked + * on the filesystem. + */ +void drop_nlink(struct inode *inode) +{ + WARN_ON(inode->i_nlink == 0); + inode->__i_nlink--; + if (!inode->i_nlink) + atomic_long_inc(&inode->i_sb->s_remove_count); +} +EXPORT_SYMBOL(drop_nlink); + +/** + * clear_nlink - directly zero an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. See + * drop_nlink() for why we care about i_nlink hitting zero. + */ +void clear_nlink(struct inode *inode) +{ + if (inode->i_nlink) { + inode->__i_nlink = 0; + atomic_long_inc(&inode->i_sb->s_remove_count); + } +} +EXPORT_SYMBOL(clear_nlink); + +/** + * set_nlink - directly set an inode's link count + * @inode: inode + * @nlink: new nlink (should be non-zero) + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. + */ +void set_nlink(struct inode *inode, unsigned int nlink) +{ + if (!nlink) { + clear_nlink(inode); + } else { + /* Yes, some filesystems do change nlink from zero to one */ + if (inode->i_nlink == 0) + atomic_long_dec(&inode->i_sb->s_remove_count); + + inode->__i_nlink = nlink; + } +} +EXPORT_SYMBOL(set_nlink); + +/** + * inc_nlink - directly increment an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. Currently, + * it is only here for parity with dec_nlink(). + */ +void inc_nlink(struct inode *inode) +{ + if (unlikely(inode->i_nlink == 0)) { + WARN_ON(!(inode->i_state & I_LINKABLE)); + atomic_long_dec(&inode->i_sb->s_remove_count); + } + + inode->__i_nlink++; +} +EXPORT_SYMBOL(inc_nlink); + +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + init_rwsem(&mapping->i_mmap_rwsem); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + mapping->i_mmap = RB_ROOT; +} +EXPORT_SYMBOL(address_space_init_once); + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. + */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_lru); + address_space_init_once(&inode->i_data); + i_size_ordered_init(inode); +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&inode->i_fsnotify_marks); +#endif +} +EXPORT_SYMBOL(inode_init_once); + +static void init_once(void *foo) +{ + struct inode *inode = (struct inode *) foo; + + inode_init_once(inode); +} + +/* + * inode->i_lock must be held + */ +void __iget(struct inode *inode) +{ + atomic_inc(&inode->i_count); +} + +/* + * get additional reference to inode; caller must already hold one. + */ +void ihold(struct inode *inode) +{ + WARN_ON(atomic_inc_return(&inode->i_count) < 2); +} +EXPORT_SYMBOL(ihold); + +static void inode_lru_list_add(struct inode *inode) +{ + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) + this_cpu_inc(nr_unused); +} + +/* + * Add inode to LRU if needed (inode is unused and clean). + * + * Needs inode->i_lock held. + */ +void inode_add_lru(struct inode *inode) +{ + if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | + I_FREEING | I_WILL_FREE)) && + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + inode_lru_list_add(inode); +} + + +static void inode_lru_list_del(struct inode *inode) +{ + + if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) + this_cpu_dec(nr_unused); +} + +/** + * inode_sb_list_add - add inode to the superblock list of inodes + * @inode: inode to add + */ +void inode_sb_list_add(struct inode *inode) +{ + spin_lock(&inode_sb_list_lock); + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode_sb_list_lock); +} +EXPORT_SYMBOL_GPL(inode_sb_list_add); + +static inline void inode_sb_list_del(struct inode *inode) +{ + if (!list_empty(&inode->i_sb_list)) { + spin_lock(&inode_sb_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); + } +} + +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); + return tmp & i_hash_mask; +} + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. + */ +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); + + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); + hlist_add_head(&inode->i_hash, b); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); +} +EXPORT_SYMBOL(__insert_inode_hash); + +/** + * __remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +void __remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); +} +EXPORT_SYMBOL(__remove_inode_hash); + +void clear_inode(struct inode *inode) +{ + might_sleep(); + /* + * We have to cycle tree_lock here because reclaim can be still in the + * process of removing the last page (in __delete_from_page_cache()) + * and we must not free mapping under it. + */ + spin_lock_irq(&inode->i_data.tree_lock); + BUG_ON(inode->i_data.nrpages); + BUG_ON(inode->i_data.nrshadows); + spin_unlock_irq(&inode->i_data.tree_lock); + BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(inode->i_state & I_CLEAR); + /* don't need i_lock here, no concurrent mods to i_state */ + inode->i_state = I_FREEING | I_CLEAR; +} +EXPORT_SYMBOL(clear_inode); + +/* + * Free the inode passed in, removing it from the lists it is still connected + * to. We remove any pages still attached to the inode and wait for any IO that + * is still in progress before finally destroying the inode. + * + * An inode must already be marked I_FREEING so that we avoid the inode being + * moved back onto lists if we race with other code that manipulates the lists + * (e.g. writeback_single_inode). The caller is responsible for setting this. + * + * An inode must already be removed from the LRU list before being evicted from + * the cache. This should occur atomically with setting the I_FREEING state + * flag, so no inodes here should ever be on the LRU when being evicted. + */ +static void evict(struct inode *inode) +{ + const struct super_operations *op = inode->i_sb->s_op; + + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!list_empty(&inode->i_lru)); + + if (!list_empty(&inode->i_wb_list)) + inode_wb_list_del(inode); + + inode_sb_list_del(inode); + + /* + * Wait for flusher thread to be done with the inode so that filesystem + * does not start destroying it while writeback is still running. Since + * the inode has I_FREEING set, flusher thread won't start new work on + * the inode. We just have to wait for running writeback to finish. + */ + inode_wait_for_writeback(inode); + + if (op->evict_inode) { + op->evict_inode(inode); + } else { + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + } + if (S_ISBLK(inode->i_mode) && inode->i_bdev) + bd_forget(inode); + if (S_ISCHR(inode->i_mode) && inode->i_cdev) + cd_forget(inode); + + remove_inode_hash(inode); + + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); + + destroy_inode(inode); +} + +/* + * dispose_list - dispose of the contents of a local list + * @head: the head of the list to free + * + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct inode *inode; + + inode = list_first_entry(head, struct inode, i_lru); + list_del_init(&inode->i_lru); + + evict(inode); + } +} + +/** + * evict_inodes - evict all evictable inodes for a superblock + * @sb: superblock to operate on + * + * Make sure that no inodes with zero refcount are retained. This is + * called by superblock shutdown after having MS_ACTIVE flag removed, + * so any inode reaching zero refcount during or after that call will + * be immediately evicted. + */ +void evict_inodes(struct super_block *sb) +{ + struct inode *inode, *next; + LIST_HEAD(dispose); + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + if (atomic_read(&inode->i_count)) + continue; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + continue; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); + } + spin_unlock(&inode_sb_list_lock); + + dispose_list(&dispose); +} + +/** + * invalidate_inodes - attempt to free all inodes on a superblock + * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes + * + * Attempts to free all inodes for a given superblock. If there were any + * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy. + */ +int invalidate_inodes(struct super_block *sb, bool kill_dirty) +{ + int busy = 0; + struct inode *inode, *next; + LIST_HEAD(dispose); + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { + spin_unlock(&inode->i_lock); + busy = 1; + continue; + } + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + busy = 1; + continue; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); + } + spin_unlock(&inode_sb_list_lock); + + dispose_list(&dispose); + + return busy; +} + +/* + * Isolate the inode from the LRU in preparation for freeing it. + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. If the inode has metadata buffers attached to + * mapping->private_list then try to remove them. + * + * If the inode has the I_REFERENCED flag set, then it means that it has been + * used recently - the flag is set in iput_final(). When we encounter such an + * inode, clear the flag and move it to the back of the LRU so it gets another + * pass through the LRU before it gets reclaimed. This is necessary because of + * the fact we are doing lazy LRU updates to minimise lock contention so the + * LRU does not have strict ordering. Hence we don't want to reclaim inodes + * with this flag set because they are the inodes that are out of order. + */ +static enum lru_status inode_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct inode *inode = container_of(item, struct inode, i_lru); + + /* + * we are inverting the lru lock/inode->i_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; + + /* + * Referenced or dirty inodes are still in use. Give them another pass + * through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_lru_isolate(lru, &inode->i_lru); + spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } + + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + return LRU_ROTATE; + } + + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lru_lock); + if (remove_inode_buffers(inode)) { + unsigned long reap; + reap = invalidate_mapping_pages(&inode->i_data, 0, -1); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; + } + iput(inode); + spin_lock(lru_lock); + return LRU_RETRY; + } + + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + list_lru_isolate_move(lru, &inode->i_lru, freeable); + spin_unlock(&inode->i_lock); + + this_cpu_dec(nr_unused); + return LRU_REMOVED; +} + +/* + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). + */ +long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) +{ + LIST_HEAD(freeable); + long freed; + + freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, + inode_lru_isolate, &freeable); + dispose_list(&freeable); + return freed; +} + +static void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + */ +static struct inode *find_inode(struct super_block *sb, + struct hlist_head *head, + int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode = NULL; + +repeat: + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + __iget(inode); + spin_unlock(&inode->i_lock); + return inode; + } + return NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode *find_inode_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode = NULL; + +repeat: + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + __iget(inode); + spin_unlock(&inode->i_lock); + return inode; + } + return NULL; +} + +/* + * Each cpu owns a range of LAST_INO_BATCH numbers. + * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, + * to renew the exhausted range. + * + * This does not significantly increase overflow rate because every CPU can + * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is + * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the + * 2^32 range, and is a worst-case. Even a 50% wastage would only increase + * overflow rate by 2x, which does not seem too significant. + * + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ +#define LAST_INO_BATCH 1024 +static DEFINE_PER_CPU(unsigned int, last_ino); + +unsigned int get_next_ino(void) +{ + unsigned int *p = &get_cpu_var(last_ino); + unsigned int res = *p; + +#ifdef CONFIG_SMP + if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { + static atomic_t shared_last_ino; + int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); + + res = next - LAST_INO_BATCH; + } +#endif + + *p = ++res; + put_cpu_var(last_ino); + return res; +} +EXPORT_SYMBOL(get_next_ino); + +/** + * new_inode_pseudo - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *new_inode_pseudo(struct super_block *sb) +{ + struct inode *inode = alloc_inode(sb); + + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + INIT_LIST_HEAD(&inode->i_sb_list); + } + return inode; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. The default gfp_mask + * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. + * If HIGHMEM pages are unsuitable or it is known that pages allocated + * for the page cache are not reclaimable or migratable, + * mapping_set_gfp_mask() must be called with suitable flags on the + * newly created inode's mapping + * + */ +struct inode *new_inode(struct super_block *sb) +{ + struct inode *inode; + + spin_lock_prefetch(&inode_sb_list_lock); + + inode = new_inode_pseudo(sb); + if (inode) + inode_sb_list_add(inode); + return inode; +} +EXPORT_SYMBOL(new_inode); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void lockdep_annotate_inode_mutex_key(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) { + struct file_system_type *type = inode->i_sb->s_type; + + /* Set new key only if filesystem hasn't already changed it */ + if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) { + /* + * ensure nobody is actually holding i_mutex + */ + mutex_destroy(&inode->i_mutex); + mutex_init(&inode->i_mutex); + lockdep_set_class(&inode->i_mutex, + &type->i_mutex_dir_key); + } + } +} +EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); +#endif + +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ +void unlock_new_inode(struct inode *inode) +{ + lockdep_annotate_inode_mutex_key(inode); + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + smp_mb(); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(unlock_new_inode); + +/** + * lock_two_nondirectories - take two i_mutexes on non-directory objects + * + * Lock any non-NULL argument that is not a directory. + * Zero, one or two objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + */ +void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 > inode2) + swap(inode1, inode2); + + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_lock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); +} +EXPORT_SYMBOL(lock_two_nondirectories); + +/** + * unlock_two_nondirectories - release locks from lock_two_nondirectories() + * @inode1: first inode to unlock + * @inode2: second inode to unlock + */ +void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_unlock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); +} +EXPORT_SYMBOL(unlock_two_nondirectories); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present it is return it with an increased reference count. This is + * a generalized version of iget_locked() for file systems where the inode + * number is not sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + if (inode) { + wait_on_inode(inode); + return inode; + } + + inode = alloc_inode(sb); + if (inode) { + struct inode *old; + + spin_lock(&inode_hash_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + return NULL; +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * Search for the inode specified by @ino in the inode cache and if present + * return it with an increased reference count. This is for file systems + * where the inode number is sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + if (inode) { + wait_on_inode(inode); + return inode; + } + + inode = alloc_inode(sb); + if (inode) { + struct inode *old; + + spin_lock(&inode_hash_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} +EXPORT_SYMBOL(iget_locked); + +/* + * search the inode cache for a matching inode number. + * If we find one, then the inode number we are trying to + * allocate is not unique and so we should not use it. + * + * Returns 1 if the inode number is unique, 0 if it is not. + */ +static int test_inode_iunique(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *b = inode_hashtable + hash(sb, ino); + struct inode *inode; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(inode, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) { + spin_unlock(&inode_hash_lock); + return 0; + } + } + spin_unlock(&inode_hash_lock); + + return 1; +} + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + /* + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ + static DEFINE_SPINLOCK(iunique_lock); + static unsigned int counter; + ino_t res; + + spin_lock(&iunique_lock); + do { + if (counter <= max_reserved) + counter = max_reserved + 1; + res = counter++; + } while (!test_inode_iunique(sb, res)); + spin_unlock(&iunique_lock); + + return res; +} +EXPORT_SYMBOL(iunique); + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { + __iget(inode); + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + } + return inode; +} +EXPORT_SYMBOL(igrab); + +/** + * ilookup5_nowait - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * Search for the inode specified by @hashval and @data in the inode cache. + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Note: I_NEW is not waited upon so you have to be very careful what you do + * with the returned inode. You probably should be using ilookup5() instead. + * + * Note2: @test is called with the inode_hash_lock held, so can't sleep. + */ +struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + return inode; +} +EXPORT_SYMBOL(ilookup5_nowait); + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if the inode is in the cache, return the inode with an incremented + * reference count. Waits on I_NEW before returning the inode. + * returned with an incremented reference count. + * + * This is a generalized version of ilookup() for file systems where the + * inode number is not sufficient for unique identification of an inode. + * + * Note: @test is called with the inode_hash_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct inode *inode = ilookup5_nowait(sb, hashval, test, data); + + if (inode) + wait_on_inode(inode); + return inode; +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * Search for the inode @ino in the inode cache, and if the inode is in the + * cache, the inode is returned with an incremented reference count. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + + if (inode) + wait_on_inode(inode); + return inode; +} +EXPORT_SYMBOL(ilookup); + +/** + * find_inode_nowait - find an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @match: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @match + * + * Search for the inode specified by @hashval and @data in the inode + * cache, where the helper function @match will return 0 if the inode + * does not match, 1 if the inode does match, and -1 if the search + * should be stopped. The @match function must be responsible for + * taking the i_lock spin_lock and checking i_state for an inode being + * freed or being initialized, and incrementing the reference count + * before returning 1. It also must not sleep, since it is called with + * the inode_hash_lock spinlock held. + * + * This is a even more generalized version of ilookup5() when the + * function must never block --- find_inode() can block in + * __wait_on_freeing_inode() --- or when the caller can not increment + * the reference count because the resulting iput() might cause an + * inode eviction. The tradeoff is that the @match funtion must be + * very carefully implemented. + */ +struct inode *find_inode_nowait(struct super_block *sb, + unsigned long hashval, + int (*match)(struct inode *, unsigned long, + void *), + void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode, *ret_inode = NULL; + int mval; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_sb != sb) + continue; + mval = match(inode, hashval, data); + if (mval == 0) + continue; + if (mval == 1) + ret_inode = inode; + goto out; + } +out: + spin_unlock(&inode_hash_lock); + return ret_inode; +} +EXPORT_SYMBOL(find_inode_nowait); + +int insert_inode_locked(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + while (1) { + struct inode *old = NULL; + spin_lock(&inode_hash_lock); + hlist_for_each_entry(old, head, i_hash) { + if (old->i_ino != ino) + continue; + if (old->i_sb != sb) + continue; + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); + continue; + } + break; + } + if (likely(!old)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + return 0; + } + __iget(old); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} +EXPORT_SYMBOL(insert_inode_locked); + +int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct super_block *sb = inode->i_sb; + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + while (1) { + struct inode *old = NULL; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(old, head, i_hash) { + if (old->i_sb != sb) + continue; + if (!test(old, data)) + continue; + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); + continue; + } + break; + } + if (likely(!old)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + return 0; + } + __iget(old); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} +EXPORT_SYMBOL(insert_inode_locked4); + + +int generic_delete_inode(struct inode *inode) +{ + return 1; +} +EXPORT_SYMBOL(generic_delete_inode); + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop_inode()" function, defaulting to + * the legacy UNIX filesystem behaviour. If it tells + * us to evict inode, do so. Otherwise, retain inode + * in cache if fs is alive, sync and evict if fs is + * shutting down. + */ +static void iput_final(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + const struct super_operations *op = inode->i_sb->s_op; + int drop; + + WARN_ON(inode->i_state & I_NEW); + + if (op->drop_inode) + drop = op->drop_inode(inode); + else + drop = generic_drop_inode(inode); + + if (!drop && (sb->s_flags & MS_ACTIVE)) { + inode->i_state |= I_REFERENCED; + inode_add_lru(inode); + spin_unlock(&inode->i_lock); + return; + } + + if (!drop) { + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; + } + + inode->i_state |= I_FREEING; + if (!list_empty(&inode->i_lru)) + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + + evict(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero, the inode is then freed and may also be destroyed. + * + * Consequently, iput() can sleep. + */ +void iput(struct inode *inode) +{ + if (!inode) + return; + BUG_ON(inode->i_state & I_CLEAR); +retry: + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { + atomic_inc(&inode->i_count); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + iput_final(inode); + } +} +EXPORT_SYMBOL(iput); + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. + */ +sector_t bmap(struct inode *inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} +EXPORT_SYMBOL(bmap); + +/* + * With relative atime, only update atime if the previous atime is + * earlier than either the ctime or mtime or if at least a day has + * passed since the last atime update. + */ +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec now) +{ + + if (!(mnt->mnt_flags & MNT_RELATIME)) + return 1; + /* + * Is mtime younger than atime? If yes, update atime: + */ + if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) + return 1; + /* + * Is ctime younger than atime? If yes, update atime: + */ + if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) + return 1; + + /* + * Is the previous atime value older than a day? If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + +int generic_update_time(struct inode *inode, struct timespec *time, int flags) +{ + int iflags = I_DIRTY_TIME; + + if (flags & S_ATIME) + inode->i_atime = *time; + if (flags & S_VERSION) + inode_inc_iversion(inode); + if (flags & S_CTIME) + inode->i_ctime = *time; + if (flags & S_MTIME) + inode->i_mtime = *time; + + if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); + return 0; +} +EXPORT_SYMBOL(generic_update_time); + +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + int (*update_time)(struct inode *, struct timespec *, int); + + update_time = inode->i_op->update_time ? inode->i_op->update_time : + generic_update_time; + + return update_time(inode, time, flags); +} + +/** + * touch_atime - update the access time + * @path: the &struct path to update + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ +void touch_atime(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + struct inode *inode = d_inode(path->dentry); + struct timespec now; + + if (inode->i_flags & S_NOATIME) + return; + if (IS_NOATIME(inode)) + return; + if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + return; + + if (mnt->mnt_flags & MNT_NOATIME) + return; + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return; + + now = current_fs_time(inode->i_sb); + + if (!relatime_need_update(mnt, inode, now)) + return; + + if (timespec_equal(&inode->i_atime, &now)) + return; + + if (!sb_start_write_trylock(inode->i_sb)) + return; + + if (__mnt_want_write(mnt)) + goto skip_update; + /* + * File systems can error out when updating inodes if they need to + * allocate new space to modify an inode (such is the case for + * Btrfs), but since we touch atime while walking down the path we + * really don't care if we failed to update the atime of the file, + * so just ignore the return value. + * We may also fail on filesystems that have the ability to make parts + * of the fs read only, e.g. subvolumes in Btrfs. + */ + update_time(inode, &now, S_ATIME); + __mnt_drop_write(mnt); +skip_update: + sb_end_write(inode->i_sb); +} +EXPORT_SYMBOL(touch_atime); + +/* + * The logic we want is + * + * if suid or (sgid and xgrp) + * remove privs + */ +int should_remove_suid(struct dentry *dentry) +{ + umode_t mode = d_inode(dentry)->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + +static int __remove_suid(struct dentry *dentry, int kill) +{ + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + /* + * Note we call this on write, so notify_change will not + * encounter any conflicting delegations: + */ + return notify_change(dentry, &newattrs, NULL); +} + +int file_remove_suid(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + int killsuid; + int killpriv; + int error = 0; + + /* Fast path for nothing security related */ + if (IS_NOSEC(inode)) + return 0; + + killsuid = should_remove_suid(dentry); + killpriv = security_inode_need_killpriv(dentry); + + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); + if (!error) + inode_has_no_xattr(inode); + + return error; +} +EXPORT_SYMBOL(file_remove_suid); + +/** + * file_update_time - update mtime and ctime time + * @file: file accessed + * + * Update the mtime and ctime members of an inode and mark the inode + * for writeback. Note that this function is meant exclusively for + * usage in the file write path of filesystems, and filesystems may + * choose to explicitly ignore update via this function with the + * S_NOCMTIME inode flag, e.g. for network filesystem where these + * timestamps are handled by the server. This can return an error for + * file systems who need to allocate space in order to update an inode. + */ + +int file_update_time(struct file *file) +{ + struct inode *inode = file_inode(file); + struct timespec now; + int sync_it = 0; + int ret; + + /* First try to exhaust all avenues to not sync */ + if (IS_NOCMTIME(inode)) + return 0; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + sync_it = S_MTIME; + + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it |= S_CTIME; + + if (IS_I_VERSION(inode)) + sync_it |= S_VERSION; + + if (!sync_it) + return 0; + + /* Finally allowed to write? Takes lock. */ + if (__mnt_want_write_file(file)) + return 0; + + ret = update_time(inode, &now, sync_it); + __mnt_drop_write_file(file); + + return ret; +} +EXPORT_SYMBOL(file_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * If we try to find an inode in the inode hash while it is being + * deleted, we have to wait until the filesystem completes its + * deletion before reporting that it isn't found. This function waits + * until the deletion _might_ have completed. Callers are responsible + * to recheck inode state. + * + * It doesn't matter if I_NEW is not set initially, a call to + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. + */ +static void __wait_on_freeing_inode(struct inode *inode) +{ + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + schedule(); + finish_wait(wq, &wait.wait); + spin_lock(&inode_hash_lock); +} + +static __initdata unsigned long ihash_entries; +static int __init set_ihash_entries(char *str) +{ + if (!str) + return 0; + ihash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("ihash_entries=", set_ihash_entries); + +/* + * Initialize the waitqueues and inode hash table. + */ +void __init inode_init_early(void) +{ + unsigned int loop; + + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + HASH_EARLY, + &i_hash_shift, + &i_hash_mask, + 0, + 0); + + for (loop = 0; loop < (1U << i_hash_shift); loop++) + INIT_HLIST_HEAD(&inode_hashtable[loop]); +} + +void __init inode_init(void) +{ + unsigned int loop; + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", + sizeof(struct inode), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + init_once); + + /* Hash may have been set up in inode_init_early */ + if (!hashdist) + return; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + 0, + &i_hash_shift, + &i_hash_mask, + 0, + 0); + + for (loop = 0; loop < (1U << i_hash_shift); loop++) + INIT_HLIST_HEAD(&inode_hashtable[loop]); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = rdev; + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = rdev; + } else if (S_ISFIFO(mode)) + inode->i_fop = &pipefifo_fops; + else if (S_ISSOCK(mode)) + ; /* leave it no_open_fops */ + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" + " inode %s:%lu\n", mode, inode->i_sb->s_id, + inode->i_ino); +} +EXPORT_SYMBOL(init_special_inode); + +/** + * inode_init_owner - Init uid,gid,mode for new inode according to posix standards + * @inode: New inode + * @dir: Directory inode + * @mode: mode of the new inode + */ +void inode_init_owner(struct inode *inode, const struct inode *dir, + umode_t mode) +{ + inode->i_uid = current_fsuid(); + if (dir && dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +} +EXPORT_SYMBOL(inode_init_owner); + +/** + * inode_owner_or_capable - check current task permissions to inode + * @inode: inode being checked + * + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. + */ +bool inode_owner_or_capable(const struct inode *inode) +{ + struct user_namespace *ns; + + if (uid_eq(current_fsuid(), inode->i_uid)) + return true; + + ns = current_user_ns(); + if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) + return true; + return false; +} +EXPORT_SYMBOL(inode_owner_or_capable); + +/* + * Direct i/o helper functions + */ +static void __inode_dio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + do { + prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&inode->i_dio_count)) + schedule(); + } while (atomic_read(&inode->i_dio_count)); + finish_wait(wq, &q.wait); +} + +/** + * inode_dio_wait - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. + * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +void inode_dio_wait(struct inode *inode) +{ + if (atomic_read(&inode->i_dio_count)) + __inode_dio_wait(inode); +} +EXPORT_SYMBOL(inode_dio_wait); + +/* + * inode_set_flags - atomically set some inode flags + * + * Note: the caller should be holding i_mutex, or else be sure that + * they have exclusive access to the inode structure (i.e., while the + * inode is being instantiated). The reason for the cmpxchg() loop + * --- which wouldn't be necessary if all code paths which modify + * i_flags actually followed this rule, is that there is at least one + * code path which doesn't today --- for example, + * __generic_file_aio_write() calls file_remove_suid() without holding + * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * + * In the long run, i_mutex is overkill, and we should probably look + * at using the i_lock spinlock to protect i_flags, and then make sure + * it is so documented in include/linux/fs.h and that all code follows + * the locking convention!! + */ +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask) +{ + unsigned int old_flags, new_flags; + + WARN_ON_ONCE(flags & ~mask); + do { + old_flags = ACCESS_ONCE(inode->i_flags); + new_flags = (old_flags & ~mask) | flags; + } while (unlikely(cmpxchg(&inode->i_flags, old_flags, + new_flags) != old_flags)); +} +EXPORT_SYMBOL(inode_set_flags); diff --git a/kernel/fs/internal.h b/kernel/fs/internal.h new file mode 100644 index 000000000..01dce1d14 --- /dev/null +++ b/kernel/fs/internal.h @@ -0,0 +1,153 @@ +/* fs/ internal definitions + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +struct super_block; +struct file_system_type; +struct linux_binprm; +struct path; +struct mount; +struct shrink_control; + +/* + * block_dev.c + */ +#ifdef CONFIG_BLOCK +extern void __init bdev_cache_init(void); + +extern int __sync_blockdev(struct block_device *bdev, int wait); + +#else +static inline void bdev_cache_init(void) +{ +} + +static inline int __sync_blockdev(struct block_device *bdev, int wait) +{ + return 0; +} +#endif + +/* + * buffer.c + */ +extern void guard_bio_eod(int rw, struct bio *bio); + +/* + * char_dev.c + */ +extern void __init chrdev_init(void); + +/* + * namei.c + */ +extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); +extern int vfs_path_lookup(struct dentry *, struct vfsmount *, + const char *, unsigned int, struct path *); + +/* + * namespace.c + */ +extern int copy_mount_options(const void __user *, unsigned long *); +extern char *copy_mount_string(const void __user *); + +extern struct vfsmount *lookup_mnt(struct path *); +extern int finish_automount(struct vfsmount *, struct path *); + +extern int sb_prepare_remount_readonly(struct super_block *); + +extern void __init mnt_init(void); + +extern int __mnt_want_write(struct vfsmount *); +extern int __mnt_want_write_file(struct file *); +extern void __mnt_drop_write(struct vfsmount *); +extern void __mnt_drop_write_file(struct file *); + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(const struct path *, const struct path *); + +/* + * file_table.c + */ +extern struct file *get_empty_filp(void); + +/* + * super.c + */ +extern int do_remount_sb(struct super_block *, int, void *, int); +extern bool trylock_super(struct super_block *sb); +extern struct dentry *mount_fs(struct file_system_type *, + int, const char *, void *); +extern struct super_block *user_get_super(dev_t); + +/* + * open.c + */ +struct open_flags { + int open_flag; + umode_t mode; + int acc_mode; + int intent; + int lookup_flags; +}; +extern struct file *do_filp_open(int dfd, struct filename *pathname, + const struct open_flags *op); +extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, + const char *, const struct open_flags *); + +extern long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag); +extern int open_check_o_direct(struct file *f); + +/* + * inode.c + */ +extern spinlock_t inode_sb_list_lock; +extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); +extern void inode_add_lru(struct inode *inode); + +/* + * fs-writeback.c + */ +extern void inode_wb_list_del(struct inode *inode); + +extern long get_nr_dirty_inodes(void); +extern void evict_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, bool); + +/* + * dcache.c + */ +extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); +extern int d_set_mounted(struct dentry *dentry); +extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); + +/* + * read_write.c + */ +extern int rw_verify_area(int, struct file *, const loff_t *, size_t); + +/* + * pipe.c + */ +extern const struct file_operations pipefifo_fops; + +/* + * fs_pin.c + */ +extern void group_pin_kill(struct hlist_head *p); +extern void mnt_pin_kill(struct mount *m); + +/* + * fs/nsfs.c + */ +extern struct dentry_operations ns_dentry_operations; diff --git a/kernel/fs/ioctl.c b/kernel/fs/ioctl.c new file mode 100644 index 000000000..5d01d2638 --- /dev/null +++ b/kernel/fs/ioctl.c @@ -0,0 +1,625 @@ +/* + * linux/fs/ioctl.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +/** + * vfs_ioctl - call filesystem specific ioctl methods + * @filp: open file to invoke ioctl method on + * @cmd: ioctl command to execute + * @arg: command-specific argument for ioctl + * + * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise + * returns -ENOTTY. + * + * Returns 0 on success, -errno on error. + */ +static long vfs_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int error = -ENOTTY; + + if (!filp->f_op->unlocked_ioctl) + goto out; + + error = filp->f_op->unlocked_ioctl(filp, cmd, arg); + if (error == -ENOIOCTLCMD) + error = -ENOTTY; + out: + return error; +} + +static int ioctl_fibmap(struct file *filp, int __user *p) +{ + struct address_space *mapping = filp->f_mapping; + int res, block; + + /* do we support this mess? */ + if (!mapping->a_ops->bmap) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + res = get_user(block, p); + if (res) + return res; + res = mapping->a_ops->bmap(mapping, block); + return put_user(res, p); +} + +/** + * fiemap_fill_next_extent - Fiemap helper function + * @fieinfo: Fiemap context passed into ->fiemap + * @logical: Extent logical start offset, in bytes + * @phys: Extent physical start offset, in bytes + * @len: Extent length, in bytes + * @flags: FIEMAP_EXTENT flags that describe this extent + * + * Called from file system ->fiemap callback. Will populate extent + * info as passed in via arguments and copy to user memory. On + * success, extent count on fieinfo is incremented. + * + * Returns 0 on success, -errno on error, 1 if this was the last + * extent that will fit in user array. + */ +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) +int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, + u64 phys, u64 len, u32 flags) +{ + struct fiemap_extent extent; + struct fiemap_extent __user *dest = fieinfo->fi_extents_start; + + /* only count the extents */ + if (fieinfo->fi_extents_max == 0) { + fieinfo->fi_extents_mapped++; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; + } + + if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) + return 1; + + if (flags & SET_UNKNOWN_FLAGS) + flags |= FIEMAP_EXTENT_UNKNOWN; + if (flags & SET_NO_UNMOUNTED_IO_FLAGS) + flags |= FIEMAP_EXTENT_ENCODED; + if (flags & SET_NOT_ALIGNED_FLAGS) + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + + memset(&extent, 0, sizeof(extent)); + extent.fe_logical = logical; + extent.fe_physical = phys; + extent.fe_length = len; + extent.fe_flags = flags; + + dest += fieinfo->fi_extents_mapped; + if (copy_to_user(dest, &extent, sizeof(extent))) + return -EFAULT; + + fieinfo->fi_extents_mapped++; + if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) + return 1; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; +} +EXPORT_SYMBOL(fiemap_fill_next_extent); + +/** + * fiemap_check_flags - check validity of requested flags for fiemap + * @fieinfo: Fiemap context passed into ->fiemap + * @fs_flags: Set of fiemap flags that the file system understands + * + * Called from file system ->fiemap callback. This will compute the + * intersection of valid fiemap flags and those that the fs supports. That + * value is then compared against the user supplied flags. In case of bad user + * flags, the invalid values will be written into the fieinfo structure, and + * -EBADR is returned, which tells ioctl_fiemap() to return those values to + * userspace. For this reason, a return code of -EBADR should be preserved. + * + * Returns 0 on success, -EBADR on bad flags. + */ +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +{ + u32 incompat_flags; + + incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } + return 0; +} +EXPORT_SYMBOL(fiemap_check_flags); + +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + u64 maxbytes = (u64) sb->s_maxbytes; + + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; + + return 0; +} + +static int ioctl_fiemap(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (!inode->i_op->fiemap) + return -EOPNOTSUPP; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + if (fiemap.fm_extent_count != 0 && + !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + +#ifdef CONFIG_BLOCK + +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} + +/** + * __generic_block_fiemap - FIEMAP for block based inodes (no locking) + * @inode: the inode to map + * @fieinfo: the fiemap info struct that will be passed back to userspace + * @start: where to start mapping in the inode + * @len: how much space to map + * @get_block: the fs's get_block function + * + * This does FIEMAP for block based inodes. Basically it will just loop + * through get_block until we hit the number of extents we want to map, or we + * go past the end of the file and hit a hole. + * + * If it is possible to have data blocks beyond a hole past @inode->i_size, then + * please do not use this function, it will stop at the first unmapped block + * beyond i_size. + * + * If you use this function directly, you need to do your own locking. Use + * generic_block_fiemap if you want the locking done for you. + */ + +int __generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, loff_t start, + loff_t len, get_block_t *get_block) +{ + struct buffer_head map_bh; + sector_t start_blk, last_blk; + loff_t isize = i_size_read(inode); + u64 logical = 0, phys = 0, size = 0; + u32 flags = FIEMAP_EXTENT_MERGED; + bool past_eof = false, whole_file = false; + int ret = 0; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + /* + * Either the i_mutex or other appropriate locking needs to be held + * since we expect isize to not change at all through the duration of + * this call. + */ + if (len >= isize) { + whole_file = true; + len = isize; + } + + /* + * Some filesystems can't deal with being asked to map less than + * blocksize, so make sure our len is at least block length. + */ + if (logical_to_blk(inode, len) == 0) + len = blk_to_logical(inode, 1); + + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); + + do { + /* + * we set b_size to the total size we want so it will map as + * many contiguous blocks as possible at once + */ + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; + + ret = get_block(inode, start_blk, &map_bh, 0); + if (ret) + break; + + /* HOLE */ + if (!buffer_mapped(&map_bh)) { + start_blk++; + + /* + * We want to handle the case where there is an + * allocated block at the front of the file, and then + * nothing but holes up to the end of the file properly, + * to make sure that extent at the front gets properly + * marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && + blk_to_logical(inode, start_blk) >= isize) + past_eof = 1; + + /* + * First hole after going past the EOF, this is our + * last extent + */ + if (past_eof && size) { + flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + } else if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + size = 0; + } + + /* if we have holes up to/past EOF then we're done */ + if (start_blk > last_blk || past_eof || ret) + break; + } else { + /* + * We have gone over the length of what we wanted to + * map, and it wasn't the entire file, so add the extent + * we got last time and exit. + * + * This is for the case where say we want to map all the + * way up to the second to the last block in a file, but + * the last block is a hole, making the second to last + * block FIEMAP_EXTENT_LAST. In this case we want to + * see if there is a hole after the second to last block + * so we can mark it properly. If we found data after + * we exceeded the length we were requesting, then we + * are good to go, just add the extent to the fieinfo + * and break + */ + if (start_blk > last_blk && !whole_file) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + break; + } + + /* + * if size != 0 then we know we already have an extent + * to add, so add it. + */ + if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + if (ret) + break; + } + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = FIEMAP_EXTENT_MERGED; + + start_blk += logical_to_blk(inode, size); + + /* + * If we are past the EOF, then we need to make sure as + * soon as we find a hole that the last extent we found + * is marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && logical + size >= isize) + past_eof = true; + } + cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + } while (1); + + /* If ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + return ret; +} +EXPORT_SYMBOL(__generic_block_fiemap); + +/** + * generic_block_fiemap - FIEMAP for block based inodes + * @inode: The inode to map + * @fieinfo: The mapping information + * @start: The initial block to map + * @len: The length of the extect to attempt to map + * @get_block: The block mapping function for the fs + * + * Calls __generic_block_fiemap to map the inode, after taking + * the inode's mutex lock. + */ + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) +{ + int ret; + mutex_lock(&inode->i_mutex); + ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); + mutex_unlock(&inode->i_mutex); + return ret; +} +EXPORT_SYMBOL(generic_block_fiemap); + +#endif /* CONFIG_BLOCK */ + +/* + * This provides compatibility with legacy XFS pre-allocation ioctls + * which predate the fallocate syscall. + * + * Only the l_start, l_len and l_whence fields of the 'struct space_resv' + * are used here, rest are ignored. + */ +int ioctl_preallocate(struct file *filp, void __user *argp) +{ + struct inode *inode = file_inode(filp); + struct space_resv sr; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + + switch (sr.l_whence) { + case SEEK_SET: + break; + case SEEK_CUR: + sr.l_start += filp->f_pos; + break; + case SEEK_END: + sr.l_start += i_size_read(inode); + break; + default: + return -EINVAL; + } + + return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); +} + +static int file_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(filp); + int __user *p = (int __user *)arg; + + switch (cmd) { + case FIBMAP: + return ioctl_fibmap(filp, p); + case FIONREAD: + return put_user(i_size_read(inode) - filp->f_pos, p); + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + return ioctl_preallocate(filp, p); + } + + return vfs_ioctl(filp, cmd, arg); +} + +static int ioctl_fionbio(struct file *filp, int __user *argp) +{ + unsigned int flag; + int on, error; + + error = get_user(on, argp); + if (error) + return error; + flag = O_NONBLOCK; +#ifdef __sparc__ + /* SunOS compatibility item. */ + if (O_NONBLOCK != O_NDELAY) + flag |= O_NDELAY; +#endif + spin_lock(&filp->f_lock); + if (on) + filp->f_flags |= flag; + else + filp->f_flags &= ~flag; + spin_unlock(&filp->f_lock); + return error; +} + +static int ioctl_fioasync(unsigned int fd, struct file *filp, + int __user *argp) +{ + unsigned int flag; + int on, error; + + error = get_user(on, argp); + if (error) + return error; + flag = on ? FASYNC : 0; + + /* Did FASYNC state change ? */ + if ((flag ^ filp->f_flags) & FASYNC) { + if (filp->f_op->fasync) + /* fasync() adjusts filp->f_flags */ + error = filp->f_op->fasync(fd, filp, on); + else + error = -ENOTTY; + } + return error < 0 ? error : 0; +} + +static int ioctl_fsfreeze(struct file *filp) +{ + struct super_block *sb = file_inode(filp)->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If filesystem doesn't support freeze feature, return. */ + if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL) + return -EOPNOTSUPP; + + /* Freeze */ + if (sb->s_op->freeze_super) + return sb->s_op->freeze_super(sb); + return freeze_super(sb); +} + +static int ioctl_fsthaw(struct file *filp) +{ + struct super_block *sb = file_inode(filp)->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* Thaw */ + if (sb->s_op->thaw_super) + return sb->s_op->thaw_super(sb); + return thaw_super(sb); +} + +/* + * When you add any new common ioctls to the switches above and below + * please update compat_sys_ioctl() too. + * + * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. + * It's just a simple helper for sys_ioctl and compat_sys_ioctl. + */ +int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, + unsigned long arg) +{ + int error = 0; + int __user *argp = (int __user *)arg; + struct inode *inode = file_inode(filp); + + switch (cmd) { + case FIOCLEX: + set_close_on_exec(fd, 1); + break; + + case FIONCLEX: + set_close_on_exec(fd, 0); + break; + + case FIONBIO: + error = ioctl_fionbio(filp, argp); + break; + + case FIOASYNC: + error = ioctl_fioasync(fd, filp, argp); + break; + + case FIOQSIZE: + if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + loff_t res = inode_get_bytes(inode); + error = copy_to_user(argp, &res, sizeof(res)) ? + -EFAULT : 0; + } else + error = -ENOTTY; + break; + + case FIFREEZE: + error = ioctl_fsfreeze(filp); + break; + + case FITHAW: + error = ioctl_fsthaw(filp); + break; + + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); + + case FIGETBSZ: + return put_user(inode->i_sb->s_blocksize, argp); + + default: + if (S_ISREG(inode->i_mode)) + error = file_ioctl(filp, cmd, arg); + else + error = vfs_ioctl(filp, cmd, arg); + break; + } + return error; +} + +SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +{ + int error; + struct fd f = fdget(fd); + + if (!f.file) + return -EBADF; + error = security_file_ioctl(f.file, cmd, arg); + if (!error) + error = do_vfs_ioctl(f.file, fd, cmd, arg); + fdput(f); + return error; +} diff --git a/kernel/fs/isofs/Kconfig b/kernel/fs/isofs/Kconfig new file mode 100644 index 000000000..8ab9878e3 --- /dev/null +++ b/kernel/fs/isofs/Kconfig @@ -0,0 +1,39 @@ +config ISO9660_FS + tristate "ISO 9660 CDROM file system support" + help + This is the standard file system used on CD-ROMs. It was previously + known as "High Sierra File System" and is called "hsfs" on other + Unix systems. The so-called Rock-Ridge extensions which allow for + long Unix filenames and symbolic links are also supported by this + driver. If you have a CD-ROM drive and want to do more with it than + just listen to audio CDs and watch its LEDs, say Y (and read + and the CD-ROM-HOWTO, + available from ), thereby + enlarging your kernel by about 27 KB; otherwise say N. + + To compile this file system support as a module, choose M here: the + module will be called isofs. + +config JOLIET + bool "Microsoft Joliet CDROM extensions" + depends on ISO9660_FS + select NLS + help + Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system + which allows for long filenames in unicode format (unicode is the + new 16 bit character code, successor to ASCII, which encodes the + characters of almost all languages of the world; see + for more information). Say Y here if you + want to be able to read Joliet CD-ROMs under Linux. + +config ZISOFS + bool "Transparent decompression extension" + depends on ISO9660_FS + select ZLIB_INFLATE + help + This is a Linux-specific extension to RockRidge which lets you store + data in compressed form on a CD-ROM and have it transparently + decompressed when the CD-ROM is accessed. See + for the tools + necessary to create such a filesystem. Say Y here if you want to be + able to read such compressed CD-ROMs. diff --git a/kernel/fs/isofs/Makefile b/kernel/fs/isofs/Makefile new file mode 100644 index 000000000..bf162f094 --- /dev/null +++ b/kernel/fs/isofs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux isofs filesystem routines. +# + +obj-$(CONFIG_ISO9660_FS) += isofs.o + +isofs-objs-y := namei.o inode.o dir.o util.o rock.o export.o +isofs-objs-$(CONFIG_JOLIET) += joliet.o +isofs-objs-$(CONFIG_ZISOFS) += compress.o +isofs-objs := $(isofs-objs-y) diff --git a/kernel/fs/isofs/compress.c b/kernel/fs/isofs/compress.c new file mode 100644 index 000000000..f311bf084 --- /dev/null +++ b/kernel/fs/isofs/compress.c @@ -0,0 +1,380 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2001 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * linux/fs/isofs/compress.c + * + * Transparent decompression of files on an iso9660 filesystem + */ + +#include +#include + +#include +#include + +#include "isofs.h" +#include "zisofs.h" + +/* This should probably be global. */ +static char zisofs_sink_page[PAGE_CACHE_SIZE]; + +/* + * This contains the zlib memory allocation and the mutex for the + * allocation; this avoids failures at block-decompression time. + */ +static void *zisofs_zlib_workspace; +static DEFINE_MUTEX(zisofs_zlib_lock); + +/* + * Read data of @inode from @block_start to @block_end and uncompress + * to one zisofs block. Store the data in the @pages array with @pcount + * entries. Start storing at offset @poffset of the first page. + */ +static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, + loff_t block_end, int pcount, + struct page **pages, unsigned poffset, + int *errp) +{ + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned int bufshift = ISOFS_BUFFER_BITS(inode); + unsigned int bufmask = bufsize - 1; + int i, block_size = block_end - block_start; + z_stream stream = { .total_out = 0, + .avail_in = 0, + .avail_out = 0, }; + int zerr; + int needblocks = (block_size + (block_start & bufmask) + bufmask) + >> bufshift; + int haveblocks; + blkcnt_t blocknum; + struct buffer_head *bhs[needblocks + 1]; + int curbh, curpage; + + if (block_size > deflateBound(1UL << zisofs_block_shift)) { + *errp = -EIO; + return 0; + } + /* Empty block? */ + if (block_size == 0) { + for ( i = 0 ; i < pcount ; i++ ) { + if (!pages[i]) + continue; + memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE); + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + return ((loff_t)pcount) << PAGE_CACHE_SHIFT; + } + + /* Because zlib is not thread-safe, do all the I/O at the top. */ + blocknum = block_start >> bufshift; + memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); + haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); + ll_rw_block(READ, haveblocks, bhs); + + curbh = 0; + curpage = 0; + /* + * First block is special since it may be fractional. We also wait for + * it before grabbing the zlib mutex; odds are that the subsequent + * blocks are going to come in in short order so we don't hold the zlib + * mutex longer than necessary. + */ + + if (!bhs[0]) + goto b_eio; + + wait_on_buffer(bhs[0]); + if (!buffer_uptodate(bhs[0])) { + *errp = -EIO; + goto b_eio; + } + + stream.workspace = zisofs_zlib_workspace; + mutex_lock(&zisofs_zlib_lock); + + zerr = zlib_inflateInit(&stream); + if (zerr != Z_OK) { + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else + *errp = -EIO; + printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n", + zerr); + goto z_eio; + } + + while (curpage < pcount && curbh < haveblocks && + zerr != Z_STREAM_END) { + if (!stream.avail_out) { + if (pages[curpage]) { + stream.next_out = page_address(pages[curpage]) + + poffset; + stream.avail_out = PAGE_CACHE_SIZE - poffset; + poffset = 0; + } else { + stream.next_out = (void *)&zisofs_sink_page; + stream.avail_out = PAGE_CACHE_SIZE; + } + } + if (!stream.avail_in) { + wait_on_buffer(bhs[curbh]); + if (!buffer_uptodate(bhs[curbh])) { + *errp = -EIO; + break; + } + stream.next_in = bhs[curbh]->b_data + + (block_start & bufmask); + stream.avail_in = min_t(unsigned, bufsize - + (block_start & bufmask), + block_size); + block_size -= stream.avail_in; + block_start = 0; + } + + while (stream.avail_out && stream.avail_in) { + zerr = zlib_inflate(&stream, Z_SYNC_FLUSH); + if (zerr == Z_BUF_ERROR && stream.avail_in == 0) + break; + if (zerr == Z_STREAM_END) + break; + if (zerr != Z_OK) { + /* EOF, error, or trying to read beyond end of input */ + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else { + printk(KERN_DEBUG + "zisofs: zisofs_inflate returned" + " %d, inode = %lu," + " page idx = %d, bh idx = %d," + " avail_in = %ld," + " avail_out = %ld\n", + zerr, inode->i_ino, curpage, + curbh, stream.avail_in, + stream.avail_out); + *errp = -EIO; + } + goto inflate_out; + } + } + + if (!stream.avail_out) { + /* This page completed */ + if (pages[curpage]) { + flush_dcache_page(pages[curpage]); + SetPageUptodate(pages[curpage]); + } + curpage++; + } + if (!stream.avail_in) + curbh++; + } +inflate_out: + zlib_inflateEnd(&stream); + +z_eio: + mutex_unlock(&zisofs_zlib_lock); + +b_eio: + for (i = 0; i < haveblocks; i++) + brelse(bhs[i]); + return stream.total_out; +} + +/* + * Uncompress data so that pages[full_page] is fully uptodate and possibly + * fills in other pages if we have data for them. + */ +static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, + struct page **pages) +{ + loff_t start_off, end_off; + loff_t block_start, block_end; + unsigned int header_size = ISOFS_I(inode)->i_format_parm[0]; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int blockptr; + loff_t poffset = 0; + blkcnt_t cstart_block, cend_block; + struct buffer_head *bh; + unsigned int blkbits = ISOFS_BUFFER_BITS(inode); + unsigned int blksize = 1 << blkbits; + int err; + loff_t ret; + + BUG_ON(!pages[full_page]); + + /* + * We want to read at least 'full_page' page. Because we have to + * uncompress the whole compression block anyway, fill the surrounding + * pages with the data we have anyway... + */ + start_off = page_offset(pages[full_page]); + end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size); + + cstart_block = start_off >> zisofs_block_shift; + cend_block = (end_off + (1 << zisofs_block_shift) - 1) + >> zisofs_block_shift; + + WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) != + ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK)); + + /* Find the pointer to this specific chunk */ + /* Note: we're not using isonum_731() here because the data is known aligned */ + /* Note: header_size is in 32-bit words (4 bytes) */ + blockptr = (header_size + cstart_block) << 2; + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + block_start = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + + while (cstart_block < cend_block && pcount > 0) { + /* Load end of the compressed block in the file */ + blockptr += 4; + /* Traversed to next block? */ + if (!(blockptr & (blksize - 1))) { + brelse(bh); + + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + } + block_end = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + if (block_start > block_end) { + brelse(bh); + return -EIO; + } + err = 0; + ret = zisofs_uncompress_block(inode, block_start, block_end, + pcount, pages, poffset, &err); + poffset += ret; + pages += poffset >> PAGE_CACHE_SHIFT; + pcount -= poffset >> PAGE_CACHE_SHIFT; + full_page -= poffset >> PAGE_CACHE_SHIFT; + poffset &= ~PAGE_CACHE_MASK; + + if (err) { + brelse(bh); + /* + * Did we finish reading the page we really wanted + * to read? + */ + if (full_page < 0) + return 0; + return err; + } + + block_start = block_end; + cstart_block++; + } + + if (poffset && *pages) { + memset(page_address(*pages) + poffset, 0, + PAGE_CACHE_SIZE - poffset); + flush_dcache_page(*pages); + SetPageUptodate(*pages); + } + return 0; +} + +/* + * When decompressing, we typically obtain more than one page + * per reference. We inject the additional pages into the page + * cache as a form of readahead. + */ +static int zisofs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + int err; + int i, pcount, full_page; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int zisofs_pages_per_cblock = + PAGE_CACHE_SHIFT <= zisofs_block_shift ? + (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0; + struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)]; + pgoff_t index = page->index, end_index; + + end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* + * If this page is wholly outside i_size we just return zero; + * do_generic_file_read() will handle this for us + */ + if (index >= end_index) { + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + if (PAGE_CACHE_SHIFT <= zisofs_block_shift) { + /* We have already been given one page, this is the one + we must do. */ + full_page = index & (zisofs_pages_per_cblock - 1); + pcount = min_t(int, zisofs_pages_per_cblock, + end_index - (index & ~(zisofs_pages_per_cblock - 1))); + index -= full_page; + } else { + full_page = 0; + pcount = 1; + } + pages[full_page] = page; + + for (i = 0; i < pcount; i++, index++) { + if (i != full_page) + pages[i] = grab_cache_page_nowait(mapping, index); + if (pages[i]) { + ClearPageError(pages[i]); + kmap(pages[i]); + } + } + + err = zisofs_fill_pages(inode, full_page, pcount, pages); + + /* Release any residual pages, do not SetPageUptodate */ + for (i = 0; i < pcount; i++) { + if (pages[i]) { + flush_dcache_page(pages[i]); + if (i == full_page && err) + SetPageError(pages[i]); + kunmap(pages[i]); + unlock_page(pages[i]); + if (i != full_page) + page_cache_release(pages[i]); + } + } + + /* At this point, err contains 0 or -EIO depending on the "critical" page */ + return err; +} + +const struct address_space_operations zisofs_aops = { + .readpage = zisofs_readpage, + /* No sync_page operation supported? */ + /* No bmap operation supported */ +}; + +int __init zisofs_init(void) +{ + zisofs_zlib_workspace = vmalloc(zlib_inflate_workspacesize()); + if ( !zisofs_zlib_workspace ) + return -ENOMEM; + + return 0; +} + +void zisofs_cleanup(void) +{ + vfree(zisofs_zlib_workspace); +} diff --git a/kernel/fs/isofs/dir.c b/kernel/fs/isofs/dir.c new file mode 100644 index 000000000..b943cbd96 --- /dev/null +++ b/kernel/fs/isofs/dir.c @@ -0,0 +1,283 @@ +/* + * linux/fs/isofs/dir.c + * + * (C) 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * Steve Beynon : Missing last directory entries fixed + * (stephen@askone.demon.co.uk) : 21st June 1996 + * + * isofs directory handling functions + */ +#include +#include "isofs.h" + +int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) +{ + char * old = de->name; + int len = de->name_len[0]; + int i; + + for (i = 0; i < len; i++) { + unsigned char c = old[i]; + if (!c) + break; + + if (c >= 'A' && c <= 'Z') + c |= 0x20; /* lower case */ + + /* Drop trailing '.;1' (ISO 9660:1988 7.5.1 requires period) */ + if (c == '.' && i == len - 3 && old[i + 1] == ';' && old[i + 2] == '1') + break; + + /* Drop trailing ';1' */ + if (c == ';' && i == len - 2 && old[i + 1] == '1') + break; + + /* Convert remaining ';' to '.' */ + /* Also '/' to '.' (broken Acorn-generated ISO9660 images) */ + if (c == ';' || c == '/') + c = '.'; + + new[i] = c; + } + return i; +} + +/* Acorn extensions written by Matthew Wilcox 1998 */ +int get_acorn_filename(struct iso_directory_record *de, + char *retname, struct inode *inode) +{ + int std; + unsigned char *chr; + int retnamlen = isofs_name_translate(de, retname, inode); + + if (retnamlen == 0) + return 0; + std = sizeof(struct iso_directory_record) + de->name_len[0]; + if (std & 1) + std++; + if ((*((unsigned char *) de) - std) != 32) + return retnamlen; + chr = ((unsigned char *) de) + std; + if (strncmp(chr, "ARCHIMEDES", 10)) + return retnamlen; + if ((*retname == '_') && ((chr[19] & 1) == 1)) + *retname = '!'; + if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff) + && ((chr[12] & 0xf0) == 0xf0)) { + retname[retnamlen] = ','; + sprintf(retname+retnamlen+1, "%3.3x", + ((chr[12] & 0xf) << 8) | chr[11]); + retnamlen += 4; + } + return retnamlen; +} + +/* + * This should _really_ be cleaned up some day.. + */ +static int do_isofs_readdir(struct inode *inode, struct file *file, + struct dir_context *ctx, + char *tmpname, struct iso_directory_record *tmpde) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned char bufbits = ISOFS_BUFFER_BITS(inode); + unsigned long block, offset, block_saved, offset_saved; + unsigned long inode_number = 0; /* Quiet GCC */ + struct buffer_head *bh = NULL; + int len; + int map; + int first_de = 1; + char *p = NULL; /* Quiet GCC */ + struct iso_directory_record *de; + struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); + + offset = ctx->pos & (bufsize - 1); + block = ctx->pos >> bufbits; + + while (ctx->pos < inode->i_size) { + int de_len; + + if (!bh) { + bh = isofs_bread(inode, block); + if (!bh) + return 0; + } + + de = (struct iso_directory_record *) (bh->b_data + offset); + + de_len = *(unsigned char *)de; + + /* + * If the length byte is zero, we should move on to the next + * CDROM sector. If we are at the end of the directory, we + * kick out of the while loop. + */ + + if (de_len == 0) { + brelse(bh); + bh = NULL; + ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); + block = ctx->pos >> bufbits; + offset = 0; + continue; + } + + block_saved = block; + offset_saved = offset; + offset += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = isofs_bread(inode, block); + if (!bh) + return 0; + memcpy((void *) tmpde + slop, bh->b_data, offset); + } + de = tmpde; + } + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < de->name_len[0] + + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + inode->i_ino); + return -EIO; + } + + if (first_de) { + isofs_normalize_block_and_offset(de, + &block_saved, + &offset_saved); + inode_number = isofs_get_ino(block_saved, + offset_saved, bufbits); + } + + if (de->flags[-sbi->s_high_sierra] & 0x80) { + first_de = 0; + ctx->pos += de_len; + continue; + } + first_de = 1; + + /* Handle the case of the '.' directory */ + if (de->name_len[0] == 1 && de->name[0] == 0) { + if (!dir_emit_dot(file, ctx)) + break; + ctx->pos += de_len; + continue; + } + + len = 0; + + /* Handle the case of the '..' directory */ + if (de->name_len[0] == 1 && de->name[0] == 1) { + if (!dir_emit_dotdot(file, ctx)) + break; + ctx->pos += de_len; + continue; + } + + /* Handle everything else. Do name translation if there + is no Rock Ridge NM field. */ + + /* + * Do not report hidden files if so instructed, or associated + * files unless instructed to do so + */ + if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || + (!sbi->s_showassoc && + (de->flags[-sbi->s_high_sierra] & 4))) { + ctx->pos += de_len; + continue; + } + + map = 1; + if (sbi->s_rock) { + len = get_rock_ridge_filename(de, tmpname, inode); + if (len != 0) { /* may be -1 */ + p = tmpname; + map = 0; + } + } + if (map) { +#ifdef CONFIG_JOLIET + if (sbi->s_joliet_level) { + len = get_joliet_filename(de, tmpname, inode); + p = tmpname; + } else +#endif + if (sbi->s_mapping == 'a') { + len = get_acorn_filename(de, tmpname, inode); + p = tmpname; + } else + if (sbi->s_mapping == 'n') { + len = isofs_name_translate(de, tmpname, inode); + p = tmpname; + } else { + p = de->name; + len = de->name_len[0]; + } + } + if (len > 0) { + if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN)) + break; + } + ctx->pos += de_len; + + continue; + } + if (bh) + brelse(bh); + return 0; +} + +/* + * Handle allocation of temporary space for name translation and + * handling split directory entries.. The real work is done by + * "do_isofs_readdir()". + */ +static int isofs_readdir(struct file *file, struct dir_context *ctx) +{ + int result; + char *tmpname; + struct iso_directory_record *tmpde; + struct inode *inode = file_inode(file); + + tmpname = (char *)__get_free_page(GFP_KERNEL); + if (tmpname == NULL) + return -ENOMEM; + + tmpde = (struct iso_directory_record *) (tmpname+1024); + + result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde); + + free_page((unsigned long) tmpname); + return result; +} + +const struct file_operations isofs_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = isofs_readdir, +}; + +/* + * directories can handle most operations... + */ +const struct inode_operations isofs_dir_inode_operations = +{ + .lookup = isofs_lookup, +}; + + diff --git a/kernel/fs/isofs/export.c b/kernel/fs/isofs/export.c new file mode 100644 index 000000000..0c5f721b4 --- /dev/null +++ b/kernel/fs/isofs/export.c @@ -0,0 +1,192 @@ +/* + * fs/isofs/export.c + * + * (C) 2004 Paul Serice - The new inode scheme requires switching + * from iget() to iget5_locked() which means + * the NFS export operations have to be hand + * coded because the default routines rely on + * iget(). + * + * The following files are helpful: + * + * Documentation/filesystems/nfs/Exporting + * fs/exportfs/expfs.c. + */ + +#include "isofs.h" + +static struct dentry * +isofs_export_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + __u32 generation) +{ + struct inode *inode; + + if (block == 0) + return ERR_PTR(-ESTALE); + inode = isofs_iget(sb, block, offset); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +/* This function is surprisingly simple. The trick is understanding + * that "child" is always a directory. So, to find its parent, you + * simply need to find its ".." entry, normalize its block and offset, + * and return the underlying inode. See the comments for + * isofs_normalize_block_and_offset(). */ +static struct dentry *isofs_export_get_parent(struct dentry *child) +{ + unsigned long parent_block = 0; + unsigned long parent_offset = 0; + struct inode *child_inode = d_inode(child); + struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); + struct iso_directory_record *de = NULL; + struct buffer_head * bh = NULL; + struct dentry *rv = NULL; + + /* "child" must always be a directory. */ + if (!S_ISDIR(child_inode->i_mode)) { + printk(KERN_ERR "isofs: isofs_export_get_parent(): " + "child is not a directory!\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* It is an invariant that the directory offset is zero. If + * it is not zero, it means the directory failed to be + * normalized for some reason. */ + if (e_child_inode->i_iget5_offset != 0) { + printk(KERN_ERR "isofs: isofs_export_get_parent(): " + "child directory not normalized!\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* The child inode has been normalized such that its + * i_iget5_block value points to the "." entry. Fortunately, + * the ".." entry is located in the same block. */ + parent_block = e_child_inode->i_iget5_block; + + /* Get the block in question. */ + bh = sb_bread(child_inode->i_sb, parent_block); + if (bh == NULL) { + rv = ERR_PTR(-EACCES); + goto out; + } + + /* This is the "." entry. */ + de = (struct iso_directory_record*)bh->b_data; + + /* The ".." entry is always the second entry. */ + parent_offset = (unsigned long)isonum_711(de->length); + de = (struct iso_directory_record*)(bh->b_data + parent_offset); + + /* Verify it is in fact the ".." entry. */ + if ((isonum_711(de->name_len) != 1) || (de->name[0] != 1)) { + printk(KERN_ERR "isofs: Unable to find the \"..\" " + "directory for NFS.\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* Normalize */ + isofs_normalize_block_and_offset(de, &parent_block, &parent_offset); + + rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block, + parent_offset)); + out: + if (bh) + brelse(bh); + return rv; +} + +static int +isofs_export_encode_fh(struct inode *inode, + __u32 *fh32, + int *max_len, + struct inode *parent) +{ + struct iso_inode_info * ei = ISOFS_I(inode); + int len = *max_len; + int type = 1; + __u16 *fh16 = (__u16*)fh32; + + /* + * WARNING: max_len is 5 for NFSv2. Because of this + * limitation, we use the lower 16 bits of fh32[1] to hold the + * offset of the inode and the upper 16 bits of fh32[1] to + * hold the offset of the parent. + */ + if (parent && (len < 5)) { + *max_len = 5; + return FILEID_INVALID; + } else if (len < 3) { + *max_len = 3; + return FILEID_INVALID; + } + + len = 3; + fh32[0] = ei->i_iget5_block; + fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ + fh16[3] = 0; /* avoid leaking uninitialized data */ + fh32[2] = inode->i_generation; + if (parent) { + struct iso_inode_info *eparent; + eparent = ISOFS_I(parent); + fh32[3] = eparent->i_iget5_block; + fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */ + fh32[4] = parent->i_generation; + len = 5; + type = 2; + } + *max_len = len; + return type; +} + +struct isofs_fid { + u32 block; + u16 offset; + u16 parent_offset; + u32 generation; + u32 parent_block; + u32 parent_generation; +}; + +static struct dentry *isofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct isofs_fid *ifid = (struct isofs_fid *)fid; + + if (fh_len < 3 || fh_type > 2) + return NULL; + + return isofs_export_iget(sb, ifid->block, ifid->offset, + ifid->generation); +} + +static struct dentry *isofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct isofs_fid *ifid = (struct isofs_fid *)fid; + + if (fh_len < 2 || fh_type != 2) + return NULL; + + return isofs_export_iget(sb, + fh_len > 2 ? ifid->parent_block : 0, + ifid->parent_offset, + fh_len > 4 ? ifid->parent_generation : 0); +} + +const struct export_operations isofs_export_ops = { + .encode_fh = isofs_export_encode_fh, + .fh_to_dentry = isofs_fh_to_dentry, + .fh_to_parent = isofs_fh_to_parent, + .get_parent = isofs_export_get_parent, +}; diff --git a/kernel/fs/isofs/inode.c b/kernel/fs/isofs/inode.c new file mode 100644 index 000000000..d67a16f2a --- /dev/null +++ b/kernel/fs/isofs/inode.c @@ -0,0 +1,1557 @@ +/* + * linux/fs/isofs/inode.c + * + * (C) 1991 Linus Torvalds - minix filesystem + * 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. + * 1994 Eberhard Mönkeberg - multi session handling. + * 1995 Mark Dobie - allow mounting of some weird VideoCDs and PhotoCDs. + * 1997 Gordon Chaffee - Joliet CDs + * 1998 Eric Lammerts - ISO 9660 Level 3 + * 2004 Paul Serice - Inode Support pushed out from 4GB to 128GB + * 2004 Paul Serice - NFS Export Operations + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "isofs.h" +#include "zisofs.h" + +#define BEQUIET + +static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); +static int isofs_dentry_cmpi(const struct dentry *parent, + const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name); + +#ifdef CONFIG_JOLIET +static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); +static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr); +static int isofs_dentry_cmpi_ms(const struct dentry *parent, + const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp_ms(const struct dentry *parent, + const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name); +#endif + +static void isofs_put_super(struct super_block *sb) +{ + struct isofs_sb_info *sbi = ISOFS_SB(sb); + +#ifdef CONFIG_JOLIET + unload_nls(sbi->s_nls_iocharset); +#endif + + kfree(sbi); + sb->s_fs_info = NULL; + return; +} + +static int isofs_read_inode(struct inode *, int relocated); +static int isofs_statfs (struct dentry *, struct kstatfs *); + +static struct kmem_cache *isofs_inode_cachep; + +static struct inode *isofs_alloc_inode(struct super_block *sb) +{ + struct iso_inode_info *ei; + ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void isofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); +} + +static void isofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, isofs_i_callback); +} + +static void init_once(void *foo) +{ + struct iso_inode_info *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", + sizeof(struct iso_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (isofs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(isofs_inode_cachep); +} + +static int isofs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + if (!(*flags & MS_RDONLY)) + return -EROFS; + return 0; +} + +static const struct super_operations isofs_sops = { + .alloc_inode = isofs_alloc_inode, + .destroy_inode = isofs_destroy_inode, + .put_super = isofs_put_super, + .statfs = isofs_statfs, + .remount_fs = isofs_remount, + .show_options = generic_show_options, +}; + + +static const struct dentry_operations isofs_dentry_ops[] = { + { + .d_hash = isofs_hashi, + .d_compare = isofs_dentry_cmpi, + }, +#ifdef CONFIG_JOLIET + { + .d_hash = isofs_hash_ms, + .d_compare = isofs_dentry_cmp_ms, + }, + { + .d_hash = isofs_hashi_ms, + .d_compare = isofs_dentry_cmpi_ms, + }, +#endif +}; + +struct iso9660_options{ + unsigned int rock:1; + unsigned int joliet:1; + unsigned int cruft:1; + unsigned int hide:1; + unsigned int showassoc:1; + unsigned int nocompress:1; + unsigned int overriderockperm:1; + unsigned int uid_set:1; + unsigned int gid_set:1; + unsigned int utf8:1; + unsigned char map; + unsigned char check; + unsigned int blocksize; + umode_t fmode; + umode_t dmode; + kgid_t gid; + kuid_t uid; + char *iocharset; + /* LVE */ + s32 session; + s32 sbsector; +}; + +/* + * Compute the hash for the isofs name corresponding to the dentry. + */ +static int +isofs_hashi_common(struct qstr *qstr, int ms) +{ + const char *name; + int len; + char c; + unsigned long hash; + + len = qstr->len; + name = qstr->name; + if (ms) { + while (len && name[len-1] == '.') + len--; + } + + hash = init_name_hash(); + while (len--) { + c = tolower(*name++); + hash = partial_name_hash(c, hash); + } + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare of two isofs names. + */ +static int isofs_dentry_cmp_common( + unsigned int len, const char *str, + const struct qstr *name, int ms, int ci) +{ + int alen, blen; + + /* A filename cannot end in '.' or we treat it like it has none */ + alen = name->len; + blen = len; + if (ms) { + while (alen && name->name[alen-1] == '.') + alen--; + while (blen && str[blen-1] == '.') + blen--; + } + if (alen == blen) { + if (ci) { + if (strncasecmp(name->name, str, alen) == 0) + return 0; + } else { + if (strncmp(name->name, str, alen) == 0) + return 0; + } + } + return 1; +} + +static int +isofs_hashi(const struct dentry *dentry, struct qstr *qstr) +{ + return isofs_hashi_common(qstr, 0); +} + +static int +isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 0, 1); +} + +#ifdef CONFIG_JOLIET +/* + * Compute the hash for the isofs name corresponding to the dentry. + */ +static int +isofs_hash_common(struct qstr *qstr, int ms) +{ + const char *name; + int len; + + len = qstr->len; + name = qstr->name; + if (ms) { + while (len && name[len-1] == '.') + len--; + } + + qstr->hash = full_name_hash(name, len); + + return 0; +} + +static int +isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) +{ + return isofs_hash_common(qstr, 1); +} + +static int +isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) +{ + return isofs_hashi_common(qstr, 1); +} + +static int +isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 1, 0); +} + +static int +isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 1, 1); +} +#endif + +enum { + Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore, + Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet, + Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, + Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm, +}; + +static const match_table_t tokens = { + {Opt_norock, "norock"}, + {Opt_nojoliet, "nojoliet"}, + {Opt_unhide, "unhide"}, + {Opt_hide, "hide"}, + {Opt_showassoc, "showassoc"}, + {Opt_cruft, "cruft"}, + {Opt_utf8, "utf8"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_map_a, "map=acorn"}, + {Opt_map_a, "map=a"}, + {Opt_map_n, "map=normal"}, + {Opt_map_n, "map=n"}, + {Opt_map_o, "map=off"}, + {Opt_map_o, "map=o"}, + {Opt_session, "session=%u"}, + {Opt_sb, "sbsector=%u"}, + {Opt_check_r, "check=relaxed"}, + {Opt_check_r, "check=r"}, + {Opt_check_s, "check=strict"}, + {Opt_check_s, "check=s"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%u"}, + {Opt_dmode, "dmode=%u"}, + {Opt_overriderockperm, "overriderockperm"}, + {Opt_block, "block=%u"}, + {Opt_ignore, "conv=binary"}, + {Opt_ignore, "conv=b"}, + {Opt_ignore, "conv=text"}, + {Opt_ignore, "conv=t"}, + {Opt_ignore, "conv=mtext"}, + {Opt_ignore, "conv=m"}, + {Opt_ignore, "conv=auto"}, + {Opt_ignore, "conv=a"}, + {Opt_nocompress, "nocompress"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct iso9660_options *popt) +{ + char *p; + int option; + + popt->map = 'n'; + popt->rock = 1; + popt->joliet = 1; + popt->cruft = 0; + popt->hide = 0; + popt->showassoc = 0; + popt->check = 'u'; /* unset */ + popt->nocompress = 0; + popt->blocksize = 1024; + popt->fmode = popt->dmode = ISOFS_INVALID_MODE; + popt->uid_set = 0; + popt->gid_set = 0; + popt->gid = GLOBAL_ROOT_GID; + popt->uid = GLOBAL_ROOT_UID; + popt->iocharset = NULL; + popt->utf8 = 0; + popt->overriderockperm = 0; + popt->session=-1; + popt->sbsector=-1; + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + unsigned n; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_norock: + popt->rock = 0; + break; + case Opt_nojoliet: + popt->joliet = 0; + break; + case Opt_hide: + popt->hide = 1; + break; + case Opt_unhide: + case Opt_showassoc: + popt->showassoc = 1; + break; + case Opt_cruft: + popt->cruft = 1; + break; + case Opt_utf8: + popt->utf8 = 1; + break; +#ifdef CONFIG_JOLIET + case Opt_iocharset: + popt->iocharset = match_strdup(&args[0]); + break; +#endif + case Opt_map_a: + popt->map = 'a'; + break; + case Opt_map_o: + popt->map = 'o'; + break; + case Opt_map_n: + popt->map = 'n'; + break; + case Opt_session: + if (match_int(&args[0], &option)) + return 0; + n = option; + if (n > 99) + return 0; + popt->session = n + 1; + break; + case Opt_sb: + if (match_int(&args[0], &option)) + return 0; + popt->sbsector = option; + break; + case Opt_check_r: + popt->check = 'r'; + break; + case Opt_check_s: + popt->check = 's'; + break; + case Opt_ignore: + break; + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + popt->uid = make_kuid(current_user_ns(), option); + if (!uid_valid(popt->uid)) + return 0; + popt->uid_set = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + popt->gid = make_kgid(current_user_ns(), option); + if (!gid_valid(popt->gid)) + return 0; + popt->gid_set = 1; + break; + case Opt_mode: + if (match_int(&args[0], &option)) + return 0; + popt->fmode = option; + break; + case Opt_dmode: + if (match_int(&args[0], &option)) + return 0; + popt->dmode = option; + break; + case Opt_overriderockperm: + popt->overriderockperm = 1; + break; + case Opt_block: + if (match_int(&args[0], &option)) + return 0; + n = option; + if (n != 512 && n != 1024 && n != 2048) + return 0; + popt->blocksize = n; + break; + case Opt_nocompress: + popt->nocompress = 1; + break; + default: + return 0; + } + } + return 1; +} + +/* + * look if the driver can tell the multi session redirection value + * + * don't change this if you don't know what you do, please! + * Multisession is legal only with XA disks. + * A non-XA disk with more than one volume descriptor may do it right, but + * usually is written in a nowhere standardized "multi-partition" manner. + * Multisession uses absolute addressing (solely the first frame of the whole + * track is #0), multi-partition uses relative addressing (each first frame of + * each track is #0), and a track is not a session. + * + * A broken CDwriter software or drive firmware does not set new standards, + * at least not if conflicting with the existing ones. + * + * emoenke@gwdg.de + */ +#define WE_OBEY_THE_WRITTEN_STANDARDS 1 + +static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) +{ + struct cdrom_multisession ms_info; + unsigned int vol_desc_start; + struct block_device *bdev = sb->s_bdev; + int i; + + vol_desc_start=0; + ms_info.addr_format=CDROM_LBA; + if(session >= 0 && session <= 99) { + struct cdrom_tocentry Te; + Te.cdte_track=session; + Te.cdte_format=CDROM_LBA; + i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); + if (!i) { + printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", + session, Te.cdte_addr.lba, + Te.cdte_ctrl&CDROM_DATA_TRACK); + if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) + return Te.cdte_addr.lba; + } + + printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); + } + i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); + if (session > 0) + printk(KERN_ERR "ISOFS: Invalid session number\n"); +#if 0 + printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i); + if (i==0) { + printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); + printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); + } +#endif + if (i==0) +#if WE_OBEY_THE_WRITTEN_STANDARDS + if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ +#endif + vol_desc_start=ms_info.addr.lba; + return vol_desc_start; +} + +/* + * Check if root directory is empty (has less than 3 files). + * + * Used to detect broken CDs where ISO root directory is empty but Joliet root + * directory is OK. If such CD has Rock Ridge extensions, they will be disabled + * (and Joliet used instead) or else no files would be visible. + */ +static bool rootdir_empty(struct super_block *sb, unsigned long block) +{ + int offset = 0, files = 0, de_len; + struct iso_directory_record *de; + struct buffer_head *bh; + + bh = sb_bread(sb, block); + if (!bh) + return true; + while (files < 3) { + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + if (de_len == 0) + break; + files++; + offset += de_len; + } + brelse(bh); + return files < 3; +} + +/* + * Initialize the superblock and read the root inode. + * + * Note: a check_disk_change() has been done immediately prior + * to this call, so we don't need to check again. + */ +static int isofs_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh = NULL, *pri_bh = NULL; + struct hs_primary_descriptor *h_pri = NULL; + struct iso_primary_descriptor *pri = NULL; + struct iso_supplementary_descriptor *sec = NULL; + struct iso_directory_record *rootp; + struct inode *inode; + struct iso9660_options opt; + struct isofs_sb_info *sbi; + unsigned long first_data_zone; + int joliet_level = 0; + int iso_blknum, block; + int orig_zonesize; + int table, error = -EINVAL; + unsigned int vol_desc_start; + + save_mount_options(s, data); + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + + if (!parse_options((char *)data, &opt)) + goto out_freesbi; + + /* + * First of all, get the hardware blocksize for this device. + * If we don't know what it is, or the hardware blocksize is + * larger than the blocksize the user specified, then use + * that value. + */ + /* + * What if bugger tells us to go beyond page size? + */ + opt.blocksize = sb_min_blocksize(s, opt.blocksize); + + sbi->s_high_sierra = 0; /* default is iso9660 */ + + vol_desc_start = (opt.sbsector != -1) ? + opt.sbsector : isofs_get_last_session(s,opt.session); + + for (iso_blknum = vol_desc_start+16; + iso_blknum < vol_desc_start+100; iso_blknum++) { + struct hs_volume_descriptor *hdp; + struct iso_volume_descriptor *vdp; + + block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); + if (!(bh = sb_bread(s, block))) + goto out_no_read; + + vdp = (struct iso_volume_descriptor *)bh->b_data; + hdp = (struct hs_volume_descriptor *)bh->b_data; + + /* + * Due to the overlapping physical location of the descriptors, + * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure + * proper identification in this case, we first check for ISO. + */ + if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { + if (isonum_711(vdp->type) == ISO_VD_END) + break; + if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { + if (pri == NULL) { + pri = (struct iso_primary_descriptor *)vdp; + /* Save the buffer in case we need it ... */ + pri_bh = bh; + bh = NULL; + } + } +#ifdef CONFIG_JOLIET + else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { + sec = (struct iso_supplementary_descriptor *)vdp; + if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { + if (opt.joliet) { + if (sec->escape[2] == 0x40) + joliet_level = 1; + else if (sec->escape[2] == 0x43) + joliet_level = 2; + else if (sec->escape[2] == 0x45) + joliet_level = 3; + + printk(KERN_DEBUG "ISO 9660 Extensions: " + "Microsoft Joliet Level %d\n", + joliet_level); + } + goto root_found; + } else { + /* Unknown supplementary volume descriptor */ + sec = NULL; + } + } +#endif + } else { + if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { + if (isonum_711(hdp->type) != ISO_VD_PRIMARY) + goto out_freebh; + + sbi->s_high_sierra = 1; + opt.rock = 0; + h_pri = (struct hs_primary_descriptor *)vdp; + goto root_found; + } + } + + /* Just skip any volume descriptors we don't recognize */ + + brelse(bh); + bh = NULL; + } + /* + * If we fall through, either no volume descriptor was found, + * or else we passed a primary descriptor looking for others. + */ + if (!pri) + goto out_unknown_format; + brelse(bh); + bh = pri_bh; + pri_bh = NULL; + +root_found: + + if (joliet_level && (pri == NULL || !opt.rock)) { + /* This is the case of Joliet with the norock mount flag. + * A disc with both Joliet and Rock Ridge is handled later + */ + pri = (struct iso_primary_descriptor *) sec; + } + + if(sbi->s_high_sierra){ + rootp = (struct iso_directory_record *) h_pri->root_directory_record; + sbi->s_nzones = isonum_733(h_pri->volume_space_size); + sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size); + sbi->s_max_size = isonum_733(h_pri->volume_space_size); + } else { + if (!pri) + goto out_freebh; + rootp = (struct iso_directory_record *) pri->root_directory_record; + sbi->s_nzones = isonum_733(pri->volume_space_size); + sbi->s_log_zone_size = isonum_723(pri->logical_block_size); + sbi->s_max_size = isonum_733(pri->volume_space_size); + } + + sbi->s_ninodes = 0; /* No way to figure this out easily */ + + orig_zonesize = sbi->s_log_zone_size; + /* + * If the zone size is smaller than the hardware sector size, + * this is a fatal error. This would occur if the disc drive + * had sectors that were 2048 bytes, but the filesystem had + * blocks that were 512 bytes (which should only very rarely + * happen.) + */ + if (orig_zonesize < opt.blocksize) + goto out_bad_size; + + /* RDE: convert log zone size to bit shift */ + switch (sbi->s_log_zone_size) { + case 512: sbi->s_log_zone_size = 9; break; + case 1024: sbi->s_log_zone_size = 10; break; + case 2048: sbi->s_log_zone_size = 11; break; + + default: + goto out_bad_zone_size; + } + + s->s_magic = ISOFS_SUPER_MAGIC; + + /* + * With multi-extent files, file size is only limited by the maximum + * size of a file system, which is 8 TB. + */ + s->s_maxbytes = 0x80000000000LL; + + /* Set this for reference. Its not currently used except on write + which we don't have .. */ + + first_data_zone = isonum_733(rootp->extent) + + isonum_711(rootp->ext_attr_length); + sbi->s_firstdatazone = first_data_zone; +#ifndef BEQUIET + printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n", + sbi->s_max_size, 1UL << sbi->s_log_zone_size); + printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone); + if(sbi->s_high_sierra) + printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n"); +#endif + + /* + * If the Joliet level is set, we _may_ decide to use the + * secondary descriptor, but can't be sure until after we + * read the root inode. But before reading the root inode + * we may need to change the device blocksize, and would + * rather release the old buffer first. So, we cache the + * first_data_zone value from the secondary descriptor. + */ + if (joliet_level) { + pri = (struct iso_primary_descriptor *) sec; + rootp = (struct iso_directory_record *) + pri->root_directory_record; + first_data_zone = isonum_733(rootp->extent) + + isonum_711(rootp->ext_attr_length); + } + + /* + * We're all done using the volume descriptor, and may need + * to change the device blocksize, so release the buffer now. + */ + brelse(pri_bh); + brelse(bh); + + /* + * Force the blocksize to 512 for 512 byte sectors. The file + * read primitives really get it wrong in a bad way if we don't + * do this. + * + * Note - we should never be setting the blocksize to something + * less than the hardware sector size for the device. If we + * do, we would end up having to read larger buffers and split + * out portions to satisfy requests. + * + * Note2- the idea here is that we want to deal with the optimal + * zonesize in the filesystem. If we have it set to something less, + * then we have horrible problems with trying to piece together + * bits of adjacent blocks in order to properly read directory + * entries. By forcing the blocksize in this way, we ensure + * that we will never be required to do this. + */ + sb_set_blocksize(s, orig_zonesize); + + sbi->s_nls_iocharset = NULL; + +#ifdef CONFIG_JOLIET + if (joliet_level && opt.utf8 == 0) { + char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; + sbi->s_nls_iocharset = load_nls(p); + if (! sbi->s_nls_iocharset) { + /* Fail only if explicit charset specified */ + if (opt.iocharset) + goto out_freesbi; + sbi->s_nls_iocharset = load_nls_default(); + } + } +#endif + s->s_op = &isofs_sops; + s->s_export_op = &isofs_export_ops; + sbi->s_mapping = opt.map; + sbi->s_rock = (opt.rock ? 2 : 0); + sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ + sbi->s_cruft = opt.cruft; + sbi->s_hide = opt.hide; + sbi->s_showassoc = opt.showassoc; + sbi->s_uid = opt.uid; + sbi->s_gid = opt.gid; + sbi->s_uid_set = opt.uid_set; + sbi->s_gid_set = opt.gid_set; + sbi->s_utf8 = opt.utf8; + sbi->s_nocompress = opt.nocompress; + sbi->s_overriderockperm = opt.overriderockperm; + /* + * It would be incredibly stupid to allow people to mark every file + * on the disk as suid, so we merely allow them to set the default + * permissions. + */ + if (opt.fmode != ISOFS_INVALID_MODE) + sbi->s_fmode = opt.fmode & 0777; + else + sbi->s_fmode = ISOFS_INVALID_MODE; + if (opt.dmode != ISOFS_INVALID_MODE) + sbi->s_dmode = opt.dmode & 0777; + else + sbi->s_dmode = ISOFS_INVALID_MODE; + + /* + * Read the root inode, which _may_ result in changing + * the s_rock flag. Once we have the final s_rock value, + * we then decide whether to use the Joliet descriptor. + */ + inode = isofs_iget(s, sbi->s_firstdatazone, 0); + if (IS_ERR(inode)) + goto out_no_root; + + /* + * Fix for broken CDs with Rock Ridge and empty ISO root directory but + * correct Joliet root directory. + */ + if (sbi->s_rock == 1 && joliet_level && + rootdir_empty(s, sbi->s_firstdatazone)) { + printk(KERN_NOTICE + "ISOFS: primary root directory is empty. " + "Disabling Rock Ridge and switching to Joliet."); + sbi->s_rock = 0; + } + + /* + * If this disk has both Rock Ridge and Joliet on it, then we + * want to use Rock Ridge by default. This can be overridden + * by using the norock mount option. There is still one other + * possibility that is not taken into account: a Rock Ridge + * CD with Unicode names. Until someone sees such a beast, it + * will not be supported. + */ + if (sbi->s_rock == 1) { + joliet_level = 0; + } else if (joliet_level) { + sbi->s_rock = 0; + if (sbi->s_firstdatazone != first_data_zone) { + sbi->s_firstdatazone = first_data_zone; + printk(KERN_DEBUG + "ISOFS: changing to secondary root\n"); + iput(inode); + inode = isofs_iget(s, sbi->s_firstdatazone, 0); + if (IS_ERR(inode)) + goto out_no_root; + } + } + + if (opt.check == 'u') { + /* Only Joliet is case insensitive by default */ + if (joliet_level) + opt.check = 'r'; + else + opt.check = 's'; + } + sbi->s_joliet_level = joliet_level; + + /* Make sure the root inode is a directory */ + if (!S_ISDIR(inode->i_mode)) { + printk(KERN_WARNING + "isofs_fill_super: root inode is not a directory. " + "Corrupted media?\n"); + goto out_iput; + } + + table = 0; + if (joliet_level) + table += 2; + if (opt.check == 'r') + table++; + + if (table) + s->s_d_op = &isofs_dentry_ops[table - 1]; + + /* get the root dentry */ + s->s_root = d_make_root(inode); + if (!(s->s_root)) { + error = -ENOMEM; + goto out_no_inode; + } + + kfree(opt.iocharset); + + return 0; + + /* + * Display error messages and free resources. + */ +out_iput: + iput(inode); + goto out_no_inode; +out_no_root: + error = PTR_ERR(inode); + if (error != -ENOMEM) + printk(KERN_WARNING "%s: get root inode failed\n", __func__); +out_no_inode: +#ifdef CONFIG_JOLIET + unload_nls(sbi->s_nls_iocharset); +#endif + goto out_freesbi; +out_no_read: + printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", + __func__, s->s_id, iso_blknum, block); + goto out_freebh; +out_bad_zone_size: + printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", + sbi->s_log_zone_size); + goto out_freebh; +out_bad_size: + printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", + orig_zonesize, opt.blocksize); + goto out_freebh; +out_unknown_format: + if (!silent) + printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n"); + +out_freebh: + brelse(bh); + brelse(pri_bh); +out_freesbi: + kfree(opt.iocharset); + kfree(sbi); + s->s_fs_info = NULL; + return error; +} + +static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ISOFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (ISOFS_SB(sb)->s_nzones + << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = ISOFS_SB(sb)->s_ninodes; + buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = NAME_MAX; + return 0; +} + +/* + * Get a set of blocks; filling in buffer_heads if already allocated + * or getblk() if they are not. Returns the number of blocks inserted + * (-ve == error.) + */ +int isofs_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head **bh, unsigned long nblocks) +{ + unsigned long b_off = iblock; + unsigned offset, sect_size; + unsigned int firstext; + unsigned long nextblk, nextoff; + int section, rv, error; + struct iso_inode_info *ei = ISOFS_I(inode); + + error = -EIO; + rv = 0; + if (iblock != b_off) { + printk(KERN_DEBUG "%s: block number too large\n", __func__); + goto abort; + } + + + offset = 0; + firstext = ei->i_first_extent; + sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); + nextblk = ei->i_next_section_block; + nextoff = ei->i_next_section_offset; + section = 0; + + while (nblocks) { + /* If we are *way* beyond the end of the file, print a message. + * Access beyond the end of the file up to the next page boundary + * is normal, however because of the way the page cache works. + * In this case, we just return 0 so that we can properly fill + * the page with useless information without generating any + * I/O errors. + */ + if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { + printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", + __func__, b_off, + (unsigned long long)inode->i_size); + goto abort; + } + + /* On the last section, nextblk == 0, section size is likely to + * exceed sect_size by a partial block, and access beyond the + * end of the file will reach beyond the section size, too. + */ + while (nextblk && (b_off >= (offset + sect_size))) { + struct inode *ninode; + + offset += sect_size; + ninode = isofs_iget(inode->i_sb, nextblk, nextoff); + if (IS_ERR(ninode)) { + error = PTR_ERR(ninode); + goto abort; + } + firstext = ISOFS_I(ninode)->i_first_extent; + sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); + nextblk = ISOFS_I(ninode)->i_next_section_block; + nextoff = ISOFS_I(ninode)->i_next_section_offset; + iput(ninode); + + if (++section > 100) { + printk(KERN_DEBUG "%s: More than 100 file sections ?!?" + " aborting...\n", __func__); + printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " + "nextblk=%lu nextoff=%lu\n", __func__, + b_off, firstext, (unsigned) sect_size, + nextblk, nextoff); + goto abort; + } + } + + if (*bh) { + map_bh(*bh, inode->i_sb, firstext + b_off - offset); + } else { + *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); + if (!*bh) + goto abort; + } + bh++; /* Next buffer head */ + b_off++; /* Next buffer offset */ + nblocks--; + rv++; + } + + error = 0; +abort: + return rv != 0 ? rv : error; +} + +/* + * Used by the standard interfaces. + */ +static int isofs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + + if (create) { + printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__); + return -EROFS; + } + + ret = isofs_get_blocks(inode, iblock, &bh_result, 1); + return ret < 0 ? ret : 0; +} + +static int isofs_bmap(struct inode *inode, sector_t block) +{ + struct buffer_head dummy; + int error; + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + error = isofs_get_block(inode, block, &dummy, 0); + if (!error) + return dummy.b_blocknr; + return 0; +} + +struct buffer_head *isofs_bread(struct inode *inode, sector_t block) +{ + sector_t blknr = isofs_bmap(inode, block); + if (!blknr) + return NULL; + return sb_bread(inode->i_sb, blknr); +} + +static int isofs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, isofs_get_block); +} + +static int isofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); +} + +static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,isofs_get_block); +} + +static const struct address_space_operations isofs_aops = { + .readpage = isofs_readpage, + .readpages = isofs_readpages, + .bmap = _isofs_bmap +}; + +static int isofs_read_level3_size(struct inode *inode) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; + struct buffer_head *bh = NULL; + unsigned long block, offset, block_saved, offset_saved; + int i = 0; + int more_entries = 0; + struct iso_directory_record *tmpde = NULL; + struct iso_inode_info *ei = ISOFS_I(inode); + + inode->i_size = 0; + + /* The first 16 blocks are reserved as the System Area. Thus, + * no inodes can appear in block 0. We use this to flag that + * this is the last section. */ + ei->i_next_section_block = 0; + ei->i_next_section_offset = 0; + + block = ei->i_iget5_block; + offset = ei->i_iget5_offset; + + do { + struct iso_directory_record *de; + unsigned int de_len; + + if (!bh) { + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + } + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + + if (de_len == 0) { + brelse(bh); + bh = NULL; + ++block; + offset = 0; + continue; + } + + block_saved = block; + offset_saved = offset; + offset += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + if (!tmpde) { + tmpde = kmalloc(256, GFP_KERNEL); + if (!tmpde) + goto out_nomem; + } + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + memcpy((void *)tmpde+slop, bh->b_data, offset); + } + de = tmpde; + } + + inode->i_size += isonum_733(de->size); + if (i == 1) { + ei->i_next_section_block = block_saved; + ei->i_next_section_offset = offset_saved; + } + + more_entries = de->flags[-high_sierra] & 0x80; + + i++; + if (i > 100) + goto out_toomany; + } while (more_entries); +out: + kfree(tmpde); + if (bh) + brelse(bh); + return 0; + +out_nomem: + if (bh) + brelse(bh); + return -ENOMEM; + +out_noread: + printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block); + kfree(tmpde); + return -EIO; + +out_toomany: + printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" + "isofs_read_level3_size: inode=%lu\n", + __func__, inode->i_ino); + goto out; +} + +static int isofs_read_inode(struct inode *inode, int relocated) +{ + struct super_block *sb = inode->i_sb; + struct isofs_sb_info *sbi = ISOFS_SB(sb); + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned long block; + int high_sierra = sbi->s_high_sierra; + struct buffer_head *bh = NULL; + struct iso_directory_record *de; + struct iso_directory_record *tmpde = NULL; + unsigned int de_len; + unsigned long offset; + struct iso_inode_info *ei = ISOFS_I(inode); + int ret = -EIO; + + block = ei->i_iget5_block; + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_badread; + + offset = ei->i_iget5_offset; + + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + + if (offset + de_len > bufsize) { + int frag1 = bufsize - offset; + + tmpde = kmalloc(de_len, GFP_KERNEL); + if (tmpde == NULL) { + printk(KERN_INFO "%s: out of memory\n", __func__); + ret = -ENOMEM; + goto fail; + } + memcpy(tmpde, bh->b_data + offset, frag1); + brelse(bh); + bh = sb_bread(inode->i_sb, ++block); + if (!bh) + goto out_badread; + memcpy((char *)tmpde+frag1, bh->b_data, de_len - frag1); + de = tmpde; + } + + inode->i_ino = isofs_get_ino(ei->i_iget5_block, + ei->i_iget5_offset, + ISOFS_BUFFER_BITS(inode)); + + /* Assume it is a normal-format file unless told otherwise */ + ei->i_file_format = isofs_file_normal; + + if (de->flags[-high_sierra] & 2) { + if (sbi->s_dmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFDIR | sbi->s_dmode; + else + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + set_nlink(inode, 1); /* + * Set to 1. We know there are 2, but + * the find utility tries to optimize + * if it is 2, and it screws up. It is + * easier to give 1 which tells find to + * do it the hard way. + */ + } else { + if (sbi->s_fmode != ISOFS_INVALID_MODE) { + inode->i_mode = S_IFREG | sbi->s_fmode; + } else { + /* + * Set default permissions: r-x for all. The disc + * could be shared with DOS machines so virtually + * anything could be a valid executable. + */ + inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; + } + set_nlink(inode, 1); + } + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + inode->i_blocks = 0; + + ei->i_format_parm[0] = 0; + ei->i_format_parm[1] = 0; + ei->i_format_parm[2] = 0; + + ei->i_section_size = isonum_733(de->size); + if (de->flags[-high_sierra] & 0x80) { + ret = isofs_read_level3_size(inode); + if (ret < 0) + goto fail; + ret = -EIO; + } else { + ei->i_next_section_block = 0; + ei->i_next_section_offset = 0; + inode->i_size = isonum_733(de->size); + } + + /* + * Some dipshit decided to store some other bit of information + * in the high byte of the file length. Truncate size in case + * this CDROM was mounted with the cruft option. + */ + + if (sbi->s_cruft) + inode->i_size &= 0x00ffffff; + + if (de->interleave[0]) { + printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n"); + inode->i_size = 0; + } + + /* I have no idea what file_unit_size is used for, so + we will flag it for now */ + if (de->file_unit_size[0] != 0) { + printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", + inode->i_ino); + } + + /* I have no idea what other flag bits are used for, so + we will flag it for now */ +#ifdef DEBUG + if((de->flags[-high_sierra] & ~2)!= 0){ + printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file " + "(%ld %x).\n", + inode->i_ino, de->flags[-high_sierra]); + } +#endif + + inode->i_mtime.tv_sec = + inode->i_atime.tv_sec = + inode->i_ctime.tv_sec = iso_date(de->date, high_sierra); + inode->i_mtime.tv_nsec = + inode->i_atime.tv_nsec = + inode->i_ctime.tv_nsec = 0; + + ei->i_first_extent = (isonum_733(de->extent) + + isonum_711(de->ext_attr_length)); + + /* Set the number of blocks for stat() - should be done before RR */ + inode->i_blocks = (inode->i_size + 511) >> 9; + + /* + * Now test for possible Rock Ridge extensions which will override + * some of these numbers in the inode structure. + */ + + if (!high_sierra) { + parse_rock_ridge_inode(de, inode, relocated); + /* if we want uid/gid set, override the rock ridge setting */ + if (sbi->s_uid_set) + inode->i_uid = sbi->s_uid; + if (sbi->s_gid_set) + inode->i_gid = sbi->s_gid; + } + /* Now set final access rights if overriding rock ridge setting */ + if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm && + sbi->s_dmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFDIR | sbi->s_dmode; + if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm && + sbi->s_fmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFREG | sbi->s_fmode; + + /* Install the inode operations vector */ + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + switch (ei->i_file_format) { +#ifdef CONFIG_ZISOFS + case isofs_file_compressed: + inode->i_data.a_ops = &zisofs_aops; + break; +#endif + default: + inode->i_data.a_ops = &isofs_aops; + break; + } + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &isofs_dir_inode_operations; + inode->i_fop = &isofs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &isofs_symlink_aops; + } else + /* XXX - parse_rock_ridge_inode() had already set i_rdev. */ + init_special_inode(inode, inode->i_mode, inode->i_rdev); + + ret = 0; +out: + kfree(tmpde); + if (bh) + brelse(bh); + return ret; + +out_badread: + printk(KERN_WARNING "ISOFS: unable to read i-node block\n"); +fail: + goto out; +} + +struct isofs_iget5_callback_data { + unsigned long block; + unsigned long offset; +}; + +static int isofs_iget5_test(struct inode *ino, void *data) +{ + struct iso_inode_info *i = ISOFS_I(ino); + struct isofs_iget5_callback_data *d = + (struct isofs_iget5_callback_data*)data; + return (i->i_iget5_block == d->block) + && (i->i_iget5_offset == d->offset); +} + +static int isofs_iget5_set(struct inode *ino, void *data) +{ + struct iso_inode_info *i = ISOFS_I(ino); + struct isofs_iget5_callback_data *d = + (struct isofs_iget5_callback_data*)data; + i->i_iget5_block = d->block; + i->i_iget5_offset = d->offset; + return 0; +} + +/* Store, in the inode's containing structure, the block and block + * offset that point to the underlying meta-data for the inode. The + * code below is otherwise similar to the iget() code in + * include/linux/fs.h */ +struct inode *__isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + int relocated) +{ + unsigned long hashval; + struct inode *inode; + struct isofs_iget5_callback_data data; + long ret; + + if (offset >= 1ul << sb->s_blocksize_bits) + return ERR_PTR(-EINVAL); + + data.block = block; + data.offset = offset; + + hashval = (block << sb->s_blocksize_bits) | offset; + + inode = iget5_locked(sb, hashval, &isofs_iget5_test, + &isofs_iget5_set, &data); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + ret = isofs_read_inode(inode, relocated); + if (ret < 0) { + iget_failed(inode); + inode = ERR_PTR(ret); + } else { + unlock_new_inode(inode); + } + } + + return inode; +} + +static struct dentry *isofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + /* We don't support read-write mounts */ + if (!(flags & MS_RDONLY)) + return ERR_PTR(-EACCES); + return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); +} + +static struct file_system_type iso9660_fs_type = { + .owner = THIS_MODULE, + .name = "iso9660", + .mount = isofs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("iso9660"); +MODULE_ALIAS("iso9660"); + +static int __init init_iso9660_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out; +#ifdef CONFIG_ZISOFS + err = zisofs_init(); + if (err) + goto out1; +#endif + err = register_filesystem(&iso9660_fs_type); + if (err) + goto out2; + return 0; +out2: +#ifdef CONFIG_ZISOFS + zisofs_cleanup(); +out1: +#endif + destroy_inodecache(); +out: + return err; +} + +static void __exit exit_iso9660_fs(void) +{ + unregister_filesystem(&iso9660_fs_type); +#ifdef CONFIG_ZISOFS + zisofs_cleanup(); +#endif + destroy_inodecache(); +} + +module_init(init_iso9660_fs) +module_exit(exit_iso9660_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/isofs/isofs.h b/kernel/fs/isofs/isofs.h new file mode 100644 index 000000000..0ac4c1f73 --- /dev/null +++ b/kernel/fs/isofs/isofs.h @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#include + +enum isofs_file_format { + isofs_file_normal = 0, + isofs_file_sparse = 1, + isofs_file_compressed = 2, +}; + +/* + * iso fs inode data in memory + */ +struct iso_inode_info { + unsigned long i_iget5_block; + unsigned long i_iget5_offset; + unsigned int i_first_extent; + unsigned char i_file_format; + unsigned char i_format_parm[3]; + unsigned long i_next_section_block; + unsigned long i_next_section_offset; + off_t i_section_size; + struct inode vfs_inode; +}; + +/* + * iso9660 super-block data in memory + */ +struct isofs_sb_info { + unsigned long s_ninodes; + unsigned long s_nzones; + unsigned long s_firstdatazone; + unsigned long s_log_zone_size; + unsigned long s_max_size; + + int s_rock_offset; /* offset of SUSP fields within SU area */ + unsigned char s_joliet_level; + unsigned char s_mapping; + unsigned int s_high_sierra:1; + unsigned int s_rock:2; + unsigned int s_utf8:1; + unsigned int s_cruft:1; /* Broken disks with high byte of length + * containing junk */ + unsigned int s_nocompress:1; + unsigned int s_hide:1; + unsigned int s_showassoc:1; + unsigned int s_overriderockperm:1; + unsigned int s_uid_set:1; + unsigned int s_gid_set:1; + + umode_t s_fmode; + umode_t s_dmode; + kgid_t s_gid; + kuid_t s_uid; + struct nls_table *s_nls_iocharset; /* Native language support table */ +}; + +#define ISOFS_INVALID_MODE ((umode_t) -1) + +static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct iso_inode_info *ISOFS_I(struct inode *inode) +{ + return container_of(inode, struct iso_inode_info, vfs_inode); +} + +static inline int isonum_711(char *p) +{ + return *(u8 *)p; +} +static inline int isonum_712(char *p) +{ + return *(s8 *)p; +} +static inline unsigned int isonum_721(char *p) +{ + return get_unaligned_le16(p); +} +static inline unsigned int isonum_722(char *p) +{ + return get_unaligned_be16(p); +} +static inline unsigned int isonum_723(char *p) +{ + /* Ignore bigendian datum due to broken mastering programs */ + return get_unaligned_le16(p); +} +static inline unsigned int isonum_731(char *p) +{ + return get_unaligned_le32(p); +} +static inline unsigned int isonum_732(char *p) +{ + return get_unaligned_be32(p); +} +static inline unsigned int isonum_733(char *p) +{ + /* Ignore bigendian datum due to broken mastering programs */ + return get_unaligned_le32(p); +} +extern int iso_date(char *, int); + +struct inode; /* To make gcc happy */ + +extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated); +extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); +extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); + +int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *); +int get_acorn_filename(struct iso_directory_record *, char *, struct inode *); + +extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int flags); +extern struct buffer_head *isofs_bread(struct inode *, sector_t); +extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); + +struct inode *__isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + int relocated); + +static inline struct inode *isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset) +{ + return __isofs_iget(sb, block, offset, 0); +} + +static inline struct inode *isofs_iget_reloc(struct super_block *sb, + unsigned long block, + unsigned long offset) +{ + return __isofs_iget(sb, block, offset, 1); +} + +/* Because the inode number is no longer relevant to finding the + * underlying meta-data for an inode, we are free to choose a more + * convenient 32-bit number as the inode number. The inode numbering + * scheme was recommended by Sergey Vlasov and Eric Lammerts. */ +static inline unsigned long isofs_get_ino(unsigned long block, + unsigned long offset, + unsigned long bufbits) +{ + return (block << (bufbits - 5)) | (offset >> 5); +} + +/* Every directory can have many redundant directory entries scattered + * throughout the directory tree. First there is the directory entry + * with the name of the directory stored in the parent directory. + * Then, there is the "." directory entry stored in the directory + * itself. Finally, there are possibly many ".." directory entries + * stored in all the subdirectories. + * + * In order for the NFS get_parent() method to work and for the + * general consistency of the dcache, we need to make sure the + * "i_iget5_block" and "i_iget5_offset" all point to exactly one of + * the many redundant entries for each directory. We normalize the + * block and offset by always making them point to the "." directory. + * + * Notice that we do not use the entry for the directory with the name + * that is located in the parent directory. Even though choosing this + * first directory is more natural, it is much easier to find the "." + * entry in the NFS get_parent() method because it is implicitly + * encoded in the "extent + ext_attr_length" fields of _all_ the + * redundant entries for the directory. Thus, it can always be + * reached regardless of which directory entry you have in hand. + * + * This works because the "." entry is simply the first directory + * record when you start reading the file that holds all the directory + * records, and this file starts at "extent + ext_attr_length" blocks. + * Because the "." entry is always the first entry listed in the + * directories file, the normalized "offset" value is always 0. + * + * You should pass the directory entry in "de". On return, "block" + * and "offset" will hold normalized values. Only directories are + * affected making it safe to call even for non-directory file + * types. */ +static inline void +isofs_normalize_block_and_offset(struct iso_directory_record* de, + unsigned long *block, + unsigned long *offset) +{ + /* Only directories are normalized. */ + if (de->flags[0] & 2) { + *offset = 0; + *block = (unsigned long)isonum_733(de->extent) + + (unsigned long)isonum_711(de->ext_attr_length); + } +} + +extern const struct inode_operations isofs_dir_inode_operations; +extern const struct file_operations isofs_dir_operations; +extern const struct address_space_operations isofs_symlink_aops; +extern const struct export_operations isofs_export_ops; diff --git a/kernel/fs/isofs/joliet.c b/kernel/fs/isofs/joliet.c new file mode 100644 index 000000000..a048de81c --- /dev/null +++ b/kernel/fs/isofs/joliet.c @@ -0,0 +1,69 @@ +/* + * linux/fs/isofs/joliet.c + * + * (C) 1996 Gordon Chaffee + * + * Joliet: Microsoft's Unicode extensions to iso9660 + */ + +#include +#include +#include "isofs.h" + +/* + * Convert Unicode 16 to UTF-8 or ASCII. + */ +static int +uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) +{ + __be16 *ip, ch; + unsigned char *op; + + ip = uni; + op = ascii; + + while ((ch = get_unaligned(ip)) && len) { + int llen; + llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE); + if (llen > 0) + op += llen; + else + *op++ = '?'; + ip++; + + len--; + } + *op = 0; + return (op - ascii); +} + +int +get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) +{ + unsigned char utf8; + struct nls_table *nls; + unsigned char len = 0; + + utf8 = ISOFS_SB(inode->i_sb)->s_utf8; + nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; + + if (utf8) { + len = utf16s_to_utf8s((const wchar_t *) de->name, + de->name_len[0] >> 1, UTF16_BIG_ENDIAN, + outname, PAGE_SIZE); + } else { + len = uni16_to_x8(outname, (__be16 *) de->name, + de->name_len[0] >> 1, nls); + } + if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) + len -= 2; + + /* + * Windows doesn't like periods at the end of a name, + * so neither do we + */ + while (len >= 2 && (outname[len-1] == '.')) + len--; + + return len; +} diff --git a/kernel/fs/isofs/namei.c b/kernel/fs/isofs/namei.c new file mode 100644 index 000000000..7b543e6b6 --- /dev/null +++ b/kernel/fs/isofs/namei.c @@ -0,0 +1,172 @@ +/* + * linux/fs/isofs/namei.c + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include +#include "isofs.h" + +/* + * ok, we cannot use strncmp, as the name is not in our data space. + * Thus we'll have to use isofs_match. No big problem. Match also makes + * some sanity tests. + */ +static int +isofs_cmp(struct dentry *dentry, const char *compare, int dlen) +{ + struct qstr qstr; + qstr.name = compare; + qstr.len = dlen; + if (likely(!dentry->d_op)) + return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen); + return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr); +} + +/* + * isofs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the inode number of the found entry, or 0 on error. + */ +static unsigned long +isofs_find_entry(struct inode *dir, struct dentry *dentry, + unsigned long *block_rv, unsigned long *offset_rv, + char *tmpname, struct iso_directory_record *tmpde) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(dir); + unsigned char bufbits = ISOFS_BUFFER_BITS(dir); + unsigned long block, f_pos, offset, block_saved, offset_saved; + struct buffer_head *bh = NULL; + struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); + + if (!ISOFS_I(dir)->i_first_extent) + return 0; + + f_pos = 0; + offset = 0; + block = 0; + + while (f_pos < dir->i_size) { + struct iso_directory_record *de; + int de_len, match, i, dlen; + char *dpnt; + + if (!bh) { + bh = isofs_bread(dir, block); + if (!bh) + return 0; + } + + de = (struct iso_directory_record *) (bh->b_data + offset); + + de_len = *(unsigned char *) de; + if (!de_len) { + brelse(bh); + bh = NULL; + f_pos = (f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); + block = f_pos >> bufbits; + offset = 0; + continue; + } + + block_saved = bh->b_blocknr; + offset_saved = offset; + offset += de_len; + f_pos += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = isofs_bread(dir, block); + if (!bh) + return 0; + memcpy((void *) tmpde + slop, bh->b_data, offset); + } + de = tmpde; + } + + dlen = de->name_len[0]; + dpnt = de->name; + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < dlen + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + dir->i_ino); + return 0; + } + + if (sbi->s_rock && + ((i = get_rock_ridge_filename(de, tmpname, dir)))) { + dlen = i; /* possibly -1 */ + dpnt = tmpname; +#ifdef CONFIG_JOLIET + } else if (sbi->s_joliet_level) { + dlen = get_joliet_filename(de, tmpname, dir); + dpnt = tmpname; +#endif + } else if (sbi->s_mapping == 'a') { + dlen = get_acorn_filename(de, tmpname, dir); + dpnt = tmpname; + } else if (sbi->s_mapping == 'n') { + dlen = isofs_name_translate(de, tmpname, dir); + dpnt = tmpname; + } + + /* + * Skip hidden or associated files unless hide or showassoc, + * respectively, is set + */ + match = 0; + if (dlen > 0 && + (!sbi->s_hide || + (!(de->flags[-sbi->s_high_sierra] & 1))) && + (sbi->s_showassoc || + (!(de->flags[-sbi->s_high_sierra] & 4)))) { + if (dpnt && (dlen > 1 || dpnt[0] > 1)) + match = (isofs_cmp(dentry, dpnt, dlen) == 0); + } + if (match) { + isofs_normalize_block_and_offset(de, + &block_saved, + &offset_saved); + *block_rv = block_saved; + *offset_rv = offset_saved; + brelse(bh); + return 1; + } + } + brelse(bh); + return 0; +} + +struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + int found; + unsigned long uninitialized_var(block); + unsigned long uninitialized_var(offset); + struct inode *inode; + struct page *page; + + page = alloc_page(GFP_USER); + if (!page) + return ERR_PTR(-ENOMEM); + + found = isofs_find_entry(dir, dentry, + &block, &offset, + page_address(page), + 1024 + page_address(page)); + __free_page(page); + + inode = found ? isofs_iget(dir->i_sb, block, offset) : NULL; + + return d_splice_alias(inode, dentry); +} diff --git a/kernel/fs/isofs/rock.c b/kernel/fs/isofs/rock.c new file mode 100644 index 000000000..735d7522a --- /dev/null +++ b/kernel/fs/isofs/rock.c @@ -0,0 +1,801 @@ +/* + * linux/fs/isofs/rock.c + * + * (C) 1992, 1993 Eric Youngdale + * + * Rock Ridge Extensions to iso9660 + */ + +#include +#include + +#include "isofs.h" +#include "rock.h" + +/* + * These functions are designed to read the system areas of a directory record + * and extract relevant information. There are different functions provided + * depending upon what information we need at the time. One function fills + * out an inode structure, a second one extracts a filename, a third one + * returns a symbolic link name, and a fourth one returns the extent number + * for the file. + */ + +#define SIG(A,B) ((A) | ((B) << 8)) /* isonum_721() */ + +struct rock_state { + void *buffer; + unsigned char *chr; + int len; + int cont_size; + int cont_extent; + int cont_offset; + int cont_loops; + struct inode *inode; +}; + +/* + * This is a way of ensuring that we have something in the system + * use fields that is compatible with Rock Ridge. Return zero on success. + */ + +static int check_sp(struct rock_ridge *rr, struct inode *inode) +{ + if (rr->u.SP.magic[0] != 0xbe) + return -1; + if (rr->u.SP.magic[1] != 0xef) + return -1; + ISOFS_SB(inode->i_sb)->s_rock_offset = rr->u.SP.skip; + return 0; +} + +static void setup_rock_ridge(struct iso_directory_record *de, + struct inode *inode, struct rock_state *rs) +{ + rs->len = sizeof(struct iso_directory_record) + de->name_len[0]; + if (rs->len & 1) + (rs->len)++; + rs->chr = (unsigned char *)de + rs->len; + rs->len = *((unsigned char *)de) - rs->len; + if (rs->len < 0) + rs->len = 0; + + if (ISOFS_SB(inode->i_sb)->s_rock_offset != -1) { + rs->len -= ISOFS_SB(inode->i_sb)->s_rock_offset; + rs->chr += ISOFS_SB(inode->i_sb)->s_rock_offset; + if (rs->len < 0) + rs->len = 0; + } +} + +static void init_rock_state(struct rock_state *rs, struct inode *inode) +{ + memset(rs, 0, sizeof(*rs)); + rs->inode = inode; +} + +/* Maximum number of Rock Ridge continuation entries */ +#define RR_MAX_CE_ENTRIES 32 + +/* + * Returns 0 if the caller should continue scanning, 1 if the scan must end + * and -ve on error. + */ +static int rock_continue(struct rock_state *rs) +{ + int ret = 1; + int blocksize = 1 << rs->inode->i_blkbits; + const int min_de_size = offsetof(struct rock_ridge, u); + + kfree(rs->buffer); + rs->buffer = NULL; + + if ((unsigned)rs->cont_offset > blocksize - min_de_size || + (unsigned)rs->cont_size > blocksize || + (unsigned)(rs->cont_offset + rs->cont_size) > blocksize) { + printk(KERN_NOTICE "rock: corrupted directory entry. " + "extent=%d, offset=%d, size=%d\n", + rs->cont_extent, rs->cont_offset, rs->cont_size); + ret = -EIO; + goto out; + } + + if (rs->cont_extent) { + struct buffer_head *bh; + + rs->buffer = kmalloc(rs->cont_size, GFP_KERNEL); + if (!rs->buffer) { + ret = -ENOMEM; + goto out; + } + ret = -EIO; + if (++rs->cont_loops >= RR_MAX_CE_ENTRIES) + goto out; + bh = sb_bread(rs->inode->i_sb, rs->cont_extent); + if (bh) { + memcpy(rs->buffer, bh->b_data + rs->cont_offset, + rs->cont_size); + put_bh(bh); + rs->chr = rs->buffer; + rs->len = rs->cont_size; + rs->cont_extent = 0; + rs->cont_size = 0; + rs->cont_offset = 0; + return 0; + } + printk("Unable to read rock-ridge attributes\n"); + } +out: + kfree(rs->buffer); + rs->buffer = NULL; + return ret; +} + +/* + * We think there's a record of type `sig' at rs->chr. Parse the signature + * and make sure that there's really room for a record of that type. + */ +static int rock_check_overflow(struct rock_state *rs, int sig) +{ + int len; + + switch (sig) { + case SIG('S', 'P'): + len = sizeof(struct SU_SP_s); + break; + case SIG('C', 'E'): + len = sizeof(struct SU_CE_s); + break; + case SIG('E', 'R'): + len = sizeof(struct SU_ER_s); + break; + case SIG('R', 'R'): + len = sizeof(struct RR_RR_s); + break; + case SIG('P', 'X'): + len = sizeof(struct RR_PX_s); + break; + case SIG('P', 'N'): + len = sizeof(struct RR_PN_s); + break; + case SIG('S', 'L'): + len = sizeof(struct RR_SL_s); + break; + case SIG('N', 'M'): + len = sizeof(struct RR_NM_s); + break; + case SIG('C', 'L'): + len = sizeof(struct RR_CL_s); + break; + case SIG('P', 'L'): + len = sizeof(struct RR_PL_s); + break; + case SIG('T', 'F'): + len = sizeof(struct RR_TF_s); + break; + case SIG('Z', 'F'): + len = sizeof(struct RR_ZF_s); + break; + default: + len = 0; + break; + } + len += offsetof(struct rock_ridge, u); + if (len > rs->len) { + printk(KERN_NOTICE "rock: directory entry would overflow " + "storage\n"); + printk(KERN_NOTICE "rock: sig=0x%02x, size=%d, remaining=%d\n", + sig, len, rs->len); + return -EIO; + } + return 0; +} + +/* + * return length of name field; 0: not found, -1: to be ignored + */ +int get_rock_ridge_filename(struct iso_directory_record *de, + char *retname, struct inode *inode) +{ + struct rock_state rs; + struct rock_ridge *rr; + int sig; + int retnamlen = 0; + int truncate = 0; + int ret = 0; + + if (!ISOFS_SB(inode->i_sb)->s_rock) + return 0; + *retname = 0; + + init_rock_state(&rs, inode); + setup_rock_ridge(de, inode, &rs); +repeat: + + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto eio; + rs.chr += rr->len; + rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ + if (rs.len < 0) + goto out; /* Something got screwed up here */ + + switch (sig) { + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & RR_NM) == 0) + goto out; + break; + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('C', 'E'): + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('N', 'M'): + if (truncate) + break; + if (rr->len < 5) + break; + /* + * If the flags are 2 or 4, this indicates '.' or '..'. + * We don't want to do anything with this, because it + * screws up the code that calls us. We don't really + * care anyways, since we can just use the non-RR + * name. + */ + if (rr->u.NM.flags & 6) + break; + + if (rr->u.NM.flags & ~1) { + printk("Unsupported NM flag settings (%d)\n", + rr->u.NM.flags); + break; + } + if ((strlen(retname) + rr->len - 5) >= 254) { + truncate = 1; + break; + } + strncat(retname, rr->u.NM.name, rr->len - 5); + retnamlen += rr->len - 5; + break; + case SIG('R', 'E'): + kfree(rs.buffer); + return -1; + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret == 1) + return retnamlen; /* If 0, this file did not have a NM field */ +out: + kfree(rs.buffer); + return ret; +eio: + ret = -EIO; + goto out; +} + +#define RR_REGARD_XA 1 +#define RR_RELOC_DE 2 + +static int +parse_rock_ridge_inode_internal(struct iso_directory_record *de, + struct inode *inode, int flags) +{ + int symlink_len = 0; + int cnt, sig; + unsigned int reloc_block; + struct inode *reloc; + struct rock_ridge *rr; + int rootflag; + struct rock_state rs; + int ret = 0; + + if (!ISOFS_SB(inode->i_sb)->s_rock) + return 0; + + init_rock_state(&rs, inode); + setup_rock_ridge(de, inode, &rs); + if (flags & RR_REGARD_XA) { + rs.chr += 14; + rs.len -= 14; + if (rs.len < 0) + rs.len = 0; + } + +repeat: + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto eio; + rs.chr += rr->len; + rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ + if (rs.len < 0) + goto out; /* Something got screwed up here */ + + switch (sig) { +#ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & + (RR_PX | RR_TF | RR_SL | RR_CL)) == 0) + goto out; + break; +#endif + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('C', 'E'): + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('E', 'R'): + /* Invalid length of ER tag id? */ + if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len) + goto out; + ISOFS_SB(inode->i_sb)->s_rock = 1; + printk(KERN_DEBUG "ISO 9660 Extensions: "); + { + int p; + for (p = 0; p < rr->u.ER.len_id; p++) + printk("%c", rr->u.ER.data[p]); + } + printk("\n"); + break; + case SIG('P', 'X'): + inode->i_mode = isonum_733(rr->u.PX.mode); + set_nlink(inode, isonum_733(rr->u.PX.n_links)); + i_uid_write(inode, isonum_733(rr->u.PX.uid)); + i_gid_write(inode, isonum_733(rr->u.PX.gid)); + break; + case SIG('P', 'N'): + { + int high, low; + high = isonum_733(rr->u.PN.dev_high); + low = isonum_733(rr->u.PN.dev_low); + /* + * The Rock Ridge standard specifies that if + * sizeof(dev_t) <= 4, then the high field is + * unused, and the device number is completely + * stored in the low field. Some writers may + * ignore this subtlety, + * and as a result we test to see if the entire + * device number is + * stored in the low field, and use that. + */ + if ((low & ~0xff) && high == 0) { + inode->i_rdev = + MKDEV(low >> 8, low & 0xff); + } else { + inode->i_rdev = + MKDEV(high, low); + } + } + break; + case SIG('T', 'F'): + /* + * Some RRIP writers incorrectly place ctime in the + * TF_CREATE field. Try to handle this correctly for + * either case. + */ + /* Rock ridge never appears on a High Sierra disk */ + cnt = 0; + if (rr->u.TF.flags & TF_CREATE) { + inode->i_ctime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_ctime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_MODIFY) { + inode->i_mtime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_mtime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_ACCESS) { + inode->i_atime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_atime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_ATTRIBUTES) { + inode->i_ctime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_ctime.tv_nsec = 0; + } + break; + case SIG('S', 'L'): + { + int slen; + struct SL_component *slp; + struct SL_component *oldslp; + slen = rr->len - 5; + slp = &rr->u.SL.link; + inode->i_size = symlink_len; + while (slen > 1) { + rootflag = 0; + switch (slp->flags & ~1) { + case 0: + inode->i_size += + slp->len; + break; + case 2: + inode->i_size += 1; + break; + case 4: + inode->i_size += 2; + break; + case 8: + rootflag = 1; + inode->i_size += 1; + break; + default: + printk("Symlink component flag " + "not implemented\n"); + } + slen -= slp->len + 2; + oldslp = slp; + slp = (struct SL_component *) + (((char *)slp) + slp->len + 2); + + if (slen < 2) { + if (((rr->u.SL. + flags & 1) != 0) + && + ((oldslp-> + flags & 1) == 0)) + inode->i_size += + 1; + break; + } + + /* + * If this component record isn't + * continued, then append a '/'. + */ + if (!rootflag + && (oldslp->flags & 1) == 0) + inode->i_size += 1; + } + } + symlink_len = inode->i_size; + break; + case SIG('R', 'E'): + printk(KERN_WARNING "Attempt to read inode for " + "relocated directory\n"); + goto out; + case SIG('C', 'L'): + if (flags & RR_RELOC_DE) { + printk(KERN_ERR + "ISOFS: Recursive directory relocation " + "is not supported\n"); + goto eio; + } + reloc_block = isonum_733(rr->u.CL.location); + if (reloc_block == ISOFS_I(inode)->i_iget5_block && + ISOFS_I(inode)->i_iget5_offset == 0) { + printk(KERN_ERR + "ISOFS: Directory relocation points to " + "itself\n"); + goto eio; + } + ISOFS_I(inode)->i_first_extent = reloc_block; + reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0); + if (IS_ERR(reloc)) { + ret = PTR_ERR(reloc); + goto out; + } + inode->i_mode = reloc->i_mode; + set_nlink(inode, reloc->i_nlink); + inode->i_uid = reloc->i_uid; + inode->i_gid = reloc->i_gid; + inode->i_rdev = reloc->i_rdev; + inode->i_size = reloc->i_size; + inode->i_blocks = reloc->i_blocks; + inode->i_atime = reloc->i_atime; + inode->i_ctime = reloc->i_ctime; + inode->i_mtime = reloc->i_mtime; + iput(reloc); + break; +#ifdef CONFIG_ZISOFS + case SIG('Z', 'F'): { + int algo; + + if (ISOFS_SB(inode->i_sb)->s_nocompress) + break; + algo = isonum_721(rr->u.ZF.algorithm); + if (algo == SIG('p', 'z')) { + int block_shift = + isonum_711(&rr->u.ZF.parms[1]); + if (block_shift > 17) { + printk(KERN_WARNING "isofs: " + "Can't handle ZF block " + "size of 2^%d\n", + block_shift); + } else { + /* + * Note: we don't change + * i_blocks here + */ + ISOFS_I(inode)->i_file_format = + isofs_file_compressed; + /* + * Parameters to compression + * algorithm (header size, + * block size) + */ + ISOFS_I(inode)->i_format_parm[0] = + isonum_711(&rr->u.ZF.parms[0]); + ISOFS_I(inode)->i_format_parm[1] = + isonum_711(&rr->u.ZF.parms[1]); + inode->i_size = + isonum_733(rr->u.ZF. + real_size); + } + } else { + printk(KERN_WARNING + "isofs: Unknown ZF compression " + "algorithm: %c%c\n", + rr->u.ZF.algorithm[0], + rr->u.ZF.algorithm[1]); + } + break; + } +#endif + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret == 1) + ret = 0; +out: + kfree(rs.buffer); + return ret; +eio: + ret = -EIO; + goto out; +} + +static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) +{ + int slen; + int rootflag; + struct SL_component *oldslp; + struct SL_component *slp; + slen = rr->len - 5; + slp = &rr->u.SL.link; + while (slen > 1) { + rootflag = 0; + switch (slp->flags & ~1) { + case 0: + if (slp->len > plimit - rpnt) + return NULL; + memcpy(rpnt, slp->text, slp->len); + rpnt += slp->len; + break; + case 2: + if (rpnt >= plimit) + return NULL; + *rpnt++ = '.'; + break; + case 4: + if (2 > plimit - rpnt) + return NULL; + *rpnt++ = '.'; + *rpnt++ = '.'; + break; + case 8: + if (rpnt >= plimit) + return NULL; + rootflag = 1; + *rpnt++ = '/'; + break; + default: + printk("Symlink component flag not implemented (%d)\n", + slp->flags); + } + slen -= slp->len + 2; + oldslp = slp; + slp = (struct SL_component *)((char *)slp + slp->len + 2); + + if (slen < 2) { + /* + * If there is another SL record, and this component + * record isn't continued, then add a slash. + */ + if ((!rootflag) && (rr->u.SL.flags & 1) && + !(oldslp->flags & 1)) { + if (rpnt >= plimit) + return NULL; + *rpnt++ = '/'; + } + break; + } + + /* + * If this component record isn't continued, then append a '/'. + */ + if (!rootflag && !(oldslp->flags & 1)) { + if (rpnt >= plimit) + return NULL; + *rpnt++ = '/'; + } + } + return rpnt; +} + +int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode, + int relocated) +{ + int flags = relocated ? RR_RELOC_DE : 0; + int result = parse_rock_ridge_inode_internal(de, inode, flags); + + /* + * if rockridge flag was reset and we didn't look for attributes + * behind eventual XA attributes, have a look there + */ + if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) + && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { + result = parse_rock_ridge_inode_internal(de, inode, + flags | RR_REGARD_XA); + } + return result; +} + +/* + * readpage() for symlinks: reads symlink contents into the page and either + * makes it uptodate and returns 0 or returns error (-EIO) + */ +static int rock_ridge_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct iso_inode_info *ei = ISOFS_I(inode); + struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); + char *link = kmap(page); + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + struct buffer_head *bh; + char *rpnt = link; + unsigned char *pnt; + struct iso_directory_record *raw_de; + unsigned long block, offset; + int sig; + struct rock_ridge *rr; + struct rock_state rs; + int ret; + + if (!sbi->s_rock) + goto error; + + init_rock_state(&rs, inode); + block = ei->i_iget5_block; + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + + offset = ei->i_iget5_offset; + pnt = (unsigned char *)bh->b_data + offset; + + raw_de = (struct iso_directory_record *)pnt; + + /* + * If we go past the end of the buffer, there is some sort of error. + */ + if (offset + *pnt > bufsize) + goto out_bad_span; + + /* + * Now test for possible Rock Ridge extensions which will override + * some of these numbers in the inode structure. + */ + + setup_rock_ridge(raw_de, inode, &rs); + +repeat: + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto out; + rs.chr += rr->len; + rs.len -= rr->len; + if (rs.len < 0) + goto out; /* corrupted isofs */ + + switch (sig) { + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & RR_SL) == 0) + goto out; + break; + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('S', 'L'): + rpnt = get_symlink_chunk(rpnt, rr, + link + (PAGE_SIZE - 1)); + if (rpnt == NULL) + goto out; + break; + case SIG('C', 'E'): + /* This tells is if there is a continuation record */ + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret < 0) + goto fail; + + if (rpnt == link) + goto fail; + brelse(bh); + *rpnt = '\0'; + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + + /* error exit from macro */ +out: + kfree(rs.buffer); + goto fail; +out_noread: + printk("unable to read i-node block"); + goto fail; +out_bad_span: + printk("symlink spans iso9660 blocks\n"); +fail: + brelse(bh); +error: + SetPageError(page); + kunmap(page); + unlock_page(page); + return -EIO; +} + +const struct address_space_operations isofs_symlink_aops = { + .readpage = rock_ridge_symlink_readpage +}; diff --git a/kernel/fs/isofs/rock.h b/kernel/fs/isofs/rock.h new file mode 100644 index 000000000..ed09e2b08 --- /dev/null +++ b/kernel/fs/isofs/rock.h @@ -0,0 +1,122 @@ +/* + * These structs are used by the system-use-sharing protocol, in which the + * Rock Ridge extensions are embedded. It is quite possible that other + * extensions are present on the disk, and this is fine as long as they + * all use SUSP + */ + +struct SU_SP_s { + unsigned char magic[2]; + unsigned char skip; +} __attribute__ ((packed)); + +struct SU_CE_s { + char extent[8]; + char offset[8]; + char size[8]; +}; + +struct SU_ER_s { + unsigned char len_id; + unsigned char len_des; + unsigned char len_src; + unsigned char ext_ver; + char data[0]; +} __attribute__ ((packed)); + +struct RR_RR_s { + char flags[1]; +} __attribute__ ((packed)); + +struct RR_PX_s { + char mode[8]; + char n_links[8]; + char uid[8]; + char gid[8]; +}; + +struct RR_PN_s { + char dev_high[8]; + char dev_low[8]; +}; + +struct SL_component { + unsigned char flags; + unsigned char len; + char text[0]; +} __attribute__ ((packed)); + +struct RR_SL_s { + unsigned char flags; + struct SL_component link; +} __attribute__ ((packed)); + +struct RR_NM_s { + unsigned char flags; + char name[0]; +} __attribute__ ((packed)); + +struct RR_CL_s { + char location[8]; +}; + +struct RR_PL_s { + char location[8]; +}; + +struct stamp { + char time[7]; +} __attribute__ ((packed)); + +struct RR_TF_s { + char flags; + struct stamp times[0]; /* Variable number of these beasts */ +} __attribute__ ((packed)); + +/* Linux-specific extension for transparent decompression */ +struct RR_ZF_s { + char algorithm[2]; + char parms[2]; + char real_size[8]; +}; + +/* + * These are the bits and their meanings for flags in the TF structure. + */ +#define TF_CREATE 1 +#define TF_MODIFY 2 +#define TF_ACCESS 4 +#define TF_ATTRIBUTES 8 +#define TF_BACKUP 16 +#define TF_EXPIRATION 32 +#define TF_EFFECTIVE 64 +#define TF_LONG_FORM 128 + +struct rock_ridge { + char signature[2]; + unsigned char len; + unsigned char version; + union { + struct SU_SP_s SP; + struct SU_CE_s CE; + struct SU_ER_s ER; + struct RR_RR_s RR; + struct RR_PX_s PX; + struct RR_PN_s PN; + struct RR_SL_s SL; + struct RR_NM_s NM; + struct RR_CL_s CL; + struct RR_PL_s PL; + struct RR_TF_s TF; + struct RR_ZF_s ZF; + } u; +}; + +#define RR_PX 1 /* POSIX attributes */ +#define RR_PN 2 /* POSIX devices */ +#define RR_SL 4 /* Symbolic link */ +#define RR_NM 8 /* Alternate Name */ +#define RR_CL 16 /* Child link */ +#define RR_PL 32 /* Parent link */ +#define RR_RE 64 /* Relocation directory */ +#define RR_TF 128 /* Timestamps */ diff --git a/kernel/fs/isofs/util.c b/kernel/fs/isofs/util.c new file mode 100644 index 000000000..005a15cfd --- /dev/null +++ b/kernel/fs/isofs/util.c @@ -0,0 +1,70 @@ +/* + * linux/fs/isofs/util.c + */ + +#include +#include "isofs.h" + +/* + * We have to convert from a MM/DD/YY format to the Unix ctime format. + * We have to take into account leap years and all of that good stuff. + * Unfortunately, the kernel does not have the information on hand to + * take into account daylight savings time, but it shouldn't matter. + * The time stored should be localtime (with or without DST in effect), + * and the timezone offset should hold the offset required to get back + * to GMT. Thus we should always be correct. + */ + +int iso_date(char * p, int flag) +{ + int year, month, day, hour, minute, second, tz; + int crtime; + + year = p[0]; + month = p[1]; + day = p[2]; + hour = p[3]; + minute = p[4]; + second = p[5]; + if (flag == 0) tz = p[6]; /* High sierra has no time zone */ + else tz = 0; + + if (year < 0) { + crtime = 0; + } else { + crtime = mktime64(year+1900, month, day, hour, minute, second); + + /* sign extend */ + if (tz & 0x80) + tz |= (-1 << 8); + + /* + * The timezone offset is unreliable on some disks, + * so we make a sanity check. In no case is it ever + * more than 13 hours from GMT, which is 52*15min. + * The time is always stored in localtime with the + * timezone offset being what get added to GMT to + * get to localtime. Thus we need to subtract the offset + * to get to true GMT, which is what we store the time + * as internally. On the local system, the user may set + * their timezone any way they wish, of course, so GMT + * gets converted back to localtime on the receiving + * system. + * + * NOTE: mkisofs in versions prior to mkisofs-1.10 had + * the sign wrong on the timezone offset. This has now + * been corrected there too, but if you are getting screwy + * results this may be the explanation. If enough people + * complain, a user configuration option could be added + * to add the timezone offset in with the wrong sign + * for 'compatibility' with older discs, but I cannot see how + * it will matter that much. + * + * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann) + * for pointing out the sign error. + */ + if (-52 <= tz && tz <= 52) + crtime -= tz * 15 * 60; + } + return crtime; +} diff --git a/kernel/fs/isofs/zisofs.h b/kernel/fs/isofs/zisofs.h new file mode 100644 index 000000000..273795709 --- /dev/null +++ b/kernel/fs/isofs/zisofs.h @@ -0,0 +1,21 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2001 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * Prototypes for functions exported from the compressed isofs subsystem + */ + +#ifdef CONFIG_ZISOFS +extern const struct address_space_operations zisofs_aops; +extern int __init zisofs_init(void); +extern void zisofs_cleanup(void); +#endif diff --git a/kernel/fs/jbd/Kconfig b/kernel/fs/jbd/Kconfig new file mode 100644 index 000000000..4e28beeed --- /dev/null +++ b/kernel/fs/jbd/Kconfig @@ -0,0 +1,30 @@ +config JBD + tristate + help + This is a generic journalling layer for block devices. It is + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block + devices such as RAID or LVM. + + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. + + To compile this device as a module, choose M here: the module will be + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. + +config JBD_DEBUG + bool "JBD (ext3) debugging support" + depends on JBD && DEBUG_FS + help + If you are using the ext3 journaled file system (or potentially any + other file system/device using JBD), this option allows you to + enable debugging output while the system is running, in order to + help track down any problems you are having. By default the + debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a + number between 1 and 5, the higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd/jbd-debug". diff --git a/kernel/fs/jbd/Makefile b/kernel/fs/jbd/Makefile new file mode 100644 index 000000000..54aca4868 --- /dev/null +++ b/kernel/fs/jbd/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux journaling routines. +# + +obj-$(CONFIG_JBD) += jbd.o + +jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/kernel/fs/jbd/checkpoint.c b/kernel/fs/jbd/checkpoint.c new file mode 100644 index 000000000..95debd71e --- /dev/null +++ b/kernel/fs/jbd/checkpoint.c @@ -0,0 +1,784 @@ +/* + * linux/fs/jbd/checkpoint.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Unlink a buffer from a transaction checkpoint list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink_first(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) { + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; +} + +/* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * + * Requires j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __try_to_free_cp_buf(struct journal_head *jh) +{ + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); + JBUFFER_TRACE(jh, "remove from checkpoint list"); + ret = __journal_remove_checkpoint(jh) + 1; + jbd_unlock_bh_state(bh); + BUFFER_TRACE(bh, "release"); + __brelse(bh); + } else { + jbd_unlock_bh_state(bh); + } + return ret; +} + +/* + * __log_wait_for_space: wait until there is space in the journal. + * + * Called under j-state_lock *only*. It will be unlocked if we have to wait + * for a checkpoint to free up some space in the log. + */ +void __log_wait_for_space(journal_t *journal) +{ + int nblocks, space_left; + assert_spin_locked(&journal->j_state_lock); + + nblocks = jbd_space_needed(journal); + while (__log_space_left(journal) < nblocks) { + if (journal->j_flags & JFS_ABORT) + return; + spin_unlock(&journal->j_state_lock); + if (current->plug) + io_schedule(); + mutex_lock(&journal->j_checkpoint_mutex); + + /* + * Test again, another process may have checkpointed while we + * were waiting for the checkpoint lock. If there are no + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. + */ + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + nblocks = jbd_space_needed(journal); + space_left = __log_space_left(journal); + if (space_left < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; + spin_unlock(&journal->j_list_lock); + spin_unlock(&journal->j_state_lock); + if (chkpt) { + log_do_checkpoint(journal); + } else if (cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + log_wait_commit(journal, tid); + } else { + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space\n", __func__); + WARN_ON(1); + journal_abort(journal, 0); + } + spin_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); + } + mutex_unlock(&journal->j_checkpoint_mutex); + } +} + +/* + * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. + * The caller must restart a list walk. Wait for someone else to run + * jbd_unlock_bh_state(). + */ +static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) + __releases(journal->j_list_lock) +{ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_lock_bh_state(bh); + jbd_unlock_bh_state(bh); + put_bh(bh); +} + +/* + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. + * + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. + * + * Called with j_list_lock held. + */ +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) +{ + struct journal_head *jh; + struct buffer_head *bh; + tid_t this_tid; + int released = 0; + int ret = 0; + + this_tid = transaction->t_tid; +restart: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return ret; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + + /* + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list + */ + released = __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + __brelse(bh); + } + + return ret; +} + +#define NR_BATCH 64 + +static void +__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) +{ + int i; + struct blk_plug plug; + + blk_start_plug(&plug); + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); + + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = bhs[i]; + clear_buffer_jwrite(bh); + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + *batch_count = 0; +} + +/* + * Try to flush one buffer from the checkpoint list to disk. + * + * Return 1 if something happened which requires us to abort the current + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. + * + * Called with j_list_lock held and drops it if 1 is returned + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __process_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count) +{ + struct buffer_head *bh = jh2bh(jh); + int ret = 0; + + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + ret = 1; + } else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + ret = 1; + } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + get_bh(bh); + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); + } else { + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal lock. + * We cannot afford to let the transaction logic start + * messing around with this buffer before we write it to + * disk, as that would break recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + set_buffer_jwrite(bh); + bhs[*batch_count] = bh; + __buffer_relink_io(jh); + jbd_unlock_bh_state(bh); + (*batch_count)++; + if (*batch_count == NR_BATCH) { + spin_unlock(&journal->j_list_lock); + __flush_batch(journal, bhs, batch_count); + ret = 1; + } + } + return ret; +} + +/* + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. + * + * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. + */ +int log_do_checkpoint(journal_t *journal) +{ + transaction_t *transaction; + tid_t this_tid; + int result; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = cleanup_journal_tail(journal); + trace_jbd_checkpoint(journal, result); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Take one transaction + * and write it. + */ + result = 0; + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; +restart: + /* + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). + */ + if (journal->j_checkpoint_transactions == transaction && + transaction->t_tid == this_tid) { + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + struct journal_head *jh; + int retry = 0, err; + + while (!retry && transaction->t_checkpoint_list) { + struct buffer_head *bh; + + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + retry = 1; + break; + } + retry = __process_buffer(journal, jh, bhs,&batch_count); + if (retry < 0 && !result) + result = retry; + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { + spin_unlock(&journal->j_list_lock); + retry = 1; + break; + } + } + + if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } + __flush_batch(journal, bhs, &batch_count); + } + + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } + /* + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one + */ + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; + } +out: + spin_unlock(&journal->j_list_lock); + if (result < 0) + journal_abort(journal, result); + else + result = cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; +} + +/* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. + */ + +int cleanup_journal_tail(journal_t *journal) +{ + transaction_t * transaction; + tid_t first_tid; + unsigned int blocknr, freed; + + if (is_journal_aborted(journal)) + return 1; + + /* + * OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * + * If the log is now empty, we need to work out which is the + * next transaction ID we will write, and where it will + * start. + */ + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = journal->j_head; + } else { + first_tid = journal->j_transaction_sequence; + blocknr = journal->j_head; + } + spin_unlock(&journal->j_list_lock); + J_ASSERT(blocknr != 0); + + /* If the oldest pinned transaction is at the tail of the log + already then there's not much we can do right now. */ + if (journal->j_tail_sequence == first_tid) { + spin_unlock(&journal->j_state_lock); + return 1; + } + spin_unlock(&journal->j_state_lock); + + /* + * We need to make sure that any blocks that were recently written out + * --- perhaps by log_do_checkpoint() --- are flushed out before we + * drop the transactions from the journal. Similarly we need to be sure + * superblock makes it to disk before next transaction starts reusing + * freed space (otherwise we could replay some blocks of the new + * transaction thinking they belong to the old one). So we use + * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially + * with an appropriately sized journal, but we need this to guarantee + * correctness. Fortunately cleanup_journal_tail() doesn't get called + * all that often. + */ + journal_update_sb_log_tail(journal, first_tid, blocknr, + WRITE_FLUSH_FUA); + + spin_lock(&journal->j_state_lock); + /* OK, update the superblock to recover the freed space. + * Physical blocks come first: have we wrapped beyond the end of + * the log? */ + freed = blocknr - journal->j_tail; + if (blocknr < journal->j_tail) + freed = freed + journal->j_last - journal->j_first; + + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %u), " + "freeing %u\n", + journal->j_tail_sequence, first_tid, blocknr, freed); + + journal->j_free += freed; + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + spin_unlock(&journal->j_state_lock); + return 0; +} + + +/* Checkpoint list management */ + +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release + * them. + * + * Called with j_list_lock held. + * Returns number of buffers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of buffers reaped (for debug) + */ + +int __journal_clean_checkpoint_list(journal_t *journal) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + int ret = 0; + int released; + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + goto out; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; + } while (transaction != last_transaction); +out: + return ret; +} + +/* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * lists until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. + * + * This function is called with j_list_lock held. + * This function is called with jbd_lock_bh_state(jh2bh(jh)) + */ + +int __journal_remove_checkpoint(struct journal_head *jh) +{ + transaction_t *transaction; + journal_t *journal; + int ret = 0; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + journal = transaction->t_journal; + + JBUFFER_TRACE(jh, "removing from transaction"); + __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + journal_put_journal_head(jh); + + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) + goto out; + + /* + * There is one special case to worry about: if we have just pulled the + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! + * + * The locking here around t_state is a bit sleazy. + * See the comment at the end of journal_commit_transaction(). + */ + if (transaction->t_state != T_FINISHED) + goto out; + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + + __journal_drop_transaction(journal, transaction); + + /* Just in case anybody was waiting for more transactions to be + checkpointed... */ + wake_up(&journal->j_wait_logspace); + ret = 1; +out: + return ret; +} + +/* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ +void __journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) +{ + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + /* Get reference for checkpointing transaction */ + journal_grab_journal_head(jh2bh(jh)); + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; +} + +/* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ + +void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) +{ + assert_spin_locked(&journal->j_list_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT(transaction->t_state == T_FINISHED); + J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_sync_datalist == NULL); + J_ASSERT(transaction->t_forget == NULL); + J_ASSERT(transaction->t_iobuf_list == NULL); + J_ASSERT(transaction->t_shadow_list == NULL); + J_ASSERT(transaction->t_log_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(transaction->t_updates == 0); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + trace_jbd_drop_transaction(journal, transaction); + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); + kfree(transaction); +} diff --git a/kernel/fs/jbd/commit.c b/kernel/fs/jbd/commit.c new file mode 100644 index 000000000..bb217dcb4 --- /dev/null +++ b/kernel/fs/jbd/commit.c @@ -0,0 +1,1021 @@ +/* + * linux/fs/jbd/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Default IO end handler for temporary BJ_IO buffer_heads. + */ +static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + BUFFER_TRACE(bh, ""); + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); +} + +/* + * When an ext3-ordered file is truncated, it is possible that many pages are + * not successfully freed, because they are attached to a committing transaction. + * After the transaction commits, these pages are left on the LRU, with no + * ->mapping, and with attached buffers. These pages are trivially reclaimable + * by the VM, but their apparent absence upsets the VM accounting, and it makes + * the numbers in /proc/meminfo look odd. + * + * So here, we have a buffer which has just come off the forget list. Look to + * see if we can strip all buffers from the backing page. + * + * Called under journal->j_list_lock. The caller provided us with a ref + * against the buffer, and we drop that here. + */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page; + + if (buffer_dirty(bh)) + goto nope; + if (atomic_read(&bh->b_count) != 1) + goto nope; + page = bh->b_page; + if (!page) + goto nope; + if (page->mapping) + goto nope; + + /* OK, it's a truncated page */ + if (!trylock_page(page)) + goto nope; + + page_cache_get(page); + __brelse(bh); + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + return; + +nope: + __brelse(bh); +} + +/* + * Decrement reference counter for data buffer. If it has been marked + * 'BH_Freed', release it and the page to which it belongs if possible. + */ +static void release_data_buffer(struct buffer_head *bh) +{ + if (buffer_freed(bh)) { + WARN_ON_ONCE(buffer_dirty(bh)); + clear_buffer_freed(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + release_buffer_page(bh); + } else + put_bh(bh); +} + +/* + * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is + * held. For ranking reasons we must trylock. If we lose, schedule away and + * return 0. j_list_lock is dropped in this case. + */ +static int inverted_lock(journal_t *journal, struct buffer_head *bh) +{ + if (!jbd_trylock_bh_state(bh)) { + spin_unlock(&journal->j_list_lock); + schedule(); + return 0; + } + return 1; +} + +/* Done it all: now write the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +static int journal_write_commit_record(journal_t *journal, + transaction_t *commit_transaction) +{ + struct journal_head *descriptor; + struct buffer_head *bh; + journal_header_t *header; + int ret; + + if (is_journal_aborted(journal)) + return 0; + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) + return 1; + + bh = jh2bh(descriptor); + + header = (journal_header_t *)(bh->b_data); + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + JBUFFER_TRACE(descriptor, "write commit block"); + set_buffer_dirty(bh); + + if (journal->j_flags & JFS_BARRIER) + ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); + else + ret = sync_dirty_buffer(bh); + + put_bh(bh); /* One for getblk() */ + journal_put_journal_head(descriptor); + + return (ret == -EIO); +} + +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, + int write_op) +{ + int i; + + for (i = 0; i < bufs; i++) { + wbuf[i]->b_end_io = end_buffer_write_sync; + /* + * Here we write back pagecache data that may be mmaped. Since + * we cannot afford to clean the page and set PageWriteback + * here due to lock ordering (page lock ranks above transaction + * start), the data can change while IO is in flight. Tell the + * block layer it should bounce the bio pages if stable data + * during write is required. + * + * We use up our safety reference in submit_bh(). + */ + _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); + } +} + +/* + * Submit all the data buffers to disk + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction, + int write_op) +{ + struct journal_head *jh; + struct buffer_head *bh; + int locked; + int bufs = 0; + struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; + + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_sync_datalist, so we have to keep looping back to + * write_out_data until we *know* that the list is empty. + * + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + */ +write_out_data: + cond_resched(); + spin_lock(&journal->j_list_lock); + + while (commit_transaction->t_sync_datalist) { + jh = commit_transaction->t_sync_datalist; + bh = jh2bh(jh); + locked = 0; + + /* Get reference just to make sure buffer does not disappear + * when we are forced to drop various locks */ + get_bh(bh); + /* If the buffer is dirty, we need to submit IO and hence + * we need the buffer lock. We try to lock the buffer without + * blocking. If we fail, we need to drop j_list_lock and do + * blocking lock_buffer(). + */ + if (buffer_dirty(bh)) { + if (!trylock_buffer(bh)) { + BUFFER_TRACE(bh, "needs blocking lock"); + spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); + /* Write out all data to prevent deadlocks */ + journal_do_submit_data(wbuf, bufs, write_op); + bufs = 0; + lock_buffer(bh); + spin_lock(&journal->j_list_lock); + } + locked = 1; + } + /* We have to get bh_state lock. Again out of order, sigh. */ + if (!inverted_lock(journal, bh)) { + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + } + /* Someone already cleaned up the buffer? */ + if (!buffer_jbd(bh) || bh2jh(bh) != jh + || jh->b_transaction != commit_transaction + || jh->b_jlist != BJ_SyncData) { + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + BUFFER_TRACE(bh, "already cleaned up"); + release_data_buffer(bh); + continue; + } + if (locked && test_clear_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "needs writeout, adding to array"); + wbuf[bufs++] = bh; + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + if (bufs == journal->j_wbufsize) { + spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); + journal_do_submit_data(wbuf, bufs, write_op); + bufs = 0; + goto write_out_data; + } + } else if (!locked && buffer_locked(bh)) { + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + put_bh(bh); + } else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + release_data_buffer(bh); + } + + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; + } + } + spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, commit_transaction); + journal_do_submit_data(wbuf, bufs, write_op); + + return err; +} + +/* + * journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ +void journal_commit_transaction(journal_t *journal) +{ + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; + int bufs; + int flags; + int err; + unsigned int blocknr; + ktime_t start_time; + u64 commit_time; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i; + struct blk_plug plug; + int write_op = WRITE; + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + + /* Do we need to erase the effects of a prior journal_flush? */ + if (journal->j_flags & JFS_FLUSHED) { + jbd_debug(3, "super block updated\n"); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + journal_update_sb_log_tail(journal, journal->j_tail_sequence, + journal->j_tail, WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + J_ASSERT(journal->j_running_transaction != NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + + trace_jbd_start_commit(journal, commit_transaction); + jbd_debug(1, "JBD: starting commit of transaction %d\n", + commit_transaction->t_tid); + + spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); + commit_transaction->t_state = T_LOCKED; + + trace_jbd_commit_locking(journal, commit_transaction); + spin_lock(&commit_transaction->t_handle_lock); + while (commit_transaction->t_updates) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (commit_transaction->t_updates) { + spin_unlock(&commit_transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + schedule(); + spin_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); + + J_ASSERT (commit_transaction->t_outstanding_credits <= + journal->j_max_transaction_buffers); + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple journal_get_write_access() calls to the same + * buffer are perfectly permissible. + */ + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + /* + * A journal_get_undo_access()+journal_release_buffer() may + * leave undo-committed data. + */ + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + jbd_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + jbd_unlock_bh_state(bh); + } + journal_refile_buffer(journal, jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal->j_list_lock); + __journal_clean_checkpoint_list(journal); + spin_unlock(&journal->j_list_lock); + + jbd_debug (3, "JBD: commit phase 1\n"); + + /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + journal_clear_buffer_revoked_flags(journal); + + /* + * Switch to a new revoke table. + */ + journal_switch_revoke_table(journal); + + trace_jbd_commit_flushing(journal, commit_transaction); + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + start_time = ktime_get(); + commit_transaction->t_log_start = journal->j_head; + wake_up(&journal->j_wait_transaction_locked); + spin_unlock(&journal->j_state_lock); + + jbd_debug (3, "JBD: commit phase 2\n"); + + if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) + write_op = WRITE_SYNC; + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ + blk_start_plug(&plug); + err = journal_submit_data_buffers(journal, commit_transaction, + write_op); + blk_finish_plug(&plug); + + /* + * Wait for all previously submitted IO to complete. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + + jh = commit_transaction->t_locked_list->b_tprev; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + spin_lock(&journal->j_list_lock); + } + if (unlikely(!buffer_uptodate(bh))) { + if (!trylock_page(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + err = -EIO; + } + if (!inverted_lock(journal, bh)) { + put_bh(bh); + spin_lock(&journal->j_list_lock); + continue; + } + if (buffer_jbd(bh) && bh2jh(bh) == jh && + jh->b_transaction == commit_transaction && + jh->b_jlist == BJ_Locked) + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + release_data_buffer(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (err) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); + err = 0; + } + + blk_start_plug(&plug); + + journal_write_revoke_records(journal, commit_transaction, write_op); + + /* + * If we found any dirty or locked buffers, then we should have + * looped back up to the write_out_data label. If there weren't + * any then journal_clean_data_list should have wiped the list + * clean by now, so check that it is in fact empty. + */ + J_ASSERT (commit_transaction->t_sync_datalist == NULL); + + jbd_debug (3, "JBD: commit phase 3\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + spin_lock(&journal->j_state_lock); + commit_transaction->t_state = T_COMMIT; + spin_unlock(&journal->j_state_lock); + + trace_jbd_commit_logging(journal, commit_transaction); + J_ASSERT(commit_transaction->t_nr_buffers <= + commit_transaction->t_outstanding_credits); + + descriptor = NULL; + bufs = 0; + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it. */ + + if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); + JBUFFER_TRACE(jh, "journal is aborting: refile"); + journal_refile_buffer(journal, jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. */ + + if (!descriptor) { + struct buffer_head *bh; + + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) { + journal_abort(journal, -EIO); + continue; + } + + bh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %llu (%p)\n", + (unsigned long long)bh->b_blocknr, bh->b_data); + header = (journal_header_t *)&bh->b_data[0]; + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + space_left = bh->b_size - sizeof(journal_header_t); + first_tag = 1; + set_buffer_jwrite(bh); + set_buffer_dirty(bh); + wbuf[bufs++] = bh; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(bh, "ph3: file as descriptor"); + journal_file_buffer(descriptor, commit_transaction, + BJ_LogCtl); + } + + /* Where is the buffer to be written? */ + + err = journal_next_log_block(journal, &blocknr); + /* If the block mapping failed, just abandon the buffer + and repeat this loop: we'll fall into the + refile-on-abort condition above. */ + if (err) { + journal_abort(journal, err); + continue; + } + + /* + * start_this_handle() uses t_outstanding_credits to determine + * the free space in the log, but this counter is changed + * by journal_next_log_block() also. + */ + commit_transaction->t_outstanding_credits--; + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + get_bh(jh2bh(jh)); + + /* Make a temporary IO buffer with which to write it out + (this will requeue both the metadata buffer and the + temporary IO buffer). new_bh goes on BJ_IO*/ + + set_buffer_jwrite(jh2bh(jh)); + /* + * akpm: journal_write_metadata_buffer() sets + * new_bh->b_transaction to commit_transaction. + * We need to clean this up before we release new_bh + * (which is of type BJ_IO) + */ + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); + set_buffer_jwrite(jh2bh(new_jh)); + wbuf[bufs++] = jh2bh(new_jh); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JFS_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JFS_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); + tag->t_flags = cpu_to_be32(tag_flag); + tagp += sizeof(journal_block_tag_t); + space_left -= sizeof(journal_block_tag_t); + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == journal->j_wbufsize || + commit_transaction->t_buffers == NULL || + space_left < sizeof(journal_block_tag_t) + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); + +start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + /* + * In data=journal mode, here we can end up + * writing pagecache data that might be + * mmapped. Since we can't afford to clean the + * page and set PageWriteback (see the comment + * near the other use of _submit_bh()), the + * data can change while the write is in + * flight. Tell the block layer to bounce the + * bio pages if stable pages are required. + */ + _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); + } + cond_resched(); + + /* Force a new descriptor to be generated next + time round the loop. */ + descriptor = NULL; + bufs = 0; + } + } + + blk_finish_plug(&plug); + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + + Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD: commit phase 4\n"); + + /* + * akpm: these are BJ_IO, and j_list_lock is not needed. + * See __journal_try_to_free_buffer. + */ +wait_for_iobuf: + while (commit_transaction->t_iobuf_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_iobuf_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_iobuf; + } + if (cond_resched()) + goto wait_for_iobuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + clear_buffer_jwrite(bh); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); + journal_unfile_buffer(journal, jh); + + /* + * ->t_iobuf_list should contain only dummy buffer_heads + * which were created by journal_write_metadata_buffer(). + */ + BUFFER_TRACE(bh, "dumping temporary bh"); + journal_put_journal_head(jh); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + free_buffer_head(bh); + + /* We also have to unlock and free the corresponding + shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_buffer_jwrite(bh); + J_ASSERT_BH(bh, buffer_jbddirty(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + journal_file_buffer(jh, commit_transaction, BJ_Forget); + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); + wake_up_bit(&bh->b_state, BH_Unshadow); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD: commit phase 5\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + wait_for_ctlbuf: + while (commit_transaction->t_log_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_ctlbuf; + } + if (cond_resched()) + goto wait_for_ctlbuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + journal_unfile_buffer(journal, jh); + journal_put_journal_head(jh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + if (err) + journal_abort(journal, err); + + jbd_debug(3, "JBD: commit phase 6\n"); + + /* All metadata is written, now write commit record and do cleanup */ + spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_RECORD; + spin_unlock(&journal->j_state_lock); + + if (journal_write_commit_record(journal, commit_transaction)) + err = -EIO; + + if (err) + journal_abort(journal, err); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_iobuf_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + J_ASSERT(commit_transaction->t_log_list == NULL); + +restart_loop: + /* + * As there are other places (journal_unmap_buffer()) adding buffers + * to this list we have to be careful and hold the j_list_lock. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + int try_to_free = 0; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); + bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || + jh->b_transaction == journal->j_running_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + */ + if (jh->b_committed_data) { + jbd_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + } + } else if (jh->b_frozen_data) { + jbd_free(jh->b_frozen_data, bh->b_size); + jh->b_frozen_data = NULL; + } + + spin_lock(&journal->j_list_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + __journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + + /* + * A buffer which has been freed while still being journaled by + * a previous transaction. + */ + if (buffer_freed(bh)) { + /* + * If the running transaction is the one containing + * "add to orphan" operation (b_next_transaction != + * NULL), we have to wait for that transaction to + * commit before we can really get rid of the buffer. + * So just clear b_modified to not confuse transaction + * credit accounting and refile the buffer to + * BJ_Forget of the running transaction. If the just + * committed transaction contains "add to orphan" + * operation, we can completely invalidate the buffer + * now. We are rather throughout in that since the + * buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial + * page. + */ + jh->b_modified = 0; + if (!jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } + } + + if (buffer_jbddirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + /* + * The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; + } + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); + else + __brelse(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + /* + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... + */ + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + /* + * Now recheck if some buffers did not get attached to the transaction + * while the lock was dropped... + */ + if (commit_transaction->t_forget) { + spin_unlock(&journal->j_list_lock); + spin_unlock(&journal->j_state_lock); + goto restart_loop; + } + + /* Done with this transaction! */ + + jbd_debug(3, "JBD: commit phase 8\n"); + + J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); + + commit_transaction->t_state = T_FINISHED; + J_ASSERT(commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time*3 + + journal->j_average_commit_time) / 4; + else + journal->j_average_commit_time = commit_time; + + spin_unlock(&journal->j_state_lock); + + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __journal_drop_transaction(journal, commit_transaction); + } else { + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + } + spin_unlock(&journal->j_list_lock); + + trace_jbd_end_commit(journal, commit_transaction); + jbd_debug(1, "JBD: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + + wake_up(&journal->j_wait_done_commit); +} diff --git a/kernel/fs/jbd/journal.c b/kernel/fs/jbd/journal.c new file mode 100644 index 000000000..c46a79adb --- /dev/null +++ b/kernel/fs/jbd/journal.c @@ -0,0 +1,2145 @@ +/* + * linux/fs/jbd/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. + * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. + * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include +#include + +EXPORT_SYMBOL(journal_start); +EXPORT_SYMBOL(journal_restart); +EXPORT_SYMBOL(journal_extend); +EXPORT_SYMBOL(journal_stop); +EXPORT_SYMBOL(journal_lock_updates); +EXPORT_SYMBOL(journal_unlock_updates); +EXPORT_SYMBOL(journal_get_write_access); +EXPORT_SYMBOL(journal_get_create_access); +EXPORT_SYMBOL(journal_get_undo_access); +EXPORT_SYMBOL(journal_dirty_data); +EXPORT_SYMBOL(journal_dirty_metadata); +EXPORT_SYMBOL(journal_release_buffer); +EXPORT_SYMBOL(journal_forget); +#if 0 +EXPORT_SYMBOL(journal_sync_buffer); +#endif +EXPORT_SYMBOL(journal_flush); +EXPORT_SYMBOL(journal_revoke); + +EXPORT_SYMBOL(journal_init_dev); +EXPORT_SYMBOL(journal_init_inode); +EXPORT_SYMBOL(journal_update_format); +EXPORT_SYMBOL(journal_check_used_features); +EXPORT_SYMBOL(journal_check_available_features); +EXPORT_SYMBOL(journal_set_features); +EXPORT_SYMBOL(journal_create); +EXPORT_SYMBOL(journal_load); +EXPORT_SYMBOL(journal_destroy); +EXPORT_SYMBOL(journal_abort); +EXPORT_SYMBOL(journal_errno); +EXPORT_SYMBOL(journal_ack_err); +EXPORT_SYMBOL(journal_clear_err); +EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(log_start_commit); +EXPORT_SYMBOL(journal_start_commit); +EXPORT_SYMBOL(journal_force_commit_nested); +EXPORT_SYMBOL(journal_wipe); +EXPORT_SYMBOL(journal_blocks_per_page); +EXPORT_SYMBOL(journal_invalidatepage); +EXPORT_SYMBOL(journal_try_to_free_buffers); +EXPORT_SYMBOL(journal_force_commit); + +static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +static void __journal_abort_soft (journal_t *journal, int errno); +static const char *journal_dev_name(journal_t *journal, char *buffer); + +#ifdef CONFIG_JBD_DEBUG +void __jbd_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd_debug); +#endif + +/* + * Helper function used to manage commit timeouts + */ + +static void commit_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); +} + +/* + * kjournald: The main thread function used to manage a logging device + * journal. + * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. + */ + +static int kjournald(void *arg) +{ + journal_t *journal = arg; + transaction_t *transaction; + + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + setup_timer(&journal->j_commit_timer, commit_timeout, + (unsigned long)current); + + set_freezable(); + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", + journal->j_commit_interval / HZ); + + /* + * And now, wait forever for commit wakeup events. + */ + spin_lock(&journal->j_state_lock); + +loop: + if (journal->j_flags & JFS_UNMOUNT) + goto end_loop; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + spin_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal_commit_transaction(journal); + spin_lock(&journal->j_state_lock); + goto loop; + } + + wake_up(&journal->j_wait_done_commit); + if (freezing(current)) { + /* + * The simpler the better. Flushing journal isn't a + * good idea, because that depends on threads that may + * be already stopped. + */ + jbd_debug(1, "Now suspending kjournald\n"); + spin_unlock(&journal->j_state_lock); + try_to_freeze(); + spin_lock(&journal->j_state_lock); + } else { + /* + * We assume on resume that commits are already there, + * so we don't sleep + */ + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&journal->j_wait_commit, &wait, + TASK_INTERRUPTIBLE); + if (journal->j_commit_sequence != journal->j_commit_request) + should_sleep = 0; + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, + transaction->t_expires)) + should_sleep = 0; + if (journal->j_flags & JFS_UNMOUNT) + should_sleep = 0; + if (should_sleep) { + spin_unlock(&journal->j_state_lock); + schedule(); + spin_lock(&journal->j_state_lock); + } + finish_wait(&journal->j_wait_commit, &wait); + } + + jbd_debug(1, "kjournald wakes\n"); + + /* + * Were we woken up by a commit wakeup event? + */ + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + goto loop; + +end_loop: + spin_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; +} + +static int journal_start_thread(journal_t *journal) +{ + struct task_struct *t; + + t = kthread_run(kjournald, journal, "kjournald"); + if (IS_ERR(t)) + return PTR_ERR(t); + + wait_event(journal->j_wait_done_commit, journal->j_task != NULL); + return 0; +} + +static void journal_kill_thread(journal_t *journal) +{ + spin_lock(&journal->j_state_lock); + journal->j_flags |= JFS_UNMOUNT; + + while (journal->j_task) { + wake_up(&journal->j_wait_commit); + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + journal->j_task == NULL); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); +} + +/* + * journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we *have* to lock the buffer to prevent anyone + * else from using and possibly modifying it while the IO is in + * progress. + * + * The function returns a pointer to the buffer_heads to be used for IO. + * + * We assume that the journal has already been locked in this function. + * + * Return value: + * <0: Error + * >=0: Finished OK + * + * On success: + * Bit 0 set == escape performed on the data + * Bit 1 set == buffer copy-out performed (kfree the data after IO) + */ + +int journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct journal_head **jh_out, + unsigned int blocknr) +{ + int need_copy_out = 0; + int done_copy_out = 0; + int do_escape = 0; + char *mapped_data; + struct buffer_head *new_bh; + struct journal_head *new_jh; + struct page *new_page; + unsigned int new_offset; + struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. + */ + J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ + + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + jbd_lock_bh_state(bh_in); +repeat: + if (jh_in->b_frozen_data) { + done_copy_out = 1; + new_page = virt_to_page(jh_in->b_frozen_data); + new_offset = offset_in_page(jh_in->b_frozen_data); + } else { + new_page = jh2bh(jh_in)->b_page; + new_offset = offset_in_page(jh2bh(jh_in)->b_data); + } + + mapped_data = kmap_atomic(new_page); + /* + * Check for escaping + */ + if (*((__be32 *)(mapped_data + new_offset)) == + cpu_to_be32(JFS_MAGIC_NUMBER)) { + need_copy_out = 1; + do_escape = 1; + } + kunmap_atomic(mapped_data); + + /* + * Do we need to do a data copy? + */ + if (need_copy_out && !done_copy_out) { + char *tmp; + + jbd_unlock_bh_state(bh_in); + tmp = jbd_alloc(bh_in->b_size, GFP_NOFS); + jbd_lock_bh_state(bh_in); + if (jh_in->b_frozen_data) { + jbd_free(tmp, bh_in->b_size); + goto repeat; + } + + jh_in->b_frozen_data = tmp; + mapped_data = kmap_atomic(new_page); + memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); + kunmap_atomic(mapped_data); + + new_page = virt_to_page(tmp); + new_offset = offset_in_page(tmp); + done_copy_out = 1; + } + + /* + * Did we need to do an escaping? Now we've done all the + * copying, we can finally do so. + */ + if (do_escape) { + mapped_data = kmap_atomic(new_page); + *((unsigned int *)(mapped_data + new_offset)) = 0; + kunmap_atomic(mapped_data); + } + + set_bh_page(new_bh, new_page, new_offset); + new_jh->b_transaction = NULL; + new_bh->b_size = jh2bh(jh_in)->b_size; + new_bh->b_bdev = transaction->t_journal->j_dev; + new_bh->b_blocknr = blocknr; + set_buffer_mapped(new_bh); + set_buffer_dirty(new_bh); + + *jh_out = new_jh; + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + + JBUFFER_TRACE(new_jh, "file as BJ_IO"); + journal_file_buffer(new_jh, transaction, BJ_IO); + + return do_escape | (done_copy_out << 1); +} + +/* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + +/* + * __log_space_left: Return the number of free blocks left in the journal. + * + * Called with the journal already locked. + * + * Called under j_state_lock + */ + +int __log_space_left(journal_t *journal) +{ + int left = journal->j_free; + + assert_spin_locked(&journal->j_state_lock); + + /* + * Be pessimistic here about the number of those free blocks which + * might be required for log descriptor control blocks. + */ + +#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ + + left -= MIN_LOG_RESERVED_BLOCKS; + + if (left <= 0) + return 0; + left -= (left >> 3); + return left; +} + +/* + * Called under j_state_lock. Returns true if a transaction commit was started. + */ +int __log_start_commit(journal_t *journal, tid_t target) +{ + /* + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. + */ + if (journal->j_commit_request != target && + journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { + /* + * We want a new commit: OK, mark the request and wakeup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd_debug(1, "JBD: requesting commit %d/%d\n", + journal->j_commit_request, + journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + return 1; + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + return 0; +} + +int log_start_commit(journal_t *journal, tid_t tid) +{ + int ret; + + spin_lock(&journal->j_state_lock); + ret = __log_start_commit(journal, tid); + spin_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * We can only force the running transaction if we don't have an active handle; + * otherwise, we will deadlock. + * + * Returns true if a transaction was started. + */ +int journal_force_commit_nested(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return 0; /* Nothing to retry */ + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + return 1; +} + +/* + * Start a commit of the current running transaction (if any). Returns true + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid + */ +int journal_start_commit(journal_t *journal, tid_t *ptid) +{ + int ret = 0; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) { + tid_t tid = journal->j_running_transaction->t_tid; + + __log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) + *ptid = tid; + ret = 1; + } else if (journal->j_committing_transaction) { + /* + * If commit has been started, then we have to wait for + * completion of that transaction. + */ + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; + ret = 1; + } + spin_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ +int log_wait_commit(journal_t *journal, tid_t tid) +{ + int err = 0; + +#ifdef CONFIG_JBD_DEBUG + spin_lock(&journal->j_state_lock); + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_ERR + "%s: error: j_commit_request=%d, tid=%d\n", + __func__, journal->j_commit_request, tid); + } + spin_unlock(&journal->j_state_lock); +#endif + spin_lock(&journal->j_state_lock); + /* + * Not running or committing trans? Must be already committed. This + * saves us from waiting for a *long* time when tid overflows. + */ + if (!((journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) || + (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid))) + goto out_unlock; + + if (!tid_geq(journal->j_commit_waited, tid)) + journal->j_commit_waited = tid; + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + tid, journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + !tid_gt(tid, journal->j_commit_sequence)); + spin_lock(&journal->j_state_lock); + } +out_unlock: + spin_unlock(&journal->j_state_lock); + + if (unlikely(is_journal_aborted(journal))) + err = -EIO; + return err; +} + +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JFS_BARRIER)) + return 0; + spin_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + /* + * Transaction is being committed and we already proceeded to + * writing commit record? + */ + commit_trans = journal->j_committing_transaction; + if (commit_trans && commit_trans->t_tid == tid && + commit_trans->t_state >= T_COMMIT_RECORD) + goto out; + ret = 1; +out: + spin_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(journal_trans_will_send_data_barrier); + +/* + * Log buffer allocation routines: + */ + +int journal_next_log_block(journal_t *journal, unsigned int *retp) +{ + unsigned int blocknr; + + spin_lock(&journal->j_state_lock); + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + spin_unlock(&journal->j_state_lock); + return journal_bmap(journal, blocknr, retp); +} + +/* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ +int journal_bmap(journal_t *journal, unsigned int blocknr, + unsigned int *retp) +{ + int err = 0; + unsigned int ret; + + if (journal->j_inode) { + ret = bmap(journal->j_inode, blocknr); + if (ret) + *retp = ret; + else { + char b[BDEVNAME_SIZE]; + + printk(KERN_ALERT "%s: journal block not found " + "at offset %u on %s\n", + __func__, + blocknr, + bdevname(journal->j_dev, b)); + err = -EIO; + __journal_abort_soft(journal, err); + } + } else { + *retp = blocknr; /* +journal->j_blk_offset */ + } + return err; +} + +/* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + * + * After the caller of journal_get_descriptor_buffer() has finished modifying + * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * But we don't bother doing that, so there will be coherency problems with + * mmaps of blockdevs which hold live JBD-controlled filesystems. + */ +struct journal_head *journal_get_descriptor_buffer(journal_t *journal) +{ + struct buffer_head *bh; + unsigned int blocknr; + int err; + + err = journal_next_log_block(journal, &blocknr); + + if (err) + return NULL; + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; + lock_buffer(bh); + memset(bh->b_data, 0, journal->j_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "return this buffer"); + return journal_add_journal_head(bh); +} + +/* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. */ + +/* First: create and setup a journal_t object in memory. We initialise + * very few fields yet: that has to wait until we have created the + * journal structures from from scratch, or loaded them from disk. */ + +static journal_t * journal_init_common (void) +{ + journal_t *journal; + int err; + + journal = kzalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + goto fail; + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_logspace); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_checkpoint); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + mutex_init(&journal->j_checkpoint_mutex); + spin_lock_init(&journal->j_revoke_lock); + spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_state_lock); + + journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); + + /* The journal is marked for error until we succeed with recovery! */ + journal->j_flags = JFS_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) { + kfree(journal); + goto fail; + } + return journal; +fail: + return NULL; +} + +/* journal_init_dev and journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + */ + +/** + * journal_t * journal_init_dev() - creates and initialises a journal structure + * @bdev: Block device on which to create the journal + * @fs_dev: Device which hold journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Length of the journal in blocks. + * @blocksize: blocksize of journalling device + * + * Returns: a newly created journal_t * + * + * journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. + * + */ +journal_t * journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + int start, int len, int blocksize) +{ + journal_t *journal = journal_init_common(); + struct buffer_head *bh; + int n; + + if (!journal) + return NULL; + + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + kfree(journal); + return NULL; +} + +/** + * journal_t * journal_init_inode () - creates a journal which maps to a inode. + * @inode: An inode to create the journal in + * + * journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. + */ +journal_t * journal_init_inode (struct inode *inode) +{ + struct buffer_head *bh; + journal_t *journal = journal_init_common(); + int err; + int n; + unsigned int blocknr; + + if (!journal) + return NULL; + + journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; + journal->j_inode = inode; + jbd_debug(1, + "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", + journal, inode->i_sb->s_id, inode->i_ino, + (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + err = journal_bmap(journal, 0, &blocknr); + /* If that failed, give up */ + if (err) { + printk(KERN_ERR "%s: Cannot locate journal superblock\n", + __func__); + goto out_err; + } + + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + kfree(journal); + return NULL; +} + +/* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock (journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. + */ + +static int journal_reset(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned int first, last; + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } + + journal->j_first = first; + journal->j_last = last; + + journal->j_head = first; + journal->j_tail = first; + journal->j_free = last - first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + journal->j_max_transaction_buffers = journal->j_maxlen / 4; + + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JFS_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } + return journal_start_thread(journal); +} + +/** + * int journal_create() - Initialise the new journal file + * @journal: Journal to create. This structure must have been initialised + * + * Given a journal_t structure which tells us which disk blocks we can + * use, create a new journal superblock and initialise all of the + * journal fields from scratch. + **/ +int journal_create(journal_t *journal) +{ + unsigned int blocknr; + struct buffer_head *bh; + journal_superblock_t *sb; + int i, err; + + if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { + printk (KERN_ERR "Journal length (%d blocks) too short.\n", + journal->j_maxlen); + journal_fail_superblock(journal); + return -EINVAL; + } + + if (journal->j_inode == NULL) { + /* + * We don't know what block to start at! + */ + printk(KERN_EMERG + "%s: creation of journal on external device!\n", + __func__); + BUG(); + } + + /* Zero out the entire journal on disk. We cannot afford to + have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ + jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); + for (i = 0; i < journal->j_maxlen; i++) { + err = journal_bmap(journal, i, &blocknr); + if (err) + return err; + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (unlikely(!bh)) + return -ENOMEM; + lock_buffer(bh); + memset (bh->b_data, 0, journal->j_blocksize); + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + __brelse(bh); + } + + sync_blockdev(journal->j_dev); + jbd_debug(1, "JBD: journal cleared.\n"); + + /* OK, fill in the initial static fields in the new superblock */ + sb = journal->j_superblock; + + sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + + sb->s_blocksize = cpu_to_be32(journal->j_blocksize); + sb->s_maxlen = cpu_to_be32(journal->j_maxlen); + sb->s_first = cpu_to_be32(1); + + journal->j_transaction_sequence = 1; + + journal->j_flags &= ~JFS_ABORT; + journal->j_format_version = 2; + + return journal_reset(journal); +} + +static void journal_write_superblock(journal_t *journal, int write_op) +{ + struct buffer_head *bh = journal->j_sb_buffer; + int ret; + + trace_journal_write_superblock(journal, write_op); + if (!(journal->j_flags & JFS_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "JBD: previous I/O error detected " + "for journal superblock update for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: Error %d detected " + "when updating journal superblock for %s.\n", + ret, journal_dev_name(journal, b)); + } +} + +/** + * journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned int tail_block, int write_op) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + journal_write_superblock(journal, write_op); + + /* Log is no longer empty */ + spin_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + spin_lock(&journal->j_state_lock); + /* Is it already empty? */ + if (sb->s_start == 0) { + spin_unlock(&journal->j_state_lock); + return; + } + jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(0); + spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal, WRITE_FUA); + + spin_lock(&journal->j_state_lock); + /* Log is empty */ + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + spin_lock(&journal->j_state_lock); + jbd_debug(1, "JBD: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); + spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal, WRITE_SYNC); +} + +/* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ + +static int journal_get_superblock(journal_t *journal) +{ + struct buffer_head *bh; + journal_superblock_t *sb; + int err = -EIO; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + printk (KERN_ERR + "JBD: IO error reading journal superblock\n"); + goto out; + } + } + + sb = journal->j_superblock; + + err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + goto out; + } + + switch(be32_to_cpu(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JFS_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + default: + printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + goto out; + } + + if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) + journal->j_maxlen = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + printk (KERN_WARNING "JBD: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + + return 0; + +out: + journal_fail_superblock(journal); + return err; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ + +static int load_superblock(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_last = be32_to_cpu(sb->s_maxlen); + journal->j_errno = be32_to_cpu(sb->s_errno); + + return 0; +} + + +/** + * int journal_load() - Read journal from disk. + * @journal: Journal to act on. + * + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ +int journal_load(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { + printk (KERN_WARNING + "JBD: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (journal_recover(journal)) + goto recovery_error; + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JFS_ABORT; + journal->j_flags |= JFS_LOADED; + return 0; + +recovery_error: + printk (KERN_WARNING "JBD: recovery failed\n"); + return -EIO; +} + +/** + * void journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. + * + * Release a journal_t structure once it is no longer in use by the + * journaled object. + * Return <0 if we couldn't clean up the journal. + */ +int journal_destroy(journal_t *journal) +{ + int err = 0; + + + /* Wait for the commit thread to wake up and die. */ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + + /* We cannot race with anybody but must keep assertions happy */ + mutex_lock(&journal->j_checkpoint_mutex); + /* Totally anal locking here... */ + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + log_do_checkpoint(journal); + spin_lock(&journal->j_list_lock); + } + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + spin_unlock(&journal->j_list_lock); + + if (journal->j_sb_buffer) { + if (!is_journal_aborted(journal)) { + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + mark_journal_empty(journal); + } else + err = -EIO; + brelse(journal->j_sb_buffer); + } + mutex_unlock(&journal->j_checkpoint_mutex); + + iput(journal->j_inode); + if (journal->j_revoke) + journal_destroy_revoke(journal); + kfree(journal->j_wbuf); + kfree(journal); + + return err; +} + + +/** + *int journal_check_used_features () - Check if features specified are used. + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. + **/ + +int journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; +} + +/** + * int journal_check_available_features() - Check feature set in journalling layer + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + +int journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + if (!compat && !ro && !incompat) + return 1; + + /* We can support any known requested features iff the + * superblock is in version 2. Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && + (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; +} + +/** + * int journal_set_features () - Mark a given journal feature in the superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ + +int journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; +} + + +/** + * int journal_update_format () - Update on-disk journal structure. + * @journal: Journal to act on. + * + * Given an initialised but unloaded journal struct, poke about in the + * on-disk structure to update it to the most recent supported version. + */ +int journal_update_format (journal_t *journal) +{ + journal_superblock_t *sb; + int err; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + switch (be32_to_cpu(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V2: + return 0; + case JFS_SUPERBLOCK_V1: + return journal_convert_superblock_v1(journal, sb); + default: + break; + } + return -EINVAL; +} + +static int journal_convert_superblock_v1(journal_t *journal, + journal_superblock_t *sb) +{ + int offset, blocksize; + struct buffer_head *bh; + + printk(KERN_WARNING + "JBD: Converting superblock from version 1 to 2.\n"); + + /* Pre-initialise new fields to zero */ + offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); + blocksize = be32_to_cpu(sb->s_blocksize); + memset(&sb->s_feature_compat, 0, blocksize-offset); + + sb->s_nr_users = cpu_to_be32(1); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + journal->j_format_version = 2; + + bh = journal->j_sb_buffer; + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + return 0; +} + + +/** + * int journal_flush () - Flush journal + * @journal: Journal to act on. + * + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + +int journal_flush(journal_t *journal) +{ + int err = 0; + transaction_t *transaction = NULL; + + spin_lock(&journal->j_state_lock); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) { + tid_t tid = transaction->t_tid; + + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + } else { + spin_unlock(&journal->j_state_lock); + } + + /* ...and flush everything in the log out to disk. */ + spin_lock(&journal->j_list_lock); + while (!err && journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + err = log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + + mutex_lock(&journal->j_checkpoint_mutex); + cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. Any future + * commits of data to the journal will restore the current + * s_start value. */ + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_state_lock); + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + spin_unlock(&journal->j_state_lock); + return 0; +} + +/** + * int journal_wipe() - Wipe journal contents + * @journal: Journal to act on. + * @write: flag (see below) + * + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and journal_load(). + * + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + +int journal_wipe(journal_t *journal, int write) +{ + int err = 0; + + J_ASSERT (!(journal->j_flags & JFS_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + if (!journal->j_tail) + goto no_recovery; + + printk (KERN_WARNING "JBD: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = journal_skip_recovery(journal); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } + + no_recovery: + return err; +} + +/* + * journal_dev_name: format a character string to describe on what + * device this journal is present. + */ + +static const char *journal_dev_name(journal_t *journal, char *buffer) +{ + struct block_device *bdev; + + if (journal->j_inode) + bdev = journal->j_inode->i_sb->s_bdev; + else + bdev = journal->j_dev; + + return bdevname(bdev, buffer); +} + +/* + * Journal abort has very specific semantics, which we describe + * for journal abort. + * + * Two internal function, which provide abort to te jbd layer + * itself are here. + */ + +/* + * Quick version for internal journal use (doesn't lock the journal). + * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, + * and don't attempt to make any other journal updates. + */ +static void __journal_abort_hard(journal_t *journal) +{ + transaction_t *transaction; + char b[BDEVNAME_SIZE]; + + if (journal->j_flags & JFS_ABORT) + return; + + printk(KERN_ERR "Aborting journal on device %s.\n", + journal_dev_name(journal, b)); + + spin_lock(&journal->j_state_lock); + journal->j_flags |= JFS_ABORT; + transaction = journal->j_running_transaction; + if (transaction) + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); +} + +/* Soft abort: record the abort error status in the journal superblock, + * but don't do any other IO. */ +static void __journal_abort_soft (journal_t *journal, int errno) +{ + if (journal->j_flags & JFS_ABORT) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + __journal_abort_hard(journal); + + if (errno) + journal_update_sb_errno(journal); +} + +/** + * void journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. + * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final journal_stop, which will receive the -EIO error. + * + * Finally, the journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). + * + */ + +void journal_abort(journal_t *journal, int errno) +{ + __journal_abort_soft(journal, errno); +} + +/** + * int journal_errno () - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno numbet set with journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount time -EROFS will + * be returned. + */ +int journal_errno(journal_t *journal) +{ + int err; + + spin_lock(&journal->j_state_lock); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + err = journal->j_errno; + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * int journal_clear_err () - clears the journal's error state + * @journal: journal to act on. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. + */ +int journal_clear_err(journal_t *journal) +{ + int err = 0; + + spin_lock(&journal->j_state_lock); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * void journal_ack_err() - Ack journal err. + * @journal: journal to act on. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. + */ +void journal_ack_err(journal_t *journal) +{ + spin_lock(&journal->j_state_lock); + if (journal->j_errno) + journal->j_flags |= JFS_ACK_ERR; + spin_unlock(&journal->j_state_lock); +} + +int journal_blocks_per_page(struct inode *inode) +{ + return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); +} + +/* + * Journal_head storage management + */ +static struct kmem_cache *journal_head_cache; +#ifdef CONFIG_JBD_DEBUG +static atomic_t nr_journal_heads = ATOMIC_INIT(0); +#endif + +static int journal_init_journal_head_cache(void) +{ + int retval; + + J_ASSERT(journal_head_cache == NULL); + journal_head_cache = kmem_cache_create("journal_head", + sizeof(struct journal_head), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + retval = 0; + if (!journal_head_cache) { + retval = -ENOMEM; + printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + } + return retval; +} + +static void journal_destroy_journal_head_cache(void) +{ + if (journal_head_cache) { + kmem_cache_destroy(journal_head_cache); + journal_head_cache = NULL; + } +} + +/* + * journal_head splicing and dicing + */ +static struct journal_head *journal_alloc_journal_head(void) +{ + struct journal_head *ret; + +#ifdef CONFIG_JBD_DEBUG + atomic_inc(&nr_journal_heads); +#endif + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); + if (ret == NULL) { + jbd_debug(1, "out of memory for journal_head\n"); + printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", + __func__); + + while (ret == NULL) { + yield(); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); + } + } + return ret; +} + +static void journal_free_journal_head(struct journal_head *jh) +{ +#ifdef CONFIG_JBD_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, JBD_POISON_FREE, sizeof(*jh)); +#endif + kmem_cache_free(journal_head_cache, jh); +} + +/* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * journal_put_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = journal_add_journal_head(bh); + * ... + * (Get another reference for transaction) + * journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * journal_put_journal_head(jh); + */ + +/* + * Give a buffer_head a journal_head. + * + * May sleep. + */ +struct journal_head *journal_add_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh; + struct journal_head *new_jh = NULL; + +repeat: + if (!buffer_jbd(bh)) + new_jh = journal_alloc_journal_head(); + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_page && bh->b_page->mapping)); + + if (!new_jh) { + jbd_unlock_bh_journal_head(bh); + goto repeat; + } + + jh = new_jh; + new_jh = NULL; /* We consumed it */ + set_buffer_jbd(bh); + bh->b_private = jh; + jh->b_bh = bh; + get_bh(bh); + BUFFER_TRACE(bh, "added journal_head"); + } + jh->b_jcount++; + jbd_unlock_bh_journal_head(bh); + if (new_jh) + journal_free_journal_head(new_jh); + return bh->b_private; +} + +/* + * Grab a ref against this buffer_head's journal_head. If it ended up not + * having a journal_head, return NULL + */ +struct journal_head *journal_grab_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = NULL; + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + jh->b_jcount++; + } + jbd_unlock_bh_journal_head(bh); + return jh; +} + +static void __journal_remove_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + + J_ASSERT_JH(jh, jh->b_jcount >= 0); + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd_free(jh->b_frozen_data, bh->b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); +} + +/* + * Drop a reference on the passed journal_head. If it fell to zero then + * release the journal_head from the buffer_head. + */ +void journal_put_journal_head(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_journal_head(bh); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount) { + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); + __brelse(bh); + } else + jbd_unlock_bh_journal_head(bh); +} + +/* + * debugfs tunables + */ +#ifdef CONFIG_JBD_DEBUG + +u8 journal_enable_debug __read_mostly; +EXPORT_SYMBOL(journal_enable_debug); + +static struct dentry *jbd_debugfs_dir; +static struct dentry *jbd_debug; + +static void __init jbd_create_debugfs_entry(void) +{ + jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); + if (jbd_debugfs_dir) + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, + jbd_debugfs_dir, + &journal_enable_debug); +} + +static void __exit jbd_remove_debugfs_entry(void) +{ + debugfs_remove(jbd_debug); + debugfs_remove(jbd_debugfs_dir); +} + +#else + +static inline void jbd_create_debugfs_entry(void) +{ +} + +static inline void jbd_remove_debugfs_entry(void) +{ +} + +#endif + +struct kmem_cache *jbd_handle_cache; + +static int __init journal_init_handle_cache(void) +{ + jbd_handle_cache = kmem_cache_create("journal_handle", + sizeof(handle_t), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + if (jbd_handle_cache == NULL) { + printk(KERN_EMERG "JBD: failed to create handle cache\n"); + return -ENOMEM; + } + return 0; +} + +static void journal_destroy_handle_cache(void) +{ + if (jbd_handle_cache) + kmem_cache_destroy(jbd_handle_cache); +} + +/* + * Module startup and shutdown + */ + +static int __init journal_init_caches(void) +{ + int ret; + + ret = journal_init_revoke_caches(); + if (ret == 0) + ret = journal_init_journal_head_cache(); + if (ret == 0) + ret = journal_init_handle_cache(); + return ret; +} + +static void journal_destroy_caches(void) +{ + journal_destroy_revoke_caches(); + journal_destroy_journal_head_cache(); + journal_destroy_handle_cache(); +} + +static int __init journal_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); + + ret = journal_init_caches(); + if (ret != 0) + journal_destroy_caches(); + jbd_create_debugfs_entry(); + return ret; +} + +static void __exit journal_exit(void) +{ +#ifdef CONFIG_JBD_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); +#endif + jbd_remove_debugfs_entry(); + journal_destroy_caches(); +} + +MODULE_LICENSE("GPL"); +module_init(journal_init); +module_exit(journal_exit); + diff --git a/kernel/fs/jbd/recovery.c b/kernel/fs/jbd/recovery.c new file mode 100644 index 000000000..a748fe214 --- /dev/null +++ b/kernel/fs/jbd/recovery.c @@ -0,0 +1,594 @@ +/* + * linux/fs/jbd/recovery.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#endif + +/* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. + */ +struct recovery_info +{ + tid_t start_transaction; + tid_t end_transaction; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; +}; + +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); +static int scan_revoke_records(journal_t *, struct buffer_head *, + tid_t, struct recovery_info *); + +#ifdef __KERNEL__ + +/* Release readahead buffers after use */ +static void journal_brelse_array(struct buffer_head *b[], int n) +{ + while (--n >= 0) + brelse (b[n]); +} + + +/* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + +#define MAXBUF 8 +static int do_readahead(journal_t *journal, unsigned int start) +{ + int err; + unsigned int max, nbufs, next; + unsigned int blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_maxlen) + max = journal->j_maxlen; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + err = journal_bmap(journal, next, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + next); + goto failed; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + err = -ENOMEM; + goto failed; + } + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + ll_rw_block(READ, nbufs, bufs); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + ll_rw_block(READ, nbufs, bufs); + err = 0; + +failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); + return err; +} + +#endif /* __KERNEL__ */ + + +/* + * Read a block from the journal + */ + +static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) +{ + int err; + unsigned int blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + if (offset >= journal->j_maxlen) { + printk(KERN_ERR "JBD: corrupted journal superblock\n"); + return -EIO; + } + + err = journal_bmap(journal, offset, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + offset); + return err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* If this is a brand new buffer, start readahead. + Otherwise, we assume we are already reading it. */ + if (!buffer_req(bh)) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; +} + + +/* + * Count the number of in-use tags in a journal descriptor block. + */ + +static int count_tags(struct buffer_head *bh, int size) +{ + char * tagp; + journal_block_tag_t * tag; + int nr = 0; + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { + tag = (journal_block_tag_t *) tagp; + + nr++; + tagp += sizeof(journal_block_tag_t); + if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) + tagp += 16; + + if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) + break; + } + + return nr; +} + + +/* Make sure we wrap around the log correctly! */ +#define wrap(journal, var) \ +do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ +} while (0) + +/** + * journal_recover - recovers a on-disk journal + * @journal: the journal to recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ +int journal_recover(journal_t *journal) +{ + int err, err2; + journal_superblock_t * sb; + + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. + */ + + if (!sb->s_start) { + jbd_debug(1, "No recovery required, last transaction %d\n", + be32_to_cpu(sb->s_sequence)); + journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + return 0; + } + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd_debug(1, "JBD: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + + journal_clear_revoke(journal); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + /* Flush disk caches to get replayed data on the permanent storage */ + if (journal->j_flags & JFS_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!err) + err = err2; + } + + return err; +} + +/** + * journal_skip_recovery - Start journal and wipe exiting records + * @journal: journal to startup + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * This function does'nt appear to be exorted.. + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. + */ +int journal_skip_recovery(journal_t *journal) +{ + int err; + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + } else { +#ifdef CONFIG_JBD_DEBUG + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); + jbd_debug(1, + "JBD: ignoring %d transaction%s from the journal.\n", + dropped, (dropped == 1) ? "" : "s"); +#endif + journal->j_transaction_sequence = ++info.end_transaction; + } + + journal->j_tail = 0; + return err; +} + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int first_commit_ID, next_commit_ID; + unsigned int next_log_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head * bh; + unsigned int sequence; + int blocktype; + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = be32_to_cpu(sb->s_sequence); + next_log_block = be32_to_cpu(sb->s_start); + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + + jbd_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + int flags; + char * tagp; + journal_block_tag_t * tag; + struct buffer_head * obh; + struct buffer_head * nbh; + + cond_resched(); + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. */ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. */ + + jbd_debug(3, "JBD: checking block %u\n", next_log_block); + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { + brelse(bh); + break; + } + + blocktype = be32_to_cpu(tmp->h_blocktype); + sequence = be32_to_cpu(tmp->h_sequence); + jbd_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) { + brelse(bh); + break; + } + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JFS_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it + * in pass REPLAY; otherwise, just skip over the + * blocks it describes. */ + if (pass != PASS_REPLAY) { + next_log_block += + count_tags(bh, journal->j_blocksize); + wrap(journal, next_log_block); + brelse(bh); + continue; + } + + /* A descriptor block: we can now write all of + * the data blocks. Yay, useful work is finally + * getting done here! */ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) + <= journal->j_blocksize) { + unsigned int io_block; + + tag = (journal_block_tag_t *) tagp; + flags = be32_to_cpu(tag->t_flags); + + io_block = next_log_block++; + wrap(journal, next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but + * report failure at the end. */ + success = err; + printk (KERN_ERR + "JBD: IO error %d recovering " + "block %u in log\n", + err, io_block); + } else { + unsigned int blocknr; + + J_ASSERT(obh != NULL); + blocknr = be32_to_cpu(tag->t_blocknr); + + /* If the block has been + * revoked, then we're all done + * here. */ + if (journal_test_revoke + (journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Find a buffer for the new + * data being restored */ + nbh = __getblk(journal->j_fs_dev, + blocknr, + journal->j_blocksize); + if (nbh == NULL) { + printk(KERN_ERR + "JBD: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + brelse(obh); + goto failed; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, + journal->j_blocksize); + if (flags & JFS_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JFS_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + /* ll_rw_block(WRITE, 1, &nbh); */ + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + + skip_write: + tagp += sizeof(journal_block_tag_t); + if (!(flags & JFS_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JFS_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + + case JFS_COMMIT_BLOCK: + /* Found an expected commit block: not much to + * do other than move on to the next sequence + * number. */ + brelse(bh); + next_commit_ID++; + continue; + + case JFS_REVOKE_BLOCK: + /* If we aren't in the REVOKE pass, then we can + * just skip over this block. */ + if (pass != PASS_REVOKE) { + brelse(bh); + continue; + } + + err = scan_revoke_records(journal, bh, + next_commit_ID, info); + brelse(bh); + if (err) + goto failed; + continue; + + default: + jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + brelse(bh); + goto done; + } + } + + done: + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) + info->end_transaction = next_commit_ID; + else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { + printk (KERN_ERR "JBD: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + + return success; + + failed: + return err; +} + + +/* Scan a revoke record, marking all blocks mentioned as revoked. */ + +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, + tid_t sequence, struct recovery_info *info) +{ + journal_revoke_header_t *header; + int offset, max; + + header = (journal_revoke_header_t *) bh->b_data; + offset = sizeof(journal_revoke_header_t); + max = be32_to_cpu(header->r_count); + + while (offset < max) { + unsigned int blocknr; + int err; + + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + offset += 4; + err = journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + ++info->nr_revokes; + } + return 0; +} diff --git a/kernel/fs/jbd/revoke.c b/kernel/fs/jbd/revoke.c new file mode 100644 index 000000000..dcead636c --- /dev/null +++ b/kernel/fs/jbd/revoke.c @@ -0,0 +1,733 @@ +/* + * linux/fs/jbd/revoke.c + * + * Written by Stephen C. Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif +#include +#include + +static struct kmem_cache *revoke_record_cache; +static struct kmem_cache *revoke_table_cache; + +/* Each revoke record represents one single revoked block. During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + +struct jbd_revoke_record_s +{ + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned int blocknr; +}; + + +/* The revoke table is just a simple hash table of revoke records. */ +struct jbd_revoke_table_s +{ + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. */ + int hash_size; + int hash_shift; + struct list_head *hash_table; +}; + + +#ifdef __KERNEL__ +static void write_one_revoke_record(journal_t *, transaction_t *, + struct journal_head **, int *, + struct jbd_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); +#endif + +/* Utility functions to maintain the revoke table */ + +static inline int hash(journal_t *journal, unsigned int block) +{ + struct jbd_revoke_table_s *table = journal->j_revoke; + + return hash_32(block, table->hash_shift); +} + +static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, + tid_t seq) +{ + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + +repeat: + record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); + if (!record) + goto oom; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + spin_lock(&journal->j_revoke_lock); + list_add(&record->hash, hash_list); + spin_unlock(&journal->j_revoke_lock); + return 0; + +oom: + if (!journal_oom_retry) + return -ENOMEM; + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); + yield(); + goto repeat; +} + +/* Find a revoke record in the journal's hash table. */ + +static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned int blocknr) +{ + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + spin_lock(&journal->j_revoke_lock); + record = (struct jbd_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) { + spin_unlock(&journal->j_revoke_lock); + return record; + } + record = (struct jbd_revoke_record_s *) record->hash.next; + } + spin_unlock(&journal->j_revoke_lock); + return NULL; +} + +void journal_destroy_revoke_caches(void) +{ + if (revoke_record_cache) { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + } + if (revoke_table_cache) { + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = NULL; + } +} + +int __init journal_init_revoke_caches(void) +{ + J_ASSERT(!revoke_record_cache); + J_ASSERT(!revoke_table_cache); + + revoke_record_cache = kmem_cache_create("revoke_record", + sizeof(struct jbd_revoke_record_s), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (!revoke_record_cache) + goto record_cache_failure; + + revoke_table_cache = kmem_cache_create("revoke_table", + sizeof(struct jbd_revoke_table_s), + 0, SLAB_TEMPORARY, NULL); + if (!revoke_table_cache) + goto table_cache_failure; + + return 0; + +table_cache_failure: + journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; +} + +static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) +{ + int i; + struct jbd_revoke_table_s *table; + + table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; + + table->hash_size = hash_size; + table->hash_shift = ilog2(hash_size); + table->hash_table = + kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); + if (!table->hash_table) { + kmem_cache_free(revoke_table_cache, table); + table = NULL; + goto out; + } + + for (i = 0; i < hash_size; i++) + INIT_LIST_HEAD(&table->hash_table[i]); + +out: + return table; +} + +static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); + } + + kfree(table->hash_table); + kmem_cache_free(revoke_table_cache, table); +} + +/* Initialise the revoke table for a given journal to a given size. */ +int journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); + J_ASSERT(is_power_of_2(hash_size)); + + journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; + + journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; + + journal->j_revoke = journal->j_revoke_table[1]; + + spin_lock_init(&journal->j_revoke_lock); + + return 0; + +fail1: + journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} + +/* Destroy a journal's revoke table. The table must already be empty! */ +void journal_destroy_revoke(journal_t *journal) +{ + journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + journal_destroy_revoke_table(journal->j_revoke_table[1]); +} + + +#ifdef __KERNEL__ + +/* + * journal_revoke: revoke a given buffer_head from the journal. This + * prevents the block from being replayed during recovery if we take a + * crash after this current transaction commits. Any subsequent + * metadata writes of the buffer in this transaction cancel the + * revoke. + * + * Note that this call may block --- it is up to the caller to make + * sure that there are no further calls to journal_write_metadata + * before the revoke is complete. In ext3, this implies calling the + * revoke before clearing the block bitmap when we are deleting + * metadata. + * + * Revoke performs a journal_forget on any buffer_head passed in as a + * parameter, but does _not_ forget the buffer_head if the bh was only + * found implicitly. + * + * bh_in may not be a journalled buffer - it may have come off + * the hash tables without an attached journal_head. + * + * If bh_in is non-zero, journal_revoke() will decrement its b_count + * by one. + */ + +int journal_revoke(handle_t *handle, unsigned int blocknr, + struct buffer_head *bh_in) +{ + struct buffer_head *bh = NULL; + journal_t *journal; + struct block_device *bdev; + int err; + + might_sleep(); + if (bh_in) + BUFFER_TRACE(bh_in, "enter"); + + journal = handle->h_transaction->t_journal; + if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ + J_ASSERT (!"Cannot set revoke feature!"); + return -EINVAL; + } + + bdev = journal->j_fs_dev; + bh = bh_in; + + if (!bh) { + bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh) + BUFFER_TRACE(bh, "found on hash"); + } +#ifdef JBD_EXPENSIVE_CHECKING + else { + struct buffer_head *bh2; + + /* If there is a different buffer_head lying around in + * memory anywhere... */ + bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh2) { + /* ... and it has RevokeValid status... */ + if (bh2 != bh && buffer_revokevalid(bh2)) + /* ...then it better be revoked too, + * since it's illegal to create a revoke + * record against a buffer_head which is + * not marked revoked --- that would + * risk missing a subsequent revoke + * cancel. */ + J_ASSERT_BH(bh2, buffer_revoked(bh2)); + put_bh(bh2); + } + } +#endif + + /* We really ought not ever to revoke twice in a row without + first having the revoke cancelled: it's illegal to free a + block twice without allocating it in between! */ + if (bh) { + if (!J_EXPECT_BH(bh, !buffer_revoked(bh), + "inconsistent data on disk")) { + if (!bh_in) + brelse(bh); + return -EIO; + } + set_buffer_revoked(bh); + set_buffer_revokevalid(bh); + if (bh_in) { + BUFFER_TRACE(bh_in, "call journal_forget"); + journal_forget(handle, bh_in); + } else { + BUFFER_TRACE(bh, "call brelse"); + __brelse(bh); + } + } + + jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); + err = insert_revoke_hash(journal, blocknr, + handle->h_transaction->t_tid); + BUFFER_TRACE(bh_in, "exit"); + return err; +} + +/* + * Cancel an outstanding revoke. For use only internally by the + * journaling code (called from journal_get_write_access). + * + * We trust buffer_revoked() on the buffer if the buffer is already + * being journaled: if there is no revoke pending on the buffer, then we + * don't do anything here. + * + * This would break if it were possible for a buffer to be revoked and + * discarded, and then reallocated within the same transaction. In such + * a case we would have lost the revoked bit, but when we arrived here + * the second time we would still have a pending revoke to cancel. So, + * do not trust the Revoked bit on buffers unless RevokeValid is also + * set. + */ +int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +{ + struct jbd_revoke_record_s *record; + journal_t *journal = handle->h_transaction->t_journal; + int need_cancel; + int did_revoke = 0; /* akpm: debug */ + struct buffer_head *bh = jh2bh(jh); + + jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + + /* Is the existing Revoke bit valid? If so, we trust it, and + * only perform the full cancel if the revoke bit is set. If + * not, we can't trust the revoke bit, and we need to do the + * full search for a revoke record. */ + if (test_set_buffer_revokevalid(bh)) { + need_cancel = test_clear_buffer_revoked(bh); + } else { + need_cancel = 1; + clear_buffer_revoked(bh); + } + + if (need_cancel) { + record = find_revoke_record(journal, bh->b_blocknr); + if (record) { + jbd_debug(4, "cancelled existing revoke on " + "blocknr %llu\n", (unsigned long long)bh->b_blocknr); + spin_lock(&journal->j_revoke_lock); + list_del(&record->hash); + spin_unlock(&journal->j_revoke_lock); + kmem_cache_free(revoke_record_cache, record); + did_revoke = 1; + } + } + +#ifdef JBD_EXPENSIVE_CHECKING + /* There better not be one left behind by now! */ + record = find_revoke_record(journal, bh->b_blocknr); + J_ASSERT_JH(jh, record == NULL); +#endif + + /* Finally, have we just cleared revoke on an unhashed + * buffer_head? If so, we'd better make sure we clear the + * revoked status on any hashed alias too, otherwise the revoke + * state machine will get very upset later on. */ + if (need_cancel) { + struct buffer_head *bh2; + bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + if (bh2) { + if (bh2 != bh) + clear_buffer_revoked(bh2); + __brelse(bh2); + } + } + return did_revoke; +} + +/* + * journal_clear_revoked_flags clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffer in the next + * transaction which is going to be started. + */ +void journal_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + +/* journal_switch_revoke table select j_revoke for next transaction + * we do not want to suspend any processing until all revokes are + * written -bzzz + */ +void journal_switch_revoke_table(journal_t *journal) +{ + int i; + + if (journal->j_revoke == journal->j_revoke_table[0]) + journal->j_revoke = journal->j_revoke_table[1]; + else + journal->j_revoke = journal->j_revoke_table[0]; + + for (i = 0; i < journal->j_revoke->hash_size; i++) + INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); +} + +/* + * Write revoke records to the journal for all entries in the current + * revoke hash, deleting the entries as we go. + */ +void journal_write_revoke_records(journal_t *journal, + transaction_t *transaction, int write_op) +{ + struct journal_head *descriptor; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + struct list_head *hash_list; + int i, offset, count; + + descriptor = NULL; + offset = 0; + count = 0; + + /* select revoke table for committing transaction */ + revoke = journal->j_revoke == journal->j_revoke_table[0] ? + journal->j_revoke_table[1] : journal->j_revoke_table[0]; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s *) + hash_list->next; + write_one_revoke_record(journal, transaction, + &descriptor, &offset, + record, write_op); + count++; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } + if (descriptor) + flush_descriptor(journal, descriptor, offset, write_op); + jbd_debug(1, "Wrote %d revoke records\n", count); +} + +/* + * Write out one revoke record. We need to create a new descriptor + * block if the old one is full or if we have not already created one. + */ + +static void write_one_revoke_record(journal_t *journal, + transaction_t *transaction, + struct journal_head **descriptorp, + int *offsetp, + struct jbd_revoke_record_s *record, + int write_op) +{ + struct journal_head *descriptor; + int offset; + journal_header_t *header; + + /* If we are already aborting, this all becomes a noop. We + still need to go round the loop in + journal_write_revoke_records in order to free all of the + revoke records: only the IO to the journal is omitted. */ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset == journal->j_blocksize) { + flush_descriptor(journal, descriptor, offset, write_op); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) + return; + header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); + header->h_sequence = cpu_to_be32(transaction->t_tid); + + /* Record it so that we can wait for IO completion later */ + JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); + journal_file_buffer(descriptor, transaction, BJ_LogCtl); + + offset = sizeof(journal_revoke_header_t); + *descriptorp = descriptor; + } + + * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be32(record->blocknr); + offset += 4; + *offsetp = offset; +} + +/* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + +static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, + int offset, int write_op) +{ + journal_revoke_header_t *header; + struct buffer_head *bh = jh2bh(descriptor); + + if (is_journal_aborted(journal)) { + put_bh(bh); + return; + } + + header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header->r_count = cpu_to_be32(offset); + set_buffer_jwrite(bh); + BUFFER_TRACE(bh, "write"); + set_buffer_dirty(bh); + write_dirty_buffer(bh, write_op); +} +#endif + +/* + * Revoke support for recovery. + * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + +/* + * First, setting revoke records. We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + +int journal_set_revoke(journal_t *journal, + unsigned int blocknr, + tid_t sequence) +{ + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); +} + +/* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need replayed. + */ + +int journal_test_revoke(journal_t *journal, + unsigned int blocknr, + tid_t sequence) +{ + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; +} + +/* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. + */ + +void journal_clear_revoke(journal_t *journal) +{ + int i; + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } +} diff --git a/kernel/fs/jbd/transaction.c b/kernel/fs/jbd/transaction.c new file mode 100644 index 000000000..1695ba833 --- /dev/null +++ b/kernel/fs/jbd/transaction.c @@ -0,0 +1,2237 @@ +/* + * linux/fs/jbd/transaction.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void __journal_temp_unlink_buffer(struct journal_head *jh); + +/* + * get_transaction: obtain a new transaction_t object. + * + * Simply allocate and initialise a new transaction. Create it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition. + * + * Called under j_state_lock + */ + +static transaction_t * +get_transaction(journal_t *journal, transaction_t *transaction) +{ + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + spin_lock_init(&transaction->t_handle_lock); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer.expires = + round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); + + J_ASSERT(journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + + return transaction; +} + +/* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + +/* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + +static int start_this_handle(journal_t *journal, handle_t *handle) +{ + transaction_t *transaction; + int needed; + int nblocks = handle->h_buffer_credits; + transaction_t *new_transaction = NULL; + int ret = 0; + + if (nblocks > journal->j_max_transaction_buffers) { + printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + current->comm, nblocks, + journal->j_max_transaction_buffers); + ret = -ENOSPC; + goto out; + } + +alloc_transaction: + if (!journal->j_running_transaction) { + new_transaction = kzalloc(sizeof(*new_transaction), + GFP_NOFS|__GFP_NOFAIL); + if (!new_transaction) { + ret = -ENOMEM; + goto out; + } + } + + jbd_debug(3, "New handle %p going live.\n", handle); + +repeat: + + /* + * We need to hold j_state_lock until t_updates has been incremented, + * for proper journal barrier handling + */ + spin_lock(&journal->j_state_lock); +repeat_locked: + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { + spin_unlock(&journal->j_state_lock); + ret = -EROFS; + goto out; + } + + /* Wait on the journal's transaction barrier if necessary */ + if (journal->j_barrier_count) { + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + goto repeat; + } + + if (!journal->j_running_transaction) { + if (!new_transaction) { + spin_unlock(&journal->j_state_lock); + goto alloc_transaction; + } + get_transaction(journal, new_transaction); + new_transaction = NULL; + } + + transaction = journal->j_running_transaction; + + /* + * If the current transaction is locked down for commit, wait for the + * lock to be released. + */ + if (transaction->t_state == T_LOCKED) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_transaction_locked, + &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * If there is not enough space left in the log to write all potential + * buffers requested by this operation, we need to stall pending a log + * checkpoint to free some more log space. + */ + spin_lock(&transaction->t_handle_lock); + needed = transaction->t_outstanding_credits + nblocks; + + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, then start + * to commit it: we can then go back and attach this handle to + * a new transaction. + */ + DEFINE_WAIT(wait); + + jbd_debug(2, "Handle %p starting new commit...\n", handle); + spin_unlock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + * + * The worst part is, any transaction currently committing can + * reduce the free space arbitrarily. Be careful to account for + * those buffers when checkpointing. + */ + + /* + * @@@ AKPM: This seems rather over-defensive. We're giving commit + * a _lot_ of headroom: 1/4 of the journal plus the size of + * the committing transaction. Really, we only need to give it + * committing_transaction->t_outstanding_credits plus "enough" for + * the log control blocks. + * Also, this test is inconsistent with the matching one in + * journal_extend(). + */ + if (__log_space_left(journal) < jbd_space_needed(journal)) { + jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); + spin_unlock(&transaction->t_handle_lock); + __log_wait_for_space(journal); + goto repeat_locked; + } + + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. */ + + handle->h_transaction = transaction; + transaction->t_outstanding_credits += nblocks; + transaction->t_updates++; + transaction->t_handle_count++; + jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", + handle, nblocks, transaction->t_outstanding_credits, + __log_space_left(journal)); + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); +out: + if (unlikely(new_transaction)) /* It's usually NULL */ + kfree(new_transaction); + return ret; +} + +static struct lock_class_key jbd_handle_key; + +/* Allocate a new handle. This should probably be in a slab... */ +static handle_t *new_handle(int nblocks) +{ + handle_t *handle = jbd_alloc_handle(GFP_NOFS); + if (!handle) + return NULL; + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + + lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0); + + return handle; +} + +/** + * handle_t *journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. + * + * This function is visible to journal users (like ext3fs), so is not + * called with the journal already locked. + * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ +handle_t *journal_start(journal_t *journal, int nblocks) +{ + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; + } + + handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); + + current->journal_info = handle; + + err = start_this_handle(journal, handle); + if (err < 0) { + jbd_free_handle(handle); + current->journal_info = NULL; + handle = ERR_PTR(err); + } + return handle; +} + +/** + * int journal_extend() - extend buffer credits. + * @handle: handle to 'extend' + * @nblocks: nr blocks to try to extend by. + * + * Some transactions, such as large extends and truncates, can be done + * atomically all at once or in several stages. The operation requests + * a credit for a number of buffer modications in advance, but can + * extend its credit if it needs more. + * + * journal_extend tries to give the running handle more buffer credits. + * It does not guarantee that allocation - this is a best-effort only. + * The calling process MUST be able to deal cleanly with a failure to + * extend here. + * + * Return 0 on success, non-zero on failure. + * + * return code < 0 implies an error + * return code > 0 implies normal transaction-full status. + */ +int journal_extend(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int result; + int wanted; + + result = -EIO; + if (is_handle_aborted(handle)) + goto out; + + result = 1; + + spin_lock(&journal->j_state_lock); + + /* Don't extend a locked-down transaction! */ + if (handle->h_transaction->t_state != T_RUNNING) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction not running\n", handle, nblocks); + goto error_out; + } + + spin_lock(&transaction->t_handle_lock); + wanted = transaction->t_outstanding_credits + nblocks; + + if (wanted > journal->j_max_transaction_buffers) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction too large\n", handle, nblocks); + goto unlock; + } + + if (wanted > __log_space_left(journal)) { + jbd_debug(3, "denied handle %p %d blocks: " + "insufficient log space\n", handle, nblocks); + goto unlock; + } + + handle->h_buffer_credits += nblocks; + transaction->t_outstanding_credits += nblocks; + result = 0; + + jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); +unlock: + spin_unlock(&transaction->t_handle_lock); +error_out: + spin_unlock(&journal->j_state_lock); +out: + return result; +} + + +/** + * int journal_restart() - restart a handle. + * @handle: handle to restart + * @nblocks: nr credits requested + * + * Restart a handle for a multi-transaction filesystem + * operation. + * + * If the journal_extend() call above fails to grant new buffer credits + * to a running handle, a call to journal_restart will commit the + * handle's transaction so far and reattach the handle to a new + * transaction capabable of guaranteeing the requested number of + * credits. + */ + +int journal_restart(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int ret; + + /* If we've had an abort of any type, don't even think about + * actually doing the restart! */ + if (is_handle_aborted(handle)) + return 0; + + /* + * First unlink the handle from its current transaction, and start the + * commit on that. + */ + J_ASSERT(transaction->t_updates > 0); + J_ASSERT(journal_current_handle() == handle); + + spin_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + transaction->t_outstanding_credits -= handle->h_buffer_credits; + transaction->t_updates--; + + if (!transaction->t_updates) + wake_up(&journal->j_wait_updates); + spin_unlock(&transaction->t_handle_lock); + + jbd_debug(2, "restarting handle %p\n", handle); + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + + lock_map_release(&handle->h_lockdep_map); + handle->h_buffer_credits = nblocks; + ret = start_this_handle(journal, handle); + return ret; +} + + +/** + * void journal_lock_updates () - establish a transaction barrier. + * @journal: Journal to establish a barrier on. + * + * This locks out any further updates from being started, and blocks until all + * existing updates have completed, returning only once the journal is in a + * quiescent state with no updates running. + * + * We do not use simple mutex for synchronization as there are syscalls which + * want to return with filesystem locked and that trips up lockdep. Also + * hibernate needs to lock filesystem but locked mutex then blocks hibernation. + * Since locking filesystem is rare operation, we use simple counter and + * waitqueue for locking. + */ +void journal_lock_updates(journal_t *journal) +{ + DEFINE_WAIT(wait); + +wait: + /* Wait for previous locked operation to finish */ + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + + spin_lock(&journal->j_state_lock); + /* + * Check reliably under the lock whether we are the ones winning the race + * and locking the journal + */ + if (journal->j_barrier_count > 0) { + spin_unlock(&journal->j_state_lock); + goto wait; + } + ++journal->j_barrier_count; + + /* Wait until there are no running updates */ + while (1) { + transaction_t *transaction = journal->j_running_transaction; + + if (!transaction) + break; + + spin_lock(&transaction->t_handle_lock); + if (!transaction->t_updates) { + spin_unlock(&transaction->t_handle_lock); + break; + } + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_updates, &wait); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); +} + +/** + * void journal_unlock_updates (journal_t* journal) - release barrier + * @journal: Journal to release the barrier on. + * + * Release a transaction barrier obtained with journal_lock_updates(). + */ +void journal_unlock_updates (journal_t *journal) +{ + J_ASSERT(journal->j_barrier_count != 0); + + spin_lock(&journal->j_state_lock); + --journal->j_barrier_count; + spin_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_transaction_locked); +} + +static void warn_dirty_buffer(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); +} + +/* + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). + * + */ +static int +do_get_write_access(handle_t *handle, struct journal_head *jh, + int force_copy) +{ + struct buffer_head *bh; + transaction_t *transaction; + journal_t *journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + + transaction = handle->h_transaction; + journal = transaction->t_journal; + + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); +repeat: + bh = jh2bh(jh); + + /* @@@ Need to check for errors here at some point. */ + + lock_buffer(bh); + jbd_lock_bh_state(bh); + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ + + if (buffer_dirty(bh)) { + /* + * First question: is this buffer already part of the current + * transaction or the existing committing transaction? + */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, + jh->b_transaction == transaction || + jh->b_transaction == + journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == + transaction); + warn_dirty_buffer(bh); + } + /* + * In any case we need to clean the dirty flag and we must + * do it under the buffer lock to be sure we don't race + * with running write-out. + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); + } + + unlock_buffer(bh); + + error = -EROFS; + if (is_handle_aborted(handle)) { + jbd_unlock_bh_state(bh); + goto out; + } + error = 0; + + /* + * The buffer is already part of this transaction if b_transaction or + * b_next_transaction points to it + */ + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done; + + /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* + * If there is already a copy-out version of this buffer, then we don't + * need to make another one + */ + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + goto done; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. + * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. */ + + if (jh->b_jlist == BJ_Shadow) { + DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); + + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + /* commit wakes up all shadow buffers after IO */ + for ( ; ; ) { + prepare_to_wait(wqh, &wait.wait, + TASK_UNINTERRUPTIBLE); + if (jh->b_jlist != BJ_Shadow) + break; + schedule(); + } + finish_wait(wqh, &wait.wait); + goto repeat; + } + + /* Only do the copy if the currently-owning transaction + * still needs it. If it is on the Forget list, the + * committing transaction is past that stage. The + * buffer had better remain locked during the kmalloc, + * but that should be true --- we hold the journal lock + * still and the buffer is already on the BUF_JOURNAL + * list so won't be flushed. + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. */ + + if (jh->b_jlist != BJ_Forget || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + jbd_unlock_bh_state(bh); + frozen_buffer = + jbd_alloc(jh2bh(jh)->b_size, + GFP_NOFS); + if (!frozen_buffer) { + printk(KERN_ERR + "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + jbd_lock_bh_state(bh); + goto done; + } + goto repeat; + } + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + + /* + * Finally, if the buffer is not journaled right now, we need to make + * sure it doesn't get written to disk before the caller actually + * commits the new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + } + +done: + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), + "Possible IO failure.\n"); + page = jh2bh(jh)->b_page; + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap_atomic(source); + } + jbd_unlock_bh_state(bh); + + /* + * If we are about to journal a buffer, then any revoke pending on it is + * no longer valid + */ + journal_cancel_revoke(handle, jh); + +out: + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd_free(frozen_buffer, bh->b_size); + + JBUFFER_TRACE(jh, "exit"); + return error; +} + +/** + * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * @handle: transaction to add buffer modifications to + * @bh: bh to be used for metadata writes + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + +int journal_get_write_access(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh = journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. Make sure that the buffer + * completes any outstanding IO before proceeding. */ + rc = do_get_write_access(handle, jh, 0); + journal_put_journal_head(jh); + return rc; +} + + +/* + * When the user wants to journal a newly created buffer_head + * (ie. getblk() returned a new buffer and we are going to populate it + * manually rather than reading off disk), then we need to keep the + * buffer_head locked until it has been completely filled with new + * data. In this case, we should be able to make the assertion that + * the bh is not already part of an existing transaction. + * + * The buffer should already be locked by the caller by this point. + * There is no lock ranking violation: it was a newly created, + * unlocked buffer beforehand. */ + +/** + * int journal_get_create_access () - notify intent to use newly created bh + * @handle: transaction to new buffer to + * @bh: new buffer. + * + * Call this if you create a new bh. + */ +int journal_get_create_access(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = journal_add_journal_head(bh); + int err; + + jbd_debug(5, "journal_head %p\n", jh); + err = -EROFS; + if (is_handle_aborted(handle)) + goto out; + err = 0; + + JBUFFER_TRACE(jh, "entry"); + /* + * The buffer may already belong to this transaction due to pre-zeroing + * in the filesystem's new_block code. It may also be on the previous, + * committing transaction's lists, but it HAS to be in Forget state in + * that case: the transaction must have deleted the buffer for it to be + * reused here. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + J_ASSERT_JH(jh, (jh->b_transaction == transaction || + jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && + jh->b_jlist == BJ_Forget))); + + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + + if (jh->b_transaction == NULL) { + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); + + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "set next transaction"); + jh->b_next_transaction = transaction; + } + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + + /* + * akpm: I added this. ext3_alloc_branch can pick up new indirect + * blocks which contain freed but then revoked metadata. We need + * to cancel the revoke in case we end up freeing it yet again + * and the reallocating as data - this would cause a second revoke, + * which hits an assertion error. + */ + JBUFFER_TRACE(jh, "cancelling revoke"); + journal_cancel_revoke(handle, jh); +out: + journal_put_journal_head(jh); + return err; +} + +/** + * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences + * @handle: transaction + * @bh: buffer to undo + * + * Sometimes there is a need to distinguish between metadata which has + * been committed to disk and that which has not. The ext3fs code uses + * this for freeing and allocating space, we have to make sure that we + * do not reuse freed space until the deallocation has been committed, + * since if we overwrote that space we would make the delete + * un-rewindable in case of a crash. + * + * To deal with that, journal_get_undo_access requests write access to a + * buffer for parts of non-rewindable operations such as delete + * operations on the bitmaps. The journaling code must keep a copy of + * the buffer's contents prior to the undo_access call until such time + * as we know that the buffer has definitely been committed to disk. + * + * We never need to know which transaction the committed data is part + * of, buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ +int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) +{ + int err; + struct journal_head *jh = journal_add_journal_head(bh); + char *committed_data = NULL; + + JBUFFER_TRACE(jh, "entry"); + + /* + * Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ + err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +repeat: + if (!jh->b_committed_data) { + committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); + if (!committed_data) { + printk(KERN_ERR "%s: No memory for committed data\n", + __func__); + err = -ENOMEM; + goto out; + } + } + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. */ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { + jbd_unlock_bh_state(bh); + goto repeat; + } + + jh->b_committed_data = committed_data; + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } + jbd_unlock_bh_state(bh); +out: + journal_put_journal_head(jh); + if (unlikely(committed_data)) + jbd_free(committed_data, bh->b_size); + return err; +} + +/** + * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed + * @handle: transaction + * @bh: bufferhead to mark + * + * Description: + * Mark a buffer as containing dirty data which needs to be flushed before + * we can commit the current transaction. + * + * The buffer is placed on the transaction's data list and is marked as + * belonging to the transaction. + * + * Returns error number or 0 on success. + * + * journal_dirty_data() can be called via page_launder->ext3_writepage + * by kswapd. + */ +int journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + journal_t *journal = handle->h_transaction->t_journal; + int need_brelse = 0; + struct journal_head *jh; + int ret = 0; + + if (is_handle_aborted(handle)) + return ret; + + jh = journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + + /* + * The buffer could *already* be dirty. Writeout can start + * at any time. + */ + jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); + + /* + * What if the buffer is already part of a running transaction? + * + * There are two cases: + * 1) It is part of the current running transaction. Refile it, + * just in case we have allocated it as metadata, deallocated + * it, then reallocated it as data. + * 2) It is part of the previous, still-committing transaction. + * If all we want to do is to guarantee that the buffer will be + * written to disk before this new transaction commits, then + * being sure that the *previous* transaction has this same + * property is sufficient for us! Just leave it on its old + * transaction. + * + * In case (2), the buffer must not already exist as metadata + * --- that would violate write ordering (a transaction is free + * to write its data at any point, even before the previous + * committing transaction has committed). The caller must + * never, ever allow this to happen: there's nothing we can do + * about it in this layer. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + /* Now that we have bh_state locked, are we really still mapped? */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); + goto no_journal; + } + + if (jh->b_transaction) { + JBUFFER_TRACE(jh, "has transaction"); + if (jh->b_transaction != handle->h_transaction) { + JBUFFER_TRACE(jh, "belongs to older transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* @@@ IS THIS TRUE ? */ + /* + * Not any more. Scenario: someone does a write() + * in data=journal mode. The buffer's transaction has + * moved into commit. Then someone does another + * write() to the file. We do the frozen data copyout + * and set b_next_transaction to point to j_running_t. + * And while we're in that state, someone does a + * writepage() in an attempt to pageout the same area + * of the file via a shared mapping. At present that + * calls journal_dirty_data(), and we get right here. + * It may be too late to journal the data. Simply + * falling through to the next test will suffice: the + * data will be dirty and wil be checkpointed. The + * ordering comments in the next comment block still + * apply. + */ + //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + + /* + * If we're journalling data, and this buffer was + * subject to a write(), it could be metadata, forget + * or shadow against the committing transaction. Now, + * someone has dirtied the same darn page via a mapping + * and it is being writepage()'d. + * We *could* just steal the page from commit, with some + * fancy locking there. Instead, we just skip it - + * don't tie the page's buffers to the new transaction + * at all. + * Implication: if we crash before the writepage() data + * is written into the filesystem, recovery will replay + * the write() data. + */ + if (jh->b_jlist != BJ_None && + jh->b_jlist != BJ_SyncData && + jh->b_jlist != BJ_Locked) { + JBUFFER_TRACE(jh, "Not stealing"); + goto no_journal; + } + + /* + * This buffer may be undergoing writeout in commit. We + * can't return from here and let the caller dirty it + * again because that can cause the write-out loop in + * commit to never terminate. + */ + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + need_brelse = 1; + sync_dirty_buffer(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + /* Since we dropped the lock... */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "buffer got unmapped"); + goto no_journal; + } + /* The buffer may become locked again at any + time if it is redirtied */ + } + + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. + */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { + JBUFFER_TRACE(jh, "unfile from commit"); + __journal_temp_unlink_buffer(jh); + /* It still points to the committing + * transaction; move it to this one so + * that the refile assert checks are + * happy. */ + jh->b_transaction = handle->h_transaction; + } + /* The buffer will be refiled below */ + + } + /* + * Special case --- the buffer might actually have been + * allocated and then immediately deallocated in the previous, + * committing transaction, so might still be left on that + * transaction's metadata lists. + */ + if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { + JBUFFER_TRACE(jh, "not on correct data list: unfile"); + J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); + JBUFFER_TRACE(jh, "file as data"); + __journal_file_buffer(jh, handle->h_transaction, + BJ_SyncData); + } + } else { + JBUFFER_TRACE(jh, "not on a transaction"); + __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); + } +no_journal: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + if (need_brelse) { + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + JBUFFER_TRACE(jh, "exit"); + journal_put_journal_head(jh); + return ret; +} + +/** + * int journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * Mark dirty metadata which needs to be journaled as part of the current + * transaction. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Returns error number or 0 on success. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + */ +int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = bh2jh(bh); + + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + if (is_handle_aborted(handle)) + goto out; + + jbd_lock_bh_state(bh); + + if (jh->b_modified == 0) { + /* + * This buffer's got modified and becoming part + * of the transaction. This needs to be done + * once a transaction -bzzz + */ + jh->b_modified = 1; + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + } + + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. + * Nobody can take it off again because there is a handle open. + * I _think_ we're OK here with SMP barriers - a mistaken decision will + * result in this test being false, so we go in and take the locks. + */ + if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { + JBUFFER_TRACE(jh, "fastpath"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_running_transaction); + goto out_unlock_bh; + } + + set_buffer_jbddirty(bh); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. */ + goto out_unlock_bh; + } + + /* That test should have eliminated the following case: */ + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + JBUFFER_TRACE(jh, "file as BJ_Metadata"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); + spin_unlock(&journal->j_list_lock); +out_unlock_bh: + jbd_unlock_bh_state(bh); +out: + JBUFFER_TRACE(jh, "exit"); + return 0; +} + +/* + * journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * + */ +void +journal_release_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUFFER_TRACE(bh, "entry"); +} + +/** + * void journal_forget() - bforget() for potentially-journaled buffers. + * @handle: transaction handle + * @bh: bh to 'forget' + * + * We can only do the bforget if there are no commits pending against the + * buffer. If the buffer is dirty in the current running transaction we + * can safely unlink it. + * + * bh may not be a journalled buffer at all - it may be a non-JBD + * buffer which came off the hashtable. Check for this. + * + * Decrements bh->b_count by one. + * + * Allow this call even if the handle has aborted --- it may be part of + * the caller's cleanup after an abort. + */ +int journal_forget (handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh; + int drop_reserve = 0; + int err = 0; + int was_modified = 0; + + BUFFER_TRACE(bh, "entry"); + + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + if (!buffer_jbd(bh)) + goto not_jbd; + jh = bh2jh(bh); + + /* Critical error: attempting to delete a bitmap buffer, maybe? + * Don't do any jbd operations, and return an error. */ + if (!J_EXPECT_JH(jh, !jh->b_committed_data, + "inconsistent data on disk")) { + err = -EIO; + goto not_jbd; + } + + /* keep track of whether or not this transaction modified us */ + was_modified = jh->b_modified; + + /* + * The buffer's going from the transaction, we must drop + * all references -bzzz + */ + jh->b_modified = 0; + + if (jh->b_transaction == handle->h_transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + + /* If we are forgetting a buffer which is already part + * of this transaction, then we can just drop it from + * the transaction immediately. */ + clear_buffer_dirty(bh); + clear_buffer_jbddirty(bh); + + JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); + + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; + + /* + * We are no longer going to journal this buffer. + * However, the commit of this transaction is still + * important to the buffer: the delete that we are now + * processing might obsolete an old log entry, so by + * committing, we can satisfy the buffer's checkpoint. + * + * So, if we have a checkpoint on the buffer, we should + * now refile the buffer on our BJ_Forget list so that + * we know to remove the checkpoint after we commit. + */ + + if (jh->b_cp_transaction) { + __journal_temp_unlink_buffer(jh); + __journal_file_buffer(jh, transaction, BJ_Forget); + } else { + __journal_unfile_buffer(jh); + if (!buffer_jbd(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; + } + } + } else if (jh->b_transaction) { + J_ASSERT_JH(jh, (jh->b_transaction == + journal->j_committing_transaction)); + /* However, if the buffer is still owned by a prior + * (committing) transaction, we can't drop it yet... */ + JBUFFER_TRACE(jh, "belongs to older transaction"); + /* ... but we CAN drop it from the new transaction if we + * have also modified it since the original commit. */ + + if (jh->b_next_transaction) { + J_ASSERT(jh->b_next_transaction == transaction); + jh->b_next_transaction = NULL; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; + } + } + +not_jbd: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +drop: + if (drop_reserve) { + /* no need to reserve log space for this block -bzzz */ + handle->h_buffer_credits++; + } + return err; +} + +/** + * int journal_stop() - complete a transaction + * @handle: tranaction to complete. + * + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining + * buffer credits to the transaction and remove the handle. The only + * complication is that we need to start a commit operation if the + * filesystem is marked for synchronous update. + * + * journal_stop itself will not usually return an error, but it may + * do so in unusual circumstances. In particular, expect it to + * return -EIO if a journal_abort has been executed since the + * transaction began. + */ +int journal_stop(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int err; + pid_t pid; + + J_ASSERT(journal_current_handle() == handle); + + if (is_handle_aborted(handle)) + err = -EIO; + else { + J_ASSERT(transaction->t_updates > 0); + err = 0; + } + + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } + + jbd_debug(4, "Handle %p going down\n", handle); + + /* + * Implement synchronous transaction batching. If the handle + * was synchronous, don't force a commit immediately. Let's + * yield and let another thread piggyback onto this transaction. + * Keep doing that while new threads continue to arrive. + * It doesn't cost much - we're about to run a commit and sleep + * on IO anyway. Speeds up many-threaded, many-dir operations + * by 30x or more... + * + * We try and optimize the sleep time against what the underlying disk + * can do, instead of having a static sleep time. This is useful for + * the case where our storage is so fast that it is more optimal to go + * ahead and force a flush and wait for the transaction to be committed + * than it is to wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how long it takes + * to commit a transaction, and compare it with how long this + * transaction has been running, and if run time < commit time then we + * sleep for the delta and commit. This greatly helps super fast disks + * that would see slowdowns as more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. + */ + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + } + + current->journal_info = NULL; + spin_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + transaction->t_outstanding_credits -= handle->h_buffer_credits; + transaction->t_updates--; + if (!transaction->t_updates) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the + * transaction is too old now. + */ + if (handle->h_sync || + transaction->t_outstanding_credits > + journal->j_max_transaction_buffers || + time_after_eq(jiffies, transaction->t_expires)) { + /* Do this even for aborted journals: an abort still + * completes the commit thread, it just doesn't write + * anything to disk. */ + tid_t tid = transaction->t_tid; + + spin_unlock(&transaction->t_handle_lock); + jbd_debug(2, "transaction too old, requesting commit for " + "handle %p\n", handle); + /* This is non-blocking */ + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + + /* + * Special case: JFS_SYNC synchronous updates require us + * to wait for the commit to complete. + */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) + err = log_wait_commit(journal, tid); + } else { + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + } + + lock_map_release(&handle->h_lockdep_map); + + jbd_free_handle(handle); + return err; +} + +/** + * int journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * For synchronous operations: force any uncommitted transactions + * to disk. May seem kludgy, but it reuses all the handle batching + * code in a very simple manner. + */ +int journal_force_commit(journal_t *journal) +{ + handle_t *handle; + int ret; + + handle = journal_start(journal, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + } else { + handle->h_sync = 1; + ret = journal_stop(handle); + } + return ret; +} + +/* + * + * List management code snippets: various functions for manipulating the + * transaction buffer lists. + * + */ + +/* + * Append a buffer to a transaction list, given the transaction's list head + * pointer. + * + * j_list_lock is held. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_add_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (!*list) { + jh->b_tnext = jh->b_tprev = jh; + *list = jh; + } else { + /* Insert at the tail of the list to preserve order */ + struct journal_head *first = *list, *last = first->b_tprev; + jh->b_tprev = last; + jh->b_tnext = first; + last->b_tnext = first->b_tprev = jh; + } +} + +/* + * Remove a buffer from a transaction list, given the transaction's list + * head pointer. + * + * Called with j_list_lock held, and the journal may not be locked. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_del_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (*list == jh) { + *list = jh->b_tnext; + if (*list == jh) + *list = NULL; + } + jh->b_tprev->b_tnext = jh->b_tnext; + jh->b_tnext->b_tprev = jh->b_tprev; +} + +/* + * Remove a buffer from the appropriate transaction list. + * + * Note that this function can *change* the value of + * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, + * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller + * is holding onto a copy of one of thee pointers, it could go bad. + * Generally the caller needs to re-read the pointer from the transaction_t. + * + * Called under j_list_lock. The journal may not be locked. + */ +static void __journal_temp_unlink_buffer(struct journal_head *jh) +{ + struct journal_head **list = NULL; + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + if (jh->b_jlist != BJ_None) + J_ASSERT_JH(jh, transaction != NULL); + + switch (jh->b_jlist) { + case BJ_None: + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers--; + J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + case BJ_Locked: + list = &transaction->t_locked_list; + break; + } + + __blist_del_buffer(list, jh); + jh->b_jlist = BJ_None; + if (test_clear_buffer_jbddirty(bh)) + mark_buffer_dirty(bh); /* Expose it to the VM */ +} + +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ +void __journal_unfile_buffer(struct journal_head *jh) +{ + __journal_temp_unlink_buffer(jh); + jh->b_transaction = NULL; + journal_put_journal_head(jh); +} + +void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + __journal_unfile_buffer(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +} + +/* + * Called from journal_try_to_free_buffers(). + * + * Called under jbd_lock_bh_state(bh) + */ +static void +__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) +{ + struct journal_head *jh; + + jh = bh2jh(bh); + + if (buffer_locked(bh) || buffer_dirty(bh)) + goto out; + + if (jh->b_next_transaction != NULL) + goto out; + + spin_lock(&journal->j_list_lock); + if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { + if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { + /* A written-back ordered data buffer */ + JBUFFER_TRACE(jh, "release data"); + __journal_unfile_buffer(jh); + } + } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + /* written-back checkpointed metadata buffer */ + if (jh->b_jlist == BJ_None) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __journal_remove_checkpoint(jh); + } + } + spin_unlock(&journal->j_list_lock); +out: + return; +} + +/** + * int journal_try_to_free_buffers() - try to free page buffers. + * @journal: journal for operation + * @page: to try and free + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. + * + * + * For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __journal_unfile_buffer. + * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a journal_dirty_data on this + * buffer. So we need to lock against that. journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success + */ +int journal_try_to_free_buffers(journal_t *journal, + struct page *page, gfp_t gfp_mask) +{ + struct buffer_head *head; + struct buffer_head *bh; + int ret = 0; + + J_ASSERT(PageLocked(page)); + + head = page_buffers(page); + bh = head; + do { + struct journal_head *jh; + + /* + * We take our own ref against the journal_head here to avoid + * having to add tons of locking around each instance of + * journal_put_journal_head(). + */ + jh = journal_grab_journal_head(bh); + if (!jh) + continue; + + jbd_lock_bh_state(bh); + __journal_try_to_free_buffer(journal, bh); + journal_put_journal_head(jh); + jbd_unlock_bh_state(bh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); + + ret = try_to_free_buffers(page); + +busy: + return ret; +} + +/* + * This buffer is no longer needed. If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + * + * Called under j_list_lock. + * + * Called under jbd_lock_bh_state(bh). + */ +static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) +{ + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); + __journal_file_buffer(jh, transaction, BJ_Forget); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + __journal_unfile_buffer(jh); + } + return may_free; +} + +/* + * journal_invalidatepage + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling invalidatepage on the + * data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! + * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. --sct + */ + +/* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. + */ +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) +{ + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + + BUFFER_TRACE(bh, "entry"); + +retry: + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are + * holding the page lock. --sct + */ + + if (!buffer_jbd(bh)) + goto zap_buffer_unlocked; + + spin_lock(&journal->j_state_lock); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + jh = journal_grab_journal_head(bh); + if (!jh) + goto zap_buffer_no_jh; + + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * block can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. + */ + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + may_free = __dispose_buffer(jh, + journal->j_running_transaction); + goto zap_buffer; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. */ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); + may_free = __dispose_buffer(jh, + journal->j_committing_transaction); + goto zap_buffer; + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ + clear_buffer_jbddirty(bh); + goto zap_buffer; + } + } + } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); + if (jh->b_jlist == BJ_Locked) { + /* + * The buffer is on the committing transaction's locked + * list. We have the buffer locked, so I/O has + * completed. So we can nail the buffer now. + */ + may_free = __dispose_buffer(jh, transaction); + goto zap_buffer; + } + /* + * The buffer is committing, we simply cannot touch + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + tid_t tid = journal->j_committing_transaction->t_tid; + + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + unlock_buffer(bh); + log_wait_commit(journal, tid); + lock_buffer(bh); + goto retry; + } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. + */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + return 0; + } else { + /* Good, the buffer belongs to the running transaction. + * We are writing our own transaction's data, not any + * previous one's, so it is safe to throw it away + * (remember that we expect the filesystem to have set + * i_size already for this truncate so recovery will not + * expose the disk blocks we are discarding here.) */ + J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); + may_free = __dispose_buffer(jh, transaction); + } + +zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. */ + jh->b_modified = 0; + journal_put_journal_head(jh); +zap_buffer_no_jh: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); +zap_buffer_unlocked: + clear_buffer_dirty(bh); + J_ASSERT_BH(bh, !buffer_jbddirty(bh)); + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + bh->b_bdev = NULL; + return may_free; +} + +/** + * void journal_invalidatepage() - invalidate a journal page + * @journal: journal to use for flush + * @page: page to flush + * @offset: offset of the range to invalidate + * @length: length of the range to invalidate + * + * Reap page buffers containing data in specified range in page. + */ +void journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned int offset, + unsigned int length) +{ + struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; + unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); + int may_free = 1; + + if (!PageLocked(page)) + BUG(); + if (!page_has_buffers(page)) + return; + + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + + /* We will potentially be playing with lists other than just the + * data lists (especially for journaled data mode), so be + * cautious in our locking. */ + + head = bh = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (next_off > stop) + return; + + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); + may_free &= journal_unmap_buffer(journal, bh, + partial_page); + unlock_buffer(bh); + } + curr_off = next_off; + bh = next; + + } while (bh != head); + + if (!partial_page) { + if (may_free && try_to_free_buffers(page)) + J_ASSERT(!page_has_buffers(page)); + } +} + +/* + * File a buffer on the given transaction list. + */ +void __journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + struct journal_head **list = NULL; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == NULL); + + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) + __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); + jh->b_transaction = transaction; + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + case BJ_Locked: + list = &transaction->t_locked_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +void journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&transaction->t_journal->j_list_lock); + __journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + * + * Called under j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns + */ +void __journal_refile_buffer(struct journal_head *jh) +{ + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __journal_unfile_buffer(jh); + return; + } + + /* + * It has been modified by a later transaction: add it to the new + * transaction's metadata list. + */ + + was_dirty = test_clear_buffer_jbddirty(bh); + __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +/* + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. + */ +void journal_refile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_list_lock); + __brelse(bh); +} diff --git a/kernel/fs/jbd2/Kconfig b/kernel/fs/jbd2/Kconfig new file mode 100644 index 000000000..5a9f5534d --- /dev/null +++ b/kernel/fs/jbd2/Kconfig @@ -0,0 +1,35 @@ +config JBD2 + tristate + select CRC32 + select CRYPTO + select CRYPTO_CRC32C + help + This is a generic journaling layer for block devices that support + both 32-bit and 64-bit block numbers. It is currently used by + the ext4 and OCFS2 filesystems, but it could also be used to add + journal support to other file systems or block devices such + as RAID or LVM. + + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. + + To compile this device as a module, choose M here. The module will be + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, + you cannot compile this code as a module. + +config JBD2_DEBUG + bool "JBD2 (ext4) debugging support" + depends on JBD2 + help + If you are using the ext4 journaled file system (or + potentially any other filesystem/device using JBD2), this option + allows you to enable debugging output while the system is running, + in order to help track down any problems you are having. + By default, the debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a + number between 1 and 5. The higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/module/jbd2/parameters/jbd2_debug". diff --git a/kernel/fs/jbd2/Makefile b/kernel/fs/jbd2/Makefile new file mode 100644 index 000000000..802a34138 --- /dev/null +++ b/kernel/fs/jbd2/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux journaling routines. +# + +obj-$(CONFIG_JBD2) += jbd2.o + +jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/kernel/fs/jbd2/checkpoint.c b/kernel/fs/jbd2/checkpoint.c new file mode 100644 index 000000000..6f691c289 --- /dev/null +++ b/kernel/fs/jbd2/checkpoint.c @@ -0,0 +1,647 @@ +/* + * linux/fs/jbd2/checkpoint.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Unlink a buffer from a transaction checkpoint list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink_first(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) { + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; +} + +/* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * + * Requires j_list_lock + */ +static int __try_to_free_cp_buf(struct journal_head *jh) +{ + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_transaction == NULL && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + ret = __jbd2_journal_remove_checkpoint(jh) + 1; + } + return ret; +} + +/* + * __jbd2_log_wait_for_space: wait until there is space in the journal. + * + * Called under j-state_lock *only*. It will be unlocked if we have to wait + * for a checkpoint to free up some space in the log. + */ +void __jbd2_log_wait_for_space(journal_t *journal) +{ + int nblocks, space_left; + /* assert_spin_locked(&journal->j_state_lock); */ + + nblocks = jbd2_space_needed(journal); + while (jbd2_log_space_left(journal) < nblocks) { + write_unlock(&journal->j_state_lock); + if (current->plug) + io_schedule(); + mutex_lock(&journal->j_checkpoint_mutex); + + /* + * Test again, another process may have checkpointed while we + * were waiting for the checkpoint lock. If there are no + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. + */ + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + mutex_unlock(&journal->j_checkpoint_mutex); + return; + } + spin_lock(&journal->j_list_lock); + nblocks = jbd2_space_needed(journal); + space_left = jbd2_log_space_left(journal); + if (space_left < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + if (chkpt) { + jbd2_log_do_checkpoint(journal); + } else if (jbd2_cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set. So we need to temporarily drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); + jbd2_log_wait_commit(journal, tid); + write_lock(&journal->j_state_lock); + continue; + } else { + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space in %s\n", __func__, + journal->j_devname); + WARN_ON(1); + jbd2_journal_abort(journal, 0); + } + write_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); + } + mutex_unlock(&journal->j_checkpoint_mutex); + } +} + +static void +__flush_batch(journal_t *journal, int *batch_count) +{ + int i; + struct blk_plug plug; + + blk_start_plug(&plug); + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); + + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = journal->j_chkpt_bhs[i]; + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + *batch_count = 0; +} + +/* + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. + * + * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. + */ +int jbd2_log_do_checkpoint(journal_t *journal) +{ + struct journal_head *jh; + struct buffer_head *bh; + transaction_t *transaction; + tid_t this_tid; + int result, batch_count = 0; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = jbd2_cleanup_journal_tail(journal); + trace_jbd2_checkpoint(journal, result); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Take one transaction + * and write it. + */ + result = 0; + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + if (transaction->t_chp_stats.cs_chp_time == 0) + transaction->t_chp_stats.cs_chp_time = jiffies; + this_tid = transaction->t_tid; +restart: + /* + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). + */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; + + /* checkpoint all of the transaction's buffers */ + while (transaction->t_checkpoint_list) { + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + get_bh(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto retry; + } + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + transaction->t_chp_stats.cs_forced_to_close++; + spin_unlock(&journal->j_list_lock); + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) + /* + * The journal thread is dead; so + * starting and waiting for a commit + * to finish will cause us to wait for + * a _very_ long time. + */ + printk(KERN_ERR + "JBD2: %s: Waiting for Godot: block %llu\n", + journal->j_devname, (unsigned long long) bh->b_blocknr); + + jbd2_log_start_commit(journal, tid); + jbd2_log_wait_commit(journal, tid); + goto retry; + } + if (!buffer_dirty(bh)) { + if (unlikely(buffer_write_io_error(bh)) && !result) + result = -EIO; + BUFFER_TRACE(bh, "remove from checkpoint"); + if (__jbd2_journal_remove_checkpoint(jh)) + /* The transaction was released; we're done */ + goto out; + continue; + } + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal + * lock. We cannot afford to let the transaction + * logic start messing around with this buffer before + * we write it to disk, as that would break + * recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + journal->j_chkpt_bhs[batch_count++] = bh; + __buffer_relink_io(jh); + transaction->t_chp_stats.cs_written++; + if ((batch_count == JBD2_NR_BATCH) || + need_resched() || + spin_needbreak(&journal->j_list_lock)) + goto unlock_and_flush; + } + + if (batch_count) { + unlock_and_flush: + spin_unlock(&journal->j_list_lock); + retry: + if (batch_count) + __flush_batch(journal, &batch_count); + spin_lock(&journal->j_list_lock); + goto restart; + } + + /* + * Now we issued all of the transaction's buffers, let's deal + * with the buffers that are out for I/O. + */ +restart2: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; + + while (transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + get_bh(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart2; + } + if (unlikely(buffer_write_io_error(bh)) && !result) + result = -EIO; + + /* + * Now in whatever state the buffer currently is, we + * know that it has been written out and so we can + * drop it from the list + */ + if (__jbd2_journal_remove_checkpoint(jh)) + break; + } +out: + spin_unlock(&journal->j_list_lock); + if (result < 0) + jbd2_journal_abort(journal, result); + else + result = jbd2_cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; +} + +/* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. + */ + +int jbd2_cleanup_journal_tail(journal_t *journal) +{ + tid_t first_tid; + unsigned long blocknr; + + if (is_journal_aborted(journal)) + return 1; + + if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) + return 1; + J_ASSERT(blocknr != 0); + + /* + * We need to make sure that any blocks that were recently written out + * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before + * we drop the transactions from the journal. It's unlikely this will + * be necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * jbd2_cleanup_journal_tail() doesn't get called all that often. + */ + if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + + __jbd2_update_log_tail(journal, first_tid, blocknr); + return 0; +} + + +/* Checkpoint list management */ + +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and + * release them. + * + * Called with j_list_lock held. + * Returns 1 if we freed the transaction, 0 otherwise. + */ +static int journal_clean_one_cp_list(struct journal_head *jh) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret; + int freed = 0; + + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + ret = __try_to_free_cp_buf(jh); + if (!ret) + return freed; + if (ret == 2) + return 1; + freed = 1; + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with j_list_lock held. + */ +void __jbd2_journal_clean_checkpoint_list(journal_t *journal) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + int ret; + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + return; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + return; + if (ret) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret = journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list); + if (need_resched()) + return; + /* + * Stop scanning if we couldn't free the transaction. This + * avoids pointless scanning of transactions which still + * weren't checkpointed. + */ + if (!ret) + return; + } while (transaction != last_transaction); +} + +/* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * lists until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. + * + * This function is called with j_list_lock held. + */ +int __jbd2_journal_remove_checkpoint(struct journal_head *jh) +{ + struct transaction_chp_stats_s *stats; + transaction_t *transaction; + journal_t *journal; + int ret = 0; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + journal = transaction->t_journal; + + JBUFFER_TRACE(jh, "removing from transaction"); + __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + jbd2_journal_put_journal_head(jh); + + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) + goto out; + + /* + * There is one special case to worry about: if we have just pulled the + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! + * + * The locking here around t_state is a bit sleazy. + * See the comment at the end of jbd2_journal_commit_transaction(). + */ + if (transaction->t_state != T_FINISHED) + goto out; + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + stats = &transaction->t_chp_stats; + if (stats->cs_chp_time) + stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, + jiffies); + trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, + transaction->t_tid, stats); + + __jbd2_journal_drop_transaction(journal, transaction); + jbd2_journal_free_transaction(transaction); + ret = 1; +out: + return ret; +} + +/* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ +void __jbd2_journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) +{ + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + /* Get reference for checkpointing transaction */ + jbd2_journal_grab_journal_head(jh2bh(jh)); + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; +} + +/* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ + +void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) +{ + assert_spin_locked(&journal->j_list_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT(transaction->t_state == T_FINISHED); + J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_forget == NULL); + J_ASSERT(transaction->t_shadow_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + trace_jbd2_drop_transaction(journal, transaction); + + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); +} diff --git a/kernel/fs/jbd2/commit.c b/kernel/fs/jbd2/commit.c new file mode 100644 index 000000000..b73e0215b --- /dev/null +++ b/kernel/fs/jbd2/commit.c @@ -0,0 +1,1164 @@ +/* + * linux/fs/jbd2/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * IO end handler for temporary buffer_heads handling writes to the journal. + */ +static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + struct buffer_head *orig_bh = bh->b_private; + + BUFFER_TRACE(bh, ""); + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + if (orig_bh) { + clear_bit_unlock(BH_Shadow, &orig_bh->b_state); + smp_mb__after_atomic(); + wake_up_bit(&orig_bh->b_state, BH_Shadow); + } + unlock_buffer(bh); +} + +/* + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. + * After the transaction commits, these pages are left on the LRU, with no + * ->mapping, and with attached buffers. These pages are trivially reclaimable + * by the VM, but their apparent absence upsets the VM accounting, and it makes + * the numbers in /proc/meminfo look odd. + * + * So here, we have a buffer which has just come off the forget list. Look to + * see if we can strip all buffers from the backing page. + * + * Called under lock_journal(), and possibly under journal_datalist_lock. The + * caller provided us with a ref against the buffer, and we drop that here. + */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page; + + if (buffer_dirty(bh)) + goto nope; + if (atomic_read(&bh->b_count) != 1) + goto nope; + page = bh->b_page; + if (!page) + goto nope; + if (page->mapping) + goto nope; + + /* OK, it's a truncated page */ + if (!trylock_page(page)) + goto nope; + + page_cache_get(page); + __brelse(bh); + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + return; + +nope: + __brelse(bh); +} + +static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) +{ + struct commit_header *h; + __u32 csum; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + h = (struct commit_header *)(bh->b_data); + h->h_chksum_type = 0; + h->h_chksum_size = 0; + h->h_chksum[0] = 0; + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + h->h_chksum[0] = cpu_to_be32(csum); +} + +/* + * Done it all: now submit the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +static int journal_submit_commit_record(journal_t *journal, + transaction_t *commit_transaction, + struct buffer_head **cbh, + __u32 crc32_sum) +{ + struct commit_header *tmp; + struct buffer_head *bh; + int ret; + struct timespec now = current_kernel_time(); + + *cbh = NULL; + + if (is_journal_aborted(journal)) + return 0; + + bh = jbd2_journal_get_descriptor_buffer(journal); + if (!bh) + return 1; + + tmp = (struct commit_header *)bh->b_data; + tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); + + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + tmp->h_chksum_type = JBD2_CRC32_CHKSUM; + tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; + tmp->h_chksum[0] = cpu_to_be32(crc32_sum); + } + jbd2_commit_block_csum_set(journal, bh); + + BUFFER_TRACE(bh, "submit commit block"); + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + if (journal->j_flags & JBD2_BARRIER && + !JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); + else + ret = submit_bh(WRITE_SYNC, bh); + + *cbh = bh; + return ret; +} + +/* + * This function along with journal_submit_commit_record + * allows to write the commit record asynchronously. + */ +static int journal_wait_on_commit_record(journal_t *journal, + struct buffer_head *bh) +{ + int ret = 0; + + clear_buffer_dirty(bh); + wait_on_buffer(bh); + + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + put_bh(bh); /* One for getblk() */ + + return ret; +} + +/* + * write the filemap data using writepage() address_space_operations. + * We don't do block allocation here even for delalloc. We don't + * use writepages() because with dealyed allocation we may be doing + * block allocation in writepages(). + */ +static int journal_submit_inode_data_buffers(struct address_space *mapping) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = 0, + .range_end = i_size_read(mapping->host), + }; + + ret = generic_writepages(mapping, &wbc); + return ret; +} + +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; + + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + spin_unlock(&journal->j_list_lock); + /* + * submit the inode data buffers. We use writepage + * instead of writepages. Because writepages can do + * block allocation with delalloc. We need to write + * only allocated blocks here. + */ + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); + err = journal_submit_inode_data_buffers(mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + smp_mb__after_atomic(); + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + spin_unlock(&journal->j_list_lock); + return ret; +} + +/* + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * + */ +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; + + /* For locking, see the comment in journal_submit_data_buffers() */ + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (err) { + /* + * Because AS_EIO is cleared by + * filemap_fdatawait_range(), set it again so + * that user process can get -EIO from fsync(). + */ + set_bit(AS_EIO, + &jinode->i_vfs_inode->i_mapping->flags); + + if (!ret) + ret = err; + } + spin_lock(&journal->j_list_lock); + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + smp_mb__after_atomic(); + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; + } + } + spin_unlock(&journal->j_list_lock); + + return ret; +} + +static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) +{ + struct page *page = bh->b_page; + char *addr; + __u32 checksum; + + addr = kmap_atomic(page); + checksum = crc32_be(crc32_sum, + (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); + kunmap_atomic(addr); + + return checksum; +} + +static void write_tag_block(journal_t *j, journal_block_tag_t *tag, + unsigned long long block) +{ + tag->t_blocknr = cpu_to_be32(block & (u32)~0); + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) + tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); +} + +static void jbd2_descr_block_csum_set(journal_t *j, + struct buffer_head *bh) +{ + struct jbd2_journal_block_tail *tail; + __u32 csum; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + tail->t_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + tail->t_checksum = cpu_to_be32(csum); +} + +static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, + struct buffer_head *bh, __u32 sequence) +{ + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; + struct page *page = bh->b_page; + __u8 *addr; + __u32 csum32; + __be32 seq; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + seq = cpu_to_be32(sequence); + addr = kmap_atomic(page); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), + bh->b_size); + kunmap_atomic(addr); + + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + tag3->t_checksum = cpu_to_be32(csum32); + else + tag->t_checksum = cpu_to_be16(csum32); +} +/* + * jbd2_journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ +void jbd2_journal_commit_transaction(journal_t *journal) +{ + struct transaction_stats_s stats; + transaction_t *commit_transaction; + struct journal_head *jh; + struct buffer_head *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; + int bufs; + int flags; + int err; + unsigned long long blocknr; + ktime_t start_time; + u64 commit_time; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i; + int tag_bytes = journal_tag_bytes(journal); + struct buffer_head *cbh = NULL; /* For transactional checksums */ + __u32 crc32_sum = ~0; + struct blk_plug plug; + /* Tail of the journal */ + unsigned long first_block; + tid_t first_tid; + int update_tail; + int csum_size = 0; + LIST_HEAD(io_bufs); + LIST_HEAD(log_bufs); + + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_block_tail); + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + + /* Do we need to erase the effects of a prior jbd2_journal_flush? */ + if (journal->j_flags & JBD2_FLUSHED) { + jbd_debug(3, "super block updated\n"); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + J_ASSERT(journal->j_running_transaction != NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + + trace_jbd2_start_commit(journal, commit_transaction); + jbd_debug(1, "JBD2: starting commit of transaction %d\n", + commit_transaction->t_tid); + + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); + commit_transaction->t_state = T_LOCKED; + + trace_jbd2_commit_locking(journal, commit_transaction); + stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_request_delay = 0; + stats.run.rs_locked = jiffies; + if (commit_transaction->t_requested) + stats.run.rs_request_delay = + jbd2_time_diff(commit_transaction->t_requested, + stats.run.rs_locked); + stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.run.rs_locked); + + spin_lock(&commit_transaction->t_handle_lock); + while (atomic_read(&commit_transaction->t_updates)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&commit_transaction->t_updates)) { + spin_unlock(&commit_transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); + + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= + journal->j_max_transaction_buffers); + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a jbd2_journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple jbd2_journal_get_write_access() calls to the same + * buffer are perfectly permissible. + */ + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + /* + * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may + * leave undo-committed data. + */ + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + jbd_unlock_bh_state(bh); + } + jbd2_journal_refile_buffer(journal, jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal->j_list_lock); + __jbd2_journal_clean_checkpoint_list(journal); + spin_unlock(&journal->j_list_lock); + + jbd_debug(3, "JBD2: commit phase 1\n"); + + /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + jbd2_clear_buffer_revoked_flags(journal); + + /* + * Switch to a new revoke table. + */ + jbd2_journal_switch_revoke_table(journal); + + /* + * Reserved credits cannot be claimed anymore, free them + */ + atomic_sub(atomic_read(&journal->j_reserved_credits), + &commit_transaction->t_outstanding_credits); + + trace_jbd2_commit_flushing(journal, commit_transaction); + stats.run.rs_flushing = jiffies; + stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, + stats.run.rs_flushing); + + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + start_time = ktime_get(); + commit_transaction->t_log_start = journal->j_head; + wake_up(&journal->j_wait_transaction_locked); + write_unlock(&journal->j_state_lock); + + jbd_debug(3, "JBD2: commit phase 2a\n"); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ + err = journal_submit_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + + blk_start_plug(&plug); + jbd2_journal_write_revoke_records(journal, commit_transaction, + &log_bufs, WRITE_SYNC); + + jbd_debug(3, "JBD2: commit phase 2b\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + write_lock(&journal->j_state_lock); + commit_transaction->t_state = T_COMMIT; + write_unlock(&journal->j_state_lock); + + trace_jbd2_commit_logging(journal, commit_transaction); + stats.run.rs_logging = jiffies; + stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, + stats.run.rs_logging); + stats.run.rs_blocks = + atomic_read(&commit_transaction->t_outstanding_credits); + stats.run.rs_blocks_logged = 0; + + J_ASSERT(commit_transaction->t_nr_buffers <= + atomic_read(&commit_transaction->t_outstanding_credits)); + + err = 0; + bufs = 0; + descriptor = NULL; + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it. */ + + if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); + JBUFFER_TRACE(jh, "journal is aborting: refile"); + jbd2_buffer_abort_trigger(jh, + jh->b_frozen_data ? + jh->b_frozen_triggers : + jh->b_triggers); + jbd2_journal_refile_buffer(journal, jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. */ + + if (!descriptor) { + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD2: get descriptor\n"); + + descriptor = jbd2_journal_get_descriptor_buffer(journal); + if (!descriptor) { + jbd2_journal_abort(journal, -EIO); + continue; + } + + jbd_debug(4, "JBD2: got buffer %llu (%p)\n", + (unsigned long long)descriptor->b_blocknr, + descriptor->b_data); + header = (journal_header_t *)descriptor->b_data; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + tagp = &descriptor->b_data[sizeof(journal_header_t)]; + space_left = descriptor->b_size - + sizeof(journal_header_t); + first_tag = 1; + set_buffer_jwrite(descriptor); + set_buffer_dirty(descriptor); + wbuf[bufs++] = descriptor; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(descriptor, "ph3: file as descriptor"); + jbd2_file_log_bh(&log_bufs, descriptor); + } + + /* Where is the buffer to be written? */ + + err = jbd2_journal_next_log_block(journal, &blocknr); + /* If the block mapping failed, just abandon the buffer + and repeat this loop: we'll fall into the + refile-on-abort condition above. */ + if (err) { + jbd2_journal_abort(journal, err); + continue; + } + + /* + * start_this_handle() uses t_outstanding_credits to determine + * the free space in the log, but this counter is changed + * by jbd2_journal_next_log_block() also. + */ + atomic_dec(&commit_transaction->t_outstanding_credits); + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the shadow pairing of buffers. */ + atomic_inc(&jh2bh(jh)->b_count); + + /* + * Make a temporary IO buffer with which to write it out + * (this will requeue the metadata buffer to BJ_Shadow). + */ + set_bit(BH_JWrite, &jh2bh(jh)->b_state); + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = jbd2_journal_write_metadata_buffer(commit_transaction, + jh, &wbuf[bufs], blocknr); + if (flags < 0) { + jbd2_journal_abort(journal, flags); + continue; + } + jbd2_file_log_bh(&io_bufs, wbuf[bufs]); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JBD2_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JBD2_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); + tag->t_flags = cpu_to_be16(tag_flag); + jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], + commit_transaction->t_tid); + tagp += tag_bytes; + space_left -= tag_bytes; + bufs++; + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == journal->j_wbufsize || + commit_transaction->t_buffers == NULL || + space_left < tag_bytes + 16 + csum_size) { + + jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); + + jbd2_descr_block_csum_set(journal, descriptor); +start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + /* + * Compute checksum. + */ + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + crc32_sum = + jbd2_checksum_data(crc32_sum, bh); + } + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE_SYNC, bh); + } + cond_resched(); + stats.run.rs_blocks_logged += bufs; + + /* Force a new descriptor to be generated next + time round the loop. */ + descriptor = NULL; + bufs = 0; + } + } + + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) { + printk(KERN_WARNING + "JBD2: Detected IO errors while flushing file data " + "on %s\n", journal->j_devname); + if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) + jbd2_journal_abort(journal, err); + err = 0; + } + + /* + * Get current oldest transaction in the log before we issue flush + * to the filesystem device. After the flush we can be sure that + * blocks of all older transactions are checkpointed to persistent + * storage and we will be safe to update journal start in the + * superblock with the numbers we get here. + */ + update_tail = + jbd2_journal_get_log_tail(journal, &first_tid, &first_block); + + write_lock(&journal->j_state_lock); + if (update_tail) { + long freed = first_block - journal->j_tail; + + if (first_block < journal->j_tail) + freed += journal->j_last - journal->j_first; + /* Update tail only if we free significant amount of space */ + if (freed < journal->j_maxlen / 4) + update_tail = 0; + } + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_DFLUSH; + write_unlock(&journal->j_state_lock); + + /* + * If the journal is not located on the file system device, + * then we must flush the file system device before we issue + * the commit record + */ + if (commit_transaction->t_need_data_flush && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + + /* Done it all: now write the commit record asynchronously. */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + + blk_finish_plug(&plug); + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the io_bufs list. + + Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD2: commit phase 3\n"); + + while (!list_empty(&io_bufs)) { + struct buffer_head *bh = list_entry(io_bufs.prev, + struct buffer_head, + b_assoc_buffers); + + wait_on_buffer(bh); + cond_resched(); + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + jbd2_unfile_log_bh(bh); + + /* + * The list contains temporary buffer heads created by + * jbd2_journal_write_metadata_buffer(). + */ + BUFFER_TRACE(bh, "dumping temporary bh"); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + free_buffer_head(bh); + + /* We also have to refile the corresponding shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_buffer_jwrite(bh); + J_ASSERT_BH(bh, buffer_jbddirty(bh)); + J_ASSERT_BH(bh, !buffer_shadow(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD2: commit phase 4\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + while (!list_empty(&log_bufs)) { + struct buffer_head *bh; + + bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); + wait_on_buffer(bh); + cond_resched(); + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + jbd2_unfile_log_bh(bh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + if (err) + jbd2_journal_abort(journal, err); + + jbd_debug(3, "JBD2: commit phase 5\n"); + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); + commit_transaction->t_state = T_COMMIT_JFLUSH; + write_unlock(&journal->j_state_lock); + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + if (cbh) + err = journal_wait_on_commit_record(journal, cbh); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && + journal->j_flags & JBD2_BARRIER) { + blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); + } + + if (err) + jbd2_journal_abort(journal, err); + + /* + * Now disk caches for filesystem device are flushed so we are safe to + * erase checkpointed transactions from the log by updating journal + * superblock. + */ + if (update_tail) + jbd2_update_log_tail(journal, first_tid, first_block); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + jbd_debug(3, "JBD2: commit phase 6\n"); + + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + +restart_loop: + /* + * As there are other places (journal_unmap_buffer()) adding buffers + * to this list we have to be careful and hold the j_list_lock. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + int try_to_free = 0; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); + bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + * + * We also know that the frozen data has already fired + * its triggers if they exist, so we can clear that too. + */ + if (jh->b_committed_data) { + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; + } + } else if (jh->b_frozen_data) { + jbd2_free(jh->b_frozen_data, bh->b_size); + jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; + } + + spin_lock(&journal->j_list_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + cp_transaction->t_chp_stats.cs_dropped++; + __jbd2_journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by jbd2_journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + + /* + * A buffer which has been freed while still being journaled by + * a previous transaction. + */ + if (buffer_freed(bh)) { + /* + * If the running transaction is the one containing + * "add to orphan" operation (b_next_transaction != + * NULL), we have to wait for that transaction to + * commit before we can really get rid of the buffer. + * So just clear b_modified to not confuse transaction + * credit accounting and refile the buffer to + * BJ_Forget of the running transaction. If the just + * committed transaction contains "add to orphan" + * operation, we can completely invalidate the buffer + * now. We are rather through in that since the + * buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial + * page. + */ + jh->b_modified = 0; + if (!jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } + } + + if (buffer_jbddirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __jbd2_journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + /* + * The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; + } + JBUFFER_TRACE(jh, "refile or unfile buffer"); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); /* Drops bh reference */ + else + __brelse(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + /* + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __jbd2_journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... + */ + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + /* + * Now recheck if some buffers did not get attached to the transaction + * while the lock was dropped... + */ + if (commit_transaction->t_forget) { + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + goto restart_loop; + } + + /* Add the transaction to the checkpoint list + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + spin_unlock(&journal->j_list_lock); + + /* Done with this transaction! */ + + jbd_debug(3, "JBD2: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); + + commit_transaction->t_start = jiffies; + stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, + commit_transaction->t_start); + + /* + * File the transaction statistics + */ + stats.ts_tid = commit_transaction->t_tid; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); + trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, + commit_transaction->t_tid, &stats.run); + stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0; + + commit_transaction->t_state = T_COMMIT_CALLBACK; + J_ASSERT(commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in the commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time + + journal->j_average_commit_time*3) / 4; + else + journal->j_average_commit_time = commit_time; + + write_unlock(&journal->j_state_lock); + + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + + trace_jbd2_end_commit(journal, commit_transaction); + jbd_debug(1, "JBD2: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + commit_transaction->t_state = T_FINISHED; + /* Check if the transaction can be dropped now that we are finished */ + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + jbd2_journal_free_transaction(commit_transaction); + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_done_commit); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.ts_requested += stats.ts_requested; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); +} diff --git a/kernel/fs/jbd2/journal.c b/kernel/fs/jbd2/journal.c new file mode 100644 index 000000000..b96bd8076 --- /dev/null +++ b/kernel/fs/jbd2/journal.c @@ -0,0 +1,2671 @@ +/* + * linux/fs/jbd2/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. + * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. + * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include +#include + +#ifdef CONFIG_JBD2_DEBUG +ushort jbd2_journal_enable_debug __read_mostly; +EXPORT_SYMBOL(jbd2_journal_enable_debug); + +module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644); +MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2"); +#endif + +EXPORT_SYMBOL(jbd2_journal_extend); +EXPORT_SYMBOL(jbd2_journal_stop); +EXPORT_SYMBOL(jbd2_journal_lock_updates); +EXPORT_SYMBOL(jbd2_journal_unlock_updates); +EXPORT_SYMBOL(jbd2_journal_get_write_access); +EXPORT_SYMBOL(jbd2_journal_get_create_access); +EXPORT_SYMBOL(jbd2_journal_get_undo_access); +EXPORT_SYMBOL(jbd2_journal_set_triggers); +EXPORT_SYMBOL(jbd2_journal_dirty_metadata); +EXPORT_SYMBOL(jbd2_journal_forget); +#if 0 +EXPORT_SYMBOL(journal_sync_buffer); +#endif +EXPORT_SYMBOL(jbd2_journal_flush); +EXPORT_SYMBOL(jbd2_journal_revoke); + +EXPORT_SYMBOL(jbd2_journal_init_dev); +EXPORT_SYMBOL(jbd2_journal_init_inode); +EXPORT_SYMBOL(jbd2_journal_check_used_features); +EXPORT_SYMBOL(jbd2_journal_check_available_features); +EXPORT_SYMBOL(jbd2_journal_set_features); +EXPORT_SYMBOL(jbd2_journal_load); +EXPORT_SYMBOL(jbd2_journal_destroy); +EXPORT_SYMBOL(jbd2_journal_abort); +EXPORT_SYMBOL(jbd2_journal_errno); +EXPORT_SYMBOL(jbd2_journal_ack_err); +EXPORT_SYMBOL(jbd2_journal_clear_err); +EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_log_start_commit); +EXPORT_SYMBOL(jbd2_journal_start_commit); +EXPORT_SYMBOL(jbd2_journal_force_commit_nested); +EXPORT_SYMBOL(jbd2_journal_wipe); +EXPORT_SYMBOL(jbd2_journal_blocks_per_page); +EXPORT_SYMBOL(jbd2_journal_invalidatepage); +EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); +EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_inode_cache); + +static void __journal_abort_soft (journal_t *journal, int errno); +static int jbd2_journal_create_slab(size_t slab_size); + +#ifdef CONFIG_JBD2_DEBUG +void __jbd2_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > jbd2_journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd2_debug); +#endif + +/* Checksumming functions */ +static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) +{ + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; +} + +static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +{ + __u32 csum; + __be32 old_csum; + + old_csum = sb->s_checksum; + sb->s_checksum = 0; + csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); + sb->s_checksum = old_csum; + + return cpu_to_be32(csum); +} + +static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) +{ + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + return sb->s_checksum == jbd2_superblock_csum(j, sb); +} + +static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) +{ + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + sb->s_checksum = jbd2_superblock_csum(j, sb); +} + +/* + * Helper function used to manage commit timeouts + */ + +static void commit_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); +} + +/* + * kjournald2: The main thread function used to manage a logging device + * journal. + * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. + */ + +static int kjournald2(void *arg) +{ + journal_t *journal = arg; + transaction_t *transaction; + + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + setup_timer(&journal->j_commit_timer, commit_timeout, + (unsigned long)current); + + set_freezable(); + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + /* + * And now, wait forever for commit wakeup events. + */ + write_lock(&journal->j_state_lock); + +loop: + if (journal->j_flags & JBD2_UNMOUNT) + goto end_loop; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + write_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + jbd2_journal_commit_transaction(journal); + write_lock(&journal->j_state_lock); + goto loop; + } + + wake_up(&journal->j_wait_done_commit); + if (freezing(current)) { + /* + * The simpler the better. Flushing journal isn't a + * good idea, because that depends on threads that may + * be already stopped. + */ + jbd_debug(1, "Now suspending kjournald2\n"); + write_unlock(&journal->j_state_lock); + try_to_freeze(); + write_lock(&journal->j_state_lock); + } else { + /* + * We assume on resume that commits are already there, + * so we don't sleep + */ + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&journal->j_wait_commit, &wait, + TASK_INTERRUPTIBLE); + if (journal->j_commit_sequence != journal->j_commit_request) + should_sleep = 0; + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, + transaction->t_expires)) + should_sleep = 0; + if (journal->j_flags & JBD2_UNMOUNT) + should_sleep = 0; + if (should_sleep) { + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + } + finish_wait(&journal->j_wait_commit, &wait); + } + + jbd_debug(1, "kjournald2 wakes\n"); + + /* + * Were we woken up by a commit wakeup event? + */ + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + goto loop; + +end_loop: + write_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; +} + +static int jbd2_journal_start_thread(journal_t *journal) +{ + struct task_struct *t; + + t = kthread_run(kjournald2, journal, "jbd2/%s", + journal->j_devname); + if (IS_ERR(t)) + return PTR_ERR(t); + + wait_event(journal->j_wait_done_commit, journal->j_task != NULL); + return 0; +} + +static void journal_kill_thread(journal_t *journal) +{ + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_UNMOUNT; + + while (journal->j_task) { + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); + wait_event(journal->j_wait_done_commit, journal->j_task == NULL); + write_lock(&journal->j_state_lock); + } + write_unlock(&journal->j_state_lock); +} + +/* + * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we have to make sure nobody modifies it while the + * IO is in progress. do_get_write_access() handles this. + * + * The function returns a pointer to the buffer_head to be used for IO. + * + * + * Return value: + * <0: Error + * >=0: Finished OK + * + * On success: + * Bit 0 set == escape performed on the data + * Bit 1 set == buffer copy-out performed (kfree the data after IO) + */ + +int jbd2_journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct buffer_head **bh_out, + sector_t blocknr) +{ + int need_copy_out = 0; + int done_copy_out = 0; + int do_escape = 0; + char *mapped_data; + struct buffer_head *new_bh; + struct page *new_page; + unsigned int new_offset; + struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. + */ + J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + +retry_alloc: + new_bh = alloc_buffer_head(GFP_NOFS); + if (!new_bh) { + /* + * Failure is not an option, but __GFP_NOFAIL is going + * away; so we retry ourselves here. + */ + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_alloc; + } + + /* keep subsequent assertions sane */ + atomic_set(&new_bh->b_count, 1); + + jbd_lock_bh_state(bh_in); +repeat: + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + if (jh_in->b_frozen_data) { + done_copy_out = 1; + new_page = virt_to_page(jh_in->b_frozen_data); + new_offset = offset_in_page(jh_in->b_frozen_data); + } else { + new_page = jh2bh(jh_in)->b_page; + new_offset = offset_in_page(jh2bh(jh_in)->b_data); + } + + mapped_data = kmap_atomic(new_page); + /* + * Fire data frozen trigger if data already wasn't frozen. Do this + * before checking for escaping, as the trigger may modify the magic + * offset. If a copy-out happens afterwards, it will have the correct + * data in the buffer. + */ + if (!done_copy_out) + jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jh_in->b_triggers); + + /* + * Check for escaping + */ + if (*((__be32 *)(mapped_data + new_offset)) == + cpu_to_be32(JBD2_MAGIC_NUMBER)) { + need_copy_out = 1; + do_escape = 1; + } + kunmap_atomic(mapped_data); + + /* + * Do we need to do a data copy? + */ + if (need_copy_out && !done_copy_out) { + char *tmp; + + jbd_unlock_bh_state(bh_in); + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + brelse(new_bh); + return -ENOMEM; + } + jbd_lock_bh_state(bh_in); + if (jh_in->b_frozen_data) { + jbd2_free(tmp, bh_in->b_size); + goto repeat; + } + + jh_in->b_frozen_data = tmp; + mapped_data = kmap_atomic(new_page); + memcpy(tmp, mapped_data + new_offset, bh_in->b_size); + kunmap_atomic(mapped_data); + + new_page = virt_to_page(tmp); + new_offset = offset_in_page(tmp); + done_copy_out = 1; + + /* + * This isn't strictly necessary, as we're using frozen + * data for the escaping, but it keeps consistency with + * b_frozen_data usage. + */ + jh_in->b_frozen_triggers = jh_in->b_triggers; + } + + /* + * Did we need to do an escaping? Now we've done all the + * copying, we can finally do so. + */ + if (do_escape) { + mapped_data = kmap_atomic(new_page); + *((unsigned int *)(mapped_data + new_offset)) = 0; + kunmap_atomic(mapped_data); + } + + set_bh_page(new_bh, new_page, new_offset); + new_bh->b_size = bh_in->b_size; + new_bh->b_bdev = journal->j_dev; + new_bh->b_blocknr = blocknr; + new_bh->b_private = bh_in; + set_buffer_mapped(new_bh); + set_buffer_dirty(new_bh); + + *bh_out = new_bh; + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + set_buffer_shadow(bh_in); + jbd_unlock_bh_state(bh_in); + + return do_escape | (done_copy_out << 1); +} + +/* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + +/* + * Called with j_state_lock locked for writing. + * Returns true if a transaction commit was started. + */ +int __jbd2_log_start_commit(journal_t *journal, tid_t target) +{ + /* Return if the txn has already requested to be committed */ + if (journal->j_commit_request == target) + return 0; + + /* + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. + */ + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { + /* + * We want a new commit: OK, mark the request and wakeup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd_debug(1, "JBD2: requesting commit %d/%d\n", + journal->j_commit_request, + journal->j_commit_sequence); + journal->j_running_transaction->t_requested = jiffies; + wake_up(&journal->j_wait_commit); + return 1; + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, + journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + return 0; +} + +int jbd2_log_start_commit(journal_t *journal, tid_t tid) +{ + int ret; + + write_lock(&journal->j_state_lock); + ret = __jbd2_log_start_commit(journal, tid); + write_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Force and wait any uncommitted transactions. We can only force the running + * transaction if we don't have an active handle, otherwise, we will deadlock. + * Returns: <0 in case of error, + * 0 if nothing to commit, + * 1 if transaction was successfully committed. + */ +static int __jbd2_journal_force_commit(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + int need_to_start = 0, ret = 0; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + if (!tid_geq(journal->j_commit_request, transaction->t_tid)) + need_to_start = 1; + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + /* Nothing to commit */ + read_unlock(&journal->j_state_lock); + return 0; + } + tid = transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + ret = jbd2_log_wait_commit(journal, tid); + if (!ret) + ret = 1; + + return ret; +} + +/** + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * @journal: journal to force + * Returns true if progress was made. + */ +int jbd2_journal_force_commit_nested(journal_t *journal) +{ + int ret; + + ret = __jbd2_journal_force_commit(journal); + return ret > 0; +} + +/** + * int journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * Caller want unconditional commit. We can only force the running transaction + * if we don't have an active handle, otherwise, we will deadlock. + */ +int jbd2_journal_force_commit(journal_t *journal) +{ + int ret; + + J_ASSERT(!current->journal_info); + ret = __jbd2_journal_force_commit(journal); + if (ret > 0) + ret = 0; + return ret; +} + +/* + * Start a commit of the current running transaction (if any). Returns true + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid + */ +int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) +{ + int ret = 0; + + write_lock(&journal->j_state_lock); + if (journal->j_running_transaction) { + tid_t tid = journal->j_running_transaction->t_tid; + + __jbd2_log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) + *ptid = tid; + ret = 1; + } else if (journal->j_committing_transaction) { + /* + * If commit has been started, then we have to wait for + * completion of that transaction. + */ + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; + ret = 1; + } + write_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JBD2_BARRIER)) + return 0; + read_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + commit_trans = journal->j_committing_transaction; + if (!commit_trans || commit_trans->t_tid != tid) { + ret = 1; + goto out; + } + /* + * Transaction is being committed and we already proceeded to + * submitting a flush to fs partition? + */ + if (journal->j_fs_dev != journal->j_dev) { + if (!commit_trans->t_need_data_flush || + commit_trans->t_state >= T_COMMIT_DFLUSH) + goto out; + } else { + if (commit_trans->t_state >= T_COMMIT_JFLUSH) + goto out; + } + ret = 1; +out: + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); + +/* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ +int jbd2_log_wait_commit(journal_t *journal, tid_t tid) +{ + int err = 0; + + read_lock(&journal->j_state_lock); +#ifdef CONFIG_JBD2_DEBUG + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_ERR + "%s: error: j_commit_request=%d, tid=%d\n", + __func__, journal->j_commit_request, tid); + } +#endif + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", + tid, journal->j_commit_sequence); + read_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); + wait_event(journal->j_wait_done_commit, + !tid_gt(tid, journal->j_commit_sequence)); + read_lock(&journal->j_state_lock); + } + read_unlock(&journal->j_state_lock); + + if (unlikely(is_journal_aborted(journal))) + err = -EIO; + return err; +} + +/* + * When this function returns the transaction corresponding to tid + * will be completed. If the transaction has currently running, start + * committing that transaction before waiting for it to complete. If + * the transaction id is stale, it is by definition already completed, + * so just return SUCCESS. + */ +int jbd2_complete_transaction(journal_t *journal, tid_t tid) +{ + int need_to_wait = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) { + if (journal->j_commit_request != tid) { + /* transaction not yet started, so request it */ + read_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, tid); + goto wait_commit; + } + } else if (!(journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid)) + need_to_wait = 0; + read_unlock(&journal->j_state_lock); + if (!need_to_wait) + return 0; +wait_commit: + return jbd2_log_wait_commit(journal, tid); +} +EXPORT_SYMBOL(jbd2_complete_transaction); + +/* + * Log buffer allocation routines: + */ + +int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) +{ + unsigned long blocknr; + + write_lock(&journal->j_state_lock); + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + write_unlock(&journal->j_state_lock); + return jbd2_journal_bmap(journal, blocknr, retp); +} + +/* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ +int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, + unsigned long long *retp) +{ + int err = 0; + unsigned long long ret; + + if (journal->j_inode) { + ret = bmap(journal->j_inode, blocknr); + if (ret) + *retp = ret; + else { + printk(KERN_ALERT "%s: journal block not found " + "at offset %lu on %s\n", + __func__, blocknr, journal->j_devname); + err = -EIO; + __journal_abort_soft(journal, err); + } + } else { + *retp = blocknr; /* +journal->j_blk_offset */ + } + return err; +} + +/* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + * + * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying + * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * But we don't bother doing that, so there will be coherency problems with + * mmaps of blockdevs which hold live JBD-controlled filesystems. + */ +struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) +{ + struct buffer_head *bh; + unsigned long long blocknr; + int err; + + err = jbd2_journal_next_log_block(journal, &blocknr); + + if (err) + return NULL; + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; + lock_buffer(bh); + memset(bh->b_data, 0, journal->j_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "return this buffer"); + return bh; +} + +/* + * Return tid of the oldest transaction in the journal and block in the journal + * where the transaction starts. + * + * If the journal is now empty, return which will be the next transaction ID + * we will write and where will that transaction start. + * + * The return value is 0 if journal tail cannot be pushed any further, 1 if + * it can. + */ +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, + unsigned long *block) +{ + transaction_t *transaction; + int ret; + + read_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + *tid = transaction->t_tid; + *block = journal->j_head; + } else { + *tid = journal->j_transaction_sequence; + *block = journal->j_head; + } + ret = tid_gt(*tid, journal->j_tail_sequence); + spin_unlock(&journal->j_list_lock); + read_unlock(&journal->j_state_lock); + + return ret; +} + +/* + * Update information in journal structure and in on disk journal superblock + * about log tail. This function does not check whether information passed in + * really pushes log tail further. It's responsibility of the caller to make + * sure provided log tail information is valid (e.g. by holding + * j_checkpoint_mutex all the time between computing log tail and calling this + * function as is the case with jbd2_cleanup_journal_tail()). + * + * Requires j_checkpoint_mutex + */ +void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + unsigned long freed; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + + /* + * We cannot afford for write to remain in drive's caches since as + * soon as we update j_tail, next transaction can start reusing journal + * space and if we lose sb update during power failure we'd replay + * old transaction with possibly newly overwritten data. + */ + jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + write_lock(&journal->j_state_lock); + freed = block - journal->j_tail; + if (block < journal->j_tail) + freed += journal->j_last - journal->j_first; + + trace_jbd2_update_log_tail(journal, tid, block, freed); + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, tid, block, freed); + + journal->j_free += freed; + journal->j_tail_sequence = tid; + journal->j_tail = block; + write_unlock(&journal->j_state_lock); +} + +/* + * This is a variaon of __jbd2_update_log_tail which checks for validity of + * provided log tail and locks j_checkpoint_mutex. So it is safe against races + * with other threads updating log tail. + */ +void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + mutex_lock(&journal->j_checkpoint_mutex); + if (tid_gt(tid, journal->j_tail_sequence)) + __jbd2_update_log_tail(journal, tid, block); + mutex_unlock(&journal->j_checkpoint_mutex); +} + +struct jbd2_stats_proc_session { + journal_t *journal; + struct transaction_stats_s *stats; + int start; + int max; +}; + +static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int jbd2_seq_info_show(struct seq_file *seq, void *v) +{ + struct jbd2_stats_proc_session *s = seq->private; + + if (v != SEQ_START_TOKEN) + return 0; + seq_printf(seq, "%lu transactions (%lu requested), " + "each up to %u blocks\n", + s->stats->ts_tid, s->stats->ts_requested, + s->journal->j_max_transaction_buffers); + if (s->stats->ts_tid == 0) + return 0; + seq_printf(seq, "average: \n %ums waiting for transaction\n", + jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums request delay\n", + (s->stats->ts_requested == 0) ? 0 : + jiffies_to_msecs(s->stats->run.rs_request_delay / + s->stats->ts_requested)); + seq_printf(seq, " %ums running transaction\n", + jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); + seq_printf(seq, " %ums transaction was being locked\n", + jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); + seq_printf(seq, " %ums flushing data (in ordered mode)\n", + jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); + seq_printf(seq, " %ums logging transaction\n", + jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %lluus average transaction commit time\n", + div_u64(s->journal->j_average_commit_time, 1000)); + seq_printf(seq, " %lu handles per transaction\n", + s->stats->run.rs_handle_count / s->stats->ts_tid); + seq_printf(seq, " %lu blocks per transaction\n", + s->stats->run.rs_blocks / s->stats->ts_tid); + seq_printf(seq, " %lu logged blocks per transaction\n", + s->stats->run.rs_blocks_logged / s->stats->ts_tid); + return 0; +} + +static void jbd2_seq_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations jbd2_seq_info_ops = { + .start = jbd2_seq_info_start, + .next = jbd2_seq_info_next, + .stop = jbd2_seq_info_stop, + .show = jbd2_seq_info_show, +}; + +static int jbd2_seq_info_open(struct inode *inode, struct file *file) +{ + journal_t *journal = PDE_DATA(inode); + struct jbd2_stats_proc_session *s; + int rc, size; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + size = sizeof(struct transaction_stats_s); + s->stats = kmalloc(size, GFP_KERNEL); + if (s->stats == NULL) { + kfree(s); + return -ENOMEM; + } + spin_lock(&journal->j_history_lock); + memcpy(s->stats, &journal->j_stats, size); + s->journal = journal; + spin_unlock(&journal->j_history_lock); + + rc = seq_open(file, &jbd2_seq_info_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = s; + } else { + kfree(s->stats); + kfree(s); + } + return rc; + +} + +static int jbd2_seq_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct jbd2_stats_proc_session *s = seq->private; + kfree(s->stats); + kfree(s); + return seq_release(inode, file); +} + +static const struct file_operations jbd2_seq_info_fops = { + .owner = THIS_MODULE, + .open = jbd2_seq_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = jbd2_seq_info_release, +}; + +static struct proc_dir_entry *proc_jbd2_stats; + +static void jbd2_stats_proc_init(journal_t *journal) +{ + journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); + if (journal->j_proc_entry) { + proc_create_data("info", S_IRUGO, journal->j_proc_entry, + &jbd2_seq_info_fops, journal); + } +} + +static void jbd2_stats_proc_exit(journal_t *journal) +{ + remove_proc_entry("info", journal->j_proc_entry); + remove_proc_entry(journal->j_devname, proc_jbd2_stats); +} + +/* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. */ + +/* First: create and setup a journal_t object in memory. We initialise + * very few fields yet: that has to wait until we have created the + * journal structures from from scratch, or loaded them from disk. */ + +static journal_t * journal_init_common (void) +{ + journal_t *journal; + int err; + + journal = kzalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + return NULL; + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + init_waitqueue_head(&journal->j_wait_reserved); + mutex_init(&journal->j_barrier); + mutex_init(&journal->j_checkpoint_mutex); + spin_lock_init(&journal->j_revoke_lock); + spin_lock_init(&journal->j_list_lock); + rwlock_init(&journal->j_state_lock); + + journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ + atomic_set(&journal->j_reserved_credits, 0); + + /* The journal is marked for error until we succeed with recovery! */ + journal->j_flags = JBD2_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) { + kfree(journal); + return NULL; + } + + spin_lock_init(&journal->j_history_lock); + + return journal; +} + +/* jbd2_journal_init_dev and jbd2_journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + */ + +/** + * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure + * @bdev: Block device on which to create the journal + * @fs_dev: Device which hold journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Length of the journal in blocks. + * @blocksize: blocksize of journalling device + * + * Returns: a newly created journal_t * + * + * jbd2_journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. + * + */ +journal_t * jbd2_journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int blocksize) +{ + journal_t *journal = journal_init_common(); + struct buffer_head *bh; + char *p; + int n; + + if (!journal) + return NULL; + + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; + jbd2_stats_proc_init(journal); + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; +} + +/** + * journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode. + * @inode: An inode to create the journal in + * + * jbd2_journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. + */ +journal_t * jbd2_journal_init_inode (struct inode *inode) +{ + struct buffer_head *bh; + journal_t *journal = journal_init_common(); + char *p; + int err; + int n; + unsigned long long blocknr; + + if (!journal) + return NULL; + + journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; + journal->j_inode = inode; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; + p = journal->j_devname + strlen(journal->j_devname); + sprintf(p, "-%lu", journal->j_inode->i_ino); + jbd_debug(1, + "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", + journal, inode->i_sb->s_id, inode->i_ino, + (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; + jbd2_stats_proc_init(journal); + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + err = jbd2_journal_bmap(journal, 0, &blocknr); + /* If that failed, give up */ + if (err) { + printk(KERN_ERR "%s: Cannot locate journal superblock\n", + __func__); + goto out_err; + } + + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; +} + +/* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock (journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. + */ + +static int journal_reset(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long first, last; + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } + + journal->j_first = first; + journal->j_last = last; + + journal->j_head = first; + journal->j_tail = first; + journal->j_free = last - first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + journal->j_max_transaction_buffers = journal->j_maxlen / 4; + + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JBD2_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " + "(start %ld, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JBD2_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } + return jbd2_journal_start_thread(journal); +} + +static void jbd2_write_superblock(journal_t *journal, int write_op) +{ + struct buffer_head *bh = journal->j_sb_buffer; + journal_superblock_t *sb = journal->j_superblock; + int ret; + + trace_jbd2_write_superblock(journal, write_op); + if (!(journal->j_flags & JBD2_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); + if (buffer_write_io_error(bh)) { + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "JBD2: previous I/O error detected " + "for journal superblock update for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + jbd2_superblock_csum_set(journal, sb); + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + printk(KERN_ERR "JBD2: Error %d detected when updating " + "journal superblock for %s.\n", ret, + journal->j_devname); + } +} + +/** + * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned long tail_block, int write_op) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + jbd2_write_superblock(journal, write_op); + + /* Log is no longer empty */ + write_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); +} + +/** + * jbd2_mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void jbd2_mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + read_lock(&journal->j_state_lock); + /* Is it already empty? */ + if (sb->s_start == 0) { + read_unlock(&journal->j_state_lock); + return; + } + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(0); + read_unlock(&journal->j_state_lock); + + jbd2_write_superblock(journal, WRITE_FUA); + + /* Log is no longer empty */ + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); +} + + +/** + * jbd2_journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +void jbd2_journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + read_lock(&journal->j_state_lock); + jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); + read_unlock(&journal->j_state_lock); + + jbd2_write_superblock(journal, WRITE_SYNC); +} +EXPORT_SYMBOL(jbd2_journal_update_sb_errno); + +/* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ +static int journal_get_superblock(journal_t *journal) +{ + struct buffer_head *bh; + journal_superblock_t *sb; + int err = -EIO; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); + goto out; + } + } + + if (buffer_verified(bh)) + return 0; + + sb = journal->j_superblock; + + err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); + goto out; + } + + switch(be32_to_cpu(sb->s_header.h_blocktype)) { + case JBD2_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JBD2_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + default: + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); + goto out; + } + + if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) + journal->j_maxlen = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + printk(KERN_WARNING "JBD2: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && + JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + /* Can't have checksum v2 and v3 at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " + "at the same time!\n"); + goto out; + } + + if (jbd2_journal_has_csum_v2or3(journal) && + JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + goto out; + } + + if (!jbd2_verify_csum_type(journal, sb)) { + printk(KERN_ERR "JBD2: Unknown checksum type\n"); + goto out; + } + + /* Load the checksum driver */ + if (jbd2_journal_has_csum_v2or3(journal)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + err = PTR_ERR(journal->j_chksum_driver); + journal->j_chksum_driver = NULL; + goto out; + } + } + + /* Check superblock checksum */ + if (!jbd2_superblock_csum_verify(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + goto out; + } + + /* Precompute checksum seed for all metadata */ + if (jbd2_journal_has_csum_v2or3(journal)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + + set_buffer_verified(bh); + + return 0; + +out: + journal_fail_superblock(journal); + return err; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ + +static int load_superblock(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_last = be32_to_cpu(sb->s_maxlen); + journal->j_errno = be32_to_cpu(sb->s_errno); + + return 0; +} + + +/** + * int jbd2_journal_load() - Read journal from disk. + * @journal: Journal to act on. + * + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ +int jbd2_journal_load(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { + printk(KERN_WARNING + "JBD2: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* + * Create a slab for this blocksize + */ + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); + if (err) + return err; + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (jbd2_journal_recover(journal)) + goto recovery_error; + + if (journal->j_failed_commit) { + printk(KERN_ERR "JBD2: journal transaction %u on %s " + "is corrupt.\n", journal->j_failed_commit, + journal->j_devname); + return -EIO; + } + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JBD2_ABORT; + journal->j_flags |= JBD2_LOADED; + return 0; + +recovery_error: + printk(KERN_WARNING "JBD2: recovery failed\n"); + return -EIO; +} + +/** + * void jbd2_journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. + * + * Release a journal_t structure once it is no longer in use by the + * journaled object. + * Return <0 if we couldn't clean up the journal. + */ +int jbd2_journal_destroy(journal_t *journal) +{ + int err = 0; + + /* Wait for the commit thread to wake up and die. */ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + jbd2_journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + + /* Totally anal locking here... */ + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + spin_unlock(&journal->j_list_lock); + + if (journal->j_sb_buffer) { + if (!is_journal_aborted(journal)) { + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } else + err = -EIO; + brelse(journal->j_sb_buffer); + } + + if (journal->j_proc_entry) + jbd2_stats_proc_exit(journal); + iput(journal->j_inode); + if (journal->j_revoke) + jbd2_journal_destroy_revoke(journal); + if (journal->j_chksum_driver) + crypto_free_shash(journal->j_chksum_driver); + kfree(journal->j_wbuf); + kfree(journal); + + return err; +} + + +/** + *int jbd2_journal_check_used_features () - Check if features specified are used. + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. + **/ + +int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + /* Load journal superblock if it is not loaded yet. */ + if (journal->j_format_version == 0 && + journal_get_superblock(journal) != 0) + return 0; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; +} + +/** + * int jbd2_journal_check_available_features() - Check feature set in journalling layer + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + +int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + if (!compat && !ro && !incompat) + return 1; + + /* We can support any known requested features iff the + * superblock is in version 2. Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && + (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; +} + +/** + * int jbd2_journal_set_features () - Mark a given journal feature in the superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ + +int jbd2_journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ +#define INCOMPAT_FEATURE_ON(f) \ + ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f))) +#define COMPAT_FEATURE_ON(f) \ + ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f))) + journal_superblock_t *sb; + + if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + /* If enabling v2 checksums, turn on v3 instead */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) { + incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2; + incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3; + } + + /* Asking for checksumming v3 and v1? Only give them v3. */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 && + compat & JBD2_FEATURE_COMPAT_CHECKSUM) + compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + /* If enabling v3 checksums, update superblock */ + if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + sb->s_checksum_type = JBD2_CRC32C_CHKSUM; + sb->s_feature_compat &= + ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); + + /* Load the checksum driver */ + if (journal->j_chksum_driver == NULL) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", + 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c " + "driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + + /* Precompute checksum seed for all metadata */ + journal->j_csum_seed = jbd2_chksum(journal, ~0, + sb->s_uuid, + sizeof(sb->s_uuid)); + } + } + + /* If enabling v1 checksums, downgrade superblock */ + if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) + sb->s_feature_incompat &= + ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 | + JBD2_FEATURE_INCOMPAT_CSUM_V3); + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; +#undef COMPAT_FEATURE_ON +#undef INCOMPAT_FEATURE_ON +} + +/* + * jbd2_journal_clear_features () - Clear a given journal feature in the + * superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Clear a given journal feature as present on the + * superblock. + */ +void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat &= ~cpu_to_be32(compat); + sb->s_feature_ro_compat &= ~cpu_to_be32(ro); + sb->s_feature_incompat &= ~cpu_to_be32(incompat); +} +EXPORT_SYMBOL(jbd2_journal_clear_features); + +/** + * int jbd2_journal_flush () - Flush journal + * @journal: Journal to act on. + * + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + +int jbd2_journal_flush(journal_t *journal) +{ + int err = 0; + transaction_t *transaction = NULL; + + write_lock(&journal->j_state_lock); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + __jbd2_log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) { + tid_t tid = transaction->t_tid; + + write_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); + } else { + write_unlock(&journal->j_state_lock); + } + + /* ...and flush everything in the log out to disk. */ + spin_lock(&journal->j_list_lock); + while (!err && journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + err = jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. Any future + * commits of data to the journal will restore the current + * s_start value. */ + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + write_lock(&journal->j_state_lock); + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + write_unlock(&journal->j_state_lock); + return 0; +} + +/** + * int jbd2_journal_wipe() - Wipe journal contents + * @journal: Journal to act on. + * @write: flag (see below) + * + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and jbd2_journal_load(). + * + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + +int jbd2_journal_wipe(journal_t *journal, int write) +{ + int err = 0; + + J_ASSERT (!(journal->j_flags & JBD2_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + if (!journal->j_tail) + goto no_recovery; + + printk(KERN_WARNING "JBD2: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = jbd2_journal_skip_recovery(journal); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } + + no_recovery: + return err; +} + +/* + * Journal abort has very specific semantics, which we describe + * for journal abort. + * + * Two internal functions, which provide abort to the jbd layer + * itself are here. + */ + +/* + * Quick version for internal journal use (doesn't lock the journal). + * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, + * and don't attempt to make any other journal updates. + */ +void __jbd2_journal_abort_hard(journal_t *journal) +{ + transaction_t *transaction; + + if (journal->j_flags & JBD2_ABORT) + return; + + printk(KERN_ERR "Aborting journal on device %s.\n", + journal->j_devname); + + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_ABORT; + transaction = journal->j_running_transaction; + if (transaction) + __jbd2_log_start_commit(journal, transaction->t_tid); + write_unlock(&journal->j_state_lock); +} + +/* Soft abort: record the abort error status in the journal superblock, + * but don't do any other IO. */ +static void __journal_abort_soft (journal_t *journal, int errno) +{ + if (journal->j_flags & JBD2_ABORT) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + __jbd2_journal_abort_hard(journal); + + if (errno) + jbd2_journal_update_sb_errno(journal); +} + +/** + * void jbd2_journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The jbd2_journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. + * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * jbd2_journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final jbd2_journal_stop, which will receive the -EIO error. + * + * Finally, the jbd2_journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). + * + */ + +void jbd2_journal_abort(journal_t *journal, int errno) +{ + __journal_abort_soft(journal, errno); +} + +/** + * int jbd2_journal_errno () - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno number set with jbd2_journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount time -EROFS will + * be returned. + */ +int jbd2_journal_errno(journal_t *journal) +{ + int err; + + read_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) + err = -EROFS; + else + err = journal->j_errno; + read_unlock(&journal->j_state_lock); + return err; +} + +/** + * int jbd2_journal_clear_err () - clears the journal's error state + * @journal: journal to act on. + * + * An error must be cleared or acked to take a FS out of readonly + * mode. + */ +int jbd2_journal_clear_err(journal_t *journal) +{ + int err = 0; + + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + write_unlock(&journal->j_state_lock); + return err; +} + +/** + * void jbd2_journal_ack_err() - Ack journal err. + * @journal: journal to act on. + * + * An error must be cleared or acked to take a FS out of readonly + * mode. + */ +void jbd2_journal_ack_err(journal_t *journal) +{ + write_lock(&journal->j_state_lock); + if (journal->j_errno) + journal->j_flags |= JBD2_ACK_ERR; + write_unlock(&journal->j_state_lock); +} + +int jbd2_journal_blocks_per_page(struct inode *inode) +{ + return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); +} + +/* + * helper functions to deal with 32 or 64bit block numbers. + */ +size_t journal_tag_bytes(journal_t *journal) +{ + size_t sz; + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + return sizeof(journal_block_tag3_t); + + sz = sizeof(journal_block_tag_t); + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + sz += sizeof(__u16); + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + return sz; + else + return sz - sizeof(__u32); +} + +/* + * JBD memory management + * + * These functions are used to allocate block-sized chunks of memory + * used for making copies of buffer_head data. Very often it will be + * page-sized chunks of data, but sometimes it will be in + * sub-page-size chunks. (For example, 16k pages on Power systems + * with a 4k block file system.) For blocks smaller than a page, we + * use a SLAB allocator. There are slab caches for each block size, + * which are allocated at mount time, if necessary, and we only free + * (all of) the slab caches when/if the jbd2 module is unloaded. For + * this reason we don't need to a mutex to protect access to + * jbd2_slab[] allocating or releasing memory; only in + * jbd2_journal_create_slab(). + */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + if (jbd2_slab[i]) + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + static DEFINE_MUTEX(jbd2_slab_create_mutex); + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + mutex_lock(&jbd2_slab_create_mutex); + if (jbd2_slab[i]) { + mutex_unlock(&jbd2_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + mutex_unlock(&jbd2_slab_create_mutex); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == NULL); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + flags |= __GFP_REPEAT; + if (size == PAGE_SIZE) + ptr = (void *)__get_free_pages(flags, 0); + else if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + ptr = (void *)__get_free_pages(flags, order); + else + ptr = vmalloc(size); + } else + ptr = kmem_cache_alloc(get_slab(size), flags); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! */ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size == PAGE_SIZE) { + free_pages((unsigned long)ptr, 0); + return; + } + if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + free_pages((unsigned long)ptr, order); + else + vfree(ptr); + return; + } + kmem_cache_free(get_slab(size), ptr); +}; + +/* + * Journal_head storage management + */ +static struct kmem_cache *jbd2_journal_head_cache; +#ifdef CONFIG_JBD2_DEBUG +static atomic_t nr_journal_heads = ATOMIC_INIT(0); +#endif + +static int jbd2_journal_init_journal_head_cache(void) +{ + int retval; + + J_ASSERT(jbd2_journal_head_cache == NULL); + jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", + sizeof(struct journal_head), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + retval = 0; + if (!jbd2_journal_head_cache) { + retval = -ENOMEM; + printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); + } + return retval; +} + +static void jbd2_journal_destroy_journal_head_cache(void) +{ + if (jbd2_journal_head_cache) { + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; + } +} + +/* + * journal_head splicing and dicing + */ +static struct journal_head *journal_alloc_journal_head(void) +{ + struct journal_head *ret; + +#ifdef CONFIG_JBD2_DEBUG + atomic_inc(&nr_journal_heads); +#endif + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); + if (!ret) { + jbd_debug(1, "out of memory for journal_head\n"); + pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); + while (!ret) { + yield(); + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); + } + } + return ret; +} + +static void journal_free_journal_head(struct journal_head *jh) +{ +#ifdef CONFIG_JBD2_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, JBD2_POISON_FREE, sizeof(*jh)); +#endif + kmem_cache_free(jbd2_journal_head_cache, jh); +} + +/* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, jbd2_journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * jbd2_journal_put_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = jbd2_journal_add_journal_head(bh); + * ... + * (Get another reference for transaction) + * jbd2_journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * jbd2_journal_put_journal_head(jh); + */ + +/* + * Give a buffer_head a journal_head. + * + * May sleep. + */ +struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh; + struct journal_head *new_jh = NULL; + +repeat: + if (!buffer_jbd(bh)) + new_jh = journal_alloc_journal_head(); + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_page && bh->b_page->mapping)); + + if (!new_jh) { + jbd_unlock_bh_journal_head(bh); + goto repeat; + } + + jh = new_jh; + new_jh = NULL; /* We consumed it */ + set_buffer_jbd(bh); + bh->b_private = jh; + jh->b_bh = bh; + get_bh(bh); + BUFFER_TRACE(bh, "added journal_head"); + } + jh->b_jcount++; + jbd_unlock_bh_journal_head(bh); + if (new_jh) + journal_free_journal_head(new_jh); + return bh->b_private; +} + +/* + * Grab a ref against this buffer_head's journal_head. If it ended up not + * having a journal_head, return NULL + */ +struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = NULL; + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + jh->b_jcount++; + } + jbd_unlock_bh_journal_head(bh); + return jh; +} + +static void __journal_remove_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + + J_ASSERT_JH(jh, jh->b_jcount >= 0); + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd2_free(jh->b_frozen_data, bh->b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd2_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); +} + +/* + * Drop a reference on the passed journal_head. If it fell to zero then + * release the journal_head from the buffer_head. + */ +void jbd2_journal_put_journal_head(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_journal_head(bh); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount) { + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); + __brelse(bh); + } else + jbd_unlock_bh_journal_head(bh); +} + +/* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. + */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + + +#ifdef CONFIG_PROC_FS + +#define JBD2_STATS_PROC_NAME "fs/jbd2" + +static void __init jbd2_create_jbd_stats_proc_entry(void) +{ + proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); +} + +static void __exit jbd2_remove_jbd_stats_proc_entry(void) +{ + if (proc_jbd2_stats) + remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); +} + +#else + +#define jbd2_create_jbd_stats_proc_entry() do {} while (0) +#define jbd2_remove_jbd_stats_proc_entry() do {} while (0) + +#endif + +struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; + +static int __init jbd2_journal_init_handle_cache(void) +{ + jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); + if (jbd2_handle_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create handle cache\n"); + return -ENOMEM; + } + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (jbd2_inode_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create inode cache\n"); + kmem_cache_destroy(jbd2_handle_cache); + return -ENOMEM; + } + return 0; +} + +static void jbd2_journal_destroy_handle_cache(void) +{ + if (jbd2_handle_cache) + kmem_cache_destroy(jbd2_handle_cache); + if (jbd2_inode_cache) + kmem_cache_destroy(jbd2_inode_cache); + +} + +/* + * Module startup and shutdown + */ + +static int __init journal_init_caches(void) +{ + int ret; + + ret = jbd2_journal_init_revoke_caches(); + if (ret == 0) + ret = jbd2_journal_init_journal_head_cache(); + if (ret == 0) + ret = jbd2_journal_init_handle_cache(); + if (ret == 0) + ret = jbd2_journal_init_transaction_cache(); + return ret; +} + +static void jbd2_journal_destroy_caches(void) +{ + jbd2_journal_destroy_revoke_caches(); + jbd2_journal_destroy_journal_head_cache(); + jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_transaction_cache(); + jbd2_journal_destroy_slabs(); +} + +static int __init journal_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); + + ret = journal_init_caches(); + if (ret == 0) { + jbd2_create_jbd_stats_proc_entry(); + } else { + jbd2_journal_destroy_caches(); + } + return ret; +} + +static void __exit journal_exit(void) +{ +#ifdef CONFIG_JBD2_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n); +#endif + jbd2_remove_jbd_stats_proc_entry(); + jbd2_journal_destroy_caches(); +} + +MODULE_LICENSE("GPL"); +module_init(journal_init); +module_exit(journal_exit); + diff --git a/kernel/fs/jbd2/recovery.c b/kernel/fs/jbd2/recovery.c new file mode 100644 index 000000000..a9079d035 --- /dev/null +++ b/kernel/fs/jbd2/recovery.c @@ -0,0 +1,880 @@ +/* + * linux/fs/jbd2/recovery.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#include +#endif + +/* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. + */ +struct recovery_info +{ + tid_t start_transaction; + tid_t end_transaction; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; +}; + +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); +static int scan_revoke_records(journal_t *, struct buffer_head *, + tid_t, struct recovery_info *); + +#ifdef __KERNEL__ + +/* Release readahead buffers after use */ +static void journal_brelse_array(struct buffer_head *b[], int n) +{ + while (--n >= 0) + brelse (b[n]); +} + + +/* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + +#define MAXBUF 8 +static int do_readahead(journal_t *journal, unsigned int start) +{ + int err; + unsigned int max, nbufs, next; + unsigned long long blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_maxlen) + max = journal->j_maxlen; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + err = jbd2_journal_bmap(journal, next, &blocknr); + + if (err) { + printk(KERN_ERR "JBD2: bad block at offset %u\n", + next); + goto failed; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + err = -ENOMEM; + goto failed; + } + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + ll_rw_block(READ, nbufs, bufs); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + ll_rw_block(READ, nbufs, bufs); + err = 0; + +failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); + return err; +} + +#endif /* __KERNEL__ */ + + +/* + * Read a block from the journal + */ + +static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) +{ + int err; + unsigned long long blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + if (offset >= journal->j_maxlen) { + printk(KERN_ERR "JBD2: corrupted journal superblock\n"); + return -EIO; + } + + err = jbd2_journal_bmap(journal, offset, &blocknr); + + if (err) { + printk(KERN_ERR "JBD2: bad block at offset %u\n", + offset); + return err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* If this is a brand new buffer, start readahead. + Otherwise, we assume we are already reading it. */ + if (!buffer_req(bh)) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk(KERN_ERR "JBD2: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; +} + +static int jbd2_descr_block_csum_verify(journal_t *j, + void *buf) +{ + struct jbd2_journal_block_tail *tail; + __be32 provided; + __u32 calculated; + + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + provided = tail->t_checksum; + tail->t_checksum = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + tail->t_checksum = provided; + + return provided == cpu_to_be32(calculated); +} + +/* + * Count the number of in-use tags in a journal descriptor block. + */ + +static int count_tags(journal_t *journal, struct buffer_head *bh) +{ + char * tagp; + journal_block_tag_t * tag; + int nr = 0, size = journal->j_blocksize; + int tag_bytes = journal_tag_bytes(journal); + + if (jbd2_journal_has_csum_v2or3(journal)) + size -= sizeof(struct jbd2_journal_block_tail); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + tag_bytes) <= size) { + tag = (journal_block_tag_t *) tagp; + + nr++; + tagp += tag_bytes; + if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) + tagp += 16; + + if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) + break; + } + + return nr; +} + + +/* Make sure we wrap around the log correctly! */ +#define wrap(journal, var) \ +do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ +} while (0) + +/** + * jbd2_journal_recover - recovers a on-disk journal + * @journal: the journal to recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ +int jbd2_journal_recover(journal_t *journal) +{ + int err, err2; + journal_superblock_t * sb; + + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. + */ + + if (!sb->s_start) { + jbd_debug(1, "No recovery required, last transaction %d\n", + be32_to_cpu(sb->s_sequence)); + journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + return 0; + } + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd_debug(1, "JBD2: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + + jbd2_journal_clear_revoke(journal); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + /* Make sure all replayed data is on permanent storage */ + if (journal->j_flags & JBD2_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!err) + err = err2; + } + return err; +} + +/** + * jbd2_journal_skip_recovery - Start journal and wipe exiting records + * @journal: journal to startup + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * This function does'nt appear to be exorted.. + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. + */ +int jbd2_journal_skip_recovery(journal_t *journal) +{ + int err; + + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD2: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + } else { +#ifdef CONFIG_JBD2_DEBUG + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); + jbd_debug(1, + "JBD2: ignoring %d transaction%s from the journal.\n", + dropped, (dropped == 1) ? "" : "s"); +#endif + journal->j_transaction_sequence = ++info.end_transaction; + } + + journal->j_tail = 0; + return err; +} + +static inline unsigned long long read_tag_block(journal_t *journal, + journal_block_tag_t *tag) +{ + unsigned long long block = be32_to_cpu(tag->t_blocknr); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; + return block; +} + +/* + * calc_chksums calculates the checksums for the blocks described in the + * descriptor block. + */ +static int calc_chksums(journal_t *journal, struct buffer_head *bh, + unsigned long *next_log_block, __u32 *crc32_sum) +{ + int i, num_blks, err; + unsigned long io_block; + struct buffer_head *obh; + + num_blks = count_tags(journal, bh); + /* Calculate checksum of the descriptor block. */ + *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); + + for (i = 0; i < num_blks; i++) { + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + printk(KERN_ERR "JBD2: IO error %d recovering block " + "%lu in log\n", err, io_block); + return 1; + } else { + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, + obh->b_size); + } + put_bh(obh); + } + return 0; +} + +static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) +{ + struct commit_header *h; + __be32 provided; + __u32 calculated; + + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + h = buf; + provided = h->h_chksum[0]; + h->h_chksum[0] = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + h->h_chksum[0] = provided; + + return provided == cpu_to_be32(calculated); +} + +static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, + void *buf, __u32 sequence) +{ + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; + __u32 csum32; + __be32 seq; + + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + seq = cpu_to_be32(sequence); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); + + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + return tag3->t_checksum == cpu_to_be32(csum32); + else + return tag->t_checksum == cpu_to_be16(csum32); +} + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int first_commit_ID, next_commit_ID; + unsigned long next_log_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head * bh; + unsigned int sequence; + int blocktype; + int tag_bytes = journal_tag_bytes(journal); + __u32 crc32_sum = ~0; /* Transactional Checksums */ + int descr_csum_size = 0; + int block_error = 0; + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = be32_to_cpu(sb->s_sequence); + next_log_block = be32_to_cpu(sb->s_start); + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + + jbd_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + int flags; + char * tagp; + journal_block_tag_t * tag; + struct buffer_head * obh; + struct buffer_head * nbh; + + cond_resched(); + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. */ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. */ + + jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { + brelse(bh); + break; + } + + blocktype = be32_to_cpu(tmp->h_blocktype); + sequence = be32_to_cpu(tmp->h_sequence); + jbd_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) { + brelse(bh); + break; + } + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JBD2_DESCRIPTOR_BLOCK: + /* Verify checksum first */ + if (jbd2_journal_has_csum_v2or3(journal)) + descr_csum_size = + sizeof(struct jbd2_journal_block_tail); + if (descr_csum_size > 0 && + !jbd2_descr_block_csum_verify(journal, + bh->b_data)) { + printk(KERN_ERR "JBD2: Invalid checksum " + "recovering block %lu in log\n", + next_log_block); + err = -EIO; + brelse(bh); + goto failed; + } + + /* If it is a valid descriptor block, replay it + * in pass REPLAY; if journal_checksums enabled, then + * calculate checksums in PASS_SCAN, otherwise, + * just skip over the blocks it describes. */ + if (pass != PASS_REPLAY) { + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM) && + !info->end_transaction) { + if (calc_chksums(journal, bh, + &next_log_block, + &crc32_sum)) { + put_bh(bh); + break; + } + put_bh(bh); + continue; + } + next_log_block += count_tags(journal, bh); + wrap(journal, next_log_block); + put_bh(bh); + continue; + } + + /* A descriptor block: we can now write all of + * the data blocks. Yay, useful work is finally + * getting done here! */ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data + tag_bytes) + <= journal->j_blocksize - descr_csum_size) { + unsigned long io_block; + + tag = (journal_block_tag_t *) tagp; + flags = be16_to_cpu(tag->t_flags); + + io_block = next_log_block++; + wrap(journal, next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but + * report failure at the end. */ + success = err; + printk(KERN_ERR + "JBD2: IO error %d recovering " + "block %ld in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(journal, + tag); + + /* If the block has been + * revoked, then we're all done + * here. */ + if (jbd2_journal_test_revoke + (journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify( + journal, tag, obh->b_data, + be32_to_cpu(tmp->h_sequence))) { + brelse(obh); + success = -EIO; + printk(KERN_ERR "JBD2: Invalid " + "checksum recovering " + "block %llu in log\n", + blocknr); + block_error = 1; + goto skip_write; + } + + /* Find a buffer for the new + * data being restored */ + nbh = __getblk(journal->j_fs_dev, + blocknr, + journal->j_blocksize); + if (nbh == NULL) { + printk(KERN_ERR + "JBD2: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + brelse(obh); + goto failed; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, + journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + /* ll_rw_block(WRITE, 1, &nbh); */ + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + + skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + + case JBD2_COMMIT_BLOCK: + /* How to differentiate between interrupted commit + * and journal corruption ? + * + * {nth transaction} + * Checksum Verification Failed + * | + * ____________________ + * | | + * async_commit sync_commit + * | | + * | GO TO NEXT "Journal Corruption" + * | TRANSACTION + * | + * {(n+1)th transanction} + * | + * _______|______________ + * | | + * Commit block found Commit block not found + * | | + * "Journal Corruption" | + * _____________|_________ + * | | + * nth trans corrupt OR nth trans + * and (n+1)th interrupted interrupted + * before commit block + * could reach the disk. + * (Cannot find the difference in above + * mentioned conditions. Hence assume + * "Interrupted Commit".) + */ + + /* Found an expected commit block: if checksums + * are present verify them in PASS_SCAN; else not + * much to do other than move on to the next sequence + * number. */ + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + int chksum_err, chksum_seen; + struct commit_header *cbh = + (struct commit_header *)bh->b_data; + unsigned found_chksum = + be32_to_cpu(cbh->h_chksum[0]); + + chksum_err = chksum_seen = 0; + + if (info->end_transaction) { + journal->j_failed_commit = + info->end_transaction; + brelse(bh); + break; + } + + if (crc32_sum == found_chksum && + cbh->h_chksum_type == JBD2_CRC32_CHKSUM && + cbh->h_chksum_size == + JBD2_CRC32_CHKSUM_SIZE) + chksum_seen = 1; + else if (!(cbh->h_chksum_type == 0 && + cbh->h_chksum_size == 0 && + found_chksum == 0 && + !chksum_seen)) + /* + * If fs is mounted using an old kernel and then + * kernel with journal_chksum is used then we + * get a situation where the journal flag has + * checksum flag set but checksums are not + * present i.e chksum = 0, in the individual + * commit blocks. + * Hence to avoid checksum failures, in this + * situation, this extra check is added. + */ + chksum_err = 1; + + if (chksum_err) { + info->end_transaction = next_commit_ID; + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ + journal->j_failed_commit = + next_commit_ID; + brelse(bh); + break; + } + } + crc32_sum = ~0; + } + if (pass == PASS_SCAN && + !jbd2_commit_block_csum_verify(journal, + bh->b_data)) { + info->end_transaction = next_commit_ID; + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + journal->j_failed_commit = + next_commit_ID; + brelse(bh); + break; + } + } + brelse(bh); + next_commit_ID++; + continue; + + case JBD2_REVOKE_BLOCK: + /* If we aren't in the REVOKE pass, then we can + * just skip over this block. */ + if (pass != PASS_REVOKE) { + brelse(bh); + continue; + } + + err = scan_revoke_records(journal, bh, + next_commit_ID, info); + brelse(bh); + if (err) + goto failed; + continue; + + default: + jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + brelse(bh); + goto done; + } + } + + done: + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) { + if (!info->end_transaction) + info->end_transaction = next_commit_ID; + } else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { + printk(KERN_ERR "JBD2: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + if (block_error && success == 0) + success = -EIO; + return success; + + failed: + return err; +} + +static int jbd2_revoke_block_csum_verify(journal_t *j, + void *buf) +{ + struct jbd2_journal_revoke_tail *tail; + __be32 provided; + __u32 calculated; + + if (!jbd2_journal_has_csum_v2or3(j)) + return 1; + + tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - + sizeof(struct jbd2_journal_revoke_tail)); + provided = tail->r_checksum; + tail->r_checksum = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + tail->r_checksum = provided; + + return provided == cpu_to_be32(calculated); +} + +/* Scan a revoke record, marking all blocks mentioned as revoked. */ + +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, + tid_t sequence, struct recovery_info *info) +{ + jbd2_journal_revoke_header_t *header; + int offset, max; + int csum_size = 0; + __u32 rcount; + int record_len = 4; + + header = (jbd2_journal_revoke_header_t *) bh->b_data; + offset = sizeof(jbd2_journal_revoke_header_t); + rcount = be32_to_cpu(header->r_count); + + if (!jbd2_revoke_block_csum_verify(journal, header)) + return -EINVAL; + + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_revoke_tail); + if (rcount > journal->j_blocksize - csum_size) + return -EINVAL; + max = rcount; + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + record_len = 8; + + while (offset + record_len <= max) { + unsigned long long blocknr; + int err; + + if (record_len == 4) + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + else + blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); + offset += record_len; + err = jbd2_journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + ++info->nr_revokes; + } + return 0; +} diff --git a/kernel/fs/jbd2/revoke.c b/kernel/fs/jbd2/revoke.c new file mode 100644 index 000000000..14214da80 --- /dev/null +++ b/kernel/fs/jbd2/revoke.c @@ -0,0 +1,764 @@ +/* + * linux/fs/jbd2/revoke.c + * + * Written by Stephen C. Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +static struct kmem_cache *jbd2_revoke_record_cache; +static struct kmem_cache *jbd2_revoke_table_cache; + +/* Each revoke record represents one single revoked block. During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + +struct jbd2_revoke_record_s +{ + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned long long blocknr; +}; + + +/* The revoke table is just a simple hash table of revoke records. */ +struct jbd2_revoke_table_s +{ + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. */ + int hash_size; + int hash_shift; + struct list_head *hash_table; +}; + + +#ifdef __KERNEL__ +static void write_one_revoke_record(journal_t *, transaction_t *, + struct list_head *, + struct buffer_head **, int *, + struct jbd2_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct buffer_head *, int, int); +#endif + +/* Utility functions to maintain the revoke table */ + +static inline int hash(journal_t *journal, unsigned long long block) +{ + return hash_64(block, journal->j_revoke->hash_shift); +} + +static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, + tid_t seq) +{ + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + +repeat: + record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); + if (!record) + goto oom; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + spin_lock(&journal->j_revoke_lock); + list_add(&record->hash, hash_list); + spin_unlock(&journal->j_revoke_lock); + return 0; + +oom: + if (!journal_oom_retry) + return -ENOMEM; + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); + yield(); + goto repeat; +} + +/* Find a revoke record in the journal's hash table. */ + +static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned long long blocknr) +{ + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + spin_lock(&journal->j_revoke_lock); + record = (struct jbd2_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) { + spin_unlock(&journal->j_revoke_lock); + return record; + } + record = (struct jbd2_revoke_record_s *) record->hash.next; + } + spin_unlock(&journal->j_revoke_lock); + return NULL; +} + +void jbd2_journal_destroy_revoke_caches(void) +{ + if (jbd2_revoke_record_cache) { + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + } + if (jbd2_revoke_table_cache) { + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; + } +} + +int __init jbd2_journal_init_revoke_caches(void) +{ + J_ASSERT(!jbd2_revoke_record_cache); + J_ASSERT(!jbd2_revoke_table_cache); + + jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); + if (!jbd2_revoke_record_cache) + goto record_cache_failure; + + jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, + SLAB_TEMPORARY); + if (!jbd2_revoke_table_cache) + goto table_cache_failure; + return 0; +table_cache_failure: + jbd2_journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; +} + +static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +{ + int shift = 0; + int tmp = hash_size; + struct jbd2_revoke_table_s *table; + + table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; + + while((tmp >>= 1UL) != 0UL) + shift++; + + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = + kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); + if (!table->hash_table) { + kmem_cache_free(jbd2_revoke_table_cache, table); + table = NULL; + goto out; + } + + for (tmp = 0; tmp < hash_size; tmp++) + INIT_LIST_HEAD(&table->hash_table[tmp]); + +out: + return table; +} + +static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); + } + + kfree(table->hash_table); + kmem_cache_free(jbd2_revoke_table_cache, table); +} + +/* Initialise the revoke table for a given journal to a given size. */ +int jbd2_journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); + J_ASSERT(is_power_of_2(hash_size)); + + journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; + + journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; + + journal->j_revoke = journal->j_revoke_table[1]; + + spin_lock_init(&journal->j_revoke_lock); + + return 0; + +fail1: + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} + +/* Destroy a journal's revoke table. The table must already be empty! */ +void jbd2_journal_destroy_revoke(journal_t *journal) +{ + journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); +} + + +#ifdef __KERNEL__ + +/* + * jbd2_journal_revoke: revoke a given buffer_head from the journal. This + * prevents the block from being replayed during recovery if we take a + * crash after this current transaction commits. Any subsequent + * metadata writes of the buffer in this transaction cancel the + * revoke. + * + * Note that this call may block --- it is up to the caller to make + * sure that there are no further calls to journal_write_metadata + * before the revoke is complete. In ext3, this implies calling the + * revoke before clearing the block bitmap when we are deleting + * metadata. + * + * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a + * parameter, but does _not_ forget the buffer_head if the bh was only + * found implicitly. + * + * bh_in may not be a journalled buffer - it may have come off + * the hash tables without an attached journal_head. + * + * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count + * by one. + */ + +int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, + struct buffer_head *bh_in) +{ + struct buffer_head *bh = NULL; + journal_t *journal; + struct block_device *bdev; + int err; + + might_sleep(); + if (bh_in) + BUFFER_TRACE(bh_in, "enter"); + + journal = handle->h_transaction->t_journal; + if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ + J_ASSERT (!"Cannot set revoke feature!"); + return -EINVAL; + } + + bdev = journal->j_fs_dev; + bh = bh_in; + + if (!bh) { + bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh) + BUFFER_TRACE(bh, "found on hash"); + } +#ifdef JBD2_EXPENSIVE_CHECKING + else { + struct buffer_head *bh2; + + /* If there is a different buffer_head lying around in + * memory anywhere... */ + bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh2) { + /* ... and it has RevokeValid status... */ + if (bh2 != bh && buffer_revokevalid(bh2)) + /* ...then it better be revoked too, + * since it's illegal to create a revoke + * record against a buffer_head which is + * not marked revoked --- that would + * risk missing a subsequent revoke + * cancel. */ + J_ASSERT_BH(bh2, buffer_revoked(bh2)); + put_bh(bh2); + } + } +#endif + + /* We really ought not ever to revoke twice in a row without + first having the revoke cancelled: it's illegal to free a + block twice without allocating it in between! */ + if (bh) { + if (!J_EXPECT_BH(bh, !buffer_revoked(bh), + "inconsistent data on disk")) { + if (!bh_in) + brelse(bh); + return -EIO; + } + set_buffer_revoked(bh); + set_buffer_revokevalid(bh); + if (bh_in) { + BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); + jbd2_journal_forget(handle, bh_in); + } else { + BUFFER_TRACE(bh, "call brelse"); + __brelse(bh); + } + } + + jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); + err = insert_revoke_hash(journal, blocknr, + handle->h_transaction->t_tid); + BUFFER_TRACE(bh_in, "exit"); + return err; +} + +/* + * Cancel an outstanding revoke. For use only internally by the + * journaling code (called from jbd2_journal_get_write_access). + * + * We trust buffer_revoked() on the buffer if the buffer is already + * being journaled: if there is no revoke pending on the buffer, then we + * don't do anything here. + * + * This would break if it were possible for a buffer to be revoked and + * discarded, and then reallocated within the same transaction. In such + * a case we would have lost the revoked bit, but when we arrived here + * the second time we would still have a pending revoke to cancel. So, + * do not trust the Revoked bit on buffers unless RevokeValid is also + * set. + */ +int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +{ + struct jbd2_revoke_record_s *record; + journal_t *journal = handle->h_transaction->t_journal; + int need_cancel; + int did_revoke = 0; /* akpm: debug */ + struct buffer_head *bh = jh2bh(jh); + + jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + + /* Is the existing Revoke bit valid? If so, we trust it, and + * only perform the full cancel if the revoke bit is set. If + * not, we can't trust the revoke bit, and we need to do the + * full search for a revoke record. */ + if (test_set_buffer_revokevalid(bh)) { + need_cancel = test_clear_buffer_revoked(bh); + } else { + need_cancel = 1; + clear_buffer_revoked(bh); + } + + if (need_cancel) { + record = find_revoke_record(journal, bh->b_blocknr); + if (record) { + jbd_debug(4, "cancelled existing revoke on " + "blocknr %llu\n", (unsigned long long)bh->b_blocknr); + spin_lock(&journal->j_revoke_lock); + list_del(&record->hash); + spin_unlock(&journal->j_revoke_lock); + kmem_cache_free(jbd2_revoke_record_cache, record); + did_revoke = 1; + } + } + +#ifdef JBD2_EXPENSIVE_CHECKING + /* There better not be one left behind by now! */ + record = find_revoke_record(journal, bh->b_blocknr); + J_ASSERT_JH(jh, record == NULL); +#endif + + /* Finally, have we just cleared revoke on an unhashed + * buffer_head? If so, we'd better make sure we clear the + * revoked status on any hashed alias too, otherwise the revoke + * state machine will get very upset later on. */ + if (need_cancel) { + struct buffer_head *bh2; + bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + if (bh2) { + if (bh2 != bh) + clear_buffer_revoked(bh2); + __brelse(bh2); + } + } + return did_revoke; +} + +/* + * journal_clear_revoked_flag clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffers in the next + * transaction which is going to be started. + */ +void jbd2_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd2_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd2_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd2_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + +/* journal_switch_revoke table select j_revoke for next transaction + * we do not want to suspend any processing until all revokes are + * written -bzzz + */ +void jbd2_journal_switch_revoke_table(journal_t *journal) +{ + int i; + + if (journal->j_revoke == journal->j_revoke_table[0]) + journal->j_revoke = journal->j_revoke_table[1]; + else + journal->j_revoke = journal->j_revoke_table[0]; + + for (i = 0; i < journal->j_revoke->hash_size; i++) + INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); +} + +/* + * Write revoke records to the journal for all entries in the current + * revoke hash, deleting the entries as we go. + */ +void jbd2_journal_write_revoke_records(journal_t *journal, + transaction_t *transaction, + struct list_head *log_bufs, + int write_op) +{ + struct buffer_head *descriptor; + struct jbd2_revoke_record_s *record; + struct jbd2_revoke_table_s *revoke; + struct list_head *hash_list; + int i, offset, count; + + descriptor = NULL; + offset = 0; + count = 0; + + /* select revoke table for committing transaction */ + revoke = journal->j_revoke == journal->j_revoke_table[0] ? + journal->j_revoke_table[1] : journal->j_revoke_table[0]; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + + while (!list_empty(hash_list)) { + record = (struct jbd2_revoke_record_s *) + hash_list->next; + write_one_revoke_record(journal, transaction, log_bufs, + &descriptor, &offset, + record, write_op); + count++; + list_del(&record->hash); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } + if (descriptor) + flush_descriptor(journal, descriptor, offset, write_op); + jbd_debug(1, "Wrote %d revoke records\n", count); +} + +/* + * Write out one revoke record. We need to create a new descriptor + * block if the old one is full or if we have not already created one. + */ + +static void write_one_revoke_record(journal_t *journal, + transaction_t *transaction, + struct list_head *log_bufs, + struct buffer_head **descriptorp, + int *offsetp, + struct jbd2_revoke_record_s *record, + int write_op) +{ + int csum_size = 0; + struct buffer_head *descriptor; + int sz, offset; + journal_header_t *header; + + /* If we are already aborting, this all becomes a noop. We + still need to go round the loop in + jbd2_journal_write_revoke_records in order to free all of the + revoke records: only the IO to the journal is omitted. */ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Do we need to leave space at the end for a checksum? */ + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_revoke_tail); + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + sz = 8; + else + sz = 4; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset + sz > journal->j_blocksize - csum_size) { + flush_descriptor(journal, descriptor, offset, write_op); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = jbd2_journal_get_descriptor_buffer(journal); + if (!descriptor) + return; + header = (journal_header_t *)descriptor->b_data; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); + header->h_sequence = cpu_to_be32(transaction->t_tid); + + /* Record it so that we can wait for IO completion later */ + BUFFER_TRACE(descriptor, "file in log_bufs"); + jbd2_file_log_bh(log_bufs, descriptor); + + offset = sizeof(jbd2_journal_revoke_header_t); + *descriptorp = descriptor; + } + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + * ((__be64 *)(&descriptor->b_data[offset])) = + cpu_to_be64(record->blocknr); + else + * ((__be32 *)(&descriptor->b_data[offset])) = + cpu_to_be32(record->blocknr); + offset += sz; + + *offsetp = offset; +} + +static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) +{ + struct jbd2_journal_revoke_tail *tail; + __u32 csum; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_revoke_tail)); + tail->r_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + tail->r_checksum = cpu_to_be32(csum); +} + +/* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + +static void flush_descriptor(journal_t *journal, + struct buffer_head *descriptor, + int offset, int write_op) +{ + jbd2_journal_revoke_header_t *header; + + if (is_journal_aborted(journal)) { + put_bh(descriptor); + return; + } + + header = (jbd2_journal_revoke_header_t *)descriptor->b_data; + header->r_count = cpu_to_be32(offset); + jbd2_revoke_csum_set(journal, descriptor); + + set_buffer_jwrite(descriptor); + BUFFER_TRACE(descriptor, "write"); + set_buffer_dirty(descriptor); + write_dirty_buffer(descriptor, write_op); +} +#endif + +/* + * Revoke support for recovery. + * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + +/* + * First, setting revoke records. We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + +int jbd2_journal_set_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) +{ + struct jbd2_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); +} + +/* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need replayed. + */ + +int jbd2_journal_test_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) +{ + struct jbd2_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; +} + +/* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. + */ + +void jbd2_journal_clear_revoke(journal_t *journal) +{ + int i; + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + struct jbd2_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd2_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } +} diff --git a/kernel/fs/jbd2/transaction.c b/kernel/fs/jbd2/transaction.c new file mode 100644 index 000000000..ff2f2e6ad --- /dev/null +++ b/kernel/fs/jbd2/transaction.c @@ -0,0 +1,2487 @@ +/* + * linux/fs/jbd2/transaction.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); +static void __jbd2_journal_unfile_buffer(struct journal_head *jh); + +static struct kmem_cache *transaction_cache; +int __init jbd2_journal_init_transaction_cache(void) +{ + J_ASSERT(!transaction_cache); + transaction_cache = kmem_cache_create("jbd2_transaction_s", + sizeof(transaction_t), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (transaction_cache) + return 0; + return -ENOMEM; +} + +void jbd2_journal_destroy_transaction_cache(void) +{ + if (transaction_cache) { + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; + } +} + +void jbd2_journal_free_transaction(transaction_t *transaction) +{ + if (unlikely(ZERO_OR_NULL_PTR(transaction))) + return; + kmem_cache_free(transaction_cache, transaction); +} + +/* + * jbd2_get_transaction: obtain a new transaction_t object. + * + * Simply allocate and initialise a new transaction. Create it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition. + * + */ + +static transaction_t * +jbd2_get_transaction(journal_t *journal, transaction_t *transaction) +{ + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + spin_lock_init(&transaction->t_handle_lock); + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, + atomic_read(&journal->j_reserved_credits)); + atomic_set(&transaction->t_handle_count, 0); + INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); + + J_ASSERT(journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + transaction->t_max_wait = 0; + transaction->t_start = jiffies; + transaction->t_requested = 0; + + return transaction; +} + +/* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + +/* + * Update transaction's maximum wait time, if debugging is enabled. + * + * In order for t_max_wait to be reliable, it must be protected by a + * lock. But doing so will mean that start_this_handle() can not be + * run in parallel on SMP systems, which limits our scalability. So + * unless debugging is enabled, we no longer update t_max_wait, which + * means that maximum wait time reported by the jbd2_run_stats + * tracepoint will always be zero. + */ +static inline void update_t_max_wait(transaction_t *transaction, + unsigned long ts) +{ +#ifdef CONFIG_JBD2_DEBUG + if (jbd2_journal_enable_debug && + time_after(transaction->t_start, ts)) { + ts = jbd2_time_diff(ts, transaction->t_start); + spin_lock(&transaction->t_handle_lock); + if (ts > transaction->t_max_wait) + transaction->t_max_wait = ts; + spin_unlock(&transaction->t_handle_lock); + } +#endif +} + +/* + * Wait until running transaction passes T_LOCKED state. Also starts the commit + * if needed. The function expects running transaction to exist and releases + * j_state_lock. + */ +static void wait_transaction_locked(journal_t *journal) + __releases(journal->j_state_lock) +{ + DEFINE_WAIT(wait); + int need_to_start; + tid_t tid = journal->j_running_transaction->t_tid; + + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); +} + +static void sub_reserved_credits(journal_t *journal, int blocks) +{ + atomic_sub(blocks, &journal->j_reserved_credits); + wake_up(&journal->j_wait_reserved); +} + +/* + * Wait until we can add credits for handle to the running transaction. Called + * with j_state_lock held for reading. Returns 0 if handle joined the running + * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and + * caller must retry. + */ +static int add_transaction_credits(journal_t *journal, int blocks, + int rsv_blocks) +{ + transaction_t *t = journal->j_running_transaction; + int needed; + int total = blocks + rsv_blocks; + + /* + * If the current transaction is locked down for commit, wait + * for the lock to be released. + */ + if (t->t_state == T_LOCKED) { + wait_transaction_locked(journal); + return 1; + } + + /* + * If there is not enough space left in the log to write all + * potential buffers requested by this operation, we need to + * stall pending a log checkpoint to free some more log space. + */ + needed = atomic_add_return(total, &t->t_outstanding_credits); + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, + * then start to commit it: we can then go back and + * attach this handle to a new transaction. + */ + atomic_sub(total, &t->t_outstanding_credits); + wait_transaction_locked(journal); + return 1; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + */ + if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { + atomic_sub(total, &t->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + return 1; + } + + /* No reservation? We are done... */ + if (!rsv_blocks) + return 0; + + needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); + /* We allow at most half of a transaction to be reserved */ + if (needed > journal->j_max_transaction_buffers / 2) { + sub_reserved_credits(journal, rsv_blocks); + atomic_sub(total, &t->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) + rsv_blocks + <= journal->j_max_transaction_buffers / 2); + return 1; + } + return 0; +} + +/* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + +static int start_this_handle(journal_t *journal, handle_t *handle, + gfp_t gfp_mask) +{ + transaction_t *transaction, *new_transaction = NULL; + int blocks = handle->h_buffer_credits; + int rsv_blocks = 0; + unsigned long ts = jiffies; + + /* + * 1/2 of transaction can be reserved so we can practically handle + * only 1/2 of maximum transaction size per operation + */ + if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { + printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", + current->comm, blocks, + journal->j_max_transaction_buffers / 2); + return -ENOSPC; + } + + if (handle->h_rsv_handle) + rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + +alloc_transaction: + if (!journal->j_running_transaction) { + new_transaction = kmem_cache_zalloc(transaction_cache, + gfp_mask); + if (!new_transaction) { + /* + * If __GFP_FS is not present, then we may be + * being called from inside the fs writeback + * layer, so we MUST NOT fail. Since + * __GFP_NOFAIL is going away, we will arrange + * to retry the allocation ourselves. + */ + if ((gfp_mask & __GFP_FS) == 0) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; + } + return -ENOMEM; + } + } + + jbd_debug(3, "New handle %p going live.\n", handle); + + /* + * We need to hold j_state_lock until t_updates has been incremented, + * for proper journal barrier handling + */ +repeat: + read_lock(&journal->j_state_lock); + BUG_ON(journal->j_flags & JBD2_UNMOUNT); + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { + read_unlock(&journal->j_state_lock); + jbd2_journal_free_transaction(new_transaction); + return -EROFS; + } + + /* + * Wait on the journal's transaction barrier if necessary. Specifically + * we allow reserved handles to proceed because otherwise commit could + * deadlock on page writeback not being able to complete. + */ + if (!handle->h_reserved && journal->j_barrier_count) { + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + goto repeat; + } + + if (!journal->j_running_transaction) { + read_unlock(&journal->j_state_lock); + if (!new_transaction) + goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction && + (handle->h_reserved || !journal->j_barrier_count)) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; + } + write_unlock(&journal->j_state_lock); + goto repeat; + } + + transaction = journal->j_running_transaction; + + if (!handle->h_reserved) { + /* We may have dropped j_state_lock - restart in that case */ + if (add_transaction_credits(journal, blocks, rsv_blocks)) + goto repeat; + } else { + /* + * We have handle reserved so we are allowed to join T_LOCKED + * transaction and we don't have to check for transaction size + * and journal space. + */ + sub_reserved_credits(journal, blocks); + handle->h_reserved = 0; + } + + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. + */ + update_t_max_wait(transaction, ts); + handle->h_transaction = transaction; + handle->h_requested_credits = blocks; + handle->h_start_jiffies = jiffies; + atomic_inc(&transaction->t_updates); + atomic_inc(&transaction->t_handle_count); + jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", + handle, blocks, + atomic_read(&transaction->t_outstanding_credits), + jbd2_log_space_left(journal)); + read_unlock(&journal->j_state_lock); + current->journal_info = handle; + + lock_map_acquire(&handle->h_lockdep_map); + jbd2_journal_free_transaction(new_transaction); + return 0; +} + +static struct lock_class_key jbd2_handle_key; + +/* Allocate a new handle. This should probably be in a slab... */ +static handle_t *new_handle(int nblocks) +{ + handle_t *handle = jbd2_alloc_handle(GFP_NOFS); + if (!handle) + return NULL; + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + + lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", + &jbd2_handle_key, 0); + + return handle; +} + +/** + * handle_t *jbd2_journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. Additionally, if rsv_blocks > 0, we also create another + * handle with rsv_blocks reserved blocks in the journal. This handle is + * is stored in h_rsv_handle. It is not attached to any particular transaction + * and thus doesn't block transaction commit. If the caller uses this reserved + * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() + * on the parent handle will dispose the reserved one. Reserved handle has to + * be converted to a normal handle using jbd2_journal_start_reserved() before + * it can be used. + * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, + gfp_t gfp_mask, unsigned int type, + unsigned int line_no) +{ + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; + } + + handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); + if (rsv_blocks) { + handle_t *rsv_handle; + + rsv_handle = new_handle(rsv_blocks); + if (!rsv_handle) { + jbd2_free_handle(handle); + return ERR_PTR(-ENOMEM); + } + rsv_handle->h_reserved = 1; + rsv_handle->h_journal = journal; + handle->h_rsv_handle = rsv_handle; + } + + err = start_this_handle(journal, handle, gfp_mask); + if (err < 0) { + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); + jbd2_free_handle(handle); + return ERR_PTR(err); + } + handle->h_type = type; + handle->h_line_no = line_no; + trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, type, + line_no, nblocks); + return handle; +} +EXPORT_SYMBOL(jbd2__journal_start); + + +handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +{ + return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); +} +EXPORT_SYMBOL(jbd2_journal_start); + +void jbd2_journal_free_reserved(handle_t *handle) +{ + journal_t *journal = handle->h_journal; + + WARN_ON(!handle->h_reserved); + sub_reserved_credits(journal, handle->h_buffer_credits); + jbd2_free_handle(handle); +} +EXPORT_SYMBOL(jbd2_journal_free_reserved); + +/** + * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle + * @handle: handle to start + * + * Start handle that has been previously reserved with jbd2_journal_reserve(). + * This attaches @handle to the running transaction (or creates one if there's + * not transaction running). Unlike jbd2_journal_start() this function cannot + * block on journal commit, checkpointing, or similar stuff. It can block on + * memory allocation or frozen journal though. + * + * Return 0 on success, non-zero on error - handle is freed in that case. + */ +int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, + unsigned int line_no) +{ + journal_t *journal = handle->h_journal; + int ret = -EIO; + + if (WARN_ON(!handle->h_reserved)) { + /* Someone passed in normal handle? Just stop it. */ + jbd2_journal_stop(handle); + return ret; + } + /* + * Usefulness of mixing of reserved and unreserved handles is + * questionable. So far nobody seems to need it so just error out. + */ + if (WARN_ON(current->journal_info)) { + jbd2_journal_free_reserved(handle); + return ret; + } + + handle->h_journal = NULL; + /* + * GFP_NOFS is here because callers are likely from writeback or + * similarly constrained call sites + */ + ret = start_this_handle(journal, handle, GFP_NOFS); + if (ret < 0) { + jbd2_journal_free_reserved(handle); + return ret; + } + handle->h_type = type; + handle->h_line_no = line_no; + return 0; +} +EXPORT_SYMBOL(jbd2_journal_start_reserved); + +/** + * int jbd2_journal_extend() - extend buffer credits. + * @handle: handle to 'extend' + * @nblocks: nr blocks to try to extend by. + * + * Some transactions, such as large extends and truncates, can be done + * atomically all at once or in several stages. The operation requests + * a credit for a number of buffer modications in advance, but can + * extend its credit if it needs more. + * + * jbd2_journal_extend tries to give the running handle more buffer credits. + * It does not guarantee that allocation - this is a best-effort only. + * The calling process MUST be able to deal cleanly with a failure to + * extend here. + * + * Return 0 on success, non-zero on failure. + * + * return code < 0 implies an error + * return code > 0 implies normal transaction-full status. + */ +int jbd2_journal_extend(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + int result; + int wanted; + + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + + result = 1; + + read_lock(&journal->j_state_lock); + + /* Don't extend a locked-down transaction! */ + if (transaction->t_state != T_RUNNING) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction not running\n", handle, nblocks); + goto error_out; + } + + spin_lock(&transaction->t_handle_lock); + wanted = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); + + if (wanted > journal->j_max_transaction_buffers) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction too large\n", handle, nblocks); + atomic_sub(nblocks, &transaction->t_outstanding_credits); + goto unlock; + } + + if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) > + jbd2_log_space_left(journal)) { + jbd_debug(3, "denied handle %p %d blocks: " + "insufficient log space\n", handle, nblocks); + atomic_sub(nblocks, &transaction->t_outstanding_credits); + goto unlock; + } + + trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, + transaction->t_tid, + handle->h_type, handle->h_line_no, + handle->h_buffer_credits, + nblocks); + + handle->h_buffer_credits += nblocks; + handle->h_requested_credits += nblocks; + result = 0; + + jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); +unlock: + spin_unlock(&transaction->t_handle_lock); +error_out: + read_unlock(&journal->j_state_lock); + return result; +} + + +/** + * int jbd2_journal_restart() - restart a handle . + * @handle: handle to restart + * @nblocks: nr credits requested + * + * Restart a handle for a multi-transaction filesystem + * operation. + * + * If the jbd2_journal_extend() call above fails to grant new buffer credits + * to a running handle, a call to jbd2_journal_restart will commit the + * handle's transaction so far and reattach the handle to a new + * transaction capabable of guaranteeing the requested number of + * credits. We preserve reserved handle if there's any attached to the + * passed in handle. + */ +int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + tid_t tid; + int need_to_start, ret; + + /* If we've had an abort of any type, don't even think about + * actually doing the restart! */ + if (is_handle_aborted(handle)) + return 0; + journal = transaction->t_journal; + + /* + * First unlink the handle from its current transaction, and start the + * commit on that. + */ + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + J_ASSERT(journal_current_handle() == handle); + + read_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + if (handle->h_rsv_handle) { + sub_reserved_credits(journal, + handle->h_rsv_handle->h_buffer_credits); + } + if (atomic_dec_and_test(&transaction->t_updates)) + wake_up(&journal->j_wait_updates); + tid = transaction->t_tid; + spin_unlock(&transaction->t_handle_lock); + handle->h_transaction = NULL; + current->journal_info = NULL; + + jbd_debug(2, "restarting handle %p\n", handle); + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + + lock_map_release(&handle->h_lockdep_map); + handle->h_buffer_credits = nblocks; + ret = start_this_handle(journal, handle, gfp_mask); + return ret; +} +EXPORT_SYMBOL(jbd2__journal_restart); + + +int jbd2_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2__journal_restart(handle, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_restart); + +/** + * void jbd2_journal_lock_updates () - establish a transaction barrier. + * @journal: Journal to establish a barrier on. + * + * This locks out any further updates from being started, and blocks + * until all existing updates have completed, returning only once the + * journal is in a quiescent state with no updates running. + * + * The journal lock should not be held on entry. + */ +void jbd2_journal_lock_updates(journal_t *journal) +{ + DEFINE_WAIT(wait); + + write_lock(&journal->j_state_lock); + ++journal->j_barrier_count; + + /* Wait until there are no reserved handles */ + if (atomic_read(&journal->j_reserved_credits)) { + write_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) == 0); + write_lock(&journal->j_state_lock); + } + + /* Wait until there are no running updates */ + while (1) { + transaction_t *transaction = journal->j_running_transaction; + + if (!transaction) + break; + + spin_lock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (!atomic_read(&transaction->t_updates)) { + spin_unlock(&transaction->t_handle_lock); + finish_wait(&journal->j_wait_updates, &wait); + break; + } + spin_unlock(&transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_updates, &wait); + write_lock(&journal->j_state_lock); + } + write_unlock(&journal->j_state_lock); + + /* + * We have now established a barrier against other normal updates, but + * we also need to barrier against other jbd2_journal_lock_updates() calls + * to make sure that we serialise special journal-locked operations + * too. + */ + mutex_lock(&journal->j_barrier); +} + +/** + * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier + * @journal: Journal to release the barrier on. + * + * Release a transaction barrier obtained with jbd2_journal_lock_updates(). + * + * Should be called without the journal lock held. + */ +void jbd2_journal_unlock_updates (journal_t *journal) +{ + J_ASSERT(journal->j_barrier_count != 0); + + mutex_unlock(&journal->j_barrier); + write_lock(&journal->j_state_lock); + --journal->j_barrier_count; + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_transaction_locked); +} + +static void warn_dirty_buffer(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); +} + +/* + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). + * + */ +static int +do_get_write_access(handle_t *handle, struct journal_head *jh, + int force_copy) +{ + struct buffer_head *bh; + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + unsigned long start_lock, time_lock; + + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); +repeat: + bh = jh2bh(jh); + + /* @@@ Need to check for errors here at some point. */ + + start_lock = jiffies; + lock_buffer(bh); + jbd_lock_bh_state(bh); + + /* If it takes too long to lock the buffer, trace it */ + time_lock = jbd2_time_diff(start_lock, jiffies); + if (time_lock > HZ/10) + trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, + jiffies_to_msecs(time_lock)); + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ + + if (buffer_dirty(bh)) { + /* + * First question: is this buffer already part of the current + * transaction or the existing committing transaction? + */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, + jh->b_transaction == transaction || + jh->b_transaction == + journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == + transaction); + warn_dirty_buffer(bh); + } + /* + * In any case we need to clean the dirty flag and we must + * do it under the buffer lock to be sure we don't race + * with running write-out. + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); + } + + unlock_buffer(bh); + + error = -EROFS; + if (is_handle_aborted(handle)) { + jbd_unlock_bh_state(bh); + goto out; + } + error = 0; + + /* + * The buffer is already part of this transaction if b_transaction or + * b_next_transaction points to it + */ + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done; + + /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* + * If there is already a copy-out version of this buffer, then we don't + * need to make another one + */ + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + goto done; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. + * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. */ + + if (buffer_shadow(bh)) { + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + wait_on_bit_io(&bh->b_state, BH_Shadow, + TASK_UNINTERRUPTIBLE); + goto repeat; + } + + /* + * Only do the copy if the currently-owning transaction still + * needs it. If buffer isn't on BJ_Metadata list, the + * committing transaction is past that stage (here we use the + * fact that BH_Shadow is set under bh_state lock together with + * refiling to BJ_Shadow list and at this point we know the + * buffer doesn't have BH_Shadow set). + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. + */ + if (jh->b_jlist == BJ_Metadata || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + jbd_unlock_bh_state(bh); + frozen_buffer = + jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS); + if (!frozen_buffer) { + printk(KERN_ERR + "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + jbd_lock_bh_state(bh); + goto done; + } + goto repeat; + } + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + + /* + * Finally, if the buffer is not journaled right now, we need to make + * sure it doesn't get written to disk before the caller actually + * commits the new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + } + +done: + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), + "Possible IO failure.\n"); + page = jh2bh(jh)->b_page; + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, + jh->b_triggers); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap_atomic(source); + + /* + * Now that the frozen data is saved off, we need to store + * any matching triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; + } + jbd_unlock_bh_state(bh); + + /* + * If we are about to journal a buffer, then any revoke pending on it is + * no longer valid + */ + jbd2_journal_cancel_revoke(handle, jh); + +out: + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd2_free(frozen_buffer, bh->b_size); + + JBUFFER_TRACE(jh, "exit"); + return error; +} + +/** + * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * @handle: transaction to add buffer modifications to + * @bh: bh to be used for metadata writes + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + +int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. Make sure that the buffer + * completes any outstanding IO before proceeding. */ + rc = do_get_write_access(handle, jh, 0); + jbd2_journal_put_journal_head(jh); + return rc; +} + + +/* + * When the user wants to journal a newly created buffer_head + * (ie. getblk() returned a new buffer and we are going to populate it + * manually rather than reading off disk), then we need to keep the + * buffer_head locked until it has been completely filled with new + * data. In this case, we should be able to make the assertion that + * the bh is not already part of an existing transaction. + * + * The buffer should already be locked by the caller by this point. + * There is no lock ranking violation: it was a newly created, + * unlocked buffer beforehand. */ + +/** + * int jbd2_journal_get_create_access () - notify intent to use newly created bh + * @handle: transaction to new buffer to + * @bh: new buffer. + * + * Call this if you create a new bh. + */ +int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + int err; + + jbd_debug(5, "journal_head %p\n", jh); + err = -EROFS; + if (is_handle_aborted(handle)) + goto out; + journal = transaction->t_journal; + err = 0; + + JBUFFER_TRACE(jh, "entry"); + /* + * The buffer may already belong to this transaction due to pre-zeroing + * in the filesystem's new_block code. It may also be on the previous, + * committing transaction's lists, but it HAS to be in Forget state in + * that case: the transaction must have deleted the buffer for it to be + * reused here. + */ + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, (jh->b_transaction == transaction || + jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && + jh->b_jlist == BJ_Forget))); + + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + + if (jh->b_transaction == NULL) { + /* + * Previous jbd2_journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "set next transaction"); + spin_lock(&journal->j_list_lock); + jh->b_next_transaction = transaction; + } + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + + /* + * akpm: I added this. ext3_alloc_branch can pick up new indirect + * blocks which contain freed but then revoked metadata. We need + * to cancel the revoke in case we end up freeing it yet again + * and the reallocating as data - this would cause a second revoke, + * which hits an assertion error. + */ + JBUFFER_TRACE(jh, "cancelling revoke"); + jbd2_journal_cancel_revoke(handle, jh); +out: + jbd2_journal_put_journal_head(jh); + return err; +} + +/** + * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with + * non-rewindable consequences + * @handle: transaction + * @bh: buffer to undo + * + * Sometimes there is a need to distinguish between metadata which has + * been committed to disk and that which has not. The ext3fs code uses + * this for freeing and allocating space, we have to make sure that we + * do not reuse freed space until the deallocation has been committed, + * since if we overwrote that space we would make the delete + * un-rewindable in case of a crash. + * + * To deal with that, jbd2_journal_get_undo_access requests write access to a + * buffer for parts of non-rewindable operations such as delete + * operations on the bitmaps. The journaling code must keep a copy of + * the buffer's contents prior to the undo_access call until such time + * as we know that the buffer has definitely been committed to disk. + * + * We never need to know which transaction the committed data is part + * of, buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ +int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) +{ + int err; + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + char *committed_data = NULL; + + JBUFFER_TRACE(jh, "entry"); + + /* + * Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ + err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +repeat: + if (!jh->b_committed_data) { + committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); + if (!committed_data) { + printk(KERN_ERR "%s: No memory for committed data\n", + __func__); + err = -ENOMEM; + goto out; + } + } + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. */ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { + jbd_unlock_bh_state(bh); + goto repeat; + } + + jh->b_committed_data = committed_data; + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } + jbd_unlock_bh_state(bh); +out: + jbd2_journal_put_journal_head(jh); + if (unlikely(committed_data)) + jbd2_free(committed_data, bh->b_size); + return err; +} + +/** + * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * @bh: buffer to trigger on + * @type: struct jbd2_buffer_trigger_type containing the trigger(s). + * + * Set any triggers on this journal_head. This is always safe, because + * triggers for a committing buffer will be saved off, and triggers for + * a running transaction will match the buffer in that transaction. + * + * Call with NULL to clear the triggers. + */ +void jbd2_journal_set_triggers(struct buffer_head *bh, + struct jbd2_buffer_trigger_type *type) +{ + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + + if (WARN_ON(!jh)) + return; + jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); +} + +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, + struct jbd2_buffer_trigger_type *triggers) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!triggers || !triggers->t_frozen) + return; + + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); +} + +void jbd2_buffer_abort_trigger(struct journal_head *jh, + struct jbd2_buffer_trigger_type *triggers) +{ + if (!triggers || !triggers->t_abort) + return; + + triggers->t_abort(triggers, jh2bh(jh)); +} + + + +/** + * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * mark dirty metadata which needs to be journaled as part of the current + * transaction. + * + * The buffer must have previously had jbd2_journal_get_write_access() + * called so that it has a valid journal_head attached to the buffer + * head. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Returns error number or 0 on success. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + */ +int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + struct journal_head *jh; + int ret = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { + ret = -EUCLEAN; + goto out; + } + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + + jbd_lock_bh_state(bh); + + if (jh->b_modified == 0) { + /* + * This buffer's got modified and becoming part + * of the transaction. This needs to be done + * once a transaction -bzzz + */ + jh->b_modified = 1; + if (handle->h_buffer_credits <= 0) { + ret = -ENOSPC; + goto out_unlock_bh; + } + handle->h_buffer_credits--; + } + + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. + * Nobody can take it off again because there is a handle open. + * I _think_ we're OK here with SMP barriers - a mistaken decision will + * result in this test being false, so we go in and take the locks. + */ + if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { + JBUFFER_TRACE(jh, "fastpath"); + if (unlikely(jh->b_transaction != + journal->j_running_transaction)) { + printk(KERN_ERR "JBD2: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_running_transaction (%p, %u)\n", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_running_transaction, + journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + ret = -EINVAL; + } + goto out_unlock_bh; + } + + set_buffer_jbddirty(bh); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + if (unlikely(((jh->b_transaction != + journal->j_committing_transaction)) || + (jh->b_next_transaction != transaction))) { + printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " + "bad jh for block %llu: " + "transaction (%p, %u), " + "jh->b_transaction (%p, %u), " + "jh->b_next_transaction (%p, %u), jlist %u\n", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + transaction, transaction->t_tid, + jh->b_transaction, + jh->b_transaction ? + jh->b_transaction->t_tid : 0, + jh->b_next_transaction, + jh->b_next_transaction ? + jh->b_next_transaction->t_tid : 0, + jh->b_jlist); + WARN_ON(1); + ret = -EINVAL; + } + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. */ + goto out_unlock_bh; + } + + /* That test should have eliminated the following case: */ + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + JBUFFER_TRACE(jh, "file as BJ_Metadata"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); + spin_unlock(&journal->j_list_lock); +out_unlock_bh: + jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); +out: + JBUFFER_TRACE(jh, "exit"); + return ret; +} + +/** + * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. + * @handle: transaction handle + * @bh: bh to 'forget' + * + * We can only do the bforget if there are no commits pending against the + * buffer. If the buffer is dirty in the current running transaction we + * can safely unlink it. + * + * bh may not be a journalled buffer at all - it may be a non-JBD + * buffer which came off the hashtable. Check for this. + * + * Decrements bh->b_count by one. + * + * Allow this call even if the handle has aborted --- it may be part of + * the caller's cleanup after an abort. + */ +int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + struct journal_head *jh; + int drop_reserve = 0; + int err = 0; + int was_modified = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + + BUFFER_TRACE(bh, "entry"); + + jbd_lock_bh_state(bh); + + if (!buffer_jbd(bh)) + goto not_jbd; + jh = bh2jh(bh); + + /* Critical error: attempting to delete a bitmap buffer, maybe? + * Don't do any jbd operations, and return an error. */ + if (!J_EXPECT_JH(jh, !jh->b_committed_data, + "inconsistent data on disk")) { + err = -EIO; + goto not_jbd; + } + + /* keep track of whether or not this transaction modified us */ + was_modified = jh->b_modified; + + /* + * The buffer's going from the transaction, we must drop + * all references -bzzz + */ + jh->b_modified = 0; + + if (jh->b_transaction == transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + + /* If we are forgetting a buffer which is already part + * of this transaction, then we can just drop it from + * the transaction immediately. */ + clear_buffer_dirty(bh); + clear_buffer_jbddirty(bh); + + JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); + + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; + + /* + * We are no longer going to journal this buffer. + * However, the commit of this transaction is still + * important to the buffer: the delete that we are now + * processing might obsolete an old log entry, so by + * committing, we can satisfy the buffer's checkpoint. + * + * So, if we have a checkpoint on the buffer, we should + * now refile the buffer on our BJ_Forget list so that + * we know to remove the checkpoint after we commit. + */ + + spin_lock(&journal->j_list_lock); + if (jh->b_cp_transaction) { + __jbd2_journal_temp_unlink_buffer(jh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + } else { + __jbd2_journal_unfile_buffer(jh); + if (!buffer_jbd(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; + } + } + spin_unlock(&journal->j_list_lock); + } else if (jh->b_transaction) { + J_ASSERT_JH(jh, (jh->b_transaction == + journal->j_committing_transaction)); + /* However, if the buffer is still owned by a prior + * (committing) transaction, we can't drop it yet... */ + JBUFFER_TRACE(jh, "belongs to older transaction"); + /* ... but we CAN drop it from the new transaction if we + * have also modified it since the original commit. */ + + if (jh->b_next_transaction) { + J_ASSERT(jh->b_next_transaction == transaction); + spin_lock(&journal->j_list_lock); + jh->b_next_transaction = NULL; + spin_unlock(&journal->j_list_lock); + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; + } + } + +not_jbd: + jbd_unlock_bh_state(bh); + __brelse(bh); +drop: + if (drop_reserve) { + /* no need to reserve log space for this block -bzzz */ + handle->h_buffer_credits++; + } + return err; +} + +/** + * int jbd2_journal_stop() - complete a transaction + * @handle: tranaction to complete. + * + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining + * buffer credits to the transaction and remove the handle. The only + * complication is that we need to start a commit operation if the + * filesystem is marked for synchronous update. + * + * jbd2_journal_stop itself will not usually return an error, but it may + * do so in unusual circumstances. In particular, expect it to + * return -EIO if a jbd2_journal_abort has been executed since the + * transaction began. + */ +int jbd2_journal_stop(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + int err = 0, wait_for_commit = 0; + tid_t tid; + pid_t pid; + + if (!transaction) { + /* + * Handle is already detached from the transaction so + * there is nothing to do other than decrease a refcount, + * or free the handle if refcount drops to zero + */ + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } else { + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); + goto free_and_exit; + } + } + journal = transaction->t_journal; + + J_ASSERT(journal_current_handle() == handle); + + if (is_handle_aborted(handle)) + err = -EIO; + else + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } + + jbd_debug(4, "Handle %p going down\n", handle); + trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, + transaction->t_tid, + handle->h_type, handle->h_line_no, + jiffies - handle->h_start_jiffies, + handle->h_sync, handle->h_requested_credits, + (handle->h_requested_credits - + handle->h_buffer_credits)); + + /* + * Implement synchronous transaction batching. If the handle + * was synchronous, don't force a commit immediately. Let's + * yield and let another thread piggyback onto this + * transaction. Keep doing that while new threads continue to + * arrive. It doesn't cost much - we're about to run a commit + * and sleep on IO anyway. Speeds up many-threaded, many-dir + * operations by 30x or more... + * + * We try and optimize the sleep time against what the + * underlying disk can do, instead of having a static sleep + * time. This is useful for the case where our storage is so + * fast that it is more optimal to go ahead and force a flush + * and wait for the transaction to be committed than it is to + * wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how + * long it takes to commit a transaction, and compare it with + * how long this transaction has been running, and if run time + * < commit time then we sleep for the delta and commit. This + * greatly helps super fast disks that would see slowdowns as + * more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one + * to perform a synchronous write. We do this to detect the + * case where a single process is doing a stream of sync + * writes. No point in waiting for joiners in that case. + * + * Setting max_batch_time to 0 disables this completely. + */ + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid && + journal->j_max_batch_time) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + read_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + read_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); + commit_time = min_t(u64, commit_time, + 1000*journal->j_max_batch_time); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + } + + if (handle->h_sync) + transaction->t_synchronous_commit = 1; + current->journal_info = NULL; + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the + * transaction is too old now. + */ + if (handle->h_sync || + (atomic_read(&transaction->t_outstanding_credits) > + journal->j_max_transaction_buffers) || + time_after_eq(jiffies, transaction->t_expires)) { + /* Do this even for aborted journals: an abort still + * completes the commit thread, it just doesn't write + * anything to disk. */ + + jbd_debug(2, "transaction too old, requesting commit for " + "handle %p\n", handle); + /* This is non-blocking */ + jbd2_log_start_commit(journal, transaction->t_tid); + + /* + * Special case: JBD2_SYNC synchronous updates require us + * to wait for the commit to complete. + */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) + wait_for_commit = 1; + } + + /* + * Once we drop t_updates, if it goes to zero the transaction + * could start committing on us and eventually disappear. So + * once we do this, we must not dereference transaction + * pointer again. + */ + tid = transaction->t_tid; + if (atomic_dec_and_test(&transaction->t_updates)) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + + lock_map_release(&handle->h_lockdep_map); + + if (handle->h_rsv_handle) + jbd2_journal_free_reserved(handle->h_rsv_handle); +free_and_exit: + jbd2_free_handle(handle); + return err; +} + +/* + * + * List management code snippets: various functions for manipulating the + * transaction buffer lists. + * + */ + +/* + * Append a buffer to a transaction list, given the transaction's list head + * pointer. + * + * j_list_lock is held. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_add_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (!*list) { + jh->b_tnext = jh->b_tprev = jh; + *list = jh; + } else { + /* Insert at the tail of the list to preserve order */ + struct journal_head *first = *list, *last = first->b_tprev; + jh->b_tprev = last; + jh->b_tnext = first; + last->b_tnext = first->b_tprev = jh; + } +} + +/* + * Remove a buffer from a transaction list, given the transaction's list + * head pointer. + * + * Called with j_list_lock held, and the journal may not be locked. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_del_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (*list == jh) { + *list = jh->b_tnext; + if (*list == jh) + *list = NULL; + } + jh->b_tprev->b_tnext = jh->b_tnext; + jh->b_tnext->b_tprev = jh->b_tprev; +} + +/* + * Remove a buffer from the appropriate transaction list. + * + * Note that this function can *change* the value of + * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or + * t_reserved_list. If the caller is holding onto a copy of one of these + * pointers, it could go bad. Generally the caller needs to re-read the + * pointer from the transaction_t. + * + * Called under j_list_lock. + */ +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) +{ + struct journal_head **list = NULL; + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + if (jh->b_jlist != BJ_None) + J_ASSERT_JH(jh, transaction != NULL); + + switch (jh->b_jlist) { + case BJ_None: + return; + case BJ_Metadata: + transaction->t_nr_buffers--; + J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_del_buffer(list, jh); + jh->b_jlist = BJ_None; + if (test_clear_buffer_jbddirty(bh)) + mark_buffer_dirty(bh); /* Expose it to the VM */ +} + +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ +static void __jbd2_journal_unfile_buffer(struct journal_head *jh) +{ + __jbd2_journal_temp_unlink_buffer(jh); + jh->b_transaction = NULL; + jbd2_journal_put_journal_head(jh); +} + +void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + __jbd2_journal_unfile_buffer(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +} + +/* + * Called from jbd2_journal_try_to_free_buffers(). + * + * Called under jbd_lock_bh_state(bh) + */ +static void +__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) +{ + struct journal_head *jh; + + jh = bh2jh(bh); + + if (buffer_locked(bh) || buffer_dirty(bh)) + goto out; + + if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) + goto out; + + spin_lock(&journal->j_list_lock); + if (jh->b_cp_transaction != NULL) { + /* written-back checkpointed metadata buffer */ + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __jbd2_journal_remove_checkpoint(jh); + } + spin_unlock(&journal->j_list_lock); +out: + return; +} + +/** + * int jbd2_journal_try_to_free_buffers() - try to free page buffers. + * @journal: journal for operation + * @page: to try and free + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. + * + * + * For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer. + * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a jbd2_journal_dirty_data on this + * buffer. So we need to lock against that. jbd2_journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success + */ +int jbd2_journal_try_to_free_buffers(journal_t *journal, + struct page *page, gfp_t gfp_mask) +{ + struct buffer_head *head; + struct buffer_head *bh; + int ret = 0; + + J_ASSERT(PageLocked(page)); + + head = page_buffers(page); + bh = head; + do { + struct journal_head *jh; + + /* + * We take our own ref against the journal_head here to avoid + * having to add tons of locking around each instance of + * jbd2_journal_put_journal_head(). + */ + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) + continue; + + jbd_lock_bh_state(bh); + __journal_try_to_free_buffer(journal, bh); + jbd2_journal_put_journal_head(jh); + jbd_unlock_bh_state(bh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); + + ret = try_to_free_buffers(page); + +busy: + return ret; +} + +/* + * This buffer is no longer needed. If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + * + * Called under j_list_lock. + * + * Called under jbd_lock_bh_state(bh). + */ +static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) +{ + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + __jbd2_journal_temp_unlink_buffer(jh); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + __jbd2_journal_unfile_buffer(jh); + } + return may_free; +} + +/* + * jbd2_journal_invalidatepage + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling invalidatepage on the + * data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! + * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. --sct + */ + +/* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. + */ +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) +{ + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + + BUFFER_TRACE(bh, "entry"); + + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are + * holding the page lock. --sct + */ + + if (!buffer_jbd(bh)) + goto zap_buffer_unlocked; + + /* OK, we have data buffer in journaled mode */ + write_lock(&journal->j_state_lock); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) + goto zap_buffer_no_jh; + + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * block can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. + */ + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + may_free = __dispose_buffer(jh, + journal->j_running_transaction); + goto zap_buffer; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. */ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); + may_free = __dispose_buffer(jh, + journal->j_committing_transaction); + goto zap_buffer; + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ + clear_buffer_jbddirty(bh); + goto zap_buffer; + } + } + } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); + /* + * The buffer is committing, we simply cannot touch + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + return -EBUSY; + } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. + */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + return 0; + } else { + /* Good, the buffer belongs to the running transaction. + * We are writing our own transaction's data, not any + * previous one's, so it is safe to throw it away + * (remember that we expect the filesystem to have set + * i_size already for this truncate so recovery will not + * expose the disk blocks we are discarding here.) */ + J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); + may_free = __dispose_buffer(jh, transaction); + } + +zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. + */ + jh->b_modified = 0; + jbd2_journal_put_journal_head(jh); +zap_buffer_no_jh: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); +zap_buffer_unlocked: + clear_buffer_dirty(bh); + J_ASSERT_BH(bh, !buffer_jbddirty(bh)); + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + bh->b_bdev = NULL; + return may_free; +} + +/** + * void jbd2_journal_invalidatepage() + * @journal: journal to use for flush... + * @page: page to flush + * @offset: start of the range to invalidate + * @length: length of the range to invalidate + * + * Reap page buffers containing data after in the specified range in page. + * Can return -EBUSY if buffers are part of the committing transaction and + * the page is straddling i_size. Caller then has to wait for current commit + * and try again. + */ +int jbd2_journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned int offset, + unsigned int length) +{ + struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; + unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); + int may_free = 1; + int ret = 0; + + if (!PageLocked(page)) + BUG(); + if (!page_has_buffers(page)) + return 0; + + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + + /* We will potentially be playing with lists other than just the + * data lists (especially for journaled data mode), so be + * cautious in our locking. */ + + head = bh = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (next_off > stop) + return 0; + + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); + ret = journal_unmap_buffer(journal, bh, partial_page); + unlock_buffer(bh); + if (ret < 0) + return ret; + may_free &= ret; + } + curr_off = next_off; + bh = next; + + } while (bh != head); + + if (!partial_page) { + if (may_free && try_to_free_buffers(page)) + J_ASSERT(!page_has_buffers(page)); + } + return 0; +} + +/* + * File a buffer on the given transaction list. + */ +void __jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + struct journal_head **list = NULL; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == NULL); + + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) + __jbd2_journal_temp_unlink_buffer(jh); + else + jbd2_journal_grab_journal_head(bh); + jh->b_transaction = transaction; + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +void jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&transaction->t_journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + * + * Called under j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns + */ +void __jbd2_journal_refile_buffer(struct journal_head *jh) +{ + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __jbd2_journal_unfile_buffer(jh); + return; + } + + /* + * It has been modified by a later transaction: add it to the new + * transaction's metadata list. + */ + + was_dirty = test_clear_buffer_jbddirty(bh); + __jbd2_journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __jbd2_journal_file_buffer() must not + * take a new one. + */ + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +/* + * __jbd2_journal_refile_buffer() with necessary locking added. We take our + * bh reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. + */ +void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_list_lock); + __brelse(bh); +} + +/* + * File inode in the inode list of the handle's transaction + */ +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal; + + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + /* + * First check whether inode isn't already on the transaction's + * lists without taking the lock. Note that this check is safe + * without the lock as we cannot race with somebody removing inode + * from the transaction. The reason is that we remove inode from the + * transaction only in journal_release_jbd_inode() and when we commit + * the transaction. We are guarded from the first case by holding + * a reference to the inode. We are safe against the second case + * because if jinode->i_transaction == transaction, commit code + * cannot touch the transaction because we hold reference to it, + * and if jinode->i_next_transaction == transaction, commit code + * will only file the inode where we want it. + */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + return 0; + + spin_lock(&journal->j_list_lock); + + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* + * We only ever set this variable to 1 so the test is safe. Since + * t_need_data_flush is likely to be set, we do the test to save some + * cacheline bouncing + */ + if (!transaction->t_need_data_flush) + transaction->t_need_data_flush = 1; + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... */ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +/* + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). + */ +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, + loff_t new_size) +{ + transaction_t *inode_trans, *commit_trans; + int ret = 0; + + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) + goto out; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ + read_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + read_unlock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} diff --git a/kernel/fs/jffs2/Kconfig b/kernel/fs/jffs2/Kconfig new file mode 100644 index 000000000..d8bb6c411 --- /dev/null +++ b/kernel/fs/jffs2/Kconfig @@ -0,0 +1,188 @@ +config JFFS2_FS + tristate "Journalling Flash File System v2 (JFFS2) support" + select CRC32 + depends on MTD + help + JFFS2 is the second generation of the Journalling Flash File System + for use on diskless embedded devices. It provides improved wear + levelling, compression and support for hard links. You cannot use + this on normal block devices, only on 'MTD' devices. + + Further information on the design and implementation of JFFS2 is + available at . + +config JFFS2_FS_DEBUG + int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" + depends on JFFS2_FS + default "0" + help + This controls the amount of debugging messages produced by the JFFS2 + code. Set it to zero for use in production systems. For evaluation, + testing and debugging, it's advisable to set it to one. This will + enable a few assertions and will print debugging messages at the + KERN_DEBUG loglevel, where they won't normally be visible. Level 2 + is unlikely to be useful - it enables extra debugging in certain + areas which at one point needed debugging, but when the bugs were + located and fixed, the detailed messages were relegated to level 2. + + If reporting bugs, please try to have available a full dump of the + messages at debug level 1 while the misbehaviour was occurring. + +config JFFS2_FS_WRITEBUFFER + bool "JFFS2 write-buffering support" + depends on JFFS2_FS + default y + help + This enables the write-buffering support in JFFS2. + + This functionality is required to support JFFS2 on the following + types of flash devices: + - NAND flash + - NOR flash with transparent ECC + - DataFlash + +config JFFS2_FS_WBUF_VERIFY + bool "Verify JFFS2 write-buffer reads" + depends on JFFS2_FS_WRITEBUFFER + default n + help + This causes JFFS2 to read back every page written through the + write-buffer, and check for errors. + +config JFFS2_SUMMARY + bool "JFFS2 summary support" + depends on JFFS2_FS + default n + help + This feature makes it possible to use summary information + for faster filesystem mount. + + The summary information can be inserted into a filesystem image + by the utility 'sumtool'. + + If unsure, say 'N'. + +config JFFS2_FS_XATTR + bool "JFFS2 XATTR support" + depends on JFFS2_FS + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config JFFS2_FS_POSIX_ACL + bool "JFFS2 POSIX Access Control Lists" + depends on JFFS2_FS_XATTR + default y + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config JFFS2_FS_SECURITY + bool "JFFS2 Security Labels" + depends on JFFS2_FS_XATTR + default y + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jffs2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFFS2_COMPRESSION_OPTIONS + bool "Advanced compression options for JFFS2" + depends on JFFS2_FS + default n + help + Enabling this option allows you to explicitly choose which + compression modules, if any, are enabled in JFFS2. Removing + compressors can mean you cannot read existing file systems, + and enabling experimental compressors can mean that you + write a file system which cannot be read by a standard kernel. + + If unsure, you should _definitely_ say 'N'. + +config JFFS2_ZLIB + bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS + select ZLIB_INFLATE + select ZLIB_DEFLATE + depends on JFFS2_FS + default y + help + Zlib is designed to be a free, general-purpose, legally unencumbered, + lossless data-compression library for use on virtually any computer + hardware and operating system. See for + further information. + + Say 'Y' if unsure. + +config JFFS2_LZO + bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS + select LZO_COMPRESS + select LZO_DECOMPRESS + depends on JFFS2_FS + default n + help + minilzo-based compression. Generally works better than Zlib. + + This feature was added in July, 2007. Say 'N' if you need + compatibility with older bootloaders or kernels. + +config JFFS2_RTIME + bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default y + help + Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. + +config JFFS2_RUBIN + bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default n + help + RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. + +choice + prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS + default JFFS2_CMODE_PRIORITY + depends on JFFS2_FS + help + You can set here the default compression mode of JFFS2 from + the available compression modes. Don't touch if unsure. + +config JFFS2_CMODE_NONE + bool "no compression" + help + Uses no compression. + +config JFFS2_CMODE_PRIORITY + bool "priority" + help + Tries the compressors in a predefined order and chooses the first + successful one. + +config JFFS2_CMODE_SIZE + bool "size" + help + Tries all compressors and chooses the one which has the smallest + result. + +config JFFS2_CMODE_FAVOURLZO + bool "Favour LZO" + help + Tries all compressors and chooses the one which has the smallest + result but gives some preference to LZO (which has faster + decompression) at the expense of size. + +endchoice diff --git a/kernel/fs/jffs2/LICENCE b/kernel/fs/jffs2/LICENCE new file mode 100644 index 000000000..562885908 --- /dev/null +++ b/kernel/fs/jffs2/LICENCE @@ -0,0 +1,30 @@ +The files in this directory and elsewhere which refer to this LICENCE +file are part of JFFS2, the Journalling Flash File System v2. + + Copyright © 2001-2007 Red Hat, Inc. and others + +JFFS2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2 or (at your option) any later +version. + +JFFS2 is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with JFFS2; if not, write to the Free Software Foundation, Inc., +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + +As a special exception, if other files instantiate templates or use +macros or inline functions from these files, or you compile these +files and link them with other works to produce a work based on these +files, these files do not by themselves cause the resulting work to be +covered by the GNU General Public License. However the source code for +these files must still be made available in accordance with section (3) +of the GNU General Public License. + +This exception does not invalidate any other reasons why a work based on +this file might be covered by the GNU General Public License. + diff --git a/kernel/fs/jffs2/Makefile b/kernel/fs/jffs2/Makefile new file mode 100644 index 000000000..60e5d49ca --- /dev/null +++ b/kernel/fs/jffs2/Makefile @@ -0,0 +1,21 @@ +# +# Makefile for the Linux Journalling Flash File System v2 (JFFS2) +# +# + +obj-$(CONFIG_JFFS2_FS) += jffs2.o + +jffs2-y := compr.o dir.o file.o ioctl.o nodelist.o malloc.o +jffs2-y += read.o nodemgmt.o readinode.o write.o scan.o gc.o +jffs2-y += symlink.o build.o erase.o background.o fs.o writev.o +jffs2-y += super.o debug.o + +jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER) += wbuf.o +jffs2-$(CONFIG_JFFS2_FS_XATTR) += xattr.o xattr_trusted.o xattr_user.o +jffs2-$(CONFIG_JFFS2_FS_SECURITY) += security.o +jffs2-$(CONFIG_JFFS2_FS_POSIX_ACL) += acl.o +jffs2-$(CONFIG_JFFS2_RUBIN) += compr_rubin.o +jffs2-$(CONFIG_JFFS2_RTIME) += compr_rtime.o +jffs2-$(CONFIG_JFFS2_ZLIB) += compr_zlib.o +jffs2-$(CONFIG_JFFS2_LZO) += compr_lzo.o +jffs2-$(CONFIG_JFFS2_SUMMARY) += summary.o diff --git a/kernel/fs/jffs2/README.Locking b/kernel/fs/jffs2/README.Locking new file mode 100644 index 000000000..3ea365541 --- /dev/null +++ b/kernel/fs/jffs2/README.Locking @@ -0,0 +1,172 @@ + + JFFS2 LOCKING DOCUMENTATION + --------------------------- + +At least theoretically, JFFS2 does not require the Big Kernel Lock +(BKL), which was always helpfully obtained for it by Linux 2.4 VFS +code. It has its own locking, as described below. + +This document attempts to describe the existing locking rules for +JFFS2. It is not expected to remain perfectly up to date, but ought to +be fairly close. + + + alloc_sem + --------- + +The alloc_sem is a per-filesystem mutex, used primarily to ensure +contiguous allocation of space on the medium. It is automatically +obtained during space allocations (jffs2_reserve_space()) and freed +upon write completion (jffs2_complete_reservation()). Note that +the garbage collector will obtain this right at the beginning of +jffs2_garbage_collect_pass() and release it at the end, thereby +preventing any other write activity on the file system during a +garbage collect pass. + +When writing new nodes, the alloc_sem must be held until the new nodes +have been properly linked into the data structures for the inode to +which they belong. This is for the benefit of NAND flash - adding new +nodes to an inode may obsolete old ones, and by holding the alloc_sem +until this happens we ensure that any data in the write-buffer at the +time this happens are part of the new node, not just something that +was written afterwards. Hence, we can ensure the newly-obsoleted nodes +don't actually get erased until the write-buffer has been flushed to +the medium. + +With the introduction of NAND flash support and the write-buffer, +the alloc_sem is also used to protect the wbuf-related members of the +jffs2_sb_info structure. Atomically reading the wbuf_len member to see +if the wbuf is currently holding any data is permitted, though. + +Ordering constraints: See f->sem. + + + File Mutex f->sem + --------------------- + +This is the JFFS2-internal equivalent of the inode mutex i->i_sem. +It protects the contents of the jffs2_inode_info private inode data, +including the linked list of node fragments (but see the notes below on +erase_completion_lock), etc. + +The reason that the i_sem itself isn't used for this purpose is to +avoid deadlocks with garbage collection -- the VFS will lock the i_sem +before calling a function which may need to allocate space. The +allocation may trigger garbage-collection, which may need to move a +node belonging to the inode which was locked in the first place by the +VFS. If the garbage collection code were to attempt to lock the i_sem +of the inode from which it's garbage-collecting a physical node, this +lead to deadlock, unless we played games with unlocking the i_sem +before calling the space allocation functions. + +Instead of playing such games, we just have an extra internal +mutex, which is obtained by the garbage collection code and also +by the normal file system code _after_ allocation of space. + +Ordering constraints: + + 1. Never attempt to allocate space or lock alloc_sem with + any f->sem held. + 2. Never attempt to lock two file mutexes in one thread. + No ordering rules have been made for doing so. + + + erase_completion_lock spinlock + ------------------------------ + +This is used to serialise access to the eraseblock lists, to the +per-eraseblock lists of physical jffs2_raw_node_ref structures, and +(NB) the per-inode list of physical nodes. The latter is a special +case - see below. + +As the MTD API no longer permits erase-completion callback functions +to be called from bottom-half (timer) context (on the basis that nobody +ever actually implemented such a thing), it's now sufficient to use +a simple spin_lock() rather than spin_lock_bh(). + +Note that the per-inode list of physical nodes (f->nodes) is a special +case. Any changes to _valid_ nodes (i.e. ->flash_offset & 1 == 0) in +the list are protected by the file mutex f->sem. But the erase code +may remove _obsolete_ nodes from the list while holding only the +erase_completion_lock. So you can walk the list only while holding the +erase_completion_lock, and can drop the lock temporarily mid-walk as +long as the pointer you're holding is to a _valid_ node, not an +obsolete one. + +The erase_completion_lock is also used to protect the c->gc_task +pointer when the garbage collection thread exits. The code to kill the +GC thread locks it, sends the signal, then unlocks it - while the GC +thread itself locks it, zeroes c->gc_task, then unlocks on the exit path. + + + inocache_lock spinlock + ---------------------- + +This spinlock protects the hashed list (c->inocache_list) of the +in-core jffs2_inode_cache objects (each inode in JFFS2 has the +correspondent jffs2_inode_cache object). So, the inocache_lock +has to be locked while walking the c->inocache_list hash buckets. + +This spinlock also covers allocation of new inode numbers, which is +currently just '++->highest_ino++', but might one day get more complicated +if we need to deal with wrapping after 4 milliard inode numbers are used. + +Note, the f->sem guarantees that the correspondent jffs2_inode_cache +will not be removed. So, it is allowed to access it without locking +the inocache_lock spinlock. + +Ordering constraints: + + If both erase_completion_lock and inocache_lock are needed, the + c->erase_completion has to be acquired first. + + + erase_free_sem + -------------- + +This mutex is only used by the erase code which frees obsolete node +references and the jffs2_garbage_collect_deletion_dirent() function. +The latter function on NAND flash must read _obsolete_ nodes to +determine whether the 'deletion dirent' under consideration can be +discarded or whether it is still required to show that an inode has +been unlinked. Because reading from the flash may sleep, the +erase_completion_lock cannot be held, so an alternative, more +heavyweight lock was required to prevent the erase code from freeing +the jffs2_raw_node_ref structures in question while the garbage +collection code is looking at them. + +Suggestions for alternative solutions to this problem would be welcomed. + + + wbuf_sem + -------- + +This read/write semaphore protects against concurrent access to the +write-behind buffer ('wbuf') used for flash chips where we must write +in blocks. It protects both the contents of the wbuf and the metadata +which indicates which flash region (if any) is currently covered by +the buffer. + +Ordering constraints: + Lock wbuf_sem last, after the alloc_sem or and f->sem. + + + c->xattr_sem + ------------ + +This read/write semaphore protects against concurrent access to the +xattr related objects which include stuff in superblock and ic->xref. +In read-only path, write-semaphore is too much exclusion. It's enough +by read-semaphore. But you must hold write-semaphore when updating, +creating or deleting any xattr related object. + +Once xattr_sem released, there would be no assurance for the existence +of those objects. Thus, a series of processes is often required to retry, +when updating such a object is necessary under holding read semaphore. +For example, do_jffs2_getxattr() holds read-semaphore to scan xref and +xdatum at first. But it retries this process with holding write-semaphore +after release read-semaphore, if it's necessary to load name/value pair +from medium. + +Ordering constraints: + Lock xattr_sem last, after the alloc_sem. diff --git a/kernel/fs/jffs2/TODO b/kernel/fs/jffs2/TODO new file mode 100644 index 000000000..ca28964ab --- /dev/null +++ b/kernel/fs/jffs2/TODO @@ -0,0 +1,37 @@ + + - support asynchronous operation -- add a per-fs 'reserved_space' count, + let each outstanding write reserve the _maximum_ amount of physical + space it could take. Let GC flush the outstanding writes because the + reservations will necessarily be pessimistic. With this we could even + do shared writable mmap, if we can have a fs hook for do_wp_page() to + make the reservation. + - disable compression in commit_write()? + - fine-tune the allocation / GC thresholds + - chattr support - turning on/off and tuning compression per-inode + - checkpointing (do we need this? scan is quite fast) + - make the scan code populate real inodes so read_inode just after + mount doesn't have to read the flash twice for large files. + Make this a per-inode option, changeable with chattr, so you can + decide which inodes should be in-core immediately after mount. + - test, test, test + + - NAND flash support: + - almost done :) + - use bad block check instead of the hardwired byte check + + - Optimisations: + - Split writes so they go to two separate blocks rather than just c->nextblock. + By writing _new_ nodes to one block, and garbage-collected REF_PRISTINE + nodes to a different one, we can separate clean nodes from those which + are likely to become dirty, and end up with blocks which are each far + closer to 100% or 0% clean, hence speeding up later GC progress dramatically. + - Stop keeping name in-core with struct jffs2_full_dirent. If we keep the hash in + the full dirent, we only need to go to the flash in lookup() when we think we've + got a match, and in readdir(). + - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately? + - Remove size from jffs2_raw_node_frag. + +dedekind: +1. __jffs2_flush_wbuf() has a strange 'pad' parameter. Eliminate. +2. get_sb()->build_fs()->scan() path... Why get_sb() removes scan()'s crap in + case of failure? scan() does not clean everything. Fix. diff --git a/kernel/fs/jffs2/acl.c b/kernel/fs/jffs2/acl.c new file mode 100644 index 000000000..2f7a3c090 --- /dev/null +++ b/kernel/fs/jffs2/acl.c @@ -0,0 +1,309 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static size_t jffs2_acl_size(int count) +{ + if (count <= 4) { + return sizeof(struct jffs2_acl_header) + + count * sizeof(struct jffs2_acl_entry_short); + } else { + return sizeof(struct jffs2_acl_header) + + 4 * sizeof(struct jffs2_acl_entry_short) + + (count - 4) * sizeof(struct jffs2_acl_entry); + } +} + +static int jffs2_acl_count(size_t size) +{ + size_t s; + + size -= sizeof(struct jffs2_acl_header); + if (size < 4 * sizeof(struct jffs2_acl_entry_short)) { + if (size % sizeof(struct jffs2_acl_entry_short)) + return -1; + return size / sizeof(struct jffs2_acl_entry_short); + } else { + s = size - 4 * sizeof(struct jffs2_acl_entry_short); + if (s % sizeof(struct jffs2_acl_entry)) + return -1; + return s / sizeof(struct jffs2_acl_entry) + 4; + } +} + +static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size) +{ + void *end = value + size; + struct jffs2_acl_header *header = value; + struct jffs2_acl_entry *entry; + struct posix_acl *acl; + uint32_t ver; + int i, count; + + if (!value) + return NULL; + if (size < sizeof(struct jffs2_acl_header)) + return ERR_PTR(-EINVAL); + ver = je32_to_cpu(header->a_version); + if (ver != JFFS2_ACL_VERSION) { + JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver); + return ERR_PTR(-EINVAL); + } + + value += sizeof(struct jffs2_acl_header); + count = jffs2_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i=0; i < count; i++) { + entry = value; + if (value + sizeof(struct jffs2_acl_entry_short) > end) + goto fail; + acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag); + acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm); + switch (acl->a_entries[i].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value += sizeof(struct jffs2_acl_entry_short); + break; + + case ACL_USER: + value += sizeof(struct jffs2_acl_entry); + if (value > end) + goto fail; + acl->a_entries[i].e_uid = + make_kuid(&init_user_ns, + je32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value += sizeof(struct jffs2_acl_entry); + if (value > end) + goto fail; + acl->a_entries[i].e_gid = + make_kgid(&init_user_ns, + je32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) +{ + struct jffs2_acl_header *header; + struct jffs2_acl_entry *entry; + void *e; + size_t i; + + *size = jffs2_acl_size(acl->a_count); + header = kmalloc(sizeof(*header) + acl->a_count * sizeof(*entry), GFP_KERNEL); + if (!header) + return ERR_PTR(-ENOMEM); + header->a_version = cpu_to_je32(JFFS2_ACL_VERSION); + e = header + 1; + for (i=0; i < acl->a_count; i++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[i]; + entry = e; + entry->e_tag = cpu_to_je16(acl_e->e_tag); + entry->e_perm = cpu_to_je16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_je32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(struct jffs2_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_je32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(struct jffs2_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(struct jffs2_acl_entry_short); + break; + + default: + goto fail; + } + } + return header; + fail: + kfree(header); + return ERR_PTR(-EINVAL); +} + +struct posix_acl *jffs2_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *value = NULL; + int rc, xprefix; + + switch (type) { + case ACL_TYPE_ACCESS: + xprefix = JFFS2_XPREFIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xprefix = JFFS2_XPREFIX_ACL_DEFAULT; + break; + default: + BUG(); + } + rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0); + if (rc > 0) { + value = kmalloc(rc, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + rc = do_jffs2_getxattr(inode, xprefix, "", value, rc); + } + if (rc > 0) { + acl = jffs2_acl_from_medium(value, rc); + } else if (rc == -ENODATA || rc == -ENOSYS) { + acl = NULL; + } else { + acl = ERR_PTR(rc); + } + kfree(value); + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + return acl; +} + +static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *acl) +{ + char *value = NULL; + size_t size = 0; + int rc; + + if (acl) { + value = jffs2_acl_to_medium(acl, &size); + if (IS_ERR(value)) + return PTR_ERR(value); + } + rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0); + if (!value && rc == -ENODATA) + rc = 0; + kfree(value); + + return rc; +} + +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int rc, xprefix; + + switch (type) { + case ACL_TYPE_ACCESS: + xprefix = JFFS2_XPREFIX_ACL_ACCESS; + if (acl) { + umode_t mode = inode->i_mode; + rc = posix_acl_equiv_mode(acl, &mode); + if (rc < 0) + return rc; + if (inode->i_mode != mode) { + struct iattr attr; + + attr.ia_valid = ATTR_MODE | ATTR_CTIME; + attr.ia_mode = mode; + attr.ia_ctime = CURRENT_TIME_SEC; + rc = jffs2_do_setattr(inode, &attr); + if (rc < 0) + return rc; + } + if (rc == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + xprefix = JFFS2_XPREFIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + rc = __jffs2_set_acl(inode, xprefix, acl); + if (!rc) + set_cached_acl(inode, type, acl); + return rc; +} + +int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) +{ + struct posix_acl *default_acl, *acl; + int rc; + + cache_no_acl(inode); + + rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl); + if (rc) + return rc; + + if (default_acl) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } + if (acl) { + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); + } + return 0; +} + +int jffs2_init_acl_post(struct inode *inode) +{ + int rc; + + if (inode->i_default_acl) { + rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, inode->i_default_acl); + if (rc) + return rc; + } + + if (inode->i_acl) { + rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, inode->i_acl); + if (rc) + return rc; + } + + return 0; +} diff --git a/kernel/fs/jffs2/acl.h b/kernel/fs/jffs2/acl.h new file mode 100644 index 000000000..2e2b5745c --- /dev/null +++ b/kernel/fs/jffs2/acl.h @@ -0,0 +1,41 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +struct jffs2_acl_entry { + jint16_t e_tag; + jint16_t e_perm; + jint32_t e_id; +}; + +struct jffs2_acl_entry_short { + jint16_t e_tag; + jint16_t e_perm; +}; + +struct jffs2_acl_header { + jint32_t a_version; +}; + +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + +struct posix_acl *jffs2_get_acl(struct inode *inode, int type); +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); +extern int jffs2_init_acl_post(struct inode *); + +#else + +#define jffs2_get_acl (NULL) +#define jffs2_set_acl (NULL) +#define jffs2_init_acl_pre(dir_i,inode,mode) (0) +#define jffs2_init_acl_post(inode) (0) + +#endif /* CONFIG_JFFS2_FS_POSIX_ACL */ diff --git a/kernel/fs/jffs2/background.c b/kernel/fs/jffs2/background.c new file mode 100644 index 000000000..bb9cebc9c --- /dev/null +++ b/kernel/fs/jffs2/background.c @@ -0,0 +1,168 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + + +static int jffs2_garbage_collect_thread(void *); + +void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) +{ + assert_spin_locked(&c->erase_completion_lock); + if (c->gc_task && jffs2_thread_should_wake(c)) + send_sig(SIGHUP, c->gc_task, 1); +} + +/* This must only ever be called when no GC thread is currently running */ +int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) +{ + struct task_struct *tsk; + int ret = 0; + + BUG_ON(c->gc_task); + + init_completion(&c->gc_thread_start); + init_completion(&c->gc_thread_exit); + + tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index); + if (IS_ERR(tsk)) { + pr_warn("fork failed for JFFS2 garbage collect thread: %ld\n", + -PTR_ERR(tsk)); + complete(&c->gc_thread_exit); + ret = PTR_ERR(tsk); + } else { + /* Wait for it... */ + jffs2_dbg(1, "Garbage collect thread is pid %d\n", tsk->pid); + wait_for_completion(&c->gc_thread_start); + ret = tsk->pid; + } + + return ret; +} + +void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c) +{ + int wait = 0; + spin_lock(&c->erase_completion_lock); + if (c->gc_task) { + jffs2_dbg(1, "Killing GC task %d\n", c->gc_task->pid); + send_sig(SIGKILL, c->gc_task, 1); + wait = 1; + } + spin_unlock(&c->erase_completion_lock); + if (wait) + wait_for_completion(&c->gc_thread_exit); +} + +static int jffs2_garbage_collect_thread(void *_c) +{ + struct jffs2_sb_info *c = _c; + sigset_t hupmask; + + siginitset(&hupmask, sigmask(SIGHUP)); + allow_signal(SIGKILL); + allow_signal(SIGSTOP); + allow_signal(SIGCONT); + allow_signal(SIGHUP); + + c->gc_task = current; + complete(&c->gc_thread_start); + + set_user_nice(current, 10); + + set_freezable(); + for (;;) { + sigprocmask(SIG_UNBLOCK, &hupmask, NULL); + again: + spin_lock(&c->erase_completion_lock); + if (!jffs2_thread_should_wake(c)) { + set_current_state (TASK_INTERRUPTIBLE); + spin_unlock(&c->erase_completion_lock); + jffs2_dbg(1, "%s(): sleeping...\n", __func__); + schedule(); + } else { + spin_unlock(&c->erase_completion_lock); + } + /* Problem - immediately after bootup, the GCD spends a lot + * of time in places like jffs2_kill_fragtree(); so much so + * that userspace processes (like gdm and X) are starved + * despite plenty of cond_resched()s and renicing. Yield() + * doesn't help, either (presumably because userspace and GCD + * are generally competing for a higher latency resource - + * disk). + * This forces the GCD to slow the hell down. Pulling an + * inode in with read_inode() is much preferable to having + * the GC thread get there first. */ + schedule_timeout_interruptible(msecs_to_jiffies(50)); + + if (kthread_should_stop()) { + jffs2_dbg(1, "%s(): kthread_stop() called\n", __func__); + goto die; + } + + /* Put_super will send a SIGKILL and then wait on the sem. + */ + while (signal_pending(current) || freezing(current)) { + siginfo_t info; + unsigned long signr; + + if (try_to_freeze()) + goto again; + + signr = dequeue_signal_lock(current, ¤t->blocked, &info); + + switch(signr) { + case SIGSTOP: + jffs2_dbg(1, "%s(): SIGSTOP received\n", + __func__); + set_current_state(TASK_STOPPED); + schedule(); + break; + + case SIGKILL: + jffs2_dbg(1, "%s(): SIGKILL received\n", + __func__); + goto die; + + case SIGHUP: + jffs2_dbg(1, "%s(): SIGHUP received\n", + __func__); + break; + default: + jffs2_dbg(1, "%s(): signal %ld received\n", + __func__, signr); + } + } + /* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */ + sigprocmask(SIG_BLOCK, &hupmask, NULL); + + jffs2_dbg(1, "%s(): pass\n", __func__); + if (jffs2_garbage_collect_pass(c) == -ENOSPC) { + pr_notice("No space for garbage collection. Aborting GC thread\n"); + goto die; + } + } + die: + spin_lock(&c->erase_completion_lock); + c->gc_task = NULL; + spin_unlock(&c->erase_completion_lock); + complete_and_exit(&c->gc_thread_exit, 0); +} diff --git a/kernel/fs/jffs2/build.c b/kernel/fs/jffs2/build.c new file mode 100644 index 000000000..a3750f902 --- /dev/null +++ b/kernel/fs/jffs2/build.c @@ -0,0 +1,394 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, + struct jffs2_inode_cache *, struct jffs2_full_dirent **); + +static inline struct jffs2_inode_cache * +first_inode_chain(int *i, struct jffs2_sb_info *c) +{ + for (; *i < c->inocache_hashsize; (*i)++) { + if (c->inocache_list[*i]) + return c->inocache_list[*i]; + } + return NULL; +} + +static inline struct jffs2_inode_cache * +next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c) +{ + /* More in this chain? */ + if (ic->next) + return ic->next; + (*i)++; + return first_inode_chain(i, c); +} + +#define for_each_inode(i, c, ic) \ + for (i = 0, ic = first_inode_chain(&i, (c)); \ + ic; \ + ic = next_inode(&i, ic, (c))) + + +static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic) +{ + struct jffs2_full_dirent *fd; + + dbg_fsbuild("building directory inode #%u\n", ic->ino); + + /* For each child, increase nlink */ + for(fd = ic->scan_dents; fd; fd = fd->next) { + struct jffs2_inode_cache *child_ic; + if (!fd->ino) + continue; + + /* we can get high latency here with huge directories */ + + child_ic = jffs2_get_ino_cache(c, fd->ino); + if (!child_ic) { + dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n", + fd->name, fd->ino, ic->ino); + jffs2_mark_node_obsolete(c, fd->raw); + continue; + } + + if (fd->type == DT_DIR) { + if (child_ic->pino_nlink) { + JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n", + fd->name, fd->ino, ic->ino); + /* TODO: What do we do about it? */ + } else { + child_ic->pino_nlink = ic->ino; + } + } else + child_ic->pino_nlink++; + + dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino); + /* Can't free scan_dents so far. We might need them in pass 2 */ + } +} + +/* Scan plan: + - Scan physical nodes. Build map of inodes/dirents. Allocate inocaches as we go + - Scan directory tree from top down, setting nlink in inocaches + - Scan inocaches for inodes with nlink==0 +*/ +static int jffs2_build_filesystem(struct jffs2_sb_info *c) +{ + int ret; + int i; + struct jffs2_inode_cache *ic; + struct jffs2_full_dirent *fd; + struct jffs2_full_dirent *dead_fds = NULL; + + dbg_fsbuild("build FS data structures\n"); + + /* First, scan the medium and build all the inode caches with + lists of physical nodes */ + + c->flags |= JFFS2_SB_FLAG_SCANNING; + ret = jffs2_scan_medium(c); + c->flags &= ~JFFS2_SB_FLAG_SCANNING; + if (ret) + goto exit; + + dbg_fsbuild("scanned flash completely\n"); + jffs2_dbg_dump_block_lists_nolock(c); + + dbg_fsbuild("pass 1 starting\n"); + c->flags |= JFFS2_SB_FLAG_BUILDING; + /* Now scan the directory tree, increasing nlink according to every dirent found. */ + for_each_inode(i, c, ic) { + if (ic->scan_dents) { + jffs2_build_inode_pass1(c, ic); + cond_resched(); + } + } + + dbg_fsbuild("pass 1 complete\n"); + + /* Next, scan for inodes with nlink == 0 and remove them. If + they were directories, then decrement the nlink of their + children too, and repeat the scan. As that's going to be + a fairly uncommon occurrence, it's not so evil to do it this + way. Recursion bad. */ + dbg_fsbuild("pass 2 starting\n"); + + for_each_inode(i, c, ic) { + if (ic->pino_nlink) + continue; + + jffs2_build_remove_unlinked_inode(c, ic, &dead_fds); + cond_resched(); + } + + dbg_fsbuild("pass 2a starting\n"); + + while (dead_fds) { + fd = dead_fds; + dead_fds = fd->next; + + ic = jffs2_get_ino_cache(c, fd->ino); + + if (ic) + jffs2_build_remove_unlinked_inode(c, ic, &dead_fds); + jffs2_free_full_dirent(fd); + } + + dbg_fsbuild("pass 2a complete\n"); + dbg_fsbuild("freeing temporary data structures\n"); + + /* Finally, we can scan again and free the dirent structs */ + for_each_inode(i, c, ic) { + while(ic->scan_dents) { + fd = ic->scan_dents; + ic->scan_dents = fd->next; + jffs2_free_full_dirent(fd); + } + ic->scan_dents = NULL; + cond_resched(); + } + jffs2_build_xattr_subsystem(c); + c->flags &= ~JFFS2_SB_FLAG_BUILDING; + + dbg_fsbuild("FS build complete\n"); + + /* Rotate the lists by some number to ensure wear levelling */ + jffs2_rotate_lists(c); + + ret = 0; + +exit: + if (ret) { + for_each_inode(i, c, ic) { + while(ic->scan_dents) { + fd = ic->scan_dents; + ic->scan_dents = fd->next; + jffs2_free_full_dirent(fd); + } + } + jffs2_clear_xattr_subsystem(c); + } + + return ret; +} + +static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_full_dirent **dead_fds) +{ + struct jffs2_raw_node_ref *raw; + struct jffs2_full_dirent *fd; + + dbg_fsbuild("removing ino #%u with nlink == zero.\n", ic->ino); + + raw = ic->nodes; + while (raw != (void *)ic) { + struct jffs2_raw_node_ref *next = raw->next_in_ino; + dbg_fsbuild("obsoleting node at 0x%08x\n", ref_offset(raw)); + jffs2_mark_node_obsolete(c, raw); + raw = next; + } + + if (ic->scan_dents) { + int whinged = 0; + dbg_fsbuild("inode #%u was a directory which may have children...\n", ic->ino); + + while(ic->scan_dents) { + struct jffs2_inode_cache *child_ic; + + fd = ic->scan_dents; + ic->scan_dents = fd->next; + + if (!fd->ino) { + /* It's a deletion dirent. Ignore it */ + dbg_fsbuild("child \"%s\" is a deletion dirent, skipping...\n", fd->name); + jffs2_free_full_dirent(fd); + continue; + } + if (!whinged) + whinged = 1; + + dbg_fsbuild("removing child \"%s\", ino #%u\n", fd->name, fd->ino); + + child_ic = jffs2_get_ino_cache(c, fd->ino); + if (!child_ic) { + dbg_fsbuild("cannot remove child \"%s\", ino #%u, because it doesn't exist\n", + fd->name, fd->ino); + jffs2_free_full_dirent(fd); + continue; + } + + /* Reduce nlink of the child. If it's now zero, stick it on the + dead_fds list to be cleaned up later. Else just free the fd */ + + if (fd->type == DT_DIR) + child_ic->pino_nlink = 0; + else + child_ic->pino_nlink--; + + if (!child_ic->pino_nlink) { + dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n", + fd->ino, fd->name); + fd->next = *dead_fds; + *dead_fds = fd; + } else { + dbg_fsbuild("inode #%u (\"%s\") has now got nlink %d. Ignoring.\n", + fd->ino, fd->name, child_ic->pino_nlink); + jffs2_free_full_dirent(fd); + } + } + } + + /* + We don't delete the inocache from the hash list and free it yet. + The erase code will do that, when all the nodes are completely gone. + */ +} + +static void jffs2_calc_trigger_levels(struct jffs2_sb_info *c) +{ + uint32_t size; + + /* Deletion should almost _always_ be allowed. We're fairly + buggered once we stop allowing people to delete stuff + because there's not enough free space... */ + c->resv_blocks_deletion = 2; + + /* Be conservative about how much space we need before we allow writes. + On top of that which is required for deletia, require an extra 2% + of the medium to be available, for overhead caused by nodes being + split across blocks, etc. */ + + size = c->flash_size / 50; /* 2% of flash size */ + size += c->nr_blocks * 100; /* And 100 bytes per eraseblock */ + size += c->sector_size - 1; /* ... and round up */ + + c->resv_blocks_write = c->resv_blocks_deletion + (size / c->sector_size); + + /* When do we let the GC thread run in the background */ + + c->resv_blocks_gctrigger = c->resv_blocks_write + 1; + + /* When do we allow garbage collection to merge nodes to make + long-term progress at the expense of short-term space exhaustion? */ + c->resv_blocks_gcmerge = c->resv_blocks_deletion + 1; + + /* When do we allow garbage collection to eat from bad blocks rather + than actually making progress? */ + c->resv_blocks_gcbad = 0;//c->resv_blocks_deletion + 2; + + /* What number of 'very dirty' eraseblocks do we allow before we + trigger the GC thread even if we don't _need_ the space. When we + can't mark nodes obsolete on the medium, the old dirty nodes cause + performance problems because we have to inspect and discard them. */ + c->vdirty_blocks_gctrigger = c->resv_blocks_gctrigger; + if (jffs2_can_mark_obsolete(c)) + c->vdirty_blocks_gctrigger *= 10; + + /* If there's less than this amount of dirty space, don't bother + trying to GC to make more space. It'll be a fruitless task */ + c->nospc_dirty_size = c->sector_size + (c->flash_size / 100); + + dbg_fsbuild("trigger levels (size %d KiB, block size %d KiB, %d blocks)\n", + c->flash_size / 1024, c->sector_size / 1024, c->nr_blocks); + dbg_fsbuild("Blocks required to allow deletion: %d (%d KiB)\n", + c->resv_blocks_deletion, c->resv_blocks_deletion*c->sector_size/1024); + dbg_fsbuild("Blocks required to allow writes: %d (%d KiB)\n", + c->resv_blocks_write, c->resv_blocks_write*c->sector_size/1024); + dbg_fsbuild("Blocks required to quiesce GC thread: %d (%d KiB)\n", + c->resv_blocks_gctrigger, c->resv_blocks_gctrigger*c->sector_size/1024); + dbg_fsbuild("Blocks required to allow GC merges: %d (%d KiB)\n", + c->resv_blocks_gcmerge, c->resv_blocks_gcmerge*c->sector_size/1024); + dbg_fsbuild("Blocks required to GC bad blocks: %d (%d KiB)\n", + c->resv_blocks_gcbad, c->resv_blocks_gcbad*c->sector_size/1024); + dbg_fsbuild("Amount of dirty space required to GC: %d bytes\n", + c->nospc_dirty_size); + dbg_fsbuild("Very dirty blocks before GC triggered: %d\n", + c->vdirty_blocks_gctrigger); +} + +int jffs2_do_mount_fs(struct jffs2_sb_info *c) +{ + int ret; + int i; + int size; + + c->free_size = c->flash_size; + c->nr_blocks = c->flash_size / c->sector_size; + size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; +#ifndef __ECOS + if (jffs2_blocks_use_vmalloc(c)) + c->blocks = vzalloc(size); + else +#endif + c->blocks = kzalloc(size, GFP_KERNEL); + if (!c->blocks) + return -ENOMEM; + + for (i=0; inr_blocks; i++) { + INIT_LIST_HEAD(&c->blocks[i].list); + c->blocks[i].offset = i * c->sector_size; + c->blocks[i].free_size = c->sector_size; + } + + INIT_LIST_HEAD(&c->clean_list); + INIT_LIST_HEAD(&c->very_dirty_list); + INIT_LIST_HEAD(&c->dirty_list); + INIT_LIST_HEAD(&c->erasable_list); + INIT_LIST_HEAD(&c->erasing_list); + INIT_LIST_HEAD(&c->erase_checking_list); + INIT_LIST_HEAD(&c->erase_pending_list); + INIT_LIST_HEAD(&c->erasable_pending_wbuf_list); + INIT_LIST_HEAD(&c->erase_complete_list); + INIT_LIST_HEAD(&c->free_list); + INIT_LIST_HEAD(&c->bad_list); + INIT_LIST_HEAD(&c->bad_used_list); + c->highest_ino = 1; + c->summary = NULL; + + ret = jffs2_sum_init(c); + if (ret) + goto out_free; + + if (jffs2_build_filesystem(c)) { + dbg_fsbuild("build_fs failed\n"); + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + ret = -EIO; + goto out_free; + } + + jffs2_calc_trigger_levels(c); + + return 0; + + out_free: +#ifndef __ECOS + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else +#endif + kfree(c->blocks); + + return ret; +} diff --git a/kernel/fs/jffs2/compr.c b/kernel/fs/jffs2/compr.c new file mode 100644 index 000000000..4849a4c9a --- /dev/null +++ b/kernel/fs/jffs2/compr.c @@ -0,0 +1,418 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * Copyright © 2004 Ferenc Havasi , + * University of Szeged, Hungary + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "compr.h" + +static DEFINE_SPINLOCK(jffs2_compressor_list_lock); + +/* Available compressors are on this list */ +static LIST_HEAD(jffs2_compressor_list); + +/* Actual compression mode */ +static int jffs2_compression_mode = JFFS2_COMPR_MODE_PRIORITY; + +/* Statistics for blocks stored without compression */ +static uint32_t none_stat_compr_blocks=0,none_stat_decompr_blocks=0,none_stat_compr_size=0; + + +/* + * Return 1 to use this compression + */ +static int jffs2_is_best_compression(struct jffs2_compressor *this, + struct jffs2_compressor *best, uint32_t size, uint32_t bestsize) +{ + switch (jffs2_compression_mode) { + case JFFS2_COMPR_MODE_SIZE: + if (bestsize > size) + return 1; + return 0; + case JFFS2_COMPR_MODE_FAVOURLZO: + if ((this->compr == JFFS2_COMPR_LZO) && (bestsize > size)) + return 1; + if ((best->compr != JFFS2_COMPR_LZO) && (bestsize > size)) + return 1; + if ((this->compr == JFFS2_COMPR_LZO) && (bestsize > (size * FAVOUR_LZO_PERCENT / 100))) + return 1; + if ((bestsize * FAVOUR_LZO_PERCENT / 100) > size) + return 1; + + return 0; + } + /* Shouldn't happen */ + return 0; +} + +/* + * jffs2_selected_compress: + * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB). + * If 0, just take the first available compression mode. + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: the compression type used. Zero is used to show that the data + * could not be compressed; probably because we couldn't find the requested + * compression mode. + */ +static int jffs2_selected_compress(u8 compr, unsigned char *data_in, + unsigned char **cpage_out, u32 *datalen, u32 *cdatalen) +{ + struct jffs2_compressor *this; + int err, ret = JFFS2_COMPR_NONE; + uint32_t orig_slen, orig_dlen; + char *output_buf; + + output_buf = kmalloc(*cdatalen, GFP_KERNEL); + if (!output_buf) { + pr_warn("No memory for compressor allocation. Compression failed.\n"); + return ret; + } + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only and disabled modules */ + if (!this->compress || this->disabled) + continue; + + /* Skip if not the desired compression type */ + if (compr && (compr != this->compr)) + continue; + + /* + * Either compression type was unspecified, or we found our + * compressor; either way, we're good to go. + */ + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + + *datalen = orig_slen; + *cdatalen = orig_dlen; + err = this->compress(data_in, output_buf, datalen, cdatalen); + + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!err) { + /* Success */ + ret = this->compr; + this->stat_compr_blocks++; + this->stat_compr_orig_size += *datalen; + this->stat_compr_new_size += *cdatalen; + break; + } + } + spin_unlock(&jffs2_compressor_list_lock); + if (ret == JFFS2_COMPR_NONE) + kfree(output_buf); + else + *cpage_out = output_buf; + + return ret; +} + +/* jffs2_compress: + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: Lower byte to be stored with data indicating compression type used. + * Zero is used to show that the data could not be compressed - the + * compressed version was actually larger than the original. + * Upper byte will be used later. (soon) + * + * If the cdata buffer isn't large enough to hold all the uncompressed data, + * jffs2_compress should compress as much as will fit, and should set + * *datalen accordingly to show the amount of data which were compressed. + */ +uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *data_in, unsigned char **cpage_out, + uint32_t *datalen, uint32_t *cdatalen) +{ + int ret = JFFS2_COMPR_NONE; + int mode, compr_ret; + struct jffs2_compressor *this, *best=NULL; + unsigned char *output_buf = NULL, *tmp_buf; + uint32_t orig_slen, orig_dlen; + uint32_t best_slen=0, best_dlen=0; + + if (c->mount_opts.override_compr) + mode = c->mount_opts.compr; + else + mode = jffs2_compression_mode; + + switch (mode) { + case JFFS2_COMPR_MODE_NONE: + break; + case JFFS2_COMPR_MODE_PRIORITY: + ret = jffs2_selected_compress(0, data_in, cpage_out, datalen, + cdatalen); + break; + case JFFS2_COMPR_MODE_SIZE: + case JFFS2_COMPR_MODE_FAVOURLZO: + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only backwards-compatibility and disabled modules */ + if ((!this->compress)||(this->disabled)) + continue; + /* Allocating memory for output buffer if necessary */ + if ((this->compr_buf_size < orig_slen) && (this->compr_buf)) { + spin_unlock(&jffs2_compressor_list_lock); + kfree(this->compr_buf); + spin_lock(&jffs2_compressor_list_lock); + this->compr_buf_size=0; + this->compr_buf=NULL; + } + if (!this->compr_buf) { + spin_unlock(&jffs2_compressor_list_lock); + tmp_buf = kmalloc(orig_slen, GFP_KERNEL); + spin_lock(&jffs2_compressor_list_lock); + if (!tmp_buf) { + pr_warn("No memory for compressor allocation. (%d bytes)\n", + orig_slen); + continue; + } + else { + this->compr_buf = tmp_buf; + this->compr_buf_size = orig_slen; + } + } + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + *datalen = orig_slen; + *cdatalen = orig_dlen; + compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen); + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!compr_ret) { + if (((!best_dlen) || jffs2_is_best_compression(this, best, *cdatalen, best_dlen)) + && (*cdatalen < *datalen)) { + best_dlen = *cdatalen; + best_slen = *datalen; + best = this; + } + } + } + if (best_dlen) { + *cdatalen = best_dlen; + *datalen = best_slen; + output_buf = best->compr_buf; + best->compr_buf = NULL; + best->compr_buf_size = 0; + best->stat_compr_blocks++; + best->stat_compr_orig_size += best_slen; + best->stat_compr_new_size += best_dlen; + ret = best->compr; + *cpage_out = output_buf; + } + spin_unlock(&jffs2_compressor_list_lock); + break; + case JFFS2_COMPR_MODE_FORCELZO: + ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in, + cpage_out, datalen, cdatalen); + break; + case JFFS2_COMPR_MODE_FORCEZLIB: + ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in, + cpage_out, datalen, cdatalen); + break; + default: + pr_err("unknown compression mode\n"); + } + + if (ret == JFFS2_COMPR_NONE) { + *cpage_out = data_in; + *datalen = *cdatalen; + none_stat_compr_blocks++; + none_stat_compr_size += *datalen; + } + return ret; +} + +int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint16_t comprtype, unsigned char *cdata_in, + unsigned char *data_out, uint32_t cdatalen, uint32_t datalen) +{ + struct jffs2_compressor *this; + int ret; + + /* Older code had a bug where it would write non-zero 'usercompr' + fields. Deal with it. */ + if ((comprtype & 0xff) <= JFFS2_COMPR_ZLIB) + comprtype &= 0xff; + + switch (comprtype & 0xff) { + case JFFS2_COMPR_NONE: + /* This should be special-cased elsewhere, but we might as well deal with it */ + memcpy(data_out, cdata_in, datalen); + none_stat_decompr_blocks++; + break; + case JFFS2_COMPR_ZERO: + memset(data_out, 0, datalen); + break; + default: + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + if (comprtype == this->compr) { + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + ret = this->decompress(cdata_in, data_out, cdatalen, datalen); + spin_lock(&jffs2_compressor_list_lock); + if (ret) { + pr_warn("Decompressor \"%s\" returned %d\n", + this->name, ret); + } + else { + this->stat_decompr_blocks++; + } + this->usecount--; + spin_unlock(&jffs2_compressor_list_lock); + return ret; + } + } + pr_warn("compression type 0x%02x not available\n", comprtype); + spin_unlock(&jffs2_compressor_list_lock); + return -EIO; + } + return 0; +} + +int jffs2_register_compressor(struct jffs2_compressor *comp) +{ + struct jffs2_compressor *this; + + if (!comp->name) { + pr_warn("NULL compressor name at registering JFFS2 compressor. Failed.\n"); + return -1; + } + comp->compr_buf_size=0; + comp->compr_buf=NULL; + comp->usecount=0; + comp->stat_compr_orig_size=0; + comp->stat_compr_new_size=0; + comp->stat_compr_blocks=0; + comp->stat_decompr_blocks=0; + jffs2_dbg(1, "Registering JFFS2 compressor \"%s\"\n", comp->name); + + spin_lock(&jffs2_compressor_list_lock); + + list_for_each_entry(this, &jffs2_compressor_list, list) { + if (this->priority < comp->priority) { + list_add(&comp->list, this->list.prev); + goto out; + } + } + list_add_tail(&comp->list, &jffs2_compressor_list); +out: + D2(list_for_each_entry(this, &jffs2_compressor_list, list) { + printk(KERN_DEBUG "Compressor \"%s\", prio %d\n", this->name, this->priority); + }) + + spin_unlock(&jffs2_compressor_list_lock); + + return 0; +} + +int jffs2_unregister_compressor(struct jffs2_compressor *comp) +{ + D2(struct jffs2_compressor *this); + + jffs2_dbg(1, "Unregistering JFFS2 compressor \"%s\"\n", comp->name); + + spin_lock(&jffs2_compressor_list_lock); + + if (comp->usecount) { + spin_unlock(&jffs2_compressor_list_lock); + pr_warn("Compressor module is in use. Unregister failed.\n"); + return -1; + } + list_del(&comp->list); + + D2(list_for_each_entry(this, &jffs2_compressor_list, list) { + printk(KERN_DEBUG "Compressor \"%s\", prio %d\n", this->name, this->priority); + }) + spin_unlock(&jffs2_compressor_list_lock); + return 0; +} + +void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig) +{ + if (orig != comprbuf) + kfree(comprbuf); +} + +int __init jffs2_compressors_init(void) +{ +/* Registering compressors */ +#ifdef CONFIG_JFFS2_ZLIB + jffs2_zlib_init(); +#endif +#ifdef CONFIG_JFFS2_RTIME + jffs2_rtime_init(); +#endif +#ifdef CONFIG_JFFS2_RUBIN + jffs2_rubinmips_init(); + jffs2_dynrubin_init(); +#endif +#ifdef CONFIG_JFFS2_LZO + jffs2_lzo_init(); +#endif +/* Setting default compression mode */ +#ifdef CONFIG_JFFS2_CMODE_NONE + jffs2_compression_mode = JFFS2_COMPR_MODE_NONE; + jffs2_dbg(1, "default compression mode: none\n"); +#else +#ifdef CONFIG_JFFS2_CMODE_SIZE + jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE; + jffs2_dbg(1, "default compression mode: size\n"); +#else +#ifdef CONFIG_JFFS2_CMODE_FAVOURLZO + jffs2_compression_mode = JFFS2_COMPR_MODE_FAVOURLZO; + jffs2_dbg(1, "default compression mode: favourlzo\n"); +#else + jffs2_dbg(1, "default compression mode: priority\n"); +#endif +#endif +#endif + return 0; +} + +int jffs2_compressors_exit(void) +{ +/* Unregistering compressors */ +#ifdef CONFIG_JFFS2_LZO + jffs2_lzo_exit(); +#endif +#ifdef CONFIG_JFFS2_RUBIN + jffs2_dynrubin_exit(); + jffs2_rubinmips_exit(); +#endif +#ifdef CONFIG_JFFS2_RTIME + jffs2_rtime_exit(); +#endif +#ifdef CONFIG_JFFS2_ZLIB + jffs2_zlib_exit(); +#endif + return 0; +} diff --git a/kernel/fs/jffs2/compr.h b/kernel/fs/jffs2/compr.h new file mode 100644 index 000000000..5e91d578f --- /dev/null +++ b/kernel/fs/jffs2/compr.h @@ -0,0 +1,105 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * University of Szeged, Hungary + * Copyright © 2004-2010 David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef __JFFS2_COMPR_H__ +#define __JFFS2_COMPR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "jffs2_fs_i.h" +#include "jffs2_fs_sb.h" +#include "nodelist.h" + +#define JFFS2_RUBINMIPS_PRIORITY 10 +#define JFFS2_DYNRUBIN_PRIORITY 20 +#define JFFS2_LZARI_PRIORITY 30 +#define JFFS2_RTIME_PRIORITY 50 +#define JFFS2_ZLIB_PRIORITY 60 +#define JFFS2_LZO_PRIORITY 80 + + +#define JFFS2_RUBINMIPS_DISABLED /* RUBINs will be used only */ +#define JFFS2_DYNRUBIN_DISABLED /* for decompression */ + +#define JFFS2_COMPR_MODE_NONE 0 +#define JFFS2_COMPR_MODE_PRIORITY 1 +#define JFFS2_COMPR_MODE_SIZE 2 +#define JFFS2_COMPR_MODE_FAVOURLZO 3 +#define JFFS2_COMPR_MODE_FORCELZO 4 +#define JFFS2_COMPR_MODE_FORCEZLIB 5 + +#define FAVOUR_LZO_PERCENT 80 + +struct jffs2_compressor { + struct list_head list; + int priority; /* used by prirority comr. mode */ + char *name; + char compr; /* JFFS2_COMPR_XXX */ + int (*compress)(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *srclen, uint32_t *destlen); + int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, + uint32_t cdatalen, uint32_t datalen); + int usecount; + int disabled; /* if set the compressor won't compress */ + unsigned char *compr_buf; /* used by size compr. mode */ + uint32_t compr_buf_size; /* used by size compr. mode */ + uint32_t stat_compr_orig_size; + uint32_t stat_compr_new_size; + uint32_t stat_compr_blocks; + uint32_t stat_decompr_blocks; +}; + +int jffs2_register_compressor(struct jffs2_compressor *comp); +int jffs2_unregister_compressor(struct jffs2_compressor *comp); + +int jffs2_compressors_init(void); +int jffs2_compressors_exit(void); + +uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *data_in, unsigned char **cpage_out, + uint32_t *datalen, uint32_t *cdatalen); + +int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint16_t comprtype, unsigned char *cdata_in, + unsigned char *data_out, uint32_t cdatalen, uint32_t datalen); + +void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig); + +/* Compressor modules */ +/* These functions will be called by jffs2_compressors_init/exit */ + +#ifdef CONFIG_JFFS2_RUBIN +int jffs2_rubinmips_init(void); +void jffs2_rubinmips_exit(void); +int jffs2_dynrubin_init(void); +void jffs2_dynrubin_exit(void); +#endif +#ifdef CONFIG_JFFS2_RTIME +int jffs2_rtime_init(void); +void jffs2_rtime_exit(void); +#endif +#ifdef CONFIG_JFFS2_ZLIB +int jffs2_zlib_init(void); +void jffs2_zlib_exit(void); +#endif +#ifdef CONFIG_JFFS2_LZO +int jffs2_lzo_init(void); +void jffs2_lzo_exit(void); +#endif + +#endif /* __JFFS2_COMPR_H__ */ diff --git a/kernel/fs/jffs2/compr_lzo.c b/kernel/fs/jffs2/compr_lzo.c new file mode 100644 index 000000000..c553bd650 --- /dev/null +++ b/kernel/fs/jffs2/compr_lzo.c @@ -0,0 +1,110 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2007 Nokia Corporation. All rights reserved. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Richard Purdie + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "compr.h" + +static void *lzo_mem; +static void *lzo_compress_buf; +static DEFINE_MUTEX(deflate_mutex); /* for lzo_mem and lzo_compress_buf */ + +static void free_workspace(void) +{ + vfree(lzo_mem); + vfree(lzo_compress_buf); +} + +static int __init alloc_workspace(void) +{ + lzo_mem = vmalloc(LZO1X_MEM_COMPRESS); + lzo_compress_buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + + if (!lzo_mem || !lzo_compress_buf) { + free_workspace(); + return -ENOMEM; + } + + return 0; +} + +static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + size_t compress_size; + int ret; + + mutex_lock(&deflate_mutex); + ret = lzo1x_1_compress(data_in, *sourcelen, lzo_compress_buf, &compress_size, lzo_mem); + if (ret != LZO_E_OK) + goto fail; + + if (compress_size > *dstlen) + goto fail; + + memcpy(cpage_out, lzo_compress_buf, compress_size); + mutex_unlock(&deflate_mutex); + + *dstlen = compress_size; + return 0; + + fail: + mutex_unlock(&deflate_mutex); + return -1; +} + +static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + size_t dl = destlen; + int ret; + + ret = lzo1x_decompress_safe(data_in, srclen, cpage_out, &dl); + + if (ret != LZO_E_OK || dl != destlen) + return -1; + + return 0; +} + +static struct jffs2_compressor jffs2_lzo_comp = { + .priority = JFFS2_LZO_PRIORITY, + .name = "lzo", + .compr = JFFS2_COMPR_LZO, + .compress = &jffs2_lzo_compress, + .decompress = &jffs2_lzo_decompress, + .disabled = 0, +}; + +int __init jffs2_lzo_init(void) +{ + int ret; + + ret = alloc_workspace(); + if (ret < 0) + return ret; + + ret = jffs2_register_compressor(&jffs2_lzo_comp); + if (ret) + free_workspace(); + + return ret; +} + +void jffs2_lzo_exit(void) +{ + jffs2_unregister_compressor(&jffs2_lzo_comp); + free_workspace(); +} diff --git a/kernel/fs/jffs2/compr_rtime.c b/kernel/fs/jffs2/compr_rtime.c new file mode 100644 index 000000000..406d9cc84 --- /dev/null +++ b/kernel/fs/jffs2/compr_rtime.c @@ -0,0 +1,130 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. + * + * + * + * Very simple lz77-ish encoder. + * + * Theory of operation: Both encoder and decoder have a list of "last + * occurrences" for every possible source-value; after sending the + * first source-byte, the second byte indicated the "run" length of + * matches + * + * The algorithm is intended to only send "whole bytes", no bit-messing. + * + */ + +#include +#include +#include +#include +#include +#include "compr.h" + +/* _compress returns the compressed size, -1 if bigger */ +static int jffs2_rtime_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + unsigned short positions[256]; + int outpos = 0; + int pos=0; + + memset(positions,0,sizeof(positions)); + + while (pos < (*sourcelen) && outpos <= (*dstlen)-2) { + int backpos, runlen=0; + unsigned char value; + + value = data_in[pos]; + + cpage_out[outpos++] = data_in[pos++]; + + backpos = positions[value]; + positions[value]=pos; + + while ((backpos < pos) && (pos < (*sourcelen)) && + (data_in[pos]==data_in[backpos++]) && (runlen<255)) { + pos++; + runlen++; + } + cpage_out[outpos++] = runlen; + } + + if (outpos >= pos) { + /* We failed */ + return -1; + } + + /* Tell the caller how much we managed to compress, and how much space it took */ + *sourcelen = pos; + *dstlen = outpos; + return 0; +} + + +static int jffs2_rtime_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + unsigned short positions[256]; + int outpos = 0; + int pos=0; + + memset(positions,0,sizeof(positions)); + + while (outpos= outpos) { + while(repeat) { + cpage_out[outpos++] = cpage_out[backoffs++]; + repeat--; + } + } else { + memcpy(&cpage_out[outpos],&cpage_out[backoffs],repeat); + outpos+=repeat; + } + } + } + return 0; +} + +static struct jffs2_compressor jffs2_rtime_comp = { + .priority = JFFS2_RTIME_PRIORITY, + .name = "rtime", + .compr = JFFS2_COMPR_RTIME, + .compress = &jffs2_rtime_compress, + .decompress = &jffs2_rtime_decompress, +#ifdef JFFS2_RTIME_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_rtime_init(void) +{ + return jffs2_register_compressor(&jffs2_rtime_comp); +} + +void jffs2_rtime_exit(void) +{ + jffs2_unregister_compressor(&jffs2_rtime_comp); +} diff --git a/kernel/fs/jffs2/compr_rubin.c b/kernel/fs/jffs2/compr_rubin.c new file mode 100644 index 000000000..556de100e --- /dev/null +++ b/kernel/fs/jffs2/compr_rubin.c @@ -0,0 +1,452 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "compr.h" + + +#define RUBIN_REG_SIZE 16 +#define UPPER_BIT_RUBIN (((long) 1)<<(RUBIN_REG_SIZE-1)) +#define LOWER_BITS_RUBIN ((((long) 1)<<(RUBIN_REG_SIZE-1))-1) + + +#define BIT_DIVIDER_MIPS 1043 +static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241}; + +struct pushpull { + unsigned char *buf; + unsigned int buflen; + unsigned int ofs; + unsigned int reserve; +}; + +struct rubin_state { + unsigned long p; + unsigned long q; + unsigned long rec_q; + long bit_number; + struct pushpull pp; + int bit_divider; + int bits[8]; +}; + +static inline void init_pushpull(struct pushpull *pp, char *buf, + unsigned buflen, unsigned ofs, + unsigned reserve) +{ + pp->buf = buf; + pp->buflen = buflen; + pp->ofs = ofs; + pp->reserve = reserve; +} + +static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) +{ + if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) + return -ENOSPC; + + if (bit) + pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7))); + else + pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7))); + + pp->ofs++; + + return 0; +} + +static inline int pushedbits(struct pushpull *pp) +{ + return pp->ofs; +} + +static inline int pullbit(struct pushpull *pp) +{ + int bit; + + bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1; + + pp->ofs++; + return bit; +} + + +static void init_rubin(struct rubin_state *rs, int div, int *bits) +{ + int c; + + rs->q = 0; + rs->p = (long) (2 * UPPER_BIT_RUBIN); + rs->bit_number = (long) 0; + rs->bit_divider = div; + + for (c=0; c<8; c++) + rs->bits[c] = bits[c]; +} + + +static int encode(struct rubin_state *rs, long A, long B, int symbol) +{ + + long i0, i1; + int ret; + + while ((rs->q >= UPPER_BIT_RUBIN) || + ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { + rs->bit_number++; + + ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); + if (ret) + return ret; + rs->q &= LOWER_BITS_RUBIN; + rs->q <<= 1; + rs->p <<= 1; + } + i0 = A * rs->p / (A + B); + if (i0 <= 0) + i0 = 1; + + if (i0 >= rs->p) + i0 = rs->p - 1; + + i1 = rs->p - i0; + + if (symbol == 0) + rs->p = i0; + else { + rs->p = i1; + rs->q += i0; + } + return 0; +} + + +static void end_rubin(struct rubin_state *rs) +{ + + int i; + + for (i = 0; i < RUBIN_REG_SIZE; i++) { + pushbit(&rs->pp, (UPPER_BIT_RUBIN & rs->q) ? 1 : 0, 1); + rs->q &= LOWER_BITS_RUBIN; + rs->q <<= 1; + } +} + + +static void init_decode(struct rubin_state *rs, int div, int *bits) +{ + init_rubin(rs, div, bits); + + /* behalve lower */ + rs->rec_q = 0; + + for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; + rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) + ; +} + +static void __do_decode(struct rubin_state *rs, unsigned long p, + unsigned long q) +{ + register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; + unsigned long rec_q; + int c, bits = 0; + + /* + * First, work out how many bits we need from the input stream. + * Note that we have already done the initial check on this + * loop prior to calling this function. + */ + do { + bits++; + q &= lower_bits_rubin; + q <<= 1; + p <<= 1; + } while ((q >= UPPER_BIT_RUBIN) || ((p + q) <= UPPER_BIT_RUBIN)); + + rs->p = p; + rs->q = q; + + rs->bit_number += bits; + + /* + * Now get the bits. We really want this to be "get n bits". + */ + rec_q = rs->rec_q; + do { + c = pullbit(&rs->pp); + rec_q &= lower_bits_rubin; + rec_q <<= 1; + rec_q += c; + } while (--bits); + rs->rec_q = rec_q; +} + +static int decode(struct rubin_state *rs, long A, long B) +{ + unsigned long p = rs->p, q = rs->q; + long i0, threshold; + int symbol; + + if (q >= UPPER_BIT_RUBIN || ((p + q) <= UPPER_BIT_RUBIN)) + __do_decode(rs, p, q); + + i0 = A * rs->p / (A + B); + if (i0 <= 0) + i0 = 1; + + if (i0 >= rs->p) + i0 = rs->p - 1; + + threshold = rs->q + i0; + symbol = rs->rec_q >= threshold; + if (rs->rec_q >= threshold) { + rs->q += i0; + i0 = rs->p - i0; + } + + rs->p = i0; + + return symbol; +} + + + +static int out_byte(struct rubin_state *rs, unsigned char byte) +{ + int i, ret; + struct rubin_state rs_copy; + rs_copy = *rs; + + for (i=0; i<8; i++) { + ret = encode(rs, rs->bit_divider-rs->bits[i], + rs->bits[i], byte & 1); + if (ret) { + /* Failed. Restore old state */ + *rs = rs_copy; + return ret; + } + byte >>= 1 ; + } + return 0; +} + +static int in_byte(struct rubin_state *rs) +{ + int i, result = 0, bit_divider = rs->bit_divider; + + for (i = 0; i < 8; i++) + result |= decode(rs, bit_divider - rs->bits[i], + rs->bits[i]) << i; + + return result; +} + + + +static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, + unsigned char *cpage_out, uint32_t *sourcelen, + uint32_t *dstlen) + { + int outpos = 0; + int pos=0; + struct rubin_state rs; + + init_pushpull(&rs.pp, cpage_out, *dstlen * 8, 0, 32); + + init_rubin(&rs, bit_divider, bits); + + while (pos < (*sourcelen) && !out_byte(&rs, data_in[pos])) + pos++; + + end_rubin(&rs); + + if (outpos > pos) { + /* We failed */ + return -1; + } + + /* Tell the caller how much we managed to compress, + * and how much space it took */ + + outpos = (pushedbits(&rs.pp)+7)/8; + + if (outpos >= pos) + return -1; /* We didn't actually compress */ + *sourcelen = pos; + *dstlen = outpos; + return 0; +} +#if 0 +/* _compress returns the compressed size, -1 if bigger */ +int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); +} +#endif +static int jffs2_dynrubin_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + int bits[8]; + unsigned char histo[256]; + int i; + int ret; + uint32_t mysrclen, mydstlen; + + mysrclen = *sourcelen; + mydstlen = *dstlen - 8; + + if (*dstlen <= 12) + return -1; + + memset(histo, 0, 256); + for (i=0; i 255) bits[i] = 255; + cpage_out[i] = bits[i]; + } + + ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, + &mydstlen); + if (ret) + return ret; + + /* Add back the 8 bytes we took for the probabilities */ + mydstlen += 8; + + if (mysrclen <= mydstlen) { + /* We compressed */ + return -1; + } + + *sourcelen = mysrclen; + *dstlen = mydstlen; + return 0; +} + +static void rubin_do_decompress(int bit_divider, int *bits, + unsigned char *cdata_in, + unsigned char *page_out, uint32_t srclen, + uint32_t destlen) +{ + int outpos = 0; + struct rubin_state rs; + + init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); + init_decode(&rs, bit_divider, bits); + + while (outpos < destlen) + page_out[outpos++] = in_byte(&rs); +} + + +static int jffs2_rubinmips_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t sourcelen, uint32_t dstlen) +{ + rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); + return 0; +} + +static int jffs2_dynrubin_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t sourcelen, uint32_t dstlen) +{ + int bits[8]; + int c; + + for (c=0; c<8; c++) + bits[c] = data_in[c]; + + rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, + dstlen); + return 0; +} + +static struct jffs2_compressor jffs2_rubinmips_comp = { + .priority = JFFS2_RUBINMIPS_PRIORITY, + .name = "rubinmips", + .compr = JFFS2_COMPR_DYNRUBIN, + .compress = NULL, /*&jffs2_rubinmips_compress,*/ + .decompress = &jffs2_rubinmips_decompress, +#ifdef JFFS2_RUBINMIPS_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_rubinmips_init(void) +{ + return jffs2_register_compressor(&jffs2_rubinmips_comp); +} + +void jffs2_rubinmips_exit(void) +{ + jffs2_unregister_compressor(&jffs2_rubinmips_comp); +} + +static struct jffs2_compressor jffs2_dynrubin_comp = { + .priority = JFFS2_DYNRUBIN_PRIORITY, + .name = "dynrubin", + .compr = JFFS2_COMPR_RUBINMIPS, + .compress = jffs2_dynrubin_compress, + .decompress = &jffs2_dynrubin_decompress, +#ifdef JFFS2_DYNRUBIN_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_dynrubin_init(void) +{ + return jffs2_register_compressor(&jffs2_dynrubin_comp); +} + +void jffs2_dynrubin_exit(void) +{ + jffs2_unregister_compressor(&jffs2_dynrubin_comp); +} diff --git a/kernel/fs/jffs2/compr_zlib.c b/kernel/fs/jffs2/compr_zlib.c new file mode 100644 index 000000000..5698dae5d --- /dev/null +++ b/kernel/fs/jffs2/compr_zlib.c @@ -0,0 +1,222 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#if !defined(__KERNEL__) && !defined(__ECOS) +#error "The userspace support got too messy and was removed. Update your mkfs.jffs2" +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + + /* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? + */ +#define STREAM_END_SPACE 12 + +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); +static z_stream inf_strm, def_strm; + +#ifdef __KERNEL__ /* Linux-only */ +#include +#include +#include + +static int __init alloc_workspaces(void) +{ + def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS, + MAX_MEM_LEVEL)); + if (!def_strm.workspace) + return -ENOMEM; + + jffs2_dbg(1, "Allocated %d bytes for deflate workspace\n", + zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)); + inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!inf_strm.workspace) { + vfree(def_strm.workspace); + return -ENOMEM; + } + jffs2_dbg(1, "Allocated %d bytes for inflate workspace\n", + zlib_inflate_workspacesize()); + return 0; +} + +static void free_workspaces(void) +{ + vfree(def_strm.workspace); + vfree(inf_strm.workspace); +} +#else +#define alloc_workspaces() (0) +#define free_workspaces() do { } while(0) +#endif /* __KERNEL__ */ + +static int jffs2_zlib_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + int ret; + + if (*dstlen <= STREAM_END_SPACE) + return -1; + + mutex_lock(&deflate_mutex); + + if (Z_OK != zlib_deflateInit(&def_strm, 3)) { + pr_warn("deflateInit failed\n"); + mutex_unlock(&deflate_mutex); + return -1; + } + + def_strm.next_in = data_in; + def_strm.total_in = 0; + + def_strm.next_out = cpage_out; + def_strm.total_out = 0; + + while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { + def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); + def_strm.avail_in = min_t(unsigned long, + (*sourcelen-def_strm.total_in), def_strm.avail_out); + jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n", + def_strm.avail_in, def_strm.avail_out); + ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); + jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n", + def_strm.avail_in, def_strm.avail_out, + def_strm.total_in, def_strm.total_out); + if (ret != Z_OK) { + jffs2_dbg(1, "deflate in loop returned %d\n", ret); + zlib_deflateEnd(&def_strm); + mutex_unlock(&deflate_mutex); + return -1; + } + } + def_strm.avail_out += STREAM_END_SPACE; + def_strm.avail_in = 0; + ret = zlib_deflate(&def_strm, Z_FINISH); + zlib_deflateEnd(&def_strm); + + if (ret != Z_STREAM_END) { + jffs2_dbg(1, "final deflate returned %d\n", ret); + ret = -1; + goto out; + } + + if (def_strm.total_out >= def_strm.total_in) { + jffs2_dbg(1, "zlib compressed %ld bytes into %ld; failing\n", + def_strm.total_in, def_strm.total_out); + ret = -1; + goto out; + } + + jffs2_dbg(1, "zlib compressed %ld bytes into %ld\n", + def_strm.total_in, def_strm.total_out); + + *dstlen = def_strm.total_out; + *sourcelen = def_strm.total_in; + ret = 0; + out: + mutex_unlock(&deflate_mutex); + return ret; +} + +static int jffs2_zlib_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + int ret; + int wbits = MAX_WBITS; + + mutex_lock(&inflate_mutex); + + inf_strm.next_in = data_in; + inf_strm.avail_in = srclen; + inf_strm.total_in = 0; + + inf_strm.next_out = cpage_out; + inf_strm.avail_out = destlen; + inf_strm.total_out = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + jffs2_dbg(2, "inflate skipping adler32\n"); + wbits = -((data_in[0] >> 4) + 8); + inf_strm.next_in += 2; + inf_strm.avail_in -= 2; + } else { + /* Let this remain D1 for now -- it should never happen */ + jffs2_dbg(1, "inflate not skipping adler32\n"); + } + + + if (Z_OK != zlib_inflateInit2(&inf_strm, wbits)) { + pr_warn("inflateInit failed\n"); + mutex_unlock(&inflate_mutex); + return 1; + } + + while((ret = zlib_inflate(&inf_strm, Z_FINISH)) == Z_OK) + ; + if (ret != Z_STREAM_END) { + pr_notice("inflate returned %d\n", ret); + } + zlib_inflateEnd(&inf_strm); + mutex_unlock(&inflate_mutex); + return 0; +} + +static struct jffs2_compressor jffs2_zlib_comp = { + .priority = JFFS2_ZLIB_PRIORITY, + .name = "zlib", + .compr = JFFS2_COMPR_ZLIB, + .compress = &jffs2_zlib_compress, + .decompress = &jffs2_zlib_decompress, +#ifdef JFFS2_ZLIB_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int __init jffs2_zlib_init(void) +{ + int ret; + + ret = alloc_workspaces(); + if (ret) + return ret; + + ret = jffs2_register_compressor(&jffs2_zlib_comp); + if (ret) + free_workspaces(); + + return ret; +} + +void jffs2_zlib_exit(void) +{ + jffs2_unregister_compressor(&jffs2_zlib_comp); + free_workspaces(); +} diff --git a/kernel/fs/jffs2/debug.c b/kernel/fs/jffs2/debug.c new file mode 100644 index 000000000..1090eb64b --- /dev/null +++ b/kernel/fs/jffs2/debug.c @@ -0,0 +1,866 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "debug.h" + +#ifdef JFFS2_DBG_SANITY_CHECKS + +void +__jffs2_dbg_acct_sanity_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + if (unlikely(jeb && jeb->used_size + jeb->dirty_size + + jeb->free_size + jeb->wasted_size + + jeb->unchecked_size != c->sector_size)) { + JFFS2_ERROR("eeep, space accounting for block at 0x%08x is screwed.\n", jeb->offset); + JFFS2_ERROR("free %#08x + dirty %#08x + used %#08x + wasted %#08x + unchecked %#08x != total %#08x.\n", + jeb->free_size, jeb->dirty_size, jeb->used_size, + jeb->wasted_size, jeb->unchecked_size, c->sector_size); + BUG(); + } + + if (unlikely(c->used_size + c->dirty_size + c->free_size + c->erasing_size + c->bad_size + + c->wasted_size + c->unchecked_size != c->flash_size)) { + JFFS2_ERROR("eeep, space accounting superblock info is screwed.\n"); + JFFS2_ERROR("free %#08x + dirty %#08x + used %#08x + erasing %#08x + bad %#08x + wasted %#08x + unchecked %#08x != total %#08x.\n", + c->free_size, c->dirty_size, c->used_size, c->erasing_size, c->bad_size, + c->wasted_size, c->unchecked_size, c->flash_size); + BUG(); + } +} + +void +__jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +#endif /* JFFS2_DBG_SANITY_CHECKS */ + +#ifdef JFFS2_DBG_PARANOIA_CHECKS +/* + * Check the fragtree. + */ +void +__jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f) +{ + mutex_lock(&f->sem); + __jffs2_dbg_fragtree_paranoia_check_nolock(f); + mutex_unlock(&f->sem); +} + +void +__jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *frag; + int bitched = 0; + + for (frag = frag_first(&f->fragtree); frag; frag = frag_next(frag)) { + struct jffs2_full_dnode *fn = frag->node; + + if (!fn || !fn->raw) + continue; + + if (ref_flags(fn->raw) == REF_PRISTINE) { + if (fn->frags > 1) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x had %d frags. Tell dwmw2.\n", + ref_offset(fn->raw), fn->frags); + bitched = 1; + } + + /* A hole node which isn't multi-page should be garbage-collected + and merged anyway, so we just check for the frag size here, + rather than mucking around with actually reading the node + and checking the compression type, which is the real way + to tell a hole node. */ + if (frag->ofs & (PAGE_CACHE_SIZE-1) && frag_prev(frag) + && frag_prev(frag)->size < PAGE_CACHE_SIZE && frag_prev(frag)->node) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x had a previous non-hole frag in the same page. Tell dwmw2.\n", + ref_offset(fn->raw)); + bitched = 1; + } + + if ((frag->ofs+frag->size) & (PAGE_CACHE_SIZE-1) && frag_next(frag) + && frag_next(frag)->size < PAGE_CACHE_SIZE && frag_next(frag)->node) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x (%08x-%08x) had a following non-hole frag in the same page. Tell dwmw2.\n", + ref_offset(fn->raw), frag->ofs, frag->ofs+frag->size); + bitched = 1; + } + } + } + + if (bitched) { + JFFS2_ERROR("fragtree is corrupted.\n"); + __jffs2_dbg_dump_fragtree_nolock(f); + BUG(); + } +} + +/* + * Check if the flash contains all 0xFF before we start writing. + */ +void +__jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, + uint32_t ofs, int len) +{ + size_t retlen; + int ret, i; + unsigned char *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return; + + ret = jffs2_flash_read(c, ofs, len, &retlen, buf); + if (ret || (retlen != len)) { + JFFS2_WARNING("read %d bytes failed or short. ret %d, retlen %zd.\n", + len, ret, retlen); + kfree(buf); + return; + } + + ret = 0; + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + ret = 1; + + if (ret) { + JFFS2_ERROR("argh, about to write node to %#08x on flash, but there are data already there. The first corrupted byte is at %#08x offset.\n", + ofs, ofs + i); + __jffs2_dbg_dump_buffer(buf, len, ofs); + kfree(buf); + BUG(); + } + + kfree(buf); +} + +void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *jeb; + uint32_t free = 0, dirty = 0, used = 0, wasted = 0, + erasing = 0, bad = 0, unchecked = 0; + int nr_counted = 0; + int dump = 0; + + if (c->gcblock) { + nr_counted++; + free += c->gcblock->free_size; + dirty += c->gcblock->dirty_size; + used += c->gcblock->used_size; + wasted += c->gcblock->wasted_size; + unchecked += c->gcblock->unchecked_size; + } + if (c->nextblock) { + nr_counted++; + free += c->nextblock->free_size; + dirty += c->nextblock->dirty_size; + used += c->nextblock->used_size; + wasted += c->nextblock->wasted_size; + unchecked += c->nextblock->unchecked_size; + } + list_for_each_entry(jeb, &c->clean_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->very_dirty_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->dirty_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erasable_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erasable_pending_wbuf_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erase_pending_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->free_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->bad_used_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + + list_for_each_entry(jeb, &c->erasing_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->erase_checking_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->erase_complete_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->bad_list, list) { + nr_counted++; + bad += c->sector_size; + } + +#define check(sz) \ +do { \ + if (sz != c->sz##_size) { \ + pr_warn("%s_size mismatch counted 0x%x, c->%s_size 0x%x\n", \ + #sz, sz, #sz, c->sz##_size); \ + dump = 1; \ + } \ +} while (0) + + check(free); + check(dirty); + check(used); + check(wasted); + check(unchecked); + check(bad); + check(erasing); + +#undef check + + if (nr_counted != c->nr_blocks) { + pr_warn("%s counted only 0x%x blocks of 0x%x. Where are the others?\n", + __func__, nr_counted, c->nr_blocks); + dump = 1; + } + + if (dump) { + __jffs2_dbg_dump_block_lists_nolock(c); + BUG(); + } +} + +/* + * Check the space accounting and node_ref list correctness for the JFFS2 erasable block 'jeb'. + */ +void +__jffs2_dbg_acct_paranoia_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + uint32_t my_used_size = 0; + uint32_t my_unchecked_size = 0; + uint32_t my_dirty_size = 0; + struct jffs2_raw_node_ref *ref2 = jeb->first_node; + + while (ref2) { + uint32_t totlen = ref_totlen(c, jeb, ref2); + + if (ref_offset(ref2) < jeb->offset || + ref_offset(ref2) > jeb->offset + c->sector_size) { + JFFS2_ERROR("node_ref %#08x shouldn't be in block at %#08x.\n", + ref_offset(ref2), jeb->offset); + goto error; + + } + if (ref_flags(ref2) == REF_UNCHECKED) + my_unchecked_size += totlen; + else if (!ref_obsolete(ref2)) + my_used_size += totlen; + else + my_dirty_size += totlen; + + if ((!ref_next(ref2)) != (ref2 == jeb->last_node)) { + JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next at %#08x (mem %p), last_node is at %#08x (mem %p).\n", + ref_offset(ref2), ref2, ref_offset(ref_next(ref2)), ref_next(ref2), + ref_offset(jeb->last_node), jeb->last_node); + goto error; + } + ref2 = ref_next(ref2); + } + + if (my_used_size != jeb->used_size) { + JFFS2_ERROR("Calculated used size %#08x != stored used size %#08x.\n", + my_used_size, jeb->used_size); + goto error; + } + + if (my_unchecked_size != jeb->unchecked_size) { + JFFS2_ERROR("Calculated unchecked size %#08x != stored unchecked size %#08x.\n", + my_unchecked_size, jeb->unchecked_size); + goto error; + } + +#if 0 + /* This should work when we implement ref->__totlen elemination */ + if (my_dirty_size != jeb->dirty_size + jeb->wasted_size) { + JFFS2_ERROR("Calculated dirty+wasted size %#08x != stored dirty + wasted size %#08x\n", + my_dirty_size, jeb->dirty_size + jeb->wasted_size); + goto error; + } + + if (jeb->free_size == 0 + && my_used_size + my_unchecked_size + my_dirty_size != c->sector_size) { + JFFS2_ERROR("The sum of all nodes in block (%#x) != size of block (%#x)\n", + my_used_size + my_unchecked_size + my_dirty_size, + c->sector_size); + goto error; + } +#endif + + if (!(c->flags & (JFFS2_SB_FLAG_BUILDING|JFFS2_SB_FLAG_SCANNING))) + __jffs2_dbg_superblock_counts(c); + + return; + +error: + __jffs2_dbg_dump_node_refs_nolock(c, jeb); + __jffs2_dbg_dump_jeb_nolock(jeb); + __jffs2_dbg_dump_block_lists_nolock(c); + BUG(); + +} +#endif /* JFFS2_DBG_PARANOIA_CHECKS */ + +#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS) +/* + * Dump the node_refs of the 'jeb' JFFS2 eraseblock. + */ +void +__jffs2_dbg_dump_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_node_refs_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + struct jffs2_raw_node_ref *ref; + int i = 0; + + printk(JFFS2_DBG_MSG_PREFIX " Dump node_refs of the eraseblock %#08x\n", jeb->offset); + if (!jeb->first_node) { + printk(JFFS2_DBG_MSG_PREFIX " no nodes in the eraseblock %#08x\n", jeb->offset); + return; + } + + printk(JFFS2_DBG); + for (ref = jeb->first_node; ; ref = ref_next(ref)) { + printk("%#08x", ref_offset(ref)); +#ifdef TEST_TOTLEN + printk("(%x)", ref->__totlen); +#endif + if (ref_next(ref)) + printk("->"); + else + break; + if (++i == 4) { + i = 0; + printk("\n" JFFS2_DBG); + } + } + printk("\n"); +} + +/* + * Dump an eraseblock's space accounting. + */ +void +__jffs2_dbg_dump_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_jeb_nolock(jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_jeb_nolock(struct jffs2_eraseblock *jeb) +{ + if (!jeb) + return; + + printk(JFFS2_DBG_MSG_PREFIX " dump space accounting for the eraseblock at %#08x:\n", + jeb->offset); + + printk(JFFS2_DBG "used_size: %#08x\n", jeb->used_size); + printk(JFFS2_DBG "dirty_size: %#08x\n", jeb->dirty_size); + printk(JFFS2_DBG "wasted_size: %#08x\n", jeb->wasted_size); + printk(JFFS2_DBG "unchecked_size: %#08x\n", jeb->unchecked_size); + printk(JFFS2_DBG "free_size: %#08x\n", jeb->free_size); +} + +void +__jffs2_dbg_dump_block_lists(struct jffs2_sb_info *c) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_block_lists_nolock(c); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c) +{ + printk(JFFS2_DBG_MSG_PREFIX " dump JFFS2 blocks lists:\n"); + + printk(JFFS2_DBG "flash_size: %#08x\n", c->flash_size); + printk(JFFS2_DBG "used_size: %#08x\n", c->used_size); + printk(JFFS2_DBG "dirty_size: %#08x\n", c->dirty_size); + printk(JFFS2_DBG "wasted_size: %#08x\n", c->wasted_size); + printk(JFFS2_DBG "unchecked_size: %#08x\n", c->unchecked_size); + printk(JFFS2_DBG "free_size: %#08x\n", c->free_size); + printk(JFFS2_DBG "erasing_size: %#08x\n", c->erasing_size); + printk(JFFS2_DBG "bad_size: %#08x\n", c->bad_size); + printk(JFFS2_DBG "sector_size: %#08x\n", c->sector_size); + printk(JFFS2_DBG "jffs2_reserved_blocks size: %#08x\n", + c->sector_size * c->resv_blocks_write); + + if (c->nextblock) + printk(JFFS2_DBG "nextblock: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + c->nextblock->offset, c->nextblock->used_size, + c->nextblock->dirty_size, c->nextblock->wasted_size, + c->nextblock->unchecked_size, c->nextblock->free_size); + else + printk(JFFS2_DBG "nextblock: NULL\n"); + + if (c->gcblock) + printk(JFFS2_DBG "gcblock: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + c->gcblock->offset, c->gcblock->used_size, c->gcblock->dirty_size, + c->gcblock->wasted_size, c->gcblock->unchecked_size, c->gcblock->free_size); + else + printk(JFFS2_DBG "gcblock: NULL\n"); + + if (list_empty(&c->clean_list)) { + printk(JFFS2_DBG "clean_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->clean_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + numblocks ++; + dirty += jeb->wasted_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "clean_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "Contains %d blocks with total wasted size %u, average wasted size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->very_dirty_list)) { + printk(JFFS2_DBG "very_dirty_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->very_dirty_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + numblocks ++; + dirty += jeb->dirty_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "very_dirty_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "Contains %d blocks with total dirty size %u, average dirty size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->dirty_list)) { + printk(JFFS2_DBG "dirty_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->dirty_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + numblocks ++; + dirty += jeb->dirty_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "dirty_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "contains %d blocks with total dirty size %u, average dirty size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->erasable_list)) { + printk(JFFS2_DBG "erasable_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasable_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasable_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erasing_list)) { + printk(JFFS2_DBG "erasing_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasing_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasing_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + if (list_empty(&c->erase_checking_list)) { + printk(JFFS2_DBG "erase_checking_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erase_checking_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erase_checking_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erase_pending_list)) { + printk(JFFS2_DBG "erase_pending_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erase_pending_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erase_pending_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erasable_pending_wbuf_list)) { + printk(JFFS2_DBG "erasable_pending_wbuf_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasable_pending_wbuf_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasable_pending_wbuf_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->free_list)) { + printk(JFFS2_DBG "free_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->free_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "free_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->bad_list)) { + printk(JFFS2_DBG "bad_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->bad_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "bad_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->bad_used_list)) { + printk(JFFS2_DBG "bad_used_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->bad_used_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "bad_used_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } +} + +void +__jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f) +{ + mutex_lock(&f->sem); + jffs2_dbg_dump_fragtree_nolock(f); + mutex_unlock(&f->sem); +} + +void +__jffs2_dbg_dump_fragtree_nolock(struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *this = frag_first(&f->fragtree); + uint32_t lastofs = 0; + int buggy = 0; + + printk(JFFS2_DBG_MSG_PREFIX " dump fragtree of ino #%u\n", f->inocache->ino); + while(this) { + if (this->node) + printk(JFFS2_DBG "frag %#04x-%#04x: %#08x(%d) on flash (*%p), left (%p), right (%p), parent (%p)\n", + this->ofs, this->ofs+this->size, ref_offset(this->node->raw), + ref_flags(this->node->raw), this, frag_left(this), frag_right(this), + frag_parent(this)); + else + printk(JFFS2_DBG "frag %#04x-%#04x: hole (*%p). left (%p), right (%p), parent (%p)\n", + this->ofs, this->ofs+this->size, this, frag_left(this), + frag_right(this), frag_parent(this)); + if (this->ofs != lastofs) + buggy = 1; + lastofs = this->ofs + this->size; + this = frag_next(this); + } + + if (f->metadata) + printk(JFFS2_DBG "metadata at 0x%08x\n", ref_offset(f->metadata->raw)); + + if (buggy) { + JFFS2_ERROR("frag tree got a hole in it.\n"); + BUG(); + } +} + +#define JFFS2_BUFDUMP_BYTES_PER_LINE 32 +void +__jffs2_dbg_dump_buffer(unsigned char *buf, int len, uint32_t offs) +{ + int skip; + int i; + + printk(JFFS2_DBG_MSG_PREFIX " dump from offset %#08x to offset %#08x (%x bytes).\n", + offs, offs + len, len); + i = skip = offs % JFFS2_BUFDUMP_BYTES_PER_LINE; + offs = offs & ~(JFFS2_BUFDUMP_BYTES_PER_LINE - 1); + + if (skip != 0) + printk(JFFS2_DBG "%#08x: ", offs); + + while (skip--) + printk(" "); + + while (i < len) { + if ((i % JFFS2_BUFDUMP_BYTES_PER_LINE) == 0 && i != len -1) { + if (i != 0) + printk("\n"); + offs += JFFS2_BUFDUMP_BYTES_PER_LINE; + printk(JFFS2_DBG "%0#8x: ", offs); + } + + printk("%02x ", buf[i]); + + i += 1; + } + + printk("\n"); +} + +/* + * Dump a JFFS2 node. + */ +void +__jffs2_dbg_dump_node(struct jffs2_sb_info *c, uint32_t ofs) +{ + union jffs2_node_union node; + int len = sizeof(union jffs2_node_union); + size_t retlen; + uint32_t crc; + int ret; + + printk(JFFS2_DBG_MSG_PREFIX " dump node at offset %#08x.\n", ofs); + + ret = jffs2_flash_read(c, ofs, len, &retlen, (unsigned char *)&node); + if (ret || (retlen != len)) { + JFFS2_ERROR("read %d bytes failed or short. ret %d, retlen %zd.\n", + len, ret, retlen); + return; + } + + printk(JFFS2_DBG "magic:\t%#04x\n", je16_to_cpu(node.u.magic)); + printk(JFFS2_DBG "nodetype:\t%#04x\n", je16_to_cpu(node.u.nodetype)); + printk(JFFS2_DBG "totlen:\t%#08x\n", je32_to_cpu(node.u.totlen)); + printk(JFFS2_DBG "hdr_crc:\t%#08x\n", je32_to_cpu(node.u.hdr_crc)); + + crc = crc32(0, &node.u, sizeof(node.u) - 4); + if (crc != je32_to_cpu(node.u.hdr_crc)) { + JFFS2_ERROR("wrong common header CRC.\n"); + return; + } + + if (je16_to_cpu(node.u.magic) != JFFS2_MAGIC_BITMASK && + je16_to_cpu(node.u.magic) != JFFS2_OLD_MAGIC_BITMASK) + { + JFFS2_ERROR("wrong node magic: %#04x instead of %#04x.\n", + je16_to_cpu(node.u.magic), JFFS2_MAGIC_BITMASK); + return; + } + + switch(je16_to_cpu(node.u.nodetype)) { + + case JFFS2_NODETYPE_INODE: + + printk(JFFS2_DBG "the node is inode node\n"); + printk(JFFS2_DBG "ino:\t%#08x\n", je32_to_cpu(node.i.ino)); + printk(JFFS2_DBG "version:\t%#08x\n", je32_to_cpu(node.i.version)); + printk(JFFS2_DBG "mode:\t%#08x\n", node.i.mode.m); + printk(JFFS2_DBG "uid:\t%#04x\n", je16_to_cpu(node.i.uid)); + printk(JFFS2_DBG "gid:\t%#04x\n", je16_to_cpu(node.i.gid)); + printk(JFFS2_DBG "isize:\t%#08x\n", je32_to_cpu(node.i.isize)); + printk(JFFS2_DBG "atime:\t%#08x\n", je32_to_cpu(node.i.atime)); + printk(JFFS2_DBG "mtime:\t%#08x\n", je32_to_cpu(node.i.mtime)); + printk(JFFS2_DBG "ctime:\t%#08x\n", je32_to_cpu(node.i.ctime)); + printk(JFFS2_DBG "offset:\t%#08x\n", je32_to_cpu(node.i.offset)); + printk(JFFS2_DBG "csize:\t%#08x\n", je32_to_cpu(node.i.csize)); + printk(JFFS2_DBG "dsize:\t%#08x\n", je32_to_cpu(node.i.dsize)); + printk(JFFS2_DBG "compr:\t%#02x\n", node.i.compr); + printk(JFFS2_DBG "usercompr:\t%#02x\n", node.i.usercompr); + printk(JFFS2_DBG "flags:\t%#04x\n", je16_to_cpu(node.i.flags)); + printk(JFFS2_DBG "data_crc:\t%#08x\n", je32_to_cpu(node.i.data_crc)); + printk(JFFS2_DBG "node_crc:\t%#08x\n", je32_to_cpu(node.i.node_crc)); + + crc = crc32(0, &node.i, sizeof(node.i) - 8); + if (crc != je32_to_cpu(node.i.node_crc)) { + JFFS2_ERROR("wrong node header CRC.\n"); + return; + } + break; + + case JFFS2_NODETYPE_DIRENT: + + printk(JFFS2_DBG "the node is dirent node\n"); + printk(JFFS2_DBG "pino:\t%#08x\n", je32_to_cpu(node.d.pino)); + printk(JFFS2_DBG "version:\t%#08x\n", je32_to_cpu(node.d.version)); + printk(JFFS2_DBG "ino:\t%#08x\n", je32_to_cpu(node.d.ino)); + printk(JFFS2_DBG "mctime:\t%#08x\n", je32_to_cpu(node.d.mctime)); + printk(JFFS2_DBG "nsize:\t%#02x\n", node.d.nsize); + printk(JFFS2_DBG "type:\t%#02x\n", node.d.type); + printk(JFFS2_DBG "node_crc:\t%#08x\n", je32_to_cpu(node.d.node_crc)); + printk(JFFS2_DBG "name_crc:\t%#08x\n", je32_to_cpu(node.d.name_crc)); + + node.d.name[node.d.nsize] = '\0'; + printk(JFFS2_DBG "name:\t\"%s\"\n", node.d.name); + + crc = crc32(0, &node.d, sizeof(node.d) - 8); + if (crc != je32_to_cpu(node.d.node_crc)) { + JFFS2_ERROR("wrong node header CRC.\n"); + return; + } + break; + + default: + printk(JFFS2_DBG "node type is unknown\n"); + break; + } +} +#endif /* JFFS2_DBG_DUMPS || JFFS2_DBG_PARANOIA_CHECKS */ diff --git a/kernel/fs/jffs2/debug.h b/kernel/fs/jffs2/debug.h new file mode 100644 index 000000000..4fd9be4cb --- /dev/null +++ b/kernel/fs/jffs2/debug.h @@ -0,0 +1,275 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_DEBUG_H_ +#define _JFFS2_DEBUG_H_ + +#include + +#ifndef CONFIG_JFFS2_FS_DEBUG +#define CONFIG_JFFS2_FS_DEBUG 0 +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 0 +/* Enable "paranoia" checks and dumps */ +#define JFFS2_DBG_PARANOIA_CHECKS +#define JFFS2_DBG_DUMPS + +/* + * By defining/undefining the below macros one may select debugging messages + * fro specific JFFS2 subsystems. + */ +#define JFFS2_DBG_READINODE_MESSAGES +#define JFFS2_DBG_FRAGTREE_MESSAGES +#define JFFS2_DBG_DENTLIST_MESSAGES +#define JFFS2_DBG_NODEREF_MESSAGES +#define JFFS2_DBG_INOCACHE_MESSAGES +#define JFFS2_DBG_SUMMARY_MESSAGES +#define JFFS2_DBG_FSBUILD_MESSAGES +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 1 +#define JFFS2_DBG_FRAGTREE2_MESSAGES +#define JFFS2_DBG_READINODE2_MESSAGES +#define JFFS2_DBG_MEMALLOC_MESSAGES +#endif + +/* Sanity checks are supposed to be light-weight and enabled by default */ +#define JFFS2_DBG_SANITY_CHECKS + +/* + * Dx() are mainly used for debugging messages, they must go away and be + * superseded by nicer dbg_xxx() macros... + */ +#if CONFIG_JFFS2_FS_DEBUG > 0 +#define DEBUG +#define D1(x) x +#else +#define D1(x) +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 1 +#define D2(x) x +#else +#define D2(x) +#endif + +#define jffs2_dbg(level, fmt, ...) \ +do { \ + if (CONFIG_JFFS2_FS_DEBUG >= level) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) + +/* The prefixes of JFFS2 messages */ +#define JFFS2_DBG KERN_DEBUG +#define JFFS2_DBG_PREFIX "[JFFS2 DBG]" +#define JFFS2_DBG_MSG_PREFIX JFFS2_DBG JFFS2_DBG_PREFIX + +/* JFFS2 message macros */ +#define JFFS2_ERROR(fmt, ...) \ + pr_err("error: (%d) %s: " fmt, \ + task_pid_nr(current), __func__, ##__VA_ARGS__) + +#define JFFS2_WARNING(fmt, ...) \ + pr_warn("warning: (%d) %s: " fmt, \ + task_pid_nr(current), __func__, ##__VA_ARGS__) + +#define JFFS2_NOTICE(fmt, ...) \ + pr_notice("notice: (%d) %s: " fmt, \ + task_pid_nr(current), __func__, ##__VA_ARGS__) + +#define JFFS2_DEBUG(fmt, ...) \ + printk(KERN_DEBUG "[JFFS2 DBG] (%d) %s: " fmt, \ + task_pid_nr(current), __func__, ##__VA_ARGS__) + +/* + * We split our debugging messages on several parts, depending on the JFFS2 + * subsystem the message belongs to. + */ +/* Read inode debugging messages */ +#ifdef JFFS2_DBG_READINODE_MESSAGES +#define dbg_readinode(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_readinode(fmt, ...) +#endif +#ifdef JFFS2_DBG_READINODE2_MESSAGES +#define dbg_readinode2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_readinode2(fmt, ...) +#endif + +/* Fragtree build debugging messages */ +#ifdef JFFS2_DBG_FRAGTREE_MESSAGES +#define dbg_fragtree(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fragtree(fmt, ...) +#endif +#ifdef JFFS2_DBG_FRAGTREE2_MESSAGES +#define dbg_fragtree2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fragtree2(fmt, ...) +#endif + +/* Directory entry list manilulation debugging messages */ +#ifdef JFFS2_DBG_DENTLIST_MESSAGES +#define dbg_dentlist(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_dentlist(fmt, ...) +#endif + +/* Print the messages about manipulating node_refs */ +#ifdef JFFS2_DBG_NODEREF_MESSAGES +#define dbg_noderef(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_noderef(fmt, ...) +#endif + +/* Manipulations with the list of inodes (JFFS2 inocache) */ +#ifdef JFFS2_DBG_INOCACHE_MESSAGES +#define dbg_inocache(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_inocache(fmt, ...) +#endif + +/* Summary debugging messages */ +#ifdef JFFS2_DBG_SUMMARY_MESSAGES +#define dbg_summary(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_summary(fmt, ...) +#endif + +/* File system build messages */ +#ifdef JFFS2_DBG_FSBUILD_MESSAGES +#define dbg_fsbuild(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fsbuild(fmt, ...) +#endif + +/* Watch the object allocations */ +#ifdef JFFS2_DBG_MEMALLOC_MESSAGES +#define dbg_memalloc(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_memalloc(fmt, ...) +#endif + +/* Watch the XATTR subsystem */ +#ifdef JFFS2_DBG_XATTR_MESSAGES +#define dbg_xattr(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_xattr(fmt, ...) +#endif + +/* "Sanity" checks */ +void +__jffs2_dbg_acct_sanity_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); + +/* "Paranoia" checks */ +void +__jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f); +void +__jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f); +void +__jffs2_dbg_acct_paranoia_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, + uint32_t ofs, int len); + +/* "Dump" functions */ +void +__jffs2_dbg_dump_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_jeb_nolock(struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_block_lists(struct jffs2_sb_info *c); +void +__jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c); +void +__jffs2_dbg_dump_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f); +void +__jffs2_dbg_dump_fragtree_nolock(struct jffs2_inode_info *f); +void +__jffs2_dbg_dump_buffer(unsigned char *buf, int len, uint32_t offs); +void +__jffs2_dbg_dump_node(struct jffs2_sb_info *c, uint32_t ofs); + +#ifdef JFFS2_DBG_PARANOIA_CHECKS +#define jffs2_dbg_fragtree_paranoia_check(f) \ + __jffs2_dbg_fragtree_paranoia_check(f) +#define jffs2_dbg_fragtree_paranoia_check_nolock(f) \ + __jffs2_dbg_fragtree_paranoia_check_nolock(f) +#define jffs2_dbg_acct_paranoia_check(c, jeb) \ + __jffs2_dbg_acct_paranoia_check(c,jeb) +#define jffs2_dbg_acct_paranoia_check_nolock(c, jeb) \ + __jffs2_dbg_acct_paranoia_check_nolock(c,jeb) +#define jffs2_dbg_prewrite_paranoia_check(c, ofs, len) \ + __jffs2_dbg_prewrite_paranoia_check(c, ofs, len) +#else +#define jffs2_dbg_fragtree_paranoia_check(f) +#define jffs2_dbg_fragtree_paranoia_check_nolock(f) +#define jffs2_dbg_acct_paranoia_check(c, jeb) +#define jffs2_dbg_acct_paranoia_check_nolock(c, jeb) +#define jffs2_dbg_prewrite_paranoia_check(c, ofs, len) +#endif /* !JFFS2_PARANOIA_CHECKS */ + +#ifdef JFFS2_DBG_DUMPS +#define jffs2_dbg_dump_jeb(c, jeb) \ + __jffs2_dbg_dump_jeb(c, jeb); +#define jffs2_dbg_dump_jeb_nolock(jeb) \ + __jffs2_dbg_dump_jeb_nolock(jeb); +#define jffs2_dbg_dump_block_lists(c) \ + __jffs2_dbg_dump_block_lists(c) +#define jffs2_dbg_dump_block_lists_nolock(c) \ + __jffs2_dbg_dump_block_lists_nolock(c) +#define jffs2_dbg_dump_fragtree(f) \ + __jffs2_dbg_dump_fragtree(f); +#define jffs2_dbg_dump_fragtree_nolock(f) \ + __jffs2_dbg_dump_fragtree_nolock(f); +#define jffs2_dbg_dump_buffer(buf, len, offs) \ + __jffs2_dbg_dump_buffer(*buf, len, offs); +#define jffs2_dbg_dump_node(c, ofs) \ + __jffs2_dbg_dump_node(c, ofs); +#else +#define jffs2_dbg_dump_jeb(c, jeb) +#define jffs2_dbg_dump_jeb_nolock(jeb) +#define jffs2_dbg_dump_block_lists(c) +#define jffs2_dbg_dump_block_lists_nolock(c) +#define jffs2_dbg_dump_fragtree(f) +#define jffs2_dbg_dump_fragtree_nolock(f) +#define jffs2_dbg_dump_buffer(buf, len, offs) +#define jffs2_dbg_dump_node(c, ofs) +#endif /* !JFFS2_DBG_DUMPS */ + +#ifdef JFFS2_DBG_SANITY_CHECKS +#define jffs2_dbg_acct_sanity_check(c, jeb) \ + __jffs2_dbg_acct_sanity_check(c, jeb) +#define jffs2_dbg_acct_sanity_check_nolock(c, jeb) \ + __jffs2_dbg_acct_sanity_check_nolock(c, jeb) +#else +#define jffs2_dbg_acct_sanity_check(c, jeb) +#define jffs2_dbg_acct_sanity_check_nolock(c, jeb) +#endif /* !JFFS2_DBG_SANITY_CHECKS */ + +#endif /* _JFFS2_DEBUG_H_ */ diff --git a/kernel/fs/jffs2/dir.c b/kernel/fs/jffs2/dir.c new file mode 100644 index 000000000..1ba5c9794 --- /dev/null +++ b/kernel/fs/jffs2/dir.c @@ -0,0 +1,862 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include "jffs2_fs_i.h" +#include "jffs2_fs_sb.h" +#include +#include "nodelist.h" + +static int jffs2_readdir (struct file *, struct dir_context *); + +static int jffs2_create (struct inode *,struct dentry *,umode_t, + bool); +static struct dentry *jffs2_lookup (struct inode *,struct dentry *, + unsigned int); +static int jffs2_link (struct dentry *,struct inode *,struct dentry *); +static int jffs2_unlink (struct inode *,struct dentry *); +static int jffs2_symlink (struct inode *,struct dentry *,const char *); +static int jffs2_mkdir (struct inode *,struct dentry *,umode_t); +static int jffs2_rmdir (struct inode *,struct dentry *); +static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t); +static int jffs2_rename (struct inode *, struct dentry *, + struct inode *, struct dentry *); + +const struct file_operations jffs2_dir_operations = +{ + .read = generic_read_dir, + .iterate = jffs2_readdir, + .unlocked_ioctl=jffs2_ioctl, + .fsync = jffs2_fsync, + .llseek = generic_file_llseek, +}; + + +const struct inode_operations jffs2_dir_inode_operations = +{ + .create = jffs2_create, + .lookup = jffs2_lookup, + .link = jffs2_link, + .unlink = jffs2_unlink, + .symlink = jffs2_symlink, + .mkdir = jffs2_mkdir, + .rmdir = jffs2_rmdir, + .mknod = jffs2_mknod, + .rename = jffs2_rename, + .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +/***********************************************************************/ + + +/* We keep the dirent list sorted in increasing order of name hash, + and we use the same hash function as the dentries. Makes this + nice and simple +*/ +static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, + unsigned int flags) +{ + struct jffs2_inode_info *dir_f; + struct jffs2_full_dirent *fd = NULL, *fd_list; + uint32_t ino = 0; + struct inode *inode = NULL; + + jffs2_dbg(1, "jffs2_lookup()\n"); + + if (target->d_name.len > JFFS2_MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + dir_f = JFFS2_INODE_INFO(dir_i); + + mutex_lock(&dir_f->sem); + + /* NB: The 2.2 backport will need to explicitly check for '.' and '..' here */ + for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) { + if (fd_list->nhash == target->d_name.hash && + (!fd || fd_list->version > fd->version) && + strlen(fd_list->name) == target->d_name.len && + !strncmp(fd_list->name, target->d_name.name, target->d_name.len)) { + fd = fd_list; + } + } + if (fd) + ino = fd->ino; + mutex_unlock(&dir_f->sem); + if (ino) { + inode = jffs2_iget(dir_i->i_sb, ino); + if (IS_ERR(inode)) + pr_warn("iget() failed for ino #%u\n", ino); + } + + return d_splice_alias(inode, target); +} + +/***********************************************************************/ + + +static int jffs2_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_full_dirent *fd; + unsigned long curofs = 1; + + jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino); + + if (!dir_emit_dots(file, ctx)) + return 0; + + mutex_lock(&f->sem); + for (fd = f->dents; fd; fd = fd->next) { + curofs++; + /* First loop: curofs = 2; pos = 2 */ + if (curofs < ctx->pos) { + jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", + fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos); + continue; + } + if (!fd->ino) { + jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n", + fd->name); + ctx->pos++; + continue; + } + jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n", + (unsigned long)ctx->pos, fd->name, fd->ino, fd->type); + if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type)) + break; + ctx->pos++; + } + mutex_unlock(&f->sem); + return 0; +} + +/***********************************************************************/ + + +static int jffs2_create(struct inode *dir_i, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct jffs2_raw_inode *ri; + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + int ret; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + jffs2_dbg(1, "%s()\n", __func__); + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + jffs2_dbg(1, "jffs2_new_inode() failed\n"); + jffs2_free_raw_inode(ri); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_file_inode_operations; + inode->i_fop = &jffs2_file_operations; + inode->i_mapping->a_ops = &jffs2_file_address_operations; + inode->i_mapping->nrpages = 0; + + f = JFFS2_INODE_INFO(inode); + dir_f = JFFS2_INODE_INFO(dir_i); + + /* jffs2_do_create() will want to lock it, _after_ reserving + space and taking c-alloc_sem. If we keep it locked here, + lockdep gets unhappy (although it's a false positive; + nothing else will be looking at this inode yet so there's + no chance of AB-BA deadlock involving its f->sem). */ + mutex_unlock(&f->sem); + + ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name); + if (ret) + goto fail; + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + + jffs2_free_raw_inode(ri); + + jffs2_dbg(1, "%s(): Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", + __func__, inode->i_ino, inode->i_mode, inode->i_nlink, + f->inocache->pino_nlink, inode->i_mapping->nrpages); + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + return 0; + + fail: + iget_failed(inode); + jffs2_free_raw_inode(ri); + return ret; +} + +/***********************************************************************/ + + +static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(d_inode(dentry)); + int ret; + uint32_t now = get_seconds(); + + ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, + dentry->d_name.len, dead_f, now); + if (dead_f->inocache) + set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); + if (!ret) + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + return ret; +} +/***********************************************************************/ + + +static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + int ret; + uint8_t type; + uint32_t now; + + /* Don't let people make hard links to bad inodes. */ + if (!f->inocache) + return -EIO; + + if (d_is_dir(old_dentry)) + return -EPERM; + + /* XXX: This is ugly */ + type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12; + if (!type) type = DT_REG; + + now = get_seconds(); + ret = jffs2_do_link(c, dir_f, f->inocache->ino, type, dentry->d_name.name, dentry->d_name.len, now); + + if (!ret) { + mutex_lock(&f->sem); + set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); + mutex_unlock(&f->sem); + d_instantiate(dentry, d_inode(old_dentry)); + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + ihold(d_inode(old_dentry)); + } + return ret; +} + +/***********************************************************************/ + +static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char *target) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + uint32_t alloclen; + int ret, targetlen = strlen(target); + + /* FIXME: If you care. We'd need to use frags for the target + if it grows much more than this */ + if (targetlen > 254) + return -ENAMETOOLONG; + + ri = jffs2_alloc_raw_inode(); + + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + /* Try to reserve enough space for both node and dirent. + * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, S_IFLNK | S_IRWXUGO, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_symlink_inode_operations; + + f = JFFS2_INODE_INFO(inode); + + inode->i_size = targetlen; + ri->isize = ri->dsize = ri->csize = cpu_to_je32(inode->i_size); + ri->totlen = cpu_to_je32(sizeof(*ri) + inode->i_size); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->compr = JFFS2_COMPR_NONE; + ri->data_crc = cpu_to_je32(crc32(0, target, targetlen)); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, target, targetlen, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + + /* We use f->target field to store the target path. */ + f->target = kmemdup(target, targetlen + 1, GFP_KERNEL); + if (!f->target) { + pr_warn("Can't allocate %d bytes of memory\n", targetlen + 1); + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + jffs2_dbg(1, "%s(): symlink's target '%s' cached\n", + __func__, (char *)f->target); + + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + rd->type = DT_LNK; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + + +static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + uint32_t alloclen; + int ret; + + mode |= S_IFDIR; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + /* Try to reserve enough space for both node and dirent. + * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL, + JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_dir_inode_operations; + inode->i_fop = &jffs2_dir_operations; + + f = JFFS2_INODE_INFO(inode); + + /* Directories get nlink 2 at start */ + set_nlink(inode, 2); + /* but ic->pino_nlink is the parent ino# */ + f->inocache->pino_nlink = dir_i->i_ino; + + ri->data_crc = cpu_to_je32(0); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + rd->type = DT_DIR; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + inc_nlink(dir_i); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + +static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry)); + struct jffs2_full_dirent *fd; + int ret; + uint32_t now = get_seconds(); + + for (fd = f->dents ; fd; fd = fd->next) { + if (fd->ino) + return -ENOTEMPTY; + } + + ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, + dentry->d_name.len, f, now); + if (!ret) { + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + clear_nlink(d_inode(dentry)); + drop_nlink(dir_i); + } + return ret; +} + +static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + union jffs2_device_node dev; + int devlen = 0; + uint32_t alloclen; + int ret; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + if (S_ISBLK(mode) || S_ISCHR(mode)) + devlen = jffs2_encode_dev(&dev, rdev); + + /* Try to reserve enough space for both node and dirent. + * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + inode->i_op = &jffs2_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + + f = JFFS2_INODE_INFO(inode); + + ri->dsize = ri->csize = cpu_to_je32(devlen); + ri->totlen = cpu_to_je32(sizeof(*ri) + devlen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->compr = JFFS2_COMPR_NONE; + ri->data_crc = cpu_to_je32(crc32(0, &dev, devlen)); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + + /* XXX: This is ugly. */ + rd->type = (mode & S_IFMT) >> 12; + + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + +static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, + struct inode *new_dir_i, struct dentry *new_dentry) +{ + int ret; + struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dir_i->i_sb); + struct jffs2_inode_info *victim_f = NULL; + uint8_t type; + uint32_t now; + + /* The VFS will check for us and prevent trying to rename a + * file over a directory and vice versa, but if it's a directory, + * the VFS can't check whether the victim is empty. The filesystem + * needs to do that for itself. + */ + if (d_really_is_positive(new_dentry)) { + victim_f = JFFS2_INODE_INFO(d_inode(new_dentry)); + if (d_is_dir(new_dentry)) { + struct jffs2_full_dirent *fd; + + mutex_lock(&victim_f->sem); + for (fd = victim_f->dents; fd; fd = fd->next) { + if (fd->ino) { + mutex_unlock(&victim_f->sem); + return -ENOTEMPTY; + } + } + mutex_unlock(&victim_f->sem); + } + } + + /* XXX: We probably ought to alloc enough space for + both nodes at the same time. Writing the new link, + then getting -ENOSPC, is quite bad :) + */ + + /* Make a hard link */ + + /* XXX: This is ugly */ + type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12; + if (!type) type = DT_REG; + + now = get_seconds(); + ret = jffs2_do_link(c, JFFS2_INODE_INFO(new_dir_i), + d_inode(old_dentry)->i_ino, type, + new_dentry->d_name.name, new_dentry->d_name.len, now); + + if (ret) + return ret; + + if (victim_f) { + /* There was a victim. Kill it off nicely */ + if (d_is_dir(new_dentry)) + clear_nlink(d_inode(new_dentry)); + else + drop_nlink(d_inode(new_dentry)); + /* Don't oops if the victim was a dirent pointing to an + inode which didn't exist. */ + if (victim_f->inocache) { + mutex_lock(&victim_f->sem); + if (d_is_dir(new_dentry)) + victim_f->inocache->pino_nlink = 0; + else + victim_f->inocache->pino_nlink--; + mutex_unlock(&victim_f->sem); + } + } + + /* If it was a directory we moved, and there was no victim, + increase i_nlink on its new parent */ + if (d_is_dir(old_dentry) && !victim_f) + inc_nlink(new_dir_i); + + /* Unlink the original */ + ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i), + old_dentry->d_name.name, old_dentry->d_name.len, NULL, now); + + /* We don't touch inode->i_nlink */ + + if (ret) { + /* Oh shit. We really ought to make a single node which can do both atomically */ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); + mutex_lock(&f->sem); + inc_nlink(d_inode(old_dentry)); + if (f->inocache && !d_is_dir(old_dentry)) + f->inocache->pino_nlink++; + mutex_unlock(&f->sem); + + pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", + __func__, ret); + /* Might as well let the VFS know */ + d_instantiate(new_dentry, d_inode(old_dentry)); + ihold(d_inode(old_dentry)); + new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); + return ret; + } + + if (d_is_dir(old_dentry)) + drop_nlink(old_dir_i); + + new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); + + return 0; +} + diff --git a/kernel/fs/jffs2/erase.c b/kernel/fs/jffs2/erase.c new file mode 100644 index 000000000..4a6cf289b --- /dev/null +++ b/kernel/fs/jffs2/erase.c @@ -0,0 +1,515 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +struct erase_priv_struct { + struct jffs2_eraseblock *jeb; + struct jffs2_sb_info *c; +}; + +#ifndef __ECOS +static void jffs2_erase_callback(struct erase_info *); +#endif +static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset); +static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); + +static void jffs2_erase_block(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + int ret; + uint32_t bad_offset; +#ifdef __ECOS + ret = jffs2_flash_erase(c, jeb); + if (!ret) { + jffs2_erase_succeeded(c, jeb); + return; + } + bad_offset = jeb->offset; +#else /* Linux */ + struct erase_info *instr; + + jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n", + __func__, + jeb->offset, jeb->offset, jeb->offset + c->sector_size); + instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL); + if (!instr) { + pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + + memset(instr, 0, sizeof(*instr)); + + instr->mtd = c->mtd; + instr->addr = jeb->offset; + instr->len = c->sector_size; + instr->callback = jffs2_erase_callback; + instr->priv = (unsigned long)(&instr[1]); + + ((struct erase_priv_struct *)instr->priv)->jeb = jeb; + ((struct erase_priv_struct *)instr->priv)->c = c; + + ret = mtd_erase(c->mtd, instr); + if (!ret) + return; + + bad_offset = instr->fail_addr; + kfree(instr); +#endif /* __ECOS */ + + if (ret == -ENOMEM || ret == -EAGAIN) { + /* Erase failed immediately. Refile it on the list */ + jffs2_dbg(1, "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", + jeb->offset, ret); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + + if (ret == -EROFS) + pr_warn("Erase at 0x%08x failed immediately: -EROFS. Is the sector locked?\n", + jeb->offset); + else + pr_warn("Erase at 0x%08x failed immediately: errno %d\n", + jeb->offset, ret); + + jffs2_erase_failed(c, jeb, bad_offset); +} + +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) +{ + struct jffs2_eraseblock *jeb; + int work_done = 0; + + mutex_lock(&c->erase_free_sem); + + spin_lock(&c->erase_completion_lock); + + while (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) { + + if (!list_empty(&c->erase_complete_list)) { + jeb = list_entry(c->erase_complete_list.next, struct jffs2_eraseblock, list); + list_move(&jeb->list, &c->erase_checking_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + jffs2_mark_erased_block(c, jeb); + + work_done++; + if (!--count) { + jffs2_dbg(1, "Count reached. jffs2_erase_pending_blocks leaving\n"); + goto done; + } + + } else if (!list_empty(&c->erase_pending_list)) { + jeb = list_entry(c->erase_pending_list.next, struct jffs2_eraseblock, list); + jffs2_dbg(1, "Starting erase of pending block 0x%08x\n", + jeb->offset); + list_del(&jeb->list); + c->erasing_size += c->sector_size; + c->wasted_size -= jeb->wasted_size; + c->free_size -= jeb->free_size; + c->used_size -= jeb->used_size; + c->dirty_size -= jeb->dirty_size; + jeb->wasted_size = jeb->used_size = jeb->dirty_size = jeb->free_size = 0; + jffs2_free_jeb_node_refs(c, jeb); + list_add(&jeb->list, &c->erasing_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + + jffs2_erase_block(c, jeb); + + } else { + BUG(); + } + + /* Be nice */ + cond_resched(); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + } + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + done: + jffs2_dbg(1, "jffs2_erase_pending_blocks completed\n"); + return work_done; +} + +static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + jffs2_dbg(1, "Erase completed successfully at 0x%08x\n", jeb->offset); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move_tail(&jeb->list, &c->erase_complete_list); + /* Wake the GC thread to mark them clean */ + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); +} + +static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) +{ + /* For NAND, if the failure did not occur at the device level for a + specific physical page, don't bother updating the bad block table. */ + if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { + /* We had a device-level failure to erase. Let's see if we've + failed too many times. */ + if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { + /* We'd like to give this block another try. */ + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + } + + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + c->erasing_size -= c->sector_size; + c->bad_size += c->sector_size; + list_move(&jeb->list, &c->bad_list); + c->nr_erasing_blocks--; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); +} + +#ifndef __ECOS +static void jffs2_erase_callback(struct erase_info *instr) +{ + struct erase_priv_struct *priv = (void *)instr->priv; + + if(instr->state != MTD_ERASE_DONE) { + pr_warn("Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + (unsigned long long)instr->addr, instr->state); + jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); + } else { + jffs2_erase_succeeded(priv->c, priv->jeb); + } + kfree(instr); +} +#endif /* !__ECOS */ + +/* Hmmm. Maybe we should accept the extra space it takes and make + this a standard doubly-linked list? */ +static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, + struct jffs2_raw_node_ref *ref, struct jffs2_eraseblock *jeb) +{ + struct jffs2_inode_cache *ic = NULL; + struct jffs2_raw_node_ref **prev; + + prev = &ref->next_in_ino; + + /* Walk the inode's list once, removing any nodes from this eraseblock */ + while (1) { + if (!(*prev)->next_in_ino) { + /* We're looking at the jffs2_inode_cache, which is + at the end of the linked list. Stash it and continue + from the beginning of the list */ + ic = (struct jffs2_inode_cache *)(*prev); + prev = &ic->nodes; + continue; + } + + if (SECTOR_ADDR((*prev)->flash_offset) == jeb->offset) { + /* It's in the block we're erasing */ + struct jffs2_raw_node_ref *this; + + this = *prev; + *prev = this->next_in_ino; + this->next_in_ino = NULL; + + if (this == ref) + break; + + continue; + } + /* Not to be deleted. Skip */ + prev = &((*prev)->next_in_ino); + } + + /* PARANOIA */ + if (!ic) { + JFFS2_WARNING("inode_cache/xattr_datum/xattr_ref" + " not found in remove_node_refs()!!\n"); + return; + } + + jffs2_dbg(1, "Removed nodes in range 0x%08x-0x%08x from ino #%u\n", + jeb->offset, jeb->offset + c->sector_size, ic->ino); + + D2({ + int i=0; + struct jffs2_raw_node_ref *this; + printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n"); + + this = ic->nodes; + + printk(KERN_DEBUG); + while(this) { + pr_cont("0x%08x(%d)->", + ref_offset(this), ref_flags(this)); + if (++i == 5) { + printk(KERN_DEBUG); + i=0; + } + this = this->next_in_ino; + } + pr_cont("\n"); + }); + + switch (ic->class) { +#ifdef CONFIG_JFFS2_FS_XATTR + case RAWNODE_CLASS_XATTR_DATUM: + jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + break; + case RAWNODE_CLASS_XATTR_REF: + jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + break; +#endif + default: + if (ic->nodes == (void *)ic && ic->pino_nlink == 0) + jffs2_del_ino_cache(c, ic); + } +} + +void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + struct jffs2_raw_node_ref *block, *ref; + jffs2_dbg(1, "Freeing all node refs for eraseblock offset 0x%08x\n", + jeb->offset); + + block = ref = jeb->first_node; + + while (ref) { + if (ref->flash_offset == REF_LINK_NODE) { + ref = ref->next_in_ino; + jffs2_free_refblock(block); + block = ref; + continue; + } + if (ref->flash_offset != REF_EMPTY_NODE && ref->next_in_ino) + jffs2_remove_node_refs_from_ino_list(c, ref, jeb); + /* else it was a non-inode node or already removed, so don't bother */ + + ref++; + } + jeb->first_node = jeb->last_node = NULL; +} + +static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset) +{ + void *ebuf; + uint32_t ofs; + size_t retlen; + int ret; + unsigned long *wordebuf; + + ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen, + &ebuf, NULL); + if (ret != -EOPNOTSUPP) { + if (ret) { + jffs2_dbg(1, "MTD point failed %d\n", ret); + goto do_flash_read; + } + if (retlen < c->sector_size) { + /* Don't muck about if it won't let us point to the whole erase sector */ + jffs2_dbg(1, "MTD point returned len too short: 0x%zx\n", + retlen); + mtd_unpoint(c->mtd, jeb->offset, retlen); + goto do_flash_read; + } + wordebuf = ebuf-sizeof(*wordebuf); + retlen /= sizeof(*wordebuf); + do { + if (*++wordebuf != ~0) + break; + } while(--retlen); + mtd_unpoint(c->mtd, jeb->offset, c->sector_size); + if (retlen) { + pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08tx\n", + *wordebuf, + jeb->offset + + c->sector_size-retlen * sizeof(*wordebuf)); + return -EIO; + } + return 0; + } + do_flash_read: + ebuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!ebuf) { + pr_warn("Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n", + jeb->offset); + return -EAGAIN; + } + + jffs2_dbg(1, "Verifying erase at 0x%08x\n", jeb->offset); + + for (ofs = jeb->offset; ofs < jeb->offset + c->sector_size; ) { + uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs); + int i; + + *bad_offset = ofs; + + ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf); + if (ret) { + pr_warn("Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", + ofs, ret); + ret = -EIO; + goto fail; + } + if (retlen != readlen) { + pr_warn("Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", + ofs, readlen, retlen); + ret = -EIO; + goto fail; + } + for (i=0; ioffset); + bad_offset = jeb->offset; + + /* Cleanmarker in oob area or no cleanmarker at all ? */ + if (jffs2_cleanmarker_oob(c) || c->cleanmarker_size == 0) { + + if (jffs2_cleanmarker_oob(c)) { + if (jffs2_write_nand_cleanmarker(c, jeb)) + goto filebad; + } + } else { + + struct kvec vecs[1]; + struct jffs2_unknown_node marker = { + .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), + .totlen = cpu_to_je32(c->cleanmarker_size) + }; + + jffs2_prealloc_raw_node_refs(c, jeb, 1); + + marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4)); + + vecs[0].iov_base = (unsigned char *) ▮ + vecs[0].iov_len = sizeof(marker); + ret = jffs2_flash_direct_writev(c, vecs, 1, jeb->offset, &retlen); + + if (ret || retlen != sizeof(marker)) { + if (ret) + pr_warn("Write clean marker to block at 0x%08x failed: %d\n", + jeb->offset, ret); + else + pr_warn("Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n", + jeb->offset, sizeof(marker), retlen); + + goto filebad; + } + } + /* Everything else got zeroed before the erase */ + jeb->free_size = c->sector_size; + + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + + c->erasing_size -= c->sector_size; + c->free_size += c->sector_size; + + /* Account for cleanmarker now, if it's in-band */ + if (c->cleanmarker_size && !jffs2_cleanmarker_oob(c)) + jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL); + + list_move_tail(&jeb->list, &c->free_list); + c->nr_erasing_blocks--; + c->nr_free_blocks++; + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); + return; + +filebad: + jffs2_erase_failed(c, jeb, bad_offset); + return; + +refile: + /* Stick it back on the list from whence it came and come back later */ + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + list_move(&jeb->list, &c->erase_complete_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; +} diff --git a/kernel/fs/jffs2/file.c b/kernel/fs/jffs2/file.c new file mode 100644 index 000000000..f509f62e1 --- /dev/null +++ b/kernel/fs/jffs2/file.c @@ -0,0 +1,337 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pg, void *fsdata); +static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +static int jffs2_readpage (struct file *filp, struct page *pg); + +int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + int ret; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + /* Trigger GC to flush any pending writes for this inode */ + jffs2_flush_wbuf_gc(c, inode->i_ino); + mutex_unlock(&inode->i_mutex); + + return 0; +} + +const struct file_operations jffs2_file_operations = +{ + .llseek = generic_file_llseek, + .open = generic_file_open, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .unlocked_ioctl=jffs2_ioctl, + .mmap = generic_file_readonly_mmap, + .fsync = jffs2_fsync, + .splice_read = generic_file_splice_read, +}; + +/* jffs2_file_inode_operations */ + +const struct inode_operations jffs2_file_inode_operations = +{ + .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +const struct address_space_operations jffs2_file_address_operations = +{ + .readpage = jffs2_readpage, + .write_begin = jffs2_write_begin, + .write_end = jffs2_write_end, +}; + +static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + unsigned char *pg_buf; + int ret; + + jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n", + __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT); + + BUG_ON(!PageLocked(pg)); + + pg_buf = kmap(pg); + /* FIXME: Can kmap fail? */ + + ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE); + + if (ret) { + ClearPageUptodate(pg); + SetPageError(pg); + } else { + SetPageUptodate(pg); + ClearPageError(pg); + } + + flush_dcache_page(pg); + kunmap(pg); + + jffs2_dbg(2, "readpage finished\n"); + return ret; +} + +int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) +{ + int ret = jffs2_do_readpage_nolock(inode, pg); + unlock_page(pg); + return ret; +} + + +static int jffs2_readpage (struct file *filp, struct page *pg) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host); + int ret; + + mutex_lock(&f->sem); + ret = jffs2_do_readpage_unlock(pg->mapping->host, pg); + mutex_unlock(&f->sem); + return ret; +} + +static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *pg; + struct inode *inode = mapping->host; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; + uint32_t alloc_len = 0; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + uint32_t pageofs = index << PAGE_CACHE_SHIFT; + int ret = 0; + + jffs2_dbg(1, "%s()\n", __func__); + + if (pageofs > inode->i_size) { + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + return ret; + } + + mutex_lock(&f->sem); + pg = grab_cache_page_write_begin(mapping, index, flags); + if (!pg) { + if (alloc_len) + jffs2_complete_reservation(c); + mutex_unlock(&f->sem); + return -ENOMEM; + } + *pagep = pg; + + if (alloc_len) { + /* Make new hole frag from old EOF to new page */ + struct jffs2_full_dnode *fn; + + jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", + (unsigned int)inode->i_size, pageofs); + + memset(&ri, 0, sizeof(ri)); + + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri)); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(inode->i_mode); + ri.uid = cpu_to_je16(i_uid_read(inode)); + ri.gid = cpu_to_je16(i_gid_read(inode)); + ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs)); + ri.atime = ri.ctime = ri.mtime = cpu_to_je32(get_seconds()); + ri.offset = cpu_to_je32(inode->i_size); + ri.dsize = cpu_to_je32(pageofs - inode->i_size); + ri.csize = cpu_to_je32(0); + ri.compr = JFFS2_COMPR_ZERO; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(0); + + fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_NORMAL); + + if (IS_ERR(fn)) { + ret = PTR_ERR(fn); + jffs2_complete_reservation(c); + goto out_page; + } + ret = jffs2_add_full_dnode_to_inode(c, f, fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + if (ret) { + jffs2_dbg(1, "Eep. add_full_dnode_to_inode() failed in write_begin, returned %d\n", + ret); + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + jffs2_complete_reservation(c); + goto out_page; + } + jffs2_complete_reservation(c); + inode->i_size = pageofs; + } + + /* + * Read in the page if it wasn't already present. Cannot optimize away + * the whole page write case until jffs2_write_end can handle the + * case of a short-copy. + */ + if (!PageUptodate(pg)) { + ret = jffs2_do_readpage_nolock(inode, pg); + if (ret) + goto out_page; + } + mutex_unlock(&f->sem); + jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); + return ret; + +out_page: + unlock_page(pg); + page_cache_release(pg); + mutex_unlock(&f->sem); + return ret; +} + +static int jffs2_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pg, void *fsdata) +{ + /* Actually commit the write from the page cache page we're looking at. + * For now, we write the full page out each time. It sucks, but it's simple + */ + struct inode *inode = mapping->host; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode *ri; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + unsigned aligned_start = start & ~3; + int ret = 0; + uint32_t writtenlen = 0; + + jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", + __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT, + start, end, pg->flags); + + /* We need to avoid deadlock with page_cache_read() in + jffs2_garbage_collect_pass(). So the page must be + up to date to prevent page_cache_read() from trying + to re-lock it. */ + BUG_ON(!PageUptodate(pg)); + + if (end == PAGE_CACHE_SIZE) { + /* When writing out the end of a page, write out the + _whole_ page. This helps to reduce the number of + nodes in files which have many short writes, like + syslog files. */ + aligned_start = 0; + } + + ri = jffs2_alloc_raw_inode(); + + if (!ri) { + jffs2_dbg(1, "%s(): Allocation of raw inode failed\n", + __func__); + unlock_page(pg); + page_cache_release(pg); + return -ENOMEM; + } + + /* Set the fields that the generic jffs2_write_inode_range() code can't find */ + ri->ino = cpu_to_je32(inode->i_ino); + ri->mode = cpu_to_jemode(inode->i_mode); + ri->uid = cpu_to_je16(i_uid_read(inode)); + ri->gid = cpu_to_je16(i_gid_read(inode)); + ri->isize = cpu_to_je32((uint32_t)inode->i_size); + ri->atime = ri->ctime = ri->mtime = cpu_to_je32(get_seconds()); + + /* In 2.4, it was already kmapped by generic_file_write(). Doesn't + hurt to do it again. The alternative is ifdefs, which are ugly. */ + kmap(pg); + + ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start, + (pg->index << PAGE_CACHE_SHIFT) + aligned_start, + end - aligned_start, &writtenlen); + + kunmap(pg); + + if (ret) { + /* There was an error writing. */ + SetPageError(pg); + } + + /* Adjust writtenlen for the padding we did, so we don't confuse our caller */ + writtenlen -= min(writtenlen, (start - aligned_start)); + + if (writtenlen) { + if (inode->i_size < pos + writtenlen) { + inode->i_size = pos + writtenlen; + inode->i_blocks = (inode->i_size + 511) >> 9; + + inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); + } + } + + jffs2_free_raw_inode(ri); + + if (start+writtenlen < end) { + /* generic_file_write has written more to the page cache than we've + actually written to the medium. Mark the page !Uptodate so that + it gets reread */ + jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n", + __func__); + SetPageError(pg); + ClearPageUptodate(pg); + } + + jffs2_dbg(1, "%s() returning %d\n", + __func__, writtenlen > 0 ? writtenlen : ret); + unlock_page(pg); + page_cache_release(pg); + return writtenlen > 0 ? writtenlen : ret; +} diff --git a/kernel/fs/jffs2/fs.c b/kernel/fs/jffs2/fs.c new file mode 100644 index 000000000..fe5ea080b --- /dev/null +++ b/kernel/fs/jffs2/fs.c @@ -0,0 +1,766 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_flash_setup(struct jffs2_sb_info *c); + +int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) +{ + struct jffs2_full_dnode *old_metadata, *new_metadata; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode *ri; + union jffs2_device_node dev; + unsigned char *mdata = NULL; + int mdatalen = 0; + unsigned int ivalid; + uint32_t alloclen; + int ret; + int alloc_type = ALLOC_NORMAL; + + jffs2_dbg(1, "%s(): ino #%lu\n", __func__, inode->i_ino); + + /* Special cases - we don't want more than one data node + for these types on the medium at any time. So setattr + must read the original data associated with the node + (i.e. the device numbers or the target name) and write + it out again with the appropriate data attached */ + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { + /* For these, we don't actually need to read the old node */ + mdatalen = jffs2_encode_dev(&dev, inode->i_rdev); + mdata = (char *)&dev; + jffs2_dbg(1, "%s(): Writing %d bytes of kdev_t\n", + __func__, mdatalen); + } else if (S_ISLNK(inode->i_mode)) { + mutex_lock(&f->sem); + mdatalen = f->metadata->size; + mdata = kmalloc(f->metadata->size, GFP_USER); + if (!mdata) { + mutex_unlock(&f->sem); + return -ENOMEM; + } + ret = jffs2_read_dnode(c, f, f->metadata, mdata, 0, mdatalen); + if (ret) { + mutex_unlock(&f->sem); + kfree(mdata); + return ret; + } + mutex_unlock(&f->sem); + jffs2_dbg(1, "%s(): Writing %d bytes of symlink target\n", + __func__, mdatalen); + } + + ri = jffs2_alloc_raw_inode(); + if (!ri) { + if (S_ISLNK(inode->i_mode)) + kfree(mdata); + return -ENOMEM; + } + + ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + jffs2_free_raw_inode(ri); + if (S_ISLNK(inode->i_mode)) + kfree(mdata); + return ret; + } + mutex_lock(&f->sem); + ivalid = iattr->ia_valid; + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(sizeof(*ri) + mdatalen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->ino = cpu_to_je32(inode->i_ino); + ri->version = cpu_to_je32(++f->highest_version); + + ri->uid = cpu_to_je16((ivalid & ATTR_UID)? + from_kuid(&init_user_ns, iattr->ia_uid):i_uid_read(inode)); + ri->gid = cpu_to_je16((ivalid & ATTR_GID)? + from_kgid(&init_user_ns, iattr->ia_gid):i_gid_read(inode)); + + if (ivalid & ATTR_MODE) + ri->mode = cpu_to_jemode(iattr->ia_mode); + else + ri->mode = cpu_to_jemode(inode->i_mode); + + + ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size); + ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime)); + ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime)); + ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime)); + + ri->offset = cpu_to_je32(0); + ri->csize = ri->dsize = cpu_to_je32(mdatalen); + ri->compr = JFFS2_COMPR_NONE; + if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) { + /* It's an extension. Make it a hole node */ + ri->compr = JFFS2_COMPR_ZERO; + ri->dsize = cpu_to_je32(iattr->ia_size - inode->i_size); + ri->offset = cpu_to_je32(inode->i_size); + } else if (ivalid & ATTR_SIZE && !iattr->ia_size) { + /* For truncate-to-zero, treat it as deletion because + it'll always be obsoleting all previous nodes */ + alloc_type = ALLOC_DELETION; + } + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + if (mdatalen) + ri->data_crc = cpu_to_je32(crc32(0, mdata, mdatalen)); + else + ri->data_crc = cpu_to_je32(0); + + new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, alloc_type); + if (S_ISLNK(inode->i_mode)) + kfree(mdata); + + if (IS_ERR(new_metadata)) { + jffs2_complete_reservation(c); + jffs2_free_raw_inode(ri); + mutex_unlock(&f->sem); + return PTR_ERR(new_metadata); + } + /* It worked. Update the inode */ + inode->i_atime = ITIME(je32_to_cpu(ri->atime)); + inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); + inode->i_mode = jemode_to_cpu(ri->mode); + i_uid_write(inode, je16_to_cpu(ri->uid)); + i_gid_write(inode, je16_to_cpu(ri->gid)); + + + old_metadata = f->metadata; + + if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) + jffs2_truncate_fragtree (c, &f->fragtree, iattr->ia_size); + + if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) { + jffs2_add_full_dnode_to_inode(c, f, new_metadata); + inode->i_size = iattr->ia_size; + inode->i_blocks = (inode->i_size + 511) >> 9; + f->metadata = NULL; + } else { + f->metadata = new_metadata; + } + if (old_metadata) { + jffs2_mark_node_obsolete(c, old_metadata->raw); + jffs2_free_full_dnode(old_metadata); + } + jffs2_free_raw_inode(ri); + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + /* We have to do the truncate_setsize() without f->sem held, since + some pages may be locked and waiting for it in readpage(). + We are protected from a simultaneous write() extending i_size + back past iattr->ia_size, because do_truncate() holds the + generic inode semaphore. */ + if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { + truncate_setsize(inode, iattr->ia_size); + inode->i_blocks = (inode->i_size + 511) >> 9; + } + + return 0; +} + +int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int rc; + + rc = inode_change_ok(inode, iattr); + if (rc) + return rc; + + rc = jffs2_do_setattr(inode, iattr); + if (!rc && (iattr->ia_valid & ATTR_MODE)) + rc = posix_acl_chmod(inode, inode->i_mode); + + return rc; +} + +int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb); + unsigned long avail; + + buf->f_type = JFFS2_SUPER_MAGIC; + buf->f_bsize = 1 << PAGE_SHIFT; + buf->f_blocks = c->flash_size >> PAGE_SHIFT; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = JFFS2_MAX_NAME_LEN; + buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC; + buf->f_fsid.val[1] = c->mtd->index; + + spin_lock(&c->erase_completion_lock); + avail = c->dirty_size + c->free_size; + if (avail > c->sector_size * c->resv_blocks_write) + avail -= c->sector_size * c->resv_blocks_write; + else + avail = 0; + spin_unlock(&c->erase_completion_lock); + + buf->f_bavail = buf->f_bfree = avail >> PAGE_SHIFT; + + return 0; +} + + +void jffs2_evict_inode (struct inode *inode) +{ + /* We can forget about this inode for now - drop all + * the nodelists associated with it, etc. + */ + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + + jffs2_dbg(1, "%s(): ino #%lu mode %o\n", + __func__, inode->i_ino, inode->i_mode); + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + jffs2_do_clear_inode(c, f); +} + +struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) +{ + struct jffs2_inode_info *f; + struct jffs2_sb_info *c; + struct jffs2_raw_inode latest_node; + union jffs2_device_node jdev; + struct inode *inode; + dev_t rdev = 0; + int ret; + + jffs2_dbg(1, "%s(): ino == %lu\n", __func__, ino); + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + f = JFFS2_INODE_INFO(inode); + c = JFFS2_SB_INFO(inode->i_sb); + + jffs2_init_inode_info(f); + mutex_lock(&f->sem); + + ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node); + + if (ret) { + mutex_unlock(&f->sem); + iget_failed(inode); + return ERR_PTR(ret); + } + inode->i_mode = jemode_to_cpu(latest_node.mode); + i_uid_write(inode, je16_to_cpu(latest_node.uid)); + i_gid_write(inode, je16_to_cpu(latest_node.gid)); + inode->i_size = je32_to_cpu(latest_node.isize); + inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); + inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); + inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); + + set_nlink(inode, f->inocache->pino_nlink); + + inode->i_blocks = (inode->i_size + 511) >> 9; + + switch (inode->i_mode & S_IFMT) { + + case S_IFLNK: + inode->i_op = &jffs2_symlink_inode_operations; + break; + + case S_IFDIR: + { + struct jffs2_full_dirent *fd; + set_nlink(inode, 2); /* parent and '.' */ + + for (fd=f->dents; fd; fd = fd->next) { + if (fd->type == DT_DIR && fd->ino) + inc_nlink(inode); + } + /* Root dir gets i_nlink 3 for some reason */ + if (inode->i_ino == 1) + inc_nlink(inode); + + inode->i_op = &jffs2_dir_inode_operations; + inode->i_fop = &jffs2_dir_operations; + break; + } + case S_IFREG: + inode->i_op = &jffs2_file_inode_operations; + inode->i_fop = &jffs2_file_operations; + inode->i_mapping->a_ops = &jffs2_file_address_operations; + inode->i_mapping->nrpages = 0; + break; + + case S_IFBLK: + case S_IFCHR: + /* Read the device numbers from the media */ + if (f->metadata->size != sizeof(jdev.old_id) && + f->metadata->size != sizeof(jdev.new_id)) { + pr_notice("Device node has strange size %d\n", + f->metadata->size); + goto error_io; + } + jffs2_dbg(1, "Reading device numbers from flash\n"); + ret = jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size); + if (ret < 0) { + /* Eep */ + pr_notice("Read device numbers for inode %lu failed\n", + (unsigned long)inode->i_ino); + goto error; + } + if (f->metadata->size == sizeof(jdev.old_id)) + rdev = old_decode_dev(je16_to_cpu(jdev.old_id)); + else + rdev = new_decode_dev(je32_to_cpu(jdev.new_id)); + + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &jffs2_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + + default: + pr_warn("%s(): Bogus i_mode %o for ino %lu\n", + __func__, inode->i_mode, (unsigned long)inode->i_ino); + } + + mutex_unlock(&f->sem); + + jffs2_dbg(1, "jffs2_read_inode() returning\n"); + unlock_new_inode(inode); + return inode; + +error_io: + ret = -EIO; +error: + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + iget_failed(inode); + return ERR_PTR(ret); +} + +void jffs2_dirty_inode(struct inode *inode, int flags) +{ + struct iattr iattr; + + if (!(inode->i_state & I_DIRTY_DATASYNC)) { + jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n", + __func__, inode->i_ino); + return; + } + + jffs2_dbg(1, "%s(): calling setattr() for ino #%lu\n", + __func__, inode->i_ino); + + iattr.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_MTIME|ATTR_CTIME; + iattr.ia_mode = inode->i_mode; + iattr.ia_uid = inode->i_uid; + iattr.ia_gid = inode->i_gid; + iattr.ia_atime = inode->i_atime; + iattr.ia_mtime = inode->i_mtime; + iattr.ia_ctime = inode->i_ctime; + + jffs2_do_setattr(inode, &iattr); +} + +int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + if (c->flags & JFFS2_SB_FLAG_RO && !(sb->s_flags & MS_RDONLY)) + return -EROFS; + + /* We stop if it was running, then restart if it needs to. + This also catches the case where it was stopped and this + is just a remount to restart it. + Flush the writebuffer, if neccecary, else we loose it */ + if (!(sb->s_flags & MS_RDONLY)) { + jffs2_stop_garbage_collect_thread(c); + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + } + + if (!(*flags & MS_RDONLY)) + jffs2_start_garbage_collect_thread(c); + + *flags |= MS_NOATIME; + return 0; +} + +/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, + fill in the raw_inode while you're at it. */ +struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri) +{ + struct inode *inode; + struct super_block *sb = dir_i->i_sb; + struct jffs2_sb_info *c; + struct jffs2_inode_info *f; + int ret; + + jffs2_dbg(1, "%s(): dir_i %ld, mode 0x%x\n", + __func__, dir_i->i_ino, mode); + + c = JFFS2_SB_INFO(sb); + + inode = new_inode(sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + f = JFFS2_INODE_INFO(inode); + jffs2_init_inode_info(f); + mutex_lock(&f->sem); + + memset(ri, 0, sizeof(*ri)); + /* Set OS-specific defaults for new inodes */ + ri->uid = cpu_to_je16(from_kuid(&init_user_ns, current_fsuid())); + + if (dir_i->i_mode & S_ISGID) { + ri->gid = cpu_to_je16(i_gid_read(dir_i)); + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + ri->gid = cpu_to_je16(from_kgid(&init_user_ns, current_fsgid())); + } + + /* POSIX ACLs have to be processed now, at least partly. + The umask is only applied if there's no default ACL */ + ret = jffs2_init_acl_pre(dir_i, inode, &mode); + if (ret) { + mutex_unlock(&f->sem); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); + } + ret = jffs2_do_new_inode (c, f, mode, ri); + if (ret) { + mutex_unlock(&f->sem); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); + } + set_nlink(inode, 1); + inode->i_ino = je32_to_cpu(ri->ino); + inode->i_mode = jemode_to_cpu(ri->mode); + i_gid_write(inode, je16_to_cpu(ri->gid)); + i_uid_write(inode, je16_to_cpu(ri->uid)); + inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); + + inode->i_blocks = 0; + inode->i_size = 0; + + if (insert_inode_locked(inode) < 0) { + mutex_unlock(&f->sem); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } + + return inode; +} + +static int calculate_inocache_hashsize(uint32_t flash_size) +{ + /* + * Pick a inocache hash size based on the size of the medium. + * Count how many megabytes we're dealing with, apply a hashsize twice + * that size, but rounding down to the usual big powers of 2. And keep + * to sensible bounds. + */ + + int size_mb = flash_size / 1024 / 1024; + int hashsize = (size_mb * 2) & ~0x3f; + + if (hashsize < INOCACHE_HASHSIZE_MIN) + return INOCACHE_HASHSIZE_MIN; + if (hashsize > INOCACHE_HASHSIZE_MAX) + return INOCACHE_HASHSIZE_MAX; + + return hashsize; +} + +int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jffs2_sb_info *c; + struct inode *root_i; + int ret; + size_t blocks; + + c = JFFS2_SB_INFO(sb); + + /* Do not support the MLC nand */ + if (c->mtd->type == MTD_MLCNANDFLASH) + return -EINVAL; + +#ifndef CONFIG_JFFS2_FS_WRITEBUFFER + if (c->mtd->type == MTD_NANDFLASH) { + pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n"); + return -EINVAL; + } + if (c->mtd->type == MTD_DATAFLASH) { + pr_err("Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in\n"); + return -EINVAL; + } +#endif + + c->flash_size = c->mtd->size; + c->sector_size = c->mtd->erasesize; + blocks = c->flash_size / c->sector_size; + + /* + * Size alignment check + */ + if ((c->sector_size * blocks) != c->flash_size) { + c->flash_size = c->sector_size * blocks; + pr_info("Flash size not aligned to erasesize, reducing to %dKiB\n", + c->flash_size / 1024); + } + + if (c->flash_size < 5*c->sector_size) { + pr_err("Too few erase blocks (%d)\n", + c->flash_size / c->sector_size); + return -EINVAL; + } + + c->cleanmarker_size = sizeof(struct jffs2_unknown_node); + + /* NAND (or other bizarre) flash... do setup accordingly */ + ret = jffs2_flash_setup(c); + if (ret) + return ret; + + c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size); + c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); + if (!c->inocache_list) { + ret = -ENOMEM; + goto out_wbuf; + } + + jffs2_init_xattr_subsystem(c); + + if ((ret = jffs2_do_mount_fs(c))) + goto out_inohash; + + jffs2_dbg(1, "%s(): Getting root inode\n", __func__); + root_i = jffs2_iget(sb, 1); + if (IS_ERR(root_i)) { + jffs2_dbg(1, "get root inode failed\n"); + ret = PTR_ERR(root_i); + goto out_root; + } + + ret = -ENOMEM; + + jffs2_dbg(1, "%s(): d_make_root()\n", __func__); + sb->s_root = d_make_root(root_i); + if (!sb->s_root) + goto out_root; + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = JFFS2_SUPER_MAGIC; + if (!(sb->s_flags & MS_RDONLY)) + jffs2_start_garbage_collect_thread(c); + return 0; + +out_root: + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else + kfree(c->blocks); + out_inohash: + jffs2_clear_xattr_subsystem(c); + kfree(c->inocache_list); + out_wbuf: + jffs2_flash_cleanup(c); + + return ret; +} + +void jffs2_gc_release_inode(struct jffs2_sb_info *c, + struct jffs2_inode_info *f) +{ + iput(OFNI_EDONI_2SFFJ(f)); +} + +struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, + int inum, int unlinked) +{ + struct inode *inode; + struct jffs2_inode_cache *ic; + + if (unlinked) { + /* The inode has zero nlink but its nodes weren't yet marked + obsolete. This has to be because we're still waiting for + the final (close() and) iput() to happen. + + There's a possibility that the final iput() could have + happened while we were contemplating. In order to ensure + that we don't cause a new read_inode() (which would fail) + for the inode in question, we use ilookup() in this case + instead of iget(). + + The nlink can't _become_ zero at this point because we're + holding the alloc_sem, and jffs2_do_unlink() would also + need that while decrementing nlink on any inode. + */ + inode = ilookup(OFNI_BS_2SFFJ(c), inum); + if (!inode) { + jffs2_dbg(1, "ilookup() failed for ino #%u; inode is probably deleted.\n", + inum); + + spin_lock(&c->inocache_lock); + ic = jffs2_get_ino_cache(c, inum); + if (!ic) { + jffs2_dbg(1, "Inode cache for ino #%u is gone\n", + inum); + spin_unlock(&c->inocache_lock); + return NULL; + } + if (ic->state != INO_STATE_CHECKEDABSENT) { + /* Wait for progress. Don't just loop */ + jffs2_dbg(1, "Waiting for ino #%u in state %d\n", + ic->ino, ic->state); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + } else { + spin_unlock(&c->inocache_lock); + } + + return NULL; + } + } else { + /* Inode has links to it still; they're not going away because + jffs2_do_unlink() would need the alloc_sem and we have it. + Just iget() it, and if read_inode() is necessary that's OK. + */ + inode = jffs2_iget(OFNI_BS_2SFFJ(c), inum); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + if (is_bad_inode(inode)) { + pr_notice("Eep. read_inode() failed for ino #%u. unlinked %d\n", + inum, unlinked); + /* NB. This will happen again. We need to do something appropriate here. */ + iput(inode); + return ERR_PTR(-EIO); + } + + return JFFS2_INODE_INFO(inode); +} + +unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + unsigned long offset, + unsigned long *priv) +{ + struct inode *inode = OFNI_EDONI_2SFFJ(f); + struct page *pg; + + pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, + (void *)jffs2_do_readpage_unlock, inode); + if (IS_ERR(pg)) + return (void *)pg; + + *priv = (unsigned long)pg; + return kmap(pg); +} + +void jffs2_gc_release_page(struct jffs2_sb_info *c, + unsigned char *ptr, + unsigned long *priv) +{ + struct page *pg = (void *)*priv; + + kunmap(pg); + page_cache_release(pg); +} + +static int jffs2_flash_setup(struct jffs2_sb_info *c) { + int ret = 0; + + if (jffs2_cleanmarker_oob(c)) { + /* NAND flash... do setup accordingly */ + ret = jffs2_nand_flash_setup(c); + if (ret) + return ret; + } + + /* and Dataflash */ + if (jffs2_dataflash(c)) { + ret = jffs2_dataflash_setup(c); + if (ret) + return ret; + } + + /* and Intel "Sibley" flash */ + if (jffs2_nor_wbuf_flash(c)) { + ret = jffs2_nor_wbuf_flash_setup(c); + if (ret) + return ret; + } + + /* and an UBI volume */ + if (jffs2_ubivol(c)) { + ret = jffs2_ubivol_setup(c); + if (ret) + return ret; + } + + return ret; +} + +void jffs2_flash_cleanup(struct jffs2_sb_info *c) { + + if (jffs2_cleanmarker_oob(c)) { + jffs2_nand_flash_cleanup(c); + } + + /* and DataFlash */ + if (jffs2_dataflash(c)) { + jffs2_dataflash_cleanup(c); + } + + /* and Intel "Sibley" flash */ + if (jffs2_nor_wbuf_flash(c)) { + jffs2_nor_wbuf_flash_cleanup(c); + } + + /* and an UBI volume */ + if (jffs2_ubivol(c)) { + jffs2_ubivol_cleanup(c); + } +} diff --git a/kernel/fs/jffs2/gc.c b/kernel/fs/jffs2/gc.c new file mode 100644 index 000000000..5a2dec2b0 --- /dev/null +++ b/kernel/fs/jffs2/gc.c @@ -0,0 +1,1378 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + +static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_raw_node_ref *raw); +static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fd); +static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd); +static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd); +static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end); +static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end); +static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *raw, struct jffs2_inode_info *f); + +/* Called with erase_completion_lock held */ +static struct jffs2_eraseblock *jffs2_find_gc_block(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *ret; + struct list_head *nextlist = NULL; + int n = jiffies % 128; + + /* Pick an eraseblock to garbage collect next. This is where we'll + put the clever wear-levelling algorithms. Eventually. */ + /* We possibly want to favour the dirtier blocks more when the + number of free blocks is low. */ +again: + if (!list_empty(&c->bad_used_list) && c->nr_free_blocks > c->resv_blocks_gcbad) { + jffs2_dbg(1, "Picking block from bad_used_list to GC next\n"); + nextlist = &c->bad_used_list; + } else if (n < 50 && !list_empty(&c->erasable_list)) { + /* Note that most of them will have gone directly to be erased. + So don't favour the erasable_list _too_ much. */ + jffs2_dbg(1, "Picking block from erasable_list to GC next\n"); + nextlist = &c->erasable_list; + } else if (n < 110 && !list_empty(&c->very_dirty_list)) { + /* Most of the time, pick one off the very_dirty list */ + jffs2_dbg(1, "Picking block from very_dirty_list to GC next\n"); + nextlist = &c->very_dirty_list; + } else if (n < 126 && !list_empty(&c->dirty_list)) { + jffs2_dbg(1, "Picking block from dirty_list to GC next\n"); + nextlist = &c->dirty_list; + } else if (!list_empty(&c->clean_list)) { + jffs2_dbg(1, "Picking block from clean_list to GC next\n"); + nextlist = &c->clean_list; + } else if (!list_empty(&c->dirty_list)) { + jffs2_dbg(1, "Picking block from dirty_list to GC next (clean_list was empty)\n"); + + nextlist = &c->dirty_list; + } else if (!list_empty(&c->very_dirty_list)) { + jffs2_dbg(1, "Picking block from very_dirty_list to GC next (clean_list and dirty_list were empty)\n"); + nextlist = &c->very_dirty_list; + } else if (!list_empty(&c->erasable_list)) { + jffs2_dbg(1, "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n"); + + nextlist = &c->erasable_list; + } else if (!list_empty(&c->erasable_pending_wbuf_list)) { + /* There are blocks are wating for the wbuf sync */ + jffs2_dbg(1, "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n"); + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + goto again; + } else { + /* Eep. All were empty */ + jffs2_dbg(1, "No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n"); + return NULL; + } + + ret = list_entry(nextlist->next, struct jffs2_eraseblock, list); + list_del(&ret->list); + c->gcblock = ret; + ret->gc_node = ret->first_node; + if (!ret->gc_node) { + pr_warn("Eep. ret->gc_node for block at 0x%08x is NULL\n", + ret->offset); + BUG(); + } + + /* Have we accidentally picked a clean block with wasted space ? */ + if (ret->wasted_size) { + jffs2_dbg(1, "Converting wasted_size %08x to dirty_size\n", + ret->wasted_size); + ret->dirty_size += ret->wasted_size; + c->wasted_size -= ret->wasted_size; + c->dirty_size += ret->wasted_size; + ret->wasted_size = 0; + } + + return ret; +} + +/* jffs2_garbage_collect_pass + * Make a single attempt to progress GC. Move one node, and possibly + * start erasing one eraseblock. + */ +int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) +{ + struct jffs2_inode_info *f; + struct jffs2_inode_cache *ic; + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + uint32_t gcblock_dirty; + int ret = 0, inum, nlink; + int xattr = 0; + + if (mutex_lock_interruptible(&c->alloc_sem)) + return -EINTR; + + for (;;) { + spin_lock(&c->erase_completion_lock); + if (!c->unchecked_size) + break; + + /* We can't start doing GC yet. We haven't finished checking + the node CRCs etc. Do it now. */ + + /* checked_ino is protected by the alloc_sem */ + if (c->checked_ino > c->highest_ino && xattr) { + pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n", + c->unchecked_size); + jffs2_dbg_dump_block_lists_nolock(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + spin_unlock(&c->erase_completion_lock); + + if (!xattr) + xattr = jffs2_verify_xattr(c); + + spin_lock(&c->inocache_lock); + + ic = jffs2_get_ino_cache(c, c->checked_ino++); + + if (!ic) { + spin_unlock(&c->inocache_lock); + continue; + } + + if (!ic->pino_nlink) { + jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n", + ic->ino); + spin_unlock(&c->inocache_lock); + jffs2_xattr_delete_inode(c, ic); + continue; + } + switch(ic->state) { + case INO_STATE_CHECKEDABSENT: + case INO_STATE_PRESENT: + jffs2_dbg(1, "Skipping ino #%u already checked\n", + ic->ino); + spin_unlock(&c->inocache_lock); + continue; + + case INO_STATE_GC: + case INO_STATE_CHECKING: + pr_warn("Inode #%u is in state %d during CRC check phase!\n", + ic->ino, ic->state); + spin_unlock(&c->inocache_lock); + BUG(); + + case INO_STATE_READING: + /* We need to wait for it to finish, lest we move on + and trigger the BUG() above while we haven't yet + finished checking all its nodes */ + jffs2_dbg(1, "Waiting for ino #%u to finish reading\n", + ic->ino); + /* We need to come back again for the _same_ inode. We've + made no progress in this case, but that should be OK */ + c->checked_ino--; + + mutex_unlock(&c->alloc_sem); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + return 0; + + default: + BUG(); + + case INO_STATE_UNCHECKED: + ; + } + ic->state = INO_STATE_CHECKING; + spin_unlock(&c->inocache_lock); + + jffs2_dbg(1, "%s(): triggering inode scan of ino#%u\n", + __func__, ic->ino); + + ret = jffs2_do_crccheck_inode(c, ic); + if (ret) + pr_warn("Returned error for crccheck of ino #%u. Expect badness...\n", + ic->ino); + + jffs2_set_inocache_state(c, ic, INO_STATE_CHECKEDABSENT); + mutex_unlock(&c->alloc_sem); + return ret; + } + + /* If there are any blocks which need erasing, erase them now */ + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) { + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + jffs2_dbg(1, "%s(): erasing pending blocks\n", __func__); + if (jffs2_erase_pending_blocks(c, 1)) + return 0; + + jffs2_dbg(1, "No progress from erasing block; doing GC anyway\n"); + mutex_lock(&c->alloc_sem); + spin_lock(&c->erase_completion_lock); + } + + /* First, work out which block we're garbage-collecting */ + jeb = c->gcblock; + + if (!jeb) + jeb = jffs2_find_gc_block(c); + + if (!jeb) { + /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ + if (c->nr_erasing_blocks) { + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -EAGAIN; + } + jffs2_dbg(1, "Couldn't find erase block to garbage collect!\n"); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -EIO; + } + + jffs2_dbg(1, "GC from block %08x, used_size %08x, dirty_size %08x, free_size %08x\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->free_size); + D1(if (c->nextblock) + printk(KERN_DEBUG "Nextblock at %08x, used_size %08x, dirty_size %08x, wasted_size %08x, free_size %08x\n", c->nextblock->offset, c->nextblock->used_size, c->nextblock->dirty_size, c->nextblock->wasted_size, c->nextblock->free_size)); + + if (!jeb->used_size) { + mutex_unlock(&c->alloc_sem); + goto eraseit; + } + + raw = jeb->gc_node; + gcblock_dirty = jeb->dirty_size; + + while(ref_obsolete(raw)) { + jffs2_dbg(1, "Node at 0x%08x is obsolete... skipping\n", + ref_offset(raw)); + raw = ref_next(raw); + if (unlikely(!raw)) { + pr_warn("eep. End of raw list while still supposedly nodes to GC\n"); + pr_warn("erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n", + jeb->offset, jeb->free_size, + jeb->dirty_size, jeb->used_size); + jeb->gc_node = raw; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + BUG(); + } + } + jeb->gc_node = raw; + + jffs2_dbg(1, "Going to garbage collect node at 0x%08x\n", + ref_offset(raw)); + + if (!raw->next_in_ino) { + /* Inode-less node. Clean marker, snapshot or something like that */ + spin_unlock(&c->erase_completion_lock); + if (ref_flags(raw) == REF_PRISTINE) { + /* It's an unknown node with JFFS2_FEATURE_RWCOMPAT_COPY */ + jffs2_garbage_collect_pristine(c, NULL, raw); + } else { + /* Just mark it obsolete */ + jffs2_mark_node_obsolete(c, raw); + } + mutex_unlock(&c->alloc_sem); + goto eraseit_lock; + } + + ic = jffs2_raw_ref_to_ic(raw); + +#ifdef CONFIG_JFFS2_FS_XATTR + /* When 'ic' refers xattr_datum/xattr_ref, this node is GCed as xattr. + * We can decide whether this node is inode or xattr by ic->class. */ + if (ic->class == RAWNODE_CLASS_XATTR_DATUM + || ic->class == RAWNODE_CLASS_XATTR_REF) { + spin_unlock(&c->erase_completion_lock); + + if (ic->class == RAWNODE_CLASS_XATTR_DATUM) { + ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic, raw); + } else { + ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic, raw); + } + goto test_gcnode; + } +#endif + + /* We need to hold the inocache. Either the erase_completion_lock or + the inocache_lock are sufficient; we trade down since the inocache_lock + causes less contention. */ + spin_lock(&c->inocache_lock); + + spin_unlock(&c->erase_completion_lock); + + jffs2_dbg(1, "%s(): collecting from block @0x%08x. Node @0x%08x(%d), ino #%u\n", + __func__, jeb->offset, ref_offset(raw), ref_flags(raw), + ic->ino); + + /* Three possibilities: + 1. Inode is already in-core. We must iget it and do proper + updating to its fragtree, etc. + 2. Inode is not in-core, node is REF_PRISTINE. We lock the + inocache to prevent a read_inode(), copy the node intact. + 3. Inode is not in-core, node is not pristine. We must iget() + and take the slow path. + */ + + switch(ic->state) { + case INO_STATE_CHECKEDABSENT: + /* It's been checked, but it's not currently in-core. + We can just copy any pristine nodes, but have + to prevent anyone else from doing read_inode() while + we're at it, so we set the state accordingly */ + if (ref_flags(raw) == REF_PRISTINE) + ic->state = INO_STATE_GC; + else { + jffs2_dbg(1, "Ino #%u is absent but node not REF_PRISTINE. Reading.\n", + ic->ino); + } + break; + + case INO_STATE_PRESENT: + /* It's in-core. GC must iget() it. */ + break; + + case INO_STATE_UNCHECKED: + case INO_STATE_CHECKING: + case INO_STATE_GC: + /* Should never happen. We should have finished checking + by the time we actually start doing any GC, and since + we're holding the alloc_sem, no other garbage collection + can happen. + */ + pr_crit("Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n", + ic->ino, ic->state); + mutex_unlock(&c->alloc_sem); + spin_unlock(&c->inocache_lock); + BUG(); + + case INO_STATE_READING: + /* Someone's currently trying to read it. We must wait for + them to finish and then go through the full iget() route + to do the GC. However, sometimes read_inode() needs to get + the alloc_sem() (for marking nodes invalid) so we must + drop the alloc_sem before sleeping. */ + + mutex_unlock(&c->alloc_sem); + jffs2_dbg(1, "%s(): waiting for ino #%u in state %d\n", + __func__, ic->ino, ic->state); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + /* And because we dropped the alloc_sem we must start again from the + beginning. Ponder chance of livelock here -- we're returning success + without actually making any progress. + + Q: What are the chances that the inode is back in INO_STATE_READING + again by the time we next enter this function? And that this happens + enough times to cause a real delay? + + A: Small enough that I don't care :) + */ + return 0; + } + + /* OK. Now if the inode is in state INO_STATE_GC, we are going to copy the + node intact, and we don't have to muck about with the fragtree etc. + because we know it's not in-core. If it _was_ in-core, we go through + all the iget() crap anyway */ + + if (ic->state == INO_STATE_GC) { + spin_unlock(&c->inocache_lock); + + ret = jffs2_garbage_collect_pristine(c, ic, raw); + + spin_lock(&c->inocache_lock); + ic->state = INO_STATE_CHECKEDABSENT; + wake_up(&c->inocache_wq); + + if (ret != -EBADFD) { + spin_unlock(&c->inocache_lock); + goto test_gcnode; + } + + /* Fall through if it wanted us to, with inocache_lock held */ + } + + /* Prevent the fairly unlikely race where the gcblock is + entirely obsoleted by the final close of a file which had + the only valid nodes in the block, followed by erasure, + followed by freeing of the ic because the erased block(s) + held _all_ the nodes of that inode.... never been seen but + it's vaguely possible. */ + + inum = ic->ino; + nlink = ic->pino_nlink; + spin_unlock(&c->inocache_lock); + + f = jffs2_gc_fetch_inode(c, inum, !nlink); + if (IS_ERR(f)) { + ret = PTR_ERR(f); + goto release_sem; + } + if (!f) { + ret = 0; + goto release_sem; + } + + ret = jffs2_garbage_collect_live(c, jeb, raw, f); + + jffs2_gc_release_inode(c, f); + + test_gcnode: + if (jeb->dirty_size == gcblock_dirty && !ref_obsolete(jeb->gc_node)) { + /* Eep. This really should never happen. GC is broken */ + pr_err("Error garbage collecting node at %08x!\n", + ref_offset(jeb->gc_node)); + ret = -ENOSPC; + } + release_sem: + mutex_unlock(&c->alloc_sem); + + eraseit_lock: + /* If we've finished this block, start it erasing */ + spin_lock(&c->erase_completion_lock); + + eraseit: + if (c->gcblock && !c->gcblock->used_size) { + jffs2_dbg(1, "Block at 0x%08x completely obsoleted by GC. Moving to erase_pending_list\n", + c->gcblock->offset); + /* We're GC'ing an empty block? */ + list_add_tail(&c->gcblock->list, &c->erase_pending_list); + c->gcblock = NULL; + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + spin_unlock(&c->erase_completion_lock); + + return ret; +} + +static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *raw, struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *frag; + struct jffs2_full_dnode *fn = NULL; + struct jffs2_full_dirent *fd; + uint32_t start = 0, end = 0, nrfrags = 0; + int ret = 0; + + mutex_lock(&f->sem); + + /* Now we have the lock for this inode. Check that it's still the one at the head + of the list. */ + + spin_lock(&c->erase_completion_lock); + + if (c->gcblock != jeb) { + spin_unlock(&c->erase_completion_lock); + jffs2_dbg(1, "GC block is no longer gcblock. Restart\n"); + goto upnout; + } + if (ref_obsolete(raw)) { + spin_unlock(&c->erase_completion_lock); + jffs2_dbg(1, "node to be GC'd was obsoleted in the meantime.\n"); + /* They'll call again */ + goto upnout; + } + spin_unlock(&c->erase_completion_lock); + + /* OK. Looks safe. And nobody can get us now because we have the semaphore. Move the block */ + if (f->metadata && f->metadata->raw == raw) { + fn = f->metadata; + ret = jffs2_garbage_collect_metadata(c, jeb, f, fn); + goto upnout; + } + + /* FIXME. Read node and do lookup? */ + for (frag = frag_first(&f->fragtree); frag; frag = frag_next(frag)) { + if (frag->node && frag->node->raw == raw) { + fn = frag->node; + end = frag->ofs + frag->size; + if (!nrfrags++) + start = frag->ofs; + if (nrfrags == frag->node->frags) + break; /* We've found them all */ + } + } + if (fn) { + if (ref_flags(raw) == REF_PRISTINE) { + ret = jffs2_garbage_collect_pristine(c, f->inocache, raw); + if (!ret) { + /* Urgh. Return it sensibly. */ + frag->node->raw = f->inocache->nodes; + } + if (ret != -EBADFD) + goto upnout; + } + /* We found a datanode. Do the GC */ + if((start >> PAGE_CACHE_SHIFT) < ((end-1) >> PAGE_CACHE_SHIFT)) { + /* It crosses a page boundary. Therefore, it must be a hole. */ + ret = jffs2_garbage_collect_hole(c, jeb, f, fn, start, end); + } else { + /* It could still be a hole. But we GC the page this way anyway */ + ret = jffs2_garbage_collect_dnode(c, jeb, f, fn, start, end); + } + goto upnout; + } + + /* Wasn't a dnode. Try dirent */ + for (fd = f->dents; fd; fd=fd->next) { + if (fd->raw == raw) + break; + } + + if (fd && fd->ino) { + ret = jffs2_garbage_collect_dirent(c, jeb, f, fd); + } else if (fd) { + ret = jffs2_garbage_collect_deletion_dirent(c, jeb, f, fd); + } else { + pr_warn("Raw node at 0x%08x wasn't in node lists for ino #%u\n", + ref_offset(raw), f->inocache->ino); + if (ref_obsolete(raw)) { + pr_warn("But it's obsolete so we don't mind too much\n"); + } else { + jffs2_dbg_dump_node(c, ref_offset(raw)); + BUG(); + } + } + upnout: + mutex_unlock(&f->sem); + + return ret; +} + +static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_raw_node_ref *raw) +{ + union jffs2_node_union *node; + size_t retlen; + int ret; + uint32_t phys_ofs, alloclen; + uint32_t crc, rawlen; + int retried = 0; + + jffs2_dbg(1, "Going to GC REF_PRISTINE node at 0x%08x\n", + ref_offset(raw)); + + alloclen = rawlen = ref_totlen(c, c->gcblock, raw); + + /* Ask for a small amount of space (or the totlen if smaller) because we + don't want to force wastage of the end of a block if splitting would + work. */ + if (ic && alloclen > sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN) + alloclen = sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN; + + ret = jffs2_reserve_space_gc(c, alloclen, &alloclen, rawlen); + /* 'rawlen' is not the exact summary size; it is only an upper estimation */ + + if (ret) + return ret; + + if (alloclen < rawlen) { + /* Doesn't fit untouched. We'll go the old route and split it */ + return -EBADFD; + } + + node = kmalloc(rawlen, GFP_KERNEL); + if (!node) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)node); + if (!ret && retlen != rawlen) + ret = -EIO; + if (ret) + goto out_node; + + crc = crc32(0, node, sizeof(struct jffs2_unknown_node)-4); + if (je32_to_cpu(node->u.hdr_crc) != crc) { + pr_warn("Header CRC failed on REF_PRISTINE node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->u.hdr_crc), crc); + goto bail; + } + + switch(je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + crc = crc32(0, node, sizeof(node->i)-8); + if (je32_to_cpu(node->i.node_crc) != crc) { + pr_warn("Node CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->i.node_crc), + crc); + goto bail; + } + + if (je32_to_cpu(node->i.dsize)) { + crc = crc32(0, node->i.data, je32_to_cpu(node->i.csize)); + if (je32_to_cpu(node->i.data_crc) != crc) { + pr_warn("Data CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), + je32_to_cpu(node->i.data_crc), crc); + goto bail; + } + } + break; + + case JFFS2_NODETYPE_DIRENT: + crc = crc32(0, node, sizeof(node->d)-8); + if (je32_to_cpu(node->d.node_crc) != crc) { + pr_warn("Node CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), + je32_to_cpu(node->d.node_crc), crc); + goto bail; + } + + if (strnlen(node->d.name, node->d.nsize) != node->d.nsize) { + pr_warn("Name in dirent node at 0x%08x contains zeroes\n", + ref_offset(raw)); + goto bail; + } + + if (node->d.nsize) { + crc = crc32(0, node->d.name, node->d.nsize); + if (je32_to_cpu(node->d.name_crc) != crc) { + pr_warn("Name CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), + je32_to_cpu(node->d.name_crc), crc); + goto bail; + } + } + break; + default: + /* If it's inode-less, we don't _know_ what it is. Just copy it intact */ + if (ic) { + pr_warn("Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n", + ref_offset(raw), je16_to_cpu(node->u.nodetype)); + goto bail; + } + } + + /* OK, all the CRCs are good; this node can just be copied as-is. */ + retry: + phys_ofs = write_ofs(c); + + ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node); + + if (ret || (retlen != rawlen)) { + pr_notice("Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n", + rawlen, phys_ofs, ret, retlen); + if (retlen) { + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL); + } else { + pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", + phys_ofs); + } + if (!retried) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[phys_ofs / c->sector_size]; + + retried = 1; + + jffs2_dbg(1, "Retrying failed write of REF_PRISTINE node.\n"); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + ret = jffs2_reserve_space_gc(c, rawlen, &dummy, rawlen); + /* this is not the exact summary size of it, + it is only an upper estimation */ + + if (!ret) { + jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write.\n", + phys_ofs); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + goto retry; + } + jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n", + ret); + } + + if (!ret) + ret = -EIO; + goto out_node; + } + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic); + + jffs2_mark_node_obsolete(c, raw); + jffs2_dbg(1, "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", + ref_offset(raw)); + + out_node: + kfree(node); + return ret; + bail: + ret = -EBADFD; + goto out_node; +} + +static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn) +{ + struct jffs2_full_dnode *new_fn; + struct jffs2_raw_inode ri; + struct jffs2_node_frag *last_frag; + union jffs2_device_node dev; + char *mdata = NULL; + int mdatalen = 0; + uint32_t alloclen, ilen; + int ret; + + if (S_ISBLK(JFFS2_F_I_MODE(f)) || + S_ISCHR(JFFS2_F_I_MODE(f)) ) { + /* For these, we don't actually need to read the old node */ + mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f)); + mdata = (char *)&dev; + jffs2_dbg(1, "%s(): Writing %d bytes of kdev_t\n", + __func__, mdatalen); + } else if (S_ISLNK(JFFS2_F_I_MODE(f))) { + mdatalen = fn->size; + mdata = kmalloc(fn->size, GFP_KERNEL); + if (!mdata) { + pr_warn("kmalloc of mdata failed in jffs2_garbage_collect_metadata()\n"); + return -ENOMEM; + } + ret = jffs2_read_dnode(c, f, fn, mdata, 0, mdatalen); + if (ret) { + pr_warn("read of old metadata failed in jffs2_garbage_collect_metadata(): %d\n", + ret); + kfree(mdata); + return ret; + } + jffs2_dbg(1, "%s(): Writing %d bites of symlink target\n", + __func__, mdatalen); + + } + + ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen, + JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n", + sizeof(ri) + mdatalen, ret); + goto out; + } + + last_frag = frag_last(&f->fragtree); + if (last_frag) + /* Fetch the inode length from the fragtree rather then + * from i_size since i_size may have not been updated yet */ + ilen = last_frag->ofs + last_frag->size; + else + ilen = JFFS2_F_I_SIZE(f); + + memset(&ri, 0, sizeof(ri)); + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri) + mdatalen); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(ilen); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.offset = cpu_to_je32(0); + ri.csize = cpu_to_je32(mdatalen); + ri.dsize = cpu_to_je32(mdatalen); + ri.compr = JFFS2_COMPR_NONE; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(crc32(0, mdata, mdatalen)); + + new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC); + + if (IS_ERR(new_fn)) { + pr_warn("Error writing new dnode: %ld\n", PTR_ERR(new_fn)); + ret = PTR_ERR(new_fn); + goto out; + } + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + f->metadata = new_fn; + out: + if (S_ISLNK(JFFS2_F_I_MODE(f))) + kfree(mdata); + return ret; +} + +static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent *new_fd; + struct jffs2_raw_dirent rd; + uint32_t alloclen; + int ret; + + rd.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd.nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd.nsize = strlen(fd->name); + rd.totlen = cpu_to_je32(sizeof(rd) + rd.nsize); + rd.hdr_crc = cpu_to_je32(crc32(0, &rd, sizeof(struct jffs2_unknown_node)-4)); + + rd.pino = cpu_to_je32(f->inocache->ino); + rd.version = cpu_to_je32(++f->highest_version); + rd.ino = cpu_to_je32(fd->ino); + /* If the times on this inode were set by explicit utime() they can be different, + so refrain from splatting them. */ + if (JFFS2_F_I_MTIME(f) == JFFS2_F_I_CTIME(f)) + rd.mctime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + else + rd.mctime = cpu_to_je32(0); + rd.type = fd->type; + rd.node_crc = cpu_to_je32(crc32(0, &rd, sizeof(rd)-8)); + rd.name_crc = cpu_to_je32(crc32(0, fd->name, rd.nsize)); + + ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen, + JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize)); + if (ret) { + pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n", + sizeof(rd)+rd.nsize, ret); + return ret; + } + new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC); + + if (IS_ERR(new_fd)) { + pr_warn("jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", + PTR_ERR(new_fd)); + return PTR_ERR(new_fd); + } + jffs2_add_fd_to_list(c, new_fd, &f->dents); + return 0; +} + +static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent **fdp = &f->dents; + int found = 0; + + /* On a medium where we can't actually mark nodes obsolete + pernamently, such as NAND flash, we need to work out + whether this deletion dirent is still needed to actively + delete a 'real' dirent with the same name that's still + somewhere else on the flash. */ + if (!jffs2_can_mark_obsolete(c)) { + struct jffs2_raw_dirent *rd; + struct jffs2_raw_node_ref *raw; + int ret; + size_t retlen; + int name_len = strlen(fd->name); + uint32_t name_crc = crc32(0, fd->name, name_len); + uint32_t rawlen = ref_totlen(c, jeb, fd->raw); + + rd = kmalloc(rawlen, GFP_KERNEL); + if (!rd) + return -ENOMEM; + + /* Prevent the erase code from nicking the obsolete node refs while + we're looking at them. I really don't like this extra lock but + can't see any alternative. Suggestions on a postcard to... */ + mutex_lock(&c->erase_free_sem); + + for (raw = f->inocache->nodes; raw != (void *)f->inocache; raw = raw->next_in_ino) { + + cond_resched(); + + /* We only care about obsolete ones */ + if (!(ref_obsolete(raw))) + continue; + + /* Any dirent with the same name is going to have the same length... */ + if (ref_totlen(c, NULL, raw) != rawlen) + continue; + + /* Doesn't matter if there's one in the same erase block. We're going to + delete it too at the same time. */ + if (SECTOR_ADDR(raw->flash_offset) == SECTOR_ADDR(fd->raw->flash_offset)) + continue; + + jffs2_dbg(1, "Check potential deletion dirent at %08x\n", + ref_offset(raw)); + + /* This is an obsolete node belonging to the same directory, and it's of the right + length. We need to take a closer look...*/ + ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)rd); + if (ret) { + pr_warn("%s(): Read error (%d) reading obsolete node at %08x\n", + __func__, ret, ref_offset(raw)); + /* If we can't read it, we don't need to continue to obsolete it. Continue */ + continue; + } + if (retlen != rawlen) { + pr_warn("%s(): Short read (%zd not %u) reading header from obsolete node at %08x\n", + __func__, retlen, rawlen, + ref_offset(raw)); + continue; + } + + if (je16_to_cpu(rd->nodetype) != JFFS2_NODETYPE_DIRENT) + continue; + + /* If the name CRC doesn't match, skip */ + if (je32_to_cpu(rd->name_crc) != name_crc) + continue; + + /* If the name length doesn't match, or it's another deletion dirent, skip */ + if (rd->nsize != name_len || !je32_to_cpu(rd->ino)) + continue; + + /* OK, check the actual name now */ + if (memcmp(rd->name, fd->name, name_len)) + continue; + + /* OK. The name really does match. There really is still an older node on + the flash which our deletion dirent obsoletes. So we have to write out + a new deletion dirent to replace it */ + mutex_unlock(&c->erase_free_sem); + + jffs2_dbg(1, "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n", + ref_offset(fd->raw), fd->name, + ref_offset(raw), je32_to_cpu(rd->ino)); + kfree(rd); + + return jffs2_garbage_collect_dirent(c, jeb, f, fd); + } + + mutex_unlock(&c->erase_free_sem); + kfree(rd); + } + + /* FIXME: If we're deleting a dirent which contains the current mtime and ctime, + we should update the metadata node with those times accordingly */ + + /* No need for it any more. Just mark it obsolete and remove it from the list */ + while (*fdp) { + if ((*fdp) == fd) { + found = 1; + *fdp = fd->next; + break; + } + fdp = &(*fdp)->next; + } + if (!found) { + pr_warn("Deletion dirent \"%s\" not found in list for ino #%u\n", + fd->name, f->inocache->ino); + } + jffs2_mark_node_obsolete(c, fd->raw); + jffs2_free_full_dirent(fd); + return 0; +} + +static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end) +{ + struct jffs2_raw_inode ri; + struct jffs2_node_frag *frag; + struct jffs2_full_dnode *new_fn; + uint32_t alloclen, ilen; + int ret; + + jffs2_dbg(1, "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n", + f->inocache->ino, start, end); + + memset(&ri, 0, sizeof(ri)); + + if(fn->frags > 1) { + size_t readlen; + uint32_t crc; + /* It's partially obsoleted by a later write. So we have to + write it out again with the _same_ version as before */ + ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(ri), &readlen, (char *)&ri); + if (readlen != sizeof(ri) || ret) { + pr_warn("Node read failed in jffs2_garbage_collect_hole. Ret %d, retlen %zd. Data will be lost by writing new hole node\n", + ret, readlen); + goto fill; + } + if (je16_to_cpu(ri.nodetype) != JFFS2_NODETYPE_INODE) { + pr_warn("%s(): Node at 0x%08x had node type 0x%04x instead of JFFS2_NODETYPE_INODE(0x%04x)\n", + __func__, ref_offset(fn->raw), + je16_to_cpu(ri.nodetype), JFFS2_NODETYPE_INODE); + return -EIO; + } + if (je32_to_cpu(ri.totlen) != sizeof(ri)) { + pr_warn("%s(): Node at 0x%08x had totlen 0x%x instead of expected 0x%zx\n", + __func__, ref_offset(fn->raw), + je32_to_cpu(ri.totlen), sizeof(ri)); + return -EIO; + } + crc = crc32(0, &ri, sizeof(ri)-8); + if (crc != je32_to_cpu(ri.node_crc)) { + pr_warn("%s: Node at 0x%08x had CRC 0x%08x which doesn't match calculated CRC 0x%08x\n", + __func__, ref_offset(fn->raw), + je32_to_cpu(ri.node_crc), crc); + /* FIXME: We could possibly deal with this by writing new holes for each frag */ + pr_warn("Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", + start, end, f->inocache->ino); + goto fill; + } + if (ri.compr != JFFS2_COMPR_ZERO) { + pr_warn("%s(): Node 0x%08x wasn't a hole node!\n", + __func__, ref_offset(fn->raw)); + pr_warn("Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", + start, end, f->inocache->ino); + goto fill; + } + } else { + fill: + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri)); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.offset = cpu_to_je32(start); + ri.dsize = cpu_to_je32(end - start); + ri.csize = cpu_to_je32(0); + ri.compr = JFFS2_COMPR_ZERO; + } + + frag = frag_last(&f->fragtree); + if (frag) + /* Fetch the inode length from the fragtree rather then + * from i_size since i_size may have not been updated yet */ + ilen = frag->ofs + frag->size; + else + ilen = JFFS2_F_I_SIZE(f); + + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(ilen); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.data_crc = cpu_to_je32(0); + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + + ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen, + JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n", + sizeof(ri), ret); + return ret; + } + new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC); + + if (IS_ERR(new_fn)) { + pr_warn("Error writing new hole node: %ld\n", PTR_ERR(new_fn)); + return PTR_ERR(new_fn); + } + if (je32_to_cpu(ri.version) == f->highest_version) { + jffs2_add_full_dnode_to_inode(c, f, new_fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + return 0; + } + + /* + * We should only get here in the case where the node we are + * replacing had more than one frag, so we kept the same version + * number as before. (Except in case of error -- see 'goto fill;' + * above.) + */ + D1(if(unlikely(fn->frags <= 1)) { + pr_warn("%s(): Replacing fn with %d frag(s) but new ver %d != highest_version %d of ino #%d\n", + __func__, fn->frags, je32_to_cpu(ri.version), + f->highest_version, je32_to_cpu(ri.ino)); + }); + + /* This is a partially-overlapped hole node. Mark it REF_NORMAL not REF_PRISTINE */ + mark_ref_normal(new_fn->raw); + + for (frag = jffs2_lookup_node_frag(&f->fragtree, fn->ofs); + frag; frag = frag_next(frag)) { + if (frag->ofs > fn->size + fn->ofs) + break; + if (frag->node == fn) { + frag->node = new_fn; + new_fn->frags++; + fn->frags--; + } + } + if (fn->frags) { + pr_warn("%s(): Old node still has frags!\n", __func__); + BUG(); + } + if (!new_fn->frags) { + pr_warn("%s(): New node has no frags!\n", __func__); + BUG(); + } + + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + + return 0; +} + +static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *orig_jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end) +{ + struct jffs2_full_dnode *new_fn; + struct jffs2_raw_inode ri; + uint32_t alloclen, offset, orig_end, orig_start; + int ret = 0; + unsigned char *comprbuf = NULL, *writebuf; + unsigned long pg; + unsigned char *pg_ptr; + + memset(&ri, 0, sizeof(ri)); + + jffs2_dbg(1, "Writing replacement dnode for ino #%u from offset 0x%x to 0x%x\n", + f->inocache->ino, start, end); + + orig_end = end; + orig_start = start; + + if (c->nr_free_blocks + c->nr_erasing_blocks > c->resv_blocks_gcmerge) { + /* Attempt to do some merging. But only expand to cover logically + adjacent frags if the block containing them is already considered + to be dirty. Otherwise we end up with GC just going round in + circles dirtying the nodes it already wrote out, especially + on NAND where we have small eraseblocks and hence a much higher + chance of nodes having to be split to cross boundaries. */ + + struct jffs2_node_frag *frag; + uint32_t min, max; + + min = start & ~(PAGE_CACHE_SIZE-1); + max = min + PAGE_CACHE_SIZE; + + frag = jffs2_lookup_node_frag(&f->fragtree, start); + + /* BUG_ON(!frag) but that'll happen anyway... */ + + BUG_ON(frag->ofs != start); + + /* First grow down... */ + while((frag = frag_prev(frag)) && frag->ofs >= min) { + + /* If the previous frag doesn't even reach the beginning, there's + excessive fragmentation. Just merge. */ + if (frag->ofs > min) { + jffs2_dbg(1, "Expanding down to cover partial frag (0x%x-0x%x)\n", + frag->ofs, frag->ofs+frag->size); + start = frag->ofs; + continue; + } + /* OK. This frag holds the first byte of the page. */ + if (!frag->node || !frag->node->raw) { + jffs2_dbg(1, "First frag in page is hole (0x%x-0x%x). Not expanding down.\n", + frag->ofs, frag->ofs+frag->size); + break; + } else { + + /* OK, it's a frag which extends to the beginning of the page. Does it live + in a block which is still considered clean? If so, don't obsolete it. + If not, cover it anyway. */ + + struct jffs2_raw_node_ref *raw = frag->node->raw; + struct jffs2_eraseblock *jeb; + + jeb = &c->blocks[raw->flash_offset / c->sector_size]; + + if (jeb == c->gcblock) { + jffs2_dbg(1, "Expanding down to cover frag (0x%x-0x%x) in gcblock at %08x\n", + frag->ofs, + frag->ofs + frag->size, + ref_offset(raw)); + start = frag->ofs; + break; + } + if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) { + jffs2_dbg(1, "Not expanding down to cover frag (0x%x-0x%x) in clean block %08x\n", + frag->ofs, + frag->ofs + frag->size, + jeb->offset); + break; + } + + jffs2_dbg(1, "Expanding down to cover frag (0x%x-0x%x) in dirty block %08x\n", + frag->ofs, + frag->ofs + frag->size, + jeb->offset); + start = frag->ofs; + break; + } + } + + /* ... then up */ + + /* Find last frag which is actually part of the node we're to GC. */ + frag = jffs2_lookup_node_frag(&f->fragtree, end-1); + + while((frag = frag_next(frag)) && frag->ofs+frag->size <= max) { + + /* If the previous frag doesn't even reach the beginning, there's lots + of fragmentation. Just merge. */ + if (frag->ofs+frag->size < max) { + jffs2_dbg(1, "Expanding up to cover partial frag (0x%x-0x%x)\n", + frag->ofs, frag->ofs+frag->size); + end = frag->ofs + frag->size; + continue; + } + + if (!frag->node || !frag->node->raw) { + jffs2_dbg(1, "Last frag in page is hole (0x%x-0x%x). Not expanding up.\n", + frag->ofs, frag->ofs+frag->size); + break; + } else { + + /* OK, it's a frag which extends to the beginning of the page. Does it live + in a block which is still considered clean? If so, don't obsolete it. + If not, cover it anyway. */ + + struct jffs2_raw_node_ref *raw = frag->node->raw; + struct jffs2_eraseblock *jeb; + + jeb = &c->blocks[raw->flash_offset / c->sector_size]; + + if (jeb == c->gcblock) { + jffs2_dbg(1, "Expanding up to cover frag (0x%x-0x%x) in gcblock at %08x\n", + frag->ofs, + frag->ofs + frag->size, + ref_offset(raw)); + end = frag->ofs + frag->size; + break; + } + if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) { + jffs2_dbg(1, "Not expanding up to cover frag (0x%x-0x%x) in clean block %08x\n", + frag->ofs, + frag->ofs + frag->size, + jeb->offset); + break; + } + + jffs2_dbg(1, "Expanding up to cover frag (0x%x-0x%x) in dirty block %08x\n", + frag->ofs, + frag->ofs + frag->size, + jeb->offset); + end = frag->ofs + frag->size; + break; + } + } + jffs2_dbg(1, "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n", + orig_start, orig_end, start, end); + + D1(BUG_ON(end > frag_last(&f->fragtree)->ofs + frag_last(&f->fragtree)->size)); + BUG_ON(end < orig_end); + BUG_ON(start > orig_start); + } + + /* First, use readpage() to read the appropriate page into the page cache */ + /* Q: What happens if we actually try to GC the _same_ page for which commit_write() + * triggered garbage collection in the first place? + * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the + * page OK. We'll actually write it out again in commit_write, which is a little + * suboptimal, but at least we're correct. + */ + pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg); + + if (IS_ERR(pg_ptr)) { + pr_warn("read_cache_page() returned error: %ld\n", + PTR_ERR(pg_ptr)); + return PTR_ERR(pg_ptr); + } + + offset = start; + while(offset < orig_end) { + uint32_t datalen; + uint32_t cdatalen; + uint16_t comprtype = JFFS2_COMPR_NONE; + + ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN, + &alloclen, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_dnode failed: %d\n", + sizeof(ri) + JFFS2_MIN_DATA_LEN, ret); + break; + } + cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset); + datalen = end - offset; + + writebuf = pg_ptr + (offset & (PAGE_CACHE_SIZE -1)); + + comprtype = jffs2_compress(c, f, writebuf, &comprbuf, &datalen, &cdatalen); + + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri) + cdatalen); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(JFFS2_F_I_SIZE(f)); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.offset = cpu_to_je32(offset); + ri.csize = cpu_to_je32(cdatalen); + ri.dsize = cpu_to_je32(datalen); + ri.compr = comprtype & 0xff; + ri.usercompr = (comprtype >> 8) & 0xff; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen)); + + new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, ALLOC_GC); + + jffs2_free_comprbuf(comprbuf, writebuf); + + if (IS_ERR(new_fn)) { + pr_warn("Error writing new dnode: %ld\n", + PTR_ERR(new_fn)); + ret = PTR_ERR(new_fn); + break; + } + ret = jffs2_add_full_dnode_to_inode(c, f, new_fn); + offset += datalen; + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + } + + jffs2_gc_release_page(c, pg_ptr, &pg); + return ret; +} diff --git a/kernel/fs/jffs2/ioctl.c b/kernel/fs/jffs2/ioctl.c new file mode 100644 index 000000000..859a598af --- /dev/null +++ b/kernel/fs/jffs2/ioctl.c @@ -0,0 +1,22 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include "nodelist.h" + +long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + /* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which + will include compression support etc. */ + return -ENOTTY; +} + diff --git a/kernel/fs/jffs2/jffs2_fs_i.h b/kernel/fs/jffs2/jffs2_fs_i.h new file mode 100644 index 000000000..2e4a86763 --- /dev/null +++ b/kernel/fs/jffs2/jffs2_fs_i.h @@ -0,0 +1,56 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_I +#define _JFFS2_FS_I + +#include +#include +#include + +struct jffs2_inode_info { + /* We need an internal mutex similar to inode->i_mutex. + Unfortunately, we can't used the existing one, because + either the GC would deadlock, or we'd have to release it + before letting GC proceed. Or we'd have to put ugliness + into the GC code so it didn't attempt to obtain the i_mutex + for the inode(s) which are already locked */ + struct mutex sem; + + /* The highest (datanode) version number used for this ino */ + uint32_t highest_version; + + /* List of data fragments which make up the file */ + struct rb_root fragtree; + + /* There may be one datanode which isn't referenced by any of the + above fragments, if it contains a metadata update but no actual + data - or if this is a directory inode */ + /* This also holds the _only_ dnode for symlinks/device nodes, + etc. */ + struct jffs2_full_dnode *metadata; + + /* Directory entries */ + struct jffs2_full_dirent *dents; + + /* The target path if this is the inode of a symlink */ + unsigned char *target; + + /* Some stuff we just have to keep in-core at all times, for each inode. */ + struct jffs2_inode_cache *inocache; + + uint16_t flags; + uint8_t usercompr; + struct inode vfs_inode; +}; + +#endif /* _JFFS2_FS_I */ diff --git a/kernel/fs/jffs2/jffs2_fs_sb.h b/kernel/fs/jffs2/jffs2_fs_sb.h new file mode 100644 index 000000000..046fee8b6 --- /dev/null +++ b/kernel/fs/jffs2/jffs2_fs_sb.h @@ -0,0 +1,162 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_SB +#define _JFFS2_FS_SB + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define JFFS2_SB_FLAG_RO 1 +#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */ +#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */ + +struct jffs2_inodirty; + +struct jffs2_mount_opts { + bool override_compr; + unsigned int compr; + + /* The size of the reserved pool. The reserved pool is the JFFS2 flash + * space which may only be used by root cannot be used by the other + * users. This is implemented simply by means of not allowing the + * latter users to write to the file system if the amount if the + * available space is less then 'rp_size'. */ + unsigned int rp_size; +}; + +/* A struct for the overall file system control. Pointers to + jffs2_sb_info structs are named `c' in the source code. + Nee jffs_control +*/ +struct jffs2_sb_info { + struct mtd_info *mtd; + + uint32_t highest_ino; + uint32_t checked_ino; + + unsigned int flags; + + struct task_struct *gc_task; /* GC task struct */ + struct completion gc_thread_start; /* GC thread start completion */ + struct completion gc_thread_exit; /* GC thread exit completion port */ + + struct mutex alloc_sem; /* Used to protect all the following + fields, and also to protect against + out-of-order writing of nodes. And GC. */ + uint32_t cleanmarker_size; /* Size of an _inline_ CLEANMARKER + (i.e. zero for OOB CLEANMARKER */ + + uint32_t flash_size; + uint32_t used_size; + uint32_t dirty_size; + uint32_t wasted_size; + uint32_t free_size; + uint32_t erasing_size; + uint32_t bad_size; + uint32_t sector_size; + uint32_t unchecked_size; + + uint32_t nr_free_blocks; + uint32_t nr_erasing_blocks; + + /* Number of free blocks there must be before we... */ + uint8_t resv_blocks_write; /* ... allow a normal filesystem write */ + uint8_t resv_blocks_deletion; /* ... allow a normal filesystem deletion */ + uint8_t resv_blocks_gctrigger; /* ... wake up the GC thread */ + uint8_t resv_blocks_gcbad; /* ... pick a block from the bad_list to GC */ + uint8_t resv_blocks_gcmerge; /* ... merge pages when garbage collecting */ + /* Number of 'very dirty' blocks before we trigger immediate GC */ + uint8_t vdirty_blocks_gctrigger; + + uint32_t nospc_dirty_size; + + uint32_t nr_blocks; + struct jffs2_eraseblock *blocks; /* The whole array of blocks. Used for getting blocks + * from the offset (blocks[ofs / sector_size]) */ + struct jffs2_eraseblock *nextblock; /* The block we're currently filling */ + + struct jffs2_eraseblock *gcblock; /* The block we're currently garbage-collecting */ + + struct list_head clean_list; /* Blocks 100% full of clean data */ + struct list_head very_dirty_list; /* Blocks with lots of dirty space */ + struct list_head dirty_list; /* Blocks with some dirty space */ + struct list_head erasable_list; /* Blocks which are completely dirty, and need erasing */ + struct list_head erasable_pending_wbuf_list; /* Blocks which need erasing but only after the current wbuf is flushed */ + struct list_head erasing_list; /* Blocks which are currently erasing */ + struct list_head erase_checking_list; /* Blocks which are being checked and marked */ + struct list_head erase_pending_list; /* Blocks which need erasing now */ + struct list_head erase_complete_list; /* Blocks which are erased and need the clean marker written to them */ + struct list_head free_list; /* Blocks which are free and ready to be used */ + struct list_head bad_list; /* Bad blocks. */ + struct list_head bad_used_list; /* Bad blocks with valid data in. */ + + spinlock_t erase_completion_lock; /* Protect free_list and erasing_list + against erase completion handler */ + wait_queue_head_t erase_wait; /* For waiting for erases to complete */ + + wait_queue_head_t inocache_wq; + int inocache_hashsize; + struct jffs2_inode_cache **inocache_list; + spinlock_t inocache_lock; + + /* Sem to allow jffs2_garbage_collect_deletion_dirent to + drop the erase_completion_lock while it's holding a pointer + to an obsoleted node. I don't like this. Alternatives welcomed. */ + struct mutex erase_free_sem; + + uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */ + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + unsigned char *wbuf_verify; /* read-back buffer for verification */ +#endif +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + unsigned char *wbuf; /* Write-behind buffer for NAND flash */ + uint32_t wbuf_ofs; + uint32_t wbuf_len; + struct jffs2_inodirty *wbuf_inodes; + struct rw_semaphore wbuf_sem; /* Protects the write buffer */ + + struct delayed_work wbuf_dwork; /* write-buffer write-out work */ + + unsigned char *oobbuf; + int oobavail; /* How many bytes are available for JFFS2 in OOB */ +#endif + + struct jffs2_summary *summary; /* Summary information */ + struct jffs2_mount_opts mount_opts; + +#ifdef CONFIG_JFFS2_FS_XATTR +#define XATTRINDEX_HASHSIZE (57) + uint32_t highest_xid; + uint32_t highest_xseqno; + struct list_head xattrindex[XATTRINDEX_HASHSIZE]; + struct list_head xattr_unchecked; + struct list_head xattr_dead_list; + struct jffs2_xattr_ref *xref_dead_list; + struct jffs2_xattr_ref *xref_temp; + struct rw_semaphore xattr_sem; + uint32_t xdatum_mem_usage; + uint32_t xdatum_mem_threshold; +#endif + /* OS-private pointer for getting back to master superblock info */ + void *os_priv; +}; + +#endif /* _JFFS2_FS_SB */ diff --git a/kernel/fs/jffs2/malloc.c b/kernel/fs/jffs2/malloc.c new file mode 100644 index 000000000..b8fd65130 --- /dev/null +++ b/kernel/fs/jffs2/malloc.c @@ -0,0 +1,324 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "nodelist.h" + +/* These are initialised to NULL in the kernel startup code. + If you're porting to other operating systems, beware */ +static struct kmem_cache *full_dnode_slab; +static struct kmem_cache *raw_dirent_slab; +static struct kmem_cache *raw_inode_slab; +static struct kmem_cache *tmp_dnode_info_slab; +static struct kmem_cache *raw_node_ref_slab; +static struct kmem_cache *node_frag_slab; +static struct kmem_cache *inode_cache_slab; +#ifdef CONFIG_JFFS2_FS_XATTR +static struct kmem_cache *xattr_datum_cache; +static struct kmem_cache *xattr_ref_cache; +#endif + +int __init jffs2_create_slab_caches(void) +{ + full_dnode_slab = kmem_cache_create("jffs2_full_dnode", + sizeof(struct jffs2_full_dnode), + 0, 0, NULL); + if (!full_dnode_slab) + goto err; + + raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent", + sizeof(struct jffs2_raw_dirent), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!raw_dirent_slab) + goto err; + + raw_inode_slab = kmem_cache_create("jffs2_raw_inode", + sizeof(struct jffs2_raw_inode), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!raw_inode_slab) + goto err; + + tmp_dnode_info_slab = kmem_cache_create("jffs2_tmp_dnode", + sizeof(struct jffs2_tmp_dnode_info), + 0, 0, NULL); + if (!tmp_dnode_info_slab) + goto err; + + raw_node_ref_slab = kmem_cache_create("jffs2_refblock", + sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1), + 0, 0, NULL); + if (!raw_node_ref_slab) + goto err; + + node_frag_slab = kmem_cache_create("jffs2_node_frag", + sizeof(struct jffs2_node_frag), + 0, 0, NULL); + if (!node_frag_slab) + goto err; + + inode_cache_slab = kmem_cache_create("jffs2_inode_cache", + sizeof(struct jffs2_inode_cache), + 0, 0, NULL); + if (!inode_cache_slab) + goto err; + +#ifdef CONFIG_JFFS2_FS_XATTR + xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum", + sizeof(struct jffs2_xattr_datum), + 0, 0, NULL); + if (!xattr_datum_cache) + goto err; + + xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref", + sizeof(struct jffs2_xattr_ref), + 0, 0, NULL); + if (!xattr_ref_cache) + goto err; +#endif + + return 0; + err: + jffs2_destroy_slab_caches(); + return -ENOMEM; +} + +void jffs2_destroy_slab_caches(void) +{ + if(full_dnode_slab) + kmem_cache_destroy(full_dnode_slab); + if(raw_dirent_slab) + kmem_cache_destroy(raw_dirent_slab); + if(raw_inode_slab) + kmem_cache_destroy(raw_inode_slab); + if(tmp_dnode_info_slab) + kmem_cache_destroy(tmp_dnode_info_slab); + if(raw_node_ref_slab) + kmem_cache_destroy(raw_node_ref_slab); + if(node_frag_slab) + kmem_cache_destroy(node_frag_slab); + if(inode_cache_slab) + kmem_cache_destroy(inode_cache_slab); +#ifdef CONFIG_JFFS2_FS_XATTR + if (xattr_datum_cache) + kmem_cache_destroy(xattr_datum_cache); + if (xattr_ref_cache) + kmem_cache_destroy(xattr_ref_cache); +#endif +} + +struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize) +{ + struct jffs2_full_dirent *ret; + ret = kmalloc(sizeof(struct jffs2_full_dirent) + namesize, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_full_dirent(struct jffs2_full_dirent *x) +{ + dbg_memalloc("%p\n", x); + kfree(x); +} + +struct jffs2_full_dnode *jffs2_alloc_full_dnode(void) +{ + struct jffs2_full_dnode *ret; + ret = kmem_cache_alloc(full_dnode_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_full_dnode(struct jffs2_full_dnode *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(full_dnode_slab, x); +} + +struct jffs2_raw_dirent *jffs2_alloc_raw_dirent(void) +{ + struct jffs2_raw_dirent *ret; + ret = kmem_cache_alloc(raw_dirent_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_raw_dirent(struct jffs2_raw_dirent *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_dirent_slab, x); +} + +struct jffs2_raw_inode *jffs2_alloc_raw_inode(void) +{ + struct jffs2_raw_inode *ret; + ret = kmem_cache_alloc(raw_inode_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_raw_inode(struct jffs2_raw_inode *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_inode_slab, x); +} + +struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void) +{ + struct jffs2_tmp_dnode_info *ret; + ret = kmem_cache_alloc(tmp_dnode_info_slab, GFP_KERNEL); + dbg_memalloc("%p\n", + ret); + return ret; +} + +void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(tmp_dnode_info_slab, x); +} + +static struct jffs2_raw_node_ref *jffs2_alloc_refblock(void) +{ + struct jffs2_raw_node_ref *ret; + + ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL); + if (ret) { + int i = 0; + for (i=0; i < REFS_PER_BLOCK; i++) { + ret[i].flash_offset = REF_EMPTY_NODE; + ret[i].next_in_ino = NULL; + } + ret[i].flash_offset = REF_LINK_NODE; + ret[i].next_in_ino = NULL; + } + return ret; +} + +int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int nr) +{ + struct jffs2_raw_node_ref **p, *ref; + int i = nr; + + dbg_memalloc("%d\n", nr); + + p = &jeb->last_node; + ref = *p; + + dbg_memalloc("Reserving %d refs for block @0x%08x\n", nr, jeb->offset); + + /* If jeb->last_node is really a valid node then skip over it */ + if (ref && ref->flash_offset != REF_EMPTY_NODE) + ref++; + + while (i) { + if (!ref) { + dbg_memalloc("Allocating new refblock linked from %p\n", p); + ref = *p = jffs2_alloc_refblock(); + if (!ref) + return -ENOMEM; + } + if (ref->flash_offset == REF_LINK_NODE) { + p = &ref->next_in_ino; + ref = *p; + continue; + } + i--; + ref++; + } + jeb->allocated_refs = nr; + + dbg_memalloc("Reserved %d refs for block @0x%08x, last_node is %p (%08x,%p)\n", + nr, jeb->offset, jeb->last_node, jeb->last_node->flash_offset, + jeb->last_node->next_in_ino); + + return 0; +} + +void jffs2_free_refblock(struct jffs2_raw_node_ref *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_node_ref_slab, x); +} + +struct jffs2_node_frag *jffs2_alloc_node_frag(void) +{ + struct jffs2_node_frag *ret; + ret = kmem_cache_alloc(node_frag_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_node_frag(struct jffs2_node_frag *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(node_frag_slab, x); +} + +struct jffs2_inode_cache *jffs2_alloc_inode_cache(void) +{ + struct jffs2_inode_cache *ret; + ret = kmem_cache_alloc(inode_cache_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_inode_cache(struct jffs2_inode_cache *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(inode_cache_slab, x); +} + +#ifdef CONFIG_JFFS2_FS_XATTR +struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) +{ + struct jffs2_xattr_datum *xd; + xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); + dbg_memalloc("%p\n", xd); + if (!xd) + return NULL; + + xd->class = RAWNODE_CLASS_XATTR_DATUM; + xd->node = (void *)xd; + INIT_LIST_HEAD(&xd->xindex); + return xd; +} + +void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd) +{ + dbg_memalloc("%p\n", xd); + kmem_cache_free(xattr_datum_cache, xd); +} + +struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) +{ + struct jffs2_xattr_ref *ref; + ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); + dbg_memalloc("%p\n", ref); + if (!ref) + return NULL; + + ref->class = RAWNODE_CLASS_XATTR_REF; + ref->node = (void *)ref; + return ref; +} + +void jffs2_free_xattr_ref(struct jffs2_xattr_ref *ref) +{ + dbg_memalloc("%p\n", ref); + kmem_cache_free(xattr_ref_cache, ref); +} +#endif diff --git a/kernel/fs/jffs2/nodelist.c b/kernel/fs/jffs2/nodelist.c new file mode 100644 index 000000000..9a5449bc3 --- /dev/null +++ b/kernel/fs/jffs2/nodelist.c @@ -0,0 +1,755 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, + struct jffs2_node_frag *this); + +void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list) +{ + struct jffs2_full_dirent **prev = list; + + dbg_dentlist("add dirent \"%s\", ino #%u\n", new->name, new->ino); + + while ((*prev) && (*prev)->nhash <= new->nhash) { + if ((*prev)->nhash == new->nhash && !strcmp((*prev)->name, new->name)) { + /* Duplicate. Free one */ + if (new->version < (*prev)->version) { + dbg_dentlist("Eep! Marking new dirent node obsolete, old is \"%s\", ino #%u\n", + (*prev)->name, (*prev)->ino); + jffs2_mark_node_obsolete(c, new->raw); + jffs2_free_full_dirent(new); + } else { + dbg_dentlist("marking old dirent \"%s\", ino #%u obsolete\n", + (*prev)->name, (*prev)->ino); + new->next = (*prev)->next; + /* It may have been a 'placeholder' deletion dirent, + if jffs2_can_mark_obsolete() (see jffs2_do_unlink()) */ + if ((*prev)->raw) + jffs2_mark_node_obsolete(c, ((*prev)->raw)); + jffs2_free_full_dirent(*prev); + *prev = new; + } + return; + } + prev = &((*prev)->next); + } + new->next = *prev; + *prev = new; +} + +uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size) +{ + struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size); + + dbg_fragtree("truncating fragtree to 0x%08x bytes\n", size); + + /* We know frag->ofs <= size. That's what lookup does for us */ + if (frag && frag->ofs != size) { + if (frag->ofs+frag->size > size) { + frag->size = size - frag->ofs; + } + frag = frag_next(frag); + } + while (frag && frag->ofs >= size) { + struct jffs2_node_frag *next = frag_next(frag); + + frag_erase(frag, list); + jffs2_obsolete_node_frag(c, frag); + frag = next; + } + + if (size == 0) + return 0; + + frag = frag_last(list); + + /* Sanity check for truncation to longer than we started with... */ + if (!frag) + return 0; + if (frag->ofs + frag->size < size) + return frag->ofs + frag->size; + + /* If the last fragment starts at the RAM page boundary, it is + * REF_PRISTINE irrespective of its size. */ + if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) { + dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n", + frag->ofs, frag->ofs + frag->size); + frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE; + } + return size; +} + +static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, + struct jffs2_node_frag *this) +{ + if (this->node) { + this->node->frags--; + if (!this->node->frags) { + /* The node has no valid frags left. It's totally obsoleted */ + dbg_fragtree2("marking old node @0x%08x (0x%04x-0x%04x) obsolete\n", + ref_offset(this->node->raw), this->node->ofs, this->node->ofs+this->node->size); + jffs2_mark_node_obsolete(c, this->node->raw); + jffs2_free_full_dnode(this->node); + } else { + dbg_fragtree2("marking old node @0x%08x (0x%04x-0x%04x) REF_NORMAL. frags is %d\n", + ref_offset(this->node->raw), this->node->ofs, this->node->ofs+this->node->size, this->node->frags); + mark_ref_normal(this->node->raw); + } + + } + jffs2_free_node_frag(this); +} + +static void jffs2_fragtree_insert(struct jffs2_node_frag *newfrag, struct jffs2_node_frag *base) +{ + struct rb_node *parent = &base->rb; + struct rb_node **link = &parent; + + dbg_fragtree2("insert frag (0x%04x-0x%04x)\n", newfrag->ofs, newfrag->ofs + newfrag->size); + + while (*link) { + parent = *link; + base = rb_entry(parent, struct jffs2_node_frag, rb); + + if (newfrag->ofs > base->ofs) + link = &base->rb.rb_right; + else if (newfrag->ofs < base->ofs) + link = &base->rb.rb_left; + else { + JFFS2_ERROR("duplicate frag at %08x (%p,%p)\n", newfrag->ofs, newfrag, base); + BUG(); + } + } + + rb_link_node(&newfrag->rb, &base->rb, link); +} + +/* + * Allocate and initializes a new fragment. + */ +static struct jffs2_node_frag * new_fragment(struct jffs2_full_dnode *fn, uint32_t ofs, uint32_t size) +{ + struct jffs2_node_frag *newfrag; + + newfrag = jffs2_alloc_node_frag(); + if (likely(newfrag)) { + newfrag->ofs = ofs; + newfrag->size = size; + newfrag->node = fn; + } else { + JFFS2_ERROR("cannot allocate a jffs2_node_frag object\n"); + } + + return newfrag; +} + +/* + * Called when there is no overlapping fragment exist. Inserts a hole before the new + * fragment and inserts the new fragment to the fragtree. + */ +static int no_overlapping_node(struct jffs2_sb_info *c, struct rb_root *root, + struct jffs2_node_frag *newfrag, + struct jffs2_node_frag *this, uint32_t lastend) +{ + if (lastend < newfrag->node->ofs) { + /* put a hole in before the new fragment */ + struct jffs2_node_frag *holefrag; + + holefrag= new_fragment(NULL, lastend, newfrag->node->ofs - lastend); + if (unlikely(!holefrag)) { + jffs2_free_node_frag(newfrag); + return -ENOMEM; + } + + if (this) { + /* By definition, the 'this' node has no right-hand child, + because there are no frags with offset greater than it. + So that's where we want to put the hole */ + dbg_fragtree2("add hole frag %#04x-%#04x on the right of the new frag.\n", + holefrag->ofs, holefrag->ofs + holefrag->size); + rb_link_node(&holefrag->rb, &this->rb, &this->rb.rb_right); + } else { + dbg_fragtree2("Add hole frag %#04x-%#04x to the root of the tree.\n", + holefrag->ofs, holefrag->ofs + holefrag->size); + rb_link_node(&holefrag->rb, NULL, &root->rb_node); + } + rb_insert_color(&holefrag->rb, root); + this = holefrag; + } + + if (this) { + /* By definition, the 'this' node has no right-hand child, + because there are no frags with offset greater than it. + So that's where we want to put new fragment */ + dbg_fragtree2("add the new node at the right\n"); + rb_link_node(&newfrag->rb, &this->rb, &this->rb.rb_right); + } else { + dbg_fragtree2("insert the new node at the root of the tree\n"); + rb_link_node(&newfrag->rb, NULL, &root->rb_node); + } + rb_insert_color(&newfrag->rb, root); + + return 0; +} + +/* Doesn't set inode->i_size */ +static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *root, struct jffs2_node_frag *newfrag) +{ + struct jffs2_node_frag *this; + uint32_t lastend; + + /* Skip all the nodes which are completed before this one starts */ + this = jffs2_lookup_node_frag(root, newfrag->node->ofs); + + if (this) { + dbg_fragtree2("lookup gave frag 0x%04x-0x%04x; phys 0x%08x (*%p)\n", + this->ofs, this->ofs+this->size, this->node?(ref_offset(this->node->raw)):0xffffffff, this); + lastend = this->ofs + this->size; + } else { + dbg_fragtree2("lookup gave no frag\n"); + lastend = 0; + } + + /* See if we ran off the end of the fragtree */ + if (lastend <= newfrag->ofs) { + /* We did */ + + /* Check if 'this' node was on the same page as the new node. + If so, both 'this' and the new node get marked REF_NORMAL so + the GC can take a look. + */ + if (lastend && (lastend-1) >> PAGE_CACHE_SHIFT == newfrag->ofs >> PAGE_CACHE_SHIFT) { + if (this->node) + mark_ref_normal(this->node->raw); + mark_ref_normal(newfrag->node->raw); + } + + return no_overlapping_node(c, root, newfrag, this, lastend); + } + + if (this->node) + dbg_fragtree2("dealing with frag %u-%u, phys %#08x(%d).\n", + this->ofs, this->ofs + this->size, + ref_offset(this->node->raw), ref_flags(this->node->raw)); + else + dbg_fragtree2("dealing with hole frag %u-%u.\n", + this->ofs, this->ofs + this->size); + + /* OK. 'this' is pointing at the first frag that newfrag->ofs at least partially obsoletes, + * - i.e. newfrag->ofs < this->ofs+this->size && newfrag->ofs >= this->ofs + */ + if (newfrag->ofs > this->ofs) { + /* This node isn't completely obsoleted. The start of it remains valid */ + + /* Mark the new node and the partially covered node REF_NORMAL -- let + the GC take a look at them */ + mark_ref_normal(newfrag->node->raw); + if (this->node) + mark_ref_normal(this->node->raw); + + if (this->ofs + this->size > newfrag->ofs + newfrag->size) { + /* The new node splits 'this' frag into two */ + struct jffs2_node_frag *newfrag2; + + if (this->node) + dbg_fragtree2("split old frag 0x%04x-0x%04x, phys 0x%08x\n", + this->ofs, this->ofs+this->size, ref_offset(this->node->raw)); + else + dbg_fragtree2("split old hole frag 0x%04x-0x%04x\n", + this->ofs, this->ofs+this->size); + + /* New second frag pointing to this's node */ + newfrag2 = new_fragment(this->node, newfrag->ofs + newfrag->size, + this->ofs + this->size - newfrag->ofs - newfrag->size); + if (unlikely(!newfrag2)) + return -ENOMEM; + if (this->node) + this->node->frags++; + + /* Adjust size of original 'this' */ + this->size = newfrag->ofs - this->ofs; + + /* Now, we know there's no node with offset + greater than this->ofs but smaller than + newfrag2->ofs or newfrag->ofs, for obvious + reasons. So we can do a tree insert from + 'this' to insert newfrag, and a tree insert + from newfrag to insert newfrag2. */ + jffs2_fragtree_insert(newfrag, this); + rb_insert_color(&newfrag->rb, root); + + jffs2_fragtree_insert(newfrag2, newfrag); + rb_insert_color(&newfrag2->rb, root); + + return 0; + } + /* New node just reduces 'this' frag in size, doesn't split it */ + this->size = newfrag->ofs - this->ofs; + + /* Again, we know it lives down here in the tree */ + jffs2_fragtree_insert(newfrag, this); + rb_insert_color(&newfrag->rb, root); + } else { + /* New frag starts at the same point as 'this' used to. Replace + it in the tree without doing a delete and insertion */ + dbg_fragtree2("inserting newfrag (*%p),%d-%d in before 'this' (*%p),%d-%d\n", + newfrag, newfrag->ofs, newfrag->ofs+newfrag->size, this, this->ofs, this->ofs+this->size); + + rb_replace_node(&this->rb, &newfrag->rb, root); + + if (newfrag->ofs + newfrag->size >= this->ofs+this->size) { + dbg_fragtree2("obsoleting node frag %p (%x-%x)\n", this, this->ofs, this->ofs+this->size); + jffs2_obsolete_node_frag(c, this); + } else { + this->ofs += newfrag->size; + this->size -= newfrag->size; + + jffs2_fragtree_insert(this, newfrag); + rb_insert_color(&this->rb, root); + return 0; + } + } + /* OK, now we have newfrag added in the correct place in the tree, but + frag_next(newfrag) may be a fragment which is overlapped by it + */ + while ((this = frag_next(newfrag)) && newfrag->ofs + newfrag->size >= this->ofs + this->size) { + /* 'this' frag is obsoleted completely. */ + dbg_fragtree2("obsoleting node frag %p (%x-%x) and removing from tree\n", + this, this->ofs, this->ofs+this->size); + rb_erase(&this->rb, root); + jffs2_obsolete_node_frag(c, this); + } + /* Now we're pointing at the first frag which isn't totally obsoleted by + the new frag */ + + if (!this || newfrag->ofs + newfrag->size == this->ofs) + return 0; + + /* Still some overlap but we don't need to move it in the tree */ + this->size = (this->ofs + this->size) - (newfrag->ofs + newfrag->size); + this->ofs = newfrag->ofs + newfrag->size; + + /* And mark them REF_NORMAL so the GC takes a look at them */ + if (this->node) + mark_ref_normal(this->node->raw); + mark_ref_normal(newfrag->node->raw); + + return 0; +} + +/* + * Given an inode, probably with existing tree of fragments, add the new node + * to the fragment tree. + */ +int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn) +{ + int ret; + struct jffs2_node_frag *newfrag; + + if (unlikely(!fn->size)) + return 0; + + newfrag = new_fragment(fn, fn->ofs, fn->size); + if (unlikely(!newfrag)) + return -ENOMEM; + newfrag->node->frags = 1; + + dbg_fragtree("adding node %#04x-%#04x @0x%08x on flash, newfrag *%p\n", + fn->ofs, fn->ofs+fn->size, ref_offset(fn->raw), newfrag); + + ret = jffs2_add_frag_to_fragtree(c, &f->fragtree, newfrag); + if (unlikely(ret)) + return ret; + + /* If we now share a page with other nodes, mark either previous + or next node REF_NORMAL, as appropriate. */ + if (newfrag->ofs & (PAGE_CACHE_SIZE-1)) { + struct jffs2_node_frag *prev = frag_prev(newfrag); + + mark_ref_normal(fn->raw); + /* If we don't start at zero there's _always_ a previous */ + if (prev->node) + mark_ref_normal(prev->node->raw); + } + + if ((newfrag->ofs+newfrag->size) & (PAGE_CACHE_SIZE-1)) { + struct jffs2_node_frag *next = frag_next(newfrag); + + if (next) { + mark_ref_normal(fn->raw); + if (next->node) + mark_ref_normal(next->node->raw); + } + } + jffs2_dbg_fragtree_paranoia_check_nolock(f); + + return 0; +} + +void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state) +{ + spin_lock(&c->inocache_lock); + ic->state = state; + wake_up(&c->inocache_wq); + spin_unlock(&c->inocache_lock); +} + +/* During mount, this needs no locking. During normal operation, its + callers want to do other stuff while still holding the inocache_lock. + Rather than introducing special case get_ino_cache functions or + callbacks, we just let the caller do the locking itself. */ + +struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inode_cache *ret; + + ret = c->inocache_list[ino % c->inocache_hashsize]; + while (ret && ret->ino < ino) { + ret = ret->next; + } + + if (ret && ret->ino != ino) + ret = NULL; + + return ret; +} + +void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new) +{ + struct jffs2_inode_cache **prev; + + spin_lock(&c->inocache_lock); + if (!new->ino) + new->ino = ++c->highest_ino; + + dbg_inocache("add %p (ino #%u)\n", new, new->ino); + + prev = &c->inocache_list[new->ino % c->inocache_hashsize]; + + while ((*prev) && (*prev)->ino < new->ino) { + prev = &(*prev)->next; + } + new->next = *prev; + *prev = new; + + spin_unlock(&c->inocache_lock); +} + +void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old) +{ + struct jffs2_inode_cache **prev; + +#ifdef CONFIG_JFFS2_FS_XATTR + BUG_ON(old->xref); +#endif + dbg_inocache("del %p (ino #%u)\n", old, old->ino); + spin_lock(&c->inocache_lock); + + prev = &c->inocache_list[old->ino % c->inocache_hashsize]; + + while ((*prev) && (*prev)->ino < old->ino) { + prev = &(*prev)->next; + } + if ((*prev) == old) { + *prev = old->next; + } + + /* Free it now unless it's in READING or CLEARING state, which + are the transitions upon read_inode() and clear_inode(). The + rest of the time we know nobody else is looking at it, and + if it's held by read_inode() or clear_inode() they'll free it + for themselves. */ + if (old->state != INO_STATE_READING && old->state != INO_STATE_CLEARING) + jffs2_free_inode_cache(old); + + spin_unlock(&c->inocache_lock); +} + +void jffs2_free_ino_caches(struct jffs2_sb_info *c) +{ + int i; + struct jffs2_inode_cache *this, *next; + + for (i=0; i < c->inocache_hashsize; i++) { + this = c->inocache_list[i]; + while (this) { + next = this->next; + jffs2_xattr_free_inode(c, this); + jffs2_free_inode_cache(this); + this = next; + } + c->inocache_list[i] = NULL; + } +} + +void jffs2_free_raw_node_refs(struct jffs2_sb_info *c) +{ + int i; + struct jffs2_raw_node_ref *this, *next; + + for (i=0; inr_blocks; i++) { + this = c->blocks[i].first_node; + while (this) { + if (this[REFS_PER_BLOCK].flash_offset == REF_LINK_NODE) + next = this[REFS_PER_BLOCK].next_in_ino; + else + next = NULL; + + jffs2_free_refblock(this); + this = next; + } + c->blocks[i].first_node = c->blocks[i].last_node = NULL; + } +} + +struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset) +{ + /* The common case in lookup is that there will be a node + which precisely matches. So we go looking for that first */ + struct rb_node *next; + struct jffs2_node_frag *prev = NULL; + struct jffs2_node_frag *frag = NULL; + + dbg_fragtree2("root %p, offset %d\n", fragtree, offset); + + next = fragtree->rb_node; + + while(next) { + frag = rb_entry(next, struct jffs2_node_frag, rb); + + if (frag->ofs + frag->size <= offset) { + /* Remember the closest smaller match on the way down */ + if (!prev || frag->ofs > prev->ofs) + prev = frag; + next = frag->rb.rb_right; + } else if (frag->ofs > offset) { + next = frag->rb.rb_left; + } else { + return frag; + } + } + + /* Exact match not found. Go back up looking at each parent, + and return the closest smaller one */ + + if (prev) + dbg_fragtree2("no match. Returning frag %#04x-%#04x, closest previous\n", + prev->ofs, prev->ofs+prev->size); + else + dbg_fragtree2("returning NULL, empty fragtree\n"); + + return prev; +} + +/* Pass 'c' argument to indicate that nodes should be marked obsolete as + they're killed. */ +void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) +{ + struct jffs2_node_frag *frag, *next; + + dbg_fragtree("killing\n"); + rbtree_postorder_for_each_entry_safe(frag, next, root, rb) { + if (frag->node && !(--frag->node->frags)) { + /* Not a hole, and it's the final remaining frag + of this node. Free the node */ + if (c) + jffs2_mark_node_obsolete(c, frag->node->raw); + + jffs2_free_full_dnode(frag->node); + } + + jffs2_free_node_frag(frag); + cond_resched(); + } +} + +struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + struct jffs2_raw_node_ref *ref; + + BUG_ON(!jeb->allocated_refs); + jeb->allocated_refs--; + + ref = jeb->last_node; + + dbg_noderef("Last node at %p is (%08x,%p)\n", ref, ref->flash_offset, + ref->next_in_ino); + + while (ref->flash_offset != REF_EMPTY_NODE) { + if (ref->flash_offset == REF_LINK_NODE) + ref = ref->next_in_ino; + else + ref++; + } + + dbg_noderef("New ref is %p (%08x becomes %08x,%p) len 0x%x\n", ref, + ref->flash_offset, ofs, ref->next_in_ino, len); + + ref->flash_offset = ofs; + + if (!jeb->first_node) { + jeb->first_node = ref; + BUG_ON(ref_offset(ref) != jeb->offset); + } else if (unlikely(ref_offset(ref) != jeb->offset + c->sector_size - jeb->free_size)) { + uint32_t last_len = ref_totlen(c, jeb, jeb->last_node); + + JFFS2_ERROR("Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n", + ref, ref_offset(ref), ref_offset(ref)+len, + ref_offset(jeb->last_node), + ref_offset(jeb->last_node)+last_len); + BUG(); + } + jeb->last_node = ref; + + if (ic) { + ref->next_in_ino = ic->nodes; + ic->nodes = ref; + } else { + ref->next_in_ino = NULL; + } + + switch(ref_flags(ref)) { + case REF_UNCHECKED: + c->unchecked_size += len; + jeb->unchecked_size += len; + break; + + case REF_NORMAL: + case REF_PRISTINE: + c->used_size += len; + jeb->used_size += len; + break; + + case REF_OBSOLETE: + c->dirty_size += len; + jeb->dirty_size += len; + break; + } + c->free_size -= len; + jeb->free_size -= len; + +#ifdef TEST_TOTLEN + /* Set (and test) __totlen field... for now */ + ref->__totlen = len; + ref_totlen(c, jeb, ref); +#endif + return ref; +} + +/* No locking, no reservation of 'ref'. Do not use on a live file system */ +int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + uint32_t size) +{ + if (!size) + return 0; + if (unlikely(size > jeb->free_size)) { + pr_crit("Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n", + size, jeb->free_size, jeb->wasted_size); + BUG(); + } + /* REF_EMPTY_NODE is !obsolete, so that works OK */ + if (jeb->last_node && ref_obsolete(jeb->last_node)) { +#ifdef TEST_TOTLEN + jeb->last_node->__totlen += size; +#endif + c->dirty_size += size; + c->free_size -= size; + jeb->dirty_size += size; + jeb->free_size -= size; + } else { + uint32_t ofs = jeb->offset + c->sector_size - jeb->free_size; + ofs |= REF_OBSOLETE; + + jffs2_link_node_ref(c, jeb, ofs, size, NULL); + } + + return 0; +} + +/* Calculate totlen from surrounding nodes or eraseblock */ +static inline uint32_t __ref_totlen(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref) +{ + uint32_t ref_end; + struct jffs2_raw_node_ref *next_ref = ref_next(ref); + + if (next_ref) + ref_end = ref_offset(next_ref); + else { + if (!jeb) + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + + /* Last node in block. Use free_space */ + if (unlikely(ref != jeb->last_node)) { + pr_crit("ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n", + ref, ref_offset(ref), jeb->last_node, + jeb->last_node ? + ref_offset(jeb->last_node) : 0); + BUG(); + } + ref_end = jeb->offset + c->sector_size - jeb->free_size; + } + return ref_end - ref_offset(ref); +} + +uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref) +{ + uint32_t ret; + + ret = __ref_totlen(c, jeb, ref); + +#ifdef TEST_TOTLEN + if (unlikely(ret != ref->__totlen)) { + if (!jeb) + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + + pr_crit("Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n", + ref, ref_offset(ref), ref_offset(ref) + ref->__totlen, + ret, ref->__totlen); + if (ref_next(ref)) { + pr_crit("next %p (0x%08x-0x%08x)\n", + ref_next(ref), ref_offset(ref_next(ref)), + ref_offset(ref_next(ref)) + ref->__totlen); + } else + pr_crit("No next ref. jeb->last_node is %p\n", + jeb->last_node); + + pr_crit("jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", + jeb->wasted_size, jeb->dirty_size, jeb->used_size, + jeb->free_size); + +#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS) + __jffs2_dbg_dump_node_refs_nolock(c, jeb); +#endif + + WARN_ON(1); + + ret = ref->__totlen; + } +#endif /* TEST_TOTLEN */ + return ret; +} diff --git a/kernel/fs/jffs2/nodelist.h b/kernel/fs/jffs2/nodelist.h new file mode 100644 index 000000000..fa35ff79a --- /dev/null +++ b/kernel/fs/jffs2/nodelist.h @@ -0,0 +1,480 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef __JFFS2_NODELIST_H__ +#define __JFFS2_NODELIST_H__ + +#include +#include +#include +#include "jffs2_fs_sb.h" +#include "jffs2_fs_i.h" +#include "xattr.h" +#include "acl.h" +#include "summary.h" + +#ifdef __ECOS +#include "os-ecos.h" +#else +#include "os-linux.h" +#endif + +#define JFFS2_NATIVE_ENDIAN + +/* Note we handle mode bits conversion from JFFS2 (i.e. Linux) to/from + whatever OS we're actually running on here too. */ + +#if defined(JFFS2_NATIVE_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){x}) +#define cpu_to_je32(x) ((jint32_t){x}) +#define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)}) + +#define constant_cpu_to_je16(x) ((jint16_t){x}) +#define constant_cpu_to_je32(x) ((jint32_t){x}) + +#define je16_to_cpu(x) ((x).v16) +#define je32_to_cpu(x) ((x).v32) +#define jemode_to_cpu(x) (jffs2_to_os_mode((x).m)) +#elif defined(JFFS2_BIG_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){cpu_to_be16(x)}) +#define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)}) +#define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))}) + +#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_be16(x)}) +#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_be32(x)}) + +#define je16_to_cpu(x) (be16_to_cpu(x.v16)) +#define je32_to_cpu(x) (be32_to_cpu(x.v32)) +#define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m))) +#elif defined(JFFS2_LITTLE_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){cpu_to_le16(x)}) +#define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)}) +#define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))}) + +#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_le16(x)}) +#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_le32(x)}) + +#define je16_to_cpu(x) (le16_to_cpu(x.v16)) +#define je32_to_cpu(x) (le32_to_cpu(x.v32)) +#define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m))) +#else +#error wibble +#endif + +/* The minimal node header size */ +#define JFFS2_MIN_NODE_HEADER sizeof(struct jffs2_raw_dirent) + +/* + This is all we need to keep in-core for each raw node during normal + operation. As and when we do read_inode on a particular inode, we can + scan the nodes which are listed for it and build up a proper map of + which nodes are currently valid. JFFSv1 always used to keep that whole + map in core for each inode. +*/ +struct jffs2_raw_node_ref +{ + struct jffs2_raw_node_ref *next_in_ino; /* Points to the next raw_node_ref + for this object. If this _is_ the last, it points to the inode_cache, + xattr_ref or xattr_datum instead. The common part of those structures + has NULL in the first word. See jffs2_raw_ref_to_ic() below */ + uint32_t flash_offset; +#undef TEST_TOTLEN +#ifdef TEST_TOTLEN + uint32_t __totlen; /* This may die; use ref_totlen(c, jeb, ) below */ +#endif +}; + +#define REF_LINK_NODE ((int32_t)-1) +#define REF_EMPTY_NODE ((int32_t)-2) + +/* Use blocks of about 256 bytes */ +#define REFS_PER_BLOCK ((255/sizeof(struct jffs2_raw_node_ref))-1) + +static inline struct jffs2_raw_node_ref *ref_next(struct jffs2_raw_node_ref *ref) +{ + ref++; + + /* Link to another block of refs */ + if (ref->flash_offset == REF_LINK_NODE) { + ref = ref->next_in_ino; + if (!ref) + return ref; + } + + /* End of chain */ + if (ref->flash_offset == REF_EMPTY_NODE) + return NULL; + + return ref; +} + +static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw) +{ + while(raw->next_in_ino) + raw = raw->next_in_ino; + + /* NB. This can be a jffs2_xattr_datum or jffs2_xattr_ref and + not actually a jffs2_inode_cache. Check ->class */ + return ((struct jffs2_inode_cache *)raw); +} + + /* flash_offset & 3 always has to be zero, because nodes are + always aligned at 4 bytes. So we have a couple of extra bits + to play with, which indicate the node's status; see below: */ +#define REF_UNCHECKED 0 /* We haven't yet checked the CRC or built its inode */ +#define REF_OBSOLETE 1 /* Obsolete, can be completely ignored */ +#define REF_PRISTINE 2 /* Completely clean. GC without looking */ +#define REF_NORMAL 3 /* Possibly overlapped. Read the page and write again on GC */ +#define ref_flags(ref) ((ref)->flash_offset & 3) +#define ref_offset(ref) ((ref)->flash_offset & ~3) +#define ref_obsolete(ref) (((ref)->flash_offset & 3) == REF_OBSOLETE) +#define mark_ref_normal(ref) do { (ref)->flash_offset = ref_offset(ref) | REF_NORMAL; } while(0) + +/* Dirent nodes should be REF_PRISTINE only if they are not a deletion + dirent. Deletion dirents should be REF_NORMAL so that GC gets to + throw them away when appropriate */ +#define dirent_node_state(rd) ( (je32_to_cpu((rd)->ino)?REF_PRISTINE:REF_NORMAL) ) + +/* NB: REF_PRISTINE for an inode-less node (ref->next_in_ino == NULL) indicates + it is an unknown node of type JFFS2_NODETYPE_RWCOMPAT_COPY, so it'll get + copied. If you need to do anything different to GC inode-less nodes, then + you need to modify gc.c accordingly. */ + +/* For each inode in the filesystem, we need to keep a record of + nlink, because it would be a PITA to scan the whole directory tree + at read_inode() time to calculate it, and to keep sufficient information + in the raw_node_ref (basically both parent and child inode number for + dirent nodes) would take more space than this does. We also keep + a pointer to the first physical node which is part of this inode, too. +*/ +struct jffs2_inode_cache { + /* First part of structure is shared with other objects which + can terminate the raw node refs' next_in_ino list -- which + currently struct jffs2_xattr_datum and struct jffs2_xattr_ref. */ + + struct jffs2_full_dirent *scan_dents; /* Used during scan to hold + temporary lists of dirents, and later must be set to + NULL to mark the end of the raw_node_ref->next_in_ino + chain. */ + struct jffs2_raw_node_ref *nodes; + uint8_t class; /* It's used for identification */ + + /* end of shared structure */ + + uint8_t flags; + uint16_t state; + uint32_t ino; + struct jffs2_inode_cache *next; +#ifdef CONFIG_JFFS2_FS_XATTR + struct jffs2_xattr_ref *xref; +#endif + uint32_t pino_nlink; /* Directories store parent inode + here; other inodes store nlink. + Zero always means that it's + completely unlinked. */ +}; + +/* Inode states for 'state' above. We need the 'GC' state to prevent + someone from doing a read_inode() while we're moving a 'REF_PRISTINE' + node without going through all the iget() nonsense */ +#define INO_STATE_UNCHECKED 0 /* CRC checks not yet done */ +#define INO_STATE_CHECKING 1 /* CRC checks in progress */ +#define INO_STATE_PRESENT 2 /* In core */ +#define INO_STATE_CHECKEDABSENT 3 /* Checked, cleared again */ +#define INO_STATE_GC 4 /* GCing a 'pristine' node */ +#define INO_STATE_READING 5 /* In read_inode() */ +#define INO_STATE_CLEARING 6 /* In clear_inode() */ + +#define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */ + +#define RAWNODE_CLASS_INODE_CACHE 0 +#define RAWNODE_CLASS_XATTR_DATUM 1 +#define RAWNODE_CLASS_XATTR_REF 2 + +#define INOCACHE_HASHSIZE_MIN 128 +#define INOCACHE_HASHSIZE_MAX 1024 + +#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) + +/* + Larger representation of a raw node, kept in-core only when the + struct inode for this particular ino is instantiated. +*/ + +struct jffs2_full_dnode +{ + struct jffs2_raw_node_ref *raw; + uint32_t ofs; /* The offset to which the data of this node belongs */ + uint32_t size; + uint32_t frags; /* Number of fragments which currently refer + to this node. When this reaches zero, + the node is obsolete. */ +}; + +/* + Even larger representation of a raw node, kept in-core only while + we're actually building up the original map of which nodes go where, + in read_inode() +*/ +struct jffs2_tmp_dnode_info +{ + struct rb_node rb; + struct jffs2_full_dnode *fn; + uint32_t version; + uint32_t data_crc; + uint32_t partial_crc; + uint32_t csize; + uint16_t overlapped; +}; + +/* Temporary data structure used during readinode. */ +struct jffs2_readinode_info +{ + struct rb_root tn_root; + struct jffs2_tmp_dnode_info *mdata_tn; + uint32_t highest_version; + uint32_t latest_mctime; + uint32_t mctime_ver; + struct jffs2_full_dirent *fds; + struct jffs2_raw_node_ref *latest_ref; +}; + +struct jffs2_full_dirent +{ + struct jffs2_raw_node_ref *raw; + struct jffs2_full_dirent *next; + uint32_t version; + uint32_t ino; /* == zero for unlink */ + unsigned int nhash; + unsigned char type; + unsigned char name[0]; +}; + +/* + Fragments - used to build a map of which raw node to obtain + data from for each part of the ino +*/ +struct jffs2_node_frag +{ + struct rb_node rb; + struct jffs2_full_dnode *node; /* NULL for holes */ + uint32_t size; + uint32_t ofs; /* The offset to which this fragment belongs */ +}; + +struct jffs2_eraseblock +{ + struct list_head list; + int bad_count; + uint32_t offset; /* of this block in the MTD */ + + uint32_t unchecked_size; + uint32_t used_size; + uint32_t dirty_size; + uint32_t wasted_size; + uint32_t free_size; /* Note that sector_size - free_size + is the address of the first free space */ + uint32_t allocated_refs; + struct jffs2_raw_node_ref *first_node; + struct jffs2_raw_node_ref *last_node; + + struct jffs2_raw_node_ref *gc_node; /* Next node to be garbage collected */ +}; + +static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c) +{ + return ((c->flash_size / c->sector_size) * sizeof (struct jffs2_eraseblock)) > (128 * 1024); +} + +#define ref_totlen(a, b, c) __jffs2_ref_totlen((a), (b), (c)) + +#define ALLOC_NORMAL 0 /* Normal allocation */ +#define ALLOC_DELETION 1 /* Deletion node. Best to allow it */ +#define ALLOC_GC 2 /* Space requested for GC. Give it or die */ +#define ALLOC_NORETRY 3 /* For jffs2_write_dnode: On failure, return -EAGAIN instead of retrying */ + +/* How much dirty space before it goes on the very_dirty_list */ +#define VERYDIRTY(c, size) ((size) >= ((c)->sector_size / 2)) + +/* check if dirty space is more than 255 Byte */ +#define ISDIRTY(size) ((size) > sizeof (struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN) + +#define PAD(x) (((x)+3)&~3) + +static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) +{ + if (old_valid_dev(rdev)) { + jdev->old_id = cpu_to_je16(old_encode_dev(rdev)); + return sizeof(jdev->old_id); + } else { + jdev->new_id = cpu_to_je32(new_encode_dev(rdev)); + return sizeof(jdev->new_id); + } +} + +static inline struct jffs2_node_frag *frag_first(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + + if (!node) + return NULL; + + return rb_entry(node, struct jffs2_node_frag, rb); +} + +static inline struct jffs2_node_frag *frag_last(struct rb_root *root) +{ + struct rb_node *node = rb_last(root); + + if (!node) + return NULL; + + return rb_entry(node, struct jffs2_node_frag, rb); +} + +#define frag_next(frag) rb_entry(rb_next(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_prev(frag) rb_entry(rb_prev(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_parent(frag) rb_entry(rb_parent(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_left(frag) rb_entry((frag)->rb.rb_left, struct jffs2_node_frag, rb) +#define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb) +#define frag_erase(frag, list) rb_erase(&frag->rb, list); + +#define tn_next(tn) rb_entry(rb_next(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_prev(tn) rb_entry(rb_prev(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_parent(tn) rb_entry(rb_parent(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_left(tn) rb_entry((tn)->rb.rb_left, struct jffs2_tmp_dnode_info, rb) +#define tn_right(tn) rb_entry((tn)->rb.rb_right, struct jffs2_tmp_dnode_info, rb) +#define tn_erase(tn, list) rb_erase(&tn->rb, list); +#define tn_last(list) rb_entry(rb_last(list), struct jffs2_tmp_dnode_info, rb) +#define tn_first(list) rb_entry(rb_first(list), struct jffs2_tmp_dnode_info, rb) + +/* nodelist.c */ +void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list); +void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state); +struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t ino); +void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new); +void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old); +void jffs2_free_ino_caches(struct jffs2_sb_info *c); +void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); +struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); +void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); +int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); +uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); +struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic); +extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref); + +/* nodemgmt.c */ +int jffs2_thread_should_wake(struct jffs2_sb_info *c); +int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, int prio, uint32_t sumsize); +int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize); +struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic); +void jffs2_complete_reservation(struct jffs2_sb_info *c); +void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw); + +/* write.c */ +int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t mode, struct jffs2_raw_inode *ri); + +struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const unsigned char *data, + uint32_t datalen, int alloc_mode); +struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_dirent *rd, const unsigned char *name, + uint32_t namelen, int alloc_mode); +int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, unsigned char *buf, + uint32_t offset, uint32_t writelen, uint32_t *retlen); +int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const struct qstr *qstr); +int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, + int namelen, struct jffs2_inode_info *dead_f, uint32_t time); +int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, + uint8_t type, const char *name, int namelen, uint32_t time); + + +/* readinode.c */ +int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t ino, struct jffs2_raw_inode *latest_node); +int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); +void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); + +/* malloc.c */ +int jffs2_create_slab_caches(void); +void jffs2_destroy_slab_caches(void); + +struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize); +void jffs2_free_full_dirent(struct jffs2_full_dirent *); +struct jffs2_full_dnode *jffs2_alloc_full_dnode(void); +void jffs2_free_full_dnode(struct jffs2_full_dnode *); +struct jffs2_raw_dirent *jffs2_alloc_raw_dirent(void); +void jffs2_free_raw_dirent(struct jffs2_raw_dirent *); +struct jffs2_raw_inode *jffs2_alloc_raw_inode(void); +void jffs2_free_raw_inode(struct jffs2_raw_inode *); +struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void); +void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *); +int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int nr); +void jffs2_free_refblock(struct jffs2_raw_node_ref *); +struct jffs2_node_frag *jffs2_alloc_node_frag(void); +void jffs2_free_node_frag(struct jffs2_node_frag *); +struct jffs2_inode_cache *jffs2_alloc_inode_cache(void); +void jffs2_free_inode_cache(struct jffs2_inode_cache *); +#ifdef CONFIG_JFFS2_FS_XATTR +struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void); +void jffs2_free_xattr_datum(struct jffs2_xattr_datum *); +struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void); +void jffs2_free_xattr_ref(struct jffs2_xattr_ref *); +#endif + +/* gc.c */ +int jffs2_garbage_collect_pass(struct jffs2_sb_info *c); + +/* read.c */ +int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_full_dnode *fd, unsigned char *buf, + int ofs, int len); +int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *buf, uint32_t offset, uint32_t len); +char *jffs2_getlink(struct jffs2_sb_info *c, struct jffs2_inode_info *f); + +/* scan.c */ +int jffs2_scan_medium(struct jffs2_sb_info *c); +void jffs2_rotate_lists(struct jffs2_sb_info *c); +struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size); + +/* build.c */ +int jffs2_do_mount_fs(struct jffs2_sb_info *c); + +/* erase.c */ +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); +void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER +/* wbuf.c */ +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c); +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +#endif + +#include "debug.h" + +#endif /* __JFFS2_NODELIST_H__ */ diff --git a/kernel/fs/jffs2/nodemgmt.c b/kernel/fs/jffs2/nodemgmt.c new file mode 100644 index 000000000..b6bd4affd --- /dev/null +++ b/kernel/fs/jffs2/nodemgmt.c @@ -0,0 +1,883 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include /* For cond_resched() */ +#include "nodelist.h" +#include "debug.h" + +/* + * Check whether the user is allowed to write. + */ +static int jffs2_rp_can_write(struct jffs2_sb_info *c) +{ + uint32_t avail; + struct jffs2_mount_opts *opts = &c->mount_opts; + + avail = c->dirty_size + c->free_size + c->unchecked_size + + c->erasing_size - c->resv_blocks_write * c->sector_size + - c->nospc_dirty_size; + + if (avail < 2 * opts->rp_size) + jffs2_dbg(1, "rpsize %u, dirty_size %u, free_size %u, " + "erasing_size %u, unchecked_size %u, " + "nr_erasing_blocks %u, avail %u, resrv %u\n", + opts->rp_size, c->dirty_size, c->free_size, + c->erasing_size, c->unchecked_size, + c->nr_erasing_blocks, avail, c->nospc_dirty_size); + + if (avail > opts->rp_size) + return 1; + + /* Always allow root */ + if (capable(CAP_SYS_RESOURCE)) + return 1; + + jffs2_dbg(1, "forbid writing\n"); + return 0; +} + +/** + * jffs2_reserve_space - request physical space to write nodes to flash + * @c: superblock info + * @minsize: Minimum acceptable size of allocation + * @len: Returned value of allocation length + * @prio: Allocation type - ALLOC_{NORMAL,DELETION} + * + * Requests a block of physical space on the flash. Returns zero for success + * and puts 'len' into the appropriate place, or returns -ENOSPC or other + * error if appropriate. Doesn't return len since that's + * + * If it returns zero, jffs2_reserve_space() also downs the per-filesystem + * allocation semaphore, to prevent more than one allocation from being + * active at any time. The semaphore is later released by jffs2_commit_allocation() + * + * jffs2_reserve_space() may trigger garbage collection in order to make room + * for the requested allocation. + */ + +static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize); + +int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, int prio, uint32_t sumsize) +{ + int ret = -EAGAIN; + int blocksneeded = c->resv_blocks_write; + /* align it */ + minsize = PAD(minsize); + + jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); + mutex_lock(&c->alloc_sem); + + jffs2_dbg(1, "%s(): alloc sem got\n", __func__); + + spin_lock(&c->erase_completion_lock); + + /* + * Check if the free space is greater then size of the reserved pool. + * If not, only allow root to proceed with writing. + */ + if (prio != ALLOC_DELETION && !jffs2_rp_can_write(c)) { + ret = -ENOSPC; + goto out; + } + + /* this needs a little more thought (true :)) */ + while(ret == -EAGAIN) { + while(c->nr_free_blocks + c->nr_erasing_blocks < blocksneeded) { + uint32_t dirty, avail; + + /* calculate real dirty size + * dirty_size contains blocks on erase_pending_list + * those blocks are counted in c->nr_erasing_blocks. + * If one block is actually erased, it is not longer counted as dirty_space + * but it is counted in c->nr_erasing_blocks, so we add it and subtract it + * with c->nr_erasing_blocks * c->sector_size again. + * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks + * This helps us to force gc and pick eventually a clean block to spread the load. + * We add unchecked_size here, as we hopefully will find some space to use. + * This will affect the sum only once, as gc first finishes checking + * of nodes. + */ + dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size + c->unchecked_size; + if (dirty < c->nospc_dirty_size) { + if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { + jffs2_dbg(1, "%s(): Low on dirty space to GC, but it's a deletion. Allowing...\n", + __func__); + break; + } + jffs2_dbg(1, "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n", + dirty, c->unchecked_size, + c->sector_size); + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + /* Calc possibly available space. Possibly available means that we + * don't know, if unchecked size contains obsoleted nodes, which could give us some + * more usable space. This will affect the sum only once, as gc first finishes checking + * of nodes. + + Return -ENOSPC, if the maximum possibly available space is less or equal than + * blocksneeded * sector_size. + * This blocks endless gc looping on a filesystem, which is nearly full, even if + * the check above passes. + */ + avail = c->free_size + c->dirty_size + c->erasing_size + c->unchecked_size; + if ( (avail / c->sector_size) <= blocksneeded) { + if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { + jffs2_dbg(1, "%s(): Low on possibly available space, but it's a deletion. Allowing...\n", + __func__); + break; + } + + jffs2_dbg(1, "max. available size 0x%08x < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n", + avail, blocksneeded * c->sector_size); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + mutex_unlock(&c->alloc_sem); + + jffs2_dbg(1, "Triggering GC pass. nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n", + c->nr_free_blocks, c->nr_erasing_blocks, + c->free_size, c->dirty_size, c->wasted_size, + c->used_size, c->erasing_size, c->bad_size, + c->free_size + c->dirty_size + + c->wasted_size + c->used_size + + c->erasing_size + c->bad_size, + c->flash_size); + spin_unlock(&c->erase_completion_lock); + + ret = jffs2_garbage_collect_pass(c); + + if (ret == -EAGAIN) { + spin_lock(&c->erase_completion_lock); + if (c->nr_erasing_blocks && + list_empty(&c->erase_pending_list) && + list_empty(&c->erase_complete_list)) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&c->erase_wait, &wait); + jffs2_dbg(1, "%s waiting for erase to complete\n", + __func__); + spin_unlock(&c->erase_completion_lock); + + schedule(); + remove_wait_queue(&c->erase_wait, &wait); + } else + spin_unlock(&c->erase_completion_lock); + } else if (ret) + return ret; + + cond_resched(); + + if (signal_pending(current)) + return -EINTR; + + mutex_lock(&c->alloc_sem); + spin_lock(&c->erase_completion_lock); + } + + ret = jffs2_do_reserve_space(c, minsize, len, sumsize); + if (ret) { + jffs2_dbg(1, "%s(): ret is %d\n", __func__, ret); + } + } + +out: + spin_unlock(&c->erase_completion_lock); + if (!ret) + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + if (ret) + mutex_unlock(&c->alloc_sem); + return ret; +} + +int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize) +{ + int ret; + minsize = PAD(minsize); + + jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); + + while (true) { + spin_lock(&c->erase_completion_lock); + ret = jffs2_do_reserve_space(c, minsize, len, sumsize); + if (ret) { + jffs2_dbg(1, "%s(): looping, ret is %d\n", + __func__, ret); + } + spin_unlock(&c->erase_completion_lock); + + if (ret == -EAGAIN) + cond_resched(); + else + break; + } + if (!ret) + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + + return ret; +} + + +/* Classify nextblock (clean, dirty of verydirty) and force to select an other one */ + +static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + + if (c->nextblock == NULL) { + jffs2_dbg(1, "%s(): Erase block at 0x%08x has already been placed in a list\n", + __func__, jeb->offset); + return; + } + /* Check, if we have a dirty block now, or if it was dirty already */ + if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) { + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->dirty_size += jeb->wasted_size; + jeb->wasted_size = 0; + if (VERYDIRTY(c, jeb->dirty_size)) { + jffs2_dbg(1, "Adding full erase block at 0x%08x to very_dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); + list_add_tail(&jeb->list, &c->very_dirty_list); + } else { + jffs2_dbg(1, "Adding full erase block at 0x%08x to dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); + list_add_tail(&jeb->list, &c->dirty_list); + } + } else { + jffs2_dbg(1, "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); + list_add_tail(&jeb->list, &c->clean_list); + } + c->nextblock = NULL; + +} + +/* Select a new jeb for nextblock */ + +static int jffs2_find_nextblock(struct jffs2_sb_info *c) +{ + struct list_head *next; + + /* Take the next block off the 'free' list */ + + if (list_empty(&c->free_list)) { + + if (!c->nr_erasing_blocks && + !list_empty(&c->erasable_list)) { + struct jffs2_eraseblock *ejeb; + + ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); + list_move_tail(&ejeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + jffs2_dbg(1, "%s(): Triggering erase of erasable block at 0x%08x\n", + __func__, ejeb->offset); + } + + if (!c->nr_erasing_blocks && + !list_empty(&c->erasable_pending_wbuf_list)) { + jffs2_dbg(1, "%s(): Flushing write buffer\n", + __func__); + /* c->nextblock is NULL, no update to c->nextblock allowed */ + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + /* Have another go. It'll be on the erasable_list now */ + return -EAGAIN; + } + + if (!c->nr_erasing_blocks) { + /* Ouch. We're in GC, or we wouldn't have got here. + And there's no space left. At all. */ + pr_crit("Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n", + c->nr_erasing_blocks, c->nr_free_blocks, + list_empty(&c->erasable_list) ? "yes" : "no", + list_empty(&c->erasing_list) ? "yes" : "no", + list_empty(&c->erase_pending_list) ? "yes" : "no"); + return -ENOSPC; + } + + spin_unlock(&c->erase_completion_lock); + /* Don't wait for it; just erase one right now */ + jffs2_erase_pending_blocks(c, 1); + spin_lock(&c->erase_completion_lock); + + /* An erase may have failed, decreasing the + amount of free space available. So we must + restart from the beginning */ + return -EAGAIN; + } + + next = c->free_list.next; + list_del(next); + c->nextblock = list_entry(next, struct jffs2_eraseblock, list); + c->nr_free_blocks--; + + jffs2_sum_reset_collected(c->summary); /* reset collected summary */ + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + /* adjust write buffer offset, else we get a non contiguous write bug */ + if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len) + c->wbuf_ofs = 0xffffffff; +#endif + + jffs2_dbg(1, "%s(): new nextblock = 0x%08x\n", + __func__, c->nextblock->offset); + + return 0; +} + +/* Called with alloc sem _and_ erase_completion_lock */ +static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize) +{ + struct jffs2_eraseblock *jeb = c->nextblock; + uint32_t reserved_size; /* for summary information at the end of the jeb */ + int ret; + + restart: + reserved_size = 0; + + if (jffs2_sum_active() && (sumsize != JFFS2_SUMMARY_NOSUM_SIZE)) { + /* NOSUM_SIZE means not to generate summary */ + + if (jeb) { + reserved_size = PAD(sumsize + c->summary->sum_size + JFFS2_SUMMARY_FRAME_SIZE); + dbg_summary("minsize=%d , jeb->free=%d ," + "summary->size=%d , sumsize=%d\n", + minsize, jeb->free_size, + c->summary->sum_size, sumsize); + } + + /* Is there enough space for writing out the current node, or we have to + write out summary information now, close this jeb and select new nextblock? */ + if (jeb && (PAD(minsize) + PAD(c->summary->sum_size + sumsize + + JFFS2_SUMMARY_FRAME_SIZE) > jeb->free_size)) { + + /* Has summary been disabled for this jeb? */ + if (jffs2_sum_is_disabled(c->summary)) { + sumsize = JFFS2_SUMMARY_NOSUM_SIZE; + goto restart; + } + + /* Writing out the collected summary information */ + dbg_summary("generating summary for 0x%08x.\n", jeb->offset); + ret = jffs2_sum_write_sumnode(c); + + if (ret) + return ret; + + if (jffs2_sum_is_disabled(c->summary)) { + /* jffs2_write_sumnode() couldn't write out the summary information + diabling summary for this jeb and free the collected information + */ + sumsize = JFFS2_SUMMARY_NOSUM_SIZE; + goto restart; + } + + jffs2_close_nextblock(c, jeb); + jeb = NULL; + /* keep always valid value in reserved_size */ + reserved_size = PAD(sumsize + c->summary->sum_size + JFFS2_SUMMARY_FRAME_SIZE); + } + } else { + if (jeb && minsize > jeb->free_size) { + uint32_t waste; + + /* Skip the end of this block and file it as having some dirty space */ + /* If there's a pending write to it, flush now */ + + if (jffs2_wbuf_dirty(c)) { + spin_unlock(&c->erase_completion_lock); + jffs2_dbg(1, "%s(): Flushing write buffer\n", + __func__); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + jeb = c->nextblock; + goto restart; + } + + spin_unlock(&c->erase_completion_lock); + + ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); + + /* Just lock it again and continue. Nothing much can change because + we hold c->alloc_sem anyway. In fact, it's not entirely clear why + we hold c->erase_completion_lock in the majority of this function... + but that's a question for another (more caffeine-rich) day. */ + spin_lock(&c->erase_completion_lock); + + if (ret) + return ret; + + waste = jeb->free_size; + jffs2_link_node_ref(c, jeb, + (jeb->offset + c->sector_size - waste) | REF_OBSOLETE, + waste, NULL); + /* FIXME: that made it count as dirty. Convert to wasted */ + jeb->dirty_size -= waste; + c->dirty_size -= waste; + jeb->wasted_size += waste; + c->wasted_size += waste; + + jffs2_close_nextblock(c, jeb); + jeb = NULL; + } + } + + if (!jeb) { + + ret = jffs2_find_nextblock(c); + if (ret) + return ret; + + jeb = c->nextblock; + + if (jeb->free_size != c->sector_size - c->cleanmarker_size) { + pr_warn("Eep. Block 0x%08x taken from free_list had free_size of 0x%08x!!\n", + jeb->offset, jeb->free_size); + goto restart; + } + } + /* OK, jeb (==c->nextblock) is now pointing at a block which definitely has + enough space */ + *len = jeb->free_size - reserved_size; + + if (c->cleanmarker_size && jeb->used_size == c->cleanmarker_size && + !jeb->first_node->next_in_ino) { + /* Only node in it beforehand was a CLEANMARKER node (we think). + So mark it obsolete now that there's going to be another node + in the block. This will reduce used_size to zero but We've + already set c->nextblock so that jffs2_mark_node_obsolete() + won't try to refile it to the dirty_list. + */ + spin_unlock(&c->erase_completion_lock); + jffs2_mark_node_obsolete(c, jeb->first_node); + spin_lock(&c->erase_completion_lock); + } + + jffs2_dbg(1, "%s(): Giving 0x%x bytes at 0x%x\n", + __func__, + *len, jeb->offset + (c->sector_size - jeb->free_size)); + return 0; +} + +/** + * jffs2_add_physical_node_ref - add a physical node reference to the list + * @c: superblock info + * @new: new node reference to add + * @len: length of this physical node + * + * Should only be used to report nodes for which space has been allocated + * by jffs2_reserve_space. + * + * Must be called with the alloc_sem held. + */ + +struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *new; + + jeb = &c->blocks[ofs / c->sector_size]; + + jffs2_dbg(1, "%s(): Node at 0x%x(%d), size 0x%x\n", + __func__, ofs & ~3, ofs & 3, len); +#if 1 + /* Allow non-obsolete nodes only to be added at the end of c->nextblock, + if c->nextblock is set. Note that wbuf.c will file obsolete nodes + even after refiling c->nextblock */ + if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE)) + && (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) { + pr_warn("argh. node added in wrong place at 0x%08x(%d)\n", + ofs & ~3, ofs & 3); + if (c->nextblock) + pr_warn("nextblock 0x%08x", c->nextblock->offset); + else + pr_warn("No nextblock"); + pr_cont(", expected at %08x\n", + jeb->offset + (c->sector_size - jeb->free_size)); + return ERR_PTR(-EINVAL); + } +#endif + spin_lock(&c->erase_completion_lock); + + new = jffs2_link_node_ref(c, jeb, ofs, len, ic); + + if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) { + /* If it lives on the dirty_list, jffs2_reserve_space will put it there */ + jffs2_dbg(1, "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); + if (jffs2_wbuf_dirty(c)) { + /* Flush the last write in the block if it's outstanding */ + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + } + + list_add_tail(&jeb->list, &c->clean_list); + c->nextblock = NULL; + } + jffs2_dbg_acct_sanity_check_nolock(c,jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + spin_unlock(&c->erase_completion_lock); + + return new; +} + + +void jffs2_complete_reservation(struct jffs2_sb_info *c) +{ + jffs2_dbg(1, "jffs2_complete_reservation()\n"); + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); +} + +static inline int on_list(struct list_head *obj, struct list_head *head) +{ + struct list_head *this; + + list_for_each(this, head) { + if (this == obj) { + jffs2_dbg(1, "%p is on list at %p\n", obj, head); + return 1; + + } + } + return 0; +} + +void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref) +{ + struct jffs2_eraseblock *jeb; + int blocknr; + struct jffs2_unknown_node n; + int ret, addedsize; + size_t retlen; + uint32_t freed_len; + + if(unlikely(!ref)) { + pr_notice("EEEEEK. jffs2_mark_node_obsolete called with NULL node\n"); + return; + } + if (ref_obsolete(ref)) { + jffs2_dbg(1, "%s(): called with already obsolete node at 0x%08x\n", + __func__, ref_offset(ref)); + return; + } + blocknr = ref->flash_offset / c->sector_size; + if (blocknr >= c->nr_blocks) { + pr_notice("raw node at 0x%08x is off the end of device!\n", + ref->flash_offset); + BUG(); + } + jeb = &c->blocks[blocknr]; + + if (jffs2_can_mark_obsolete(c) && !jffs2_is_readonly(c) && + !(c->flags & (JFFS2_SB_FLAG_SCANNING | JFFS2_SB_FLAG_BUILDING))) { + /* Hm. This may confuse static lock analysis. If any of the above + three conditions is false, we're going to return from this + function without actually obliterating any nodes or freeing + any jffs2_raw_node_refs. So we don't need to stop erases from + happening, or protect against people holding an obsolete + jffs2_raw_node_ref without the erase_completion_lock. */ + mutex_lock(&c->erase_free_sem); + } + + spin_lock(&c->erase_completion_lock); + + freed_len = ref_totlen(c, jeb, ref); + + if (ref_flags(ref) == REF_UNCHECKED) { + D1(if (unlikely(jeb->unchecked_size < freed_len)) { + pr_notice("raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n", + freed_len, blocknr, + ref->flash_offset, jeb->used_size); + BUG(); + }) + jffs2_dbg(1, "Obsoleting previously unchecked node at 0x%08x of len %x\n", + ref_offset(ref), freed_len); + jeb->unchecked_size -= freed_len; + c->unchecked_size -= freed_len; + } else { + D1(if (unlikely(jeb->used_size < freed_len)) { + pr_notice("raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n", + freed_len, blocknr, + ref->flash_offset, jeb->used_size); + BUG(); + }) + jffs2_dbg(1, "Obsoleting node at 0x%08x of len %#x: ", + ref_offset(ref), freed_len); + jeb->used_size -= freed_len; + c->used_size -= freed_len; + } + + // Take care, that wasted size is taken into concern + if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) { + jffs2_dbg(1, "Dirtying\n"); + addedsize = freed_len; + jeb->dirty_size += freed_len; + c->dirty_size += freed_len; + + /* Convert wasted space to dirty, if not a bad block */ + if (jeb->wasted_size) { + if (on_list(&jeb->list, &c->bad_used_list)) { + jffs2_dbg(1, "Leaving block at %08x on the bad_used_list\n", + jeb->offset); + addedsize = 0; /* To fool the refiling code later */ + } else { + jffs2_dbg(1, "Converting %d bytes of wasted space to dirty in block at %08x\n", + jeb->wasted_size, jeb->offset); + addedsize += jeb->wasted_size; + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + } + } + } else { + jffs2_dbg(1, "Wasting\n"); + addedsize = 0; + jeb->wasted_size += freed_len; + c->wasted_size += freed_len; + } + ref->flash_offset = ref_offset(ref) | REF_OBSOLETE; + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + if (c->flags & JFFS2_SB_FLAG_SCANNING) { + /* Flash scanning is in progress. Don't muck about with the block + lists because they're not ready yet, and don't actually + obliterate nodes that look obsolete. If they weren't + marked obsolete on the flash at the time they _became_ + obsolete, there was probably a reason for that. */ + spin_unlock(&c->erase_completion_lock); + /* We didn't lock the erase_free_sem */ + return; + } + + if (jeb == c->nextblock) { + jffs2_dbg(2, "Not moving nextblock 0x%08x to dirty/erase_pending list\n", + jeb->offset); + } else if (!jeb->used_size && !jeb->unchecked_size) { + if (jeb == c->gcblock) { + jffs2_dbg(1, "gcblock at 0x%08x completely dirtied. Clearing gcblock...\n", + jeb->offset); + c->gcblock = NULL; + } else { + jffs2_dbg(1, "Eraseblock at 0x%08x completely dirtied. Removing from (dirty?) list...\n", + jeb->offset); + list_del(&jeb->list); + } + if (jffs2_wbuf_dirty(c)) { + jffs2_dbg(1, "...and adding to erasable_pending_wbuf_list\n"); + list_add_tail(&jeb->list, &c->erasable_pending_wbuf_list); + } else { + if (jiffies & 127) { + /* Most of the time, we just erase it immediately. Otherwise we + spend ages scanning it on mount, etc. */ + jffs2_dbg(1, "...and adding to erase_pending_list\n"); + list_add_tail(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } else { + /* Sometimes, however, we leave it elsewhere so it doesn't get + immediately reused, and we spread the load a bit. */ + jffs2_dbg(1, "...and adding to erasable_list\n"); + list_add_tail(&jeb->list, &c->erasable_list); + } + } + jffs2_dbg(1, "Done OK\n"); + } else if (jeb == c->gcblock) { + jffs2_dbg(2, "Not moving gcblock 0x%08x to dirty_list\n", + jeb->offset); + } else if (ISDIRTY(jeb->dirty_size) && !ISDIRTY(jeb->dirty_size - addedsize)) { + jffs2_dbg(1, "Eraseblock at 0x%08x is freshly dirtied. Removing from clean list...\n", + jeb->offset); + list_del(&jeb->list); + jffs2_dbg(1, "...and adding to dirty_list\n"); + list_add_tail(&jeb->list, &c->dirty_list); + } else if (VERYDIRTY(c, jeb->dirty_size) && + !VERYDIRTY(c, jeb->dirty_size - addedsize)) { + jffs2_dbg(1, "Eraseblock at 0x%08x is now very dirty. Removing from dirty list...\n", + jeb->offset); + list_del(&jeb->list); + jffs2_dbg(1, "...and adding to very_dirty_list\n"); + list_add_tail(&jeb->list, &c->very_dirty_list); + } else { + jffs2_dbg(1, "Eraseblock at 0x%08x not moved anywhere. (free 0x%08x, dirty 0x%08x, used 0x%08x)\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); + } + + spin_unlock(&c->erase_completion_lock); + + if (!jffs2_can_mark_obsolete(c) || jffs2_is_readonly(c) || + (c->flags & JFFS2_SB_FLAG_BUILDING)) { + /* We didn't lock the erase_free_sem */ + return; + } + + /* The erase_free_sem is locked, and has been since before we marked the node obsolete + and potentially put its eraseblock onto the erase_pending_list. Thus, we know that + the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet + by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */ + + jffs2_dbg(1, "obliterating obsoleted node at 0x%08x\n", + ref_offset(ref)); + ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n); + if (ret) { + pr_warn("Read error reading from obsoleted node at 0x%08x: %d\n", + ref_offset(ref), ret); + goto out_erase_sem; + } + if (retlen != sizeof(n)) { + pr_warn("Short read from obsoleted node at 0x%08x: %zd\n", + ref_offset(ref), retlen); + goto out_erase_sem; + } + if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) { + pr_warn("Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", + je32_to_cpu(n.totlen), freed_len); + goto out_erase_sem; + } + if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) { + jffs2_dbg(1, "Node at 0x%08x was already marked obsolete (nodetype 0x%04x)\n", + ref_offset(ref), je16_to_cpu(n.nodetype)); + goto out_erase_sem; + } + /* XXX FIXME: This is ugly now */ + n.nodetype = cpu_to_je16(je16_to_cpu(n.nodetype) & ~JFFS2_NODE_ACCURATE); + ret = jffs2_flash_write(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n); + if (ret) { + pr_warn("Write error in obliterating obsoleted node at 0x%08x: %d\n", + ref_offset(ref), ret); + goto out_erase_sem; + } + if (retlen != sizeof(n)) { + pr_warn("Short write in obliterating obsoleted node at 0x%08x: %zd\n", + ref_offset(ref), retlen); + goto out_erase_sem; + } + + /* Nodes which have been marked obsolete no longer need to be + associated with any inode. Remove them from the per-inode list. + + Note we can't do this for NAND at the moment because we need + obsolete dirent nodes to stay on the lists, because of the + horridness in jffs2_garbage_collect_deletion_dirent(). Also + because we delete the inocache, and on NAND we need that to + stay around until all the nodes are actually erased, in order + to stop us from giving the same inode number to another newly + created inode. */ + if (ref->next_in_ino) { + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref **p; + + spin_lock(&c->erase_completion_lock); + + ic = jffs2_raw_ref_to_ic(ref); + for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino)) + ; + + *p = ref->next_in_ino; + ref->next_in_ino = NULL; + + switch (ic->class) { +#ifdef CONFIG_JFFS2_FS_XATTR + case RAWNODE_CLASS_XATTR_DATUM: + jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + break; + case RAWNODE_CLASS_XATTR_REF: + jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + break; +#endif + default: + if (ic->nodes == (void *)ic && ic->pino_nlink == 0) + jffs2_del_ino_cache(c, ic); + break; + } + spin_unlock(&c->erase_completion_lock); + } + + out_erase_sem: + mutex_unlock(&c->erase_free_sem); +} + +int jffs2_thread_should_wake(struct jffs2_sb_info *c) +{ + int ret = 0; + uint32_t dirty; + int nr_very_dirty = 0; + struct jffs2_eraseblock *jeb; + + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) + return 1; + + if (c->unchecked_size) { + jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", + c->unchecked_size, c->checked_ino); + return 1; + } + + /* dirty_size contains blocks on erase_pending_list + * those blocks are counted in c->nr_erasing_blocks. + * If one block is actually erased, it is not longer counted as dirty_space + * but it is counted in c->nr_erasing_blocks, so we add it and subtract it + * with c->nr_erasing_blocks * c->sector_size again. + * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks + * This helps us to force gc and pick eventually a clean block to spread the load. + */ + dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size; + + if (c->nr_free_blocks + c->nr_erasing_blocks < c->resv_blocks_gctrigger && + (dirty > c->nospc_dirty_size)) + ret = 1; + + list_for_each_entry(jeb, &c->very_dirty_list, list) { + nr_very_dirty++; + if (nr_very_dirty == c->vdirty_blocks_gctrigger) { + ret = 1; + /* In debug mode, actually go through and count them all */ + D1(continue); + break; + } + } + + jffs2_dbg(1, "%s(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n", + __func__, c->nr_free_blocks, c->nr_erasing_blocks, + c->dirty_size, nr_very_dirty, ret ? "yes" : "no"); + + return ret; +} diff --git a/kernel/fs/jffs2/os-linux.h b/kernel/fs/jffs2/os-linux.h new file mode 100644 index 000000000..d200a9b8f --- /dev/null +++ b/kernel/fs/jffs2/os-linux.h @@ -0,0 +1,199 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef __JFFS2_OS_LINUX_H__ +#define __JFFS2_OS_LINUX_H__ + +/* JFFS2 uses Linux mode bits natively -- no need for conversion */ +#define os_to_jffs2_mode(x) (x) +#define jffs2_to_os_mode(x) (x) + +struct kstatfs; +struct kvec; + +#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) +#define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) +#define JFFS2_SB_INFO(sb) (sb->s_fs_info) +#define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) + + +#define JFFS2_F_I_SIZE(f) (OFNI_EDONI_2SFFJ(f)->i_size) +#define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode) +#define JFFS2_F_I_UID(f) (i_uid_read(OFNI_EDONI_2SFFJ(f))) +#define JFFS2_F_I_GID(f) (i_gid_read(OFNI_EDONI_2SFFJ(f))) +#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev) + +#define ITIME(sec) ((struct timespec){sec, 0}) +#define I_SEC(tv) ((tv).tv_sec) +#define JFFS2_F_I_CTIME(f) (OFNI_EDONI_2SFFJ(f)->i_ctime.tv_sec) +#define JFFS2_F_I_MTIME(f) (OFNI_EDONI_2SFFJ(f)->i_mtime.tv_sec) +#define JFFS2_F_I_ATIME(f) (OFNI_EDONI_2SFFJ(f)->i_atime.tv_sec) + +#define sleep_on_spinunlock(wq, s) \ + do { \ + DECLARE_WAITQUEUE(__wait, current); \ + add_wait_queue((wq), &__wait); \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + spin_unlock(s); \ + schedule(); \ + remove_wait_queue((wq), &__wait); \ + } while(0) + +static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) +{ + f->highest_version = 0; + f->fragtree = RB_ROOT; + f->metadata = NULL; + f->dents = NULL; + f->target = NULL; + f->flags = 0; + f->usercompr = 0; +} + + +#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY) + +#define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) ) +#ifndef CONFIG_JFFS2_FS_WRITEBUFFER + + +#ifdef CONFIG_JFFS2_SUMMARY +#define jffs2_can_mark_obsolete(c) (0) +#else +#define jffs2_can_mark_obsolete(c) (1) +#endif + +#define jffs2_is_writebuffered(c) (0) +#define jffs2_cleanmarker_oob(c) (0) +#define jffs2_write_nand_cleanmarker(c,jeb) (-EIO) + +#define jffs2_flash_write(c, ofs, len, retlen, buf) jffs2_flash_direct_write(c, ofs, len, retlen, buf) +#define jffs2_flash_read(c, ofs, len, retlen, buf) (mtd_read((c)->mtd, ofs, len, retlen, buf)) +#define jffs2_flush_wbuf_pad(c) ({ do{} while(0); (void)(c), 0; }) +#define jffs2_flush_wbuf_gc(c, i) ({ do{} while(0); (void)(c), (void) i, 0; }) +#define jffs2_write_nand_badblock(c,jeb,bad_offset) (1) +#define jffs2_nand_flash_setup(c) (0) +#define jffs2_nand_flash_cleanup(c) do {} while(0) +#define jffs2_wbuf_dirty(c) (0) +#define jffs2_flash_writev(a,b,c,d,e,f) jffs2_flash_direct_writev(a,b,c,d,e) +#define jffs2_wbuf_timeout NULL +#define jffs2_wbuf_process NULL +#define jffs2_dataflash(c) (0) +#define jffs2_dataflash_setup(c) (0) +#define jffs2_dataflash_cleanup(c) do {} while (0) +#define jffs2_nor_wbuf_flash(c) (0) +#define jffs2_nor_wbuf_flash_setup(c) (0) +#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) +#define jffs2_ubivol(c) (0) +#define jffs2_ubivol_setup(c) (0) +#define jffs2_ubivol_cleanup(c) do {} while (0) +#define jffs2_dirty_trigger(c) do {} while (0) + +#else /* NAND and/or ECC'd NOR support present */ + +#define jffs2_is_writebuffered(c) (c->wbuf != NULL) + +#ifdef CONFIG_JFFS2_SUMMARY +#define jffs2_can_mark_obsolete(c) (0) +#else +#define jffs2_can_mark_obsolete(c) (c->mtd->flags & (MTD_BIT_WRITEABLE)) +#endif + +#define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH) + +#define jffs2_wbuf_dirty(c) (!!(c)->wbuf_len) + +/* wbuf.c */ +int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino); +int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf); +int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, u_char *buf); +int jffs2_check_oob_empty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,int mode); +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset); +void jffs2_wbuf_timeout(unsigned long data); +void jffs2_wbuf_process(void *data); +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c); +int jffs2_nand_flash_setup(struct jffs2_sb_info *c); +void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c); + +#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) +int jffs2_dataflash_setup(struct jffs2_sb_info *c); +void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); +#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME) +int jffs2_ubivol_setup(struct jffs2_sb_info *c); +void jffs2_ubivol_cleanup(struct jffs2_sb_info *c); + +#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) +int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); +void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c); +void jffs2_dirty_trigger(struct jffs2_sb_info *c); + +#endif /* WRITEBUFFER */ + +/* background.c */ +int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c); +void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c); +void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c); + +/* dir.c */ +extern const struct file_operations jffs2_dir_operations; +extern const struct inode_operations jffs2_dir_inode_operations; + +/* file.c */ +extern const struct file_operations jffs2_file_operations; +extern const struct inode_operations jffs2_file_inode_operations; +extern const struct address_space_operations jffs2_file_address_operations; +int jffs2_fsync(struct file *, loff_t, loff_t, int); +int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); + +/* ioctl.c */ +long jffs2_ioctl(struct file *, unsigned int, unsigned long); + +/* symlink.c */ +extern const struct inode_operations jffs2_symlink_inode_operations; + +/* fs.c */ +int jffs2_setattr (struct dentry *, struct iattr *); +int jffs2_do_setattr (struct inode *, struct iattr *); +struct inode *jffs2_iget(struct super_block *, unsigned long); +void jffs2_evict_inode (struct inode *); +void jffs2_dirty_inode(struct inode *inode, int flags); +struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, + struct jffs2_raw_inode *ri); +int jffs2_statfs (struct dentry *, struct kstatfs *); +int jffs2_do_remount_fs(struct super_block *, int *, char *); +int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); +void jffs2_gc_release_inode(struct jffs2_sb_info *c, + struct jffs2_inode_info *f); +struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, + int inum, int unlinked); + +unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + unsigned long offset, + unsigned long *priv); +void jffs2_gc_release_page(struct jffs2_sb_info *c, + unsigned char *pg, + unsigned long *priv); +void jffs2_flash_cleanup(struct jffs2_sb_info *c); + + +/* writev.c */ +int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen); +int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf); + +#endif /* __JFFS2_OS_LINUX_H__ */ + + diff --git a/kernel/fs/jffs2/read.c b/kernel/fs/jffs2/read.c new file mode 100644 index 000000000..0b042b1fc --- /dev/null +++ b/kernel/fs/jffs2/read.c @@ -0,0 +1,228 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + +int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_full_dnode *fd, unsigned char *buf, + int ofs, int len) +{ + struct jffs2_raw_inode *ri; + size_t readlen; + uint32_t crc; + unsigned char *decomprbuf = NULL; + unsigned char *readbuf = NULL; + int ret = 0; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(fd->raw), sizeof(*ri), &readlen, (char *)ri); + if (ret) { + jffs2_free_raw_inode(ri); + pr_warn("Error reading node from 0x%08x: %d\n", + ref_offset(fd->raw), ret); + return ret; + } + if (readlen != sizeof(*ri)) { + jffs2_free_raw_inode(ri); + pr_warn("Short read from 0x%08x: wanted 0x%zx bytes, got 0x%zx\n", + ref_offset(fd->raw), sizeof(*ri), readlen); + return -EIO; + } + crc = crc32(0, ri, sizeof(*ri)-8); + + jffs2_dbg(1, "Node read from %08x: node_crc %08x, calculated CRC %08x. dsize %x, csize %x, offset %x, buf %p\n", + ref_offset(fd->raw), je32_to_cpu(ri->node_crc), + crc, je32_to_cpu(ri->dsize), je32_to_cpu(ri->csize), + je32_to_cpu(ri->offset), buf); + if (crc != je32_to_cpu(ri->node_crc)) { + pr_warn("Node CRC %08x != calculated CRC %08x for node at %08x\n", + je32_to_cpu(ri->node_crc), crc, ref_offset(fd->raw)); + ret = -EIO; + goto out_ri; + } + /* There was a bug where we wrote hole nodes out with csize/dsize + swapped. Deal with it */ + if (ri->compr == JFFS2_COMPR_ZERO && !je32_to_cpu(ri->dsize) && + je32_to_cpu(ri->csize)) { + ri->dsize = ri->csize; + ri->csize = cpu_to_je32(0); + } + + D1(if(ofs + len > je32_to_cpu(ri->dsize)) { + pr_warn("jffs2_read_dnode() asked for %d bytes at %d from %d-byte node\n", + len, ofs, je32_to_cpu(ri->dsize)); + ret = -EINVAL; + goto out_ri; + }); + + + if (ri->compr == JFFS2_COMPR_ZERO) { + memset(buf, 0, len); + goto out_ri; + } + + /* Cases: + Reading whole node and it's uncompressed - read directly to buffer provided, check CRC. + Reading whole node and it's compressed - read into comprbuf, check CRC and decompress to buffer provided + Reading partial node and it's uncompressed - read into readbuf, check CRC, and copy + Reading partial node and it's compressed - read into readbuf, check checksum, decompress to decomprbuf and copy + */ + if (ri->compr == JFFS2_COMPR_NONE && len == je32_to_cpu(ri->dsize)) { + readbuf = buf; + } else { + readbuf = kmalloc(je32_to_cpu(ri->csize), GFP_KERNEL); + if (!readbuf) { + ret = -ENOMEM; + goto out_ri; + } + } + if (ri->compr != JFFS2_COMPR_NONE) { + if (len < je32_to_cpu(ri->dsize)) { + decomprbuf = kmalloc(je32_to_cpu(ri->dsize), GFP_KERNEL); + if (!decomprbuf) { + ret = -ENOMEM; + goto out_readbuf; + } + } else { + decomprbuf = buf; + } + } else { + decomprbuf = readbuf; + } + + jffs2_dbg(2, "Read %d bytes to %p\n", je32_to_cpu(ri->csize), + readbuf); + ret = jffs2_flash_read(c, (ref_offset(fd->raw)) + sizeof(*ri), + je32_to_cpu(ri->csize), &readlen, readbuf); + + if (!ret && readlen != je32_to_cpu(ri->csize)) + ret = -EIO; + if (ret) + goto out_decomprbuf; + + crc = crc32(0, readbuf, je32_to_cpu(ri->csize)); + if (crc != je32_to_cpu(ri->data_crc)) { + pr_warn("Data CRC %08x != calculated CRC %08x for node at %08x\n", + je32_to_cpu(ri->data_crc), crc, ref_offset(fd->raw)); + ret = -EIO; + goto out_decomprbuf; + } + jffs2_dbg(2, "Data CRC matches calculated CRC %08x\n", crc); + if (ri->compr != JFFS2_COMPR_NONE) { + jffs2_dbg(2, "Decompress %d bytes from %p to %d bytes at %p\n", + je32_to_cpu(ri->csize), readbuf, + je32_to_cpu(ri->dsize), decomprbuf); + ret = jffs2_decompress(c, f, ri->compr | (ri->usercompr << 8), readbuf, decomprbuf, je32_to_cpu(ri->csize), je32_to_cpu(ri->dsize)); + if (ret) { + pr_warn("Error: jffs2_decompress returned %d\n", ret); + goto out_decomprbuf; + } + } + + if (len < je32_to_cpu(ri->dsize)) { + memcpy(buf, decomprbuf+ofs, len); + } + out_decomprbuf: + if(decomprbuf != buf && decomprbuf != readbuf) + kfree(decomprbuf); + out_readbuf: + if(readbuf != buf) + kfree(readbuf); + out_ri: + jffs2_free_raw_inode(ri); + + return ret; +} + +int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *buf, uint32_t offset, uint32_t len) +{ + uint32_t end = offset + len; + struct jffs2_node_frag *frag; + int ret; + + jffs2_dbg(1, "%s(): ino #%u, range 0x%08x-0x%08x\n", + __func__, f->inocache->ino, offset, offset + len); + + frag = jffs2_lookup_node_frag(&f->fragtree, offset); + + /* XXX FIXME: Where a single physical node actually shows up in two + frags, we read it twice. Don't do that. */ + /* Now we're pointing at the first frag which overlaps our page + * (or perhaps is before it, if we've been asked to read off the + * end of the file). */ + while(offset < end) { + jffs2_dbg(2, "%s(): offset %d, end %d\n", + __func__, offset, end); + if (unlikely(!frag || frag->ofs > offset || + frag->ofs + frag->size <= offset)) { + uint32_t holesize = end - offset; + if (frag && frag->ofs > offset) { + jffs2_dbg(1, "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", + f->inocache->ino, frag->ofs, offset); + holesize = min(holesize, frag->ofs - offset); + } + jffs2_dbg(1, "Filling non-frag hole from %d-%d\n", + offset, offset + holesize); + memset(buf, 0, holesize); + buf += holesize; + offset += holesize; + continue; + } else if (unlikely(!frag->node)) { + uint32_t holeend = min(end, frag->ofs + frag->size); + jffs2_dbg(1, "Filling frag hole from %d-%d (frag 0x%x 0x%x)\n", + offset, holeend, frag->ofs, + frag->ofs + frag->size); + memset(buf, 0, holeend - offset); + buf += holeend - offset; + offset = holeend; + frag = frag_next(frag); + continue; + } else { + uint32_t readlen; + uint32_t fragofs; /* offset within the frag to start reading */ + + fragofs = offset - frag->ofs; + readlen = min(frag->size - fragofs, end - offset); + jffs2_dbg(1, "Reading %d-%d from node at 0x%08x (%d)\n", + frag->ofs+fragofs, + frag->ofs + fragofs+readlen, + ref_offset(frag->node->raw), + ref_flags(frag->node->raw)); + ret = jffs2_read_dnode(c, f, frag->node, buf, fragofs + frag->ofs - frag->node->ofs, readlen); + jffs2_dbg(2, "node read done\n"); + if (ret) { + jffs2_dbg(1, "%s(): error %d\n", + __func__, ret); + memset(buf, 0, readlen); + return ret; + } + buf += readlen; + offset += readlen; + frag = frag_next(frag); + jffs2_dbg(2, "node read was OK. Looping\n"); + } + } + return 0; +} + diff --git a/kernel/fs/jffs2/readinode.c b/kernel/fs/jffs2/readinode.c new file mode 100644 index 000000000..dddbde4f5 --- /dev/null +++ b/kernel/fs/jffs2/readinode.c @@ -0,0 +1,1451 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +/* + * Check the data CRC of the node. + * + * Returns: 0 if the data CRC is correct; + * 1 - if incorrect; + * error code if an error occurred. + */ +static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + struct jffs2_raw_node_ref *ref = tn->fn->raw; + int err = 0, pointed = 0; + struct jffs2_eraseblock *jeb; + unsigned char *buffer; + uint32_t crc, ofs, len; + size_t retlen; + + BUG_ON(tn->csize == 0); + + /* Calculate how many bytes were already checked */ + ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode); + len = tn->csize; + + if (jffs2_is_writebuffered(c)) { + int adj = ofs % c->wbuf_pagesize; + if (likely(adj)) + adj = c->wbuf_pagesize - adj; + + if (adj >= tn->csize) { + dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n", + ref_offset(ref), tn->csize, ofs); + goto adj_acc; + } + + ofs += adj; + len -= adj; + } + + dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n", + ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len); + +#ifndef __ECOS + /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(), + * adding and jffs2_flash_read_end() interface. */ + err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, NULL); + if (!err && retlen < len) { + JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); + mtd_unpoint(c->mtd, ofs, retlen); + } else if (err) { + if (err != -EOPNOTSUPP) + JFFS2_WARNING("MTD point failed: error code %d.\n", err); + } else + pointed = 1; /* succefully pointed to device */ +#endif + + if (!pointed) { + buffer = kmalloc(len, GFP_KERNEL); + if (unlikely(!buffer)) + return -ENOMEM; + + /* TODO: this is very frequent pattern, make it a separate + * routine */ + err = jffs2_flash_read(c, ofs, len, &retlen, buffer); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err); + goto free_out; + } + + if (retlen != len) { + JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len); + err = -EIO; + goto free_out; + } + } + + /* Continue calculating CRC */ + crc = crc32(tn->partial_crc, buffer, len); + if(!pointed) + kfree(buffer); +#ifndef __ECOS + else + mtd_unpoint(c->mtd, ofs, len); +#endif + + if (crc != tn->data_crc) { + JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n", + ref_offset(ref), tn->data_crc, crc); + return 1; + } + +adj_acc: + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + /* If it should be REF_NORMAL, it'll get marked as such when + we build the fragtree, shortly. No need to worry about GC + moving it while it's marked REF_PRISTINE -- GC won't happen + till we've finished checking every inode anyway. */ + ref->flash_offset |= REF_PRISTINE; + /* + * Mark the node as having been checked and fix the + * accounting accordingly. + */ + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); + + return 0; + +free_out: + if(!pointed) + kfree(buffer); +#ifndef __ECOS + else + mtd_unpoint(c->mtd, ofs, len); +#endif + return err; +} + +/* + * Helper function for jffs2_add_older_frag_to_fragtree(). + * + * Checks the node if we are in the checking stage. + */ +static int check_tn_node(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + int ret; + + BUG_ON(ref_obsolete(tn->fn->raw)); + + /* We only check the data CRC of unchecked nodes */ + if (ref_flags(tn->fn->raw) != REF_UNCHECKED) + return 0; + + dbg_readinode("check node %#04x-%#04x, phys offs %#08x\n", + tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw)); + + ret = check_node_data(c, tn); + if (unlikely(ret < 0)) { + JFFS2_ERROR("check_node_data() returned error: %d.\n", + ret); + } else if (unlikely(ret > 0)) { + dbg_readinode("CRC error, mark it obsolete.\n"); + jffs2_mark_node_obsolete(c, tn->fn->raw); + } + + return ret; +} + +static struct jffs2_tmp_dnode_info *jffs2_lookup_tn(struct rb_root *tn_root, uint32_t offset) +{ + struct rb_node *next; + struct jffs2_tmp_dnode_info *tn = NULL; + + dbg_readinode("root %p, offset %d\n", tn_root, offset); + + next = tn_root->rb_node; + + while (next) { + tn = rb_entry(next, struct jffs2_tmp_dnode_info, rb); + + if (tn->fn->ofs < offset) + next = tn->rb.rb_right; + else if (tn->fn->ofs >= offset) + next = tn->rb.rb_left; + else + break; + } + + return tn; +} + + +static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + jffs2_mark_node_obsolete(c, tn->fn->raw); + jffs2_free_full_dnode(tn->fn); + jffs2_free_tmp_dnode_info(tn); +} +/* + * This function is used when we read an inode. Data nodes arrive in + * arbitrary order -- they may be older or newer than the nodes which + * are already in the tree. Where overlaps occur, the older node can + * be discarded as long as the newer passes the CRC check. We don't + * bother to keep track of holes in this rbtree, and neither do we deal + * with frags -- we can have multiple entries starting at the same + * offset, and the one with the smallest length will come first in the + * ordering. + * + * Returns 0 if the node was handled (including marking it obsolete) + * < 0 an if error occurred + */ +static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, + struct jffs2_readinode_info *rii, + struct jffs2_tmp_dnode_info *tn) +{ + uint32_t fn_end = tn->fn->ofs + tn->fn->size; + struct jffs2_tmp_dnode_info *this, *ptn; + + dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); + + /* If a node has zero dsize, we only have to keep it if it might be the + node with highest version -- i.e. the one which will end up as f->metadata. + Note that such nodes won't be REF_UNCHECKED since there are no data to + check anyway. */ + if (!tn->fn->size) { + if (rii->mdata_tn) { + if (rii->mdata_tn->version < tn->version) { + /* We had a candidate mdata node already */ + dbg_readinode("kill old mdata with ver %d\n", rii->mdata_tn->version); + jffs2_kill_tn(c, rii->mdata_tn); + } else { + dbg_readinode("kill new mdata with ver %d (older than existing %d\n", + tn->version, rii->mdata_tn->version); + jffs2_kill_tn(c, tn); + return 0; + } + } + rii->mdata_tn = tn; + dbg_readinode("keep new mdata with ver %d\n", tn->version); + return 0; + } + + /* Find the earliest node which _may_ be relevant to this one */ + this = jffs2_lookup_tn(&rii->tn_root, tn->fn->ofs); + if (this) { + /* If the node is coincident with another at a lower address, + back up until the other node is found. It may be relevant */ + while (this->overlapped) { + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + this->overlapped = 0; + break; + } + this = ptn; + } + dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole"); + } + + while (this) { + if (this->fn->ofs > fn_end) + break; + dbg_readinode("Ponder this ver %d, 0x%x-0x%x\n", + this->version, this->fn->ofs, this->fn->size); + + if (this->version == tn->version) { + /* Version number collision means REF_PRISTINE GC. Accept either of them + as long as the CRC is correct. Check the one we have already... */ + if (!check_tn_node(c, this)) { + /* The one we already had was OK. Keep it and throw away the new one */ + dbg_readinode("Like old node. Throw away new\n"); + jffs2_kill_tn(c, tn); + return 0; + } else { + /* Who cares if the new one is good; keep it for now anyway. */ + dbg_readinode("Like new node. Throw away old\n"); + rb_replace_node(&this->rb, &tn->rb, &rii->tn_root); + jffs2_kill_tn(c, this); + /* Same overlapping from in front and behind */ + return 0; + } + } + if (this->version < tn->version && + this->fn->ofs >= tn->fn->ofs && + this->fn->ofs + this->fn->size <= fn_end) { + /* New node entirely overlaps 'this' */ + if (check_tn_node(c, tn)) { + dbg_readinode("new node bad CRC\n"); + jffs2_kill_tn(c, tn); + return 0; + } + /* ... and is good. Kill 'this' and any subsequent nodes which are also overlapped */ + while (this && this->fn->ofs + this->fn->size <= fn_end) { + struct jffs2_tmp_dnode_info *next = tn_next(this); + if (this->version < tn->version) { + tn_erase(this, &rii->tn_root); + dbg_readinode("Kill overlapped ver %d, 0x%x-0x%x\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + jffs2_kill_tn(c, this); + } + this = next; + } + dbg_readinode("Done killing overlapped nodes\n"); + continue; + } + if (this->version > tn->version && + this->fn->ofs <= tn->fn->ofs && + this->fn->ofs+this->fn->size >= fn_end) { + /* New node entirely overlapped by 'this' */ + if (!check_tn_node(c, this)) { + dbg_readinode("Good CRC on old node. Kill new\n"); + jffs2_kill_tn(c, tn); + return 0; + } + /* ... but 'this' was bad. Replace it... */ + dbg_readinode("Bad CRC on old overlapping node. Kill it\n"); + tn_erase(this, &rii->tn_root); + jffs2_kill_tn(c, this); + break; + } + + this = tn_next(this); + } + + /* We neither completely obsoleted nor were completely + obsoleted by an earlier node. Insert into the tree */ + { + struct rb_node *parent; + struct rb_node **link = &rii->tn_root.rb_node; + struct jffs2_tmp_dnode_info *insert_point = NULL; + + while (*link) { + parent = *link; + insert_point = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); + if (tn->fn->ofs > insert_point->fn->ofs) + link = &insert_point->rb.rb_right; + else if (tn->fn->ofs < insert_point->fn->ofs || + tn->fn->size < insert_point->fn->size) + link = &insert_point->rb.rb_left; + else + link = &insert_point->rb.rb_right; + } + rb_link_node(&tn->rb, &insert_point->rb, link); + rb_insert_color(&tn->rb, &rii->tn_root); + } + + /* If there's anything behind that overlaps us, note it */ + this = tn_prev(tn); + if (this) { + while (1) { + if (this->fn->ofs + this->fn->size > tn->fn->ofs) { + dbg_readinode("Node is overlapped by %p (v %d, 0x%x-0x%x)\n", + this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + tn->overlapped = 1; + break; + } + if (!this->overlapped) + break; + + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + this->overlapped = 0; + break; + } + this = ptn; + } + } + + /* If the new node overlaps anything ahead, note it */ + this = tn_next(tn); + while (this && this->fn->ofs < fn_end) { + this->overlapped = 1; + dbg_readinode("Node ver %d, 0x%x-0x%x is overlapped\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + this = tn_next(this); + } + return 0; +} + +/* Trivial function to remove the last node in the tree. Which by definition + has no right-hand child — so can be removed just by making its left-hand + child (if any) take its place under its parent. Since this is only done + when we're consuming the whole tree, there's no need to use rb_erase() + and let it worry about adjusting colours and balancing the tree. That + would just be a waste of time. */ +static void eat_last(struct rb_root *root, struct rb_node *node) +{ + struct rb_node *parent = rb_parent(node); + struct rb_node **link; + + /* LAST! */ + BUG_ON(node->rb_right); + + if (!parent) + link = &root->rb_node; + else if (node == parent->rb_left) + link = &parent->rb_left; + else + link = &parent->rb_right; + + *link = node->rb_left; + if (node->rb_left) + node->rb_left->__rb_parent_color = node->__rb_parent_color; +} + +/* We put the version tree in reverse order, so we can use the same eat_last() + function that we use to consume the tmpnode tree (tn_root). */ +static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) +{ + struct rb_node **link = &ver_root->rb_node; + struct rb_node *parent = NULL; + struct jffs2_tmp_dnode_info *this_tn; + + while (*link) { + parent = *link; + this_tn = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); + + if (tn->version > this_tn->version) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + dbg_readinode("Link new node at %p (root is %p)\n", link, ver_root); + rb_link_node(&tn->rb, parent, link); + rb_insert_color(&tn->rb, ver_root); +} + +/* Build final, normal fragtree from tn tree. It doesn't matter which order + we add nodes to the real fragtree, as long as they don't overlap. And + having thrown away the majority of overlapped nodes as we went, there + really shouldn't be many sets of nodes which do overlap. If we start at + the end, we can use the overlap markers -- we can just eat nodes which + aren't overlapped, and when we encounter nodes which _do_ overlap we + sort them all into a temporary tree in version order before replaying them. */ +static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_readinode_info *rii) +{ + struct jffs2_tmp_dnode_info *pen, *last, *this; + struct rb_root ver_root = RB_ROOT; + uint32_t high_ver = 0; + + if (rii->mdata_tn) { + dbg_readinode("potential mdata is ver %d at %p\n", rii->mdata_tn->version, rii->mdata_tn); + high_ver = rii->mdata_tn->version; + rii->latest_ref = rii->mdata_tn->fn->raw; + } +#ifdef JFFS2_DBG_READINODE_MESSAGES + this = tn_last(&rii->tn_root); + while (this) { + dbg_readinode("tn %p ver %d range 0x%x-0x%x ov %d\n", this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size, this->overlapped); + this = tn_prev(this); + } +#endif + pen = tn_last(&rii->tn_root); + while ((last = pen)) { + pen = tn_prev(last); + + eat_last(&rii->tn_root, &last->rb); + ver_insert(&ver_root, last); + + if (unlikely(last->overlapped)) { + if (pen) + continue; + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + last->overlapped = 0; + } + + /* Now we have a bunch of nodes in reverse version + order, in the tree at ver_root. Most of the time, + there'll actually be only one node in the 'tree', + in fact. */ + this = tn_last(&ver_root); + + while (this) { + struct jffs2_tmp_dnode_info *vers_next; + int ret; + vers_next = tn_prev(this); + eat_last(&ver_root, &this->rb); + if (check_tn_node(c, this)) { + dbg_readinode("node ver %d, 0x%x-0x%x failed CRC\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + jffs2_kill_tn(c, this); + } else { + if (this->version > high_ver) { + /* Note that this is different from the other + highest_version, because this one is only + counting _valid_ nodes which could give the + latest inode metadata */ + high_ver = this->version; + rii->latest_ref = this->fn->raw; + } + dbg_readinode("Add %p (v %d, 0x%x-0x%x, ov %d) to fragtree\n", + this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size, this->overlapped); + + ret = jffs2_add_full_dnode_to_inode(c, f, this->fn); + if (ret) { + /* Free the nodes in vers_root; let the caller + deal with the rest */ + JFFS2_ERROR("Add node to tree failed %d\n", ret); + while (1) { + vers_next = tn_prev(this); + if (check_tn_node(c, this)) + jffs2_mark_node_obsolete(c, this->fn->raw); + jffs2_free_full_dnode(this->fn); + jffs2_free_tmp_dnode_info(this); + this = vers_next; + if (!this) + break; + eat_last(&ver_root, &vers_next->rb); + } + return ret; + } + jffs2_free_tmp_dnode_info(this); + } + this = vers_next; + } + } + return 0; +} + +static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) +{ + struct jffs2_tmp_dnode_info *tn, *next; + + rbtree_postorder_for_each_entry_safe(tn, next, list, rb) { + jffs2_free_full_dnode(tn->fn); + jffs2_free_tmp_dnode_info(tn); + } + + *list = RB_ROOT; +} + +static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent *next; + + while (fd) { + next = fd->next; + jffs2_free_full_dirent(fd); + fd = next; + } +} + +/* Returns first valid node after 'ref'. May return 'ref' */ +static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_ref *ref) +{ + while (ref && ref->next_in_ino) { + if (!ref_obsolete(ref)) + return ref; + dbg_noderef("node at 0x%08x is obsoleted. Ignoring.\n", ref_offset(ref)); + ref = ref->next_in_ino; + } + return NULL; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an directory entry node is found. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + struct jffs2_raw_dirent *rd, size_t read, + struct jffs2_readinode_info *rii) +{ + struct jffs2_full_dirent *fd; + uint32_t crc; + + /* Obsoleted. This cannot happen, surely? dwmw2 20020308 */ + BUG_ON(ref_obsolete(ref)); + + crc = crc32(0, rd, sizeof(*rd) - 8); + if (unlikely(crc != je32_to_cpu(rd->node_crc))) { + JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n", + ref_offset(ref), je32_to_cpu(rd->node_crc), crc); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + /* If we've never checked the CRCs on this node, check them now */ + if (ref_flags(ref) == REF_UNCHECKED) { + struct jffs2_eraseblock *jeb; + int len; + + /* Sanity check */ + if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) { + JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n", + ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + ref->flash_offset = ref_offset(ref) | dirent_node_state(rd); + spin_unlock(&c->erase_completion_lock); + } + + fd = jffs2_alloc_full_dirent(rd->nsize + 1); + if (unlikely(!fd)) + return -ENOMEM; + + fd->raw = ref; + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->type = rd->type; + + if (fd->version > rii->highest_version) + rii->highest_version = fd->version; + + /* Pick out the mctime of the latest dirent */ + if(fd->version > rii->mctime_ver && je32_to_cpu(rd->mctime)) { + rii->mctime_ver = fd->version; + rii->latest_mctime = je32_to_cpu(rd->mctime); + } + + /* + * Copy as much of the name as possible from the raw + * dirent we've already read from the flash. + */ + if (read > sizeof(*rd)) + memcpy(&fd->name[0], &rd->name[0], + min_t(uint32_t, rd->nsize, (read - sizeof(*rd)) )); + + /* Do we need to copy any more of the name directly from the flash? */ + if (rd->nsize + sizeof(*rd) > read) { + /* FIXME: point() */ + int err; + int already = read - sizeof(*rd); + + err = jffs2_flash_read(c, (ref_offset(ref)) + read, + rd->nsize - already, &read, &fd->name[already]); + if (unlikely(read != rd->nsize - already) && likely(!err)) + return -EIO; + + if (unlikely(err)) { + JFFS2_ERROR("read remainder of name: error %d\n", err); + jffs2_free_full_dirent(fd); + return -EIO; + } + } + + fd->nhash = full_name_hash(fd->name, rd->nsize); + fd->next = NULL; + fd->name[rd->nsize] = '\0'; + + /* + * Wheee. We now have a complete jffs2_full_dirent structure, with + * the name in it and everything. Link it into the list + */ + jffs2_add_fd_to_list(c, fd, &rii->fds); + + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an inode node is found. + * + * Returns: 0 on success (possibly after marking a bad node obsolete); + * negative error code on failure. + */ +static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + struct jffs2_raw_inode *rd, int rdlen, + struct jffs2_readinode_info *rii) +{ + struct jffs2_tmp_dnode_info *tn; + uint32_t len, csize; + int ret = 0; + uint32_t crc; + + /* Obsoleted. This cannot happen, surely? dwmw2 20020308 */ + BUG_ON(ref_obsolete(ref)); + + crc = crc32(0, rd, sizeof(*rd) - 8); + if (unlikely(crc != je32_to_cpu(rd->node_crc))) { + JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n", + ref_offset(ref), je32_to_cpu(rd->node_crc), crc); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + tn = jffs2_alloc_tmp_dnode_info(); + if (!tn) { + JFFS2_ERROR("failed to allocate tn (%zu bytes).\n", sizeof(*tn)); + return -ENOMEM; + } + + tn->partial_crc = 0; + csize = je32_to_cpu(rd->csize); + + /* If we've never checked the CRCs on this node, check them now */ + if (ref_flags(ref) == REF_UNCHECKED) { + + /* Sanity checks */ + if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) || + unlikely(PAD(je32_to_cpu(rd->csize) + sizeof(*rd)) != PAD(je32_to_cpu(rd->totlen)))) { + JFFS2_WARNING("inode node header CRC is corrupted at %#08x\n", ref_offset(ref)); + jffs2_dbg_dump_node(c, ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto free_out; + } + + if (jffs2_is_writebuffered(c) && csize != 0) { + /* At this point we are supposed to check the data CRC + * of our unchecked node. But thus far, we do not + * know whether the node is valid or obsolete. To + * figure this out, we need to walk all the nodes of + * the inode and build the inode fragtree. We don't + * want to spend time checking data of nodes which may + * later be found to be obsolete. So we put off the full + * data CRC checking until we have read all the inode + * nodes and have started building the fragtree. + * + * The fragtree is being built starting with nodes + * having the highest version number, so we'll be able + * to detect whether a node is valid (i.e., it is not + * overlapped by a node with higher version) or not. + * And we'll be able to check only those nodes, which + * are not obsolete. + * + * Of course, this optimization only makes sense in case + * of NAND flashes (or other flashes with + * !jffs2_can_mark_obsolete()), since on NOR flashes + * nodes are marked obsolete physically. + * + * Since NAND flashes (or other flashes with + * jffs2_is_writebuffered(c)) are anyway read by + * fractions of c->wbuf_pagesize, and we have just read + * the node header, it is likely that the starting part + * of the node data is also read when we read the + * header. So we don't mind to check the CRC of the + * starting part of the data of the node now, and check + * the second part later (in jffs2_check_node_data()). + * Of course, we will not need to re-read and re-check + * the NAND page which we have just read. This is why we + * read the whole NAND page at jffs2_get_inode_nodes(), + * while we needed only the node header. + */ + unsigned char *buf; + + /* 'buf' will point to the start of data */ + buf = (unsigned char *)rd + sizeof(*rd); + /* len will be the read data length */ + len = min_t(uint32_t, rdlen - sizeof(*rd), csize); + tn->partial_crc = crc32(0, buf, len); + + dbg_readinode("Calculates CRC (%#08x) for %d bytes, csize %d\n", tn->partial_crc, len, csize); + + /* If we actually calculated the whole data CRC + * and it is wrong, drop the node. */ + if (len >= csize && unlikely(tn->partial_crc != je32_to_cpu(rd->data_crc))) { + JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n", + ref_offset(ref), tn->partial_crc, je32_to_cpu(rd->data_crc)); + jffs2_mark_node_obsolete(c, ref); + goto free_out; + } + + } else if (csize == 0) { + /* + * We checked the header CRC. If the node has no data, adjust + * the space accounting now. For other nodes this will be done + * later either when the node is marked obsolete or when its + * data is checked. + */ + struct jffs2_eraseblock *jeb; + + dbg_readinode("the node has no data.\n"); + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + ref->flash_offset = ref_offset(ref) | REF_NORMAL; + spin_unlock(&c->erase_completion_lock); + } + } + + tn->fn = jffs2_alloc_full_dnode(); + if (!tn->fn) { + JFFS2_ERROR("alloc fn failed\n"); + ret = -ENOMEM; + goto free_out; + } + + tn->version = je32_to_cpu(rd->version); + tn->fn->ofs = je32_to_cpu(rd->offset); + tn->data_crc = je32_to_cpu(rd->data_crc); + tn->csize = csize; + tn->fn->raw = ref; + tn->overlapped = 0; + + if (tn->version > rii->highest_version) + rii->highest_version = tn->version; + + /* There was a bug where we wrote hole nodes out with + csize/dsize swapped. Deal with it */ + if (rd->compr == JFFS2_COMPR_ZERO && !je32_to_cpu(rd->dsize) && csize) + tn->fn->size = csize; + else // normal case... + tn->fn->size = je32_to_cpu(rd->dsize); + + dbg_readinode2("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n", + ref_offset(ref), je32_to_cpu(rd->version), + je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize); + + ret = jffs2_add_tn_to_tree(c, rii, tn); + + if (ret) { + jffs2_free_full_dnode(tn->fn); + free_out: + jffs2_free_tmp_dnode_info(tn); + return ret; + } +#ifdef JFFS2_DBG_READINODE2_MESSAGES + dbg_readinode2("After adding ver %d:\n", je32_to_cpu(rd->version)); + tn = tn_first(&rii->tn_root); + while (tn) { + dbg_readinode2("%p: v %d r 0x%x-0x%x ov %d\n", + tn, tn->version, tn->fn->ofs, + tn->fn->ofs+tn->fn->size, tn->overlapped); + tn = tn_next(tn); + } +#endif + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an unknown node is found. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, struct jffs2_unknown_node *un) +{ + /* We don't mark unknown nodes as REF_UNCHECKED */ + if (ref_flags(ref) == REF_UNCHECKED) { + JFFS2_ERROR("REF_UNCHECKED but unknown node at %#08x\n", + ref_offset(ref)); + JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n", + je16_to_cpu(un->magic), je16_to_cpu(un->nodetype), + je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype)); + + switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) { + + case JFFS2_FEATURE_INCOMPAT: + JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + /* EEP */ + BUG(); + break; + + case JFFS2_FEATURE_ROCOMPAT: + JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO)); + break; + + case JFFS2_FEATURE_RWCOMPAT_COPY: + JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + break; + + case JFFS2_FEATURE_RWCOMPAT_DELETE: + JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * The function detects whether more data should be read and reads it if yes. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + int needed_len, int *rdlen, unsigned char *buf) +{ + int err, to_read = needed_len - *rdlen; + size_t retlen; + uint32_t offs; + + if (jffs2_is_writebuffered(c)) { + int rem = to_read % c->wbuf_pagesize; + + if (rem) + to_read += c->wbuf_pagesize - rem; + } + + /* We need to read more data */ + offs = ref_offset(ref) + *rdlen; + + dbg_readinode("read more %d bytes\n", to_read); + + err = jffs2_flash_read(c, offs, to_read, &retlen, buf + *rdlen); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, " + "error code: %d.\n", to_read, offs, err); + return err; + } + + if (retlen < to_read) { + JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", + offs, retlen, to_read); + return -EIO; + } + + *rdlen += to_read; + return 0; +} + +/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated + with this ino. Perform a preliminary ordering on data nodes, throwing away + those which are completely obsoleted by newer ones. The naïve approach we + use to take of just returning them _all_ in version order will cause us to + run out of memory in certain degenerate cases. */ +static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_readinode_info *rii) +{ + struct jffs2_raw_node_ref *ref, *valid_ref; + unsigned char *buf = NULL; + union jffs2_node_union *node; + size_t retlen; + int len, err; + + rii->mctime_ver = 0; + + dbg_readinode("ino #%u\n", f->inocache->ino); + + /* FIXME: in case of NOR and available ->point() this + * needs to be fixed. */ + len = sizeof(union jffs2_node_union) + c->wbuf_pagesize; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock(&c->erase_completion_lock); + valid_ref = jffs2_first_valid_node(f->inocache->nodes); + if (!valid_ref && f->inocache->ino != 1) + JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino); + while (valid_ref) { + /* We can hold a pointer to a non-obsolete node without the spinlock, + but _obsolete_ nodes may disappear at any time, if the block + they're in gets erased. So if we mark 'ref' obsolete while we're + not holding the lock, it can go away immediately. For that reason, + we find the next valid node first, before processing 'ref'. + */ + ref = valid_ref; + valid_ref = jffs2_first_valid_node(ref->next_in_ino); + spin_unlock(&c->erase_completion_lock); + + cond_resched(); + + /* + * At this point we don't know the type of the node we're going + * to read, so we do not know the size of its header. In order + * to minimize the amount of flash IO we assume the header is + * of size = JFFS2_MIN_NODE_HEADER. + */ + len = JFFS2_MIN_NODE_HEADER; + if (jffs2_is_writebuffered(c)) { + int end, rem; + + /* + * We are about to read JFFS2_MIN_NODE_HEADER bytes, + * but this flash has some minimal I/O unit. It is + * possible that we'll need to read more soon, so read + * up to the next min. I/O unit, in order not to + * re-read the same min. I/O unit twice. + */ + end = ref_offset(ref) + len; + rem = end % c->wbuf_pagesize; + if (rem) + end += c->wbuf_pagesize - rem; + len = end - ref_offset(ref); + } + + dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref)); + + /* FIXME: point() */ + err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ref_offset(ref), err); + goto free_out; + } + + if (retlen < len) { + JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", ref_offset(ref), retlen, len); + err = -EIO; + goto free_out; + } + + node = (union jffs2_node_union *)buf; + + /* No need to mask in the valid bit; it shouldn't be invalid */ + if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) { + JFFS2_NOTICE("Node header CRC failed at %#08x. {%04x,%04x,%08x,%08x}\n", + ref_offset(ref), je16_to_cpu(node->u.magic), + je16_to_cpu(node->u.nodetype), + je32_to_cpu(node->u.totlen), + je32_to_cpu(node->u.hdr_crc)); + jffs2_dbg_dump_node(c, ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto cont; + } + if (je16_to_cpu(node->u.magic) != JFFS2_MAGIC_BITMASK) { + /* Not a JFFS2 node, whinge and move on */ + JFFS2_NOTICE("Wrong magic bitmask 0x%04x in node header at %#08x.\n", + je16_to_cpu(node->u.magic), ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto cont; + } + + switch (je16_to_cpu(node->u.nodetype)) { + + case JFFS2_NODETYPE_DIRENT: + + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent) && + len < sizeof(struct jffs2_raw_dirent)) { + err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_direntry(c, ref, &node->d, retlen, rii); + if (unlikely(err)) + goto free_out; + + break; + + case JFFS2_NODETYPE_INODE: + + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode) && + len < sizeof(struct jffs2_raw_inode)) { + err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_dnode(c, ref, &node->i, len, rii); + if (unlikely(err)) + goto free_out; + + break; + + default: + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node) && + len < sizeof(struct jffs2_unknown_node)) { + err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_unknown(c, ref, &node->u); + if (unlikely(err)) + goto free_out; + + } + cont: + spin_lock(&c->erase_completion_lock); + } + + spin_unlock(&c->erase_completion_lock); + kfree(buf); + + f->highest_version = rii->highest_version; + + dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n", + f->inocache->ino, rii->highest_version, rii->latest_mctime, + rii->mctime_ver); + return 0; + + free_out: + jffs2_free_tmp_dnode_info_list(&rii->tn_root); + jffs2_free_full_dirent_list(rii->fds); + rii->fds = NULL; + kfree(buf); + return err; +} + +static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_raw_inode *latest_node) +{ + struct jffs2_readinode_info rii; + uint32_t crc, new_size; + size_t retlen; + int ret; + + dbg_readinode("ino #%u pino/nlink is %d\n", f->inocache->ino, + f->inocache->pino_nlink); + + memset(&rii, 0, sizeof(rii)); + + /* Grab all nodes relevant to this ino */ + ret = jffs2_get_inode_nodes(c, f, &rii); + + if (ret) { + JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret); + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + return ret; + } + + ret = jffs2_build_inode_fragtree(c, f, &rii); + if (ret) { + JFFS2_ERROR("Failed to build final fragtree for inode #%u: error %d\n", + f->inocache->ino, ret); + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + jffs2_free_tmp_dnode_info_list(&rii.tn_root); + /* FIXME: We could at least crc-check them all */ + if (rii.mdata_tn) { + jffs2_free_full_dnode(rii.mdata_tn->fn); + jffs2_free_tmp_dnode_info(rii.mdata_tn); + rii.mdata_tn = NULL; + } + return ret; + } + + if (rii.mdata_tn) { + if (rii.mdata_tn->fn->raw == rii.latest_ref) { + f->metadata = rii.mdata_tn->fn; + jffs2_free_tmp_dnode_info(rii.mdata_tn); + } else { + jffs2_kill_tn(c, rii.mdata_tn); + } + rii.mdata_tn = NULL; + } + + f->dents = rii.fds; + + jffs2_dbg_fragtree_paranoia_check_nolock(f); + + if (unlikely(!rii.latest_ref)) { + /* No data nodes for this inode. */ + if (f->inocache->ino != 1) { + JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino); + if (!rii.fds) { + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + return -EIO; + } + JFFS2_NOTICE("but it has children so we fake some modes for it\n"); + } + latest_node->mode = cpu_to_jemode(S_IFDIR|S_IRUGO|S_IWUSR|S_IXUGO); + latest_node->version = cpu_to_je32(0); + latest_node->atime = latest_node->ctime = latest_node->mtime = cpu_to_je32(0); + latest_node->isize = cpu_to_je32(0); + latest_node->gid = cpu_to_je16(0); + latest_node->uid = cpu_to_je16(0); + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_PRESENT); + return 0; + } + + ret = jffs2_flash_read(c, ref_offset(rii.latest_ref), sizeof(*latest_node), &retlen, (void *)latest_node); + if (ret || retlen != sizeof(*latest_node)) { + JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n", + ret, retlen, sizeof(*latest_node)); + /* FIXME: If this fails, there seems to be a memory leak. Find it. */ + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return ret?ret:-EIO; + } + + crc = crc32(0, latest_node, sizeof(*latest_node)-8); + if (crc != je32_to_cpu(latest_node->node_crc)) { + JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n", + f->inocache->ino, ref_offset(rii.latest_ref)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + + switch(jemode_to_cpu(latest_node->mode) & S_IFMT) { + case S_IFDIR: + if (rii.mctime_ver > je32_to_cpu(latest_node->version)) { + /* The times in the latest_node are actually older than + mctime in the latest dirent. Cheat. */ + latest_node->ctime = latest_node->mtime = cpu_to_je32(rii.latest_mctime); + } + break; + + + case S_IFREG: + /* If it was a regular file, truncate it to the latest node's isize */ + new_size = jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize)); + if (new_size != je32_to_cpu(latest_node->isize)) { + JFFS2_WARNING("Truncating ino #%u to %d bytes failed because it only had %d bytes to start with!\n", + f->inocache->ino, je32_to_cpu(latest_node->isize), new_size); + latest_node->isize = cpu_to_je32(new_size); + } + break; + + case S_IFLNK: + /* Hack to work around broken isize in old symlink code. + Remove this when dwmw2 comes to his senses and stops + symlinks from being an entirely gratuitous special + case. */ + if (!je32_to_cpu(latest_node->isize)) + latest_node->isize = latest_node->dsize; + + if (f->inocache->state != INO_STATE_CHECKING) { + /* Symlink's inode data is the target path. Read it and + * keep in RAM to facilitate quick follow symlink + * operation. */ + uint32_t csize = je32_to_cpu(latest_node->csize); + if (csize > JFFS2_MAX_NAME_LEN) { + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -ENAMETOOLONG; + } + f->target = kmalloc(csize + 1, GFP_KERNEL); + if (!f->target) { + JFFS2_ERROR("can't allocate %u bytes of memory for the symlink target path cache\n", csize); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -ENOMEM; + } + + ret = jffs2_flash_read(c, ref_offset(rii.latest_ref) + sizeof(*latest_node), + csize, &retlen, (char *)f->target); + + if (ret || retlen != csize) { + if (retlen != csize) + ret = -EIO; + kfree(f->target); + f->target = NULL; + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return ret; + } + + f->target[csize] = '\0'; + dbg_readinode("symlink's target '%s' cached\n", f->target); + } + + /* fall through... */ + + case S_IFBLK: + case S_IFCHR: + /* Certain inode types should have only one data node, and it's + kept as the metadata node */ + if (f->metadata) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + if (!frag_first(&f->fragtree)) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + /* ASSERT: f->fraglist != NULL */ + if (frag_next(frag_first(&f->fragtree))) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + /* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */ + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + /* OK. We're happy */ + f->metadata = frag_first(&f->fragtree)->node; + jffs2_free_node_frag(frag_first(&f->fragtree)); + f->fragtree = RB_ROOT; + break; + } + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_PRESENT); + + return 0; +} + +/* Scan the list of all nodes present for this ino, build map of versions, etc. */ +int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t ino, struct jffs2_raw_inode *latest_node) +{ + dbg_readinode("read inode #%u\n", ino); + + retry_inocache: + spin_lock(&c->inocache_lock); + f->inocache = jffs2_get_ino_cache(c, ino); + + if (f->inocache) { + /* Check its state. We may need to wait before we can use it */ + switch(f->inocache->state) { + case INO_STATE_UNCHECKED: + case INO_STATE_CHECKEDABSENT: + f->inocache->state = INO_STATE_READING; + break; + + case INO_STATE_CHECKING: + case INO_STATE_GC: + /* If it's in either of these states, we need + to wait for whoever's got it to finish and + put it back. */ + dbg_readinode("waiting for ino #%u in state %d\n", ino, f->inocache->state); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + goto retry_inocache; + + case INO_STATE_READING: + case INO_STATE_PRESENT: + /* Eep. This should never happen. It can + happen if Linux calls read_inode() again + before clear_inode() has finished though. */ + JFFS2_ERROR("Eep. Trying to read_inode #%u when it's already in state %d!\n", ino, f->inocache->state); + /* Fail. That's probably better than allowing it to succeed */ + f->inocache = NULL; + break; + + default: + BUG(); + } + } + spin_unlock(&c->inocache_lock); + + if (!f->inocache && ino == 1) { + /* Special case - no root inode on medium */ + f->inocache = jffs2_alloc_inode_cache(); + if (!f->inocache) { + JFFS2_ERROR("cannot allocate inocache for root inode\n"); + return -ENOMEM; + } + dbg_readinode("creating inocache for root inode\n"); + memset(f->inocache, 0, sizeof(struct jffs2_inode_cache)); + f->inocache->ino = f->inocache->pino_nlink = 1; + f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; + f->inocache->state = INO_STATE_READING; + jffs2_add_ino_cache(c, f->inocache); + } + if (!f->inocache) { + JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino); + return -ENOENT; + } + + return jffs2_do_read_inode_internal(c, f, latest_node); +} + +int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + struct jffs2_raw_inode n; + struct jffs2_inode_info *f = kzalloc(sizeof(*f), GFP_KERNEL); + int ret; + + if (!f) + return -ENOMEM; + + mutex_init(&f->sem); + mutex_lock(&f->sem); + f->inocache = ic; + + ret = jffs2_do_read_inode_internal(c, f, &n); + if (!ret) { + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + } + jffs2_xattr_do_crccheck_inode(c, ic); + kfree (f); + return ret; +} + +void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) +{ + struct jffs2_full_dirent *fd, *fds; + int deleted; + + jffs2_xattr_delete_inode(c, f->inocache); + mutex_lock(&f->sem); + deleted = f->inocache && !f->inocache->pino_nlink; + + if (f->inocache && f->inocache->state != INO_STATE_CHECKING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CLEARING); + + if (f->metadata) { + if (deleted) + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + } + + jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); + + if (f->target) { + kfree(f->target); + f->target = NULL; + } + + fds = f->dents; + while(fds) { + fd = fds; + fds = fd->next; + jffs2_free_full_dirent(fd); + } + + if (f->inocache && f->inocache->state != INO_STATE_CHECKING) { + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + if (f->inocache->nodes == (void *)f->inocache) + jffs2_del_ino_cache(c, f->inocache); + } + + mutex_unlock(&f->sem); +} diff --git a/kernel/fs/jffs2/scan.c b/kernel/fs/jffs2/scan.c new file mode 100644 index 000000000..9ad5ba4b2 --- /dev/null +++ b/kernel/fs/jffs2/scan.c @@ -0,0 +1,1176 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "summary.h" +#include "debug.h" + +#define DEFAULT_EMPTY_SCAN_SIZE 256 + +#define noisy_printk(noise, fmt, ...) \ +do { \ + if (*(noise)) { \ + pr_notice(fmt, ##__VA_ARGS__); \ + (*(noise))--; \ + if (!(*(noise))) \ + pr_notice("Further such events for this erase block will not be printed\n"); \ + } \ +} while (0) + +static uint32_t pseudo_random; + +static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s); + +/* These helper functions _must_ increase ofs and also do the dirty/used space accounting. + * Returning an error will abort the mount - bad checksums etc. should just mark the space + * as dirty. + */ +static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s); +static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s); + +static inline int min_free(struct jffs2_sb_info *c) +{ + uint32_t min = 2 * sizeof(struct jffs2_raw_inode); +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (!jffs2_can_mark_obsolete(c) && min < c->wbuf_pagesize) + return c->wbuf_pagesize; +#endif + return min; + +} + +static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) { + if (sector_size < DEFAULT_EMPTY_SCAN_SIZE) + return sector_size; + else + return DEFAULT_EMPTY_SCAN_SIZE; +} + +static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + int ret; + + if ((ret = jffs2_prealloc_raw_node_refs(c, jeb, 1))) + return ret; + if ((ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size))) + return ret; + /* Turned wasted size into dirty, since we apparently + think it's recoverable now. */ + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + if (VERYDIRTY(c, jeb->dirty_size)) { + list_add(&jeb->list, &c->very_dirty_list); + } else { + list_add(&jeb->list, &c->dirty_list); + } + return 0; +} + +int jffs2_scan_medium(struct jffs2_sb_info *c) +{ + int i, ret; + uint32_t empty_blocks = 0, bad_blocks = 0; + unsigned char *flashbuf = NULL; + uint32_t buf_size = 0; + struct jffs2_summary *s = NULL; /* summary info collected by the scan process */ +#ifndef __ECOS + size_t pointlen, try_size; + + ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen, + (void **)&flashbuf, NULL); + if (!ret && pointlen < c->mtd->size) { + /* Don't muck about if it won't let us point to the whole flash */ + jffs2_dbg(1, "MTD point returned len too short: 0x%zx\n", + pointlen); + mtd_unpoint(c->mtd, 0, pointlen); + flashbuf = NULL; + } + if (ret && ret != -EOPNOTSUPP) + jffs2_dbg(1, "MTD point failed %d\n", ret); +#endif + if (!flashbuf) { + /* For NAND it's quicker to read a whole eraseblock at a time, + apparently */ + if (jffs2_cleanmarker_oob(c)) + try_size = c->sector_size; + else + try_size = PAGE_SIZE; + + jffs2_dbg(1, "Trying to allocate readbuf of %zu " + "bytes\n", try_size); + + flashbuf = mtd_kmalloc_up_to(c->mtd, &try_size); + if (!flashbuf) + return -ENOMEM; + + jffs2_dbg(1, "Allocated readbuf of %zu bytes\n", + try_size); + + buf_size = (uint32_t)try_size; + } + + if (jffs2_sum_active()) { + s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); + if (!s) { + JFFS2_WARNING("Can't allocate memory for summary\n"); + ret = -ENOMEM; + goto out; + } + } + + for (i=0; inr_blocks; i++) { + struct jffs2_eraseblock *jeb = &c->blocks[i]; + + cond_resched(); + + /* reset summary info for next eraseblock scan */ + jffs2_sum_reset_collected(s); + + ret = jffs2_scan_eraseblock(c, jeb, buf_size?flashbuf:(flashbuf+jeb->offset), + buf_size, s); + + if (ret < 0) + goto out; + + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + /* Now decide which list to put it on */ + switch(ret) { + case BLK_STATE_ALLFF: + /* + * Empty block. Since we can't be sure it + * was entirely erased, we just queue it for erase + * again. It will be marked as such when the erase + * is complete. Meanwhile we still count it as empty + * for later checks. + */ + empty_blocks++; + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + break; + + case BLK_STATE_CLEANMARKER: + /* Only a CLEANMARKER node is valid */ + if (!jeb->dirty_size) { + /* It's actually free */ + list_add(&jeb->list, &c->free_list); + c->nr_free_blocks++; + } else { + /* Dirt */ + jffs2_dbg(1, "Adding all-dirty block at 0x%08x to erase_pending_list\n", + jeb->offset); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + } + break; + + case BLK_STATE_CLEAN: + /* Full (or almost full) of clean data. Clean list */ + list_add(&jeb->list, &c->clean_list); + break; + + case BLK_STATE_PARTDIRTY: + /* Some data, but not full. Dirty list. */ + /* We want to remember the block with most free space + and stick it in the 'nextblock' position to start writing to it. */ + if (jeb->free_size > min_free(c) && + (!c->nextblock || c->nextblock->free_size < jeb->free_size)) { + /* Better candidate for the next writes to go to */ + if (c->nextblock) { + ret = file_dirty(c, c->nextblock); + if (ret) + goto out; + /* deleting summary information of the old nextblock */ + jffs2_sum_reset_collected(c->summary); + } + /* update collected summary information for the current nextblock */ + jffs2_sum_move_collected(c, s); + jffs2_dbg(1, "%s(): new nextblock = 0x%08x\n", + __func__, jeb->offset); + c->nextblock = jeb; + } else { + ret = file_dirty(c, jeb); + if (ret) + goto out; + } + break; + + case BLK_STATE_ALLDIRTY: + /* Nothing valid - not even a clean marker. Needs erasing. */ + /* For now we just put it on the erasing list. We'll start the erases later */ + jffs2_dbg(1, "Erase block at 0x%08x is not formatted. It will be erased\n", + jeb->offset); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + break; + + case BLK_STATE_BADBLOCK: + jffs2_dbg(1, "Block at 0x%08x is bad\n", jeb->offset); + list_add(&jeb->list, &c->bad_list); + c->bad_size += c->sector_size; + c->free_size -= c->sector_size; + bad_blocks++; + break; + default: + pr_warn("%s(): unknown block state\n", __func__); + BUG(); + } + } + + /* Nextblock dirty is always seen as wasted, because we cannot recycle it now */ + if (c->nextblock && (c->nextblock->dirty_size)) { + c->nextblock->wasted_size += c->nextblock->dirty_size; + c->wasted_size += c->nextblock->dirty_size; + c->dirty_size -= c->nextblock->dirty_size; + c->nextblock->dirty_size = 0; + } +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { + /* If we're going to start writing into a block which already + contains data, and the end of the data isn't page-aligned, + skip a little and align it. */ + + uint32_t skip = c->nextblock->free_size % c->wbuf_pagesize; + + jffs2_dbg(1, "%s(): Skipping %d bytes in nextblock to ensure page alignment\n", + __func__, skip); + jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + jffs2_scan_dirty_space(c, c->nextblock, skip); + } +#endif + if (c->nr_erasing_blocks) { + if ( !c->used_size && ((c->nr_free_blocks+empty_blocks+bad_blocks)!= c->nr_blocks || bad_blocks == c->nr_blocks) ) { + pr_notice("Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n"); + pr_notice("empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n", + empty_blocks, bad_blocks, c->nr_blocks); + ret = -EIO; + goto out; + } + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + } + ret = 0; + out: + if (buf_size) + kfree(flashbuf); +#ifndef __ECOS + else + mtd_unpoint(c->mtd, 0, c->mtd->size); +#endif + kfree(s); + return ret; +} + +static int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf, + uint32_t ofs, uint32_t len) +{ + int ret; + size_t retlen; + + ret = jffs2_flash_read(c, ofs, len, &retlen, buf); + if (ret) { + jffs2_dbg(1, "mtd->read(0x%x bytes from 0x%x) returned %d\n", + len, ofs, ret); + return ret; + } + if (retlen < len) { + jffs2_dbg(1, "Read at 0x%x gave only 0x%zx bytes\n", + ofs, retlen); + return -EIO; + } + return 0; +} + +int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + if ((jeb->used_size + jeb->unchecked_size) == PAD(c->cleanmarker_size) && !jeb->dirty_size + && (!jeb->first_node || !ref_next(jeb->first_node)) ) + return BLK_STATE_CLEANMARKER; + + /* move blocks with max 4 byte dirty space to cleanlist */ + else if (!ISDIRTY(c->sector_size - (jeb->used_size + jeb->unchecked_size))) { + c->dirty_size -= jeb->dirty_size; + c->wasted_size += jeb->dirty_size; + jeb->wasted_size += jeb->dirty_size; + jeb->dirty_size = 0; + return BLK_STATE_CLEAN; + } else if (jeb->used_size || jeb->unchecked_size) + return BLK_STATE_PARTDIRTY; + else + return BLK_STATE_ALLDIRTY; +} + +#ifdef CONFIG_JFFS2_FS_XATTR +static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_xattr *rx, uint32_t ofs, + struct jffs2_summary *s) +{ + struct jffs2_xattr_datum *xd; + uint32_t xid, version, totlen, crc; + int err; + + crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4); + if (crc != je32_to_cpu(rx->node_crc)) { + JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + ofs, je32_to_cpu(rx->node_crc), crc); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen)))) + return err; + return 0; + } + + xid = je32_to_cpu(rx->xid); + version = je32_to_cpu(rx->version); + + totlen = PAD(sizeof(struct jffs2_raw_xattr) + + rx->name_len + 1 + je16_to_cpu(rx->value_len)); + if (totlen != je32_to_cpu(rx->totlen)) { + JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n", + ofs, je32_to_cpu(rx->totlen), totlen); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen)))) + return err; + return 0; + } + + xd = jffs2_setup_xattr_datum(c, xid, version); + if (IS_ERR(xd)) + return PTR_ERR(xd); + + if (xd->version > version) { + struct jffs2_raw_node_ref *raw + = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL); + raw->next_in_ino = xd->node->next_in_ino; + xd->node->next_in_ino = raw; + } else { + xd->version = version; + xd->xprefix = rx->xprefix; + xd->name_len = rx->name_len; + xd->value_len = je16_to_cpu(rx->value_len); + xd->data_crc = je32_to_cpu(rx->data_crc); + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, (void *)xd); + } + + if (jffs2_sum_active()) + jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset); + dbg_xattr("scanning xdatum at %#08x (xid=%u, version=%u)\n", + ofs, xd->xid, xd->version); + return 0; +} + +static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_xref *rr, uint32_t ofs, + struct jffs2_summary *s) +{ + struct jffs2_xattr_ref *ref; + uint32_t crc; + int err; + + crc = crc32(0, rr, sizeof(*rr) - 4); + if (crc != je32_to_cpu(rr->node_crc)) { + JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + ofs, je32_to_cpu(rr->node_crc), crc); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen))))) + return err; + return 0; + } + + if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) { + JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%zd\n", + ofs, je32_to_cpu(rr->totlen), + PAD(sizeof(struct jffs2_raw_xref))); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rr->totlen)))) + return err; + return 0; + } + + ref = jffs2_alloc_xattr_ref(); + if (!ref) + return -ENOMEM; + + /* BEFORE jffs2_build_xattr_subsystem() called, + * and AFTER xattr_ref is marked as a dead xref, + * ref->xid is used to store 32bit xid, xd is not used + * ref->ino is used to store 32bit inode-number, ic is not used + * Thoes variables are declared as union, thus using those + * are exclusive. In a similar way, ref->next is temporarily + * used to chain all xattr_ref object. It's re-chained to + * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly. + */ + ref->ino = je32_to_cpu(rr->ino); + ref->xid = je32_to_cpu(rr->xid); + ref->xseqno = je32_to_cpu(rr->xseqno); + if (ref->xseqno > c->highest_xseqno) + c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); + ref->next = c->xref_temp; + c->xref_temp = ref; + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), (void *)ref); + + if (jffs2_sum_active()) + jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset); + dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n", + ofs, ref->xid, ref->ino); + return 0; +} +#endif + +/* Called with 'buf_size == 0' if buf is in fact a pointer _directly_ into + the flash, XIP-style */ +static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { + struct jffs2_unknown_node *node; + struct jffs2_unknown_node crcnode; + uint32_t ofs, prevofs, max_ofs; + uint32_t hdr_crc, buf_ofs, buf_len; + int err; + int noise = 0; + + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + int cleanmarkerfound = 0; +#endif + + ofs = jeb->offset; + prevofs = jeb->offset - 1; + + jffs2_dbg(1, "%s(): Scanning block at 0x%x\n", __func__, ofs); + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (jffs2_cleanmarker_oob(c)) { + int ret; + + if (mtd_block_isbad(c->mtd, jeb->offset)) + return BLK_STATE_BADBLOCK; + + ret = jffs2_check_nand_cleanmarker(c, jeb); + jffs2_dbg(2, "jffs_check_nand_cleanmarker returned %d\n", ret); + + /* Even if it's not found, we still scan to see + if the block is empty. We use this information + to decide whether to erase it or not. */ + switch (ret) { + case 0: cleanmarkerfound = 1; break; + case 1: break; + default: return ret; + } + } +#endif + + if (jffs2_sum_active()) { + struct jffs2_sum_marker *sm; + void *sumptr = NULL; + uint32_t sumlen; + + if (!buf_size) { + /* XIP case. Just look, point at the summary if it's there */ + sm = (void *)buf + c->sector_size - sizeof(*sm); + if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) { + sumptr = buf + je32_to_cpu(sm->offset); + sumlen = c->sector_size - je32_to_cpu(sm->offset); + } + } else { + /* If NAND flash, read a whole page of it. Else just the end */ + if (c->wbuf_pagesize) + buf_len = c->wbuf_pagesize; + else + buf_len = sizeof(*sm); + + /* Read as much as we want into the _end_ of the preallocated buffer */ + err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len, + jeb->offset + c->sector_size - buf_len, + buf_len); + if (err) + return err; + + sm = (void *)buf + buf_size - sizeof(*sm); + if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) { + sumlen = c->sector_size - je32_to_cpu(sm->offset); + sumptr = buf + buf_size - sumlen; + + /* sm->offset maybe wrong but MAGIC maybe right */ + if (sumlen > c->sector_size) + goto full_scan; + + /* Now, make sure the summary itself is available */ + if (sumlen > buf_size) { + /* Need to kmalloc for this. */ + sumptr = kmalloc(sumlen, GFP_KERNEL); + if (!sumptr) + return -ENOMEM; + memcpy(sumptr + sumlen - buf_len, buf + buf_size - buf_len, buf_len); + } + if (buf_len < sumlen) { + /* Need to read more so that the entire summary node is present */ + err = jffs2_fill_scan_buf(c, sumptr, + jeb->offset + c->sector_size - sumlen, + sumlen - buf_len); + if (err) + return err; + } + } + + } + + if (sumptr) { + err = jffs2_sum_scan_sumnode(c, jeb, sumptr, sumlen, &pseudo_random); + + if (buf_size && sumlen > buf_size) + kfree(sumptr); + /* If it returns with a real error, bail. + If it returns positive, that's a block classification + (i.e. BLK_STATE_xxx) so return that too. + If it returns zero, fall through to full scan. */ + if (err) + return err; + } + } + +full_scan: + buf_ofs = jeb->offset; + + if (!buf_size) { + /* This is the XIP case -- we're reading _directly_ from the flash chip */ + buf_len = c->sector_size; + } else { + buf_len = EMPTY_SCAN_SIZE(c->sector_size); + err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len); + if (err) + return err; + } + + /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ + ofs = 0; + max_ofs = EMPTY_SCAN_SIZE(c->sector_size); + /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */ + while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) + ofs += 4; + + if (ofs == max_ofs) { +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (jffs2_cleanmarker_oob(c)) { + /* scan oob, take care of cleanmarker */ + int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound); + jffs2_dbg(2, "jffs2_check_oob_empty returned %d\n", + ret); + switch (ret) { + case 0: return cleanmarkerfound ? BLK_STATE_CLEANMARKER : BLK_STATE_ALLFF; + case 1: return BLK_STATE_ALLDIRTY; + default: return ret; + } + } +#endif + jffs2_dbg(1, "Block at 0x%08x is empty (erased)\n", + jeb->offset); + if (c->cleanmarker_size == 0) + return BLK_STATE_CLEANMARKER; /* don't bother with re-erase */ + else + return BLK_STATE_ALLFF; /* OK to erase if all blocks are like this */ + } + if (ofs) { + jffs2_dbg(1, "Free space at %08x ends at %08x\n", jeb->offset, + jeb->offset + ofs); + if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1))) + return err; + if ((err = jffs2_scan_dirty_space(c, jeb, ofs))) + return err; + } + + /* Now ofs is a complete physical flash offset as it always was... */ + ofs += jeb->offset; + + noise = 10; + + dbg_summary("no summary found in jeb 0x%08x. Apply original scan.\n",jeb->offset); + +scan_more: + while(ofs < jeb->offset + c->sector_size) { + + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + /* Make sure there are node refs available for use */ + err = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (err) + return err; + + cond_resched(); + + if (ofs & 3) { + pr_warn("Eep. ofs 0x%08x not word-aligned!\n", ofs); + ofs = PAD(ofs); + continue; + } + if (ofs == prevofs) { + pr_warn("ofs 0x%08x has already been seen. Skipping\n", + ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + prevofs = ofs; + + if (jeb->offset + c->sector_size < ofs + sizeof(*node)) { + jffs2_dbg(1, "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", + sizeof(struct jffs2_unknown_node), + jeb->offset, c->sector_size, ofs, + sizeof(*node)); + if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs))) + return err; + break; + } + + if (buf_ofs + buf_len < ofs + sizeof(*node)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + jffs2_dbg(1, "Fewer than %zd bytes (node header) left to end of buf. Reading 0x%x at 0x%08x\n", + sizeof(struct jffs2_unknown_node), + buf_len, ofs); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + } + + node = (struct jffs2_unknown_node *)&buf[ofs-buf_ofs]; + + if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) { + uint32_t inbuf_ofs; + uint32_t empty_start, scan_end; + + empty_start = ofs; + ofs += 4; + scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len); + + jffs2_dbg(1, "Found empty flash at 0x%08x\n", ofs); + more_empty: + inbuf_ofs = ofs - buf_ofs; + while (inbuf_ofs < scan_end) { + if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) { + pr_warn("Empty flash at 0x%08x ends at 0x%08x\n", + empty_start, ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start))) + return err; + goto scan_more; + } + + inbuf_ofs+=4; + ofs += 4; + } + /* Ran off end. */ + jffs2_dbg(1, "Empty flash to end of buffer at 0x%08x\n", + ofs); + + /* If we're only checking the beginning of a block with a cleanmarker, + bail now */ + if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) && + c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) { + jffs2_dbg(1, "%d bytes at start of block seems clean... assuming all clean\n", + EMPTY_SCAN_SIZE(c->sector_size)); + return BLK_STATE_CLEANMARKER; + } + if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */ + scan_end = buf_len; + goto more_empty; + } + + /* See how much more there is to read in this eraseblock... */ + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + if (!buf_len) { + /* No more to read. Break out of main loop without marking + this range of empty space as dirty (because it's not) */ + jffs2_dbg(1, "Empty flash at %08x runs to end of block. Treating as free_space\n", + empty_start); + break; + } + /* point never reaches here */ + scan_end = buf_len; + jffs2_dbg(1, "Reading another 0x%x at 0x%08x\n", + buf_len, ofs); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + goto more_empty; + } + + if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) { + pr_warn("Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", + ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) { + jffs2_dbg(1, "Dirty bitmask at 0x%08x\n", ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) { + pr_warn("Old JFFS2 bitmask found at 0x%08x\n", ofs); + pr_warn("You cannot use older JFFS2 filesystems with newer kernels\n"); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) != JFFS2_MAGIC_BITMASK) { + /* OK. We're out of possibilities. Whinge and move on */ + noisy_printk(&noise, "%s(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n", + __func__, + JFFS2_MAGIC_BITMASK, ofs, + je16_to_cpu(node->magic)); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + /* We seem to have a node of sorts. Check the CRC */ + crcnode.magic = node->magic; + crcnode.nodetype = cpu_to_je16( je16_to_cpu(node->nodetype) | JFFS2_NODE_ACCURATE); + crcnode.totlen = node->totlen; + hdr_crc = crc32(0, &crcnode, sizeof(crcnode)-4); + + if (hdr_crc != je32_to_cpu(node->hdr_crc)) { + noisy_printk(&noise, "%s(): Node at 0x%08x {0x%04x, 0x%04x, 0x%08x) has invalid CRC 0x%08x (calculated 0x%08x)\n", + __func__, + ofs, je16_to_cpu(node->magic), + je16_to_cpu(node->nodetype), + je32_to_cpu(node->totlen), + je32_to_cpu(node->hdr_crc), + hdr_crc); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + + if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) { + /* Eep. Node goes over the end of the erase block. */ + pr_warn("Node at 0x%08x with length 0x%08x would run over the end of the erase block\n", + ofs, je32_to_cpu(node->totlen)); + pr_warn("Perhaps the file system was created with the wrong erase size?\n"); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + + if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) { + /* Wheee. This is an obsoleted node */ + jffs2_dbg(2, "Node at 0x%08x is obsolete. Skipping\n", + ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + continue; + } + + switch(je16_to_cpu(node->nodetype)) { + case JFFS2_NODETYPE_INODE: + if (buf_ofs + buf_len < ofs + sizeof(struct jffs2_raw_inode)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + jffs2_dbg(1, "Fewer than %zd bytes (inode node) left to end of buf. Reading 0x%x at 0x%08x\n", + sizeof(struct jffs2_raw_inode), + buf_len, ofs); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_inode_node(c, jeb, (void *)node, ofs, s); + if (err) return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_NODETYPE_DIRENT: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + jffs2_dbg(1, "Fewer than %d bytes (dirent node) left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, + ofs); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_dirent_node(c, jeb, (void *)node, ofs, s); + if (err) return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + jffs2_dbg(1, "Fewer than %d bytes (xattr node) left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, + ofs); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_xattr_node(c, jeb, (void *)node, ofs, s); + if (err) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + case JFFS2_NODETYPE_XREF: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + jffs2_dbg(1, "Fewer than %d bytes (xref node) left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, + ofs); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_xref_node(c, jeb, (void *)node, ofs, s); + if (err) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; +#endif /* CONFIG_JFFS2_FS_XATTR */ + + case JFFS2_NODETYPE_CLEANMARKER: + jffs2_dbg(1, "CLEANMARKER node found at 0x%08x\n", ofs); + if (je32_to_cpu(node->totlen) != c->cleanmarker_size) { + pr_notice("CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n", + ofs, je32_to_cpu(node->totlen), + c->cleanmarker_size); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node))))) + return err; + ofs += PAD(sizeof(struct jffs2_unknown_node)); + } else if (jeb->first_node) { + pr_notice("CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", + ofs, jeb->offset); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node))))) + return err; + ofs += PAD(sizeof(struct jffs2_unknown_node)); + } else { + jffs2_link_node_ref(c, jeb, ofs | REF_NORMAL, c->cleanmarker_size, NULL); + + ofs += PAD(c->cleanmarker_size); + } + break; + + case JFFS2_NODETYPE_PADDING: + if (jffs2_sum_active()) + jffs2_sum_add_padding_mem(s, je32_to_cpu(node->totlen)); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + default: + switch (je16_to_cpu(node->nodetype) & JFFS2_COMPAT_MASK) { + case JFFS2_FEATURE_ROCOMPAT: + pr_notice("Read-only compatible feature node (0x%04x) found at offset 0x%08x\n", + je16_to_cpu(node->nodetype), ofs); + c->flags |= JFFS2_SB_FLAG_RO; + if (!(jffs2_is_readonly(c))) + return -EROFS; + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_FEATURE_INCOMPAT: + pr_notice("Incompatible feature node (0x%04x) found at offset 0x%08x\n", + je16_to_cpu(node->nodetype), ofs); + return -EINVAL; + + case JFFS2_FEATURE_RWCOMPAT_DELETE: + jffs2_dbg(1, "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", + je16_to_cpu(node->nodetype), ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_FEATURE_RWCOMPAT_COPY: { + jffs2_dbg(1, "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", + je16_to_cpu(node->nodetype), ofs); + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL); + + /* We can't summarise nodes we don't grok */ + jffs2_sum_disable_collecting(s); + ofs += PAD(je32_to_cpu(node->totlen)); + break; + } + } + } + } + + if (jffs2_sum_active()) { + if (PAD(s->sum_size + JFFS2_SUMMARY_FRAME_SIZE) > jeb->free_size) { + dbg_summary("There is not enough space for " + "summary information, disabling for this jeb!\n"); + jffs2_sum_disable_collecting(s); + } + } + + jffs2_dbg(1, "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->unchecked_size, jeb->used_size, jeb->wasted_size); + + /* mark_node_obsolete can add to wasted !! */ + if (jeb->wasted_size) { + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + } + + return jffs2_scan_classify_jeb(c, jeb); +} + +struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inode_cache *ic; + + ic = jffs2_get_ino_cache(c, ino); + if (ic) + return ic; + + if (ino > c->highest_ino) + c->highest_ino = ino; + + ic = jffs2_alloc_inode_cache(); + if (!ic) { + pr_notice("%s(): allocation of inode cache failed\n", __func__); + return NULL; + } + memset(ic, 0, sizeof(*ic)); + + ic->ino = ino; + ic->nodes = (void *)ic; + jffs2_add_ino_cache(c, ic); + if (ino == 1) + ic->pino_nlink = 1; + return ic; +} + +static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s) +{ + struct jffs2_inode_cache *ic; + uint32_t crc, ino = je32_to_cpu(ri->ino); + + jffs2_dbg(1, "%s(): Node at 0x%08x\n", __func__, ofs); + + /* We do very little here now. Just check the ino# to which we should attribute + this node; we can do all the CRC checking etc. later. There's a tradeoff here -- + we used to scan the flash once only, reading everything we want from it into + memory, then building all our in-core data structures and freeing the extra + information. Now we allow the first part of the mount to complete a lot quicker, + but we have to go _back_ to the flash in order to finish the CRC checking, etc. + Which means that the _full_ amount of time to get to proper write mode with GC + operational may actually be _longer_ than before. Sucks to be me. */ + + /* Check the node CRC in any case. */ + crc = crc32(0, ri, sizeof(*ri)-8); + if (crc != je32_to_cpu(ri->node_crc)) { + pr_notice("%s(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + __func__, ofs, je32_to_cpu(ri->node_crc), crc); + /* + * We believe totlen because the CRC on the node + * _header_ was OK, just the node itself failed. + */ + return jffs2_scan_dirty_space(c, jeb, + PAD(je32_to_cpu(ri->totlen))); + } + + ic = jffs2_get_ino_cache(c, ino); + if (!ic) { + ic = jffs2_scan_make_ino_cache(c, ino); + if (!ic) + return -ENOMEM; + } + + /* Wheee. It worked */ + jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic); + + jffs2_dbg(1, "Node is ino #%u, version %d. Range 0x%x-0x%x\n", + je32_to_cpu(ri->ino), je32_to_cpu(ri->version), + je32_to_cpu(ri->offset), + je32_to_cpu(ri->offset)+je32_to_cpu(ri->dsize)); + + pseudo_random += je32_to_cpu(ri->version); + + if (jffs2_sum_active()) { + jffs2_sum_add_inode_mem(s, ri, ofs - jeb->offset); + } + + return 0; +} + +static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s) +{ + struct jffs2_full_dirent *fd; + struct jffs2_inode_cache *ic; + uint32_t checkedlen; + uint32_t crc; + int err; + + jffs2_dbg(1, "%s(): Node at 0x%08x\n", __func__, ofs); + + /* We don't get here unless the node is still valid, so we don't have to + mask in the ACCURATE bit any more. */ + crc = crc32(0, rd, sizeof(*rd)-8); + + if (crc != je32_to_cpu(rd->node_crc)) { + pr_notice("%s(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + __func__, ofs, je32_to_cpu(rd->node_crc), crc); + /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */ + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen))))) + return err; + return 0; + } + + pseudo_random += je32_to_cpu(rd->version); + + /* Should never happen. Did. (OLPC trac #4184)*/ + checkedlen = strnlen(rd->name, rd->nsize); + if (checkedlen < rd->nsize) { + pr_err("Dirent at %08x has zeroes in name. Truncating to %d chars\n", + ofs, checkedlen); + } + fd = jffs2_alloc_full_dirent(checkedlen+1); + if (!fd) { + return -ENOMEM; + } + memcpy(&fd->name, rd->name, checkedlen); + fd->name[checkedlen] = 0; + + crc = crc32(0, fd->name, rd->nsize); + if (crc != je32_to_cpu(rd->name_crc)) { + pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + __func__, ofs, je32_to_cpu(rd->name_crc), crc); + jffs2_dbg(1, "Name for which CRC failed is (now) '%s', ino #%d\n", + fd->name, je32_to_cpu(rd->ino)); + jffs2_free_full_dirent(fd); + /* FIXME: Why do we believe totlen? */ + /* We believe totlen because the CRC on the node _header_ was OK, just the name failed. */ + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen))))) + return err; + return 0; + } + ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(rd->pino)); + if (!ic) { + jffs2_free_full_dirent(fd); + return -ENOMEM; + } + + fd->raw = jffs2_link_node_ref(c, jeb, ofs | dirent_node_state(rd), + PAD(je32_to_cpu(rd->totlen)), ic); + + fd->next = NULL; + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->nhash = full_name_hash(fd->name, checkedlen); + fd->type = rd->type; + jffs2_add_fd_to_list(c, fd, &ic->scan_dents); + + if (jffs2_sum_active()) { + jffs2_sum_add_dirent_mem(s, rd, ofs - jeb->offset); + } + + return 0; +} + +static int count_list(struct list_head *l) +{ + uint32_t count = 0; + struct list_head *tmp; + + list_for_each(tmp, l) { + count++; + } + return count; +} + +/* Note: This breaks if list_empty(head). I don't care. You + might, if you copy this code and use it elsewhere :) */ +static void rotate_list(struct list_head *head, uint32_t count) +{ + struct list_head *n = head->next; + + list_del(head); + while(count--) { + n = n->next; + } + list_add(head, n); +} + +void jffs2_rotate_lists(struct jffs2_sb_info *c) +{ + uint32_t x; + uint32_t rotateby; + + x = count_list(&c->clean_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->clean_list), rotateby); + } + + x = count_list(&c->very_dirty_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->very_dirty_list), rotateby); + } + + x = count_list(&c->dirty_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->dirty_list), rotateby); + } + + x = count_list(&c->erasable_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->erasable_list), rotateby); + } + + if (c->nr_erasing_blocks) { + rotateby = pseudo_random % c->nr_erasing_blocks; + rotate_list((&c->erase_pending_list), rotateby); + } + + if (c->nr_free_blocks) { + rotateby = pseudo_random % c->nr_free_blocks; + rotate_list((&c->free_list), rotateby); + } +} diff --git a/kernel/fs/jffs2/security.c b/kernel/fs/jffs2/security.c new file mode 100644 index 000000000..d4b43fb7a --- /dev/null +++ b/kernel/fs/jffs2/security.c @@ -0,0 +1,89 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +/* ---- Initial Security Label(s) Attachment callback --- */ +static int jffs2_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; + } + return err; +} + +/* ---- Initial Security Label(s) Attachment ----------- */ +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jffs2_initxattrs, NULL); +} + +/* ---- XATTR Handler for "security.*" ----------------- */ +static int jffs2_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, + name, buffer, size); +} + +static int jffs2_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, + name, buffer, size, flags); +} + +static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; + + if (list && retlen <= list_size) { + strcpy(list, XATTR_SECURITY_PREFIX); + strcpy(list + XATTR_SECURITY_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = jffs2_security_listxattr, + .set = jffs2_security_setxattr, + .get = jffs2_security_getxattr +}; diff --git a/kernel/fs/jffs2/summary.c b/kernel/fs/jffs2/summary.c new file mode 100644 index 000000000..bc5385471 --- /dev/null +++ b/kernel/fs/jffs2/summary.c @@ -0,0 +1,874 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * Zoltan Sogor , + * Patrik Kluba , + * University of Szeged, Hungary + * 2006 KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "debug.h" + +int jffs2_sum_init(struct jffs2_sb_info *c) +{ + uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); + + c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); + + if (!c->summary) { + JFFS2_WARNING("Can't allocate memory for summary information!\n"); + return -ENOMEM; + } + + c->summary->sum_buf = kmalloc(sum_size, GFP_KERNEL); + + if (!c->summary->sum_buf) { + JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n"); + kfree(c->summary); + return -ENOMEM; + } + + dbg_summary("returned successfully\n"); + + return 0; +} + +void jffs2_sum_exit(struct jffs2_sb_info *c) +{ + dbg_summary("called\n"); + + jffs2_sum_disable_collecting(c->summary); + + kfree(c->summary->sum_buf); + c->summary->sum_buf = NULL; + + kfree(c->summary); + c->summary = NULL; +} + +static int jffs2_sum_add_mem(struct jffs2_summary *s, union jffs2_sum_mem *item) +{ + if (!s->sum_list_head) + s->sum_list_head = (union jffs2_sum_mem *) item; + if (s->sum_list_tail) + s->sum_list_tail->u.next = (union jffs2_sum_mem *) item; + s->sum_list_tail = (union jffs2_sum_mem *) item; + + switch (je16_to_cpu(item->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + s->sum_size += JFFS2_SUMMARY_INODE_SIZE; + s->sum_num++; + dbg_summary("inode (%u) added to summary\n", + je32_to_cpu(item->i.inode)); + break; + case JFFS2_NODETYPE_DIRENT: + s->sum_size += JFFS2_SUMMARY_DIRENT_SIZE(item->d.nsize); + s->sum_num++; + dbg_summary("dirent (%u) added to summary\n", + je32_to_cpu(item->d.ino)); + break; +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: + s->sum_size += JFFS2_SUMMARY_XATTR_SIZE; + s->sum_num++; + dbg_summary("xattr (xid=%u, version=%u) added to summary\n", + je32_to_cpu(item->x.xid), je32_to_cpu(item->x.version)); + break; + case JFFS2_NODETYPE_XREF: + s->sum_size += JFFS2_SUMMARY_XREF_SIZE; + s->sum_num++; + dbg_summary("xref added to summary\n"); + break; +#endif + default: + JFFS2_WARNING("UNKNOWN node type %u\n", + je16_to_cpu(item->u.nodetype)); + return 1; + } + return 0; +} + + +/* The following 3 functions are called from scan.c to collect summary info for not closed jeb */ + +int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size) +{ + dbg_summary("called with %u\n", size); + s->sum_padded += size; + return 0; +} + +int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, + uint32_t ofs) +{ + struct jffs2_sum_inode_mem *temp = kmalloc(sizeof(struct jffs2_sum_inode_mem), GFP_KERNEL); + + if (!temp) + return -ENOMEM; + + temp->nodetype = ri->nodetype; + temp->inode = ri->ino; + temp->version = ri->version; + temp->offset = cpu_to_je32(ofs); /* relative offset from the beginning of the jeb */ + temp->totlen = ri->totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, + uint32_t ofs) +{ + struct jffs2_sum_dirent_mem *temp = + kmalloc(sizeof(struct jffs2_sum_dirent_mem) + rd->nsize, GFP_KERNEL); + + if (!temp) + return -ENOMEM; + + temp->nodetype = rd->nodetype; + temp->totlen = rd->totlen; + temp->offset = cpu_to_je32(ofs); /* relative from the beginning of the jeb */ + temp->pino = rd->pino; + temp->version = rd->version; + temp->ino = rd->ino; + temp->nsize = rd->nsize; + temp->type = rd->type; + temp->next = NULL; + + memcpy(temp->name, rd->name, rd->nsize); + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +#ifdef CONFIG_JFFS2_FS_XATTR +int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs) +{ + struct jffs2_sum_xattr_mem *temp; + + temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL); + if (!temp) + return -ENOMEM; + + temp->nodetype = rx->nodetype; + temp->xid = rx->xid; + temp->version = rx->version; + temp->offset = cpu_to_je32(ofs); + temp->totlen = rx->totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs) +{ + struct jffs2_sum_xref_mem *temp; + + temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL); + if (!temp) + return -ENOMEM; + + temp->nodetype = rr->nodetype; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} +#endif +/* Cleanup every collected summary information */ + +static void jffs2_sum_clean_collected(struct jffs2_summary *s) +{ + union jffs2_sum_mem *temp; + + if (!s->sum_list_head) { + dbg_summary("already empty\n"); + } + while (s->sum_list_head) { + temp = s->sum_list_head; + s->sum_list_head = s->sum_list_head->u.next; + kfree(temp); + } + s->sum_list_tail = NULL; + s->sum_padded = 0; + s->sum_num = 0; +} + +void jffs2_sum_reset_collected(struct jffs2_summary *s) +{ + dbg_summary("called\n"); + jffs2_sum_clean_collected(s); + s->sum_size = 0; +} + +void jffs2_sum_disable_collecting(struct jffs2_summary *s) +{ + dbg_summary("called\n"); + jffs2_sum_clean_collected(s); + s->sum_size = JFFS2_SUMMARY_NOSUM_SIZE; +} + +int jffs2_sum_is_disabled(struct jffs2_summary *s) +{ + return (s->sum_size == JFFS2_SUMMARY_NOSUM_SIZE); +} + +/* Move the collected summary information into sb (called from scan.c) */ + +void jffs2_sum_move_collected(struct jffs2_sb_info *c, struct jffs2_summary *s) +{ + dbg_summary("oldsize=0x%x oldnum=%u => newsize=0x%x newnum=%u\n", + c->summary->sum_size, c->summary->sum_num, + s->sum_size, s->sum_num); + + c->summary->sum_size = s->sum_size; + c->summary->sum_num = s->sum_num; + c->summary->sum_padded = s->sum_padded; + c->summary->sum_list_head = s->sum_list_head; + c->summary->sum_list_tail = s->sum_list_tail; + + s->sum_list_head = s->sum_list_tail = NULL; +} + +/* Called from wbuf.c to collect writed node info */ + +int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, uint32_t ofs) +{ + union jffs2_node_union *node; + struct jffs2_eraseblock *jeb; + + if (c->summary->sum_size == JFFS2_SUMMARY_NOSUM_SIZE) { + dbg_summary("Summary is disabled for this jeb! Skipping summary info!\n"); + return 0; + } + + node = invecs[0].iov_base; + jeb = &c->blocks[ofs / c->sector_size]; + ofs -= jeb->offset; + + switch (je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_mem *temp = + kmalloc(sizeof(struct jffs2_sum_inode_mem), GFP_KERNEL); + + if (!temp) + goto no_mem; + + temp->nodetype = node->i.nodetype; + temp->inode = node->i.ino; + temp->version = node->i.version; + temp->offset = cpu_to_je32(ofs); + temp->totlen = node->i.totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_mem *temp = + kmalloc(sizeof(struct jffs2_sum_dirent_mem) + node->d.nsize, GFP_KERNEL); + + if (!temp) + goto no_mem; + + temp->nodetype = node->d.nodetype; + temp->totlen = node->d.totlen; + temp->offset = cpu_to_je32(ofs); + temp->pino = node->d.pino; + temp->version = node->d.version; + temp->ino = node->d.ino; + temp->nsize = node->d.nsize; + temp->type = node->d.type; + temp->next = NULL; + + switch (count) { + case 1: + memcpy(temp->name,node->d.name,node->d.nsize); + break; + + case 2: + memcpy(temp->name,invecs[1].iov_base,node->d.nsize); + break; + + default: + BUG(); /* impossible count value */ + break; + } + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_sum_xattr_mem *temp; + temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL); + if (!temp) + goto no_mem; + + temp->nodetype = node->x.nodetype; + temp->xid = node->x.xid; + temp->version = node->x.version; + temp->totlen = node->x.totlen; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } + case JFFS2_NODETYPE_XREF: { + struct jffs2_sum_xref_mem *temp; + temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL); + if (!temp) + goto no_mem; + temp->nodetype = node->r.nodetype; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } +#endif + case JFFS2_NODETYPE_PADDING: + dbg_summary("node PADDING\n"); + c->summary->sum_padded += je32_to_cpu(node->u.totlen); + break; + + case JFFS2_NODETYPE_CLEANMARKER: + dbg_summary("node CLEANMARKER\n"); + break; + + case JFFS2_NODETYPE_SUMMARY: + dbg_summary("node SUMMARY\n"); + break; + + default: + /* If you implement a new node type you should also implement + summary support for it or disable summary. + */ + BUG(); + break; + } + + return 0; + +no_mem: + JFFS2_WARNING("MEMORY ALLOCATION ERROR!"); + return -ENOMEM; +} + +static struct jffs2_raw_node_ref *sum_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + /* If there was a gap, mark it dirty */ + if ((ofs & ~3) > c->sector_size - jeb->free_size) { + /* Ew. Summary doesn't actually tell us explicitly about dirty space */ + jffs2_scan_dirty_space(c, jeb, (ofs & ~3) - (c->sector_size - jeb->free_size)); + } + + return jffs2_link_node_ref(c, jeb, jeb->offset + ofs, len, ic); +} + +/* Process the stored summary information - helper function for jffs2_sum_scan_sumnode() */ + +static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t *pseudo_random) +{ + struct jffs2_inode_cache *ic; + struct jffs2_full_dirent *fd; + void *sp; + int i, ino; + int err; + + sp = summary->sum; + + for (i=0; isum_num); i++) { + dbg_summary("processing summary index %d\n", i); + + cond_resched(); + + /* Make sure there's a spare ref for dirty space */ + err = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (err) + return err; + + switch (je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_flash *spi; + spi = sp; + + ino = je32_to_cpu(spi->inode); + + dbg_summary("Inode at 0x%08x-0x%08x\n", + jeb->offset + je32_to_cpu(spi->offset), + jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spi->totlen)); + + ic = jffs2_scan_make_ino_cache(c, ino); + if (!ic) { + JFFS2_NOTICE("scan_make_ino_cache failed\n"); + return -ENOMEM; + } + + sum_link_node_ref(c, jeb, je32_to_cpu(spi->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spi->totlen)), ic); + + *pseudo_random += je32_to_cpu(spi->version); + + sp += JFFS2_SUMMARY_INODE_SIZE; + + break; + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_flash *spd; + int checkedlen; + spd = sp; + + dbg_summary("Dirent at 0x%08x-0x%08x\n", + jeb->offset + je32_to_cpu(spd->offset), + jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen)); + + + /* This should never happen, but https://dev.laptop.org/ticket/4184 */ + checkedlen = strnlen(spd->name, spd->nsize); + if (!checkedlen) { + pr_err("Dirent at %08x has zero at start of name. Aborting mount.\n", + jeb->offset + + je32_to_cpu(spd->offset)); + return -EIO; + } + if (checkedlen < spd->nsize) { + pr_err("Dirent at %08x has zeroes in name. Truncating to %d chars\n", + jeb->offset + + je32_to_cpu(spd->offset), + checkedlen); + } + + + fd = jffs2_alloc_full_dirent(checkedlen+1); + if (!fd) + return -ENOMEM; + + memcpy(&fd->name, spd->name, checkedlen); + fd->name[checkedlen] = 0; + + ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(spd->pino)); + if (!ic) { + jffs2_free_full_dirent(fd); + return -ENOMEM; + } + + fd->raw = sum_link_node_ref(c, jeb, je32_to_cpu(spd->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spd->totlen)), ic); + + fd->next = NULL; + fd->version = je32_to_cpu(spd->version); + fd->ino = je32_to_cpu(spd->ino); + fd->nhash = full_name_hash(fd->name, checkedlen); + fd->type = spd->type; + + jffs2_add_fd_to_list(c, fd, &ic->scan_dents); + + *pseudo_random += je32_to_cpu(spd->version); + + sp += JFFS2_SUMMARY_DIRENT_SIZE(spd->nsize); + + break; + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_xattr_datum *xd; + struct jffs2_sum_xattr_flash *spx; + + spx = (struct jffs2_sum_xattr_flash *)sp; + dbg_summary("xattr at %#08x-%#08x (xid=%u, version=%u)\n", + jeb->offset + je32_to_cpu(spx->offset), + jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen), + je32_to_cpu(spx->xid), je32_to_cpu(spx->version)); + + xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid), + je32_to_cpu(spx->version)); + if (IS_ERR(xd)) + return PTR_ERR(xd); + if (xd->version > je32_to_cpu(spx->version)) { + /* node is not the newest one */ + struct jffs2_raw_node_ref *raw + = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spx->totlen)), NULL); + raw->next_in_ino = xd->node->next_in_ino; + xd->node->next_in_ino = raw; + } else { + xd->version = je32_to_cpu(spx->version); + sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spx->totlen)), (void *)xd); + } + *pseudo_random += je32_to_cpu(spx->xid); + sp += JFFS2_SUMMARY_XATTR_SIZE; + + break; + } + case JFFS2_NODETYPE_XREF: { + struct jffs2_xattr_ref *ref; + struct jffs2_sum_xref_flash *spr; + + spr = (struct jffs2_sum_xref_flash *)sp; + dbg_summary("xref at %#08x-%#08x\n", + jeb->offset + je32_to_cpu(spr->offset), + jeb->offset + je32_to_cpu(spr->offset) + + (uint32_t)PAD(sizeof(struct jffs2_raw_xref))); + + ref = jffs2_alloc_xattr_ref(); + if (!ref) { + JFFS2_NOTICE("allocation of xattr_datum failed\n"); + return -ENOMEM; + } + ref->next = c->xref_temp; + c->xref_temp = ref; + + sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED, + PAD(sizeof(struct jffs2_raw_xref)), (void *)ref); + + *pseudo_random += ref->node->flash_offset; + sp += JFFS2_SUMMARY_XREF_SIZE; + + break; + } +#endif + default : { + uint16_t nodetype = je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype); + JFFS2_WARNING("Unsupported node type %x found in summary! Exiting...\n", nodetype); + if ((nodetype & JFFS2_COMPAT_MASK) == JFFS2_FEATURE_INCOMPAT) + return -EIO; + + /* For compatible node types, just fall back to the full scan */ + c->wasted_size -= jeb->wasted_size; + c->free_size += c->sector_size - jeb->free_size; + c->used_size -= jeb->used_size; + c->dirty_size -= jeb->dirty_size; + jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0; + jeb->free_size = c->sector_size; + + jffs2_free_jeb_node_refs(c, jeb); + return -ENOTRECOVERABLE; + } + } + } + return 0; +} + +/* Process the summary node - called from jffs2_scan_eraseblock() */ +int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t sumsize, + uint32_t *pseudo_random) +{ + struct jffs2_unknown_node crcnode; + int ret, ofs; + uint32_t crc; + + ofs = c->sector_size - sumsize; + + dbg_summary("summary found for 0x%08x at 0x%08x (0x%x bytes)\n", + jeb->offset, jeb->offset + ofs, sumsize); + + /* OK, now check for node validity and CRC */ + crcnode.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + crcnode.nodetype = cpu_to_je16(JFFS2_NODETYPE_SUMMARY); + crcnode.totlen = summary->totlen; + crc = crc32(0, &crcnode, sizeof(crcnode)-4); + + if (je32_to_cpu(summary->hdr_crc) != crc) { + dbg_summary("Summary node header is corrupt (bad CRC or " + "no summary at all)\n"); + goto crc_err; + } + + if (je32_to_cpu(summary->totlen) != sumsize) { + dbg_summary("Summary node is corrupt (wrong erasesize?)\n"); + goto crc_err; + } + + crc = crc32(0, summary, sizeof(struct jffs2_raw_summary)-8); + + if (je32_to_cpu(summary->node_crc) != crc) { + dbg_summary("Summary node is corrupt (bad CRC)\n"); + goto crc_err; + } + + crc = crc32(0, summary->sum, sumsize - sizeof(struct jffs2_raw_summary)); + + if (je32_to_cpu(summary->sum_crc) != crc) { + dbg_summary("Summary node data is corrupt (bad CRC)\n"); + goto crc_err; + } + + if ( je32_to_cpu(summary->cln_mkr) ) { + + dbg_summary("Summary : CLEANMARKER node \n"); + + ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); + if (ret) + return ret; + + if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) { + dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n", + je32_to_cpu(summary->cln_mkr), c->cleanmarker_size); + if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr))))) + return ret; + } else if (jeb->first_node) { + dbg_summary("CLEANMARKER node not first node in block " + "(0x%08x)\n", jeb->offset); + if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr))))) + return ret; + } else { + jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, + je32_to_cpu(summary->cln_mkr), NULL); + } + } + + ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random); + /* -ENOTRECOVERABLE isn't a fatal error -- it means we should do a full + scan of this eraseblock. So return zero */ + if (ret == -ENOTRECOVERABLE) + return 0; + if (ret) + return ret; /* real error */ + + /* for PARANOIA_CHECK */ + ret = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (ret) + return ret; + + sum_link_node_ref(c, jeb, ofs | REF_NORMAL, sumsize, NULL); + + if (unlikely(jeb->free_size)) { + JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n", + jeb->free_size, jeb->offset); + jeb->wasted_size += jeb->free_size; + c->wasted_size += jeb->free_size; + c->free_size -= jeb->free_size; + jeb->free_size = 0; + } + + return jffs2_scan_classify_jeb(c, jeb); + +crc_err: + JFFS2_WARNING("Summary node crc error, skipping summary information.\n"); + + return 0; +} + +/* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */ + +static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + uint32_t infosize, uint32_t datasize, int padsize) +{ + struct jffs2_raw_summary isum; + union jffs2_sum_mem *temp; + struct jffs2_sum_marker *sm; + struct kvec vecs[2]; + uint32_t sum_ofs; + void *wpage; + int ret; + size_t retlen; + + if (padsize + datasize > MAX_SUMMARY_SIZE) { + /* It won't fit in the buffer. Abort summary for this jeb */ + jffs2_sum_disable_collecting(c->summary); + + JFFS2_WARNING("Summary too big (%d data, %d pad) in eraseblock at %08x\n", + datasize, padsize, jeb->offset); + /* Non-fatal */ + return 0; + } + /* Is there enough space for summary? */ + if (padsize < 0) { + /* don't try to write out summary for this jeb */ + jffs2_sum_disable_collecting(c->summary); + + JFFS2_WARNING("Not enough space for summary, padsize = %d\n", + padsize); + /* Non-fatal */ + return 0; + } + + memset(c->summary->sum_buf, 0xff, datasize); + memset(&isum, 0, sizeof(isum)); + + isum.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + isum.nodetype = cpu_to_je16(JFFS2_NODETYPE_SUMMARY); + isum.totlen = cpu_to_je32(infosize); + isum.hdr_crc = cpu_to_je32(crc32(0, &isum, sizeof(struct jffs2_unknown_node) - 4)); + isum.padded = cpu_to_je32(c->summary->sum_padded); + isum.cln_mkr = cpu_to_je32(c->cleanmarker_size); + isum.sum_num = cpu_to_je32(c->summary->sum_num); + wpage = c->summary->sum_buf; + + while (c->summary->sum_num) { + temp = c->summary->sum_list_head; + + switch (je16_to_cpu(temp->u.nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_flash *sino_ptr = wpage; + + sino_ptr->nodetype = temp->i.nodetype; + sino_ptr->inode = temp->i.inode; + sino_ptr->version = temp->i.version; + sino_ptr->offset = temp->i.offset; + sino_ptr->totlen = temp->i.totlen; + + wpage += JFFS2_SUMMARY_INODE_SIZE; + + break; + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_flash *sdrnt_ptr = wpage; + + sdrnt_ptr->nodetype = temp->d.nodetype; + sdrnt_ptr->totlen = temp->d.totlen; + sdrnt_ptr->offset = temp->d.offset; + sdrnt_ptr->pino = temp->d.pino; + sdrnt_ptr->version = temp->d.version; + sdrnt_ptr->ino = temp->d.ino; + sdrnt_ptr->nsize = temp->d.nsize; + sdrnt_ptr->type = temp->d.type; + + memcpy(sdrnt_ptr->name, temp->d.name, + temp->d.nsize); + + wpage += JFFS2_SUMMARY_DIRENT_SIZE(temp->d.nsize); + + break; + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_sum_xattr_flash *sxattr_ptr = wpage; + + temp = c->summary->sum_list_head; + sxattr_ptr->nodetype = temp->x.nodetype; + sxattr_ptr->xid = temp->x.xid; + sxattr_ptr->version = temp->x.version; + sxattr_ptr->offset = temp->x.offset; + sxattr_ptr->totlen = temp->x.totlen; + + wpage += JFFS2_SUMMARY_XATTR_SIZE; + break; + } + case JFFS2_NODETYPE_XREF: { + struct jffs2_sum_xref_flash *sxref_ptr = wpage; + + temp = c->summary->sum_list_head; + sxref_ptr->nodetype = temp->r.nodetype; + sxref_ptr->offset = temp->r.offset; + + wpage += JFFS2_SUMMARY_XREF_SIZE; + break; + } +#endif + default : { + if ((je16_to_cpu(temp->u.nodetype) & JFFS2_COMPAT_MASK) + == JFFS2_FEATURE_RWCOMPAT_COPY) { + dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n", + je16_to_cpu(temp->u.nodetype)); + jffs2_sum_disable_collecting(c->summary); + } else { + BUG(); /* unknown node in summary information */ + } + } + } + + c->summary->sum_list_head = temp->u.next; + kfree(temp); + + c->summary->sum_num--; + } + + jffs2_sum_reset_collected(c->summary); + + wpage += padsize; + + sm = wpage; + sm->offset = cpu_to_je32(c->sector_size - jeb->free_size); + sm->magic = cpu_to_je32(JFFS2_SUM_MAGIC); + + isum.sum_crc = cpu_to_je32(crc32(0, c->summary->sum_buf, datasize)); + isum.node_crc = cpu_to_je32(crc32(0, &isum, sizeof(isum) - 8)); + + vecs[0].iov_base = &isum; + vecs[0].iov_len = sizeof(isum); + vecs[1].iov_base = c->summary->sum_buf; + vecs[1].iov_len = datasize; + + sum_ofs = jeb->offset + c->sector_size - jeb->free_size; + + dbg_summary("writing out data to flash to pos : 0x%08x\n", sum_ofs); + + ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0); + + if (ret || (retlen != infosize)) { + + JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n", + infosize, sum_ofs, ret, retlen); + + if (retlen) { + /* Waste remaining space */ + spin_lock(&c->erase_completion_lock); + jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL); + spin_unlock(&c->erase_completion_lock); + } + + c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE; + + return 0; + } + + spin_lock(&c->erase_completion_lock); + jffs2_link_node_ref(c, jeb, sum_ofs | REF_NORMAL, infosize, NULL); + spin_unlock(&c->erase_completion_lock); + + return 0; +} + +/* Write out summary information - called from jffs2_do_reserve_space */ + +int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) + __must_hold(&c->erase_completion_block) +{ + int datasize, infosize, padsize; + struct jffs2_eraseblock *jeb; + int ret = 0; + + dbg_summary("called\n"); + + spin_unlock(&c->erase_completion_lock); + + jeb = c->nextblock; + jffs2_prealloc_raw_node_refs(c, jeb, 1); + + if (!c->summary->sum_num || !c->summary->sum_list_head) { + JFFS2_WARNING("Empty summary info!!!\n"); + BUG(); + } + + datasize = c->summary->sum_size + sizeof(struct jffs2_sum_marker); + infosize = sizeof(struct jffs2_raw_summary) + datasize; + padsize = jeb->free_size - infosize; + infosize += padsize; + datasize += padsize; + + ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize); + spin_lock(&c->erase_completion_lock); + return ret; +} diff --git a/kernel/fs/jffs2/summary.h b/kernel/fs/jffs2/summary.h new file mode 100644 index 000000000..60207a2ae --- /dev/null +++ b/kernel/fs/jffs2/summary.h @@ -0,0 +1,213 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * Zoltan Sogor , + * Patrik Kluba , + * University of Szeged, Hungary + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef JFFS2_SUMMARY_H +#define JFFS2_SUMMARY_H + +/* Limit summary size to 64KiB so that we can kmalloc it. If the summary + is larger than that, we have to just ditch it and avoid using summary + for the eraseblock in question... and it probably doesn't hurt us much + anyway. */ +#define MAX_SUMMARY_SIZE 65536 + +#include +#include + +#define BLK_STATE_ALLFF 0 +#define BLK_STATE_CLEAN 1 +#define BLK_STATE_PARTDIRTY 2 +#define BLK_STATE_CLEANMARKER 3 +#define BLK_STATE_ALLDIRTY 4 +#define BLK_STATE_BADBLOCK 5 + +#define JFFS2_SUMMARY_NOSUM_SIZE 0xffffffff +#define JFFS2_SUMMARY_INODE_SIZE (sizeof(struct jffs2_sum_inode_flash)) +#define JFFS2_SUMMARY_DIRENT_SIZE(x) (sizeof(struct jffs2_sum_dirent_flash) + (x)) +#define JFFS2_SUMMARY_XATTR_SIZE (sizeof(struct jffs2_sum_xattr_flash)) +#define JFFS2_SUMMARY_XREF_SIZE (sizeof(struct jffs2_sum_xref_flash)) + +/* Summary structures used on flash */ + +struct jffs2_sum_unknown_flash +{ + jint16_t nodetype; /* node type */ +}; + +struct jffs2_sum_inode_flash +{ + jint16_t nodetype; /* node type */ + jint32_t inode; /* inode number */ + jint32_t version; /* inode version */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* record length */ +} __attribute__((packed)); + +struct jffs2_sum_dirent_flash +{ + jint16_t nodetype; /* == JFFS_NODETYPE_DIRENT */ + jint32_t totlen; /* record length */ + jint32_t offset; /* offset on jeb */ + jint32_t pino; /* parent inode */ + jint32_t version; /* dirent version */ + jint32_t ino; /* == zero for unlink */ + uint8_t nsize; /* dirent name size */ + uint8_t type; /* dirent type */ + uint8_t name[0]; /* dirent name */ +} __attribute__((packed)); + +struct jffs2_sum_xattr_flash +{ + jint16_t nodetype; /* == JFFS2_NODETYPE_XATR */ + jint32_t xid; /* xattr identifier */ + jint32_t version; /* version number */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* node length */ +} __attribute__((packed)); + +struct jffs2_sum_xref_flash +{ + jint16_t nodetype; /* == JFFS2_NODETYPE_XREF */ + jint32_t offset; /* offset on jeb */ +} __attribute__((packed)); + +union jffs2_sum_flash +{ + struct jffs2_sum_unknown_flash u; + struct jffs2_sum_inode_flash i; + struct jffs2_sum_dirent_flash d; + struct jffs2_sum_xattr_flash x; + struct jffs2_sum_xref_flash r; +}; + +/* Summary structures used in the memory */ + +struct jffs2_sum_unknown_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* node type */ +}; + +struct jffs2_sum_inode_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* node type */ + jint32_t inode; /* inode number */ + jint32_t version; /* inode version */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* record length */ +} __attribute__((packed)); + +struct jffs2_sum_dirent_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* == JFFS_NODETYPE_DIRENT */ + jint32_t totlen; /* record length */ + jint32_t offset; /* ofset on jeb */ + jint32_t pino; /* parent inode */ + jint32_t version; /* dirent version */ + jint32_t ino; /* == zero for unlink */ + uint8_t nsize; /* dirent name size */ + uint8_t type; /* dirent type */ + uint8_t name[0]; /* dirent name */ +} __attribute__((packed)); + +struct jffs2_sum_xattr_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; + jint32_t xid; + jint32_t version; + jint32_t offset; + jint32_t totlen; +} __attribute__((packed)); + +struct jffs2_sum_xref_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; + jint32_t offset; +} __attribute__((packed)); + +union jffs2_sum_mem +{ + struct jffs2_sum_unknown_mem u; + struct jffs2_sum_inode_mem i; + struct jffs2_sum_dirent_mem d; + struct jffs2_sum_xattr_mem x; + struct jffs2_sum_xref_mem r; +}; + +/* Summary related information stored in superblock */ + +struct jffs2_summary +{ + uint32_t sum_size; /* collected summary information for nextblock */ + uint32_t sum_num; + uint32_t sum_padded; + union jffs2_sum_mem *sum_list_head; + union jffs2_sum_mem *sum_list_tail; + + jint32_t *sum_buf; /* buffer for writing out summary */ +}; + +/* Summary marker is stored at the end of every sumarized erase block */ + +struct jffs2_sum_marker +{ + jint32_t offset; /* offset of the summary node in the jeb */ + jint32_t magic; /* == JFFS2_SUM_MAGIC */ +}; + +#define JFFS2_SUMMARY_FRAME_SIZE (sizeof(struct jffs2_raw_summary) + sizeof(struct jffs2_sum_marker)) + +#ifdef CONFIG_JFFS2_SUMMARY /* SUMMARY SUPPORT ENABLED */ + +#define jffs2_sum_active() (1) +int jffs2_sum_init(struct jffs2_sb_info *c); +void jffs2_sum_exit(struct jffs2_sb_info *c); +void jffs2_sum_disable_collecting(struct jffs2_summary *s); +int jffs2_sum_is_disabled(struct jffs2_summary *s); +void jffs2_sum_reset_collected(struct jffs2_summary *s); +void jffs2_sum_move_collected(struct jffs2_sb_info *c, struct jffs2_summary *s); +int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, uint32_t to); +int jffs2_sum_write_sumnode(struct jffs2_sb_info *c); +int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size); +int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs); +int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs); +int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs); +int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs); +int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t sumlen, + uint32_t *pseudo_random); + +#else /* SUMMARY DISABLED */ + +#define jffs2_sum_active() (0) +#define jffs2_sum_init(a) (0) +#define jffs2_sum_exit(a) +#define jffs2_sum_disable_collecting(a) +#define jffs2_sum_is_disabled(a) (0) +#define jffs2_sum_reset_collected(a) +#define jffs2_sum_add_kvec(a,b,c,d) (0) +#define jffs2_sum_move_collected(a,b) +#define jffs2_sum_write_sumnode(a) (0) +#define jffs2_sum_add_padding_mem(a,b) +#define jffs2_sum_add_inode_mem(a,b,c) +#define jffs2_sum_add_dirent_mem(a,b,c) +#define jffs2_sum_add_xattr_mem(a,b,c) +#define jffs2_sum_add_xref_mem(a,b,c) +#define jffs2_sum_scan_sumnode(a,b,c,d,e) (0) + +#endif /* CONFIG_JFFS2_SUMMARY */ + +#endif /* JFFS2_SUMMARY_H */ diff --git a/kernel/fs/jffs2/super.c b/kernel/fs/jffs2/super.c new file mode 100644 index 000000000..d86c5e317 --- /dev/null +++ b/kernel/fs/jffs2/super.c @@ -0,0 +1,442 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compr.h" +#include "nodelist.h" + +static void jffs2_put_super(struct super_block *); + +static struct kmem_cache *jffs2_inode_cachep; + +static struct inode *jffs2_alloc_inode(struct super_block *sb) +{ + struct jffs2_inode_info *f; + + f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL); + if (!f) + return NULL; + return &f->vfs_inode; +} + +static void jffs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); +} + +static void jffs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, jffs2_i_callback); +} + +static void jffs2_i_init_once(void *foo) +{ + struct jffs2_inode_info *f = foo; + + mutex_init(&f->sem); + inode_init_once(&f->vfs_inode); +} + +static const char *jffs2_compr_name(unsigned int compr) +{ + switch (compr) { + case JFFS2_COMPR_MODE_NONE: + return "none"; +#ifdef CONFIG_JFFS2_LZO + case JFFS2_COMPR_MODE_FORCELZO: + return "lzo"; +#endif +#ifdef CONFIG_JFFS2_ZLIB + case JFFS2_COMPR_MODE_FORCEZLIB: + return "zlib"; +#endif + default: + /* should never happen; programmer error */ + WARN_ON(1); + return ""; + } +} + +static int jffs2_show_options(struct seq_file *s, struct dentry *root) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(root->d_sb); + struct jffs2_mount_opts *opts = &c->mount_opts; + + if (opts->override_compr) + seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr)); + if (opts->rp_size) + seq_printf(s, ",rp_size=%u", opts->rp_size / 1024); + + return 0; +} + +static int jffs2_sync_fs(struct super_block *sb, int wait) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + cancel_delayed_work_sync(&c->wbuf_dwork); +#endif + + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + return 0; +} + +static struct inode *jffs2_nfs_get_inode(struct super_block *sb, uint64_t ino, + uint32_t generation) +{ + /* We don't care about i_generation. We'll destroy the flash + before we start re-using inode numbers anyway. And even + if that wasn't true, we'd have other problems...*/ + return jffs2_iget(sb, ino); +} + +static struct dentry *jffs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + jffs2_nfs_get_inode); +} + +static struct dentry *jffs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + jffs2_nfs_get_inode); +} + +static struct dentry *jffs2_get_parent(struct dentry *child) +{ + struct jffs2_inode_info *f; + uint32_t pino; + + BUG_ON(!d_is_dir(child)); + + f = JFFS2_INODE_INFO(d_inode(child)); + + pino = f->inocache->pino_nlink; + + JFFS2_DEBUG("Parent of directory ino #%u is #%u\n", + f->inocache->ino, pino); + + return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino)); +} + +static const struct export_operations jffs2_export_ops = { + .get_parent = jffs2_get_parent, + .fh_to_dentry = jffs2_fh_to_dentry, + .fh_to_parent = jffs2_fh_to_parent, +}; + +/* + * JFFS2 mount options. + * + * Opt_override_compr: override default compressor + * Opt_rp_size: size of reserved pool in KiB + * Opt_err: just end of array marker + */ +enum { + Opt_override_compr, + Opt_rp_size, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_override_compr, "compr=%s"}, + {Opt_rp_size, "rp_size=%u"}, + {Opt_err, NULL}, +}; + +static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) +{ + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + unsigned int opt; + + if (!data) + return 0; + + while ((p = strsep(&data, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_override_compr: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; +#ifdef CONFIG_JFFS2_LZO + else if (!strcmp(name, "lzo")) + c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO; +#endif +#ifdef CONFIG_JFFS2_ZLIB + else if (!strcmp(name, "zlib")) + c->mount_opts.compr = + JFFS2_COMPR_MODE_FORCEZLIB; +#endif + else { + pr_err("Error: unknown compressor \"%s\"\n", + name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = true; + break; + case Opt_rp_size: + if (match_int(&args[0], &opt)) + return -EINVAL; + opt *= 1024; + if (opt > c->mtd->size) { + pr_warn("Too large reserve pool specified, max " + "is %llu KB\n", c->mtd->size / 1024); + return -EINVAL; + } + c->mount_opts.rp_size = opt; + break; + default: + pr_err("Error: unrecognized mount option '%s' or missing value\n", + p); + return -EINVAL; + } + } + + return 0; +} + +static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + int err; + + sync_filesystem(sb); + err = jffs2_parse_options(c, data); + if (err) + return -EINVAL; + + return jffs2_do_remount_fs(sb, flags, data); +} + +static const struct super_operations jffs2_super_operations = +{ + .alloc_inode = jffs2_alloc_inode, + .destroy_inode =jffs2_destroy_inode, + .put_super = jffs2_put_super, + .statfs = jffs2_statfs, + .remount_fs = jffs2_remount_fs, + .evict_inode = jffs2_evict_inode, + .dirty_inode = jffs2_dirty_inode, + .show_options = jffs2_show_options, + .sync_fs = jffs2_sync_fs, +}; + +/* + * fill in the superblock + */ +static int jffs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jffs2_sb_info *c; + int ret; + + jffs2_dbg(1, "jffs2_get_sb_mtd():" + " New superblock for device %d (\"%s\")\n", + sb->s_mtd->index, sb->s_mtd->name); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + + c->mtd = sb->s_mtd; + c->os_priv = sb; + sb->s_fs_info = c; + + ret = jffs2_parse_options(c, data); + if (ret) { + kfree(c); + return -EINVAL; + } + + /* Initialize JFFS2 superblock locks, the further initialization will + * be done later */ + mutex_init(&c->alloc_sem); + mutex_init(&c->erase_free_sem); + init_waitqueue_head(&c->erase_wait); + init_waitqueue_head(&c->inocache_wq); + spin_lock_init(&c->erase_completion_lock); + spin_lock_init(&c->inocache_lock); + + sb->s_op = &jffs2_super_operations; + sb->s_export_op = &jffs2_export_ops; + sb->s_flags = sb->s_flags | MS_NOATIME; + sb->s_xattr = jffs2_xattr_handlers; +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + ret = jffs2_do_fill_super(sb, data, silent); + return ret; +} + +static struct dentry *jffs2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super); +} + +static void jffs2_put_super (struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + jffs2_dbg(2, "%s()\n", __func__); + + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + + jffs2_sum_exit(c); + + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else + kfree(c->blocks); + jffs2_flash_cleanup(c); + kfree(c->inocache_list); + jffs2_clear_xattr_subsystem(c); + mtd_sync(c->mtd); + jffs2_dbg(1, "%s(): returning\n", __func__); +} + +static void jffs2_kill_sb(struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + if (!(sb->s_flags & MS_RDONLY)) + jffs2_stop_garbage_collect_thread(c); + kill_mtd_super(sb); + kfree(c); +} + +static struct file_system_type jffs2_fs_type = { + .owner = THIS_MODULE, + .name = "jffs2", + .mount = jffs2_mount, + .kill_sb = jffs2_kill_sb, +}; +MODULE_ALIAS_FS("jffs2"); + +static int __init init_jffs2_fs(void) +{ + int ret; + + /* Paranoia checks for on-medium structures. If we ask GCC + to pack them with __attribute__((packed)) then it _also_ + assumes that they're not aligned -- so it emits crappy + code on some architectures. Ideally we want an attribute + which means just 'no padding', without the alignment + thing. But GCC doesn't have that -- we have to just + hope the structs are the right sizes, instead. */ + BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12); + BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40); + BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68); + BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32); + + pr_info("version 2.2." +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + " (NAND)" +#endif +#ifdef CONFIG_JFFS2_SUMMARY + " (SUMMARY) " +#endif + " © 2001-2006 Red Hat, Inc.\n"); + + jffs2_inode_cachep = kmem_cache_create("jffs2_i", + sizeof(struct jffs2_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + jffs2_i_init_once); + if (!jffs2_inode_cachep) { + pr_err("error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = jffs2_compressors_init(); + if (ret) { + pr_err("error: Failed to initialise compressors\n"); + goto out; + } + ret = jffs2_create_slab_caches(); + if (ret) { + pr_err("error: Failed to initialise slab caches\n"); + goto out_compressors; + } + ret = register_filesystem(&jffs2_fs_type); + if (ret) { + pr_err("error: Failed to register filesystem\n"); + goto out_slab; + } + return 0; + + out_slab: + jffs2_destroy_slab_caches(); + out_compressors: + jffs2_compressors_exit(); + out: + kmem_cache_destroy(jffs2_inode_cachep); + return ret; +} + +static void __exit exit_jffs2_fs(void) +{ + unregister_filesystem(&jffs2_fs_type); + jffs2_destroy_slab_caches(); + jffs2_compressors_exit(); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(jffs2_inode_cachep); +} + +module_init(init_jffs2_fs); +module_exit(exit_jffs2_fs); + +MODULE_DESCRIPTION("The Journalling Flash File System, v2"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); // Actually dual-licensed, but it doesn't matter for + // the sake of this tag. It's Free Software. diff --git a/kernel/fs/jffs2/symlink.c b/kernel/fs/jffs2/symlink.c new file mode 100644 index 000000000..1fefa25d0 --- /dev/null +++ b/kernel/fs/jffs2/symlink.c @@ -0,0 +1,66 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include "nodelist.h" + +static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd); + +const struct inode_operations jffs2_symlink_inode_operations = +{ + .readlink = generic_readlink, + .follow_link = jffs2_follow_link, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry)); + char *p = (char *)f->target; + + /* + * We don't acquire the f->sem mutex here since the only data we + * use is f->target. + * + * 1. If we are here the inode has already built and f->target has + * to point to the target path. + * 2. Nobody uses f->target (if the inode is symlink's inode). The + * exception is inode freeing function which frees f->target. But + * it can't be called while we are here and before VFS has + * stopped using our f->target string which we provide by means of + * nd_set_link() call. + */ + + if (!p) { + pr_err("%s(): can't find symlink target\n", __func__); + p = ERR_PTR(-EIO); + } + jffs2_dbg(1, "%s(): target path is '%s'\n", + __func__, (char *)f->target); + + nd_set_link(nd, p); + + /* + * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe + * since the only way that may cause f->target to be changed is iput() operation. + * But VFS will not use f->target after iput() has been called. + */ + return NULL; +} + diff --git a/kernel/fs/jffs2/wbuf.c b/kernel/fs/jffs2/wbuf.c new file mode 100644 index 000000000..09ed55190 --- /dev/null +++ b/kernel/fs/jffs2/wbuf.c @@ -0,0 +1,1353 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004 Thomas Gleixner + * + * Created by David Woodhouse + * Modified debugged and enhanced by Thomas Gleixner + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nodelist.h" + +/* For testing write failures */ +#undef BREAKME +#undef BREAKMEHEADER + +#ifdef BREAKME +static unsigned char *brokenbuf; +#endif + +#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(c->wbuf_pagesize)) * (unsigned long)(c->wbuf_pagesize) ) +#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(c->wbuf_pagesize) ) + +/* max. erase failures before we mark a block bad */ +#define MAX_ERASE_FAILURES 2 + +struct jffs2_inodirty { + uint32_t ino; + struct jffs2_inodirty *next; +}; + +static struct jffs2_inodirty inodirty_nomem; + +static int jffs2_wbuf_pending_for_ino(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inodirty *this = c->wbuf_inodes; + + /* If a malloc failed, consider _everything_ dirty */ + if (this == &inodirty_nomem) + return 1; + + /* If ino == 0, _any_ non-GC writes mean 'yes' */ + if (this && !ino) + return 1; + + /* Look to see if the inode in question is pending in the wbuf */ + while (this) { + if (this->ino == ino) + return 1; + this = this->next; + } + return 0; +} + +static void jffs2_clear_wbuf_ino_list(struct jffs2_sb_info *c) +{ + struct jffs2_inodirty *this; + + this = c->wbuf_inodes; + + if (this != &inodirty_nomem) { + while (this) { + struct jffs2_inodirty *next = this->next; + kfree(this); + this = next; + } + } + c->wbuf_inodes = NULL; +} + +static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inodirty *new; + + /* Schedule delayed write-buffer write-out */ + jffs2_dirty_trigger(c); + + if (jffs2_wbuf_pending_for_ino(c, ino)) + return; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) { + jffs2_dbg(1, "No memory to allocate inodirty. Fallback to all considered dirty\n"); + jffs2_clear_wbuf_ino_list(c); + c->wbuf_inodes = &inodirty_nomem; + return; + } + new->ino = ino; + new->next = c->wbuf_inodes; + c->wbuf_inodes = new; + return; +} + +static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c) +{ + struct list_head *this, *next; + static int n; + + if (list_empty(&c->erasable_pending_wbuf_list)) + return; + + list_for_each_safe(this, next, &c->erasable_pending_wbuf_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + jffs2_dbg(1, "Removing eraseblock at 0x%08x from erasable_pending_wbuf_list...\n", + jeb->offset); + list_del(this); + if ((jiffies + (n++)) & 127) { + /* Most of the time, we just erase it immediately. Otherwise we + spend ages scanning it on mount, etc. */ + jffs2_dbg(1, "...and adding to erase_pending_list\n"); + list_add_tail(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } else { + /* Sometimes, however, we leave it elsewhere so it doesn't get + immediately reused, and we spread the load a bit. */ + jffs2_dbg(1, "...and adding to erasable_list\n"); + list_add_tail(&jeb->list, &c->erasable_list); + } + } +} + +#define REFILE_NOTEMPTY 0 +#define REFILE_ANYWAY 1 + +static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int allow_empty) +{ + jffs2_dbg(1, "About to refile bad block at %08x\n", jeb->offset); + + /* File the existing block on the bad_used_list.... */ + if (c->nextblock == jeb) + c->nextblock = NULL; + else /* Not sure this should ever happen... need more coffee */ + list_del(&jeb->list); + if (jeb->first_node) { + jffs2_dbg(1, "Refiling block at %08x to bad_used_list\n", + jeb->offset); + list_add(&jeb->list, &c->bad_used_list); + } else { + BUG_ON(allow_empty == REFILE_NOTEMPTY); + /* It has to have had some nodes or we couldn't be here */ + jffs2_dbg(1, "Refiling block at %08x to erase_pending_list\n", + jeb->offset); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + + if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { + uint32_t oldfree = jeb->free_size; + + jffs2_link_node_ref(c, jeb, + (jeb->offset+c->sector_size-oldfree) | REF_OBSOLETE, + oldfree, NULL); + /* convert to wasted */ + c->wasted_size += oldfree; + jeb->wasted_size += oldfree; + c->dirty_size -= oldfree; + jeb->dirty_size -= oldfree; + } + + jffs2_dbg_dump_block_lists_nolock(c); + jffs2_dbg_acct_sanity_check_nolock(c,jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); +} + +static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_raw_node_ref *raw, + union jffs2_node_union *node) +{ + struct jffs2_node_frag *frag; + struct jffs2_full_dirent *fd; + + dbg_noderef("incore_replace_raw: node at %p is {%04x,%04x}\n", + node, je16_to_cpu(node->u.magic), je16_to_cpu(node->u.nodetype)); + + BUG_ON(je16_to_cpu(node->u.magic) != 0x1985 && + je16_to_cpu(node->u.magic) != 0); + + switch (je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + if (f->metadata && f->metadata->raw == raw) { + dbg_noderef("Will replace ->raw in f->metadata at %p\n", f->metadata); + return &f->metadata->raw; + } + frag = jffs2_lookup_node_frag(&f->fragtree, je32_to_cpu(node->i.offset)); + BUG_ON(!frag); + /* Find a frag which refers to the full_dnode we want to modify */ + while (!frag->node || frag->node->raw != raw) { + frag = frag_next(frag); + BUG_ON(!frag); + } + dbg_noderef("Will replace ->raw in full_dnode at %p\n", frag->node); + return &frag->node->raw; + + case JFFS2_NODETYPE_DIRENT: + for (fd = f->dents; fd; fd = fd->next) { + if (fd->raw == raw) { + dbg_noderef("Will replace ->raw in full_dirent at %p\n", fd); + return &fd->raw; + } + } + BUG(); + + default: + dbg_noderef("Don't care about replacing raw for nodetype %x\n", + je16_to_cpu(node->u.nodetype)); + break; + } + return NULL; +} + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY +static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf, + uint32_t ofs) +{ + int ret; + size_t retlen; + char *eccstr; + + ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); + if (ret && ret != -EUCLEAN && ret != -EBADMSG) { + pr_warn("%s(): Read back of page at %08x failed: %d\n", + __func__, c->wbuf_ofs, ret); + return ret; + } else if (retlen != c->wbuf_pagesize) { + pr_warn("%s(): Read back of page at %08x gave short read: %zd not %d\n", + __func__, ofs, retlen, c->wbuf_pagesize); + return -EIO; + } + if (!memcmp(buf, c->wbuf_verify, c->wbuf_pagesize)) + return 0; + + if (ret == -EUCLEAN) + eccstr = "corrected"; + else if (ret == -EBADMSG) + eccstr = "correction failed"; + else + eccstr = "OK or unused"; + + pr_warn("Write verify error (ECC %s) at %08x. Wrote:\n", + eccstr, c->wbuf_ofs); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, + c->wbuf, c->wbuf_pagesize, 0); + + pr_warn("Read back:\n"); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, + c->wbuf_verify, c->wbuf_pagesize, 0); + + return -EIO; +} +#else +#define jffs2_verify_write(c,b,o) (0) +#endif + +/* Recover from failure to write wbuf. Recover the nodes up to the + * wbuf, not the one which we were starting to try to write. */ + +static void jffs2_wbuf_recover(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *jeb, *new_jeb; + struct jffs2_raw_node_ref *raw, *next, *first_raw = NULL; + size_t retlen; + int ret; + int nr_refile = 0; + unsigned char *buf; + uint32_t start, end, ofs, len; + + jeb = &c->blocks[c->wbuf_ofs / c->sector_size]; + + spin_lock(&c->erase_completion_lock); + if (c->wbuf_ofs % c->mtd->erasesize) + jffs2_block_refile(c, jeb, REFILE_NOTEMPTY); + else + jffs2_block_refile(c, jeb, REFILE_ANYWAY); + spin_unlock(&c->erase_completion_lock); + + BUG_ON(!ref_obsolete(jeb->last_node)); + + /* Find the first node to be recovered, by skipping over every + node which ends before the wbuf starts, or which is obsolete. */ + for (next = raw = jeb->first_node; next; raw = next) { + next = ref_next(raw); + + if (ref_obsolete(raw) || + (next && ref_offset(next) <= c->wbuf_ofs)) { + dbg_noderef("Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n", + ref_offset(raw), ref_flags(raw), + (ref_offset(raw) + ref_totlen(c, jeb, raw)), + c->wbuf_ofs); + continue; + } + dbg_noderef("First node to be recovered is at 0x%08x(%d)-0x%08x\n", + ref_offset(raw), ref_flags(raw), + (ref_offset(raw) + ref_totlen(c, jeb, raw))); + + first_raw = raw; + break; + } + + if (!first_raw) { + /* All nodes were obsolete. Nothing to recover. */ + jffs2_dbg(1, "No non-obsolete nodes to be recovered. Just filing block bad\n"); + c->wbuf_len = 0; + return; + } + + start = ref_offset(first_raw); + end = ref_offset(jeb->last_node); + nr_refile = 1; + + /* Count the number of refs which need to be copied */ + while ((raw = ref_next(raw)) != jeb->last_node) + nr_refile++; + + dbg_noderef("wbuf recover %08x-%08x (%d bytes in %d nodes)\n", + start, end, end - start, nr_refile); + + buf = NULL; + if (start < c->wbuf_ofs) { + /* First affected node was already partially written. + * Attempt to reread the old data into our buffer. */ + + buf = kmalloc(end - start, GFP_KERNEL); + if (!buf) { + pr_crit("Malloc failure in wbuf recovery. Data loss ensues.\n"); + + goto read_failed; + } + + /* Do the read... */ + ret = mtd_read(c->mtd, start, c->wbuf_ofs - start, &retlen, + buf); + + /* ECC recovered ? */ + if ((ret == -EUCLEAN || ret == -EBADMSG) && + (retlen == c->wbuf_ofs - start)) + ret = 0; + + if (ret || retlen != c->wbuf_ofs - start) { + pr_crit("Old data are already lost in wbuf recovery. Data loss ensues.\n"); + + kfree(buf); + buf = NULL; + read_failed: + first_raw = ref_next(first_raw); + nr_refile--; + while (first_raw && ref_obsolete(first_raw)) { + first_raw = ref_next(first_raw); + nr_refile--; + } + + /* If this was the only node to be recovered, give up */ + if (!first_raw) { + c->wbuf_len = 0; + return; + } + + /* It wasn't. Go on and try to recover nodes complete in the wbuf */ + start = ref_offset(first_raw); + dbg_noderef("wbuf now recover %08x-%08x (%d bytes in %d nodes)\n", + start, end, end - start, nr_refile); + + } else { + /* Read succeeded. Copy the remaining data from the wbuf */ + memcpy(buf + (c->wbuf_ofs - start), c->wbuf, end - c->wbuf_ofs); + } + } + /* OK... we're to rewrite (end-start) bytes of data from first_raw onwards. + Either 'buf' contains the data, or we find it in the wbuf */ + + /* ... and get an allocation of space from a shiny new block instead */ + ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE); + if (ret) { + pr_warn("Failed to allocate space for wbuf recovery. Data loss ensues.\n"); + kfree(buf); + return; + } + + /* The summary is not recovered, so it must be disabled for this erase block */ + jffs2_sum_disable_collecting(c->summary); + + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile); + if (ret) { + pr_warn("Failed to allocate node refs for wbuf recovery. Data loss ensues.\n"); + kfree(buf); + return; + } + + ofs = write_ofs(c); + + if (end-start >= c->wbuf_pagesize) { + /* Need to do another write immediately, but it's possible + that this is just because the wbuf itself is completely + full, and there's nothing earlier read back from the + flash. Hence 'buf' isn't necessarily what we're writing + from. */ + unsigned char *rewrite_buf = buf?:c->wbuf; + uint32_t towrite = (end-start) - ((end-start)%c->wbuf_pagesize); + + jffs2_dbg(1, "Write 0x%x bytes at 0x%08x in wbuf recover\n", + towrite, ofs); + +#ifdef BREAKMEHEADER + static int breakme; + if (breakme++ == 20) { + pr_notice("Faking write error at 0x%08x\n", ofs); + breakme = 0; + mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf); + ret = -EIO; + } else +#endif + ret = mtd_write(c->mtd, ofs, towrite, &retlen, + rewrite_buf); + + if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { + /* Argh. We tried. Really we did. */ + pr_crit("Recovery of wbuf failed due to a second write error\n"); + kfree(buf); + + if (retlen) + jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, first_raw), NULL); + + return; + } + pr_notice("Recovery of wbuf succeeded to %08x\n", ofs); + + c->wbuf_len = (end - start) - towrite; + c->wbuf_ofs = ofs + towrite; + memmove(c->wbuf, rewrite_buf + towrite, c->wbuf_len); + /* Don't muck about with c->wbuf_inodes. False positives are harmless. */ + } else { + /* OK, now we're left with the dregs in whichever buffer we're using */ + if (buf) { + memcpy(c->wbuf, buf, end-start); + } else { + memmove(c->wbuf, c->wbuf + (start - c->wbuf_ofs), end - start); + } + c->wbuf_ofs = ofs; + c->wbuf_len = end - start; + } + + /* Now sort out the jffs2_raw_node_refs, moving them from the old to the next block */ + new_jeb = &c->blocks[ofs / c->sector_size]; + + spin_lock(&c->erase_completion_lock); + for (raw = first_raw; raw != jeb->last_node; raw = ref_next(raw)) { + uint32_t rawlen = ref_totlen(c, jeb, raw); + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref *new_ref; + struct jffs2_raw_node_ref **adjust_ref = NULL; + struct jffs2_inode_info *f = NULL; + + jffs2_dbg(1, "Refiling block of %08x at %08x(%d) to %08x\n", + rawlen, ref_offset(raw), ref_flags(raw), ofs); + + ic = jffs2_raw_ref_to_ic(raw); + + /* Ick. This XATTR mess should be fixed shortly... */ + if (ic && ic->class == RAWNODE_CLASS_XATTR_DATUM) { + struct jffs2_xattr_datum *xd = (void *)ic; + BUG_ON(xd->node != raw); + adjust_ref = &xd->node; + raw->next_in_ino = NULL; + ic = NULL; + } else if (ic && ic->class == RAWNODE_CLASS_XATTR_REF) { + struct jffs2_xattr_datum *xr = (void *)ic; + BUG_ON(xr->node != raw); + adjust_ref = &xr->node; + raw->next_in_ino = NULL; + ic = NULL; + } else if (ic && ic->class == RAWNODE_CLASS_INODE_CACHE) { + struct jffs2_raw_node_ref **p = &ic->nodes; + + /* Remove the old node from the per-inode list */ + while (*p && *p != (void *)ic) { + if (*p == raw) { + (*p) = (raw->next_in_ino); + raw->next_in_ino = NULL; + break; + } + p = &((*p)->next_in_ino); + } + + if (ic->state == INO_STATE_PRESENT && !ref_obsolete(raw)) { + /* If it's an in-core inode, then we have to adjust any + full_dirent or full_dnode structure to point to the + new version instead of the old */ + f = jffs2_gc_fetch_inode(c, ic->ino, !ic->pino_nlink); + if (IS_ERR(f)) { + /* Should never happen; it _must_ be present */ + JFFS2_ERROR("Failed to iget() ino #%u, err %ld\n", + ic->ino, PTR_ERR(f)); + BUG(); + } + /* We don't lock f->sem. There's a number of ways we could + end up in here with it already being locked, and nobody's + going to modify it on us anyway because we hold the + alloc_sem. We're only changing one ->raw pointer too, + which we can get away with without upsetting readers. */ + adjust_ref = jffs2_incore_replace_raw(c, f, raw, + (void *)(buf?:c->wbuf) + (ref_offset(raw) - start)); + } else if (unlikely(ic->state != INO_STATE_PRESENT && + ic->state != INO_STATE_CHECKEDABSENT && + ic->state != INO_STATE_GC)) { + JFFS2_ERROR("Inode #%u is in strange state %d!\n", ic->ino, ic->state); + BUG(); + } + } + + new_ref = jffs2_link_node_ref(c, new_jeb, ofs | ref_flags(raw), rawlen, ic); + + if (adjust_ref) { + BUG_ON(*adjust_ref != raw); + *adjust_ref = new_ref; + } + if (f) + jffs2_gc_release_inode(c, f); + + if (!ref_obsolete(raw)) { + jeb->dirty_size += rawlen; + jeb->used_size -= rawlen; + c->dirty_size += rawlen; + c->used_size -= rawlen; + raw->flash_offset = ref_offset(raw) | REF_OBSOLETE; + BUG_ON(raw->next_in_ino); + } + ofs += rawlen; + } + + kfree(buf); + + /* Fix up the original jeb now it's on the bad_list */ + if (first_raw == jeb->first_node) { + jffs2_dbg(1, "Failing block at %08x is now empty. Moving to erase_pending_list\n", + jeb->offset); + list_move(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + jffs2_dbg_acct_sanity_check_nolock(c, new_jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb); + + spin_unlock(&c->erase_completion_lock); + + jffs2_dbg(1, "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", + c->wbuf_ofs, c->wbuf_len); + +} + +/* Meaning of pad argument: + 0: Do not pad. Probably pointless - we only ever use this when we can't pad anyway. + 1: Pad, do not adjust nextblock free_size + 2: Pad, adjust nextblock free_size +*/ +#define NOPAD 0 +#define PAD_NOACCOUNT 1 +#define PAD_ACCOUNTING 2 + +static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) +{ + struct jffs2_eraseblock *wbuf_jeb; + int ret; + size_t retlen; + + /* Nothing to do if not write-buffering the flash. In particular, we shouldn't + del_timer() the timer we never initialised. */ + if (!jffs2_is_writebuffered(c)) + return 0; + + if (!mutex_is_locked(&c->alloc_sem)) { + pr_crit("jffs2_flush_wbuf() called with alloc_sem not locked!\n"); + BUG(); + } + + if (!c->wbuf_len) /* already checked c->wbuf above */ + return 0; + + wbuf_jeb = &c->blocks[c->wbuf_ofs / c->sector_size]; + if (jffs2_prealloc_raw_node_refs(c, wbuf_jeb, c->nextblock->allocated_refs + 1)) + return -ENOMEM; + + /* claim remaining space on the page + this happens, if we have a change to a new block, + or if fsync forces us to flush the writebuffer. + if we have a switch to next page, we will not have + enough remaining space for this. + */ + if (pad ) { + c->wbuf_len = PAD(c->wbuf_len); + + /* Pad with JFFS2_DIRTY_BITMASK initially. this helps out ECC'd NOR + with 8 byte page size */ + memset(c->wbuf + c->wbuf_len, 0, c->wbuf_pagesize - c->wbuf_len); + + if ( c->wbuf_len + sizeof(struct jffs2_unknown_node) < c->wbuf_pagesize) { + struct jffs2_unknown_node *padnode = (void *)(c->wbuf + c->wbuf_len); + padnode->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + padnode->nodetype = cpu_to_je16(JFFS2_NODETYPE_PADDING); + padnode->totlen = cpu_to_je32(c->wbuf_pagesize - c->wbuf_len); + padnode->hdr_crc = cpu_to_je32(crc32(0, padnode, sizeof(*padnode)-4)); + } + } + /* else jffs2_flash_writev has actually filled in the rest of the + buffer for us, and will deal with the node refs etc. later. */ + +#ifdef BREAKME + static int breakme; + if (breakme++ == 20) { + pr_notice("Faking write error at 0x%08x\n", c->wbuf_ofs); + breakme = 0; + mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, + brokenbuf); + ret = -EIO; + } else +#endif + + ret = mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, + &retlen, c->wbuf); + + if (ret) { + pr_warn("jffs2_flush_wbuf(): Write failed with %d\n", ret); + goto wfail; + } else if (retlen != c->wbuf_pagesize) { + pr_warn("jffs2_flush_wbuf(): Write was short: %zd instead of %d\n", + retlen, c->wbuf_pagesize); + ret = -EIO; + goto wfail; + } else if ((ret = jffs2_verify_write(c, c->wbuf, c->wbuf_ofs))) { + wfail: + jffs2_wbuf_recover(c); + + return ret; + } + + /* Adjust free size of the block if we padded. */ + if (pad) { + uint32_t waste = c->wbuf_pagesize - c->wbuf_len; + + jffs2_dbg(1, "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n", + (wbuf_jeb == c->nextblock) ? "next" : "", + wbuf_jeb->offset); + + /* wbuf_pagesize - wbuf_len is the amount of space that's to be + padded. If there is less free space in the block than that, + something screwed up */ + if (wbuf_jeb->free_size < waste) { + pr_crit("jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n", + c->wbuf_ofs, c->wbuf_len, waste); + pr_crit("jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n", + wbuf_jeb->offset, wbuf_jeb->free_size); + BUG(); + } + + spin_lock(&c->erase_completion_lock); + + jffs2_link_node_ref(c, wbuf_jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL); + /* FIXME: that made it count as dirty. Convert to wasted */ + wbuf_jeb->dirty_size -= waste; + c->dirty_size -= waste; + wbuf_jeb->wasted_size += waste; + c->wasted_size += waste; + } else + spin_lock(&c->erase_completion_lock); + + /* Stick any now-obsoleted blocks on the erase_pending_list */ + jffs2_refile_wbuf_blocks(c); + jffs2_clear_wbuf_ino_list(c); + spin_unlock(&c->erase_completion_lock); + + memset(c->wbuf,0xff,c->wbuf_pagesize); + /* adjust write buffer offset, else we get a non contiguous write bug */ + c->wbuf_ofs += c->wbuf_pagesize; + c->wbuf_len = 0; + return 0; +} + +/* Trigger garbage collection to flush the write-buffer. + If ino arg is zero, do it if _any_ real (i.e. not GC) writes are + outstanding. If ino arg non-zero, do it only if a write for the + given inode is outstanding. */ +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) +{ + uint32_t old_wbuf_ofs; + uint32_t old_wbuf_len; + int ret = 0; + + jffs2_dbg(1, "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino); + + if (!c->wbuf) + return 0; + + mutex_lock(&c->alloc_sem); + if (!jffs2_wbuf_pending_for_ino(c, ino)) { + jffs2_dbg(1, "Ino #%d not pending in wbuf. Returning\n", ino); + mutex_unlock(&c->alloc_sem); + return 0; + } + + old_wbuf_ofs = c->wbuf_ofs; + old_wbuf_len = c->wbuf_len; + + if (c->unchecked_size) { + /* GC won't make any progress for a while */ + jffs2_dbg(1, "%s(): padding. Not finished checking\n", + __func__); + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + /* retry flushing wbuf in case jffs2_wbuf_recover + left some data in the wbuf */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + up_write(&c->wbuf_sem); + } else while (old_wbuf_len && + old_wbuf_ofs == c->wbuf_ofs) { + + mutex_unlock(&c->alloc_sem); + + jffs2_dbg(1, "%s(): calls gc pass\n", __func__); + + ret = jffs2_garbage_collect_pass(c); + if (ret) { + /* GC failed. Flush it with padding instead */ + mutex_lock(&c->alloc_sem); + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + /* retry flushing wbuf in case jffs2_wbuf_recover + left some data in the wbuf */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + up_write(&c->wbuf_sem); + break; + } + mutex_lock(&c->alloc_sem); + } + + jffs2_dbg(1, "%s(): ends...\n", __func__); + + mutex_unlock(&c->alloc_sem); + return ret; +} + +/* Pad write-buffer to end and write it, wasting space. */ +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c) +{ + int ret; + + if (!c->wbuf) + return 0; + + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + /* retry - maybe wbuf recover left some data in wbuf. */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + up_write(&c->wbuf_sem); + + return ret; +} + +static size_t jffs2_fill_wbuf(struct jffs2_sb_info *c, const uint8_t *buf, + size_t len) +{ + if (len && !c->wbuf_len && (len >= c->wbuf_pagesize)) + return 0; + + if (len > (c->wbuf_pagesize - c->wbuf_len)) + len = c->wbuf_pagesize - c->wbuf_len; + memcpy(c->wbuf + c->wbuf_len, buf, len); + c->wbuf_len += (uint32_t) len; + return len; +} + +int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, loff_t to, size_t *retlen, + uint32_t ino) +{ + struct jffs2_eraseblock *jeb; + size_t wbuf_retlen, donelen = 0; + uint32_t outvec_to = to; + int ret, invec; + + /* If not writebuffered flash, don't bother */ + if (!jffs2_is_writebuffered(c)) + return jffs2_flash_direct_writev(c, invecs, count, to, retlen); + + down_write(&c->wbuf_sem); + + /* If wbuf_ofs is not initialized, set it to target address */ + if (c->wbuf_ofs == 0xFFFFFFFF) { + c->wbuf_ofs = PAGE_DIV(to); + c->wbuf_len = PAGE_MOD(to); + memset(c->wbuf,0xff,c->wbuf_pagesize); + } + + /* + * Sanity checks on target address. It's permitted to write + * at PAD(c->wbuf_len+c->wbuf_ofs), and it's permitted to + * write at the beginning of a new erase block. Anything else, + * and you die. New block starts at xxx000c (0-b = block + * header) + */ + if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) { + /* It's a write to a new block */ + if (c->wbuf_len) { + jffs2_dbg(1, "%s(): to 0x%lx causes flush of wbuf at 0x%08x\n", + __func__, (unsigned long)to, c->wbuf_ofs); + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + if (ret) + goto outerr; + } + /* set pointer to new block */ + c->wbuf_ofs = PAGE_DIV(to); + c->wbuf_len = PAGE_MOD(to); + } + + if (to != PAD(c->wbuf_ofs + c->wbuf_len)) { + /* We're not writing immediately after the writebuffer. Bad. */ + pr_crit("%s(): Non-contiguous write to %08lx\n", + __func__, (unsigned long)to); + if (c->wbuf_len) + pr_crit("wbuf was previously %08x-%08x\n", + c->wbuf_ofs, c->wbuf_ofs + c->wbuf_len); + BUG(); + } + + /* adjust alignment offset */ + if (c->wbuf_len != PAGE_MOD(to)) { + c->wbuf_len = PAGE_MOD(to); + /* take care of alignment to next page */ + if (!c->wbuf_len) { + c->wbuf_len = c->wbuf_pagesize; + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + } + + for (invec = 0; invec < count; invec++) { + int vlen = invecs[invec].iov_len; + uint8_t *v = invecs[invec].iov_base; + + wbuf_retlen = jffs2_fill_wbuf(c, v, vlen); + + if (c->wbuf_len == c->wbuf_pagesize) { + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + donelen += wbuf_retlen; + v += wbuf_retlen; + + if (vlen >= c->wbuf_pagesize) { + ret = mtd_write(c->mtd, outvec_to, PAGE_DIV(vlen), + &wbuf_retlen, v); + if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen)) + goto outfile; + + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + c->wbuf_ofs = outvec_to; + donelen += wbuf_retlen; + v += wbuf_retlen; + } + + wbuf_retlen = jffs2_fill_wbuf(c, v, vlen); + if (c->wbuf_len == c->wbuf_pagesize) { + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + + outvec_to += wbuf_retlen; + donelen += wbuf_retlen; + } + + /* + * If there's a remainder in the wbuf and it's a non-GC write, + * remember that the wbuf affects this ino + */ + *retlen = donelen; + + if (jffs2_sum_active()) { + int res = jffs2_sum_add_kvec(c, invecs, count, (uint32_t) to); + if (res) + return res; + } + + if (c->wbuf_len && ino) + jffs2_wbuf_dirties_inode(c, ino); + + ret = 0; + up_write(&c->wbuf_sem); + return ret; + +outfile: + /* + * At this point we have no problem, c->wbuf is empty. However + * refile nextblock to avoid writing again to same address. + */ + + spin_lock(&c->erase_completion_lock); + + jeb = &c->blocks[outvec_to / c->sector_size]; + jffs2_block_refile(c, jeb, REFILE_ANYWAY); + + spin_unlock(&c->erase_completion_lock); + +outerr: + *retlen = 0; + up_write(&c->wbuf_sem); + return ret; +} + +/* + * This is the entry for flash write. + * Check, if we work on NAND FLASH, if so build an kvec and write it via vritev +*/ +int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf) +{ + struct kvec vecs[1]; + + if (!jffs2_is_writebuffered(c)) + return jffs2_flash_direct_write(c, ofs, len, retlen, buf); + + vecs[0].iov_base = (unsigned char *) buf; + vecs[0].iov_len = len; + return jffs2_flash_writev(c, vecs, 1, ofs, retlen, 0); +} + +/* + Handle readback from writebuffer and ECC failure return +*/ +int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, u_char *buf) +{ + loff_t orbf = 0, owbf = 0, lwbf = 0; + int ret; + + if (!jffs2_is_writebuffered(c)) + return mtd_read(c->mtd, ofs, len, retlen, buf); + + /* Read flash */ + down_read(&c->wbuf_sem); + ret = mtd_read(c->mtd, ofs, len, retlen, buf); + + if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { + if (ret == -EBADMSG) + pr_warn("mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n", + len, ofs); + /* + * We have the raw data without ECC correction in the buffer, + * maybe we are lucky and all data or parts are correct. We + * check the node. If data are corrupted node check will sort + * it out. We keep this block, it will fail on write or erase + * and the we mark it bad. Or should we do that now? But we + * should give him a chance. Maybe we had a system crash or + * power loss before the ecc write or a erase was completed. + * So we return success. :) + */ + ret = 0; + } + + /* if no writebuffer available or write buffer empty, return */ + if (!c->wbuf_pagesize || !c->wbuf_len) + goto exit; + + /* if we read in a different block, return */ + if (SECTOR_ADDR(ofs) != SECTOR_ADDR(c->wbuf_ofs)) + goto exit; + + if (ofs >= c->wbuf_ofs) { + owbf = (ofs - c->wbuf_ofs); /* offset in write buffer */ + if (owbf > c->wbuf_len) /* is read beyond write buffer ? */ + goto exit; + lwbf = c->wbuf_len - owbf; /* number of bytes to copy */ + if (lwbf > len) + lwbf = len; + } else { + orbf = (c->wbuf_ofs - ofs); /* offset in read buffer */ + if (orbf > len) /* is write beyond write buffer ? */ + goto exit; + lwbf = len - orbf; /* number of bytes to copy */ + if (lwbf > c->wbuf_len) + lwbf = c->wbuf_len; + } + if (lwbf > 0) + memcpy(buf+orbf,c->wbuf+owbf,lwbf); + +exit: + up_read(&c->wbuf_sem); + return ret; +} + +#define NR_OOB_SCAN_PAGES 4 + +/* For historical reasons we use only 8 bytes for OOB clean marker */ +#define OOB_CM_SIZE 8 + +static const struct jffs2_unknown_node oob_cleanmarker = +{ + .magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), + .totlen = constant_cpu_to_je32(8) +}; + +/* + * Check, if the out of band area is empty. This function knows about the clean + * marker and if it is present in OOB, treats the OOB as empty anyway. + */ +int jffs2_check_oob_empty(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int mode) +{ + int i, ret; + int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + struct mtd_oob_ops ops; + + ops.mode = MTD_OPS_AUTO_OOB; + ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; + ops.oobbuf = c->oobbuf; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = mtd_read_oob(c->mtd, jeb->offset, &ops); + if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) { + pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret || mtd_is_bitflip(ret)) + ret = -EIO; + return ret; + } + + for(i = 0; i < ops.ooblen; i++) { + if (mode && i < cmlen) + /* Yeah, we know about the cleanmarker */ + continue; + + if (ops.oobbuf[i] != 0xFF) { + jffs2_dbg(2, "Found %02x at %x in OOB for " + "%08x\n", ops.oobbuf[i], i, jeb->offset); + return 1; + } + } + + return 0; +} + +/* + * Check for a valid cleanmarker. + * Returns: 0 if a valid cleanmarker was found + * 1 if no cleanmarker was found + * negative error code if an error occurred + */ +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + struct mtd_oob_ops ops; + int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + + ops.mode = MTD_OPS_AUTO_OOB; + ops.ooblen = cmlen; + ops.oobbuf = c->oobbuf; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = mtd_read_oob(c->mtd, jeb->offset, &ops); + if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) { + pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret || mtd_is_bitflip(ret)) + ret = -EIO; + return ret; + } + + return !!memcmp(&oob_cleanmarker, c->oobbuf, cmlen); +} + +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + int ret; + struct mtd_oob_ops ops; + int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + + ops.mode = MTD_OPS_AUTO_OOB; + ops.ooblen = cmlen; + ops.oobbuf = (uint8_t *)&oob_cleanmarker; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = mtd_write_oob(c->mtd, jeb->offset, &ops); + if (ret || ops.oobretlen != ops.ooblen) { + pr_err("cannot write OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; + return ret; + } + + return 0; +} + +/* + * On NAND we try to mark this block bad. If the block was erased more + * than MAX_ERASE_FAILURES we mark it finally bad. + * Don't care about failures. This block remains on the erase-pending + * or badblock list as long as nobody manipulates the flash with + * a bootloader or something like that. + */ + +int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) +{ + int ret; + + /* if the count is < max, we try to write the counter to the 2nd page oob area */ + if( ++jeb->bad_count < MAX_ERASE_FAILURES) + return 0; + + pr_warn("marking eraseblock at %08x as bad\n", bad_offset); + ret = mtd_block_markbad(c->mtd, bad_offset); + + if (ret) { + jffs2_dbg(1, "%s(): Write failed for block at %08x: error %d\n", + __func__, jeb->offset, ret); + return ret; + } + return 1; +} + +static struct jffs2_sb_info *work_to_sb(struct work_struct *work) +{ + struct delayed_work *dwork; + + dwork = container_of(work, struct delayed_work, work); + return container_of(dwork, struct jffs2_sb_info, wbuf_dwork); +} + +static void delayed_wbuf_sync(struct work_struct *work) +{ + struct jffs2_sb_info *c = work_to_sb(work); + struct super_block *sb = OFNI_BS_2SFFJ(c); + + if (!(sb->s_flags & MS_RDONLY)) { + jffs2_dbg(1, "%s()\n", __func__); + jffs2_flush_wbuf_gc(c, 0); + } +} + +void jffs2_dirty_trigger(struct jffs2_sb_info *c) +{ + struct super_block *sb = OFNI_BS_2SFFJ(c); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + if (queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay)) + jffs2_dbg(1, "%s()\n", __func__); +} + +int jffs2_nand_flash_setup(struct jffs2_sb_info *c) +{ + struct nand_ecclayout *oinfo = c->mtd->ecclayout; + + if (!c->mtd->oobsize) + return 0; + + /* Cleanmarker is out-of-band, so inline size zero */ + c->cleanmarker_size = 0; + + if (!oinfo || oinfo->oobavail == 0) { + pr_err("inconsistent device description\n"); + return -EINVAL; + } + + jffs2_dbg(1, "using OOB on NAND\n"); + + c->oobavail = oinfo->oobavail; + + /* Initialise write buffer */ + init_rwsem(&c->wbuf_sem); + INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + + c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->oobavail, GFP_KERNEL); + if (!c->oobbuf) { + kfree(c->wbuf); + return -ENOMEM; + } + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->oobbuf); + kfree(c->wbuf); + return -ENOMEM; + } +#endif + return 0; +} + +void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c) +{ +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); + kfree(c->oobbuf); +} + +int jffs2_dataflash_setup(struct jffs2_sb_info *c) { + c->cleanmarker_size = 0; /* No cleanmarkers needed */ + + /* Initialize write buffer */ + init_rwsem(&c->wbuf_sem); + INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); + c->wbuf_pagesize = c->mtd->erasesize; + + /* Find a suitable c->sector_size + * - Not too much sectors + * - Sectors have to be at least 4 K + some bytes + * - All known dataflashes have erase sizes of 528 or 1056 + * - we take at least 8 eraseblocks and want to have at least 8K size + * - The concatenation should be a power of 2 + */ + + c->sector_size = 8 * c->mtd->erasesize; + + while (c->sector_size < 8192) { + c->sector_size *= 2; + } + + /* It may be necessary to adjust the flash size */ + c->flash_size = c->mtd->size; + + if ((c->flash_size % c->sector_size) != 0) { + c->flash_size = (c->flash_size / c->sector_size) * c->sector_size; + pr_warn("flash size adjusted to %dKiB\n", c->flash_size); + }; + + c->wbuf_ofs = 0xFFFFFFFF; + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->oobbuf); + kfree(c->wbuf); + return -ENOMEM; + } +#endif + + pr_info("write-buffering enabled buffer (%d) erasesize (%d)\n", + c->wbuf_pagesize, c->sector_size); + + return 0; +} + +void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); +} + +int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { + /* Cleanmarker currently occupies whole programming regions, + * either one or 2 for 8Byte STMicro flashes. */ + c->cleanmarker_size = max(16u, c->mtd->writesize); + + /* Initialize write buffer */ + init_rwsem(&c->wbuf_sem); + INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); + + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->wbuf); + return -ENOMEM; + } +#endif + return 0; +} + +void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); +} + +int jffs2_ubivol_setup(struct jffs2_sb_info *c) { + c->cleanmarker_size = 0; + + if (c->mtd->writesize == 1) + /* We do not need write-buffer */ + return 0; + + init_rwsem(&c->wbuf_sem); + INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); + + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + + pr_info("write-buffering enabled buffer (%d) erasesize (%d)\n", + c->wbuf_pagesize, c->sector_size); + + return 0; +} + +void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) { + kfree(c->wbuf); +} diff --git a/kernel/fs/jffs2/write.c b/kernel/fs/jffs2/write.c new file mode 100644 index 000000000..b634de4c8 --- /dev/null +++ b/kernel/fs/jffs2/write.c @@ -0,0 +1,722 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + + +int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t mode, struct jffs2_raw_inode *ri) +{ + struct jffs2_inode_cache *ic; + + ic = jffs2_alloc_inode_cache(); + if (!ic) { + return -ENOMEM; + } + + memset(ic, 0, sizeof(*ic)); + + f->inocache = ic; + f->inocache->pino_nlink = 1; /* Will be overwritten shortly for directories */ + f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; + f->inocache->state = INO_STATE_PRESENT; + + jffs2_add_ino_cache(c, f->inocache); + jffs2_dbg(1, "%s(): Assigned ino# %d\n", __func__, f->inocache->ino); + ri->ino = cpu_to_je32(f->inocache->ino); + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(PAD(sizeof(*ri))); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + ri->mode = cpu_to_jemode(mode); + + f->highest_version = 1; + ri->version = cpu_to_je32(f->highest_version); + + return 0; +} + +/* jffs2_write_dnode - given a raw_inode, allocate a full_dnode for it, + write it to the flash, link it into the existing inode/fragment list */ + +struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const unsigned char *data, + uint32_t datalen, int alloc_mode) + +{ + struct jffs2_full_dnode *fn; + size_t retlen; + uint32_t flash_ofs; + struct kvec vecs[2]; + int ret; + int retried = 0; + unsigned long cnt = 2; + + D1(if(je32_to_cpu(ri->hdr_crc) != crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)) { + pr_crit("Eep. CRC not correct in jffs2_write_dnode()\n"); + BUG(); + } + ); + vecs[0].iov_base = ri; + vecs[0].iov_len = sizeof(*ri); + vecs[1].iov_base = (unsigned char *)data; + vecs[1].iov_len = datalen; + + if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) { + pr_warn("%s(): ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", + __func__, je32_to_cpu(ri->totlen), + sizeof(*ri), datalen); + } + + fn = jffs2_alloc_full_dnode(); + if (!fn) + return ERR_PTR(-ENOMEM); + + /* check number of valid vecs */ + if (!datalen || !data) + cnt = 1; + retry: + flash_ofs = write_ofs(c); + + jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len); + + if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) { + BUG_ON(!retried); + jffs2_dbg(1, "%s(): dnode_version %d, highest version %d -> updating dnode\n", + __func__, + je32_to_cpu(ri->version), f->highest_version); + ri->version = cpu_to_je32(++f->highest_version); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + } + + ret = jffs2_flash_writev(c, vecs, cnt, flash_ofs, &retlen, + (alloc_mode==ALLOC_GC)?0:f->inocache->ino); + + if (ret || (retlen != sizeof(*ri) + datalen)) { + pr_notice("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n", + sizeof(*ri) + datalen, flash_ofs, ret, retlen); + + /* Mark the space as dirtied */ + if (retlen) { + /* Don't change raw->size to match retlen. We may have + written the node header already, and only the data will + seem corrupted, in which case the scan would skip over + any node we write before the original intended end of + this node */ + jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL); + } else { + pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", + flash_ofs); + } + if (!retried && alloc_mode != ALLOC_NORETRY) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size]; + + retried = 1; + + jffs2_dbg(1, "Retrying failed write.\n"); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + if (alloc_mode == ALLOC_GC) { + ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &dummy, + JFFS2_SUMMARY_INODE_SIZE); + } else { + /* Locking pain */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &dummy, + alloc_mode, JFFS2_SUMMARY_INODE_SIZE); + mutex_lock(&f->sem); + } + + if (!ret) { + flash_ofs = write_ofs(c); + jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write.\n", + flash_ofs); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + goto retry; + } + jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n", + ret); + } + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dnode(fn); + return ERR_PTR(ret?ret:-EIO); + } + /* Mark the space used */ + /* If node covers at least a whole page, or if it starts at the + beginning of a page and runs to the end of the file, or if + it's a hole node, mark it REF_PRISTINE, else REF_NORMAL. + */ + if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) || + ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) && + (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) == je32_to_cpu(ri->isize)))) { + flash_ofs |= REF_PRISTINE; + } else { + flash_ofs |= REF_NORMAL; + } + fn->raw = jffs2_add_physical_node_ref(c, flash_ofs, PAD(sizeof(*ri)+datalen), f->inocache); + if (IS_ERR(fn->raw)) { + void *hold_err = fn->raw; + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dnode(fn); + return ERR_CAST(hold_err); + } + fn->ofs = je32_to_cpu(ri->offset); + fn->size = je32_to_cpu(ri->dsize); + fn->frags = 0; + + jffs2_dbg(1, "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n", + flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize), + je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc), + je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen)); + + if (retried) { + jffs2_dbg_acct_sanity_check(c,NULL); + } + + return fn; +} + +struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_dirent *rd, const unsigned char *name, + uint32_t namelen, int alloc_mode) +{ + struct jffs2_full_dirent *fd; + size_t retlen; + struct kvec vecs[2]; + uint32_t flash_ofs; + int retried = 0; + int ret; + + jffs2_dbg(1, "%s(ino #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x)\n", + __func__, + je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), + je32_to_cpu(rd->name_crc)); + + D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) { + pr_crit("Eep. CRC not correct in jffs2_write_dirent()\n"); + BUG(); + }); + + if (strnlen(name, namelen) != namelen) { + /* This should never happen, but seems to have done on at least one + occasion: https://dev.laptop.org/ticket/4184 */ + pr_crit("Error in jffs2_write_dirent() -- name contains zero bytes!\n"); + pr_crit("Directory inode #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x\n", + je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), + je32_to_cpu(rd->name_crc)); + WARN_ON(1); + return ERR_PTR(-EIO); + } + + vecs[0].iov_base = rd; + vecs[0].iov_len = sizeof(*rd); + vecs[1].iov_base = (unsigned char *)name; + vecs[1].iov_len = namelen; + + fd = jffs2_alloc_full_dirent(namelen+1); + if (!fd) + return ERR_PTR(-ENOMEM); + + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->nhash = full_name_hash(name, namelen); + fd->type = rd->type; + memcpy(fd->name, name, namelen); + fd->name[namelen]=0; + + retry: + flash_ofs = write_ofs(c); + + jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len); + + if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) { + BUG_ON(!retried); + jffs2_dbg(1, "%s(): dirent_version %d, highest version %d -> updating dirent\n", + __func__, + je32_to_cpu(rd->version), f->highest_version); + rd->version = cpu_to_je32(++f->highest_version); + fd->version = je32_to_cpu(rd->version); + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + } + + ret = jffs2_flash_writev(c, vecs, 2, flash_ofs, &retlen, + (alloc_mode==ALLOC_GC)?0:je32_to_cpu(rd->pino)); + if (ret || (retlen != sizeof(*rd) + namelen)) { + pr_notice("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n", + sizeof(*rd) + namelen, flash_ofs, ret, retlen); + /* Mark the space as dirtied */ + if (retlen) { + jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL); + } else { + pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", + flash_ofs); + } + if (!retried) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size]; + + retried = 1; + + jffs2_dbg(1, "Retrying failed write.\n"); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + if (alloc_mode == ALLOC_GC) { + ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &dummy, + JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + } else { + /* Locking pain */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &dummy, + alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + mutex_lock(&f->sem); + } + + if (!ret) { + flash_ofs = write_ofs(c); + jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write\n", + flash_ofs); + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + goto retry; + } + jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n", + ret); + } + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dirent(fd); + return ERR_PTR(ret?ret:-EIO); + } + /* Mark the space used */ + fd->raw = jffs2_add_physical_node_ref(c, flash_ofs | dirent_node_state(rd), + PAD(sizeof(*rd)+namelen), f->inocache); + if (IS_ERR(fd->raw)) { + void *hold_err = fd->raw; + /* Release the full_dirent which is now useless, and return */ + jffs2_free_full_dirent(fd); + return ERR_CAST(hold_err); + } + + if (retried) { + jffs2_dbg_acct_sanity_check(c,NULL); + } + + return fd; +} + +/* The OS-specific code fills in the metadata in the jffs2_raw_inode for us, so that + we don't have to go digging in struct inode or its equivalent. It should set: + mode, uid, gid, (starting)isize, atime, ctime, mtime */ +int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, unsigned char *buf, + uint32_t offset, uint32_t writelen, uint32_t *retlen) +{ + int ret = 0; + uint32_t writtenlen = 0; + + jffs2_dbg(1, "%s(): Ino #%u, ofs 0x%x, len 0x%x\n", + __func__, f->inocache->ino, offset, writelen); + + while(writelen) { + struct jffs2_full_dnode *fn; + unsigned char *comprbuf = NULL; + uint16_t comprtype = JFFS2_COMPR_NONE; + uint32_t alloclen; + uint32_t datalen, cdatalen; + int retried = 0; + + retry: + jffs2_dbg(2, "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", + writelen, offset); + + ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN, + &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + jffs2_dbg(1, "jffs2_reserve_space returned %d\n", ret); + break; + } + mutex_lock(&f->sem); + datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1))); + cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen); + + comprtype = jffs2_compress(c, f, buf, &comprbuf, &datalen, &cdatalen); + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(sizeof(*ri) + cdatalen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->ino = cpu_to_je32(f->inocache->ino); + ri->version = cpu_to_je32(++f->highest_version); + ri->isize = cpu_to_je32(max(je32_to_cpu(ri->isize), offset + datalen)); + ri->offset = cpu_to_je32(offset); + ri->csize = cpu_to_je32(cdatalen); + ri->dsize = cpu_to_je32(datalen); + ri->compr = comprtype & 0xff; + ri->usercompr = (comprtype >> 8 ) & 0xff; + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + ri->data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen)); + + fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, ALLOC_NORETRY); + + jffs2_free_comprbuf(comprbuf, buf); + + if (IS_ERR(fn)) { + ret = PTR_ERR(fn); + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + if (!retried) { + /* Write error to be retried */ + retried = 1; + jffs2_dbg(1, "Retrying node write in jffs2_write_inode_range()\n"); + goto retry; + } + break; + } + ret = jffs2_add_full_dnode_to_inode(c, f, fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + if (ret) { + /* Eep */ + jffs2_dbg(1, "Eep. add_full_dnode_to_inode() failed in commit_write, returned %d\n", + ret); + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + break; + } + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + if (!datalen) { + pr_warn("Eep. We didn't actually write any data in jffs2_write_inode_range()\n"); + ret = -EIO; + break; + } + jffs2_dbg(1, "increasing writtenlen by %d\n", datalen); + writtenlen += datalen; + offset += datalen; + writelen -= datalen; + buf += datalen; + } + *retlen = writtenlen; + return ret; +} + +int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, + struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, + const struct qstr *qstr) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + /* Try to reserve enough space for both node and dirent. + * Just the node will do for now, though + */ + ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL, + JFFS2_SUMMARY_INODE_SIZE); + jffs2_dbg(1, "%s(): reserved 0x%x bytes\n", __func__, alloclen); + if (ret) + return ret; + + mutex_lock(&f->sem); + + ri->data_crc = cpu_to_je32(0); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL); + + jffs2_dbg(1, "jffs2_do_create created file with mode 0x%x\n", + jemode_to_cpu(ri->mode)); + + if (IS_ERR(fn)) { + jffs2_dbg(1, "jffs2_write_dnode() failed\n"); + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + return PTR_ERR(fn); + } + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr); + if (ret) + return ret; + ret = jffs2_init_acl_post(&f->vfs_inode); + if (ret) + return ret; + + ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len)); + + if (ret) { + /* Eep. */ + jffs2_dbg(1, "jffs2_reserve_space() for dirent failed\n"); + return ret; + } + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + return -ENOMEM; + } + + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = ri->ino; + rd->mctime = ri->ctime; + rd->nsize = qstr->len; + rd->type = DT_REG; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len)); + + fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + + return 0; +} + + +int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, + const char *name, int namelen, struct jffs2_inode_info *dead_f, + uint32_t time) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + if (!jffs2_can_mark_obsolete(c)) { + /* We can't mark stuff obsolete on the medium. We need to write a deletion dirent */ + + rd = jffs2_alloc_raw_dirent(); + if (!rd) + return -ENOMEM; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_DELETION, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) { + jffs2_free_raw_dirent(rd); + return ret; + } + + mutex_lock(&dir_f->sem); + + /* Build a deletion node */ + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(0); + rd->mctime = cpu_to_je32(time); + rd->nsize = namelen; + rd->type = DT_UNKNOWN; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_DELETION); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* File it. This will mark the old one obsolete. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + mutex_unlock(&dir_f->sem); + } else { + uint32_t nhash = full_name_hash(name, namelen); + + fd = dir_f->dents; + /* We don't actually want to reserve any space, but we do + want to be holding the alloc_sem when we write to flash */ + mutex_lock(&c->alloc_sem); + mutex_lock(&dir_f->sem); + + for (fd = dir_f->dents; fd; fd = fd->next) { + if (fd->nhash == nhash && + !memcmp(fd->name, name, namelen) && + !fd->name[namelen]) { + + jffs2_dbg(1, "Marking old dirent node (ino #%u) @%08x obsolete\n", + fd->ino, ref_offset(fd->raw)); + jffs2_mark_node_obsolete(c, fd->raw); + /* We don't want to remove it from the list immediately, + because that screws up getdents()/seek() semantics even + more than they're screwed already. Turn it into a + node-less deletion dirent instead -- a placeholder */ + fd->raw = NULL; + fd->ino = 0; + break; + } + } + mutex_unlock(&dir_f->sem); + } + + /* dead_f is NULL if this was a rename not a real unlink */ + /* Also catch the !f->inocache case, where there was a dirent + pointing to an inode which didn't exist. */ + if (dead_f && dead_f->inocache) { + + mutex_lock(&dead_f->sem); + + if (S_ISDIR(OFNI_EDONI_2SFFJ(dead_f)->i_mode)) { + while (dead_f->dents) { + /* There can be only deleted ones */ + fd = dead_f->dents; + + dead_f->dents = fd->next; + + if (fd->ino) { + pr_warn("Deleting inode #%u with active dentry \"%s\"->ino #%u\n", + dead_f->inocache->ino, + fd->name, fd->ino); + } else { + jffs2_dbg(1, "Removing deletion dirent for \"%s\" from dir ino #%u\n", + fd->name, + dead_f->inocache->ino); + } + if (fd->raw) + jffs2_mark_node_obsolete(c, fd->raw); + jffs2_free_full_dirent(fd); + } + dead_f->inocache->pino_nlink = 0; + } else + dead_f->inocache->pino_nlink--; + /* NB: Caller must set inode nlink if appropriate */ + mutex_unlock(&dead_f->sem); + } + + jffs2_complete_reservation(c); + + return 0; +} + + +int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, uint8_t type, const char *name, int namelen, uint32_t time) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) + return -ENOMEM; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) { + jffs2_free_raw_dirent(rd); + return ret; + } + + mutex_lock(&dir_f->sem); + + /* Build a deletion node */ + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(ino); + rd->mctime = cpu_to_je32(time); + rd->nsize = namelen; + + rd->type = type; + + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* File it. This will mark the old one obsolete. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + + return 0; +} diff --git a/kernel/fs/jffs2/writev.c b/kernel/fs/jffs2/writev.c new file mode 100644 index 000000000..a1bda9dab --- /dev/null +++ b/kernel/fs/jffs2/writev.c @@ -0,0 +1,51 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include "nodelist.h" + +int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + if (!jffs2_is_writebuffered(c)) { + if (jffs2_sum_active()) { + int res; + res = jffs2_sum_add_kvec(c, vecs, count, (uint32_t) to); + if (res) { + return res; + } + } + } + + return mtd_writev(c->mtd, vecs, count, to, retlen); +} + +int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf) +{ + int ret; + ret = mtd_write(c->mtd, ofs, len, retlen, buf); + + if (jffs2_sum_active()) { + struct kvec vecs[1]; + int res; + + vecs[0].iov_base = (unsigned char *) buf; + vecs[0].iov_len = len; + + res = jffs2_sum_add_kvec(c, vecs, 1, (uint32_t) ofs); + if (res) { + return res; + } + } + return ret; +} diff --git a/kernel/fs/jffs2/xattr.c b/kernel/fs/jffs2/xattr.c new file mode 100644 index 000000000..f092fee5b --- /dev/null +++ b/kernel/fs/jffs2/xattr.c @@ -0,0 +1,1340 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define JFFS2_XATTR_IS_CORRUPTED 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +/* -------- xdatum related functions ---------------- + * xattr_datum_hashkey(xprefix, xname, xvalue, xsize) + * is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is + * the index of the xattr name/value pair cache (c->xattrindex). + * is_xattr_datum_unchecked(c, xd) + * returns 1, if xdatum contains any unchecked raw nodes. if all raw nodes are not + * unchecked, it returns 0. + * unload_xattr_datum(c, xd) + * is used to release xattr name/value pair and detach from c->xattrindex. + * reclaim_xattr_datum(c) + * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when + * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold + * is hard coded as 32KiB. + * do_verify_xattr_datum(c, xd) + * is used to load the xdatum informations without name/value pair from the medium. + * It's necessary once, because those informations are not collected during mounting + * process when EBS is enabled. + * 0 will be returned, if success. An negative return value means recoverable error, and + * positive return value means unrecoverable error. Thus, caller must remove this xdatum + * and xref when it returned positive value. + * do_load_xattr_datum(c, xd) + * is used to load name/value pair from the medium. + * The meanings of return value is same as do_verify_xattr_datum(). + * load_xattr_datum(c, xd) + * is used to be as a wrapper of do_verify_xattr_datum() and do_load_xattr_datum(). + * If xd need to call do_verify_xattr_datum() at first, it's called before calling + * do_load_xattr_datum(). The meanings of return value is same as do_verify_xattr_datum(). + * save_xattr_datum(c, xd) + * is used to write xdatum to medium. xd->version will be incremented. + * create_xattr_datum(c, xprefix, xname, xvalue, xsize) + * is used to create new xdatum and write to medium. + * unrefer_xattr_datum(c, xd) + * is used to delete a xdatum. When nobody refers this xdatum, JFFS2_XFLAGS_DEAD + * is set on xd->flags and chained xattr_dead_list or release it immediately. + * In the first case, the garbage collector release it later. + * -------------------------------------------------- */ +static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize) +{ + int name_len = strlen(xname); + + return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize); +} + +static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + struct jffs2_raw_node_ref *raw; + int rc = 0; + + spin_lock(&c->erase_completion_lock); + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + if (ref_flags(raw) == REF_UNCHECKED) { + rc = 1; + break; + } + } + spin_unlock(&c->erase_completion_lock); + return rc; +} + +static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + D1(dbg_xattr("%s: xid=%u, version=%u\n", __func__, xd->xid, xd->version)); + if (xd->xname) { + c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len); + kfree(xd->xname); + } + + list_del_init(&xd->xindex); + xd->hashkey = 0; + xd->xname = NULL; + xd->xvalue = NULL; +} + +static void reclaim_xattr_datum(struct jffs2_sb_info *c) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd, *_xd; + uint32_t target, before; + static int index = 0; + int count; + + if (c->xdatum_mem_threshold > c->xdatum_mem_usage) + return; + + before = c->xdatum_mem_usage; + target = c->xdatum_mem_usage * 4 / 5; /* 20% reduction */ + for (count = 0; count < XATTRINDEX_HASHSIZE; count++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[index], xindex) { + if (xd->flags & JFFS2_XFLAGS_HOT) { + xd->flags &= ~JFFS2_XFLAGS_HOT; + } else if (!(xd->flags & JFFS2_XFLAGS_BIND)) { + unload_xattr_datum(c, xd); + } + if (c->xdatum_mem_usage <= target) + goto out; + } + index = (index+1) % XATTRINDEX_HASHSIZE; + } + out: + JFFS2_NOTICE("xdatum_mem_usage from %u byte to %u byte (%u byte reclaimed)\n", + before, c->xdatum_mem_usage, before - c->xdatum_mem_usage); +} + +static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + struct jffs2_raw_xattr rx; + size_t readlen; + uint32_t crc, offset, totlen; + int rc; + + spin_lock(&c->erase_completion_lock); + offset = ref_offset(xd->node); + if (ref_flags(xd->node) == REF_PRISTINE) + goto complete; + spin_unlock(&c->erase_completion_lock); + + rc = jffs2_flash_read(c, offset, sizeof(rx), &readlen, (char *)&rx); + if (rc || readlen != sizeof(rx)) { + JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n", + rc, sizeof(rx), readlen, offset); + return rc ? rc : -EIO; + } + crc = crc32(0, &rx, sizeof(rx) - 4); + if (crc != je32_to_cpu(rx.node_crc)) { + JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + offset, je32_to_cpu(rx.hdr_crc), crc); + xd->flags |= JFFS2_XFLAGS_INVALID; + return JFFS2_XATTR_IS_CORRUPTED; + } + totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); + if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK + || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR + || je32_to_cpu(rx.totlen) != totlen + || je32_to_cpu(rx.xid) != xd->xid + || je32_to_cpu(rx.version) != xd->version) { + JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, " + "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n", + offset, je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK, + je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR, + je32_to_cpu(rx.totlen), totlen, + je32_to_cpu(rx.xid), xd->xid, + je32_to_cpu(rx.version), xd->version); + xd->flags |= JFFS2_XFLAGS_INVALID; + return JFFS2_XATTR_IS_CORRUPTED; + } + xd->xprefix = rx.xprefix; + xd->name_len = rx.name_len; + xd->value_len = je16_to_cpu(rx.value_len); + xd->data_crc = je32_to_cpu(rx.data_crc); + + spin_lock(&c->erase_completion_lock); + complete: + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + if (ref_flags(raw) == REF_UNCHECKED) { + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + } + raw->flash_offset = ref_offset(raw) | ((xd->node==raw) ? REF_PRISTINE : REF_NORMAL); + } + spin_unlock(&c->erase_completion_lock); + + /* unchecked xdatum is chained with c->xattr_unchecked */ + list_del_init(&xd->xindex); + + dbg_xattr("success on verifying xdatum (xid=%u, version=%u)\n", + xd->xid, xd->version); + + return 0; +} + +static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + char *data; + size_t readlen; + uint32_t crc, length; + int i, ret, retry = 0; + + BUG_ON(ref_flags(xd->node) != REF_PRISTINE); + BUG_ON(!list_empty(&xd->xindex)); + retry: + length = xd->name_len + 1 + xd->value_len; + data = kmalloc(length, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(xd->node)+sizeof(struct jffs2_raw_xattr), + length, &readlen, data); + + if (ret || length!=readlen) { + JFFS2_WARNING("jffs2_flash_read() returned %d, request=%d, readlen=%zu, at %#08x\n", + ret, length, readlen, ref_offset(xd->node)); + kfree(data); + return ret ? ret : -EIO; + } + + data[xd->name_len] = '\0'; + crc = crc32(0, data, length); + if (crc != xd->data_crc) { + JFFS2_WARNING("node CRC failed (JFFS2_NODETYPE_XATTR)" + " at %#08x, read: 0x%08x calculated: 0x%08x\n", + ref_offset(xd->node), xd->data_crc, crc); + kfree(data); + xd->flags |= JFFS2_XFLAGS_INVALID; + return JFFS2_XATTR_IS_CORRUPTED; + } + + xd->flags |= JFFS2_XFLAGS_HOT; + xd->xname = data; + xd->xvalue = data + xd->name_len+1; + + c->xdatum_mem_usage += length; + + xd->hashkey = xattr_datum_hashkey(xd->xprefix, xd->xname, xd->xvalue, xd->value_len); + i = xd->hashkey % XATTRINDEX_HASHSIZE; + list_add(&xd->xindex, &c->xattrindex[i]); + if (!retry) { + retry = 1; + reclaim_xattr_datum(c); + if (!xd->xname) + goto retry; + } + + dbg_xattr("success on loading xdatum (xid=%u, xprefix=%u, xname='%s')\n", + xd->xid, xd->xprefix, xd->xname); + + return 0; +} + +static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem); + * rc < 0 : recoverable error, try again + * rc = 0 : success + * rc > 0 : Unrecoverable error, this node should be deleted. + */ + int rc = 0; + + BUG_ON(xd->flags & JFFS2_XFLAGS_DEAD); + if (xd->xname) + return 0; + if (xd->flags & JFFS2_XFLAGS_INVALID) + return JFFS2_XATTR_IS_CORRUPTED; + if (unlikely(is_xattr_datum_unchecked(c, xd))) + rc = do_verify_xattr_datum(c, xd); + if (!rc) + rc = do_load_xattr_datum(c, xd); + return rc; +} + +static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_raw_xattr rx; + struct kvec vecs[2]; + size_t length; + int rc, totlen; + uint32_t phys_ofs = write_ofs(c); + + BUG_ON(!xd->xname); + BUG_ON(xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID)); + + vecs[0].iov_base = ℞ + vecs[0].iov_len = sizeof(rx); + vecs[1].iov_base = xd->xname; + vecs[1].iov_len = xd->name_len + 1 + xd->value_len; + totlen = vecs[0].iov_len + vecs[1].iov_len; + + /* Setup raw-xattr */ + memset(&rx, 0, sizeof(rx)); + rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR); + rx.totlen = cpu_to_je32(PAD(totlen)); + rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4)); + + rx.xid = cpu_to_je32(xd->xid); + rx.version = cpu_to_je32(++xd->version); + rx.xprefix = xd->xprefix; + rx.name_len = xd->name_len; + rx.value_len = cpu_to_je16(xd->value_len); + rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len)); + rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4)); + + rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0); + if (rc || totlen != length) { + JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n", + rc, totlen, length, phys_ofs); + rc = rc ? rc : -EIO; + if (length) + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(totlen), NULL); + + return rc; + } + /* success */ + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), (void *)xd); + + dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n", + xd->xid, xd->version, xd->xprefix, xd->xname); + + return 0; +} + +static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c, + int xprefix, const char *xname, + const char *xvalue, int xsize) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd; + uint32_t hashkey, name_len; + char *data; + int i, rc; + + /* Search xattr_datum has same xname/xvalue by index */ + hashkey = xattr_datum_hashkey(xprefix, xname, xvalue, xsize); + i = hashkey % XATTRINDEX_HASHSIZE; + list_for_each_entry(xd, &c->xattrindex[i], xindex) { + if (xd->hashkey==hashkey + && xd->xprefix==xprefix + && xd->value_len==xsize + && !strcmp(xd->xname, xname) + && !memcmp(xd->xvalue, xvalue, xsize)) { + atomic_inc(&xd->refcnt); + return xd; + } + } + + /* Not found, Create NEW XATTR-Cache */ + name_len = strlen(xname); + + xd = jffs2_alloc_xattr_datum(); + if (!xd) + return ERR_PTR(-ENOMEM); + + data = kmalloc(name_len + 1 + xsize, GFP_KERNEL); + if (!data) { + jffs2_free_xattr_datum(xd); + return ERR_PTR(-ENOMEM); + } + strcpy(data, xname); + memcpy(data + name_len + 1, xvalue, xsize); + + atomic_set(&xd->refcnt, 1); + xd->xid = ++c->highest_xid; + xd->flags |= JFFS2_XFLAGS_HOT; + xd->xprefix = xprefix; + + xd->hashkey = hashkey; + xd->xname = data; + xd->xvalue = data + name_len + 1; + xd->name_len = name_len; + xd->value_len = xsize; + xd->data_crc = crc32(0, data, xd->name_len + 1 + xd->value_len); + + rc = save_xattr_datum(c, xd); + if (rc) { + kfree(xd->xname); + jffs2_free_xattr_datum(xd); + return ERR_PTR(rc); + } + + /* Insert Hash Index */ + i = hashkey % XATTRINDEX_HASHSIZE; + list_add(&xd->xindex, &c->xattrindex[i]); + + c->xdatum_mem_usage += (xd->name_len + 1 + xd->value_len); + reclaim_xattr_datum(c); + + return xd; +} + +static void unrefer_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + if (atomic_dec_and_lock(&xd->refcnt, &c->erase_completion_lock)) { + unload_xattr_datum(c, xd); + xd->flags |= JFFS2_XFLAGS_DEAD; + if (xd->node == (void *)xd) { + BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID)); + jffs2_free_xattr_datum(xd); + } else { + list_add(&xd->xindex, &c->xattr_dead_list); + } + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", + xd->xid, xd->version); + } +} + +/* -------- xref related functions ------------------ + * verify_xattr_ref(c, ref) + * is used to load xref information from medium. Because summary data does not + * contain xid/ino, it's necessary to verify once while mounting process. + * save_xattr_ref(c, ref) + * is used to write xref to medium. If delete marker is marked, it write + * a delete marker of xref into medium. + * create_xattr_ref(c, ic, xd) + * is used to create a new xref and write to medium. + * delete_xattr_ref(c, ref) + * is used to delete jffs2_xattr_ref. It marks xref XREF_DELETE_MARKER, + * and allows GC to reclaim those physical nodes. + * jffs2_xattr_delete_inode(c, ic) + * is called to remove xrefs related to obsolete inode when inode is unlinked. + * jffs2_xattr_free_inode(c, ic) + * is called to release xattr related objects when unmounting. + * check_xattr_ref_inode(c, ic) + * is used to confirm inode does not have duplicate xattr name/value pair. + * jffs2_xattr_do_crccheck_inode(c, ic) + * is used to force xattr data integrity check during the initial gc scan. + * -------------------------------------------------- */ +static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + struct jffs2_raw_xref rr; + size_t readlen; + uint32_t crc, offset, totlen; + int rc; + + spin_lock(&c->erase_completion_lock); + if (ref_flags(ref->node) != REF_UNCHECKED) + goto complete; + offset = ref_offset(ref->node); + spin_unlock(&c->erase_completion_lock); + + rc = jffs2_flash_read(c, offset, sizeof(rr), &readlen, (char *)&rr); + if (rc || sizeof(rr) != readlen) { + JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n", + rc, sizeof(rr), readlen, offset); + return rc ? rc : -EIO; + } + /* obsolete node */ + crc = crc32(0, &rr, sizeof(rr) - 4); + if (crc != je32_to_cpu(rr.node_crc)) { + JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + offset, je32_to_cpu(rr.node_crc), crc); + return JFFS2_XATTR_IS_CORRUPTED; + } + if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK + || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF + || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) { + JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, " + "nodetype=%#04x/%#04x, totlen=%u/%zu\n", + offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, + je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, + je32_to_cpu(rr.totlen), PAD(sizeof(rr))); + return JFFS2_XATTR_IS_CORRUPTED; + } + ref->ino = je32_to_cpu(rr.ino); + ref->xid = je32_to_cpu(rr.xid); + ref->xseqno = je32_to_cpu(rr.xseqno); + if (ref->xseqno > c->highest_xseqno) + c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); + + spin_lock(&c->erase_completion_lock); + complete: + for (raw=ref->node; raw != (void *)ref; raw=raw->next_in_ino) { + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + if (ref_flags(raw) == REF_UNCHECKED) { + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + } + raw->flash_offset = ref_offset(raw) | ((ref->node==raw) ? REF_PRISTINE : REF_NORMAL); + } + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n", + ref->ino, ref->xid, ref_offset(ref->node)); + return 0; +} + +static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_raw_xref rr; + size_t length; + uint32_t xseqno, phys_ofs = write_ofs(c); + int ret; + + rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF); + rr.totlen = cpu_to_je32(PAD(sizeof(rr))); + rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4)); + + xseqno = (c->highest_xseqno += 2); + if (is_xattr_ref_dead(ref)) { + xseqno |= XREF_DELETE_MARKER; + rr.ino = cpu_to_je32(ref->ino); + rr.xid = cpu_to_je32(ref->xid); + } else { + rr.ino = cpu_to_je32(ref->ic->ino); + rr.xid = cpu_to_je32(ref->xd->xid); + } + rr.xseqno = cpu_to_je32(xseqno); + rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4)); + + ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr); + if (ret || sizeof(rr) != length) { + JFFS2_WARNING("jffs2_flash_write() returned %d, request=%zu, retlen=%zu, at %#08x\n", + ret, sizeof(rr), length, phys_ofs); + ret = ret ? ret : -EIO; + if (length) + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(sizeof(rr)), NULL); + + return ret; + } + /* success */ + ref->xseqno = xseqno; + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), (void *)ref); + + dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid); + + return 0; +} + +static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, + struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_ref *ref; + int ret; + + ref = jffs2_alloc_xattr_ref(); + if (!ref) + return ERR_PTR(-ENOMEM); + ref->ic = ic; + ref->xd = xd; + + ret = save_xattr_ref(c, ref); + if (ret) { + jffs2_free_xattr_ref(ref); + return ERR_PTR(ret); + } + + /* Chain to inode */ + ref->next = ic->xref; + ic->xref = ref; + + return ref; /* success */ +} + +static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd; + + xd = ref->xd; + ref->xseqno |= XREF_DELETE_MARKER; + ref->ino = ref->ic->ino; + ref->xid = ref->xd->xid; + spin_lock(&c->erase_completion_lock); + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n", + ref->ino, ref->xid, ref->xseqno); + + unrefer_xattr_datum(c, xd); +} + +void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* It's called from jffs2_evict_inode() on inode removing. + When an inode with XATTR is removed, those XATTRs must be removed. */ + struct jffs2_xattr_ref *ref, *_ref; + + if (!ic || ic->pino_nlink > 0) + return; + + down_write(&c->xattr_sem); + for (ref = ic->xref; ref; ref = _ref) { + _ref = ref->next; + delete_xattr_ref(c, ref); + } + ic->xref = NULL; + up_write(&c->xattr_sem); +} + +void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* It's called from jffs2_free_ino_caches() until unmounting FS. */ + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, *_ref; + + down_write(&c->xattr_sem); + for (ref = ic->xref; ref; ref = _ref) { + _ref = ref->next; + xd = ref->xd; + if (atomic_dec_and_test(&xd->refcnt)) { + unload_xattr_datum(c, xd); + jffs2_free_xattr_datum(xd); + } + jffs2_free_xattr_ref(ref); + } + ic->xref = NULL; + up_write(&c->xattr_sem); +} + +static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* success of check_xattr_ref_inode() means that inode (ic) dose not have + * duplicate name/value pairs. If duplicate name/value pair would be found, + * one will be removed. + */ + struct jffs2_xattr_ref *ref, *cmp, **pref, **pcmp; + int rc = 0; + + if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED)) + return 0; + down_write(&c->xattr_sem); + retry: + rc = 0; + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + if (!ref->xd->xname) { + rc = load_xattr_datum(c, ref->xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + for (cmp=ref->next, pcmp=&ref->next; cmp; pcmp=&cmp->next, cmp=cmp->next) { + if (!cmp->xd->xname) { + ref->xd->flags |= JFFS2_XFLAGS_BIND; + rc = load_xattr_datum(c, cmp->xd); + ref->xd->flags &= ~JFFS2_XFLAGS_BIND; + if (unlikely(rc > 0)) { + *pcmp = cmp->next; + delete_xattr_ref(c, cmp); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + if (ref->xd->xprefix == cmp->xd->xprefix + && !strcmp(ref->xd->xname, cmp->xd->xname)) { + if (ref->xseqno > cmp->xseqno) { + *pcmp = cmp->next; + delete_xattr_ref(c, cmp); + } else { + *pref = ref->next; + delete_xattr_ref(c, ref); + } + goto retry; + } + } + } + ic->flags |= INO_FLAGS_XATTR_CHECKED; + out: + up_write(&c->xattr_sem); + + return rc; +} + +void jffs2_xattr_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + check_xattr_ref_inode(c, ic); +} + +/* -------- xattr subsystem functions --------------- + * jffs2_init_xattr_subsystem(c) + * is used to initialize semaphore and list_head, and some variables. + * jffs2_find_xattr_datum(c, xid) + * is used to lookup xdatum while scanning process. + * jffs2_clear_xattr_subsystem(c) + * is used to release any xattr related objects. + * jffs2_build_xattr_subsystem(c) + * is used to associate xdatum and xref while super block building process. + * jffs2_setup_xattr_datum(c, xid, version) + * is used to insert xdatum while scanning process. + * -------------------------------------------------- */ +void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c) +{ + int i; + + for (i=0; i < XATTRINDEX_HASHSIZE; i++) + INIT_LIST_HEAD(&c->xattrindex[i]); + INIT_LIST_HEAD(&c->xattr_unchecked); + INIT_LIST_HEAD(&c->xattr_dead_list); + c->xref_dead_list = NULL; + c->xref_temp = NULL; + + init_rwsem(&c->xattr_sem); + c->highest_xid = 0; + c->highest_xseqno = 0; + c->xdatum_mem_usage = 0; + c->xdatum_mem_threshold = 32 * 1024; /* Default 32KB */ +} + +static struct jffs2_xattr_datum *jffs2_find_xattr_datum(struct jffs2_sb_info *c, uint32_t xid) +{ + struct jffs2_xattr_datum *xd; + int i = xid % XATTRINDEX_HASHSIZE; + + /* It's only used in scanning/building process. */ + BUG_ON(!(c->flags & (JFFS2_SB_FLAG_SCANNING|JFFS2_SB_FLAG_BUILDING))); + + list_for_each_entry(xd, &c->xattrindex[i], xindex) { + if (xd->xid==xid) + return xd; + } + return NULL; +} + +void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_xattr_ref *ref, *_ref; + int i; + + for (ref=c->xref_temp; ref; ref = _ref) { + _ref = ref->next; + jffs2_free_xattr_ref(ref); + } + + for (ref=c->xref_dead_list; ref; ref = _ref) { + _ref = ref->next; + jffs2_free_xattr_ref(ref); + } + + for (i=0; i < XATTRINDEX_HASHSIZE; i++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { + list_del(&xd->xindex); + kfree(xd->xname); + jffs2_free_xattr_datum(xd); + } + } + + list_for_each_entry_safe(xd, _xd, &c->xattr_dead_list, xindex) { + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); + } + list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) { + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); + } +} + +#define XREF_TMPHASH_SIZE (128) +void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_ref *ref, *_ref; + struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE]; + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref *raw; + int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0; + int xdatum_orphan_count = 0, xref_orphan_count = 0, xref_dead_count = 0; + + BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING)); + + /* Phase.1 : Merge same xref */ + for (i=0; i < XREF_TMPHASH_SIZE; i++) + xref_tmphash[i] = NULL; + for (ref=c->xref_temp; ref; ref=_ref) { + struct jffs2_xattr_ref *tmp; + + _ref = ref->next; + if (ref_flags(ref->node) != REF_PRISTINE) { + if (verify_xattr_ref(c, ref)) { + BUG_ON(ref->node->next_in_ino != (void *)ref); + ref->node->next_in_ino = NULL; + jffs2_mark_node_obsolete(c, ref->node); + jffs2_free_xattr_ref(ref); + continue; + } + } + + i = (ref->ino ^ ref->xid) % XREF_TMPHASH_SIZE; + for (tmp=xref_tmphash[i]; tmp; tmp=tmp->next) { + if (tmp->ino == ref->ino && tmp->xid == ref->xid) + break; + } + if (tmp) { + raw = ref->node; + if (ref->xseqno > tmp->xseqno) { + tmp->xseqno = ref->xseqno; + raw->next_in_ino = tmp->node; + tmp->node = raw; + } else { + raw->next_in_ino = tmp->node->next_in_ino; + tmp->node->next_in_ino = raw; + } + jffs2_free_xattr_ref(ref); + continue; + } else { + ref->next = xref_tmphash[i]; + xref_tmphash[i] = ref; + } + } + c->xref_temp = NULL; + + /* Phase.2 : Bind xref with inode_cache and xattr_datum */ + for (i=0; i < XREF_TMPHASH_SIZE; i++) { + for (ref=xref_tmphash[i]; ref; ref=_ref) { + xref_count++; + _ref = ref->next; + if (is_xattr_ref_dead(ref)) { + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + xref_dead_count++; + continue; + } + /* At this point, ref->xid and ref->ino contain XID and inode number. + ref->xd and ref->ic are not valid yet. */ + xd = jffs2_find_xattr_datum(c, ref->xid); + ic = jffs2_get_ino_cache(c, ref->ino); + if (!xd || !ic || !ic->pino_nlink) { + dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) is orphan.\n", + ref->ino, ref->xid, ref->xseqno); + ref->xseqno |= XREF_DELETE_MARKER; + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + xref_orphan_count++; + continue; + } + ref->xd = xd; + ref->ic = ic; + atomic_inc(&xd->refcnt); + ref->next = ic->xref; + ic->xref = ref; + } + } + + /* Phase.3 : Link unchecked xdatum to xattr_unchecked list */ + for (i=0; i < XATTRINDEX_HASHSIZE; i++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { + xdatum_count++; + list_del_init(&xd->xindex); + if (!atomic_read(&xd->refcnt)) { + dbg_xattr("xdatum(xid=%u, version=%u) is orphan.\n", + xd->xid, xd->version); + xd->flags |= JFFS2_XFLAGS_DEAD; + list_add(&xd->xindex, &c->xattr_unchecked); + xdatum_orphan_count++; + continue; + } + if (is_xattr_datum_unchecked(c, xd)) { + dbg_xattr("unchecked xdatum(xid=%u, version=%u)\n", + xd->xid, xd->version); + list_add(&xd->xindex, &c->xattr_unchecked); + xdatum_unchecked_count++; + } + } + } + /* build complete */ + JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum" + " (%u unchecked, %u orphan) and " + "%u of xref (%u dead, %u orphan) found.\n", + xdatum_count, xdatum_unchecked_count, xdatum_orphan_count, + xref_count, xref_dead_count, xref_orphan_count); +} + +struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, + uint32_t xid, uint32_t version) +{ + struct jffs2_xattr_datum *xd; + + xd = jffs2_find_xattr_datum(c, xid); + if (!xd) { + xd = jffs2_alloc_xattr_datum(); + if (!xd) + return ERR_PTR(-ENOMEM); + xd->xid = xid; + xd->version = version; + if (xd->xid > c->highest_xid) + c->highest_xid = xd->xid; + list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]); + } + return xd; +} + +/* -------- xattr subsystem functions --------------- + * xprefix_to_handler(xprefix) + * is used to translate xprefix into xattr_handler. + * jffs2_listxattr(dentry, buffer, size) + * is an implementation of listxattr handler on jffs2. + * do_jffs2_getxattr(inode, xprefix, xname, buffer, size) + * is an implementation of getxattr handler on jffs2. + * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) + * is an implementation of setxattr handler on jffs2. + * -------------------------------------------------- */ +const struct xattr_handler *jffs2_xattr_handlers[] = { + &jffs2_user_xattr_handler, +#ifdef CONFIG_JFFS2_FS_SECURITY + &jffs2_security_xattr_handler, +#endif +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &jffs2_trusted_xattr_handler, + NULL +}; + +static const struct xattr_handler *xprefix_to_handler(int xprefix) { + const struct xattr_handler *ret; + + switch (xprefix) { + case JFFS2_XPREFIX_USER: + ret = &jffs2_user_xattr_handler; + break; +#ifdef CONFIG_JFFS2_FS_SECURITY + case JFFS2_XPREFIX_SECURITY: + ret = &jffs2_security_xattr_handler; + break; +#endif +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + case JFFS2_XPREFIX_ACL_ACCESS: + ret = &posix_acl_access_xattr_handler; + break; + case JFFS2_XPREFIX_ACL_DEFAULT: + ret = &posix_acl_default_xattr_handler; + break; +#endif + case JFFS2_XPREFIX_TRUSTED: + ret = &jffs2_trusted_xattr_handler; + break; + default: + ret = NULL; + break; + } + return ret; +} + +ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_ref *ref, **pref; + struct jffs2_xattr_datum *xd; + const struct xattr_handler *xhandle; + ssize_t len, rc; + int retry = 0; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + down_read(&c->xattr_sem); + retry: + len = 0; + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + BUG_ON(ref->ic != ic); + xd = ref->xd; + if (!xd->xname) { + /* xdatum is unchached */ + if (!retry) { + retry = 1; + up_read(&c->xattr_sem); + down_write(&c->xattr_sem); + goto retry; + } else { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + } + xhandle = xprefix_to_handler(xd->xprefix); + if (!xhandle) + continue; + if (buffer) { + rc = xhandle->list(dentry, buffer+len, size-len, + xd->xname, xd->name_len, xd->flags); + } else { + rc = xhandle->list(dentry, NULL, 0, xd->xname, + xd->name_len, xd->flags); + } + if (rc < 0) + goto out; + len += rc; + } + rc = len; + out: + if (!retry) { + up_read(&c->xattr_sem); + } else { + up_write(&c->xattr_sem); + } + return rc; +} + +int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname, + char *buffer, size_t size) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, **pref; + int rc, retry = 0; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + down_read(&c->xattr_sem); + retry: + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + BUG_ON(ref->ic!=ic); + + xd = ref->xd; + if (xd->xprefix != xprefix) + continue; + if (!xd->xname) { + /* xdatum is unchached */ + if (!retry) { + retry = 1; + up_read(&c->xattr_sem); + down_write(&c->xattr_sem); + goto retry; + } else { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) { + goto out; + } + } + } + if (!strcmp(xname, xd->xname)) { + rc = xd->value_len; + if (buffer) { + if (size < rc) { + rc = -ERANGE; + } else { + memcpy(buffer, xd->xvalue, rc); + } + } + goto out; + } + } + rc = -ENODATA; + out: + if (!retry) { + up_read(&c->xattr_sem); + } else { + up_write(&c->xattr_sem); + } + return rc; +} + +int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, + const char *buffer, size_t size, int flags) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, *newref, **pref; + uint32_t length, request; + int rc; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size); + rc = jffs2_reserve_space(c, request, &length, + ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request); + return rc; + } + + /* Find existing xattr */ + down_write(&c->xattr_sem); + retry: + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + xd = ref->xd; + if (xd->xprefix != xprefix) + continue; + if (!xd->xname) { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + if (!strcmp(xd->xname, xname)) { + if (flags & XATTR_CREATE) { + rc = -EEXIST; + goto out; + } + if (!buffer) { + ref->ino = ic->ino; + ref->xid = xd->xid; + ref->xseqno |= XREF_DELETE_MARKER; + rc = save_xattr_ref(c, ref); + if (!rc) { + *pref = ref->next; + spin_lock(&c->erase_completion_lock); + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + spin_unlock(&c->erase_completion_lock); + unrefer_xattr_datum(c, xd); + } else { + ref->ic = ic; + ref->xd = xd; + ref->xseqno &= ~XREF_DELETE_MARKER; + } + goto out; + } + goto found; + } + } + /* not found */ + if (flags & XATTR_REPLACE) { + rc = -ENODATA; + goto out; + } + if (!buffer) { + rc = -ENODATA; + goto out; + } + found: + xd = create_xattr_datum(c, xprefix, xname, buffer, size); + if (IS_ERR(xd)) { + rc = PTR_ERR(xd); + goto out; + } + up_write(&c->xattr_sem); + jffs2_complete_reservation(c); + + /* create xattr_ref */ + request = PAD(sizeof(struct jffs2_raw_xref)); + rc = jffs2_reserve_space(c, request, &length, + ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE); + down_write(&c->xattr_sem); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request); + unrefer_xattr_datum(c, xd); + up_write(&c->xattr_sem); + return rc; + } + if (ref) + *pref = ref->next; + newref = create_xattr_ref(c, ic, xd); + if (IS_ERR(newref)) { + if (ref) { + ref->next = ic->xref; + ic->xref = ref; + } + rc = PTR_ERR(newref); + unrefer_xattr_datum(c, xd); + } else if (ref) { + delete_xattr_ref(c, ref); + } + out: + up_write(&c->xattr_sem); + jffs2_complete_reservation(c); + return rc; +} + +/* -------- garbage collector functions ------------- + * jffs2_garbage_collect_xattr_datum(c, xd, raw) + * is used to move xdatum into new node. + * jffs2_garbage_collect_xattr_ref(c, ref, raw) + * is used to move xref into new node. + * jffs2_verify_xattr(c) + * is used to call do_verify_xattr_datum() before garbage collecting. + * jffs2_release_xattr_datum(c, xd) + * is used to release an in-memory object of xdatum. + * jffs2_release_xattr_ref(c, ref) + * is used to release an in-memory object of xref. + * -------------------------------------------------- */ +int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, + struct jffs2_raw_node_ref *raw) +{ + uint32_t totlen, length, old_ofs; + int rc = 0; + + down_write(&c->xattr_sem); + if (xd->node != raw) + goto out; + if (xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID)) + goto out; + + rc = load_xattr_datum(c, xd); + if (unlikely(rc)) { + rc = (rc > 0) ? 0 : rc; + goto out; + } + old_ofs = ref_offset(xd->node); + totlen = PAD(sizeof(struct jffs2_raw_xattr) + + xd->name_len + 1 + xd->value_len); + rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen); + goto out; + } + rc = save_xattr_datum(c, xd); + if (!rc) + dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n", + xd->xid, xd->version, old_ofs, ref_offset(xd->node)); + out: + if (!rc) + jffs2_mark_node_obsolete(c, raw); + up_write(&c->xattr_sem); + return rc; +} + +int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, + struct jffs2_raw_node_ref *raw) +{ + uint32_t totlen, length, old_ofs; + int rc = 0; + + down_write(&c->xattr_sem); + BUG_ON(!ref->node); + + if (ref->node != raw) + goto out; + if (is_xattr_ref_dead(ref) && (raw->next_in_ino == (void *)ref)) + goto out; + + old_ofs = ref_offset(ref->node); + totlen = ref_totlen(c, c->gcblock, ref->node); + + rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE); + if (rc) { + JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", + __func__, rc, totlen); + goto out; + } + rc = save_xattr_ref(c, ref); + if (!rc) + dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n", + ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node)); + out: + if (!rc) + jffs2_mark_node_obsolete(c, raw); + up_write(&c->xattr_sem); + return rc; +} + +int jffs2_verify_xattr(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + uint32_t totlen; + int rc; + + down_write(&c->xattr_sem); + list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) { + rc = do_verify_xattr_datum(c, xd); + if (rc < 0) + continue; + list_del_init(&xd->xindex); + spin_lock(&c->erase_completion_lock); + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + if (ref_flags(raw) != REF_UNCHECKED) + continue; + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + raw->flash_offset = ref_offset(raw) + | ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL); + } + if (xd->flags & JFFS2_XFLAGS_DEAD) + list_add(&xd->xindex, &c->xattr_dead_list); + spin_unlock(&c->erase_completion_lock); + } + up_write(&c->xattr_sem); + return list_empty(&c->xattr_unchecked) ? 1 : 0; +} + +void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under spin_lock(&c->erase_completion_lock) */ + if (atomic_read(&xd->refcnt) || xd->node != (void *)xd) + return; + + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); +} + +void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under spin_lock(&c->erase_completion_lock) */ + struct jffs2_xattr_ref *tmp, **ptmp; + + if (ref->node != (void *)ref) + return; + + for (tmp=c->xref_dead_list, ptmp=&c->xref_dead_list; tmp; ptmp=&tmp->next, tmp=tmp->next) { + if (ref == tmp) { + *ptmp = tmp->next; + break; + } + } + jffs2_free_xattr_ref(ref); +} diff --git a/kernel/fs/jffs2/xattr.h b/kernel/fs/jffs2/xattr.h new file mode 100644 index 000000000..467ff376e --- /dev/null +++ b/kernel/fs/jffs2/xattr.h @@ -0,0 +1,133 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_XATTR_H_ +#define _JFFS2_FS_XATTR_H_ + +#include +#include + +#define JFFS2_XFLAGS_HOT (0x01) /* This datum is HOT */ +#define JFFS2_XFLAGS_BIND (0x02) /* This datum is not reclaimed */ +#define JFFS2_XFLAGS_DEAD (0x40) /* This datum is already dead */ +#define JFFS2_XFLAGS_INVALID (0x80) /* This datum contains crc error */ + +struct jffs2_xattr_datum +{ + void *always_null; + struct jffs2_raw_node_ref *node; + uint8_t class; + uint8_t flags; + uint16_t xprefix; /* see JFFS2_XATTR_PREFIX_* */ + + struct list_head xindex; /* chained from c->xattrindex[n] */ + atomic_t refcnt; /* # of xattr_ref refers this */ + uint32_t xid; + uint32_t version; + + uint32_t data_crc; + uint32_t hashkey; + char *xname; /* XATTR name without prefix */ + uint32_t name_len; /* length of xname */ + char *xvalue; /* XATTR value */ + uint32_t value_len; /* length of xvalue */ +}; + +struct jffs2_inode_cache; +struct jffs2_xattr_ref +{ + void *always_null; + struct jffs2_raw_node_ref *node; + uint8_t class; + uint8_t flags; /* Currently unused */ + u16 unused; + + uint32_t xseqno; + union { + struct jffs2_inode_cache *ic; /* reference to jffs2_inode_cache */ + uint32_t ino; /* only used in scanning/building */ + }; + union { + struct jffs2_xattr_datum *xd; /* reference to jffs2_xattr_datum */ + uint32_t xid; /* only used in sccanning/building */ + }; + struct jffs2_xattr_ref *next; /* chained from ic->xref_list */ +}; + +#define XREF_DELETE_MARKER (0x00000001) +static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref) +{ + return ((ref->xseqno & XREF_DELETE_MARKER) != 0); +} + +#ifdef CONFIG_JFFS2_FS_XATTR + +extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c); +extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); +extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c); + +extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, + uint32_t xid, uint32_t version); + +extern void jffs2_xattr_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); +extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); +extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); + +extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, + struct jffs2_raw_node_ref *raw); +extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, + struct jffs2_raw_node_ref *raw); +extern int jffs2_verify_xattr(struct jffs2_sb_info *c); +extern void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd); +extern void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref); + +extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname, + char *buffer, size_t size); +extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, + const char *buffer, size_t size, int flags); + +extern const struct xattr_handler *jffs2_xattr_handlers[]; +extern const struct xattr_handler jffs2_user_xattr_handler; +extern const struct xattr_handler jffs2_trusted_xattr_handler; + +extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); +#define jffs2_getxattr generic_getxattr +#define jffs2_setxattr generic_setxattr +#define jffs2_removexattr generic_removexattr + +#else + +#define jffs2_init_xattr_subsystem(c) +#define jffs2_build_xattr_subsystem(c) +#define jffs2_clear_xattr_subsystem(c) + +#define jffs2_xattr_do_crccheck_inode(c, ic) +#define jffs2_xattr_delete_inode(c, ic) +#define jffs2_xattr_free_inode(c, ic) +#define jffs2_verify_xattr(c) (1) + +#define jffs2_xattr_handlers NULL +#define jffs2_listxattr NULL +#define jffs2_getxattr NULL +#define jffs2_setxattr NULL +#define jffs2_removexattr NULL + +#endif /* CONFIG_JFFS2_FS_XATTR */ + +#ifdef CONFIG_JFFS2_FS_SECURITY +extern int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); +extern const struct xattr_handler jffs2_security_xattr_handler; +#else +#define jffs2_init_security(inode,dir,qstr) (0) +#endif /* CONFIG_JFFS2_FS_SECURITY */ + +#endif /* _JFFS2_FS_XATTR_H_ */ diff --git a/kernel/fs/jffs2/xattr_trusted.c b/kernel/fs/jffs2/xattr_trusted.c new file mode 100644 index 000000000..ceaf9c693 --- /dev/null +++ b/kernel/fs/jffs2/xattr_trusted.c @@ -0,0 +1,55 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, + name, buffer, size); +} + +static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, + name, buffer, size, flags); +} + +static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; + + if (list && retlen<=list_size) { + strcpy(list, XATTR_TRUSTED_PREFIX); + strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = jffs2_trusted_listxattr, + .set = jffs2_trusted_setxattr, + .get = jffs2_trusted_getxattr +}; diff --git a/kernel/fs/jffs2/xattr_user.c b/kernel/fs/jffs2/xattr_user.c new file mode 100644 index 000000000..a71391eba --- /dev/null +++ b/kernel/fs/jffs2/xattr_user.c @@ -0,0 +1,55 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER, + name, buffer, size); +} + +static int jffs2_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, + name, buffer, size, flags); +} + +static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; + + if (list && retlen <= list_size) { + strcpy(list, XATTR_USER_PREFIX); + strcpy(list + XATTR_USER_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .list = jffs2_user_listxattr, + .set = jffs2_user_setxattr, + .get = jffs2_user_getxattr +}; diff --git a/kernel/fs/jfs/Kconfig b/kernel/fs/jfs/Kconfig new file mode 100644 index 000000000..57cef1995 --- /dev/null +++ b/kernel/fs/jfs/Kconfig @@ -0,0 +1,50 @@ +config JFS_FS + tristate "JFS filesystem support" + select NLS + select CRC32 + help + This is a port of IBM's Journaled Filesystem . More information is + available in the file . + + If you do not intend to use the JFS filesystem, say N. + +config JFS_POSIX_ACL + bool "JFS POSIX Access Control Lists" + depends on JFS_FS + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config JFS_SECURITY + bool "JFS Security Labels" + depends on JFS_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jfs filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFS_DEBUG + bool "JFS debugging" + depends on JFS_FS + help + If you are experiencing any problems with the JFS filesystem, say + Y here. This will result in additional debugging messages to be + written to the system log. Under normal circumstances, this + results in very little overhead. + +config JFS_STATISTICS + bool "JFS statistics" + depends on JFS_FS + help + Enabling this option will cause statistics from the JFS file system + to be made available to the user in the /proc/fs/jfs/ directory. diff --git a/kernel/fs/jfs/Makefile b/kernel/fs/jfs/Makefile new file mode 100644 index 000000000..d20d4737b --- /dev/null +++ b/kernel/fs/jfs/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for the Linux JFS filesystem routines. +# + +obj-$(CONFIG_JFS_FS) += jfs.o + +jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ + jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ + jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \ + jfs_extent.o symlink.o jfs_metapage.o \ + jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \ + resize.o xattr.o ioctl.o + +jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o + +ccflags-y := -D_JFS_4K diff --git a/kernel/fs/jfs/acl.c b/kernel/fs/jfs/acl.c new file mode 100644 index 000000000..0c8ca830b --- /dev/null +++ b/kernel/fs/jfs/acl.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) International Business Machines Corp., 2002-2004 + * Copyright (C) Andreas Gruenbacher, 2001 + * Copyright (C) Linus Torvalds, 1991, 1992 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_txnmgr.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" + +struct posix_acl *jfs_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *ea_name; + int size; + char *value = NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch(type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = __jfs_getxattr(inode, ea_name, NULL, 0); + + if (size > 0) { + value = kmalloc(size, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = __jfs_getxattr(inode, ea_name, value, size); + } + + if (size < 0) { + if (size == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(size); + } else { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + } + kfree(value); + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + return acl; +} + +static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, + struct posix_acl *acl) +{ + char *ea_name; + int rc; + int size = 0; + char *value = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + rc = posix_acl_equiv_mode(acl, &inode->i_mode); + if (rc < 0) + return rc; + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + if (rc == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + rc = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (rc < 0) + goto out; + } + rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0); +out: + kfree(value); + + if (!rc) + set_cached_acl(inode, type, acl); + + return rc; +} + +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int rc; + tid_t tid; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = __jfs_set_acl(tid, inode, type, acl); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + +int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int rc = 0; + + rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (rc) + return rc; + + if (default_acl) { + rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } + + if (acl) { + if (!rc) + rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); + } + + JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | + inode->i_mode; + + return rc; +} diff --git a/kernel/fs/jfs/file.c b/kernel/fs/jfs/file.c new file mode 100644 index 000000000..e98d39d75 --- /dev/null +++ b/kernel/fs/jfs/file.c @@ -0,0 +1,165 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_dmap.h" +#include "jfs_txnmgr.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int rc = 0; + + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (rc) + return rc; + + mutex_lock(&inode->i_mutex); + if (!(inode->i_state & I_DIRTY_ALL) || + (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); + mutex_unlock(&inode->i_mutex); + return rc; + } + + rc |= jfs_commit_inode(inode, 1); + mutex_unlock(&inode->i_mutex); + + return rc ? -EIO : 0; +} + +static int jfs_open(struct inode *inode, struct file *file) +{ + int rc; + + if ((rc = dquot_file_open(inode, file))) + return rc; + + /* + * We attempt to allow only one "active" file open per aggregate + * group. Otherwise, appending to files in parallel can cause + * fragmentation within the files. + * + * If the file is empty, it was probably just created and going + * to be written to. If it has a size, we'll hold off until the + * file is actually grown. + */ + if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && + (inode->i_size == 0)) { + struct jfs_inode_info *ji = JFS_IP(inode); + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag == -1) { + struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); + ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); + atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); + } + spin_unlock_irq(&ji->ag_lock); + } + + return 0; +} +static int jfs_release(struct inode *inode, struct file *file) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); + + return 0; +} + +int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int rc; + + rc = inode_change_ok(inode, iattr); + if (rc) + return rc; + + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || + (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; + } + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + + rc = inode_newsize_ok(inode, iattr->ia_size); + if (rc) + return rc; + + truncate_setsize(inode, iattr->ia_size); + jfs_truncate(inode); + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + if (iattr->ia_valid & ATTR_MODE) + rc = posix_acl_chmod(inode, inode->i_mode); + return rc; +} + +const struct inode_operations jfs_file_inode_operations = { + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, + .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL + .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, +#endif +}; + +const struct file_operations jfs_file_operations = { + .open = jfs_open, + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .fsync = jfs_fsync, + .release = jfs_release, + .unlocked_ioctl = jfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = jfs_compat_ioctl, +#endif +}; diff --git a/kernel/fs/jfs/inode.c b/kernel/fs/jfs/inode.c new file mode 100644 index 000000000..070dc4b33 --- /dev/null +++ b/kernel/fs/jfs/inode.c @@ -0,0 +1,423 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_extent.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + + +struct inode *jfs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ret = diRead(inode); + if (ret < 0) { + iget_failed(inode); + return ERR_PTR(ret); + } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &jfs_file_inode_operations; + inode->i_fop = &jfs_file_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &jfs_dir_inode_operations; + inode->i_fop = &jfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (inode->i_size >= IDATASIZE) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else { + inode->i_op = &jfs_fast_symlink_inode_operations; + /* + * The inline data should be null-terminated, but + * don't let on-disk corruption crash the kernel + */ + JFS_IP(inode)->i_inline[inode->i_size] = '\0'; + } + } else { + inode->i_op = &jfs_file_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } + unlock_new_inode(inode); + return inode; +} + +/* + * Workhorse of both fsync & write_inode + */ +int jfs_commit_inode(struct inode *inode, int wait) +{ + int rc = 0; + tid_t tid; + static int noisy = 5; + + jfs_info("In jfs_commit_inode, inode = 0x%p", inode); + + /* + * Don't commit if inode has been committed since last being + * marked dirty, or if it has been deleted. + */ + if (inode->i_nlink == 0 || !test_cflag(COMMIT_Dirty, inode)) + return 0; + + if (isReadOnly(inode)) { + /* kernel allows writes to devices on read-only + * partitions and may think inode is dirty + */ + if (!special_file(inode->i_mode) && noisy) { + jfs_err("jfs_commit_inode(0x%p) called on " + "read-only volume", inode); + jfs_err("Is remount racy?"); + noisy--; + } + return 0; + } + + tid = txBegin(inode->i_sb, COMMIT_INODE); + mutex_lock(&JFS_IP(inode)->commit_mutex); + + /* + * Retest inode state after taking commit_mutex + */ + if (inode->i_nlink && test_cflag(COMMIT_Dirty, inode)) + rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0); + + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + +int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int wait = wbc->sync_mode == WB_SYNC_ALL; + + if (inode->i_nlink == 0) + return 0; + /* + * If COMMIT_DIRTY is not set, the inode isn't really dirty. + * It has been committed since the last change, but was still + * on the dirty inode list. + */ + if (!test_cflag(COMMIT_Dirty, inode)) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); + return 0; + } + + if (jfs_commit_inode(inode, wait)) { + jfs_err("jfs_write_inode: jfs_commit_inode failed!"); + return -EIO; + } else + return 0; +} + +void jfs_evict_inode(struct inode *inode) +{ + jfs_info("In jfs_evict_inode, inode = 0x%p", inode); + + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + + if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + truncate_inode_pages_final(&inode->i_data); + + if (test_cflag(COMMIT_Freewmap, inode)) + jfs_free_zero_link(inode); + + diFree(inode); + + /* + * Free the inode from the quota allocation. + */ + dquot_initialize(inode); + dquot_free_inode(inode); + } + } else { + truncate_inode_pages_final(&inode->i_data); + } + clear_inode(inode); + dquot_drop(inode); +} + +void jfs_dirty_inode(struct inode *inode, int flags) +{ + static int noisy = 5; + + if (isReadOnly(inode)) { + if (!special_file(inode->i_mode) && noisy) { + /* kernel allows writes to devices on read-only + * partitions and may try to mark inode dirty + */ + jfs_err("jfs_dirty_inode called on read-only volume"); + jfs_err("Is remount racy?"); + noisy--; + } + return; + } + + set_cflag(COMMIT_Dirty, inode); +} + +int jfs_get_block(struct inode *ip, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + s64 lblock64 = lblock; + int rc = 0; + xad_t xad; + s64 xaddr; + int xflag; + s32 xlen = bh_result->b_size >> ip->i_blkbits; + + /* + * Take appropriate lock on inode + */ + if (create) + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + else + IREAD_LOCK(ip, RDWRLOCK_NORMAL); + + if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && + (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && + xaddr) { + if (xflag & XAD_NOTRECORDED) { + if (!create) + /* + * Allocated but not recorded, read treats + * this as a hole + */ + goto unlock; +#ifdef _JFS_4K + XADoffset(&xad, lblock64); + XADlength(&xad, xlen); + XADaddress(&xad, xaddr); +#else /* _JFS_4K */ + /* + * As long as block size = 4K, this isn't a problem. + * We should mark the whole page not ABNR, but how + * will we know to mark the other blocks BH_New? + */ + BUG(); +#endif /* _JFS_4K */ + rc = extRecord(ip, &xad); + if (rc) + goto unlock; + set_buffer_new(bh_result); + } + + map_bh(bh_result, ip->i_sb, xaddr); + bh_result->b_size = xlen << ip->i_blkbits; + goto unlock; + } + if (!create) + goto unlock; + + /* + * Allocate a new block + */ +#ifdef _JFS_4K + if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad))) + goto unlock; + rc = extAlloc(ip, xlen, lblock64, &xad, false); + if (rc) + goto unlock; + + set_buffer_new(bh_result); + map_bh(bh_result, ip->i_sb, addressXAD(&xad)); + bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits; + +#else /* _JFS_4K */ + /* + * We need to do whatever it takes to keep all but the last buffers + * in 4K pages - see jfs_write.c + */ + BUG(); +#endif /* _JFS_4K */ + + unlock: + /* + * Release lock on inode + */ + if (create) + IWRITE_UNLOCK(ip); + else + IREAD_UNLOCK(ip); + return rc; +} + +static int jfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, jfs_get_block, wbc); +} + +static int jfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, jfs_get_block); +} + +static int jfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, jfs_get_block); +} + +static int jfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); +} + +static void jfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + jfs_truncate(inode); + } +} + +static int jfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, + jfs_get_block); + if (unlikely(ret)) + jfs_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t jfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, jfs_get_block); +} + +static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if (end > isize) + jfs_write_failed(mapping, end); + } + + return ret; +} + +const struct address_space_operations jfs_aops = { + .readpage = jfs_readpage, + .readpages = jfs_readpages, + .writepage = jfs_writepage, + .writepages = jfs_writepages, + .write_begin = jfs_write_begin, + .write_end = nobh_write_end, + .bmap = jfs_bmap, + .direct_IO = jfs_direct_IO, +}; + +/* + * Guts of jfs_truncate. Called with locks already held. Can be called + * with directory for truncating directory index table. + */ +void jfs_truncate_nolock(struct inode *ip, loff_t length) +{ + loff_t newsize; + tid_t tid; + + ASSERT(length >= 0); + + if (test_cflag(COMMIT_Nolink, ip)) { + xtTruncate(0, ip, length, COMMIT_WMAP); + return; + } + + do { + tid = txBegin(ip->i_sb, 0); + + /* + * The commit_mutex cannot be taken before txBegin. + * txBegin may block and there is a chance the inode + * could be marked dirty and need to be committed + * before txBegin unblocks + */ + mutex_lock(&JFS_IP(ip)->commit_mutex); + + newsize = xtTruncate(tid, ip, length, + COMMIT_TRUNCATE | COMMIT_PWMAP); + if (newsize < 0) { + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + break; + } + + ip->i_mtime = ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(ip); + + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + } while (newsize > length); /* Truncate isn't always atomic */ +} + +void jfs_truncate(struct inode *ip) +{ + jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size); + + nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); + + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + jfs_truncate_nolock(ip, ip->i_size); + IWRITE_UNLOCK(ip); +} diff --git a/kernel/fs/jfs/ioctl.c b/kernel/fs/jfs/ioctl.c new file mode 100644 index 000000000..93a123289 --- /dev/null +++ b/kernel/fs/jfs/ioctl.c @@ -0,0 +1,189 @@ +/* + * linux/fs/jfs/ioctl.c + * + * Copyright (C) 2006 Herbert Poetzl + * adapted from Remy Card's ext2/ioctl.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "jfs_filsys.h" +#include "jfs_debug.h" +#include "jfs_incore.h" +#include "jfs_dinode.h" +#include "jfs_inode.h" +#include "jfs_dmap.h" +#include "jfs_discard.h" + +static struct { + long jfs_flag; + long ext2_flag; +} jfs_map[] = { + {JFS_NOATIME_FL, FS_NOATIME_FL}, + {JFS_DIRSYNC_FL, FS_DIRSYNC_FL}, + {JFS_SYNC_FL, FS_SYNC_FL}, + {JFS_SECRM_FL, FS_SECRM_FL}, + {JFS_UNRM_FL, FS_UNRM_FL}, + {JFS_APPEND_FL, FS_APPEND_FL}, + {JFS_IMMUTABLE_FL, FS_IMMUTABLE_FL}, + {0, 0}, +}; + +static long jfs_map_ext2(unsigned long flags, int from) +{ + int index=0; + long mapped=0; + + while (jfs_map[index].jfs_flag) { + if (from) { + if (jfs_map[index].ext2_flag & flags) + mapped |= jfs_map[index].jfs_flag; + } else { + if (jfs_map[index].jfs_flag & flags) + mapped |= jfs_map[index].ext2_flag; + } + index++; + } + return mapped; +} + + +long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct jfs_inode_info *jfs_inode = JFS_IP(inode); + unsigned int flags; + + switch (cmd) { + case JFS_IOC_GETFLAGS: + jfs_get_inode_flags(jfs_inode); + flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; + flags = jfs_map_ext2(flags, 0); + return put_user(flags, (int __user *) arg); + case JFS_IOC_SETFLAGS: { + unsigned int oldflags; + int err; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto setflags_out; + } + if (get_user(flags, (int __user *) arg)) { + err = -EFAULT; + goto setflags_out; + } + + flags = jfs_map_ext2(flags, 1); + if (!S_ISDIR(inode->i_mode)) + flags &= ~JFS_DIRSYNC_FL; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } + + /* Lock against other parallel changes of flags */ + mutex_lock(&inode->i_mutex); + + jfs_get_inode_flags(jfs_inode); + oldflags = jfs_inode->mode2; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + if ((oldflags & JFS_IMMUTABLE_FL) || + ((flags ^ oldflags) & + (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + err = -EPERM; + goto setflags_out; + } + } + + flags = flags & JFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~JFS_FL_USER_MODIFIABLE; + jfs_inode->mode2 = flags; + + jfs_set_inode_flags(inode); + mutex_unlock(&inode->i_mutex); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write_file(filp); + return err; + } + + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + s64 ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) { + jfs_warn("FITRIM not supported on device"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max_t(unsigned int, range.minlen, + q->limits.discard_granularity); + + ret = jfs_ioc_trim(inode, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + /* While these ioctl numbers defined with 'long' and have different + * numbers than the 64bit ABI, + * the actual implementation only deals with ints and is compatible. + */ + switch (cmd) { + case JFS_IOC_GETFLAGS32: + cmd = JFS_IOC_GETFLAGS; + break; + case JFS_IOC_SETFLAGS32: + cmd = JFS_IOC_SETFLAGS; + break; + case FITRIM: + cmd = FITRIM; + break; + } + return jfs_ioctl(filp, cmd, arg); +} +#endif diff --git a/kernel/fs/jfs/jfs_acl.h b/kernel/fs/jfs/jfs_acl.h new file mode 100644 index 000000000..489f993b7 --- /dev/null +++ b/kernel/fs/jfs/jfs_acl.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) International Business Machines Corp., 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_ACL +#define _H_JFS_ACL + +#ifdef CONFIG_JFS_POSIX_ACL + +struct posix_acl *jfs_get_acl(struct inode *inode, int type); +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int jfs_init_acl(tid_t, struct inode *, struct inode *); + +#else + +static inline int jfs_init_acl(tid_t tid, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +#endif +#endif /* _H_JFS_ACL */ diff --git a/kernel/fs/jfs/jfs_btree.h b/kernel/fs/jfs/jfs_btree.h new file mode 100644 index 000000000..79c61805b --- /dev/null +++ b/kernel/fs/jfs/jfs_btree.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_BTREE +#define _H_JFS_BTREE + +/* + * jfs_btree.h: B+-tree + * + * JFS B+-tree (dtree and xtree) common definitions + */ + +/* + * basic btree page - btpage + * +struct btpage { + s64 next; right sibling bn + s64 prev; left sibling bn + + u8 flag; + u8 rsrvd[7]; type specific + s64 self; self address + + u8 entry[4064]; +}; */ + +/* btpaget_t flag */ +#define BT_TYPE 0x07 /* B+-tree index */ +#define BT_ROOT 0x01 /* root page */ +#define BT_LEAF 0x02 /* leaf page */ +#define BT_INTERNAL 0x04 /* internal page */ +#define BT_RIGHTMOST 0x10 /* rightmost page */ +#define BT_LEFTMOST 0x20 /* leftmost page */ +#define BT_SWAPPED 0x80 /* used by fsck for endian swapping */ + +/* btorder (in inode) */ +#define BT_RANDOM 0x0000 +#define BT_SEQUENTIAL 0x0001 +#define BT_LOOKUP 0x0010 +#define BT_INSERT 0x0020 +#define BT_DELETE 0x0040 + +/* + * btree page buffer cache access + */ +#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0) + +/* get page from buffer page */ +#define BT_PAGE(IP, MP, TYPE, ROOT)\ + (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data) + +/* get the page buffer and the page for specified block address */ +#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\ +{\ + if ((BN) == 0)\ + {\ + MP = (struct metapage *)&JFS_IP(IP)->bxflag;\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + RC = 0;\ + }\ + else\ + {\ + MP = read_metapage((IP), BN, SIZE, 1);\ + if (MP) {\ + RC = 0;\ + P = (MP)->data;\ + } else {\ + P = NULL;\ + jfs_err("bread failed!");\ + RC = -EIO;\ + }\ + }\ +} + +#define BT_MARK_DIRTY(MP, IP)\ +{\ + if (BT_IS_ROOT(MP))\ + mark_inode_dirty(IP);\ + else\ + mark_metapage_dirty(MP);\ +} + +/* put the page buffer */ +#define BT_PUTPAGE(MP)\ +{\ + if (! BT_IS_ROOT(MP)) \ + release_metapage(MP); \ +} + + +/* + * btree traversal stack + * + * record the path traversed during the search; + * top frame record the leaf page/entry selected. + */ +struct btframe { /* stack frame */ + s64 bn; /* 8: */ + s16 index; /* 2: */ + s16 lastindex; /* 2: unused */ + struct metapage *mp; /* 4/8: */ +}; /* (16/24) */ + +struct btstack { + struct btframe *top; + int nsplit; + struct btframe stack[MAXTREEHEIGHT]; +}; + +#define BT_CLR(btstack)\ + (btstack)->top = (btstack)->stack + +#define BT_STACK_FULL(btstack)\ + ( (btstack)->top == &((btstack)->stack[MAXTREEHEIGHT-1])) + +#define BT_PUSH(BTSTACK, BN, INDEX)\ +{\ + assert(!BT_STACK_FULL(BTSTACK));\ + (BTSTACK)->top->bn = BN;\ + (BTSTACK)->top->index = INDEX;\ + ++(BTSTACK)->top;\ +} + +#define BT_POP(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top ) + +#define BT_STACK(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top ) + +static inline void BT_STACK_DUMP(struct btstack *btstack) +{ + int i; + printk("btstack dump:\n"); + for (i = 0; i < MAXTREEHEIGHT; i++) + printk(KERN_ERR "bn = %Lx, index = %d\n", + (long long)btstack->stack[i].bn, + btstack->stack[i].index); +} + +/* retrieve search results */ +#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\ +{\ + BN = (LEAF)->bn;\ + MP = (LEAF)->mp;\ + if (BN)\ + P = (TYPE *)MP->data;\ + else\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + INDEX = (LEAF)->index;\ +} + +/* put the page buffer of search */ +#define BT_PUTSEARCH(BTSTACK)\ +{\ + if (! BT_IS_ROOT((BTSTACK)->top->mp))\ + release_metapage((BTSTACK)->top->mp);\ +} +#endif /* _H_JFS_BTREE */ diff --git a/kernel/fs/jfs/jfs_debug.c b/kernel/fs/jfs/jfs_debug.c new file mode 100644 index 000000000..dd824d9b0 --- /dev/null +++ b/kernel/fs/jfs/jfs_debug.c @@ -0,0 +1,109 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_debug.h" + +#ifdef PROC_FS_JFS /* see jfs_debug.h */ + +static struct proc_dir_entry *base; +#ifdef CONFIG_JFS_DEBUG +static int jfs_loglevel_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", jfsloglevel); + return 0; +} + +static int jfs_loglevel_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_loglevel_proc_show, NULL); +} + +static ssize_t jfs_loglevel_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + + if (get_user(c, buffer)) + return -EFAULT; + + /* yes, I know this is an ASCIIism. --hch */ + if (c < '0' || c > '9') + return -EINVAL; + jfsloglevel = c - '0'; + return count; +} + +static const struct file_operations jfs_loglevel_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_loglevel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = jfs_loglevel_proc_write, +}; +#endif + +static struct { + const char *name; + const struct file_operations *proc_fops; +} Entries[] = { +#ifdef CONFIG_JFS_STATISTICS + { "lmstats", &jfs_lmstats_proc_fops, }, + { "txstats", &jfs_txstats_proc_fops, }, + { "xtstat", &jfs_xtstat_proc_fops, }, + { "mpstat", &jfs_mpstat_proc_fops, }, +#endif +#ifdef CONFIG_JFS_DEBUG + { "TxAnchor", &jfs_txanchor_proc_fops, }, + { "loglevel", &jfs_loglevel_proc_fops } +#endif +}; +#define NPROCENT ARRAY_SIZE(Entries) + +void jfs_proc_init(void) +{ + int i; + + if (!(base = proc_mkdir("fs/jfs", NULL))) + return; + + for (i = 0; i < NPROCENT; i++) + proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); +} + +void jfs_proc_clean(void) +{ + int i; + + if (base) { + for (i = 0; i < NPROCENT; i++) + remove_proc_entry(Entries[i].name, base); + remove_proc_entry("fs/jfs", NULL); + } +} + +#endif /* PROC_FS_JFS */ diff --git a/kernel/fs/jfs/jfs_debug.h b/kernel/fs/jfs/jfs_debug.h new file mode 100644 index 000000000..eafd1300a --- /dev/null +++ b/kernel/fs/jfs/jfs_debug.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DEBUG +#define _H_JFS_DEBUG + +/* + * jfs_debug.h + * + * global debug message, data structure/macro definitions + * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS; + */ + +/* + * Create /proc/fs/jfs if procfs is enabled andeither + * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined + */ +#if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS)) +#define PROC_FS_JFS +extern void jfs_proc_init(void); +extern void jfs_proc_clean(void); +#endif + +/* + * assert with traditional printf/panic + */ +#define assert(p) do { \ + if (!(p)) { \ + printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \ + __FILE__, __LINE__, #p); \ + BUG(); \ + } \ +} while (0) + +/* + * debug ON + * -------- + */ +#ifdef CONFIG_JFS_DEBUG +#define ASSERT(p) assert(p) + +/* printk verbosity */ +#define JFS_LOGLEVEL_ERR 1 +#define JFS_LOGLEVEL_WARN 2 +#define JFS_LOGLEVEL_DEBUG 3 +#define JFS_LOGLEVEL_INFO 4 + +extern int jfsloglevel; + +extern const struct file_operations jfs_txanchor_proc_fops; + +/* information message: e.g., configuration, major event */ +#define jfs_info(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_INFO) \ + printk(KERN_INFO fmt "\n", ## arg); \ +} while (0) + +/* debug message: ad hoc */ +#define jfs_debug(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_DEBUG) \ + printk(KERN_DEBUG fmt "\n", ## arg); \ +} while (0) + +/* warn message: */ +#define jfs_warn(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_WARN) \ + printk(KERN_WARNING fmt "\n", ## arg); \ +} while (0) + +/* error event message: e.g., i/o error */ +#define jfs_err(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_ERR) \ + printk(KERN_ERR fmt "\n", ## arg); \ +} while (0) + +/* + * debug OFF + * --------- + */ +#else /* CONFIG_JFS_DEBUG */ +#define ASSERT(p) do {} while (0) +#define jfs_info(fmt, arg...) do {} while (0) +#define jfs_debug(fmt, arg...) do {} while (0) +#define jfs_warn(fmt, arg...) do {} while (0) +#define jfs_err(fmt, arg...) do {} while (0) +#endif /* CONFIG_JFS_DEBUG */ + +/* + * statistics + * ---------- + */ +#ifdef CONFIG_JFS_STATISTICS +extern const struct file_operations jfs_lmstats_proc_fops; +extern const struct file_operations jfs_txstats_proc_fops; +extern const struct file_operations jfs_mpstat_proc_fops; +extern const struct file_operations jfs_xtstat_proc_fops; + +#define INCREMENT(x) ((x)++) +#define DECREMENT(x) ((x)--) +#define HIGHWATERMARK(x,y) ((x) = max((x), (y))) +#else +#define INCREMENT(x) +#define DECREMENT(x) +#define HIGHWATERMARK(x,y) +#endif /* CONFIG_JFS_STATISTICS */ + +#endif /* _H_JFS_DEBUG */ diff --git a/kernel/fs/jfs/jfs_dinode.h b/kernel/fs/jfs/jfs_dinode.h new file mode 100644 index 000000000..395c4c0d0 --- /dev/null +++ b/kernel/fs/jfs/jfs_dinode.h @@ -0,0 +1,176 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DINODE +#define _H_JFS_DINODE + +/* + * jfs_dinode.h: on-disk inode manager + */ + +#define INODESLOTSIZE 128 +#define L2INODESLOTSIZE 7 +#define log2INODESIZE 9 /* log2(bytes per dinode) */ + + +/* + * on-disk inode : 512 bytes + * + * note: align 64-bit fields on 8-byte boundary. + */ +struct dinode { + /* + * I. base area (128 bytes) + * ------------------------ + * + * define generic/POSIX attributes + */ + __le32 di_inostamp; /* 4: stamp to show inode belongs to fileset */ + __le32 di_fileset; /* 4: fileset number */ + __le32 di_number; /* 4: inode number, aka file serial number */ + __le32 di_gen; /* 4: inode generation number */ + + pxd_t di_ixpxd; /* 8: inode extent descriptor */ + + __le64 di_size; /* 8: size */ + __le64 di_nblocks; /* 8: number of blocks allocated */ + + __le32 di_nlink; /* 4: number of links to the object */ + + __le32 di_uid; /* 4: user id of owner */ + __le32 di_gid; /* 4: group id of owner */ + + __le32 di_mode; /* 4: attribute, format and permission */ + + struct timestruc_t di_atime; /* 8: time last data accessed */ + struct timestruc_t di_ctime; /* 8: time last status changed */ + struct timestruc_t di_mtime; /* 8: time last data modified */ + struct timestruc_t di_otime; /* 8: time created */ + + dxd_t di_acl; /* 16: acl descriptor */ + + dxd_t di_ea; /* 16: ea descriptor */ + + __le32 di_next_index; /* 4: Next available dir_table index */ + + __le32 di_acltype; /* 4: Type of ACL */ + + /* + * Extension Areas. + * + * Historically, the inode was partitioned into 4 128-byte areas, + * the last 3 being defined as unions which could have multiple + * uses. The first 96 bytes had been completely unused until + * an index table was added to the directory. It is now more + * useful to describe the last 3/4 of the inode as a single + * union. We would probably be better off redesigning the + * entire structure from scratch, but we don't want to break + * commonality with OS/2's JFS at this time. + */ + union { + struct { + /* + * This table contains the information needed to + * find a directory entry from a 32-bit index. + * If the index is small enough, the table is inline, + * otherwise, an x-tree root overlays this table + */ + struct dir_table_slot _table[12]; /* 96: inline */ + + dtroot_t _dtroot; /* 288: dtree root */ + } _dir; /* (384) */ +#define di_dirtable u._dir._table +#define di_dtroot u._dir._dtroot +#define di_parent di_dtroot.header.idotdot +#define di_DASD di_dtroot.header.DASD + + struct { + union { + u8 _data[96]; /* 96: unused */ + struct { + void *_imap; /* 4: unused */ + __le32 _gengen; /* 4: generator */ + } _imap; + } _u1; /* 96: */ +#define di_gengen u._file._u1._imap._gengen + + union { + xtpage_t _xtroot; + struct { + u8 unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + union { + __le32 _rdev; /* 4: */ + u8 _fastsymlink[128]; + } _u; + u8 _inlineea[128]; + } _special; + } _u2; + } _file; +#define di_xtroot u._file._u2._xtroot +#define di_dxd u._file._u2._special._dxd +#define di_btroot di_xtroot +#define di_inlinedata u._file._u2._special._u +#define di_rdev u._file._u2._special._u._rdev +#define di_fastsymlink u._file._u2._special._u._fastsymlink +#define di_inlineea u._file._u2._special._inlineea + } u; +}; + +/* extended mode bits (on-disk inode di_mode) */ +#define IFJOURNAL 0x00010000 /* journalled file */ +#define ISPARSE 0x00020000 /* sparse file enabled */ +#define INLINEEA 0x00040000 /* inline EA area free */ +#define ISWAPFILE 0x00800000 /* file open for pager swap space */ + +/* more extended mode bits: attributes for OS/2 */ +#define IREADONLY 0x02000000 /* no write access to file */ +#define IHIDDEN 0x04000000 /* hidden file */ +#define ISYSTEM 0x08000000 /* system file */ + +#define IDIRECTORY 0x20000000 /* directory (shadow of real bit) */ +#define IARCHIVE 0x40000000 /* file archive bit */ +#define INEWNAME 0x80000000 /* non-8.3 filename format */ + +#define IRASH 0x4E000000 /* mask for changeable attributes */ +#define ATTRSHIFT 25 /* bits to shift to move attribute + specification to mode position */ + +/* extended attributes for Linux */ + +#define JFS_NOATIME_FL 0x00080000 /* do not update atime */ + +#define JFS_DIRSYNC_FL 0x00100000 /* dirsync behaviour */ +#define JFS_SYNC_FL 0x00200000 /* Synchronous updates */ +#define JFS_SECRM_FL 0x00400000 /* Secure deletion */ +#define JFS_UNRM_FL 0x00800000 /* allow for undelete */ + +#define JFS_APPEND_FL 0x01000000 /* writes to file may only append */ +#define JFS_IMMUTABLE_FL 0x02000000 /* Immutable file */ + +#define JFS_FL_USER_VISIBLE 0x03F80000 +#define JFS_FL_USER_MODIFIABLE 0x03F80000 +#define JFS_FL_INHERIT 0x03C80000 + +/* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */ +#define JFS_IOC_GETFLAGS _IOR('f', 1, long) +#define JFS_IOC_SETFLAGS _IOW('f', 2, long) + +#define JFS_IOC_GETFLAGS32 _IOR('f', 1, int) +#define JFS_IOC_SETFLAGS32 _IOW('f', 2, int) + +#endif /*_H_JFS_DINODE */ diff --git a/kernel/fs/jfs/jfs_discard.c b/kernel/fs/jfs/jfs_discard.c new file mode 100644 index 000000000..dfcd50304 --- /dev/null +++ b/kernel/fs/jfs/jfs_discard.c @@ -0,0 +1,121 @@ +/* + * Copyright (C) Tino Reichardt, 2012 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_discard.h" +#include "jfs_dmap.h" +#include "jfs_debug.h" + + +/* + * NAME: jfs_issue_discard() + * + * FUNCTION: TRIM the specified block range on device, if supported + * + * PARAMETERS: + * ip - pointer to in-core inode + * blkno - starting block number to be trimmed (0..N) + * nblocks - number of blocks to be trimmed + * + * RETURN VALUES: + * none + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks) +{ + struct super_block *sb = ip->i_sb; + int r = 0; + + r = sb_issue_discard(sb, blkno, nblocks, GFP_NOFS, 0); + if (unlikely(r != 0)) { + jfs_err("JFS: sb_issue_discard" \ + "(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!\n", + sb, (unsigned long long)blkno, + (unsigned long long)nblocks, r); + } + + jfs_info("JFS: sb_issue_discard" \ + "(%p, %llu, %llu, GFP_NOFS, 0) = %d\n", + sb, (unsigned long long)blkno, + (unsigned long long)nblocks, r); + + return; +} + +/* + * NAME: jfs_ioc_trim() + * + * FUNCTION: attempt to discard (TRIM) all free blocks from the + * filesystem. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * range - the range, given by user space + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct super_block *sb = ipbmap->i_sb; + int agno, agno_end; + u64 start, end, minlen; + u64 trimmed = 0; + + /** + * convert byte values to block size of filesystem: + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + */ + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = range->minlen >> sb->s_blocksize_bits; + if (minlen == 0) + minlen = 1; + + if (minlen > bmp->db_agsize || + start >= bmp->db_mapsize || + range->len < sb->s_blocksize) + return -EINVAL; + + if (end >= bmp->db_mapsize) + end = bmp->db_mapsize - 1; + + /** + * we trim all ag's within the range + */ + agno = BLKTOAG(start, JFS_SBI(ip->i_sb)); + agno_end = BLKTOAG(end, JFS_SBI(ip->i_sb)); + while (agno <= agno_end) { + trimmed += dbDiscardAG(ip, agno, minlen); + agno++; + } + range->len = trimmed << sb->s_blocksize_bits; + + return 0; +} diff --git a/kernel/fs/jfs/jfs_discard.h b/kernel/fs/jfs/jfs_discard.h new file mode 100644 index 000000000..40d1ee608 --- /dev/null +++ b/kernel/fs/jfs/jfs_discard.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Tino Reichardt, 2012 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DISCARD +#define _H_JFS_DISCARD + +struct fstrim_range; + +extern void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks); +extern int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range); + +#endif /* _H_JFS_DISCARD */ diff --git a/kernel/fs/jfs/jfs_dmap.c b/kernel/fs/jfs/jfs_dmap.c new file mode 100644 index 000000000..2d514c7af --- /dev/null +++ b/kernel/fs/jfs/jfs_dmap.c @@ -0,0 +1,4092 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Tino Reichardt, 2012 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_lock.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" +#include "jfs_discard.h" + +/* + * SERIALIZATION of the Block Allocation Map. + * + * the working state of the block allocation map is accessed in + * two directions: + * + * 1) allocation and free requests that start at the dmap + * level and move up through the dmap control pages (i.e. + * the vast majority of requests). + * + * 2) allocation requests that start at dmap control page + * level and work down towards the dmaps. + * + * the serialization scheme used here is as follows. + * + * requests which start at the bottom are serialized against each + * other through buffers and each requests holds onto its buffers + * as it works it way up from a single dmap to the required level + * of dmap control page. + * requests that start at the top are serialized against each other + * and request that start from the bottom by the multiple read/single + * write inode lock of the bmap inode. requests starting at the top + * take this lock in write mode while request starting at the bottom + * take the lock in read mode. a single top-down request may proceed + * exclusively while multiple bottoms-up requests may proceed + * simultaneously (under the protection of busy buffers). + * + * in addition to information found in dmaps and dmap control pages, + * the working state of the block allocation map also includes read/ + * write information maintained in the bmap descriptor (i.e. total + * free block count, allocation group level free block counts). + * a single exclusive lock (BMAP_LOCK) is used to guard this information + * in the face of multiple-bottoms up requests. + * (lock ordering: IREAD_LOCK, BMAP_LOCK); + * + * accesses to the persistent state of the block allocation map (limited + * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. + */ + +#define BMAP_LOCK_INIT(bmp) mutex_init(&bmp->db_bmaplock) +#define BMAP_LOCK(bmp) mutex_lock(&bmp->db_bmaplock) +#define BMAP_UNLOCK(bmp) mutex_unlock(&bmp->db_bmaplock) + +/* + * forward references + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); +static int dbBackSplit(dmtree_t * tp, int leafno); +static int dbJoin(dmtree_t * tp, int leafno, int newval); +static void dbAdjTree(dmtree_t * tp, int leafno, int newval); +static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, + int level); +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks, + int l2nb, s64 * results); +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks, + int l2nb, + s64 * results); +static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, + s64 * results); +static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, + s64 * results); +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); +static int dbFindBits(u32 word, int l2nb); +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbMaxBud(u8 * cp); +static int blkstol2(s64 nb); + +static int cntlz(u32 value); +static int cnttz(u32 word); + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks); +static int dbInitDmapTree(struct dmap * dp); +static int dbInitTree(struct dmaptree * dtp); +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i); +static int dbGetL2AGSize(s64 nblocks); + +/* + * buddy table + * + * table used for determining buddy sizes within characters of + * dmap bitmap words. the characters themselves serve as indexes + * into the table, with the table elements yielding the maximum + * binary buddy of free bits within the character. + */ +static const s8 budtab[256] = { + 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 +}; + +/* + * NAME: dbMount() + * + * FUNCTION: initializate the block allocation map. + * + * memory is allocated for the in-core bmap descriptor and + * the in-core descriptor is initialized from disk. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + * -EIO - i/o error + */ +int dbMount(struct inode *ipbmap) +{ + struct bmap *bmp; + struct dbmap_disk *dbmp_le; + struct metapage *mp; + int i; + + /* + * allocate/initialize the in-memory bmap descriptor + */ + /* allocate memory for the in-memory bmap descriptor */ + bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL); + if (bmp == NULL) + return -ENOMEM; + + /* read the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(bmp); + return -EIO; + } + + /* copy the on-disk bmap descriptor to its in-memory version. */ + dbmp_le = (struct dbmap_disk *) mp->data; + bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); + bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); + bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); + bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); + bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); + bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); + bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); + bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); + bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); + bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); + bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + for (i = 0; i < MAXAG; i++) + bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); + bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); + bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; + + /* release the buffer. */ + release_metapage(mp); + + /* bind the bmap inode and the bmap descriptor to each other. */ + bmp->db_ipbmap = ipbmap; + JFS_SBI(ipbmap->i_sb)->bmap = bmp; + + memset(bmp->db_active, 0, sizeof(bmp->db_active)); + + /* + * allocate/initialize the bmap lock + */ + BMAP_LOCK_INIT(bmp); + + return (0); +} + + +/* + * NAME: dbUnmount() + * + * FUNCTION: terminate the block allocation map in preparation for + * file system unmount. + * + * the in-core bmap descriptor is written to disk and + * the memory for this descriptor is freed. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbUnmount(struct inode *ipbmap, int mounterror) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + if (!(mounterror || isReadOnly(ipbmap))) + dbSync(ipbmap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipbmap->i_mapping, 0); + + /* free the memory for the in-memory bmap. */ + kfree(bmp); + + return (0); +} + +/* + * dbSync() + */ +int dbSync(struct inode *ipbmap) +{ + struct dbmap_disk *dbmp_le; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *mp; + int i; + + /* + * write bmap global control page + */ + /* get the buffer for the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("dbSync: read_metapage failed!"); + return -EIO; + } + /* copy the in-memory version of the bmap to the on-disk version */ + dbmp_le = (struct dbmap_disk *) mp->data; + dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); + dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); + dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); + dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); + dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); + dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); + dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); + dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); + dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight); + dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); + dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); + dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); + for (i = 0; i < MAXAG; i++) + dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); + dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); + dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; + + /* write the buffer */ + write_metapage(mp); + + /* + * write out dirty pages of bmap + */ + filemap_write_and_wait(ipbmap->i_mapping); + + diWriteSpecial(ipbmap, 0); + + return (0); +} + +/* + * NAME: dbFree() + * + * FUNCTION: free the specified block range from the working block + * allocation map. + * + * the blocks will be free from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbFree(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct super_block *sb = ipbmap->i_sb; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* block to be freed better be within the mapsize. */ + if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { + IREAD_UNLOCK(ipbmap); + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ip->i_sb, "block to be freed is outside the map\n"); + return -EIO; + } + + /** + * TRIM the blocks, when mounted with discard option + */ + if (JFS_SBI(sb)->flag & JFS_DISCARD) + if (JFS_SBI(sb)->minblks_trim <= nblocks) + jfs_issue_discard(ipbmap, blkno, nblocks); + + /* + * free the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be freed from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + /* free the blocks. */ + if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { + jfs_error(ip->i_sb, "error in block map\n"); + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +/* + * NAME: dbUpdatePMap() + * + * FUNCTION: update the allocation state (free or allocate) of the + * specified block range in the persistent block allocation map. + * + * the blocks will be updated in the persistent map one + * dmap at a time. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * free - 'true' if block range is to be freed from the persistent + * map; 'false' if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int +dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk) +{ + int nblks, dbitno, wbitno, rbits; + int word, nbits, nwords; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + s64 lblkno, rem, lastlblkno; + u32 mask; + struct dmap *dp; + struct metapage *mp; + struct jfs_log *log; + int lsn, difft, diffp; + unsigned long flags; + + /* the blocks better be within the mapsize. */ + if (blkno + nblocks > bmp->db_mapsize) { + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ipbmap->i_sb, "blocks are outside the map\n"); + return -EIO; + } + + /* compute delta of transaction lsn from log syncpt */ + lsn = tblk->lsn; + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + logdiff(difft, lsn, log); + + /* + * update the block state a dmap at a time. + */ + mp = NULL; + lastlblkno = 0; + for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + if (lblkno != lastlblkno) { + if (mp) { + write_metapage(mp); + } + + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, + 0); + if (mp == NULL) + return -EIO; + metapage_wait_for_io(mp); + } + dp = (struct dmap *) mp->data; + + /* determine the bit number and word within the dmap of + * the starting block. also determine how many blocks + * are to be updated within this dmap. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + nblks = min(rem, (s64)BPERDMAP - dbitno); + + /* update the bits of the dmap words. the first and last + * words may only have a subset of their bits updated. if + * this is the case, we'll work against that word (i.e. + * partial first and/or last) only in a single pass. a + * single pass will also be used to update all words that + * are to have all their bits updated. + */ + for (rbits = nblks; rbits > 0; + rbits -= nbits, dbitno += nbits) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nbits = min(rbits, DBWORD - wbitno); + + /* check if only part of the word is to be updated. */ + if (nbits < DBWORD) { + /* update (free or allocate) the bits + * in this word. + */ + mask = + (ONES << (DBWORD - nbits) >> wbitno); + if (free) + dp->pmap[word] &= + cpu_to_le32(~mask); + else + dp->pmap[word] |= + cpu_to_le32(mask); + + word += 1; + } else { + /* one or more words are to have all + * their bits updated. determine how + * many words and how many bits. + */ + nwords = rbits >> L2DBWORD; + nbits = nwords << L2DBWORD; + + /* update (free or allocate) the bits + * in these words. + */ + if (free) + memset(&dp->pmap[word], 0, + nwords * 4); + else + memset(&dp->pmap[word], (int) ONES, + nwords * 4); + + word += nwords; + } + } + + /* + * update dmap lsn + */ + if (lblkno == lastlblkno) + continue; + + lastlblkno = lblkno; + + LOGSYNC_LOCK(log, flags); + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + + /* move bp after tblock in logsync list */ + list_move(&mp->synclist, &tblk->synclist); + } + + /* inherit younger/larger clsn */ + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else { + mp->log = log; + mp->lsn = lsn; + + /* insert bp after tblock in logsync list */ + log->count++; + list_add(&mp->synclist, &tblk->synclist); + + mp->clsn = tblk->clsn; + } + LOGSYNC_UNLOCK(log, flags); + } + + /* write the last buffer. */ + if (mp) { + write_metapage(mp); + } + + return (0); +} + + +/* + * NAME: dbNextAG() + * + * FUNCTION: find the preferred allocation group for new allocations. + * + * Within the allocation groups, we maintain a preferred + * allocation group which consists of a group with at least + * average free space. It is the preferred group that we target + * new inode allocation towards. The tie-in between inode + * allocation and block allocation occurs as we allocate the + * first (data) block of an inode and specify the inode (block) + * as the allocation hint for this block. + * + * We try to avoid having more than one open file growing in + * an allocation group, as this will lead to fragmentation. + * This differs from the old OS/2 method of trying to keep + * empty ags around for large allocations. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * the preferred allocation group number. + */ +int dbNextAG(struct inode *ipbmap) +{ + s64 avgfree; + int agpref; + s64 hwm = 0; + int i; + int next_best = -1; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + BMAP_LOCK(bmp); + + /* determine the average number of free blocks within the ags. */ + avgfree = (u32)bmp->db_nfree / bmp->db_numag; + + /* + * if the current preferred ag does not have an active allocator + * and has at least average freespace, return it + */ + agpref = bmp->db_agpref; + if ((atomic_read(&bmp->db_active[agpref]) == 0) && + (bmp->db_agfree[agpref] >= avgfree)) + goto unlock; + + /* From the last preferred ag, find the next one with at least + * average free space. + */ + for (i = 0 ; i < bmp->db_numag; i++, agpref++) { + if (agpref == bmp->db_numag) + agpref = 0; + + if (atomic_read(&bmp->db_active[agpref])) + /* open file is currently growing in this ag */ + continue; + if (bmp->db_agfree[agpref] >= avgfree) { + /* Return this one */ + bmp->db_agpref = agpref; + goto unlock; + } else if (bmp->db_agfree[agpref] > hwm) { + /* Less than avg. freespace, but best so far */ + hwm = bmp->db_agfree[agpref]; + next_best = agpref; + } + } + + /* + * If no inactive ag was found with average freespace, use the + * next best + */ + if (next_best != -1) + bmp->db_agpref = next_best; + /* else leave db_agpref unchanged */ +unlock: + BMAP_UNLOCK(bmp); + + /* return the preferred group. + */ + return (bmp->db_agpref); +} + +/* + * NAME: dbAlloc() + * + * FUNCTION: attempt to allocate a specified number of contiguous free + * blocks from the working allocation block map. + * + * the block allocation policy uses hints and a multi-step + * approach. + * + * for allocation requests smaller than the number of blocks + * per dmap, we first try to allocate the new blocks + * immediately following the hint. if these blocks are not + * available, we try to allocate blocks near the hint. if + * no blocks near the hint are available, we next try to + * allocate within the same dmap as contains the hint. + * + * if no blocks are available in the dmap or the allocation + * request is larger than the dmap size, we try to allocate + * within the same allocation group as contains the hint. if + * this does not succeed, we finally try to allocate anywhere + * within the aggregate. + * + * we also try to allocate anywhere within the aggregate for + * for allocation requests larger than the allocation group + * size or requests that specify no hint value. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number + * of the newly allocated contiguous range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) +{ + int rc, agno; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp; + struct metapage *mp; + s64 lblkno, blkno; + struct dmap *dp; + int l2nb; + s64 mapSize; + int writers; + + /* assert that nblocks is valid */ + assert(nblocks > 0); + + /* get the log2 number of blocks to be allocated. + * if the number of blocks is not a log2 multiple, + * it will be rounded up to the next log2 multiple. + */ + l2nb = BLKSTOL2(nblocks); + + bmp = JFS_SBI(ip->i_sb)->bmap; + + mapSize = bmp->db_mapsize; + + /* the hint should be within the map */ + if (hint >= mapSize) { + jfs_error(ip->i_sb, "the hint is outside the map\n"); + return -EIO; + } + + /* if the number of blocks to be allocated is greater than the + * allocation group size, try to allocate anywhere. + */ + if (l2nb > bmp->db_agl2size) { + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + rc = dbAllocAny(bmp, nblocks, l2nb, results); + + goto write_unlock; + } + + /* + * If no hint, let dbNextAG recommend an allocation group + */ + if (hint == 0) + goto pref_ag; + + /* we would like to allocate close to the hint. adjust the + * hint to the block following the hint since the allocators + * will start looking for free space starting at this point. + */ + blkno = hint + 1; + + if (blkno >= bmp->db_mapsize) + goto pref_ag; + + agno = blkno >> bmp->db_agl2size; + + /* check if blkno crosses over into a new allocation group. + * if so, check if we should allow allocations within this + * allocation group. + */ + if ((blkno & (bmp->db_agsize - 1)) == 0) + /* check if the AG is currently being written to. + * if so, call dbNextAG() to find a non-busy + * AG with sufficient free space. + */ + if (atomic_read(&bmp->db_active[agno])) + goto pref_ag; + + /* check if the allocation request size can be satisfied from a + * single dmap. if so, try to allocate from the dmap containing + * the hint using a tiered strategy. + */ + if (nblocks <= BPERDMAP) { + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* get the buffer for the dmap containing the hint. + */ + rc = -EIO; + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + goto read_unlock; + + dp = (struct dmap *) mp->data; + + /* first, try to satisfy the allocation request with the + * blocks beginning at the hint. + */ + if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks)) + != -ENOSPC) { + if (rc == 0) { + *results = blkno; + mark_metapage_dirty(mp); + } + + release_metapage(mp); + goto read_unlock; + } + + writers = atomic_read(&bmp->db_active[agno]); + if ((writers > 1) || + ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) { + /* + * Someone else is writing in this allocation + * group. To avoid fragmenting, try another ag + */ + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + goto pref_ag; + } + + /* next, try to satisfy the allocation request with blocks + * near the hint. + */ + if ((rc = + dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + goto read_unlock; + } + + /* try to satisfy the allocation request with blocks within + * the same dmap as the hint. + */ + if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + goto read_unlock; + } + + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + } + + /* try to satisfy the allocation request with blocks within + * the same allocation group as the hint. + */ + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) + goto write_unlock; + + IWRITE_UNLOCK(ipbmap); + + + pref_ag: + /* + * Let dbNextAG recommend a preferred allocation group + */ + agno = dbNextAG(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* Try to allocate within this allocation group. if that fails, try to + * allocate anywhere in the map. + */ + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC) + rc = dbAllocAny(bmp, nblocks, l2nb, results); + + write_unlock: + IWRITE_UNLOCK(ipbmap); + + return (rc); + + read_unlock: + IREAD_UNLOCK(ipbmap); + + return (rc); +} + +#ifdef _NOTYET +/* + * NAME: dbAllocExact() + * + * FUNCTION: try to allocate the requested extent; + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) +{ + int rc; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct dmap *dp; + s64 lblkno; + struct metapage *mp; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* + * validate extent request: + * + * note: defragfs policy: + * max 64 blocks will be moved. + * allocation request size must be satisfied from a single dmap. + */ + if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + return -EINVAL; + } + + if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { + /* the free space is no longer available */ + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* read in the dmap covering the extent */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* try to allocate the requested extent */ + rc = dbAllocNext(bmp, dp, blkno, nblocks); + + IREAD_UNLOCK(ipbmap); + + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); +} +#endif /* _NOTYET */ + +/* + * NAME: dbReAlloc() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. if these + * blocks are not available, this routine will attempt to + * allocate a new set of contiguous blocks large enough + * to cover the existing allocation plus the additional + * number of blocks required. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number + * of the existing allocation if the existing allocation + * was extended in place or to a newly allocated contiguous + * range if the existing allocation could not be extended + * in place. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int +dbReAlloc(struct inode *ip, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) +{ + int rc; + + /* try to extend the allocation in place. + */ + if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { + *results = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* could not extend the allocation in place, so allocate a + * new set of blocks for the entire request (i.e. try to get + * a range of contiguous blocks large enough to cover the + * existing allocation plus the additional blocks.) + */ + return (dbAlloc + (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); +} + + +/* + * NAME: dbExtend() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 lblkno, lastblkno, extblkno; + uint rel_block; + struct metapage *mp; + struct dmap *dp; + int rc; + struct inode *ipbmap = sbi->ipbmap; + struct bmap *bmp; + + /* + * We don't want a non-aligned extent to cross a page boundary + */ + if (((rel_block = blkno & (sbi->nbperpage - 1))) && + (rel_block + nblocks + addnblocks > sbi->nbperpage)) + return -ENOSPC; + + /* get the last block of the current allocation */ + lastblkno = blkno + nblocks - 1; + + /* determine the block number of the block following + * the existing allocation. + */ + extblkno = lastblkno + 1; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* better be within the file system */ + bmp = sbi->bmap; + if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + jfs_error(ip->i_sb, "the block is outside the filesystem\n"); + return -EIO; + } + + /* we'll attempt to extend the current allocation in place by + * allocating the additional blocks as the blocks immediately + * following the current allocation. we only try to extend the + * current allocation in place if the number of additional blocks + * can fit into a dmap, the last block of the current allocation + * is not the last block of the file system, and the start of the + * inplace extension is not on an allocation group boundary. + */ + if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || + (extblkno & (bmp->db_agsize - 1)) == 0) { + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* get the buffer for the dmap containing the first block + * of the extension. + */ + lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + + dp = (struct dmap *) mp->data; + + /* try to allocate the blocks immediately following the + * current allocation. + */ + rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks); + + IREAD_UNLOCK(ipbmap); + + /* were we successful ? */ + if (rc == 0) + write_metapage(mp); + else + /* we were not successful */ + release_metapage(mp); + + return (rc); +} + + +/* + * NAME: dbAllocNext() + * + * FUNCTION: attempt to allocate the blocks of the specified block + * range within a dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - starting block number of the range. + * nblocks - number of contiguous free blocks of the range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw; + int l2size; + s8 *leaf; + u32 mask; + + if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n"); + return -EIO; + } + + /* pick up a pointer to the leaves of the dmap tree. + */ + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* check if the specified block range is contained within + * this dmap. + */ + if (dbitno + nblocks > BPERDMAP) + return -ENOSPC; + + /* check if the starting leaf indicates that anything + * is free. + */ + if (leaf[word] == NOFREE) + return -ENOSPC; + + /* check the dmaps words corresponding to block range to see + * if the block range is free. not all bits of the first and + * last words may be contained within the block range. if this + * is the case, we'll work against those words (i.e. partial first + * and/or last) on an individual basis (a single pass) and examine + * the actual bits to determine if they are free. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the leaves of the dmap + * tree will be examined to determine if the blocks are free. a + * single leaf may describe the free space of multiple dmap + * words, so we may visit only a subset of the actual leaves + * corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of the word is to be examined. + */ + if (nb < DBWORD) { + /* check if the bits are free. + */ + mask = (ONES << (DBWORD - nb) >> wbitno); + if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask) + return -ENOSPC; + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and how many bits. + */ + nwords = rembits >> L2DBWORD; + nb = nwords << L2DBWORD; + + /* now examine the appropriate leaves to determine + * if the blocks are free. + */ + while (nwords > 0) { + /* does the leaf describe any free space ? + */ + if (leaf[word] < BUDMIN) + return -ENOSPC; + + /* determine the l2 number of bits provided + * by this leaf. + */ + l2size = + min_t(int, leaf[word], NLSTOL2BSZ(nwords)); + + /* determine how many words were handled. + */ + nw = BUDSIZE(l2size, BUDMIN); + + nwords -= nw; + word += nw; + } + } + } + + /* allocate the blocks. + */ + return (dbAllocDmap(bmp, dp, blkno, nblocks)); +} + + +/* + * NAME: dbAllocNear() + * + * FUNCTION: attempt to allocate a number of contiguous free blocks near + * a specified block (hint) within a dmap. + * + * starting with the dmap leaf that covers the hint, we'll + * check the next four contiguous leaves for sufficient free + * space. if sufficient free space is found, we'll allocate + * the desired free space. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - block number to allocate near. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocNear(struct bmap * bmp, + struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results) +{ + int word, lword, rc; + s8 *leaf; + + if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n"); + return -EIO; + } + + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the word within the dmap that holds the hint + * (i.e. blkno). also, determine the last word in the dmap + * that we'll include in our examination. + */ + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + lword = min(word + 4, LPERDMAP); + + /* examine the leaves for sufficient free space. + */ + for (; word < lword; word++) { + /* does the leaf describe sufficient free space ? + */ + if (leaf[word] < l2nb) + continue; + + /* determine the block number within the file system + * of the first block described by this dmap word. + */ + blkno = le64_to_cpu(dp->start) + (word << L2DBWORD); + + /* if not all bits of the dmap word are free, get the + * starting bit number within the dmap word of the required + * string of free bits and adjust the block number with the + * value. + */ + if (leaf[word] < BUDMIN) + blkno += + dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb); + + /* allocate the blocks. + */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); + } + + return -ENOSPC; +} + + +/* + * NAME: dbAllocAG() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks within the specified allocation group. + * + * unless the allocation group size is equal to the number + * of blocks per dmap, the dmap control pages will be used to + * find the required free space, if available. we start the + * search at the highest dmap control page level which + * distinctly describes the allocation group's free space + * (i.e. the highest level at which the allocation group's + * free space is not mixed in with that of any other group). + * in addition, we start the search within this level at a + * height of the dmapctl dmtree at which the nodes distinctly + * describe the allocation group's free space. at this height, + * the allocation group's free space may be represented by 1 + * or two sub-trees, depending on the allocation group size. + * we search the top nodes of these subtrees left to right for + * sufficient free space. if sufficient free space is found, + * the subtree is searched to find the leftmost leaf that + * has free space. once we have made it to the leaf, we + * move the search to the next lower level dmap control page + * corresponding to this leaf. we continue down the dmap control + * pages until we find the dmap that contains or starts the + * sufficient free space and we allocate at this dmap. + * + * if the allocation group size is equal to the dmap size, + * we'll start at the dmap corresponding to the allocation + * group and attempt the allocation at this level. + * + * the dmap control page search is also not performed if the + * allocation group is completely free and we go to the first + * dmap of the allocation group to do the allocation. this is + * done because the allocation group may be part (not the first + * part) of a larger binary buddy system, causing the dmap + * control pages to indicate no free space (NOFREE) within + * the allocation group. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * agno - allocation group number. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * note: IWRITE_LOCK(ipmap) held on entry/exit; + */ +static int +dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) +{ + struct metapage *mp; + struct dmapctl *dcp; + int rc, ti, i, k, m, n, agperlev; + s64 blkno, lblkno; + int budmin; + + /* allocation request should not be for more than the + * allocation group size. + */ + if (l2nb > bmp->db_agl2size) { + jfs_error(bmp->db_ipbmap->i_sb, + "allocation request is larger than the allocation group size\n"); + return -EIO; + } + + /* determine the starting block number of the allocation + * group. + */ + blkno = (s64) agno << bmp->db_agl2size; + + /* check if the allocation group size is the minimum allocation + * group size or if the allocation group is completely free. if + * the allocation group size is the minimum size of BPERDMAP (i.e. + * 1 dmap), there is no need to search the dmap control page (below) + * that fully describes the allocation group since the allocation + * group is already fully described by a dmap. in this case, we + * just call dbAllocCtl() to search the dmap tree and allocate the + * required space if available. + * + * if the allocation group is completely free, dbAllocCtl() is + * also called to allocate the required space. this is done for + * two reasons. first, it makes no sense searching the dmap control + * pages for free space when we know that free space exists. second, + * the dmap control pages may indicate that the allocation group + * has no free space if the allocation group is part (not the first + * part) of a larger binary buddy system. + */ + if (bmp->db_agsize == BPERDMAP + || bmp->db_agfree[agno] == bmp->db_agsize) { + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if ((rc == -ENOSPC) && + (bmp->db_agfree[agno] == bmp->db_agsize)) { + printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl failed in free AG\n"); + } + return (rc); + } + + /* the buffer for the dmap control page that fully describes the + * allocation group. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + budmin = dcp->budmin; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n"); + release_metapage(mp); + return -EIO; + } + + /* search the subtree(s) of the dmap control page that describes + * the allocation group, looking for sufficient free space. to begin, + * determine how many allocation groups are represented in a dmap + * control page at the control page level (i.e. L0, L1, L2) that + * fully describes an allocation group. next, determine the starting + * tree index of this allocation group within the control page. + */ + agperlev = + (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth; + ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); + + /* dmap control page trees fan-out by 4 and a single allocation + * group may be described by 1 or 2 subtrees within the ag level + * dmap control page, depending upon the ag size. examine the ag's + * subtrees for sufficient free space, starting with the leftmost + * subtree. + */ + for (i = 0; i < bmp->db_agwidth; i++, ti++) { + /* is there sufficient free space ? + */ + if (l2nb > dcp->stree[ti]) + continue; + + /* sufficient free space found in a subtree. now search down + * the subtree to find the leftmost leaf that describes this + * free space. + */ + for (k = bmp->db_agheight; k > 0; k--) { + for (n = 0, m = (ti << 2) + 1; n < 4; n++) { + if (l2nb <= dcp->stree[m + n]) { + ti = m + n; + break; + } + } + if (n == 4) { + jfs_error(bmp->db_ipbmap->i_sb, + "failed descending stree\n"); + release_metapage(mp); + return -EIO; + } + } + + /* determine the block number within the file system + * that corresponds to this leaf. + */ + if (bmp->db_aglevel == 2) + blkno = 0; + else if (bmp->db_aglevel == 1) + blkno &= ~(MAXL1SIZE - 1); + else /* bmp->db_aglevel == 0 */ + blkno &= ~(MAXL0SIZE - 1); + + blkno += + ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin; + + /* release the buffer in preparation for going down + * the next level of dmap control pages. + */ + release_metapage(mp); + + /* check if we need to continue to search down the lower + * level dmap control pages. we need to if the number of + * blocks required is less than maximum number of blocks + * described at the next lower level. + */ + if (l2nb < budmin) { + + /* search the lower level dmap control pages to get + * the starting block number of the dmap that + * contains or starts off the free space. + */ + if ((rc = + dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1, + &blkno))) { + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "control page inconsistent\n"); + return -EIO; + } + return (rc); + } + } + + /* allocate the blocks. + */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "unable to allocate blocks\n"); + rc = -EIO; + } + return (rc); + } + + /* no space in the allocation group. release the buffer and + * return -ENOSPC. + */ + release_metapage(mp); + + return -ENOSPC; +} + + +/* + * NAME: dbAllocAny() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks anywhere in the file system. + * + * dbAllocAny() attempts to find the sufficient free space by + * searching down the dmap control pages, starting with the + * highest level (i.e. L0, L1, L2) control page. if free space + * large enough to satisfy the desired free space is found, the + * desired free space is allocated. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) +{ + int rc; + s64 blkno = 0; + + /* starting with the top level dmap control page, search + * down the dmap control levels for sufficient free space. + * if free space is found, dbFindCtl() returns the starting + * block number of the dmap that contains or starts off the + * range of free space. + */ + if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno))) + return (rc); + + /* allocate the blocks. + */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n"); + return -EIO; + } + return (rc); +} + + +/* + * NAME: dbDiscardAG() + * + * FUNCTION: attempt to discard (TRIM) all free blocks of specific AG + * + * algorithm: + * 1) allocate blocks, as large as possible and save them + * while holding IWRITE_LOCK on ipbmap + * 2) trim all these saved block/length values + * 3) mark the blocks free again + * + * benefit: + * - we work only on one ag at some time, minimizing how long we + * need to lock ipbmap + * - reading / writing the fs is possible most time, even on + * trimming + * + * downside: + * - we write two times to the dmapctl and dmap pages + * - but for me, this seems the best way, better ideas? + * /TR 2012 + * + * PARAMETERS: + * ip - pointer to in-core inode + * agno - ag to trim + * minlen - minimum value of contiguous blocks + * + * RETURN VALUES: + * s64 - actual number of blocks trimmed + */ +s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + s64 nblocks, blkno; + u64 trimmed = 0; + int rc, l2nb; + struct super_block *sb = ipbmap->i_sb; + + struct range2trim { + u64 blkno; + u64 nblocks; + } *totrim, *tt; + + /* max blkno / nblocks pairs to trim */ + int count = 0, range_cnt; + u64 max_ranges; + + /* prevent others from writing new stuff here, while trimming */ + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + nblocks = bmp->db_agfree[agno]; + max_ranges = nblocks; + do_div(max_ranges, minlen); + range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); + totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS); + if (totrim == NULL) { + jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n"); + IWRITE_UNLOCK(ipbmap); + return 0; + } + + tt = totrim; + while (nblocks >= minlen) { + l2nb = BLKSTOL2(nblocks); + + /* 0 = okay, -EIO = fatal, -ENOSPC -> try smaller block */ + rc = dbAllocAG(bmp, agno, nblocks, l2nb, &blkno); + if (rc == 0) { + tt->blkno = blkno; + tt->nblocks = nblocks; + tt++; count++; + + /* the whole ag is free, trim now */ + if (bmp->db_agfree[agno] == 0) + break; + + /* give a hint for the next while */ + nblocks = bmp->db_agfree[agno]; + continue; + } else if (rc == -ENOSPC) { + /* search for next smaller log2 block */ + l2nb = BLKSTOL2(nblocks) - 1; + nblocks = 1 << l2nb; + } else { + /* Trim any already allocated blocks */ + jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n"); + break; + } + + /* check, if our trim array is full */ + if (unlikely(count >= range_cnt - 1)) + break; + } + IWRITE_UNLOCK(ipbmap); + + tt->nblocks = 0; /* mark the current end */ + for (tt = totrim; tt->nblocks != 0; tt++) { + /* when mounted with online discard, dbFree() will + * call jfs_issue_discard() itself */ + if (!(JFS_SBI(sb)->flag & JFS_DISCARD)) + jfs_issue_discard(ip, tt->blkno, tt->nblocks); + dbFree(ip, tt->blkno, tt->nblocks); + trimmed += tt->nblocks; + } + kfree(totrim); + + return trimmed; +} + +/* + * NAME: dbFindCtl() + * + * FUNCTION: starting at a specified dmap control page level and block + * number, search down the dmap control levels for a range of + * contiguous free blocks large enough to satisfy an allocation + * request for the specified number of free blocks. + * + * if sufficient contiguous free blocks are found, this routine + * returns the starting block number within a dmap page that + * contains or starts a range of contiqious free blocks that + * is sufficient in size. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * level - starting dmap control page level. + * l2nb - log2 number of contiguous free blocks desired. + * *blkno - on entry, starting block number for conducting the search. + * on successful return, the first block within a dmap page + * that contains or starts a range of contiguous free blocks. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) +{ + int rc, leafidx, lev; + s64 b, lblkno; + struct dmapctl *dcp; + int budmin; + struct metapage *mp; + + /* starting at the specified dmap control page level and block + * number, search down the dmap control levels for the starting + * block number of a dmap page that contains or starts off + * sufficient free blocks. + */ + for (lev = level, b = *blkno; lev >= 0; lev--) { + /* get the buffer of the dmap control page for the block + * number and level (i.e. L0, L1, L2). + */ + lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + budmin = dcp->budmin; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "Corrupt dmapctl page\n"); + release_metapage(mp); + return -EIO; + } + + /* search the tree within the dmap control page for + * sufficient free space. if sufficient free space is found, + * dbFindLeaf() returns the index of the leaf at which + * free space was found. + */ + rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx); + + /* release the buffer. + */ + release_metapage(mp); + + /* space found ? + */ + if (rc) { + if (lev != level) { + jfs_error(bmp->db_ipbmap->i_sb, + "dmap inconsistent\n"); + return -EIO; + } + return -ENOSPC; + } + + /* adjust the block number to reflect the location within + * the dmap control page (i.e. the leaf) at which free + * space was found. + */ + b += (((s64) leafidx) << budmin); + + /* we stop the search at this dmap control page level if + * the number of blocks required is greater than or equal + * to the maximum number of blocks described at the next + * (lower) level. + */ + if (l2nb >= budmin) + break; + } + + *blkno = b; + return (0); +} + + +/* + * NAME: dbAllocCtl() + * + * FUNCTION: attempt to allocate a specified number of contiguous + * blocks starting within a specific dmap. + * + * this routine is called by higher level routines that search + * the dmap control pages above the actual dmaps for contiguous + * free space. the result of successful searches by these + * routines are the starting block numbers within dmaps, with + * the dmaps themselves containing the desired contiguous free + * space or starting a contiguous free space of desired size + * that is made up of the blocks of one or more dmaps. these + * calls should not fail due to insufficent resources. + * + * this routine is called in some cases where it is not known + * whether it will fail due to insufficient resources. more + * specifically, this occurs when allocating from an allocation + * group whose size is equal to the number of blocks per dmap. + * in this case, the dmap control pages are not examined prior + * to calling this routine (to save pathlength) and the call + * might fail. + * + * for a request size that fits within a dmap, this routine relies + * upon the dmap's dmtree to find the requested contiguous free + * space. for request sizes that are larger than a dmap, the + * requested free space will start at the first block of the + * first dmap (i.e. blkno). + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks to allocate. + * l2nb - log2 number of contiguous free blocks to allocate. + * blkno - starting block number of the dmap to start the allocation + * from. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) +{ + int rc, nb; + s64 b, lblkno, n; + struct metapage *mp; + struct dmap *dp; + + /* check if the allocation request is confined to a single dmap. + */ + if (l2nb <= L2BPERDMAP) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dp = (struct dmap *) mp->data; + + /* try to allocate the blocks. + */ + rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results); + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); + } + + /* allocation request involving multiple dmaps. it must start on + * a dmap boundary. + */ + assert((blkno & (BPERDMAP - 1)) == 0); + + /* allocate the blocks dmap by dmap. + */ + for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + rc = -EIO; + goto backout; + } + dp = (struct dmap *) mp->data; + + /* the dmap better be all free. + */ + if (dp->tree.stree[ROOT] != L2BPERDMAP) { + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, + "the dmap is not all free\n"); + rc = -EIO; + goto backout; + } + + /* determine how many blocks to allocate from this dmap. + */ + nb = min_t(s64, n, BPERDMAP); + + /* allocate the blocks from the dmap. + */ + if ((rc = dbAllocDmap(bmp, dp, b, nb))) { + release_metapage(mp); + goto backout; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + /* set the results (starting block number) and return. + */ + *results = blkno; + return (0); + + /* something failed in handling an allocation request involving + * multiple dmaps. we'll try to clean up by backing out any + * allocation that has already happened for this request. if + * we fail in backing out the allocation, we'll mark the file + * system to indicate that blocks have been leaked. + */ + backout: + + /* try to backout the allocations dmap by dmap. + */ + for (n = nblocks - n, b = blkno; n > 0; + n -= BPERDMAP, b += BPERDMAP) { + /* get the buffer for this dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + jfs_error(bmp->db_ipbmap->i_sb, + "I/O Error: Block Leakage\n"); + continue; + } + dp = (struct dmap *) mp->data; + + /* free the blocks is this dmap. + */ + if (dbFreeDmap(bmp, dp, b, BPERDMAP)) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n"); + continue; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + return (rc); +} + + +/* + * NAME: dbAllocDmapLev() + * + * FUNCTION: attempt to allocate a specified number of contiguous blocks + * from a specified dmap. + * + * this routine checks if the contiguous blocks are available. + * if so, nblocks of blocks are allocated; otherwise, ENOSPC is + * returned. + * + * PARAMETERS: + * mp - pointer to bmap descriptor + * dp - pointer to dmap to attempt to allocate blocks from. + * l2nb - log2 number of contiguous block desired. + * nblocks - actual number of contiguous block desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or + * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; + */ +static int +dbAllocDmapLev(struct bmap * bmp, + struct dmap * dp, int nblocks, int l2nb, s64 * results) +{ + s64 blkno; + int leafidx, rc; + + /* can't be more than a dmaps worth of blocks */ + assert(l2nb <= L2BPERDMAP); + + /* search the tree within the dmap page for sufficient + * free space. if sufficient free space is found, dbFindLeaf() + * returns the index of the leaf at which free space was found. + */ + if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) + return -ENOSPC; + + /* determine the block number within the file system corresponding + * to the leaf at which free space was found. + */ + blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD); + + /* if not all bits of the dmap word are free, get the starting + * bit number within the dmap word of the required string of free + * bits and adjust the block number with this value. + */ + if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN) + blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb); + + /* allocate the blocks */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); +} + + +/* + * NAME: dbAllocDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine allocates the specified blocks from the dmap + * through a call to dbAllocBits(). if the allocation of the + * block range causes the maximum string of free blocks within + * the dmap to change (i.e. the value of the root of the dmap's + * dmtree), this routine will cause this change to be reflected + * up through the appropriate levels of the dmap control pages + * by a call to dbAdjCtl() for the L0 dmap control page that + * covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate the block range from. + * blkno - starting block number of the block to be allocated. + * nblocks - number of blocks to be allocated. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + s8 oldroot; + int rc; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* allocate the specified (blocks) bits */ + dbAllocBits(bmp, dp, blkno, nblocks); + + /* if the root has not changed, done. */ + if (dp->tree.stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbFreeDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine frees the specified blocks from the dmap through + * a call to dbFreeBits(). if the deallocation of the block range + * causes the maximum string of free blocks within the dmap to + * change (i.e. the value of the root of the dmap's dmtree), this + * routine will cause this change to be reflected up through the + * appropriate levels of the dmap control pages by a call to + * dbAdjCtl() for the L0 dmap control page that covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free the block range from. + * blkno - starting block number of the block to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + s8 oldroot; + int rc = 0, word; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* free the specified (blocks) bits */ + rc = dbFreeBits(bmp, dp, blkno, nblocks); + + /* if error or the root has not changed, done. */ + if (rc || (dp->tree.stree[ROOT] == oldroot)) + return (rc); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the deallocation. + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) { + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + + /* as part of backing out the deallocation, we will have + * to back split the dmap tree if the deallocation caused + * the freed blocks to become part of a larger binary buddy + * system. + */ + if (dp->tree.stree[word] == NOFREE) + dbBackSplit((dmtree_t *) & dp->tree, word); + + dbAllocBits(bmp, dp, blkno, nblocks); + } + + return (rc); +} + + +/* + * NAME: dbAllocBits() + * + * FUNCTION: allocate a specified block range from a dmap. + * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits allocated. it also causes the + * dmap's dmtree, as a whole, to reflect the allocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate bits from. + * blkno - starting block number of the bits to be allocated. + * nblocks - number of bits to be allocated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int size; + s8 *leaf; + + /* pick up a pointer to the leaves of the dmap tree */ + leaf = dp->tree.stree + LEAFIND; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + /* update the leaf for this dmap word. in addition + * to setting the leaf value to the binary buddy max + * of the updated dmap word, dbSplit() will split + * the binary system of the leaves if need be. + */ + dbSplit(tp, word, BUDMIN, + dbMaxBud((u8 *) & dp->wmap[word])); + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the allocated words. + */ + for (; nwords > 0; nwords -= nw) { + if (leaf[word] < BUDMIN) { + jfs_error(bmp->db_ipbmap->i_sb, + "leaf page corrupt\n"); + break; + } + + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being allocated and the l2 number + * of bits currently described by this leaf. + */ + size = min_t(int, leaf[word], + NLSTOL2BSZ(nwords)); + + /* update the leaf to reflect the allocation. + * in addition to setting the leaf value to + * NOFREE, dbSplit() will split the binary + * system of the leaves to reflect the current + * allocation (size). + */ + dbSplit(tp, word, size, NOFREE); + + /* get the number of dmap words handled */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap */ + le32_add_cpu(&dp->nfree, -nblocks); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the maximum allocation group number if this allocation + * group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); +} + + +/* + * NAME: dbFreeBits() + * + * FUNCTION: free a specified block range from a dmap. + * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits freed. it also causes the dmap's + * dmtree, as a whole, to reflect the deallocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free bits from. + * blkno - starting block number of the bits to be freed. + * nblocks - number of bits to be freed. + * + * RETURN VALUES: 0 for success + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int rc = 0; + int size; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap. + */ + assert(dbitno + nblocks <= BPERDMAP); + + /* free the bits of the dmaps words corresponding to the block range. + * not all bits of the first and last words may be contained within + * the block range. if this is the case, we'll work against those + * words (i.e. partial first and/or last) on an individual basis + * (a single pass), freeing the bits of interest by hand and updating + * the leaf corresponding to the dmap word. a single pass will be used + * for all dmap words fully contained within the specified range. + * within this pass, the bits of all fully contained dmap words will + * be marked as free in a single shot and the leaves will be updated. a + * single leaf may describe the free space of multiple dmap words, + * so we may update only a subset of the actual leaves corresponding + * to the dmap words of the block range. + * + * dbJoin() is used to update leaf values and will join the binary + * buddy system of the leaves if the new leaf values indicate this + * should be done. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be freed. + */ + if (nb < DBWORD) { + /* free (zero) the appropriate bits within this + * dmap word. + */ + dp->wmap[word] &= + cpu_to_le32(~(ONES << (DBWORD - nb) + >> wbitno)); + + /* update the leaf for this dmap word. + */ + rc = dbJoin(tp, word, + dbMaxBud((u8 *) & dp->wmap[word])); + if (rc) + return rc; + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and free (zero) the bits of these words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], 0, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the freed words. + */ + for (; nwords > 0; nwords -= nw) { + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being freed and the l2 (max) number + * of bits that can be described by this leaf. + */ + size = + min(LITOL2BSZ + (word, L2LPERDMAP, BUDMIN), + NLSTOL2BSZ(nwords)); + + /* update the leaf. + */ + rc = dbJoin(tp, word, size); + if (rc) + return rc; + + /* get the number of dmap words handled. + */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap. + */ + le32_add_cpu(&dp->nfree, nblocks); + + BMAP_LOCK(bmp); + + /* update the free count for the allocation group and + * map. + */ + agno = blkno >> bmp->db_agl2size; + bmp->db_nfree += nblocks; + bmp->db_agfree[agno] += nblocks; + + /* check if this allocation group is not completely free and + * if it is currently the maximum (rightmost) allocation group. + * if so, establish the new maximum allocation group number by + * searching left for the first allocation group with allocation. + */ + if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) || + (agno == bmp->db_numag - 1 && + bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) { + while (bmp->db_maxag > 0) { + bmp->db_maxag -= 1; + if (bmp->db_agfree[bmp->db_maxag] != + bmp->db_agsize) + break; + } + + /* re-establish the allocation group preference if the + * current preference is right of the maximum allocation + * group. + */ + if (bmp->db_agpref > bmp->db_maxag) + bmp->db_agpref = bmp->db_maxag; + } + + BMAP_UNLOCK(bmp); + + return 0; +} + + +/* + * NAME: dbAdjCtl() + * + * FUNCTION: adjust a dmap control page at a specified level to reflect + * the change in a lower level dmap or dmap control page's + * maximum string of free blocks (i.e. a change in the root + * of the lower level object's dmtree) due to the allocation + * or deallocation of a range of blocks with a single dmap. + * + * on entry, this routine is provided with the new value of + * the lower level dmap or dmap control page root and the + * starting block number of the block range whose allocation + * or deallocation resulted in the root change. this range + * is respresented by a single leaf of the current dmapctl + * and the leaf will be updated with this value, possibly + * causing a binary buddy system within the leaves to be + * split or joined. the update may also cause the dmapctl's + * dmtree to be updated. + * + * if the adjustment of the dmap control page, itself, causes its + * root to change, this change will be bubbled up to the next dmap + * control level by a recursive call to this routine, specifying + * the new root value and the next dmap control page level to + * be adjusted. + * PARAMETERS: + * bmp - pointer to bmap descriptor + * blkno - the first block of a block range within a dmap. it is + * the allocation or deallocation of this block range that + * requires the dmap control page to be adjusted. + * newval - the new value of the lower level dmap or dmap control + * page root. + * alloc - 'true' if adjustment is due to an allocation. + * level - current level of dmap control page (i.e. L0, L1, L2) to + * be adjusted. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) +{ + struct metapage *mp; + s8 oldroot; + int oldval; + s64 lblkno; + struct dmapctl *dcp; + int rc, leafno, ti; + + /* get the buffer for the dmap control page for the specified + * block number and control page level. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n"); + release_metapage(mp); + return -EIO; + } + + /* determine the leaf number corresponding to the block and + * the index within the dmap control tree. + */ + leafno = BLKTOCTLLEAF(blkno, dcp->budmin); + ti = leafno + le32_to_cpu(dcp->leafidx); + + /* save the current leaf value and the current root level (i.e. + * maximum l2 free string described by this dmapctl). + */ + oldval = dcp->stree[ti]; + oldroot = dcp->stree[ROOT]; + + /* check if this is a control page update for an allocation. + * if so, update the leaf to reflect the new leaf value using + * dbSplit(); otherwise (deallocation), use dbJoin() to update + * the leaf with the new value. in addition to updating the + * leaf, dbSplit() will also split the binary buddy system of + * the leaves, if required, and bubble new values within the + * dmapctl tree, if required. similarly, dbJoin() will join + * the binary buddy system of leaves and bubble new values up + * the dmapctl tree as required by the new leaf value. + */ + if (alloc) { + /* check if we are in the middle of a binary buddy + * system. this happens when we are performing the + * first allocation out of an allocation group that + * is part (not the first part) of a larger binary + * buddy system. if we are in the middle, back split + * the system prior to calling dbSplit() which assumes + * that it is at the front of a binary buddy system. + */ + if (oldval == NOFREE) { + rc = dbBackSplit((dmtree_t *) dcp, leafno); + if (rc) + return rc; + oldval = dcp->stree[ti]; + } + dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); + } else { + rc = dbJoin((dmtree_t *) dcp, leafno, newval); + if (rc) + return rc; + } + + /* check if the root of the current dmap control page changed due + * to the update and if the current dmap control page is not at + * the current top level (i.e. L0, L1, L2) of the map. if so (i.e. + * root changed and this is not the top level), call this routine + * again (recursion) for the next higher level of the mapping to + * reflect the change in root for the current dmap control page. + */ + if (dcp->stree[ROOT] != oldroot) { + /* are we below the top level of the map. if so, + * bubble the root up to the next higher level. + */ + if (level < bmp->db_maxlevel) { + /* bubble up the new root of this dmap control page to + * the next level. + */ + if ((rc = + dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc, + level + 1))) { + /* something went wrong in bubbling up the new + * root value, so backout the changes to the + * current dmap control page. + */ + if (alloc) { + dbJoin((dmtree_t *) dcp, leafno, + oldval); + } else { + /* the dbJoin() above might have + * caused a larger binary buddy system + * to form and we may now be in the + * middle of it. if this is the case, + * back split the buddies. + */ + if (dcp->stree[ti] == NOFREE) + dbBackSplit((dmtree_t *) + dcp, leafno); + dbSplit((dmtree_t *) dcp, leafno, + dcp->budmin, oldval); + } + + /* release the buffer and return the error. + */ + release_metapage(mp); + return (rc); + } + } else { + /* we're at the top level of the map. update + * the bmap control page to reflect the size + * of the maximum free buddy system. + */ + assert(level == bmp->db_maxlevel); + if (bmp->db_maxfreebud != oldroot) { + jfs_error(bmp->db_ipbmap->i_sb, + "the maximum free buddy is not the old root\n"); + } + bmp->db_maxfreebud = dcp->stree[ROOT]; + } + } + + /* write the buffer. + */ + write_metapage(mp); + + return (0); +} + + +/* + * NAME: dbSplit() + * + * FUNCTION: update the leaf of a dmtree with a new value, splitting + * the leaf from the binary buddy system of the dmtree's + * leaves, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * splitsz - the size the binary buddy system starting at the leaf + * must be split to, specified as the log2 number of blocks. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) +{ + int budsz; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* check if the leaf needs to be split. + */ + if (leaf[leafno] > tp->dmt_budmin) { + /* the split occurs by cutting the buddy system in half + * at the specified leaf until we reach the specified + * size. pick up the starting split size (current size + * - 1 in l2) and the corresponding buddy size. + */ + cursz = leaf[leafno] - 1; + budsz = BUDSIZE(cursz, tp->dmt_budmin); + + /* split until we reach the specified size. + */ + while (cursz >= splitsz) { + /* update the buddy's leaf with its new value. + */ + dbAdjTree(tp, leafno ^ budsz, cursz); + + /* on to the next size and buddy. + */ + cursz -= 1; + budsz >>= 1; + } + } + + /* adjust the dmap tree to reflect the specified leaf's new + * value. + */ + dbAdjTree(tp, leafno, newval); +} + + +/* + * NAME: dbBackSplit() + * + * FUNCTION: back split the binary buddy system of dmtree leaves + * that hold a specified leaf until the specified leaf + * starts its own binary buddy system. + * + * the allocators typically perform allocations at the start + * of binary buddy systems and dbSplit() is used to accomplish + * any required splits. in some cases, however, allocation + * may occur in the middle of a binary system and requires a + * back split, with the split proceeding out from the middle of + * the system (less efficient) rather than the start of the + * system (more efficient). the cases in which a back split + * is required are rare and are limited to the first allocation + * within an allocation group which is a part (not first part) + * of a larger binary buddy system and a few exception cases + * in which a previous join operation must be backed out. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbBackSplit(dmtree_t * tp, int leafno) +{ + int budsz, bud, w, bsz, size; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* leaf should be part (not first part) of a binary + * buddy system. + */ + assert(leaf[leafno] == NOFREE); + + /* the back split is accomplished by iteratively finding the leaf + * that starts the buddy system that contains the specified leaf and + * splitting that system in two. this iteration continues until + * the specified leaf becomes the start of a buddy system. + * + * determine maximum possible l2 size for the specified leaf. + */ + size = + LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs), + tp->dmt_budmin); + + /* determine the number of leaves covered by this size. this + * is the buddy size that we will start with as we search for + * the buddy system that contains the specified leaf. + */ + budsz = BUDSIZE(size, tp->dmt_budmin); + + /* back split. + */ + while (leaf[leafno] == NOFREE) { + /* find the leftmost buddy leaf. + */ + for (w = leafno, bsz = budsz;; bsz <<= 1, + w = (w < bud) ? w : bud) { + if (bsz >= le32_to_cpu(tp->dmt_nleafs)) { + jfs_err("JFS: block map error in dbBackSplit"); + return -EIO; + } + + /* determine the buddy. + */ + bud = w ^ bsz; + + /* check if this buddy is the start of the system. + */ + if (leaf[bud] != NOFREE) { + /* split the leaf at the start of the + * system in two. + */ + cursz = leaf[bud] - 1; + dbSplit(tp, bud, cursz, cursz); + break; + } + } + } + + if (leaf[leafno] != size) { + jfs_err("JFS: wrong leaf value in dbBackSplit"); + return -EIO; + } + return 0; +} + + +/* + * NAME: dbJoin() + * + * FUNCTION: update the leaf of a dmtree with a new value, joining + * the leaf with other leaves of the dmtree into a multi-leaf + * binary buddy system, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static int dbJoin(dmtree_t * tp, int leafno, int newval) +{ + int budsz, buddy; + s8 *leaf; + + /* can the new leaf value require a join with other leaves ? + */ + if (newval >= tp->dmt_budmin) { + /* pickup a pointer to the leaves of the tree. + */ + leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* try to join the specified leaf into a large binary + * buddy system. the join proceeds by attempting to join + * the specified leafno with its buddy (leaf) at new value. + * if the join occurs, we attempt to join the left leaf + * of the joined buddies with its buddy at new value + 1. + * we continue to join until we find a buddy that cannot be + * joined (does not have a value equal to the size of the + * last join) or until all leaves have been joined into a + * single system. + * + * get the buddy size (number of words covered) of + * the new value. + */ + budsz = BUDSIZE(newval, tp->dmt_budmin); + + /* try to join. + */ + while (budsz < le32_to_cpu(tp->dmt_nleafs)) { + /* get the buddy leaf. + */ + buddy = leafno ^ budsz; + + /* if the leaf's new value is greater than its + * buddy's value, we join no more. + */ + if (newval > leaf[buddy]) + break; + + /* It shouldn't be less */ + if (newval < leaf[buddy]) + return -EIO; + + /* check which (leafno or buddy) is the left buddy. + * the left buddy gets to claim the blocks resulting + * from the join while the right gets to claim none. + * the left buddy is also eligible to participate in + * a join at the next higher level while the right + * is not. + * + */ + if (leafno < buddy) { + /* leafno is the left buddy. + */ + dbAdjTree(tp, buddy, NOFREE); + } else { + /* buddy is the left buddy and becomes + * leafno. + */ + dbAdjTree(tp, leafno, NOFREE); + leafno = buddy; + } + + /* on to try the next join. + */ + newval += 1; + budsz <<= 1; + } + } + + /* update the leaf value. + */ + dbAdjTree(tp, leafno, newval); + + return 0; +} + + +/* + * NAME: dbAdjTree() + * + * FUNCTION: update a leaf of a dmtree with a new value, adjusting + * the dmtree, as required, to reflect the new leaf value. + * the combination of any buddies must already be done before + * this is called. + * + * PARAMETERS: + * tp - pointer to the tree to be adjusted. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static void dbAdjTree(dmtree_t * tp, int leafno, int newval) +{ + int lp, pp, k; + int max; + + /* pick up the index of the leaf for this leafno. + */ + lp = leafno + le32_to_cpu(tp->dmt_leafidx); + + /* is the current value the same as the old value ? if so, + * there is nothing to do. + */ + if (tp->dmt_stree[lp] == newval) + return; + + /* set the new value. + */ + tp->dmt_stree[lp] = newval; + + /* bubble the new value up the tree as required. + */ + for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { + /* get the index of the first leaf of the 4 leaf + * group containing the specified leaf (leafno). + */ + lp = ((lp - 1) & ~0x03) + 1; + + /* get the index of the parent of this 4 leaf group. + */ + pp = (lp - 1) >> 2; + + /* determine the maximum of the 4 leaves. + */ + max = TREEMAX(&tp->dmt_stree[lp]); + + /* if the maximum of the 4 is the same as the + * parent's value, we're done. + */ + if (tp->dmt_stree[pp] == max) + break; + + /* parent gets new value. + */ + tp->dmt_stree[pp] = max; + + /* parent becomes leaf for next go-round. + */ + lp = pp; + } +} + + +/* + * NAME: dbFindLeaf() + * + * FUNCTION: search a dmtree_t for sufficient free blocks, returning + * the index of a leaf describing the free blocks if + * sufficient free blocks are found. + * + * the search starts at the top of the dmtree_t tree and + * proceeds down the tree to the leftmost leaf with sufficient + * free space. + * + * PARAMETERS: + * tp - pointer to the tree to be searched. + * l2nb - log2 number of free blocks to search for. + * leafidx - return pointer to be set to the index of the leaf + * describing at least l2nb free blocks if sufficient + * free blocks are found. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient free blocks. + */ +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) +{ + int ti, n = 0, k, x = 0; + + /* first check the root of the tree to see if there is + * sufficient free space. + */ + if (l2nb > tp->dmt_stree[ROOT]) + return -ENOSPC; + + /* sufficient free space available. now search down the tree + * starting at the next level for the leftmost leaf that + * describes sufficient free space. + */ + for (k = le32_to_cpu(tp->dmt_height), ti = 1; + k > 0; k--, ti = ((ti + n) << 2) + 1) { + /* search the four nodes at this level, starting from + * the left. + */ + for (x = ti, n = 0; n < 4; n++) { + /* sufficient free space found. move to the next + * level (or quit if this is the last level). + */ + if (l2nb <= tp->dmt_stree[x + n]) + break; + } + + /* better have found something since the higher + * levels of the tree said it was here. + */ + assert(n < 4); + } + + /* set the return to the leftmost leaf describing sufficient + * free space. + */ + *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx); + + return (0); +} + + +/* + * NAME: dbFindBits() + * + * FUNCTION: find a specified number of binary buddy free bits within a + * dmap bitmap word value. + * + * this routine searches the bitmap value for (1 << l2nb) free + * bits at (1 << l2nb) alignments within the value. + * + * PARAMETERS: + * word - dmap bitmap word value. + * l2nb - number of free bits specified as a log2 number. + * + * RETURN VALUES: + * starting bit number of free bits. + */ +static int dbFindBits(u32 word, int l2nb) +{ + int bitno, nb; + u32 mask; + + /* get the number of bits. + */ + nb = 1 << l2nb; + assert(nb <= DBWORD); + + /* complement the word so we can use a mask (i.e. 0s represent + * free bits) and compute the mask. + */ + word = ~word; + mask = ONES << (DBWORD - nb); + + /* scan the word for nb free bits at nb alignments. + */ + for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) { + if ((mask & word) == mask) + break; + } + + ASSERT(bitno < 32); + + /* return the bit number. + */ + return (bitno); +} + + +/* + * NAME: dbMaxBud(u8 *cp) + * + * FUNCTION: determine the largest binary buddy string of free + * bits within 32-bits of the map. + * + * PARAMETERS: + * cp - pointer to the 32-bit value. + * + * RETURN VALUES: + * largest binary buddy of free bits within a dmap word. + */ +static int dbMaxBud(u8 * cp) +{ + signed char tmp1, tmp2; + + /* check if the wmap word is all free. if so, the + * free buddy size is BUDMIN. + */ + if (*((uint *) cp) == 0) + return (BUDMIN); + + /* check if the wmap word is half free. if so, the + * free buddy size is BUDMIN-1. + */ + if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0) + return (BUDMIN - 1); + + /* not all free or half free. determine the free buddy + * size thru table lookup using quarters of the wmap word. + */ + tmp1 = max(budtab[cp[2]], budtab[cp[3]]); + tmp2 = max(budtab[cp[0]], budtab[cp[1]]); + return (max(tmp1, tmp2)); +} + + +/* + * NAME: cnttz(uint word) + * + * FUNCTION: determine the number of trailing zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. + * + * RETURN VALUES: + * count of trailing zeros + */ +static int cnttz(u32 word) +{ + int n; + + for (n = 0; n < 32; n++, word >>= 1) { + if (word & 0x01) + break; + } + + return (n); +} + + +/* + * NAME: cntlz(u32 value) + * + * FUNCTION: determine the number of leading zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. + * + * RETURN VALUES: + * count of leading zeros + */ +static int cntlz(u32 value) +{ + int n; + + for (n = 0; n < 32; n++, value <<= 1) { + if (value & HIGHORDER) + break; + } + return (n); +} + + +/* + * NAME: blkstol2(s64 nb) + * + * FUNCTION: convert a block count to its log2 value. if the block + * count is not a l2 multiple, it is rounded up to the next + * larger l2 multiple. + * + * PARAMETERS: + * nb - number of blocks + * + * RETURN VALUES: + * log2 number of blocks + */ +static int blkstol2(s64 nb) +{ + int l2nb; + s64 mask; /* meant to be signed */ + + mask = (s64) 1 << (64 - 1); + + /* count the leading bits. + */ + for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) { + /* leading bit found. + */ + if (nb & mask) { + /* determine the l2 value. + */ + l2nb = (64 - 1) - l2nb; + + /* check if we need to round up. + */ + if (~mask & nb) + l2nb++; + + return (l2nb); + } + } + assert(0); + return 0; /* fix compiler warning */ +} + + +/* + * NAME: dbAllocBottomUp() + * + * FUNCTION: alloc the specified block range from the working block + * allocation map. + * + * the blocks will be alloc from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* block to be allocated better be within the mapsize. */ + ASSERT(nblocks <= bmp->db_mapsize - blkno); + + /* + * allocate the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be allocated from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + /* allocate the blocks. */ + if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) { + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int rc; + int dbitno, word, rembits, nb, nwords, wbitno, agno; + s8 oldroot; + struct dmaptree *tp = (struct dmaptree *) & dp->tree; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = tp->stree[ROOT]; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + word++; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits */ + nb = nwords << L2DBWORD; + word += nwords; + } + } + + /* update the free count for this dmap */ + le32_add_cpu(&dp->nfree, -nblocks); + + /* reconstruct summary tree */ + dbInitDmapTree(dp); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the highest active allocation group number + * if this allocation group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); + + /* if the root has not changed, done. */ + if (tp->stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbExtendFS() + * + * FUNCTION: extend bmap from blkno for nblocks; + * dbExtendFS() updates bmap ready for dbAllocBottomUp(); + * + * L2 + * | + * L1---------------------------------L1 + * | | + * L0---------L0---------L0 L0---------L0---------L0 + * | | | | | | + * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; + * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm + * + * <---old---><----------------------------extend-----------------------> + */ +int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb); + int nbperpage = sbi->nbperpage; + int i, i0 = true, j, j0 = true, k, n; + s64 newsize; + s64 p; + struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL; + struct dmapctl *l2dcp, *l1dcp, *l0dcp; + struct dmap *dp; + s8 *l0leaf, *l1leaf, *l2leaf; + struct bmap *bmp = sbi->bmap; + int agno, l2agsize, oldl2agsize; + s64 ag_rem; + + newsize = blkno + nblocks; + + jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld", + (long long) blkno, (long long) nblocks, (long long) newsize); + + /* + * initialize bmap control page. + * + * all the data in bmap control page should exclude + * the mkfs hidden dmap page. + */ + + /* update mapsize */ + bmp->db_mapsize = newsize; + bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize); + + /* compute new AG size */ + l2agsize = dbGetL2AGSize(newsize); + oldl2agsize = bmp->db_agl2size; + + bmp->db_agl2size = l2agsize; + bmp->db_agsize = 1 << l2agsize; + + /* compute new number of AG */ + agno = bmp->db_numag; + bmp->db_numag = newsize >> l2agsize; + bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; + + /* + * reconfigure db_agfree[] + * from old AG configuration to new AG configuration; + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + if (l2agsize == oldl2agsize) + goto extend; + k = 1 << (l2agsize - oldl2agsize); + ag_rem = bmp->db_agfree[0]; /* save agfree[0] */ + for (i = 0, n = 0; i < agno; n++) { + bmp->db_agfree[n] = 0; /* init collection point */ + + /* coalesce contiguous k AGs; */ + for (j = 0; j < k && i < agno; j++, i++) { + /* merge AGi to AGn */ + bmp->db_agfree[n] += bmp->db_agfree[i]; + } + } + bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */ + + for (; n < MAXAG; n++) + bmp->db_agfree[n] = 0; + + /* + * update highest active ag number + */ + + bmp->db_maxag = bmp->db_maxag / k; + + /* + * extend bmap + * + * update bit maps and corresponding level control pages; + * global control page db_nfree, db_agfree[agno], db_maxfreebud; + */ + extend: + /* get L2 page */ + p = BMAPBLKNO + nbperpage; /* L2 page */ + l2mp = read_metapage(ipbmap, p, PSIZE, 0); + if (!l2mp) { + jfs_error(ipbmap->i_sb, "L2 page could not be read\n"); + return -EIO; + } + l2dcp = (struct dmapctl *) l2mp->data; + + /* compute start L1 */ + k = blkno >> L2MAXL1SIZE; + l2leaf = l2dcp->stree + CTLLEAFIND + k; + p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */ + + /* + * extend each L1 in L2 + */ + for (; k < LPERCTL; k++, p += nbperpage) { + /* get L1 page */ + if (j0) { + /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */ + l1mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + l1dcp = (struct dmapctl *) l1mp->data; + + /* compute start L0 */ + j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE; + l1leaf = l1dcp->stree + CTLLEAFIND + j; + p = BLKTOL0(blkno, sbi->l2nbperpage); + j0 = false; + } else { + /* assign/init L1 page */ + l1mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + + l1dcp = (struct dmapctl *) l1mp->data; + + /* compute start L0 */ + j = 0; + l1leaf = l1dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st L0 of L1.k */ + } + + /* + * extend each L0 in L1 + */ + for (; j < LPERCTL; j++) { + /* get L0 page */ + if (i0) { + /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */ + + l0mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + l0dcp = (struct dmapctl *) l0mp->data; + + /* compute start dmap */ + i = (blkno & (MAXL0SIZE - 1)) >> + L2BPERDMAP; + l0leaf = l0dcp->stree + CTLLEAFIND + i; + p = BLKTODMAP(blkno, + sbi->l2nbperpage); + i0 = false; + } else { + /* assign/init L0 page */ + l0mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + + l0dcp = (struct dmapctl *) l0mp->data; + + /* compute start dmap */ + i = 0; + l0leaf = l0dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st dmap of L0.j */ + } + + /* + * extend each dmap in L0 + */ + for (; i < LPERCTL; i++) { + /* + * reconstruct the dmap page, and + * initialize corresponding parent L0 leaf + */ + if ((n = blkno & (BPERDMAP - 1))) { + /* read in dmap page: */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + n = min(nblocks, (s64)BPERDMAP - n); + } else { + /* assign/init dmap page */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + + n = min_t(s64, nblocks, BPERDMAP); + } + + dp = (struct dmap *) mp->data; + *l0leaf = dbInitDmap(dp, blkno, n); + + bmp->db_nfree += n; + agno = le64_to_cpu(dp->start) >> l2agsize; + bmp->db_agfree[agno] += n; + + write_metapage(mp); + + l0leaf++; + p += nbperpage; + + blkno += n; + nblocks -= n; + if (nblocks == 0) + break; + } /* for each dmap in a L0 */ + + /* + * build current L0 page from its leaves, and + * initialize corresponding parent L1 leaf + */ + *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i); + write_metapage(l0mp); + l0mp = NULL; + + if (nblocks) + l1leaf++; /* continue for next L0 */ + else { + /* more than 1 L0 ? */ + if (j > 0) + break; /* build L1 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l1leaf; + release_metapage(l1mp); + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L0 in a L1 */ + + /* + * build current L1 page from its leaves, and + * initialize corresponding parent L2 leaf + */ + *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j); + write_metapage(l1mp); + l1mp = NULL; + + if (nblocks) + l2leaf++; /* continue for next L1 */ + else { + /* more than 1 L1 ? */ + if (k > 0) + break; /* build L2 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l2leaf; + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L1 in a L2 */ + + jfs_error(ipbmap->i_sb, "function has not returned as expected\n"); +errout: + if (l0mp) + release_metapage(l0mp); + if (l1mp) + release_metapage(l1mp); + release_metapage(l2mp); + return -EIO; + + /* + * finalize bmap control page + */ +finalize: + + return 0; +} + + +/* + * dbFinalizeBmap() + */ +void dbFinalizeBmap(struct inode *ipbmap) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + int actags, inactags, l2nl; + s64 ag_rem, actfree, inactfree, avgfree; + int i, n; + + /* + * finalize bmap control page + */ +//finalize: + /* + * compute db_agpref: preferred ag to allocate from + * (the leftmost ag with average free space in it); + */ +//agpref: + /* get the number of active ags and inacitve ags */ + actags = bmp->db_maxag + 1; + inactags = bmp->db_numag - actags; + ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */ + + /* determine how many blocks are in the inactive allocation + * groups. in doing this, we must account for the fact that + * the rightmost group might be a partial group (i.e. file + * system size is not a multiple of the group size). + */ + inactfree = (inactags && ag_rem) ? + ((inactags - 1) << bmp->db_agl2size) + ag_rem + : inactags << bmp->db_agl2size; + + /* determine how many free blocks are in the active + * allocation groups plus the average number of free blocks + * within the active ags. + */ + actfree = bmp->db_nfree - inactfree; + avgfree = (u32) actfree / (u32) actags; + + /* if the preferred allocation group has not average free space. + * re-establish the preferred group as the leftmost + * group with average free space. + */ + if (bmp->db_agfree[bmp->db_agpref] < avgfree) { + for (bmp->db_agpref = 0; bmp->db_agpref < actags; + bmp->db_agpref++) { + if (bmp->db_agfree[bmp->db_agpref] >= avgfree) + break; + } + if (bmp->db_agpref >= bmp->db_numag) { + jfs_error(ipbmap->i_sb, + "cannot find ag with average freespace\n"); + } + } + + /* + * compute db_aglevel, db_agheight, db_width, db_agstart: + * an ag is covered in aglevel dmapctl summary tree, + * at agheight level height (from leaf) with agwidth number of nodes + * each, which starts at agstart index node of the smmary tree node + * array; + */ + bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); + l2nl = + bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); + bmp->db_agheight = l2nl >> 1; + bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1)); + for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0; + i--) { + bmp->db_agstart += n; + n <<= 2; + } + +} + + +/* + * NAME: dbInitDmap()/ujfs_idmap_page() + * + * FUNCTION: initialize working/persistent bitmap of the dmap page + * for the specified number of blocks: + * + * at entry, the bitmaps had been initialized as free (ZEROS); + * The number of blocks will only account for the actually + * existing blocks. Blocks which don't actually exist in + * the aggregate will be marked as allocated (ONES); + * + * PARAMETERS: + * dp - pointer to page of map + * nblocks - number of blocks this page + * + * RETURNS: NONE + */ +static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) +{ + int blkno, w, b, r, nw, nb, i; + + /* starting block number within the dmap */ + blkno = Blkno & (BPERDMAP - 1); + + if (blkno == 0) { + dp->nblocks = dp->nfree = cpu_to_le32(nblocks); + dp->start = cpu_to_le64(Blkno); + + if (nblocks == BPERDMAP) { + memset(&dp->wmap[0], 0, LPERDMAP * 4); + memset(&dp->pmap[0], 0, LPERDMAP * 4); + goto initTree; + } + } else { + le32_add_cpu(&dp->nblocks, nblocks); + le32_add_cpu(&dp->nfree, nblocks); + } + + /* word number containing start block number */ + w = blkno >> L2DBWORD; + + /* + * free the bits corresponding to the block range (ZEROS): + * note: not all bits of the first and last words may be contained + * within the block range. + */ + for (r = nblocks; r > 0; r -= nb, blkno += nb) { + /* number of bits preceding range to be freed in the word */ + b = blkno & (DBWORD - 1); + /* number of bits to free in the word */ + nb = min(r, DBWORD - b); + + /* is partial word to be freed ? */ + if (nb < DBWORD) { + /* free (set to 0) from the bitmap word */ + dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + + /* skip the word freed */ + w++; + } else { + /* free (set to 0) contiguous bitmap words */ + nw = r >> L2DBWORD; + memset(&dp->wmap[w], 0, nw * 4); + memset(&dp->pmap[w], 0, nw * 4); + + /* skip the words freed */ + nb = nw << L2DBWORD; + w += nw; + } + } + + /* + * mark bits following the range to be freed (non-existing + * blocks) as allocated (ONES) + */ + + if (blkno == BPERDMAP) + goto initTree; + + /* the first word beyond the end of existing blocks */ + w = blkno >> L2DBWORD; + + /* does nblocks fall on a 32-bit boundary ? */ + b = blkno & (DBWORD - 1); + if (b) { + /* mark a partial word allocated */ + dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); + w++; + } + + /* set the rest of the words in the page to allocated (ONES) */ + for (i = w; i < LPERDMAP; i++) + dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES); + + /* + * init tree + */ + initTree: + return (dbInitDmapTree(dp)); +} + + +/* + * NAME: dbInitDmapTree()/ujfs_complete_dmap() + * + * FUNCTION: initialize summary tree of the specified dmap: + * + * at entry, bitmap of the dmap has been initialized; + * + * PARAMETERS: + * dp - dmap to complete + * blkno - starting block number for this dmap + * treemax - will be filled in with max free for this dmap + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitDmapTree(struct dmap * dp) +{ + struct dmaptree *tp; + s8 *cp; + int i; + + /* init fixed info of tree */ + tp = &dp->tree; + tp->nleafs = cpu_to_le32(LPERDMAP); + tp->l2nleafs = cpu_to_le32(L2LPERDMAP); + tp->leafidx = cpu_to_le32(LEAFIND); + tp->height = cpu_to_le32(4); + tp->budmin = BUDMIN; + + /* init each leaf from corresponding wmap word: + * note: leaf is set to NOFREE(-1) if all blocks of corresponding + * bitmap word are allocated. + */ + cp = tp->stree + le32_to_cpu(tp->leafidx); + for (i = 0; i < LPERDMAP; i++) + *cp++ = dbMaxBud((u8 *) & dp->wmap[i]); + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree(tp)); +} + + +/* + * NAME: dbInitTree()/ujfs_adjtree() + * + * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. + * + * at entry, the leaves of the tree has been initialized + * from corresponding bitmap word or root of summary tree + * of the child control page; + * configure binary buddy system at the leaf level, then + * bubble up the values of the leaf nodes up the tree. + * + * PARAMETERS: + * cp - Pointer to the root of the tree + * l2leaves- Number of leaf nodes as a power of 2 + * l2min - Number of blocks that can be covered by a leaf + * as a power of 2 + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitTree(struct dmaptree * dtp) +{ + int l2max, l2free, bsize, nextb, i; + int child, parent, nparent; + s8 *tp, *cp, *cp1; + + tp = dtp->stree; + + /* Determine the maximum free string possible for the leaves */ + l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; + + /* + * configure the leaf levevl into binary buddy system + * + * Try to combine buddies starting with a buddy size of 1 + * (i.e. two leaves). At a buddy size of 1 two buddy leaves + * can be combined if both buddies have a maximum free of l2min; + * the combination will result in the left-most buddy leaf having + * a maximum free of l2min+1. + * After processing all buddies for a given size, process buddies + * at the next higher buddy size (i.e. current size * 2) and + * the next maximum free (current free + 1). + * This continues until the maximum possible buddy combination + * yields maximum free. + */ + for (l2free = dtp->budmin, bsize = 1; l2free < l2max; + l2free++, bsize = nextb) { + /* get next buddy size == current buddy pair size */ + nextb = bsize << 1; + + /* scan each adjacent buddy pair at current buddy size */ + for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx); + i < le32_to_cpu(dtp->nleafs); + i += nextb, cp += nextb) { + /* coalesce if both adjacent buddies are max free */ + if (*cp == l2free && *(cp + bsize) == l2free) { + *cp = l2free + 1; /* left take right */ + *(cp + bsize) = -1; /* right give left */ + } + } + } + + /* + * bubble summary information of leaves up the tree. + * + * Starting at the leaf node level, the four nodes described by + * the higher level parent node are compared for a maximum free and + * this maximum becomes the value of the parent node. + * when all lower level nodes are processed in this fashion then + * move up to the next level (parent becomes a lower level node) and + * continue the process for that level. + */ + for (child = le32_to_cpu(dtp->leafidx), + nparent = le32_to_cpu(dtp->nleafs) >> 2; + nparent > 0; nparent >>= 2, child = parent) { + /* get index of 1st node of parent level */ + parent = (child - 1) >> 2; + + /* set the value of the parent node as the maximum + * of the four nodes of the current level. + */ + for (i = 0, cp = tp + child, cp1 = tp + parent; + i < nparent; i++, cp += 4, cp1++) + *cp1 = TREEMAX(cp); + } + + return (*tp); +} + + +/* + * dbInitDmapCtl() + * + * function: initialize dmapctl page + */ +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) +{ /* start leaf index not covered by range */ + s8 *cp; + + dcp->nleafs = cpu_to_le32(LPERCTL); + dcp->l2nleafs = cpu_to_le32(L2LPERCTL); + dcp->leafidx = cpu_to_le32(CTLLEAFIND); + dcp->height = cpu_to_le32(5); + dcp->budmin = L2BPERDMAP + L2LPERCTL * level; + + /* + * initialize the leaves of current level that were not covered + * by the specified input block range (i.e. the leaves have no + * low level dmapctl or dmap). + */ + cp = &dcp->stree[CTLLEAFIND + i]; + for (; i < LPERCTL; i++) + *cp++ = NOFREE; + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree((struct dmaptree *) dcp)); +} + + +/* + * NAME: dbGetL2AGSize()/ujfs_getagl2size() + * + * FUNCTION: Determine log2(allocation group size) from aggregate size + * + * PARAMETERS: + * nblocks - Number of blocks in aggregate + * + * RETURNS: log2(allocation group size) in aggregate blocks + */ +static int dbGetL2AGSize(s64 nblocks) +{ + s64 sz; + s64 m; + int l2sz; + + if (nblocks < BPERDMAP * MAXAG) + return (L2BPERDMAP); + + /* round up aggregate size to power of 2 */ + m = ((u64) 1 << (64 - 1)); + for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) { + if (m & nblocks) + break; + } + + sz = (s64) 1 << l2sz; + if (sz < nblocks) + l2sz += 1; + + /* agsize = roundupSize/max_number_of_ag */ + return (l2sz - L2MAXAG); +} + + +/* + * NAME: dbMapFileSizeToMapSize() + * + * FUNCTION: compute number of blocks the block allocation map file + * can cover from the map file size; + * + * RETURNS: Number of blocks which can be covered by this block map file; + */ + +/* + * maximum number of map pages at each level including control pages + */ +#define MAXL0PAGES (1 + LPERCTL) +#define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) +#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) + +/* + * convert number of map pages to the zero origin top dmapctl level + */ +#define BMAPPGTOLEV(npages) \ + (((npages) <= 3 + MAXL0PAGES) ? 0 : \ + ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) + +s64 dbMapFileSizeToMapSize(struct inode * ipbmap) +{ + struct super_block *sb = ipbmap->i_sb; + s64 nblocks; + s64 npages, ndmaps; + int level, i; + int complete, factor; + + nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize; + npages = nblocks >> JFS_SBI(sb)->l2nbperpage; + level = BMAPPGTOLEV(npages); + + /* At each level, accumulate the number of dmap pages covered by + * the number of full child levels below it; + * repeat for the last incomplete child level. + */ + ndmaps = 0; + npages--; /* skip the first global control page */ + /* skip higher level control pages above top level covered by map */ + npages -= (2 - level); + npages--; /* skip top level's control page */ + for (i = level; i >= 0; i--) { + factor = + (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); + complete = (u32) npages / factor; + ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL : + ((i == 1) ? LPERCTL : 1)); + + /* pages in last/incomplete child */ + npages = (u32) npages % factor; + /* skip incomplete child's level control page */ + npages--; + } + + /* convert the number of dmaps into the number of blocks + * which can be covered by the dmaps; + */ + nblocks = ndmaps << L2BPERDMAP; + + return (nblocks); +} diff --git a/kernel/fs/jfs/jfs_dmap.h b/kernel/fs/jfs/jfs_dmap.h new file mode 100644 index 000000000..562b9a7e4 --- /dev/null +++ b/kernel/fs/jfs/jfs_dmap.h @@ -0,0 +1,316 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DMAP +#define _H_JFS_DMAP + +#include "jfs_txnmgr.h" + +#define BMAPVERSION 1 /* version number */ +#define TREESIZE (256+64+16+4+1) /* size of a dmap tree */ +#define LEAFIND (64+16+4+1) /* index of 1st leaf of a dmap tree */ +#define LPERDMAP 256 /* num leaves per dmap tree */ +#define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */ +#define DBWORD 32 /* # of blks covered by a map word */ +#define L2DBWORD 5 /* l2 # of blks covered by a mword */ +#define BUDMIN L2DBWORD /* max free string in a map word */ +#define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */ +#define L2BPERDMAP 13 /* l2 num of blks per dmap */ +#define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */ +#define CTLLEAFIND (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */ +#define LPERCTL 1024 /* num of leaves per dmapctl tree */ +#define L2LPERCTL 10 /* l2 num of leaves per dmapctl tree */ +#define ROOT 0 /* index of the root of a tree */ +#define NOFREE ((s8) -1) /* no blocks free */ +#define MAXAG 128 /* max number of allocation groups */ +#define L2MAXAG 7 /* l2 max num of AG */ +#define L2MINAGSZ 25 /* l2 of minimum AG size in bytes */ +#define BMAPBLKNO 0 /* lblkno of bmap within the map */ + +/* + * maximum l2 number of disk blocks at the various dmapctl levels. + */ +#define L2MAXL0SIZE (L2BPERDMAP + 1 * L2LPERCTL) +#define L2MAXL1SIZE (L2BPERDMAP + 2 * L2LPERCTL) +#define L2MAXL2SIZE (L2BPERDMAP + 3 * L2LPERCTL) + +/* + * maximum number of disk blocks at the various dmapctl levels. + */ +#define MAXL0SIZE ((s64)1 << L2MAXL0SIZE) +#define MAXL1SIZE ((s64)1 << L2MAXL1SIZE) +#define MAXL2SIZE ((s64)1 << L2MAXL2SIZE) + +#define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */ + +/* + * determine the maximum free string for four (lower level) nodes + * of the tree. + */ +static inline signed char TREEMAX(signed char *cp) +{ + signed char tmp1, tmp2; + + tmp1 = max(*(cp+2), *(cp+3)); + tmp2 = max(*(cp), *(cp+1)); + + return max(tmp1, tmp2); +} + +/* + * convert disk block number to the logical block number of the dmap + * describing the disk block. s is the log2(number of logical blocks per page) + * + * The calculation figures out how many logical pages are in front of the dmap. + * - the number of dmaps preceding it + * - the number of L0 pages preceding its L0 page + * - the number of L1 pages preceding its L1 page + * - 3 is added to account for the L2, L1, and L0 page for this dmap + * - 1 is added to account for the control page of the map. + */ +#define BLKTODMAP(b,s) \ + ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 0 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L0. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding its L1 page + * - 2 is added to account for the L2, and L1 page for this L0 + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL0(b,s) \ + (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 1 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L1. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding it + * - 1 is added to account for the L2 page + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL1(b,s) \ + (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the dmapctl + * at the specified level which describes the disk block. + */ +#define BLKTOCTL(b,s,l) \ + (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) + +/* + * convert aggregate map size to the zero origin dmapctl level of the + * top dmapctl. + */ +#define BMAPSZTOLEV(size) \ + (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2) + +/* convert disk block number to allocation group number. + */ +#define BLKTOAG(b,sbi) ((b) >> ((sbi)->bmap->db_agl2size)) + +/* convert allocation group number to starting disk block + * number. + */ +#define AGTOBLK(a,ip) \ + ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size)) + +/* + * dmap summary tree + * + * dmaptree must be consistent with dmapctl. + */ +struct dmaptree { + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of first tree leaf */ + __le32 height; /* 4: height of the tree */ + s8 budmin; /* 1: min l2 tree leaf value to combine */ + s8 stree[TREESIZE]; /* TREESIZE: tree */ + u8 pad[2]; /* 2: pad to word boundary */ +}; /* - 360 - */ + +/* + * dmap page per 8K blocks bitmap + */ +struct dmap { + __le32 nblocks; /* 4: num blks covered by this dmap */ + __le32 nfree; /* 4: num of free blks in this dmap */ + __le64 start; /* 8: starting blkno for this dmap */ + struct dmaptree tree; /* 360: dmap tree */ + u8 pad[1672]; /* 1672: pad to 2048 bytes */ + __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ + __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ +}; /* - 4096 - */ + +/* + * disk map control page per level. + * + * dmapctl must be consistent with dmaptree. + */ +struct dmapctl { + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of the first tree leaf */ + __le32 height; /* 4: height of tree */ + s8 budmin; /* 1: minimum l2 tree leaf value */ + s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ + u8 pad[2714]; /* 2714: pad to 4096 */ +}; /* - 4096 - */ + +/* + * common definition for dmaptree within dmap and dmapctl + */ +typedef union dmtree { + struct dmaptree t1; + struct dmapctl t2; +} dmtree_t; + +/* macros for accessing fields within dmtree */ +#define dmt_nleafs t1.nleafs +#define dmt_l2nleafs t1.l2nleafs +#define dmt_leafidx t1.leafidx +#define dmt_height t1.height +#define dmt_budmin t1.budmin +#define dmt_stree t1.stree + +/* + * on-disk aggregate disk allocation map descriptor. + */ +struct dbmap_disk { + __le64 dn_mapsize; /* 8: number of blocks in aggregate */ + __le64 dn_nfree; /* 8: num free blks in aggregate map */ + __le32 dn_l2nbperpage; /* 4: number of blks per page */ + __le32 dn_numag; /* 4: total number of ags */ + __le32 dn_maxlevel; /* 4: number of active ags */ + __le32 dn_maxag; /* 4: max active alloc group number */ + __le32 dn_agpref; /* 4: preferred alloc group (hint) */ + __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ + __le32 dn_agheight; /* 4: height in dmapctl of the AG */ + __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ + __le32 dn_agstart; /* 4: start tree index at AG height */ + __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ + __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ + __le64 dn_agsize; /* 8: num of blks per alloc group */ + s8 dn_maxfreebud; /* 1: max free buddy system */ + u8 pad[3007]; /* 3007: pad to 4096 */ +}; /* - 4096 - */ + +struct dbmap { + s64 dn_mapsize; /* number of blocks in aggregate */ + s64 dn_nfree; /* num free blks in aggregate map */ + int dn_l2nbperpage; /* number of blks per page */ + int dn_numag; /* total number of ags */ + int dn_maxlevel; /* number of active ags */ + int dn_maxag; /* max active alloc group number */ + int dn_agpref; /* preferred alloc group (hint) */ + int dn_aglevel; /* dmapctl level holding the AG */ + int dn_agheight; /* height in dmapctl of the AG */ + int dn_agwidth; /* width in dmapctl of the AG */ + int dn_agstart; /* start tree index at AG height */ + int dn_agl2size; /* l2 num of blks per alloc group */ + s64 dn_agfree[MAXAG]; /* per AG free count */ + s64 dn_agsize; /* num of blks per alloc group */ + signed char dn_maxfreebud; /* max free buddy system */ +}; /* - 4096 - */ +/* + * in-memory aggregate disk allocation map descriptor. + */ +struct bmap { + struct dbmap db_bmap; /* on-disk aggregate map descriptor */ + struct inode *db_ipbmap; /* ptr to aggregate map incore inode */ + struct mutex db_bmaplock; /* aggregate map lock */ + atomic_t db_active[MAXAG]; /* count of active, open files in AG */ + u32 *db_DBmap; +}; + +/* macros for accessing fields within in-memory aggregate map descriptor */ +#define db_mapsize db_bmap.dn_mapsize +#define db_nfree db_bmap.dn_nfree +#define db_agfree db_bmap.dn_agfree +#define db_agsize db_bmap.dn_agsize +#define db_agl2size db_bmap.dn_agl2size +#define db_agwidth db_bmap.dn_agwidth +#define db_agheight db_bmap.dn_agheight +#define db_agstart db_bmap.dn_agstart +#define db_numag db_bmap.dn_numag +#define db_maxlevel db_bmap.dn_maxlevel +#define db_aglevel db_bmap.dn_aglevel +#define db_agpref db_bmap.dn_agpref +#define db_maxag db_bmap.dn_maxag +#define db_maxfreebud db_bmap.dn_maxfreebud +#define db_l2nbperpage db_bmap.dn_l2nbperpage + +/* + * macros for various conversions needed by the allocators. + * blkstol2(), cntlz(), and cnttz() are operating system dependent functions. + */ +/* convert number of blocks to log2 number of blocks, rounding up to + * the next log2 value if blocks is not a l2 multiple. + */ +#define BLKSTOL2(d) (blkstol2(d)) + +/* convert number of leafs to log2 leaf value */ +#define NLSTOL2BSZ(n) (31 - cntlz((n)) + BUDMIN) + +/* convert leaf index to log2 leaf value */ +#define LITOL2BSZ(n,m,b) ((((n) == 0) ? (m) : cnttz((n))) + (b)) + +/* convert a block number to a dmap control leaf index */ +#define BLKTOCTLLEAF(b,m) \ + (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m)) + +/* convert log2 leaf value to buddy size */ +#define BUDSIZE(s,m) (1 << ((s) - (m))) + +/* + * external references. + */ +extern int dbMount(struct inode *ipbmap); + +extern int dbUnmount(struct inode *ipbmap, int mounterror); + +extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks); + +extern int dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk); + +extern int dbNextAG(struct inode *ipbmap); + +extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results); + +extern int dbReAlloc(struct inode *ipbmap, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results); + +extern int dbSync(struct inode *ipbmap); +extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks); +extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks); +extern void dbFinalizeBmap(struct inode *ipbmap); +extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +extern s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen); + +#endif /* _H_JFS_DMAP */ diff --git a/kernel/fs/jfs/jfs_dtree.c b/kernel/fs/jfs/jfs_dtree.c new file mode 100644 index 000000000..d88576e23 --- /dev/null +++ b/kernel/fs/jfs/jfs_dtree.c @@ -0,0 +1,4576 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_dtree.c: directory B+-tree manager + * + * B+-tree with variable length key directory: + * + * each directory page is structured as an array of 32-byte + * directory entry slots initialized as a freelist + * to avoid search/compaction of free space at insertion. + * when an entry is inserted, a number of slots are allocated + * from the freelist as required to store variable length data + * of the entry; when the entry is deleted, slots of the entry + * are returned to freelist. + * + * leaf entry stores full name as key and file serial number + * (aka inode number) as data. + * internal/router entry stores sufffix compressed name + * as key and simple extent descriptor as data. + * + * each directory page maintains a sorted entry index table + * which stores the start slot index of sorted entries + * to allow binary search on the table. + * + * directory starts as a root/leaf page in on-disk inode + * inline data area. + * when it becomes full, it starts a leaf of a external extent + * of length of 1 block. each time the first leaf becomes full, + * it is extended rather than split (its size is doubled), + * until its length becoms 4 KBytes, from then the extent is split + * with new 4 Kbyte extent when it becomes full + * to reduce external fragmentation of small directories. + * + * blah, blah, blah, for linear scan of directory in pieces by + * readdir(). + * + * + * case-insensitive directory file system + * + * names are stored in case-sensitive way in leaf entry. + * but stored, searched and compared in case-insensitive (uppercase) order + * (i.e., both search key and entry key are folded for search/compare): + * (note that case-sensitive order is BROKEN in storage, e.g., + * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad + * + * entries which folds to the same key makes up a equivalent class + * whose members are stored as contiguous cluster (may cross page boundary) + * but whose order is arbitrary and acts as duplicate, e.g., + * abc, Abc, aBc, abC) + * + * once match is found at leaf, requires scan forward/backward + * either for, in case-insensitive search, duplicate + * or for, in case-sensitive search, for exact match + * + * router entry must be created/stored in case-insensitive way + * in internal entry: + * (right most key of left page and left most key of right page + * are folded, and its suffix compression is propagated as router + * key in parent) + * (e.g., if split occurs and , trather than + * should be made the router key for the split) + * + * case-insensitive search: + * + * fold search key; + * + * case-insensitive search of B-tree: + * for internal entry, router key is already folded; + * for leaf entry, fold the entry key before comparison. + * + * if (leaf entry case-insensitive match found) + * if (next entry satisfies case-insensitive match) + * return EDUPLICATE; + * if (prev entry satisfies case-insensitive match) + * return EDUPLICATE; + * return match; + * else + * return no match; + * + * serialization: + * target directory inode lock is being held on entry/exit + * of all main directory service routines. + * + * log based recovery: + */ + +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* dtree split parameter */ +struct dtsplit { + struct metapage *mp; + s16 index; + s16 nslot; + struct component_name *key; + ddata_t *data; + struct pxdlist *pxdlist; +}; + +#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) + +/* get page buffer for specified block address */ +#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ +do { \ + BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \ + if (!(RC)) { \ + if (((P)->header.nextindex > \ + (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ + ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ + BT_PUTPAGE(MP); \ + jfs_error((IP)->i_sb, \ + "DT_GETPAGE: dtree page corrupt\n"); \ + MP = NULL; \ + RC = -EIO; \ + } \ + } \ +} while (0) + +/* for consistency */ +#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) + +/* + * forward references + */ +static int dtSplitUp(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); + +static int dtExtendPage(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitRoot(tid_t tid, struct inode *ip, + struct dtsplit * split, struct metapage ** rmpp); + +static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + dtpage_t * fp, struct btstack * btstack); + +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); + +static int dtReadFirst(struct inode *ip, struct btstack * btstack); + +static int dtReadNext(struct inode *ip, + loff_t * offset, struct btstack * btstack); + +static int dtCompare(struct component_name * key, dtpage_t * p, int si); + +static int ciCompare(struct component_name * key, dtpage_t * p, int si, + int flag); + +static void dtGetKey(dtpage_t * p, int i, struct component_name * key, + int flag); + +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag); + +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock **); + +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index); + +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); + +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); + +static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); + +#define ciToUpper(c) UniStrupr((c)->name) + +/* + * read_index_page() + * + * Reads a page of a directory's index table. + * Having metadata mapped into the directory inode's address space + * presents a multitude of problems. We avoid this by mapping to + * the absolute address space outside of the *_metapage routines + */ +static struct metapage *read_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xaddr == 0)) + return NULL; + + return read_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * get_index_page() + * + * Same as get_index_page(), but get's a new page without reading + */ +static struct metapage *get_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xaddr == 0)) + return NULL; + + return get_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * find_index() + * + * Returns dtree page containing directory table entry for specified + * index and pointer to its entry. + * + * mp must be released by caller. + */ +static struct dir_table_slot *find_index(struct inode *ip, u32 index, + struct metapage ** mp, s64 *lblock) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + s64 blkno; + s64 offset; + int page_offset; + struct dir_table_slot *slot; + static int maxWarnings = 10; + + if (index < 2) { + if (maxWarnings) { + jfs_warn("find_entry called with index = %d", index); + maxWarnings--; + } + return NULL; + } + + if (index >= jfs_ip->next_index) { + jfs_warn("find_entry called with index >= next_index"); + return NULL; + } + + if (jfs_dirtable_inline(ip)) { + /* + * Inline directory table + */ + *mp = NULL; + slot = &jfs_ip->i_dirtable[index - 2]; + } else { + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << + JFS_SBI(ip->i_sb)->l2nbperpage; + + if (*mp && (*lblock != blkno)) { + release_metapage(*mp); + *mp = NULL; + } + if (!(*mp)) { + *lblock = blkno; + *mp = read_index_page(ip, blkno); + } + if (!(*mp)) { + jfs_err("free_index: error reading directory table"); + return NULL; + } + + slot = + (struct dir_table_slot *) ((char *) (*mp)->data + + page_offset); + } + return slot; +} + +static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, + u32 index) +{ + struct tlock *tlck; + struct linelock *llck; + struct lv *lv; + + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) tlck->lock; + + if (llck->index >= llck->maxcnt) + llck = txLinelock(llck); + lv = &llck->lv[llck->index]; + + /* + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. + */ + lv->offset = ((index - 2) & 511) >> 1; + lv->length = 1; + llck->index++; +} + +/* + * add_index() + * + * Adds an entry to the directory index table. This is used to provide + * each directory entry with a persistent index in which to resume + * directory traversals + */ +static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) +{ + struct super_block *sb = ip->i_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + u64 blkno; + struct dir_table_slot *dirtab_slot; + u32 index; + struct linelock *llck; + struct lv *lv; + struct metapage *mp; + s64 offset; + uint page_offset; + struct tlock *tlck; + s64 xaddr; + + ASSERT(DO_INDEX(ip)); + + if (jfs_ip->next_index < 2) { + jfs_warn("add_index: next_index = %d. Resetting!", + jfs_ip->next_index); + jfs_ip->next_index = 2; + } + + index = jfs_ip->next_index++; + + if (index <= MAX_INLINE_DIRTABLE_ENTRY) { + /* + * i_size reflects size of index table, or 8 bytes per entry. + */ + ip->i_size = (loff_t) (index - 1) << 3; + + /* + * dir table fits inline within inode + */ + dirtab_slot = &jfs_ip->i_dirtable[index-2]; + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + set_cflag(COMMIT_Dirtable, ip); + + return index; + } + if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { + struct dir_table_slot temp_table[12]; + + /* + * It's time to move the inline table to an external + * page and begin to build the xtree + */ + if (dquot_alloc_block(ip, sbi->nbperpage)) + goto clean_up; + if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { + dquot_free_block(ip, sbi->nbperpage); + goto clean_up; + } + + /* + * Save the table, we're going to overwrite it with the + * xtree root + */ + memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); + + /* + * Initialize empty x-tree + */ + xtInitRoot(tid, ip); + + /* + * Add the first block to the xtree + */ + if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { + /* This really shouldn't fail */ + jfs_warn("add_index: xtInsert failed!"); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + dbFree(ip, xaddr, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); + goto clean_up; + } + ip->i_size = PSIZE; + + mp = get_index_page(ip, 0); + if (!mp) { + jfs_err("add_index: get_metapage failed!"); + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + goto clean_up; + } + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) & tlck->lock; + ASSERT(llck->index == 0); + lv = &llck->lv[0]; + + lv->offset = 0; + lv->length = 6; /* tlckDATA slot size is 16 bytes */ + llck->index++; + + memcpy(mp->data, temp_table, sizeof(temp_table)); + + mark_metapage_dirty(mp); + release_metapage(mp); + + /* + * Logging is now directed by xtree tlocks + */ + clear_cflag(COMMIT_Dirtable, ip); + } + + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; + if (page_offset == 0) { + /* + * This will be the beginning of a new page + */ + xaddr = 0; + if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { + jfs_warn("add_index: xtInsert failed!"); + goto clean_up; + } + ip->i_size += PSIZE; + + if ((mp = get_index_page(ip, blkno))) + memset(mp->data, 0, PSIZE); /* Just looks better */ + else + xtTruncate(tid, ip, offset, COMMIT_PWMAP); + } else + mp = read_index_page(ip, blkno); + + if (!mp) { + jfs_err("add_index: get/read_metapage failed!"); + goto clean_up; + } + + lock_index(tid, ip, mp, index); + + dirtab_slot = + (struct dir_table_slot *) ((char *) mp->data + page_offset); + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + mark_metapage_dirty(mp); + release_metapage(mp); + + return index; + + clean_up: + + jfs_ip->next_index--; + + return 0; +} + +/* + * free_index() + * + * Marks an entry to the directory index table as free. + */ +static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) +{ + struct dir_table_slot *dirtab_slot; + s64 lblock; + struct metapage *mp = NULL; + + dirtab_slot = find_index(ip, index, &mp, &lblock); + + if (!dirtab_slot) + return; + + dirtab_slot->flag = DIR_INDEX_FREE; + dirtab_slot->slot = dirtab_slot->addr1 = 0; + dirtab_slot->addr2 = cpu_to_le32(next); + + if (mp) { + lock_index(tid, ip, mp, index); + mark_metapage_dirty(mp); + release_metapage(mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * modify_index() + * + * Changes an entry in the directory index table + */ +static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, + int slot, struct metapage ** mp, s64 *lblock) +{ + struct dir_table_slot *dirtab_slot; + + dirtab_slot = find_index(ip, index, mp, lblock); + + if (!dirtab_slot) + return; + + DTSaddress(dirtab_slot, bn); + dirtab_slot->slot = slot; + + if (*mp) { + lock_index(tid, ip, *mp, index); + mark_metapage_dirty(*mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * read_index() + * + * reads a directory table slot + */ +static int read_index(struct inode *ip, u32 index, + struct dir_table_slot * dirtab_slot) +{ + s64 lblock; + struct metapage *mp = NULL; + struct dir_table_slot *slot; + + slot = find_index(ip, index, &mp, &lblock); + if (!slot) { + return -EIO; + } + + memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); + + if (mp) + release_metapage(mp); + + return 0; +} + +/* + * dtSearch() + * + * function: + * Search for the entry with specified key + * + * parameter: + * + * return: 0 - search result on stack, leaf page pinned; + * errno - I/O error + */ +int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, + struct btstack * btstack, int flag) +{ + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + int base, index, lim; + struct btframe *btsp; + pxd_t *pxd; + int psize = 288; /* initial in-line directory */ + ino_t inumber; + struct component_name ciKey; + struct super_block *sb = ip->i_sb; + + ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS); + if (!ciKey.name) { + rc = -ENOMEM; + goto dtSearch_Exit2; + } + + + /* uppercase search key for c-i directory */ + UniStrcpy(ciKey.name, key->name); + ciKey.namlen = key->namlen; + + /* only uppercase if case-insensitive support is on */ + if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { + ciToUpper(&ciKey); + } + BT_CLR(btstack); /* reset stack */ + + /* init level count for max pages to split */ + btstack->nsplit = 1; + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + goto dtSearch_Exit1; + + /* get sorted entry table of the page */ + stbl = DT_GETSTBL(p); + + /* + * binary search with search key K on the current page. + */ + for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { + index = base + (lim >> 1); + + if (p->header.flag & BT_LEAF) { + /* uppercase leaf name to compare */ + cmp = + ciCompare(&ciKey, p, stbl[index], + JFS_SBI(sb)->mntflag); + } else { + /* router key is in uppercase */ + + cmp = dtCompare(&ciKey, p, stbl[index]); + + + } + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + inumber = le32_to_cpu( + ((struct ldtentry *) & p->slot[stbl[index]])->inumber); + + /* + * search for JFS_LOOKUP + */ + if (flag == JFS_LOOKUP) { + *data = inumber; + rc = 0; + goto out; + } + + /* + * search for JFS_CREATE + */ + if (flag == JFS_CREATE) { + *data = inumber; + rc = -EEXIST; + goto out; + } + + /* + * search for JFS_REMOVE or JFS_RENAME + */ + if ((flag == JFS_REMOVE || + flag == JFS_RENAME) && + *data != inumber) { + rc = -ESTALE; + goto out; + } + + /* + * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME + */ + /* save search result */ + *data = inumber; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* search hit - internal page: + * descend/search its child page + */ + goto getChild; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or (maxindex + 1) index. + */ + /* + * search miss - leaf page + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + /* + * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME + */ + if (flag == JFS_LOOKUP || flag == JFS_REMOVE || + flag == JFS_RENAME) { + rc = -ENOENT; + goto out; + } + + /* + * search for JFS_CREATE|JFS_FINDDIR: + * + * save search result + */ + *data = 0; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* + * search miss - internal page + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + getChild: + /* update max. number of pages to split */ + if (BT_STACK_FULL(btstack)) { + /* Something's corrupted, mark filesystem dirty so + * chkdsk will fix it. + */ + jfs_error(sb, "stack overrun!\n"); + BT_STACK_DUMP(btstack); + rc = -EIO; + goto out; + } + btstack->nsplit++; + + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + pxd = (pxd_t *) & p->slot[stbl[index]]; + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + out: + DT_PUTPAGE(mp); + + dtSearch_Exit1: + + kfree(ciKey.name); + + dtSearch_Exit2: + + return rc; +} + + +/* + * dtInsert() + * + * function: insert an entry to directory tree + * + * parameter: + * + * return: 0 - success; + * errno - failure; + */ +int dtInsert(tid_t tid, struct inode *ip, + struct component_name * name, ino_t * fsn, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + dtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index; + struct dtsplit split; /* split information */ + ddata_t data; + struct dt_lock *dtlck; + int n; + struct tlock *tlck; + struct lv *lv; + + /* + * retrieve search result + * + * dtSearch() returns (leaf page pinned, index at which to insert). + * n.b. dtSearch() may return index of (maxindex + 1) of + * the full page. + */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* + * insert entry for new key + */ + if (DO_INDEX(ip)) { + if (JFS_IP(ip)->next_index == DIREND) { + DT_PUTPAGE(mp); + return -EMLINK; + } + n = NDTLEAF(name->namlen); + data.leaf.tid = tid; + data.leaf.ip = ip; + } else { + n = NDTLEAF_LEGACY(name->namlen); + data.leaf.ip = NULL; /* signifies legacy directory format */ + } + data.leaf.ino = *fsn; + + /* + * leaf page does not have enough room for new entry: + * + * extend/split the leaf page; + * + * dtSplitUp() will insert the entry and unpin the leaf page. + */ + if (n > p->header.freecnt) { + split.mp = mp; + split.index = index; + split.nslot = n; + split.key = name; + split.data = &data; + rc = dtSplitUp(tid, ip, &split, btstack); + return rc; + } + + /* + * leaf page does have enough room for new entry: + * + * insert the new data entry into the leaf page; + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + dtInsertEntry(p, index, name, &data, &dtlck); + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + n = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + n; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} + + +/* + * dtSplitUp() + * + * function: propagate insertion bottom up; + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * leaf page unpinned; + */ +static int dtSplitUp(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int rc = 0; + struct metapage *smp; + dtpage_t *sp; /* split page */ + struct metapage *rmp; + dtpage_t *rp; /* new right page split from sp */ + pxd_t rpxd; /* new right page extent descriptor */ + struct metapage *lmp; + dtpage_t *lp; /* left child page */ + int skip; /* index of entry of insertion */ + struct btframe *parent; /* parent page entry on traverse stack */ + s64 xaddr, nxaddr; + int xlen, xsize; + struct pxdlist pxdlist; + pxd_t *pxd; + struct component_name key = { 0, NULL }; + ddata_t *data = split->data; + int n; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int quota_allocation = 0; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS); + if (!key.name) { + DT_PUTPAGE(smp); + rc = -ENOMEM; + goto dtSplitUp_Exit; + } + + /* + * split leaf page + * + * The split routines insert the new entry, and + * acquire txLock as appropriate. + */ + /* + * split root leaf page: + */ + if (sp->header.flag & BT_ROOT) { + /* + * allocate a single extent child page + */ + xlen = 1; + n = sbi->bsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ + if (n <= split->nslot) + xlen++; + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { + DT_PUTPAGE(smp); + goto freeKeyName; + } + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + split->pxdlist = &pxdlist; + rc = dtSplitRoot(tid, ip, split, &rmp); + + if (rc) + dbFree(ip, xaddr, xlen); + else + DT_PUTPAGE(rmp); + + DT_PUTPAGE(smp); + + if (!DO_INDEX(ip)) + ip->i_size = xlen << sbi->l2bsize; + + goto freeKeyName; + } + + /* + * extend first leaf page + * + * extend the 1st extent if less than buffer page size + * (dtExtendPage() reurns leaf page unpinned) + */ + pxd = &sp->header.self; + xlen = lengthPXD(pxd); + xsize = xlen << sbi->l2bsize; + if (xsize < PSIZE) { + xaddr = addressPXD(pxd); + n = xsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + if ((n + sp->header.freecnt) <= split->nslot) + n = xlen + (xlen << 1); + else + n = xlen; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, n); + if (rc) + goto extendOut; + quota_allocation += n; + + if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, + (s64) n, &nxaddr))) + goto extendOut; + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, nxaddr); + PXDlength(pxd, xlen + n); + split->pxdlist = &pxdlist; + if ((rc = dtExtendPage(tid, ip, split, btstack))) { + nxaddr = addressPXD(pxd); + if (xaddr != nxaddr) { + /* free relocated extent */ + xlen = lengthPXD(pxd); + dbFree(ip, nxaddr, (s64) xlen); + } else { + /* free extended delta */ + xlen = lengthPXD(pxd) - n; + xaddr = addressPXD(pxd) + xlen; + dbFree(ip, xaddr, (s64) n); + } + } else if (!DO_INDEX(ip)) + ip->i_size = lengthPXD(pxd) << sbi->l2bsize; + + + extendOut: + DT_PUTPAGE(smp); + goto freeKeyName; + } + + /* + * split leaf page into and a new right page . + * + * return pinned and its extent descriptor + */ + /* + * allocate new directory page extent and + * new index page(s) to cover page split(s) + * + * allocation hint: ? + */ + n = btstack->nsplit; + pxdlist.maxnpxd = pxdlist.npxd = 0; + xlen = sbi->nbperpage; + for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + pxdlist.maxnpxd++; + continue; + } + + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + split->pxdlist = &pxdlist; + if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + if (!DO_INDEX(ip)) + ip->i_size += PSIZE; + + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 4 pages pinned at any time: + * two children, left parent and right parent (when the parent splits). + * keep the child pages pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame */ + + /* keep current child pages (, ) pinned */ + lmp = smp; + lp = sp; + + /* + * insert router entry in parent for new right child page + */ + /* get the parent page */ + DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + goto splitOut; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * compute the key for the router entry + * + * key suffix compression: + * for internal pages that have leaf pages as children, + * retain only what's needed to distinguish between + * the new entry and the entry on the page to its left. + * If the keys compare equal, retain the entire key. + * + * note that compression is performed only at computing + * router key at the lowest internal level. + * further compression of the key between pairs of higher + * level internal pages loses too much information and + * the search may fail. + * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} + * results in two adjacent parent entries (a)(xx). + * if split occurs between these two entries, and + * if compression is applied, the router key of parent entry + * of right page (x) will divert search for x into right + * subtree and miss x in the left subtree.) + * + * the entire key must be retained for the next-to-leftmost + * internal key at any level of the tree, or search may fail + * (e.g., ?) + */ + switch (rp->header.flag & BT_TYPE) { + case BT_LEAF: + /* + * compute the length of prefix for suffix compression + * between last entry of left page and first entry + * of right page + */ + if ((sp->header.flag & BT_ROOT && skip > 1) || + sp->header.prev != 0 || skip > 1) { + /* compute uppercase router prefix key */ + rc = ciGetLeafPrefixKey(lp, + lp->header.nextindex-1, + rp, 0, &key, + sbi->mntflag); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + DT_PUTPAGE(smp); + goto splitOut; + } + } else { + /* next to leftmost entry of + lowest internal level */ + + /* compute uppercase router key */ + dtGetKey(rp, 0, &key, sbi->mntflag); + key.name[key.namlen] = 0; + + if ((sbi->mntflag & JFS_OS2) == JFS_OS2) + ciToUpper(&key); + } + + n = NDTINTERNAL(key.namlen); + break; + + case BT_INTERNAL: + dtGetKey(rp, 0, &key, sbi->mntflag); + n = NDTINTERNAL(key.namlen); + break; + + default: + jfs_err("dtSplitUp(): UFO!"); + break; + } + + /* unpin left child page */ + DT_PUTPAGE(lmp); + + /* + * compute the data for the router entry + */ + data->xd = rpxd; /* child page xd */ + + /* + * parent page is full - split the parent page + */ + if (n > sp->header.freecnt) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->nslot = n; + split->key = &key; + /* split->data = data; */ + + /* unpin right child page */ + DT_PUTPAGE(rmp); + + /* The split routines insert the new entry, + * acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? + dtSplitRoot(tid, ip, split, &rmp) : + dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); + if (rc) { + DT_PUTPAGE(smp); + goto splitOut; + } + + /* smp and rmp are pinned */ + } + /* + * parent page is not full - insert router entry in parent page + */ + else { + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the parent page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root parent page */ + if (!(sp->header.flag & BT_ROOT)) { + lv++; + n = skip >> L2DTSLOTSIZE; + lv->offset = sp->header.stblindex + n; + lv->length = + ((sp->header.nextindex - + 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + dtInsertEntry(sp, skip, &key, data, &dtlck); + + /* exit propagate up */ + break; + } + } + + /* unpin current split and its right page */ + DT_PUTPAGE(smp); + DT_PUTPAGE(rmp); + + /* + * free remaining extents allocated for split + */ + splitOut: + n = pxdlist.npxd; + pxd = &pxdlist.pxd[n]; + for (; n < pxdlist.maxnpxd; n++, pxd++) + dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); + + freeKeyName: + kfree(key.name); + + /* Rollback quota allocation */ + if (rc && quota_allocation) + dquot_free_block(ip, quota_allocation); + + dtSplitUp_Exit: + + return rc; +} + + +/* + * dtSplitPage() + * + * function: Split a non-root page of a btree. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return split and new page pinned; + */ +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) +{ + int rc = 0; + struct metapage *smp; + dtpage_t *sp; + struct metapage *rmp; + dtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + dtpage_t *p; + s64 nextbn; + struct pxdlist *pxdlist; + pxd_t *pxd; + int skip, nextindex, half, left, nxt, off, si; + struct ldtentry *ldtentry; + struct idtentry *idtentry; + u8 *stbl; + struct dtslot *f; + int fsi, stblsize; + int n; + struct dt_lock *sdtlck, *rdtlck; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *slv, *rlv, *lv; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* + * allocate the new right page for the split + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + rdtlck = (struct dt_lock *) & tlck->lock; + + rp = (dtpage_t *) rmp->data; + *rpp = rp; + rp->header.self = *pxd; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the split page + * + * action: + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + sdtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header of split page */ + ASSERT(sdtlck->index == 0); + slv = & sdtlck->lv[0]; + slv->offset = 0; + slv->length = 1; + sdtlck->index++; + + /* + * initialize/update sibling pointers between sp and rp + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + /* + * initialize new right page + */ + rp->header.flag = sp->header.flag; + + /* compute sorted entry table at start of extent data area */ + rp->header.nextindex = 0; + rp->header.stblindex = 1; + + n = PSIZE >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ + + /* init freelist */ + fsi = rp->header.stblindex + stblsize; + rp->header.freelist = fsi; + rp->header.freecnt = rp->header.maxslot - fsi; + + /* + * sequential append at tail: append without split + * + * If splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. Adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * If we're wrong it's no big deal, we'll just do the split the right + * way next time. + * (It may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, + * but it's not. Be my guest.) + */ + if (nextbn == 0 && split->index == sp->header.nextindex) { + /* linelock header + stbl (first slot) of new page */ + rlv = & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 2; + rdtlck->index++; + + /* + * initialize freelist of new right page + */ + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* insert entry at the first entry of the new right page */ + dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); + + goto out; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update prev pointer of previous right sibling page; + */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + discard_metapage(rmp); + return rc; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header of previous right sibling page */ + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(rbn); + + DT_PUTPAGE(mp); + } + + /* + * split the data between the split and right pages. + */ + skip = split->index; + half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ + left = 0; + + /* + * compute fill factor for split pages + * + * traces the next entry to move to rp + * traces the next entry to stay in sp + */ + stbl = (u8 *) & sp->slot[sp->header.stblindex]; + nextindex = sp->header.nextindex; + for (nxt = off = 0; nxt < nextindex; ++off) { + if (off == skip) + /* check for fill factor with new entry size */ + n = split->nslot; + else { + si = stbl[nxt]; + switch (sp->header.flag & BT_TYPE) { + case BT_LEAF: + ldtentry = (struct ldtentry *) & sp->slot[si]; + if (DO_INDEX(ip)) + n = NDTLEAF(ldtentry->namlen); + else + n = NDTLEAF_LEGACY(ldtentry-> + namlen); + break; + + case BT_INTERNAL: + idtentry = (struct idtentry *) & sp->slot[si]; + n = NDTINTERNAL(idtentry->namlen); + break; + + default: + break; + } + + ++nxt; /* advance to next entry to move in sp */ + } + + left += n; + if (left >= half) + break; + } + + /* poins to the 1st entry to move */ + + /* + * move entries to right page + * + * dtMoveEntry() initializes rp and reserves entry for insertion + * + * split page moved out entries are linelocked; + * new/right page moved in entries are linelocked; + */ + /* linelock header + stbl of new right page */ + rlv = & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 5; + rdtlck->index++; + + dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); + + sp->header.nextindex = nxt; + + /* + * finalize freelist of new right page + */ + fsi = rp->header.freelist; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + s64 lblock; + + mp = NULL; + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + + /* + * the skipped index was on the left page, + */ + if (skip <= off) { + /* insert the new entry in the split page */ + dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); + + /* linelock stbl of split page */ + if (sdtlck->index >= sdtlck->maxcnt) + sdtlck = (struct dt_lock *) txLinelock(sdtlck); + slv = & sdtlck->lv[sdtlck->index]; + n = skip >> L2DTSLOTSIZE; + slv->offset = sp->header.stblindex + n; + slv->length = + ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + sdtlck->index++; + } + /* + * the skipped index was on the right page, + */ + else { + /* adjust the skip index to reflect the new position */ + skip -= nxt; + + /* insert the new entry in the right page */ + dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); + } + + out: + *rmpp = rmp; + *rpxdp = *pxd; + + return rc; +} + + +/* + * dtExtendPage() + * + * function: extend 1st/only directory leaf page + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return extended page pinned; + */ +static int dtExtendPage(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct super_block *sb = ip->i_sb; + int rc; + struct metapage *smp, *pmp, *mp; + dtpage_t *sp, *pp; + struct pxdlist *pxdlist; + pxd_t *pxd, *tpxd; + int xlen, xsize; + int newstblindex, newstblsize; + int oldstblindex, oldstblsize; + int fsi, last; + struct dtslot *f; + struct btframe *parent; + int n; + struct dt_lock *dtlck; + s64 xaddr, txaddr; + struct tlock *tlck; + struct pxd_lock *pxdlock; + struct lv *lv; + uint type; + struct ldtentry *ldtentry; + u8 *stbl; + + /* get page to extend */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* get parent/root page */ + parent = BT_POP(btstack); + DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); + if (rc) + return (rc); + + /* + * extend the extent + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + + xaddr = addressPXD(pxd); + tpxd = &sp->header.self; + txaddr = addressPXD(tpxd); + /* in-place extension */ + if (xaddr == txaddr) { + type = tlckEXTEND; + } + /* relocation */ + else { + type = tlckNEW; + + /* save moved extent descriptor for later free */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = sp->header.self; + pxdlock->index = 1; + + /* + * Update directory index table to reflect new page address + */ + if (DO_INDEX(ip)) { + s64 lblock; + + mp = NULL; + stbl = DT_GETSTBL(sp); + for (n = 0; n < sp->header.nextindex; n++) { + ldtentry = + (struct ldtentry *) & sp->slot[stbl[n]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + xaddr, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + } + + /* + * extend the page + */ + sp->header.self = *pxd; + + jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the extended/leaf page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | type); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[0]; + + /* update buffer extent descriptor of extended page */ + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + + /* + * copy old stbl to new stbl at start of extended area + */ + oldstblindex = sp->header.stblindex; + oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; + newstblindex = sp->header.maxslot; + n = xsize >> L2DTSLOTSIZE; + newstblsize = (n + 31) >> L2DTSLOTSIZE; + memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], + sp->header.nextindex); + + /* + * in-line extension: linelock old area of extended page + */ + if (type == tlckEXTEND) { + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + lv++; + + /* linelock new stbl of extended page */ + lv->offset = newstblindex; + lv->length = newstblsize; + } + /* + * relocation: linelock whole relocated area + */ + else { + lv->offset = 0; + lv->length = sp->header.maxslot + newstblsize; + } + + dtlck->index++; + + sp->header.maxslot = n; + sp->header.stblindex = newstblindex; + /* sp->header.nextindex remains the same */ + + /* + * add old stbl region at head of freelist + */ + fsi = oldstblindex; + f = &sp->slot[fsi]; + last = sp->header.freelist; + for (n = 0; n < oldstblsize; n++, fsi++, f++) { + f->next = last; + last = fsi; + } + sp->header.freelist = last; + sp->header.freecnt += oldstblsize; + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = newstblindex + newstblsize; + f = &sp->slot[fsi]; + for (fsi++; fsi < sp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + sp->header.freelist = n; + else { + do { + f = &sp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + sp->header.freecnt += sp->header.maxslot - n; + + /* + * insert the new entry + */ + dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); + + BT_MARK_DIRTY(pmp, ip); + /* + * linelock any freeslots residing in old extent + */ + if (type == tlckEXTEND) { + n = sp->header.maxslot >> 2; + if (sp->header.freelist < n) + dtLinelockFreelist(sp, n, &dtlck); + } + + /* + * update parent entry on the parent/root page + */ + /* + * acquire a transaction lock on the parent/root page + */ + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[dtlck->index]; + + /* linelock parent entry - 1st slot */ + lv->offset = 1; + lv->length = 1; + dtlck->index++; + + /* update the parent pxd for page extension */ + tpxd = (pxd_t *) & pp->slot[1]; + *tpxd = *pxd; + + DT_PUTPAGE(pmp); + return 0; +} + + +/* + * dtSplitRoot() + * + * function: + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return new page pinned; + */ +static int dtSplitRoot(tid_t tid, + struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) +{ + struct super_block *sb = ip->i_sb; + struct metapage *smp; + dtroot_t *sp; + struct metapage *rmp; + dtpage_t *rp; + s64 rbn; + int xlen; + int xsize; + struct dtslot *f; + s8 *stbl; + int fsi, stblsize, n; + struct idtentry *s; + pxd_t *ppxd; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int rc; + + /* get split root page */ + smp = split->mp; + sp = &JFS_IP(ip)->i_dtroot; + + /* + * allocate/initialize a single (right) child page + * + * N.B. at first split, a one (or two) block to fit new entry + * is allocated; at subsequent split, a full page is allocated; + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + rmp = get_metapage(ip, rbn, xsize, 1); + if (!rmp) + return -EIO; + + rp = rmp->data; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + dtlck = (struct dt_lock *) & tlck->lock; + + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * move in-line root page into new right page extent + */ + /* linelock header + copied entries + new stbl (1st slot) in new page */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 10; /* 1 + 8 + 1 */ + dtlck->index++; + + n = xsize >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; + + /* copy old stbl to new stbl at start of extended area */ + rp->header.stblindex = DTROOTMAXSLOT; + stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; + memcpy(stbl, sp->header.stbl, sp->header.nextindex); + rp->header.nextindex = sp->header.nextindex; + + /* copy old data area to start of new data area */ + memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = DTROOTMAXSLOT + stblsize; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + rp->header.freelist = n; + else { + rp->header.freelist = fsi; + + do { + f = &rp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + s64 lblock; + struct metapage *mp = NULL; + struct ldtentry *ldtentry; + + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); + + /* + * reset parent/root page + * + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + * + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. + */ + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the root page (in-memory inode) + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + /* update page header of root */ + if (sp->header.flag & BT_LEAF) { + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + } + + /* init the first entry */ + s = (struct idtentry *) & sp->slot[DTENTRYSTART]; + ppxd = (pxd_t *) s; + *ppxd = *pxd; + s->next = -1; + s->namlen = 0; + + stbl = sp->header.stbl; + stbl[0] = DTENTRYSTART; + sp->header.nextindex = 1; + + /* init freelist */ + fsi = DTENTRYSTART + 1; + f = &sp->slot[fsi]; + + /* init free region of remaining area */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + sp->header.freelist = DTENTRYSTART + 1; + sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); + + *rmpp = rmp; + + return 0; +} + + +/* + * dtDelete() + * + * function: delete the entry(s) referenced by a key. + * + * parameter: + * + * return: + */ +int dtDelete(tid_t tid, + struct inode *ip, struct component_name * key, ino_t * ino, int flag) +{ + int rc = 0; + s64 bn; + struct metapage *mp, *imp; + dtpage_t *p; + int index; + struct btstack btstack; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int i; + struct ldtentry *ldtentry; + u8 *stbl; + u32 table_index, next_index; + struct metapage *nmp; + dtpage_t *np; + + /* + * search for the entry to delete: + * + * dtSearch() returns (leaf page pinned, index at which to delete). + */ + if ((rc = dtSearch(ip, key, ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* + * We need to find put the index of the next entry into the + * directory index table in order to resume a readdir from this + * entry. + */ + if (DO_INDEX(ip)) { + stbl = DT_GETSTBL(p); + ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; + table_index = le32_to_cpu(ldtentry->index); + if (index == (p->header.nextindex - 1)) { + /* + * Last entry in this leaf page + */ + if ((p->header.flag & BT_ROOT) + || (p->header.next == 0)) + next_index = -1; + else { + /* Read next leaf page */ + DT_GETPAGE(ip, le64_to_cpu(p->header.next), + nmp, PSIZE, np, rc); + if (rc) + next_index = -1; + else { + stbl = DT_GETSTBL(np); + ldtentry = + (struct ldtentry *) & np-> + slot[stbl[0]]; + next_index = + le32_to_cpu(ldtentry->index); + DT_PUTPAGE(nmp); + } + } + } else { + ldtentry = + (struct ldtentry *) & p->slot[stbl[index + 1]]; + next_index = le32_to_cpu(ldtentry->index); + } + free_index(tid, ip, table_index, next_index); + } + /* + * the leaf page becomes empty, delete the page + */ + if (p->header.nextindex == 1) { + /* delete empty page */ + rc = dtDeleteUp(tid, ip, mp, p, &btstack); + } + /* + * the leaf page has other entries remaining: + * + * delete the entry from the leaf page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* + * Do not assume that dtlck->index will be zero. During a + * rename within a directory, this transaction may have + * modified this page already when adding the new entry. + */ + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the leaf entry */ + dtDeleteEntry(p, index, &dtlck); + + /* + * Update directory index table for entries moved in stbl + */ + if (DO_INDEX(ip) && index < p->header.nextindex) { + s64 lblock; + + imp = NULL; + stbl = DT_GETSTBL(p); + for (i = index; i < p->header.nextindex; i++) { + ldtentry = + (struct ldtentry *) & p->slot[stbl[i]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + bn, i, &imp, &lblock); + } + if (imp) + release_metapage(imp); + } + + DT_PUTPAGE(mp); + } + + return rc; +} + + +/* + * dtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int dtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + dtpage_t *p; + int index, nextindex; + int xlen; + struct btframe *parent; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + struct pxd_lock *pxdlock; + int i; + + /* + * keep the root leaf page which has become empty + */ + if (BT_IS_ROOT(fmp)) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(fmp); + + return 0; + } + + /* + * free the non-root leaf page + */ + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor, and + * the buffer page is freed; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = fp->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, fp))) { + BT_PUTPAGE(fmp); + return rc; + } + + xlen = lengthPXD(&fp->header.self); + + /* Free quota allocation. */ + dquot_free_block(ip, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the directory tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* pin the parent page */ + DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * free the extent of the child page deleted + */ + index = parent->index; + + /* + * delete the entry for the child page from parent + */ + nextindex = p->header.nextindex; + + /* + * the parent has the single entry being deleted: + * + * free the parent page which has become empty. + */ + if (nextindex == 1) { + /* + * keep the root internal page which has become empty + */ + if (p->header.flag & BT_ROOT) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(mp); + + return 0; + } + /* + * free the parent page + */ + else { + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + */ + tlck = + txMaplock(tid, ip, + tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = p->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, p))) { + DT_PUTPAGE(mp); + return rc; + } + + xlen = lengthPXD(&p->header.self); + + /* Free quota allocation */ + dquot_free_block(ip, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + + /* + * the parent has other entries remaining: + * + * delete the router entry from the parent page. + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the page + * + * action: router entry deletion + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the router entry */ + dtDeleteEntry(p, index, &dtlck); + + /* reset key of new leftmost entry of level (for consistency) */ + if (index == 0 && + ((p->header.flag & BT_ROOT) || p->header.prev == 0)) + dtTruncateEntry(p, 0, &dtlck); + + /* unpin the parent page */ + DT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + if (!DO_INDEX(ip)) + ip->i_size -= PSIZE; + + return 0; +} + +#ifdef _NOTYET +/* + * NAME: dtRelocate() + * + * FUNCTION: relocate dtpage (internal or leaf) of directory; + * This function is mainly used by defragfs utility. + */ +int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, + s64 nxaddr) +{ + int rc = 0; + struct metapage *mp, *pmp, *lmp, *rmp; + dtpage_t *p, *pp, *rp = 0, *lp= 0; + s64 bn; + int index; + struct btstack btstack; + pxd_t *pxd; + s64 oxaddr, nextbn, prevbn; + int xlen, xsize; + struct tlock *tlck; + struct dt_lock *dtlck; + struct pxd_lock *pxdlock; + s8 *stbl; + struct lv *lv; + + oxaddr = addressPXD(opxd); + xlen = lengthPXD(opxd); + + jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d", + (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr, + xlen); + + /* + * 1. get the internal parent dtpage covering + * router entry for the tartget page to be relocated; + */ + rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); + if (rc) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("dtRelocate: parent router entry validated."); + + /* + * 2. relocate the target dtpage + */ + /* read in the target page from src extent */ + DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + /* release the pinned parent page */ + DT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + if (rmp) + DT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling dtpages if any; + */ + if (lmp) { + tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + lp->header.next = cpu_to_le64(nxaddr); + DT_PUTPAGE(lmp); + } + + if (rmp) { + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + rp->header.prev = cpu_to_le64(nxaddr); + DT_PUTPAGE(rmp); + } + + /* + * update the target dtpage to be relocated + * + * write LOG_REDOPAGE of LOG_NEW type for dst page + * for the whole target page (logredo() will apply + * after image and update bmap for allocation of the + * dst extent), and update bmap for allocation of + * the dst extent; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* update the self address in the dtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* the dst page is the same as the src page, i.e., + * linelock for afterimage of the whole page; + */ + lv->offset = 0; + lv->length = p->header.maxslot; + dtlck->index++; + + /* update the buffer extent descriptor of the dtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the relocated page */ + DT_PUTPAGE(mp); + jfs_info("dtRelocate: target dtpage relocated."); + + /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec + * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec + * will also force a bmap update ). + */ + + /* + * 3. acquire maplock for the source extent to be freed; + */ + /* for dtpage relocation, write a LOG_NOREDOPAGE record + * for the source dtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * dtpage), and upadte bmap for free of the source dtpage; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent router entry for relocation; + * + * acquire tlck for the parent entry covering the target dtpage; + * write LOG_REDOPAGE to apply after image only; + */ + jfs_info("dtRelocate: update parent router entry."); + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[dtlck->index]; + + /* update the PXD with the new address */ + stbl = DT_GETSTBL(pp); + pxd = (pxd_t *) & pp->slot[stbl[index]]; + PXDaddress(pxd, nxaddr); + lv->offset = stbl[index]; + lv->length = 1; + dtlck->index++; + + /* unpin the parent dtpage */ + DT_PUTPAGE(pmp); + + return rc; +} + +/* + * NAME: dtSearchNode() + * + * FUNCTION: Search for an dtpage containing a specified address + * This function is mainly used by defragfs utility. + * + * NOTE: Search result on stack, the found page is pinned at exit. + * The result page must be an internal dtpage. + * lmxaddr give the address of the left most page of the + * dtree level, in which the required dtpage resides. + */ +static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, + struct btstack * btstack) +{ + int rc = 0; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int psize = 288; /* initial in-line directory */ + s8 *stbl; + int i; + pxd_t *pxd; + struct btframe *btsp; + + BT_CLR(btstack); /* reset stack */ + + /* + * descend tree to the level with specified leftmost page + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* does the xaddr of leftmost page of the levevl + * matches levevl search key ? + */ + if (p->header.flag & BT_ROOT) { + if (lmxaddr == 0) + break; + } else if (addressPXD(&p->header.self) == lmxaddr) + break; + + /* + * descend down to leftmost child page + */ + if (p->header.flag & BT_LEAF) { + DT_PUTPAGE(mp); + return -ESTALE; + } + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + pxd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + /* + * search each page at the current levevl + */ + loop: + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + pxd = (pxd_t *) & p->slot[stbl[i]]; + + /* found the specified router entry */ + if (addressPXD(pxd) == addressPXD(kpxd) && + lengthPXD(pxd) == lengthPXD(kpxd)) { + btsp = btstack->top; + btsp->bn = bn; + btsp->index = i; + btsp->mp = mp; + + return 0; + } + } + + /* get the right sibling page if any */ + if (p->header.next) + bn = le64_to_cpu(p->header.next); + else { + DT_PUTPAGE(mp); + return -ESTALE; + } + + /* unpin current page */ + DT_PUTPAGE(mp); + + /* get the right sibling page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + goto loop; +} +#endif /* _NOTYET */ + +/* + * dtRelink() + * + * function: + * link around a freed page. + * + * parameter: + * fp: page to be freed + * + * return: + */ +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) +{ + int rc; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + * + * action: update prev pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(prevbn); + DT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the prev page + * + * action: update next pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.next = cpu_to_le64(nextbn); + DT_PUTPAGE(mp); + } + + return 0; +} + + +/* + * dtInitRoot() + * + * initialize directory root (inline in inode) + */ +void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + dtroot_t *p; + int fsi; + struct dtslot *f; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + u16 xflag_save; + + /* + * If this was previously an non-empty directory, we need to remove + * the old directory table. + */ + if (DO_INDEX(ip)) { + if (!jfs_dirtable_inline(ip)) { + struct tblock *tblk = tid_to_tblock(tid); + /* + * We're playing games with the tid's xflag. If + * we're removing a regular file, the file's xtree + * is committed with COMMIT_PMAP, but we always + * commit the directories xtree with COMMIT_PWMAP. + */ + xflag_save = tblk->xflag; + tblk->xflag = 0; + /* + * xtTruncate isn't guaranteed to fully truncate + * the xtree. The caller needs to check i_size + * after committing the transaction to see if + * additional truncation is needed. The + * COMMIT_Stale flag tells caller that we + * initiated the truncation. + */ + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + set_cflag(COMMIT_Stale, ip); + + tblk->xflag = xflag_save; + } else + ip->i_size = 1; + + jfs_ip->next_index = 2; + } else + ip->i_size = IDATASIZE; + + /* + * acquire a transaction lock on the root + * + * action: directory initialization; + */ + tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, + tlckDTREE | tlckENTRY | tlckBTROOT); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + p = &jfs_ip->i_dtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + + p->header.nextindex = 0; + + /* init freelist */ + fsi = 1; + f = &p->slot[fsi]; + + /* init data area of root */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + p->header.freelist = 1; + p->header.freecnt = 8; + + /* init '..' entry */ + p->header.idotdot = cpu_to_le32(idotdot); + + return; +} + +/* + * add_missing_indices() + * + * function: Fix dtree page in which one or more entries has an invalid index. + * fsck.jfs should really fix this, but it currently does not. + * Called from jfs_readdir when bad index is detected. + */ +static void add_missing_indices(struct inode *inode, s64 bn) +{ + struct ldtentry *d; + struct dt_lock *dtlck; + int i; + uint index; + struct lv *lv; + struct metapage *mp; + dtpage_t *p; + int rc; + s8 *stbl; + tid_t tid; + struct tlock *tlck; + + tid = txBegin(inode->i_sb, 0); + + DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); + + if (rc) { + printk(KERN_ERR "DT_GETPAGE failed!\n"); + goto end; + } + BT_MARK_DIRTY(mp, inode); + + ASSERT(p->header.flag & BT_LEAF); + + tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); + if (BT_IS_ROOT(mp)) + tlck->type |= tlckBTROOT; + + dtlck = (struct dt_lock *) &tlck->lock; + + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + d = (struct ldtentry *) &p->slot[stbl[i]]; + index = le32_to_cpu(d->index); + if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { + d->index = cpu_to_le32(add_index(tid, inode, bn, i)); + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = &dtlck->lv[dtlck->index]; + lv->offset = stbl[i]; + lv->length = 1; + dtlck->index++; + } + } + + DT_PUTPAGE(mp); + (void) txCommit(tid, 1, &inode, 0); +end: + txEnd(tid); +} + +/* + * Buffer to hold directory entry info while traversing a dtree page + * before being fed to the filldir function + */ +struct jfs_dirent { + loff_t position; + int ino; + u16 name_len; + char name[0]; +}; + +/* + * function to determine next variable-sized jfs_dirent in buffer + */ +static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) +{ + return (struct jfs_dirent *) + ((char *)dirent + + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + + sizeof (loff_t) - 1) & + ~(sizeof (loff_t) - 1))); +} + +/* + * jfs_readdir() + * + * function: read directory entries sequentially + * from the specified entry offset + * + * parameter: + * + * return: offset = (pn, index) of start entry + * of next jfs_readdir()/dtRead() + */ +int jfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *ip = file_inode(file); + struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; + int rc = 0; + loff_t dtpos; /* legacy OS/2 style position */ + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) &dtpos; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + s8 *stbl; + struct btstack btstack; + int i, next; + struct ldtentry *d; + struct dtslot *t; + int d_namleft, len, outlen; + unsigned long dirent_buf; + char *name_ptr; + u32 dir_index; + int do_index = 0; + uint loop_count = 0; + struct jfs_dirent *jfs_dirent; + int jfs_dirents; + int overflow, fix_page, page_fixed = 0; + static int unique_pos = 2; /* If we can't fix broken index */ + + if (ctx->pos == DIREND) + return 0; + + if (DO_INDEX(ip)) { + /* + * persistent index is stored in directory entries. + * Special cases: 0 = . + * 1 = .. + * -1 = End of directory + */ + do_index = 1; + + dir_index = (u32) ctx->pos; + + /* + * NFSv4 reserves cookies 1 and 2 for . and .. so the value + * we return to the vfs is one greater than the one we use + * internally. + */ + if (dir_index) + dir_index--; + + if (dir_index > 1) { + struct dir_table_slot dirtab_slot; + + if (dtEmpty(ip) || + (dir_index >= JFS_IP(ip)->next_index)) { + /* Stale position. Directory has shrunk */ + ctx->pos = DIREND; + return 0; + } + repeat: + rc = read_index(ip, dir_index, &dirtab_slot); + if (rc) { + ctx->pos = DIREND; + return rc; + } + if (dirtab_slot.flag == DIR_INDEX_FREE) { + if (loop_count++ > JFS_IP(ip)->next_index) { + jfs_err("jfs_readdir detected " + "infinite loop!"); + ctx->pos = DIREND; + return 0; + } + dir_index = le32_to_cpu(dirtab_slot.addr2); + if (dir_index == -1) { + ctx->pos = DIREND; + return 0; + } + goto repeat; + } + bn = addressDTS(&dirtab_slot); + index = dirtab_slot.slot; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + ctx->pos = DIREND; + return 0; + } + if (p->header.flag & BT_INTERNAL) { + jfs_err("jfs_readdir: bad index table"); + DT_PUTPAGE(mp); + ctx->pos = DIREND; + return 0; + } + } else { + if (dir_index == 0) { + /* + * self "." + */ + ctx->pos = 1; + if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) + return 0; + } + /* + * parent ".." + */ + ctx->pos = 2; + if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) + return 0; + + /* + * Find first entry of left-most leaf + */ + if (dtEmpty(ip)) { + ctx->pos = DIREND; + return 0; + } + + if ((rc = dtReadFirst(ip, &btstack))) + return rc; + + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + } + } else { + /* + * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 + * + * pn = 0; index = 1: First entry "." + * pn = 0; index = 2: Second entry ".." + * pn > 0: Real entries, pn=1 -> leftmost page + * pn = index = -1: No more entries + */ + dtpos = ctx->pos; + if (dtpos < 2) { + /* build "." entry */ + ctx->pos = 1; + if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) + return 0; + dtoffset->index = 2; + ctx->pos = dtpos; + } + + if (dtoffset->pn == 0) { + if (dtoffset->index == 2) { + /* build ".." entry */ + if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) + return 0; + } else { + jfs_err("jfs_readdir called with " + "invalid offset!"); + } + dtoffset->pn = 1; + dtoffset->index = 0; + ctx->pos = dtpos; + } + + if (dtEmpty(ip)) { + ctx->pos = DIREND; + return 0; + } + + if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { + jfs_err("jfs_readdir: unexpected rc = %d " + "from dtReadNext", rc); + ctx->pos = DIREND; + return 0; + } + /* get start leaf page and index */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* offset beyond directory eof ? */ + if (bn < 0) { + ctx->pos = DIREND; + return 0; + } + } + + dirent_buf = __get_free_page(GFP_KERNEL); + if (dirent_buf == 0) { + DT_PUTPAGE(mp); + jfs_warn("jfs_readdir: __get_free_page failed!"); + ctx->pos = DIREND; + return -ENOMEM; + } + + while (1) { + jfs_dirent = (struct jfs_dirent *) dirent_buf; + jfs_dirents = 0; + overflow = fix_page = 0; + + stbl = DT_GETSTBL(p); + + for (i = index; i < p->header.nextindex; i++) { + d = (struct ldtentry *) & p->slot[stbl[i]]; + + if (((long) jfs_dirent + d->namlen + 1) > + (dirent_buf + PAGE_SIZE)) { + /* DBCS codepages could overrun dirent_buf */ + index = i; + overflow = 1; + break; + } + + d_namleft = d->namlen; + name_ptr = jfs_dirent->name; + jfs_dirent->ino = le32_to_cpu(d->inumber); + + if (do_index) { + len = min(d_namleft, DTLHDRDATALEN); + jfs_dirent->position = le32_to_cpu(d->index); + /* + * d->index should always be valid, but it + * isn't. fsck.jfs doesn't create the + * directory index for the lost+found + * directory. Rather than let it go, + * we can try to fix it. + */ + if ((jfs_dirent->position < 2) || + (jfs_dirent->position >= + JFS_IP(ip)->next_index)) { + if (!page_fixed && !isReadOnly(ip)) { + fix_page = 1; + /* + * setting overflow and setting + * index to i will cause the + * same page to be processed + * again starting here + */ + overflow = 1; + index = i; + break; + } + jfs_dirent->position = unique_pos++; + } + /* + * We add 1 to the index because we may + * use a value of 2 internally, and NFSv4 + * doesn't like that. + */ + jfs_dirent->position++; + } else { + jfs_dirent->position = dtpos; + len = min(d_namleft, DTLHDRDATALEN_LEGACY); + } + + /* copy the name of head/only segment */ + outlen = jfs_strfromUCS_le(name_ptr, d->name, len, + codepage); + jfs_dirent->name_len = outlen; + + /* copy name in the additional segment(s) */ + next = d->next; + while (next >= 0) { + t = (struct dtslot *) & p->slot[next]; + name_ptr += outlen; + d_namleft -= len; + /* Sanity Check */ + if (d_namleft == 0) { + jfs_error(ip->i_sb, + "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n", + (long)ip->i_ino, + (long long)bn, + i); + goto skip_one; + } + len = min(d_namleft, DTSLOTDATALEN); + outlen = jfs_strfromUCS_le(name_ptr, t->name, + len, codepage); + jfs_dirent->name_len += outlen; + + next = t->next; + } + + jfs_dirents++; + jfs_dirent = next_jfs_dirent(jfs_dirent); +skip_one: + if (!do_index) + dtoffset->index++; + } + + if (!overflow) { + /* Point to next leaf page */ + if (p->header.flag & BT_ROOT) + bn = 0; + else { + bn = le64_to_cpu(p->header.next); + index = 0; + /* update offset (pn:index) for new page */ + if (!do_index) { + dtoffset->pn++; + dtoffset->index = 0; + } + } + page_fixed = 0; + } + + /* unpin previous leaf page */ + DT_PUTPAGE(mp); + + jfs_dirent = (struct jfs_dirent *) dirent_buf; + while (jfs_dirents--) { + ctx->pos = jfs_dirent->position; + if (!dir_emit(ctx, jfs_dirent->name, + jfs_dirent->name_len, + jfs_dirent->ino, DT_UNKNOWN)) + goto out; + jfs_dirent = next_jfs_dirent(jfs_dirent); + } + + if (fix_page) { + add_missing_indices(ip, bn); + page_fixed = 1; + } + + if (!overflow && (bn == 0)) { + ctx->pos = DIREND; + break; + } + + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + free_page(dirent_buf); + return rc; + } + } + + out: + free_page(dirent_buf); + + return rc; +} + + +/* + * dtReadFirst() + * + * function: get the leftmost page of the directory + */ +static int dtReadFirst(struct inode *ip, struct btstack * btstack) +{ + int rc = 0; + s64 bn; + int psize = 288; /* initial in-line directory */ + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + struct btframe *btsp; + pxd_t *xd; + + BT_CLR(btstack); /* reset stack */ + + /* + * descend leftmost path of the tree + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* + * leftmost leaf page + */ + if (p->header.flag & BT_LEAF) { + /* return leftmost entry */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = 0; + btsp->mp = mp; + + return 0; + } + + /* + * descend down to leftmost child page + */ + if (BT_STACK_FULL(btstack)) { + DT_PUTPAGE(mp); + jfs_error(ip->i_sb, "btstack overrun\n"); + BT_STACK_DUMP(btstack); + return -EIO; + } + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, 0); + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(xd); + psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } +} + + +/* + * dtReadNext() + * + * function: get the page of the specified offset (pn:index) + * + * return: if (offset > eof), bn = -1; + * + * note: if index > nextindex of the target leaf page, + * start with 1st entry of next leaf page; + */ +static int dtReadNext(struct inode *ip, loff_t * offset, + struct btstack * btstack) +{ + int rc = 0; + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) offset; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + int pn; + s8 *stbl; + struct btframe *btsp, *parent; + pxd_t *xd; + + /* + * get leftmost leaf page pinned + */ + if ((rc = dtReadFirst(ip, btstack))) + return rc; + + /* get leaf page */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* get the start offset (pn:index) */ + pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ + index = dtoffset->index; + + /* start at leftmost page ? */ + if (pn == 0) { + /* offset beyond eof ? */ + if (index < p->header.nextindex) + goto out; + + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = index = 0; + goto a; + } + + /* start at non-leftmost page: scan parent pages for large pn */ + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start after next leaf page ? */ + if (pn > 1) + goto b; + + /* get leaf page pn = 1 */ + a: + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + goto c; + + /* + * scan last internal page level to get target leaf page + */ + b: + /* unpin leftmost leaf page */ + DT_PUTPAGE(mp); + + /* get left most parent page */ + btsp = btstack->top; + parent = btsp - 1; + bn = parent->bn; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* scan parent pages at last internal page level */ + while (pn >= p->header.nextindex) { + pn -= p->header.nextindex; + + /* get next parent page address */ + bn = le64_to_cpu(p->header.next); + + /* unpin current parent page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next parent page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* update parent page stack frame */ + parent->bn = bn; + } + + /* get leaf page address */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[pn]]; + bn = addressPXD(xd); + + /* unpin parent page */ + DT_PUTPAGE(mp); + + /* + * get target leaf page + */ + c: + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * leaf page has been completed: + * start with 1st entry of next leaf page + */ + if (index >= p->header.nextindex) { + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next leaf page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = 0; + } + + out: + /* return target leaf page pinned */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = dtoffset->index; + btsp->mp = mp; + + return 0; +} + + +/* + * dtCompare() + * + * function: compare search key with an internal entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int dtCompare(struct component_name * key, /* search key */ + dtpage_t * p, /* directory page */ + int si) +{ /* entry slot index */ + wchar_t *kname; + __le16 *name; + int klen, namlen, len, rc; + struct idtentry *ih; + struct dtslot *t; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. + * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + + /* compare with head/only segment */ + len = min(klen, len); + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + kname += len; + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (struct dtslot *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + kname += len; + si = t->next; + } + + return (klen - namlen); +} + + + + +/* + * ciCompare() + * + * function: compare search key with an (leaf/internal) entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int ciCompare(struct component_name * key, /* search key */ + dtpage_t * p, /* directory page */ + int si, /* entry slot index */ + int flag) +{ + wchar_t *kname, x; + __le16 *name; + int klen, namlen, len, rc; + struct ldtentry *lh; + struct idtentry *ih; + struct dtslot *t; + int i; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. + * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + /* + * leaf page entry + */ + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) & p->slot[si]; + si = lh->next; + name = lh->name; + namlen = lh->namlen; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } + /* + * internal page entry + */ + else { + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + } + + /* compare with head/only segment */ + len = min(klen, len); + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (struct dtslot *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + si = t->next; + } + + return (klen - namlen); +} + + +/* + * ciGetLeafPrefixKey() + * + * function: compute prefix of suffix compression + * from two adjacent leaf entries + * across page boundary + * + * return: non-zero on error + * + */ +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag) +{ + int klen, namlen; + wchar_t *pl, *pr, *kname; + struct component_name lkey; + struct component_name rkey; + + lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + GFP_KERNEL); + if (lkey.name == NULL) + return -ENOMEM; + + rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + GFP_KERNEL); + if (rkey.name == NULL) { + kfree(lkey.name); + return -ENOMEM; + } + + /* get left and right key */ + dtGetKey(lp, li, &lkey, flag); + lkey.name[lkey.namlen] = 0; + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&lkey); + + dtGetKey(rp, ri, &rkey, flag); + rkey.name[rkey.namlen] = 0; + + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&rkey); + + /* compute prefix */ + klen = 0; + kname = key->name; + namlen = min(lkey.namlen, rkey.namlen); + for (pl = lkey.name, pr = rkey.name; + namlen; pl++, pr++, namlen--, klen++, kname++) { + *kname = *pr; + if (*pl != *pr) { + key->namlen = klen + 1; + goto free_names; + } + } + + /* l->namlen <= r->namlen since l <= r */ + if (lkey.namlen < rkey.namlen) { + *kname = *pr; + key->namlen = klen + 1; + } else /* l->namelen == r->namelen */ + key->namlen = klen; + +free_names: + kfree(lkey.name); + kfree(rkey.name); + return 0; +} + + + +/* + * dtGetKey() + * + * function: get key of the entry + */ +static void dtGetKey(dtpage_t * p, int i, /* entry index */ + struct component_name * key, int flag) +{ + int si; + s8 *stbl; + struct ldtentry *lh; + struct idtentry *ih; + struct dtslot *t; + int namlen, len; + wchar_t *kname; + __le16 *name; + + /* get entry */ + stbl = DT_GETSTBL(p); + si = stbl[i]; + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) & p->slot[si]; + si = lh->next; + namlen = lh->namlen; + name = lh->name; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } else { + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + namlen = ih->namlen; + name = ih->name; + len = min(namlen, DTIHDRDATALEN); + } + + key->namlen = namlen; + kname = key->name; + + /* + * move head/only segment + */ + UniStrncpy_from_le(kname, name, len); + + /* + * move additional segment(s) + */ + while (si >= 0) { + /* get next segment */ + t = &p->slot[si]; + kname += len; + namlen -= len; + len = min(namlen, DTSLOTDATALEN); + UniStrncpy_from_le(kname, t->name, len); + + si = t->next; + } +} + + +/* + * dtInsertEntry() + * + * function: allocate free slot(s) and + * write a leaf/internal entry + * + * return: entry slot index + */ +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock ** dtlock) +{ + struct dtslot *h, *t; + struct ldtentry *lh = NULL; + struct idtentry *ih = NULL; + int hsi, fsi, klen, len, nextindex; + wchar_t *kname; + __le16 *name; + s8 *stbl; + pxd_t *xd; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + s64 bn = 0; + struct metapage *mp = NULL; + + klen = key->namlen; + kname = key->name; + + /* allocate a free slot */ + hsi = fsi = p->header.freelist; + h = &p->slot[fsi]; + p->header.freelist = h->next; + --p->header.freecnt; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + + lv = & dtlck->lv[dtlck->index]; + lv->offset = hsi; + + /* write head/only segment */ + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) h; + lh->next = h->next; + lh->inumber = cpu_to_le32(data->leaf.ino); + lh->namlen = klen; + name = lh->name; + if (data->leaf.ip) { + len = min(klen, DTLHDRDATALEN); + if (!(p->header.flag & BT_ROOT)) + bn = addressPXD(&p->header.self); + lh->index = cpu_to_le32(add_index(data->leaf.tid, + data->leaf.ip, + bn, index)); + } else + len = min(klen, DTLHDRDATALEN_LEGACY); + } else { + ih = (struct idtentry *) h; + ih->next = h->next; + xd = (pxd_t *) ih; + *xd = data->xd; + ih->namlen = klen; + name = ih->name; + len = min(klen, DTIHDRDATALEN); + } + + UniStrncpy_to_le(name, kname, len); + + n = 1; + xsi = hsi; + + /* write additional segment(s) */ + t = h; + klen -= len; + while (klen) { + /* get free slot */ + fsi = p->header.freelist; + t = &p->slot[fsi]; + p->header.freelist = t->next; + --p->header.freecnt; + + /* is next slot contiguous ? */ + if (fsi != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = fsi; + n = 0; + } + + kname += len; + len = min(klen, DTSLOTDATALEN); + UniStrncpy_to_le(t->name, kname, len); + + n++; + xsi = fsi; + klen -= len; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* terminate last/only segment */ + if (h == t) { + /* single segment entry */ + if (p->header.flag & BT_LEAF) + lh->next = -1; + else + ih->next = -1; + } else + /* multi-segment entry */ + t->next = -1; + + /* if insert into middle, shift right succeeding entries in stbl */ + stbl = DT_GETSTBL(p); + nextindex = p->header.nextindex; + if (index < nextindex) { + memmove(stbl + index + 1, stbl + index, nextindex - index); + + if ((p->header.flag & BT_LEAF) && data->leaf.ip) { + s64 lblock; + + /* + * Need to update slot number for entries that moved + * in the stbl + */ + mp = NULL; + for (n = index + 1; n <= nextindex; n++) { + lh = (struct ldtentry *) & (p->slot[stbl[n]]); + modify_index(data->leaf.tid, data->leaf.ip, + le32_to_cpu(lh->index), bn, n, + &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + } + + stbl[index] = hsi; + + /* advance next available entry index of stbl */ + ++p->header.nextindex; +} + + +/* + * dtMoveEntry() + * + * function: move entries from split/left page to new/right page + * + * nextindex of dst page and freelist/freecnt of both pages + * are updated. + */ +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index) +{ + int ssi, next; /* src slot index */ + int di; /* dst entry index */ + int dsi; /* dst slot index */ + s8 *sstbl, *dstbl; /* sorted entry table */ + int snamlen, len; + struct ldtentry *slh, *dlh = NULL; + struct idtentry *sih, *dih = NULL; + struct dtslot *h, *s, *d; + struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; + struct lv *slv, *dlv; + int xssi, ns, nd; + int sfsi; + + sstbl = (s8 *) & sp->slot[sp->header.stblindex]; + dstbl = (s8 *) & dp->slot[dp->header.stblindex]; + + dsi = dp->header.freelist; /* first (whole page) free slot */ + sfsi = sp->header.freelist; + + /* linelock destination entry slot */ + dlv = & ddtlck->lv[ddtlck->index]; + dlv->offset = dsi; + + /* linelock source entry slot */ + slv = & sdtlck->lv[sdtlck->index]; + slv->offset = sstbl[si]; + xssi = slv->offset - 1; + + /* + * move entries + */ + ns = nd = 0; + for (di = 0; si < sp->header.nextindex; si++, di++) { + ssi = sstbl[si]; + dstbl[di] = dsi; + + /* is next slot contiguous ? */ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = (struct dt_lock *) txLinelock(sdtlck); + slv = & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* + * move head/only segment of an entry + */ + /* get dst slot */ + h = d = &dp->slot[dsi]; + + /* get src slot and move */ + s = &sp->slot[ssi]; + if (sp->header.flag & BT_LEAF) { + /* get source entry */ + slh = (struct ldtentry *) s; + dlh = (struct ldtentry *) h; + snamlen = slh->namlen; + + if (do_index) { + len = min(snamlen, DTLHDRDATALEN); + dlh->index = slh->index; /* little-endian */ + } else + len = min(snamlen, DTLHDRDATALEN_LEGACY); + + memcpy(dlh, slh, 6 + len * 2); + + next = slh->next; + + /* update dst head/only segment next field */ + dsi++; + dlh->next = dsi; + } else { + sih = (struct idtentry *) s; + snamlen = sih->namlen; + + len = min(snamlen, DTIHDRDATALEN); + dih = (struct idtentry *) h; + memcpy(dih, sih, 10 + len * 2); + next = sih->next; + + dsi++; + dih->next = dsi; + } + + /* free src head/only segment */ + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + ns++; + nd++; + xssi = ssi; + + /* + * move additional segment(s) of the entry + */ + snamlen -= len; + while ((ssi = next) >= 0) { + /* is next slot contiguous ? */ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = + (struct dt_lock *) + txLinelock(sdtlck); + slv = & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* get next source segment */ + s = &sp->slot[ssi]; + + /* get next destination free slot */ + d++; + + len = min(snamlen, DTSLOTDATALEN); + UniStrncpy_le(d->name, s->name, len); + + ns++; + nd++; + xssi = ssi; + + dsi++; + d->next = dsi; + + /* free source segment */ + next = s->next; + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + snamlen -= len; + } /* end while */ + + /* terminate dst last/only segment */ + if (h == d) { + /* single segment entry */ + if (dp->header.flag & BT_LEAF) + dlh->next = -1; + else + dih->next = -1; + } else + /* multi-segment entry */ + d->next = -1; + } /* end for */ + + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + *sdtlock = sdtlck; + + dlv->length = nd; + ddtlck->index++; + *ddtlock = ddtlck; + + /* update source header */ + sp->header.freelist = sfsi; + sp->header.freecnt += nd; + + /* update destination header */ + dp->header.nextindex = di; + + dp->header.freelist = dsi; + dp->header.freecnt -= nd; +} + + +/* + * dtDeleteEntry() + * + * function: free a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) +{ + int fsi; /* free entry slot index */ + s8 *stbl; + struct dtslot *t; + int si, freecnt; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + fsi = stbl[fi]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + /* get the head/only segment */ + t = &p->slot[fsi]; + if (p->header.flag & BT_LEAF) + si = ((struct ldtentry *) t)->next; + else + si = ((struct idtentry *) t)->next; + t->next = si; + t->cnt = 1; + + n = freecnt = 1; + xsi = fsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; + + /* if delete from middle, + * shift left the succedding entries in the stbl + */ + si = p->header.nextindex; + if (fi < si - 1) + memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); + + p->header.nextindex--; +} + + +/* + * dtTruncateEntry() + * + * function: truncate a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) +{ + int tsi; /* truncate entry slot index */ + s8 *stbl; + struct dtslot *t; + int si, freecnt; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int fsi, xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + tsi = stbl[ti]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = tsi; + + /* get the head/only segment */ + t = &p->slot[tsi]; + ASSERT(p->header.flag & BT_INTERNAL); + ((struct idtentry *) t)->namlen = 0; + si = ((struct idtentry *) t)->next; + ((struct idtentry *) t)->next = -1; + + n = 1; + freecnt = 0; + fsi = si; + xsi = tsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + if (freecnt == 0) + return; + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; +} + + +/* + * dtLinelockFreelist() + */ +static void dtLinelockFreelist(dtpage_t * p, /* directory page */ + int m, /* max slot index */ + struct dt_lock ** dtlock) +{ + int fsi; /* free entry slot index */ + struct dtslot *t; + int si; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + + /* get free entry slot index */ + fsi = p->header.freelist; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + n = 1; + xsi = fsi; + + t = &p->slot[fsi]; + si = t->next; + + /* find the last/only segment */ + while (si < m && si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + + t = &p->slot[si]; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; +} + + +/* + * NAME: dtModify + * + * FUNCTION: Modify the inode number part of a directory entry + * + * PARAMETERS: + * tid - Transaction id + * ip - Inode of parent directory + * key - Name of entry to be modified + * orig_ino - Original inode number expected in entry + * new_ino - New inode number to put into entry + * flag - JFS_RENAME + * + * RETURNS: + * -ESTALE - If entry found does not match orig_ino passed in + * -ENOENT - If no entry can be found to match key + * 0 - If successfully modified entry + */ +int dtModify(tid_t tid, struct inode *ip, + struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) +{ + int rc; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + struct btstack btstack; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + s8 *stbl; + int entry_si; /* entry slot index */ + struct ldtentry *entry; + + /* + * search for the entry to modify: + * + * dtSearch() returns (leaf page pinned, index at which to modify). + */ + if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page of named entry + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* get slot index of the entry */ + stbl = DT_GETSTBL(p); + entry_si = stbl[index]; + + /* linelock entry */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = entry_si; + lv->length = 1; + dtlck->index++; + + /* get the head/only segment */ + entry = (struct ldtentry *) & p->slot[entry_si]; + + /* substitute the inode number of the entry */ + entry->inumber = cpu_to_le32(new_ino); + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} diff --git a/kernel/fs/jfs/jfs_dtree.h b/kernel/fs/jfs/jfs_dtree.h new file mode 100644 index 000000000..fd4169e6e --- /dev/null +++ b/kernel/fs/jfs/jfs_dtree.h @@ -0,0 +1,269 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DTREE +#define _H_JFS_DTREE + +/* + * jfs_dtree.h: directory B+-tree manager + */ + +#include "jfs_btree.h" + +typedef union { + struct { + tid_t tid; + struct inode *ip; + u32 ino; + } leaf; + pxd_t xd; +} ddata_t; + + +/* + * entry segment/slot + * + * an entry consists of type dependent head/only segment/slot and + * additional segments/slots linked vi next field; + * N.B. last/only segment of entry is terminated by next = -1; + */ +/* + * directory page slot + */ +struct dtslot { + s8 next; /* 1: */ + s8 cnt; /* 1: */ + __le16 name[15]; /* 30: */ +}; /* (32) */ + + +#define DATASLOTSIZE 16 +#define L2DATASLOTSIZE 4 +#define DTSLOTSIZE 32 +#define L2DTSLOTSIZE 5 +#define DTSLOTHDRSIZE 2 +#define DTSLOTDATASIZE 30 +#define DTSLOTDATALEN 15 + +/* + * internal node entry head/only segment + */ +struct idtentry { + pxd_t xd; /* 8: child extent descriptor */ + + s8 next; /* 1: */ + u8 namlen; /* 1: */ + __le16 name[11]; /* 22: 2-byte aligned */ +}; /* (32) */ + +#define DTIHDRSIZE 10 +#define DTIHDRDATALEN 11 + +/* compute number of slots for entry */ +#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15)) + + +/* + * leaf node entry head/only segment + * + * For legacy filesystems, name contains 13 wchars -- no index field + */ +struct ldtentry { + __le32 inumber; /* 4: 4-byte aligned */ + s8 next; /* 1: */ + u8 namlen; /* 1: */ + __le16 name[11]; /* 22: 2-byte aligned */ + __le32 index; /* 4: index into dir_table */ +}; /* (32) */ + +#define DTLHDRSIZE 6 +#define DTLHDRDATALEN_LEGACY 13 /* Old (OS/2) format */ +#define DTLHDRDATALEN 11 + +/* + * dir_table used for directory traversal during readdir + */ + +/* + * Keep persistent index for directory entries + */ +#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX) + +/* + * Maximum entry in inline directory table + */ +#define MAX_INLINE_DIRTABLE_ENTRY 13 + +struct dir_table_slot { + u8 rsrvd; /* 1: */ + u8 flag; /* 1: 0 if free */ + u8 slot; /* 1: slot within leaf page of entry */ + u8 addr1; /* 1: upper 8 bits of leaf page address */ + __le32 addr2; /* 4: lower 32 bits of leaf page address -OR- + index of next entry when this entry was deleted */ +}; /* (8) */ + +/* + * flag values + */ +#define DIR_INDEX_VALID 1 +#define DIR_INDEX_FREE 0 + +#define DTSaddress(dir_table_slot, address64)\ +{\ + (dir_table_slot)->addr1 = ((u64)address64) >> 32;\ + (dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} + +#define addressDTS(dts)\ + ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) ) + +/* compute number of slots for entry */ +#define NDTLEAF_LEGACY(klen) (DIV_ROUND_UP((2 + (klen)), 15)) +#define NDTLEAF NDTINTERNAL + + +/* + * directory root page (in-line in on-disk inode): + * + * cf. dtpage_t below. + */ +typedef union { + struct { + struct dasd DASD; /* 16: DASD limit/usage info */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next free entry in stbl */ + s8 freecnt; /* 1: free count */ + s8 freelist; /* 1: freelist header */ + + __le32 idotdot; /* 4: parent inode number */ + + s8 stbl[8]; /* 8: sorted entry index table */ + } header; /* (32) */ + + struct dtslot slot[9]; +} dtroot_t; + +#define PARENT(IP) \ + (le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot)) + +#define DTROOTMAXSLOT 9 + +#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0) + + +/* + * directory regular page: + * + * entry slot array of 32 byte slot + * + * sorted entry slot index table (stbl): + * contiguous slots at slot specified by stblindex, + * 1-byte per entry + * 512 byte block: 16 entry tbl (1 slot) + * 1024 byte block: 32 entry tbl (1 slot) + * 2048 byte block: 64 entry tbl (2 slot) + * 4096 byte block: 128 entry tbl (4 slot) + * + * data area: + * 512 byte block: 16 - 2 = 14 slot + * 1024 byte block: 32 - 2 = 30 slot + * 2048 byte block: 64 - 3 = 61 slot + * 4096 byte block: 128 - 5 = 123 slot + * + * N.B. index is 0-based; index fields refer to slot index + * except nextindex which refers to entry index in stbl; + * end of entry stot list or freelist is marked with -1. + */ +typedef union { + struct { + __le64 next; /* 8: next sibling */ + __le64 prev; /* 8: previous sibling */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next entry index in stbl */ + s8 freecnt; /* 1: */ + s8 freelist; /* 1: slot index of head of freelist */ + + u8 maxslot; /* 1: number of slots in page slot[] */ + u8 stblindex; /* 1: slot index of start of stbl */ + u8 rsrvd[2]; /* 2: */ + + pxd_t self; /* 8: self pxd */ + } header; /* (32) */ + + struct dtslot slot[128]; +} dtpage_t; + +#define DTPAGEMAXSLOT 128 + +#define DT8THPGNODEBYTES 512 +#define DT8THPGNODETSLOTS 1 +#define DT8THPGNODESLOTS 16 + +#define DTQTRPGNODEBYTES 1024 +#define DTQTRPGNODETSLOTS 1 +#define DTQTRPGNODESLOTS 32 + +#define DTHALFPGNODEBYTES 2048 +#define DTHALFPGNODETSLOTS 2 +#define DTHALFPGNODESLOTS 64 + +#define DTFULLPGNODEBYTES 4096 +#define DTFULLPGNODETSLOTS 4 +#define DTFULLPGNODESLOTS 128 + +#define DTENTRYSTART 1 + +/* get sorted entry table of the page */ +#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\ + ((dtroot_t *)(p))->header.stbl : \ + (s8 *)&(p)->slot[(p)->header.stblindex] ) + +/* + * Flags for dtSearch + */ +#define JFS_CREATE 1 +#define JFS_LOOKUP 2 +#define JFS_REMOVE 3 +#define JFS_RENAME 4 + +/* + * Maximum file offset for directories. + */ +#define DIREND INT_MAX + +/* + * external declarations + */ +extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot); + +extern int dtSearch(struct inode *ip, struct component_name * key, + ino_t * data, struct btstack * btstack, int flag); + +extern int dtInsert(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * ino, struct btstack * btstack); + +extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * data, int flag); + +extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * orig_ino, ino_t new_ino, int flag); + +extern int jfs_readdir(struct file *file, struct dir_context *ctx); +#endif /* !_H_JFS_DTREE */ diff --git a/kernel/fs/jfs/jfs_extent.c b/kernel/fs/jfs/jfs_extent.c new file mode 100644 index 000000000..2ae7d59ab --- /dev/null +++ b/kernel/fs/jfs/jfs_extent.c @@ -0,0 +1,651 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_extent.h" +#include "jfs_debug.h" + +/* + * forward references + */ +static int extBalloc(struct inode *, s64, s64 *, s64 *); +#ifdef _NOTYET +static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); +#endif +static s64 extRoundDown(s64 nb); + +#define DPD(a) (printk("(a): %d\n",(a))) +#define DPC(a) (printk("(a): %c\n",(a))) +#define DPL1(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x ",(a)); \ + else \ + printk("(a): %x ",(a) << 32); \ +} +#define DPL(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x\n",(a)); \ + else \ + printk("(a): %x\n",(a) << 32); \ +} + +#define DPD1(a) (printk("(a): %d ",(a))) +#define DPX(a) (printk("(a): %08x\n",(a))) +#define DPX1(a) (printk("(a): %08x ",(a))) +#define DPS(a) (printk("%s\n",(a))) +#define DPE(a) (printk("\nENTERING: %s\n",(a))) +#define DPE1(a) (printk("\nENTERING: %s",(a))) +#define DPS1(a) (printk(" %s ",(a))) + + +/* + * NAME: extAlloc() + * + * FUNCTION: allocate an extent for a specified page range within a + * file. + * + * PARAMETERS: + * ip - the inode of the file. + * xlen - requested extent length. + * pno - the starting page number with the file. + * xp - pointer to an xad. on entry, xad describes an + * extent that is used as an allocation hint if the + * xaddr of the xad is non-zero. on successful exit, + * the xad describes the newly allocated extent. + * abnr - bool indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int +extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nxlen, nxaddr, xoff, hint, xaddr = 0; + int rc; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + /* Avoid race with jfs_commit_inode() */ + mutex_lock(&JFS_IP(ip)->commit_mutex); + + /* validate extent length */ + if (xlen > MAXXLEN) + xlen = MAXXLEN; + + /* get the page's starting extent offset */ + xoff = pno << sbi->l2nbperpage; + + /* check if an allocation hint was provided */ + if ((hint = addressXAD(xp))) { + /* get the size of the extent described by the hint */ + nxlen = lengthXAD(xp); + + /* check if the hint is for the portion of the file + * immediately previous to the current allocation + * request and if hint extent has the same abnr + * value as the current request. if so, we can + * extend the hint extent to include the current + * extent if we can allocate the blocks immediately + * following the hint extent. + */ + if (offsetXAD(xp) + nxlen == xoff && + abnr == ((xp->flag & XAD_NOTRECORDED) ? true : false)) + xaddr = hint + nxlen; + + /* adjust the hint to the last block of the extent */ + hint += (nxlen - 1); + } + + /* allocate the disk blocks for the extent. initially, extBalloc() + * will try to allocate disk blocks for the requested size (xlen). + * if this fails (xlen contiguous free blocks not available), it'll + * try to allocate a smaller number of blocks (producing a smaller + * extent), with this smaller number of blocks consisting of the + * requested number of blocks rounded down to the next smaller + * power of 2 number (i.e. 16 -> 8). it'll continue to round down + * and retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + */ + nxlen = xlen; + if ((rc = extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) { + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); + } + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, nxlen); + if (rc) { + dbFree(ip, nxaddr, (s64) nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; + } + + /* determine the value of the extent flag */ + xflag = abnr ? XAD_NOTRECORDED : 0; + + /* if we can extend the hint extent to cover the current request, + * extend it. otherwise, insert a new extent to + * cover the current request. + */ + if (xaddr && xaddr == nxaddr) + rc = xtExtend(0, ip, xoff, (int) nxlen, 0); + else + rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0); + + /* if the extend or insert failed, + * free the newly allocated blocks and return the error. + */ + if (rc) { + dbFree(ip, nxaddr, nxlen); + dquot_free_block(ip, nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); + } + + /* set the results of the extent allocation */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + /* + * COMMIT_SyncList flags an anonymous tlock on page that is on + * sync list. + * We need to commit the inode to get the page written disk. + */ + if (test_and_clear_cflag(COMMIT_Synclist,ip)) + jfs_commit_inode(ip, 0); + + return (0); +} + + +#ifdef _NOTYET +/* + * NAME: extRealloc() + * + * FUNCTION: extend the allocation of a file extent containing a + * partial back last page. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf for the partial backed last page. + * xlen - request size of the resulting extent. + * xp - pointer to an xad. on successful exit, the xad + * describes the newly allocated extent. + * abnr - bool indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) +{ + struct super_block *sb = ip->i_sb; + s64 xaddr, xlen, nxaddr, delta, xoff; + s64 ntail, nextend, ninsert; + int rc, nbperpage = JFS_SBI(sb)->nbperpage; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + mutex_lock(&JFS_IP(ip)->commit_mutex); + /* validate extent length */ + if (nxlen > MAXXLEN) + nxlen = MAXXLEN; + + /* get the extend (partial) page's disk block address and + * number of blocks. + */ + xaddr = addressXAD(xp); + xlen = lengthXAD(xp); + xoff = offsetXAD(xp); + + /* if the extend page is abnr and if the request is for + * the extent to be allocated and recorded, + * make the page allocated and recorded. + */ + if ((xp->flag & XAD_NOTRECORDED) && !abnr) { + xp->flag = 0; + if ((rc = xtUpdate(0, ip, xp))) + goto exit; + } + + /* try to allocated the request number of blocks for the + * extent. dbRealloc() first tries to satisfy the request + * by extending the allocation in place. otherwise, it will + * try to allocate a new set of blocks large enough for the + * request. in satisfying a request, dbReAlloc() may allocate + * less than what was request but will always allocate enough + * space as to satisfy the extend page. + */ + if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr))) + goto exit; + + /* Allocat blocks to quota. */ + rc = dquot_alloc_block(ip, nxlen); + if (rc) { + dbFree(ip, nxaddr, (s64) nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; + } + + delta = nxlen - xlen; + + /* check if the extend page is not abnr but the request is abnr + * and the allocated disk space is for more than one page. if this + * is the case, there is a miss match of abnr between the extend page + * and the one or more pages following the extend page. as a result, + * two extents will have to be manipulated. the first will be that + * of the extent of the extend page and will be manipulated thru + * an xtExtend() or an xtTailgate(), depending upon whether the + * disk allocation occurred as an inplace extension. the second + * extent will be manipulated (created) through an xtInsert() and + * will be for the pages following the extend page. + */ + if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) { + ntail = nbperpage; + nextend = ntail - xlen; + ninsert = nxlen - nbperpage; + + xflag = XAD_NOTRECORDED; + } else { + ntail = nxlen; + nextend = delta; + ninsert = 0; + + xflag = xp->flag; + } + + /* if we were able to extend the disk allocation in place, + * extend the extent. otherwise, move the extent to a + * new disk location. + */ + if (xaddr == nxaddr) { + /* extend the extent */ + if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { + dbFree(ip, xaddr + xlen, delta); + dquot_free_block(ip, nxlen); + goto exit; + } + } else { + /* + * move the extent to a new location: + * + * xtTailgate() accounts for relocated tail extent; + */ + if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { + dbFree(ip, nxaddr, nxlen); + dquot_free_block(ip, nxlen); + goto exit; + } + } + + + /* check if we need to also insert a new extent */ + if (ninsert) { + /* perform the insert. if it fails, free the blocks + * to be inserted and make it appear that we only did + * the xtExtend() or xtTailgate() above. + */ + xaddr = nxaddr + ntail; + if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert, + &xaddr, 0)) { + dbFree(ip, xaddr, (s64) ninsert); + delta = nextend; + nxlen = ntail; + xflag = 0; + } + } + + /* set the return results */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); +exit: + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); +} +#endif /* _NOTYET */ + + +/* + * NAME: extHint() + * + * FUNCTION: produce an extent allocation hint for a file offset. + * + * PARAMETERS: + * ip - the inode of the file. + * offset - file offset for which the hint is needed. + * xp - pointer to the xad that is to be filled in with + * the hint. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + */ +int extHint(struct inode *ip, s64 offset, xad_t * xp) +{ + struct super_block *sb = ip->i_sb; + int nbperpage = JFS_SBI(sb)->nbperpage; + s64 prev; + int rc = 0; + s64 xaddr; + int xlen; + int xflag; + + /* init the hint as "no hint provided" */ + XADaddress(xp, 0); + + /* determine the starting extent offset of the page previous + * to the page containing the offset. + */ + prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage; + + /* if the offset is in the first page of the file, no hint provided. + */ + if (prev < 0) + goto out; + + rc = xtLookup(ip, prev, nbperpage, &xflag, &xaddr, &xlen, 0); + + if ((rc == 0) && xlen) { + if (xlen != nbperpage) { + jfs_error(ip->i_sb, "corrupt xtree\n"); + rc = -EIO; + } + XADaddress(xp, xaddr); + XADlength(xp, xlen); + XADoffset(xp, prev); + /* + * only preserve the abnr flag within the xad flags + * of the returned hint. + */ + xp->flag = xflag & XAD_NOTRECORDED; + } else + rc = 0; + +out: + return (rc); +} + + +/* + * NAME: extRecord() + * + * FUNCTION: change a page with a file from not recorded to recorded. + * + * PARAMETERS: + * ip - inode of the file. + * cp - cbuf of the file page. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extRecord(struct inode *ip, xad_t * xp) +{ + int rc; + + txBeginAnon(ip->i_sb); + + mutex_lock(&JFS_IP(ip)->commit_mutex); + + /* update the extent */ + rc = xtUpdate(0, ip, xp); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; +} + + +#ifdef _NOTYET +/* + * NAME: extFill() + * + * FUNCTION: allocate disk space for a file page that represents + * a file hole. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf of the file page represent the hole. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extFill(struct inode *ip, xad_t * xp) +{ + int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; + s64 blkno = offsetXAD(xp) >> ip->i_blkbits; + +// assert(ISSPARSE(ip)); + + /* initialize the extent allocation hint */ + XADaddress(xp, 0); + + /* allocate an extent to fill the hole */ + if ((rc = extAlloc(ip, nbperpage, blkno, xp, false))) + return (rc); + + assert(lengthPXD(xp) == nbperpage); + + return (0); +} +#endif /* _NOTYET */ + + +/* + * NAME: extBalloc() + * + * FUNCTION: allocate disk blocks to form an extent. + * + * initially, we will try to allocate disk blocks for the + * requested size (nblocks). if this fails (nblocks + * contiguous free blocks not available), we'll try to allocate + * a smaller number of blocks (producing a smaller extent), with + * this smaller number of blocks consisting of the requested + * number of blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). we'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * hint - disk block number to be used as an allocation hint. + * *nblocks - pointer to an s64 value. on entry, this value specifies + * the desired number of block to be allocated. on successful + * exit, this value is set to the number of blocks actually + * allocated. + * blkno - pointer to a block address that is filled in on successful + * return with the starting block number of the newly + * allocated block range. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +static int +extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nb, nblks, daddr, max; + int rc, nbperpage = sbi->nbperpage; + struct bmap *bmp = sbi->bmap; + int ag; + + /* get the number of blocks to initially attempt to allocate. + * we'll first try the number of blocks requested unless this + * number is greater than the maximum number of contiguous free + * blocks in the map. in that case, we'll start off with the + * maximum free. + */ + max = (s64) 1 << bmp->db_maxfreebud; + if (*nblocks >= max && *nblocks > nbperpage) + nb = nblks = (max > nbperpage) ? max : nbperpage; + else + nb = nblks = *nblocks; + + /* try to allocate blocks */ + while ((rc = dbAlloc(ip, hint, nb, &daddr)) != 0) { + /* if something other than an out of space error, + * stop and return this error. + */ + if (rc != -ENOSPC) + return (rc); + + /* decrease the allocation request size */ + nb = min(nblks, extRoundDown(nb)); + + /* give up if we cannot cover a page */ + if (nb < nbperpage) + return (rc); + } + + *nblocks = nb; + *blkno = daddr; + + if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) { + ag = BLKTOAG(daddr, sbi); + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag == -1) { + atomic_inc(&bmp->db_active[ag]); + ji->active_ag = ag; + } else if (ji->active_ag != ag) { + atomic_dec(&bmp->db_active[ji->active_ag]); + atomic_inc(&bmp->db_active[ag]); + ji->active_ag = ag; + } + spin_unlock_irq(&ji->ag_lock); + } + + return (0); +} + + +#ifdef _NOTYET +/* + * NAME: extBrealloc() + * + * FUNCTION: attempt to extend an extent's allocation. + * + * Initially, we will try to extend the extent's allocation + * in place. If this fails, we'll try to move the extent + * to a new set of blocks. If moving the extent, we initially + * will try to allocate disk blocks for the requested size + * (newnblks). if this fails (new contiguous free blocks not + * available), we'll try to allocate a smaller number of + * blocks (producing a smaller extent), with this smaller + * number of blocks consisting of the requested number of + * blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). We'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * blkno - starting block number of the extents current allocation. + * nblks - number of blocks within the extents current allocation. + * newnblks - pointer to a s64 value. on entry, this value is the + * the new desired extent size (number of blocks). on + * successful exit, this value is set to the extent's actual + * new size (new number of blocks). + * newblkno - the starting block number of the extents new allocation. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +static int +extBrealloc(struct inode *ip, + s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno) +{ + int rc; + + /* try to extend in place */ + if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) { + *newblkno = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* in place extension not possible. + * try to move the extent to a new set of blocks. + */ + return (extBalloc(ip, blkno, newnblks, newblkno)); +} +#endif /* _NOTYET */ + + +/* + * NAME: extRoundDown() + * + * FUNCTION: round down a specified number of blocks to the next + * smallest power of 2 number. + * + * PARAMETERS: + * nb - the inode of the file. + * + * RETURN VALUES: + * next smallest power of 2 number. + */ +static s64 extRoundDown(s64 nb) +{ + int i; + u64 m, k; + + for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) { + if (m & nb) + break; + } + + i = 63 - i; + k = (u64) 1 << i; + k = ((k - 1) & nb) ? k : k >> 1; + + return (k); +} diff --git a/kernel/fs/jfs/jfs_extent.h b/kernel/fs/jfs/jfs_extent.h new file mode 100644 index 000000000..b567e12c5 --- /dev/null +++ b/kernel/fs/jfs/jfs_extent.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_EXTENT +#define _H_JFS_EXTENT + +/* get block allocation allocation hint as location of disk inode */ +#define INOHINT(ip) \ + (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) + +extern int extAlloc(struct inode *, s64, s64, xad_t *, bool); +extern int extFill(struct inode *, xad_t *); +extern int extHint(struct inode *, s64, xad_t *); +extern int extRealloc(struct inode *, s64, xad_t *, bool); +extern int extRecord(struct inode *, xad_t *); + +#endif /* _H_JFS_EXTENT */ diff --git a/kernel/fs/jfs/jfs_filsys.h b/kernel/fs/jfs/jfs_filsys.h new file mode 100644 index 000000000..b67d64671 --- /dev/null +++ b/kernel/fs/jfs/jfs_filsys.h @@ -0,0 +1,285 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_FILSYS +#define _H_JFS_FILSYS + +/* + * jfs_filsys.h + * + * file system (implementation-dependent) constants + * + * refer to for system wide implementation-dependent constants + */ + +/* + * file system option (superblock flag) + */ + +/* directory option */ +#define JFS_UNICODE 0x00000001 /* unicode name */ + +/* mount time flags for error handling */ +#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ +#define JFS_ERR_CONTINUE 0x00000004 /* continue */ +#define JFS_ERR_PANIC 0x00000008 /* panic */ + +/* Quota support */ +#define JFS_USRQUOTA 0x00000010 +#define JFS_GRPQUOTA 0x00000020 + +/* mount time flag to disable journaling to disk */ +#define JFS_NOINTEGRITY 0x00000040 + +/* mount time flag to enable TRIM to ssd disks */ +#define JFS_DISCARD 0x00000080 + +/* commit option */ +#define JFS_COMMIT 0x00000f00 /* commit option mask */ +#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ +#define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */ +#define JFS_TMPFS 0x00000400 /* temporary file system - + * do not log/commit: + * Never implemented + */ + +/* log logical volume option */ +#define JFS_INLINELOG 0x00000800 /* inline log within file system */ +#define JFS_INLINEMOVE 0x00001000 /* inline log being moved */ + +/* Secondary aggregate inode table */ +#define JFS_BAD_SAIT 0x00010000 /* current secondary ait is bad */ + +/* sparse regular file support */ +#define JFS_SPARSE 0x00020000 /* sparse regular file */ + +/* DASD Limits F226941 */ +#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */ +#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */ + +/* big endian flag */ +#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */ + +/* Directory index */ +#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */ + +/* platform options */ +#define JFS_LINUX 0x10000000 /* Linux support */ +#define JFS_DFS 0x20000000 /* DCE DFS LFS support */ +/* Never implemented */ + +#define JFS_OS2 0x40000000 /* OS/2 support */ +/* case-insensitive name/directory support */ + +#define JFS_AIX 0x80000000 /* AIX support */ + +/* + * buffer cache configuration + */ +/* page size */ +#ifdef PSIZE +#undef PSIZE +#endif +#define PSIZE 4096 /* page size (in byte) */ +#define L2PSIZE 12 /* log2(PSIZE) */ +#define POFFSET 4095 /* offset within page */ + +/* buffer page size */ +#define BPSIZE PSIZE + +/* + * fs fundamental size + * + * PSIZE >= file system block size >= PBSIZE >= DISIZE + */ +#define PBSIZE 512 /* physical block size (in byte) */ +#define L2PBSIZE 9 /* log2(PBSIZE) */ + +#define DISIZE 512 /* on-disk inode size (in byte) */ +#define L2DISIZE 9 /* log2(DISIZE) */ + +#define IDATASIZE 256 /* inode inline data size */ +#define IXATTRSIZE 128 /* inode inline extended attribute size */ + +#define XTPAGE_SIZE 4096 +#define log2_PAGESIZE 12 + +#define IAG_SIZE 4096 +#define IAG_EXTENT_SIZE 4096 +#define INOSPERIAG 4096 /* number of disk inodes per iag */ +#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ +#define INOSPEREXT 32 /* number of disk inode per extent */ +#define L2INOSPEREXT 5 /* l2 number of disk inode per extent */ +#define IXSIZE (DISIZE * INOSPEREXT) /* inode extent size */ +#define INOSPERPAGE 8 /* number of disk inodes per 4K page */ +#define L2INOSPERPAGE 3 /* log2(INOSPERPAGE) */ + +#define IAGFREELIST_LWM 64 + +#define INODE_EXTENT_SIZE IXSIZE /* inode extent size */ +#define NUM_INODE_PER_EXTENT INOSPEREXT +#define NUM_INODE_PER_IAG INOSPERIAG + +#define MINBLOCKSIZE 512 +#define MAXBLOCKSIZE 4096 +#define MAXFILESIZE ((s64)1 << 52) + +#define JFS_LINK_MAX 0xffffffff + +/* Minimum number of bytes supported for a JFS partition */ +#define MINJFS (0x1000000) +#define MINJFSTEXT "16" + +/* + * file system block size -> physical block size + */ +#define LBOFFSET(x) ((x) & (PBSIZE - 1)) +#define LBNUMBER(x) ((x) >> L2PBSIZE) +#define LBLK2PBLK(sb,b) ((b) << (sb->s_blocksize_bits - L2PBSIZE)) +#define PBLK2LBLK(sb,b) ((b) >> (sb->s_blocksize_bits - L2PBSIZE)) +/* size in byte -> last page number */ +#define SIZE2PN(size) ( ((s64)((size) - 1)) >> (L2PSIZE) ) +/* size in byte -> last file system block number */ +#define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) ) + +/* + * fixed physical block address (physical block size = 512 byte) + * + * NOTE: since we can't guarantee a physical block size of 512 bytes the use of + * these macros should be removed and the byte offset macros used instead. + */ +#define SUPER1_B 64 /* primary superblock */ +#define AIMAP_B (SUPER1_B + 8) /* 1st extent of aggregate inode map */ +#define AITBL_B (AIMAP_B + 16) /* + * 1st extent of aggregate inode table + */ +#define SUPER2_B (AITBL_B + 32) /* 2ndary superblock pbn */ +#define BMAP_B (SUPER2_B + 8) /* block allocation map */ + +/* + * SIZE_OF_SUPER defines the total amount of space reserved on disk for the + * superblock. This is not the same as the superblock structure, since all of + * this space is not currently being used. + */ +#define SIZE_OF_SUPER PSIZE + +/* + * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table + */ +#define SIZE_OF_AG_TABLE PSIZE + +/* + * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of + * the inode allocation map (to hold iag) + */ +#define SIZE_OF_MAP_PAGE PSIZE + +/* + * fixed byte offset address + */ +#define SUPER1_OFF 0x8000 /* primary superblock */ +#define AIMAP_OFF (SUPER1_OFF + SIZE_OF_SUPER) + /* + * Control page of aggregate inode map + * followed by 1st extent of map + */ +#define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1)) + /* + * 1st extent of aggregate inode table + */ +#define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE) + /* + * secondary superblock + */ +#define BMAP_OFF (SUPER2_OFF + SIZE_OF_SUPER) + /* + * block allocation map + */ + +/* + * The following macro is used to indicate the number of reserved disk blocks at + * the front of an aggregate, in terms of physical blocks. This value is + * currently defined to be 32K. This turns out to be the same as the primary + * superblock's address, since it directly follows the reserved blocks. + */ +#define AGGR_RSVD_BLOCKS SUPER1_B + +/* + * The following macro is used to indicate the number of reserved bytes at the + * front of an aggregate. This value is currently defined to be 32K. This + * turns out to be the same as the primary superblock's byte offset, since it + * directly follows the reserved blocks. + */ +#define AGGR_RSVD_BYTES SUPER1_OFF + +/* + * The following macro defines the byte offset for the first inode extent in + * the aggregate inode table. This allows us to find the self inode to find the + * rest of the table. Currently this value is 44K. + */ +#define AGGR_INODE_TABLE_START AITBL_OFF + +/* + * fixed reserved inode number + */ +/* aggregate inode */ +#define AGGR_RESERVED_I 0 /* aggregate inode (reserved) */ +#define AGGREGATE_I 1 /* aggregate inode map inode */ +#define BMAP_I 2 /* aggregate block allocation map inode */ +#define LOG_I 3 /* aggregate inline log inode */ +#define BADBLOCK_I 4 /* aggregate bad block inode */ +#define FILESYSTEM_I 16 /* 1st/only fileset inode in ait: + * fileset inode map inode + */ + +/* per fileset inode */ +#define FILESET_RSVD_I 0 /* fileset inode (reserved) */ +#define FILESET_EXT_I 1 /* fileset inode extension */ +#define ROOT_I 2 /* fileset root inode */ +#define ACL_I 3 /* fileset ACL inode */ + +#define FILESET_OBJECT_I 4 /* the first fileset inode available for a file + * or directory or link... + */ +#define FIRST_FILESET_INO 16 /* the first aggregate inode which describes + * an inode. (To fsck this is also the first + * inode in part 2 of the agg inode table.) + */ + +/* + * directory configuration + */ +#define JFS_NAME_MAX 255 +#define JFS_PATH_MAX BPSIZE + + +/* + * file system state (superblock state) + */ +#define FM_CLEAN 0x00000000 /* file system is unmounted and clean */ +#define FM_MOUNT 0x00000001 /* file system is mounted cleanly */ +#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean + * when mounted or + * commit failure occurred while being mounted: + * fsck() must be run to repair + */ +#define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed: + * fsck() must be run to repair + */ +#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ + +#endif /* _H_JFS_FILSYS */ diff --git a/kernel/fs/jfs/jfs_imap.c b/kernel/fs/jfs/jfs_imap.c new file mode 100644 index 000000000..f321986e7 --- /dev/null +++ b/kernel/fs/jfs/jfs_imap.c @@ -0,0 +1,3178 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_imap.c: inode allocation map manager + * + * Serialization: + * Each AG has a simple lock which is used to control the serialization of + * the AG level lists. This lock should be taken first whenever an AG + * level list will be modified or accessed. + * + * Each IAG is locked by obtaining the buffer for the IAG page. + * + * There is also a inode lock for the inode map inode. A read lock needs to + * be taken whenever an IAG is read from the map or the global level + * information is read. A write lock needs to be taken whenever the global + * level information is modified or an atomic operation needs to be used. + * + * If more than one IAG is read at one time, the read lock may not + * be given up until all of the IAG's are read. Otherwise, a deadlock + * may occur when trying to obtain the read lock while another thread + * holding the read lock is waiting on the IAG already being held. + * + * The control page of the inode map is read into memory by diMount(). + * Thereafter it should only be modified in memory and then it will be + * written out when the filesystem is unmounted by diUnmount(). + */ + +#include +#include +#include +#include +#include + +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_dinode.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * imap locks + */ +/* iag free list lock */ +#define IAGFREE_LOCK_INIT(imap) mutex_init(&imap->im_freelock) +#define IAGFREE_LOCK(imap) mutex_lock(&imap->im_freelock) +#define IAGFREE_UNLOCK(imap) mutex_unlock(&imap->im_freelock) + +/* per ag iag list locks */ +#define AG_LOCK_INIT(imap,index) mutex_init(&(imap->im_aglock[index])) +#define AG_LOCK(imap,agno) mutex_lock(&imap->im_aglock[agno]) +#define AG_UNLOCK(imap,agno) mutex_unlock(&imap->im_aglock[agno]) + +/* + * forward references + */ +static int diAllocAG(struct inomap *, int, bool, struct inode *); +static int diAllocAny(struct inomap *, int, bool, struct inode *); +static int diAllocBit(struct inomap *, struct iag *, int); +static int diAllocExt(struct inomap *, int, struct inode *); +static int diAllocIno(struct inomap *, int, struct inode *); +static int diFindFree(u32, int); +static int diNewExt(struct inomap *, struct iag *, int); +static int diNewIAG(struct inomap *, int *, int, struct metapage **); +static void duplicateIXtree(struct super_block *, s64, int, s64 *); + +static int diIAGRead(struct inomap * imap, int, struct metapage **); +static int copy_from_dinode(struct dinode *, struct inode *); +static void copy_to_dinode(struct dinode *, struct inode *); + +/* + * NAME: diMount() + * + * FUNCTION: initialize the incore inode map control structures for + * a fileset or aggregate init time. + * + * the inode map's control structure (dinomap) is + * brought in from disk and placed in virtual memory. + * + * PARAMETERS: + * ipimap - pointer to inode map inode for the aggregate or fileset. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. + */ +int diMount(struct inode *ipimap) +{ + struct inomap *imap; + struct metapage *mp; + int index; + struct dinomap_disk *dinom_le; + + /* + * allocate/initialize the in-memory inode map control structure + */ + /* allocate the in-memory inode map control structure. */ + imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); + if (imap == NULL) { + jfs_err("diMount: kmalloc returned NULL!"); + return -ENOMEM; + } + + /* read the on-disk inode map control structure. */ + + mp = read_metapage(ipimap, + IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(imap); + return -EIO; + } + + /* copy the on-disk version to the in-memory version. */ + dinom_le = (struct dinomap_disk *) mp->data; + imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag); + imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag); + atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos)); + atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree)); + imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext); + imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + imap->im_agctl[index].inofree = + le32_to_cpu(dinom_le->in_agctl[index].inofree); + imap->im_agctl[index].extfree = + le32_to_cpu(dinom_le->in_agctl[index].extfree); + imap->im_agctl[index].numinos = + le32_to_cpu(dinom_le->in_agctl[index].numinos); + imap->im_agctl[index].numfree = + le32_to_cpu(dinom_le->in_agctl[index].numfree); + } + + /* release the buffer. */ + release_metapage(mp); + + /* + * allocate/initialize inode allocation map locks + */ + /* allocate and init iag free list lock */ + IAGFREE_LOCK_INIT(imap); + + /* allocate and init ag list locks */ + for (index = 0; index < MAXAG; index++) { + AG_LOCK_INIT(imap, index); + } + + /* bind the inode map inode and inode map control structure + * to each other. + */ + imap->im_ipimap = ipimap; + JFS_IP(ipimap)->i_imap = imap; + + return (0); +} + + +/* + * NAME: diUnmount() + * + * FUNCTION: write to disk the incore inode map control structures for + * a fileset or aggregate at unmount time. + * + * PARAMETERS: + * ipimap - pointer to inode map inode for the aggregate or fileset. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. + */ +int diUnmount(struct inode *ipimap, int mounterror) +{ + struct inomap *imap = JFS_IP(ipimap)->i_imap; + + /* + * update the on-disk inode map control structure + */ + + if (!(mounterror || isReadOnly(ipimap))) + diSync(ipimap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipimap->i_mapping, 0); + + /* + * free in-memory control structure + */ + kfree(imap); + + return (0); +} + + +/* + * diSync() + */ +int diSync(struct inode *ipimap) +{ + struct dinomap_disk *dinom_le; + struct inomap *imp = JFS_IP(ipimap)->i_imap; + struct metapage *mp; + int index; + + /* + * write imap global conrol page + */ + /* read the on-disk inode map control structure */ + mp = get_metapage(ipimap, + IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("diSync: get_metapage failed!"); + return -EIO; + } + + /* copy the in-memory version to the on-disk version */ + dinom_le = (struct dinomap_disk *) mp->data; + dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag); + dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag); + dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos)); + dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree)); + dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext); + dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + dinom_le->in_agctl[index].inofree = + cpu_to_le32(imp->im_agctl[index].inofree); + dinom_le->in_agctl[index].extfree = + cpu_to_le32(imp->im_agctl[index].extfree); + dinom_le->in_agctl[index].numinos = + cpu_to_le32(imp->im_agctl[index].numinos); + dinom_le->in_agctl[index].numfree = + cpu_to_le32(imp->im_agctl[index].numfree); + } + + /* write out the control structure */ + write_metapage(mp); + + /* + * write out dirty pages of imap + */ + filemap_write_and_wait(ipimap->i_mapping); + + diWriteSpecial(ipimap, 0); + + return (0); +} + + +/* + * NAME: diRead() + * + * FUNCTION: initialize an incore inode from disk. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * this routine handles incore inode initialization for + * both "special" and "regular" inodes. special inodes + * are those required early in the mount process and + * require special handling since much of the file system + * is not yet initialized. these "special" inodes are + * identified by a NULL inode map inode pointer and are + * actually initialized by a call to diReadSpecial(). + * + * for regular inodes, the iag describing the disk inode + * is read from disk to determine the inode extent address + * for the disk inode. with the inode extent address in + * hand, the page of the extent that contains the disk + * inode is read and the disk inode is copied to the + * incore inode. + * + * PARAMETERS: + * ip - pointer to incore inode to be initialized from disk. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOMEM - insufficient memory + * + */ +int diRead(struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int iagno, ino, extno, rc; + struct inode *ipimap; + struct dinode *dp; + struct iag *iagp; + struct metapage *mp; + s64 blkno, agstart; + struct inomap *imap; + int block_offset; + int inodes_left; + unsigned long pageno; + int rel_inode; + + jfs_info("diRead: ino = %ld", ip->i_ino); + + ipimap = sbi->ipimap; + JFS_IP(ip)->ipimap = ipimap; + + /* determine the iag number for this inode (number) */ + iagno = INOTOIAG(ip->i_ino); + + /* read the iag */ + imap = JFS_IP(ipimap)->i_imap; + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) { + jfs_err("diRead: diIAGRead returned %d", rc); + return (rc); + } + + iagp = (struct iag *) mp->data; + + /* determine inode extent that holds the disk inode */ + ino = ip->i_ino & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + + if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + release_metapage(mp); + return -ESTALE; + } + + /* get disk block number of the page within the inode extent + * that holds the disk inode. + */ + blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage); + + /* get the ag for the iag */ + agstart = le64_to_cpu(iagp->agstart); + + release_metapage(mp); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + + /* read the page of disk inode */ + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (!mp) { + jfs_err("diRead: read_metapage failed"); + return -EIO; + } + + /* locate the disk inode requested */ + dp = (struct dinode *) mp->data; + dp += rel_inode; + + if (ip->i_ino != le32_to_cpu(dp->di_number)) { + jfs_error(ip->i_sb, "i_ino != di_number\n"); + rc = -EIO; + } else if (le32_to_cpu(dp->di_nlink) == 0) + rc = -ESTALE; + else + /* copy the disk inode to the in-memory inode */ + rc = copy_from_dinode(dp, ip); + + release_metapage(mp); + + /* set the ag for the inode */ + JFS_IP(ip)->agstart = agstart; + JFS_IP(ip)->active_ag = -1; + + return (rc); +} + + +/* + * NAME: diReadSpecial() + * + * FUNCTION: initialize a 'special' inode from disk. + * + * this routines handles aggregate level inodes. The + * inode cache cannot differentiate between the + * aggregate inodes and the filesystem inodes, so we + * handle these here. We don't actually use the aggregate + * inode map, since these inodes are at a fixed location + * and in some cases the aggregate inode map isn't initialized + * yet. + * + * PARAMETERS: + * sb - filesystem superblock + * inum - aggregate inode number + * secondary - 1 if secondary aggregate inode table + * + * RETURN VALUES: + * new inode - success + * NULL - i/o error. + */ +struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + uint address; + struct dinode *dp; + struct inode *ip; + struct metapage *mp; + + ip = new_inode(sb); + if (ip == NULL) { + jfs_err("diReadSpecial: new_inode returned NULL!"); + return ip; + } + + if (secondary) { + address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + JFS_IP(ip)->ipimap = sbi->ipaimap2; + } else { + address = AITBL_OFF >> L2PSIZE; + JFS_IP(ip)->ipimap = sbi->ipaimap; + } + + ASSERT(inum < INOSPEREXT); + + ip->i_ino = inum; + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + set_nlink(ip, 1); /* Don't want iput() deleting it */ + iput(ip); + return (NULL); + } + + /* get the pointer to the disk inode of interest */ + dp = (struct dinode *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + if ((copy_from_dinode(dp, ip)) != 0) { + /* handle bad return by returning NULL for ip */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ + iput(ip); + /* release the page */ + release_metapage(mp); + return (NULL); + + } + + ip->i_mapping->a_ops = &jfs_metapage_aops; + mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS); + + /* Allocations to metadata inodes should not affect quotas */ + ip->i_flags |= S_NOQUOTA; + + if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) { + sbi->gengen = le32_to_cpu(dp->di_gengen); + sbi->inostamp = le32_to_cpu(dp->di_inostamp); + } + + /* release the page */ + release_metapage(mp); + + /* + * __mark_inode_dirty expects inodes to be hashed. Since we don't + * want special inodes in the fileset inode space, we make them + * appear hashed, but do not put on any lists. hlist_del() + * will work fine and require no locking. + */ + hlist_add_fake(&ip->i_hash); + + return (ip); +} + +/* + * NAME: diWriteSpecial() + * + * FUNCTION: Write the special inode to disk + * + * PARAMETERS: + * ip - special inode + * secondary - 1 if secondary aggregate inode table + * + * RETURN VALUES: none + */ + +void diWriteSpecial(struct inode *ip, int secondary) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + uint address; + struct dinode *dp; + ino_t inum = ip->i_ino; + struct metapage *mp; + + if (secondary) + address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + else + address = AITBL_OFF >> L2PSIZE; + + ASSERT(inum < INOSPEREXT); + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + jfs_err("diWriteSpecial: failed to read aggregate inode " + "extent!"); + return; + } + + /* get the pointer to the disk inode of interest */ + dp = (struct dinode *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + copy_to_dinode(dp, ip); + memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288); + + if (inum == FILESYSTEM_I) + dp->di_gengen = cpu_to_le32(sbi->gengen); + + /* write the page */ + write_metapage(mp); +} + +/* + * NAME: diFreeSpecial() + * + * FUNCTION: Free allocated space for special inode + */ +void diFreeSpecial(struct inode *ip) +{ + if (ip == NULL) { + jfs_err("diFreeSpecial called with NULL ip!"); + return; + } + filemap_write_and_wait(ip->i_mapping); + truncate_inode_pages(ip->i_mapping, 0); + iput(ip); +} + + + +/* + * NAME: diWrite() + * + * FUNCTION: write the on-disk inode portion of the in-memory inode + * to its corresponding on-disk inode. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * the inode contains the inode extent address for the disk + * inode. with the inode extent address in hand, the + * page of the extent that contains the disk inode is + * read and the disk inode portion of the incore inode + * is copied to the disk inode. + * + * PARAMETERS: + * tid - transacation id + * ip - pointer to incore inode to be written to the inode extent. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + */ +int diWrite(tid_t tid, struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + s32 ino; + struct dinode *dp; + s64 blkno; + int block_offset; + int inodes_left; + struct metapage *mp; + unsigned long pageno; + int rel_inode; + int dioffset; + struct inode *ipimap; + uint type; + lid_t lid; + struct tlock *ditlck, *tlck; + struct linelock *dilinelock, *ilinelock; + struct lv *lv; + int n; + + ipimap = jfs_ip->ipimap; + + ino = ip->i_ino & (INOSPERIAG - 1); + + if (!addressPXD(&(jfs_ip->ixpxd)) || + (lengthPXD(&(jfs_ip->ixpxd)) != + JFS_IP(ipimap)->i_imap->im_nbperiext)) { + jfs_error(ip->i_sb, "ixpxd invalid\n"); + return -EIO; + } + + /* + * read the page of disk inode containing the specified inode: + */ + /* compute the block address of the page */ + blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + /* read the page of disk inode */ + retry: + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (!mp) + return -EIO; + + /* get the pointer to the disk inode */ + dp = (struct dinode *) mp->data; + dp += rel_inode; + + dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE; + + /* + * acquire transaction lock on the on-disk inode; + * N.B. tlock is acquired on ipimap not ip; + */ + if ((ditlck = + txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL) + goto retry; + dilinelock = (struct linelock *) & ditlck->lock; + + /* + * copy btree root from in-memory inode to on-disk inode + * + * (tlock is taken from inline B+-tree root in in-memory + * inode when the B+-tree root is updated, which is pointed + * by jfs_ip->blid as well as being on tx tlock list) + * + * further processing of btree root is based on the copy + * in in-memory inode, where txLog() will log from, and, + * for xtree root, txUpdateMap() will update map and reset + * XAD_NEW bit; + */ + + if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) { + /* + * This is the special xtree inside the directory for storing + * the directory table + */ + xtpage_t *p, *xp; + xad_t *xad; + + jfs_ip->xtlid = 0; + tlck = lid_to_tlock(lid); + assert(tlck->type & tlckXTREE); + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (struct linelock *) & tlck->lock; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = (xtpage_t *) &dp->di_dirtable; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + + if ((lid = jfs_ip->blid) == 0) + goto inlineData; + jfs_ip->blid = 0; + + tlck = lid_to_tlock(lid); + type = tlck->type; + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (struct linelock *) & tlck->lock; + + /* + * regular file: 16 byte (XAD slot) granularity + */ + if (type & tlckXTREE) { + xtpage_t *p, *xp; + xad_t *xad; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = &dp->di_xtroot; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + /* + * directory: 32 byte (directory entry slot) granularity + */ + else if (type & tlckDTREE) { + dtpage_t *p, *xp; + + /* + * copy dtree root from inode to dinode: + */ + p = (dtpage_t *) &jfs_ip->i_dtroot; + xp = (dtpage_t *) & dp->di_dtroot; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->slot[lv->offset], &p->slot[lv->offset], + lv->length << L2DTSLOTSIZE); + } + } else { + jfs_err("diWrite: UFO tlock"); + } + + inlineData: + /* + * copy inline symlink from in-memory inode to on-disk inode + */ + if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) { + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; + lv->length = 2; + memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); + dilinelock->index++; + } + /* + * copy inline data from in-memory inode to on-disk inode: + * 128 byte slot granularity + */ + if (test_cflag(COMMIT_Inlineea, ip)) { + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE; + lv->length = 1; + memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE); + dilinelock->index++; + + clear_cflag(COMMIT_Inlineea, ip); + } + + /* + * lock/copy inode base: 128 byte slot granularity + */ + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = dioffset >> L2INODESLOTSIZE; + copy_to_dinode(dp, ip); + if (test_and_clear_cflag(COMMIT_Dirtable, ip)) { + lv->length = 2; + memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96); + } else + lv->length = 1; + dilinelock->index++; + + /* release the buffer holding the updated on-disk inode. + * the buffer will be later written by commit processing. + */ + write_metapage(mp); + + return (rc); +} + + +/* + * NAME: diFree(ip) + * + * FUNCTION: free a specified inode from the inode working map + * for a fileset or aggregate. + * + * if the inode to be freed represents the first (only) + * free inode within the iag, the iag will be placed on + * the ag free inode list. + * + * freeing the inode will cause the inode extent to be + * freed if the inode is the only allocated inode within + * the extent. in this case all the disk resource backing + * up the inode extent will be freed. in addition, the iag + * will be placed on the ag extent free list if the extent + * is the first free extent in the iag. if freeing the + * extent also means that no free inodes will exist for + * the iag, the iag will also be removed from the ag free + * inode list. + * + * the iag describing the inode will be freed if the extent + * is to be freed and it is the only backed extent within + * the iag. in this case, the iag will be removed from the + * ag free extent list and ag free inode list and placed on + * the inode map's free iag list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. under this + * approach, all required buffers are obtained before making + * any updates and are held until all updates are complete. + * + * PARAMETERS: + * ip - inode to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + */ +int diFree(struct inode *ip) +{ + int rc; + ino_t inum = ip->i_ino; + struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp; + struct metapage *mp, *amp, *bmp, *cmp, *dmp; + int iagno, ino, extno, bitno, sword, agno; + int back, fwd; + u32 bitmap, mask; + struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap; + struct inomap *imap = JFS_IP(ipimap)->i_imap; + pxd_t freepxd; + tid_t tid; + struct inode *iplist[3]; + struct tlock *tlck; + struct pxd_lock *pxdlock; + + /* + * This is just to suppress compiler warnings. The same logic that + * references these variables is used to initialize them. + */ + aiagp = biagp = ciagp = diagp = NULL; + + /* get the iag number containing the inode. + */ + iagno = INOTOIAG(inum); + + /* make sure that the iag is contained within + * the map. + */ + if (iagno >= imap->im_nextiag) { + print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, + imap, 32, 0); + jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n", + (uint) inum, iagno, imap->im_nextiag); + return -EIO; + } + + /* get the allocation group for this ino. + */ + agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb)); + + /* Lock the AG specific inode map information + */ + AG_LOCK(imap, agno); + + /* Obtain read lock in imap inode. Don't release it until we have + * read all of the IAG's that we are going to. + */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* read the iag. + */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* get the inode number and extent number of the inode within + * the iag and the inode number within the extent. + */ + ino = inum & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + mask = HIGHORDER >> bitno; + + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jfs_error(ip->i_sb, "wmap shows inode already free\n"); + } + + if (!addressPXD(&iagp->inoext[extno])) { + release_metapage(mp); + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, "invalid inoext\n"); + return -EIO; + } + + /* compute the bitmap for the extent reflecting the freed inode. + */ + bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask; + + if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) { + release_metapage(mp); + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, "numfree > numinos\n"); + return -EIO; + } + /* + * inode extent still has some inodes or below low water mark: + * keep the inode extent; + */ + if (bitmap || + imap->im_agctl[agno].numfree < 96 || + (imap->im_agctl[agno].numfree < 288 && + (((imap->im_agctl[agno].numfree * 100) / + imap->im_agctl[agno].numinos) <= 25))) { + /* if the iag currently has no free inodes (i.e., + * the inode being freed is the first free inode of iag), + * insert the iag at head of the inode free list for the ag. + */ + if (iagp->nfreeinos == 0) { + /* check if there are any iags on the ag inode + * free list. if so, read the first one so that + * we can link the current iag onto the list at + * the head. + */ + if ((fwd = imap->im_agctl[agno].inofree) >= 0) { + /* read the iag that currently is the head + * of the list. + */ + if ((rc = diIAGRead(imap, fwd, &))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + release_metapage(mp); + return (rc); + } + aiagp = (struct iag *) amp->data; + + /* make current head point back to the iag. + */ + aiagp->inofreeback = cpu_to_le32(iagno); + + write_metapage(amp); + } + + /* iag points forward to current head and iag + * becomes the new head of the list. + */ + iagp->inofreefwd = + cpu_to_le32(imap->im_agctl[agno].inofree); + iagp->inofreeback = cpu_to_le32(-1); + imap->im_agctl[agno].inofree = iagno; + } + IREAD_UNLOCK(ipimap); + + /* update the free inode summary map for the extent if + * freeing the inode means the extent will now have free + * inodes (i.e., the inode being freed is the first free + * inode of extent), + */ + if (iagp->wmap[extno] == cpu_to_le32(ONES)) { + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + iagp->inosmap[sword] &= + cpu_to_le32(~(HIGHORDER >> bitno)); + } + + /* update the bitmap. + */ + iagp->wmap[extno] = cpu_to_le32(bitmap); + + /* update the free inode counts at the iag, ag and + * map level. + */ + le32_add_cpu(&iagp->nfreeinos, 1); + imap->im_agctl[agno].numfree += 1; + atomic_inc(&imap->im_numfree); + + /* release the AG inode map lock + */ + AG_UNLOCK(imap, agno); + + /* write the iag */ + write_metapage(mp); + + return (0); + } + + + /* + * inode extent has become free and above low water mark: + * free the inode extent; + */ + + /* + * prepare to update iag list(s) (careful update step 1) + */ + amp = bmp = cmp = dmp = NULL; + fwd = back = -1; + + /* check if the iag currently has no free extents. if so, + * it will be placed on the head of the ag extent free list. + */ + if (iagp->nfreeexts == 0) { + /* check if the ag extent free list has any iags. + * if so, read the iag at the head of the list now. + * this (head) iag will be updated later to reflect + * the addition of the current iag at the head of + * the list. + */ + if ((fwd = imap->im_agctl[agno].extfree) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (struct iag *) amp->data; + } + } else { + /* iag has free extents. check if the addition of a free + * extent will cause all extents to be free within this + * iag. if so, the iag will be removed from the ag extent + * free list and placed on the inode map's free iag list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { + /* in preparation for removing the iag from the + * ag extent free list, read the iags preceding + * and following the iag on the ag extent free + * list. + */ + if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (struct iag *) amp->data; + } + + if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { + if ((rc = diIAGRead(imap, back, &bmp))) + goto error_out; + biagp = (struct iag *) bmp->data; + } + } + } + + /* remove the iag from the ag inode free list if freeing + * this extent cause the iag to have no free inodes. + */ + if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { + int inofreeback = le32_to_cpu(iagp->inofreeback); + int inofreefwd = le32_to_cpu(iagp->inofreefwd); + + /* in preparation for removing the iag from the + * ag inode free list, read the iags preceding + * and following the iag on the ag inode free + * list. before reading these iags, we must make + * sure that we already don't have them in hand + * from up above, since re-reading an iag (buffer) + * we are currently holding would cause a deadlock. + */ + if (inofreefwd >= 0) { + + if (inofreefwd == fwd) + ciagp = (struct iag *) amp->data; + else if (inofreefwd == back) + ciagp = (struct iag *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreefwd, &cmp))) + goto error_out; + ciagp = (struct iag *) cmp->data; + } + assert(ciagp != NULL); + } + + if (inofreeback >= 0) { + if (inofreeback == fwd) + diagp = (struct iag *) amp->data; + else if (inofreeback == back) + diagp = (struct iag *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreeback, &dmp))) + goto error_out; + diagp = (struct iag *) dmp->data; + } + assert(diagp != NULL); + } + } + + IREAD_UNLOCK(ipimap); + + /* + * invalidate any page of the inode extent freed from buffer cache; + */ + freepxd = iagp->inoext[extno]; + invalidate_pxd_metapages(ip, freepxd); + + /* + * update iag list(s) (careful update step 2) + */ + /* add the iag to the ag extent free list if this is the + * first free extent for the iag. + */ + if (iagp->nfreeexts == 0) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = + cpu_to_le32(imap->im_agctl[agno].extfree); + iagp->extfreeback = cpu_to_le32(-1); + imap->im_agctl[agno].extfree = iagno; + } else { + /* remove the iag from the ag extent list if all extents + * are now free and place it on the inode map iag free list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + } + + /* remove the iag from the ag inode free list if freeing + * this extent causes the iag to have no free inodes. + */ + if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) + ciagp->inofreeback = iagp->inofreeback; + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) + diagp->inofreefwd = iagp->inofreefwd; + else + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + } + + /* update the inode extent address and working map + * to reflect the free extent. + * the permanent map should have been updated already + * for the inode being freed. + */ + if (iagp->pmap[extno] != 0) { + jfs_error(ip->i_sb, "the pmap does not show inode free\n"); + } + iagp->wmap[extno] = 0; + PXDlength(&iagp->inoext[extno], 0); + PXDaddress(&iagp->inoext[extno], 0); + + /* update the free extent and free inode summary maps + * to reflect the freed extent. + * the inode summary map is marked to indicate no inodes + * available for the freed extent. + */ + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + mask = HIGHORDER >> bitno; + iagp->inosmap[sword] |= cpu_to_le32(mask); + iagp->extsmap[sword] &= cpu_to_le32(~mask); + + /* update the number of free inodes and number of free extents + * for the iag. + */ + le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, 1); + + /* update the number of free inodes and backed inodes + * at the ag and inode map level. + */ + imap->im_agctl[agno].numfree -= (INOSPEREXT - 1); + imap->im_agctl[agno].numinos -= INOSPEREXT; + atomic_sub(INOSPEREXT - 1, &imap->im_numfree); + atomic_sub(INOSPEREXT, &imap->im_numinos); + + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + if (dmp) + write_metapage(dmp); + + /* + * start transaction to update block allocation map + * for the inode extent freed; + * + * N.B. AG_LOCK is released and iag will be released below, and + * other thread may allocate inode from/reusing the ixad freed + * BUT with new/different backing inode extent from the extent + * to be freed by the transaction; + */ + tid = txBegin(ipimap->i_sb, COMMIT_FORCE); + mutex_lock(&JFS_IP(ipimap)->commit_mutex); + + /* acquire tlock of the iag page of the freed ixad + * to force the page NOHOMEOK (even though no data is + * logged from the iag page) until NOREDOPAGE|FREEXTENT log + * for the free of the extent is committed; + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor; + */ + tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = freepxd; + pxdlock->index = 1; + + write_metapage(mp); + + iplist[0] = ipimap; + + /* + * logredo needs the IAG number and IAG extent index in order + * to ensure that the IMap is consistent. The least disruptive + * way to pass these values through to the transaction manager + * is in the iplist array. + * + * It's not pretty, but it works. + */ + iplist[1] = (struct inode *) (size_t)iagno; + iplist[2] = (struct inode *) (size_t)extno; + + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + /* unlock the AG inode map information */ + AG_UNLOCK(imap, agno); + + return (0); + + error_out: + IREAD_UNLOCK(ipimap); + + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + if (dmp) + release_metapage(dmp); + + AG_UNLOCK(imap, agno); + + release_metapage(mp); + + return (rc); +} + +/* + * There are several places in the diAlloc* routines where we initialize + * the inode. + */ +static inline void +diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + + ip->i_ino = (iagno << L2INOSPERIAG) + ino; + jfs_ip->ixpxd = iagp->inoext[extno]; + jfs_ip->agstart = le64_to_cpu(iagp->agstart); + jfs_ip->active_ag = -1; +} + + +/* + * NAME: diAlloc(pip,dir,ip) + * + * FUNCTION: allocate a disk inode from the inode working map + * for a fileset or aggregate. + * + * PARAMETERS: + * pip - pointer to incore inode for the parent inode. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +int diAlloc(struct inode *pip, bool dir, struct inode *ip) +{ + int rc, ino, iagno, addext, extno, bitno, sword; + int nwords, rem, i, agno; + u32 mask, inosmap, extsmap; + struct inode *ipimap; + struct metapage *mp; + ino_t inum; + struct iag *iagp; + struct inomap *imap; + + /* get the pointers to the inode map inode and the + * corresponding imap control structure. + */ + ipimap = JFS_SBI(pip->i_sb)->ipimap; + imap = JFS_IP(ipimap)->i_imap; + JFS_IP(ip)->ipimap = ipimap; + JFS_IP(ip)->fileset = FILESYSTEM_I; + + /* for a directory, the allocation policy is to start + * at the ag level using the preferred ag. + */ + if (dir) { + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + /* for files, the policy starts off by trying to allocate from + * the same iag containing the parent disk inode: + * try to allocate the new disk inode close to the parent disk + * inode, using parent disk inode number + 1 as the allocation + * hint. (we use a left-to-right policy to attempt to avoid + * moving backward on the disk.) compute the hint within the + * file system and the iag. + */ + + /* get the ag number of this iag */ + agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); + + if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { + /* + * There is an open file actively growing. We want to + * allocate new inodes from a different ag to avoid + * fragmentation problems. + */ + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + inum = pip->i_ino + 1; + ino = inum & (INOSPERIAG - 1); + + /* back off the hint if it is outside of the iag */ + if (ino == 0) + inum = pip->i_ino; + + /* lock the AG inode map information */ + AG_LOCK(imap, agno); + + /* Get read lock on imap inode */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* get the iag number and read the iag */ + iagno = INOTOIAG(inum); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* determine if new inode extent is allowed to be added to the iag. + * new inode extent can be added to the iag if the ag + * has less than 32 free disk inodes and the iag has free extents. + */ + addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); + + /* + * try to allocate from the IAG + */ + /* check if the inode may be allocated from the iag + * (i.e. the inode has free inodes or new extent can be added). + */ + if (iagp->nfreeinos || addext) { + /* determine the extent number of the hint. + */ + extno = ino >> L2INOSPEREXT; + + /* check if the extent containing the hint has backed + * inodes. if so, try to allocate within this extent. + */ + if (addressPXD(&iagp->inoext[extno])) { + bitno = ino & (INOSPEREXT - 1); + if ((bitno = + diFindFree(le32_to_cpu(iagp->wmap[extno]), + bitno)) + < INOSPEREXT) { + ino = (extno << L2INOSPEREXT) + bitno; + + /* a free inode (bit) was found within this + * extent, so allocate it. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) { + assert(rc == -EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + } + + if (!addext) + extno = + (extno == + EXTSPERIAG - 1) ? 0 : extno + 1; + } + + /* + * no free inodes within the extent containing the hint. + * + * try to allocate from the backed extents following + * hint or, if appropriate (i.e. addext is true), allocate + * an extent of free inodes at or following the extent + * containing the hint. + * + * the free inode and free extent summary maps are used + * here, so determine the starting summary map position + * and the number of words we'll have to examine. again, + * the approach is to allocate following the hint, so we + * might have to initially ignore prior bits of the summary + * map that represent extents prior to the extent containing + * the hint and later revisit these bits. + */ + bitno = extno & (EXTSPERSUM - 1); + nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1; + sword = extno >> L2EXTSPERSUM; + + /* mask any prior bits for the starting words of the + * summary map. + */ + mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno)); + inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; + extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; + + /* scan the free inode and free extent summary maps for + * free resources. + */ + for (i = 0; i < nwords; i++) { + /* check if this word of the free inode summary + * map describes an extent with free inodes. + */ + if (~inosmap) { + /* an extent with free inodes has been + * found. determine the extent number + * and the inode number within the extent. + */ + rem = diFindFree(inosmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), + 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(ipimap); + release_metapage(mp); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, + "can't find free bit in wmap\n"); + return -EIO; + } + + /* determine the inode number within the + * iag and allocate the inode from the + * map. + */ + ino = (extno << L2INOSPEREXT) + rem; + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) + assert(rc == -EIO); + else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + + } + + /* check if we may allocate an extent of free + * inodes and whether this word of the free + * extents summary map describes a free extent. + */ + if (addext && ~extsmap) { + /* a free extent has been found. determine + * the extent number. + */ + rem = diFindFree(extsmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + + /* allocate an extent of free inodes. + */ + if ((rc = diNewExt(imap, iagp, extno))) { + /* if there is no disk space for a + * new extent, try to allocate the + * disk inode from somewhere else. + */ + if (rc == -ENOSPC) + break; + + assert(rc == -EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, + extno << L2INOSPEREXT, + extno, iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + /* free the imap inode & the AG lock & return. + */ + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + + /* move on to the next set of summary map words. + */ + sword = (sword == SMAPSZ - 1) ? 0 : sword + 1; + inosmap = le32_to_cpu(iagp->inosmap[sword]); + extsmap = le32_to_cpu(iagp->extsmap[sword]); + } + } + /* unlock imap inode */ + IREAD_UNLOCK(ipimap); + + /* nothing doing in this iag, so release it. */ + release_metapage(mp); + + tryag: + /* + * try to allocate anywhere within the same AG as the parent inode. + */ + rc = diAllocAG(imap, agno, dir, ip); + + AG_UNLOCK(imap, agno); + + if (rc != -ENOSPC) + return (rc); + + /* + * try to allocate in any AG. + */ + return (diAllocAny(imap, agno, dir, ip)); +} + + +/* + * NAME: diAllocAG(imap,agno,dir,ip) + * + * FUNCTION: allocate a disk inode from the allocation group. + * + * this routine first determines if a new extent of free + * inodes should be added for the allocation group, with + * the current request satisfied from this extent. if this + * is the case, an attempt will be made to do just that. if + * this attempt fails or it has been determined that a new + * extent should not be added, an attempt is made to satisfy + * the request by allocating an existing (backed) free inode + * from the allocation group. + * + * PRE CONDITION: Already have the AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int +diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) +{ + int rc, addext, numfree, numinos; + + /* get the number of free and the number of backed disk + * inodes currently within the ag. + */ + numfree = imap->im_agctl[agno].numfree; + numinos = imap->im_agctl[agno].numinos; + + if (numfree > numinos) { + jfs_error(ip->i_sb, "numfree > numinos\n"); + return -EIO; + } + + /* determine if we should allocate a new extent of free inodes + * within the ag: for directory inodes, add a new extent + * if there are a small number of free inodes or number of free + * inodes is a small percentage of the number of backed inodes. + */ + if (dir) + addext = (numfree < 64 || + (numfree < 256 + && ((numfree * 100) / numinos) <= 20)); + else + addext = (numfree == 0); + + /* + * try to allocate a new extent of free inodes. + */ + if (addext) { + /* if free space is not available for this new extent, try + * below to allocate a free and existing (already backed) + * inode from the ag. + */ + if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC) + return (rc); + } + + /* + * try to allocate an existing free inode from the ag. + */ + return (diAllocIno(imap, agno, ip)); +} + + +/* + * NAME: diAllocAny(imap,agno,dir,iap) + * + * FUNCTION: allocate a disk inode from any other allocation group. + * + * this routine is called when an allocation attempt within + * the primary allocation group has failed. if attempts to + * allocate an inode from any allocation group other than the + * specified primary group. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int +diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) +{ + int ag, rc; + int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; + + + /* try to allocate from the ags following agno up to + * the maximum ag number. + */ + for (ag = agno + 1; ag <= maxag; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != -ENOSPC) + return (rc); + } + + /* try to allocate from the ags in front of agno. + */ + for (ag = 0; ag < agno; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != -ENOSPC) + return (rc); + } + + /* no free disk inodes. + */ + return -ENOSPC; +} + + +/* + * NAME: diAllocIno(imap,agno,ip) + * + * FUNCTION: allocate a disk inode from the allocation group's free + * inode list, returning an error if this free list is + * empty (i.e. no iags on the list). + * + * allocation occurs from the first iag on the list using + * the iag's free inode summary map to find the leftmost + * free inode in the iag. + * + * PRE CONDITION: Already have AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) +{ + int iagno, ino, rc, rem, extno, sword; + struct metapage *mp; + struct iag *iagp; + + /* check if there are iags on the ag's free inode list. + */ + if ((iagno = imap->im_agctl[agno].inofree) < 0) + return -ENOSPC; + + /* obtain read lock on imap inode */ + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); + + /* read the iag at the head of the list. + */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* better be free inodes in this iag if it is on the + * list. + */ + if (!iagp->nfreeinos) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n"); + return -EIO; + } + + /* scan the free inode summary map to find an extent + * with free inodes. + */ + for (sword = 0;; sword++) { + if (sword >= SMAPSZ) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, + "free inode not found in summary map\n"); + return -EIO; + } + + if (~iagp->inosmap[sword]) + break; + } + + /* found a extent with free inodes. determine + * the extent number. + */ + rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); + if (rem >= EXTSPERSUM) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "no free extent found\n"); + return -EIO; + } + extno = (sword << L2EXTSPERSUM) + rem; + + /* find the first free inode in the extent. + */ + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "free inode not found\n"); + return -EIO; + } + + /* compute the inode number within the iag. + */ + ino = (extno << L2INOSPEREXT) + rem; + + /* allocate the inode. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + release_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, ino, extno, iagp); + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocExt(imap,agno,ip) + * + * FUNCTION: add a new extent of free inodes to an iag, allocating + * an inode from this extent to satisfy the current allocation + * request. + * + * this routine first tries to find an existing iag with free + * extents through the ag free extent list. if list is not + * empty, the head of the list will be selected as the home + * of the new extent of free inodes. otherwise (the list is + * empty), a new iag will be allocated for the ag to contain + * the extent. + * + * once an iag has been selected, the free extent summary map + * is used to locate a free extent within the iag and diNewExt() + * is called to initialize the extent, with initialization + * including the allocation of the first inode of the extent + * for the purpose of satisfying this request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) +{ + int rem, iagno, sword, extno, rc; + struct metapage *mp; + struct iag *iagp; + + /* check if the ag has any iags with free extents. if not, + * allocate a new iag for the ag. + */ + if ((iagno = imap->im_agctl[agno].extfree) < 0) { + /* If successful, diNewIAG will obtain the read lock on the + * imap inode. + */ + if ((rc = diNewIAG(imap, &iagno, agno, &mp))) { + return (rc); + } + iagp = (struct iag *) mp->data; + + /* set the ag number if this a brand new iag + */ + iagp->agstart = + cpu_to_le64(AGTOBLK(agno, imap->im_ipimap)); + } else { + /* read the iag. + */ + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "error reading iag\n"); + return rc; + } + iagp = (struct iag *) mp->data; + } + + /* using the free extent summary map, find a free extent. + */ + for (sword = 0;; sword++) { + if (sword >= SMAPSZ) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "free ext summary map not found\n"); + return -EIO; + } + if (~iagp->extsmap[sword]) + break; + } + + /* determine the extent number of the free extent. + */ + rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); + if (rem >= EXTSPERSUM) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "free extent not found\n"); + return -EIO; + } + extno = (sword << L2EXTSPERSUM) + rem; + + /* initialize the new extent. + */ + rc = diNewExt(imap, iagp, extno); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + /* something bad happened. if a new iag was allocated, + * place it back on the inode map's iag free list, and + * clear the ag number information. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + write_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); + + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocBit(imap,iagp,ino) + * + * FUNCTION: allocate a backed inode from an iag. + * + * this routine performs the mechanics of allocating a + * specified inode from a backed extent. + * + * if the inode to be allocated represents the last free + * inode within the iag, the iag will be removed from the + * ag free inode list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. under this + * approach, all required buffers are obtained before making + * any updates and are held all are updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) +{ + int extno, bitno, agno, sword, rc; + struct metapage *amp = NULL, *bmp = NULL; + struct iag *aiagp = NULL, *biagp = NULL; + u32 mask; + + /* check if this is the last free inode within the iag. + * if so, it will have to be removed from the ag free + * inode list, so get the iags preceding and following + * it on the list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) { + if ((rc = + diIAGRead(imap, le32_to_cpu(iagp->inofreefwd), + &))) + return (rc); + aiagp = (struct iag *) amp->data; + } + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) { + if ((rc = + diIAGRead(imap, + le32_to_cpu(iagp->inofreeback), + &bmp))) { + if (amp) + release_metapage(amp); + return (rc); + } + biagp = (struct iag *) bmp->data; + } + } + + /* get the ag number, extent number, inode number within + * the extent. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb)); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + + /* compute the mask for setting the map. + */ + mask = HIGHORDER >> bitno; + + /* the inode should be free and backed. + */ + if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) || + ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + + jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n"); + return -EIO; + } + + /* mark the inode as allocated in the working map. + */ + iagp->wmap[extno] |= cpu_to_le32(mask); + + /* check if all inodes within the extent are now + * allocated. if so, update the free inode summary + * map to reflect this. + */ + if (iagp->wmap[extno] == cpu_to_le32(ONES)) { + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno); + } + + /* if this was the last free inode in the iag, remove the + * iag from the ag free inode list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if (amp) { + aiagp->inofreeback = iagp->inofreeback; + write_metapage(amp); + } + + if (bmp) { + biagp->inofreefwd = iagp->inofreefwd; + write_metapage(bmp); + } else { + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + } + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + } + + /* update the free inode count at the iag, ag, inode + * map levels. + */ + le32_add_cpu(&iagp->nfreeinos, -1); + imap->im_agctl[agno].numfree -= 1; + atomic_dec(&imap->im_numfree); + + return (0); +} + + +/* + * NAME: diNewExt(imap,iagp,extno) + * + * FUNCTION: initialize a new extent of inodes for an iag, allocating + * the first inode of the extent for use for the current + * allocation request. + * + * disk resources are allocated for the new extent of inodes + * and the inodes themselves are initialized to reflect their + * existence within the extent (i.e. their inode numbers and + * inode extent addresses are set) and their initial state + * (mode and link count are set to zero). + * + * if the iag is new, it is not yet on an ag extent free list + * but will now be placed on this list. + * + * if the allocation of the new extent causes the iag to + * have no free extent, the iag will be removed from the + * ag extent free list. + * + * if the iag has no free backed inodes, it will be placed + * on the ag free inode list, since the addition of the new + * extent will now cause it to have free inodes. + * + * a careful update approach is used to provide consistency + * (i.e. list consistency) in the face of updates to multiple + * buffers. under this approach, all required buffers are + * obtained before making any updates and are held until all + * updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) +{ + int agno, iagno, fwd, back, freei = 0, sword, rc; + struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL; + struct metapage *amp, *bmp, *cmp, *dmp; + struct inode *ipimap; + s64 blkno, hint; + int i, j; + u32 mask; + ino_t ino; + struct dinode *dp; + struct jfs_sb_info *sbi; + + /* better have free extents. + */ + if (!iagp->nfreeexts) { + jfs_error(imap->im_ipimap->i_sb, "no free extents\n"); + return -EIO; + } + + /* get the inode map inode. + */ + ipimap = imap->im_ipimap; + sbi = JFS_SBI(ipimap->i_sb); + + amp = bmp = cmp = NULL; + + /* get the ag and iag numbers for this iag. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + iagno = le32_to_cpu(iagp->iagnum); + + /* check if this is the last free extent within the + * iag. if so, the iag must be removed from the ag + * free extent list, so get the iags preceding and + * following the iag on this list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + return (rc); + aiagp = (struct iag *) amp->data; + } + + if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { + if ((rc = diIAGRead(imap, back, &bmp))) + goto error_out; + biagp = (struct iag *) bmp->data; + } + } else { + /* the iag has free extents. if all extents are free + * (as is the case for a newly allocated iag), the iag + * must be added to the ag free extent list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. + */ + fwd = back = -1; + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if ((fwd = imap->im_agctl[agno].extfree) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (struct iag *) amp->data; + } + } + } + + /* check if the iag has no free inodes. if so, the iag + * will have to be added to the ag free inode list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. in doing this, we must + * check if we already have the iag at the head of + * the list in hand. + */ + if (iagp->nfreeinos == 0) { + freei = imap->im_agctl[agno].inofree; + + if (freei >= 0) { + if (freei == fwd) { + ciagp = aiagp; + } else if (freei == back) { + ciagp = biagp; + } else { + if ((rc = diIAGRead(imap, freei, &cmp))) + goto error_out; + ciagp = (struct iag *) cmp->data; + } + if (ciagp == NULL) { + jfs_error(imap->im_ipimap->i_sb, + "ciagp == NULL\n"); + rc = -EIO; + goto error_out; + } + } + } + + /* allocate disk space for the inode extent. + */ + if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0)) + hint = ((s64) agno << sbi->bmap->db_agl2size) - 1; + else + hint = addressPXD(&iagp->inoext[extno - 1]) + + lengthPXD(&iagp->inoext[extno - 1]) - 1; + + if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno))) + goto error_out; + + /* compute the inode number of the first inode within the + * extent. + */ + ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT); + + /* initialize the inodes within the newly allocated extent a + * page at a time. + */ + for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) { + /* get a buffer for this page of disk inodes. + */ + dmp = get_metapage(ipimap, blkno + i, PSIZE, 1); + if (dmp == NULL) { + rc = -EIO; + goto error_out; + } + dp = (struct dinode *) dmp->data; + + /* initialize the inode number, mode, link count and + * inode extent address. + */ + for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) { + dp->di_inostamp = cpu_to_le32(sbi->inostamp); + dp->di_number = cpu_to_le32(ino); + dp->di_fileset = cpu_to_le32(FILESYSTEM_I); + dp->di_mode = 0; + dp->di_nlink = 0; + PXDaddress(&(dp->di_ixpxd), blkno); + PXDlength(&(dp->di_ixpxd), imap->im_nbperiext); + } + write_metapage(dmp); + } + + /* if this is the last free extent within the iag, remove the + * iag from the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + } else { + /* if the iag has all free extents (newly allocated iag), + * add the iag to the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = cpu_to_le32(fwd); + iagp->extfreeback = cpu_to_le32(-1); + imap->im_agctl[agno].extfree = iagno; + } + } + + /* if the iag has no free inodes, add the iag to the + * ag free inode list. + */ + if (iagp->nfreeinos == 0) { + if (freei >= 0) + ciagp->inofreeback = cpu_to_le32(iagno); + + iagp->inofreefwd = + cpu_to_le32(imap->im_agctl[agno].inofree); + iagp->inofreeback = cpu_to_le32(-1); + imap->im_agctl[agno].inofree = iagno; + } + + /* initialize the extent descriptor of the extent. */ + PXDlength(&iagp->inoext[extno], imap->im_nbperiext); + PXDaddress(&iagp->inoext[extno], blkno); + + /* initialize the working and persistent map of the extent. + * the working map will be initialized such that + * it indicates the first inode of the extent is allocated. + */ + iagp->wmap[extno] = cpu_to_le32(HIGHORDER); + iagp->pmap[extno] = 0; + + /* update the free inode and free extent summary maps + * for the extent to indicate the extent has free inodes + * and no longer represents a free extent. + */ + sword = extno >> L2EXTSPERSUM; + mask = HIGHORDER >> (extno & (EXTSPERSUM - 1)); + iagp->extsmap[sword] |= cpu_to_le32(mask); + iagp->inosmap[sword] &= cpu_to_le32(~mask); + + /* update the free inode and free extent counts for the + * iag. + */ + le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, -1); + + /* update the free and backed inode counts for the ag. + */ + imap->im_agctl[agno].numfree += (INOSPEREXT - 1); + imap->im_agctl[agno].numinos += INOSPEREXT; + + /* update the free and backed inode counts for the inode map. + */ + atomic_add(INOSPEREXT - 1, &imap->im_numfree); + atomic_add(INOSPEREXT, &imap->im_numinos); + + /* write the iags. + */ + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + + return (0); + + error_out: + + /* release the iags. + */ + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + + return (rc); +} + + +/* + * NAME: diNewIAG(imap,iagnop,agno) + * + * FUNCTION: allocate a new iag for an allocation group. + * + * first tries to allocate the iag from the inode map + * iagfree list: + * if the list has free iags, the head of the list is removed + * and returned to satisfy the request. + * if the inode map's iag free list is empty, the inode map + * is extended to hold a new iag. this new iag is initialized + * and returned to satisfy the request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the + * newly allocated iag upon successful return. + * agno - allocation group number. + * bpp - Buffer pointer to be filled in with new IAG's buffer + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + * + * serialization: + * AG lock held on entry/exit; + * write lock on the map is held inside; + * read lock on the map is held on successful completion; + * + * note: new iag transaction: + * . synchronously write iag; + * . write log of xtree and inode of imap; + * . commit; + * . synchronous write of xtree (right to left, bottom to top); + * . at start of logredo(): init in-memory imap with one additional iag page; + * . at end of logredo(): re-read imap inode to determine + * new imap size; + */ +static int +diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) +{ + int rc; + int iagno, i, xlen; + struct inode *ipimap; + struct super_block *sb; + struct jfs_sb_info *sbi; + struct metapage *mp; + struct iag *iagp; + s64 xaddr = 0; + s64 blkno; + tid_t tid; + struct inode *iplist[1]; + + /* pick up pointers to the inode map and mount inodes */ + ipimap = imap->im_ipimap; + sb = ipimap->i_sb; + sbi = JFS_SBI(sb); + + /* acquire the free iag lock */ + IAGFREE_LOCK(imap); + + /* if there are any iags on the inode map free iag list, + * allocate the iag from the head of the list. + */ + if (imap->im_freeiag >= 0) { + /* pick up the iag number at the head of the list */ + iagno = imap->im_freeiag; + + /* determine the logical block number of the iag */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + } else { + /* no free iags. the inode map will have to be extented + * to include a new iag. + */ + + /* acquire inode map lock */ + IWRITE_LOCK(ipimap, RDWRLOCK_IMAP); + + if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { + IWRITE_UNLOCK(ipimap); + IAGFREE_UNLOCK(imap); + jfs_error(imap->im_ipimap->i_sb, + "ipimap->i_size is wrong\n"); + return -EIO; + } + + + /* get the next available iag number */ + iagno = imap->im_nextiag; + + /* make sure that we have not exceeded the maximum inode + * number limit. + */ + if (iagno > (MAXIAGS - 1)) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = -ENOSPC; + goto out; + } + + /* + * synchronously append new iag page. + */ + /* determine the logical address of iag page to append */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + + /* Allocate extent for new iag page */ + xlen = sbi->nbperpage; + if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* + * start transaction of update of the inode map + * addressing structure pointing to the new iag page; + */ + tid = txBegin(sb, COMMIT_FORCE); + mutex_lock(&JFS_IP(ipimap)->commit_mutex); + + /* update the inode map addressing structure to point to it */ + if ((rc = + xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) { + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + /* Free the blocks allocated for the iag since it was + * not successfully added to the inode map + */ + dbFree(ipimap, xaddr, (s64) xlen); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* update the inode map's inode to reflect the extension */ + ipimap->i_size += PSIZE; + inode_add_bytes(ipimap, PSIZE); + + /* assign a buffer for the page */ + mp = get_metapage(ipimap, blkno, PSIZE, 0); + if (!mp) { + /* + * This is very unlikely since we just created the + * extent, but let's try to handle it correctly + */ + xtTruncate(tid, ipimap, ipimap->i_size - PSIZE, + COMMIT_PWMAP); + + txAbort(tid, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = -EIO; + goto out; + } + iagp = (struct iag *) mp->data; + + /* init the iag */ + memset(iagp, 0, sizeof(struct iag)); + iagp->iagnum = cpu_to_le32(iagno); + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + iagp->iagfree = cpu_to_le32(-1); + iagp->nfreeinos = 0; + iagp->nfreeexts = cpu_to_le32(EXTSPERIAG); + + /* initialize the free inode summary map (free extent + * summary map initialization handled by bzero). + */ + for (i = 0; i < SMAPSZ; i++) + iagp->inosmap[i] = cpu_to_le32(ONES); + + /* + * Write and sync the metapage + */ + flush_metapage(mp); + + /* + * txCommit(COMMIT_FORCE) will synchronously write address + * index pages and inode after commit in careful update order + * of address index pages (right to left, bottom up); + */ + iplist[0] = ipimap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + duplicateIXtree(sb, blkno, xlen, &xaddr); + + /* update the next available iag number */ + imap->im_nextiag += 1; + + /* Add the iag to the iag free list so we don't lose the iag + * if a failure happens now. + */ + imap->im_freeiag = iagno; + + /* Until we have logredo working, we want the imap inode & + * control page to be up to date. + */ + diSync(ipimap); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + } + + /* obtain read lock on map */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* read the iag */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + rc = -EIO; + goto out; + } + iagp = (struct iag *) mp->data; + + /* remove the iag from the iag free list */ + imap->im_freeiag = le32_to_cpu(iagp->iagfree); + iagp->iagfree = cpu_to_le32(-1); + + /* set the return iag number and buffer pointer */ + *iagnop = iagno; + *mpp = mp; + + out: + /* release the iag free lock */ + IAGFREE_UNLOCK(imap); + + return (rc); +} + +/* + * NAME: diIAGRead() + * + * FUNCTION: get the buffer for the specified iag within a fileset + * or aggregate inode map. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful + * exit. + * + * SERIALIZATION: + * must have read lock on imap inode + * (When called by diExtendFS, the filesystem is quiesced, therefore + * the read lock is unnecessary.) + * + * RETURN VALUES: + * 0 - success. + * -EIO - i/o error. + */ +static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) +{ + struct inode *ipimap = imap->im_ipimap; + s64 blkno; + + /* compute the logical block number of the iag. */ + blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage); + + /* read the iag. */ + *mpp = read_metapage(ipimap, blkno, PSIZE, 0); + if (*mpp == NULL) { + return -EIO; + } + + return (0); +} + +/* + * NAME: diFindFree() + * + * FUNCTION: find the first free bit in a word starting at + * the specified bit position. + * + * PARAMETERS: + * word - word to be examined. + * start - starting bit position. + * + * RETURN VALUES: + * bit position of first free bit in the word or 32 if + * no free bits were found. + */ +static int diFindFree(u32 word, int start) +{ + int bitno; + assert(start < 32); + /* scan the word for the first free bit. */ + for (word <<= start, bitno = start; bitno < 32; + bitno++, word <<= 1) { + if ((word & HIGHORDER) == 0) + break; + } + return (bitno); +} + +/* + * NAME: diUpdatePMap() + * + * FUNCTION: Update the persistent map in an IAG for the allocation or + * freeing of the specified inode. + * + * PRE CONDITIONS: Working map has already been updated for allocate. + * + * PARAMETERS: + * ipimap - Incore inode map inode + * inum - Number of inode to mark in permanent map + * is_free - If 'true' indicates inode should be marked freed, otherwise + * indicates inode should be marked allocated. + * + * RETURN VALUES: + * 0 for success + */ +int +diUpdatePMap(struct inode *ipimap, + unsigned long inum, bool is_free, struct tblock * tblk) +{ + int rc; + struct iag *iagp; + struct metapage *mp; + int iagno, ino, extno, bitno; + struct inomap *imap; + u32 mask; + struct jfs_log *log; + int lsn, difft, diffp; + unsigned long flags; + + imap = JFS_IP(ipimap)->i_imap; + /* get the iag number containing the inode */ + iagno = INOTOIAG(inum); + /* make sure that the iag is contained within the map */ + if (iagno >= imap->im_nextiag) { + jfs_error(ipimap->i_sb, "the iag is outside the map\n"); + return -EIO; + } + /* read the iag */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) + return (rc); + metapage_wait_for_io(mp); + iagp = (struct iag *) mp->data; + /* get the inode number and extent number of the inode within + * the iag and the inode number within the extent. + */ + ino = inum & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + mask = HIGHORDER >> bitno; + /* + * mark the inode free in persistent map: + */ + if (is_free) { + /* The inode should have been allocated both in working + * map and in persistent map; + * the inode will be freed from working map at the release + * of last reference release; + */ + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "inode %ld not marked as allocated in wmap!\n", + inum); + } + if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "inode %ld not marked as allocated in pmap!\n", + inum); + } + /* update the bitmap for the extent of the freed inode */ + iagp->pmap[extno] &= cpu_to_le32(~mask); + } + /* + * mark the inode allocated in persistent map: + */ + else { + /* The inode should be already allocated in the working map + * and should be free in persistent map; + */ + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + release_metapage(mp); + jfs_error(ipimap->i_sb, + "the inode is not allocated in the working map\n"); + return -EIO; + } + if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { + release_metapage(mp); + jfs_error(ipimap->i_sb, + "the inode is not free in the persistent map\n"); + return -EIO; + } + /* update the bitmap for the extent of the allocated inode */ + iagp->pmap[extno] |= cpu_to_le32(mask); + } + /* + * update iag lsn + */ + lsn = tblk->lsn; + log = JFS_SBI(tblk->sb)->log; + LOGSYNC_LOCK(log, flags); + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(difft, lsn, log); + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + /* move mp after tblock in logsync list */ + list_move(&mp->synclist, &tblk->synclist); + } + /* inherit younger/larger clsn */ + assert(mp->clsn); + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else { + mp->log = log; + mp->lsn = lsn; + /* insert mp after tblock in logsync list */ + log->count++; + list_add(&mp->synclist, &tblk->synclist); + mp->clsn = tblk->clsn; + } + LOGSYNC_UNLOCK(log, flags); + write_metapage(mp); + return (0); +} + +/* + * diExtendFS() + * + * function: update imap for extendfs(); + * + * note: AG size has been increased s.t. each k old contiguous AGs are + * coalesced into a new AG; + */ +int diExtendFS(struct inode *ipimap, struct inode *ipbmap) +{ + int rc, rcx = 0; + struct inomap *imap = JFS_IP(ipimap)->i_imap; + struct iag *iagp = NULL, *hiagp = NULL; + struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *bp, *hbp; + int i, n, head; + int numinos, xnuminos = 0, xnumfree = 0; + s64 agstart; + + jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d", + imap->im_nextiag, atomic_read(&imap->im_numinos), + atomic_read(&imap->im_numfree)); + + /* + * reconstruct imap + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + + /* init per AG control information im_agctl[] */ + for (i = 0; i < MAXAG; i++) { + imap->im_agctl[i].inofree = -1; + imap->im_agctl[i].extfree = -1; + imap->im_agctl[i].numinos = 0; /* number of backed inodes */ + imap->im_agctl[i].numfree = 0; /* number of free backed inodes */ + } + + /* + * process each iag page of the map. + * + * rebuild AG Free Inode List, AG Free Inode Extent List; + */ + for (i = 0; i < imap->im_nextiag; i++) { + if ((rc = diIAGRead(imap, i, &bp))) { + rcx = rc; + continue; + } + iagp = (struct iag *) bp->data; + if (le32_to_cpu(iagp->iagnum) != i) { + release_metapage(bp); + jfs_error(ipimap->i_sb, "unexpected value of iagnum\n"); + return -EIO; + } + + /* leave free iag in the free iag list */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + release_metapage(bp); + continue; + } + + agstart = le64_to_cpu(iagp->agstart); + n = agstart >> mp->db_agl2size; + iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size); + + /* compute backed inodes */ + numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) + << L2INOSPEREXT; + if (numinos > 0) { + /* merge AG backed inodes */ + imap->im_agctl[n].numinos += numinos; + xnuminos += numinos; + } + + /* if any backed free inodes, insert at AG free inode list */ + if ((int) le32_to_cpu(iagp->nfreeinos) > 0) { + if ((head = imap->im_agctl[n].inofree) == -1) { + iagp->inofreefwd = cpu_to_le32(-1); + iagp->inofreeback = cpu_to_le32(-1); + } else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (struct iag *) hbp->data; + hiagp->inofreeback = iagp->iagnum; + iagp->inofreefwd = cpu_to_le32(head); + iagp->inofreeback = cpu_to_le32(-1); + write_metapage(hbp); + } + + imap->im_agctl[n].inofree = + le32_to_cpu(iagp->iagnum); + + /* merge AG backed free inodes */ + imap->im_agctl[n].numfree += + le32_to_cpu(iagp->nfreeinos); + xnumfree += le32_to_cpu(iagp->nfreeinos); + } + + /* if any free extents, insert at AG free extent list */ + if (le32_to_cpu(iagp->nfreeexts) > 0) { + if ((head = imap->im_agctl[n].extfree) == -1) { + iagp->extfreefwd = cpu_to_le32(-1); + iagp->extfreeback = cpu_to_le32(-1); + } else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (struct iag *) hbp->data; + hiagp->extfreeback = iagp->iagnum; + iagp->extfreefwd = cpu_to_le32(head); + iagp->extfreeback = cpu_to_le32(-1); + write_metapage(hbp); + } + + imap->im_agctl[n].extfree = + le32_to_cpu(iagp->iagnum); + } + + nextiag: + write_metapage(bp); + } + + if (xnuminos != atomic_read(&imap->im_numinos) || + xnumfree != atomic_read(&imap->im_numfree)) { + jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n"); + return -EIO; + } + + return rcx; +} + + +/* + * duplicateIXtree() + * + * serialization: IWRITE_LOCK held on entry/exit + * + * note: shadow page with regular inode (rel.2); + */ +static void duplicateIXtree(struct super_block *sb, s64 blkno, + int xlen, s64 *xaddr) +{ + struct jfs_superblock *j_sb; + struct buffer_head *bh; + struct inode *ip; + tid_t tid; + + /* if AIT2 ipmap2 is bad, do not try to update it */ + if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */ + return; + ip = diReadSpecial(sb, FILESYSTEM_I, 1); + if (ip == NULL) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + if (readSuper(sb, &bh)) + return; + j_sb = (struct jfs_superblock *)bh->b_data; + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + return; + } + + /* start transaction */ + tid = txBegin(sb, COMMIT_FORCE); + /* update the inode map addressing structure to point to it */ + if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + txAbort(tid, 1); + goto cleanup; + + } + /* update the inode map's inode to reflect the extension */ + ip->i_size += PSIZE; + inode_add_bytes(ip, PSIZE); + txCommit(tid, 1, &ip, COMMIT_FORCE); + cleanup: + txEnd(tid); + diFreeSpecial(ip); +} + +/* + * NAME: copy_from_dinode() + * + * FUNCTION: Copies inode info from disk inode to in-memory inode + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + */ +static int copy_from_dinode(struct dinode * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + + jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->mode2 = le32_to_cpu(dip->di_mode); + jfs_set_inode_flags(ip); + + ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff; + if (sbi->umask != -1) { + ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask); + /* For directories, add x permission if r is allowed by umask */ + if (S_ISDIR(ip->i_mode)) { + if (ip->i_mode & 0400) + ip->i_mode |= 0100; + if (ip->i_mode & 0040) + ip->i_mode |= 0010; + if (ip->i_mode & 0004) + ip->i_mode |= 0001; + } + } + set_nlink(ip, le32_to_cpu(dip->di_nlink)); + + jfs_ip->saved_uid = make_kuid(&init_user_ns, le32_to_cpu(dip->di_uid)); + if (!uid_valid(sbi->uid)) + ip->i_uid = jfs_ip->saved_uid; + else { + ip->i_uid = sbi->uid; + } + + jfs_ip->saved_gid = make_kgid(&init_user_ns, le32_to_cpu(dip->di_gid)); + if (!gid_valid(sbi->gid)) + ip->i_gid = jfs_ip->saved_gid; + else { + ip->i_gid = sbi->gid; + } + + ip->i_size = le64_to_cpu(dip->di_size); + ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); + ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); + ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); + ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); + ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); + ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); + ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); + ip->i_generation = le32_to_cpu(dip->di_gen); + + jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */ + jfs_ip->acl = dip->di_acl; /* as are dxd's */ + jfs_ip->ea = dip->di_ea; + jfs_ip->next_index = le32_to_cpu(dip->di_next_index); + jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec); + jfs_ip->acltype = le32_to_cpu(dip->di_acltype); + + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) { + jfs_ip->dev = le32_to_cpu(dip->di_rdev); + ip->i_rdev = new_decode_dev(jfs_ip->dev); + } + + if (S_ISDIR(ip->i_mode)) { + memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); + } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) { + memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); + } else + memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128); + + /* Zero the in-memory-only stuff */ + jfs_ip->cflag = 0; + jfs_ip->btindex = 0; + jfs_ip->btorder = 0; + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + jfs_ip->atlhead = 0; + jfs_ip->atltail = 0; + jfs_ip->xtlid = 0; + return (0); +} + +/* + * NAME: copy_to_dinode() + * + * FUNCTION: Copies inode info from in-memory inode to disk inode + */ +static void copy_to_dinode(struct dinode * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + + dip->di_fileset = cpu_to_le32(jfs_ip->fileset); + dip->di_inostamp = cpu_to_le32(sbi->inostamp); + dip->di_number = cpu_to_le32(ip->i_ino); + dip->di_gen = cpu_to_le32(ip->i_generation); + dip->di_size = cpu_to_le64(ip->i_size); + dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); + dip->di_nlink = cpu_to_le32(ip->i_nlink); + if (!uid_valid(sbi->uid)) + dip->di_uid = cpu_to_le32(i_uid_read(ip)); + else + dip->di_uid =cpu_to_le32(from_kuid(&init_user_ns, + jfs_ip->saved_uid)); + if (!gid_valid(sbi->gid)) + dip->di_gid = cpu_to_le32(i_gid_read(ip)); + else + dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns, + jfs_ip->saved_gid)); + jfs_get_inode_flags(jfs_ip); + /* + * mode2 is only needed for storing the higher order bits. + * Trust i_mode for the lower order ones + */ + if (sbi->umask == -1) + dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | + ip->i_mode); + else /* Leave the original permissions alone */ + dip->di_mode = cpu_to_le32(jfs_ip->mode2); + + dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); + dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); + dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); + dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); + dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); + dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); + dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ + dip->di_acl = jfs_ip->acl; /* as are dxd's */ + dip->di_ea = jfs_ip->ea; + dip->di_next_index = cpu_to_le32(jfs_ip->next_index); + dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime); + dip->di_otime.tv_nsec = 0; + dip->di_acltype = cpu_to_le32(jfs_ip->acltype); + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) + dip->di_rdev = cpu_to_le32(jfs_ip->dev); +} diff --git a/kernel/fs/jfs/jfs_imap.h b/kernel/fs/jfs/jfs_imap.h new file mode 100644 index 000000000..610a0e9d8 --- /dev/null +++ b/kernel/fs/jfs/jfs_imap.h @@ -0,0 +1,175 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_IMAP +#define _H_JFS_IMAP + +#include "jfs_txnmgr.h" + +/* + * jfs_imap.h: disk inode manager + */ + +#define EXTSPERIAG 128 /* number of disk inode extent per iag */ +#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ +#define SMAPSZ 4 /* number of words per summary map */ +#define EXTSPERSUM 32 /* number of extents per summary map entry */ +#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ +#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ +#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ +#define MAXAG 128 /* maximum number of allocation groups */ + +#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ +#define SMAPSIZE 16 /* bytes in the IAG summary maps */ + +/* convert inode number to iag number */ +#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) + +/* convert iag number to logical block number of the iag page */ +#define IAGTOLBLK(iagno,l2nbperpg) (((iagno) + 1) << (l2nbperpg)) + +/* get the starting block number of the 4K page of an inode extent + * that contains ino. + */ +#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \ + ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg))) + +/* + * inode allocation map: + * + * inode allocation map consists of + * . the inode map control page and + * . inode allocation group pages (per 4096 inodes) + * which are addressed by standard JFS xtree. + */ +/* + * inode allocation group page (per 4096 inodes of an AG) + */ +struct iag { + __le64 agstart; /* 8: starting block of ag */ + __le32 iagnum; /* 4: inode allocation group number */ + __le32 inofreefwd; /* 4: ag inode free list forward */ + __le32 inofreeback; /* 4: ag inode free list back */ + __le32 extfreefwd; /* 4: ag inode extent free list forward */ + __le32 extfreeback; /* 4: ag inode extent free list back */ + __le32 iagfree; /* 4: iag free list */ + + /* summary map: 1 bit per inode extent */ + __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; + * note: this indicates free and backed + * inodes, if the extent is not backed the + * value will be 1. if the extent is + * backed but all inodes are being used the + * value will be 1. if the extent is + * backed but at least one of the inodes is + * free the value will be 0. + */ + __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ + __le32 nfreeinos; /* 4: number of free inodes */ + __le32 nfreeexts; /* 4: number of free extents */ + /* (72) */ + u8 pad[1976]; /* 1976: pad to 2048 bytes */ + /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ + __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ + __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ + pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ +}; /* (4096) */ + +/* + * per AG control information (in inode map control page) + */ +struct iagctl_disk { + __le32 inofree; /* 4: free inode list anchor */ + __le32 extfree; /* 4: free extent list anchor */ + __le32 numinos; /* 4: number of backed inodes */ + __le32 numfree; /* 4: number of free inodes */ +}; /* (16) */ + +struct iagctl { + int inofree; /* free inode list anchor */ + int extfree; /* free extent list anchor */ + int numinos; /* number of backed inodes */ + int numfree; /* number of free inodes */ +}; + +/* + * per fileset/aggregate inode map control page + */ +struct dinomap_disk { + __le32 in_freeiag; /* 4: free iag list anchor */ + __le32 in_nextiag; /* 4: next free iag number */ + __le32 in_numinos; /* 4: num of backed inodes */ + __le32 in_numfree; /* 4: num of free backed inodes */ + __le32 in_nbperiext; /* 4: num of blocks per inode extent */ + __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ + __le32 in_diskblock; /* 4: for standalone test driver */ + __le32 in_maxag; /* 4: for standalone test driver */ + u8 pad[2016]; /* 2016: pad to 2048 */ + struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ +}; /* (4096) */ + +struct dinomap { + int in_freeiag; /* free iag list anchor */ + int in_nextiag; /* next free iag number */ + int in_numinos; /* num of backed inodes */ + int in_numfree; /* num of free backed inodes */ + int in_nbperiext; /* num of blocks per inode extent */ + int in_l2nbperiext; /* l2 of in_nbperiext */ + int in_diskblock; /* for standalone test driver */ + int in_maxag; /* for standalone test driver */ + struct iagctl in_agctl[MAXAG]; /* AG control information */ +}; + +/* + * In-core inode map control page + */ +struct inomap { + struct dinomap im_imap; /* 4096: inode allocation control */ + struct inode *im_ipimap; /* 4: ptr to inode for imap */ + struct mutex im_freelock; /* 4: iag free list lock */ + struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ + u32 *im_DBGdimap; + atomic_t im_numinos; /* num of backed inodes */ + atomic_t im_numfree; /* num of free backed inodes */ +}; + +#define im_freeiag im_imap.in_freeiag +#define im_nextiag im_imap.in_nextiag +#define im_agctl im_imap.in_agctl +#define im_nbperiext im_imap.in_nbperiext +#define im_l2nbperiext im_imap.in_l2nbperiext + +/* for standalone testdriver + */ +#define im_diskblock im_imap.in_diskblock +#define im_maxag im_imap.in_maxag + +extern int diFree(struct inode *); +extern int diAlloc(struct inode *, bool, struct inode *); +extern int diSync(struct inode *); +/* external references */ +extern int diUpdatePMap(struct inode *ipimap, unsigned long inum, + bool is_free, struct tblock * tblk); +extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap); +extern int diMount(struct inode *); +extern int diUnmount(struct inode *, int); +extern int diRead(struct inode *); +extern struct inode *diReadSpecial(struct super_block *, ino_t, int); +extern void diWriteSpecial(struct inode *, int); +extern void diFreeSpecial(struct inode *); +extern int diWrite(tid_t tid, struct inode *); +#endif /* _H_JFS_IMAP */ diff --git a/kernel/fs/jfs/jfs_incore.h b/kernel/fs/jfs/jfs_incore.h new file mode 100644 index 000000000..fa7e795bd --- /dev/null +++ b/kernel/fs/jfs/jfs_incore.h @@ -0,0 +1,228 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INCORE +#define _H_JFS_INCORE + +#include +#include +#include +#include +#include "jfs_types.h" +#include "jfs_xtree.h" +#include "jfs_dtree.h" + +/* + * JFS magic number + */ +#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */ + +/* + * JFS-private inode information + */ +struct jfs_inode_info { + int fileset; /* fileset number (always 16)*/ + uint mode2; /* jfs-specific mode */ + kuid_t saved_uid; /* saved for uid mount option */ + kgid_t saved_gid; /* saved for gid mount option */ + pxd_t ixpxd; /* inode extent descriptor */ + dxd_t acl; /* dxd describing acl */ + dxd_t ea; /* dxd describing ea */ + time_t otime; /* time created */ + uint next_index; /* next available directory entry index */ + int acltype; /* Type of ACL */ + short btorder; /* access order */ + short btindex; /* btpage entry index*/ + struct inode *ipimap; /* inode map */ + unsigned long cflag; /* commit flags */ + u64 agstart; /* agstart of the containing IAG */ + u16 bxflag; /* xflag of pseudo buffer? */ + unchar pad; + signed char active_ag; /* ag currently allocating from */ + lid_t blid; /* lid of pseudo buffer? */ + lid_t atlhead; /* anonymous tlock list head */ + lid_t atltail; /* anonymous tlock list tail */ + spinlock_t ag_lock; /* protects active_ag */ + struct list_head anon_inode_list; /* inodes having anonymous txns */ + /* + * rdwrlock serializes xtree between reads & writes and synchronizes + * changes to special inodes. It's use would be redundant on + * directories since the i_mutex taken in the VFS is sufficient. + */ + struct rw_semaphore rdwrlock; + /* + * commit_mutex serializes transaction processing on an inode. + * It must be taken after beginning a transaction (txBegin), since + * dirty inodes may be committed while a new transaction on the + * inode is blocked in txBegin or TxBeginAnon + */ + struct mutex commit_mutex; + /* xattr_sem allows us to access the xattrs without taking i_mutex */ + struct rw_semaphore xattr_sem; + lid_t xtlid; /* lid of xtree lock on directory */ + union { + struct { + xtpage_t _xtroot; /* 288: xtree root */ + struct inomap *_imap; /* 4: inode map header */ + } file; + struct { + struct dir_table_slot _table[12]; /* 96: dir index */ + dtroot_t _dtroot; /* 288: dtree root */ + } dir; + struct { + unchar _unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + unchar _inline[128]; /* 128: inline symlink */ + /* _inline_ea may overlay the last part of + * file._xtroot if maxentry = XTROOTINITSLOT + */ + unchar _inline_ea[128]; /* 128: inline extended attr */ + } link; + } u; +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + u32 dev; /* will die when we get wide dev_t */ + struct inode vfs_inode; +}; +#define i_xtroot u.file._xtroot +#define i_imap u.file._imap +#define i_dirtable u.dir._table +#define i_dtroot u.dir._dtroot +#define i_inline u.link._inline +#define i_inline_ea u.link._inline_ea + +#define IREAD_LOCK(ip, subclass) \ + down_read_nested(&JFS_IP(ip)->rdwrlock, subclass) +#define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock) +#define IWRITE_LOCK(ip, subclass) \ + down_write_nested(&JFS_IP(ip)->rdwrlock, subclass) +#define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock) + +/* + * cflag + */ +enum cflags { + COMMIT_Nolink, /* inode committed with zero link count */ + COMMIT_Inlineea, /* commit inode inline EA */ + COMMIT_Freewmap, /* free WMAP at iClose() */ + COMMIT_Dirty, /* Inode is really dirty */ + COMMIT_Dirtable, /* commit changes to di_dirtable */ + COMMIT_Stale, /* data extent is no longer valid */ + COMMIT_Synclist, /* metadata pages on group commit synclist */ +}; + +/* + * commit_mutex nesting subclasses: + */ +enum commit_mutex_class +{ + COMMIT_MUTEX_PARENT, + COMMIT_MUTEX_CHILD, + COMMIT_MUTEX_SECOND_PARENT, /* Renaming */ + COMMIT_MUTEX_VICTIM /* Inode being unlinked due to rename */ +}; + +/* + * rdwrlock subclasses: + * The dmap inode may be locked while a normal inode or the imap inode are + * locked. + */ +enum rdwrlock_class +{ + RDWRLOCK_NORMAL, + RDWRLOCK_IMAP, + RDWRLOCK_DMAP +}; + +#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) +#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_and_clear_cflag(flag, ip) \ + test_and_clear_bit(flag, &(JFS_IP(ip)->cflag)) +/* + * JFS-private superblock information. + */ +struct jfs_sb_info { + struct super_block *sb; /* Point back to vfs super block */ + unsigned long mntflag; /* aggregate attributes */ + struct inode *ipbmap; /* block map inode */ + struct inode *ipaimap; /* aggregate inode map inode */ + struct inode *ipaimap2; /* secondary aimap inode */ + struct inode *ipimap; /* aggregate inode map inode */ + struct jfs_log *log; /* log */ + struct list_head log_list; /* volumes associated with a journal */ + short bsize; /* logical block size */ + short l2bsize; /* log2 logical block size */ + short nbperpage; /* blocks per page */ + short l2nbperpage; /* log2 blocks per page */ + short l2niperblk; /* log2 inodes per page */ + dev_t logdev; /* external log device */ + uint aggregate; /* volume identifier in log record */ + pxd_t logpxd; /* pxd describing log */ + pxd_t fsckpxd; /* pxd describing fsck wkspc */ + pxd_t ait2; /* pxd describing AIT copy */ + char uuid[16]; /* 128-bit uuid for volume */ + char loguuid[16]; /* 128-bit uuid for log */ + /* + * commit_state is used for synchronization of the jfs_commit + * threads. It is protected by LAZY_LOCK(). + */ + int commit_state; /* commit state */ + /* Formerly in ipimap */ + uint gengen; /* inode generation generator*/ + uint inostamp; /* shows inode belongs to fileset*/ + + /* Formerly in ipbmap */ + struct bmap *bmap; /* incore bmap descriptor */ + struct nls_table *nls_tab; /* current codepage */ + struct inode *direct_inode; /* metadata inode */ + uint state; /* mount/recovery state */ + unsigned long flag; /* mount time flags */ + uint p_state; /* state prior to going no integrity */ + kuid_t uid; /* uid to override on-disk uid */ + kgid_t gid; /* gid to override on-disk gid */ + uint umask; /* umask to override on-disk umask */ + uint minblks_trim; /* minimum blocks, for online trim */ +}; + +/* jfs_sb_info commit_state */ +#define IN_LAZYCOMMIT 1 + +static inline struct jfs_inode_info *JFS_IP(struct inode *inode) +{ + return list_entry(inode, struct jfs_inode_info, vfs_inode); +} + +static inline int jfs_dirtable_inline(struct inode *inode) +{ + return (JFS_IP(inode)->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)); +} + +static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int isReadOnly(struct inode *inode) +{ + if (JFS_SBI(inode->i_sb)->log) + return 0; + return 1; +} +#endif /* _H_JFS_INCORE */ diff --git a/kernel/fs/jfs/jfs_inode.c b/kernel/fs/jfs/jfs_inode.c new file mode 100644 index 000000000..6b0f81620 --- /dev/null +++ b/kernel/fs/jfs/jfs_inode.c @@ -0,0 +1,165 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_dinode.h" +#include "jfs_debug.h" + + +void jfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = JFS_IP(inode)->mode2; + unsigned int new_fl = 0; + + if (flags & JFS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & JFS_APPEND_FL) + new_fl |= S_APPEND; + if (flags & JFS_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & JFS_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + if (flags & JFS_SYNC_FL) + new_fl |= S_SYNC; + inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND | S_NOATIME | + S_DIRSYNC | S_SYNC); +} + +void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) +{ + unsigned int flags = jfs_ip->vfs_inode.i_flags; + + jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | + JFS_DIRSYNC_FL | JFS_SYNC_FL); + if (flags & S_IMMUTABLE) + jfs_ip->mode2 |= JFS_IMMUTABLE_FL; + if (flags & S_APPEND) + jfs_ip->mode2 |= JFS_APPEND_FL; + if (flags & S_NOATIME) + jfs_ip->mode2 |= JFS_NOATIME_FL; + if (flags & S_DIRSYNC) + jfs_ip->mode2 |= JFS_DIRSYNC_FL; + if (flags & S_SYNC) + jfs_ip->mode2 |= JFS_SYNC_FL; +} + +/* + * NAME: ialloc() + * + * FUNCTION: Allocate a new inode + * + */ +struct inode *ialloc(struct inode *parent, umode_t mode) +{ + struct super_block *sb = parent->i_sb; + struct inode *inode; + struct jfs_inode_info *jfs_inode; + int rc; + + inode = new_inode(sb); + if (!inode) { + jfs_warn("ialloc: new_inode returned NULL!"); + rc = -ENOMEM; + goto fail; + } + + jfs_inode = JFS_IP(inode); + + rc = diAlloc(parent, S_ISDIR(mode), inode); + if (rc) { + jfs_warn("ialloc: diAlloc returned %d!", rc); + if (rc == -EIO) + make_bad_inode(inode); + goto fail_put; + } + + if (insert_inode_locked(inode) < 0) { + rc = -EINVAL; + goto fail_put; + } + + inode_init_owner(inode, parent, mode); + /* + * New inodes need to save sane values on disk when + * uid & gid mount options are used + */ + jfs_inode->saved_uid = inode->i_uid; + jfs_inode->saved_gid = inode->i_gid; + + /* + * Allocate inode to quota. + */ + dquot_initialize(inode); + rc = dquot_alloc_inode(inode); + if (rc) + goto fail_drop; + + /* inherit flags from parent */ + jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; + + if (S_ISDIR(mode)) { + jfs_inode->mode2 |= IDIRECTORY; + jfs_inode->mode2 &= ~JFS_DIRSYNC_FL; + } + else { + jfs_inode->mode2 |= INLINEEA | ISPARSE; + if (S_ISLNK(mode)) + jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); + } + jfs_inode->mode2 |= inode->i_mode; + + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + jfs_inode->otime = inode->i_ctime.tv_sec; + inode->i_generation = JFS_SBI(sb)->gengen++; + + jfs_inode->cflag = 0; + + /* Zero remaining fields */ + memset(&jfs_inode->acl, 0, sizeof(dxd_t)); + memset(&jfs_inode->ea, 0, sizeof(dxd_t)); + jfs_inode->next_index = 0; + jfs_inode->acltype = 0; + jfs_inode->btorder = 0; + jfs_inode->btindex = 0; + jfs_inode->bxflag = 0; + jfs_inode->blid = 0; + jfs_inode->atlhead = 0; + jfs_inode->atltail = 0; + jfs_inode->xtlid = 0; + jfs_set_inode_flags(inode); + + jfs_info("ialloc returns inode = 0x%p\n", inode); + + return inode; + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + clear_nlink(inode); + unlock_new_inode(inode); +fail_put: + iput(inode); +fail: + return ERR_PTR(rc); +} diff --git a/kernel/fs/jfs/jfs_inode.h b/kernel/fs/jfs/jfs_inode.h new file mode 100644 index 000000000..9271cfe4a --- /dev/null +++ b/kernel/fs/jfs/jfs_inode.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INODE +#define _H_JFS_INODE + +struct fid; + +extern struct inode *ialloc(struct inode *, umode_t); +extern int jfs_fsync(struct file *, loff_t, loff_t, int); +extern long jfs_ioctl(struct file *, unsigned int, unsigned long); +extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); +extern struct inode *jfs_iget(struct super_block *, unsigned long); +extern int jfs_commit_inode(struct inode *, int); +extern int jfs_write_inode(struct inode *, struct writeback_control *); +extern void jfs_evict_inode(struct inode *); +extern void jfs_dirty_inode(struct inode *, int); +extern void jfs_truncate(struct inode *); +extern void jfs_truncate_nolock(struct inode *, loff_t); +extern void jfs_free_zero_link(struct inode *); +extern struct dentry *jfs_get_parent(struct dentry *dentry); +extern void jfs_get_inode_flags(struct jfs_inode_info *); +extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern void jfs_set_inode_flags(struct inode *); +extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int jfs_setattr(struct dentry *, struct iattr *); + +extern const struct address_space_operations jfs_aops; +extern const struct inode_operations jfs_dir_inode_operations; +extern const struct file_operations jfs_dir_operations; +extern const struct inode_operations jfs_file_inode_operations; +extern const struct file_operations jfs_file_operations; +extern const struct inode_operations jfs_symlink_inode_operations; +extern const struct inode_operations jfs_fast_symlink_inode_operations; +extern const struct dentry_operations jfs_ci_dentry_operations; +#endif /* _H_JFS_INODE */ diff --git a/kernel/fs/jfs/jfs_lock.h b/kernel/fs/jfs/jfs_lock.h new file mode 100644 index 000000000..ecf048822 --- /dev/null +++ b/kernel/fs/jfs/jfs_lock.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOCK +#define _H_JFS_LOCK + +#include +#include +#include + +/* + * jfs_lock.h + */ + +/* + * Conditional sleep where condition is protected by spinlock + * + * lock_cmd and unlock_cmd take and release the spinlock + */ +#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd) \ +do { \ + DECLARE_WAITQUEUE(__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE);\ + if (cond) \ + break; \ + unlock_cmd; \ + io_schedule(); \ + lock_cmd; \ + } \ + __set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#endif /* _H_JFS_LOCK */ diff --git a/kernel/fs/jfs/jfs_logmgr.c b/kernel/fs/jfs/jfs_logmgr.c new file mode 100644 index 000000000..bc462dcd7 --- /dev/null +++ b/kernel/fs/jfs/jfs_logmgr.c @@ -0,0 +1,2533 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_logmgr.c: log manager + * + * for related information, see transaction manager (jfs_txnmgr.c), and + * recovery manager (jfs_logredo.c). + * + * note: for detail, RTFS. + * + * log buffer manager: + * special purpose buffer manager supporting log i/o requirements. + * per log serial pageout of logpage + * queuing i/o requests and redrive i/o at iodone + * maintain current logpage buffer + * no caching since append only + * appropriate jfs buffer cache buffers as needed + * + * group commit: + * transactions which wrote COMMIT records in the same in-memory + * log page during the pageout of previous/current log page(s) are + * committed together by the pageout of the page. + * + * TBD lazy commit: + * transactions are committed asynchronously when the log page + * containing it COMMIT is paged out when it becomes full; + * + * serialization: + * . a per log lock serialize log write. + * . a per log lock serialize group commit. + * . a per log lock serialize log open/close; + * + * TBD log integrity: + * careful-write (ping-pong) of last logpage to recover from crash + * in overwrite. + * detection of split (out-of-order) write of physical sectors + * of last logpage via timestamp at end of each sector + * with its mirror data array at trailer). + * + * alternatives: + * lsn - 64-bit monotonically increasing integer vs + * 32-bit lspn and page eor. + */ + +#include +#include +#include +#include +#include +#include /* for sync_blockdev() */ +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + + +/* + * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIO thread) + */ +static struct lbuf *log_redrive_list; +static DEFINE_SPINLOCK(log_redrive_lock); + + +/* + * log read/write serialization (per log) + */ +#define LOG_LOCK_INIT(log) mutex_init(&(log)->loglock) +#define LOG_LOCK(log) mutex_lock(&((log)->loglock)) +#define LOG_UNLOCK(log) mutex_unlock(&((log)->loglock)) + + +/* + * log group commit serialization (per log) + */ + +#define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock) +#define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock) +#define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock) +#define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait) + +/* + * log sync serialization (per log) + */ +#define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/4) +/* +#define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/2) +*/ + + +/* + * log buffer cache synchronization + */ +static DEFINE_SPINLOCK(jfsLCacheLock); + +#define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags) +#define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags) + +/* + * See __SLEEP_COND in jfs_locks.h + */ +#define LCACHE_SLEEP_COND(wq, cond, flags) \ +do { \ + if (cond) \ + break; \ + __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \ +} while (0) + +#define LCACHE_WAKEUP(event) wake_up(event) + + +/* + * lbuf buffer cache (lCache) control + */ +/* log buffer manager pageout control (cumulative, inclusive) */ +#define lbmREAD 0x0001 +#define lbmWRITE 0x0002 /* enqueue at tail of write queue; + * init pageout if at head of queue; + */ +#define lbmRELEASE 0x0004 /* remove from write queue + * at completion of pageout; + * do not free/recycle it yet: + * caller will free it; + */ +#define lbmSYNC 0x0008 /* do not return to freelist + * when removed from write queue; + */ +#define lbmFREE 0x0010 /* return to freelist + * at completion of pageout; + * the buffer may be recycled; + */ +#define lbmDONE 0x0020 +#define lbmERROR 0x0040 +#define lbmGC 0x0080 /* lbmIODone to perform post-GC processing + * of log page + */ +#define lbmDIRECT 0x0100 + +/* + * Global list of active external journals + */ +static LIST_HEAD(jfs_external_logs); +static struct jfs_log *dummy_log; +static DEFINE_MUTEX(jfs_log_mutex); + +/* + * forward references + */ +static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, + struct lrd * lrd, struct tlock * tlck); + +static int lmNextPage(struct jfs_log * log); +static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, + int activate); + +static int open_inline_log(struct super_block *sb); +static int open_dummy_log(struct super_block *sb); +static int lbmLogInit(struct jfs_log * log); +static void lbmLogShutdown(struct jfs_log * log); +static struct lbuf *lbmAllocate(struct jfs_log * log, int); +static void lbmFree(struct lbuf * bp); +static void lbmfree(struct lbuf * bp); +static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp); +static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block); +static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag); +static int lbmIOWait(struct lbuf * bp, int flag); +static bio_end_io_t lbmIODone; +static void lbmStartIO(struct lbuf * bp); +static void lmGCwrite(struct jfs_log * log, int cant_block); +static int lmLogSync(struct jfs_log * log, int hard_sync); + + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct lmStat { + uint commit; /* # of commit */ + uint pagedone; /* # of page written */ + uint submitted; /* # of pages submitted */ + uint full_page; /* # of full pages submitted */ + uint partial_page; /* # of partial pages submitted */ +} lmStat; +#endif + +static void write_special_inodes(struct jfs_log *log, + int (*writer)(struct address_space *)) +{ + struct jfs_sb_info *sbi; + + list_for_each_entry(sbi, &log->sb_list, log_list) { + writer(sbi->ipbmap->i_mapping); + writer(sbi->ipimap->i_mapping); + writer(sbi->direct_inode->i_mapping); + } +} + +/* + * NAME: lmLog() + * + * FUNCTION: write a log record; + * + * PARAMETER: + * + * RETURN: lsn - offset to the next log record to write (end-of-log); + * -1 - error; + * + * note: todo: log error handler + */ +int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + int lsn; + int diffp, difft; + struct metapage *mp = NULL; + unsigned long flags; + + jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p", + log, tblk, lrd, tlck); + + LOG_LOCK(log); + + /* log by (out-of-transaction) JFS ? */ + if (tblk == NULL) + goto writeRecord; + + /* log from page ? */ + if (tlck == NULL || + tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL) + goto writeRecord; + + /* + * initialize/update page/transaction recovery lsn + */ + lsn = log->lsn; + + LOGSYNC_LOCK(log, flags); + + /* + * initialize page lsn if first log write of the page + */ + if (mp->lsn == 0) { + mp->log = log; + mp->lsn = lsn; + log->count++; + + /* insert page at tail of logsynclist */ + list_add_tail(&mp->synclist, &log->synclist); + } + + /* + * initialize/update lsn of tblock of the page + * + * transaction inherits oldest lsn of pages associated + * with allocation/deallocation of resources (their + * log records are used to reconstruct allocation map + * at recovery time: inode for inode allocation map, + * B+-tree index of extent descriptors for block + * allocation map); + * allocation map pages inherit transaction lsn at + * commit time to allow forwarding log syncpt past log + * records associated with allocation/deallocation of + * resources only after persistent map of these map pages + * have been updated and propagated to home. + */ + /* + * initialize transaction lsn: + */ + if (tblk->lsn == 0) { + /* inherit lsn of its first page logged */ + tblk->lsn = mp->lsn; + log->count++; + + /* insert tblock after the page on logsynclist */ + list_add(&tblk->synclist, &mp->synclist); + } + /* + * update transaction lsn: + */ + else { + /* inherit oldest/smallest lsn of page */ + logdiff(diffp, mp->lsn, log); + logdiff(difft, tblk->lsn, log); + if (diffp < difft) { + /* update tblock lsn with page lsn */ + tblk->lsn = mp->lsn; + + /* move tblock after page on logsynclist */ + list_move(&tblk->synclist, &mp->synclist); + } + } + + LOGSYNC_UNLOCK(log, flags); + + /* + * write the log record + */ + writeRecord: + lsn = lmWriteRecord(log, tblk, lrd, tlck); + + /* + * forward log syncpt if log reached next syncpt trigger + */ + logdiff(diffp, lsn, log); + if (diffp >= log->nextsync) + lsn = lmLogSync(log, 0); + + /* update end-of-log lsn */ + log->lsn = lsn; + + LOG_UNLOCK(log); + + /* return end-of-log address */ + return lsn; +} + +/* + * NAME: lmWriteRecord() + * + * FUNCTION: move the log record to current log page + * + * PARAMETER: cd - commit descriptor + * + * RETURN: end-of-log address + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int +lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + int lsn = 0; /* end-of-log address */ + struct lbuf *bp; /* dst log page buffer */ + struct logpage *lp; /* dst log page */ + caddr_t dst; /* destination address in log page */ + int dstoffset; /* end-of-log offset in log page */ + int freespace; /* free space in log page */ + caddr_t p; /* src meta-data page */ + caddr_t src; + int srclen; + int nbytes; /* number of bytes to move */ + int i; + int len; + struct linelock *linelock; + struct lv *lv; + struct lvd *lvd; + int l2linesize; + + len = 0; + + /* retrieve destination log page to write */ + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = log->eor; + + /* any log data to write ? */ + if (tlck == NULL) + goto moveLrd; + + /* + * move log record data + */ + /* retrieve source meta-data page to log */ + if (tlck->flag & tlckPAGELOCK) { + p = (caddr_t) (tlck->mp->data); + linelock = (struct linelock *) & tlck->lock; + } + /* retrieve source in-memory inode to log */ + else if (tlck->flag & tlckINODELOCK) { + if (tlck->type & tlckDTREE) + p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot; + else + p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; + linelock = (struct linelock *) & tlck->lock; + } +#ifdef _JFS_WIP + else if (tlck->flag & tlckINLINELOCK) { + + inlinelock = (struct inlinelock *) & tlck; + p = (caddr_t) & inlinelock->pxd; + linelock = (struct linelock *) & tlck; + } +#endif /* _JFS_WIP */ + else { + jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); + return 0; /* Probably should trap */ + } + l2linesize = linelock->l2linesize; + + moveData: + ASSERT(linelock->index <= linelock->maxcnt); + + lv = linelock->lv; + for (i = 0; i < linelock->index; i++, lv++) { + if (lv->length == 0) + continue; + + /* is page full ? */ + if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) { + /* page become full: move on to next page */ + lmNextPage(log); + + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + } + + /* + * move log vector data + */ + src = (u8 *) p + (lv->offset << l2linesize); + srclen = lv->length << l2linesize; + len += srclen; + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + dstoffset += nbytes; + + /* is page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + break; + + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + + srclen -= nbytes; + src += nbytes; + } + + /* + * move log vector descriptor + */ + len += 4; + lvd = (struct lvd *) ((caddr_t) lp + dstoffset); + lvd->offset = cpu_to_le16(lv->offset); + lvd->length = cpu_to_le16(lv->length); + dstoffset += 4; + jfs_info("lmWriteRecord: lv offset:%d length:%d", + lv->offset, lv->length); + } + + if ((i = linelock->next)) { + linelock = (struct linelock *) lid_to_tlock(i); + goto moveData; + } + + /* + * move log record descriptor + */ + moveLrd: + lrd->length = cpu_to_le16(len); + + src = (caddr_t) lrd; + srclen = LOGRDSIZE; + + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + + dstoffset += nbytes; + srclen -= nbytes; + + /* are there more to move than freespace of page ? */ + if (srclen) + goto pageFull; + + /* + * end of log record descriptor + */ + + /* update last log record eor */ + log->eor = dstoffset; + bp->l_eor = dstoffset; + lsn = (log->page << L2LOGPSIZE) + dstoffset; + + if (lrd->type & cpu_to_le16(LOG_COMMIT)) { + tblk->clsn = lsn; + jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn, + bp->l_eor); + + INCREMENT(lmStat.commit); /* # of commit */ + + /* + * enqueue tblock for group commit: + * + * enqueue tblock of non-trivial/synchronous COMMIT + * at tail of group commit queue + * (trivial/asynchronous COMMITs are ignored by + * group commit.) + */ + LOGGC_LOCK(log); + + /* init tblock gc state */ + tblk->flag = tblkGC_QUEUE; + tblk->bp = log->bp; + tblk->pn = log->page; + tblk->eor = log->eor; + + /* enqueue transaction to commit queue */ + list_add_tail(&tblk->cqueue, &log->cqueue); + + LOGGC_UNLOCK(log); + } + + jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x", + le16_to_cpu(lrd->type), log->bp, log->page, dstoffset); + + /* page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + return lsn; + + pageFull: + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + src += nbytes; + } + + return lsn; +} + + +/* + * NAME: lmNextPage() + * + * FUNCTION: write current page and allocate next page. + * + * PARAMETER: log + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmNextPage(struct jfs_log * log) +{ + struct logpage *lp; + int lspn; /* log sequence page number */ + int pn; /* current page number */ + struct lbuf *bp; + struct lbuf *nextbp; + struct tblock *tblk; + + /* get current log page number and log sequence page number */ + pn = log->page; + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + lspn = le32_to_cpu(lp->h.page); + + LOGGC_LOCK(log); + + /* + * write or queue the full page at the tail of write queue + */ + /* get the tail tblk on commit queue */ + if (list_empty(&log->cqueue)) + tblk = NULL; + else + tblk = list_entry(log->cqueue.prev, struct tblock, cqueue); + + /* every tblk who has COMMIT record on the current page, + * and has not been committed, must be on commit queue + * since tblk is queued at commit queueu at the time + * of writing its COMMIT record on the page before + * page becomes full (even though the tblk thread + * who wrote COMMIT record may have been suspended + * currently); + */ + + /* is page bound with outstanding tail tblk ? */ + if (tblk && tblk->pn == pn) { + /* mark tblk for end-of-page */ + tblk->flag |= tblkGC_EOP; + + if (log->cflag & logGC_PAGEOUT) { + /* if page is not already on write queue, + * just enqueue (no lbmWRITE to prevent redrive) + * buffer to wqueue to ensure correct serial order + * of the pages since log pages will be added + * continuously + */ + if (bp->l_wqnext == NULL) + lbmWrite(log, bp, 0, 0); + } else { + /* + * No current GC leader, initiate group commit + */ + log->cflag |= logGC_PAGEOUT; + lmGCwrite(log, 0); + } + } + /* page is not bound with outstanding tblk: + * init write or mark it to be redriven (lbmWRITE) + */ + else { + /* finalize the page */ + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0); + } + LOGGC_UNLOCK(log); + + /* + * allocate/initialize next page + */ + /* if log wraps, the first data page of log is 2 + * (0 never used, 1 is superblock). + */ + log->page = (pn == log->size - 1) ? 2 : pn + 1; + log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */ + + /* allocate/initialize next log page buffer */ + nextbp = lbmAllocate(log, log->page); + nextbp->l_eor = log->eor; + log->bp = nextbp; + + /* initialize next log page */ + lp = (struct logpage *) nextbp->l_ldata; + lp->h.page = lp->t.page = cpu_to_le32(lspn + 1); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + return 0; +} + + +/* + * NAME: lmGroupCommit() + * + * FUNCTION: group commit + * initiate pageout of the pages with COMMIT in the order of + * page number - redrive pageout of the page at the head of + * pageout queue until full page has been written. + * + * RETURN: + * + * NOTE: + * LOGGC_LOCK serializes log group commit queue, and + * transaction blocks on the commit queue. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +int lmGroupCommit(struct jfs_log * log, struct tblock * tblk) +{ + int rc = 0; + + LOGGC_LOCK(log); + + /* group committed already ? */ + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; + } + jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc); + + if (tblk->xflag & COMMIT_LAZY) + tblk->flag |= tblkGC_LAZY; + + if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) && + (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag) + || jfs_tlocks_low)) { + /* + * No pageout in progress + * + * start group commit as its group leader. + */ + log->cflag |= logGC_PAGEOUT; + + lmGCwrite(log, 0); + } + + if (tblk->xflag & COMMIT_LAZY) { + /* + * Lazy transactions can leave now + */ + LOGGC_UNLOCK(log); + return 0; + } + + /* lmGCwrite gives up LOGGC_LOCK, check again */ + + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; + } + + /* upcount transaction waiting for completion + */ + log->gcrtc++; + tblk->flag |= tblkGC_READY; + + __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED), + LOGGC_LOCK(log), LOGGC_UNLOCK(log)); + + /* removed from commit queue */ + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; +} + +/* + * NAME: lmGCwrite() + * + * FUNCTION: group commit write + * initiate write of log page, building a group of all transactions + * with commit records on that page. + * + * RETURN: None + * + * NOTE: + * LOGGC_LOCK must be held by caller. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +static void lmGCwrite(struct jfs_log * log, int cant_write) +{ + struct lbuf *bp; + struct logpage *lp; + int gcpn; /* group commit page number */ + struct tblock *tblk; + struct tblock *xtblk = NULL; + + /* + * build the commit group of a log page + * + * scan commit queue and make a commit group of all + * transactions with COMMIT records on the same log page. + */ + /* get the head tblk on the commit queue */ + gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn; + + list_for_each_entry(tblk, &log->cqueue, cqueue) { + if (tblk->pn != gcpn) + break; + + xtblk = tblk; + + /* state transition: (QUEUE, READY) -> COMMIT */ + tblk->flag |= tblkGC_COMMIT; + } + tblk = xtblk; /* last tblk of the page */ + + /* + * pageout to commit transactions on the log page. + */ + bp = (struct lbuf *) tblk->bp; + lp = (struct logpage *) bp->l_ldata; + /* is page already full ? */ + if (tblk->flag & tblkGC_EOP) { + /* mark page to free at end of group commit of the page */ + tblk->flag &= ~tblkGC_EOP; + tblk->flag |= tblkGC_FREE; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC, + cant_write); + INCREMENT(lmStat.full_page); + } + /* page is not yet full */ + else { + bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */ + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write); + INCREMENT(lmStat.partial_page); + } +} + +/* + * NAME: lmPostGC() + * + * FUNCTION: group commit post-processing + * Processes transactions after their commit records have been written + * to disk, redriving log I/O if necessary. + * + * RETURN: None + * + * NOTE: + * This routine is called a interrupt time by lbmIODone + */ +static void lmPostGC(struct lbuf * bp) +{ + unsigned long flags; + struct jfs_log *log = bp->l_log; + struct logpage *lp; + struct tblock *tblk, *temp; + + //LOGGC_LOCK(log); + spin_lock_irqsave(&log->gclock, flags); + /* + * current pageout of group commit completed. + * + * remove/wakeup transactions from commit queue who were + * group committed with the current log page + */ + list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) { + if (!(tblk->flag & tblkGC_COMMIT)) + break; + /* if transaction was marked GC_COMMIT then + * it has been shipped in the current pageout + * and made it to disk - it is committed. + */ + + if (bp->l_flag & lbmERROR) + tblk->flag |= tblkGC_ERROR; + + /* remove it from the commit queue */ + list_del(&tblk->cqueue); + tblk->flag &= ~tblkGC_QUEUE; + + if (tblk == log->flush_tblk) { + /* we can stop flushing the log now */ + clear_bit(log_FLUSH, &log->flag); + log->flush_tblk = NULL; + } + + jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk, + tblk->flag); + + if (!(tblk->xflag & COMMIT_FORCE)) + /* + * Hand tblk over to lazy commit thread + */ + txLazyUnlock(tblk); + else { + /* state transition: COMMIT -> COMMITTED */ + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) + log->gcrtc--; + + LOGGC_WAKEUP(tblk); + } + + /* was page full before pageout ? + * (and this is the last tblk bound with the page) + */ + if (tblk->flag & tblkGC_FREE) + lbmFree(bp); + /* did page become full after pageout ? + * (and this is the last tblk bound with the page) + */ + else if (tblk->flag & tblkGC_EOP) { + /* finalize the page */ + lp = (struct logpage *) bp->l_ldata; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + jfs_info("lmPostGC: calling lbmWrite"); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, + 1); + } + + } + + /* are there any transactions who have entered lnGroupCommit() + * (whose COMMITs are after that of the last log page written. + * They are waiting for new group commit (above at (SLEEP 1)) + * or lazy transactions are on a full (queued) log page, + * select the latest ready transaction as new group leader and + * wake her up to lead her group. + */ + if ((!list_empty(&log->cqueue)) && + ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) || + test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) + /* + * Call lmGCwrite with new group leader + */ + lmGCwrite(log, 1); + + /* no transaction are ready yet (transactions are only just + * queued (GC_QUEUE) and not entered for group commit yet). + * the first transaction entering group commit + * will elect herself as new group leader. + */ + else + log->cflag &= ~logGC_PAGEOUT; + + //LOGGC_UNLOCK(log); + spin_unlock_irqrestore(&log->gclock, flags); + return; +} + +/* + * NAME: lmLogSync() + * + * FUNCTION: write log SYNCPT record for specified log + * if new sync address is available + * (normally the case if sync() is executed by back-ground + * process). + * calculate new value of i_nextsync which determines when + * this code is called again. + * + * PARAMETERS: log - log structure + * hard_sync - 1 to force all metadata to be written + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmLogSync(struct jfs_log * log, int hard_sync) +{ + int logsize; + int written; /* written since last syncpt */ + int free; /* free space left available */ + int delta; /* additional delta to write normally */ + int more; /* additional write granted */ + struct lrd lrd; + int lsn; + struct logsyncblk *lp; + unsigned long flags; + + /* push dirty metapages out to disk */ + if (hard_sync) + write_special_inodes(log, filemap_fdatawrite); + else + write_special_inodes(log, filemap_flush); + + /* + * forward syncpt + */ + /* if last sync is same as last syncpt, + * invoke sync point forward processing to update sync. + */ + + if (log->sync == log->syncpt) { + LOGSYNC_LOCK(log, flags); + if (list_empty(&log->synclist)) + log->sync = log->lsn; + else { + lp = list_entry(log->synclist.next, + struct logsyncblk, synclist); + log->sync = lp->lsn; + } + LOGSYNC_UNLOCK(log, flags); + + } + + /* if sync is different from last syncpt, + * write a SYNCPT record with syncpt = sync. + * reset syncpt = sync + */ + if (log->sync != log->syncpt) { + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = cpu_to_le32(log->sync); + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + + log->syncpt = log->sync; + } else + lsn = log->lsn; + + /* + * setup next syncpt trigger (SWAG) + */ + logsize = log->logsize; + + logdiff(written, lsn, log); + free = logsize - written; + delta = LOGSYNC_DELTA(logsize); + more = min(free / 2, delta); + if (more < 2 * LOGPSIZE) { + jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); + /* + * log wrapping + * + * option 1 - panic ? No.! + * option 2 - shutdown file systems + * associated with log ? + * option 3 - extend log ? + * option 4 - second chance + * + * mark log wrapped, and continue. + * when all active transactions are completed, + * mark log valid for recovery. + * if crashed during invalid state, log state + * implies invalid log, forcing fsck(). + */ + /* mark log state log wrap in log superblock */ + /* log->state = LOGWRAP; */ + + /* reset sync point computation */ + log->syncpt = log->sync = lsn; + log->nextsync = delta; + } else + /* next syncpt trigger = written + more */ + log->nextsync = written + more; + + /* if number of bytes written from last sync point is more + * than 1/4 of the log size, stop new transactions from + * starting until all current transactions are completed + * by setting syncbarrier flag. + */ + if (!test_bit(log_SYNCBARRIER, &log->flag) && + (written > LOGSYNC_BARRIER(logsize)) && log->active) { + set_bit(log_SYNCBARRIER, &log->flag); + jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn, + log->syncpt); + /* + * We may have to initiate group commit + */ + jfs_flush_journal(log, 0); + } + + return lsn; +} + +/* + * NAME: jfs_syncpt + * + * FUNCTION: write log SYNCPT record for specified log + * + * PARAMETERS: log - log structure + * hard_sync - set to 1 to force metadata to be written + */ +void jfs_syncpt(struct jfs_log *log, int hard_sync) +{ LOG_LOCK(log); + if (!test_bit(log_QUIESCE, &log->flag)) + lmLogSync(log, hard_sync); + LOG_UNLOCK(log); +} + +/* + * NAME: lmLogOpen() + * + * FUNCTION: open the log on first open; + * insert filesystem in the active list of the log. + * + * PARAMETER: ipmnt - file system mount inode + * iplog - log inode (out) + * + * RETURN: + * + * serialization: + */ +int lmLogOpen(struct super_block *sb) +{ + int rc; + struct block_device *bdev; + struct jfs_log *log; + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sbi->flag & JFS_NOINTEGRITY) + return open_dummy_log(sb); + + if (sbi->mntflag & JFS_INLINELOG) + return open_inline_log(sb); + + mutex_lock(&jfs_log_mutex); + list_for_each_entry(log, &jfs_external_logs, journal_list) { + if (log->bdev->bd_dev == sbi->logdev) { + if (memcmp(log->uuid, sbi->loguuid, + sizeof(log->uuid))) { + jfs_warn("wrong uuid on JFS journal\n"); + mutex_unlock(&jfs_log_mutex); + return -EINVAL; + } + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sbi, 1))) { + mutex_unlock(&jfs_log_mutex); + return rc; + } + goto journal_found; + } + } + + if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) { + mutex_unlock(&jfs_log_mutex); + return -ENOMEM; + } + INIT_LIST_HEAD(&log->sb_list); + init_waitqueue_head(&log->syncwait); + + /* + * external log as separate logical volume + * + * file systems to log may have n-to-1 relationship; + */ + + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); + if (IS_ERR(bdev)) { + rc = PTR_ERR(bdev); + goto free; + } + + log->bdev = bdev; + memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); + + /* + * initialize log: + */ + if ((rc = lmLogInit(log))) + goto close; + + list_add(&log->journal_list, &jfs_external_logs); + + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sbi, 1))) + goto shutdown; + +journal_found: + LOG_LOCK(log); + list_add(&sbi->log_list, &log->sb_list); + sbi->log = log; + LOG_UNLOCK(log); + + mutex_unlock(&jfs_log_mutex); + return 0; + + /* + * unwind on error + */ + shutdown: /* unwind lbmLogInit() */ + list_del(&log->journal_list); + lbmLogShutdown(log); + + close: /* close external log device */ + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + free: /* free log descriptor */ + mutex_unlock(&jfs_log_mutex); + kfree(log); + + jfs_warn("lmLogOpen: exit(%d)", rc); + return rc; +} + +static int open_inline_log(struct super_block *sb) +{ + struct jfs_log *log; + int rc; + + if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) + return -ENOMEM; + INIT_LIST_HEAD(&log->sb_list); + init_waitqueue_head(&log->syncwait); + + set_bit(log_INLINELOG, &log->flag); + log->bdev = sb->s_bdev; + log->base = addressPXD(&JFS_SBI(sb)->logpxd); + log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> + (L2LOGPSIZE - sb->s_blocksize_bits); + log->l2bsize = sb->s_blocksize_bits; + ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits); + + /* + * initialize log. + */ + if ((rc = lmLogInit(log))) { + kfree(log); + jfs_warn("lmLogOpen: exit(%d)", rc); + return rc; + } + + list_add(&JFS_SBI(sb)->log_list, &log->sb_list); + JFS_SBI(sb)->log = log; + + return rc; +} + +static int open_dummy_log(struct super_block *sb) +{ + int rc; + + mutex_lock(&jfs_log_mutex); + if (!dummy_log) { + dummy_log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL); + if (!dummy_log) { + mutex_unlock(&jfs_log_mutex); + return -ENOMEM; + } + INIT_LIST_HEAD(&dummy_log->sb_list); + init_waitqueue_head(&dummy_log->syncwait); + dummy_log->no_integrity = 1; + /* Make up some stuff */ + dummy_log->base = 0; + dummy_log->size = 1024; + rc = lmLogInit(dummy_log); + if (rc) { + kfree(dummy_log); + dummy_log = NULL; + mutex_unlock(&jfs_log_mutex); + return rc; + } + } + + LOG_LOCK(dummy_log); + list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list); + JFS_SBI(sb)->log = dummy_log; + LOG_UNLOCK(dummy_log); + mutex_unlock(&jfs_log_mutex); + + return 0; +} + +/* + * NAME: lmLogInit() + * + * FUNCTION: log initialization at first log open. + * + * logredo() (or logformat()) should have been run previously. + * initialize the log from log superblock. + * set the log state in the superblock to LOGMOUNT and + * write SYNCPT log record. + * + * PARAMETER: log - log structure + * + * RETURN: 0 - if ok + * -EINVAL - bad log magic number or superblock dirty + * error returned from logwait() + * + * serialization: single first open thread + */ +int lmLogInit(struct jfs_log * log) +{ + int rc = 0; + struct lrd lrd; + struct logsuper *logsuper; + struct lbuf *bpsuper; + struct lbuf *bp; + struct logpage *lp; + int lsn = 0; + + jfs_info("lmLogInit: log:0x%p", log); + + /* initialize the group commit serialization lock */ + LOGGC_LOCK_INIT(log); + + /* allocate/initialize the log write serialization lock */ + LOG_LOCK_INIT(log); + + LOGSYNC_LOCK_INIT(log); + + INIT_LIST_HEAD(&log->synclist); + + INIT_LIST_HEAD(&log->cqueue); + log->flush_tblk = NULL; + + log->count = 0; + + /* + * initialize log i/o + */ + if ((rc = lbmLogInit(log))) + return rc; + + if (!test_bit(log_INLINELOG, &log->flag)) + log->l2bsize = L2LOGPSIZE; + + /* check for disabled journaling to disk */ + if (log->no_integrity) { + /* + * Journal pages will still be filled. When the time comes + * to actually do the I/O, the write is not done, and the + * endio routine is called directly. + */ + bp = lbmAllocate(log , 0); + log->bp = bp; + bp->l_pn = bp->l_eor = 0; + } else { + /* + * validate log superblock + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto errout10; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + + if (logsuper->magic != cpu_to_le32(LOGMAGIC)) { + jfs_warn("*** Log Format Error ! ***"); + rc = -EINVAL; + goto errout20; + } + + /* logredo() should have been run successfully. */ + if (logsuper->state != cpu_to_le32(LOGREDONE)) { + jfs_warn("*** Log Is Dirty ! ***"); + rc = -EINVAL; + goto errout20; + } + + /* initialize log from log superblock */ + if (test_bit(log_INLINELOG,&log->flag)) { + if (log->size != le32_to_cpu(logsuper->size)) { + rc = -EINVAL; + goto errout20; + } + jfs_info("lmLogInit: inline log:0x%p base:0x%Lx " + "size:0x%x", log, + (unsigned long long) log->base, log->size); + } else { + if (memcmp(logsuper->uuid, log->uuid, 16)) { + jfs_warn("wrong uuid on JFS log device"); + goto errout20; + } + log->size = le32_to_cpu(logsuper->size); + log->l2bsize = le32_to_cpu(logsuper->l2bsize); + jfs_info("lmLogInit: external log:0x%p base:0x%Lx " + "size:0x%x", log, + (unsigned long long) log->base, log->size); + } + + log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; + log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page); + + /* + * initialize for log append write mode + */ + /* establish current/end-of-log page/buffer */ + if ((rc = lbmRead(log, log->page, &bp))) + goto errout20; + + lp = (struct logpage *) bp->l_ldata; + + jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d", + le32_to_cpu(logsuper->end), log->page, log->eor, + le16_to_cpu(lp->h.eor)); + + log->bp = bp; + bp->l_pn = log->page; + bp->l_eor = log->eor; + + /* if current page is full, move on to next page */ + if (log->eor >= LOGPSIZE - LOGPTLRSIZE) + lmNextPage(log); + + /* + * initialize log syncpoint + */ + /* + * write the first SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !); + * remove current page from lbm write queue at end of pageout + * (to write log superblock update), but do not release to + * freelist; + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + bp->l_ceor = bp->l_eor; + lp = (struct logpage *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0); + if ((rc = lbmIOWait(bp, 0))) + goto errout30; + + /* + * update/write superblock + */ + logsuper->state = cpu_to_le32(LOGMOUNT); + log->serial = le32_to_cpu(logsuper->serial) + 1; + logsuper->serial = cpu_to_le32(log->serial); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + if ((rc = lbmIOWait(bpsuper, lbmFREE))) + goto errout30; + } + + /* initialize logsync parameters */ + log->logsize = (log->size - 2) << L2LOGPSIZE; + log->lsn = lsn; + log->syncpt = lsn; + log->sync = log->syncpt; + log->nextsync = LOGSYNC_DELTA(log->logsize); + + jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x", + log->lsn, log->syncpt, log->sync); + + /* + * initialize for lazy/group commit + */ + log->clsn = lsn; + + return 0; + + /* + * unwind on error + */ + errout30: /* release log page */ + log->wqueue = NULL; + bp->l_wqnext = NULL; + lbmFree(bp); + + errout20: /* release log superblock */ + lbmFree(bpsuper); + + errout10: /* unwind lbmLogInit() */ + lbmLogShutdown(log); + + jfs_warn("lmLogInit: exit(%d)", rc); + return rc; +} + + +/* + * NAME: lmLogClose() + * + * FUNCTION: remove file system from active list of log + * and close it on last close. + * + * PARAMETER: sb - superblock + * + * RETURN: errors from subroutines + * + * serialization: + */ +int lmLogClose(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + struct block_device *bdev; + int rc = 0; + + jfs_info("lmLogClose: log:0x%p", log); + + mutex_lock(&jfs_log_mutex); + LOG_LOCK(log); + list_del(&sbi->log_list); + LOG_UNLOCK(log); + sbi->log = NULL; + + /* + * We need to make sure all of the "written" metapages + * actually make it to disk + */ + sync_blockdev(sb->s_bdev); + + if (test_bit(log_INLINELOG, &log->flag)) { + /* + * in-line log in host file system + */ + rc = lmLogShutdown(log); + kfree(log); + goto out; + } + + if (!log->no_integrity) + lmLogFileSystem(log, sbi, 0); + + if (!list_empty(&log->sb_list)) + goto out; + + /* + * TODO: ensure that the dummy_log is in a state to allow + * lbmLogShutdown to deallocate all the buffers and call + * kfree against dummy_log. For now, leave dummy_log & its + * buffers in memory, and resuse if another no-integrity mount + * is requested. + */ + if (log->no_integrity) + goto out; + + /* + * external log as separate logical volume + */ + list_del(&log->journal_list); + bdev = log->bdev; + rc = lmLogShutdown(log); + + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + kfree(log); + + out: + mutex_unlock(&jfs_log_mutex); + jfs_info("lmLogClose: exit(%d)", rc); + return rc; +} + + +/* + * NAME: jfs_flush_journal() + * + * FUNCTION: initiate write of any outstanding transactions to the journal + * and optionally wait until they are all written to disk + * + * wait == 0 flush until latest txn is committed, don't wait + * wait == 1 flush until latest txn is committed, wait + * wait > 1 flush until all txn's are complete, wait + */ +void jfs_flush_journal(struct jfs_log *log, int wait) +{ + int i; + struct tblock *target = NULL; + + /* jfs_write_inode may call us during read-only mount */ + if (!log) + return; + + jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait); + + LOGGC_LOCK(log); + + if (!list_empty(&log->cqueue)) { + /* + * This ensures that we will keep writing to the journal as long + * as there are unwritten commit records + */ + target = list_entry(log->cqueue.prev, struct tblock, cqueue); + + if (test_bit(log_FLUSH, &log->flag)) { + /* + * We're already flushing. + * if flush_tblk is NULL, we are flushing everything, + * so leave it that way. Otherwise, update it to the + * latest transaction + */ + if (log->flush_tblk) + log->flush_tblk = target; + } else { + /* Only flush until latest transaction is committed */ + log->flush_tblk = target; + set_bit(log_FLUSH, &log->flag); + + /* + * Initiate I/O on outstanding transactions + */ + if (!(log->cflag & logGC_PAGEOUT)) { + log->cflag |= logGC_PAGEOUT; + lmGCwrite(log, 0); + } + } + } + if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) { + /* Flush until all activity complete */ + set_bit(log_FLUSH, &log->flag); + log->flush_tblk = NULL; + } + + if (wait && target && !(target->flag & tblkGC_COMMITTED)) { + DECLARE_WAITQUEUE(__wait, current); + + add_wait_queue(&target->gcwait, &__wait); + set_current_state(TASK_UNINTERRUPTIBLE); + LOGGC_UNLOCK(log); + schedule(); + LOGGC_LOCK(log); + remove_wait_queue(&target->gcwait, &__wait); + } + LOGGC_UNLOCK(log); + + if (wait < 2) + return; + + write_special_inodes(log, filemap_fdatawrite); + + /* + * If there was recent activity, we may need to wait + * for the lazycommit thread to catch up + */ + if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { + for (i = 0; i < 200; i++) { /* Too much? */ + msleep(250); + write_special_inodes(log, filemap_fdatawrite); + if (list_empty(&log->cqueue) && + list_empty(&log->synclist)) + break; + } + } + assert(list_empty(&log->cqueue)); + +#ifdef CONFIG_JFS_DEBUG + if (!list_empty(&log->synclist)) { + struct logsyncblk *lp; + + printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); + list_for_each_entry(lp, &log->synclist, synclist) { + if (lp->xflag & COMMIT_PAGE) { + struct metapage *mp = (struct metapage *)lp; + print_hex_dump(KERN_ERR, "metapage: ", + DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(struct metapage), 0); + print_hex_dump(KERN_ERR, "page: ", + DUMP_PREFIX_ADDRESS, 16, + sizeof(long), mp->page, + sizeof(struct page), 0); + } else + print_hex_dump(KERN_ERR, "tblock:", + DUMP_PREFIX_ADDRESS, 16, 4, + lp, sizeof(struct tblock), 0); + } + } +#else + WARN_ON(!list_empty(&log->synclist)); +#endif + clear_bit(log_FLUSH, &log->flag); +} + +/* + * NAME: lmLogShutdown() + * + * FUNCTION: log shutdown at last LogClose(). + * + * write log syncpt record. + * update super block to set redone flag to 0. + * + * PARAMETER: log - log inode + * + * RETURN: 0 - success + * + * serialization: single last close thread + */ +int lmLogShutdown(struct jfs_log * log) +{ + int rc; + struct lrd lrd; + int lsn; + struct logsuper *logsuper; + struct lbuf *bpsuper; + struct lbuf *bp; + struct logpage *lp; + + jfs_info("lmLogShutdown: log:0x%p", log); + + jfs_flush_journal(log, 2); + + /* + * write the last SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !) + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0); + lbmIOWait(log->bp, lbmFREE); + log->bp = NULL; + + /* + * synchronous update log superblock + * mark log state as shutdown cleanly + * (i.e., Log does not need to be replayed). + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto out; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->end = cpu_to_le32(lsn); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d", + lsn, log->page, log->eor); + + out: + /* + * shutdown per log i/o + */ + lbmLogShutdown(log); + + if (rc) { + jfs_warn("lmLogShutdown: exit(%d)", rc); + } + return rc; +} + + +/* + * NAME: lmLogFileSystem() + * + * FUNCTION: insert ( = true)/remove ( = false) + * file system into/from log active file system list. + * + * PARAMETE: log - pointer to logs inode. + * fsdev - kdev_t of filesystem. + * serial - pointer to returned log serial number + * activate - insert/remove device from active list. + * + * RETURN: 0 - success + * errors returned by vms_iowait(). + */ +static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, + int activate) +{ + int rc = 0; + int i; + struct logsuper *logsuper; + struct lbuf *bpsuper; + char *uuid = sbi->uuid; + + /* + * insert/remove file system device to log active file system list. + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + return rc; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + if (activate) { + for (i = 0; i < MAX_ACTIVE; i++) + if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) { + memcpy(logsuper->active[i].uuid, uuid, 16); + sbi->aggregate = i; + break; + } + if (i == MAX_ACTIVE) { + jfs_warn("Too many file systems sharing journal!"); + lbmFree(bpsuper); + return -EMFILE; /* Is there a better rc? */ + } + } else { + for (i = 0; i < MAX_ACTIVE; i++) + if (!memcmp(logsuper->active[i].uuid, uuid, 16)) { + memcpy(logsuper->active[i].uuid, NULL_UUID, 16); + break; + } + if (i == MAX_ACTIVE) { + jfs_warn("Somebody stomped on the journal!"); + lbmFree(bpsuper); + return -EIO; + } + + } + + /* + * synchronous write log superblock: + * + * write sidestream bypassing write queue: + * at file system mount, log super block is updated for + * activation of the file system before any log record + * (MOUNT record) of the file system, and at file system + * unmount, all meta data for the file system has been + * flushed before log super block is updated for deactivation + * of the file system. + */ + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + return rc; +} + +/* + * log buffer manager (lbm) + * ------------------------ + * + * special purpose buffer manager supporting log i/o requirements. + * + * per log write queue: + * log pageout occurs in serial order by fifo write queue and + * restricting to a single i/o in pregress at any one time. + * a circular singly-linked list + * (log->wrqueue points to the tail, and buffers are linked via + * bp->wrqueue field), and + * maintains log page in pageout ot waiting for pageout in serial pageout. + */ + +/* + * lbmLogInit() + * + * initialize per log I/O setup at lmLogInit() + */ +static int lbmLogInit(struct jfs_log * log) +{ /* log inode */ + int i; + struct lbuf *lbuf; + + jfs_info("lbmLogInit: log:0x%p", log); + + /* initialize current buffer cursor */ + log->bp = NULL; + + /* initialize log device write queue */ + log->wqueue = NULL; + + /* + * Each log has its own buffer pages allocated to it. These are + * not managed by the page cache. This ensures that a transaction + * writing to the log does not block trying to allocate a page from + * the page cache (for the log). This would be bad, since page + * allocation waits on the kswapd thread that may be committing inodes + * which would cause log activity. Was that clear? I'm trying to + * avoid deadlock here. + */ + init_waitqueue_head(&log->free_wait); + + log->lbuf_free = NULL; + + for (i = 0; i < LOGPAGES;) { + char *buffer; + uint offset; + struct page *page; + + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (buffer == NULL) + goto error; + page = virt_to_page(buffer); + for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) { + lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); + if (lbuf == NULL) { + if (offset == 0) + free_page((unsigned long) buffer); + goto error; + } + if (offset) /* we already have one reference */ + get_page(page); + lbuf->l_offset = offset; + lbuf->l_ldata = buffer + offset; + lbuf->l_page = page; + lbuf->l_log = log; + init_waitqueue_head(&lbuf->l_ioevent); + + lbuf->l_freelist = log->lbuf_free; + log->lbuf_free = lbuf; + i++; + } + } + + return (0); + + error: + lbmLogShutdown(log); + return -ENOMEM; +} + + +/* + * lbmLogShutdown() + * + * finalize per log I/O setup at lmLogShutdown() + */ +static void lbmLogShutdown(struct jfs_log * log) +{ + struct lbuf *lbuf; + + jfs_info("lbmLogShutdown: log:0x%p", log); + + lbuf = log->lbuf_free; + while (lbuf) { + struct lbuf *next = lbuf->l_freelist; + __free_page(lbuf->l_page); + kfree(lbuf); + lbuf = next; + } +} + + +/* + * lbmAllocate() + * + * allocate an empty log buffer + */ +static struct lbuf *lbmAllocate(struct jfs_log * log, int pn) +{ + struct lbuf *bp; + unsigned long flags; + + /* + * recycle from log buffer freelist if any + */ + LCACHE_LOCK(flags); + LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags); + log->lbuf_free = bp->l_freelist; + LCACHE_UNLOCK(flags); + + bp->l_flag = 0; + + bp->l_wqnext = NULL; + bp->l_freelist = NULL; + + bp->l_pn = pn; + bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); + bp->l_ceor = 0; + + return bp; +} + + +/* + * lbmFree() + * + * release a log buffer to freelist + */ +static void lbmFree(struct lbuf * bp) +{ + unsigned long flags; + + LCACHE_LOCK(flags); + + lbmfree(bp); + + LCACHE_UNLOCK(flags); +} + +static void lbmfree(struct lbuf * bp) +{ + struct jfs_log *log = bp->l_log; + + assert(bp->l_wqnext == NULL); + + /* + * return the buffer to head of freelist + */ + bp->l_freelist = log->lbuf_free; + log->lbuf_free = bp; + + wake_up(&log->free_wait); + return; +} + + +/* + * NAME: lbmRedrive + * + * FUNCTION: add a log buffer to the log redrive list + * + * PARAMETER: + * bp - log buffer + * + * NOTES: + * Takes log_redrive_lock. + */ +static inline void lbmRedrive(struct lbuf *bp) +{ + unsigned long flags; + + spin_lock_irqsave(&log_redrive_lock, flags); + bp->l_redrive_next = log_redrive_list; + log_redrive_list = bp; + spin_unlock_irqrestore(&log_redrive_lock, flags); + + wake_up_process(jfsIOthread); +} + + +/* + * lbmRead() + */ +static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) +{ + struct bio *bio; + struct lbuf *bp; + + /* + * allocate a log buffer + */ + *bpp = bp = lbmAllocate(log, pn); + jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn); + + bp->l_flag |= lbmREAD; + + bio = bio_alloc(GFP_NOFS, 1); + + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_bdev = log->bdev; + bio->bi_io_vec[0].bv_page = bp->l_page; + bio->bi_io_vec[0].bv_len = LOGPSIZE; + bio->bi_io_vec[0].bv_offset = bp->l_offset; + + bio->bi_vcnt = 1; + bio->bi_iter.bi_size = LOGPSIZE; + + bio->bi_end_io = lbmIODone; + bio->bi_private = bp; + /*check if journaling to disk has been disabled*/ + if (log->no_integrity) { + bio->bi_iter.bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(READ_SYNC, bio); + } + + wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); + + return 0; +} + + +/* + * lbmWrite() + * + * buffer at head of pageout queue stays after completion of + * partial-page pageout and redriven by explicit initiation of + * pageout by caller until full-page pageout is completed and + * released. + * + * device driver i/o done redrives pageout of new buffer at + * head of pageout queue when current buffer at head of pageout + * queue is released at the completion of its full-page pageout. + * + * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit(). + * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone() + */ +static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, + int cant_block) +{ + struct lbuf *tail; + unsigned long flags; + + jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn); + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + LCACHE_LOCK(flags); /* disable+lock */ + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag; + + /* + * insert bp at tail of write queue associated with log + * + * (request is either for bp already/currently at head of queue + * or new bp to be inserted at tail) + */ + tail = log->wqueue; + + /* is buffer not already on write queue ? */ + if (bp->l_wqnext == NULL) { + /* insert at tail of wqueue */ + if (tail == NULL) { + log->wqueue = bp; + bp->l_wqnext = bp; + } else { + log->wqueue = bp; + bp->l_wqnext = tail->l_wqnext; + tail->l_wqnext = bp; + } + + tail = bp; + } + + /* is buffer at head of wqueue and for write ? */ + if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + return; + } + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + if (cant_block) + lbmRedrive(bp); + else if (flag & lbmSYNC) + lbmStartIO(bp); + else { + LOGGC_UNLOCK(log); + lbmStartIO(bp); + LOGGC_LOCK(log); + } +} + + +/* + * lbmDirectWrite() + * + * initiate pageout bypassing write queue for sidestream + * (e.g., log superblock) write; + */ +static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) +{ + jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x", + bp, flag, bp->l_pn); + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag | lbmDIRECT; + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + /* + * initiate pageout of the page + */ + lbmStartIO(bp); +} + + +/* + * NAME: lbmStartIO() + * + * FUNCTION: Interface to DD strategy routine + * + * RETURN: none + * + * serialization: LCACHE_LOCK() is NOT held during log i/o; + */ +static void lbmStartIO(struct lbuf * bp) +{ + struct bio *bio; + struct jfs_log *log = bp->l_log; + + jfs_info("lbmStartIO\n"); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_bdev = log->bdev; + bio->bi_io_vec[0].bv_page = bp->l_page; + bio->bi_io_vec[0].bv_len = LOGPSIZE; + bio->bi_io_vec[0].bv_offset = bp->l_offset; + + bio->bi_vcnt = 1; + bio->bi_iter.bi_size = LOGPSIZE; + + bio->bi_end_io = lbmIODone; + bio->bi_private = bp; + + /* check if journaling to disk has been disabled */ + if (log->no_integrity) { + bio->bi_iter.bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(WRITE_SYNC, bio); + INCREMENT(lmStat.submitted); + } +} + + +/* + * lbmIOWait() + */ +static int lbmIOWait(struct lbuf * bp, int flag) +{ + unsigned long flags; + int rc = 0; + + jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); + + LCACHE_LOCK(flags); /* disable+lock */ + + LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags); + + rc = (bp->l_flag & lbmERROR) ? -EIO : 0; + + if (flag & lbmFREE) + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); + return rc; +} + +/* + * lbmIODone() + * + * executed at INTIODONE level + */ +static void lbmIODone(struct bio *bio, int error) +{ + struct lbuf *bp = bio->bi_private; + struct lbuf *nextbp, *tail; + struct jfs_log *log; + unsigned long flags; + + /* + * get back jfs buffer bound to the i/o buffer + */ + jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag); + + LCACHE_LOCK(flags); /* disable+lock */ + + bp->l_flag |= lbmDONE; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + bp->l_flag |= lbmERROR; + + jfs_err("lbmIODone: I/O error in JFS log"); + } + + bio_put(bio); + + /* + * pagein completion + */ + if (bp->l_flag & lbmREAD) { + bp->l_flag &= ~lbmREAD; + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + + return; + } + + /* + * pageout completion + * + * the bp at the head of write queue has completed pageout. + * + * if single-commit/full-page pageout, remove the current buffer + * from head of pageout queue, and redrive pageout with + * the new buffer at head of pageout queue; + * otherwise, the partial-page pageout buffer stays at + * the head of pageout queue to be redriven for pageout + * by lmGroupCommit() until full-page pageout is completed. + */ + bp->l_flag &= ~lbmWRITE; + INCREMENT(lmStat.pagedone); + + /* update committed lsn */ + log = bp->l_log; + log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor; + + if (bp->l_flag & lbmDIRECT) { + LCACHE_WAKEUP(&bp->l_ioevent); + LCACHE_UNLOCK(flags); + return; + } + + tail = log->wqueue; + + /* single element queue */ + if (bp == tail) { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + log->wqueue = NULL; + bp->l_wqnext = NULL; + } + } + /* multi element queue */ + else { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + nextbp = tail->l_wqnext = bp->l_wqnext; + bp->l_wqnext = NULL; + + /* + * redrive pageout of next page at head of write queue: + * redrive next page without any bound tblk + * (i.e., page w/o any COMMIT records), or + * first page of new group commit which has been + * queued after current page (subsequent pageout + * is performed synchronously, except page without + * any COMMITs) by lmGroupCommit() as indicated + * by lbmWRITE flag; + */ + if (nextbp->l_flag & lbmWRITE) { + /* + * We can't do the I/O at interrupt time. + * The jfsIO thread can do it + */ + lbmRedrive(nextbp); + } + } + } + + /* + * synchronous pageout: + * + * buffer has not necessarily been removed from write queue + * (e.g., synchronous write of partial-page with COMMIT): + * leave buffer for i/o initiator to dispose + */ + if (bp->l_flag & lbmSYNC) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + } + + /* + * Group Commit pageout: + */ + else if (bp->l_flag & lbmGC) { + LCACHE_UNLOCK(flags); + lmPostGC(bp); + } + + /* + * asynchronous pageout: + * + * buffer must have been removed from write queue: + * insert buffer at head of freelist where it can be recycled + */ + else { + assert(bp->l_flag & lbmRELEASE); + assert(bp->l_flag & lbmFREE); + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + } +} + +int jfsIOWait(void *arg) +{ + struct lbuf *bp; + + do { + spin_lock_irq(&log_redrive_lock); + while ((bp = log_redrive_list)) { + log_redrive_list = bp->l_redrive_next; + bp->l_redrive_next = NULL; + spin_unlock_irq(&log_redrive_lock); + lbmStartIO(bp); + spin_lock_irq(&log_redrive_lock); + } + + if (freezing(current)) { + spin_unlock_irq(&log_redrive_lock); + try_to_freeze(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&log_redrive_lock); + schedule(); + } + } while (!kthread_should_stop()); + + jfs_info("jfsIOWait being killed!"); + return 0; +} + +/* + * NAME: lmLogFormat()/jfs_logform() + * + * FUNCTION: format file system log + * + * PARAMETERS: + * log - volume log + * logAddress - start address of log space in FS block + * logSize - length of log space in FS block; + * + * RETURN: 0 - success + * -EIO - i/o error + * + * XXX: We're synchronously writing one page at a time. This needs to + * be improved by writing multiple pages at once. + */ +int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) +{ + int rc = -EIO; + struct jfs_sb_info *sbi; + struct logsuper *logsuper; + struct logpage *lp; + int lspn; /* log sequence page number */ + struct lrd *lrd_ptr; + int npages = 0; + struct lbuf *bp; + + jfs_info("lmLogFormat: logAddress:%Ld logSize:%d", + (long long)logAddress, logSize); + + sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list); + + /* allocate a log buffer */ + bp = lbmAllocate(log, 1); + + npages = logSize >> sbi->l2nbperpage; + + /* + * log space: + * + * page 0 - reserved; + * page 1 - log superblock; + * page 2 - log data page: A SYNC log record is written + * into this page at logform time; + * pages 3-N - log data page: set to empty log data pages; + */ + /* + * init log superblock: log page 1 + */ + logsuper = (struct logsuper *) bp->l_ldata; + + logsuper->magic = cpu_to_le32(LOGMAGIC); + logsuper->version = cpu_to_le32(LOGVERSION); + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */ + logsuper->size = cpu_to_le32(npages); + logsuper->bsize = cpu_to_le32(sbi->bsize); + logsuper->l2bsize = cpu_to_le32(sbi->l2bsize); + logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE); + + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + bp->l_blkno = logAddress + sbi->nbperpage; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + + /* + * init pages 2 to npages-1 as log data pages: + * + * log page sequence number (lpsn) initialization: + * + * pn: 0 1 2 3 n-1 + * +-----+-----+=====+=====+===.....===+=====+ + * lspn: N-1 0 1 N-2 + * <--- N page circular file ----> + * + * the N (= npages-2) data pages of the log is maintained as + * a circular file for the log records; + * lpsn grows by 1 monotonically as each log page is written + * to the circular file of the log; + * and setLogpage() will not reset the page number even if + * the eor is equal to LOGPHDRSIZE. In order for binary search + * still work in find log end process, we have to simulate the + * log wrap situation at the log format time. + * The 1st log page written will have the highest lpsn. Then + * the succeeding log pages will have ascending order of + * the lspn starting from 0, ... (N-2) + */ + lp = (struct logpage *) bp->l_ldata; + /* + * initialize 1st log page to be written: lpsn = N - 1, + * write a SYNCPT log record is written to this page + */ + lp->h.page = lp->t.page = cpu_to_le32(npages - 3); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE); + + lrd_ptr = (struct lrd *) &lp->data; + lrd_ptr->logtid = 0; + lrd_ptr->backchain = 0; + lrd_ptr->type = cpu_to_le16(LOG_SYNCPT); + lrd_ptr->length = 0; + lrd_ptr->log.syncpt.sync = 0; + + bp->l_blkno += sbi->nbperpage; + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + + /* + * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) + */ + for (lspn = 0; lspn < npages - 3; lspn++) { + lp->h.page = lp->t.page = cpu_to_le32(lspn); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + bp->l_blkno += sbi->nbperpage; + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + } + + rc = 0; +exit: + /* + * finalize log + */ + /* release the buffer */ + lbmFree(bp); + + return rc; +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_lmstats_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Logmgr stats\n" + "================\n" + "commits = %d\n" + "writes submitted = %d\n" + "writes completed = %d\n" + "full pages submitted = %d\n" + "partial pages submitted = %d\n", + lmStat.commit, + lmStat.submitted, + lmStat.pagedone, + lmStat.full_page, + lmStat.partial_page); + return 0; +} + +static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_lmstats_proc_show, NULL); +} + +const struct file_operations jfs_lmstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_lmstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_JFS_STATISTICS */ diff --git a/kernel/fs/jfs/jfs_logmgr.h b/kernel/fs/jfs/jfs_logmgr.h new file mode 100644 index 000000000..e38c21598 --- /dev/null +++ b/kernel/fs/jfs/jfs_logmgr.h @@ -0,0 +1,513 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOGMGR +#define _H_JFS_LOGMGR + +#include "jfs_filsys.h" +#include "jfs_lock.h" + +/* + * log manager configuration parameters + */ + +/* log page size */ +#define LOGPSIZE 4096 +#define L2LOGPSIZE 12 + +#define LOGPAGES 16 /* Log pages per mounted file system */ + +/* + * log logical volume + * + * a log is used to make the commit operation on journalled + * files within the same logical volume group atomic. + * a log is implemented with a logical volume. + * there is one log per logical volume group. + * + * block 0 of the log logical volume is not used (ipl etc). + * block 1 contains a log "superblock" and is used by logFormat(), + * lmLogInit(), lmLogShutdown(), and logRedo() to record status + * of the log but is not otherwise used during normal processing. + * blocks 2 - (N-1) are used to contain log records. + * + * when a volume group is varied-on-line, logRedo() must have + * been executed before the file systems (logical volumes) in + * the volume group can be mounted. + */ +/* + * log superblock (block 1 of logical volume) + */ +#define LOGSUPER_B 1 +#define LOGSTART_B 2 + +#define LOGMAGIC 0x87654321 +#define LOGVERSION 1 + +#define MAX_ACTIVE 128 /* Max active file systems sharing log */ + +struct logsuper { + __le32 magic; /* 4: log lv identifier */ + __le32 version; /* 4: version number */ + __le32 serial; /* 4: log open/mount counter */ + __le32 size; /* 4: size in number of LOGPSIZE blocks */ + __le32 bsize; /* 4: logical block size in byte */ + __le32 l2bsize; /* 4: log2 of bsize */ + + __le32 flag; /* 4: option */ + __le32 state; /* 4: state - see below */ + + __le32 end; /* 4: addr of last log record set by logredo */ + char uuid[16]; /* 16: 128-bit journal uuid */ + char label[16]; /* 16: journal label */ + struct { + char uuid[16]; + } active[MAX_ACTIVE]; /* 2048: active file systems list */ +}; + +#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + +/* log flag: commit option (see jfs_filsys.h) */ + +/* log state */ +#define LOGMOUNT 0 /* log mounted by lmLogInit() */ +#define LOGREDONE 1 /* log shutdown by lmLogShutdown(). + * log redo completed by logredo(). + */ +#define LOGWRAP 2 /* log wrapped */ +#define LOGREADERR 3 /* log read error detected in logredo() */ + + +/* + * log logical page + * + * (this comment should be rewritten !) + * the header and trailer structures (h,t) will normally have + * the same page and eor value. + * An exception to this occurs when a complete page write is not + * accomplished on a power failure. Since the hardware may "split write" + * sectors in the page, any out of order sequence may occur during powerfail + * and needs to be recognized during log replay. The xor value is + * an "exclusive or" of all log words in the page up to eor. This + * 32 bit eor is stored with the top 16 bits in the header and the + * bottom 16 bits in the trailer. logredo can easily recognize pages + * that were not completed by reconstructing this eor and checking + * the log page. + * + * Previous versions of the operating system did not allow split + * writes and detected partially written records in logredo by + * ordering the updates to the header, trailer, and the move of data + * into the logdata area. The order: (1) data is moved (2) header + * is updated (3) trailer is updated. In logredo, when the header + * differed from the trailer, the header and trailer were reconciled + * as follows: if h.page != t.page they were set to the smaller of + * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) + * h.eor != t.eor they were set to the smaller of their two values. + */ +struct logpage { + struct { /* header */ + __le32 page; /* 4: log sequence page number */ + __le16 rsrvd; /* 2: */ + __le16 eor; /* 2: end-of-log offset of lasrt record write */ + } h; + + __le32 data[LOGPSIZE / 4 - 4]; /* log record area */ + + struct { /* trailer */ + __le32 page; /* 4: normally the same as h.page */ + __le16 rsrvd; /* 2: */ + __le16 eor; /* 2: normally the same as h.eor */ + } t; +}; + +#define LOGPHDRSIZE 8 /* log page header size */ +#define LOGPTLRSIZE 8 /* log page trailer size */ + + +/* + * log record + * + * (this comment should be rewritten !) + * jfs uses only "after" log records (only a single writer is allowed + * in a page, pages are written to temporary paging space if + * if they must be written to disk before commit, and i/o is + * scheduled for modified pages to their home location after + * the log records containing the after values and the commit + * record is written to the log on disk, undo discards the copy + * in main-memory.) + * + * a log record consists of a data area of variable length followed by + * a descriptor of fixed size LOGRDSIZE bytes. + * the data area is rounded up to an integral number of 4-bytes and + * must be no longer than LOGPSIZE. + * the descriptor is of size of multiple of 4-bytes and aligned on a + * 4-byte boundary. + * records are packed one after the other in the data area of log pages. + * (sometimes a DUMMY record is inserted so that at least one record ends + * on every page or the longest record is placed on at most two pages). + * the field eor in page header/trailer points to the byte following + * the last record on a page. + */ + +/* log record types */ +#define LOG_COMMIT 0x8000 +#define LOG_SYNCPT 0x4000 +#define LOG_MOUNT 0x2000 +#define LOG_REDOPAGE 0x0800 +#define LOG_NOREDOPAGE 0x0080 +#define LOG_NOREDOINOEXT 0x0040 +#define LOG_UPDATEMAP 0x0008 +#define LOG_NOREDOFILE 0x0001 + +/* REDOPAGE/NOREDOPAGE log record data type */ +#define LOG_INODE 0x0001 +#define LOG_XTREE 0x0002 +#define LOG_DTREE 0x0004 +#define LOG_BTROOT 0x0010 +#define LOG_EA 0x0020 +#define LOG_ACL 0x0040 +#define LOG_DATA 0x0080 +#define LOG_NEW 0x0100 +#define LOG_EXTEND 0x0200 +#define LOG_RELOCATE 0x0400 +#define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */ + +/* UPDATEMAP log record descriptor type */ +#define LOG_ALLOCXADLIST 0x0080 +#define LOG_ALLOCPXDLIST 0x0040 +#define LOG_ALLOCXAD 0x0020 +#define LOG_ALLOCPXD 0x0010 +#define LOG_FREEXADLIST 0x0008 +#define LOG_FREEPXDLIST 0x0004 +#define LOG_FREEXAD 0x0002 +#define LOG_FREEPXD 0x0001 + + +struct lrd { + /* + * type independent area + */ + __le32 logtid; /* 4: log transaction identifier */ + __le32 backchain; /* 4: ptr to prev record of same transaction */ + __le16 type; /* 2: record type */ + __le16 length; /* 2: length of data in record (in byte) */ + __le32 aggregate; /* 4: file system lv/aggregate */ + /* (16) */ + + /* + * type dependent area (20) + */ + union { + + /* + * COMMIT: commit + * + * transaction commit: no type-dependent information; + */ + + /* + * REDOPAGE: after-image + * + * apply after-image; + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: REDOPAGE record type */ + __le16 l2linesize; /* 2: log2 of line size */ + pxd_t pxd; /* 8: on-disk page pxd */ + } redopage; /* (20) */ + + /* + * NOREDOPAGE: the page is freed + * + * do not apply after-image records which precede this record + * in the log with the same page block number to this page. + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: NOREDOPAGE record type */ + __le16 rsrvd; /* 2: reserved */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredopage; /* (20) */ + + /* + * UPDATEMAP: update block allocation map + * + * either in-line PXD, + * or out-of-line XADLIST; + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: UPDATEMAP record type */ + __le16 nxd; /* 2: number of extents */ + pxd_t pxd; /* 8: pxd */ + } updatemap; /* (20) */ + + /* + * NOREDOINOEXT: the inode extent is freed + * + * do not apply after-image records which precede this + * record in the log with the any of the 4 page block + * numbers in this inode extent. + * + * NOTE: The fileset and pxd fields MUST remain in + * the same fields in the REDOPAGE record format. + * + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 iagnum; /* 4: IAG number */ + __le32 inoext_idx; /* 4: inode extent index */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredoinoext; /* (20) */ + + /* + * SYNCPT: log sync point + * + * replay log up to syncpt address specified; + */ + struct { + __le32 sync; /* 4: syncpt address (0 = here) */ + } syncpt; + + /* + * MOUNT: file system mount + * + * file system mount: no type-dependent information; + */ + + /* + * ? FREEXTENT: free specified extent(s) + * + * free specified extent(s) from block allocation map + * N.B.: nextents should be length of data/sizeof(xad_t) + */ + struct { + __le32 type; /* 4: FREEXTENT record type */ + __le32 nextent; /* 4: number of extents */ + + /* data: PXD or XAD list */ + } freextent; + + /* + * ? NOREDOFILE: this file is freed + * + * do not apply records which precede this record in the log + * with the same inode number. + * + * NOREDOFILE must be the first to be written at commit + * (last to be read in logredo()) - it prevents + * replay of preceding updates of all preceding generations + * of the inumber esp. the on-disk inode itself. + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + } noredofile; + + /* + * ? NEWPAGE: + * + * metadata type dependent + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le32 type; /* 4: NEWPAGE record type */ + pxd_t pxd; /* 8: on-disk page pxd */ + } newpage; + + /* + * ? DUMMY: filler + * + * no type-dependent information + */ + } log; +}; /* (36) */ + +#define LOGRDSIZE (sizeof(struct lrd)) + +/* + * line vector descriptor + */ +struct lvd { + __le16 offset; + __le16 length; +}; + + +/* + * log logical volume + */ +struct jfs_log { + + struct list_head sb_list;/* This is used to sync metadata + * before writing syncpt. + */ + struct list_head journal_list; /* Global list */ + struct block_device *bdev; /* 4: log lv pointer */ + int serial; /* 4: log mount serial number */ + + s64 base; /* @8: log extent address (inline log ) */ + int size; /* 4: log size in log page (in page) */ + int l2bsize; /* 4: log2 of bsize */ + + unsigned long flag; /* 4: flag */ + + struct lbuf *lbuf_free; /* 4: free lbufs */ + wait_queue_head_t free_wait; /* 4: */ + + /* log write */ + int logtid; /* 4: log tid */ + int page; /* 4: page number of eol page */ + int eor; /* 4: eor of last record in eol page */ + struct lbuf *bp; /* 4: current log page buffer */ + + struct mutex loglock; /* 4: log write serialization lock */ + + /* syncpt */ + int nextsync; /* 4: bytes to write before next syncpt */ + int active; /* 4: */ + wait_queue_head_t syncwait; /* 4: */ + + /* commit */ + uint cflag; /* 4: */ + struct list_head cqueue; /* FIFO commit queue */ + struct tblock *flush_tblk; /* tblk we're waiting on for flush */ + int gcrtc; /* 4: GC_READY transaction count */ + struct tblock *gclrt; /* 4: latest GC_READY transaction */ + spinlock_t gclock; /* 4: group commit lock */ + int logsize; /* 4: log data area size in byte */ + int lsn; /* 4: end-of-log */ + int clsn; /* 4: clsn */ + int syncpt; /* 4: addr of last syncpt record */ + int sync; /* 4: addr from last logsync() */ + struct list_head synclist; /* 8: logsynclist anchor */ + spinlock_t synclock; /* 4: synclist lock */ + struct lbuf *wqueue; /* 4: log pageout queue */ + int count; /* 4: count */ + char uuid[16]; /* 16: 128-bit uuid of log device */ + + int no_integrity; /* 3: flag to disable journaling to disk */ +}; + +/* + * Log flag + */ +#define log_INLINELOG 1 +#define log_SYNCBARRIER 2 +#define log_QUIESCE 3 +#define log_FLUSH 4 + +/* + * group commit flag + */ +/* jfs_log */ +#define logGC_PAGEOUT 0x00000001 + +/* tblock/lbuf */ +#define tblkGC_QUEUE 0x0001 +#define tblkGC_READY 0x0002 +#define tblkGC_COMMIT 0x0004 +#define tblkGC_COMMITTED 0x0008 +#define tblkGC_EOP 0x0010 +#define tblkGC_FREE 0x0020 +#define tblkGC_LEADER 0x0040 +#define tblkGC_ERROR 0x0080 +#define tblkGC_LAZY 0x0100 // D230860 +#define tblkGC_UNLOCKED 0x0200 // D230860 + +/* + * log cache buffer header + */ +struct lbuf { + struct jfs_log *l_log; /* 4: log associated with buffer */ + + /* + * data buffer base area + */ + uint l_flag; /* 4: pageout control flags */ + + struct lbuf *l_wqnext; /* 4: write queue link */ + struct lbuf *l_freelist; /* 4: freelistlink */ + + int l_pn; /* 4: log page number */ + int l_eor; /* 4: log record eor */ + int l_ceor; /* 4: committed log record eor */ + + s64 l_blkno; /* 8: log page block number */ + caddr_t l_ldata; /* 4: data page */ + struct page *l_page; /* The page itself */ + uint l_offset; /* Offset of l_ldata within the page */ + + wait_queue_head_t l_ioevent; /* 4: i/o done event */ +}; + +/* Reuse l_freelist for redrive list */ +#define l_redrive_next l_freelist + +/* + * logsynclist block + * + * common logsyncblk prefix for jbuf_t and tblock + */ +struct logsyncblk { + u16 xflag; /* flags */ + u16 flag; /* only meaninful in tblock */ + lid_t lid; /* lock id */ + s32 lsn; /* log sequence number */ + struct list_head synclist; /* log sync list link */ +}; + +/* + * logsynclist serialization (per log) + */ + +#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock) +#define LOGSYNC_LOCK(log, flags) spin_lock_irqsave(&(log)->synclock, flags) +#define LOGSYNC_UNLOCK(log, flags) \ + spin_unlock_irqrestore(&(log)->synclock, flags) + +/* compute the difference in bytes of lsn from sync point */ +#define logdiff(diff, lsn, log)\ +{\ + diff = (lsn) - (log)->syncpt;\ + if (diff < 0)\ + diff += (log)->logsize;\ +} + +extern int lmLogOpen(struct super_block *sb); +extern int lmLogClose(struct super_block *sb); +extern int lmLogShutdown(struct jfs_log * log); +extern int lmLogInit(struct jfs_log * log); +extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize); +extern int lmGroupCommit(struct jfs_log *, struct tblock *); +extern int jfsIOWait(void *); +extern void jfs_flush_journal(struct jfs_log * log, int wait); +extern void jfs_syncpt(struct jfs_log *log, int hard_sync); + +#endif /* _H_JFS_LOGMGR */ diff --git a/kernel/fs/jfs/jfs_metapage.c b/kernel/fs/jfs/jfs_metapage.c new file mode 100644 index 000000000..16a0922be --- /dev/null +++ b/kernel/fs/jfs/jfs_metapage.c @@ -0,0 +1,837 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint pagealloc; /* # of page allocations */ + uint pagefree; /* # of page frees */ + uint lockwait; /* # of sleeping lock_metapage() calls */ +} mpStat; +#endif + +#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag) +#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag) + +static inline void unlock_metapage(struct metapage *mp) +{ + clear_bit_unlock(META_locked, &mp->flag); + wake_up(&mp->wait); +} + +static inline void __lock_metapage(struct metapage *mp) +{ + DECLARE_WAITQUEUE(wait, current); + INCREMENT(mpStat.lockwait); + add_wait_queue_exclusive(&mp->wait, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (metapage_locked(mp)) { + unlock_page(mp->page); + io_schedule(); + lock_page(mp->page); + } + } while (trylock_metapage(mp)); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&mp->wait, &wait); +} + +/* + * Must have mp->page locked + */ +static inline void lock_metapage(struct metapage *mp) +{ + if (trylock_metapage(mp)) + __lock_metapage(mp); +} + +#define METAPOOL_MIN_PAGES 32 +static struct kmem_cache *metapage_cache; +static mempool_t *metapage_mempool; + +#define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE) + +#if MPS_PER_PAGE > 1 + +struct meta_anchor { + int mp_count; + atomic_t io_count; + struct metapage *mp[MPS_PER_PAGE]; +}; +#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) + +static inline struct metapage *page_to_mp(struct page *page, int offset) +{ + if (!PagePrivate(page)) + return NULL; + return mp_anchor(page)->mp[offset >> L2PSIZE]; +} + +static inline int insert_metapage(struct page *page, struct metapage *mp) +{ + struct meta_anchor *a; + int index; + int l2mp_blocks; /* log2 blocks per metapage */ + + if (PagePrivate(page)) + a = mp_anchor(page); + else { + a = kzalloc(sizeof(struct meta_anchor), GFP_NOFS); + if (!a) + return -ENOMEM; + set_page_private(page, (unsigned long)a); + SetPagePrivate(page); + kmap(page); + } + + if (mp) { + l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits; + index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); + a->mp_count++; + a->mp[index] = mp; + } + + return 0; +} + +static inline void remove_metapage(struct page *page, struct metapage *mp) +{ + struct meta_anchor *a = mp_anchor(page); + int l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits; + int index; + + index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); + + BUG_ON(a->mp[index] != mp); + + a->mp[index] = NULL; + if (--a->mp_count == 0) { + kfree(a); + set_page_private(page, 0); + ClearPagePrivate(page); + kunmap(page); + } +} + +static inline void inc_io(struct page *page) +{ + atomic_inc(&mp_anchor(page)->io_count); +} + +static inline void dec_io(struct page *page, void (*handler) (struct page *)) +{ + if (atomic_dec_and_test(&mp_anchor(page)->io_count)) + handler(page); +} + +#else +static inline struct metapage *page_to_mp(struct page *page, int offset) +{ + return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; +} + +static inline int insert_metapage(struct page *page, struct metapage *mp) +{ + if (mp) { + set_page_private(page, (unsigned long)mp); + SetPagePrivate(page); + kmap(page); + } + return 0; +} + +static inline void remove_metapage(struct page *page, struct metapage *mp) +{ + set_page_private(page, 0); + ClearPagePrivate(page); + kunmap(page); +} + +#define inc_io(page) do {} while(0) +#define dec_io(page, handler) handler(page) + +#endif + +static inline struct metapage *alloc_metapage(gfp_t gfp_mask) +{ + struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask); + + if (mp) { + mp->lid = 0; + mp->lsn = 0; + mp->data = NULL; + mp->clsn = 0; + mp->log = NULL; + init_waitqueue_head(&mp->wait); + } + return mp; +} + +static inline void free_metapage(struct metapage *mp) +{ + mempool_free(mp, metapage_mempool); +} + +int __init metapage_init(void) +{ + /* + * Allocate the metapage structures + */ + metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), + 0, 0, NULL); + if (metapage_cache == NULL) + return -ENOMEM; + + metapage_mempool = mempool_create_slab_pool(METAPOOL_MIN_PAGES, + metapage_cache); + + if (metapage_mempool == NULL) { + kmem_cache_destroy(metapage_cache); + return -ENOMEM; + } + + return 0; +} + +void metapage_exit(void) +{ + mempool_destroy(metapage_mempool); + kmem_cache_destroy(metapage_cache); +} + +static inline void drop_metapage(struct page *page, struct metapage *mp) +{ + if (mp->count || mp->nohomeok || test_bit(META_dirty, &mp->flag) || + test_bit(META_io, &mp->flag)) + return; + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); +} + +/* + * Metapage address space operations + */ + +static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, + int *len) +{ + int rc = 0; + int xflag; + s64 xaddr; + sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_blkbits; + + if (lblock >= file_blocks) + return 0; + if (lblock + *len > file_blocks) + *len = file_blocks - lblock; + + if (inode->i_ino) { + rc = xtLookup(inode, (s64)lblock, *len, &xflag, &xaddr, len, 0); + if ((rc == 0) && *len) + lblock = (sector_t)xaddr; + else + lblock = 0; + } /* else no mapping */ + + return lblock; +} + +static void last_read_complete(struct page *page) +{ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); +} + +static void metapage_read_end_io(struct bio *bio, int err) +{ + struct page *page = bio->bi_private; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "metapage_read_end_io: I/O error\n"); + SetPageError(page); + } + + dec_io(page, last_read_complete); + bio_put(bio); +} + +static void remove_from_logsync(struct metapage *mp) +{ + struct jfs_log *log = mp->log; + unsigned long flags; +/* + * This can race. Recheck that log hasn't been set to null, and after + * acquiring logsync lock, recheck lsn + */ + if (!log) + return; + + LOGSYNC_LOCK(log, flags); + if (mp->lsn) { + mp->log = NULL; + mp->lsn = 0; + mp->clsn = 0; + log->count--; + list_del(&mp->synclist); + } + LOGSYNC_UNLOCK(log, flags); +} + +static void last_write_complete(struct page *page) +{ + struct metapage *mp; + unsigned int offset; + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + if (mp && test_bit(META_io, &mp->flag)) { + if (mp->lsn) + remove_from_logsync(mp); + clear_bit(META_io, &mp->flag); + } + /* + * I'd like to call drop_metapage here, but I don't think it's + * safe unless I have the page locked + */ + } + end_page_writeback(page); +} + +static void metapage_write_end_io(struct bio *bio, int err) +{ + struct page *page = bio->bi_private; + + BUG_ON(!PagePrivate(page)); + + if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "metapage_write_end_io: I/O error\n"); + SetPageError(page); + } + dec_io(page, last_write_complete); + bio_put(bio); +} + +static int metapage_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bio *bio = NULL; + int block_offset; /* block offset of mp within page */ + struct inode *inode = page->mapping->host; + int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; + int len; + int xlen; + struct metapage *mp; + int redirty = 0; + sector_t lblock; + int nr_underway = 0; + sector_t pblock; + sector_t next_block = 0; + sector_t page_start; + unsigned long bio_bytes = 0; + unsigned long bio_offset = 0; + int offset; + int bad_blocks = 0; + + page_start = (sector_t)page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + + if (!mp || !test_bit(META_dirty, &mp->flag)) + continue; + + if (mp->nohomeok && !test_bit(META_forcewrite, &mp->flag)) { + redirty = 1; + /* + * Make sure this page isn't blocked indefinitely. + * If the journal isn't undergoing I/O, push it + */ + if (mp->log && !(mp->log->cflag & logGC_PAGEOUT)) + jfs_flush_journal(mp->log, 0); + continue; + } + + clear_bit(META_dirty, &mp->flag); + set_bit(META_io, &mp->flag); + block_offset = offset >> inode->i_blkbits; + lblock = page_start + block_offset; + if (bio) { + if (xlen && lblock == next_block) { + /* Contiguous, in memory & on disk */ + len = min(xlen, blocks_per_mp); + xlen -= len; + bio_bytes += len << inode->i_blkbits; + continue; + } + /* Not contiguous */ + if (bio_add_page(bio, page, bio_bytes, bio_offset) < + bio_bytes) + goto add_failed; + /* + * Increment counter before submitting i/o to keep + * count from hitting zero before we're through + */ + inc_io(page); + if (!bio->bi_iter.bi_size) + goto dump_bio; + submit_bio(WRITE, bio); + nr_underway++; + bio = NULL; + } else + inc_io(page); + xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits; + pblock = metapage_get_blocks(inode, lblock, &xlen); + if (!pblock) { + printk(KERN_ERR "JFS: metapage_get_blocks failed\n"); + /* + * We already called inc_io(), but can't cancel it + * with dec_io() until we're done with the page + */ + bad_blocks++; + continue; + } + len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_end_io = metapage_write_end_io; + bio->bi_private = page; + + /* Don't call bio_add_page yet, we may add to this vec */ + bio_offset = offset; + bio_bytes = len << inode->i_blkbits; + + xlen -= len; + next_block = lblock + len; + } + if (bio) { + if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) + goto add_failed; + if (!bio->bi_iter.bi_size) + goto dump_bio; + + submit_bio(WRITE, bio); + nr_underway++; + } + if (redirty) + redirty_page_for_writepage(wbc, page); + + unlock_page(page); + + if (bad_blocks) + goto err_out; + + if (nr_underway == 0) + end_page_writeback(page); + + return 0; +add_failed: + /* We should never reach here, since we're only adding one vec */ + printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); + goto skip; +dump_bio: + print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16, + 4, bio, sizeof(*bio), 0); +skip: + bio_put(bio); + unlock_page(page); + dec_io(page, last_write_complete); +err_out: + while (bad_blocks--) + dec_io(page, last_write_complete); + return -EIO; +} + +static int metapage_readpage(struct file *fp, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct bio *bio = NULL; + int block_offset; + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + sector_t page_start; /* address of page in fs blocks */ + sector_t pblock; + int xlen; + unsigned int len; + int offset; + + BUG_ON(!PageLocked(page)); + page_start = (sector_t)page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + block_offset = 0; + while (block_offset < blocks_per_page) { + xlen = blocks_per_page - block_offset; + pblock = metapage_get_blocks(inode, page_start + block_offset, + &xlen); + if (pblock) { + if (!PagePrivate(page)) + insert_metapage(page, NULL); + inc_io(page); + if (bio) + submit_bio(READ, bio); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblock << (inode->i_blkbits - 9); + bio->bi_end_io = metapage_read_end_io; + bio->bi_private = page; + len = xlen << inode->i_blkbits; + offset = block_offset << inode->i_blkbits; + if (bio_add_page(bio, page, len, offset) < len) + goto add_failed; + block_offset += xlen; + } else + block_offset++; + } + if (bio) + submit_bio(READ, bio); + else + unlock_page(page); + + return 0; + +add_failed: + printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); + bio_put(bio); + dec_io(page, last_read_complete); + return -EIO; +} + +static int metapage_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct metapage *mp; + int ret = 1; + int offset; + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + + if (!mp) + continue; + + jfs_info("metapage_releasepage: mp = 0x%p", mp); + if (mp->count || mp->nohomeok || + test_bit(META_dirty, &mp->flag)) { + jfs_info("count = %ld, nohomeok = %d", mp->count, + mp->nohomeok); + ret = 0; + continue; + } + if (mp->lsn) + remove_from_logsync(mp); + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); + } + return ret; +} + +static void metapage_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + BUG_ON(offset || length < PAGE_CACHE_SIZE); + + BUG_ON(PageWriteback(page)); + + metapage_releasepage(page, 0); +} + +const struct address_space_operations jfs_metapage_aops = { + .readpage = metapage_readpage, + .writepage = metapage_writepage, + .releasepage = metapage_releasepage, + .invalidatepage = metapage_invalidatepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, + unsigned int size, int absolute, + unsigned long new) +{ + int l2BlocksPerPage; + int l2bsize; + struct address_space *mapping; + struct metapage *mp = NULL; + struct page *page; + unsigned long page_index; + unsigned long page_offset; + + jfs_info("__get_metapage: ino = %ld, lblock = 0x%lx, abs=%d", + inode->i_ino, lblock, absolute); + + l2bsize = inode->i_blkbits; + l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize; + page_index = lblock >> l2BlocksPerPage; + page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize; + if ((page_offset + size) > PAGE_CACHE_SIZE) { + jfs_err("MetaData crosses page boundary!!"); + jfs_err("lblock = %lx, size = %d", lblock, size); + dump_stack(); + return NULL; + } + if (absolute) + mapping = JFS_SBI(inode->i_sb)->direct_inode->i_mapping; + else { + /* + * If an nfs client tries to read an inode that is larger + * than any existing inodes, we may try to read past the + * end of the inode map + */ + if ((lblock << inode->i_blkbits) >= inode->i_size) + return NULL; + mapping = inode->i_mapping; + } + + if (new && (PSIZE == PAGE_CACHE_SIZE)) { + page = grab_cache_page(mapping, page_index); + if (!page) { + jfs_err("grab_cache_page failed!"); + return NULL; + } + SetPageUptodate(page); + } else { + page = read_mapping_page(mapping, page_index, NULL); + if (IS_ERR(page) || !PageUptodate(page)) { + jfs_err("read_mapping_page failed!"); + return NULL; + } + lock_page(page); + } + + mp = page_to_mp(page, page_offset); + if (mp) { + if (mp->logical_size != size) { + jfs_error(inode->i_sb, + "get_mp->logical_size != size\n"); + jfs_err("logical_size = %d, size = %d", + mp->logical_size, size); + dump_stack(); + goto unlock; + } + mp->count++; + lock_metapage(mp); + if (test_bit(META_discard, &mp->flag)) { + if (!new) { + jfs_error(inode->i_sb, + "using a discarded metapage\n"); + discard_metapage(mp); + goto unlock; + } + clear_bit(META_discard, &mp->flag); + } + } else { + INCREMENT(mpStat.pagealloc); + mp = alloc_metapage(GFP_NOFS); + mp->page = page; + mp->flag = 0; + mp->xflag = COMMIT_PAGE; + mp->count = 1; + mp->nohomeok = 0; + mp->logical_size = size; + mp->data = page_address(page) + page_offset; + mp->index = lblock; + if (unlikely(insert_metapage(page, mp))) { + free_metapage(mp); + goto unlock; + } + lock_metapage(mp); + } + + if (new) { + jfs_info("zeroing mp = 0x%p", mp); + memset(mp->data, 0, PSIZE); + } + + unlock_page(page); + jfs_info("__get_metapage: returning = 0x%p data = 0x%p", mp, mp->data); + return mp; + +unlock: + unlock_page(page); + return NULL; +} + +void grab_metapage(struct metapage * mp) +{ + jfs_info("grab_metapage: mp = 0x%p", mp); + page_cache_get(mp->page); + lock_page(mp->page); + mp->count++; + lock_metapage(mp); + unlock_page(mp->page); +} + +void force_metapage(struct metapage *mp) +{ + struct page *page = mp->page; + jfs_info("force_metapage: mp = 0x%p", mp); + set_bit(META_forcewrite, &mp->flag); + clear_bit(META_sync, &mp->flag); + page_cache_get(page); + lock_page(page); + set_page_dirty(page); + write_one_page(page, 1); + clear_bit(META_forcewrite, &mp->flag); + page_cache_release(page); +} + +void hold_metapage(struct metapage *mp) +{ + lock_page(mp->page); +} + +void put_metapage(struct metapage *mp) +{ + if (mp->count || mp->nohomeok) { + /* Someone else will release this */ + unlock_page(mp->page); + return; + } + page_cache_get(mp->page); + mp->count++; + lock_metapage(mp); + unlock_page(mp->page); + release_metapage(mp); +} + +void release_metapage(struct metapage * mp) +{ + struct page *page = mp->page; + jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag); + + BUG_ON(!page); + + lock_page(page); + unlock_metapage(mp); + + assert(mp->count); + if (--mp->count || mp->nohomeok) { + unlock_page(page); + page_cache_release(page); + return; + } + + if (test_bit(META_dirty, &mp->flag)) { + set_page_dirty(page); + if (test_bit(META_sync, &mp->flag)) { + clear_bit(META_sync, &mp->flag); + write_one_page(page, 1); + lock_page(page); /* write_one_page unlocks the page */ + } + } else if (mp->lsn) /* discard_metapage doesn't remove it */ + remove_from_logsync(mp); + + /* Try to keep metapages from using up too much memory */ + drop_metapage(page, mp); + + unlock_page(page); + page_cache_release(page); +} + +void __invalidate_metapages(struct inode *ip, s64 addr, int len) +{ + sector_t lblock; + int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits; + int BlocksPerPage = 1 << l2BlocksPerPage; + /* All callers are interested in block device's mapping */ + struct address_space *mapping = + JFS_SBI(ip->i_sb)->direct_inode->i_mapping; + struct metapage *mp; + struct page *page; + unsigned int offset; + + /* + * Mark metapages to discard. They will eventually be + * released, but should not be written. + */ + for (lblock = addr & ~(BlocksPerPage - 1); lblock < addr + len; + lblock += BlocksPerPage) { + page = find_lock_page(mapping, lblock >> l2BlocksPerPage); + if (!page) + continue; + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + if (!mp) + continue; + if (mp->index < addr) + continue; + if (mp->index >= addr + len) + break; + + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + if (mp->lsn) + remove_from_logsync(mp); + } + unlock_page(page); + page_cache_release(page); + } +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_mpstat_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Metapage statistics\n" + "=======================\n" + "page allocations = %d\n" + "page frees = %d\n" + "lock waits = %d\n", + mpStat.pagealloc, + mpStat.pagefree, + mpStat.lockwait); + return 0; +} + +static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_mpstat_proc_show, NULL); +} + +const struct file_operations jfs_mpstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_mpstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/kernel/fs/jfs/jfs_metapage.h b/kernel/fs/jfs/jfs_metapage.h new file mode 100644 index 000000000..337e9e51a --- /dev/null +++ b/kernel/fs/jfs/jfs_metapage.h @@ -0,0 +1,154 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_METAPAGE +#define _H_JFS_METAPAGE + +#include + +struct metapage { + /* Common logsyncblk prefix (see jfs_logmgr.h) */ + u16 xflag; + u16 unused; + lid_t lid; + int lsn; + struct list_head synclist; + /* End of logsyncblk prefix */ + + unsigned long flag; /* See Below */ + unsigned long count; /* Reference count */ + void *data; /* Data pointer */ + sector_t index; /* block address of page */ + wait_queue_head_t wait; + + /* implementation */ + struct page *page; + unsigned int logical_size; + + /* Journal management */ + int clsn; + int nohomeok; + struct jfs_log *log; +}; + +/* metapage flag */ +#define META_locked 0 +#define META_dirty 2 +#define META_sync 3 +#define META_discard 4 +#define META_forcewrite 5 +#define META_io 6 + +#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag) + +/* function prototypes */ +extern int metapage_init(void); +extern void metapage_exit(void); +extern struct metapage *__get_metapage(struct inode *inode, + unsigned long lblock, unsigned int size, + int absolute, unsigned long new); + +#define read_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, false) + +#define get_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, true) + +extern void release_metapage(struct metapage *); +extern void grab_metapage(struct metapage *); +extern void force_metapage(struct metapage *); + +/* + * hold_metapage and put_metapage are used in conjunction. The page lock + * is not dropped between the two, so no other threads can get or release + * the metapage + */ +extern void hold_metapage(struct metapage *); +extern void put_metapage(struct metapage *); + +static inline void write_metapage(struct metapage *mp) +{ + set_bit(META_dirty, &mp->flag); + release_metapage(mp); +} + +static inline void flush_metapage(struct metapage *mp) +{ + set_bit(META_sync, &mp->flag); + write_metapage(mp); +} + +static inline void discard_metapage(struct metapage *mp) +{ + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + release_metapage(mp); +} + +static inline void metapage_nohomeok(struct metapage *mp) +{ + struct page *page = mp->page; + lock_page(page); + if (!mp->nohomeok++) { + mark_metapage_dirty(mp); + page_cache_get(page); + wait_on_page_writeback(page); + } + unlock_page(page); +} + +/* + * This serializes access to mp->lsn when metapages are added to logsynclist + * without setting nohomeok. i.e. updating imap & dmap + */ +static inline void metapage_wait_for_io(struct metapage *mp) +{ + if (test_bit(META_io, &mp->flag)) + wait_on_page_writeback(mp->page); +} + +/* + * This is called when already holding the metapage + */ +static inline void _metapage_homeok(struct metapage *mp) +{ + if (!--mp->nohomeok) + page_cache_release(mp->page); +} + +static inline void metapage_homeok(struct metapage *mp) +{ + hold_metapage(mp); + _metapage_homeok(mp); + put_metapage(mp); +} + +extern const struct address_space_operations jfs_metapage_aops; + +/* + * This routines invalidate all pages for an extent. + */ +extern void __invalidate_metapages(struct inode *, s64, int); +#define invalidate_pxd_metapages(ip, pxd) \ + __invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd))) +#define invalidate_dxd_metapages(ip, dxd) \ + __invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd))) +#define invalidate_xad_metapages(ip, xad) \ + __invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad))) + +#endif /* _H_JFS_METAPAGE */ diff --git a/kernel/fs/jfs/jfs_mount.c b/kernel/fs/jfs/jfs_mount.c new file mode 100644 index 000000000..9895595fd --- /dev/null +++ b/kernel/fs/jfs/jfs_mount.c @@ -0,0 +1,507 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Module: jfs_mount.c + * + * note: file system in transition to aggregate/fileset: + * + * file system mount is interpreted as the mount of aggregate, + * if not already mounted, and mount of the single/only fileset in + * the aggregate; + * + * a file system/aggregate is represented by an internal inode + * (aka mount inode) initialized with aggregate superblock; + * each vfs represents a fileset, and points to its "fileset inode + * allocation map inode" (aka fileset inode): + * (an aggregate itself is structured recursively as a filset: + * an internal vfs is constructed and points to its "fileset inode + * allocation map inode" (aka aggregate inode) where each inode + * represents a fileset inode) so that inode number is mapped to + * on-disk inode in uniform way at both aggregate and fileset level; + * + * each vnode/inode of a fileset is linked to its vfs (to facilitate + * per fileset inode operations, e.g., unmount of a fileset, etc.); + * each inode points to the mount inode (to facilitate access to + * per aggregate information, e.g., block size, etc.) as well as + * its file set inode. + * + * aggregate + * ipmnt + * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap; + * fileset vfs -> vp(1) <-> ... <-> vp(n) <->vproot; + */ + +#include +#include + +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + + +/* + * forward references + */ +static int chkSuper(struct super_block *); +static int logMOUNT(struct super_block *sb); + +/* + * NAME: jfs_mount(sb) + * + * FUNCTION: vfs_mount() + * + * PARAMETER: sb - super block + * + * RETURN: -EBUSY - device already mounted or open for write + * -EBUSY - cvrdvp already mounted; + * -EBUSY - mount table full + * -ENOTDIR- cvrdvp not directory on a device mount + * -ENXIO - device open failure + */ +int jfs_mount(struct super_block *sb) +{ + int rc = 0; /* Return code */ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipaimap = NULL; + struct inode *ipaimap2 = NULL; + struct inode *ipimap = NULL; + struct inode *ipbmap = NULL; + + /* + * read/validate superblock + * (initialize mount inode from the superblock) + */ + if ((rc = chkSuper(sb))) { + goto errout20; + } + + ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); + if (ipaimap == NULL) { + jfs_err("jfs_mount: Failed to read AGGREGATE_I"); + rc = -EIO; + goto errout20; + } + sbi->ipaimap = ipaimap; + + jfs_info("jfs_mount: ipaimap:0x%p", ipaimap); + + /* + * initialize aggregate inode allocation map + */ + if ((rc = diMount(ipaimap))) { + jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc); + goto errout21; + } + + /* + * open aggregate block allocation map + */ + ipbmap = diReadSpecial(sb, BMAP_I, 0); + if (ipbmap == NULL) { + rc = -EIO; + goto errout22; + } + + jfs_info("jfs_mount: ipbmap:0x%p", ipbmap); + + sbi->ipbmap = ipbmap; + + /* + * initialize aggregate block allocation map + */ + if ((rc = dbMount(ipbmap))) { + jfs_err("jfs_mount: dbMount failed w/rc = %d", rc); + goto errout22; + } + + /* + * open the secondary aggregate inode allocation map + * + * This is a duplicate of the aggregate inode allocation map. + * + * hand craft a vfs in the same fashion as we did to read ipaimap. + * By adding INOSPEREXT (32) to the inode number, we are telling + * diReadSpecial that we are reading from the secondary aggregate + * inode table. This also creates a unique entry in the inode hash + * table. + */ + if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { + ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); + if (!ipaimap2) { + jfs_err("jfs_mount: Failed to read AGGREGATE_I"); + rc = -EIO; + goto errout35; + } + sbi->ipaimap2 = ipaimap2; + + jfs_info("jfs_mount: ipaimap2:0x%p", ipaimap2); + + /* + * initialize secondary aggregate inode allocation map + */ + if ((rc = diMount(ipaimap2))) { + jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d", + rc); + goto errout35; + } + } else + /* Secondary aggregate inode table is not valid */ + sbi->ipaimap2 = NULL; + + /* + * mount (the only/single) fileset + */ + /* + * open fileset inode allocation map (aka fileset inode) + */ + ipimap = diReadSpecial(sb, FILESYSTEM_I, 0); + if (ipimap == NULL) { + jfs_err("jfs_mount: Failed to read FILESYSTEM_I"); + /* open fileset secondary inode allocation map */ + rc = -EIO; + goto errout40; + } + jfs_info("jfs_mount: ipimap:0x%p", ipimap); + + /* map further access of per fileset inodes by the fileset inode */ + sbi->ipimap = ipimap; + + /* initialize fileset inode allocation map */ + if ((rc = diMount(ipimap))) { + jfs_err("jfs_mount: diMount failed w/rc = %d", rc); + goto errout41; + } + + goto out; + + /* + * unwind on error + */ + errout41: /* close fileset inode allocation map inode */ + diFreeSpecial(ipimap); + + errout40: /* fileset closed */ + + /* close secondary aggregate inode allocation map */ + if (ipaimap2) { + diUnmount(ipaimap2, 1); + diFreeSpecial(ipaimap2); + } + + errout35: + + /* close aggregate block allocation map */ + dbUnmount(ipbmap, 1); + diFreeSpecial(ipbmap); + + errout22: /* close aggregate inode allocation map */ + + diUnmount(ipaimap, 1); + + errout21: /* close aggregate inodes */ + diFreeSpecial(ipaimap); + errout20: /* aggregate closed */ + + out: + + if (rc) + jfs_err("Mount JFS Failure: %d", rc); + + return rc; +} + +/* + * NAME: jfs_mount_rw(sb, remount) + * + * FUNCTION: Completes read-write mount, or remounts read-only volume + * as read-write + */ +int jfs_mount_rw(struct super_block *sb, int remount) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + /* + * If we are re-mounting a previously read-only volume, we want to + * re-read the inode and block maps, since fsck.jfs may have updated + * them. + */ + if (remount) { + if (chkSuper(sb) || (sbi->state != FM_CLEAN)) + return -EINVAL; + + truncate_inode_pages(sbi->ipimap->i_mapping, 0); + truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + diUnmount(sbi->ipimap, 1); + if ((rc = diMount(sbi->ipimap))) { + jfs_err("jfs_mount_rw: diMount failed!"); + return rc; + } + + dbUnmount(sbi->ipbmap, 1); + if ((rc = dbMount(sbi->ipbmap))) { + jfs_err("jfs_mount_rw: dbMount failed!"); + return rc; + } + } + + /* + * open/initialize log + */ + if ((rc = lmLogOpen(sb))) + return rc; + + /* + * update file system superblock; + */ + if ((rc = updateSuper(sb, FM_MOUNT))) { + jfs_err("jfs_mount: updateSuper failed w/rc = %d", rc); + lmLogClose(sb); + return rc; + } + + /* + * write MOUNT log record of the file system + */ + logMOUNT(sb); + + return rc; +} + +/* + * chkSuper() + * + * validate the superblock of the file system to be mounted and + * get the file system parameters. + * + * returns + * 0 with fragsize set if check successful + * error code if not successful + */ +static int chkSuper(struct super_block *sb) +{ + int rc = 0; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_superblock *j_sb; + struct buffer_head *bh; + int AIM_bytesize, AIT_bytesize; + int expected_AIM_bytesize, expected_AIT_bytesize; + s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr; + s64 byte_addr_diff0, byte_addr_diff1; + s32 bsize; + + if ((rc = readSuper(sb, &bh))) + return rc; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* + * validate superblock + */ + /* validate fs signature */ + if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) || + le32_to_cpu(j_sb->s_version) > JFS_VERSION) { + rc = -EINVAL; + goto out; + } + + bsize = le32_to_cpu(j_sb->s_bsize); +#ifdef _JFS_4K + if (bsize != PSIZE) { + jfs_err("Currently only 4K block size supported!"); + rc = -EINVAL; + goto out; + } +#endif /* _JFS_4K */ + + jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx", + le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), + (unsigned long long) le64_to_cpu(j_sb->s_size)); + + /* validate the descriptors for Secondary AIM and AIT */ + if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) != + cpu_to_le32(JFS_BAD_SAIT)) { + expected_AIM_bytesize = 2 * PSIZE; + AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize; + expected_AIT_bytesize = 4 * PSIZE; + AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize; + AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize; + AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize; + byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr; + fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize; + byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr; + if ((AIM_bytesize != expected_AIM_bytesize) || + (AIT_bytesize != expected_AIT_bytesize) || + (byte_addr_diff0 != AIM_bytesize) || + (byte_addr_diff1 <= AIT_bytesize)) + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + } + + if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) != + cpu_to_le32(JFS_GROUPCOMMIT)) + j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT); + + /* validate fs state */ + if (j_sb->s_state != cpu_to_le32(FM_CLEAN) && + !(sb->s_flags & MS_RDONLY)) { + jfs_err("jfs_mount: Mount Failure: File System Dirty."); + rc = -EINVAL; + goto out; + } + + sbi->state = le32_to_cpu(j_sb->s_state); + sbi->mntflag = le32_to_cpu(j_sb->s_flag); + + /* + * JFS always does I/O by 4K pages. Don't tell the buffer cache + * that we use anything else (leave s_blocksize alone). + */ + sbi->bsize = bsize; + sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize); + + /* + * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer + * cache. + */ + sbi->nbperpage = PSIZE >> sbi->l2bsize; + sbi->l2nbperpage = L2PSIZE - sbi->l2bsize; + sbi->l2niperblk = sbi->l2bsize - L2DISIZE; + if (sbi->mntflag & JFS_INLINELOG) + sbi->logpxd = j_sb->s_logpxd; + else { + sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev)); + memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid)); + memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid)); + } + sbi->fsckpxd = j_sb->s_fsckpxd; + sbi->ait2 = j_sb->s_ait2; + + out: + brelse(bh); + return rc; +} + + +/* + * updateSuper() + * + * update synchronously superblock if it is mounted read-write. + */ +int updateSuper(struct super_block *sb, uint state) +{ + struct jfs_superblock *j_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct buffer_head *bh; + int rc; + + if (sbi->flag & JFS_NOINTEGRITY) { + if (state == FM_DIRTY) { + sbi->p_state = state; + return 0; + } else if (state == FM_MOUNT) { + sbi->p_state = sbi->state; + state = FM_DIRTY; + } else if (state == FM_CLEAN) { + state = sbi->p_state; + } else + jfs_err("updateSuper: bad state"); + } else if (sbi->state == FM_DIRTY) + return 0; + + if ((rc = readSuper(sb, &bh))) + return rc; + + j_sb = (struct jfs_superblock *)bh->b_data; + + j_sb->s_state = cpu_to_le32(state); + sbi->state = state; + + if (state == FM_MOUNT) { + /* record log's dev_t and mount serial number */ + j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev)); + j_sb->s_logserial = cpu_to_le32(sbi->log->serial); + } else if (state == FM_CLEAN) { + /* + * If this volume is shared with OS/2, OS/2 will need to + * recalculate DASD usage, since we don't deal with it. + */ + if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED)) + j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME); + } + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} + + +/* + * readSuper() + * + * read superblock by raw sector address + */ +int readSuper(struct super_block *sb, struct buffer_head **bpp) +{ + /* read in primary superblock */ + *bpp = sb_bread(sb, SUPER1_OFF >> sb->s_blocksize_bits); + if (*bpp) + return 0; + + /* read in secondary/replicated superblock */ + *bpp = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); + if (*bpp) + return 0; + + return -EIO; +} + + +/* + * logMOUNT() + * + * function: write a MOUNT log record for file system. + * + * MOUNT record keeps logredo() from processing log records + * for this file system past this point in log. + * it is harmless if mount fails. + * + * note: MOUNT record is at aggregate level, not at fileset level, + * since log records of previous mounts of a fileset + * (e.g., AFTER record of extent allocation) have to be processed + * to update block allocation map at aggregate level. + */ +static int logMOUNT(struct super_block *sb) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + struct lrd lrd; + + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_MOUNT); + lrd.length = 0; + lrd.aggregate = cpu_to_le32(new_encode_dev(sb->s_bdev->bd_dev)); + lmLog(log, NULL, &lrd, NULL); + + return 0; +} diff --git a/kernel/fs/jfs/jfs_superblock.h b/kernel/fs/jfs/jfs_superblock.h new file mode 100644 index 000000000..04847b8d3 --- /dev/null +++ b/kernel/fs/jfs/jfs_superblock.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_SUPERBLOCK +#define _H_JFS_SUPERBLOCK + +/* + * make the magic number something a human could read + */ +#define JFS_MAGIC "JFS1" /* Magic word */ + +#define JFS_VERSION 2 /* Version number: Version 2 */ + +#define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */ + +/* + * aggregate superblock + * + * The name superblock is too close to super_block, so the name has been + * changed to jfs_superblock. The utilities are still using the old name. + */ +struct jfs_superblock { + char s_magic[4]; /* 4: magic number */ + __le32 s_version; /* 4: version number */ + + __le64 s_size; /* 8: aggregate size in hardware/LVM blocks; + * VFS: number of blocks + */ + __le32 s_bsize; /* 4: aggregate block size in bytes; + * VFS: fragment size + */ + __le16 s_l2bsize; /* 2: log2 of s_bsize */ + __le16 s_l2bfactor; /* 2: log2(s_bsize/hardware block size) */ + __le32 s_pbsize; /* 4: hardware/LVM block size in bytes */ + __le16 s_l2pbsize; /* 2: log2 of s_pbsize */ + __le16 pad; /* 2: padding necessary for alignment */ + + __le32 s_agsize; /* 4: allocation group size in aggr. blocks */ + + __le32 s_flag; /* 4: aggregate attributes: + * see jfs_filsys.h + */ + __le32 s_state; /* 4: mount/unmount/recovery state: + * see jfs_filsys.h + */ + __le32 s_compress; /* 4: > 0 if data compression */ + + pxd_t s_ait2; /* 8: first extent of secondary + * aggregate inode table + */ + + pxd_t s_aim2; /* 8: first extent of secondary + * aggregate inode map + */ + __le32 s_logdev; /* 4: device address of log */ + __le32 s_logserial; /* 4: log serial number at aggregate mount */ + pxd_t s_logpxd; /* 8: inline log extent */ + + pxd_t s_fsckpxd; /* 8: inline fsck work space extent */ + + struct timestruc_t s_time; /* 8: time last updated */ + + __le32 s_fsckloglen; /* 4: Number of filesystem blocks reserved for + * the fsck service log. + * N.B. These blocks are divided among the + * versions kept. This is not a per + * version size. + * N.B. These blocks are included in the + * length field of s_fsckpxd. + */ + s8 s_fscklog; /* 1: which fsck service log is most recent + * 0 => no service log data yet + * 1 => the first one + * 2 => the 2nd one + */ + char s_fpack[11]; /* 11: file system volume name + * N.B. This must be 11 bytes to + * conform with the OS/2 BootSector + * requirements + * Only used when s_version is 1 + */ + + /* extendfs() parameter under s_state & FM_EXTENDFS */ + __le64 s_xsize; /* 8: extendfs s_size */ + pxd_t s_xfsckpxd; /* 8: extendfs fsckpxd */ + pxd_t s_xlogpxd; /* 8: extendfs logpxd */ + /* - 128 byte boundary - */ + + char s_uuid[16]; /* 16: 128-bit uuid for volume */ + char s_label[16]; /* 16: volume label */ + char s_loguuid[16]; /* 16: 128-bit uuid for log device */ + +}; + +extern int readSuper(struct super_block *, struct buffer_head **); +extern int updateSuper(struct super_block *, uint); +__printf(2, 3) +extern void jfs_error(struct super_block *, const char *, ...); +extern int jfs_mount(struct super_block *); +extern int jfs_mount_rw(struct super_block *, int); +extern int jfs_umount(struct super_block *); +extern int jfs_umount_rw(struct super_block *); +extern int jfs_extendfs(struct super_block *, s64, int); + +extern struct task_struct *jfsIOthread; +extern struct task_struct *jfsSyncThread; + +#endif /*_H_JFS_SUPERBLOCK */ diff --git a/kernel/fs/jfs/jfs_txnmgr.c b/kernel/fs/jfs/jfs_txnmgr.c new file mode 100644 index 000000000..d59585645 --- /dev/null +++ b/kernel/fs/jfs/jfs_txnmgr.c @@ -0,0 +1,3093 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_txnmgr.c: transaction manager + * + * notes: + * transaction starts with txBegin() and ends with txCommit() + * or txAbort(). + * + * tlock is acquired at the time of update; + * (obviate scan at commit time for xtree and dtree) + * tlock and mp points to each other; + * (no hashlist for mp -> tlock). + * + * special cases: + * tlock on in-memory inode: + * in-place tlock in the in-memory inode itself; + * converted to page lock by iWrite() at commit time. + * + * tlock during write()/mmap() under anonymous transaction (tid = 0): + * transferred (?) to transaction at commit time. + * + * use the page itself to update allocation maps + * (obviate intermediate replication of allocation/deallocation data) + * hold on to mp+lock thru update of maps + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dinode.h" +#include "jfs_imap.h" +#include "jfs_dmap.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * transaction management structures + */ +static struct { + int freetid; /* index of a free tid structure */ + int freelock; /* index first free lock word */ + wait_queue_head_t freewait; /* eventlist of free tblock */ + wait_queue_head_t freelockwait; /* eventlist of free tlock */ + wait_queue_head_t lowlockwait; /* eventlist of ample tlocks */ + int tlocksInUse; /* Number of tlocks in use */ + spinlock_t LazyLock; /* synchronize sync_queue & unlock_queue */ +/* struct tblock *sync_queue; * Transactions waiting for data sync */ + struct list_head unlock_queue; /* Txns waiting to be released */ + struct list_head anon_list; /* inodes having anonymous txns */ + struct list_head anon_list2; /* inodes having anonymous txns + that couldn't be sync'ed */ +} TxAnchor; + +int jfs_tlocks_low; /* Indicates low number of available tlocks */ + +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint txBegin; + uint txBegin_barrier; + uint txBegin_lockslow; + uint txBegin_freetid; + uint txBeginAnon; + uint txBeginAnon_barrier; + uint txBeginAnon_lockslow; + uint txLockAlloc; + uint txLockAlloc_freelock; +} TxStat; +#endif + +static int nTxBlock = -1; /* number of transaction blocks */ +module_param(nTxBlock, int, 0); +MODULE_PARM_DESC(nTxBlock, + "Number of transaction blocks (max:65536)"); + +static int nTxLock = -1; /* number of transaction locks */ +module_param(nTxLock, int, 0); +MODULE_PARM_DESC(nTxLock, + "Number of transaction locks (max:65536)"); + +struct tblock *TxBlock; /* transaction block table */ +static int TxLockLWM; /* Low water mark for number of txLocks used */ +static int TxLockHWM; /* High water mark for number of txLocks used */ +static int TxLockVHWM; /* Very High water mark */ +struct tlock *TxLock; /* transaction lock table */ + +/* + * transaction management lock + */ +static DEFINE_SPINLOCK(jfsTxnLock); + +#define TXN_LOCK() spin_lock(&jfsTxnLock) +#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) + +#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); +#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) +#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags) + +static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait); +static int jfs_commit_thread_waking; + +/* + * Retry logic exist outside these macros to protect from spurrious wakeups. + */ +static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(event, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + TXN_UNLOCK(); + io_schedule(); + remove_wait_queue(event, &wait); +} + +#define TXN_SLEEP(event)\ +{\ + TXN_SLEEP_DROP_LOCK(event);\ + TXN_LOCK();\ +} + +#define TXN_WAKEUP(event) wake_up_all(event) + +/* + * statistics + */ +static struct { + tid_t maxtid; /* 4: biggest tid ever used */ + lid_t maxlid; /* 4: biggest lid ever used */ + int ntid; /* 4: # of transactions performed */ + int nlid; /* 4: # of tlocks acquired */ + int waitlock; /* 4: # of tlock wait */ +} stattx; + +/* + * forward references + */ +static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck, struct commit * cd); +static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void txAllocPMap(struct inode *ip, struct maplock * maplock, + struct tblock * tblk); +static void txForce(struct tblock * tblk); +static int txLog(struct jfs_log * log, struct tblock * tblk, + struct commit * cd); +static void txUpdateMap(struct tblock * tblk); +static void txRelease(struct tblock * tblk); +static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void LogSyncRelease(struct metapage * mp); + +/* + * transaction block/lock management + * --------------------------------- + */ + +/* + * Get a transaction lock from the free list. If the number in use is + * greater than the high water mark, wake up the sync daemon. This should + * free some anonymous transaction locks. (TXN_LOCK must be held.) + */ +static lid_t txLockAlloc(void) +{ + lid_t lid; + + INCREMENT(TxStat.txLockAlloc); + if (!TxAnchor.freelock) { + INCREMENT(TxStat.txLockAlloc_freelock); + } + + while (!(lid = TxAnchor.freelock)) + TXN_SLEEP(&TxAnchor.freelockwait); + TxAnchor.freelock = TxLock[lid].next; + HIGHWATERMARK(stattx.maxlid, lid); + if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) { + jfs_info("txLockAlloc tlocks low"); + jfs_tlocks_low = 1; + wake_up_process(jfsSyncThread); + } + + return lid; +} + +static void txLockFree(lid_t lid) +{ + TxLock[lid].tid = 0; + TxLock[lid].next = TxAnchor.freelock; + TxAnchor.freelock = lid; + TxAnchor.tlocksInUse--; + if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) { + jfs_info("txLockFree jfs_tlocks_low no more"); + jfs_tlocks_low = 0; + TXN_WAKEUP(&TxAnchor.lowlockwait); + } + TXN_WAKEUP(&TxAnchor.freelockwait); +} + +/* + * NAME: txInit() + * + * FUNCTION: initialize transaction management structures + * + * RETURN: + * + * serialization: single thread at jfs_init() + */ +int txInit(void) +{ + int k, size; + struct sysinfo si; + + /* Set defaults for nTxLock and nTxBlock if unset */ + + if (nTxLock == -1) { + if (nTxBlock == -1) { + /* Base default on memory size */ + si_meminfo(&si); + if (si.totalram > (256 * 1024)) /* 1 GB */ + nTxLock = 64 * 1024; + else + nTxLock = si.totalram >> 2; + } else if (nTxBlock > (8 * 1024)) + nTxLock = 64 * 1024; + else + nTxLock = nTxBlock << 3; + } + if (nTxBlock == -1) + nTxBlock = nTxLock >> 3; + + /* Verify tunable parameters */ + if (nTxBlock < 16) + nTxBlock = 16; /* No one should set it this low */ + if (nTxBlock > 65536) + nTxBlock = 65536; + if (nTxLock < 256) + nTxLock = 256; /* No one should set it this low */ + if (nTxLock > 65536) + nTxLock = 65536; + + printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n", + nTxBlock, nTxLock); + /* + * initialize transaction block (tblock) table + * + * transaction id (tid) = tblock index + * tid = 0 is reserved. + */ + TxLockLWM = (nTxLock * 4) / 10; + TxLockHWM = (nTxLock * 7) / 10; + TxLockVHWM = (nTxLock * 8) / 10; + + size = sizeof(struct tblock) * nTxBlock; + TxBlock = vmalloc(size); + if (TxBlock == NULL) + return -ENOMEM; + + for (k = 1; k < nTxBlock - 1; k++) { + TxBlock[k].next = k + 1; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + } + TxBlock[k].next = 0; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + + TxAnchor.freetid = 1; + init_waitqueue_head(&TxAnchor.freewait); + + stattx.maxtid = 1; /* statistics */ + + /* + * initialize transaction lock (tlock) table + * + * transaction lock id = tlock index + * tlock id = 0 is reserved. + */ + size = sizeof(struct tlock) * nTxLock; + TxLock = vmalloc(size); + if (TxLock == NULL) { + vfree(TxBlock); + return -ENOMEM; + } + + /* initialize tlock table */ + for (k = 1; k < nTxLock - 1; k++) + TxLock[k].next = k + 1; + TxLock[k].next = 0; + init_waitqueue_head(&TxAnchor.freelockwait); + init_waitqueue_head(&TxAnchor.lowlockwait); + + TxAnchor.freelock = 1; + TxAnchor.tlocksInUse = 0; + INIT_LIST_HEAD(&TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + + LAZY_LOCK_INIT(); + INIT_LIST_HEAD(&TxAnchor.unlock_queue); + + stattx.maxlid = 1; /* statistics */ + + return 0; +} + +/* + * NAME: txExit() + * + * FUNCTION: clean up when module is unloaded + */ +void txExit(void) +{ + vfree(TxLock); + TxLock = NULL; + vfree(TxBlock); + TxBlock = NULL; +} + +/* + * NAME: txBegin() + * + * FUNCTION: start a transaction. + * + * PARAMETER: sb - superblock + * flag - force for nested tx; + * + * RETURN: tid - transaction id + * + * note: flag force allows to start tx for nested tx + * to prevent deadlock on logsync barrier; + */ +tid_t txBegin(struct super_block *sb, int flag) +{ + tid_t t; + struct tblock *tblk; + struct jfs_log *log; + + jfs_info("txBegin: flag = 0x%x", flag); + log = JFS_SBI(sb)->log; + + TXN_LOCK(); + + INCREMENT(TxStat.txBegin); + + retry: + if (!(flag & COMMIT_FORCE)) { + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag) || + test_bit(log_QUIESCE, &log->flag)) { + INCREMENT(TxStat.txBegin_barrier); + TXN_SLEEP(&log->syncwait); + goto retry; + } + } + if (flag == 0) { + /* + * Don't begin transaction if we're getting starved for tlocks + * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately + * free tlocks) + */ + if (TxAnchor.tlocksInUse > TxLockVHWM) { + INCREMENT(TxStat.txBegin_lockslow); + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + } + + /* + * allocate transaction id/block + */ + if ((t = TxAnchor.freetid) == 0) { + jfs_info("txBegin: waiting for free tid"); + INCREMENT(TxStat.txBegin_freetid); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + tblk = tid_to_tblock(t); + + if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) { + /* Don't let a non-forced transaction take the last tblk */ + jfs_info("txBegin: waiting for free tid"); + INCREMENT(TxStat.txBegin_freetid); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + TxAnchor.freetid = tblk->next; + + /* + * initialize transaction + */ + + /* + * We can't zero the whole thing or we screw up another thread being + * awakened after sleeping on tblk->waitor + * + * memset(tblk, 0, sizeof(struct tblock)); + */ + tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0; + + tblk->sb = sb; + ++log->logtid; + tblk->logtid = log->logtid; + + ++log->active; + + HIGHWATERMARK(stattx.maxtid, t); /* statistics */ + INCREMENT(stattx.ntid); /* statistics */ + + TXN_UNLOCK(); + + jfs_info("txBegin: returning tid = %d", t); + + return t; +} + +/* + * NAME: txBeginAnon() + * + * FUNCTION: start an anonymous transaction. + * Blocks if logsync or available tlocks are low to prevent + * anonymous tlocks from depleting supply. + * + * PARAMETER: sb - superblock + * + * RETURN: none + */ +void txBeginAnon(struct super_block *sb) +{ + struct jfs_log *log; + + log = JFS_SBI(sb)->log; + + TXN_LOCK(); + INCREMENT(TxStat.txBeginAnon); + + retry: + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag) || + test_bit(log_QUIESCE, &log->flag)) { + INCREMENT(TxStat.txBeginAnon_barrier); + TXN_SLEEP(&log->syncwait); + goto retry; + } + + /* + * Don't begin transaction if we're getting starved for tlocks + */ + if (TxAnchor.tlocksInUse > TxLockVHWM) { + INCREMENT(TxStat.txBeginAnon_lockslow); + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + TXN_UNLOCK(); +} + +/* + * txEnd() + * + * function: free specified transaction block. + * + * logsync barrier processing: + * + * serialization: + */ +void txEnd(tid_t tid) +{ + struct tblock *tblk = tid_to_tblock(tid); + struct jfs_log *log; + + jfs_info("txEnd: tid = %d", tid); + TXN_LOCK(); + + /* + * wakeup transactions waiting on the page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + log = JFS_SBI(tblk->sb)->log; + + /* + * Lazy commit thread can't free this guy until we mark it UNLOCKED, + * otherwise, we would be left with a transaction that may have been + * reused. + * + * Lazy commit thread will turn off tblkGC_LAZY before calling this + * routine. + */ + if (tblk->flag & tblkGC_LAZY) { + jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk); + TXN_UNLOCK(); + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + tblk->flag |= tblkGC_UNLOCKED; + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + return; + } + + jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk); + + assert(tblk->next == 0); + + /* + * insert tblock back on freelist + */ + tblk->next = TxAnchor.freetid; + TxAnchor.freetid = tid; + + /* + * mark the tblock not active + */ + if (--log->active == 0) { + clear_bit(log_FLUSH, &log->flag); + + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag)) { + TXN_UNLOCK(); + + /* write dirty metadata & forward log syncpt */ + jfs_syncpt(log, 1); + + jfs_info("log barrier off: 0x%x", log->lsn); + + /* enable new transactions start */ + clear_bit(log_SYNCBARRIER, &log->flag); + + /* wakeup all waitors for logsync barrier */ + TXN_WAKEUP(&log->syncwait); + + goto wakeup; + } + } + + TXN_UNLOCK(); +wakeup: + /* + * wakeup all waitors for a free tblock + */ + TXN_WAKEUP(&TxAnchor.freewait); +} + +/* + * txLock() + * + * function: acquire a transaction lock on the specified + * + * parameter: + * + * return: transaction lock id + * + * serialization: + */ +struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, + int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int dir_xtree = 0; + lid_t lid; + tid_t xtid; + struct tlock *tlck; + struct xtlock *xtlck; + struct linelock *linelock; + xtpage_t *p; + struct tblock *tblk; + + TXN_LOCK(); + + if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) && + !(mp->xflag & COMMIT_PAGE)) { + /* + * Directory inode is special. It can have both an xtree tlock + * and a dtree tlock associated with it. + */ + dir_xtree = 1; + lid = jfs_ip->xtlid; + } else + lid = mp->lid; + + /* is page not locked by a transaction ? */ + if (lid == 0) + goto allocateLock; + + jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid); + + /* is page locked by the requester transaction ? */ + tlck = lid_to_tlock(lid); + if ((xtid = tlck->tid) == tid) { + TXN_UNLOCK(); + goto grantLock; + } + + /* + * is page locked by anonymous transaction/lock ? + * + * (page update without transaction (i.e., file write) is + * locked under anonymous transaction tid = 0: + * anonymous tlocks maintained on anonymous tlock list of + * the inode of the page and available to all anonymous + * transactions until txCommit() time at which point + * they are transferred to the transaction tlock list of + * the committing transaction of the inode) + */ + if (xtid == 0) { + tlck->tid = tid; + TXN_UNLOCK(); + tblk = tid_to_tblock(tid); + /* + * The order of the tlocks in the transaction is important + * (during truncate, child xtree pages must be freed before + * parent's tlocks change the working map). + * Take tlock off anonymous list and add to tail of + * transaction list + * + * Note: We really need to get rid of the tid & lid and + * use list_head's. This code is getting UGLY! + */ + if (jfs_ip->atlhead == lid) { + if (jfs_ip->atltail == lid) { + /* only anonymous txn. + * Remove from anon_list + */ + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + jfs_ip->atlhead = tlck->next; + } else { + lid_t last; + for (last = jfs_ip->atlhead; + lid_to_tlock(last)->next != lid; + last = lid_to_tlock(last)->next) { + assert(last); + } + lid_to_tlock(last)->next = tlck->next; + if (jfs_ip->atltail == lid) + jfs_ip->atltail = last; + } + + /* insert the tlock at tail of transaction tlock list */ + + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + + goto grantLock; + } + + goto waitLock; + + /* + * allocate a tlock + */ + allocateLock: + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + TXN_UNLOCK(); + + /* mark tlock for meta-data page */ + if (mp->xflag & COMMIT_PAGE) { + + tlck->flag = tlckPAGELOCK; + + /* mark the page dirty and nohomeok */ + metapage_nohomeok(mp); + + jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p", + mp, mp->nohomeok, tid, tlck); + + /* if anonymous transaction, and buffer is on the group + * commit synclist, mark inode to show this. This will + * prevent the buffer from being marked nohomeok for too + * long a time. + */ + if ((tid == 0) && mp->lsn) + set_cflag(COMMIT_Synclist, ip); + } + /* mark tlock for in-memory inode */ + else + tlck->flag = tlckINODELOCK; + + if (S_ISDIR(ip->i_mode)) + tlck->flag |= tlckDIRECTORY; + + tlck->type = 0; + + /* bind the tlock and the page */ + tlck->ip = ip; + tlck->mp = mp; + if (dir_xtree) + jfs_ip->xtlid = lid; + else + mp->lid = lid; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + TXN_LOCK(); + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + TXN_UNLOCK(); + } + } + + /* initialize type dependent area for linelock */ + linelock = (struct linelock *) & tlck->lock; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKSHORT; + linelock->index = 0; + + switch (type & tlckTYPE) { + case tlckDTREE: + linelock->l2linesize = L2DTSLOTSIZE; + break; + + case tlckXTREE: + linelock->l2linesize = L2XTSLOTSIZE; + + xtlck = (struct xtlock *) linelock; + xtlck->header.offset = 0; + xtlck->header.length = 2; + + if (type & tlckNEW) { + xtlck->lwm.offset = XTENTRYSTART; + } else { + if (mp->xflag & COMMIT_PAGE) + p = (xtpage_t *) mp->data; + else + p = &jfs_ip->i_xtroot; + xtlck->lwm.offset = + le16_to_cpu(p->header.nextindex); + } + xtlck->lwm.length = 0; /* ! */ + xtlck->twm.offset = 0; + xtlck->hwm.offset = 0; + + xtlck->index = 2; + break; + + case tlckINODE: + linelock->l2linesize = L2INODESLOTSIZE; + break; + + case tlckDATA: + linelock->l2linesize = L2DATASLOTSIZE; + break; + + default: + jfs_err("UFO tlock:0x%p", tlck); + } + + /* + * update tlock vector + */ + grantLock: + tlck->type |= type; + + return tlck; + + /* + * page is being locked by another transaction: + */ + waitLock: + /* Only locks on ipimap or ipaimap should reach here */ + /* assert(jfs_ip->fileset == AGGREGATE_I); */ + if (jfs_ip->fileset != AGGREGATE_I) { + printk(KERN_ERR "txLock: trying to lock locked page!"); + print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, + ip, sizeof(*ip), 0); + print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(*mp), 0); + print_hex_dump(KERN_ERR, "Locker's tblock: ", + DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), + sizeof(struct tblock), 0); + print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, + tlck, sizeof(*tlck), 0); + BUG(); + } + INCREMENT(stattx.waitlock); /* statistics */ + TXN_UNLOCK(); + release_metapage(mp); + TXN_LOCK(); + xtid = tlck->tid; /* reacquire after dropping TXN_LOCK */ + + jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d", + tid, xtid, lid); + + /* Recheck everything since dropping TXN_LOCK */ + if (xtid && (tlck->mp == mp) && (mp->lid == lid)) + TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor); + else + TXN_UNLOCK(); + jfs_info("txLock: awakened tid = %d, lid = %d", tid, lid); + + return NULL; +} + +/* + * NAME: txRelease() + * + * FUNCTION: Release buffers associated with transaction locks, but don't + * mark homeok yet. The allows other transactions to modify + * buffers, but won't let them go to disk until commit record + * actually gets written. + * + * PARAMETER: + * tblk - + * + * RETURN: Errors from subroutines. + */ +static void txRelease(struct tblock * tblk) +{ + struct metapage *mp; + lid_t lid; + struct tlock *tlck; + + TXN_LOCK(); + + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + mp->lid = 0; + } + } + + /* + * wakeup transactions waiting on a page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + TXN_UNLOCK(); +} + +/* + * NAME: txUnlock() + * + * FUNCTION: Initiates pageout of pages modified by tid in journalled + * objects and frees their lockwords. + */ +static void txUnlock(struct tblock * tblk) +{ + struct tlock *tlck; + struct linelock *linelock; + lid_t lid, next, llid, k; + struct metapage *mp; + struct jfs_log *log; + int difft, diffp; + unsigned long flags; + + jfs_info("txUnlock: tblk = 0x%p", tblk); + log = JFS_SBI(tblk->sb)->log; + + /* + * mark page under tlock homeok (its log has been written): + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck); + + /* unbind page from tlock */ + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + /* hold buffer + */ + hold_metapage(mp); + + assert(mp->nohomeok > 0); + _metapage_homeok(mp); + + /* inherit younger/larger clsn */ + LOGSYNC_LOCK(log, flags); + if (mp->clsn) { + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log, flags); + + assert(!(tlck->flag & tlckFREEPAGE)); + + put_metapage(mp); + } + + /* insert tlock, and linelock(s) of the tlock if any, + * at head of freelist + */ + TXN_LOCK(); + + llid = ((struct linelock *) & tlck->lock)->next; + while (llid) { + linelock = (struct linelock *) lid_to_tlock(llid); + k = linelock->next; + txLockFree(llid); + llid = k; + } + txLockFree(lid); + + TXN_UNLOCK(); + } + tblk->next = tblk->last = 0; + + /* + * remove tblock from logsynclist + * (allocation map pages inherited lsn of tblk and + * has been inserted in logsync list at txUpdateMap()) + */ + if (tblk->lsn) { + LOGSYNC_LOCK(log, flags); + log->count--; + list_del(&tblk->synclist); + LOGSYNC_UNLOCK(log, flags); + } +} + +/* + * txMaplock() + * + * function: allocate a transaction lock for freed page/entry; + * for freed page, maplock is used as xtlock/dtlock type; + */ +struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + lid_t lid; + struct tblock *tblk; + struct tlock *tlck; + struct maplock *maplock; + + TXN_LOCK(); + + /* + * allocate a tlock + */ + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + /* bind the tlock and the object */ + tlck->flag = tlckINODELOCK; + if (S_ISDIR(ip->i_mode)) + tlck->flag |= tlckDIRECTORY; + tlck->ip = ip; + tlck->mp = NULL; + + tlck->type = type; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + } + } + + TXN_UNLOCK(); + + /* initialize type dependent area for maplock */ + maplock = (struct maplock *) & tlck->lock; + maplock->next = 0; + maplock->maxcnt = 0; + maplock->index = 0; + + return tlck; +} + +/* + * txLinelock() + * + * function: allocate a transaction lock for log vector list + */ +struct linelock *txLinelock(struct linelock * tlock) +{ + lid_t lid; + struct tlock *tlck; + struct linelock *linelock; + + TXN_LOCK(); + + /* allocate a TxLock structure */ + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + TXN_UNLOCK(); + + /* initialize linelock */ + linelock = (struct linelock *) tlck; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKLONG; + linelock->index = 0; + if (tlck->flag & tlckDIRECTORY) + linelock->flag |= tlckDIRECTORY; + + /* append linelock after tlock */ + linelock->next = tlock->next; + tlock->next = lid; + + return linelock; +} + +/* + * transaction commit management + * ----------------------------- + */ + +/* + * NAME: txCommit() + * + * FUNCTION: commit the changes to the objects specified in + * clist. For journalled segments only the + * changes of the caller are committed, ie by tid. + * for non-journalled segments the data are flushed to + * disk and then the change to the disk inode and indirect + * blocks committed (so blocks newly allocated to the + * segment will be made a part of the segment atomically). + * + * all of the segments specified in clist must be in + * one file system. no more than 6 segments are needed + * to handle all unix svcs. + * + * if the i_nlink field (i.e. disk inode link count) + * is zero, and the type of inode is a regular file or + * directory, or symbolic link , the inode is truncated + * to zero length. the truncation is committed but the + * VM resources are unaffected until it is closed (see + * iput and iclose). + * + * PARAMETER: + * + * RETURN: + * + * serialization: + * on entry the inode lock on each segment is assumed + * to be held. + * + * i/o error: + */ +int txCommit(tid_t tid, /* transaction identifier */ + int nip, /* number of inodes to commit */ + struct inode **iplist, /* list of inode to commit */ + int flag) +{ + int rc = 0; + struct commit cd; + struct jfs_log *log; + struct tblock *tblk; + struct lrd *lrd; + struct inode *ip; + struct jfs_inode_info *jfs_ip; + int k, n; + ino_t top; + struct super_block *sb; + + jfs_info("txCommit, tid = %d, flag = %d", tid, flag); + /* is read-only file system ? */ + if (isReadOnly(iplist[0])) { + rc = -EROFS; + goto TheEnd; + } + + sb = cd.sb = iplist[0]->i_sb; + cd.tid = tid; + + if (tid == 0) + tid = txBegin(sb, 0); + tblk = tid_to_tblock(tid); + + /* + * initialize commit structure + */ + log = JFS_SBI(sb)->log; + cd.log = log; + + /* initialize log record descriptor in commit */ + lrd = &cd.lrd; + lrd->logtid = cpu_to_le32(tblk->logtid); + lrd->backchain = 0; + + tblk->xflag |= flag; + + if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) + tblk->xflag |= COMMIT_LAZY; + /* + * prepare non-journaled objects for commit + * + * flush data pages of non-journaled file + * to prevent the file getting non-initialized disk blocks + * in case of crash. + * (new blocks - ) + */ + cd.iplist = iplist; + cd.nip = nip; + + /* + * acquire transaction lock on (on-disk) inodes + * + * update on-disk inode from in-memory inode + * acquiring transaction locks for AFTER records + * on the on-disk inode of file object + * + * sort the inodes array by inode number in descending order + * to prevent deadlock when acquiring transaction lock + * of on-disk inodes on multiple on-disk inode pages by + * multiple concurrent transactions + */ + for (k = 0; k < cd.nip; k++) { + top = (cd.iplist[k])->i_ino; + for (n = k + 1; n < cd.nip; n++) { + ip = cd.iplist[n]; + if (ip->i_ino > top) { + top = ip->i_ino; + cd.iplist[n] = cd.iplist[k]; + cd.iplist[k] = ip; + } + } + + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * BUGBUG - This code has temporarily been removed. The + * intent is to ensure that any file data is written before + * the metadata is committed to the journal. This prevents + * uninitialized data from appearing in a file after the + * journal has been replayed. (The uninitialized data + * could be sensitive data removed by another user.) + * + * The problem now is that we are holding the IWRITELOCK + * on the inode, and calling filemap_fdatawrite on an + * unmapped page will cause a deadlock in jfs_get_block. + * + * The long term solution is to pare down the use of + * IWRITELOCK. We are currently holding it too long. + * We could also be smarter about which data pages need + * to be written before the transaction is committed and + * when we don't need to worry about it at all. + * + * if ((!S_ISDIR(ip->i_mode)) + * && (tblk->flag & COMMIT_DELETE) == 0) + * filemap_write_and_wait(ip->i_mapping); + */ + + /* + * Mark inode as not dirty. It will still be on the dirty + * inode list, but we'll know not to commit it again unless + * it gets marked dirty again + */ + clear_cflag(COMMIT_Dirty, ip); + + /* inherit anonymous tlock(s) of inode */ + if (jfs_ip->atlhead) { + lid_to_tlock(jfs_ip->atltail)->next = tblk->next; + tblk->next = jfs_ip->atlhead; + if (!tblk->last) + tblk->last = jfs_ip->atltail; + jfs_ip->atlhead = jfs_ip->atltail = 0; + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + + /* + * acquire transaction lock on on-disk inode page + * (become first tlock of the tblk's tlock list) + */ + if (((rc = diWrite(tid, ip)))) + goto out; + } + + /* + * write log records from transaction locks + * + * txUpdateMap() resets XAD_NEW in XAD. + */ + if ((rc = txLog(log, tblk, &cd))) + goto TheEnd; + + /* + * Ensure that inode isn't reused before + * lazy commit thread finishes processing + */ + if (tblk->xflag & COMMIT_DELETE) { + ihold(tblk->u.ip); + /* + * Avoid a rare deadlock + * + * If the inode is locked, we may be blocked in + * jfs_commit_inode. If so, we don't want the + * lazy_commit thread doing the last iput() on the inode + * since that may block on the locked inode. Instead, + * commit the transaction synchronously, so the last iput + * will be done by the calling thread (or later) + */ + /* + * I believe this code is no longer needed. Splitting I_LOCK + * into two bits, I_NEW and I_SYNC should prevent this + * deadlock as well. But since I don't have a JFS testload + * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. + * Joern + */ + if (tblk->u.ip->i_state & I_SYNC) + tblk->xflag &= ~COMMIT_LAZY; + } + + ASSERT((!(tblk->xflag & COMMIT_DELETE)) || + ((tblk->u.ip->i_nlink == 0) && + !test_cflag(COMMIT_Nolink, tblk->u.ip))); + + /* + * write COMMIT log record + */ + lrd->type = cpu_to_le16(LOG_COMMIT); + lrd->length = 0; + lmLog(log, tblk, lrd, NULL); + + lmGroupCommit(log, tblk); + + /* + * - transaction is now committed - + */ + + /* + * force pages in careful update + * (imap addressing structure update) + */ + if (flag & COMMIT_FORCE) + txForce(tblk); + + /* + * update allocation map. + * + * update inode allocation map and inode: + * free pager lock on memory object of inode if any. + * update block allocation map. + * + * txUpdateMap() resets XAD_NEW in XAD. + */ + if (tblk->xflag & COMMIT_FORCE) + txUpdateMap(tblk); + + /* + * free transaction locks and pageout/free pages + */ + txRelease(tblk); + + if ((tblk->flag & tblkGC_LAZY) == 0) + txUnlock(tblk); + + + /* + * reset in-memory object state + */ + for (k = 0; k < cd.nip; k++) { + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * reset in-memory inode state + */ + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + } + + out: + if (rc != 0) + txAbort(tid, 1); + + TheEnd: + jfs_info("txCommit: tid = %d, returning %d", tid, rc); + return rc; +} + +/* + * NAME: txLog() + * + * FUNCTION: Writes AFTER log records for all lines modified + * by tid for segments specified by inodes in comdata. + * Code assumes only WRITELOCKS are recorded in lockwords. + * + * PARAMETERS: + * + * RETURN : + */ +static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) +{ + int rc = 0; + struct inode *ip; + lid_t lid; + struct tlock *tlck; + struct lrd *lrd = &cd->lrd; + + /* + * write log record(s) for each tlock of transaction, + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + tlck->flag |= tlckLOG; + + /* initialize lrd common */ + ip = tlck->ip; + lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate); + lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset); + lrd->log.redopage.inode = cpu_to_le32(ip->i_ino); + + /* write log record of page from the tlock */ + switch (tlck->type & tlckTYPE) { + case tlckXTREE: + xtLog(log, tblk, lrd, tlck); + break; + + case tlckDTREE: + dtLog(log, tblk, lrd, tlck); + break; + + case tlckINODE: + diLog(log, tblk, lrd, tlck, cd); + break; + + case tlckMAP: + mapLog(log, tblk, lrd, tlck); + break; + + case tlckDATA: + dataLog(log, tblk, lrd, tlck); + break; + + default: + jfs_err("UFO tlock:0x%p", tlck); + } + } + + return rc; +} + +/* + * diLog() + * + * function: log inode tlock and format maplock to update bmap; + */ +static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck, struct commit * cd) +{ + int rc = 0; + struct metapage *mp; + pxd_t *pxd; + struct pxd_lock *pxdlock; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_INODE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* + * inode after image + */ + if (tlck->type & tlckENTRY) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else if (tlck->type & tlckFREE) { + /* + * free inode extent + * + * (pages of the freed inode extent have been invalidated and + * a maplock for free of the extent has been formatted at + * txLock() time); + * + * the tlock had been acquired on the inode allocation map page + * (iag) that specifies the freed extent, even though the map + * page is not itself logged, to prevent pageout of the map + * page before the log; + */ + + /* log LOG_NOREDOINOEXT of the freed inode extent for + * logredo() to start NoRedoPage filters, and to update + * imap and bmap for free of the extent; + */ + lrd->type = cpu_to_le16(LOG_NOREDOINOEXT); + /* + * For the LOG_NOREDOINOEXT record, we need + * to pass the IAG number and inode extent + * index (within that IAG) from which the + * the extent being released. These have been + * passed to us in the iplist[1] and iplist[2]. + */ + lrd->log.noredoinoext.iagnum = + cpu_to_le32((u32) (size_t) cd->iplist[1]); + lrd->log.noredoinoext.inoext_idx = + cpu_to_le32((u32) (size_t) cd->iplist[2]); + + pxdlock = (struct pxd_lock *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else + jfs_err("diLog: UFO type tlck:0x%p", tlck); +#ifdef _JFS_WIP + /* + * alloc/free external EA extent + * + * a maplock for txUpdateMap() to update bPWMAP for alloc/free + * of the extent has been formatted at txLock() time; + */ + else { + assert(tlck->type & tlckEA); + + /* log LOG_UPDATEMAP for logredo() to update bmap for + * alloc of new (and free of old) external EA extent; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (struct pxd_lock *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +#endif /* _JFS_WIP */ + + return rc; +} + +/* + * dataLog() + * + * function: log data tlock + */ +static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct metapage *mp; + pxd_t *pxd; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DATA); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + + if (jfs_dirtable_inline(tlck->ip)) { + /* + * The table has been truncated, we've must have deleted + * the last entry, so don't bother logging this + */ + mp->lid = 0; + grab_metapage(mp); + metapage_homeok(mp); + discard_metapage(mp); + tlck->mp = NULL; + return 0; + } + + PXDaddress(pxd, mp->index); + PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); + + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return 0; +} + +/* + * dtLog() + * + * function: log dtree tlock and format maplock to update bmap; + */ +static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct metapage *mp; + struct pxd_lock *pxdlock; + pxd_t *pxd; + + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + + /* + * page extension via relocation: entry insertion; + * page extension in-place: entry insertion; + * new right page from page split, reinitialized in-line + * root from root page split: entry insertion; + */ + if (tlck->type & (tlckNEW | tlckEXTEND)) { + /* log after-image of the new page for logredo(): + * mark log (LOG_NEW) for logredo() to initialize + * freelist and update bmap for alloc of the new page; + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + if (tlck->type & tlckEXTEND) + lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); + else + lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP for + * alloc of the new page; + */ + if (tlck->type & tlckBTROOT) + return; + tlck->flag |= tlckUPDATEMAP; + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckALLOCPXD; + pxdlock->pxd = *pxd; + + pxdlock->index = 1; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * entry insertion/deletion, + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckENTRY | tlckRELINK)) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * page deletion: page has been invalidated + * page relocation: source extent + * + * a maplock for free of the page has been formatted + * at txLock() time); + */ + if (tlck->type & (tlckFREE | tlckRELOCATE)) { + /* log LOG_NOREDOPAGE of the deleted page for logredo() + * to start NoRedoPage filter and to update bmap for free + * of the deletd page + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (struct pxd_lock *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + } + return; +} + +/* + * xtLog() + * + * function: log xtree tlock and format maplock to update bmap; + */ +static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct inode *ip; + struct metapage *mp; + xtpage_t *p; + struct xtlock *xtlck; + struct maplock *maplock; + struct xdlistlock *xadlock; + struct pxd_lock *pxdlock; + pxd_t *page_pxd; + int next, lwm, hwm; + + ip = tlck->ip; + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_XTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE); + + page_pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) { + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + p = &JFS_IP(ip)->i_xtroot; + if (S_ISDIR(ip->i_mode)) + lrd->log.redopage.type |= + cpu_to_le16(LOG_DIR_XTREE); + } else + p = (xtpage_t *) mp->data; + next = le16_to_cpu(p->header.nextindex); + + xtlck = (struct xtlock *) & tlck->lock; + + maplock = (struct maplock *) & tlck->lock; + xadlock = (struct xdlistlock *) maplock; + + /* + * entry insertion/extension; + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { + /* log after-image for logredo(): + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. + */ + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + + if (lwm == next) + goto out; + if (lwm > next) { + jfs_err("xtLog: lwm > next\n"); + goto out; + } + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { + int i; + pxd_t *pxd; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + * + * We can fit twice as may pxd's as xads in the lock + */ + xadlock->flag = mlckALLOCPXDLIST; + pxd = xadlock->xdlist = &xtlck->pxdlock; + for (i = 0; i < xadlock->count; i++) { + PXDaddress(pxd, addressXAD(&p->xad[lwm + i])); + PXDlength(pxd, lengthXAD(&p->xad[lwm + i])); + p->xad[lwm + i].flag &= + ~(XAD_NEW | XAD_EXTENDED); + pxd++; + } + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->flag = mlckALLOCXADLIST; + xadlock->xdlist = &p->xad[lwm]; + tblk->xflag &= ~COMMIT_LAZY; + } + jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d " + "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count); + + maplock->index = 1; + + out: + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return; + } + + /* + * page deletion: file deletion/truncation (ref. xtTruncate()) + * + * (page will be invalidated after log is written and bmap + * is updated from the page); + */ + if (tlck->type & tlckFREE) { + /* LOG_NOREDOPAGE log for NoRedoPage filter: + * if page free from file delete, NoRedoFile filter from + * inode image of zero link count will subsume NoRedoPage + * filters for each page; + * if page free from file truncattion, write NoRedoPage + * filter; + * + * upadte of block allocation map for the page itself: + * if page free from deletion and truncation, LOG_UPDATEMAP + * log for the page itself is generated from processing + * its parent page xad entries; + */ + /* if page free from file truncation, log LOG_NOREDOPAGE + * of the deleted page for logredo() to start NoRedoPage + * filter for the page; + */ + if (tblk->xflag & COMMIT_TRUNCATE) { + /* write NOREDOPAGE for the page */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb-> + s_blocksize_bits); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + if (tlck->type & tlckBTROOT) { + /* Empty xtree must be logged */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + } + + /* init LOG_UPDATEMAP of the freed extents + * XAD[XTENTRYSTART:hwm) from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); + xtlck = (struct xtlock *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - XTENTRYSTART + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = XTENTRYSTART; + xtlck->header.length = hwm - XTENTRYSTART + 1; + xtlck->index = 1; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[XTENTRYSTART:hwm) from the + * deleted page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->count = hwm - XTENTRYSTART + 1; + if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { + int i; + pxd_t *pxd; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + * + * We can fit twice as may pxd's as xads in the lock + */ + xadlock->flag = mlckFREEPXDLIST; + pxd = xadlock->xdlist = &xtlck->pxdlock; + for (i = 0; i < xadlock->count; i++) { + PXDaddress(pxd, + addressXAD(&p->xad[XTENTRYSTART + i])); + PXDlength(pxd, + lengthXAD(&p->xad[XTENTRYSTART + i])); + pxd++; + } + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->flag = mlckFREEXADLIST; + xadlock->xdlist = &p->xad[XTENTRYSTART]; + tblk->xflag &= ~COMMIT_LAZY; + } + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2", + tlck->ip, mp, xadlock->count); + + maplock->index = 1; + + /* mark page as invalid */ + if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode)) + && !(tlck->type & tlckBTROOT)) + tlck->flag |= tlckFREEPAGE; + /* + else (tblk->xflag & COMMIT_PMAP) + ? release the page; + */ + return; + } + + /* + * page/entry truncation: file truncation (ref. xtTruncate()) + * + * |----------+------+------+---------------| + * | | | + * | | hwm - hwm before truncation + * | next - truncation point + * lwm - lwm before truncation + * header ? + */ + if (tlck->type & tlckTRUNCATE) { + /* This odd declaration suppresses a bogus gcc warning */ + pxd_t pxd = pxd; /* truncated extent of xad */ + int twm; + + /* + * For truncation the entire linelock may be used, so it would + * be difficult to store xad list in linelock itself. + * Therefore, we'll just force transaction to be committed + * synchronously, so that xtree pages won't be changed before + * txUpdateMap runs. + */ + tblk->xflag &= ~COMMIT_LAZY; + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + hwm = xtlck->hwm.offset; + twm = xtlck->twm.offset; + + /* + * write log records + */ + /* log after-image for logredo(): + * + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* + * truncate entry XAD[twm == next - 1]: + */ + if (twm == next - 1) { + /* init LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated delta extent of the truncated + * entry XAD[next - 1]: + * (xtlck->pxdlock = truncated delta extent); + */ + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + /* assert(pxdlock->type & tlckTRUNCATE); */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + pxd = pxdlock->pxd; /* save to format maplock */ + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* init LOG_UPDATEMAP of the freed extents + * XAD[next:hwm] from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEXADLIST); + xtlck = (struct xtlock *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - next + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = next; + xtlck->header.length = hwm - next + 1; + xtlck->index = 1; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + + /* + * format maplock(s) for txUpdateMap() to update bmap + */ + maplock->index = 0; + + /* + * allocate entries XAD[lwm:next): + */ + if (lwm < next) { + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + xadlock->xdlist = &p->xad[lwm]; + + jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d " + "lwm:%d next:%d", + tlck->ip, mp, xadlock->count, lwm, next); + maplock->index++; + xadlock++; + } + + /* + * truncate entry XAD[twm == next - 1]: + */ + if (twm == next - 1) { + /* format a maplock for txUpdateMap() to update bmap + * to free truncated delta extent of the truncated + * entry XAD[next - 1]; + * (xtlck->pxdlock = truncated delta extent); + */ + tlck->flag |= tlckUPDATEMAP; + pxdlock = (struct pxd_lock *) xadlock; + pxdlock->flag = mlckFREEPXD; + pxdlock->count = 1; + pxdlock->pxd = pxd; + + jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d " + "hwm:%d", ip, mp, pxdlock->count, hwm); + maplock->index++; + xadlock++; + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[next:hwm] from thedeleted + * page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckFREEXADLIST; + xadlock->count = hwm - next + 1; + xadlock->xdlist = &p->xad[next]; + + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d " + "next:%d hwm:%d", + tlck->ip, mp, xadlock->count, next, hwm); + maplock->index++; + } + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } + return; +} + +/* + * mapLog() + * + * function: log from maplock of freed data extents; + */ +static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct pxd_lock *pxdlock; + int i, nlock; + pxd_t *pxd; + + /* + * page relocation: free the source page extent + * + * a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time saving the src + * relocated page address; + */ + if (tlck->type & tlckRELOCATE) { + /* log LOG_NOREDOPAGE of the old relocated page + * for logredo() to start NoRedoPage filter; + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxd = &lrd->log.redopage.pxd; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* (N.B. currently, logredo() does NOT update bmap + * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE); + * if page free from relocation, LOG_UPDATEMAP log is + * specifically generated now for logredo() + * to update bmap for free of src relocated page; + * (new flag LOG_RELOCATE may be introduced which will + * inform logredo() to start NORedoPage filter and also + * update block allocation map at the same time, thus + * avoiding an extra log write); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + return; + } + /* + + * Otherwise it's not a relocate request + * + */ + else { + /* log LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated/relocated delta extent of the data; + * e.g.: external EA extent, relocated/truncated extent + * from xtTailgate(); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (struct pxd_lock *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + jfs_info("mapLog: xaddr:0x%lx xlen:0x%x", + (ulong) addressPXD(&pxdlock->pxd), + lengthPXD(&pxdlock->pxd)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +} + +/* + * txEA() + * + * function: acquire maplock for EA/ACL extents or + * set COMMIT_INLINE flag; + */ +void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) +{ + struct tlock *tlck = NULL; + struct pxd_lock *maplock = NULL, *pxdlock = NULL; + + /* + * format maplock for alloc of new EA extent + */ + if (newea) { + /* Since the newea could be a completely zeroed entry we need to + * check for the two flags which indicate we should actually + * commit new EA data + */ + if (newea->flag & DXD_EXTENT) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (struct pxd_lock *) & tlck->lock; + pxdlock = (struct pxd_lock *) maplock; + pxdlock->flag = mlckALLOCPXD; + PXDaddress(&pxdlock->pxd, addressDXD(newea)); + PXDlength(&pxdlock->pxd, lengthDXD(newea)); + pxdlock++; + maplock->index = 1; + } else if (newea->flag & DXD_INLINE) { + tlck = NULL; + + set_cflag(COMMIT_Inlineea, ip); + } + } + + /* + * format maplock for free of old EA extent + */ + if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) { + if (tlck == NULL) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (struct pxd_lock *) & tlck->lock; + pxdlock = (struct pxd_lock *) maplock; + maplock->index = 0; + } + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressDXD(oldea)); + PXDlength(&pxdlock->pxd, lengthDXD(oldea)); + maplock->index++; + } +} + +/* + * txForce() + * + * function: synchronously write pages locked by transaction + * after txLog() but before txUpdateMap(); + */ +static void txForce(struct tblock * tblk) +{ + struct tlock *tlck; + lid_t lid, next; + struct metapage *mp; + + /* + * reverse the order of transaction tlocks in + * careful update order of address index pages + * (right to left, bottom up) + */ + tlck = lid_to_tlock(tblk->next); + lid = tlck->next; + tlck->next = 0; + while (lid) { + tlck = lid_to_tlock(lid); + next = tlck->next; + tlck->next = tblk->next; + tblk->next = lid; + lid = next; + } + + /* + * synchronously write the page, and + * hold the page for txUpdateMap(); + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + if (tlck->flag & tlckWRITEPAGE) { + tlck->flag &= ~tlckWRITEPAGE; + + /* do not release page to freelist */ + force_metapage(mp); +#if 0 + /* + * The "right" thing to do here is to + * synchronously write the metadata. + * With the current implementation this + * is hard since write_metapage requires + * us to kunmap & remap the page. If we + * have tlocks pointing into the metadata + * pages, we don't want to do this. I think + * we can get by with synchronously writing + * the pages when they are released. + */ + assert(mp->nohomeok); + set_bit(META_dirty, &mp->flag); + set_bit(META_sync, &mp->flag); +#endif + } + } + } +} + +/* + * txUpdateMap() + * + * function: update persistent allocation map (and working map + * if appropriate); + * + * parameter: + */ +static void txUpdateMap(struct tblock * tblk) +{ + struct inode *ip; + struct inode *ipimap; + lid_t lid; + struct tlock *tlck; + struct maplock *maplock; + struct pxd_lock pxdlock; + int maptype; + int k, nlock; + struct metapage *mp = NULL; + + ipimap = JFS_SBI(tblk->sb)->ipimap; + + maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP; + + + /* + * update block allocation map + * + * update allocation state in pmap (and wmap) and + * update lsn of the pmap page; + */ + /* + * scan each tlock/page of transaction for block allocation/free: + * + * for each tlock/page of transaction, update map. + * ? are there tlock for pmap and pwmap at the same time ? + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + if ((tlck->flag & tlckUPDATEMAP) == 0) + continue; + + if (tlck->flag & tlckFREEPAGE) { + /* + * Another thread may attempt to reuse freed space + * immediately, so we want to get rid of the metapage + * before anyone else has a chance to get it. + * Lock metapage, update maps, then invalidate + * the metapage. + */ + mp = tlck->mp; + ASSERT(mp->xflag & COMMIT_PAGE); + grab_metapage(mp); + } + + /* + * extent list: + * . in-line PXD list: + * . out-of-line XAD list: + */ + maplock = (struct maplock *) & tlck->lock; + nlock = maplock->index; + + for (k = 0; k < nlock; k++, maplock++) { + /* + * allocate blocks in persistent map: + * + * blocks have been allocated from wmap at alloc time; + */ + if (maplock->flag & mlckALLOC) { + txAllocPMap(ipimap, maplock, tblk); + } + /* + * free blocks in persistent and working map: + * blocks will be freed in pmap and then in wmap; + * + * ? tblock specifies the PMAP/PWMAP based upon + * transaction + * + * free blocks in persistent map: + * blocks will be freed from wmap at last reference + * release of the object for regular files; + * + * Alway free blocks from both persistent & working + * maps for directories + */ + else { /* (maplock->flag & mlckFREE) */ + + if (tlck->flag & tlckDIRECTORY) + txFreeMap(ipimap, maplock, + tblk, COMMIT_PWMAP); + else + txFreeMap(ipimap, maplock, + tblk, maptype); + } + } + if (tlck->flag & tlckFREEPAGE) { + if (!(tblk->flag & tblkGC_LAZY)) { + /* This is equivalent to txRelease */ + ASSERT(mp->lid == lid); + tlck->mp->lid = 0; + } + assert(mp->nohomeok == 1); + metapage_homeok(mp); + discard_metapage(mp); + tlck->mp = NULL; + } + } + /* + * update inode allocation map + * + * update allocation state in pmap and + * update lsn of the pmap page; + * update in-memory inode flag/state + * + * unlock mapper/write lock + */ + if (tblk->xflag & COMMIT_CREATE) { + diUpdatePMap(ipimap, tblk->ino, false, tblk); + /* update persistent block allocation map + * for the allocation of inode extent; + */ + pxdlock.flag = mlckALLOCPXD; + pxdlock.pxd = tblk->u.ixpxd; + pxdlock.index = 1; + txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk); + } else if (tblk->xflag & COMMIT_DELETE) { + ip = tblk->u.ip; + diUpdatePMap(ipimap, ip->i_ino, true, tblk); + iput(ip); + } +} + +/* + * txAllocPMap() + * + * function: allocate from persistent map; + * + * parameter: + * ipbmap - + * malock - + * xad list: + * pxd: + * + * maptype - + * allocate from persistent map; + * free from persistent map; + * (e.g., tmp file - free from working map at releae + * of last reference); + * free from persistent and working map; + * + * lsn - log sequence number; + */ +static void txAllocPMap(struct inode *ip, struct maplock * maplock, + struct tblock * tblk) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct xdlistlock *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + struct pxd_lock *pxdlock; + struct xdlistlock *pxdlistlock; + pxd_t *pxd; + int n; + + /* + * allocate from persistent map; + */ + if (maplock->flag & mlckALLOCXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, false, xaddr, + (s64) xlen, tblk); + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } else if (maplock->flag & mlckALLOCPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, + tblk); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } +} + +/* + * txFreeMap() + * + * function: free from persistent and/or working map; + * + * todo: optimization + */ +void txFreeMap(struct inode *ip, + struct maplock * maplock, struct tblock * tblk, int maptype) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct xdlistlock *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + struct pxd_lock *pxdlock; + struct xdlistlock *pxdlistlock; + pxd_t *pxd; + int n; + + jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x", + tblk, maplock, maptype); + + /* + * free from persistent map; + */ + if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (!(xad->flag & XAD_NEW)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, true, xaddr, + (s64) xlen, tblk); + jfs_info("freePMap: xaddr:0x%lx " + "xlen:%d", + (ulong) xaddr, xlen); + } + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, + tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, true, xaddr, + (s64) xlen, tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } + + /* + * free from working map; + */ + if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbFree(ip, xaddr, (s64) xlen); + xad->flag = 0; + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbFree(ip, xaddr, (s64) xlen); + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckFREEPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbFree(ip, xaddr, (s64) xlen); + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } +} + +/* + * txFreelock() + * + * function: remove tlock from inode anonymous locklist + */ +void txFreelock(struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct tlock *xtlck, *tlck; + lid_t xlid = 0, lid; + + if (!jfs_ip->atlhead) + return; + + TXN_LOCK(); + xtlck = (struct tlock *) &jfs_ip->atlhead; + + while ((lid = xtlck->next) != 0) { + tlck = lid_to_tlock(lid); + if (tlck->flag & tlckFREELOCK) { + xtlck->next = tlck->next; + txLockFree(lid); + } else { + xtlck = tlck; + xlid = lid; + } + } + + if (jfs_ip->atlhead) + jfs_ip->atltail = xlid; + else { + jfs_ip->atltail = 0; + /* + * If inode was on anon_list, remove it + */ + list_del_init(&jfs_ip->anon_inode_list); + } + TXN_UNLOCK(); +} + +/* + * txAbort() + * + * function: abort tx before commit; + * + * frees line-locks and segment locks for all + * segments in comdata structure. + * Optionally sets state of file-system to FM_DIRTY in super-block. + * log age of page-frames in memory for which caller has + * are reset to 0 (to avoid logwarap). + */ +void txAbort(tid_t tid, int dirty) +{ + lid_t lid, next; + struct metapage *mp; + struct tblock *tblk = tid_to_tblock(tid); + struct tlock *tlck; + + /* + * free tlocks of the transaction + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + mp = tlck->mp; + JFS_IP(tlck->ip)->xtlid = 0; + + if (mp) { + mp->lid = 0; + + /* + * reset lsn of page to avoid logwarap: + * + * (page may have been previously committed by another + * transaction(s) but has not been paged, i.e., + * it may be on logsync list even though it has not + * been logged for the current tx.) + */ + if (mp->xflag & COMMIT_PAGE && mp->lsn) + LogSyncRelease(mp); + } + /* insert tlock at head of freelist */ + TXN_LOCK(); + txLockFree(lid); + TXN_UNLOCK(); + } + + /* caller will free the transaction block */ + + tblk->next = tblk->last = 0; + + /* + * mark filesystem dirty + */ + if (dirty) + jfs_error(tblk->sb, "\n"); + + return; +} + +/* + * txLazyCommit(void) + * + * All transactions except those changing ipimap (COMMIT_FORCE) are + * processed by this routine. This insures that the inode and block + * allocation maps are updated in order. For synchronous transactions, + * let the user thread finish processing after txUpdateMap() is called. + */ +static void txLazyCommit(struct tblock * tblk) +{ + struct jfs_log *log; + + while (((tblk->flag & tblkGC_READY) == 0) && + ((tblk->flag & tblkGC_UNLOCKED) == 0)) { + /* We must have gotten ahead of the user thread + */ + jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk); + yield(); + } + + jfs_info("txLazyCommit: processing tblk 0x%p", tblk); + + txUpdateMap(tblk); + + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) + log->gcrtc--; + + wake_up_all(&tblk->gcwait); // LOGGC_WAKEUP + + /* + * Can't release log->gclock until we've tested tblk->flag + */ + if (tblk->flag & tblkGC_LAZY) { + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + txUnlock(tblk); + tblk->flag &= ~tblkGC_LAZY; + txEnd(tblk - TxBlock); /* Convert back to tid */ + } else + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + + jfs_info("txLazyCommit: done: tblk = 0x%p", tblk); +} + +/* + * jfs_lazycommit(void) + * + * To be run as a kernel daemon. If lbmIODone is called in an interrupt + * context, or where blocking is not wanted, this routine will process + * committed transactions from the unlock queue. + */ +int jfs_lazycommit(void *arg) +{ + int WorkDone; + struct tblock *tblk; + unsigned long flags; + struct jfs_sb_info *sbi; + + do { + LAZY_LOCK(flags); + jfs_commit_thread_waking = 0; /* OK to wake another thread */ + while (!list_empty(&TxAnchor.unlock_queue)) { + WorkDone = 0; + list_for_each_entry(tblk, &TxAnchor.unlock_queue, + cqueue) { + + sbi = JFS_SBI(tblk->sb); + /* + * For each volume, the transactions must be + * handled in order. If another commit thread + * is handling a tblk for this superblock, + * skip it + */ + if (sbi->commit_state & IN_LAZYCOMMIT) + continue; + + sbi->commit_state |= IN_LAZYCOMMIT; + WorkDone = 1; + + /* + * Remove transaction from queue + */ + list_del(&tblk->cqueue); + + LAZY_UNLOCK(flags); + txLazyCommit(tblk); + LAZY_LOCK(flags); + + sbi->commit_state &= ~IN_LAZYCOMMIT; + /* + * Don't continue in the for loop. (We can't + * anyway, it's unsafe!) We want to go back to + * the beginning of the list. + */ + break; + } + + /* If there was nothing to do, don't continue */ + if (!WorkDone) + break; + } + /* In case a wakeup came while all threads were active */ + jfs_commit_thread_waking = 0; + + if (freezing(current)) { + LAZY_UNLOCK(flags); + try_to_freeze(); + } else { + DECLARE_WAITQUEUE(wq, current); + + add_wait_queue(&jfs_commit_thread_wait, &wq); + set_current_state(TASK_INTERRUPTIBLE); + LAZY_UNLOCK(flags); + schedule(); + remove_wait_queue(&jfs_commit_thread_wait, &wq); + } + } while (!kthread_should_stop()); + + if (!list_empty(&TxAnchor.unlock_queue)) + jfs_err("jfs_lazycommit being killed w/pending transactions!"); + else + jfs_info("jfs_lazycommit being killed\n"); + return 0; +} + +void txLazyUnlock(struct tblock * tblk) +{ + unsigned long flags; + + LAZY_LOCK(flags); + + list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue); + /* + * Don't wake up a commit thread if there is already one servicing + * this superblock, or if the last one we woke up hasn't started yet. + */ + if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) && + !jfs_commit_thread_waking) { + jfs_commit_thread_waking = 1; + wake_up(&jfs_commit_thread_wait); + } + LAZY_UNLOCK(flags); +} + +static void LogSyncRelease(struct metapage * mp) +{ + struct jfs_log *log = mp->log; + + assert(mp->nohomeok); + assert(log); + metapage_homeok(mp); +} + +/* + * txQuiesce + * + * Block all new transactions and push anonymous transactions to + * completion + * + * This does almost the same thing as jfs_sync below. We don't + * worry about deadlocking when jfs_tlocks_low is set, since we would + * expect jfs_sync to get us out of that jam. + */ +void txQuiesce(struct super_block *sb) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + struct jfs_log *log = JFS_SBI(sb)->log; + tid_t tid; + + set_bit(log_QUIESCE, &log->flag); + + TXN_LOCK(); +restart: + while (!list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = &jfs_ip->vfs_inode; + + /* + * inode will be removed from anonymous list + * when it is committed + */ + TXN_UNLOCK(); + tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE); + mutex_lock(&jfs_ip->commit_mutex); + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&jfs_ip->commit_mutex); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + cond_resched(); + TXN_LOCK(); + } + + /* + * If jfs_sync is running in parallel, there could be some inodes + * on anon_list2. Let's check. + */ + if (!list_empty(&TxAnchor.anon_list2)) { + list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + goto restart; + } + TXN_UNLOCK(); + + /* + * We may need to kick off the group commit + */ + jfs_flush_journal(log, 0); +} + +/* + * txResume() + * + * Allows transactions to start again following txQuiesce + */ +void txResume(struct super_block *sb) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + + clear_bit(log_QUIESCE, &log->flag); + TXN_WAKEUP(&log->syncwait); +} + +/* + * jfs_sync(void) + * + * To be run as a kernel daemon. This is awakened when tlocks run low. + * We write any inodes that have anonymous tlocks so they will become + * available. + */ +int jfs_sync(void *arg) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + tid_t tid; + + do { + /* + * write each inode on the anonymous inode list + */ + TXN_LOCK(); + while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = &jfs_ip->vfs_inode; + + if (! igrab(ip)) { + /* + * Inode is being freed + */ + list_del_init(&jfs_ip->anon_inode_list); + } else if (mutex_trylock(&jfs_ip->commit_mutex)) { + /* + * inode will be removed from anonymous list + * when it is committed + */ + TXN_UNLOCK(); + tid = txBegin(ip->i_sb, COMMIT_INODE); + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&jfs_ip->commit_mutex); + + iput(ip); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + cond_resched(); + TXN_LOCK(); + } else { + /* We can't get the commit mutex. It may + * be held by a thread waiting for tlock's + * so let's not block here. Save it to + * put back on the anon_list. + */ + + /* Move from anon_list to anon_list2 */ + list_move(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list2); + + TXN_UNLOCK(); + iput(ip); + TXN_LOCK(); + } + } + /* Add anon_list2 back to anon_list */ + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); + + if (freezing(current)) { + TXN_UNLOCK(); + try_to_freeze(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + TXN_UNLOCK(); + schedule(); + } + } while (!kthread_should_stop()); + + jfs_info("jfs_sync being killed"); + return 0; +} + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) +static int jfs_txanchor_proc_show(struct seq_file *m, void *v) +{ + char *freewait; + char *freelockwait; + char *lowlockwait; + + freewait = + waitqueue_active(&TxAnchor.freewait) ? "active" : "empty"; + freelockwait = + waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty"; + lowlockwait = + waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; + + seq_printf(m, + "JFS TxAnchor\n" + "============\n" + "freetid = %d\n" + "freewait = %s\n" + "freelock = %d\n" + "freelockwait = %s\n" + "lowlockwait = %s\n" + "tlocksInUse = %d\n" + "jfs_tlocks_low = %d\n" + "unlock_queue is %sempty\n", + TxAnchor.freetid, + freewait, + TxAnchor.freelock, + freelockwait, + lowlockwait, + TxAnchor.tlocksInUse, + jfs_tlocks_low, + list_empty(&TxAnchor.unlock_queue) ? "" : "not "); + return 0; +} + +static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txanchor_proc_show, NULL); +} + +const struct file_operations jfs_txanchor_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txanchor_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) +static int jfs_txstats_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS TxStats\n" + "===========\n" + "calls to txBegin = %d\n" + "txBegin blocked by sync barrier = %d\n" + "txBegin blocked by tlocks low = %d\n" + "txBegin blocked by no free tid = %d\n" + "calls to txBeginAnon = %d\n" + "txBeginAnon blocked by sync barrier = %d\n" + "txBeginAnon blocked by tlocks low = %d\n" + "calls to txLockAlloc = %d\n" + "tLockAlloc blocked by no free lock = %d\n", + TxStat.txBegin, + TxStat.txBegin_barrier, + TxStat.txBegin_lockslow, + TxStat.txBegin_freetid, + TxStat.txBeginAnon, + TxStat.txBeginAnon_barrier, + TxStat.txBeginAnon_lockslow, + TxStat.txLockAlloc, + TxStat.txLockAlloc_freelock); + return 0; +} + +static int jfs_txstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txstats_proc_show, NULL); +} + +const struct file_operations jfs_txstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/kernel/fs/jfs/jfs_txnmgr.h b/kernel/fs/jfs/jfs_txnmgr.h new file mode 100644 index 000000000..ab7288937 --- /dev/null +++ b/kernel/fs/jfs/jfs_txnmgr.h @@ -0,0 +1,311 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_TXNMGR +#define _H_JFS_TXNMGR + +#include "jfs_logmgr.h" + +/* + * Hide implementation of TxBlock and TxLock + */ +#define tid_to_tblock(tid) (&TxBlock[tid]) + +#define lid_to_tlock(lid) (&TxLock[lid]) + +/* + * transaction block + */ +struct tblock { + /* + * tblock and jbuf_t common area: struct logsyncblk + * + * the following 5 fields are the same as struct logsyncblk + * which is common to tblock and jbuf to form logsynclist + */ + u16 xflag; /* tx commit type */ + u16 flag; /* tx commit state */ + lid_t dummy; /* Must keep structures common */ + s32 lsn; /* recovery lsn */ + struct list_head synclist; /* logsynclist link */ + + /* lock management */ + struct super_block *sb; /* super block */ + lid_t next; /* index of first tlock of tid */ + lid_t last; /* index of last tlock of tid */ + wait_queue_head_t waitor; /* tids waiting on this tid */ + + /* log management */ + u32 logtid; /* log transaction id */ + + /* commit management */ + struct list_head cqueue; /* commit queue list */ + s32 clsn; /* commit lsn */ + struct lbuf *bp; + s32 pn; /* commit record log page number */ + s32 eor; /* commit record eor */ + wait_queue_head_t gcwait; /* group commit event list: + * ready transactions wait on this + * event for group commit completion. + */ + union { + struct inode *ip; /* inode being deleted */ + pxd_t ixpxd; /* pxd of inode extent for created inode */ + } u; + u32 ino; /* inode number being created */ +}; + +extern struct tblock *TxBlock; /* transaction block table */ + +/* commit flags: tblk->xflag */ +#define COMMIT_SYNC 0x0001 /* synchronous commit */ +#define COMMIT_FORCE 0x0002 /* force pageout at end of commit */ +#define COMMIT_FLUSH 0x0004 /* init flush at end of commit */ +#define COMMIT_MAP 0x00f0 +#define COMMIT_PMAP 0x0010 /* update pmap */ +#define COMMIT_WMAP 0x0020 /* update wmap */ +#define COMMIT_PWMAP 0x0040 /* update pwmap */ +#define COMMIT_FREE 0x0f00 +#define COMMIT_DELETE 0x0100 /* inode delete */ +#define COMMIT_TRUNCATE 0x0200 /* file truncation */ +#define COMMIT_CREATE 0x0400 /* inode create */ +#define COMMIT_LAZY 0x0800 /* lazy commit */ +#define COMMIT_PAGE 0x1000 /* Identifies element as metapage */ +#define COMMIT_INODE 0x2000 /* Identifies element as inode */ + +/* group commit flags tblk->flag: see jfs_logmgr.h */ + +/* + * transaction lock + */ +struct tlock { + lid_t next; /* 2: index next lockword on tid locklist + * next lockword on freelist + */ + tid_t tid; /* 2: transaction id holding lock */ + + u16 flag; /* 2: lock control */ + u16 type; /* 2: log type */ + + struct metapage *mp; /* 4/8: object page buffer locked */ + struct inode *ip; /* 4/8: object */ + /* (16) */ + + s16 lock[24]; /* 48: overlay area */ +}; /* (64) */ + +extern struct tlock *TxLock; /* transaction lock table */ + +/* + * tlock flag + */ +/* txLock state */ +#define tlckPAGELOCK 0x8000 +#define tlckINODELOCK 0x4000 +#define tlckLINELOCK 0x2000 +#define tlckINLINELOCK 0x1000 +/* lmLog state */ +#define tlckLOG 0x0800 +/* updateMap state */ +#define tlckUPDATEMAP 0x0080 +#define tlckDIRECTORY 0x0040 +/* freeLock state */ +#define tlckFREELOCK 0x0008 +#define tlckWRITEPAGE 0x0004 +#define tlckFREEPAGE 0x0002 + +/* + * tlock type + */ +#define tlckTYPE 0xfe00 +#define tlckINODE 0x8000 +#define tlckXTREE 0x4000 +#define tlckDTREE 0x2000 +#define tlckMAP 0x1000 +#define tlckEA 0x0800 +#define tlckACL 0x0400 +#define tlckDATA 0x0200 +#define tlckBTROOT 0x0100 + +#define tlckOPERATION 0x00ff +#define tlckGROW 0x0001 /* file grow */ +#define tlckREMOVE 0x0002 /* file delete */ +#define tlckTRUNCATE 0x0004 /* file truncate */ +#define tlckRELOCATE 0x0008 /* file/directory relocate */ +#define tlckENTRY 0x0001 /* directory insert/delete */ +#define tlckEXTEND 0x0002 /* directory extend in-line */ +#define tlckSPLIT 0x0010 /* splited page */ +#define tlckNEW 0x0020 /* new page from split */ +#define tlckFREE 0x0040 /* free page */ +#define tlckRELINK 0x0080 /* update sibling pointer */ + +/* + * linelock for lmLog() + * + * note: linelock and its variations are overlaid + * at tlock.lock: watch for alignment; + */ +struct lv { + u8 offset; /* 1: */ + u8 length; /* 1: */ +}; /* (2) */ + +#define TLOCKSHORT 20 +#define TLOCKLONG 28 + +struct linelock { + lid_t next; /* 2: next linelock */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + struct lv lv[20]; /* 40: */ +}; /* (48) */ + +#define dt_lock linelock + +struct xtlock { + lid_t next; /* 2: */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + struct lv header; /* 2: */ + struct lv lwm; /* 2: low water mark */ + struct lv hwm; /* 2: high water mark */ + struct lv twm; /* 2: */ + /* (16) */ + + s32 pxdlock[8]; /* 32: */ +}; /* (48) */ + + +/* + * maplock for txUpdateMap() + * + * note: maplock and its variations are overlaid + * at tlock.lock/linelock: watch for alignment; + * N.B. next field may be set by linelock, and should not + * be modified by maplock; + * N.B. index of the first pxdlock specifies index of next + * free maplock (i.e., number of maplock) in the tlock; + */ +struct maplock { + lid_t next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: next free maplock index */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + pxd_t pxd; /* 8: */ +}; /* (16): */ + +/* maplock flag */ +#define mlckALLOC 0x00f0 +#define mlckALLOCXADLIST 0x0080 +#define mlckALLOCPXDLIST 0x0040 +#define mlckALLOCXAD 0x0020 +#define mlckALLOCPXD 0x0010 +#define mlckFREE 0x000f +#define mlckFREEXADLIST 0x0008 +#define mlckFREEPXDLIST 0x0004 +#define mlckFREEXAD 0x0002 +#define mlckFREEPXD 0x0001 + +#define pxd_lock maplock + +struct xdlistlock { + lid_t next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + /* + * We need xdlist to be 64 bits (8 bytes), regardless of + * whether void * is 32 or 64 bits + */ + union { + void *_xdlist; /* pxd/xad list */ + s64 pad; /* 8: Force 64-bit xdlist size */ + } union64; +}; /* (16): */ + +#define xdlist union64._xdlist + +/* + * commit + * + * parameter to the commit manager routines + */ +struct commit { + tid_t tid; /* tid = index of tblock */ + int flag; /* flags */ + struct jfs_log *log; /* log */ + struct super_block *sb; /* superblock */ + + int nip; /* number of entries in iplist */ + struct inode **iplist; /* list of pointers to inodes */ + + /* log record descriptor on 64-bit boundary */ + struct lrd lrd; /* : log record descriptor */ +}; + +/* + * external declarations + */ +extern int jfs_tlocks_low; + +extern int txInit(void); +extern void txExit(void); +extern struct tlock *txLock(tid_t, struct inode *, struct metapage *, int); +extern struct tlock *txMaplock(tid_t, struct inode *, int); +extern int txCommit(tid_t, int, struct inode **, int); +extern tid_t txBegin(struct super_block *, int); +extern void txBeginAnon(struct super_block *); +extern void txEnd(tid_t); +extern void txAbort(tid_t, int); +extern struct linelock *txLinelock(struct linelock *); +extern void txFreeMap(struct inode *, struct maplock *, struct tblock *, int); +extern void txEA(tid_t, struct inode *, dxd_t *, dxd_t *); +extern void txFreelock(struct inode *); +extern int lmLog(struct jfs_log *, struct tblock *, struct lrd *, + struct tlock *); +extern void txQuiesce(struct super_block *); +extern void txResume(struct super_block *); +extern void txLazyUnlock(struct tblock *); +extern int jfs_lazycommit(void *); +extern int jfs_sync(void *); +#endif /* _H_JFS_TXNMGR */ diff --git a/kernel/fs/jfs/jfs_types.h b/kernel/fs/jfs/jfs_types.h new file mode 100644 index 000000000..8f602dcb5 --- /dev/null +++ b/kernel/fs/jfs/jfs_types.h @@ -0,0 +1,170 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_TYPES +#define _H_JFS_TYPES + +/* + * jfs_types.h: + * + * basic type/utility definitions + * + * note: this header file must be the 1st include file + * of JFS include list in all JFS .c file. + */ + +#include +#include + +/* + * transaction and lock id's + * + * Don't change these without carefully considering the impact on the + * size and alignment of all of the linelock variants + */ +typedef u16 tid_t; +typedef u16 lid_t; + +/* + * Almost identical to Linux's timespec, but not quite + */ +struct timestruc_t { + __le32 tv_sec; + __le32 tv_nsec; +}; + +/* + * handy + */ + +#define LEFTMOSTONE 0x80000000 +#define HIGHORDER 0x80000000u /* high order bit on */ +#define ONES 0xffffffffu /* all bit on */ + +/* + * physical xd (pxd) + * + * The leftmost 24 bits of len_addr are the extent length. + * The rightmost 8 bits of len_addr are the most signficant bits of + * the extent address + */ +typedef struct { + __le32 len_addr; + __le32 addr2; +} pxd_t; + +/* xd_t field construction */ + +static inline void PXDlength(pxd_t *pxd, __u32 len) +{ + pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) | + cpu_to_le32(len & 0xffffff); +} + +static inline void PXDaddress(pxd_t *pxd, __u64 addr) +{ + pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) | + cpu_to_le32((addr >> 32)<<24); + pxd->addr2 = cpu_to_le32(addr & 0xffffffff); +} + +/* xd_t field extraction */ +static inline __u32 lengthPXD(pxd_t *pxd) +{ + return le32_to_cpu((pxd)->len_addr) & 0xffffff; +} + +static inline __u64 addressPXD(pxd_t *pxd) +{ + __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff; + return (n << 8) + le32_to_cpu(pxd->addr2); +} + +#define MAXTREEHEIGHT 8 +/* pxd list */ +struct pxdlist { + s16 maxnpxd; + s16 npxd; + pxd_t pxd[MAXTREEHEIGHT]; +}; + + +/* + * data extent descriptor (dxd) + */ +typedef struct { + __u8 flag; /* 1: flags */ + __u8 rsrvd[3]; + __le32 size; /* 4: size in byte */ + pxd_t loc; /* 8: address and length in unit of fsblksize */ +} dxd_t; /* - 16 - */ + +/* dxd_t flags */ +#define DXD_INDEX 0x80 /* B+-tree index */ +#define DXD_INLINE 0x40 /* in-line data extent */ +#define DXD_EXTENT 0x20 /* out-of-line single extent */ +#define DXD_FILE 0x10 /* out-of-line file (inode) */ +#define DXD_CORRUPT 0x08 /* Inconsistency detected */ + +/* dxd_t field construction + */ +#define DXDlength(dxd, len) PXDlength(&(dxd)->loc, len) +#define DXDaddress(dxd, addr) PXDaddress(&(dxd)->loc, addr) +#define lengthDXD(dxd) lengthPXD(&(dxd)->loc) +#define addressDXD(dxd) addressPXD(&(dxd)->loc) +#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) +#define sizeDXD(dxd) le32_to_cpu((dxd)->size) + +/* + * directory entry argument + */ +struct component_name { + int namlen; + wchar_t *name; +}; + + +/* + * DASD limit information - stored in directory inode + */ +struct dasd { + u8 thresh; /* Alert Threshold (in percent) */ + u8 delta; /* Alert Threshold delta (in percent) */ + u8 rsrvd1; + u8 limit_hi; /* DASD limit (in logical blocks) */ + __le32 limit_lo; /* DASD limit (in logical blocks) */ + u8 rsrvd2[3]; + u8 used_hi; /* DASD usage (in logical blocks) */ + __le32 used_lo; /* DASD usage (in logical blocks) */ +}; + +#define DASDLIMIT(dasdp) \ + (((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo)) +#define setDASDLIMIT(dasdp, limit)\ +{\ + (dasdp)->limit_hi = ((u64)limit) >> 32;\ + (dasdp)->limit_lo = __cpu_to_le32(limit);\ +} +#define DASDUSED(dasdp) \ + (((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo)) +#define setDASDUSED(dasdp, used)\ +{\ + (dasdp)->used_hi = ((u64)used) >> 32;\ + (dasdp)->used_lo = __cpu_to_le32(used);\ +} + +#endif /* !_H_JFS_TYPES */ diff --git a/kernel/fs/jfs/jfs_umount.c b/kernel/fs/jfs/jfs_umount.c new file mode 100644 index 000000000..7971f3753 --- /dev/null +++ b/kernel/fs/jfs/jfs_umount.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_umount.c + * + * note: file system in transition to aggregate/fileset: + * (ref. jfs_mount.c) + * + * file system unmount is interpreted as mount of the single/only + * fileset in the aggregate and, if unmount of the last fileset, + * as unmount of the aggerate; + */ + +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_umount(vfsp, flags, crp) + * + * FUNCTION: vfs_umount() + * + * PARAMETERS: vfsp - virtual file system pointer + * flags - unmount for shutdown + * crp - credential + * + * RETURN : EBUSY - device has open files + */ +int jfs_umount(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipimap = sbi->ipimap; + struct inode *ipaimap = sbi->ipaimap; + struct inode *ipaimap2 = sbi->ipaimap2; + struct jfs_log *log; + int rc = 0; + + jfs_info("UnMount JFS: sb:0x%p", sb); + + /* + * update superblock and close log + * + * if mounted read-write and log based recovery was enabled + */ + if ((log = sbi->log)) + /* + * Wait for outstanding transactions to be written to log: + */ + jfs_flush_journal(log, 2); + + /* + * close fileset inode allocation map (aka fileset inode) + */ + diUnmount(ipimap, 0); + + diFreeSpecial(ipimap); + sbi->ipimap = NULL; + + /* + * close secondary aggregate inode allocation map + */ + ipaimap2 = sbi->ipaimap2; + if (ipaimap2) { + diUnmount(ipaimap2, 0); + diFreeSpecial(ipaimap2); + sbi->ipaimap2 = NULL; + } + + /* + * close aggregate inode allocation map + */ + ipaimap = sbi->ipaimap; + diUnmount(ipaimap, 0); + diFreeSpecial(ipaimap); + sbi->ipaimap = NULL; + + /* + * close aggregate block allocation map + */ + dbUnmount(ipbmap, 0); + + diFreeSpecial(ipbmap); + sbi->ipimap = NULL; + + /* + * Make sure all metadata makes it to disk before we mark + * the superblock as clean + */ + filemap_write_and_wait(sbi->direct_inode->i_mapping); + + /* + * ensure all file system file pages are propagated to their + * home blocks on disk (and their in-memory buffer pages are + * invalidated) BEFORE updating file system superblock state + * (to signify file system is unmounted cleanly, and thus in + * consistent state) and log superblock active file system + * list (to signify skip logredo()). + */ + if (log) { /* log = NULL if read-only mount */ + updateSuper(sb, FM_CLEAN); + + /* + * close log: + * + * remove file system from log active file system list. + */ + rc = lmLogClose(sb); + } + jfs_info("UnMount JFS Complete: rc = %d", rc); + return rc; +} + + +int jfs_umount_rw(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + + if (!log) + return 0; + + /* + * close log: + * + * remove file system from log active file system list. + */ + jfs_flush_journal(log, 2); + + /* + * Make sure all metadata makes it to disk + */ + dbSync(sbi->ipbmap); + diSync(sbi->ipimap); + + /* + * Note that we have to do this even if sync_blockdev() will + * do exactly the same a few instructions later: We can't + * mark the superblock clean before everything is flushed to + * disk. + */ + filemap_write_and_wait(sbi->direct_inode->i_mapping); + + updateSuper(sb, FM_CLEAN); + + return lmLogClose(sb); +} diff --git a/kernel/fs/jfs/jfs_unicode.c b/kernel/fs/jfs/jfs_unicode.c new file mode 100644 index 000000000..c7de6f5bb --- /dev/null +++ b/kernel/fs/jfs/jfs_unicode.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_strfromUCS() + * + * FUNCTION: Convert little-endian unicode string to character string + * + */ +int jfs_strfromUCS_le(char *to, const __le16 * from, + int len, struct nls_table *codepage) +{ + int i; + int outlen = 0; + static int warn_again = 5; /* Only warn up to 5 times total */ + int warn = !!warn_again; /* once per string */ + + if (codepage) { + for (i = 0; (i < len) && from[i]; i++) { + int charlen; + charlen = + codepage->uni2char(le16_to_cpu(from[i]), + &to[outlen], + NLS_MAX_CHARSET_SIZE); + if (charlen > 0) + outlen += charlen; + else + to[outlen++] = '?'; + } + } else { + for (i = 0; (i < len) && from[i]; i++) { + if (unlikely(le16_to_cpu(from[i]) & 0xff00)) { + to[i] = '?'; + if (unlikely(warn)) { + warn--; + warn_again--; + printk(KERN_ERR + "non-latin1 character 0x%x found in JFS file name\n", + le16_to_cpu(from[i])); + printk(KERN_ERR + "mount with iocharset=utf8 to access\n"); + } + + } + else + to[i] = (char) (le16_to_cpu(from[i])); + } + outlen = i; + } + to[outlen] = 0; + return outlen; +} + +/* + * NAME: jfs_strtoUCS() + * + * FUNCTION: Convert character string to unicode string + * + */ +static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len, + struct nls_table *codepage) +{ + int charlen; + int i; + + if (codepage) { + for (i = 0; len && *from; i++, from += charlen, len -= charlen) + { + charlen = codepage->char2uni(from, len, &to[i]); + if (charlen < 1) { + jfs_err("jfs_strtoUCS: char2uni returned %d.", + charlen); + jfs_err("charset = %s, char = 0x%x", + codepage->charset, *from); + return charlen; + } + } + } else { + for (i = 0; (i < len) && from[i]; i++) + to[i] = (wchar_t) from[i]; + } + + to[i] = 0; + return i; +} + +/* + * NAME: get_UCSname() + * + * FUNCTION: Allocate and translate to unicode string + * + */ +int get_UCSname(struct component_name * uniName, struct dentry *dentry) +{ + struct nls_table *nls_tab = JFS_SBI(dentry->d_sb)->nls_tab; + int length = dentry->d_name.len; + + if (length > JFS_NAME_MAX) + return -ENAMETOOLONG; + + uniName->name = + kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS); + + if (uniName->name == NULL) + return -ENOMEM; + + uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name, + length, nls_tab); + + if (uniName->namlen < 0) { + kfree(uniName->name); + return uniName->namlen; + } + + return 0; +} diff --git a/kernel/fs/jfs/jfs_unicode.h b/kernel/fs/jfs/jfs_unicode.h new file mode 100644 index 000000000..8f0f02cb6 --- /dev/null +++ b/kernel/fs/jfs/jfs_unicode.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_UNICODE +#define _H_JFS_UNICODE + +#include +#include +#include "jfs_types.h" + +typedef struct { + wchar_t start; + wchar_t end; + signed char *table; +} UNICASERANGE; + +extern signed char UniUpperTable[512]; +extern UNICASERANGE UniUpperRange[]; +extern int get_UCSname(struct component_name *, struct dentry *); +extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *); + +#define free_UCSname(COMP) kfree((COMP)->name) + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)); + return anchor; +} + + + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline __le16 *UniStrncpy_le(__le16 * ucs1, const __le16 * ucs2, + size_t n) +{ + __le16 *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int UniStrncmp_le(const wchar_t * ucs1, const __le16 * ucs2, + size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy_to_le: Copy length limited string with pad to little-endian + */ +static inline __le16 *UniStrncpy_to_le(__le16 * ucs1, const wchar_t * ucs2, + size_t n) +{ + __le16 *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = cpu_to_le16(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_from_le: Copy length limited string with pad from little-endian + */ +static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2, + size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t UniToupper(wchar_t uc) +{ + UNICASERANGE *rp; + + if (uc < sizeof(UniUpperTable)) { /* Latin characters */ + return uc + UniUpperTable[uc]; /* Use base tables */ + } else { + rp = UniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + + +/* + * UniStrupr: Upper case a unicode string + */ +static inline wchar_t *UniStrupr(wchar_t * upin) +{ + wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniToupper(*up); + up++; + } + return upin; /* Return input pointer */ +} + +#endif /* !_H_JFS_UNICODE */ diff --git a/kernel/fs/jfs/jfs_uniupr.c b/kernel/fs/jfs/jfs_uniupr.c new file mode 100644 index 000000000..cfe50666d --- /dev/null +++ b/kernel/fs/jfs/jfs_uniupr.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include "jfs_unicode.h" + +/* + * Latin upper case + */ +signed char UniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */ + -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */ + -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */ + 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, +}; + +/* + * Upper Case Range + */ +UNICASERANGE UniUpperRange[] = { + { 0x03a0, 0x03ce, UniCaseRangeU03a0 }, + { 0x0430, 0x045f, UniCaseRangeU0430 }, + { 0x0490, 0x04cc, UniCaseRangeU0490 }, + { 0x1e00, 0x1ffc, UniCaseRangeU1e00 }, + { 0xff40, 0xff5a, UniCaseRangeUff40 }, + { 0 } +}; diff --git a/kernel/fs/jfs/jfs_xattr.h b/kernel/fs/jfs/jfs_xattr.h new file mode 100644 index 000000000..e8d717dab --- /dev/null +++ b/kernel/fs/jfs/jfs_xattr.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef H_JFS_XATTR +#define H_JFS_XATTR + +/* + * jfs_ea_list describe the on-disk format of the extended attributes. + * I know the null-terminator is redundant since namelen is stored, but + * I am maintaining compatibility with OS/2 where possible. + */ +struct jfs_ea { + u8 flag; /* Unused? */ + u8 namelen; /* Length of name */ + __le16 valuelen; /* Length of value */ + char name[0]; /* Attribute name (includes null-terminator) */ +}; /* Value immediately follows name */ + +struct jfs_ea_list { + __le32 size; /* overall size */ + struct jfs_ea ea[0]; /* Variable length list */ +}; + +/* Macros for defining maxiumum number of bytes supported for EAs */ +#define MAXEASIZE 65535 +#define MAXEALISTSIZE MAXEASIZE + +/* + * some macros for dealing with variable length EA lists. + */ +#define EA_SIZE(ea) \ + (sizeof (struct jfs_ea) + (ea)->namelen + 1 + \ + le16_to_cpu((ea)->valuelen)) +#define NEXT_EA(ea) ((struct jfs_ea *) (((char *) (ea)) + (EA_SIZE (ea)))) +#define FIRST_EA(ealist) ((ealist)->ea) +#define EALIST_SIZE(ealist) le32_to_cpu((ealist)->size) +#define END_EALIST(ealist) \ + ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist))) + +extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *, + size_t, int); +extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t, + int); +extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); +extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); +extern int jfs_removexattr(struct dentry *, const char *); + +extern const struct xattr_handler *jfs_xattr_handlers[]; + +#ifdef CONFIG_JFS_SECURITY +extern int jfs_init_security(tid_t, struct inode *, struct inode *, + const struct qstr *); +#else +static inline int jfs_init_security(tid_t tid, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif + +#endif /* H_JFS_XATTR */ diff --git a/kernel/fs/jfs/jfs_xtree.c b/kernel/fs/jfs/jfs_xtree.c new file mode 100644 index 000000000..5ad774886 --- /dev/null +++ b/kernel/fs/jfs/jfs_xtree.c @@ -0,0 +1,3903 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * jfs_xtree.c: extent allocation descriptor B+-tree manager + */ + +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_dinode.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * xtree local flag + */ +#define XT_INSERT 0x00000001 + +/* + * xtree key/entry comparison: extent offset + * + * return: + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent + */ +#define XT_CMP(CMP, K, X, OFFSET64)\ +{\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ +} + +/* write a xad entry */ +#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ +{\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ +} + +#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) + +/* get page buffer for specified block address */ +/* ToDo: Replace this ugly macro with a function */ +#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ +do { \ + BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \ + if (!(RC)) { \ + if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \ + (le16_to_cpu((P)->header.nextindex) > \ + le16_to_cpu((P)->header.maxentry)) || \ + (le16_to_cpu((P)->header.maxentry) > \ + (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \ + jfs_error((IP)->i_sb, \ + "XT_GETPAGE: xtree page corrupt\n"); \ + BT_PUTPAGE(MP); \ + MP = NULL; \ + RC = -EIO; \ + } \ + } \ +} while (0) + +/* for consistency */ +#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) +/* xtree entry parameter descriptor */ +struct xtsplit { + struct metapage *mp; + s16 index; + u8 flag; + s64 off; + s64 addr; + int len; + struct pxdlist *pxdlist; +}; + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint search; + uint fastSearch; + uint split; +} xtStat; +#endif + + +/* + * forward references + */ +static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp, + struct btstack * btstack, int flag); + +static int xtSplitUp(tid_t tid, + struct inode *ip, + struct xtsplit * split, struct btstack * btstack); + +static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, + struct metapage ** rmpp, s64 * rbnp); + +static int xtSplitRoot(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp); + +#ifdef _STILL_TO_PORT +static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + xtpage_t * fp, struct btstack * btstack); + +static int xtSearchNode(struct inode *ip, + xad_t * xad, + int *cmpp, struct btstack * btstack, int flag); + +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); +#endif /* _STILL_TO_PORT */ + +/* + * xtLookup() + * + * function: map a single page into a physical extent; + */ +int xtLookup(struct inode *ip, s64 lstart, + s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index; + xad_t *xad; + s64 next, size, xoff, xend; + int xlen; + s64 xaddr; + + *paddr = 0; + *plen = llen; + + if (!no_check) { + /* is lookup offset beyond eof ? */ + size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + if (lstart >= size) + return 0; + } + + /* + * search for the xad entry covering the logical extent + */ +//search: + if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) { + jfs_err("xtLookup: xtSearch returned %d", rc); + return rc; + } + + /* + * compute the physical extent covering logical extent + * + * N.B. search may have failed (e.g., hole in sparse file), + * and returned the index of the next entry. + */ + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* is xad found covering start of logical extent ? + * lstart is a page start address, + * i.e., lstart cannot start in a hole; + */ + if (cmp) { + if (next) + *plen = min(next - lstart, llen); + goto out; + } + + /* + * lxd covered by xad + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xend = xoff + xlen; + xaddr = addressXAD(xad); + + /* initialize new pxd */ + *pflag = xad->flag; + *paddr = xaddr + (lstart - xoff); + /* a page must be fully covered by an xad */ + *plen = min(xend - lstart, llen); + + out: + XT_PUTPAGE(mp); + + return rc; +} + +/* + * xtSearch() + * + * function: search for the xad entry covering specified offset. + * + * parameters: + * ip - file object; + * xoff - extent offset; + * nextp - address of next extent (if any) for search miss + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, + int *cmpp, struct btstack * btstack, int flag) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* page buffer */ + xtpage_t *p; /* page */ + xad_t *xad; + int base, index, lim, btindex; + struct btframe *btsp; + int nsplit = 0; /* number of pages to split */ + s64 t64; + s64 next = 0; + + INCREMENT(xtStat.search); + + BT_CLR(btstack); + + btstack->nsplit = 0; + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* try sequential access heuristics with the previous + * access entry in target leaf page: + * once search narrowed down into the target leaf, + * key must either match an entry in the leaf or + * key entry does not exist in the tree; + */ +//fastSearch: + if ((jfs_ip->btorder & BT_SEQUENTIAL) && + (p->header.flag & BT_LEAF) && + (index = jfs_ip->btindex) < + le16_to_cpu(p->header.nextindex)) { + xad = &p->xad[index]; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* stop sequential access heuristics */ + goto binarySearch; + } else { /* (t64 + lengthXAD(xad)) <= xoff */ + + /* try next sequential entry */ + index++; + if (index < + le16_to_cpu(p->header.nextindex)) { + xad++; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* miss: key falls between + * previous and this entry + */ + *cmpp = 1; + next = t64; + goto out; + } + + /* (xoff >= t64 + lengthXAD(xad)); + * matching entry may be further out: + * stop heuristic search + */ + /* stop sequential access heuristics */ + goto binarySearch; + } + + /* (index == p->header.nextindex); + * miss: key entry does not exist in + * the target leaf/tree + */ + *cmpp = 1; + goto out; + } + + /* + * if hit, return index of the entry found, and + * if miss, where new entry with search key is + * to be inserted; + */ + out: + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == /* little-endian */ + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* update sequential access heuristics */ + jfs_ip->btindex = index; + + if (nextp) + *nextp = next; + + INCREMENT(xtStat.fastSearch); + return 0; + } + + /* well, ... full search now */ + binarySearch: + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (index == btindex || + index == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = index; + + return 0; + } + /* search hit - internal page: + * descend/search its child page + */ + if (index < le16_to_cpu(p->header.nextindex)-1) + next = offsetXAD(&p->xad[index + 1]); + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + */ + if (base < le16_to_cpu(p->header.nextindex)) + next = offsetXAD(&p->xad[base]); + /* + * search miss - leaf page: + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (base == btindex || base == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = base; + + if (nextp) + *nextp = next; + + return 0; + } + + /* + * search miss - non-leaf page: + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* update number of pages to split */ + if (p->header.nextindex == p->header.maxentry) + nsplit++; + else + nsplit = 0; + + /* push (bn, index) of the parent page/entry */ + if (BT_STACK_FULL(btstack)) { + jfs_error(ip->i_sb, "stack overrun!\n"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + +/* + * xtInsert() + * + * function: + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - + * + * return: + */ +int xtInsert(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp, + int flag) +{ + int rc = 0; + s64 xaddr, hint; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + s64 next; + struct tlock *tlck; + struct xtlock *xtlck; + + jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* This test must follow XT_GETSEARCH since mp must be valid if + * we branch to out: */ + if ((cmp == 0) || (next && (xlen > next - xoff))) { + rc = -EEXIST; + goto out; + } + + /* + * allocate data extent requested + * + * allocation hint: last xad + */ + if ((xaddr = *xaddrp) == 0) { + if (index > XTENTRYSTART) { + xad = &p->xad[index - 1]; + hint = addressXAD(xad) + lengthXAD(xad) - 1; + } else + hint = 0; + if ((rc = dquot_alloc_block(ip, xlen))) + goto out; + if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { + dquot_free_block(ip, xlen); + goto out; + } + } + + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex == le16_to_cpu(p->header.maxentry)) { + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + if (*xaddrp == 0) { + dbFree(ip, xaddr, (s64) xlen); + dquot_free_block(ip, xlen); + } + return rc; + } + + *xaddrp = xaddr; + return 0; + } + + /* + * insert the new entry into the leaf page + */ + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + + /* if insert into middle, shift right remaining entries. */ + if (index < nextindex) + memmove(&p->xad[index + 1], &p->xad[index], + (nextindex - index) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + *xaddrp = xaddr; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtSplitUp() + * + * function: + * split full pages as propagating insertion up the tree + * + * parameter: + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() + * + * return: + */ +static int +xtSplitUp(tid_t tid, + struct inode *ip, struct xtsplit * split, struct btstack * btstack) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; /* split page */ + struct metapage *rmp; + s64 rbn; /* new right page block number */ + struct metapage *rcmp; + xtpage_t *rcp; /* right child page */ + s64 rcbn; /* right child page block number */ + int skip; /* index of entry of insertion */ + int nextindex; /* next available entry index of p */ + struct btframe *parent; /* parent page entry on traverse stack */ + xad_t *xad; + s64 xaddr; + int xlen; + int nsplit; /* number of pages split */ + struct pxdlist pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *xtlck; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + /* is inode xtree root extension/inline EA area free ? */ + if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) && + (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) && + (JFS_IP(ip)->mode2 & INLINEEA)) { + sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT); + JFS_IP(ip)->mode2 &= ~INLINEEA; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + + /* if insert into middle, shift right remaining entries. */ + skip = split->index; + nextindex = le16_to_cpu(sp->header.nextindex); + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* advance next available entry index */ + le16_add_cpu(&sp->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + return 0; + } + + /* + * allocate new index blocks to cover index page split(s) + * + * allocation hint: ? + */ + if (split->pxdlist == NULL) { + nsplit = btstack->nsplit; + split->pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + xlen = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++) { + if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr)) + == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + XT_PUTPAGE(smp); + return rc; + } + } + + /* + * Split leaf page into and a new right page . + * + * The split routines insert the new entry into the leaf page, + * and acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + + XT_PUTPAGE(smp); + + if (rc) + return -EIO; + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 3 pages pinned at any time: + * right child, left parent and right parent (when the parent splits) + * to keep the child page pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame */ + + /* keep current child pages pinned */ + rcmp = rmp; + rcbn = rbn; + rcp = XT_PAGE(ip, rcmp); + + /* + * insert router entry in parent for new right child page + */ + /* get/pin the parent page */ + XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + XT_PUTPAGE(rcmp); + return rc; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * split or shift right remaining entries of the parent page + */ + nextindex = le16_to_cpu(sp->header.nextindex); + /* + * parent page is full - split the parent page + */ + if (nextindex == le16_to_cpu(sp->header.maxentry)) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->flag = XAD_NEW; + split->off = offsetXAD(&rcp->xad[XTENTRYSTART]); + split->len = JFS_SBI(ip->i_sb)->nbperpage; + split->addr = rcbn; + + /* unpin previous right child page */ + XT_PUTPAGE(rcmp); + + /* The split routines insert the new entry, + * and acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + if (rc) { + XT_PUTPAGE(smp); + return rc; + } + + XT_PUTPAGE(smp); + /* keep new child page pinned */ + } + /* + * parent page is not full - insert in parent page + */ + else { + /* + * insert router entry in parent for the right child + * page from the first entry of the right child page: + */ + /* + * acquire a transaction lock on the parent page; + * + * action: router xad insertion; + */ + BT_MARK_DIRTY(smp, ip); + + /* + * if insert into middle, shift right remaining entries + */ + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - + skip) << L2XTSLOTSIZE); + + /* insert the router entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, XAD_NEW, + offsetXAD(&rcp->xad[XTENTRYSTART]), + JFS_SBI(ip->i_sb)->nbperpage, rcbn); + + /* advance next available entry index. */ + le16_add_cpu(&sp->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, + tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin parent page */ + XT_PUTPAGE(smp); + + /* exit propagate up */ + break; + } + } + + /* unpin current right page */ + XT_PUTPAGE(rmp); + + return 0; +} + + +/* + * xtSplitPage() + * + * function: + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp, + * u64 *rbnp, + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitPage(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + xtpage_t *p; + s64 nextbn; + int skip, maxentry, middle, righthalf, n; + xad_t *xad; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *sxtlck = NULL, *rxtlck = NULL; + int quota_allocation = 0; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + INCREMENT(xtStat.split); + + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) + goto clean_up; + + quota_allocation += lengthPXD(pxd); + + /* + * allocate the new right page for the split + */ + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) { + rc = -EIO; + goto clean_up; + } + + jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * action: new page; + */ + + rp = (xtpage_t *) rmp->data; + rp->header.self = *pxd; + rp->header.flag = sp->header.flag & BT_TYPE; + rp->header.maxentry = sp->header.maxentry; /* little-endian */ + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + BT_MARK_DIRTY(smp, ip); + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + /* + * acquire a transaction lock on the new right page; + */ + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + rxtlck = (struct xtlock *) & tlck->lock; + rxtlck->lwm.offset = XTENTRYSTART; + /* + * acquire a transaction lock on the split page + */ + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + sxtlck = (struct xtlock *) & tlck->lock; + } + + /* + * initialize/update sibling pointers of and + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + skip = split->index; + + /* + * sequential append at tail (after last entry of last page) + * + * if splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * if we're wrong it's no big deal - we will do the split the right + * way next time. + * (it may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, but it's not. + * Be my guest.) + */ + if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) { + /* + * acquire a transaction lock on the new/right page; + * + * action: xad insertion; + */ + /* insert entry at the first entry of the new right page */ + xad = &rp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = 1; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return 0; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update previous pointer of old next/right page of + */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(rmp); + goto clean_up; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page; + * + * action:sibling pointer update; + */ + if (!test_cflag(COMMIT_Nolink, ip)) + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + p->header.prev = cpu_to_le64(rbn); + + /* sibling page may have been updated previously, or + * it may be updated later; + */ + + XT_PUTPAGE(mp); + } + + /* + * split the data between the split and new/right pages + */ + maxentry = le16_to_cpu(sp->header.maxentry); + middle = maxentry >> 1; + righthalf = maxentry - middle; + + /* + * skip index in old split/left page - insert into left page: + */ + if (skip <= middle) { + /* move right half of split page to the new right page */ + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + righthalf << L2XTSLOTSIZE); + + /* shift right tail of left half to make room for new entry */ + if (skip < middle) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (middle - skip) << L2XTSLOTSIZE); + + /* insert new entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle + 1); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(skip, (int)sxtlck->lwm.offset) : skip; + } + + rp->header.nextindex = + cpu_to_le16(XTENTRYSTART + righthalf); + } + /* + * skip index in new right page - insert into right page: + */ + else { + /* move left head of right half to right page */ + n = skip - middle; + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + n << L2XTSLOTSIZE); + + /* insert new entry */ + n += XTENTRYSTART; + xad = &rp->xad[n]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* move right tail of right half to right page */ + if (skip < maxentry) + memmove(&rp->xad[n + 1], &sp->xad[skip], + (maxentry - skip) << L2XTSLOTSIZE); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(middle, (int)sxtlck->lwm.offset) : middle; + } + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + + righthalf + 1); + } + + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - + sxtlck->lwm.offset; + + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return rc; + + clean_up: + + /* Rollback quota allocation. */ + if (quota_allocation) + dquot_free_block(ip, quota_allocation); + + return (rc); +} + + +/* + * xtSplitRoot() + * + * function: + * split the full root page into original/root/split page and new + * right page + * i.e., root remains fixed in tree anchor (inode) and the root is + * copied to a single new right child page since root page << + * non-root page, and the split root page contains a single entry + * for the new right child page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp) + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitRoot(tid_t tid, + struct inode *ip, struct xtsplit * split, struct metapage ** rmpp) +{ + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; + s64 rbn; + int skip, nextindex; + xad_t *xad; + pxd_t *pxd; + struct pxdlist *pxdlist; + struct tlock *tlck; + struct xtlock *xtlck; + int rc; + + sp = &JFS_IP(ip)->i_xtroot; + + INCREMENT(xtStat.split); + + /* + * allocate a single (right) child page + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); + + /* + * acquire a transaction lock on the new right page; + * + * action: new page; + */ + BT_MARK_DIRTY(rmp, ip); + + rp = (xtpage_t *) rmp->data; + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE); + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * copy the in-line root page into new right page extent + */ + nextindex = le16_to_cpu(sp->header.maxentry); + memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART], + (nextindex - XTENTRYSTART) << L2XTSLOTSIZE); + + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + skip = split->index; + /* if insert into middle, shift right remaining entries */ + if (skip != nextindex) + memmove(&rp->xad[skip + 1], &rp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + xad = &rp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); + + /* update page header */ + rp->header.nextindex = cpu_to_le16(nextindex + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + /* + * reset the root + * + * init root with the single entry for the new right page + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + */ + /* + * acquire a transaction lock on the root page (in-memory inode); + * + * action: root split; + */ + BT_MARK_DIRTY(split->mp, ip); + + xad = &sp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn); + + /* update page header of root */ + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + + sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = 1; + } + + *rmpp = rmp; + + jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp); + return 0; +} + + +/* + * xtExtend() + * + * function: extend in-place; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: alloc whole extended extent; + */ +int xtExtend(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* delta extent offset */ + s32 xlen, /* delta extent length */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, len; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + s64 xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + + jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* there must exist extent to be extended */ + if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtSearch did not find extent\n"); + return -EIO; + } + + /* extension must be contiguous */ + xad = &p->xad[index]; + if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "extension is not contiguous\n"); + return -EIO; + } + + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* extend will overflow extent ? */ + xlen = lengthXAD(xad) + xlen; + if ((len = xlen - MAXXLEN) <= 0) + goto extendOld; + + /* + * extent overflow: insert entry for new extent + */ +//insertNew: + xoff = offsetXAD(xad) + MAXXLEN; + xaddr = addressXAD(xad) + MAXXLEN; + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = len; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + } + + /* get back old entry */ + xad = &p->xad[index]; + xlen = MAXXLEN; + + /* + * extend old extent + */ + extendOld: + XADlength(xad, xlen); + if (!(xad->flag & XAD_NEW)) + xad->flag |= XAD_EXTENDED; + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + +#ifdef _NOTYET +/* + * xtTailgate() + * + * function: split existing 'tail' extent + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: free old split tail extent, alloc new extent; + */ +int xtTailgate(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* split/new extent offset */ + s32 xlen, /* new extent length */ + s64 xaddr, /* new extent address */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, llen, rlen; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + struct tlock *tlck; + struct xtlock *xtlck = 0; + struct tlock *mtlck; + struct maplock *pxdlock; + +/* +printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", + (ulong)xoff, xlen, (ulong)xaddr); +*/ + + /* there must exist extent to be tailgated */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "couldn't find extent\n"); + return -EIO; + } + + /* entry found must be last entry */ + nextindex = le16_to_cpu(p->header.nextindex); + if (index != nextindex - 1) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "the entry found is not the last entry\n"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* completely replace extent ? */ + xad = &p->xad[index]; +/* +printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); +*/ + if ((llen = xoff - offsetXAD(xad)) == 0) + goto updateOld; + + /* + * partially replace extent: insert entry for new extent + */ +//insertNew: + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + } + + /* get back old XAD */ + xad = &p->xad[index]; + + /* + * truncate/relocate old extent at split offset + */ + updateOld: + /* update dmap for old/committed/truncated extent */ + rlen = lengthXAD(xad) - llen; + if (!(xad->flag & XAD_NEW)) { + /* free from PWMAP at commit */ + if (!test_cflag(COMMIT_Nolink, ip)) { + mtlck = txMaplock(tid, ip, tlckMAP); + pxdlock = (struct maplock *) & mtlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); + PXDlength(&pxdlock->pxd, rlen); + pxdlock->index = 1; + } + } else + /* free from WMAP */ + dbFree(ip, addressXAD(xad) + llen, (s64) rlen); + + if (llen) + /* truncate */ + XADlength(xad, llen); + else + /* replace */ + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#endif /* _NOTYET */ + +/* + * xtUpdate() + * + * function: update XAD; + * + * update extent for allocated_but_not_recorded or + * compressed extent; + * + * parameter: + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; + */ +int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) +{ /* new XAD */ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index0, index, newindex, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad, *lxad, *rxad; + int xflag; + s64 nxoff, xoff; + int nxlen, xlen, lxlen, rxlen; + s64 nxaddr, xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + int newpage = 0; + + /* there must exist extent to be tailgated */ + nxoff = offsetXAD(nxad); + nxlen = lengthXAD(nxad); + nxaddr = addressXAD(nxad); + + if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "Could not find extent\n"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + xad = &p->xad[index0]; + xflag = xad->flag; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* nXAD must be completely contained within XAD */ + if ((xoff > nxoff) || + (nxoff + nxlen > xoff + xlen)) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "nXAD in not completely contained within XAD\n"); + return -EIO; + } + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + +#ifdef _JFS_WIP_NOCOALESCE + if (xoff < nxoff) + goto updateRight; + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto out; + } else /* (nxlen < xlen) */ + goto updateLeft; +#endif /* _JFS_WIP_NOCOALESCE */ + +/* #ifdef _JFS_WIP_COALESCE */ + if (xoff < nxoff) + goto coalesceRight; + + /* + * coalesce with left XAD + */ +//coalesceLeft: /* (xoff == nxoff) */ + /* is XAD first entry of page ? */ + if (index == XTENTRYSTART) + goto replace; + + /* is nXAD logically and physically contiguous with lXAD ? */ + lxad = &p->xad[index - 1]; + lxlen = lengthXAD(lxad); + if (!(lxad->flag & XAD_NOTRECORDED) && + (nxoff == offsetXAD(lxad) + lxlen) && + (nxaddr == addressXAD(lxad) + lxlen) && + (lxlen + nxlen < MAXXLEN)) { + /* extend right lXAD */ + index0 = index - 1; + XADlength(lxad, lxlen + nxlen); + + /* If we just merged two extents together, need to make sure the + * right extent gets logged. If the left one is marked XAD_NEW, + * then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(lxad->flag & XAD_NEW)) + lxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) { + /* truncate XAD */ + XADoffset(xad, xoff + nxlen); + XADlength(xad, xlen - nxlen); + XADaddress(xad, xaddr + nxlen); + goto out; + } else { /* (xlen == nxlen) */ + + /* remove XAD */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xoff = nxoff = offsetXAD(lxad); + xlen = nxlen = lxlen + nxlen; + xaddr = nxaddr = addressXAD(lxad); + goto coalesceRight; + } + } + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto coalesceRight; + } else /* (nxlen < xlen) */ + goto updateLeft; + + /* + * coalesce with right XAD + */ + coalesceRight: /* (xoff <= nxoff) */ + /* is XAD last entry of page ? */ + if (newindex == nextindex) { + if (xoff == nxoff) + goto out; + goto updateRight; + } + + /* is nXAD logically and physically contiguous with rXAD ? */ + rxad = &p->xad[index + 1]; + rxlen = lengthXAD(rxad); + if (!(rxad->flag & XAD_NOTRECORDED) && + (nxoff + nxlen == offsetXAD(rxad)) && + (nxaddr + nxlen == addressXAD(rxad)) && + (rxlen + nxlen < MAXXLEN)) { + /* extend left rXAD */ + XADoffset(rxad, nxoff); + XADlength(rxad, rxlen + nxlen); + XADaddress(rxad, nxaddr); + + /* If we just merged two extents together, need to make sure + * the left extent gets logged. If the right one is marked + * XAD_NEW, then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(rxad->flag & XAD_NEW)) + rxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) + /* truncate XAD */ + XADlength(xad, xlen - nxlen); + else { /* (xlen == nxlen) */ + + /* remove XAD */ + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + } + + goto out; + } else if (xoff == nxoff) + goto out; + + if (xoff >= nxoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xoff >= nxoff\n"); + return -EIO; + } +/* #endif _JFS_WIP_COALESCE */ + + /* + * split XAD into (lXAD, nXAD): + * + * |---nXAD---> + * --|----------XAD----------|-- + * |-lXAD-| + */ + updateRight: /* (xoff < nxoff) */ + /* truncate old XAD as lXAD:not_recorded */ + xad = &p->xad[index]; + XADlength(xad, nxoff - xoff); + + /* insert nXAD:recorded */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag & ~XAD_NOTRECORDED; + split.off = nxoff; + split.len = nxlen; + split.addr = nxaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } else { + /* is nXAD on new page ? */ + if (newindex > + (le16_to_cpu(p->header.maxentry) >> 1)) { + newindex = + newindex - + le16_to_cpu(p->header.nextindex) + + XTENTRYSTART; + newpage = 1; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* + * does nXAD force 3-way split ? + * + * |---nXAD--->| + * --|----------XAD-------------|-- + * |-lXAD-| |-rXAD -| + */ + if (nxoff + nxlen == xoff + xlen) + goto out; + + /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ + if (newpage) { + /* close out old page */ + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + bn = le64_to_cpu(p->header.next); + XT_PUTPAGE(mp); + + /* get new right page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + index0 = index = newindex; + } else + index++; + + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xlen = xlen - (nxoff - xoff); + xoff = nxoff; + xaddr = nxaddr; + + /* recompute split pages */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + XT_PUTPAGE(mp); + + if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtSearch failed\n"); + return -EIO; + } + + if (index0 != index) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "unexpected value of index\n"); + return -EIO; + } + } + + /* + * split XAD into (nXAD, rXAD) + * + * ---nXAD---| + * --|----------XAD----------|-- + * |-rXAD-| + */ + updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ + /* update old XAD with nXAD:recorded */ + xad = &p->xad[index]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* insert rXAD:not_recorded */ + xoff = xoff + nxlen; + xlen = xlen - nxlen; + xaddr = xaddr + nxlen; + if (nextindex == le16_to_cpu(p->header.maxentry)) { +/* +printf("xtUpdate.updateLeft.split p:0x%p\n", p); +*/ + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + out: + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtAppend() + * + * function: grow in append mode from contiguous region specified ; + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - + * + * return: + */ +int xtAppend(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 maxblocks, + s32 * xlenp, /* (in/out) */ + s64 * xaddrp, /* (in/out) */ + int flag) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn, xaddr; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + struct tlock *tlck; + struct xtlock *xtlck; + int nsplit, nblocks, xlen; + struct pxdlist pxdlist; + pxd_t *pxd; + s64 next; + + xaddr = *xaddrp; + xlen = *xlenp; + jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx", + (ulong) xoff, maxblocks, xlen, (ulong) xaddr); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp == 0) { + rc = -EEXIST; + goto out; + } + + if (next) + xlen = min(xlen, (int)(next - xoff)); +//insert: + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex < le16_to_cpu(p->header.maxentry)) + goto insertLeaf; + + /* + * allocate new index blocks to cover index page split(s) + */ + nsplit = btstack.nsplit; + split.pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + nblocks = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, nblocks); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + goto out; + } + + xlen = min(xlen, maxblocks); + + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + dbFree(ip, *xaddrp, (s64) * xlenp); + + return rc; + } + + *xaddrp = xaddr; + *xlenp = xlen; + return 0; + + /* + * insert the new entry into the leaf page + */ + insertLeaf: + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + + *xaddrp = xaddr; + *xlenp = xlen; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#ifdef _STILL_TO_PORT + +/* - TBD for defragmentaion/reorganization - + * + * xtDelete() + * + * function: + * delete the entry with the specified key. + * + * N.B.: whole extent of the entry is assumed to be deleted. + * + * parameter: + * + * return: + * ENOENT: if the entry is not found. + * + * exception: + */ +int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * find the matching entry; xtSearch() pins the page + */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + if (cmp) { + /* unpin the leaf page */ + XT_PUTPAGE(mp); + return -ENOENT; + } + + /* + * delete the entry from the leaf page + */ + nextindex = le16_to_cpu(p->header.nextindex); + le16_add_cpu(&p->header.nextindex, -1); + + /* + * if the leaf page bocome empty, free the page + */ + if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) + return (xtDeleteUp(tid, ip, mp, p, &btstack)); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; + + /* if delete from middle, shift left/compact the remaining entries */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) * sizeof(xad_t)); + + XT_PUTPAGE(mp); + + return 0; +} + + +/* - TBD for defragmentaion/reorganization - + * + * xtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int +xtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + s64 xaddr; + int xlen; + struct btframe *parent; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * keep root leaf page which has become empty + */ + if (fp->header.flag & BT_ROOT) { + /* keep the root page */ + fp->header.flag &= ~BT_INTERNAL; + fp->header.flag |= BT_LEAF; + fp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(fmp); */ + + return 0; + } + + /* + * free non-root leaf page + */ + if ((rc = xtRelink(tid, ip, fp))) { + XT_PUTPAGE(fmp); + return rc; + } + + xaddr = addressPXD(&fp->header.self); + xlen = lengthPXD(&fp->header.self); + /* free the page extent */ + dbFree(ip, xaddr, (s64) xlen); + + /* free the buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the index tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* get/pin the parent page */ + XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* delete the entry for the freed child page from parent. + */ + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * the parent has the single entry being deleted: + * free the parent page which has become empty. + */ + if (nextindex == 1) { + if (p->header.flag & BT_ROOT) { + /* keep the root page */ + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = + cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(mp); */ + + break; + } else { + /* free the parent page */ + if ((rc = xtRelink(tid, ip, p))) + return rc; + + xaddr = addressPXD(&p->header.self); + /* free the page extent */ + dbFree(ip, xaddr, + (s64) JFS_SBI(ip->i_sb)->nbperpage); + + /* unpin/free the buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + /* + * the parent has other entries remaining: + * delete the router entry from the parent page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + xtlck->lwm. + offset) : index; + + /* if delete from middle, + * shift left/compact the remaining entries in the page + */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + le16_add_cpu(&p->header.nextindex, -1); + jfs_info("xtDeleteUp(entry): 0x%lx[%d]", + (ulong) parent->bn, index); + } + + /* unpin the parent page */ + XT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + return 0; +} + + +/* + * NAME: xtRelocate() + * + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. + * + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. + */ +int +xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ + s64 nxaddr, /* new xaddr */ + int xtype) +{ /* extent type: XTPAGE or DATAEXT */ + int rc = 0; + struct tblock *tblk; + struct tlock *tlck; + struct xtlock *xtlck; + struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ + xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ + xad_t *xad; + pxd_t *pxd; + s64 xoff, xsize; + int xlen; + s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; + cbuf_t *cp; + s64 offset, nbytes, nbrd, pno; + int nb, npages, nblks; + s64 bn; + int cmp; + int index; + struct pxd_lock *pxdlock; + struct btstack btstack; /* traverse stack */ + + xtype = xtype & EXTENT_TYPE; + + xoff = offsetXAD(oxad); + oxaddr = addressXAD(oxad); + xlen = lengthXAD(oxad); + + /* validate extent offset */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + if (offset >= ip->i_size) + return -ESTALE; /* stale extent */ + + jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", + xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); + + /* + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; + */ + if (xtype == DATAEXT) { + /* search in leaf entry */ + rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* validate for exact match with a single entry */ + xad = &pp->xad[index]; + if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + } else { /* (xtype == XTPAGE) */ + + /* search in internal entry */ + rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* xtSearchNode() validated for exact match with a single entry + */ + xad = &pp->xad[index]; + } + jfs_info("xtRelocate: parent xad entry validated."); + + /* + * 2. relocate the extent + */ + if (xtype == DATAEXT) { + /* if the extent is allocated-but-not-recorded + * there is no real data to be moved in this extent, + */ + if (xad->flag & XAD_NOTRECORDED) + goto out; + else + /* release xtpage for cmRead()/xtLookup() */ + XT_PUTPAGE(pmp); + + /* + * cmRelocate() + * + * copy target data pages to be relocated; + * + * data extent must start at page boundary and + * multiple of page size (except the last data extent); + * read in each page of the source data extent into cbuf, + * update the cbuf extent descriptor of the page to be + * homeward bound to new dst data extent + * copy the data from the old extent to new extent. + * copy is essential for compressed files to avoid problems + * that can arise if there was a change in compression + * algorithms. + * it is a good strategy because it may disrupt cache + * policy to keep the pages in memory afterwards. + */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + assert((offset & CM_OFFSET) == 0); + nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; + pno = offset >> CM_L2BSIZE; + npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; +/* + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; +*/ + sxaddr = oxaddr; + dxaddr = nxaddr; + + /* process the request one cache buffer at a time */ + for (nbrd = 0; nbrd < nbytes; nbrd += nb, + offset += nb, pno++, npages--) { + /* compute page size */ + nb = min(nbytes - nbrd, CM_BSIZE); + + /* get the cache buffer of the page */ + if (rc = cmRead(ip, offset, npages, &cp)) + break; + + assert(addressPXD(&cp->cm_pxd) == sxaddr); + assert(!cp->cm_modified); + + /* bind buffer with the new extent address */ + nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; + cmSetXD(ip, cp, pno, dxaddr, nblks); + + /* release the cbuf, mark it as modified */ + cmPut(cp, true); + + dxaddr += nblks; + sxaddr += nblks; + } + + /* get back parent page */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("xtRelocate: target data extent relocated."); + } else { /* (xtype == XTPAGE) */ + + /* + * read in the target xtpage from the source extent; + */ + XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + if (rmp) + XT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling xtpages if any; + */ + if (lmp) { + BT_MARK_DIRTY(lmp, ip); + tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + lp->header.next = cpu_to_le64(nxaddr); + XT_PUTPAGE(lmp); + } + + if (rmp) { + BT_MARK_DIRTY(rmp, ip); + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + rp->header.prev = cpu_to_le64(nxaddr); + XT_PUTPAGE(rmp); + } + + /* + * update the target xtpage to be relocated + * + * update the self address of the target page + * and write to destination extent; + * redo image covers the whole xtpage since it is new page + * to the destination extent; + * update of bmap for the free of source extent + * of the target xtpage itself: + * update of bmap for the allocation of destination extent + * of the target xtpage itself: + * update of bmap for the extents covered by xad entries in + * the target xtpage is not necessary since they are not + * updated; + * if not committed before this relocation, + * target page may contain XAD_NEW entries which must + * be scanned for bmap update (logredo() always + * scan xtpage REDOPAGE image for bmap update); + * if committed before this relocation (tlckRELOCATE), + * scan may be skipped by commit() and logredo(); + */ + BT_MARK_DIRTY(mp, ip); + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock *) & tlck->lock; + + /* update the self address in the xtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* linelock for the after image of the whole page */ + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + + /* update the buffer extent descriptor of target xtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + bmSetXD(mp, nxaddr, xsize); + + /* unpin the target page to new homeward bound */ + XT_PUTPAGE(mp); + jfs_info("xtRelocate: target xtpage relocated."); + } + + /* + * 3. acquire maplock for the source extent to be freed; + * + * acquire a maplock saving the src relocated extent address; + * to free of the extent at commit time; + */ + out: + /* if DATAEXT relocation, write a LOG_UPDATEMAP record for + * free PXD of the source data extent (logredo() will update + * bmap for free of source data extent), and update bmap for + * free of the source data extent; + */ + if (xtype == DATAEXT) + tlck = txMaplock(tid, ip, tlckMAP); + /* if XTPAGE relocation, write a LOG_NOREDOPAGE record + * for the source xtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * xtpage), and update bmap for free of the source xtpage; + * N.B. We use tlckMAP instead of tlkcXTREE because there + * is no buffer associated with this lock since the buffer + * has been redirected to the target location. + */ + else /* (xtype == XTPAGE) */ + tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); + + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent xad entry for relocation; + * + * acquire tlck for the parent entry with XAD_NEW as entry + * update which will write LOG_REDOPAGE and update bmap for + * allocation of XAD_NEW destination extent; + */ + jfs_info("xtRelocate: update parent xad entry."); + BT_MARK_DIRTY(pmp, ip); + tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* update the XAD with the new destination extent; */ + xad = &pp->xad[index]; + xad->flag |= XAD_NEW; + XADaddress(xad, nxaddr); + + xtlck->lwm.offset = min(index, xtlck->lwm.offset); + xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - + xtlck->lwm.offset; + + /* unpin the parent xtpage */ + XT_PUTPAGE(pmp); + + return rc; +} + + +/* + * xtSearchNode() + * + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. + * + * parameters: + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ + int *cmpp, struct btstack * btstack, int flag) +{ + int rc = 0; + s64 xoff, xaddr; + int xlen; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* page */ + int base, index, lim; + struct btframe *btsp; + s64 t64; + + BT_CLR(btstack); + + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + if (p->header.flag & BT_LEAF) { + XT_PUTPAGE(mp); + return -ESTALE; + } + + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + * + * verify for exact match; + */ + if (xaddr == addressXAD(&p->xad[index]) && + xoff == offsetXAD(&p->xad[index])) { + *cmpp = cmp; + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + return 0; + } + + /* descend/search its child page */ + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss - non-leaf page: + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + + +/* + * xtRelink() + * + * function: + * link around a freed page. + * + * Parameter: + * int tid, + * struct inode *ip, + * xtpage_t *p) + * + * returns: + */ +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) +{ + int rc = 0; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update prev pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.prev = cpu_to_le64(prevbn); + + XT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update next pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.next = le64_to_cpu(nextbn); + + XT_PUTPAGE(mp); + } + + return 0; +} +#endif /* _STILL_TO_PORT */ + + +/* + * xtInitRoot() + * + * initialize file root (inline in inode) + */ +void xtInitRoot(tid_t tid, struct inode *ip) +{ + xtpage_t *p; + + /* + * acquire a transaction lock on the root + * + * action: + */ + txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag, + tlckXTREE | tlckNEW); + p = &JFS_IP(ip)->i_xtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + if (S_ISDIR(ip->i_mode)) + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); + else { + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); + ip->i_size = 0; + } + + + return; +} + + +/* + * We can run into a deadlock truncating a file with a large number of + * xtree pages (large fragmented file). A robust fix would entail a + * reservation system where we would reserve a number of metadata pages + * and tlocks which we would be guaranteed without a deadlock. Without + * this, a partial fix is to limit number of metadata pages we will lock + * in a single transaction. Currently we will truncate the file so that + * no more than 50 leaf pages will be locked. The caller of xtTruncate + * will be responsible for ensuring that the current transaction gets + * committed, and that subsequent transactions are created to truncate + * the file further if needed. + */ +#define MAX_TRUNCATE_LEAVES 50 + +/* + * xtTruncate() + * + * function: + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. + * + * parameter: + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * + * return: + * + * note: + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; + * 2. truncate index table of directory when last entry removed + * map update via tlock at commit time; + * PMAP: + * Call xtTruncate_pmap instead + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; + * + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; + * + * pages may already have been tlocked by anonymous transactions + * during file growth (i.e., write) before truncation; + * + * except last truncated entry, deleted entries remains as is + * in the page (nextindex is updated) for other use + * (e.g., log/update allocation map): this avoid copying the page + * info but delay free of pages; + * + */ +s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) +{ + int rc = 0; + s64 teof; + struct metapage *mp; + xtpage_t *p; + s64 bn; + int index, nextindex; + xad_t *xad; + s64 xoff, xaddr; + int xlen, len, freexlen; + struct btstack btstack; + struct btframe *parent; + struct tblock *tblk = NULL; + struct tlock *tlck = NULL; + struct xtlock *xtlck = NULL; + struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + s64 nfreed; + int freed, log; + int locked_leaves = 0; + + /* save object truncation type */ + if (tid) { + tblk = tid_to_tblock(tid); + tblk->xflag |= flag; + } + + nfreed = 0; + + flag &= COMMIT_MAP; + assert(flag != COMMIT_PMAP); + + if (flag == COMMIT_PWMAP) + log = 1; + else { + log = 0; + xadlock.flag = mlckFREEXADLIST; + xadlock.index = 1; + } + + /* + * if the newsize is not an integral number of pages, + * the file between newsize and next page boundary will + * be cleared. + * if truncating into a file hole, it will cause + * a full block to be allocated for the logical block. + */ + + /* + * release page blocks of truncated region + * + * free the data blocks from the leaf index blocks. + * delete the parent index entries corresponding to + * the freed child data/index blocks. + * free the index blocks themselves which aren't needed + * in new sized file. + * + * index blocks are updated only if the blocks are to be + * retained in the new sized file. + * if type is PMAP, the data and index pages are NOT + * freed, and the data and index blocks are NOT freed + * from working map. + * (this will allow continued access of data/index of + * temporary file (zerolink count file truncated to zero-length)). + */ + teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + + /* clear stack */ + BT_CLR(&btstack); + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + + /* Since this is the rightmost page at this level, and we may have + * already freed a page that was formerly to the right, let's make + * sure that the next pointer is zero. + */ + if (p->header.next) { + if (log) + /* + * Make sure this change to the header is logged. + * If we really truncate this leaf, the flag + * will be changed to tlckTRUNCATE + */ + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + BT_MARK_DIRTY(mp, ip); + p->header.next = 0; + } + + if (p->header.flag & BT_INTERNAL) + goto getChild; + + /* + * leaf page + */ + freed = 0; + + /* does region covered by leaf page precede Teof ? */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + if (teof >= xoff + xlen) { + XT_PUTPAGE(mp); + goto getParent; + } + + /* (re)acquire tlock of the leaf page */ + if (log) { + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + XT_PUTPAGE(mp); + newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + goto getParent; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckTRUNCATE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + } + BT_MARK_DIRTY(mp, ip); + + /* + * scan backward leaf page entries + */ + for (; index >= XTENTRYSTART; index--) { + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * The "data" for a directory is indexed by the block + * device's address space. This metadata must be invalidated + * here + */ + if (S_ISDIR(ip->i_mode) && (teof == 0)) + invalidate_xad_metapages(ip, *xad); + /* + * entry beyond eof: continue scan of current page + * xad + * ---|---=======-------> + * eof + */ + if (teof < xoff) { + nfreed += xlen; + continue; + } + + /* + * (xoff <= teof): last entry to be deleted from page; + * If other entries remain in page: keep and update the page. + */ + + /* + * eof == entry_start: delete the entry + * xad + * -------|=======-------> + * eof + * + */ + if (teof == xoff) { + nfreed += xlen; + + if (index == XTENTRYSTART) + break; + + nextindex = index; + } + /* + * eof within the entry: truncate the entry. + * xad + * -------===|===-------> + * eof + */ + else if (teof < xoff + xlen) { + /* update truncated entry */ + len = teof - xoff; + freexlen = xlen - len; + XADlength(xad, len); + + /* save pxd of truncated extent in tlck */ + xaddr += len; + if (log) { /* COMMIT_PWMAP */ + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = index + 1 - + xtlck->lwm.offset; + xtlck->twm.offset = index; + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + } + /* free truncated extent */ + else { /* COMMIT_WMAP */ + + pxdlock = (struct pxd_lock *) & xadlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + + /* reset map lock */ + xadlock.flag = mlckFREEXADLIST; + } + + /* current entry is new last entry; */ + nextindex = index + 1; + + nfreed += freexlen; + } + /* + * eof beyond the entry: + * xad + * -------=======---|---> + * eof + */ + else { /* (xoff + xlen < teof) */ + + nextindex = index + 1; + } + + if (nextindex < le16_to_cpu(p->header.nextindex)) { + if (!log) { /* COMMIT_WAMP */ + xadlock.xdlist = &p->xad[nextindex]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + nextindex; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + p->header.nextindex = cpu_to_le16(nextindex); + } + + XT_PUTPAGE(mp); + + /* assert(freed == 0); */ + goto getParent; + } /* end scan of leaf page entries */ + + freed = 1; + + /* + * leaf page become empty: free the page if type != PMAP + */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free data extents covered by leaf [XTENTRYSTART:hwm); + * invalidate leaf if COMMIT_PWMAP; + * if (TRUNCATE), will write LOG_NOREDOPAGE; + */ + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WAMP */ + + /* free data extents covered by leaf */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); + } + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; + + /* invalidate empty leaf page */ + discard_metapage(mp); + } + } + + /* + * the leaf page become empty: delete the parent entry + * for the leaf page if the parent page is to be kept + * in the new sized file. + */ + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * child page was not empty: + */ + if (freed == 0) { + /* has any entry deleted from parent ? */ + if (index < le16_to_cpu(p->header.nextindex) - 1) { + /* (re)acquire tlock on the parent page */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckTRUNCATE: + * free child extents covered by parent [); + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + if (!(tlck->type & tlckTRUNCATE)) { + xtlck->hwm.offset = + le16_to_cpu(p->header. + nextindex) - 1; + tlck->type = + tlckXTREE | tlckTRUNCATE; + } + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[index + 1]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + index - 1; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + p->header.nextindex = cpu_to_le16(index + 1); + } + XT_PUTPAGE(mp); + goto getParent; + } + + /* + * child page was empty: + */ + nfreed += lengthXAD(&p->xad[index]); + + /* + * During working map update, child page's tlock must be handled + * before parent's. This is because the parent's tlock will cause + * the child's disk space to be marked available in the wmap, so + * it's important that the child page be released by that time. + * + * ToDo: tlocks should be on doubly-linked list, so we can + * quickly remove it and add it to the end. + */ + + /* + * Move parent page's tlock to the end of the tid's tlock list + */ + if (log && mp->lid && (tblk->last != mp->lid) && + lid_to_tlock(mp->lid)->tid) { + lid_t lid = mp->lid; + struct tlock *prev; + + tlck = lid_to_tlock(lid); + + if (tblk->next == lid) + tblk->next = tlck->next; + else { + for (prev = lid_to_tlock(tblk->next); + prev->next != lid; + prev = lid_to_tlock(prev->next)) { + assert(prev->next); + } + prev->next = tlck->next; + } + lid_to_tlock(tblk->last)->next = lid; + tlck->next = 0; + tblk->last = lid; + } + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, + COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { + /* + * Shrink root down to allow inline + * EA (otherwise fsck complains) + */ + p->header.maxentry = + cpu_to_le16(XTROOTINITSLOT); + JFS_IP(ip)->mode2 |= INLINEEA; + } + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= + tlckFREELOCK; + + /* invalidate parent page */ + discard_metapage(mp); + } + + /* parent has become empty and freed: + * go back up to its parent page + */ + /* freed = 1; */ + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else { + /* try truncate region covered by preceding entry + * (process backward) + */ + index--; + + /* go back down to the child page corresponding + * to the entry + */ + goto getChild; + } + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun!\n"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + /* + * update file resource stat + */ + /* set size + */ + if (S_ISDIR(ip->i_mode) && !newsize) + ip->i_size = 1; /* fsck hates zero-length directories */ + else + ip->i_size = newsize; + + /* update quota allocation to reflect freed blocks */ + dquot_free_block(ip, nfreed); + + /* + * free tlock of invalidated pages + */ + if (flag == COMMIT_WMAP) + txFreelock(ip); + + return newsize; +} + + +/* + * xtTruncate_pmap() + * + * function: + * Perform truncate to zero length for deleted file, leaving the + * the xtree and working map untouched. This allows the file to + * be accessed via open file handles, while the delete of the file + * is committed to disk. + * + * parameter: + * tid_t tid, + * struct inode *ip, + * s64 committed_size) + * + * return: new committed size + * + * note: + * + * To avoid deadlock by holding too many transaction locks, the + * truncation may be broken up into multiple transactions. + * The committed_size keeps track of part of the file has been + * freed from the pmaps. + */ +s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) +{ + s64 bn; + struct btstack btstack; + int cmp; + int index; + int locked_leaves = 0; + struct metapage *mp; + xtpage_t *p; + struct btframe *parent; + int rc; + struct tblock *tblk; + struct tlock *tlck = NULL; + xad_t *xad; + int xlen; + s64 xoff; + struct xtlock *xtlck = NULL; + + /* save object truncation type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* clear stack */ + BT_CLR(&btstack); + + if (committed_size) { + xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; + rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); + if (rc) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "did not find extent\n"); + return -EIO; + } + } else { + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) + goto getChild; + } + + /* + * leaf page + */ + + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + XT_PUTPAGE(mp); + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckFREE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = index; + + + XT_PUTPAGE(mp); + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + + XT_PUTPAGE(mp); + + if (p->header.flag & BT_ROOT) { + + goto out; + } else { + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else + index--; + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun!\n"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + + return 0; +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_xtstat_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Xtree statistics\n" + "====================\n" + "searches = %d\n" + "fast searches = %d\n" + "splits = %d\n", + xtStat.search, + xtStat.fastSearch, + xtStat.split); + return 0; +} + +static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_xtstat_proc_show, NULL); +} + +const struct file_operations jfs_xtstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_xtstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/kernel/fs/jfs/jfs_xtree.h b/kernel/fs/jfs/jfs_xtree.h new file mode 100644 index 000000000..1e0987986 --- /dev/null +++ b/kernel/fs/jfs/jfs_xtree.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_XTREE +#define _H_JFS_XTREE + +/* + * jfs_xtree.h: extent allocation descriptor B+-tree manager + */ + +#include "jfs_btree.h" + + +/* + * extent allocation descriptor (xad) + */ +typedef struct xad { + __u8 flag; /* 1: flag */ + __u8 rsvrd[2]; /* 2: reserved */ + __u8 off1; /* 1: offset in unit of fsblksize */ + __le32 off2; /* 4: offset in unit of fsblksize */ + pxd_t loc; /* 8: length and address in unit of fsblksize */ +} xad_t; /* (16) */ + +#define MAXXLEN ((1 << 24) - 1) + +#define XTSLOTSIZE 16 +#define L2XTSLOTSIZE 4 + +/* xad_t field construction */ +#define XADoffset(xad, offset64)\ +{\ + (xad)->off1 = ((u64)offset64) >> 32;\ + (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ +} +#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64) +#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32) + +/* xad_t field extraction */ +#define offsetXAD(xad)\ + ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) +#define addressXAD(xad) addressPXD(&(xad)->loc) +#define lengthXAD(xad) lengthPXD(&(xad)->loc) + +/* xad list */ +struct xadlist { + s16 maxnxad; + s16 nxad; + xad_t *xad; +}; + +/* xad_t flags */ +#define XAD_NEW 0x01 /* new */ +#define XAD_EXTENDED 0x02 /* extended */ +#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ +#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ +#define XAD_COW 0x10 /* copy-on-write */ + + +/* possible values for maxentry */ +#define XTROOTINITSLOT_DIR 6 +#define XTROOTINITSLOT 10 +#define XTROOTMAXSLOT 18 +#define XTPAGEMAXSLOT 256 +#define XTENTRYSTART 2 + +/* + * xtree page: + */ +typedef union { + struct xtheader { + __le64 next; /* 8: */ + __le64 prev; /* 8: */ + + u8 flag; /* 1: */ + u8 rsrvd1; /* 1: */ + __le16 nextindex; /* 2: next index = number of entries */ + __le16 maxentry; /* 2: max number of entries */ + __le16 rsrvd2; /* 2: */ + + pxd_t self; /* 8: self */ + } header; /* (32) */ + + xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */ +} xtpage_t; + +/* + * external declaration + */ +extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, + int *pflag, s64 * paddr, int *plen, int flag); +extern void xtInitRoot(tid_t tid, struct inode *ip); +extern int xtInsert(tid_t tid, struct inode *ip, + int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); +extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +#ifdef _NOTYET +extern int xtTailgate(tid_t tid, struct inode *ip, + s64 xoff, int xlen, s64 xaddr, int flag); +#endif +extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); +extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); +extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); +extern int xtRelocate(tid_t tid, struct inode *ip, + xad_t * oxad, s64 nxaddr, int xtype); +extern int xtAppend(tid_t tid, + struct inode *ip, int xflag, s64 xoff, int maxblocks, + int *xlenp, s64 * xaddrp, int flag); +#endif /* !_H_JFS_XTREE */ diff --git a/kernel/fs/jfs/namei.c b/kernel/fs/jfs/namei.c new file mode 100644 index 000000000..66db7bc0e --- /dev/null +++ b/kernel/fs/jfs/namei.c @@ -0,0 +1,1606 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_inode.h" +#include "jfs_dinode.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_metapage.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +/* + * forward references + */ +const struct dentry_operations jfs_ci_dentry_operations; + +static s64 commitZeroLink(tid_t, struct inode *); + +/* + * NAME: free_ea_wmap(inode) + * + * FUNCTION: free uncommitted extended attributes from working map + * + */ +static inline void free_ea_wmap(struct inode *inode) +{ + dxd_t *ea = &JFS_IP(inode)->ea; + + if (ea->flag & DXD_EXTENT) { + /* free EA pages from cache */ + invalidate_dxd_metapages(inode, *ea); + dbFree(inode, addressDXD(ea), lengthDXD(ea)); + } + ea->flag = 0; +} + +/* + * NAME: jfs_create(dip, dentry, mode) + * + * FUNCTION: create a regular file in the parent directory + * with name = and mode = + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of new file + * mode - create mode (rwxrwxrwx). + * nd- nd struct + * + * RETURN: Errors from subroutines + * + */ +static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, + bool excl) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + struct component_name dname; /* child directory name */ + struct btstack btstack; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry); + + dquot_initialize(dip); + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. + */ + ip = ialloc(dip, mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dip); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jfs_err("jfs_create: dtSearch returned %d", rc); + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child XAD tree root in-line in inode + */ + xtInitRoot(tid, ip); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + if (rc == -EIO) { + jfs_err("jfs_create: dtInsert returned -EIO"); + txAbort(tid, 1); /* Marks Filesystem dirty */ + } else + txAbort(tid, 0); /* Filesystem full */ + goto out3; + } + + ip->i_op = &jfs_file_inode_operations; + ip->i_fop = &jfs_file_operations; + ip->i_mapping->a_ops = &jfs_aops; + + mark_inode_dirty(ip); + + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + clear_nlink(ip); + unlock_new_inode(ip); + iput(ip); + } else { + unlock_new_inode(ip); + d_instantiate(dentry, ip); + } + + out2: + free_UCSname(&dname); + + out1: + + jfs_info("jfs_create: rc:%d", rc); + return rc; +} + + +/* + * NAME: jfs_mkdir(dip, dentry, mode) + * + * FUNCTION: create a child directory in the parent directory + * with name = and mode = + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of child directory + * mode - create mode (rwxrwxrwx). + * + * RETURN: Errors from subroutines + * + * note: + * EACCESS: user needs search+write permission on the parent directory + */ +static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + struct component_name dname; /* child directory name */ + struct btstack btstack; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry); + + dquot_initialize(dip); + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. + */ + ip = ialloc(dip, S_IFDIR | mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dip); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jfs_err("jfs_mkdir: dtSearch returned %d", rc); + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child directory in-line in inode + */ + dtInitRoot(tid, ip, dip->i_ino); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + if (rc == -EIO) { + jfs_err("jfs_mkdir: dtInsert returned -EIO"); + txAbort(tid, 1); /* Marks Filesystem dirty */ + } else + txAbort(tid, 0); /* Filesystem full */ + goto out3; + } + + set_nlink(ip, 2); /* for '.' */ + ip->i_op = &jfs_dir_inode_operations; + ip->i_fop = &jfs_dir_operations; + + mark_inode_dirty(ip); + + /* update parent directory inode */ + inc_nlink(dip); /* for '..' from child directory */ + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + clear_nlink(ip); + unlock_new_inode(ip); + iput(ip); + } else { + unlock_new_inode(ip); + d_instantiate(dentry, ip); + } + + out2: + free_UCSname(&dname); + + + out1: + + jfs_info("jfs_mkdir: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_rmdir(dip, dentry) + * + * FUNCTION: remove a link to child directory + * + * PARAMETER: dip - parent inode + * dentry - child directory dentry + * + * RETURN: -EINVAL - if name is . or .. + * -EINVAL - if . or .. exist but are invalid. + * errors from subroutines + * + * note: + * if other threads have the directory open when the last link + * is removed, the "." and ".." entries, if present, are removed before + * rmdir() returns and no new entries may be created in the directory, + * but the directory is not removed until the last reference to + * the directory is released (cf.unlink() of regular file). + */ +static int jfs_rmdir(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = d_inode(dentry); + ino_t ino; + struct component_name dname; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry); + + /* Init inode for quota operations. */ + dquot_initialize(dip); + dquot_initialize(ip); + + /* directory must be empty to be removed */ + if (!dtEmpty(ip)) { + rc = -ENOTEMPTY; + goto out; + } + + if ((rc = get_UCSname(&dname, dentry))) { + goto out; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + iplist[0] = dip; + iplist[1] = ip; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = ip; + + /* + * delete the entry of target directory from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jfs_err("jfs_rmdir: dtDelete returned %d", rc); + if (rc == -EIO) + txAbort(tid, 1); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + goto out2; + } + + /* update parent directory's link count corresponding + * to ".." entry of the target directory deleted + */ + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + inode_dec_link_count(dip); + + /* + * OS/2 could have created EA and/or ACL + */ + /* free EA from both persistent and working map */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + /* free EA pages */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + } + JFS_IP(ip)->ea.flag = 0; + + /* free ACL from both persistent and working map */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + /* free ACL pages */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + } + JFS_IP(ip)->acl.flag = 0; + + /* mark the target directory as deleted */ + clear_nlink(ip); + mark_inode_dirty(ip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + out2: + free_UCSname(&dname); + + out: + jfs_info("jfs_rmdir: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_unlink(dip, dentry) + * + * FUNCTION: remove a link to object named by + * from parent directory + * + * PARAMETER: dip - inode of parent directory + * dentry - dentry of object to be removed + * + * RETURN: errors from subroutines + * + * note: + * temporary file: if one or more processes have the file open + * when the last link is removed, the link will be removed before + * unlink() returns, but the removal of the file contents will be + * postponed until all references to the files are closed. + * + * JFS does NOT support unlink() on directories. + * + */ +static int jfs_unlink(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = d_inode(dentry); + ino_t ino; + struct component_name dname; /* object name */ + struct inode *iplist[2]; + struct tblock *tblk; + s64 new_size = 0; + int commit_flag; + + jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry); + + /* Init inode for quota operations. */ + dquot_initialize(dip); + dquot_initialize(ip); + + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + iplist[0] = dip; + iplist[1] = ip; + + /* + * delete the entry of target file from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jfs_err("jfs_unlink: dtDelete returned %d", rc); + if (rc == -EIO) + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + IWRITE_UNLOCK(ip); + goto out1; + } + + ASSERT(ip->i_nlink); + + ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + + /* update target's inode */ + inode_dec_link_count(ip); + + /* + * commit zero link count object + */ + if (ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + IWRITE_UNLOCK(ip); + rc = new_size; + goto out1; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = ip; + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. + */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + /* + * If xtTruncate was incomplete, commit synchronously to avoid + * timing complications + */ + rc = txCommit(tid, 2, &iplist[0], commit_flag); + + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + while (new_size && (rc == 0)) { + tid = txBegin(dip->i_sb, 0); + mutex_lock(&JFS_IP(ip)->commit_mutex); + new_size = xtTruncate_pmap(tid, ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = new_size; + } else + rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + } + + if (ip->i_nlink == 0) + set_cflag(COMMIT_Nolink, ip); + + IWRITE_UNLOCK(ip); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + out1: + free_UCSname(&dname); + out: + jfs_info("jfs_unlink: rc:%d", rc); + return rc; +} + +/* + * NAME: commitZeroLink() + * + * FUNCTION: for non-directory, called by jfs_remove(), + * truncate a regular file, directory or symbolic + * link to zero length. return 0 if type is not + * one of these. + * + * if the file is currently associated with a VM segment + * only permanent disk and inode map resources are freed, + * and neither the inode nor indirect blocks are modified + * so that the resources can be later freed in the work + * map by ctrunc1. + * if there is no VM segment on entry, the resources are + * freed in both work and permanent map. + * (? for temporary file - memory object is cached even + * after no reference: + * reference count > 0 - ) + * + * PARAMETERS: cd - pointer to commit data structure. + * current inode is the one to truncate. + * + * RETURN: Errors from subroutines + */ +static s64 commitZeroLink(tid_t tid, struct inode *ip) +{ + int filetype; + struct tblock *tblk; + + jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip); + + filetype = ip->i_mode & S_IFMT; + switch (filetype) { + case S_IFREG: + break; + case S_IFLNK: + /* fast symbolic link */ + if (ip->i_size < IDATASIZE) { + ip->i_size = 0; + return 0; + } + break; + default: + assert(filetype != S_IFDIR); + return 0; + } + + set_cflag(COMMIT_Freewmap, ip); + + /* mark transaction of block map update type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache if COMMIT_PWMAP, + * free xtree/data blocks from persistent block map, and + * free xtree/data blocks from working block map if COMMIT_PWMAP; + */ + if (ip->i_size) + return xtTruncate_pmap(tid, ip, 0); + + return 0; +} + + +/* + * NAME: jfs_free_zero_link() + * + * FUNCTION: for non-directory, called by iClose(), + * free resources of a file from cache and WORKING map + * for a file previously committed with zero link count + * while associated with a pager object, + * + * PARAMETER: ip - pointer to inode of file. + */ +void jfs_free_zero_link(struct inode *ip) +{ + int type; + + jfs_info("jfs_free_zero_link: ip = 0x%p", ip); + + /* return if not reg or symbolic link or if size is + * already ok. + */ + type = ip->i_mode & S_IFMT; + + switch (type) { + case S_IFREG: + break; + case S_IFLNK: + /* if its contained in inode nothing to do */ + if (ip->i_size < IDATASIZE) + return; + break; + default: + return; + } + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + s64 xaddr = addressDXD(&JFS_IP(ip)->ea); + int xlen = lengthDXD(&JFS_IP(ip)->ea); + struct maplock maplock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + + /* free EA pages from cache */ + invalidate_dxd_metapages(ip, JFS_IP(ip)->ea); + + /* free EA extent from working block map */ + maplock.index = 1; + pxdlock = (struct pxd_lock *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + } + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + s64 xaddr = addressDXD(&JFS_IP(ip)->acl); + int xlen = lengthDXD(&JFS_IP(ip)->acl); + struct maplock maplock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + + invalidate_dxd_metapages(ip, JFS_IP(ip)->acl); + + /* free ACL extent from working block map */ + maplock.index = 1; + pxdlock = (struct pxd_lock *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + } + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache, and + * free xtree/data blocks from working block map; + */ + if (ip->i_size) + xtTruncate(0, ip, 0, COMMIT_WMAP); +} + +/* + * NAME: jfs_link(vp, dvp, name, crp) + * + * FUNCTION: create a link to by the name = + * in the parent directory + * + * PARAMETER: vp - target object + * dvp - parent directory of new link + * name - name of new link to target object + * crp - credential + * + * RETURN: Errors from subroutines + * + * note: + * JFS does NOT support link() on directories (to prevent circular + * path in the directory hierarchy); + * EPERM: the target object is a directory, and either the caller + * does not have appropriate privileges or the implementation prohibits + * using link() on directories [XPG4.2]. + * + * JFS does NOT support links between file systems: + * EXDEV: target object and new link are on different file systems and + * implementation does not support links between file systems [XPG4.2]. + */ +static int jfs_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + int rc; + tid_t tid; + struct inode *ip = d_inode(old_dentry); + ino_t ino; + struct component_name dname; + struct btstack btstack; + struct inode *iplist[2]; + + jfs_info("jfs_link: %pd %pd", old_dentry, dentry); + + dquot_initialize(dir); + + tid = txBegin(ip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + /* + * scan parent directory for entry/freespace + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) + goto free_dname; + + /* + * create entry for new link in parent directory + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) + goto free_dname; + + /* update object inode */ + inc_nlink(ip); /* for new link */ + ip->i_ctime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + mark_inode_dirty(dir); + ihold(ip); + + iplist[0] = ip; + iplist[1] = dir; + rc = txCommit(tid, 2, &iplist[0], 0); + + if (rc) { + drop_nlink(ip); /* never instantiated */ + iput(ip); + } else + d_instantiate(dentry, ip); + + free_dname: + free_UCSname(&dname); + + out: + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); + + jfs_info("jfs_link: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_symlink(dip, dentry, name) + * + * FUNCTION: creates a symbolic link to by name + * in directory + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of symbolic link + * name - the path name of the existing object + * that will be the source of the link + * + * RETURN: errors from subroutines + * + * note: + * ENAMETOOLONG: pathname resolution of a symbolic link produced + * an intermediate result whose length exceeds PATH_MAX [XPG4.2] +*/ + +static int jfs_symlink(struct inode *dip, struct dentry *dentry, + const char *name) +{ + int rc; + tid_t tid; + ino_t ino = 0; + struct component_name dname; + int ssize; /* source pathname size */ + struct btstack btstack; + struct inode *ip = d_inode(dentry); + unchar *i_fastsymlink; + s64 xlen = 0; + int bmask = 0, xsize; + s64 xaddr; + struct metapage *mp; + struct super_block *sb; + struct tblock *tblk; + + struct inode *iplist[2]; + + jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); + + dquot_initialize(dip); + + ssize = strlen(name) + 1; + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * allocate on-disk/in-memory inode for symbolic link: + * (iAlloc() returns new, locked inode) + */ + ip = ialloc(dip, S_IFLNK | 0777); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) + goto out3; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + /* fix symlink access permission + * (dir_create() ANDs in the u.u_cmask, + * but symlinks really need to be 777 access) + */ + ip->i_mode |= 0777; + + /* + * write symbolic link target path name + */ + xtInitRoot(tid, ip); + + /* + * write source path name inline in on-disk inode (fast symbolic link) + */ + + if (ssize <= IDATASIZE) { + ip->i_op = &jfs_fast_symlink_inode_operations; + + i_fastsymlink = JFS_IP(ip)->i_inline; + memcpy(i_fastsymlink, name, ssize); + ip->i_size = ssize - 1; + + /* + * if symlink is > 128 bytes, we don't have the space to + * store inline extended attributes + */ + if (ssize > sizeof (JFS_IP(ip)->i_inline)) + JFS_IP(ip)->mode2 &= ~INLINEEA; + + jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ", + ssize, name); + } + /* + * write source path name in a single extent + */ + else { + jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); + + ip->i_op = &jfs_symlink_inode_operations; + ip->i_mapping->a_ops = &jfs_aops; + + /* + * even though the data of symlink object (source + * path name) is treated as non-journaled user data, + * it is read/written thru buffer cache for performance. + */ + sb = ip->i_sb; + bmask = JFS_SBI(sb)->bsize - 1; + xsize = (ssize + bmask) & ~bmask; + xaddr = 0; + xlen = xsize >> JFS_SBI(sb)->l2bsize; + if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) { + txAbort(tid, 0); + goto out3; + } + ip->i_size = ssize - 1; + while (ssize) { + /* This is kind of silly since PATH_MAX == 4K */ + int copy_size = min(ssize, PSIZE); + + mp = get_metapage(ip, xaddr, PSIZE, 1); + + if (mp == NULL) { + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + rc = -EIO; + txAbort(tid, 0); + goto out3; + } + memcpy(mp->data, name, copy_size); + flush_metapage(mp); + ssize -= copy_size; + name += copy_size; + xaddr += JFS_SBI(sb)->nbperpage; + } + } + + /* + * create entry for symbolic link in parent directory + */ + rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE); + if (rc == 0) { + ino = ip->i_ino; + rc = dtInsert(tid, dip, &dname, &ino, &btstack); + } + if (rc) { + if (xlen) + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + txAbort(tid, 0); + /* discard new inode */ + goto out3; + } + + mark_inode_dirty(ip); + + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + /* + * commit update of parent directory and link object + */ + + iplist[0] = dip; + iplist[1] = ip; + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + clear_nlink(ip); + unlock_new_inode(ip); + iput(ip); + } else { + unlock_new_inode(ip); + d_instantiate(dentry, ip); + } + + out2: + free_UCSname(&dname); + + out1: + jfs_info("jfs_symlink: rc:%d", rc); + return rc; +} + + +/* + * NAME: jfs_rename + * + * FUNCTION: rename a file or directory + */ +static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btstack btstack; + ino_t ino; + struct component_name new_dname; + struct inode *new_ip; + struct component_name old_dname; + struct inode *old_ip; + int rc; + tid_t tid; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + int ipcount; + struct inode *iplist[4]; + struct tblock *tblk; + s64 new_size = 0; + int commit_flag; + + + jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_ip = d_inode(old_dentry); + new_ip = d_inode(new_dentry); + + if ((rc = get_UCSname(&old_dname, old_dentry))) + goto out1; + + if ((rc = get_UCSname(&new_dname, new_dentry))) + goto out2; + + /* + * Make sure source inode number is what we think it is + */ + rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP); + if (rc || (ino != old_ip->i_ino)) { + rc = -ENOENT; + goto out3; + } + + /* + * Make sure dest inode number (if any) is what we think it is + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); + if (!rc) { + if ((!new_ip) || (ino != new_ip->i_ino)) { + rc = -ESTALE; + goto out3; + } + } else if (rc != -ENOENT) + goto out3; + else if (new_ip) { + /* no entry exists, but one was expected */ + rc = -ESTALE; + goto out3; + } + + if (S_ISDIR(old_ip->i_mode)) { + if (new_ip) { + if (!dtEmpty(new_ip)) { + rc = -ENOTEMPTY; + goto out3; + } + } + } else if (new_ip) { + IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); + /* Init inode for quota operations. */ + dquot_initialize(new_ip); + } + + /* + * The real work starts here + */ + tid = txBegin(new_dir->i_sb, 0); + + /* + * How do we know the locking is safe from deadlocks? + * The vfs does the hard part for us. Any time we are taking nested + * commit_mutexes, the vfs already has i_mutex held on the parent. + * Here, the vfs has already taken i_mutex on both old_dir and new_dir. + */ + mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD); + if (old_dir != new_dir) + mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex, + COMMIT_MUTEX_SECOND_PARENT); + + if (new_ip) { + mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex, + COMMIT_MUTEX_VICTIM); + /* + * Change existing directory entry to new inode number + */ + ino = new_ip->i_ino; + rc = dtModify(tid, new_dir, &new_dname, &ino, + old_ip->i_ino, JFS_RENAME); + if (rc) + goto out4; + drop_nlink(new_ip); + if (S_ISDIR(new_ip->i_mode)) { + drop_nlink(new_ip); + if (new_ip->i_nlink) { + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); + if (!S_ISDIR(old_ip->i_mode) && new_ip) + IWRITE_UNLOCK(new_ip); + jfs_error(new_ip->i_sb, + "new_ip->i_nlink != 0\n"); + return -EIO; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = new_ip; + } else if (new_ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, new_ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, new_ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = new_size; + goto out4; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = new_ip; + } else { + new_ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(new_ip); + } + } else { + /* + * Add new directory entry + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, + JFS_CREATE); + if (rc) { + jfs_err("jfs_rename didn't expect dtSearch to fail " + "w/rc = %d", rc); + goto out4; + } + + ino = old_ip->i_ino; + rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack); + if (rc) { + if (rc == -EIO) + jfs_err("jfs_rename: dtInsert returned -EIO"); + goto out4; + } + if (S_ISDIR(old_ip->i_mode)) + inc_nlink(new_dir); + } + /* + * Remove old directory entry + */ + + ino = old_ip->i_ino; + rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE); + if (rc) { + jfs_err("jfs_rename did not expect dtDelete to return rc = %d", + rc); + txAbort(tid, 1); /* Marks Filesystem dirty */ + goto out4; + } + if (S_ISDIR(old_ip->i_mode)) { + drop_nlink(old_dir); + if (old_dir != new_dir) { + /* + * Change inode number of parent for moved directory + */ + + JFS_IP(old_ip)->i_dtroot.header.idotdot = + cpu_to_le32(new_dir->i_ino); + + /* Linelock header of dtree */ + tlck = txLock(tid, old_ip, + (struct metapage *) &JFS_IP(old_ip)->bxflag, + tlckDTREE | tlckBTROOT | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + } + } + + /* + * Update ctime on changed/moved inodes & mark dirty + */ + old_ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_ip); + + new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb); + mark_inode_dirty(new_dir); + + /* Build list of inodes modified by this transaction */ + ipcount = 0; + iplist[ipcount++] = old_ip; + if (new_ip) + iplist[ipcount++] = new_ip; + iplist[ipcount++] = old_dir; + + if (old_dir != new_dir) { + iplist[ipcount++] = new_dir; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + mark_inode_dirty(old_dir); + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. + */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + rc = txCommit(tid, ipcount, iplist, commit_flag); + + out4: + txEnd(tid); + if (new_ip) + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); + + while (new_size && (rc == 0)) { + tid = txBegin(new_ip->i_sb, 0); + mutex_lock(&JFS_IP(new_ip)->commit_mutex); + new_size = xtTruncate_pmap(tid, new_ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); + rc = new_size; + } else + rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC); + txEnd(tid); + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + } + if (new_ip && (new_ip->i_nlink == 0)) + set_cflag(COMMIT_Nolink, new_ip); + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: + if (new_ip && !S_ISDIR(new_ip->i_mode)) + IWRITE_UNLOCK(new_ip); + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, old_dir)) { + if (old_dir->i_size > 1) + jfs_truncate_nolock(old_dir, 0); + + clear_cflag(COMMIT_Stale, old_dir); + } + + jfs_info("jfs_rename: returning %d", rc); + return rc; +} + + +/* + * NAME: jfs_mknod + * + * FUNCTION: Create a special file (device) + */ +static int jfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct jfs_inode_info *jfs_ip; + struct btstack btstack; + struct component_name dname; + ino_t ino; + struct inode *ip; + struct inode *iplist[2]; + int rc; + tid_t tid; + struct tblock *tblk; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + jfs_info("jfs_mknod: %pd", dentry); + + dquot_initialize(dir); + + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + ip = ialloc(dir, mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out1; + } + jfs_ip = JFS_IP(ip); + + tid = txBegin(dir->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dir); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dir, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) { + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) { + txAbort(tid, 0); + goto out3; + } + + ip->i_op = &jfs_file_inode_operations; + jfs_ip->dev = new_encode_dev(rdev); + init_special_inode(ip, ip->i_mode, rdev); + + mark_inode_dirty(ip); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + mark_inode_dirty(dir); + + iplist[0] = dir; + iplist[1] = ip; + rc = txCommit(tid, 2, iplist, 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + clear_nlink(ip); + unlock_new_inode(ip); + iput(ip); + } else { + unlock_new_inode(ip); + d_instantiate(dentry, ip); + } + + out1: + free_UCSname(&dname); + + out: + jfs_info("jfs_mknod: returning %d", rc); + return rc; +} + +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) +{ + struct btstack btstack; + ino_t inum; + struct inode *ip; + struct component_name key; + int rc; + + jfs_info("jfs_lookup: name = %pd", dentry); + + if ((rc = get_UCSname(&key, dentry))) + return ERR_PTR(rc); + rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); + free_UCSname(&key); + if (rc == -ENOENT) { + ip = NULL; + } else if (rc) { + jfs_err("jfs_lookup: dtSearch returned %d", rc); + ip = ERR_PTR(rc); + } else { + ip = jfs_iget(dip->i_sb, inum); + if (IS_ERR(ip)) + jfs_err("jfs_lookup: iget failed on inum %d", (uint)inum); + } + + return d_splice_alias(ip, dentry); +} + +static struct inode *jfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = jfs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + jfs_nfs_get_inode); +} + +struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + jfs_nfs_get_inode); +} + +struct dentry *jfs_get_parent(struct dentry *dentry) +{ + unsigned long parent_ino; + + parent_ino = + le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot); + + return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino)); +} + +const struct inode_operations jfs_dir_inode_operations = { + .create = jfs_create, + .lookup = jfs_lookup, + .link = jfs_link, + .unlink = jfs_unlink, + .symlink = jfs_symlink, + .mkdir = jfs_mkdir, + .rmdir = jfs_rmdir, + .mknod = jfs_mknod, + .rename = jfs_rename, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, + .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL + .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, +#endif +}; + +const struct file_operations jfs_dir_operations = { + .read = generic_read_dir, + .iterate = jfs_readdir, + .fsync = jfs_fsync, + .unlocked_ioctl = jfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = jfs_compat_ioctl, +#endif + .llseek = generic_file_llseek, +}; + +static int jfs_ci_hash(const struct dentry *dir, struct qstr *this) +{ + unsigned long hash; + int i; + + hash = init_name_hash(); + for (i=0; i < this->len; i++) + hash = partial_name_hash(tolower(this->name[i]), hash); + this->hash = end_name_hash(hash); + + return 0; +} + +static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + int i, result = 1; + + if (len != name->len) + goto out; + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) + goto out; + } + result = 0; +out: + return result; +} + +static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) +{ + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, + * and will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (d_really_is_positive(dentry)) + return 1; + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!flags) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + return 1; +} + +const struct dentry_operations jfs_ci_dentry_operations = +{ + .d_hash = jfs_ci_hash, + .d_compare = jfs_ci_compare, + .d_revalidate = jfs_ci_revalidate, +}; diff --git a/kernel/fs/jfs/resize.c b/kernel/fs/jfs/resize.c new file mode 100644 index 000000000..90b3bc21e --- /dev/null +++ b/kernel/fs/jfs/resize.c @@ -0,0 +1,543 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dinode.h" +#include "jfs_imap.h" +#include "jfs_dmap.h" +#include "jfs_superblock.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +#define BITSPERPAGE (PSIZE << 3) +#define L2MEGABYTE 20 +#define MEGABYTE (1 << L2MEGABYTE) +#define MEGABYTE32 (MEGABYTE << 5) + +/* convert block number to bmap file page number */ +#define BLKTODMAPN(b)\ + (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) + +/* + * jfs_extendfs() + * + * function: extend file system; + * + * |-------------------------------|----------|----------| + * file system space fsck inline log + * workspace space + * + * input: + * new LVSize: in LV blocks (required) + * new LogSize: in LV blocks (optional) + * new FSSize: in LV blocks (optional) + * + * new configuration: + * 1. set new LogSize as specified or default from new LVSize; + * 2. compute new FSCKSize from new LVSize; + * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where + * assert(new FSSize >= old FSSize), + * i.e., file system must not be shrunk; + */ +int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) +{ + int rc = 0; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipbmap2; + struct inode *ipimap = sbi->ipimap; + struct jfs_log *log = sbi->log; + struct bmap *bmp = sbi->bmap; + s64 newLogAddress, newFSCKAddress; + int newFSCKSize; + s64 newMapSize = 0, mapSize; + s64 XAddress, XSize, nblocks, xoff, xaddr, t64; + s64 oldLVSize; + s64 newFSSize; + s64 VolumeSize; + int newNpages = 0, nPages, newPage, xlen, t32; + int tid; + int log_formatted = 0; + struct inode *iplist[1]; + struct jfs_superblock *j_sb, *j_sb2; + s64 old_agsize; + int agsizechanged = 0; + struct buffer_head *bh, *bh2; + + /* If the volume hasn't grown, get out now */ + + if (sbi->mntflag & JFS_INLINELOG) + oldLVSize = addressPXD(&sbi->logpxd) + lengthPXD(&sbi->logpxd); + else + oldLVSize = addressPXD(&sbi->fsckpxd) + + lengthPXD(&sbi->fsckpxd); + + if (oldLVSize >= newLVSize) { + printk(KERN_WARNING + "jfs_extendfs: volume hasn't grown, returning\n"); + goto out; + } + + VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + if (VolumeSize) { + if (newLVSize > VolumeSize) { + printk(KERN_WARNING "jfs_extendfs: invalid size\n"); + rc = -EINVAL; + goto out; + } + } else { + /* check the device */ + bh = sb_bread(sb, newLVSize - 1); + if (!bh) { + printk(KERN_WARNING "jfs_extendfs: invalid size\n"); + rc = -EINVAL; + goto out; + } + bforget(bh); + } + + /* Can't extend write-protected drive */ + + if (isReadOnly(ipbmap)) { + printk(KERN_WARNING "jfs_extendfs: read-only file system\n"); + rc = -EROFS; + goto out; + } + + /* + * reconfigure LV spaces + * --------------------- + * + * validate new size, or, if not specified, determine new size + */ + + /* + * reconfigure inline log space: + */ + if ((sbi->mntflag & JFS_INLINELOG)) { + if (newLogSize == 0) { + /* + * no size specified: default to 1/256 of aggregate + * size; rounded up to a megabyte boundary; + */ + newLogSize = newLVSize >> 8; + t32 = (1 << (20 - sbi->l2bsize)) - 1; + newLogSize = (newLogSize + t32) & ~t32; + newLogSize = + min(newLogSize, MEGABYTE32 >> sbi->l2bsize); + } else { + /* + * convert the newLogSize to fs blocks. + * + * Since this is given in megabytes, it will always be + * an even number of pages. + */ + newLogSize = (newLogSize * MEGABYTE) >> sbi->l2bsize; + } + + } else + newLogSize = 0; + + newLogAddress = newLVSize - newLogSize; + + /* + * reconfigure fsck work space: + * + * configure it to the end of the logical volume regardless of + * whether file system extends to the end of the aggregate; + * Need enough 4k pages to cover: + * - 1 bit per block in aggregate rounded up to BPERDMAP boundary + * - 1 extra page to handle control page and intermediate level pages + * - 50 extra pages for the chkdsk service log + */ + t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP) + << L2BPERDMAP; + t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50; + newFSCKSize = t32 << sbi->l2nbperpage; + newFSCKAddress = newLogAddress - newFSCKSize; + + /* + * compute new file system space; + */ + newFSSize = newLVSize - newLogSize - newFSCKSize; + + /* file system cannot be shrunk */ + if (newFSSize < bmp->db_mapsize) { + rc = -EINVAL; + goto out; + } + + /* + * If we're expanding enough that the inline log does not overlap + * the old one, we can format the new log before we quiesce the + * filesystem. + */ + if ((sbi->mntflag & JFS_INLINELOG) && (newLogAddress > oldLVSize)) { + if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) + goto out; + log_formatted = 1; + } + /* + * quiesce file system + * + * (prepare to move the inline log and to prevent map update) + * + * block any new transactions and wait for completion of + * all wip transactions and flush modified pages s.t. + * on-disk file system is in consistent state and + * log is not required for recovery. + */ + txQuiesce(sb); + + /* Reset size of direct inode */ + sbi->direct_inode->i_size = sb->s_bdev->bd_inode->i_size; + + if (sbi->mntflag & JFS_INLINELOG) { + /* + * deactivate old inline log + */ + lmLogShutdown(log); + + /* + * mark on-disk super block for fs in transition; + * + * update on-disk superblock for the new space configuration + * of inline log space and fsck work space descriptors: + * N.B. FS descriptor is NOT updated; + * + * crash recovery: + * logredo(): if FM_EXTENDFS, return to fsck() for cleanup; + * fsck(): if FM_EXTENDFS, reformat inline log and fsck + * workspace from superblock inline log descriptor and fsck + * workspace descriptor; + */ + + /* read in superblock */ + if ((rc = readSuper(sb, &bh))) + goto error_out; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* mark extendfs() in progress */ + j_sb->s_state |= cpu_to_le32(FM_EXTENDFS); + j_sb->s_xsize = cpu_to_le64(newFSSize); + PXDaddress(&j_sb->s_xfsckpxd, newFSCKAddress); + PXDlength(&j_sb->s_xfsckpxd, newFSCKSize); + PXDaddress(&j_sb->s_xlogpxd, newLogAddress); + PXDlength(&j_sb->s_xlogpxd, newLogSize); + + /* synchronously update superblock */ + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + /* + * format new inline log synchronously; + * + * crash recovery: if log move in progress, + * reformat log and exit success; + */ + if (!log_formatted) + if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) + goto error_out; + + /* + * activate new log + */ + log->base = newLogAddress; + log->size = newLogSize >> (L2LOGPSIZE - sb->s_blocksize_bits); + if ((rc = lmLogInit(log))) + goto error_out; + } + + /* + * extend block allocation map + * --------------------------- + * + * extendfs() for new extension, retry after crash recovery; + * + * note: both logredo() and fsck() rebuild map from + * the bitmap and configuration parameter from superblock + * (disregarding all other control information in the map); + * + * superblock: + * s_size: aggregate size in physical blocks; + */ + /* + * compute the new block allocation map configuration + * + * map dinode: + * di_size: map file size in byte; + * di_nblocks: number of blocks allocated for map file; + * di_mapsize: number of blocks in aggregate (covered by map); + * map control page: + * db_mapsize: number of blocks in aggregate (covered by map); + */ + newMapSize = newFSSize; + /* number of data pages of new bmap file: + * roundup new size to full dmap page boundary and + * add 1 extra dmap page for next extendfs() + */ + t64 = (newMapSize - 1) + BPERDMAP; + newNpages = BLKTODMAPN(t64) + 1; + + /* + * extend map from current map (WITHOUT growing mapfile) + * + * map new extension with unmapped part of the last partial + * dmap page, if applicable, and extra page(s) allocated + * at end of bmap by mkfs() or previous extendfs(); + */ + extendBmap: + /* compute number of blocks requested to extend */ + mapSize = bmp->db_mapsize; + XAddress = mapSize; /* eXtension Address */ + XSize = newMapSize - mapSize; /* eXtension Size */ + old_agsize = bmp->db_agsize; /* We need to know if this changes */ + + /* compute number of blocks that can be extended by current mapfile */ + t64 = dbMapFileSizeToMapSize(ipbmap); + if (mapSize > t64) { + printk(KERN_ERR "jfs_extendfs: mapSize (0x%Lx) > t64 (0x%Lx)\n", + (long long) mapSize, (long long) t64); + rc = -EIO; + goto error_out; + } + nblocks = min(t64 - mapSize, XSize); + + /* + * update map pages for new extension: + * + * update/init dmap and bubble up the control hierarchy + * incrementally fold up dmaps into upper levels; + * update bmap control page; + */ + if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) + goto error_out; + + agsizechanged |= (bmp->db_agsize != old_agsize); + + /* + * the map now has extended to cover additional nblocks: + * dn_mapsize = oldMapsize + nblocks; + */ + /* ipbmap->i_mapsize += nblocks; */ + XSize -= nblocks; + + /* + * grow map file to cover remaining extension + * and/or one extra dmap page for next extendfs(); + * + * allocate new map pages and its backing blocks, and + * update map file xtree + */ + /* compute number of data pages of current bmap file */ + nPages = ipbmap->i_size >> L2PSIZE; + + /* need to grow map file ? */ + if (nPages == newNpages) + goto finalizeBmap; + + /* + * grow bmap file for the new map pages required: + * + * allocate growth at the start of newly extended region; + * bmap file only grows sequentially, i.e., both data pages + * and possibly xtree index pages may grow in append mode, + * s.t. logredo() can reconstruct pre-extension state + * by washing away bmap file of pages outside s_size boundary; + */ + /* + * journal map file growth as if a regular file growth: + * (note: bmap is created with di_mode = IFJOURNAL|IFREG); + * + * journaling of bmap file growth is not required since + * logredo() do/can not use log records of bmap file growth + * but it provides careful write semantics, pmap update, etc.; + */ + /* synchronous write of data pages: bmap data pages are + * cached in meta-data cache, and not written out + * by txCommit(); + */ + filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); + diWriteSpecial(ipbmap, 0); + + newPage = nPages; /* first new page number */ + xoff = newPage << sbi->l2nbperpage; + xlen = (newNpages - nPages) << sbi->l2nbperpage; + xlen = min(xlen, (int) nblocks) & ~(sbi->nbperpage - 1); + xaddr = XAddress; + + tid = txBegin(sb, COMMIT_FORCE); + + if ((rc = xtAppend(tid, ipbmap, 0, xoff, nblocks, &xlen, &xaddr, 0))) { + txEnd(tid); + goto error_out; + } + /* update bmap file size */ + ipbmap->i_size += xlen << sbi->l2bsize; + inode_add_bytes(ipbmap, xlen << sbi->l2bsize); + + iplist[0] = ipbmap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + + if (rc) + goto error_out; + + /* + * map file has been grown now to cover extension to further out; + * di_size = new map file size; + * + * if huge extension, the previous extension based on previous + * map file size may not have been sufficient to cover whole extension + * (it could have been used up for new map pages), + * but the newly grown map file now covers lot bigger new free space + * available for further extension of map; + */ + /* any more blocks to extend ? */ + if (XSize) + goto extendBmap; + + finalizeBmap: + /* finalize bmap */ + dbFinalizeBmap(ipbmap); + + /* + * update inode allocation map + * --------------------------- + * + * move iag lists from old to new iag; + * agstart field is not updated for logredo() to reconstruct + * iag lists if system crash occurs. + * (computation of ag number from agstart based on agsize + * will correctly identify the new ag); + */ + /* if new AG size the same as old AG size, done! */ + if (agsizechanged) { + if ((rc = diExtendFS(ipimap, ipbmap))) + goto error_out; + + /* finalize imap */ + if ((rc = diSync(ipimap))) + goto error_out; + } + + /* + * finalize + * -------- + * + * extension is committed when on-disk super block is + * updated with new descriptors: logredo will recover + * crash before it to pre-extension state; + */ + + /* sync log to skip log replay of bmap file growth transaction; */ + /* lmLogSync(log, 1); */ + + /* + * synchronous write bmap global control page; + * for crash before completion of write + * logredo() will recover to pre-extendfs state; + * for crash after completion of write, + * logredo() will recover post-extendfs state; + */ + if ((rc = dbSync(ipbmap))) + goto error_out; + + /* + * copy primary bmap inode to secondary bmap inode + */ + + ipbmap2 = diReadSpecial(sb, BMAP_I, 1); + if (ipbmap2 == NULL) { + printk(KERN_ERR "jfs_extendfs: diReadSpecial(bmap) failed\n"); + goto error_out; + } + memcpy(&JFS_IP(ipbmap2)->i_xtroot, &JFS_IP(ipbmap)->i_xtroot, 288); + ipbmap2->i_size = ipbmap->i_size; + ipbmap2->i_blocks = ipbmap->i_blocks; + + diWriteSpecial(ipbmap2, 1); + diFreeSpecial(ipbmap2); + + /* + * update superblock + */ + if ((rc = readSuper(sb, &bh))) + goto error_out; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* mark extendfs() completion */ + j_sb->s_state &= cpu_to_le32(~FM_EXTENDFS); + j_sb->s_size = cpu_to_le64(bmp->db_mapsize << + le16_to_cpu(j_sb->s_l2bfactor)); + j_sb->s_agsize = cpu_to_le32(bmp->db_agsize); + + /* update inline log space descriptor */ + if (sbi->mntflag & JFS_INLINELOG) { + PXDaddress(&(j_sb->s_logpxd), newLogAddress); + PXDlength(&(j_sb->s_logpxd), newLogSize); + } + + /* record log's mount serial number */ + j_sb->s_logserial = cpu_to_le32(log->serial); + + /* update fsck work space descriptor */ + PXDaddress(&(j_sb->s_fsckpxd), newFSCKAddress); + PXDlength(&(j_sb->s_fsckpxd), newFSCKSize); + j_sb->s_fscklog = 1; + /* sb->s_fsckloglen remains the same */ + + /* Update secondary superblock */ + bh2 = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); + if (bh2) { + j_sb2 = (struct jfs_superblock *)bh2->b_data; + memcpy(j_sb2, j_sb, sizeof (struct jfs_superblock)); + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh2); + brelse(bh2); + } + + /* write primary superblock */ + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + goto resume; + + error_out: + jfs_error(sb, "\n"); + + resume: + /* + * resume file system transactions + */ + txResume(sb); + + out: + return rc; +} diff --git a/kernel/fs/jfs/super.c b/kernel/fs/jfs/super.c new file mode 100644 index 000000000..4cd9798f4 --- /dev/null +++ b/kernel/fs/jfs/super.c @@ -0,0 +1,1012 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_inode.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_acl.h" +#include "jfs_debug.h" +#include "jfs_xattr.h" + +MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); +MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *jfs_inode_cachep; + +static const struct super_operations jfs_super_operations; +static const struct export_operations jfs_export_operations; +static struct file_system_type jfs_fs_type; + +#define MAX_COMMIT_THREADS 64 +static int commit_threads; +module_param(commit_threads, int, 0); +MODULE_PARM_DESC(commit_threads, "Number of commit threads"); + +static struct task_struct *jfsCommitThread[MAX_COMMIT_THREADS]; +struct task_struct *jfsIOthread; +struct task_struct *jfsSyncThread; + +#ifdef CONFIG_JFS_DEBUG +int jfsloglevel = JFS_LOGLEVEL_WARN; +module_param(jfsloglevel, int, 0644); +MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); +#endif + +static void jfs_handle_error(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sb->s_flags & MS_RDONLY) + return; + + updateSuper(sb, FM_DIRTY); + + if (sbi->flag & JFS_ERR_PANIC) + panic("JFS (device %s): panic forced after error\n", + sb->s_id); + else if (sbi->flag & JFS_ERR_REMOUNT_RO) { + jfs_err("ERROR: (device %s): remounting filesystem as read-only\n", + sb->s_id); + sb->s_flags |= MS_RDONLY; + } + + /* nothing is done for continue beyond marking the superblock dirty */ +} + +void jfs_error(struct super_block *sb, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("ERROR: (device %s): %ps: %pV\n", + sb->s_id, __builtin_return_address(0), &vaf); + + va_end(args); + + jfs_handle_error(sb); +} + +static struct inode *jfs_alloc_inode(struct super_block *sb) +{ + struct jfs_inode_info *jfs_inode; + + jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); + if (!jfs_inode) + return NULL; +#ifdef CONFIG_QUOTA + memset(&jfs_inode->i_dquot, 0, sizeof(jfs_inode->i_dquot)); +#endif + return &jfs_inode->vfs_inode; +} + +static void jfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct jfs_inode_info *ji = JFS_IP(inode); + kmem_cache_free(jfs_inode_cachep, ji); +} + +static void jfs_destroy_inode(struct inode *inode) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + + BUG_ON(!list_empty(&ji->anon_inode_list)); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); + call_rcu(&inode->i_rcu, jfs_i_callback); +} + +static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); + s64 maxinodes; + struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; + + jfs_info("In jfs_statfs"); + buf->f_type = JFS_SUPER_MAGIC; + buf->f_bsize = sbi->bsize; + buf->f_blocks = sbi->bmap->db_mapsize; + buf->f_bfree = sbi->bmap->db_nfree; + buf->f_bavail = sbi->bmap->db_nfree; + /* + * If we really return the number of allocated & free inodes, some + * applications will fail because they won't see enough free inodes. + * We'll try to calculate some guess as to how many inodes we can + * really allocate + * + * buf->f_files = atomic_read(&imap->im_numinos); + * buf->f_ffree = atomic_read(&imap->im_numfree); + */ + maxinodes = min((s64) atomic_read(&imap->im_numinos) + + ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) + << L2INOSPEREXT), (s64) 0xffffffffLL); + buf->f_files = maxinodes; + buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - + atomic_read(&imap->im_numfree)); + buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2, + sizeof(sbi->uuid)/2); + + buf->f_namelen = JFS_NAME_MAX; + return 0; +} + +static void jfs_put_super(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + jfs_info("In jfs_put_super"); + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + rc = jfs_umount(sb); + if (rc) + jfs_err("jfs_umount failed with return code %d", rc); + + unload_nls(sbi->nls_tab); + + truncate_inode_pages(sbi->direct_inode->i_mapping, 0); + iput(sbi->direct_inode); + + kfree(sbi); +} + +enum { + Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, + Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask, + Opt_discard, Opt_nodiscard, Opt_discard_minblk +}; + +static const match_table_t tokens = { + {Opt_integrity, "integrity"}, + {Opt_nointegrity, "nointegrity"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_resize, "resize=%u"}, + {Opt_resize_nosize, "resize"}, + {Opt_errors, "errors=%s"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%u"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_discard_minblk, "discard=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, + int *flag) +{ + void *nls_map = (void *)-1; /* -1: no change; NULL: none */ + char *p; + struct jfs_sb_info *sbi = JFS_SBI(sb); + + *newLVSize = 0; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_integrity: + *flag &= ~JFS_NOINTEGRITY; + break; + case Opt_nointegrity: + *flag |= JFS_NOINTEGRITY; + break; + case Opt_ignore: + /* Silently ignore the quota options */ + /* Don't do anything ;-) */ + break; + case Opt_iocharset: + if (nls_map && nls_map != (void *) -1) + unload_nls(nls_map); + if (!strcmp(args[0].from, "none")) + nls_map = NULL; + else { + nls_map = load_nls(args[0].from); + if (!nls_map) { + pr_err("JFS: charset not found\n"); + goto cleanup; + } + } + break; + case Opt_resize: + { + char *resize = args[0].from; + int rc = kstrtoll(resize, 0, newLVSize); + + if (rc) + goto cleanup; + break; + } + case Opt_resize_nosize: + { + *newLVSize = sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits; + if (*newLVSize == 0) + pr_err("JFS: Cannot determine volume size\n"); + break; + } + case Opt_errors: + { + char *errors = args[0].from; + if (!errors || !*errors) + goto cleanup; + if (!strcmp(errors, "continue")) { + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_CONTINUE; + } else if (!strcmp(errors, "remount-ro")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_REMOUNT_RO; + } else if (!strcmp(errors, "panic")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag |= JFS_ERR_PANIC; + } else { + pr_err("JFS: %s is an invalid error handler\n", + errors); + goto cleanup; + } + break; + } + +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + *flag |= JFS_USRQUOTA; + break; + case Opt_grpquota: + *flag |= JFS_GRPQUOTA; + break; +#else + case Opt_usrquota: + case Opt_grpquota: + case Opt_quota: + pr_err("JFS: quota operations not supported\n"); + break; +#endif + case Opt_uid: + { + char *uid = args[0].from; + uid_t val; + int rc = kstrtouint(uid, 0, &val); + + if (rc) + goto cleanup; + sbi->uid = make_kuid(current_user_ns(), val); + if (!uid_valid(sbi->uid)) + goto cleanup; + break; + } + + case Opt_gid: + { + char *gid = args[0].from; + gid_t val; + int rc = kstrtouint(gid, 0, &val); + + if (rc) + goto cleanup; + sbi->gid = make_kgid(current_user_ns(), val); + if (!gid_valid(sbi->gid)) + goto cleanup; + break; + } + + case Opt_umask: + { + char *umask = args[0].from; + int rc = kstrtouint(umask, 8, &sbi->umask); + + if (rc) + goto cleanup; + if (sbi->umask & ~0777) { + pr_err("JFS: Invalid value of umask\n"); + goto cleanup; + } + break; + } + + case Opt_discard: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + /* if set to 1, even copying files will cause + * trimming :O + * -> user has more control over the online trimming + */ + sbi->minblks_trim = 64; + if (blk_queue_discard(q)) + *flag |= JFS_DISCARD; + else + pr_err("JFS: discard option not supported on device\n"); + break; + } + + case Opt_nodiscard: + *flag &= ~JFS_DISCARD; + break; + + case Opt_discard_minblk: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + char *minblks_trim = args[0].from; + int rc; + if (blk_queue_discard(q)) { + *flag |= JFS_DISCARD; + rc = kstrtouint(minblks_trim, 0, + &sbi->minblks_trim); + if (rc) + goto cleanup; + } else + pr_err("JFS: discard option not supported on device\n"); + break; + } + + default: + printk("jfs: Unrecognized mount option \"%s\" or missing value\n", + p); + goto cleanup; + } + } + + if (nls_map != (void *) -1) { + /* Discard old (if remount) */ + unload_nls(sbi->nls_tab); + sbi->nls_tab = nls_map; + } + return 1; + +cleanup: + if (nls_map && nls_map != (void *) -1) + unload_nls(nls_map); + return 0; +} + +static int jfs_remount(struct super_block *sb, int *flags, char *data) +{ + s64 newLVSize = 0; + int rc = 0; + int flag = JFS_SBI(sb)->flag; + int ret; + + sync_filesystem(sb); + if (!parse_options(data, sb, &newLVSize, &flag)) + return -EINVAL; + + if (newLVSize) { + if (sb->s_flags & MS_RDONLY) { + pr_err("JFS: resize requires volume to be mounted read-write\n"); + return -EROFS; + } + rc = jfs_extendfs(sb, newLVSize, 0); + if (rc) + return rc; + } + + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + /* + * Invalidate any previously read metadata. fsck may have + * changed the on-disk data since we mounted r/o + */ + truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); + + JFS_SBI(sb)->flag = flag; + ret = jfs_mount_rw(sb, 1); + + /* mark the fs r/w for quota activity */ + sb->s_flags &= ~MS_RDONLY; + + dquot_resume(sb, -1); + return ret; + } + if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { + rc = dquot_suspend(sb, -1); + if (rc < 0) + return rc; + rc = jfs_umount_rw(sb); + JFS_SBI(sb)->flag = flag; + return rc; + } + if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) + if (!(sb->s_flags & MS_RDONLY)) { + rc = jfs_umount_rw(sb); + if (rc) + return rc; + + JFS_SBI(sb)->flag = flag; + ret = jfs_mount_rw(sb, 1); + return ret; + } + JFS_SBI(sb)->flag = flag; + + return 0; +} + +static int jfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jfs_sb_info *sbi; + struct inode *inode; + int rc; + s64 newLVSize = 0; + int flag, ret = -EINVAL; + + jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); + + if (!new_valid_dev(sb->s_bdev->bd_dev)) + return -EOVERFLOW; + + sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + sb->s_max_links = JFS_LINK_MAX; + sbi->sb = sb; + sbi->uid = INVALID_UID; + sbi->gid = INVALID_GID; + sbi->umask = -1; + + /* initialize the mount flag and determine the default error handler */ + flag = JFS_ERR_REMOUNT_RO; + + if (!parse_options((char *) data, sb, &newLVSize, &flag)) + goto out_kfree; + sbi->flag = flag; + +#ifdef CONFIG_JFS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + + if (newLVSize) { + pr_err("resize option for remount only\n"); + goto out_kfree; + } + + /* + * Initialize blocksize to 4K. + */ + sb_set_blocksize(sb, PSIZE); + + /* + * Set method vectors. + */ + sb->s_op = &jfs_super_operations; + sb->s_export_op = &jfs_export_operations; + sb->s_xattr = jfs_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + + /* + * Initialize direct-mapping inode/address-space + */ + inode = new_inode(sb); + if (inode == NULL) { + ret = -ENOMEM; + goto out_unload; + } + inode->i_ino = 0; + inode->i_size = sb->s_bdev->bd_inode->i_size; + inode->i_mapping->a_ops = &jfs_metapage_aops; + hlist_add_fake(&inode->i_hash); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + sbi->direct_inode = inode; + + rc = jfs_mount(sb); + if (rc) { + if (!silent) + jfs_err("jfs_mount failed w/return code = %d", rc); + goto out_mount_failed; + } + if (sb->s_flags & MS_RDONLY) + sbi->log = NULL; + else { + rc = jfs_mount_rw(sb, 0); + if (rc) { + if (!silent) { + jfs_err("jfs_mount_rw failed, return code = %d", + rc); + } + goto out_no_rw; + } + } + + sb->s_magic = JFS_SUPER_MAGIC; + + if (sbi->mntflag & JFS_OS2) + sb->s_d_op = &jfs_ci_dentry_operations; + + inode = jfs_iget(sb, ROOT_I); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out_no_rw; + } + sb->s_root = d_make_root(inode); + if (!sb->s_root) + goto out_no_root; + + /* logical blocks are represented by 40 bits in pxd_t, etc. */ + sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; +#if BITS_PER_LONG == 32 + /* + * Page cache is indexed by long. + * I would use MAX_LFS_FILESIZE, but it's only half as big + */ + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, + (u64)sb->s_maxbytes); +#endif + sb->s_time_gran = 1; + return 0; + +out_no_root: + jfs_err("jfs_read_super: get root dentry failed"); + +out_no_rw: + rc = jfs_umount(sb); + if (rc) + jfs_err("jfs_umount failed with return code %d", rc); +out_mount_failed: + filemap_write_and_wait(sbi->direct_inode->i_mapping); + truncate_inode_pages(sbi->direct_inode->i_mapping, 0); + make_bad_inode(sbi->direct_inode); + iput(sbi->direct_inode); + sbi->direct_inode = NULL; +out_unload: + unload_nls(sbi->nls_tab); +out_kfree: + kfree(sbi); + return ret; +} + +static int jfs_freeze(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + int rc = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + txQuiesce(sb); + rc = lmLogShutdown(log); + if (rc) { + jfs_error(sb, "lmLogShutdown failed\n"); + + /* let operations fail rather than hang */ + txResume(sb); + + return rc; + } + rc = updateSuper(sb, FM_CLEAN); + if (rc) { + jfs_err("jfs_freeze: updateSuper failed\n"); + /* + * Don't fail here. Everything succeeded except + * marking the superblock clean, so there's really + * no harm in leaving it frozen for now. + */ + } + } + return 0; +} + +static int jfs_unfreeze(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + int rc = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + rc = updateSuper(sb, FM_MOUNT); + if (rc) { + jfs_error(sb, "updateSuper failed\n"); + goto out; + } + rc = lmLogInit(log); + if (rc) + jfs_error(sb, "lmLogInit failed\n"); +out: + txResume(sb); + } + return rc; +} + +static struct dentry *jfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); +} + +static int jfs_sync_fs(struct super_block *sb, int wait) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + + /* log == NULL indicates read-only mount */ + if (log) { + /* + * Write quota structures to quota file, sync_blockdev() will + * write them to disk later + */ + dquot_writeback_dquots(sb, -1); + jfs_flush_journal(log, wait); + jfs_syncpt(log, 0); + } + + return 0; +} + +static int jfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); + + if (uid_valid(sbi->uid)) + seq_printf(seq, ",uid=%d", from_kuid(&init_user_ns, sbi->uid)); + if (gid_valid(sbi->gid)) + seq_printf(seq, ",gid=%d", from_kgid(&init_user_ns, sbi->gid)); + if (sbi->umask != -1) + seq_printf(seq, ",umask=%03o", sbi->umask); + if (sbi->flag & JFS_NOINTEGRITY) + seq_puts(seq, ",nointegrity"); + if (sbi->flag & JFS_DISCARD) + seq_printf(seq, ",discard=%u", sbi->minblks_trim); + if (sbi->nls_tab) + seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); + if (sbi->flag & JFS_ERR_CONTINUE) + seq_printf(seq, ",errors=continue"); + if (sbi->flag & JFS_ERR_PANIC) + seq_printf(seq, ",errors=panic"); + +#ifdef CONFIG_QUOTA + if (sbi->flag & JFS_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->flag & JFS_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif + + return 0; +} + +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; + err = jfs_get_block(inode, blk, &tmp_bh, 0); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t jfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + mutex_lock(&inode->i_mutex); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; + err = jfs_get_block(inode, blk, &tmp_bh, 1); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +static struct dquot **jfs_get_dquots(struct inode *inode) +{ + return JFS_IP(inode)->i_dquot; +} +#endif + +static const struct super_operations jfs_super_operations = { + .alloc_inode = jfs_alloc_inode, + .destroy_inode = jfs_destroy_inode, + .dirty_inode = jfs_dirty_inode, + .write_inode = jfs_write_inode, + .evict_inode = jfs_evict_inode, + .put_super = jfs_put_super, + .sync_fs = jfs_sync_fs, + .freeze_fs = jfs_freeze, + .unfreeze_fs = jfs_unfreeze, + .statfs = jfs_statfs, + .remount_fs = jfs_remount, + .show_options = jfs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = jfs_quota_read, + .quota_write = jfs_quota_write, + .get_dquots = jfs_get_dquots, +#endif +}; + +static const struct export_operations jfs_export_operations = { + .fh_to_dentry = jfs_fh_to_dentry, + .fh_to_parent = jfs_fh_to_parent, + .get_parent = jfs_get_parent, +}; + +static struct file_system_type jfs_fs_type = { + .owner = THIS_MODULE, + .name = "jfs", + .mount = jfs_do_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("jfs"); + +static void init_once(void *foo) +{ + struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; + + memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); + INIT_LIST_HEAD(&jfs_ip->anon_inode_list); + init_rwsem(&jfs_ip->rdwrlock); + mutex_init(&jfs_ip->commit_mutex); + init_rwsem(&jfs_ip->xattr_sem); + spin_lock_init(&jfs_ip->ag_lock); + jfs_ip->active_ag = -1; + inode_init_once(&jfs_ip->vfs_inode); +} + +static int __init init_jfs_fs(void) +{ + int i; + int rc; + + jfs_inode_cachep = + kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (jfs_inode_cachep == NULL) + return -ENOMEM; + + /* + * Metapage initialization + */ + rc = metapage_init(); + if (rc) { + jfs_err("metapage_init failed w/rc = %d", rc); + goto free_slab; + } + + /* + * Transaction Manager initialization + */ + rc = txInit(); + if (rc) { + jfs_err("txInit failed w/rc = %d", rc); + goto free_metapage; + } + + /* + * I/O completion thread (endio) + */ + jfsIOthread = kthread_run(jfsIOWait, NULL, "jfsIO"); + if (IS_ERR(jfsIOthread)) { + rc = PTR_ERR(jfsIOthread); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + goto end_txmngr; + } + + if (commit_threads < 1) + commit_threads = num_online_cpus(); + if (commit_threads > MAX_COMMIT_THREADS) + commit_threads = MAX_COMMIT_THREADS; + + for (i = 0; i < commit_threads; i++) { + jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, + "jfsCommit"); + if (IS_ERR(jfsCommitThread[i])) { + rc = PTR_ERR(jfsCommitThread[i]); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + commit_threads = i; + goto kill_committask; + } + } + + jfsSyncThread = kthread_run(jfs_sync, NULL, "jfsSync"); + if (IS_ERR(jfsSyncThread)) { + rc = PTR_ERR(jfsSyncThread); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + goto kill_committask; + } + +#ifdef PROC_FS_JFS + jfs_proc_init(); +#endif + + rc = register_filesystem(&jfs_fs_type); + if (!rc) + return 0; + +#ifdef PROC_FS_JFS + jfs_proc_clean(); +#endif + kthread_stop(jfsSyncThread); +kill_committask: + for (i = 0; i < commit_threads; i++) + kthread_stop(jfsCommitThread[i]); + kthread_stop(jfsIOthread); +end_txmngr: + txExit(); +free_metapage: + metapage_exit(); +free_slab: + kmem_cache_destroy(jfs_inode_cachep); + return rc; +} + +static void __exit exit_jfs_fs(void) +{ + int i; + + jfs_info("exit_jfs_fs called"); + + txExit(); + metapage_exit(); + + kthread_stop(jfsIOthread); + for (i = 0; i < commit_threads; i++) + kthread_stop(jfsCommitThread[i]); + kthread_stop(jfsSyncThread); +#ifdef PROC_FS_JFS + jfs_proc_clean(); +#endif + unregister_filesystem(&jfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(jfs_inode_cachep); +} + +module_init(init_jfs_fs) +module_exit(exit_jfs_fs) diff --git a/kernel/fs/jfs/symlink.c b/kernel/fs/jfs/symlink.c new file mode 100644 index 000000000..80f42bcc4 --- /dev/null +++ b/kernel/fs/jfs/symlink.c @@ -0,0 +1,52 @@ +/* + * Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_xattr.h" + +static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *s = JFS_IP(d_inode(dentry))->i_inline; + nd_set_link(nd, s); + return NULL; +} + +const struct inode_operations jfs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = jfs_follow_link, + .setattr = jfs_setattr, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, +}; + +const struct inode_operations jfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = jfs_setattr, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, +}; + diff --git a/kernel/fs/jfs/xattr.c b/kernel/fs/jfs/xattr.c new file mode 100644 index 000000000..48b15a6e5 --- /dev/null +++ b/kernel/fs/jfs/xattr.c @@ -0,0 +1,1106 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Copyright (C) Christoph Hellwig, 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_debug.h" +#include "jfs_dinode.h" +#include "jfs_extent.h" +#include "jfs_metapage.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" + +/* + * jfs_xattr.c: extended attribute service + * + * Overall design -- + * + * Format: + * + * Extended attribute lists (jfs_ea_list) consist of an overall size (32 bit + * value) and a variable (0 or more) number of extended attribute + * entries. Each extended attribute entry (jfs_ea) is a double + * where is constructed from a null-terminated ascii string + * (1 ... 255 bytes in the name) and is arbitrary 8 bit data + * (1 ... 65535 bytes). The in-memory format is + * + * 0 1 2 4 4 + namelen + 1 + * +-------+--------+--------+----------------+-------------------+ + * | Flags | Name | Value | Name String \0 | Data . . . . | + * | | Length | Length | | | + * +-------+--------+--------+----------------+-------------------+ + * + * A jfs_ea_list then is structured as + * + * 0 4 4 + EA_SIZE(ea1) + * +------------+-------------------+--------------------+----- + * | Overall EA | First FEA Element | Second FEA Element | ..... + * | List Size | | | + * +------------+-------------------+--------------------+----- + * + * On-disk: + * + * FEALISTs are stored on disk using blocks allocated by dbAlloc() and + * written directly. An EA list may be in-lined in the inode if there is + * sufficient room available. + */ + +struct ea_buffer { + int flag; /* Indicates what storage xattr points to */ + int max_size; /* largest xattr that fits in current buffer */ + dxd_t new_ea; /* dxd to replace ea when modifying xattr */ + struct metapage *mp; /* metapage containing ea list */ + struct jfs_ea_list *xattr; /* buffer containing ea list */ +}; + +/* + * ea_buffer.flag values + */ +#define EA_INLINE 0x0001 +#define EA_EXTENT 0x0002 +#define EA_NEW 0x0004 +#define EA_MALLOC 0x0008 + + +static int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + +/* + * These three routines are used to recognize on-disk extended attributes + * that are in a recognized namespace. If the attribute is not recognized, + * "os2." is prepended to the name + */ +static int is_os2_xattr(struct jfs_ea *ea) +{ + return !is_known_namespace(ea->name); +} + +static inline int name_size(struct jfs_ea *ea) +{ + if (is_os2_xattr(ea)) + return ea->namelen + XATTR_OS2_PREFIX_LEN; + else + return ea->namelen; +} + +static inline int copy_name(char *buffer, struct jfs_ea *ea) +{ + int len = ea->namelen; + + if (is_os2_xattr(ea)) { + memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN); + buffer += XATTR_OS2_PREFIX_LEN; + len += XATTR_OS2_PREFIX_LEN; + } + memcpy(buffer, ea->name, ea->namelen); + buffer[ea->namelen] = 0; + + return len; +} + +/* Forward references */ +static void ea_release(struct inode *inode, struct ea_buffer *ea_buf); + +/* + * NAME: ea_write_inline + * + * FUNCTION: Attempt to write an EA inline if area is available + * + * PRE CONDITIONS: + * Already verified that the specified EA is small enough to fit inline + * + * PARAMETERS: + * ip - Inode pointer + * ealist - EA list pointer + * size - size of ealist in bytes + * ea - dxd_t structure to be filled in with necessary EA information + * if we successfully copy the EA inline + * + * NOTES: + * Checks if the inode's inline area is available. If so, copies EA inline + * and sets fields appropriately. Otherwise, returns failure, EA will + * have to be put into an extent. + * + * RETURNS: 0 for successful copy to inline area; -1 if area not available + */ +static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist, + int size, dxd_t * ea) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + + /* + * Make sure we have an EA -- the NULL EA list is valid, but you + * can't copy it! + */ + if (ealist && size > sizeof (struct jfs_ea_list)) { + assert(size <= sizeof (ji->i_inline_ea)); + + /* + * See if the space is available or if it is already being + * used for an inline EA. + */ + if (!(ji->mode2 & INLINEEA) && !(ji->ea.flag & DXD_INLINE)) + return -EPERM; + + DXDsize(ea, size); + DXDlength(ea, 0); + DXDaddress(ea, 0); + memcpy(ji->i_inline_ea, ealist, size); + ea->flag = DXD_INLINE; + ji->mode2 &= ~INLINEEA; + } else { + ea->flag = 0; + DXDsize(ea, 0); + DXDlength(ea, 0); + DXDaddress(ea, 0); + + /* Free up INLINE area */ + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + } + + return 0; +} + +/* + * NAME: ea_write + * + * FUNCTION: Write an EA for an inode + * + * PRE CONDITIONS: EA has been verified + * + * PARAMETERS: + * ip - Inode pointer + * ealist - EA list pointer + * size - size of ealist in bytes + * ea - dxd_t structure to be filled in appropriately with where the + * EA was copied + * + * NOTES: Will write EA inline if able to, otherwise allocates blocks for an + * extent and synchronously writes it to those blocks. + * + * RETURNS: 0 for success; Anything else indicates failure + */ +static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, + dxd_t * ea) +{ + struct super_block *sb = ip->i_sb; + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(sb); + int nblocks; + s64 blkno; + int rc = 0, i; + char *cp; + s32 nbytes, nb; + s32 bytes_to_write; + struct metapage *mp; + + /* + * Quick check to see if this is an in-linable EA. Short EAs + * and empty EAs are all in-linable, provided the space exists. + */ + if (!ealist || size <= sizeof (ji->i_inline_ea)) { + if (!ea_write_inline(ip, ealist, size, ea)) + return 0; + } + + /* figure out how many blocks we need */ + nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; + + /* Allocate new blocks to quota. */ + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; + + rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); + if (rc) { + /*Rollback quota allocation. */ + dquot_free_block(ip, nblocks); + return rc; + } + + /* + * Now have nblocks worth of storage to stuff into the FEALIST. + * loop over the FEALIST copying data into the buffer one page at + * a time. + */ + cp = (char *) ealist; + nbytes = size; + for (i = 0; i < nblocks; i += sbi->nbperpage) { + /* + * Determine how many bytes for this request, and round up to + * the nearest aggregate block size + */ + nb = min(PSIZE, nbytes); + bytes_to_write = + ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) + << sb->s_blocksize_bits; + + if (!(mp = get_metapage(ip, blkno + i, bytes_to_write, 1))) { + rc = -EIO; + goto failed; + } + + memcpy(mp->data, cp, nb); + + /* + * We really need a way to propagate errors for + * forced writes like this one. --hch + * + * (__write_metapage => release_metapage => flush_metapage) + */ +#ifdef _JFS_FIXME + if ((rc = flush_metapage(mp))) { + /* + * the write failed -- this means that the buffer + * is still assigned and the blocks are not being + * used. this seems like the best error recovery + * we can get ... + */ + goto failed; + } +#else + flush_metapage(mp); +#endif + + cp += PSIZE; + nbytes -= nb; + } + + ea->flag = DXD_EXTENT; + DXDsize(ea, le32_to_cpu(ealist->size)); + DXDlength(ea, nblocks); + DXDaddress(ea, blkno); + + /* Free up INLINE area */ + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + + return 0; + + failed: + /* Rollback quota allocation. */ + dquot_free_block(ip, nblocks); + + dbFree(ip, blkno, nblocks); + return rc; +} + +/* + * NAME: ea_read_inline + * + * FUNCTION: Read an inlined EA into user's buffer + * + * PARAMETERS: + * ip - Inode pointer + * ealist - Pointer to buffer to fill in with EA + * + * RETURNS: 0 + */ +static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + int ea_size = sizeDXD(&ji->ea); + + if (ea_size == 0) { + ealist->size = 0; + return 0; + } + + /* Sanity Check */ + if ((sizeDXD(&ji->ea) > sizeof (ji->i_inline_ea))) + return -EIO; + if (le32_to_cpu(((struct jfs_ea_list *) &ji->i_inline_ea)->size) + != ea_size) + return -EIO; + + memcpy(ealist, ji->i_inline_ea, ea_size); + return 0; +} + +/* + * NAME: ea_read + * + * FUNCTION: copy EA data into user's buffer + * + * PARAMETERS: + * ip - Inode pointer + * ealist - Pointer to buffer to fill in with EA + * + * NOTES: If EA is inline calls ea_read_inline() to copy EA. + * + * RETURNS: 0 for success; other indicates failure + */ +static int ea_read(struct inode *ip, struct jfs_ea_list *ealist) +{ + struct super_block *sb = ip->i_sb; + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(sb); + int nblocks; + s64 blkno; + char *cp = (char *) ealist; + int i; + int nbytes, nb; + s32 bytes_to_read; + struct metapage *mp; + + /* quick check for in-line EA */ + if (ji->ea.flag & DXD_INLINE) + return ea_read_inline(ip, ealist); + + nbytes = sizeDXD(&ji->ea); + if (!nbytes) { + jfs_error(sb, "nbytes is 0\n"); + return -EIO; + } + + /* + * Figure out how many blocks were allocated when this EA list was + * originally written to disk. + */ + nblocks = lengthDXD(&ji->ea) << sbi->l2nbperpage; + blkno = addressDXD(&ji->ea) << sbi->l2nbperpage; + + /* + * I have found the disk blocks which were originally used to store + * the FEALIST. now i loop over each contiguous block copying the + * data into the buffer. + */ + for (i = 0; i < nblocks; i += sbi->nbperpage) { + /* + * Determine how many bytes for this request, and round up to + * the nearest aggregate block size + */ + nb = min(PSIZE, nbytes); + bytes_to_read = + ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) + << sb->s_blocksize_bits; + + if (!(mp = read_metapage(ip, blkno + i, bytes_to_read, 1))) + return -EIO; + + memcpy(cp, mp->data, nb); + release_metapage(mp); + + cp += PSIZE; + nbytes -= nb; + } + + return 0; +} + +/* + * NAME: ea_get + * + * FUNCTION: Returns buffer containing existing extended attributes. + * The size of the buffer will be the larger of the existing + * attributes size, or min_size. + * + * The buffer, which may be inlined in the inode or in the + * page cache must be release by calling ea_release or ea_put + * + * PARAMETERS: + * inode - Inode pointer + * ea_buf - Structure to be populated with ealist and its metadata + * min_size- minimum size of buffer to be returned + * + * RETURNS: 0 for success; Other indicates failure + */ +static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + struct super_block *sb = inode->i_sb; + int size; + int ea_size = sizeDXD(&ji->ea); + int blocks_needed, current_blocks; + s64 blkno; + int rc; + int quota_allocation = 0; + + /* When fsck.jfs clears a bad ea, it doesn't clear the size */ + if (ji->ea.flag == 0) + ea_size = 0; + + if (ea_size == 0) { + if (min_size == 0) { + ea_buf->flag = 0; + ea_buf->max_size = 0; + ea_buf->xattr = NULL; + return 0; + } + if ((min_size <= sizeof (ji->i_inline_ea)) && + (ji->mode2 & INLINEEA)) { + ea_buf->flag = EA_INLINE | EA_NEW; + ea_buf->max_size = sizeof (ji->i_inline_ea); + ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; + DXDlength(&ea_buf->new_ea, 0); + DXDaddress(&ea_buf->new_ea, 0); + ea_buf->new_ea.flag = DXD_INLINE; + DXDsize(&ea_buf->new_ea, min_size); + return 0; + } + current_blocks = 0; + } else if (ji->ea.flag & DXD_INLINE) { + if (min_size <= sizeof (ji->i_inline_ea)) { + ea_buf->flag = EA_INLINE; + ea_buf->max_size = sizeof (ji->i_inline_ea); + ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; + goto size_check; + } + current_blocks = 0; + } else { + if (!(ji->ea.flag & DXD_EXTENT)) { + jfs_error(sb, "invalid ea.flag\n"); + return -EIO; + } + current_blocks = (ea_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + } + size = max(min_size, ea_size); + + if (size > PSIZE) { + /* + * To keep the rest of the code simple. Allocate a + * contiguous buffer to work with + */ + ea_buf->xattr = kmalloc(size, GFP_KERNEL); + if (ea_buf->xattr == NULL) + return -ENOMEM; + + ea_buf->flag = EA_MALLOC; + ea_buf->max_size = (size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + if (ea_size == 0) + return 0; + + if ((rc = ea_read(inode, ea_buf->xattr))) { + kfree(ea_buf->xattr); + ea_buf->xattr = NULL; + return rc; + } + goto size_check; + } + blocks_needed = (min_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + if (blocks_needed > current_blocks) { + /* Allocate new blocks to quota. */ + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) + return -EDQUOT; + + quota_allocation = blocks_needed; + + rc = dbAlloc(inode, INOHINT(inode), (s64) blocks_needed, + &blkno); + if (rc) + goto clean_up; + + DXDlength(&ea_buf->new_ea, blocks_needed); + DXDaddress(&ea_buf->new_ea, blkno); + ea_buf->new_ea.flag = DXD_EXTENT; + DXDsize(&ea_buf->new_ea, min_size); + + ea_buf->flag = EA_EXTENT | EA_NEW; + + ea_buf->mp = get_metapage(inode, blkno, + blocks_needed << sb->s_blocksize_bits, + 1); + if (ea_buf->mp == NULL) { + dbFree(inode, blkno, (s64) blocks_needed); + rc = -EIO; + goto clean_up; + } + ea_buf->xattr = ea_buf->mp->data; + ea_buf->max_size = (min_size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + if (ea_size == 0) + return 0; + if ((rc = ea_read(inode, ea_buf->xattr))) { + discard_metapage(ea_buf->mp); + dbFree(inode, blkno, (s64) blocks_needed); + goto clean_up; + } + goto size_check; + } + ea_buf->flag = EA_EXTENT; + ea_buf->mp = read_metapage(inode, addressDXD(&ji->ea), + lengthDXD(&ji->ea) << sb->s_blocksize_bits, + 1); + if (ea_buf->mp == NULL) { + rc = -EIO; + goto clean_up; + } + ea_buf->xattr = ea_buf->mp->data; + ea_buf->max_size = (ea_size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + size_check: + if (EALIST_SIZE(ea_buf->xattr) != ea_size) { + printk(KERN_ERR "ea_get: invalid extended attribute\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, ea_size, 1); + ea_release(inode, ea_buf); + rc = -EIO; + goto clean_up; + } + + return ea_size; + + clean_up: + /* Rollback quota allocation */ + if (quota_allocation) + dquot_free_block(inode, quota_allocation); + + return (rc); +} + +static void ea_release(struct inode *inode, struct ea_buffer *ea_buf) +{ + if (ea_buf->flag & EA_MALLOC) + kfree(ea_buf->xattr); + else if (ea_buf->flag & EA_EXTENT) { + assert(ea_buf->mp); + release_metapage(ea_buf->mp); + + if (ea_buf->flag & EA_NEW) + dbFree(inode, addressDXD(&ea_buf->new_ea), + lengthDXD(&ea_buf->new_ea)); + } +} + +static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, + int new_size) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + unsigned long old_blocks, new_blocks; + int rc = 0; + + if (new_size == 0) { + ea_release(inode, ea_buf); + ea_buf = NULL; + } else if (ea_buf->flag & EA_INLINE) { + assert(new_size <= sizeof (ji->i_inline_ea)); + ji->mode2 &= ~INLINEEA; + ea_buf->new_ea.flag = DXD_INLINE; + DXDsize(&ea_buf->new_ea, new_size); + DXDaddress(&ea_buf->new_ea, 0); + DXDlength(&ea_buf->new_ea, 0); + } else if (ea_buf->flag & EA_MALLOC) { + rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); + kfree(ea_buf->xattr); + } else if (ea_buf->flag & EA_NEW) { + /* We have already allocated a new dxd */ + flush_metapage(ea_buf->mp); + } else { + /* ->xattr must point to original ea's metapage */ + rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); + discard_metapage(ea_buf->mp); + } + if (rc) + return rc; + + old_blocks = new_blocks = 0; + + if (ji->ea.flag & DXD_EXTENT) { + invalidate_dxd_metapages(inode, ji->ea); + old_blocks = lengthDXD(&ji->ea); + } + + if (ea_buf) { + txEA(tid, inode, &ji->ea, &ea_buf->new_ea); + if (ea_buf->new_ea.flag & DXD_EXTENT) { + new_blocks = lengthDXD(&ea_buf->new_ea); + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + } + ji->ea = ea_buf->new_ea; + } else { + txEA(tid, inode, &ji->ea, NULL); + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + ji->ea.flag = 0; + ji->ea.size = 0; + } + + /* If old blocks exist, they must be removed from quota allocation. */ + if (old_blocks) + dquot_free_block(inode, old_blocks); + + inode->i_ctime = CURRENT_TIME; + + return 0; +} + +/* + * Most of the permission checking is done by xattr_permission in the vfs. + * We also need to verify that this is a namespace that we recognize. + */ +static int can_set_xattr(struct inode *inode, const char *name, + const void *value, size_t value_len) +{ + if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { + /* + * This makes sure that we aren't trying to set an + * attribute in a different namespace by prefixing it + * with "os2." + */ + if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) + return -EOPNOTSUPP; + return 0; + } + + /* + * Don't allow setting an attribute in an unknown namespace. + */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EOPNOTSUPP; + + return 0; +} + +int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, + const void *value, size_t value_len, int flags) +{ + struct jfs_ea_list *ealist; + struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL; + struct ea_buffer ea_buf; + int old_ea_size = 0; + int xattr_size; + int new_size; + int namelen = strlen(name); + char *os2name = NULL; + int found = 0; + int rc; + int length; + + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, + GFP_KERNEL); + if (!os2name) + return -ENOMEM; + strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); + name = os2name; + namelen -= XATTR_OS2_PREFIX_LEN; + } + + down_write(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + if (xattr_size < 0) { + rc = xattr_size; + goto out; + } + + again: + ealist = (struct jfs_ea_list *) ea_buf.xattr; + new_size = sizeof (struct jfs_ea_list); + + if (xattr_size) { + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); + ea = NEXT_EA(ea)) { + if ((namelen == ea->namelen) && + (memcmp(name, ea->name, namelen) == 0)) { + found = 1; + if (flags & XATTR_CREATE) { + rc = -EEXIST; + goto release; + } + old_ea = ea; + old_ea_size = EA_SIZE(ea); + next_ea = NEXT_EA(ea); + } else + new_size += EA_SIZE(ea); + } + } + + if (!found) { + if (flags & XATTR_REPLACE) { + rc = -ENODATA; + goto release; + } + if (value == NULL) { + rc = 0; + goto release; + } + } + if (value) + new_size += sizeof (struct jfs_ea) + namelen + 1 + value_len; + + if (new_size > ea_buf.max_size) { + /* + * We need to allocate more space for merged ea list. + * We should only have loop to again: once. + */ + ea_release(inode, &ea_buf); + xattr_size = ea_get(inode, &ea_buf, new_size); + if (xattr_size < 0) { + rc = xattr_size; + goto out; + } + goto again; + } + + /* Remove old ea of the same name */ + if (found) { + /* number of bytes following target EA */ + length = (char *) END_EALIST(ealist) - (char *) next_ea; + if (length > 0) + memmove(old_ea, next_ea, length); + xattr_size -= old_ea_size; + } + + /* Add new entry to the end */ + if (value) { + if (xattr_size == 0) + /* Completely new ea list */ + xattr_size = sizeof (struct jfs_ea_list); + + /* + * The size of EA value is limitted by on-disk format up to + * __le16, there would be an overflow if the size is equal + * to XATTR_SIZE_MAX (65536). In order to avoid this issue, + * we can pre-checkup the value size against USHRT_MAX, and + * return -E2BIG in this case, which is consistent with the + * VFS setxattr interface. + */ + if (value_len >= USHRT_MAX) { + rc = -E2BIG; + goto release; + } + + ea = (struct jfs_ea *) ((char *) ealist + xattr_size); + ea->flag = 0; + ea->namelen = namelen; + ea->valuelen = (cpu_to_le16(value_len)); + memcpy(ea->name, name, namelen); + ea->name[namelen] = 0; + if (value_len) + memcpy(&ea->name[namelen + 1], value, value_len); + xattr_size += EA_SIZE(ea); + } + + /* DEBUG - If we did this right, these number match */ + if (xattr_size != new_size) { + printk(KERN_ERR + "__jfs_setxattr: xattr_size = %d, new_size = %d\n", + xattr_size, new_size); + + rc = -EINVAL; + goto release; + } + + /* + * If we're left with an empty list, there's no ea + */ + if (new_size == sizeof (struct jfs_ea_list)) + new_size = 0; + + ealist->size = cpu_to_le32(new_size); + + rc = ea_put(tid, inode, &ea_buf, new_size); + + goto out; + release: + ea_release(inode, &ea_buf); + out: + up_write(&JFS_IP(inode)->xattr_sem); + + kfree(os2name); + + return rc; +} + +int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t value_len, int flags) +{ + struct inode *inode = d_inode(dentry); + struct jfs_inode_info *ji = JFS_IP(inode); + int rc; + tid_t tid; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, value_len, flags); + + if ((rc = can_set_xattr(inode, name, value, value_len))) + return rc; + + if (value == NULL) { /* empty EA, do not remove */ + value = ""; + value_len = 0; + } + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&ji->commit_mutex); + rc = __jfs_setxattr(tid, d_inode(dentry), name, value, value_len, + flags); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&ji->commit_mutex); + + return rc; +} + +ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, + size_t buf_size) +{ + struct jfs_ea_list *ealist; + struct jfs_ea *ea; + struct ea_buffer ea_buf; + int xattr_size; + ssize_t size; + int namelen = strlen(name); + char *value; + + down_read(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + + if (xattr_size < 0) { + size = xattr_size; + goto out; + } + + if (xattr_size == 0) + goto not_found; + + ealist = (struct jfs_ea_list *) ea_buf.xattr; + + /* Find the named attribute */ + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) + if ((namelen == ea->namelen) && + memcmp(name, ea->name, namelen) == 0) { + /* Found it */ + size = le16_to_cpu(ea->valuelen); + if (!data) + goto release; + else if (size > buf_size) { + size = -ERANGE; + goto release; + } + value = ((char *) &ea->name) + ea->namelen + 1; + memcpy(data, value, size); + goto release; + } + not_found: + size = -ENODATA; + release: + ea_release(inode, &ea_buf); + out: + up_read(&JFS_IP(inode)->xattr_sem); + + return size; +} + +ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, + size_t buf_size) +{ + int err; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, data, buf_size); + + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + /* + * skip past "os2." prefix + */ + name += XATTR_OS2_PREFIX_LEN; + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "os2." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + } + + err = __jfs_getxattr(d_inode(dentry), name, data, buf_size); + + return err; +} + +/* + * No special permissions are needed to list attributes except for trusted.* + */ +static inline int can_list(struct jfs_ea *ea) +{ + return (strncmp(ea->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) || + capable(CAP_SYS_ADMIN)); +} + +ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) +{ + struct inode *inode = d_inode(dentry); + char *buffer; + ssize_t size = 0; + int xattr_size; + struct jfs_ea_list *ealist; + struct jfs_ea *ea; + struct ea_buffer ea_buf; + + down_read(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + if (xattr_size < 0) { + size = xattr_size; + goto out; + } + + if (xattr_size == 0) + goto release; + + ealist = (struct jfs_ea_list *) ea_buf.xattr; + + /* compute required size of list */ + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { + if (can_list(ea)) + size += name_size(ea) + 1; + } + + if (!data) + goto release; + + if (size > buf_size) { + size = -ERANGE; + goto release; + } + + /* Copy attribute names to buffer */ + buffer = data; + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { + if (can_list(ea)) { + int namelen = copy_name(buffer, ea); + buffer += namelen + 1; + } + } + + release: + ea_release(inode, &ea_buf); + out: + up_read(&JFS_IP(inode)->xattr_sem); + return size; +} + +int jfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = d_inode(dentry); + struct jfs_inode_info *ji = JFS_IP(inode); + int rc; + tid_t tid; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + if ((rc = can_set_xattr(inode, name, NULL, 0))) + return rc; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&ji->commit_mutex); + rc = __jfs_setxattr(tid, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&ji->commit_mutex); + + return rc; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +const struct xattr_handler *jfs_xattr_handlers[] = { +#ifdef CONFIG_JFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + + +#ifdef CONFIG_JFS_SECURITY +static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + tid_t *tid = fs_info; + char *name; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + err = __jfs_setxattr(*tid, inode, name, + xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; + } + return err; +} + +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jfs_initxattrs, &tid); +} +#endif diff --git a/kernel/fs/kernfs/Kconfig b/kernel/fs/kernfs/Kconfig new file mode 100644 index 000000000..397b5f7a7 --- /dev/null +++ b/kernel/fs/kernfs/Kconfig @@ -0,0 +1,7 @@ +# +# KERNFS should be selected by its users +# + +config KERNFS + bool + default n diff --git a/kernel/fs/kernfs/Makefile b/kernel/fs/kernfs/Makefile new file mode 100644 index 000000000..674337c76 --- /dev/null +++ b/kernel/fs/kernfs/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the kernfs pseudo filesystem +# + +obj-y := mount.o inode.o dir.o file.o symlink.o diff --git a/kernel/fs/kernfs/dir.c b/kernel/fs/kernfs/dir.c new file mode 100644 index 000000000..2d48d28e1 --- /dev/null +++ b/kernel/fs/kernfs/dir.c @@ -0,0 +1,1464 @@ +/* + * fs/kernfs/dir.c - kernfs directory implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "kernfs-internal.h" + +DEFINE_MUTEX(kernfs_mutex); +static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ + +#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) + +static bool kernfs_active(struct kernfs_node *kn) +{ + lockdep_assert_held(&kernfs_mutex); + return atomic_read(&kn->active) >= 0; +} + +static bool kernfs_lockdep(struct kernfs_node *kn) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + return kn->flags & KERNFS_LOCKDEP; +#else + return false; +#endif +} + +static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) +{ + return strlcpy(buf, kn->parent ? kn->name : "/", buflen); +} + +static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf, + size_t buflen) +{ + char *p = buf + buflen; + int len; + + *--p = '\0'; + + do { + len = strlen(kn->name); + if (p - buf < len + 1) { + buf[0] = '\0'; + p = NULL; + break; + } + p -= len; + memcpy(p, kn->name, len); + *--p = '/'; + kn = kn->parent; + } while (kn && kn->parent); + + return p; +} + +/** + * kernfs_name - obtain the name of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Copies the name of @kn into @buf of @buflen bytes. The behavior is + * similar to strlcpy(). It returns the length of @kn's name and if @buf + * isn't long enough, it's filled upto @buflen-1 and nul terminated. + * + * This function can be called from any context. + */ +int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + ret = kernfs_name_locked(kn, buf, buflen); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + return ret; +} + +/** + * kernfs_path - build full path of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Builds and returns the full path of @kn in @buf of @buflen bytes. The + * path is built from the end of @buf so the returned pointer usually + * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated + * and %NULL is returned. + */ +char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + char *p; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + p = kernfs_path_locked(kn, buf, buflen); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + return p; +} +EXPORT_SYMBOL_GPL(kernfs_path); + +/** + * pr_cont_kernfs_name - pr_cont name of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_name(struct kernfs_node *kn) +{ + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); + pr_cont("%s", kernfs_pr_cont_buf); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); +} + +/** + * pr_cont_kernfs_path - pr_cont path of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_path(struct kernfs_node *kn) +{ + unsigned long flags; + char *p; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + p = kernfs_path_locked(kn, kernfs_pr_cont_buf, + sizeof(kernfs_pr_cont_buf)); + if (p) + pr_cont("%s", p); + else + pr_cont(""); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); +} + +/** + * kernfs_get_parent - determine the parent node and pin it + * @kn: kernfs_node of interest + * + * Determines @kn's parent, pins and returns it. This function can be + * called from any context. + */ +struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + parent = kn->parent; + kernfs_get(parent); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return parent; +} + +/** + * kernfs_name_hash + * @name: Null terminated string to hash + * @ns: Namespace tag to hash + * + * Returns 31 bit hash of ns + name (so it fits in an off_t ) + */ +static unsigned int kernfs_name_hash(const char *name, const void *ns) +{ + unsigned long hash = init_name_hash(); + unsigned int len = strlen(name); + while (len--) + hash = partial_name_hash(*name++, hash); + hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); + hash &= 0x7fffffffU; + /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ + if (hash < 2) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + return hash; +} + +static int kernfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct kernfs_node *kn) +{ + if (hash < kn->hash) + return -1; + if (hash > kn->hash) + return 1; + if (ns < kn->ns) + return -1; + if (ns > kn->ns) + return 1; + return strcmp(name, kn->name); +} + +static int kernfs_sd_compare(const struct kernfs_node *left, + const struct kernfs_node *right) +{ + return kernfs_name_compare(left->hash, left->name, left->ns, right); +} + +/** + * kernfs_link_sibling - link kernfs_node into sibling rbtree + * @kn: kernfs_node of interest + * + * Link @kn into its sibling rbtree which starts from + * @kn->parent->dir.children. + * + * Locking: + * mutex_lock(kernfs_mutex) + * + * RETURNS: + * 0 on susccess -EEXIST on failure. + */ +static int kernfs_link_sibling(struct kernfs_node *kn) +{ + struct rb_node **node = &kn->parent->dir.children.rb_node; + struct rb_node *parent = NULL; + + while (*node) { + struct kernfs_node *pos; + int result; + + pos = rb_to_kn(*node); + parent = *node; + result = kernfs_sd_compare(kn, pos); + if (result < 0) + node = &pos->rb.rb_left; + else if (result > 0) + node = &pos->rb.rb_right; + else + return -EEXIST; + } + + /* add new node and rebalance the tree */ + rb_link_node(&kn->rb, parent, node); + rb_insert_color(&kn->rb, &kn->parent->dir.children); + + /* successfully added, account subdir number */ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + + return 0; +} + +/** + * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree + * @kn: kernfs_node of interest + * + * Try to unlink @kn from its sibling rbtree which starts from + * kn->parent->dir.children. Returns %true if @kn was actually + * removed, %false if @kn wasn't on the rbtree. + * + * Locking: + * mutex_lock(kernfs_mutex) + */ +static bool kernfs_unlink_sibling(struct kernfs_node *kn) +{ + if (RB_EMPTY_NODE(&kn->rb)) + return false; + + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs--; + + rb_erase(&kn->rb, &kn->parent->dir.children); + RB_CLEAR_NODE(&kn->rb); + return true; +} + +/** + * kernfs_get_active - get an active reference to kernfs_node + * @kn: kernfs_node to get an active reference to + * + * Get an active reference of @kn. This function is noop if @kn + * is NULL. + * + * RETURNS: + * Pointer to @kn on success, NULL on failure. + */ +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) +{ + if (unlikely(!kn)) + return NULL; + + if (!atomic_inc_unless_negative(&kn->active)) + return NULL; + + if (kernfs_lockdep(kn)) + rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); + return kn; +} + +/** + * kernfs_put_active - put an active reference to kernfs_node + * @kn: kernfs_node to put an active reference to + * + * Put an active reference to @kn. This function is noop if @kn + * is NULL. + */ +void kernfs_put_active(struct kernfs_node *kn) +{ + struct kernfs_root *root = kernfs_root(kn); + int v; + + if (unlikely(!kn)) + return; + + if (kernfs_lockdep(kn)) + rwsem_release(&kn->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&kn->active); + if (likely(v != KN_DEACTIVATED_BIAS)) + return; + + wake_up_all(&root->deactivate_waitq); +} + +/** + * kernfs_drain - drain kernfs_node + * @kn: kernfs_node to drain + * + * Drain existing usages and nuke all existing mmaps of @kn. Mutiple + * removers may invoke this function concurrently on @kn and all will + * return after draining is complete. + */ +static void kernfs_drain(struct kernfs_node *kn) + __releases(&kernfs_mutex) __acquires(&kernfs_mutex) +{ + struct kernfs_root *root = kernfs_root(kn); + + lockdep_assert_held(&kernfs_mutex); + WARN_ON_ONCE(kernfs_active(kn)); + + mutex_unlock(&kernfs_mutex); + + if (kernfs_lockdep(kn)) { + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) + lock_contended(&kn->dep_map, _RET_IP_); + } + + /* but everyone should wait for draining */ + wait_event(root->deactivate_waitq, + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); + + if (kernfs_lockdep(kn)) { + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); + } + + kernfs_unmap_bin_file(kn); + + mutex_lock(&kernfs_mutex); +} + +/** + * kernfs_get - get a reference count on a kernfs_node + * @kn: the target kernfs_node + */ +void kernfs_get(struct kernfs_node *kn) +{ + if (kn) { + WARN_ON(!atomic_read(&kn->count)); + atomic_inc(&kn->count); + } +} +EXPORT_SYMBOL_GPL(kernfs_get); + +/** + * kernfs_put - put a reference count on a kernfs_node + * @kn: the target kernfs_node + * + * Put a reference count of @kn and destroy it if it reached zero. + */ +void kernfs_put(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + struct kernfs_root *root; + + if (!kn || !atomic_dec_and_test(&kn->count)) + return; + root = kernfs_root(kn); + repeat: + /* + * Moving/renaming is always done while holding reference. + * kn->parent won't change beneath us. + */ + parent = kn->parent; + + WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, + "kernfs_put: %s/%s: released with incorrect active_ref %d\n", + parent ? parent->name : "", kn->name, atomic_read(&kn->active)); + + if (kernfs_type(kn) == KERNFS_LINK) + kernfs_put(kn->symlink.target_kn); + + kfree_const(kn->name); + + if (kn->iattr) { + if (kn->iattr->ia_secdata) + security_release_secctx(kn->iattr->ia_secdata, + kn->iattr->ia_secdata_len); + simple_xattrs_free(&kn->iattr->xattrs); + } + kfree(kn->iattr); + ida_simple_remove(&root->ino_ida, kn->ino); + kmem_cache_free(kernfs_node_cache, kn); + + kn = parent; + if (kn) { + if (atomic_dec_and_test(&kn->count)) + goto repeat; + } else { + /* just released the root kn, free @root too */ + ida_destroy(&root->ino_ida); + kfree(root); + } +} +EXPORT_SYMBOL_GPL(kernfs_put); + +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Always perform fresh lookup for negatives */ + if (d_really_is_negative(dentry)) + goto out_bad_unlocked; + + kn = dentry->d_fsdata; + mutex_lock(&kernfs_mutex); + + /* The kernfs node has been deactivated */ + if (!kernfs_active(kn)) + goto out_bad; + + /* The kernfs node has been moved? */ + if (dentry->d_parent->d_fsdata != kn->parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kn->name) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (kn->parent && kernfs_ns_enabled(kn->parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + mutex_unlock(&kernfs_mutex); + return 1; +out_bad: + mutex_unlock(&kernfs_mutex); +out_bad_unlocked: + return 0; +} + +static void kernfs_dop_release(struct dentry *dentry) +{ + kernfs_put(dentry->d_fsdata); +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, + .d_release = kernfs_dop_release, +}; + +/** + * kernfs_node_from_dentry - determine kernfs_node associated with a dentry + * @dentry: the dentry in question + * + * Return the kernfs_node associated with @dentry. If @dentry is not a + * kernfs one, %NULL is returned. + * + * While the returned kernfs_node will stay accessible as long as @dentry + * is accessible, the returned node can be in any state and the caller is + * fully responsible for determining what's accessible. + */ +struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) +{ + if (dentry->d_sb->s_op == &kernfs_sops) + return dentry->d_fsdata; + return NULL; +} + +static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + const char *name, umode_t mode, + unsigned flags) +{ + struct kernfs_node *kn; + int ret; + + name = kstrdup_const(name, GFP_KERNEL); + if (!name) + return NULL; + + kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); + if (!kn) + goto err_out1; + + /* + * If the ino of the sysfs entry created for a kmem cache gets + * allocated from an ida layer, which is accounted to the memcg that + * owns the cache, the memcg will get pinned forever. So do not account + * ino ida allocations. + */ + ret = ida_simple_get(&root->ino_ida, 1, 0, + GFP_KERNEL | __GFP_NOACCOUNT); + if (ret < 0) + goto err_out2; + kn->ino = ret; + + atomic_set(&kn->count, 1); + atomic_set(&kn->active, KN_DEACTIVATED_BIAS); + RB_CLEAR_NODE(&kn->rb); + + kn->name = name; + kn->mode = mode; + kn->flags = flags; + + return kn; + + err_out2: + kmem_cache_free(kernfs_node_cache, kn); + err_out1: + kfree_const(name); + return NULL; +} + +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags) +{ + struct kernfs_node *kn; + + kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + if (kn) { + kernfs_get(parent); + kn->parent = parent; + } + return kn; +} + +/** + * kernfs_add_one - add kernfs_node to parent without warning + * @kn: kernfs_node to be added + * + * The caller must already have initialized @kn->parent. This + * function increments nlink of the parent's inode if @kn is a + * directory and link into the children list of the parent. + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int kernfs_add_one(struct kernfs_node *kn) +{ + struct kernfs_node *parent = kn->parent; + struct kernfs_iattrs *ps_iattr; + bool has_ns; + int ret; + + mutex_lock(&kernfs_mutex); + + ret = -EINVAL; + has_ns = kernfs_ns_enabled(parent); + if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name)) + goto out_unlock; + + if (kernfs_type(parent) != KERNFS_DIR) + goto out_unlock; + + ret = -ENOENT; + if (parent->flags & KERNFS_EMPTY_DIR) + goto out_unlock; + + if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) + goto out_unlock; + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + + ret = kernfs_link_sibling(kn); + if (ret) + goto out_unlock; + + /* Update timestamps on the parent */ + ps_iattr = parent->iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + mutex_unlock(&kernfs_mutex); + + /* + * Activate the new node unless CREATE_DEACTIVATED is requested. + * If not activated here, the kernfs user is responsible for + * activating the node with kernfs_activate(). A node which hasn't + * been activated is not visible to userland and its removal won't + * trigger deactivation. + */ + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); + return 0; + +out_unlock: + mutex_unlock(&kernfs_mutex); + return ret; +} + +/** + * kernfs_find_ns - find kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent. Returns pointer to + * the found kernfs_node on success, %NULL on failure. + */ +static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, + const unsigned char *name, + const void *ns) +{ + struct rb_node *node = parent->dir.children.rb_node; + bool has_ns = kernfs_ns_enabled(parent); + unsigned int hash; + + lockdep_assert_held(&kernfs_mutex); + + if (has_ns != (bool)ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, name); + return NULL; + } + + hash = kernfs_name_hash(name, ns); + while (node) { + struct kernfs_node *kn; + int result; + + kn = rb_to_kn(node); + result = kernfs_name_compare(hash, name, ns, kn); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return kn; + } + return NULL; +} + +/** + * kernfs_find_and_get_ns - find and get kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, + const char *name, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_find_ns(parent, name, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} +EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); + +/** + * kernfs_create_root - create a new kernfs hierarchy + * @scops: optional syscall operations for the hierarchy + * @flags: KERNFS_ROOT_* flags + * @priv: opaque data associated with the new directory + * + * Returns the root of the new hierarchy on success, ERR_PTR() value on + * failure. + */ +struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, + unsigned int flags, void *priv) +{ + struct kernfs_root *root; + struct kernfs_node *kn; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + ida_init(&root->ino_ida); + INIT_LIST_HEAD(&root->supers); + + kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + KERNFS_DIR); + if (!kn) { + ida_destroy(&root->ino_ida); + kfree(root); + return ERR_PTR(-ENOMEM); + } + + kn->priv = priv; + kn->dir.root = root; + + root->syscall_ops = scops; + root->flags = flags; + root->kn = kn; + init_waitqueue_head(&root->deactivate_waitq); + + if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); + + return root; +} + +/** + * kernfs_destroy_root - destroy a kernfs hierarchy + * @root: root of the hierarchy to destroy + * + * Destroy the hierarchy anchored at @root by removing all existing + * directories and destroying @root. + */ +void kernfs_destroy_root(struct kernfs_root *root) +{ + kernfs_remove(root->kn); /* will also free @root */ +} + +/** + * kernfs_create_dir_ns - create a directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * @mode: mode of the new directory + * @priv: opaque data associated with the new directory + * @ns: optional namespace tag of the directory + * + * Returns the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, + const char *name, umode_t mode, + void *priv, const void *ns) +{ + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->dir.root = parent->dir.root; + kn->ns = ns; + kn->priv = priv; + + /* link in */ + rc = kernfs_add_one(kn); + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + +/** + * kernfs_create_empty_dir - create an always empty directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * + * Returns the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, + const char *name) +{ + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->flags |= KERNFS_EMPTY_DIR; + kn->dir.root = parent->dir.root; + kn->ns = NULL; + kn->priv = NULL; + + /* link in */ + rc = kernfs_add_one(kn); + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + +static struct dentry *kernfs_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct dentry *ret; + struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *kn; + struct inode *inode; + const void *ns = NULL; + + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dir->i_sb)->ns; + + kn = kernfs_find_ns(parent, dentry->d_name.name, ns); + + /* no such entry */ + if (!kn || !kernfs_active(kn)) { + ret = NULL; + goto out_unlock; + } + kernfs_get(kn); + dentry->d_fsdata = kn; + + /* attach dentry and inode */ + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) { + ret = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + /* instantiate and hash dentry */ + ret = d_splice_alias(inode, dentry); + out_unlock: + mutex_unlock(&kernfs_mutex); + return ret; +} + +static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; + int ret; + + if (!scops || !scops->mkdir) + return -EPERM; + + if (!kernfs_get_active(parent)) + return -ENODEV; + + ret = scops->mkdir(parent, dentry->d_name.name, mode); + + kernfs_put_active(parent); + return ret; +} + +static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; + + if (!scops || !scops->rmdir) + return -EPERM; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ret = scops->rmdir(kn); + + kernfs_put_active(kn); + return ret; +} + +static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *new_parent = new_dir->i_private; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; + + if (!scops || !scops->rename) + return -EPERM; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + if (!kernfs_get_active(new_parent)) { + kernfs_put_active(kn); + return -ENODEV; + } + + ret = scops->rename(kn, new_parent, new_dentry->d_name.name); + + kernfs_put_active(new_parent); + kernfs_put_active(kn); + return ret; +} + +const struct inode_operations kernfs_dir_iops = { + .lookup = kernfs_iop_lookup, + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + + .mkdir = kernfs_iop_mkdir, + .rmdir = kernfs_iop_rmdir, + .rename = kernfs_iop_rename, +}; + +static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) +{ + struct kernfs_node *last; + + while (true) { + struct rb_node *rbn; + + last = pos; + + if (kernfs_type(pos) != KERNFS_DIR) + break; + + rbn = rb_first(&pos->dir.children); + if (!rbn) + break; + + pos = rb_to_kn(rbn); + } + + return last; +} + +/** + * kernfs_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: kernfs_node whose descendants to walk + * + * Find the next descendant to visit for post-order traversal of @root's + * descendants. @root is included in the iteration and the last node to be + * visited. + */ +static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, + struct kernfs_node *root) +{ + struct rb_node *rbn; + + lockdep_assert_held(&kernfs_mutex); + + /* if first iteration, visit leftmost descendant which may be root */ + if (!pos) + return kernfs_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + + /* if there's an unvisited sibling, visit its leftmost descendant */ + rbn = rb_next(&pos->rb); + if (rbn) + return kernfs_leftmost_descendant(rb_to_kn(rbn)); + + /* no sibling left, visit parent */ + return pos->parent; +} + +/** + * kernfs_activate - activate a node which started deactivated + * @kn: kernfs_node whose subtree is to be activated + * + * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node + * needs to be explicitly activated. A node which hasn't been activated + * isn't visible to userland and deactivation is skipped during its + * removal. This is useful to construct atomic init sequences where + * creation of multiple nodes should either succeed or fail atomically. + * + * The caller is responsible for ensuring that this function is not called + * after kernfs_remove*() is invoked on @kn. + */ +void kernfs_activate(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + mutex_lock(&kernfs_mutex); + + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) { + if (!pos || (pos->flags & KERNFS_ACTIVATED)) + continue; + + WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); + WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); + + atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); + pos->flags |= KERNFS_ACTIVATED; + } + + mutex_unlock(&kernfs_mutex); +} + +static void __kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + lockdep_assert_held(&kernfs_mutex); + + /* + * Short-circuit if non-root @kn has already finished removal. + * This is for kernfs_remove_self() which plays with active ref + * after removal. + */ + if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb))) + return; + + pr_debug("kernfs %s: removing\n", kn->name); + + /* prevent any new usage under @kn by deactivating all nodes */ + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) + if (kernfs_active(pos)) + atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + + /* deactivate and unlink the subtree node-by-node */ + do { + pos = kernfs_leftmost_descendant(kn); + + /* + * kernfs_drain() drops kernfs_mutex temporarily and @pos's + * base ref could have been put by someone else by the time + * the function returns. Make sure it doesn't go away + * underneath us. + */ + kernfs_get(pos); + + /* + * Drain iff @kn was activated. This avoids draining and + * its lockdep annotations for nodes which have never been + * activated and allows embedding kernfs_remove() in create + * error paths without worrying about draining. + */ + if (kn->flags & KERNFS_ACTIVATED) + kernfs_drain(pos); + else + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + /* + * kernfs_unlink_sibling() succeeds once per node. Use it + * to decide who's responsible for cleanups. + */ + if (!pos->parent || kernfs_unlink_sibling(pos)) { + struct kernfs_iattrs *ps_iattr = + pos->parent ? pos->parent->iattr : NULL; + + /* update timestamps on the parent */ + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + + kernfs_put(pos); + } + + kernfs_put(pos); + } while (pos != kn); +} + +/** + * kernfs_remove - remove a kernfs_node recursively + * @kn: the kernfs_node to remove + * + * Remove @kn along with all its subdirectories and files. + */ +void kernfs_remove(struct kernfs_node *kn) +{ + mutex_lock(&kernfs_mutex); + __kernfs_remove(kn); + mutex_unlock(&kernfs_mutex); +} + +/** + * kernfs_break_active_protection - break out of active protection + * @kn: the self kernfs_node + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. Each invocation of + * this function must also be matched with an invocation of + * kernfs_unbreak_active_protection(). + * + * This function releases the active reference of @kn the caller is + * holding. Once this function is called, @kn may be removed at any point + * and the caller is solely responsible for ensuring that the objects it + * dereferences are accessible. + */ +void kernfs_break_active_protection(struct kernfs_node *kn) +{ + /* + * Take out ourself out of the active ref dependency chain. If + * we're called without an active ref, lockdep will complain. + */ + kernfs_put_active(kn); +} + +/** + * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() + * @kn: the self kernfs_node + * + * If kernfs_break_active_protection() was called, this function must be + * invoked before finishing the kernfs operation. Note that while this + * function restores the active reference, it doesn't and can't actually + * restore the active protection - @kn may already or be in the process of + * being removed. Once kernfs_break_active_protection() is invoked, that + * protection is irreversibly gone for the kernfs operation instance. + * + * While this function may be called at any point after + * kernfs_break_active_protection() is invoked, its most useful location + * would be right before the enclosing kernfs operation returns. + */ +void kernfs_unbreak_active_protection(struct kernfs_node *kn) +{ + /* + * @kn->active could be in any state; however, the increment we do + * here will be undone as soon as the enclosing kernfs operation + * finishes and this temporary bump can't break anything. If @kn + * is alive, nothing changes. If @kn is being deactivated, the + * soon-to-follow put will either finish deactivation or restore + * deactivated state. If @kn is already removed, the temporary + * bump is guaranteed to be gone before @kn is released. + */ + atomic_inc(&kn->active); + if (kernfs_lockdep(kn)) + rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); +} + +/** + * kernfs_remove_self - remove a kernfs_node from its own method + * @kn: the self kernfs_node to remove + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. This can be used to + * implement a file operation which deletes itself. + * + * For example, the "delete" file for a sysfs device directory can be + * implemented by invoking kernfs_remove_self() on the "delete" file + * itself. This function breaks the circular dependency of trying to + * deactivate self while holding an active ref itself. It isn't necessary + * to modify the usual removal path to use kernfs_remove_self(). The + * "delete" implementation can simply invoke kernfs_remove_self() on self + * before proceeding with the usual removal path. kernfs will ignore later + * kernfs_remove() on self. + * + * kernfs_remove_self() can be called multiple times concurrently on the + * same kernfs_node. Only the first one actually performs removal and + * returns %true. All others will wait until the kernfs operation which + * won self-removal finishes and return %false. Note that the losers wait + * for the completion of not only the winning kernfs_remove_self() but also + * the whole kernfs_ops which won the arbitration. This can be used to + * guarantee, for example, all concurrent writes to a "delete" file to + * finish only after the whole operation is complete. + */ +bool kernfs_remove_self(struct kernfs_node *kn) +{ + bool ret; + + mutex_lock(&kernfs_mutex); + kernfs_break_active_protection(kn); + + /* + * SUICIDAL is used to arbitrate among competing invocations. Only + * the first one will actually perform removal. When the removal + * is complete, SUICIDED is set and the active ref is restored + * while holding kernfs_mutex. The ones which lost arbitration + * waits for SUICDED && drained which can happen only after the + * enclosing kernfs operation which executed the winning instance + * of kernfs_remove_self() finished. + */ + if (!(kn->flags & KERNFS_SUICIDAL)) { + kn->flags |= KERNFS_SUICIDAL; + __kernfs_remove(kn); + kn->flags |= KERNFS_SUICIDED; + ret = true; + } else { + wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; + DEFINE_WAIT(wait); + + while (true) { + prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); + + if ((kn->flags & KERNFS_SUICIDED) && + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) + break; + + mutex_unlock(&kernfs_mutex); + schedule(); + mutex_lock(&kernfs_mutex); + } + finish_wait(waitq, &wait); + WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); + ret = false; + } + + /* + * This must be done while holding kernfs_mutex; otherwise, waiting + * for SUICIDED && deactivated could finish prematurely. + */ + kernfs_unbreak_active_protection(kn); + + mutex_unlock(&kernfs_mutex); + return ret; +} + +/** + * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it + * @parent: parent of the target + * @name: name of the kernfs_node to remove + * @ns: namespace tag of the kernfs_node to remove + * + * Look for the kernfs_node with @name and @ns under @parent and remove it. + * Returns 0 on success, -ENOENT if such entry doesn't exist. + */ +int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, + const void *ns) +{ + struct kernfs_node *kn; + + if (!parent) { + WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", + name); + return -ENOENT; + } + + mutex_lock(&kernfs_mutex); + + kn = kernfs_find_ns(parent, name, ns); + if (kn) + __kernfs_remove(kn); + + mutex_unlock(&kernfs_mutex); + + if (kn) + return 0; + else + return -ENOENT; +} + +/** + * kernfs_rename_ns - move and rename a kernfs_node + * @kn: target node + * @new_parent: new parent to put @sd under + * @new_name: new name + * @new_ns: new namespace tag + */ +int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name, const void *new_ns) +{ + struct kernfs_node *old_parent; + const char *old_name = NULL; + int error; + + /* can't move or rename root */ + if (!kn->parent) + return -EINVAL; + + mutex_lock(&kernfs_mutex); + + error = -ENOENT; + if (!kernfs_active(kn) || !kernfs_active(new_parent) || + (new_parent->flags & KERNFS_EMPTY_DIR)) + goto out; + + error = 0; + if ((kn->parent == new_parent) && (kn->ns == new_ns) && + (strcmp(kn->name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (kernfs_find_ns(new_parent, new_name, new_ns)) + goto out; + + /* rename kernfs_node */ + if (strcmp(kn->name, new_name) != 0) { + error = -ENOMEM; + new_name = kstrdup_const(new_name, GFP_KERNEL); + if (!new_name) + goto out; + } else { + new_name = NULL; + } + + /* + * Move to the appropriate place in the appropriate directories rbtree. + */ + kernfs_unlink_sibling(kn); + kernfs_get(new_parent); + + /* rename_lock protects ->parent and ->name accessors */ + spin_lock_irq(&kernfs_rename_lock); + + old_parent = kn->parent; + kn->parent = new_parent; + + kn->ns = new_ns; + if (new_name) { + old_name = kn->name; + kn->name = new_name; + } + + spin_unlock_irq(&kernfs_rename_lock); + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + kernfs_link_sibling(kn); + + kernfs_put(old_parent); + kfree_const(old_name); + + error = 0; + out: + mutex_unlock(&kernfs_mutex); + return error; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct kernfs_node *kn) +{ + return (kn->mode >> 12) & 15; +} + +static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) +{ + kernfs_put(filp->private_data); + return 0; +} + +static struct kernfs_node *kernfs_dir_pos(const void *ns, + struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) +{ + if (pos) { + int valid = kernfs_active(pos) && + pos->parent == parent && hash == pos->hash; + kernfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (hash > 1) && (hash < INT_MAX)) { + struct rb_node *node = parent->dir.children.rb_node; + while (node) { + pos = rb_to_kn(node); + + if (hash < pos->hash) + node = node->rb_left; + else if (hash > pos->hash) + node = node->rb_right; + else + break; + } + } + /* Skip over entries which are dying/dead or in the wrong namespace */ + while (pos && (!kernfs_active(pos) || pos->ns != ns)) { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } + return pos; +} + +static struct kernfs_node *kernfs_dir_next_pos(const void *ns, + struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) +{ + pos = kernfs_dir_pos(ns, parent, ino, pos); + if (pos) { + do { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } while (pos && (!kernfs_active(pos) || pos->ns != ns)); + } + return pos; +} + +static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *pos = file->private_data; + const void *ns = NULL; + + if (!dir_emit_dots(file, ctx)) + return 0; + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dentry->d_sb)->ns; + + for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); + pos; + pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { + const char *name = pos->name; + unsigned int type = dt_type(pos); + int len = strlen(name); + ino_t ino = pos->ino; + + ctx->pos = pos->hash; + file->private_data = pos; + kernfs_get(pos); + + mutex_unlock(&kernfs_mutex); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; + mutex_lock(&kernfs_mutex); + } + mutex_unlock(&kernfs_mutex); + file->private_data = NULL; + ctx->pos = INT_MAX; + return 0; +} + +static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, + int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +const struct file_operations kernfs_dir_fops = { + .read = generic_read_dir, + .iterate = kernfs_fop_readdir, + .release = kernfs_dir_fop_release, + .llseek = kernfs_dir_fop_llseek, +}; diff --git a/kernel/fs/kernfs/file.c b/kernel/fs/kernfs/file.c new file mode 100644 index 000000000..2bacb9988 --- /dev/null +++ b/kernel/fs/kernfs/file.c @@ -0,0 +1,954 @@ +/* + * fs/kernfs/file.c - kernfs file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "kernfs-internal.h" + +/* + * There's one kernfs_open_file for each open file and one kernfs_open_node + * for each kernfs_node with one or more open files. + * + * kernfs_node->attr.open points to kernfs_open_node. attr.open is + * protected by kernfs_open_node_lock. + * + * filp->private_data points to seq_file whose ->private points to + * kernfs_open_file. kernfs_open_files are chained at + * kernfs_open_node->files, which is protected by kernfs_open_file_mutex. + */ +static DEFINE_SPINLOCK(kernfs_open_node_lock); +static DEFINE_MUTEX(kernfs_open_file_mutex); + +struct kernfs_open_node { + atomic_t refcnt; + atomic_t event; + wait_queue_head_t poll; + struct list_head files; /* goes through kernfs_open_file.list */ +}; + +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for NULL. + */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + +static struct kernfs_open_file *kernfs_of(struct file *file) +{ + return ((struct seq_file *)file->private_data)->private; +} + +/* + * Determine the kernfs_ops for the given kernfs_node. This function must + * be called while holding an active reference. + */ +static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) +{ + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); + return kn->attr.ops; +} + +/* + * As kernfs_seq_stop() is also called after kernfs_seq_start() or + * kernfs_seq_next() failure, it needs to distinguish whether it's stopping + * a seq_file iteration which is fully initialized with an active reference + * or an aborted kernfs_seq_start() due to get_active failure. The + * position pointer is the only context for each seq_file iteration and + * thus the stop condition should be encoded in it. As the return value is + * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable + * choice to indicate get_active failure. + * + * Unfortunately, this is complicated due to the optional custom seq_file + * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() + * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or + * custom seq_file operations and thus can't decide whether put_active + * should be performed or not only on ERR_PTR(-ENODEV). + * + * This is worked around by factoring out the custom seq_stop() and + * put_active part into kernfs_seq_stop_active(), skipping it from + * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after + * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures + * that kernfs_seq_stop_active() is skipped only after get_active failure. + */ +static void kernfs_seq_stop_active(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + kernfs_put_active(of->kn); +} + +static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops; + + /* + * @of->mutex nests outside active ref and is primarily to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) + return ERR_PTR(-ENODEV); + + ops = kernfs_ops(of->kn); + if (ops->seq_start) { + void *next = ops->seq_start(sf, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; + } +} + +static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_next) { + void *next = ops->seq_next(sf, v, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } +} + +static void kernfs_seq_stop(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + if (v != ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, v); + mutex_unlock(&of->mutex); +} + +static int kernfs_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + of->event = atomic_read(&of->kn->attr.open->event); + + return of->kn->attr.ops->seq_show(sf, v); +} + +static const struct seq_operations kernfs_seq_ops = { + .start = kernfs_seq_start, + .next = kernfs_seq_next, + .stop = kernfs_seq_stop, + .show = kernfs_seq_show, +}; + +/* + * As reading a bin file can have side-effects, the exact offset and bytes + * specified in read(2) call should be passed to the read callback making + * it difficult to use seq_file. Implement simplistic custom buffering for + * bin files. + */ +static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, + char __user *user_buf, size_t count, + loff_t *ppos) +{ + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = of->prealloc_buf; + if (!buf) + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file, and + * to provide exclusive access to ->prealloc_buf (when that exists). + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + len = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; + } + + of->event = atomic_read(&of->kn->attr.open->event); + ops = kernfs_ops(of->kn); + if (ops->read) + len = ops->read(of, buf, len, *ppos); + else + len = -EINVAL; + + if (len < 0) + goto out_unlock; + + if (copy_to_user(user_buf, buf, len)) { + len = -EFAULT; + goto out_unlock; + } + + *ppos += len; + + out_unlock: + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + out_free: + if (buf != of->prealloc_buf) + kfree(buf); + return len; +} + +/** + * kernfs_fop_read - kernfs vfs read callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + */ +static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + + if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) + return seq_read(file, user_buf, count, ppos); + else + return kernfs_file_direct_read(of, user_buf, count, ppos); +} + +/** + * kernfs_fop_write - kernfs vfs write callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Copy data in from userland and pass it to the matching kernfs write + * operation. + * + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on + * the first write. Hint: if you're writing a value, first read the file, + * modify only the the value you're changing, then write entire buffer + * back. + */ +static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + size_t len; + char *buf; + + if (of->atomic_write_len) { + len = count; + if (len > of->atomic_write_len) + return -E2BIG; + } else { + len = min_t(size_t, count, PAGE_SIZE); + } + + buf = of->prealloc_buf; + if (!buf) + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file, and + * to provide exclusive access to ->prealloc_buf (when that exists). + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + mutex_unlock(&of->mutex); + len = -ENODEV; + goto out_free; + } + + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_unlock; + } + buf[len] = '\0'; /* guarantee string termination */ + + ops = kernfs_ops(of->kn); + if (ops->write) + len = ops->write(of, buf, len, *ppos); + else + len = -EINVAL; + + if (len > 0) + *ppos += len; + +out_unlock: + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); +out_free: + if (buf != of->prealloc_buf) + kfree(buf); + return len; +} + +static void kernfs_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + + if (!of->vm_ops) + return; + + if (!kernfs_get_active(of->kn)) + return; + + if (of->vm_ops->open) + of->vm_ops->open(vma); + + kernfs_put_active(of->kn); +} + +static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (of->vm_ops->fault) + ret = of->vm_ops->fault(vma, vmf); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (of->vm_ops->page_mkwrite) + ret = of->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return -EINVAL; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = -EINVAL; + if (of->vm_ops->access) + ret = of->vm_ops->access(vma, addr, buf, len, write); + + kernfs_put_active(of->kn); + return ret; +} + +#ifdef CONFIG_NUMA +static int kernfs_vma_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = 0; + if (of->vm_ops->set_policy) + ret = of->vm_ops->set_policy(vma, new); + + kernfs_put_active(of->kn); + return ret; +} + +static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + struct mempolicy *pol; + + if (!of->vm_ops) + return vma->vm_policy; + + if (!kernfs_get_active(of->kn)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (of->vm_ops->get_policy) + pol = of->vm_ops->get_policy(vma, addr); + + kernfs_put_active(of->kn); + return pol; +} + +#endif + +static const struct vm_operations_struct kernfs_vm_ops = { + .open = kernfs_vma_open, + .fault = kernfs_vma_fault, + .page_mkwrite = kernfs_vma_page_mkwrite, + .access = kernfs_vma_access, +#ifdef CONFIG_NUMA + .set_policy = kernfs_vma_set_policy, + .get_policy = kernfs_vma_get_policy, +#endif +}; + +static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + int rc; + + /* + * mmap path and of->mutex are prone to triggering spurious lockdep + * warnings and we don't want to add spurious locking dependency + * between the two. Check whether mmap is actually implemented + * without grabbing @of->mutex by testing HAS_MMAP flag. See the + * comment in kernfs_file_open() for more details. + */ + if (!(of->kn->flags & KERNFS_HAS_MMAP)) + return -ENODEV; + + mutex_lock(&of->mutex); + + rc = -ENODEV; + if (!kernfs_get_active(of->kn)) + goto out_unlock; + + ops = kernfs_ops(of->kn); + rc = ops->mmap(of, vma); + if (rc) + goto out_put; + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (of->mmapped && of->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. + */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + of->mmapped = 1; + of->vm_ops = vma->vm_ops; + vma->vm_ops = &kernfs_vm_ops; +out_put: + kernfs_put_active(of->kn); +out_unlock: + mutex_unlock(&of->mutex); + + return rc; +} + +/** + * kernfs_get_open_node - get or create kernfs_open_node + * @kn: target kernfs_node + * @of: kernfs_open_file for this instance of open + * + * If @kn->attr.open exists, increment its reference count; otherwise, + * create one. @of is chained to the files list. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int kernfs_get_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on, *new_on = NULL; + + retry: + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irq(&kernfs_open_node_lock); + + if (!kn->attr.open && new_on) { + kn->attr.open = new_on; + new_on = NULL; + } + + on = kn->attr.open; + if (on) { + atomic_inc(&on->refcnt); + list_add_tail(&of->list, &on->files); + } + + spin_unlock_irq(&kernfs_open_node_lock); + mutex_unlock(&kernfs_open_file_mutex); + + if (on) { + kfree(new_on); + return 0; + } + + /* not there, initialize a new one and retry */ + new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); + if (!new_on) + return -ENOMEM; + + atomic_set(&new_on->refcnt, 0); + atomic_set(&new_on->event, 1); + init_waitqueue_head(&new_on->poll); + INIT_LIST_HEAD(&new_on->files); + goto retry; +} + +/** + * kernfs_put_open_node - put kernfs_open_node + * @kn: target kernfs_nodet + * @of: associated kernfs_open_file + * + * Put @kn->attr.open and unlink @of from the files list. If + * reference count reaches zero, disassociate and free it. + * + * LOCKING: + * None. + */ +static void kernfs_put_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on = kn->attr.open; + unsigned long flags; + + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irqsave(&kernfs_open_node_lock, flags); + + if (of) + list_del(&of->list); + + if (atomic_dec_and_test(&on->refcnt)) + kn->attr.open = NULL; + else + on = NULL; + + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + mutex_unlock(&kernfs_open_file_mutex); + + kfree(on); +} + +static int kernfs_fop_open(struct inode *inode, struct file *file) +{ + struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(kn); + const struct kernfs_ops *ops; + struct kernfs_open_file *of; + bool has_read, has_write, has_mmap; + int error = -EACCES; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ops = kernfs_ops(kn); + + has_read = ops->seq_show || ops->read || ops->mmap; + has_write = ops->write || ops->mmap; + has_mmap = ops->mmap; + + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; + + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } + + /* allocate a kernfs_open_file for the file */ + error = -ENOMEM; + of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); + if (!of) + goto err_out; + + /* + * The following is done to give a different lockdep key to + * @of->mutex for files which implement mmap. This is a rather + * crude way to avoid false positive lockdep warning around + * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and + * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under + * which mm->mmap_sem nests, while holding @of->mutex. As each + * open file has a separate mutex, it's okay as long as those don't + * happen on the same file. At this point, we can't easily give + * each file a separate locking class. Let's differentiate on + * whether the file has mmap or not for now. + * + * Both paths of the branch look the same. They're supposed to + * look that way and give @of->mutex different static lockdep keys. + */ + if (has_mmap) + mutex_init(&of->mutex); + else + mutex_init(&of->mutex); + + of->kn = kn; + of->file = file; + + /* + * Write path needs to atomic_write_len outside active reference. + * Cache it in open_file. See kernfs_fop_write() for details. + */ + of->atomic_write_len = ops->atomic_write_len; + + error = -EINVAL; + /* + * ->seq_show is incompatible with ->prealloc, + * as seq_read does its own allocation. + * ->read must be used instead. + */ + if (ops->prealloc && ops->seq_show) + goto err_free; + if (ops->prealloc) { + int len = of->atomic_write_len ?: PAGE_SIZE; + of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); + error = -ENOMEM; + if (!of->prealloc_buf) + goto err_free; + } + + /* + * Always instantiate seq_file even if read access doesn't use + * seq_file or is not requested. This unifies private data access + * and readable regular files are the vast majority anyway. + */ + if (ops->seq_show) + error = seq_open(file, &kernfs_seq_ops); + else + error = seq_open(file, NULL); + if (error) + goto err_free; + + ((struct seq_file *)file->private_data)->private = of; + + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ + if (file->f_mode & FMODE_WRITE) + file->f_mode |= FMODE_PWRITE; + + /* make sure we have open node struct */ + error = kernfs_get_open_node(kn, of); + if (error) + goto err_close; + + /* open succeeded, put active references */ + kernfs_put_active(kn); + return 0; + +err_close: + seq_release(inode, file); +err_free: + kfree(of->prealloc_buf); + kfree(of); +err_out: + kernfs_put_active(kn); + return error; +} + +static int kernfs_fop_release(struct inode *inode, struct file *filp) +{ + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_file *of = kernfs_of(filp); + + kernfs_put_open_node(kn, of); + seq_release(inode, filp); + kfree(of->prealloc_buf); + kfree(of); + + return 0; +} + +void kernfs_unmap_bin_file(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + + if (!(kn->flags & KERNFS_HAS_MMAP)) + return; + + spin_lock_irq(&kernfs_open_node_lock); + on = kn->attr.open; + if (on) + atomic_inc(&on->refcnt); + spin_unlock_irq(&kernfs_open_node_lock); + if (!on) + return; + + mutex_lock(&kernfs_open_file_mutex); + list_for_each_entry(of, &on->files, list) { + struct inode *inode = file_inode(of->file); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + mutex_unlock(&kernfs_open_file_mutex); + + kernfs_put_open_node(kn, NULL); +} + +/* + * Kernfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) +{ + struct kernfs_open_file *of = kernfs_of(filp); + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_node *on = kn->attr.open; + + /* need parent for the kobj, grab both */ + if (!kernfs_get_active(kn)) + goto trigger; + + poll_wait(filp, &on->poll, wait); + + kernfs_put_active(kn); + + if (of->event != atomic_read(&on->event)) + goto trigger; + + return DEFAULT_POLLMASK; + + trigger: + return DEFAULT_POLLMASK|POLLERR|POLLPRI; +} + +static void kernfs_notify_workfn(struct work_struct *work) +{ + struct kernfs_node *kn; + struct kernfs_open_node *on; + struct kernfs_super_info *info; +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); + return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); + + /* kick poll */ + spin_lock_irq(&kernfs_open_node_lock); + + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); + } + + spin_unlock_irq(&kernfs_open_node_lock); + + /* kick fsnotify */ + mutex_lock(&kernfs_mutex); + + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { + struct inode *inode; + struct dentry *dentry; + + inode = ilookup(info->sb, kn->ino); + if (!inode) + continue; + + dentry = d_find_any_alias(inode); + if (dentry) { + fsnotify_parent(NULL, dentry, FS_MODIFY); + fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, + NULL, 0); + dput(dentry); + } + + iput(inode); + } + + mutex_unlock(&kernfs_mutex); + kernfs_put(kn); + goto repeat; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any + * context. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); + unsigned long flags; + + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); + } + spin_unlock_irqrestore(&kernfs_notify_lock, flags); +} +EXPORT_SYMBOL_GPL(kernfs_notify); + +const struct file_operations kernfs_file_fops = { + .read = kernfs_fop_read, + .write = kernfs_fop_write, + .llseek = generic_file_llseek, + .mmap = kernfs_fop_mmap, + .open = kernfs_fop_open, + .release = kernfs_fop_release, + .poll = kernfs_fop_poll, +}; + +/** + * __kernfs_create_file - kernfs internal function to create a file + * @parent: directory to create the file in + * @name: name of the file + * @mode: mode of the file + * @size: size of the file + * @ops: kernfs operations for the file + * @priv: private data for the file + * @ns: optional namespace tag of the file + * @key: lockdep key for the file's active_ref, %NULL to disable lockdep + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + struct lock_class_key *key) +{ + struct kernfs_node *kn; + unsigned flags; + int rc; + + flags = KERNFS_FILE; + + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->attr.ops = ops; + kn->attr.size = size; + kn->ns = ns; + kn->priv = priv; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (key) { + lockdep_init_map(&kn->dep_map, "s_active", key, 0); + kn->flags |= KERNFS_LOCKDEP; + } +#endif + + /* + * kn->attr.ops is accesible only while holding active ref. We + * need to know whether some ops are implemented outside active + * ref. Cache their existence in flags. + */ + if (ops->seq_show) + kn->flags |= KERNFS_HAS_SEQ_SHOW; + if (ops->mmap) + kn->flags |= KERNFS_HAS_MMAP; + + rc = kernfs_add_one(kn); + if (rc) { + kernfs_put(kn); + return ERR_PTR(rc); + } + return kn; +} diff --git a/kernel/fs/kernfs/inode.c b/kernel/fs/kernfs/inode.c new file mode 100644 index 000000000..756dd56aa --- /dev/null +++ b/kernel/fs/kernfs/inode.c @@ -0,0 +1,372 @@ +/* + * fs/kernfs/inode.c - kernfs inode implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "kernfs-internal.h" + +static const struct address_space_operations kernfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static const struct inode_operations kernfs_iops = { + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, +}; + +static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) +{ + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; + struct iattr *iattrs; + + mutex_lock(&iattr_mutex); + + if (kn->iattr) + goto out_unlock; + + kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); + if (!kn->iattr) + goto out_unlock; + iattrs = &kn->iattr->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = kn->mode; + iattrs->ia_uid = GLOBAL_ROOT_UID; + iattrs->ia_gid = GLOBAL_ROOT_GID; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + simple_xattrs_init(&kn->iattr->xattrs); +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; +} + +static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + struct kernfs_iattrs *attrs; + struct iattr *iattrs; + unsigned int ia_valid = iattr->ia_valid; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + iattrs = &attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = kn->mode = mode; + } + return 0; +} + +/** + * kernfs_setattr - set iattr on a node + * @kn: target node + * @iattr: iattr to set + * + * Returns 0 on success, -errno on failure. + */ +int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + int ret; + + mutex_lock(&kernfs_mutex); + ret = __kernfs_setattr(kn, iattr); + mutex_unlock(&kernfs_mutex); + return ret; +} + +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + struct kernfs_node *kn = dentry->d_fsdata; + int error; + + if (!kn) + return -EINVAL; + + mutex_lock(&kernfs_mutex); + error = inode_change_ok(inode, iattr); + if (error) + goto out; + + error = __kernfs_setattr(kn, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(inode, iattr); + +out: + mutex_unlock(&kernfs_mutex); + return error; +} + +static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, + u32 *secdata_len) +{ + struct kernfs_iattrs *attrs; + void *old_secdata; + size_t old_secdata_len; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + old_secdata = attrs->ia_secdata; + old_secdata_len = attrs->ia_secdata_len; + + attrs->ia_secdata = *secdata; + attrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + void *secdata; + int error; + u32 secdata_len = 0; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(d_inode(dentry), suffix, + value, size, flags); + if (error) + return error; + error = security_inode_getsecctx(d_inode(dentry), + &secdata, &secdata_len); + if (error) + return error; + + mutex_lock(&kernfs_mutex); + error = kernfs_node_setsecdata(kn, &secdata, &secdata_len); + mutex_unlock(&kernfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + return error; + } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + return simple_xattr_set(&attrs->xattrs, name, value, size, + flags); + } + + return -EINVAL; +} + +int kernfs_iop_removexattr(struct dentry *dentry, const char *name) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_remove(&attrs->xattrs, name); +} + +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_get(&attrs->xattrs, name, buf, size); +} + +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_list(&attrs->xattrs, buf, size); +} + +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) +{ + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) +{ + struct kernfs_iattrs *attrs = kn->iattr; + + inode->i_mode = kn->mode; + if (attrs) { + /* + * kernfs_node has non-default attributes get them from + * persistent copy in kernfs_node. + */ + set_inode_attr(inode, &attrs->ia_iattr); + security_inode_notifysecctx(inode, attrs->ia_secdata, + attrs->ia_secdata_len); + } + + if (kernfs_type(kn) == KERNFS_DIR) + set_nlink(inode, kn->dir.subdirs + 2); +} + +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct inode *inode = d_inode(dentry); + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + +static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) +{ + kernfs_get(kn); + inode->i_private = kn; + inode->i_mapping->a_ops = &kernfs_aops; + inode->i_op = &kernfs_iops; + + set_default_inode_attr(inode, kn->mode); + kernfs_refresh_inode(kn, inode); + + /* initialize inode according to type */ + switch (kernfs_type(kn)) { + case KERNFS_DIR: + inode->i_op = &kernfs_dir_iops; + inode->i_fop = &kernfs_dir_fops; + if (kn->flags & KERNFS_EMPTY_DIR) + make_empty_dir_inode(inode); + break; + case KERNFS_FILE: + inode->i_size = kn->attr.size; + inode->i_fop = &kernfs_file_fops; + break; + case KERNFS_LINK: + inode->i_op = &kernfs_symlink_iops; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * kernfs_get_inode - get inode for kernfs_node + * @sb: super block + * @kn: kernfs_node to allocate inode for + * + * Get inode for @kn. If such inode doesn't exist, a new inode is + * allocated and basics are initialized. New inode is returned + * locked. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ + struct inode *inode; + + inode = iget_locked(sb, kn->ino); + if (inode && (inode->i_state & I_NEW)) + kernfs_init_inode(kn, inode); + + return inode; +} + +/* + * The kernfs_node serves as both an inode and a directory entry for + * kernfs. To prevent the kernfs inode numbers from being freed + * prematurely we take a reference to kernfs_node from the kernfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void kernfs_evict_inode(struct inode *inode) +{ + struct kernfs_node *kn = inode->i_private; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + kernfs_put(kn); +} + +int kernfs_iop_permission(struct inode *inode, int mask) +{ + struct kernfs_node *kn; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + kn = inode->i_private; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + return generic_permission(inode, mask); +} diff --git a/kernel/fs/kernfs/kernfs-internal.h b/kernel/fs/kernfs/kernfs-internal.h new file mode 100644 index 000000000..af9fa7499 --- /dev/null +++ b/kernel/fs/kernfs/kernfs-internal.h @@ -0,0 +1,119 @@ +/* + * fs/kernfs/kernfs-internal.h - kernfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ + +#ifndef __KERNFS_INTERNAL_H +#define __KERNFS_INTERNAL_H + +#include +#include +#include +#include + +#include + +struct kernfs_iattrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; + + struct simple_xattrs xattrs; +}; + +/* +1 to avoid triggering overflow warning when negating it */ +#define KN_DEACTIVATED_BIAS (INT_MIN + 1) + +/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ + +/** + * kernfs_root - find out the kernfs_root a kernfs_node belongs to + * @kn: kernfs_node of interest + * + * Return the kernfs_root @kn belongs to. + */ +static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) +{ + /* if parent exists, it's always a dir; otherwise, @sd is a dir */ + if (kn->parent) + kn = kn->parent; + return kn->dir.root; +} + +/* + * mount.c + */ +struct kernfs_super_info { + struct super_block *sb; + + /* + * The root associated with this super_block. Each super_block is + * identified by the root and ns it's associated with. + */ + struct kernfs_root *root; + + /* + * Each sb is associated with one namespace tag, currently the + * network namespace of the task which mounted this kernfs + * instance. If multiple tags become necessary, make the following + * an array and compare kernfs_node tag against every entry. + */ + const void *ns; + + /* anchored at kernfs_root->supers, protected by kernfs_mutex */ + struct list_head node; +}; +#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) + +extern const struct super_operations kernfs_sops; +extern struct kmem_cache *kernfs_node_cache; + +/* + * inode.c + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); +void kernfs_evict_inode(struct inode *inode); +int kernfs_iop_permission(struct inode *inode, int mask); +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int kernfs_iop_removexattr(struct dentry *dentry, const char *name); +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); + +/* + * dir.c + */ +extern struct mutex kernfs_mutex; +extern const struct dentry_operations kernfs_dops; +extern const struct file_operations kernfs_dir_fops; +extern const struct inode_operations kernfs_dir_iops; + +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); +void kernfs_put_active(struct kernfs_node *kn); +int kernfs_add_one(struct kernfs_node *kn); +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags); + +/* + * file.c + */ +extern const struct file_operations kernfs_file_fops; + +void kernfs_unmap_bin_file(struct kernfs_node *kn); + +/* + * symlink.c + */ +extern const struct inode_operations kernfs_symlink_iops; + +#endif /* __KERNFS_INTERNAL_H */ diff --git a/kernel/fs/kernfs/mount.c b/kernel/fs/kernfs/mount.c new file mode 100644 index 000000000..8eaf41718 --- /dev/null +++ b/kernel/fs/kernfs/mount.c @@ -0,0 +1,249 @@ +/* + * fs/kernfs/mount.c - kernfs mount implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include + +#include "kernfs-internal.h" + +struct kmem_cache *kernfs_node_cache; + +static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct kernfs_root *root = kernfs_info(sb)->root; + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->remount_fs) + return scops->remount_fs(root, flags, data); + return 0; +} + +static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_root *root = kernfs_root(dentry->d_fsdata); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_options) + return scops->show_options(sf, root); + return 0; +} + +const struct super_operations kernfs_sops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = kernfs_evict_inode, + + .remount_fs = kernfs_sop_remount_fs, + .show_options = kernfs_sop_show_options, +}; + +/** + * kernfs_root_from_sb - determine kernfs_root associated with a super_block + * @sb: the super_block in question + * + * Return the kernfs_root associated with @sb. If @sb is not a kernfs one, + * %NULL is returned. + */ +struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) +{ + if (sb->s_op == &kernfs_sops) + return kernfs_info(sb)->root; + return NULL; +} + +static int kernfs_fill_super(struct super_block *sb, unsigned long magic) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct dentry *root; + + info->sb = sb; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = magic; + sb->s_op = &kernfs_sops; + sb->s_time_gran = 1; + + /* get root inode, initialize and unlock it */ + mutex_lock(&kernfs_mutex); + inode = kernfs_get_inode(sb, info->root->kn); + mutex_unlock(&kernfs_mutex); + if (!inode) { + pr_debug("kernfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_make_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n", __func__); + return -ENOMEM; + } + kernfs_get(info->root->kn); + root->d_fsdata = info->root->kn; + sb->s_root = root; + sb->s_d_op = &kernfs_dops; + return 0; +} + +static int kernfs_test_super(struct super_block *sb, void *data) +{ + struct kernfs_super_info *sb_info = kernfs_info(sb); + struct kernfs_super_info *info = data; + + return sb_info->root == info->root && sb_info->ns == info->ns; +} + +static int kernfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + +/** + * kernfs_super_ns - determine the namespace tag of a kernfs super_block + * @sb: super_block of interest + * + * Return the namespace tag associated with kernfs super_block @sb. + */ +const void *kernfs_super_ns(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + + return info->ns; +} + +/** + * kernfs_mount_ns - kernfs mount helper + * @fs_type: file_system_type of the fs being mounted + * @flags: mount flags specified for the mount + * @root: kernfs_root of the hierarchy being mounted + * @magic: file system specific magic number + * @new_sb_created: tell the caller if we allocated a new superblock + * @ns: optional namespace tag of the mount + * + * This is to be called from each kernfs user's file_system_type->mount() + * implementation, which should pass through the specified @fs_type and + * @flags, and specify the hierarchy and namespace tag to mount via @root + * and @ns, respectively. + * + * The return value can be passed to the vfs layer verbatim. + */ +struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) +{ + struct super_block *sb; + struct kernfs_super_info *info; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->root = root; + info->ns = ns; + + sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + kfree(info); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (new_sb_created) + *new_sb_created = !sb->s_root; + + if (!sb->s_root) { + struct kernfs_super_info *info = kernfs_info(sb); + + error = kernfs_fill_super(sb, magic); + if (error) { + deactivate_locked_super(sb); + return ERR_PTR(error); + } + sb->s_flags |= MS_ACTIVE; + + mutex_lock(&kernfs_mutex); + list_add(&info->node, &root->supers); + mutex_unlock(&kernfs_mutex); + } + + return dget(sb->s_root); +} + +/** + * kernfs_kill_sb - kill_sb for kernfs + * @sb: super_block being killed + * + * This can be used directly for file_system_type->kill_sb(). If a kernfs + * user needs extra cleanup, it can implement its own kill_sb() and call + * this function at the end. + */ +void kernfs_kill_sb(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_node *root_kn = sb->s_root->d_fsdata; + + mutex_lock(&kernfs_mutex); + list_del(&info->node); + mutex_unlock(&kernfs_mutex); + + /* + * Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing kernfs_super_info. + */ + kill_anon_super(sb); + kfree(info); + kernfs_put(root_kn); +} + +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @kernfs_root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so the superblock won't be destroyed in subsequent + * operations. This can be used to block ->kill_sb() which may be useful + * for kernfs users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated to this kernfs_root, or + * -EINVAL if the superblock is being freed. + */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + +void __init kernfs_init(void) +{ + kernfs_node_cache = kmem_cache_create("kernfs_node_cache", + sizeof(struct kernfs_node), + 0, SLAB_PANIC, NULL); +} diff --git a/kernel/fs/kernfs/symlink.c b/kernel/fs/kernfs/symlink.c new file mode 100644 index 000000000..8a198898e --- /dev/null +++ b/kernel/fs/kernfs/symlink.c @@ -0,0 +1,147 @@ +/* + * fs/kernfs/symlink.c - kernfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ + +#include +#include +#include + +#include "kernfs-internal.h" + +/** + * kernfs_create_link - create a symlink + * @parent: directory to create the symlink in + * @name: name of the symlink + * @target: target node for the symlink to point to + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, + const char *name, + struct kernfs_node *target) +{ + struct kernfs_node *kn; + int error; + + kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); + if (!kn) + return ERR_PTR(-ENOMEM); + + if (kernfs_ns_enabled(parent)) + kn->ns = target->ns; + kn->symlink.target_kn = target; + kernfs_get(target); /* ref owned by symlink */ + + error = kernfs_add_one(kn); + if (!error) + return kn; + + kernfs_put(kn); + return ERR_PTR(error); +} + +static int kernfs_get_target_path(struct kernfs_node *parent, + struct kernfs_node *target, char *path) +{ + struct kernfs_node *base, *kn; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent; + while (base->parent) { + kn = target->parent; + while (kn->parent && base != kn) + kn = kn->parent; + + if (base == kn) + break; + + strcpy(s, "../"); + s += 3; + base = base->parent; + } + + /* determine end of target string for reverse fillup */ + kn = target; + while (kn->parent && kn != base) { + len += strlen(kn->name) + 1; + kn = kn->parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + kn = target; + while (kn->parent && kn != base) { + int slen = strlen(kn->name); + + len -= slen; + strncpy(s + len, kn->name, slen); + if (len) + s[--len] = '/'; + + kn = kn->parent; + } + + return 0; +} + +static int kernfs_getlink(struct dentry *dentry, char *path) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *parent = kn->parent; + struct kernfs_node *target = kn->symlink.target_kn; + int error; + + mutex_lock(&kernfs_mutex); + error = kernfs_get_target_path(parent, target, path); + mutex_unlock(&kernfs_mutex); + + return error; +} + +static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + if (page) { + error = kernfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } + nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); + return NULL; +} + +static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); +} + +const struct inode_operations kernfs_symlink_iops = { + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + .readlink = generic_readlink, + .follow_link = kernfs_iop_follow_link, + .put_link = kernfs_iop_put_link, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .permission = kernfs_iop_permission, +}; diff --git a/kernel/fs/libfs.c b/kernel/fs/libfs.c new file mode 100644 index 000000000..02813592e --- /dev/null +++ b/kernel/fs/libfs.c @@ -0,0 +1,1191 @@ +/* + * fs/libfs.c + * Library for filesystems writers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* sync_mapping_buffers */ + +#include + +#include "internal.h" + +static inline int simple_positive(struct dentry *dentry) +{ + return d_really_is_positive(dentry) && !d_unhashed(dentry); +} + +int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + generic_fillattr(inode, stat); + stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); + return 0; +} +EXPORT_SYMBOL(simple_getattr); + +int simple_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_namelen = NAME_MAX; + return 0; +} +EXPORT_SYMBOL(simple_statfs); + +/* + * Retaining negative dentries for an in-memory filesystem just wastes + * memory and lookup time: arrange for them to be deleted immediately. + */ +int always_delete_dentry(const struct dentry *dentry) +{ + return 1; +} +EXPORT_SYMBOL(always_delete_dentry); + +const struct dentry_operations simple_dentry_operations = { + .d_delete = always_delete_dentry, +}; +EXPORT_SYMBOL(simple_dentry_operations); + +/* + * Lookup the data. This is trivial - if the dentry didn't already + * exist, we know it is negative. Set d_op to delete negative dentries. + */ +struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + if (!dentry->d_sb->s_d_op) + d_set_d_op(dentry, &simple_dentry_operations); + d_add(dentry, NULL); + return NULL; +} +EXPORT_SYMBOL(simple_lookup); + +int dcache_dir_open(struct inode *inode, struct file *file) +{ + static struct qstr cursor_name = QSTR_INIT(".", 1); + + file->private_data = d_alloc(file->f_path.dentry, &cursor_name); + + return file->private_data ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(dcache_dir_open); + +int dcache_dir_close(struct inode *inode, struct file *file) +{ + dput(file->private_data); + return 0; +} +EXPORT_SYMBOL(dcache_dir_close); + +loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) +{ + struct dentry *dentry = file->f_path.dentry; + mutex_lock(&d_inode(dentry)->i_mutex); + switch (whence) { + case 1: + offset += file->f_pos; + case 0: + if (offset >= 0) + break; + default: + mutex_unlock(&d_inode(dentry)->i_mutex); + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + if (file->f_pos >= 2) { + struct list_head *p; + struct dentry *cursor = file->private_data; + loff_t n = file->f_pos - 2; + + spin_lock(&dentry->d_lock); + /* d_lock not required for cursor */ + list_del(&cursor->d_child); + p = dentry->d_subdirs.next; + while (n && p != &dentry->d_subdirs) { + struct dentry *next; + next = list_entry(p, struct dentry, d_child); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(next)) + n--; + spin_unlock(&next->d_lock); + p = p->next; + } + list_add_tail(&cursor->d_child, p); + spin_unlock(&dentry->d_lock); + } + } + mutex_unlock(&d_inode(dentry)->i_mutex); + return offset; +} +EXPORT_SYMBOL(dcache_dir_lseek); + +/* Relationship between i_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +/* + * Directory is locked and all positive dentries in it are safe, since + * for ramfs-type trees they can't go away without unlink() or rmdir(), + * both impossible due to the lock on directory. + */ + +int dcache_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct dentry *cursor = file->private_data; + struct list_head *p, *q = &cursor->d_child; + + if (!dir_emit_dots(file, ctx)) + return 0; + spin_lock(&dentry->d_lock); + if (ctx->pos == 2) + list_move(q, &dentry->d_subdirs); + + for (p = q->next; p != &dentry->d_subdirs; p = p->next) { + struct dentry *next = list_entry(p, struct dentry, d_child); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); + continue; + } + + spin_unlock(&next->d_lock); + spin_unlock(&dentry->d_lock); + if (!dir_emit(ctx, next->d_name.name, next->d_name.len, + d_inode(next)->i_ino, dt_type(d_inode(next)))) + return 0; + spin_lock(&dentry->d_lock); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + /* next is still alive */ + list_move(q, p); + spin_unlock(&next->d_lock); + p = q; + ctx->pos++; + } + spin_unlock(&dentry->d_lock); + return 0; +} +EXPORT_SYMBOL(dcache_readdir); + +ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) +{ + return -EISDIR; +} +EXPORT_SYMBOL(generic_read_dir); + +const struct file_operations simple_dir_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .llseek = dcache_dir_lseek, + .read = generic_read_dir, + .iterate = dcache_readdir, + .fsync = noop_fsync, +}; +EXPORT_SYMBOL(simple_dir_operations); + +const struct inode_operations simple_dir_inode_operations = { + .lookup = simple_lookup, +}; +EXPORT_SYMBOL(simple_dir_inode_operations); + +static const struct super_operations simple_super_operations = { + .statfs = simple_statfs, +}; + +/* + * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that + * will never be mountable) + */ +struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, + const struct super_operations *ops, + const struct dentry_operations *dops, unsigned long magic) +{ + struct super_block *s; + struct dentry *dentry; + struct inode *root; + struct qstr d_name = QSTR_INIT(name, strlen(name)); + + s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); + if (IS_ERR(s)) + return ERR_CAST(s); + + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = magic; + s->s_op = ops ? ops : &simple_super_operations; + s->s_time_gran = 1; + root = new_inode(s); + if (!root) + goto Enomem; + /* + * since this is the first inode, make it number 1. New inodes created + * after this must take care not to collide with it (by passing + * max_reserved of 1 to iunique). + */ + root->i_ino = 1; + root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; + root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; + dentry = __d_alloc(s, &d_name); + if (!dentry) { + iput(root); + goto Enomem; + } + d_instantiate(dentry, root); + s->s_root = dentry; + s->s_d_op = dops; + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); + +Enomem: + deactivate_locked_super(s); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(mount_pseudo); + +int simple_open(struct inode *inode, struct file *file) +{ + if (inode->i_private) + file->private_data = inode->i_private; + return 0; +} +EXPORT_SYMBOL(simple_open); + +int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inc_nlink(inode); + ihold(inode); + dget(dentry); + d_instantiate(dentry, inode); + return 0; +} +EXPORT_SYMBOL(simple_link); + +int simple_empty(struct dentry *dentry) +{ + struct dentry *child; + int ret = 0; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); + goto out; + } + spin_unlock(&child->d_lock); + } + ret = 1; +out: + spin_unlock(&dentry->d_lock); + return ret; +} +EXPORT_SYMBOL(simple_empty); + +int simple_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + drop_nlink(inode); + dput(dentry); + return 0; +} +EXPORT_SYMBOL(simple_unlink); + +int simple_rmdir(struct inode *dir, struct dentry *dentry) +{ + if (!simple_empty(dentry)) + return -ENOTEMPTY; + + drop_nlink(d_inode(dentry)); + simple_unlink(dir, dentry); + drop_nlink(dir); + return 0; +} +EXPORT_SYMBOL(simple_rmdir); + +int simple_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *inode = d_inode(old_dentry); + int they_are_dirs = d_is_dir(old_dentry); + + if (!simple_empty(new_dentry)) + return -ENOTEMPTY; + + if (d_really_is_positive(new_dentry)) { + simple_unlink(new_dir, new_dentry); + if (they_are_dirs) { + drop_nlink(d_inode(new_dentry)); + drop_nlink(old_dir); + } + } else if (they_are_dirs) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } + + old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = + new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; + + return 0; +} +EXPORT_SYMBOL(simple_rename); + +/** + * simple_setattr - setattr for simple filesystem + * @dentry: dentry + * @iattr: iattr structure + * + * Returns 0 on success, -error on failure. + * + * simple_setattr is a simple ->setattr implementation without a proper + * implementation of size changes. + * + * It can either be used for in-memory filesystems or special files + * on simple regular filesystems. Anything that needs to change on-disk + * or wire state on size changes needs its own setattr method. + */ +int simple_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) + truncate_setsize(inode, iattr->ia_size); + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + return 0; +} +EXPORT_SYMBOL(simple_setattr); + +int simple_readpage(struct file *file, struct page *page) +{ + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + return 0; +} +EXPORT_SYMBOL(simple_readpage); + +int simple_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *page; + pgoff_t index; + + index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + *pagep = page; + + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE); + } + return 0; +} +EXPORT_SYMBOL(simple_write_begin); + +/** + * simple_write_end - .write_end helper for non-block-device FSes + * @available: See .write_end of address_space_operations + * @file: " + * @mapping: " + * @pos: " + * @len: " + * @copied: " + * @page: " + * @fsdata: " + * + * simple_write_end does the minimum needed for updating a page after writing is + * done. It has the same API signature as the .write_end of + * address_space_operations vector. So it can just be set onto .write_end for + * FSes that don't need any other processing. i_mutex is assumed to be held. + * Block based filesystems should use generic_write_end(). + * NOTE: Even though i_size might get updated by this function, mark_inode_dirty + * is not called, so a filesystem that actually does store data in .write_inode + * should extend on what's done here with a call to mark_inode_dirty() in the + * case that i_size has changed. + */ +int simple_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} +EXPORT_SYMBOL(simple_write_end); + +/* + * the inodes created here are not hashed. If you use iunique to generate + * unique inode values later for this filesystem, then you must take care + * to pass it an appropriate max_reserved value to avoid collisions. + */ +int simple_fill_super(struct super_block *s, unsigned long magic, + struct tree_descr *files) +{ + struct inode *inode; + struct dentry *root; + struct dentry *dentry; + int i; + + s->s_blocksize = PAGE_CACHE_SIZE; + s->s_blocksize_bits = PAGE_CACHE_SHIFT; + s->s_magic = magic; + s->s_op = &simple_super_operations; + s->s_time_gran = 1; + + inode = new_inode(s); + if (!inode) + return -ENOMEM; + /* + * because the root inode is 1, the files array must not contain an + * entry at index 1 + */ + inode->i_ino = 1; + inode->i_mode = S_IFDIR | 0755; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + root = d_make_root(inode); + if (!root) + return -ENOMEM; + for (i = 0; !files->name || files->name[0]; i++, files++) { + if (!files->name) + continue; + + /* warn if it tries to conflict with the root inode */ + if (unlikely(i == 1)) + printk(KERN_WARNING "%s: %s passed in a files array" + "with an index of 1!\n", __func__, + s->s_type->name); + + dentry = d_alloc_name(root, files->name); + if (!dentry) + goto out; + inode = new_inode(s); + if (!inode) { + dput(dentry); + goto out; + } + inode->i_mode = S_IFREG | files->mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_fop = files->ops; + inode->i_ino = i; + d_add(dentry, inode); + } + s->s_root = root; + return 0; +out: + d_genocide(root); + shrink_dcache_parent(root); + dput(root); + return -ENOMEM; +} +EXPORT_SYMBOL(simple_fill_super); + +static DEFINE_SPINLOCK(pin_fs_lock); + +int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) +{ + struct vfsmount *mnt = NULL; + spin_lock(&pin_fs_lock); + if (unlikely(!*mount)) { + spin_unlock(&pin_fs_lock); + mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + spin_lock(&pin_fs_lock); + if (!*mount) + *mount = mnt; + } + mntget(*mount); + ++*count; + spin_unlock(&pin_fs_lock); + mntput(mnt); + return 0; +} +EXPORT_SYMBOL(simple_pin_fs); + +void simple_release_fs(struct vfsmount **mount, int *count) +{ + struct vfsmount *mnt; + spin_lock(&pin_fs_lock); + mnt = *mount; + if (!--*count) + *mount = NULL; + spin_unlock(&pin_fs_lock); + mntput(mnt); +} +EXPORT_SYMBOL(simple_release_fs); + +/** + * simple_read_from_buffer - copy data from the buffer to user space + * @to: the user space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The simple_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the user space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + size_t ret; + + if (pos < 0) + return -EINVAL; + if (pos >= available || !count) + return 0; + if (count > available - pos) + count = available - pos; + ret = copy_to_user(to, from + pos, count); + if (ret == count) + return -EFAULT; + count -= ret; + *ppos = pos + count; + return count; +} +EXPORT_SYMBOL(simple_read_from_buffer); + +/** + * simple_write_to_buffer - copy data from user space to the buffer + * @to: the buffer to write to + * @available: the size of the buffer + * @ppos: the current position in the buffer + * @from: the user space buffer to read from + * @count: the maximum number of bytes to read + * + * The simple_write_to_buffer() function reads up to @count bytes from the user + * space address starting at @from into the buffer @to at offset @ppos. + * + * On success, the number of bytes written is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, + const void __user *from, size_t count) +{ + loff_t pos = *ppos; + size_t res; + + if (pos < 0) + return -EINVAL; + if (pos >= available || !count) + return 0; + if (count > available - pos) + count = available - pos; + res = copy_from_user(to + pos, from, count); + if (res == count) + return -EFAULT; + count -= res; + *ppos = pos + count; + return count; +} +EXPORT_SYMBOL(simple_write_to_buffer); + +/** + * memory_read_from_buffer - copy data from the buffer + * @to: the kernel space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The memory_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the kernel space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + + if (pos < 0) + return -EINVAL; + if (pos >= available) + return 0; + if (count > available - pos) + count = available - pos; + memcpy(to, from + pos, count); + *ppos = pos + count; + + return count; +} +EXPORT_SYMBOL(memory_read_from_buffer); + +/* + * Transaction based IO. + * The file expects a single write which triggers the transaction, and then + * possibly a read which collects the result - which is stored in a + * file-local buffer. + */ + +void simple_transaction_set(struct file *file, size_t n) +{ + struct simple_transaction_argresp *ar = file->private_data; + + BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); + + /* + * The barrier ensures that ar->size will really remain zero until + * ar->data is ready for reading. + */ + smp_mb(); + ar->size = n; +} +EXPORT_SYMBOL(simple_transaction_set); + +char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) +{ + struct simple_transaction_argresp *ar; + static DEFINE_SPINLOCK(simple_transaction_lock); + + if (size > SIMPLE_TRANSACTION_LIMIT - 1) + return ERR_PTR(-EFBIG); + + ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); + if (!ar) + return ERR_PTR(-ENOMEM); + + spin_lock(&simple_transaction_lock); + + /* only one write allowed per open */ + if (file->private_data) { + spin_unlock(&simple_transaction_lock); + free_page((unsigned long)ar); + return ERR_PTR(-EBUSY); + } + + file->private_data = ar; + + spin_unlock(&simple_transaction_lock); + + if (copy_from_user(ar->data, buf, size)) + return ERR_PTR(-EFAULT); + + return ar->data; +} +EXPORT_SYMBOL(simple_transaction_get); + +ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) +{ + struct simple_transaction_argresp *ar = file->private_data; + + if (!ar) + return 0; + return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); +} +EXPORT_SYMBOL(simple_transaction_read); + +int simple_transaction_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} +EXPORT_SYMBOL(simple_transaction_release); + +/* Simple attribute files */ + +struct simple_attr { + int (*get)(void *, u64 *); + int (*set)(void *, u64); + char get_buf[24]; /* enough to store a u64 and "\n\0" */ + char set_buf[24]; + void *data; + const char *fmt; /* format for read operation */ + struct mutex mutex; /* protects access to these buffers */ +}; + +/* simple_attr_open is called by an actual attribute open file operation + * to set the attribute specific access operations. */ +int simple_attr_open(struct inode *inode, struct file *file, + int (*get)(void *, u64 *), int (*set)(void *, u64), + const char *fmt) +{ + struct simple_attr *attr; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + attr->get = get; + attr->set = set; + attr->data = inode->i_private; + attr->fmt = fmt; + mutex_init(&attr->mutex); + + file->private_data = attr; + + return nonseekable_open(inode, file); +} +EXPORT_SYMBOL_GPL(simple_attr_open); + +int simple_attr_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} +EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? */ + +/* read from the buffer that is filled with the get function */ +ssize_t simple_attr_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct simple_attr *attr; + size_t size; + ssize_t ret; + + attr = file->private_data; + + if (!attr->get) + return -EACCES; + + ret = mutex_lock_interruptible(&attr->mutex); + if (ret) + return ret; + + if (*ppos) { /* continued read */ + size = strlen(attr->get_buf); + } else { /* first read */ + u64 val; + ret = attr->get(attr->data, &val); + if (ret) + goto out; + + size = scnprintf(attr->get_buf, sizeof(attr->get_buf), + attr->fmt, (unsigned long long)val); + } + + ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); +out: + mutex_unlock(&attr->mutex); + return ret; +} +EXPORT_SYMBOL_GPL(simple_attr_read); + +/* interpret the buffer as a number to call the set function with */ +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct simple_attr *attr; + u64 val; + size_t size; + ssize_t ret; + + attr = file->private_data; + if (!attr->set) + return -EACCES; + + ret = mutex_lock_interruptible(&attr->mutex); + if (ret) + return ret; + + ret = -EFAULT; + size = min(sizeof(attr->set_buf) - 1, len); + if (copy_from_user(attr->set_buf, buf, size)) + goto out; + + attr->set_buf[size] = '\0'; + val = simple_strtoll(attr->set_buf, NULL, 0); + ret = attr->set(attr->data, val); + if (ret == 0) + ret = len; /* on success, claim we got the whole input */ +out: + mutex_unlock(&attr->mutex); + return ret; +} +EXPORT_SYMBOL_GPL(simple_attr_write); + +/** + * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation + * @sb: filesystem to do the file handle conversion on + * @fid: file handle to convert + * @fh_len: length of the file handle in bytes + * @fh_type: type of file handle + * @get_inode: filesystem callback to retrieve inode + * + * This function decodes @fid as long as it has one of the well-known + * Linux filehandle types and calls @get_inode on it to retrieve the + * inode for the object specified in the file handle. + */ +struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type, struct inode *(*get_inode) + (struct super_block *sb, u64 ino, u32 gen)) +{ + struct inode *inode = NULL; + + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_INO32_GEN: + case FILEID_INO32_GEN_PARENT: + inode = get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + } + + return d_obtain_alias(inode); +} +EXPORT_SYMBOL_GPL(generic_fh_to_dentry); + +/** + * generic_fh_to_parent - generic helper for the fh_to_parent export operation + * @sb: filesystem to do the file handle conversion on + * @fid: file handle to convert + * @fh_len: length of the file handle in bytes + * @fh_type: type of file handle + * @get_inode: filesystem callback to retrieve inode + * + * This function decodes @fid as long as it has one of the well-known + * Linux filehandle types and calls @get_inode on it to retrieve the + * inode for the _parent_ object specified in the file handle if it + * is specified in the file handle, or NULL otherwise. + */ +struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type, struct inode *(*get_inode) + (struct super_block *sb, u64 ino, u32 gen)) +{ + struct inode *inode = NULL; + + if (fh_len <= 2) + return NULL; + + switch (fh_type) { + case FILEID_INO32_GEN_PARENT: + inode = get_inode(sb, fid->i32.parent_ino, + (fh_len > 3 ? fid->i32.parent_gen : 0)); + break; + } + + return d_obtain_alias(inode); +} +EXPORT_SYMBOL_GPL(generic_fh_to_parent); + +/** + * __generic_file_fsync - generic fsync implementation for simple filesystems + * + * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) + * @datasync: only synchronize essential metadata if true + * + * This is a generic implementation of the fsync method for simple + * filesystems which track all non-inode metadata in the buffers list + * hanging off the address_space structure. + */ +int __generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + int err; + int ret; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY_ALL)) + goto out; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + + err = sync_inode_metadata(inode, 1); + if (ret == 0) + ret = err; + +out: + mutex_unlock(&inode->i_mutex); + return ret; +} +EXPORT_SYMBOL(__generic_file_fsync); + +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * with flush + * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) + * @datasync: only synchronize essential metadata if true + * + */ + +int generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + int err; + + err = __generic_file_fsync(file, start, end, datasync); + if (err) + return err; + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); +} +EXPORT_SYMBOL(generic_file_fsync); + +/** + * generic_check_addressable - Check addressability of file system + * @blocksize_bits: log of file system block size + * @num_blocks: number of blocks in file system + * + * Determine whether a file system with @num_blocks blocks (and a + * block size of 2**@blocksize_bits) is addressable by the sector_t + * and page cache of the system. Return 0 if so and -EFBIG otherwise. + */ +int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) +{ + u64 last_fs_block = num_blocks - 1; + u64 last_fs_page = + last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); + + if (unlikely(num_blocks == 0)) + return 0; + + if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) + return -EINVAL; + + if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || + (last_fs_page > (pgoff_t)(~0ULL))) { + return -EFBIG; + } + return 0; +} +EXPORT_SYMBOL(generic_check_addressable); + +/* + * No-op implementation of ->fsync for in-memory filesystems. + */ +int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + return 0; +} +EXPORT_SYMBOL(noop_fsync); + +void kfree_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} +EXPORT_SYMBOL(kfree_put_link); + +/* + * nop .set_page_dirty method so that people can use .page_mkwrite on + * anon inodes. + */ +static int anon_set_page_dirty(struct page *page) +{ + return 0; +}; + +/* + * A single inode exists for all anon_inode files. Contrary to pipes, + * anon_inode inodes have no associated per-instance data, so we need + * only allocate one of them. + */ +struct inode *alloc_anon_inode(struct super_block *s) +{ + static const struct address_space_operations anon_aops = { + .set_page_dirty = anon_set_page_dirty, + }; + struct inode *inode = new_inode_pseudo(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = get_next_ino(); + inode->i_mapping->a_ops = &anon_aops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because mark_inode_dirty() will think + * that it already _is_ on the dirty list. + */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_flags |= S_PRIVATE; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + return inode; +} +EXPORT_SYMBOL(alloc_anon_inode); + +/** + * simple_nosetlease - generic helper for prohibiting leases + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: new lease supplied for insertion + * @priv: private data for lm_setup operation + * + * Generic helper for filesystems that do not wish to allow leases to be set. + * All arguments are ignored and it just returns -EINVAL. + */ +int +simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, + void **priv) +{ + return -EINVAL; +} +EXPORT_SYMBOL(simple_nosetlease); + + +/* + * Operations for a permanently empty directory. + */ +static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return ERR_PTR(-ENOENT); +} + +static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + generic_fillattr(inode, stat); + return 0; +} + +static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) +{ + return -EPERM; +} + +static int empty_dir_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + return -EOPNOTSUPP; +} + +static int empty_dir_removexattr(struct dentry *dentry, const char *name) +{ + return -EOPNOTSUPP; +} + +static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) +{ + return -EOPNOTSUPP; +} + +static const struct inode_operations empty_dir_inode_operations = { + .lookup = empty_dir_lookup, + .permission = generic_permission, + .setattr = empty_dir_setattr, + .getattr = empty_dir_getattr, + .setxattr = empty_dir_setxattr, + .getxattr = empty_dir_getxattr, + .removexattr = empty_dir_removexattr, + .listxattr = empty_dir_listxattr, +}; + +static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) +{ + /* An empty directory has two entries . and .. at offsets 0 and 1 */ + return generic_file_llseek_size(file, offset, whence, 2, 2); +} + +static int empty_dir_readdir(struct file *file, struct dir_context *ctx) +{ + dir_emit_dots(file, ctx); + return 0; +} + +static const struct file_operations empty_dir_operations = { + .llseek = empty_dir_llseek, + .read = generic_read_dir, + .iterate = empty_dir_readdir, + .fsync = noop_fsync, +}; + + +void make_empty_dir_inode(struct inode *inode) +{ + set_nlink(inode, 2); + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_rdev = 0; + inode->i_size = 2; + inode->i_blkbits = PAGE_SHIFT; + inode->i_blocks = 0; + + inode->i_op = &empty_dir_inode_operations; + inode->i_fop = &empty_dir_operations; +} + +bool is_empty_dir_inode(struct inode *inode) +{ + return (inode->i_fop == &empty_dir_operations) && + (inode->i_op == &empty_dir_inode_operations); +} diff --git a/kernel/fs/lockd/Makefile b/kernel/fs/lockd/Makefile new file mode 100644 index 000000000..9b320cc2a --- /dev/null +++ b/kernel/fs/lockd/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the linux lock manager stuff +# + +obj-$(CONFIG_LOCKD) += lockd.o + +lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ + svcshare.o svcproc.o svcsubs.o mon.o xdr.o +lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o +lockd-objs-$(CONFIG_PROC_FS) += procfs.o +lockd-objs := $(lockd-objs-y) diff --git a/kernel/fs/lockd/clnt4xdr.c b/kernel/fs/lockd/clnt4xdr.c new file mode 100644 index 000000000..d3e40db28 --- /dev/null +++ b/kernel/fs/lockd/clnt4xdr.c @@ -0,0 +1,599 @@ +/* + * linux/fs/lockd/clnt4xdr.c + * + * XDR functions to encode/decode NLM version 4 RPC arguments and results. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN) +# error "NLM host name cannot be larger than NLM's maximum string length!" +#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM4_void_sz (0) +#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2)) +#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz) +#define NLM4_holder_sz (6+NLM4_owner_sz) + +#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz) +#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz) +#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz) +#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz) + +#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz) +#define NLM4_res_sz (NLM4_cookie_sz+1) +#define NLM4_norep_sz (0) + + +static s64 loff_t_to_s64(loff_t offset) +{ + s64 res; + + if (offset >= NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset <= -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm4_compute_offsets(const struct nlm_lock *lock, + u64 *l_offset, u64 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + *l_offset = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv4 basic data types + * + * Basic NLMv4 data types are defined in Appendix II, section 6.1.4 + * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter + * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + encode_netobj(xdr, (u8 *)&fh->data, fh->size); +} + +/* + * enum nlm4_stats { + * NLM4_GRANTED = 0, + * NLM4_DENIED = 1, + * NLM4_DENIED_NOLOCKS = 2, + * NLM4_BLOCKED = 3, + * NLM4_DENIED_GRACE_PERIOD = 4, + * NLM4_DEADLCK = 5, + * NLM4_ROFS = 6, + * NLM4_STALE_FH = 7, + * NLM4_FBIG = 8, + * NLM4_FAILED = 9 + * }; + * + * struct nlm4_stat { + * nlm4_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. + */ +static void encode_nlm4_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_FAILED); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(ntohl(*p) > ntohl(nlm4_failed))) + goto out_bad_xdr; + *stat = *p; + return 0; +out_bad_xdr: + dprintk("%s: server returned invalid nlm4_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm4_holder { + * bool exclusive; + * int32 svid; + * netobj oh; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u64 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + +static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u64 l_offset, l_len; + u32 exclusive; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 8 + 8); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + p = xdr_decode_hyper(p, &l_offset); + xdr_decode_hyper(p, &l_len); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm4_lock { + * string caller_name; + * netobj fh; + * netobj oh; + * int32 svid; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u64 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 8 + 8); + *p++ = cpu_to_be32(lock->svid); + + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + + +/* + * NLMv4 XDR encode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". + */ + +/* + * struct nlm4_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm4_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_unlockargs { + * netobj cookie; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static void nlm4_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); +} + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static void nlm4_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); + if (result->status == nlm_lck_denied) + encode_nlm4_holder(xdr, result); +} + + +/* + * NLMv4 XDR decode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". + */ + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static int decode_nlm4_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm4_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm4_holder(xdr, result); +out: + return error; +} + +static int nlm4_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static int nlm4_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm4_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \ + .p_arglen = NLM4_##argtype##_sz, \ + .p_replen = NLM4_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm4_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +const struct rpc_version nlm_version4 = { + .number = 4, + .nrprocs = ARRAY_SIZE(nlm4_procedures), + .procs = nlm4_procedures, +}; diff --git a/kernel/fs/lockd/clntlock.c b/kernel/fs/lockd/clntlock.c new file mode 100644 index 000000000..41e491b8e --- /dev/null +++ b/kernel/fs/lockd/clntlock.c @@ -0,0 +1,301 @@ +/* + * linux/fs/lockd/clntlock.c + * + * Lock handling for the client side NLM implementation + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +/* + * Local function prototypes + */ +static int reclaimer(void *ptr); + +/* + * The following functions handle blocking and granting from the + * client perspective. + */ + +/* + * This is the representation of a blocked client lock. + */ +struct nlm_wait { + struct list_head b_list; /* linked list */ + wait_queue_head_t b_wait; /* where to wait on */ + struct nlm_host * b_host; + struct file_lock * b_lock; /* local file lock */ + unsigned short b_reclaim; /* got to reclaim lock */ + __be32 b_status; /* grant callback status */ +}; + +static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); + +/** + * nlmclnt_init - Set up per-NFS mount point lockd data structures + * @nlm_init: pointer to arguments structure + * + * Returns pointer to an appropriate nlm_host struct, + * or an ERR_PTR value. + */ +struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) +{ + struct nlm_host *host; + u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; + int status; + + status = lockd_up(nlm_init->net); + if (status < 0) + return ERR_PTR(status); + + host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, + nlm_init->protocol, nlm_version, + nlm_init->hostname, nlm_init->noresvport, + nlm_init->net); + if (host == NULL) + goto out_nohost; + if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) + goto out_nobind; + + return host; +out_nobind: + nlmclnt_release_host(host); +out_nohost: + lockd_down(nlm_init->net); + return ERR_PTR(-ENOLCK); +} +EXPORT_SYMBOL_GPL(nlmclnt_init); + +/** + * nlmclnt_done - Release resources allocated by nlmclnt_init() + * @host: nlm_host structure reserved by nlmclnt_init() + * + */ +void nlmclnt_done(struct nlm_host *host) +{ + struct net *net = host->net; + + nlmclnt_release_host(host); + lockd_down(net); +} +EXPORT_SYMBOL_GPL(nlmclnt_done); + +/* + * Queue up a lock for blocking so that the GRANTED request can see it + */ +struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl) +{ + struct nlm_wait *block; + + block = kmalloc(sizeof(*block), GFP_KERNEL); + if (block != NULL) { + block->b_host = host; + block->b_lock = fl; + init_waitqueue_head(&block->b_wait); + block->b_status = nlm_lck_blocked; + + spin_lock(&nlm_blocked_lock); + list_add(&block->b_list, &nlm_blocked); + spin_unlock(&nlm_blocked_lock); + } + return block; +} + +void nlmclnt_finish_block(struct nlm_wait *block) +{ + if (block == NULL) + return; + spin_lock(&nlm_blocked_lock); + list_del(&block->b_list); + spin_unlock(&nlm_blocked_lock); + kfree(block); +} + +/* + * Block on a lock + */ +int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) +{ + long ret; + + /* A borken server might ask us to block even if we didn't + * request it. Just say no! + */ + if (block == NULL) + return -EAGAIN; + + /* Go to sleep waiting for GRANT callback. Some servers seem + * to lose callbacks, however, so we're going to poll from + * time to time just to make sure. + * + * For now, the retry frequency is pretty high; normally + * a 1 minute timeout would do. See the comment before + * nlmclnt_lock for an explanation. + */ + ret = wait_event_interruptible_timeout(block->b_wait, + block->b_status != nlm_lck_blocked, + timeout); + if (ret < 0) + return -ERESTARTSYS; + /* Reset the lock status after a server reboot so we resend */ + if (block->b_status == nlm_lck_denied_grace_period) + block->b_status = nlm_lck_blocked; + req->a_res.status = block->b_status; + return 0; +} + +/* + * The server lockd has called us back to tell us the lock was granted + */ +__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) +{ + const struct file_lock *fl = &lock->fl; + const struct nfs_fh *fh = &lock->fh; + struct nlm_wait *block; + __be32 res = nlm_lck_denied; + + /* + * Look up blocked request based on arguments. + * Warning: must not use cookie to match it! + */ + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + struct file_lock *fl_blocked = block->b_lock; + + if (fl_blocked->fl_start != fl->fl_start) + continue; + if (fl_blocked->fl_end != fl->fl_end) + continue; + /* + * Careful! The NLM server will return the 32-bit "pid" that + * we put on the wire: in this case the lockowner "pid". + */ + if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) + continue; + if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) + continue; + if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)) ,fh) != 0) + continue; + /* Alright, we found a lock. Set the return status + * and wake up the caller + */ + block->b_status = nlm_granted; + wake_up(&block->b_wait); + res = nlm_granted; + } + spin_unlock(&nlm_blocked_lock); + return res; +} + +/* + * The following procedures deal with the recovery of locks after a + * server crash. + */ + +/* + * Reclaim all locks on server host. We do this by spawning a separate + * reclaimer thread. + */ +void +nlmclnt_recovery(struct nlm_host *host) +{ + struct task_struct *task; + + if (!host->h_reclaiming++) { + nlm_get_host(host); + task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name); + if (IS_ERR(task)) + printk(KERN_ERR "lockd: unable to spawn reclaimer " + "thread. Locks for %s won't be reclaimed! " + "(%ld)\n", host->h_name, PTR_ERR(task)); + } +} + +static int +reclaimer(void *ptr) +{ + struct nlm_host *host = (struct nlm_host *) ptr; + struct nlm_wait *block; + struct nlm_rqst *req; + struct file_lock *fl, *next; + u32 nsmstate; + struct net *net = host->net; + + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) { + printk(KERN_ERR "lockd: reclaimer unable to alloc memory." + " Locks for %s won't be reclaimed!\n", + host->h_name); + return 0; + } + + allow_signal(SIGKILL); + + down_write(&host->h_rwsem); + lockd_up(net); /* note: this cannot fail as lockd is already running */ + + dprintk("lockd: reclaiming locks for host %s\n", host->h_name); + +restart: + nsmstate = host->h_nsmstate; + + /* Force a portmap getport - the peer's lockd will + * most likely end up on a different port. + */ + host->h_nextrebind = jiffies; + nlm_rebind_host(host); + + /* First, reclaim all locks that have been granted. */ + list_splice_init(&host->h_granted, &host->h_reclaim); + list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { + list_del_init(&fl->fl_u.nfs_fl.list); + + /* + * sending this thread a SIGKILL will result in any unreclaimed + * locks being removed from the h_granted list. This means that + * the kernel will not attempt to reclaim them again if a new + * reclaimer thread is spawned for this host. + */ + if (signalled()) + continue; + if (nlmclnt_reclaim(host, fl, req) != 0) + continue; + list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); + if (host->h_nsmstate != nsmstate) { + /* Argh! The server rebooted again! */ + goto restart; + } + } + + host->h_reclaiming = 0; + up_write(&host->h_rwsem); + dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); + + /* Now, wake up all processes that sleep on a blocked lock */ + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (block->b_host == host) { + block->b_status = nlm_lck_denied_grace_period; + wake_up(&block->b_wait); + } + } + spin_unlock(&nlm_blocked_lock); + + /* Release host handle after use */ + nlmclnt_release_host(host); + lockd_down(net); + kfree(req); + return 0; +} diff --git a/kernel/fs/lockd/clntproc.c b/kernel/fs/lockd/clntproc.c new file mode 100644 index 000000000..acd394716 --- /dev/null +++ b/kernel/fs/lockd/clntproc.c @@ -0,0 +1,849 @@ +/* + * linux/fs/lockd/clntproc.c + * + * RPC procedures for the client side NLM implementation + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT +#define NLMCLNT_GRACE_WAIT (5*HZ) +#define NLMCLNT_POLL_TIMEOUT (30*HZ) +#define NLMCLNT_MAX_RETRIES 3 + +static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); +static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); +static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); +static int nlm_stat_to_errno(__be32 stat); +static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); +static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *); + +static const struct rpc_call_ops nlmclnt_unlock_ops; +static const struct rpc_call_ops nlmclnt_cancel_ops; + +/* + * Cookie counter for NLM requests + */ +static atomic_t nlm_cookie = ATOMIC_INIT(0x1234); + +void nlmclnt_next_cookie(struct nlm_cookie *c) +{ + u32 cookie = atomic_inc_return(&nlm_cookie); + + memcpy(c->data, &cookie, 4); + c->len=4; +} + +static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner) +{ + atomic_inc(&lockowner->count); + return lockowner; +} + +static void nlm_put_lockowner(struct nlm_lockowner *lockowner) +{ + if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) + return; + list_del(&lockowner->list); + spin_unlock(&lockowner->host->h_lock); + nlmclnt_release_host(lockowner->host); + kfree(lockowner); +} + +static inline int nlm_pidbusy(struct nlm_host *host, uint32_t pid) +{ + struct nlm_lockowner *lockowner; + list_for_each_entry(lockowner, &host->h_lockowners, list) { + if (lockowner->pid == pid) + return -EBUSY; + } + return 0; +} + +static inline uint32_t __nlm_alloc_pid(struct nlm_host *host) +{ + uint32_t res; + do { + res = host->h_pidcount++; + } while (nlm_pidbusy(host, res) < 0); + return res; +} + +static struct nlm_lockowner *__nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner) +{ + struct nlm_lockowner *lockowner; + list_for_each_entry(lockowner, &host->h_lockowners, list) { + if (lockowner->owner != owner) + continue; + return nlm_get_lockowner(lockowner); + } + return NULL; +} + +static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner) +{ + struct nlm_lockowner *res, *new = NULL; + + spin_lock(&host->h_lock); + res = __nlm_find_lockowner(host, owner); + if (res == NULL) { + spin_unlock(&host->h_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + spin_lock(&host->h_lock); + res = __nlm_find_lockowner(host, owner); + if (res == NULL && new != NULL) { + res = new; + atomic_set(&new->count, 1); + new->owner = owner; + new->pid = __nlm_alloc_pid(host); + new->host = nlm_get_host(host); + list_add(&new->list, &host->h_lockowners); + new = NULL; + } + } + spin_unlock(&host->h_lock); + kfree(new); + return res; +} + +/* + * Initialize arguments for TEST/LOCK/UNLOCK/CANCEL calls + */ +static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) +{ + struct nlm_args *argp = &req->a_args; + struct nlm_lock *lock = &argp->lock; + char *nodename = req->a_host->h_rpcclnt->cl_nodename; + + nlmclnt_next_cookie(&argp->cookie); + memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); + lock->caller = nodename; + lock->oh.data = req->a_owner; + lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", + (unsigned int)fl->fl_u.nfs_fl.owner->pid, + nodename); + lock->svid = fl->fl_u.nfs_fl.owner->pid; + lock->fl.fl_start = fl->fl_start; + lock->fl.fl_end = fl->fl_end; + lock->fl.fl_type = fl->fl_type; +} + +static void nlmclnt_release_lockargs(struct nlm_rqst *req) +{ + WARN_ON_ONCE(req->a_args.lock.fl.fl_ops != NULL); +} + +/** + * nlmclnt_proc - Perform a single client-side lock request + * @host: address of a valid nlm_host context representing the NLM server + * @cmd: fcntl-style file lock operation to perform + * @fl: address of arguments for the lock operation + * + */ +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +{ + struct nlm_rqst *call; + int status; + + call = nlm_alloc_call(host); + if (call == NULL) + return -ENOMEM; + + nlmclnt_locks_init_private(fl, host); + if (!fl->fl_u.nfs_fl.owner) { + /* lockowner allocation has failed */ + nlmclnt_release_call(call); + return -ENOMEM; + } + /* Set up the argument struct */ + nlmclnt_setlockargs(call, fl); + + if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { + if (fl->fl_type != F_UNLCK) { + call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; + status = nlmclnt_lock(call, fl); + } else + status = nlmclnt_unlock(call, fl); + } else if (IS_GETLK(cmd)) + status = nlmclnt_test(call, fl); + else + status = -EINVAL; + fl->fl_ops->fl_release_private(fl); + fl->fl_ops = NULL; + + dprintk("lockd: clnt proc returns %d\n", status); + return status; +} +EXPORT_SYMBOL_GPL(nlmclnt_proc); + +/* + * Allocate an NLM RPC call struct + */ +struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) +{ + struct nlm_rqst *call; + + for(;;) { + call = kzalloc(sizeof(*call), GFP_KERNEL); + if (call != NULL) { + atomic_set(&call->a_count, 1); + locks_init_lock(&call->a_args.lock.fl); + locks_init_lock(&call->a_res.lock.fl); + call->a_host = nlm_get_host(host); + return call; + } + if (signalled()) + break; + printk("nlm_alloc_call: failed, waiting for memory\n"); + schedule_timeout_interruptible(5*HZ); + } + return NULL; +} + +void nlmclnt_release_call(struct nlm_rqst *call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlmclnt_release_host(call->a_host); + nlmclnt_release_lockargs(call); + kfree(call); +} + +static void nlmclnt_rpc_release(void *data) +{ + nlmclnt_release_call(data); +} + +static int nlm_wait_on_grace(wait_queue_head_t *queue) +{ + DEFINE_WAIT(wait); + int status = -EINTR; + + prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE); + if (!signalled ()) { + schedule_timeout(NLMCLNT_GRACE_WAIT); + try_to_freeze(); + if (!signalled ()) + status = 0; + } + finish_wait(queue, &wait); + return status; +} + +/* + * Generic NLM call + */ +static int +nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc) +{ + struct nlm_host *host = req->a_host; + struct rpc_clnt *clnt; + struct nlm_args *argp = &req->a_args; + struct nlm_res *resp = &req->a_res; + struct rpc_message msg = { + .rpc_argp = argp, + .rpc_resp = resp, + .rpc_cred = cred, + }; + int status; + + dprintk("lockd: call procedure %d on %s\n", + (int)proc, host->h_name); + + do { + if (host->h_reclaiming && !argp->reclaim) + goto in_grace_period; + + /* If we have no RPC client yet, create one. */ + if ((clnt = nlm_bind_host(host)) == NULL) + return -ENOLCK; + msg.rpc_proc = &clnt->cl_procinfo[proc]; + + /* Perform the RPC call. If an error occurs, try again */ + if ((status = rpc_call_sync(clnt, &msg, 0)) < 0) { + dprintk("lockd: rpc_call returned error %d\n", -status); + switch (status) { + case -EPROTONOSUPPORT: + status = -EINVAL; + break; + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ENOTCONN: + nlm_rebind_host(host); + status = -EAGAIN; + break; + case -ERESTARTSYS: + return signalled () ? -EINTR : status; + default: + break; + } + break; + } else + if (resp->status == nlm_lck_denied_grace_period) { + dprintk("lockd: server in grace period\n"); + if (argp->reclaim) { + printk(KERN_WARNING + "lockd: spurious grace period reject?!\n"); + return -ENOLCK; + } + } else { + if (!argp->reclaim) { + /* We appear to be out of the grace period */ + wake_up_all(&host->h_gracewait); + } + dprintk("lockd: server returns status %d\n", + ntohl(resp->status)); + return 0; /* Okay, call complete */ + } + +in_grace_period: + /* + * The server has rebooted and appears to be in the grace + * period during which locks are only allowed to be + * reclaimed. + * We can only back off and try again later. + */ + status = nlm_wait_on_grace(&host->h_gracewait); + } while (status == 0); + + return status; +} + +/* + * Generic NLM call, async version. + */ +static struct rpc_task *__nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +{ + struct nlm_host *host = req->a_host; + struct rpc_clnt *clnt; + struct rpc_task_setup task_setup_data = { + .rpc_message = msg, + .callback_ops = tk_ops, + .callback_data = req, + .flags = RPC_TASK_ASYNC, + }; + + dprintk("lockd: call procedure %d on %s (async)\n", + (int)proc, host->h_name); + + /* If we have no RPC client yet, create one. */ + clnt = nlm_bind_host(host); + if (clnt == NULL) + goto out_err; + msg->rpc_proc = &clnt->cl_procinfo[proc]; + task_setup_data.rpc_client = clnt; + + /* bootstrap and kick off the async RPC call */ + return rpc_run_task(&task_setup_data); +out_err: + tk_ops->rpc_release(req); + return ERR_PTR(-ENOLCK); +} + +static int nlm_do_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +{ + struct rpc_task *task; + + task = __nlm_async_call(req, proc, msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} + +/* + * NLM asynchronous call. + */ +int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, + }; + return nlm_do_async_call(req, proc, &msg, tk_ops); +} + +int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_res, + }; + return nlm_do_async_call(req, proc, &msg, tk_ops); +} + +/* + * NLM client asynchronous call. + * + * Note that although the calls are asynchronous, and are therefore + * guaranteed to complete, we still always attempt to wait for + * completion in order to be able to correctly track the lock + * state. + */ +static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, + .rpc_cred = cred, + }; + struct rpc_task *task; + int err; + + task = __nlm_async_call(req, proc, &msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + err = rpc_wait_for_completion_task(task); + rpc_put_task(task); + return err; +} + +/* + * TEST for the presence of a conflicting lock + */ +static int +nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) +{ + int status; + + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST); + if (status < 0) + goto out; + + switch (req->a_res.status) { + case nlm_granted: + fl->fl_type = F_UNLCK; + break; + case nlm_lck_denied: + /* + * Report the conflicting lock back to the application. + */ + fl->fl_start = req->a_res.lock.fl.fl_start; + fl->fl_end = req->a_res.lock.fl.fl_end; + fl->fl_type = req->a_res.lock.fl.fl_type; + fl->fl_pid = 0; + break; + default: + status = nlm_stat_to_errno(req->a_res.status); + } +out: + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); + new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; + new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); + list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); +} + +static void nlmclnt_locks_release_private(struct file_lock *fl) +{ + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); + list_del(&fl->fl_u.nfs_fl.list); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); + nlm_put_lockowner(fl->fl_u.nfs_fl.owner); +} + +static const struct file_lock_operations nlmclnt_lock_ops = { + .fl_copy_lock = nlmclnt_locks_copy_lock, + .fl_release_private = nlmclnt_locks_release_private, +}; + +static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) +{ + fl->fl_u.nfs_fl.state = 0; + fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner); + INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); + fl->fl_ops = &nlmclnt_lock_ops; +} + +static int do_vfs_lock(struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(fl->fl_file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(fl->fl_file, fl); + break; + default: + BUG(); + } + return res; +} + +/* + * LOCK: Try to create a lock + * + * Programmer Harassment Alert + * + * When given a blocking lock request in a sync RPC call, the HPUX lockd + * will faithfully return LCK_BLOCKED but never cares to notify us when + * the lock could be granted. This way, our local process could hang + * around forever waiting for the callback. + * + * Solution A: Implement busy-waiting + * Solution B: Use the async version of the call (NLM_LOCK_{MSG,RES}) + * + * For now I am implementing solution A, because I hate the idea of + * re-implementing lockd for a third time in two months. The async + * calls shouldn't be too hard to do, however. + * + * This is one of the lovely things about standards in the NFS area: + * they're so soft and squishy you can't really blame HP for doing this. + */ +static int +nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) +{ + struct rpc_cred *cred = nfs_file_cred(fl->fl_file); + struct nlm_host *host = req->a_host; + struct nlm_res *resp = &req->a_res; + struct nlm_wait *block = NULL; + unsigned char fl_flags = fl->fl_flags; + unsigned char fl_type; + int status = -ENOLCK; + + if (nsm_monitor(host) < 0) + goto out; + req->a_args.state = nsm_local_state; + + fl->fl_flags |= FL_ACCESS; + status = do_vfs_lock(fl); + fl->fl_flags = fl_flags; + if (status < 0) + goto out; + + block = nlmclnt_prepare_block(host, fl); +again: + /* + * Initialise resp->status to a valid non-zero value, + * since 0 == nlm_lck_granted + */ + resp->status = nlm_lck_blocked; + for(;;) { + /* Reboot protection */ + fl->fl_u.nfs_fl.state = host->h_state; + status = nlmclnt_call(cred, req, NLMPROC_LOCK); + if (status < 0) + break; + /* Did a reclaimer thread notify us of a server reboot? */ + if (resp->status == nlm_lck_denied_grace_period) + continue; + if (resp->status != nlm_lck_blocked) + break; + /* Wait on an NLM blocking lock */ + status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); + if (status < 0) + break; + if (resp->status != nlm_lck_blocked) + break; + } + + /* if we were interrupted while blocking, then cancel the lock request + * and exit + */ + if (resp->status == nlm_lck_blocked) { + if (!req->a_args.block) + goto out_unlock; + if (nlmclnt_cancel(host, req->a_args.block, fl) == 0) + goto out_unblock; + } + + if (resp->status == nlm_granted) { + down_read(&host->h_rwsem); + /* Check whether or not the server has rebooted */ + if (fl->fl_u.nfs_fl.state != host->h_state) { + up_read(&host->h_rwsem); + goto again; + } + /* Ensure the resulting lock will get added to granted list */ + fl->fl_flags |= FL_SLEEP; + if (do_vfs_lock(fl) < 0) + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); + up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + status = 0; + } + if (status < 0) + goto out_unlock; + /* + * EAGAIN doesn't make sense for sleeping locks, and in some + * cases NLM_LCK_DENIED is returned for a permanent error. So + * turn it into an ENOLCK. + */ + if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) + status = -ENOLCK; + else + status = nlm_stat_to_errno(resp->status); +out_unblock: + nlmclnt_finish_block(block); +out: + nlmclnt_release_call(req); + return status; +out_unlock: + /* Fatal error: ensure that we remove the lock altogether */ + dprintk("lockd: lock attempt ended in fatal error.\n" + " Attempting to unlock.\n"); + nlmclnt_finish_block(block); + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + down_read(&host->h_rwsem); + do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_type = fl_type; + fl->fl_flags = fl_flags; + nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + return status; +} + +/* + * RECLAIM: Try to reclaim a lock + */ +int +nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl, + struct nlm_rqst *req) +{ + int status; + + memset(req, 0, sizeof(*req)); + locks_init_lock(&req->a_args.lock.fl); + locks_init_lock(&req->a_res.lock.fl); + req->a_host = host; + + /* Set up the argument struct */ + nlmclnt_setlockargs(req, fl); + req->a_args.reclaim = 1; + + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK); + if (status >= 0 && req->a_res.status == nlm_granted) + return 0; + + printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d " + "(errno %d, status %d)\n", fl->fl_pid, + status, ntohl(req->a_res.status)); + + /* + * FIXME: This is a serious failure. We can + * + * a. Ignore the problem + * b. Send the owning process some signal (Linux doesn't have + * SIGLOST, though...) + * c. Retry the operation + * + * Until someone comes up with a simple implementation + * for b or c, I'll choose option a. + */ + + return -ENOLCK; +} + +/* + * UNLOCK: remove an existing lock + */ +static int +nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) +{ + struct nlm_host *host = req->a_host; + struct nlm_res *resp = &req->a_res; + int status; + unsigned char fl_flags = fl->fl_flags; + + /* + * Note: the server is supposed to either grant us the unlock + * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either + * case, we want to unlock. + */ + fl->fl_flags |= FL_EXISTS; + down_read(&host->h_rwsem); + status = do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + if (status == -ENOENT) { + status = 0; + goto out; + } + + atomic_inc(&req->a_count); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + if (status < 0) + goto out; + + if (resp->status == nlm_granted) + goto out; + + if (resp->status != nlm_lck_denied_nolocks) + printk("lockd: unexpected unlock status: %d\n", + ntohl(resp->status)); + /* What to do now? I'm out of my depth... */ + status = -ENOLCK; +out: + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + u32 status = ntohl(req->a_res.status); + + if (RPC_ASSASSINATED(task)) + goto die; + + if (task->tk_status < 0) { + dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status); + switch (task->tk_status) { + case -EACCES: + case -EIO: + goto die; + default: + goto retry_rebind; + } + } + if (status == NLM_LCK_DENIED_GRACE_PERIOD) { + rpc_delay(task, NLMCLNT_GRACE_WAIT); + goto retry_unlock; + } + if (status != NLM_LCK_GRANTED) + printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status); +die: + return; + retry_rebind: + nlm_rebind_host(req->a_host); + retry_unlock: + rpc_restart_call(task); +} + +static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_done = nlmclnt_unlock_callback, + .rpc_release = nlmclnt_rpc_release, +}; + +/* + * Cancel a blocked lock request. + * We always use an async RPC call for this in order not to hang a + * process that has been Ctrl-C'ed. + */ +static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) +{ + struct nlm_rqst *req; + int status; + + dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" + " Attempting to cancel lock.\n"); + + req = nlm_alloc_call(host); + if (!req) + return -ENOMEM; + req->a_flags = RPC_TASK_ASYNC; + + nlmclnt_setlockargs(req, fl); + req->a_args.block = block; + + atomic_inc(&req->a_count); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_CANCEL, &nlmclnt_cancel_ops); + if (status == 0 && req->a_res.status == nlm_lck_denied) + status = -ENOLCK; + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + u32 status = ntohl(req->a_res.status); + + if (RPC_ASSASSINATED(task)) + goto die; + + if (task->tk_status < 0) { + dprintk("lockd: CANCEL call error %d, retrying.\n", + task->tk_status); + goto retry_cancel; + } + + dprintk("lockd: cancel status %u (task %u)\n", + status, task->tk_pid); + + switch (status) { + case NLM_LCK_GRANTED: + case NLM_LCK_DENIED_GRACE_PERIOD: + case NLM_LCK_DENIED: + /* Everything's good */ + break; + case NLM_LCK_DENIED_NOLOCKS: + dprintk("lockd: CANCEL failed (server has no locks)\n"); + goto retry_cancel; + default: + printk(KERN_NOTICE "lockd: weird return %d for CANCEL call\n", + status); + } + +die: + return; + +retry_cancel: + /* Don't ever retry more than 3 times */ + if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) + goto die; + nlm_rebind_host(req->a_host); + rpc_restart_call(task); + rpc_delay(task, 30 * HZ); +} + +static const struct rpc_call_ops nlmclnt_cancel_ops = { + .rpc_call_done = nlmclnt_cancel_callback, + .rpc_release = nlmclnt_rpc_release, +}; + +/* + * Convert an NLM status code to a generic kernel errno + */ +static int +nlm_stat_to_errno(__be32 status) +{ + switch(ntohl(status)) { + case NLM_LCK_GRANTED: + return 0; + case NLM_LCK_DENIED: + return -EAGAIN; + case NLM_LCK_DENIED_NOLOCKS: + case NLM_LCK_DENIED_GRACE_PERIOD: + return -ENOLCK; + case NLM_LCK_BLOCKED: + printk(KERN_NOTICE "lockd: unexpected status NLM_BLOCKED\n"); + return -ENOLCK; +#ifdef CONFIG_LOCKD_V4 + case NLM_DEADLCK: + return -EDEADLK; + case NLM_ROFS: + return -EROFS; + case NLM_STALE_FH: + return -ESTALE; + case NLM_FBIG: + return -EOVERFLOW; + case NLM_FAILED: + return -ENOLCK; +#endif + } + printk(KERN_NOTICE "lockd: unexpected server status %d\n", + ntohl(status)); + return -ENOLCK; +} diff --git a/kernel/fs/lockd/clntxdr.c b/kernel/fs/lockd/clntxdr.c new file mode 100644 index 000000000..3e9f7874b --- /dev/null +++ b/kernel/fs/lockd/clntxdr.c @@ -0,0 +1,621 @@ +/* + * linux/fs/lockd/clntxdr.c + * + * XDR functions to encode/decode NLM version 3 RPC arguments and results. + * NLM version 3 is backwards compatible with NLM versions 1 and 2. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2)) +#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz) +#define NLM_holder_sz (4+NLM_owner_sz) + +#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz) +#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz) +#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz) +#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz) + +#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz) +#define NLM_res_sz (NLM_cookie_sz+1) +#define NLM_norep_sz (0) + + +static s32 loff_t_to_s32(loff_t offset) +{ + s32 res; + + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm_compute_offsets(const struct nlm_lock *lock, + u32 *l_offset, u32 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + *l_offset = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv3 basic data types + * + * Basic NLMv3 data types are not defined in an IETF standards + * document. X/Open has a description of these data types that + * is useful. See Chapter 10 of "Protocols for Interworking: + * XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); +} + +/* + * enum nlm_stats { + * LCK_GRANTED = 0, + * LCK_DENIED = 1, + * LCK_DENIED_NOLOCKS = 2, + * LCK_BLOCKED = 3, + * LCK_DENIED_GRACE_PERIOD = 4 + * }; + * + * + * struct nlm_stat { + * nlm_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. + */ + +static void encode_nlm_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + WARN_ON_ONCE(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm_stat(struct xdr_stream *xdr, + __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(ntohl(*p) > ntohl(nlm_lck_denied_grace_period))) + goto out_enum; + *stat = *p; + return 0; +out_enum: + dprintk("%s: server returned invalid nlm_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm_holder { + * bool exclusive; + * int uppid; + * netobj oh; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u32 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + +static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u32 exclusive, l_offset, l_len; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + l_offset = be32_to_cpup(p++); + l_len = be32_to_cpup(p); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm_lock { + * string caller_name; + * netobj fh; + * netobj oh; + * int uppid; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u32 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(lock->svid); + + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + + +/* + * NLMv3 XDR encode functions + * + * NLMv3 argument types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". + */ + +/* + * struct nlm_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_unlockargs { + * netobj cookie; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static void nlm_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); +} + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static void encode_nlm_testrply(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + if (result->status == nlm_lck_denied) + encode_nlm_holder(xdr, result); +} + +static void nlm_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); + encode_nlm_testrply(xdr, result); +} + + +/* + * NLMv3 XDR decode functions + * + * NLMv3 result types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". + */ + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static int decode_nlm_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm_holder(xdr, result); +out: + return error; +} + +static int nlm_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static int nlm_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \ + .p_arglen = NLM_##argtype##_sz, \ + .p_replen = NLM_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +static const struct rpc_version nlm_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static const struct rpc_version nlm_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static const struct rpc_version *nlm_versions[] = { + [1] = &nlm_version1, + [3] = &nlm_version3, +#ifdef CONFIG_LOCKD_V4 + [4] = &nlm_version4, +#endif +}; + +static struct rpc_stat nlm_rpc_stats; + +const struct rpc_program nlm_program = { + .name = "lockd", + .number = NLM_PROGRAM, + .nrvers = ARRAY_SIZE(nlm_versions), + .version = nlm_versions, + .stats = &nlm_rpc_stats, +}; diff --git a/kernel/fs/lockd/host.c b/kernel/fs/lockd/host.c new file mode 100644 index 000000000..969d589c8 --- /dev/null +++ b/kernel/fs/lockd/host.c @@ -0,0 +1,675 @@ +/* + * linux/fs/lockd/host.c + * + * Management for NLM peer hosts. The nlm_host struct is shared + * between client and server implementation. The only reason to + * do so is to reduce code bloat. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "netns.h" + +#define NLMDBG_FACILITY NLMDBG_HOSTCACHE +#define NLM_HOST_NRHASH 32 +#define NLM_HOST_REBIND (60 * HZ) +#define NLM_HOST_EXPIRE (300 * HZ) +#define NLM_HOST_COLLECT (120 * HZ) + +static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; + +#define for_each_host(host, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry((host), (chain), h_hash) + +#define for_each_host_safe(host, next, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry_safe((host), (next), \ + (chain), h_hash) + +static unsigned long nrhosts; +static DEFINE_MUTEX(nlm_host_mutex); + +static void nlm_gc_hosts(struct net *net); + +struct nlm_lookup_host_info { + const int server; /* search for server|client */ + const struct sockaddr *sap; /* address to search for */ + const size_t salen; /* it's length */ + const unsigned short protocol; /* transport to search for*/ + const u32 version; /* NLM version to search for */ + const char *hostname; /* remote's hostname */ + const size_t hostname_len; /* it's length */ + const int noresvport; /* use non-priv port */ + struct net *net; /* network namespace to bind */ +}; + +/* + * Hash function must work well on big- and little-endian platforms + */ +static unsigned int __nlm_hash32(const __be32 n) +{ + unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16); + return hash ^ (hash >> 8); +} + +static unsigned int __nlm_hash_addr4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return __nlm_hash32(sin->sin_addr.s_addr); +} + +static unsigned int __nlm_hash_addr6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr addr = sin6->sin6_addr; + return __nlm_hash32(addr.s6_addr32[0]) ^ + __nlm_hash32(addr.s6_addr32[1]) ^ + __nlm_hash32(addr.s6_addr32[2]) ^ + __nlm_hash32(addr.s6_addr32[3]); +} + +static unsigned int nlm_hash_address(const struct sockaddr *sap) +{ + unsigned int hash; + + switch (sap->sa_family) { + case AF_INET: + hash = __nlm_hash_addr4(sap); + break; + case AF_INET6: + hash = __nlm_hash_addr6(sap); + break; + default: + hash = 0; + } + return hash & (NLM_HOST_NRHASH - 1); +} + +/* + * Allocate and initialize an nlm_host. Common to both client and server. + */ +static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, + struct nsm_handle *nsm) +{ + struct nlm_host *host = NULL; + unsigned long now = jiffies; + + if (nsm != NULL) + atomic_inc(&nsm->sm_count); + else { + host = NULL; + nsm = nsm_get_handle(ni->sap, ni->salen, + ni->hostname, ni->hostname_len); + if (unlikely(nsm == NULL)) { + dprintk("lockd: %s failed; no nsm handle\n", + __func__); + goto out; + } + } + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (unlikely(host == NULL)) { + dprintk("lockd: %s failed; no memory\n", __func__); + nsm_release(nsm); + goto out; + } + + memcpy(nlm_addr(host), ni->sap, ni->salen); + host->h_addrlen = ni->salen; + rpc_set_port(nlm_addr(host), 0); + host->h_srcaddrlen = 0; + + host->h_rpcclnt = NULL; + host->h_name = nsm->sm_name; + host->h_version = ni->version; + host->h_proto = ni->protocol; + host->h_reclaiming = 0; + host->h_server = ni->server; + host->h_noresvport = ni->noresvport; + host->h_inuse = 0; + init_waitqueue_head(&host->h_gracewait); + init_rwsem(&host->h_rwsem); + host->h_state = 0; + host->h_nsmstate = 0; + host->h_pidcount = 0; + atomic_set(&host->h_count, 1); + mutex_init(&host->h_mutex); + host->h_nextrebind = now + NLM_HOST_REBIND; + host->h_expires = now + NLM_HOST_EXPIRE; + INIT_LIST_HEAD(&host->h_lockowners); + spin_lock_init(&host->h_lock); + INIT_LIST_HEAD(&host->h_granted); + INIT_LIST_HEAD(&host->h_reclaim); + host->h_nsmhandle = nsm; + host->h_addrbuf = nsm->sm_addrbuf; + host->net = ni->net; + +out: + return host; +} + +/* + * Destroy an nlm_host and free associated resources + * + * Caller must hold nlm_host_mutex. + */ +static void nlm_destroy_host_locked(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + struct lockd_net *ln = net_generic(host->net, lockd_net_id); + + dprintk("lockd: destroy host %s\n", host->h_name); + + hlist_del_init(&host->h_hash); + + nsm_unmonitor(host); + nsm_release(host->h_nsmhandle); + + clnt = host->h_rpcclnt; + if (clnt != NULL) + rpc_shutdown_client(clnt); + kfree(host); + + ln->nrhosts--; + nrhosts--; +} + +/** + * nlmclnt_lookup_host - Find an NLM host handle matching a remote server + * @sap: network address of server + * @salen: length of server address + * @protocol: transport protocol to use + * @version: NLM protocol version + * @hostname: '\0'-terminated hostname of server + * @noresvport: 1 if non-privileged port should be used + * + * Returns an nlm_host structure that matches the passed-in + * [server address, transport protocol, NLM version, server hostname]. + * If one doesn't already exist in the host cache, a new handle is + * created and returned. + */ +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, + const char *hostname, + int noresvport, + struct net *net) +{ + struct nlm_lookup_host_info ni = { + .server = 0, + .sap = sap, + .salen = salen, + .protocol = protocol, + .version = version, + .hostname = hostname, + .hostname_len = strlen(hostname), + .noresvport = noresvport, + .net = net, + }; + struct hlist_head *chain; + struct nlm_host *host; + struct nsm_handle *nsm = NULL; + struct lockd_net *ln = net_generic(net, lockd_net_id); + + dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, + (hostname ? hostname : ""), version, + (protocol == IPPROTO_UDP ? "udp" : "tcp")); + + mutex_lock(&nlm_host_mutex); + + chain = &nlm_client_hosts[nlm_hash_address(sap)]; + hlist_for_each_entry(host, chain, h_hash) { + if (host->net != net) + continue; + if (!rpc_cmp_addr(nlm_addr(host), sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != protocol) + continue; + if (host->h_version != version) + continue; + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + hlist_add_head(&host->h_hash, chain); + ln->nrhosts++; + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmclnt_release_host - release client nlm_host + * @host: nlm_host to release + * + */ +void nlmclnt_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release client host %s\n", host->h_name); + + WARN_ON_ONCE(host->h_server); + + if (atomic_dec_and_test(&host->h_count)) { + WARN_ON_ONCE(!list_empty(&host->h_lockowners)); + WARN_ON_ONCE(!list_empty(&host->h_granted)); + WARN_ON_ONCE(!list_empty(&host->h_reclaim)); + + mutex_lock(&nlm_host_mutex); + nlm_destroy_host_locked(host); + mutex_unlock(&nlm_host_mutex); + } +} + +/** + * nlmsvc_lookup_host - Find an NLM host handle matching a remote client + * @rqstp: incoming NLM request + * @hostname: name of client host + * @hostname_len: length of client hostname + * + * Returns an nlm_host structure that matches the [client address, + * transport protocol, NLM version, client hostname] of the passed-in + * NLM request. If one doesn't already exist in the host cache, a + * new handle is created and returned. + * + * Before possibly creating a new nlm_host, construct a sockaddr + * for a specific source address in case the local system has + * multiple network addresses. The family of the address in + * rq_daddr is guaranteed to be the same as the family of the + * address in rq_addr, so it's safe to use the same family for + * the source address. + */ +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len) +{ + struct hlist_head *chain; + struct nlm_host *host = NULL; + struct nsm_handle *nsm = NULL; + struct sockaddr *src_sap = svc_daddr(rqstp); + size_t src_len = rqstp->rq_daddrlen; + struct net *net = SVC_NET(rqstp); + struct nlm_lookup_host_info ni = { + .server = 1, + .sap = svc_addr(rqstp), + .salen = rqstp->rq_addrlen, + .protocol = rqstp->rq_prot, + .version = rqstp->rq_vers, + .hostname = hostname, + .hostname_len = hostname_len, + .net = net, + }; + struct lockd_net *ln = net_generic(net, lockd_net_id); + + dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + (int)hostname_len, hostname, rqstp->rq_vers, + (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + + mutex_lock(&nlm_host_mutex); + + if (time_after_eq(jiffies, ln->next_gc)) + nlm_gc_hosts(net); + + chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; + hlist_for_each_entry(host, chain, h_hash) { + if (host->net != net) + continue; + if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != ni.protocol) + continue; + if (host->h_version != ni.version) + continue; + if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap)) + continue; + + /* Move to head of hash chain. */ + hlist_del(&host->h_hash); + hlist_add_head(&host->h_hash, chain); + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + memcpy(nlm_srcaddr(host), src_sap, src_len); + host->h_srcaddrlen = src_len; + hlist_add_head(&host->h_hash, chain); + ln->nrhosts++; + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmsvc_release_host - release server nlm_host + * @host: nlm_host to release + * + * Host is destroyed later in nlm_gc_host(). + */ +void nlmsvc_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release server host %s\n", host->h_name); + + WARN_ON_ONCE(!host->h_server); + atomic_dec(&host->h_count); +} + +/* + * Create the NLM RPC client for an NLM peer + */ +struct rpc_clnt * +nlm_bind_host(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + + dprintk("lockd: nlm_bind_host %s (%s)\n", + host->h_name, host->h_addrbuf); + + /* Lock host handle */ + mutex_lock(&host->h_mutex); + + /* If we've already created an RPC client, check whether + * RPC rebind is required + */ + if ((clnt = host->h_rpcclnt) != NULL) { + if (time_after_eq(jiffies, host->h_nextrebind)) { + rpc_force_rebind(clnt); + host->h_nextrebind = jiffies + NLM_HOST_REBIND; + dprintk("lockd: next rebind in %lu jiffies\n", + host->h_nextrebind - jiffies); + } + } else { + unsigned long increment = nlmsvc_timeout; + struct rpc_timeout timeparms = { + .to_initval = increment, + .to_increment = increment, + .to_maxval = increment * 6UL, + .to_retries = 5U, + }; + struct rpc_create_args args = { + .net = host->net, + .protocol = host->h_proto, + .address = nlm_addr(host), + .addrsize = host->h_addrlen, + .timeout = &timeparms, + .servername = host->h_name, + .program = &nlm_program, + .version = host->h_version, + .authflavor = RPC_AUTH_UNIX, + .flags = (RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_AUTOBIND), + }; + + /* + * lockd retries server side blocks automatically so we want + * those to be soft RPC calls. Client side calls need to be + * hard RPC tasks. + */ + if (!host->h_server) + args.flags |= RPC_CLNT_CREATE_HARDRTRY; + if (host->h_noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (host->h_srcaddrlen) + args.saddress = nlm_srcaddr(host); + + clnt = rpc_create(&args); + if (!IS_ERR(clnt)) + host->h_rpcclnt = clnt; + else { + printk("lockd: couldn't create RPC handle for %s\n", host->h_name); + clnt = NULL; + } + } + + mutex_unlock(&host->h_mutex); + return clnt; +} + +/* + * Force a portmap lookup of the remote lockd port + */ +void +nlm_rebind_host(struct nlm_host *host) +{ + dprintk("lockd: rebind host %s\n", host->h_name); + if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) { + rpc_force_rebind(host->h_rpcclnt); + host->h_nextrebind = jiffies + NLM_HOST_REBIND; + } +} + +/* + * Increment NLM host count + */ +struct nlm_host * nlm_get_host(struct nlm_host *host) +{ + if (host) { + dprintk("lockd: get host %s\n", host->h_name); + atomic_inc(&host->h_count); + host->h_expires = jiffies + NLM_HOST_EXPIRE; + } + return host; +} + +static struct nlm_host *next_host_state(struct hlist_head *cache, + struct nsm_handle *nsm, + const struct nlm_reboot *info) +{ + struct nlm_host *host; + struct hlist_head *chain; + + mutex_lock(&nlm_host_mutex); + for_each_host(host, chain, cache) { + if (host->h_nsmhandle == nsm + && host->h_nsmstate != info->state) { + host->h_nsmstate = info->state; + host->h_state++; + + nlm_get_host(host); + mutex_unlock(&nlm_host_mutex); + return host; + } + } + + mutex_unlock(&nlm_host_mutex); + return NULL; +} + +/** + * nlm_host_rebooted - Release all resources held by rebooted host + * @info: pointer to decoded results of NLM_SM_NOTIFY call + * + * We were notified that the specified host has rebooted. Release + * all resources held by that peer. + */ +void nlm_host_rebooted(const struct nlm_reboot *info) +{ + struct nsm_handle *nsm; + struct nlm_host *host; + + nsm = nsm_reboot_lookup(info); + if (unlikely(nsm == NULL)) + return; + + /* Mark all hosts tied to this NSM state as having rebooted. + * We run the loop repeatedly, because we drop the host table + * lock for this. + * To avoid processing a host several times, we match the nsmstate. + */ + while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) { + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + } + while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) { + nlmclnt_recovery(host); + nlmclnt_release_host(host); + } + + nsm_release(nsm); +} + +static void nlm_complain_hosts(struct net *net) +{ + struct hlist_head *chain; + struct nlm_host *host; + + if (net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + if (ln->nrhosts == 0) + return; + printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net); + dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net); + } else { + if (nrhosts == 0) + return; + printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); + dprintk("lockd: %lu hosts left:\n", nrhosts); + } + + for_each_host(host, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; + dprintk(" %s (cnt %d use %d exp %ld net %p)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires, host->net); + } +} + +void +nlm_shutdown_hosts_net(struct net *net) +{ + struct hlist_head *chain; + struct nlm_host *host; + + mutex_lock(&nlm_host_mutex); + + /* First, make all hosts eligible for gc */ + dprintk("lockd: nuking all hosts in net %p...\n", net); + for_each_host(host, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; + host->h_expires = jiffies - 1; + if (host->h_rpcclnt) { + rpc_shutdown_client(host->h_rpcclnt); + host->h_rpcclnt = NULL; + } + } + + /* Then, perform a garbage collection pass */ + nlm_gc_hosts(net); + mutex_unlock(&nlm_host_mutex); + + nlm_complain_hosts(net); +} + +/* + * Shut down the hosts module. + * Note that this routine is called only at server shutdown time. + */ +void +nlm_shutdown_hosts(void) +{ + dprintk("lockd: shutting down host module\n"); + nlm_shutdown_hosts_net(NULL); +} + +/* + * Garbage collect any unused NLM hosts. + * This GC combines reference counting for async operations with + * mark & sweep for resources held by remote clients. + */ +static void +nlm_gc_hosts(struct net *net) +{ + struct hlist_head *chain; + struct hlist_node *next; + struct nlm_host *host; + + dprintk("lockd: host garbage collection for net %p\n", net); + for_each_host(host, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; + host->h_inuse = 0; + } + + /* Mark all hosts that hold locks, blocks or shares */ + nlmsvc_mark_resources(net); + + for_each_host_safe(host, next, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; + if (atomic_read(&host->h_count) || host->h_inuse + || time_before(jiffies, host->h_expires)) { + dprintk("nlm_gc_hosts skipping %s " + "(cnt %d use %d exp %ld net %p)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires, host->net); + continue; + } + nlm_destroy_host_locked(host); + } + + if (net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + ln->next_gc = jiffies + NLM_HOST_COLLECT; + } +} diff --git a/kernel/fs/lockd/mon.c b/kernel/fs/lockd/mon.c new file mode 100644 index 000000000..47a32b6d9 --- /dev/null +++ b/kernel/fs/lockd/mon.c @@ -0,0 +1,622 @@ +/* + * linux/fs/lockd/mon.c + * + * The kernel statd client. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "netns.h" + +#define NLMDBG_FACILITY NLMDBG_MONITOR +#define NSM_PROGRAM 100024 +#define NSM_VERSION 1 + +enum { + NSMPROC_NULL, + NSMPROC_STAT, + NSMPROC_MON, + NSMPROC_UNMON, + NSMPROC_UNMON_ALL, + NSMPROC_SIMU_CRASH, + NSMPROC_NOTIFY, +}; + +struct nsm_args { + struct nsm_private *priv; + u32 prog; /* RPC callback info */ + u32 vers; + u32 proc; + + char *mon_name; + char *nodename; +}; + +struct nsm_res { + u32 status; + u32 state; +}; + +static const struct rpc_program nsm_program; +static LIST_HEAD(nsm_handles); +static DEFINE_SPINLOCK(nsm_lock); + +/* + * Local NSM state + */ +u32 __read_mostly nsm_local_state; +bool __read_mostly nsm_use_hostnames; + +static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) +{ + return (struct sockaddr *)&nsm->sm_addr; +} + +static struct rpc_clnt *nsm_create(struct net *net, const char *nodename) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; + struct rpc_create_args args = { + .net = net, + .protocol = XPRT_TRANSPORT_TCP, + .address = (struct sockaddr *)&sin, + .addrsize = sizeof(sin), + .servername = "rpc.statd", + .nodename = nodename, + .program = &nsm_program, + .version = NSM_VERSION, + .authflavor = RPC_AUTH_NULL, + .flags = RPC_CLNT_CREATE_NOPING, + }; + + return rpc_create(&args); +} + +static struct rpc_clnt *nsm_client_set(struct lockd_net *ln, + struct rpc_clnt *clnt) +{ + spin_lock(&ln->nsm_clnt_lock); + if (ln->nsm_users == 0) { + if (clnt == NULL) + goto out; + ln->nsm_clnt = clnt; + } + clnt = ln->nsm_clnt; + ln->nsm_users++; +out: + spin_unlock(&ln->nsm_clnt_lock); + return clnt; +} + +static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename) +{ + struct rpc_clnt *clnt, *new; + struct lockd_net *ln = net_generic(net, lockd_net_id); + + clnt = nsm_client_set(ln, NULL); + if (clnt != NULL) + goto out; + + clnt = new = nsm_create(net, nodename); + if (IS_ERR(clnt)) + goto out; + + clnt = nsm_client_set(ln, new); + if (clnt != new) + rpc_shutdown_client(new); +out: + return clnt; +} + +static void nsm_client_put(struct net *net) +{ + struct lockd_net *ln = net_generic(net, lockd_net_id); + struct rpc_clnt *clnt = NULL; + + spin_lock(&ln->nsm_clnt_lock); + ln->nsm_users--; + if (ln->nsm_users == 0) { + clnt = ln->nsm_clnt; + ln->nsm_clnt = NULL; + } + spin_unlock(&ln->nsm_clnt_lock); + if (clnt != NULL) + rpc_shutdown_client(clnt); +} + +static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, + struct rpc_clnt *clnt) +{ + int status; + struct nsm_args args = { + .priv = &nsm->sm_priv, + .prog = NLM_PROGRAM, + .vers = 3, + .proc = NLMPROC_NSM_NOTIFY, + .mon_name = nsm->sm_mon_name, + .nodename = clnt->cl_nodename, + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = res, + }; + + memset(res, 0, sizeof(*res)); + + msg.rpc_proc = &clnt->cl_procinfo[proc]; + status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); + if (status == -ECONNREFUSED) { + dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n", + status); + rpc_force_rebind(clnt); + status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); + } + if (status < 0) + dprintk("lockd: NSM upcall RPC failed, status=%d\n", + status); + else + status = 0; + return status; +} + +/** + * nsm_monitor - Notify a peer in case we reboot + * @host: pointer to nlm_host of peer to notify + * + * If this peer is not already monitored, this function sends an + * upcall to the local rpc.statd to record the name/address of + * the peer to notify in case we reboot. + * + * Returns zero if the peer is monitored by the local rpc.statd; + * otherwise a negative errno value is returned. + */ +int nsm_monitor(const struct nlm_host *host) +{ + struct nsm_handle *nsm = host->h_nsmhandle; + struct nsm_res res; + int status; + struct rpc_clnt *clnt; + const char *nodename = NULL; + + dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); + + if (nsm->sm_monitored) + return 0; + + if (host->h_rpcclnt) + nodename = host->h_rpcclnt->cl_nodename; + + /* + * Choose whether to record the caller_name or IP address of + * this peer in the local rpc.statd's database. + */ + nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; + + clnt = nsm_client_get(host->net, nodename); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + dprintk("lockd: failed to create NSM upcall transport, " + "status=%d, net=%p\n", status, host->net); + return status; + } + + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt); + if (unlikely(res.status != 0)) + status = -EIO; + if (unlikely(status < 0)) { + pr_notice_ratelimited("lockd: cannot monitor %s\n", nsm->sm_name); + return status; + } + + nsm->sm_monitored = 1; + if (unlikely(nsm_local_state != res.state)) { + nsm_local_state = res.state; + dprintk("lockd: NSM state changed to %d\n", nsm_local_state); + } + return 0; +} + +/** + * nsm_unmonitor - Unregister peer notification + * @host: pointer to nlm_host of peer to stop monitoring + * + * If this peer is monitored, this function sends an upcall to + * tell the local rpc.statd not to send this peer a notification + * when we reboot. + */ +void nsm_unmonitor(const struct nlm_host *host) +{ + struct nsm_handle *nsm = host->h_nsmhandle; + struct nsm_res res; + int status; + + if (atomic_read(&nsm->sm_count) == 1 + && nsm->sm_monitored && !nsm->sm_sticky) { + struct lockd_net *ln = net_generic(host->net, lockd_net_id); + + dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); + + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt); + if (res.status != 0) + status = -EIO; + if (status < 0) + printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", + nsm->sm_name); + else + nsm->sm_monitored = 0; + + nsm_client_put(host->net); + } +} + +static struct nsm_handle *nsm_lookup_hostname(const char *hostname, + const size_t len) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (strlen(nsm->sm_name) == len && + memcmp(nsm->sm_name, hostname, len) == 0) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (rpc_cmp_addr(nsm_addr(nsm), sap)) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (memcmp(nsm->sm_priv.data, priv->data, + sizeof(priv->data)) == 0) + return nsm; + return NULL; +} + +/* + * Construct a unique cookie to match this nsm_handle to this monitored + * host. It is passed to the local rpc.statd via NSMPROC_MON, and + * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these + * requests. + * + * The NSM protocol requires that these cookies be unique while the + * system is running. We prefer a stronger requirement of making them + * unique across reboots. If user space bugs cause a stale cookie to + * be sent to the kernel, it could cause the wrong host to lose its + * lock state if cookies were not unique across reboots. + * + * The cookies are exposed only to local user space via loopback. They + * do not appear on the physical network. If we want greater security + * for some reason, nsm_init_private() could perform a one-way hash to + * obscure the contents of the cookie. + */ +static void nsm_init_private(struct nsm_handle *nsm) +{ + u64 *p = (u64 *)&nsm->sm_priv.data; + s64 ns; + + ns = ktime_get_ns(); + put_unaligned(ns, p); + put_unaligned((unsigned long)nsm, p + 1); +} + +static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *new; + + new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); + if (unlikely(new == NULL)) + return NULL; + + atomic_set(&new->sm_count, 1); + new->sm_name = (char *)(new + 1); + memcpy(nsm_addr(new), sap, salen); + new->sm_addrlen = salen; + nsm_init_private(new); + + if (rpc_ntop(nsm_addr(new), new->sm_addrbuf, + sizeof(new->sm_addrbuf)) == 0) + (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf), + "unsupported address family"); + memcpy(new->sm_name, hostname, hostname_len); + new->sm_name[hostname_len] = '\0'; + + return new; +} + +/** + * nsm_get_handle - Find or create a cached nsm_handle + * @sap: pointer to socket address of handle to find + * @salen: length of socket address + * @hostname: pointer to C string containing hostname to find + * @hostname_len: length of C string + * + * Behavior is modulated by the global nsm_use_hostnames variable. + * + * Returns a cached nsm_handle after bumping its ref count, or + * returns a fresh nsm_handle if a handle that matches @sap and/or + * @hostname cannot be found in the handle cache. Returns NULL if + * an error occurs. + */ +struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, + const size_t salen, const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *cached, *new = NULL; + + if (hostname && memchr(hostname, '/', hostname_len) != NULL) { + if (printk_ratelimit()) { + printk(KERN_WARNING "Invalid hostname \"%.*s\" " + "in NFS lock request\n", + (int)hostname_len, hostname); + } + return NULL; + } + +retry: + spin_lock(&nsm_lock); + + if (nsm_use_hostnames && hostname != NULL) + cached = nsm_lookup_hostname(hostname, hostname_len); + else + cached = nsm_lookup_addr(sap); + + if (cached != NULL) { + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + kfree(new); + dprintk("lockd: found nsm_handle for %s (%s), " + "cnt %d\n", cached->sm_name, + cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; + } + + if (new != NULL) { + list_add(&new->sm_link, &nsm_handles); + spin_unlock(&nsm_lock); + dprintk("lockd: created nsm_handle for %s (%s)\n", + new->sm_name, new->sm_addrbuf); + return new; + } + + spin_unlock(&nsm_lock); + + new = nsm_create_handle(sap, salen, hostname, hostname_len); + if (unlikely(new == NULL)) + return NULL; + goto retry; +} + +/** + * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle + * @info: pointer to NLMPROC_SM_NOTIFY arguments + * + * Returns a matching nsm_handle if found in the nsm cache. The returned + * nsm_handle's reference count is bumped. Otherwise returns NULL if some + * error occurred. + */ +struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) +{ + struct nsm_handle *cached; + + spin_lock(&nsm_lock); + + cached = nsm_lookup_priv(&info->priv); + if (unlikely(cached == NULL)) { + spin_unlock(&nsm_lock); + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + info->len, info->mon); + return cached; + } + + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + + dprintk("lockd: host %s (%s) rebooted, cnt %d\n", + cached->sm_name, cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; +} + +/** + * nsm_release - Release an NSM handle + * @nsm: pointer to handle to be released + * + */ +void nsm_release(struct nsm_handle *nsm) +{ + if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + list_del(&nsm->sm_link); + spin_unlock(&nsm_lock); + dprintk("lockd: destroyed nsm_handle for %s (%s)\n", + nsm->sm_name, nsm->sm_addrbuf); + kfree(nsm); + } +} + +/* + * XDR functions for NSM. + * + * See http://www.opengroup.org/ for details on the Network + * Status Monitor wire protocol. + */ + +static void encode_nsm_string(struct xdr_stream *xdr, const char *string) +{ + const u32 len = strlen(string); + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, string, len); +} + +/* + * "mon_name" specifies the host to be monitored. + */ +static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + encode_nsm_string(xdr, argp->mon_name); +} + +/* + * The "my_id" argument specifies the hostname and RPC procedure + * to be called when the status manager receives notification + * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" + * has changed. + */ +static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + __be32 *p; + + encode_nsm_string(xdr, argp->nodename); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(argp->prog); + *p++ = cpu_to_be32(argp->vers); + *p = cpu_to_be32(argp->proc); +} + +/* + * The "mon_id" argument specifies the non-private arguments + * of an NSMPROC_MON or NSMPROC_UNMON call. + */ +static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + encode_mon_name(xdr, argp); + encode_my_id(xdr, argp); +} + +/* + * The "priv" argument may contain private information required + * by the NSMPROC_MON call. This information will be supplied in the + * NLMPROC_SM_NOTIFY call. + */ +static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, SM_PRIV_SIZE); + xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); +} + +static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) +{ + encode_mon_id(xdr, argp); + encode_priv(xdr, argp); +} + +static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) +{ + encode_mon_id(xdr, argp); +} + +static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + return -EIO; + resp->status = be32_to_cpup(p++); + resp->state = be32_to_cpup(p); + + dprintk("lockd: %s status %d state %d\n", + __func__, resp->status, resp->state); + return 0; +} + +static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + resp->state = be32_to_cpup(p); + + dprintk("lockd: %s state %d\n", __func__, resp->state); + return 0; +} + +#define SM_my_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) +#define SM_my_id_sz (SM_my_name_sz+3) +#define SM_mon_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) +#define SM_mon_id_sz (SM_mon_name_sz+SM_my_id_sz) +#define SM_priv_sz (XDR_QUADLEN(SM_PRIV_SIZE)) +#define SM_mon_sz (SM_mon_id_sz+SM_priv_sz) +#define SM_monres_sz 2 +#define SM_unmonres_sz 1 + +static struct rpc_procinfo nsm_procedures[] = { +[NSMPROC_MON] = { + .p_proc = NSMPROC_MON, + .p_encode = (kxdreproc_t)nsm_xdr_enc_mon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res, + .p_arglen = SM_mon_sz, + .p_replen = SM_monres_sz, + .p_statidx = NSMPROC_MON, + .p_name = "MONITOR", + }, +[NSMPROC_UNMON] = { + .p_proc = NSMPROC_UNMON, + .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat, + .p_arglen = SM_mon_id_sz, + .p_replen = SM_unmonres_sz, + .p_statidx = NSMPROC_UNMON, + .p_name = "UNMONITOR", + }, +}; + +static const struct rpc_version nsm_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nsm_procedures), + .procs = nsm_procedures +}; + +static const struct rpc_version *nsm_version[] = { + [1] = &nsm_version1, +}; + +static struct rpc_stat nsm_stats; + +static const struct rpc_program nsm_program = { + .name = "statd", + .number = NSM_PROGRAM, + .nrvers = ARRAY_SIZE(nsm_version), + .version = nsm_version, + .stats = &nsm_stats +}; diff --git a/kernel/fs/lockd/netns.h b/kernel/fs/lockd/netns.h new file mode 100644 index 000000000..097bfa3ad --- /dev/null +++ b/kernel/fs/lockd/netns.h @@ -0,0 +1,22 @@ +#ifndef __LOCKD_NETNS_H__ +#define __LOCKD_NETNS_H__ + +#include +#include + +struct lockd_net { + unsigned int nlmsvc_users; + unsigned long next_gc; + unsigned long nrhosts; + + struct delayed_work grace_period_end; + struct lock_manager lockd_manager; + + spinlock_t nsm_clnt_lock; + unsigned int nsm_users; + struct rpc_clnt *nsm_clnt; +}; + +extern int lockd_net_id; + +#endif diff --git a/kernel/fs/lockd/procfs.c b/kernel/fs/lockd/procfs.c new file mode 100644 index 000000000..2a0a98480 --- /dev/null +++ b/kernel/fs/lockd/procfs.c @@ -0,0 +1,92 @@ +/* + * Procfs support for lockd + * + * Copyright (c) 2014 Jeff Layton + */ + +#include +#include +#include +#include +#include + +#include "netns.h" +#include "procfs.h" + +/* + * We only allow strings that start with 'Y', 'y', or '1'. + */ +static ssize_t +nlm_end_grace_write(struct file *file, const char __user *buf, size_t size, + loff_t *pos) +{ + char *data; + struct lockd_net *ln = net_generic(current->nsproxy->net_ns, + lockd_net_id); + + if (size < 1) + return -EINVAL; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + + switch(data[0]) { + case 'Y': + case 'y': + case '1': + locks_end_grace(&ln->lockd_manager); + break; + default: + return -EINVAL; + } + + return size; +} + +static ssize_t +nlm_end_grace_read(struct file *file, char __user *buf, size_t size, + loff_t *pos) +{ + struct lockd_net *ln = net_generic(current->nsproxy->net_ns, + lockd_net_id); + char resp[3]; + + resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N'; + resp[1] = '\n'; + resp[2] = '\0'; + + return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp)); +} + +static const struct file_operations lockd_end_grace_operations = { + .write = nlm_end_grace_write, + .read = nlm_end_grace_read, + .llseek = default_llseek, + .release = simple_transaction_release, + .owner = THIS_MODULE, +}; + +int __init +lockd_create_procfs(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/lockd", NULL); + if (!entry) + return -ENOMEM; + entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry, + &lockd_end_grace_operations); + if (!entry) { + remove_proc_entry("fs/lockd", NULL); + return -ENOMEM; + } + return 0; +} + +void __exit +lockd_remove_procfs(void) +{ + remove_proc_entry("fs/lockd/nlm_end_grace", NULL); + remove_proc_entry("fs/lockd", NULL); +} diff --git a/kernel/fs/lockd/procfs.h b/kernel/fs/lockd/procfs.h new file mode 100644 index 000000000..2257a1311 --- /dev/null +++ b/kernel/fs/lockd/procfs.h @@ -0,0 +1,28 @@ +/* + * Procfs support for lockd + * + * Copyright (c) 2014 Jeff Layton + */ +#ifndef _LOCKD_PROCFS_H +#define _LOCKD_PROCFS_H + +#include + +#if IS_ENABLED(CONFIG_PROC_FS) +int lockd_create_procfs(void); +void lockd_remove_procfs(void); +#else +static inline int +lockd_create_procfs(void) +{ + return 0; +} + +static inline void +lockd_remove_procfs(void) +{ + return; +} +#endif /* IS_ENABLED(CONFIG_PROC_FS) */ + +#endif /* _LOCKD_PROCFS_H */ diff --git a/kernel/fs/lockd/svc.c b/kernel/fs/lockd/svc.c new file mode 100644 index 000000000..55505cbe1 --- /dev/null +++ b/kernel/fs/lockd/svc.c @@ -0,0 +1,695 @@ +/* + * linux/fs/lockd/svc.c + * + * This is the central lockd service. + * + * FIXME: Separate the lockd NFS server functionality from the lockd NFS + * client functionality. Oh why didn't Sun create two separate + * services in the first place? + * + * Authors: Olaf Kirch (okir@monad.swb.de) + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netns.h" +#include "procfs.h" + +#define NLMDBG_FACILITY NLMDBG_SVC +#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) +#define ALLOWED_SIGS (sigmask(SIGKILL)) + +static struct svc_program nlmsvc_program; + +struct nlmsvc_binding * nlmsvc_ops; +EXPORT_SYMBOL_GPL(nlmsvc_ops); + +static DEFINE_MUTEX(nlmsvc_mutex); +static unsigned int nlmsvc_users; +static struct task_struct *nlmsvc_task; +static struct svc_rqst *nlmsvc_rqst; +unsigned long nlmsvc_timeout; + +int lockd_net_id; + +/* + * These can be set at insmod time (useful for NFS as root filesystem), + * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 + */ +static unsigned long nlm_grace_period; +static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; +static int nlm_udpport, nlm_tcpport; + +/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ +static unsigned int nlm_max_connections = 1024; + +/* + * Constants needed for the sysctl interface. + */ +static const unsigned long nlm_grace_period_min = 0; +static const unsigned long nlm_grace_period_max = 240; +static const unsigned long nlm_timeout_min = 3; +static const unsigned long nlm_timeout_max = 20; +static const int nlm_port_min = 0, nlm_port_max = 65535; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header * nlm_sysctl_table; +#endif + +static unsigned long get_lockd_grace_period(void) +{ + /* Note: nlm_timeout should always be nonzero */ + if (nlm_grace_period) + return roundup(nlm_grace_period, nlm_timeout) * HZ; + else + return nlm_timeout * 5 * HZ; +} + +static void grace_ender(struct work_struct *grace) +{ + struct delayed_work *dwork = container_of(grace, struct delayed_work, + work); + struct lockd_net *ln = container_of(dwork, struct lockd_net, + grace_period_end); + + locks_end_grace(&ln->lockd_manager); +} + +static void set_grace_period(struct net *net) +{ + unsigned long grace_period = get_lockd_grace_period(); + struct lockd_net *ln = net_generic(net, lockd_net_id); + + locks_start_grace(net, &ln->lockd_manager); + cancel_delayed_work_sync(&ln->grace_period_end); + schedule_delayed_work(&ln->grace_period_end, grace_period); +} + +static void restart_grace(void) +{ + if (nlmsvc_ops) { + struct net *net = &init_net; + struct lockd_net *ln = net_generic(net, lockd_net_id); + + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); + nlmsvc_invalidate_all(); + set_grace_period(net); + } +} + +/* + * This is the lockd kernel thread + */ +static int +lockd(void *vrqstp) +{ + int err = 0; + struct svc_rqst *rqstp = vrqstp; + + /* try_to_freeze() is called from svc_recv() */ + set_freezable(); + + /* Allow SIGKILL to tell lockd to drop all of its locks */ + allow_signal(SIGKILL); + + dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); + + /* + * The main request loop. We don't terminate until the last + * NFS mount or NFS daemon has gone away. + */ + while (!kthread_should_stop()) { + long timeout = MAX_SCHEDULE_TIMEOUT; + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + + /* update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nlm_max_connections; + + if (signalled()) { + flush_signals(current); + restart_grace(); + continue; + } + + timeout = nlmsvc_retry_blocked(); + + /* + * Find a socket with data available and call its + * recvfrom routine. + */ + err = svc_recv(rqstp, timeout); + if (err == -EAGAIN || err == -EINTR) + continue; + dprintk("lockd: request from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + + svc_process(rqstp); + } + flush_signals(current); + if (nlmsvc_ops) + nlmsvc_invalidate_all(); + nlm_shutdown_hosts(); + return 0; +} + +static int create_lockd_listener(struct svc_serv *serv, const char *name, + struct net *net, const int family, + const unsigned short port) +{ + struct svc_xprt *xprt; + + xprt = svc_find_xprt(serv, name, net, family, 0); + if (xprt == NULL) + return svc_create_xprt(serv, name, net, family, port, + SVC_SOCK_DEFAULTS); + svc_xprt_put(xprt); + return 0; +} + +static int create_lockd_family(struct svc_serv *serv, struct net *net, + const int family) +{ + int err; + + err = create_lockd_listener(serv, "udp", net, family, nlm_udpport); + if (err < 0) + return err; + + return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport); +} + +/* + * Ensure there are active UDP and TCP listeners for lockd. + * + * Even if we have only TCP NFS mounts and/or TCP NFSDs, some + * local services (such as rpc.statd) still require UDP, and + * some NFS servers do not yet support NLM over TCP. + * + * Returns zero if all listeners are available; otherwise a + * negative errno value is returned. + */ +static int make_socks(struct svc_serv *serv, struct net *net) +{ + static int warned; + int err; + + err = create_lockd_family(serv, net, PF_INET); + if (err < 0) + goto out_err; + + err = create_lockd_family(serv, net, PF_INET6); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_err; + + warned = 0; + return 0; + +out_err: + if (warned++ == 0) + printk(KERN_WARNING + "lockd_up: makesock failed, error=%d\n", err); + svc_shutdown_net(serv, net); + return err; +} + +static int lockd_up_net(struct svc_serv *serv, struct net *net) +{ + struct lockd_net *ln = net_generic(net, lockd_net_id); + int error; + + if (ln->nlmsvc_users++) + return 0; + + error = svc_bind(serv, net); + if (error) + goto err_bind; + + error = make_socks(serv, net); + if (error < 0) + goto err_bind; + set_grace_period(net); + dprintk("lockd_up_net: per-net data created; net=%p\n", net); + return 0; + +err_bind: + ln->nlmsvc_users--; + return error; +} + +static void lockd_down_net(struct svc_serv *serv, struct net *net) +{ + struct lockd_net *ln = net_generic(net, lockd_net_id); + + if (ln->nlmsvc_users) { + if (--ln->nlmsvc_users == 0) { + nlm_shutdown_hosts_net(net); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); + svc_shutdown_net(serv, net); + dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); + } + } else { + printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", + nlmsvc_task, net); + BUG(); + } +} + +static int lockd_start_svc(struct svc_serv *serv) +{ + int error; + + if (nlmsvc_rqst) + return 0; + + /* + * Create the kernel thread and wait for it to start. + */ + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); + if (IS_ERR(nlmsvc_rqst)) { + error = PTR_ERR(nlmsvc_rqst); + printk(KERN_WARNING + "lockd_up: svc_rqst allocation failed, error=%d\n", + error); + goto out_rqst; + } + + svc_sock_update_bufs(serv); + serv->sv_maxconn = nlm_max_connections; + + nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name); + if (IS_ERR(nlmsvc_task)) { + error = PTR_ERR(nlmsvc_task); + printk(KERN_WARNING + "lockd_up: kthread_run failed, error=%d\n", error); + goto out_task; + } + nlmsvc_rqst->rq_task = nlmsvc_task; + wake_up_process(nlmsvc_task); + + dprintk("lockd_up: service started\n"); + return 0; + +out_task: + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; +out_rqst: + nlmsvc_rqst = NULL; + return error; +} + +static struct svc_serv *lockd_create_svc(void) +{ + struct svc_serv *serv; + + /* + * Check whether we're already up and running. + */ + if (nlmsvc_rqst) { + /* + * Note: increase service usage, because later in case of error + * svc_destroy() will be called. + */ + svc_get(nlmsvc_rqst->rq_server); + return nlmsvc_rqst->rq_server; + } + + /* + * Sanity check: if there's no pid, + * we should be the first user ... + */ + if (nlmsvc_users) + printk(KERN_WARNING + "lockd_up: no pid, %d users??\n", nlmsvc_users); + + if (!nlm_timeout) + nlm_timeout = LOCKD_DFLT_TIMEO; + nlmsvc_timeout = nlm_timeout * HZ; + + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup); + if (!serv) { + printk(KERN_WARNING "lockd_up: create service failed\n"); + return ERR_PTR(-ENOMEM); + } + dprintk("lockd_up: service created\n"); + return serv; +} + +/* + * Bring up the lockd process if it's not already up. + */ +int lockd_up(struct net *net) +{ + struct svc_serv *serv; + int error; + + mutex_lock(&nlmsvc_mutex); + + serv = lockd_create_svc(); + if (IS_ERR(serv)) { + error = PTR_ERR(serv); + goto err_create; + } + + error = lockd_up_net(serv, net); + if (error < 0) + goto err_net; + + error = lockd_start_svc(serv); + if (error < 0) + goto err_start; + + nlmsvc_users++; + /* + * Note: svc_serv structures have an initial use count of 1, + * so we exit through here on both success and failure. + */ +err_net: + svc_destroy(serv); +err_create: + mutex_unlock(&nlmsvc_mutex); + return error; + +err_start: + lockd_down_net(serv, net); + goto err_net; +} +EXPORT_SYMBOL_GPL(lockd_up); + +/* + * Decrement the user count and bring down lockd if we're the last. + */ +void +lockd_down(struct net *net) +{ + mutex_lock(&nlmsvc_mutex); + lockd_down_net(nlmsvc_rqst->rq_server, net); + if (nlmsvc_users) { + if (--nlmsvc_users) + goto out; + } else { + printk(KERN_ERR "lockd_down: no users! task=%p\n", + nlmsvc_task); + BUG(); + } + + if (!nlmsvc_task) { + printk(KERN_ERR "lockd_down: no lockd running.\n"); + BUG(); + } + kthread_stop(nlmsvc_task); + dprintk("lockd_down: service stopped\n"); + svc_exit_thread(nlmsvc_rqst); + dprintk("lockd_down: service destroyed\n"); + nlmsvc_task = NULL; + nlmsvc_rqst = NULL; +out: + mutex_unlock(&nlmsvc_mutex); +} +EXPORT_SYMBOL_GPL(lockd_down); + +#ifdef CONFIG_SYSCTL + +/* + * Sysctl parameters (same as module parameters, different interface). + */ + +static struct ctl_table nlm_sysctls[] = { + { + .procname = "nlm_grace_period", + .data = &nlm_grace_period, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (unsigned long *) &nlm_grace_period_min, + .extra2 = (unsigned long *) &nlm_grace_period_max, + }, + { + .procname = "nlm_timeout", + .data = &nlm_timeout, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (unsigned long *) &nlm_timeout_min, + .extra2 = (unsigned long *) &nlm_timeout_max, + }, + { + .procname = "nlm_udpport", + .data = &nlm_udpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *) &nlm_port_min, + .extra2 = (int *) &nlm_port_max, + }, + { + .procname = "nlm_tcpport", + .data = &nlm_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *) &nlm_port_min, + .extra2 = (int *) &nlm_port_max, + }, + { + .procname = "nsm_use_hostnames", + .data = &nsm_use_hostnames, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nsm_local_state", + .data = &nsm_local_state, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table nlm_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nlm_sysctls, + }, + { } +}; + +static struct ctl_table nlm_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nlm_sysctl_dir, + }, + { } +}; + +#endif /* CONFIG_SYSCTL */ + +/* + * Module (and sysfs) parameters. + */ + +#define param_set_min_max(name, type, which_strtol, min, max) \ +static int param_set_##name(const char *val, struct kernel_param *kp) \ +{ \ + char *endp; \ + __typeof__(type) num = which_strtol(val, &endp, 0); \ + if (endp == val || *endp || num < (min) || num > (max)) \ + return -EINVAL; \ + *((type *) kp->arg) = num; \ + return 0; \ +} + +static inline int is_callback(u32 proc) +{ + return proc == NLMPROC_GRANTED + || proc == NLMPROC_GRANTED_MSG + || proc == NLMPROC_TEST_RES + || proc == NLMPROC_LOCK_RES + || proc == NLMPROC_CANCEL_RES + || proc == NLMPROC_UNLOCK_RES + || proc == NLMPROC_NSM_NOTIFY; +} + + +static int lockd_authenticate(struct svc_rqst *rqstp) +{ + rqstp->rq_client = NULL; + switch (rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + if (rqstp->rq_proc == 0) + return SVC_OK; + if (is_callback(rqstp->rq_proc)) { + /* Leave it to individual procedures to + * call nlmsvc_lookup_host(rqstp) + */ + return SVC_OK; + } + return svc_set_client(rqstp); + } + return SVC_DENIED; +} + + +param_set_min_max(port, int, simple_strtol, 0, 65535) +param_set_min_max(grace_period, unsigned long, simple_strtoul, + nlm_grace_period_min, nlm_grace_period_max) +param_set_min_max(timeout, unsigned long, simple_strtoul, + nlm_timeout_min, nlm_timeout_max) + +MODULE_AUTHOR("Olaf Kirch "); +MODULE_DESCRIPTION("NFS file locking service version " LOCKD_VERSION "."); +MODULE_LICENSE("GPL"); + +module_param_call(nlm_grace_period, param_set_grace_period, param_get_ulong, + &nlm_grace_period, 0644); +module_param_call(nlm_timeout, param_set_timeout, param_get_ulong, + &nlm_timeout, 0644); +module_param_call(nlm_udpport, param_set_port, param_get_int, + &nlm_udpport, 0644); +module_param_call(nlm_tcpport, param_set_port, param_get_int, + &nlm_tcpport, 0644); +module_param(nsm_use_hostnames, bool, 0644); +module_param(nlm_max_connections, uint, 0644); + +static int lockd_init_net(struct net *net) +{ + struct lockd_net *ln = net_generic(net, lockd_net_id); + + INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); + INIT_LIST_HEAD(&ln->lockd_manager.list); + spin_lock_init(&ln->nsm_clnt_lock); + return 0; +} + +static void lockd_exit_net(struct net *net) +{ +} + +static struct pernet_operations lockd_net_ops = { + .init = lockd_init_net, + .exit = lockd_exit_net, + .id = &lockd_net_id, + .size = sizeof(struct lockd_net), +}; + + +/* + * Initialising and terminating the module. + */ + +static int __init init_nlm(void) +{ + int err; + +#ifdef CONFIG_SYSCTL + err = -ENOMEM; + nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); + if (nlm_sysctl_table == NULL) + goto err_sysctl; +#endif + err = register_pernet_subsys(&lockd_net_ops); + if (err) + goto err_pernet; + + err = lockd_create_procfs(); + if (err) + goto err_procfs; + + return 0; + +err_procfs: + unregister_pernet_subsys(&lockd_net_ops); +err_pernet: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nlm_sysctl_table); +err_sysctl: +#endif + return err; +} + +static void __exit exit_nlm(void) +{ + /* FIXME: delete all NLM clients */ + nlm_shutdown_hosts(); + lockd_remove_procfs(); + unregister_pernet_subsys(&lockd_net_ops); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nlm_sysctl_table); +#endif +} + +module_init(init_nlm); +module_exit(exit_nlm); + +/* + * Define NLM program and procedures + */ +static struct svc_version nlmsvc_version1 = { + .vs_vers = 1, + .vs_nproc = 17, + .vs_proc = nlmsvc_procedures, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +static struct svc_version nlmsvc_version3 = { + .vs_vers = 3, + .vs_nproc = 24, + .vs_proc = nlmsvc_procedures, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +#ifdef CONFIG_LOCKD_V4 +static struct svc_version nlmsvc_version4 = { + .vs_vers = 4, + .vs_nproc = 24, + .vs_proc = nlmsvc_procedures4, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +#endif +static struct svc_version * nlmsvc_version[] = { + [1] = &nlmsvc_version1, + [3] = &nlmsvc_version3, +#ifdef CONFIG_LOCKD_V4 + [4] = &nlmsvc_version4, +#endif +}; + +static struct svc_stat nlmsvc_stats; + +#define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) +static struct svc_program nlmsvc_program = { + .pg_prog = NLM_PROGRAM, /* program number */ + .pg_nvers = NLM_NRVERS, /* number of entries in nlmsvc_version */ + .pg_vers = nlmsvc_version, /* version table */ + .pg_name = "lockd", /* service name */ + .pg_class = "nfsd", /* share authentication with nfsd */ + .pg_stats = &nlmsvc_stats, /* stats table */ + .pg_authenticate = &lockd_authenticate /* export authentication */ +}; diff --git a/kernel/fs/lockd/svc4proc.c b/kernel/fs/lockd/svc4proc.c new file mode 100644 index 000000000..b147d1ae7 --- /dev/null +++ b/kernel/fs/lockd/svc4proc.c @@ -0,0 +1,505 @@ +/* + * linux/fs/lockd/svc4proc.c + * + * Lockd server procedures. We don't implement the NLM_*_RES + * procedures because we don't use the async procedures. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +/* + * Obtain client and file from arguments + */ +static __be32 +nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_host **hostp, struct nlm_file **filp) +{ + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + struct nlm_lock *lock = &argp->lock; + __be32 error = 0; + + /* nfsd callbacks must have been installed for this procedure */ + if (!nlmsvc_ops) + return nlm_lck_denied_nolocks; + + /* Obtain host handle */ + if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) + || (argp->monitor && nsm_monitor(host) < 0)) + goto no_locks; + *hostp = host; + + /* Obtain file pointer. Not used by FREE_ALL call. */ + if (filp != NULL) { + if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + goto no_locks; + *filp = file; + + /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_file = file->f_file; + lock->fl.fl_owner = (fl_owner_t) host; + lock->fl.fl_lmops = &nlmsvc_lock_operations; + } + + return 0; + +no_locks: + nlmsvc_release_host(host); + if (error) + return error; + return nlm_lck_denied_nolocks; +} + +/* + * NULL: Test for presence of service + */ +static __be32 +nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + dprintk("lockd: NULL called\n"); + return rpc_success; +} + +/* + * TEST: Check for conflicting lock + */ +static __be32 +nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: TEST4 called\n"); + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now check for conflicting locks */ + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: LOCK called\n"); + + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + +#if 0 + /* If supplied state doesn't match current state, we assume it's + * an old request that time-warped somehow. Any error return would + * do in this case because it's irrelevant anyway. + * + * NB: We don't retrieve the remote host's state yet. + */ + if (host->h_nsmstate && host->h_nsmstate != argp->state) { + resp->status = nlm_lck_denied_nolocks; + } else +#endif + + /* Now try to lock the file */ + resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->block, &argp->cookie, + argp->reclaim); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: CANCEL called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace(SVC_NET(rqstp))) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Try to cancel request. */ + resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock); + + dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNLOCK: release a lock + */ +static __be32 +nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNLOCK called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace(SVC_NET(rqstp))) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to remove the lock */ + resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock); + + dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * GRANTED: A server calls us to tell that a process' lock request + * was granted + */ +static __be32 +nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + resp->cookie = argp->cookie; + + dprintk("lockd: GRANTED called\n"); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); + return rpc_success; +} + +/* + * This is the generic lockd callback for async RPC calls + */ +static void nlm4svc_callback_exit(struct rpc_task *task, void *data) +{ + dprintk("lockd: %5u callback returned %d\n", task->tk_pid, + -task->tk_status); +} + +static void nlm4svc_callback_release(void *data) +{ + nlmsvc_release_call(data); +} + +static const struct rpc_call_ops nlm4svc_callback_ops = { + .rpc_call_done = nlm4svc_callback_exit, + .rpc_release = nlm4svc_callback_release, +}; + +/* + * `Async' versions of the above service routines. They aren't really, + * because we send the callback before the reply proper. I hope this + * doesn't break any clients. + */ +static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, + __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) +{ + struct nlm_host *host; + struct nlm_rqst *call; + __be32 stat; + + host = nlmsvc_lookup_host(rqstp, + argp->lock.caller, + argp->lock.len); + if (host == NULL) + return rpc_system_err; + + call = nlm_alloc_call(host); + nlmsvc_release_host(host); + if (call == NULL) + return rpc_system_err; + + stat = func(rqstp, argp, &call->a_res); + if (stat != 0) { + nlmsvc_release_call(call); + return stat; + } + + call->a_flags = RPC_TASK_ASYNC; + if (nlm_async_reply(call, proc, &nlm4svc_callback_ops) < 0) + return rpc_system_err; + return rpc_success; +} + +static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: TEST_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test); +} + +static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: LOCK_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock); +} + +static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: CANCEL_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel); +} + +static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: UNLOCK_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock); +} + +static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: GRANTED_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlm4svc_proc_granted); +} + +/* + * SHARE: create a DOS share or alter existing share. + */ +static __be32 +nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: SHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to create the share */ + resp->status = nlmsvc_share_file(host, file, argp); + + dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNSHARE: Release a DOS share. + */ +static __be32 +nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNSHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace(SVC_NET(rqstp))) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to lock the file */ + resp->status = nlmsvc_unshare_file(host, file, argp); + + dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * NM_LOCK: Create an unmonitored lock + */ +static __be32 +nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + dprintk("lockd: NM_LOCK called\n"); + + argp->monitor = 0; /* just clean the monitor flag */ + return nlm4svc_proc_lock(rqstp, argp, resp); +} + +/* + * FREE_ALL: Release all locks and shares held by client + */ +static __be32 +nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + struct nlm_host *host; + + /* Obtain client */ + if (nlm4svc_retrieve_args(rqstp, argp, &host, NULL)) + return rpc_success; + + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + return rpc_success; +} + +/* + * SM_NOTIFY: private callback from statd (not part of official NLM proto) + */ +static __be32 +nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, + void *resp) +{ + dprintk("lockd: SM_NOTIFY called\n"); + + if (!nlm_privileged_requester(rqstp)) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return rpc_system_err; + } + + nlm_host_rebooted(argp); + return rpc_success; +} + +/* + * client sent a GRANTED_RES, let's remove the associated block + */ +static __be32 +nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(&argp->cookie, argp->status); + return rpc_success; +} + + +/* + * NLM Server procedures. + */ + +#define nlm4svc_encode_norep nlm4svc_encode_void +#define nlm4svc_decode_norep nlm4svc_decode_void +#define nlm4svc_decode_testres nlm4svc_decode_void +#define nlm4svc_decode_lockres nlm4svc_decode_void +#define nlm4svc_decode_unlockres nlm4svc_decode_void +#define nlm4svc_decode_cancelres nlm4svc_decode_void +#define nlm4svc_decode_grantedres nlm4svc_decode_void + +#define nlm4svc_proc_none nlm4svc_proc_null +#define nlm4svc_proc_test_res nlm4svc_proc_null +#define nlm4svc_proc_lock_res nlm4svc_proc_null +#define nlm4svc_proc_cancel_res nlm4svc_proc_null +#define nlm4svc_proc_unlock_res nlm4svc_proc_null + +struct nlm_void { int dummy; }; + +#define PROC(name, xargt, xrest, argt, rest, respsize) \ + { .pc_func = (svc_procfunc) nlm4svc_proc_##name, \ + .pc_decode = (kxdrproc_t) nlm4svc_decode_##xargt, \ + .pc_encode = (kxdrproc_t) nlm4svc_encode_##xrest, \ + .pc_release = NULL, \ + .pc_argsize = sizeof(struct nlm_##argt), \ + .pc_ressize = sizeof(struct nlm_##rest), \ + .pc_xdrressize = respsize, \ + } +#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ +#define No (1+1024/4) /* netobj */ +#define St 1 /* status */ +#define Rg 4 /* range (offset + length) */ +struct svc_procedure nlmsvc_procedures4[] = { + PROC(null, void, void, void, void, 1), + PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg), + PROC(lock, lockargs, res, args, res, Ck+St), + PROC(cancel, cancargs, res, args, res, Ck+St), + PROC(unlock, unlockargs, res, args, res, Ck+St), + PROC(granted, testargs, res, args, res, Ck+St), + PROC(test_msg, testargs, norep, args, void, 1), + PROC(lock_msg, lockargs, norep, args, void, 1), + PROC(cancel_msg, cancargs, norep, args, void, 1), + PROC(unlock_msg, unlockargs, norep, args, void, 1), + PROC(granted_msg, testargs, norep, args, void, 1), + PROC(test_res, testres, norep, res, void, 1), + PROC(lock_res, lockres, norep, res, void, 1), + PROC(cancel_res, cancelres, norep, res, void, 1), + PROC(unlock_res, unlockres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), + /* statd callback */ + PROC(sm_notify, reboot, void, reboot, void, 1), + PROC(none, void, void, void, void, 0), + PROC(none, void, void, void, void, 0), + PROC(none, void, void, void, void, 0), + PROC(share, shareargs, shareres, args, res, Ck+St+1), + PROC(unshare, shareargs, shareres, args, res, Ck+St+1), + PROC(nm_lock, lockargs, res, args, res, Ck+St), + PROC(free_all, notify, void, args, void, 1), + +}; diff --git a/kernel/fs/lockd/svclock.c b/kernel/fs/lockd/svclock.c new file mode 100644 index 000000000..5581e0206 --- /dev/null +++ b/kernel/fs/lockd/svclock.c @@ -0,0 +1,939 @@ +/* + * linux/fs/lockd/svclock.c + * + * Handling of server-side locks, mostly of the blocked variety. + * This is the ugliest part of lockd because we tread on very thin ice. + * GRANT and CANCEL calls may get stuck, meet in mid-flight, etc. + * IMNSHO introducing the grant callback into the NLM protocol was one + * of the worst ideas Sun ever had. Except maybe for the idea of doing + * NFS file locking at all. + * + * I'm trying hard to avoid race conditions by protecting most accesses + * to a file's list of blocked locks through a semaphore. The global + * list of blocked locks is not protected in this fashion however. + * Therefore, some functions (such as the RPC callback for the async grant + * call) move blocked locks towards the head of the list *while some other + * process might be traversing it*. This should not be a problem in + * practice, because this will only cause functions traversing the list + * to visit some blocks twice. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_SVCLOCK + +#ifdef CONFIG_LOCKD_V4 +#define nlm_deadlock nlm4_deadlock +#else +#define nlm_deadlock nlm_lck_denied +#endif + +static void nlmsvc_release_block(struct nlm_block *block); +static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); +static void nlmsvc_remove_block(struct nlm_block *block); + +static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); +static void nlmsvc_freegrantargs(struct nlm_rqst *call); +static const struct rpc_call_ops nlmsvc_grant_ops; + +/* + * The list of blocked locks to retry + */ +static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + /* + * We can get away with a static buffer because this is only called + * from lockd, which is single-threaded. + */ + static char buf[2*NLM_MAXCOOKIELEN+1]; + unsigned int i, len = sizeof(buf); + char *p = buf; + + len--; /* allow for trailing \0 */ + if (len < 3) + return "???"; + for (i = 0 ; i < cookie->len ; i++) { + if (len < 2) { + strcpy(p-3, "..."); + break; + } + sprintf(p, "%02x", cookie->data[i]); + p += 2; + len -= 2; + } + *p = '\0'; + + return buf; +} +#endif + +/* + * Insert a blocked lock into the global list + */ +static void +nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when) +{ + struct nlm_block *b; + struct list_head *pos; + + dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when); + if (list_empty(&block->b_list)) { + kref_get(&block->b_count); + } else { + list_del_init(&block->b_list); + } + + pos = &nlm_blocked; + if (when != NLM_NEVER) { + if ((when += jiffies) == NLM_NEVER) + when ++; + list_for_each(pos, &nlm_blocked) { + b = list_entry(pos, struct nlm_block, b_list); + if (time_after(b->b_when,when) || b->b_when == NLM_NEVER) + break; + } + /* On normal exit from the loop, pos == &nlm_blocked, + * so we will be adding to the end of the list - good + */ + } + + list_add_tail(&block->b_list, pos); + block->b_when = when; +} + +static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when) +{ + spin_lock(&nlm_blocked_lock); + nlmsvc_insert_block_locked(block, when); + spin_unlock(&nlm_blocked_lock); +} + +/* + * Remove a block from the global list + */ +static inline void +nlmsvc_remove_block(struct nlm_block *block) +{ + if (!list_empty(&block->b_list)) { + spin_lock(&nlm_blocked_lock); + list_del_init(&block->b_list); + spin_unlock(&nlm_blocked_lock); + nlmsvc_release_block(block); + } +} + +/* + * Find a block for a given lock + */ +static struct nlm_block * +nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) +{ + struct nlm_block *block; + struct file_lock *fl; + + dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", + file, lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end, lock->fl.fl_type); + list_for_each_entry(block, &nlm_blocked, b_list) { + fl = &block->b_call->a_args.lock.fl; + dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", + block->b_file, fl->fl_pid, + (long long)fl->fl_start, + (long long)fl->fl_end, fl->fl_type, + nlmdbg_cookie2a(&block->b_call->a_args.cookie)); + if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { + kref_get(&block->b_count); + return block; + } + } + + return NULL; +} + +static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b) +{ + if (a->len != b->len) + return 0; + if (memcmp(a->data, b->data, a->len)) + return 0; + return 1; +} + +/* + * Find a block with a given NLM cookie. + */ +static inline struct nlm_block * +nlmsvc_find_block(struct nlm_cookie *cookie) +{ + struct nlm_block *block; + + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)) + goto found; + } + + return NULL; + +found: + dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block); + kref_get(&block->b_count); + return block; +} + +/* + * Create a block and initialize it. + * + * Note: we explicitly set the cookie of the grant reply to that of + * the blocked lock request. The spec explicitly mentions that the client + * should _not_ rely on the callback containing the same cookie as the + * request, but (as I found out later) that's because some implementations + * do just this. Never mind the standards comittees, they support our + * logging industries. + * + * 10 years later: I hope we can safely ignore these old and broken + * clients by now. Let's fix this so we can uniquely identify an incoming + * GRANTED_RES message by cookie, without having to rely on the client's IP + * address. --okir + */ +static struct nlm_block * +nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, + struct nlm_file *file, struct nlm_lock *lock, + struct nlm_cookie *cookie) +{ + struct nlm_block *block; + struct nlm_rqst *call = NULL; + + call = nlm_alloc_call(host); + if (call == NULL) + return NULL; + + /* Allocate memory for block, and initialize arguments */ + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (block == NULL) + goto failed; + kref_init(&block->b_count); + INIT_LIST_HEAD(&block->b_list); + INIT_LIST_HEAD(&block->b_flist); + + if (!nlmsvc_setgrantargs(call, lock)) + goto failed_free; + + /* Set notifier function for VFS, and init args */ + call->a_args.lock.fl.fl_flags |= FL_SLEEP; + call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; + nlmclnt_next_cookie(&call->a_args.cookie); + + dprintk("lockd: created block %p...\n", block); + + /* Create and initialize the block */ + block->b_daemon = rqstp->rq_server; + block->b_host = host; + block->b_file = file; + file->f_count++; + + /* Add to file's list of blocks */ + list_add(&block->b_flist, &file->f_blocks); + + /* Set up RPC arguments for callback */ + block->b_call = call; + call->a_flags = RPC_TASK_ASYNC; + call->a_block = block; + + return block; + +failed_free: + kfree(block); +failed: + nlmsvc_release_call(call); + return NULL; +} + +/* + * Delete a block. + * It is the caller's responsibility to check whether the file + * can be closed hereafter. + */ +static int nlmsvc_unlink_block(struct nlm_block *block) +{ + int status; + dprintk("lockd: unlinking block %p...\n", block); + + /* Remove block from list */ + status = posix_unblock_lock(&block->b_call->a_args.lock.fl); + nlmsvc_remove_block(block); + return status; +} + +static void nlmsvc_free_block(struct kref *kref) +{ + struct nlm_block *block = container_of(kref, struct nlm_block, b_count); + struct nlm_file *file = block->b_file; + + dprintk("lockd: freeing block %p...\n", block); + + /* Remove block from file's list of blocks */ + list_del_init(&block->b_flist); + mutex_unlock(&file->f_mutex); + + nlmsvc_freegrantargs(block->b_call); + nlmsvc_release_call(block->b_call); + nlm_release_file(block->b_file); + kfree(block); +} + +static void nlmsvc_release_block(struct nlm_block *block) +{ + if (block != NULL) + kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex); +} + +/* + * Loop over all blocks and delete blocks held by + * a matching host. + */ +void nlmsvc_traverse_blocks(struct nlm_host *host, + struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct nlm_block *block, *next; + +restart: + mutex_lock(&file->f_mutex); + list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) { + if (!match(block->b_host, host)) + continue; + /* Do not destroy blocks that are not on + * the global retry list - why? */ + if (list_empty(&block->b_list)) + continue; + kref_get(&block->b_count); + mutex_unlock(&file->f_mutex); + nlmsvc_unlink_block(block); + nlmsvc_release_block(block); + goto restart; + } + mutex_unlock(&file->f_mutex); +} + +/* + * Initialize arguments for GRANTED call. The nlm_rqst structure + * has been cleared already. + */ +static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) +{ + locks_copy_lock(&call->a_args.lock.fl, &lock->fl); + memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); + call->a_args.lock.caller = utsname()->nodename; + call->a_args.lock.oh.len = lock->oh.len; + + /* set default data area */ + call->a_args.lock.oh.data = call->a_owner; + call->a_args.lock.svid = lock->fl.fl_pid; + + if (lock->oh.len > NLMCLNT_OHSIZE) { + void *data = kmalloc(lock->oh.len, GFP_KERNEL); + if (!data) + return 0; + call->a_args.lock.oh.data = (u8 *) data; + } + + memcpy(call->a_args.lock.oh.data, lock->oh.data, lock->oh.len); + return 1; +} + +static void nlmsvc_freegrantargs(struct nlm_rqst *call) +{ + if (call->a_args.lock.oh.data != call->a_owner) + kfree(call->a_args.lock.oh.data); + + locks_release_private(&call->a_args.lock.fl); +} + +/* + * Deferred lock request handling for non-blocking lock + */ +static __be32 +nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) +{ + __be32 status = nlm_lck_denied_nolocks; + + block->b_flags |= B_QUEUED; + + nlmsvc_insert_block(block, NLM_TIMEOUT); + + block->b_cache_req = &rqstp->rq_chandle; + if (rqstp->rq_chandle.defer) { + block->b_deferred_req = + rqstp->rq_chandle.defer(block->b_cache_req); + if (block->b_deferred_req != NULL) + status = nlm_drop_reply; + } + dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n", + block, block->b_flags, ntohl(status)); + + return status; +} + +/* + * Attempt to establish a lock, and if it can't be granted, block it + * if required. + */ +__be32 +nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, + struct nlm_host *host, struct nlm_lock *lock, int wait, + struct nlm_cookie *cookie, int reclaim) +{ + struct nlm_block *block = NULL; + int error; + __be32 ret; + + dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", + file_inode(file->f_file)->i_sb->s_id, + file_inode(file->f_file)->i_ino, + lock->fl.fl_type, lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end, + wait); + + /* Lock file against concurrent access */ + mutex_lock(&file->f_mutex); + /* Get existing block (in case client is busy-waiting) + * or create new block + */ + block = nlmsvc_lookup_block(file, lock); + if (block == NULL) { + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); + ret = nlm_lck_denied_nolocks; + if (block == NULL) + goto out; + lock = &block->b_call->a_args.lock; + } else + lock->fl.fl_flags &= ~FL_SLEEP; + + if (block->b_flags & B_QUEUED) { + dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n", + block, block->b_flags); + if (block->b_granted) { + nlmsvc_unlink_block(block); + ret = nlm_granted; + goto out; + } + if (block->b_flags & B_TIMED_OUT) { + nlmsvc_unlink_block(block); + ret = nlm_lck_denied; + goto out; + } + ret = nlm_drop_reply; + goto out; + } + + if (locks_in_grace(SVC_NET(rqstp)) && !reclaim) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (reclaim && !locks_in_grace(SVC_NET(rqstp))) { + ret = nlm_lck_denied_grace_period; + goto out; + } + + if (!wait) + lock->fl.fl_flags &= ~FL_SLEEP; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + lock->fl.fl_flags &= ~FL_SLEEP; + + dprintk("lockd: vfs_lock_file returned %d\n", error); + switch (error) { + case 0: + ret = nlm_granted; + goto out; + case -EAGAIN: + /* + * If this is a blocking request for an + * already pending lock request then we need + * to put it back on lockd's block list + */ + if (wait) + break; + ret = nlm_lck_denied; + goto out; + case FILE_LOCK_DEFERRED: + if (wait) + break; + /* Filesystem lock operation is in progress + Add it to the queue waiting for callback */ + ret = nlmsvc_defer_lock_rqst(rqstp, block); + goto out; + case -EDEADLK: + ret = nlm_deadlock; + goto out; + default: /* includes ENOLCK */ + ret = nlm_lck_denied_nolocks; + goto out; + } + + ret = nlm_lck_blocked; + + /* Append to list of blocked */ + nlmsvc_insert_block(block, NLM_NEVER); +out: + mutex_unlock(&file->f_mutex); + nlmsvc_release_block(block); + dprintk("lockd: nlmsvc_lock returned %u\n", ret); + return ret; +} + +/* + * Test for presence of a conflicting lock. + */ +__be32 +nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, + struct nlm_host *host, struct nlm_lock *lock, + struct nlm_lock *conflock, struct nlm_cookie *cookie) +{ + int error; + __be32 ret; + + dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", + file_inode(file->f_file)->i_sb->s_id, + file_inode(file->f_file)->i_ino, + lock->fl.fl_type, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + if (locks_in_grace(SVC_NET(rqstp))) { + ret = nlm_lck_denied_grace_period; + goto out; + } + + error = vfs_test_lock(file->f_file, &lock->fl); + if (error) { + /* We can't currently deal with deferred test requests */ + if (error == FILE_LOCK_DEFERRED) + WARN_ON_ONCE(1); + + ret = nlm_lck_denied_nolocks; + goto out; + } + + if (lock->fl.fl_type == F_UNLCK) { + ret = nlm_granted; + goto out; + } + + dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", + lock->fl.fl_type, (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + conflock->caller = "somehost"; /* FIXME */ + conflock->len = strlen(conflock->caller); + conflock->oh.len = 0; /* don't return OH info */ + conflock->svid = lock->fl.fl_pid; + conflock->fl.fl_type = lock->fl.fl_type; + conflock->fl.fl_start = lock->fl.fl_start; + conflock->fl.fl_end = lock->fl.fl_end; + locks_release_private(&lock->fl); + ret = nlm_lck_denied; +out: + return ret; +} + +/* + * Remove a lock. + * This implies a CANCEL call: We send a GRANT_MSG, the client replies + * with a GRANT_RES call which gets lost, and calls UNLOCK immediately + * afterwards. In this case the block will still be there, and hence + * must be removed. + */ +__be32 +nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) +{ + int error; + + dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", + file_inode(file->f_file)->i_sb->s_id, + file_inode(file->f_file)->i_ino, + lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + /* First, cancel any lock that might be there */ + nlmsvc_cancel_blocked(net, file, lock); + + lock->fl.fl_type = F_UNLCK; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + + return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; +} + +/* + * Cancel a previously blocked request. + * + * A cancel request always overrides any grant that may currently + * be in progress. + * The calling procedure must check whether the file can be closed. + */ +__be32 +nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *lock) +{ + struct nlm_block *block; + int status = 0; + + dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", + file_inode(file->f_file)->i_sb->s_id, + file_inode(file->f_file)->i_ino, + lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + if (locks_in_grace(net)) + return nlm_lck_denied_grace_period; + + mutex_lock(&file->f_mutex); + block = nlmsvc_lookup_block(file, lock); + mutex_unlock(&file->f_mutex); + if (block != NULL) { + vfs_cancel_lock(block->b_file->f_file, + &block->b_call->a_args.lock.fl); + status = nlmsvc_unlink_block(block); + nlmsvc_release_block(block); + } + return status ? nlm_lck_denied : nlm_granted; +} + +/* + * This is a callback from the filesystem for VFS file lock requests. + * It will be used if lm_grant is defined and the filesystem can not + * respond to the request immediately. + * For SETLK or SETLKW request it will get the local posix lock. + * In all cases it will move the block to the head of nlm_blocked q where + * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the + * deferred rpc for GETLK and SETLK. + */ +static void +nlmsvc_update_deferred_block(struct nlm_block *block, int result) +{ + block->b_flags |= B_GOT_CALLBACK; + if (result == 0) + block->b_granted = 1; + else + block->b_flags |= B_TIMED_OUT; +} + +static int nlmsvc_grant_deferred(struct file_lock *fl, int result) +{ + struct nlm_block *block; + int rc = -ENOENT; + + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { + dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", + block, block->b_flags); + if (block->b_flags & B_QUEUED) { + if (block->b_flags & B_TIMED_OUT) { + rc = -ENOLCK; + break; + } + nlmsvc_update_deferred_block(block, result); + } else if (result == 0) + block->b_granted = 1; + + nlmsvc_insert_block_locked(block, 0); + svc_wake_up(block->b_daemon); + rc = 0; + break; + } + } + spin_unlock(&nlm_blocked_lock); + if (rc == -ENOENT) + printk(KERN_WARNING "lockd: grant for unknown block\n"); + return rc; +} + +/* + * Unblock a blocked lock request. This is a callback invoked from the + * VFS layer when a lock on which we blocked is removed. + * + * This function doesn't grant the blocked lock instantly, but rather moves + * the block to the head of nlm_blocked where it can be picked up by lockd. + */ +static void +nlmsvc_notify_blocked(struct file_lock *fl) +{ + struct nlm_block *block; + + dprintk("lockd: VFS unblock notification for block %p\n", fl); + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { + nlmsvc_insert_block_locked(block, 0); + spin_unlock(&nlm_blocked_lock); + svc_wake_up(block->b_daemon); + return; + } + } + spin_unlock(&nlm_blocked_lock); + printk(KERN_WARNING "lockd: notification for unknown block!\n"); +} + +static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) +{ + return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; +} + +/* + * Since NLM uses two "keys" for tracking locks, we need to hash them down + * to one for the blocked_hash. Here, we're just xor'ing the host address + * with the pid in order to create a key value for picking a hash bucket. + */ +static unsigned long +nlmsvc_owner_key(struct file_lock *fl) +{ + return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid; +} + +const struct lock_manager_operations nlmsvc_lock_operations = { + .lm_compare_owner = nlmsvc_same_owner, + .lm_owner_key = nlmsvc_owner_key, + .lm_notify = nlmsvc_notify_blocked, + .lm_grant = nlmsvc_grant_deferred, +}; + +/* + * Try to claim a lock that was previously blocked. + * + * Note that we use both the RPC_GRANTED_MSG call _and_ an async + * RPC thread when notifying the client. This seems like overkill... + * Here's why: + * - we don't want to use a synchronous RPC thread, otherwise + * we might find ourselves hanging on a dead portmapper. + * - Some lockd implementations (e.g. HP) don't react to + * RPC_GRANTED calls; they seem to insist on RPC_GRANTED_MSG calls. + */ +static void +nlmsvc_grant_blocked(struct nlm_block *block) +{ + struct nlm_file *file = block->b_file; + struct nlm_lock *lock = &block->b_call->a_args.lock; + int error; + loff_t fl_start, fl_end; + + dprintk("lockd: grant blocked lock %p\n", block); + + kref_get(&block->b_count); + + /* Unlink block request from list */ + nlmsvc_unlink_block(block); + + /* If b_granted is true this means we've been here before. + * Just retry the grant callback, possibly refreshing the RPC + * binding */ + if (block->b_granted) { + nlm_rebind_host(block->b_host); + goto callback; + } + + /* Try the lock operation again */ + /* vfs_lock_file() can mangle fl_start and fl_end, but we need + * them unchanged for the GRANT_MSG + */ + lock->fl.fl_flags |= FL_SLEEP; + fl_start = lock->fl.fl_start; + fl_end = lock->fl.fl_end; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.fl_start = fl_start; + lock->fl.fl_end = fl_end; + + switch (error) { + case 0: + break; + case FILE_LOCK_DEFERRED: + dprintk("lockd: lock still blocked error %d\n", error); + nlmsvc_insert_block(block, NLM_NEVER); + nlmsvc_release_block(block); + return; + default: + printk(KERN_WARNING "lockd: unexpected error %d in %s!\n", + -error, __func__); + nlmsvc_insert_block(block, 10 * HZ); + nlmsvc_release_block(block); + return; + } + +callback: + /* Lock was granted by VFS. */ + dprintk("lockd: GRANTing blocked lock.\n"); + block->b_granted = 1; + + /* keep block on the list, but don't reattempt until the RPC + * completes or the submission fails + */ + nlmsvc_insert_block(block, NLM_NEVER); + + /* Call the client -- use a soft RPC task since nlmsvc_retry_blocked + * will queue up a new one if this one times out + */ + error = nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG, + &nlmsvc_grant_ops); + + /* RPC submission failed, wait a bit and retry */ + if (error < 0) + nlmsvc_insert_block(block, 10 * HZ); +} + +/* + * This is the callback from the RPC layer when the NLM_GRANTED_MSG + * RPC call has succeeded or timed out. + * Like all RPC callbacks, it is invoked by the rpciod process, so it + * better not sleep. Therefore, we put the blocked lock on the nlm_blocked + * chain once more in order to have it removed by lockd itself (which can + * then sleep on the file semaphore without disrupting e.g. the nfs client). + */ +static void nlmsvc_grant_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *call = data; + struct nlm_block *block = call->a_block; + unsigned long timeout; + + dprintk("lockd: GRANT_MSG RPC callback\n"); + + spin_lock(&nlm_blocked_lock); + /* if the block is not on a list at this point then it has + * been invalidated. Don't try to requeue it. + * + * FIXME: it's possible that the block is removed from the list + * after this check but before the nlmsvc_insert_block. In that + * case it will be added back. Perhaps we need better locking + * for nlm_blocked? + */ + if (list_empty(&block->b_list)) + goto out; + + /* Technically, we should down the file semaphore here. Since we + * move the block towards the head of the queue only, no harm + * can be done, though. */ + if (task->tk_status < 0) { + /* RPC error: Re-insert for retransmission */ + timeout = 10 * HZ; + } else { + /* Call was successful, now wait for client callback */ + timeout = 60 * HZ; + } + nlmsvc_insert_block_locked(block, timeout); + svc_wake_up(block->b_daemon); +out: + spin_unlock(&nlm_blocked_lock); +} + +/* + * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an + * .rpc_release rpc_call_op + */ +static void nlmsvc_grant_release(void *data) +{ + struct nlm_rqst *call = data; + nlmsvc_release_block(call->a_block); +} + +static const struct rpc_call_ops nlmsvc_grant_ops = { + .rpc_call_done = nlmsvc_grant_callback, + .rpc_release = nlmsvc_grant_release, +}; + +/* + * We received a GRANT_RES callback. Try to find the corresponding + * block. + */ +void +nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) +{ + struct nlm_block *block; + + dprintk("grant_reply: looking for cookie %x, s=%d \n", + *(unsigned int *)(cookie->data), status); + if (!(block = nlmsvc_find_block(cookie))) + return; + + if (block) { + if (status == nlm_lck_denied_grace_period) { + /* Try again in a couple of seconds */ + nlmsvc_insert_block(block, 10 * HZ); + } else { + /* Lock is now held by client, or has been rejected. + * In both cases, the block should be removed. */ + nlmsvc_unlink_block(block); + } + } + nlmsvc_release_block(block); +} + +/* Helper function to handle retry of a deferred block. + * If it is a blocking lock, call grant_blocked. + * For a non-blocking lock or test lock, revisit the request. + */ +static void +retry_deferred_block(struct nlm_block *block) +{ + if (!(block->b_flags & B_GOT_CALLBACK)) + block->b_flags |= B_TIMED_OUT; + nlmsvc_insert_block(block, NLM_TIMEOUT); + dprintk("revisit block %p flags %d\n", block, block->b_flags); + if (block->b_deferred_req) { + block->b_deferred_req->revisit(block->b_deferred_req, 0); + block->b_deferred_req = NULL; + } +} + +/* + * Retry all blocked locks that have been notified. This is where lockd + * picks up locks that can be granted, or grant notifications that must + * be retransmitted. + */ +unsigned long +nlmsvc_retry_blocked(void) +{ + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct nlm_block *block; + + spin_lock(&nlm_blocked_lock); + while (!list_empty(&nlm_blocked) && !kthread_should_stop()) { + block = list_entry(nlm_blocked.next, struct nlm_block, b_list); + + if (block->b_when == NLM_NEVER) + break; + if (time_after(block->b_when, jiffies)) { + timeout = block->b_when - jiffies; + break; + } + spin_unlock(&nlm_blocked_lock); + + dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", + block, block->b_when); + if (block->b_flags & B_QUEUED) { + dprintk("nlmsvc_retry_blocked delete block (%p, granted=%d, flags=%d)\n", + block, block->b_granted, block->b_flags); + retry_deferred_block(block); + } else + nlmsvc_grant_blocked(block); + spin_lock(&nlm_blocked_lock); + } + spin_unlock(&nlm_blocked_lock); + + return timeout; +} diff --git a/kernel/fs/lockd/svcproc.c b/kernel/fs/lockd/svcproc.c new file mode 100644 index 000000000..21171f0c6 --- /dev/null +++ b/kernel/fs/lockd/svcproc.c @@ -0,0 +1,549 @@ +/* + * linux/fs/lockd/svcproc.c + * + * Lockd server procedures. We don't implement the NLM_*_RES + * procedures because we don't use the async procedures. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +#ifdef CONFIG_LOCKD_V4 +static __be32 +cast_to_nlm(__be32 status, u32 vers) +{ + /* Note: status is assumed to be in network byte order !!! */ + if (vers != 4){ + switch (status) { + case nlm_granted: + case nlm_lck_denied: + case nlm_lck_denied_nolocks: + case nlm_lck_blocked: + case nlm_lck_denied_grace_period: + case nlm_drop_reply: + break; + case nlm4_deadlock: + status = nlm_lck_denied; + break; + default: + status = nlm_lck_denied_nolocks; + } + } + + return (status); +} +#define cast_status(status) (cast_to_nlm(status, rqstp->rq_vers)) +#else +#define cast_status(status) (status) +#endif + +/* + * Obtain client and file from arguments + */ +static __be32 +nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_host **hostp, struct nlm_file **filp) +{ + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + struct nlm_lock *lock = &argp->lock; + __be32 error = 0; + + /* nfsd callbacks must have been installed for this procedure */ + if (!nlmsvc_ops) + return nlm_lck_denied_nolocks; + + /* Obtain host handle */ + if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) + || (argp->monitor && nsm_monitor(host) < 0)) + goto no_locks; + *hostp = host; + + /* Obtain file pointer. Not used by FREE_ALL call. */ + if (filp != NULL) { + error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh)); + if (error != 0) + goto no_locks; + *filp = file; + + /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_file = file->f_file; + lock->fl.fl_owner = (fl_owner_t) host; + lock->fl.fl_lmops = &nlmsvc_lock_operations; + } + + return 0; + +no_locks: + nlmsvc_release_host(host); + if (error) + return error; + return nlm_lck_denied_nolocks; +} + +/* + * NULL: Test for presence of service + */ +static __be32 +nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + dprintk("lockd: NULL called\n"); + return rpc_success; +} + +/* + * TEST: Check for conflicting lock + */ +static __be32 +nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: TEST called\n"); + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now check for conflicting locks */ + resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: TEST status %d vers %d\n", + ntohl(resp->status), rqstp->rq_vers); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: LOCK called\n"); + + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + +#if 0 + /* If supplied state doesn't match current state, we assume it's + * an old request that time-warped somehow. Any error return would + * do in this case because it's irrelevant anyway. + * + * NB: We don't retrieve the remote host's state yet. + */ + if (host->h_nsmstate && host->h_nsmstate != argp->state) { + resp->status = nlm_lck_denied_nolocks; + } else +#endif + + /* Now try to lock the file */ + resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->block, &argp->cookie, + argp->reclaim)); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + struct net *net = SVC_NET(rqstp); + + dprintk("lockd: CANCEL called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace(net)) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Try to cancel request. */ + resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock)); + + dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNLOCK: release a lock + */ +static __be32 +nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + struct net *net = SVC_NET(rqstp); + + dprintk("lockd: UNLOCK called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace(net)) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to remove the lock */ + resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock)); + + dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * GRANTED: A server calls us to tell that a process' lock request + * was granted + */ +static __be32 +nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + resp->cookie = argp->cookie; + + dprintk("lockd: GRANTED called\n"); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); + return rpc_success; +} + +/* + * This is the generic lockd callback for async RPC calls + */ +static void nlmsvc_callback_exit(struct rpc_task *task, void *data) +{ + dprintk("lockd: %5u callback returned %d\n", task->tk_pid, + -task->tk_status); +} + +void nlmsvc_release_call(struct nlm_rqst *call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlmsvc_release_host(call->a_host); + kfree(call); +} + +static void nlmsvc_callback_release(void *data) +{ + nlmsvc_release_call(data); +} + +static const struct rpc_call_ops nlmsvc_callback_ops = { + .rpc_call_done = nlmsvc_callback_exit, + .rpc_release = nlmsvc_callback_release, +}; + +/* + * `Async' versions of the above service routines. They aren't really, + * because we send the callback before the reply proper. I hope this + * doesn't break any clients. + */ +static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, + __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) +{ + struct nlm_host *host; + struct nlm_rqst *call; + __be32 stat; + + host = nlmsvc_lookup_host(rqstp, + argp->lock.caller, + argp->lock.len); + if (host == NULL) + return rpc_system_err; + + call = nlm_alloc_call(host); + nlmsvc_release_host(host); + if (call == NULL) + return rpc_system_err; + + stat = func(rqstp, argp, &call->a_res); + if (stat != 0) { + nlmsvc_release_call(call); + return stat; + } + + call->a_flags = RPC_TASK_ASYNC; + if (nlm_async_reply(call, proc, &nlmsvc_callback_ops) < 0) + return rpc_system_err; + return rpc_success; +} + +static __be32 nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: TEST_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test); +} + +static __be32 nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: LOCK_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock); +} + +static __be32 nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: CANCEL_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel); +} + +static __be32 +nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: UNLOCK_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock); +} + +static __be32 +nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: GRANTED_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlmsvc_proc_granted); +} + +/* + * SHARE: create a DOS share or alter existing share. + */ +static __be32 +nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: SHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to create the share */ + resp->status = cast_status(nlmsvc_share_file(host, file, argp)); + + dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNSHARE: Release a DOS share. + */ +static __be32 +nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNSHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace(SVC_NET(rqstp))) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to unshare the file */ + resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); + + dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * NM_LOCK: Create an unmonitored lock + */ +static __be32 +nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + dprintk("lockd: NM_LOCK called\n"); + + argp->monitor = 0; /* just clean the monitor flag */ + return nlmsvc_proc_lock(rqstp, argp, resp); +} + +/* + * FREE_ALL: Release all locks and shares held by client + */ +static __be32 +nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + struct nlm_host *host; + + /* Obtain client */ + if (nlmsvc_retrieve_args(rqstp, argp, &host, NULL)) + return rpc_success; + + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + return rpc_success; +} + +/* + * SM_NOTIFY: private callback from statd (not part of official NLM proto) + */ +static __be32 +nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, + void *resp) +{ + dprintk("lockd: SM_NOTIFY called\n"); + + if (!nlm_privileged_requester(rqstp)) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return rpc_system_err; + } + + nlm_host_rebooted(argp); + return rpc_success; +} + +/* + * client sent a GRANTED_RES, let's remove the associated block + */ +static __be32 +nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(&argp->cookie, argp->status); + return rpc_success; +} + +/* + * NLM Server procedures. + */ + +#define nlmsvc_encode_norep nlmsvc_encode_void +#define nlmsvc_decode_norep nlmsvc_decode_void +#define nlmsvc_decode_testres nlmsvc_decode_void +#define nlmsvc_decode_lockres nlmsvc_decode_void +#define nlmsvc_decode_unlockres nlmsvc_decode_void +#define nlmsvc_decode_cancelres nlmsvc_decode_void +#define nlmsvc_decode_grantedres nlmsvc_decode_void + +#define nlmsvc_proc_none nlmsvc_proc_null +#define nlmsvc_proc_test_res nlmsvc_proc_null +#define nlmsvc_proc_lock_res nlmsvc_proc_null +#define nlmsvc_proc_cancel_res nlmsvc_proc_null +#define nlmsvc_proc_unlock_res nlmsvc_proc_null + +struct nlm_void { int dummy; }; + +#define PROC(name, xargt, xrest, argt, rest, respsize) \ + { .pc_func = (svc_procfunc) nlmsvc_proc_##name, \ + .pc_decode = (kxdrproc_t) nlmsvc_decode_##xargt, \ + .pc_encode = (kxdrproc_t) nlmsvc_encode_##xrest, \ + .pc_release = NULL, \ + .pc_argsize = sizeof(struct nlm_##argt), \ + .pc_ressize = sizeof(struct nlm_##rest), \ + .pc_xdrressize = respsize, \ + } + +#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ +#define St 1 /* status */ +#define No (1+1024/4) /* Net Obj */ +#define Rg 2 /* range - offset + size */ + +struct svc_procedure nlmsvc_procedures[] = { + PROC(null, void, void, void, void, 1), + PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg), + PROC(lock, lockargs, res, args, res, Ck+St), + PROC(cancel, cancargs, res, args, res, Ck+St), + PROC(unlock, unlockargs, res, args, res, Ck+St), + PROC(granted, testargs, res, args, res, Ck+St), + PROC(test_msg, testargs, norep, args, void, 1), + PROC(lock_msg, lockargs, norep, args, void, 1), + PROC(cancel_msg, cancargs, norep, args, void, 1), + PROC(unlock_msg, unlockargs, norep, args, void, 1), + PROC(granted_msg, testargs, norep, args, void, 1), + PROC(test_res, testres, norep, res, void, 1), + PROC(lock_res, lockres, norep, res, void, 1), + PROC(cancel_res, cancelres, norep, res, void, 1), + PROC(unlock_res, unlockres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), + /* statd callback */ + PROC(sm_notify, reboot, void, reboot, void, 1), + PROC(none, void, void, void, void, 1), + PROC(none, void, void, void, void, 1), + PROC(none, void, void, void, void, 1), + PROC(share, shareargs, shareres, args, res, Ck+St+1), + PROC(unshare, shareargs, shareres, args, res, Ck+St+1), + PROC(nm_lock, lockargs, res, args, res, Ck+St), + PROC(free_all, notify, void, args, void, 0), + +}; diff --git a/kernel/fs/lockd/svcshare.c b/kernel/fs/lockd/svcshare.c new file mode 100644 index 000000000..b0ae07008 --- /dev/null +++ b/kernel/fs/lockd/svcshare.c @@ -0,0 +1,106 @@ +/* + * linux/fs/lockd/svcshare.c + * + * Management of DOS shares. + * + * Copyright (C) 1996 Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +static inline int +nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh) +{ + return share->s_owner.len == oh->len + && !memcmp(share->s_owner.data, oh->data, oh->len); +} + +__be32 +nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, + struct nlm_args *argp) +{ + struct nlm_share *share; + struct xdr_netobj *oh = &argp->lock.oh; + u8 *ohdata; + + for (share = file->f_shares; share; share = share->s_next) { + if (share->s_host == host && nlm_cmp_owner(share, oh)) + goto update; + if ((argp->fsm_access & share->s_mode) + || (argp->fsm_mode & share->s_access )) + return nlm_lck_denied; + } + + share = kmalloc(sizeof(*share) + oh->len, + GFP_KERNEL); + if (share == NULL) + return nlm_lck_denied_nolocks; + + /* Copy owner handle */ + ohdata = (u8 *) (share + 1); + memcpy(ohdata, oh->data, oh->len); + + share->s_file = file; + share->s_host = host; + share->s_owner.data = ohdata; + share->s_owner.len = oh->len; + share->s_next = file->f_shares; + file->f_shares = share; + +update: + share->s_access = argp->fsm_access; + share->s_mode = argp->fsm_mode; + return nlm_granted; +} + +/* + * Delete a share. + */ +__be32 +nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, + struct nlm_args *argp) +{ + struct nlm_share *share, **shpp; + struct xdr_netobj *oh = &argp->lock.oh; + + for (shpp = &file->f_shares; (share = *shpp) != NULL; + shpp = &share->s_next) { + if (share->s_host == host && nlm_cmp_owner(share, oh)) { + *shpp = share->s_next; + kfree(share); + return nlm_granted; + } + } + + /* X/Open spec says return success even if there was no + * corresponding share. */ + return nlm_granted; +} + +/* + * Traverse all shares for a given file, and delete + * those owned by the given (type of) host + */ +void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct nlm_share *share, **shpp; + + shpp = &file->f_shares; + while ((share = *shpp) != NULL) { + if (match(share->s_host, host)) { + *shpp = share->s_next; + kfree(share); + continue; + } + shpp = &share->s_next; + } +} diff --git a/kernel/fs/lockd/svcsubs.c b/kernel/fs/lockd/svcsubs.c new file mode 100644 index 000000000..a563ddbc1 --- /dev/null +++ b/kernel/fs/lockd/svcsubs.c @@ -0,0 +1,457 @@ +/* + * linux/fs/lockd/svcsubs.c + * + * Various support routines for the NLM server. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_SVCSUBS + + +/* + * Global file hash table + */ +#define FILE_HASH_BITS 7 +#define FILE_NRHASH (1<data; + + /* print the first 32 bytes of the fh */ + dprintk("lockd: %s (%08x %08x %08x %08x %08x %08x %08x %08x)\n", + msg, fhp[0], fhp[1], fhp[2], fhp[3], + fhp[4], fhp[5], fhp[6], fhp[7]); +} + +static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) +{ + struct inode *inode = file_inode(file->f_file); + + dprintk("lockd: %s %s/%ld\n", + msg, inode->i_sb->s_id, inode->i_ino); +} +#else +static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) +{ + return; +} + +static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) +{ + return; +} +#endif + +static inline unsigned int file_hash(struct nfs_fh *f) +{ + unsigned int tmp=0; + int i; + for (i=0; idata[i]; + return tmp & (FILE_NRHASH - 1); +} + +/* + * Lookup file info. If it doesn't exist, create a file info struct + * and open a (VFS) file for the given inode. + * + * FIXME: + * Note that we open the file O_RDONLY even when creating write locks. + * This is not quite right, but for now, we assume the client performs + * the proper R/W checking. + */ +__be32 +nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, + struct nfs_fh *f) +{ + struct nlm_file *file; + unsigned int hash; + __be32 nfserr; + + nlm_debug_print_fh("nlm_lookup_file", f); + + hash = file_hash(f); + + /* Lock file table */ + mutex_lock(&nlm_file_mutex); + + hlist_for_each_entry(file, &nlm_files[hash], f_list) + if (!nfs_compare_fh(&file->f_handle, f)) + goto found; + + nlm_debug_print_fh("creating file for", f); + + nfserr = nlm_lck_denied_nolocks; + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) + goto out_unlock; + + memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); + mutex_init(&file->f_mutex); + INIT_HLIST_NODE(&file->f_list); + INIT_LIST_HEAD(&file->f_blocks); + + /* Open the file. Note that this must not sleep for too long, else + * we would lock up lockd:-) So no NFS re-exports, folks. + * + * We have to make sure we have the right credential to open + * the file. + */ + if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { + dprintk("lockd: open failed (error %d)\n", nfserr); + goto out_free; + } + + hlist_add_head(&file->f_list, &nlm_files[hash]); + +found: + dprintk("lockd: found file %p (count %d)\n", file, file->f_count); + *result = file; + file->f_count++; + nfserr = 0; + +out_unlock: + mutex_unlock(&nlm_file_mutex); + return nfserr; + +out_free: + kfree(file); + goto out_unlock; +} + +/* + * Delete a file after having released all locks, blocks and shares + */ +static inline void +nlm_delete_file(struct nlm_file *file) +{ + nlm_debug_print_file("closing file", file); + if (!hlist_unhashed(&file->f_list)) { + hlist_del(&file->f_list); + nlmsvc_ops->fclose(file->f_file); + kfree(file); + } else { + printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); + } +} + +/* + * Loop over all locks on the given file and perform the specified + * action. + */ +static int +nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct inode *inode = nlmsvc_file_inode(file); + struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; + struct nlm_host *lockhost; + + if (!flctx || list_empty_careful(&flctx->flc_posix)) + return 0; +again: + file->f_locks = 0; + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_lmops != &nlmsvc_lock_operations) + continue; + + /* update current lock count */ + file->f_locks++; + + lockhost = (struct nlm_host *) fl->fl_owner; + if (match(lockhost, host)) { + struct file_lock lock = *fl; + + spin_unlock(&flctx->flc_lock); + lock.fl_type = F_UNLCK; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) { + printk("lockd: unlock failure in %s:%d\n", + __FILE__, __LINE__); + return 1; + } + goto again; + } + } + spin_unlock(&flctx->flc_lock); + + return 0; +} + +static int +nlmsvc_always_match(void *dummy1, struct nlm_host *dummy2) +{ + return 1; +} + +/* + * Inspect a single file + */ +static inline int +nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match) +{ + nlmsvc_traverse_blocks(host, file, match); + nlmsvc_traverse_shares(host, file, match); + return nlm_traverse_locks(host, file, match); +} + +/* + * Quick check whether there are still any locks, blocks or + * shares on a given file. + */ +static inline int +nlm_file_inuse(struct nlm_file *file) +{ + struct inode *inode = nlmsvc_file_inode(file); + struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; + + if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) + return 1; + + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_lmops == &nlmsvc_lock_operations) { + spin_unlock(&flctx->flc_lock); + return 1; + } + } + spin_unlock(&flctx->flc_lock); + } + file->f_locks = 0; + return 0; +} + +/* + * Loop over all files in the file table. + */ +static int +nlm_traverse_files(void *data, nlm_host_match_fn_t match, + int (*is_failover_file)(void *data, struct nlm_file *file)) +{ + struct hlist_node *next; + struct nlm_file *file; + int i, ret = 0; + + mutex_lock(&nlm_file_mutex); + for (i = 0; i < FILE_NRHASH; i++) { + hlist_for_each_entry_safe(file, next, &nlm_files[i], f_list) { + if (is_failover_file && !is_failover_file(data, file)) + continue; + file->f_count++; + mutex_unlock(&nlm_file_mutex); + + /* Traverse locks, blocks and shares of this file + * and update file->f_locks count */ + if (nlm_inspect_file(data, file, match)) + ret = 1; + + mutex_lock(&nlm_file_mutex); + file->f_count--; + /* No more references to this file. Let go of it. */ + if (list_empty(&file->f_blocks) && !file->f_locks + && !file->f_shares && !file->f_count) { + hlist_del(&file->f_list); + nlmsvc_ops->fclose(file->f_file); + kfree(file); + } + } + } + mutex_unlock(&nlm_file_mutex); + return ret; +} + +/* + * Release file. If there are no more remote locks on this file, + * close it and free the handle. + * + * Note that we can't do proper reference counting without major + * contortions because the code in fs/locks.c creates, deletes and + * splits locks without notification. Our only way is to walk the + * entire lock list each time we remove a lock. + */ +void +nlm_release_file(struct nlm_file *file) +{ + dprintk("lockd: nlm_release_file(%p, ct = %d)\n", + file, file->f_count); + + /* Lock file table */ + mutex_lock(&nlm_file_mutex); + + /* If there are no more locks etc, delete the file */ + if (--file->f_count == 0 && !nlm_file_inuse(file)) + nlm_delete_file(file); + + mutex_unlock(&nlm_file_mutex); +} + +/* + * Helpers function for resource traversal + * + * nlmsvc_mark_host: + * used by the garbage collector; simply sets h_inuse only for those + * hosts, which passed network check. + * Always returns 0. + * + * nlmsvc_same_host: + * returns 1 iff the two hosts match. Used to release + * all resources bound to a specific host. + * + * nlmsvc_is_client: + * returns 1 iff the host is a client. + * Used by nlmsvc_invalidate_all + */ + +static int +nlmsvc_mark_host(void *data, struct nlm_host *hint) +{ + struct nlm_host *host = data; + + if ((hint->net == NULL) || + (host->net == hint->net)) + host->h_inuse = 1; + return 0; +} + +static int +nlmsvc_same_host(void *data, struct nlm_host *other) +{ + struct nlm_host *host = data; + + return host == other; +} + +static int +nlmsvc_is_client(void *data, struct nlm_host *dummy) +{ + struct nlm_host *host = data; + + if (host->h_server) { + /* we are destroying locks even though the client + * hasn't asked us too, so don't unmonitor the + * client + */ + if (host->h_nsmhandle) + host->h_nsmhandle->sm_sticky = 1; + return 1; + } else + return 0; +} + +/* + * Mark all hosts that still hold resources + */ +void +nlmsvc_mark_resources(struct net *net) +{ + struct nlm_host hint; + + dprintk("lockd: nlmsvc_mark_resources for net %p\n", net); + hint.net = net; + nlm_traverse_files(&hint, nlmsvc_mark_host, NULL); +} + +/* + * Release all resources held by the given client + */ +void +nlmsvc_free_host_resources(struct nlm_host *host) +{ + dprintk("lockd: nlmsvc_free_host_resources\n"); + + if (nlm_traverse_files(host, nlmsvc_same_host, NULL)) { + printk(KERN_WARNING + "lockd: couldn't remove all locks held by %s\n", + host->h_name); + BUG(); + } +} + +/** + * nlmsvc_invalidate_all - remove all locks held for clients + * + * Release all locks held by NFS clients. + * + */ +void +nlmsvc_invalidate_all(void) +{ + /* + * Previously, the code would call + * nlmsvc_free_host_resources for each client in + * turn, which is about as inefficient as it gets. + * Now we just do it once in nlm_traverse_files. + */ + nlm_traverse_files(NULL, nlmsvc_is_client, NULL); +} + +static int +nlmsvc_match_sb(void *datap, struct nlm_file *file) +{ + struct super_block *sb = datap; + + return sb == file_inode(file->f_file)->i_sb; +} + +/** + * nlmsvc_unlock_all_by_sb - release locks held on this file system + * @sb: super block + * + * Release all locks held by clients accessing this file system. + */ +int +nlmsvc_unlock_all_by_sb(struct super_block *sb) +{ + int ret; + + ret = nlm_traverse_files(sb, nlmsvc_always_match, nlmsvc_match_sb); + return ret ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); + +static int +nlmsvc_match_ip(void *datap, struct nlm_host *host) +{ + return rpc_cmp_addr(nlm_srcaddr(host), datap); +} + +/** + * nlmsvc_unlock_all_by_ip - release local locks by IP address + * @server_addr: server's IP address as seen by clients + * + * Release all locks held by clients accessing this host + * via the passed in IP address. + */ +int +nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr) +{ + int ret; + + ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL); + return ret ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip); diff --git a/kernel/fs/lockd/xdr.c b/kernel/fs/lockd/xdr.c new file mode 100644 index 000000000..5b651daad --- /dev/null +++ b/kernel/fs/lockd/xdr.c @@ -0,0 +1,337 @@ +/* + * linux/fs/lockd/xdr.c + * + * XDR support for lockd and the lock client. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + + +static inline loff_t +s32_to_loff_t(__s32 offset) +{ + return (loff_t)offset; +} + +static inline __s32 +loff_t_to_s32(loff_t offset) +{ + __s32 res; + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + +/* + * XDR functions for basic NLM types + */ +static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c) +{ + unsigned int len; + + len = ntohl(*p++); + + if(len==0) + { + c->len=4; + memset(c->data, 0, 4); /* hockeypux brain damage */ + } + else if(len<=NLM_MAXCOOKIELEN) + { + c->len=len; + memcpy(c->data, p, len); + p+=XDR_QUADLEN(len); + } + else + { + dprintk("lockd: bad cookie size %d (only cookies under " + "%d bytes are supported.)\n", + len, NLM_MAXCOOKIELEN); + return NULL; + } + return p; +} + +static inline __be32 * +nlm_encode_cookie(__be32 *p, struct nlm_cookie *c) +{ + *p++ = htonl(c->len); + memcpy(p, c->data, c->len); + p+=XDR_QUADLEN(c->len); + return p; +} + +static __be32 * +nlm_decode_fh(__be32 *p, struct nfs_fh *f) +{ + unsigned int len; + + if ((len = ntohl(*p++)) != NFS2_FHSIZE) { + dprintk("lockd: bad fhandle size %d (should be %d)\n", + len, NFS2_FHSIZE); + return NULL; + } + f->size = NFS2_FHSIZE; + memset(f->data, 0, sizeof(f->data)); + memcpy(f->data, p, NFS2_FHSIZE); + return p + XDR_QUADLEN(NFS2_FHSIZE); +} + +/* + * Encode and decode owner handle + */ +static inline __be32 * +nlm_decode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_decode_netobj(p, oh); +} + +static inline __be32 * +nlm_encode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_encode_netobj(p, oh); +} + +static __be32 * +nlm_decode_lock(__be32 *p, struct nlm_lock *lock) +{ + struct file_lock *fl = &lock->fl; + s32 start, len, end; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, + NLM_MAXSTRLEN)) + || !(p = nlm_decode_fh(p, &lock->fh)) + || !(p = nlm_decode_oh(p, &lock->oh))) + return NULL; + lock->svid = ntohl(*p++); + + locks_init_lock(fl); + fl->fl_owner = current->files; + fl->fl_pid = (pid_t)lock->svid; + fl->fl_flags = FL_POSIX; + fl->fl_type = F_RDLCK; /* as good as anything else */ + start = ntohl(*p++); + len = ntohl(*p++); + end = start + len - 1; + + fl->fl_start = s32_to_loff_t(start); + + if (len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s32_to_loff_t(end); + return p; +} + +/* + * Encode result of a TEST/TEST_MSG call + */ +static __be32 * +nlm_encode_testres(__be32 *p, struct nlm_res *resp) +{ + s32 start, len; + + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return NULL; + *p++ = resp->status; + + if (resp->status == nlm_lck_denied) { + struct file_lock *fl = &resp->lock.fl; + + *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; + *p++ = htonl(resp->lock.svid); + + /* Encode owner handle. */ + if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) + return NULL; + + start = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + + *p++ = htonl(start); + *p++ = htonl(len); + } + + return p; +} + + +/* + * First, the server side XDR functions + */ +int +nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_testres(p, resp))) + return 0; + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + argp->reclaim = ntohl(*p++); + argp->state = ntohl(*p++); + argp->monitor = 1; /* monitor client by default */ + + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + if (!(p = nlm_decode_cookie(p, &argp->cookie)) + || !(p = nlm_decode_lock(p, &argp->lock))) + return 0; + argp->lock.fl.fl_type = F_UNLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32) 0; + lock->fl.fl_pid = (pid_t)lock->svid; + + if (!(p = nlm_decode_cookie(p, &argp->cookie)) + || !(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm_decode_fh(p, &lock->fh)) + || !(p = nlm_decode_oh(p, &lock->oh))) + return 0; + argp->fsm_mode = ntohl(*p++); + argp->fsm_access = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + *p++ = xdr_zero; /* sequence argument */ + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) +{ + if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_decode_cookie(p, &resp->cookie))) + return 0; + resp->status = *p++; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} diff --git a/kernel/fs/lockd/xdr4.c b/kernel/fs/lockd/xdr4.c new file mode 100644 index 000000000..dfa4789cd --- /dev/null +++ b/kernel/fs/lockd/xdr4.c @@ -0,0 +1,334 @@ +/* + * linux/fs/lockd/xdr4.c + * + * XDR support for lockd and the lock client. + * + * Copyright (C) 1995, 1996 Olaf Kirch + * Copyright (C) 1999, Trond Myklebust + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +static inline loff_t +s64_to_loff_t(__s64 offset) +{ + return (loff_t)offset; +} + + +static inline s64 +loff_t_to_s64(loff_t offset) +{ + s64 res; + if (offset > NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset < -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; +} + +/* + * XDR functions for basic NLM types + */ +static __be32 * +nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c) +{ + unsigned int len; + + len = ntohl(*p++); + + if(len==0) + { + c->len=4; + memset(c->data, 0, 4); /* hockeypux brain damage */ + } + else if(len<=NLM_MAXCOOKIELEN) + { + c->len=len; + memcpy(c->data, p, len); + p+=XDR_QUADLEN(len); + } + else + { + dprintk("lockd: bad cookie size %d (only cookies under " + "%d bytes are supported.)\n", + len, NLM_MAXCOOKIELEN); + return NULL; + } + return p; +} + +static __be32 * +nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c) +{ + *p++ = htonl(c->len); + memcpy(p, c->data, c->len); + p+=XDR_QUADLEN(c->len); + return p; +} + +static __be32 * +nlm4_decode_fh(__be32 *p, struct nfs_fh *f) +{ + memset(f->data, 0, sizeof(f->data)); + f->size = ntohl(*p++); + if (f->size > NFS_MAXFHSIZE) { + dprintk("lockd: bad fhandle size %d (should be <=%d)\n", + f->size, NFS_MAXFHSIZE); + return NULL; + } + memcpy(f->data, p, f->size); + return p + XDR_QUADLEN(f->size); +} + +/* + * Encode and decode owner handle + */ +static __be32 * +nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_decode_netobj(p, oh); +} + +static __be32 * +nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) +{ + struct file_lock *fl = &lock->fl; + __u64 len, start; + __s64 end; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm4_decode_fh(p, &lock->fh)) + || !(p = nlm4_decode_oh(p, &lock->oh))) + return NULL; + lock->svid = ntohl(*p++); + + locks_init_lock(fl); + fl->fl_owner = current->files; + fl->fl_pid = (pid_t)lock->svid; + fl->fl_flags = FL_POSIX; + fl->fl_type = F_RDLCK; /* as good as anything else */ + p = xdr_decode_hyper(p, &start); + p = xdr_decode_hyper(p, &len); + end = start + len - 1; + + fl->fl_start = s64_to_loff_t(start); + + if (len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s64_to_loff_t(end); + return p; +} + +/* + * Encode result of a TEST/TEST_MSG call + */ +static __be32 * +nlm4_encode_testres(__be32 *p, struct nlm_res *resp) +{ + s64 start, len; + + dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp); + if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + return NULL; + *p++ = resp->status; + + if (resp->status == nlm_lck_denied) { + struct file_lock *fl = &resp->lock.fl; + + *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; + *p++ = htonl(resp->lock.svid); + + /* Encode owner handle. */ + if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) + return NULL; + + start = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); + + p = xdr_encode_hyper(p, start); + p = xdr_encode_hyper(p, len); + dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n", + resp->status, (int)resp->lock.svid, fl->fl_type, + (long long)fl->fl_start, (long long)fl->fl_end); + } + + dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp); + return p; +} + + +/* + * First, the server side XDR functions + */ +int +nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_testres(p, resp))) + return 0; + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + argp->reclaim = ntohl(*p++); + argp->state = ntohl(*p++); + argp->monitor = 1; /* monitor client by default */ + + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + if (!(p = nlm4_decode_cookie(p, &argp->cookie)) + || !(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + argp->lock.fl.fl_type = F_UNLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32) 0; + lock->fl.fl_pid = (pid_t)lock->svid; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie)) + || !(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm4_decode_fh(p, &lock->fh)) + || !(p = nlm4_decode_oh(p, &lock->oh))) + return 0; + argp->fsm_mode = ntohl(*p++); + argp->fsm_access = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + *p++ = xdr_zero; /* sequence argument */ + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) +{ + if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_decode_cookie(p, &resp->cookie))) + return 0; + resp->status = *p++; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} diff --git a/kernel/fs/locks.c b/kernel/fs/locks.c new file mode 100644 index 000000000..653faabb0 --- /dev/null +++ b/kernel/fs/locks.c @@ -0,0 +1,2703 @@ +/* + * linux/fs/locks.c + * + * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls. + * Doug Evans (dje@spiff.uucp), August 07, 1992 + * + * Deadlock detection added. + * FIXME: one thing isn't handled yet: + * - mandatory locks (requires lots of changes elsewhere) + * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994. + * + * Miscellaneous edits, and a total rewrite of posix_lock_file() code. + * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994 + * + * Converted file_lock_table to a linked list from an array, which eliminates + * the limits on how many active file locks are open. + * Chad Page (pageone@netcom.com), November 27, 1994 + * + * Removed dependency on file descriptors. dup()'ed file descriptors now + * get the same locks as the original file descriptors, and a close() on + * any file descriptor removes ALL the locks on the file for the current + * process. Since locks still depend on the process id, locks are inherited + * after an exec() but not after a fork(). This agrees with POSIX, and both + * BSD and SVR4 practice. + * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995 + * + * Scrapped free list which is redundant now that we allocate locks + * dynamically with kmalloc()/kfree(). + * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995 + * + * Implemented two lock personalities - FL_FLOCK and FL_POSIX. + * + * FL_POSIX locks are created with calls to fcntl() and lockf() through the + * fcntl() system call. They have the semantics described above. + * + * FL_FLOCK locks are created with calls to flock(), through the flock() + * system call, which is new. Old C libraries implement flock() via fcntl() + * and will continue to use the old, broken implementation. + * + * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated + * with a file pointer (filp). As a result they can be shared by a parent + * process and its children after a fork(). They are removed when the last + * file descriptor referring to the file pointer is closed (unless explicitly + * unlocked). + * + * FL_FLOCK locks never deadlock, an existing lock is always removed before + * upgrading from shared to exclusive (or vice versa). When this happens + * any processes blocked by the current lock are woken up and allowed to + * run before the new lock is applied. + * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995 + * + * Removed some race conditions in flock_lock_file(), marked other possible + * races. Just grep for FIXME to see them. + * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996. + * + * Addressed Dmitry's concerns. Deadlock checking no longer recursive. + * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep + * once we've checked for blocking and deadlocking. + * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996. + * + * Initial implementation of mandatory locks. SunOS turned out to be + * a rotten model, so I implemented the "obvious" semantics. + * See 'Documentation/filesystems/mandatory-locking.txt' for details. + * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. + * + * Don't allow mandatory locks on mmap()'ed files. Added simple functions to + * check if a file has mandatory locks, used by mmap(), open() and creat() to + * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference + * Manual, Section 2. + * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996. + * + * Tidied up block list handling. Added '/proc/locks' interface. + * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996. + * + * Fixed deadlock condition for pathological code that mixes calls to + * flock() and fcntl(). + * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996. + * + * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use + * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to + * guarantee sensible behaviour in the case where file system modules might + * be compiled with different options than the kernel itself. + * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. + * + * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel + * (Thomas.Meckel@mni.fh-giessen.de) for spotting this. + * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. + * + * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK + * locks. Changed process synchronisation to avoid dereferencing locks that + * have already been freed. + * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996. + * + * Made the block list a circular list to minimise searching in the list. + * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996. + * + * Made mandatory locking a mount option. Default is not to allow mandatory + * locking. + * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996. + * + * Some adaptations for NFS support. + * Olaf Kirch (okir@monad.swb.de), Dec 1996, + * + * Fixed /proc/locks interface so that we can't overrun the buffer we are handed. + * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997. + * + * Use slab allocator instead of kmalloc/kfree. + * Use generic list implementation from . + * Sped up posix_locks_deadlock by only considering blocked locks. + * Matthew Wilcox , March, 2000. + * + * Leases and LOCK_MAND + * Matthew Wilcox , June, 2000. + * Stephen Rothwell , June, 2000. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include + +#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) +#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) +#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) +#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) + +static bool lease_breaking(struct file_lock *fl) +{ + return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); +} + +static int target_leasetype(struct file_lock *fl) +{ + if (fl->fl_flags & FL_UNLOCK_PENDING) + return F_UNLCK; + if (fl->fl_flags & FL_DOWNGRADE_PENDING) + return F_RDLCK; + return fl->fl_type; +} + +int leases_enable = 1; +int lease_break_time = 45; + +/* + * The global file_lock_list is only used for displaying /proc/locks, so we + * keep a list on each CPU, with each list protected by its own spinlock via + * the file_lock_lglock. Note that alterations to the list also require that + * the relevant flc_lock is held. + */ +DEFINE_STATIC_LGLOCK(file_lock_lglock); +static DEFINE_PER_CPU(struct hlist_head, file_lock_list); + +/* + * The blocked_hash is used to find POSIX lock loops for deadlock detection. + * It is protected by blocked_lock_lock. + * + * We hash locks by lockowner in order to optimize searching for the lock a + * particular lockowner is waiting on. + * + * FIXME: make this value scale via some heuristic? We generally will want more + * buckets when we have more lockowners holding locks, but that's a little + * difficult to determine without knowing what the workload will look like. + */ +#define BLOCKED_HASH_BITS 7 +static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); + +/* + * This lock protects the blocked_hash. Generally, if you're accessing it, you + * want to be holding this lock. + * + * In addition, it also protects the fl->fl_block list, and the fl->fl_next + * pointer for file_lock structures that are acting as lock requests (in + * contrast to those that are acting as records of acquired locks). + * + * Note that when we acquire this lock in order to change the above fields, + * we often hold the flc_lock as well. In certain cases, when reading the fields + * protected by this lock, we can skip acquiring it iff we already hold the + * flc_lock. + * + * In particular, adding an entry to the fl_block list requires that you hold + * both the flc_lock and the blocked_lock_lock (acquired in that order). + * Deleting an entry from the list however only requires the file_lock_lock. + */ +static DEFINE_SPINLOCK(blocked_lock_lock); + +static struct kmem_cache *flctx_cache __read_mostly; +static struct kmem_cache *filelock_cache __read_mostly; + +static struct file_lock_context * +locks_get_lock_context(struct inode *inode, int type) +{ + struct file_lock_context *new; + + if (likely(inode->i_flctx) || type == F_UNLCK) + goto out; + + new = kmem_cache_alloc(flctx_cache, GFP_KERNEL); + if (!new) + goto out; + + spin_lock_init(&new->flc_lock); + INIT_LIST_HEAD(&new->flc_flock); + INIT_LIST_HEAD(&new->flc_posix); + INIT_LIST_HEAD(&new->flc_lease); + + /* + * Assign the pointer if it's not already assigned. If it is, then + * free the context we just allocated. + */ + if (cmpxchg(&inode->i_flctx, NULL, new)) + kmem_cache_free(flctx_cache, new); +out: + return inode->i_flctx; +} + +void +locks_free_lock_context(struct file_lock_context *ctx) +{ + if (ctx) { + WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); + WARN_ON_ONCE(!list_empty(&ctx->flc_posix)); + WARN_ON_ONCE(!list_empty(&ctx->flc_lease)); + kmem_cache_free(flctx_cache, ctx); + } +} + +static void locks_init_lock_heads(struct file_lock *fl) +{ + INIT_HLIST_NODE(&fl->fl_link); + INIT_LIST_HEAD(&fl->fl_list); + INIT_LIST_HEAD(&fl->fl_block); + init_waitqueue_head(&fl->fl_wait); +} + +/* Allocate an empty lock structure. */ +struct file_lock *locks_alloc_lock(void) +{ + struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); + + if (fl) + locks_init_lock_heads(fl); + + return fl; +} +EXPORT_SYMBOL_GPL(locks_alloc_lock); + +void locks_release_private(struct file_lock *fl) +{ + if (fl->fl_ops) { + if (fl->fl_ops->fl_release_private) + fl->fl_ops->fl_release_private(fl); + fl->fl_ops = NULL; + } + + if (fl->fl_lmops) { + if (fl->fl_lmops->lm_put_owner) { + fl->fl_lmops->lm_put_owner(fl->fl_owner); + fl->fl_owner = NULL; + } + fl->fl_lmops = NULL; + } +} +EXPORT_SYMBOL_GPL(locks_release_private); + +/* Free a lock which is not in use. */ +void locks_free_lock(struct file_lock *fl) +{ + BUG_ON(waitqueue_active(&fl->fl_wait)); + BUG_ON(!list_empty(&fl->fl_list)); + BUG_ON(!list_empty(&fl->fl_block)); + BUG_ON(!hlist_unhashed(&fl->fl_link)); + + locks_release_private(fl); + kmem_cache_free(filelock_cache, fl); +} +EXPORT_SYMBOL(locks_free_lock); + +static void +locks_dispose_list(struct list_head *dispose) +{ + struct file_lock *fl; + + while (!list_empty(dispose)) { + fl = list_first_entry(dispose, struct file_lock, fl_list); + list_del_init(&fl->fl_list); + locks_free_lock(fl); + } +} + +void locks_init_lock(struct file_lock *fl) +{ + memset(fl, 0, sizeof(struct file_lock)); + locks_init_lock_heads(fl); +} + +EXPORT_SYMBOL(locks_init_lock); + +/* + * Initialize a new lock from an existing file_lock structure. + */ +void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) +{ + new->fl_owner = fl->fl_owner; + new->fl_pid = fl->fl_pid; + new->fl_file = NULL; + new->fl_flags = fl->fl_flags; + new->fl_type = fl->fl_type; + new->fl_start = fl->fl_start; + new->fl_end = fl->fl_end; + new->fl_lmops = fl->fl_lmops; + new->fl_ops = NULL; + + if (fl->fl_lmops) { + if (fl->fl_lmops->lm_get_owner) + fl->fl_lmops->lm_get_owner(fl->fl_owner); + } +} +EXPORT_SYMBOL(locks_copy_conflock); + +void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + /* "new" must be a freshly-initialized lock */ + WARN_ON_ONCE(new->fl_ops); + + locks_copy_conflock(new, fl); + + new->fl_file = fl->fl_file; + new->fl_ops = fl->fl_ops; + + if (fl->fl_ops) { + if (fl->fl_ops->fl_copy_lock) + fl->fl_ops->fl_copy_lock(new, fl); + } +} + +EXPORT_SYMBOL(locks_copy_lock); + +static inline int flock_translate_cmd(int cmd) { + if (cmd & LOCK_MAND) + return cmd & (LOCK_MAND | LOCK_RW); + switch (cmd) { + case LOCK_SH: + return F_RDLCK; + case LOCK_EX: + return F_WRLCK; + case LOCK_UN: + return F_UNLCK; + } + return -EINVAL; +} + +/* Fill in a file_lock structure with an appropriate FLOCK lock. */ +static struct file_lock * +flock_make_lock(struct file *filp, unsigned int cmd) +{ + struct file_lock *fl; + int type = flock_translate_cmd(cmd); + + if (type < 0) + return ERR_PTR(type); + + fl = locks_alloc_lock(); + if (fl == NULL) + return ERR_PTR(-ENOMEM); + + fl->fl_file = filp; + fl->fl_owner = filp; + fl->fl_pid = current->tgid; + fl->fl_flags = FL_FLOCK; + fl->fl_type = type; + fl->fl_end = OFFSET_MAX; + + return fl; +} + +static int assign_type(struct file_lock *fl, long type) +{ + switch (type) { + case F_RDLCK: + case F_WRLCK: + case F_UNLCK: + fl->fl_type = type; + break; + default: + return -EINVAL; + } + return 0; +} + +static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock64 *l) +{ + switch (l->l_whence) { + case SEEK_SET: + fl->fl_start = 0; + break; + case SEEK_CUR: + fl->fl_start = filp->f_pos; + break; + case SEEK_END: + fl->fl_start = i_size_read(file_inode(filp)); + break; + default: + return -EINVAL; + } + if (l->l_start > OFFSET_MAX - fl->fl_start) + return -EOVERFLOW; + fl->fl_start += l->l_start; + if (fl->fl_start < 0) + return -EINVAL; + + /* POSIX-1996 leaves the case l->l_len < 0 undefined; + POSIX-2001 defines it. */ + if (l->l_len > 0) { + if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) + return -EOVERFLOW; + fl->fl_end = fl->fl_start + l->l_len - 1; + + } else if (l->l_len < 0) { + if (fl->fl_start + l->l_len < 0) + return -EINVAL; + fl->fl_end = fl->fl_start - 1; + fl->fl_start += l->l_len; + } else + fl->fl_end = OFFSET_MAX; + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; + fl->fl_file = filp; + fl->fl_flags = FL_POSIX; + fl->fl_ops = NULL; + fl->fl_lmops = NULL; + + return assign_type(fl, l->l_type); +} + +/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX + * style lock. + */ +static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock *l) +{ + struct flock64 ll = { + .l_type = l->l_type, + .l_whence = l->l_whence, + .l_start = l->l_start, + .l_len = l->l_len, + }; + + return flock64_to_posix_lock(filp, fl, &ll); +} + +/* default lease lock manager operations */ +static bool +lease_break_callback(struct file_lock *fl) +{ + kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); + return false; +} + +static void +lease_setup(struct file_lock *fl, void **priv) +{ + struct file *filp = fl->fl_file; + struct fasync_struct *fa = *priv; + + /* + * fasync_insert_entry() returns the old entry if any. If there was no + * old entry, then it used "priv" and inserted it into the fasync list. + * Clear the pointer to indicate that it shouldn't be freed. + */ + if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa)) + *priv = NULL; + + __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); +} + +static const struct lock_manager_operations lease_manager_ops = { + .lm_break = lease_break_callback, + .lm_change = lease_modify, + .lm_setup = lease_setup, +}; + +/* + * Initialize a lease, use the default lock manager operations + */ +static int lease_init(struct file *filp, long type, struct file_lock *fl) + { + if (assign_type(fl, type) != 0) + return -EINVAL; + + fl->fl_owner = filp; + fl->fl_pid = current->tgid; + + fl->fl_file = filp; + fl->fl_flags = FL_LEASE; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + fl->fl_ops = NULL; + fl->fl_lmops = &lease_manager_ops; + return 0; +} + +/* Allocate a file_lock initialised to this type of lease */ +static struct file_lock *lease_alloc(struct file *filp, long type) +{ + struct file_lock *fl = locks_alloc_lock(); + int error = -ENOMEM; + + if (fl == NULL) + return ERR_PTR(error); + + error = lease_init(filp, type, fl); + if (error) { + locks_free_lock(fl); + return ERR_PTR(error); + } + return fl; +} + +/* Check if two locks overlap each other. + */ +static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) +{ + return ((fl1->fl_end >= fl2->fl_start) && + (fl2->fl_end >= fl1->fl_start)); +} + +/* + * Check whether two locks have the same owner. + */ +static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) +{ + if (fl1->fl_lmops && fl1->fl_lmops->lm_compare_owner) + return fl2->fl_lmops == fl1->fl_lmops && + fl1->fl_lmops->lm_compare_owner(fl1, fl2); + return fl1->fl_owner == fl2->fl_owner; +} + +/* Must be called with the flc_lock held! */ +static void locks_insert_global_locks(struct file_lock *fl) +{ + lg_local_lock(&file_lock_lglock); + fl->fl_link_cpu = smp_processor_id(); + hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); + lg_local_unlock(&file_lock_lglock); +} + +/* Must be called with the flc_lock held! */ +static void locks_delete_global_locks(struct file_lock *fl) +{ + /* + * Avoid taking lock if already unhashed. This is safe since this check + * is done while holding the flc_lock, and new insertions into the list + * also require that it be held. + */ + if (hlist_unhashed(&fl->fl_link)) + return; + lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu); + hlist_del_init(&fl->fl_link); + lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu); +} + +static unsigned long +posix_owner_key(struct file_lock *fl) +{ + if (fl->fl_lmops && fl->fl_lmops->lm_owner_key) + return fl->fl_lmops->lm_owner_key(fl); + return (unsigned long)fl->fl_owner; +} + +static void locks_insert_global_blocked(struct file_lock *waiter) +{ + lockdep_assert_held(&blocked_lock_lock); + + hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); +} + +static void locks_delete_global_blocked(struct file_lock *waiter) +{ + lockdep_assert_held(&blocked_lock_lock); + + hash_del(&waiter->fl_link); +} + +/* Remove waiter from blocker's block list. + * When blocker ends up pointing to itself then the list is empty. + * + * Must be called with blocked_lock_lock held. + */ +static void __locks_delete_block(struct file_lock *waiter) +{ + locks_delete_global_blocked(waiter); + list_del_init(&waiter->fl_block); + waiter->fl_next = NULL; +} + +static void locks_delete_block(struct file_lock *waiter) +{ + spin_lock(&blocked_lock_lock); + __locks_delete_block(waiter); + spin_unlock(&blocked_lock_lock); +} + +/* Insert waiter into blocker's block list. + * We use a circular list so that processes can be easily woken up in + * the order they blocked. The documentation doesn't require this but + * it seems like the reasonable thing to do. + * + * Must be called with both the flc_lock and blocked_lock_lock held. The + * fl_block list itself is protected by the blocked_lock_lock, but by ensuring + * that the flc_lock is also held on insertions we can avoid taking the + * blocked_lock_lock in some cases when we see that the fl_block list is empty. + */ +static void __locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) +{ + BUG_ON(!list_empty(&waiter->fl_block)); + waiter->fl_next = blocker; + list_add_tail(&waiter->fl_block, &blocker->fl_block); + if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) + locks_insert_global_blocked(waiter); +} + +/* Must be called with flc_lock held. */ +static void locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) +{ + spin_lock(&blocked_lock_lock); + __locks_insert_block(blocker, waiter); + spin_unlock(&blocked_lock_lock); +} + +/* + * Wake up processes blocked waiting for blocker. + * + * Must be called with the inode->flc_lock held! + */ +static void locks_wake_up_blocks(struct file_lock *blocker) +{ + /* + * Avoid taking global lock if list is empty. This is safe since new + * blocked requests are only added to the list under the flc_lock, and + * the flc_lock is always held here. Note that removal from the fl_block + * list does not require the flc_lock, so we must recheck list_empty() + * after acquiring the blocked_lock_lock. + */ + if (list_empty(&blocker->fl_block)) + return; + + spin_lock(&blocked_lock_lock); + while (!list_empty(&blocker->fl_block)) { + struct file_lock *waiter; + + waiter = list_first_entry(&blocker->fl_block, + struct file_lock, fl_block); + __locks_delete_block(waiter); + if (waiter->fl_lmops && waiter->fl_lmops->lm_notify) + waiter->fl_lmops->lm_notify(waiter); + else + wake_up(&waiter->fl_wait); + } + spin_unlock(&blocked_lock_lock); +} + +static void +locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) +{ + fl->fl_nspid = get_pid(task_tgid(current)); + list_add_tail(&fl->fl_list, before); + locks_insert_global_locks(fl); +} + +static void +locks_unlink_lock_ctx(struct file_lock *fl) +{ + locks_delete_global_locks(fl); + list_del_init(&fl->fl_list); + if (fl->fl_nspid) { + put_pid(fl->fl_nspid); + fl->fl_nspid = NULL; + } + locks_wake_up_blocks(fl); +} + +static void +locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) +{ + locks_unlink_lock_ctx(fl); + if (dispose) + list_add(&fl->fl_list, dispose); + else + locks_free_lock(fl); +} + +/* Determine if lock sys_fl blocks lock caller_fl. Common functionality + * checks for shared/exclusive status of overlapping locks. + */ +static int locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + if (sys_fl->fl_type == F_WRLCK) + return 1; + if (caller_fl->fl_type == F_WRLCK) + return 1; + return 0; +} + +/* Determine if lock sys_fl blocks lock caller_fl. POSIX specific + * checking before calling the locks_conflict(). + */ +static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + /* POSIX locks owned by the same process do not conflict with + * each other. + */ + if (posix_same_owner(caller_fl, sys_fl)) + return (0); + + /* Check whether they overlap */ + if (!locks_overlap(caller_fl, sys_fl)) + return 0; + + return (locks_conflict(caller_fl, sys_fl)); +} + +/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific + * checking before calling the locks_conflict(). + */ +static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + /* FLOCK locks referring to the same filp do not conflict with + * each other. + */ + if (caller_fl->fl_file == sys_fl->fl_file) + return (0); + if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND)) + return 0; + + return (locks_conflict(caller_fl, sys_fl)); +} + +void +posix_test_lock(struct file *filp, struct file_lock *fl) +{ + struct file_lock *cfl; + struct file_lock_context *ctx; + struct inode *inode = file_inode(filp); + + ctx = inode->i_flctx; + if (!ctx || list_empty_careful(&ctx->flc_posix)) { + fl->fl_type = F_UNLCK; + return; + } + + spin_lock(&ctx->flc_lock); + list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { + if (posix_locks_conflict(fl, cfl)) { + locks_copy_conflock(fl, cfl); + if (cfl->fl_nspid) + fl->fl_pid = pid_vnr(cfl->fl_nspid); + goto out; + } + } + fl->fl_type = F_UNLCK; +out: + spin_unlock(&ctx->flc_lock); + return; +} +EXPORT_SYMBOL(posix_test_lock); + +/* + * Deadlock detection: + * + * We attempt to detect deadlocks that are due purely to posix file + * locks. + * + * We assume that a task can be waiting for at most one lock at a time. + * So for any acquired lock, the process holding that lock may be + * waiting on at most one other lock. That lock in turns may be held by + * someone waiting for at most one other lock. Given a requested lock + * caller_fl which is about to wait for a conflicting lock block_fl, we + * follow this chain of waiters to ensure we are not about to create a + * cycle. + * + * Since we do this before we ever put a process to sleep on a lock, we + * are ensured that there is never a cycle; that is what guarantees that + * the while() loop in posix_locks_deadlock() eventually completes. + * + * Note: the above assumption may not be true when handling lock + * requests from a broken NFS client. It may also fail in the presence + * of tasks (such as posix threads) sharing the same open file table. + * To handle those cases, we just bail out after a few iterations. + * + * For FL_OFDLCK locks, the owner is the filp, not the files_struct. + * Because the owner is not even nominally tied to a thread of + * execution, the deadlock detection below can't reasonably work well. Just + * skip it for those. + * + * In principle, we could do a more limited deadlock detection on FL_OFDLCK + * locks that just checks for the case where two tasks are attempting to + * upgrade from read to write locks on the same inode. + */ + +#define MAX_DEADLK_ITERATIONS 10 + +/* Find a lock that the owner of the given block_fl is blocking on. */ +static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) +{ + struct file_lock *fl; + + hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) { + if (posix_same_owner(fl, block_fl)) + return fl->fl_next; + } + return NULL; +} + +/* Must be called with the blocked_lock_lock held! */ +static int posix_locks_deadlock(struct file_lock *caller_fl, + struct file_lock *block_fl) +{ + int i = 0; + + lockdep_assert_held(&blocked_lock_lock); + + /* + * This deadlock detector can't reasonably detect deadlocks with + * FL_OFDLCK locks, since they aren't owned by a process, per-se. + */ + if (IS_OFDLCK(caller_fl)) + return 0; + + while ((block_fl = what_owner_is_waiting_for(block_fl))) { + if (i++ > MAX_DEADLK_ITERATIONS) + return 0; + if (posix_same_owner(caller_fl, block_fl)) + return 1; + } + return 0; +} + +/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks + * after any leases, but before any posix locks. + * + * Note that if called with an FL_EXISTS argument, the caller may determine + * whether or not a lock was successfully freed by testing the return + * value for -ENOENT. + */ +static int flock_lock_file(struct file *filp, struct file_lock *request) +{ + struct file_lock *new_fl = NULL; + struct file_lock *fl; + struct file_lock_context *ctx; + struct inode *inode = file_inode(filp); + int error = 0; + bool found = false; + LIST_HEAD(dispose); + + ctx = locks_get_lock_context(inode, request->fl_type); + if (!ctx) { + if (request->fl_type != F_UNLCK) + return -ENOMEM; + return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0; + } + + if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { + new_fl = locks_alloc_lock(); + if (!new_fl) + return -ENOMEM; + } + + spin_lock(&ctx->flc_lock); + if (request->fl_flags & FL_ACCESS) + goto find_conflict; + + list_for_each_entry(fl, &ctx->flc_flock, fl_list) { + if (filp != fl->fl_file) + continue; + if (request->fl_type == fl->fl_type) + goto out; + found = true; + locks_delete_lock_ctx(fl, &dispose); + break; + } + + if (request->fl_type == F_UNLCK) { + if ((request->fl_flags & FL_EXISTS) && !found) + error = -ENOENT; + goto out; + } + +find_conflict: + list_for_each_entry(fl, &ctx->flc_flock, fl_list) { + if (!flock_locks_conflict(request, fl)) + continue; + error = -EAGAIN; + if (!(request->fl_flags & FL_SLEEP)) + goto out; + error = FILE_LOCK_DEFERRED; + locks_insert_block(fl, request); + goto out; + } + if (request->fl_flags & FL_ACCESS) + goto out; + locks_copy_lock(new_fl, request); + locks_insert_lock_ctx(new_fl, &ctx->flc_flock); + new_fl = NULL; + error = 0; + +out: + spin_unlock(&ctx->flc_lock); + if (new_fl) + locks_free_lock(new_fl); + locks_dispose_list(&dispose); + return error; +} + +static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) +{ + struct file_lock *fl, *tmp; + struct file_lock *new_fl = NULL; + struct file_lock *new_fl2 = NULL; + struct file_lock *left = NULL; + struct file_lock *right = NULL; + struct file_lock_context *ctx; + int error; + bool added = false; + LIST_HEAD(dispose); + + ctx = locks_get_lock_context(inode, request->fl_type); + if (!ctx) + return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM; + + /* + * We may need two file_lock structures for this operation, + * so we get them in advance to avoid races. + * + * In some cases we can be sure, that no new locks will be needed + */ + if (!(request->fl_flags & FL_ACCESS) && + (request->fl_type != F_UNLCK || + request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { + new_fl = locks_alloc_lock(); + new_fl2 = locks_alloc_lock(); + } + + spin_lock(&ctx->flc_lock); + /* + * New lock request. Walk all POSIX locks and look for conflicts. If + * there are any, either return error or put the request on the + * blocker's list of waiters and the global blocked_hash. + */ + if (request->fl_type != F_UNLCK) { + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { + if (!posix_locks_conflict(request, fl)) + continue; + if (conflock) + locks_copy_conflock(conflock, fl); + error = -EAGAIN; + if (!(request->fl_flags & FL_SLEEP)) + goto out; + /* + * Deadlock detection and insertion into the blocked + * locks list must be done while holding the same lock! + */ + error = -EDEADLK; + spin_lock(&blocked_lock_lock); + if (likely(!posix_locks_deadlock(request, fl))) { + error = FILE_LOCK_DEFERRED; + __locks_insert_block(fl, request); + } + spin_unlock(&blocked_lock_lock); + goto out; + } + } + + /* If we're just looking for a conflict, we're done. */ + error = 0; + if (request->fl_flags & FL_ACCESS) + goto out; + + /* Find the first old lock with the same owner as the new lock */ + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { + if (posix_same_owner(request, fl)) + break; + } + + /* Process locks with this owner. */ + list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) { + if (!posix_same_owner(request, fl)) + break; + + /* Detect adjacent or overlapping regions (if same lock type) */ + if (request->fl_type == fl->fl_type) { + /* In all comparisons of start vs end, use + * "start - 1" rather than "end + 1". If end + * is OFFSET_MAX, end + 1 will become negative. + */ + if (fl->fl_end < request->fl_start - 1) + continue; + /* If the next lock in the list has entirely bigger + * addresses than the new one, insert the lock here. + */ + if (fl->fl_start - 1 > request->fl_end) + break; + + /* If we come here, the new and old lock are of the + * same type and adjacent or overlapping. Make one + * lock yielding from the lower start address of both + * locks to the higher end address. + */ + if (fl->fl_start > request->fl_start) + fl->fl_start = request->fl_start; + else + request->fl_start = fl->fl_start; + if (fl->fl_end < request->fl_end) + fl->fl_end = request->fl_end; + else + request->fl_end = fl->fl_end; + if (added) { + locks_delete_lock_ctx(fl, &dispose); + continue; + } + request = fl; + added = true; + } else { + /* Processing for different lock types is a bit + * more complex. + */ + if (fl->fl_end < request->fl_start) + continue; + if (fl->fl_start > request->fl_end) + break; + if (request->fl_type == F_UNLCK) + added = true; + if (fl->fl_start < request->fl_start) + left = fl; + /* If the next lock in the list has a higher end + * address than the new one, insert the new one here. + */ + if (fl->fl_end > request->fl_end) { + right = fl; + break; + } + if (fl->fl_start >= request->fl_start) { + /* The new lock completely replaces an old + * one (This may happen several times). + */ + if (added) { + locks_delete_lock_ctx(fl, &dispose); + continue; + } + /* + * Replace the old lock with new_fl, and + * remove the old one. It's safe to do the + * insert here since we know that we won't be + * using new_fl later, and that the lock is + * just replacing an existing lock. + */ + error = -ENOLCK; + if (!new_fl) + goto out; + locks_copy_lock(new_fl, request); + request = new_fl; + new_fl = NULL; + locks_insert_lock_ctx(request, &fl->fl_list); + locks_delete_lock_ctx(fl, &dispose); + added = true; + } + } + } + + /* + * The above code only modifies existing locks in case of merging or + * replacing. If new lock(s) need to be inserted all modifications are + * done below this, so it's safe yet to bail out. + */ + error = -ENOLCK; /* "no luck" */ + if (right && left == right && !new_fl2) + goto out; + + error = 0; + if (!added) { + if (request->fl_type == F_UNLCK) { + if (request->fl_flags & FL_EXISTS) + error = -ENOENT; + goto out; + } + + if (!new_fl) { + error = -ENOLCK; + goto out; + } + locks_copy_lock(new_fl, request); + locks_insert_lock_ctx(new_fl, &fl->fl_list); + fl = new_fl; + new_fl = NULL; + } + if (right) { + if (left == right) { + /* The new lock breaks the old one in two pieces, + * so we have to use the second new lock. + */ + left = new_fl2; + new_fl2 = NULL; + locks_copy_lock(left, right); + locks_insert_lock_ctx(left, &fl->fl_list); + } + right->fl_start = request->fl_end + 1; + locks_wake_up_blocks(right); + } + if (left) { + left->fl_end = request->fl_start - 1; + locks_wake_up_blocks(left); + } + out: + spin_unlock(&ctx->flc_lock); + /* + * Free any unused locks. + */ + if (new_fl) + locks_free_lock(new_fl); + if (new_fl2) + locks_free_lock(new_fl2); + locks_dispose_list(&dispose); + return error; +} + +/** + * posix_lock_file - Apply a POSIX-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * @conflock: Place to return a copy of the conflicting lock, if found. + * + * Add a POSIX style lock to a file. + * We merge adjacent & overlapping locks whenever possible. + * POSIX locks are sorted by owner task, then by starting address + * + * Note that if called with an FL_EXISTS argument, the caller may determine + * whether or not a lock was successfully freed by testing the return + * value for -ENOENT. + */ +int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) +{ + return __posix_lock_file(file_inode(filp), fl, conflock); +} +EXPORT_SYMBOL(posix_lock_file); + +/** + * posix_lock_file_wait - Apply a POSIX-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * + * Add a POSIX style lock to a file. + * We merge adjacent & overlapping locks whenever possible. + * POSIX locks are sorted by owner task, then by starting address + */ +int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + int error; + might_sleep (); + for (;;) { + error = posix_lock_file(filp, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + return error; +} +EXPORT_SYMBOL(posix_lock_file_wait); + +/** + * locks_mandatory_locked - Check for an active lock + * @file: the file to check + * + * Searches the inode's list of locks to find any POSIX locks which conflict. + * This function is called from locks_verify_locked() only. + */ +int locks_mandatory_locked(struct file *file) +{ + int ret; + struct inode *inode = file_inode(file); + struct file_lock_context *ctx; + struct file_lock *fl; + + ctx = inode->i_flctx; + if (!ctx || list_empty_careful(&ctx->flc_posix)) + return 0; + + /* + * Search the lock list for this inode for any POSIX locks. + */ + spin_lock(&ctx->flc_lock); + ret = 0; + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { + if (fl->fl_owner != current->files && + fl->fl_owner != file) { + ret = -EAGAIN; + break; + } + } + spin_unlock(&ctx->flc_lock); + return ret; +} + +/** + * locks_mandatory_area - Check for a conflicting lock + * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ + * for shared + * @inode: the file to check + * @filp: how the file was opened (if it was) + * @offset: start of area to check + * @count: length of area to check + * + * Searches the inode's list of locks to find any POSIX locks which conflict. + * This function is called from rw_verify_area() and + * locks_verify_truncate(). + */ +int locks_mandatory_area(int read_write, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + struct file_lock fl; + int error; + bool sleep = false; + + locks_init_lock(&fl); + fl.fl_pid = current->tgid; + fl.fl_file = filp; + fl.fl_flags = FL_POSIX | FL_ACCESS; + if (filp && !(filp->f_flags & O_NONBLOCK)) + sleep = true; + fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; + fl.fl_start = offset; + fl.fl_end = offset + count - 1; + + for (;;) { + if (filp) { + fl.fl_owner = filp; + fl.fl_flags &= ~FL_SLEEP; + error = __posix_lock_file(inode, &fl, NULL); + if (!error) + break; + } + + if (sleep) + fl.fl_flags |= FL_SLEEP; + fl.fl_owner = current->files; + error = __posix_lock_file(inode, &fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); + if (!error) { + /* + * If we've been sleeping someone might have + * changed the permissions behind our back. + */ + if (__mandatory_lock(inode)) + continue; + } + + locks_delete_block(&fl); + break; + } + + return error; +} + +EXPORT_SYMBOL(locks_mandatory_area); + +static void lease_clear_pending(struct file_lock *fl, int arg) +{ + switch (arg) { + case F_UNLCK: + fl->fl_flags &= ~FL_UNLOCK_PENDING; + /* fall through: */ + case F_RDLCK: + fl->fl_flags &= ~FL_DOWNGRADE_PENDING; + } +} + +/* We already had a lease on this file; just change its type */ +int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) +{ + int error = assign_type(fl, arg); + + if (error) + return error; + lease_clear_pending(fl, arg); + locks_wake_up_blocks(fl); + if (arg == F_UNLCK) { + struct file *filp = fl->fl_file; + + f_delown(filp); + filp->f_owner.signum = 0; + fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); + if (fl->fl_fasync != NULL) { + printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); + fl->fl_fasync = NULL; + } + locks_delete_lock_ctx(fl, dispose); + } + return 0; +} +EXPORT_SYMBOL(lease_modify); + +static bool past_time(unsigned long then) +{ + if (!then) + /* 0 is a special value meaning "this never expires": */ + return false; + return time_after(jiffies, then); +} + +static void time_out_leases(struct inode *inode, struct list_head *dispose) +{ + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl, *tmp; + + lockdep_assert_held(&ctx->flc_lock); + + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { + trace_time_out_leases(inode, fl); + if (past_time(fl->fl_downgrade_time)) + lease_modify(fl, F_RDLCK, dispose); + if (past_time(fl->fl_break_time)) + lease_modify(fl, F_UNLCK, dispose); + } +} + +static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) +{ + if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) + return false; + if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) + return false; + return locks_conflict(breaker, lease); +} + +static bool +any_leases_conflict(struct inode *inode, struct file_lock *breaker) +{ + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl; + + lockdep_assert_held(&ctx->flc_lock); + + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (leases_conflict(fl, breaker)) + return true; + } + return false; +} + +/** + * __break_lease - revoke all outstanding leases on file + * @inode: the inode of the file to return + * @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR: + * break all leases + * @type: FL_LEASE: break leases and delegations; FL_DELEG: break + * only delegations + * + * break_lease (inlined for speed) has checked there already is at least + * some kind of lock (maybe a lease) on this file. Leases are broken on + * a call to open() or truncate(). This function can sleep unless you + * specified %O_NONBLOCK to your open(). + */ +int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +{ + int error = 0; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *new_fl, *fl, *tmp; + unsigned long break_time; + int want_write = (mode & O_ACCMODE) != O_RDONLY; + LIST_HEAD(dispose); + + new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + if (IS_ERR(new_fl)) + return PTR_ERR(new_fl); + new_fl->fl_flags = type; + + /* typically we will check that ctx is non-NULL before calling */ + if (!ctx) { + WARN_ON_ONCE(1); + return error; + } + + spin_lock(&ctx->flc_lock); + + time_out_leases(inode, &dispose); + + if (!any_leases_conflict(inode, new_fl)) + goto out; + + break_time = 0; + if (lease_break_time > 0) { + break_time = jiffies + lease_break_time * HZ; + if (break_time == 0) + break_time++; /* so that 0 means no break time */ + } + + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { + if (!leases_conflict(fl, new_fl)) + continue; + if (want_write) { + if (fl->fl_flags & FL_UNLOCK_PENDING) + continue; + fl->fl_flags |= FL_UNLOCK_PENDING; + fl->fl_break_time = break_time; + } else { + if (lease_breaking(fl)) + continue; + fl->fl_flags |= FL_DOWNGRADE_PENDING; + fl->fl_downgrade_time = break_time; + } + if (fl->fl_lmops->lm_break(fl)) + locks_delete_lock_ctx(fl, &dispose); + } + + if (list_empty(&ctx->flc_lease)) + goto out; + + if (mode & O_NONBLOCK) { + trace_break_lease_noblock(inode, new_fl); + error = -EWOULDBLOCK; + goto out; + } + +restart: + fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list); + break_time = fl->fl_break_time; + if (break_time != 0) + break_time -= jiffies; + if (break_time == 0) + break_time++; + locks_insert_block(fl, new_fl); + trace_break_lease_block(inode, new_fl); + spin_unlock(&ctx->flc_lock); + locks_dispose_list(&dispose); + error = wait_event_interruptible_timeout(new_fl->fl_wait, + !new_fl->fl_next, break_time); + spin_lock(&ctx->flc_lock); + trace_break_lease_unblock(inode, new_fl); + locks_delete_block(new_fl); + if (error >= 0) { + /* + * Wait for the next conflicting lease that has not been + * broken yet + */ + if (error == 0) + time_out_leases(inode, &dispose); + if (any_leases_conflict(inode, new_fl)) + goto restart; + error = 0; + } +out: + spin_unlock(&ctx->flc_lock); + locks_dispose_list(&dispose); + locks_free_lock(new_fl); + return error; +} + +EXPORT_SYMBOL(__break_lease); + +/** + * lease_get_mtime - get the last modified time of an inode + * @inode: the inode + * @time: pointer to a timespec which will contain the last modified time + * + * This is to force NFS clients to flush their caches for files with + * exclusive leases. The justification is that if someone has an + * exclusive lease, then they could be modifying it. + */ +void lease_get_mtime(struct inode *inode, struct timespec *time) +{ + bool has_lease = false; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl; + + if (ctx && !list_empty_careful(&ctx->flc_lease)) { + spin_lock(&ctx->flc_lock); + if (!list_empty(&ctx->flc_lease)) { + fl = list_first_entry(&ctx->flc_lease, + struct file_lock, fl_list); + if (fl->fl_type == F_WRLCK) + has_lease = true; + } + spin_unlock(&ctx->flc_lock); + } + + if (has_lease) + *time = current_fs_time(inode->i_sb); + else + *time = inode->i_mtime; +} + +EXPORT_SYMBOL(lease_get_mtime); + +/** + * fcntl_getlease - Enquire what lease is currently active + * @filp: the file + * + * The value returned by this function will be one of + * (if no lease break is pending): + * + * %F_RDLCK to indicate a shared lease is held. + * + * %F_WRLCK to indicate an exclusive lease is held. + * + * %F_UNLCK to indicate no lease is held. + * + * (if a lease break is pending): + * + * %F_RDLCK to indicate an exclusive lease needs to be + * changed to a shared lease (or removed). + * + * %F_UNLCK to indicate the lease needs to be removed. + * + * XXX: sfr & willy disagree over whether F_INPROGRESS + * should be returned to userspace. + */ +int fcntl_getlease(struct file *filp) +{ + struct file_lock *fl; + struct inode *inode = file_inode(filp); + struct file_lock_context *ctx = inode->i_flctx; + int type = F_UNLCK; + LIST_HEAD(dispose); + + if (ctx && !list_empty_careful(&ctx->flc_lease)) { + spin_lock(&ctx->flc_lock); + time_out_leases(file_inode(filp), &dispose); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file != filp) + continue; + type = target_leasetype(fl); + break; + } + spin_unlock(&ctx->flc_lock); + locks_dispose_list(&dispose); + } + return type; +} + +/** + * check_conflicting_open - see if the given dentry points to a file that has + * an existing open that would conflict with the + * desired lease. + * @dentry: dentry to check + * @arg: type of lease that we're trying to acquire + * + * Check to see if there's an existing open fd on this file that would + * conflict with the lease we're trying to set. + */ +static int +check_conflicting_open(const struct dentry *dentry, const long arg, int flags) +{ + int ret = 0; + struct inode *inode = dentry->d_inode; + + if (flags & FL_LAYOUT) + return 0; + + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + return -EAGAIN; + + if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || + (atomic_read(&inode->i_count) > 1))) + ret = -EAGAIN; + + return ret; +} + +static int +generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) +{ + struct file_lock *fl, *my_fl = NULL, *lease; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct file_lock_context *ctx; + bool is_deleg = (*flp)->fl_flags & FL_DELEG; + int error; + LIST_HEAD(dispose); + + lease = *flp; + trace_generic_add_lease(inode, lease); + + /* Note that arg is never F_UNLCK here */ + ctx = locks_get_lock_context(inode, arg); + if (!ctx) + return -ENOMEM; + + /* + * In the delegation case we need mutual exclusion with + * a number of operations that take the i_mutex. We trylock + * because delegations are an optional optimization, and if + * there's some chance of a conflict--we'd rather not + * bother, maybe that's a sign this just isn't a good file to + * hand out a delegation on. + */ + if (is_deleg && !mutex_trylock(&inode->i_mutex)) + return -EAGAIN; + + if (is_deleg && arg == F_WRLCK) { + /* Write delegations are not currently supported: */ + mutex_unlock(&inode->i_mutex); + WARN_ON_ONCE(1); + return -EINVAL; + } + + spin_lock(&ctx->flc_lock); + time_out_leases(inode, &dispose); + error = check_conflicting_open(dentry, arg, lease->fl_flags); + if (error) + goto out; + + /* + * At this point, we know that if there is an exclusive + * lease on this file, then we hold it on this filp + * (otherwise our open of this file would have blocked). + * And if we are trying to acquire an exclusive lease, + * then the file is not open by anyone (including us) + * except for this filp. + */ + error = -EAGAIN; + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file == filp && + fl->fl_owner == lease->fl_owner) { + my_fl = fl; + continue; + } + + /* + * No exclusive leases if someone else has a lease on + * this file: + */ + if (arg == F_WRLCK) + goto out; + /* + * Modifying our existing lease is OK, but no getting a + * new lease if someone else is opening for write: + */ + if (fl->fl_flags & FL_UNLOCK_PENDING) + goto out; + } + + if (my_fl != NULL) { + lease = my_fl; + error = lease->fl_lmops->lm_change(lease, arg, &dispose); + if (error) + goto out; + goto out_setup; + } + + error = -EINVAL; + if (!leases_enable) + goto out; + + locks_insert_lock_ctx(lease, &ctx->flc_lease); + /* + * The check in break_lease() is lockless. It's possible for another + * open to race in after we did the earlier check for a conflicting + * open but before the lease was inserted. Check again for a + * conflicting open and cancel the lease if there is one. + * + * We also add a barrier here to ensure that the insertion of the lock + * precedes these checks. + */ + smp_mb(); + error = check_conflicting_open(dentry, arg, lease->fl_flags); + if (error) { + locks_unlink_lock_ctx(lease); + goto out; + } + +out_setup: + if (lease->fl_lmops->lm_setup) + lease->fl_lmops->lm_setup(lease, priv); +out: + spin_unlock(&ctx->flc_lock); + locks_dispose_list(&dispose); + if (is_deleg) + mutex_unlock(&inode->i_mutex); + if (!error && !my_fl) + *flp = NULL; + return error; +} + +static int generic_delete_lease(struct file *filp, void *owner) +{ + int error = -EAGAIN; + struct file_lock *fl, *victim = NULL; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct file_lock_context *ctx = inode->i_flctx; + LIST_HEAD(dispose); + + if (!ctx) { + trace_generic_delete_lease(inode, NULL); + return error; + } + + spin_lock(&ctx->flc_lock); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file == filp && + fl->fl_owner == owner) { + victim = fl; + break; + } + } + trace_generic_delete_lease(inode, victim); + if (victim) + error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); + spin_unlock(&ctx->flc_lock); + locks_dispose_list(&dispose); + return error; +} + +/** + * generic_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * @priv: private data for lm_setup (may be NULL if lm_setup + * doesn't require it) + * + * The (input) flp->fl_lmops->lm_break function is required + * by break_lease(). + */ +int generic_setlease(struct file *filp, long arg, struct file_lock **flp, + void **priv) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + int error; + + if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) + return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + error = security_file_lock(filp, arg); + if (error) + return error; + + switch (arg) { + case F_UNLCK: + return generic_delete_lease(filp, *priv); + case F_RDLCK: + case F_WRLCK: + if (!(*flp)->fl_lmops->lm_break) { + WARN_ON_ONCE(1); + return -ENOLCK; + } + + return generic_add_lease(filp, arg, flp, priv); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(generic_setlease); + +/** + * vfs_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @lease: file_lock to use when adding a lease + * @priv: private info for lm_setup when adding a lease (may be + * NULL if lm_setup doesn't require it) + * + * Call this to establish a lease on the file. The "lease" argument is not + * used for F_UNLCK requests and may be NULL. For commands that set or alter + * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set; + * if not, this function will return -ENOLCK (and generate a scary-looking + * stack trace). + * + * The "priv" pointer is passed directly to the lm_setup function as-is. It + * may be NULL if the lm_setup operation doesn't require it. + */ +int +vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) +{ + if (filp->f_op->setlease) + return filp->f_op->setlease(filp, arg, lease, priv); + else + return generic_setlease(filp, arg, lease, priv); +} +EXPORT_SYMBOL_GPL(vfs_setlease); + +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +{ + struct file_lock *fl; + struct fasync_struct *new; + int error; + + fl = lease_alloc(filp, arg); + if (IS_ERR(fl)) + return PTR_ERR(fl); + + new = fasync_alloc(); + if (!new) { + locks_free_lock(fl); + return -ENOMEM; + } + new->fa_fd = fd; + + error = vfs_setlease(filp, arg, &fl, (void **)&new); + if (fl) + locks_free_lock(fl); + if (new) + fasync_free(new); + return error; +} + +/** + * fcntl_setlease - sets a lease on an open file + * @fd: open file descriptor + * @filp: file pointer + * @arg: type of lease to obtain + * + * Call this fcntl to establish a lease on the file. + * Note that you also need to call %F_SETSIG to + * receive a signal when the lease is broken. + */ +int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +{ + if (arg == F_UNLCK) + return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); + return do_fcntl_add_lease(fd, filp, arg); +} + +/** + * flock_lock_file_wait - Apply a FLOCK-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * + * Add a FLOCK style lock to a file. + */ +int flock_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + int error; + might_sleep(); + for (;;) { + error = flock_lock_file(filp, fl); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + return error; +} + +EXPORT_SYMBOL(flock_lock_file_wait); + +/** + * sys_flock: - flock() system call. + * @fd: the file descriptor to lock. + * @cmd: the type of lock to apply. + * + * Apply a %FL_FLOCK style lock to an open file descriptor. + * The @cmd can be one of + * + * %LOCK_SH -- a shared lock. + * + * %LOCK_EX -- an exclusive lock. + * + * %LOCK_UN -- remove an existing lock. + * + * %LOCK_MAND -- a `mandatory' flock. This exists to emulate Windows Share Modes. + * + * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other + * processes read and write access respectively. + */ +SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) +{ + struct fd f = fdget(fd); + struct file_lock *lock; + int can_sleep, unlock; + int error; + + error = -EBADF; + if (!f.file) + goto out; + + can_sleep = !(cmd & LOCK_NB); + cmd &= ~LOCK_NB; + unlock = (cmd == LOCK_UN); + + if (!unlock && !(cmd & LOCK_MAND) && + !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) + goto out_putf; + + lock = flock_make_lock(f.file, cmd); + if (IS_ERR(lock)) { + error = PTR_ERR(lock); + goto out_putf; + } + + if (can_sleep) + lock->fl_flags |= FL_SLEEP; + + error = security_file_lock(f.file, lock->fl_type); + if (error) + goto out_free; + + if (f.file->f_op->flock) + error = f.file->f_op->flock(f.file, + (can_sleep) ? F_SETLKW : F_SETLK, + lock); + else + error = flock_lock_file_wait(f.file, lock); + + out_free: + locks_free_lock(lock); + + out_putf: + fdput(f); + out: + return error; +} + +/** + * vfs_test_lock - test file byte range lock + * @filp: The file to test lock for + * @fl: The lock to test; also used to hold result + * + * Returns -ERRNO on failure. Indicates presence of conflicting lock by + * setting conf->fl_type to something other than F_UNLCK. + */ +int vfs_test_lock(struct file *filp, struct file_lock *fl) +{ + if (filp->f_op->lock) + return filp->f_op->lock(filp, F_GETLK, fl); + posix_test_lock(filp, fl); + return 0; +} +EXPORT_SYMBOL_GPL(vfs_test_lock); + +static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) +{ + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; +#if BITS_PER_LONG == 32 + /* + * Make sure we can represent the posix lock via + * legacy 32bit flock. + */ + if (fl->fl_start > OFFT_OFFSET_MAX) + return -EOVERFLOW; + if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX) + return -EOVERFLOW; +#endif + flock->l_start = fl->fl_start; + flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : + fl->fl_end - fl->fl_start + 1; + flock->l_whence = 0; + flock->l_type = fl->fl_type; + return 0; +} + +#if BITS_PER_LONG == 32 +static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) +{ + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_start = fl->fl_start; + flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : + fl->fl_end - fl->fl_start + 1; + flock->l_whence = 0; + flock->l_type = fl->fl_type; +} +#endif + +/* Report the first existing lock that would conflict with l. + * This implements the F_GETLK command of fcntl(). + */ +int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) +{ + struct file_lock file_lock; + struct flock flock; + int error; + + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + error = -EINVAL; + if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + goto out; + + error = flock_to_posix_lock(filp, &file_lock, &flock); + if (error) + goto out; + + if (cmd == F_OFD_GETLK) { + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_GETLK; + file_lock.fl_flags |= FL_OFDLCK; + file_lock.fl_owner = filp; + } + + error = vfs_test_lock(filp, &file_lock); + if (error) + goto out; + + flock.l_type = file_lock.fl_type; + if (file_lock.fl_type != F_UNLCK) { + error = posix_lock_to_flock(&flock, &file_lock); + if (error) + goto rel_priv; + } + error = -EFAULT; + if (!copy_to_user(l, &flock, sizeof(flock))) + error = 0; +rel_priv: + locks_release_private(&file_lock); +out: + return error; +} + +/** + * vfs_lock_file - file byte range lock + * @filp: The file to apply the lock to + * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.) + * @fl: The lock to be applied + * @conf: Place to return a copy of the conflicting lock, if found. + * + * A caller that doesn't care about the conflicting lock may pass NULL + * as the final argument. + * + * If the filesystem defines a private ->lock() method, then @conf will + * be left unchanged; so a caller that cares should initialize it to + * some acceptable default. + * + * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX + * locks, the ->lock() interface may return asynchronously, before the lock has + * been granted or denied by the underlying filesystem, if (and only if) + * lm_grant is set. Callers expecting ->lock() to return asynchronously + * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) + * the request is for a blocking lock. When ->lock() does return asynchronously, + * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock + * request completes. + * If the request is for non-blocking lock the file system should return + * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine + * with the result. If the request timed out the callback routine will return a + * nonzero return code and the file system should release the lock. The file + * system is also responsible to keep a corresponding posix lock when it + * grants a lock so the VFS can find out which locks are locally held and do + * the correct lock cleanup when required. + * The underlying filesystem must not drop the kernel lock or call + * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED + * return code. + */ +int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) +{ + if (filp->f_op->lock) + return filp->f_op->lock(filp, cmd, fl); + else + return posix_lock_file(filp, fl, conf); +} +EXPORT_SYMBOL_GPL(vfs_lock_file); + +static int do_lock_file_wait(struct file *filp, unsigned int cmd, + struct file_lock *fl) +{ + int error; + + error = security_file_lock(filp, fl->fl_type); + if (error) + return error; + + for (;;) { + error = vfs_lock_file(filp, cmd, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + + return error; +} + +/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +static int +check_fmode_for_setlk(struct file_lock *fl) +{ + switch (fl->fl_type) { + case F_RDLCK: + if (!(fl->fl_file->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(fl->fl_file->f_mode & FMODE_WRITE)) + return -EBADF; + } + return 0; +} + +/* Apply the lock described by l to an open file descriptor. + * This implements both the F_SETLK and F_SETLKW commands of fcntl(). + */ +int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock __user *l) +{ + struct file_lock *file_lock = locks_alloc_lock(); + struct flock flock; + struct inode *inode; + struct file *f; + int error; + + if (file_lock == NULL) + return -ENOLCK; + + /* + * This might block, so we do it before checking the inode. + */ + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + + inode = file_inode(filp); + + /* Don't allow mandatory locks on files that may be memory mapped + * and shared. + */ + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { + error = -EAGAIN; + goto out; + } + +again: + error = flock_to_posix_lock(filp, file_lock, &flock); + if (error) + goto out; + + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + + /* + * If the cmd is requesting file-private locks, then set the + * FL_OFDLCK flag and override the owner. + */ + switch (cmd) { + case F_OFD_SETLK: + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_SETLK; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = filp; + break; + case F_OFD_SETLKW: + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_SETLKW; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = filp; + /* Fallthrough */ + case F_SETLKW: + file_lock->fl_flags |= FL_SLEEP; + } + + error = do_lock_file_wait(filp, cmd, file_lock); + + /* + * Attempt to detect a close/fcntl race and recover by + * releasing the lock that was just acquired. + */ + /* + * we need that spin_lock here - it prevents reordering between + * update of i_flctx->flc_posix and check for it done in close(). + * rcu_read_lock() wouldn't do. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { + flock.l_type = F_UNLCK; + goto again; + } + +out: + locks_free_lock(file_lock); + return error; +} + +#if BITS_PER_LONG == 32 +/* Report the first existing lock that would conflict with l. + * This implements the F_GETLK command of fcntl(). + */ +int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) +{ + struct file_lock file_lock; + struct flock64 flock; + int error; + + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + error = -EINVAL; + if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + goto out; + + error = flock64_to_posix_lock(filp, &file_lock, &flock); + if (error) + goto out; + + if (cmd == F_OFD_GETLK) { + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_GETLK64; + file_lock.fl_flags |= FL_OFDLCK; + file_lock.fl_owner = filp; + } + + error = vfs_test_lock(filp, &file_lock); + if (error) + goto out; + + flock.l_type = file_lock.fl_type; + if (file_lock.fl_type != F_UNLCK) + posix_lock_to_flock64(&flock, &file_lock); + + error = -EFAULT; + if (!copy_to_user(l, &flock, sizeof(flock))) + error = 0; + + locks_release_private(&file_lock); +out: + return error; +} + +/* Apply the lock described by l to an open file descriptor. + * This implements both the F_SETLK and F_SETLKW commands of fcntl(). + */ +int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock64 __user *l) +{ + struct file_lock *file_lock = locks_alloc_lock(); + struct flock64 flock; + struct inode *inode; + struct file *f; + int error; + + if (file_lock == NULL) + return -ENOLCK; + + /* + * This might block, so we do it before checking the inode. + */ + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + + inode = file_inode(filp); + + /* Don't allow mandatory locks on files that may be memory mapped + * and shared. + */ + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { + error = -EAGAIN; + goto out; + } + +again: + error = flock64_to_posix_lock(filp, file_lock, &flock); + if (error) + goto out; + + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + + /* + * If the cmd is requesting file-private locks, then set the + * FL_OFDLCK flag and override the owner. + */ + switch (cmd) { + case F_OFD_SETLK: + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_SETLK64; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = filp; + break; + case F_OFD_SETLKW: + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_SETLKW64; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = filp; + /* Fallthrough */ + case F_SETLKW64: + file_lock->fl_flags |= FL_SLEEP; + } + + error = do_lock_file_wait(filp, cmd, file_lock); + + /* + * Attempt to detect a close/fcntl race and recover by + * releasing the lock that was just acquired. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { + flock.l_type = F_UNLCK; + goto again; + } + +out: + locks_free_lock(file_lock); + return error; +} +#endif /* BITS_PER_LONG == 32 */ + +/* + * This function is called when the file is being removed + * from the task's fd array. POSIX locks belonging to this task + * are deleted at this time. + */ +void locks_remove_posix(struct file *filp, fl_owner_t owner) +{ + struct file_lock lock; + struct file_lock_context *ctx = file_inode(filp)->i_flctx; + + /* + * If there are no locks held on this file, we don't need to call + * posix_lock_file(). Another process could be setting a lock on this + * file at the same time, but we wouldn't remove that lock anyway. + */ + if (!ctx || list_empty(&ctx->flc_posix)) + return; + + lock.fl_type = F_UNLCK; + lock.fl_flags = FL_POSIX | FL_CLOSE; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + lock.fl_owner = owner; + lock.fl_pid = current->tgid; + lock.fl_file = filp; + lock.fl_ops = NULL; + lock.fl_lmops = NULL; + + vfs_lock_file(filp, F_SETLK, &lock, NULL); + + if (lock.fl_ops && lock.fl_ops->fl_release_private) + lock.fl_ops->fl_release_private(&lock); +} + +EXPORT_SYMBOL(locks_remove_posix); + +/* The i_flctx must be valid when calling into here */ +static void +locks_remove_flock(struct file *filp) +{ + struct file_lock fl = { + .fl_owner = filp, + .fl_pid = current->tgid, + .fl_file = filp, + .fl_flags = FL_FLOCK, + .fl_type = F_UNLCK, + .fl_end = OFFSET_MAX, + }; + struct file_lock_context *flctx = file_inode(filp)->i_flctx; + + if (list_empty(&flctx->flc_flock)) + return; + + if (filp->f_op->flock) + filp->f_op->flock(filp, F_SETLKW, &fl); + else + flock_lock_file(filp, &fl); + + if (fl.fl_ops && fl.fl_ops->fl_release_private) + fl.fl_ops->fl_release_private(&fl); +} + +/* The i_flctx must be valid when calling into here */ +static void +locks_remove_lease(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl, *tmp; + LIST_HEAD(dispose); + + if (list_empty(&ctx->flc_lease)) + return; + + spin_lock(&ctx->flc_lock); + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) + if (filp == fl->fl_file) + lease_modify(fl, F_UNLCK, &dispose); + spin_unlock(&ctx->flc_lock); + locks_dispose_list(&dispose); +} + +/* + * This function is called on the last close of an open file. + */ +void locks_remove_file(struct file *filp) +{ + if (!file_inode(filp)->i_flctx) + return; + + /* remove any OFD locks */ + locks_remove_posix(filp, filp); + + /* remove flock locks */ + locks_remove_flock(filp); + + /* remove any leases */ + locks_remove_lease(filp); +} + +/** + * posix_unblock_lock - stop waiting for a file lock + * @waiter: the lock which was waiting + * + * lockd needs to block waiting for locks. + */ +int +posix_unblock_lock(struct file_lock *waiter) +{ + int status = 0; + + spin_lock(&blocked_lock_lock); + if (waiter->fl_next) + __locks_delete_block(waiter); + else + status = -ENOENT; + spin_unlock(&blocked_lock_lock); + return status; +} +EXPORT_SYMBOL(posix_unblock_lock); + +/** + * vfs_cancel_lock - file byte range unblock lock + * @filp: The file to apply the unblock to + * @fl: The lock to be unblocked + * + * Used by lock managers to cancel blocked requests + */ +int vfs_cancel_lock(struct file *filp, struct file_lock *fl) +{ + if (filp->f_op->lock) + return filp->f_op->lock(filp, F_CANCELLK, fl); + return 0; +} + +EXPORT_SYMBOL_GPL(vfs_cancel_lock); + +#ifdef CONFIG_PROC_FS +#include +#include + +struct locks_iterator { + int li_cpu; + loff_t li_pos; +}; + +static void lock_get_status(struct seq_file *f, struct file_lock *fl, + loff_t id, char *pfx) +{ + struct inode *inode = NULL; + unsigned int fl_pid; + + if (fl->fl_nspid) + fl_pid = pid_vnr(fl->fl_nspid); + else + fl_pid = fl->fl_pid; + + if (fl->fl_file != NULL) + inode = file_inode(fl->fl_file); + + seq_printf(f, "%lld:%s ", id, pfx); + if (IS_POSIX(fl)) { + if (fl->fl_flags & FL_ACCESS) + seq_puts(f, "ACCESS"); + else if (IS_OFDLCK(fl)) + seq_puts(f, "OFDLCK"); + else + seq_puts(f, "POSIX "); + + seq_printf(f, " %s ", + (inode == NULL) ? "*NOINODE*" : + mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); + } else if (IS_FLOCK(fl)) { + if (fl->fl_type & LOCK_MAND) { + seq_puts(f, "FLOCK MSNFS "); + } else { + seq_puts(f, "FLOCK ADVISORY "); + } + } else if (IS_LEASE(fl)) { + if (fl->fl_flags & FL_DELEG) + seq_puts(f, "DELEG "); + else + seq_puts(f, "LEASE "); + + if (lease_breaking(fl)) + seq_puts(f, "BREAKING "); + else if (fl->fl_file) + seq_puts(f, "ACTIVE "); + else + seq_puts(f, "BREAKER "); + } else { + seq_puts(f, "UNKNOWN UNKNOWN "); + } + if (fl->fl_type & LOCK_MAND) { + seq_printf(f, "%s ", + (fl->fl_type & LOCK_READ) + ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " + : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); + } else { + seq_printf(f, "%s ", + (lease_breaking(fl)) + ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ " + : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ "); + } + if (inode) { + /* userspace relies on this representation of dev_t */ + seq_printf(f, "%d %02x:%02x:%ld ", fl_pid, + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino); + } else { + seq_printf(f, "%d :0 ", fl_pid); + } + if (IS_POSIX(fl)) { + if (fl->fl_end == OFFSET_MAX) + seq_printf(f, "%Ld EOF\n", fl->fl_start); + else + seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); + } else { + seq_puts(f, "0 EOF\n"); + } +} + +static int locks_show(struct seq_file *f, void *v) +{ + struct locks_iterator *iter = f->private; + struct file_lock *fl, *bfl; + + fl = hlist_entry(v, struct file_lock, fl_link); + + lock_get_status(f, fl, iter->li_pos, ""); + + list_for_each_entry(bfl, &fl->fl_block, fl_block) + lock_get_status(f, bfl, iter->li_pos, " ->"); + + return 0; +} + +static void __show_fd_locks(struct seq_file *f, + struct list_head *head, int *id, + struct file *filp, struct files_struct *files) +{ + struct file_lock *fl; + + list_for_each_entry(fl, head, fl_list) { + + if (filp != fl->fl_file) + continue; + if (fl->fl_owner != files && + fl->fl_owner != filp) + continue; + + (*id)++; + seq_puts(f, "lock:\t"); + lock_get_status(f, fl, *id, ""); + } +} + +void show_fd_locks(struct seq_file *f, + struct file *filp, struct files_struct *files) +{ + struct inode *inode = file_inode(filp); + struct file_lock_context *ctx; + int id = 0; + + ctx = inode->i_flctx; + if (!ctx) + return; + + spin_lock(&ctx->flc_lock); + __show_fd_locks(f, &ctx->flc_flock, &id, filp, files); + __show_fd_locks(f, &ctx->flc_posix, &id, filp, files); + __show_fd_locks(f, &ctx->flc_lease, &id, filp, files); + spin_unlock(&ctx->flc_lock); +} + +static void *locks_start(struct seq_file *f, loff_t *pos) + __acquires(&blocked_lock_lock) +{ + struct locks_iterator *iter = f->private; + + iter->li_pos = *pos + 1; + lg_global_lock(&file_lock_lglock); + spin_lock(&blocked_lock_lock); + return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); +} + +static void *locks_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct locks_iterator *iter = f->private; + + ++iter->li_pos; + return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos); +} + +static void locks_stop(struct seq_file *f, void *v) + __releases(&blocked_lock_lock) +{ + spin_unlock(&blocked_lock_lock); + lg_global_unlock(&file_lock_lglock); +} + +static const struct seq_operations locks_seq_operations = { + .start = locks_start, + .next = locks_next, + .stop = locks_stop, + .show = locks_show, +}; + +static int locks_open(struct inode *inode, struct file *filp) +{ + return seq_open_private(filp, &locks_seq_operations, + sizeof(struct locks_iterator)); +} + +static const struct file_operations proc_locks_operations = { + .open = locks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init proc_locks_init(void) +{ + proc_create("locks", 0, NULL, &proc_locks_operations); + return 0; +} +module_init(proc_locks_init); +#endif + +static int __init filelock_init(void) +{ + int i; + + flctx_cache = kmem_cache_create("file_lock_ctx", + sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL); + + filelock_cache = kmem_cache_create("file_lock_cache", + sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + + lg_lock_init(&file_lock_lglock, "file_lock_lglock"); + + for_each_possible_cpu(i) + INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i)); + + return 0; +} + +core_initcall(filelock_init); diff --git a/kernel/fs/logfs/Kconfig b/kernel/fs/logfs/Kconfig new file mode 100644 index 000000000..09ed066c0 --- /dev/null +++ b/kernel/fs/logfs/Kconfig @@ -0,0 +1,17 @@ +config LOGFS + tristate "LogFS file system" + depends on (MTD || BLOCK) + select ZLIB_INFLATE + select ZLIB_DEFLATE + select CRC32 + select BTREE + help + Flash filesystem aimed to scale efficiently to large devices. + In comparison to JFFS2 it offers significantly faster mount + times and potentially less RAM usage, although the latter has + not been measured yet. + + In its current state it is still very experimental and should + not be used for other than testing purposes. + + If unsure, say N. diff --git a/kernel/fs/logfs/Makefile b/kernel/fs/logfs/Makefile new file mode 100644 index 000000000..482002778 --- /dev/null +++ b/kernel/fs/logfs/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LOGFS) += logfs.o + +logfs-y += compr.o +logfs-y += dir.o +logfs-y += file.o +logfs-y += gc.o +logfs-y += inode.o +logfs-y += journal.o +logfs-y += readwrite.o +logfs-y += segment.o +logfs-y += super.o +logfs-$(CONFIG_BLOCK) += dev_bdev.o +logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/kernel/fs/logfs/compr.c b/kernel/fs/logfs/compr.c new file mode 100644 index 000000000..961f02b86 --- /dev/null +++ b/kernel/fs/logfs/compr.c @@ -0,0 +1,95 @@ +/* + * fs/logfs/compr.c - compression routines + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +#define COMPR_LEVEL 3 + +static DEFINE_MUTEX(compr_mutex); +static struct z_stream_s stream; + +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_deflateInit(&stream, COMPR_LEVEL); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = 0; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int __init logfs_compr_init(void) +{ + size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + stream.workspace = vmalloc(size); + if (!stream.workspace) + return -ENOMEM; + return 0; +} + +void logfs_compr_exit(void) +{ + vfree(stream.workspace); +} diff --git a/kernel/fs/logfs/dev_bdev.c b/kernel/fs/logfs/dev_bdev.c new file mode 100644 index 000000000..76279e119 --- /dev/null +++ b/kernel/fs/logfs/dev_bdev.c @@ -0,0 +1,321 @@ +/* + * fs/logfs/dev_bdev.c - Device access methods for block devices + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int sync_request(struct page *page, struct block_device *bdev, int rw) +{ + struct bio bio; + struct bio_vec bio_vec; + + bio_init(&bio); + bio.bi_max_vecs = 1; + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = PAGE_SIZE; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_bdev = bdev; + bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); + bio.bi_iter.bi_size = PAGE_SIZE; + + return submit_bio_wait(rw, &bio); +} + +static int bdev_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + struct block_device *bdev = logfs_super(sb)->s_bdev; + int err; + + err = sync_request(page, bdev, READ); + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static DECLARE_WAIT_QUEUE_HEAD(wq); + +static void writeseg_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; + int i; + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + + bio_for_each_segment_all(bvec, bio, i) { + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct bio *bio; + struct page *page; + unsigned int max_pages; + int i; + + max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev)); + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_iter.bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_iter.bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + bio->bi_io_vec[i].bv_page = page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + } + bio->bi_vcnt = nr_pages; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_iter.bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + + +static void erase_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct bio *bio; + unsigned int max_pages; + int i; + + max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev)); + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_iter.bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_iter.bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + bio->bi_io_vec[i].bv_page = super->s_erase_page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + } + bio->bi_vcnt = nr_pages; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_iter.bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len, + int ensure_write) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(to & (PAGE_SIZE - 1)); + BUG_ON(len & (PAGE_SIZE - 1)); + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + if (ensure_write) { + /* + * Object store doesn't care whether erases happen or not. + * But for the journal they are required. Otherwise a scan + * can find an old commit entry and assume it is the current + * one, travelling back in time. + */ + do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT); + } + + return 0; +} + +static void bdev_sync(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + wait_event(wq, atomic_read(&super->s_pending_writes) == 0); +} + +static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + + *ofs = 0; + return read_cache_page(mapping, 0, filler, sb); +} + +static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; + pgoff_t index = pos >> PAGE_SHIFT; + + *ofs = pos; + return read_cache_page(mapping, index, filler, sb); +} + +static int bdev_write_sb(struct super_block *sb, struct page *page) +{ + struct block_device *bdev = logfs_super(sb)->s_bdev; + + /* Nothing special to do for block devices. */ + return sync_request(page, bdev, WRITE); +} + +static void bdev_put_device(struct logfs_super *s) +{ + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static int bdev_can_write_buf(struct super_block *sb, u64 ofs) +{ + return 0; +} + +static const struct logfs_device_ops bd_devops = { + .find_first_sb = bdev_find_first_sb, + .find_last_sb = bdev_find_last_sb, + .write_sb = bdev_write_sb, + .readpage = bdev_readpage, + .writeseg = bdev_writeseg, + .erase = bdev_erase, + .can_write_buf = bdev_can_write_buf, + .sync = bdev_sync, + .put_device = bdev_put_device, +}; + +int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, + const char *devname) +{ + struct block_device *bdev; + + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { + int mtdnr = MINOR(bdev->bd_dev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + return logfs_get_sb_mtd(p, mtdnr); + } + + p->s_bdev = bdev; + p->s_mtd = NULL; + p->s_devops = &bd_devops; + return 0; +} diff --git a/kernel/fs/logfs/dev_mtd.c b/kernel/fs/logfs/dev_mtd.c new file mode 100644 index 000000000..9c5014494 --- /dev/null +++ b/kernel/fs/logfs/dev_mtd.c @@ -0,0 +1,274 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len, + void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd_read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. */ + if (retlen != len) + return -EIO; + + return 0; +} + +static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len, + void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd_write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from logfs_mtd_erase(). What an exercise in futility! + */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd_erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return logfs_mtd_erase_mapping(sb, ofs, len); +} + +static void logfs_mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + mtd_sync(mtd); +} + +static int logfs_mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN || err == -EBADMSG) { + /* -EBADMSG happens regularly on power failures */ + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = logfs_mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + *ofs = 0; + while (mtd_block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = logfs_mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + *ofs = mtd->size - mtd->erasesize; + while (mtd_block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void logfs_mtd_put_device(struct logfs_super *s) +{ + put_mtd_device(s->s_mtd); +} + +static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + void *buf; + int err; + + buf = kmalloc(super->s_writesize, GFP_KERNEL); + if (!buf) + return -ENOMEM; + err = logfs_mtd_read(sb, ofs, super->s_writesize, buf); + if (err) + goto out; + if (memchr_inv(buf, 0xff, super->s_writesize)) + err = -EIO; + kfree(buf); +out: + return err; +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = logfs_mtd_find_first_sb, + .find_last_sb = logfs_mtd_find_last_sb, + .readpage = logfs_mtd_readpage, + .writeseg = logfs_mtd_writeseg, + .erase = logfs_mtd_erase, + .can_write_buf = logfs_mtd_can_write_buf, + .sync = logfs_mtd_sync, + .put_device = logfs_mtd_put_device, +}; + +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) +{ + struct mtd_info *mtd = get_mtd_device(NULL, mtdnr); + if (IS_ERR(mtd)) + return PTR_ERR(mtd); + + s->s_bdev = NULL; + s->s_mtd = mtd; + s->s_devops = &mtd_devops; + return 0; +} diff --git a/kernel/fs/logfs/dir.c b/kernel/fs/logfs/dir.c new file mode 100644 index 000000000..4cf38f118 --- /dev/null +++ b/kernel/fs/logfs/dir.c @@ -0,0 +1,801 @@ +/* + * fs/logfs/dir.c - directory-related code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include + +/* + * Atomic dir operations + * + * Directory operations are by default not atomic. Dentries and Inodes are + * created/removed/altered in separate operations. Therefore we need to do + * a small amount of journaling. + * + * Create, link, mkdir, mknod and symlink all share the same function to do + * the work: __logfs_create. This function works in two atomic steps: + * 1. allocate inode (remember in journal) + * 2. allocate dentry (clear journal) + * + * As we can only get interrupted between the two, when the inode we just + * created is simply stored in the anchor. On next mount, if we were + * interrupted, we delete the inode. From a users point of view the + * operation never happened. + * + * Unlink and rmdir also share the same function: unlink. Again, this + * function works in two atomic steps + * 1. remove dentry (remember inode in journal) + * 2. unlink inode (clear journal) + * + * And again, on the next mount, if we were interrupted, we delete the inode. + * From a users point of view the operation succeeded. + * + * Rename is the real pain to deal with, harder than all the other methods + * combined. Depending on the circumstances we can run into three cases. + * A "target rename" where the target dentry already existed, a "local + * rename" where both parent directories are identical or a "cross-directory + * rename" in the remaining case. + * + * Local rename is atomic, as the old dentry is simply rewritten with a new + * name. + * + * Cross-directory rename works in two steps, similar to __logfs_create and + * logfs_unlink: + * 1. Write new dentry (remember old dentry in journal) + * 2. Remove old dentry (clear journal) + * + * Here we remember a dentry instead of an inode. On next mount, if we were + * interrupted, we delete the dentry. From a users point of view, the + * operation succeeded. + * + * Target rename works in three atomic steps: + * 1. Attach old inode to new dentry (remember old dentry and new inode) + * 2. Remove old dentry (still remember the new inode) + * 3. Remove victim inode + * + * Here we remember both an inode an a dentry. If we get interrupted + * between steps 1 and 2, we delete both the dentry and the inode. If + * we get interrupted between steps 2 and 3, we delete just the inode. + * In either case, the remaining objects are deleted on next mount. From + * a users point of view, the operation succeeded. + */ + +static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, + loff_t pos) +{ + return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); +} + +static int write_inode(struct inode *inode) +{ + return __logfs_write_inode(inode, NULL, WF_LOCK); +} + +static s64 dir_seek_data(struct inode *inode, s64 pos) +{ + s64 new_pos = logfs_seek_data(inode, pos); + + return max(pos, new_pos - 1); +} + +static int beyond_eof(struct inode *inode, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + return pos >= i_size_read(inode); +} + +/* + * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, + * so short names (len <= 9) don't even occupy the complete 32bit name + * space. A prime >256 ensures short names quickly spread the 32bit + * name space. Add about 26 for the estimated amount of information + * of each character and pick a prime nearby, preferably a bit-sparse + * one. + */ +static u32 hash_32(const char *s, int len, u32 seed) +{ + u32 hash = seed; + int i; + + for (i = 0; i < len; i++) + hash = hash * 293 + s[i]; + return hash; +} + +/* + * We have to satisfy several conflicting requirements here. Small + * directories should stay fairly compact and not require too many + * indirect blocks. The number of possible locations for a given hash + * should be small to make lookup() fast. And we should try hard not + * to overflow the 32bit name space or nfs and 32bit host systems will + * be unhappy. + * + * So we use the following scheme. First we reduce the hash to 0..15 + * and try a direct block. If that is occupied we reduce the hash to + * 16..255 and try an indirect block. Same for 2x and 3x indirect + * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, + * but use buckets containing eight entries instead of a single one. + * + * Using 16 entries should allow for a reasonable amount of hash + * collisions, so the 32bit name space can be packed fairly tight + * before overflowing. Oh and currently we don't overflow but return + * and error. + * + * How likely are collisions? Doing the appropriate math is beyond me + * and the Bronstein textbook. But running a test program to brute + * force collisions for a couple of days showed that on average the + * first collision occurs after 598M entries, with 290M being the + * smallest result. Obviously 21 entries could already cause a + * collision if all entries are carefully chosen. + */ +static pgoff_t hash_index(u32 hash, int round) +{ + u32 i0_blocks = I0_BLOCKS; + u32 i1_blocks = I1_BLOCKS; + u32 i2_blocks = I2_BLOCKS; + u32 i3_blocks = I3_BLOCKS; + + switch (round) { + case 0: + return hash % i0_blocks; + case 1: + return i0_blocks + hash % (i1_blocks - i0_blocks); + case 2: + return i1_blocks + hash % (i2_blocks - i1_blocks); + case 3: + return i2_blocks + hash % (i3_blocks - i2_blocks); + case 4 ... 19: + return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16)) + + round - 4; + } + BUG(); +} + +static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) +{ + struct qstr *name = &dentry->d_name; + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(name->name, name->len, 0); + pgoff_t index; + int round; + + if (name->len > LOGFS_MAX_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (beyond_eof(dir, index)) + return NULL; + if (!logfs_exist_block(dir, index)) + continue; + page = read_cache_page(dir->i_mapping, index, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return page; + dd = kmap_atomic(page); + BUG_ON(dd->namelen == 0); + + if (name->len != be16_to_cpu(dd->namelen) || + memcmp(name->name, dd->name, name->len)) { + kunmap_atomic(dd); + page_cache_release(page); + continue; + } + + kunmap_atomic(dd); + return page; + } + return NULL; +} + +static int logfs_remove_inode(struct inode *inode) +{ + int ret; + + drop_nlink(inode); + ret = write_inode(inode); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + if (logfs_inode(inode)->li_block) + logfs_inode(inode)->li_block->ta = NULL; + kfree(ta); +} + +static int logfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct inode *inode = d_inode(dentry); + struct logfs_transaction *ta; + struct page *page; + pgoff_t index; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = UNLINK_1; + ta->ino = inode->i_ino; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + page = logfs_get_dd_page(dir, dentry); + if (!page) { + kfree(ta); + return -ENOENT; + } + if (IS_ERR(page)) { + kfree(ta); + return PTR_ERR(page); + } + index = page->index; + page_cache_release(page); + + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(dir, ta); + + ret = logfs_delete(dir, index, NULL); + if (!ret) + ret = write_inode(dir); + + if (ret) { + abort_transaction(dir, ta); + printk(KERN_ERR"LOGFS: unable to delete inode\n"); + goto out; + } + + ta->state = UNLINK_2; + logfs_add_transaction(inode, ta); + ret = logfs_remove_inode(inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static inline int logfs_empty_dir(struct inode *dir) +{ + u64 data; + + data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; + return data >= i_size_read(dir); +} + +static int logfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (!logfs_empty_dir(inode)) + return -ENOTEMPTY; + + return logfs_unlink(dir, dentry); +} + +/* FIXME: readdir currently has it's own dir_walk code. I don't see a good + * way to combine the two copies */ +static int logfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *dir = file_inode(file); + loff_t pos; + struct page *page; + struct logfs_disk_dentry *dd; + + if (ctx->pos < 0) + return -EINVAL; + + if (!dir_emit_dots(file, ctx)) + return 0; + + pos = ctx->pos - 2; + BUG_ON(pos < 0); + for (;; pos++, ctx->pos++) { + bool full; + if (beyond_eof(dir, pos)) + break; + if (!logfs_exist_block(dir, pos)) { + /* deleted dentry */ + pos = dir_seek_data(dir, pos); + continue; + } + page = read_cache_page(dir->i_mapping, pos, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + dd = kmap(page); + BUG_ON(dd->namelen == 0); + + full = !dir_emit(ctx, (char *)dd->name, + be16_to_cpu(dd->namelen), + be64_to_cpu(dd->ino), dd->type); + kunmap(page); + page_cache_release(page); + if (full) + break; + } + return 0; +} + +static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +{ + dd->namelen = cpu_to_be16(name->len); + memcpy(dd->name, name->name, name->len); +} + +static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct page *page; + struct logfs_disk_dentry *dd; + pgoff_t index; + u64 ino = 0; + struct inode *inode; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return ERR_CAST(page); + if (!page) { + d_add(dentry, NULL); + return NULL; + } + index = page->index; + dd = kmap_atomic(page); + ino = be64_to_cpu(dd->ino); + kunmap_atomic(dd); + page_cache_release(page); + + inode = logfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n", + ino, dir->i_ino, index); + return d_splice_alias(inode, dentry); +} + +static void grow_dir(struct inode *dir, loff_t index) +{ + index = (index + 1) << dir->i_sb->s_blocksize_bits; + if (i_size_read(dir) < index) + i_size_write(dir, index); +} + +static int logfs_write_dir(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + pgoff_t index; + int round, err; + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (logfs_exist_block(dir, index)) + continue; + page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL); + if (!page) + return -ENOMEM; + + dd = kmap_atomic(page); + memset(dd, 0, sizeof(*dd)); + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + logfs_set_name(dd, &dentry->d_name); + kunmap_atomic(dd); + + err = logfs_write_buf(dir, page, WF_LOCK); + unlock_page(page); + page_cache_release(page); + if (!err) + grow_dir(dir, index); + return err; + } + /* FIXME: Is there a better return value? In most cases neither + * the filesystem nor the directory are full. But we have had + * too many collisions for this particular hash and no fallback. + */ + return -ENOSPC; +} + +static int __logfs_create(struct inode *dir, struct dentry *dentry, + struct inode *inode, const char *dest, long destlen) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct logfs_inode *li = logfs_inode(inode); + struct logfs_transaction *ta; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) { + drop_nlink(inode); + iput(inode); + return -ENOMEM; + } + + ta->state = CREATE_1; + ta->ino = inode->i_ino; + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(inode, ta); + + if (dest) { + /* symlink */ + ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); + if (!ret) + ret = write_inode(inode); + } else { + /* creat/mkdir/mknod */ + ret = write_inode(inode); + } + if (ret) { + abort_transaction(inode, ta); + li->li_flags |= LOGFS_IF_STILLBORN; + /* FIXME: truncate symlink */ + drop_nlink(inode); + iput(inode); + goto out; + } + + ta->state = CREATE_2; + logfs_add_transaction(dir, ta); + ret = logfs_write_dir(dir, dentry, inode); + /* sync directory */ + if (!ret) + ret = write_inode(dir); + + if (ret) { + logfs_del_transaction(dir, ta); + ta->state = CREATE_2; + logfs_add_transaction(inode, ta); + logfs_remove_inode(inode); + iput(inode); + goto out; + } + d_instantiate(dentry, inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + /* + * FIXME: why do we have to fill in S_IFDIR, while the mode is + * correct for mknod, creat, etc.? Smells like the vfs *should* + * do it for us but for some reason fails to do so. + */ + inode = logfs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + struct inode *inode; + + if (dentry->d_name.len > LOGFS_MAX_NAMELEN) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + struct inode *inode; + size_t destlen = strlen(target) + 1; + + if (destlen > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, target, destlen); +} + +static int logfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ihold(inode); + inc_nlink(inode); + mark_inode_dirty_sync(inode); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_get_dd(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, loff_t *pos) +{ + struct page *page; + void *map; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return PTR_ERR(page); + *pos = page->index; + map = kmap_atomic(page); + memcpy(dd, map, sizeof(*dd)); + kunmap_atomic(map); + page_cache_release(page); + return 0; +} + +static int logfs_delete_dd(struct inode *dir, loff_t pos) +{ + /* + * Getting called with pos somewhere beyond eof is either a goofup + * within this file or means someone maliciously edited the + * (crc-protected) journal. + */ + BUG_ON(beyond_eof(dir, pos)); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); + return logfs_delete(dir, pos, NULL); +} + +/* + * Cross-directory rename, target does not exist. Just a little nasty. + * Create a new dentry in the target dir, then remove the old dentry, + * all the while taking care to remember our operation in the journal. + */ +static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CROSS_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + + /* 2. write target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry)); + if (!err) + err = write_inode(new_dir); + + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = CROSS_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, struct inode *inode) +{ + loff_t pos; + int err; + + err = logfs_get_dd(dir, dentry, dd, &pos); + if (err) + return err; + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + + err = write_dir(dir, dd, pos); + if (err) + return err; + log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, + dd->name, be64_to_cpu(dd->ino)); + return write_inode(dir); +} + +/* Target dentry exists - the worst case. We need to attach the source + * inode to the target dentry, then remove the orphaned target inode and + * source dentry. + */ +static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + int isdir = S_ISDIR(old_inode->i_mode); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); + if (isdir) { + if (!logfs_empty_dir(new_inode)) + return -ENOTEMPTY; + } + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = TARGET_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + ta->ino = new_inode->i_ino; + + /* 2. attach source inode to target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + super->s_victim_ino = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = TARGET_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); + + /* 4. remove target inode */ + ta->state = TARGET_RENAME_3; + logfs_add_transaction(new_inode, ta); + err = logfs_remove_inode(new_inode); + +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (d_really_is_positive(new_dentry)) + return logfs_rename_target(old_dir, old_dentry, + new_dir, new_dentry); + return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); +} + +/* No locking done here, as this is called before .get_sb() returns. */ +int logfs_replay_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + u64 ino, pos; + int err; + + if (super->s_victim_ino) { + /* delete victim inode */ + ino = super->s_victim_ino; + printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + LOGFS_BUG_ON(i_size_read(inode) > 0, sb); + super->s_victim_ino = 0; + err = logfs_remove_inode(inode); + iput(inode); + if (err) { + super->s_victim_ino = ino; + goto fail; + } + } + if (super->s_rename_dir) { + /* delete old dd from rename */ + ino = super->s_rename_dir; + pos = super->s_rename_pos; + printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", + ino, pos); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + super->s_rename_dir = 0; + super->s_rename_pos = 0; + err = logfs_delete_dd(inode, pos); + iput(inode); + if (err) { + super->s_rename_dir = ino; + super->s_rename_pos = pos; + goto fail; + } + } + return 0; +fail: + LOGFS_BUG(sb); + return -EIO; +} + +const struct inode_operations logfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, +}; + +const struct inode_operations logfs_dir_iops = { + .create = logfs_create, + .link = logfs_link, + .lookup = logfs_lookup, + .mkdir = logfs_mkdir, + .mknod = logfs_mknod, + .rename = logfs_rename, + .rmdir = logfs_rmdir, + .symlink = logfs_symlink, + .unlink = logfs_unlink, +}; +const struct file_operations logfs_dir_fops = { + .fsync = logfs_fsync, + .unlocked_ioctl = logfs_ioctl, + .iterate = logfs_readdir, + .read = generic_read_dir, + .llseek = default_llseek, +}; diff --git a/kernel/fs/logfs/file.c b/kernel/fs/logfs/file.c new file mode 100644 index 000000000..1a6f0167b --- /dev/null +++ b/kernel/fs/logfs/file.c @@ -0,0 +1,285 @@ +/* + * fs/logfs/file.c - prepare_write, commit_write and friends + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +static int logfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + return 0; + } + return logfs_readpage_nolock(page); +} + +static int logfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = page->index; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + int ret = 0; + + BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize); + BUG_ON(page->index > I3_BLOCKS); + + if (copied < len) { + /* + * Short write of a non-initialized paged. Just tell userspace + * to retry the entire page. + */ + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + if (copied == 0) + goto out; /* FIXME: do we need to update inode? */ + + if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) { + i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end); + mark_inode_dirty_sync(inode); + } + + SetPageUptodate(page); + if (!PageDirty(page)) { + if (!get_page_reserve(inode, page)) + __set_page_dirty_nobuffers(page); + else + ret = logfs_write_buf(inode, page, WF_LOCK); + } +out: + unlock_page(page); + page_cache_release(page); + return ret ? ret : copied; +} + +int logfs_readpage(struct file *file, struct page *page) +{ + int ret; + + ret = logfs_readpage_nolock(page); + unlock_page(page); + return ret; +} + +/* Clear the page's dirty flag in the radix tree. */ +/* TODO: mucking with PageWriteback is silly. Add a generic function to clear + * the dirty bit from the radix tree for filesystems that don't have to wait + * for page writeback to finish (i.e. any compressing filesystem). + */ +static void clear_radix_tree_dirty(struct page *page) +{ + BUG_ON(PagePrivate(page) || page->private); + set_page_writeback(page); + end_page_writeback(page); +} + +static int __logfs_writepage(struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = logfs_write_buf(inode, page, WF_LOCK); + if (err) + set_page_dirty(page); + else + clear_radix_tree_dirty(page); + unlock_page(page); + return err; +} + +static int logfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + u64 bix; + level_t level; + + log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, + page); + + logfs_unpack_index(page->index, &bix, &level); + + /* Indirect blocks are never truncated */ + if (level != 0) + return __logfs_writepage(page); + + /* + * TODO: everything below is a near-verbatim copy of nobh_writepage(). + * The relevant bits should be factored out after logfs is merged. + */ + + /* Is the page fully inside i_size? */ + if (bix < end_index) + return __logfs_writepage(page); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (bix > end_index || offset == 0) { + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invokation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __logfs_writepage(page); +} + +static void logfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct logfs_block *block = logfs_block(page); + + if (block->reserved_bytes) { + struct super_block *sb = page->mapping->host->i_sb; + struct logfs_super *super = logfs_super(sb); + + super->s_dirty_pages -= block->reserved_bytes; + block->ops->free_block(sb, block); + BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR)); + } else + move_page_to_btree(page); + BUG_ON(PagePrivate(page) || page->private); +} + +static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) +{ + return 0; /* None of these are easy to release */ +} + + +long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct logfs_inode *li = logfs_inode(inode); + unsigned int oldflags, flags; + int err; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = li->li_flags & LOGFS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); + case FS_IOC_SETFLAGS: + if (IS_RDONLY(inode)) + return -EROFS; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = get_user(flags, (int __user *)arg); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + oldflags = li->li_flags; + flags &= LOGFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; + li->li_flags = flags; + mutex_unlock(&inode->i_mutex); + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + return 0; + + default: + return -ENOTTY; + } +} + +int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct super_block *sb = file->f_mapping->host->i_sb; + struct inode *inode = file->f_mapping->host; + int ret; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + logfs_get_wblocks(sb, NULL, WF_LOCK); + logfs_write_anchor(sb); + logfs_put_wblocks(sb, NULL, WF_LOCK); + mutex_unlock(&inode->i_mutex); + + return 0; +} + +static int logfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int err = 0; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + err = logfs_truncate(inode, attr->ia_size); + if (err) + return err; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations logfs_reg_iops = { + .setattr = logfs_setattr, +}; + +const struct file_operations logfs_reg_fops = { + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .fsync = logfs_fsync, + .unlocked_ioctl = logfs_ioctl, + .llseek = generic_file_llseek, + .mmap = generic_file_readonly_mmap, + .open = generic_file_open, +}; + +const struct address_space_operations logfs_reg_aops = { + .invalidatepage = logfs_invalidatepage, + .readpage = logfs_readpage, + .releasepage = logfs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = logfs_writepage, + .writepages = generic_writepages, + .write_begin = logfs_write_begin, + .write_end = logfs_write_end, +}; diff --git a/kernel/fs/logfs/gc.c b/kernel/fs/logfs/gc.c new file mode 100644 index 000000000..d4efb061b --- /dev/null +++ b/kernel/fs/logfs/gc.c @@ -0,0 +1,732 @@ +/* + * fs/logfs/gc.c - garbage collection code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +/* + * Wear leveling needs to kick in when the difference between low erase + * counts and high erase counts gets too big. A good value for "too big" + * may be somewhat below 10% of maximum erase count for the device. + * Why not 397, to pick a nice round number with no specific meaning? :) + * + * WL_RATELIMIT is the minimum time between two wear level events. A huge + * number of segments may fulfil the requirements for wear leveling at the + * same time. If that happens we don't want to cause a latency from hell, + * but just gently pick one segment every so often and minimize overhead. + */ +#define WL_DELTA 397 +#define WL_RATELIMIT 100 +#define MAX_OBJ_ALIASES 2600 +#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ +#define LIST_SIZE 64 /* base size of candidate lists */ +#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ +#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ + +static int no_free_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + return super->s_free_list.count; +} + +/* journal has distance -1, top-most ifile layer distance 0 */ +static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) +{ + struct logfs_super *super = logfs_super(sb); + u8 gc_level = (__force u8)__gc_level; + + switch (gc_level) { + case 0: /* fall through */ + case 1: /* fall through */ + case 2: /* fall through */ + case 3: + /* file data or indirect blocks */ + return super->s_ifile_levels + super->s_iblock_levels - gc_level; + case 6: /* fall through */ + case 7: /* fall through */ + case 8: /* fall through */ + case 9: + /* inode file data or indirect blocks */ + return super->s_ifile_levels - (gc_level - 6); + default: + printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", + gc_level); + WARN_ON(1); + return super->s_ifile_levels + super->s_iblock_levels; + } +} + +static int segment_is_reserved(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + void *reserved; + int i; + + /* Some segments are reserved. Just pretend they were all valid */ + reserved = btree_lookup32(&super->s_reserved_segments, segno); + if (reserved) + return 1; + + /* Currently open segments */ + for_each_area(i) { + area = super->s_area[i]; + if (area->a_is_open && area->a_segno == segno) + return 1; + } + + return 0; +} + +static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + BUG(); +} + +/* + * Returns the bytes consumed by valid objects in this segment. Object headers + * are counted, the segment header is not. + */ +static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, + gc_level_t *gc_level) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(sb, segno, &se); + if (se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)) + return RESERVED; + + ec_level = be32_to_cpu(se.ec_level); + *ec = ec_level >> 4; + *gc_level = GC_LEVEL(ec_level & 0xf); + return be32_to_cpu(se.valid); +} + +static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, + u64 bix, gc_level_t gc_level) +{ + struct inode *inode; + int err, cookie; + + inode = logfs_safe_iget(sb, ino, &cookie); + err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); + BUG_ON(err); + logfs_safe_iput(inode, cookie); +} + +static u32 logfs_gc_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header sh; + struct logfs_object_header oh; + u64 ofs, ino, bix; + u32 seg_ofs, logical_segno, cleaned = 0; + int err, len, valid; + gc_level_t gc_level; + + LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); + + btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + BUG_ON(err); + gc_level = GC_LEVEL(sh.level); + logical_segno = be32_to_cpu(sh.segno); + if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = -1; + goto out; + } + + for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; + seg_ofs + sizeof(oh) < super->s_segsize; ) { + ofs = dev_ofs(sb, logical_segno, seg_ofs); + err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), + &oh); + BUG_ON(err); + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = super->s_segsize - 1; + goto out; + } + + ino = be64_to_cpu(oh.ino); + bix = be64_to_cpu(oh.bix); + len = sizeof(oh) + be16_to_cpu(oh.len); + valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); + if (valid == 1) { + logfs_cleanse_block(sb, ofs, ino, bix, gc_level); + cleaned += len; + } else if (valid == 2) { + /* Will be invalid upon journal commit */ + cleaned += len; + } + seg_ofs += len; + } +out: + btree_remove32(&super->s_reserved_segments, segno); + return cleaned; +} + +static struct gc_candidate *add_list(struct gc_candidate *cand, + struct candidate_list *list) +{ + struct rb_node **p = &list->rb_tree.rb_node; + struct rb_node *parent = NULL; + struct gc_candidate *cur; + int comp; + + cand->list = list; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct gc_candidate, rb_node); + + if (list->sort_by_ec) + comp = cand->erase_count < cur->erase_count; + else + comp = cand->valid < cur->valid; + + if (comp) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&cand->rb_node, parent, p); + rb_insert_color(&cand->rb_node, &list->rb_tree); + + if (list->count <= list->maxcount) { + list->count++; + return NULL; + } + cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); + rb_erase(&cand->rb_node, &list->rb_tree); + cand->list = NULL; + return cand; +} + +static void remove_from_list(struct gc_candidate *cand) +{ + struct candidate_list *list = cand->list; + + rb_erase(&cand->rb_node, &list->rb_tree); + list->count--; +} + +static void free_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + + btree_remove32(&super->s_cand_tree, cand->segno); + kfree(cand); +} + +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) +{ + struct gc_candidate *cand; + u32 segno; + + BUG_ON(list->count == 0); + + cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); + remove_from_list(cand); + segno = cand->segno; + if (ec) + *ec = cand->erase_count; + free_candidate(sb, cand); + return segno; +} + +/* + * We have several lists to manage segments with. The reserve_list is used to + * deal with bad blocks. We try to keep the best (lowest ec) segments on this + * list. + * The free_list contains free segments for normal usage. It usually gets the + * second pick after the reserve_list. But when the free_list is running short + * it is more important to keep the free_list full than to keep a reserve. + * + * Segments that are not free are put onto a per-level low_list. If we have + * to run garbage collection, we pick a candidate from there. All segments on + * those lists should have at least some free space so GC will make progress. + * + * And last we have the ec_list, which is used to pick segments for wear + * leveling. + * + * If all appropriate lists are full, we simply free the candidate and forget + * about that segment for a while. We have better candidates for each purpose. + */ +static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; + + if (cand->valid == 0) { + /* 100% free segments */ + log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_reserve_list); + if (cand) { + log_gc_noisy("add free segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_free_list); + } + } else { + /* good candidates for Garbage Collection */ + if (cand->valid < full) + cand = add_list(cand, &super->s_low_list[cand->dist]); + /* good candidates for wear leveling, + * segments that were recently written get ignored */ + if (cand) + cand = add_list(cand, &super->s_ec_list); + } + if (cand) + free_candidate(sb, cand); +} + +static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, + u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = kmalloc(sizeof(*cand), GFP_NOFS); + if (!cand) + return -ENOMEM; + + cand->segno = segno; + cand->valid = valid; + cand->erase_count = ec; + cand->dist = dist; + + btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); + __add_candidate(sb, cand); + return 0; +} + +static void remove_segment_from_lists(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = btree_lookup32(&super->s_cand_tree, segno); + if (cand) { + remove_from_list(cand); + free_candidate(sb, cand); + } +} + +static void scan_segment(struct super_block *sb, u32 segno) +{ + u32 valid, ec = 0; + gc_level_t gc_level = 0; + u8 dist; + + if (segment_is_reserved(sb, segno)) + return; + + remove_segment_from_lists(sb, segno); + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + if (valid == RESERVED) + return; + + dist = root_distance(sb, gc_level); + add_candidate(sb, segno, valid, ec, dist); +} + +static struct gc_candidate *first_in_list(struct candidate_list *list) +{ + if (list->count == 0) + return NULL; + return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); +} + +/* + * Find the best segment for garbage collection. Main criterion is + * the segment requiring the least effort to clean. Secondary + * criterion is to GC on the lowest level available. + * + * So we search the least effort segment on the lowest level first, + * then move up and pick another segment iff is requires significantly + * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. + */ +static struct gc_candidate *get_candidate(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i, max_dist; + struct gc_candidate *cand = NULL, *this; + + max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1); + + for (i = max_dist; i >= 0; i--) { + this = first_in_list(&super->s_low_list[i]); + if (!this) + continue; + if (!cand) + cand = this; + if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) + cand = this; + } + return cand; +} + +static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + gc_level_t gc_level; + u32 cleaned, valid, segno, ec; + u8 dist; + + if (!cand) { + log_gc("GC attempted, but no candidate found\n"); + return 0; + } + + segno = cand->segno; + dist = cand->dist; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + free_candidate(sb, cand); + log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", + segno, (u64)segno << super->s_segshift, + dist, no_free_segments(sb), valid, + super->s_free_bytes); + cleaned = logfs_gc_segment(sb, segno); + log_gc("GC segment #%02x complete - now %x valid\n", segno, + valid - cleaned); + BUG_ON(cleaned != valid); + return 1; +} + +static int logfs_gc_once(struct super_block *sb) +{ + struct gc_candidate *cand; + + cand = get_candidate(sb); + if (cand) + remove_from_list(cand); + return __logfs_gc_once(sb, cand); +} + +/* returns 1 if a wrap occurs, 0 otherwise */ +static int logfs_scan_some(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno; + int i, ret = 0; + + segno = super->s_sweeper; + for (i = SCAN_RATIO; i > 0; i--) { + segno++; + if (segno >= super->s_no_segs) { + segno = 0; + ret = 1; + /* Break out of the loop. We want to read a single + * block from the segment size on next invocation if + * SCAN_RATIO is set to match block size + */ + break; + } + + scan_segment(sb, segno); + } + super->s_sweeper = segno; + return ret; +} + +/* + * In principle, this function should loop forever, looking for GC candidates + * and moving data. LogFS is designed in such a way that this loop is + * guaranteed to terminate. + * + * Limiting the loop to some iterations serves purely to catch cases when + * these guarantees have failed. An actual endless loop is an obvious bug + * and should be reported as such. + */ +static void __logfs_gc_pass(struct super_block *sb, int target) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int round, progress, last_progress = 0; + + /* + * Doing too many changes to the segfile at once would result + * in a large number of aliases. Write the journal before + * things get out of hand. + */ + if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES) + logfs_write_anchor(sb); + + if (no_free_segments(sb) >= target && + super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + + log_gc("__logfs_gc_pass(%x)\n", target); + for (round = 0; round < SCAN_ROUNDS; ) { + if (no_free_segments(sb) >= target) + goto write_alias; + + /* Sync in-memory state with on-medium state in case they + * diverged */ + logfs_write_anchor(sb); + round += logfs_scan_some(sb); + if (no_free_segments(sb) >= target) + goto write_alias; + progress = logfs_gc_once(sb); + if (progress) + last_progress = round; + else if (round - last_progress > 2) + break; + continue; + + /* + * The goto logic is nasty, I just don't know a better way to + * code it. GC is supposed to ensure two things: + * 1. Enough free segments are available. + * 2. The number of aliases is bounded. + * When 1. is achieved, we take a look at 2. and write back + * some alias-containing blocks, if necessary. However, after + * each such write we need to go back to 1., as writes can + * consume free segments. + */ +write_alias: + if (super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + if (list_empty(&super->s_object_alias)) { + /* All aliases are still in btree */ + return; + } + log_gc("Write back one alias\n"); + block = list_entry(super->s_object_alias.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + /* + * To round off the nasty goto logic, we reset round here. It + * is a safety-net for GC not making any progress and limited + * to something reasonably small. If incremented it for every + * single alias, the loop could terminate rather quickly. + */ + round = 0; + } + LOGFS_BUG(sb); +} + +static int wl_ratelimit(struct super_block *sb, u64 *next_event) +{ + struct logfs_super *super = logfs_super(sb); + + if (*next_event < super->s_gec) { + *next_event = super->s_gec + WL_RATELIMIT; + return 0; + } + return 1; +} + +static void logfs_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *wl_cand, *free_cand; + + if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) + return; + + wl_cand = first_in_list(&super->s_ec_list); + if (!wl_cand) + return; + free_cand = first_in_list(&super->s_free_list); + if (!free_cand) + return; + + if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { + remove_from_list(wl_cand); + __logfs_gc_once(sb, wl_cand); + } +} + +/* + * The journal needs wear leveling as well. But moving the journal is an + * expensive operation so we try to avoid it as much as possible. And if we + * have to do it, we move the whole journal, not individual segments. + * + * Ratelimiting is not strictly necessary here, it mainly serves to avoid the + * calculations. First we check whether moving the journal would be a + * significant improvement. That means that a) the current journal segments + * have more wear than the future journal segments and b) the current journal + * segments have more wear than normal ostore segments. + * Rationale for b) is that we don't have to move the journal if it is aging + * less than the ostore, even if the reserve segments age even less (they are + * excluded from wear leveling, after all). + * Next we check that the superblocks have less wear than the journal. Since + * moving the journal requires writing the superblocks, we have to protect the + * superblocks even more than the journal. + * + * Also we double the acceptable wear difference, compared to ostore wear + * leveling. Journal data is read and rewritten rapidly, comparatively. So + * soft errors have much less time to accumulate and we allow the journal to + * be a bit worse than the ostore. + */ +static void logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + u32 min_journal_ec = -1, max_reserve_ec = 0; + int i; + + if (wl_ratelimit(sb, &super->s_wl_gec_journal)) + return; + + if (super->s_reserve_list.count < super->s_no_journal_segs) { + /* Reserve is not full enough to move complete journal */ + return; + } + + journal_for_each(i) + if (super->s_journal_seg[i]) + min_journal_ec = min(min_journal_ec, + super->s_journal_ec[i]); + cand = rb_entry(rb_first(&super->s_free_list.rb_tree), + struct gc_candidate, rb_node); + max_reserve_ec = cand->erase_count; + for (i = 0; i < 2; i++) { + struct logfs_segment_entry se; + u32 segno = seg_no(sb, super->s_sb_ofs[i]); + u32 ec; + + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + max_reserve_ec = max(max_reserve_ec, ec); + } + + if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { + do_logfs_journal_wl_pass(sb); + } +} + +void logfs_gc_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); + /* Write journal before free space is getting saturated with dirty + * objects. + */ + if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) + logfs_write_anchor(sb); + __logfs_gc_pass(sb, super->s_total_levels); + logfs_wl_pass(sb); + logfs_journal_wl_pass(sb); +} + +static int check_area(struct super_block *sb, int i) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[i]; + gc_level_t gc_level; + u32 cleaned, valid, ec; + u32 segno = area->a_segno; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + + if (!area->a_is_open) + return 0; + + if (super->s_devops->can_write_buf(sb, ofs) == 0) + return 0; + + printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs); + /* + * The device cannot write back the write buffer. Most likely the + * wbuf was already written out and the system crashed at some point + * before the journal commit happened. In that case we wouldn't have + * to do anything. But if the crash happened before the wbuf was + * written out correctly, we must GC this segment. So assume the + * worst and always do the GC run. + */ + area->a_is_open = 0; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + cleaned = logfs_gc_segment(sb, segno); + if (cleaned != valid) + return -EIO; + return 0; +} + +int logfs_check_areas(struct super_block *sb) +{ + int i, err; + + for_each_area(i) { + err = check_area(sb, i); + if (err) + return err; + } + return 0; +} + +static void logfs_init_candlist(struct candidate_list *list, int maxcount, + int sort_by_ec) +{ + list->count = 0; + list->maxcount = maxcount; + list->sort_by_ec = sort_by_ec; + list->rb_tree = RB_ROOT; +} + +int logfs_init_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); + logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); + logfs_init_candlist(&super->s_reserve_list, + super->s_bad_seg_reserve, 1); + for_each_area(i) + logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); + logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); + return 0; +} + +static void logfs_cleanup_list(struct super_block *sb, + struct candidate_list *list) +{ + struct gc_candidate *cand; + + while (list->count) { + cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, + rb_node); + remove_from_list(cand); + free_candidate(sb, cand); + } + BUG_ON(list->rb_tree.rb_node); +} + +void logfs_cleanup_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + if (!super->s_free_list.count) + return; + + /* + * FIXME: The btree may still contain a single empty node. So we + * call the grim visitor to clean up that mess. Btree code should + * do it for us, really. + */ + btree_grim_visitor32(&super->s_cand_tree, 0, NULL); + logfs_cleanup_list(sb, &super->s_free_list); + logfs_cleanup_list(sb, &super->s_reserve_list); + for_each_area(i) + logfs_cleanup_list(sb, &super->s_low_list[i]); + logfs_cleanup_list(sb, &super->s_ec_list); +} diff --git a/kernel/fs/logfs/inode.c b/kernel/fs/logfs/inode.c new file mode 100644 index 000000000..af49e2d69 --- /dev/null +++ b/kernel/fs/logfs/inode.c @@ -0,0 +1,426 @@ +/* + * fs/logfs/inode.c - inode handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +/* + * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes + * on the medium. It therefore also lacks a method to store the previous + * generation number for deleted inodes. Instead a single generation number + * is stored which will be used for new inodes. Being just a 32bit counter, + * this can obvious wrap relatively quickly. So we only reuse inodes if we + * know that a fair number of inodes can be created before we have to increment + * the generation again - effectively adding some bits to the counter. + * But being too aggressive here means we keep a very large and very sparse + * inode file, wasting space on indirect blocks. + * So what is a good value? Beats me. 64k seems moderately bad on both + * fronts, so let's use that for now... + * + * NFS sucks, as everyone already knows. + */ +#define INOS_PER_WRAP (0x10000) + +/* + * Logfs' requirement to read inodes for garbage collection makes life a bit + * harder. GC may have to read inodes that are in I_FREEING state, when they + * are being written out - and waiting for GC to make progress, naturally. + * + * So we cannot just call iget() or some variant of it, but first have to check + * whether the inode in question might be in I_FREEING state. Therefore we + * maintain our own per-sb list of "almost deleted" inodes and check against + * that list first. Normally this should be at most 1-2 entries long. + * + * Also, inodes have logfs-specific reference counting on top of what the vfs + * does. When .destroy_inode is called, normally the reference count will drop + * to zero and the inode gets deleted. But if GC accessed the inode, its + * refcount will remain nonzero and final deletion will have to wait. + * + * As a result we have two sets of functions to get/put inodes: + * logfs_safe_iget/logfs_safe_iput - safe to call from GC context + * logfs_iget/iput - normal version + */ +static struct kmem_cache *logfs_inode_cache; + +static DEFINE_SPINLOCK(logfs_inode_lock); + +static void logfs_inode_setops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFREG: + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFLNK: + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + default: + BUG(); + } +} + +static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode = iget_locked(sb, ino); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = logfs_read_inode(inode); + if (err || inode->i_nlink == 0) { + /* inode->i_nlink == 0 can be true when called from + * block validator */ + /* set i_nlink to 0 to prevent caching */ + clear_nlink(inode); + logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; + iget_failed(inode); + if (!err) + err = -ENOENT; + return ERR_PTR(err); + } + + logfs_inode_setops(inode); + unlock_new_inode(inode); + return inode; +} + +struct inode *logfs_iget(struct super_block *sb, ino_t ino) +{ + BUG_ON(ino == LOGFS_INO_MASTER); + BUG_ON(ino == LOGFS_INO_SEGFILE); + return __logfs_iget(sb, ino); +} + +/* + * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. + * this allows logfs_iput to do the right thing later + */ +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_inode *li; + + if (ino == LOGFS_INO_MASTER) + return super->s_master_inode; + if (ino == LOGFS_INO_SEGFILE) + return super->s_segfile_inode; + + spin_lock(&logfs_inode_lock); + list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) + if (li->vfs_inode.i_ino == ino) { + li->li_refcount++; + spin_unlock(&logfs_inode_lock); + *is_cached = 1; + return &li->vfs_inode; + } + spin_unlock(&logfs_inode_lock); + + *is_cached = 0; + return __logfs_iget(sb, ino); +} + +static void logfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); +} + +static void __logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(li->li_block); + list_del(&li->li_freeing_list); + call_rcu(&inode->i_rcu, logfs_i_callback); +} + +static void __logfs_destroy_meta_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + BUG_ON(li->li_block); + call_rcu(&inode->i_rcu, logfs_i_callback); +} + +static void logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (inode->i_ino < LOGFS_RESERVED_INOS) { + /* + * The reserved inodes are never destroyed unless we are in + * unmont path. + */ + __logfs_destroy_meta_inode(inode); + return; + } + + BUG_ON(list_empty(&li->li_freeing_list)); + spin_lock(&logfs_inode_lock); + li->li_refcount--; + if (li->li_refcount == 0) + __logfs_destroy_inode(inode); + spin_unlock(&logfs_inode_lock); +} + +void logfs_safe_iput(struct inode *inode, int is_cached) +{ + if (inode->i_ino == LOGFS_INO_MASTER) + return; + if (inode->i_ino == LOGFS_INO_SEGFILE) + return; + + if (is_cached) { + logfs_destroy_inode(inode); + return; + } + + iput(inode); +} + +static void logfs_init_inode(struct super_block *sb, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + li->li_flags = 0; + li->li_height = 0; + li->li_used_bytes = 0; + li->li_block = NULL; + i_uid_write(inode, 0); + i_gid_write(inode, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + li->li_refcount = 1; + INIT_LIST_HEAD(&li->li_freeing_list); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + + return; +} + +static struct inode *logfs_alloc_inode(struct super_block *sb) +{ + struct logfs_inode *li; + + li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS); + if (!li) + return NULL; + logfs_init_inode(sb, &li->vfs_inode); + return &li->vfs_inode; +} + +/* + * In logfs inodes are written to an inode file. The inode file, like any + * other file, is managed with a inode. The inode file's inode, aka master + * inode, requires special handling in several respects. First, it cannot be + * written to the inode file, so it is stored in the journal instead. + * + * Secondly, this inode cannot be written back and destroyed before all other + * inodes have been written. The ordering is important. Linux' VFS is happily + * unaware of the ordering constraint and would ordinarily destroy the master + * inode at umount time while other inodes are still in use and dirty. Not + * good. + * + * So logfs makes sure the master inode is not written until all other inodes + * have been destroyed. Sadly, this method has another side-effect. The VFS + * will notice one remaining inode and print a frightening warning message. + * Worse, it is impossible to judge whether such a warning was caused by the + * master inode or any other inodes have leaked as well. + * + * Our attempt of solving this is with logfs_new_meta_inode() below. Its + * purpose is to create a new inode that will not trigger the warning if such + * an inode is still in use. An ugly hack, no doubt. Suggections for + * improvement are welcome. + * + * AV: that's what ->put_super() is for... + */ +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_mode = S_IFREG; + inode->i_ino = ino; + inode->i_data.a_ops = &logfs_reg_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_NOFS); + + return inode; +} + +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int err; + + inode = logfs_new_meta_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + + err = logfs_read_inode(inode); + if (err) { + iput(inode); + return ERR_PTR(err); + } + logfs_inode_setops(inode); + return inode; +} + +static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + long flags = WF_LOCK; + + /* Can only happen if creat() failed. Safe to skip. */ + if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) + return 0; + + ret = __logfs_write_inode(inode, NULL, flags); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +/* called with inode->i_lock held */ +static int logfs_drop_inode(struct inode *inode) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_inode *li = logfs_inode(inode); + + spin_lock(&logfs_inode_lock); + list_move(&li->li_freeing_list, &super->s_freeing_list); + spin_unlock(&logfs_inode_lock); + return generic_drop_inode(inode); +} + +static void logfs_set_ino_generation(struct super_block *sb, + struct inode *inode) +{ + struct logfs_super *super = logfs_super(sb); + u64 ino; + + mutex_lock(&super->s_journal_mutex); + ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1); + super->s_last_ino = ino; + super->s_inos_till_wrap--; + if (super->s_inos_till_wrap < 0) { + super->s_last_ino = LOGFS_RESERVED_INOS; + super->s_generation++; + super->s_inos_till_wrap = INOS_PER_WRAP; + } + inode->i_ino = ino; + inode->i_generation = super->s_generation; + mutex_unlock(&super->s_journal_mutex); +} + +struct inode *logfs_new_inode(struct inode *dir, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + logfs_init_inode(sb, inode); + + /* inherit parent flags */ + logfs_inode(inode)->li_flags |= + logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; + + inode->i_mode = mode; + logfs_set_ino_generation(sb, inode); + + inode_init_owner(inode, dir, mode); + logfs_inode_setops(inode); + insert_inode_hash(inode); + + return inode; +} + +static void logfs_init_once(void *_li) +{ + struct logfs_inode *li = _li; + int i; + + li->li_flags = 0; + li->li_used_bytes = 0; + li->li_refcount = 1; + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + inode_init_once(&li->vfs_inode); +} + +static int logfs_sync_fs(struct super_block *sb, int wait) +{ + logfs_get_wblocks(sb, NULL, WF_LOCK); + logfs_write_anchor(sb); + logfs_put_wblocks(sb, NULL, WF_LOCK); + return 0; +} + +static void logfs_put_super(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + /* kill the meta-inodes */ + iput(super->s_segfile_inode); + iput(super->s_master_inode); + iput(super->s_mapping_inode); +} + +const struct super_operations logfs_super_operations = { + .alloc_inode = logfs_alloc_inode, + .destroy_inode = logfs_destroy_inode, + .evict_inode = logfs_evict_inode, + .drop_inode = logfs_drop_inode, + .put_super = logfs_put_super, + .write_inode = logfs_write_inode, + .statfs = logfs_statfs, + .sync_fs = logfs_sync_fs, +}; + +int logfs_init_inode_cache(void) +{ + logfs_inode_cache = kmem_cache_create("logfs_inode_cache", + sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + logfs_init_once); + if (!logfs_inode_cache) + return -ENOMEM; + return 0; +} + +void logfs_destroy_inode_cache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(logfs_inode_cache); +} diff --git a/kernel/fs/logfs/journal.c b/kernel/fs/logfs/journal.c new file mode 100644 index 000000000..2a09b8d73 --- /dev/null +++ b/kernel/fs/logfs/journal.c @@ -0,0 +1,894 @@ +/* + * fs/logfs/journal.c - journal handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include + +static void logfs_calc_free(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 reserve, no_segs = super->s_no_segs; + s64 free; + int i; + + /* superblock segments */ + no_segs -= 2; + super->s_no_journal_segs = 0; + /* journal */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + no_segs--; + super->s_no_journal_segs++; + } + + /* open segments plus one extra per level for GC */ + no_segs -= 2 * super->s_total_levels; + + free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); + free -= super->s_used_bytes; + /* just a bit extra */ + free -= super->s_total_levels * 4096; + + /* Bad blocks are 'paid' for with speed reserve - the filesystem + * simply gets slower as bad blocks accumulate. Until the bad blocks + * exceed the speed reserve - then the filesystem gets smaller. + */ + reserve = super->s_bad_segments + super->s_bad_seg_reserve; + reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; + reserve = max(reserve, super->s_speed_reserve); + free -= reserve; + if (free < 0) + free = 0; + + super->s_free_bytes = free; +} + +static void reserve_sb_and_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int i, err; + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + journal_for_each(i) { + if (!super->s_journal_seg[i]) + continue; + err = btree_insert32(head, super->s_journal_seg[i], (void *)1, + GFP_KERNEL); + BUG_ON(err); + } +} + +static void read_dynsb(struct super_block *sb, + struct logfs_je_dynsb *dynsb) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec = be64_to_cpu(dynsb->ds_gec); + super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); + super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); + super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); + super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); + super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); + super->s_generation = be32_to_cpu(dynsb->ds_generation); +} + +static void read_anchor(struct super_block *sb, + struct logfs_je_anchor *da) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + super->s_last_ino = be64_to_cpu(da->da_last_ino); + li->li_flags = 0; + li->li_height = da->da_height; + i_size_write(inode, be64_to_cpu(da->da_size)); + li->li_used_bytes = be64_to_cpu(da->da_used_bytes); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(da->da_data[i]); +} + +static void read_erasecount(struct super_block *sb, + struct logfs_je_journal_ec *ec) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + journal_for_each(i) + super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); +} + +static int read_area(struct super_block *sb, struct logfs_je_area *a) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[a->gc_level]; + u64 ofs; + u32 writemask = ~(super->s_writesize - 1); + + if (a->gc_level >= LOGFS_NO_AREAS) + return -EIO; + if (a->vim != VIM_DEFAULT) + return -EIO; /* TODO: close area and continue */ + + area->a_used_bytes = be32_to_cpu(a->used_bytes); + area->a_written_bytes = area->a_used_bytes & writemask; + area->a_segno = be32_to_cpu(a->segno); + if (area->a_segno) + area->a_is_open = 1; + + ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + if (super->s_writesize > 1) + return logfs_buf_recover(area, ofs, a + 1, super->s_writesize); + else + return logfs_buf_recover(area, ofs, NULL, 0); +} + +static void *unpack(void *from, void *to) +{ + struct logfs_journal_header *jh = from; + void *data = from + sizeof(struct logfs_journal_header); + int err; + size_t inlen, outlen; + + inlen = be16_to_cpu(jh->h_len); + outlen = be16_to_cpu(jh->h_datalen); + + if (jh->h_compr == COMPR_NONE) + memcpy(to, data, inlen); + else { + err = logfs_uncompress(data, to, inlen, outlen); + BUG_ON(err); + } + return to; +} + +static int __read_je_header(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + u16 type, len, datalen; + int err; + + /* read header only */ + err = wbuf_read(sb, ofs, sizeof(*jh), jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if (len > sb->s_blocksize) + return -EIO; + if ((type < JE_FIRST) || (type > JE_LAST)) + return -EIO; + if (datalen > bufsize) + return -EIO; + return 0; +} + +static int __read_je_payload(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + u16 len; + int err; + + len = be16_to_cpu(jh->h_len); + err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); + if (err) + return err; + if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { + /* Old code was confused. It forgot about the header length + * and stopped calculating the crc 16 bytes before the end + * of data - ick! + * FIXME: Remove this hack once the old code is fixed. + */ + if (jh->h_crc == logfs_crc32(jh, len, 4)) + WARN_ON_ONCE(1); + else + return -EIO; + } + return 0; +} + +/* + * jh needs to be large enough to hold the complete entry, not just the header + */ +static int __read_je(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + int err; + + err = __read_je_header(sb, ofs, jh); + if (err) + return err; + return __read_je_payload(sb, ofs, jh); +} + +static int read_je(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + void *scratch = super->s_je; + u16 type, datalen; + int err; + + err = __read_je(sb, ofs, jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + datalen = be16_to_cpu(jh->h_datalen); + + switch (type) { + case JE_DYNSB: + read_dynsb(sb, unpack(jh, scratch)); + break; + case JE_ANCHOR: + read_anchor(sb, unpack(jh, scratch)); + break; + case JE_ERASECOUNT: + read_erasecount(sb, unpack(jh, scratch)); + break; + case JE_AREA: + err = read_area(sb, unpack(jh, scratch)); + break; + case JE_OBJ_ALIAS: + err = logfs_load_object_aliases(sb, unpack(jh, scratch), + datalen); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return err; +} + +static int logfs_read_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); + u32 h_ofs, last_ofs = 0; + u16 len, datalen, last_len = 0; + int i, err; + + /* search for most recent commit */ + for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { + ofs = seg_ofs + h_ofs; + err = __read_je_header(sb, ofs, jh); + if (err) + continue; + if (jh->h_type != cpu_to_be16(JE_COMMIT)) + continue; + err = __read_je_payload(sb, ofs, jh); + if (err) + continue; + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if ((datalen > sizeof(super->s_je_array)) || + (datalen % sizeof(__be64))) + continue; + last_ofs = h_ofs; + last_len = datalen; + h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); + } + /* read commit */ + if (last_ofs == 0) + return -ENOENT; + ofs = seg_ofs + last_ofs; + log_journal("Read commit from %llx\n", ofs); + err = __read_je(sb, ofs, jh); + BUG_ON(err); /* We should have caught it in the scan loop already */ + if (err) + return err; + /* uncompress */ + unpack(jh, super->s_je_array); + super->s_no_je = last_len / sizeof(__be64); + /* iterate over array */ + for (i = 0; i < super->s_no_je; i++) { + err = read_je(sb, be64_to_cpu(super->s_je_array[i])); + if (err) + return err; + } + super->s_journal_area->a_segno = segno; + return 0; +} + +static u64 read_gec(struct super_block *sb, u32 segno) +{ + struct logfs_segment_header sh; + __be32 crc; + int err; + + if (!segno) + return 0; + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + if (err) + return 0; + crc = logfs_crc32(&sh, sizeof(sh), 4); + if (crc != sh.crc) { + WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); + /* Most likely it was just erased */ + return 0; + } + return be64_to_cpu(sh.gec); +} + +static int logfs_read_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 gec[LOGFS_JOURNAL_SEGS], max; + u32 segno; + int i, max_i; + + max = 0; + max_i = -1; + journal_for_each(i) { + segno = super->s_journal_seg[i]; + gec[i] = read_gec(sb, super->s_journal_seg[i]); + if (gec[i] > max) { + max = gec[i]; + max_i = i; + } + } + if (max_i == -1) + return -EIO; + /* FIXME: Try older segments in case of error */ + return logfs_read_segment(sb, super->s_journal_seg[max_i]); +} + +/* + * First search the current segment (outer loop), then pick the next segment + * in the array, skipping any zero entries (inner loop). + */ +static void journal_get_free_segment(struct logfs_area *area) +{ + struct logfs_super *super = logfs_super(area->a_sb); + int i; + + journal_for_each(i) { + if (area->a_segno != super->s_journal_seg[i]) + continue; + + do { + i++; + if (i == LOGFS_JOURNAL_SEGS) + i = 0; + } while (!super->s_journal_seg[i]); + + area->a_segno = super->s_journal_seg[i]; + area->a_erase_count = ++(super->s_journal_ec[i]); + log_journal("Journal now at %x (ec %x)\n", area->a_segno, + area->a_erase_count); + return; + } + BUG(); +} + +static void journal_get_erase_count(struct logfs_area *area) +{ + /* erase count is stored globally and incremented in + * journal_get_free_segment() - nothing to do here */ +} + +static int journal_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + union { + struct logfs_segment_header sh; + unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)]; + } u; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno, 1); + if (err) + return err; + + memset(&u, 0, sizeof(u)); + u.sh.pad = 0; + u.sh.type = SEG_JOURNAL; + u.sh.level = 0; + u.sh.segno = cpu_to_be32(area->a_segno); + u.sh.ec = cpu_to_be32(area->a_erase_count); + u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4); + + /* This causes a bug in segment.c. Not yet. */ + //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(u); + logfs_buf_write(area, ofs, &u, sizeof(u)); + return 0; +} + +static size_t __logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t len, size_t datalen, + u16 type, u8 compr) +{ + jh->h_len = cpu_to_be16(len); + jh->h_type = cpu_to_be16(type); + jh->h_datalen = cpu_to_be16(datalen); + jh->h_compr = compr; + jh->h_pad[0] = 'H'; + jh->h_pad[1] = 'E'; + jh->h_pad[2] = 'A'; + jh->h_pad[3] = 'D'; + jh->h_pad[4] = 'R'; + jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); + return ALIGN(len, 16) + sizeof(*jh); +} + +static size_t logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t datalen, u16 type) +{ + size_t len = datalen; + + return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); +} + +static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) +{ + return LOGFS_JOURNAL_SEGS * sizeof(__be32); +} + +static void *logfs_write_erasecount(struct super_block *sb, void *_ec, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_journal_ec *ec = _ec; + int i; + + journal_for_each(i) + ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); + *type = JE_ERASECOUNT; + *len = logfs_journal_erasecount_size(super); + return ec; +} + +static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, + size_t ignore2) +{ + struct logfs_shadow *shadow = _shadow; + struct super_block *sb = (void *)_sb; + struct logfs_super *super = logfs_super(sb); + + /* consume new space */ + super->s_free_bytes -= shadow->new_len; + super->s_used_bytes += shadow->new_len; + super->s_dirty_used_bytes -= shadow->new_len; + + /* free up old space */ + super->s_free_bytes += shadow->old_len; + super->s_used_bytes -= shadow->old_len; + super->s_dirty_free_bytes -= shadow->old_len; + + logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); + logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); + + log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + mempool_free(shadow, super->s_shadow_pool); +} + +static void account_shadows(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + struct shadow_tree *tree = &super->s_shadow_tree; + + btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); + btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + btree_grim_visitor32(&tree->segment_map, 0, NULL); + tree->no_shadowed_segments = 0; + + if (li->li_block) { + /* + * We never actually use the structure, when attached to the + * master inode. But it is easier to always free it here than + * to have checks in several places elsewhere when allocating + * it. + */ + li->li_block->ops->free_block(sb, li->li_block); + } + BUG_ON((s64)li->li_used_bytes < 0); +} + +static void *__logfs_write_anchor(struct super_block *sb, void *_da, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_anchor *da = _da; + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + da->da_height = li->li_height; + da->da_last_ino = cpu_to_be64(super->s_last_ino); + da->da_size = cpu_to_be64(i_size_read(inode)); + da->da_used_bytes = cpu_to_be64(li->li_used_bytes); + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + da->da_data[i] = cpu_to_be64(li->li_data[i]); + *type = JE_ANCHOR; + *len = sizeof(*da); + return da; +} + +static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_dynsb *dynsb = _dynsb; + + dynsb->ds_gec = cpu_to_be64(super->s_gec); + dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); + dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); + dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); + dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); + dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); + dynsb->ds_generation = cpu_to_be32(super->s_generation); + *type = JE_DYNSB; + *len = sizeof(*dynsb); + return dynsb; +} + +static void write_wbuf(struct super_block *sb, struct logfs_area *area, + void *wbuf) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + u64 ofs; + pgoff_t index; + int page_ofs; + struct page *page; + + ofs = dev_ofs(sb, area->a_segno, + area->a_used_bytes & ~(super->s_writesize - 1)); + index = ofs >> PAGE_SHIFT; + page_ofs = ofs & (PAGE_SIZE - 1); + + page = find_or_create_page(mapping, index, GFP_NOFS); + BUG_ON(!page); + memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); + unlock_page(page); +} + +static void *logfs_write_area(struct super_block *sb, void *_a, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[super->s_sum_index]; + struct logfs_je_area *a = _a; + + a->vim = VIM_DEFAULT; + a->gc_level = super->s_sum_index; + a->used_bytes = cpu_to_be32(area->a_used_bytes); + a->segno = cpu_to_be32(area->a_segno); + if (super->s_writesize > 1) + write_wbuf(sb, area, a + 1); + + *type = JE_AREA; + *len = sizeof(*a) + super->s_writesize; + return a; +} + +static void *logfs_write_commit(struct super_block *sb, void *h, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + + *type = JE_COMMIT; + *len = super->s_no_je * sizeof(__be64); + return super->s_je_array; +} + +static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + void *header = super->s_compressed_je; + void *data = header + sizeof(struct logfs_journal_header); + ssize_t compr_len, pad_len; + u8 compr = COMPR_ZLIB; + + if (len == 0) + return logfs_write_header(super, header, 0, type); + + compr_len = logfs_compress(buf, data, len, sb->s_blocksize); + if (compr_len < 0 || type == JE_ANCHOR) { + memcpy(data, buf, len); + compr_len = len; + compr = COMPR_NONE; + } + + pad_len = ALIGN(compr_len, 16); + memset(data + compr_len, 0, pad_len - compr_len); + + return __logfs_write_header(super, header, compr_len, len, type, compr); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, + int must_pad) +{ + u32 writesize = logfs_super(area->a_sb)->s_writesize; + s32 ofs; + int ret; + + ret = logfs_open_area(area, *bytes); + if (ret) + return -EAGAIN; + + ofs = area->a_used_bytes; + area->a_used_bytes += *bytes; + + if (must_pad) { + area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); + *bytes = area->a_used_bytes - ofs; + } + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, + size_t buf_len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct logfs_journal_header *jh = super->s_compressed_je; + size_t len; + int must_pad = 0; + s64 ofs; + + len = __logfs_write_je(sb, buf, type, buf_len); + if (jh->h_type == cpu_to_be16(JE_COMMIT)) + must_pad = 1; + + ofs = logfs_get_free_bytes(area, &len, must_pad); + if (ofs < 0) + return ofs; + logfs_buf_write(area, ofs, super->s_compressed_je, len); + BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES); + super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); + return 0; +} + +static int logfs_write_je(struct super_block *sb, + void* (*write)(struct super_block *sb, void *scratch, + u16 *type, size_t *len)) +{ + void *buf; + size_t len; + u16 type; + + buf = write(sb, logfs_super(sb)->s_je, &type, &len); + return logfs_write_je_buf(sb, buf, type, len); +} + +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_obj_alias *oa = super->s_je; + int err = 0, fill = super->s_je_fill; + + log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", + fill, ino, bix, level, child_no, be64_to_cpu(val)); + oa[fill].ino = cpu_to_be64(ino); + oa[fill].bix = cpu_to_be64(bix); + oa[fill].val = val; + oa[fill].level = (__force u8)level; + oa[fill].child_no = cpu_to_be16(child_no); + fill++; + if (fill >= sb->s_blocksize / sizeof(*oa)) { + err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); + fill = 0; + } + + super->s_je_fill = fill; + return err; +} + +static int logfs_write_obj_aliases(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + log_journal("logfs_write_obj_aliases: %d aliases to write\n", + super->s_no_object_aliases); + super->s_je_fill = 0; + err = logfs_write_obj_aliases_pagecache(sb); + if (err) + return err; + + if (super->s_je_fill) + err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS, + super->s_je_fill + * sizeof(struct logfs_obj_alias)); + return err; +} + +/* + * Write all journal entries. The goto logic ensures that all journal entries + * are written whenever a new segment is used. It is ugly and potentially a + * bit wasteful, but robustness is more important. With this we can *always* + * erase all journal segments except the one containing the most recent commit. + */ +void logfs_write_anchor(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + int i, err; + + if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY)) + return; + super->s_flags &= ~LOGFS_SB_FLAG_DIRTY; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + mutex_lock(&super->s_journal_mutex); + + /* Do this first or suffer corruption */ + logfs_sync_segments(sb); + account_shadows(sb); + +again: + super->s_no_je = 0; + for_each_area(i) { + if (!super->s_area[i]->a_is_open) + continue; + super->s_sum_index = i; + err = logfs_write_je(sb, logfs_write_area); + if (err) + goto again; + } + err = logfs_write_obj_aliases(sb); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_erasecount); + if (err) + goto again; + err = logfs_write_je(sb, __logfs_write_anchor); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_dynsb); + if (err) + goto again; + /* + * Order is imperative. First we sync all writes, including the + * non-committed journal writes. Then we write the final commit and + * sync the current journal segment. + * There is a theoretical bug here. Syncing the journal segment will + * write a number of journal entries and the final commit. All these + * are written in a single operation. If the device layer writes the + * data back-to-front, the commit will precede the other journal + * entries, leaving a race window. + * Two fixes are possible. Preferred is to fix the device layer to + * ensure writes happen front-to-back. Alternatively we can insert + * another logfs_sync_area() super->s_devops->sync() combo before + * writing the commit. + */ + /* + * On another subject, super->s_devops->sync is usually not necessary. + * Unless called from sys_sync or friends, a barrier would suffice. + */ + super->s_devops->sync(sb); + err = logfs_write_je(sb, logfs_write_commit); + if (err) + goto again; + log_journal("Write commit to %llx\n", + be64_to_cpu(super->s_je_array[super->s_no_je - 1])); + logfs_sync_area(area); + BUG_ON(area->a_used_bytes != area->a_written_bytes); + super->s_devops->sync(sb); + + mutex_unlock(&super->s_journal_mutex); + return; +} + +void do_logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct btree_head32 *head = &super->s_reserved_segments; + u32 segno, ec; + int i, err; + + log_journal("Journal requires wear-leveling.\n"); + /* Drop old segments */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + btree_remove32(head, super->s_journal_seg[i]); + logfs_set_segment_unreserved(sb, + super->s_journal_seg[i], + super->s_journal_ec[i]); + super->s_journal_seg[i] = 0; + super->s_journal_ec[i] = 0; + } + /* Get new segments */ + for (i = 0; i < super->s_no_journal_segs; i++) { + segno = get_best_cand(sb, &super->s_reserve_list, &ec); + super->s_journal_seg[i] = segno; + super->s_journal_ec[i] = ec; + logfs_set_segment_reserved(sb, segno); + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + BUG_ON(err); /* mempool should prevent this */ + err = logfs_erase_segment(sb, segno, 1); + BUG_ON(err); /* FIXME: remount-ro would be nicer */ + } + /* Manually move journal_area */ + freeseg(sb, area->a_segno); + area->a_segno = super->s_journal_seg[0]; + area->a_is_open = 0; + area->a_used_bytes = 0; + /* Write journal */ + logfs_write_anchor(sb); + /* Write superblocks */ + err = logfs_write_sb(sb); + BUG_ON(err); +} + +static const struct logfs_area_ops journal_area_ops = { + .get_free_segment = journal_get_free_segment, + .get_erase_count = journal_get_erase_count, + .erase_segment = journal_erase_segment, +}; + +int logfs_init_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + int ret = -ENOMEM; + + mutex_init(&super->s_journal_mutex); + btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); + + super->s_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_je) + return ret; + + super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_compressed_je) + return ret; + + super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); + if (IS_ERR(super->s_master_inode)) + return PTR_ERR(super->s_master_inode); + + ret = logfs_read_journal(sb); + if (ret) + return -EIO; + + reserve_sb_and_journal(sb); + logfs_calc_free(sb); + + super->s_journal_area->a_ops = &journal_area_ops; + return 0; +} + +void logfs_cleanup_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); + + kfree(super->s_compressed_je); + kfree(super->s_je); +} diff --git a/kernel/fs/logfs/logfs.h b/kernel/fs/logfs/logfs.h new file mode 100644 index 000000000..5f0937609 --- /dev/null +++ b/kernel/fs/logfs/logfs.h @@ -0,0 +1,736 @@ +/* + * fs/logfs/logfs.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Private header for logfs. + */ +#ifndef FS_LOGFS_LOGFS_H +#define FS_LOGFS_LOGFS_H + +#undef __CHECK_ENDIAN__ +#define __CHECK_ENDIAN__ + +#include +#include +#include +#include +#include +#include +#include +#include "logfs_abi.h" + +#define LOGFS_DEBUG_SUPER (0x0001) +#define LOGFS_DEBUG_SEGMENT (0x0002) +#define LOGFS_DEBUG_JOURNAL (0x0004) +#define LOGFS_DEBUG_DIR (0x0008) +#define LOGFS_DEBUG_FILE (0x0010) +#define LOGFS_DEBUG_INODE (0x0020) +#define LOGFS_DEBUG_READWRITE (0x0040) +#define LOGFS_DEBUG_GC (0x0080) +#define LOGFS_DEBUG_GC_NOISY (0x0100) +#define LOGFS_DEBUG_ALIASES (0x0200) +#define LOGFS_DEBUG_BLOCKMOVE (0x0400) +#define LOGFS_DEBUG_ALL (0xffffffff) + +#define LOGFS_DEBUG (0x01) +/* + * To enable specific log messages, simply define LOGFS_DEBUG to match any + * or all of the above. + */ +#ifndef LOGFS_DEBUG +#define LOGFS_DEBUG (0) +#endif + +#define log_cond(cond, fmt, arg...) do { \ + if (cond) \ + printk(KERN_DEBUG fmt, ##arg); \ +} while (0) + +#define log_super(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg) +#define log_segment(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg) +#define log_journal(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg) +#define log_dir(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg) +#define log_file(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg) +#define log_inode(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg) +#define log_readwrite(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg) +#define log_gc(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg) +#define log_gc_noisy(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg) +#define log_aliases(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg) +#define log_blockmove(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg) + +#define PG_pre_locked PG_owner_priv_1 +#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) +#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) +#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) + +/* FIXME: This should really be somewhere in the 64bit area. */ +#define LOGFS_LINK_MAX (1<<30) + +/* Read-only filesystem */ +#define LOGFS_SB_FLAG_RO 0x0001 +#define LOGFS_SB_FLAG_DIRTY 0x0002 +#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004 +#define LOGFS_SB_FLAG_SHUTDOWN 0x0008 + +/* Write Control Flags */ +#define WF_LOCK 0x01 /* take write lock */ +#define WF_WRITE 0x02 /* write block */ +#define WF_DELETE 0x04 /* delete old block */ + +typedef u8 __bitwise level_t; +typedef u8 __bitwise gc_level_t; + +#define LEVEL(level) ((__force level_t)(level)) +#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level)) + +#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \ + (__force level_t)((__force u8)(level) - 1) ) + +/** + * struct logfs_area - area management information + * + * @a_sb: the superblock this area belongs to + * @a_is_open: 1 if the area is currently open, else 0 + * @a_segno: segment number of area + * @a_written_bytes: number of bytes already written back + * @a_used_bytes: number of used bytes + * @a_ops: area operations (either journal or ostore) + * @a_erase_count: erase count + * @a_level: GC level + */ +struct logfs_area { /* a segment open for writing */ + struct super_block *a_sb; + int a_is_open; + u32 a_segno; + u32 a_written_bytes; + u32 a_used_bytes; + const struct logfs_area_ops *a_ops; + u32 a_erase_count; + gc_level_t a_level; +}; + +/** + * struct logfs_area_ops - area operations + * + * @get_free_segment: fill area->ofs with the offset of a free segment + * @get_erase_count: fill area->erase_count (needs area->ofs) + * @erase_segment: erase and setup segment + */ +struct logfs_area_ops { + void (*get_free_segment)(struct logfs_area *area); + void (*get_erase_count)(struct logfs_area *area); + int (*erase_segment)(struct logfs_area *area); +}; + +struct logfs_super; /* forward */ +/** + * struct logfs_device_ops - device access operations + * + * @readpage: read one page (mm page) + * @writeseg: write one segment. may be a partial segment + * @erase: erase one segment + * @read: read from the device + * @erase: erase part of the device + * @can_write_buf: decide whether wbuf can be written to ofs + */ +struct logfs_device_ops { + struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); + struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs); + int (*write_sb)(struct super_block *sb, struct page *page); + int (*readpage)(void *_sb, struct page *page); + void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); + int (*erase)(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write); + int (*can_write_buf)(struct super_block *sb, u64 ofs); + void (*sync)(struct super_block *sb); + void (*put_device)(struct logfs_super *s); +}; + +/** + * struct candidate_list - list of similar candidates + */ +struct candidate_list { + struct rb_root rb_tree; + int count; + int maxcount; + int sort_by_ec; +}; + +/** + * struct gc_candidate - "candidate" segment to be garbage collected next + * + * @list: list (either free of low) + * @segno: segment number + * @valid: number of valid bytes + * @erase_count: erase count of segment + * @dist: distance from tree root + * + * Candidates can be on two lists. The free list contains electees rather + * than candidates - segments that no longer contain any valid data. The + * low list contains candidates to be picked for GC. It should be kept + * short. It is not required to always pick a perfect candidate. In the + * worst case GC will have to move more data than absolutely necessary. + */ +struct gc_candidate { + struct rb_node rb_node; + struct candidate_list *list; + u32 segno; + u32 valid; + u32 erase_count; + u8 dist; +}; + +/** + * struct logfs_journal_entry - temporary structure used during journal scan + * + * @used: + * @version: normalized version + * @len: length + * @offset: offset + */ +struct logfs_journal_entry { + int used; + s16 version; + u16 len; + u16 datalen; + u64 offset; +}; + +enum transaction_state { + CREATE_1 = 1, + CREATE_2, + UNLINK_1, + UNLINK_2, + CROSS_RENAME_1, + CROSS_RENAME_2, + TARGET_RENAME_1, + TARGET_RENAME_2, + TARGET_RENAME_3 +}; + +/** + * struct logfs_transaction - essential fields to support atomic dirops + * + * @ino: target inode + * @dir: inode of directory containing dentry + * @pos: pos of dentry in directory + */ +struct logfs_transaction { + enum transaction_state state; + u64 ino; + u64 dir; + u64 pos; +}; + +/** + * struct logfs_shadow - old block in the shadow of a not-yet-committed new one + * @old_ofs: offset of old block on medium + * @new_ofs: offset of new block on medium + * @ino: inode number + * @bix: block index + * @old_len: size of old block, including header + * @new_len: size of new block, including header + * @level: block level + */ +struct logfs_shadow { + u64 old_ofs; + u64 new_ofs; + u64 ino; + u64 bix; + int old_len; + int new_len; + gc_level_t gc_level; +}; + +/** + * struct shadow_tree + * @new: shadows where old_ofs==0, indexed by new_ofs + * @old: shadows where old_ofs!=0, indexed by old_ofs + * @segment_map: bitfield of segments containing shadows + * @no_shadowed_segment: number of segments containing shadows + */ +struct shadow_tree { + struct btree_head64 new; + struct btree_head64 old; + struct btree_head32 segment_map; + int no_shadowed_segments; +}; + +struct object_alias_item { + struct list_head list; + __be64 val; + int child_no; +}; + +/** + * struct logfs_block - contains any block state + * @type: indirect block or inode + * @full: number of fully populated children + * @partial: number of partially populated children + * + * Most blocks are directly represented by page cache pages. But when a block + * becomes dirty, is part of a transaction, contains aliases or is otherwise + * special, a struct logfs_block is allocated to track the additional state. + * Inodes are very similar to indirect blocks, so they can also get one of + * these structures added when appropriate. + */ +#define BLOCK_INDIRECT 1 /* Indirect block */ +#define BLOCK_INODE 2 /* Inode */ +struct logfs_block_ops; +struct logfs_block { + struct list_head alias_list; + struct list_head item_list; + struct super_block *sb; + u64 ino; + u64 bix; + level_t level; + struct page *page; + struct inode *inode; + struct logfs_transaction *ta; + unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; + struct logfs_block_ops *ops; + int full; + int partial; + int reserved_bytes; +}; + +typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +struct logfs_block_ops { + void (*write_block)(struct logfs_block *block); + void (*free_block)(struct super_block *sb, struct logfs_block*block); + int (*write_alias)(struct super_block *sb, + struct logfs_block *block, + write_alias_t *write_one_alias); +}; + +#define MAX_JOURNAL_ENTRIES 256 + +struct logfs_super { + struct mtd_info *s_mtd; /* underlying device */ + struct block_device *s_bdev; /* underlying device */ + const struct logfs_device_ops *s_devops;/* device access */ + struct inode *s_master_inode; /* inode file */ + struct inode *s_segfile_inode; /* segment file */ + struct inode *s_mapping_inode; /* device mapping */ + atomic_t s_pending_writes; /* outstanting bios */ + long s_flags; + mempool_t *s_btree_pool; /* for btree nodes */ + mempool_t *s_alias_pool; /* aliases in segment.c */ + u64 s_feature_incompat; + u64 s_feature_ro_compat; + u64 s_feature_compat; + u64 s_feature_flags; + u64 s_sb_ofs[2]; + struct page *s_erase_page; /* for dev_bdev.c */ + /* alias.c fields */ + struct btree_head32 s_segment_alias; /* remapped segments */ + int s_no_object_aliases; + struct list_head s_object_alias; /* remapped objects */ + struct btree_head128 s_object_alias_tree; /* remapped objects */ + struct mutex s_object_alias_mutex; + /* dir.c fields */ + struct mutex s_dirop_mutex; /* for creat/unlink/rename */ + u64 s_victim_ino; /* used for atomic dir-ops */ + u64 s_rename_dir; /* source directory ino */ + u64 s_rename_pos; /* position of source dd */ + /* gc.c fields */ + long s_segsize; /* size of a segment */ + int s_segshift; /* log2 of segment size */ + long s_segmask; /* 1 << s_segshift - 1 */ + long s_no_segs; /* segments on device */ + long s_no_journal_segs; /* segments used for journal */ + long s_no_blocks; /* blocks per segment */ + long s_writesize; /* minimum write size */ + int s_writeshift; /* log2 of write size */ + u64 s_size; /* filesystem size */ + struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */ + u64 s_gec; /* global erase count */ + u64 s_wl_gec_ostore; /* time of last wl event */ + u64 s_wl_gec_journal; /* time of last wl event */ + u64 s_sweeper; /* current sweeper pos */ + u8 s_ifile_levels; /* max level of ifile */ + u8 s_iblock_levels; /* max level of regular files */ + u8 s_data_levels; /* # of segments to leaf block*/ + u8 s_total_levels; /* sum of above three */ + struct btree_head32 s_cand_tree; /* all candidates */ + struct candidate_list s_free_list; /* 100% free segments */ + struct candidate_list s_reserve_list; /* Bad segment reserve */ + struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */ + struct candidate_list s_ec_list; /* wear level candidates */ + struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */ + /* inode.c fields */ + u64 s_last_ino; /* highest ino used */ + long s_inos_till_wrap; + u32 s_generation; /* i_generation for new files */ + struct list_head s_freeing_list; /* inodes being freed */ + /* journal.c fields */ + struct mutex s_journal_mutex; + void *s_je; /* journal entry to compress */ + void *s_compressed_je; /* block to write to journal */ + u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ + u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ + u64 s_last_version; + struct logfs_area *s_journal_area; /* open journal segment */ + __be64 s_je_array[MAX_JOURNAL_ENTRIES]; + int s_no_je; + + int s_sum_index; /* for the 12 summaries */ + struct shadow_tree s_shadow_tree; + int s_je_fill; /* index of current je */ + /* readwrite.c fields */ + struct mutex s_write_mutex; + int s_lock_count; + mempool_t *s_block_pool; /* struct logfs_block pool */ + mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ + struct list_head s_writeback_list; /* writeback pages */ + /* + * Space accounting: + * - s_used_bytes specifies space used to store valid data objects. + * - s_dirty_used_bytes is space used to store non-committed data + * objects. Those objects have already been written themselves, + * but they don't become valid until all indirect blocks up to the + * journal have been written as well. + * - s_dirty_free_bytes is space used to store the old copy of a + * replaced object, as long as the replacement is non-committed. + * In other words, it is the amount of space freed when all dirty + * blocks are written back. + * - s_free_bytes is the amount of free space available for any + * purpose. + * - s_root_reserve is the amount of free space available only to + * the root user. Non-privileged users can no longer write once + * this watermark has been reached. + * - s_speed_reserve is space which remains unused to speed up + * garbage collection performance. + * - s_dirty_pages is the space reserved for currently dirty pages. + * It is a pessimistic estimate, so some/most will get freed on + * page writeback. + * + * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size + */ + u64 s_free_bytes; + u64 s_used_bytes; + u64 s_dirty_free_bytes; + u64 s_dirty_used_bytes; + u64 s_root_reserve; + u64 s_speed_reserve; + u64 s_dirty_pages; + /* Bad block handling: + * - s_bad_seg_reserve is a number of segments usually kept + * free. When encountering bad blocks, the affected segment's data + * is _temporarily_ moved to a reserved segment. + * - s_bad_segments is the number of known bad segments. + */ + u32 s_bad_seg_reserve; + u32 s_bad_segments; +}; + +/** + * struct logfs_inode - in-memory inode + * + * @vfs_inode: struct inode + * @li_data: data pointers + * @li_used_bytes: number of used bytes + * @li_freeing_list: used to track inodes currently being freed + * @li_flags: inode flags + * @li_refcount: number of internal (GC-induced) references + */ +struct logfs_inode { + struct inode vfs_inode; + u64 li_data[LOGFS_EMBEDDED_FIELDS]; + u64 li_used_bytes; + struct list_head li_freeing_list; + struct logfs_block *li_block; + u32 li_flags; + u8 li_height; + int li_refcount; +}; + +#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) +#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) +#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) + +/* compr.c */ +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); +int __init logfs_compr_init(void); +void logfs_compr_exit(void); + +/* dev_bdev.c */ +#ifdef CONFIG_BLOCK +int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname); +#else +static inline int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname) +{ + return -ENODEV; +} +#endif + +/* dev_mtd.c */ +#ifdef CONFIG_MTD +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); +#else +static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) +{ + return -ENODEV; +} +#endif + +/* dir.c */ +extern const struct inode_operations logfs_symlink_iops; +extern const struct inode_operations logfs_dir_iops; +extern const struct file_operations logfs_dir_fops; +int logfs_replay_journal(struct super_block *sb); + +/* file.c */ +extern const struct inode_operations logfs_reg_iops; +extern const struct file_operations logfs_reg_fops; +extern const struct address_space_operations logfs_reg_aops; +int logfs_readpage(struct file *file, struct page *page); +long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync); + +/* gc.c */ +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); +void logfs_gc_pass(struct super_block *sb); +int logfs_check_areas(struct super_block *sb); +int logfs_init_gc(struct super_block *sb); +void logfs_cleanup_gc(struct super_block *sb); + +/* inode.c */ +extern const struct super_operations logfs_super_operations; +struct inode *logfs_iget(struct super_block *sb, ino_t ino); +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); +void logfs_safe_iput(struct inode *inode, int cookie); +struct inode *logfs_new_inode(struct inode *dir, umode_t mode); +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); +int logfs_init_inode_cache(void); +void logfs_destroy_inode_cache(void); +void logfs_set_blocks(struct inode *inode, u64 no); +/* these logically belong into inode.c but actually reside in readwrite.c */ +int logfs_read_inode(struct inode *inode); +int __logfs_write_inode(struct inode *inode, struct page *, long flags); +void logfs_evict_inode(struct inode *inode); + +/* journal.c */ +void logfs_write_anchor(struct super_block *sb); +int logfs_init_journal(struct super_block *sb); +void logfs_cleanup_journal(struct super_block *sb); +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +void do_logfs_journal_wl_pass(struct super_block *sb); + +/* readwrite.c */ +pgoff_t logfs_pack_index(u64 bix, level_t level); +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree); +int logfs_readpage_nolock(struct page *page); +int logfs_write_buf(struct inode *inode, struct page *page, long flags); +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree); +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags); +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level); +int logfs_truncate(struct inode *inode, u64 size); +u64 logfs_seek_hole(struct inode *inode, u64 bix); +u64 logfs_seek_data(struct inode *inode, u64 bix); +int logfs_open_segfile(struct super_block *sb); +int logfs_init_rw(struct super_block *sb); +void logfs_cleanup_rw(struct super_block *sb); +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_write_block(struct logfs_block *block, long flags); +int logfs_write_obj_aliases_pagecache(struct super_block *sb); +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se); +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level); +void logfs_set_segment_reserved(struct super_block *sb, u32 segno); +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level); +void __free_block(struct super_block *sb, struct logfs_block *block); +void btree_write_block(struct logfs_block *block); +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty); +int logfs_exist_block(struct inode *inode, u64 bix); +int get_page_reserve(struct inode *inode, struct page *page); +void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock); +void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock); +extern struct logfs_block_ops indirect_block_ops; + +/* segment.c */ +int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); +int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, + level_t level); +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow); +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count); +void move_page_to_btree(struct page *page); +int logfs_init_mapping(struct super_block *sb); +void logfs_sync_area(struct logfs_area *area); +void logfs_sync_segments(struct super_block *sb); +void freeseg(struct super_block *sb, u32 segno); +void free_areas(struct super_block *sb); + +/* area handling */ +int logfs_init_areas(struct super_block *sb); +void logfs_cleanup_areas(struct super_block *sb); +int logfs_open_area(struct logfs_area *area, size_t bytes); +int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler); + +static inline int logfs_buf_write(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + return __logfs_buf_write(area, ofs, buf, len, 0); +} + +static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + return __logfs_buf_write(area, ofs, buf, len, 1); +} + +/* super.c */ +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); +void emergency_read_end(struct page *page); +void logfs_crash_dump(struct super_block *sb); +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); +int logfs_check_ds(struct logfs_disk_super *ds); +int logfs_write_sb(struct super_block *sb); + +static inline struct logfs_super *logfs_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct logfs_inode *logfs_inode(struct inode *inode) +{ + return container_of(inode, struct logfs_inode, vfs_inode); +} + +static inline void logfs_set_ro(struct super_block *sb) +{ + logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; +} + +#define LOGFS_BUG(sb) do { \ + struct super_block *__sb = sb; \ + logfs_crash_dump(__sb); \ + logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ + BUG(); \ +} while (0) + +#define LOGFS_BUG_ON(condition, sb) \ + do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) + +static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) +{ + return cpu_to_be32(crc32(~0, data+skip, len-skip)); +} + +static inline u8 logfs_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) +{ + return pos >> sb->s_blocksize_bits; +} + +static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) +{ + return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; +} + +static inline u32 seg_no(struct super_block *sb, u64 ofs) +{ + return ofs >> logfs_super(sb)->s_segshift; +} + +static inline u32 seg_ofs(struct super_block *sb, u64 ofs) +{ + return ofs & logfs_super(sb)->s_segmask; +} + +static inline u64 seg_align(struct super_block *sb, u64 ofs) +{ + return ofs & ~logfs_super(sb)->s_segmask; +} + +static inline struct logfs_block *logfs_block(struct page *page) +{ + return (void *)page->private; +} + +static inline level_t shrink_level(gc_level_t __level) +{ + u8 level = (__force u8)__level; + + if (level >= LOGFS_MAX_LEVELS) + level -= LOGFS_MAX_LEVELS; + return (__force level_t)level; +} + +static inline gc_level_t expand_level(u64 ino, level_t __level) +{ + u8 level = (__force u8)__level; + + if (ino == LOGFS_INO_MASTER) { + /* ifile has separate areas */ + level += LOGFS_MAX_LEVELS; + } + return (__force gc_level_t)level; +} + +static inline int logfs_block_shift(struct super_block *sb, level_t level) +{ + level = shrink_level((__force gc_level_t)level); + return (__force int)level * (sb->s_blocksize_bits - 3); +} + +static inline u64 logfs_block_mask(struct super_block *sb, level_t level) +{ + return ~0ull << logfs_block_shift(sb, level); +} + +static inline struct logfs_area *get_area(struct super_block *sb, + gc_level_t gc_level) +{ + return logfs_super(sb)->s_area[(__force u8)gc_level]; +} + +static inline void logfs_mempool_destroy(mempool_t *pool) +{ + if (pool) + mempool_destroy(pool); +} + +#endif diff --git a/kernel/fs/logfs/logfs_abi.h b/kernel/fs/logfs/logfs_abi.h new file mode 100644 index 000000000..ae960519c --- /dev/null +++ b/kernel/fs/logfs/logfs_abi.h @@ -0,0 +1,629 @@ +/* + * fs/logfs/logfs_abi.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Public header for logfs. + */ +#ifndef FS_LOGFS_LOGFS_ABI_H +#define FS_LOGFS_LOGFS_ABI_H + +/* For out-of-kernel compiles */ +#ifndef BUILD_BUG_ON +#define BUILD_BUG_ON(condition) /**/ +#endif + +#define SIZE_CHECK(type, size) \ +static inline void check_##type(void) \ +{ \ + BUILD_BUG_ON(sizeof(struct type) != (size)); \ +} + +/* + * Throughout the logfs code, we're constantly dealing with blocks at + * various positions or offsets. To remove confusion, we stricly + * distinguish between a "position" - the logical position within a + * file and an "offset" - the physical location within the device. + * + * Any usage of the term offset for a logical location or position for + * a physical one is a bug and should get fixed. + */ + +/* + * Block are allocated in one of several segments depending on their + * level. The following levels are used: + * 0 - regular data block + * 1 - i1 indirect blocks + * 2 - i2 indirect blocks + * 3 - i3 indirect blocks + * 4 - i4 indirect blocks + * 5 - i5 indirect blocks + * 6 - ifile data blocks + * 7 - ifile i1 indirect blocks + * 8 - ifile i2 indirect blocks + * 9 - ifile i3 indirect blocks + * 10 - ifile i4 indirect blocks + * 11 - ifile i5 indirect blocks + * Potential levels to be used in the future: + * 12 - gc recycled blocks, long-lived data + * 13 - replacement blocks, short-lived data + * + * Levels 1-11 are necessary for robust gc operations and help separate + * short-lived metadata from longer-lived file data. In the future, + * file data should get separated into several segments based on simple + * heuristics. Old data recycled during gc operation is expected to be + * long-lived. New data is of uncertain life expectancy. New data + * used to replace older blocks in existing files is expected to be + * short-lived. + */ + + +/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ +#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull +#define LOGFS_MAGIC_U32 0xc97e8168u + +/* + * Various blocksize related macros. Blocksize is currently fixed at 4KiB. + * Sooner or later that should become configurable and the macros replaced + * by something superblock-dependent. Pointers in indirect blocks are and + * will remain 64bit. + * + * LOGFS_BLOCKSIZE - self-explaining + * LOGFS_BLOCK_FACTOR - number of pointers per indirect block + * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts + */ +#define LOGFS_BLOCKSIZE (4096ull) +#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64)) +#define LOGFS_BLOCK_BITS (9) + +/* + * Number of blocks at various levels of indirection. There are 16 direct + * block pointers plus a single indirect pointer. + */ +#define I0_BLOCKS (16) +#define I1_BLOCKS LOGFS_BLOCK_FACTOR +#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS) +#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS) +#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS) +#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS) + +#define INDIRECT_INDEX I0_BLOCKS +#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1) + +/* + * Sizes at which files require another level of indirection. Files smaller + * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself, + * similar like ext2 fast symlinks. + * + * Data at a position smaller than LOGFS_I0_SIZE is accessed through the + * direct pointers, else through the 1x indirect pointer and so forth. + */ +#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) +#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) + +/* + * Each indirect block pointer must have this flag set, if all block pointers + * behind it are set, i.e. there is no hole hidden in the shadow of this + * indirect block pointer. + */ +#define LOGFS_FULLY_POPULATED (1ULL << 63) +#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) + +/* + * LogFS needs to separate data into levels. Each level is defined as the + * maximal possible distance from the master inode (inode of the inode file). + * Data blocks reside on level 0, 1x indirect block on level 1, etc. + * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. + * This effort is necessary to guarantee garbage collection to always make + * progress. + * + * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, + * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is + * the maximal number of levels for one file. + * LOGFS_NO_AREAS is twice that, as the inode file and regular files are + * effectively stacked on top of each other. + */ +#define LOGFS_MAX_INDIRECT (5) +#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) +#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) + +/* Maximum size of filenames */ +#define LOGFS_MAX_NAMELEN (255) + +/* Number of segments in the primary journal. */ +#define LOGFS_JOURNAL_SEGS (16) + +/* Maximum number of free/erased/etc. segments in journal entries */ +#define MAX_CACHED_SEGS (64) + + +/* + * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, + * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including + * its header, + * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for + * its segment header and the padded space at the end when no further objects + * fit. + */ +#define LOGFS_OBJECT_HEADERSIZE (0x1c) +#define LOGFS_SEGMENT_HEADERSIZE (0x18) +#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE) +#define LOGFS_SEGMENT_RESERVE \ + (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1) + +/* + * Segment types: + * SEG_SUPER - Data or indirect block + * SEG_JOURNAL - Inode + * SEG_OSTORE - Dentry + */ +enum { + SEG_SUPER = 0x01, + SEG_JOURNAL = 0x02, + SEG_OSTORE = 0x03, +}; + +/** + * struct logfs_segment_header - per-segment header in the ostore + * + * @crc: crc32 of header (there is no data) + * @pad: unused, must be 0 + * @type: segment type, see above + * @level: GC level for all objects in this segment + * @segno: segment number + * @ec: erase count for this segment + * @gec: global erase count at time of writing + */ +struct logfs_segment_header { + __be32 crc; + __be16 pad; + __u8 type; + __u8 level; + __be32 segno; + __be32 ec; + __be64 gec; +}; + +SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); + +#define LOGFS_FEATURES_INCOMPAT (0ull) +#define LOGFS_FEATURES_RO_COMPAT (0ull) +#define LOGFS_FEATURES_COMPAT (0ull) + +/** + * struct logfs_disk_super - on-medium superblock + * + * @ds_magic: magic number, must equal LOGFS_MAGIC + * @ds_crc: crc32 of structure starting with the next field + * @ds_ifile_levels: maximum number of levels for ifile + * @ds_iblock_levels: maximum number of levels for regular files + * @ds_data_levels: number of separate levels for data + * @pad0: reserved, must be 0 + * @ds_feature_incompat: incompatible filesystem features + * @ds_feature_ro_compat: read-only compatible filesystem features + * @ds_feature_compat: compatible filesystem features + * @ds_flags: flags + * @ds_segment_shift: log2 of segment size + * @ds_block_shift: log2 of block size + * @ds_write_shift: log2 of write size + * @pad1: reserved, must be 0 + * @ds_journal_seg: segments used by primary journal + * @ds_root_reserve: bytes reserved for the superuser + * @ds_speed_reserve: bytes reserved to speed up GC + * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks + * @pad2: reserved, must be 0 + * @pad3: reserved, must be 0 + * + * Contains only read-only fields. Read-write fields like the amount of used + * space is tracked in the dynamic superblock, which is stored in the journal. + */ +struct logfs_disk_super { + struct logfs_segment_header ds_sh; + __be64 ds_magic; + + __be32 ds_crc; + __u8 ds_ifile_levels; + __u8 ds_iblock_levels; + __u8 ds_data_levels; + __u8 ds_segment_shift; + __u8 ds_block_shift; + __u8 ds_write_shift; + __u8 pad0[6]; + + __be64 ds_filesystem_size; + __be32 ds_segment_size; + __be32 ds_bad_seg_reserve; + + __be64 ds_feature_incompat; + __be64 ds_feature_ro_compat; + + __be64 ds_feature_compat; + __be64 ds_feature_flags; + + __be64 ds_root_reserve; + __be64 ds_speed_reserve; + + __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; + + __be64 ds_super_ofs[2]; + __be64 pad3[8]; +}; + +SIZE_CHECK(logfs_disk_super, 256); + +/* + * Object types: + * OBJ_BLOCK - Data or indirect block + * OBJ_INODE - Inode + * OBJ_DENTRY - Dentry + */ +enum { + OBJ_BLOCK = 0x04, + OBJ_INODE = 0x05, + OBJ_DENTRY = 0x06, +}; + +/** + * struct logfs_object_header - per-object header in the ostore + * + * @crc: crc32 of header, excluding data_crc + * @len: length of data + * @type: object type, see above + * @compr: compression type + * @ino: inode number + * @bix: block index + * @data_crc: crc32 of payload + */ +struct logfs_object_header { + __be32 crc; + __be16 len; + __u8 type; + __u8 compr; + __be64 ino; + __be64 bix; + __be32 data_crc; +} __attribute__((packed)); + +SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); + +/* + * Reserved inode numbers: + * LOGFS_INO_MASTER - master inode (for inode file) + * LOGFS_INO_ROOT - root directory + * LOGFS_INO_SEGFILE - per-segment used bytes and erase count + */ +enum { + LOGFS_INO_MAPPING = 0x00, + LOGFS_INO_MASTER = 0x01, + LOGFS_INO_ROOT = 0x02, + LOGFS_INO_SEGFILE = 0x03, + LOGFS_RESERVED_INOS = 0x10, +}; + +/* + * Inode flags. High bits should never be written to the medium. They are + * reserved for in-memory usage. + * Low bits should either remain in sync with the corresponding FS_*_FL or + * reuse slots that obviously don't make sense for logfs. + * + * LOGFS_IF_DIRTY Inode must be written back + * LOGFS_IF_ZOMBIE Inode has been deleted + * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode + */ +#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ +#define LOGFS_IF_DIRTY 0x20000000 +#define LOGFS_IF_ZOMBIE 0x40000000 +#define LOGFS_IF_STILLBORN 0x80000000 + +/* Flags available to chattr */ +#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) +#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) +/* Flags inherited from parent directory on file/directory creation */ +#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) + +/** + * struct logfs_disk_inode - on-medium inode + * + * @di_mode: file mode + * @di_pad: reserved, must be 0 + * @di_flags: inode flags, see above + * @di_uid: user id + * @di_gid: group id + * @di_ctime: change time + * @di_mtime: modify time + * @di_refcount: reference count (aka nlink or link count) + * @di_generation: inode generation, for nfs + * @di_used_bytes: number of bytes used + * @di_size: file size + * @di_data: data pointers + */ +struct logfs_disk_inode { + __be16 di_mode; + __u8 di_height; + __u8 di_pad; + __be32 di_flags; + __be32 di_uid; + __be32 di_gid; + + __be64 di_ctime; + __be64 di_mtime; + + __be64 di_atime; + __be32 di_refcount; + __be32 di_generation; + + __be64 di_used_bytes; + __be64 di_size; + + __be64 di_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_disk_inode, 200); + +#define INODE_POINTER_OFS \ + (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) +#define INODE_USED_OFS \ + (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) +#define INODE_SIZE_OFS \ + (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) +#define INODE_HEIGHT_OFS (0) + +/** + * struct logfs_disk_dentry - on-medium dentry structure + * + * @ino: inode number + * @namelen: length of file name + * @type: file type, identical to bits 12..15 of mode + * @name: file name + */ +/* FIXME: add 6 bytes of padding to remove the __packed */ +struct logfs_disk_dentry { + __be64 ino; + __be16 namelen; + __u8 type; + __u8 name[LOGFS_MAX_NAMELEN]; +} __attribute__((packed)); + +SIZE_CHECK(logfs_disk_dentry, 266); + +#define RESERVED 0xffffffff +#define BADSEG 0xffffffff +/** + * struct logfs_segment_entry - segment file entry + * + * @ec_level: erase count and level + * @valid: number of valid bytes + * + * Segment file contains one entry for every segment. ec_level contains the + * erasecount in the upper 28 bits and the level in the lower 4 bits. An + * ec_level of BADSEG (-1) identifies bad segments. valid contains the number + * of valid bytes or RESERVED (-1 again) if the segment is used for either the + * superblock or the journal, or when the segment is bad. + */ +struct logfs_segment_entry { + __be32 ec_level; + __be32 valid; +}; + +SIZE_CHECK(logfs_segment_entry, 8); + +/** + * struct logfs_journal_header - header for journal entries (JEs) + * + * @h_crc: crc32 of journal entry + * @h_len: length of compressed journal entry, + * not including header + * @h_datalen: length of uncompressed data + * @h_type: JE type + * @h_compr: compression type + * @h_pad: reserved + */ +struct logfs_journal_header { + __be32 h_crc; + __be16 h_len; + __be16 h_datalen; + __be16 h_type; + __u8 h_compr; + __u8 h_pad[5]; +}; + +SIZE_CHECK(logfs_journal_header, 16); + +/* + * Life expectency of data. + * VIM_DEFAULT - default vim + * VIM_SEGFILE - for segment file only - very short-living + * VIM_GC - GC'd data - likely long-living + */ +enum logfs_vim { + VIM_DEFAULT = 0, + VIM_SEGFILE = 1, +}; + +/** + * struct logfs_je_area - wbuf header + * + * @segno: segment number of area + * @used_bytes: number of bytes already used + * @gc_level: GC level + * @vim: life expectancy of data + * + * "Areas" are segments currently being used for writing. There is at least + * one area per GC level. Several may be used to separate long-living from + * short-living data. If an area with unknown vim is encountered, it can + * simply be closed. + * The write buffer immediately follow this header. + */ +struct logfs_je_area { + __be32 segno; + __be32 used_bytes; + __u8 gc_level; + __u8 vim; +} __attribute__((packed)); + +SIZE_CHECK(logfs_je_area, 10); + +#define MAX_JOURNAL_HEADER \ + (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area)) + +/** + * struct logfs_je_dynsb - dynamic superblock + * + * @ds_gec: global erase count + * @ds_sweeper: current position of GC "sweeper" + * @ds_rename_dir: source directory ino (see dir.c documentation) + * @ds_rename_pos: position of source dd (see dir.c documentation) + * @ds_victim_ino: victims of incomplete dir operation (see dir.c) + * @ds_victim_ino: parent inode of victim (see dir.c) + * @ds_used_bytes: number of used bytes + */ +struct logfs_je_dynsb { + __be64 ds_gec; + __be64 ds_sweeper; + + __be64 ds_rename_dir; + __be64 ds_rename_pos; + + __be64 ds_victim_ino; + __be64 ds_victim_parent; /* XXX */ + + __be64 ds_used_bytes; + __be32 ds_generation; + __be32 pad; +}; + +SIZE_CHECK(logfs_je_dynsb, 64); + +/** + * struct logfs_je_anchor - anchor of filesystem tree, aka master inode + * + * @da_size: size of inode file + * @da_last_ino: last created inode + * @da_used_bytes: number of bytes used + * @da_data: data pointers + */ +struct logfs_je_anchor { + __be64 da_size; + __be64 da_last_ino; + + __be64 da_used_bytes; + u8 da_height; + u8 pad[7]; + + __be64 da_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_je_anchor, 168); + +/** + * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal) + * + * @so_segment: segments used for 2nd journal + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_spillout { + __be64 so_segment[0]; +}; + +SIZE_CHECK(logfs_je_spillout, 0); + +/** + * struct logfs_je_journal_ec - erase counts for all journal segments + * + * @ec: erase count + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_journal_ec { + __be32 ec[0]; +}; + +SIZE_CHECK(logfs_je_journal_ec, 0); + +/** + * struct logfs_je_free_segments - list of free segmetns with erase count + */ +struct logfs_je_free_segments { + __be32 segno; + __be32 ec; +}; + +SIZE_CHECK(logfs_je_free_segments, 8); + +/** + * struct logfs_seg_alias - list of segment aliases + */ +struct logfs_seg_alias { + __be32 old_segno; + __be32 new_segno; +}; + +SIZE_CHECK(logfs_seg_alias, 8); + +/** + * struct logfs_obj_alias - list of object aliases + */ +struct logfs_obj_alias { + __be64 ino; + __be64 bix; + __be64 val; + u8 level; + u8 pad[5]; + __be16 child_no; +}; + +SIZE_CHECK(logfs_obj_alias, 32); + +/** + * Compression types. + * + * COMPR_NONE - uncompressed + * COMPR_ZLIB - compressed with zlib + */ +enum { + COMPR_NONE = 0, + COMPR_ZLIB = 1, +}; + +/* + * Journal entries come in groups of 16. First group contains unique + * entries, next groups contain one entry per level + * + * JE_FIRST - smallest possible journal entry number + * + * JEG_BASE - base group, containing unique entries + * JE_COMMIT - commit entry, validates all previous entries + * JE_DYNSB - dynamic superblock, anything that ought to be in the + * superblock but cannot because it is read-write data + * JE_ANCHOR - anchor aka master inode aka inode file's inode + * JE_ERASECOUNT erasecounts for all journal segments + * JE_SPILLOUT - unused + * JE_SEG_ALIAS - aliases segments + * JE_AREA - area description + * + * JE_LAST - largest possible journal entry number + */ +enum { + JE_FIRST = 0x01, + + JEG_BASE = 0x00, + JE_COMMIT = 0x02, + JE_DYNSB = 0x03, + JE_ANCHOR = 0x04, + JE_ERASECOUNT = 0x05, + JE_SPILLOUT = 0x06, + JE_OBJ_ALIAS = 0x0d, + JE_AREA = 0x0e, + + JE_LAST = 0x0e, +}; + +#endif diff --git a/kernel/fs/logfs/readwrite.c b/kernel/fs/logfs/readwrite.c new file mode 100644 index 000000000..380d86e1a --- /dev/null +++ b/kernel/fs/logfs/readwrite.c @@ -0,0 +1,2298 @@ +/* + * fs/logfs/readwrite.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * + * Actually contains five sets of very similar functions: + * read read blocks from a file + * seek_hole find next hole + * seek_data find next data block + * valid check whether a block still belongs to a file + * write write blocks to a file + * delete delete a block (for directories and ifile) + * rewrite move existing blocks of a file to a new location (gc helper) + * truncate truncate a file + */ +#include "logfs.h" +#include +#include + +static u64 adjust_bix(u64 bix, level_t level) +{ + switch (level) { + case 0: + return bix; + case LEVEL(1): + return max_t(u64, bix, I0_BLOCKS); + case LEVEL(2): + return max_t(u64, bix, I1_BLOCKS); + case LEVEL(3): + return max_t(u64, bix, I2_BLOCKS); + case LEVEL(4): + return max_t(u64, bix, I3_BLOCKS); + case LEVEL(5): + return max_t(u64, bix, I4_BLOCKS); + default: + WARN_ON(1); + return bix; + } +} + +static inline u64 maxbix(u8 height) +{ + return 1ULL << (LOGFS_BLOCK_BITS * height); +} + +/** + * The inode address space is cut in two halves. Lower half belongs to data + * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is + * set, the actual block index (bix) and level can be derived from the page + * index. + * + * The lowest three bits of the block index are set to 0 after packing and + * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored + * anyway this is harmless. + */ +#define ARCH_SHIFT (BITS_PER_LONG - 32) +#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) +#define LEVEL_SHIFT (28 + ARCH_SHIFT) +static inline pgoff_t first_indirect_block(void) +{ + return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); +} + +pgoff_t logfs_pack_index(u64 bix, level_t level) +{ + pgoff_t index; + + BUG_ON(bix >= INDIRECT_BIT); + if (level == 0) + return bix; + + index = INDIRECT_BIT; + index |= (__force long)level << LEVEL_SHIFT; + index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); + return index; +} + +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) +{ + u8 __level; + + if (!(index & INDIRECT_BIT)) { + *bix = index; + *level = 0; + return; + } + + __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; + *level = LEVEL(__level); + *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; + *bix = adjust_bix(*bix, *level); + return; +} +#undef ARCH_SHIFT +#undef INDIRECT_BIT +#undef LEVEL_SHIFT + +/* + * Time is stored as nanoseconds since the epoch. + */ +static struct timespec be64_to_timespec(__be64 betime) +{ + return ns_to_timespec(be64_to_cpu(betime)); +} + +static __be64 timespec_to_be64(struct timespec tsp) +{ + return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); +} + +static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + inode->i_mode = be16_to_cpu(di->di_mode); + li->li_height = di->di_height; + li->li_flags = be32_to_cpu(di->di_flags); + i_uid_write(inode, be32_to_cpu(di->di_uid)); + i_gid_write(inode, be32_to_cpu(di->di_gid)); + inode->i_size = be64_to_cpu(di->di_size); + logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); + inode->i_atime = be64_to_timespec(di->di_atime); + inode->i_ctime = be64_to_timespec(di->di_ctime); + inode->i_mtime = be64_to_timespec(di->di_mtime); + set_nlink(inode, be32_to_cpu(di->di_refcount)); + inode->i_generation = be32_to_cpu(di->di_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + inode->i_rdev = be64_to_cpu(di->di_data[0]); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(di->di_data[i]); + break; + default: + BUG(); + } +} + +static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + di->di_mode = cpu_to_be16(inode->i_mode); + di->di_height = li->li_height; + di->di_pad = 0; + di->di_flags = cpu_to_be32(li->li_flags); + di->di_uid = cpu_to_be32(i_uid_read(inode)); + di->di_gid = cpu_to_be32(i_gid_read(inode)); + di->di_size = cpu_to_be64(i_size_read(inode)); + di->di_used_bytes = cpu_to_be64(li->li_used_bytes); + di->di_atime = timespec_to_be64(inode->i_atime); + di->di_ctime = timespec_to_be64(inode->i_ctime); + di->di_mtime = timespec_to_be64(inode->i_mtime); + di->di_refcount = cpu_to_be32(inode->i_nlink); + di->di_generation = cpu_to_be32(inode->i_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + di->di_data[0] = cpu_to_be64(inode->i_rdev); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + di->di_data[i] = cpu_to_be64(li->li_data[i]); + break; + default: + BUG(); + } +} + +static void __logfs_set_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + + inode->i_blocks = ULONG_MAX; + if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) + inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; +} + +void logfs_set_blocks(struct inode *inode, u64 bytes) +{ + struct logfs_inode *li = logfs_inode(inode); + + li->li_used_bytes = bytes; + __logfs_set_blocks(inode); +} + +static void prelock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) { + BUG_ON(PagePreLocked(page)); + SetPagePreLocked(page); + } else { + /* We are in GC path. */ + if (PagePreLocked(page)) + super->s_lock_count++; + else + SetPagePreLocked(page); + } +} + +static void preunlock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) + ClearPagePreLocked(page); + else { + /* We are in GC path. */ + BUG_ON(!PagePreLocked(page)); + if (super->s_lock_count) + super->s_lock_count--; + else + ClearPagePreLocked(page); + } +} + +/* + * Logfs is prone to an AB-BA deadlock where one task tries to acquire + * s_write_mutex with a locked page and GC tries to get that page while holding + * s_write_mutex. + * To solve this issue logfs will ignore the page lock iff the page in question + * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked + * in addition to PG_locked. + */ +void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + prelock_page(sb, page, lock); + + if (lock) { + mutex_lock(&super->s_write_mutex); + logfs_gc_pass(sb); + /* FIXME: We also have to check for shadowed space + * and mempool fill grade */ + } +} + +void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + preunlock_page(sb, page, lock); + /* Order matters - we must clear PG_pre_locked before releasing + * s_write_mutex or we could race against another task. */ + if (lock) + mutex_unlock(&super->s_write_mutex); +} + +static struct page *logfs_get_read_page(struct inode *inode, u64 bix, + level_t level) +{ + return find_or_create_page(inode->i_mapping, + logfs_pack_index(bix, level), GFP_NOFS); +} + +static void logfs_put_read_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + +static void logfs_lock_write_page(struct page *page) +{ + int loop = 0; + + while (unlikely(!trylock_page(page))) { + if (loop++ > 0x1000) { + /* Has been observed once so far... */ + printk(KERN_ERR "stack at %p\n", &loop); + BUG(); + } + if (PagePreLocked(page)) { + /* Holder of page lock is waiting for us, it + * is safe to use this page. */ + break; + } + /* Some other process has this page locked and has + * nothing to do with us. Wait for it to finish. + */ + schedule(); + } + BUG_ON(!PageLocked(page)); +} + +static struct page *logfs_get_write_page(struct inode *inode, u64 bix, + level_t level) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index = logfs_pack_index(bix, level); + struct page *page; + int err; + +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + return NULL; + } + } else logfs_lock_write_page(page); + BUG_ON(!PageLocked(page)); + return page; +} + +static void logfs_unlock_write_page(struct page *page) +{ + if (!PagePreLocked(page)) + unlock_page(page); +} + +static void logfs_put_write_page(struct page *page) +{ + logfs_unlock_write_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, + int rw) +{ + if (rw == READ) + return logfs_get_read_page(inode, bix, level); + else + return logfs_get_write_page(inode, bix, level); +} + +static void logfs_put_page(struct page *page, int rw) +{ + if (rw == READ) + logfs_put_read_page(page); + else + logfs_put_write_page(page); +} + +static unsigned long __get_bits(u64 val, int skip, int no) +{ + u64 ret = val; + + ret >>= skip * no; + ret <<= 64 - no; + ret >>= 64 - no; + return ret; +} + +static unsigned long get_bits(u64 val, level_t skip) +{ + return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS); +} + +static inline void init_shadow_tree(struct super_block *sb, + struct shadow_tree *tree) +{ + struct logfs_super *super = logfs_super(sb); + + btree_init_mempool64(&tree->new, super->s_btree_pool); + btree_init_mempool64(&tree->old, super->s_btree_pool); +} + +static void indirect_write_block(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + int ret; + + page = block->page; + inode = page->mapping->host; + logfs_lock_write_page(page); + ret = logfs_write_buf(inode, page, 0); + logfs_unlock_write_page(page); + /* + * This needs some rework. Unless you want your filesystem to run + * completely synchronously (you don't), the filesystem will always + * report writes as 'successful' before the actual work has been + * done. The actual work gets done here and this is where any errors + * will show up. And there isn't much we can do about it, really. + * + * Some attempts to fix the errors (move from bad blocks, retry io,...) + * have already been done, so anything left should be either a broken + * device or a bug somewhere in logfs itself. Being relatively new, + * the odds currently favor a bug, so for now the line below isn't + * entirely tasteles. + */ + BUG_ON(ret); +} + +static void inode_write_block(struct logfs_block *block) +{ + struct inode *inode; + int ret; + + inode = block->inode; + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode->i_sb); + else { + ret = __logfs_write_inode(inode, NULL, 0); + /* see indirect_write_block comment */ + BUG_ON(ret); + } +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. + */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static __be64 inode_val0(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 val; + + /* + * Explicit shifting generates good code, but must match the format + * of the structure. Add some paranoia just in case. + */ + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4); + + val = (u64)inode->i_mode << 48 | + (u64)li->li_height << 40 | + (u64)li->li_flags; + return cpu_to_be64(val); +} + +static int inode_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + struct inode *inode = block->inode; + struct logfs_inode *li = logfs_inode(inode); + unsigned long pos; + u64 ino , bix; + __be64 val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS) + return 0; + + switch (pos) { + case INODE_HEIGHT_OFS: + val = inode_val0(inode); + break; + case INODE_USED_OFS: + val = cpu_to_be64(li->li_used_bytes); + break; + case INODE_SIZE_OFS: + val = cpu_to_be64(i_size_read(inode)); + break; + case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1: + val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]); + break; + default: + BUG(); + } + + ino = LOGFS_INO_MASTER; + bix = inode->i_ino; + level = LEVEL(0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +static int indirect_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + unsigned long pos; + struct page *page = block->page; + u64 ino , bix; + __be64 *child, val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + return 0; + + ino = page->mapping->host->i_ino; + logfs_unpack_index(page->index, &bix, &level); + child = kmap_atomic(page); + val = child[pos]; + kunmap_atomic(child); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +int logfs_write_obj_aliases_pagecache(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int err; + + list_for_each_entry(block, &super->s_object_alias, alias_list) { + err = block->ops->write_alias(sb, block, write_alias_journal); + if (err) + return err; + } + return 0; +} + +void __free_block(struct super_block *sb, struct logfs_block *block) +{ + BUG_ON(!list_empty(&block->item_list)); + list_del(&block->alias_list); + mempool_free(block, logfs_super(sb)->s_block_pool); +} + +static void inode_free_block(struct super_block *sb, struct logfs_block *block) +{ + struct inode *inode = block->inode; + + logfs_inode(inode)->li_block = NULL; + __free_block(sb, block); +} + +static void indirect_free_block(struct super_block *sb, + struct logfs_block *block) +{ + struct page *page = block->page; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } + __free_block(sb, block); +} + + +static struct logfs_block_ops inode_block_ops = { + .write_block = inode_write_block, + .free_block = inode_free_block, + .write_alias = inode_write_alias, +}; + +struct logfs_block_ops indirect_block_ops = { + .write_block = indirect_write_block, + .free_block = indirect_free_block, + .write_alias = indirect_write_alias, +}; + +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + + block = mempool_alloc(super->s_block_pool, GFP_NOFS); + memset(block, 0, sizeof(*block)); + INIT_LIST_HEAD(&block->alias_list); + INIT_LIST_HEAD(&block->item_list); + block->sb = sb; + block->ino = ino; + block->bix = bix; + block->level = level; + return block; +} + +static void alloc_inode_block(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block; + + if (li->li_block) + return; + + block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0); + block->inode = inode; + li->li_block = block; + block->ops = &inode_block_ops; +} + +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty) +{ + u64 ptr; + int i, start; + + block->partial = 0; + block->full = 0; + start = 0; + if (page->index < first_indirect_block()) { + /* Counters are pointless on level 0 */ + return; + } + if (page->index == first_indirect_block()) { + /* Skip unused pointers */ + start = I0_BLOCKS; + block->full = I0_BLOCKS; + } + if (!page_is_empty) { + for (i = start; i < LOGFS_BLOCK_FACTOR; i++) { + ptr = be64_to_cpu(array[i]); + if (ptr) + block->partial++; + if (ptr & LOGFS_FULLY_POPULATED) + block->full++; + } + } +} + +static void alloc_data_block(struct inode *inode, struct page *page) +{ + struct logfs_block *block; + u64 bix; + level_t level; + + if (PagePrivate(page)) + return; + + logfs_unpack_index(page->index, &bix, &level); + block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); + block->page = page; + + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long) block); + + block->ops = &indirect_block_ops; +} + +static void alloc_indirect_block(struct inode *inode, struct page *page, + int page_is_empty) +{ + struct logfs_block *block; + __be64 *array; + + if (PagePrivate(page)) + return; + + alloc_data_block(inode, page); + + block = logfs_block(page); + array = kmap_atomic(page); + initialize_block_counters(page, block, array, page_is_empty); + kunmap_atomic(array); +} + +static void block_set_pointer(struct page *page, int index, u64 ptr) +{ + struct logfs_block *block = logfs_block(page); + __be64 *array; + u64 oldptr; + + BUG_ON(!block); + array = kmap_atomic(page); + oldptr = be64_to_cpu(array[index]); + array[index] = cpu_to_be64(ptr); + kunmap_atomic(array); + SetPageUptodate(page); + + block->full += !!(ptr & LOGFS_FULLY_POPULATED) + - !!(oldptr & LOGFS_FULLY_POPULATED); + block->partial += !!ptr - !!oldptr; +} + +static u64 block_get_pointer(struct page *page, int index) +{ + __be64 *block; + u64 ptr; + + block = kmap_atomic(page); + ptr = be64_to_cpu(block[index]); + kunmap_atomic(block); + return ptr; +} + +static int logfs_read_empty(struct page *page) +{ + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + return 0; +} + +static int logfs_read_direct(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + u64 block; + + block = li->li_data[index]; + if (!block) + return logfs_read_empty(page); + + return logfs_segment_read(inode, page, block, index, 0); +} + +static int logfs_read_loop(struct inode *inode, struct page *page, + int rw_context) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bix, bofs = li->li_data[INDIRECT_INDEX]; + level_t level, target_level; + int ret; + struct page *ipage; + + logfs_unpack_index(page->index, &bix, &target_level); + if (!bofs) + return logfs_read_empty(page); + + if (bix >= maxbix(li->li_height)) + return logfs_read_empty(page); + + for (level = LEVEL(li->li_height); + (__force u8)level > (__force u8)target_level; + level = SUBLEVEL(level)){ + ipage = logfs_get_page(inode, bix, level, rw_context); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_page(ipage, rw_context); + if (!bofs) + return logfs_read_empty(page); + } + + return logfs_segment_read(inode, page, bofs, bix, 0); +} + +static int logfs_read_block(struct inode *inode, struct page *page, + int rw_context) +{ + pgoff_t index = page->index; + + if (index < I0_BLOCKS) + return logfs_read_direct(inode, page); + return logfs_read_loop(inode, page, rw_context); +} + +static int logfs_exist_loop(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret; + struct page *ipage; + + if (!bofs) + return 0; + if (bix >= maxbix(li->li_height)) + return 0; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + ipage = logfs_get_read_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_read_page(ipage); + if (!bofs) + return 0; + } + + return 1; +} + +int logfs_exist_block(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) + return !!li->li_data[bix]; + return logfs_exist_loop(inode, bix); +} + +static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + + for (; bix < I0_BLOCKS; bix++) + if (data ^ (li->li_data[bix] == 0)) + return bix; + return I0_BLOCKS; +} + +static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + __be64 *rblock; + u64 increment, bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret, slot; + struct page *page; + + BUG_ON(!bofs); + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); + page = logfs_get_read_page(inode, bix, level); + if (!page) + return bix; + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_read_page(page); + return bix; + } + + slot = get_bits(bix, SUBLEVEL(level)); + rblock = kmap_atomic(page); + while (slot < LOGFS_BLOCK_FACTOR) { + if (data && (rblock[slot] != 0)) + break; + if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) + break; + slot++; + bix += increment; + bix &= ~(increment - 1); + } + if (slot >= LOGFS_BLOCK_FACTOR) { + kunmap_atomic(rblock); + logfs_put_read_page(page); + return bix; + } + bofs = be64_to_cpu(rblock[slot]); + kunmap_atomic(rblock); + logfs_put_read_page(page); + if (!bofs) { + BUG_ON(data); + return bix; + } + } + return bix; +} + +/** + * logfs_seek_hole - find next hole starting at a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next hole. If the file doesn't contain any further holes, the + * block address next to eof is returned instead. + */ +u64 logfs_seek_hole(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 0); + if (bix < I0_BLOCKS) + return bix; + } + + if (!li->li_data[INDIRECT_INDEX]) + return bix; + else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) + bix = maxbix(li->li_height); + else if (bix >= maxbix(li->li_height)) + return bix; + else { + bix = seek_holedata_loop(inode, bix, 0); + if (bix < maxbix(li->li_height)) + return bix; + /* Should not happen anymore. But if some port writes semi- + * corrupt images (as this one used to) we might run into it. + */ + WARN_ON_ONCE(bix == maxbix(li->li_height)); + } + + return bix; +} + +static u64 __logfs_seek_data(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 1); + if (bix < I0_BLOCKS) + return bix; + } + + if (bix < maxbix(li->li_height)) { + if (!li->li_data[INDIRECT_INDEX]) + bix = maxbix(li->li_height); + else + return seek_holedata_loop(inode, bix, 1); + } + + return bix; +} + +/** + * logfs_seek_data - find next data block after a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next data block. If the file doesn't contain any further data + * blocks, the last block in the file is returned instead. + */ +u64 logfs_seek_data(struct inode *inode, u64 bix) +{ + struct super_block *sb = inode->i_sb; + u64 ret, end; + + ret = __logfs_seek_data(inode, bix); + end = i_size_read(inode) >> sb->s_blocksize_bits; + if (ret >= end) + ret = max(bix, end); + return ret; +} + +static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs) +{ + return pure_ofs(li->li_data[bix]) == ofs; +} + +static int __logfs_is_valid_loop(struct inode *inode, u64 bix, + u64 ofs, u64 bofs) +{ + struct logfs_inode *li = logfs_inode(inode); + level_t level; + int ret; + struct page *page; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){ + page = logfs_get_write_page(inode, bix, level); + BUG_ON(!page); + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_write_page(page); + return 0; + } + + bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level))); + logfs_put_write_page(page); + if (!bofs) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + } + return 0; +} + +static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + + if (!bofs) + return 0; + + if (bix >= maxbix(li->li_height)) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + + return __logfs_is_valid_loop(inode, bix, ofs, bofs); +} + +static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + + if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) + return 0; + + if (bix < I0_BLOCKS) + return logfs_is_valid_direct(li, bix, ofs); + return logfs_is_valid_loop(inode, bix, ofs); +} + +/** + * logfs_is_valid_block - check whether this block is still valid + * + * @sb: superblock + * @ofs: block physical offset + * @ino: block inode number + * @bix: block index + * @gc_level: block level + * + * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will + * become invalid once the journal is written. + */ +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + int ret, cookie; + + /* Umount closes a segment with free blocks remaining. Those + * blocks are by definition invalid. */ + if (ino == -1) + return 0; + + LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); + + inode = logfs_safe_iget(sb, ino, &cookie); + if (IS_ERR(inode)) + goto invalid; + + ret = __logfs_is_valid_block(inode, bix, ofs); + logfs_safe_iput(inode, cookie); + if (ret) + return ret; + +invalid: + /* Block is nominally invalid, but may still sit in the shadow tree, + * waiting for a journal commit. + */ + if (btree_lookup64(&super->s_shadow_tree.old, ofs)) + return 2; + return 0; +} + +int logfs_readpage_nolock(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EIO; + + ret = logfs_read_block(inode, page, READ); + + if (ret) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + flush_dcache_page(page); + + return ret; +} + +static int logfs_reserve_bytes(struct inode *inode, int bytes) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + u64 available = super->s_free_bytes + super->s_dirty_free_bytes + - super->s_dirty_used_bytes - super->s_dirty_pages; + + if (!bytes) + return 0; + + if (available < bytes) + return -ENOSPC; + + if (available < bytes + super->s_root_reserve && + !capable(CAP_SYS_RESOURCE)) + return -ENOSPC; + + return 0; +} + +int get_page_reserve(struct inode *inode, struct page *page) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + int ret; + + if (block && block->reserved_bytes) + return 0; + + logfs_get_wblocks(inode->i_sb, page, WF_LOCK); + while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) && + !list_empty(&super->s_writeback_list)) { + block = list_entry(super->s_writeback_list.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + } + if (!ret) { + alloc_data_block(inode, page); + block = logfs_block(page); + block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; + super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; + list_move_tail(&block->alias_list, &super->s_writeback_list); + } + logfs_put_wblocks(inode->i_sb, page, WF_LOCK); + return ret; +} + +/* + * We are protected by write lock. Push victims up to superblock level + * and release transaction when appropriate. + */ +/* FIXME: This is currently called from the wrong spots. */ +static void logfs_handle_transaction(struct inode *inode, + struct logfs_transaction *ta) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!ta) + return; + logfs_inode(inode)->li_block->ta = NULL; + + if (inode->i_ino != LOGFS_INO_MASTER) { + BUG(); /* FIXME: Yes, this needs more thought */ + /* just remember the transaction until inode is written */ + //BUG_ON(logfs_inode(inode)->li_transaction); + //logfs_inode(inode)->li_transaction = ta; + return; + } + + switch (ta->state) { + case CREATE_1: /* fall through */ + case UNLINK_1: + BUG_ON(super->s_victim_ino); + super->s_victim_ino = ta->ino; + break; + case CREATE_2: /* fall through */ + case UNLINK_2: + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + /* transaction ends here - free it */ + kfree(ta); + break; + case CROSS_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + break; + case CROSS_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + kfree(ta); + break; + case TARGET_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + super->s_victim_ino = ta->ino; + break; + case TARGET_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + break; + case TARGET_RENAME_3: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + kfree(ta); + break; + default: + BUG(); + } +} + +/* + * Not strictly a reservation, but rather a check that we still have enough + * space to satisfy the write. + */ +static int logfs_reserve_blocks(struct inode *inode, int blocks) +{ + return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); +} + +struct write_control { + u64 ofs; + long flags; +}; + +static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, + level_t level, u64 old_ofs) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_shadow *shadow; + + shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS); + memset(shadow, 0, sizeof(*shadow)); + shadow->ino = inode->i_ino; + shadow->bix = bix; + shadow->gc_level = expand_level(inode->i_ino, level); + shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; + return shadow; +} + +static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + mempool_free(shadow, super->s_shadow_pool); +} + +static void mark_segment(struct shadow_tree *tree, u32 segno) +{ + int err; + + if (!btree_lookup32(&tree->segment_map, segno)) { + err = btree_insert32(&tree->segment_map, segno, (void *)1, + GFP_NOFS); + BUG_ON(err); + tree->no_shadowed_segments++; + } +} + +/** + * fill_shadow_tree - Propagate shadow tree changes due to a write + * @inode: Inode owning the page + * @page: Struct page that was written + * @shadow: Shadow for the current write + * + * Writes in logfs can result in two semi-valid objects. The old object + * is still valid as long as it can be reached by following pointers on + * the medium. Only when writes propagate all the way up to the journal + * has the new object safely replaced the old one. + * + * To handle this problem, a struct logfs_shadow is used to represent + * every single write. It is attached to the indirect block, which is + * marked dirty. When the indirect block is written, its shadows are + * handed up to the next indirect block (or inode). Untimately they + * will reach the master inode and be freed upon journal commit. + * + * This function handles a single step in the propagation. It adds the + * shadow for the current write to the tree, along with any shadows in + * the page's tree, in case it was an indirect block. If a page is + * written, the inode parameter is left NULL, if an inode is written, + * the page parameter is left NULL. + */ +static void fill_shadow_tree(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + struct shadow_tree *tree = &super->s_shadow_tree; + + if (PagePrivate(page)) { + if (block->alias_map) + super->s_no_object_aliases -= bitmap_weight( + block->alias_map, LOGFS_BLOCK_FACTOR); + logfs_handle_transaction(inode, block->ta); + block->ops->free_block(inode->i_sb, block); + } + if (shadow) { + if (shadow->old_ofs) + btree_insert64(&tree->old, shadow->old_ofs, shadow, + GFP_NOFS); + else + btree_insert64(&tree->new, shadow->new_ofs, shadow, + GFP_NOFS); + + super->s_dirty_used_bytes += shadow->new_len; + super->s_dirty_free_bytes += shadow->old_len; + mark_segment(tree, shadow->old_ofs >> super->s_segshift); + mark_segment(tree, shadow->new_ofs >> super->s_segshift); + } +} + +static void logfs_set_alias(struct super_block *sb, struct logfs_block *block, + long child_no) +{ + struct logfs_super *super = logfs_super(sb); + + if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) { + /* Aliases in the master inode are pointless. */ + return; + } + + if (!test_bit(child_no, block->alias_map)) { + set_bit(child_no, block->alias_map); + super->s_no_object_aliases++; + } + list_move_tail(&block->alias_list, &super->s_object_alias); +} + +/* + * Object aliases can and often do change the size and occupied space of a + * file. So not only do we have to change the pointers, we also have to + * change inode->i_size and li->li_used_bytes. Which is done by setting + * another two object aliases for the inode itself. + */ +static void set_iused(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (shadow->new_len == shadow->old_len) + return; + + alloc_inode_block(inode); + li->li_used_bytes += shadow->new_len - shadow->old_len; + __logfs_set_blocks(inode); + logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); + logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); +} + +static int logfs_write_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int full, err = 0; + + logfs_unpack_index(page->index, &bix, &level); + if (wc->ofs == 0) + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + shadow = alloc_shadow(inode, bix, level, wc->ofs); + if (wc->flags & WF_WRITE) + err = logfs_segment_write(inode, page, shadow); + if (wc->flags & WF_DELETE) + logfs_segment_delete(inode, shadow); + if (err) { + free_shadow(inode, shadow); + return err; + } + + set_iused(inode, shadow); + full = 1; + if (level != 0) { + alloc_indirect_block(inode, page, 0); + full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; + } + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + if (wc->ofs && full) + wc->ofs |= LOGFS_FULLY_POPULATED; + return 0; +} + +static int logfs_write_direct(struct inode *inode, struct page *page, + long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[page->index], + .flags = flags, + }; + int err; + + alloc_inode_block(inode); + + err = logfs_write_i0(inode, page, &wc); + if (err) + return err; + + li->li_data[page->index] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + page->index + INODE_POINTER_OFS); + return 0; +} + +static int ptr_change(u64 ofs, struct page *page) +{ + struct logfs_block *block = logfs_block(page); + int empty0, empty1, full0, full1; + + empty0 = ofs == 0; + empty1 = block->partial == 0; + if (empty0 != empty1) + return 1; + + /* The !! is necessary to shrink result to int */ + full0 = !!(ofs & LOGFS_FULLY_POPULATED); + full1 = block->full == LOGFS_BLOCK_FACTOR; + if (full0 != full1) + return 1; + return 0; +} + +static int __logfs_write_rec(struct inode *inode, struct page *page, + struct write_control *this_wc, + pgoff_t bix, level_t target_level, level_t level) +{ + int ret, page_empty = 0; + int child_no = get_bits(bix, SUBLEVEL(level)); + struct page *ipage; + struct write_control child_wc = { + .flags = this_wc->flags, + }; + + ipage = logfs_get_write_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + if (this_wc->ofs) { + ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (ret) + goto out; + } else if (!PageUptodate(ipage)) { + page_empty = 1; + logfs_read_empty(ipage); + } + + child_wc.ofs = block_get_pointer(ipage, child_no); + + if ((__force u8)level-1 > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &child_wc, bix, + target_level, SUBLEVEL(level)); + else + ret = logfs_write_i0(inode, page, &child_wc); + + if (ret) + goto out; + + alloc_indirect_block(inode, ipage, page_empty); + block_set_pointer(ipage, child_no, child_wc.ofs); + /* FIXME: first condition seems superfluous */ + if (child_wc.ofs || logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + /* the condition on this_wc->ofs ensures that we won't consume extra + * space for indirect blocks in the future, which we cannot reserve */ + if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) + ret = logfs_write_i0(inode, ipage, this_wc); + else + logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); +out: + logfs_put_write_page(ipage); + return ret; +} + +static int logfs_write_rec(struct inode *inode, struct page *page, + pgoff_t bix, level_t target_level, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + .flags = flags, + }; + int ret; + + alloc_inode_block(inode); + + if (li->li_height > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &wc, bix, target_level, + LEVEL(li->li_height)); + else + ret = logfs_write_i0(inode, page, &wc); + if (ret) + return ret; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) { + li->li_data[INDIRECT_INDEX] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + INDIRECT_INDEX + INODE_POINTER_OFS); + } + return ret; +} + +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + alloc_inode_block(inode); + logfs_inode(inode)->li_block->ta = ta; +} + +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + struct logfs_block *block = logfs_inode(inode)->li_block; + + if (block && block->ta) + block->ta = NULL; +} + +static int grow_inode(struct inode *inode, u64 bix, level_t level) +{ + struct logfs_inode *li = logfs_inode(inode); + u8 height = (__force u8)level; + struct page *page; + struct write_control wc = { + .flags = WF_WRITE, + }; + int err; + + BUG_ON(height > 5 || li->li_height > 5); + while (height > li->li_height || bix >= maxbix(li->li_height)) { + page = logfs_get_write_page(inode, I0_BLOCKS + 1, + LEVEL(li->li_height + 1)); + if (!page) + return -ENOMEM; + logfs_read_empty(page); + alloc_indirect_block(inode, page, 1); + block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); + err = logfs_write_i0(inode, page, &wc); + logfs_put_write_page(page); + if (err) + return err; + li->li_data[INDIRECT_INDEX] = wc.ofs; + wc.ofs = 0; + li->li_height++; + logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); + } + return 0; +} + +static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + pgoff_t index = page->index; + u64 bix; + level_t level; + int err; + + flags |= WF_WRITE | WF_DELETE; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + logfs_unpack_index(index, &bix, &level); + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + super->s_dirty_pages -= logfs_block(page)->reserved_bytes; + + if (index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + + bix = adjust_bix(bix, level); + err = grow_inode(inode, bix, level); + if (err) + return err; + return logfs_write_rec(inode, page, bix, level, flags); +} + +int logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, page, flags & WF_LOCK); + ret = __logfs_write_buf(inode, page, flags); + logfs_put_wblocks(sb, page, flags & WF_LOCK); + return ret; +} + +static int __logfs_delete(struct inode *inode, struct page *page) +{ + long flags = WF_DELETE; + int err; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + if (page->index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + err = grow_inode(inode, page->index, 0); + if (err) + return err; + return logfs_write_rec(inode, page, page->index, 0, flags); +} + +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree) +{ + struct super_block *sb = inode->i_sb; + struct page *page; + int ret; + + page = logfs_get_read_page(inode, index, 0); + if (!page) + return -ENOMEM; + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_read_page(page); + + return ret; +} + +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags) +{ + level_t level = shrink_level(gc_level); + struct page *page; + int err; + + page = logfs_get_write_page(inode, bix, level); + if (!page) + return -ENOMEM; + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (!err) { + if (level != 0) + alloc_indirect_block(inode, page, 0); + err = logfs_write_buf(inode, page, flags); + if (!err && shrink_level(gc_level) == 0) { + /* Rewrite cannot mark the inode dirty but has to + * write it immediately. + * Q: Can't we just create an alias for the inode + * instead? And if not, why not? + */ + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode->i_sb); + else { + err = __logfs_write_inode(inode, page, flags); + } + } + } + logfs_put_write_page(page); + return err; +} + +static int truncate_data_block(struct inode *inode, struct page *page, + u64 ofs, struct logfs_shadow *shadow, u64 size) +{ + loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; + u64 bix; + level_t level; + int err; + + /* Does truncation happen within this page? */ + if (size <= pageofs || size - pageofs >= PAGE_SIZE) + return 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + + zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + return logfs_segment_write(inode, page, shadow); +} + +static int logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc, u64 size) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + err = truncate_data_block(inode, page, wc->ofs, shadow, size); + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + set_iused(inode, shadow); + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc; + struct page *page; + int e; + int err; + + alloc_inode_block(inode); + + for (e = I0_BLOCKS - 1; e >= 0; e--) { + if (size > (e+1) * LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } + err = logfs_truncate_i0(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 __logfs_step[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 __logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static inline u64 logfs_step(level_t level) +{ + return __logfs_step[(__force u8)level]; +} + +static inline u64 logfs_factor(u8 level) +{ + return __logfs_step[level] * LOGFS_BLOCKSIZE; +} + +static inline u64 logfs_start_index(level_t level) +{ + return __logfs_start_index[(__force u8)level]; +} + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index(SUBLEVEL(*level))) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e, err = 0; + u64 bix, child_bix, next_bix; + level_t level; + struct page *page; + struct write_control child_wc = { /* FIXME: flags */ }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e * logfs_step(SUBLEVEL(level)); + next_bix = child_bix + logfs_step(SUBLEVEL(level)); + if (size > next_bix * LOGFS_BLOCKSIZE) + break; + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); + if (!page) + return -ENOMEM; + + if ((__force u8)level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = logfs_truncate_i0(inode, page, &child_wc, size); + logfs_put_write_page(page); + if (err) + return err; + + truncate_happened = 1; + alloc_indirect_block(inode, ipage, 0); + block_set_pointer(ipage, e, child_wc.ofs); + } + + if (!truncate_happened) { + printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); + return 0; + } + + this_wc->flags = WF_DELETE; + if (logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + + return logfs_write_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + }; + struct page *page; + int err; + + alloc_inode_block(inode); + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) + li->li_data[INDIRECT_INDEX] = wc.ofs; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + int ret; + + if (size >= logfs_factor(logfs_inode(inode)->li_height)) + return 0; + + ret = logfs_truncate_rec(inode, size); + if (ret) + return ret; + + return logfs_truncate_direct(inode, size); +} + +/* + * Truncate, by changing the segment file, can consume a fair amount + * of resources. So back off from time to time and do some GC. + * 8 or 2048 blocks should be well within safety limits even if + * every single block resided in a different segment. + */ +#define TRUNCATE_STEP (8 * 1024 * 1024) +int logfs_truncate(struct inode *inode, u64 target) +{ + struct super_block *sb = inode->i_sb; + u64 size = i_size_read(inode); + int err = 0; + + size = ALIGN(size, TRUNCATE_STEP); + while (size > target) { + if (size > TRUNCATE_STEP) + size -= TRUNCATE_STEP; + else + size = 0; + if (size < target) + size = target; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = __logfs_write_inode(inode, NULL, 0); + logfs_put_wblocks(sb, NULL, 1); + } + + if (!err) { + err = inode_newsize_ok(inode, target); + if (err) + goto out; + + truncate_setsize(inode, target); + } + + out: + /* I don't trust error recovery yet. */ + WARN_ON(err); + return err; +} + +static void move_page_to_inode(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = logfs_block(page); + + if (!block) + return; + + log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(li->li_block); + block->ops = &inode_block_ops; + block->inode = inode; + li->li_block = block; + + block->page = NULL; + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } +} + +static void move_inode_to_page(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + + if (!block) + return; + + log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(PagePrivate(page)); + block->ops = &indirect_block_ops; + block->page = page; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long) block); + } + + block->inode = NULL; + li->li_block = NULL; +} + +int logfs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct inode *master_inode = super->s_master_inode; + struct page *page; + struct logfs_disk_inode *di; + u64 ino = inode->i_ino; + + if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) + return -ENODATA; + if (!logfs_exist_block(master_inode, ino)) + return -ENODATA; + + page = read_cache_page(master_inode->i_mapping, ino, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + di = kmap_atomic(page); + logfs_disk_to_inode(di, inode); + kunmap_atomic(di); + move_page_to_inode(inode, page); + page_cache_release(page); + return 0; +} + +/* Caller must logfs_put_write_page(page); */ +static struct page *inode_to_page(struct inode *inode) +{ + struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; + struct logfs_disk_inode *di; + struct page *page; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return NULL; + + di = kmap_atomic(page); + logfs_inode_to_disk(inode, di); + kunmap_atomic(di); + move_inode_to_page(page, inode); + return page; +} + +static int do_write_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; + struct page *page; + int err; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + /* FIXME: lock inode */ + + if (i_size_read(master_inode) < size) + i_size_write(master_inode, size); + + /* TODO: Tell vfs this inode is clean now */ + + page = inode_to_page(inode); + if (!page) + return -ENOMEM; + + /* FIXME: transaction is part of logfs_block now. Is that enough? */ + err = logfs_write_buf(master_inode, page, 0); + if (err) + move_page_to_inode(inode, page); + + logfs_put_write_page(page); + return err; +} + +static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, + int write, + void (*change_se)(struct logfs_segment_entry *, long), + long arg) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + struct logfs_segment_entry *se; + pgoff_t page_no; + int child_no; + + page_no = segno >> (sb->s_blocksize_bits - 3); + child_no = segno & ((sb->s_blocksize >> 3) - 1); + + inode = super->s_segfile_inode; + page = logfs_get_write_page(inode, page_no, 0); + BUG_ON(!page); /* FIXME: We need some reserve page for this case */ + if (!PageUptodate(page)) + logfs_read_block(inode, page, WRITE); + + if (write) + alloc_indirect_block(inode, page, 0); + se = kmap_atomic(page); + change_se(se + child_no, arg); + if (write) { + logfs_set_alias(sb, logfs_block(page), child_no); + BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); + } + kunmap_atomic(se); + + logfs_put_write_page(page); +} + +static void __get_segment_entry(struct logfs_segment_entry *se, long _target) +{ + struct logfs_segment_entry *target = (void *)_target; + + *target = *se; +} + +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se) +{ + logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); +} + +static void __set_segment_used(struct logfs_segment_entry *se, long increment) +{ + u32 valid; + + valid = be32_to_cpu(se->valid); + valid += increment; + se->valid = cpu_to_be32(valid); +} + +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno = ofs >> super->s_segshift; + + if (!increment) + return; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); +} + +static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) +{ + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level) +{ + u32 ec_level = ec << 4 | (__force u8)gc_level; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); +} + +static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) +{ + se->valid = cpu_to_be32(RESERVED); +} + +void logfs_set_segment_reserved(struct super_block *sb, u32 segno) +{ + logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); +} + +static void __set_segment_unreserved(struct logfs_segment_entry *se, + long ec_level) +{ + se->valid = 0; + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) +{ + u32 ec_level = ec << 4; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, + ec_level); +} + +int __logfs_write_inode(struct inode *inode, struct page *page, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, page, flags & WF_LOCK); + ret = do_write_inode(inode); + logfs_put_wblocks(sb, page, flags & WF_LOCK); + return ret; +} + +static int do_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + struct page *page; + int ret; + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return -ENOMEM; + + move_inode_to_page(page, inode); + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(master_inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_write_page(page); + return ret; +} + +/* + * ZOMBIE inodes have already been deleted before and should remain dead, + * if it weren't for valid checking. No need to kill them again here. + */ +void logfs_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + struct page *page; + + if (!inode->i_nlink) { + if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { + li->li_flags |= LOGFS_IF_ZOMBIE; + if (i_size_read(inode) > 0) + logfs_truncate(inode, 0); + do_delete_inode(inode); + } + } + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + /* Cheaper version of write_inode. All changes are concealed in + * aliases, which are moved back. No write to the medium happens. + */ + /* Only deleted files may be dirty at this point */ + BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); + if (!block) + return; + if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { + block->ops->free_block(inode->i_sb, block); + return; + } + + page = inode_to_page(inode); + BUG_ON(!page); /* FIXME: Use emergency page */ + logfs_put_write_page(page); +} + +void btree_write_block(struct logfs_block *block) +{ + struct inode *inode; + struct page *page; + int err, cookie; + + inode = logfs_safe_iget(block->sb, block->ino, &cookie); + page = logfs_get_write_page(inode, block->bix, block->level); + + err = logfs_readpage_nolock(page); + BUG_ON(err); + BUG_ON(!PagePrivate(page)); + BUG_ON(logfs_block(page) != block); + err = __logfs_write_buf(inode, page, 0); + BUG_ON(err); + BUG_ON(PagePrivate(page) || page->private); + + logfs_put_write_page(page); + logfs_safe_iput(inode, cookie); +} + +/** + * logfs_inode_write - write inode or dentry objects + * + * @inode: parent inode (ifile or directory) + * @buf: object to write (inode or dentry) + * @count: object size + * @bix: block index + * @flags: write flags + * @shadow_tree: shadow below this inode + * + * FIXME: All caller of this put a 200-300 byte variable on the stack, + * only to call here and do a memcpy from that stack variable. A good + * example of wasted performance and stack space. + */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_write_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page); + memcpy(pagebuf, buf, count); + flush_dcache_page(page); + kunmap_atomic(pagebuf); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, flags); + logfs_put_write_page(page); + return err; +} + +int logfs_open_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + + inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_segfile_inode = inode; + return 0; +} + +int logfs_init_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int min_fill = 3 * super->s_no_blocks; + + INIT_LIST_HEAD(&super->s_object_alias); + INIT_LIST_HEAD(&super->s_writeback_list); + mutex_init(&super->s_write_mutex); + super->s_block_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + logfs_mempool_destroy(super->s_block_pool); + logfs_mempool_destroy(super->s_shadow_pool); +} diff --git a/kernel/fs/logfs/segment.c b/kernel/fs/logfs/segment.c new file mode 100644 index 000000000..7f9b096d8 --- /dev/null +++ b/kernel/fs/logfs/segment.c @@ -0,0 +1,961 @@ +/* + * fs/logfs/segment.c - Handling the Object Store + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Object store or ostore makes up the complete device with exception of + * the superblock and journal areas. Apart from its own metadata it stores + * three kinds of objects: inodes, dentries and blocks, both data and indirect. + */ +#include "logfs.h" +#include + +static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int err; + + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + if (err) + return err; + logfs_super(sb)->s_bad_segments++; + /* FIXME: write to journal */ + return 0; +} + +int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec++; + + return super->s_devops->erase(sb, (u64)segno << super->s_segshift, + super->s_segsize, ensure_erase); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) +{ + s32 ofs; + + logfs_open_area(area, bytes); + + ofs = area->a_used_bytes; + area->a_used_bytes += bytes; + BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, + int use_filler) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = super->s_devops->readpage; + struct page *page; + + BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + if (use_filler) + page = read_cache_page(mapping, index, filler, sb); + else { + page = find_or_create_page(mapping, index, GFP_NOFS); + if (page) + unlock_page(page); + } + return page; +} + +int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + /* Only logfs_wbuf_recover may use len==0 */ + BUG_ON(!len && !use_filler); + do { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(area->a_sb, index, use_filler); + if (IS_ERR(page)) + return PTR_ERR(page); + BUG_ON(!page); /* FIXME: reserve a pool */ + SetPageUptodate(page); + memcpy(page_address(page) + offset, buf, copylen); + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } while (len); + return 0; +} + +static void pad_partial_page(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct page *page; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + pgoff_t index = ofs >> PAGE_SHIFT; + long offset = ofs & (PAGE_SIZE-1); + u32 len = PAGE_SIZE - offset; + + if (len % PAGE_SIZE) { + page = get_mapping_page(sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page) + offset, 0xff, len); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } + page_cache_release(page); + } +} + +static void pad_full_pages(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + u32 len = super->s_segsize - area->a_used_bytes; + pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT; + pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT; + struct page *page; + + while (no_indizes) { + page = get_mapping_page(sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + SetPageUptodate(page); + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } + page_cache_release(page); + index++; + no_indizes--; + } +} + +/* + * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. + * Also make sure we allocate (and memset) all pages for final writeout. + */ +static void pad_wbuf(struct logfs_area *area, int final) +{ + pad_partial_page(area); + if (final) + pad_full_pages(area); +} + +/* + * We have to be careful with the alias tree. Since lookup is done by bix, + * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with + * indirect blocks. So always use it through accessor functions. + */ +static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, + level_t level) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_lookup128(head, ino, index); +} + +static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, + level_t level, void *val) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_insert128(head, ino, index, val, GFP_NOFS); +} + +static int btree_write_alias(struct super_block *sb, struct logfs_block *block, + write_alias_t *write_one_alias) +{ + struct object_alias_item *item; + int err; + + list_for_each_entry(item, &block->item_list, list) { + err = write_alias_journal(sb, block->ino, block->bix, + block->level, item->child_no, item->val); + if (err) + return err; + } + return 0; +} + +static struct logfs_block_ops btree_block_ops = { + .write_block = btree_write_block, + .free_block = __free_block, + .write_alias = btree_write_alias, +}; + +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + struct object_alias_item *item; + u64 ino, bix; + level_t level; + int i, err; + + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + count /= sizeof(*oa); + for (i = 0; i < count; i++) { + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + if (!item) + return -ENOMEM; + memset(item, 0, sizeof(*item)); + + super->s_no_object_aliases++; + item->val = oa[i].val; + item->child_no = be16_to_cpu(oa[i].child_no); + + ino = be64_to_cpu(oa[i].ino); + bix = be64_to_cpu(oa[i].bix); + level = LEVEL(oa[i].level); + + log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", + ino, bix, level, item->child_no, + be64_to_cpu(item->val)); + block = alias_tree_lookup(sb, ino, bix, level); + if (!block) { + block = __alloc_block(sb, ino, bix, level); + block->ops = &btree_block_ops; + err = alias_tree_insert(sb, ino, bix, level, block); + BUG_ON(err); /* mempool empty */ + } + if (test_and_set_bit(item->child_no, block->alias_map)) { + printk(KERN_ERR"LogFS: Alias collision detected\n"); + return -EIO; + } + list_move_tail(&block->alias_list, &super->s_object_alias); + list_add(&item->list, &block->item_list); + } + return 0; +} + +static void kill_alias(void *_block, unsigned long ignore0, + u64 ignore1, u64 ignore2, size_t ignore3) +{ + struct logfs_block *block = _block; + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + + while (!list_empty(&block->item_list)) { + item = list_entry(block->item_list.next, typeof(*item), list); + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->ops->free_block(sb, block); +} + +static int obj_type(struct inode *inode, level_t level) +{ + if (level == 0) { + if (S_ISDIR(inode->i_mode)) + return OBJ_DENTRY; + if (inode->i_ino == LOGFS_INO_MASTER) + return OBJ_INODE; + } + return OBJ_BLOCK; +} + +static int obj_len(struct super_block *sb, int obj_type) +{ + switch (obj_type) { + case OBJ_DENTRY: + return sizeof(struct logfs_disk_dentry); + case OBJ_INODE: + return sizeof(struct logfs_disk_inode); + case OBJ_BLOCK: + return sb->s_blocksize; + default: + BUG(); + } +} + +static int __logfs_segment_write(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len, int compr) +{ + struct logfs_area *area; + struct super_block *sb = inode->i_sb; + s64 ofs; + struct logfs_object_header h; + int acc_len; + + if (shadow->gc_level == 0) + acc_len = len; + else + acc_len = obj_len(sb, type); + + area = get_area(sb, shadow->gc_level); + ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); + LOGFS_BUG_ON(ofs <= 0, sb); + /* + * Order is important. logfs_get_free_bytes(), by modifying the + * segment file, may modify the content of the very page we're about + * to write now. Which is fine, as long as the calculated crc and + * written data still match. So do the modifications _before_ + * calculating the crc. + */ + + h.len = cpu_to_be16(len); + h.type = type; + h.compr = compr; + h.ino = cpu_to_be64(inode->i_ino); + h.bix = cpu_to_be64(shadow->bix); + h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); + h.data_crc = logfs_crc32(buf, len, 0); + + logfs_buf_write(area, ofs, &h, sizeof(h)); + logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); + + shadow->new_ofs = ofs; + shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; + + return 0; +} + +static s64 logfs_segment_write_compress(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + ssize_t compr_len; + int ret; + + mutex_lock(&logfs_super(sb)->s_journal_mutex); + compr_len = logfs_compress(buf, compressor_buf, len, len); + + if (compr_len >= 0) { + ret = __logfs_segment_write(inode, compressor_buf, shadow, + type, compr_len, COMPR_ZLIB); + } else { + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + } + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + return ret; +} + +/** + * logfs_segment_write - write data block to object store + * @inode: inode containing data + * + * Returns an errno or zero. + */ +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + int do_compress, type, len; + int ret; + void *buf; + + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; + if (shadow->gc_level != 0) { + /* temporarily disable compression for indirect blocks */ + do_compress = 0; + } + + type = obj_type(inode, shrink_level(shadow->gc_level)); + len = obj_len(sb, type); + buf = kmap(page); + if (do_compress) + ret = logfs_segment_write_compress(inode, buf, shadow, type, + len); + else + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + kunmap(page); + + log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + /* this BUG_ON did catch a locking bug. useful */ + BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); + return ret; +} + +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + while (len) { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(sb, index, 1); + if (IS_ERR(page)) + return PTR_ERR(page); + memcpy(buf, page_address(page) + offset, copylen); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } + return 0; +} + +/* + * The "position" of indirect blocks is ambiguous. It can be the position + * of any data block somewhere behind this indirect block. So we need to + * normalize the positions through logfs_block_mask() before comparing. + */ +static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) +{ + return (pos1 & logfs_block_mask(sb, level)) != + (pos2 & logfs_block_mask(sb, level)); +} + +#if 0 +static int read_seg_header(struct super_block *sb, u64 ofs, + struct logfs_segment_header *sh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*sh), sh); + if (err) + return err; + crc = logfs_crc32(sh, sizeof(*sh), 4); + if (crc != sh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(sh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} +#endif + +static int read_obj_header(struct super_block *sb, u64 ofs, + struct logfs_object_header *oh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*oh), oh); + if (err) + return err; + crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); + if (crc != oh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(oh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} + +static void move_btree_to_page(struct inode *inode, struct page *page, + __be64 *data) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct btree_head128 *head = &super->s_object_alias_tree; + struct logfs_block *block; + struct object_alias_item *item, *next; + + if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) + return; + + block = btree_remove128(head, inode->i_ino, page->index); + if (!block) + return; + + log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + list_for_each_entry_safe(item, next, &block->item_list, list) { + data[item->child_no] = item->val; + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->page = page; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long) block); + } + block->ops = &indirect_block_ops; + initialize_block_counters(page, block, data, 0); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. + */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +void move_page_to_btree(struct page *page) +{ + struct logfs_block *block = logfs_block(page); + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + unsigned long pos; + __be64 *child; + int err; + + if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { + block->ops->free_block(sb, block); + return; + } + log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + break; + + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + BUG_ON(!item); /* mempool empty */ + memset(item, 0, sizeof(*item)); + + child = kmap_atomic(page); + item->val = child[pos]; + kunmap_atomic(child); + item->child_no = pos; + list_add(&item->list, &block->item_list); + } + block->page = NULL; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } + block->ops = &btree_block_ops; + err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, + block); + BUG_ON(err); /* mempool empty */ + ClearPageUptodate(page); +} + +static int __logfs_segment_read(struct inode *inode, void *buf, + u64 ofs, u64 bix, level_t level) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + struct logfs_object_header oh; + __be32 crc; + u16 len; + int err, block_len; + + block_len = obj_len(sb, obj_type(inode, level)); + err = read_obj_header(sb, ofs, &oh); + if (err) + goto out_err; + + err = -EIO; + if (be64_to_cpu(oh.ino) != inode->i_ino + || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { + printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " + "expected (%lx, %llx), got (%llx, %llx)\n", + ofs, inode->i_ino, bix, + be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); + goto out_err; + } + + len = be16_to_cpu(oh.len); + + switch (oh.compr) { + case COMPR_NONE: + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); + if (err) + goto out_err; + crc = logfs_crc32(buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: uncompressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + goto out_err; + } + break; + case COMPR_ZLIB: + mutex_lock(&logfs_super(sb)->s_journal_mutex); + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, + compressor_buf); + if (err) { + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + crc = logfs_crc32(compressor_buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: compressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + err = logfs_uncompress(compressor_buf, buf, len, block_len); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + if (err) { + printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); + goto out_err; + } + break; + default: + LOGFS_BUG(sb); + err = -EIO; + goto out_err; + } + return 0; + +out_err: + logfs_set_ro(sb); + printk(KERN_ERR"LOGFS: device is read-only now\n"); + LOGFS_BUG(sb); + return err; +} + +/** + * logfs_segment_read - read data block from object store + * @inode: inode containing data + * @buf: data buffer + * @ofs: physical data offset + * @bix: block index + * @level: block level + * + * Returns 0 on success or a negative errno. + */ +int logfs_segment_read(struct inode *inode, struct page *page, + u64 ofs, u64 bix, level_t level) +{ + int err; + void *buf; + + if (PageUptodate(page)) + return 0; + + ofs &= ~LOGFS_FULLY_POPULATED; + + buf = kmap(page); + err = __logfs_segment_read(inode, buf, ofs, bix, level); + if (!err) { + move_btree_to_page(inode, page, buf); + SetPageUptodate(page); + } + kunmap(page); + log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", + inode->i_ino, bix, level, ofs, err); + return err; +} + +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct logfs_object_header h; + u16 len; + int err; + + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); + if (!shadow->old_ofs) + return 0; + + log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + err = read_obj_header(sb, shadow->old_ofs, &h); + LOGFS_BUG_ON(err, sb); + LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); + LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), + shrink_level(shadow->gc_level)), sb); + + if (shadow->gc_level == 0) + len = be16_to_cpu(h.len); + else + len = obj_len(sb, h.type); + shadow->old_len = len + sizeof(h); + return 0; +} + +void freeseg(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + u64 ofs, start, end; + + start = dev_ofs(sb, segno, 0); + end = dev_ofs(sb, segno + 1, 0); + for (ofs = start; ofs < end; ofs += PAGE_SIZE) { + page = find_get_page(mapping, ofs >> PAGE_SHIFT); + if (!page) + continue; + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + } + page_cache_release(page); + } +} + +int logfs_open_area(struct logfs_area *area, size_t bytes) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + int err, closed = 0; + + if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) + return 0; + + if (area->a_is_open) { + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = super->s_segsize - area->a_written_bytes; + + log_gc("logfs_close_area(%x)\n", area->a_segno); + pad_wbuf(area, 1); + super->s_devops->writeseg(area->a_sb, ofs, len); + freeseg(sb, area->a_segno); + closed = 1; + } + + area->a_used_bytes = 0; + area->a_written_bytes = 0; +again: + area->a_ops->get_free_segment(area); + area->a_ops->get_erase_count(area); + + log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); + err = area->a_ops->erase_segment(area); + if (err) { + printk(KERN_WARNING "LogFS: Error erasing segment %x\n", + area->a_segno); + logfs_mark_segment_bad(sb, area->a_segno); + goto again; + } + area->a_is_open = 1; + return closed; +} + +void logfs_sync_area(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = (area->a_used_bytes - area->a_written_bytes); + + if (super->s_writesize) + len &= ~(super->s_writesize - 1); + if (len == 0) + return; + pad_wbuf(area, 0); + super->s_devops->writeseg(sb, ofs, len); + area->a_written_bytes += len; +} + +void logfs_sync_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + logfs_sync_area(super->s_area[i]); +} + +/* + * Pick a free segment to be used for this area. Effectively takes a + * candidate from the free list (not really a candidate anymore). + */ +static void ostore_get_free_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + + if (super->s_free_list.count == 0) { + printk(KERN_ERR"LOGFS: ran out of free segments\n"); + LOGFS_BUG(sb); + } + + area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); +} + +static void ostore_get_erase_count(struct logfs_area *area) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(area->a_sb, area->a_segno, &se); + BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)); + + ec_level = be32_to_cpu(se.ec_level); + area->a_erase_count = (ec_level >> 4) + 1; +} + +static int ostore_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno, 0); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_OSTORE; + sh.level = (__force u8)area->a_level; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, + area->a_level); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(sh); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static const struct logfs_area_ops ostore_area_ops = { + .get_free_segment = ostore_get_free_segment, + .get_erase_count = ostore_get_erase_count, + .erase_segment = ostore_erase_segment, +}; + +static void free_area(struct logfs_area *area) +{ + if (area) + freeseg(area->a_sb, area->a_segno); + kfree(area); +} + +void free_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); +} + +static struct logfs_area *alloc_area(struct super_block *sb) +{ + struct logfs_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + area->a_sb = sb; + return area; +} + +static void map_invalidatepage(struct page *page, unsigned int o, + unsigned int l) +{ + return; +} + +static int map_releasepage(struct page *page, gfp_t g) +{ + /* Don't release these pages */ + return 0; +} + +static const struct address_space_operations mapping_aops = { + .invalidatepage = map_invalidatepage, + .releasepage = map_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +int logfs_init_mapping(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping; + struct inode *inode; + + inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_mapping_inode = inode; + mapping = inode->i_mapping; + mapping->a_ops = &mapping_aops; + /* Would it be possible to use __GFP_HIGHMEM as well? */ + mapping_set_gfp_mask(mapping, GFP_NOFS); + return 0; +} + +int logfs_init_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i = -1; + + super->s_alias_pool = mempool_create_kmalloc_pool(600, + sizeof(struct object_alias_item)); + if (!super->s_alias_pool) + return -ENOMEM; + + super->s_journal_area = alloc_area(sb); + if (!super->s_journal_area) + goto err; + + for_each_area(i) { + super->s_area[i] = alloc_area(sb); + if (!super->s_area[i]) + goto err; + super->s_area[i]->a_level = GC_LEVEL(i); + super->s_area[i]->a_ops = &ostore_area_ops; + } + btree_init_mempool128(&super->s_object_alias_tree, + super->s_btree_pool); + return 0; + +err: + for (i--; i >= 0; i--) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + logfs_mempool_destroy(super->s_alias_pool); + return -ENOMEM; +} + +void logfs_cleanup_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); +} diff --git a/kernel/fs/logfs/super.c b/kernel/fs/logfs/super.c new file mode 100644 index 000000000..54360293b --- /dev/null +++ b/kernel/fs/logfs/super.c @@ -0,0 +1,653 @@ +/* + * fs/logfs/super.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Generally contains mount/umount code and also serves as a dump area for + * any functions that don't fit elsewhere and neither justify a file of their + * own. + */ +#include "logfs.h" +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(emergency_mutex); +static struct page *emergency_page; + +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + struct page *page; + int err; + + page = read_cache_page(mapping, index, filler, NULL); + if (page) + return page; + + /* No more pages available, switch to emergency page */ + printk(KERN_INFO"Logfs: Using emergency page\n"); + mutex_lock(&emergency_mutex); + err = filler(NULL, emergency_page); + if (err) { + mutex_unlock(&emergency_mutex); + printk(KERN_EMERG"Logfs: Error reading emergency page\n"); + return ERR_PTR(err); + } + return emergency_page; +} + +void emergency_read_end(struct page *page) +{ + if (page == emergency_page) + mutex_unlock(&emergency_mutex); + else + page_cache_release(page); +} + +static void dump_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_entry se; + u32 segno; + + for (segno = 0; segno < super->s_no_segs; segno++) { + logfs_get_segment_entry(sb, segno, &se); + printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + printk("\n"); + } +} + +/* + * logfs_crash_dump - dump debug information to device + * + * The LogFS superblock only occupies part of a segment. This function will + * write as much debug information as it can gather into the spare space. + */ +void logfs_crash_dump(struct super_block *sb) +{ + dump_segfile(sb); +} + +/* + * FIXME: There should be a reserve for root, similar to ext2. + */ +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) +{ + struct super_block *sb = dentry->d_sb; + struct logfs_super *super = logfs_super(sb); + + stats->f_type = LOGFS_MAGIC_U32; + stats->f_bsize = sb->s_blocksize; + stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; + stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_files = 0; + stats->f_ffree = 0; + stats->f_namelen = LOGFS_MAX_NAMELEN; + return 0; +} + +static int logfs_sb_set(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + + sb->s_fs_info = super; + sb->s_mtd = super->s_mtd; + sb->s_bdev = super->s_bdev; +#ifdef CONFIG_BLOCK + if (sb->s_bdev) + sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; +#endif +#ifdef CONFIG_MTD + if (sb->s_mtd) + sb->s_bdi = sb->s_mtd->backing_dev_info; +#endif + return 0; +} + +static int logfs_sb_test(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + struct mtd_info *mtd = super->s_mtd; + + if (mtd && sb->s_mtd == mtd) + return 1; + if (super->s_bdev && sb->s_bdev == super->s_bdev) + return 1; + return 0; +} + +static void set_segment_header(struct logfs_segment_header *sh, u8 type, + u8 level, u32 segno, u32 ec) +{ + sh->pad = 0; + sh->type = type; + sh->level = level; + sh->segno = cpu_to_be32(segno); + sh->ec = cpu_to_be32(ec); + sh->gec = cpu_to_be64(segno); + sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); +} + +static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, + u32 segno, u32 ec) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header *sh = &ds->ds_sh; + int i; + + memset(ds, 0, sizeof(*ds)); + set_segment_header(sh, SEG_SUPER, 0, segno, ec); + + ds->ds_ifile_levels = super->s_ifile_levels; + ds->ds_iblock_levels = super->s_iblock_levels; + ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ + ds->ds_segment_shift = super->s_segshift; + ds->ds_block_shift = sb->s_blocksize_bits; + ds->ds_write_shift = super->s_writeshift; + ds->ds_filesystem_size = cpu_to_be64(super->s_size); + ds->ds_segment_size = cpu_to_be32(super->s_segsize); + ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); + ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); + ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); + ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); + ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); + ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); + ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); + journal_for_each(i) + ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); + ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); + ds->ds_crc = logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12); +} + +static int write_one_sb(struct super_block *sb, + struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super *ds; + struct logfs_segment_entry se; + struct page *page; + u64 ofs; + u32 ec, segno; + int err; + + page = find_sb(sb, &ofs); + if (!page) + return -EIO; + ds = page_address(page); + segno = seg_no(sb, ofs); + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + ec++; + logfs_set_segment_erased(sb, segno, ec, 0); + logfs_write_ds(sb, ds, segno, ec); + err = super->s_devops->write_sb(sb, page); + page_cache_release(page); + return err; +} + +int logfs_write_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + /* First superblock */ + err = write_one_sb(sb, super->s_devops->find_first_sb); + if (err) + return err; + + /* Last superblock */ + err = write_one_sb(sb, super->s_devops->find_last_sb); + if (err) + return err; + return 0; +} + +static int ds_cmp(const void *ds0, const void *ds1) +{ + size_t len = sizeof(struct logfs_disk_super); + + /* We know the segment headers differ, so ignore them */ + len -= LOGFS_SEGMENT_HEADERSIZE; + ds0 += LOGFS_SEGMENT_HEADERSIZE; + ds1 += LOGFS_SEGMENT_HEADERSIZE; + return memcmp(ds0, ds1, len); +} + +static int logfs_recover_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super _ds0, *ds0 = &_ds0; + struct logfs_disk_super _ds1, *ds1 = &_ds1; + int err, valid0, valid1; + + /* read first superblock */ + err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); + if (err) + return err; + /* read last superblock */ + err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); + if (err) + return err; + valid0 = logfs_check_ds(ds0) == 0; + valid1 = logfs_check_ds(ds1) == 0; + + if (!valid0 && valid1) { + printk(KERN_INFO"First superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_first_sb); + } + if (valid0 && !valid1) { + printk(KERN_INFO"Last superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + if (valid0 && valid1 && ds_cmp(ds0, ds1)) { + printk(KERN_INFO"Superblocks don't match - fixing.\n"); + return logfs_write_sb(sb); + } + /* If neither is valid now, something's wrong. Didn't we properly + * check them before?!? */ + BUG_ON(!valid0 && !valid1); + return 0; +} + +static int logfs_make_writeable(struct super_block *sb) +{ + int err; + + err = logfs_open_segfile(sb); + if (err) + return err; + + /* Repair any broken superblock copies */ + err = logfs_recover_sb(sb); + if (err) + return err; + + /* Check areas for trailing unaccounted data */ + err = logfs_check_areas(sb); + if (err) + return err; + + /* Do one GC pass before any data gets dirtied */ + logfs_gc_pass(sb); + + /* after all initializations are done, replay the journal + * for rw-mounts, if necessary */ + err = logfs_replay_journal(sb); + if (err) + return err; + + return 0; +} + +static int logfs_get_sb_final(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *rootdir; + int err; + + /* root dir */ + rootdir = logfs_iget(sb, LOGFS_INO_ROOT); + if (IS_ERR(rootdir)) + goto fail; + + sb->s_root = d_make_root(rootdir); + if (!sb->s_root) + goto fail; + + /* at that point we know that ->put_super() will be called */ + super->s_erase_page = alloc_pages(GFP_KERNEL, 0); + if (!super->s_erase_page) + return -ENOMEM; + memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); + + /* FIXME: check for read-only mounts */ + err = logfs_make_writeable(sb); + if (err) { + __free_page(super->s_erase_page); + return err; + } + + log_super("LogFS: Finished mounting\n"); + return 0; + +fail: + iput(super->s_master_inode); + iput(super->s_segfile_inode); + iput(super->s_mapping_inode); + return -EIO; +} + +int logfs_check_ds(struct logfs_disk_super *ds) +{ + struct logfs_segment_header *sh = &ds->ds_sh; + + if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) + return -EINVAL; + if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) + return -EINVAL; + if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12)) + return -EINVAL; + return 0; +} + +static struct page *find_super_block(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *first, *last; + + first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); + if (!first || IS_ERR(first)) + return NULL; + last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); + if (!last || IS_ERR(last)) { + page_cache_release(first); + return NULL; + } + + if (!logfs_check_ds(page_address(first))) { + page_cache_release(last); + return first; + } + + /* First one didn't work, try the second superblock */ + if (!logfs_check_ds(page_address(last))) { + page_cache_release(first); + return last; + } + + /* Neither worked, sorry folks */ + page_cache_release(first); + page_cache_release(last); + return NULL; +} + +static int __logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *page; + struct logfs_disk_super *ds; + int i; + + page = find_super_block(sb); + if (!page) + return -EINVAL; + + ds = page_address(page); + super->s_size = be64_to_cpu(ds->ds_filesystem_size); + super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); + super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); + super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); + super->s_segsize = 1 << ds->ds_segment_shift; + super->s_segmask = (1 << ds->ds_segment_shift) - 1; + super->s_segshift = ds->ds_segment_shift; + sb->s_blocksize = 1 << ds->ds_block_shift; + sb->s_blocksize_bits = ds->ds_block_shift; + super->s_writesize = 1 << ds->ds_write_shift; + super->s_writeshift = ds->ds_write_shift; + super->s_no_segs = super->s_size >> super->s_segshift; + super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; + super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); + super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); + super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); + super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); + + journal_for_each(i) + super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); + + super->s_ifile_levels = ds->ds_ifile_levels; + super->s_iblock_levels = ds->ds_iblock_levels; + super->s_data_levels = ds->ds_data_levels; + super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + + super->s_data_levels; + page_cache_release(page); + return 0; +} + +static int logfs_read_sb(struct super_block *sb, int read_only) +{ + struct logfs_super *super = logfs_super(sb); + int ret; + + super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); + if (!super->s_btree_pool) + return -ENOMEM; + + btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); + btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + btree_init_mempool32(&super->s_shadow_tree.segment_map, + super->s_btree_pool); + + ret = logfs_init_mapping(sb); + if (ret) + return ret; + + ret = __logfs_read_sb(sb); + if (ret) + return ret; + + if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT) + return -EIO; + if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) && + !read_only) + return -EIO; + + ret = logfs_init_rw(sb); + if (ret) + return ret; + + ret = logfs_init_areas(sb); + if (ret) + return ret; + + ret = logfs_init_gc(sb); + if (ret) + return ret; + + ret = logfs_init_journal(sb); + if (ret) + return ret; + + return 0; +} + +static void logfs_kill_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + log_super("LogFS: Start unmounting\n"); + /* Alias entries slow down mount, so evict as many as possible */ + sync_filesystem(sb); + logfs_write_anchor(sb); + free_areas(sb); + + /* + * From this point on alias entries are simply dropped - and any + * writes to the object store are considered bugs. + */ + log_super("LogFS: Now in shutdown\n"); + generic_shutdown_super(sb); + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; + + BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); + + logfs_cleanup_gc(sb); + logfs_cleanup_journal(sb); + logfs_cleanup_areas(sb); + logfs_cleanup_rw(sb); + if (super->s_erase_page) + __free_page(super->s_erase_page); + super->s_devops->put_device(super); + logfs_mempool_destroy(super->s_btree_pool); + logfs_mempool_destroy(super->s_alias_pool); + kfree(super); + log_super("LogFS: Finished unmounting\n"); +} + +static struct dentry *logfs_get_sb_device(struct logfs_super *super, + struct file_system_type *type, int flags) +{ + struct super_block *sb; + int err = -ENOMEM; + static int mount_count; + + log_super("LogFS: Start mount %x\n", mount_count++); + + err = -EINVAL; + sb = sget(type, logfs_sb_test, logfs_sb_set, flags | MS_NOATIME, super); + if (IS_ERR(sb)) { + super->s_devops->put_device(super); + kfree(super); + return ERR_CAST(sb); + } + + if (sb->s_root) { + /* Device is already in use */ + super->s_devops->put_device(super); + kfree(super); + return dget(sb->s_root); + } + + /* + * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache + * only covers 16TB and the upper 8TB are used for indirect blocks. + * On 64bit system we could bump up the limit, but that would make + * the filesystem incompatible with 32bit systems. + */ + sb->s_maxbytes = (1ull << 43) - 1; + sb->s_max_links = LOGFS_LINK_MAX; + sb->s_op = &logfs_super_operations; + + err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); + if (err) + goto err1; + + sb->s_flags |= MS_ACTIVE; + err = logfs_get_sb_final(sb); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + return dget(sb->s_root); + +err1: + /* no ->s_root, no ->put_super() */ + iput(super->s_master_inode); + iput(super->s_segfile_inode); + iput(super->s_mapping_inode); + deactivate_locked_super(sb); + return ERR_PTR(err); +} + +static struct dentry *logfs_mount(struct file_system_type *type, int flags, + const char *devname, void *data) +{ + ulong mtdnr; + struct logfs_super *super; + int err; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return ERR_PTR(-ENOMEM); + + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + + if (!devname) + err = logfs_get_sb_bdev(super, type, devname); + else if (strncmp(devname, "mtd", 3)) + err = logfs_get_sb_bdev(super, type, devname); + else { + char *garbage; + mtdnr = simple_strtoul(devname+3, &garbage, 0); + if (*garbage) + err = -EINVAL; + else + err = logfs_get_sb_mtd(super, mtdnr); + } + + if (err) { + kfree(super); + return ERR_PTR(err); + } + + return logfs_get_sb_device(super, type, flags); +} + +static struct file_system_type logfs_fs_type = { + .owner = THIS_MODULE, + .name = "logfs", + .mount = logfs_mount, + .kill_sb = logfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + +}; +MODULE_ALIAS_FS("logfs"); + +static int __init logfs_init(void) +{ + int ret; + + emergency_page = alloc_pages(GFP_KERNEL, 0); + if (!emergency_page) + return -ENOMEM; + + ret = logfs_compr_init(); + if (ret) + goto out1; + + ret = logfs_init_inode_cache(); + if (ret) + goto out2; + + ret = register_filesystem(&logfs_fs_type); + if (!ret) + return 0; + logfs_destroy_inode_cache(); +out2: + logfs_compr_exit(); +out1: + __free_pages(emergency_page, 0); + return ret; +} + +static void __exit logfs_exit(void) +{ + unregister_filesystem(&logfs_fs_type); + logfs_destroy_inode_cache(); + logfs_compr_exit(); + __free_pages(emergency_page, 0); +} + +module_init(logfs_init); +module_exit(logfs_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joern Engel "); +MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/kernel/fs/mbcache.c b/kernel/fs/mbcache.c new file mode 100644 index 000000000..187477ded --- /dev/null +++ b/kernel/fs/mbcache.c @@ -0,0 +1,858 @@ +/* + * linux/fs/mbcache.c + * (C) 2001-2002 Andreas Gruenbacher, + */ + +/* + * Filesystem Meta Information Block Cache (mbcache) + * + * The mbcache caches blocks of block devices that need to be located + * by their device/block number, as well as by other criteria (such + * as the block's contents). + * + * There can only be one cache entry in a cache per device and block number. + * Additional indexes need not be unique in this sense. The number of + * additional indexes (=other criteria) can be hardwired at compile time + * or specified at cache create time. + * + * Each cache entry is of fixed size. An entry may be `valid' or `invalid' + * in the cache. A valid entry is in the main hash tables of the cache, + * and may also be in the lru list. An invalid entry is not in any hashes + * or lists. + * + * A valid cache entry is only in the lru list if no handles refer to it. + * Invalid cache entries will be freed when the last handle to the cache + * entry is released. Entries that cannot be freed immediately are put + * back on the lru list. + */ + +/* + * Lock descriptions and usage: + * + * Each hash chain of both the block and index hash tables now contains + * a built-in lock used to serialize accesses to the hash chain. + * + * Accesses to global data structures mb_cache_list and mb_cache_lru_list + * are serialized via the global spinlock mb_cache_spinlock. + * + * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize + * accesses to its local data, such as e_used and e_queued. + * + * Lock ordering: + * + * Each block hash chain's lock has the highest lock order, followed by an + * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's + * lock), and mb_cach_spinlock, with the lowest order. While holding + * either a block or index hash chain lock, a thread can acquire an + * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. + * + * Synchronization: + * + * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and + * index hash chian, it needs to lock the corresponding hash chain. For each + * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to + * prevent either any simultaneous release or free on the entry and also + * to serialize accesses to either the e_used or e_queued member of the entry. + * + * To avoid having a dangling reference to an already freed + * mb_cache_entry, an mb_cache_entry is only freed when it is not on a + * block hash chain and also no longer being referenced, both e_used, + * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is + * first removed from a block hash chain. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef MB_CACHE_DEBUG +# define mb_debug(f...) do { \ + printk(KERN_DEBUG f); \ + printk("\n"); \ + } while (0) +#define mb_assert(c) do { if (!(c)) \ + printk(KERN_ERR "assertion " #c " failed\n"); \ + } while(0) +#else +# define mb_debug(f...) do { } while(0) +# define mb_assert(c) do { } while(0) +#endif +#define mb_error(f...) do { \ + printk(KERN_ERR f); \ + printk("\n"); \ + } while(0) + +#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) + +#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ + (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) + +static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); +static struct blockgroup_lock *mb_cache_bg_lock; +static struct kmem_cache *mb_cache_kmem_cache; + +MODULE_AUTHOR("Andreas Gruenbacher "); +MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(mb_cache_create); +EXPORT_SYMBOL(mb_cache_shrink); +EXPORT_SYMBOL(mb_cache_destroy); +EXPORT_SYMBOL(mb_cache_entry_alloc); +EXPORT_SYMBOL(mb_cache_entry_insert); +EXPORT_SYMBOL(mb_cache_entry_release); +EXPORT_SYMBOL(mb_cache_entry_free); +EXPORT_SYMBOL(mb_cache_entry_get); +#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) +EXPORT_SYMBOL(mb_cache_entry_find_first); +EXPORT_SYMBOL(mb_cache_entry_find_next); +#endif + +/* + * Global data: list of all mbcache's, lru list, and a spinlock for + * accessing cache data structures on SMP machines. The lru list is + * global across all mbcaches. + */ + +static LIST_HEAD(mb_cache_list); +static LIST_HEAD(mb_cache_lru_list); +static DEFINE_SPINLOCK(mb_cache_spinlock); + +static inline void +__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_lock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + +static inline void +__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + +static inline int +__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) +{ + return !hlist_bl_unhashed(&ce->e_block_list); +} + + +static inline void +__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_block_hashed(ce)) + hlist_bl_del_init(&ce->e_block_list); +} + +static inline int +__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +{ + return !hlist_bl_unhashed(&ce->e_index.o_list); +} + +static inline void +__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_index_hashed(ce)) + hlist_bl_del_init(&ce->e_index.o_list); +} + +/* + * __mb_cache_entry_unhash_unlock() + * + * This function is called to unhash both the block and index hash + * chain. + * It assumes both the block and index hash chain is locked upon entry. + * It also unlock both hash chains both exit + */ +static inline void +__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) +{ + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); +} + +static void +__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) +{ + struct mb_cache *cache = ce->e_cache; + + mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); + kmem_cache_free(cache->c_entry_cache, ce); + atomic_dec(&cache->c_entry_count); +} + +static void +__mb_cache_entry_release(struct mb_cache_entry *ce) +{ + /* First lock the entry to serialize access to its local data. */ + __spin_lock_mb_cache_entry(ce); + /* Wake up all processes queuing for this cache entry. */ + if (ce->e_queued) + wake_up_all(&mb_cache_queue); + if (ce->e_used >= MB_CACHE_WRITER) + ce->e_used -= MB_CACHE_WRITER; + /* + * Make sure that all cache entries on lru_list have + * both e_used and e_qued of 0s. + */ + ce->e_used--; + if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { + if (!__mb_cache_entry_is_block_hashed(ce)) { + __spin_unlock_mb_cache_entry(ce); + goto forget; + } + /* + * Need access to lru list, first drop entry lock, + * then reacquire the lock in the proper order. + */ + spin_lock(&mb_cache_spinlock); + if (list_empty(&ce->e_lru_list)) + list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + spin_unlock(&mb_cache_spinlock); + } + __spin_unlock_mb_cache_entry(ce); + return; +forget: + mb_assert(list_empty(&ce->e_lru_list)); + __mb_cache_entry_forget(ce, GFP_KERNEL); +} + +/* + * mb_cache_shrink_scan() memory pressure callback + * + * This function is called by the kernel memory management when memory + * gets low. + * + * @shrink: (ignored) + * @sc: shrink_control passed from reclaim + * + * Returns the number of objects freed. + */ +static unsigned long +mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(free_list); + struct mb_cache_entry *entry, *tmp; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + unsigned long freed = 0; + + mb_debug("trying to free %d entries", nr_to_scan); + spin_lock(&mb_cache_spinlock); + while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { + struct mb_cache_entry *ce = + list_entry(mb_cache_lru_list.next, + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* Prevent any find or get operation on the entry */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + list_add_tail(&ce->e_lru_list, &free_list); + spin_lock(&mb_cache_spinlock); + } + spin_unlock(&mb_cache_spinlock); + + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(entry, gfp_mask); + freed++; + } + return freed; +} + +static unsigned long +mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + struct mb_cache *cache; + unsigned long count = 0; + + spin_lock(&mb_cache_spinlock); + list_for_each_entry(cache, &mb_cache_list, c_cache_list) { + mb_debug("cache %s (%d)", cache->c_name, + atomic_read(&cache->c_entry_count)); + count += atomic_read(&cache->c_entry_count); + } + spin_unlock(&mb_cache_spinlock); + + return vfs_pressure_ratio(count); +} + +static struct shrinker mb_cache_shrinker = { + .count_objects = mb_cache_shrink_count, + .scan_objects = mb_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + +/* + * mb_cache_create() create a new cache + * + * All entries in one cache are equal size. Cache entries may be from + * multiple devices. If this is the first mbcache created, registers + * the cache with kernel memory management. Returns NULL if no more + * memory was available. + * + * @name: name of the cache (informal) + * @bucket_bits: log2(number of hash buckets) + */ +struct mb_cache * +mb_cache_create(const char *name, int bucket_bits) +{ + int n, bucket_count = 1 << bucket_bits; + struct mb_cache *cache = NULL; + + if (!mb_cache_bg_lock) { + mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), + GFP_KERNEL); + if (!mb_cache_bg_lock) + return NULL; + bgl_lock_init(mb_cache_bg_lock); + } + + cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); + if (!cache) + return NULL; + cache->c_name = name; + atomic_set(&cache->c_entry_count, 0); + cache->c_bucket_bits = bucket_bits; + cache->c_block_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); + if (!cache->c_block_hash) + goto fail; + for (n=0; nc_block_hash[n]); + cache->c_index_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); + if (!cache->c_index_hash) + goto fail; + for (n=0; nc_index_hash[n]); + if (!mb_cache_kmem_cache) { + mb_cache_kmem_cache = kmem_cache_create(name, + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!mb_cache_kmem_cache) + goto fail2; + } + cache->c_entry_cache = mb_cache_kmem_cache; + + /* + * Set an upper limit on the number of cache entries so that the hash + * chains won't grow too long. + */ + cache->c_max_entries = bucket_count << 4; + + spin_lock(&mb_cache_spinlock); + list_add(&cache->c_cache_list, &mb_cache_list); + spin_unlock(&mb_cache_spinlock); + return cache; + +fail2: + kfree(cache->c_index_hash); + +fail: + kfree(cache->c_block_hash); + kfree(cache); + return NULL; +} + + +/* + * mb_cache_shrink() + * + * Removes all cache entries of a device from the cache. All cache entries + * currently in use cannot be freed, and thus remain in the cache. All others + * are freed. + * + * @bdev: which device's cache entries to shrink + */ +void +mb_cache_shrink(struct block_device *bdev) +{ + LIST_HEAD(free_list); + struct list_head *l; + struct mb_cache_entry *ce, *tmp; + + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_bdev == bdev) { + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + list_add_tail(&ce->e_lru_list, &free_list); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + } + } + spin_unlock(&mb_cache_spinlock); + + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(ce, GFP_KERNEL); + } +} + + +/* + * mb_cache_destroy() + * + * Shrinks the cache to its minimum possible size (hopefully 0 entries), + * and then destroys it. If this was the last mbcache, un-registers the + * mbcache from kernel memory management. + */ +void +mb_cache_destroy(struct mb_cache *cache) +{ + LIST_HEAD(free_list); + struct mb_cache_entry *ce, *tmp; + + spin_lock(&mb_cache_spinlock); + list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { + if (ce->e_cache == cache) + list_move_tail(&ce->e_lru_list, &free_list); + } + list_del(&cache->c_cache_list); + spin_unlock(&mb_cache_spinlock); + + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + list_del_init(&ce->e_lru_list); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + __mb_cache_entry_forget(ce, GFP_KERNEL); + } + + if (atomic_read(&cache->c_entry_count) > 0) { + mb_error("cache %s: %d orphaned entries", + cache->c_name, + atomic_read(&cache->c_entry_count)); + } + + if (list_empty(&mb_cache_list)) { + kmem_cache_destroy(mb_cache_kmem_cache); + mb_cache_kmem_cache = NULL; + } + kfree(cache->c_index_hash); + kfree(cache->c_block_hash); + kfree(cache); +} + +/* + * mb_cache_entry_alloc() + * + * Allocates a new cache entry. The new entry will not be valid initially, + * and thus cannot be looked up yet. It should be filled with data, and + * then inserted into the cache using mb_cache_entry_insert(). Returns NULL + * if no more memory was available. + */ +struct mb_cache_entry * +mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) +{ + struct mb_cache_entry *ce; + + if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + struct list_head *l; + + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_cache == cache) { + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the + * entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + mb_assert(list_empty(&ce->e_lru_list)); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + goto found; + } + } + spin_unlock(&mb_cache_spinlock); + } + + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); + if (!ce) + return NULL; + atomic_inc(&cache->c_entry_count); + INIT_LIST_HEAD(&ce->e_lru_list); + INIT_HLIST_BL_NODE(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_index.o_list); + ce->e_cache = cache; + ce->e_queued = 0; + atomic_set(&ce->e_refcnt, 0); +found: + ce->e_block_hash_p = &cache->c_block_hash[0]; + ce->e_index_hash_p = &cache->c_index_hash[0]; + ce->e_used = 1 + MB_CACHE_WRITER; + return ce; +} + + +/* + * mb_cache_entry_insert() + * + * Inserts an entry that was allocated using mb_cache_entry_alloc() into + * the cache. After this, the cache entry can be looked up, but is not yet + * in the lru list as the caller still holds a handle to it. Returns 0 on + * success, or -EBUSY if a cache entry for that device + inode exists + * already (this may happen after a failed lookup, but when another process + * has inserted the same cache entry in the meantime). + * + * @bdev: device the cache entry belongs to + * @block: block number + * @key: lookup key + */ +int +mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, + sector_t block, unsigned int key) +{ + struct mb_cache *cache = ce->e_cache; + unsigned int bucket; + struct hlist_bl_node *l; + struct hlist_bl_head *block_hash_p; + struct hlist_bl_head *index_hash_p; + struct mb_cache_entry *lce; + + mb_assert(ce); + bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), + cache->c_bucket_bits); + block_hash_p = &cache->c_block_hash[bucket]; + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { + if (lce->e_bdev == bdev && lce->e_block == block) { + hlist_bl_unlock(block_hash_p); + return -EBUSY; + } + } + mb_assert(!__mb_cache_entry_is_block_hashed(ce)); + __mb_cache_entry_unhash_block(ce); + __mb_cache_entry_unhash_index(ce); + ce->e_bdev = bdev; + ce->e_block = block; + ce->e_block_hash_p = block_hash_p; + ce->e_index.o_key = key; + hlist_bl_add_head(&ce->e_block_list, block_hash_p); + hlist_bl_unlock(block_hash_p); + bucket = hash_long(key, cache->c_bucket_bits); + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + ce->e_index_hash_p = index_hash_p; + hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); + hlist_bl_unlock(index_hash_p); + return 0; +} + + +/* + * mb_cache_entry_release() + * + * Release a handle to a cache entry. When the last handle to a cache entry + * is released it is either freed (if it is invalid) or otherwise inserted + * in to the lru list. + */ +void +mb_cache_entry_release(struct mb_cache_entry *ce) +{ + __mb_cache_entry_release(ce); +} + + +/* + * mb_cache_entry_free() + * + */ +void +mb_cache_entry_free(struct mb_cache_entry *ce) +{ + mb_assert(ce); + mb_assert(list_empty(&ce->e_lru_list)); + hlist_bl_lock(ce->e_index_hash_p); + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_lock(ce->e_block_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + __mb_cache_entry_release(ce); +} + + +/* + * mb_cache_entry_get() + * + * Get a cache entry by device / block number. (There can only be one entry + * in the cache per device and block.) Returns NULL if no such cache entry + * exists. The returned cache entry is locked for exclusive access ("single + * writer"). + */ +struct mb_cache_entry * +mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, + sector_t block) +{ + unsigned int bucket; + struct hlist_bl_node *l; + struct mb_cache_entry *ce; + struct hlist_bl_head *block_hash_p; + + bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), + cache->c_bucket_bits); + block_hash_p = &cache->c_block_hash[bucket]; + /* First serialize access to the block corresponding hash chain. */ + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { + mb_assert(ce->e_block_hash_p == block_hash_p); + if (ce->e_bdev == bdev && ce->e_block == block) { + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(block_hash_p); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + if (ce->e_used > 0) { + DEFINE_WAIT(wait); + while (ce->e_used > 0) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + } + ce->e_used += 1 + MB_CACHE_WRITER; + __spin_unlock_mb_cache_entry(ce); + + if (!list_empty(&ce->e_lru_list)) { + spin_lock(&mb_cache_spinlock); + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + } + if (!__mb_cache_entry_is_block_hashed(ce)) { + __mb_cache_entry_release(ce); + return NULL; + } + return ce; + } + } + hlist_bl_unlock(block_hash_p); + return NULL; +} + +#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) + +static struct mb_cache_entry * +__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, + struct block_device *bdev, unsigned int key) +{ + + /* The index hash chain is alredy acquire by caller. */ + while (l != NULL) { + struct mb_cache_entry *ce = + hlist_bl_entry(l, struct mb_cache_entry, + e_index.o_list); + mb_assert(ce->e_index_hash_p == head); + if (ce->e_bdev == bdev && ce->e_index.o_key == key) { + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(head); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + ce->e_used++; + /* Incrementing before holding the lock gives readers + priority over writers. */ + if (ce->e_used >= MB_CACHE_WRITER) { + DEFINE_WAIT(wait); + + while (ce->e_used >= MB_CACHE_WRITER) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + } + __spin_unlock_mb_cache_entry(ce); + if (!list_empty(&ce->e_lru_list)) { + spin_lock(&mb_cache_spinlock); + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + } + if (!__mb_cache_entry_is_block_hashed(ce)) { + __mb_cache_entry_release(ce); + return ERR_PTR(-EAGAIN); + } + return ce; + } + l = l->next; + } + hlist_bl_unlock(head); + return NULL; +} + + +/* + * mb_cache_entry_find_first() + * + * Find the first cache entry on a given device with a certain key in + * an additional index. Additional matches can be found with + * mb_cache_entry_find_next(). Returns NULL if no match was found. The + * returned cache entry is locked for shared access ("multiple readers"). + * + * @cache: the cache to search + * @bdev: the device the cache entry should belong to + * @key: the key in the index + */ +struct mb_cache_entry * +mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, + unsigned int key) +{ + unsigned int bucket = hash_long(key, cache->c_bucket_bits); + struct hlist_bl_node *l; + struct mb_cache_entry *ce = NULL; + struct hlist_bl_head *index_hash_p; + + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + if (!hlist_bl_empty(index_hash_p)) { + l = hlist_bl_first(index_hash_p); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + } else + hlist_bl_unlock(index_hash_p); + return ce; +} + + +/* + * mb_cache_entry_find_next() + * + * Find the next cache entry on a given device with a certain key in an + * additional index. Returns NULL if no match could be found. The previous + * entry is atomatically released, so that mb_cache_entry_find_next() can + * be called like this: + * + * entry = mb_cache_entry_find_first(); + * while (entry) { + * ... + * entry = mb_cache_entry_find_next(entry, ...); + * } + * + * @prev: The previous match + * @bdev: the device the cache entry should belong to + * @key: the key in the index + */ +struct mb_cache_entry * +mb_cache_entry_find_next(struct mb_cache_entry *prev, + struct block_device *bdev, unsigned int key) +{ + struct mb_cache *cache = prev->e_cache; + unsigned int bucket = hash_long(key, cache->c_bucket_bits); + struct hlist_bl_node *l; + struct mb_cache_entry *ce; + struct hlist_bl_head *index_hash_p; + + index_hash_p = &cache->c_index_hash[bucket]; + mb_assert(prev->e_index_hash_p == index_hash_p); + hlist_bl_lock(index_hash_p); + mb_assert(!hlist_bl_empty(index_hash_p)); + l = prev->e_index.o_list.next; + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + __mb_cache_entry_release(prev); + return ce; +} + +#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ + +static int __init init_mbcache(void) +{ + register_shrinker(&mb_cache_shrinker); + return 0; +} + +static void __exit exit_mbcache(void) +{ + unregister_shrinker(&mb_cache_shrinker); +} + +module_init(init_mbcache) +module_exit(exit_mbcache) + diff --git a/kernel/fs/minix/Kconfig b/kernel/fs/minix/Kconfig new file mode 100644 index 000000000..f2a0cfcef --- /dev/null +++ b/kernel/fs/minix/Kconfig @@ -0,0 +1,25 @@ +config MINIX_FS + tristate "Minix file system support" + depends on BLOCK + help + Minix is a simple operating system used in many classes about OS's. + The minix file system (method to organize files on a hard disk + partition or a floppy disk) was the original file system for Linux, + but has been superseded by the second extended file system ext2fs. + You don't want to use the minix file system on your hard disk + because of certain built-in restrictions, but it is sometimes found + on older Linux floppy disks. This option will enlarge your kernel + by about 28 KB. If unsure, say N. + + To compile this file system support as a module, choose M here: the + module will be called minix. Note that the file system of your root + partition (the one containing the directory /) cannot be compiled as + a module. + +config MINIX_FS_NATIVE_ENDIAN + def_bool MINIX_FS + depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + +config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED + def_bool MINIX_FS + depends on M68K && MMU diff --git a/kernel/fs/minix/Makefile b/kernel/fs/minix/Makefile new file mode 100644 index 000000000..3063015ab --- /dev/null +++ b/kernel/fs/minix/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux minix filesystem routines. +# + +obj-$(CONFIG_MINIX_FS) += minix.o + +minix-objs := bitmap.o itree_v1.o itree_v2.o namei.o inode.o file.o dir.o diff --git a/kernel/fs/minix/bitmap.c b/kernel/fs/minix/bitmap.c new file mode 100644 index 000000000..742942a98 --- /dev/null +++ b/kernel/fs/minix/bitmap.c @@ -0,0 +1,272 @@ +/* + * linux/fs/minix/bitmap.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * Modified for 680x0 by Hamish Macdonald + * Fixed for 680x0 by Andreas Schwab + */ + +/* bitmap.c contains the code that handles the inode and block bitmaps */ + +#include "minix.h" +#include +#include +#include + +static DEFINE_SPINLOCK(bitmap_lock); + +/* + * bitmap consists of blocks filled with 16bit words + * bit set == busy, bit clear == free + * endianness is a mess, but for counting zero bits it really doesn't matter... + */ +static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) +{ + __u32 sum = 0; + unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8); + + while (blocks--) { + unsigned words = blocksize / 2; + __u16 *p = (__u16 *)(*map++)->b_data; + while (words--) + sum += 16 - hweight16(*p++); + } + + return sum; +} + +void minix_free_block(struct inode *inode, unsigned long block) +{ + struct super_block *sb = inode->i_sb; + struct minix_sb_info *sbi = minix_sb(sb); + struct buffer_head *bh; + int k = sb->s_blocksize_bits + 3; + unsigned long bit, zone; + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("Trying to free block not in datazone\n"); + return; + } + zone = block - sbi->s_firstdatazone + 1; + bit = zone & ((1<>= k; + if (zone >= sbi->s_zmap_blocks) { + printk("minix_free_block: nonexistent bitmap buffer\n"); + return; + } + bh = sbi->s_zmap[zone]; + spin_lock(&bitmap_lock); + if (!minix_test_and_clear_bit(bit, bh->b_data)) + printk("minix_free_block (%s:%lu): bit already cleared\n", + sb->s_id, block); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + return; +} + +int minix_new_block(struct inode * inode) +{ + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + int bits_per_zone = 8 * inode->i_sb->s_blocksize; + int i; + + for (i = 0; i < sbi->s_zmap_blocks; i++) { + struct buffer_head *bh = sbi->s_zmap[i]; + int j; + + spin_lock(&bitmap_lock); + j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); + if (j < bits_per_zone) { + minix_set_bit(j, bh->b_data); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + j += i * bits_per_zone + sbi->s_firstdatazone-1; + if (j < sbi->s_firstdatazone || j >= sbi->s_nzones) + break; + return j; + } + spin_unlock(&bitmap_lock); + } + return 0; +} + +unsigned long minix_count_free_blocks(struct super_block *sb) +{ + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1; + + return (count_free(sbi->s_zmap, sb->s_blocksize, bits) + << sbi->s_log_zone_size); +} + +struct minix_inode * +minix_V1_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) +{ + int block; + struct minix_sb_info *sbi = minix_sb(sb); + struct minix_inode *p; + + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %ld is out of range\n", + sb->s_id, (long)ino); + return NULL; + } + ino--; + block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + + ino / MINIX_INODES_PER_BLOCK; + *bh = sb_bread(sb, block); + if (!*bh) { + printk("Unable to read inode block\n"); + return NULL; + } + p = (void *)(*bh)->b_data; + return p + ino % MINIX_INODES_PER_BLOCK; +} + +struct minix2_inode * +minix_V2_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) +{ + int block; + struct minix_sb_info *sbi = minix_sb(sb); + struct minix2_inode *p; + int minix2_inodes_per_block = sb->s_blocksize / sizeof(struct minix2_inode); + + *bh = NULL; + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %ld is out of range\n", + sb->s_id, (long)ino); + return NULL; + } + ino--; + block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + + ino / minix2_inodes_per_block; + *bh = sb_bread(sb, block); + if (!*bh) { + printk("Unable to read inode block\n"); + return NULL; + } + p = (void *)(*bh)->b_data; + return p + ino % minix2_inodes_per_block; +} + +/* Clear the link count and mode of a deleted inode on disk. */ + +static void minix_clear_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (INODE_VERSION(inode) == MINIX_V1) { + struct minix_inode *raw_inode; + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (raw_inode) { + raw_inode->i_nlinks = 0; + raw_inode->i_mode = 0; + } + } else { + struct minix2_inode *raw_inode; + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (raw_inode) { + raw_inode->i_nlinks = 0; + raw_inode->i_mode = 0; + } + } + if (bh) { + mark_buffer_dirty(bh); + brelse (bh); + } +} + +void minix_free_inode(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + struct buffer_head *bh; + int k = sb->s_blocksize_bits + 3; + unsigned long ino, bit; + + ino = inode->i_ino; + if (ino < 1 || ino > sbi->s_ninodes) { + printk("minix_free_inode: inode 0 or nonexistent inode\n"); + return; + } + bit = ino & ((1<>= k; + if (ino >= sbi->s_imap_blocks) { + printk("minix_free_inode: nonexistent imap in superblock\n"); + return; + } + + minix_clear_inode(inode); /* clear on-disk copy */ + + bh = sbi->s_imap[ino]; + spin_lock(&bitmap_lock); + if (!minix_test_and_clear_bit(bit, bh->b_data)) + printk("minix_free_inode: bit %lu already cleared\n", bit); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); +} + +struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) +{ + struct super_block *sb = dir->i_sb; + struct minix_sb_info *sbi = minix_sb(sb); + struct inode *inode = new_inode(sb); + struct buffer_head * bh; + int bits_per_zone = 8 * sb->s_blocksize; + unsigned long j; + int i; + + if (!inode) { + *error = -ENOMEM; + return NULL; + } + j = bits_per_zone; + bh = NULL; + *error = -ENOSPC; + spin_lock(&bitmap_lock); + for (i = 0; i < sbi->s_imap_blocks; i++) { + bh = sbi->s_imap[i]; + j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); + if (j < bits_per_zone) + break; + } + if (!bh || j >= bits_per_zone) { + spin_unlock(&bitmap_lock); + iput(inode); + return NULL; + } + if (minix_test_and_set_bit(j, bh->b_data)) { /* shouldn't happen */ + spin_unlock(&bitmap_lock); + printk("minix_new_inode: bit already set\n"); + iput(inode); + return NULL; + } + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + j += i * bits_per_zone; + if (!j || j > sbi->s_ninodes) { + iput(inode); + return NULL; + } + inode_init_owner(inode, dir, mode); + inode->i_ino = j; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); + insert_inode_hash(inode); + mark_inode_dirty(inode); + + *error = 0; + return inode; +} + +unsigned long minix_count_free_inodes(struct super_block *sb) +{ + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_ninodes + 1; + + return count_free(sbi->s_imap, sb->s_blocksize, bits); +} diff --git a/kernel/fs/minix/dir.c b/kernel/fs/minix/dir.c new file mode 100644 index 000000000..118e4e7bc --- /dev/null +++ b/kernel/fs/minix/dir.c @@ -0,0 +1,473 @@ +/* + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * minix directory handling functions + * + * Updated to filesystem version 3 by Daniel Aragones + */ + +#include "minix.h" +#include +#include +#include + +typedef struct minix_dir_entry minix_dirent; +typedef struct minix3_dir_entry minix3_dirent; + +static int minix_readdir(struct file *, struct dir_context *); + +const struct file_operations minix_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = minix_readdir, + .fsync = generic_file_fsync, +}; + +static inline void dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned +minix_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = PAGE_CACHE_SIZE; + + if (page_nr == (inode->i_size >> PAGE_CACHE_SHIFT)) + last_byte = inode->i_size & (PAGE_CACHE_SIZE - 1); + return last_byte; +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static struct page * dir_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) + kmap(page); + return page; +} + +static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) +{ + return (void*)((char*)de + sbi->s_dirsize); +} + +static int minix_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct minix_sb_info *sbi = minix_sb(sb); + unsigned chunk_size = sbi->s_dirsize; + unsigned long npages = dir_pages(inode); + unsigned long pos = ctx->pos; + unsigned offset; + unsigned long n; + + ctx->pos = pos = ALIGN(pos, chunk_size); + if (pos >= inode->i_size) + return 0; + + offset = pos & ~PAGE_CACHE_MASK; + n = pos >> PAGE_CACHE_SHIFT; + + for ( ; n < npages; n++, offset = 0) { + char *p, *kaddr, *limit; + struct page *page = dir_get_page(inode, n); + + if (IS_ERR(page)) + continue; + kaddr = (char *)page_address(page); + p = kaddr+offset; + limit = kaddr + minix_last_byte(inode, n) - chunk_size; + for ( ; p <= limit; p = minix_next_entry(p, sbi)) { + const char *name; + __u32 inumber; + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + name = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + name = de->name; + inumber = de->inode; + } + if (inumber) { + unsigned l = strnlen(name, sbi->s_namelen); + if (!dir_emit(ctx, name, l, + inumber, DT_UNKNOWN)) { + dir_put_page(page); + return 0; + } + } + ctx->pos += chunk_size; + } + dir_put_page(page); + } + return 0; +} + +static inline int namecompare(int len, int maxlen, + const char * name, const char * buffer) +{ + if (len < maxlen && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +/* + * minix_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + */ +minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) +{ + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct inode * dir = d_inode(dentry->d_parent); + struct super_block * sb = dir->i_sb; + struct minix_sb_info * sbi = minix_sb(sb); + unsigned long n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + char *p; + + char *namx; + __u32 inumber; + *res_page = NULL; + + for (n = 0; n < npages; n++) { + char *kaddr, *limit; + + page = dir_get_page(dir, n); + if (IS_ERR(page)) + continue; + + kaddr = (char*)page_address(page); + limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + namx = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + namx = de->name; + inumber = de->inode; + } + if (!inumber) + continue; + if (namecompare(namelen, sbi->s_namelen, name, namx)) + goto found; + } + dir_put_page(page); + } + return NULL; + +found: + *res_page = page; + return (minix_dirent *)p; +} + +int minix_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct super_block * sb = dir->i_sb; + struct minix_sb_info * sbi = minix_sb(sb); + struct page *page = NULL; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr, *p; + minix_dirent *de; + minix3_dirent *de3; + loff_t pos; + int err; + char *namx = NULL; + __u32 inumber; + + /* + * We take care of directory expansion in the same loop + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *limit, *dir_end; + + page = dir_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = (char*)page_address(page); + dir_end = kaddr + minix_last_byte(dir, n); + limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + de = (minix_dirent *)p; + de3 = (minix3_dirent *)p; + if (sbi->s_version == MINIX_V3) { + namx = de3->name; + inumber = de3->inode; + } else { + namx = de->name; + inumber = de->inode; + } + if (p == dir_end) { + /* We hit i_size */ + if (sbi->s_version == MINIX_V3) + de3->inode = 0; + else + de->inode = 0; + goto got_it; + } + if (!inumber) + goto got_it; + err = -EEXIST; + if (namecompare(namelen, sbi->s_namelen, name, namx)) + goto out_unlock; + } + unlock_page(page); + dir_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + p - (char *)page_address(page); + err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + if (err) + goto out_unlock; + memcpy (namx, name, namelen); + if (sbi->s_version == MINIX_V3) { + memset (namx + namelen, 0, sbi->s_dirsize - namelen - 4); + de3->inode = inode->i_ino; + } else { + memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); + de->inode = inode->i_ino; + } + err = dir_commit_chunk(page, pos, sbi->s_dirsize); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +out_put: + dir_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +int minix_delete_entry(struct minix_dir_entry *de, struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr = page_address(page); + loff_t pos = page_offset(page) + (char*)de - kaddr; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + unsigned len = sbi->s_dirsize; + int err; + + lock_page(page); + err = minix_prepare_chunk(page, pos, len); + if (err == 0) { + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *) de)->inode = 0; + else + de->inode = 0; + err = dir_commit_chunk(page, pos, len); + } else { + unlock_page(page); + } + dir_put_page(page); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return err; +} + +int minix_make_empty(struct inode *inode, struct inode *dir) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + char *kaddr; + int err; + + if (!page) + return -ENOMEM; + err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize); + if (err) { + unlock_page(page); + goto fail; + } + + kaddr = kmap_atomic(page); + memset(kaddr, 0, PAGE_CACHE_SIZE); + + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)kaddr; + + de3->inode = inode->i_ino; + strcpy(de3->name, "."); + de3 = minix_next_entry(de3, sbi); + de3->inode = dir->i_ino; + strcpy(de3->name, ".."); + } else { + minix_dirent *de = (minix_dirent *)kaddr; + + de->inode = inode->i_ino; + strcpy(de->name, "."); + de = minix_next_entry(de, sbi); + de->inode = dir->i_ino; + strcpy(de->name, ".."); + } + kunmap_atomic(kaddr); + + err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int minix_empty_dir(struct inode * inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + char *name; + __u32 inumber; + + for (i = 0; i < npages; i++) { + char *p, *kaddr, *limit; + + page = dir_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = (char *)page_address(page); + limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + name = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + name = de->name; + inumber = de->inode; + } + + if (inumber != 0) { + /* check for . and .. */ + if (name[0] != '.') + goto not_empty; + if (!name[1]) { + if (inumber != inode->i_ino) + goto not_empty; + } else if (name[1] != '.') + goto not_empty; + else if (name[2]) + goto not_empty; + } + } + dir_put_page(page); + } + return 1; + +not_empty: + dir_put_page(page); + return 0; +} + +/* Releases the page */ +void minix_set_link(struct minix_dir_entry *de, struct page *page, + struct inode *inode) +{ + struct inode *dir = page->mapping->host; + struct minix_sb_info *sbi = minix_sb(dir->i_sb); + loff_t pos = page_offset(page) + + (char *)de-(char*)page_address(page); + int err; + + lock_page(page); + + err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + if (err == 0) { + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *) de)->inode = inode->i_ino; + else + de->inode = inode->i_ino; + err = dir_commit_chunk(page, pos, sbi->s_dirsize); + } else { + unlock_page(page); + } + dir_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + +struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = dir_get_page(dir, 0); + struct minix_sb_info *sbi = minix_sb(dir->i_sb); + struct minix_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = minix_next_entry(page_address(page), sbi); + *p = page; + } + return de; +} + +ino_t minix_inode_by_name(struct dentry *dentry) +{ + struct page *page; + struct minix_dir_entry *de = minix_find_entry(dentry, &page); + ino_t res = 0; + + if (de) { + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + + if (sbi->s_version == MINIX_V3) + res = ((minix3_dirent *) de)->inode; + else + res = de->inode; + dir_put_page(page); + } + return res; +} diff --git a/kernel/fs/minix/file.c b/kernel/fs/minix/file.c new file mode 100644 index 000000000..94f0eb9a6 --- /dev/null +++ b/kernel/fs/minix/file.c @@ -0,0 +1,51 @@ +/* + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * minix regular file handling primitives + */ + +#include "minix.h" + +/* + * We have mostly NULLs here: the current defaults are OK for + * the minix filesystem. + */ +const struct file_operations minix_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int minix_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + + truncate_setsize(inode, attr->ia_size); + minix_truncate(inode); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations minix_file_inode_operations = { + .setattr = minix_setattr, + .getattr = minix_getattr, +}; diff --git a/kernel/fs/minix/inode.c b/kernel/fs/minix/inode.c new file mode 100644 index 000000000..1182d1e26 --- /dev/null +++ b/kernel/fs/minix/inode.c @@ -0,0 +1,690 @@ +/* + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Copyright (C) 1996 Gertjan van Wingerde + * Minix V2 fs support. + * + * Modified for 680x0 by Andreas Schwab + * Updated to filesystem version 3 by Daniel Aragones + */ + +#include +#include "minix.h" +#include +#include +#include +#include +#include +#include + +static int minix_write_inode(struct inode *inode, + struct writeback_control *wbc); +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); +static int minix_remount (struct super_block * sb, int * flags, char * data); + +static void minix_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + if (!inode->i_nlink) { + inode->i_size = 0; + minix_truncate(inode); + } + invalidate_inode_buffers(inode); + clear_inode(inode); + if (!inode->i_nlink) + minix_free_inode(inode); +} + +static void minix_put_super(struct super_block *sb) +{ + int i; + struct minix_sb_info *sbi = minix_sb(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + sbi->s_ms->s_state = sbi->s_mount_state; + mark_buffer_dirty(sbi->s_sbh); + } + for (i = 0; i < sbi->s_imap_blocks; i++) + brelse(sbi->s_imap[i]); + for (i = 0; i < sbi->s_zmap_blocks; i++) + brelse(sbi->s_zmap[i]); + brelse (sbi->s_sbh); + kfree(sbi->s_imap); + sb->s_fs_info = NULL; + kfree(sbi); +} + +static struct kmem_cache * minix_inode_cachep; + +static struct inode *minix_alloc_inode(struct super_block *sb) +{ + struct minix_inode_info *ei; + ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void minix_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(minix_inode_cachep, minix_i(inode)); +} + +static void minix_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, minix_i_callback); +} + +static void init_once(void *foo) +{ + struct minix_inode_info *ei = (struct minix_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + minix_inode_cachep = kmem_cache_create("minix_inode_cache", + sizeof(struct minix_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (minix_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(minix_inode_cachep); +} + +static const struct super_operations minix_sops = { + .alloc_inode = minix_alloc_inode, + .destroy_inode = minix_destroy_inode, + .write_inode = minix_write_inode, + .evict_inode = minix_evict_inode, + .put_super = minix_put_super, + .statfs = minix_statfs, + .remount_fs = minix_remount, +}; + +static int minix_remount (struct super_block * sb, int * flags, char * data) +{ + struct minix_sb_info * sbi = minix_sb(sb); + struct minix_super_block * ms; + + sync_filesystem(sb); + ms = sbi->s_ms; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (*flags & MS_RDONLY) { + if (ms->s_state & MINIX_VALID_FS || + !(sbi->s_mount_state & MINIX_VALID_FS)) + return 0; + /* Mounting a rw partition read-only. */ + if (sbi->s_version != MINIX_V3) + ms->s_state = sbi->s_mount_state; + mark_buffer_dirty(sbi->s_sbh); + } else { + /* Mount a partition which is read-only, read-write. */ + if (sbi->s_version != MINIX_V3) { + sbi->s_mount_state = ms->s_state; + ms->s_state &= ~MINIX_VALID_FS; + } else { + sbi->s_mount_state = MINIX_VALID_FS; + } + mark_buffer_dirty(sbi->s_sbh); + + if (!(sbi->s_mount_state & MINIX_VALID_FS)) + printk("MINIX-fs warning: remounting unchecked fs, " + "running fsck is recommended\n"); + else if ((sbi->s_mount_state & MINIX_ERROR_FS)) + printk("MINIX-fs warning: remounting fs with errors, " + "running fsck is recommended\n"); + } + return 0; +} + +static int minix_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh; + struct buffer_head **map; + struct minix_super_block *ms; + struct minix3_super_block *m3s = NULL; + unsigned long i, block; + struct inode *root_inode; + struct minix_sb_info *sbi; + int ret = -EINVAL; + + sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + + BUILD_BUG_ON(32 != sizeof (struct minix_inode)); + BUILD_BUG_ON(64 != sizeof(struct minix2_inode)); + + if (!sb_set_blocksize(s, BLOCK_SIZE)) + goto out_bad_hblock; + + if (!(bh = sb_bread(s, 1))) + goto out_bad_sb; + + ms = (struct minix_super_block *) bh->b_data; + sbi->s_ms = ms; + sbi->s_sbh = bh; + sbi->s_mount_state = ms->s_state; + sbi->s_ninodes = ms->s_ninodes; + sbi->s_nzones = ms->s_nzones; + sbi->s_imap_blocks = ms->s_imap_blocks; + sbi->s_zmap_blocks = ms->s_zmap_blocks; + sbi->s_firstdatazone = ms->s_firstdatazone; + sbi->s_log_zone_size = ms->s_log_zone_size; + sbi->s_max_size = ms->s_max_size; + s->s_magic = ms->s_magic; + if (s->s_magic == MINIX_SUPER_MAGIC) { + sbi->s_version = MINIX_V1; + sbi->s_dirsize = 16; + sbi->s_namelen = 14; + s->s_max_links = MINIX_LINK_MAX; + } else if (s->s_magic == MINIX_SUPER_MAGIC2) { + sbi->s_version = MINIX_V1; + sbi->s_dirsize = 32; + sbi->s_namelen = 30; + s->s_max_links = MINIX_LINK_MAX; + } else if (s->s_magic == MINIX2_SUPER_MAGIC) { + sbi->s_version = MINIX_V2; + sbi->s_nzones = ms->s_zones; + sbi->s_dirsize = 16; + sbi->s_namelen = 14; + s->s_max_links = MINIX2_LINK_MAX; + } else if (s->s_magic == MINIX2_SUPER_MAGIC2) { + sbi->s_version = MINIX_V2; + sbi->s_nzones = ms->s_zones; + sbi->s_dirsize = 32; + sbi->s_namelen = 30; + s->s_max_links = MINIX2_LINK_MAX; + } else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) { + m3s = (struct minix3_super_block *) bh->b_data; + s->s_magic = m3s->s_magic; + sbi->s_imap_blocks = m3s->s_imap_blocks; + sbi->s_zmap_blocks = m3s->s_zmap_blocks; + sbi->s_firstdatazone = m3s->s_firstdatazone; + sbi->s_log_zone_size = m3s->s_log_zone_size; + sbi->s_max_size = m3s->s_max_size; + sbi->s_ninodes = m3s->s_ninodes; + sbi->s_nzones = m3s->s_zones; + sbi->s_dirsize = 64; + sbi->s_namelen = 60; + sbi->s_version = MINIX_V3; + sbi->s_mount_state = MINIX_VALID_FS; + sb_set_blocksize(s, m3s->s_blocksize); + s->s_max_links = MINIX2_LINK_MAX; + } else + goto out_no_fs; + + /* + * Allocate the buffer map to keep the superblock small. + */ + if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) + goto out_illegal_sb; + i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); + map = kzalloc(i, GFP_KERNEL); + if (!map) + goto out_no_map; + sbi->s_imap = &map[0]; + sbi->s_zmap = &map[sbi->s_imap_blocks]; + + block=2; + for (i=0 ; i < sbi->s_imap_blocks ; i++) { + if (!(sbi->s_imap[i]=sb_bread(s, block))) + goto out_no_bitmap; + block++; + } + for (i=0 ; i < sbi->s_zmap_blocks ; i++) { + if (!(sbi->s_zmap[i]=sb_bread(s, block))) + goto out_no_bitmap; + block++; + } + + minix_set_bit(0,sbi->s_imap[0]->b_data); + minix_set_bit(0,sbi->s_zmap[0]->b_data); + + /* Apparently minix can create filesystems that allocate more blocks for + * the bitmaps than needed. We simply ignore that, but verify it didn't + * create one with not enough blocks and bail out if so. + */ + block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); + if (sbi->s_imap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "imap blocks allocated. Refusing to mount.\n"); + goto out_no_bitmap; + } + + block = minix_blocks_needed( + (sbi->s_nzones - sbi->s_firstdatazone + 1), + s->s_blocksize); + if (sbi->s_zmap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "zmap blocks allocated. Refusing to mount.\n"); + goto out_no_bitmap; + } + + /* set up enough so that it can read an inode */ + s->s_op = &minix_sops; + root_inode = minix_iget(s, MINIX_ROOT_INO); + if (IS_ERR(root_inode)) { + ret = PTR_ERR(root_inode); + goto out_no_root; + } + + ret = -ENOMEM; + s->s_root = d_make_root(root_inode); + if (!s->s_root) + goto out_no_root; + + if (!(s->s_flags & MS_RDONLY)) { + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + ms->s_state &= ~MINIX_VALID_FS; + mark_buffer_dirty(bh); + } + if (!(sbi->s_mount_state & MINIX_VALID_FS)) + printk("MINIX-fs: mounting unchecked file system, " + "running fsck is recommended\n"); + else if (sbi->s_mount_state & MINIX_ERROR_FS) + printk("MINIX-fs: mounting file system with errors, " + "running fsck is recommended\n"); + + return 0; + +out_no_root: + if (!silent) + printk("MINIX-fs: get root inode failed\n"); + goto out_freemap; + +out_no_bitmap: + printk("MINIX-fs: bad superblock or unable to read bitmaps\n"); +out_freemap: + for (i = 0; i < sbi->s_imap_blocks; i++) + brelse(sbi->s_imap[i]); + for (i = 0; i < sbi->s_zmap_blocks; i++) + brelse(sbi->s_zmap[i]); + kfree(sbi->s_imap); + goto out_release; + +out_no_map: + ret = -ENOMEM; + if (!silent) + printk("MINIX-fs: can't allocate map\n"); + goto out_release; + +out_illegal_sb: + if (!silent) + printk("MINIX-fs: bad superblock\n"); + goto out_release; + +out_no_fs: + if (!silent) + printk("VFS: Can't find a Minix filesystem V1 | V2 | V3 " + "on device %s.\n", s->s_id); +out_release: + brelse(bh); + goto out; + +out_bad_hblock: + printk("MINIX-fs: blocksize too small for device\n"); + goto out; + +out_bad_sb: + printk("MINIX-fs: unable to read superblock\n"); +out: + s->s_fs_info = NULL; + kfree(sbi); + return ret; +} + +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct minix_sb_info *sbi = minix_sb(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; + buf->f_bfree = minix_count_free_blocks(sb); + buf->f_bavail = buf->f_bfree; + buf->f_files = sbi->s_ninodes; + buf->f_ffree = minix_count_free_inodes(sb); + buf->f_namelen = sbi->s_namelen; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static int minix_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + if (INODE_VERSION(inode) == MINIX_V1) + return V1_minix_get_block(inode, block, bh_result, create); + else + return V2_minix_get_block(inode, block, bh_result, create); +} + +static int minix_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, minix_get_block, wbc); +} + +static int minix_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,minix_get_block); +} + +int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, minix_get_block); +} + +static void minix_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + minix_truncate(inode); + } +} + +static int minix_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + minix_get_block); + if (unlikely(ret)) + minix_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t minix_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,minix_get_block); +} + +static const struct address_space_operations minix_aops = { + .readpage = minix_readpage, + .writepage = minix_writepage, + .write_begin = minix_write_begin, + .write_end = generic_write_end, + .bmap = minix_bmap +}; + +static const struct inode_operations minix_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = minix_getattr, +}; + +void minix_set_inode(struct inode *inode, dev_t rdev) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &minix_file_inode_operations; + inode->i_fop = &minix_file_operations; + inode->i_mapping->a_ops = &minix_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &minix_dir_inode_operations; + inode->i_fop = &minix_dir_operations; + inode->i_mapping->a_ops = &minix_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &minix_symlink_inode_operations; + inode->i_mapping->a_ops = &minix_aops; + } else + init_special_inode(inode, inode->i_mode, rdev); +} + +/* + * The minix V1 function to read an inode. + */ +static struct inode *V1_minix_iget(struct inode *inode) +{ + struct buffer_head * bh; + struct minix_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) { + iget_failed(inode); + return ERR_PTR(-EIO); + } + inode->i_mode = raw_inode->i_mode; + i_uid_write(inode, raw_inode->i_uid); + i_gid_write(inode, raw_inode->i_gid); + set_nlink(inode, raw_inode->i_nlinks); + inode->i_size = raw_inode->i_size; + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = 0; + for (i = 0; i < 9; i++) + minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; + minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +/* + * The minix V2 function to read an inode. + */ +static struct inode *V2_minix_iget(struct inode *inode) +{ + struct buffer_head * bh; + struct minix2_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) { + iget_failed(inode); + return ERR_PTR(-EIO); + } + inode->i_mode = raw_inode->i_mode; + i_uid_write(inode, raw_inode->i_uid); + i_gid_write(inode, raw_inode->i_gid); + set_nlink(inode, raw_inode->i_nlinks); + inode->i_size = raw_inode->i_size; + inode->i_mtime.tv_sec = raw_inode->i_mtime; + inode->i_atime.tv_sec = raw_inode->i_atime; + inode->i_ctime.tv_sec = raw_inode->i_ctime; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = 0; + for (i = 0; i < 10; i++) + minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; + minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +/* + * The global function to read an inode. + */ +struct inode *minix_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + if (INODE_VERSION(inode) == MINIX_V1) + return V1_minix_iget(inode); + else + return V2_minix_iget(inode); +} + +/* + * The minix V1 function to synchronize an inode. + */ +static struct buffer_head * V1_minix_update_inode(struct inode * inode) +{ + struct buffer_head * bh; + struct minix_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) + return NULL; + raw_inode->i_mode = inode->i_mode; + raw_inode->i_uid = fs_high2lowuid(i_uid_read(inode)); + raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode)); + raw_inode->i_nlinks = inode->i_nlink; + raw_inode->i_size = inode->i_size; + raw_inode->i_time = inode->i_mtime.tv_sec; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); + else for (i = 0; i < 9; i++) + raw_inode->i_zone[i] = minix_inode->u.i1_data[i]; + mark_buffer_dirty(bh); + return bh; +} + +/* + * The minix V2 function to synchronize an inode. + */ +static struct buffer_head * V2_minix_update_inode(struct inode * inode) +{ + struct buffer_head * bh; + struct minix2_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) + return NULL; + raw_inode->i_mode = inode->i_mode; + raw_inode->i_uid = fs_high2lowuid(i_uid_read(inode)); + raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode)); + raw_inode->i_nlinks = inode->i_nlink; + raw_inode->i_size = inode->i_size; + raw_inode->i_mtime = inode->i_mtime.tv_sec; + raw_inode->i_atime = inode->i_atime.tv_sec; + raw_inode->i_ctime = inode->i_ctime.tv_sec; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); + else for (i = 0; i < 10; i++) + raw_inode->i_zone[i] = minix_inode->u.i2_data[i]; + mark_buffer_dirty(bh); + return bh; +} + +static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err = 0; + struct buffer_head *bh; + + if (INODE_VERSION(inode) == MINIX_V1) + bh = V1_minix_update_inode(inode); + else + bh = V2_minix_update_inode(inode); + if (!bh) + return -EIO; + if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk("IO error syncing minix inode [%s:%08lx]\n", + inode->i_sb->s_id, inode->i_ino); + err = -EIO; + } + } + brelse (bh); + return err; +} + +int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct super_block *sb = dentry->d_sb; + generic_fillattr(d_inode(dentry), stat); + if (INODE_VERSION(d_inode(dentry)) == MINIX_V1) + stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); + else + stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); + stat->blksize = sb->s_blocksize; + return 0; +} + +/* + * The function that is called for file truncation. + */ +void minix_truncate(struct inode * inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) + return; + if (INODE_VERSION(inode) == MINIX_V1) + V1_minix_truncate(inode); + else + V2_minix_truncate(inode); +} + +static struct dentry *minix_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super); +} + +static struct file_system_type minix_fs_type = { + .owner = THIS_MODULE, + .name = "minix", + .mount = minix_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("minix"); + +static int __init init_minix_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&minix_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_minix_fs(void) +{ + unregister_filesystem(&minix_fs_type); + destroy_inodecache(); +} + +module_init(init_minix_fs) +module_exit(exit_minix_fs) +MODULE_LICENSE("GPL"); + diff --git a/kernel/fs/minix/itree_common.c b/kernel/fs/minix/itree_common.c new file mode 100644 index 000000000..a731cabf1 --- /dev/null +++ b/kernel/fs/minix/itree_common.c @@ -0,0 +1,364 @@ +/* Generic part */ + +typedef struct { + block_t *p; + block_t key; + struct buffer_head *bh; +} Indirect; + +static DEFINE_RWLOCK(pointers_lock); + +static inline void add_chain(Indirect *p, struct buffer_head *bh, block_t *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +static inline block_t *block_end(struct buffer_head *bh) +{ + return (block_t *)((char*)bh->b_data + bh->b_size); +} + +static inline Indirect *get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[DEPTH], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, i_data(inode) + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, block_to_cpu(p->key)); + if (!bh) + goto failure; + read_lock(&pointers_lock); + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (block_t *)bh->b_data + *++offsets); + read_unlock(&pointers_lock); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + read_unlock(&pointers_lock); + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +static int alloc_branch(struct inode *inode, + int num, + int *offsets, + Indirect *branch) +{ + int n = 0; + int i; + int parent = minix_new_block(inode); + + branch[0].key = cpu_to_block(parent); + if (parent) for (n = 1; n < num; n++) { + struct buffer_head *bh; + /* Allocate the next block */ + int nr = minix_new_block(inode); + if (!nr) + break; + branch[n].key = cpu_to_block(nr); + bh = sb_getblk(inode->i_sb, parent); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + branch[n].bh = bh; + branch[n].p = (block_t*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + parent = nr; + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < n; i++) + minix_free_block(inode, block_to_cpu(branch[i].key)); + return -ENOSPC; +} + +static inline int splice_branch(struct inode *inode, + Indirect chain[DEPTH], + Indirect *where, + int num) +{ + int i; + + write_lock(&pointers_lock); + + /* Verify that place we are splicing to is still there and vacant */ + if (!verify_chain(chain, where-1) || *where->p) + goto changed; + + *where->p = where->key; + + write_unlock(&pointers_lock); + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + + /* had we spliced it onto indirect block? */ + if (where->bh) + mark_buffer_dirty_inode(where->bh, inode); + + mark_inode_dirty(inode); + return 0; + +changed: + write_unlock(&pointers_lock); + for (i = 1; i < num; i++) + bforget(where[i].bh); + for (i = 0; i < num; i++) + minix_free_block(inode, block_to_cpu(where[i].key)); + return -EAGAIN; +} + +static inline int get_block(struct inode * inode, sector_t block, + struct buffer_head *bh, int create) +{ + int err = -EIO; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + int left; + int depth = block_to_path(inode, block, offsets); + + if (depth == 0) + goto out; + +reread: + partial = get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { +got_it: + map_bh(bh, inode->i_sb, block_to_cpu(chain[depth-1].key)); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + left = (chain + depth) - partial; + err = alloc_branch(inode, left, offsets+(partial-chain), partial); + if (err) + goto cleanup; + + if (splice_branch(inode, chain, partial, left) < 0) + goto changed; + + set_buffer_new(bh); + goto got_it; + +changed: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + goto reread; +} + +static inline int all_zeroes(block_t *p, block_t *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +static Indirect *find_shared(struct inode *inode, + int depth, + int offsets[DEPTH], + Indirect chain[DEPTH], + block_t *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = get_branch(inode, k, offsets, chain, &err); + + write_lock(&pointers_lock); + if (!partial) + partial = chain + k-1; + if (!partial->key && *partial->p) { + write_unlock(&pointers_lock); + goto no_top; + } + for (p=partial;p>chain && all_zeroes((block_t*)p->bh->b_data,p->p);p--) + ; + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&pointers_lock); + + while(partial > p) + { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +static inline void free_data(struct inode *inode, block_t *p, block_t *q) +{ + unsigned long nr; + + for ( ; p < q ; p++) { + nr = block_to_cpu(*p); + if (nr) { + *p = 0; + minix_free_block(inode, nr); + } + } +} + +static void free_branches(struct inode *inode, block_t *p, block_t *q, int depth) +{ + struct buffer_head * bh; + unsigned long nr; + + if (depth--) { + for ( ; p < q ; p++) { + nr = block_to_cpu(*p); + if (!nr) + continue; + *p = 0; + bh = sb_bread(inode->i_sb, nr); + if (!bh) + continue; + free_branches(inode, (block_t*)bh->b_data, + block_end(bh), depth); + bforget(bh); + minix_free_block(inode, nr); + mark_inode_dirty(inode); + } + } else + free_data(inode, p, q); +} + +static inline void truncate (struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + block_t *idata = i_data(inode); + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + block_t nr = 0; + int n; + int first_whole; + long iblock; + + iblock = (inode->i_size + sb->s_blocksize -1) >> sb->s_blocksize_bits; + block_truncate_page(inode->i_mapping, inode->i_size, get_block); + + n = block_to_path(inode, iblock, offsets); + if (!n) + return; + + if (n == 1) { + free_data(inode, idata+offsets[0], idata + DIRECT); + first_whole = 0; + goto do_indirects; + } + + first_whole = offsets[0] + 1 - DIRECT; + partial = find_shared(inode, n, offsets, chain, &nr); + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + mark_buffer_dirty_inode(partial->bh, inode); + free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + free_branches(inode, partial->p + 1, block_end(partial->bh), + (chain+n-1) - partial); + mark_buffer_dirty_inode(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + while (first_whole < DEPTH-1) { + nr = idata[DIRECT+first_whole]; + if (nr) { + idata[DIRECT+first_whole] = 0; + mark_inode_dirty(inode); + free_branches(inode, &nr, &nr+1, first_whole+1); + } + first_whole++; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +static inline unsigned nblocks(loff_t size, struct super_block *sb) +{ + int k = sb->s_blocksize_bits - 10; + unsigned blocks, res, direct = DIRECT, i = DEPTH; + blocks = (size + sb->s_blocksize - 1) >> (BLOCK_SIZE_BITS + k); + res = blocks; + while (--i && blocks > direct) { + blocks -= direct; + blocks += sb->s_blocksize/sizeof(block_t) - 1; + blocks /= sb->s_blocksize/sizeof(block_t); + res += blocks; + direct = 1; + } + return res; +} diff --git a/kernel/fs/minix/itree_v1.c b/kernel/fs/minix/itree_v1.c new file mode 100644 index 000000000..282e15ad8 --- /dev/null +++ b/kernel/fs/minix/itree_v1.c @@ -0,0 +1,67 @@ +#include +#include +#include "minix.h" + +enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ + +typedef u16 block_t; /* 16 bit, host order */ + +static inline unsigned long block_to_cpu(block_t n) +{ + return n; +} + +static inline block_t cpu_to_block(unsigned long n) +{ + return n; +} + +static inline block_t *i_data(struct inode *inode) +{ + return (block_t *)minix_i(inode)->u.i1_data; +} + +static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) +{ + int n = 0; + char b[BDEVNAME_SIZE]; + + if (block < 0) { + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", + block, bdevname(inode->i_sb->s_bdev, b)); + } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) { + if (printk_ratelimit()) + printk("MINIX-fs: block_to_path: " + "block %ld too big on dev %s\n", + block, bdevname(inode->i_sb->s_bdev, b)); + } else if (block < 7) { + offsets[n++] = block; + } else if ((block -= 7) < 512) { + offsets[n++] = 7; + offsets[n++] = block; + } else { + block -= 512; + offsets[n++] = 8; + offsets[n++] = block>>9; + offsets[n++] = block & 511; + } + return n; +} + +#include "itree_common.c" + +int V1_minix_get_block(struct inode * inode, long block, + struct buffer_head *bh_result, int create) +{ + return get_block(inode, block, bh_result, create); +} + +void V1_minix_truncate(struct inode * inode) +{ + truncate(inode); +} + +unsigned V1_minix_blocks(loff_t size, struct super_block *sb) +{ + return nblocks(size, sb); +} diff --git a/kernel/fs/minix/itree_v2.c b/kernel/fs/minix/itree_v2.c new file mode 100644 index 000000000..78e2d93e5 --- /dev/null +++ b/kernel/fs/minix/itree_v2.c @@ -0,0 +1,76 @@ +#include +#include "minix.h" + +enum {DIRECT = 7, DEPTH = 4}; /* Have triple indirect */ + +typedef u32 block_t; /* 32 bit, host order */ + +static inline unsigned long block_to_cpu(block_t n) +{ + return n; +} + +static inline block_t cpu_to_block(unsigned long n) +{ + return n; +} + +static inline block_t *i_data(struct inode *inode) +{ + return (block_t *)minix_i(inode)->u.i2_data; +} + +#define DIRCOUNT 7 +#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2)) + +static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) +{ + int n = 0; + char b[BDEVNAME_SIZE]; + struct super_block *sb = inode->i_sb; + + if (block < 0) { + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", + block, bdevname(sb->s_bdev, b)); + } else if ((u64)block * (u64)sb->s_blocksize >= + minix_sb(sb)->s_max_size) { + if (printk_ratelimit()) + printk("MINIX-fs: block_to_path: " + "block %ld too big on dev %s\n", + block, bdevname(sb->s_bdev, b)); + } else if (block < DIRCOUNT) { + offsets[n++] = block; + } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT; + offsets[n++] = block; + } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT + 1; + offsets[n++] = block / INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); + } else { + block -= INDIRCOUNT(sb) * INDIRCOUNT(sb); + offsets[n++] = DIRCOUNT + 2; + offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb); + offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); + } + return n; +} + +#include "itree_common.c" + +int V2_minix_get_block(struct inode * inode, long block, + struct buffer_head *bh_result, int create) +{ + return get_block(inode, block, bh_result, create); +} + +void V2_minix_truncate(struct inode * inode) +{ + truncate(inode); +} + +unsigned V2_minix_blocks(loff_t size, struct super_block *sb) +{ + return nblocks(size, sb); +} diff --git a/kernel/fs/minix/minix.h b/kernel/fs/minix/minix.h new file mode 100644 index 000000000..1ebd11854 --- /dev/null +++ b/kernel/fs/minix/minix.h @@ -0,0 +1,169 @@ +#ifndef FS_MINIX_H +#define FS_MINIX_H + +#include +#include +#include + +#define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version +#define MINIX_V1 0x0001 /* original minix fs */ +#define MINIX_V2 0x0002 /* minix V2 fs */ +#define MINIX_V3 0x0003 /* minix V3 fs */ + +/* + * minix fs inode data in memory + */ +struct minix_inode_info { + union { + __u16 i1_data[16]; + __u32 i2_data[16]; + } u; + struct inode vfs_inode; +}; + +/* + * minix super-block data in memory + */ +struct minix_sb_info { + unsigned long s_ninodes; + unsigned long s_nzones; + unsigned long s_imap_blocks; + unsigned long s_zmap_blocks; + unsigned long s_firstdatazone; + unsigned long s_log_zone_size; + unsigned long s_max_size; + int s_dirsize; + int s_namelen; + struct buffer_head ** s_imap; + struct buffer_head ** s_zmap; + struct buffer_head * s_sbh; + struct minix_super_block * s_ms; + unsigned short s_mount_state; + unsigned short s_version; +}; + +extern struct inode *minix_iget(struct super_block *, unsigned long); +extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); +extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); +extern struct inode * minix_new_inode(const struct inode *, umode_t, int *); +extern void minix_free_inode(struct inode * inode); +extern unsigned long minix_count_free_inodes(struct super_block *sb); +extern int minix_new_block(struct inode * inode); +extern void minix_free_block(struct inode *inode, unsigned long block); +extern unsigned long minix_count_free_blocks(struct super_block *sb); +extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +extern void V1_minix_truncate(struct inode *); +extern void V2_minix_truncate(struct inode *); +extern void minix_truncate(struct inode *); +extern void minix_set_inode(struct inode *, dev_t); +extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); +extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); +extern unsigned V1_minix_blocks(loff_t, struct super_block *); +extern unsigned V2_minix_blocks(loff_t, struct super_block *); + +extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**); +extern int minix_add_link(struct dentry*, struct inode*); +extern int minix_delete_entry(struct minix_dir_entry*, struct page*); +extern int minix_make_empty(struct inode*, struct inode*); +extern int minix_empty_dir(struct inode*); +extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); +extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); +extern ino_t minix_inode_by_name(struct dentry*); + +extern const struct inode_operations minix_file_inode_operations; +extern const struct inode_operations minix_dir_inode_operations; +extern const struct file_operations minix_file_operations; +extern const struct file_operations minix_dir_operations; + +static inline struct minix_sb_info *minix_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct minix_inode_info *minix_i(struct inode *inode) +{ + return list_entry(inode, struct minix_inode_info, vfs_inode); +} + +static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) +{ + return DIV_ROUND_UP(bits, blocksize * 8); +} + +#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ + defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) + +#error Minix file system byte order broken + +#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) + +/* + * big-endian 32 or 64 bit indexed bitmaps on big-endian system or + * little-endian bitmaps on little-endian system + */ + +#define minix_test_and_set_bit(nr, addr) \ + __test_and_set_bit((nr), (unsigned long *)(addr)) +#define minix_set_bit(nr, addr) \ + __set_bit((nr), (unsigned long *)(addr)) +#define minix_test_and_clear_bit(nr, addr) \ + __test_and_clear_bit((nr), (unsigned long *)(addr)) +#define minix_test_bit(nr, addr) \ + test_bit((nr), (unsigned long *)(addr)) +#define minix_find_first_zero_bit(addr, size) \ + find_first_zero_bit((unsigned long *)(addr), (size)) + +#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) + +/* + * big-endian 16bit indexed bitmaps + */ + +static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) +{ + const unsigned short *p = vaddr, *addr = vaddr; + unsigned short num; + + if (!size) + return 0; + + size >>= 4; + while (*p++ == 0xffff) { + if (--size == 0) + return (p - addr) << 4; + } + + num = *--p; + return ((p - addr) << 4) + ffz(num); +} + +#define minix_test_and_set_bit(nr, addr) \ + __test_and_set_bit((nr) ^ 16, (unsigned long *)(addr)) +#define minix_set_bit(nr, addr) \ + __set_bit((nr) ^ 16, (unsigned long *)(addr)) +#define minix_test_and_clear_bit(nr, addr) \ + __test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr)) + +static inline int minix_test_bit(int nr, const void *vaddr) +{ + const unsigned short *p = vaddr; + return (p[nr >> 4] & (1U << (nr & 15))) != 0; +} + +#else + +/* + * little-endian bitmaps + */ + +#define minix_test_and_set_bit __test_and_set_bit_le +#define minix_set_bit __set_bit_le +#define minix_test_and_clear_bit __test_and_clear_bit_le +#define minix_test_bit test_bit_le +#define minix_find_first_zero_bit find_first_zero_bit_le + +#endif + +#endif /* FS_MINIX_H */ diff --git a/kernel/fs/minix/namei.c b/kernel/fs/minix/namei.c new file mode 100644 index 000000000..a795a11e5 --- /dev/null +++ b/kernel/fs/minix/namei.c @@ -0,0 +1,270 @@ +/* + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include "minix.h" + +static int add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = minix_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) + return ERR_PTR(-ENAMETOOLONG); + + ino = minix_inode_by_name(dentry); + if (ino) { + inode = minix_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + int error; + struct inode *inode; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + inode = minix_new_inode(dir, mode, &error); + + if (inode) { + minix_set_inode(inode, rdev); + mark_inode_dirty(inode); + error = add_nondir(dentry, inode); + } + return error; +} + +static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int error; + struct inode *inode = minix_new_inode(dir, mode, &error); + if (inode) { + minix_set_inode(inode, 0); + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + } + return error; +} + +static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return minix_mknod(dir, dentry, mode, 0); +} + +static int minix_symlink(struct inode * dir, struct dentry *dentry, + const char * symname) +{ + int err = -ENAMETOOLONG; + int i = strlen(symname)+1; + struct inode * inode; + + if (i > dir->i_sb->s_blocksize) + goto out; + + inode = minix_new_inode(dir, S_IFLNK | 0777, &err); + if (!inode) + goto out; + + minix_set_inode(inode, 0); + err = page_symlink(inode, symname, i); + if (err) + goto out_fail; + + err = add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int minix_link(struct dentry * old_dentry, struct inode * dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + return add_nondir(dentry, inode); +} + +static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) +{ + struct inode * inode; + int err; + + inode_inc_link_count(dir); + + inode = minix_new_inode(dir, S_IFDIR | mode, &err); + if (!inode) + goto out_dir; + + minix_set_inode(inode, 0); + + inode_inc_link_count(inode); + + err = minix_make_empty(inode, dir); + if (err) + goto out_fail; + + err = minix_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int minix_unlink(struct inode * dir, struct dentry *dentry) +{ + int err = -ENOENT; + struct inode * inode = d_inode(dentry); + struct page * page; + struct minix_dir_entry * de; + + de = minix_find_entry(dentry, &page); + if (!de) + goto end_unlink; + + err = minix_delete_entry(de, page); + if (err) + goto end_unlink; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); +end_unlink: + return err; +} + +static int minix_rmdir(struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = d_inode(dentry); + int err = -ENOTEMPTY; + + if (minix_empty_dir(inode)) { + err = minix_unlink(dir, dentry); + if (!err) { + inode_dec_link_count(dir); + inode_dec_link_count(inode); + } + } + return err; +} + +static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir, struct dentry *new_dentry) +{ + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); + struct page * dir_page = NULL; + struct minix_dir_entry * dir_de = NULL; + struct page * old_page; + struct minix_dir_entry * old_de; + int err = -ENOENT; + + old_de = minix_find_entry(old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = minix_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page * new_page; + struct minix_dir_entry * new_de; + + err = -ENOTEMPTY; + if (dir_de && !minix_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = minix_find_entry(new_dentry, &new_page); + if (!new_de) + goto out_dir; + minix_set_link(new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + err = minix_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + minix_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + minix_set_link(dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations minix_dir_inode_operations = { + .create = minix_create, + .lookup = minix_lookup, + .link = minix_link, + .unlink = minix_unlink, + .symlink = minix_symlink, + .mkdir = minix_mkdir, + .rmdir = minix_rmdir, + .mknod = minix_mknod, + .rename = minix_rename, + .getattr = minix_getattr, + .tmpfile = minix_tmpfile, +}; diff --git a/kernel/fs/mount.h b/kernel/fs/mount.h new file mode 100644 index 000000000..6a61c2b3e --- /dev/null +++ b/kernel/fs/mount.h @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include + +struct mnt_namespace { + atomic_t count; + struct ns_common ns; + struct mount * root; + struct list_head list; + struct user_namespace *user_ns; + u64 seq; /* Sequence number to prevent loops */ + wait_queue_head_t poll; + u64 event; +}; + +struct mnt_pcp { + int mnt_count; + int mnt_writers; +}; + +struct mountpoint { + struct hlist_node m_hash; + struct dentry *m_dentry; + struct hlist_head m_list; + int m_count; +}; + +struct mount { + struct hlist_node mnt_hash; + struct mount *mnt_parent; + struct dentry *mnt_mountpoint; + struct vfsmount mnt; + union { + struct rcu_head mnt_rcu; + struct llist_node mnt_llist; + }; +#ifdef CONFIG_SMP + struct mnt_pcp __percpu *mnt_pcp; +#else + int mnt_count; + int mnt_writers; +#endif + struct list_head mnt_mounts; /* list of children, anchored here */ + struct list_head mnt_child; /* and going through their mnt_child */ + struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ + struct list_head mnt_list; + struct list_head mnt_expire; /* link in fs-specific expiry list */ + struct list_head mnt_share; /* circular list of shared mounts */ + struct list_head mnt_slave_list;/* list of slave mounts */ + struct list_head mnt_slave; /* slave list entry */ + struct mount *mnt_master; /* slave is on master->mnt_slave_list */ + struct mnt_namespace *mnt_ns; /* containing namespace */ + struct mountpoint *mnt_mp; /* where is it mounted */ + struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ +#ifdef CONFIG_FSNOTIFY + struct hlist_head mnt_fsnotify_marks; + __u32 mnt_fsnotify_mask; +#endif + int mnt_id; /* mount identifier */ + int mnt_group_id; /* peer group identifier */ + int mnt_expiry_mark; /* true if marked for expiry */ + struct hlist_head mnt_pins; + struct fs_pin mnt_umount; + struct dentry *mnt_ex_mountpoint; +}; + +#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ + +static inline struct mount *real_mount(struct vfsmount *mnt) +{ + return container_of(mnt, struct mount, mnt); +} + +static inline int mnt_has_parent(struct mount *mnt) +{ + return mnt != mnt->mnt_parent; +} + +static inline int is_mounted(struct vfsmount *mnt) +{ + /* neither detached nor internal? */ + return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); +} + +extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); +extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); + +extern bool legitimize_mnt(struct vfsmount *, unsigned); + +extern void __detach_mounts(struct dentry *dentry); + +static inline void detach_mounts(struct dentry *dentry) +{ + if (!d_mountpoint(dentry)) + return; + __detach_mounts(dentry); +} + +static inline void get_mnt_ns(struct mnt_namespace *ns) +{ + atomic_inc(&ns->count); +} + +extern seqlock_t mount_lock; + +static inline void lock_mount_hash(void) +{ + write_seqlock(&mount_lock); +} + +static inline void unlock_mount_hash(void) +{ + write_sequnlock(&mount_lock); +} + +struct proc_mounts { + struct seq_file m; + struct mnt_namespace *ns; + struct path root; + int (*show)(struct seq_file *, struct vfsmount *); + void *cached_mount; + u64 cached_event; + loff_t cached_index; +}; + +#define proc_mounts(p) (container_of((p), struct proc_mounts, m)) + +extern const struct seq_operations mounts_op; + +extern bool __is_local_mountpoint(struct dentry *dentry); +static inline bool is_local_mountpoint(struct dentry *dentry) +{ + if (!d_mountpoint(dentry)) + return false; + + return __is_local_mountpoint(dentry); +} diff --git a/kernel/fs/mpage.c b/kernel/fs/mpage.c new file mode 100644 index 000000000..3e79220ba --- /dev/null +++ b/kernel/fs/mpage.c @@ -0,0 +1,717 @@ +/* + * fs/mpage.c + * + * Copyright (C) 2002, Linus Torvalds. + * + * Contains functions related to preparing and submitting BIOs which contain + * multiple pagecache pages. + * + * 15May2002 Andrew Morton + * Initial version + * 27Jun2002 axboe@suse.de + * use bio_add_page() to build bio's just the right size + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * I/O completion handler for multipage BIOs. + * + * The mpage code never puts partial pages into a BIO (except for end-of-file). + * If a page does not map to a contiguous run of blocks then it simply falls + * back to block_read_full_page(). + * + * Why is this? If a page's completion depends on a number of different BIOs + * which can complete in any order (or at the same time) then determining the + * status of that page is hard. See end_buffer_async_read() for the details. + * There is no point in duplicating all that complexity. + */ +static void mpage_end_io(struct bio *bio, int err) +{ + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + page_endio(page, bio_data_dir(bio), err); + } + + bio_put(bio); +} + +static struct bio *mpage_bio_submit(int rw, struct bio *bio) +{ + bio->bi_end_io = mpage_end_io; + guard_bio_eod(rw, bio); + submit_bio(rw, bio); + return NULL; +} + +static struct bio * +mpage_alloc(struct block_device *bdev, + sector_t first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = first_sector; + } + return bio; +} + +/* + * support function for mpage_readpages. The fs supplied get_block might + * return an up to date buffer. This is used to map that buffer into + * the page, which allows readpage to avoid triggering a duplicate call + * to get_block. + * + * The idea is to avoid adding buffers to pages that don't already have + * them. So when the buffer is up to date and the page size == block size, + * this marks the page up to date instead of adding new buffers. + */ +static void +map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *page_bh, *head; + int block = 0; + + if (!page_has_buffers(page)) { + /* + * don't make any buffers if there is only one buffer on + * the page and the page just needs to be set up to date + */ + if (inode->i_blkbits == PAGE_CACHE_SHIFT && + buffer_uptodate(bh)) { + SetPageUptodate(page); + return; + } + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + } + head = page_buffers(page); + page_bh = head; + do { + if (block == page_block) { + page_bh->b_state = bh->b_state; + page_bh->b_bdev = bh->b_bdev; + page_bh->b_blocknr = bh->b_blocknr; + break; + } + page_bh = page_bh->b_this_page; + block++; + } while (page_bh != head); +} + +/* + * This is the worker routine which does all the work of mapping the disk + * blocks and constructs largest possible bios, submits them for IO if the + * blocks are not contiguous on the disk. + * + * We pass a buffer_head back and forth and use its buffer_mapped() flag to + * represent the validity of its disk mapping and to decide when to do the next + * get_block() call. + */ +static struct bio * +do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, + sector_t *last_block_in_bio, struct buffer_head *map_bh, + unsigned long *first_logical_block, get_block_t get_block) +{ + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + unsigned first_hole = blocks_per_page; + struct block_device *bdev = NULL; + int length; + int fully_mapped = 1; + unsigned nblocks; + unsigned relative_block; + + if (page_has_buffers(page)) + goto confused; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = block_in_file + nr_pages * blocks_per_page; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + page_block = 0; + + /* + * Map blocks using the result from the previous get_blocks call first. + */ + nblocks = map_bh->b_size >> blkbits; + if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && + block_in_file < (*first_logical_block + nblocks)) { + unsigned map_offset = block_in_file - *first_logical_block; + unsigned last = nblocks - map_offset; + + for (relative_block = 0; ; relative_block++) { + if (relative_block == last) { + clear_buffer_mapped(map_bh); + break; + } + if (page_block == blocks_per_page) + break; + blocks[page_block] = map_bh->b_blocknr + map_offset + + relative_block; + page_block++; + block_in_file++; + } + bdev = map_bh->b_bdev; + } + + /* + * Then do more get_blocks calls until we are done with this page. + */ + map_bh->b_page = page; + while (page_block < blocks_per_page) { + map_bh->b_state = 0; + map_bh->b_size = 0; + + if (block_in_file < last_block) { + map_bh->b_size = (last_block-block_in_file) << blkbits; + if (get_block(inode, block_in_file, map_bh, 0)) + goto confused; + *first_logical_block = block_in_file; + } + + if (!buffer_mapped(map_bh)) { + fully_mapped = 0; + if (first_hole == blocks_per_page) + first_hole = page_block; + page_block++; + block_in_file++; + continue; + } + + /* some filesystems will copy data into the page during + * the get_block call, in which case we don't want to + * read it again. map_buffer_to_page copies the data + * we just collected from get_block into the page's buffers + * so readpage doesn't have to repeat the get_block call + */ + if (buffer_uptodate(map_bh)) { + map_buffer_to_page(page, map_bh, page_block); + goto confused; + } + + if (first_hole != blocks_per_page) + goto confused; /* hole -> non-hole */ + + /* Contiguous blocks? */ + if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) + goto confused; + nblocks = map_bh->b_size >> blkbits; + for (relative_block = 0; ; relative_block++) { + if (relative_block == nblocks) { + clear_buffer_mapped(map_bh); + break; + } else if (page_block == blocks_per_page) + break; + blocks[page_block] = map_bh->b_blocknr+relative_block; + page_block++; + block_in_file++; + } + bdev = map_bh->b_bdev; + } + + if (first_hole != blocks_per_page) { + zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); + if (first_hole == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } + } else if (fully_mapped) { + SetPageMappedToDisk(page); + } + + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && + cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + + /* + * This page will go to BIO. Do we need to send this BIO off first? + */ + if (bio && (*last_block_in_bio != blocks[0] - 1)) + bio = mpage_bio_submit(READ, bio); + +alloc_new: + if (bio == NULL) { + if (first_hole == blocks_per_page) { + if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), + page)) + goto out; + } + bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), + min_t(int, nr_pages, bio_get_nr_vecs(bdev)), + GFP_KERNEL); + if (bio == NULL) + goto confused; + } + + length = first_hole << blkbits; + if (bio_add_page(bio, page, length, 0) < length) { + bio = mpage_bio_submit(READ, bio); + goto alloc_new; + } + + relative_block = block_in_file - *first_logical_block; + nblocks = map_bh->b_size >> blkbits; + if ((buffer_boundary(map_bh) && relative_block == nblocks) || + (first_hole != blocks_per_page)) + bio = mpage_bio_submit(READ, bio); + else + *last_block_in_bio = blocks[blocks_per_page - 1]; +out: + return bio; + +confused: + if (bio) + bio = mpage_bio_submit(READ, bio); + if (!PageUptodate(page)) + block_read_full_page(page, get_block); + else + unlock_page(page); + goto out; +} + +/** + * mpage_readpages - populate an address space with some pages & start reads against them + * @mapping: the address_space + * @pages: The address of a list_head which contains the target pages. These + * pages have their ->index populated and are otherwise uninitialised. + * The page at @pages->prev has the lowest file offset, and reads should be + * issued in @pages->prev to @pages->next order. + * @nr_pages: The number of pages at *@pages + * @get_block: The filesystem's block mapper function. + * + * This function walks the pages and the blocks within each page, building and + * emitting large BIOs. + * + * If anything unusual happens, such as: + * + * - encountering a page which has buffers + * - encountering a page which has a non-hole after a hole + * - encountering a page with non-contiguous blocks + * + * then this code just gives up and calls the buffer_head-based read function. + * It does handle a page which has holes at the end - that is a common case: + * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. + * + * BH_Boundary explanation: + * + * There is a problem. The mpage read code assembles several pages, gets all + * their disk mappings, and then submits them all. That's fine, but obtaining + * the disk mappings may require I/O. Reads of indirect blocks, for example. + * + * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be + * submitted in the following order: + * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 + * + * because the indirect block has to be read to get the mappings of blocks + * 13,14,15,16. Obviously, this impacts performance. + * + * So what we do it to allow the filesystem's get_block() function to set + * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block + * after this one will require I/O against a block which is probably close to + * this one. So you should push what I/O you have currently accumulated. + * + * This all causes the disk requests to be issued in the correct order. + */ +int +mpage_readpages(struct address_space *mapping, struct list_head *pages, + unsigned nr_pages, get_block_t get_block) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + struct buffer_head map_bh; + unsigned long first_logical_block = 0; + + map_bh.b_state = 0; + map_bh.b_size = 0; + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) { + bio = do_mpage_readpage(bio, page, + nr_pages - page_idx, + &last_block_in_bio, &map_bh, + &first_logical_block, + get_block); + } + page_cache_release(page); + } + BUG_ON(!list_empty(pages)); + if (bio) + mpage_bio_submit(READ, bio); + return 0; +} +EXPORT_SYMBOL(mpage_readpages); + +/* + * This isn't called much at all + */ +int mpage_readpage(struct page *page, get_block_t get_block) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + struct buffer_head map_bh; + unsigned long first_logical_block = 0; + + map_bh.b_state = 0; + map_bh.b_size = 0; + bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, + &map_bh, &first_logical_block, get_block); + if (bio) + mpage_bio_submit(READ, bio); + return 0; +} +EXPORT_SYMBOL(mpage_readpage); + +/* + * Writing is not so simple. + * + * If the page has buffers then they will be used for obtaining the disk + * mapping. We only support pages which are fully mapped-and-dirty, with a + * special case for pages which are unmapped at the end: end-of-file. + * + * If the page has no buffers (preferred) then the page is mapped here. + * + * If all blocks are found to be contiguous then the page can go into the + * BIO. Otherwise fall back to the mapping's writepage(). + * + * FIXME: This code wants an estimate of how many pages are still to be + * written, so it can intelligently allocate a suitably-sized BIO. For now, + * just allocate full-size (16-page) BIOs. + */ + +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + +/* + * We have our BIO, so we can now mark the buffers clean. Make + * sure to only clean buffers which we know we'll be writing. + */ +static void clean_buffers(struct page *page, unsigned first_unmapped) +{ + unsigned buffer_counter = 0; + struct buffer_head *bh, *head; + if (!page_has_buffers(page)) + return; + head = page_buffers(page); + bh = head; + + do { + if (buffer_counter++ == first_unmapped) + break; + clear_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* + * we cannot drop the bh if the page is not uptodate or a concurrent + * readpage would fail to serialize with the bh and it would read from + * disk before we reach the platter. + */ + if (buffer_heads_over_limit && PageUptodate(page)) + try_to_free_buffers(page); +} + +static int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct mpage_data *mpd = data; + struct bio *bio = mpd->bio; + struct address_space *mapping = page->mapping; + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + unsigned long end_index; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + sector_t last_block; + sector_t block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + unsigned first_unmapped = blocks_per_page; + struct block_device *bdev = NULL; + int boundary = 0; + sector_t boundary_block = 0; + struct block_device *boundary_bdev = NULL; + int length; + struct buffer_head map_bh; + loff_t i_size = i_size_read(inode); + int ret = 0; + + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + /* If they're all mapped and dirty, do it */ + page_block = 0; + do { + BUG_ON(buffer_locked(bh)); + if (!buffer_mapped(bh)) { + /* + * unmapped dirty buffers are created by + * __set_page_dirty_buffers -> mmapped data + */ + if (buffer_dirty(bh)) + goto confused; + if (first_unmapped == blocks_per_page) + first_unmapped = page_block; + continue; + } + + if (first_unmapped != blocks_per_page) + goto confused; /* hole -> non-hole */ + + if (!buffer_dirty(bh) || !buffer_uptodate(bh)) + goto confused; + if (page_block) { + if (bh->b_blocknr != blocks[page_block-1] + 1) + goto confused; + } + blocks[page_block++] = bh->b_blocknr; + boundary = buffer_boundary(bh); + if (boundary) { + boundary_block = bh->b_blocknr; + boundary_bdev = bh->b_bdev; + } + bdev = bh->b_bdev; + } while ((bh = bh->b_this_page) != head); + + if (first_unmapped) + goto page_is_mapped; + + /* + * Page has buffers, but they are all unmapped. The page was + * created by pagein or read over a hole which was handled by + * block_read_full_page(). If this address_space is also + * using mpage_readpages then this can rarely happen. + */ + goto confused; + } + + /* + * The page has no buffers: map it to disk + */ + BUG_ON(!PageUptodate(page)); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = (i_size - 1) >> blkbits; + map_bh.b_page = page; + for (page_block = 0; page_block < blocks_per_page; ) { + + map_bh.b_state = 0; + map_bh.b_size = 1 << blkbits; + if (mpd->get_block(inode, block_in_file, &map_bh, 1)) + goto confused; + if (buffer_new(&map_bh)) + unmap_underlying_metadata(map_bh.b_bdev, + map_bh.b_blocknr); + if (buffer_boundary(&map_bh)) { + boundary_block = map_bh.b_blocknr; + boundary_bdev = map_bh.b_bdev; + } + if (page_block) { + if (map_bh.b_blocknr != blocks[page_block-1] + 1) + goto confused; + } + blocks[page_block++] = map_bh.b_blocknr; + boundary = buffer_boundary(&map_bh); + bdev = map_bh.b_bdev; + if (block_in_file == last_block) + break; + block_in_file++; + } + BUG_ON(page_block == 0); + + first_unmapped = page_block; + +page_is_mapped: + end_index = i_size >> PAGE_CACHE_SHIFT; + if (page->index >= end_index) { + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining memory + * is zeroed when mapped, and writes to that region are not + * written out to the file." + */ + unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); + + if (page->index > end_index || !offset) + goto confused; + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + } + + /* + * This page will go to BIO. Do we need to send this BIO off first? + */ + if (bio && mpd->last_block_in_bio != blocks[0] - 1) + bio = mpage_bio_submit(WRITE, bio); + +alloc_new: + if (bio == NULL) { + if (first_unmapped == blocks_per_page) { + if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), + page, wbc)) { + clean_buffers(page, first_unmapped); + goto out; + } + } + bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), + bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); + if (bio == NULL) + goto confused; + } + + /* + * Must try to add the page before marking the buffer clean or + * the confused fail path above (OOM) will be very confused when + * it finds all bh marked clean (i.e. it will not write anything) + */ + length = first_unmapped << blkbits; + if (bio_add_page(bio, page, length, 0) < length) { + bio = mpage_bio_submit(WRITE, bio); + goto alloc_new; + } + + clean_buffers(page, first_unmapped); + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + if (boundary || (first_unmapped != blocks_per_page)) { + bio = mpage_bio_submit(WRITE, bio); + if (boundary_block) { + write_boundary_block(boundary_bdev, + boundary_block, 1 << blkbits); + } + } else { + mpd->last_block_in_bio = blocks[blocks_per_page - 1]; + } + goto out; + +confused: + if (bio) + bio = mpage_bio_submit(WRITE, bio); + + if (mpd->use_writepage) { + ret = mapping->a_ops->writepage(page, wbc); + } else { + ret = -EAGAIN; + goto out; + } + /* + * The caller has a ref on the inode, so *mapping is stable + */ + mapping_set_error(mapping, ret); +out: + mpd->bio = bio; + return ret; +} + +/** + * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function. + * If this is NULL then use a_ops->writepage. Otherwise, go + * direct-to-BIO. + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * If a page is already under I/O, generic_writepages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +int +mpage_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block) +{ + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + + if (!get_block) + ret = generic_writepages(mapping, wbc); + else { + struct mpage_data mpd = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = get_block, + .use_writepage = 1, + }; + + ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); + if (mpd.bio) + mpage_bio_submit(WRITE, mpd.bio); + } + blk_finish_plug(&plug); + return ret; +} +EXPORT_SYMBOL(mpage_writepages); + +int mpage_writepage(struct page *page, get_block_t get_block, + struct writeback_control *wbc) +{ + struct mpage_data mpd = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = get_block, + .use_writepage = 0, + }; + int ret = __mpage_writepage(page, wbc, &mpd); + if (mpd.bio) + mpage_bio_submit(WRITE, mpd.bio); + return ret; +} +EXPORT_SYMBOL(mpage_writepage); diff --git a/kernel/fs/namei.c b/kernel/fs/namei.c new file mode 100644 index 000000000..fe30d3be4 --- /dev/null +++ b/kernel/fs/namei.c @@ -0,0 +1,4552 @@ +/* + * linux/fs/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * Some corrections by tytso. + */ + +/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname + * lookup logic. + */ +/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "mount.h" + +/* [Feb-1997 T. Schoebel-Theuer] + * Fundamental changes in the pathname lookup mechanisms (namei) + * were necessary because of omirr. The reason is that omirr needs + * to know the _real_ pathname, not the user-supplied one, in case + * of symlinks (and also when transname replacements occur). + * + * The new code replaces the old recursive symlink resolution with + * an iterative one (in case of non-nested symlink chains). It does + * this with calls to _follow_link(). + * As a side effect, dir_namei(), _namei() and follow_link() are now + * replaced with a single function lookup_dentry() that can handle all + * the special cases of the former code. + * + * With the new dcache, the pathname is stored at each inode, at least as + * long as the refcount of the inode is positive. As a side effect, the + * size of the dcache depends on the inode cache and thus is dynamic. + * + * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink + * resolution to correspond with current state of the code. + * + * Note that the symlink resolution is not *completely* iterative. + * There is still a significant amount of tail- and mid- recursion in + * the algorithm. Also, note that _readlink() is not used in + * lookup_dentry(): lookup_dentry() on the result of _readlink() + * may return different results than _follow_link(). Many virtual + * filesystems (including /proc) exhibit this behavior. + */ + +/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: + * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL + * and the name already exists in form of a symlink, try to create the new + * name indicated by the symlink. The old code always complained that the + * name already exists, due to not following the symlink even if its target + * is nonexistent. The new semantics affects also mknod() and link() when + * the name is a symlink pointing to a non-existent name. + * + * I don't know which semantics is the right one, since I have no access + * to standards. But I found by trial that HP-UX 9.0 has the full "new" + * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the + * "old" one. Personally, I think the new semantics is much more logical. + * Note that "ln old new" where "new" is a symlink pointing to a non-existing + * file does succeed in both HP-UX and SunOs, but not in Solaris + * and in the old Linux semantics. + */ + +/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink + * semantics. See the comments in "open_namei" and "do_link" below. + * + * [10-Sep-98 Alan Modra] Another symlink change. + */ + +/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: + * inside the path - always follow. + * in the last component in creation/removal/renaming - never follow. + * if LOOKUP_FOLLOW passed - follow. + * if the pathname has trailing slashes - follow. + * otherwise - don't follow. + * (applied in that order). + * + * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT + * restored for 2.4. This is the last surviving part of old 4.2BSD bug. + * During the 2.4 we need to fix the userland stuff depending on it - + * hopefully we will be able to get rid of that wart in 2.5. So far only + * XEmacs seems to be relying on it... + */ +/* + * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) + * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives + * any extra contention... + */ + +/* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. + * + * POSIX.1 2.4: an empty pathname is invalid (ENOENT). + * PATH_MAX includes the nul terminator --RR. + */ + +#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) + +struct filename * +getname_flags(const char __user *filename, int flags, int *empty) +{ + struct filename *result; + char *kname; + int len; + + result = audit_reusename(filename); + if (result) + return result; + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + /* + * First, try to embed the struct filename inside the names_cache + * allocation + */ + kname = (char *)result->iname; + result->name = kname; + + len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); + if (unlikely(len < 0)) { + __putname(result); + return ERR_PTR(len); + } + + /* + * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a + * separate struct filename so we can dedicate the entire + * names_cache allocation for the pathname, and re-do the copy from + * userland. + */ + if (unlikely(len == EMBEDDED_NAME_MAX)) { + const size_t size = offsetof(struct filename, iname[1]); + kname = (char *)result; + + /* + * size is chosen that way we to guarantee that + * result->iname[0] is within the same object and that + * kname can't be equal to result->iname, no matter what. + */ + result = kzalloc(size, GFP_KERNEL); + if (unlikely(!result)) { + __putname(kname); + return ERR_PTR(-ENOMEM); + } + result->name = kname; + len = strncpy_from_user(kname, filename, PATH_MAX); + if (unlikely(len < 0)) { + __putname(kname); + kfree(result); + return ERR_PTR(len); + } + if (unlikely(len == PATH_MAX)) { + __putname(kname); + kfree(result); + return ERR_PTR(-ENAMETOOLONG); + } + } + + result->refcnt = 1; + /* The empty path is special. */ + if (unlikely(!len)) { + if (empty) + *empty = 1; + if (!(flags & LOOKUP_EMPTY)) { + putname(result); + return ERR_PTR(-ENOENT); + } + } + + result->uptr = filename; + result->aname = NULL; + audit_getname(result); + return result; +} + +struct filename * +getname(const char __user * filename) +{ + return getname_flags(filename, 0, NULL); +} + +struct filename * +getname_kernel(const char * filename) +{ + struct filename *result; + int len = strlen(filename) + 1; + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + if (len <= EMBEDDED_NAME_MAX) { + result->name = (char *)result->iname; + } else if (len <= PATH_MAX) { + struct filename *tmp; + + tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + if (unlikely(!tmp)) { + __putname(result); + return ERR_PTR(-ENOMEM); + } + tmp->name = (char *)result; + result = tmp; + } else { + __putname(result); + return ERR_PTR(-ENAMETOOLONG); + } + memcpy((char *)result->name, filename, len); + result->uptr = NULL; + result->aname = NULL; + result->refcnt = 1; + audit_getname(result); + + return result; +} + +void putname(struct filename *name) +{ + BUG_ON(name->refcnt <= 0); + + if (--name->refcnt > 0) + return; + + if (name->name != name->iname) { + __putname(name->name); + kfree(name); + } else + __putname(name); +} + +static int check_acl(struct inode *inode, int mask) +{ +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *acl; + + if (mask & MAY_NOT_BLOCK) { + acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); + if (!acl) + return -EAGAIN; + /* no ->get_acl() calls in RCU mode... */ + if (acl == ACL_NOT_CACHED) + return -ECHILD; + return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); + } + + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } +#endif + + return -EAGAIN; +} + +/* + * This does the basic permission checking + */ +static int acl_permission_check(struct inode *inode, int mask) +{ + unsigned int mode = inode->i_mode; + + if (likely(uid_eq(current_fsuid(), inode->i_uid))) + mode >>= 6; + else { + if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { + int error = check_acl(inode, mask); + if (error != -EAGAIN) + return error; + } + + if (in_group_p(inode->i_gid)) + mode >>= 3; + } + + /* + * If the DACs are ok we don't need any capability check. + */ + if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +/** + * generic_permission - check for access rights on a Posix-like filesystem + * @inode: inode to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) + * + * Used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things. + * + * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk + * request cannot be satisfied (eg. requires blocking or too much complexity). + * It would then be called again in ref-walk mode. + */ +int generic_permission(struct inode *inode, int mask) +{ + int ret; + + /* + * Do the basic permission checks. + */ + ret = acl_permission_check(inode, mask); + if (ret != -EACCES) + return ret; + + if (S_ISDIR(inode->i_mode)) { + /* DACs are overridable for directories */ + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + return 0; + if (!(mask & MAY_WRITE)) + if (capable_wrt_inode_uidgid(inode, + CAP_DAC_READ_SEARCH)) + return 0; + return -EACCES; + } + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable when there is + * at least one exec bit set. + */ + if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + return 0; + + /* + * Searching includes executable on directories, else just read. + */ + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + if (mask == MAY_READ) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) + return 0; + + return -EACCES; +} +EXPORT_SYMBOL(generic_permission); + +/* + * We _really_ want to just do "generic_permission()" without + * even looking at the inode->i_op values. So we keep a cache + * flag in inode->i_opflags, that says "this has not special + * permission function, use the fast case". + */ +static inline int do_inode_permission(struct inode *inode, int mask) +{ + if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { + if (likely(inode->i_op->permission)) + return inode->i_op->permission(inode, mask); + + /* This gets set once for the inode lifetime */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_FASTPERM; + spin_unlock(&inode->i_lock); + } + return generic_permission(inode, mask); +} + +/** + * __inode_permission - Check for access rights to a given inode + * @inode: Inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Check for read/write/execute permissions on an inode. + * + * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. + * + * This does not check for a read-only file system. You probably want + * inode_permission(). + */ +int __inode_permission(struct inode *inode, int mask) +{ + int retval; + + if (unlikely(mask & MAY_WRITE)) { + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE(inode)) + return -EACCES; + } + + retval = do_inode_permission(inode, mask); + if (retval) + return retval; + + retval = devcgroup_inode_permission(inode, mask); + if (retval) + return retval; + + return security_inode_permission(inode, mask); +} +EXPORT_SYMBOL(__inode_permission); + +/** + * sb_permission - Check superblock-level permissions + * @sb: Superblock of inode to check permission on + * @inode: Inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Separate out file-system wide checks from inode-specific permission checks. + */ +static int sb_permission(struct super_block *sb, struct inode *inode, int mask) +{ + if (unlikely(mask & MAY_WRITE)) { + umode_t mode = inode->i_mode; + + /* Nobody gets write access to a read-only fs. */ + if ((sb->s_flags & MS_RDONLY) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + } + return 0; +} + +/** + * inode_permission - Check for access rights to a given inode + * @inode: Inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Check for read/write/execute permissions on an inode. We use fs[ug]id for + * this, letting us set arbitrary permissions for filesystem access without + * changing the "normal" UIDs which are used for other things. + * + * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. + */ +int inode_permission(struct inode *inode, int mask) +{ + int retval; + + retval = sb_permission(inode->i_sb, inode, mask); + if (retval) + return retval; + return __inode_permission(inode, mask); +} +EXPORT_SYMBOL(inode_permission); + +/** + * path_get - get a reference to a path + * @path: path to get the reference to + * + * Given a path increment the reference count to the dentry and the vfsmount. + */ +void path_get(const struct path *path) +{ + mntget(path->mnt); + dget(path->dentry); +} +EXPORT_SYMBOL(path_get); + +/** + * path_put - put a reference to a path + * @path: path to put the reference to + * + * Given a path decrement the reference count to the dentry and the vfsmount. + */ +void path_put(const struct path *path) +{ + dput(path->dentry); + mntput(path->mnt); +} +EXPORT_SYMBOL(path_put); + +struct nameidata { + struct path path; + struct qstr last; + struct path root; + struct inode *inode; /* path.dentry.d_inode */ + unsigned int flags; + unsigned seq, m_seq; + int last_type; + unsigned depth; + struct file *base; + char *saved_names[MAX_NESTED_LINKS + 1]; +}; + +/* + * Path walking has 2 modes, rcu-walk and ref-walk (see + * Documentation/filesystems/path-lookup.txt). In situations when we can't + * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab + * normal reference counts on dentries and vfsmounts to transition to rcu-walk + * mode. Refcounts are grabbed at the last known good point before rcu-walk + * got stuck, so ref-walk may continue from there. If this is not successful + * (eg. a seqcount has changed), then failure is returned and it's up to caller + * to restart the path walk from the beginning in ref-walk mode. + */ + +/** + * unlazy_walk - try to switch to ref-walk mode. + * @nd: nameidata pathwalk data + * @dentry: child of nd->path.dentry or NULL + * Returns: 0 on success, -ECHILD on failure + * + * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry + * for ref-walk mode. @dentry must be a path found by a do_lookup call on + * @nd or NULL. Must be called from rcu-walk context. + */ +static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +{ + struct fs_struct *fs = current->fs; + struct dentry *parent = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + + /* + * After legitimizing the bastards, terminate_walk() + * will do the right thing for non-RCU mode, and all our + * subsequent exit cases should rcu_read_unlock() + * before returning. Do vfsmount first; if dentry + * can't be legitimized, just set nd->path.dentry to NULL + * and rely on dput(NULL) being a no-op. + */ + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) + return -ECHILD; + nd->flags &= ~LOOKUP_RCU; + + if (!lockref_get_not_dead(&parent->d_lockref)) { + nd->path.dentry = NULL; + goto out; + } + + /* + * For a negative lookup, the lookup sequence point is the parents + * sequence point, and it only needs to revalidate the parent dentry. + * + * For a positive lookup, we need to move both the parent and the + * dentry from the RCU domain to be properly refcounted. And the + * sequence number in the dentry validates *both* dentry counters, + * since we checked the sequence number of the parent after we got + * the child sequence number. So we know the parent must still + * be valid if the child sequence number is still valid. + */ + if (!dentry) { + if (read_seqcount_retry(&parent->d_seq, nd->seq)) + goto out; + BUG_ON(nd->inode != parent->d_inode); + } else { + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto out; + if (read_seqcount_retry(&dentry->d_seq, nd->seq)) + goto drop_dentry; + } + + /* + * Sequence counts matched. Now make sure that the root is + * still valid and get it if required. + */ + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) + goto unlock_and_drop_dentry; + path_get(&nd->root); + spin_unlock(&fs->lock); + } + + rcu_read_unlock(); + return 0; + +unlock_and_drop_dentry: + spin_unlock(&fs->lock); +drop_dentry: + rcu_read_unlock(); + dput(dentry); + goto drop_root_mnt; +out: + rcu_read_unlock(); +drop_root_mnt: + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + return -ECHILD; +} + +static inline int d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return dentry->d_op->d_revalidate(dentry, flags); +} + +/** + * complete_walk - successful completion of path walk + * @nd: pointer nameidata + * + * If we had been in RCU mode, drop out of it and legitimize nd->path. + * Revalidate the final result, unless we'd already done that during + * the path walk or the filesystem doesn't ask for it. Return 0 on + * success, -error on failure. In case of failure caller does not + * need to drop nd->path. + */ +static int complete_walk(struct nameidata *nd) +{ + struct dentry *dentry = nd->path.dentry; + int status; + + if (nd->flags & LOOKUP_RCU) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { + rcu_read_unlock(); + return -ECHILD; + } + if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { + rcu_read_unlock(); + mntput(nd->path.mnt); + return -ECHILD; + } + if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { + rcu_read_unlock(); + dput(dentry); + mntput(nd->path.mnt); + return -ECHILD; + } + rcu_read_unlock(); + } + + if (likely(!(nd->flags & LOOKUP_JUMPED))) + return 0; + + if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) + return 0; + + status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); + if (status > 0) + return 0; + + if (!status) + status = -ESTALE; + + path_put(&nd->path); + return status; +} + +static __always_inline void set_root(struct nameidata *nd) +{ + get_fs_root(current->fs, &nd->root); +} + +static int link_path_walk(const char *, struct nameidata *); + +static __always_inline unsigned set_root_rcu(struct nameidata *nd) +{ + struct fs_struct *fs = current->fs; + unsigned seq, res; + + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + res = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + return res; +} + +static void path_put_conditional(struct path *path, struct nameidata *nd) +{ + dput(path->dentry); + if (path->mnt != nd->path.mnt) + mntput(path->mnt); +} + +static inline void path_to_nameidata(const struct path *path, + struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path->mnt) + mntput(nd->path.mnt); + } + nd->path.mnt = path->mnt; + nd->path.dentry = path->dentry; +} + +/* + * Helper to directly jump to a known parsed path from ->follow_link, + * caller must have taken a reference to path beforehand. + */ +void nd_jump_link(struct nameidata *nd, struct path *path) +{ + path_put(&nd->path); + + nd->path = *path; + nd->inode = nd->path.dentry->d_inode; + nd->flags |= LOOKUP_JUMPED; +} + +void nd_set_link(struct nameidata *nd, char *path) +{ + nd->saved_names[nd->depth] = path; +} +EXPORT_SYMBOL(nd_set_link); + +char *nd_get_link(struct nameidata *nd) +{ + return nd->saved_names[nd->depth]; +} +EXPORT_SYMBOL(nd_get_link); + +static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +{ + struct inode *inode = link->dentry->d_inode; + if (inode->i_op->put_link) + inode->i_op->put_link(link->dentry, nd, cookie); + path_put(link); +} + +int sysctl_protected_symlinks __read_mostly = 0; +int sysctl_protected_hardlinks __read_mostly = 0; + +/** + * may_follow_link - Check symlink following for unsafe situations + * @link: The path of the symlink + * @nd: nameidata pathwalk data + * + * In the case of the sysctl_protected_symlinks sysctl being enabled, + * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is + * in a sticky world-writable directory. This is to protect privileged + * processes from failing races against path names that may change out + * from under them by way of other users creating malicious symlinks. + * It will permit symlinks to be followed only when outside a sticky + * world-writable directory, or when the uid of the symlink and follower + * match, or when the directory owner matches the symlink's owner. + * + * Returns 0 if following the symlink is allowed, -ve on error. + */ +static inline int may_follow_link(struct path *link, struct nameidata *nd) +{ + const struct inode *inode; + const struct inode *parent; + + if (!sysctl_protected_symlinks) + return 0; + + /* Allowed if owner and follower match. */ + inode = link->dentry->d_inode; + if (uid_eq(current_cred()->fsuid, inode->i_uid)) + return 0; + + /* Allowed if parent directory not sticky and world-writable. */ + parent = nd->path.dentry->d_inode; + if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) + return 0; + + /* Allowed if parent directory and link owner match. */ + if (uid_eq(parent->i_uid, inode->i_uid)) + return 0; + + audit_log_link_denied("follow_link", link); + path_put_conditional(link, nd); + path_put(&nd->path); + return -EACCES; +} + +/** + * safe_hardlink_source - Check for safe hardlink conditions + * @inode: the source inode to hardlink from + * + * Return false if at least one of the following conditions: + * - inode is not a regular file + * - inode is setuid + * - inode is setgid and group-exec + * - access failure for read and write + * + * Otherwise returns true. + */ +static bool safe_hardlink_source(struct inode *inode) +{ + umode_t mode = inode->i_mode; + + /* Special files should not get pinned to the filesystem. */ + if (!S_ISREG(mode)) + return false; + + /* Setuid files should not get pinned to the filesystem. */ + if (mode & S_ISUID) + return false; + + /* Executable setgid files should not get pinned to the filesystem. */ + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) + return false; + + /* Hardlinking to unreadable or unwritable sources is dangerous. */ + if (inode_permission(inode, MAY_READ | MAY_WRITE)) + return false; + + return true; +} + +/** + * may_linkat - Check permissions for creating a hardlink + * @link: the source to hardlink from + * + * Block hardlink when all of: + * - sysctl_protected_hardlinks enabled + * - fsuid does not match inode + * - hardlink source is unsafe (see safe_hardlink_source() above) + * - not CAP_FOWNER + * + * Returns 0 if successful, -ve on error. + */ +static int may_linkat(struct path *link) +{ + const struct cred *cred; + struct inode *inode; + + if (!sysctl_protected_hardlinks) + return 0; + + cred = current_cred(); + inode = link->dentry->d_inode; + + /* Source inode owner (or CAP_FOWNER) can hardlink all they like, + * otherwise, it must be a safe source. + */ + if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) || + capable(CAP_FOWNER)) + return 0; + + audit_log_link_denied("linkat", link); + return -EPERM; +} + +static __always_inline int +follow_link(struct path *link, struct nameidata *nd, void **p) +{ + struct dentry *dentry = link->dentry; + int error; + char *s; + + BUG_ON(nd->flags & LOOKUP_RCU); + + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + + error = -ELOOP; + if (unlikely(current->total_link_count >= 40)) + goto out_put_nd_path; + + cond_resched(); + current->total_link_count++; + + touch_atime(link); + nd_set_link(nd, NULL); + + error = security_inode_follow_link(link->dentry, nd); + if (error) + goto out_put_nd_path; + + nd->last_type = LAST_BIND; + *p = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(*p); + if (IS_ERR(*p)) + goto out_put_nd_path; + + error = 0; + s = nd_get_link(nd); + if (s) { + if (unlikely(IS_ERR(s))) { + path_put(&nd->path); + put_link(nd, link, *p); + return PTR_ERR(s); + } + if (*s == '/') { + if (!nd->root.mnt) + set_root(nd); + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; + } + nd->inode = nd->path.dentry->d_inode; + error = link_path_walk(s, nd); + if (unlikely(error)) + put_link(nd, link, *p); + } + + return error; + +out_put_nd_path: + *p = NULL; + path_put(&nd->path); + path_put(link); + return error; +} + +static int follow_up_rcu(struct path *path) +{ + struct mount *mnt = real_mount(path->mnt); + struct mount *parent; + struct dentry *mountpoint; + + parent = mnt->mnt_parent; + if (&parent->mnt == path->mnt) + return 0; + mountpoint = mnt->mnt_mountpoint; + path->dentry = mountpoint; + path->mnt = &parent->mnt; + return 1; +} + +/* + * follow_up - Find the mountpoint of path's vfsmount + * + * Given a path, find the mountpoint of its source file system. + * Replace @path with the path of the mountpoint in the parent mount. + * Up is towards /. + * + * Return 1 if we went up a level and 0 if we were already at the + * root. + */ +int follow_up(struct path *path) +{ + struct mount *mnt = real_mount(path->mnt); + struct mount *parent; + struct dentry *mountpoint; + + read_seqlock_excl(&mount_lock); + parent = mnt->mnt_parent; + if (parent == mnt) { + read_sequnlock_excl(&mount_lock); + return 0; + } + mntget(&parent->mnt); + mountpoint = dget(mnt->mnt_mountpoint); + read_sequnlock_excl(&mount_lock); + dput(path->dentry); + path->dentry = mountpoint; + mntput(path->mnt); + path->mnt = &parent->mnt; + return 1; +} +EXPORT_SYMBOL(follow_up); + +/* + * Perform an automount + * - return -EISDIR to tell follow_managed() to stop and return the path we + * were called with. + */ +static int follow_automount(struct path *path, unsigned flags, + bool *need_mntput) +{ + struct vfsmount *mnt; + int err; + + if (!path->dentry->d_op || !path->dentry->d_op->d_automount) + return -EREMOTE; + + /* We don't want to mount if someone's just doing a stat - + * unless they're stat'ing a directory and appended a '/' to + * the name. + * + * We do, however, want to mount if someone wants to open or + * create a file of any type under the mountpoint, wants to + * traverse through the mountpoint or wants to open the + * mounted directory. Also, autofs may mark negative dentries + * as being automount points. These will need the attentions + * of the daemon to instantiate them before they can be used. + */ + if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; + + current->total_link_count++; + if (current->total_link_count >= 40) + return -ELOOP; + + mnt = path->dentry->d_op->d_automount(path); + if (IS_ERR(mnt)) { + /* + * The filesystem is allowed to return -EISDIR here to indicate + * it doesn't want to automount. For instance, autofs would do + * this so that its userspace daemon can mount on this dentry. + * + * However, we can only permit this if it's a terminal point in + * the path being looked up; if it wasn't then the remainder of + * the path is inaccessible and we should say so. + */ + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) + return -EREMOTE; + return PTR_ERR(mnt); + } + + if (!mnt) /* mount collision */ + return 0; + + if (!*need_mntput) { + /* lock_mount() may release path->mnt on error */ + mntget(path->mnt); + *need_mntput = true; + } + err = finish_automount(mnt, path); + + switch (err) { + case -EBUSY: + /* Someone else made a mount here whilst we were busy */ + return 0; + case 0: + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + return 0; + default: + return err; + } + +} + +/* + * Handle a dentry that is managed in some way. + * - Flagged for transit management (autofs) + * - Flagged as mountpoint + * - Flagged as automount point + * + * This may only be called in refwalk mode. + * + * Serialization is taken care of in namespace.c + */ +static int follow_managed(struct path *path, unsigned flags) +{ + struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ + unsigned managed; + bool need_mntput = false; + int ret = 0; + + /* Given that we're not holding a lock here, we retain the value in a + * local variable for each dentry as we look at it so that we don't see + * the components of that value change under us */ + while (managed = ACCESS_ONCE(path->dentry->d_flags), + managed &= DCACHE_MANAGED_DENTRY, + unlikely(managed != 0)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, false); + if (ret < 0) + break; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + if (need_mntput) + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + need_mntput = true; + continue; + } + + /* Something is mounted on this dentry in another + * namespace and/or whatever was mounted there in this + * namespace got unmounted before lookup_mnt() could + * get it */ + } + + /* Handle an automount point */ + if (managed & DCACHE_NEED_AUTOMOUNT) { + ret = follow_automount(path, flags, &need_mntput); + if (ret < 0) + break; + continue; + } + + /* We didn't change the current path point */ + break; + } + + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (ret == -EISDIR) + ret = 0; + return ret < 0 ? ret : need_mntput; +} + +int follow_down_one(struct path *path) +{ + struct vfsmount *mounted; + + mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + return 1; + } + return 0; +} +EXPORT_SYMBOL(follow_down_one); + +static inline int managed_dentry_rcu(struct dentry *dentry) +{ + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? + dentry->d_op->d_manage(dentry, true) : 0; +} + +/* + * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if + * we meet a managed dentry that would need blocking. + */ +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode) +{ + for (;;) { + struct mount *mounted; + /* + * Don't forget we might have a non-mountpoint managed dentry + * that wants to block transit. + */ + switch (managed_dentry_rcu(path->dentry)) { + case -ECHILD: + default: + return false; + case -EISDIR: + return true; + case 0: + break; + } + + if (!d_mountpoint(path->dentry)) + return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); + + mounted = __lookup_mnt(path->mnt, path->dentry); + if (!mounted) + break; + path->mnt = &mounted->mnt; + path->dentry = mounted->mnt.mnt_root; + nd->flags |= LOOKUP_JUMPED; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + /* + * Update the inode too. We don't need to re-check the + * dentry sequence number here after this d_inode read, + * because a mount-point is always pinned. + */ + *inode = path->dentry->d_inode; + } + return !read_seqretry(&mount_lock, nd->m_seq) && + !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); +} + +static int follow_dotdot_rcu(struct nameidata *nd) +{ + struct inode *inode = nd->inode; + if (!nd->root.mnt) + set_root_rcu(nd); + + while (1) { + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + struct dentry *old = nd->path.dentry; + struct dentry *parent = old->d_parent; + unsigned seq; + + inode = parent->d_inode; + seq = read_seqcount_begin(&parent->d_seq); + if (read_seqcount_retry(&old->d_seq, nd->seq)) + goto failed; + nd->path.dentry = parent; + nd->seq = seq; + break; + } + if (!follow_up_rcu(&nd->path)) + break; + inode = nd->path.dentry->d_inode; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + } + while (d_mountpoint(nd->path.dentry)) { + struct mount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); + if (!mounted) + break; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; + inode = nd->path.dentry->d_inode; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + if (read_seqretry(&mount_lock, nd->m_seq)) + goto failed; + } + nd->inode = inode; + return 0; + +failed: + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + return -ECHILD; +} + +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + */ +int follow_down(struct path *path) +{ + unsigned managed; + int ret; + + while (managed = ACCESS_ONCE(path->dentry->d_flags), + unlikely(managed & DCACHE_MANAGED_DENTRY)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. + * + * We indicate to the filesystem if someone is trying to mount + * something here. This gives autofs the chance to deny anyone + * other than its daemon the right to mount on its + * superstructure. + * + * The filesystem may sleep at this point. + */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage( + path->dentry, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + continue; + } + + /* Don't handle automount points here */ + break; + } + return 0; +} +EXPORT_SYMBOL(follow_down); + +/* + * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() + */ +static void follow_mount(struct path *path) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + } +} + +static void follow_dotdot(struct nameidata *nd) +{ + if (!nd->root.mnt) + set_root(nd); + + while(1) { + struct dentry *old = nd->path.dentry; + + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + /* rare case of legitimate dget_parent()... */ + nd->path.dentry = dget_parent(nd->path.dentry); + dput(old); + break; + } + if (!follow_up(&nd->path)) + break; + } + follow_mount(&nd->path); + nd->inode = nd->path.dentry->d_inode; +} + +/* + * This looks up the name in dcache, possibly revalidates the old dentry and + * allocates a new one if not found or not valid. In the need_lookup argument + * returns whether i_op->lookup is necessary. + * + * dir->d_inode->i_mutex must be held + */ +static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, + unsigned int flags, bool *need_lookup) +{ + struct dentry *dentry; + int error; + + *need_lookup = false; + dentry = d_lookup(dir, name); + if (dentry) { + if (dentry->d_flags & DCACHE_OP_REVALIDATE) { + error = d_revalidate(dentry, flags); + if (unlikely(error <= 0)) { + if (error < 0) { + dput(dentry); + return ERR_PTR(error); + } else { + d_invalidate(dentry); + dput(dentry); + dentry = NULL; + } + } + } + } + + if (!dentry) { + dentry = d_alloc(dir, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); + + *need_lookup = true; + } + return dentry; +} + +/* + * Call i_op->lookup on the dentry. The dentry must be negative and + * unhashed. + * + * dir->d_inode->i_mutex must be held + */ +static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct dentry *old; + + /* Don't create child dentry for a dead directory. */ + if (unlikely(IS_DEADDIR(dir))) { + dput(dentry); + return ERR_PTR(-ENOENT); + } + + old = dir->i_op->lookup(dir, dentry, flags); + if (unlikely(old)) { + dput(dentry); + dentry = old; + } + return dentry; +} + +static struct dentry *__lookup_hash(struct qstr *name, + struct dentry *base, unsigned int flags) +{ + bool need_lookup; + struct dentry *dentry; + + dentry = lookup_dcache(name, base, flags, &need_lookup); + if (!need_lookup) + return dentry; + + return lookup_real(base->d_inode, dentry, flags); +} + +/* + * It's more convoluted than I'd like it to be, but... it's still fairly + * small and for now I'd prefer to have fast path as straight as possible. + * It _is_ time-critical. + */ +static int lookup_fast(struct nameidata *nd, + struct path *path, struct inode **inode) +{ + struct vfsmount *mnt = nd->path.mnt; + struct dentry *dentry, *parent = nd->path.dentry; + int need_reval = 1; + int status = 1; + int err; + + /* + * Rename seqlock is not required here because in the off chance + * of a false negative due to a concurrent rename, we're going to + * do the non-racy lookup, below. + */ + if (nd->flags & LOOKUP_RCU) { + unsigned seq; + bool negative; + dentry = __d_lookup_rcu(parent, &nd->last, &seq); + if (!dentry) + goto unlazy; + + /* + * This sequence count validates that the inode matches + * the dentry name information from lookup. + */ + *inode = dentry->d_inode; + negative = d_is_negative(dentry); + if (read_seqcount_retry(&dentry->d_seq, seq)) + return -ECHILD; + if (negative) + return -ENOENT; + + /* + * This sequence count validates that the parent had no + * changes while we did the lookup of the dentry above. + * + * The memory barrier in read_seqcount_begin of child is + * enough, we can use __read_seqcount_retry here. + */ + if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + return -ECHILD; + nd->seq = seq; + + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { + status = d_revalidate(dentry, nd->flags); + if (unlikely(status <= 0)) { + if (status != -ECHILD) + need_reval = 0; + goto unlazy; + } + } + path->mnt = mnt; + path->dentry = dentry; + if (likely(__follow_mount_rcu(nd, path, inode))) + return 0; +unlazy: + if (unlazy_walk(nd, dentry)) + return -ECHILD; + } else { + dentry = __d_lookup(parent, &nd->last); + } + + if (unlikely(!dentry)) + goto need_lookup; + + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd->flags); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + d_invalidate(dentry); + dput(dentry); + goto need_lookup; + } + + if (unlikely(d_is_negative(dentry))) { + dput(dentry); + return -ENOENT; + } + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + if (err) + nd->flags |= LOOKUP_JUMPED; + *inode = path->dentry->d_inode; + return 0; + +need_lookup: + return 1; +} + +/* Fast lookup failed, do it the slow way */ +static int lookup_slow(struct nameidata *nd, struct path *path) +{ + struct dentry *dentry, *parent; + int err; + + parent = nd->path.dentry; + BUG_ON(nd->inode != parent->d_inode); + + mutex_lock(&parent->d_inode->i_mutex); + dentry = __lookup_hash(&nd->last, parent, nd->flags); + mutex_unlock(&parent->d_inode->i_mutex); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + path->mnt = nd->path.mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + if (err) + nd->flags |= LOOKUP_JUMPED; + return 0; +} + +static inline int may_lookup(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); + if (err != -ECHILD) + return err; + if (unlazy_walk(nd, NULL)) + return -ECHILD; + } + return inode_permission(nd->inode, MAY_EXEC); +} + +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); + } + return 0; +} + +static void terminate_walk(struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + path_put(&nd->path); + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } +} + +/* + * Do we need to follow links? We _really_ want to be able + * to do this check without having to look at inode->i_op, + * so we keep a cache of "no, this doesn't need follow_link" + * for the common case. + */ +static inline int should_follow_link(struct dentry *dentry, int follow) +{ + return unlikely(d_is_symlink(dentry)) ? follow : 0; +} + +static inline int walk_component(struct nameidata *nd, struct path *path, + int follow) +{ + struct inode *inode; + int err; + /* + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. + */ + if (unlikely(nd->last_type != LAST_NORM)) + return handle_dots(nd, nd->last_type); + err = lookup_fast(nd, path, &inode); + if (unlikely(err)) { + if (err < 0) + goto out_err; + + err = lookup_slow(nd, path); + if (err < 0) + goto out_err; + + inode = path->dentry->d_inode; + err = -ENOENT; + if (d_is_negative(path->dentry)) + goto out_path_put; + } + + if (should_follow_link(path->dentry, follow)) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nd->path.mnt != path->mnt || + unlazy_walk(nd, path->dentry))) { + err = -ECHILD; + goto out_err; + } + } + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + path_to_nameidata(path, nd); + nd->inode = inode; + return 0; + +out_path_put: + path_to_nameidata(path, nd); +out_err: + terminate_walk(nd); + return err; +} + +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +static inline int nested_symlink(struct path *path, struct nameidata *nd) +{ + int res; + + if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { + path_put_conditional(path, nd); + path_put(&nd->path); + return -ELOOP; + } + BUG_ON(nd->depth >= MAX_NESTED_LINKS); + + nd->depth++; + current->link_count++; + + do { + struct path link = *path; + void *cookie; + + res = follow_link(&link, nd, &cookie); + if (res) + break; + res = walk_component(nd, path, LOOKUP_FOLLOW); + put_link(nd, &link, cookie); + } while (res > 0); + + current->link_count--; + nd->depth--; + return res; +} + +/* + * We can do the critical dentry name comparison and hashing + * operations one word at a time, but we are limited to: + * + * - Architectures with fast unaligned word accesses. We could + * do a "get_unaligned()" if this helps and is sufficiently + * fast. + * + * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we + * do not trap on the (extremely unlikely) case of a page + * crossing operation. + * + * - Furthermore, we need an efficient 64-bit compile for the + * 64-bit case in order to generate the "number of bytes in + * the final mask". Again, that could be replaced with a + * efficient population count instruction or similar. + */ +#ifdef CONFIG_DCACHE_WORD_ACCESS + +#include + +#ifdef CONFIG_64BIT + +static inline unsigned int fold_hash(unsigned long hash) +{ + return hash_64(hash, 32); +} + +#else /* 32-bit case */ + +#define fold_hash(x) (x) + +#endif + +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long a, mask; + unsigned long hash = 0; + + for (;;) { + a = load_unaligned_zeropad(name); + if (len < sizeof(unsigned long)) + break; + hash += a; + hash *= 9; + name += sizeof(unsigned long); + len -= sizeof(unsigned long); + if (!len) + goto done; + } + mask = bytemask_from_count(len); + hash += mask & a; +done: + return fold_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +/* + * Calculate the length and hash of the path component, and + * return the "hash_len" as the result. + */ +static inline u64 hash_name(const char *name) +{ + unsigned long a, b, adata, bdata, mask, hash, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + + hash = a = 0; + len = -sizeof(unsigned long); + do { + hash = (hash + a) * 9; + len += sizeof(unsigned long); + a = load_unaligned_zeropad(name+len); + b = a ^ REPEAT_BYTE('/'); + } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); + + adata = prep_zero_mask(a, adata, &constants); + bdata = prep_zero_mask(b, bdata, &constants); + + mask = create_zero_mask(adata | bdata); + + hash += a & zero_bytemask(mask); + len += find_zero(mask); + return hashlen_create(fold_hash(hash), len); +} + +#else + +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long hash = init_name_hash(); + while (len--) + hash = partial_name_hash(*name++, hash); + return end_name_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +/* + * We know there's a real path component here of at least + * one character. + */ +static inline u64 hash_name(const char *name) +{ + unsigned long hash = init_name_hash(); + unsigned long len = 0, c; + + c = (unsigned char)*name; + do { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } while (c && c != '/'); + return hashlen_create(end_name_hash(hash), len); +} + +#endif + +/* + * Name resolution. + * This is the basic name resolution function, turning a pathname into + * the final dentry. We expect 'base' to be positive and a directory. + * + * Returns 0 and nd will have valid dentry and mnt on success. + * Returns error and drops reference to input namei data on failure. + */ +static int link_path_walk(const char *name, struct nameidata *nd) +{ + struct path next; + int err; + + while (*name=='/') + name++; + if (!*name) + return 0; + + /* At this point we know we have a real path component. */ + for(;;) { + u64 hash_len; + int type; + + err = may_lookup(nd); + if (err) + break; + + hash_len = hash_name(name); + + type = LAST_NORM; + if (name[0] == '.') switch (hashlen_len(hash_len)) { + case 2: + if (name[1] == '.') { + type = LAST_DOTDOT; + nd->flags |= LOOKUP_JUMPED; + } + break; + case 1: + type = LAST_DOT; + } + if (likely(type == LAST_NORM)) { + struct dentry *parent = nd->path.dentry; + nd->flags &= ~LOOKUP_JUMPED; + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + struct qstr this = { { .hash_len = hash_len }, .name = name }; + err = parent->d_op->d_hash(parent, &this); + if (err < 0) + break; + hash_len = this.hash_len; + name = this.name; + } + } + + nd->last.hash_len = hash_len; + nd->last.name = name; + nd->last_type = type; + + name += hashlen_len(hash_len); + if (!*name) + return 0; + /* + * If it wasn't NUL, we know it was '/'. Skip that + * slash, and continue until no more slashes. + */ + do { + name++; + } while (unlikely(*name == '/')); + if (!*name) + return 0; + + err = walk_component(nd, &next, LOOKUP_FOLLOW); + if (err < 0) + return err; + + if (err) { + err = nested_symlink(&next, nd); + if (err) + return err; + } + if (!d_can_lookup(nd->path.dentry)) { + err = -ENOTDIR; + break; + } + } + terminate_walk(nd); + return err; +} + +static int path_init(int dfd, const struct filename *name, unsigned int flags, + struct nameidata *nd) +{ + int retval = 0; + const char *s = name->name; + + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; + nd->depth = 0; + nd->base = NULL; + if (flags & LOOKUP_ROOT) { + struct dentry *root = nd->root.dentry; + struct inode *inode = root->d_inode; + if (*s) { + if (!d_can_lookup(root)) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + } + nd->path = nd->root; + nd->inode = inode; + if (flags & LOOKUP_RCU) { + rcu_read_lock(); + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->m_seq = read_seqbegin(&mount_lock); + } else { + path_get(&nd->path); + } + goto done; + } + + nd->root.mnt = NULL; + + nd->m_seq = read_seqbegin(&mount_lock); + if (*s == '/') { + if (flags & LOOKUP_RCU) { + rcu_read_lock(); + nd->seq = set_root_rcu(nd); + } else { + set_root(nd); + path_get(&nd->root); + } + nd->path = nd->root; + } else if (dfd == AT_FDCWD) { + if (flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; + + rcu_read_lock(); + + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + } + } else { + /* Caller must check execute permissions on the starting path component */ + struct fd f = fdget_raw(dfd); + struct dentry *dentry; + + if (!f.file) + return -EBADF; + + dentry = f.file->f_path.dentry; + + if (*s) { + if (!d_can_lookup(dentry)) { + fdput(f); + return -ENOTDIR; + } + } + + nd->path = f.file->f_path; + if (flags & LOOKUP_RCU) { + if (f.flags & FDPUT_FPUT) + nd->base = f.file; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + rcu_read_lock(); + } else { + path_get(&nd->path); + fdput(f); + } + } + + nd->inode = nd->path.dentry->d_inode; + if (!(flags & LOOKUP_RCU)) + goto done; + if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) + goto done; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + return -ECHILD; +done: + current->total_link_count = 0; + return link_path_walk(s, nd); +} + +static void path_cleanup(struct nameidata *nd) +{ + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + if (unlikely(nd->base)) + fput(nd->base); +} + +static inline int lookup_last(struct nameidata *nd, struct path *path) +{ + if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + + nd->flags &= ~LOOKUP_PARENT; + return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW); +} + +/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +static int path_lookupat(int dfd, const struct filename *name, + unsigned int flags, struct nameidata *nd) +{ + struct path path; + int err; + + /* + * Path walking is largely split up into 2 different synchronisation + * schemes, rcu-walk and ref-walk (explained in + * Documentation/filesystems/path-lookup.txt). These share much of the + * path walk code, but some things particularly setup, cleanup, and + * following mounts are sufficiently divergent that functions are + * duplicated. Typically there is a function foo(), and its RCU + * analogue, foo_rcu(). + * + * -ECHILD is the error number of choice (just to avoid clashes) that + * is returned if some aspect of an rcu-walk fails. Such an error must + * be handled by restarting a traditional ref-walk (which will always + * be able to complete). + */ + err = path_init(dfd, name, flags, nd); + if (!err && !(flags & LOOKUP_PARENT)) { + err = lookup_last(nd, &path); + while (err > 0) { + void *cookie; + struct path link = path; + err = may_follow_link(&link, nd); + if (unlikely(err)) + break; + nd->flags |= LOOKUP_PARENT; + err = follow_link(&link, nd, &cookie); + if (err) + break; + err = lookup_last(nd, &path); + put_link(nd, &link, cookie); + } + } + + if (!err) + err = complete_walk(nd); + + if (!err && nd->flags & LOOKUP_DIRECTORY) { + if (!d_can_lookup(nd->path.dentry)) { + path_put(&nd->path); + err = -ENOTDIR; + } + } + + path_cleanup(nd); + return err; +} + +static int filename_lookup(int dfd, struct filename *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(dfd, name, flags, nd); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + + if (likely(!retval)) + audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); + return retval; +} + +/* does lookup, returns the object with parent locked */ +struct dentry *kern_path_locked(const char *name, struct path *path) +{ + struct filename *filename = getname_kernel(name); + struct nameidata nd; + struct dentry *d; + int err; + + if (IS_ERR(filename)) + return ERR_CAST(filename); + + err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd); + if (err) { + d = ERR_PTR(err); + goto out; + } + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + d = ERR_PTR(-EINVAL); + goto out; + } + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = __lookup_hash(&nd.last, nd.path.dentry, 0); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + goto out; + } + *path = nd.path; +out: + putname(filename); + return d; +} + +int kern_path(const char *name, unsigned int flags, struct path *path) +{ + struct nameidata nd; + struct filename *filename = getname_kernel(name); + int res = PTR_ERR(filename); + + if (!IS_ERR(filename)) { + res = filename_lookup(AT_FDCWD, filename, flags, &nd); + putname(filename); + if (!res) + *path = nd.path; + } + return res; +} +EXPORT_SYMBOL(kern_path); + +/** + * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair + * @dentry: pointer to dentry of the base directory + * @mnt: pointer to vfs mount of the base directory + * @name: pointer to file name + * @flags: lookup flags + * @path: pointer to struct path to fill + */ +int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, + const char *name, unsigned int flags, + struct path *path) +{ + struct filename *filename = getname_kernel(name); + int err = PTR_ERR(filename); + + BUG_ON(flags & LOOKUP_PARENT); + + /* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */ + if (!IS_ERR(filename)) { + struct nameidata nd; + nd.root.dentry = dentry; + nd.root.mnt = mnt; + err = filename_lookup(AT_FDCWD, filename, + flags | LOOKUP_ROOT, &nd); + if (!err) + *path = nd.path; + putname(filename); + } + return err; +} +EXPORT_SYMBOL(vfs_path_lookup); + +/* + * Restricted form of lookup. Doesn't follow links, single-component only, + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +static struct dentry *lookup_hash(struct nameidata *nd) +{ + return __lookup_hash(&nd->last, nd->path.dentry, nd->flags); +} + +/** + * lookup_one_len - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + */ +struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +{ + struct qstr this; + unsigned int c; + int err; + + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + + this.name = name; + this.len = len; + this.hash = full_name_hash(name, len); + if (!len) + return ERR_PTR(-EACCES); + + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return ERR_PTR(-EACCES); + } + + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + } + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, &this); + if (err < 0) + return ERR_PTR(err); + } + + err = inode_permission(base->d_inode, MAY_EXEC); + if (err) + return ERR_PTR(err); + + return __lookup_hash(&this, base, 0); +} +EXPORT_SYMBOL(lookup_one_len); + +int user_path_at_empty(int dfd, const char __user *name, unsigned flags, + struct path *path, int *empty) +{ + struct nameidata nd; + struct filename *tmp = getname_flags(name, flags, empty); + int err = PTR_ERR(tmp); + if (!IS_ERR(tmp)) { + + BUG_ON(flags & LOOKUP_PARENT); + + err = filename_lookup(dfd, tmp, flags, &nd); + putname(tmp); + if (!err) + *path = nd.path; + } + return err; +} + +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + return user_path_at_empty(dfd, name, flags, path, NULL); +} +EXPORT_SYMBOL(user_path_at); + +/* + * NB: most callers don't do anything directly with the reference to the + * to struct filename, but the nd->last pointer points into the name string + * allocated by getname. So we must hold the reference to it until all + * path-walking is complete. + */ +static struct filename * +user_path_parent(int dfd, const char __user *path, struct nameidata *nd, + unsigned int flags) +{ + struct filename *s = getname(path); + int error; + + /* only LOOKUP_REVAL is allowed in extra flags */ + flags &= LOOKUP_REVAL; + + if (IS_ERR(s)) + return s; + + error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd); + if (error) { + putname(s); + return ERR_PTR(error); + } + + return s; +} + +/** + * mountpoint_last - look up last component for umount + * @nd: pathwalk nameidata - currently pointing at parent directory of "last" + * @path: pointer to container for result + * + * This is a special lookup_last function just for umount. In this case, we + * need to resolve the path without doing any revalidation. + * + * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since + * mountpoints are always pinned in the dcache, their ancestors are too. Thus, + * in almost all cases, this lookup will be served out of the dcache. The only + * cases where it won't are if nd->last refers to a symlink or the path is + * bogus and it doesn't exist. + * + * Returns: + * -error: if there was an error during lookup. This includes -ENOENT if the + * lookup found a negative dentry. The nd->path reference will also be + * put in this case. + * + * 0: if we successfully resolved nd->path and found it to not to be a + * symlink that needs to be followed. "path" will also be populated. + * The nd->path reference will also be put. + * + * 1: if we successfully resolved nd->last and found it to be a symlink + * that needs to be followed. "path" will be populated with the path + * to the link, and nd->path will *not* be put. + */ +static int +mountpoint_last(struct nameidata *nd, struct path *path) +{ + int error = 0; + struct dentry *dentry; + struct dentry *dir = nd->path.dentry; + + /* If we're in rcuwalk, drop out of it to handle last component */ + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL)) { + error = -ECHILD; + goto out; + } + } + + nd->flags &= ~LOOKUP_PARENT; + + if (unlikely(nd->last_type != LAST_NORM)) { + error = handle_dots(nd, nd->last_type); + if (error) + goto out; + dentry = dget(nd->path.dentry); + goto done; + } + + mutex_lock(&dir->d_inode->i_mutex); + dentry = d_lookup(dir, &nd->last); + if (!dentry) { + /* + * No cached dentry. Mounted dentries are pinned in the cache, + * so that means that this dentry is probably a symlink or the + * path doesn't actually point to a mounted dentry. + */ + dentry = d_alloc(dir, &nd->last); + if (!dentry) { + error = -ENOMEM; + mutex_unlock(&dir->d_inode->i_mutex); + goto out; + } + dentry = lookup_real(dir->d_inode, dentry, nd->flags); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->d_inode->i_mutex); + goto out; + } + } + mutex_unlock(&dir->d_inode->i_mutex); + +done: + if (d_is_negative(dentry)) { + error = -ENOENT; + dput(dentry); + goto out; + } + path->dentry = dentry; + path->mnt = nd->path.mnt; + if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) + return 1; + mntget(path->mnt); + follow_mount(path); + error = 0; +out: + terminate_walk(nd); + return error; +} + +/** + * path_mountpoint - look up a path to be umounted + * @dfd: directory file descriptor to start walk from + * @name: full pathname to walk + * @path: pointer to container for result + * @flags: lookup flags + * + * Look up the given name, but don't attempt to revalidate the last component. + * Returns 0 and "path" will be valid on success; Returns error otherwise. + */ +static int +path_mountpoint(int dfd, const struct filename *name, struct path *path, + unsigned int flags) +{ + struct nameidata nd; + int err; + + err = path_init(dfd, name, flags, &nd); + if (unlikely(err)) + goto out; + + err = mountpoint_last(&nd, path); + while (err > 0) { + void *cookie; + struct path link = *path; + err = may_follow_link(&link, &nd); + if (unlikely(err)) + break; + nd.flags |= LOOKUP_PARENT; + err = follow_link(&link, &nd, &cookie); + if (err) + break; + err = mountpoint_last(&nd, path); + put_link(&nd, &link, cookie); + } +out: + path_cleanup(&nd); + return err; +} + +static int +filename_mountpoint(int dfd, struct filename *name, struct path *path, + unsigned int flags) +{ + int error; + if (IS_ERR(name)) + return PTR_ERR(name); + error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU); + if (unlikely(error == -ECHILD)) + error = path_mountpoint(dfd, name, path, flags); + if (unlikely(error == -ESTALE)) + error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL); + if (likely(!error)) + audit_inode(name, path->dentry, 0); + putname(name); + return error; +} + +/** + * user_path_mountpoint_at - lookup a path from userland in order to umount it + * @dfd: directory file descriptor + * @name: pathname from userland + * @flags: lookup flags + * @path: pointer to container to hold result + * + * A umount is a special case for path walking. We're not actually interested + * in the inode in this situation, and ESTALE errors can be a problem. We + * simply want track down the dentry and vfsmount attached at the mountpoint + * and avoid revalidating the last component. + * + * Returns 0 and populates "path" on success. + */ +int +user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, + struct path *path) +{ + return filename_mountpoint(dfd, getname(name), path, flags); +} + +int +kern_path_mountpoint(int dfd, const char *name, struct path *path, + unsigned int flags) +{ + return filename_mountpoint(dfd, getname_kernel(name), path, flags); +} +EXPORT_SYMBOL(kern_path_mountpoint); + +int __check_sticky(struct inode *dir, struct inode *inode) +{ + kuid_t fsuid = current_fsuid(); + + if (uid_eq(inode->i_uid, fsuid)) + return 0; + if (uid_eq(dir->i_uid, fsuid)) + return 0; + return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); +} +EXPORT_SYMBOL(__check_sticky); + +/* + * Check whether we can remove a link victim from directory dir, check + * whether the type of victim is right. + * 1. We can't do it if dir is read-only (done in permission()) + * 2. We should have write and exec permissions on dir + * 3. We can't remove anything from append-only dir + * 4. We can't do anything with immutable dir (done in permission()) + * 5. If the sticky bit on dir is set we should either + * a. be owner of dir, or + * b. be owner of victim, or + * c. have CAP_FOWNER capability + * 6. If the victim is append-only or immutable we can't do antyhing with + * links pointing to it. + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 9. We can't remove a root or mountpoint. + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * nfs_async_unlink(). + */ +static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) +{ + struct inode *inode = victim->d_inode; + int error; + + if (d_is_negative(victim)) + return -ENOENT; + BUG_ON(!inode); + + BUG_ON(victim->d_parent->d_inode != dir); + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); + + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + return error; + if (IS_APPEND(dir)) + return -EPERM; + + if (check_sticky(dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) + return -EPERM; + if (isdir) { + if (!d_is_dir(victim)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (d_is_dir(victim)) + return -EISDIR; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + +/* Check whether we can create an object with dentry child in directory + * dir. + * 1. We can't do it if child already exists (open has special treatment for + * this case, but since we are inlined it's OK) + * 2. We can't do it if dir is read-only (done in permission()) + * 3. We should have write and exec permissions on dir + * 4. We can't do it if dir is immutable (done in permission()) + */ +static inline int may_create(struct inode *dir, struct dentry *child) +{ + audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * p1 and p2 should be directories on the same fs. + */ +struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p; + + if (p1 == p2) { + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + return NULL; + } + + mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + + p = d_ancestor(p2, p1); + if (p) { + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + return p; + } + + p = d_ancestor(p1, p2); + if (p) { + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + return p; + } + + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); + return NULL; +} +EXPORT_SYMBOL(lock_rename); + +void unlock_rename(struct dentry *p1, struct dentry *p2) +{ + mutex_unlock(&p1->d_inode->i_mutex); + if (p1 != p2) { + mutex_unlock(&p2->d_inode->i_mutex); + mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + } +} +EXPORT_SYMBOL(unlock_rename); + +int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool want_excl) +{ + int error = may_create(dir, dentry); + if (error) + return error; + + if (!dir->i_op->create) + return -EACCES; /* shouldn't it be ENOSYS? */ + mode &= S_IALLUGO; + mode |= S_IFREG; + error = security_inode_create(dir, dentry, mode); + if (error) + return error; + error = dir->i_op->create(dir, dentry, mode, want_excl); + if (!error) + fsnotify_create(dir, dentry); + return error; +} +EXPORT_SYMBOL(vfs_create); + +static int may_open(struct path *path, int acc_mode, int flag) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = dentry->d_inode; + int error; + + /* O_PATH? */ + if (!acc_mode) + return 0; + + if (!inode) + return -ENOENT; + + switch (inode->i_mode & S_IFMT) { + case S_IFLNK: + return -ELOOP; + case S_IFDIR: + if (acc_mode & MAY_WRITE) + return -EISDIR; + break; + case S_IFBLK: + case S_IFCHR: + if (path->mnt->mnt_flags & MNT_NODEV) + return -EACCES; + /*FALLTHRU*/ + case S_IFIFO: + case S_IFSOCK: + flag &= ~O_TRUNC; + break; + } + + error = inode_permission(inode, acc_mode); + if (error) + return error; + + /* + * An append-only file must be opened in append mode for writing. + */ + if (IS_APPEND(inode)) { + if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) + return -EPERM; + if (flag & O_TRUNC) + return -EPERM; + } + + /* O_NOATIME can only be set by the owner or superuser */ + if (flag & O_NOATIME && !inode_owner_or_capable(inode)) + return -EPERM; + + return 0; +} + +static int handle_truncate(struct file *filp) +{ + struct path *path = &filp->f_path; + struct inode *inode = path->dentry->d_inode; + int error = get_write_access(inode); + if (error) + return error; + /* + * Refuse to truncate files with mandatory locks held on them. + */ + error = locks_verify_locked(filp); + if (!error) + error = security_path_truncate(path); + if (!error) { + error = do_truncate(path->dentry, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, + filp); + } + put_write_access(inode); + return error; +} + +static inline int open_to_namei_flags(int flag) +{ + if ((flag & O_ACCMODE) == 3) + flag--; + return flag; +} + +static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) +{ + int error = security_path_mknod(dir, dentry, mode, 0); + if (error) + return error; + + error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); + if (error) + return error; + + return security_inode_create(dir->dentry->d_inode, dentry, mode); +} + +/* + * Attempt to atomically look up, create and open a file from a negative + * dentry. + * + * Returns 0 if successful. The file will have been created and attached to + * @file by the filesystem calling finish_open(). + * + * Returns 1 if the file was looked up only or didn't need creating. The + * caller will need to perform the open themselves. @path will have been + * updated to point to the new dentry. This may be negative. + * + * Returns an error code otherwise. + */ +static int atomic_open(struct nameidata *nd, struct dentry *dentry, + struct path *path, struct file *file, + const struct open_flags *op, + bool got_write, bool need_lookup, + int *opened) +{ + struct inode *dir = nd->path.dentry->d_inode; + unsigned open_flag = open_to_namei_flags(op->open_flag); + umode_t mode; + int error; + int acc_mode; + int create_error = 0; + struct dentry *const DENTRY_NOT_SET = (void *) -1UL; + bool excl; + + BUG_ON(dentry->d_inode); + + /* Don't create child dentry for a dead directory. */ + if (unlikely(IS_DEADDIR(dir))) { + error = -ENOENT; + goto out; + } + + mode = op->mode; + if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) + mode &= ~current_umask(); + + excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT); + if (excl) + open_flag &= ~O_TRUNC; + + /* + * Checking write permission is tricky, bacuse we don't know if we are + * going to actually need it: O_CREAT opens should work as long as the + * file exists. But checking existence breaks atomicity. The trick is + * to check access and if not granted clear O_CREAT from the flags. + * + * Another problem is returing the "right" error value (e.g. for an + * O_EXCL open we want to return EEXIST not EROFS). + */ + if (((open_flag & (O_CREAT | O_TRUNC)) || + (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { + if (!(open_flag & O_CREAT)) { + /* + * No O_CREATE -> atomicity not a requirement -> fall + * back to lookup + open + */ + goto no_open; + } else if (open_flag & (O_EXCL | O_TRUNC)) { + /* Fall back and fail with the right error */ + create_error = -EROFS; + goto no_open; + } else { + /* No side effects, safe to clear O_CREAT */ + create_error = -EROFS; + open_flag &= ~O_CREAT; + } + } + + if (open_flag & O_CREAT) { + error = may_o_create(&nd->path, dentry, mode); + if (error) { + create_error = error; + if (open_flag & O_EXCL) + goto no_open; + open_flag &= ~O_CREAT; + } + } + + if (nd->flags & LOOKUP_DIRECTORY) + open_flag |= O_DIRECTORY; + + file->f_path.dentry = DENTRY_NOT_SET; + file->f_path.mnt = nd->path.mnt; + error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode, + opened); + if (error < 0) { + if (create_error && error == -ENOENT) + error = create_error; + goto out; + } + + if (error) { /* returned 1, that is */ + if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { + error = -EIO; + goto out; + } + if (file->f_path.dentry) { + dput(dentry); + dentry = file->f_path.dentry; + } + if (*opened & FILE_CREATED) + fsnotify_create(dir, dentry); + if (!dentry->d_inode) { + WARN_ON(*opened & FILE_CREATED); + if (create_error) { + error = create_error; + goto out; + } + } else { + if (excl && !(*opened & FILE_CREATED)) { + error = -EEXIST; + goto out; + } + } + goto looked_up; + } + + /* + * We didn't have the inode before the open, so check open permission + * here. + */ + acc_mode = op->acc_mode; + if (*opened & FILE_CREATED) { + WARN_ON(!(open_flag & O_CREAT)); + fsnotify_create(dir, dentry); + acc_mode = MAY_OPEN; + } + error = may_open(&file->f_path, acc_mode, open_flag); + if (error) + fput(file); + +out: + dput(dentry); + return error; + +no_open: + if (need_lookup) { + dentry = lookup_real(dir, dentry, nd->flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (create_error) { + int open_flag = op->open_flag; + + error = create_error; + if ((open_flag & O_EXCL)) { + if (!dentry->d_inode) + goto out; + } else if (!dentry->d_inode) { + goto out; + } else if ((open_flag & O_TRUNC) && + d_is_reg(dentry)) { + goto out; + } + /* will fail later, go on to get the right error */ + } + } +looked_up: + path->dentry = dentry; + path->mnt = nd->path.mnt; + return 1; +} + +/* + * Look up and maybe create and open the last component. + * + * Must be called with i_mutex held on parent. + * + * Returns 0 if the file was successfully atomically created (if necessary) and + * opened. In this case the file will be returned attached to @file. + * + * Returns 1 if the file was not completely opened at this time, though lookups + * and creations will have been performed and the dentry returned in @path will + * be positive upon return if O_CREAT was specified. If O_CREAT wasn't + * specified then a negative dentry may be returned. + * + * An error code is returned otherwise. + * + * FILE_CREATE will be set in @*opened if the dentry was created and will be + * cleared otherwise prior to returning. + */ +static int lookup_open(struct nameidata *nd, struct path *path, + struct file *file, + const struct open_flags *op, + bool got_write, int *opened) +{ + struct dentry *dir = nd->path.dentry; + struct inode *dir_inode = dir->d_inode; + struct dentry *dentry; + int error; + bool need_lookup; + + *opened &= ~FILE_CREATED; + dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Cached positive dentry: will open in f_op->open */ + if (!need_lookup && dentry->d_inode) + goto out_no_open; + + if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { + return atomic_open(nd, dentry, path, file, op, got_write, + need_lookup, opened); + } + + if (need_lookup) { + BUG_ON(dentry->d_inode); + + dentry = lookup_real(dir_inode, dentry, nd->flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + + /* Negative dentry, just create the file */ + if (!dentry->d_inode && (op->open_flag & O_CREAT)) { + umode_t mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); + /* + * This write is needed to ensure that a + * rw->ro transition does not occur between + * the time when the file is created and when + * a permanent write count is taken through + * the 'struct file' in finish_open(). + */ + if (!got_write) { + error = -EROFS; + goto out_dput; + } + *opened |= FILE_CREATED; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto out_dput; + error = vfs_create(dir->d_inode, dentry, mode, + nd->flags & LOOKUP_EXCL); + if (error) + goto out_dput; + } +out_no_open: + path->dentry = dentry; + path->mnt = nd->path.mnt; + return 1; + +out_dput: + dput(dentry); + return error; +} + +/* + * Handle the last step of open() + */ +static int do_last(struct nameidata *nd, struct path *path, + struct file *file, const struct open_flags *op, + int *opened, struct filename *name) +{ + struct dentry *dir = nd->path.dentry; + int open_flag = op->open_flag; + bool will_truncate = (open_flag & O_TRUNC) != 0; + bool got_write = false; + int acc_mode = op->acc_mode; + struct inode *inode; + bool symlink_ok = false; + struct path save_parent = { .dentry = NULL, .mnt = NULL }; + bool retried = false; + int error; + + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= op->intent; + + if (nd->last_type != LAST_NORM) { + error = handle_dots(nd, nd->last_type); + if (error) + return error; + goto finish_open; + } + + if (!(open_flag & O_CREAT)) { + if (nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) + symlink_ok = true; + /* we _can_ be in RCU mode here */ + error = lookup_fast(nd, path, &inode); + if (likely(!error)) + goto finish_lookup; + + if (error < 0) + goto out; + + BUG_ON(nd->inode != dir->d_inode); + } else { + /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED + * has been cleared when we got to the last component we are + * about to look up + */ + error = complete_walk(nd); + if (error) + return error; + + audit_inode(name, dir, LOOKUP_PARENT); + error = -EISDIR; + /* trailing slashes? */ + if (nd->last.name[nd->last.len]) + goto out; + } + +retry_lookup: + if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { + error = mnt_want_write(nd->path.mnt); + if (!error) + got_write = true; + /* + * do _not_ fail yet - we might not need that or fail with + * a different error; let lookup_open() decide; we'll be + * dropping this one anyway. + */ + } + mutex_lock(&dir->d_inode->i_mutex); + error = lookup_open(nd, path, file, op, got_write, opened); + mutex_unlock(&dir->d_inode->i_mutex); + + if (error <= 0) { + if (error) + goto out; + + if ((*opened & FILE_CREATED) || + !S_ISREG(file_inode(file)->i_mode)) + will_truncate = false; + + audit_inode(name, file->f_path.dentry, 0); + goto opened; + } + + if (*opened & FILE_CREATED) { + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + will_truncate = false; + acc_mode = MAY_OPEN; + path_to_nameidata(path, nd); + goto finish_open_created; + } + + /* + * create/update audit record if it already exists. + */ + if (d_is_positive(path->dentry)) + audit_inode(name, path->dentry, 0); + + /* + * If atomic_open() acquired write access it is dropped now due to + * possible mount and symlink following (this might be optimized away if + * necessary...) + */ + if (got_write) { + mnt_drop_write(nd->path.mnt); + got_write = false; + } + + error = -EEXIST; + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) + goto exit_dput; + + error = follow_managed(path, nd->flags); + if (error < 0) + goto exit_dput; + + if (error) + nd->flags |= LOOKUP_JUMPED; + + BUG_ON(nd->flags & LOOKUP_RCU); + inode = path->dentry->d_inode; + error = -ENOENT; + if (d_is_negative(path->dentry)) { + path_to_nameidata(path, nd); + goto out; + } +finish_lookup: + /* we _can_ be in RCU mode here */ + if (should_follow_link(path->dentry, !symlink_ok)) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nd->path.mnt != path->mnt || + unlazy_walk(nd, path->dentry))) { + error = -ECHILD; + goto out; + } + } + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + + if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { + path_to_nameidata(path, nd); + } else { + save_parent.dentry = nd->path.dentry; + save_parent.mnt = mntget(path->mnt); + nd->path.dentry = path->dentry; + + } + nd->inode = inode; + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ +finish_open: + error = complete_walk(nd); + if (error) { + path_put(&save_parent); + return error; + } + audit_inode(name, nd->path.dentry, 0); + error = -EISDIR; + if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) + goto out; + error = -ENOTDIR; + if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) + goto out; + if (!d_is_reg(nd->path.dentry)) + will_truncate = false; + + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto out; + got_write = true; + } +finish_open_created: + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto out; + + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file, current_cred()); + if (!error) { + *opened |= FILE_OPENED; + } else { + if (error == -EOPENSTALE) + goto stale_open; + goto out; + } +opened: + error = open_check_o_direct(file); + if (error) + goto exit_fput; + error = ima_file_check(file, op->acc_mode, *opened); + if (error) + goto exit_fput; + + if (will_truncate) { + error = handle_truncate(file); + if (error) + goto exit_fput; + } +out: + if (got_write) + mnt_drop_write(nd->path.mnt); + path_put(&save_parent); + terminate_walk(nd); + return error; + +exit_dput: + path_put_conditional(path, nd); + goto out; +exit_fput: + fput(file); + goto out; + +stale_open: + /* If no saved parent or already retried then can't retry */ + if (!save_parent.dentry || retried) + goto out; + + BUG_ON(save_parent.dentry != dir); + path_put(&nd->path); + nd->path = save_parent; + nd->inode = dir->d_inode; + save_parent.mnt = NULL; + save_parent.dentry = NULL; + if (got_write) { + mnt_drop_write(nd->path.mnt); + got_write = false; + } + retried = true; + goto retry_lookup; +} + +static int do_tmpfile(int dfd, struct filename *pathname, + struct nameidata *nd, int flags, + const struct open_flags *op, + struct file *file, int *opened) +{ + static const struct qstr name = QSTR_INIT("/", 1); + struct dentry *dentry, *child; + struct inode *dir; + int error = path_lookupat(dfd, pathname, + flags | LOOKUP_DIRECTORY, nd); + if (unlikely(error)) + return error; + error = mnt_want_write(nd->path.mnt); + if (unlikely(error)) + goto out; + /* we want directory to be writable */ + error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC); + if (error) + goto out2; + dentry = nd->path.dentry; + dir = dentry->d_inode; + if (!dir->i_op->tmpfile) { + error = -EOPNOTSUPP; + goto out2; + } + child = d_alloc(dentry, &name); + if (unlikely(!child)) { + error = -ENOMEM; + goto out2; + } + nd->flags &= ~LOOKUP_DIRECTORY; + nd->flags |= op->intent; + dput(nd->path.dentry); + nd->path.dentry = child; + error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode); + if (error) + goto out2; + audit_inode(pathname, nd->path.dentry, 0); + /* Don't check for other permissions, the inode was just created */ + error = may_open(&nd->path, MAY_OPEN, op->open_flag); + if (error) + goto out2; + file->f_path.mnt = nd->path.mnt; + error = finish_open(file, nd->path.dentry, NULL, opened); + if (error) + goto out2; + error = open_check_o_direct(file); + if (error) { + fput(file); + } else if (!(op->open_flag & O_EXCL)) { + struct inode *inode = file_inode(file); + spin_lock(&inode->i_lock); + inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + } +out2: + mnt_drop_write(nd->path.mnt); +out: + path_put(&nd->path); + return error; +} + +static struct file *path_openat(int dfd, struct filename *pathname, + struct nameidata *nd, const struct open_flags *op, int flags) +{ + struct file *file; + struct path path; + int opened = 0; + int error; + + file = get_empty_filp(); + if (IS_ERR(file)) + return file; + + file->f_flags = op->open_flag; + + if (unlikely(file->f_flags & __O_TMPFILE)) { + error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); + goto out2; + } + + error = path_init(dfd, pathname, flags, nd); + if (unlikely(error)) + goto out; + + error = do_last(nd, &path, file, op, &opened, pathname); + while (unlikely(error > 0)) { /* trailing symlink */ + struct path link = path; + void *cookie; + if (!(nd->flags & LOOKUP_FOLLOW)) { + path_put_conditional(&path, nd); + path_put(&nd->path); + error = -ELOOP; + break; + } + error = may_follow_link(&link, nd); + if (unlikely(error)) + break; + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + error = follow_link(&link, nd, &cookie); + if (unlikely(error)) + break; + error = do_last(nd, &path, file, op, &opened, pathname); + put_link(nd, &link, cookie); + } +out: + path_cleanup(nd); +out2: + if (!(opened & FILE_OPENED)) { + BUG_ON(!error); + put_filp(file); + } + if (unlikely(error)) { + if (error == -EOPENSTALE) { + if (flags & LOOKUP_RCU) + error = -ECHILD; + else + error = -ESTALE; + } + file = ERR_PTR(error); + } + return file; +} + +struct file *do_filp_open(int dfd, struct filename *pathname, + const struct open_flags *op) +{ + struct nameidata nd; + int flags = op->lookup_flags; + struct file *filp; + + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(dfd, pathname, &nd, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + return filp; +} + +struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *name, const struct open_flags *op) +{ + struct nameidata nd; + struct file *file; + struct filename *filename; + int flags = op->lookup_flags | LOOKUP_ROOT; + + nd.root.mnt = mnt; + nd.root.dentry = dentry; + + if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) + return ERR_PTR(-ELOOP); + + filename = getname_kernel(name); + if (unlikely(IS_ERR(filename))) + return ERR_CAST(filename); + + file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU); + if (unlikely(file == ERR_PTR(-ECHILD))) + file = path_openat(-1, filename, &nd, op, flags); + if (unlikely(file == ERR_PTR(-ESTALE))) + file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL); + putname(filename); + return file; +} + +static struct dentry *filename_create(int dfd, struct filename *name, + struct path *path, unsigned int lookup_flags) +{ + struct dentry *dentry = ERR_PTR(-EEXIST); + struct nameidata nd; + int err2; + int error; + bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); + + /* + * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any + * other flags passed in are ignored! + */ + lookup_flags &= LOOKUP_REVAL; + + error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd); + if (error) + return ERR_PTR(error); + + /* + * Yucky last component or no last component at all? + * (foo/., foo/.., /////) + */ + if (nd.last_type != LAST_NORM) + goto out; + nd.flags &= ~LOOKUP_PARENT; + nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; + + /* don't fail immediately if it's r/o, at least try to report other errors */ + err2 = mnt_want_write(nd.path.mnt); + /* + * Do the final lookup. + */ + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + if (IS_ERR(dentry)) + goto unlock; + + error = -EEXIST; + if (d_is_positive(dentry)) + goto fail; + + /* + * Special case - lookup gave negative, but... we had foo/bar/ + * From the vfs_mknod() POV we just have a negative dentry - + * all is fine. Let's be bastards - you had / on the end, you've + * been asking for (non-existent) directory. -ENOENT for you. + */ + if (unlikely(!is_dir && nd.last.name[nd.last.len])) { + error = -ENOENT; + goto fail; + } + if (unlikely(err2)) { + error = err2; + goto fail; + } + *path = nd.path; + return dentry; +fail: + dput(dentry); + dentry = ERR_PTR(error); +unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + if (!err2) + mnt_drop_write(nd.path.mnt); +out: + path_put(&nd.path); + return dentry; +} + +struct dentry *kern_path_create(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) +{ + struct filename *filename = getname_kernel(pathname); + struct dentry *res; + + if (IS_ERR(filename)) + return ERR_CAST(filename); + res = filename_create(dfd, filename, path, lookup_flags); + putname(filename); + return res; +} +EXPORT_SYMBOL(kern_path_create); + +void done_path_create(struct path *path, struct dentry *dentry) +{ + dput(dentry); + mutex_unlock(&path->dentry->d_inode->i_mutex); + mnt_drop_write(path->mnt); + path_put(path); +} +EXPORT_SYMBOL(done_path_create); + +struct dentry *user_path_create(int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) +{ + struct filename *tmp = getname(pathname); + struct dentry *res; + if (IS_ERR(tmp)) + return ERR_CAST(tmp); + res = filename_create(dfd, tmp, path, lookup_flags); + putname(tmp); + return res; +} +EXPORT_SYMBOL(user_path_create); + +int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + return -EPERM; + + if (!dir->i_op->mknod) + return -EPERM; + + error = devcgroup_inode_mknod(mode, dev); + if (error) + return error; + + error = security_inode_mknod(dir, dentry, mode, dev); + if (error) + return error; + + error = dir->i_op->mknod(dir, dentry, mode, dev); + if (!error) + fsnotify_create(dir, dentry); + return error; +} +EXPORT_SYMBOL(vfs_mknod); + +static int may_mknod(umode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + case 0: /* zero mode translates to S_IFREG */ + return 0; + case S_IFDIR: + return -EPERM; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, + unsigned, dev) +{ + struct dentry *dentry; + struct path path; + int error; + unsigned int lookup_flags = 0; + + error = may_mknod(mode); + if (error) + return error; +retry: + dentry = user_path_create(dfd, filename, &path, lookup_flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (!IS_POSIXACL(path.dentry->d_inode)) + mode &= ~current_umask(); + error = security_path_mknod(&path, dentry, mode, dev); + if (error) + goto out; + switch (mode & S_IFMT) { + case 0: case S_IFREG: + error = vfs_create(path.dentry->d_inode,dentry,mode,true); + break; + case S_IFCHR: case S_IFBLK: + error = vfs_mknod(path.dentry->d_inode,dentry,mode, + new_decode_dev(dev)); + break; + case S_IFIFO: case S_IFSOCK: + error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); + break; + } +out: + done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} + +SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) +{ + return sys_mknodat(AT_FDCWD, filename, mode, dev); +} + +int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int error = may_create(dir, dentry); + unsigned max_links = dir->i_sb->s_max_links; + + if (error) + return error; + + if (!dir->i_op->mkdir) + return -EPERM; + + mode &= (S_IRWXUGO|S_ISVTX); + error = security_inode_mkdir(dir, dentry, mode); + if (error) + return error; + + if (max_links && dir->i_nlink >= max_links) + return -EMLINK; + + error = dir->i_op->mkdir(dir, dentry, mode); + if (!error) + fsnotify_mkdir(dir, dentry); + return error; +} +EXPORT_SYMBOL(vfs_mkdir); + +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) +{ + struct dentry *dentry; + struct path path; + int error; + unsigned int lookup_flags = LOOKUP_DIRECTORY; + +retry: + dentry = user_path_create(dfd, pathname, &path, lookup_flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (!IS_POSIXACL(path.dentry->d_inode)) + mode &= ~current_umask(); + error = security_path_mkdir(&path, dentry, mode); + if (!error) + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} + +SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) +{ + return sys_mkdirat(AT_FDCWD, pathname, mode); +} + +/* + * The dentry_unhash() helper will try to drop the dentry early: we + * should have a usage count of 1 if we're the only user of this + * dentry, and if that is true (possibly after pruning the dcache), + * then we drop the dentry now. + * + * A low-level filesystem can, if it choses, legally + * do a + * + * if (!d_unhashed(dentry)) + * return -EBUSY; + * + * if it cannot handle the case of removing a directory + * that is still in use by something else.. + */ +void dentry_unhash(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + spin_lock(&dentry->d_lock); + if (dentry->d_lockref.count == 1) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(dentry_unhash); + +int vfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error = may_delete(dir, dentry, 1); + + if (error) + return error; + + if (!dir->i_op->rmdir) + return -EPERM; + + dget(dentry); + mutex_lock(&dentry->d_inode->i_mutex); + + error = -EBUSY; + if (is_local_mountpoint(dentry)) + goto out; + + error = security_inode_rmdir(dir, dentry); + if (error) + goto out; + + shrink_dcache_parent(dentry); + error = dir->i_op->rmdir(dir, dentry); + if (error) + goto out; + + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + detach_mounts(dentry); + +out: + mutex_unlock(&dentry->d_inode->i_mutex); + dput(dentry); + if (!error) + d_delete(dentry); + return error; +} +EXPORT_SYMBOL(vfs_rmdir); + +static long do_rmdir(int dfd, const char __user *pathname) +{ + int error = 0; + struct filename *name; + struct dentry *dentry; + struct nameidata nd; + unsigned int lookup_flags = 0; +retry: + name = user_path_parent(dfd, pathname, &nd, lookup_flags); + if (IS_ERR(name)) + return PTR_ERR(name); + + switch(nd.last_type) { + case LAST_DOTDOT: + error = -ENOTEMPTY; + goto exit1; + case LAST_DOT: + error = -EINVAL; + goto exit1; + case LAST_ROOT: + error = -EBUSY; + goto exit1; + } + + nd.flags &= ~LOOKUP_PARENT; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit1; + + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto exit2; + if (!dentry->d_inode) { + error = -ENOENT; + goto exit3; + } + error = security_path_rmdir(&nd.path, dentry); + if (error) + goto exit3; + error = vfs_rmdir(nd.path.dentry->d_inode, dentry); +exit3: + dput(dentry); +exit2: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mnt_drop_write(nd.path.mnt); +exit1: + path_put(&nd.path); + putname(name); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} + +SYSCALL_DEFINE1(rmdir, const char __user *, pathname) +{ + return do_rmdir(AT_FDCWD, pathname); +} + +/** + * vfs_unlink - unlink a filesystem object + * @dir: parent directory + * @dentry: victim + * @delegated_inode: returns victim inode, if the inode is delegated. + * + * The caller must hold dir->i_mutex. + * + * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and + * return a reference to the inode in delegated_inode. The caller + * should then break the delegation on that inode and retry. Because + * breaking a delegation may take a long time, the caller should drop + * dir->i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) +{ + struct inode *target = dentry->d_inode; + int error = may_delete(dir, dentry, 0); + + if (error) + return error; + + if (!dir->i_op->unlink) + return -EPERM; + + mutex_lock(&target->i_mutex); + if (is_local_mountpoint(dentry)) + error = -EBUSY; + else { + error = security_inode_unlink(dir, dentry); + if (!error) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + error = dir->i_op->unlink(dir, dentry); + if (!error) { + dont_mount(dentry); + detach_mounts(dentry); + } + } + } +out: + mutex_unlock(&target->i_mutex); + + /* We don't d_delete() NFS sillyrenamed files--they still exist. */ + if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { + fsnotify_link_count(target); + d_delete(dentry); + } + + return error; +} +EXPORT_SYMBOL(vfs_unlink); + +/* + * Make sure that the actual truncation of the file will occur outside its + * directory's i_mutex. Truncate can take a long time if there is a lot of + * writeout happening, and we don't want to prevent access to the directory + * while waiting on the I/O. + */ +static long do_unlinkat(int dfd, const char __user *pathname) +{ + int error; + struct filename *name; + struct dentry *dentry; + struct nameidata nd; + struct inode *inode = NULL; + struct inode *delegated_inode = NULL; + unsigned int lookup_flags = 0; +retry: + name = user_path_parent(dfd, pathname, &nd, lookup_flags); + if (IS_ERR(name)) + return PTR_ERR(name); + + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; + + nd.flags &= ~LOOKUP_PARENT; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit1; +retry_deleg: + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? Because we want correct error value */ + if (nd.last.name[nd.last.len]) + goto slashes; + inode = dentry->d_inode; + if (d_is_negative(dentry)) + goto slashes; + ihold(inode); + error = security_path_unlink(&nd.path, dentry); + if (error) + goto exit2; + error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode); +exit2: + dput(dentry); + } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + if (inode) + iput(inode); /* truncate the inode here */ + inode = NULL; + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + mnt_drop_write(nd.path.mnt); +exit1: + path_put(&nd.path); + putname(name); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + inode = NULL; + goto retry; + } + return error; + +slashes: + if (d_is_negative(dentry)) + error = -ENOENT; + else if (d_is_dir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; + goto exit2; +} + +SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) +{ + if ((flag & ~AT_REMOVEDIR) != 0) + return -EINVAL; + + if (flag & AT_REMOVEDIR) + return do_rmdir(dfd, pathname); + + return do_unlinkat(dfd, pathname); +} + +SYSCALL_DEFINE1(unlink, const char __user *, pathname) +{ + return do_unlinkat(AT_FDCWD, pathname); +} + +int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if (!dir->i_op->symlink) + return -EPERM; + + error = security_inode_symlink(dir, dentry, oldname); + if (error) + return error; + + error = dir->i_op->symlink(dir, dentry, oldname); + if (!error) + fsnotify_create(dir, dentry); + return error; +} +EXPORT_SYMBOL(vfs_symlink); + +SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + int error; + struct filename *from; + struct dentry *dentry; + struct path path; + unsigned int lookup_flags = 0; + + from = getname(oldname); + if (IS_ERR(from)) + return PTR_ERR(from); +retry: + dentry = user_path_create(newdfd, newname, &path, lookup_flags); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_putname; + + error = security_path_symlink(&path, dentry, from->name); + if (!error) + error = vfs_symlink(path.dentry->d_inode, dentry, from->name); + done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +out_putname: + putname(from); + return error; +} + +SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) +{ + return sys_symlinkat(oldname, AT_FDCWD, newname); +} + +/** + * vfs_link - create a new link + * @old_dentry: object to be linked + * @dir: new parent + * @new_dentry: where to create the new link + * @delegated_inode: returns inode needing a delegation break + * + * The caller must hold dir->i_mutex + * + * If vfs_link discovers a delegation on the to-be-linked file in need + * of breaking, it will return -EWOULDBLOCK and return a reference to the + * inode in delegated_inode. The caller should then break the delegation + * and retry. Because breaking a delegation may take a long time, the + * caller should drop the i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) +{ + struct inode *inode = old_dentry->d_inode; + unsigned max_links = dir->i_sb->s_max_links; + int error; + + if (!inode) + return -ENOENT; + + error = may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A link to an append-only or immutable file cannot be created. + */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + if (!dir->i_op->link) + return -EPERM; + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + error = security_inode_link(old_dentry, dir, new_dentry); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + /* Make sure we don't allow creating hardlink to an unlinked file */ + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + error = -ENOENT; + else if (max_links && inode->i_nlink >= max_links) + error = -EMLINK; + else { + error = try_break_deleg(inode, delegated_inode); + if (!error) + error = dir->i_op->link(old_dentry, dir, new_dentry); + } + + if (!error && (inode->i_state & I_LINKABLE)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_LINKABLE; + spin_unlock(&inode->i_lock); + } + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_link(dir, inode, new_dentry); + return error; +} +EXPORT_SYMBOL(vfs_link); + +/* + * Hardlinks are often used in delicate situations. We avoid + * security-related surprises by not following symlinks on the + * newname. --KAB + * + * We don't follow them on the oldname either to be compatible + * with linux 2.0, and to avoid hard-linking to directories + * and other special files. --ADM + */ +SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, int, flags) +{ + struct dentry *new_dentry; + struct path old_path, new_path; + struct inode *delegated_inode = NULL; + int how = 0; + int error; + + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + /* + * To use null names we require CAP_DAC_READ_SEARCH + * This ensures that not everyone will be able to create + * handlink using the passed filedescriptor. + */ + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; + how = LOOKUP_EMPTY; + } + + if (flags & AT_SYMLINK_FOLLOW) + how |= LOOKUP_FOLLOW; +retry: + error = user_path_at(olddfd, oldname, how, &old_path); + if (error) + return error; + + new_dentry = user_path_create(newdfd, newname, &new_path, + (how & LOOKUP_REVAL)); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto out; + + error = -EXDEV; + if (old_path.mnt != new_path.mnt) + goto out_dput; + error = may_linkat(&old_path); + if (unlikely(error)) + goto out_dput; + error = security_path_link(old_path.dentry, &new_path, new_dentry); + if (error) + goto out_dput; + error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); +out_dput: + done_path_create(&new_path, new_dentry); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) { + path_put(&old_path); + goto retry; + } + } + if (retry_estale(error, how)) { + path_put(&old_path); + how |= LOOKUP_REVAL; + goto retry; + } +out: + path_put(&old_path); + + return error; +} + +SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) +{ + return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); +} + +/** + * vfs_rename - rename a filesystem object + * @old_dir: parent of source + * @old_dentry: source + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * @flags: rename flags + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * + * The worst of all namespace operations - renaming directory. "Perverted" + * doesn't even start to describe it. Somebody in UCB had a heck of a trip... + * Problems: + * a) we can get into loop creation. + * b) race potential - two innocent renames can create a loop together. + * That's where 4.4 screws up. Current fix: serialization on + * sb->s_vfs_rename_mutex. We might be more accurate, but that's another + * story. + * c) we have to lock _four_ objects - parents and victim (if it exists), + * and source (if it is not a directory). + * And that - after we got ->i_mutex on parents (until then we don't know + * whether the target exists). Solution: try to be smart with locking + * order for inodes. We rely on the fact that tree topology may change + * only under ->s_vfs_rename_mutex _and_ that parent of the object we + * move will be locked. Thus we can rank directories by the tree + * (ancestors first) and rank all non-directories after them. + * That works since everybody except rename does "lock parent, lookup, + * lock child" and rename is under ->s_vfs_rename_mutex. + * HOWEVER, it relies on the assumption that any object with ->lookup() + * has no more than 1 dentry. If "hybrid" objects will ever appear, + * we'd better make sure that there's no link(2) for them. + * d) conversion from fhandle to dentry may come in the wrong moment - when + * we are removing the target. Solution: we will have to grab ->i_mutex + * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on + * ->i_mutex on parents, which works but leads to some truly excessive + * locking]. + */ +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode, unsigned int flags) +{ + int error; + bool is_dir = d_is_dir(old_dentry); + const unsigned char *old_name; + struct inode *source = old_dentry->d_inode; + struct inode *target = new_dentry->d_inode; + bool new_is_dir = false; + unsigned max_links = new_dir->i_sb->s_max_links; + + if (source == target) + return 0; + + error = may_delete(old_dir, old_dentry, is_dir); + if (error) + return error; + + if (!target) { + error = may_create(new_dir, new_dentry); + } else { + new_is_dir = d_is_dir(new_dentry); + + if (!(flags & RENAME_EXCHANGE)) + error = may_delete(new_dir, new_dentry, is_dir); + else + error = may_delete(new_dir, new_dentry, new_is_dir); + } + if (error) + return error; + + if (!old_dir->i_op->rename && !old_dir->i_op->rename2) + return -EPERM; + + if (flags && !old_dir->i_op->rename2) + return -EINVAL; + + /* + * If we are going to change the parent - check write permissions, + * we'll need to flip '..'. + */ + if (new_dir != old_dir) { + if (is_dir) { + error = inode_permission(source, MAY_WRITE); + if (error) + return error; + } + if ((flags & RENAME_EXCHANGE) && new_is_dir) { + error = inode_permission(target, MAY_WRITE); + if (error) + return error; + } + } + + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (error) + return error; + + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + dget(new_dentry); + if (!is_dir || (flags & RENAME_EXCHANGE)) + lock_two_nondirectories(source, target); + else if (target) + mutex_lock(&target->i_mutex); + + error = -EBUSY; + if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) + goto out; + + if (max_links && new_dir != old_dir) { + error = -EMLINK; + if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) + goto out; + if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && + old_dir->i_nlink >= max_links) + goto out; + } + if (is_dir && !(flags & RENAME_EXCHANGE) && target) + shrink_dcache_parent(new_dentry); + if (!is_dir) { + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; + } + if (target && !new_is_dir) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + } + if (!old_dir->i_op->rename2) { + error = old_dir->i_op->rename(old_dir, old_dentry, + new_dir, new_dentry); + } else { + WARN_ON(old_dir->i_op->rename != NULL); + error = old_dir->i_op->rename2(old_dir, old_dentry, + new_dir, new_dentry, flags); + } + if (error) + goto out; + + if (!(flags & RENAME_EXCHANGE) && target) { + if (is_dir) + target->i_flags |= S_DEAD; + dont_mount(new_dentry); + detach_mounts(new_dentry); + } + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { + if (!(flags & RENAME_EXCHANGE)) + d_move(old_dentry, new_dentry); + else + d_exchange(old_dentry, new_dentry); + } +out: + if (!is_dir || (flags & RENAME_EXCHANGE)) + unlock_two_nondirectories(source, target); + else if (target) + mutex_unlock(&target->i_mutex); + dput(new_dentry); + if (!error) { + fsnotify_move(old_dir, new_dir, old_name, is_dir, + !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); + if (flags & RENAME_EXCHANGE) { + fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, + new_is_dir, NULL, new_dentry); + } + } + fsnotify_oldname_free(old_name); + + return error; +} +EXPORT_SYMBOL(vfs_rename); + +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) +{ + struct dentry *old_dir, *new_dir; + struct dentry *old_dentry, *new_dentry; + struct dentry *trap; + struct nameidata oldnd, newnd; + struct inode *delegated_inode = NULL; + struct filename *from; + struct filename *to; + unsigned int lookup_flags = 0; + bool should_retry = false; + int error; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) + return -EINVAL; + + if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) + return -EPERM; + +retry: + from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); + if (IS_ERR(from)) { + error = PTR_ERR(from); + goto exit; + } + + to = user_path_parent(newdfd, newname, &newnd, lookup_flags); + if (IS_ERR(to)) { + error = PTR_ERR(to); + goto exit1; + } + + error = -EXDEV; + if (oldnd.path.mnt != newnd.path.mnt) + goto exit2; + + old_dir = oldnd.path.dentry; + error = -EBUSY; + if (oldnd.last_type != LAST_NORM) + goto exit2; + + new_dir = newnd.path.dentry; + if (flags & RENAME_NOREPLACE) + error = -EEXIST; + if (newnd.last_type != LAST_NORM) + goto exit2; + + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit2; + + oldnd.flags &= ~LOOKUP_PARENT; + newnd.flags &= ~LOOKUP_PARENT; + if (!(flags & RENAME_EXCHANGE)) + newnd.flags |= LOOKUP_RENAME_TARGET; + +retry_deleg: + trap = lock_rename(new_dir, old_dir); + + old_dentry = lookup_hash(&oldnd); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; + /* source must exist */ + error = -ENOENT; + if (d_is_negative(old_dentry)) + goto exit4; + new_dentry = lookup_hash(&newnd); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + error = -EEXIST; + if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) + goto exit5; + if (flags & RENAME_EXCHANGE) { + error = -ENOENT; + if (d_is_negative(new_dentry)) + goto exit5; + + if (!d_is_dir(new_dentry)) { + error = -ENOTDIR; + if (newnd.last.name[newnd.last.len]) + goto exit5; + } + } + /* unless the source is a directory trailing slashes give -ENOTDIR */ + if (!d_is_dir(old_dentry)) { + error = -ENOTDIR; + if (oldnd.last.name[oldnd.last.len]) + goto exit5; + if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len]) + goto exit5; + } + /* source should not be ancestor of target */ + error = -EINVAL; + if (old_dentry == trap) + goto exit5; + /* target should not be an ancestor of source */ + if (!(flags & RENAME_EXCHANGE)) + error = -ENOTEMPTY; + if (new_dentry == trap) + goto exit5; + + error = security_path_rename(&oldnd.path, old_dentry, + &newnd.path, new_dentry, flags); + if (error) + goto exit5; + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry, + &delegated_inode, flags); +exit5: + dput(new_dentry); +exit4: + dput(old_dentry); +exit3: + unlock_rename(new_dir, old_dir); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + mnt_drop_write(oldnd.path.mnt); +exit2: + if (retry_estale(error, lookup_flags)) + should_retry = true; + path_put(&newnd.path); + putname(to); +exit1: + path_put(&oldnd.path); + putname(from); + if (should_retry) { + should_retry = false; + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +exit: + return error; +} + +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + return sys_renameat2(olddfd, oldname, newdfd, newname, 0); +} + +SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) +{ + return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); +} + +int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + int error = may_create(dir, dentry); + if (error) + return error; + + if (!dir->i_op->mknod) + return -EPERM; + + return dir->i_op->mknod(dir, dentry, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} +EXPORT_SYMBOL(vfs_whiteout); + +int readlink_copy(char __user *buffer, int buflen, const char *link) +{ + int len = PTR_ERR(link); + if (IS_ERR(link)) + goto out; + + len = strlen(link); + if (len > (unsigned) buflen) + len = buflen; + if (copy_to_user(buffer, link, len)) + len = -EFAULT; +out: + return len; +} +EXPORT_SYMBOL(readlink_copy); + +/* + * A helper for ->readlink(). This should be used *ONLY* for symlinks that + * have ->follow_link() touching nd only in nd_set_link(). Using (or not + * using) it for any given inode is up to filesystem. + */ +int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct nameidata nd; + void *cookie; + int res; + + nd.depth = 0; + cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); + if (IS_ERR(cookie)) + return PTR_ERR(cookie); + + res = readlink_copy(buffer, buflen, nd_get_link(&nd)); + if (dentry->d_inode->i_op->put_link) + dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + return res; +} +EXPORT_SYMBOL(generic_readlink); + +/* get the link contents into pagecache */ +static char *page_getlink(struct dentry * dentry, struct page **ppage) +{ + char *kaddr; + struct page *page; + struct address_space *mapping = dentry->d_inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + return (char*)page; + *ppage = page; + kaddr = kmap(page); + nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); + return kaddr; +} + +int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct page *page = NULL; + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); + if (page) { + kunmap(page); + page_cache_release(page); + } + return res; +} +EXPORT_SYMBOL(page_readlink); + +void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = NULL; + nd_set_link(nd, page_getlink(dentry, &page)); + return page; +} +EXPORT_SYMBOL(page_follow_link_light); + +void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + struct page *page = cookie; + + if (page) { + kunmap(page); + page_cache_release(page); + } +} +EXPORT_SYMBOL(page_put_link); + +/* + * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS + */ +int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + int err; + char *kaddr; + unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + if (nofs) + flags |= AOP_FLAG_NOFS; + +retry: + err = pagecache_write_begin(NULL, mapping, 0, len-1, + flags, &page, &fsdata); + if (err) + goto fail; + + kaddr = kmap_atomic(page); + memcpy(kaddr, symname, len-1); + kunmap_atomic(kaddr); + + err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, + page, fsdata); + if (err < 0) + goto fail; + if (err < len-1) + goto retry; + + mark_inode_dirty(inode); + return 0; +fail: + return err; +} +EXPORT_SYMBOL(__page_symlink); + +int page_symlink(struct inode *inode, const char *symname, int len) +{ + return __page_symlink(inode, symname, len, + !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); +} +EXPORT_SYMBOL(page_symlink); + +const struct inode_operations page_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; +EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/kernel/fs/namespace.c b/kernel/fs/namespace.c new file mode 100644 index 000000000..c246b29e4 --- /dev/null +++ b/kernel/fs/namespace.c @@ -0,0 +1,3298 @@ +/* + * linux/fs/namespace.c + * + * (C) Copyright Al Viro 2000, 2001 + * Released under GPL v2. + * + * Based on code from fs/super.c, copyright Linus Torvalds and others. + * Heavily rewritten. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* init_rootfs */ +#include /* get_fs_root et.al. */ +#include /* fsnotify_vfsmount_delete */ +#include +#include +#include +#include +#include +#include "pnode.h" +#include "internal.h" + +static unsigned int m_hash_mask __read_mostly; +static unsigned int m_hash_shift __read_mostly; +static unsigned int mp_hash_mask __read_mostly; +static unsigned int mp_hash_shift __read_mostly; + +static __initdata unsigned long mhash_entries; +static int __init set_mhash_entries(char *str) +{ + if (!str) + return 0; + mhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mhash_entries=", set_mhash_entries); + +static __initdata unsigned long mphash_entries; +static int __init set_mphash_entries(char *str) +{ + if (!str) + return 0; + mphash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mphash_entries=", set_mphash_entries); + +static u64 event; +static DEFINE_IDA(mnt_id_ida); +static DEFINE_IDA(mnt_group_ida); +static DEFINE_SPINLOCK(mnt_id_lock); +static int mnt_id_start = 0; +static int mnt_group_start = 1; + +static struct hlist_head *mount_hashtable __read_mostly; +static struct hlist_head *mountpoint_hashtable __read_mostly; +static struct kmem_cache *mnt_cache __read_mostly; +static DECLARE_RWSEM(namespace_sem); + +/* /sys/fs */ +struct kobject *fs_kobj; +EXPORT_SYMBOL_GPL(fs_kobj); + +/* + * vfsmount lock may be taken for read to prevent changes to the + * vfsmount hash, ie. during mountpoint lookups or walking back + * up the tree. + * + * It should be taken for write in all cases where the vfsmount + * tree or hash is modified or when a vfsmount structure is modified. + */ +__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); + +static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); + tmp += ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> m_hash_shift); + return &mount_hashtable[tmp & m_hash_mask]; +} + +static inline struct hlist_head *mp_hash(struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> mp_hash_shift); + return &mountpoint_hashtable[tmp & mp_hash_mask]; +} + +/* + * allocation is serialized by namespace_sem, but we need the spinlock to + * serialize with freeing. + */ +static int mnt_alloc_id(struct mount *mnt) +{ + int res; + +retry: + ida_pre_get(&mnt_id_ida, GFP_KERNEL); + spin_lock(&mnt_id_lock); + res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); + if (!res) + mnt_id_start = mnt->mnt_id + 1; + spin_unlock(&mnt_id_lock); + if (res == -EAGAIN) + goto retry; + + return res; +} + +static void mnt_free_id(struct mount *mnt) +{ + int id = mnt->mnt_id; + spin_lock(&mnt_id_lock); + ida_remove(&mnt_id_ida, id); + if (mnt_id_start > id) + mnt_id_start = id; + spin_unlock(&mnt_id_lock); +} + +/* + * Allocate a new peer group ID + * + * mnt_group_ida is protected by namespace_sem + */ +static int mnt_alloc_group_id(struct mount *mnt) +{ + int res; + + if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) + return -ENOMEM; + + res = ida_get_new_above(&mnt_group_ida, + mnt_group_start, + &mnt->mnt_group_id); + if (!res) + mnt_group_start = mnt->mnt_group_id + 1; + + return res; +} + +/* + * Release a peer group ID + */ +void mnt_release_group_id(struct mount *mnt) +{ + int id = mnt->mnt_group_id; + ida_remove(&mnt_group_ida, id); + if (mnt_group_start > id) + mnt_group_start = id; + mnt->mnt_group_id = 0; +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_add_count(struct mount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_add(mnt->mnt_pcp->mnt_count, n); +#else + preempt_disable(); + mnt->mnt_count += n; + preempt_enable(); +#endif +} + +/* + * vfsmount lock must be held for write + */ +unsigned int mnt_get_count(struct mount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; + } + + return count; +#else + return mnt->mnt_count; +#endif +} + +static void drop_mountpoint(struct fs_pin *p) +{ + struct mount *m = container_of(p, struct mount, mnt_umount); + dput(m->mnt_ex_mountpoint); + pin_remove(p); + mntput(&m->mnt); +} + +static struct mount *alloc_vfsmnt(const char *name) +{ + struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); + if (mnt) { + int err; + + err = mnt_alloc_id(mnt); + if (err) + goto out_free_cache; + + if (name) { + mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; + } + +#ifdef CONFIG_SMP + mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); + if (!mnt->mnt_pcp) + goto out_free_devname; + + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#else + mnt->mnt_count = 1; + mnt->mnt_writers = 0; +#endif + + INIT_HLIST_NODE(&mnt->mnt_hash); + INIT_LIST_HEAD(&mnt->mnt_child); + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_expire); + INIT_LIST_HEAD(&mnt->mnt_share); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_HLIST_NODE(&mnt->mnt_mp_list); +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); +#endif + init_fs_pin(&mnt->mnt_umount, drop_mountpoint); + } + return mnt; + +#ifdef CONFIG_SMP +out_free_devname: + kfree_const(mnt->mnt_devname); +#endif +out_free_id: + mnt_free_id(mnt); +out_free_cache: + kmem_cache_free(mnt_cache, mnt); + return NULL; +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly ouside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This can not and + * should not be used in place of IS_RDONLY(inode). + * mnt_want/drop_write() will _keep_ the filesystem + * r/w. + */ +int __mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_flags & MNT_READONLY) + return 1; + if (mnt->mnt_sb->s_flags & MS_RDONLY) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + +static inline void mnt_inc_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_inc(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers++; +#endif +} + +static inline void mnt_dec_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_dec(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers--; +#endif +} + +static unsigned int mnt_get_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; + } + + return count; +#else + return mnt->mnt_writers; +#endif +} + +static int mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_sb->s_readonly_remount) + return 1; + /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + smp_rmb(); + return __mnt_is_readonly(mnt); +} + +/* + * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink(). We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem. + */ +/** + * __mnt_want_write - get write access to a mount without freeze protection + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount. + */ +int __mnt_want_write(struct vfsmount *m) +{ + struct mount *mnt = real_mount(m); + int ret = 0; + + preempt_disable(); + mnt_inc_writers(mnt); + /* + * The store to mnt_inc_writers must be visible before we pass + * MNT_WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + preempt_enable(); + cpu_chill(); + preempt_disable(); + } + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until + * MNT_WRITE_HOLD is cleared. + */ + smp_rmb(); + if (mnt_is_readonly(m)) { + mnt_dec_writers(mnt); + ret = -EROFS; + } + preempt_enable(); + + return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success. When the write operation is + * finished, mnt_drop_write() must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *m) +{ + int ret; + + sb_start_write(m->mnt_sb); + ret = __mnt_want_write(m); + if (ret) + sb_end_write(m->mnt_sb); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write); + +/** + * mnt_clone_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This is effectively like mnt_want_write, except + * it must only be used to take an extra write reference + * on a mountpoint that we already know has a write reference + * on it. This allows some optimisation. + * + * After finished, mnt_drop_write must be called as usual to + * drop the reference. + */ +int mnt_clone_write(struct vfsmount *mnt) +{ + /* superblock may be r/o */ + if (__mnt_is_readonly(mnt)) + return -EROFS; + preempt_disable(); + mnt_inc_writers(real_mount(mnt)); + preempt_enable(); + return 0; +} +EXPORT_SYMBOL_GPL(mnt_clone_write); + +/** + * __mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like __mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int __mnt_want_write_file(struct file *file) +{ + if (!(file->f_mode & FMODE_WRITER)) + return __mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); +} + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + sb_start_write(file->f_path.mnt->mnt_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file->f_path.mnt->mnt_sb); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write_file); + +/** + * __mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done + * performing writes to it. Must be matched with + * __mnt_want_write() call above. + */ +void __mnt_drop_write(struct vfsmount *mnt) +{ + preempt_disable(); + mnt_dec_writers(real_mount(mnt)); + preempt_enable(); +} + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ + __mnt_drop_write(mnt); + sb_end_write(mnt->mnt_sb); +} +EXPORT_SYMBOL_GPL(mnt_drop_write); + +void __mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); +} + +void mnt_drop_write_file(struct file *file) +{ + mnt_drop_write(file->f_path.mnt); +} +EXPORT_SYMBOL(mnt_drop_write_file); + +static int mnt_make_readonly(struct mount *mnt) +{ + int ret = 0; + + lock_mount_hash(); + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + /* + * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * should be visible before we do. + */ + smp_mb(); + + /* + * With writers on hold, if this value is zero, then there are + * definitely no active writers (although held writers may subsequently + * increment the count, they'll have to wait, and decrement it after + * seeing MNT_READONLY). + * + * It is OK to have counter incremented on one CPU and decremented on + * another: the sum will add up correctly. The danger would be when we + * sum up each counter, if we read a counter before it is incremented, + * but then read another CPU's count which it has been subsequently + * decremented from -- we would see more decrements than we should. + * MNT_WRITE_HOLD protects against this scenario, because + * mnt_want_write first increments count, then smp_mb, then spins on + * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * we're counting up here. + */ + if (mnt_get_writers(mnt) > 0) + ret = -EBUSY; + else + mnt->mnt.mnt_flags |= MNT_READONLY; + /* + * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * that become unheld will see MNT_READONLY. + */ + smp_wmb(); + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + unlock_mount_hash(); + return ret; +} + +static void __mnt_unmake_readonly(struct mount *mnt) +{ + lock_mount_hash(); + mnt->mnt.mnt_flags &= ~MNT_READONLY; + unlock_mount_hash(); +} + +int sb_prepare_remount_readonly(struct super_block *sb) +{ + struct mount *mnt; + int err = 0; + + /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + if (atomic_long_read(&sb->s_remove_count)) + return -EBUSY; + + lock_mount_hash(); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + smp_mb(); + if (mnt_get_writers(mnt) > 0) { + err = -EBUSY; + break; + } + } + } + if (!err && atomic_long_read(&sb->s_remove_count)) + err = -EBUSY; + + if (!err) { + sb->s_readonly_remount = 1; + smp_wmb(); + } + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + } + unlock_mount_hash(); + + return err; +} + +static void free_vfsmnt(struct mount *mnt) +{ + kfree_const(mnt->mnt_devname); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_pcp); +#endif + kmem_cache_free(mnt_cache, mnt); +} + +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + struct mount *mnt; + if (read_seqretry(&mount_lock, seq)) + return false; + if (bastard == NULL) + return true; + mnt = real_mount(bastard); + mnt_add_count(mnt, 1); + if (likely(!read_seqretry(&mount_lock, seq))) + return true; + if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { + mnt_add_count(mnt, -1); + return false; + } + rcu_read_unlock(); + mntput(bastard); + rcu_read_lock(); + return false; +} + +/* + * find the first mount at @dentry on vfsmount @mnt. + * call under rcu_read_lock() + */ +struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) +{ + struct hlist_head *head = m_hash(mnt, dentry); + struct mount *p; + + hlist_for_each_entry_rcu(p, head, mnt_hash) + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) + return p; + return NULL; +} + +/* + * find the last mount at @dentry on vfsmount @mnt. + * mount_lock must be held. + */ +struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) +{ + struct mount *p, *res = NULL; + p = __lookup_mnt(mnt, dentry); + if (!p) + goto out; + if (!(p->mnt.mnt_flags & MNT_UMOUNT)) + res = p; + hlist_for_each_entry_continue(p, mnt_hash) { + if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) + break; + if (!(p->mnt.mnt_flags & MNT_UMOUNT)) + res = p; + } +out: + return res; +} + +/* + * lookup_mnt - Return the first child mount mounted at path + * + * "First" means first mounted chronologically. If you create the + * following mounts: + * + * mount /dev/sda1 /mnt + * mount /dev/sda2 /mnt + * mount /dev/sda3 /mnt + * + * Then lookup_mnt() on the base /mnt dentry in the root mount will + * return successively the root dentry and vfsmount of /dev/sda1, then + * /dev/sda2, then /dev/sda3, then NULL. + * + * lookup_mnt takes a reference to the found vfsmount. + */ +struct vfsmount *lookup_mnt(struct path *path) +{ + struct mount *child_mnt; + struct vfsmount *m; + unsigned seq; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + child_mnt = __lookup_mnt(path->mnt, path->dentry); + m = child_mnt ? &child_mnt->mnt : NULL; + } while (!legitimize_mnt(m, seq)); + rcu_read_unlock(); + return m; +} + +/* + * __is_local_mountpoint - Test to see if dentry is a mountpoint in the + * current mount namespace. + * + * The common case is dentries are not mountpoints at all and that + * test is handled inline. For the slow case when we are actually + * dealing with a mountpoint of some kind, walk through all of the + * mounts in the current mount namespace and test to see if the dentry + * is a mountpoint. + * + * The mount_hashtable is not usable in the context because we + * need to identify all mounts that may be in the current mount + * namespace not just a mount that happens to have some specified + * parent mount. + */ +bool __is_local_mountpoint(struct dentry *dentry) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool is_covered = false; + + if (!d_mountpoint(dentry)) + goto out; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + is_covered = (mnt->mnt_mountpoint == dentry); + if (is_covered) + break; + } + up_read(&namespace_sem); +out: + return is_covered; +} + +static struct mountpoint *lookup_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + + hlist_for_each_entry(mp, chain, m_hash) { + if (mp->m_dentry == dentry) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); + mp->m_count++; + return mp; + } + } + return NULL; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + int ret; + + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); + + ret = d_set_mounted(dentry); + if (ret) { + kfree(mp); + return ERR_PTR(ret); + } + + mp->m_dentry = dentry; + mp->m_count = 1; + hlist_add_head(&mp->m_hash, chain); + INIT_HLIST_HEAD(&mp->m_list); + return mp; +} + +static void put_mountpoint(struct mountpoint *mp) +{ + if (!--mp->m_count) { + struct dentry *dentry = mp->m_dentry; + BUG_ON(!hlist_empty(&mp->m_list)); + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + hlist_del(&mp->m_hash); + kfree(mp); + } +} + +static inline int check_mnt(struct mount *mnt) +{ + return mnt->mnt_ns == current->nsproxy->mnt_ns; +} + +/* + * vfsmount lock must be held for write + */ +static void touch_mnt_namespace(struct mnt_namespace *ns) +{ + if (ns) { + ns->event = ++event; + wake_up_interruptible(&ns->poll); + } +} + +/* + * vfsmount lock must be held for write + */ +static void __touch_mnt_namespace(struct mnt_namespace *ns) +{ + if (ns && ns->event != event) { + ns->event = event; + wake_up_interruptible(&ns->poll); + } +} + +/* + * vfsmount lock must be held for write + */ +static void unhash_mnt(struct mount *mnt) +{ + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + list_del_init(&mnt->mnt_child); + hlist_del_init_rcu(&mnt->mnt_hash); + hlist_del_init(&mnt->mnt_mp_list); + put_mountpoint(mnt->mnt_mp); + mnt->mnt_mp = NULL; +} + +/* + * vfsmount lock must be held for write + */ +static void detach_mnt(struct mount *mnt, struct path *old_path) +{ + old_path->dentry = mnt->mnt_mountpoint; + old_path->mnt = &mnt->mnt_parent->mnt; + unhash_mnt(mnt); +} + +/* + * vfsmount lock must be held for write + */ +static void umount_mnt(struct mount *mnt) +{ + /* old mountpoint will be dropped when we can do that */ + mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint; + unhash_mnt(mnt); +} + +/* + * vfsmount lock must be held for write + */ +void mnt_set_mountpoint(struct mount *mnt, + struct mountpoint *mp, + struct mount *child_mnt) +{ + mp->m_count++; + mnt_add_count(mnt, 1); /* essentially, that's mntget */ + child_mnt->mnt_mountpoint = dget(mp->m_dentry); + child_mnt->mnt_parent = mnt; + child_mnt->mnt_mp = mp; + hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); +} + +/* + * vfsmount lock must be held for write + */ +static void attach_mnt(struct mount *mnt, + struct mount *parent, + struct mountpoint *mp) +{ + mnt_set_mountpoint(parent, mp, mnt); + hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); +} + +static void attach_shadowed(struct mount *mnt, + struct mount *parent, + struct mount *shadows) +{ + if (shadows) { + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); + list_add(&mnt->mnt_child, &shadows->mnt_child); + } else { + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + } +} + +/* + * vfsmount lock must be held for write + */ +static void commit_tree(struct mount *mnt, struct mount *shadows) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m; + LIST_HEAD(head); + struct mnt_namespace *n = parent->mnt_ns; + + BUG_ON(parent == mnt); + + list_add_tail(&head, &mnt->mnt_list); + list_for_each_entry(m, &head, mnt_list) + m->mnt_ns = n; + + list_splice(&head, n->list.prev); + + attach_shadowed(mnt, parent, shadows); + touch_mnt_namespace(n); +} + +static struct mount *next_mnt(struct mount *p, struct mount *root) +{ + struct list_head *next = p->mnt_mounts.next; + if (next == &p->mnt_mounts) { + while (1) { + if (p == root) + return NULL; + next = p->mnt_child.next; + if (next != &p->mnt_parent->mnt_mounts) + break; + p = p->mnt_parent; + } + } + return list_entry(next, struct mount, mnt_child); +} + +static struct mount *skip_mnt_tree(struct mount *p) +{ + struct list_head *prev = p->mnt_mounts.prev; + while (prev != &p->mnt_mounts) { + p = list_entry(prev, struct mount, mnt_child); + prev = p->mnt_mounts.prev; + } + return p; +} + +struct vfsmount * +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +{ + struct mount *mnt; + struct dentry *root; + + if (!type) + return ERR_PTR(-ENODEV); + + mnt = alloc_vfsmnt(name); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flags & MS_KERNMOUNT) + mnt->mnt.mnt_flags = MNT_INTERNAL; + + root = mount_fs(type, flags, name, data); + if (IS_ERR(root)) { + mnt_free_id(mnt); + free_vfsmnt(mnt); + return ERR_CAST(root); + } + + mnt->mnt.mnt_root = root; + mnt->mnt.mnt_sb = root->d_sb; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + lock_mount_hash(); + list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + unlock_mount_hash(); + return &mnt->mnt; +} +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +static struct mount *clone_mnt(struct mount *old, struct dentry *root, + int flag) +{ + struct super_block *sb = old->mnt.mnt_sb; + struct mount *mnt; + int err; + + mnt = alloc_vfsmnt(old->mnt_devname); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; + + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); + /* Don't allow unprivileged users to change mount flags */ + if (flag & CL_UNPRIVILEGED) { + mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; + + if (mnt->mnt.mnt_flags & MNT_READONLY) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + + if (mnt->mnt.mnt_flags & MNT_NODEV) + mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; + + if (mnt->mnt.mnt_flags & MNT_NOSUID) + mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; + + if (mnt->mnt.mnt_flags & MNT_NOEXEC) + mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; + } + + /* Don't allow unprivileged users to reveal what is under a mount */ + if ((flag & CL_UNPRIVILEGED) && + (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire))) + mnt->mnt.mnt_flags |= MNT_LOCKED; + + atomic_inc(&sb->s_active); + mnt->mnt.mnt_sb = sb; + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + lock_mount_hash(); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + unlock_mount_hash(); + + if ((flag & CL_SLAVE) || + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); + mnt->mnt_master = old; + CLEAR_MNT_SHARED(mnt); + } else if (!(flag & CL_PRIVATE)) { + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) + list_add(&mnt->mnt_share, &old->mnt_share); + if (IS_MNT_SLAVE(old)) + list_add(&mnt->mnt_slave, &old->mnt_slave); + mnt->mnt_master = old->mnt_master; + } + if (flag & CL_MAKE_SHARED) + set_mnt_shared(mnt); + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + if (flag & CL_EXPIRE) { + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); + } + + return mnt; + + out_free: + mnt_free_id(mnt); + free_vfsmnt(mnt); + return ERR_PTR(err); +} + +static void cleanup_mnt(struct mount *mnt) +{ + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + /* + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. + */ + WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +static void __cleanup_mnt(struct rcu_head *head) +{ + cleanup_mnt(container_of(head, struct mount, mnt_rcu)); +} + +static LLIST_HEAD(delayed_mntput_list); +static void delayed_mntput(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&delayed_mntput_list); + struct llist_node *next; + + for (; node; node = next) { + next = llist_next(node); + cleanup_mnt(llist_entry(node, struct mount, mnt_llist)); + } +} +static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); + +static void mntput_no_expire(struct mount *mnt) +{ + rcu_read_lock(); + mnt_add_count(mnt, -1); + if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ + rcu_read_unlock(); + return; + } + lock_mount_hash(); + if (mnt_get_count(mnt)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + mnt->mnt.mnt_flags |= MNT_DOOMED; + rcu_read_unlock(); + + list_del(&mnt->mnt_instance); + + if (unlikely(!list_empty(&mnt->mnt_mounts))) { + struct mount *p, *tmp; + list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { + umount_mnt(p); + } + } + unlock_mount_hash(); + + if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { + struct task_struct *task = current; + if (likely(!(task->flags & PF_KTHREAD))) { + init_task_work(&mnt->mnt_rcu, __cleanup_mnt); + if (!task_work_add(task, &mnt->mnt_rcu, true)) + return; + } + if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) + schedule_delayed_work(&delayed_mntput_work, 1); + return; + } + cleanup_mnt(mnt); +} + +void mntput(struct vfsmount *mnt) +{ + if (mnt) { + struct mount *m = real_mount(mnt); + /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + if (unlikely(m->mnt_expiry_mark)) + m->mnt_expiry_mark = 0; + mntput_no_expire(m); + } +} +EXPORT_SYMBOL(mntput); + +struct vfsmount *mntget(struct vfsmount *mnt) +{ + if (mnt) + mnt_add_count(real_mount(mnt), 1); + return mnt; +} +EXPORT_SYMBOL(mntget); + +struct vfsmount *mnt_clone_internal(struct path *path) +{ + struct mount *p; + p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); + if (IS_ERR(p)) + return ERR_CAST(p); + p->mnt.mnt_flags |= MNT_INTERNAL; + return &p->mnt; +} + +static inline void mangle(struct seq_file *m, const char *s) +{ + seq_escape(m, s, " \t\n\\"); +} + +/* + * Simple .show_options callback for filesystems which don't want to + * implement more complex mount option showing. + * + * See also save_mount_options(). + */ +int generic_show_options(struct seq_file *m, struct dentry *root) +{ + const char *options; + + rcu_read_lock(); + options = rcu_dereference(root->d_sb->s_options); + + if (options != NULL && options[0]) { + seq_putc(m, ','); + mangle(m, options); + } + rcu_read_unlock(); + + return 0; +} +EXPORT_SYMBOL(generic_show_options); + +/* + * If filesystem uses generic_show_options(), this function should be + * called from the fill_super() callback. + * + * The .remount_fs callback usually needs to be handled in a special + * way, to make sure, that previous options are not overwritten if the + * remount fails. + * + * Also note, that if the filesystem's .remount_fs function doesn't + * reset all options to their default value, but changes only newly + * given options, then the displayed options will not reflect reality + * any more. + */ +void save_mount_options(struct super_block *sb, char *options) +{ + BUG_ON(sb->s_options); + rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); +} +EXPORT_SYMBOL(save_mount_options); + +void replace_mount_options(struct super_block *sb, char *options) +{ + char *old = sb->s_options; + rcu_assign_pointer(sb->s_options, options); + if (old) { + synchronize_rcu(); + kfree(old); + } +} +EXPORT_SYMBOL(replace_mount_options); + +#ifdef CONFIG_PROC_FS +/* iterator; we want it to have access to namespace_sem, thus here... */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_mounts *p = proc_mounts(m); + + down_read(&namespace_sem); + if (p->cached_event == p->ns->event) { + void *v = p->cached_mount; + if (*pos == p->cached_index) + return v; + if (*pos == p->cached_index + 1) { + v = seq_list_next(v, &p->ns->list, &p->cached_index); + return p->cached_mount = v; + } + } + + p->cached_event = p->ns->event; + p->cached_mount = seq_list_start(&p->ns->list, *pos); + p->cached_index = *pos; + return p->cached_mount; +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_mounts *p = proc_mounts(m); + + p->cached_mount = seq_list_next(v, &p->ns->list, pos); + p->cached_index = *pos; + return p->cached_mount; +} + +static void m_stop(struct seq_file *m, void *v) +{ + up_read(&namespace_sem); +} + +static int m_show(struct seq_file *m, void *v) +{ + struct proc_mounts *p = proc_mounts(m); + struct mount *r = list_entry(v, struct mount, mnt_list); + return p->show(m, &r->mnt); +} + +const struct seq_operations mounts_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = m_show, +}; +#endif /* CONFIG_PROC_FS */ + +/** + * may_umount_tree - check if a mount tree is busy + * @mnt: root of mount tree + * + * This is called to check if a tree of mounts has any + * open files, pwds, chroots or sub mounts that are + * busy. + */ +int may_umount_tree(struct vfsmount *m) +{ + struct mount *mnt = real_mount(m); + int actual_refs = 0; + int minimum_refs = 0; + struct mount *p; + BUG_ON(!m); + + /* write lock needed for mnt_get_count */ + lock_mount_hash(); + for (p = mnt; p; p = next_mnt(p, mnt)) { + actual_refs += mnt_get_count(p); + minimum_refs += 2; + } + unlock_mount_hash(); + + if (actual_refs > minimum_refs) + return 0; + + return 1; +} + +EXPORT_SYMBOL(may_umount_tree); + +/** + * may_umount - check if a mount point is busy + * @mnt: root of mount + * + * This is called to check if a mount point has any + * open files, pwds, chroots or sub mounts. If the + * mount has sub mounts this will return busy + * regardless of whether the sub mounts are busy. + * + * Doesn't take quota and stuff into account. IOW, in some cases it will + * give false negatives. The main reason why it's here is that we need + * a non-destructive way to look for easily umountable filesystems. + */ +int may_umount(struct vfsmount *mnt) +{ + int ret = 1; + down_read(&namespace_sem); + lock_mount_hash(); + if (propagate_mount_busy(real_mount(mnt), 2)) + ret = 0; + unlock_mount_hash(); + up_read(&namespace_sem); + return ret; +} + +EXPORT_SYMBOL(may_umount); + +static HLIST_HEAD(unmounted); /* protected by namespace_sem */ + +static void namespace_unlock(void) +{ + struct hlist_head head; + + hlist_move_list(&unmounted, &head); + + up_write(&namespace_sem); + + if (likely(hlist_empty(&head))) + return; + + synchronize_rcu(); + + group_pin_kill(&head); +} + +static inline void namespace_lock(void) +{ + down_write(&namespace_sem); +} + +enum umount_tree_flags { + UMOUNT_SYNC = 1, + UMOUNT_PROPAGATE = 2, + UMOUNT_CONNECTED = 4, +}; +/* + * mount_lock must be held + * namespace_sem must be held for write + */ +static void umount_tree(struct mount *mnt, enum umount_tree_flags how) +{ + LIST_HEAD(tmp_list); + struct mount *p; + + if (how & UMOUNT_PROPAGATE) + propagate_mount_unlock(mnt); + + /* Gather the mounts to umount */ + for (p = mnt; p; p = next_mnt(p, mnt)) { + p->mnt.mnt_flags |= MNT_UMOUNT; + list_move(&p->mnt_list, &tmp_list); + } + + /* Hide the mounts from mnt_mounts */ + list_for_each_entry(p, &tmp_list, mnt_list) { + list_del_init(&p->mnt_child); + } + + /* Add propogated mounts to the tmp_list */ + if (how & UMOUNT_PROPAGATE) + propagate_umount(&tmp_list); + + while (!list_empty(&tmp_list)) { + bool disconnect; + p = list_first_entry(&tmp_list, struct mount, mnt_list); + list_del_init(&p->mnt_expire); + list_del_init(&p->mnt_list); + __touch_mnt_namespace(p->mnt_ns); + p->mnt_ns = NULL; + if (how & UMOUNT_SYNC) + p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; + + disconnect = !(((how & UMOUNT_CONNECTED) && + mnt_has_parent(p) && + (p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) || + IS_MNT_LOCKED_AND_LAZY(p)); + + pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, + disconnect ? &unmounted : NULL); + if (mnt_has_parent(p)) { + mnt_add_count(p->mnt_parent, -1); + if (!disconnect) { + /* Don't forget about p */ + list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); + } else { + umount_mnt(p); + } + } + change_mnt_propagation(p, MS_PRIVATE); + } +} + +static void shrink_submounts(struct mount *mnt); + +static int do_umount(struct mount *mnt, int flags) +{ + struct super_block *sb = mnt->mnt.mnt_sb; + int retval; + + retval = security_sb_umount(&mnt->mnt, flags); + if (retval) + return retval; + + /* + * Allow userspace to request a mountpoint be expired rather than + * unmounting unconditionally. Unmount only happens if: + * (1) the mark is already set (the mark is cleared by mntput()) + * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] + */ + if (flags & MNT_EXPIRE) { + if (&mnt->mnt == current->fs->root.mnt || + flags & (MNT_FORCE | MNT_DETACH)) + return -EINVAL; + + /* + * probably don't strictly need the lock here if we examined + * all race cases, but it's a slowpath. + */ + lock_mount_hash(); + if (mnt_get_count(mnt) != 2) { + unlock_mount_hash(); + return -EBUSY; + } + unlock_mount_hash(); + + if (!xchg(&mnt->mnt_expiry_mark, 1)) + return -EAGAIN; + } + + /* + * If we may have to abort operations to get out of this + * mount, and they will themselves hold resources we must + * allow the fs to do things. In the Unix tradition of + * 'Gee thats tricky lets do it in userspace' the umount_begin + * might fail to complete on the first run through as other tasks + * must return, and the like. Thats for the mount program to worry + * about for the moment. + */ + + if (flags & MNT_FORCE && sb->s_op->umount_begin) { + sb->s_op->umount_begin(sb); + } + + /* + * No sense to grab the lock for this test, but test itself looks + * somewhat bogus. Suggestions for better replacement? + * Ho-hum... In principle, we might treat that as umount + switch + * to rootfs. GC would eventually take care of the old vfsmount. + * Actually it makes sense, especially if rootfs would contain a + * /reboot - static binary that would close all descriptors and + * call reboot(9). Then init(8) could umount root and exec /reboot. + */ + if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { + /* + * Special case for "unmounting" root ... + * we just try to remount it readonly. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + down_write(&sb->s_umount); + if (!(sb->s_flags & MS_RDONLY)) + retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); + up_write(&sb->s_umount); + return retval; + } + + namespace_lock(); + lock_mount_hash(); + event++; + + if (flags & MNT_DETACH) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, UMOUNT_PROPAGATE); + retval = 0; + } else { + shrink_submounts(mnt); + retval = -EBUSY; + if (!propagate_mount_busy(mnt, 2)) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); + retval = 0; + } + } + unlock_mount_hash(); + namespace_unlock(); + return retval; +} + +/* + * __detach_mounts - lazily unmount all mounts on the specified dentry + * + * During unlink, rmdir, and d_drop it is possible to loose the path + * to an existing mountpoint, and wind up leaking the mount. + * detach_mounts allows lazily unmounting those mounts instead of + * leaking them. + * + * The caller may hold dentry->d_inode->i_mutex. + */ +void __detach_mounts(struct dentry *dentry) +{ + struct mountpoint *mp; + struct mount *mnt; + + namespace_lock(); + mp = lookup_mountpoint(dentry); + if (IS_ERR_OR_NULL(mp)) + goto out_unlock; + + lock_mount_hash(); + while (!hlist_empty(&mp->m_list)) { + mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); + if (mnt->mnt.mnt_flags & MNT_UMOUNT) { + struct mount *p, *tmp; + list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { + hlist_add_head(&p->mnt_umount.s_list, &unmounted); + umount_mnt(p); + } + } + else umount_tree(mnt, UMOUNT_CONNECTED); + } + unlock_mount_hash(); + put_mountpoint(mp); +out_unlock: + namespace_unlock(); +} + +/* + * Is the caller allowed to modify his namespace? + */ +static inline bool may_mount(void) +{ + return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); +} + +/* + * Now umount can handle mount points as well as block devices. + * This is important for filesystems which use unnamed block devices. + * + * We now support a flag for forced unmount like the other 'big iron' + * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD + */ + +SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +{ + struct path path; + struct mount *mnt; + int retval; + int lookup_flags = 0; + + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + + if (!may_mount()) + return -EPERM; + + if (!(flags & UMOUNT_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); + if (retval) + goto out; + mnt = real_mount(path.mnt); + retval = -EINVAL; + if (path.dentry != path.mnt->mnt_root) + goto dput_and_out; + if (!check_mnt(mnt)) + goto dput_and_out; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + goto dput_and_out; + retval = -EPERM; + if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + goto dput_and_out; + + retval = do_umount(mnt, flags); +dput_and_out: + /* we mustn't call path_put() as that would clear mnt_expiry_mark */ + dput(path.dentry); + mntput_no_expire(mnt); +out: + return retval; +} + +#ifdef __ARCH_WANT_SYS_OLDUMOUNT + +/* + * The 2.0 compatible umount. No flags. + */ +SYSCALL_DEFINE1(oldumount, char __user *, name) +{ + return sys_umount(name, 0); +} + +#endif + +static bool is_mnt_ns_file(struct dentry *dentry) +{ + /* Is this a proxy for a mount namespace? */ + return dentry->d_op == &ns_dentry_operations && + dentry->d_fsdata == &mntns_operations; +} + +struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +{ + return container_of(ns, struct mnt_namespace, ns); +} + +static bool mnt_ns_loop(struct dentry *dentry) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? + */ + struct mnt_namespace *mnt_ns; + if (!is_mnt_ns_file(dentry)) + return false; + + mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; +} + +struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, + int flag) +{ + struct mount *res, *p, *q, *r, *parent; + + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + return ERR_PTR(-EINVAL); + + if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) + return ERR_PTR(-EINVAL); + + res = q = clone_mnt(mnt, dentry, flag); + if (IS_ERR(q)) + return q; + + q->mnt_mountpoint = mnt->mnt_mountpoint; + + p = mnt; + list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { + struct mount *s; + if (!is_subdir(r->mnt_mountpoint, dentry)) + continue; + + for (s = r; s; s = next_mnt(s, r)) { + struct mount *t = NULL; + if (!(flag & CL_COPY_UNBINDABLE) && + IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); + continue; + } + if (!(flag & CL_COPY_MNT_NS_FILE) && + is_mnt_ns_file(s->mnt.mnt_root)) { + s = skip_mnt_tree(s); + continue; + } + while (p != s->mnt_parent) { + p = p->mnt_parent; + q = q->mnt_parent; + } + p = s; + parent = q; + q = clone_mnt(p, p->mnt.mnt_root, flag); + if (IS_ERR(q)) + goto out; + lock_mount_hash(); + list_add_tail(&q->mnt_list, &res->mnt_list); + mnt_set_mountpoint(parent, p->mnt_mp, q); + if (!list_empty(&parent->mnt_mounts)) { + t = list_last_entry(&parent->mnt_mounts, + struct mount, mnt_child); + if (t->mnt_mp != p->mnt_mp) + t = NULL; + } + attach_shadowed(q, parent, t); + unlock_mount_hash(); + } + } + return res; +out: + if (res) { + lock_mount_hash(); + umount_tree(res, UMOUNT_SYNC); + unlock_mount_hash(); + } + return q; +} + +/* Caller should check returned pointer for errors */ + +struct vfsmount *collect_mounts(struct path *path) +{ + struct mount *tree; + namespace_lock(); + if (!check_mnt(real_mount(path->mnt))) + tree = ERR_PTR(-EINVAL); + else + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); + namespace_unlock(); + if (IS_ERR(tree)) + return ERR_CAST(tree); + return &tree->mnt; +} + +void drop_collected_mounts(struct vfsmount *mnt) +{ + namespace_lock(); + lock_mount_hash(); + umount_tree(real_mount(mnt), UMOUNT_SYNC); + unlock_mount_hash(); + namespace_unlock(); +} + +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + +int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + struct vfsmount *root) +{ + struct mount *mnt; + int res = f(root, arg); + if (res) + return res; + list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { + res = f(&mnt->mnt, arg); + if (res) + return res; + } + return 0; +} + +static void cleanup_group_ids(struct mount *mnt, struct mount *end) +{ + struct mount *p; + + for (p = mnt; p != end; p = next_mnt(p, mnt)) { + if (p->mnt_group_id && !IS_MNT_SHARED(p)) + mnt_release_group_id(p); + } +} + +static int invent_group_ids(struct mount *mnt, bool recurse) +{ + struct mount *p; + + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { + if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + int err = mnt_alloc_group_id(p); + if (err) { + cleanup_group_ids(mnt, p); + return err; + } + } + } + + return 0; +} + +/* + * @source_mnt : mount tree to be attached + * @nd : place the mount tree @source_mnt is attached + * @parent_nd : if non-null, detach the source_mnt from its parent and + * store the parent mount and mountpoint dentry. + * (done when source_mnt is moved) + * + * NOTE: in the table below explains the semantics when a source mount + * of a given type is attached to a destination mount of a given type. + * --------------------------------------------------------------------------- + * | BIND MOUNT OPERATION | + * |************************************************************************** + * | source-->| shared | private | slave | unbindable | + * | dest | | | | | + * | | | | | | | + * | v | | | | | + * |************************************************************************** + * | shared | shared (++) | shared (+) | shared(+++)| invalid | + * | | | | | | + * |non-shared| shared (+) | private | slave (*) | invalid | + * *************************************************************************** + * A bind operation clones the source mount and mounts the clone on the + * destination mount. + * + * (++) the cloned mount is propagated to all the mounts in the propagation + * tree of the destination mount and the cloned mount is added to + * the peer group of the source mount. + * (+) the cloned mount is created under the destination mount and is marked + * as shared. The cloned mount is added to the peer group of the source + * mount. + * (+++) the mount is propagated to all the mounts in the propagation tree + * of the destination mount and the cloned mount is made slave + * of the same master as that of the source mount. The cloned mount + * is marked as 'shared and slave'. + * (*) the cloned mount is made a slave of the same master as that of the + * source mount. + * + * --------------------------------------------------------------------------- + * | MOVE MOUNT OPERATION | + * |************************************************************************** + * | source-->| shared | private | slave | unbindable | + * | dest | | | | | + * | | | | | | | + * | v | | | | | + * |************************************************************************** + * | shared | shared (+) | shared (+) | shared(+++) | invalid | + * | | | | | | + * |non-shared| shared (+*) | private | slave (*) | unbindable | + * *************************************************************************** + * + * (+) the mount is moved to the destination. And is then propagated to + * all the mounts in the propagation tree of the destination mount. + * (+*) the mount is moved to the destination. + * (+++) the mount is moved to the destination and is then propagated to + * all the mounts belonging to the destination mount's propagation tree. + * the mount is marked as 'shared and slave'. + * (*) the mount continues to be a slave at the new location. + * + * if the source mount is a tree, the operations explained above is + * applied to each mount in the tree. + * Must be called without spinlocks held, since this function can sleep + * in allocations. + */ +static int attach_recursive_mnt(struct mount *source_mnt, + struct mount *dest_mnt, + struct mountpoint *dest_mp, + struct path *parent_path) +{ + HLIST_HEAD(tree_list); + struct mount *child, *p; + struct hlist_node *n; + int err; + + if (IS_MNT_SHARED(dest_mnt)) { + err = invent_group_ids(source_mnt, true); + if (err) + goto out; + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); + if (err) + goto out_cleanup_ids; + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) + set_mnt_shared(p); + } else { + lock_mount_hash(); + } + if (parent_path) { + detach_mnt(source_mnt, parent_path); + attach_mnt(source_mnt, dest_mnt, dest_mp); + touch_mnt_namespace(source_mnt->mnt_ns); + } else { + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + commit_tree(source_mnt, NULL); + } + + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { + struct mount *q; + hlist_del_init(&child->mnt_hash); + q = __lookup_mnt_last(&child->mnt_parent->mnt, + child->mnt_mountpoint); + commit_tree(child, q); + } + unlock_mount_hash(); + + return 0; + + out_cleanup_ids: + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + umount_tree(child, UMOUNT_SYNC); + } + unlock_mount_hash(); + cleanup_group_ids(source_mnt, NULL); + out: + return err; +} + +static struct mountpoint *lock_mount(struct path *path) +{ + struct vfsmount *mnt; + struct dentry *dentry = path->dentry; +retry: + mutex_lock(&dentry->d_inode->i_mutex); + if (unlikely(cant_mount(dentry))) { + mutex_unlock(&dentry->d_inode->i_mutex); + return ERR_PTR(-ENOENT); + } + namespace_lock(); + mnt = lookup_mnt(path); + if (likely(!mnt)) { + struct mountpoint *mp = lookup_mountpoint(dentry); + if (!mp) + mp = new_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); + return mp; + } + return mp; + } + namespace_unlock(); + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); + path->mnt = mnt; + dentry = path->dentry = dget(mnt->mnt_root); + goto retry; +} + +static void unlock_mount(struct mountpoint *where) +{ + struct dentry *dentry = where->m_dentry; + put_mountpoint(where); + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); +} + +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +{ + if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) + return -EINVAL; + + if (d_is_dir(mp->m_dentry) != + d_is_dir(mnt->mnt.mnt_root)) + return -ENOTDIR; + + return attach_recursive_mnt(mnt, p, mp, NULL); +} + +/* + * Sanity check the flags to change_mnt_propagation. + */ + +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~(MS_REC | MS_SILENT); + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; +} + +/* + * recursively change the type of the mountpoint. + */ +static int do_change_type(struct path *path, int flag) +{ + struct mount *m; + struct mount *mnt = real_mount(path->mnt); + int recurse = flag & MS_REC; + int type; + int err = 0; + + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + + type = flags_to_propagation_type(flag); + if (!type) + return -EINVAL; + + namespace_lock(); + if (type == MS_SHARED) { + err = invent_group_ids(mnt, recurse); + if (err) + goto out_unlock; + } + + lock_mount_hash(); + for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) + change_mnt_propagation(m, type); + unlock_mount_hash(); + + out_unlock: + namespace_unlock(); + return err; +} + +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; +} + +/* + * do loopback mount. + */ +static int do_loopback(struct path *path, const char *old_name, + int recurse) +{ + struct path old_path; + struct mount *mnt = NULL, *old, *parent; + struct mountpoint *mp; + int err; + if (!old_name || !*old_name) + return -EINVAL; + err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); + if (err) + return err; + + err = -EINVAL; + if (mnt_ns_loop(old_path.dentry)) + goto out; + + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) + goto out; + + old = real_mount(old_path.mnt); + parent = real_mount(path->mnt); + + err = -EINVAL; + if (IS_MNT_UNBINDABLE(old)) + goto out2; + + if (!check_mnt(parent)) + goto out2; + + if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) + goto out2; + + if (!recurse && has_locked_children(old, old_path.dentry)) + goto out2; + + if (recurse) + mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); + else + mnt = clone_mnt(old, old_path.dentry, 0); + + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out2; + } + + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + + err = graft_tree(mnt, parent, mp); + if (err) { + lock_mount_hash(); + umount_tree(mnt, UMOUNT_SYNC); + unlock_mount_hash(); + } +out2: + unlock_mount(mp); +out: + path_put(&old_path); + return err; +} + +static int change_mount_flags(struct vfsmount *mnt, int ms_flags) +{ + int error = 0; + int readonly_request = 0; + + if (ms_flags & MS_RDONLY) + readonly_request = 1; + if (readonly_request == __mnt_is_readonly(mnt)) + return 0; + + if (readonly_request) + error = mnt_make_readonly(real_mount(mnt)); + else + __mnt_unmake_readonly(real_mount(mnt)); + return error; +} + +/* + * change filesystem flags. dir should be a physical root of filesystem. + * If you've mounted a non-root directory somewhere and want to do remount + * on it - tough luck. + */ +static int do_remount(struct path *path, int flags, int mnt_flags, + void *data) +{ + int err; + struct super_block *sb = path->mnt->mnt_sb; + struct mount *mnt = real_mount(path->mnt); + + if (!check_mnt(mnt)) + return -EINVAL; + + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + + /* Don't allow changing of locked mnt flags. + * + * No locks need to be held here while testing the various + * MNT_LOCK flags because those flags can never be cleared + * once they are set. + */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(mnt_flags & MNT_READONLY)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(mnt_flags & MNT_NODEV)) { + /* Was the nodev implicitly added in mount? */ + if ((mnt->mnt_ns->user_ns != &init_user_ns) && + !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { + mnt_flags |= MNT_NODEV; + } else { + return -EPERM; + } + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && + !(mnt_flags & MNT_NOSUID)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && + !(mnt_flags & MNT_NOEXEC)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { + return -EPERM; + } + + err = security_sb_remount(sb, data); + if (err) + return err; + + down_write(&sb->s_umount); + if (flags & MS_BIND) + err = change_mount_flags(path->mnt, flags); + else if (!capable(CAP_SYS_ADMIN)) + err = -EPERM; + else + err = do_remount_sb(sb, flags, data, 0); + if (!err) { + lock_mount_hash(); + mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; + mnt->mnt.mnt_flags = mnt_flags; + touch_mnt_namespace(mnt->mnt_ns); + unlock_mount_hash(); + } + up_write(&sb->s_umount); + return err; +} + +static inline int tree_contains_unbindable(struct mount *mnt) +{ + struct mount *p; + for (p = mnt; p; p = next_mnt(p, mnt)) { + if (IS_MNT_UNBINDABLE(p)) + return 1; + } + return 0; +} + +static int do_move_mount(struct path *path, const char *old_name) +{ + struct path old_path, parent_path; + struct mount *p; + struct mount *old; + struct mountpoint *mp; + int err; + if (!old_name || !*old_name) + return -EINVAL; + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + if (err) + return err; + + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) + goto out; + + old = real_mount(old_path.mnt); + p = real_mount(path->mnt); + + err = -EINVAL; + if (!check_mnt(p) || !check_mnt(old)) + goto out1; + + if (old->mnt.mnt_flags & MNT_LOCKED) + goto out1; + + err = -EINVAL; + if (old_path.dentry != old_path.mnt->mnt_root) + goto out1; + + if (!mnt_has_parent(old)) + goto out1; + + if (d_is_dir(path->dentry) != + d_is_dir(old_path.dentry)) + goto out1; + /* + * Don't move a mount residing in a shared parent. + */ + if (IS_MNT_SHARED(old->mnt_parent)) + goto out1; + /* + * Don't move a mount tree containing unbindable mounts to a destination + * mount which is shared. + */ + if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) + goto out1; + err = -ELOOP; + for (; mnt_has_parent(p); p = p->mnt_parent) + if (p == old) + goto out1; + + err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); + if (err) + goto out1; + + /* if the mount is moved, it should no longer be expire + * automatically */ + list_del_init(&old->mnt_expire); +out1: + unlock_mount(mp); +out: + if (!err) + path_put(&parent_path); + path_put(&old_path); + return err; +} + +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) +{ + int err; + const char *subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + err = -EINVAL; + if (!subtype[0]) + goto err; + } else + subtype = ""; + + mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); + err = -ENOMEM; + if (!mnt->mnt_sb->s_subtype) + goto err; + return mnt; + + err: + mntput(mnt); + return ERR_PTR(err); +} + +/* + * add a mount into a namespace's mount tree + */ +static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) +{ + struct mountpoint *mp; + struct mount *parent; + int err; + + mnt_flags &= ~MNT_INTERNAL_FLAGS; + + mp = lock_mount(path); + if (IS_ERR(mp)) + return PTR_ERR(mp); + + parent = real_mount(path->mnt); + err = -EINVAL; + if (unlikely(!check_mnt(parent))) { + /* that's acceptable only for automounts done in private ns */ + if (!(mnt_flags & MNT_SHRINKABLE)) + goto unlock; + /* ... and for those we'd better have mountpoint still alive */ + if (!parent->mnt_ns) + goto unlock; + } + + /* Refuse the same filesystem on the same mount point */ + err = -EBUSY; + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && + path->mnt->mnt_root == path->dentry) + goto unlock; + + err = -EINVAL; + if (d_is_symlink(newmnt->mnt.mnt_root)) + goto unlock; + + newmnt->mnt.mnt_flags = mnt_flags; + err = graft_tree(newmnt, parent, mp); + +unlock: + unlock_mount(mp); + return err; +} + +static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags); + +/* + * create a new mount for userspace and request it to be added into the + * namespace's tree + */ +static int do_new_mount(struct path *path, const char *fstype, int flags, + int mnt_flags, const char *name, void *data) +{ + struct file_system_type *type; + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct vfsmount *mnt; + int err; + + if (!fstype) + return -EINVAL; + + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + if (user_ns != &init_user_ns) { + if (!(type->fs_flags & FS_USERNS_MOUNT)) { + put_filesystem(type); + return -EPERM; + } + /* Only in special cases allow devices from mounts + * created outside the initial user namespace. + */ + if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { + flags |= MS_NODEV; + mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; + } + if (type->fs_flags & FS_USERNS_VISIBLE) { + if (!fs_fully_visible(type, &mnt_flags)) + return -EPERM; + } + } + + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + + put_filesystem(type); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + err = do_add_mount(real_mount(mnt), path, mnt_flags); + if (err) + mntput(mnt); + return err; +} + +int finish_automount(struct vfsmount *m, struct path *path) +{ + struct mount *mnt = real_mount(m); + int err; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(mnt) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == path->dentry) { + err = -ELOOP; + goto fail; + } + + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (!err) + return 0; +fail: + /* remove m from any expiration list it may be on */ + if (!list_empty(&mnt->mnt_expire)) { + namespace_lock(); + list_del_init(&mnt->mnt_expire); + namespace_unlock(); + } + mntput(m); + mntput(m); + return err; +} + +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. + */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + namespace_lock(); + + list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); + + namespace_unlock(); +} +EXPORT_SYMBOL(mnt_set_expiry); + +/* + * process a list of expirable mountpoints with the intent of discarding any + * mountpoints that aren't in use and haven't been touched since last we came + * here + */ +void mark_mounts_for_expiry(struct list_head *mounts) +{ + struct mount *mnt, *next; + LIST_HEAD(graveyard); + + if (list_empty(mounts)) + return; + + namespace_lock(); + lock_mount_hash(); + + /* extract from the expiration list every vfsmount that matches the + * following criteria: + * - only referenced by its parent vfsmount + * - still marked for expiry (marked on the last call here; marks are + * cleared by mntput()) + */ + list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { + if (!xchg(&mnt->mnt_expiry_mark, 1) || + propagate_mount_busy(mnt, 1)) + continue; + list_move(&mnt->mnt_expire, &graveyard); + } + while (!list_empty(&graveyard)) { + mnt = list_first_entry(&graveyard, struct mount, mnt_expire); + touch_mnt_namespace(mnt->mnt_ns); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); + } + unlock_mount_hash(); + namespace_unlock(); +} + +EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); + +/* + * Ripoff of 'select_parent()' + * + * search the list of submounts for a given mountpoint, and move any + * shrinkable submounts to the 'graveyard' list. + */ +static int select_submounts(struct mount *parent, struct list_head *graveyard) +{ + struct mount *this_parent = parent; + struct list_head *next; + int found = 0; + +repeat: + next = this_parent->mnt_mounts.next; +resume: + while (next != &this_parent->mnt_mounts) { + struct list_head *tmp = next; + struct mount *mnt = list_entry(tmp, struct mount, mnt_child); + + next = tmp->next; + if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) + continue; + /* + * Descend a level if the d_mounts list is non-empty. + */ + if (!list_empty(&mnt->mnt_mounts)) { + this_parent = mnt; + goto repeat; + } + + if (!propagate_mount_busy(mnt, 1)) { + list_move_tail(&mnt->mnt_expire, graveyard); + found++; + } + } + /* + * All done at this level ... ascend and resume the search + */ + if (this_parent != parent) { + next = this_parent->mnt_child.next; + this_parent = this_parent->mnt_parent; + goto resume; + } + return found; +} + +/* + * process a list of expirable mountpoints with the intent of discarding any + * submounts of a specific parent mountpoint + * + * mount_lock must be held for write + */ +static void shrink_submounts(struct mount *mnt) +{ + LIST_HEAD(graveyard); + struct mount *m; + + /* extract submounts of 'mountpoint' from the expiration list */ + while (select_submounts(mnt, &graveyard)) { + while (!list_empty(&graveyard)) { + m = list_first_entry(&graveyard, struct mount, + mnt_expire); + touch_mnt_namespace(m->mnt_ns); + umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); + } + } +} + +/* + * Some copy_from_user() implementations do not return the exact number of + * bytes remaining to copy on a fault. But copy_mount_options() requires that. + * Note that this function differs from copy_from_user() in that it will oops + * on bad values of `to', rather than returning a short copy. + */ +static long exact_copy_from_user(void *to, const void __user * from, + unsigned long n) +{ + char *t = to; + const char __user *f = from; + char c; + + if (!access_ok(VERIFY_READ, from, n)) + return n; + + while (n) { + if (__get_user(c, f)) { + memset(t, 0, n); + break; + } + *t++ = c; + f++; + n--; + } + return n; +} + +int copy_mount_options(const void __user * data, unsigned long *where) +{ + int i; + unsigned long page; + unsigned long size; + + *where = 0; + if (!data) + return 0; + + if (!(page = __get_free_page(GFP_KERNEL))) + return -ENOMEM; + + /* We only care that *some* data at the address the user + * gave us is valid. Just in case, we'll zero + * the remainder of the page. + */ + /* copy_from_user cannot cross TASK_SIZE ! */ + size = TASK_SIZE - (unsigned long)data; + if (size > PAGE_SIZE) + size = PAGE_SIZE; + + i = size - exact_copy_from_user((void *)page, data, size); + if (!i) { + free_page(page); + return -EFAULT; + } + if (i != PAGE_SIZE) + memset((char *)page + i, 0, PAGE_SIZE - i); + *where = page; + return 0; +} + +char *copy_mount_string(const void __user *data) +{ + return data ? strndup_user(data, PAGE_SIZE) : NULL; +} + +/* + * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to + * be given to the mount() call (ie: read-only, no-dev, no-suid etc). + * + * data is a (void *) that can point to any structure up to + * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent + * information (or be NULL). + * + * Pre-0.97 versions of mount() didn't have a flags word. + * When the flags word was introduced its top half was required + * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. + * Therefore, if this magic number is present, it carries no information + * and must be discarded. + */ +long do_mount(const char *dev_name, const char __user *dir_name, + const char *type_page, unsigned long flags, void *data_page) +{ + struct path path; + int retval = 0; + int mnt_flags = 0; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + /* Basic sanity checks */ + if (data_page) + ((char *)data_page)[PAGE_SIZE - 1] = 0; + + /* ... and get the mountpoint */ + retval = user_path(dir_name, &path); + if (retval) + return retval; + + retval = security_sb_mount(dev_name, &path, + type_page, flags, data_page); + if (!retval && !may_mount()) + retval = -EPERM; + if (retval) + goto dput_out; + + /* Default to relatime unless overriden */ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; + + /* Separate the per-mountpoint flags */ + if (flags & MS_NOSUID) + mnt_flags |= MNT_NOSUID; + if (flags & MS_NODEV) + mnt_flags |= MNT_NODEV; + if (flags & MS_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (flags & MS_NOATIME) + mnt_flags |= MNT_NOATIME; + if (flags & MS_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; + + /* The default atime for remount is preservation */ + if ((flags & MS_REMOUNT) && + ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | + MS_STRICTATIME)) == 0)) { + mnt_flags &= ~MNT_ATIME_MASK; + mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; + } + + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); + + if (flags & MS_REMOUNT) + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, + data_page); + else if (flags & MS_BIND) + retval = do_loopback(&path, dev_name, flags & MS_REC); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + retval = do_change_type(&path, flags); + else if (flags & MS_MOVE) + retval = do_move_mount(&path, dev_name); + else + retval = do_new_mount(&path, type_page, flags, mnt_flags, + dev_name, data_page); +dput_out: + path_put(&path); + return retval; +} + +static void free_mnt_ns(struct mnt_namespace *ns) +{ + ns_free_inum(&ns->ns); + put_user_ns(ns->user_ns); + kfree(ns); +} + +/* + * Assign a sequence number so we can detect when we attempt to bind + * mount a reference to an older mount namespace into the current + * mount namespace, preventing reference counting loops. A 64bit + * number incrementing at 10Ghz will take 12,427 years to wrap which + * is effectively never, so we can ignore the possibility. + */ +static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); + +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) +{ + struct mnt_namespace *new_ns; + int ret; + + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + new_ns->ns.ops = &mntns_operations; + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + atomic_set(&new_ns->count, 1); + new_ns->root = NULL; + INIT_LIST_HEAD(&new_ns->list); + init_waitqueue_head(&new_ns->poll); + new_ns->event = 0; + new_ns->user_ns = get_user_ns(user_ns); + return new_ns; +} + +struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, + struct user_namespace *user_ns, struct fs_struct *new_fs) +{ + struct mnt_namespace *new_ns; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct mount *p, *q; + struct mount *old; + struct mount *new; + int copy_flags; + + BUG_ON(!ns); + + if (likely(!(flags & CLONE_NEWNS))) { + get_mnt_ns(ns); + return ns; + } + + old = ns->root; + + new_ns = alloc_mnt_ns(user_ns); + if (IS_ERR(new_ns)) + return new_ns; + + namespace_lock(); + /* First pass: copy the tree topology */ + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; + if (user_ns != ns->user_ns) + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; + new = copy_tree(old, old->mnt.mnt_root, copy_flags); + if (IS_ERR(new)) { + namespace_unlock(); + free_mnt_ns(new_ns); + return ERR_CAST(new); + } + new_ns->root = new; + list_add_tail(&new_ns->list, &new->mnt_list); + + /* + * Second pass: switch the tsk->fs->* elements and mark new vfsmounts + * as belonging to new namespace. We have already acquired a private + * fs_struct, so tsk->fs->lock is not needed. + */ + p = old; + q = new; + while (p) { + q->mnt_ns = new_ns; + if (new_fs) { + if (&p->mnt == new_fs->root.mnt) { + new_fs->root.mnt = mntget(&q->mnt); + rootmnt = &p->mnt; + } + if (&p->mnt == new_fs->pwd.mnt) { + new_fs->pwd.mnt = mntget(&q->mnt); + pwdmnt = &p->mnt; + } + } + p = next_mnt(p, old); + q = next_mnt(q, new); + if (!q) + break; + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(p, old); + } + namespace_unlock(); + + if (rootmnt) + mntput(rootmnt); + if (pwdmnt) + mntput(pwdmnt); + + return new_ns; +} + +/** + * create_mnt_ns - creates a private namespace and adds a root filesystem + * @mnt: pointer to the new root filesystem mountpoint + */ +static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) +{ + struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); + if (!IS_ERR(new_ns)) { + struct mount *mnt = real_mount(m); + mnt->mnt_ns = new_ns; + new_ns->root = mnt; + list_add(&mnt->mnt_list, &new_ns->list); + } else { + mntput(m); + } + return new_ns; +} + +struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +{ + struct mnt_namespace *ns; + struct super_block *s; + struct path path; + int err; + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + return ERR_CAST(ns); + + err = vfs_path_lookup(mnt->mnt_root, mnt, + name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + + put_mnt_ns(ns); + + if (err) + return ERR_PTR(err); + + /* trade a vfsmount reference for active sb one */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mntput(path.mnt); + /* lock the sucker */ + down_write(&s->s_umount); + /* ... and return the root of (sub)tree on it */ + return path.dentry; +} +EXPORT_SYMBOL(mount_subtree); + +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) +{ + int ret; + char *kernel_type; + char *kernel_dev; + unsigned long data_page; + + kernel_type = copy_mount_string(type); + ret = PTR_ERR(kernel_type); + if (IS_ERR(kernel_type)) + goto out_type; + + kernel_dev = copy_mount_string(dev_name); + ret = PTR_ERR(kernel_dev); + if (IS_ERR(kernel_dev)) + goto out_dev; + + ret = copy_mount_options(data, &data_page); + if (ret < 0) + goto out_data; + + ret = do_mount(kernel_dev, dir_name, kernel_type, flags, + (void *) data_page); + + free_page(data_page); +out_data: + kfree(kernel_dev); +out_dev: + kfree(kernel_type); +out_type: + return ret; +} + +/* + * Return true if path is reachable from root + * + * namespace_sem or mount_lock is held + */ +bool is_path_reachable(struct mount *mnt, struct dentry *dentry, + const struct path *root) +{ + while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +int path_is_under(struct path *path1, struct path *path2) +{ + int res; + read_seqlock_excl(&mount_lock); + res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); + read_sequnlock_excl(&mount_lock); + return res; +} +EXPORT_SYMBOL(path_is_under); + +/* + * pivot_root Semantics: + * Moves the root file system of the current process to the directory put_old, + * makes new_root as the new root file system of the current process, and sets + * root/cwd of all processes which had them on the current root to new_root. + * + * Restrictions: + * The new_root and put_old must be directories, and must not be on the + * same file system as the current process root. The put_old must be + * underneath new_root, i.e. adding a non-zero number of /.. to the string + * pointed to by put_old must yield the same directory as new_root. No other + * file system may be mounted on put_old. After all, new_root is a mountpoint. + * + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. + * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * in this situation. + * + * Notes: + * - we don't move root/cwd if they are not at the root (reason: if something + * cared enough to change them, it's probably wrong to force them elsewhere) + * - it's okay to pick a root that isn't the root of a file system, e.g. + * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, + * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root + * first. + */ +SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, + const char __user *, put_old) +{ + struct path new, old, parent_path, root_parent, root; + struct mount *new_mnt, *root_mnt, *old_mnt; + struct mountpoint *old_mp, *root_mp; + int error; + + if (!may_mount()) + return -EPERM; + + error = user_path_dir(new_root, &new); + if (error) + goto out0; + + error = user_path_dir(put_old, &old); + if (error) + goto out1; + + error = security_sb_pivotroot(&old, &new); + if (error) + goto out2; + + get_fs_root(current->fs, &root); + old_mp = lock_mount(&old); + error = PTR_ERR(old_mp); + if (IS_ERR(old_mp)) + goto out3; + + error = -EINVAL; + new_mnt = real_mount(new.mnt); + root_mnt = real_mount(root.mnt); + old_mnt = real_mount(old.mnt); + if (IS_MNT_SHARED(old_mnt) || + IS_MNT_SHARED(new_mnt->mnt_parent) || + IS_MNT_SHARED(root_mnt->mnt_parent)) + goto out4; + if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) + goto out4; + if (new_mnt->mnt.mnt_flags & MNT_LOCKED) + goto out4; + error = -ENOENT; + if (d_unlinked(new.dentry)) + goto out4; + error = -EBUSY; + if (new_mnt == root_mnt || old_mnt == root_mnt) + goto out4; /* loop, on the same file system */ + error = -EINVAL; + if (root.mnt->mnt_root != root.dentry) + goto out4; /* not a mountpoint */ + if (!mnt_has_parent(root_mnt)) + goto out4; /* not attached */ + root_mp = root_mnt->mnt_mp; + if (new.mnt->mnt_root != new.dentry) + goto out4; /* not a mountpoint */ + if (!mnt_has_parent(new_mnt)) + goto out4; /* not attached */ + /* make sure we can reach put_old from new_root */ + if (!is_path_reachable(old_mnt, old.dentry, &new)) + goto out4; + /* make certain new is below the root */ + if (!is_path_reachable(new_mnt, new.dentry, &root)) + goto out4; + root_mp->m_count++; /* pin it so it won't go away */ + lock_mount_hash(); + detach_mnt(new_mnt, &parent_path); + detach_mnt(root_mnt, &root_parent); + if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { + new_mnt->mnt.mnt_flags |= MNT_LOCKED; + root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; + } + /* mount old root on put_old */ + attach_mnt(root_mnt, old_mnt, old_mp); + /* mount new_root on / */ + attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); + touch_mnt_namespace(current->nsproxy->mnt_ns); + /* A moved mount should not expire automatically */ + list_del_init(&new_mnt->mnt_expire); + unlock_mount_hash(); + chroot_fs_refs(&root, &new); + put_mountpoint(root_mp); + error = 0; +out4: + unlock_mount(old_mp); + if (!error) { + path_put(&root_parent); + path_put(&parent_path); + } +out3: + path_put(&root); +out2: + path_put(&old); +out1: + path_put(&new); +out0: + return error; +} + +static void __init init_mount_tree(void) +{ + struct vfsmount *mnt; + struct mnt_namespace *ns; + struct path root; + struct file_system_type *type; + + type = get_fs_type("rootfs"); + if (!type) + panic("Can't find rootfs type"); + mnt = vfs_kern_mount(type, 0, "rootfs", NULL); + put_filesystem(type); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + panic("Can't allocate initial namespace"); + + init_task.nsproxy->mnt_ns = ns; + get_mnt_ns(ns); + + root.mnt = mnt; + root.dentry = mnt->mnt_root; + mnt->mnt_flags |= MNT_LOCKED; + + set_fs_pwd(current->fs, &root); + set_fs_root(current->fs, &root); +} + +void __init mnt_init(void) +{ + unsigned u; + int err; + + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + mount_hashtable = alloc_large_system_hash("Mount-cache", + sizeof(struct hlist_head), + mhash_entries, 19, + 0, + &m_hash_shift, &m_hash_mask, 0, 0); + mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", + sizeof(struct hlist_head), + mphash_entries, 19, + 0, + &mp_hash_shift, &mp_hash_mask, 0, 0); + + if (!mount_hashtable || !mountpoint_hashtable) + panic("Failed to allocate mount hash table\n"); + + for (u = 0; u <= m_hash_mask; u++) + INIT_HLIST_HEAD(&mount_hashtable[u]); + for (u = 0; u <= mp_hash_mask; u++) + INIT_HLIST_HEAD(&mountpoint_hashtable[u]); + + kernfs_init(); + + err = sysfs_init(); + if (err) + printk(KERN_WARNING "%s: sysfs_init error: %d\n", + __func__, err); + fs_kobj = kobject_create_and_add("fs", NULL); + if (!fs_kobj) + printk(KERN_WARNING "%s: kobj create error\n", __func__); + init_rootfs(); + init_mount_tree(); +} + +void put_mnt_ns(struct mnt_namespace *ns) +{ + if (!atomic_dec_and_test(&ns->count)) + return; + drop_collected_mounts(&ns->root->mnt); + free_mnt_ns(ns); +} + +struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +{ + struct vfsmount *mnt; + mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + if (!IS_ERR(mnt)) { + /* + * it is a longterm mount, don't release mnt until + * we unmount before file sys is unregistered + */ + real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; + } + return mnt; +} +EXPORT_SYMBOL_GPL(kern_mount_data); + +void kern_unmount(struct vfsmount *mnt) +{ + /* release long term mount so mount point can be released */ + if (!IS_ERR_OR_NULL(mnt)) { + real_mount(mnt)->mnt_ns = NULL; + synchronize_rcu(); /* yecchhh... */ + mntput(mnt); + } +} +EXPORT_SYMBOL(kern_unmount); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(real_mount(mnt)); +} + +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + +static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + int new_flags = *new_mnt_flags; + struct mount *mnt; + bool visible = false; + + if (unlikely(!ns)) + return false; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + struct mount *child; + if (mnt->mnt.mnt_sb->s_type != type) + continue; + + /* This mount is not fully visible if it's root directory + * is not the root directory of the filesystem. + */ + if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + continue; + + /* Verify the mount flags are equal to or more permissive + * than the proposed new mount. + */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(new_flags & MNT_READONLY)) + continue; + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(new_flags & MNT_NODEV)) + continue; + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) + continue; + + /* This mount is not fully visible if there are any + * locked child mounts that cover anything except for + * empty directories. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + /* Only worry about locked mounts */ + if (!(mnt->mnt.mnt_flags & MNT_LOCKED)) + continue; + /* Is the directory permanetly empty? */ + if (!is_empty_dir_inode(inode)) + goto next; + } + /* Preserve the locked attributes */ + *new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \ + MNT_LOCK_NODEV | \ + MNT_LOCK_ATIME); + visible = true; + goto found; + next: ; + } +found: + up_read(&namespace_sem); + return visible; +} + +static struct ns_common *mntns_get(struct task_struct *task) +{ + struct ns_common *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = &nsproxy->mnt_ns->ns; + get_mnt_ns(to_mnt_ns(ns)); + } + task_unlock(task); + + return ns; +} + +static void mntns_put(struct ns_common *ns) +{ + put_mnt_ns(to_mnt_ns(ns)); +} + +static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct fs_struct *fs = current->fs; + struct mnt_namespace *mnt_ns = to_mnt_ns(ns); + struct path root; + + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + if (fs->users != 1) + return -EINVAL; + + get_mnt_ns(mnt_ns); + put_mnt_ns(nsproxy->mnt_ns); + nsproxy->mnt_ns = mnt_ns; + + /* Find the root */ + root.mnt = &mnt_ns->root->mnt; + root.dentry = mnt_ns->root->mnt.mnt_root; + path_get(&root); + while(d_mountpoint(root.dentry) && follow_down_one(&root)) + ; + + /* Update the pwd and root */ + set_fs_pwd(fs, &root); + set_fs_root(fs, &root); + + path_put(&root); + return 0; +} + +const struct proc_ns_operations mntns_operations = { + .name = "mnt", + .type = CLONE_NEWNS, + .get = mntns_get, + .put = mntns_put, + .install = mntns_install, +}; diff --git a/kernel/fs/ncpfs/Kconfig b/kernel/fs/ncpfs/Kconfig new file mode 100644 index 000000000..c931cf22a --- /dev/null +++ b/kernel/fs/ncpfs/Kconfig @@ -0,0 +1,108 @@ +# +# NCP Filesystem configuration +# +config NCP_FS + tristate "NCP file system support (to mount NetWare volumes)" + depends on IPX!=n || INET + help + NCP (NetWare Core Protocol) is a protocol that runs over IPX and is + used by Novell NetWare clients to talk to file servers. It is to + IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you + to mount NetWare file server volumes and to access them just like + any other Unix directory. For details, please read the file + in the kernel source and + the IPX-HOWTO from . + + You do not have to say Y here if you want your Linux box to act as a + file *server* for Novell NetWare clients. + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at . + + To compile this as a module, choose M here: the module will be called + ncpfs. Say N unless you are connected to a Novell network. + +config NCPFS_PACKET_SIGNING + bool "Packet signatures" + depends on NCP_FS + help + NCP allows packets to be signed for stronger security. If you want + security, say Y. Normal users can leave it off. To be able to use + packet signing you must use ncpfs > 2.0.12. + +config NCPFS_IOCTL_LOCKING + bool "Proprietary file locking" + depends on NCP_FS + help + Allows locking of records on remote volumes. Say N unless you have + special applications which are able to utilize this locking scheme. + +config NCPFS_STRONG + bool "Clear remove/delete inhibit when needed" + depends on NCP_FS + help + Allows manipulation of files flagged as Delete or Rename Inhibit. + To use this feature you must mount volumes with the ncpmount + parameter "-s" (ncpfs-2.0.12 and newer). Say Y unless you are not + mounting volumes with -f 444. + +config NCPFS_NFS_NS + bool "Use NFS namespace if available" + depends on NCP_FS + help + Allows you to utilize NFS namespace on NetWare servers. It brings + you case sensitive filenames. Say Y. You can disable it at + mount-time with the `-N nfs' parameter of ncpmount. + +config NCPFS_OS2_NS + bool "Use LONG (OS/2) namespace if available" + depends on NCP_FS + help + Allows you to utilize OS2/LONG namespace on NetWare servers. + Filenames in this namespace are limited to 255 characters, they are + case insensitive, and case in names is preserved. Say Y. You can + disable it at mount time with the -N os2 parameter of ncpmount. + +config NCPFS_SMALLDOS + bool "Lowercase DOS filenames" + depends on NCP_FS + ---help--- + If you say Y here, every filename on a NetWare server volume using + the OS2/LONG namespace and created under DOS or on a volume using + DOS namespace will be converted to lowercase characters. + Saying N here will give you these filenames in uppercase. + + This is only a cosmetic option since the OS2/LONG namespace is case + insensitive. The only major reason for this option is backward + compatibility when moving from DOS to OS2/LONG namespace support. + Long filenames (created by Win95) will not be affected. + + This option does not solve the problem that filenames appear + differently under Linux and under Windows, since Windows does an + additional conversions on the client side. You can achieve similar + effects by saying Y to "Allow using of Native Language Support" + below. + +config NCPFS_NLS + bool "Use Native Language Support" + depends on NCP_FS + select NLS + help + Allows you to use codepages and I/O charsets for file name + translation between the server file system and input/output. This + may be useful, if you want to access the server with other operating + systems, e.g. Windows 95. See also NLS for more Information. + + To select codepages and I/O charsets use ncpfs-2.2.0.13 or newer. + +config NCPFS_EXTRAS + bool "Enable symbolic links and execute flags" + depends on NCP_FS + help + This enables the use of symbolic links and an execute permission + bit on NCPFS. The file server need not have long name space or NFS + name space loaded for these to work. + + To use the new attributes, it is recommended to use the flags + '-f 600 -d 755' on the ncpmount command line. + diff --git a/kernel/fs/ncpfs/Makefile b/kernel/fs/ncpfs/Makefile new file mode 100644 index 000000000..c66af563f --- /dev/null +++ b/kernel/fs/ncpfs/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for the linux ncp filesystem routines. +# + +obj-$(CONFIG_NCP_FS) += ncpfs.o + +ncpfs-y := dir.o file.o inode.o ioctl.o mmap.o ncplib_kernel.o sock.o \ + ncpsign_kernel.o getopt.o + +ncpfs-$(CONFIG_NCPFS_EXTRAS) += symlink.o +ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o + +# If you want debugging output, please uncomment the following line +# ccflags-y := -DDEBUG_NCP=1 + +CFLAGS_ncplib_kernel.o := -finline-functions diff --git a/kernel/fs/ncpfs/dir.c b/kernel/fs/ncpfs/dir.c new file mode 100644 index 000000000..80021c709 --- /dev/null +++ b/kernel/fs/ncpfs/dir.c @@ -0,0 +1,1237 @@ +/* + * dir.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * Modified 1999 Wolfram Pienkoss for directory caching + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +static void ncp_read_volume_list(struct file *, struct dir_context *, + struct ncp_cache_control *); +static void ncp_do_readdir(struct file *, struct dir_context *, + struct ncp_cache_control *); + +static int ncp_readdir(struct file *, struct dir_context *); + +static int ncp_create(struct inode *, struct dentry *, umode_t, bool); +static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); +static int ncp_unlink(struct inode *, struct dentry *); +static int ncp_mkdir(struct inode *, struct dentry *, umode_t); +static int ncp_rmdir(struct inode *, struct dentry *); +static int ncp_rename(struct inode *, struct dentry *, + struct inode *, struct dentry *); +static int ncp_mknod(struct inode * dir, struct dentry *dentry, + umode_t mode, dev_t rdev); +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern int ncp_symlink(struct inode *, struct dentry *, const char *); +#else +#define ncp_symlink NULL +#endif + +const struct file_operations ncp_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = ncp_readdir, + .unlocked_ioctl = ncp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ncp_compat_ioctl, +#endif +}; + +const struct inode_operations ncp_dir_inode_operations = +{ + .create = ncp_create, + .lookup = ncp_lookup, + .unlink = ncp_unlink, + .symlink = ncp_symlink, + .mkdir = ncp_mkdir, + .rmdir = ncp_rmdir, + .mknod = ncp_mknod, + .rename = ncp_rename, + .setattr = ncp_notify_change, +}; + +/* + * Dentry operations routines + */ +static int ncp_lookup_validate(struct dentry *, unsigned int); +static int ncp_hash_dentry(const struct dentry *, struct qstr *); +static int ncp_compare_dentry(const struct dentry *, const struct dentry *, + unsigned int, const char *, const struct qstr *); +static int ncp_delete_dentry(const struct dentry *); +static void ncp_d_prune(struct dentry *dentry); + +const struct dentry_operations ncp_dentry_operations = +{ + .d_revalidate = ncp_lookup_validate, + .d_hash = ncp_hash_dentry, + .d_compare = ncp_compare_dentry, + .d_delete = ncp_delete_dentry, + .d_prune = ncp_d_prune, +}; + +#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) + +static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) +{ +#ifdef CONFIG_NCPFS_SMALLDOS + int ns = ncp_namespace(i); + + if ((ns == NW_NS_DOS) +#ifdef CONFIG_NCPFS_OS2_NS + || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS)) +#endif /* CONFIG_NCPFS_OS2_NS */ + ) + return 0; +#endif /* CONFIG_NCPFS_SMALLDOS */ + return 1; +} + +#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) + +static inline int ncp_case_sensitive(const struct inode *i) +{ +#ifdef CONFIG_NCPFS_NFS_NS + return ncp_namespace(i) == NW_NS_NFS; +#else + return 0; +#endif /* CONFIG_NCPFS_NFS_NS */ +} + +/* + * Note: leave the hash unchanged if the directory + * is case-sensitive. + * + * Accessing the parent inode can be racy under RCU pathwalking. + * Use ACCESS_ONCE() to make sure we use _one_ particular inode, + * the callers will handle races. + */ +static int +ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) +{ + struct inode *inode = d_inode_rcu(dentry); + + if (!inode) + return 0; + + if (!ncp_case_sensitive(inode)) { + struct super_block *sb = dentry->d_sb; + struct nls_table *t; + unsigned long hash; + int i; + + t = NCP_IO_TABLE(sb); + hash = init_name_hash(); + for (i=0; ilen ; i++) + hash = partial_name_hash(ncp_tolower(t, this->name[i]), + hash); + this->hash = end_name_hash(hash); + } + return 0; +} + +/* + * Accessing the parent inode can be racy under RCU pathwalking. + * Use ACCESS_ONCE() to make sure we use _one_ particular inode, + * the callers will handle races. + */ +static int +ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + struct inode *pinode; + + if (len != name->len) + return 1; + + pinode = d_inode_rcu(parent); + if (!pinode) + return 1; + + if (ncp_case_sensitive(pinode)) + return strncmp(str, name->name, len); + + return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len); +} + +/* + * This is the callback from dput() when d_count is going to 0. + * We use this to unhash dentries with bad inodes. + * Closing files can be safely postponed until iput() - it's done there anyway. + */ +static int +ncp_delete_dentry(const struct dentry * dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode) { + if (is_bad_inode(inode)) + return 1; + } else + { + /* N.B. Unhash negative dentries? */ + } + return 0; +} + +static inline int +ncp_single_volume(struct ncp_server *server) +{ + return (server->m.mounted_vol[0] != '\0'); +} + +static inline int ncp_is_server_root(struct inode *inode) +{ + return !ncp_single_volume(NCP_SERVER(inode)) && + is_root_inode(inode); +} + + +/* + * This is the callback when the dcache has a lookup hit. + */ + + +#ifdef CONFIG_NCPFS_STRONG +/* try to delete a readonly file (NW R bit set) */ + +static int +ncp_force_unlink(struct inode *dir, struct dentry* dentry) +{ + int res=0x9c,res2; + struct nw_modify_dos_info info; + __le32 old_nwattr; + struct inode *inode; + + memset(&info, 0, sizeof(info)); + + /* remove the Read-Only flag on the NW server */ + inode = d_inode(dentry); + + old_nwattr = NCP_FINFO(inode)->nwattr; + info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info); + if (res2) + goto leave_me; + + /* now try again the delete operation */ + res = ncp_del_file_or_subdir2(NCP_SERVER(dir), dentry); + + if (res) /* delete failed, set R bit again */ + { + info.attributes = old_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info); + if (res2) + goto leave_me; + } +leave_me: + return(res); +} +#endif /* CONFIG_NCPFS_STRONG */ + +#ifdef CONFIG_NCPFS_STRONG +static int +ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_name, + struct inode *new_dir, struct dentry* new_dentry, char *_new_name) +{ + struct nw_modify_dos_info info; + int res=0x90,res2; + struct inode *old_inode = d_inode(old_dentry); + __le32 old_nwattr = NCP_FINFO(old_inode)->nwattr; + __le32 new_nwattr = 0; /* shut compiler warning */ + int old_nwattr_changed = 0; + int new_nwattr_changed = 0; + + memset(&info, 0, sizeof(info)); + + /* remove the Read-Only flag on the NW server */ + + info.attributes = old_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); + if (!res2) + old_nwattr_changed = 1; + if (new_dentry && d_really_is_positive(new_dentry)) { + new_nwattr = NCP_FINFO(d_inode(new_dentry))->nwattr; + info.attributes = new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); + if (!res2) + new_nwattr_changed = 1; + } + /* now try again the rename operation */ + /* but only if something really happened */ + if (new_nwattr_changed || old_nwattr_changed) { + res = ncp_ren_or_mov_file_or_subdir(NCP_SERVER(old_dir), + old_dir, _old_name, + new_dir, _new_name); + } + if (res) + goto leave_me; + /* file was successfully renamed, so: + do not set attributes on old file - it no longer exists + copy attributes from old file to new */ + new_nwattr_changed = old_nwattr_changed; + new_nwattr = old_nwattr; + old_nwattr_changed = 0; + +leave_me:; + if (old_nwattr_changed) { + info.attributes = old_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); + /* ignore errors */ + } + if (new_nwattr_changed) { + info.attributes = new_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); + /* ignore errors */ + } + return(res); +} +#endif /* CONFIG_NCPFS_STRONG */ + + +static int +ncp_lookup_validate(struct dentry *dentry, unsigned int flags) +{ + struct ncp_server *server; + struct dentry *parent; + struct inode *dir; + struct ncp_entry_info finfo; + int res, val = 0, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + if (dentry == dentry->d_sb->s_root) + return 1; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + dir = d_inode(parent); + + if (d_really_is_negative(dentry)) + goto finished; + + server = NCP_SERVER(dir); + + /* + * Inspired by smbfs: + * The default validation is based on dentry age: + * We set the max age at mount time. (But each + * successful server lookup renews the timestamp.) + */ + val = NCP_TEST_AGE(server, dentry); + if (val) + goto finished; + + ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n", + dentry, NCP_GET_AGE(dentry)); + + len = sizeof(__name); + if (ncp_is_server_root(dir)) { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, 1); + if (!res) { + res = ncp_lookup_volume(server, __name, &(finfo.i)); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + } + } else { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (!res) + res = ncp_obtain_info(server, dir, __name, &(finfo.i)); + } + finfo.volume = finfo.i.volNumber; + ncp_dbg(2, "looked for %pd/%s, res=%d\n", + dentry->d_parent, __name, res); + /* + * If we didn't find it, or if it has a different dirEntNum to + * what we remember, it's not valid any more. + */ + if (!res) { + struct inode *inode = d_inode(dentry); + + mutex_lock(&inode->i_mutex); + if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { + ncp_new_dentry(dentry); + val=1; + } else + ncp_dbg(2, "found, but dirEntNum changed\n"); + + ncp_update_inode2(inode, &finfo); + mutex_unlock(&inode->i_mutex); + } + +finished: + ncp_dbg(2, "result=%d\n", val); + dput(parent); + return val; +} + +static time_t ncp_obtain_mtime(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct ncp_server *server = NCP_SERVER(inode); + struct nw_info_struct i; + + if (!ncp_conn_valid(server) || ncp_is_server_root(inode)) + return 0; + + if (ncp_obtain_info(server, inode, NULL, &i)) + return 0; + + return ncp_date_dos2unix(i.modifyTime, i.modifyDate); +} + +static inline void +ncp_invalidate_dircache_entries(struct dentry *parent) +{ + struct ncp_server *server = NCP_SERVER(d_inode(parent)); + struct dentry *dentry; + + spin_lock(&parent->d_lock); + list_for_each_entry(dentry, &parent->d_subdirs, d_child) { + dentry->d_fsdata = NULL; + ncp_age_dentry(server, dentry); + } + spin_unlock(&parent->d_lock); +} + +static int ncp_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + struct page *page = NULL; + struct ncp_server *server = NCP_SERVER(inode); + union ncp_dir_cache *cache = NULL; + struct ncp_cache_control ctl; + int result, mtime_valid = 0; + time_t mtime = 0; + + ctl.page = NULL; + ctl.cache = NULL; + + ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos); + + result = -EIO; + /* Do not generate '.' and '..' when server is dead. */ + if (!ncp_conn_valid(server)) + goto out; + + result = 0; + if (!dir_emit_dots(file, ctx)) + goto out; + + page = grab_cache_page(&inode->i_data, 0); + if (!page) + goto read_really; + + ctl.cache = cache = kmap(page); + ctl.head = cache->head; + + if (!PageUptodate(page) || !ctl.head.eof) + goto init_cache; + + if (ctx->pos == 2) { + if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) + goto init_cache; + + mtime = ncp_obtain_mtime(dentry); + mtime_valid = 1; + if ((!mtime) || (mtime != ctl.head.mtime)) + goto init_cache; + } + + if (ctx->pos > ctl.head.end) + goto finished; + + ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2); + ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; + ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; + + for (;;) { + if (ctl.ofs != 0) { + ctl.page = find_lock_page(&inode->i_data, ctl.ofs); + if (!ctl.page) + goto invalid_cache; + ctl.cache = kmap(ctl.page); + if (!PageUptodate(ctl.page)) + goto invalid_cache; + } + while (ctl.idx < NCP_DIRCACHE_SIZE) { + struct dentry *dent; + bool over; + + spin_lock(&dentry->d_lock); + if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) { + spin_unlock(&dentry->d_lock); + goto invalid_cache; + } + dent = ctl.cache->dentry[ctl.idx]; + if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) { + spin_unlock(&dentry->d_lock); + goto invalid_cache; + } + spin_unlock(&dentry->d_lock); + if (d_really_is_negative(dent)) { + dput(dent); + goto invalid_cache; + } + over = !dir_emit(ctx, dent->d_name.name, + dent->d_name.len, + d_inode(dent)->i_ino, DT_UNKNOWN); + dput(dent); + if (over) + goto finished; + ctx->pos += 1; + ctl.idx += 1; + if (ctx->pos > ctl.head.end) + goto finished; + } + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + ctl.page = NULL; + } + ctl.idx = 0; + ctl.ofs += 1; + } +invalid_cache: + if (ctl.page) { + kunmap(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + ctl.page = NULL; + } + ctl.cache = cache; +init_cache: + ncp_invalidate_dircache_entries(dentry); + if (!mtime_valid) { + mtime = ncp_obtain_mtime(dentry); + mtime_valid = 1; + } + ctl.head.mtime = mtime; + ctl.head.time = jiffies; + ctl.head.eof = 0; + ctl.fpos = 2; + ctl.ofs = 0; + ctl.idx = NCP_DIRCACHE_START; + ctl.filled = 0; + ctl.valid = 1; +read_really: + spin_lock(&dentry->d_lock); + NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE; + spin_unlock(&dentry->d_lock); + if (ncp_is_server_root(inode)) { + ncp_read_volume_list(file, ctx, &ctl); + } else { + ncp_do_readdir(file, ctx, &ctl); + } + ctl.head.end = ctl.fpos - 1; + ctl.head.eof = ctl.valid; +finished: + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + } + if (page) { + cache->head = ctl.head; + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } +out: + return result; +} + +static void ncp_d_prune(struct dentry *dentry) +{ + if (!dentry->d_fsdata) /* not referenced from page cache */ + return; + NCP_FINFO(d_inode(dentry->d_parent))->flags &= ~NCPI_DIR_CACHE; +} + +static int +ncp_fill_cache(struct file *file, struct dir_context *ctx, + struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, + int inval_childs) +{ + struct dentry *newdent, *dentry = file->f_path.dentry; + struct inode *dir = d_inode(dentry); + struct ncp_cache_control ctl = *ctrl; + struct qstr qname; + int valid = 0; + int hashed = 0; + ino_t ino = 0; + __u8 __name[NCP_MAXPATHLEN + 1]; + + qname.len = sizeof(__name); + if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len, + entry->i.entryName, entry->i.nameLen, + !ncp_preserve_entry_case(dir, entry->i.NSCreator))) + return 1; /* I'm not sure */ + + qname.name = __name; + + newdent = d_hash_and_lookup(dentry, &qname); + if (unlikely(IS_ERR(newdent))) + goto end_advance; + if (!newdent) { + newdent = d_alloc(dentry, &qname); + if (!newdent) + goto end_advance; + } else { + hashed = 1; + + /* If case sensitivity changed for this volume, all entries below this one + should be thrown away. This entry itself is not affected, as its case + sensitivity is controlled by its own parent. */ + if (inval_childs) + shrink_dcache_parent(newdent); + + /* + * NetWare's OS2 namespace is case preserving yet case + * insensitive. So we update dentry's name as received from + * server. Parent dir's i_mutex is locked because we're in + * readdir. + */ + dentry_update_name_case(newdent, &qname); + } + + if (d_really_is_negative(newdent)) { + struct inode *inode; + + entry->opened = 0; + entry->ino = iunique(dir->i_sb, 2); + inode = ncp_iget(dir->i_sb, entry); + if (inode) { + d_instantiate(newdent, inode); + if (!hashed) + d_rehash(newdent); + } else { + spin_lock(&dentry->d_lock); + NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE; + spin_unlock(&dentry->d_lock); + } + } else { + struct inode *inode = d_inode(newdent); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ncp_update_inode2(inode, entry); + mutex_unlock(&inode->i_mutex); + } + + if (ctl.idx >= NCP_DIRCACHE_SIZE) { + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + } + ctl.cache = NULL; + ctl.idx -= NCP_DIRCACHE_SIZE; + ctl.ofs += 1; + ctl.page = grab_cache_page(&dir->i_data, ctl.ofs); + if (ctl.page) + ctl.cache = kmap(ctl.page); + } + if (ctl.cache) { + if (d_really_is_positive(newdent)) { + newdent->d_fsdata = newdent; + ctl.cache->dentry[ctl.idx] = newdent; + ino = d_inode(newdent)->i_ino; + ncp_new_dentry(newdent); + } + valid = 1; + } + dput(newdent); +end_advance: + if (!valid) + ctl.valid = 0; + if (!ctl.filled && (ctl.fpos == ctx->pos)) { + if (!ino) + ino = iunique(dir->i_sb, 2); + ctl.filled = !dir_emit(ctx, qname.name, qname.len, + ino, DT_UNKNOWN); + if (!ctl.filled) + ctx->pos += 1; + } + ctl.fpos += 1; + ctl.idx += 1; + *ctrl = ctl; + return (ctl.valid || !ctl.filled); +} + +static void +ncp_read_volume_list(struct file *file, struct dir_context *ctx, + struct ncp_cache_control *ctl) +{ + struct inode *inode = file_inode(file); + struct ncp_server *server = NCP_SERVER(inode); + struct ncp_volume_info info; + struct ncp_entry_info entry; + int i; + + ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos); + + for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { + int inval_dentry; + + if (ncp_get_volume_info_with_number(server, i, &info) != 0) + return; + if (!strlen(info.volume_name)) + continue; + + ncp_dbg(1, "found vol: %s\n", info.volume_name); + + if (ncp_lookup_volume(server, info.volume_name, + &entry.i)) { + ncp_dbg(1, "could not lookup vol %s\n", + info.volume_name); + continue; + } + inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); + entry.volume = entry.i.volNumber; + if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry)) + return; + } +} + +static void +ncp_do_readdir(struct file *file, struct dir_context *ctx, + struct ncp_cache_control *ctl) +{ + struct inode *dir = file_inode(file); + struct ncp_server *server = NCP_SERVER(dir); + struct nw_search_sequence seq; + struct ncp_entry_info entry; + int err; + void* buf; + int more; + size_t bufsize; + + ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos); + ncp_vdbg("init %pD, volnum=%d, dirent=%u\n", + file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); + + err = ncp_initialize_search(server, dir, &seq); + if (err) { + ncp_dbg(1, "init failed, err=%d\n", err); + return; + } + /* We MUST NOT use server->buffer_size handshaked with server if we are + using UDP, as for UDP server uses max. buffer size determined by + MTU, and for TCP server uses hardwired value 65KB (== 66560 bytes). + So we use 128KB, just to be sure, as there is no way how to know + this value in advance. */ + bufsize = 131072; + buf = vmalloc(bufsize); + if (!buf) + return; + do { + int cnt; + char* rpl; + size_t rpls; + + err = ncp_search_for_fileset(server, &seq, &more, &cnt, buf, bufsize, &rpl, &rpls); + if (err) /* Error */ + break; + if (!cnt) /* prevent endless loop */ + break; + while (cnt--) { + size_t onerpl; + + if (rpls < offsetof(struct nw_info_struct, entryName)) + break; /* short packet */ + ncp_extract_file_info(rpl, &entry.i); + onerpl = offsetof(struct nw_info_struct, entryName) + entry.i.nameLen; + if (rpls < onerpl) + break; /* short packet */ + (void)ncp_obtain_nfs_info(server, &entry.i); + rpl += onerpl; + rpls -= onerpl; + entry.volume = entry.i.volNumber; + if (!ncp_fill_cache(file, ctx, ctl, &entry, 0)) + break; + } + } while (more); + vfree(buf); + return; +} + +int ncp_conn_logged_in(struct super_block *sb) +{ + struct ncp_server* server = NCP_SBP(sb); + int result; + + if (ncp_single_volume(server)) { + int len; + struct dentry* dent; + __u32 volNumber; + __le32 dirEntNum; + __le32 DosDirNum; + __u8 __name[NCP_MAXPATHLEN + 1]; + + len = sizeof(__name); + result = ncp_io2vol(server, __name, &len, server->m.mounted_vol, + strlen(server->m.mounted_vol), 1); + if (result) + goto out; + result = -ENOENT; + if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) { + ncp_vdbg("%s not found\n", server->m.mounted_vol); + goto out; + } + dent = sb->s_root; + if (dent) { + struct inode* ino = d_inode(dent); + if (ino) { + ncp_update_known_namespace(server, volNumber, NULL); + NCP_FINFO(ino)->volNumber = volNumber; + NCP_FINFO(ino)->dirEntNum = dirEntNum; + NCP_FINFO(ino)->DosDirNum = DosDirNum; + result = 0; + } else { + ncp_dbg(1, "d_inode(sb->s_root) == NULL!\n"); + } + } else { + ncp_dbg(1, "sb->s_root == NULL!\n"); + } + } else + result = 0; + +out: + return result; +} + +static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct ncp_server *server = NCP_SERVER(dir); + struct inode *inode = NULL; + struct ncp_entry_info finfo; + int error, res, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + error = -EIO; + if (!ncp_conn_valid(server)) + goto finished; + + ncp_vdbg("server lookup for %pd2\n", dentry); + + len = sizeof(__name); + if (ncp_is_server_root(dir)) { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, 1); + if (!res) + res = ncp_lookup_volume(server, __name, &(finfo.i)); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + } else { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (!res) + res = ncp_obtain_info(server, dir, __name, &(finfo.i)); + } + ncp_vdbg("looked for %pd2, res=%d\n", dentry, res); + /* + * If we didn't find an entry, make a negative dentry. + */ + if (res) + goto add_entry; + + /* + * Create an inode for the entry. + */ + finfo.opened = 0; + finfo.ino = iunique(dir->i_sb, 2); + finfo.volume = finfo.i.volNumber; + error = -EACCES; + inode = ncp_iget(dir->i_sb, &finfo); + + if (inode) { + ncp_new_dentry(dentry); +add_entry: + d_add(dentry, inode); + error = 0; + } + +finished: + ncp_vdbg("result=%d\n", error); + return ERR_PTR(error); +} + +/* + * This code is common to create, mkdir, and mknod. + */ +static int ncp_instantiate(struct inode *dir, struct dentry *dentry, + struct ncp_entry_info *finfo) +{ + struct inode *inode; + int error = -EINVAL; + + finfo->ino = iunique(dir->i_sb, 2); + inode = ncp_iget(dir->i_sb, finfo); + if (!inode) + goto out_close; + d_instantiate(dentry,inode); + error = 0; +out: + return error; + +out_close: + ncp_vdbg("%pd2 failed, closing file\n", dentry); + ncp_close_file(NCP_SERVER(dir), finfo->file_handle); + goto out; +} + +int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev, __le32 attributes) +{ + struct ncp_server *server = NCP_SERVER(dir); + struct ncp_entry_info finfo; + int error, result, len; + int opmode; + __u8 __name[NCP_MAXPATHLEN + 1]; + + ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode); + + ncp_age_dentry(server, dentry); + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + error = -EACCES; + + if (S_ISREG(mode) && + (server->m.flags & NCP_MOUNT_EXTRAS) && + (mode & S_IXUGO)) + attributes |= aSYSTEM | aSHARED; + + result = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, + attributes, AR_READ | AR_WRITE, &finfo); + opmode = O_RDWR; + if (result) { + result = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, + attributes, AR_WRITE, &finfo); + if (result) { + if (result == 0x87) + error = -ENAMETOOLONG; + else if (result < 0) + error = result; + ncp_dbg(1, "%pd2 failed\n", dentry); + goto out; + } + opmode = O_WRONLY; + } + finfo.access = opmode; + if (ncp_is_nfs_extras(server, finfo.volume)) { + finfo.i.nfs.mode = mode; + finfo.i.nfs.rdev = new_encode_dev(rdev); + if (ncp_modify_nfs_info(server, finfo.volume, + finfo.i.dirEntNum, + mode, new_encode_dev(rdev)) != 0) + goto out; + } + + error = ncp_instantiate(dir, dentry, &finfo); +out: + return error; +} + +static int ncp_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ncp_create_new(dir, dentry, mode, 0, 0); +} + +static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct ncp_entry_info finfo; + struct ncp_server *server = NCP_SERVER(dir); + int error, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + ncp_dbg(1, "making %pd2\n", dentry); + + ncp_age_dentry(server, dentry); + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + error = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE, aDIR, + cpu_to_le16(0xffff), + &finfo); + if (error == 0) { + if (ncp_is_nfs_extras(server, finfo.volume)) { + mode |= S_IFDIR; + finfo.i.nfs.mode = mode; + if (ncp_modify_nfs_info(server, + finfo.volume, + finfo.i.dirEntNum, + mode, 0) != 0) + goto out; + } + error = ncp_instantiate(dir, dentry, &finfo); + } else if (error > 0) { + error = -EACCES; + } +out: + return error; +} + +static int ncp_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ncp_server *server = NCP_SERVER(dir); + int error, result, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + ncp_dbg(1, "removing %pd2\n", dentry); + + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + result = ncp_del_file_or_subdir(server, dir, __name); + switch (result) { + case 0x00: + error = 0; + break; + case 0x85: /* unauthorized to delete file */ + case 0x8A: /* unauthorized to delete file */ + error = -EACCES; + break; + case 0x8F: + case 0x90: /* read only */ + error = -EPERM; + break; + case 0x9F: /* in use by another client */ + error = -EBUSY; + break; + case 0xA0: /* directory not empty */ + error = -ENOTEMPTY; + break; + case 0xFF: /* someone deleted file */ + error = -ENOENT; + break; + default: + error = result < 0 ? result : -EACCES; + break; + } +out: + return error; +} + +static int ncp_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct ncp_server *server; + int error; + + server = NCP_SERVER(dir); + ncp_dbg(1, "unlinking %pd2\n", dentry); + + /* + * Check whether to close the file ... + */ + if (inode) { + ncp_vdbg("closing file\n"); + ncp_make_closed(inode); + } + + error = ncp_del_file_or_subdir2(server, dentry); +#ifdef CONFIG_NCPFS_STRONG + /* 9C is Invalid path.. It should be 8F, 90 - read only, but + it is not :-( */ + if ((error == 0x9C || error == 0x90) && server->m.flags & NCP_MOUNT_STRONG) { /* R/O */ + error = ncp_force_unlink(dir, dentry); + } +#endif + switch (error) { + case 0x00: + ncp_dbg(1, "removed %pd2\n", dentry); + break; + case 0x85: + case 0x8A: + error = -EACCES; + break; + case 0x8D: /* some files in use */ + case 0x8E: /* all files in use */ + error = -EBUSY; + break; + case 0x8F: /* some read only */ + case 0x90: /* all read only */ + case 0x9C: /* !!! returned when in-use or read-only by NW4 */ + error = -EPERM; + break; + case 0xFF: + error = -ENOENT; + break; + default: + error = error < 0 ? error : -EACCES; + break; + } + return error; +} + +static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ncp_server *server = NCP_SERVER(old_dir); + int error; + int old_len, new_len; + __u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1]; + + ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry); + + ncp_age_dentry(server, old_dentry); + ncp_age_dentry(server, new_dentry); + + old_len = sizeof(__old_name); + error = ncp_io2vol(server, __old_name, &old_len, + old_dentry->d_name.name, old_dentry->d_name.len, + !ncp_preserve_case(old_dir)); + if (error) + goto out; + + new_len = sizeof(__new_name); + error = ncp_io2vol(server, __new_name, &new_len, + new_dentry->d_name.name, new_dentry->d_name.len, + !ncp_preserve_case(new_dir)); + if (error) + goto out; + + error = ncp_ren_or_mov_file_or_subdir(server, old_dir, __old_name, + new_dir, __new_name); +#ifdef CONFIG_NCPFS_STRONG + if ((error == 0x90 || error == 0x8B || error == -EACCES) && + server->m.flags & NCP_MOUNT_STRONG) { /* RO */ + error = ncp_force_rename(old_dir, old_dentry, __old_name, + new_dir, new_dentry, __new_name); + } +#endif + switch (error) { + case 0x00: + ncp_dbg(1, "renamed %pd -> %pd\n", + old_dentry, new_dentry); + break; + case 0x9E: + error = -ENAMETOOLONG; + break; + case 0xFF: + error = -ENOENT; + break; + default: + error = error < 0 ? error : -EACCES; + break; + } +out: + return error; +} + +static int ncp_mknod(struct inode * dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + if (!new_valid_dev(rdev)) + return -EINVAL; + if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { + ncp_dbg(1, "mode = 0%ho\n", mode); + return ncp_create_new(dir, dentry, mode, rdev, 0); + } + return -EPERM; /* Strange, but true */ +} + +/* The following routines are taken directly from msdos-fs */ + +/* Linear day numbers of the respective 1sts in non-leap years. */ + +static int day_n[] = +{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; +/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ + +static int utc2local(int time) +{ + return time - sys_tz.tz_minuteswest * 60; +} + +static int local2utc(int time) +{ + return time + sys_tz.tz_minuteswest * 60; +} + +/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ +int +ncp_date_dos2unix(__le16 t, __le16 d) +{ + unsigned short time = le16_to_cpu(t), date = le16_to_cpu(d); + int month, year, secs; + + /* first subtract and mask after that... Otherwise, if + date == 0, bad things happen */ + month = ((date >> 5) - 1) & 15; + year = date >> 9; + secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + + 86400 * ((date & 31) - 1 + day_n[month] + (year / 4) + + year * 365 - ((year & 3) == 0 && month < 2 ? 1 : 0) + 3653); + /* days since 1.1.70 plus 80's leap day */ + return local2utc(secs); +} + + +/* Convert linear UNIX date to a MS-DOS time/date pair. */ +void +ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date) +{ + int day, year, nl_day, month; + + unix_date = utc2local(unix_date); + *time = cpu_to_le16( + (unix_date % 60) / 2 + (((unix_date / 60) % 60) << 5) + + (((unix_date / 3600) % 24) << 11)); + day = unix_date / 86400 - 3652; + year = day / 365; + if ((year + 3) / 4 + 365 * year > day) + year--; + day -= (year + 3) / 4 + 365 * year; + if (day == 59 && !(year & 3)) { + nl_day = day; + month = 2; + } else { + nl_day = (year & 3) || day <= 59 ? day : day - 1; + for (month = 1; month < 12; month++) + if (day_n[month] > nl_day) + break; + } + *date = cpu_to_le16(nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9)); +} diff --git a/kernel/fs/ncpfs/file.c b/kernel/fs/ncpfs/file.c new file mode 100644 index 000000000..011324ce9 --- /dev/null +++ b/kernel/fs/ncpfs/file.c @@ -0,0 +1,262 @@ +/* + * file.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + return filemap_write_and_wait_range(file->f_mapping, start, end); +} + +/* + * Open a file with the specified read/write mode. + */ +int ncp_make_open(struct inode *inode, int right) +{ + int error; + int access; + + error = -EINVAL; + if (!inode) { + pr_err("%s: got NULL inode\n", __func__); + goto out; + } + + ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n", + atomic_read(&NCP_FINFO(inode)->opened), + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum); + error = -EACCES; + mutex_lock(&NCP_FINFO(inode)->open_mutex); + if (!atomic_read(&NCP_FINFO(inode)->opened)) { + struct ncp_entry_info finfo; + int result; + + /* tries max. rights */ + finfo.access = O_RDWR; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_READ | AR_WRITE, &finfo); + if (!result) + goto update; + /* RDWR did not succeeded, try readonly or writeonly as requested */ + switch (right) { + case O_RDONLY: + finfo.access = O_RDONLY; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_READ, &finfo); + break; + case O_WRONLY: + finfo.access = O_WRONLY; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_WRITE, &finfo); + break; + } + if (result) { + ncp_vdbg("failed, result=%d\n", result); + goto out_unlock; + } + /* + * Update the inode information. + */ + update: + ncp_update_inode(inode, &finfo); + atomic_set(&NCP_FINFO(inode)->opened, 1); + } + + access = NCP_FINFO(inode)->access; + ncp_vdbg("file open, access=%x\n", access); + if (access == right || access == O_RDWR) { + atomic_inc(&NCP_FINFO(inode)->opened); + error = 0; + } + +out_unlock: + mutex_unlock(&NCP_FINFO(inode)->open_mutex); +out: + return error; +} + +static ssize_t +ncp_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + size_t already_read = 0; + off_t pos = iocb->ki_pos; + size_t bufsize; + int error; + void *freepage; + size_t freelen; + + ncp_dbg(1, "enter %pD2\n", file); + + if (!iov_iter_count(to)) + return 0; + if (pos > inode->i_sb->s_maxbytes) + return 0; + iov_iter_truncate(to, inode->i_sb->s_maxbytes - pos); + + error = ncp_make_open(inode, O_RDONLY); + if (error) { + ncp_dbg(1, "open failed, error=%d\n", error); + return error; + } + + bufsize = NCP_SERVER(inode)->buffer_size; + + error = -EIO; + freelen = ncp_read_bounce_size(bufsize); + freepage = vmalloc(freelen); + if (!freepage) + goto outrel; + error = 0; + /* First read in as much as possible for each bufsize. */ + while (iov_iter_count(to)) { + int read_this_time; + size_t to_read = min_t(size_t, + bufsize - (pos % bufsize), + iov_iter_count(to)); + + error = ncp_read_bounce(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_read, to, &read_this_time, + freepage, freelen); + if (error) { + error = -EIO; /* NW errno -> Linux errno */ + break; + } + pos += read_this_time; + already_read += read_this_time; + + if (read_this_time != to_read) + break; + } + vfree(freepage); + + iocb->ki_pos = pos; + + file_accessed(file); + + ncp_dbg(1, "exit %pD2\n", file); +outrel: + ncp_inode_close(inode); + return already_read ? already_read : error; +} + +static ssize_t +ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + size_t already_written = 0; + size_t bufsize; + int errno; + void *bouncebuffer; + off_t pos; + + ncp_dbg(1, "enter %pD2\n", file); + errno = generic_write_checks(iocb, from); + if (errno <= 0) + return errno; + + errno = ncp_make_open(inode, O_WRONLY); + if (errno) { + ncp_dbg(1, "open failed, error=%d\n", errno); + return errno; + } + bufsize = NCP_SERVER(inode)->buffer_size; + + errno = file_update_time(file); + if (errno) + goto outrel; + + bouncebuffer = vmalloc(bufsize); + if (!bouncebuffer) { + errno = -EIO; /* -ENOMEM */ + goto outrel; + } + pos = iocb->ki_pos; + while (iov_iter_count(from)) { + int written_this_time; + size_t to_write = min_t(size_t, + bufsize - (pos % bufsize), + iov_iter_count(from)); + + if (copy_from_iter(bouncebuffer, to_write, from) != to_write) { + errno = -EFAULT; + break; + } + if (ncp_write_kernel(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_write, bouncebuffer, &written_this_time) != 0) { + errno = -EIO; + break; + } + pos += written_this_time; + already_written += written_this_time; + + if (written_this_time != to_write) + break; + } + vfree(bouncebuffer); + + iocb->ki_pos = pos; + + if (pos > i_size_read(inode)) { + mutex_lock(&inode->i_mutex); + if (pos > i_size_read(inode)) + i_size_write(inode, pos); + mutex_unlock(&inode->i_mutex); + } + ncp_dbg(1, "exit %pD2\n", file); +outrel: + ncp_inode_close(inode); + return already_written ? already_written : errno; +} + +static int ncp_release(struct inode *inode, struct file *file) { + if (ncp_make_closed(inode)) { + ncp_dbg(1, "failed to close\n"); + } + return 0; +} + +const struct file_operations ncp_file_operations = +{ + .llseek = generic_file_llseek, + .read_iter = ncp_file_read_iter, + .write_iter = ncp_file_write_iter, + .unlocked_ioctl = ncp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ncp_compat_ioctl, +#endif + .mmap = ncp_mmap, + .release = ncp_release, + .fsync = ncp_fsync, +}; + +const struct inode_operations ncp_file_inode_operations = +{ + .setattr = ncp_notify_change, +}; diff --git a/kernel/fs/ncpfs/getopt.c b/kernel/fs/ncpfs/getopt.c new file mode 100644 index 000000000..344889cd1 --- /dev/null +++ b/kernel/fs/ncpfs/getopt.c @@ -0,0 +1,75 @@ +/* + * getopt.c + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#include + +#include "getopt.h" + +/** + * ncp_getopt - option parser + * @caller: name of the caller, for error messages + * @options: the options string + * @opts: an array of &struct option entries controlling parser operations + * @optopt: output; will contain the current option + * @optarg: output; will contain the value (if one exists) + * @value: output; may be NULL; will be overwritten with the integer value + * of the current argument. + * + * Helper to parse options on the format used by mount ("a=b,c=d,e,f"). + * Returns opts->val if a matching entry in the 'opts' array is found, + * 0 when no more tokens are found, -1 if an error is encountered. + */ +int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts, + char **optopt, char **optarg, unsigned long *value) +{ + char *token; + char *val; + + do { + if ((token = strsep(options, ",")) == NULL) + return 0; + } while (*token == '\0'); + if (optopt) + *optopt = token; + + if ((val = strchr (token, '=')) != NULL) { + *val++ = 0; + } + *optarg = val; + for (; opts->name; opts++) { + if (!strcmp(opts->name, token)) { + if (!val) { + if (opts->has_arg & OPT_NOPARAM) { + return opts->val; + } + pr_info("%s: the %s option requires an argument\n", + caller, token); + return -EINVAL; + } + if (opts->has_arg & OPT_INT) { + int rc = kstrtoul(val, 0, value); + + if (rc) { + pr_info("%s: invalid numeric value in %s=%s\n", + caller, token, val); + return rc; + } + return opts->val; + } + if (opts->has_arg & OPT_STRING) { + return opts->val; + } + pr_info("%s: unexpected argument %s to the %s option\n", + caller, val, token); + return -EINVAL; + } + } + pr_info("%s: Unrecognized mount option %s\n", caller, token); + return -EOPNOTSUPP; +} diff --git a/kernel/fs/ncpfs/getopt.h b/kernel/fs/ncpfs/getopt.h new file mode 100644 index 000000000..cccc007dc --- /dev/null +++ b/kernel/fs/ncpfs/getopt.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_GETOPT_H +#define _LINUX_GETOPT_H + +#define OPT_NOPARAM 1 +#define OPT_INT 2 +#define OPT_STRING 4 +struct ncp_option { + const char *name; + unsigned int has_arg; + int val; +}; + +extern int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts, + char **optopt, char **optarg, unsigned long *value); + +#endif /* _LINUX_GETOPT_H */ diff --git a/kernel/fs/ncpfs/inode.c b/kernel/fs/ncpfs/inode.c new file mode 100644 index 000000000..9605a2f63 --- /dev/null +++ b/kernel/fs/ncpfs/inode.c @@ -0,0 +1,1072 @@ +/* + * inode.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998 Wolfram Pienkoss for NLS + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ncp_fs.h" +#include "getopt.h" + +#define NCP_DEFAULT_FILE_MODE 0600 +#define NCP_DEFAULT_DIR_MODE 0700 +#define NCP_DEFAULT_TIME_OUT 10 +#define NCP_DEFAULT_RETRY_COUNT 20 + +static void ncp_evict_inode(struct inode *); +static void ncp_put_super(struct super_block *); +static int ncp_statfs(struct dentry *, struct kstatfs *); +static int ncp_show_options(struct seq_file *, struct dentry *); + +static struct kmem_cache * ncp_inode_cachep; + +static struct inode *ncp_alloc_inode(struct super_block *sb) +{ + struct ncp_inode_info *ei; + ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void ncp_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); +} + +static void ncp_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ncp_i_callback); +} + +static void init_once(void *foo) +{ + struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; + + mutex_init(&ei->open_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", + sizeof(struct ncp_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ncp_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ncp_inode_cachep); +} + +static int ncp_remount(struct super_block *sb, int *flags, char* data) +{ + sync_filesystem(sb); + *flags |= MS_NODIRATIME; + return 0; +} + +static const struct super_operations ncp_sops = +{ + .alloc_inode = ncp_alloc_inode, + .destroy_inode = ncp_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = ncp_evict_inode, + .put_super = ncp_put_super, + .statfs = ncp_statfs, + .remount_fs = ncp_remount, + .show_options = ncp_show_options, +}; + +/* + * Fill in the ncpfs-specific information in the inode. + */ +static void ncp_update_dirent(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + NCP_FINFO(inode)->DosDirNum = nwinfo->i.DosDirNum; + NCP_FINFO(inode)->dirEntNum = nwinfo->i.dirEntNum; + NCP_FINFO(inode)->volNumber = nwinfo->volume; +} + +void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + ncp_update_dirent(inode, nwinfo); + NCP_FINFO(inode)->nwattr = nwinfo->i.attributes; + NCP_FINFO(inode)->access = nwinfo->access; + memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle, + sizeof(nwinfo->file_handle)); + ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n", + nwinfo->i.entryName, NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum); +} + +static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi) +{ + /* NFS namespace mode overrides others if it's set. */ + ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode); + if (nwi->nfs.mode) { + /* XXX Security? */ + inode->i_mode = nwi->nfs.mode; + } + + inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT; + + inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate); + inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate); + inode->i_atime.tv_sec = ncp_date_dos2unix(0, nwi->lastAccessDate); + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; +} + +static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + struct nw_info_struct *nwi = &nwinfo->i; + struct ncp_server *server = NCP_SERVER(inode); + + if (nwi->attributes & aDIR) { + inode->i_mode = server->m.dir_mode; + /* for directories dataStreamSize seems to be some + Object ID ??? */ + i_size_write(inode, NCP_BLOCK_SIZE); + } else { + u32 size; + + inode->i_mode = server->m.file_mode; + size = le32_to_cpu(nwi->dataStreamSize); + i_size_write(inode, size); +#ifdef CONFIG_NCPFS_EXTRAS + if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) + && (nwi->attributes & aSHARED)) { + switch (nwi->attributes & (aHIDDEN|aSYSTEM)) { + case aHIDDEN: + if (server->m.flags & NCP_MOUNT_SYMLINKS) { + if (/* (size >= NCP_MIN_SYMLINK_SIZE) + && */ (size <= NCP_MAX_SYMLINK_SIZE)) { + inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK; + NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK; + break; + } + } + /* FALLTHROUGH */ + case 0: + if (server->m.flags & NCP_MOUNT_EXTRAS) + inode->i_mode |= S_IRUGO; + break; + case aSYSTEM: + if (server->m.flags & NCP_MOUNT_EXTRAS) + inode->i_mode |= (inode->i_mode >> 2) & S_IXUGO; + break; + /* case aSYSTEM|aHIDDEN: */ + default: + /* reserved combination */ + break; + } + } +#endif + } + if (nwi->attributes & aRONLY) inode->i_mode &= ~S_IWUGO; +} + +void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo) +{ + NCP_FINFO(inode)->flags = 0; + if (!atomic_read(&NCP_FINFO(inode)->opened)) { + NCP_FINFO(inode)->nwattr = nwinfo->i.attributes; + ncp_update_attrs(inode, nwinfo); + } + + ncp_update_dates(inode, &nwinfo->i); + ncp_update_dirent(inode, nwinfo); +} + +/* + * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes. + */ +static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + struct ncp_server *server = NCP_SERVER(inode); + + NCP_FINFO(inode)->flags = 0; + + ncp_update_attrs(inode, nwinfo); + + ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode); + + set_nlink(inode, 1); + inode->i_uid = server->m.uid; + inode->i_gid = server->m.gid; + + ncp_update_dates(inode, &nwinfo->i); + ncp_update_inode(inode, nwinfo); +} + +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +static const struct inode_operations ncp_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ncp_notify_change, +}; +#endif + +/* + * Get a new inode. + */ +struct inode * +ncp_iget(struct super_block *sb, struct ncp_entry_info *info) +{ + struct inode *inode; + + if (info == NULL) { + pr_err("%s: info is NULL\n", __func__); + return NULL; + } + + inode = new_inode(sb); + if (inode) { + atomic_set(&NCP_FINFO(inode)->opened, info->opened); + + inode->i_ino = info->ino; + ncp_set_attr(inode, info); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ncp_file_inode_operations; + inode->i_fop = &ncp_file_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ncp_dir_inode_operations; + inode->i_fop = &ncp_dir_operations; +#ifdef CONFIG_NCPFS_NFS_NS + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, + new_decode_dev(info->i.nfs.rdev)); +#endif +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ncp_symlink_inode_operations; + inode->i_data.a_ops = &ncp_symlink_aops; +#endif + } else { + make_bad_inode(inode); + } + insert_inode_hash(inode); + } else + pr_err("%s: iget failed!\n", __func__); + return inode; +} + +static void +ncp_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + if (S_ISDIR(inode->i_mode)) { + ncp_dbg(2, "put directory %ld\n", inode->i_ino); + } + + if (ncp_make_closed(inode) != 0) { + /* We can't do anything but complain. */ + pr_err("%s: could not close\n", __func__); + } +} + +static void ncp_stop_tasks(struct ncp_server *server) { + struct sock* sk = server->ncp_sock->sk; + + lock_sock(sk); + sk->sk_error_report = server->error_report; + sk->sk_data_ready = server->data_ready; + sk->sk_write_space = server->write_space; + release_sock(sk); + del_timer_sync(&server->timeout_tm); + + flush_work(&server->rcv.tq); + if (sk->sk_socket->type == SOCK_STREAM) + flush_work(&server->tx.tq); + else + flush_work(&server->timeout_tq); +} + +static int ncp_show_options(struct seq_file *seq, struct dentry *root) +{ + struct ncp_server *server = NCP_SBP(root->d_sb); + unsigned int tmp; + + if (!uid_eq(server->m.uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, server->m.uid)); + if (!gid_eq(server->m.gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, server->m.gid)); + if (!uid_eq(server->m.mounted_uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",owner=%u", + from_kuid_munged(&init_user_ns, server->m.mounted_uid)); + tmp = server->m.file_mode & S_IALLUGO; + if (tmp != NCP_DEFAULT_FILE_MODE) + seq_printf(seq, ",mode=0%o", tmp); + tmp = server->m.dir_mode & S_IALLUGO; + if (tmp != NCP_DEFAULT_DIR_MODE) + seq_printf(seq, ",dirmode=0%o", tmp); + if (server->m.time_out != NCP_DEFAULT_TIME_OUT * HZ / 100) { + tmp = server->m.time_out * 100 / HZ; + seq_printf(seq, ",timeout=%u", tmp); + } + if (server->m.retry_count != NCP_DEFAULT_RETRY_COUNT) + seq_printf(seq, ",retry=%u", server->m.retry_count); + if (server->m.flags != 0) + seq_printf(seq, ",flags=%lu", server->m.flags); + if (server->m.wdog_pid != NULL) + seq_printf(seq, ",wdogpid=%u", pid_vnr(server->m.wdog_pid)); + + return 0; +} + +static const struct ncp_option ncp_opts[] = { + { "uid", OPT_INT, 'u' }, + { "gid", OPT_INT, 'g' }, + { "owner", OPT_INT, 'o' }, + { "mode", OPT_INT, 'm' }, + { "dirmode", OPT_INT, 'd' }, + { "timeout", OPT_INT, 't' }, + { "retry", OPT_INT, 'r' }, + { "flags", OPT_INT, 'f' }, + { "wdogpid", OPT_INT, 'w' }, + { "ncpfd", OPT_INT, 'n' }, + { "infofd", OPT_INT, 'i' }, /* v5 */ + { "version", OPT_INT, 'v' }, + { NULL, 0, 0 } }; + +static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) { + int optval; + char *optarg; + unsigned long optint; + int version = 0; + int ret; + + data->flags = 0; + data->int_flags = 0; + data->mounted_uid = GLOBAL_ROOT_UID; + data->wdog_pid = NULL; + data->ncp_fd = ~0; + data->time_out = NCP_DEFAULT_TIME_OUT; + data->retry_count = NCP_DEFAULT_RETRY_COUNT; + data->uid = GLOBAL_ROOT_UID; + data->gid = GLOBAL_ROOT_GID; + data->file_mode = NCP_DEFAULT_FILE_MODE; + data->dir_mode = NCP_DEFAULT_DIR_MODE; + data->info_fd = -1; + data->mounted_vol[0] = 0; + + while ((optval = ncp_getopt("ncpfs", &options, ncp_opts, NULL, &optarg, &optint)) != 0) { + ret = optval; + if (ret < 0) + goto err; + switch (optval) { + case 'u': + data->uid = make_kuid(current_user_ns(), optint); + if (!uid_valid(data->uid)) { + ret = -EINVAL; + goto err; + } + break; + case 'g': + data->gid = make_kgid(current_user_ns(), optint); + if (!gid_valid(data->gid)) { + ret = -EINVAL; + goto err; + } + break; + case 'o': + data->mounted_uid = make_kuid(current_user_ns(), optint); + if (!uid_valid(data->mounted_uid)) { + ret = -EINVAL; + goto err; + } + break; + case 'm': + data->file_mode = optint; + break; + case 'd': + data->dir_mode = optint; + break; + case 't': + data->time_out = optint; + break; + case 'r': + data->retry_count = optint; + break; + case 'f': + data->flags = optint; + break; + case 'w': + data->wdog_pid = find_get_pid(optint); + break; + case 'n': + data->ncp_fd = optint; + break; + case 'i': + data->info_fd = optint; + break; + case 'v': + ret = -ECHRNG; + if (optint < NCP_MOUNT_VERSION_V4) + goto err; + if (optint > NCP_MOUNT_VERSION_V5) + goto err; + version = optint; + break; + + } + } + return 0; +err: + put_pid(data->wdog_pid); + data->wdog_pid = NULL; + return ret; +} + +static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) +{ + struct ncp_mount_data_kernel data; + struct ncp_server *server; + struct inode *root_inode; + struct socket *sock; + int error; + int default_bufsize; +#ifdef CONFIG_NCPFS_PACKET_SIGNING + int options; +#endif + struct ncp_entry_info finfo; + + memset(&data, 0, sizeof(data)); + server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); + if (!server) + return -ENOMEM; + sb->s_fs_info = server; + + error = -EFAULT; + if (raw_data == NULL) + goto out; + switch (*(int*)raw_data) { + case NCP_MOUNT_VERSION: + { + struct ncp_mount_data* md = (struct ncp_mount_data*)raw_data; + + data.flags = md->flags; + data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE; + data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid); + data.wdog_pid = find_get_pid(md->wdog_pid); + data.ncp_fd = md->ncp_fd; + data.time_out = md->time_out; + data.retry_count = md->retry_count; + data.uid = make_kuid(current_user_ns(), md->uid); + data.gid = make_kgid(current_user_ns(), md->gid); + data.file_mode = md->file_mode; + data.dir_mode = md->dir_mode; + data.info_fd = -1; + memcpy(data.mounted_vol, md->mounted_vol, + NCP_VOLNAME_LEN+1); + } + break; + case NCP_MOUNT_VERSION_V4: + { + struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; + + data.flags = md->flags; + data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid); + data.wdog_pid = find_get_pid(md->wdog_pid); + data.ncp_fd = md->ncp_fd; + data.time_out = md->time_out; + data.retry_count = md->retry_count; + data.uid = make_kuid(current_user_ns(), md->uid); + data.gid = make_kgid(current_user_ns(), md->gid); + data.file_mode = md->file_mode; + data.dir_mode = md->dir_mode; + data.info_fd = -1; + } + break; + default: + error = -ECHRNG; + if (memcmp(raw_data, "vers", 4) == 0) { + error = ncp_parse_options(&data, raw_data); + } + if (error) + goto out; + break; + } + error = -EINVAL; + if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || + !gid_valid(data.gid)) + goto out; + sock = sockfd_lookup(data.ncp_fd, &error); + if (!sock) + goto out; + + if (sock->type == SOCK_STREAM) + default_bufsize = 0xF000; + else + default_bufsize = 1024; + + sb->s_flags |= MS_NODIRATIME; /* probably even noatime */ + sb->s_maxbytes = 0xFFFFFFFFU; + sb->s_blocksize = 1024; /* Eh... Is this correct? */ + sb->s_blocksize_bits = 10; + sb->s_magic = NCP_SUPER_MAGIC; + sb->s_op = &ncp_sops; + sb->s_d_op = &ncp_dentry_operations; + sb->s_bdi = &server->bdi; + + server = NCP_SBP(sb); + memset(server, 0, sizeof(*server)); + + error = bdi_setup_and_register(&server->bdi, "ncpfs"); + if (error) + goto out_fput; + + server->ncp_sock = sock; + + if (data.info_fd != -1) { + struct socket *info_sock = sockfd_lookup(data.info_fd, &error); + if (!info_sock) + goto out_bdi; + server->info_sock = info_sock; + error = -EBADFD; + if (info_sock->type != SOCK_STREAM) + goto out_fput2; + } + +/* server->lock = 0; */ + mutex_init(&server->mutex); + server->packet = NULL; +/* server->buffer_size = 0; */ +/* server->conn_status = 0; */ +/* server->root_dentry = NULL; */ +/* server->root_setuped = 0; */ + mutex_init(&server->root_setup_lock); +#ifdef CONFIG_NCPFS_PACKET_SIGNING +/* server->sign_wanted = 0; */ +/* server->sign_active = 0; */ +#endif + init_rwsem(&server->auth_rwsem); + server->auth.auth_type = NCP_AUTH_NONE; +/* server->auth.object_name_len = 0; */ +/* server->auth.object_name = NULL; */ +/* server->auth.object_type = 0; */ +/* server->priv.len = 0; */ +/* server->priv.data = NULL; */ + + server->m = data; + /* Although anything producing this is buggy, it happens + now because of PATH_MAX changes.. */ + if (server->m.time_out < 1) { + server->m.time_out = 10; + pr_info("You need to recompile your ncpfs utils..\n"); + } + server->m.time_out = server->m.time_out * HZ / 100; + server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG; + server->m.dir_mode = (server->m.dir_mode & S_IRWXUGO) | S_IFDIR; + +#ifdef CONFIG_NCPFS_NLS + /* load the default NLS charsets */ + server->nls_vol = load_nls_default(); + server->nls_io = load_nls_default(); +#endif /* CONFIG_NCPFS_NLS */ + + atomic_set(&server->dentry_ttl, 0); /* no caching */ + + INIT_LIST_HEAD(&server->tx.requests); + mutex_init(&server->rcv.creq_mutex); + server->tx.creq = NULL; + server->rcv.creq = NULL; + + init_timer(&server->timeout_tm); +#undef NCP_PACKET_SIZE +#define NCP_PACKET_SIZE 131072 + error = -ENOMEM; + server->packet_size = NCP_PACKET_SIZE; + server->packet = vmalloc(NCP_PACKET_SIZE); + if (server->packet == NULL) + goto out_nls; + server->txbuf = vmalloc(NCP_PACKET_SIZE); + if (server->txbuf == NULL) + goto out_packet; + server->rxbuf = vmalloc(NCP_PACKET_SIZE); + if (server->rxbuf == NULL) + goto out_txbuf; + + lock_sock(sock->sk); + server->data_ready = sock->sk->sk_data_ready; + server->write_space = sock->sk->sk_write_space; + server->error_report = sock->sk->sk_error_report; + sock->sk->sk_user_data = server; + sock->sk->sk_data_ready = ncp_tcp_data_ready; + sock->sk->sk_error_report = ncp_tcp_error_report; + if (sock->type == SOCK_STREAM) { + server->rcv.ptr = (unsigned char*)&server->rcv.buf; + server->rcv.len = 10; + server->rcv.state = 0; + INIT_WORK(&server->rcv.tq, ncp_tcp_rcv_proc); + INIT_WORK(&server->tx.tq, ncp_tcp_tx_proc); + sock->sk->sk_write_space = ncp_tcp_write_space; + } else { + INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc); + INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc); + server->timeout_tm.data = (unsigned long)server; + server->timeout_tm.function = ncpdgram_timeout_call; + } + release_sock(sock->sk); + + ncp_lock_server(server); + error = ncp_connect(server); + ncp_unlock_server(server); + if (error < 0) + goto out_rxbuf; + ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb)); + + error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */ +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (ncp_negotiate_size_and_options(server, default_bufsize, + NCP_DEFAULT_OPTIONS, &(server->buffer_size), &options) == 0) + { + if (options != NCP_DEFAULT_OPTIONS) + { + if (ncp_negotiate_size_and_options(server, + default_bufsize, + options & 2, + &(server->buffer_size), &options) != 0) + + { + goto out_disconnect; + } + } + ncp_lock_server(server); + if (options & 2) + server->sign_wanted = 1; + ncp_unlock_server(server); + } + else +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + if (ncp_negotiate_buffersize(server, default_bufsize, + &(server->buffer_size)) != 0) + goto out_disconnect; + ncp_dbg(1, "bufsize = %d\n", server->buffer_size); + + memset(&finfo, 0, sizeof(finfo)); + finfo.i.attributes = aDIR; + finfo.i.dataStreamSize = 0; /* ignored */ + finfo.i.dirEntNum = 0; + finfo.i.DosDirNum = 0; +#ifdef CONFIG_NCPFS_SMALLDOS + finfo.i.NSCreator = NW_NS_DOS; +#endif + finfo.volume = NCP_NUMBER_OF_VOLUMES; + /* set dates of mountpoint to Jan 1, 1986; 00:00 */ + finfo.i.creationTime = finfo.i.modifyTime + = cpu_to_le16(0x0000); + finfo.i.creationDate = finfo.i.modifyDate + = finfo.i.lastAccessDate + = cpu_to_le16(0x0C21); + finfo.i.nameLen = 0; + finfo.i.entryName[0] = '\0'; + + finfo.opened = 0; + finfo.ino = 2; /* tradition */ + + server->name_space[finfo.volume] = NW_NS_DOS; + + error = -ENOMEM; + root_inode = ncp_iget(sb, &finfo); + if (!root_inode) + goto out_disconnect; + ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber); + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) + goto out_disconnect; + return 0; + +out_disconnect: + ncp_lock_server(server); + ncp_disconnect(server); + ncp_unlock_server(server); +out_rxbuf: + ncp_stop_tasks(server); + vfree(server->rxbuf); +out_txbuf: + vfree(server->txbuf); +out_packet: + vfree(server->packet); +out_nls: +#ifdef CONFIG_NCPFS_NLS + unload_nls(server->nls_io); + unload_nls(server->nls_vol); +#endif + mutex_destroy(&server->rcv.creq_mutex); + mutex_destroy(&server->root_setup_lock); + mutex_destroy(&server->mutex); +out_fput2: + if (server->info_sock) + sockfd_put(server->info_sock); +out_bdi: + bdi_destroy(&server->bdi); +out_fput: + sockfd_put(sock); +out: + put_pid(data.wdog_pid); + sb->s_fs_info = NULL; + kfree(server); + return error; +} + +static void delayed_free(struct rcu_head *p) +{ + struct ncp_server *server = container_of(p, struct ncp_server, rcu); +#ifdef CONFIG_NCPFS_NLS + /* unload the NLS charsets */ + unload_nls(server->nls_vol); + unload_nls(server->nls_io); +#endif /* CONFIG_NCPFS_NLS */ + kfree(server); +} + +static void ncp_put_super(struct super_block *sb) +{ + struct ncp_server *server = NCP_SBP(sb); + + ncp_lock_server(server); + ncp_disconnect(server); + ncp_unlock_server(server); + + ncp_stop_tasks(server); + + mutex_destroy(&server->rcv.creq_mutex); + mutex_destroy(&server->root_setup_lock); + mutex_destroy(&server->mutex); + + if (server->info_sock) + sockfd_put(server->info_sock); + sockfd_put(server->ncp_sock); + kill_pid(server->m.wdog_pid, SIGTERM, 1); + put_pid(server->m.wdog_pid); + + bdi_destroy(&server->bdi); + kfree(server->priv.data); + kfree(server->auth.object_name); + vfree(server->rxbuf); + vfree(server->txbuf); + vfree(server->packet); + call_rcu(&server->rcu, delayed_free); +} + +static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct dentry* d; + struct inode* i; + struct ncp_inode_info* ni; + struct ncp_server* s; + struct ncp_volume_info vi; + struct super_block *sb = dentry->d_sb; + int err; + __u8 dh; + + d = sb->s_root; + if (!d) { + goto dflt; + } + i = d_inode(d); + if (!i) { + goto dflt; + } + ni = NCP_FINFO(i); + if (!ni) { + goto dflt; + } + s = NCP_SBP(sb); + if (!s) { + goto dflt; + } + if (!s->m.mounted_vol[0]) { + goto dflt; + } + + err = ncp_dirhandle_alloc(s, ni->volNumber, ni->DosDirNum, &dh); + if (err) { + goto dflt; + } + err = ncp_get_directory_info(s, dh, &vi); + ncp_dirhandle_free(s, dh); + if (err) { + goto dflt; + } + buf->f_type = NCP_SUPER_MAGIC; + buf->f_bsize = vi.sectors_per_block * 512; + buf->f_blocks = vi.total_blocks; + buf->f_bfree = vi.free_blocks; + buf->f_bavail = vi.free_blocks; + buf->f_files = vi.total_dir_entries; + buf->f_ffree = vi.available_dir_entries; + buf->f_namelen = 12; + return 0; + + /* We cannot say how much disk space is left on a mounted + NetWare Server, because free space is distributed over + volumes, and the current user might have disk quotas. So + free space is not that simple to determine. Our decision + here is to err conservatively. */ + +dflt:; + buf->f_type = NCP_SUPER_MAGIC; + buf->f_bsize = NCP_BLOCK_SIZE; + buf->f_blocks = 0; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_namelen = 12; + return 0; +} + +int ncp_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int result = 0; + __le32 info_mask; + struct nw_modify_dos_info info; + struct ncp_server *server; + + result = -EIO; + + server = NCP_SERVER(inode); + if (!server) /* How this could happen? */ + goto out; + + result = -EPERM; + if (IS_DEADDIR(d_inode(dentry))) + goto out; + + /* ageing the dentry to force validation */ + ncp_age_dentry(server, dentry); + + result = inode_change_ok(inode, attr); + if (result < 0) + goto out; + + result = -EPERM; + if ((attr->ia_valid & ATTR_UID) && !uid_eq(attr->ia_uid, server->m.uid)) + goto out; + + if ((attr->ia_valid & ATTR_GID) && !gid_eq(attr->ia_gid, server->m.gid)) + goto out; + + if (((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & + ~(S_IFREG | S_IFDIR | S_IRWXUGO)))) + goto out; + + info_mask = 0; + memset(&info, 0, sizeof(info)); + +#if 1 + if ((attr->ia_valid & ATTR_MODE) != 0) + { + umode_t newmode = attr->ia_mode; + + info_mask |= DM_ATTRIBUTES; + + if (S_ISDIR(inode->i_mode)) { + newmode &= server->m.dir_mode; + } else { +#ifdef CONFIG_NCPFS_EXTRAS + if (server->m.flags & NCP_MOUNT_EXTRAS) { + /* any non-default execute bit set */ + if (newmode & ~server->m.file_mode & S_IXUGO) + info.attributes |= aSHARED | aSYSTEM; + /* read for group/world and not in default file_mode */ + else if (newmode & ~server->m.file_mode & S_IRUGO) + info.attributes |= aSHARED; + } else +#endif + newmode &= server->m.file_mode; + } + if (newmode & S_IWUGO) + info.attributes &= ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + else + info.attributes |= (aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + +#ifdef CONFIG_NCPFS_NFS_NS + if (ncp_is_nfs_extras(server, NCP_FINFO(inode)->volNumber)) { + result = ncp_modify_nfs_info(server, + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, + attr->ia_mode, 0); + if (result != 0) + goto out; + info.attributes &= ~(aSHARED | aSYSTEM); + { + /* mark partial success */ + struct iattr tmpattr; + + tmpattr.ia_valid = ATTR_MODE; + tmpattr.ia_mode = attr->ia_mode; + + setattr_copy(inode, &tmpattr); + mark_inode_dirty(inode); + } + } +#endif + } +#endif + + /* Do SIZE before attributes, otherwise mtime together with size does not work... + */ + if ((attr->ia_valid & ATTR_SIZE) != 0) { + int written; + + ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size); + + if ((result = ncp_make_open(inode, O_WRONLY)) < 0) { + result = -EACCES; + goto out; + } + ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, + attr->ia_size, 0, "", &written); + + /* According to ndir, the changes only take effect after + closing the file */ + ncp_inode_close(inode); + result = ncp_make_closed(inode); + if (result) + goto out; + + if (attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + mark_inode_dirty(inode); + } + } + if ((attr->ia_valid & ATTR_CTIME) != 0) { + info_mask |= (DM_CREATE_TIME | DM_CREATE_DATE); + ncp_date_unix2dos(attr->ia_ctime.tv_sec, + &info.creationTime, &info.creationDate); + } + if ((attr->ia_valid & ATTR_MTIME) != 0) { + info_mask |= (DM_MODIFY_TIME | DM_MODIFY_DATE); + ncp_date_unix2dos(attr->ia_mtime.tv_sec, + &info.modifyTime, &info.modifyDate); + } + if ((attr->ia_valid & ATTR_ATIME) != 0) { + __le16 dummy; + info_mask |= (DM_LAST_ACCESS_DATE); + ncp_date_unix2dos(attr->ia_atime.tv_sec, + &dummy, &info.lastAccessDate); + } + if (info_mask != 0) { + result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode), + inode, info_mask, &info); + if (result != 0) { + if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) { + /* NetWare seems not to allow this. I + do not know why. So, just tell the + user everything went fine. This is + a terrible hack, but I do not know + how to do this correctly. */ + result = 0; + } else + goto out; + } +#ifdef CONFIG_NCPFS_STRONG + if ((!result) && (info_mask & DM_ATTRIBUTES)) + NCP_FINFO(inode)->nwattr = info.attributes; +#endif + } + if (result) + goto out; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + +out: + if (result > 0) + result = -EACCES; + return result; +} + +static struct dentry *ncp_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, ncp_fill_super); +} + +static struct file_system_type ncp_fs_type = { + .owner = THIS_MODULE, + .name = "ncpfs", + .mount = ncp_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_BINARY_MOUNTDATA, +}; +MODULE_ALIAS_FS("ncpfs"); + +static int __init init_ncp_fs(void) +{ + int err; + ncp_dbg(1, "called\n"); + + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ncp_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_ncp_fs(void) +{ + ncp_dbg(1, "called\n"); + unregister_filesystem(&ncp_fs_type); + destroy_inodecache(); +} + +module_init(init_ncp_fs) +module_exit(exit_ncp_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/ncpfs/ioctl.c b/kernel/fs/ncpfs/ioctl.c new file mode 100644 index 000000000..79b113048 --- /dev/null +++ b/kernel/fs/ncpfs/ioctl.c @@ -0,0 +1,919 @@ +/* + * ioctl.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ncp_fs.h" + +/* maximum limit for ncp_objectname_ioctl */ +#define NCP_OBJECT_NAME_MAX_LEN 4096 +/* maximum limit for ncp_privatedata_ioctl */ +#define NCP_PRIVATE_DATA_MAX_LEN 8192 +/* maximum negotiable packet size */ +#define NCP_PACKET_SIZE_INTERNAL 65536 + +static int +ncp_get_fs_info(struct ncp_server * server, struct inode *inode, + struct ncp_fs_info __user *arg) +{ + struct ncp_fs_info info; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + if (info.version != NCP_GET_FS_INFO_VERSION) { + ncp_dbg(1, "info.version invalid: %d\n", info.version); + return -EINVAL; + } + /* TODO: info.addr = server->m.serv_addr; */ + SET_UID(info.mounted_uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid)); + info.connection = server->connection; + info.buffer_size = server->buffer_size; + info.volume_number = NCP_FINFO(inode)->volNumber; + info.directory_id = NCP_FINFO(inode)->DosDirNum; + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static int +ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, + struct ncp_fs_info_v2 __user * arg) +{ + struct ncp_fs_info_v2 info2; + + if (copy_from_user(&info2, arg, sizeof(info2))) + return -EFAULT; + + if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { + ncp_dbg(1, "info.version invalid: %d\n", info2.version); + return -EINVAL; + } + info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); + info2.connection = server->connection; + info2.buffer_size = server->buffer_size; + info2.volume_number = NCP_FINFO(inode)->volNumber; + info2.directory_id = NCP_FINFO(inode)->DosDirNum; + info2.dummy1 = info2.dummy2 = info2.dummy3 = 0; + + if (copy_to_user(arg, &info2, sizeof(info2))) + return -EFAULT; + return 0; +} + +#ifdef CONFIG_COMPAT +struct compat_ncp_objectname_ioctl +{ + s32 auth_type; + u32 object_name_len; + compat_caddr_t object_name; /* a userspace data, in most cases user name */ +}; + +struct compat_ncp_fs_info_v2 { + s32 version; + u32 mounted_uid; + u32 connection; + u32 buffer_size; + + u32 volume_number; + u32 directory_id; + + u32 dummy1; + u32 dummy2; + u32 dummy3; +}; + +struct compat_ncp_ioctl_request { + u32 function; + u32 size; + compat_caddr_t data; +}; + +struct compat_ncp_privatedata_ioctl +{ + u32 len; + compat_caddr_t data; /* ~1000 for NDS */ +}; + +#define NCP_IOC_GET_FS_INFO_V2_32 _IOWR('n', 4, struct compat_ncp_fs_info_v2) +#define NCP_IOC_NCPREQUEST_32 _IOR('n', 1, struct compat_ncp_ioctl_request) +#define NCP_IOC_GETOBJECTNAME_32 _IOWR('n', 9, struct compat_ncp_objectname_ioctl) +#define NCP_IOC_SETOBJECTNAME_32 _IOR('n', 9, struct compat_ncp_objectname_ioctl) +#define NCP_IOC_GETPRIVATEDATA_32 _IOWR('n', 10, struct compat_ncp_privatedata_ioctl) +#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl) + +static int +ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, + struct compat_ncp_fs_info_v2 __user * arg) +{ + struct compat_ncp_fs_info_v2 info2; + + if (copy_from_user(&info2, arg, sizeof(info2))) + return -EFAULT; + + if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { + ncp_dbg(1, "info.version invalid: %d\n", info2.version); + return -EINVAL; + } + info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); + info2.connection = server->connection; + info2.buffer_size = server->buffer_size; + info2.volume_number = NCP_FINFO(inode)->volNumber; + info2.directory_id = NCP_FINFO(inode)->DosDirNum; + info2.dummy1 = info2.dummy2 = info2.dummy3 = 0; + + if (copy_to_user(arg, &info2, sizeof(info2))) + return -EFAULT; + return 0; +} +#endif + +#define NCP_IOC_GETMOUNTUID16 _IOW('n', 2, u16) +#define NCP_IOC_GETMOUNTUID32 _IOW('n', 2, u32) +#define NCP_IOC_GETMOUNTUID64 _IOW('n', 2, u64) + +#ifdef CONFIG_NCPFS_NLS +/* Here we are select the iocharset and the codepage for NLS. + * Thanks Petr Vandrovec for idea and many hints. + */ +static int +ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) +{ + struct ncp_nls_ioctl user; + struct nls_table *codepage; + struct nls_table *iocharset; + struct nls_table *oldset_io; + struct nls_table *oldset_cp; + int utf8; + int err; + + if (copy_from_user(&user, arg, sizeof(user))) + return -EFAULT; + + codepage = NULL; + user.codepage[NCP_IOCSNAME_LEN] = 0; + if (!user.codepage[0] || !strcmp(user.codepage, "default")) + codepage = load_nls_default(); + else { + codepage = load_nls(user.codepage); + if (!codepage) { + return -EBADRQC; + } + } + + iocharset = NULL; + user.iocharset[NCP_IOCSNAME_LEN] = 0; + if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) { + iocharset = load_nls_default(); + utf8 = 0; + } else if (!strcmp(user.iocharset, "utf8")) { + iocharset = load_nls_default(); + utf8 = 1; + } else { + iocharset = load_nls(user.iocharset); + if (!iocharset) { + unload_nls(codepage); + return -EBADRQC; + } + utf8 = 0; + } + + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) { + oldset_cp = codepage; + oldset_io = iocharset; + err = -EBUSY; + } else { + if (utf8) + NCP_SET_FLAG(server, NCP_FLAG_UTF8); + else + NCP_CLR_FLAG(server, NCP_FLAG_UTF8); + oldset_cp = server->nls_vol; + server->nls_vol = codepage; + oldset_io = server->nls_io; + server->nls_io = iocharset; + err = 0; + } + mutex_unlock(&server->root_setup_lock); + unload_nls(oldset_cp); + unload_nls(oldset_io); + + return err; +} + +static int +ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) +{ + struct ncp_nls_ioctl user; + int len; + + memset(&user, 0, sizeof(user)); + mutex_lock(&server->root_setup_lock); + if (server->nls_vol && server->nls_vol->charset) { + len = strlen(server->nls_vol->charset); + if (len > NCP_IOCSNAME_LEN) + len = NCP_IOCSNAME_LEN; + strncpy(user.codepage, server->nls_vol->charset, len); + user.codepage[len] = 0; + } + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) + strcpy(user.iocharset, "utf8"); + else if (server->nls_io && server->nls_io->charset) { + len = strlen(server->nls_io->charset); + if (len > NCP_IOCSNAME_LEN) + len = NCP_IOCSNAME_LEN; + strncpy(user.iocharset, server->nls_io->charset, len); + user.iocharset[len] = 0; + } + mutex_unlock(&server->root_setup_lock); + + if (copy_to_user(arg, &user, sizeof(user))) + return -EFAULT; + return 0; +} +#endif /* CONFIG_NCPFS_NLS */ + +static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ncp_server *server = NCP_SERVER(inode); + int result; + struct ncp_ioctl_request request; + char* bouncebuffer; + void __user *argp = (void __user *)arg; + + switch (cmd) { +#ifdef CONFIG_COMPAT + case NCP_IOC_NCPREQUEST_32: +#endif + case NCP_IOC_NCPREQUEST: +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_NCPREQUEST_32) { + struct compat_ncp_ioctl_request request32; + if (copy_from_user(&request32, argp, sizeof(request32))) + return -EFAULT; + request.function = request32.function; + request.size = request32.size; + request.data = compat_ptr(request32.data); + } else +#endif + if (copy_from_user(&request, argp, sizeof(request))) + return -EFAULT; + + if ((request.function > 255) + || (request.size > + NCP_PACKET_SIZE - sizeof(struct ncp_request_header))) { + return -EINVAL; + } + bouncebuffer = vmalloc(NCP_PACKET_SIZE_INTERNAL); + if (!bouncebuffer) + return -ENOMEM; + if (copy_from_user(bouncebuffer, request.data, request.size)) { + vfree(bouncebuffer); + return -EFAULT; + } + ncp_lock_server(server); + + /* FIXME: We hack around in the server's structures + here to be able to use ncp_request */ + + server->has_subfunction = 0; + server->current_size = request.size; + memcpy(server->packet, bouncebuffer, request.size); + + result = ncp_request2(server, request.function, + bouncebuffer, NCP_PACKET_SIZE_INTERNAL); + if (result < 0) + result = -EIO; + else + result = server->reply_size; + ncp_unlock_server(server); + ncp_dbg(1, "copy %d bytes\n", result); + if (result >= 0) + if (copy_to_user(request.data, bouncebuffer, result)) + result = -EFAULT; + vfree(bouncebuffer); + return result; + + case NCP_IOC_CONN_LOGGED_IN: + + if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE)) + return -EINVAL; + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) + result = -EBUSY; + else { + result = ncp_conn_logged_in(inode->i_sb); + if (result == 0) + server->root_setuped = 1; + } + mutex_unlock(&server->root_setup_lock); + return result; + + case NCP_IOC_GET_FS_INFO: + return ncp_get_fs_info(server, inode, argp); + + case NCP_IOC_GET_FS_INFO_V2: + return ncp_get_fs_info_v2(server, inode, argp); + +#ifdef CONFIG_COMPAT + case NCP_IOC_GET_FS_INFO_V2_32: + return ncp_get_compat_fs_info_v2(server, inode, argp); +#endif + /* we have too many combinations of CONFIG_COMPAT, + * CONFIG_64BIT and CONFIG_UID16, so just handle + * any of the possible ioctls */ + case NCP_IOC_GETMOUNTUID16: + { + u16 uid; + + SET_UID(uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid)); + if (put_user(uid, (u16 __user *)argp)) + return -EFAULT; + return 0; + } + case NCP_IOC_GETMOUNTUID32: + { + uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); + if (put_user(uid, (u32 __user *)argp)) + return -EFAULT; + return 0; + } + case NCP_IOC_GETMOUNTUID64: + { + uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); + if (put_user(uid, (u64 __user *)argp)) + return -EFAULT; + return 0; + } + case NCP_IOC_GETROOT: + { + struct ncp_setroot_ioctl sr; + + result = -EACCES; + mutex_lock(&server->root_setup_lock); + if (server->m.mounted_vol[0]) { + struct dentry* dentry = inode->i_sb->s_root; + + if (dentry) { + struct inode* s_inode = d_inode(dentry); + + if (s_inode) { + sr.volNumber = NCP_FINFO(s_inode)->volNumber; + sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum; + sr.namespace = server->name_space[sr.volNumber]; + result = 0; + } else + ncp_dbg(1, "d_inode(s_root)==NULL\n"); + } else + ncp_dbg(1, "s_root==NULL\n"); + } else { + sr.volNumber = -1; + sr.namespace = 0; + sr.dirEntNum = 0; + result = 0; + } + mutex_unlock(&server->root_setup_lock); + if (!result && copy_to_user(argp, &sr, sizeof(sr))) + result = -EFAULT; + return result; + } + + case NCP_IOC_SETROOT: + { + struct ncp_setroot_ioctl sr; + __u32 vnum; + __le32 de; + __le32 dosde; + struct dentry* dentry; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) + result = -EBUSY; + else { + if (sr.volNumber < 0) { + server->m.mounted_vol[0] = 0; + vnum = NCP_NUMBER_OF_VOLUMES; + de = 0; + dosde = 0; + result = 0; + } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) { + result = -EINVAL; + } else if (ncp_mount_subdir(server, sr.volNumber, + sr.namespace, sr.dirEntNum, + &vnum, &de, &dosde)) { + result = -ENOENT; + } else + result = 0; + + if (result == 0) { + dentry = inode->i_sb->s_root; + if (dentry) { + struct inode* s_inode = d_inode(dentry); + + if (s_inode) { + NCP_FINFO(s_inode)->volNumber = vnum; + NCP_FINFO(s_inode)->dirEntNum = de; + NCP_FINFO(s_inode)->DosDirNum = dosde; + server->root_setuped = 1; + } else { + ncp_dbg(1, "d_inode(s_root)==NULL\n"); + result = -EIO; + } + } else { + ncp_dbg(1, "s_root==NULL\n"); + result = -EIO; + } + } + } + mutex_unlock(&server->root_setup_lock); + + return result; + } + +#ifdef CONFIG_NCPFS_PACKET_SIGNING + case NCP_IOC_SIGN_INIT: + { + struct ncp_sign_init sign; + + if (argp) + if (copy_from_user(&sign, argp, sizeof(sign))) + return -EFAULT; + ncp_lock_server(server); + mutex_lock(&server->rcv.creq_mutex); + if (argp) { + if (server->sign_wanted) { + memcpy(server->sign_root,sign.sign_root,8); + memcpy(server->sign_last,sign.sign_last,16); + server->sign_active = 1; + } + /* ignore when signatures not wanted */ + } else { + server->sign_active = 0; + } + mutex_unlock(&server->rcv.creq_mutex); + ncp_unlock_server(server); + return 0; + } + + case NCP_IOC_SIGN_WANTED: + { + int state; + + ncp_lock_server(server); + state = server->sign_wanted; + ncp_unlock_server(server); + if (put_user(state, (int __user *)argp)) + return -EFAULT; + return 0; + } + + case NCP_IOC_SET_SIGN_WANTED: + { + int newstate; + + /* get only low 8 bits... */ + if (get_user(newstate, (unsigned char __user *)argp)) + return -EFAULT; + result = 0; + ncp_lock_server(server); + if (server->sign_active) { + /* cannot turn signatures OFF when active */ + if (!newstate) + result = -EINVAL; + } else { + server->sign_wanted = newstate != 0; + } + ncp_unlock_server(server); + return result; + } + +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING + case NCP_IOC_LOCKUNLOCK: + { + struct ncp_lock_ioctl rqdata; + + if (copy_from_user(&rqdata, argp, sizeof(rqdata))) + return -EFAULT; + if (rqdata.origin != 0) + return -EINVAL; + /* check for cmd */ + switch (rqdata.cmd) { + case NCP_LOCK_EX: + case NCP_LOCK_SH: + if (rqdata.timeout == 0) + rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; + else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT) + rqdata.timeout = NCP_LOCK_MAX_TIMEOUT; + break; + case NCP_LOCK_LOG: + rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; /* has no effect */ + case NCP_LOCK_CLEAR: + break; + default: + return -EINVAL; + } + /* locking needs both read and write access */ + if ((result = ncp_make_open(inode, O_RDWR)) != 0) + { + return result; + } + result = -EISDIR; + if (!S_ISREG(inode->i_mode)) + goto outrel; + if (rqdata.cmd == NCP_LOCK_CLEAR) + { + result = ncp_ClearPhysicalRecord(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + rqdata.offset, + rqdata.length); + if (result > 0) result = 0; /* no such lock */ + } + else + { + int lockcmd; + + switch (rqdata.cmd) + { + case NCP_LOCK_EX: lockcmd=1; break; + case NCP_LOCK_SH: lockcmd=3; break; + default: lockcmd=0; break; + } + result = ncp_LogPhysicalRecord(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + lockcmd, + rqdata.offset, + rqdata.length, + rqdata.timeout); + if (result > 0) result = -EAGAIN; + } +outrel: + ncp_inode_close(inode); + return result; + } +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +#ifdef CONFIG_COMPAT + case NCP_IOC_GETOBJECTNAME_32: + { + struct compat_ncp_objectname_ioctl user; + size_t outl; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + down_read(&server->auth_rwsem); + user.auth_type = server->auth.auth_type; + outl = user.object_name_len; + user.object_name_len = server->auth.object_name_len; + if (outl > user.object_name_len) + outl = user.object_name_len; + result = 0; + if (outl) { + if (copy_to_user(compat_ptr(user.object_name), + server->auth.object_name, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (!result && copy_to_user(argp, &user, sizeof(user))) + result = -EFAULT; + return result; + } +#endif + + case NCP_IOC_GETOBJECTNAME: + { + struct ncp_objectname_ioctl user; + size_t outl; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + down_read(&server->auth_rwsem); + user.auth_type = server->auth.auth_type; + outl = user.object_name_len; + user.object_name_len = server->auth.object_name_len; + if (outl > user.object_name_len) + outl = user.object_name_len; + result = 0; + if (outl) { + if (copy_to_user(user.object_name, + server->auth.object_name, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (!result && copy_to_user(argp, &user, sizeof(user))) + result = -EFAULT; + return result; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_SETOBJECTNAME_32: +#endif + case NCP_IOC_SETOBJECTNAME: + { + struct ncp_objectname_ioctl user; + void* newname; + void* oldname; + size_t oldnamelen; + void* oldprivate; + size_t oldprivatelen; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_SETOBJECTNAME_32) { + struct compat_ncp_objectname_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.auth_type = user32.auth_type; + user.object_name_len = user32.object_name_len; + user.object_name = compat_ptr(user32.object_name); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) + return -ENOMEM; + if (user.object_name_len) { + newname = memdup_user(user.object_name, + user.object_name_len); + if (IS_ERR(newname)) + return PTR_ERR(newname); + } else { + newname = NULL; + } + down_write(&server->auth_rwsem); + oldname = server->auth.object_name; + oldnamelen = server->auth.object_name_len; + oldprivate = server->priv.data; + oldprivatelen = server->priv.len; + server->auth.auth_type = user.auth_type; + server->auth.object_name_len = user.object_name_len; + server->auth.object_name = newname; + server->priv.len = 0; + server->priv.data = NULL; + up_write(&server->auth_rwsem); + kfree(oldprivate); + kfree(oldname); + return 0; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_GETPRIVATEDATA_32: +#endif + case NCP_IOC_GETPRIVATEDATA: + { + struct ncp_privatedata_ioctl user; + size_t outl; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_GETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.len = user32.len; + user.data = compat_ptr(user32.data); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + down_read(&server->auth_rwsem); + outl = user.len; + user.len = server->priv.len; + if (outl > user.len) outl = user.len; + result = 0; + if (outl) { + if (copy_to_user(user.data, + server->priv.data, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (result) + return result; +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_GETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + user32.len = user.len; + user32.data = (unsigned long) user.data; + if (copy_to_user(argp, &user32, sizeof(user32))) + return -EFAULT; + } else +#endif + if (copy_to_user(argp, &user, sizeof(user))) + return -EFAULT; + + return 0; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_SETPRIVATEDATA_32: +#endif + case NCP_IOC_SETPRIVATEDATA: + { + struct ncp_privatedata_ioctl user; + void* new; + void* old; + size_t oldlen; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_SETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.len = user32.len; + user.data = compat_ptr(user32.data); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + if (user.len > NCP_PRIVATE_DATA_MAX_LEN) + return -ENOMEM; + if (user.len) { + new = memdup_user(user.data, user.len); + if (IS_ERR(new)) + return PTR_ERR(new); + } else { + new = NULL; + } + down_write(&server->auth_rwsem); + old = server->priv.data; + oldlen = server->priv.len; + server->priv.len = user.len; + server->priv.data = new; + up_write(&server->auth_rwsem); + kfree(old); + return 0; + } + +#ifdef CONFIG_NCPFS_NLS + case NCP_IOC_SETCHARSETS: + return ncp_set_charsets(server, argp); + + case NCP_IOC_GETCHARSETS: + return ncp_get_charsets(server, argp); + +#endif /* CONFIG_NCPFS_NLS */ + + case NCP_IOC_SETDENTRYTTL: + { + u_int32_t user; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + /* 20 secs at most... */ + if (user > 20000) + return -EINVAL; + user = (user * HZ) / 1000; + atomic_set(&server->dentry_ttl, user); + return 0; + } + + case NCP_IOC_GETDENTRYTTL: + { + u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ; + if (copy_to_user(argp, &user, sizeof(user))) + return -EFAULT; + return 0; + } + + } + return -EINVAL; +} + +long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct ncp_server *server = NCP_SERVER(inode); + kuid_t uid = current_uid(); + int need_drop_write = 0; + long ret; + + switch (cmd) { + case NCP_IOC_SETCHARSETS: + case NCP_IOC_CONN_LOGGED_IN: + case NCP_IOC_SETROOT: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + break; + } + if (!uid_eq(server->m.mounted_uid, uid)) { + switch (cmd) { + /* + * Only mount owner can issue these ioctls. Information + * necessary to authenticate to other NDS servers are + * stored here. + */ + case NCP_IOC_GETOBJECTNAME: + case NCP_IOC_SETOBJECTNAME: + case NCP_IOC_GETPRIVATEDATA: + case NCP_IOC_SETPRIVATEDATA: +#ifdef CONFIG_COMPAT + case NCP_IOC_GETOBJECTNAME_32: + case NCP_IOC_SETOBJECTNAME_32: + case NCP_IOC_GETPRIVATEDATA_32: + case NCP_IOC_SETPRIVATEDATA_32: +#endif + ret = -EACCES; + goto out; + /* + * These require write access on the inode if user id + * does not match. Note that they do not write to the + * file... But old code did mnt_want_write, so I keep + * it as is. Of course not for mountpoint owner, as + * that breaks read-only mounts altogether as ncpmount + * needs working NCP_IOC_NCPREQUEST and + * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl, + * signinit, setsignwanted) should be probably restricted + * to owner only, or even more to CAP_SYS_ADMIN). + */ + case NCP_IOC_GET_FS_INFO: + case NCP_IOC_GET_FS_INFO_V2: + case NCP_IOC_NCPREQUEST: + case NCP_IOC_SETDENTRYTTL: + case NCP_IOC_SIGN_INIT: + case NCP_IOC_LOCKUNLOCK: + case NCP_IOC_SET_SIGN_WANTED: +#ifdef CONFIG_COMPAT + case NCP_IOC_GET_FS_INFO_V2_32: + case NCP_IOC_NCPREQUEST_32: +#endif + ret = mnt_want_write_file(filp); + if (ret) + goto out; + need_drop_write = 1; + ret = inode_permission(inode, MAY_WRITE); + if (ret) + goto outDropWrite; + break; + /* + * Read access required. + */ + case NCP_IOC_GETMOUNTUID16: + case NCP_IOC_GETMOUNTUID32: + case NCP_IOC_GETMOUNTUID64: + case NCP_IOC_GETROOT: + case NCP_IOC_SIGN_WANTED: + ret = inode_permission(inode, MAY_READ); + if (ret) + goto out; + break; + /* + * Anybody can read these. + */ + case NCP_IOC_GETCHARSETS: + case NCP_IOC_GETDENTRYTTL: + default: + /* Three codes below are protected by CAP_SYS_ADMIN above. */ + case NCP_IOC_SETCHARSETS: + case NCP_IOC_CONN_LOGGED_IN: + case NCP_IOC_SETROOT: + break; + } + } + ret = __ncp_ioctl(inode, cmd, arg); +outDropWrite: + if (need_drop_write) + mnt_drop_write_file(filp); +out: + return ret; +} + +#ifdef CONFIG_COMPAT +long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long ret; + + arg = (unsigned long) compat_ptr(arg); + ret = ncp_ioctl(file, cmd, arg); + return ret; +} +#endif diff --git a/kernel/fs/ncpfs/mmap.c b/kernel/fs/ncpfs/mmap.c new file mode 100644 index 000000000..33b873b25 --- /dev/null +++ b/kernel/fs/ncpfs/mmap.c @@ -0,0 +1,125 @@ +/* + * mmap.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ncp_fs.h" + +/* + * Fill in the supplied page for mmap + * XXX: how are we excluding truncate/invalidate here? Maybe need to lock + * page? + */ +static int ncp_file_mmap_fault(struct vm_area_struct *area, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(area->vm_file); + char *pg_addr; + unsigned int already_read; + unsigned int count; + int bufsize; + int pos; /* XXX: loff_t ? */ + + /* + * ncpfs has nothing against high pages as long + * as recvmsg and memset works on it + */ + vmf->page = alloc_page(GFP_HIGHUSER); + if (!vmf->page) + return VM_FAULT_OOM; + pg_addr = kmap(vmf->page); + pos = vmf->pgoff << PAGE_SHIFT; + + count = PAGE_SIZE; + /* what we can read in one go */ + bufsize = NCP_SERVER(inode)->buffer_size; + + already_read = 0; + if (ncp_make_open(inode, O_RDONLY) >= 0) { + while (already_read < count) { + int read_this_time; + int to_read; + + to_read = bufsize - (pos % bufsize); + + to_read = min_t(unsigned int, to_read, count - already_read); + + if (ncp_read_kernel(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_read, + pg_addr + already_read, + &read_this_time) != 0) { + read_this_time = 0; + } + pos += read_this_time; + already_read += read_this_time; + + if (read_this_time < to_read) { + break; + } + } + ncp_inode_close(inode); + + } + + if (already_read < PAGE_SIZE) + memset(pg_addr + already_read, 0, PAGE_SIZE - already_read); + flush_dcache_page(vmf->page); + kunmap(vmf->page); + + /* + * If I understand ncp_read_kernel() properly, the above always + * fetches from the network, here the analogue of disk. + * -- nyc + */ + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); + return VM_FAULT_MAJOR; +} + +static const struct vm_operations_struct ncp_file_mmap = +{ + .fault = ncp_file_mmap_fault, +}; + + +/* This is used for a general mmap of a ncp file */ +int ncp_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + + ncp_dbg(1, "called\n"); + + if (!ncp_conn_valid(NCP_SERVER(inode))) + return -EIO; + + /* only PAGE_COW or read-only supported now */ + if (vma->vm_flags & VM_SHARED) + return -EINVAL; + /* we do not support files bigger than 4GB... We eventually + supports just 4GB... */ + if (vma_pages(vma) + vma->vm_pgoff + > (1U << (32 - PAGE_SHIFT))) + return -EFBIG; + + vma->vm_ops = &ncp_file_mmap; + file_accessed(file); + return 0; +} diff --git a/kernel/fs/ncpfs/ncp_fs.h b/kernel/fs/ncpfs/ncp_fs.h new file mode 100644 index 000000000..b9f69e1b1 --- /dev/null +++ b/kernel/fs/ncpfs/ncp_fs.h @@ -0,0 +1,100 @@ +#include +#include "ncp_fs_i.h" +#include "ncp_fs_sb.h" + +#undef NCPFS_PARANOIA +#ifdef NCPFS_PARANOIA +#define ncp_vdbg(fmt, ...) \ + pr_debug(fmt, ##__VA_ARGS__) +#else +#define ncp_vdbg(fmt, ...) \ +do { \ + if (0) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) +#endif + +#ifndef DEBUG_NCP +#define DEBUG_NCP 0 +#endif + +#if DEBUG_NCP > 0 && !defined(DEBUG) +#define DEBUG +#endif + +#define ncp_dbg(level, fmt, ...) \ +do { \ + if (level <= DEBUG_NCP) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) + +#define NCP_MAX_RPC_TIMEOUT (6*HZ) + + +struct ncp_entry_info { + struct nw_info_struct i; + ino_t ino; + int opened; + int access; + unsigned int volume; + __u8 file_handle[6]; +}; + +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) +{ + return container_of(inode, struct ncp_inode_info, vfs_inode); +} + +/* linux/fs/ncpfs/inode.c */ +int ncp_notify_change(struct dentry *, struct iattr *); +struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *); +void ncp_update_inode(struct inode *, struct ncp_entry_info *); +void ncp_update_inode2(struct inode *, struct ncp_entry_info *); + +/* linux/fs/ncpfs/dir.c */ +extern const struct inode_operations ncp_dir_inode_operations; +extern const struct file_operations ncp_dir_operations; +extern const struct dentry_operations ncp_dentry_operations; +int ncp_conn_logged_in(struct super_block *); +int ncp_date_dos2unix(__le16 time, __le16 date); +void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); + +/* linux/fs/ncpfs/ioctl.c */ +long ncp_ioctl(struct file *, unsigned int, unsigned long); +long ncp_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* linux/fs/ncpfs/sock.c */ +int ncp_request2(struct ncp_server *server, int function, + void* reply, int max_reply_size); +static inline int ncp_request(struct ncp_server *server, int function) { + return ncp_request2(server, function, server->packet, server->packet_size); +} +int ncp_connect(struct ncp_server *server); +int ncp_disconnect(struct ncp_server *server); +void ncp_lock_server(struct ncp_server *server); +void ncp_unlock_server(struct ncp_server *server); + +/* linux/fs/ncpfs/symlink.c */ +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern const struct address_space_operations ncp_symlink_aops; +int ncp_symlink(struct inode*, struct dentry*, const char*); +#endif + +/* linux/fs/ncpfs/file.c */ +extern const struct inode_operations ncp_file_inode_operations; +extern const struct file_operations ncp_file_operations; +int ncp_make_open(struct inode *, int); + +/* linux/fs/ncpfs/mmap.c */ +int ncp_mmap(struct file *, struct vm_area_struct *); + +/* linux/fs/ncpfs/ncplib_kernel.c */ +int ncp_make_closed(struct inode *); + +#include "ncplib_kernel.h" diff --git a/kernel/fs/ncpfs/ncp_fs_i.h b/kernel/fs/ncpfs/ncp_fs_i.h new file mode 100644 index 000000000..c4794504f --- /dev/null +++ b/kernel/fs/ncpfs/ncp_fs_i.h @@ -0,0 +1,30 @@ +/* + * ncp_fs_i.h + * + * Copyright (C) 1995 Volker Lendecke + * + */ + +#ifndef _LINUX_NCP_FS_I +#define _LINUX_NCP_FS_I + +/* + * This is the ncpfs part of the inode structure. This must contain + * all the information we need to work with an inode after creation. + */ +struct ncp_inode_info { + __le32 dirEntNum; + __le32 DosDirNum; + __u8 volNumber; + __le32 nwattr; + struct mutex open_mutex; + atomic_t opened; + int access; + int flags; +#define NCPI_KLUDGE_SYMLINK 0x0001 +#define NCPI_DIR_CACHE 0x0002 + __u8 file_handle[6]; + struct inode vfs_inode; +}; + +#endif /* _LINUX_NCP_FS_I */ diff --git a/kernel/fs/ncpfs/ncp_fs_sb.h b/kernel/fs/ncpfs/ncp_fs_sb.h new file mode 100644 index 000000000..55e26fd80 --- /dev/null +++ b/kernel/fs/ncpfs/ncp_fs_sb.h @@ -0,0 +1,174 @@ +/* + * ncp_fs_sb.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * + */ + +#ifndef _NCP_FS_SB +#define _NCP_FS_SB + +#include +#include +#include +#include +#include +#include + +#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ + +struct sock; + +struct ncp_mount_data_kernel { + unsigned long flags; /* NCP_MOUNT_* flags */ + unsigned int int_flags; /* internal flags */ +#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 + kuid_t mounted_uid; /* Who may umount() this filesystem? */ + struct pid *wdog_pid; /* Who cares for our watchdog packets? */ + unsigned int ncp_fd; /* The socket to the ncp port */ + unsigned int time_out; /* How long should I wait after + sending a NCP request? */ + unsigned int retry_count; /* And how often should I retry? */ + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + kuid_t uid; + kgid_t gid; + umode_t file_mode; + umode_t dir_mode; + int info_fd; +}; + +struct ncp_server { + struct rcu_head rcu; + struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of + interest for us later, so we store + it completely. */ + + __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; + + struct socket *ncp_sock;/* ncp socket */ + struct socket *info_sock; + + u8 sequence; + u8 task; + u16 connection; /* Remote connection number */ + + u8 completion; /* Status message from server */ + u8 conn_status; /* Bit 4 = 1 ==> Server going down, no + requests allowed anymore. + Bit 0 = 1 ==> Server is down. */ + + int buffer_size; /* Negotiated bufsize */ + + int reply_size; /* Size of last reply */ + + int packet_size; + unsigned char *packet; /* Here we prepare requests and + receive replies */ + unsigned char *txbuf; /* Storage for current request */ + unsigned char *rxbuf; /* Storage for reply to current request */ + + int lock; /* To prevent mismatch in protocols. */ + struct mutex mutex; + + int current_size; /* for packet preparation */ + int has_subfunction; + int ncp_reply_size; + + int root_setuped; + struct mutex root_setup_lock; + + /* info for packet signing */ + int sign_wanted; /* 1=Server needs signed packets */ + int sign_active; /* 0=don't do signing, 1=do */ + char sign_root[8]; /* generated from password and encr. key */ + char sign_last[16]; + + /* Authentication info: NDS or BINDERY, username */ + struct { + int auth_type; + size_t object_name_len; + void* object_name; + int object_type; + } auth; + /* Password info */ + struct { + size_t len; + void* data; + } priv; + struct rw_semaphore auth_rwsem; + + /* nls info: codepage for volume and charset for I/O */ + struct nls_table *nls_vol; + struct nls_table *nls_io; + + /* maximum age in jiffies */ + atomic_t dentry_ttl; + + /* miscellaneous */ + unsigned int flags; + + spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ + + void (*data_ready)(struct sock* sk); + void (*error_report)(struct sock* sk); + void (*write_space)(struct sock* sk); /* STREAM mode only */ + struct { + struct work_struct tq; /* STREAM/DGRAM: data/error ready */ + struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ + struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */ + + unsigned int state; /* STREAM only: receiver state */ + struct { + __u32 magic __packed; + __u32 len __packed; + __u16 type __packed; + __u16 p1 __packed; + __u16 p2 __packed; + __u16 p3 __packed; + __u16 type2 __packed; + } buf; /* STREAM only: temporary buffer */ + unsigned char* ptr; /* STREAM only: pointer to data */ + size_t len; /* STREAM only: length of data to receive */ + } rcv; + struct { + struct list_head requests; /* STREAM only: queued requests */ + struct work_struct tq; /* STREAM only: transmitter ready */ + struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ + } tx; + struct timer_list timeout_tm; /* DGRAM only: timeout timer */ + struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ + int timeout_last; /* DGRAM only: current timeout length */ + int timeout_retries; /* DGRAM only: retries left */ + struct { + size_t len; + __u8 data[128]; + } unexpected_packet; + struct backing_dev_info bdi; +}; + +extern void ncp_tcp_rcv_proc(struct work_struct *work); +extern void ncp_tcp_tx_proc(struct work_struct *work); +extern void ncpdgram_rcv_proc(struct work_struct *work); +extern void ncpdgram_timeout_proc(struct work_struct *work); +extern void ncpdgram_timeout_call(unsigned long server); +extern void ncp_tcp_data_ready(struct sock* sk); +extern void ncp_tcp_write_space(struct sock* sk); +extern void ncp_tcp_error_report(struct sock* sk); + +#define NCP_FLAG_UTF8 1 + +#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag)) +#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag)) +#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag)) + +static inline int ncp_conn_valid(struct ncp_server *server) +{ + return ((server->conn_status & 0x11) == 0); +} + +static inline void ncp_invalidate_conn(struct ncp_server *server) +{ + server->conn_status |= 0x01; +} + +#endif diff --git a/kernel/fs/ncpfs/ncplib_kernel.c b/kernel/fs/ncpfs/ncplib_kernel.c new file mode 100644 index 000000000..88dbbc9fc --- /dev/null +++ b/kernel/fs/ncpfs/ncplib_kernel.c @@ -0,0 +1,1321 @@ +/* + * ncplib_kernel.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1999 Wolfram Pienkoss for NLS + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "ncp_fs.h" + +static inline void assert_server_locked(struct ncp_server *server) +{ + if (server->lock == 0) { + ncp_dbg(1, "server not locked!\n"); + } +} + +static void ncp_add_byte(struct ncp_server *server, __u8 x) +{ + assert_server_locked(server); + *(__u8 *) (&(server->packet[server->current_size])) = x; + server->current_size += 1; + return; +} + +static void ncp_add_word(struct ncp_server *server, __le16 x) +{ + assert_server_locked(server); + put_unaligned(x, (__le16 *) (&(server->packet[server->current_size]))); + server->current_size += 2; + return; +} + +static void ncp_add_be16(struct ncp_server *server, __u16 x) +{ + assert_server_locked(server); + put_unaligned(cpu_to_be16(x), (__be16 *) (&(server->packet[server->current_size]))); + server->current_size += 2; +} + +static void ncp_add_dword(struct ncp_server *server, __le32 x) +{ + assert_server_locked(server); + put_unaligned(x, (__le32 *) (&(server->packet[server->current_size]))); + server->current_size += 4; + return; +} + +static void ncp_add_be32(struct ncp_server *server, __u32 x) +{ + assert_server_locked(server); + put_unaligned(cpu_to_be32(x), (__be32 *)(&(server->packet[server->current_size]))); + server->current_size += 4; +} + +static inline void ncp_add_dword_lh(struct ncp_server *server, __u32 x) { + ncp_add_dword(server, cpu_to_le32(x)); +} + +static void ncp_add_mem(struct ncp_server *server, const void *source, int size) +{ + assert_server_locked(server); + memcpy(&(server->packet[server->current_size]), source, size); + server->current_size += size; + return; +} + +static void ncp_add_pstring(struct ncp_server *server, const char *s) +{ + int len = strlen(s); + assert_server_locked(server); + if (len > 255) { + ncp_dbg(1, "string too long: %s\n", s); + len = 255; + } + ncp_add_byte(server, len); + ncp_add_mem(server, s, len); + return; +} + +static inline void ncp_init_request(struct ncp_server *server) +{ + ncp_lock_server(server); + + server->current_size = sizeof(struct ncp_request_header); + server->has_subfunction = 0; +} + +static inline void ncp_init_request_s(struct ncp_server *server, int subfunction) +{ + ncp_lock_server(server); + + server->current_size = sizeof(struct ncp_request_header) + 2; + ncp_add_byte(server, subfunction); + + server->has_subfunction = 1; +} + +static inline char * +ncp_reply_data(struct ncp_server *server, int offset) +{ + return &(server->packet[sizeof(struct ncp_reply_header) + offset]); +} + +static inline u8 BVAL(const void *data) +{ + return *(const u8 *)data; +} + +static u8 ncp_reply_byte(struct ncp_server *server, int offset) +{ + return *(const u8 *)ncp_reply_data(server, offset); +} + +static inline u16 WVAL_LH(const void *data) +{ + return get_unaligned_le16(data); +} + +static u16 +ncp_reply_le16(struct ncp_server *server, int offset) +{ + return get_unaligned_le16(ncp_reply_data(server, offset)); +} + +static u16 +ncp_reply_be16(struct ncp_server *server, int offset) +{ + return get_unaligned_be16(ncp_reply_data(server, offset)); +} + +static inline u32 DVAL_LH(const void *data) +{ + return get_unaligned_le32(data); +} + +static __le32 +ncp_reply_dword(struct ncp_server *server, int offset) +{ + return get_unaligned((__le32 *)ncp_reply_data(server, offset)); +} + +static inline __u32 ncp_reply_dword_lh(struct ncp_server* server, int offset) { + return le32_to_cpu(ncp_reply_dword(server, offset)); +} + +int +ncp_negotiate_buffersize(struct ncp_server *server, int size, int *target) +{ + int result; + + ncp_init_request(server); + ncp_add_be16(server, size); + + if ((result = ncp_request(server, 33)) != 0) { + ncp_unlock_server(server); + return result; + } + *target = min_t(unsigned int, ncp_reply_be16(server, 0), size); + + ncp_unlock_server(server); + return 0; +} + + +/* options: + * bit 0 ipx checksum + * bit 1 packet signing + */ +int +ncp_negotiate_size_and_options(struct ncp_server *server, + int size, int options, int *ret_size, int *ret_options) { + int result; + + /* there is minimum */ + if (size < NCP_BLOCK_SIZE) size = NCP_BLOCK_SIZE; + + ncp_init_request(server); + ncp_add_be16(server, size); + ncp_add_byte(server, options); + + if ((result = ncp_request(server, 0x61)) != 0) + { + ncp_unlock_server(server); + return result; + } + + /* NCP over UDP returns 0 (!!!) */ + result = ncp_reply_be16(server, 0); + if (result >= NCP_BLOCK_SIZE) + size = min(result, size); + *ret_size = size; + *ret_options = ncp_reply_byte(server, 4); + + ncp_unlock_server(server); + return 0; +} + +int ncp_get_volume_info_with_number(struct ncp_server* server, + int n, struct ncp_volume_info* target) { + int result; + int len; + + ncp_init_request_s(server, 44); + ncp_add_byte(server, n); + + if ((result = ncp_request(server, 22)) != 0) { + goto out; + } + target->total_blocks = ncp_reply_dword_lh(server, 0); + target->free_blocks = ncp_reply_dword_lh(server, 4); + target->purgeable_blocks = ncp_reply_dword_lh(server, 8); + target->not_yet_purgeable_blocks = ncp_reply_dword_lh(server, 12); + target->total_dir_entries = ncp_reply_dword_lh(server, 16); + target->available_dir_entries = ncp_reply_dword_lh(server, 20); + target->sectors_per_block = ncp_reply_byte(server, 28); + + memset(&(target->volume_name), 0, sizeof(target->volume_name)); + + result = -EIO; + len = ncp_reply_byte(server, 29); + if (len > NCP_VOLNAME_LEN) { + ncp_dbg(1, "volume name too long: %d\n", len); + goto out; + } + memcpy(&(target->volume_name), ncp_reply_data(server, 30), len); + result = 0; +out: + ncp_unlock_server(server); + return result; +} + +int ncp_get_directory_info(struct ncp_server* server, __u8 n, + struct ncp_volume_info* target) { + int result; + int len; + + ncp_init_request_s(server, 45); + ncp_add_byte(server, n); + + if ((result = ncp_request(server, 22)) != 0) { + goto out; + } + target->total_blocks = ncp_reply_dword_lh(server, 0); + target->free_blocks = ncp_reply_dword_lh(server, 4); + target->purgeable_blocks = 0; + target->not_yet_purgeable_blocks = 0; + target->total_dir_entries = ncp_reply_dword_lh(server, 8); + target->available_dir_entries = ncp_reply_dword_lh(server, 12); + target->sectors_per_block = ncp_reply_byte(server, 20); + + memset(&(target->volume_name), 0, sizeof(target->volume_name)); + + result = -EIO; + len = ncp_reply_byte(server, 21); + if (len > NCP_VOLNAME_LEN) { + ncp_dbg(1, "volume name too long: %d\n", len); + goto out; + } + memcpy(&(target->volume_name), ncp_reply_data(server, 22), len); + result = 0; +out: + ncp_unlock_server(server); + return result; +} + +int +ncp_close_file(struct ncp_server *server, const char *file_id) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + + result = ncp_request(server, 66); + ncp_unlock_server(server); + return result; +} + +int +ncp_make_closed(struct inode *inode) +{ + int err; + + err = 0; + mutex_lock(&NCP_FINFO(inode)->open_mutex); + if (atomic_read(&NCP_FINFO(inode)->opened) == 1) { + atomic_set(&NCP_FINFO(inode)->opened, 0); + err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle); + + if (!err) + ncp_vdbg("volnum=%d, dirent=%u, error=%d\n", + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, err); + } + mutex_unlock(&NCP_FINFO(inode)->open_mutex); + return err; +} + +static void ncp_add_handle_path(struct ncp_server *server, __u8 vol_num, + __le32 dir_base, int have_dir_base, + const char *path) +{ + ncp_add_byte(server, vol_num); + ncp_add_dword(server, dir_base); + if (have_dir_base != 0) { + ncp_add_byte(server, 1); /* dir_base */ + } else { + ncp_add_byte(server, 0xff); /* no handle */ + } + if (path != NULL) { + ncp_add_byte(server, 1); /* 1 component */ + ncp_add_pstring(server, path); + } else { + ncp_add_byte(server, 0); + } +} + +int ncp_dirhandle_alloc(struct ncp_server* server, __u8 volnum, __le32 dirent, + __u8* dirhandle) { + int result; + + ncp_init_request(server); + ncp_add_byte(server, 12); /* subfunction */ + ncp_add_byte(server, NW_NS_DOS); + ncp_add_byte(server, 0); + ncp_add_word(server, 0); + ncp_add_handle_path(server, volnum, dirent, 1, NULL); + if ((result = ncp_request(server, 87)) == 0) { + *dirhandle = ncp_reply_byte(server, 0); + } + ncp_unlock_server(server); + return result; +} + +int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) { + int result; + + ncp_init_request_s(server, 20); + ncp_add_byte(server, dirhandle); + result = ncp_request(server, 22); + ncp_unlock_server(server); + return result; +} + +void ncp_extract_file_info(const void *structure, struct nw_info_struct *target) +{ + const __u8 *name_len; + const int info_struct_size = offsetof(struct nw_info_struct, nameLen); + + memcpy(target, structure, info_struct_size); + name_len = structure + info_struct_size; + target->nameLen = *name_len; + memcpy(target->entryName, name_len + 1, *name_len); + target->entryName[*name_len] = '\0'; + target->volNumber = le32_to_cpu(target->volNumber); + return; +} + +#ifdef CONFIG_NCPFS_NFS_NS +static inline void ncp_extract_nfs_info(const unsigned char *structure, + struct nw_nfs_info *target) +{ + target->mode = DVAL_LH(structure); + target->rdev = DVAL_LH(structure + 8); +} +#endif + +int ncp_obtain_nfs_info(struct ncp_server *server, + struct nw_info_struct *target) + +{ + int result = 0; +#ifdef CONFIG_NCPFS_NFS_NS + __u32 volnum = target->volNumber; + + if (ncp_is_nfs_extras(server, volnum)) { + ncp_init_request(server); + ncp_add_byte(server, 19); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, NW_NS_NFS); + ncp_add_byte(server, 0); + ncp_add_byte(server, volnum); + ncp_add_dword(server, target->dirEntNum); + /* We must retrieve both nlinks and rdev, otherwise some server versions + report zeroes instead of valid data */ + ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV); + + if ((result = ncp_request(server, 87)) == 0) { + ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs); + ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n", + target->entryName, target->nfs.mode, + target->nfs.rdev); + } else { + target->nfs.mode = 0; + target->nfs.rdev = 0; + } + ncp_unlock_server(server); + + } else +#endif + { + target->nfs.mode = 0; + target->nfs.rdev = 0; + } + return result; +} + +/* + * Returns information for a (one-component) name relative to + * the specified directory. + */ +int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path, + struct nw_info_struct *target) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + if (target == NULL) { + pr_err("%s: invalid call\n", __func__); + return -EINVAL; + } + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, server->name_space[volnum]); /* N.B. twice ?? */ + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_ALL); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + if ((result = ncp_request(server, 87)) != 0) + goto out; + ncp_extract_file_info(ncp_reply_data(server, 0), target); + ncp_unlock_server(server); + + result = ncp_obtain_nfs_info(server, target); + return result; + +out: + ncp_unlock_server(server); + return result; +} + +#ifdef CONFIG_NCPFS_NFS_NS +static int +ncp_obtain_DOS_dir_base(struct ncp_server *server, + __u8 ns, __u8 volnum, __le32 dirent, + const char *path, /* At most 1 component */ + __le32 *DOS_dir_base) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, ns); + ncp_add_byte(server, ns); + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_DIRECTORY); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + if ((result = ncp_request(server, 87)) == 0) + { + if (DOS_dir_base) *DOS_dir_base=ncp_reply_dword(server, 0x34); + } + ncp_unlock_server(server); + return result; +} +#endif /* CONFIG_NCPFS_NFS_NS */ + +static inline int +ncp_get_known_namespace(struct ncp_server *server, __u8 volume) +{ +#if defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) + int result; + __u8 *namespace; + __u16 no_namespaces; + + ncp_init_request(server); + ncp_add_byte(server, 24); /* Subfunction: Get Name Spaces Loaded */ + ncp_add_word(server, 0); + ncp_add_byte(server, volume); + + if ((result = ncp_request(server, 87)) != 0) { + ncp_unlock_server(server); + return NW_NS_DOS; /* not result ?? */ + } + + result = NW_NS_DOS; + no_namespaces = ncp_reply_le16(server, 0); + namespace = ncp_reply_data(server, 2); + + while (no_namespaces > 0) { + ncp_dbg(1, "found %d on %d\n", *namespace, volume); + +#ifdef CONFIG_NCPFS_NFS_NS + if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS)) + { + result = NW_NS_NFS; + break; + } +#endif /* CONFIG_NCPFS_NFS_NS */ +#ifdef CONFIG_NCPFS_OS2_NS + if ((*namespace == NW_NS_OS2) && !(server->m.flags&NCP_MOUNT_NO_OS2)) + { + result = NW_NS_OS2; + } +#endif /* CONFIG_NCPFS_OS2_NS */ + namespace += 1; + no_namespaces -= 1; + } + ncp_unlock_server(server); + return result; +#else /* neither OS2 nor NFS - only DOS */ + return NW_NS_DOS; +#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ +} + +int +ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns) +{ + int ns = ncp_get_known_namespace(server, volume); + + if (ret_ns) + *ret_ns = ns; + + ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]); + + if (server->name_space[volume] == ns) + return 0; + server->name_space[volume] = ns; + return 1; +} + +static int +ncp_ObtainSpecificDirBase(struct ncp_server *server, + __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, + const char *path, /* At most 1 component */ + __le32 *dirEntNum, __le32 *DosDirNum) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, nsSrc); + ncp_add_byte(server, nsDst); + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_ALL); + ncp_add_handle_path(server, vol_num, dir_base, 1, path); + + if ((result = ncp_request(server, 87)) != 0) + { + ncp_unlock_server(server); + return result; + } + + if (dirEntNum) + *dirEntNum = ncp_reply_dword(server, 0x30); + if (DosDirNum) + *DosDirNum = ncp_reply_dword(server, 0x34); + ncp_unlock_server(server); + return 0; +} + +int +ncp_mount_subdir(struct ncp_server *server, + __u8 volNumber, __u8 srcNS, __le32 dirEntNum, + __u32* volume, __le32* newDirEnt, __le32* newDosEnt) +{ + int dstNS; + int result; + + ncp_update_known_namespace(server, volNumber, &dstNS); + if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, + dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) + { + return result; + } + *volume = volNumber; + server->m.mounted_vol[1] = 0; + server->m.mounted_vol[0] = 'X'; + return 0; +} + +int +ncp_get_volume_root(struct ncp_server *server, + const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent) +{ + int result; + + ncp_dbg(1, "looking up vol %s\n", volname); + + ncp_init_request(server); + ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */ + ncp_add_byte(server, 0); /* DOS namespace */ + ncp_add_byte(server, 0); /* reserved */ + ncp_add_byte(server, 0); /* reserved */ + ncp_add_byte(server, 0); /* reserved */ + + ncp_add_byte(server, 0); /* faked volume number */ + ncp_add_dword(server, 0); /* faked dir_base */ + ncp_add_byte(server, 0xff); /* Don't have a dir_base */ + ncp_add_byte(server, 1); /* 1 path component */ + ncp_add_pstring(server, volname); + + if ((result = ncp_request(server, 87)) != 0) { + ncp_unlock_server(server); + return result; + } + *dirent = *dosdirent = ncp_reply_dword(server, 4); + *volume = ncp_reply_byte(server, 8); + ncp_unlock_server(server); + return 0; +} + +int +ncp_lookup_volume(struct ncp_server *server, + const char *volname, struct nw_info_struct *target) +{ + int result; + + memset(target, 0, sizeof(*target)); + result = ncp_get_volume_root(server, volname, + &target->volNumber, &target->dirEntNum, &target->DosDirNum); + if (result) { + return result; + } + ncp_update_known_namespace(server, target->volNumber, NULL); + target->nameLen = strlen(volname); + memcpy(target->entryName, volname, target->nameLen+1); + target->attributes = aDIR; + /* set dates to Jan 1, 1986 00:00 */ + target->creationTime = target->modifyTime = cpu_to_le16(0x0000); + target->creationDate = target->modifyDate = target->lastAccessDate = cpu_to_le16(0x0C21); + target->nfs.mode = 0; + return 0; +} + +int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *server, + struct inode *dir, + const char *path, + __le32 info_mask, + const struct nw_modify_dos_info *info) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 7); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_word(server, cpu_to_le16(0x8006)); /* search attribs: all */ + + ncp_add_dword(server, info_mask); + ncp_add_mem(server, info, sizeof(*info)); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + result = ncp_request(server, 87); + ncp_unlock_server(server); + return result; +} + +int ncp_modify_file_or_subdir_dos_info(struct ncp_server *server, + struct inode *dir, + __le32 info_mask, + const struct nw_modify_dos_info *info) +{ + return ncp_modify_file_or_subdir_dos_info_path(server, dir, NULL, + info_mask, info); +} + +#ifdef CONFIG_NCPFS_NFS_NS +int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent, + __u32 mode, __u32 rdev) + +{ + int result = 0; + + ncp_init_request(server); + if (server->name_space[volnum] == NW_NS_NFS) { + ncp_add_byte(server, 25); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, NW_NS_NFS); + ncp_add_byte(server, volnum); + ncp_add_dword(server, dirent); + /* we must always operate on both nlinks and rdev, otherwise + rdev is not set */ + ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV); + ncp_add_dword_lh(server, mode); + ncp_add_dword_lh(server, 1); /* nlinks */ + ncp_add_dword_lh(server, rdev); + result = ncp_request(server, 87); + } + ncp_unlock_server(server); + return result; +} +#endif + + +static int +ncp_DeleteNSEntry(struct ncp_server *server, + __u8 have_dir_base, __u8 volnum, __le32 dirent, + const char* name, __u8 ns, __le16 attr) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 8); /* subfunction */ + ncp_add_byte(server, ns); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_word(server, attr); /* search attribs: all */ + ncp_add_handle_path(server, volnum, dirent, have_dir_base, name); + + result = ncp_request(server, 87); + ncp_unlock_server(server); + return result; +} + +int +ncp_del_file_or_subdir2(struct ncp_server *server, + struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + __u8 volnum; + __le32 dirent; + + if (!inode) { + return 0xFF; /* Any error */ + } + volnum = NCP_FINFO(inode)->volNumber; + dirent = NCP_FINFO(inode)->DosDirNum; + return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); +} + +int +ncp_del_file_or_subdir(struct ncp_server *server, + struct inode *dir, const char *name) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int name_space; + + name_space = server->name_space[volnum]; +#ifdef CONFIG_NCPFS_NFS_NS + if (name_space == NW_NS_NFS) + { + int result; + + result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent); + if (result) return result; + name = NULL; + name_space = NW_NS_DOS; + } +#endif /* CONFIG_NCPFS_NFS_NS */ + return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006)); +} + +static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) +{ + __le16 *dest = (__le16 *) ret; + dest[1] = cpu_to_le16(v0); + dest[2] = cpu_to_le16(v1); + dest[0] = cpu_to_le16(v0 + 1); + return; +} + +/* If both dir and name are NULL, then in target there's already a + looked-up entry that wants to be opened. */ +int ncp_open_create_file_or_subdir(struct ncp_server *server, + struct inode *dir, const char *name, + int open_create_mode, + __le32 create_attributes, + __le16 desired_acc_rights, + struct ncp_entry_info *target) +{ + __le16 search_attribs = cpu_to_le16(0x0006); + __u8 volnum; + __le32 dirent; + int result; + + volnum = NCP_FINFO(dir)->volNumber; + dirent = NCP_FINFO(dir)->dirEntNum; + + if ((create_attributes & aDIR) != 0) { + search_attribs |= cpu_to_le16(0x8000); + } + ncp_init_request(server); + ncp_add_byte(server, 1); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, open_create_mode); + ncp_add_word(server, search_attribs); + ncp_add_dword(server, RIM_ALL); + ncp_add_dword(server, create_attributes); + /* The desired acc rights seem to be the inherited rights mask + for directories */ + ncp_add_word(server, desired_acc_rights); + ncp_add_handle_path(server, volnum, dirent, 1, name); + + if ((result = ncp_request(server, 87)) != 0) + goto out; + if (!(create_attributes & aDIR)) + target->opened = 1; + + /* in target there's a new finfo to fill */ + ncp_extract_file_info(ncp_reply_data(server, 6), &(target->i)); + target->volume = target->i.volNumber; + ConvertToNWfromDWORD(ncp_reply_le16(server, 0), + ncp_reply_le16(server, 2), + target->file_handle); + + ncp_unlock_server(server); + + (void)ncp_obtain_nfs_info(server, &(target->i)); + return 0; + +out: + ncp_unlock_server(server); + return result; +} + +int +ncp_initialize_search(struct ncp_server *server, struct inode *dir, + struct nw_search_sequence *target) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 2); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_handle_path(server, volnum, dirent, 1, NULL); + + result = ncp_request(server, 87); + if (result) + goto out; + memcpy(target, ncp_reply_data(server, 0), sizeof(*target)); + +out: + ncp_unlock_server(server); + return result; +} + +int ncp_search_for_fileset(struct ncp_server *server, + struct nw_search_sequence *seq, + int* more, + int* cnt, + char* buffer, + size_t bufsize, + char** rbuf, + size_t* rsize) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 20); + ncp_add_byte(server, server->name_space[seq->volNumber]); + ncp_add_byte(server, 0); /* datastream */ + ncp_add_word(server, cpu_to_le16(0x8006)); + ncp_add_dword(server, RIM_ALL); + ncp_add_word(server, cpu_to_le16(32767)); /* max returned items */ + ncp_add_mem(server, seq, 9); +#ifdef CONFIG_NCPFS_NFS_NS + if (server->name_space[seq->volNumber] == NW_NS_NFS) { + ncp_add_byte(server, 0); /* 0 byte pattern */ + } else +#endif + { + ncp_add_byte(server, 2); /* 2 byte pattern */ + ncp_add_byte(server, 0xff); /* following is a wildcard */ + ncp_add_byte(server, '*'); + } + result = ncp_request2(server, 87, buffer, bufsize); + if (result) { + ncp_unlock_server(server); + return result; + } + if (server->ncp_reply_size < 12) { + ncp_unlock_server(server); + return 0xFF; + } + *rsize = server->ncp_reply_size - 12; + ncp_unlock_server(server); + buffer = buffer + sizeof(struct ncp_reply_header); + *rbuf = buffer + 12; + *cnt = WVAL_LH(buffer + 10); + *more = BVAL(buffer + 9); + memcpy(seq, buffer, 9); + return 0; +} + +static int +ncp_RenameNSEntry(struct ncp_server *server, + struct inode *old_dir, const char *old_name, __le16 old_type, + struct inode *new_dir, const char *new_name) +{ + int result = -EINVAL; + + if ((old_dir == NULL) || (old_name == NULL) || + (new_dir == NULL) || (new_name == NULL)) + goto out; + + ncp_init_request(server); + ncp_add_byte(server, 4); /* subfunction */ + ncp_add_byte(server, server->name_space[NCP_FINFO(old_dir)->volNumber]); + ncp_add_byte(server, 1); /* rename flag */ + ncp_add_word(server, old_type); /* search attributes */ + + /* source Handle Path */ + ncp_add_byte(server, NCP_FINFO(old_dir)->volNumber); + ncp_add_dword(server, NCP_FINFO(old_dir)->dirEntNum); + ncp_add_byte(server, 1); + ncp_add_byte(server, 1); /* 1 source component */ + + /* dest Handle Path */ + ncp_add_byte(server, NCP_FINFO(new_dir)->volNumber); + ncp_add_dword(server, NCP_FINFO(new_dir)->dirEntNum); + ncp_add_byte(server, 1); + ncp_add_byte(server, 1); /* 1 destination component */ + + /* source path string */ + ncp_add_pstring(server, old_name); + /* dest path string */ + ncp_add_pstring(server, new_name); + + result = ncp_request(server, 87); + ncp_unlock_server(server); +out: + return result; +} + +int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, + struct inode *old_dir, const char *old_name, + struct inode *new_dir, const char *new_name) +{ + int result; + __le16 old_type = cpu_to_le16(0x06); + +/* If somebody can do it atomic, call me... vandrove@vc.cvut.cz */ + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + if (result == 0xFF) /* File Not Found, try directory */ + { + old_type = cpu_to_le16(0x16); + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + } + if (result != 0x92) return result; /* All except NO_FILES_RENAMED */ + result = ncp_del_file_or_subdir(server, new_dir, new_name); + if (result != 0) return -EACCES; + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + return result; +} + + +/* We have to transfer to/from user space */ +int +ncp_read_kernel(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_read, char *target, int *bytes_read) +{ + const char *source; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_read); + + if ((result = ncp_request(server, 72)) != 0) { + goto out; + } + *bytes_read = ncp_reply_be16(server, 0); + source = ncp_reply_data(server, 2 + (offset & 1)); + + memcpy(target, source, *bytes_read); +out: + ncp_unlock_server(server); + return result; +} + +/* There is a problem... egrep and some other silly tools do: + x = mmap(NULL, MAP_PRIVATE, PROT_READ|PROT_WRITE, , 32768); + read(, x, 32768); + Now copying read result by copy_to_user causes pagefault. This pagefault + could not be handled because of server was locked due to read. So we have + to use temporary buffer. So ncp_unlock_server must be done before + copy_to_user (and for write, copy_from_user must be done before + ncp_init_request... same applies for send raw packet ioctl). Because of + file is normally read in bigger chunks, caller provides kmalloced + (vmalloced) chunk of memory with size >= to_read... + */ +int +ncp_read_bounce(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_read, struct iov_iter *to, + int *bytes_read, void *bounce, __u32 bufsize) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_read); + result = ncp_request2(server, 72, bounce, bufsize); + ncp_unlock_server(server); + if (!result) { + int len = get_unaligned_be16((char *)bounce + + sizeof(struct ncp_reply_header)); + result = -EIO; + if (len <= to_read) { + char* source; + + source = (char*)bounce + + sizeof(struct ncp_reply_header) + 2 + + (offset & 1); + *bytes_read = len; + result = 0; + if (copy_to_iter(source, len, to) != len) + result = -EFAULT; + } + } + return result; +} + +int +ncp_write_kernel(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_write, + const char *source, int *bytes_written) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_write); + ncp_add_mem(server, source, to_write); + + if ((result = ncp_request(server, 73)) == 0) + *bytes_written = to_write; + ncp_unlock_server(server); + return result; +} + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING +int +ncp_LogPhysicalRecord(struct ncp_server *server, const char *file_id, + __u8 locktype, __u32 offset, __u32 length, __u16 timeout) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, locktype); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be32(server, length); + ncp_add_be16(server, timeout); + + if ((result = ncp_request(server, 0x1A)) != 0) + { + ncp_unlock_server(server); + return result; + } + ncp_unlock_server(server); + return 0; +} + +int +ncp_ClearPhysicalRecord(struct ncp_server *server, const char *file_id, + __u32 offset, __u32 length) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); /* who knows... lanalyzer says that */ + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be32(server, length); + + if ((result = ncp_request(server, 0x1E)) != 0) + { + ncp_unlock_server(server); + return result; + } + ncp_unlock_server(server); + return 0; +} +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +#ifdef CONFIG_NCPFS_NLS +/* This are the NLS conversion routines with inspirations and code parts + * from the vfat file system and hints from Petr Vandrovec. + */ + +int +ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen, + const unsigned char *iname, unsigned int ilen, int cc) +{ + struct nls_table *in = server->nls_io; + struct nls_table *out = server->nls_vol; + unsigned char *vname_start; + unsigned char *vname_end; + const unsigned char *iname_end; + + iname_end = iname + ilen; + vname_start = vname; + vname_end = vname + *vlen - 1; + + while (iname < iname_end) { + int chl; + wchar_t ec; + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { + int k; + unicode_t u; + + k = utf8_to_utf32(iname, iname_end - iname, &u); + if (k < 0 || u > MAX_WCHAR_T) + return -EINVAL; + iname += k; + ec = u; + } else { + if (*iname == NCP_ESC) { + int k; + + if (iname_end - iname < 5) + goto nospec; + + ec = 0; + for (k = 1; k < 5; k++) { + unsigned char nc; + + nc = iname[k] - '0'; + if (nc >= 10) { + nc -= 'A' - '0' - 10; + if ((nc < 10) || (nc > 15)) { + goto nospec; + } + } + ec = (ec << 4) | nc; + } + iname += 5; + } else { +nospec:; + if ( (chl = in->char2uni(iname, iname_end - iname, &ec)) < 0) + return chl; + iname += chl; + } + } + + /* unitoupper should be here! */ + + chl = out->uni2char(ec, vname, vname_end - vname); + if (chl < 0) + return chl; + + /* this is wrong... */ + if (cc) { + int chi; + + for (chi = 0; chi < chl; chi++){ + vname[chi] = ncp_toupper(out, vname[chi]); + } + } + vname += chl; + } + + *vname = 0; + *vlen = vname - vname_start; + return 0; +} + +int +ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen, + const unsigned char *vname, unsigned int vlen, int cc) +{ + struct nls_table *in = server->nls_vol; + struct nls_table *out = server->nls_io; + const unsigned char *vname_end; + unsigned char *iname_start; + unsigned char *iname_end; + unsigned char *vname_cc; + int err; + + vname_cc = NULL; + + if (cc) { + int i; + + /* this is wrong! */ + vname_cc = kmalloc(vlen, GFP_KERNEL); + if (!vname_cc) + return -ENOMEM; + for (i = 0; i < vlen; i++) + vname_cc[i] = ncp_tolower(in, vname[i]); + vname = vname_cc; + } + + iname_start = iname; + iname_end = iname + *ilen - 1; + vname_end = vname + vlen; + + while (vname < vname_end) { + wchar_t ec; + int chl; + + if ( (chl = in->char2uni(vname, vname_end - vname, &ec)) < 0) { + err = chl; + goto quit; + } + vname += chl; + + /* unitolower should be here! */ + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { + int k; + + k = utf32_to_utf8(ec, iname, iname_end - iname); + if (k < 0) { + err = -ENAMETOOLONG; + goto quit; + } + iname += k; + } else { + if ( (chl = out->uni2char(ec, iname, iname_end - iname)) >= 0) { + iname += chl; + } else { + int k; + + if (iname_end - iname < 5) { + err = -ENAMETOOLONG; + goto quit; + } + *iname = NCP_ESC; + for (k = 4; k > 0; k--) { + unsigned char v; + + v = (ec & 0xF) + '0'; + if (v > '9') { + v += 'A' - '9' - 1; + } + iname[k] = v; + ec >>= 4; + } + iname += 5; + } + } + } + + *iname = 0; + *ilen = iname - iname_start; + err = 0; +quit:; + if (cc) + kfree(vname_cc); + return err; +} + +#else + +int +ncp__io2vol(unsigned char *vname, unsigned int *vlen, + const unsigned char *iname, unsigned int ilen, int cc) +{ + int i; + + if (*vlen <= ilen) + return -ENAMETOOLONG; + + if (cc) + for (i = 0; i < ilen; i++) { + *vname = toupper(*iname); + vname++; + iname++; + } + else { + memmove(vname, iname, ilen); + vname += ilen; + } + + *vlen = ilen; + *vname = 0; + return 0; +} + +int +ncp__vol2io(unsigned char *iname, unsigned int *ilen, + const unsigned char *vname, unsigned int vlen, int cc) +{ + int i; + + if (*ilen <= vlen) + return -ENAMETOOLONG; + + if (cc) + for (i = 0; i < vlen; i++) { + *iname = tolower(*vname); + iname++; + vname++; + } + else { + memmove(iname, vname, vlen); + iname += vlen; + } + + *ilen = vlen; + *iname = 0; + return 0; +} + +#endif diff --git a/kernel/fs/ncpfs/ncplib_kernel.h b/kernel/fs/ncpfs/ncplib_kernel.h new file mode 100644 index 000000000..5233fbc17 --- /dev/null +++ b/kernel/fs/ncpfs/ncplib_kernel.h @@ -0,0 +1,214 @@ +/* + * ncplib_kernel.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * Modified 1999 Wolfram Pienkoss for directory caching + * + */ + +#ifndef _NCPLIB_H +#define _NCPLIB_H + + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_NCPFS_NLS +#include +#else +#include +#endif /* CONFIG_NCPFS_NLS */ + +#define NCP_MIN_SYMLINK_SIZE 8 +#define NCP_MAX_SYMLINK_SIZE 512 + +#define NCP_BLOCK_SHIFT 9 +#define NCP_BLOCK_SIZE (1 << (NCP_BLOCK_SHIFT)) + +int ncp_negotiate_buffersize(struct ncp_server *, int, int *); +int ncp_negotiate_size_and_options(struct ncp_server *server, int size, + int options, int *ret_size, int *ret_options); + +int ncp_get_volume_info_with_number(struct ncp_server* server, int n, + struct ncp_volume_info *target); + +int ncp_get_directory_info(struct ncp_server* server, __u8 dirhandle, + struct ncp_volume_info* target); + +int ncp_close_file(struct ncp_server *, const char *); +static inline int ncp_read_bounce_size(__u32 size) { + return sizeof(struct ncp_reply_header) + 2 + 2 + size + 8; +}; +int ncp_read_bounce(struct ncp_server *, const char *, __u32, __u16, + struct iov_iter *, int *, void *bounce, __u32 bouncelen); +int ncp_read_kernel(struct ncp_server *, const char *, __u32, __u16, + char *, int *); +int ncp_write_kernel(struct ncp_server *, const char *, __u32, __u16, + const char *, int *); + +static inline void ncp_inode_close(struct inode *inode) { + atomic_dec(&NCP_FINFO(inode)->opened); +} + +void ncp_extract_file_info(const void* src, struct nw_info_struct* target); +int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *, + struct nw_info_struct *target); +int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); +int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns); +int ncp_get_volume_root(struct ncp_server *server, const char *volname, + __u32 *volume, __le32 *dirent, __le32 *dosdirent); +int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); +int ncp_modify_file_or_subdir_dos_info(struct ncp_server *, struct inode *, + __le32, const struct nw_modify_dos_info *info); +int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *, struct inode *, + const char* path, __le32, const struct nw_modify_dos_info *info); +int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent, + __u32 mode, __u32 rdev); + +int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); +int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *); +int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *, + int, __le32, __le16, struct ncp_entry_info *); + +int ncp_initialize_search(struct ncp_server *, struct inode *, + struct nw_search_sequence *target); +int ncp_search_for_fileset(struct ncp_server *server, + struct nw_search_sequence *seq, + int* more, int* cnt, + char* buffer, size_t bufsize, + char** rbuf, size_t* rsize); + +int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, + struct inode *, const char *, struct inode *, const char *); + + +int +ncp_LogPhysicalRecord(struct ncp_server *server, + const char *file_id, __u8 locktype, + __u32 offset, __u32 length, __u16 timeout); + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING +int +ncp_ClearPhysicalRecord(struct ncp_server *server, + const char *file_id, + __u32 offset, __u32 length); +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +int +ncp_mount_subdir(struct ncp_server *, __u8, __u8, __le32, + __u32* volume, __le32* dirent, __le32* dosdirent); +int ncp_dirhandle_alloc(struct ncp_server *, __u8 vol, __le32 dirent, __u8 *dirhandle); +int ncp_dirhandle_free(struct ncp_server *, __u8 dirhandle); + +int ncp_create_new(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev, __le32 attributes); + +static inline int ncp_is_nfs_extras(struct ncp_server* server, unsigned int volnum) { +#ifdef CONFIG_NCPFS_NFS_NS + return (server->m.flags & NCP_MOUNT_NFS_EXTRAS) && + (server->name_space[volnum] == NW_NS_NFS); +#else + return 0; +#endif +} + +#ifdef CONFIG_NCPFS_NLS + +int ncp__io2vol(struct ncp_server *, unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); +int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); + +#define NCP_ESC ':' +#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io) +#define ncp_tolower(t, c) nls_tolower(t, c) +#define ncp_toupper(t, c) nls_toupper(t, c) +#define ncp_strnicmp(t, s1, s2, len) \ + nls_strnicmp(t, s1, s2, len) +#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(S,m,i,n,k,U) +#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(S,m,i,n,k,U) + +#else + +int ncp__io2vol(unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); +int ncp__vol2io(unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); + +#define NCP_IO_TABLE(sb) NULL +#define ncp_tolower(t, c) tolower(c) +#define ncp_toupper(t, c) toupper(c) +#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) +#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) + + +static inline int ncp_strnicmp(const struct nls_table *t, + const unsigned char *s1, const unsigned char *s2, int len) +{ + while (len--) { + if (tolower(*s1++) != tolower(*s2++)) + return 1; + } + + return 0; +} + +#endif /* CONFIG_NCPFS_NLS */ + +#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) +#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl) +#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) + +static inline void +ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) +{ + dentry->d_time = jiffies - NCP_MAX_AGE(server); +} + +static inline void +ncp_new_dentry(struct dentry* dentry) +{ + dentry->d_time = jiffies; +} + +struct ncp_cache_head { + time_t mtime; + unsigned long time; /* cache age */ + unsigned long end; /* last valid fpos in cache */ + int eof; +}; + +#define NCP_DIRCACHE_SIZE ((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *))) +union ncp_dir_cache { + struct ncp_cache_head head; + struct dentry *dentry[NCP_DIRCACHE_SIZE]; +}; + +#define NCP_FIRSTCACHE_SIZE ((int)((NCP_DIRCACHE_SIZE * \ + sizeof(struct dentry *) - sizeof(struct ncp_cache_head)) / \ + sizeof(struct dentry *))) + +#define NCP_DIRCACHE_START (NCP_DIRCACHE_SIZE - NCP_FIRSTCACHE_SIZE) + +struct ncp_cache_control { + struct ncp_cache_head head; + struct page *page; + union ncp_dir_cache *cache; + unsigned long fpos, ofs; + int filled, valid, idx; +}; + +#endif /* _NCPLIB_H */ diff --git a/kernel/fs/ncpfs/ncpsign_kernel.c b/kernel/fs/ncpfs/ncpsign_kernel.c new file mode 100644 index 000000000..08907599d --- /dev/null +++ b/kernel/fs/ncpfs/ncpsign_kernel.c @@ -0,0 +1,127 @@ +/* + * ncpsign_kernel.c + * + * Arne de Bruijn (arne@knoware.nl), 1997 + * + */ + + +#ifdef CONFIG_NCPFS_PACKET_SIGNING + +#include +#include +#include +#include "ncp_fs.h" +#include "ncpsign_kernel.h" + +/* i386: 32-bit, little endian, handles mis-alignment */ +#ifdef __i386__ +#define GET_LE32(p) (*(const int *)(p)) +#define PUT_LE32(p,v) { *(int *)(p)=v; } +#else +/* from include/ncplib.h */ +#define BVAL(buf,pos) (((const __u8 *)(buf))[pos]) +#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) +#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val)) + +static inline __u16 +WVAL_LH(const __u8 * buf, int pos) +{ + return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; +} +static inline __u32 +DVAL_LH(const __u8 * buf, int pos) +{ + return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; +} +static inline void +WSET_LH(__u8 * buf, int pos, __u16 val) +{ + BSET(buf, pos, val & 0xff); + BSET(buf, pos + 1, val >> 8); +} +static inline void +DSET_LH(__u8 * buf, int pos, __u32 val) +{ + WSET_LH(buf, pos, val & 0xffff); + WSET_LH(buf, pos + 2, val >> 16); +} + +#define GET_LE32(p) DVAL_LH(p,0) +#define PUT_LE32(p,v) DSET_LH(p,0,v) +#endif + +static void nwsign(char *r_data1, char *r_data2, char *outdata) { + int i; + unsigned int w0,w1,w2,w3; + static int rbit[4]={0, 2, 1, 3}; +#ifdef __i386__ + unsigned int *data2=(unsigned int *)r_data2; +#else + unsigned int data2[16]; + for (i=0;i<16;i++) + data2[i]=GET_LE32(r_data2+(i<<2)); +#endif + w0=GET_LE32(r_data1); + w1=GET_LE32(r_data1+4); + w2=GET_LE32(r_data1+8); + w3=GET_LE32(r_data1+12); + for (i=0;i<16;i+=4) { + w0=rol32(w0 + ((w1 & w2) | ((~w1) & w3)) + data2[i+0],3); + w3=rol32(w3 + ((w0 & w1) | ((~w0) & w2)) + data2[i+1],7); + w2=rol32(w2 + ((w3 & w0) | ((~w3) & w1)) + data2[i+2],11); + w1=rol32(w1 + ((w2 & w3) | ((~w2) & w0)) + data2[i+3],19); + } + for (i=0;i<4;i++) { + w0=rol32(w0 + (((w2 | w3) & w1) | (w2 & w3)) + 0x5a827999 + data2[i+0],3); + w3=rol32(w3 + (((w1 | w2) & w0) | (w1 & w2)) + 0x5a827999 + data2[i+4],5); + w2=rol32(w2 + (((w0 | w1) & w3) | (w0 & w1)) + 0x5a827999 + data2[i+8],9); + w1=rol32(w1 + (((w3 | w0) & w2) | (w3 & w0)) + 0x5a827999 + data2[i+12],13); + } + for (i=0;i<4;i++) { + w0=rol32(w0 + ((w1 ^ w2) ^ w3) + 0x6ed9eba1 + data2[rbit[i]+0],3); + w3=rol32(w3 + ((w0 ^ w1) ^ w2) + 0x6ed9eba1 + data2[rbit[i]+8],9); + w2=rol32(w2 + ((w3 ^ w0) ^ w1) + 0x6ed9eba1 + data2[rbit[i]+4],11); + w1=rol32(w1 + ((w2 ^ w3) ^ w0) + 0x6ed9eba1 + data2[rbit[i]+12],15); + } + PUT_LE32(outdata,(w0+GET_LE32(r_data1)) & 0xffffffff); + PUT_LE32(outdata+4,(w1+GET_LE32(r_data1+4)) & 0xffffffff); + PUT_LE32(outdata+8,(w2+GET_LE32(r_data1+8)) & 0xffffffff); + PUT_LE32(outdata+12,(w3+GET_LE32(r_data1+12)) & 0xffffffff); +} + +/* Make a signature for the current packet and add it at the end of the */ +/* packet. */ +void __sign_packet(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, void *sign_buff) { + unsigned char data[64]; + + memcpy(data, server->sign_root, 8); + *(__u32*)(data + 8) = totalsize; + if (size < 52) { + memcpy(data + 12, packet, size); + memset(data + 12 + size, 0, 52 - size); + } else { + memcpy(data + 12, packet, 52); + } + nwsign(server->sign_last, data, server->sign_last); + memcpy(sign_buff, server->sign_last, 8); +} + +int sign_verify_reply(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, const void *sign_buff) { + unsigned char data[64]; + unsigned char hash[16]; + + memcpy(data, server->sign_root, 8); + *(__u32*)(data + 8) = totalsize; + if (size < 52) { + memcpy(data + 12, packet, size); + memset(data + 12 + size, 0, 52 - size); + } else { + memcpy(data + 12, packet, 52); + } + nwsign(server->sign_last, data, hash); + return memcmp(sign_buff, hash, 8); +} + +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + diff --git a/kernel/fs/ncpfs/ncpsign_kernel.h b/kernel/fs/ncpfs/ncpsign_kernel.h new file mode 100644 index 000000000..d9a1438bb --- /dev/null +++ b/kernel/fs/ncpfs/ncpsign_kernel.h @@ -0,0 +1,26 @@ +/* + * ncpsign_kernel.h + * + * Arne de Bruijn (arne@knoware.nl), 1997 + * + */ + +#ifndef _NCPSIGN_KERNEL_H +#define _NCPSIGN_KERNEL_H + +#ifdef CONFIG_NCPFS_PACKET_SIGNING +void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); +int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); +#endif + +static inline size_t sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff) { +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active) { + __sign_packet(server, data, size, totalsize, sign_buff); + return 8; + } +#endif + return 0; +} + +#endif diff --git a/kernel/fs/ncpfs/sock.c b/kernel/fs/ncpfs/sock.c new file mode 100644 index 000000000..471bc3d11 --- /dev/null +++ b/kernel/fs/ncpfs/sock.c @@ -0,0 +1,881 @@ +/* + * linux/fs/ncpfs/sock.c + * + * Copyright (C) 1992, 1993 Rick Sladkey + * + * Modified 1995, 1996 by Volker Lendecke to be usable for ncp + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +#include "ncpsign_kernel.h" + +static int _recv(struct socket *sock, void *buf, int size, unsigned flags) +{ + struct msghdr msg = {NULL, }; + struct kvec iov = {buf, size}; + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +static inline int do_send(struct socket *sock, struct kvec *vec, int count, + int len, unsigned flags) +{ + struct msghdr msg = { .msg_flags = flags }; + return kernel_sendmsg(sock, &msg, vec, count, len); +} + +static int _send(struct socket *sock, const void *buff, int len) +{ + struct kvec vec; + vec.iov_base = (void *) buff; + vec.iov_len = len; + return do_send(sock, &vec, 1, len, 0); +} + +struct ncp_request_reply { + struct list_head req; + wait_queue_head_t wq; + atomic_t refs; + unsigned char* reply_buf; + size_t datalen; + int result; + enum { RQ_DONE, RQ_INPROGRESS, RQ_QUEUED, RQ_IDLE, RQ_ABANDONED } status; + struct kvec* tx_ciov; + size_t tx_totallen; + size_t tx_iovlen; + struct kvec tx_iov[3]; + u_int16_t tx_type; + u_int32_t sign[6]; +}; + +static inline struct ncp_request_reply* ncp_alloc_req(void) +{ + struct ncp_request_reply *req; + + req = kmalloc(sizeof(struct ncp_request_reply), GFP_KERNEL); + if (!req) + return NULL; + + init_waitqueue_head(&req->wq); + atomic_set(&req->refs, (1)); + req->status = RQ_IDLE; + + return req; +} + +static void ncp_req_get(struct ncp_request_reply *req) +{ + atomic_inc(&req->refs); +} + +static void ncp_req_put(struct ncp_request_reply *req) +{ + if (atomic_dec_and_test(&req->refs)) + kfree(req); +} + +void ncp_tcp_data_ready(struct sock *sk) +{ + struct ncp_server *server = sk->sk_user_data; + + server->data_ready(sk); + schedule_work(&server->rcv.tq); +} + +void ncp_tcp_error_report(struct sock *sk) +{ + struct ncp_server *server = sk->sk_user_data; + + server->error_report(sk); + schedule_work(&server->rcv.tq); +} + +void ncp_tcp_write_space(struct sock *sk) +{ + struct ncp_server *server = sk->sk_user_data; + + /* We do not need any locking: we first set tx.creq, and then we do sendmsg, + not vice versa... */ + server->write_space(sk); + if (server->tx.creq) + schedule_work(&server->tx.tq); +} + +void ncpdgram_timeout_call(unsigned long v) +{ + struct ncp_server *server = (void*)v; + + schedule_work(&server->timeout_tq); +} + +static inline void ncp_finish_request(struct ncp_server *server, struct ncp_request_reply *req, int result) +{ + req->result = result; + if (req->status != RQ_ABANDONED) + memcpy(req->reply_buf, server->rxbuf, req->datalen); + req->status = RQ_DONE; + wake_up_all(&req->wq); + ncp_req_put(req); +} + +static void __abort_ncp_connection(struct ncp_server *server) +{ + struct ncp_request_reply *req; + + ncp_invalidate_conn(server); + del_timer(&server->timeout_tm); + while (!list_empty(&server->tx.requests)) { + req = list_entry(server->tx.requests.next, struct ncp_request_reply, req); + + list_del_init(&req->req); + ncp_finish_request(server, req, -EIO); + } + req = server->rcv.creq; + if (req) { + server->rcv.creq = NULL; + ncp_finish_request(server, req, -EIO); + server->rcv.ptr = NULL; + server->rcv.state = 0; + } + req = server->tx.creq; + if (req) { + server->tx.creq = NULL; + ncp_finish_request(server, req, -EIO); + } +} + +static inline int get_conn_number(struct ncp_reply_header *rp) +{ + return rp->conn_low | (rp->conn_high << 8); +} + +static inline void __ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err) +{ + /* If req is done, we got signal, but we also received answer... */ + switch (req->status) { + case RQ_IDLE: + case RQ_DONE: + break; + case RQ_QUEUED: + list_del_init(&req->req); + ncp_finish_request(server, req, err); + break; + case RQ_INPROGRESS: + req->status = RQ_ABANDONED; + break; + case RQ_ABANDONED: + break; + } +} + +static inline void ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err) +{ + mutex_lock(&server->rcv.creq_mutex); + __ncp_abort_request(server, req, err); + mutex_unlock(&server->rcv.creq_mutex); +} + +static inline void __ncptcp_abort(struct ncp_server *server) +{ + __abort_ncp_connection(server); +} + +static int ncpdgram_send(struct socket *sock, struct ncp_request_reply *req) +{ + struct kvec vec[3]; + /* sock_sendmsg updates iov pointers for us :-( */ + memcpy(vec, req->tx_ciov, req->tx_iovlen * sizeof(vec[0])); + return do_send(sock, vec, req->tx_iovlen, + req->tx_totallen, MSG_DONTWAIT); +} + +static void __ncptcp_try_send(struct ncp_server *server) +{ + struct ncp_request_reply *rq; + struct kvec *iov; + struct kvec iovc[3]; + int result; + + rq = server->tx.creq; + if (!rq) + return; + + /* sock_sendmsg updates iov pointers for us :-( */ + memcpy(iovc, rq->tx_ciov, rq->tx_iovlen * sizeof(iov[0])); + result = do_send(server->ncp_sock, iovc, rq->tx_iovlen, + rq->tx_totallen, MSG_NOSIGNAL | MSG_DONTWAIT); + + if (result == -EAGAIN) + return; + + if (result < 0) { + pr_err("tcp: Send failed: %d\n", result); + __ncp_abort_request(server, rq, result); + return; + } + if (result >= rq->tx_totallen) { + server->rcv.creq = rq; + server->tx.creq = NULL; + return; + } + rq->tx_totallen -= result; + iov = rq->tx_ciov; + while (iov->iov_len <= result) { + result -= iov->iov_len; + iov++; + rq->tx_iovlen--; + } + iov->iov_base += result; + iov->iov_len -= result; + rq->tx_ciov = iov; +} + +static inline void ncp_init_header(struct ncp_server *server, struct ncp_request_reply *req, struct ncp_request_header *h) +{ + req->status = RQ_INPROGRESS; + h->conn_low = server->connection; + h->conn_high = server->connection >> 8; + h->sequence = ++server->sequence; +} + +static void ncpdgram_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + size_t signlen; + struct ncp_request_header* h; + + req->tx_ciov = req->tx_iov + 1; + + h = req->tx_iov[1].iov_base; + ncp_init_header(server, req, h); + signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, + req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1, + cpu_to_le32(req->tx_totallen), req->sign); + if (signlen) { + req->tx_ciov[1].iov_base = req->sign; + req->tx_ciov[1].iov_len = signlen; + req->tx_iovlen += 1; + req->tx_totallen += signlen; + } + server->rcv.creq = req; + server->timeout_last = server->m.time_out; + server->timeout_retries = server->m.retry_count; + ncpdgram_send(server->ncp_sock, req); + mod_timer(&server->timeout_tm, jiffies + server->m.time_out); +} + +#define NCP_TCP_XMIT_MAGIC (0x446D6454) +#define NCP_TCP_XMIT_VERSION (1) +#define NCP_TCP_RCVD_MAGIC (0x744E6350) + +static void ncptcp_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + size_t signlen; + struct ncp_request_header* h; + + req->tx_ciov = req->tx_iov; + h = req->tx_iov[1].iov_base; + ncp_init_header(server, req, h); + signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, + req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1, + cpu_to_be32(req->tx_totallen + 24), req->sign + 4) + 16; + + req->sign[0] = htonl(NCP_TCP_XMIT_MAGIC); + req->sign[1] = htonl(req->tx_totallen + signlen); + req->sign[2] = htonl(NCP_TCP_XMIT_VERSION); + req->sign[3] = htonl(req->datalen + 8); + req->tx_iov[0].iov_base = req->sign; + req->tx_iov[0].iov_len = signlen; + req->tx_iovlen += 1; + req->tx_totallen += signlen; + + server->tx.creq = req; + __ncptcp_try_send(server); +} + +static inline void __ncp_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + /* we copy the data so that we do not depend on the caller + staying alive */ + memcpy(server->txbuf, req->tx_iov[1].iov_base, req->tx_iov[1].iov_len); + req->tx_iov[1].iov_base = server->txbuf; + + if (server->ncp_sock->type == SOCK_STREAM) + ncptcp_start_request(server, req); + else + ncpdgram_start_request(server, req); +} + +static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + mutex_lock(&server->rcv.creq_mutex); + if (!ncp_conn_valid(server)) { + mutex_unlock(&server->rcv.creq_mutex); + pr_err("tcp: Server died\n"); + return -EIO; + } + ncp_req_get(req); + if (server->tx.creq || server->rcv.creq) { + req->status = RQ_QUEUED; + list_add_tail(&req->req, &server->tx.requests); + mutex_unlock(&server->rcv.creq_mutex); + return 0; + } + __ncp_start_request(server, req); + mutex_unlock(&server->rcv.creq_mutex); + return 0; +} + +static void __ncp_next_request(struct ncp_server *server) +{ + struct ncp_request_reply *req; + + server->rcv.creq = NULL; + if (list_empty(&server->tx.requests)) { + return; + } + req = list_entry(server->tx.requests.next, struct ncp_request_reply, req); + list_del_init(&req->req); + __ncp_start_request(server, req); +} + +static void info_server(struct ncp_server *server, unsigned int id, const void * data, size_t len) +{ + if (server->info_sock) { + struct kvec iov[2]; + __be32 hdr[2]; + + hdr[0] = cpu_to_be32(len + 8); + hdr[1] = cpu_to_be32(id); + + iov[0].iov_base = hdr; + iov[0].iov_len = 8; + iov[1].iov_base = (void *) data; + iov[1].iov_len = len; + + do_send(server->info_sock, iov, 2, len + 8, MSG_NOSIGNAL); + } +} + +void ncpdgram_rcv_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, rcv.tq); + struct socket* sock; + + sock = server->ncp_sock; + + while (1) { + struct ncp_reply_header reply; + int result; + + result = _recv(sock, &reply, sizeof(reply), MSG_PEEK | MSG_DONTWAIT); + if (result < 0) { + break; + } + if (result >= sizeof(reply)) { + struct ncp_request_reply *req; + + if (reply.type == NCP_WATCHDOG) { + unsigned char buf[10]; + + if (server->connection != get_conn_number(&reply)) { + goto drop; + } + result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT); + if (result < 0) { + ncp_dbg(1, "recv failed with %d\n", result); + continue; + } + if (result < 10) { + ncp_dbg(1, "too short (%u) watchdog packet\n", result); + continue; + } + if (buf[9] != '?') { + ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]); + continue; + } + buf[9] = 'Y'; + _send(sock, buf, sizeof(buf)); + continue; + } + if (reply.type != NCP_POSITIVE_ACK && reply.type != NCP_REPLY) { + result = _recv(sock, server->unexpected_packet.data, sizeof(server->unexpected_packet.data), MSG_DONTWAIT); + if (result < 0) { + continue; + } + info_server(server, 0, server->unexpected_packet.data, result); + continue; + } + mutex_lock(&server->rcv.creq_mutex); + req = server->rcv.creq; + if (req && (req->tx_type == NCP_ALLOC_SLOT_REQUEST || (server->sequence == reply.sequence && + server->connection == get_conn_number(&reply)))) { + if (reply.type == NCP_POSITIVE_ACK) { + server->timeout_retries = server->m.retry_count; + server->timeout_last = NCP_MAX_RPC_TIMEOUT; + mod_timer(&server->timeout_tm, jiffies + NCP_MAX_RPC_TIMEOUT); + } else if (reply.type == NCP_REPLY) { + result = _recv(sock, server->rxbuf, req->datalen, MSG_DONTWAIT); +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (result >= 0 && server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { + if (result < 8 + 8) { + result = -EIO; + } else { + unsigned int hdrl; + + result -= 8; + hdrl = sock->sk->sk_family == AF_INET ? 8 : 6; + if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) { + pr_info("Signature violation\n"); + result = -EIO; + } + } + } +#endif + del_timer(&server->timeout_tm); + server->rcv.creq = NULL; + ncp_finish_request(server, req, result); + __ncp_next_request(server); + mutex_unlock(&server->rcv.creq_mutex); + continue; + } + } + mutex_unlock(&server->rcv.creq_mutex); + } +drop:; + _recv(sock, &reply, sizeof(reply), MSG_DONTWAIT); + } +} + +static void __ncpdgram_timeout_proc(struct ncp_server *server) +{ + /* If timer is pending, we are processing another request... */ + if (!timer_pending(&server->timeout_tm)) { + struct ncp_request_reply* req; + + req = server->rcv.creq; + if (req) { + int timeout; + + if (server->m.flags & NCP_MOUNT_SOFT) { + if (server->timeout_retries-- == 0) { + __ncp_abort_request(server, req, -ETIMEDOUT); + return; + } + } + /* Ignore errors */ + ncpdgram_send(server->ncp_sock, req); + timeout = server->timeout_last << 1; + if (timeout > NCP_MAX_RPC_TIMEOUT) { + timeout = NCP_MAX_RPC_TIMEOUT; + } + server->timeout_last = timeout; + mod_timer(&server->timeout_tm, jiffies + timeout); + } + } +} + +void ncpdgram_timeout_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, timeout_tq); + mutex_lock(&server->rcv.creq_mutex); + __ncpdgram_timeout_proc(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len) +{ + int result; + + if (buffer) { + result = _recv(server->ncp_sock, buffer, len, MSG_DONTWAIT); + } else { + static unsigned char dummy[1024]; + + if (len > sizeof(dummy)) { + len = sizeof(dummy); + } + result = _recv(server->ncp_sock, dummy, len, MSG_DONTWAIT); + } + if (result < 0) { + return result; + } + if (result > len) { + pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len); + return -EIO; + } + return result; +} + +static int __ncptcp_rcv_proc(struct ncp_server *server) +{ + /* We have to check the result, so store the complete header */ + while (1) { + int result; + struct ncp_request_reply *req; + int datalen; + int type; + + while (server->rcv.len) { + result = do_tcp_rcv(server, server->rcv.ptr, server->rcv.len); + if (result == -EAGAIN) { + return 0; + } + if (result <= 0) { + req = server->rcv.creq; + if (req) { + __ncp_abort_request(server, req, -EIO); + } else { + __ncptcp_abort(server); + } + if (result < 0) { + pr_err("tcp: error in recvmsg: %d\n", result); + } else { + ncp_dbg(1, "tcp: EOF\n"); + } + return -EIO; + } + if (server->rcv.ptr) { + server->rcv.ptr += result; + } + server->rcv.len -= result; + } + switch (server->rcv.state) { + case 0: + if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) { + pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); + __ncptcp_abort(server); + return -EIO; + } + datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF; + if (datalen < 10) { + pr_err("tcp: Unexpected reply len %d\n", datalen); + __ncptcp_abort(server); + return -EIO; + } +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active) { + if (datalen < 18) { + pr_err("tcp: Unexpected reply len %d\n", datalen); + __ncptcp_abort(server); + return -EIO; + } + server->rcv.buf.len = datalen - 8; + server->rcv.ptr = (unsigned char*)&server->rcv.buf.p1; + server->rcv.len = 8; + server->rcv.state = 4; + break; + } +#endif + type = ntohs(server->rcv.buf.type); +#ifdef CONFIG_NCPFS_PACKET_SIGNING +cont:; +#endif + if (type != NCP_REPLY) { + if (datalen - 8 <= sizeof(server->unexpected_packet.data)) { + *(__u16*)(server->unexpected_packet.data) = htons(type); + server->unexpected_packet.len = datalen - 8; + + server->rcv.state = 5; + server->rcv.ptr = server->unexpected_packet.data + 2; + server->rcv.len = datalen - 10; + break; + } + ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type); +skipdata2:; + server->rcv.state = 2; +skipdata:; + server->rcv.ptr = NULL; + server->rcv.len = datalen - 10; + break; + } + req = server->rcv.creq; + if (!req) { + ncp_dbg(1, "Reply without appropriate request\n"); + goto skipdata2; + } + if (datalen > req->datalen + 8) { + pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); + server->rcv.state = 3; + goto skipdata; + } + req->datalen = datalen - 8; + ((struct ncp_reply_header*)server->rxbuf)->type = NCP_REPLY; + server->rcv.ptr = server->rxbuf + 2; + server->rcv.len = datalen - 10; + server->rcv.state = 1; + break; +#ifdef CONFIG_NCPFS_PACKET_SIGNING + case 4: + datalen = server->rcv.buf.len; + type = ntohs(server->rcv.buf.type2); + goto cont; +#endif + case 1: + req = server->rcv.creq; + if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) { + if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) { + pr_err("tcp: Bad sequence number\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) { + pr_err("tcp: Connection number mismatch\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + } +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { + if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) { + pr_err("tcp: Signature violation\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + } +#endif + ncp_finish_request(server, req, req->datalen); + nextreq:; + __ncp_next_request(server); + case 2: + next:; + server->rcv.ptr = (unsigned char*)&server->rcv.buf; + server->rcv.len = 10; + server->rcv.state = 0; + break; + case 3: + ncp_finish_request(server, server->rcv.creq, -EIO); + goto nextreq; + case 5: + info_server(server, 0, server->unexpected_packet.data, server->unexpected_packet.len); + goto next; + } + } +} + +void ncp_tcp_rcv_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, rcv.tq); + + mutex_lock(&server->rcv.creq_mutex); + __ncptcp_rcv_proc(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +void ncp_tcp_tx_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, tx.tq); + + mutex_lock(&server->rcv.creq_mutex); + __ncptcp_try_send(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +static int do_ncp_rpc_call(struct ncp_server *server, int size, + unsigned char* reply_buf, int max_reply_size) +{ + int result; + struct ncp_request_reply *req; + + req = ncp_alloc_req(); + if (!req) + return -ENOMEM; + + req->reply_buf = reply_buf; + req->datalen = max_reply_size; + req->tx_iov[1].iov_base = server->packet; + req->tx_iov[1].iov_len = size; + req->tx_iovlen = 1; + req->tx_totallen = size; + req->tx_type = *(u_int16_t*)server->packet; + + result = ncp_add_request(server, req); + if (result < 0) + goto out; + + if (wait_event_interruptible(req->wq, req->status == RQ_DONE)) { + ncp_abort_request(server, req, -EINTR); + result = -EINTR; + goto out; + } + + result = req->result; + +out: + ncp_req_put(req); + + return result; +} + +/* + * We need the server to be locked here, so check! + */ + +static int ncp_do_request(struct ncp_server *server, int size, + void* reply, int max_reply_size) +{ + int result; + + if (server->lock == 0) { + pr_err("Server not locked!\n"); + return -EIO; + } + if (!ncp_conn_valid(server)) { + return -EIO; + } + { + sigset_t old_set; + unsigned long mask, flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old_set = current->blocked; + if (current->flags & PF_EXITING) + mask = 0; + else + mask = sigmask(SIGKILL); + if (server->m.flags & NCP_MOUNT_INTR) { + /* FIXME: This doesn't seem right at all. So, like, + we can't handle SIGINT and get whatever to stop? + What if we've blocked it ourselves? What about + alarms? Why, in fact, are we mucking with the + sigmask at all? -- r~ */ + if (current->sighand->action[SIGINT - 1].sa.sa_handler == SIG_DFL) + mask |= sigmask(SIGINT); + if (current->sighand->action[SIGQUIT - 1].sa.sa_handler == SIG_DFL) + mask |= sigmask(SIGQUIT); + } + siginitsetinv(¤t->blocked, mask); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + result = do_ncp_rpc_call(server, size, reply, max_reply_size); + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->blocked = old_set; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + + ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result); + + return result; +} + +/* ncp_do_request assures that at least a complete reply header is + * received. It assumes that server->current_size contains the ncp + * request size + */ +int ncp_request2(struct ncp_server *server, int function, + void* rpl, int size) +{ + struct ncp_request_header *h; + struct ncp_reply_header* reply = rpl; + int result; + + h = (struct ncp_request_header *) (server->packet); + if (server->has_subfunction != 0) { + *(__u16 *) & (h->data[0]) = htons(server->current_size - sizeof(*h) - 2); + } + h->type = NCP_REQUEST; + /* + * The server shouldn't know or care what task is making a + * request, so we always use the same task number. + */ + h->task = 2; /* (current->pid) & 0xff; */ + h->function = function; + + result = ncp_do_request(server, server->current_size, reply, size); + if (result < 0) { + ncp_dbg(1, "ncp_request_error: %d\n", result); + goto out; + } + server->completion = reply->completion_code; + server->conn_status = reply->connection_state; + server->reply_size = result; + server->ncp_reply_size = result - sizeof(struct ncp_reply_header); + + result = reply->completion_code; + + if (result != 0) + ncp_vdbg("completion code=%x\n", result); +out: + return result; +} + +int ncp_connect(struct ncp_server *server) +{ + struct ncp_request_header *h; + int result; + + server->connection = 0xFFFF; + server->sequence = 255; + + h = (struct ncp_request_header *) (server->packet); + h->type = NCP_ALLOC_SLOT_REQUEST; + h->task = 2; /* see above */ + h->function = 0; + + result = ncp_do_request(server, sizeof(*h), server->packet, server->packet_size); + if (result < 0) + goto out; + server->connection = h->conn_low + (h->conn_high * 256); + result = 0; +out: + return result; +} + +int ncp_disconnect(struct ncp_server *server) +{ + struct ncp_request_header *h; + + h = (struct ncp_request_header *) (server->packet); + h->type = NCP_DEALLOC_SLOT_REQUEST; + h->task = 2; /* see above */ + h->function = 0; + + return ncp_do_request(server, sizeof(*h), server->packet, server->packet_size); +} + +void ncp_lock_server(struct ncp_server *server) +{ + mutex_lock(&server->mutex); + if (server->lock) + pr_warn("%s: was locked!\n", __func__); + server->lock = 1; +} + +void ncp_unlock_server(struct ncp_server *server) +{ + if (!server->lock) { + pr_warn("%s: was not locked!\n", __func__); + return; + } + server->lock = 0; + mutex_unlock(&server->mutex); +} diff --git a/kernel/fs/ncpfs/symlink.c b/kernel/fs/ncpfs/symlink.c new file mode 100644 index 000000000..421b6f91e --- /dev/null +++ b/kernel/fs/ncpfs/symlink.c @@ -0,0 +1,181 @@ +/* + * linux/fs/ncpfs/symlink.c + * + * Code for allowing symbolic links on NCPFS (i.e. NetWare) + * Symbolic links are not supported on native NetWare, so we use an + * infrequently-used flag (Sh) and store a two-word magic header in + * the file to make sure we don't accidentally use a non-link file + * as a link. + * + * When using the NFS namespace, we set the mode to indicate a symlink and + * don't bother with the magic numbers. + * + * from linux/fs/ext2/symlink.c + * + * Copyright (C) 1998-99, Frank A. Vorstenbosch + * + * ncpfs symlink handling code + * NLS support (c) 1999 Petr Vandrovec + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + + +#include + +#include +#include +#include +#include +#include +#include +#include "ncp_fs.h" + +/* these magic numbers must appear in the symlink file -- this makes it a bit + more resilient against the magic attributes being set on random files. */ + +#define NCP_SYMLINK_MAGIC0 cpu_to_le32(0x6c6d7973) /* "symlnk->" */ +#define NCP_SYMLINK_MAGIC1 cpu_to_le32(0x3e2d6b6e) + +/* ----- read a symbolic link ------------------------------------------ */ + +static int ncp_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error, length, len; + char *link, *rawlink; + char *buf = kmap(page); + + error = -ENOMEM; + rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); + if (!rawlink) + goto fail; + + if (ncp_make_open(inode,O_RDONLY)) + goto failEIO; + + error=ncp_read_kernel(NCP_SERVER(inode),NCP_FINFO(inode)->file_handle, + 0,NCP_MAX_SYMLINK_SIZE,rawlink,&length); + + ncp_inode_close(inode); + /* Close file handle if no other users... */ + ncp_make_closed(inode); + if (error) + goto failEIO; + + if (NCP_FINFO(inode)->flags & NCPI_KLUDGE_SYMLINK) { + if (lengthvolNumber)) + kludge = 0; + else +#ifdef CONFIG_NCPFS_EXTRAS + if (NCP_SERVER(dir)->m.flags & NCP_MOUNT_SYMLINKS) + kludge = 1; + else +#endif + /* EPERM is returned by VFS if symlink procedure does not exist */ + return -EPERM; + + rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); + if (!rawlink) + return -ENOMEM; + + if (kludge) { + mode = 0; + attr = aSHARED | aHIDDEN; + ((__le32 *)rawlink)[0]=NCP_SYMLINK_MAGIC0; + ((__le32 *)rawlink)[1]=NCP_SYMLINK_MAGIC1; + hdr = 8; + } else { + mode = S_IFLNK | S_IRWXUGO; + attr = 0; + hdr = 0; + } + + length = strlen(symname); + /* map to/from server charset, do not touch upper/lower case as + symlink can point out of ncp filesystem */ + outlen = NCP_MAX_SYMLINK_SIZE - hdr; + err = ncp_io2vol(NCP_SERVER(dir), rawlink + hdr, &outlen, symname, length, 0); + if (err) + goto failfree; + + outlen += hdr; + + err = -EIO; + if (ncp_create_new(dir,dentry,mode,0,attr)) { + goto failfree; + } + + inode=d_inode(dentry); + + if (ncp_make_open(inode, O_WRONLY)) + goto failfree; + + if (ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, + 0, outlen, rawlink, &i) || i!=outlen) { + goto fail; + } + + ncp_inode_close(inode); + ncp_make_closed(inode); + kfree(rawlink); + return 0; +fail:; + ncp_inode_close(inode); + ncp_make_closed(inode); +failfree:; + kfree(rawlink); + return err; +} + +/* ----- EOF ----- */ diff --git a/kernel/fs/nfs/Kconfig b/kernel/fs/nfs/Kconfig new file mode 100644 index 000000000..f31fd0dd9 --- /dev/null +++ b/kernel/fs/nfs/Kconfig @@ -0,0 +1,202 @@ +config NFS_FS + tristate "NFS client support" + depends on INET && FILE_LOCKING && MULTIUSER + select LOCKD + select SUNRPC + select NFS_ACL_SUPPORT if NFS_V3_ACL + help + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. + + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. + + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. + + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. You cannot compile this file system as a + module in this case. + + If unsure, say N. + +config NFS_V2 + tristate "NFS client support for NFS version 2" + depends on NFS_FS + default y + help + This option enables support for version 2 of the NFS protocol + (RFC 1094) in the kernel's NFS client. + + If unsure, say Y. + +config NFS_V3 + tristate "NFS client support for NFS version 3" + depends on NFS_FS + default y + help + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. + + If unsure, say Y. + +config NFS_V3_ACL + bool "NFS client support for the NFSv3 ACL protocol extension" + depends on NFS_V3 + help + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. + + If unsure, say N. + +config NFS_V4 + tristate "NFS client support for NFS version 4" + depends on NFS_FS + select SUNRPC_GSS + select KEYS + help + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. + + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say Y. + +config NFS_SWAP + bool "Provide swap over NFS support" + default n + depends on NFS_FS + select SUNRPC_SWAP + help + This option enables swapon to work on files located on NFS mounts. + +config NFS_V4_1 + bool "NFS client support for NFSv4.1" + depends on NFS_V4 + select SUNRPC_BACKCHANNEL + help + This option enables support for minor version 1 of the NFSv4 protocol + (RFC 5661) in the kernel's NFS client. + + If unsure, say N. + +config NFS_V4_2 + bool "NFS client support for NFSv4.2" + depends on NFS_V4_1 + help + This option enables support for minor version 2 of the NFSv4 protocol + in the kernel's NFS client. + + If unsure, say N. + +config PNFS_FILE_LAYOUT + tristate + depends on NFS_V4_1 + default NFS_V4 + +config PNFS_BLOCK + tristate + depends on NFS_V4_1 && BLK_DEV_DM + default NFS_V4 + +config PNFS_OBJLAYOUT + tristate + depends on NFS_V4_1 && SCSI_OSD_ULD + default NFS_V4 + +config PNFS_FLEXFILE_LAYOUT + tristate + depends on NFS_V4_1 && NFS_V3 + default m + +config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN + string "NFSv4.1 Implementation ID Domain" + depends on NFS_V4_1 + default "kernel.org" + help + This option defines the domain portion of the implementation ID that + may be sent in the NFS exchange_id operation. The value must be in + the format of a DNS domain name and should be set to the DNS domain + name of the distribution. + If the NFS client is unchanged from the upstream kernel, this + option should be set to the default "kernel.org". + +config NFS_V4_1_MIGRATION + bool "NFSv4.1 client support for migration" + depends on NFS_V4_1 + default n + help + This option makes the NFS client advertise to NFSv4.1 servers that + it can support NFSv4 migration. + + The NFSv4.1 pieces of the Linux NFSv4 migration implementation are + still experimental. If you are not an NFSv4 developer, say N here. + +config NFS_V4_SECURITY_LABEL + bool + depends on NFS_V4_2 && SECURITY + default y + +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + . + + Most people say N here. + +config NFS_FSCACHE + bool "Provide NFS client caching support" + depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + help + Say Y here if you want NFS data to be cached locally on disc through + the general filesystem cache manager + +config NFS_USE_LEGACY_DNS + bool "Use the legacy NFS DNS resolver" + depends on NFS_V4 + help + The kernel now provides a method for translating a host name into an + IP address. Select Y here if you would rather use your own DNS + resolver script. + + If unsure, say N + +config NFS_USE_KERNEL_DNS + bool + depends on NFS_V4 && !NFS_USE_LEGACY_DNS + select DNS_RESOLVER + default y + +config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 + default y diff --git a/kernel/fs/nfs/Makefile b/kernel/fs/nfs/Makefile new file mode 100644 index 000000000..866441795 --- /dev/null +++ b/kernel/fs/nfs/Makefile @@ -0,0 +1,36 @@ +# +# Makefile for the Linux nfs filesystem routines. +# + +obj-$(CONFIG_NFS_FS) += nfs.o + +CFLAGS_nfstrace.o += -I$(src) +nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ + direct.o pagelist.o read.o symlink.o unlink.o \ + write.o namespace.o mount_clnt.o nfstrace.o +nfs-$(CONFIG_ROOT_NFS) += nfsroot.o +nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_NFS_V2) += nfsv2.o +nfsv2-y := nfs2super.o proc.o nfs2xdr.o + +obj-$(CONFIG_NFS_V3) += nfsv3.o +nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o +nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o + +obj-$(CONFIG_NFS_V4) += nfsv4.o +CFLAGS_nfs4trace.o += -I$(src) +nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ + delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ + dns_resolve.o nfs4trace.o +nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o +nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o +nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o +nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/ diff --git a/kernel/fs/nfs/blocklayout/Makefile b/kernel/fs/nfs/blocklayout/Makefile new file mode 100644 index 000000000..3ca14c36d --- /dev/null +++ b/kernel/fs/nfs/blocklayout/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module +# +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o + +blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o diff --git a/kernel/fs/nfs/blocklayout/blocklayout.c b/kernel/fs/nfs/blocklayout/blocklayout.c new file mode 100644 index 000000000..d2554fe14 --- /dev/null +++ b/kernel/fs/nfs/blocklayout/blocklayout.c @@ -0,0 +1,928 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Fred Isaman + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include +#include +#include +#include +#include /* struct bio */ +#include +#include + +#include "../pnfs.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andy Adamson "); +MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); + +static bool is_hole(struct pnfs_block_extent *be) +{ + switch (be->be_state) { + case PNFS_BLOCK_NONE_DATA: + return true; + case PNFS_BLOCK_INVALID_DATA: + return be->be_tag ? false : true; + default: + return false; + } +} + +/* The data we are handed might be spread across several bios. We need + * to track when the last one is finished. + */ +struct parallel_io { + struct kref refcnt; + void (*pnfs_callback) (void *data); + void *data; +}; + +static inline struct parallel_io *alloc_parallel(void *data) +{ + struct parallel_io *rv; + + rv = kmalloc(sizeof(*rv), GFP_NOFS); + if (rv) { + rv->data = data; + kref_init(&rv->refcnt); + } + return rv; +} + +static inline void get_parallel(struct parallel_io *p) +{ + kref_get(&p->refcnt); +} + +static void destroy_parallel(struct kref *kref) +{ + struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); + + dprintk("%s enter\n", __func__); + p->pnfs_callback(p->data); + kfree(p); +} + +static inline void put_parallel(struct parallel_io *p) +{ + kref_put(&p->refcnt, destroy_parallel); +} + +static struct bio * +bl_submit_bio(int rw, struct bio *bio) +{ + if (bio) { + get_parallel(bio->bi_private); + dprintk("%s submitting %s bio %u@%llu\n", __func__, + rw == READ ? "read" : "write", bio->bi_iter.bi_size, + (unsigned long long)bio->bi_iter.bi_sector); + submit_bio(rw, bio); + } + return NULL; +} + +static struct bio * +bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, + void (*end_io)(struct bio *, int err), struct parallel_io *par) +{ + struct bio *bio; + + npg = min(npg, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, npg); + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (npg /= 2)) + bio = bio_alloc(GFP_NOIO, npg); + } + + if (bio) { + bio->bi_iter.bi_sector = disk_sector; + bio->bi_bdev = bdev; + bio->bi_end_io = end_io; + bio->bi_private = par; + } + return bio; +} + +static struct bio * +do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, + struct page *page, struct pnfs_block_dev_map *map, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par, unsigned int offset, int *len) +{ + struct pnfs_block_dev *dev = + container_of(be->be_device, struct pnfs_block_dev, node); + u64 disk_addr, end; + + dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, + npg, rw, (unsigned long long)isect, offset, *len); + + /* translate to device offset */ + isect += be->be_v_offset; + isect -= be->be_f_offset; + + /* translate to physical disk offset */ + disk_addr = (u64)isect << SECTOR_SHIFT; + if (disk_addr < map->start || disk_addr >= map->start + map->len) { + if (!dev->map(dev, disk_addr, map)) + return ERR_PTR(-EIO); + bio = bl_submit_bio(rw, bio); + } + disk_addr += map->disk_offset; + disk_addr -= map->start; + + /* limit length to what the device mapping allows */ + end = disk_addr + *len; + if (end >= map->start + map->len) + *len = map->start + map->len - disk_addr; + +retry: + if (!bio) { + bio = bl_alloc_init_bio(npg, map->bdev, + disk_addr >> SECTOR_SHIFT, end_io, par); + if (!bio) + return ERR_PTR(-ENOMEM); + } + if (bio_add_page(bio, page, *len, offset) < *len) { + bio = bl_submit_bio(rw, bio); + goto retry; + } + return bio; +} + +static void bl_end_io_read(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + + if (err) { + struct nfs_pgio_header *header = par->data; + + if (!header->pnfs_error) + header->pnfs_error = -EIO; + pnfs_set_lo_fail(header->lseg); + } + + bio_put(bio); + put_parallel(par); +} + +static void bl_read_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_pgio_header *hdr; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + hdr = container_of(task, struct nfs_pgio_header, task); + pnfs_ld_read_done(hdr); +} + +static void +bl_end_par_io_read(void *data) +{ + struct nfs_pgio_header *hdr = data; + + hdr->task.tk_status = hdr->pnfs_error; + INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup); + schedule_work(&hdr->task.u.tk_work); +} + +static enum pnfs_try_status +bl_read_pagelist(struct nfs_pgio_header *header) +{ + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; + struct bio *bio = NULL; + struct pnfs_block_extent be; + sector_t isect, extent_length = 0; + struct parallel_io *par; + loff_t f_offset = header->args.offset; + size_t bytes_left = header->args.count; + unsigned int pg_offset, pg_len; + struct page **pages = header->args.pages; + int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; + const bool is_dio = (header->dreq != NULL); + struct blk_plug plug; + int i; + + dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, + header->page_array.npages, f_offset, + (unsigned int)header->args.count); + + par = alloc_parallel(header); + if (!par) + return PNFS_NOT_ATTEMPTED; + par->pnfs_callback = bl_end_par_io_read; + + blk_start_plug(&plug); + + isect = (sector_t) (f_offset >> SECTOR_SHIFT); + /* Code assumes extents are page-aligned */ + for (i = pg_index; i < header->page_array.npages; i++) { + if (extent_length <= 0) { + /* We've used up the previous extent */ + bio = bl_submit_bio(READ, bio); + + /* Get the next one */ + if (!ext_tree_lookup(bl, isect, &be, false)) { + header->pnfs_error = -EIO; + goto out; + } + extent_length = be.be_length - (isect - be.be_f_offset); + } + + pg_offset = f_offset & ~PAGE_CACHE_MASK; + if (is_dio) { + if (pg_offset + bytes_left > PAGE_CACHE_SIZE) + pg_len = PAGE_CACHE_SIZE - pg_offset; + else + pg_len = bytes_left; + } else { + BUG_ON(pg_offset != 0); + pg_len = PAGE_CACHE_SIZE; + } + + isect += (pg_offset >> SECTOR_SHIFT); + extent_length -= (pg_offset >> SECTOR_SHIFT); + + if (is_hole(&be)) { + bio = bl_submit_bio(READ, bio); + /* Fill hole w/ zeroes w/o accessing device */ + dprintk("%s Zeroing page for hole\n", __func__); + zero_user_segment(pages[i], pg_offset, pg_len); + + /* invalidate map */ + map.start = NFS4_MAX_UINT64; + } else { + bio = do_add_page_to_bio(bio, + header->page_array.npages - i, + READ, + isect, pages[i], &map, &be, + bl_end_io_read, par, + pg_offset, &pg_len); + if (IS_ERR(bio)) { + header->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + } + isect += (pg_len >> SECTOR_SHIFT); + extent_length -= (pg_len >> SECTOR_SHIFT); + f_offset += pg_len; + bytes_left -= pg_len; + } + if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { + header->res.eof = 1; + header->res.count = header->inode->i_size - header->args.offset; + } else { + header->res.count = (isect << SECTOR_SHIFT) - header->args.offset; + } +out: + bl_submit_bio(READ, bio); + blk_finish_plug(&plug); + put_parallel(par); + return PNFS_ATTEMPTED; +} + +static void bl_end_io_write(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nfs_pgio_header *header = par->data; + + if (!uptodate) { + if (!header->pnfs_error) + header->pnfs_error = -EIO; + pnfs_set_lo_fail(header->lseg); + } + bio_put(bio); + put_parallel(par); +} + +/* Function scheduled for call during bl_end_par_io_write, + * it marks sectors as written and extends the commitlist. + */ +static void bl_write_cleanup(struct work_struct *work) +{ + struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work); + struct nfs_pgio_header *hdr = + container_of(task, struct nfs_pgio_header, task); + + dprintk("%s enter\n", __func__); + + if (likely(!hdr->pnfs_error)) { + struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg); + u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK; + u64 end = (hdr->args.offset + hdr->args.count + + PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK; + + ext_tree_mark_written(bl, start >> SECTOR_SHIFT, + (end - start) >> SECTOR_SHIFT); + } + + pnfs_ld_write_done(hdr); +} + +/* Called when last of bios associated with a bl_write_pagelist call finishes */ +static void bl_end_par_io_write(void *data) +{ + struct nfs_pgio_header *hdr = data; + + hdr->task.tk_status = hdr->pnfs_error; + hdr->verf.committed = NFS_FILE_SYNC; + INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup); + schedule_work(&hdr->task.u.tk_work); +} + +static enum pnfs_try_status +bl_write_pagelist(struct nfs_pgio_header *header, int sync) +{ + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; + struct bio *bio = NULL; + struct pnfs_block_extent be; + sector_t isect, extent_length = 0; + struct parallel_io *par = NULL; + loff_t offset = header->args.offset; + size_t count = header->args.count; + struct page **pages = header->args.pages; + int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; + unsigned int pg_len; + struct blk_plug plug; + int i; + + dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + + /* At this point, header->page_aray is a (sequential) list of nfs_pages. + * We want to write each, and if there is an error set pnfs_error + * to have it redone using nfs. + */ + par = alloc_parallel(header); + if (!par) + return PNFS_NOT_ATTEMPTED; + par->pnfs_callback = bl_end_par_io_write; + + blk_start_plug(&plug); + + /* we always write out the whole page */ + offset = offset & (loff_t)PAGE_CACHE_MASK; + isect = offset >> SECTOR_SHIFT; + + for (i = pg_index; i < header->page_array.npages; i++) { + if (extent_length <= 0) { + /* We've used up the previous extent */ + bio = bl_submit_bio(WRITE, bio); + /* Get the next one */ + if (!ext_tree_lookup(bl, isect, &be, true)) { + header->pnfs_error = -EINVAL; + goto out; + } + + extent_length = be.be_length - (isect - be.be_f_offset); + } + + pg_len = PAGE_CACHE_SIZE; + bio = do_add_page_to_bio(bio, header->page_array.npages - i, + WRITE, isect, pages[i], &map, &be, + bl_end_io_write, par, + 0, &pg_len); + if (IS_ERR(bio)) { + header->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + + offset += pg_len; + count -= pg_len; + isect += (pg_len >> SECTOR_SHIFT); + extent_length -= (pg_len >> SECTOR_SHIFT); + } + + header->res.count = header->args.count; +out: + bl_submit_bio(WRITE, bio); + blk_finish_plug(&plug); + put_parallel(par); + return PNFS_ATTEMPTED; +} + +static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + int err; + + dprintk("%s enter\n", __func__); + + err = ext_tree_remove(bl, true, 0, LLONG_MAX); + WARN_ON(err); + + kfree(bl); +} + +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, + gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl; + + dprintk("%s enter\n", __func__); + bl = kzalloc(sizeof(*bl), gfp_flags); + if (!bl) + return NULL; + + bl->bl_ext_rw = RB_ROOT; + bl->bl_ext_ro = RB_ROOT; + spin_lock_init(&bl->bl_ext_lock); + + return &bl->bl_layout; +} + +static void bl_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s enter\n", __func__); + kfree(lseg); +} + +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { + u32 mode; /* R or RW */ + u64 start; /* Expected start of next non-COW extent */ + u64 inval; /* Start of INVAL coverage */ + u64 cowread; /* End of COW read coverage */ +}; + +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, + struct layout_verification *lv) +{ + if (lv->mode == IOMODE_READ) { + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA) + return -EIO; + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } + /* lv->mode == IOMODE_RW */ + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + if (lv->cowread > lv->start) + return -EIO; + lv->start += be->be_length; + lv->inval = lv->start; + return 0; + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { + if (be->be_f_offset > lv->start) + return -EIO; + if (be->be_f_offset < lv->inval) + return -EIO; + if (be->be_f_offset < lv->cowread) + return -EIO; + /* It looks like you might want to min this with lv->start, + * but you really don't. + */ + lv->inval = lv->inval + be->be_length; + lv->cowread = be->be_f_offset + be->be_length; + return 0; + } else + return -EIO; +} + +static int decode_sector_number(__be32 **rp, sector_t *sp) +{ + uint64_t s; + + *rp = xdr_decode_hyper(*rp, &s); + if (s & 0x1ff) { + printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); + return -1; + } + *sp = s >> SECTOR_SHIFT; + return 0; +} + +static int +bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, + struct layout_verification *lv, struct list_head *extents, + gfp_t gfp_mask) +{ + struct pnfs_block_extent *be; + struct nfs4_deviceid id; + int error; + __be32 *p; + + p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE); + if (!p) + return -EIO; + + be = kzalloc(sizeof(*be), GFP_NOFS); + if (!be) + return -ENOMEM; + + memcpy(&id, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + error = -EIO; + be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + lo->plh_lc_cred, gfp_mask); + if (!be->be_device) + goto out_free_be; + + /* + * The next three values are read in as bytes, but stored in the + * extent structure in 512-byte granularity. + */ + if (decode_sector_number(&p, &be->be_f_offset) < 0) + goto out_put_deviceid; + if (decode_sector_number(&p, &be->be_length) < 0) + goto out_put_deviceid; + if (decode_sector_number(&p, &be->be_v_offset) < 0) + goto out_put_deviceid; + be->be_state = be32_to_cpup(p++); + + error = verify_extent(be, lv); + if (error) { + dprintk("%s: extent verification failed\n", __func__); + goto out_put_deviceid; + } + + list_add_tail(&be->be_list, extents); + return 0; + +out_put_deviceid: + nfs4_put_deviceid_node(be->be_device); +out_free_be: + kfree(be); + return error; +} + +static struct pnfs_layout_segment * +bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, + gfp_t gfp_mask) +{ + struct layout_verification lv = { + .mode = lgr->range.iomode, + .start = lgr->range.offset >> SECTOR_SHIFT, + .inval = lgr->range.offset >> SECTOR_SHIFT, + .cowread = lgr->range.offset >> SECTOR_SHIFT, + }; + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + struct pnfs_layout_segment *lseg; + struct xdr_buf buf; + struct xdr_stream xdr; + struct page *scratch; + int status, i; + uint32_t count; + __be32 *p; + LIST_HEAD(extents); + + dprintk("---> %s\n", __func__); + + lseg = kzalloc(sizeof(*lseg), gfp_mask); + if (!lseg) + return ERR_PTR(-ENOMEM); + + status = -ENOMEM; + scratch = alloc_page(gfp_mask); + if (!scratch) + goto out; + + xdr_init_decode_pages(&xdr, &buf, + lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + + status = -EIO; + p = xdr_inline_decode(&xdr, 4); + if (unlikely(!p)) + goto out_free_scratch; + + count = be32_to_cpup(p++); + dprintk("%s: number of extents %d\n", __func__, count); + + /* + * Decode individual extents, putting them in temporary staging area + * until whole layout is decoded to make error recovery easier. + */ + for (i = 0; i < count; i++) { + status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask); + if (status) + goto process_extents; + } + + if (lgr->range.offset + lgr->range.length != + lv.start << SECTOR_SHIFT) { + dprintk("%s Final length mismatch\n", __func__); + status = -EIO; + goto process_extents; + } + + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + status = -EIO; + } + +process_extents: + while (!list_empty(&extents)) { + struct pnfs_block_extent *be = + list_first_entry(&extents, struct pnfs_block_extent, + be_list); + list_del(&be->be_list); + + if (!status) + status = ext_tree_insert(bl, be); + + if (status) { + nfs4_put_deviceid_node(be->be_device); + kfree(be); + } + } + +out_free_scratch: + __free_page(scratch); +out: + dprintk("%s returns %d\n", __func__, status); + if (status) { + kfree(lseg); + return ERR_PTR(status); + } + return lseg; +} + +static void +bl_return_range(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + sector_t offset = range->offset >> SECTOR_SHIFT, end; + + if (range->offset % 8) { + dprintk("%s: offset %lld not block size aligned\n", + __func__, range->offset); + return; + } + + if (range->length != NFS4_MAX_UINT64) { + if (range->length % 8) { + dprintk("%s: length %lld not block size aligned\n", + __func__, range->length); + return; + } + + end = offset + (range->length >> SECTOR_SHIFT); + } else { + end = round_down(NFS4_MAX_UINT64, PAGE_SIZE); + } + + ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end); +} + +static int +bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg) +{ + return ext_tree_prepare_commit(arg); +} + +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +{ + ext_tree_mark_committed(&lcdata->args, lcdata->res.status); +} + +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ + dprintk("%s enter\n", __func__); + + if (server->pnfs_blksize == 0) { + dprintk("%s Server did not return blksize\n", __func__); + return -EINVAL; + } + if (server->pnfs_blksize > PAGE_SIZE) { + printk(KERN_ERR "%s: pNFS blksize %d not supported.\n", + __func__, server->pnfs_blksize); + return -EINVAL; + } + + return 0; +} + +static bool +is_aligned_req(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, unsigned int alignment) +{ + /* + * Always accept buffered writes, higher layers take care of the + * right alignment. + */ + if (pgio->pg_dreq == NULL) + return true; + + if (!IS_ALIGNED(req->wb_offset, alignment)) + return false; + + if (IS_ALIGNED(req->wb_bytes, alignment)) + return true; + + if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) { + /* + * If the write goes up to the inode size, just write + * the full page. Data past the inode size is + * guaranteed to be zeroed by the higher level client + * code, and this behaviour is mandated by RFC 5663 + * section 2.3.2. + */ + return true; + } + + return false; +} + +static void +bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (!is_aligned_req(pgio, req, SECTOR_SIZE)) { + nfs_pageio_reset_read_mds(pgio); + return; + } + + pnfs_generic_pg_init_read(pgio, req); +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (!is_aligned_req(pgio, req, SECTOR_SIZE)) + return 0; + return pnfs_generic_pg_test(pgio, prev, req); +} + +/* + * Return the number of contiguous bytes for a given inode + * starting at page frame idx. + */ +static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t end; + + /* Optimize common case that writes from 0 to end of file */ + end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + if (end != inode->i_mapping->nrpages) { + rcu_read_lock(); + end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); + rcu_read_unlock(); + } + + if (!end) + return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT); + else + return (end - idx) << PAGE_CACHE_SHIFT; +} + +static void +bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + u64 wb_size; + + if (!is_aligned_req(pgio, req, PAGE_SIZE)) { + nfs_pageio_reset_write_mds(pgio); + return; + } + + if (pgio->pg_dreq == NULL) + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, + req->wb_index); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (!is_aligned_req(pgio, req, PAGE_SIZE)) + return 0; + return pnfs_generic_pg_test(pgio, prev, req); +} + +static const struct nfs_pageio_ops bl_pg_read_ops = { + .pg_init = bl_pg_init_read, + .pg_test = bl_pg_test_read, + .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static const struct nfs_pageio_ops bl_pg_write_ops = { + .pg_init = bl_pg_init_write, + .pg_test = bl_pg_test_write, + .pg_doio = pnfs_generic_pg_writepages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static struct pnfs_layoutdriver_type blocklayout_type = { + .id = LAYOUT_BLOCK_VOLUME, + .name = "LAYOUT_BLOCK_VOLUME", + .owner = THIS_MODULE, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_READ_WHOLE_PAGE, + .read_pagelist = bl_read_pagelist, + .write_pagelist = bl_write_pagelist, + .alloc_layout_hdr = bl_alloc_layout_hdr, + .free_layout_hdr = bl_free_layout_hdr, + .alloc_lseg = bl_alloc_lseg, + .free_lseg = bl_free_lseg, + .return_range = bl_return_range, + .prepare_layoutcommit = bl_prepare_layoutcommit, + .cleanup_layoutcommit = bl_cleanup_layoutcommit, + .set_layoutdriver = bl_set_layoutdriver, + .alloc_deviceid_node = bl_alloc_deviceid_node, + .free_deviceid_node = bl_free_deviceid_node, + .pg_read_ops = &bl_pg_read_ops, + .pg_write_ops = &bl_pg_write_ops, + .sync = pnfs_generic_sync, +}; + +static int __init nfs4blocklayout_init(void) +{ + int ret; + + dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); + + ret = pnfs_register_layoutdriver(&blocklayout_type); + if (ret) + goto out; + ret = bl_init_pipefs(); + if (ret) + goto out_unregister; + return 0; + +out_unregister: + pnfs_unregister_layoutdriver(&blocklayout_type); +out: + return ret; +} + +static void __exit nfs4blocklayout_exit(void) +{ + dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", + __func__); + + bl_cleanup_pipefs(); + pnfs_unregister_layoutdriver(&blocklayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-3"); + +module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff --git a/kernel/fs/nfs/blocklayout/blocklayout.h b/kernel/fs/nfs/blocklayout/blocklayout.h new file mode 100644 index 000000000..92dca9e90 --- /dev/null +++ b/kernel/fs/nfs/blocklayout/blocklayout.h @@ -0,0 +1,204 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Fred Isaman + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H +#define FS_NFS_NFS4BLOCKLAYOUT_H + +#include +#include +#include + +#include "../nfs4_fs.h" +#include "../pnfs.h" +#include "../netns.h" + +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) +#define SECTOR_SIZE (1 << SECTOR_SHIFT) + +struct pnfs_block_dev; + +enum pnfs_block_volume_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + PNFS_BLOCK_VOLUME_STRIPE = 3, +}; + +#define PNFS_BLOCK_MAX_UUIDS 4 +#define PNFS_BLOCK_MAX_DEVICES 64 + +/* + * Random upper cap for the uuid length to avoid unbounded allocation. + * Not actually limited by the protocol. + */ +#define PNFS_BLOCK_UUID_LEN 128 + + +struct pnfs_block_volume { + enum pnfs_block_volume_type type; + union { + struct { + int len; + int nr_sigs; + struct { + u64 offset; + u32 sig_len; + u8 sig[PNFS_BLOCK_UUID_LEN]; + } sigs[PNFS_BLOCK_MAX_UUIDS]; + } simple; + struct { + u64 start; + u64 len; + u32 volume; + } slice; + struct { + u32 volumes_count; + u32 volumes[PNFS_BLOCK_MAX_DEVICES]; + } concat; + struct { + u64 chunk_size; + u32 volumes_count; + u32 volumes[PNFS_BLOCK_MAX_DEVICES]; + } stripe; + }; +}; + +struct pnfs_block_dev_map { + sector_t start; + sector_t len; + + sector_t disk_offset; + struct block_device *bdev; +}; + +struct pnfs_block_dev { + struct nfs4_deviceid_node node; + + u64 start; + u64 len; + + u32 nr_children; + struct pnfs_block_dev *children; + u64 chunk_size; + + struct block_device *bdev; + u64 disk_offset; + + bool (*map)(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map); +}; + +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +}; + +/* sector_t fields are all in 512-byte sectors */ +struct pnfs_block_extent { + union { + struct rb_node be_node; + struct list_head be_list; + }; + struct nfs4_deviceid_node *be_device; + sector_t be_f_offset; /* the starting offset in the file */ + sector_t be_length; /* the size of the extent */ + sector_t be_v_offset; /* the starting offset in the volume */ + enum exstate4 be_state; /* the state of this extent */ +#define EXTENT_WRITTEN 1 +#define EXTENT_COMMITTING 2 + unsigned int be_tag; +}; + +/* on the wire size of the extent */ +#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) + +struct pnfs_block_layout { + struct pnfs_layout_hdr bl_layout; + struct rb_root bl_ext_rw; + struct rb_root bl_ext_ro; + spinlock_t bl_ext_lock; /* Protects list manipulation */ +}; + +static inline struct pnfs_block_layout * +BLK_LO2EXT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct pnfs_block_layout, bl_layout); +} + +static inline struct pnfs_block_layout * +BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) +{ + return BLK_LO2EXT(lseg->pls_layout); +} + +struct bl_pipe_msg { + struct rpc_pipe_msg msg; + wait_queue_head_t *bl_wq; +}; + +struct bl_msg_hdr { + u8 type; + u16 totallen; /* length of entire message, including hdr itself */ +}; + +#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ +#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ +#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ +#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ +#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ + +/* dev.c */ +struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_mask); +void bl_free_deviceid_node(struct nfs4_deviceid_node *d); + +/* extent_tree.c */ +int ext_tree_insert(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, + sector_t end); +int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len); +bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw); +int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); +void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status); + +/* rpc_pipefs.c */ +dev_t bl_resolve_deviceid(struct nfs_server *server, + struct pnfs_block_volume *b, gfp_t gfp_mask); +int __init bl_init_pipefs(void); +void __exit bl_cleanup_pipefs(void); + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/kernel/fs/nfs/blocklayout/dev.c b/kernel/fs/nfs/blocklayout/dev.c new file mode 100644 index 000000000..e535599a0 --- /dev/null +++ b/kernel/fs/nfs/blocklayout/dev.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include +#include +#include +#include +#include + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void +bl_free_device(struct pnfs_block_dev *dev) +{ + if (dev->nr_children) { + int i; + + for (i = 0; i < dev->nr_children; i++) + bl_free_device(&dev->children[i]); + kfree(dev->children); + } else { + if (dev->bdev) + blkdev_put(dev->bdev, FMODE_READ); + } +} + +void +bl_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct pnfs_block_dev *dev = + container_of(d, struct pnfs_block_dev, node); + + bl_free_device(dev); + kfree_rcu(dev, node.rcu); +} + +static int +nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) +{ + __be32 *p; + int i; + + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->type = be32_to_cpup(p++); + + switch (b->type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->simple.nr_sigs = be32_to_cpup(p++); + if (!b->simple.nr_sigs) { + dprintk("no signature\n"); + return -EIO; + } + + b->simple.len = 4 + 4; + for (i = 0; i < b->simple.nr_sigs; i++) { + p = xdr_inline_decode(xdr, 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); + b->simple.sigs[i].sig_len = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); + if (!p) + return -EIO; + memcpy(&b->simple.sigs[i].sig, p, + b->simple.sigs[i].sig_len); + + b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; + } + break; + case PNFS_BLOCK_VOLUME_SLICE: + p = xdr_inline_decode(xdr, 8 + 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->slice.start); + p = xdr_decode_hyper(p, &b->slice.len); + b->slice.volume = be32_to_cpup(p++); + break; + case PNFS_BLOCK_VOLUME_CONCAT: + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->concat.volumes_count = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); + if (!p) + return -EIO; + for (i = 0; i < b->concat.volumes_count; i++) + b->concat.volumes[i] = be32_to_cpup(p++); + break; + case PNFS_BLOCK_VOLUME_STRIPE: + p = xdr_inline_decode(xdr, 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->stripe.chunk_size); + b->stripe.volumes_count = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); + if (!p) + return -EIO; + for (i = 0; i < b->stripe.volumes_count; i++) + b->stripe.volumes[i] = be32_to_cpup(p++); + break; + default: + dprintk("unknown volume type!\n"); + return -EIO; + } + + return 0; +} + +static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + map->start = dev->start; + map->len = dev->len; + map->disk_offset = dev->disk_offset; + map->bdev = dev->bdev; + return true; +} + +static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + int i; + + for (i = 0; i < dev->nr_children; i++) { + struct pnfs_block_dev *child = &dev->children[i]; + + if (child->start > offset || + child->start + child->len <= offset) + continue; + + child->map(child, offset - child->start, map); + return true; + } + + dprintk("%s: ran off loop!\n", __func__); + return false; +} + +static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + struct pnfs_block_dev *child; + u64 chunk; + u32 chunk_idx; + u64 disk_offset; + + chunk = div_u64(offset, dev->chunk_size); + div_u64_rem(chunk, dev->nr_children, &chunk_idx); + + if (chunk_idx > dev->nr_children) { + dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", + __func__, chunk_idx, offset, dev->chunk_size); + /* error, should not happen */ + return false; + } + + /* truncate offset to the beginning of the stripe */ + offset = chunk * dev->chunk_size; + + /* disk offset of the stripe */ + disk_offset = div_u64(offset, dev->nr_children); + + child = &dev->children[chunk_idx]; + child->map(child, disk_offset, map); + + map->start += offset; + map->disk_offset += disk_offset; + map->len = dev->chunk_size; + return true; +} + +static int +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask); + + +static int +bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + dev_t dev; + + dev = bl_resolve_deviceid(server, v, gfp_mask); + if (!dev) + return -EIO; + + d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR(d->bdev)) { + printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", + MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); + return PTR_ERR(d->bdev); + } + + + d->len = i_size_read(d->bdev->bd_inode); + d->map = bl_map_simple; + + printk(KERN_INFO "pNFS: using block device %s\n", + d->bdev->bd_disk->disk_name); + return 0; +} + +static int +bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + int ret; + + ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask); + if (ret) + return ret; + + d->disk_offset = v->slice.start; + d->len = v->slice.len; + return 0; +} + +static int +bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + u64 len = 0; + int ret, i; + + d->children = kcalloc(v->concat.volumes_count, + sizeof(struct pnfs_block_dev), GFP_KERNEL); + if (!d->children) + return -ENOMEM; + + for (i = 0; i < v->concat.volumes_count; i++) { + ret = bl_parse_deviceid(server, &d->children[i], + volumes, v->concat.volumes[i], gfp_mask); + if (ret) + return ret; + + d->nr_children++; + d->children[i].start += len; + len += d->children[i].len; + } + + d->len = len; + d->map = bl_map_concat; + return 0; +} + +static int +bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + u64 len = 0; + int ret, i; + + d->children = kcalloc(v->stripe.volumes_count, + sizeof(struct pnfs_block_dev), GFP_KERNEL); + if (!d->children) + return -ENOMEM; + + for (i = 0; i < v->stripe.volumes_count; i++) { + ret = bl_parse_deviceid(server, &d->children[i], + volumes, v->stripe.volumes[i], gfp_mask); + if (ret) + return ret; + + d->nr_children++; + len += d->children[i].len; + } + + d->len = len; + d->chunk_size = v->stripe.chunk_size; + d->map = bl_map_stripe; + return 0; +} + +static int +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + switch (volumes[idx].type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + return bl_parse_simple(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_SLICE: + return bl_parse_slice(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_CONCAT: + return bl_parse_concat(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_STRIPE: + return bl_parse_stripe(server, d, volumes, idx, gfp_mask); + default: + dprintk("unsupported volume type: %d\n", volumes[idx].type); + return -EIO; + } +} + +struct nfs4_deviceid_node * +bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_mask) +{ + struct nfs4_deviceid_node *node = NULL; + struct pnfs_block_volume *volumes; + struct pnfs_block_dev *top; + struct xdr_stream xdr; + struct xdr_buf buf; + struct page *scratch; + int nr_volumes, ret, i; + __be32 *p; + + scratch = alloc_page(gfp_mask); + if (!scratch) + goto out; + + xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + + p = xdr_inline_decode(&xdr, sizeof(__be32)); + if (!p) + goto out_free_scratch; + nr_volumes = be32_to_cpup(p++); + + volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume), + gfp_mask); + if (!volumes) + goto out_free_scratch; + + for (i = 0; i < nr_volumes; i++) { + ret = nfs4_block_decode_volume(&xdr, &volumes[i]); + if (ret < 0) + goto out_free_volumes; + } + + top = kzalloc(sizeof(*top), gfp_mask); + if (!top) + goto out_free_volumes; + + ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); + if (ret) { + bl_free_device(top); + kfree(top); + goto out_free_volumes; + } + + node = &top->node; + nfs4_init_deviceid_node(node, server, &pdev->dev_id); + +out_free_volumes: + kfree(volumes); +out_free_scratch: + __free_page(scratch); +out: + return node; +} diff --git a/kernel/fs/nfs/blocklayout/extent_tree.c b/kernel/fs/nfs/blocklayout/extent_tree.c new file mode 100644 index 000000000..31d0b5e53 --- /dev/null +++ b/kernel/fs/nfs/blocklayout/extent_tree.c @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ + +#include + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static inline struct pnfs_block_extent * +ext_node(struct rb_node *node) +{ + return rb_entry(node, struct pnfs_block_extent, be_node); +} + +static struct pnfs_block_extent * +ext_tree_first(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + return node ? ext_node(node) : NULL; +} + +static struct pnfs_block_extent * +ext_tree_prev(struct pnfs_block_extent *be) +{ + struct rb_node *node = rb_prev(&be->be_node); + return node ? ext_node(node) : NULL; +} + +static struct pnfs_block_extent * +ext_tree_next(struct pnfs_block_extent *be) +{ + struct rb_node *node = rb_next(&be->be_node); + return node ? ext_node(node) : NULL; +} + +static inline sector_t +ext_f_end(struct pnfs_block_extent *be) +{ + return be->be_f_offset + be->be_length; +} + +static struct pnfs_block_extent * +__ext_tree_search(struct rb_root *root, sector_t start) +{ + struct rb_node *node = root->rb_node; + struct pnfs_block_extent *be = NULL; + + while (node) { + be = ext_node(node); + if (start < be->be_f_offset) + node = node->rb_left; + else if (start >= ext_f_end(be)) + node = node->rb_right; + else + return be; + } + + if (be) { + if (start < be->be_f_offset) + return be; + + if (start >= ext_f_end(be)) + return ext_tree_next(be); + } + + return NULL; +} + +static bool +ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2) +{ + if (be1->be_state != be2->be_state) + return false; + if (be1->be_device != be2->be_device) + return false; + + if (be1->be_f_offset + be1->be_length != be2->be_f_offset) + return false; + + if (be1->be_state != PNFS_BLOCK_NONE_DATA && + (be1->be_v_offset + be1->be_length != be2->be_v_offset)) + return false; + + if (be1->be_state == PNFS_BLOCK_INVALID_DATA && + be1->be_tag != be2->be_tag) + return false; + + return true; +} + +static struct pnfs_block_extent * +ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be) +{ + struct pnfs_block_extent *left = ext_tree_prev(be); + + if (left && ext_can_merge(left, be)) { + left->be_length += be->be_length; + rb_erase(&be->be_node, root); + nfs4_put_deviceid_node(be->be_device); + kfree(be); + return left; + } + + return be; +} + +static struct pnfs_block_extent * +ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) +{ + struct pnfs_block_extent *right = ext_tree_next(be); + + if (right && ext_can_merge(be, right)) { + be->be_length += right->be_length; + rb_erase(&right->be_node, root); + nfs4_put_deviceid_node(right->be_device); + kfree(right); + } + + return be; +} + +static void +__ext_tree_insert(struct rb_root *root, + struct pnfs_block_extent *new, bool merge_ok) +{ + struct rb_node **p = &root->rb_node, *parent = NULL; + struct pnfs_block_extent *be; + + while (*p) { + parent = *p; + be = ext_node(parent); + + if (new->be_f_offset < be->be_f_offset) { + if (merge_ok && ext_can_merge(new, be)) { + be->be_f_offset = new->be_f_offset; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + be->be_v_offset = new->be_v_offset; + be->be_length += new->be_length; + be = ext_try_to_merge_left(root, be); + goto free_new; + } + p = &(*p)->rb_left; + } else if (new->be_f_offset >= ext_f_end(be)) { + if (merge_ok && ext_can_merge(be, new)) { + be->be_length += new->be_length; + be = ext_try_to_merge_right(root, be); + goto free_new; + } + p = &(*p)->rb_right; + } else { + BUG(); + } + } + + rb_link_node(&new->be_node, parent, p); + rb_insert_color(&new->be_node, root); + return; +free_new: + nfs4_put_deviceid_node(new->be_device); + kfree(new); +} + +static int +__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) +{ + struct pnfs_block_extent *be; + sector_t len1 = 0, len2 = 0; + sector_t orig_v_offset; + sector_t orig_len; + + be = __ext_tree_search(root, start); + if (!be) + return 0; + if (be->be_f_offset >= end) + return 0; + + orig_v_offset = be->be_v_offset; + orig_len = be->be_length; + + if (start > be->be_f_offset) + len1 = start - be->be_f_offset; + if (ext_f_end(be) > end) + len2 = ext_f_end(be) - end; + + if (len2 > 0) { + if (len1 > 0) { + struct pnfs_block_extent *new; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + be->be_length = len1; + + new->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) { + new->be_v_offset = + orig_v_offset + orig_len - len2; + } + new->be_length = len2; + new->be_state = be->be_state; + new->be_tag = be->be_tag; + new->be_device = nfs4_get_deviceid(be->be_device); + + __ext_tree_insert(root, new, true); + } else { + be->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) { + be->be_v_offset = + orig_v_offset + orig_len - len2; + } + be->be_length = len2; + } + } else { + if (len1 > 0) { + be->be_length = len1; + be = ext_tree_next(be); + } + + while (be && ext_f_end(be) <= end) { + struct pnfs_block_extent *next = ext_tree_next(be); + + rb_erase(&be->be_node, root); + nfs4_put_deviceid_node(be->be_device); + kfree(be); + be = next; + } + + if (be && be->be_f_offset < end) { + len1 = ext_f_end(be) - end; + be->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + be->be_v_offset += be->be_length - len1; + be->be_length = len1; + } + } + + return 0; +} + +int +ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be; + struct rb_root *root; + int err = 0; + + switch (new->be_state) { + case PNFS_BLOCK_READWRITE_DATA: + case PNFS_BLOCK_INVALID_DATA: + root = &bl->bl_ext_rw; + break; + case PNFS_BLOCK_READ_DATA: + case PNFS_BLOCK_NONE_DATA: + root = &bl->bl_ext_ro; + break; + default: + dprintk("invalid extent type\n"); + return -EINVAL; + } + + spin_lock(&bl->bl_ext_lock); +retry: + be = __ext_tree_search(root, new->be_f_offset); + if (!be || be->be_f_offset >= ext_f_end(new)) { + __ext_tree_insert(root, new, true); + } else if (new->be_f_offset >= be->be_f_offset) { + if (ext_f_end(new) <= ext_f_end(be)) { + nfs4_put_deviceid_node(new->be_device); + kfree(new); + } else { + sector_t new_len = ext_f_end(new) - ext_f_end(be); + sector_t diff = new->be_length - new_len; + + new->be_f_offset += diff; + new->be_v_offset += diff; + new->be_length = new_len; + goto retry; + } + } else if (ext_f_end(new) <= ext_f_end(be)) { + new->be_length = be->be_f_offset - new->be_f_offset; + __ext_tree_insert(root, new, true); + } else { + struct pnfs_block_extent *split; + sector_t new_len = ext_f_end(new) - ext_f_end(be); + sector_t diff = new->be_length - new_len; + + split = kmemdup(new, sizeof(*new), GFP_ATOMIC); + if (!split) { + err = -EINVAL; + goto out; + } + + split->be_length = be->be_f_offset - split->be_f_offset; + split->be_device = nfs4_get_deviceid(new->be_device); + __ext_tree_insert(root, split, true); + + new->be_f_offset += diff; + new->be_v_offset += diff; + new->be_length = new_len; + goto retry; + } +out: + spin_unlock(&bl->bl_ext_lock); + return err; +} + +static bool +__ext_tree_lookup(struct rb_root *root, sector_t isect, + struct pnfs_block_extent *ret) +{ + struct rb_node *node; + struct pnfs_block_extent *be; + + node = root->rb_node; + while (node) { + be = ext_node(node); + if (isect < be->be_f_offset) + node = node->rb_left; + else if (isect >= ext_f_end(be)) + node = node->rb_right; + else { + *ret = *be; + return true; + } + } + + return false; +} + +bool +ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw) +{ + bool found = false; + + spin_lock(&bl->bl_ext_lock); + if (!rw) + found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret); + if (!found) + found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret); + spin_unlock(&bl->bl_ext_lock); + + return found; +} + +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, + sector_t start, sector_t end) +{ + int err, err2; + + spin_lock(&bl->bl_ext_lock); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + if (rw) { + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); + if (!err) + err = err2; + } + spin_unlock(&bl->bl_ext_lock); + + return err; +} + +static int +ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, + sector_t split) +{ + struct pnfs_block_extent *new; + sector_t orig_len = be->be_length; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + be->be_length = split - be->be_f_offset; + + new->be_f_offset = split; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + new->be_v_offset = be->be_v_offset + be->be_length; + new->be_length = orig_len - be->be_length; + new->be_state = be->be_state; + new->be_tag = be->be_tag; + new->be_device = nfs4_get_deviceid(be->be_device); + + __ext_tree_insert(root, new, false); + return 0; +} + +int +ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len) +{ + struct rb_root *root = &bl->bl_ext_rw; + sector_t end = start + len; + struct pnfs_block_extent *be; + int err = 0; + + spin_lock(&bl->bl_ext_lock); + /* + * First remove all COW extents or holes from written to range. + */ + err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + if (err) + goto out; + + /* + * Then mark all invalid extents in the range as written to. + */ + for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) { + if (be->be_f_offset >= end) + break; + + if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag) + continue; + + if (be->be_f_offset < start) { + struct pnfs_block_extent *left = ext_tree_prev(be); + + if (left && ext_can_merge(left, be)) { + sector_t diff = start - be->be_f_offset; + + left->be_length += diff; + + be->be_f_offset += diff; + be->be_v_offset += diff; + be->be_length -= diff; + } else { + err = ext_tree_split(root, be, start); + if (err) + goto out; + } + } + + if (ext_f_end(be) > end) { + struct pnfs_block_extent *right = ext_tree_next(be); + + if (right && ext_can_merge(be, right)) { + sector_t diff = end - be->be_f_offset; + + be->be_length -= diff; + + right->be_f_offset -= diff; + right->be_v_offset -= diff; + right->be_length += diff; + } else { + err = ext_tree_split(root, be, end); + if (err) + goto out; + } + } + + if (be->be_f_offset >= start && ext_f_end(be) <= end) { + be->be_tag = EXTENT_WRITTEN; + be = ext_try_to_merge_left(root, be); + be = ext_try_to_merge_right(root, be); + } + } +out: + spin_unlock(&bl->bl_ext_lock); + return err; +} + +static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, + size_t buffer_size) +{ + if (arg->layoutupdate_pages != &arg->layoutupdate_page) { + int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; + + for (i = 0; i < nr_pages; i++) + put_page(arg->layoutupdate_pages[i]); + kfree(arg->layoutupdate_pages); + } else { + put_page(arg->layoutupdate_page); + } +} + +static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, + size_t buffer_size, size_t *count) +{ + struct pnfs_block_extent *be; + int ret = 0; + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + (*count)++; + if (*count * BL_EXTENT_SIZE > buffer_size) { + /* keep counting.. */ + ret = -ENOSPC; + continue; + } + + p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, + NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + + be->be_tag = EXTENT_COMMITTING; + } + spin_unlock(&bl->bl_ext_lock); + + return ret; +} + +int +ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + size_t count = 0, buffer_size = PAGE_SIZE; + __be32 *start_p; + int ret; + + dprintk("%s enter\n", __func__); + + arg->layoutupdate_page = alloc_page(GFP_NOFS); + if (!arg->layoutupdate_page) + return -ENOMEM; + start_p = page_address(arg->layoutupdate_page); + arg->layoutupdate_pages = &arg->layoutupdate_page; + +retry: + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + if (unlikely(ret)) { + ext_tree_free_commitdata(arg, buffer_size); + + buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + count = 0; + + arg->layoutupdate_pages = + kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE), + sizeof(struct page *), GFP_NOFS); + if (!arg->layoutupdate_pages) + return -ENOMEM; + + start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + if (!start_p) { + kfree(arg->layoutupdate_pages); + return -ENOMEM; + } + + goto retry; + } + + *start_p = cpu_to_be32(count); + arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + + if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { + __be32 *p = start_p; + int i = 0; + + for (p = start_p; + p < start_p + arg->layoutupdate_len; + p += PAGE_SIZE) { + arg->layoutupdate_pages[i++] = vmalloc_to_page(p); + } + } + + dprintk("%s found %zu ranges\n", __func__, count); + return 0; +} + +void +ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + struct rb_root *root = &bl->bl_ext_rw; + struct pnfs_block_extent *be; + + dprintk("%s status %d\n", __func__, status); + + ext_tree_free_commitdata(arg, arg->layoutupdate_len); + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(root); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_COMMITTING) + continue; + + if (status) { + /* + * Mark as written and try again. + * + * XXX: some real error handling here wouldn't hurt.. + */ + be->be_tag = EXTENT_WRITTEN; + } else { + be->be_state = PNFS_BLOCK_READWRITE_DATA; + be->be_tag = 0; + } + + be = ext_try_to_merge_left(root, be); + be = ext_try_to_merge_right(root, be); + } + spin_unlock(&bl->bl_ext_lock); +} diff --git a/kernel/fs/nfs/blocklayout/rpc_pipefs.c b/kernel/fs/nfs/blocklayout/rpc_pipefs.c new file mode 100644 index 000000000..dbe5839cd --- /dev/null +++ b/kernel/fs/nfs/blocklayout/rpc_pipefs.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2006,2007 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Fred Isaman + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include +#include +#include + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void +nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b) +{ + int i; + + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(b->simple.nr_sigs); + for (i = 0; i < b->simple.nr_sigs; i++) { + p = xdr_encode_hyper(p, b->simple.sigs[i].offset); + p = xdr_encode_opaque(p, b->simple.sigs[i].sig, + b->simple.sigs[i].sig_len); + } +} + +dev_t +bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, + gfp_t gfp_mask) +{ + struct net *net = server->nfs_client->cl_net; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct bl_dev_msg *reply = &nn->bl_mount_reply; + struct bl_pipe_msg bl_pipe_msg; + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; + struct bl_msg_hdr *bl_msg; + DECLARE_WAITQUEUE(wq, current); + dev_t dev = 0; + int rc; + + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + + mutex_lock(&nn->bl_mutex); + bl_pipe_msg.bl_wq = &nn->bl_wq; + + b->simple.len += 4; /* single volume */ + if (b->simple.len > PAGE_SIZE) + goto out_unlock; + + memset(msg, 0, sizeof(*msg)); + msg->len = sizeof(*bl_msg) + b->simple.len; + msg->data = kzalloc(msg->len, gfp_mask); + if (!msg->data) + goto out_free_data; + + bl_msg = msg->data; + bl_msg->type = BL_DEVICE_MOUNT, + bl_msg->totallen = b->simple.len; + nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); + + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); + add_wait_queue(&nn->bl_wq, &wq); + rc = rpc_queue_upcall(nn->bl_device_pipe, msg); + if (rc < 0) { + remove_wait_queue(&nn->bl_wq, &wq); + goto out_free_data; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + remove_wait_queue(&nn->bl_wq, &wq); + + if (reply->status != BL_DEVICE_REQUEST_PROC) { + printk(KERN_WARNING "%s failed to decode device: %d\n", + __func__, reply->status); + goto out_free_data; + } + + dev = MKDEV(reply->major, reply->minor); +out_free_data: + kfree(msg->data); +out_unlock: + mutex_unlock(&nn->bl_mutex); + return dev; +} + +static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, + size_t mlen) +{ + struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, + nfs_net_id); + + if (mlen != sizeof (struct bl_dev_msg)) + return -EINVAL; + + if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) + return -EFAULT; + + wake_up(&nn->bl_wq); + + return mlen; +} + +static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct bl_pipe_msg *bl_pipe_msg = + container_of(msg, struct bl_pipe_msg, msg); + + if (msg->errno >= 0) + return; + wake_up(bl_pipe_msg->bl_wq); +} + +static const struct rpc_pipe_ops bl_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + dput(dir); + return dentry; +} + +static void nfs4blocklayout_unregister_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (nn->bl_device_pipe == NULL) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + nn->bl_device_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (nn->bl_device_pipe->dentry) + nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs4blocklayout_block = { + .notifier_call = rpc_pipefs_event, +}; + +static struct dentry *nfs4blocklayout_register_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + struct dentry *dentry; + + pipefs_sb = rpc_get_sb_net(net); + if (!pipefs_sb) + return NULL; + dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void nfs4blocklayout_unregister_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + } +} + +static int nfs4blocklayout_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + + mutex_init(&nn->bl_mutex); + init_waitqueue_head(&nn->bl_wq); + nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); + if (IS_ERR(nn->bl_device_pipe)) + return PTR_ERR(nn->bl_device_pipe); + dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + rpc_destroy_pipe_data(nn->bl_device_pipe); + return PTR_ERR(dentry); + } + nn->bl_device_pipe->dentry = dentry; + return 0; +} + +static void nfs4blocklayout_net_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); + rpc_destroy_pipe_data(nn->bl_device_pipe); + nn->bl_device_pipe = NULL; +} + +static struct pernet_operations nfs4blocklayout_net_ops = { + .init = nfs4blocklayout_net_init, + .exit = nfs4blocklayout_net_exit, +}; + +int __init bl_init_pipefs(void) +{ + int ret; + + ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + if (ret) + goto out; + ret = register_pernet_subsys(&nfs4blocklayout_net_ops); + if (ret) + goto out_unregister_notifier; + return 0; + +out_unregister_notifier: + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); +out: + return ret; +} + +void __exit bl_cleanup_pipefs(void) +{ + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); + unregister_pernet_subsys(&nfs4blocklayout_net_ops); +} diff --git a/kernel/fs/nfs/cache_lib.c b/kernel/fs/nfs/cache_lib.c new file mode 100644 index 000000000..5f7b05372 --- /dev/null +++ b/kernel/fs/nfs/cache_lib.c @@ -0,0 +1,158 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd) +{ + int ret; + struct dentry *dir; + + dir = rpc_d_lookup_sb(sb, "cache"); + ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); + dput(dir); + return ret; +} + +int nfs_cache_register_net(struct net *net, struct cache_detail *cd) +{ + struct super_block *pipefs_sb; + int ret = 0; + + sunrpc_init_cache_detail(cd); + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + ret = nfs_cache_register_sb(pipefs_sb, cd); + rpc_put_sb_net(net); + if (ret) + sunrpc_destroy_cache_detail(cd); + } + return ret; +} + +void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd) +{ + if (cd->u.pipefs.dir) + sunrpc_cache_unregister_pipefs(cd); +} + +void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs_cache_unregister_sb(pipefs_sb, cd); + rpc_put_sb_net(net); + } + sunrpc_destroy_cache_detail(cd); +} diff --git a/kernel/fs/nfs/cache_lib.h b/kernel/fs/nfs/cache_lib.h new file mode 100644 index 000000000..4116d2c3f --- /dev/null +++ b/kernel/fs/nfs/cache_lib.h @@ -0,0 +1,31 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust + */ + +#include +#include +#include + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd); +extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd); +extern int nfs_cache_register_sb(struct super_block *sb, + struct cache_detail *cd); +extern void nfs_cache_unregister_sb(struct super_block *sb, + struct cache_detail *cd); diff --git a/kernel/fs/nfs/callback.c b/kernel/fs/nfs/callback.c new file mode 100644 index 000000000..8d129bb73 --- /dev/null +++ b/kernel/fs/nfs/callback.c @@ -0,0 +1,499 @@ +/* + * linux/fs/nfs/callback.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback handling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "internal.h" +#include "netns.h" + +#define NFSDBG_FACILITY NFSDBG_CALLBACK + +struct nfs_callback_data { + unsigned int users; + struct svc_serv *serv; + struct svc_rqst *rqst; + struct task_struct *task; +}; + +static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; +static DEFINE_MUTEX(nfs_callback_mutex); +static struct svc_program nfs4_callback_program; + +static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) +{ + int ret; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + ret = svc_create_xprt(serv, "tcp", net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret <= 0) + goto out_err; + nn->nfs_callback_tcpport = ret; + dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", + nn->nfs_callback_tcpport, PF_INET, net); + + ret = svc_create_xprt(serv, "tcp", net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nn->nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", + nn->nfs_callback_tcpport6, PF_INET6, net); + } else if (ret != -EAFNOSUPPORT) + goto out_err; + return 0; + +out_err: + return (ret) ? ret : -ENOMEM; +} + +/* + * This is the NFSv4 callback kernel thread. + */ +static int +nfs4_callback_svc(void *vrqstp) +{ + int err; + struct svc_rqst *rqstp = vrqstp; + + set_freezable(); + + while (!kthread_should_stop()) { + /* + * Listen for a request on the socket + */ + err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); + if (err == -EAGAIN || err == -EINTR) + continue; + svc_process(rqstp); + } + return 0; +} + +/* + * Prepare to bring up the NFSv4 callback service + */ +static struct svc_rqst * +nfs4_callback_up(struct svc_serv *serv) +{ + return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); +} + +#if defined(CONFIG_NFS_V4_1) +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) +{ + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. + * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); +} + +/* + * The callback service for NFSv4.1 callbacks + */ +static int +nfs41_callback_svc(void *vrqstp) +{ + struct svc_rqst *rqstp = vrqstp; + struct svc_serv *serv = rqstp->rq_server; + struct rpc_rqst *req; + int error; + DEFINE_WAIT(wq); + + set_freezable(); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + spin_lock_bh(&serv->sv_cb_lock); + if (!list_empty(&serv->sv_cb_list)) { + req = list_first_entry(&serv->sv_cb_list, + struct rpc_rqst, rq_bc_list); + list_del(&req->rq_bc_list); + spin_unlock_bh(&serv->sv_cb_lock); + finish_wait(&serv->sv_cb_waitq, &wq); + dprintk("Invoking bc_svc_process()\n"); + error = bc_svc_process(serv, req, rqstp); + dprintk("bc_svc_process() returned w/ error code= %d\n", + error); + } else { + spin_unlock_bh(&serv->sv_cb_lock); + schedule(); + finish_wait(&serv->sv_cb_waitq, &wq); + } + flush_signals(current); + } + return 0; +} + +/* + * Bring up the NFSv4.1 callback service + */ +static struct svc_rqst * +nfs41_callback_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + + INIT_LIST_HEAD(&serv->sv_cb_list); + spin_lock_init(&serv->sv_cb_lock); + init_waitqueue_head(&serv->sv_cb_waitq); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); + if (IS_ERR(rqstp)) { + svc_xprt_put(serv->sv_bc_xprt); + serv->sv_bc_xprt = NULL; + } + dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); + return rqstp; +} + +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + *rqstpp = nfs41_callback_up(serv); + *callback_svc = nfs41_callback_svc; +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct svc_serv *serv) +{ + if (minorversion) + /* + * Save the svc_serv in the transport so that it can + * be referenced when the session backchannel is initialized + */ + xprt->bc_serv = serv; +} +#else +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) +{ + return 0; +} + +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + *rqstpp = ERR_PTR(-ENOTSUPP); + *callback_svc = ERR_PTR(-ENOTSUPP); +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct svc_serv *serv) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, + struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + int (*callback_svc)(void *vrqstp); + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int ret; + + nfs_callback_bc_serv(minorversion, xprt, serv); + + if (cb_info->task) + return 0; + + switch (minorversion) { + case 0: + /* v4.0 callback setup */ + rqstp = nfs4_callback_up(serv); + callback_svc = nfs4_callback_svc; + break; + default: + nfs_minorversion_callback_svc_setup(serv, + &rqstp, &callback_svc); + } + + if (IS_ERR(rqstp)) + return PTR_ERR(rqstp); + + svc_sock_update_bufs(serv); + + cb_info->serv = serv; + cb_info->rqst = rqstp; + cb_info->task = kthread_create(callback_svc, cb_info->rqst, + "nfsv4.%u-svc", minorversion); + if (IS_ERR(cb_info->task)) { + ret = PTR_ERR(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->rqst = NULL; + cb_info->task = NULL; + return ret; + } + rqstp->rq_task = cb_info->task; + wake_up_process(cb_info->task); + dprintk("nfs_callback_up: service started\n"); + return 0; +} + +static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + if (--nn->cb_users[minorversion]) + return; + + dprintk("NFS: destroy per-net callback data; net=%p\n", net); + svc_shutdown_net(serv, net); +} + +static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + int ret; + + if (nn->cb_users[minorversion]++) + return 0; + + dprintk("NFS: create per-net callback data; net=%p\n", net); + + ret = svc_bind(serv, net); + if (ret < 0) { + printk(KERN_WARNING "NFS: bind callback service failed\n"); + goto err_bind; + } + + switch (minorversion) { + case 0: + ret = nfs4_callback_up_net(serv, net); + break; + case 1: + case 2: + ret = nfs41_callback_up_net(serv, net); + break; + default: + printk(KERN_ERR "NFS: unknown callback version: %d\n", + minorversion); + ret = -EINVAL; + break; + } + + if (ret < 0) { + printk(KERN_ERR "NFS: callback service start failed\n"); + goto err_socks; + } + return 0; + +err_socks: + svc_rpcb_cleanup(serv, net); +err_bind: + dprintk("NFS: Couldn't create callback socket: err = %d; " + "net = %p\n", ret, net); + return ret; +} + +static struct svc_serv *nfs_callback_create_svc(int minorversion) +{ + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + struct svc_serv *serv; + + /* + * Check whether we're already up and running. + */ + if (cb_info->task) { + /* + * Note: increase service usage, because later in case of error + * svc_destroy() will be called. + */ + svc_get(cb_info->serv); + return cb_info->serv; + } + + /* + * Sanity check: if there's no task, + * we should be the first user ... + */ + if (cb_info->users) + printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", + cb_info->users); + + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + if (!serv) { + printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); + return ERR_PTR(-ENOMEM); + } + /* As there is only one thread we need to over-ride the + * default maximum of 80 connections + */ + serv->sv_maxconn = 1024; + dprintk("nfs_callback_create_svc: service created\n"); + return serv; +} + +/* + * Bring up the callback thread if it is not already up. + */ +int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +{ + struct svc_serv *serv; + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int ret; + struct net *net = xprt->xprt_net; + + mutex_lock(&nfs_callback_mutex); + + serv = nfs_callback_create_svc(minorversion); + if (IS_ERR(serv)) { + ret = PTR_ERR(serv); + goto err_create; + } + + ret = nfs_callback_up_net(minorversion, serv, net); + if (ret < 0) + goto err_net; + + ret = nfs_callback_start_svc(minorversion, xprt, serv); + if (ret < 0) + goto err_start; + + cb_info->users++; + /* + * svc_create creates the svc_serv with sv_nrthreads == 1, and then + * svc_prepare_thread increments that. So we need to call svc_destroy + * on both success and failure so that the refcount is 1 when the + * thread exits. + */ +err_net: + svc_destroy(serv); +err_create: + mutex_unlock(&nfs_callback_mutex); + return ret; + +err_start: + nfs_callback_down_net(minorversion, serv, net); + dprintk("NFS: Couldn't create server thread; err = %d\n", ret); + goto err_net; +} + +/* + * Kill the callback thread if it's no longer being used. + */ +void nfs_callback_down(int minorversion, struct net *net) +{ + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + + mutex_lock(&nfs_callback_mutex); + nfs_callback_down_net(minorversion, cb_info->serv, net); + cb_info->users--; + if (cb_info->users == 0 && cb_info->task != NULL) { + kthread_stop(cb_info->task); + dprintk("nfs_callback_down: service stopped\n"); + svc_exit_thread(cb_info->rqst); + dprintk("nfs_callback_down: service destroyed\n"); + cb_info->serv = NULL; + cb_info->rqst = NULL; + cb_info->task = NULL; + } + mutex_unlock(&nfs_callback_mutex); +} + +/* Boolean check of RPC_AUTH_GSS principal */ +int +check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) +{ + char *p = rqstp->rq_cred.cr_principal; + + if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) + return 1; + + /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ + if (clp->cl_minorversion != 0) + return 0; + /* + * It might just be a normal user principal, in which case + * userspace won't bother to tell us the name at all. + */ + if (p == NULL) + return 0; + + /* + * Did we get the acceptor from userland during the SETCLIENID + * negotiation? + */ + if (clp->cl_acceptor) + return !strcmp(p, clp->cl_acceptor); + + /* + * Otherwise try to verify it using the cl_hostname. Note that this + * doesn't work if a non-canonical hostname was used in the devname. + */ + + /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ + + if (memcmp(p, "nfs@", 4) != 0) + return 0; + p += 4; + if (strcmp(p, clp->cl_hostname) != 0) + return 0; + return 1; +} + +/* + * pg_authenticate method for nfsv4 callback threads. + * + * The authflavor has been negotiated, so an incorrect flavor is a server + * bug. Drop packets with incorrect authflavor. + * + * All other checking done after NFS decoding where the nfs_client can be + * found in nfs4_callback_compound + */ +static int nfs_callback_authenticate(struct svc_rqst *rqstp) +{ + switch (rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: + if (rqstp->rq_proc != CB_NULL) + return SVC_DROP; + break; + case RPC_AUTH_GSS: + /* No RPC_AUTH_GSS support yet in NFSv4.1 */ + if (svc_is_backchannel(rqstp)) + return SVC_DROP; + } + return SVC_OK; +} + +/* + * Define NFS4 callback program + */ +static struct svc_version *nfs4_callback_version[] = { + [1] = &nfs4_callback_version1, + [4] = &nfs4_callback_version4, +}; + +static struct svc_stat nfs4_callback_stats; + +static struct svc_program nfs4_callback_program = { + .pg_prog = NFS4_CALLBACK, /* RPC service number */ + .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ + .pg_vers = nfs4_callback_version, /* version table */ + .pg_name = "NFSv4 callback", /* service name */ + .pg_class = "nfs", /* authentication class */ + .pg_stats = &nfs4_callback_stats, + .pg_authenticate = nfs_callback_authenticate, +}; diff --git a/kernel/fs/nfs/callback.h b/kernel/fs/nfs/callback.h new file mode 100644 index 000000000..84326e9fb --- /dev/null +++ b/kernel/fs/nfs/callback.h @@ -0,0 +1,214 @@ +/* + * linux/fs/nfs/callback.h + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback definitions + */ +#ifndef __LINUX_FS_NFS_CALLBACK_H +#define __LINUX_FS_NFS_CALLBACK_H +#include + +#define NFS4_CALLBACK 0x40000000 +#define NFS4_CALLBACK_XDRSIZE 2048 +#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) + +enum nfs4_callback_procnum { + CB_NULL = 0, + CB_COMPOUND = 1, +}; + +enum nfs4_callback_opnum { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, +/* Callback operations new to NFSv4.1 */ + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, +/* Callback operations new to NFSv4.2 */ + OP_CB_OFFLOAD = 15, + OP_CB_ILLEGAL = 10044, +}; + +struct cb_process_state { + __be32 drc_status; + struct nfs_client *clp; + u32 slotid; + u32 minorversion; + struct net *net; +}; + +struct cb_compound_hdr_arg { + unsigned int taglen; + const char *tag; + unsigned int minorversion; + unsigned int cb_ident; /* v4.0 callback identifier */ + unsigned nops; +}; + +struct cb_compound_hdr_res { + __be32 *status; + unsigned int taglen; + const char *tag; + __be32 *nops; +}; + +struct cb_getattrargs { + struct sockaddr *addr; + struct nfs_fh fh; + uint32_t bitmap[2]; +}; + +struct cb_getattrres { + __be32 status; + uint32_t bitmap[2]; + uint64_t size; + uint64_t change_attr; + struct timespec ctime; + struct timespec mtime; +}; + +struct cb_recallargs { + struct sockaddr *addr; + struct nfs_fh fh; + nfs4_stateid stateid; + uint32_t truncate; +}; + +#if defined(CONFIG_NFS_V4_1) + +struct referring_call { + uint32_t rc_sequenceid; + uint32_t rc_slotid; +}; + +struct referring_call_list { + struct nfs4_sessionid rcl_sessionid; + uint32_t rcl_nrefcalls; + struct referring_call *rcl_refcalls; +}; + +struct cb_sequenceargs { + struct sockaddr *csa_addr; + struct nfs4_sessionid csa_sessionid; + uint32_t csa_sequenceid; + uint32_t csa_slotid; + uint32_t csa_highestslotid; + uint32_t csa_cachethis; + uint32_t csa_nrclists; + struct referring_call_list *csa_rclists; +}; + +struct cb_sequenceres { + __be32 csr_status; + struct nfs4_sessionid csr_sessionid; + uint32_t csr_sequenceid; + uint32_t csr_slotid; + uint32_t csr_highestslotid; + uint32_t csr_target_highestslotid; +}; + +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps); + +extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); + +#define RCA4_TYPE_MASK_RDATA_DLG 0 +#define RCA4_TYPE_MASK_WDATA_DLG 1 +#define RCA4_TYPE_MASK_DIR_DLG 2 +#define RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 +#define RCA4_TYPE_MASK_ALL 0xf31f + +struct cb_recallanyargs { + struct sockaddr *craa_addr; + uint32_t craa_objs_to_keep; + uint32_t craa_type_mask; +}; + +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_recallslotargs { + struct sockaddr *crsa_addr; + uint32_t crsa_target_highest_slotid; +}; +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_layoutrecallargs { + struct sockaddr *cbl_addr; + uint32_t cbl_recall_type; + uint32_t cbl_layout_type; + uint32_t cbl_layoutchanged; + union { + struct { + struct nfs_fh cbl_fh; + struct pnfs_layout_range cbl_range; + nfs4_stateid cbl_stateid; + }; + struct nfs_fsid cbl_fsid; + }; +}; + +extern __be32 nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps); + +struct cb_devicenotifyitem { + uint32_t cbd_notify_type; + uint32_t cbd_layout_type; + struct nfs4_deviceid cbd_dev_id; + uint32_t cbd_immediate; +}; + +struct cb_devicenotifyargs { + int ndevs; + struct cb_devicenotifyitem *devs; +}; + +extern __be32 nfs4_callback_devicenotify( + struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps); + +#endif /* CONFIG_NFS_V4_1 */ +extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps); +extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps); +#if IS_ENABLED(CONFIG_NFS_V4) +extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); +extern void nfs_callback_down(int minorversion, struct net *net); +extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); +extern int nfs4_set_callback_sessionid(struct nfs_client *clp); +#endif /* CONFIG_NFS_V4 */ +/* + * nfs41: Callbacks are expected to not cause substantial latency, + * so we limit their concurrency to 1 by setting up the maximum number + * of slots for the backchannel. + */ +#define NFS41_BC_MIN_CALLBACKS 1 +#define NFS41_BC_MAX_CALLBACKS 1 + +extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_tcpport; + +#endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/kernel/fs/nfs/callback_proc.c b/kernel/fs/nfs/callback_proc.c new file mode 100644 index 000000000..197806fb8 --- /dev/null +++ b/kernel/fs/nfs/callback_proc.c @@ -0,0 +1,554 @@ +/* + * linux/fs/nfs/callback_proc.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback procedures + */ +#include +#include +#include +#include +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "internal.h" +#include "pnfs.h" +#include "nfs4session.h" +#include "nfs4trace.h" + +#ifdef NFS_DEBUG +#define NFSDBG_FACILITY NFSDBG_CALLBACK +#endif + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps) +{ + struct nfs_delegation *delegation; + struct nfs_inode *nfsi; + struct inode *inode; + + res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ + goto out; + + res->bitmap[0] = res->bitmap[1] = 0; + res->status = htonl(NFS4ERR_BADHANDLE); + + dprintk_rcu("NFS: GETATTR callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + nfsi = NFS_I(inode); + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) + goto out_iput; + res->size = i_size_read(inode); + res->change_attr = delegation->change_attr; + if (nfsi->nrequests != 0) + res->change_attr++; + res->ctime = inode->i_ctime; + res->mtime = inode->i_mtime; + res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & + args->bitmap[0]; + res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & + args->bitmap[1]; + res->status = 0; +out_iput: + rcu_read_unlock(); + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); + return res->status; +} + +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct inode *inode; + __be32 res; + + res = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ + goto out; + + dprintk_rcu("NFS: RECALL callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + res = htonl(NFS4ERR_BADHANDLE); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + /* Set up a helper thread to actually return the delegation */ + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; + case -ENOENT: + res = htonl(NFS4ERR_BAD_STATEID); + break; + default: + res = htonl(NFS4ERR_RESOURCE); + } + trace_nfs4_recall_delegation(inode, -ntohl(res)); + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); + return res; +} + +#if defined(CONFIG_NFS_V4_1) + +/* + * Lookup a layout by filehandle. + * + * Note: gets a refcount on the layout hdr and on its respective inode. + * Caller must put the layout hdr and the inode. + * + * TODO: keep track of all layouts (and delegations) in a hash table + * hashed by filehandle. + */ +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) +{ + struct nfs_server *server; + struct inode *ino; + struct pnfs_layout_hdr *lo; + + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + continue; + if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + break; + spin_lock(&ino->i_lock); + /* Is this layout in the process of being freed? */ + if (NFS_I(ino)->layout != lo) { + spin_unlock(&ino->i_lock); + iput(ino); + break; + } + pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); + return lo; + } + } + + return NULL; +} + +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&clp->cl_lock); + rcu_read_lock(); + lo = get_layout_by_fh_locked(clp, fh, stateid); + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + return lo; +} + +static u32 initiate_file_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct inode *ino; + struct pnfs_layout_hdr *lo; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + LIST_HEAD(free_me_list); + + lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); + if (!lo) + goto out; + + ino = lo->plh_inode; + + spin_lock(&ino->i_lock); + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + + pnfs_layoutcommit_inode(ino, false); + + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, + &args->cbl_range)) { + rv = NFS4ERR_DELAY; + goto unlock; + } + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, + &args->cbl_range); + } +unlock: + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + pnfs_put_layout_hdr(lo); + iput(ino); +out: + return rv; +} + +static u32 initiate_bulk_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + int stat; + + if (args->cbl_recall_type == RETURN_FSID) + stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); + else + stat = pnfs_destroy_layouts_byclid(clp, true); + if (stat != 0) + return NFS4ERR_DELAY; + return NFS4ERR_NOMATCHING_LAYOUT; +} + +static u32 do_callback_layoutrecall(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + u32 res; + + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); + if (args->cbl_recall_type == RETURN_FILE) + res = initiate_file_draining(clp, args); + else + res = initiate_bulk_draining(clp, args); + dprintk("%s returning %i\n", __func__, res); + return res; + +} + +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps) +{ + u32 res; + + dprintk("%s: -->\n", __func__); + + if (cps->clp) + res = do_callback_layoutrecall(cps->clp, args); + else + res = NFS4ERR_OP_NOT_IN_SESSION; + + dprintk("%s: exit with status = %d\n", __func__, res); + return cpu_to_be32(res); +} + +static void pnfs_recall_all_layouts(struct nfs_client *clp) +{ + struct cb_layoutrecallargs args; + + /* Pretend we got a CB_LAYOUTRECALL(ALL) */ + memset(&args, 0, sizeof(args)); + args.cbl_recall_type = RETURN_ALL; + /* FIXME we ignore errors, what should we do? */ + do_callback_layoutrecall(clp, &args); +} + +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps) +{ + int i; + __be32 res = 0; + struct nfs_client *clp = cps->clp; + struct nfs_server *server = NULL; + + dprintk("%s: -->\n", __func__); + + if (!clp) { + res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + goto out; + } + + for (i = 0; i < args->ndevs; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + if (!server || + server->pnfs_curr_ld->id != dev->cbd_layout_type) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (server->pnfs_curr_ld && + server->pnfs_curr_ld->id == dev->cbd_layout_type) { + rcu_read_unlock(); + goto found; + } + rcu_read_unlock(); + dprintk("%s: layout type %u not found\n", + __func__, dev->cbd_layout_type); + continue; + } + + found: + nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + } + +out: + kfree(args->devs); + dprintk("%s: exit with status = %u\n", + __func__, be32_to_cpu(res)); + return res; +} + +/* + * Validate the sequenceID sent by the server. + * Return success if the sequenceID is one more than what we last saw on + * this slot, accounting for wraparound. Increments the slot's sequence. + * + * We don't yet implement a duplicate request cache, instead we set the + * back channel ca_maxresponsesize_cached to zero. This is OK for now + * since we only currently implement idempotent callbacks anyway. + * + * We have a single slot backchannel at this time, so we don't bother + * checking the used_slots bit array on the table. The lower layer guarantees + * a single outstanding callback request at a time. + */ +static __be32 +validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) +{ + struct nfs4_slot *slot; + + dprintk("%s enter. slotid %u seqid %u\n", + __func__, args->csa_slotid, args->csa_sequenceid); + + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) + return htonl(NFS4ERR_BADSLOT); + + slot = tbl->slots + args->csa_slotid; + dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr); + + /* Normal */ + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { + slot->seq_nr++; + goto out_ok; + } + + /* Replay */ + if (args->csa_sequenceid == slot->seq_nr) { + dprintk("%s seqid %u is a replay\n", + __func__, args->csa_sequenceid); + /* Signal process_op to set this error on next op */ + if (args->csa_cachethis == 0) + return htonl(NFS4ERR_RETRY_UNCACHED_REP); + + /* The ca_maxresponsesize_cached is 0 with no DRC */ + else if (args->csa_cachethis == 1) + return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + } + + /* Wraparound */ + if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { + slot->seq_nr = 1; + goto out_ok; + } + + /* Misordered request */ + return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); +} + +/* + * For each referring call triple, check the session's slot table for + * a match. If the slot is in use and the sequence numbers match, the + * client is still waiting for a response to the original request. + */ +static bool referring_call_exists(struct nfs_client *clp, + uint32_t nrclists, + struct referring_call_list *rclists) +{ + bool status = 0; + int i, j; + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct referring_call_list *rclist; + struct referring_call *ref; + + /* + * XXX When client trunking is implemented, this becomes + * a session lookup from within the loop + */ + session = clp->cl_session; + tbl = &session->fc_slot_table; + + for (i = 0; i < nrclists; i++) { + rclist = &rclists[i]; + if (memcmp(session->sess_id.data, + rclist->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + for (j = 0; j < rclist->rcl_nrefcalls; j++) { + ref = &rclist->rcl_refcalls[j]; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " + "slotid %u\n", __func__, + ((u32 *)&rclist->rcl_sessionid.data)[0], + ((u32 *)&rclist->rcl_sessionid.data)[1], + ((u32 *)&rclist->rcl_sessionid.data)[2], + ((u32 *)&rclist->rcl_sessionid.data)[3], + ref->rc_sequenceid, ref->rc_slotid); + + spin_lock(&tbl->slot_tbl_lock); + status = (test_bit(ref->rc_slotid, tbl->used_slots) && + tbl->slots[ref->rc_slotid].seq_nr == + ref->rc_sequenceid); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + } + } + +out: + return status; +} + +__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *tbl; + struct nfs_client *clp; + int i; + __be32 status = htonl(NFS4ERR_BADSESSION); + + clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, + &args->csa_sessionid, cps->minorversion); + if (clp == NULL) + goto out; + + if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) + goto out; + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* state manager is resetting the session */ + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. + */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); + goto out; + } + + status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + + cps->slotid = args->csa_slotid; + + /* + * Check for pending referring calls. If a match is found, a + * related callback was received before the response to the original + * call. + */ + if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + +out: + cps->clp = clp; /* put in nfs4_callback_compound */ + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + cps->drc_status = status; + status = 0; + } else + res->csr_status = status; + + trace_nfs4_cb_sequence(args, res, status); + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, + ntohl(status), ntohl(res->csr_status)); + return status; +} + +static bool +validate_bitmap_values(unsigned long mask) +{ + return (mask & ~RCA4_TYPE_MASK_ALL) == 0; +} + +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, + struct cb_process_state *cps) +{ + __be32 status; + fmode_t flags = 0; + + status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk_rcu("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + status = cpu_to_be32(NFS4ERR_INVAL); + if (!validate_bitmap_values(args->craa_type_mask)) + goto out; + + status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; + if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) + &args->craa_type_mask)) + pnfs_recall_all_layouts(cps->clp); + if (flags) + nfs_expire_unused_delegation_types(cps->clp, flags); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +/* Reduce the fore channel's max_slots to the target value */ +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *fc_tbl; + __be32 status; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), + args->crsa_target_highest_slotid); + + fc_tbl = &cps->clp->cl_session->fc_slot_table; + + status = htonl(NFS4_OK); + + nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); + nfs41_server_notify_target_slotid_update(cps->clp); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ diff --git a/kernel/fs/nfs/callback_xdr.c b/kernel/fs/nfs/callback_xdr.c new file mode 100644 index 000000000..19ca95cdf --- /dev/null +++ b/kernel/fs/nfs/callback_xdr.c @@ -0,0 +1,1033 @@ +/* + * linux/fs/nfs/callback_xdr.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback encode/decode procedures + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "nfs4_fs.h" +#include "callback.h" +#include "internal.h" +#include "nfs4session.h" + +#define CB_OP_TAGLEN_MAXSZ (512) +#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) +#define CB_OP_GETATTR_BITMAP_MAXSZ (4) +#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + CB_OP_GETATTR_BITMAP_MAXSZ + \ + 2 + 2 + 3 + 3) +#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + +#if defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) +#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#endif /* CONFIG_NFS_V4_1 */ + +#define NFSDBG_FACILITY NFSDBG_CALLBACK + +/* Internal error code */ +#define NFS4ERR_RESOURCE_HDR 11050 + +typedef __be32 (*callback_process_op_t)(void *, void *, + struct cb_process_state *); +typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); +typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); + + +struct callback_op { + callback_process_op_t process_op; + callback_decode_arg_t decode_args; + callback_encode_res_t encode_res; + long res_maxsize; +}; + +static struct callback_op callback_ops[]; + +static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return htonl(NFS4_OK); +} + +static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, nbytes); + if (unlikely(p == NULL)) + printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); + return p; +} + +static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) +{ + __be32 *p; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *len = ntohl(*p); + + if (*len != 0) { + p = read_buf(xdr, *len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *str = (const char *)p; + } else + *str = NULL; + + return 0; +} + +static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + fh->size = ntohl(*p); + if (fh->size > NFS4_FHSIZE) + return htonl(NFS4ERR_BADHANDLE); + p = read_buf(xdr, fh->size); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + memcpy(&fh->data[0], p, fh->size); + memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size); + return 0; +} + +static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + __be32 *p; + unsigned int attrlen; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + attrlen = ntohl(*p); + p = read_buf(xdr, attrlen << 2); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + if (likely(attrlen > 0)) + bitmap[0] = ntohl(*p++); + if (attrlen > 1) + bitmap[1] = ntohl(*p); + return 0; +} + +static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + __be32 *p; + + p = read_buf(xdr, NFS4_STATEID_SIZE); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + memcpy(stateid, p, NFS4_STATEID_SIZE); + return 0; +} + +static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) +{ + __be32 *p; + __be32 status; + + status = decode_string(xdr, &hdr->taglen, &hdr->tag); + if (unlikely(status != 0)) + return status; + /* We do not like overly long tags! */ + if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { + printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n", + __func__, hdr->taglen); + return htonl(NFS4ERR_RESOURCE); + } + p = read_buf(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + hdr->minorversion = ntohl(*p++); + /* Check for minor version support */ + if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) { + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */ + } else { + pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " + "illegal minor version %u!\n", + __func__, hdr->minorversion); + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); + } + hdr->nops = ntohl(*p); + dprintk("%s: minorversion %d nops %d\n", __func__, + hdr->minorversion, hdr->nops); + return 0; +} + +static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) +{ + __be32 *p; + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE_HDR); + *op = ntohl(*p); + return 0; +} + +static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args) +{ + __be32 status; + + status = decode_fh(xdr, &args->fh); + if (unlikely(status != 0)) + goto out; + args->addr = svc_addr(rqstp); + status = decode_bitmap(xdr, args->bitmap); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) +{ + __be32 *p; + __be32 status; + + args->addr = svc_addr(rqstp); + status = decode_stateid(xdr, &args->stateid); + if (unlikely(status != 0)) + goto out; + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_RESOURCE); + goto out; + } + args->truncate = ntohl(*p); + status = decode_fh(xdr, &args->fh); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_layoutrecallargs *args) +{ + __be32 *p; + __be32 status = 0; + uint32_t iomode; + + args->cbl_addr = svc_addr(rqstp); + p = read_buf(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->cbl_layout_type = ntohl(*p++); + /* Depite the spec's xdr, iomode really belongs in the FILE switch, + * as it is unusable and ignored with the other types. + */ + iomode = ntohl(*p++); + args->cbl_layoutchanged = ntohl(*p++); + args->cbl_recall_type = ntohl(*p++); + + if (args->cbl_recall_type == RETURN_FILE) { + args->cbl_range.iomode = iomode; + status = decode_fh(xdr, &args->cbl_fh); + if (unlikely(status != 0)) + goto out; + + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_range.offset); + p = xdr_decode_hyper(p, &args->cbl_range.length); + status = decode_stateid(xdr, &args->cbl_stateid); + if (unlikely(status != 0)) + goto out; + } else if (args->cbl_recall_type == RETURN_FSID) { + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_fsid.major); + p = xdr_decode_hyper(p, &args->cbl_fsid.minor); + } else if (args->cbl_recall_type != RETURN_ALL) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", + __func__, + args->cbl_layout_type, iomode, + args->cbl_layoutchanged, args->cbl_recall_type); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static +__be32 decode_devicenotify_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_devicenotifyargs *args) +{ + __be32 *p; + __be32 status = 0; + u32 tmp; + int n, i; + args->ndevs = 0; + + /* Num of device notifications */ + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + n = ntohl(*p++); + if (n <= 0) + goto out; + if (n > ULONG_MAX / sizeof(*args->devs)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL); + if (!args->devs) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + /* Decode each dev notification */ + for (i = 0; i < n; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + + tmp = ntohl(*p++); /* bitmap size */ + if (tmp != 1) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_notify_type = ntohl(*p++); + if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && + dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + + tmp = ntohl(*p++); /* opaque size */ + if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && + (tmp != NFS4_DEVICEID4_SIZE + 8)) || + ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && + (tmp != NFS4_DEVICEID4_SIZE + 4))) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_layout_type = ntohl(*p++); + memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + dev->cbd_immediate = ntohl(*p++); + } else { + dev->cbd_immediate = 0; + } + + args->ndevs++; + + dprintk("%s: type %d layout 0x%x immediate %d\n", + __func__, dev->cbd_notify_type, dev->cbd_layout_type, + dev->cbd_immediate); + } +out: + dprintk("%s: status %d ndevs %d\n", + __func__, ntohl(status), args->ndevs); + return status; +err: + kfree(args->devs); + goto out; +} + +static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + memcpy(sid->data, p, len); + return 0; +} + +static __be32 decode_rc_list(struct xdr_stream *xdr, + struct referring_call_list *rc_list) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &rc_list->rcl_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + rc_list->rcl_nrefcalls = ntohl(*p++); + if (rc_list->rcl_nrefcalls) { + p = read_buf(xdr, + rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls, + sizeof(*rc_list->rcl_refcalls), + GFP_KERNEL); + if (unlikely(rc_list->rcl_refcalls == NULL)) + goto out; + for (i = 0; i < rc_list->rcl_nrefcalls; i++) { + rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++); + rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++); + } + } + status = 0; + +out: + return status; +} + +static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_sequenceargs *args) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &args->csa_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, 5 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + args->csa_addr = svc_addr(rqstp); + args->csa_sequenceid = ntohl(*p++); + args->csa_slotid = ntohl(*p++); + args->csa_highestslotid = ntohl(*p++); + args->csa_cachethis = ntohl(*p++); + args->csa_nrclists = ntohl(*p++); + args->csa_rclists = NULL; + if (args->csa_nrclists) { + args->csa_rclists = kmalloc_array(args->csa_nrclists, + sizeof(*args->csa_rclists), + GFP_KERNEL); + if (unlikely(args->csa_rclists == NULL)) + goto out; + + for (i = 0; i < args->csa_nrclists; i++) { + status = decode_rc_list(xdr, &args->csa_rclists[i]); + if (status) { + args->csa_nrclists = i; + goto out_free; + } + } + } + status = 0; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u " + "highestslotid %u cachethis %d nrclists %u\n", + __func__, + ((u32 *)&args->csa_sessionid)[0], + ((u32 *)&args->csa_sessionid)[1], + ((u32 *)&args->csa_sessionid)[2], + ((u32 *)&args->csa_sessionid)[3], + args->csa_sequenceid, args->csa_slotid, + args->csa_highestslotid, args->csa_cachethis, + args->csa_nrclists); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; + +out_free: + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + goto out; +} + +static __be32 decode_recallany_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallanyargs *args) +{ + uint32_t bitmap[2]; + __be32 *p, status; + + args->craa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_objs_to_keep = ntohl(*p++); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; + + return 0; +} + +static __be32 decode_recallslot_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallslotargs *args) +{ + __be32 *p; + + args->crsa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->crsa_target_highest_slotid = ntohl(*p++); + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + +static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + xdr_encode_opaque(p, str, len); + return 0; +} + +#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) +#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) +static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep) +{ + __be32 bm[2]; + __be32 *p; + + bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); + bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); + if (bm[1] != 0) { + p = xdr_reserve_space(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(2); + *p++ = bm[0]; + *p++ = bm[1]; + } else if (bm[0] != 0) { + p = xdr_reserve_space(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(1); + *p++ = bm[0]; + } else { + p = xdr_reserve_space(xdr, 8); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(0); + } + *savep = p; + return 0; +} + +static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change) +{ + __be32 *p; + + if (!(bitmap[0] & FATTR4_WORD0_CHANGE)) + return 0; + p = xdr_reserve_space(xdr, 8); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, change); + return 0; +} + +static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size) +{ + __be32 *p; + + if (!(bitmap[0] & FATTR4_WORD0_SIZE)) + return 0; + p = xdr_reserve_space(xdr, 8); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, size); + return 0; +} + +static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 12); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, time->tv_sec); + *p = htonl(time->tv_nsec); + return 0; +} + +static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) +{ + __be32 status; + + hdr->status = xdr_reserve_space(xdr, 4); + if (unlikely(hdr->status == NULL)) + return htonl(NFS4ERR_RESOURCE); + status = encode_string(xdr, hdr->taglen, hdr->tag); + if (unlikely(status != 0)) + return status; + hdr->nops = xdr_reserve_space(xdr, 4); + if (unlikely(hdr->nops == NULL)) + return htonl(NFS4ERR_RESOURCE); + return 0; +} + +static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE_HDR); + *p++ = htonl(op); + *p = res; + return 0; +} + +static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res) +{ + __be32 *savep = NULL; + __be32 status = res->status; + + if (unlikely(status != 0)) + goto out; + status = encode_attr_bitmap(xdr, res->bitmap, &savep); + if (unlikely(status != 0)) + goto out; + status = encode_attr_change(xdr, res->bitmap, res->change_attr); + if (unlikely(status != 0)) + goto out; + status = encode_attr_size(xdr, res->bitmap, res->size); + if (unlikely(status != 0)) + goto out; + status = encode_attr_ctime(xdr, res->bitmap, &res->ctime); + if (unlikely(status != 0)) + goto out; + status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); + *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 encode_sessionid(struct xdr_stream *xdr, + const struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = xdr_reserve_space(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + memcpy(p, sid, len); + return 0; +} + +static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + const struct cb_sequenceres *res) +{ + __be32 *p; + __be32 status = res->csr_status; + + if (unlikely(status != 0)) + goto out; + + encode_sessionid(xdr, &res->csr_sessionid); + + p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + *p++ = htonl(res->csr_sequenceid); + *p++ = htonl(res->csr_slotid); + *p++ = htonl(res->csr_highestslotid); + *p++ = htonl(res->csr_target_highestslotid); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + if (op_nr == OP_CB_SEQUENCE) { + if (nop != 0) + return htonl(NFS4ERR_SEQUENCE_POS); + } else { + if (nop == 0) + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + } + + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: + case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: + case OP_CB_WANTS_CANCELLED: + case OP_CB_NOTIFY_LOCK: + return htonl(NFS4ERR_NOTSUPP); + + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* + * Let the state manager know callback processing done. + * A single slot, so highest used slotid is either 0 or -1 + */ + tbl->highest_used_slotid = NFS4_NO_SLOT; + nfs4_slot_tbl_drain_complete(tbl); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ + if (cps->slotid != NFS4_NO_SLOT) + nfs4_callback_free_slot(cps->clp->cl_session); +} + +#else /* CONFIG_NFS_V4_1 */ + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +#ifdef CONFIG_NFS_V4_2 +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + __be32 status = preprocess_nfs41_op(nop, op_nr, op); + if (status != htonl(NFS4ERR_OP_ILLEGAL)) + return status; + + if (op_nr == OP_CB_OFFLOAD) + return htonl(NFS4ERR_NOTSUPP); + return htonl(NFS4ERR_OP_ILLEGAL); +} +#else /* CONFIG_NFS_V4_2 */ +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} +#endif /* CONFIG_NFS_V4_2 */ + +static __be32 +preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) +{ + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + *op = &callback_ops[op_nr]; + break; + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static __be32 process_op(int nop, struct svc_rqst *rqstp, + struct xdr_stream *xdr_in, void *argp, + struct xdr_stream *xdr_out, void *resp, + struct cb_process_state *cps) +{ + struct callback_op *op = &callback_ops[0]; + unsigned int op_nr; + __be32 status; + long maxlen; + __be32 res; + + dprintk("%s: start\n", __func__); + status = decode_op_hdr(xdr_in, &op_nr); + if (unlikely(status)) + return status; + + dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", + __func__, cps->minorversion, nop, op_nr); + + switch (cps->minorversion) { + case 0: + status = preprocess_nfs4_op(op_nr, &op); + break; + case 1: + status = preprocess_nfs41_op(nop, op_nr, &op); + break; + case 2: + status = preprocess_nfs42_op(nop, op_nr, &op); + break; + default: + status = htonl(NFS4ERR_MINOR_VERS_MISMATCH); + } + + if (status == htonl(NFS4ERR_OP_ILLEGAL)) + op_nr = OP_CB_ILLEGAL; + if (status) + goto encode_hdr; + + if (cps->drc_status) { + status = cps->drc_status; + goto encode_hdr; + } + + maxlen = xdr_out->end - xdr_out->p; + if (maxlen > 0 && maxlen < PAGE_SIZE) { + status = op->decode_args(rqstp, xdr_in, argp); + if (likely(status == 0)) + status = op->process_op(argp, resp, cps); + } else + status = htonl(NFS4ERR_RESOURCE); + +encode_hdr: + res = encode_op_hdr(xdr_out, op_nr, status); + if (unlikely(res)) + return res; + if (op->encode_res != NULL && status == 0) + status = op->encode_res(rqstp, xdr_out, resp); + dprintk("%s: done, status = %d\n", __func__, ntohl(status)); + return status; +} + +/* + * Decode, process and encode a COMPOUND + */ +static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) +{ + struct cb_compound_hdr_arg hdr_arg = { 0 }; + struct cb_compound_hdr_res hdr_res = { NULL }; + struct xdr_stream xdr_in, xdr_out; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, + .clp = NULL, + .slotid = NFS4_NO_SLOT, + .net = SVC_NET(rqstp), + }; + unsigned int nops = 0; + + dprintk("%s: start\n", __func__); + + xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); + + p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); + xdr_init_encode(&xdr_out, &rqstp->rq_res, p); + + status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); + if (status == __constant_htonl(NFS4ERR_RESOURCE)) + return rpc_garbage_args; + + if (hdr_arg.minorversion == 0) { + cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); + if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) + return rpc_drop_reply; + } + + cps.minorversion = hdr_arg.minorversion; + hdr_res.taglen = hdr_arg.taglen; + hdr_res.tag = hdr_arg.tag; + if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) + return rpc_system_err; + + while (status == 0 && nops != hdr_arg.nops) { + status = process_op(nops, rqstp, &xdr_in, + argp, &xdr_out, resp, &cps); + nops++; + } + + /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return + * resource error in cb_compound status without returning op */ + if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) { + status = htonl(NFS4ERR_RESOURCE); + nops--; + } + + *hdr_res.status = status; + *hdr_res.nops = htonl(nops); + nfs4_cb_free_slot(&cps); + nfs_put_client(cps.clp); + dprintk("%s: done, status = %u\n", __func__, ntohl(status)); + return rpc_success; +} + +/* + * Define NFS4 callback COMPOUND ops. + */ +static struct callback_op callback_ops[] = { + [0] = { + .res_maxsize = CB_OP_HDR_RES_MAXSZ, + }, + [OP_CB_GETATTR] = { + .process_op = (callback_process_op_t)nfs4_callback_getattr, + .decode_args = (callback_decode_arg_t)decode_getattr_args, + .encode_res = (callback_encode_res_t)encode_getattr_res, + .res_maxsize = CB_OP_GETATTR_RES_MAXSZ, + }, + [OP_CB_RECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_recall, + .decode_args = (callback_decode_arg_t)decode_recall_args, + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, +#if defined(CONFIG_NFS_V4_1) + [OP_CB_LAYOUTRECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, + .decode_args = + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, + [OP_CB_NOTIFY_DEVICEID] = { + .process_op = (callback_process_op_t)nfs4_callback_devicenotify, + .decode_args = + (callback_decode_arg_t)decode_devicenotify_args, + .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, + }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, + .encode_res = (callback_encode_res_t)encode_cb_sequence_res, + .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, + }, + [OP_CB_RECALL_ANY] = { + .process_op = (callback_process_op_t)nfs4_callback_recallany, + .decode_args = (callback_decode_arg_t)decode_recallany_args, + .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, + }, + [OP_CB_RECALL_SLOT] = { + .process_op = (callback_process_op_t)nfs4_callback_recallslot, + .decode_args = (callback_decode_arg_t)decode_recallslot_args, + .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, + }, +#endif /* CONFIG_NFS_V4_1 */ +}; + +/* + * Define NFS4 callback procedures + */ +static struct svc_procedure nfs4_callback_procedures1[] = { + [CB_NULL] = { + .pc_func = nfs4_callback_null, + .pc_decode = (kxdrproc_t)nfs4_decode_void, + .pc_encode = (kxdrproc_t)nfs4_encode_void, + .pc_xdrressize = 1, + }, + [CB_COMPOUND] = { + .pc_func = nfs4_callback_compound, + .pc_encode = (kxdrproc_t)nfs4_encode_void, + .pc_argsize = 256, + .pc_ressize = 256, + .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, + } +}; + +struct svc_version nfs4_callback_version1 = { + .vs_vers = 1, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, + .vs_hidden = 1, +}; + +struct svc_version nfs4_callback_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, + .vs_hidden = 1, +}; diff --git a/kernel/fs/nfs/client.c b/kernel/fs/nfs/client.c new file mode 100644 index 000000000..892aefff3 --- /dev/null +++ b/kernel/fs/nfs/client.c @@ -0,0 +1,1472 @@ +/* client.c: NFS client sharing and management code + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" +#include "nfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +static DEFINE_SPINLOCK(nfs_version_lock); +static DEFINE_MUTEX(nfs_version_mutex); +static LIST_HEAD(nfs_versions); + +/* + * RPC cruft for NFS + */ +static const struct rpc_version *nfs_version[5] = { + [2] = NULL, + [3] = NULL, + [4] = NULL, +}; + +const struct rpc_program nfs_program = { + .name = "nfs", + .number = NFS_PROGRAM, + .nrvers = ARRAY_SIZE(nfs_version), + .version = nfs_version, + .stats = &nfs_rpcstat, + .pipe_dir_name = NFS_PIPE_DIRNAME, +}; + +struct rpc_stat nfs_rpcstat = { + .program = &nfs_program +}; + +static struct nfs_subversion *find_nfs_version(unsigned int version) +{ + struct nfs_subversion *nfs; + spin_lock(&nfs_version_lock); + + list_for_each_entry(nfs, &nfs_versions, list) { + if (nfs->rpc_ops->version == version) { + spin_unlock(&nfs_version_lock); + return nfs; + } + } + + spin_unlock(&nfs_version_lock); + return ERR_PTR(-EPROTONOSUPPORT); +} + +struct nfs_subversion *get_nfs_version(unsigned int version) +{ + struct nfs_subversion *nfs = find_nfs_version(version); + + if (IS_ERR(nfs)) { + mutex_lock(&nfs_version_mutex); + request_module("nfsv%d", version); + nfs = find_nfs_version(version); + mutex_unlock(&nfs_version_mutex); + } + + if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) + return ERR_PTR(-EAGAIN); + return nfs; +} + +void put_nfs_version(struct nfs_subversion *nfs) +{ + module_put(nfs->owner); +} + +void register_nfs_version(struct nfs_subversion *nfs) +{ + spin_lock(&nfs_version_lock); + + list_add(&nfs->list, &nfs_versions); + nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; + + spin_unlock(&nfs_version_lock); +} +EXPORT_SYMBOL_GPL(register_nfs_version); + +void unregister_nfs_version(struct nfs_subversion *nfs) +{ + spin_lock(&nfs_version_lock); + + nfs_version[nfs->rpc_ops->version] = NULL; + list_del(&nfs->list); + + spin_unlock(&nfs_version_lock); +} +EXPORT_SYMBOL_GPL(unregister_nfs_version); + +/* + * Allocate a shared client record + * + * Since these are allocated/deallocated very rarely, we don't + * bother putting them in a slab cache... + */ +struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) +{ + struct nfs_client *clp; + struct rpc_cred *cred; + int err = -ENOMEM; + + if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) + goto error_0; + + clp->cl_nfs_mod = cl_init->nfs_mod; + if (!try_module_get(clp->cl_nfs_mod->owner)) + goto error_dealloc; + + clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; + + atomic_set(&clp->cl_count, 1); + clp->cl_cons_state = NFS_CS_INITING; + + memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); + clp->cl_addrlen = cl_init->addrlen; + + if (cl_init->hostname) { + err = -ENOMEM; + clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); + if (!clp->cl_hostname) + goto error_cleanup; + } + + INIT_LIST_HEAD(&clp->cl_superblocks); + clp->cl_rpcclient = ERR_PTR(-EINVAL); + + clp->cl_proto = cl_init->proto; + clp->cl_net = get_net(cl_init->net); + + cred = rpc_lookup_machine_cred("*"); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; + nfs_fscache_get_client_cookie(clp); + + return clp; + +error_cleanup: + put_nfs_version(clp->cl_nfs_mod); +error_dealloc: + kfree(clp); +error_0: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(nfs_alloc_client); + +#if IS_ENABLED(CONFIG_NFS_V4) +void nfs_cleanup_cb_ident_idr(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + idr_destroy(&nn->cb_ident_idr); +} + +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + if (clp->cl_cb_ident) + idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); +} + +static void pnfs_init_server(struct nfs_server *server) +{ + rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +} + +#else +void nfs_cleanup_cb_ident_idr(struct net *net) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ +} + +static void pnfs_init_server(struct nfs_server *server) +{ +} + +#endif /* CONFIG_NFS_V4 */ + +/* + * Destroy a shared client record + */ +void nfs_free_client(struct nfs_client *clp) +{ + dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); + + nfs_fscache_release_client_cookie(clp); + + /* -EIO all pending I/O */ + if (!IS_ERR(clp->cl_rpcclient)) + rpc_shutdown_client(clp->cl_rpcclient); + + if (clp->cl_machine_cred != NULL) + put_rpccred(clp->cl_machine_cred); + + put_net(clp->cl_net); + put_nfs_version(clp->cl_nfs_mod); + kfree(clp->cl_hostname); + kfree(clp->cl_acceptor); + kfree(clp); + + dprintk("<-- nfs_free_client()\n"); +} +EXPORT_SYMBOL_GPL(nfs_free_client); + +/* + * Release a reference to a shared client record + */ +void nfs_put_client(struct nfs_client *clp) +{ + struct nfs_net *nn; + + if (!clp) + return; + + dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); + nn = net_generic(clp->cl_net, nfs_net_id); + + if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { + list_del(&clp->cl_share_link); + nfs_cb_idr_remove_locked(clp); + spin_unlock(&nn->nfs_client_lock); + + WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); + + clp->rpc_ops->free_client(clp); + } +} +EXPORT_SYMBOL_GPL(nfs_put_client); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) + return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; + + return 1; +} +#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + return 0; +} +#endif + +/* + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. + */ +static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + return nfs_sockaddr_match_ipaddr6(sa1, sa2) && + (sin1->sin6_port == sin2->sin6_port); +} + +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return nfs_sockaddr_match_ipaddr4(sa1, sa2) && + (sin1->sin_port == sin2->sin_port); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, excluding the port number. + */ +int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_match_ipaddr4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_match_ipaddr6(sa1, sa2); + } + return 0; +} +EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr); +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. + */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6(sa1, sa2); + } + return 0; +} + +/* + * Find an nfs_client on the list that matches the initialisation data + * that is supplied. + */ +static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) +{ + struct nfs_client *clp; + const struct sockaddr *sap = data->addr; + struct nfs_net *nn = net_generic(data->net, nfs_net_id); + + list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + /* Don't match clients that failed to initialise properly */ + if (clp->cl_cons_state < 0) + continue; + + /* Different NFS versions cannot share the same nfs_client */ + if (clp->rpc_ops != data->nfs_mod->rpc_ops) + continue; + + if (clp->cl_proto != data->proto) + continue; + /* Match nfsv4 minorversion */ + if (clp->cl_minorversion != data->minorversion) + continue; + /* Match the full socket address */ + if (!nfs_sockaddr_cmp(sap, clap)) + continue; + + atomic_inc(&clp->cl_count); + return clp; + } + return NULL; +} + +static bool nfs_client_init_is_complete(const struct nfs_client *clp) +{ + return clp->cl_cons_state <= NFS_CS_READY; +} + +int nfs_wait_client_init_complete(const struct nfs_client *clp) +{ + return wait_event_killable(nfs_client_active_wq, + nfs_client_init_is_complete(clp)); +} +EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete); + +/* + * Found an existing client. Make sure it's ready before returning. + */ +static struct nfs_client * +nfs_found_client(const struct nfs_client_initdata *cl_init, + struct nfs_client *clp) +{ + int error; + + error = nfs_wait_client_init_complete(clp); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(-ERESTARTSYS); + } + + if (clp->cl_cons_state < NFS_CS_READY) { + error = clp->cl_cons_state; + nfs_put_client(clp); + return ERR_PTR(error); + } + + smp_rmb(); + + dprintk("<-- %s found nfs_client %p for %s\n", + __func__, clp, cl_init->hostname ?: ""); + return clp; +} + +/* + * Look up a client by IP address and protocol version + * - creates a new record if one doesn't yet exist + */ +struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour) +{ + struct nfs_client *clp, *new = NULL; + struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); + const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; + + if (cl_init->hostname == NULL) { + WARN_ON(1); + return NULL; + } + + dprintk("--> nfs_get_client(%s,v%u)\n", + cl_init->hostname, rpc_ops->version); + + /* see if the client already exists */ + do { + spin_lock(&nn->nfs_client_lock); + + clp = nfs_match_client(cl_init); + if (clp) { + spin_unlock(&nn->nfs_client_lock); + if (new) + new->rpc_ops->free_client(new); + return nfs_found_client(cl_init, clp); + } + if (new) { + list_add_tail(&new->cl_share_link, + &nn->nfs_client_list); + spin_unlock(&nn->nfs_client_lock); + new->cl_flags = cl_init->init_flags; + return rpc_ops->init_client(new, timeparms, ip_addr); + } + + spin_unlock(&nn->nfs_client_lock); + + new = rpc_ops->alloc_client(cl_init); + } while (!IS_ERR(new)); + + dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", + cl_init->hostname, PTR_ERR(new)); + return new; +} +EXPORT_SYMBOL_GPL(nfs_get_client); + +/* + * Mark a server as ready or failed + */ +void nfs_mark_client_ready(struct nfs_client *clp, int state) +{ + smp_wmb(); + clp->cl_cons_state = state; + wake_up_all(&nfs_client_active_wq); +} +EXPORT_SYMBOL_GPL(nfs_mark_client_ready); + +/* + * Initialise the timeout values for a connection + */ +void nfs_init_timeout_values(struct rpc_timeout *to, int proto, + unsigned int timeo, unsigned int retrans) +{ + to->to_initval = timeo * HZ / 10; + to->to_retries = retrans; + + switch (proto) { + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_TCP_RETRANS; + if (to->to_initval == 0) + to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; + if (to->to_initval > NFS_MAX_TCP_TIMEOUT) + to->to_initval = NFS_MAX_TCP_TIMEOUT; + to->to_increment = to->to_initval; + to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); + if (to->to_maxval > NFS_MAX_TCP_TIMEOUT) + to->to_maxval = NFS_MAX_TCP_TIMEOUT; + if (to->to_maxval < to->to_initval) + to->to_maxval = to->to_initval; + to->to_exponential = 0; + break; + case XPRT_TRANSPORT_UDP: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_UDP_RETRANS; + if (!to->to_initval) + to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; + if (to->to_initval > NFS_MAX_UDP_TIMEOUT) + to->to_initval = NFS_MAX_UDP_TIMEOUT; + to->to_maxval = NFS_MAX_UDP_TIMEOUT; + to->to_exponential = 1; + break; + default: + BUG(); + } +} +EXPORT_SYMBOL_GPL(nfs_init_timeout_values); + +/* + * Create an RPC client handle + */ +int nfs_create_rpc_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + rpc_authflavor_t flavor) +{ + struct rpc_clnt *clnt = NULL; + struct rpc_create_args args = { + .net = clp->cl_net, + .protocol = clp->cl_proto, + .address = (struct sockaddr *)&clp->cl_addr, + .addrsize = clp->cl_addrlen, + .timeout = timeparms, + .servername = clp->cl_hostname, + .program = &nfs_program, + .version = clp->rpc_ops->version, + .authflavor = flavor, + }; + + if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; + if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; + + if (!IS_ERR(clp->cl_rpcclient)) + return 0; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("%s: cannot create RPC client. Error = %ld\n", + __func__, PTR_ERR(clnt)); + return PTR_ERR(clnt); + } + + clp->cl_rpcclient = clnt; + return 0; +} +EXPORT_SYMBOL_GPL(nfs_create_rpc_client); + +/* + * Version 2 or 3 client destruction + */ +static void nfs_destroy_server(struct nfs_server *server) +{ + if (server->nlm_host) + nlmclnt_done(server->nlm_host); +} + +/* + * Version 2 or 3 lockd setup + */ +static int nfs_start_lockd(struct nfs_server *server) +{ + struct nlm_host *host; + struct nfs_client *clp = server->nfs_client; + struct nlmclnt_initdata nlm_init = { + .hostname = clp->cl_hostname, + .address = (struct sockaddr *)&clp->cl_addr, + .addrlen = clp->cl_addrlen, + .nfs_version = clp->rpc_ops->version, + .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? + 1 : 0, + .net = clp->cl_net, + }; + + if (nlm_init.nfs_version > 3) + return 0; + if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && + (server->flags & NFS_MOUNT_LOCAL_FCNTL)) + return 0; + + switch (clp->cl_proto) { + default: + nlm_init.protocol = IPPROTO_TCP; + break; + case XPRT_TRANSPORT_UDP: + nlm_init.protocol = IPPROTO_UDP; + } + + host = nlmclnt_init(&nlm_init); + if (IS_ERR(host)) + return PTR_ERR(host); + + server->nlm_host = host; + server->destroy = nfs_destroy_server; + return 0; +} + +/* + * Create a general RPC client + */ +int nfs_init_server_rpcclient(struct nfs_server *server, + const struct rpc_timeout *timeo, + rpc_authflavor_t pseudoflavour) +{ + struct nfs_client *clp = server->nfs_client; + + server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, + pseudoflavour); + if (IS_ERR(server->client)) { + dprintk("%s: couldn't create rpc_client!\n", __func__); + return PTR_ERR(server->client); + } + + memcpy(&server->client->cl_timeout_default, + timeo, + sizeof(server->client->cl_timeout_default)); + server->client->cl_timeout = &server->client->cl_timeout_default; + server->client->cl_softrtry = 0; + if (server->flags & NFS_MOUNT_SOFT) + server->client->cl_softrtry = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); + +/** + * nfs_init_client - Initialise an NFS2 or NFS3 client + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: IP presentation address (not used) + * + * Returns pointer to an NFS client, or an ERR_PTR value. + */ +struct nfs_client *nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr) +{ + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is already initialised */ + dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + return clp; + } + + /* + * Create a client RPC handle for doing FSSTAT with UNIX auth only + * - RFC 2623, sec 2.3.2 + */ + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + if (error < 0) + goto error; + nfs_mark_client_ready(clp, NFS_CS_READY); + return clp; + +error: + nfs_mark_client_ready(clp, error); + nfs_put_client(clp); + dprintk("<-- nfs_init_client() = xerror %d\n", error); + return ERR_PTR(error); +} +EXPORT_SYMBOL_GPL(nfs_init_client); + +/* + * Create a version 2 or 3 client + */ +static int nfs_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data, + struct nfs_subversion *nfs_mod) +{ + struct nfs_client_initdata cl_init = { + .hostname = data->nfs_server.hostname, + .addr = (const struct sockaddr *)&data->nfs_server.address, + .addrlen = data->nfs_server.addrlen, + .nfs_mod = nfs_mod, + .proto = data->nfs_server.protocol, + .net = data->net, + }; + struct rpc_timeout timeparms; + struct nfs_client *clp; + int error; + + dprintk("--> nfs_init_server()\n"); + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + if (data->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); + if (IS_ERR(clp)) { + dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + return PTR_ERR(clp); + } + + server->nfs_client = clp; + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + /* Start lockd here, before we might error out */ + error = nfs_start_lockd(server); + if (error < 0) + goto error; + + server->port = data->nfs_server.port; + server->auth_info = data->auth_info; + + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); + if (error < 0) + goto error; + + /* Preserve the values of mount_server-related mount options */ + if (data->mount_server.addrlen) { + memcpy(&server->mountd_address, &data->mount_server.address, + data->mount_server.addrlen); + server->mountd_addrlen = data->mount_server.addrlen; + } + server->mountd_version = data->mount_server.version; + server->mountd_port = data->mount_server.port; + server->mountd_protocol = data->mount_server.protocol; + + server->namelen = data->namlen; + dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); + return 0; + +error: + server->nfs_client = NULL; + nfs_put_client(clp); + dprintk("<-- nfs_init_server() = xerror %d\n", error); + return error; +} + +/* + * Load up the server record from information gained in an fsinfo record + */ +static void nfs_server_set_fsinfo(struct nfs_server *server, + struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) +{ + unsigned long max_rpc_payload; + + /* Work out a lot of parameters */ + if (server->rsize == 0) + server->rsize = nfs_block_size(fsinfo->rtpref, NULL); + if (server->wsize == 0) + server->wsize = nfs_block_size(fsinfo->wtpref, NULL); + + if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) + server->rsize = nfs_block_size(fsinfo->rtmax, NULL); + if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) + server->wsize = nfs_block_size(fsinfo->wtmax, NULL); + + max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); + if (server->rsize > max_rpc_payload) + server->rsize = max_rpc_payload; + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + server->backing_dev_info.name = "nfs"; + server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES) + server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + if (server->flags & NFS_MOUNT_NOAC) { + server->acregmin = server->acregmax = 0; + server->acdirmin = server->acdirmax = 0; + } + + server->maxfilesize = fsinfo->maxfilesize; + + server->time_delta = fsinfo->time_delta; + + /* We're airborne Set socket buffersize */ + rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); +} + +/* + * Probe filesystem information, including the FSID on v2/v3 + */ +int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +{ + struct nfs_fsinfo fsinfo; + struct nfs_client *clp = server->nfs_client; + int error; + + dprintk("--> nfs_probe_fsinfo()\n"); + + if (clp->rpc_ops->set_capabilities != NULL) { + error = clp->rpc_ops->set_capabilities(server, mntfh); + if (error < 0) + goto out_error; + } + + fsinfo.fattr = fattr; + fsinfo.layouttype = 0; + error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); + if (error < 0) + goto out_error; + + nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { + struct nfs_pathconf pathinfo; + + pathinfo.fattr = fattr; + nfs_fattr_init(fattr); + + if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) + server->namelen = pathinfo.max_namelen; + } + + dprintk("<-- nfs_probe_fsinfo() = 0\n"); + return 0; + +out_error: + dprintk("nfs_probe_fsinfo: error = %d\n", -error); + return error; +} +EXPORT_SYMBOL_GPL(nfs_probe_fsinfo); + +/* + * Copy useful information when duplicating a server record + */ +void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) +{ + target->flags = source->flags; + target->rsize = source->rsize; + target->wsize = source->wsize; + target->acregmin = source->acregmin; + target->acregmax = source->acregmax; + target->acdirmin = source->acdirmin; + target->acdirmax = source->acdirmax; + target->caps = source->caps; + target->options = source->options; + target->auth_info = source->auth_info; +} +EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); + +void nfs_server_insert_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + spin_lock(&nn->nfs_client_lock); + list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); + list_add_tail(&server->master_link, &nn->nfs_volume_list); + clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + spin_unlock(&nn->nfs_client_lock); + +} +EXPORT_SYMBOL_GPL(nfs_server_insert_lists); + +void nfs_server_remove_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_net *nn; + + if (clp == NULL) + return; + nn = net_generic(clp->cl_net, nfs_net_id); + spin_lock(&nn->nfs_client_lock); + list_del_rcu(&server->client_link); + if (list_empty(&clp->cl_superblocks)) + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + list_del(&server->master_link); + spin_unlock(&nn->nfs_client_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nfs_server_remove_lists); + +/* + * Allocate and initialise a server record + */ +struct nfs_server *nfs_alloc_server(void) +{ + struct nfs_server *server; + + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (!server) + return NULL; + + server->client = server->client_acl = ERR_PTR(-EINVAL); + + /* Zero out the NFS state stuff */ + INIT_LIST_HEAD(&server->client_link); + INIT_LIST_HEAD(&server->master_link); + INIT_LIST_HEAD(&server->delegations); + INIT_LIST_HEAD(&server->layouts); + INIT_LIST_HEAD(&server->state_owners_lru); + + atomic_set(&server->active, 0); + + server->io_stats = nfs_alloc_iostats(); + if (!server->io_stats) { + kfree(server); + return NULL; + } + + if (bdi_init(&server->backing_dev_info)) { + nfs_free_iostats(server->io_stats); + kfree(server); + return NULL; + } + + ida_init(&server->openowner_id); + ida_init(&server->lockowner_id); + pnfs_init_server(server); + + return server; +} +EXPORT_SYMBOL_GPL(nfs_alloc_server); + +/* + * Free up a server record + */ +void nfs_free_server(struct nfs_server *server) +{ + dprintk("--> nfs_free_server()\n"); + + nfs_server_remove_lists(server); + + if (server->destroy != NULL) + server->destroy(server); + + if (!IS_ERR(server->client_acl)) + rpc_shutdown_client(server->client_acl); + if (!IS_ERR(server->client)) + rpc_shutdown_client(server->client); + + nfs_put_client(server->nfs_client); + + ida_destroy(&server->lockowner_id); + ida_destroy(&server->openowner_id); + nfs_free_iostats(server->io_stats); + bdi_destroy(&server->backing_dev_info); + kfree(server); + nfs_release_automount_timer(); + dprintk("<-- nfs_free_server()\n"); +} +EXPORT_SYMBOL_GPL(nfs_free_server); + +/* + * Create a version 2 or 3 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server; + struct nfs_fattr *fattr; + int error; + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto error; + + /* Get a client representation */ + error = nfs_init_server(server, mount_info->parsed, nfs_mod); + if (error < 0) + goto error; + + /* Probe the root fh to retrieve its FSID */ + error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr); + if (error < 0) + goto error; + if (server->nfs_client->rpc_ops->version == 3) { + if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) + server->namelen = NFS3_MAXNAMLEN; + if (!(mount_info->parsed->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + } else { + if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) + server->namelen = NFS2_MAXNAMLEN; + } + + if (!(fattr->valid & NFS_ATTR_FATTR)) { + error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL); + if (error < 0) { + dprintk("nfs_create_server: getattr error = %d\n", -error); + goto error; + } + } + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + nfs_free_fattr(fattr); + return server; + +error: + nfs_free_fattr(fattr); + nfs_free_server(server); + return ERR_PTR(error); +} +EXPORT_SYMBOL_GPL(nfs_create_server); + +/* + * Clone an NFS2, NFS3 or NFS4 server record + */ +struct nfs_server *nfs_clone_server(struct nfs_server *source, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t flavor) +{ + struct nfs_server *server; + struct nfs_fattr *fattr_fsinfo; + int error; + + dprintk("--> nfs_clone_server(,%llx:%llx,)\n", + (unsigned long long) fattr->fsid.major, + (unsigned long long) fattr->fsid.minor); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + fattr_fsinfo = nfs_alloc_fattr(); + if (fattr_fsinfo == NULL) + goto out_free_server; + + /* Copy data from the source */ + server->nfs_client = source->nfs_client; + server->destroy = source->destroy; + atomic_inc(&server->nfs_client->cl_count); + nfs_server_copy_userdata(server, source); + + server->fsid = fattr->fsid; + + error = nfs_init_server_rpcclient(server, + source->client->cl_timeout, + flavor); + if (error < 0) + goto out_free_server; + + /* probe the filesystem info for this server filesystem */ + error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); + if (error < 0) + goto out_free_server; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + dprintk("Cloned FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + error = nfs_start_lockd(server); + if (error < 0) + goto out_free_server; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + + nfs_free_fattr(fattr_fsinfo); + dprintk("<-- nfs_clone_server() = %p\n", server); + return server; + +out_free_server: + nfs_free_fattr(fattr_fsinfo); + nfs_free_server(server); + dprintk("<-- nfs_clone_server() = error %d\n", error); + return ERR_PTR(error); +} +EXPORT_SYMBOL_GPL(nfs_clone_server); + +void nfs_clients_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + INIT_LIST_HEAD(&nn->nfs_client_list); + INIT_LIST_HEAD(&nn->nfs_volume_list); +#if IS_ENABLED(CONFIG_NFS_V4) + idr_init(&nn->cb_ident_idr); +#endif + spin_lock_init(&nn->nfs_client_lock); + nn->boot_time = CURRENT_TIME; +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_fs_nfs; + +static int nfs_server_list_open(struct inode *inode, struct file *file); +static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_server_list_stop(struct seq_file *p, void *v); +static int nfs_server_list_show(struct seq_file *m, void *v); + +static const struct seq_operations nfs_server_list_ops = { + .start = nfs_server_list_start, + .next = nfs_server_list_next, + .stop = nfs_server_list_stop, + .show = nfs_server_list_show, +}; + +static const struct file_operations nfs_server_list_fops = { + .open = nfs_server_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, + .owner = THIS_MODULE, +}; + +static int nfs_volume_list_open(struct inode *inode, struct file *file); +static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); +static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_volume_list_stop(struct seq_file *p, void *v); +static int nfs_volume_list_show(struct seq_file *m, void *v); + +static const struct seq_operations nfs_volume_list_ops = { + .start = nfs_volume_list_start, + .next = nfs_volume_list_next, + .stop = nfs_volume_list_stop, + .show = nfs_volume_list_show, +}; + +static const struct file_operations nfs_volume_list_fops = { + .open = nfs_volume_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, + .owner = THIS_MODULE, +}; + +/* + * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which + * we're dealing + */ +static int nfs_server_list_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &nfs_server_list_ops, + sizeof(struct seq_net_private)); +} + +/* + * set up the iterator to start reading from the server list and return the first item + */ +static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) + __acquires(&nn->nfs_client_lock) +{ + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); + + /* lock the list against modification */ + spin_lock(&nn->nfs_client_lock); + return seq_list_start_head(&nn->nfs_client_list, *_pos); +} + +/* + * move to next server + */ +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); + + return seq_list_next(v, &nn->nfs_client_list, pos); +} + +/* + * clean up after reading from the transports list + */ +static void nfs_server_list_stop(struct seq_file *p, void *v) + __releases(&nn->nfs_client_lock) +{ + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); + + spin_unlock(&nn->nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_server_list_show(struct seq_file *m, void *v) +{ + struct nfs_client *clp; + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); + + /* display header on line 1 */ + if (v == &nn->nfs_client_list) { + seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); + return 0; + } + + /* display one transport per line on subsequent lines */ + clp = list_entry(v, struct nfs_client, cl_share_link); + + /* Check if the client is initialized */ + if (clp->cl_cons_state != NFS_CS_READY) + return 0; + + rcu_read_lock(); + seq_printf(m, "v%u %s %s %3d %s\n", + clp->rpc_ops->version, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), + atomic_read(&clp->cl_count), + clp->cl_hostname); + rcu_read_unlock(); + + return 0; +} + +/* + * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes + */ +static int nfs_volume_list_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &nfs_volume_list_ops, + sizeof(struct seq_net_private)); +} + +/* + * set up the iterator to start reading from the volume list and return the first item + */ +static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) + __acquires(&nn->nfs_client_lock) +{ + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); + + /* lock the list against modification */ + spin_lock(&nn->nfs_client_lock); + return seq_list_start_head(&nn->nfs_volume_list, *_pos); +} + +/* + * move to next volume + */ +static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); + + return seq_list_next(v, &nn->nfs_volume_list, pos); +} + +/* + * clean up after reading from the transports list + */ +static void nfs_volume_list_stop(struct seq_file *p, void *v) + __releases(&nn->nfs_client_lock) +{ + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); + + spin_unlock(&nn->nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_volume_list_show(struct seq_file *m, void *v) +{ + struct nfs_server *server; + struct nfs_client *clp; + char dev[8], fsid[17]; + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); + + /* display header on line 1 */ + if (v == &nn->nfs_volume_list) { + seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); + return 0; + } + /* display one transport per line on subsequent lines */ + server = list_entry(v, struct nfs_server, master_link); + clp = server->nfs_client; + + snprintf(dev, 8, "%u:%u", + MAJOR(server->s_dev), MINOR(server->s_dev)); + + snprintf(fsid, 17, "%llx:%llx", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + rcu_read_lock(); + seq_printf(m, "v%u %s %s %-7s %-17s %s\n", + clp->rpc_ops->version, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), + dev, + fsid, + nfs_server_fscache_state(server)); + rcu_read_unlock(); + + return 0; +} + +int nfs_fs_proc_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct proc_dir_entry *p; + + nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); + if (!nn->proc_nfsfs) + goto error_0; + + /* a file of servers with which we're dealing */ + p = proc_create("servers", S_IFREG|S_IRUGO, + nn->proc_nfsfs, &nfs_server_list_fops); + if (!p) + goto error_1; + + /* a file of volumes that we have mounted */ + p = proc_create("volumes", S_IFREG|S_IRUGO, + nn->proc_nfsfs, &nfs_volume_list_fops); + if (!p) + goto error_1; + return 0; + +error_1: + remove_proc_subtree("nfsfs", net->proc_net); +error_0: + return -ENOMEM; +} + +void nfs_fs_proc_net_exit(struct net *net) +{ + remove_proc_subtree("nfsfs", net->proc_net); +} + +/* + * initialise the /proc/fs/nfsfs/ directory + */ +int __init nfs_fs_proc_init(void) +{ + struct proc_dir_entry *p; + + proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); + if (!proc_fs_nfs) + goto error_0; + + /* a file of servers with which we're dealing */ + p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers"); + if (!p) + goto error_1; + + /* a file of volumes that we have mounted */ + p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes"); + if (!p) + goto error_2; + return 0; + +error_2: + remove_proc_entry("servers", proc_fs_nfs); +error_1: + remove_proc_entry("fs/nfsfs", NULL); +error_0: + return -ENOMEM; +} + +/* + * clean up the /proc/fs/nfsfs/ directory + */ +void nfs_fs_proc_exit(void) +{ + remove_proc_entry("volumes", proc_fs_nfs); + remove_proc_entry("servers", proc_fs_nfs); + remove_proc_entry("fs/nfsfs", NULL); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/fs/nfs/delegation.c b/kernel/fs/nfs/delegation.c new file mode 100644 index 000000000..029d688a9 --- /dev/null +++ b/kernel/fs/nfs/delegation.c @@ -0,0 +1,902 @@ +/* + * linux/fs/nfs/delegation.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFS file delegation management + * + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" +#include "nfs4trace.h" + +static void nfs_free_delegation(struct nfs_delegation *delegation) +{ + if (delegation->cred) { + put_rpccred(delegation->cred); + delegation->cred = NULL; + } + kfree_rcu(delegation, rcu); +} + +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation: delegation to process + * + */ +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); +} + +static int +nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) +{ + struct nfs_delegation *delegation; + int ret = 0; + + flags &= FMODE_READ|FMODE_WRITE; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (mark) + nfs_mark_delegation_referenced(delegation); + ret = 1; + } + rcu_read_unlock(); + return ret; +} +/** + * nfs_have_delegation - check if inode has a delegation, mark it + * NFS_DELEGATION_REFERENCED if there is one. + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. + */ +int nfs4_have_delegation(struct inode *inode, fmode_t flags) +{ + return nfs4_do_check_delegation(inode, flags, true); +} + +/* + * nfs4_check_delegation - check if inode has a delegation, do not mark + * NFS_DELEGATION_REFERENCED if it has one. + */ +int nfs4_check_delegation(struct inode *inode, fmode_t flags) +{ + return nfs4_do_check_delegation(inode, flags, false); +} + +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +{ + struct inode *inode = state->inode; + struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; + struct list_head *list; + int status = 0; + + if (flctx == NULL) + goto out; + + list = &flctx->flc_posix; + spin_lock(&flctx->flc_lock); +restart: + list_for_each_entry(fl, list, fl_list) { + if (nfs_file_open_context(fl->fl_file) != ctx) + continue; + spin_unlock(&flctx->flc_lock); + status = nfs4_lock_delegation_recall(fl, state, stateid); + if (status < 0) + goto out; + spin_lock(&flctx->flc_lock); + } + if (list == &flctx->flc_posix) { + list = &flctx->flc_flock; + goto restart; + } + spin_unlock(&flctx->flc_lock); +out: + return status; +} + +static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; + struct nfs4_state *state; + unsigned int seq; + int err; + +again: + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (!nfs4_valid_open_stateid(state)) + continue; + if (!nfs4_stateid_match(&state->stateid, stateid)) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); + sp = state->owner; + /* Block nfs4_proc_unlck */ + mutex_lock(&sp->so_delegreturn_mutex); + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + err = nfs4_open_delegation_recall(ctx, state, stateid); + if (!err) + err = nfs_delegation_claim_locks(ctx, state, stateid); + if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + err = -EAGAIN; + mutex_unlock(&sp->so_delegreturn_mutex); + put_nfs_open_context(ctx); + if (err != 0) + return err; + goto again; + } + spin_unlock(&inode->i_lock); + return 0; +} + +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + * + */ +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + struct nfs_openres *res) +{ + struct nfs_delegation *delegation; + struct rpc_cred *oldcred = NULL; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { + nfs4_stateid_copy(&delegation->stateid, &res->delegation); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; + delegation->cred = get_rpccred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags); + spin_unlock(&delegation->lock); + rcu_read_unlock(); + put_rpccred(oldcred); + trace_nfs4_reclaim_delegation(inode, res->delegation_type); + } else { + /* We appear to have raced with a delegation return. */ + spin_unlock(&delegation->lock); + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); + } + } else { + rcu_read_unlock(); + } +} + +static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +{ + int res = 0; + + if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + res = nfs4_proc_delegreturn(inode, + delegation->cred, + &delegation->stateid, + issync); + nfs_free_delegation(delegation); + return res; +} + +static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation) +{ + struct inode *inode = NULL; + + spin_lock(&delegation->lock); + if (delegation->inode != NULL) + inode = igrab(delegation->inode); + spin_unlock(&delegation->lock); + return inode; +} + +static struct nfs_delegation * +nfs_start_delegation_return_locked(struct nfs_inode *nfsi) +{ + struct nfs_delegation *ret = NULL; + struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); + + if (delegation == NULL) + goto out; + spin_lock(&delegation->lock); + if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + ret = delegation; + spin_unlock(&delegation->lock); +out: + return ret; +} + +static struct nfs_delegation * +nfs_start_delegation_return(struct nfs_inode *nfsi) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs_start_delegation_return_locked(nfsi); + rcu_read_unlock(); + return delegation; +} + +static void +nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp) +{ + + spin_lock(&delegation->lock); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + +static struct nfs_delegation * +nfs_detach_delegation_locked(struct nfs_inode *nfsi, + struct nfs_delegation *delegation, + struct nfs_client *clp) +{ + struct nfs_delegation *deleg_cur = + rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); + + if (deleg_cur == NULL || delegation != deleg_cur) + return NULL; + + spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + list_del_rcu(&delegation->super_list); + delegation->inode = NULL; + rcu_assign_pointer(nfsi->delegation, NULL); + spin_unlock(&delegation->lock); + return delegation; +} + +static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, + struct nfs_delegation *delegation, + struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(nfsi, delegation, clp); + spin_unlock(&clp->cl_lock); + return delegation; +} + +static struct nfs_delegation * +nfs_inode_detach_delegation(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_delegation *delegation; + + delegation = nfs_start_delegation_return(nfsi); + if (delegation == NULL) + return NULL; + return nfs_detach_delegation(nfsi, delegation, server); +} + +static void +nfs_update_inplace_delegation(struct nfs_delegation *delegation, + const struct nfs_delegation *update) +{ + if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) { + delegation->stateid.seqid = update->stateid.seqid; + smp_wmb(); + delegation->type = update->type; + } +} + +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new delegation state from server + * + * Returns zero on success, or a negative errno value. + */ +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation, *old_delegation; + struct nfs_delegation *freeme = NULL; + int status = 0; + + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; + nfs4_stateid_copy(&delegation->stateid, &res->delegation); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = inode->i_version; + delegation->cred = get_rpccred(cred); + delegation->inode = inode; + delegation->flags = 1<lock); + + spin_lock(&clp->cl_lock); + old_delegation = rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); + if (old_delegation != NULL) { + /* Is this an update of the existing delegation? */ + if (nfs4_stateid_match_other(&old_delegation->stateid, + &delegation->stateid)) { + nfs_update_inplace_delegation(old_delegation, + delegation); + goto out; + } + /* + * Deal with broken servers that hand out two + * delegations for the same file. + * Allow for upgrades to a WRITE delegation, but + * nothing else. + */ + dfprintk(FILE, "%s: server %s handed out " + "a duplicate delegation!\n", + __func__, clp->cl_hostname); + if (delegation->type == old_delegation->type || + !(delegation->type & FMODE_WRITE)) { + freeme = delegation; + delegation = NULL; + goto out; + } + if (test_and_set_bit(NFS_DELEGATION_RETURNING, + &old_delegation->flags)) + goto out; + freeme = nfs_detach_delegation_locked(nfsi, + old_delegation, clp); + if (freeme == NULL) + goto out; + } + list_add_tail_rcu(&delegation->super_list, &server->delegations); + rcu_assign_pointer(nfsi->delegation, delegation); + delegation = NULL; + + /* Ensure we revalidate the attributes and page cache! */ + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_REVAL_FORCED; + spin_unlock(&inode->i_lock); + trace_nfs4_set_delegation(inode, res->delegation_type); + +out: + spin_unlock(&clp->cl_lock); + if (delegation != NULL) + nfs_free_delegation(delegation); + if (freeme != NULL) + nfs_do_return_delegation(inode, freeme, 0); + return status; +} + +/* + * Basic procedure for returning a delegation to the server + */ +static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + int err = 0; + + if (delegation == NULL) + return 0; + do { + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + break; + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + if (!issync || err != -EAGAIN) + break; + /* + * Guard against state recovery + */ + err = nfs4_wait_clnt_recover(clp); + } while (err == 0); + + if (err) { + nfs_abort_delegation_return(delegation, clp); + goto out; + } + if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) + goto out; + + err = nfs_do_return_delegation(inode, delegation, issync); +out: + return err; +} + +static bool nfs_delegation_need_return(struct nfs_delegation *delegation) +{ + bool ret = false; + + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + goto out; + if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) + ret = true; + if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { + struct inode *inode; + + spin_lock(&delegation->lock); + inode = delegation->inode; + if (inode && list_empty(&NFS_I(inode)->open_files)) + ret = true; + spin_unlock(&delegation->lock); + } +out: + return ret; +} + +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Note that this function is designed to be called by the state + * manager thread. For this reason, it cannot flush the dirty data, + * since that could deadlock in case of a state recovery error. + * + * Returns zero on success, or a negative errno value. + */ +int nfs_client_return_marked_delegations(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + int err = 0; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (!nfs_delegation_need_return(delegation)) + continue; + if (!nfs_sb_active(server->super)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); + + err = nfs_end_delegation_return(inode, delegation, 0); + iput(inode); + nfs_sb_deactive(server->super); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; + } + } + rcu_read_unlock(); + return 0; +} + +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode(). + */ +void nfs_inode_return_delegation_noreclaim(struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = nfs_inode_detach_delegation(inode); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 1); +} + +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * This routine will always flush any dirty data to disk on the + * assumption that if we need to return the delegation, then + * we should stop caching. + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_inode_return_delegation(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + int err = 0; + + nfs_wb_all(inode); + delegation = nfs_start_delegation_return(nfsi); + if (delegation != NULL) + err = nfs_end_delegation_return(inode, delegation, 1); + return err; +} + +static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + +static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + bool ret = false; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + nfs_mark_return_delegation(server, delegation); + ret = true; + } + return ret; +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_server_mark_return_all_delegations(server); + rcu_read_unlock(); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegations(clp); + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * + */ +void nfs_server_return_all_delegations(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + bool need_wait; + + if (clp == NULL) + return; + + rcu_read_lock(); + need_wait = nfs_server_mark_return_all_delegations(server); + rcu_read_unlock(); + + if (need_wait) { + nfs4_schedule_state_manager(clp); + nfs4_wait_clnt_recover(clp); + } +} + +static void nfs_mark_return_unused_delegation_types(struct nfs_server *server, + fmode_t flags) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) + continue; + if (delegation->type & flags) + nfs_mark_return_if_closed_delegation(server, delegation); + } +} + +static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp, + fmode_t flags) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unused_delegation_types(server, flags); + rcu_read_unlock(); +} + +static void nfs_revoke_delegation(struct inode *inode) +{ + struct nfs_delegation *delegation; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) { + set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); + nfs_mark_return_delegation(NFS_SERVER(inode), delegation); + } + rcu_read_unlock(); +} + +void nfs_remove_bad_delegation(struct inode *inode) +{ + struct nfs_delegation *delegation; + + nfs_revoke_delegation(inode); + delegation = nfs_inode_detach_delegation(inode); + if (delegation) { + nfs_inode_find_state_and_recover(inode, &delegation->stateid); + nfs_free_delegation(delegation); + } +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + +/** + * nfs_expire_unused_delegation_types + * @clp: client to process + * @flags: delegation types to expire + * + */ +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags) +{ + nfs_client_mark_return_unused_delegation_types(clp, flags); + nfs_delegation_run_state_manager(clp); +} + +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) + continue; + nfs_mark_return_if_closed_delegation(server, delegation); + } +} + +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */ +void nfs_expire_unreferenced_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unreferenced_delegations(server); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information + * + * Returns zero on success, or a negative errno value. + */ +int nfs_async_inode_return_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + filemap_flush(inode->i_mapping); + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL) + goto out_enoent; + + if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) + goto out_enoent; + nfs_mark_return_delegation(server, delegation); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); + return 0; +out_enoent: + rcu_read_unlock(); + return -ENOENT; +} + +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, + const struct nfs_fh *fhandle) +{ + struct nfs_delegation *delegation; + struct inode *res = NULL; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL && + nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { + res = igrab(delegation->inode); + } + spin_unlock(&delegation->lock); + if (res != NULL) + break; + } + return res; +} + +/** + * nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. + */ +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, + const struct nfs_fh *fhandle) +{ + struct nfs_server *server; + struct inode *res = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + res = nfs_delegation_find_inode_server(server, fhandle); + if (res != NULL) + break; + } + rcu_read_unlock(); + return res; +} + +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + * + */ +void nfs_delegation_mark_reclaim(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_reclaim_server(server); + rcu_read_unlock(); +} + +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * + */ +void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + if (!nfs_sb_active(server->super)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); + if (delegation != NULL) { + delegation = nfs_detach_delegation(NFS_I(inode), + delegation, server); + if (delegation != NULL) + nfs_free_delegation(delegation); + } + iput(inode); + nfs_sb_deactive(server->super); + goto restart; + } + } + rcu_read_unlock(); +} + +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ + struct nfs_server *server; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (!list_empty(&server->delegations)) { + ret = 1; + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * @flags: delegation type requirement + * + * Returns "true" and fills in "dst->data" * if inode had a delegation, + * otherwise "false" is returned. + */ +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, + fmode_t flags) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + bool ret; + + flags &= FMODE_READ|FMODE_WRITE; + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + ret = (delegation != NULL && (delegation->type & flags) == flags); + if (ret) { + nfs4_stateid_copy(dst, &delegation->stateid); + nfs_mark_delegation_referenced(delegation); + } + rcu_read_unlock(); + return ret; +} diff --git a/kernel/fs/nfs/delegation.h b/kernel/fs/nfs/delegation.h new file mode 100644 index 000000000..e3c20a3cc --- /dev/null +++ b/kernel/fs/nfs/delegation.h @@ -0,0 +1,73 @@ +/* + * linux/fs/nfs/delegation.h + * + * Copyright (c) Trond Myklebust + * + * Definitions pertaining to NFS delegated files + */ +#ifndef FS_NFS_DELEGATION_H +#define FS_NFS_DELEGATION_H + +#if IS_ENABLED(CONFIG_NFS_V4) +/* + * NFSv4 delegation + */ +struct nfs_delegation { + struct list_head super_list; + struct rpc_cred *cred; + struct inode *inode; + nfs4_stateid stateid; + fmode_t type; + loff_t maxsize; + __u64 change_attr; + unsigned long flags; + spinlock_t lock; + struct rcu_head rcu; +}; + +enum { + NFS_DELEGATION_NEED_RECLAIM = 0, + NFS_DELEGATION_RETURN, + NFS_DELEGATION_RETURN_IF_CLOSED, + NFS_DELEGATION_REFERENCED, + NFS_DELEGATION_RETURNING, + NFS_DELEGATION_REVOKED, +}; + +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +int nfs4_inode_return_delegation(struct inode *inode); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); +void nfs_inode_return_delegation_noreclaim(struct inode *inode); + +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +void nfs_server_return_all_delegations(struct nfs_server *); +void nfs_expire_all_delegations(struct nfs_client *clp); +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); +void nfs_expire_unreferenced_delegations(struct nfs_client *clp); +int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode); + +void nfs_delegation_mark_reclaim(struct nfs_client *clp); +void nfs_delegation_reap_unclaimed(struct nfs_client *clp); + +/* NFSv4 delegation-related procedures */ +int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); + +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); +int nfs4_have_delegation(struct inode *inode, fmode_t flags); +int nfs4_check_delegation(struct inode *inode, fmode_t flags); + +#endif + +static inline int nfs_have_delegated_attributes(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) && + !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); +} + +#endif diff --git a/kernel/fs/nfs/dir.c b/kernel/fs/nfs/dir.c new file mode 100644 index 000000000..b2c8b31b2 --- /dev/null +++ b/kernel/fs/nfs/dir.c @@ -0,0 +1,2520 @@ +/* + * linux/fs/nfs/dir.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs directory handling functions + * + * 10 Apr 1996 Added silly rename for unlink --okir + * 28 Sep 1996 Improved directory cache --okir + * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de + * Re-implemented silly rename for unlink, newly implemented + * silly rename for nfs_rename() following the suggestions + * of Olaf Kirch (okir) found in this file. + * Following Linus comments on my original hack, this version + * depends only on the dcache stuff and doesn't touch the inode + * layer (iput() and friends). + * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" + +#include "nfstrace.h" + +/* #define NFS_DEBUG_VERBOSE 1 */ + +static int nfs_opendir(struct inode *, struct file *); +static int nfs_closedir(struct inode *, struct file *); +static int nfs_readdir(struct file *, struct dir_context *); +static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); +static loff_t nfs_llseek_dir(struct file *, loff_t, int); +static void nfs_readdir_clear_array(struct page*); + +const struct file_operations nfs_dir_operations = { + .llseek = nfs_llseek_dir, + .read = generic_read_dir, + .iterate = nfs_readdir, + .open = nfs_opendir, + .release = nfs_closedir, + .fsync = nfs_fsync_dir, +}; + +const struct address_space_operations nfs_dir_aops = { + .freepage = nfs_readdir_clear_array, +}; + +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) +{ + struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + ctx->dir_cookie = 0; + ctx->dup_cookie = 0; + ctx->cred = get_rpccred(cred); + spin_lock(&dir->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&dir->i_lock); + return ctx; + } + return ERR_PTR(-ENOMEM); +} + +static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) +{ + spin_lock(&dir->i_lock); + list_del(&ctx->list); + spin_unlock(&dir->i_lock); + put_rpccred(ctx->cred); + kfree(ctx); +} + +/* + * Open file + */ +static int +nfs_opendir(struct inode *inode, struct file *filp) +{ + int res = 0; + struct nfs_open_dir_context *ctx; + struct rpc_cred *cred; + + dfprintk(FILE, "NFS: open dir(%pD2)\n", filp); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_dir_context(inode, cred); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + filp->private_data = ctx; + if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { + /* This is a mountpoint, so d_revalidate will never + * have been called, so we need to refresh the + * inode (for close-open consistency) ourselves. + */ + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + } +out: + put_rpccred(cred); + return res; +} + +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ + put_nfs_open_dir_context(file_inode(filp), filp->private_data); + return 0; +} + +struct nfs_cache_array_entry { + u64 cookie; + u64 ino; + struct qstr string; + unsigned char d_type; +}; + +struct nfs_cache_array { + int size; + int eof_index; + u64 last_cookie; + struct nfs_cache_array_entry array[0]; +}; + +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); +typedef struct { + struct file *file; + struct page *page; + struct dir_context *ctx; + unsigned long page_index; + u64 *dir_cookie; + u64 last_cookie; + loff_t current_index; + decode_dirent_t decode; + + unsigned long timestamp; + unsigned long gencount; + unsigned int cache_entry_index; + unsigned int plus:1; + unsigned int eof:1; +} nfs_readdir_descriptor_t; + +/* + * The caller is responsible for calling nfs_readdir_release_array(page) + */ +static +struct nfs_cache_array *nfs_readdir_get_array(struct page *page) +{ + void *ptr; + if (page == NULL) + return ERR_PTR(-EIO); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; +} + +static +void nfs_readdir_release_array(struct page *page) +{ + kunmap(page); +} + +/* + * we are freeing strings created by nfs_add_to_readdir_array() + */ +static +void nfs_readdir_clear_array(struct page *page) +{ + struct nfs_cache_array *array; + int i; + + array = kmap_atomic(page); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); + kunmap_atomic(array); +} + +/* + * the caller is responsible for freeing qstr.name + * when called by nfs_readdir_add_to_array, the strings will be freed in + * nfs_clear_readdir_array() + */ +static +int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +{ + string->len = len; + string->name = kmemdup(name, len, GFP_KERNEL); + if (string->name == NULL) + return -ENOMEM; + /* + * Avoid a kmemleak false positive. The pointer to the name is stored + * in a page cache page which kmemleak does not scan. + */ + kmemleak_not_leak(string->name); + string->hash = full_name_hash(name, len); + return 0; +} + +static +int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array_entry *cache_entry; + int ret; + + if (IS_ERR(array)) + return PTR_ERR(array); + + cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ + ret = -ENOSPC; + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + goto out; + + cache_entry->cookie = entry->prev_cookie; + cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; + ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); + if (ret) + goto out; + array->last_cookie = entry->cookie; + array->size++; + if (entry->eof != 0) + array->eof_index = array->size; +out: + nfs_readdir_release_array(page); + return ret; +} + +static +int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + loff_t diff = desc->ctx->pos - desc->current_index; + unsigned int index; + + if (diff < 0) + goto out_eof; + if (diff >= array->size) { + if (array->eof_index >= 0) + goto out_eof; + return -EAGAIN; + } + + index = (unsigned int)diff; + *desc->dir_cookie = array->array[index].cookie; + desc->cache_entry_index = index; + return 0; +out_eof: + desc->eof = 1; + return -EBADCOOKIE; +} + +static bool +nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) +{ + if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + return false; + smp_rmb(); + return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); +} + +static +int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + int i; + loff_t new_pos; + int status = -EAGAIN; + + for (i = 0; i < array->size; i++) { + if (array->array[i].cookie == *desc->dir_cookie) { + struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); + struct nfs_open_dir_context *ctx = desc->file->private_data; + + new_pos = desc->current_index + i; + if (ctx->attr_gencount != nfsi->attr_gencount || + !nfs_readdir_inode_mapping_valid(nfsi)) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + } else if (new_pos < desc->ctx->pos) { + if (ctx->duped > 0 + && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %pD2 contains a readdir loop." + "Please contact your server vendor. " + "The file: %.*s has duplicate cookie %llu\n", + desc->file, array->array[i].string.len, + array->array[i].string.name, *desc->dir_cookie); + } + status = -ELOOP; + goto out; + } + ctx->dup_cookie = *desc->dir_cookie; + ctx->duped = -1; + } + desc->ctx->pos = new_pos; + desc->cache_entry_index = i; + return 0; + } + } + if (array->eof_index >= 0) { + status = -EBADCOOKIE; + if (*desc->dir_cookie == array->last_cookie) + desc->eof = 1; + } +out: + return status; +} + +static +int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int status; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + + if (*desc->dir_cookie == 0) + status = nfs_readdir_search_for_pos(array, desc); + else + status = nfs_readdir_search_for_cookie(array, desc); + + if (status == -EAGAIN) { + desc->last_cookie = array->last_cookie; + desc->current_index += array->size; + desc->page_index++; + } + nfs_readdir_release_array(desc->page); +out: + return status; +} + +/* Fill a page with xdr information before transferring to the cache page */ +static +int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct file *file, struct inode *inode) +{ + struct nfs_open_dir_context *ctx = file->private_data; + struct rpc_cred *cred = ctx->cred; + unsigned long timestamp, gencount; + int error; + + again: + timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, + NFS_SERVER(inode)->dtsize, desc->plus); + if (error < 0) { + /* We requested READDIRPLUS, but the server doesn't grok it */ + if (error == -ENOTSUPP && desc->plus) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + desc->plus = 0; + goto again; + } + goto error; + } + desc->timestamp = timestamp; + desc->gencount = gencount; +error: + return error; +} + +static int xdr_decode(nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct xdr_stream *xdr) +{ + int error; + + error = desc->decode(xdr, entry, desc->plus); + if (error) + return error; + entry->fattr->time_start = desc->timestamp; + entry->fattr->gencount = desc->gencount; + return 0; +} + +/* Match file and dirent using either filehandle or fileid + * Note: caller is responsible for checking the fsid + */ +static +int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) +{ + struct nfs_inode *nfsi; + + if (d_really_is_negative(dentry)) + return 0; + + nfsi = NFS_I(d_inode(dentry)); + if (entry->fattr->fileid == nfsi->fileid) + return 1; + if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) + return 1; + return 0; +} + +static +bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ + if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) + return false; + if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) + return true; + if (ctx->pos == 0) + return true; + return false; +} + +/* + * This function is called by the lookup code to request the use of + * readdirplus to accelerate any future lookups in the same + * directory. + */ +static +void nfs_advise_use_readdirplus(struct inode *dir) +{ + set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); +} + +/* + * This function is mainly for use by nfs_getattr(). + * + * If this is an 'ls -l', we want to force use of readdirplus. + * Do this by checking if there is an active file descriptor + * and calling nfs_advise_use_readdirplus, then forcing a + * cache flush. + */ +void nfs_force_use_readdirplus(struct inode *dir) +{ + if (!list_empty(&NFS_I(dir)->open_files)) { + nfs_advise_use_readdirplus(dir); + nfs_zap_mapping(dir, dir->i_mapping); + } +} + +static +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) +{ + struct qstr filename = QSTR_INIT(entry->name, entry->len); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = d_inode(parent); + struct inode *inode; + int status; + + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID)) + return; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) + return; + if (filename.name[0] == '.') { + if (filename.len == 1) + return; + if (filename.len == 2 && filename.name[1] == '.') + return; + } + filename.hash = full_name_hash(filename.name, filename.len); + + dentry = d_lookup(parent, &filename); + if (dentry != NULL) { + /* Is there a mountpoint here? If so, just exit */ + if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, + &entry->fattr->fsid)) + goto out; + if (nfs_same_file(dentry, entry)) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + status = nfs_refresh_inode(d_inode(dentry), entry->fattr); + if (!status) + nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label); + goto out; + } else { + d_invalidate(dentry); + dput(dentry); + } + } + + dentry = d_alloc(parent, &filename); + if (dentry == NULL) + return; + + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); + if (IS_ERR(inode)) + goto out; + + alias = d_splice_alias(inode, dentry); + if (IS_ERR(alias)) + goto out; + else if (alias) { + nfs_set_verifier(alias, nfs_save_change_attribute(dir)); + dput(alias); + } else + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + +out: + dput(dentry); +} + +/* Perform conversion from xdr to cache array */ +static +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, + struct page **xdr_pages, struct page *page, unsigned int buflen) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct nfs_cache_array *array; + unsigned int count = 0; + int status; + + scratch = alloc_page(GFP_KERNEL); + if (scratch == NULL) + return -ENOMEM; + + if (buflen == 0) + goto out_nopages; + + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + do { + status = xdr_decode(desc, entry, &stream); + if (status != 0) { + if (status == -EAGAIN) + status = 0; + break; + } + + count++; + + if (desc->plus != 0) + nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; + } while (!entry->eof); + +out_nopages: + if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { + array = nfs_readdir_get_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } else + status = PTR_ERR(array); + } + + put_page(scratch); + return status; +} + +static +void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + put_page(pages[i]); +} + +static +void nfs_readdir_free_large_page(void *ptr, struct page **pages, + unsigned int npages) +{ + nfs_readdir_free_pagearray(pages, npages); +} + +/* + * nfs_readdir_large_page will allocate pages that must be freed with a call + * to nfs_readdir_free_large_page + */ +static +int nfs_readdir_large_page(struct page **pages, unsigned int npages) +{ + unsigned int i; + + for (i = 0; i < npages; i++) { + struct page *page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out_freepages; + pages[i] = page; + } + return 0; + +out_freepages: + nfs_readdir_free_pagearray(pages, i); + return -ENOMEM; +} + +static +int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +{ + struct page *pages[NFS_MAX_READDIR_PAGES]; + void *pages_ptr = NULL; + struct nfs_entry entry; + struct file *file = desc->file; + struct nfs_cache_array *array; + int status = -ENOMEM; + unsigned int array_size = ARRAY_SIZE(pages); + + entry.prev_cookie = 0; + entry.cookie = desc->last_cookie; + entry.eof = 0; + entry.fh = nfs_alloc_fhandle(); + entry.fattr = nfs_alloc_fattr(); + entry.server = NFS_SERVER(inode); + if (entry.fh == NULL || entry.fattr == NULL) + goto out; + + entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(entry.label)) { + status = PTR_ERR(entry.label); + goto out; + } + + array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out_label_free; + } + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + + status = nfs_readdir_large_page(pages, array_size); + if (status < 0) + goto out_release_array; + do { + unsigned int pglen; + status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); + + if (status < 0) + break; + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); + + nfs_readdir_free_large_page(pages_ptr, pages, array_size); +out_release_array: + nfs_readdir_release_array(page); +out_label_free: + nfs4_label_free(entry.label); +out: + nfs_free_fattr(entry.fattr); + nfs_free_fhandle(entry.fh); + return status; +} + +/* + * Now we cache directories properly, by converting xdr information + * to an array that can be used for lookups later. This results in + * fewer cache pages, since we can store more information on each page. + * We only need to convert from xdr once so future lookups are much simpler + */ +static +int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) +{ + struct inode *inode = file_inode(desc->file); + int ret; + + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; + SetPageUptodate(page); + + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { + /* Should never happen */ + nfs_zap_mapping(inode, inode->i_mapping); + } + unlock_page(page); + return 0; + error: + unlock_page(page); + return ret; +} + +static +void cache_page_release(nfs_readdir_descriptor_t *desc) +{ + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); + page_cache_release(desc->page); + desc->page = NULL; +} + +static +struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +{ + return read_cache_page(file_inode(desc->file)->i_mapping, + desc->page_index, (filler_t *)nfs_readdir_filler, desc); +} + +/* + * Returns 0 if desc->dir_cookie was found on page desc->page_index + */ +static +int find_cache_page(nfs_readdir_descriptor_t *desc) +{ + int res; + + desc->page = get_cache_page(desc); + if (IS_ERR(desc->page)) + return PTR_ERR(desc->page); + + res = nfs_readdir_search_array(desc); + if (res != 0) + cache_page_release(desc); + return res; +} + +/* Search for desc->dir_cookie from the beginning of the page cache */ +static inline +int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +{ + int res; + + if (desc->page_index == 0) { + desc->current_index = 0; + desc->last_cookie = 0; + } + do { + res = find_cache_page(desc); + } while (res == -EAGAIN); + return res; +} + +/* + * Once we've found the start of the dirent within a page: fill 'er up... + */ +static +int nfs_do_filldir(nfs_readdir_descriptor_t *desc) +{ + struct file *file = desc->file; + int i = 0; + int res = 0; + struct nfs_cache_array *array = NULL; + struct nfs_open_dir_context *ctx = file->private_data; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } + + for (i = desc->cache_entry_index; i < array->size; i++) { + struct nfs_cache_array_entry *ent; + + ent = &array->array[i]; + if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + nfs_compat_user_ino64(ent->ino), ent->d_type)) { + desc->eof = 1; + break; + } + desc->ctx->pos++; + if (i < (array->size-1)) + *desc->dir_cookie = array->array[i+1].cookie; + else + *desc->dir_cookie = array->last_cookie; + if (ctx->duped != 0) + ctx->duped = 1; + } + if (array->eof_index >= 0) + desc->eof = 1; + + nfs_readdir_release_array(desc->page); +out: + cache_page_release(desc); + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", + (unsigned long long)*desc->dir_cookie, res); + return res; +} + +/* + * If we cannot find a cookie in our cache, we suspect that this is + * because it points to a deleted file, so we ask the server to return + * whatever it thinks is the next entry. We then feed this to filldir. + * If all goes well, we should then be able to find our way round the + * cache on the next call to readdir_search_pagecache(); + * + * NOTE: we cannot add the anonymous page to the pagecache because + * the data it contains might not be page aligned. Besides, + * we should already have a complete representation of the + * directory in the page cache by the time we get here. + */ +static inline +int uncached_readdir(nfs_readdir_descriptor_t *desc) +{ + struct page *page = NULL; + int status; + struct inode *inode = file_inode(desc->file); + struct nfs_open_dir_context *ctx = desc->file->private_data; + + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", + (unsigned long long)*desc->dir_cookie); + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + status = -ENOMEM; + goto out; + } + + desc->page_index = 0; + desc->last_cookie = *desc->dir_cookie; + desc->page = page; + ctx->duped = 0; + + status = nfs_readdir_xdr_to_array(desc, page, inode); + if (status < 0) + goto out_release; + + status = nfs_do_filldir(desc); + + out: + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", + __func__, status); + return status; + out_release: + cache_page_release(desc); + goto out; +} + +static bool nfs_dir_mapping_need_revalidate(struct inode *dir) +{ + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_attribute_cache_expired(dir)) + return true; + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return true; + return false; +} + +/* The file offset position represents the dirent entry number. A + last cookie cache takes care of the common case of reading the + whole directory. + */ +static int nfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + nfs_readdir_descriptor_t my_desc, + *desc = &my_desc; + struct nfs_open_dir_context *dir_ctx = file->private_data; + int res = 0; + + dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", + file, (long long)ctx->pos); + nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); + + /* + * ctx->pos points to the dirent entry number. + * *desc->dir_cookie has the cookie for the next entry. We have + * to either find the entry with the appropriate number or + * revalidate the cookie. + */ + memset(desc, 0, sizeof(*desc)); + + desc->file = file; + desc->ctx = ctx; + desc->dir_cookie = &dir_ctx->dir_cookie; + desc->decode = NFS_PROTO(inode)->decode_dirent; + desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; + + nfs_block_sillyrename(dentry); + if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) + res = nfs_revalidate_mapping(inode, file->f_mapping); + if (res < 0) + goto out; + + do { + res = readdir_search_pagecache(desc); + + if (res == -EBADCOOKIE) { + res = 0; + /* This means either end of directory */ + if (*desc->dir_cookie && desc->eof == 0) { + /* Or that the server has 'lost' a cookie */ + res = uncached_readdir(desc); + if (res == 0) + continue; + } + break; + } + if (res == -ETOOSMALL && desc->plus) { + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + nfs_zap_caches(inode); + desc->page_index = 0; + desc->plus = 0; + desc->eof = 0; + continue; + } + if (res < 0) + break; + + res = nfs_do_filldir(desc); + if (res < 0) + break; + } while (!desc->eof); +out: + nfs_unblock_sillyrename(dentry); + if (res > 0) + res = 0; + dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); + return res; +} + +static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) +{ + struct inode *inode = file_inode(filp); + struct nfs_open_dir_context *dir_ctx = filp->private_data; + + dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", + filp, offset, whence); + + mutex_lock(&inode->i_mutex); + switch (whence) { + case 1: + offset += filp->f_pos; + case 0: + if (offset >= 0) + break; + default: + offset = -EINVAL; + goto out; + } + if (offset != filp->f_pos) { + filp->f_pos = offset; + dir_ctx->dir_cookie = 0; + dir_ctx->duped = 0; + } +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +/* + * All directory operations under NFS are synchronous, so fsync() + * is a dummy operation. + */ +static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file_inode(filp); + + dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); + + mutex_lock(&inode->i_mutex); + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); + mutex_unlock(&inode->i_mutex); + return 0; +} + +/** + * nfs_force_lookup_revalidate - Mark the directory as having changed + * @dir - pointer to directory inode + * + * This forces the revalidation code in nfs_lookup_revalidate() to do a + * full lookup on all child dentries of 'dir' whenever a change occurs + * on the server that might have invalidated our dcache. + * + * The caller should be holding dir->i_lock + */ +void nfs_force_lookup_revalidate(struct inode *dir) +{ + NFS_I(dir)->cache_change_attribute++; +} +EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); + +/* + * A check for whether or not the parent directory has changed. + * In the case it has, we assume that the dentries are untrustworthy + * and may need to be looked up again. + * If rcu_walk prevents us from performing a full check, return 0. + */ +static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, + int rcu_walk) +{ + int ret; + + if (IS_ROOT(dentry)) + return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; + if (!nfs_verify_change_attribute(dir, dentry->d_time)) + return 0; + /* Revalidate nfsi->cache_change_attribute before we declare a match */ + if (rcu_walk) + ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir); + else + ret = nfs_revalidate_inode(NFS_SERVER(dir), dir); + if (ret < 0) + return 0; + if (!nfs_verify_change_attribute(dir, dentry->d_time)) + return 0; + return 1; +} + +/* + * Use intent information to check whether or not we're going to do + * an O_EXCL create using this path component. + */ +static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) +{ + if (NFS_PROTO(dir)->version == 2) + return 0; + return flags & LOOKUP_EXCL; +} + +/* + * Inode and filehandle revalidation for lookups. + * + * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, + * or if the intent information indicates that we're about to open this + * particular file and the "nocto" mount flag is not set. + * + */ +static +int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (IS_AUTOMOUNT(inode)) + return 0; + /* VFS wants an on-the-wire revalidation */ + if (flags & LOOKUP_REVAL) + goto out_force; + /* This is an open(2) */ + if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + goto out_force; +out: + return (inode->i_nlink == 0) ? -ENOENT : 0; +out_force: + if (flags & LOOKUP_RCU) + return -ECHILD; + ret = __nfs_revalidate_inode(server, inode); + if (ret != 0) + return ret; + goto out; +} + +/* + * We judge how long we want to trust negative + * dentries by looking at the parent inode mtime. + * + * If parent mtime has changed, we revalidate, else we wait for a + * period corresponding to the parent's attribute cache timeout value. + * + * If LOOKUP_RCU prevents us from performing a full check, return 1 + * suggesting a reval is needed. + */ +static inline +int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + /* Don't revalidate a negative dentry if we're creating a new file */ + if (flags & LOOKUP_CREATE) + return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; + return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); +} + +/* + * This is called every time the dcache has a lookup hit, + * and we should check whether we can really trust that + * lookup. + * + * NOTE! The hit can be a negative hit too, don't assume + * we have an inode! + * + * If the parent directory is seen to have changed, we throw out the + * cached dentry and do a new lookup. + */ +static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *dir; + struct inode *inode; + struct dentry *parent; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; + int error; + + if (flags & LOOKUP_RCU) { + parent = ACCESS_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(dentry); + dir = d_inode(parent); + } + nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); + inode = d_inode(dentry); + + if (!inode) { + if (nfs_neg_need_reval(dir, dentry, flags)) { + if (flags & LOOKUP_RCU) + return -ECHILD; + goto out_bad; + } + goto out_valid_noent; + } + + if (is_bad_inode(inode)) { + if (flags & LOOKUP_RCU) + return -ECHILD; + dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", + __func__, dentry); + goto out_bad; + } + + if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + goto out_set_verifier; + + /* Force a full look up iff the parent directory has changed */ + if (!nfs_is_exclusive_create(dir, flags) && + nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { + + if (nfs_lookup_verify_inode(inode, flags)) { + if (flags & LOOKUP_RCU) + return -ECHILD; + goto out_zap_parent; + } + goto out_valid; + } + + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (NFS_STALE(inode)) + goto out_bad; + + error = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out_error; + + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(label)) + goto out_error; + + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); + if (error) + goto out_bad; + if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out_bad; + if ((error = nfs_refresh_inode(inode, fattr)) != 0) + goto out_bad; + + nfs_setsecurity(inode, fattr, label); + + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); + +out_set_verifier: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_valid: + /* Success: notify readdir to use READDIRPLUS */ + nfs_advise_use_readdirplus(dir); + out_valid_noent: + if (flags & LOOKUP_RCU) { + if (parent != ACCESS_ONCE(dentry->d_parent)) + return -ECHILD; + } else + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", + __func__, dentry); + return 1; +out_zap_parent: + nfs_zap_caches(dir); + out_bad: + WARN_ON(flags & LOOKUP_RCU); + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); + nfs_mark_for_revalidate(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ + nfs_zap_caches(inode); + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (IS_ROOT(dentry)) + goto out_valid; + } + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", + __func__, dentry); + return 0; +out_error: + WARN_ON(flags & LOOKUP_RCU); + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", + __func__, dentry, error); + return error; +} + +/* + * A weaker form of d_revalidate for revalidating just the d_inode(dentry) + * when we don't really care about the dentry name. This is called when a + * pathwalk ends on a dentry that was not found via a normal lookup in the + * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). + * + * In this situation, we just want to verify that the inode itself is OK + * since the dentry might have changed on the server. + */ +static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + int error; + struct inode *inode = d_inode(dentry); + + /* + * I believe we can only get a negative dentry here in the case of a + * procfs-style symlink. Just assume it's correct for now, but we may + * eventually need to do something more here. + */ + if (!inode) { + dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n", + __func__, dentry); + return 1; + } + + if (is_bad_inode(inode)) { + dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", + __func__, dentry); + return 0; + } + + error = nfs_revalidate_inode(NFS_SERVER(inode), inode); + dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", + __func__, inode->i_ino, error ? "invalid" : "valid"); + return !error; +} + +/* + * This is called from dput() when d_count is going to 0. + */ +static int nfs_dentry_delete(const struct dentry *dentry) +{ + dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n", + dentry, dentry->d_flags); + + /* Unhash any dentry with a stale inode */ + if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry))) + return 1; + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + /* Unhash it, so that ->d_iput() would be called */ + return 1; + } + if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + /* Unhash it, so that ancestors of killed async unlink + * files will be cleaned up during umount */ + return 1; + } + return 0; + +} + +/* Ensure that we revalidate inode->i_nlink */ +static void nfs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + /* drop the inode if we're reasonably sure this is the last link */ + if (inode->i_nlink == 1) + clear_nlink(inode); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); +} + +/* + * Called when the dentry loses inode. + * We use it to clean up silly-renamed files. + */ +static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + /* drop any readdir cache as it could easily be old */ + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + nfs_complete_unlink(dentry, inode); + nfs_drop_nlink(inode); + } + iput(inode); +} + +static void nfs_d_release(struct dentry *dentry) +{ + /* free cached devname value, if it survived that far */ + if (unlikely(dentry->d_fsdata)) { + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + WARN_ON(1); + else + kfree(dentry->d_fsdata); + } +} + +const struct dentry_operations nfs_dentry_operations = { + .d_revalidate = nfs_lookup_revalidate, + .d_weak_revalidate = nfs_weak_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, +}; +EXPORT_SYMBOL_GPL(nfs_dentry_operations); + +struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +{ + struct dentry *res; + struct dentry *parent; + struct inode *inode = NULL; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; + int error; + + dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); + nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); + + res = ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + + /* + * If we're doing an exclusive create, optimize away the lookup + * but don't hash the dentry. + */ + if (nfs_is_exclusive_create(dir, flags)) { + d_instantiate(dentry, NULL); + res = NULL; + goto out; + } + + res = ERR_PTR(-ENOMEM); + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out; + + label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT); + if (IS_ERR(label)) + goto out; + + parent = dentry->d_parent; + /* Protect against concurrent sillydeletes */ + trace_nfs_lookup_enter(dir, dentry, flags); + nfs_block_sillyrename(parent); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + if (error == -ENOENT) + goto no_entry; + if (error < 0) { + res = ERR_PTR(error); + goto out_unblock_sillyrename; + } + inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); + res = ERR_CAST(inode); + if (IS_ERR(res)) + goto out_unblock_sillyrename; + + /* Success: notify readdir to use READDIRPLUS */ + nfs_advise_use_readdirplus(dir); + +no_entry: + res = d_splice_alias(inode, dentry); + if (res != NULL) { + if (IS_ERR(res)) + goto out_unblock_sillyrename; + dentry = res; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +out_unblock_sillyrename: + nfs_unblock_sillyrename(parent); + trace_nfs_lookup_exit(dir, dentry, flags, error); + nfs4_label_free(label); +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + return res; +} +EXPORT_SYMBOL_GPL(nfs_lookup); + +#if IS_ENABLED(CONFIG_NFS_V4) +static int nfs4_lookup_revalidate(struct dentry *, unsigned int); + +const struct dentry_operations nfs4_dentry_operations = { + .d_revalidate = nfs4_lookup_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, +}; +EXPORT_SYMBOL_GPL(nfs4_dentry_operations); + +static fmode_t flags_to_mode(int flags) +{ + fmode_t res = (__force fmode_t)flags & FMODE_EXEC; + if ((flags & O_ACCMODE) != O_WRONLY) + res |= FMODE_READ; + if ((flags & O_ACCMODE) != O_RDONLY) + res |= FMODE_WRITE; + return res; +} + +static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) +{ + return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); +} + +static int do_open(struct inode *inode, struct file *filp) +{ + nfs_fscache_open_file(inode, filp); + return 0; +} + +static int nfs_finish_open(struct nfs_open_context *ctx, + struct dentry *dentry, + struct file *file, unsigned open_flags, + int *opened) +{ + int err; + + if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + *opened |= FILE_CREATED; + + err = finish_open(file, dentry, do_open, opened); + if (err) + goto out; + nfs_file_set_open_context(file, ctx); + +out: + return err; +} + +int nfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode, int *opened) +{ + struct nfs_open_context *ctx; + struct dentry *res; + struct iattr attr = { .ia_valid = ATTR_OPEN }; + struct inode *inode; + unsigned int lookup_flags = 0; + int err; + + /* Expect a negative dentry */ + BUG_ON(d_inode(dentry)); + + dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); + + err = nfs_check_flags(open_flags); + if (err) + return err; + + /* NFS only supports OPEN on regular files */ + if ((open_flags & O_DIRECTORY)) { + if (!d_unhashed(dentry)) { + /* + * Hashed negative dentry with O_DIRECTORY: dentry was + * revalidated and is fine, no need to perform lookup + * again + */ + return -ENOENT; + } + lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY; + goto no_open; + } + + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + return -ENAMETOOLONG; + + if (open_flags & O_CREAT) { + attr.ia_valid |= ATTR_MODE; + attr.ia_mode = mode & ~current_umask(); + } + if (open_flags & O_TRUNC) { + attr.ia_valid |= ATTR_SIZE; + attr.ia_size = 0; + } + + ctx = create_nfs_open_context(dentry, open_flags); + err = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + + trace_nfs_atomic_open_enter(dir, ctx, open_flags); + nfs_block_sillyrename(dentry->d_parent); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); + nfs_unblock_sillyrename(dentry->d_parent); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); + put_nfs_open_context(ctx); + switch (err) { + case -ENOENT: + d_drop(dentry); + d_add(dentry, NULL); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + break; + case -EISDIR: + case -ENOTDIR: + goto no_open; + case -ELOOP: + if (!(open_flags & O_NOFOLLOW)) + goto no_open; + break; + /* case -EINVAL: */ + default: + break; + } + goto out; + } + + err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened); + trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); + put_nfs_open_context(ctx); +out: + return err; + +no_open: + res = nfs_lookup(dir, dentry, lookup_flags); + err = PTR_ERR(res); + if (IS_ERR(res)) + goto out; + + return finish_no_open(file, res); +} +EXPORT_SYMBOL_GPL(nfs_atomic_open); + +static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *inode; + int ret = 0; + + if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) + goto no_open; + if (d_mountpoint(dentry)) + goto no_open; + if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) + goto no_open; + + inode = d_inode(dentry); + + /* We can't create new files in nfs_open_revalidate(), so we + * optimize away revalidation of negative dentries. + */ + if (inode == NULL) { + struct dentry *parent; + struct inode *dir; + + if (flags & LOOKUP_RCU) { + parent = ACCESS_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(dentry); + dir = d_inode(parent); + } + if (!nfs_neg_need_reval(dir, dentry, flags)) + ret = 1; + else if (flags & LOOKUP_RCU) + ret = -ECHILD; + if (!(flags & LOOKUP_RCU)) + dput(parent); + else if (parent != ACCESS_ONCE(dentry->d_parent)) + return -ECHILD; + goto out; + } + + /* NFS only supports OPEN on regular files */ + if (!S_ISREG(inode->i_mode)) + goto no_open; + /* We cannot do exclusive creation on a positive dentry */ + if (flags & LOOKUP_EXCL) + goto no_open; + + /* Let f_op->open() actually open (and revalidate) the file */ + ret = 1; + +out: + return ret; + +no_open: + return nfs_lookup_revalidate(dentry, flags); +} + +#endif /* CONFIG_NFSV4 */ + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + struct dentry *parent = dget_parent(dentry); + struct inode *dir = d_inode(parent); + struct inode *inode; + int error = -EACCES; + + d_drop(dentry); + + /* We may have been initialized further down */ + if (d_really_is_positive(dentry)) + goto out; + if (fhandle->size == 0) { + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); + if (error) + goto out_error; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (!(fattr->valid & NFS_ATTR_FATTR)) { + struct nfs_server *server = NFS_SB(dentry->d_sb); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL); + if (error < 0) + goto out_error; + } + inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); + error = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_error; + d_add(dentry, inode); +out: + dput(parent); + return 0; +out_error: + nfs_mark_for_revalidate(dir); + dput(parent); + return error; +} +EXPORT_SYMBOL_GPL(nfs_instantiate); + +/* + * Following a failed create operation, we drop the dentry rather + * than retain a negative dentry. This avoids a problem in the event + * that the operation succeeded on the server, but an error in the + * reply path made it appear to have failed. + */ +int nfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct iattr attr; + int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; + int error; + + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + trace_nfs_create_enter(dir, dentry, open_flags); + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); + trace_nfs_create_exit(dir, dentry, open_flags, error); + if (error != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return error; +} +EXPORT_SYMBOL_GPL(nfs_create); + +/* + * See comments for nfs_proc_create regarding failed operations. + */ +int +nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct iattr attr; + int status; + + dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + trace_nfs_mknod_enter(dir, dentry); + status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); + trace_nfs_mknod_exit(dir, dentry, status); + if (status != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return status; +} +EXPORT_SYMBOL_GPL(nfs_mknod); + +/* + * See comments for nfs_proc_create regarding failed operations. + */ +int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct iattr attr; + int error; + + dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); + + attr.ia_valid = ATTR_MODE; + attr.ia_mode = mode | S_IFDIR; + + trace_nfs_mkdir_enter(dir, dentry); + error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + trace_nfs_mkdir_exit(dir, dentry, error); + if (error != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return error; +} +EXPORT_SYMBOL_GPL(nfs_mkdir); + +static void nfs_dentry_handle_enoent(struct dentry *dentry) +{ + if (d_really_is_positive(dentry) && !d_unhashed(dentry)) + d_delete(dentry); +} + +int nfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); + + trace_nfs_rmdir_enter(dir, dentry); + if (d_really_is_positive(dentry)) { + nfs_wait_on_sillyrename(dentry); + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + /* Ensure the VFS deletes this inode */ + switch (error) { + case 0: + clear_nlink(d_inode(dentry)); + break; + case -ENOENT: + nfs_dentry_handle_enoent(dentry); + } + } else + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + trace_nfs_rmdir_exit(dir, dentry, error); + + return error; +} +EXPORT_SYMBOL_GPL(nfs_rmdir); + +/* + * Remove a file after making sure there are no pending writes, + * and after checking that the file has only one user. + * + * We invalidate the attribute cache and free the inode prior to the operation + * to avoid possible races if the server reuses the inode. + */ +static int nfs_safe_remove(struct dentry *dentry) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = d_inode(dentry); + int error = -EBUSY; + + dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry); + + /* If the dentry was sillyrenamed, we simply call d_delete() */ + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + error = 0; + goto out; + } + + trace_nfs_remove_enter(dir, dentry); + if (inode != NULL) { + NFS_PROTO(inode)->return_delegation(inode); + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + if (error == 0) + nfs_drop_nlink(inode); + } else + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + if (error == -ENOENT) + nfs_dentry_handle_enoent(dentry); + trace_nfs_remove_exit(dir, dentry, error); +out: + return error; +} + +/* We do silly rename. In case sillyrename() returns -EBUSY, the inode + * belongs to an active ".nfs..." file and we return -EBUSY. + * + * If sillyrename() returns 0, we do nothing, otherwise we unlink. + */ +int nfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + int need_rehash = 0; + + dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, + dir->i_ino, dentry); + + trace_nfs_unlink_enter(dir, dentry); + spin_lock(&dentry->d_lock); + if (d_count(dentry) > 1) { + spin_unlock(&dentry->d_lock); + /* Start asynchronous writeout of the inode */ + write_inode_now(d_inode(dentry), 0); + error = nfs_sillyrename(dir, dentry); + goto out; + } + if (!d_unhashed(dentry)) { + __d_drop(dentry); + need_rehash = 1; + } + spin_unlock(&dentry->d_lock); + error = nfs_safe_remove(dentry); + if (!error || error == -ENOENT) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + } else if (need_rehash) + d_rehash(dentry); +out: + trace_nfs_unlink_exit(dir, dentry, error); + return error; +} +EXPORT_SYMBOL_GPL(nfs_unlink); + +/* + * To create a symbolic link, most file systems instantiate a new inode, + * add a page to it containing the path, then write it out to the disk + * using prepare_write/commit_write. + * + * Unfortunately the NFS client can't create the in-core inode first + * because it needs a file handle to create an in-core inode (see + * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the + * symlink request has completed on the server. + * + * So instead we allocate a raw page, copy the symname into it, then do + * the SYMLINK request with the page as the buffer. If it succeeds, we + * now have a new file handle and can instantiate an in-core NFS inode + * and move the raw page into its mapping. + */ +int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + struct page *page; + char *kaddr; + struct iattr attr; + unsigned int pathlen = strlen(symname); + int error; + + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry, symname); + + if (pathlen > PAGE_SIZE) + return -ENAMETOOLONG; + + attr.ia_mode = S_IFLNK | S_IRWXUGO; + attr.ia_valid = ATTR_MODE; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + kaddr = kmap_atomic(page); + memcpy(kaddr, symname, pathlen); + if (pathlen < PAGE_SIZE) + memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); + kunmap_atomic(kaddr); + + trace_nfs_symlink_enter(dir, dentry); + error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + trace_nfs_symlink_exit(dir, dentry, error); + if (error != 0) { + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", + dir->i_sb->s_id, dir->i_ino, + dentry, symname, error); + d_drop(dentry); + __free_page(page); + return error; + } + + /* + * No big deal if we can't add this page to the page cache here. + * READLINK will get the missing page from the server if needed. + */ + if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0, + GFP_KERNEL)) { + SetPageUptodate(page); + unlock_page(page); + /* + * add_to_page_cache_lru() grabs an extra page refcount. + * Drop it here to avoid leaking this page later. + */ + page_cache_release(page); + } else + __free_page(page); + + return 0; +} +EXPORT_SYMBOL_GPL(nfs_symlink); + +int +nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + int error; + + dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", + old_dentry, dentry); + + trace_nfs_link_enter(inode, dir, dentry); + NFS_PROTO(inode)->return_delegation(inode); + + d_drop(dentry); + error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); + if (error == 0) { + ihold(inode); + d_add(dentry, inode); + } + trace_nfs_link_exit(inode, dir, dentry, error); + return error; +} +EXPORT_SYMBOL_GPL(nfs_link); + +/* + * RENAME + * FIXME: Some nfsds, like the Linux user space nfsd, may generate a + * different file handle for the same inode after a rename (e.g. when + * moving to a different directory). A fail-safe method to do so would + * be to look up old_dir/old_name, create a link to new_dir/new_name and + * rename the old file using the sillyrename stuff. This way, the original + * file in old_dir will go away when the last process iput()s the inode. + * + * FIXED. + * + * It actually works quite well. One needs to have the possibility for + * at least one ".nfs..." file in each directory the file ever gets + * moved or linked to which happens automagically with the new + * implementation that only depends on the dcache stuff instead of + * using the inode layer + * + * Unfortunately, things are a little more complicated than indicated + * above. For a cross-directory move, we want to make sure we can get + * rid of the old inode after the operation. This means there must be + * no pending writes (if it's a file), and the use count must be 1. + * If these conditions are met, we can drop the dentries before doing + * the rename. + */ +int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct dentry *dentry = NULL, *rehash = NULL; + struct rpc_task *task; + int error = -EBUSY; + + dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", + old_dentry, new_dentry, + d_count(new_dentry)); + + trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry); + /* + * For non-directories, check whether the target is busy and if so, + * make a copy of the dentry and then do a silly-rename. If the + * silly-rename succeeds, the copied dentry is hashed and becomes + * the new target. + */ + if (new_inode && !S_ISDIR(new_inode->i_mode)) { + /* + * To prevent any new references to the target during the + * rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } + + if (d_count(new_dentry) > 2) { + int err; + + /* copy the target dentry's name */ + dentry = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!dentry) + goto out; + + /* silly-rename the existing target ... */ + err = nfs_sillyrename(new_dir, new_dentry); + if (err) + goto out; + + new_dentry = dentry; + rehash = NULL; + new_inode = NULL; + } + } + + NFS_PROTO(old_inode)->return_delegation(old_inode); + if (new_inode != NULL) + NFS_PROTO(new_inode)->return_delegation(new_inode); + + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + if (IS_ERR(task)) { + error = PTR_ERR(task); + goto out; + } + + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); + nfs_mark_for_revalidate(old_inode); +out: + if (rehash) + d_rehash(rehash); + trace_nfs_rename_exit(old_dir, old_dentry, + new_dir, new_dentry, error); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + + /* new dentry created? */ + if (dentry) + dput(dentry); + return error; +} +EXPORT_SYMBOL_GPL(nfs_rename); + +static DEFINE_SPINLOCK(nfs_access_lru_lock); +static LIST_HEAD(nfs_access_lru_list); +static atomic_long_t nfs_access_nr_entries; + +static unsigned long nfs_access_max_cachesize = ULONG_MAX; +module_param(nfs_access_max_cachesize, ulong, 0644); +MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length"); + +static void nfs_access_free_entry(struct nfs_access_entry *entry) +{ + put_rpccred(entry->cred); + kfree_rcu(entry, rcu_head); + smp_mb__before_atomic(); + atomic_long_dec(&nfs_access_nr_entries); + smp_mb__after_atomic(); +} + +static void nfs_access_free_list(struct list_head *head) +{ + struct nfs_access_entry *cache; + + while (!list_empty(head)) { + cache = list_entry(head->next, struct nfs_access_entry, lru); + list_del(&cache->lru); + nfs_access_free_entry(cache); + } +} + +static unsigned long +nfs_do_access_cache_scan(unsigned int nr_to_scan) +{ + LIST_HEAD(head); + struct nfs_inode *nfsi, *next; + struct nfs_access_entry *cache; + long freed = 0; + + spin_lock(&nfs_access_lru_lock); + list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { + struct inode *inode; + + if (nr_to_scan-- == 0) + break; + inode = &nfsi->vfs_inode; + spin_lock(&inode->i_lock); + if (list_empty(&nfsi->access_cache_entry_lru)) + goto remove_lru_entry; + cache = list_entry(nfsi->access_cache_entry_lru.next, + struct nfs_access_entry, lru); + list_move(&cache->lru, &head); + rb_erase(&cache->rb_node, &nfsi->access_cache); + freed++; + if (!list_empty(&nfsi->access_cache_entry_lru)) + list_move_tail(&nfsi->access_cache_inode_lru, + &nfs_access_lru_list); + else { +remove_lru_entry: + list_del_init(&nfsi->access_cache_inode_lru); + smp_mb__before_atomic(); + clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); + smp_mb__after_atomic(); + } + spin_unlock(&inode->i_lock); + } + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); + return freed; +} + +unsigned long +nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) + return SHRINK_STOP; + return nfs_do_access_cache_scan(nr_to_scan); +} + + +unsigned long +nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); +} + +static void +nfs_access_cache_enforce_limit(void) +{ + long nr_entries = atomic_long_read(&nfs_access_nr_entries); + unsigned long diff; + unsigned int nr_to_scan; + + if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize) + return; + nr_to_scan = 100; + diff = nr_entries - nfs_access_max_cachesize; + if (diff < nr_to_scan) + nr_to_scan = diff; + nfs_do_access_cache_scan(nr_to_scan); +} + +static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) +{ + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node *n; + struct nfs_access_entry *entry; + + /* Unhook entries from the cache */ + while ((n = rb_first(root_node)) != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + rb_erase(n, root_node); + list_move(&entry->lru, head); + } + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; +} + +void nfs_access_zap_cache(struct inode *inode) +{ + LIST_HEAD(head); + + if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) + return; + /* Remove from global LRU init */ + spin_lock(&nfs_access_lru_lock); + if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_del_init(&NFS_I(inode)->access_cache_inode_lru); + + spin_lock(&inode->i_lock); + __nfs_access_zap_cache(NFS_I(inode), &head); + spin_unlock(&inode->i_lock); + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); +} +EXPORT_SYMBOL_GPL(nfs_access_zap_cache); + +static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +{ + struct rb_node *n = NFS_I(inode)->access_cache.rb_node; + struct nfs_access_entry *entry; + + while (n != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + + if (cred < entry->cred) + n = n->rb_left; + else if (cred > entry->cred) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_access_entry *cache; + int err = -ENOENT; + + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + if (cache == NULL) + goto out; + if (!nfs_have_delegated_attributes(inode) && + !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + goto out_stale; + res->jiffies = cache->jiffies; + res->cred = cache->cred; + res->mask = cache->mask; + list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); + err = 0; +out: + spin_unlock(&inode->i_lock); + return err; +out_stale: + rb_erase(&cache->rb_node, &nfsi->access_cache); + list_del(&cache->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(cache); + return -ENOENT; +out_zap: + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + return -ENOENT; +} + +static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +{ + /* Only check the most recently returned cache entry, + * but do it without locking. + */ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_access_entry *cache; + int err = -ECHILD; + struct list_head *lh; + + rcu_read_lock(); + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out; + lh = rcu_dereference(nfsi->access_cache_entry_lru.prev); + cache = list_entry(lh, struct nfs_access_entry, lru); + if (lh == &nfsi->access_cache_entry_lru || + cred != cache->cred) + cache = NULL; + if (cache == NULL) + goto out; + if (!nfs_have_delegated_attributes(inode) && + !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + goto out; + res->jiffies = cache->jiffies; + res->cred = cache->cred; + res->mask = cache->mask; + err = 0; +out: + rcu_read_unlock(); + return err; +} + +static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node **p = &root_node->rb_node; + struct rb_node *parent = NULL; + struct nfs_access_entry *entry; + + spin_lock(&inode->i_lock); + while (*p != NULL) { + parent = *p; + entry = rb_entry(parent, struct nfs_access_entry, rb_node); + + if (set->cred < entry->cred) + p = &parent->rb_left; + else if (set->cred > entry->cred) + p = &parent->rb_right; + else + goto found; + } + rb_link_node(&set->rb_node, parent, p); + rb_insert_color(&set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + spin_unlock(&inode->i_lock); + return; +found: + rb_replace_node(parent, &set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + list_del(&entry->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(entry); +} + +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); + if (cache == NULL) + return; + RB_CLEAR_NODE(&cache->rb_node); + cache->jiffies = set->jiffies; + cache->cred = get_rpccred(set->cred); + cache->mask = set->mask; + + /* The above field assignments must be visible + * before this item appears on the lru. We cannot easily + * use rcu_assign_pointer, so just force the memory barrier. + */ + smp_wmb(); + nfs_access_add_rbtree(inode, cache); + + /* Update accounting */ + smp_mb__before_atomic(); + atomic_long_inc(&nfs_access_nr_entries); + smp_mb__after_atomic(); + + /* Add inode to global LRU list */ + if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + spin_lock(&nfs_access_lru_lock); + if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_add_tail(&NFS_I(inode)->access_cache_inode_lru, + &nfs_access_lru_list); + spin_unlock(&nfs_access_lru_lock); + } + nfs_access_cache_enforce_limit(); +} +EXPORT_SYMBOL_GPL(nfs_access_add_cache); + +void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) +{ + entry->mask = 0; + if (access_result & NFS4_ACCESS_READ) + entry->mask |= MAY_READ; + if (access_result & + (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; +} +EXPORT_SYMBOL_GPL(nfs_access_set_mask); + +static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) +{ + struct nfs_access_entry cache; + int status; + + trace_nfs_access_enter(inode); + + status = nfs_access_get_cached_rcu(inode, cred, &cache); + if (status != 0) + status = nfs_access_get_cached(inode, cred, &cache); + if (status == 0) + goto out_cached; + + status = -ECHILD; + if (mask & MAY_NOT_BLOCK) + goto out; + + /* Be clever: ask server to check for all possible rights */ + cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; + cache.cred = cred; + cache.jiffies = jiffies; + status = NFS_PROTO(inode)->access(inode, &cache); + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + goto out; + } + nfs_access_add_cache(inode, &cache); +out_cached: + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) + status = -EACCES; +out: + trace_nfs_access_exit(inode, status); + return status; +} + +static int nfs_open_permission_mask(int openflags) +{ + int mask = 0; + + if (openflags & __FMODE_EXEC) { + /* ONLY check exec rights */ + mask = MAY_EXEC; + } else { + if ((openflags & O_ACCMODE) != O_WRONLY) + mask |= MAY_READ; + if ((openflags & O_ACCMODE) != O_RDONLY) + mask |= MAY_WRITE; + } + + return mask; +} + +int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) +{ + return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); +} +EXPORT_SYMBOL_GPL(nfs_may_open); + +int nfs_permission(struct inode *inode, int mask) +{ + struct rpc_cred *cred; + int res = 0; + + nfs_inc_stats(inode, NFSIOS_VFSACCESS); + + if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + goto out; + /* Is this sys_access() ? */ + if (mask & (MAY_ACCESS | MAY_CHDIR)) + goto force_lookup; + + switch (inode->i_mode & S_IFMT) { + case S_IFLNK: + goto out; + case S_IFREG: + break; + case S_IFDIR: + /* + * Optimize away all write operations, since the server + * will check permissions when we perform the op. + */ + if ((mask & MAY_WRITE) && !(mask & MAY_READ)) + goto out; + } + +force_lookup: + if (!NFS_PROTO(inode)->access) + goto out_notsup; + + /* Always try fast lookups first */ + rcu_read_lock(); + cred = rpc_lookup_cred_nonblock(); + if (!IS_ERR(cred)) + res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK); + else + res = PTR_ERR(cred); + rcu_read_unlock(); + if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) { + /* Fast lookup failed, try the slow way */ + cred = rpc_lookup_cred(); + if (!IS_ERR(cred)) { + res = nfs_do_access(inode, cred, mask); + put_rpccred(cred); + } else + res = PTR_ERR(cred); + } +out: + if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) + res = -EACCES; + + dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", + inode->i_sb->s_id, inode->i_ino, mask, res); + return res; +out_notsup: + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (res == 0) + res = generic_permission(inode, mask); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_permission); + +/* + * Local variables: + * version-control: t + * kept-new-versions: 5 + * End: + */ diff --git a/kernel/fs/nfs/direct.c b/kernel/fs/nfs/direct.c new file mode 100644 index 000000000..38678d9a5 --- /dev/null +++ b/kernel/fs/nfs/direct.c @@ -0,0 +1,1066 @@ +/* + * linux/fs/nfs/direct.c + * + * Copyright (C) 2003 by Chuck Lever + * + * High-performance uncached I/O for the Linux NFS client + * + * There are important applications whose performance or correctness + * depends on uncached access to file data. Database clusters + * (multiple copies of the same instance running on separate hosts) + * implement their own cache coherency protocol that subsumes file + * system cache protocols. Applications that process datasets + * considerably larger than the client's memory do not always benefit + * from a local cache. A streaming video server, for instance, has no + * need to cache the contents of a file. + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. + * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with + * help from Andrew Morton. + * + * 18 Dec 2001 Initial implementation for 2.4 --cel + * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy + * 08 Jun 2003 Port to 2.5 APIs --cel + * 31 Mar 2004 Handle direct I/O without VFS support --cel + * 15 Sep 2004 Parallel async reads --cel + * 04 May 2005 support O_DIRECT with aio --cel + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "internal.h" +#include "iostat.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static struct kmem_cache *nfs_direct_cachep; + +/* + * This represents a set of asynchronous requests that we're waiting on + */ +struct nfs_direct_mirror { + ssize_t count; +}; + +struct nfs_direct_req { + struct kref kref; /* release manager */ + + /* I/O parameters */ + struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ + struct kiocb * iocb; /* controlling i/o request */ + struct inode * inode; /* target file of i/o */ + + /* completion state */ + atomic_t io_count; /* i/os we're waiting for */ + spinlock_t lock; /* protect completion state */ + + struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; + int mirror_count; + + ssize_t count, /* bytes actually processed */ + bytes_left, /* bytes left to be sent */ + io_start, /* start of IO */ + error; /* any reported error */ + struct completion completion; /* wait for i/o completion */ + + /* commit state */ + struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ + struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ + struct work_struct work; + int flags; +#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ +#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + struct nfs_writeverf verf; /* unstable write verifier */ +}; + +static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; +static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); +static void nfs_direct_write_schedule_work(struct work_struct *work); + +static inline void get_dreq(struct nfs_direct_req *dreq) +{ + atomic_inc(&dreq->io_count); +} + +static inline int put_dreq(struct nfs_direct_req *dreq) +{ + return atomic_dec_and_test(&dreq->io_count); +} + +void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq) +{ + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; +} +EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes); + +static void +nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) +{ + int i; + ssize_t count; + + if (dreq->mirror_count == 1) { + dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; + dreq->count += hdr->good_bytes; + } else { + /* mirrored writes */ + count = dreq->mirrors[hdr->pgio_mirror_idx].count; + if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { + count = hdr->io_start + hdr->good_bytes - dreq->io_start; + dreq->mirrors[hdr->pgio_mirror_idx].count = count; + } + /* update the dreq->count by finding the minimum agreed count from all + * mirrors */ + count = dreq->mirrors[0].count; + + for (i = 1; i < dreq->mirror_count; i++) + count = min(count, dreq->mirrors[i].count); + + dreq->count = count; + } +} + +/* + * nfs_direct_select_verf - select the right verifier + * @dreq - direct request possibly spanning multiple servers + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs + * @commit_idx - commit bucket index for the DS + * + * returns the correct verifier to use given the role of the server + */ +static struct nfs_writeverf * +nfs_direct_select_verf(struct nfs_direct_req *dreq, + struct nfs_client *ds_clp, + int commit_idx) +{ + struct nfs_writeverf *verfp = &dreq->verf; + +#ifdef CONFIG_NFS_V4_1 + if (ds_clp) { + /* pNFS is in use, use the DS verf */ + if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets) + verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf; + else + WARN_ON_ONCE(1); + } +#endif + return verfp; +} + + +/* + * nfs_direct_set_hdr_verf - set the write/commit verifier + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verfs + * + * Set the server's (MDS or DS) "seen" verifier + */ +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); + WARN_ON_ONCE(verfp->committed >= 0); + memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + WARN_ON_ONCE(verfp->committed < 0); +} + +/* + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verf + * + * set the server's "seen" verf if not initialized. + * returns result of comparison between @hdr->verf and the "seen" + * verf of the server used by @hdr (DS or MDS) + */ +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); + if (verfp->committed < 0) { + nfs_direct_set_hdr_verf(dreq, hdr); + return 0; + } + return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +} + +/* + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data + * @dreq - direct request possibly spanning multiple servers + * @data - commit data to validate against previously seen verf + * + * returns result of comparison between @data->verf and the verf of + * the server used by @data (DS or MDS) + */ +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, + struct nfs_commit_data *data) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, data->ds_clp, + data->ds_commit_index); + + /* verifier not set so always fail */ + if (verfp->committed < 0) + return 1; + + return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); +} + +/** + * nfs_direct_IO - NFS address space operation for direct I/O + * @iocb: target I/O control block + * @iov: array of vectors that define I/O buffer + * @pos: offset in file to begin the operation + * @nr_segs: size of iovec array + * + * The presence of this routine in the address space ops vector means + * the NFS client supports direct I/O. However, for most direct IO, we + * shunt off direct read and write requests before the VFS gets them, + * so this method is only ever called for swap. + */ +ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + + /* we only support swap file calling nfs_direct_IO */ + if (!IS_SWAPFILE(inode)) + return 0; + + VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); + + if (iov_iter_rw(iter) == READ) + return nfs_file_direct_read(iocb, iter, pos); + return nfs_file_direct_write(iocb, iter); +} + +static void nfs_direct_release_pages(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + page_cache_release(pages[i]); +} + +void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, + struct nfs_direct_req *dreq) +{ + cinfo->lock = &dreq->inode->i_lock; + cinfo->mds = &dreq->mds_cinfo; + cinfo->ds = &dreq->ds_cinfo; + cinfo->dreq = dreq; + cinfo->completion_ops = &nfs_direct_commit_completion_ops; +} + +static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq, + struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + int mirror_count = 1; + + if (pgio->pg_ops->pg_get_mirror_count) + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + + dreq->mirror_count = mirror_count; +} + +static inline struct nfs_direct_req *nfs_direct_req_alloc(void) +{ + struct nfs_direct_req *dreq; + + dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL); + if (!dreq) + return NULL; + + kref_init(&dreq->kref); + kref_get(&dreq->kref); + init_completion(&dreq->completion); + INIT_LIST_HEAD(&dreq->mds_cinfo.list); + dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ + INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); + dreq->mirror_count = 1; + spin_lock_init(&dreq->lock); + + return dreq; +} + +static void nfs_direct_req_free(struct kref *kref) +{ + struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + + nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo); + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); + if (dreq->ctx != NULL) + put_nfs_open_context(dreq->ctx); + kmem_cache_free(nfs_direct_cachep, dreq); +} + +static void nfs_direct_req_release(struct nfs_direct_req *dreq) +{ + kref_put(&dreq->kref, nfs_direct_req_free); +} + +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +{ + return dreq->bytes_left; +} +EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); + +/* + * Collects and returns the final error value/byte-count. + */ +static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) +{ + ssize_t result = -EIOCBQUEUED; + + /* Async requests don't wait here */ + if (dreq->iocb) + goto out; + + result = wait_for_completion_killable(&dreq->completion); + + if (!result) + result = dreq->error; + if (!result) + result = dreq->count; + +out: + return (ssize_t) result; +} + +/* + * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust + * the iocb is still valid here if this is a synchronous request. + */ +static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) +{ + struct inode *inode = dreq->inode; + + if (dreq->iocb && write) { + loff_t pos = dreq->iocb->ki_pos + dreq->count; + + spin_lock(&inode->i_lock); + if (i_size_read(inode) < pos) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + if (write) + nfs_zap_mapping(inode, inode->i_mapping); + + inode_dio_end(inode); + + if (dreq->iocb) { + long res = (long) dreq->error; + if (!res) + res = (long) dreq->count; + dreq->iocb->ki_complete(dreq->iocb, res, 0); + } + + complete_all(&dreq->completion); + + nfs_direct_req_release(dreq); +} + +static void nfs_direct_readpage_release(struct nfs_page *req) +{ + dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", + d_inode(req->wb_context->dentry)->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), + req->wb_bytes, + (long long)req_offset(req)); + nfs_release_request(req); +} + +static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) +{ + unsigned long bytes = 0; + struct nfs_direct_req *dreq = hdr->dreq; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out_put; + + spin_lock(&dreq->lock); + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) + dreq->error = hdr->error; + else + nfs_direct_good_bytes(dreq, hdr); + + spin_unlock(&dreq->lock); + + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct page *page = req->wb_page; + + if (!PageCompound(page) && bytes < hdr->good_bytes) + set_page_dirty(page); + bytes += req->wb_bytes; + nfs_list_remove_request(req); + nfs_direct_readpage_release(req); + } +out_put: + if (put_dreq(dreq)) + nfs_direct_complete(dreq, false); + hdr->release(hdr); +} + +static void nfs_read_sync_pgio_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_release_request(req); + } +} + +static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) +{ + get_dreq(hdr->dreq); +} + +static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { + .error_cleanup = nfs_read_sync_pgio_error, + .init_hdr = nfs_direct_pgio_init, + .completion = nfs_direct_read_completion, +}; + +/* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. If nfs_readdata_alloc() or get_user_pages() fails, + * bail and stop sending more reads. Read length accounting is + * handled automatically by nfs_direct_read_result(). Otherwise, if + * no requests have been sent, just return an error. + */ + +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, + struct iov_iter *iter, + loff_t pos) +{ + struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; + ssize_t result = -EINVAL; + size_t requested_bytes = 0; + size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); + + nfs_pageio_init_read(&desc, dreq->inode, false, + &nfs_direct_read_completion_ops); + get_dreq(dreq); + desc.pg_dreq = dreq; + inode_dio_begin(inode); + + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + + result = iov_iter_get_pages_alloc(iter, &pagevec, + rsize, &pgbase); + if (result < 0) + break; + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + /* XXX do we need to do the eof zeroing found in async_filler? */ + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; + nfs_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + dreq->bytes_left -= req_len; + } + nfs_direct_release_pages(pagevec, npages); + kvfree(pagevec); + if (result < 0) + break; + } + + nfs_pageio_complete(&desc); + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + inode_dio_end(inode); + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + + if (put_dreq(dreq)) + nfs_direct_complete(dreq, false); + return 0; +} + +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iter: vector of user buffers into which to read data + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; + ssize_t result = -EINVAL; + size_t count = iov_iter_count(iter); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", + file, count, (long long) pos); + + result = 0; + if (!count) + goto out; + + mutex_lock(&inode->i_mutex); + result = nfs_sync_mapping(mapping); + if (result) + goto out_unlock; + + task_io_account_read(count); + + result = -ENOMEM; + dreq = nfs_direct_req_alloc(); + if (dreq == NULL) + goto out_unlock; + + dreq->inode = inode; + dreq->bytes_left = count; + dreq->io_start = pos; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); + goto out_release; + } + dreq->l_ctx = l_ctx; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + NFS_I(inode)->read_io += count; + result = nfs_direct_read_schedule_iovec(dreq, iter, pos); + + mutex_unlock(&inode->i_mutex); + + if (!result) { + result = nfs_direct_wait(dreq); + if (result > 0) + iocb->ki_pos = pos + result; + } + + nfs_direct_req_release(dreq); + return result; + +out_release: + nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); +out: + return result; +} + +static void +nfs_direct_write_scan_commit_list(struct inode *inode, + struct list_head *list, + struct nfs_commit_info *cinfo) +{ + spin_lock(cinfo->lock); +#ifdef CONFIG_NFS_V4_1 + if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) + NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); +#endif + nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); + spin_unlock(cinfo->lock); +} + +static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) +{ + struct nfs_pageio_descriptor desc; + struct nfs_page *req, *tmp; + LIST_HEAD(reqs); + struct nfs_commit_info cinfo; + LIST_HEAD(failed); + int i; + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); + + dreq->count = 0; + for (i = 0; i < dreq->mirror_count; i++) + dreq->mirrors[i].count = 0; + get_dreq(dreq); + + nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, + &nfs_direct_write_completion_ops); + desc.pg_dreq = dreq; + + req = nfs_list_entry(reqs.next); + nfs_direct_setup_mirroring(dreq, &desc, req); + + list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + if (!nfs_pageio_add_request(&desc, req)) { + nfs_list_remove_request(req); + nfs_list_add_request(req, &failed); + spin_lock(cinfo.lock); + dreq->flags = 0; + dreq->error = -EIO; + spin_unlock(cinfo.lock); + } + nfs_release_request(req); + } + nfs_pageio_complete(&desc); + + while (!list_empty(&failed)) { + req = nfs_list_entry(failed.next); + nfs_list_remove_request(req); + nfs_unlock_and_release_request(req); + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, dreq->inode); +} + +static void nfs_direct_commit_complete(struct nfs_commit_data *data) +{ + struct nfs_direct_req *dreq = data->dreq; + struct nfs_commit_info cinfo; + struct nfs_page *req; + int status = data->task.tk_status; + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + if (status < 0) { + dprintk("NFS: %5u commit failed with error %d.\n", + data->task.tk_pid, status); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { + dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } + + dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { + /* Note the rewrite will go through mds */ + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } else + nfs_release_request(req); + nfs_unlock_and_release_request(req); + } + + if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) + nfs_direct_write_complete(dreq, data->inode); +} + +static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) +{ + /* There is no lock to clear */ +} + +static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { + .completion = nfs_direct_commit_complete, + .error_cleanup = nfs_direct_error_cleanup, +}; + +static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) +{ + int res; + struct nfs_commit_info cinfo; + LIST_HEAD(mds_list); + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_scan_commit(dreq->inode, &mds_list, &cinfo); + res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo); + if (res < 0) /* res == -ENOMEM */ + nfs_direct_write_reschedule(dreq); +} + +static void nfs_direct_write_schedule_work(struct work_struct *work) +{ + struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work); + int flags = dreq->flags; + + dreq->flags = 0; + switch (flags) { + case NFS_ODIRECT_DO_COMMIT: + nfs_direct_commit_schedule(dreq); + break; + case NFS_ODIRECT_RESCHED_WRITES: + nfs_direct_write_reschedule(dreq); + break; + default: + nfs_direct_complete(dreq, true); + } +} + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ +} + +static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) +{ + struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_commit_info cinfo; + bool request_commit = false; + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out_put; + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + + spin_lock(&dreq->lock); + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { + dreq->flags = 0; + dreq->error = hdr->error; + } + if (dreq->error == 0) { + nfs_direct_good_bytes(dreq, hdr); + if (nfs_write_need_commit(hdr)) { + if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) + request_commit = true; + else if (dreq->flags == 0) { + nfs_direct_set_hdr_verf(dreq, hdr); + request_commit = true; + dreq->flags = NFS_ODIRECT_DO_COMMIT; + } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { + request_commit = true; + if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) + dreq->flags = + NFS_ODIRECT_RESCHED_WRITES; + } + } + } + spin_unlock(&dreq->lock); + + while (!list_empty(&hdr->pages)) { + + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + if (request_commit) { + kref_get(&req->wb_kref); + nfs_mark_request_commit(req, hdr->lseg, &cinfo, + hdr->ds_commit_idx); + } + nfs_unlock_and_release_request(req); + } + +out_put: + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, hdr->inode); + hdr->release(hdr); +} + +static void nfs_write_sync_pgio_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_unlock_and_release_request(req); + } +} + +static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { + .error_cleanup = nfs_write_sync_pgio_error, + .init_hdr = nfs_direct_pgio_init, + .completion = nfs_direct_write_completion, +}; + + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +/* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes. Write length accounting is + * handled automatically by nfs_direct_write_result(). Otherwise, if + * no requests have been sent, just return an error. + */ +static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, + struct iov_iter *iter, + loff_t pos) +{ + struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; + ssize_t result = 0; + size_t requested_bytes = 0; + size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + + nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, + &nfs_direct_write_completion_ops); + desc.pg_dreq = dreq; + get_dreq(dreq); + inode_dio_begin(inode); + + NFS_I(inode)->write_io += iov_iter_count(iter); + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + + result = iov_iter_get_pages_alloc(iter, &pagevec, + wsize, &pgbase); + if (result < 0) + break; + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + + nfs_direct_setup_mirroring(dreq, &desc, req); + + nfs_lock_request(req); + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + dreq->bytes_left -= req_len; + } + nfs_direct_release_pages(pagevec, npages); + kvfree(pagevec); + if (result < 0) + break; + } + nfs_pageio_complete(&desc); + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + inode_dio_end(inode); + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, dreq->inode); + return 0; +} + +/** + * nfs_file_direct_write - file direct write operation for NFS files + * @iocb: target I/O control block + * @iter: vector of user buffers from which to write data + * @pos: byte offset in file where writing starts + * + * We use this function for direct writes instead of calling + * generic_file_aio_write() in order to avoid taking the inode + * semaphore and updating the i_size. The NFS server will set + * the new i_size and this client must read the updated size + * back into its cache. We let the server do generic write + * parameter checking and report problems. + * + * We eliminate local atime updates, see direct read above. + * + * We avoid unnecessary page cache invalidations for normal cached + * readers of this file. + * + * Note that O_APPEND is not supported for NFS direct writes, as there + * is no atomic O_APPEND write facility in the NFS protocol. + */ +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t result = -EINVAL; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; + loff_t pos, end; + + dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", + file, iov_iter_count(iter), (long long) iocb->ki_pos); + + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, + iov_iter_count(iter)); + + pos = iocb->ki_pos; + end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; + + mutex_lock(&inode->i_mutex); + + result = nfs_sync_mapping(mapping); + if (result) + goto out_unlock; + + if (mapping->nrpages) { + result = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + if (result) + goto out_unlock; + } + + task_io_account_write(iov_iter_count(iter)); + + result = -ENOMEM; + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out_unlock; + + dreq->inode = inode; + dreq->bytes_left = iov_iter_count(iter); + dreq->io_start = pos; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); + goto out_release; + } + dreq->l_ctx = l_ctx; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); + + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + + mutex_unlock(&inode->i_mutex); + + if (!result) { + result = nfs_direct_wait(dreq); + if (result > 0) { + struct inode *inode = mapping->host; + + iocb->ki_pos = pos + result; + spin_lock(&inode->i_lock); + if (i_size_read(inode) < iocb->ki_pos) + i_size_write(inode, iocb->ki_pos); + spin_unlock(&inode->i_lock); + generic_write_sync(file, pos, result); + } + } + nfs_direct_req_release(dreq); + return result; + +out_release: + nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); + return result; +} + +/** + * nfs_init_directcache - create a slab cache for nfs_direct_req structures + * + */ +int __init nfs_init_directcache(void) +{ + nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", + sizeof(struct nfs_direct_req), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + if (nfs_direct_cachep == NULL) + return -ENOMEM; + + return 0; +} + +/** + * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures + * + */ +void nfs_destroy_directcache(void) +{ + kmem_cache_destroy(nfs_direct_cachep); +} diff --git a/kernel/fs/nfs/dns_resolve.c b/kernel/fs/nfs/dns_resolve.c new file mode 100644 index 000000000..d25f10fb4 --- /dev/null +++ b/kernel/fs/nfs/dns_resolve.c @@ -0,0 +1,470 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust + * + * Resolves DNS hostnames into valid ip addresses + */ + +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include +#include +#include +#include +#include "dns_resolve.h" + +ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + char *ip_addr = NULL; + int ip_len; + + ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL); + if (ip_len > 0) + ret = rpc_pton(net, ip_addr, ip_len, sa, salen); + else + ret = -ESRCH; + kfree(ip_addr); + return ret; +} + +#else + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "dns_resolve.h" +#include "cache_lib.h" +#include "netns.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_update(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; +} + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + nfs_dns_ent_update(cnew, ckey); + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = item->h.expiry_time - seconds_since_boot(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, " "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned int ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(cd->net, buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + if (get_uint(&buf, &ttl) < 0) + goto out; + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + seconds_since_boot(); + + ret = -ENOMEM; + item = nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if (!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < seconds_since_boot() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(struct net *net, char *name, + size_t namelen, struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, nn->nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +static struct cache_detail nfs_dns_resolve_template = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_request = nfs_dns_request, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_update, + .alloc = nfs_dns_ent_alloc, +}; + + +int nfs_dns_resolver_cache_init(struct net *net) +{ + int err; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net); + if (IS_ERR(nn->nfs_dns_resolve)) + return PTR_ERR(nn->nfs_dns_resolve); + + err = nfs_cache_register_net(net, nn->nfs_dns_resolve); + if (err) + goto err_reg; + return 0; + +err_reg: + cache_destroy_net(nn->nfs_dns_resolve, net); + return err; +} + +void nfs_dns_resolver_cache_destroy(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs_cache_unregister_net(net, nn->nfs_dns_resolve); + cache_destroy_net(nn->nfs_dns_resolve, net); +} + +static int nfs4_dns_net_init(struct net *net) +{ + return nfs_dns_resolver_cache_init(net); +} + +static void nfs4_dns_net_exit(struct net *net) +{ + nfs_dns_resolver_cache_destroy(net); +} + +static struct pernet_operations nfs4_dns_resolver_ops = { + .init = nfs4_dns_net_init, + .exit = nfs4_dns_net_exit, +}; + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct cache_detail *cd = nn->nfs_dns_resolve; + int ret = 0; + + if (cd == NULL) + return 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + switch (event) { + case RPC_PIPEFS_MOUNT: + ret = nfs_cache_register_sb(sb, cd); + break; + case RPC_PIPEFS_UMOUNT: + nfs_cache_unregister_sb(sb, cd); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs_dns_resolver_block = { + .notifier_call = rpc_pipefs_event, +}; + +int nfs_dns_resolver_init(void) +{ + int err; + + err = register_pernet_subsys(&nfs4_dns_resolver_ops); + if (err < 0) + goto out; + err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block); + if (err < 0) + goto out1; + return 0; +out1: + unregister_pernet_subsys(&nfs4_dns_resolver_ops); +out: + return err; +} + +void nfs_dns_resolver_destroy(void) +{ + rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); + unregister_pernet_subsys(&nfs4_dns_resolver_ops); +} +#endif diff --git a/kernel/fs/nfs/dns_resolve.h b/kernel/fs/nfs/dns_resolve.h new file mode 100644 index 000000000..2e4f596d2 --- /dev/null +++ b/kernel/fs/nfs/dns_resolve.h @@ -0,0 +1,36 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + + +#ifdef CONFIG_NFS_USE_KERNEL_DNS +static inline int nfs_dns_resolver_init(void) +{ + return 0; +} + +static inline void nfs_dns_resolver_destroy(void) +{} + +static inline int nfs_dns_resolver_cache_init(struct net *net) +{ + return 0; +} + +static inline void nfs_dns_resolver_cache_destroy(struct net *net) +{} +#else +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +extern int nfs_dns_resolver_cache_init(struct net *net); +extern void nfs_dns_resolver_cache_destroy(struct net *net); +#endif + +extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, + size_t namelen, struct sockaddr *sa, size_t salen); + +#endif diff --git a/kernel/fs/nfs/file.c b/kernel/fs/nfs/file.c new file mode 100644 index 000000000..8b8d83a52 --- /dev/null +++ b/kernel/fs/nfs/file.c @@ -0,0 +1,947 @@ +/* + * linux/fs/nfs/file.c + * + * Copyright (C) 1992 Rick Sladkey + * + * Changes Copyright (C) 1994 by Florian La Roche + * - Do not copy data too often around in the kernel. + * - In nfs_file_read the return value of kmalloc wasn't checked. + * - Put in a better version of read look-ahead buffering. Original idea + * and implementation by Wai S Kok elekokws@ee.nus.sg. + * + * Expire cache on write to a file by Wai S Kok (Oct 1994). + * + * Total rewrite of read side for new NFS buffer cache.. Linus. + * + * nfs regular file handling functions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "fscache.h" +#include "pnfs.h" + +#include "nfstrace.h" + +#define NFSDBG_FACILITY NFSDBG_FILE + +static const struct vm_operations_struct nfs_file_vm_ops; + +/* Hack for future NFS swap support */ +#ifndef IS_SWAPFILE +# define IS_SWAPFILE(inode) (0) +#endif + +int nfs_check_flags(int flags) +{ + if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(nfs_check_flags); + +/* + * Open file + */ +static int +nfs_file_open(struct inode *inode, struct file *filp) +{ + int res; + + dprintk("NFS: open file(%pD2)\n", filp); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); + res = nfs_check_flags(filp->f_flags); + if (res) + return res; + + res = nfs_open(inode, filp); + return res; +} + +int +nfs_file_release(struct inode *inode, struct file *filp) +{ + dprintk("NFS: release(%pD2)\n", filp); + + nfs_inc_stats(inode, NFSIOS_VFSRELEASE); + return nfs_release(inode, filp); +} +EXPORT_SYMBOL_GPL(nfs_file_release); + +/** + * nfs_revalidate_size - Revalidate the file size + * @inode - pointer to inode struct + * @file - pointer to struct file + * + * Revalidates the file length. This is basically a wrapper around + * nfs_revalidate_inode() that takes into account the fact that we may + * have cached writes (in which case we don't care about the server's + * idea of what the file length is), or O_DIRECT (in which case we + * shouldn't trust the cache). + */ +static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfs_have_delegated_attributes(inode)) + goto out_noreval; + + if (filp->f_flags & O_DIRECT) + goto force_reval; + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + goto force_reval; + if (nfs_attribute_timeout(inode)) + goto force_reval; +out_noreval: + return 0; +force_reval: + return __nfs_revalidate_inode(server, inode); +} + +loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence) +{ + dprintk("NFS: llseek file(%pD2, %lld, %d)\n", + filp, offset, whence); + + /* + * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate + * the cached file length + */ + if (whence != SEEK_SET && whence != SEEK_CUR) { + struct inode *inode = filp->f_mapping->host; + + int retval = nfs_revalidate_file_size(inode, filp); + if (retval < 0) + return (loff_t)retval; + } + + return generic_file_llseek(filp, offset, whence); +} +EXPORT_SYMBOL_GPL(nfs_file_llseek); + +/* + * Flush all dirty pages, and check for write errors. + */ +int +nfs_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + dprintk("NFS: flush(%pD2)\n", file); + + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + /* + * If we're holding a write delegation, then just start the i/o + * but don't wait for completion (or send a commit). + */ + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + return filemap_fdatawrite(file->f_mapping); + + /* Flush writes to the server and return any errors */ + return vfs_fsync(file, 0); +} +EXPORT_SYMBOL_GPL(nfs_file_flush); + +ssize_t +nfs_file_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t result; + + if (iocb->ki_flags & IOCB_DIRECT) + return nfs_file_direct_read(iocb, to, iocb->ki_pos); + + dprintk("NFS: read(%pD2, %zu@%lu)\n", + iocb->ki_filp, + iov_iter_count(to), (unsigned long) iocb->ki_pos); + + result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); + if (!result) { + result = generic_file_read_iter(iocb, to); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } + return result; +} +EXPORT_SYMBOL_GPL(nfs_file_read); + +ssize_t +nfs_file_splice_read(struct file *filp, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct inode *inode = file_inode(filp); + ssize_t res; + + dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", + filp, (unsigned long) count, (unsigned long long) *ppos); + + res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); + if (!res) { + res = generic_file_splice_read(filp, ppos, pipe, count, flags); + if (res > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); + } + return res; +} +EXPORT_SYMBOL_GPL(nfs_file_splice_read); + +int +nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(file); + int status; + + dprintk("NFS: mmap(%pD2)\n", file); + + /* Note: generic_file_mmap() returns ENOSYS on nommu systems + * so we call that before revalidating the mapping + */ + status = generic_file_mmap(file, vma); + if (!status) { + vma->vm_ops = &nfs_file_vm_ops; + status = nfs_revalidate_mapping(inode, file->f_mapping); + } + return status; +} +EXPORT_SYMBOL_GPL(nfs_file_mmap); + +/* + * Flush any dirty pages for this process, and check for write errors. + * The return status from this call provides a reliable indication of + * whether any write errors occurred for this process. + * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occurred, and hence cause it to + * fall back to doing a synchronous write. + */ +int +nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = file_inode(file); + int have_error, do_resend, status; + int ret = 0; + + dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); + + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); + do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_commit_inode(inode, FLUSH_SYNC); + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) { + ret = xchg(&ctx->error, 0); + if (ret) + goto out; + } + if (status < 0) { + ret = status; + goto out; + } + do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + if (do_resend) + ret = -EAGAIN; +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); + +static int +nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + int ret; + struct inode *inode = file_inode(file); + + trace_nfs_fsync_enter(inode); + + nfs_inode_dio_wait(inode); + do { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret != 0) + break; + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); + mutex_unlock(&inode->i_mutex); + /* + * If nfs_file_fsync_commit detected a server reboot, then + * resend all dirty pages that might have been covered by + * the NFS_CONTEXT_RESEND_WRITES flag + */ + start = 0; + end = LLONG_MAX; + } while (ret == -EAGAIN); + + trace_nfs_fsync_exit(inode, ret); + return ret; +} + +/* + * Decide whether a read/modify/write cycle may be more efficient + * then a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if (pnfs_ld_read_whole_page(file->f_mapping->host)) { + if (!PageUptodate(page)) + return 1; + return 0; + } + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? */ + return 1; + return 0; +} + +/* + * This does the "real" work of the write. We must allocate and lock the + * page to be sent back to the generic routine, which then copies the + * data from user space. + * + * If the writer ends up delaying the write, the writer needs to + * increment the page use counts until he is done with the page. + */ +static int nfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; + + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", + file, mapping->host->i_ino, len, (long long) pos); + +start: + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + /* + * Wait for O_DIRECT to complete + */ + nfs_inode_dio_wait(mapping->host); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + ret = nfs_flush_incompatible(file, page); + if (ret) { + unlock_page(page); + page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; + } + return ret; +} + +static int nfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct nfs_open_context *ctx = nfs_file_open_context(file); + int status; + + dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", + file, mapping->host->i_ino, len, (long long) pos); + + /* + * Zero any uninitialised parts of the page, and then mark the page + * as up to date if it turns out that we're extending the file. + */ + if (!PageUptodate(page)) { + unsigned pglen = nfs_page_length(page); + unsigned end = offset + len; + + if (pglen == 0) { + zero_user_segments(page, 0, offset, + end, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (end >= pglen) { + zero_user_segment(page, end, PAGE_CACHE_SIZE); + if (offset == 0) + SetPageUptodate(page); + } else + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + + status = nfs_updatepage(file, page, offset, copied); + + unlock_page(page); + page_cache_release(page); + + if (status < 0) + return status; + NFS_I(mapping->host)->write_io += copied; + + if (nfs_ctx_key_to_expire(ctx)) { + status = nfs_wb_all(mapping->host); + if (status < 0) + return status; + } + + return copied; +} + +/* + * Partially or wholly invalidate a page + * - Release the private state associated with a page if undergoing complete + * page invalidation + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + */ +static void nfs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) +{ + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", + page, offset, length); + + if (offset != 0 || length < PAGE_CACHE_SIZE) + return; + /* Cancel any unstarted writes on this page */ + nfs_wb_page_cancel(page_file_mapping(page)->host, page); + + nfs_fscache_invalidate_page(page, page->mapping->host); +} + +/* + * Attempt to release the private state associated with a page + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + * - Return true (may release page) or false (may not) + */ +static int nfs_release_page(struct page *page, gfp_t gfp) +{ + struct address_space *mapping = page->mapping; + + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + + /* Always try to initiate a 'commit' if relevant, but only + * wait for it if __GFP_WAIT is set. Even then, only wait 1 + * second and only if the 'bdi' is not congested. + * Waiting indefinitely can cause deadlocks when the NFS + * server is on this machine, when a new TCP connection is + * needed and in other rare cases. There is no particular + * need to wait extensively here. A short wait has the + * benefit that someone else can worry about the freezer. + */ + if (mapping) { + struct nfs_server *nfss = NFS_SERVER(mapping->host); + nfs_commit_inode(mapping->host, 0); + if ((gfp & __GFP_WAIT) && + !bdi_write_congested(&nfss->backing_dev_info)) { + wait_on_page_bit_killable_timeout(page, PG_private, + HZ); + if (PagePrivate(page)) + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } + } + /* If PagePrivate() is set, then the page is not freeable */ + if (PagePrivate(page)) + return 0; + return nfs_fscache_release_page(page, gfp); +} + +static void nfs_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct nfs_inode *nfsi; + struct address_space *mapping = page_file_mapping(page); + + if (!mapping || PageSwapCache(page)) + return; + + /* + * Check if an unstable page is currently being committed and + * if so, have the VM treat it as if the page is under writeback + * so it will not block due to pages that will shortly be freeable. + */ + nfsi = NFS_I(mapping->host); + if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + *writeback = true; + return; + } + + /* + * If PagePrivate() is set, then the page is not freeable and as the + * inode is not being committed, it's not going to be cleaned in the + * near future so treat it as dirty + */ + if (PagePrivate(page)) + *dirty = true; +} + +/* + * Attempt to clear the private state associated with a page when an error + * occurs that requires the cached contents of an inode to be written back or + * destroyed + * - Called if either PG_private or fscache is set on the page + * - Caller holds page lock + * - Return 0 if successful, -error otherwise + */ +static int nfs_launder_page(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", + inode->i_ino, (long long)page_offset(page)); + + nfs_fscache_wait_on_page_write(nfsi, page); + return nfs_wb_page(inode, page); +} + +#ifdef CONFIG_NFS_SWAP +static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + int ret; + struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + + *span = sis->pages; + + rcu_read_lock(); + ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1); + rcu_read_unlock(); + + return ret; +} + +static void nfs_swap_deactivate(struct file *file) +{ + struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + + rcu_read_lock(); + xs_swapper(rcu_dereference(clnt->cl_xprt), 0); + rcu_read_unlock(); +} +#endif + +const struct address_space_operations nfs_file_aops = { + .readpage = nfs_readpage, + .readpages = nfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = nfs_writepage, + .writepages = nfs_writepages, + .write_begin = nfs_write_begin, + .write_end = nfs_write_end, + .invalidatepage = nfs_invalidate_page, + .releasepage = nfs_release_page, + .direct_IO = nfs_direct_IO, + .migratepage = nfs_migrate_page, + .launder_page = nfs_launder_page, + .is_dirty_writeback = nfs_check_dirty_writeback, + .error_remove_page = generic_error_remove_page, +#ifdef CONFIG_NFS_SWAP + .swap_activate = nfs_swap_activate, + .swap_deactivate = nfs_swap_deactivate, +#endif +}; + +/* + * Notification that a PTE pointing to an NFS page is about to be made + * writable, implying that someone is about to modify the page through a + * shared-writable mapping + */ +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct inode *inode = file_inode(filp); + unsigned pagelen; + int ret = VM_FAULT_NOPAGE; + struct address_space *mapping; + + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", + filp, filp->f_mapping->host->i_ino, + (long long)page_offset(page)); + + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(inode), page); + + wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + + lock_page(page); + mapping = page_file_mapping(page); + if (mapping != inode->i_mapping) + goto out_unlock; + + wait_on_page_writeback(page); + + pagelen = nfs_page_length(page); + if (pagelen == 0) + goto out_unlock; + + ret = VM_FAULT_LOCKED; + if (nfs_flush_incompatible(filp, page) == 0 && + nfs_updatepage(filp, page, 0, pagelen) == 0) + goto out; + + ret = VM_FAULT_SIGBUS; +out_unlock: + unlock_page(page); +out: + return ret; +} + +static const struct vm_operations_struct nfs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = nfs_vm_page_mkwrite, +}; + +static int nfs_need_sync_write(struct file *filp, struct inode *inode) +{ + struct nfs_open_context *ctx; + + if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) + return 1; + ctx = nfs_file_open_context(filp); + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || + nfs_ctx_key_to_expire(ctx)) + return 1; + return 0; +} + +ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + unsigned long written = 0; + ssize_t result; + size_t count = iov_iter_count(from); + + result = nfs_key_timeout_notify(file, inode); + if (result) + return result; + + if (iocb->ki_flags & IOCB_DIRECT) { + result = generic_write_checks(iocb, from); + if (result <= 0) + return result; + return nfs_file_direct_write(iocb, from); + } + + dprintk("NFS: write(%pD2, %zu@%Ld)\n", + file, count, (long long) iocb->ki_pos); + + result = -EBUSY; + if (IS_SWAPFILE(inode)) + goto out_swapfile; + /* + * O_APPEND implies that we must revalidate the file length. + */ + if (iocb->ki_flags & IOCB_APPEND) { + result = nfs_revalidate_file_size(inode, file); + if (result) + goto out; + } + + result = count; + if (!count) + goto out; + + result = generic_file_write_iter(iocb, from); + if (result > 0) + written = result; + + /* Return error values for O_DSYNC and IS_SYNC() */ + if (result >= 0 && nfs_need_sync_write(file, inode)) { + int err = vfs_fsync(file, 0); + if (err < 0) + result = err; + } + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); +out: + return result; + +out_swapfile: + printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_file_write); + +static int +do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status = 0; + unsigned int saved_type = fl->fl_type; + + /* Try local locking first */ + posix_test_lock(filp, fl); + if (fl->fl_type != F_UNLCK) { + /* found a conflict */ + goto out; + } + fl->fl_type = saved_type; + + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + goto out_noconflict; + + if (is_local) + goto out_noconflict; + + status = NFS_PROTO(inode)->lock(filp, cmd, fl); +out: + return status; +out_noconflict: + fl->fl_type = F_UNLCK; + goto out; +} + +static int do_vfs_lock(struct file *file, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(file, fl); + break; + default: + BUG(); + } + return res; +} + +static int +do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + struct nfs_lock_context *l_ctx; + int status; + + /* + * Flush all pending writes before doing anything + * with locks.. + */ + vfs_fsync(filp, 0); + + l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); + if (!IS_ERR(l_ctx)) { + status = nfs_iocounter_wait(&l_ctx->io_count); + nfs_put_lock_context(l_ctx); + if (status < 0) + return status; + } + + /* NOTE: special case + * If we're signalled while cleaning up locks on process exit, we + * still need to complete the unlock. + */ + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = do_vfs_lock(filp, fl); + return status; +} + +static int +is_time_granular(struct timespec *ts) { + return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); +} + +static int +do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status; + + /* + * Flush all pending writes before doing anything + * with locks.. + */ + status = nfs_sync_mapping(filp->f_mapping); + if (status != 0) + goto out; + + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = do_vfs_lock(filp, fl); + if (status < 0) + goto out; + + /* + * Revalidate the cache if the server has time stamps granular + * enough to detect subsecond changes. Otherwise, clear the + * cache to prevent missing any changes. + * + * This makes locking act as a cache coherency point. + */ + nfs_sync_mapping(filp->f_mapping); + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { + if (is_time_granular(&NFS_SERVER(inode)->time_delta)) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + nfs_zap_caches(inode); + } +out: + return status; +} + +/* + * Lock a (portion of) a file + */ +int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_mapping->host; + int ret = -ENOLCK; + int is_local = 0; + + dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n", + filp, fl->fl_type, fl->fl_flags, + (long long)fl->fl_start, (long long)fl->fl_end); + + nfs_inc_stats(inode, NFSIOS_VFSLOCK); + + /* No mandatory locks over NFS */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) + is_local = 1; + + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { + ret = NFS_PROTO(inode)->lock_check_bounds(fl); + if (ret < 0) + goto out_err; + } + + if (IS_GETLK(cmd)) + ret = do_getlk(filp, cmd, fl, is_local); + else if (fl->fl_type == F_UNLCK) + ret = do_unlk(filp, cmd, fl, is_local); + else + ret = do_setlk(filp, cmd, fl, is_local); +out_err: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_lock); + +/* + * Lock a (portion of) a file + */ +int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_mapping->host; + int is_local = 0; + + dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n", + filp, fl->fl_type, fl->fl_flags); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + /* + * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of + * any standard. In principle we might be able to support LOCK_MAND + * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the + * NFS code is not set up for it. + */ + if (fl->fl_type & LOCK_MAND) + return -EINVAL; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) + is_local = 1; + + /* We're simulating flock() locks using posix locks on the server */ + if (fl->fl_type == F_UNLCK) + return do_unlk(filp, cmd, fl, is_local); + return do_setlk(filp, cmd, fl, is_local); +} +EXPORT_SYMBOL_GPL(nfs_flock); + +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read_iter = nfs_file_read, + .write_iter = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = iter_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = simple_nosetlease, +}; +EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/kernel/fs/nfs/filelayout/Makefile b/kernel/fs/nfs/filelayout/Makefile new file mode 100644 index 000000000..8516cdffb --- /dev/null +++ b/kernel/fs/nfs/filelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Files Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o diff --git a/kernel/fs/nfs/filelayout/filelayout.c b/kernel/fs/nfs/filelayout/filelayout.c new file mode 100644 index 000000000..a46bf6de9 --- /dev/null +++ b/kernel/fs/nfs/filelayout/filelayout.c @@ -0,0 +1,1162 @@ +/* + * Module for the pnfs nfs4 file layout driver. + * Defines all I/O and Policy interface operations, plus code + * to register itself with the pNFS client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include +#include + +#include + +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "filelayout.h" +#include "../nfs4trace.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand "); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ) + +static loff_t +filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, + loff_t offset) +{ + u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; + u64 stripe_no; + u32 rem; + + offset -= flseg->pattern_offset; + stripe_no = div_u64(offset, stripe_width); + div_u64_rem(offset, flseg->stripe_unit, &rem); + + return stripe_no * flseg->stripe_unit + rem; +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + switch (flseg->stripe_type) { + case STRIPE_SPARSE: + return offset; + + case STRIPE_DENSE: + return filelayout_get_dense_offset(flseg, offset); + } + + BUG(); +} + +static void filelayout_reset_write(struct nfs_pgio_header *hdr) +{ + struct rpc_task *task = &hdr->task; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task->tk_status = pnfs_write_done_resend_to_mds(hdr); + } +} + +static void filelayout_reset_read(struct nfs_pgio_header *hdr) +{ + struct rpc_task *task = &hdr->task; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task->tk_status = pnfs_read_done_resend_to_mds(hdr); + } +} + +static int filelayout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct inode *inode = lo->plh_inode; + struct nfs_server *mds_server = NFS_SERVER(inode); + struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); + struct nfs_client *mds_client = mds_server->nfs_client; + struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; + + if (task->tk_status >= 0) + return 0; + + switch (task->tk_status) { + /* MDS state errors */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + } + nfs4_schedule_lease_recovery(mds_client); + goto wait_on_recovery; + /* DS session errors */ + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + break; + /* Invalidate Layout errors */ + case -NFS4ERR_PNFS_NO_LAYOUT: + case -ESTALE: /* mapped NFS4ERR_STALE */ + case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ + case -EISDIR: /* mapped NFS4ERR_ISDIR */ + case -NFS4ERR_FHEXPIRED: + case -NFS4ERR_WRONG_TYPE: + dprintk("%s Invalid layout error %d\n", __func__, + task->tk_status); + /* + * Destroy layout so new i/o will get a new layout. + * Layout will not be destroyed until all current lseg + * references are put. Mark layout as invalid to resend failed + * i/o and all i/o waiting on the slot table to the MDS until + * layout is destroyed and a new valid layout is obtained. + */ + pnfs_destroy_layout(NFS_I(inode)); + rpc_wake_up(&tbl->slot_tbl_waitq); + goto reset; + /* RPC connection errors */ + case -ECONNREFUSED: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EIO: + case -ETIMEDOUT: + case -EPIPE: + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + nfs4_mark_deviceid_unavailable(devid); + pnfs_error_mark_layout_for_return(inode, lseg); + rpc_wake_up(&tbl->slot_tbl_waitq); + /* fall through */ + default: +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + } +out: + task->tk_status = 0; + return -EAGAIN; +out_bad_stateid: + task->tk_status = -EIO; + return 0; +wait_on_recovery: + rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) + rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); + goto out; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + int err; + + trace_nfs4_pnfs_read(hdr, task->tk_status); + err = filelayout_async_handle_error(task, hdr->args.context->state, + hdr->ds_clp, hdr->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + filelayout_reset_read(hdr); + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + return 0; +} + +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. + */ +static void +filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) +{ + + if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || + hdr->res.verf->committed != NFS_DATA_SYNC) + return; + + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); +} + +bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node) +{ + return filelayout_test_devid_invalid(node) || + nfs4_test_deviceid_unavailable(node); +} + +static bool +filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg); + + return filelayout_test_devid_unavailable(node); +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void filelayout_read_prepare(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } + if (filelayout_reset_to_mds(hdr->lseg)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + filelayout_reset_read(hdr); + rpc_exit(task, 0); + return; + } + hdr->pgio_done_cb = filelayout_read_done_cb; + + if (nfs41_setup_sequence(hdr->ds_clp->cl_session, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_READ) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && + task->tk_status == 0) { + nfs41_sequence_done(task, &hdr->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + hdr->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_read_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); +} + +static int filelayout_write_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + int err; + + trace_nfs4_pnfs_write(hdr, task->tk_status); + err = filelayout_async_handle_error(task, hdr->args.context->state, + hdr->ds_clp, hdr->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + filelayout_reset_write(hdr); + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + filelayout_set_layoutcommit(hdr); + return 0; +} + +static int filelayout_commit_done_cb(struct rpc_task *task, + struct nfs_commit_data *data) +{ + int err; + + trace_nfs4_pnfs_commit_ds(data, task->tk_status); + err = filelayout_async_handle_error(task, NULL, data->ds_clp, + data->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + pnfs_generic_prepare_to_resend_writes(data); + return -EAGAIN; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + if (data->verf.committed == NFS_UNSTABLE) + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + + return 0; +} + +static void filelayout_write_prepare(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } + if (filelayout_reset_to_mds(hdr->lseg)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + filelayout_reset_write(hdr); + rpc_exit(task, 0); + return; + } + if (nfs41_setup_sequence(hdr->ds_clp->cl_session, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_WRITE) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && + task->tk_status == 0) { + nfs41_sequence_done(task, &hdr->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + hdr->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_write_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); +} + +static void filelayout_commit_prepare(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, + &wdata->res.seq_res, + task); +} + +static void filelayout_commit_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + + rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); +} + +static const struct rpc_call_ops filelayout_read_call_ops = { + .rpc_call_prepare = filelayout_read_prepare, + .rpc_call_done = filelayout_read_call_done, + .rpc_count_stats = filelayout_read_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops filelayout_write_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_count_stats = filelayout_write_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops filelayout_commit_call_ops = { + .rpc_call_prepare = filelayout_commit_prepare, + .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_count_stats = filelayout_commit_count_stats, + .rpc_release = pnfs_generic_commit_release, +}; + +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_pgio_header *hdr) +{ + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + loff_t offset = hdr->args.offset; + u32 j, idx; + struct nfs_fh *fh; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, hdr->inode->i_ino, + hdr->args.pgbase, (size_t)hdr->args.count, offset); + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) + return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + + dprintk("%s USE DS: %s cl_count %d\n", __func__, + ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + + /* No multipath support. Use first DS */ + atomic_inc(&ds->ds_clp->cl_count); + hdr->ds_clp = ds->ds_clp; + hdr->ds_commit_idx = idx; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + hdr->args.fh = fh; + + hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); + hdr->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + nfs_initiate_pgio(ds_clnt, hdr, hdr->cred, + NFS_PROTO(hdr->inode), &filelayout_read_call_ops, + 0, RPC_TASK_SOFTCONN); + return PNFS_ATTEMPTED; +} + +/* Perform async writes. */ +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) +{ + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + loff_t offset = hdr->args.offset; + u32 j, idx; + struct nfs_fh *fh; + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) + return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", + __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, + offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + + hdr->pgio_done_cb = filelayout_write_done_cb; + atomic_inc(&ds->ds_clp->cl_count); + hdr->ds_clp = ds->ds_clp; + hdr->ds_commit_idx = idx; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + hdr->args.fh = fh; + hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); + + /* Perform an asynchronous write */ + nfs_initiate_pgio(ds_clnt, hdr, hdr->cred, + NFS_PROTO(hdr->inode), &filelayout_write_call_ops, + sync, RPC_TASK_SOFTCONN); + return PNFS_ATTEMPTED; +} + +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * At this point no generic layer initialization of the lseg has occurred, + * and nothing has been added to the layout_hdr cache. + * + */ +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + + dprintk("--> %s\n", __func__); + + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", + __func__); + goto out; + } + + if (fl->pattern_offset > lgr->range.offset) { + dprintk("%s pattern_offset %lld too large\n", + __func__, fl->pattern_offset); + goto out; + } + + if (!fl->stripe_unit) { + dprintk("%s Invalid stripe unit (%u)\n", + __func__, fl->stripe_unit); + goto out; + } + + /* find and reference the deviceid */ + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id, + lo->plh_lc_cred, gfp_flags); + if (d == NULL) + goto out; + + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + /* Found deviceid is unavailable */ + if (filelayout_test_devid_unavailable(&dsaddr->id_node)) + goto out_put; + + fl->dsaddr = dsaddr; + + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if ((fl->stripe_type == STRIPE_SPARSE && + fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || + (fl->stripe_type == STRIPE_DENSE && + fl->num_fh != dsaddr->stripe_count)) { + dprintk("%s num_fh %u not valid for given packing\n", + __func__, fl->num_fh); + goto out_put; + } + + status = 0; +out: + dprintk("--> %s returns %d\n", __func__, status); + return status; +out_put: + nfs4_fl_put_deviceid(dsaddr); + goto out; +} + +static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +{ + int i; + + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); + fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) +{ + filelayout_free_fh_array(fl); + kfree(fl); +} + +static int +filelayout_decode_layout(struct pnfs_layout_hdr *flo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id, + gfp_t gfp_flags) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + __be32 *p; + uint32_t nfl_util; + int i; + + dprintk("%s: set_layout_map Begin\n", __func__); + + scratch = alloc_page(gfp_flags); + if (!scratch) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), + * num_fh (4) */ + p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); + if (unlikely(!p)) + goto out_err; + + memcpy(id, p, sizeof(*id)); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + nfs4_print_deviceid(id); + + nfl_util = be32_to_cpup(p++); + if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) + fl->commit_through_mds = 1; + if (nfl_util & NFL4_UFLG_DENSE) + fl->stripe_type = STRIPE_DENSE; + else + fl->stripe_type = STRIPE_SPARSE; + fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + + fl->first_stripe_index = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &fl->pattern_offset); + fl->num_fh = be32_to_cpup(p++); + + dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", + __func__, nfl_util, fl->num_fh, fl->first_stripe_index, + fl->pattern_offset); + + /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. + * Futher checking is done in filelayout_check_layout */ + if (fl->num_fh > + max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) + goto out_err; + + if (fl->num_fh > 0) { + fl->fh_array = kcalloc(fl->num_fh, sizeof(fl->fh_array[0]), + gfp_flags); + if (!fl->fh_array) + goto out_err; + } + + for (i = 0; i < fl->num_fh; i++) { + /* Do we want to use a mempool here? */ + fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); + if (!fl->fh_array[i]) + goto out_err_free; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free; + fl->fh_array[i]->size = be32_to_cpup(p++); + if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + printk(KERN_ERR "NFS: Too big fh %d received %d\n", + i, fl->fh_array[i]->size); + goto out_err_free; + } + + p = xdr_inline_decode(&stream, fl->fh_array[i]->size); + if (unlikely(!p)) + goto out_err_free; + memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); + dprintk("DEBUG: %s: fh len %d\n", __func__, + fl->fh_array[i]->size); + } + + __free_page(scratch); + return 0; + +out_err_free: + filelayout_free_fh_array(fl); +out_err: + __free_page(scratch); + return -EIO; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + nfs4_fl_put_deviceid(fl->dsaddr); + /* This assumes a single RW lseg */ + if (lseg->pls_range.iomode == IOMODE_RW) { + struct nfs4_filelayout *flo; + + flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); + flo->commit_info.nbuckets = 0; + kfree(flo->commit_info.buckets); + flo->commit_info.buckets = NULL; + } + _filelayout_free_lseg(fl); +} + +static int +filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + gfp_t gfp_flags) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + struct pnfs_commit_bucket *buckets; + int size, i; + + if (fl->commit_through_mds) + return 0; + + size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + if (cinfo->ds->nbuckets >= size) { + /* This assumes there is only one IOMODE_RW lseg. What + * we really want to do is have a layout_hdr level + * dictionary of keys, each + * associated with a struct list_head, populated by calls + * to filelayout_write_pagelist(). + * */ + return 0; + } + + buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), + gfp_flags); + if (!buckets) + return -ENOMEM; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; + } + + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets >= size) + goto out; + for (i = 0; i < cinfo->ds->nbuckets; i++) { + list_splice(&cinfo->ds->buckets[i].written, + &buckets[i].written); + list_splice(&cinfo->ds->buckets[i].committing, + &buckets[i].committing); + buckets[i].direct_verf.committed = + cinfo->ds->buckets[i].direct_verf.committed; + buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; + buckets[i].clseg = cinfo->ds->buckets[i].clseg; + } + swap(cinfo->ds->buckets, buckets); + cinfo->ds->nbuckets = size; +out: + spin_unlock(cinfo->lock); + kfree(buckets); + return 0; +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct nfs4_filelayout_segment *fl; + int rc; + struct nfs4_deviceid id; + + dprintk("--> %s\n", __func__); + fl = kzalloc(sizeof(*fl), gfp_flags); + if (!fl) + return NULL; + + rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) { + _filelayout_free_lseg(fl); + return NULL; + } + return &fl->generic_hdr; +} + +/* + * filelayout_pg_test(). Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 p_stripe, r_stripe; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + if (p_stripe != r_stripe) + return 0; + } + + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); +} + +static void +filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_read_mds(pgio); +} + +static void +filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + struct nfs_commit_info cinfo; + int status; + + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + goto out_mds; + nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); + status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); + if (status < 0) { + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + goto out_mds; + } + return; +out_mds: + nfs_pageio_reset_write_mds(pgio); +} + +static const struct nfs_pageio_ops filelayout_pg_read_ops = { + .pg_init = filelayout_pg_init_read, + .pg_test = filelayout_pg_test, + .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static const struct nfs_pageio_ops filelayout_pg_write_ops = { + .pg_init = filelayout_pg_init_write, + .pg_test = filelayout_pg_test, + .pg_doio = pnfs_generic_pg_writepages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) +{ + if (fl->stripe_type == STRIPE_SPARSE) + return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); + else + return j; +} + +static void +filelayout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx) + +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + u32 i, j; + + if (fl->commit_through_mds) { + nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + } else { + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. + */ + j = nfs4_fl_calc_j_index(lseg, req_offset(req)); + i = select_bucket_index(fl, j); + pnfs_layout_mark_request_commit(req, lseg, cinfo, i); + } +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) + return i; + else + return nfs4_fl_calc_ds_index(lseg, i); +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + } + return flseg->fh_array[i]; +} + +static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + u32 idx; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) + goto out_err; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode); + if (IS_ERR(ds_clnt)) + goto out_err; + + dprintk("%s ino %lu, how %d cl_count %d\n", __func__, + data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->commit_done_cb = filelayout_commit_done_cb; + atomic_inc(&ds->ds_clp->cl_count); + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode), + &filelayout_commit_call_ops, how, + RPC_TASK_SOFTCONN); +out_err: + pnfs_generic_prepare_to_resend_writes(data); + pnfs_generic_commit_release(data); + return -EAGAIN; +} + +/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest + * for @page + * @cinfo - commit info for current inode + * @page - page to search for matching head request + * + * Returns a the head request if one is found, otherwise returns NULL. + */ +static struct nfs_page * +filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) +{ + struct nfs_page *freq, *t; + struct pnfs_commit_bucket *b; + int i; + + /* Linearly search the commit lists for each bucket until a matching + * request is found */ + for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + list_for_each_entry_safe(freq, t, &b->written, wb_list) { + if (freq->wb_page == page) + return freq->wb_head; + } + list_for_each_entry_safe(freq, t, &b->committing, wb_list) { + if (freq->wb_page == page) + return freq->wb_head; + } + } + + return NULL; +} + +static int +filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how, struct nfs_commit_info *cinfo) +{ + return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo, + filelayout_initiate_commit); +} + +static struct nfs4_deviceid_node * +filelayout_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + + dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags); + if (!dsaddr) + return NULL; + return &dsaddr->id_node; +} + +static void +filelayout_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); +} + +static struct pnfs_layout_hdr * +filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_filelayout *flo; + + flo = kzalloc(sizeof(*flo), gfp_flags); + return flo != NULL ? &flo->generic_hdr : NULL; +} + +static void +filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + kfree(FILELAYOUT_FROM_HDR(lo)); +} + +static struct pnfs_ds_commit_info * +filelayout_get_ds_info(struct inode *inode) +{ + struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; + + if (layout == NULL) + return NULL; + else + return &FILELAYOUT_FROM_HDR(layout)->commit_info; +} + +static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .alloc_layout_hdr = filelayout_alloc_layout_hdr, + .free_layout_hdr = filelayout_free_layout_hdr, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, + .pg_read_ops = &filelayout_pg_read_ops, + .pg_write_ops = &filelayout_pg_write_ops, + .get_ds_info = &filelayout_get_ds_info, + .mark_request_commit = filelayout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .search_commit_reqs = filelayout_search_commit_reqs, + .commit_pagelist = filelayout_commit_pagelist, + .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, + .alloc_deviceid_node = filelayout_alloc_deviceid_node, + .free_deviceid_node = filelayout_free_deviceid_node, + .sync = pnfs_nfs_generic_sync, +}; + +static int __init nfs4filelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&filelayout_type); +} + +static void __exit nfs4filelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", + __func__); + pnfs_unregister_layoutdriver(&filelayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-1"); + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff --git a/kernel/fs/nfs/filelayout/filelayout.h b/kernel/fs/nfs/filelayout/filelayout.h new file mode 100644 index 000000000..2896cb833 --- /dev/null +++ b/kernel/fs/nfs/filelayout/filelayout.h @@ -0,0 +1,117 @@ +/* + * NFSv4 file layout driver data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_NFS4FILELAYOUT_H +#define FS_NFS_NFS4FILELAYOUT_H + +#include "../pnfs.h" + +/* + * Field testing shows we need to support up to 4096 stripe indices. + * We store each index as a u8 (u32 on the wire) to keep the memory footprint + * reasonable. This in turn means we support a maximum of 256 + * RFC 5661 multipath_list4 structures. + */ +#define NFS4_PNFS_MAX_STRIPE_CNT 4096 +#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +struct nfs4_file_layout_dsaddr { + struct nfs4_deviceid_node id_node; + u32 stripe_count; + u8 *stripe_indices; + u32 ds_num; + struct nfs4_pnfs_ds *ds_list[1]; +}; + +struct nfs4_filelayout_segment { + struct pnfs_layout_segment generic_hdr; + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ + unsigned int num_fh; + struct nfs_fh **fh_array; +}; + +struct nfs4_filelayout { + struct pnfs_layout_hdr generic_hdr; + struct pnfs_ds_commit_info commit_info; +}; + +static inline struct nfs4_filelayout * +FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct nfs4_filelayout, generic_hdr); +} + +static inline struct nfs4_filelayout_segment * +FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_filelayout_segment, + generic_hdr); +} + +static inline struct nfs4_deviceid_node * +FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) +{ + return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; +} + +static inline bool +filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) +{ + return test_bit(NFS_DEVICEID_INVALID, &node->flags); +} + +extern bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node); + +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); + +u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, + u32 ds_idx); + +extern struct nfs4_file_layout_dsaddr * +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags); +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/kernel/fs/nfs/filelayout/filelayoutdev.c b/kernel/fs/nfs/filelayout/filelayoutdev.c new file mode 100644 index 000000000..4946ef40b --- /dev/null +++ b/kernel/fs/nfs/filelayout/filelayoutdev.c @@ -0,0 +1,299 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * Garth Goodson + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include +#include + +#include "../internal.h" +#include "../nfs4session.h" +#include "filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; +static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; + +void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + struct nfs4_pnfs_ds *ds; + int i; + + nfs4_print_deviceid(&dsaddr->id_node.deviceid); + + for (i = 0; i < dsaddr->ds_num; i++) { + ds = dsaddr->ds_list[i]; + if (ds != NULL) + nfs4_pnfs_ds_put(ds); + } + kfree(dsaddr->stripe_indices); + kfree_rcu(dsaddr, id_node.rcu); +} + +/* Decode opaque device data and return the result */ +struct nfs4_file_layout_dsaddr * +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) +{ + int i; + u32 cnt, num; + u8 *indexp; + __be32 *p; + u8 *stripe_indices; + u8 max_stripe_index; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct list_head dsaddrs; + struct nfs4_pnfs_ds_addr *da; + + /* set up xdr stream */ + scratch = alloc_page(gfp_flags); + if (!scratch) + goto out_err; + + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* Get the stripe count (number of stripe index) */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_scratch; + + cnt = be32_to_cpup(p); + dprintk("%s stripe count %d\n", __func__, cnt); + if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { + printk(KERN_WARNING "NFS: %s: stripe count %d greater than " + "supported maximum %d\n", __func__, + cnt, NFS4_PNFS_MAX_STRIPE_CNT); + goto out_err_free_scratch; + } + + /* read stripe indices */ + stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags); + if (!stripe_indices) + goto out_err_free_scratch; + + p = xdr_inline_decode(&stream, cnt << 2); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + indexp = &stripe_indices[0]; + max_stripe_index = 0; + for (i = 0; i < cnt; i++) { + *indexp = be32_to_cpup(p++); + max_stripe_index = max(max_stripe_index, *indexp); + indexp++; + } + + /* Check the multipath list count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + num = be32_to_cpup(p); + dprintk("%s ds_num %u\n", __func__, num); + if (num > NFS4_PNFS_MAX_MULTI_CNT) { + printk(KERN_WARNING "NFS: %s: multipath count %d greater than " + "supported maximum %d\n", __func__, + num, NFS4_PNFS_MAX_MULTI_CNT); + goto out_err_free_stripe_indices; + } + + /* validate stripe indices are all < num */ + if (max_stripe_index >= num) { + printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n", + __func__, max_stripe_index, num); + goto out_err_free_stripe_indices; + } + + dsaddr = kzalloc(sizeof(*dsaddr) + + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), + gfp_flags); + if (!dsaddr) + goto out_err_free_stripe_indices; + + dsaddr->stripe_count = cnt; + dsaddr->stripe_indices = stripe_indices; + stripe_indices = NULL; + dsaddr->ds_num = num; + nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id); + + INIT_LIST_HEAD(&dsaddrs); + + for (i = 0; i < dsaddr->ds_num; i++) { + int j; + u32 mp_count; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + + mp_count = be32_to_cpup(p); /* multipath count */ + for (j = 0; j < mp_count; j++) { + da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, + &stream, gfp_flags); + if (da) + list_add_tail(&da->da_node, &dsaddrs); + } + if (list_empty(&dsaddrs)) { + dprintk("%s: no suitable DS addresses found\n", + __func__); + goto out_err_free_deviceid; + } + + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + if (!dsaddr->ds_list[i]) + goto out_err_drain_dsaddrs; + + /* If DS was already in cache, free ds addrs */ + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + } + + __free_page(scratch); + return dsaddr; + +out_err_drain_dsaddrs: + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } +out_err_free_deviceid: + nfs4_fl_free_deviceid(dsaddr); + /* stripe_indicies was part of dsaddr */ + goto out_err_free_scratch; +out_err_free_stripe_indices: + kfree(stripe_indices); +out_err_free_scratch: + __free_page(scratch); +out_err: + dprintk("%s ERROR: returning NULL\n", __func__); + return NULL; +} + +void +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + nfs4_put_deviceid_node(&dsaddr->id_node); +} + +/* + * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit + * Then: ((res + fsi) % dsaddr->stripe_count) + */ +u32 +nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u64 tmp; + + tmp = offset - flseg->pattern_offset; + do_div(tmp, flseg->stripe_unit); + tmp += flseg->first_stripe_index; + return do_div(tmp, flseg->dsaddr->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) +{ + return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; +} + +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u32 i; + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + else + i = nfs4_fl_calc_ds_index(lseg, j); + } else + i = j; + return flseg->fh_array[i]; +} + +/* Upon return, either ds is connected, or ds is NULL */ +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; + struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; + struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); + struct nfs4_pnfs_ds *ret = ds; + struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); + + if (ds == NULL) { + printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", + __func__, ds_idx); + pnfs_generic_mark_devid_invalid(devid); + goto out; + } + smp_rmb(); + if (ds->ds_clp) + goto out_test_devid; + + nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + dataserver_retrans, 4, + s->nfs_client->cl_minorversion, + s->nfs_client->cl_rpcclient->cl_auth->au_flavor); + +out_test_devid: + if (filelayout_test_devid_unavailable(devid)) + ret = NULL; +out: + return ret; +} + +module_param(dataserver_retrans, uint, 0644); +MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " + "retries a request before it attempts further " + " recovery action."); +module_param(dataserver_timeo, uint, 0644); +MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " + "NFSv4.1 client waits for a response from a " + " data server before it retries an NFS request."); diff --git a/kernel/fs/nfs/flexfilelayout/Makefile b/kernel/fs/nfs/flexfilelayout/Makefile new file mode 100644 index 000000000..1d2c9f6bb --- /dev/null +++ b/kernel/fs/nfs/flexfilelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Flexfile Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o +nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o diff --git a/kernel/fs/nfs/flexfilelayout/flexfilelayout.c b/kernel/fs/nfs/flexfilelayout/flexfilelayout.c new file mode 100644 index 000000000..7d05089e5 --- /dev/null +++ b/kernel/fs/nfs/flexfilelayout/flexfilelayout.c @@ -0,0 +1,1535 @@ +/* + * Module for pnfs flexfile layout driver. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tao Peng + */ + +#include +#include +#include + +#include + +#include "flexfilelayout.h" +#include "../nfs4session.h" +#include "../nfs4idmap.h" +#include "../internal.h" +#include "../delegation.h" +#include "../nfs4trace.h" +#include "../iostat.h" +#include "../nfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) + +static struct pnfs_layout_hdr * +ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_flexfile_layout *ffl; + + ffl = kzalloc(sizeof(*ffl), gfp_flags); + if (ffl) { + INIT_LIST_HEAD(&ffl->error_list); + return &ffl->generic_hdr; + } else + return NULL; +} + +static void +ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct nfs4_ff_layout_ds_err *err, *n; + + list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list, + list) { + list_del(&err->list); + kfree(err); + } + kfree(FF_LAYOUT_FROM_HDR(lo)); +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); + if (unlikely(p == NULL)) + return -ENOBUFS; + memcpy(stateid, p, NFS4_STATEID_SIZE); + dprintk("%s: stateid id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); + return 0; +} + +static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE); + if (unlikely(!p)) + return -ENOBUFS; + memcpy(devid, p, NFS4_DEVICEID4_SIZE); + nfs4_print_deviceid(devid); + return 0; +} + +static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -ENOBUFS; + fh->size = be32_to_cpup(p++); + if (fh->size > sizeof(struct nfs_fh)) { + printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n", + fh->size); + return -EOVERFLOW; + } + /* fh.data */ + p = xdr_inline_decode(xdr, fh->size); + if (unlikely(!p)) + return -ENOBUFS; + memcpy(&fh->data, p, fh->size); + dprintk("%s: fh len %d\n", __func__, fh->size); + + return 0; +} + +/* + * Currently only stringified uids and gids are accepted. + * I.e., kerberos is not supported to the DSes, so no pricipals. + * + * That means that one common function will suffice, but when + * principals are added, this should be split to accomodate + * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid(). + */ +static int +decode_name(struct xdr_stream *xdr, u32 *id) +{ + __be32 *p; + int len; + + /* opaque_length(4)*/ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -ENOBUFS; + len = be32_to_cpup(p++); + if (len < 0) + return -EINVAL; + + dprintk("%s: len %u\n", __func__, len); + + /* opaque body */ + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return -ENOBUFS; + + if (!nfs_map_string_to_numeric((char *)p, len, id)) + return -EINVAL; + + return 0; +} + +static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) +{ + int i; + + if (fls->mirror_array) { + for (i = 0; i < fls->mirror_array_cnt; i++) { + /* normally mirror_ds is freed in + * .free_deviceid_node but we still do it here + * for .alloc_lseg error path */ + if (fls->mirror_array[i]) { + kfree(fls->mirror_array[i]->fh_versions); + nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); + kfree(fls->mirror_array[i]); + } + } + kfree(fls->mirror_array); + fls->mirror_array = NULL; + } +} + +static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr) +{ + int ret = 0; + + dprintk("--> %s\n", __func__); + + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", + __func__); + ret = -EINVAL; + } + + dprintk("--> %s returns %d\n", __func__, ret); + return ret; +} + +static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) +{ + if (fls) { + ff_layout_free_mirror_array(fls); + kfree(fls); + } +} + +static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) +{ + struct nfs4_ff_layout_mirror *tmp; + int i, j; + + for (i = 0; i < fls->mirror_array_cnt - 1; i++) { + for (j = i + 1; j < fls->mirror_array_cnt; j++) + if (fls->mirror_array[i]->efficiency < + fls->mirror_array[j]->efficiency) { + tmp = fls->mirror_array[i]; + fls->mirror_array[i] = fls->mirror_array[j]; + fls->mirror_array[j] = tmp; + } + } +} + +static struct pnfs_layout_segment * +ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct pnfs_layout_segment *ret; + struct nfs4_ff_layout_segment *fls = NULL; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + u64 stripe_unit; + u32 mirror_array_cnt; + __be32 *p; + int i, rc; + + dprintk("--> %s\n", __func__); + scratch = alloc_page(gfp_flags); + if (!scratch) + return ERR_PTR(-ENOMEM); + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, + lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* stripe unit and mirror_array_cnt */ + rc = -EIO; + p = xdr_inline_decode(&stream, 8 + 4); + if (!p) + goto out_err_free; + + p = xdr_decode_hyper(p, &stripe_unit); + mirror_array_cnt = be32_to_cpup(p++); + dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__, + stripe_unit, mirror_array_cnt); + + if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT || + mirror_array_cnt == 0) + goto out_err_free; + + rc = -ENOMEM; + fls = kzalloc(sizeof(*fls), gfp_flags); + if (!fls) + goto out_err_free; + + fls->mirror_array_cnt = mirror_array_cnt; + fls->stripe_unit = stripe_unit; + fls->mirror_array = kcalloc(fls->mirror_array_cnt, + sizeof(fls->mirror_array[0]), gfp_flags); + if (fls->mirror_array == NULL) + goto out_err_free; + + for (i = 0; i < fls->mirror_array_cnt; i++) { + struct nfs4_deviceid devid; + struct nfs4_deviceid_node *idnode; + u32 ds_count; + u32 fh_count; + int j; + + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + ds_count = be32_to_cpup(p); + + /* FIXME: allow for striping? */ + if (ds_count != 1) + goto out_err_free; + + fls->mirror_array[i] = + kzalloc(sizeof(struct nfs4_ff_layout_mirror), + gfp_flags); + if (fls->mirror_array[i] == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + spin_lock_init(&fls->mirror_array[i]->lock); + fls->mirror_array[i]->ds_count = ds_count; + + /* deviceid */ + rc = decode_deviceid(&stream, &devid); + if (rc) + goto out_err_free; + + idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), + &devid, lh->plh_lc_cred, + gfp_flags); + /* + * upon success, mirror_ds is allocated by previous + * getdeviceinfo, or newly by .alloc_deviceid_node + * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure + */ + if (idnode) + fls->mirror_array[i]->mirror_ds = + FF_LAYOUT_MIRROR_DS(idnode); + else + goto out_err_free; + + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fls->mirror_array[i]->efficiency = be32_to_cpup(p); + + /* stateid */ + rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); + if (rc) + goto out_err_free; + + /* fh */ + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); + + fls->mirror_array[i]->fh_versions = + kzalloc(fh_count * sizeof(struct nfs_fh), + gfp_flags); + if (fls->mirror_array[i]->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &fls->mirror_array[i]->fh_versions[j]); + if (rc) + goto out_err_free; + } + + fls->mirror_array[i]->fh_versions_cnt = fh_count; + + /* user */ + rc = decode_name(&stream, &fls->mirror_array[i]->uid); + if (rc) + goto out_err_free; + + /* group */ + rc = decode_name(&stream, &fls->mirror_array[i]->gid); + if (rc) + goto out_err_free; + + dprintk("%s: uid %d gid %d\n", __func__, + fls->mirror_array[i]->uid, + fls->mirror_array[i]->gid); + } + + ff_layout_sort_mirrors(fls); + rc = ff_layout_check_layout(lgr); + if (rc) + goto out_err_free; + + ret = &fls->generic_hdr; + dprintk("<-- %s (success)\n", __func__); +out_free_page: + __free_page(scratch); + return ret; +out_err_free: + _ff_layout_free_lseg(fls); + ret = ERR_PTR(rc); + dprintk("<-- %s (%d)\n", __func__, rc); + goto out_free_page; +} + +static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + return true; + + return false; +} + +static void +ff_layout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + int i; + + dprintk("--> %s\n", __func__); + + for (i = 0; i < fls->mirror_array_cnt; i++) { + if (fls->mirror_array[i]) { + nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); + fls->mirror_array[i]->mirror_ds = NULL; + if (fls->mirror_array[i]->cred) { + put_rpccred(fls->mirror_array[i]->cred); + fls->mirror_array[i]->cred = NULL; + } + } + } + + if (lseg->pls_range.iomode == IOMODE_RW) { + struct nfs4_flexfile_layout *ffl; + struct inode *inode; + + ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout); + inode = ffl->generic_hdr.plh_inode; + spin_lock(&inode->i_lock); + if (!ff_layout_has_rw_segments(lseg->pls_layout)) { + ffl->commit_info.nbuckets = 0; + kfree(ffl->commit_info.buckets); + ffl->commit_info.buckets = NULL; + } + spin_unlock(&inode->i_lock); + } + _ff_layout_free_lseg(fls); +} + +/* Return 1 until we have multiple lsegs support */ +static int +ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) +{ + return 1; +} + +static int +ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + gfp_t gfp_flags) +{ + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + struct pnfs_commit_bucket *buckets; + int size; + + if (cinfo->ds->nbuckets != 0) { + /* This assumes there is only one RW lseg per file. + * To support multiple lseg per file, we need to + * change struct pnfs_commit_bucket to allow dynamic + * increasing nbuckets. + */ + return 0; + } + + size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg); + + buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), + gfp_flags); + if (!buckets) + return -ENOMEM; + else { + int i; + + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets != 0) + kfree(buckets); + else { + cinfo->ds->buckets = buckets; + cinfo->ds->nbuckets = size; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = + NFS_INVALID_STABLE_HOW; + } + } + spin_unlock(cinfo->lock); + return 0; + } +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio, + int *best_idx) +{ + struct nfs4_ff_layout_segment *fls; + struct nfs4_pnfs_ds *ds; + int idx; + + fls = FF_LAYOUT_LSEG(pgio->pg_lseg); + /* mirrors are sorted by efficiency */ + for (idx = 0; idx < fls->mirror_array_cnt; idx++) { + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false); + if (ds) { + *best_idx = idx; + return ds; + } + } + + return NULL; +} + +static void +ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + struct nfs_pgio_mirror *pgm; + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_pnfs_ds *ds; + int ds_idx; + + /* Use full layout for now */ + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + goto out_mds; + + ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx); + if (!ds) + goto out_mds; + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); + + pgio->pg_mirror_idx = ds_idx; + + /* read always uses only one mirror - idx 0 for pgio layer */ + pgm = &pgio->pg_mirrors[0]; + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + + return; +out_mds: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_read_mds(pgio); +} + +static void +ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs_pgio_mirror *pgm; + struct nfs_commit_info cinfo; + struct nfs4_pnfs_ds *ds; + int i; + int status; + + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + goto out_mds; + + nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); + status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); + if (status < 0) + goto out_mds; + + /* Use a direct mapping of ds_idx to pgio mirror_idx */ + if (WARN_ON_ONCE(pgio->pg_mirror_count != + FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) + goto out_mds; + + for (i = 0; i < pgio->pg_mirror_count; i++) { + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); + if (!ds) + goto out_mds; + pgm = &pgio->pg_mirrors[i]; + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + } + + return; + +out_mds: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_write_mds(pgio); +} + +static unsigned int +ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + if (pgio->pg_lseg) + return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); + + /* no lseg means that pnfs is not in use, so no mirroring here */ + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_write_mds(pgio); + return 1; +} + +static const struct nfs_pageio_ops ff_layout_pg_read_ops = { + .pg_init = ff_layout_pg_init_read, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static const struct nfs_pageio_ops ff_layout_pg_write_ops = { + .pg_init = ff_layout_pg_init_write, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_writepages, + .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) +{ + struct rpc_task *task = &hdr->task; + + pnfs_layoutcommit_inode(hdr->inode, false); + + if (retry_pnfs) { + dprintk("%s Reset task %5u for i/o through pNFS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + if (!hdr->dreq) { + struct nfs_open_context *ctx; + + ctx = nfs_list_entry(hdr->pages.next)->wb_context; + set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + hdr->completion_ops->error_cleanup(&hdr->pages); + } else { + nfs_direct_set_resched_writes(hdr->dreq); + /* fake unstable write to let common nfs resend pages */ + hdr->verf.committed = NFS_UNSTABLE; + hdr->good_bytes = 0; + } + return; + } + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task->tk_status = pnfs_write_done_resend_to_mds(hdr); + } +} + +static void ff_layout_reset_read(struct nfs_pgio_header *hdr) +{ + struct rpc_task *task = &hdr->task; + + pnfs_layoutcommit_inode(hdr->inode, false); + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task->tk_status = pnfs_read_done_resend_to_mds(hdr); + } +} + +static int ff_layout_async_handle_error_v4(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg, + int idx) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct inode *inode = lo->plh_inode; + struct nfs_server *mds_server = NFS_SERVER(inode); + + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs_client *mds_client = mds_server->nfs_client; + struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; + + if (task->tk_status >= 0) + return 0; + + switch (task->tk_status) { + /* MDS state errors */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state == NULL) + break; + nfs_remove_bad_delegation(state->inode); + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + } + nfs4_schedule_lease_recovery(mds_client); + goto wait_on_recovery; + /* DS session errors */ + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + break; + /* Invalidate Layout errors */ + case -NFS4ERR_PNFS_NO_LAYOUT: + case -ESTALE: /* mapped NFS4ERR_STALE */ + case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ + case -EISDIR: /* mapped NFS4ERR_ISDIR */ + case -NFS4ERR_FHEXPIRED: + case -NFS4ERR_WRONG_TYPE: + dprintk("%s Invalid layout error %d\n", __func__, + task->tk_status); + /* + * Destroy layout so new i/o will get a new layout. + * Layout will not be destroyed until all current lseg + * references are put. Mark layout as invalid to resend failed + * i/o and all i/o waiting on the slot table to the MDS until + * layout is destroyed and a new valid layout is obtained. + */ + pnfs_destroy_layout(NFS_I(inode)); + rpc_wake_up(&tbl->slot_tbl_waitq); + goto reset; + /* RPC connection errors */ + case -ECONNREFUSED: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EIO: + case -ETIMEDOUT: + case -EPIPE: + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + nfs4_mark_deviceid_unavailable(devid); + rpc_wake_up(&tbl->slot_tbl_waitq); + /* fall through */ + default: + if (ff_layout_has_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + } +out: + task->tk_status = 0; + return -EAGAIN; +out_bad_stateid: + task->tk_status = -EIO; + return 0; +wait_on_recovery: + rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) + rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); + goto out; +} + +/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ +static int ff_layout_async_handle_error_v3(struct rpc_task *task, + struct pnfs_layout_segment *lseg, + int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (task->tk_status >= 0) + return 0; + + if (task->tk_status != -EJUKEBOX) { + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + nfs4_mark_deviceid_unavailable(devid); + if (ff_layout_has_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; + else + return -NFS4ERR_RESET_TO_MDS; + } + + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return -EAGAIN; +} + +static int ff_layout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg, + int idx) +{ + int vers = clp->cl_nfs_mod->rpc_vers->number; + + switch (vers) { + case 3: + return ff_layout_async_handle_error_v3(task, lseg, idx); + case 4: + return ff_layout_async_handle_error_v4(task, state, clp, + lseg, idx); + default: + /* should never happen */ + WARN_ON_ONCE(1); + return 0; + } +} + +static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, + int idx, u64 offset, u64 length, + u32 status, int opnum) +{ + struct nfs4_ff_layout_mirror *mirror; + int err; + + mirror = FF_LAYOUT_COMP(lseg, idx); + err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, offset, length, status, opnum, + GFP_NOIO); + dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); +} + +/* NFS_PROTO call done callback routines */ + +static int ff_layout_read_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_read(hdr, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) + hdr->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && hdr->res.op_status) + ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + hdr->args.offset, hdr->args.count, + hdr->res.op_status, OP_READ); + err = ff_layout_async_handle_error(task, hdr->args.context->state, + hdr->ds_clp, hdr->lseg, + hdr->pgio_mirror_idx); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &hdr->lseg->pls_layout->plh_flags); + pnfs_read_resend_pnfs(hdr); + return task->tk_status; + case -NFS4ERR_RESET_TO_MDS: + inode = hdr->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, hdr->lseg); + ff_layout_reset_read(hdr); + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + return 0; +} + +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. + * + * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so + * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751 + * we always send layoutcommit after DS writes. + */ +static void +ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) +{ + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); +} + +static bool +ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) +{ + /* No mirroring for now */ + struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); + + return ff_layout_test_devid_unavailable(node); +} + +static int ff_layout_read_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { + rpc_exit(task, -EIO); + return -EIO; + } + if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + if (ff_layout_has_available_ds(hdr->lseg)) + pnfs_read_resend_pnfs(hdr); + else + ff_layout_reset_read(hdr); + rpc_exit(task, 0); + return -EAGAIN; + } + hdr->pgio_done_cb = ff_layout_read_done_cb; + + return 0; +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_read_prepare_common(task, hdr)) + return; + + rpc_call_start(task); +} + +static int ff_layout_setup_sequence(struct nfs_client *ds_clp, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + if (ds_clp->cl_session) + return nfs41_setup_sequence(ds_clp->cl_session, + args, + res, + task); + return nfs40_setup_sequence(ds_clp->cl_slot_tbl, + args, + res, + task); +} + +static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_read_prepare_common(task, hdr)) + return; + + if (ff_layout_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return; + + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_READ) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void ff_layout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && + task->tk_status == 0) { + nfs4_sequence_done(task, &hdr->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + hdr->mds_ops->rpc_call_done(task, hdr); +} + +static void ff_layout_read_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]); +} + +static int ff_layout_write_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_write(hdr, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) + hdr->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && hdr->res.op_status) + ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + hdr->args.offset, hdr->args.count, + hdr->res.op_status, OP_WRITE); + err = ff_layout_async_handle_error(task, hdr->args.context->state, + hdr->ds_clp, hdr->lseg, + hdr->pgio_mirror_idx); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + case -NFS4ERR_RESET_TO_MDS: + inode = hdr->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, hdr->lseg); + if (err == -NFS4ERR_RESET_TO_PNFS) { + pnfs_set_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, true); + } else { + pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, false); + } + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + if (hdr->res.verf->committed == NFS_FILE_SYNC || + hdr->res.verf->committed == NFS_DATA_SYNC) + ff_layout_set_layoutcommit(hdr); + + return 0; +} + +static int ff_layout_commit_done_cb(struct rpc_task *task, + struct nfs_commit_data *data) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_commit_ds(data, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !data->res.op_status) + data->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && data->res.op_status) + ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, + data->args.offset, data->args.count, + data->res.op_status, OP_COMMIT); + err = ff_layout_async_handle_error(task, NULL, data->ds_clp, + data->lseg, data->ds_commit_index); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + case -NFS4ERR_RESET_TO_MDS: + inode = data->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, data->lseg); + if (err == -NFS4ERR_RESET_TO_PNFS) + pnfs_set_retry_layoutget(data->lseg->pls_layout); + else + pnfs_clear_retry_layoutget(data->lseg->pls_layout); + pnfs_generic_prepare_to_resend_writes(data); + return -EAGAIN; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + if (data->verf.committed == NFS_UNSTABLE) + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + + return 0; +} + +static int ff_layout_write_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { + rpc_exit(task, -EIO); + return -EIO; + } + + if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { + bool retry_pnfs; + + retry_pnfs = ff_layout_has_available_ds(hdr->lseg); + dprintk("%s task %u reset io to %s\n", __func__, + task->tk_pid, retry_pnfs ? "pNFS" : "MDS"); + ff_layout_reset_write(hdr, retry_pnfs); + rpc_exit(task, 0); + return -EAGAIN; + } + + return 0; +} + +static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_write_prepare_common(task, hdr)) + return; + + rpc_call_start(task); +} + +static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_write_prepare_common(task, hdr)) + return; + + if (ff_layout_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return; + + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_WRITE) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void ff_layout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && + task->tk_status == 0) { + nfs4_sequence_done(task, &hdr->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + hdr->mds_ops->rpc_call_done(task, hdr); +} + +static void ff_layout_write_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); +} + +static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) +{ + rpc_call_start(task); +} + +static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + ff_layout_setup_sequence(wdata->ds_clp, + &wdata->args.seq_args, + &wdata->res.seq_res, + task); +} + +static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]); +} + +static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { + .rpc_call_prepare = ff_layout_read_prepare_v3, + .rpc_call_done = ff_layout_read_call_done, + .rpc_count_stats = ff_layout_read_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { + .rpc_call_prepare = ff_layout_read_prepare_v4, + .rpc_call_done = ff_layout_read_call_done, + .rpc_count_stats = ff_layout_read_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { + .rpc_call_prepare = ff_layout_write_prepare_v3, + .rpc_call_done = ff_layout_write_call_done, + .rpc_count_stats = ff_layout_write_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { + .rpc_call_prepare = ff_layout_write_prepare_v4, + .rpc_call_done = ff_layout_write_call_done, + .rpc_count_stats = ff_layout_write_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { + .rpc_call_prepare = ff_layout_commit_prepare_v3, + .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_count_stats = ff_layout_commit_count_stats, + .rpc_release = pnfs_generic_commit_release, +}; + +static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { + .rpc_call_prepare = ff_layout_commit_prepare_v4, + .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_count_stats = ff_layout_commit_count_stats, + .rpc_release = pnfs_generic_commit_release, +}; + +static enum pnfs_try_status +ff_layout_read_pagelist(struct nfs_pgio_header *hdr) +{ + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + loff_t offset = hdr->args.offset; + u32 idx = hdr->pgio_mirror_idx; + int vers; + struct nfs_fh *fh; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, hdr->inode->i_ino, + hdr->args.pgbase, (size_t)hdr->args.count, offset); + + ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + if (!ds) + goto out_failed; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + hdr->inode); + if (IS_ERR(ds_clnt)) + goto out_failed; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + if (IS_ERR(ds_cred)) + goto out_failed; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, + ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + + atomic_inc(&ds->ds_clp->cl_count); + hdr->ds_clp = ds->ds_clp; + fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + if (fh) + hdr->args.fh = fh; + + /* + * Note that if we ever decide to split across DSes, + * then we may need to handle dense-like offsets. + */ + hdr->args.offset = offset; + hdr->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, + vers == 3 ? &ff_layout_read_call_ops_v3 : + &ff_layout_read_call_ops_v4, + 0, RPC_TASK_SOFTCONN); + + return PNFS_ATTEMPTED; + +out_failed: + if (ff_layout_has_available_ds(lseg)) + return PNFS_TRY_AGAIN; + return PNFS_NOT_ATTEMPTED; +} + +/* Perform async writes. */ +static enum pnfs_try_status +ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) +{ + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + loff_t offset = hdr->args.offset; + int vers; + struct nfs_fh *fh; + int idx = hdr->pgio_mirror_idx; + + ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + if (!ds) + return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + if (IS_ERR(ds_cred)) + return PNFS_NOT_ATTEMPTED; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n", + __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, + offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), + vers); + + hdr->pgio_done_cb = ff_layout_write_done_cb; + atomic_inc(&ds->ds_clp->cl_count); + hdr->ds_clp = ds->ds_clp; + hdr->ds_commit_idx = idx; + fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + if (fh) + hdr->args.fh = fh; + + /* + * Note that if we ever decide to split across DSes, + * then we may need to handle dense-like offsets. + */ + hdr->args.offset = offset; + + /* Perform an asynchronous write */ + nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, + vers == 3 ? &ff_layout_write_call_ops_v3 : + &ff_layout_write_call_ops_v4, + sync, RPC_TASK_SOFTCONN); + return PNFS_ATTEMPTED; +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + return i; +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + /* FIXME: Assume that there is only one NFS version available + * for the DS. + */ + return &flseg->mirror_array[i]->fh_versions[0]; +} + +static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + u32 idx; + int vers; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + if (!ds) + goto out_err; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + data->inode); + if (IS_ERR(ds_clnt)) + goto out_err; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); + if (IS_ERR(ds_cred)) + goto out_err; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, + data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), + vers); + data->commit_done_cb = ff_layout_commit_done_cb; + data->cred = ds_cred; + atomic_inc(&ds->ds_clp->cl_count); + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, + vers == 3 ? &ff_layout_commit_call_ops_v3 : + &ff_layout_commit_call_ops_v4, + how, RPC_TASK_SOFTCONN); +out_err: + pnfs_generic_prepare_to_resend_writes(data); + pnfs_generic_commit_release(data); + return -EAGAIN; +} + +static int +ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how, struct nfs_commit_info *cinfo) +{ + return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo, + ff_layout_initiate_commit); +} + +static struct pnfs_ds_commit_info * +ff_layout_get_ds_info(struct inode *inode) +{ + struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; + + if (layout == NULL) + return NULL; + + return &FF_LAYOUT_FROM_HDR(layout)->commit_info; +} + +static void +ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, + id_node)); +} + +static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct pnfs_layout_hdr *hdr = &flo->generic_hdr; + __be32 *start; + int count = 0, ret = 0; + + start = xdr_reserve_space(xdr, 4); + if (unlikely(!start)) + return -E2BIG; + + /* This assume we always return _ALL_ layouts */ + spin_lock(&hdr->plh_inode->i_lock); + ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range); + spin_unlock(&hdr->plh_inode->i_lock); + + *start = cpu_to_be32(count); + + return ret; +} + +/* report nothing for now */ +static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (likely(p)) + *p = cpu_to_be32(0); +} + +static struct nfs4_deviceid_node * +ff_layout_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags) +{ + struct nfs4_ff_layout_ds *dsaddr; + + dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags); + if (!dsaddr) + return NULL; + return &dsaddr->id_node; +} + +static void +ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + if (ff_layout_encode_ioerr(flo, xdr, args)) + goto out; + + ff_layout_encode_iostats(flo, xdr, args); +out: + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + +static struct pnfs_layoutdriver_type flexfilelayout_type = { + .id = LAYOUT_FLEX_FILES, + .name = "LAYOUT_FLEX_FILES", + .owner = THIS_MODULE, + .alloc_layout_hdr = ff_layout_alloc_layout_hdr, + .free_layout_hdr = ff_layout_free_layout_hdr, + .alloc_lseg = ff_layout_alloc_lseg, + .free_lseg = ff_layout_free_lseg, + .pg_read_ops = &ff_layout_pg_read_ops, + .pg_write_ops = &ff_layout_pg_write_ops, + .get_ds_info = ff_layout_get_ds_info, + .free_deviceid_node = ff_layout_free_deviceid_node, + .mark_request_commit = pnfs_layout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .commit_pagelist = ff_layout_commit_pagelist, + .read_pagelist = ff_layout_read_pagelist, + .write_pagelist = ff_layout_write_pagelist, + .alloc_deviceid_node = ff_layout_alloc_deviceid_node, + .encode_layoutreturn = ff_layout_encode_layoutreturn, + .sync = pnfs_nfs_generic_sync, +}; + +static int __init nfs4flexfilelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&flexfilelayout_type); +} + +static void __exit nfs4flexfilelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", + __func__); + pnfs_unregister_layoutdriver(&flexfilelayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-4"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("The NFSv4 flexfile layout driver"); + +module_init(nfs4flexfilelayout_init); +module_exit(nfs4flexfilelayout_exit); diff --git a/kernel/fs/nfs/flexfilelayout/flexfilelayout.h b/kernel/fs/nfs/flexfilelayout/flexfilelayout.h new file mode 100644 index 000000000..070f20445 --- /dev/null +++ b/kernel/fs/nfs/flexfilelayout/flexfilelayout.h @@ -0,0 +1,155 @@ +/* + * NFSv4 flexfile layout driver data structures. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tao Peng + */ + +#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H +#define FS_NFS_NFS4FLEXFILELAYOUT_H + +#include "../pnfs.h" + +/* XXX: Let's filter out insanely large mirror count for now to avoid oom + * due to network error etc. */ +#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 + +struct nfs4_ff_ds_version { + u32 version; + u32 minor_version; + u32 rsize; + u32 wsize; + bool tightly_coupled; +}; + +/* chained in global deviceid hlist */ +struct nfs4_ff_layout_ds { + struct nfs4_deviceid_node id_node; + u32 ds_versions_cnt; + struct nfs4_ff_ds_version *ds_versions; + struct nfs4_pnfs_ds *ds; +}; + +struct nfs4_ff_layout_ds_err { + struct list_head list; /* linked in mirror error_list */ + u64 offset; + u64 length; + int status; + enum nfs_opnum4 opnum; + nfs4_stateid stateid; + struct nfs4_deviceid deviceid; +}; + +struct nfs4_ff_layout_mirror { + u32 ds_count; + u32 efficiency; + struct nfs4_ff_layout_ds *mirror_ds; + u32 fh_versions_cnt; + struct nfs_fh *fh_versions; + nfs4_stateid stateid; + struct nfs4_string user_name; + struct nfs4_string group_name; + u32 uid; + u32 gid; + struct rpc_cred *cred; + spinlock_t lock; +}; + +struct nfs4_ff_layout_segment { + struct pnfs_layout_segment generic_hdr; + u64 stripe_unit; + u32 mirror_array_cnt; + struct nfs4_ff_layout_mirror **mirror_array; +}; + +struct nfs4_flexfile_layout { + struct pnfs_layout_hdr generic_hdr; + struct pnfs_ds_commit_info commit_info; + struct list_head error_list; /* nfs4_ff_layout_ds_err */ +}; + +static inline struct nfs4_flexfile_layout * +FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct nfs4_flexfile_layout, generic_hdr); +} + +static inline struct nfs4_ff_layout_segment * +FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_ff_layout_segment, + generic_hdr); +} + +static inline struct nfs4_deviceid_node * +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +{ + if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt || + FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL || + FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL) + return NULL; + return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node; +} + +static inline struct nfs4_ff_layout_ds * +FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) +{ + return container_of(node, struct nfs4_ff_layout_ds, id_node); +} + +static inline struct nfs4_ff_layout_mirror * +FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) +{ + if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt) + return NULL; + return FF_LAYOUT_LSEG(lseg)->mirror_array[idx]; +} + +static inline u32 +FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg) +{ + return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt; +} + +static inline bool +ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) +{ + return nfs4_test_deviceid_unavailable(node); +} + +static inline int +nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version; +} + +struct nfs4_ff_layout_ds * +nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags); +void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); +void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); +int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_mirror *mirror, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + gfp_t gfp_flags); +int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, int *count, + const struct pnfs_layout_range *range); +struct nfs_fh * +nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); + +struct nfs4_pnfs_ds * +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, + bool fail_return); + +struct rpc_clnt * +nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, + u32 ds_idx, + struct nfs_client *ds_clp, + struct inode *inode); +struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, + u32 ds_idx, struct rpc_cred *mdscred); +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); +#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */ diff --git a/kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c new file mode 100644 index 000000000..77a2d026a --- /dev/null +++ b/kernel/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -0,0 +1,552 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tao Peng + */ + +#include +#include +#include +#include + +#include "../internal.h" +#include "../nfs4session.h" +#include "flexfilelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; +static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; + +void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) +{ + if (mirror_ds) + nfs4_put_deviceid_node(&mirror_ds->id_node); +} + +void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) +{ + nfs4_print_deviceid(&mirror_ds->id_node.deviceid); + nfs4_pnfs_ds_put(mirror_ds->ds); + kfree_rcu(mirror_ds, id_node.rcu); +} + +/* Decode opaque device data and construct new_ds using it */ +struct nfs4_ff_layout_ds * +nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct list_head dsaddrs; + struct nfs4_pnfs_ds_addr *da; + struct nfs4_ff_layout_ds *new_ds = NULL; + struct nfs4_ff_ds_version *ds_versions = NULL; + u32 mp_count; + u32 version_count; + __be32 *p; + int i, ret = -ENOMEM; + + /* set up xdr stream */ + scratch = alloc_page(gfp_flags); + if (!scratch) + goto out_err; + + new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags); + if (!new_ds) + goto out_scratch; + + nfs4_init_deviceid_node(&new_ds->id_node, + server, + &pdev->dev_id); + INIT_LIST_HEAD(&dsaddrs); + + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* multipath count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + mp_count = be32_to_cpup(p); + dprintk("%s: multipath ds count %d\n", __func__, mp_count); + + for (i = 0; i < mp_count; i++) { + /* multipath ds */ + da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, + &stream, gfp_flags); + if (da) + list_add_tail(&da->da_node, &dsaddrs); + } + if (list_empty(&dsaddrs)) { + dprintk("%s: no suitable DS addresses found\n", + __func__); + ret = -ENOMEDIUM; + goto out_err_drain_dsaddrs; + } + + /* version count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + version_count = be32_to_cpup(p); + dprintk("%s: version count %d\n", __func__, version_count); + + ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version), + gfp_flags); + if (!ds_versions) + goto out_scratch; + + for (i = 0; i < version_count; i++) { + /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) + + * tightly_coupled(4) */ + p = xdr_inline_decode(&stream, 20); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + ds_versions[i].version = be32_to_cpup(p++); + ds_versions[i].minor_version = be32_to_cpup(p++); + ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].tightly_coupled = be32_to_cpup(p); + + if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE) + ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE; + if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) + ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; + + if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { + dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, + i, ds_versions[i].version, + ds_versions[i].minor_version); + ret = -EPROTONOSUPPORT; + goto out_err_drain_dsaddrs; + } + + dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n", + __func__, i, ds_versions[i].version, + ds_versions[i].minor_version, + ds_versions[i].rsize, + ds_versions[i].wsize, + ds_versions[i].tightly_coupled); + } + + new_ds->ds_versions = ds_versions; + new_ds->ds_versions_cnt = version_count; + + new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + if (!new_ds->ds) + goto out_err_drain_dsaddrs; + + /* If DS was already in cache, free ds addrs */ + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + __free_page(scratch); + return new_ds; + +out_err_drain_dsaddrs: + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + kfree(ds_versions); +out_scratch: + __free_page(scratch); +out_err: + kfree(new_ds); + + dprintk("%s ERROR: returning %d\n", __func__, ret); + return NULL; +} + +static u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, + u64 offset, u64 length) +{ + u64 end; + + end = max_t(u64, end_offset(err->offset, err->length), + end_offset(offset, length)); + err->offset = min_t(u64, err->offset, offset); + err->length = end - err->offset; +} + +static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + nfs4_stateid *stateid, + struct nfs4_deviceid *deviceid) +{ + return err->status == status && err->opnum == opnum && + nfs4_stateid_match(&err->stateid, stateid) && + !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && + end_offset(err->offset, err->length) >= offset && + err->offset <= end_offset(offset, length); +} + +static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, + struct nfs4_ff_layout_ds_err *new) +{ + if (!ds_error_can_merge(old, new->offset, new->length, new->status, + new->opnum, &new->stateid, &new->deviceid)) + return false; + + extend_ds_error(old, new->offset, new->length); + return true; +} + +static bool +ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_ds_err *dserr) +{ + struct nfs4_ff_layout_ds_err *err; + + list_for_each_entry(err, &flo->error_list, list) { + if (merge_ds_error(err, dserr)) { + return true; + } + } + + list_add(&dserr->list, &flo->error_list); + return false; +} + +static bool +ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) +{ + bool found = false; + struct nfs4_ff_layout_ds_err *err; + + list_for_each_entry(err, &flo->error_list, list) { + if (ds_error_can_merge(err, offset, length, status, opnum, + stateid, deviceid)) { + found = true; + extend_ds_error(err, offset, length); + break; + } + } + + return found; +} + +int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_mirror *mirror, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + gfp_t gfp_flags) +{ + struct nfs4_ff_layout_ds_err *dserr; + bool needfree; + + if (status == 0) + return 0; + + if (mirror->mirror_ds == NULL) + return -EINVAL; + + spin_lock(&flo->generic_hdr.plh_inode->i_lock); + if (ff_layout_update_ds_error(flo, offset, length, status, opnum, + &mirror->stateid, + &mirror->mirror_ds->id_node.deviceid)) { + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + return 0; + } + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + dserr = kmalloc(sizeof(*dserr), gfp_flags); + if (!dserr) + return -ENOMEM; + + INIT_LIST_HEAD(&dserr->list); + dserr->offset = offset; + dserr->length = length; + dserr->status = status; + dserr->opnum = opnum; + nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); + memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + NFS4_DEVICEID4_SIZE); + + spin_lock(&flo->generic_hdr.plh_inode->i_lock); + needfree = ff_layout_add_ds_error_locked(flo, dserr); + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + if (needfree) + kfree(dserr); + + return 0; +} + +/* currently we only support AUTH_NONE and AUTH_SYS */ +static rpc_authflavor_t +nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror->uid == (u32)-1) + return RPC_AUTH_NULL; + return RPC_AUTH_UNIX; +} + +/* fetch cred for NFSv3 DS */ +static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, + struct nfs4_pnfs_ds *ds) +{ + if (ds->ds_clp && !mirror->cred && + mirror->mirror_ds->ds_versions[0].version == 3) { + struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth; + struct rpc_cred *cred; + struct auth_cred acred = { + .uid = make_kuid(&init_user_ns, mirror->uid), + .gid = make_kgid(&init_user_ns, mirror->gid), + }; + + /* AUTH_NULL ignores acred */ + cred = auth->au_ops->lookup_cred(auth, &acred, 0); + if (IS_ERR(cred)) { + dprintk("%s: lookup_cred failed with %ld\n", + __func__, PTR_ERR(cred)); + return PTR_ERR(cred); + } else { + mirror->cred = cred; + } + } + return 0; +} + +struct nfs_fh * +nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + struct nfs_fh *fh = NULL; + struct nfs4_deviceid_node *devid; + + if (mirror == NULL || mirror->mirror_ds == NULL || + mirror->mirror_ds->ds == NULL) { + printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n", + __func__, mirror_idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + pnfs_generic_mark_devid_invalid(devid); + } + goto out; + } + + /* FIXME: For now assume there is only 1 version available for the DS */ + fh = &mirror->fh_versions[0]; +out: + return fh; +} + +/* Upon return, either ds is connected, or ds is NULL */ +struct nfs4_pnfs_ds * +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, + bool fail_return) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + struct nfs4_pnfs_ds *ds = NULL; + struct nfs4_deviceid_node *devid; + struct inode *ino = lseg->pls_layout->plh_inode; + struct nfs_server *s = NFS_SERVER(ino); + unsigned int max_payload; + rpc_authflavor_t flavor; + + if (mirror == NULL || mirror->mirror_ds == NULL || + mirror->mirror_ds->ds == NULL) { + printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", + __func__, ds_idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + pnfs_generic_mark_devid_invalid(devid); + } + goto out; + } + + devid = &mirror->mirror_ds->id_node; + if (ff_layout_test_devid_unavailable(devid)) + goto out; + + ds = mirror->mirror_ds->ds; + /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ + smp_rmb(); + if (ds->ds_clp) + goto out; + + flavor = nfs4_ff_layout_choose_authflavor(mirror); + + /* FIXME: For now we assume the server sent only one version of NFS + * to use for the DS. + */ + nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + dataserver_retrans, + mirror->mirror_ds->ds_versions[0].version, + mirror->mirror_ds->ds_versions[0].minor_version, + flavor); + + /* connect success, check rsize/wsize limit */ + if (ds->ds_clp) { + max_payload = + nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), + NULL); + if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) + mirror->mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) + mirror->mirror_ds->ds_versions[0].wsize = max_payload; + } else { + ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, lseg->pls_range.offset, + lseg->pls_range.length, NFS4ERR_NXIO, + OP_ILLEGAL, GFP_NOIO); + if (fail_return) { + pnfs_error_mark_layout_for_return(ino, lseg); + if (ff_layout_has_available_ds(lseg)) + pnfs_set_retry_layoutget(lseg->pls_layout); + else + pnfs_clear_retry_layoutget(lseg->pls_layout); + + } else { + if (ff_layout_has_available_ds(lseg)) + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lseg->pls_layout->plh_flags); + else { + pnfs_error_mark_layout_for_return(ino, lseg); + pnfs_clear_retry_layoutget(lseg->pls_layout); + } + } + } + + if (ff_layout_update_mirror_cred(mirror, ds)) + ds = NULL; +out: + return ds; +} + +struct rpc_cred * +ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, + struct rpc_cred *mdscred) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + struct rpc_cred *cred = ERR_PTR(-EINVAL); + + if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true)) + goto out; + + if (mirror && mirror->cred) + cred = mirror->cred; + else + cred = mdscred; +out: + return cred; +} + +/** +* Find or create a DS rpc client with th MDS server rpc client auth flavor +* in the nfs_client cl_ds_clients list. +*/ +struct rpc_clnt * +nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, + struct nfs_client *ds_clp, struct inode *inode) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + + switch (mirror->mirror_ds->ds_versions[0].version) { + case 3: + /* For NFSv3 DS, flavor is set when creating DS connections */ + return ds_clp->cl_rpcclient; + case 4: + return nfs4_find_or_create_ds_client(ds_clp, inode); + default: + BUG(); + } +} + +static bool is_range_intersecting(u64 offset1, u64 length1, + u64 offset2, u64 length2) +{ + u64 end1 = end_offset(offset1, length1); + u64 end2 = end_offset(offset2, length2); + + return (end1 == NFS4_MAX_UINT64 || end1 > offset2) && + (end2 == NFS4_MAX_UINT64 || end2 > offset1); +} + +/* called with inode i_lock held */ +int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, int *count, + const struct pnfs_layout_range *range) +{ + struct nfs4_ff_layout_ds_err *err, *n; + __be32 *p; + + list_for_each_entry_safe(err, n, &flo->error_list, list) { + if (!is_range_intersecting(err->offset, err->length, + range->offset, range->length)) + continue; + /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) + * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4) + */ + p = xdr_reserve_space(xdr, + 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); + if (unlikely(!p)) + return -ENOBUFS; + p = xdr_encode_hyper(p, err->offset); + p = xdr_encode_hyper(p, err->length); + p = xdr_encode_opaque_fixed(p, &err->stateid, + NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, &err->deviceid, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(err->status); + *p++ = cpu_to_be32(err->opnum); + *count += 1; + list_del(&err->list); + dprintk("%s: offset %llu length %llu status %d op %d count %d\n", + __func__, err->offset, err->length, err->status, + err->opnum, *count); + kfree(err); + } + + return 0; +} + +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *devid; + int idx; + + for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { + mirror = FF_LAYOUT_COMP(lseg, idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + if (!ff_layout_test_devid_unavailable(devid)) + return true; + } + } + + return false; +} + +module_param(dataserver_retrans, uint, 0644); +MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " + "retries a request before it attempts further " + " recovery action."); +module_param(dataserver_timeo, uint, 0644); +MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " + "NFSv4.1 client waits for a response from a " + " data server before it retries an NFS request."); diff --git a/kernel/fs/nfs/fscache-index.c b/kernel/fs/nfs/fscache-index.c new file mode 100644 index 000000000..777b05506 --- /dev/null +++ b/kernel/fs/nfs/fscache-index.c @@ -0,0 +1,336 @@ +/* NFS FS-Cache index structure definition + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks + * the cookie for the top-level index object for NFS into here. The top-level + * index can than have other cache objects inserted into it. + */ +struct fscache_netfs nfs_fscache_netfs = { + .name = "nfs", + .version = 0, +}; + +/* + * Register NFS for caching + */ +int nfs_fscache_register(void) +{ + return fscache_register_netfs(&nfs_fscache_netfs); +} + +/* + * Unregister NFS for caching + */ +void nfs_fscache_unregister(void) +{ + fscache_unregister_netfs(&nfs_fscache_netfs); +} + +/* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + } addr[0]; +}; + +/* + * Generate a key to describe a server in the main NFS index + * - We return the length of the key, or 0 if we can't generate one + */ +static uint16_t nfs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_client *clp = cookie_netfs_data; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key *key = buffer; + uint16_t len = sizeof(struct nfs_server_key); + + memset(key, 0, len); + key->nfsversion = clp->rpc_ops->version; + key->family = clp->cl_addr.ss_family; + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key->port = sin->sin_port; + key->addr[0].ipv4_addr = sin->sin_addr; + len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->port = sin6->sin6_port; + key->addr[0].ipv6_addr = sin6->sin6_addr; + len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + len = 0; + break; + } + + return len; +} + +/* + * Define the server object for FS-Cache. This is used to describe a server + * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and + * server address parameters. + */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; + +/* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. + */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + loff_t size; + u64 change_attr; +}; + +/* + * Generate a key to describe an NFS inode in an NFS server's index + */ +static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + uint16_t nsize; + + /* use the inode's NFS filehandle as the key */ + nsize = nfsi->fh.size; + memcpy(buffer, nfsi->fh.data, nsize); + return nsize; +} + +/* + * Get certain file attributes from the netfs data + * - This function can be absent for an index + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + + *size = nfsi->vfs_inode.i_size; +} + +/* + * Get the auxiliary data from netfs data + * - This function can be absent if the index carries no state data + * - Should store the auxiliary data in the buffer + * - Should return the amount of amount stored + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct nfs_fscache_inode_auxdata auxdata; + const struct nfs_inode *nfsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->vfs_inode.i_version; + + if (bufmax > sizeof(auxdata)) + bufmax = sizeof(auxdata); + + memcpy(buffer, &auxdata, bufmax); + return bufmax; +} + +/* + * Consult the netfs about the state of an object + * - This function can be absent if the index carries no state data + * - The netfs data from the cookie being used as the target is + * presented, as is the auxiliary data + */ +static +enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->vfs_inode.i_version; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Indication from FS-Cache that the cookie is no longer cached + * - This function is called when the backing store currently caching a cookie + * is removed + * - The netfs should use this to clean up any markers indicating cached pages + * - This is mandatory for any object that may have data + */ +static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct nfs_inode *nfsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); + + for (;;) { + /* grab a bunch of pages to unmark */ + nr_pages = pagevec_lookup(&pvec, + nfsi->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Get an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + * - The read context is passed back to NFS in the event that a data read on the + * cache fails with EIO - in which case the server must be contacted to + * retrieve the data, which requires the read context for security. + */ +static void nfs_fh_get_context(void *cookie_netfs_data, void *context) +{ + get_nfs_open_context(context); +} + +/* + * Release an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + */ +static void nfs_fh_put_context(void *cookie_netfs_data, void *context) +{ + if (context) + put_nfs_open_context(context); +} + +/* + * Define the inode object for FS-Cache. This is used to describe an inode + * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for + * an inode. + * + * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime + * held in the cache auxiliary data for the data storage object with those in + * the inode struct in memory. + */ +const struct fscache_cookie_def nfs_fscache_inode_object_def = { + .name = "NFS.fh", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = nfs_fscache_inode_get_key, + .get_attr = nfs_fscache_inode_get_attr, + .get_aux = nfs_fscache_inode_get_aux, + .check_aux = nfs_fscache_inode_check_aux, + .now_uncached = nfs_fscache_inode_now_uncached, + .get_context = nfs_fh_get_context, + .put_context = nfs_fh_put_context, +}; diff --git a/kernel/fs/nfs/fscache.c b/kernel/fs/nfs/fscache.c new file mode 100644 index 000000000..d63bea8bb --- /dev/null +++ b/kernel/fs/nfs/fscache.c @@ -0,0 +1,439 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp, true); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. + * + * The default uniquifier is just an empty string, but it may be overridden + * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent + * superblock across an automount point of some nature. + */ +void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + int diff; + + if (!uniq) { + uniq = ""; + ulen = 1; + } + + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss, true); + dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->fscache = NULL; + if (!S_ISREG(inode->i_mode)) + return; + nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi, false); +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_clear_inode(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); + + fscache_relinquish_cookie(cookie, false); + nfsi->fscache = NULL; +} + +static bool nfs_fscache_can_enable(void *data) +{ + struct inode *inode = data; + + return !inode_is_open_for_write(inode); +} + +/* + * Enable or disable caching for a file that is being opened as appropriate. + * The cookie is allocated when the inode is initialised, but is not enabled at + * that time. Enablement is deferred to file-open time to avoid stat() and + * access() thrashing the cache. + * + * For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * + * We enable the cache for an inode if we open it read-only and it isn't + * currently open for writing. We disable the cache if the inode is open + * write-only. + * + * The caller uses the file struct to pin i_writecount on the inode before + * calling us when a file is opened for writing, so we can make use of that. + * + * Note that this may be invoked multiple times in parallel by parallel + * nfs_open() functions. + */ +void nfs_fscache_open_file(struct inode *inode, struct file *filp) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); + + if (!fscache_cookie_valid(cookie)) + return; + + if (inode_is_open_for_write(inode)) { + dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); + clear_bit(NFS_INO_FSCACHE, &nfsi->flags); + fscache_disable_cookie(cookie, true); + fscache_uncache_all_inode_pages(cookie, inode); + } else { + dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); + fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); + if (fscache_cookie_enabled(cookie)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + } +} +EXPORT_SYMBOL_GPL(nfs_fscache_open_file); + +/* + * Release the caching state associated with a page, if the page isn't busy + * interacting with the cache. + * - Returns true (can release page) or false (page busy). + */ +int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host); + + BUG_ON(!cookie); + dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", + cookie, page, NFS_I(page->mapping->host)); + + if (!fscache_maybe_release_page(cookie, page, gfp)) + return 0; + + nfs_inc_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED); + } + + return 1; +} + +/* + * Release the caching state associated with a page if undergoing complete page + * invalidation. + */ +void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct fscache_cookie *cookie = nfs_i_fscache(inode); + + BUG_ON(!cookie); + + dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", + cookie, page, NFS_I(inode)); + + fscache_wait_on_page_write(cookie, page); + + BUG_ON(!PageLocked(page)); + fscache_uncache_page(cookie, page); + nfs_inc_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED); +} + +/* + * Handle completion of a page being read from the cache. + * - Called in process (keventd) context. + */ +static void nfs_readpage_from_fscache_complete(struct page *page, + void *context, + int error) +{ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", + page, context, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) { + SetPageUptodate(page); + unlock_page(page); + } else { + error = nfs_readpage_async(context, page->mapping->host, page); + if (error) + unlock_page(page); + } +} + +/* + * Retrieve a page from fscache + */ +int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, struct page *page) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", + nfs_i_fscache(inode), page, page->index, page->flags, inode); + + ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), + page, + nfs_readpage_from_fscache_complete, + ctx, + GFP_KERNEL); + + switch (ret) { + case 0: /* read BIO submitted (page in fscache) */ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache: BIO submitted\n"); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); + return ret; + + case -ENOBUFS: /* inode not in cache */ + case -ENODATA: /* page not in cache */ + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); + dfprintk(FSCACHE, + "NFS: readpage_from_fscache %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); + } + return ret; +} + +/* + * Retrieve a set of pages from fscache + */ +int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + unsigned npages = *nr_pages; + int ret; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + nfs_i_fscache(inode), npages, inode); + + ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode), + mapping, pages, nr_pages, + nfs_readpage_from_fscache_complete, + ctx, + mapping_gfp_mask(mapping)); + if (*nr_pages < npages) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, + npages); + if (*nr_pages > 0) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, + *nr_pages); + + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: submitted\n"); + + return ret; + + case -ENOBUFS: /* some pages aren't cached and can't be */ + case -ENODATA: /* some pages aren't cached */ + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: ret %d\n", ret); + } + + return ret; +} + +/* + * Store a newly fetched page in fscache + * - PG_fscache must be set on the page + */ +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", + nfs_i_fscache(inode), page, page->index, page->flags, sync); + + ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL); + dfprintk(FSCACHE, + "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", + page, page->index, page->flags, ret); + + if (ret != 0) { + fscache_uncache_page(nfs_i_fscache(inode), page); + nfs_inc_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); + } else { + nfs_inc_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK); + } +} diff --git a/kernel/fs/nfs/fscache.h b/kernel/fs/nfs/fscache.h new file mode 100644 index 000000000..d7fe3e799 --- /dev/null +++ b/kernel/fs/nfs/fscache.h @@ -0,0 +1,229 @@ +/* NFS filesystem cache interface definitions + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _NFS_FSCACHE_H +#define _NFS_FSCACHE_H + +#include +#include +#include +#include + +#ifdef CONFIG_NFS_FSCACHE + +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ + unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + +/* + * fscache-index.c + */ +extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; +extern const struct fscache_cookie_def nfs_fscache_inode_object_def; + +extern int nfs_fscache_register(void); +extern void nfs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + +extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); +extern void nfs_fscache_release_super_cookie(struct super_block *); + +extern void nfs_fscache_init_inode(struct inode *); +extern void nfs_fscache_clear_inode(struct inode *); +extern void nfs_fscache_open_file(struct inode *, struct file *); + +extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); +extern int nfs_fscache_release_page(struct page *, gfp_t); + +extern int __nfs_readpage_from_fscache(struct nfs_open_context *, + struct inode *, struct page *); +extern int __nfs_readpages_from_fscache(struct nfs_open_context *, + struct inode *, struct address_space *, + struct list_head *, unsigned *); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); + +/* + * wait for a page to complete writing to the cache + */ +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) +{ + if (PageFsCache(page)) + fscache_wait_on_page_write(nfsi->fscache, page); +} + +/* + * release the caching state associated with a page if undergoing complete page + * invalidation + */ +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __nfs_fscache_invalidate_page(page, inode); +} + +/* + * Retrieve a page from an inode data storage object. + */ +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpage_from_fscache(ctx, inode, page); + return -ENOBUFS; +} + +/* + * Retrieve a set of pages from an inode data storage object. + */ +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +/* + * Store a page newly fetched from the server in an inode data storage object + * in the cache. + */ +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, + int sync) +{ + if (PageFsCache(page)) + __nfs_readpage_to_fscache(inode, page, sync); +} + +/* + * Invalidate the contents of fscache for this inode. This will not sleep. + */ +static inline void nfs_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(NFS_I(inode)->fscache); +} + +/* + * Wait for an object to finish being invalidated. + */ +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) +{ + fscache_wait_on_invalidate(NFS_I(inode)->fscache); +} + +/* + * indicate the client caching state as readable text + */ +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) + return "yes"; + return "no "; +} + +#else /* CONFIG_NFS_FSCACHE */ +static inline int nfs_fscache_register(void) { return 0; } +static inline void nfs_fscache_unregister(void) {} + +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + +static inline void nfs_fscache_init_inode(struct inode *inode) {} +static inline void nfs_fscache_clear_inode(struct inode *inode) {} +static inline void nfs_fscache_open_file(struct inode *inode, + struct file *filp) {} + +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* True: may release page */ +} +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) {} + +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, int sync) {} + + +static inline void nfs_fscache_invalidate(struct inode *inode) {} +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {} + +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + return "no "; +} + +#endif /* CONFIG_NFS_FSCACHE */ +#endif /* _NFS_FSCACHE_H */ diff --git a/kernel/fs/nfs/getroot.c b/kernel/fs/nfs/getroot.c new file mode 100644 index 000000000..a608ffd28 --- /dev/null +++ b/kernel/fs/nfs/getroot.c @@ -0,0 +1,133 @@ +/* getroot.c: get the root dentry for an NFS mount + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +/* + * Set the superblock root dentry. + * Note that this function frees the inode in case of error. + */ +static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode) +{ + /* The mntroot acts as the dummy root dentry for this superblock */ + if (sb->s_root == NULL) { + sb->s_root = d_make_root(inode); + if (sb->s_root == NULL) + return -ENOMEM; + ihold(inode); + /* + * Ensure that this dentry is invisible to d_find_alias(). + * Otherwise, it may be spliced into the tree by + * d_splice_alias if a parent directory from the same + * filesystem gets mounted at a later time. + * This again causes shrink_dcache_for_umount_subtree() to + * Oops, since the test for IS_ROOT() will fail. + */ + spin_lock(&d_inode(sb->s_root)->i_lock); + spin_lock(&sb->s_root->d_lock); + hlist_del_init(&sb->s_root->d_u.d_alias); + spin_unlock(&sb->s_root->d_lock); + spin_unlock(&d_inode(sb->s_root)->i_lock); + } + return 0; +} + +/* + * get an NFS2/NFS3 root dentry from the root filehandle + */ +struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fsinfo fsinfo; + struct dentry *ret; + struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); + int error; + + if (!name) + return ERR_PTR(-ENOMEM); + + /* get the actual root for this mount */ + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); + } + + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + ret = ERR_PTR(error); + goto out; + } + + inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + ret = ERR_CAST(inode); + goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + ret = d_obtain_root(inode); + if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); + goto out; + } + + security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !ret->d_fsdata && + !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); +out: + kfree(name); + nfs_free_fattr(fsinfo.fattr); + return ret; +} diff --git a/kernel/fs/nfs/inode.c b/kernel/fs/nfs/inode.c new file mode 100644 index 000000000..f734562c6 --- /dev/null +++ b/kernel/fs/nfs/inode.c @@ -0,0 +1,2066 @@ +/* + * linux/fs/nfs/inode.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs inode and superblock handling functions + * + * Modularised by Alan Cox , while hacking some + * experimental NFS changes. Modularisation taken straight from SYS5 fs. + * + * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. + * J.S.Peatfield@damtp.cam.ac.uk + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" +#include "nfs.h" +#include "netns.h" + +#include "nfstrace.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 + +/* Default is to see 64-bit inode numbers */ +static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; + +static void nfs_invalidate_inode(struct inode *); +static int nfs_update_inode(struct inode *, struct nfs_fattr *); + +static struct kmem_cache * nfs_inode_cachep; + +static inline unsigned long +nfs_fattr_to_ino_t(struct nfs_fattr *fattr) +{ + return nfs_fileid_to_ino_t(fattr->fileid); +} + +/** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(struct wait_bit_key *key) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + freezable_schedule_unsafe(); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); + +/** + * nfs_compat_user_ino64 - returns the user-visible inode number + * @fileid: 64-bit fileid + * + * This function returns a 32-bit inode number if the boot parameter + * nfs.enable_ino64 is zero. + */ +u64 nfs_compat_user_ino64(u64 fileid) +{ +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif + + if (enable_ino64) + return fileid; + ino = fileid; + if (sizeof(ino) < sizeof(fileid)) + ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; + return ino; +} + +int nfs_drop_inode(struct inode *inode) +{ + return NFS_STALE(inode) || generic_drop_inode(inode); +} +EXPORT_SYMBOL_GPL(nfs_drop_inode); + +void nfs_clear_inode(struct inode *inode) +{ + /* + * The following should never happen... + */ + WARN_ON_ONCE(nfs_have_writebacks(inode)); + WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); + nfs_zap_acl_cache(inode); + nfs_access_zap_cache(inode); + nfs_fscache_clear_inode(inode); +} +EXPORT_SYMBOL_GPL(nfs_clear_inode); + +void nfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + nfs_clear_inode(inode); +} + +int nfs_sync_inode(struct inode *inode) +{ + nfs_inode_dio_wait(inode); + return nfs_wb_all(inode); +} +EXPORT_SYMBOL_GPL(nfs_sync_inode); + +/** + * nfs_sync_mapping - helper to flush all mmapped dirty data to disk + */ +int nfs_sync_mapping(struct address_space *mapping) +{ + int ret = 0; + + if (mapping->nrpages != 0) { + unmap_mapping_range(mapping, 0, 0, 0); + ret = nfs_wb_all(mapping->host); + } + return ret; +} + +static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; + nfsi->cache_validity |= flags; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); +} + +/* + * Invalidate the local caches + */ +static void nfs_zap_caches_locked(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int mode = inode->i_mode; + + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = jiffies; + + memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE); + } else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE); + nfs_zap_label_cache_locked(nfsi); +} + +void nfs_zap_caches(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_zap_caches_locked(inode); + spin_unlock(&inode->i_lock); +} + +void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) +{ + if (mapping->nrpages != 0) { + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); + spin_unlock(&inode->i_lock); + } +} + +void nfs_zap_acl_cache(struct inode *inode) +{ + void (*clear_acl_cache)(struct inode *); + + clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache; + if (clear_acl_cache != NULL) + clear_acl_cache(inode); + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); + +void nfs_invalidate_atime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_invalidate_atime); + +/* + * Invalidate, but do not unhash, the inode. + * NB: must be called with inode->i_lock held! + */ +static void nfs_invalidate_inode(struct inode *inode) +{ + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_zap_caches_locked(inode); +} + +struct nfs_find_desc { + struct nfs_fh *fh; + struct nfs_fattr *fattr; +}; + +/* + * In NFSv3 we can have 64bit inode numbers. In order to support + * this, and re-exported directories (also seen in NFSv2) + * we are forced to allow 2 different inodes to have the same + * i_ino. + */ +static int +nfs_find_actor(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fh *fh = desc->fh; + struct nfs_fattr *fattr = desc->fattr; + + if (NFS_FILEID(inode) != fattr->fileid) + return 0; + if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) + return 0; + if (nfs_compare_fh(NFS_FH(inode), fh)) + return 0; + if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + return 1; +} + +static int +nfs_init_locked(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fattr *fattr = desc->fattr; + + set_nfs_fileid(inode, fattr->fileid); + nfs_copy_fh(NFS_FH(inode), desc->fh); + return 0; +} + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static void nfs_clear_label_invalid(struct inode *inode) +{ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; + spin_unlock(&inode->i_lock); +} + +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + int error; + + if (label == NULL) + return; + + if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { + error = security_inode_notifysecctx(inode, label->label, + label->len); + if (error) + printk(KERN_ERR "%s() %s %d " + "security_inode_notifysecctx() %d\n", + __func__, + (char *)label->label, + label->len, error); + nfs_clear_label_invalid(inode); + } +} + +struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) +{ + struct nfs4_label *label = NULL; + int minor_version = server->nfs_client->cl_minorversion; + + if (minor_version < 2) + return label; + + if (!(server->caps & NFS_CAP_SECURITY_LABEL)) + return label; + + label = kzalloc(sizeof(struct nfs4_label), flags); + if (label == NULL) + return ERR_PTR(-ENOMEM); + + label->label = kzalloc(NFS4_MAXLABELLEN, flags); + if (label->label == NULL) { + kfree(label); + return ERR_PTR(-ENOMEM); + } + label->len = NFS4_MAXLABELLEN; + + return label; +} +EXPORT_SYMBOL_GPL(nfs4_label_alloc); +#else +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ +} +#endif +EXPORT_SYMBOL_GPL(nfs_setsecurity); + +/* + * This is our front-end to iget that looks up inodes by file handle + * instead of inode number. + */ +struct inode * +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label) +{ + struct nfs_find_desc desc = { + .fh = fh, + .fattr = fattr + }; + struct inode *inode = ERR_PTR(-ENOENT); + unsigned long hash; + + nfs_attr_check_mountpoint(sb, fattr); + + if (nfs_attr_use_mounted_on_fileid(fattr)) + fattr->fileid = fattr->mounted_on_fileid; + else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) + goto out_no_inode; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) + goto out_no_inode; + + hash = nfs_fattr_to_ino_t(fattr); + + inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc); + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + goto out_no_inode; + } + + if (inode->i_state & I_NEW) { + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long now = jiffies; + + /* We set i_ino for the few things that still rely on it, + * such as stat(2) */ + inode->i_ino = hash; + + /* We can't support update_atime(), since the server will reset it */ + inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; + inode->i_fop = &nfs_dir_operations; + inode->i_data.a_ops = &nfs_dir_aops; + /* Deal with crossing mountpoints */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || + fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + inode->i_op = &nfs_referral_inode_operations; + else + inode->i_op = &nfs_mountpoint_inode_operations; + inode->i_fop = NULL; + inode->i_flags |= S_AUTOMOUNT; + } + } else if (S_ISLNK(inode->i_mode)) + inode->i_op = &nfs_symlink_inode_operations; + else + init_special_inode(inode, inode->i_mode, fattr->rdev); + + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + inode->i_version = 0; + inode->i_size = 0; + clear_nlink(inode); + inode->i_uid = make_kuid(&init_user_ns, -2); + inode->i_gid = make_kgid(&init_user_ns, -2); + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + nfsi->write_io = 0; + nfsi->read_io = 0; + + nfsi->read_cache_jiffies = fattr->time_start; + nfsi->attr_gencount = fattr->gencount; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) + inode->i_version = fattr->change_attr; + else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + set_nlink(inode, fattr->nlink); + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + /* + * report the blocks in 512byte units + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } + + nfs_setsecurity(inode, fattr, label); + + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->access_cache = RB_ROOT; + + nfs_fscache_init_inode(inode); + + unlock_new_inode(inode); + } else + nfs_refresh_inode(inode, fattr); + dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), + nfs_display_fhandle_hash(fh), + atomic_read(&inode->i_count)); + +out: + return inode; + +out_no_inode: + dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_fhget); + +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) + +int +nfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct nfs_fattr *fattr; + int error = -ENOMEM; + + nfs_inc_stats(inode, NFSIOS_VFSSETATTR); + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + + if (attr->ia_valid & ATTR_SIZE) { + loff_t i_size; + + BUG_ON(!S_ISREG(inode->i_mode)); + + i_size = i_size_read(inode); + if (attr->ia_size == i_size) + attr->ia_valid &= ~ATTR_SIZE; + else if (attr->ia_size < i_size && IS_SWAPFILE(inode)) + return -ETXTBSY; + } + + /* Optimization: if the end result is no change, don't RPC */ + attr->ia_valid &= NFS_VALID_ATTRS; + if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) + return 0; + + trace_nfs_setattr_enter(inode); + + /* Write all dirty data */ + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + /* + * Return any delegations if we're going to change ACLs + */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + NFS_PROTO(inode)->return_delegation(inode); + error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); + if (error == 0) + error = nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: + trace_nfs_setattr_exit(inode, error); + return error; +} +EXPORT_SYMBOL_GPL(nfs_setattr); + +/** + * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This is a copy of the common vmtruncate, but with the locking + * corrected to take into account the fact that NFS requires + * inode->i_size to be updated under the inode->i_lock. + * Note: must be called with inode->i_lock held! + */ +static int nfs_vmtruncate(struct inode * inode, loff_t offset) +{ + int err; + + err = inode_newsize_ok(inode, offset); + if (err) + goto out; + + i_size_write(inode, offset); + /* Optimisation */ + if (offset == 0) + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + + spin_unlock(&inode->i_lock); + truncate_pagecache(inode, offset); + spin_lock(&inode->i_lock); +out: + return err; +} + +/** + * nfs_setattr_update_inode - Update inode metadata after a setattr call. + * @inode: pointer to struct inode + * @attr: pointer to struct iattr + * + * Note: we do this in the *proc.c in order to ensure that + * it works for things like exclusive creates too. + */ +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, + struct nfs_fattr *fattr) +{ + /* Barrier: bump the attribute generation count. */ + nfs_fattr_set_barrier(fattr); + + spin_lock(&inode->i_lock); + NFS_I(inode)->attr_gencount = fattr->gencount; + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + if ((attr->ia_valid & ATTR_MODE) != 0) { + int mode = attr->ia_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + } + if ((attr->ia_valid & ATTR_UID) != 0) + inode->i_uid = attr->ia_uid; + if ((attr->ia_valid & ATTR_GID) != 0) + inode->i_gid = attr->ia_gid; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL); + } + if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); + nfs_vmtruncate(inode, attr->ia_size); + } + nfs_update_inode(inode, fattr); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); + +static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +{ + struct dentry *parent; + + parent = dget_parent(dentry); + nfs_force_use_readdirplus(d_inode(parent)); + dput(parent); +} + +static bool nfs_need_revalidate_inode(struct inode *inode) +{ + if (NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) + return true; + if (nfs_attribute_cache_expired(inode)) + return true; + return false; +} + +int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + int err = 0; + + trace_nfs_getattr_enter(inode); + /* Flush out writes to the server in order to update c/mtime. */ + if (S_ISREG(inode->i_mode)) { + mutex_lock(&inode->i_mutex); + err = nfs_sync_inode(inode); + mutex_unlock(&inode->i_mutex); + if (err) + goto out; + } + + /* + * We may force a getattr if the user cares about atime. + * + * Note that we only have to check the vfsmount flags here: + * - NFS always sets S_NOATIME by so checking it would give a + * bogus result + * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * no point in checking those. + */ + if ((mnt->mnt_flags & MNT_NOATIME) || + ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + need_atime = 0; + + if (need_atime || nfs_need_revalidate_inode(inode)) { + struct nfs_server *server = NFS_SERVER(inode); + + if (server->caps & NFS_CAP_READDIRPLUS) + nfs_request_parent_use_readdirplus(dentry); + err = __nfs_revalidate_inode(server, inode); + } + if (!err) { + generic_fillattr(inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + } +out: + trace_nfs_getattr_exit(inode, err); + return err; +} +EXPORT_SYMBOL_GPL(nfs_getattr); + +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner.l_owner = current->files; + l_ctx->lockowner.l_pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); + nfs_iocounter_init(&l_ctx->io_count); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *head = &ctx->lock_context; + struct nfs_lock_context *pos = head; + + do { + if (pos->lockowner.l_owner != current->files) + continue; + if (pos->lockowner.l_pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = d_inode(ctx->dentry); + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} +EXPORT_SYMBOL_GPL(nfs_get_lock_context); + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = d_inode(ctx->dentry); + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} +EXPORT_SYMBOL_GPL(nfs_put_lock_context); + +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = d_inode(ctx->dentry); + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} +EXPORT_SYMBOL_GPL(nfs_close_context); + +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) +{ + struct nfs_open_context *ctx; + struct rpc_cred *cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + put_rpccred(cred); + return ERR_PTR(-ENOMEM); + } + nfs_sb_active(dentry->d_sb); + ctx->dentry = dget(dentry); + ctx->cred = cred; + ctx->state = NULL; + ctx->mode = f_mode; + ctx->flags = 0; + ctx->error = 0; + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); + ctx->mdsthreshold = NULL; + return ctx; +} +EXPORT_SYMBOL_GPL(alloc_nfs_open_context); + +struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) +{ + if (ctx != NULL) + atomic_inc(&ctx->lock_context.count); + return ctx; +} +EXPORT_SYMBOL_GPL(get_nfs_open_context); + +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode = d_inode(ctx->dentry); + struct super_block *sb = ctx->dentry->d_sb; + + if (!list_empty(&ctx->list)) { + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; + list_del(&ctx->list); + spin_unlock(&inode->i_lock); + } else if (!atomic_dec_and_test(&ctx->lock_context.count)) + return; + if (inode != NULL) + NFS_PROTO(inode)->close_context(ctx, is_sync); + if (ctx->cred != NULL) + put_rpccred(ctx->cred); + dput(ctx->dentry); + nfs_sb_deactive(sb); + kfree(ctx->mdsthreshold); + kfree(ctx); +} + +void put_nfs_open_context(struct nfs_open_context *ctx) +{ + __put_nfs_open_context(ctx, 0); +} +EXPORT_SYMBOL_GPL(put_nfs_open_context); + +/* + * Ensure that mmap has a recent RPC credential for use when writing out + * shared pages + */ +void nfs_inode_attach_open_context(struct nfs_open_context *ctx) +{ + struct inode *inode = d_inode(ctx->dentry); + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); + +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +{ + filp->private_data = get_nfs_open_context(ctx); + if (list_empty(&ctx->list)) + nfs_inode_attach_open_context(ctx); +} +EXPORT_SYMBOL_GPL(nfs_file_set_open_context); + +/* + * Given an inode, search for an open context with the desired characteristics + */ +struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *pos, *ctx = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(pos, &nfsi->open_files, list) { + if (cred != NULL && pos->cred != cred) + continue; + if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) + continue; + ctx = get_nfs_open_context(pos); + break; + } + spin_unlock(&inode->i_lock); + return ctx; +} + +static void nfs_file_clear_open_context(struct file *filp) +{ + struct nfs_open_context *ctx = nfs_file_open_context(filp); + + if (ctx) { + struct inode *inode = d_inode(ctx->dentry); + + filp->private_data = NULL; + spin_lock(&inode->i_lock); + list_move_tail(&ctx->list, &NFS_I(inode)->open_files); + spin_unlock(&inode->i_lock); + __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); + } +} + +/* + * These allocate and release file read/write context information. + */ +int nfs_open(struct inode *inode, struct file *filp) +{ + struct nfs_open_context *ctx; + + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + nfs_file_set_open_context(filp, ctx); + put_nfs_open_context(ctx); + nfs_fscache_open_file(inode, filp); + return 0; +} + +int nfs_release(struct inode *inode, struct file *filp) +{ + nfs_file_clear_open_context(filp); + return 0; +} + +/* + * This function is called whenever some part of NFS notices that + * the cached attributes have to be refreshed. + */ +int +__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +{ + int status = -ESTALE; + struct nfs4_label *label = NULL; + struct nfs_fattr *fattr = NULL; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", + inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); + + trace_nfs_revalidate_inode_enter(inode); + + if (is_bad_inode(inode)) + goto out; + if (NFS_STALE(inode)) + goto out; + + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); + + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(label)) { + status = PTR_ERR(label); + goto out; + } + + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); + if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), status); + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + goto err_out; + } + + status = nfs_refresh_inode(inode, fattr); + if (status) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), status); + goto err_out; + } + + if (nfsi->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + + nfs_setsecurity(inode, fattr, label); + + dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode)); + +err_out: + nfs4_label_free(label); +out: + nfs_free_fattr(fattr); + trace_nfs_revalidate_inode_exit(inode, status); + return status; +} + +int nfs_attribute_timeout(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); +} + +int nfs_attribute_cache_expired(struct inode *inode) +{ + if (nfs_have_delegated_attributes(inode)) + return 0; + return nfs_attribute_timeout(inode); +} + +/** + * nfs_revalidate_inode - Revalidate the inode attributes + * @server - pointer to nfs_server struct + * @inode - pointer to inode struct + * + * Updates inode attribute information by retrieving the data from the server. + */ +int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +{ + if (!nfs_need_revalidate_inode(inode)) + return NFS_STALE(inode) ? -ESTALE : 0; + return __nfs_revalidate_inode(server, inode); +} +EXPORT_SYMBOL_GPL(nfs_revalidate_inode); + +int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode) +{ + if (!(NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) + && !nfs_attribute_cache_expired(inode)) + return NFS_STALE(inode) ? -ESTALE : 0; + return -ECHILD; +} + +static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int ret; + + if (mapping->nrpages != 0) { + if (S_ISREG(inode->i_mode)) { + unmap_mapping_range(mapping, 0, 0, 0); + ret = nfs_sync_mapping(mapping); + if (ret < 0) + return ret; + } + ret = invalidate_inode_pages2(mapping); + if (ret < 0) + return ret; + } + if (S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + spin_unlock(&inode->i_lock); + } + nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + nfs_fscache_wait_on_invalidate(inode); + + dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode)); + return 0; +} + +static bool nfs_mapping_need_revalidate_inode(struct inode *inode) +{ + if (nfs_have_delegated_attributes(inode)) + return false; + return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) + || nfs_attribute_timeout(inode) + || NFS_STALE(inode); +} + +/** + * __nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + * @may_lock - take inode->i_mutex? + */ +static int __nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping, + bool may_lock) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + /* swapfiles are not supposed to be shared. */ + if (IS_SWAPFILE(inode)) + goto out; + + if (nfs_mapping_need_revalidate_inode(inode)) { + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + goto out; + } + + /* + * We must clear NFS_INO_INVALID_DATA first to ensure that + * invalidations that come in while we're shooting down the mappings + * are respected. But, that leaves a race window where one revalidator + * can clear the flag, and then another checks it before the mapping + * gets invalidated. Fix that by serializing access to this part of + * the function. + * + * At the same time, we need to allow other tasks to see whether we + * might be in the middle of invalidating the pages, so we only set + * the bit lock here if it looks like we're going to be doing that. + */ + for (;;) { + ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + goto out; + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + break; + spin_unlock(&inode->i_lock); + goto out; + } + + set_bit(NFS_INO_INVALIDATING, bitlock); + smp_wmb(); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + trace_nfs_invalidate_mapping_enter(inode); + if (may_lock) { + mutex_lock(&inode->i_mutex); + ret = nfs_invalidate_mapping(inode, mapping); + mutex_unlock(&inode->i_mutex); + } else + ret = nfs_invalidate_mapping(inode, mapping); + trace_nfs_invalidate_mapping_exit(inode, ret); + + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_INVALIDATING); +out: + return ret; +} + +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, false); +} + +/** + * nfs_revalidate_mapping_protected - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + * + * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex + * while invalidating the mapping. + */ +int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, true); +} + +static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long ret = 0; + + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && inode->i_version == fattr->pre_change_attr) { + inode->i_version = fattr->change_attr; + if (S_ISDIR(inode->i_mode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); + ret |= NFS_INO_INVALID_ATTR; + } + /* If we have atomic WCC data, we may update some attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + ret |= NFS_INO_INVALID_ATTR; + } + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + if (S_ISDIR(inode->i_mode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); + ret |= NFS_INO_INVALID_ATTR; + } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->nrequests == 0) { + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + ret |= NFS_INO_INVALID_ATTR; + } + + return ret; +} + +/** + * nfs_check_inode_attributes - verify consistency of the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Verifies the attribute cache. If we have just changed the attributes, + * so that fattr carries weak cache consistency data, then it may + * also update the ctime/mtime/change_attribute. + */ +static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_size, new_isize; + unsigned long invalid = 0; + + + if (nfs_have_delegated_attributes(inode)) + return 0; + /* Has the inode gone and changed behind our back? */ + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + return -EIO; + + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + inode->i_version != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->nrequests == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } + + /* Have any file permissions changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + + /* Has the link count changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + invalid |= NFS_INO_INVALID_ATIME; + + if (invalid != 0) + nfs_set_cache_invalid(inode, invalid); + + nfsi->read_cache_jiffies = fattr->time_start; + return 0; +} + +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static atomic_long_t nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + return atomic_long_read(&nfs_attr_generation_counter); +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + return atomic_long_inc_return(&nfs_attr_generation_counter); +} +EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter); + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + fattr->owner_name = NULL; + fattr->group_name = NULL; +} +EXPORT_SYMBOL_GPL(nfs_fattr_init); + +/** + * nfs_fattr_set_barrier + * @fattr: attributes + * + * Used to set a barrier after an attribute was updated. This + * barrier ensures that older attributes from RPC calls that may + * have raced with our update cannot clobber these new values. + * Note that you are still responsible for ensuring that other + * operations which change the attribute on the server do not + * collide. + */ +void nfs_fattr_set_barrier(struct nfs_fattr *fattr) +{ + fattr->gencount = nfs_inc_attr_generation_counter(); +} + +struct nfs_fattr *nfs_alloc_fattr(void) +{ + struct nfs_fattr *fattr; + + fattr = kmalloc(sizeof(*fattr), GFP_NOFS); + if (fattr != NULL) + nfs_fattr_init(fattr); + return fattr; +} +EXPORT_SYMBOL_GPL(nfs_alloc_fattr); + +struct nfs_fh *nfs_alloc_fhandle(void) +{ + struct nfs_fh *fh; + + fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + if (fh != NULL) + fh->size = 0; + return fh; +} +EXPORT_SYMBOL_GPL(nfs_alloc_fhandle); + +#ifdef NFS_DEBUG +/* + * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle + * in the same way that wireshark does + * + * @fh: file handle + * + * For debugging only. + */ +u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) +{ + /* wireshark uses 32-bit AUTODIN crc and does a bitwise + * not on the result */ + return nfs_fhandle_hash(fh); +} +EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); + +/* + * _nfs_display_fhandle - display an NFS file handle on the console + * + * @fh: file handle to display + * @caption: display caption + * + * For debugging only. + */ +void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) +{ + unsigned short i; + + if (fh == NULL || fh->size == 0) { + printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh); + return; + } + + printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n", + caption, fh, fh->size, _nfs_display_fhandle_hash(fh)); + for (i = 0; i < fh->size; i += 16) { + __be32 *pos = (__be32 *)&fh->data[i]; + + switch ((fh->size - i - 1) >> 2) { + case 0: + printk(KERN_DEFAULT " %08x\n", + be32_to_cpup(pos)); + break; + case 1: + printk(KERN_DEFAULT " %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1)); + break; + case 2: + printk(KERN_DEFAULT " %08x %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1), + be32_to_cpup(pos + 2)); + break; + default: + printk(KERN_DEFAULT " %08x %08x %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1), + be32_to_cpup(pos + 2), be32_to_cpup(pos + 3)); + } + } +} +EXPORT_SYMBOL_GPL(_nfs_display_fhandle); +#endif + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +/* + * Don't trust the change_attribute, mtime, ctime or size if + * a pnfs LAYOUTCOMMIT is outstanding + */ +static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, + struct nfs_fattr *fattr) +{ + if (pnfs_layoutcommit_outstanding(inode)) + fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | + NFS_ATTR_FATTR_MTIME | + NFS_ATTR_FATTR_CTIME | + NFS_ATTR_FATTR_SIZE); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + int ret; + + trace_nfs_refresh_inode_enter(inode); + + nfs_inode_attrs_handle_layoutcommit(inode, fattr); + + if (nfs_inode_attrs_need_update(inode, fattr)) + ret = nfs_update_inode(inode, fattr); + else + ret = nfs_check_inode_attributes(inode, fattr); + + trace_nfs_refresh_inode_exit(inode, ret); + return ret; +} + +/** + * nfs_refresh_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Check that an RPC call that returned attributes has not overlapped with + * other recent updates of the inode metadata, then decide whether it is + * safe to do a full update of the inode attributes, or whether just to + * call nfs_check_inode_attributes. + */ +int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + spin_lock(&inode->i_lock); + status = nfs_refresh_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + + return status; +} +EXPORT_SYMBOL_GPL(nfs_refresh_inode); + +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, invalid); + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + +/** + * nfs_post_op_update_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. + * + * NB: if the server didn't return any post op attributes, this + * function will force the retrieval of attributes before the next + * NFS request. Thus it should be used only for operations that + * are expected to change one or more attributes, to avoid + * unnecessary NFS requests and trips through nfs_update_inode(). + */ +int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + + return status; +} +EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); + +/** + * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. + */ +int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); + goto out_noforce; + } + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { + fattr->pre_change_attr = inode->i_version; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { + memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { + memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { + fattr->pre_size = i_size_read(inode); + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; + } +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. + */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); + status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; +} +EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); + + +static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + bool ret1 = true, ret2 = true; + + if (fattr->valid & NFS_ATTR_FATTR_FILEID) + ret1 = (nfsi->fileid == fattr->fileid); + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + ret2 = (nfsi->fileid == fattr->mounted_on_fileid); + return ret1 || ret2; +} + +/* + * Many nfs protocol calls return the new file attributes after + * an operation. Here we update the inode to reflect the state + * of the server's inode. + * + * This is a bit tricky because we have to make sure all dirty pages + * have been sent off to the server before calling invalidate_inode_pages. + * To make sure no other process adds more write requests while we try + * our best to flush them, we make them sleep during the attribute refresh. + * + * A very similar scenario holds for the dir cache. + */ +static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_server *server; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_isize, new_isize; + unsigned long invalid = 0; + unsigned long now = jiffies; + unsigned long save_cache_validity; + + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", + __func__, inode->i_sb->s_id, inode->i_ino, + nfs_display_fhandle_hash(NFS_FH(inode)), + atomic_read(&inode->i_count), fattr->valid); + + if (!nfs_fileid_valid(nfsi, fattr)) { + printk(KERN_ERR "NFS: server %s error: fileid changed\n" + "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, + inode->i_sb->s_id, (long long)nfsi->fileid, + (long long)fattr->fileid); + goto out_err; + } + + /* + * Make sure the inode's type hasn't changed. + */ + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + /* + * Big trouble! The inode has become a different object. + */ + printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", + __func__, inode->i_ino, inode->i_mode, fattr->mode); + goto out_err; + } + + server = NFS_SERVER(inode); + /* Update the fsid? */ + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && + !nfs_fsid_equal(&server->fsid, &fattr->fsid) && + !IS_AUTOMOUNT(inode)) + server->fsid = fattr->fsid; + + /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; + + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED + | NFS_INO_REVAL_PAGECACHE); + + /* Do atomic weak cache consistency updates */ + invalid |= nfs_wcc_update_inode(inode, fattr); + + /* More cache consistency checks */ + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (inode->i_version != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + inode->i_version = fattr->change_attr; + } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + nfsi->cache_validity |= save_cache_validity; + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + } else if (server->caps & NFS_CAP_MTIME) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + } else if (server->caps & NFS_CAP_CTIME) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); + + /* Check if our cached file size is stale */ + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? */ + if ((nfsi->nrequests == 0) || new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid &= ~NFS_INO_REVAL_PAGECACHE; + } + dprintk("NFS: isize change on server for file %s/%ld " + "(%Ld to %Ld)\n", + inode->i_sb->s_id, + inode->i_ino, + (long long)cur_isize, + (long long)new_isize); + } + } else + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + + + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + umode_t newmode = inode->i_mode & S_IFMT; + newmode |= fattr->mode & S_IALLUGO; + inode->i_mode = newmode; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + } + } else if (server->caps & NFS_CAP_MODE) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (!uid_eq(inode->i_uid, fattr->uid)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } else if (server->caps & NFS_CAP_OWNER) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (!gid_eq(inode->i_gid, fattr->gid)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + set_nlink(inode, fattr->nlink); + } + } else if (server->caps & NFS_CAP_NLINK) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + /* + * report the blocks in 512byte units + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + + /* Update attrtimeo value if we're out of the unstable period */ + if (invalid & NFS_INO_INVALID_ATTR) { + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + /* Set barrier to be more recent than all outstanding updates */ + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); + } else { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) + nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + } + /* Set the barrier to be more recent than this fattr */ + if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) + nfsi->attr_gencount = fattr->gencount; + } + invalid &= ~NFS_INO_INVALID_ATTR; + /* Don't invalidate the data if we were to blame */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + invalid &= ~NFS_INO_INVALID_DATA; + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || + (save_cache_validity & NFS_INO_REVAL_FORCED)) + nfs_set_cache_invalid(inode, invalid); + + return 0; + out_err: + /* + * No need to worry about unhashing the dentry, as the + * lookup validation will know that the inode is bad. + * (But we fall through to invalidate the caches.) + */ + nfs_invalidate_inode(inode); + return -ESTALE; +} + +struct inode *nfs_alloc_inode(struct super_block *sb) +{ + struct nfs_inode *nfsi; + nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + if (!nfsi) + return NULL; + nfsi->flags = 0UL; + nfsi->cache_validity = 0UL; +#if IS_ENABLED(CONFIG_NFS_V4) + nfsi->nfs4_acl = NULL; +#endif /* CONFIG_NFS_V4 */ + return &nfsi->vfs_inode; +} +EXPORT_SYMBOL_GPL(nfs_alloc_inode); + +static void nfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); +} + +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} +EXPORT_SYMBOL_GPL(nfs_destroy_inode); + +static inline void nfs4_init_once(struct nfs_inode *nfsi) +{ +#if IS_ENABLED(CONFIG_NFS_V4) + INIT_LIST_HEAD(&nfsi->open_states); + nfsi->delegation = NULL; + init_rwsem(&nfsi->rwsem); + nfsi->layout = NULL; +#endif +} + +static void init_once(void *foo) +{ + struct nfs_inode *nfsi = (struct nfs_inode *) foo; + + inode_init_once(&nfsi->vfs_inode); + INIT_LIST_HEAD(&nfsi->open_files); + INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); + INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); + INIT_LIST_HEAD(&nfsi->commit_info.list); + nfsi->nrequests = 0; + nfsi->commit_info.ncommit = 0; + atomic_set(&nfsi->commit_info.rpcs_out, 0); + atomic_set(&nfsi->silly_count, 1); + INIT_HLIST_HEAD(&nfsi->silly_list); + init_waitqueue_head(&nfsi->waitqueue); + nfs4_init_once(nfsi); +} + +static int __init nfs_init_inodecache(void) +{ + nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", + sizeof(struct nfs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (nfs_inode_cachep == NULL) + return -ENOMEM; + + return 0; +} + +static void nfs_destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(nfs_inode_cachep); +} + +struct workqueue_struct *nfsiod_workqueue; +EXPORT_SYMBOL_GPL(nfsiod_workqueue); + +/* + * start up the nfsiod workqueue + */ +static int nfsiod_start(void) +{ + struct workqueue_struct *wq; + dprintk("RPC: creating workqueue nfsiod\n"); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); + if (wq == NULL) + return -ENOMEM; + nfsiod_workqueue = wq; + return 0; +} + +/* + * Destroy the nfsiod workqueue + */ +static void nfsiod_stop(void) +{ + struct workqueue_struct *wq; + + wq = nfsiod_workqueue; + if (wq == NULL) + return; + nfsiod_workqueue = NULL; + destroy_workqueue(wq); +} + +int nfs_net_id; +EXPORT_SYMBOL_GPL(nfs_net_id); + +static int nfs_net_init(struct net *net) +{ + nfs_clients_init(net); + return nfs_fs_proc_net_init(net); +} + +static void nfs_net_exit(struct net *net) +{ + nfs_fs_proc_net_exit(net); + nfs_cleanup_cb_ident_idr(net); +} + +static struct pernet_operations nfs_net_ops = { + .init = nfs_net_init, + .exit = nfs_net_exit, + .id = &nfs_net_id, + .size = sizeof(struct nfs_net), +}; + +/* + * Initialize NFS + */ +static int __init init_nfs_fs(void) +{ + int err; + + err = register_pernet_subsys(&nfs_net_ops); + if (err < 0) + goto out9; + + err = nfs_fscache_register(); + if (err < 0) + goto out8; + + err = nfsiod_start(); + if (err) + goto out7; + + err = nfs_fs_proc_init(); + if (err) + goto out6; + + err = nfs_init_nfspagecache(); + if (err) + goto out5; + + err = nfs_init_inodecache(); + if (err) + goto out4; + + err = nfs_init_readpagecache(); + if (err) + goto out3; + + err = nfs_init_writepagecache(); + if (err) + goto out2; + + err = nfs_init_directcache(); + if (err) + goto out1; + +#ifdef CONFIG_PROC_FS + rpc_proc_register(&init_net, &nfs_rpcstat); +#endif + if ((err = register_nfs_fs()) != 0) + goto out0; + + return 0; +out0: +#ifdef CONFIG_PROC_FS + rpc_proc_unregister(&init_net, "nfs"); +#endif + nfs_destroy_directcache(); +out1: + nfs_destroy_writepagecache(); +out2: + nfs_destroy_readpagecache(); +out3: + nfs_destroy_inodecache(); +out4: + nfs_destroy_nfspagecache(); +out5: + nfs_fs_proc_exit(); +out6: + nfsiod_stop(); +out7: + nfs_fscache_unregister(); +out8: + unregister_pernet_subsys(&nfs_net_ops); +out9: + return err; +} + +static void __exit exit_nfs_fs(void) +{ + nfs_destroy_directcache(); + nfs_destroy_writepagecache(); + nfs_destroy_readpagecache(); + nfs_destroy_inodecache(); + nfs_destroy_nfspagecache(); + nfs_fscache_unregister(); + unregister_pernet_subsys(&nfs_net_ops); +#ifdef CONFIG_PROC_FS + rpc_proc_unregister(&init_net, "nfs"); +#endif + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +} + +/* Not quite true; I just maintain it */ +MODULE_AUTHOR("Olaf Kirch "); +MODULE_LICENSE("GPL"); +module_param(enable_ino64, bool, 0644); + +module_init(init_nfs_fs) +module_exit(exit_nfs_fs) diff --git a/kernel/fs/nfs/internal.h b/kernel/fs/nfs/internal.h new file mode 100644 index 000000000..9e6475bc5 --- /dev/null +++ b/kernel/fs/nfs/internal.h @@ -0,0 +1,683 @@ +/* + * NFS internal definitions + */ + +#include "nfs4_fs.h" +#include +#include +#include +#include + +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + +struct nfs_string; + +/* Maximum number of readahead requests + * FIXME: this should really be a sysctl so that users may tune it to suit + * their needs. People that do NFS over a slow network, might for + * instance want to reduce it to something closer to 1 for improved + * interactive response. + */ +#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) + +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) +{ + if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) + fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; +} + +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) +{ + if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || + (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && + ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) + return 0; + return 1; +} + +struct nfs_clone_mount { + const struct super_block *sb; + const struct dentry *dentry; + struct nfs_fh *fh; + struct nfs_fattr *fattr; + char *hostname; + char *mnt_path; + struct sockaddr *addr; + size_t addrlen; + rpc_authflavor_t authflavor; +}; + +/* + * Note: RFC 1813 doesn't limit the number of auth flavors that + * a server can return, so make something up. + */ +#define NFS_MAX_SECFLAVORS (12) + +/* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* + * Maximum number of pages that readdir can use for creating + * a vmapped array of pages. + */ +#define NFS_MAX_READDIR_PAGES 8 + +struct nfs_client_initdata { + unsigned long init_flags; + const char *hostname; + const struct sockaddr *addr; + size_t addrlen; + struct nfs_subversion *nfs_mod; + int proto; + u32 minorversion; + struct net *net; +}; + +/* + * In-kernel mount arguments + */ +struct nfs_parsed_mount_data { + int flags; + unsigned int rsize, wsize; + unsigned int timeo, retrans; + unsigned int acregmin, acregmax, + acdirmin, acdirmax; + unsigned int namlen; + unsigned int options; + unsigned int bsize; + struct nfs_auth_info auth_info; + rpc_authflavor_t selected_flavor; + char *client_address; + unsigned int version; + unsigned int minorversion; + char *fscache_uniq; + bool need_mount; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + u32 version; + int port; + unsigned short protocol; + } mount_server; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + char *export_path; + int port; + unsigned short protocol; + } nfs_server; + + struct security_mnt_opts lsm_opts; + struct net *net; +}; + +/* mount_clnt.c */ +struct nfs_mount_request { + struct sockaddr *sap; + size_t salen; + char *hostname; + char *dirpath; + u32 version; + unsigned short protocol; + struct nfs_fh *fh; + int noresvport; + unsigned int *auth_flav_len; + rpc_authflavor_t *auth_flavs; + struct net *net; +}; + +struct nfs_mount_info { + void (*fill_super)(struct super_block *, struct nfs_mount_info *); + int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); + struct nfs_parsed_mount_data *parsed; + struct nfs_clone_mount *cloned; + struct nfs_fh *mntfh; +}; + +extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); + +/* client.c */ +extern const struct rpc_program nfs_program; +extern void nfs_clients_init(struct net *net); +extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); +int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, + const struct rpc_timeout *, const char *, + rpc_authflavor_t); +int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); +void nfs_server_insert_lists(struct nfs_server *); +void nfs_server_remove_lists(struct nfs_server *); +void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); +int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, + rpc_authflavor_t); +struct nfs_server *nfs_alloc_server(void); +void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); + +extern void nfs_cleanup_cb_ident_idr(struct net *); +extern void nfs_put_client(struct nfs_client *); +extern void nfs_free_client(struct nfs_client *); +extern struct nfs_client *nfs4_find_client_ident(struct net *, int); +extern struct nfs_client * +nfs4_find_client_sessionid(struct net *, const struct sockaddr *, + struct nfs4_sessionid *, u32); +extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, + struct nfs_subversion *); +extern struct nfs_server *nfs4_create_server( + struct nfs_mount_info *, + struct nfs_subversion *); +extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, + struct nfs_fh *); +extern int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen, + struct net *net); +extern void nfs_free_server(struct nfs_server *server); +extern struct nfs_server *nfs_clone_server(struct nfs_server *, + struct nfs_fh *, + struct nfs_fattr *, + rpc_authflavor_t); +extern int nfs_wait_client_init_complete(const struct nfs_client *clp); +extern void nfs_mark_client_ready(struct nfs_client *clp, int state); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto, + unsigned int ds_timeo, + unsigned int ds_retrans, + u32 minor_version, + rpc_authflavor_t au_flavor); +extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, + struct inode *); +extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, + unsigned int ds_retrans, rpc_authflavor_t au_flavor); +#ifdef CONFIG_PROC_FS +extern int __init nfs_fs_proc_init(void); +extern void nfs_fs_proc_exit(void); +extern int nfs_fs_proc_net_init(struct net *net); +extern void nfs_fs_proc_net_exit(struct net *net); +#else +static inline int nfs_fs_proc_net_init(struct net *net) +{ + return 0; +} +static inline void nfs_fs_proc_net_exit(struct net *net) +{ +} +static inline int nfs_fs_proc_init(void) +{ + return 0; +} +static inline void nfs_fs_proc_exit(void) +{ +} +#endif + +#ifdef CONFIG_NFS_V4_1 +int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); +#endif + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; + +struct nfs_pageio_descriptor; +/* pagelist.c */ +extern int __init nfs_init_nfspagecache(void); +extern void nfs_destroy_nfspagecache(void); +extern int __init nfs_init_readpagecache(void); +extern void nfs_destroy_readpagecache(void); +extern int __init nfs_init_writepagecache(void); +extern void nfs_destroy_writepagecache(void); + +extern int __init nfs_init_directcache(void); +extern void nfs_destroy_directcache(void); +extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, + void (*release)(struct nfs_pgio_header *hdr)); +void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); +int nfs_iocounter_wait(struct nfs_io_counter *c); + +extern const struct nfs_pageio_ops nfs_pgio_rw_ops; +struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); +void nfs_pgio_header_free(struct nfs_pgio_header *); +void nfs_pgio_data_destroy(struct nfs_pgio_header *); +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, + struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, + const struct rpc_call_ops *call_ops, int how, int flags); +void nfs_free_request(struct nfs_page *req); +struct nfs_pgio_mirror * +nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); + +static inline void nfs_iocounter_init(struct nfs_io_counter *c) +{ + c->flags = 0; + atomic_set(&c->io_count, 0); +} + +static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) +{ + WARN_ON_ONCE(desc->pg_mirror_count < 1); + return desc->pg_mirror_count > 1; +} + +/* nfs2xdr.c */ +extern struct rpc_procinfo nfs_procedures[]; +extern int nfs2_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); + +/* nfs3xdr.c */ +extern struct rpc_procinfo nfs3_procedures[]; +extern int nfs3_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); + +/* nfs4xdr.c */ +#if IS_ENABLED(CONFIG_NFS_V4) +extern int nfs4_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); +#endif +#ifdef CONFIG_NFS_V4_1 +extern const u32 nfs41_maxread_overhead; +extern const u32 nfs41_maxwrite_overhead; +extern const u32 nfs41_maxgetdevinfo_overhead; +#endif + +/* nfs4proc.c */ +#if IS_ENABLED(CONFIG_NFS_V4) +extern struct rpc_procinfo nfs4_procedures[]; +#endif + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline void nfs4_label_free(struct nfs4_label *label) +{ + if (label) { + kfree(label->label); + kfree(label); + } + return; +} + +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ + if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) + nfsi->cache_validity |= NFS_INO_INVALID_LABEL; +} +#else +static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } +static inline void nfs4_label_free(void *label) {} +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ +} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ + +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern struct nfs_client *nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr); + +/* dir.c */ +extern void nfs_force_use_readdirplus(struct inode *dir); +extern unsigned long nfs_access_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); +struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); +int nfs_create(struct inode *, struct dentry *, umode_t, bool); +int nfs_mkdir(struct inode *, struct dentry *, umode_t); +int nfs_rmdir(struct inode *, struct dentry *); +int nfs_unlink(struct inode *, struct dentry *); +int nfs_symlink(struct inode *, struct dentry *, const char *); +int nfs_link(struct dentry *, struct inode *, struct dentry *); +int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); + +/* file.c */ +int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); +loff_t nfs_file_llseek(struct file *, loff_t, int); +int nfs_file_flush(struct file *, fl_owner_t); +ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); +ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, + size_t, unsigned int); +int nfs_file_mmap(struct file *, struct vm_area_struct *); +ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); +int nfs_file_release(struct inode *, struct file *); +int nfs_lock(struct file *, int, struct file_lock *); +int nfs_flock(struct file *, int, struct file_lock *); +int nfs_check_flags(int); + +/* inode.c */ +extern struct workqueue_struct *nfsiod_workqueue; +extern struct inode *nfs_alloc_inode(struct super_block *sb); +extern void nfs_destroy_inode(struct inode *); +extern int nfs_write_inode(struct inode *, struct writeback_control *); +extern int nfs_drop_inode(struct inode *); +extern void nfs_clear_inode(struct inode *); +extern void nfs_evict_inode(struct inode *); +void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(struct wait_bit_key *key); + +/* super.c */ +extern const struct super_operations nfs_sops; +extern struct file_system_type nfs_fs_type; +extern struct file_system_type nfs_xdev_fs_type; +#if IS_ENABLED(CONFIG_NFS_V4) +extern struct file_system_type nfs4_xdev_fs_type; +extern struct file_system_type nfs4_referral_fs_type; +#endif +bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); +struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, + struct nfs_subversion *); +void nfs_initialise_sb(struct super_block *); +int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); +int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); +struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *, + struct nfs_mount_info *, struct nfs_subversion *); +struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *); +struct dentry * nfs_xdev_mount_common(struct file_system_type *, int, + const char *, struct nfs_mount_info *); +void nfs_kill_super(struct super_block *); +void nfs_fill_super(struct super_block *, struct nfs_mount_info *); + +extern struct rpc_stat nfs_rpcstat; + +extern int __init register_nfs_fs(void); +extern void __exit unregister_nfs_fs(void); +extern bool nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); + +/* namespace.c */ +#define NFS_PATH_CANONICAL 1 +extern char *nfs_path(char **p, struct dentry *dentry, + char *buffer, ssize_t buflen, unsigned flags); +extern struct vfsmount *nfs_d_automount(struct path *path); +struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); +struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); + +/* getroot.c */ +extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, + const char *); +#if IS_ENABLED(CONFIG_NFS_V4) +extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, + const char *); + +extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool); +#endif + +struct nfs_pgio_completion_ops; +/* read.c */ +extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode, bool force_mds, + const struct nfs_pgio_completion_ops *compl_ops); +extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); + +/* super.c */ +void nfs_clone_super(struct super_block *, struct nfs_mount_info *); +void nfs_umount_begin(struct super_block *); +int nfs_statfs(struct dentry *, struct kstatfs *); +int nfs_show_options(struct seq_file *, struct dentry *); +int nfs_show_devname(struct seq_file *, struct dentry *); +int nfs_show_path(struct seq_file *, struct dentry *); +int nfs_show_stats(struct seq_file *, struct dentry *); +int nfs_remount(struct super_block *sb, int *flags, char *raw_data); + +/* write.c */ +extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags, bool force_mds, + const struct nfs_pgio_completion_ops *compl_ops); +extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_commit_free(struct nfs_commit_data *p); +extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct rpc_clnt *clnt, + struct nfs_commit_data *data, + const struct nfs_rpc_ops *nfs_ops, + const struct rpc_call_ops *call_ops, + int how, int flags); +extern void nfs_init_commit(struct nfs_commit_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); +int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max); +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); +int nfs_scan_commit(struct inode *inode, struct list_head *dst, + struct nfs_commit_info *cinfo); +void nfs_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); +int nfs_write_need_commit(struct nfs_pgio_header *); +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); +int nfs_generic_commit_list(struct inode *inode, struct list_head *head, + int how, struct nfs_commit_info *cinfo); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); +void nfs_commitdata_release(struct nfs_commit_data *data); +void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo); +void nfs_request_remove_commit_list(struct nfs_page *req, + struct nfs_commit_info *cinfo); +void nfs_init_cinfo(struct nfs_commit_info *cinfo, + struct inode *inode, + struct nfs_direct_req *dreq); +int nfs_key_timeout_notify(struct file *filp, struct inode *inode); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); + +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *, enum migrate_mode); +#else +#define nfs_migrate_page NULL +#endif + +/* unlink.c */ +extern struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)); +extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); + +/* direct.c */ +void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, + struct nfs_direct_req *dreq); +static inline void nfs_inode_dio_wait(struct inode *inode) +{ + inode_dio_wait(inode); +} +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq); + +/* nfs4proc.c */ +extern void __nfs4_read_done_cb(struct nfs_pgio_header *); +extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr); +extern int nfs40_walk_client_list(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred); +extern int nfs41_walk_client_list(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred); + +static inline struct inode *nfs_igrab_and_active(struct inode *inode) +{ + inode = igrab(inode); + if (inode != NULL && !nfs_sb_active(inode->i_sb)) { + iput(inode); + inode = NULL; + } + return inode; +} + +static inline void nfs_iput_and_deactive(struct inode *inode) +{ + if (inode != NULL) { + struct super_block *sb = inode->i_sb; + + iput(inode); + nfs_sb_deactive(sb); + } +} + +/* + * Determine the device name as a string + */ +static inline char *nfs_devname(struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + char *dummy; + return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); +} + +/* + * Determine the actual block size (and log2 thereof) + */ +static inline +unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) +{ + /* make sure blocksize is a power of two */ + if ((bsize & (bsize - 1)) || nrbitsp) { + unsigned char nrbits; + + for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) + ; + bsize = 1 << nrbits; + if (nrbitsp) + *nrbitsp = nrbits; + } + + return bsize; +} + +/* + * Calculate the number of 512byte blocks used. + */ +static inline blkcnt_t nfs_calc_block_size(u64 tsize) +{ + blkcnt_t used = (tsize + 511) >> 9; + return (used > ULONG_MAX) ? ULONG_MAX : used; +} + +/* + * Compute and set NFS server blocksize + */ +static inline +unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) +{ + if (bsize < NFS_MIN_FILE_IO_SIZE) + bsize = NFS_DEF_FILE_IO_SIZE; + else if (bsize >= NFS_MAX_FILE_IO_SIZE) + bsize = NFS_MAX_FILE_IO_SIZE; + + return nfs_block_bits(bsize, nrbitsp); +} + +/* + * Determine the maximum file size for a superblock + */ +static inline +void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) +{ + sb->s_maxbytes = (loff_t)maxfilesize; + if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) + sb->s_maxbytes = MAX_LFS_FILESIZE; +} + +/* + * Record the page as unstable and mark its inode as dirty. + */ +static inline +void nfs_mark_page_unstable(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + + inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); +} + +/* + * Determine the number of bytes of data the page contains + */ +static inline +unsigned int nfs_page_length(struct page *page) +{ + loff_t i_size = i_size_read(page_file_mapping(page)->host); + + if (i_size > 0) { + pgoff_t page_index = page_file_index(page); + pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (page_index < end_index) + return PAGE_CACHE_SIZE; + if (page_index == end_index) + return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; + } + return 0; +} + +/* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* + * Determine the number of pages in an array of length 'len' and + * with a base offset of 'base' + */ +static inline +unsigned int nfs_page_array_len(unsigned int base, size_t len) +{ + return ((unsigned long)len + (unsigned long)base + + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + +/* + * Convert a struct timespec into a 64-bit change attribute + * + * This does approximately the same thing as timespec_to_ns(), + * but for calculation efficiency, we multiply the seconds by + * 1024*1024*1024. + */ +static inline +u64 nfs_timespec_to_change_attr(const struct timespec *ts) +{ + return ((u64)ts->tv_sec << 30) + ts->tv_nsec; +} + +#ifdef CONFIG_CRC32 +/** + * nfs_fhandle_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". + */ +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ + return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); +} +#else +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ + return 0; +} +#endif diff --git a/kernel/fs/nfs/iostat.h b/kernel/fs/nfs/iostat.h new file mode 100644 index 000000000..0cb806fbd --- /dev/null +++ b/kernel/fs/nfs/iostat.h @@ -0,0 +1,76 @@ +/* + * linux/fs/nfs/iostat.h + * + * Declarations for NFS client per-mount statistics + * + * Copyright (C) 2005, 2006 Chuck Lever + * + */ + +#ifndef _NFS_IOSTAT +#define _NFS_IOSTAT + +#include +#include +#include + +struct nfs_iostats { + unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif + unsigned long events[__NFSIOS_COUNTSMAX]; +} ____cacheline_aligned; + +static inline void nfs_inc_server_stats(const struct nfs_server *server, + enum nfs_stat_eventcounters stat) +{ + this_cpu_inc(server->io_stats->events[stat]); +} + +static inline void nfs_inc_stats(const struct inode *inode, + enum nfs_stat_eventcounters stat) +{ + nfs_inc_server_stats(NFS_SERVER(inode), stat); +} + +static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, + long addend) +{ + this_cpu_add(server->io_stats->bytes[stat], addend); +} + +static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, + long addend) +{ + nfs_add_server_stats(NFS_SERVER(inode), stat, addend); +} + +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + long addend) +{ + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); +} +static inline void nfs_inc_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat) +{ + this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]); +} +#endif + +static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) +{ + return alloc_percpu(struct nfs_iostats); +} + +static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) +{ + if (stats != NULL) + free_percpu(stats); +} + +#endif /* _NFS_IOSTAT */ diff --git a/kernel/fs/nfs/mount_clnt.c b/kernel/fs/nfs/mount_clnt.c new file mode 100644 index 000000000..99a45283b --- /dev/null +++ b/kernel/fs/nfs/mount_clnt.c @@ -0,0 +1,535 @@ +/* + * In-kernel MOUNT protocol client + * + * Copyright (C) 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#ifdef NFS_DEBUG +# define NFSDBG_FACILITY NFSDBG_MOUNT +#endif + +/* + * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4 + */ +#define MNTPATHLEN (1024) + +/* + * XDR data type sizes + */ +#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN)) +#define MNT_status_sz (1) +#define MNT_fhs_status_sz (1) +#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE) +#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE)) +#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS) + +/* + * XDR argument and result sizes + */ +#define MNT_enc_dirpath_sz encode_dirpath_sz +#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz) +#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \ + MNT_authflav3_sz) + +/* + * Defined by RFC 1094, section A.5 + */ +enum { + MOUNTPROC_NULL = 0, + MOUNTPROC_MNT = 1, + MOUNTPROC_DUMP = 2, + MOUNTPROC_UMNT = 3, + MOUNTPROC_UMNTALL = 4, + MOUNTPROC_EXPORT = 5, +}; + +/* + * Defined by RFC 1813, section 5.2 + */ +enum { + MOUNTPROC3_NULL = 0, + MOUNTPROC3_MNT = 1, + MOUNTPROC3_DUMP = 2, + MOUNTPROC3_UMNT = 3, + MOUNTPROC3_UMNTALL = 4, + MOUNTPROC3_EXPORT = 5, +}; + +static const struct rpc_program mnt_program; + +/* + * Defined by OpenGroup XNFS Version 3W, chapter 8 + */ +enum mountstat { + MNT_OK = 0, + MNT_EPERM = 1, + MNT_ENOENT = 2, + MNT_EACCES = 13, + MNT_EINVAL = 22, +}; + +static struct { + u32 status; + int errno; +} mnt_errtbl[] = { + { .status = MNT_OK, .errno = 0, }, + { .status = MNT_EPERM, .errno = -EPERM, }, + { .status = MNT_ENOENT, .errno = -ENOENT, }, + { .status = MNT_EACCES, .errno = -EACCES, }, + { .status = MNT_EINVAL, .errno = -EINVAL, }, +}; + +/* + * Defined by RFC 1813, section 5.1.5 + */ +enum mountstat3 { + MNT3_OK = 0, /* no error */ + MNT3ERR_PERM = 1, /* Not owner */ + MNT3ERR_NOENT = 2, /* No such file or directory */ + MNT3ERR_IO = 5, /* I/O error */ + MNT3ERR_ACCES = 13, /* Permission denied */ + MNT3ERR_NOTDIR = 20, /* Not a directory */ + MNT3ERR_INVAL = 22, /* Invalid argument */ + MNT3ERR_NAMETOOLONG = 63, /* Filename too long */ + MNT3ERR_NOTSUPP = 10004, /* Operation not supported */ + MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */ +}; + +static struct { + u32 status; + int errno; +} mnt3_errtbl[] = { + { .status = MNT3_OK, .errno = 0, }, + { .status = MNT3ERR_PERM, .errno = -EPERM, }, + { .status = MNT3ERR_NOENT, .errno = -ENOENT, }, + { .status = MNT3ERR_IO, .errno = -EIO, }, + { .status = MNT3ERR_ACCES, .errno = -EACCES, }, + { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, }, + { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, + { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, + { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, + { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, }, +}; + +struct mountres { + int errno; + struct nfs_fh *fh; + unsigned int *auth_count; + rpc_authflavor_t *auth_flavors; +}; + +struct mnt_fhstatus { + u32 status; + struct nfs_fh *fh; +}; + +/** + * nfs_mount - Obtain an NFS file handle for the given host and path + * @info: pointer to mount request arguments + * + * Uses default timeout parameters specified by underlying transport. On + * successful return, the auth_flavs list and auth_flav_len will be populated + * with the list from the server or a faked-up list if the server didn't + * provide one. + */ +int nfs_mount(struct nfs_mount_request *info) +{ + struct mountres result = { + .fh = info->fh, + .auth_count = info->auth_flav_len, + .auth_flavors = info->auth_flavs, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + .rpc_resp = &result, + }; + struct rpc_create_args args = { + .net = info->net, + .protocol = info->protocol, + .address = info->sap, + .addrsize = info->salen, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + }; + struct rpc_clnt *mnt_clnt; + int status; + + dprintk("NFS: sending MNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), + info->dirpath); + + if (strlen(info->dirpath) > MNTPATHLEN) + return -ENAMETOOLONG; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + mnt_clnt = rpc_create(&args); + if (IS_ERR(mnt_clnt)) + goto out_clnt_err; + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; + else + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; + + status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT); + rpc_shutdown_client(mnt_clnt); + + if (status < 0) + goto out_call_err; + if (result.errno != 0) + goto out_mnt_err; + + dprintk("NFS: MNT request succeeded\n"); + status = 0; + + /* + * If the server didn't provide a flavor list, allow the + * client to try any flavor. + */ + if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) { + dprintk("NFS: Faking up auth_flavs list\n"); + info->auth_flavs[0] = RPC_AUTH_NULL; + *info->auth_flav_len = 1; + } +out: + return status; + +out_clnt_err: + status = PTR_ERR(mnt_clnt); + dprintk("NFS: failed to create MNT RPC client, status=%d\n", status); + goto out; + +out_call_err: + dprintk("NFS: MNT request failed, status=%d\n", status); + goto out; + +out_mnt_err: + dprintk("NFS: MNT server returned result %d\n", result.errno); + status = result.errno; + goto out; +} + +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. + */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .net = info->net, + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + }; + struct rpc_clnt *clnt; + int status; + + if (strlen(info->dirpath) > MNTPATHLEN) + return; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + +/* + * XDR encode/decode functions for MOUNT + */ + +static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) +{ + const u32 pathname_len = strlen(pathname); + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + pathname_len); + xdr_encode_opaque(p, pathname, pathname_len); +} + +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, + const char *dirpath) +{ + encode_mntdirpath(xdr, dirpath); +} + +/* + * RFC 1094: "A non-zero status indicates some sort of error. In this + * case, the status is a UNIX error number." This can be problematic + * if the server and client use different errno values for the same + * error. + * + * However, the OpenGroup XNFS spec provides a simple mapping that is + * independent of local errno values on the server and the client. + */ +static int decode_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { + if (mnt_errtbl[i].status == status) { + res->errno = mnt_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT status code: %u\n", status); + res->errno = -EACCES; + return 0; +} + +static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res) +{ + struct nfs_fh *fh = res->fh; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +} + +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + return decode_fhandle(xdr, res); +} + +static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { + if (mnt3_errtbl[i].status == status) { + res->errno = mnt3_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT3 status code: %u\n", status); + res->errno = -EACCES; + return 0; +} + +static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res) +{ + struct nfs_fh *fh = res->fh; + u32 size; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + + size = be32_to_cpup(p); + if (size > NFS3_FHSIZE || size == 0) + return -EIO; + + p = xdr_inline_decode(xdr, size); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = size; + memcpy(fh->data, p, size); + return 0; +} + +static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) +{ + rpc_authflavor_t *flavors = res->auth_flavors; + unsigned int *count = res->auth_count; + u32 entries, i; + __be32 *p; + + if (*count == 0) + return 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + entries = be32_to_cpup(p); + dprintk("NFS: received %u auth flavors\n", entries); + if (entries > NFS_MAX_SECFLAVORS) + entries = NFS_MAX_SECFLAVORS; + + p = xdr_inline_decode(xdr, 4 * entries); + if (unlikely(p == NULL)) + return -EIO; + + if (entries > *count) + entries = *count; + + for (i = 0; i < entries; i++) { + flavors[i] = be32_to_cpup(p++); + dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); + } + *count = i; + + return 0; +} + +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_fhs_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + status = decode_fhandle3(xdr, res); + if (unlikely(status != 0)) { + res->errno = -EBADHANDLE; + return 0; + } + return decode_auth_flavors(xdr, res); +} + +static struct rpc_procinfo mnt_procedures[] = { + [MOUNTPROC_MNT] = { + .p_proc = MOUNTPROC_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres_sz, + .p_statidx = MOUNTPROC_MNT, + .p_name = "MOUNT", + }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, +}; + +static struct rpc_procinfo mnt3_procedures[] = { + [MOUNTPROC3_MNT] = { + .p_proc = MOUNTPROC3_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres3_sz, + .p_statidx = MOUNTPROC3_MNT, + .p_name = "MOUNT", + }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, +}; + + +static const struct rpc_version mnt_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(mnt_procedures), + .procs = mnt_procedures, +}; + +static const struct rpc_version mnt_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(mnt3_procedures), + .procs = mnt3_procedures, +}; + +static const struct rpc_version *mnt_version[] = { + NULL, + &mnt_version1, + NULL, + &mnt_version3, +}; + +static struct rpc_stat mnt_stats; + +static const struct rpc_program mnt_program = { + .name = "mount", + .number = NFS_MNT_PROGRAM, + .nrvers = ARRAY_SIZE(mnt_version), + .version = mnt_version, + .stats = &mnt_stats, +}; diff --git a/kernel/fs/nfs/namespace.c b/kernel/fs/nfs/namespace.c new file mode 100644 index 000000000..c8162c660 --- /dev/null +++ b/kernel/fs/nfs/namespace.c @@ -0,0 +1,289 @@ +/* + * linux/fs/nfs/namespace.c + * + * Copyright (C) 2005 Trond Myklebust + * - Modified by David Howells + * + * NFS namespace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static void nfs_expire_automounts(struct work_struct *work); + +static LIST_HEAD(nfs_automount_list); +static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); +int nfs_mountpoint_expiry_timeout = 500 * HZ; + +/* + * nfs_path - reconstruct the path given an arbitrary dentry + * @base - used to return pointer to the end of devname part of path + * @dentry - pointer to dentry + * @buffer - result buffer + * @buflen - length of buffer + * @flags - options (see below) + * + * Helper function for constructing the server pathname + * by arbitrary hashed dentry. + * + * This is mainly for use in figuring out the path on the + * server side when automounting on top of an existing partition + * and in generating /proc/mounts and friends. + * + * Supported flags: + * NFS_PATH_CANONICAL: ensure there is exactly one slash after + * the original device (export) name + * (if unset, the original name is returned verbatim) + */ +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, + unsigned flags) +{ + char *end; + int namelen; + unsigned seq; + const char *base; + +rename_retry: + end = buffer+buflen; + *--end = '\0'; + buflen--; + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + while (1) { + spin_lock(&dentry->d_lock); + if (IS_ROOT(dentry)) + break; + namelen = dentry->d_name.len; + buflen -= namelen + 1; + if (buflen < 0) + goto Elong_unlock; + end -= namelen; + memcpy(end, dentry->d_name.name, namelen); + *--end = '/'; + spin_unlock(&dentry->d_lock); + dentry = dentry->d_parent; + } + if (read_seqretry(&rename_lock, seq)) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + if ((flags & NFS_PATH_CANONICAL) && *end != '/') { + if (--buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + *--end = '/'; + } + *p = end; + base = dentry->d_fsdata; + if (!base) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + WARN_ON(1); + return end; + } + namelen = strlen(base); + if (flags & NFS_PATH_CANONICAL) { + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + } + buflen -= namelen; + if (buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + end -= namelen; + memcpy(end, base, namelen); + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + return end; +Elong_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} +EXPORT_SYMBOL_GPL(nfs_path); + +/* + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint + * + * When we encounter a mountpoint on the server, we want to set up + * a mountpoint on the client too, to prevent inode numbers from + * colliding, and to allow "df" to work properly. + * On NFSv4, we also want to allow for the fact that different + * filesystems may be migrated to different servers in a failover + * situation, and that different filesystems may want to use + * different security flavours. + */ +struct vfsmount *nfs_d_automount(struct path *path) +{ + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); + struct nfs_fh *fh = NULL; + struct nfs_fattr *fattr = NULL; + + dprintk("--> nfs_d_automount()\n"); + + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; + + mnt = ERR_PTR(-ENOMEM); + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fh == NULL || fattr == NULL) + goto out; + + dprintk("%s: enter\n", __func__); + + mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr); + if (IS_ERR(mnt)) + goto out; + + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out_nofree: + if (IS_ERR(mnt)) + dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); + else + dprintk("<-- %s() = %p\n", __func__, mnt); + return mnt; +} + +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + if (NFS_FH(d_inode(dentry))->size != 0) + return nfs_getattr(mnt, dentry, stat); + generic_fillattr(d_inode(dentry), stat); + return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (NFS_FH(d_inode(dentry))->size != 0) + return nfs_setattr(dentry, attr); + return -EACCES; +} + +const struct inode_operations nfs_mountpoint_inode_operations = { + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +const struct inode_operations nfs_referral_inode_operations = { + .getattr = nfs_namespace_getattr, + .setattr = nfs_namespace_setattr, +}; + +static void nfs_expire_automounts(struct work_struct *work) +{ + struct list_head *list = &nfs_automount_list; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +} + +void nfs_release_automount_timer(void) +{ + if (list_empty(&nfs_automount_list)) + cancel_delayed_work(&nfs_automount_task); +} + +/* + * Clone a mountpoint of the appropriate type + */ +static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, + const char *devname, + struct nfs_clone_mount *mountdata) +{ + return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); +} + +/** + * nfs_do_submount - set up mountpoint when crossing a filesystem boundary + * @dentry - parent directory + * @fh - filehandle for new root dentry + * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount + * + */ +struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, + struct nfs_fattr *fattr, rpc_authflavor_t authflavor) +{ + struct nfs_clone_mount mountdata = { + .sb = dentry->d_sb, + .dentry = dentry, + .fh = fh, + .fattr = fattr, + .authflavor = authflavor, + }; + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + char *page = (char *) __get_free_page(GFP_USER); + char *devname; + + dprintk("--> nfs_do_submount()\n"); + + dprintk("%s: submounting on %pd2\n", __func__, + dentry); + if (page == NULL) + goto out; + devname = nfs_devname(dentry, page, PAGE_SIZE); + mnt = (struct vfsmount *)devname; + if (IS_ERR(devname)) + goto free_page; + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); +free_page: + free_page((unsigned long)page); +out: + dprintk("%s: done\n", __func__); + + dprintk("<-- nfs_do_submount() = %p\n", mnt); + return mnt; +} +EXPORT_SYMBOL_GPL(nfs_do_submount); + +struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + int err; + struct dentry *parent = dget_parent(dentry); + + /* Look it up again to get its attributes */ + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), &dentry->d_name, fh, fattr, NULL); + dput(parent); + if (err != 0) + return ERR_PTR(err); + + return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor); +} +EXPORT_SYMBOL_GPL(nfs_submount); diff --git a/kernel/fs/nfs/netns.h b/kernel/fs/nfs/netns.h new file mode 100644 index 000000000..f0e06e4ac --- /dev/null +++ b/kernel/fs/nfs/netns.h @@ -0,0 +1,40 @@ +/* + * NFS-private data for each "struct net". Accessed with net_generic(). + */ + +#ifndef __NFS_NETNS_H__ +#define __NFS_NETNS_H__ + +#include +#include +#include + +struct bl_dev_msg { + int32_t status; + uint32_t major, minor; +}; + +struct nfs_net { + struct cache_detail *nfs_dns_resolve; + struct rpc_pipe *bl_device_pipe; + struct bl_dev_msg bl_mount_reply; + wait_queue_head_t bl_wq; + struct mutex bl_mutex; + struct list_head nfs_client_list; + struct list_head nfs_volume_list; +#if IS_ENABLED(CONFIG_NFS_V4) + struct idr cb_ident_idr; /* Protected by nfs_client_lock */ + unsigned short nfs_callback_tcpport; + unsigned short nfs_callback_tcpport6; + int cb_users[NFS4_MAX_MINOR_VERSION + 1]; +#endif + spinlock_t nfs_client_lock; + struct timespec boot_time; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nfsfs; +#endif +}; + +extern int nfs_net_id; + +#endif diff --git a/kernel/fs/nfs/nfs.h b/kernel/fs/nfs/nfs.h new file mode 100644 index 000000000..43679df56 --- /dev/null +++ b/kernel/fs/nfs/nfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + * + * Function and structures exported by the NFS module + * for use by NFS version-specific modules. + */ +#ifndef __LINUX_INTERNAL_NFS_H +#define __LINUX_INTERNAL_NFS_H + +#include +#include +#include + +struct nfs_subversion { + struct module *owner; /* THIS_MODULE pointer */ + struct file_system_type *nfs_fs; /* NFS filesystem type */ + const struct rpc_version *rpc_vers; /* NFS version information */ + const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ + const struct super_operations *sops; /* NFS Super operations */ + const struct xattr_handler **xattr; /* NFS xattr handlers */ + struct list_head list; /* List of NFS versions */ +}; + +struct nfs_subversion *get_nfs_version(unsigned int); +void put_nfs_version(struct nfs_subversion *); +void register_nfs_version(struct nfs_subversion *); +void unregister_nfs_version(struct nfs_subversion *); + +#endif /* __LINUX_INTERNAL_NFS_H */ diff --git a/kernel/fs/nfs/nfs2super.c b/kernel/fs/nfs/nfs2super.c new file mode 100644 index 000000000..0a9782c91 --- /dev/null +++ b/kernel/fs/nfs/nfs2super.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + */ +#include +#include +#include "internal.h" +#include "nfs.h" + +static struct nfs_subversion nfs_v2 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs_fs_type, + .rpc_vers = &nfs_version2, + .rpc_ops = &nfs_v2_clientops, + .sops = &nfs_sops, +}; + +static int __init init_nfs_v2(void) +{ + register_nfs_version(&nfs_v2); + return 0; +} + +static void __exit exit_nfs_v2(void) +{ + unregister_nfs_version(&nfs_v2); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v2); +module_exit(exit_nfs_v2); diff --git a/kernel/fs/nfs/nfs2xdr.c b/kernel/fs/nfs/nfs2xdr.c new file mode 100644 index 000000000..b4e03ed85 --- /dev/null +++ b/kernel/fs/nfs/nfs2xdr.c @@ -0,0 +1,1151 @@ +/* + * linux/fs/nfs/nfs2xdr.c + * + * XDR functions to encode/decode NFS RPC arguments and results. + * + * Copyright (C) 1992, 1993, 1994 Rick Sladkey + * Copyright (C) 1996 Olaf Kirch + * 04 Aug 1998 Ion Badulescu + * FIFO's need special handling in NFSv2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +/* + * Declare the space requirements for NFS arguments and replies as + * number of 32bit-words + */ +#define NFS_fhandle_sz (8) +#define NFS_sattr_sz (8) +#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2)) +#define NFS_path_sz (1+(NFS2_MAXPATHLEN>>2)) +#define NFS_fattr_sz (17) +#define NFS_info_sz (5) +#define NFS_entry_sz (NFS_filename_sz+3) + +#define NFS_diropargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_removeargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_sattrargs_sz (NFS_fhandle_sz+NFS_sattr_sz) +#define NFS_readlinkargs_sz (NFS_fhandle_sz) +#define NFS_readargs_sz (NFS_fhandle_sz+3) +#define NFS_writeargs_sz (NFS_fhandle_sz+4) +#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz) +#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz) +#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz) +#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz) +#define NFS_readdirargs_sz (NFS_fhandle_sz+2) + +#define NFS_attrstat_sz (1+NFS_fattr_sz) +#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) +#define NFS_readlinkres_sz (2) +#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_writeres_sz (NFS_attrstat_sz) +#define NFS_stat_sz (1) +#define NFS_readdirres_sz (1) +#define NFS_statfsres_sz (1+NFS_info_sz) + +static int nfs_stat_to_errno(enum nfs_stat); + +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv2 basic data types + * + * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: + * "NFS: Network File System Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +/* + * typedef opaque nfsdata<>; + */ +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) +{ + u32 recvd, count; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + recvd = xdr_read_pages(xdr, count); + if (unlikely(count > recvd)) + goto out_cheating; +out: + result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */ + result->count = count; + return count; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * enum stat { + * NFS_OK = 0, + * NFSERR_PERM = 1, + * NFSERR_NOENT = 2, + * NFSERR_IO = 5, + * NFSERR_NXIO = 6, + * NFSERR_ACCES = 13, + * NFSERR_EXIST = 17, + * NFSERR_NODEV = 19, + * NFSERR_NOTDIR = 20, + * NFSERR_ISDIR = 21, + * NFSERR_FBIG = 27, + * NFSERR_NOSPC = 28, + * NFSERR_ROFS = 30, + * NFSERR_NAMETOOLONG = 63, + * NFSERR_NOTEMPTY = 66, + * NFSERR_DQUOT = 69, + * NFSERR_STALE = 70, + * NFSERR_WFLUSH = 99 + * }; + */ +static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.2. ftype + * + * enum ftype { + * NFNON = 0, + * NFREG = 1, + * NFDIR = 2, + * NFBLK = 3, + * NFCHR = 4, + * NFLNK = 5 + * }; + * + */ +static __be32 *xdr_decode_ftype(__be32 *p, u32 *type) +{ + *type = be32_to_cpup(p++); + if (unlikely(*type > NF2FIFO)) + *type = NFBAD; + return p; +} + +/* + * 2.3.3. fhandle + * + * typedef opaque fhandle[FHSIZE]; + */ +static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS2_FHSIZE); + memcpy(p, fh->data, NFS2_FHSIZE); +} + +static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.4. timeval + * + * struct timeval { + * unsigned int seconds; + * unsigned int useconds; + * }; + */ +static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + if (timep->tv_nsec != 0) + *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); + else + *p++ = cpu_to_be32(0); + return p; +} + +/* + * Passing the invalid value useconds=1000000 is a Sun convention for + * "set to current server time". It's needed to make permissions checks + * for the "touch" program across v2 mounts to Solaris and Irix servers + * work correctly. See description of sattr in section 6.1 of "NFS + * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. + */ +static __be32 *xdr_encode_current_server_time(__be32 *p, + const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(1000000); + return p; +} + +static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; + return p; +} + +/* + * 2.3.5. fattr + * + * struct fattr { + * ftype type; + * unsigned int mode; + * unsigned int nlink; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * unsigned int blocksize; + * unsigned int rdev; + * unsigned int blocks; + * unsigned int fsid; + * unsigned int fileid; + * timeval atime; + * timeval mtime; + * timeval ctime; + * }; + * + */ +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + u32 rdev, type; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_V2; + + p = xdr_decode_ftype(p, &type); + + fattr->mode = be32_to_cpup(p++); + fattr->nlink = be32_to_cpup(p++); + fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + if (!uid_valid(fattr->uid)) + goto out_uid; + fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + if (!gid_valid(fattr->gid)) + goto out_gid; + + fattr->size = be32_to_cpup(p++); + fattr->du.nfs2.blocksize = be32_to_cpup(p++); + + rdev = be32_to_cpup(p++); + fattr->rdev = new_decode_dev(rdev); + if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) { + fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; + fattr->rdev = 0; + } + + fattr->du.nfs2.blocks = be32_to_cpup(p++); + fattr->fsid.major = be32_to_cpup(p++); + fattr->fsid.minor = 0; + fattr->fileid = be32_to_cpup(p++); + + p = xdr_decode_time(p, &fattr->atime); + p = xdr_decode_time(p, &fattr->mtime); + xdr_decode_time(p, &fattr->ctime); + fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); + + return 0; +out_uid: + dprintk("NFS: returned invalid uid\n"); + return -EINVAL; +out_gid: + dprintk("NFS: returned invalid gid\n"); + return -EINVAL; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.6. sattr + * + * struct sattr { + * unsigned int mode; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * timeval atime; + * timeval mtime; + * }; + */ + +#define NFS2_SATTR_NOT_SET (0xffffffff) + +static __be32 *xdr_time_not_set(__be32 *p) +{ + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + return p; +} + +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); + + if (attr->ia_valid & ATTR_MODE) + *p++ = cpu_to_be32(attr->ia_mode); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_UID) + *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_GID) + *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_SIZE) + *p++ = cpu_to_be32((u32)attr->ia_size); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + + if (attr->ia_valid & ATTR_ATIME_SET) + p = xdr_encode_time(p, &attr->ia_atime); + else if (attr->ia_valid & ATTR_ATIME) + p = xdr_encode_current_server_time(p, &attr->ia_atime); + else + p = xdr_time_not_set(p); + if (attr->ia_valid & ATTR_MTIME_SET) + xdr_encode_time(p, &attr->ia_mtime); + else if (attr->ia_valid & ATTR_MTIME) + xdr_encode_current_server_time(p, &attr->ia_mtime); + else + xdr_time_not_set(p); +} + +/* + * 2.3.7. filename + * + * typedef string filename; + */ +static void encode_filename(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + WARN_ON_ONCE(length > NFS2_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_filename_inline(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.8. path + * + * typedef string path; + */ +static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_path(struct xdr_stream *xdr) +{ + u32 length, recvd; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p); + if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) + goto out_size; + recvd = xdr_read_pages(xdr, length); + if (unlikely(length > recvd)) + goto out_cheating; + xdr_terminate_string(xdr->buf, length); + return 0; +out_size: + dprintk("NFS: returned pathname too long: %u\n", length); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "length %u > received %u\n", length, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.9. attrstat + * + * union attrstat switch (stat status) { + * case NFS_OK: + * fattr attributes; + * default: + * void; + * }; + */ +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result, + __u32 *op_status) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (op_status) + *op_status = status; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.3.10. diropargs + * + * struct diropargs { + * fhandle dir; + * filename name; + * }; + */ +static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) +{ + encode_fhandle(xdr, fh); + encode_filename(xdr, name, length); +} + +/* + * 2.3.11. diropres + * + * union diropres switch (stat status) { + * case NFS_OK: + * struct { + * fhandle file; + * fattr attributes; + * } diropok; + * default: + * void; + * }; + */ +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + int error; + + error = decode_fhandle(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_fattr(xdr, result->fattr); +out: + return error; +} + +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_diropok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + + +/* + * NFSv2 XDR encode functions + * + * NFSv2 argument types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) +{ + encode_fhandle(xdr, fh); +} + +/* + * 2.2.3. sattrargs + * + * struct sattrargs { + * fhandle file; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_sattrargs *args) +{ + encode_fhandle(xdr, args->fh); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_diropargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); +} + +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readlinkargs *args) +{ + encode_fhandle(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); +} + +/* + * 2.2.7. readargs + * + * struct readargs { + * fhandle file; + * unsigned offset; + * unsigned count; + * unsigned totalcount; + * }; + */ +static void encode_readargs(struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + *p = cpu_to_be32(count); +} + +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + encode_readargs(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; +} + +/* + * 2.2.9. writeargs + * + * struct writeargs { + * fhandle file; + * unsigned beginoffset; + * unsigned offset; + * unsigned totalcount; + * nfsdata data; + * }; + */ +static void encode_writeargs(struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + + /* nfsdata */ + *p = cpu_to_be32(count); + xdr_write_pages(xdr, args->pages, args->pgbase, count); +} + +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + encode_writeargs(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} + +/* + * 2.2.10. createargs + * + * struct createargs { + * diropargs where; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_createargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs(xdr, args->fh, args->name.name, args->name.len); +} + +/* + * 2.2.12. renameargs + * + * struct renameargs { + * diropargs from; + * diropargs to; + * }; + */ +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs(xdr, args->old_dir, old->name, old->len); + encode_diropargs(xdr, args->new_dir, new->name, new->len); +} + +/* + * 2.2.13. linkargs + * + * struct linkargs { + * fhandle from; + * diropargs to; + * }; + */ +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_linkargs *args) +{ + encode_fhandle(xdr, args->fromfh); + encode_diropargs(xdr, args->tofh, args->toname, args->tolen); +} + +/* + * 2.2.14. symlinkargs + * + * struct symlinkargs { + * diropargs from; + * path to; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_symlinkargs *args) +{ + encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); + encode_path(xdr, args->pages, args->pathlen); + encode_sattr(xdr, args->sattr); +} + +/* + * 2.2.17. readdirargs + * + * struct readdirargs { + * fhandle dir; + * nfscookie cookie; + * unsigned count; + * }; + */ +static void encode_readdirargs(struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(args->cookie); + *p = cpu_to_be32(args->count); +} + +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + encode_readdirargs(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS_readdirres_sz); +} + +/* + * NFSv2 XDR decode functions + * + * NFSv2 result types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + return decode_attrstat(xdr, result, NULL); +} + +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_diropok *result) +{ + return decode_diropres(xdr, result); +} + +/* + * 2.2.6. readlinkres + * + * union readlinkres switch (stat status) { + * case NFS_OK: + * path data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_path(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.2.7. readres + * + * union readres switch (stat status) { + * case NFS_OK: + * fattr attributes; + * nfsdata data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + result->op_status = status; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_nfsdata(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + /* All NFSv2 writes are "file sync" writes */ + result->verf->committed = NFS_FILE_SYNC; + return decode_attrstat(xdr, result->fattr, &result->op_status); +} + +/** + * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 2.2.17. entry + * + * struct entry { + * unsigned fileid; + * filename name; + * nfscookie cookie; + * entry *nextentry; + * }; + */ +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->ino = be32_to_cpup(p); + + error = decode_filename_inline(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + /* + * The type (size and byte order) of nfscookie isn't defined in + * RFC 1094. This implementation assumes that it's an XDR uint32. + */ + entry->prev_cookie = entry->cookie; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->cookie = be32_to_cpup(p); + + entry->d_type = DT_UNKNOWN; + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +} + +/* + * 2.2.17. readdirres + * + * union readdirres switch (stat status) { + * case NFS_OK: + * struct { + * entry *entries; + * bool eof; + * } readdirok; + * default: + * void; + * }; + * + * Read the directory contents into the page cache, but don't + * touch them. The actual decoding is done by nfs2_decode_dirent() + * during subsequent nfs_readdir() calls. + */ +static int decode_readdirok(struct xdr_stream *xdr) +{ + return xdr_read_pages(xdr, xdr->buf->page_len); +} + +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_readdirok(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.2.18. statfsres + * + * union statfsres (stat status) { + * case NFS_OK: + * struct { + * unsigned tsize; + * unsigned bsize; + * unsigned blocks; + * unsigned bfree; + * unsigned bavail; + * } info; + * default: + * void; + * }; + */ +static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_info_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + result->tsize = be32_to_cpup(p++); + result->bsize = be32_to_cpup(p++); + result->blocks = be32_to_cpup(p++); + result->bfree = be32_to_cpup(p++); + result->bavail = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs2_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_info(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static const struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS_OK, 0 }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, +#ifdef EWFLUSH + { NFSERR_WFLUSH, -EWFLUSH }, +#endif + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } +}; + +/** + * nfs_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. + */ +static int nfs_stat_to_errno(enum nfs_stat status) +{ + int i; + + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == (int)status) + return nfs_errtbl[i].errno; + } + dprintk("NFS: Unrecognized nfs status value: %u\n", status); + return nfs_errtbl[i].errno; +} + +#define PROC(proc, argtype, restype, timer) \ +[NFSPROC_##proc] = { \ + .p_proc = NFSPROC_##proc, \ + .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \ + .p_arglen = NFS_##argtype##_sz, \ + .p_replen = NFS_##restype##_sz, \ + .p_timer = timer, \ + .p_statidx = NFSPROC_##proc, \ + .p_name = #proc, \ + } +struct rpc_procinfo nfs_procedures[] = { + PROC(GETATTR, fhandle, attrstat, 1), + PROC(SETATTR, sattrargs, attrstat, 0), + PROC(LOOKUP, diropargs, diropres, 2), + PROC(READLINK, readlinkargs, readlinkres, 3), + PROC(READ, readargs, readres, 3), + PROC(WRITE, writeargs, writeres, 4), + PROC(CREATE, createargs, diropres, 0), + PROC(REMOVE, removeargs, stat, 0), + PROC(RENAME, renameargs, stat, 0), + PROC(LINK, linkargs, stat, 0), + PROC(SYMLINK, symlinkargs, stat, 0), + PROC(MKDIR, createargs, diropres, 0), + PROC(RMDIR, diropargs, stat, 0), + PROC(READDIR, readdirargs, readdirres, 3), + PROC(STATFS, fhandle, statfsres, 0), +}; + +const struct rpc_version nfs_version2 = { + .number = 2, + .nrprocs = ARRAY_SIZE(nfs_procedures), + .procs = nfs_procedures +}; diff --git a/kernel/fs/nfs/nfs3_fs.h b/kernel/fs/nfs/nfs3_fs.h new file mode 100644 index 000000000..e134d6548 --- /dev/null +++ b/kernel/fs/nfs/nfs3_fs.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2014 Anna Schumaker. + * + * NFSv3-specific filesystem definitions and declarations + */ +#ifndef __LINUX_FS_NFS_NFS3_FS_H +#define __LINUX_FS_NFS_NFS3_FS_H + +/* + * nfs3acl.c + */ +#ifdef CONFIG_NFS_V3_ACL +extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); +extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl); +extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); +extern const struct xattr_handler *nfs3_xattr_handlers[]; +#else +static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + return 0; +} +#define nfs3_listxattr NULL +#endif /* CONFIG_NFS_V3_ACL */ + +/* nfs3client.c */ +struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); +struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); + +/* nfs3super.c */ +extern struct nfs_subversion nfs_v3; + +#endif /* __LINUX_FS_NFS_NFS3_FS_H */ diff --git a/kernel/fs/nfs/nfs3acl.c b/kernel/fs/nfs/nfs3acl.c new file mode 100644 index 000000000..1ebe2fc7c --- /dev/null +++ b/kernel/fs/nfs/nfs3acl.c @@ -0,0 +1,296 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "nfs3_fs.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +struct posix_acl *nfs3_get_acl(struct inode *inode, int type) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_getaclargs args = { + .fh = NFS_FH(inode), + /* The xdr layer may allocate pages here. */ + .pages = pages, + }; + struct nfs3_getaclres res = { + NULL, + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status, count; + + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + return ERR_PTR(-EOPNOTSUPP); + + status = nfs_revalidate_inode(server, inode); + if (status < 0) + return ERR_PTR(status); + + /* + * Only get the access acl when explicitly requested: We don't + * need it for access decisions, and only some applications use + * it. Applications which request the access acl first are not + * penalized from this optimization. + */ + if (type == ACL_TYPE_ACCESS) + args.mask |= NFS_ACLCNT|NFS_ACL; + if (S_ISDIR(inode->i_mode)) + args.mask |= NFS_DFACLCNT|NFS_DFACL; + if (args.mask == 0) + return NULL; + + dprintk("NFS call getacl\n"); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return ERR_PTR(-ENOMEM); + + status = rpc_call_sync(server->client_acl, &msg, 0); + dprintk("NFS reply getacl: %d\n", status); + + /* pages may have been allocated at the xdr layer. */ + for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) + __free_page(args.pages[count]); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, res.fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL extension not supported; disabling\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + default: + goto getout; + } + if ((args.mask & res.mask) != args.mask) { + status = -EIO; + goto getout; + } + + if (res.acl_access != NULL) { + if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) || + res.acl_access->a_count == 0) { + posix_acl_release(res.acl_access); + res.acl_access = NULL; + } + } + + if (res.mask & NFS_ACL) + set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); + else + forget_cached_acl(inode, ACL_TYPE_ACCESS); + + if (res.mask & NFS_DFACL) + set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); + else + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + nfs_free_fattr(res.fattr); + if (type == ACL_TYPE_ACCESS) { + posix_acl_release(res.acl_default); + return res.acl_access; + } else { + posix_acl_release(res.acl_access); + return res.acl_default; + } + +getout: + posix_acl_release(res.acl_access); + posix_acl_release(res.acl_default); + nfs_free_fattr(res.fattr); + return ERR_PTR(status); +} + +static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr *fattr; + struct page *pages[NFSACL_MAXPAGES]; + struct nfs3_setaclargs args = { + .inode = inode, + .mask = NFS_ACL, + .acl_access = acl, + .pages = pages, + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &fattr, + }; + int status = 0; + + if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL)) + goto out; + + status = -EOPNOTSUPP; + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + goto out; + + /* We are doing this here because XDR marshalling does not + * return any results, it BUGs. */ + status = -ENOSPC; + if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (S_ISDIR(inode->i_mode)) { + args.mask |= NFS_DFACL; + args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); + } + + dprintk("NFS call setacl\n"); + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out_freepages; + + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + msg.rpc_resp = fattr; + status = rpc_call_sync(server->client_acl, &msg, 0); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); + dprintk("NFS reply setacl: %d\n", status); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, fattr); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL SETACL RPC not supported" + "(will not retry)\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + } + nfs_free_fattr(fattr); +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } +out: + return status; +} + +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + int ret; + ret = __nfs3_proc_setacls(inode, acl, dfacl); + return (ret == -EOPNOTSUPP) ? 0 : ret; + +} + +int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + struct posix_acl *alloc = NULL, *dfacl = NULL; + int status; + + if (S_ISDIR(inode->i_mode)) { + switch(type) { + case ACL_TYPE_ACCESS: + alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; + } + } + + if (acl == NULL) { + alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(alloc)) + goto fail; + } + status = __nfs3_proc_setacls(inode, acl, dfacl); + posix_acl_release(alloc); + return status; + +fail: + return PTR_ERR(alloc); +} + +const struct xattr_handler *nfs3_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + NULL, +}; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, + size_t size, ssize_t *result) +{ + struct posix_acl *acl; + char *p = data + *result; + + acl = get_acl(inode, type); + if (IS_ERR_OR_NULL(acl)) + return 0; + + posix_acl_release(acl); + + *result += strlen(name); + *result += 1; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct inode *inode = d_inode(dentry); + ssize_t result = 0; + int error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, + POSIX_ACL_XATTR_ACCESS, data, size, &result); + if (error) + return error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, + POSIX_ACL_XATTR_DEFAULT, data, size, &result); + if (error) + return error; + return result; +} diff --git a/kernel/fs/nfs/nfs3client.c b/kernel/fs/nfs/nfs3client.c new file mode 100644 index 000000000..9e9fa347a --- /dev/null +++ b/kernel/fs/nfs/nfs3client.c @@ -0,0 +1,107 @@ +#include +#include +#include +#include "internal.h" +#include "nfs3_fs.h" + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static const struct rpc_version *nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +const struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = ARRAY_SIZE(nfsacl_version), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; + +/* + * Initialise an NFSv3 ACL client connection + */ +static void nfs_init_server_aclclient(struct nfs_server *server) +{ + if (server->flags & NFS_MOUNT_NOACL) + goto out_noacl; + + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + if (IS_ERR(server->client_acl)) + goto out_noacl; + + /* No errors! Assume that Sun nfsacls are supported */ + server->caps |= NFS_CAP_ACLS; + return; + +out_noacl: + server->caps &= ~NFS_CAP_ACLS; +} +#else +static inline void nfs_init_server_aclclient(struct nfs_server *server) +{ + server->flags &= ~NFS_MOUNT_NOACL; + server->caps &= ~NFS_CAP_ACLS; +} +#endif + +struct nfs_server *nfs3_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server = nfs_create_server(mount_info, nfs_mod); + /* Create a client RPC handle for the NFS v3 ACL management interface */ + if (!IS_ERR(server)) + nfs_init_server_aclclient(server); + return server; +} + +struct nfs_server *nfs3_clone_server(struct nfs_server *source, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t flavor) +{ + struct nfs_server *server = nfs_clone_server(source, fh, fattr, flavor); + if (!IS_ERR(server) && !IS_ERR(source->client_acl)) + nfs_init_server_aclclient(server); + return server; +} + +/* + * Set up a pNFS Data Server client over NFSv3. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, + rpc_authflavor_t au_flavor) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .nfs_mod = &nfs_v3, + .proto = ds_proto, + .net = mds_clp->cl_net, + }; + struct rpc_timeout ds_timeout; + struct nfs_client *clp; + char buf[INET6_ADDRSTRLEN + 1]; + + /* fake a hostname because lockd wants it */ + if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + return ERR_PTR(-EINVAL); + cl_init.hostname = buf; + + /* Use the MDS nfs_client cl_ipaddr. */ + nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + au_flavor); + + return clp; +} +EXPORT_SYMBOL_GPL(nfs3_set_ds_client); diff --git a/kernel/fs/nfs/nfs3proc.c b/kernel/fs/nfs/nfs3proc.c new file mode 100644 index 000000000..cb28cceef --- /dev/null +++ b/kernel/fs/nfs/nfs3proc.c @@ -0,0 +1,974 @@ +/* + * linux/fs/nfs/nfs3proc.c + * + * Client-side NFSv3 procedures stubs. + * + * Copyright (C) 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iostat.h" +#include "internal.h" +#include "nfs3_fs.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +/* A wrapper to handle the EJUKEBOX error messages */ +static int +nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + int res; + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EJUKEBOX) + break; + freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!fatal_signal_pending(current)); + return res; +} + +#define rpc_call_sync(clnt, msg, flags) nfs3_rpc_wrapper(clnt, msg, flags) + +static int +nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) +{ + if (task->tk_status != -EJUKEBOX) + return 0; + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(inode, NFSIOS_DELAY); + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +static int +do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("%s: call fsinfo\n", __func__); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply fsinfo: %d\n", __func__, status); + if (status == 0 && !(info->fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_resp = info->fattr; + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply getattr: %d\n", __func__, status); + } + return status; +} + +/* + * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb + */ +static int +nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_get_root(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + +/* + * One function for each procedure in the NFS protocol. + */ +static int +nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call getattr\n"); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply getattr: %d\n", status); + return status; +} + +static int +nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = d_inode(dentry); + struct nfs3_sattrargs arg = { + .fh = NFS_FH(inode), + .sattr = sattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr, fattr); + dprintk("NFS reply setattr: %d\n", status); + return status; +} + +static int +nfs3_proc_lookup(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs3_diropres res = { + .fh = fhandle, + .fattr = fattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + + dprintk("NFS call lookup %s\n", name->name); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + return -ENOMEM; + + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_refresh_inode(dir, res.dir_attr); + if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_argp = fhandle; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + nfs_free_fattr(res.dir_attr); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs3_accessargs arg = { + .fh = NFS_FH(inode), + }; + struct nfs3_accessres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = entry->cred, + }; + int mode = entry->mask; + int status = -ENOMEM; + + dprintk("NFS call access\n"); + + if (mode & MAY_READ) + arg.access |= NFS3_ACCESS_READ; + if (S_ISDIR(inode->i_mode)) { + if (mode & MAY_WRITE) + arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE; + if (mode & MAY_EXEC) + arg.access |= NFS3_ACCESS_LOOKUP; + } else { + if (mode & MAY_WRITE) + arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND; + if (mode & MAY_EXEC) + arg.access |= NFS3_ACCESS_EXECUTE; + } + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, res.fattr); + if (status == 0) { + entry->mask = 0; + if (res.access & NFS3_ACCESS_READ) + entry->mask |= MAY_READ; + if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } + nfs_free_fattr(res.fattr); +out: + dprintk("NFS reply access: %d\n", status); + return status; +} + +static int nfs3_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs_fattr *fattr; + struct nfs3_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], + .rpc_argp = &args, + }; + int status = -ENOMEM; + + dprintk("NFS call readlink\n"); + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + msg.rpc_resp = fattr; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: + dprintk("NFS reply readlink: %d\n", status); + return status; +} + +struct nfs3_createdata { + struct rpc_message msg; + union { + struct nfs3_createargs create; + struct nfs3_mkdirargs mkdir; + struct nfs3_symlinkargs symlink; + struct nfs3_mknodargs mknod; + } arg; + struct nfs3_diropres res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_attr; +}; + +static struct nfs3_createdata *nfs3_alloc_createdata(void) +{ + struct nfs3_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_attr = &data->dir_attr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_attr); + } + return data; +} + +static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +{ + int status; + + status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + nfs_post_op_update_inode(dir, data->res.dir_attr); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + return status; +} + +static void nfs3_free_createdata(struct nfs3_createdata *data) +{ + kfree(data); +} + +/* + * Create a regular file. + */ +static int +nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags) +{ + struct posix_acl *default_acl, *acl; + struct nfs3_createdata *data; + int status = -ENOMEM; + + dprintk("NFS call create %pd\n", dentry); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE]; + data->arg.create.fh = NFS_FH(dir); + data->arg.create.name = dentry->d_name.name; + data->arg.create.len = dentry->d_name.len; + data->arg.create.sattr = sattr; + + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; + if (flags & O_EXCL) { + data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; + data->arg.create.verifier[0] = cpu_to_be32(jiffies); + data->arg.create.verifier[1] = cpu_to_be32(current->pid); + } + + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + + for (;;) { + status = nfs3_do_create(dir, dentry, data); + + if (status != -ENOTSUPP) + break; + /* If the server doesn't support the exclusive creation + * semantics, try again with simple 'guarded' mode. */ + switch (data->arg.create.createmode) { + case NFS3_CREATE_EXCLUSIVE: + data->arg.create.createmode = NFS3_CREATE_GUARDED; + break; + + case NFS3_CREATE_GUARDED: + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; + break; + + case NFS3_CREATE_UNCHECKED: + goto out; + } + nfs_fattr_init(data->res.dir_attr); + nfs_fattr_init(data->res.fattr); + } + + if (status != 0) + goto out_release_acls; + + /* When we created the file with exclusive semantics, make + * sure we set the attributes afterwards. */ + if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { + dprintk("NFS call setattr (post-create)\n"); + + if (!(sattr->ia_valid & ATTR_ATIME_SET)) + sattr->ia_valid |= ATTR_ATIME; + if (!(sattr->ia_valid & ATTR_MTIME_SET)) + sattr->ia_valid |= ATTR_MTIME; + + /* Note: we could use a guarded setattr here, but I'm + * not sure this buys us anything (and I'd have + * to revamp the NFSv3 XDR code) */ + status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); + nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); + dprintk("NFS reply setattr (post-create): %d\n", status); + if (status != 0) + goto out_release_acls; + } + + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); +out: + nfs3_free_createdata(data); + dprintk("NFS reply create: %d\n", status); + return status; +} + +static int +nfs3_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_removeargs arg = { + .fh = NFS_FH(dir), + .name = *name, + }; + struct nfs_removeres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + dprintk("NFS call remove %s\n", name->name); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_free_fattr(res.dir_attr); +out: + dprintk("NFS reply remove: %d\n", status); + return status; +} + +static void +nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; +} + +static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + rpc_call_start(task); +} + +static int +nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + struct nfs_removeres *res; + if (nfs3_async_handle_jukebox(task, dir)) + return 0; + res = task->tk_msg.rpc_resp; + nfs_post_op_update_inode(dir, res->dir_attr); + return 1; +} + +static void +nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; +} + +static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ + rpc_call_start(task); +} + +static int +nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res; + + if (nfs3_async_handle_jukebox(task, old_dir)) + return 0; + res = task->tk_msg.rpc_resp; + + nfs_post_op_update_inode(old_dir, res->old_fattr); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + +static int +nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs3_linkargs arg = { + .fromfh = NFS_FH(inode), + .tofh = NFS_FH(dir), + .toname = name->name, + .tolen = name->len + }; + struct nfs3_linkres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + dprintk("NFS call link %s\n", name->name); + res.fattr = nfs_alloc_fattr(); + res.dir_attr = nfs_alloc_fattr(); + if (res.fattr == NULL || res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); +out: + nfs_free_fattr(res.dir_attr); + nfs_free_fattr(res.fattr); + dprintk("NFS reply link: %d\n", status); + return status; +} + +static int +nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) +{ + struct nfs3_createdata *data; + int status = -ENOMEM; + + if (len > NFS3_MAXPATHLEN) + return -ENAMETOOLONG; + + dprintk("NFS call symlink %pd\n", dentry); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK]; + data->arg.symlink.fromfh = NFS_FH(dir); + data->arg.symlink.fromname = dentry->d_name.name; + data->arg.symlink.fromlen = dentry->d_name.len; + data->arg.symlink.pages = &page; + data->arg.symlink.pathlen = len; + data->arg.symlink.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + + nfs3_free_createdata(data); +out: + dprintk("NFS reply symlink: %d\n", status); + return status; +} + +static int +nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) +{ + struct posix_acl *default_acl, *acl; + struct nfs3_createdata *data; + int status = -ENOMEM; + + dprintk("NFS call mkdir %pd\n", dentry); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; + data->arg.mkdir.fh = NFS_FH(dir); + data->arg.mkdir.name = dentry->d_name.name; + data->arg.mkdir.len = dentry->d_name.len; + data->arg.mkdir.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + if (status != 0) + goto out_release_acls; + + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); +out: + nfs3_free_createdata(data); + dprintk("NFS reply mkdir: %d\n", status); + return status; +} + +static int +nfs3_proc_rmdir(struct inode *dir, struct qstr *name) +{ + struct nfs_fattr *dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], + .rpc_argp = &arg, + }; + int status = -ENOMEM; + + dprintk("NFS call rmdir %s\n", name->name); + dir_attr = nfs_alloc_fattr(); + if (dir_attr == NULL) + goto out; + + msg.rpc_resp = dir_attr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_post_op_update_inode(dir, dir_attr); + nfs_free_fattr(dir_attr); +out: + dprintk("NFS reply rmdir: %d\n", status); + return status; +} + +/* + * The READDIR implementation is somewhat hackish - we pass the user buffer + * to the encode function, which installs it in the receive iovec. + * The decode function itself doesn't perform any decoding, it just makes + * sure the reply is syntactically correct. + * + * Also note that this implementation handles both plain readdir and + * readdirplus. + */ +static int +nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = d_inode(dentry); + __be32 *verf = NFS_I(dir)->cookieverf; + struct nfs3_readdirargs arg = { + .fh = NFS_FH(dir), + .cookie = cookie, + .verf = {verf[0], verf[1]}, + .plus = plus, + .count = count, + .pages = pages + }; + struct nfs3_readdirres res = { + .verf = verf, + .plus = plus + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred + }; + int status = -ENOMEM; + + if (plus) + msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + + dprintk("NFS call readdir%s %d\n", + plus? "plus" : "", (unsigned int) cookie); + + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); + nfs_refresh_inode(dir, res.dir_attr); + + nfs_free_fattr(res.dir_attr); +out: + dprintk("NFS reply readdir%s: %d\n", + plus? "plus" : "", status); + return status; +} + +static int +nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) +{ + struct posix_acl *default_acl, *acl; + struct nfs3_createdata *data; + int status = -ENOMEM; + + dprintk("NFS call mknod %pd %u:%u\n", dentry, + MAJOR(rdev), MINOR(rdev)); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; + data->arg.mknod.fh = NFS_FH(dir); + data->arg.mknod.name = dentry->d_name.name; + data->arg.mknod.len = dentry->d_name.len; + data->arg.mknod.sattr = sattr; + data->arg.mknod.rdev = rdev; + + switch (sattr->ia_mode & S_IFMT) { + case S_IFBLK: + data->arg.mknod.type = NF3BLK; + break; + case S_IFCHR: + data->arg.mknod.type = NF3CHR; + break; + case S_IFIFO: + data->arg.mknod.type = NF3FIFO; + break; + case S_IFSOCK: + data->arg.mknod.type = NF3SOCK; + break; + default: + status = -EINVAL; + goto out; + } + + status = nfs3_do_create(dir, dentry, data); + if (status != 0) + goto out_release_acls; + + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); +out: + nfs3_free_createdata(data); + dprintk("NFS reply mknod: %d\n", status); + return status; +} + +static int +nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *stat) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSSTAT], + .rpc_argp = fhandle, + .rpc_resp = stat, + }; + int status; + + dprintk("NFS call fsstat\n"); + nfs_fattr_init(stat->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply fsstat: %d\n", status); + return status; +} + +static int +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("NFS call fsinfo\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(client, &msg, 0); + dprintk("NFS reply fsinfo: %d\n", status); + return status; +} + +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + +static int +nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_PATHCONF], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("NFS call pathconf\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply pathconf: %d\n", status); + return status; +} + +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ + struct inode *inode = hdr->inode; + + if (hdr->pgio_done_cb != NULL) + return hdr->pgio_done_cb(task, hdr); + + if (nfs3_async_handle_jukebox(task, inode)) + return -EAGAIN; + + nfs_invalidate_atime(inode); + nfs_refresh_inode(inode, &hdr->fattr); + return 0; +} + +static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; +} + +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + rpc_call_start(task); + return 0; +} + +static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ + struct inode *inode = hdr->inode; + + if (hdr->pgio_done_cb != NULL) + return hdr->pgio_done_cb(task, hdr); + + if (nfs3_async_handle_jukebox(task, inode)) + return -EAGAIN; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; +} + +static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; +} + +static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ + rpc_call_start(task); +} + +static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data) +{ + if (data->commit_done_cb != NULL) + return data->commit_done_cb(task, data); + + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; + nfs_refresh_inode(data->inode, data->res.fattr); + return 0; +} + +static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; +} + +static int +nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(filp); + + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); +} + +static int nfs3_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static int nfs3_return_delegation(struct inode *inode) +{ + nfs_wb_all(inode); + return 0; +} + +static const struct inode_operations nfs3_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL + .listxattr = nfs3_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif +}; + +static const struct inode_operations nfs3_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL + .listxattr = nfs3_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif +}; + +const struct nfs_rpc_ops nfs_v3_clientops = { + .version = 3, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .submount = nfs_submount, + .try_mount = nfs_try_mount, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, + .lookup = nfs3_proc_lookup, + .access = nfs3_proc_access, + .readlink = nfs3_proc_readlink, + .create = nfs3_proc_create, + .remove = nfs3_proc_remove, + .unlink_setup = nfs3_proc_unlink_setup, + .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare, + .unlink_done = nfs3_proc_unlink_done, + .rename_setup = nfs3_proc_rename_setup, + .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare, + .rename_done = nfs3_proc_rename_done, + .link = nfs3_proc_link, + .symlink = nfs3_proc_symlink, + .mkdir = nfs3_proc_mkdir, + .rmdir = nfs3_proc_rmdir, + .readdir = nfs3_proc_readdir, + .mknod = nfs3_proc_mknod, + .statfs = nfs3_proc_statfs, + .fsinfo = nfs3_proc_fsinfo, + .pathconf = nfs3_proc_pathconf, + .decode_dirent = nfs3_decode_dirent, + .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare, + .read_setup = nfs3_proc_read_setup, + .read_done = nfs3_read_done, + .write_setup = nfs3_proc_write_setup, + .write_done = nfs3_write_done, + .commit_setup = nfs3_proc_commit_setup, + .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, + .commit_done = nfs3_commit_done, + .lock = nfs3_proc_lock, + .clear_acl_cache = forget_all_cached_acls, + .close_context = nfs_close_context, + .have_delegation = nfs3_have_delegation, + .return_delegation = nfs3_return_delegation, + .alloc_client = nfs_alloc_client, + .init_client = nfs_init_client, + .free_client = nfs_free_client, + .create_server = nfs3_create_server, + .clone_server = nfs3_clone_server, +}; diff --git a/kernel/fs/nfs/nfs3super.c b/kernel/fs/nfs/nfs3super.c new file mode 100644 index 000000000..5c4394e46 --- /dev/null +++ b/kernel/fs/nfs/nfs3super.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + */ +#include +#include +#include "internal.h" +#include "nfs3_fs.h" +#include "nfs.h" + +struct nfs_subversion nfs_v3 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs_fs_type, + .rpc_vers = &nfs_version3, + .rpc_ops = &nfs_v3_clientops, + .sops = &nfs_sops, +#ifdef CONFIG_NFS_V3_ACL + .xattr = nfs3_xattr_handlers, +#endif +}; + +static int __init init_nfs_v3(void) +{ + register_nfs_version(&nfs_v3); + return 0; +} + +static void __exit exit_nfs_v3(void) +{ + unregister_nfs_version(&nfs_v3); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v3); +module_exit(exit_nfs_v3); diff --git a/kernel/fs/nfs/nfs3xdr.c b/kernel/fs/nfs/nfs3xdr.c new file mode 100644 index 000000000..53852a4bd --- /dev/null +++ b/kernel/fs/nfs/nfs3xdr.c @@ -0,0 +1,2564 @@ +/* + * linux/fs/nfs/nfs3xdr.c + * + * XDR functions to encode/decode NFSv3 RPC arguments and results. + * + * Copyright (C) 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +/* + * Declare the space requirements for NFS arguments and replies as + * number of 32bit-words + */ +#define NFS3_fhandle_sz (1+16) +#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ +#define NFS3_sattr_sz (15) +#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) +#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) +#define NFS3_fattr_sz (21) +#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2) +#define NFS3_wcc_attr_sz (6) +#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) +#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) +#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) +#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) + +#define NFS3_getattrargs_sz (NFS3_fh_sz) +#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz) +#define NFS3_accessargs_sz (NFS3_fh_sz+1) +#define NFS3_readlinkargs_sz (NFS3_fh_sz) +#define NFS3_readargs_sz (NFS3_fh_sz+3) +#define NFS3_writeargs_sz (NFS3_fh_sz+5) +#define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) +#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) +#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) +#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) +#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) +#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) +#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) +#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3) +#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4) +#define NFS3_commitargs_sz (NFS3_fh_sz+3) + +#define NFS3_getattrres_sz (1+NFS3_fattr_sz) +#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz) +#define NFS3_removeres_sz (NFS3_setattrres_sz) +#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) +#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3) +#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) +#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) +#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2) +#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) +#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) +#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) +#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) + +#define ACL3_getaclargs_sz (NFS3_fh_sz+1) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) + +static int nfs3_stat_to_errno(enum nfs_stat); + +/* + * Map file type to S_IFMT bits + */ +static const umode_t nfs_type2fmt[] = { + [NF3BAD] = 0, + [NF3REG] = S_IFREG, + [NF3DIR] = S_IFDIR, + [NF3BLK] = S_IFBLK, + [NF3CHR] = S_IFCHR, + [NF3LNK] = S_IFLNK, + [NF3SOCK] = S_IFSOCK, + [NF3FIFO] = S_IFIFO, +}; + +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv3 basic data types + * + * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: + * "NFS Version 3 Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_uint32(struct xdr_stream *xdr, u32 value) +{ + __be32 *p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +static int decode_uint32(struct xdr_stream *xdr, u32 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *value = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_uint64(struct xdr_stream *xdr, u64 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + goto out_overflow; + xdr_decode_hyper(p, value); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * fileid3 + * + * typedef uint64 fileid3; + */ +static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid) +{ + return xdr_decode_hyper(p, fileid); +} + +static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid) +{ + return decode_uint64(xdr, fileid); +} + +/* + * filename3 + * + * typedef string filename3<>; + */ +static void encode_filename3(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + WARN_ON_ONCE(length > NFS3_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_inline_filename3(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; + +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * nfspath3 + * + * typedef string nfspath3<>; + */ +static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, + const u32 length) +{ + encode_uint32(xdr, length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_nfspath3(struct xdr_stream *xdr) +{ + u32 recvd, count; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) + goto out_nametoolong; + recvd = xdr_read_pages(xdr, count); + if (unlikely(count > recvd)) + goto out_cheating; + xdr_terminate_string(xdr->buf, count); + return 0; + +out_nametoolong: + dprintk("NFS: returned pathname too long: %u\n", count); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "count %u > recvd %u\n", count, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * cookie3 + * + * typedef uint64 cookie3 + */ +static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie) +{ + return xdr_encode_hyper(p, cookie); +} + +static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie) +{ + return decode_uint64(xdr, cookie); +} + +/* + * cookieverf3 + * + * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE]; + */ +static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) +{ + memcpy(p, verifier, NFS3_COOKIEVERFSIZE); + return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); +} + +static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_COOKIEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * createverf3 + * + * typedef opaque createverf3[NFS3_CREATEVERFSIZE]; + */ +static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE); + memcpy(p, verifier, NFS3_CREATEVERFSIZE); +} + +static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier->data, p, NFS3_WRITEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * size3 + * + * typedef uint64 size3; + */ +static __be32 *xdr_decode_size3(__be32 *p, u64 *size) +{ + return xdr_decode_hyper(p, size); +} + +/* + * nfsstat3 + * + * enum nfsstat3 { + * NFS3_OK = 0, + * ... + * } + */ +#define NFS3_OK NFS_OK + +static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * ftype3 + * + * enum ftype3 { + * NF3REG = 1, + * NF3DIR = 2, + * NF3BLK = 3, + * NF3CHR = 4, + * NF3LNK = 5, + * NF3SOCK = 6, + * NF3FIFO = 7 + * }; + */ +static void encode_ftype3(struct xdr_stream *xdr, const u32 type) +{ + encode_uint32(xdr, type); +} + +static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode) +{ + u32 type; + + type = be32_to_cpup(p++); + if (type > NF3FIFO) + type = NF3NON; + *mode = nfs_type2fmt[type]; + return p; +} + +/* + * specdata3 + * + * struct specdata3 { + * uint32 specdata1; + * uint32 specdata2; + * }; + */ +static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(MAJOR(rdev)); + *p = cpu_to_be32(MINOR(rdev)); +} + +static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev) +{ + unsigned int major, minor; + + major = be32_to_cpup(p++); + minor = be32_to_cpup(p++); + *rdev = MKDEV(major, minor); + if (MAJOR(*rdev) != major || MINOR(*rdev) != minor) + *rdev = 0; + return p; +} + +/* + * nfs_fh3 + * + * struct nfs_fh3 { + * opaque data; + * }; + */ +static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + WARN_ON_ONCE(fh->size > NFS3_FHSIZE); + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); +} + +static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > NFS3_FHSIZE)) + goto out_toobig; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = length; + memcpy(fh->data, p, length); + return 0; +out_toobig: + dprintk("NFS: file handle size (%u) too big\n", length); + return -E2BIG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static void zero_nfs_fh3(struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(*fh)); +} + +/* + * nfstime3 + * + * struct nfstime3 { + * uint32 seconds; + * uint32 nseconds; + * }; + */ +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(timep->tv_nsec); + return p; +} + +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++); + return p; +} + +/* + * sattr3 + * + * enum time_how { + * DONT_CHANGE = 0, + * SET_TO_SERVER_TIME = 1, + * SET_TO_CLIENT_TIME = 2 + * }; + * + * union set_mode3 switch (bool set_it) { + * case TRUE: + * mode3 mode; + * default: + * void; + * }; + * + * union set_uid3 switch (bool set_it) { + * case TRUE: + * uid3 uid; + * default: + * void; + * }; + * + * union set_gid3 switch (bool set_it) { + * case TRUE: + * gid3 gid; + * default: + * void; + * }; + * + * union set_size3 switch (bool set_it) { + * case TRUE: + * size3 size; + * default: + * void; + * }; + * + * union set_atime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 atime; + * default: + * void; + * }; + * + * union set_mtime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 mtime; + * default: + * void; + * }; + * + * struct sattr3 { + * set_mode3 mode; + * set_uid3 uid; + * set_gid3 gid; + * set_size3 size; + * set_atime atime; + * set_mtime mtime; + * }; + */ +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +{ + u32 nbytes; + __be32 *p; + + /* + * In order to make only a single xdr_reserve_space() call, + * pre-compute the total number of bytes to be reserved. + * Six boolean values, one for each set_foo field, are always + * present in the encoded result, so start there. + */ + nbytes = 6 * 4; + if (attr->ia_valid & ATTR_MODE) + nbytes += 4; + if (attr->ia_valid & ATTR_UID) + nbytes += 4; + if (attr->ia_valid & ATTR_GID) + nbytes += 4; + if (attr->ia_valid & ATTR_SIZE) + nbytes += 8; + if (attr->ia_valid & ATTR_ATIME_SET) + nbytes += 8; + if (attr->ia_valid & ATTR_MTIME_SET) + nbytes += 8; + p = xdr_reserve_space(xdr, nbytes); + + if (attr->ia_valid & ATTR_MODE) { + *p++ = xdr_one; + *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_UID) { + *p++ = xdr_one; + *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_GID) { + *p++ = xdr_one; + *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_SIZE) { + *p++ = xdr_one; + p = xdr_encode_hyper(p, (u64)attr->ia_size); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_ATIME_SET) { + *p++ = xdr_two; + p = xdr_encode_nfstime3(p, &attr->ia_atime); + } else if (attr->ia_valid & ATTR_ATIME) { + *p++ = xdr_one; + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_MTIME_SET) { + *p++ = xdr_two; + xdr_encode_nfstime3(p, &attr->ia_mtime); + } else if (attr->ia_valid & ATTR_MTIME) { + *p = xdr_one; + } else + *p = xdr_zero; +} + +/* + * fattr3 + * + * struct fattr3 { + * ftype3 type; + * mode3 mode; + * uint32 nlink; + * uid3 uid; + * gid3 gid; + * size3 size; + * size3 used; + * specdata3 rdev; + * uint64 fsid; + * fileid3 fileid; + * nfstime3 atime; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + umode_t fmode; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + p = xdr_decode_ftype3(p, &fmode); + + fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; + fattr->nlink = be32_to_cpup(p++); + fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + if (!uid_valid(fattr->uid)) + goto out_uid; + fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + if (!gid_valid(fattr->gid)) + goto out_gid; + + p = xdr_decode_size3(p, &fattr->size); + p = xdr_decode_size3(p, &fattr->du.nfs3.used); + p = xdr_decode_specdata3(p, &fattr->rdev); + + p = xdr_decode_hyper(p, &fattr->fsid.major); + fattr->fsid.minor = 0; + + p = xdr_decode_fileid3(p, &fattr->fileid); + p = xdr_decode_nfstime3(p, &fattr->atime); + p = xdr_decode_nfstime3(p, &fattr->mtime); + xdr_decode_nfstime3(p, &fattr->ctime); + fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); + + fattr->valid |= NFS_ATTR_FATTR_V3; + return 0; +out_uid: + dprintk("NFS: returned invalid uid\n"); + return -EINVAL; +out_gid: + dprintk("NFS: returned invalid gid\n"); + return -EINVAL; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * post_op_attr + * + * union post_op_attr switch (bool attributes_follow) { + * case TRUE: + * fattr3 attributes; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_fattr3(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * wcc_attr + * struct wcc_attr { + * size3 size; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME; + + p = xdr_decode_size3(p, &fattr->pre_size); + p = xdr_decode_nfstime3(p, &fattr->pre_mtime); + xdr_decode_nfstime3(p, &fattr->pre_ctime); + fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime); + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * pre_op_attr + * union pre_op_attr switch (bool attributes_follow) { + * case TRUE: + * wcc_attr attributes; + * case FALSE: + * void; + * }; + * + * wcc_data + * + * struct wcc_data { + * pre_op_attr before; + * post_op_attr after; + * }; + */ +static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_wcc_attr(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + int error; + + error = decode_pre_op_attr(xdr, fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, fattr); +out: + return error; +} + +/* + * post_op_fh3 + * + * union post_op_fh3 switch (bool handle_follows) { + * case TRUE: + * nfs_fh3 handle; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_nfs_fh3(xdr, fh); + zero_nfs_fh3(fh); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * diropargs3 + * + * struct diropargs3 { + * nfs_fh3 dir; + * filename3 name; + * }; + */ +static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) +{ + encode_nfs_fh3(xdr, fh); + encode_filename3(xdr, name, length); +} + + +/* + * NFSv3 XDR encode functions + * + * NFSv3 argument types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". + */ + +/* + * 3.3.1 GETATTR3args + * + * struct GETATTR3args { + * nfs_fh3 object; + * }; + */ +static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) +{ + encode_nfs_fh3(xdr, fh); +} + +/* + * 3.3.2 SETATTR3args + * + * union sattrguard3 switch (bool check) { + * case TRUE: + * nfstime3 obj_ctime; + * case FALSE: + * void; + * }; + * + * struct SETATTR3args { + * nfs_fh3 object; + * sattr3 new_attributes; + * sattrguard3 guard; + * }; + */ +static void encode_sattrguard3(struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + __be32 *p; + + if (args->guard) { + p = xdr_reserve_space(xdr, 4 + 8); + *p++ = xdr_one; + xdr_encode_nfstime3(p, &args->guardtime); + } else { + p = xdr_reserve_space(xdr, 4); + *p = xdr_zero; + } +} + +static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_sattr3(xdr, args->sattr); + encode_sattrguard3(xdr, args); +} + +/* + * 3.3.3 LOOKUP3args + * + * struct LOOKUP3args { + * diropargs3 what; + * }; + */ +static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_diropargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); +} + +/* + * 3.3.4 ACCESS3args + * + * struct ACCESS3args { + * nfs_fh3 object; + * uint32 access; + * }; + */ +static void encode_access3args(struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->access); +} + +static void nfs3_xdr_enc_access3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_access3args(xdr, args); +} + +/* + * 3.3.5 READLINK3args + * + * struct READLINK3args { + * nfs_fh3 symlink; + * }; + */ +static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readlinkargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); +} + +/* + * 3.3.6 READ3args + * + * struct READ3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; + */ +static void encode_read3args(struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + encode_read3args(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS3_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; +} + +/* + * 3.3.7 WRITE3args + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * stable_how stable; + * opaque data<>; + * }; + */ +static void encode_write3args(struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->count); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + encode_write3args(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} + +/* + * 3.3.8 CREATE3args + * + * enum createmode3 { + * UNCHECKED = 0, + * GUARDED = 1, + * EXCLUSIVE = 2 + * }; + * + * union createhow3 switch (createmode3 mode) { + * case UNCHECKED: + * case GUARDED: + * sattr3 obj_attributes; + * case EXCLUSIVE: + * createverf3 verf; + * }; + * + * struct CREATE3args { + * diropargs3 where; + * createhow3 how; + * }; + */ +static void encode_createhow3(struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + encode_sattr3(xdr, args->sattr); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); + break; + default: + BUG(); + } +} + +static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_createhow3(xdr, args); +} + +/* + * 3.3.9 MKDIR3args + * + * struct MKDIR3args { + * diropargs3 where; + * sattr3 attributes; + * }; + */ +static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mkdirargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_sattr3(xdr, args->sattr); +} + +/* + * 3.3.10 SYMLINK3args + * + * struct symlinkdata3 { + * sattr3 symlink_attributes; + * nfspath3 symlink_data; + * }; + * + * struct SYMLINK3args { + * diropargs3 where; + * symlinkdata3 symlink; + * }; + */ +static void encode_symlinkdata3(struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_nfspath3(xdr, args->pages, args->pathlen); +} + +static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); + encode_symlinkdata3(xdr, args); +} + +/* + * 3.3.11 MKNOD3args + * + * struct devicedata3 { + * sattr3 dev_attributes; + * specdata3 spec; + * }; + * + * union mknoddata3 switch (ftype3 type) { + * case NF3CHR: + * case NF3BLK: + * devicedata3 device; + * case NF3SOCK: + * case NF3FIFO: + * sattr3 pipe_attributes; + * default: + * void; + * }; + * + * struct MKNOD3args { + * diropargs3 where; + * mknoddata3 what; + * }; + */ +static void encode_devicedata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_specdata3(xdr, args->rdev); +} + +static void encode_mknoddata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: + encode_devicedata3(xdr, args); + break; + case NF3SOCK: + case NF3FIFO: + encode_sattr3(xdr, args->sattr); + break; + case NF3REG: + case NF3DIR: + break; + default: + BUG(); + } +} + +static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_mknoddata3(xdr, args); +} + +/* + * 3.3.12 REMOVE3args + * + * struct REMOVE3args { + * diropargs3 object; + * }; + */ +static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name.name, args->name.len); +} + +/* + * 3.3.14 RENAME3args + * + * struct RENAME3args { + * diropargs3 from; + * diropargs3 to; + * }; + */ +static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs3(xdr, args->old_dir, old->name, old->len); + encode_diropargs3(xdr, args->new_dir, new->name, new->len); +} + +/* + * 3.3.15 LINK3args + * + * struct LINK3args { + * nfs_fh3 file; + * diropargs3 link; + * }; + */ +static void nfs3_xdr_enc_link3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_linkargs *args) +{ + encode_nfs_fh3(xdr, args->fromfh); + encode_diropargs3(xdr, args->tofh, args->toname, args->tolen); +} + +/* + * 3.3.16 READDIR3args + * + * struct READDIR3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 count; + * }; + */ +static void encode_readdir3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdir3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); +} + +/* + * 3.3.17 READDIRPLUS3args + * + * struct READDIRPLUS3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 dircount; + * count3 maxcount; + * }; + */ +static void encode_readdirplus3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + + /* + * readdirplus: need dircount + buffer size. + * We just make sure we make dircount big enough + */ + *p++ = cpu_to_be32(args->count >> 3); + + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdirplus3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); +} + +/* + * 3.3.21 COMMIT3args + * + * struct COMMIT3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; + */ +static void encode_commit3args(struct xdr_stream *xdr, + const struct nfs_commitargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_commitargs *args) +{ + encode_commit3args(xdr, args); +} + +#ifdef CONFIG_NFS_V3_ACL + +static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_getaclargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->mask); + if (args->mask & (NFS_ACL | NFS_DFACL)) + prepare_reply_buffer(req, args->pages, 0, + NFSACL_MAXPAGES << PAGE_SHIFT, + ACL3_getaclres_sz); +} + +static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_setaclargs *args) +{ + unsigned int base; + int error; + + encode_nfs_fh3(xdr, NFS_FH(args->inode)); + encode_uint32(xdr, args->mask); + + base = req->rq_slen; + if (args->npages != 0) + xdr_write_pages(xdr, args->pages, 0, args->len); + else + xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); + + error = nfsacl_encode(xdr->buf, base, args->inode, + (args->mask & NFS_ACL) ? + args->acl_access : NULL, 1, 0); + /* FIXME: this is just broken */ + BUG_ON(error < 0); + error = nfsacl_encode(xdr->buf, base + error, args->inode, + (args->mask & NFS_DFACL) ? + args->acl_default : NULL, 1, + NFS_ACL_DEFAULT); + BUG_ON(error < 0); +} + +#endif /* CONFIG_NFS_V3_ACL */ + +/* + * NFSv3 XDR decode functions + * + * NFSv3 result types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". + */ + +/* + * 3.3.1 GETATTR3res + * + * struct GETATTR3resok { + * fattr3 obj_attributes; + * }; + * + * union GETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * GETATTR3resok resok; + * default: + * void; + * }; + */ +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_fattr3(xdr, result); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.2 SETATTR3res + * + * struct SETATTR3resok { + * wcc_data obj_wcc; + * }; + * + * struct SETATTR3resfail { + * wcc_data obj_wcc; + * }; + * + * union SETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * SETATTR3resok resok; + * default: + * SETATTR3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.3 LOOKUP3res + * + * struct LOOKUP3resok { + * nfs_fh3 object; + * post_op_attr obj_attributes; + * post_op_attr dir_attributes; + * }; + * + * struct LOOKUP3resfail { + * post_op_attr dir_attributes; + * }; + * + * union LOOKUP3res switch (nfsstat3 status) { + * case NFS3_OK: + * LOOKUP3resok resok; + * default: + * LOOKUP3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfs_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->dir_attr); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.4 ACCESS3res + * + * struct ACCESS3resok { + * post_op_attr obj_attributes; + * uint32 access; + * }; + * + * struct ACCESS3resfail { + * post_op_attr obj_attributes; + * }; + * + * union ACCESS3res switch (nfsstat3 status) { + * case NFS3_OK: + * ACCESS3resok resok; + * default: + * ACCESS3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_accessres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_uint32(xdr, &result->access); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.5 READLINK3res + * + * struct READLINK3resok { + * post_op_attr symlink_attributes; + * nfspath3 data; + * }; + * + * struct READLINK3resfail { + * post_op_attr symlink_attributes; + * }; + * + * union READLINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * READLINK3resok resok; + * default: + * READLINK3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfspath3(xdr); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.6 READ3res + * + * struct READ3resok { + * post_op_attr file_attributes; + * count3 count; + * bool eof; + * opaque data<>; + * }; + * + * struct READ3resfail { + * post_op_attr file_attributes; + * }; + * + * union READ3res switch (nfsstat3 status) { + * case NFS3_OK: + * READ3resok resok; + * default: + * READ3resfail resfail; + * }; + */ +static int decode_read3resok(struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + u32 eof, count, ocount, recvd; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p++); + eof = be32_to_cpup(p++); + ocount = be32_to_cpup(p++); + if (unlikely(ocount != count)) + goto out_mismatch; + recvd = xdr_read_pages(xdr, count); + if (unlikely(count > recvd)) + goto out_cheating; +out: + result->eof = eof; + result->count = count; + return count; +out_mismatch: + dprintk("NFS: READ count doesn't match length of opaque: " + "count %u != ocount %u\n", count, ocount); + return -EIO; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + result->op_status = status; + if (status != NFS3_OK) + goto out_status; + error = decode_read3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.7 WRITE3res + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3resok { + * wcc_data file_wcc; + * count3 count; + * stable_how committed; + * writeverf3 verf; + * }; + * + * struct WRITE3resfail { + * wcc_data file_wcc; + * }; + * + * union WRITE3res switch (nfsstat3 status) { + * case NFS3_OK: + * WRITE3resok resok; + * default: + * WRITE3resfail resfail; + * }; + */ +static int decode_write3resok(struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + result->count = be32_to_cpup(p++); + result->verf->committed = be32_to_cpup(p++); + if (unlikely(result->verf->committed > NFS_FILE_SYNC)) + goto out_badvalue; + if (decode_writeverf3(xdr, &result->verf->verifier)) + goto out_eio; + return result->count; +out_badvalue: + dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); +out_eio: + return -EIO; +} + +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + result->op_status = status; + if (status != NFS3_OK) + goto out_status; + error = decode_write3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.8 CREATE3res + * + * struct CREATE3resok { + * post_op_fh3 obj; + * post_op_attr obj_attributes; + * wcc_data dir_wcc; + * }; + * + * struct CREATE3resfail { + * wcc_data dir_wcc; + * }; + * + * union CREATE3res switch (nfsstat3 status) { + * case NFS3_OK: + * CREATE3resok resok; + * default: + * CREATE3resfail resfail; + * }; + */ +static int decode_create3resok(struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + int error; + + error = decode_post_op_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + /* The server isn't required to return a file handle. + * If it didn't, force the client to perform a LOOKUP + * to determine the correct file handle and attribute + * values for the new object. */ + if (result->fh->size == 0) + result->fattr->valid = 0; + error = decode_wcc_data(xdr, result->dir_attr); +out: + return error; +} + +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_create3resok(xdr, result); +out: + return error; +out_default: + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.12 REMOVE3res + * + * struct REMOVE3resok { + * wcc_data dir_wcc; + * }; + * + * struct REMOVE3resfail { + * wcc_data dir_wcc; + * }; + * + * union REMOVE3res switch (nfsstat3 status) { + * case NFS3_OK: + * REMOVE3resok resok; + * default: + * REMOVE3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_removeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.14 RENAME3res + * + * struct RENAME3resok { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * struct RENAME3resfail { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * union RENAME3res switch (nfsstat3 status) { + * case NFS3_OK: + * RENAME3resok resok; + * default: + * RENAME3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_renameres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->old_fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->new_fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.15 LINK3res + * + * struct LINK3resok { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * struct LINK3resfail { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * union LINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * LINK3resok resok; + * default: + * LINK3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs3_linkres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/** + * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in + * the local page cache + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 3.3.16 entry3 + * + * struct entry3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * fhandle3 filehandle; + * post_op_attr3 attributes; + * entry3 *nextentry; + * }; + * + * 3.3.17 entryplus3 + * struct entryplus3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * post_op_attr name_attributes; + * post_op_fh3 name_handle; + * entryplus3 *nextentry; + * }; + */ +int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + struct nfs_entry old = *entry; + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + error = decode_fileid3(xdr, &entry->ino); + if (unlikely(error)) + return error; + + error = decode_inline_filename3(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + entry->prev_cookie = entry->cookie; + error = decode_cookie3(xdr, &entry->cookie); + if (unlikely(error)) + return error; + + entry->d_type = DT_UNKNOWN; + + if (plus) { + entry->fattr->valid = 0; + error = decode_post_op_attr(xdr, entry->fattr); + if (unlikely(error)) + return error; + if (entry->fattr->valid & NFS_ATTR_FATTR_V3) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + if (entry->fattr->fileid != entry->ino) { + entry->fattr->mounted_on_fileid = entry->ino; + entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID; + } + + /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) { + error = decode_nfs_fh3(xdr, entry->fh); + if (unlikely(error)) { + if (error == -E2BIG) + goto out_truncated; + return error; + } + } else + zero_nfs_fh3(entry->fh); + } + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +out_truncated: + dprintk("NFS: directory entry contains invalid file handle\n"); + *entry = old; + return -EAGAIN; +} + +/* + * 3.3.16 READDIR3res + * + * struct dirlist3 { + * entry3 *entries; + * bool eof; + * }; + * + * struct READDIR3resok { + * post_op_attr dir_attributes; + * cookieverf3 cookieverf; + * dirlist3 reply; + * }; + * + * struct READDIR3resfail { + * post_op_attr dir_attributes; + * }; + * + * union READDIR3res switch (nfsstat3 status) { + * case NFS3_OK: + * READDIR3resok resok; + * default: + * READDIR3resfail resfail; + * }; + * + * Read the directory contents into the page cache, but otherwise + * don't touch them. The actual decoding is done by nfs3_decode_entry() + * during subsequent nfs_readdir() calls. + */ +static int decode_dirlist3(struct xdr_stream *xdr) +{ + return xdr_read_pages(xdr, xdr->buf->page_len); +} + +static int decode_readdir3resok(struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + int error; + + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + /* XXX: do we need to check if result->verf != NULL ? */ + error = decode_cookieverf3(xdr, result->verf); + if (unlikely(error)) + goto out; + error = decode_dirlist3(xdr); +out: + return error; +} + +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_readdir3resok(xdr, result); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.18 FSSTAT3res + * + * struct FSSTAT3resok { + * post_op_attr obj_attributes; + * size3 tbytes; + * size3 fbytes; + * size3 abytes; + * size3 tfiles; + * size3 ffiles; + * size3 afiles; + * uint32 invarsec; + * }; + * + * struct FSSTAT3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSSTAT3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSSTAT3resok resok; + * default: + * FSSTAT3resfail resfail; + * }; + */ +static int decode_fsstat3resok(struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8 * 6 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + p = xdr_decode_size3(p, &result->tbytes); + p = xdr_decode_size3(p, &result->fbytes); + p = xdr_decode_size3(p, &result->abytes); + p = xdr_decode_size3(p, &result->tfiles); + p = xdr_decode_size3(p, &result->ffiles); + xdr_decode_size3(p, &result->afiles); + /* ignore invarsec */ + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsstat3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.19 FSINFO3res + * + * struct FSINFO3resok { + * post_op_attr obj_attributes; + * uint32 rtmax; + * uint32 rtpref; + * uint32 rtmult; + * uint32 wtmax; + * uint32 wtpref; + * uint32 wtmult; + * uint32 dtpref; + * size3 maxfilesize; + * nfstime3 time_delta; + * uint32 properties; + * }; + * + * struct FSINFO3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSINFO3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSINFO3resok resok; + * default: + * FSINFO3resfail resfail; + * }; + */ +static int decode_fsinfo3resok(struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + result->rtmax = be32_to_cpup(p++); + result->rtpref = be32_to_cpup(p++); + result->rtmult = be32_to_cpup(p++); + result->wtmax = be32_to_cpup(p++); + result->wtpref = be32_to_cpup(p++); + result->wtmult = be32_to_cpup(p++); + result->dtpref = be32_to_cpup(p++); + p = xdr_decode_size3(p, &result->maxfilesize); + xdr_decode_nfstime3(p, &result->time_delta); + + /* ignore properties */ + result->lease_time = 0; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsinfo3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.20 PATHCONF3res + * + * struct PATHCONF3resok { + * post_op_attr obj_attributes; + * uint32 linkmax; + * uint32 name_max; + * bool no_trunc; + * bool chown_restricted; + * bool case_insensitive; + * bool case_preserving; + * }; + * + * struct PATHCONF3resfail { + * post_op_attr obj_attributes; + * }; + * + * union PATHCONF3res switch (nfsstat3 status) { + * case NFS3_OK: + * PATHCONF3resok resok; + * default: + * PATHCONF3resfail resfail; + * }; + */ +static int decode_pathconf3resok(struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 6); + if (unlikely(p == NULL)) + goto out_overflow; + result->max_link = be32_to_cpup(p++); + result->max_namelen = be32_to_cpup(p); + /* ignore remaining fields */ + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_pathconf3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.21 COMMIT3res + * + * struct COMMIT3resok { + * wcc_data file_wcc; + * writeverf3 verf; + * }; + * + * struct COMMIT3resfail { + * wcc_data file_wcc; + * }; + * + * union COMMIT3res switch (nfsstat3 status) { + * case NFS3_OK: + * COMMIT3resok resok; + * default: + * COMMIT3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_commitres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + result->op_status = status; + if (status != NFS3_OK) + goto out_status; + error = decode_writeverf3(xdr, &result->verf->verifier); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +#ifdef CONFIG_NFS_V3_ACL + +static inline int decode_getacl3resok(struct xdr_stream *xdr, + struct nfs3_getaclres *result) +{ + struct posix_acl **acl; + unsigned int *aclcnt; + size_t hdrlen; + int error; + + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_uint32(xdr, &result->mask); + if (unlikely(error)) + goto out; + error = -EINVAL; + if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + goto out; + + hdrlen = xdr_stream_pos(xdr); + + acl = NULL; + if (result->mask & NFS_ACL) + acl = &result->acl_access; + aclcnt = NULL; + if (result->mask & NFS_ACLCNT) + aclcnt = &result->acl_access_count; + error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl); + if (unlikely(error <= 0)) + goto out; + + acl = NULL; + if (result->mask & NFS_DFACL) + acl = &result->acl_default; + aclcnt = NULL; + if (result->mask & NFS_DFACLCNT) + aclcnt = &result->acl_default_count; + error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl); + if (unlikely(error <= 0)) + return error; + error = 0; +out: + return error; +} + +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_getaclres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_getacl3resok(xdr, result); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); +} + +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_post_op_attr(xdr, result); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); +} + +#endif /* CONFIG_NFS_V3_ACL */ + + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static const struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS_OK, 0 }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, +#ifdef EWFLUSH + { NFSERR_WFLUSH, -EWFLUSH }, +#endif + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } +}; + +/** + * nfs3_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. + */ +static int nfs3_stat_to_errno(enum nfs_stat status) +{ + int i; + + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == (int)status) + return nfs_errtbl[i].errno; + } + dprintk("NFS: Unrecognized nfs status value: %u\n", status); + return nfs_errtbl[i].errno; +} + + +#define PROC(proc, argtype, restype, timer) \ +[NFS3PROC_##proc] = { \ + .p_proc = NFS3PROC_##proc, \ + .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \ + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \ + .p_arglen = NFS3_##argtype##args_sz, \ + .p_replen = NFS3_##restype##res_sz, \ + .p_timer = timer, \ + .p_statidx = NFS3PROC_##proc, \ + .p_name = #proc, \ + } + +struct rpc_procinfo nfs3_procedures[] = { + PROC(GETATTR, getattr, getattr, 1), + PROC(SETATTR, setattr, setattr, 0), + PROC(LOOKUP, lookup, lookup, 2), + PROC(ACCESS, access, access, 1), + PROC(READLINK, readlink, readlink, 3), + PROC(READ, read, read, 3), + PROC(WRITE, write, write, 4), + PROC(CREATE, create, create, 0), + PROC(MKDIR, mkdir, create, 0), + PROC(SYMLINK, symlink, create, 0), + PROC(MKNOD, mknod, create, 0), + PROC(REMOVE, remove, remove, 0), + PROC(RMDIR, lookup, setattr, 0), + PROC(RENAME, rename, rename, 0), + PROC(LINK, link, link, 0), + PROC(READDIR, readdir, readdir, 3), + PROC(READDIRPLUS, readdirplus, readdir, 3), + PROC(FSSTAT, getattr, fsstat, 0), + PROC(FSINFO, getattr, fsinfo, 0), + PROC(PATHCONF, getattr, pathconf, 0), + PROC(COMMIT, commit, commit, 5), +}; + +const struct rpc_version nfs_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(nfs3_procedures), + .procs = nfs3_procedures +}; + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_procinfo nfs3_acl_procedures[] = { + [ACLPROC3_GETACL] = { + .p_proc = ACLPROC3_GETACL, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res, + .p_arglen = ACL3_getaclargs_sz, + .p_replen = ACL3_getaclres_sz, + .p_timer = 1, + .p_name = "GETACL", + }, + [ACLPROC3_SETACL] = { + .p_proc = ACLPROC3_SETACL, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res, + .p_arglen = ACL3_setaclargs_sz, + .p_replen = ACL3_setaclres_sz, + .p_timer = 0, + .p_name = "SETACL", + }, +}; + +const struct rpc_version nfsacl_version3 = { + .number = 3, + .nrprocs = sizeof(nfs3_acl_procedures)/ + sizeof(nfs3_acl_procedures[0]), + .procs = nfs3_acl_procedures, +}; +#endif /* CONFIG_NFS_V3_ACL */ diff --git a/kernel/fs/nfs/nfs42.h b/kernel/fs/nfs/nfs42.h new file mode 100644 index 000000000..7afb8947d --- /dev/null +++ b/kernel/fs/nfs/nfs42.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2014 Anna Schumaker + */ + +#ifndef __LINUX_FS_NFS_NFS4_2_H +#define __LINUX_FS_NFS_NFS4_2_H + +/* nfs4.2proc.c */ +int nfs42_proc_allocate(struct file *, loff_t, loff_t); +int nfs42_proc_deallocate(struct file *, loff_t, loff_t); +loff_t nfs42_proc_llseek(struct file *, loff_t, int); + +/* nfs4.2xdr.h */ +extern struct rpc_procinfo nfs4_2_procedures[]; + +#endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/kernel/fs/nfs/nfs42proc.c b/kernel/fs/nfs/nfs42proc.c new file mode 100644 index 000000000..3a9e75235 --- /dev/null +++ b/kernel/fs/nfs/nfs42proc.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2014 Anna Schumaker + */ +#include +#include +#include +#include +#include +#include +#include +#include "nfs4_fs.h" +#include "nfs42.h" + +static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, + fmode_t fmode) +{ + struct nfs_open_context *open; + struct nfs_lock_context *lock; + int ret; + + open = get_nfs_open_context(nfs_file_open_context(file)); + lock = nfs_get_lock_context(open); + if (IS_ERR(lock)) { + put_nfs_open_context(open); + return PTR_ERR(lock); + } + + ret = nfs4_set_rw_stateid(dst, open, lock, fmode); + + nfs_put_lock_context(lock); + put_nfs_open_context(open); + return ret; +} + +static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(filep); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs42_falloc_args args = { + .falloc_fh = NFS_FH(inode), + .falloc_offset = offset, + .falloc_length = len, + .falloc_bitmask = server->cache_consistency_bitmask, + }; + struct nfs42_falloc_res res = { + .falloc_server = server, + }; + int status; + + msg->rpc_argp = &args; + msg->rpc_resp = &res; + + status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE); + if (status) + return status; + + res.falloc_fattr = nfs_alloc_fattr(); + if (!res.falloc_fattr) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); + if (status == 0) + status = nfs_post_op_update_inode(inode, res.falloc_fattr); + + kfree(res.falloc_fattr); + return status; +} + +static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, + loff_t offset, loff_t len) +{ + struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_fallocate(msg, filep, offset, len); + if (err == -ENOTSUPP) + return -EOPNOTSUPP; + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + + return err; +} + +int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], + }; + struct inode *inode = file_inode(filep); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + mutex_unlock(&inode->i_mutex); + return err; +} + +int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE], + }; + struct inode *inode = file_inode(filep); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) + return -EOPNOTSUPP; + + nfs_wb_all(inode); + mutex_lock(&inode->i_mutex); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); + if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + + mutex_unlock(&inode->i_mutex); + return err; +} + +loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +{ + struct inode *inode = file_inode(filep); + struct nfs42_seek_args args = { + .sa_fh = NFS_FH(inode), + .sa_offset = offset, + .sa_what = (whence == SEEK_HOLE) ? + NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA, + }; + struct nfs42_seek_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct nfs_server *server = NFS_SERVER(inode); + int status; + + if (!nfs_server_capable(inode, NFS_CAP_SEEK)) + return -ENOTSUPP; + + status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ); + if (status) + return status; + + nfs_wb_all(inode); + status = nfs4_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, 0); + if (status == -ENOTSUPP) + server->caps &= ~NFS_CAP_SEEK; + if (status) + return status; + + return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); +} diff --git a/kernel/fs/nfs/nfs42xdr.c b/kernel/fs/nfs/nfs42xdr.c new file mode 100644 index 000000000..1a25b2724 --- /dev/null +++ b/kernel/fs/nfs/nfs42xdr.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2014 Anna Schumaker + */ +#ifndef __LINUX_FS_NFS_NFS4_2XDR_H +#define __LINUX_FS_NFS_NFS4_2XDR_H + +#define encode_fallocate_maxsz (encode_stateid_maxsz + \ + 2 /* offset */ + \ + 2 /* length */) +#define encode_allocate_maxsz (op_encode_hdr_maxsz + \ + encode_fallocate_maxsz) +#define decode_allocate_maxsz (op_decode_hdr_maxsz) +#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ + encode_fallocate_maxsz) +#define decode_deallocate_maxsz (op_decode_hdr_maxsz) +#define encode_seek_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + 2 /* offset */ + \ + 1 /* whence */) +#define decode_seek_maxsz (op_decode_hdr_maxsz + \ + 1 /* eof */ + \ + 1 /* whence */ + \ + 2 /* offset */ + \ + 2 /* length */) + +#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_allocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_allocate_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_seek_maxsz) +#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_seek_maxsz) + + +static void encode_fallocate(struct xdr_stream *xdr, + struct nfs42_falloc_args *args) +{ + encode_nfs4_stateid(xdr, &args->falloc_stateid); + encode_uint64(xdr, args->falloc_offset); + encode_uint64(xdr, args->falloc_length); +} + +static void encode_allocate(struct xdr_stream *xdr, + struct nfs42_falloc_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_ALLOCATE, decode_allocate_maxsz, hdr); + encode_fallocate(xdr, args); +} + +static void encode_deallocate(struct xdr_stream *xdr, + struct nfs42_falloc_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_DEALLOCATE, decode_deallocate_maxsz, hdr); + encode_fallocate(xdr, args); +} + +static void encode_seek(struct xdr_stream *xdr, + struct nfs42_seek_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SEEK, decode_seek_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->sa_stateid); + encode_uint64(xdr, args->sa_offset); + encode_uint32(xdr, args->sa_what); +} + +/* + * Encode ALLOCATE request + */ +static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_falloc_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode DEALLOCATE request + */ +static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_falloc_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode SEEK request + */ +static void nfs4_xdr_enc_seek(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_seek_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->sa_fh, &hdr); + encode_seek(xdr, args, &hdr); + encode_nops(&hdr); +} + +static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) +{ + return decode_op_hdr(xdr, OP_ALLOCATE); +} + +static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) +{ + return decode_op_hdr(xdr, OP_DEALLOCATE); +} + +static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) +{ + int status; + __be32 *p; + + status = decode_op_hdr(xdr, OP_SEEK); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4 + 8); + if (unlikely(!p)) + goto out_overflow; + + res->sr_eof = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &res->sr_offset); + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * Decode ALLOCATE request + */ +static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_falloc_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + +/* + * Decode DEALLOCATE request + */ +static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_falloc_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + +/* + * Decode SEEK request + */ +static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_seek_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_seek(xdr, res); +out: + return status; +} +#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/kernel/fs/nfs/nfs4_fs.h b/kernel/fs/nfs/nfs4_fs.h new file mode 100644 index 000000000..fdef424b0 --- /dev/null +++ b/kernel/fs/nfs/nfs4_fs.h @@ -0,0 +1,529 @@ +/* + * linux/fs/nfs/nfs4_fs.h + * + * Copyright (C) 2005 Trond Myklebust + * + * NFSv4-specific filesystem definitions and declarations + */ + +#ifndef __LINUX_FS_NFS_NFS4_FS_H +#define __LINUX_FS_NFS_NFS4_FS_H + +#if defined(CONFIG_NFS_V4_2) +#define NFS4_MAX_MINOR_VERSION 2 +#elif defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MINOR_VERSION 1 +#else +#define NFS4_MAX_MINOR_VERSION 0 +#endif + +#if IS_ENABLED(CONFIG_NFS_V4) + +#define NFS4_MAX_LOOP_ON_RECOVER (10) + +#include + +struct idmap; + +enum nfs4_client_state { + NFS4CLNT_MANAGER_RUNNING = 0, + NFS4CLNT_CHECK_LEASE, + NFS4CLNT_LEASE_EXPIRED, + NFS4CLNT_RECLAIM_REBOOT, + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, + NFS4CLNT_LEASE_CONFIRM, + NFS4CLNT_SERVER_SCOPE_MISMATCH, + NFS4CLNT_PURGE_STATE, + NFS4CLNT_BIND_CONN_TO_SESSION, + NFS4CLNT_MOVED, + NFS4CLNT_LEASE_MOVED, +}; + +#define NFS4_RENEW_TIMEOUT 0x01 +#define NFS4_RENEW_DELEGATION_CB 0x02 + +struct nfs_seqid_counter; +struct nfs4_minor_version_ops { + u32 minor_version; + unsigned init_caps; + + int (*init_client)(struct nfs_client *); + void (*shutdown_client)(struct nfs_client *); + bool (*match_stateid)(const nfs4_stateid *, + const nfs4_stateid *); + int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); + void (*free_lock_state)(struct nfs_server *, + struct nfs4_lock_state *); + struct nfs_seqid * + (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + const struct rpc_call_ops *call_sync_ops; + const struct nfs4_state_recovery_ops *reboot_recovery_ops; + const struct nfs4_state_recovery_ops *nograce_recovery_ops; + const struct nfs4_state_maintenance_ops *state_renewal_ops; + const struct nfs4_mig_recovery_ops *mig_recovery_ops; +}; + +#define NFS_SEQID_CONFIRMED 1 +struct nfs_seqid_counter { + ktime_t create_time; + int owner_id; + int flags; + u32 counter; + spinlock_t lock; /* Protects the list */ + struct list_head list; /* Defines sequence of RPC calls */ + struct rpc_wait_queue wait; /* RPC call delay queue */ +}; + +struct nfs_seqid { + struct nfs_seqid_counter *sequence; + struct list_head list; + struct rpc_task *task; +}; + +static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) +{ + if (seqid_mutating_err(-status)) + seqid->flags |= NFS_SEQID_CONFIRMED; +} + +/* + * NFS4 state_owners and lock_owners are simply labels for ordered + * sequences of RPC calls. Their sole purpose is to provide once-only + * semantics by allowing the server to identify replayed requests. + */ +struct nfs4_state_owner { + struct nfs_server *so_server; + struct list_head so_lru; + unsigned long so_expires; + struct rb_node so_server_node; + + struct rpc_cred *so_cred; /* Associated cred */ + + spinlock_t so_lock; + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; + struct nfs_seqid_counter so_seqid; + seqcount_t so_reclaim_seqcount; + struct mutex so_delegreturn_mutex; +}; + +enum { + NFS_OWNER_RECLAIM_REBOOT, + NFS_OWNER_RECLAIM_NOGRACE +}; + +#define NFS_LOCK_NEW 0 +#define NFS_LOCK_RECLAIM 1 +#define NFS_LOCK_EXPIRED 2 + +/* + * struct nfs4_state maintains the client-side state for a given + * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). + * + * OPEN: + * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server, + * we need to know how many files are open for reading or writing on a + * given inode. This information too is stored here. + * + * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) + */ + +struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +#define NFS_LOCK_INITIALIZED 0 +#define NFS_LOCK_LOST 1 + unsigned long ls_flags; + struct nfs_seqid_counter ls_seqid; + nfs4_stateid ls_stateid; + atomic_t ls_count; + fl_owner_t ls_owner; +}; + +/* bits for nfs4_state->flags */ +enum { + LK_STATE_IN_USE, + NFS_DELEGATED_STATE, /* Current stateid is delegation */ + NFS_OPEN_STATE, /* OPEN stateid is set */ + NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ + NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ + NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ + NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ + NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ + NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ + NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ +}; + +struct nfs4_state { + struct list_head open_states; /* List of states for the same state_owner */ + struct list_head inode_states; /* List of states for the same inode */ + struct list_head lock_states; /* List of subservient lock stateids */ + + struct nfs4_state_owner *owner; /* Pointer to the open owner */ + struct inode *inode; /* Pointer to the inode */ + + unsigned long flags; /* Do we hold any locks? */ + spinlock_t state_lock; /* Protects the lock_states list */ + + seqlock_t seqlock; /* Protects the stateid/open_stateid */ + nfs4_stateid stateid; /* Current stateid: may be delegation */ + nfs4_stateid open_stateid; /* OPEN stateid */ + + /* The following 3 fields are protected by owner->so_lock */ + unsigned int n_rdonly; /* Number of read-only references */ + unsigned int n_wronly; /* Number of write-only references */ + unsigned int n_rdwr; /* Number of read/write references */ + fmode_t state; /* State on the server (R,W, or RW) */ + atomic_t count; +}; + + +struct nfs4_exception { + long timeout; + int retry; + struct nfs4_state *state; + struct inode *inode; +}; + +struct nfs4_state_recovery_ops { + int owner_flag_bit; + int state_flag_bit; + int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); + int (*recover_lock)(struct nfs4_state *, struct file_lock *); + int (*establish_clid)(struct nfs_client *, struct rpc_cred *); + int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *); + int (*detect_trunking)(struct nfs_client *, struct nfs_client **, + struct rpc_cred *); +}; + +struct nfs4_state_maintenance_ops { + int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned); + struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); + int (*renew_lease)(struct nfs_client *, struct rpc_cred *); +}; + +struct nfs4_mig_recovery_ops { + int (*get_locations)(struct inode *, struct nfs4_fs_locations *, + struct page *, struct rpc_cred *); + int (*fsid_present)(struct inode *, struct rpc_cred *); +}; + +extern const struct dentry_operations nfs4_dentry_operations; + +/* dir.c */ +int nfs_atomic_open(struct inode *, struct dentry *, struct file *, + unsigned, umode_t, int *); + +/* super.c */ +extern struct file_system_type nfs4_fs_type; + +/* nfs4namespace.c */ +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); +struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations); + +/* nfs4proc.c */ +extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); +extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, + struct rpc_message *, struct nfs4_sequence_args *, + struct nfs4_sequence_res *, int); +extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); +extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); +extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); +extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred); +extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); +extern int nfs4_destroy_clientid(struct nfs_client *clp); +extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); +extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); +extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, + struct nfs4_fs_locations *, struct page *); +extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, + struct page *page, struct rpc_cred *); +extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); +extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, + struct nfs_fh *, struct nfs_fattr *); +extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); +extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode); + +#if defined(CONFIG_NFS_V4_1) +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return server->nfs_client->cl_session; +} + +extern int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + struct rpc_task *task); +extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); +extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); +extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, + bool sync); + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == + EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, + struct rpc_clnt **clntp, struct rpc_message *msg) +{ + struct rpc_cred *newcred = NULL; + rpc_authflavor_t flavor; + + if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { + spin_lock(&clp->cl_lock); + if (clp->cl_machine_cred != NULL) + /* don't call get_rpccred on the machine cred - + * a reference will be held for life of clp */ + newcred = clp->cl_machine_cred; + spin_unlock(&clp->cl_lock); + msg->rpc_cred = newcred; + + flavor = clp->cl_rpcclient->cl_auth->au_flavor; + WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I && + flavor != RPC_AUTH_GSS_KRB5P); + *clntp = clp->cl_rpcclient; + + return true; + } + return false; +} + +/* + * Function responsible for determining if an rpc_message should use the + * machine cred under SP4_MACH_CRED and if so switching the credential and + * authflavor (using the nfs_client's rpc_clnt which will be krb5i/p). + * Should be called before rpc_call_sync/rpc_call_async. + */ +static inline void +nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, + struct rpc_clnt **clntp, struct rpc_message *msg) +{ + _nfs4_state_protect(clp, sp4_mode, clntp, msg); +} + +/* + * Special wrapper to nfs4_state_protect for write. + * If WRITE can use machine cred but COMMIT cannot, make sure all writes + * that use machine cred use NFS_FILE_SYNC. + */ +static inline void +nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, + struct rpc_message *msg, struct nfs_pgio_header *hdr) +{ + if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && + !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) + hdr->args.stable = NFS_FILE_SYNC; +} +#else /* CONFIG_NFS_v4_1 */ +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return NULL; +} + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return false; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return false; +} + +static inline void +nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, + struct rpc_clnt **clntp, struct rpc_message *msg) +{ +} + +static inline void +nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, + struct rpc_message *msg, struct nfs_pgio_header *hdr) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + +extern const u32 nfs4_fattr_bitmap[3]; +extern const u32 nfs4_statfs_bitmap[3]; +extern const u32 nfs4_pathconf_bitmap[3]; +extern const u32 nfs4_fsinfo_bitmap[3]; +extern const u32 nfs4_fs_locations_bitmap[3]; + +void nfs40_shutdown_client(struct nfs_client *); +void nfs41_shutdown_client(struct nfs_client *); +int nfs40_init_client(struct nfs_client *); +int nfs41_init_client(struct nfs_client *); +void nfs4_free_client(struct nfs_client *); + +struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); + +/* nfs4renewd.c */ +extern void nfs4_schedule_state_renewal(struct nfs_client *); +extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); +extern void nfs4_kill_renewd(struct nfs_client *); +extern void nfs4_renew_state(struct work_struct *); + +/* nfs4state.c */ +struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); +int nfs4_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **); +int nfs40_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **, struct rpc_cred *); +#if defined(CONFIG_NFS_V4_1) +int nfs41_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **, struct rpc_cred *); +extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); +extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); +extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp); + +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t); +extern void nfs4_put_state_owner(struct nfs4_state_owner *); +extern void nfs4_purge_state_owners(struct nfs_server *); +extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); +extern void nfs4_put_open_state(struct nfs4_state *); +extern void nfs4_close_state(struct nfs4_state *, fmode_t); +extern void nfs4_close_sync(struct nfs4_state *, fmode_t); +extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); +extern void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); +extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); +extern int nfs4_wait_clnt_recover(struct nfs_client *clp); +extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); +extern void nfs4_schedule_state_manager(struct nfs_client *); +extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); +extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_migration_recovery(const struct nfs_server *); +extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_server_scope(struct nfs_client *, + struct nfs41_server_scope **); +extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); +extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, + fmode_t, const struct nfs_lockowner *); + +extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); +extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_release_seqid(struct nfs_seqid *seqid); +extern void nfs_free_seqid(struct nfs_seqid *seqid); +extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task); +extern int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res); + +extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); + +extern const nfs4_stateid zero_stateid; + +/* nfs4super.c */ +struct nfs_mount_info; +extern struct nfs_subversion nfs_v4; +struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); +extern bool nfs4_disable_idmapping; +extern unsigned short max_session_slots; +extern unsigned short send_implementation_id; +extern bool recover_lost_locks; + +#define NFS4_CLIENT_ID_UNIQ_LEN (64) +extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; + +/* nfs4sysctl.c */ +#ifdef CONFIG_SYSCTL +int nfs4_register_sysctl(void); +void nfs4_unregister_sysctl(void); +#else +static inline int nfs4_register_sysctl(void) +{ + return 0; +} + +static inline void nfs4_unregister_sysctl(void) +{ +} +#endif + +/* nfs4xdr.c */ +extern struct rpc_procinfo nfs4_procedures[]; + +struct nfs4_mount_data; + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; + +static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return memcmp(dst, src, sizeof(*dst)) == 0; +} + +static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0; +} + +static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ + return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; +} + +static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) +{ + return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; +} + +#else + +#define nfs4_close_state(a, b) do { } while (0) +#define nfs4_close_sync(a, b) do { } while (0) +#define nfs4_state_protect(a, b, c, d) do { } while (0) +#define nfs4_state_protect_write(a, b, c, d) do { } while (0) + +#endif /* CONFIG_NFS_V4 */ +#endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/kernel/fs/nfs/nfs4client.c b/kernel/fs/nfs/nfs4client.c new file mode 100644 index 000000000..e42be52a8 --- /dev/null +++ b/kernel/fs/nfs/nfs4client.c @@ -0,0 +1,1232 @@ +/* + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "callback.h" +#include "delegation.h" +#include "nfs4session.h" +#include "nfs4idmap.h" +#include "pnfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; + idr_preload(GFP_KERNEL); + spin_lock(&nn->nfs_client_lock); + ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); + if (ret >= 0) + clp->cl_cb_ident = ret; + spin_unlock(&nn->nfs_client_lock); + idr_preload_end(); + return ret < 0 ? ret : 0; +} + +#ifdef CONFIG_NFS_V4_1 +/** + * Per auth flavor data server rpc clients + */ +struct nfs4_ds_server { + struct list_head list; /* ds_clp->cl_ds_clients */ + struct rpc_clnt *rpc_clnt; +}; + +/** + * Common lookup case for DS I/O + */ +static struct nfs4_ds_server * +nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor) +{ + struct nfs4_ds_server *dss; + + rcu_read_lock(); + list_for_each_entry_rcu(dss, &ds_clp->cl_ds_clients, list) { + if (dss->rpc_clnt->cl_auth->au_flavor != flavor) + continue; + goto out; + } + dss = NULL; +out: + rcu_read_unlock(); + return dss; +} + +static struct nfs4_ds_server * +nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor, + struct nfs4_ds_server *new) +{ + struct nfs4_ds_server *dss; + + spin_lock(&ds_clp->cl_lock); + list_for_each_entry(dss, &ds_clp->cl_ds_clients, list) { + if (dss->rpc_clnt->cl_auth->au_flavor != flavor) + continue; + goto out; + } + if (new) + list_add_rcu(&new->list, &ds_clp->cl_ds_clients); + dss = new; +out: + spin_unlock(&ds_clp->cl_lock); /* need some lock to protect list */ + return dss; +} + +static struct nfs4_ds_server * +nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor) +{ + struct nfs4_ds_server *dss; + + dss = kmalloc(sizeof(*dss), GFP_NOFS); + if (dss == NULL) + return ERR_PTR(-ENOMEM); + + dss->rpc_clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor); + if (IS_ERR(dss->rpc_clnt)) { + int err = PTR_ERR(dss->rpc_clnt); + kfree (dss); + return ERR_PTR(err); + } + INIT_LIST_HEAD(&dss->list); + + return dss; +} + +static void +nfs4_free_ds_server(struct nfs4_ds_server *dss) +{ + rpc_release_client(dss->rpc_clnt); + kfree(dss); +} + +/** +* Find or create a DS rpc client with th MDS server rpc client auth flavor +* in the nfs_client cl_ds_clients list. +*/ +struct rpc_clnt * +nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode) +{ + struct nfs4_ds_server *dss, *new; + rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor; + + dss = nfs4_find_ds_client(ds_clp, flavor); + if (dss != NULL) + goto out; + new = nfs4_alloc_ds_server(ds_clp, flavor); + if (IS_ERR(new)) + return ERR_CAST(new); + dss = nfs4_add_ds_client(ds_clp, flavor, new); + if (dss != new) + nfs4_free_ds_server(new); +out: + return dss->rpc_clnt; +} +EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client); + +static void +nfs4_shutdown_ds_clients(struct nfs_client *clp) +{ + struct nfs4_ds_server *dss; + LIST_HEAD(shutdown_list); + + while (!list_empty(&clp->cl_ds_clients)) { + dss = list_entry(clp->cl_ds_clients.next, + struct nfs4_ds_server, list); + list_del(&dss->list); + rpc_shutdown_client(dss->rpc_clnt); + kfree (dss); + } +} + +void nfs41_shutdown_client(struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) { + nfs4_shutdown_ds_clients(clp); + nfs4_destroy_session(clp->cl_session); + nfs4_destroy_clientid(clp); + } + +} +#endif /* CONFIG_NFS_V4_1 */ + +void nfs40_shutdown_client(struct nfs_client *clp) +{ + if (clp->cl_slot_tbl) { + nfs4_shutdown_slot_table(clp->cl_slot_tbl); + kfree(clp->cl_slot_tbl); + } +} + +struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) +{ + int err; + struct nfs_client *clp = nfs_alloc_client(cl_init); + if (IS_ERR(clp)) + return clp; + + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error; + + if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) { + err = -EINVAL; + goto error; + } + + spin_lock_init(&clp->cl_lock); + INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); + INIT_LIST_HEAD(&clp->cl_ds_clients); + rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + clp->cl_mig_gen = 1; + return clp; + +error: + nfs_free_client(clp); + return ERR_PTR(err); +} + +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + clp->cl_mvops->shutdown_client(clp); + nfs4_destroy_callback(clp); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); + kfree(clp->cl_serverowner); + kfree(clp->cl_serverscope); + kfree(clp->cl_implid); + kfree(clp->cl_owner_id); +} + +void nfs4_free_client(struct nfs_client *clp) +{ + nfs4_shutdown_client(clp); + nfs_free_client(clp); +} + +/* + * Initialize the NFS4 callback service + */ +static int nfs4_init_callback(struct nfs_client *clp) +{ + struct rpc_xprt *xprt; + int error; + + xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); + + if (nfs4_has_session(clp)) { + error = xprt_setup_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); + if (error < 0) + return error; + } + + error = nfs_callback_up(clp->cl_mvops->minor_version, xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", + __func__, error); + return error; + } + __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); + + return 0; +} + +/** + * nfs40_init_client - nfs_client initialization tasks for NFSv4.0 + * @clp - nfs_client to initialize + * + * Returns zero on success, or a negative errno if some error occurred. + */ +int nfs40_init_client(struct nfs_client *clp) +{ + struct nfs4_slot_table *tbl; + int ret; + + tbl = kzalloc(sizeof(*tbl), GFP_NOFS); + if (tbl == NULL) + return -ENOMEM; + + ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE, + "NFSv4.0 transport Slot table"); + if (ret) { + kfree(tbl); + return ret; + } + + clp->cl_slot_tbl = tbl; + return 0; +} + +#if defined(CONFIG_NFS_V4_1) + +/** + * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+ + * @clp - nfs_client to initialize + * + * Returns zero on success, or a negative errno if some error occurred. + */ +int nfs41_init_client(struct nfs_client *clp) +{ + struct nfs4_session *session = NULL; + + /* + * Create the session and mark it expired. + * When a SEQUENCE operation encounters the expired session + * it will do session recovery to initialize it. + */ + session = nfs4_alloc_session(clp); + if (!session) + return -ENOMEM; + + clp->cl_session = session; + + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING); + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Initialize the minor version specific parts of an NFS4 client record + */ +static int nfs4_init_client_minor_version(struct nfs_client *clp) +{ + int ret; + + ret = clp->cl_mvops->init_client(clp); + if (ret) + return ret; + return nfs4_init_callback(clp); +} + +/** + * nfs4_init_client - Initialise an NFS4 client record + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: callback IP address in presentation format + * @authflavor: authentication flavor for underlying RPC transport + * + * Returns pointer to an NFS client, or an ERR_PTR value. + */ +struct nfs_client *nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr) +{ + char buf[INET6_ADDRSTRLEN + 1]; + struct nfs_client *old; + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is initialised already */ + dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); + return clp; + } + + /* Check NFS protocol revision and initialize RPC op vector */ + clp->rpc_ops = &nfs_v4_clientops; + + if (clp->cl_minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); + __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + if (error == -EINVAL) + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + if (error < 0) + goto error; + + /* If no clientaddr= option was specified, find a usable cb address */ + if (ip_addr == NULL) { + struct sockaddr_storage cb_addr; + struct sockaddr *sap = (struct sockaddr *)&cb_addr; + + error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); + if (error < 0) + goto error; + error = rpc_ntop(sap, buf, sizeof(buf)); + if (error < 0) + goto error; + ip_addr = (const char *)buf; + } + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + error = nfs_idmap_new(clp); + if (error < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, error); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + + error = nfs4_init_client_minor_version(clp); + if (error < 0) + goto error; + + if (!nfs4_has_session(clp)) + nfs_mark_client_ready(clp, NFS_CS_READY); + + error = nfs4_discover_server_trunking(clp, &old); + if (error < 0) + goto error; + + if (clp != old) + clp->cl_preserve_clid = true; + nfs_put_client(clp); + return old; + +error: + nfs_mark_client_ready(clp, error); + nfs_put_client(clp); + dprintk("<-- nfs4_init_client() = xerror %d\n", error); + return ERR_PTR(error); +} + +/* + * SETCLIENTID just did a callback update with the callback ident in + * "drop," but server trunking discovery claims "drop" and "keep" are + * actually the same server. Swap the callback IDs so that "keep" + * will continue to use the callback ident the server now knows about, + * and so that "keep"'s original callback ident is destroyed when + * "drop" is freed. + */ +static void nfs4_swap_callback_idents(struct nfs_client *keep, + struct nfs_client *drop) +{ + struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id); + unsigned int save = keep->cl_cb_ident; + + if (keep->cl_cb_ident == drop->cl_cb_ident) + return; + + dprintk("%s: keeping callback ident %u and dropping ident %u\n", + __func__, keep->cl_cb_ident, drop->cl_cb_ident); + + spin_lock(&nn->nfs_client_lock); + + idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident); + keep->cl_cb_ident = drop->cl_cb_ident; + + idr_replace(&nn->cb_ident_idr, drop, save); + drop->cl_cb_ident = save; + + spin_unlock(&nn->nfs_client_lock); +} + +static bool nfs4_match_client_owner_id(const struct nfs_client *clp1, + const struct nfs_client *clp2) +{ + if (clp1->cl_owner_id == NULL || clp2->cl_owner_id == NULL) + return true; + return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0; +} + +/** + * nfs40_walk_client_list - Find server that recognizes a client ID + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." + * + * NB: nfs40_walk_client_list() relies on the new nfs_client being + * the last nfs_client on the list. + */ +int nfs40_walk_client_list(struct nfs_client *new, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); + struct nfs_client *pos, *prev = NULL; + struct nfs4_setclientid_res clid = { + .clientid = new->cl_clientid, + .confirm = new->cl_confirm, + }; + int status = -NFS4ERR_STALE_CLIENTID; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos" */ + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + nfs_put_client(prev); + prev = pos; + + status = nfs_wait_client_init_complete(pos); + if (status < 0) + goto out; + status = -NFS4ERR_STALE_CLIENTID; + spin_lock(&nn->nfs_client_lock); + } + if (pos->cl_cons_state != NFS_CS_READY) + continue; + + if (pos->cl_clientid != new->cl_clientid) + continue; + + if (!nfs4_match_client_owner_id(pos, new)) + continue; + + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + nfs_put_client(prev); + prev = pos; + + status = nfs4_proc_setclientid_confirm(pos, &clid, cred); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + break; + case 0: + nfs4_swap_callback_idents(pos, new); + + prev = NULL; + *result = pos; + dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", + __func__, pos, atomic_read(&pos->cl_count)); + goto out; + case -ERESTARTSYS: + case -ETIMEDOUT: + /* The callback path may have been inadvertently + * changed. Schedule recovery! + */ + nfs4_schedule_path_down_recovery(pos); + default: + goto out; + } + + spin_lock(&nn->nfs_client_lock); + } + spin_unlock(&nn->nfs_client_lock); + + /* No match found. The server lost our clientid */ +out: + nfs_put_client(prev); + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Returns true if the client IDs match + */ +static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +{ + if (a->cl_clientid != b->cl_clientid) { + dprintk("NFS: --> %s client ID %llx does not match %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return false; + } + dprintk("NFS: --> %s client ID %llx matches %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return true; +} + +/* + * Returns true if the server major ids match + */ +static bool +nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b) +{ + struct nfs41_server_owner *o1 = a->cl_serverowner; + struct nfs41_server_owner *o2 = b->cl_serverowner; + + if (o1->major_id_sz != o2->major_id_sz) + goto out_major_mismatch; + if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) + goto out_major_mismatch; + + dprintk("NFS: --> %s server owners match\n", __func__); + return true; + +out_major_mismatch: + dprintk("NFS: --> %s server owner major IDs do not match\n", + __func__); + return false; +} + +/** + * nfs41_walk_client_list - Find nfs_client that matches a client/server owner + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." + * + * NB: nfs41_walk_client_list() relies on the new nfs_client being + * the last nfs_client on the list. + */ +int nfs41_walk_client_list(struct nfs_client *new, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); + struct nfs_client *pos, *prev = NULL; + int status = -NFS4ERR_STALE_CLIENTID; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + + if (pos == new) + goto found; + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos", especially the client + * ID and serverowner fields. Wait for CREATE_SESSION + * to finish. */ + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + nfs_put_client(prev); + prev = pos; + + status = nfs_wait_client_init_complete(pos); + spin_lock(&nn->nfs_client_lock); + if (status < 0) + break; + status = -NFS4ERR_STALE_CLIENTID; + } + if (pos->cl_cons_state != NFS_CS_READY) + continue; + + if (!nfs4_match_clientids(pos, new)) + continue; + + /* + * Note that session trunking is just a special subcase of + * client id trunking. In either case, we want to fall back + * to using the existing nfs_client. + */ + if (!nfs4_check_clientid_trunking(pos, new)) + continue; + + /* Unlike NFSv4.0, we know that NFSv4.1 always uses the + * uniform string, however someone might switch the + * uniquifier string on us. + */ + if (!nfs4_match_client_owner_id(pos, new)) + continue; +found: + atomic_inc(&pos->cl_count); + *result = pos; + status = 0; + dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", + __func__, pos, atomic_read(&pos->cl_count)); + break; + } + + /* No matching nfs_client found. */ + spin_unlock(&nn->nfs_client_lock); + dprintk("NFS: <-- %s status = %d\n", __func__, status); + nfs_put_client(prev); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + +static void nfs4_destroy_server(struct nfs_server *server) +{ + nfs_server_return_all_delegations(server); + unset_pnfs_layoutdriver(server); + nfs4_purge_state_owners(server); +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(struct net *net, int cb_ident) +{ + struct nfs_client *clp; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + spin_lock(&nn->nfs_client_lock); + clp = idr_find(&nn->cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* Common match routine for v4.0 and v4.1 callback services */ +static bool nfs4_cb_match_client(const struct sockaddr *addr, + struct nfs_client *clp, u32 minorversion) +{ + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; + + smp_rmb(); + + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; + + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; + + return true; +} + +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, + struct nfs4_sessionid *sid, u32 minorversion) +{ + struct nfs_client *clp; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, minorversion) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid*/ + if (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + return clp; + } + spin_unlock(&nn->nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, + struct nfs4_sessionid *sid, u32 minorversion) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Set up an NFS4 client + */ +static int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, + const char *ip_addr, + rpc_authflavor_t authflavour, + int proto, const struct rpc_timeout *timeparms, + u32 minorversion, struct net *net) +{ + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = addr, + .addrlen = addrlen, + .nfs_mod = &nfs_v4, + .proto = proto, + .minorversion = minorversion, + .net = net, + }; + struct nfs_client *clp; + int error; + + dprintk("--> nfs4_set_client()\n"); + + if (server->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (server->options & NFS_OPTION_MIGRATION) + set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); + if (IS_ERR(clp)) { + error = PTR_ERR(clp); + goto error; + } + + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. + */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); + + server->nfs_client = clp; + dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); + return 0; +error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; +} + +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, + u32 minor_version, rpc_authflavor_t au_flavor) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .nfs_mod = &nfs_v4, + .proto = ds_proto, + .minorversion = minor_version, + .net = mds_clp->cl_net, + }; + struct rpc_timeout ds_timeout; + struct nfs_client *clp; + char buf[INET6_ADDRSTRLEN + 1]; + + if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + return ERR_PTR(-EINVAL); + cl_init.hostname = buf; + + /* + * Set an authflavor equual to the MDS value. Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + au_flavor); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL_GPL(nfs4_set_ds_client); + +/* + * Session has been established, and the client marked ready. + * Set the mount rsize and wsize with negotiated fore channel + * attributes which will be bound checked in nfs_server_set_fsinfo. + */ +static void nfs4_session_set_rwsize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_1 + struct nfs4_session *sess; + u32 server_resp_sz; + u32 server_rqst_sz; + + if (!nfs4_has_session(server->nfs_client)) + return; + sess = server->nfs_client->cl_session; + server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; + server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + + if (server->rsize > server_resp_sz) + server->rsize = server_resp_sz; + if (server->wsize > server_rqst_sz) + server->wsize = server_rqst_sz; +#endif /* CONFIG_NFS_V4_1 */ +} + +static int nfs4_server_common_setup(struct nfs_server *server, + struct nfs_fh *mntfh, bool auth_probe) +{ + struct nfs_fattr *fattr; + int error; + + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* We must ensure the session is initialised first */ + error = nfs4_init_session(server->nfs_client); + if (error < 0) + goto out; + + /* Set the basic capabilities */ + server->caps |= server->nfs_client->cl_mvops->init_caps; + if (server->flags & NFS_MOUNT_NORDIRPLUS) + server->caps &= ~NFS_CAP_READDIRPLUS; + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && + server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_get_rootfh(server, mntfh, auth_probe); + if (error < 0) + goto out; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); + + nfs4_session_set_rwsize(server); + + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto out; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + server->destroy = nfs4_destroy_server; +out: + nfs_free_fattr(fattr); + return error; +} + +/* + * Create a version 4 volume record + */ +static int nfs4_init_server(struct nfs_server *server, + struct nfs_parsed_mount_data *data) +{ + struct rpc_timeout timeparms; + int error; + + dprintk("--> nfs4_init_server()\n"); + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->options = data->options; + server->auth_info = data->auth_info; + + /* Use the first specified auth flavor. If this flavor isn't + * allowed by the server, use the SECINFO path to try the + * other specified flavors */ + if (data->auth_info.flavor_len >= 1) + data->selected_flavor = data->auth_info.flavors[0]; + else + data->selected_flavor = RPC_AUTH_UNIX; + + /* Get a client record */ + error = nfs4_set_client(server, + data->nfs_server.hostname, + (const struct sockaddr *)&data->nfs_server.address, + data->nfs_server.addrlen, + data->client_address, + data->selected_flavor, + data->nfs_server.protocol, + &timeparms, + data->minorversion, + data->net); + if (error < 0) + goto error; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); + +error: + /* Done */ + dprintk("<-- nfs4_init_server() = %d\n", error); + return error; +} + +/* + * Create a version 4 volume record + * - keyed on server and FSID + */ +/*struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh)*/ +struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server; + bool auth_probe; + int error; + + dprintk("--> nfs4_create_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + auth_probe = mount_info->parsed->auth_info.flavor_len < 1; + + /* set up the general RPC client */ + error = nfs4_init_server(server, mount_info->parsed); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mount_info->mntfh, auth_probe); + if (error < 0) + goto error; + + dprintk("<-- nfs4_create_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 referral server record + */ +struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, + struct nfs_fh *mntfh) +{ + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; + bool auth_probe; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + + /* Initialise the client representation from the parent server */ + nfs_server_copy_userdata(server, parent_server); + + /* Get a client representation. + * Note: NFSv4 always uses TCP, */ + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + data->authflavor, + rpc_protocol(parent_server->client), + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version, + parent_client->cl_net); + if (error < 0) + goto error; + + error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); + if (error < 0) + goto error; + + auth_probe = parent_server->auth_info.flavor_len < 1; + + error = nfs4_server_common_setup(server, mntfh, auth_probe); + if (error < 0) + goto error; + + dprintk("<-- nfs_create_referral_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Grab the destination's particulars, including lease expiry time. + * + * Returns zero if probe succeeded and retrieved FSID matches the FSID + * we have cached. + */ +static int nfs_probe_destination(struct nfs_server *server) +{ + struct inode *inode = d_inode(server->super->s_root); + struct nfs_fattr *fattr; + int error; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* Sanity: the probe won't work if the destination server + * does not recognize the migrated FH. */ + error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr); + + nfs_free_fattr(fattr); + return error; +} + +/** + * nfs4_update_server - Move an nfs_server to a different nfs_client + * + * @server: represents FSID to be moved + * @hostname: new end-point's hostname + * @sap: new end-point's socket address + * @salen: size of "sap" + * @net: net namespace + * + * The nfs_server must be quiescent before this function is invoked. + * Either its session is drained (NFSv4.1+), or its transport is + * plugged and drained (NFSv4.0). + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen, struct net *net) +{ + struct nfs_client *clp = server->nfs_client; + struct rpc_clnt *clnt = server->client; + struct xprt_create xargs = { + .ident = clp->cl_proto, + .net = net, + .dstaddr = sap, + .addrlen = salen, + .servername = hostname, + }; + char buf[INET6_ADDRSTRLEN + 1]; + struct sockaddr_storage address; + struct sockaddr *localaddr = (struct sockaddr *)&address; + int error; + + dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + hostname); + + error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); + if (error != 0) { + dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", + __func__, error); + goto out; + } + + error = rpc_localaddr(clnt, localaddr, sizeof(address)); + if (error != 0) { + dprintk("<-- %s(): rpc_localaddr returned %d\n", + __func__, error); + goto out; + } + + error = -EAFNOSUPPORT; + if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { + dprintk("<-- %s(): rpc_ntop returned %d\n", + __func__, error); + goto out; + } + + nfs_server_remove_lists(server); + error = nfs4_set_client(server, hostname, sap, salen, buf, + clp->cl_rpcclient->cl_auth->au_flavor, + clp->cl_proto, clnt->cl_timeout, + clp->cl_minorversion, net); + nfs_put_client(clp); + if (error != 0) { + nfs_server_insert_lists(server); + dprintk("<-- %s(): nfs4_set_client returned %d\n", + __func__, error); + goto out; + } + + if (server->nfs_client->cl_hostname == NULL) + server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + nfs_server_insert_lists(server); + + error = nfs_probe_destination(server); + if (error < 0) + goto out; + + dprintk("<-- %s() succeeded\n", __func__); + +out: + return error; +} diff --git a/kernel/fs/nfs/nfs4file.c b/kernel/fs/nfs/nfs4file.c new file mode 100644 index 000000000..f58c17b3b --- /dev/null +++ b/kernel/fs/nfs/nfs4file.c @@ -0,0 +1,189 @@ +/* + * linux/fs/nfs/file.c + * + * Copyright (C) 1992 Rick Sladkey + */ +#include +#include +#include +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" + +#include "nfstrace.h" + +#ifdef CONFIG_NFS_V4_2 +#include "nfs42.h" +#endif + +#define NFSDBG_FACILITY NFSDBG_FILE + +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + struct nfs_open_context *ctx; + struct dentry *dentry = filp->f_path.dentry; + struct dentry *parent = NULL; + struct inode *dir; + unsigned openflags = filp->f_flags; + struct iattr attr; + int opened = 0; + int err; + + /* + * If no cached dentry exists or if it's negative, NFSv4 handled the + * opens in ->lookup() or ->create(). + * + * We only get this far for a cached positive dentry. We skipped + * revalidation, so handle it here by dropping the dentry and returning + * -EOPENSTALE. The VFS will retry the lookup/create/open. + */ + + dprintk("NFS: open file(%pd2)\n", dentry); + + if ((openflags & O_ACCMODE) == 3) + openflags--; + + /* We can't create new files here */ + openflags &= ~(O_CREAT|O_EXCL); + + parent = dget_parent(dentry); + dir = d_inode(parent); + + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + err = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + + attr.ia_valid = ATTR_OPEN; + if (openflags & O_TRUNC) { + attr.ia_valid |= ATTR_SIZE; + attr.ia_size = 0; + nfs_sync_inode(inode); + } + + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + switch (err) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + if (inode != d_inode(dentry)) + goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_file_set_open_context(filp, ctx); + nfs_fscache_open_file(inode, filp); + err = 0; + +out_put_ctx: + put_nfs_open_context(ctx); +out: + dput(parent); + return err; + +out_drop: + d_drop(dentry); + err = -EOPENSTALE; + goto out_put_ctx; +} + +static int +nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + int ret; + struct inode *inode = file_inode(file); + + trace_nfs_fsync_enter(inode); + + nfs_inode_dio_wait(inode); + do { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret != 0) + break; + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); + if (!ret) + ret = pnfs_sync_inode(inode, !!datasync); + mutex_unlock(&inode->i_mutex); + /* + * If nfs_file_fsync_commit detected a server reboot, then + * resend all dirty pages that might have been covered by + * the NFS_CONTEXT_RESEND_WRITES flag + */ + start = 0; + end = LLONG_MAX; + } while (ret == -EAGAIN); + + trace_nfs_fsync_exit(inode, ret); + return ret; +} + +#ifdef CONFIG_NFS_V4_2 +static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) +{ + loff_t ret; + + switch (whence) { + case SEEK_HOLE: + case SEEK_DATA: + ret = nfs42_proc_llseek(filep, offset, whence); + if (ret != -ENOTSUPP) + return ret; + default: + return nfs_file_llseek(filep, offset, whence); + } +} + +static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(filep); + long ret; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + return -EOPNOTSUPP; + + ret = inode_newsize_ok(inode, offset + len); + if (ret < 0) + return ret; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return nfs42_proc_deallocate(filep, offset, len); + return nfs42_proc_allocate(filep, offset, len); +} +#endif /* CONFIG_NFS_V4_2 */ + +const struct file_operations nfs4_file_operations = { +#ifdef CONFIG_NFS_V4_2 + .llseek = nfs4_file_llseek, +#else + .llseek = nfs_file_llseek, +#endif + .read_iter = nfs_file_read, + .write_iter = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs4_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = iter_file_splice_write, +#ifdef CONFIG_NFS_V4_2 + .fallocate = nfs42_fallocate, +#endif /* CONFIG_NFS_V4_2 */ + .check_flags = nfs_check_flags, + .setlease = simple_nosetlease, +}; diff --git a/kernel/fs/nfs/nfs4getroot.c b/kernel/fs/nfs/nfs4getroot.c new file mode 100644 index 000000000..c0b3a16b4 --- /dev/null +++ b/kernel/fs/nfs/nfs4getroot.c @@ -0,0 +1,50 @@ +/* +* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. +* Written by David Howells (dhowells@redhat.com) +*/ + +#include +#include "nfs4_fs.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) +{ + struct nfs_fsinfo fsinfo; + int ret = -ENOMEM; + + dprintk("--> nfs4_get_rootfh()\n"); + + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) + goto out; + + /* Start by getting the root filehandle from the server */ + ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe); + if (ret < 0) { + dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); + goto out; + } + + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) + || !S_ISDIR(fsinfo.fattr->mode)) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); + ret = -ENOTDIR; + goto out; + } + + if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); + ret = -EREMOTE; + goto out; + } + + memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +out: + nfs_free_fattr(fsinfo.fattr); + dprintk("<-- nfs4_get_rootfh() = %d\n", ret); + return ret; +} diff --git a/kernel/fs/nfs/nfs4idmap.c b/kernel/fs/nfs/nfs4idmap.c new file mode 100644 index 000000000..2e1737c40 --- /dev/null +++ b/kernel/fs/nfs/nfs4idmap.c @@ -0,0 +1,792 @@ +/* + * fs/nfs/idmap.c + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "netns.h" +#include "nfs4idmap.h" +#include "nfs4trace.h" + +#define NFS_UINT_MAXLEN 11 + +static const struct cred *id_resolver_cache; +static struct key_type key_type_id_resolver_legacy; + +struct idmap_legacy_upcalldata { + struct rpc_pipe_msg pipe_msg; + struct idmap_msg idmap_msg; + struct key_construction *key_cons; + struct idmap *idmap; +}; + +struct idmap { + struct rpc_pipe_dir_object idmap_pdo; + struct rpc_pipe *idmap_pipe; + struct idmap_legacy_upcalldata *idmap_upcall_data; + struct mutex idmap_mutex; +}; + +/** + * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields + * @fattr: fully initialised struct nfs_fattr + * @owner_name: owner name string cache + * @group_name: group name string cache + */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name) +{ + fattr->owner_name = owner_name; + fattr->group_name = group_name; +} + +static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; + kfree(fattr->owner_name->data); +} + +static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; + kfree(fattr->group_name->data); +} + +static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *owner = fattr->owner_name; + kuid_t uid; + + if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) + return false; + if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { + fattr->uid = uid; + fattr->valid |= NFS_ATTR_FATTR_OWNER; + } + return true; +} + +static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *group = fattr->group_name; + kgid_t gid; + + if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) + return false; + if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { + fattr->gid = gid; + fattr->valid |= NFS_ATTR_FATTR_GROUP; + } + return true; +} + +/** + * nfs_fattr_free_names - free up the NFSv4 owner and group strings + * @fattr: a fully initialised nfs_fattr structure + */ +void nfs_fattr_free_names(struct nfs_fattr *fattr) +{ + if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) + nfs_fattr_free_owner_name(fattr); + if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) + nfs_fattr_free_group_name(fattr); +} + +/** + * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free + * @server: pointer to the filesystem nfs_server structure + * @fattr: a fully initialised nfs_fattr structure + * + * This helper maps the cached NFSv4 owner/group strings in fattr into + * their numeric uid/gid equivalents, and then frees the cached strings. + */ +void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) +{ + if (nfs_fattr_map_owner_name(server, fattr)) + nfs_fattr_free_owner_name(fattr); + if (nfs_fattr_map_group_name(server, fattr)) + nfs_fattr_free_group_name(fattr); +} + +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (kstrtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} +EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric); + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} + +static struct key_type key_type_id_resolver = { + .name = "id_resolver", + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +static int nfs_idmap_init_keyring(void) +{ + struct cred *cred; + struct key *keyring; + int ret = 0; + + printk(KERN_NOTICE "NFS: Registering the %s key type\n", + key_type_id_resolver.name); + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = keyring_alloc(".id_resolver", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_id_resolver_legacy); + if (ret < 0) + goto failed_reg_legacy; + + set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; + return 0; + +failed_reg_legacy: + unregister_key_type(&key_type_id_resolver); +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +static void nfs_idmap_quit_keyring(void) +{ + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + unregister_key_type(&key_type_id_resolver_legacy); + put_cred(id_resolver_cache); +} + +/* + * Assemble the description to pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. + */ +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) +{ + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!*desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; +} + +static struct key *nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, struct idmap *idmap) +{ + char *desc; + struct key *rkey; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + return ERR_PTR(ret); + + rkey = request_key(&key_type_id_resolver, desc, ""); + if (IS_ERR(rkey)) { + mutex_lock(&idmap->idmap_mutex); + rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, + desc, "", 0, idmap); + mutex_unlock(&idmap->idmap_mutex); + } + if (!IS_ERR(rkey)) + set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); + + kfree(desc); + return rkey; +} + +static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, + const char *type, void *data, + size_t data_size, struct idmap *idmap) +{ + const struct cred *saved_cred; + struct key *rkey; + struct user_key_payload *payload; + ssize_t ret; + + saved_cred = override_creds(id_resolver_cache); + rkey = nfs_idmap_request_key(name, namelen, type, idmap); + revert_creds(saved_cred); + + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.rcudata); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; +} + +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, + size_t buflen, struct idmap *idmap) +{ + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); + if (ret < 0) + return -EINVAL; + return ret; +} + +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, + __u32 *id, struct idmap *idmap) +{ + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = kstrtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; +} + +/* idmap classic begins here */ + +enum { + Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err +}; + +static const match_table_t nfs_idmap_tokens = { + { Opt_find_uid, "uid:%s" }, + { Opt_find_gid, "gid:%s" }, + { Opt_find_user, "user:%s" }, + { Opt_find_group, "group:%s" }, + { Opt_find_err, NULL } +}; + +static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); +static ssize_t idmap_pipe_downcall(struct file *, const char __user *, + size_t); +static void idmap_release_pipe(struct inode *); +static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); + +static const struct rpc_pipe_ops idmap_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = idmap_pipe_downcall, + .release_pipe = idmap_release_pipe, + .destroy_msg = idmap_pipe_destroy_msg, +}; + +static struct key_type key_type_id_resolver_legacy = { + .name = "id_legacy", + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, + .request_key = nfs_idmap_legacy_upcall, +}; + +static void nfs_idmap_pipe_destroy(struct dentry *dir, + struct rpc_pipe_dir_object *pdo) +{ + struct idmap *idmap = pdo->pdo_data; + struct rpc_pipe *pipe = idmap->idmap_pipe; + + if (pipe->dentry) { + rpc_unlink(pipe->dentry); + pipe->dentry = NULL; + } +} + +static int nfs_idmap_pipe_create(struct dentry *dir, + struct rpc_pipe_dir_object *pdo) +{ + struct idmap *idmap = pdo->pdo_data; + struct rpc_pipe *pipe = idmap->idmap_pipe; + struct dentry *dentry; + + dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + pipe->dentry = dentry; + return 0; +} + +static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { + .create = nfs_idmap_pipe_create, + .destroy = nfs_idmap_pipe_destroy, +}; + +int +nfs_idmap_new(struct nfs_client *clp) +{ + struct idmap *idmap; + struct rpc_pipe *pipe; + int error; + + idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); + if (idmap == NULL) + return -ENOMEM; + + rpc_init_pipe_dir_object(&idmap->idmap_pdo, + &nfs_idmap_pipe_dir_object_ops, + idmap); + + pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); + if (IS_ERR(pipe)) { + error = PTR_ERR(pipe); + goto err; + } + idmap->idmap_pipe = pipe; + mutex_init(&idmap->idmap_mutex); + + error = rpc_add_pipe_dir_object(clp->cl_net, + &clp->cl_rpcclient->cl_pipedir_objects, + &idmap->idmap_pdo); + if (error) + goto err_destroy_pipe; + + clp->cl_idmap = idmap; + return 0; +err_destroy_pipe: + rpc_destroy_pipe_data(idmap->idmap_pipe); +err: + kfree(idmap); + return error; +} + +void +nfs_idmap_delete(struct nfs_client *clp) +{ + struct idmap *idmap = clp->cl_idmap; + + if (!idmap) + return; + clp->cl_idmap = NULL; + rpc_remove_pipe_dir_object(clp->cl_net, + &clp->cl_rpcclient->cl_pipedir_objects, + &idmap->idmap_pdo); + rpc_destroy_pipe_data(idmap->idmap_pipe); + kfree(idmap); +} + +int nfs_idmap_init(void) +{ + int ret; + ret = nfs_idmap_init_keyring(); + if (ret != 0) + goto out; +out: + return ret; +} + +void nfs_idmap_quit(void) +{ + nfs_idmap_quit_keyring(); +} + +static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, + struct idmap_msg *im, + struct rpc_pipe_msg *msg) +{ + substring_t substr; + int token, ret; + + im->im_type = IDMAP_TYPE_GROUP; + token = match_token(desc, nfs_idmap_tokens, &substr); + + switch (token) { + case Opt_find_uid: + im->im_type = IDMAP_TYPE_USER; + case Opt_find_gid: + im->im_conv = IDMAP_CONV_NAMETOID; + ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); + break; + + case Opt_find_user: + im->im_type = IDMAP_TYPE_USER; + case Opt_find_group: + im->im_conv = IDMAP_CONV_IDTONAME; + ret = match_int(&substr, &im->im_id); + break; + + default: + ret = -EINVAL; + goto out; + } + + msg->data = im; + msg->len = sizeof(struct idmap_msg); + +out: + return ret; +} + +static bool +nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, + struct idmap_legacy_upcalldata *data) +{ + if (idmap->idmap_upcall_data != NULL) { + WARN_ON_ONCE(1); + return false; + } + idmap->idmap_upcall_data = data; + return true; +} + +static void +nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) +{ + struct key_construction *cons = idmap->idmap_upcall_data->key_cons; + + kfree(idmap->idmap_upcall_data); + idmap->idmap_upcall_data = NULL; + complete_request_key(cons, ret); +} + +static void +nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) +{ + if (idmap->idmap_upcall_data != NULL) + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +} + +static int nfs_idmap_legacy_upcall(struct key_construction *cons, + const char *op, + void *aux) +{ + struct idmap_legacy_upcalldata *data; + struct rpc_pipe_msg *msg; + struct idmap_msg *im; + struct idmap *idmap = (struct idmap *)aux; + struct key *key = cons->key; + int ret = -ENOMEM; + + /* msg and im are freed in idmap_pipe_destroy_msg */ + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out1; + + msg = &data->pipe_msg; + im = &data->idmap_msg; + data->idmap = idmap; + data->key_cons = cons; + + ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); + if (ret < 0) + goto out2; + + ret = -EAGAIN; + if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) + goto out2; + + ret = rpc_queue_upcall(idmap->idmap_pipe, msg); + if (ret < 0) + nfs_idmap_abort_pipe_upcall(idmap, ret); + + return ret; +out2: + kfree(data); +out1: + complete_request_key(cons, ret); + return ret; +} + +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) +{ + return key_instantiate_and_link(key, data, datalen, + id_resolver_cache->thread_keyring, + authkey); +} + +static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, + struct idmap_msg *upcall, + struct key *key, struct key *authkey) +{ + char id_str[NFS_UINT_MAXLEN]; + size_t len; + int ret = -ENOKEY; + + /* ret = -ENOKEY */ + if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) + goto out; + switch (im->im_conv) { + case IDMAP_CONV_NAMETOID: + if (strcmp(upcall->im_name, im->im_name) != 0) + break; + /* Note: here we store the NUL terminator too */ + len = sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); + break; + case IDMAP_CONV_IDTONAME: + if (upcall->im_id != im->im_id) + break; + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); + break; + default: + ret = -EINVAL; + } +out: + return ret; +} + +static ssize_t +idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(file_inode(filp)); + struct idmap *idmap = (struct idmap *)rpci->private; + struct key_construction *cons; + struct idmap_msg im; + size_t namelen_in; + int ret = -ENOKEY; + + /* If instantiation is successful, anyone waiting for key construction + * will have been woken up and someone else may now have used + * idmap_key_cons - so after this point we may no longer touch it. + */ + if (idmap->idmap_upcall_data == NULL) + goto out_noupcall; + + cons = idmap->idmap_upcall_data->key_cons; + + if (mlen != sizeof(im)) { + ret = -ENOSPC; + goto out; + } + + if (copy_from_user(&im, src, mlen) != 0) { + ret = -EFAULT; + goto out; + } + + if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { + ret = -ENOKEY; + goto out; + } + + namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); + if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { + ret = -EINVAL; + goto out; +} + + ret = nfs_idmap_read_and_verify_message(&im, + &idmap->idmap_upcall_data->idmap_msg, + cons->key, cons->authkey); + if (ret >= 0) { + key_set_timeout(cons->key, nfs_idmap_cache_timeout); + ret = mlen; + } + +out: + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +out_noupcall: + return ret; +} + +static void +idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct idmap_legacy_upcalldata *data = container_of(msg, + struct idmap_legacy_upcalldata, + pipe_msg); + struct idmap *idmap = data->idmap; + + if (msg->errno) + nfs_idmap_abort_pipe_upcall(idmap, msg->errno); +} + +static void +idmap_release_pipe(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct idmap *idmap = (struct idmap *)rpci->private; + + nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); +} + +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; + + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); + if (ret == 0) { + *uid = make_kuid(&init_user_ns, id); + if (!uid_valid(*uid)) + ret = -ERANGE; + } + trace_nfs4_map_name_to_uid(name, namelen, id, ret); + return ret; +} + +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; + + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); + if (ret == 0) { + *gid = make_kgid(&init_user_ns, id); + if (!gid_valid(*gid)) + ret = -ERANGE; + } + trace_nfs4_map_group_to_gid(name, namelen, id, ret); + return ret; +} + +int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + __u32 id; + + id = from_kuid(&init_user_ns, uid); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); + if (ret < 0) + ret = nfs_map_numeric_to_string(id, buf, buflen); + trace_nfs4_map_uid_to_name(buf, ret, id, ret); + return ret; +} +int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + __u32 id; + + id = from_kgid(&init_user_ns, gid); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); + if (ret < 0) + ret = nfs_map_numeric_to_string(id, buf, buflen); + trace_nfs4_map_gid_to_group(buf, ret, id, ret); + return ret; +} diff --git a/kernel/fs/nfs/nfs4idmap.h b/kernel/fs/nfs/nfs4idmap.h new file mode 100644 index 000000000..de44d7330 --- /dev/null +++ b/kernel/fs/nfs/nfs4idmap.h @@ -0,0 +1,68 @@ +/* + * fs/nfs/nfs4idmap.h + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef NFS_IDMAP_H +#define NFS_IDMAP_H + +#include +#include + + +/* Forward declaration to make this header independent of others */ +struct nfs_client; +struct nfs_server; +struct nfs_fattr; +struct nfs4_string; + +int nfs_idmap_init(void); +void nfs_idmap_quit(void); +int nfs_idmap_new(struct nfs_client *); +void nfs_idmap_delete(struct nfs_client *); + +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name); +void nfs_fattr_free_names(struct nfs_fattr *); +void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *); + +int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *); +int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *); +int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t); +int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t); + +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res); + +extern unsigned int nfs_idmap_cache_timeout; +#endif /* NFS_IDMAP_H */ diff --git a/kernel/fs/nfs/nfs4namespace.c b/kernel/fs/nfs/nfs4namespace.c new file mode 100644 index 000000000..f59267237 --- /dev/null +++ b/kernel/fs/nfs/nfs4namespace.c @@ -0,0 +1,523 @@ +/* + * linux/fs/nfs/nfs4namespace.c + * + * Copyright (C) 2005 Trond Myklebust + * - Modified by David Howells + * + * NFSv4 namespace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "nfs4_fs.h" +#include "dns_resolve.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +/* + * Convert the NFSv4 pathname components into a standard posix path. + * + * Note that the resulting string will be placed at the end of the buffer + */ +static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, + char *buffer, ssize_t buflen) +{ + char *end = buffer + buflen; + int n; + + *--end = '\0'; + buflen--; + + n = pathname->ncomponents; + while (--n >= 0) { + const struct nfs4_string *component = &pathname->components[n]; + buflen -= component->len + 1; + if (buflen < 0) + goto Elong; + end -= component->len; + memcpy(end, component->data, component->len); + *--end = '/'; + } + return end; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +/* + * return the path component of ":" + * nfspath - the ":" string + * end - one past the last char that could contain ":" + * returns NULL on failure + */ +static char *nfs_path_component(const char *nfspath, const char *end) +{ + char *p; + + if (*nfspath == '[') { + /* parse [] escaped IPv6 addrs */ + p = strchr(nfspath, ']'); + if (p != NULL && ++p < end && *p == ':') + return p + 1; + } else { + /* otherwise split on first colon */ + p = strchr(nfspath, ':'); + if (p != NULL && p < end) + return p + 1; + } + return NULL; +} + +/* + * Determine the mount path as a string + */ +static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) +{ + char *limit; + char *path = nfs_path(&limit, dentry, buffer, buflen, + NFS_PATH_CANONICAL); + if (!IS_ERR(path)) { + char *path_component = nfs_path_component(path, limit); + if (path_component) + return path_component; + } + return path; +} + +/* + * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we + * believe to be the server path to this dentry + */ +static int nfs4_validate_fspath(struct dentry *dentry, + const struct nfs4_fs_locations *locations, + char *page, char *page2) +{ + const char *path, *fs_path; + + path = nfs4_path(dentry, page, PAGE_SIZE); + if (IS_ERR(path)) + return PTR_ERR(path); + + fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE); + if (IS_ERR(fs_path)) + return PTR_ERR(fs_path); + + if (strncmp(path, fs_path, strlen(fs_path)) != 0) { + dprintk("%s: path %s does not begin with fsroot %s\n", + __func__, path, fs_path); + return -ENOENT; + } + + return 0; +} + +static size_t nfs_parse_server_name(char *string, size_t len, + struct sockaddr *sa, size_t salen, struct net *net) +{ + ssize_t ret; + + ret = rpc_pton(net, string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(net, string, len, sa, salen); + if (ret < 0) + ret = 0; + } + return ret; +} + +/** + * nfs_find_best_sec - Find a security mechanism supported locally + * @server: NFS server struct + * @flavors: List of security tuples returned by SECINFO procedure + * + * Return an rpc client that uses the first security mechanism in + * "flavors" that is locally supported. The "flavors" array + * is searched in the order returned from the server, per RFC 3530 + * recommendation and each flavor is checked for membership in the + * sec= mount option list if it exists. + * + * Return -EPERM if no matching flavor is found in the array. + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + * + */ +static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, + struct nfs_server *server, + struct nfs4_secinfo_flavors *flavors) +{ + rpc_authflavor_t pflavor; + struct nfs4_secinfo4 *secinfo; + unsigned int i; + + for (i = 0; i < flavors->num_flavors; i++) { + secinfo = &flavors->flavors[i]; + + switch (secinfo->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + case RPC_AUTH_GSS: + pflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + &secinfo->flavor_info); + /* does the pseudoflavor match a sec= mount opt? */ + if (pflavor != RPC_AUTH_MAXFLAVOR && + nfs_auth_info_match(&server->auth_info, pflavor)) { + struct rpc_clnt *new; + struct rpc_cred *cred; + + /* Cloning creates an rpc_auth for the flavor */ + new = rpc_clone_client_set_auth(clnt, pflavor); + if (IS_ERR(new)) + continue; + /** + * Check that the user actually can use the + * flavor. This is mostly for RPC_AUTH_GSS + * where cr_init obtains a gss context + */ + cred = rpcauth_lookupcred(new->cl_auth, 0); + if (IS_ERR(cred)) { + rpc_shutdown_client(new); + continue; + } + put_rpccred(cred); + return new; + } + } + } + return ERR_PTR(-EPERM); +} + +/** + * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup, + * return an rpc_clnt that uses the best available security flavor with + * respect to the secinfo flavor list and the sec= mount options. + * + * @clnt: RPC client to clone + * @inode: directory inode + * @name: lookup name + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + */ +struct rpc_clnt * +nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, + struct qstr *name) +{ + struct page *page; + struct nfs4_secinfo_flavors *flavors; + struct rpc_clnt *new; + int err; + + page = alloc_page(GFP_KERNEL); + if (!page) + return ERR_PTR(-ENOMEM); + + flavors = page_address(page); + + err = nfs4_proc_secinfo(inode, name, flavors); + if (err < 0) { + new = ERR_PTR(err); + goto out; + } + + new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors); + +out: + put_page(page); + return new; +} + +static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client); + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + unsigned int maxbuflen; + unsigned int s; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return ERR_CAST(mnt_path); + mountdata->mnt_path = mnt_path; + maxbuflen = mnt_path - 1 - page2; + + mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); + if (mountdata->addr == NULL) + return ERR_PTR(-ENOMEM); + + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + + if (buf->len <= 0 || buf->len >= maxbuflen) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + mountdata->addr, addr_bufsize, net); + if (mountdata->addrlen == 0) + continue; + + rpc_set_port(mountdata->addr, NFS_PORT); + + memcpy(page2, buf->data, buf->len); + page2[buf->len] = '\0'; + mountdata->hostname = page2; + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; + } + kfree(mountdata->addr); + return mnt; +} + +/** + * nfs_follow_referral - set up mountpoint when hitting a referral on moved error + * @dentry - parent directory + * @locations - array of NFSv4 server location information + * + */ +static struct vfsmount *nfs_follow_referral(struct dentry *dentry, + const struct nfs4_fs_locations *locations) +{ + struct vfsmount *mnt = ERR_PTR(-ENOENT); + struct nfs_clone_mount mountdata = { + .sb = dentry->d_sb, + .dentry = dentry, + .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor, + }; + char *page = NULL, *page2 = NULL; + int loc, error; + + if (locations == NULL || locations->nlocations <= 0) + goto out; + + dprintk("%s: referral at %pd2\n", __func__, dentry); + + page = (char *) __get_free_page(GFP_USER); + if (!page) + goto out; + + page2 = (char *) __get_free_page(GFP_USER); + if (!page2) + goto out; + + /* Ensure fs path is a prefix of current dentry path */ + error = nfs4_validate_fspath(dentry, locations, page, page2); + if (error < 0) { + mnt = ERR_PTR(error); + goto out; + } + + for (loc = 0; loc < locations->nlocations; loc++) { + const struct nfs4_fs_location *location = &locations->locations[loc]; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) + continue; + + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; + } + +out: + free_page((unsigned long) page); + free_page((unsigned long) page2); + dprintk("%s: done\n", __func__); + return mnt; +} + +/* + * nfs_do_refmount - handle crossing a referral on server + * @dentry - dentry of referral + * + */ +static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry) +{ + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + struct dentry *parent; + struct nfs4_fs_locations *fs_locations = NULL; + struct page *page; + int err; + + /* BUG_ON(IS_ROOT(dentry)); */ + dprintk("%s: enter\n", __func__); + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out; + + fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (fs_locations == NULL) + goto out_free; + + /* Get locations */ + mnt = ERR_PTR(-ENOENT); + + parent = dget_parent(dentry); + dprintk("%s: getting locations for %pd2\n", + __func__, dentry); + + err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); + dput(parent); + if (err != 0 || + fs_locations->nlocations <= 0 || + fs_locations->fs_path.ncomponents <= 0) + goto out_free; + + mnt = nfs_follow_referral(dentry, fs_locations); +out_free: + __free_page(page); + kfree(fs_locations); +out: + dprintk("%s: done\n", __func__); + return mnt; +} + +struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; + struct dentry *parent = dget_parent(dentry); + struct inode *dir = d_inode(parent); + struct qstr *name = &dentry->d_name; + struct rpc_clnt *client; + struct vfsmount *mnt; + + /* Look it up again to get its attributes and sec flavor */ + client = nfs4_proc_lookup_mountpoint(dir, name, fh, fattr); + dput(parent); + if (IS_ERR(client)) + return ERR_CAST(client); + + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + mnt = nfs_do_refmount(client, dentry); + goto out; + } + + if (client->cl_auth->au_flavor != flavor) + flavor = client->cl_auth->au_flavor; + mnt = nfs_do_submount(dentry, fh, fattr, flavor); +out: + rpc_shutdown_client(client); + return mnt; +} + +/* + * Try one location from the fs_locations array. + * + * Returns zero on success, or a negative errno value. + */ +static int nfs4_try_replacing_one_location(struct nfs_server *server, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct net *net = rpc_net_ns(server->client); + struct sockaddr *sap; + unsigned int s; + size_t salen; + int error; + + sap = kmalloc(addr_bufsize, GFP_KERNEL); + if (sap == NULL) + return -ENOMEM; + + error = -ENOENT; + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + char *hostname; + + if (buf->len <= 0 || buf->len > PAGE_SIZE) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL) + continue; + + salen = nfs_parse_server_name(buf->data, buf->len, + sap, addr_bufsize, net); + if (salen == 0) + continue; + rpc_set_port(sap, NFS_PORT); + + error = -ENOMEM; + hostname = kstrndup(buf->data, buf->len, GFP_KERNEL); + if (hostname == NULL) + break; + + error = nfs4_update_server(server, hostname, sap, salen, net); + kfree(hostname); + if (error == 0) + break; + } + + kfree(sap); + return error; +} + +/** + * nfs4_replace_transport - set up transport to destination server + * + * @server: export being migrated + * @locations: fs_locations array + * + * Returns zero on success, or a negative errno value. + * + * The client tries all the entries in the "locations" array, in the + * order returned by the server, until one works or the end of the + * array is reached. + */ +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations) +{ + char *page = NULL, *page2 = NULL; + int loc, error; + + error = -ENOENT; + if (locations == NULL || locations->nlocations <= 0) + goto out; + + error = -ENOMEM; + page = (char *) __get_free_page(GFP_USER); + if (!page) + goto out; + page2 = (char *) __get_free_page(GFP_USER); + if (!page2) + goto out; + + for (loc = 0; loc < locations->nlocations; loc++) { + const struct nfs4_fs_location *location = + &locations->locations[loc]; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) + continue; + + error = nfs4_try_replacing_one_location(server, page, + page2, location); + if (error == 0) + break; + } + +out: + free_page((unsigned long)page); + free_page((unsigned long)page2); + return error; +} diff --git a/kernel/fs/nfs/nfs4proc.c b/kernel/fs/nfs/nfs4proc.c new file mode 100644 index 000000000..55e1e3af2 --- /dev/null +++ b/kernel/fs/nfs/nfs4proc.c @@ -0,0 +1,8687 @@ +/* + * fs/nfs/nfs4proc.c + * + * Client-side procedure declarations for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "callback.h" +#include "pnfs.h" +#include "netns.h" +#include "nfs4idmap.h" +#include "nfs4session.h" +#include "fscache.h" + +#include "nfs4trace.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +#define NFS4_POLL_RETRY_MIN (HZ/10) +#define NFS4_POLL_RETRY_MAX (15*HZ) + +struct nfs4_opendata; +static int _nfs4_proc_open(struct nfs4_opendata *data); +static int _nfs4_recover_proc_open(struct nfs4_opendata *data); +static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *); +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); +static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel); +#ifdef CONFIG_NFS_V4_1 +static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, + struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, + struct rpc_cred *); +#endif + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *label) +{ + int err; + + if (label == NULL) + return NULL; + + if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) + return NULL; + + err = security_dentry_init_security(dentry, sattr->ia_mode, + &dentry->d_name, (void **)&label->label, &label->len); + if (err == 0) + return label; + + return NULL; +} +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ + if (label) + security_release_secctx(label->label, label->len); +} +static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ + if (label) + return server->attr_bitmask; + + return server->attr_bitmask_nl; +} +#else +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *l) +{ return NULL; } +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ return; } +static inline u32 * +nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ return server->attr_bitmask; } +#endif + +/* Prevent leaks of NFSv4 errors into userland */ +static int nfs4_map_errors(int err) +{ + if (err >= -1000) + return err; + switch (err) { + case -NFS4ERR_RESOURCE: + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: + return -EREMOTEIO; + case -NFS4ERR_WRONGSEC: + case -NFS4ERR_WRONG_CRED: + return -EPERM; + case -NFS4ERR_BADOWNER: + case -NFS4ERR_BADNAME: + return -EINVAL; + case -NFS4ERR_SHARE_DENIED: + return -EACCES; + case -NFS4ERR_MINOR_VERS_MISMATCH: + return -EPROTONOSUPPORT; + case -NFS4ERR_FILE_OPEN: + return -EBUSY; + default: + dprintk("%s could not handle NFSv4 error %d\n", + __func__, -err); + break; + } + return -EIO; +} + +/* + * This is our standard bitmap for GETATTR requests. + */ +const u32 nfs4_fattr_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + FATTR4_WORD2_SECURITY_LABEL +#endif +}; + +static const u32 nfs4_pnfs_open_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY, + FATTR4_WORD2_MDSTHRESHOLD +}; + +static const u32 nfs4_open_noattr_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_FILEID, +}; + +const u32 nfs4_statfs_bitmap[3] = { + FATTR4_WORD0_FILES_AVAIL + | FATTR4_WORD0_FILES_FREE + | FATTR4_WORD0_FILES_TOTAL, + FATTR4_WORD1_SPACE_AVAIL + | FATTR4_WORD1_SPACE_FREE + | FATTR4_WORD1_SPACE_TOTAL +}; + +const u32 nfs4_pathconf_bitmap[3] = { + FATTR4_WORD0_MAXLINK + | FATTR4_WORD0_MAXNAME, + 0 +}; + +const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, + FATTR4_WORD1_TIME_DELTA + | FATTR4_WORD1_FS_LAYOUT_TYPES, + FATTR4_WORD2_LAYOUT_BLKSIZE +}; + +const u32 nfs4_fs_locations_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID + | FATTR4_WORD0_FS_LOCATIONS, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID, +}; + +static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, + struct nfs4_readdir_arg *readdir) +{ + __be32 *start, *p; + + if (cookie > 2) { + readdir->cookie = cookie; + memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); + return; + } + + readdir->cookie = 0; + memset(&readdir->verifier, 0, sizeof(readdir->verifier)); + if (cookie == 2) + return; + + /* + * NFSv4 servers do not return entries for '.' and '..' + * Therefore, we fake these entries here. We let '.' + * have cookie 0 and '..' have cookie 1. Note that + * when talking to the server, we always send cookie 0 + * instead of 1 or 2. + */ + start = p = kmap_atomic(*readdir->pages); + + if (cookie == 0) { + *p++ = xdr_one; /* next */ + *p++ = xdr_zero; /* cookie, first word */ + *p++ = xdr_one; /* cookie, second word */ + *p++ = xdr_one; /* entry len */ + memcpy(p, ".\0\0\0", 4); /* entry */ + p++; + *p++ = xdr_one; /* bitmap length */ + *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ + *p++ = htonl(8); /* attribute buffer length */ + p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry))); + } + + *p++ = xdr_one; /* next */ + *p++ = xdr_zero; /* cookie, first word */ + *p++ = xdr_two; /* cookie, second word */ + *p++ = xdr_two; /* entry len */ + memcpy(p, "..\0\0", 4); /* entry */ + p++; + *p++ = xdr_one; /* bitmap length */ + *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ + *p++ = htonl(8); /* attribute buffer length */ + p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); + + readdir->pgbase = (char *)p - (char *)start; + readdir->count -= readdir->pgbase; + kunmap_atomic(start); +} + +static long nfs4_update_delay(long *timeout) +{ + long ret; + if (!timeout) + return NFS4_POLL_RETRY_MAX; + if (*timeout <= 0) + *timeout = NFS4_POLL_RETRY_MIN; + if (*timeout > NFS4_POLL_RETRY_MAX) + *timeout = NFS4_POLL_RETRY_MAX; + ret = *timeout; + *timeout <<= 1; + return ret; +} + +static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) +{ + int res = 0; + + might_sleep(); + + freezable_schedule_timeout_killable_unsafe( + nfs4_update_delay(timeout)); + if (fatal_signal_pending(current)) + res = -ERESTARTSYS; + return res; +} + +/* This is the error handling routine for processes that are allowed + * to sleep. + */ +int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state *state = exception->state; + struct inode *inode = exception->inode; + int ret = errorcode; + + exception->retry = 0; + switch(errorcode) { + case 0: + return 0; + case -NFS4ERR_OPENMODE: + if (inode && nfs4_have_delegation(inode, FMODE_READ)) { + nfs4_inode_return_delegation(inode); + exception->retry = 1; + return 0; + } + if (state == NULL) + break; + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + goto wait_on_recovery; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state == NULL) + break; + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) { + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + } + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; + case -NFS4ERR_MOVED: + ret = nfs4_schedule_migration_recovery(server); + if (ret < 0) + break; + goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; +#if defined(CONFIG_NFS_V4_1) + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR: %d Reset session\n", __func__, + errorcode); + nfs4_schedule_session_recovery(clp->cl_session, errorcode); + goto wait_on_recovery; +#endif /* defined(CONFIG_NFS_V4_1) */ + case -NFS4ERR_FILE_OPEN: + if (exception->timeout > HZ) { + /* We have retried a decent amount, time to + * fail + */ + ret = -EBUSY; + break; + } + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + ret = nfs4_delay(server->client, &exception->timeout); + if (ret != 0) + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_OLD_STATEID: + exception->retry = 1; + break; + case -NFS4ERR_BADOWNER: + /* The following works around a Linux server bug! */ + case -NFS4ERR_BADNAME: + if (server->caps & NFS_CAP_UIDGID_NOMAP) { + server->caps &= ~NFS_CAP_UIDGID_NOMAP; + exception->retry = 1; + printk(KERN_WARNING "NFS: v4 server %s " + "does not accept raw " + "uid/gids. " + "Reenabling the idmapper.\n", + server->nfs_client->cl_hostname); + } + } + /* We failed to handle the error */ + return nfs4_map_errors(ret); +wait_on_recovery: + ret = nfs4_wait_clnt_recover(clp); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -EIO; + if (ret == 0) + exception->retry = 1; + return ret; +} + +/* + * Return 'true' if 'clp' is using an rpc_client that is integrity protected + * or 'false' otherwise. + */ +static bool _nfs4_is_integrity_protected(struct nfs_client *clp) +{ + rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor; + + if (flavor == RPC_AUTH_GSS_KRB5I || + flavor == RPC_AUTH_GSS_KRB5P) + return true; + + return false; +} + +static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) +{ + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal,timestamp)) + clp->cl_last_renewal = timestamp; + spin_unlock(&clp->cl_lock); +} + +static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +{ + do_renew_lease(server->nfs_client, timestamp); +} + +struct nfs4_call_sync_data { + const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; +}; + +static void nfs4_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) +{ + args->sa_slot = NULL; + args->sa_cache_this = cache_reply; + args->sa_privileged = 0; + + res->sr_slot = NULL; +} + +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ + args->sa_privileged = 1; +} + +int nfs40_setup_sequence(struct nfs4_slot_table *tbl, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + struct nfs4_slot *slot; + + /* slot already allocated? */ + if (res->sr_slot != NULL) + goto out_start; + + spin_lock(&tbl->slot_tbl_lock); + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); + + args->sa_slot = slot; + res->sr_slot = slot; + +out_start: + rpc_call_start(task); + return 0; + +out_sleep: + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(nfs40_setup_sequence); + +static int nfs40_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + struct nfs4_slot *slot = res->sr_slot; + struct nfs4_slot_table *tbl; + + if (slot == NULL) + goto out; + + tbl = slot->table; + spin_lock(&tbl->slot_tbl_lock); + if (!nfs41_wake_and_assign_slot(tbl, slot)) + nfs4_free_slot(tbl, slot); + spin_unlock(&tbl->slot_tbl_lock); + + res->sr_slot = NULL; +out: + return 1; +} + +#if defined(CONFIG_NFS_V4_1) + +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +{ + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot = res->sr_slot; + bool send_new_highest_used_slotid = false; + + tbl = slot->table; + session = tbl->session; + + spin_lock(&tbl->slot_tbl_lock); + /* Be nice to the server: try to ensure that the last transmitted + * value for highest_user_slotid <= target_highest_slotid + */ + if (tbl->highest_used_slotid > tbl->target_highest_slotid) + send_new_highest_used_slotid = true; + + if (nfs41_wake_and_assign_slot(tbl, slot)) { + send_new_highest_used_slotid = false; + goto out_unlock; + } + nfs4_free_slot(tbl, slot); + + if (tbl->highest_used_slotid != NFS4_NO_SLOT) + send_new_highest_used_slotid = false; +out_unlock: + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slot = NULL; + if (send_new_highest_used_slotid) + nfs41_server_notify_highest_slotid_update(session->clp); +} + +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + struct nfs4_session *session; + struct nfs4_slot *slot = res->sr_slot; + struct nfs_client *clp; + bool interrupted = false; + int ret = 1; + + if (slot == NULL) + goto out_noaction; + /* don't increment the sequence number if the task wasn't sent */ + if (!RPC_WAS_SENT(task)) + goto out; + + session = slot->table->session; + + if (slot->interrupted) { + slot->interrupted = 0; + interrupted = true; + } + + trace_nfs4_sequence_done(session, res); + /* Check the SEQUENCE operation status */ + switch (res->sr_status) { + case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + clp = session->clp; + do_renew_lease(clp, res->sr_timestamp); + /* Check sequence flags */ + if (res->sr_status_flags != 0) + nfs4_schedule_lease_recovery(clp); + nfs41_update_target_slotid(slot->table, slot, res); + break; + case 1: + /* + * sr_status remains 1 if an RPC level error occurred. + * The server may or may not have processed the sequence + * operation.. + * Mark the slot as having hosted an interrupted RPC call. + */ + slot->interrupted = 1; + goto out; + case -NFS4ERR_DELAY: + /* The server detected a resend of the RPC call and + * returned NFS4ERR_DELAY as per Section 2.10.6.2 + * of RFC5661. + */ + dprintk("%s: slot=%u seq=%u: Operation in progress\n", + __func__, + slot->slot_nr, + slot->seq_nr); + goto out_retry; + case -NFS4ERR_BADSLOT: + /* + * The slot id we used was probably retired. Try again + * using a different slot id. + */ + goto retry_nowait; + case -NFS4ERR_SEQ_MISORDERED: + /* + * Was the last operation on this sequence interrupted? + * If so, retry after bumping the sequence number. + */ + if (interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } + /* + * Could this slot have been previously retired? + * If so, then the server may be expecting seq_nr = 1! + */ + if (slot->seq_nr != 1) { + slot->seq_nr = 1; + goto retry_nowait; + } + break; + case -NFS4ERR_SEQ_FALSE_RETRY: + ++slot->seq_nr; + goto retry_nowait; + default: + /* Just update the slot sequence no. */ + ++slot->seq_nr; + } +out: + /* The session may be reset by one of the error handlers. */ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); + nfs41_sequence_free_slot(res); +out_noaction: + return ret; +retry_nowait: + if (rpc_restart_call_prepare(task)) { + task->tk_status = 0; + ret = 0; + } + goto out; +out_retry: + if (!rpc_restart_call(task)) + goto out; + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return 0; +} +EXPORT_SYMBOL_GPL(nfs41_sequence_done); + +int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (res->sr_slot == NULL) + return 1; + if (!res->sr_slot->table->session) + return nfs40_sequence_done(task, res); + return nfs41_sequence_done(task, res); +} +EXPORT_SYMBOL_GPL(nfs4_sequence_done); + +int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + struct nfs4_slot *slot; + struct nfs4_slot_table *tbl; + + dprintk("--> %s\n", __func__); + /* slot already allocated? */ + if (res->sr_slot != NULL) + goto out_success; + + tbl = &session->fc_slot_table; + + task->tk_timeout = 0; + + spin_lock(&tbl->slot_tbl_lock); + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && + !args->sa_privileged) { + /* The state manager will wait until the slot table is empty */ + dprintk("%s session is draining\n", __func__); + goto out_sleep; + } + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* If out of memory, try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + dprintk("<-- %s: no free slots\n", __func__); + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); + + args->sa_slot = slot; + + dprintk("<-- %s slotid=%u seqid=%u\n", __func__, + slot->slot_nr, slot->seq_nr); + + res->sr_slot = slot; + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. + */ + res->sr_status = 1; + trace_nfs4_setup_sequence(session, args); +out_success: + rpc_call_start(task); + return 0; +out_sleep: + /* Privileged tasks are queued with top priority */ + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(nfs41_setup_sequence); + +static int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + + if (!session) + return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, + args, res, task); + + dprintk("--> %s clp %p session %p sr_slot %u\n", + __func__, session->clp, session, res->sr_slot ? + res->sr_slot->slot_nr : NFS4_NO_SLOT); + + ret = nfs41_setup_sequence(session, args, res, task); + + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + struct nfs4_session *session = nfs4_get_session(data->seq_server); + + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + + nfs41_setup_sequence(session, data->seq_args, data->seq_res, task); +} + +static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + + nfs41_sequence_done(task, data->seq_res); +} + +static const struct rpc_call_ops nfs41_call_sync_ops = { + .rpc_call_prepare = nfs41_call_sync_prepare, + .rpc_call_done = nfs41_call_sync_done, +}; + +#else /* !CONFIG_NFS_V4_1 */ + +static int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, + args, res, task); +} + +int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + return nfs40_sequence_done(task, res); +} +EXPORT_SYMBOL_GPL(nfs4_sequence_done); + +#endif /* !CONFIG_NFS_V4_1 */ + +static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + nfs4_setup_sequence(data->seq_server, + data->seq_args, data->seq_res, task); +} + +static void nfs40_call_sync_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + nfs4_sequence_done(task, data->seq_res); +} + +static const struct rpc_call_ops nfs40_call_sync_ops = { + .rpc_call_prepare = nfs40_call_sync_prepare, + .rpc_call_done = nfs40_call_sync_done, +}; + +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res) +{ + int ret; + struct rpc_task *task; + struct nfs_client *clp = server->nfs_client; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = args, + .seq_res = res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = msg, + .callback_ops = clp->cl_mvops->call_sync_ops, + .callback_data = &data + }; + + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else { + ret = task->tk_status; + rpc_put_task(task); + } + return ret; +} + +int nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + nfs4_init_sequence(args, res, cache_reply); + return nfs4_call_sync_sequence(clnt, server, msg, args, res); +} + +static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) +{ + struct nfs_inode *nfsi = NFS_I(dir); + + spin_lock(&dir->i_lock); + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (!cinfo->atomic || cinfo->before != dir->i_version) + nfs_force_lookup_revalidate(dir); + dir->i_version = cinfo->after; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); + nfs_fscache_invalidate(dir); + spin_unlock(&dir->i_lock); +} + +struct nfs4_opendata { + struct kref kref; + struct nfs_openargs o_arg; + struct nfs_openres o_res; + struct nfs_open_confirmargs c_arg; + struct nfs_open_confirmres c_res; + struct nfs4_string owner_name; + struct nfs4_string group_name; + struct nfs_fattr f_attr; + struct nfs4_label *f_label; + struct dentry *dir; + struct dentry *dentry; + struct nfs4_state_owner *owner; + struct nfs4_state *state; + struct iattr attrs; + unsigned long timestamp; + unsigned int rpc_done : 1; + unsigned int file_created : 1; + unsigned int is_recover : 1; + int rpc_status; + int cancelled; +}; + +static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + if (err != -EINVAL) + return false; + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + return false; + server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1; + exception->retry = 1; + return true; +} + +static u32 +nfs4_map_atomic_open_share(struct nfs_server *server, + fmode_t fmode, int openflags) +{ + u32 res = 0; + + switch (fmode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + res = NFS4_SHARE_ACCESS_READ; + break; + case FMODE_WRITE: + res = NFS4_SHARE_ACCESS_WRITE; + break; + case FMODE_READ|FMODE_WRITE: + res = NFS4_SHARE_ACCESS_BOTH; + } + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + goto out; + /* Want no delegation if we're using O_DIRECT */ + if (openflags & O_DIRECT) + res |= NFS4_SHARE_WANT_NO_DELEG; +out: + return res; +} + +static enum open_claim_type4 +nfs4_map_atomic_open_claim(struct nfs_server *server, + enum open_claim_type4 claim) +{ + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + return claim; + switch (claim) { + default: + return claim; + case NFS4_OPEN_CLAIM_FH: + return NFS4_OPEN_CLAIM_NULL; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + return NFS4_OPEN_CLAIM_DELEGATE_CUR; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + return NFS4_OPEN_CLAIM_DELEGATE_PREV; + } +} + +static void nfs4_init_opendata_res(struct nfs4_opendata *p) +{ + p->o_res.f_attr = &p->f_attr; + p->o_res.f_label = p->f_label; + p->o_res.seqid = p->o_arg.seqid; + p->c_res.seqid = p->c_arg.seqid; + p->o_res.server = p->o_arg.server; + p->o_res.access_request = p->o_arg.access; + nfs_fattr_init(&p->f_attr); + nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); +} + +static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, + struct nfs4_state_owner *sp, fmode_t fmode, int flags, + const struct iattr *attrs, + struct nfs4_label *label, + enum open_claim_type4 claim, + gfp_t gfp_mask) +{ + struct dentry *parent = dget_parent(dentry); + struct inode *dir = d_inode(parent); + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + struct nfs4_opendata *p; + + p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + goto err; + + p->f_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->f_label)) + goto err_free_p; + + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; + p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask); + if (IS_ERR(p->o_arg.seqid)) + goto err_free_label; + nfs_sb_active(dentry->d_sb); + p->dentry = dget(dentry); + p->dir = parent; + p->owner = sp; + atomic_inc(&sp->so_count); + p->o_arg.open_flags = flags; + p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + p->o_arg.share_access = nfs4_map_atomic_open_share(server, + fmode, flags); + /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS + * will return permission denied for all bits until close */ + if (!(flags & O_EXCL)) { + /* ask server to check for all possible rights as results + * are cached */ + p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; + } + p->o_arg.clientid = server->nfs_client->cl_clientid; + p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); + p->o_arg.id.uniquifier = sp->so_seqid.owner_id; + p->o_arg.name = &dentry->d_name; + p->o_arg.server = server; + p->o_arg.bitmask = nfs4_bitmask(server, label); + p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; + p->o_arg.label = label; + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); + switch (p->o_arg.claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + p->o_arg.fh = NFS_FH(dir); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + p->o_arg.fh = NFS_FH(d_inode(dentry)); + } + if (attrs != NULL && attrs->ia_valid != 0) { + __u32 verf[2]; + + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); + + verf[0] = jiffies; + verf[1] = current->pid; + memcpy(p->o_arg.u.verifier.data, verf, + sizeof(p->o_arg.u.verifier.data)); + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; + p->c_arg.seqid = p->o_arg.seqid; + nfs4_init_opendata_res(p); + kref_init(&p->kref); + return p; + +err_free_label: + nfs4_label_free(p->f_label); +err_free_p: + kfree(p); +err: + dput(parent); + return NULL; +} + +static void nfs4_opendata_free(struct kref *kref) +{ + struct nfs4_opendata *p = container_of(kref, + struct nfs4_opendata, kref); + struct super_block *sb = p->dentry->d_sb; + + nfs_free_seqid(p->o_arg.seqid); + if (p->state != NULL) + nfs4_put_open_state(p->state); + nfs4_put_state_owner(p->owner); + + nfs4_label_free(p->f_label); + + dput(p->dir); + dput(p->dentry); + nfs_sb_deactive(sb); + nfs_fattr_free_names(&p->f_attr); + kfree(p->f_attr.mdsthreshold); + kfree(p); +} + +static void nfs4_opendata_put(struct nfs4_opendata *p) +{ + if (p != NULL) + kref_put(&p->kref, nfs4_opendata_free); +} + +static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) +{ + int ret; + + ret = rpc_wait_for_completion_task(task); + return ret; +} + +static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) +{ + int ret = 0; + + if (open_mode & (O_EXCL|O_TRUNC)) + goto out; + switch (mode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 + && state->n_rdonly != 0; + break; + case FMODE_WRITE: + ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 + && state->n_wronly != 0; + break; + case FMODE_READ|FMODE_WRITE: + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 + && state->n_rdwr != 0; + } +out: + return ret; +} + +static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) +{ + if (delegation == NULL) + return 0; + if ((delegation->type & fmode) != fmode) + return 0; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return 0; + nfs_mark_delegation_referenced(delegation); + return 1; +} + +static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) +{ + switch (fmode) { + case FMODE_WRITE: + state->n_wronly++; + break; + case FMODE_READ: + state->n_rdonly++; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr++; + } + nfs4_state_set_mode_locked(state, state->state | fmode); +} + +static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) +{ + struct nfs_client *clp = state->owner->so_server->nfs_client; + bool need_recover = false; + + if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly) + need_recover = true; + if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly) + need_recover = true; + if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr) + need_recover = true; + if (need_recover) + nfs4_state_mark_reclaim_nograce(clp, state); +} + +static bool nfs_need_update_open_stateid(struct nfs4_state *state, + nfs4_stateid *stateid) +{ + if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) + return true; + if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs_test_and_clear_all_open_stateid(state); + return true; + } + if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + return true; + return false; +} + +static void nfs_resync_open_stateid_locked(struct nfs4_state *state) +{ + if (state->n_wronly) + set_bit(NFS_O_WRONLY_STATE, &state->flags); + if (state->n_rdonly) + set_bit(NFS_O_RDONLY_STATE, &state->flags); + if (state->n_rdwr) + set_bit(NFS_O_RDWR_STATE, &state->flags); +} + +static void nfs_clear_open_stateid_locked(struct nfs4_state *state, + nfs4_stateid *stateid, fmode_t fmode) +{ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_WRITE: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_READ: + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case 0: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } + if (stateid == NULL) + return; + /* Handle races with OPEN */ + if (!nfs4_stateid_match_other(stateid, &state->open_stateid) || + !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + nfs_resync_open_stateid_locked(state); + return; + } + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + nfs4_stateid_copy(&state->stateid, stateid); + nfs4_stateid_copy(&state->open_stateid, stateid); +} + +static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + write_seqlock(&state->seqlock); + nfs_clear_open_stateid_locked(state, stateid, fmode); + write_sequnlock(&state->seqlock); + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_WRITE: + set_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case FMODE_READ|FMODE_WRITE: + set_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (!nfs_need_update_open_stateid(state, stateid)) + return; + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + nfs4_stateid_copy(&state->stateid, stateid); + nfs4_stateid_copy(&state->open_stateid, stateid); +} + +static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) +{ + /* + * Protect the call to nfs4_state_set_mode_locked and + * serialise the stateid update + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { + nfs4_stateid_copy(&state->stateid, deleg_stateid); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) + nfs_set_open_stateid_locked(state, open_stateid, fmode); + write_sequnlock(&state->seqlock); + spin_lock(&state->owner->so_lock); + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); +} + +static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *deleg_cur; + int ret = 0; + + fmode &= (FMODE_READ|FMODE_WRITE); + + rcu_read_lock(); + deleg_cur = rcu_dereference(nfsi->delegation); + if (deleg_cur == NULL) + goto no_delegation; + + spin_lock(&deleg_cur->lock); + if (rcu_dereference(nfsi->delegation) != deleg_cur || + test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || + (deleg_cur->type & fmode) != fmode) + goto no_delegation_unlock; + + if (delegation == NULL) + delegation = &deleg_cur->stateid; + else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation)) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + ret = 1; +no_delegation_unlock: + spin_unlock(&deleg_cur->lock); +no_delegation: + rcu_read_unlock(); + + if (!ret && open_stateid != NULL) { + __update_open_stateid(state, open_stateid, NULL, fmode); + ret = 1; + } + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); + + return ret; +} + +static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp, + const nfs4_stateid *stateid) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret = false; + + spin_lock(&state->state_lock); + if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid)) + goto out_noupdate; + if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid)) + goto out_noupdate; + nfs4_stateid_copy(&lsp->ls_stateid, stateid); + ret = true; +out_noupdate: + spin_unlock(&state->state_lock); + return ret; +} + +static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL || (delegation->type & fmode) == fmode) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + nfs4_inode_return_delegation(inode); +} + +static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) +{ + struct nfs4_state *state = opendata->state; + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *delegation; + int open_mode = opendata->o_arg.open_flags; + fmode_t fmode = opendata->o_arg.fmode; + nfs4_stateid stateid; + int ret = -EAGAIN; + + for (;;) { + spin_lock(&state->owner->so_lock); + if (can_open_cached(state, fmode, open_mode)) { + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); + goto out_return_state; + } + spin_unlock(&state->owner->so_lock); + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (!can_open_delegated(delegation, fmode)) { + rcu_read_unlock(); + break; + } + /* Save the delegation */ + nfs4_stateid_copy(&stateid, &delegation->stateid); + rcu_read_unlock(); + nfs_release_seqid(opendata->o_arg.seqid); + if (!opendata->is_recover) { + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) + goto out; + } + ret = -EAGAIN; + + /* Try to update the stateid using the delegation */ + if (update_open_stateid(state, NULL, &stateid, fmode)) + goto out_return_state; + } +out: + return ERR_PTR(ret); +out_return_state: + atomic_inc(&state->count); + return state; +} + +static void +nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) +{ + struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client; + struct nfs_delegation *delegation; + int delegation_flags = 0; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation) + delegation_flags = delegation->flags; + rcu_read_unlock(); + if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { + pr_err_ratelimited("NFS: Broken NFSv4 server %s is " + "returning a delegation for " + "OPEN(CLAIM_DELEGATE_CUR)\n", + clp->cl_hostname); + } else if ((delegation_flags & 1UL<inode, + data->owner->so_cred, + &data->o_res); + else + nfs_inode_reclaim_delegation(state->inode, + data->owner->so_cred, + &data->o_res); +} + +/* + * Check the inode attributes against the CLAIM_PREVIOUS returned attributes + * and update the nfs4_state. + */ +static struct nfs4_state * +_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode = data->state->inode; + struct nfs4_state *state = data->state; + int ret; + + if (!data->rpc_done) { + if (data->rpc_status) { + ret = data->rpc_status; + goto err; + } + /* cached opens have already been processed */ + goto update; + } + + ret = nfs_refresh_inode(inode, &data->f_attr); + if (ret) + goto err; + + if (data->o_res.delegation_type != 0) + nfs4_opendata_check_deleg(data, state); +update: + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); + atomic_inc(&state->count); + + return state; +err: + return ERR_PTR(ret); + +} + +static struct nfs4_state * +_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode; + struct nfs4_state *state = NULL; + int ret; + + if (!data->rpc_done) { + state = nfs4_try_open_cached(data); + goto out; + } + + ret = -EAGAIN; + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) + goto err; + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label); + ret = PTR_ERR(inode); + if (IS_ERR(inode)) + goto err; + ret = -ENOMEM; + state = nfs4_get_open_state(inode, data->owner); + if (state == NULL) + goto err_put_inode; + if (data->o_res.delegation_type != 0) + nfs4_opendata_check_deleg(data, state); + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); + iput(inode); +out: + nfs_release_seqid(data->o_arg.seqid); + return state; +err_put_inode: + iput(inode); +err: + return ERR_PTR(ret); +} + +static struct nfs4_state * +nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) + return _nfs4_opendata_reclaim_to_nfs4_state(data); + return _nfs4_opendata_to_nfs4_state(data); +} + +static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_open_context *ctx; + + spin_lock(&state->inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + get_nfs_open_context(ctx); + spin_unlock(&state->inode->i_lock); + return ctx; + } + spin_unlock(&state->inode->i_lock); + return ERR_PTR(-ENOENT); +} + +static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, + struct nfs4_state *state, enum open_claim_type4 claim) +{ + struct nfs4_opendata *opendata; + + opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, + NULL, NULL, claim, GFP_NOFS); + if (opendata == NULL) + return ERR_PTR(-ENOMEM); + opendata->state = state; + atomic_inc(&state->count); + return opendata; +} + +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res) +{ + struct nfs4_state *newstate; + int ret; + + opendata->o_arg.open_flags = 0; + opendata->o_arg.fmode = fmode; + opendata->o_arg.share_access = nfs4_map_atomic_open_share( + NFS_SB(opendata->dentry->d_sb), + fmode, 0); + memset(&opendata->o_res, 0, sizeof(opendata->o_res)); + memset(&opendata->c_res, 0, sizeof(opendata->c_res)); + nfs4_init_opendata_res(opendata); + ret = _nfs4_recover_proc_open(opendata); + if (ret != 0) + return ret; + newstate = nfs4_opendata_to_nfs4_state(opendata); + if (IS_ERR(newstate)) + return PTR_ERR(newstate); + nfs4_close_state(newstate, fmode); + *res = newstate; + return 0; +} + +static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) +{ + struct nfs4_state *newstate; + int ret; + + /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + /* memory barrier prior to reading state->n_* */ + clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + smp_rmb(); + if (state->n_rdwr != 0) { + ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + if (state->n_wronly != 0) { + ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + if (state->n_rdonly != 0) { + ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + /* + * We may have performed cached opens for all three recoveries. + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && + !nfs4_stateid_match(&state->stateid, &state->open_stateid)) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + nfs4_stateid_copy(&state->stateid, &state->open_stateid); + write_sequnlock(&state->seqlock); + } + return 0; +} + +/* + * OPEN_RECLAIM: + * reclaim state on the server after a reboot. + */ +static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_delegation *delegation; + struct nfs4_opendata *opendata; + fmode_t delegation_type = 0; + int status; + + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_PREVIOUS); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) + delegation_type = delegation->type; + rcu_read_unlock(); + opendata->o_arg.u.delegation_type = delegation_type; + status = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return status; +} + +static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_do_open_reclaim(ctx, state); + trace_nfs4_open_reclaim(ctx, 0, err); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + int ret; + + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return -EAGAIN; + ret = nfs4_do_open_reclaim(ctx, state); + put_nfs_open_context(ctx); + return ret; +} + +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) +{ + switch (err) { + default: + printk(KERN_ERR "NFS: %s: unhandled error " + "%d.\n", __func__, err); + case 0: + case -ENOENT: + case -ESTALE: + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + return -EAGAIN; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: + /* Don't recall a delegation if it was lost */ + nfs4_schedule_lease_recovery(server->nfs_client); + return -EAGAIN; + case -NFS4ERR_MOVED: + nfs4_schedule_migration_recovery(server); + return -EAGAIN; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(server->nfs_client); + return -EAGAIN; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + nfs_inode_find_state_and_recover(state->inode, + stateid); + nfs4_schedule_stateid_recovery(server, state); + return -EAGAIN; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + set_bit(NFS_DELEGATED_STATE, &state->flags); + ssleep(1); + return -EAGAIN; + case -ENOMEM: + case -NFS4ERR_DENIED: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + return 0; + } + return err; +} + +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_opendata *opendata; + int err; + + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_DELEG_CUR_FH); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); + err = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); +} + +static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl, + &data->c_arg.seq_args, &data->c_res.seq_res, task); +} + +static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + nfs40_sequence_done(task, &data->c_res.seq_res); + + data->rpc_status = task->tk_status; + if (data->rpc_status == 0) { + nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; + } +} + +static void nfs4_open_confirm_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! */ + if (!data->rpc_done) + goto out_free; + state = nfs4_opendata_to_nfs4_state(data); + if (!IS_ERR(state)) + nfs4_close_state(state, data->o_arg.fmode); +out_free: + nfs4_opendata_put(data); +} + +static const struct rpc_call_ops nfs4_open_confirm_ops = { + .rpc_call_prepare = nfs4_open_confirm_prepare, + .rpc_call_done = nfs4_open_confirm_done, + .rpc_release = nfs4_open_confirm_release, +}; + +/* + * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata + */ +static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) +{ + struct nfs_server *server = NFS_SERVER(d_inode(data->dir)); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], + .rpc_argp = &data->c_arg, + .rpc_resp = &data->c_res, + .rpc_cred = data->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_open_confirm_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status; + + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); + kref_get(&data->kref); + data->rpc_done = 0; + data->rpc_status = 0; + data->timestamp = jiffies; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + return status; +} + +static void nfs4_open_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state_owner *sp = data->owner; + struct nfs_client *clp = sp->so_server->nfs_client; + + if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) + goto out_wait; + /* + * Check if we still need to send an OPEN call, or if we can use + * a delegation instead. + */ + if (data->state != NULL) { + struct nfs_delegation *delegation; + + if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) + goto out_no_action; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); + if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && + data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && + can_open_delegated(delegation, data->o_arg.fmode)) + goto unlock_no_action; + rcu_read_unlock(); + } + /* Update client id. */ + data->o_arg.clientid = clp->cl_clientid; + switch (data->o_arg.claim) { + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; + case NFS4_OPEN_CLAIM_FH: + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; + if (nfs4_setup_sequence(data->o_arg.server, + &data->o_arg.seq_args, + &data->o_res.seq_res, + task) != 0) + nfs_release_seqid(data->o_arg.seqid); + + /* Set the create mode (note dependency on the session type) */ + data->o_arg.createmode = NFS4_CREATE_UNCHECKED; + if (data->o_arg.open_flags & O_EXCL) { + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE; + if (nfs4_has_persistent_session(clp)) + data->o_arg.createmode = NFS4_CREATE_GUARDED; + else if (clp->cl_mvops->minor_version > 0) + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1; + } + return; +unlock_no_action: + rcu_read_unlock(); +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &data->o_res.seq_res); +} + +static void nfs4_open_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + data->rpc_status = task->tk_status; + + if (!nfs4_sequence_done(task, &data->o_res.seq_res)) + return; + + if (task->tk_status == 0) { + if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) { + switch (data->o_res.f_attr->mode & S_IFMT) { + case S_IFREG: + break; + case S_IFLNK: + data->rpc_status = -ELOOP; + break; + case S_IFDIR: + data->rpc_status = -EISDIR; + break; + default: + data->rpc_status = -ENOTDIR; + } + } + renew_lease(data->o_res.server, data->timestamp); + if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) + nfs_confirm_seqid(&data->owner->so_seqid, 0); + } + data->rpc_done = 1; +} + +static void nfs4_open_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0 || !data->rpc_done) + goto out_free; + /* In case we need an open_confirm, no cleanup! */ + if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) + goto out_free; + state = nfs4_opendata_to_nfs4_state(data); + if (!IS_ERR(state)) + nfs4_close_state(state, data->o_arg.fmode); +out_free: + nfs4_opendata_put(data); +} + +static const struct rpc_call_ops nfs4_open_ops = { + .rpc_call_prepare = nfs4_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; + +static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) +{ + struct inode *dir = d_inode(data->dir); + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN], + .rpc_argp = o_arg, + .rpc_resp = o_res, + .rpc_cred = data->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_open_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status; + + nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1); + kref_get(&data->kref); + data->rpc_done = 0; + data->rpc_status = 0; + data->cancelled = 0; + data->is_recover = 0; + if (isrecover) { + nfs4_set_sequence_privileged(&o_arg->seq_args); + data->is_recover = 1; + } + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + + return status; +} + +static int _nfs4_recover_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = d_inode(data->dir); + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 1); + if (status != 0 || !data->rpc_done) + return status; + + nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr); + + if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + + return status; +} + +/* + * Additional permission checks in order to distinguish between an + * open for read, and an open for execute. This works around the + * fact that NFSv4 OPEN treats read and execute permissions as being + * the same. + * Note that in the non-execute case, we want to turn off permission + * checking if we just created a new file (POSIX open() semantics). + */ +static int nfs4_opendata_access(struct rpc_cred *cred, + struct nfs4_opendata *opendata, + struct nfs4_state *state, fmode_t fmode, + int openflags) +{ + struct nfs_access_entry cache; + u32 mask; + + /* access call failed or for some reason the server doesn't + * support any access modes -- defer access call until later */ + if (opendata->o_res.access_supported == 0) + return 0; + + mask = 0; + /* + * Use openflags to check for exec, because fmode won't + * always have FMODE_EXEC set when file open for exec. + */ + if (openflags & __FMODE_EXEC) { + /* ONLY check for exec rights */ + mask = MAY_EXEC; + } else if ((fmode & FMODE_READ) && !opendata->file_created) + mask = MAY_READ; + + cache.cred = cred; + cache.jiffies = jiffies; + nfs_access_set_mask(&cache, opendata->o_res.access_result); + nfs_access_add_cache(state->inode, &cache); + + if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) + return 0; + + /* even though OPEN succeeded, access is denied. Close the file */ + nfs4_close_state(state, fmode); + return -EACCES; +} + +/* + * Note: On error, nfs4_proc_open will free the struct nfs4_opendata + */ +static int _nfs4_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = d_inode(data->dir); + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 0); + if (!data->rpc_done) + return status; + if (status != 0) { + if (status == -NFS4ERR_BADNAME && + !(o_arg->open_flags & O_CREAT)) + return -ENOENT; + return status; + } + + nfs_fattr_map_and_free_names(server, &data->f_attr); + + if (o_arg->open_flags & O_CREAT) { + update_changeattr(dir, &o_res->cinfo); + if (o_arg->open_flags & O_EXCL) + data->file_created = 1; + else if (o_res->cinfo.before != o_res->cinfo.after) + data->file_created = 1; + } + if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) + server->caps &= ~NFS_CAP_POSIX_LOCK; + if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) + nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + return 0; +} + +static int nfs4_recover_expired_lease(struct nfs_server *server) +{ + return nfs4_client_recover_expired_lease(server->nfs_client); +} + +/* + * OPEN_EXPIRED: + * reclaim state on the server after a network partition. + * Assumes caller holds the appropriate lock + */ +static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs4_opendata *opendata; + int ret; + + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_FH); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + ret = nfs4_open_recover(opendata, state); + if (ret == -ESTALE) + d_drop(ctx->dentry); + nfs4_opendata_put(opendata); + return ret; +} + +static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs4_open_expired(ctx, state); + trace_nfs4_open_expired(ctx, 0, err); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } + } while (exception.retry); +out: + return err; +} + +static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + int ret; + + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return -EAGAIN; + ret = nfs4_do_open_expired(ctx, state); + put_nfs_open_context(ctx); + return ret; +} + +static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) +{ + nfs_remove_bad_delegation(state->inode); + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); + write_sequnlock(&state->seqlock); + clear_bit(NFS_DELEGATED_STATE, &state->flags); +} + +static void nfs40_clear_delegation_stateid(struct nfs4_state *state) +{ + if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL) + nfs_finish_clear_delegation_stateid(state); +} + +static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + /* NFSv4.0 doesn't allow for delegation recovery on open expire */ + nfs40_clear_delegation_stateid(state); + return nfs4_open_expired(sp, state); +} + +#if defined(CONFIG_NFS_V4_1) +static void nfs41_check_delegation_stateid(struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + nfs4_stateid stateid; + struct nfs_delegation *delegation; + struct rpc_cred *cred; + int status; + + /* Get the delegation credential for use by test/free_stateid */ + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation == NULL) { + rcu_read_unlock(); + return; + } + + nfs4_stateid_copy(&stateid, &delegation->stateid); + cred = get_rpccred(delegation->cred); + rcu_read_unlock(); + status = nfs41_test_stateid(server, &stateid, cred); + trace_nfs4_test_delegation_stateid(state, NULL, status); + + if (status != NFS_OK) { + /* Free the stateid unless the server explicitly + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) + nfs41_free_stateid(server, &stateid, cred); + nfs_finish_clear_delegation_stateid(state); + } + + put_rpccred(cred); +} + +/** + * nfs41_check_open_stateid - possibly free an open stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. + */ +static int nfs41_check_open_stateid(struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + nfs4_stateid *stateid = &state->open_stateid; + struct rpc_cred *cred = state->owner->so_cred; + int status; + + /* If a state reset has been done, test_stateid is unneeded */ + if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) && + (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) && + (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) + return -NFS4ERR_BAD_STATEID; + + status = nfs41_test_stateid(server, stateid, cred); + trace_nfs4_test_open_stateid(state, NULL, status); + if (status != NFS_OK) { + /* Free the stateid unless the server explicitly + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) + nfs41_free_stateid(server, stateid, cred); + + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } + return status; +} + +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + int status; + + nfs41_check_delegation_stateid(state); + status = nfs41_check_open_stateid(state); + if (status != NFS_OK) + status = nfs4_open_expired(sp, state); + return status; +} +#endif + +/* + * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* + * fields corresponding to attributes that were used to store the verifier. + * Make sure we clobber those fields in the later setattr call + */ +static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr) +{ + if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) && + !(sattr->ia_valid & ATTR_ATIME_SET)) + sattr->ia_valid |= ATTR_ATIME; + + if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) && + !(sattr->ia_valid & ATTR_MTIME_SET)) + sattr->ia_valid |= ATTR_MTIME; +} + +static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, + fmode_t fmode, + int flags, + struct nfs_open_context *ctx) +{ + struct nfs4_state_owner *sp = opendata->owner; + struct nfs_server *server = sp->so_server; + struct dentry *dentry; + struct nfs4_state *state; + unsigned int seq; + int ret; + + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + + ret = _nfs4_proc_open(opendata); + if (ret != 0) + goto out; + + state = nfs4_opendata_to_nfs4_state(opendata); + ret = PTR_ERR(state); + if (IS_ERR(state)) + goto out; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + dentry = opendata->dentry; + if (d_really_is_negative(dentry)) { + /* FIXME: Is this d_drop() ever needed? */ + d_drop(dentry); + dentry = d_add_unique(dentry, igrab(state->inode)); + if (dentry == NULL) { + dentry = opendata->dentry; + } else if (dentry != ctx->dentry) { + dput(ctx->dentry); + ctx->dentry = dget(dentry); + } + nfs_set_verifier(dentry, + nfs_save_change_attribute(d_inode(opendata->dir))); + } + + ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); + if (ret != 0) + goto out; + + ctx->state = state; + if (d_inode(dentry) == state->inode) { + nfs_inode_attach_open_context(ctx); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + nfs4_schedule_stateid_recovery(server, state); + } +out: + return ret; +} + +/* + * Returns a referenced nfs4_state + */ +static int _nfs4_do_open(struct inode *dir, + struct nfs_open_context *ctx, + int flags, + struct iattr *sattr, + struct nfs4_label *label, + int *opened) +{ + struct nfs4_state_owner *sp; + struct nfs4_state *state = NULL; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *opendata; + struct dentry *dentry = ctx->dentry; + struct rpc_cred *cred = ctx->cred; + struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; + fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); + enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; + struct nfs4_label *olabel = NULL; + int status; + + /* Protect against reboot recovery conflicts */ + status = -ENOMEM; + sp = nfs4_get_state_owner(server, cred, GFP_KERNEL); + if (sp == NULL) { + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } + status = nfs4_recover_expired_lease(server); + if (status != 0) + goto err_put_state_owner; + if (d_really_is_positive(dentry)) + nfs4_return_incompatible_delegation(d_inode(dentry), fmode); + status = -ENOMEM; + if (d_really_is_positive(dentry)) + claim = NFS4_OPEN_CLAIM_FH; + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, + label, claim, GFP_KERNEL); + if (opendata == NULL) + goto err_put_state_owner; + + if (label) { + olabel = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(olabel)) { + status = PTR_ERR(olabel); + goto err_opendata_put; + } + } + + if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { + if (!opendata->f_attr.mdsthreshold) { + opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); + if (!opendata->f_attr.mdsthreshold) + goto err_free_label; + } + opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; + } + if (d_really_is_positive(dentry)) + opendata->state = nfs4_get_open_state(d_inode(dentry), sp); + + status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); + if (status != 0) + goto err_free_label; + state = ctx->state; + + if ((opendata->o_arg.open_flags & O_EXCL) && + (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { + nfs4_exclusive_attrset(opendata, sattr); + + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state, label, olabel); + if (status == 0) { + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); + nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + } + } + if (opendata->file_created) + *opened |= FILE_CREATED; + + if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { + *ctx_th = opendata->f_attr.mdsthreshold; + opendata->f_attr.mdsthreshold = NULL; + } + + nfs4_label_free(olabel); + + nfs4_opendata_put(opendata); + nfs4_put_state_owner(sp); + return 0; +err_free_label: + nfs4_label_free(olabel); +err_opendata_put: + nfs4_opendata_put(opendata); +err_put_state_owner: + nfs4_put_state_owner(sp); +out_err: + return status; +} + + +static struct nfs4_state *nfs4_do_open(struct inode *dir, + struct nfs_open_context *ctx, + int flags, + struct iattr *sattr, + struct nfs4_label *label, + int *opened) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_exception exception = { }; + struct nfs4_state *res; + int status; + + do { + status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); + res = ctx->state; + trace_nfs4_open_file(ctx, flags, status); + if (status == 0) + break; + /* NOTE: BAD_SEQID means the server and client disagree about the + * book-keeping w.r.t. state-changing operations + * (OPEN/CLOSE/LOCK/LOCKU...) + * It is actually a sign of a bug on the client or on the server. + * + * If we receive a BAD_SEQID error in the particular case of + * doing an OPEN, we assume that nfs_increment_open_seqid() will + * have unhashed the old state_owner for us, and that we can + * therefore safely retry using a new one. We should still warn + * the user though... + */ + if (status == -NFS4ERR_BAD_SEQID) { + pr_warn_ratelimited("NFS: v4 server %s " + " returned a bad sequence-id error!\n", + NFS_SERVER(dir)->nfs_client->cl_hostname); + exception.retry = 1; + continue; + } + /* + * BAD_STATEID on OPEN means that the server cancelled our + * state before it received the OPEN_CONFIRM. + * Recover by retrying the request as per the discussion + * on Page 181 of RFC3530. + */ + if (status == -NFS4ERR_BAD_STATEID) { + exception.retry = 1; + continue; + } + if (status == -EAGAIN) { + /* We must have found a delegation */ + exception.retry = 1; + continue; + } + if (nfs4_clear_cap_atomic_open_v1(server, status, &exception)) + continue; + res = ERR_PTR(nfs4_handle_exception(server, + status, &exception)); + } while (exception.retry); + return res; +} + +static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, + .server = server, + .bitmask = server->attr_bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long timestamp = jiffies; + fmode_t fmode; + bool truncate; + int status; + + arg.bitmask = nfs4_bitmask(server, ilabel); + if (ilabel) + arg.bitmask = nfs4_bitmask(server, olabel); + + nfs_fattr_init(fattr); + + /* Servers should only apply open mode checks for file size changes */ + truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + fmode = truncate ? FMODE_WRITE : FMODE_READ; + + if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { + /* Use that stateid */ + } else if (truncate && state != NULL) { + struct nfs_lockowner lockowner = { + .l_owner = current->files, + .l_pid = current->tgid, + }; + if (!nfs4_valid_open_stateid(state)) + return -EBADF; + if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, + &lockowner) == -EIO) + return -EBADF; + } else + nfs4_stateid_copy(&arg.stateid, &zero_stateid); + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (status == 0 && state != NULL) + renew_lease(server, timestamp); + return status; +} + +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { + .state = state, + .inode = inode, + }; + int err; + do { + err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); + trace_nfs4_setattr(inode, err); + switch (err) { + case -NFS4ERR_OPENMODE: + if (!(sattr->ia_valid & ATTR_SIZE)) { + pr_warn_once("NFSv4: server %s is incorrectly " + "applying open mode checks to " + "a SETATTR that is not " + "changing file size.\n", + server->nfs_client->cl_hostname); + } + if (state && !(state->state & FMODE_WRITE)) { + err = -EBADF; + if (sattr->ia_valid & ATTR_OPEN) + err = -EACCES; + goto out; + } + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); +out: + return err; +} + +struct nfs4_closedata { + struct inode *inode; + struct nfs4_state *state; + struct nfs_closeargs arg; + struct nfs_closeres res; + struct nfs_fattr fattr; + unsigned long timestamp; + bool roc; + u32 roc_barrier; +}; + +static void nfs4_free_closedata(void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state_owner *sp = calldata->state->owner; + struct super_block *sb = calldata->state->inode->i_sb; + + if (calldata->roc) + pnfs_roc_release(calldata->state->inode); + nfs4_put_open_state(calldata->state); + nfs_free_seqid(calldata->arg.seqid); + nfs4_put_state_owner(sp); + nfs_sb_deactive(sb); + kfree(calldata); +} + +static void nfs4_close_done(struct rpc_task *task, void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + nfs4_stateid *res_stateid = NULL; + + dprintk("%s: begin!\n", __func__); + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) + return; + trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); + /* hmm. we are done with the inode, and in the process of freeing + * the state_owner. we keep this around to process errors + */ + switch (task->tk_status) { + case 0: + res_stateid = &calldata->res.stateid; + if (calldata->arg.fmode == 0 && calldata->roc) + pnfs_roc_set_barrier(state->inode, + calldata->roc_barrier); + renew_lease(server, calldata->timestamp); + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + if (!nfs4_stateid_match(&calldata->arg.stateid, + &state->open_stateid)) { + rpc_restart_call_prepare(task); + goto out_release; + } + if (calldata->arg.fmode == 0) + break; + default: + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + goto out_release; + } + } + nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); +out_release: + nfs_release_seqid(calldata->arg.seqid); + nfs_refresh_inode(calldata->inode, calldata->res.fattr); + dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); +} + +static void nfs4_close_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state *state = calldata->state; + struct inode *inode = calldata->inode; + bool is_rdonly, is_wronly, is_rdwr; + int call_close = 0; + + dprintk("%s: begin!\n", __func__); + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) + goto out_wait; + + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; + spin_lock(&state->owner->so_lock); + is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); + is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); + is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); + nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); + /* Calculate the change in open mode */ + calldata->arg.fmode = 0; + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) + call_close |= is_rdonly; + else if (is_rdonly) + calldata->arg.fmode |= FMODE_READ; + if (state->n_wronly == 0) + call_close |= is_wronly; + else if (is_wronly) + calldata->arg.fmode |= FMODE_WRITE; + } else if (is_rdwr) + calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; + + if (calldata->arg.fmode == 0) + call_close |= is_rdwr; + + if (!nfs4_valid_open_stateid(state)) + call_close = 0; + spin_unlock(&state->owner->so_lock); + + if (!call_close) { + /* Note: exit _without_ calling nfs4_close_done */ + goto out_no_action; + } + + if (calldata->arg.fmode == 0) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + if (calldata->roc && + pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { + nfs_release_seqid(calldata->arg.seqid); + goto out_wait; + } + } + calldata->arg.share_access = + nfs4_map_atomic_open_share(NFS_SERVER(inode), + calldata->arg.fmode, 0); + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(NFS_SERVER(inode), + &calldata->arg.seq_args, + &calldata->res.seq_res, + task) != 0) + nfs_release_seqid(calldata->arg.seqid); + dprintk("%s: done!\n", __func__); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); +} + +static const struct rpc_call_ops nfs4_close_ops = { + .rpc_call_prepare = nfs4_close_prepare, + .rpc_call_done = nfs4_close_done, + .rpc_release = nfs4_free_closedata, +}; + +static bool nfs4_roc(struct inode *inode) +{ + if (!nfs_have_layout(inode)) + return false; + return pnfs_roc(inode); +} + +/* + * It is possible for data to be read/written from a mem-mapped file + * after the sys_close call (which hits the vfs layer as a flush). + * This means that we can't safely call nfsv4 close on a file until + * the inode is cleared. This in turn means that we are not good + * NFSv4 citizens - we do not indicate to the server to update the file's + * share state even when we are done with one of the three share + * stateid's in the inode. + * + * NOTE: Caller must be holding the sp->so_owner semaphore! + */ +int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + struct nfs4_closedata *calldata; + struct nfs4_state_owner *sp = state->owner; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], + .rpc_cred = state->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_close_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, + &task_setup_data.rpc_client, &msg); + + calldata = kzalloc(sizeof(*calldata), gfp_mask); + if (calldata == NULL) + goto out; + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1); + calldata->inode = state->inode; + calldata->state = state; + calldata->arg.fh = NFS_FH(state->inode); + /* Serialization for the sequence id */ + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; + calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask); + if (IS_ERR(calldata->arg.seqid)) + goto out_free_calldata; + calldata->arg.fmode = 0; + calldata->arg.bitmask = server->cache_consistency_bitmask; + calldata->res.fattr = &calldata->fattr; + calldata->res.seqid = calldata->arg.seqid; + calldata->res.server = server; + calldata->roc = nfs4_roc(state->inode); + nfs_sb_active(calldata->inode->i_sb); + + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = 0; + if (wait) + status = rpc_wait_for_completion_task(task); + rpc_put_task(task); + return status; +out_free_calldata: + kfree(calldata); +out: + nfs4_put_open_state(state); + nfs4_put_state_owner(sp); + return status; +} + +static struct inode * +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, + int open_flags, struct iattr *attr, int *opened) +{ + struct nfs4_state *state; + struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; + + label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); + + /* Protect against concurrent sillydeletes */ + state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened); + + nfs4_label_release_security(label); + + if (IS_ERR(state)) + return ERR_CAST(state); + return state->inode; +} + +static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +{ + if (ctx->state == NULL) + return; + if (is_sync) + nfs4_close_sync(ctx->state, ctx->mode); + else + nfs4_close_state(ctx->state, ctx->mode); +} + +#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) +#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) + +static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) +{ + struct nfs4_server_caps_arg args = { + .fhandle = fhandle, + }; + struct nfs4_server_caps_res res = {}; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (status == 0) { + /* Sanity check the server answers */ + switch (server->nfs_client->cl_minorversion) { + case 0: + res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; + res.attr_bitmask[2] = 0; + break; + case 1: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; + break; + case 2: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; + } + memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| + NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| + NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| + NFS_CAP_CTIME|NFS_CAP_MTIME| + NFS_CAP_SECURITY_LABEL); + if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && + res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) + server->caps |= NFS_CAP_ACLS; + if (res.has_links != 0) + server->caps |= NFS_CAP_HARDLINKS; + if (res.has_symlinks != 0) + server->caps |= NFS_CAP_SYMLINKS; + if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) + server->caps |= NFS_CAP_FILEID; + if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) + server->caps |= NFS_CAP_MODE; + if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) + server->caps |= NFS_CAP_NLINK; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) + server->caps |= NFS_CAP_OWNER; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) + server->caps |= NFS_CAP_OWNER_GROUP; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) + server->caps |= NFS_CAP_ATIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) + server->caps |= NFS_CAP_CTIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) + server->caps |= NFS_CAP_MTIME; +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) + server->caps |= NFS_CAP_SECURITY_LABEL; +#endif + memcpy(server->attr_bitmask_nl, res.attr_bitmask, + sizeof(server->attr_bitmask)); + server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); + server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; + server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + server->cache_consistency_bitmask[2] = 0; + server->acl_bitmask = res.acl_bitmask; + server->fh_expire_type = res.fh_expire_type; + } + + return status; +} + +int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_server_capabilities(server, fhandle), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + u32 bitmask[3]; + struct nfs4_lookup_root_arg args = { + .bitmask = bitmask, + }; + struct nfs4_lookup_res res = { + .server = server, + .fattr = info->fattr, + .fh = fhandle, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP_ROOT], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + bitmask[0] = nfs4_fattr_bitmap[0]; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* + * Process the label in the upcoming getfattr + */ + bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL; + + nfs_fattr_init(info->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_lookup_root(server, fhandle, info); + trace_nfs4_lookup_root(server, fhandle, info->fattr, err); + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + goto out; + default: + err = nfs4_handle_exception(server, err, &exception); + } + } while (exception.retry); +out: + return err; +} + +static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, rpc_authflavor_t flavor) +{ + struct rpc_auth_create_args auth_args = { + .pseudoflavor = flavor, + }; + struct rpc_auth *auth; + int ret; + + auth = rpcauth_create(&auth_args, server->client); + if (IS_ERR(auth)) { + ret = -EACCES; + goto out; + } + ret = nfs4_lookup_root(server, fhandle, info); +out: + return ret; +} + +/* + * Retry pseudoroot lookup with various security flavors. We do this when: + * + * NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC + * NFSv4.1: the server does not support the SECINFO_NO_NAME operation + * + * Returns zero on success, or a negative NFS4ERR value, or a + * negative errno value. + */ +static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + /* Per 3530bis 15.33.5 */ + static const rpc_authflavor_t flav_array[] = { + RPC_AUTH_GSS_KRB5P, + RPC_AUTH_GSS_KRB5I, + RPC_AUTH_GSS_KRB5, + RPC_AUTH_UNIX, /* courtesy */ + RPC_AUTH_NULL, + }; + int status = -EPERM; + size_t i; + + if (server->auth_info.flavor_len > 0) { + /* try each flavor specified by user */ + for (i = 0; i < server->auth_info.flavor_len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + server->auth_info.flavors[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + } else { + /* no flavors specified by user, try default list */ + for (i = 0; i < ARRAY_SIZE(flav_array); i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + flav_array[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + } + + /* + * -EACCESS could mean that the user doesn't have correct permissions + * to access the mount. It could also mean that we tried to mount + * with a gss auth flavor, but rpc.gssd isn't running. Either way, + * existing mount programs don't handle -EACCES very well so it should + * be mapped to -EPERM instead. + */ + if (status == -EACCES) + status = -EPERM; + return status; +} + +static int nfs4_do_find_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fsinfo *info) +{ + int mv = server->nfs_client->cl_minorversion; + return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); +} + +/** + * nfs4_proc_get_rootfh - get file handle for server's pseudoroot + * @server: initialized nfs_server handle + * @fhandle: we fill in the pseudo-fs root file handle + * @info: we fill in an FSINFO struct + * @auth_probe: probe the auth flavours + * + * Returns zero on success, or a negative errno. + */ +int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, + bool auth_probe) +{ + int status = 0; + + if (!auth_probe) + status = nfs4_lookup_root(server, fhandle, info); + + if (auth_probe || status == NFS4ERR_WRONGSEC) + status = nfs4_do_find_root_sec(server, fhandle, info); + + if (status == 0) + status = nfs4_server_capabilities(server, fhandle); + if (status == 0) + status = nfs4_do_fsinfo(server, fhandle, info); + + return nfs4_map_errors(status); +} + +static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, + struct nfs_fsinfo *info) +{ + int error; + struct nfs_fattr *fattr = info->fattr; + struct nfs4_label *label = NULL; + + error = nfs4_server_capabilities(server, mntfh); + if (error < 0) { + dprintk("nfs4_get_root: getcaps error = %d\n", -error); + return error; + } + + label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(label)) + return PTR_ERR(label); + + error = nfs4_proc_getattr(server, mntfh, fattr, label); + if (error < 0) { + dprintk("nfs4_get_root: getattr error = %d\n", -error); + goto err_free_label; + } + + if (fattr->valid & NFS_ATTR_FATTR_FSID && + !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + +err_free_label: + nfs4_label_free(label); + + return error; +} + +/* + * Get locations and (maybe) other attributes of a referral. + * Note that we'll actually follow the referral later when + * we detect fsid mismatch in inode revalidation + */ +static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, struct nfs_fattr *fattr, + struct nfs_fh *fhandle) +{ + int status = -ENOMEM; + struct page *page = NULL; + struct nfs4_fs_locations *locations = NULL; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out; + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (locations == NULL) + goto out; + + status = nfs4_proc_fs_locations(client, dir, name, locations, page); + if (status != 0) + goto out; + + /* + * If the fsid didn't change, this is a migration event, not a + * referral. Cause us to drop into the exception handler, which + * will kick off migration recovery. + */ + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + dprintk("%s: server did not return a different fsid for" + " a referral at %s\n", __func__, name->name); + status = -NFS4ERR_MOVED; + goto out; + } + /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ + nfs_fixup_referral_attributes(&locations->fattr); + + /* replace the lookup nfs_fattr with the locations nfs_fattr */ + memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + memset(fhandle, 0, sizeof(struct nfs_fh)); +out: + if (page) + __free_page(page); + kfree(locations); + return status; +} + +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) +{ + struct nfs4_getattr_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_getattr_res res = { + .fattr = fattr, + .label = label, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + args.bitmask = nfs4_bitmask(server, label); + + nfs_fattr_init(fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_getattr(server, fhandle, fattr, label); + trace_nfs4_getattr(server, fhandle, fattr, err); + err = nfs4_handle_exception(server, err, + &exception); + } while (exception.retry); + return err; +} + +/* + * The file is not closed if it is opened due to the a request to change + * the size of the file. The open call will not be needed once the + * VFS layer lookup-intents are implemented. + * + * Close is called when the inode is destroyed. + * If we haven't opened the file for O_WRONLY, we + * need to in the size_change case to obtain a stateid. + * + * Got race? + * Because OPEN is always done by name in nfsv4, it is + * possible that we opened a different file by the same + * name. We can recognize this race condition, but we + * can't do anything about it besides returning an error. + * + * This will be fixed with VFS changes (lookup-intent). + */ +static int +nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = d_inode(dentry); + struct rpc_cred *cred = NULL; + struct nfs4_state *state = NULL; + struct nfs4_label *label = NULL; + int status; + + if (pnfs_ld_layoutret_on_setattr(inode) && + sattr->ia_valid & ATTR_SIZE && + sattr->ia_size < i_size_read(inode)) + pnfs_commit_and_return_layout(inode); + + nfs_fattr_init(fattr); + + /* Deal with open(O_TRUNC) */ + if (sattr->ia_valid & ATTR_OPEN) + sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME); + + /* Optimization: if the end result is no change, don't RPC */ + if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) + return 0; + + /* Search for an existing open(O_WRITE) file */ + if (sattr->ia_valid & ATTR_FILE) { + struct nfs_open_context *ctx; + + ctx = nfs_file_open_context(sattr->ia_file); + if (ctx) { + cred = ctx->cred; + state = ctx->state; + } + } + + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(label)) + return PTR_ERR(label); + + status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); + if (status == 0) { + nfs_setattr_update_inode(inode, sattr, fattr); + nfs_setsecurity(inode, fattr, label); + } + nfs4_label_free(label); + return status; +} + +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) +{ + struct nfs_server *server = NFS_SERVER(dir); + int status; + struct nfs4_lookup_arg args = { + .bitmask = server->attr_bitmask, + .dir_fh = NFS_FH(dir), + .name = name, + }; + struct nfs4_lookup_res res = { + .server = server, + .fattr = fattr, + .label = label, + .fh = fhandle, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + args.bitmask = nfs4_bitmask(server, label); + + nfs_fattr_init(fattr); + + dprintk("NFS call lookup %s\n", name->name); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) +{ + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_MOUNTPOINT; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, + struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) +{ + struct nfs4_exception exception = { }; + struct rpc_clnt *client = *clnt; + int err; + do { + err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label); + trace_nfs4_lookup(dir, name, err); + switch (err) { + case -NFS4ERR_BADNAME: + err = -ENOENT; + goto out; + case -NFS4ERR_MOVED: + err = nfs4_get_referral(client, dir, name, fattr, fhandle); + goto out; + case -NFS4ERR_WRONGSEC: + err = -EPERM; + if (client != *clnt) + goto out; + client = nfs4_negotiate_security(client, dir, name); + if (IS_ERR(client)) + return PTR_ERR(client); + + exception.retry = 1; + break; + default: + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); + } + } while (exception.retry); + +out: + if (err == 0) + *clnt = client; + else if (client != *clnt) + rpc_shutdown_client(client); + + return err; +} + +static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + int status; + struct rpc_clnt *client = NFS_CLIENT(dir); + + status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label); + if (client != NFS_CLIENT(dir)) { + rpc_shutdown_client(client); + nfs_fixup_secinfo_attributes(fattr); + } + return status; +} + +struct rpc_clnt * +nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct rpc_clnt *client = NFS_CLIENT(dir); + int status; + + status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); + if (status < 0) + return ERR_PTR(status); + return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client; +} + +static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_accessargs args = { + .fh = NFS_FH(inode), + .bitmask = server->cache_consistency_bitmask, + }; + struct nfs4_accessres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = entry->cred, + }; + int mode = entry->mask; + int status = 0; + + /* + * Determine which access bits we want to ask for... + */ + if (mode & MAY_READ) + args.access |= NFS4_ACCESS_READ; + if (S_ISDIR(inode->i_mode)) { + if (mode & MAY_WRITE) + args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE; + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_LOOKUP; + } else { + if (mode & MAY_WRITE) + args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND; + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_EXECUTE; + } + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (!status) { + nfs_access_set_mask(entry, res.access); + nfs_refresh_inode(inode, res.fattr); + } + nfs_free_fattr(res.fattr); + return status; +} + +static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_access(inode, entry); + trace_nfs4_access(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + return err; +} + +/* + * TODO: For the time being, we don't try to get any attributes + * along with any of the zero-copy operations READ, READDIR, + * READLINK, WRITE. + * + * In the case of the first three, we want to put the GETATTR + * after the read-type operation -- this is because it is hard + * to predict the length of a GETATTR response in v4, and thus + * align the READ data correctly. This means that the GETATTR + * may end up partially falling into the page cache, and we should + * shift it into the 'tail' of the xdr_buf before processing. + * To do this efficiently, we need to know the total length + * of data received, which doesn't seem to be available outside + * of the RPC layer. + * + * In the case of WRITE, we also want to put the GETATTR after + * the operation -- in this case because we want to make sure + * we get the post-operation mtime and size. + * + * Both of these changes to the XDR layer would in fact be quite + * minor, but I decided to leave them for a subsequent patch. + */ +static int _nfs4_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs4_readlink args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page, + }; + struct nfs4_readlink_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_readlink(inode, page, pgbase, pglen); + trace_nfs4_readlink(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + return err; +} + +/* + * This is just for mknod. open(O_CREAT) will always do ->open_context(). + */ +static int +nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags) +{ + struct nfs4_label l, *ilabel = NULL; + struct nfs_open_context *ctx; + struct nfs4_state *state; + int opened = 0; + int status = 0; + + ctx = alloc_nfs_open_context(dentry, FMODE_READ); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); + + sattr->ia_mode &= ~current_umask(); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened); + if (IS_ERR(state)) { + status = PTR_ERR(state); + goto out; + } +out: + nfs4_label_release_security(ilabel); + put_nfs_open_context(ctx); + return status; +} + +static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_removeargs args = { + .fh = NFS_FH(dir), + .name = *name, + }; + struct nfs_removeres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + if (status == 0) + update_changeattr(dir, &res.cinfo); + return status; +} + +static int nfs4_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_remove(dir, name); + trace_nfs4_remove(dir, name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_removeargs *args = msg->rpc_argp; + struct nfs_removeres *res = msg->rpc_resp; + + res->server = server; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; + nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); + + nfs_fattr_init(res->dir_attr); +} + +static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + nfs4_setup_sequence(NFS_SERVER(data->dir), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + struct nfs_unlinkdata *data = task->tk_calldata; + struct nfs_removeres *res = &data->res; + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL, + &data->timeout) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + return 1; +} + +static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_renameargs *arg = msg->rpc_argp; + struct nfs_renameres *res = msg->rpc_resp; + + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; + res->server = server; + nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); +} + +static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ + nfs4_setup_sequence(NFS_SERVER(data->old_dir), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renamedata *data = task->tk_calldata; + struct nfs_renameres *res = &data->res; + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) + return 0; + + update_changeattr(old_dir, &res->old_cinfo); + update_changeattr(new_dir, &res->new_cinfo); + return 1; +} + +static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_link_arg arg = { + .fh = NFS_FH(inode), + .dir_fh = NFS_FH(dir), + .name = name, + .bitmask = server->attr_bitmask, + }; + struct nfs4_link_res res = { + .server = server, + .label = NULL, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out; + + res.label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(res.label)) { + status = PTR_ERR(res.label); + goto out; + } + arg.bitmask = nfs4_bitmask(server, res.label); + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (!status) { + update_changeattr(dir, &res.cinfo); + status = nfs_post_op_update_inode(inode, res.fattr); + if (!status) + nfs_setsecurity(inode, res.fattr, res.label); + } + + + nfs4_label_free(res.label); + +out: + nfs_free_fattr(res.fattr); + return status; +} + +static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_proc_link(inode, dir, name), + &exception); + } while (exception.retry); + return err; +} + +struct nfs4_createdata { + struct rpc_message msg; + struct nfs4_create_arg arg; + struct nfs4_create_res res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs4_label *label; +}; + +static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, + struct qstr *name, struct iattr *sattr, u32 ftype) +{ + struct nfs4_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + struct nfs_server *server = NFS_SERVER(dir); + + data->label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(data->label)) + goto out_free; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->arg.dir_fh = NFS_FH(dir); + data->arg.server = server; + data->arg.name = name; + data->arg.attrs = sattr; + data->arg.ftype = ftype; + data->arg.bitmask = nfs4_bitmask(server, data->label); + data->res.server = server; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.label = data->label; + nfs_fattr_init(data->res.fattr); + } + return data; +out_free: + kfree(data); + return NULL; +} + +static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) +{ + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); + if (status == 0) { + update_changeattr(dir, &data->res.dir_cinfo); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); + } + return status; +} + +static void nfs4_free_createdata(struct nfs4_createdata *data) +{ + nfs4_label_free(data->label); + kfree(data); +} + +static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr, + struct nfs4_label *label) +{ + struct nfs4_createdata *data; + int status = -ENAMETOOLONG; + + if (len > NFS4_MAXPATHLEN) + goto out; + + status = -ENOMEM; + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; + data->arg.u.symlink.pages = &page; + data->arg.u.symlink.len = len; + data->arg.label = label; + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr) +{ + struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; + int err; + + label = nfs4_label_init_security(dir, dentry, sattr, &l); + + do { + err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label); + trace_nfs4_symlink(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + + nfs4_label_release_security(label); + return err; +} + +static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *label) +{ + struct nfs4_createdata *data; + int status = -ENOMEM; + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); + if (data == NULL) + goto out; + + data->arg.label = label; + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) +{ + struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; + int err; + + label = nfs4_label_init_security(dir, dentry, sattr, &l); + + sattr->ia_mode &= ~current_umask(); + do { + err = _nfs4_proc_mkdir(dir, dentry, sattr, label); + trace_nfs4_mkdir(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + nfs4_label_release_security(label); + + return err; +} + +static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = d_inode(dentry); + struct nfs4_readdir_arg args = { + .fh = NFS_FH(dir), + .pages = pages, + .pgbase = 0, + .count = count, + .bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask, + .plus = plus, + }; + struct nfs4_readdir_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, + dentry, + (unsigned long long)cookie); + nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + res.pgbase = args.pgbase; + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + if (status >= 0) { + memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } + + nfs_invalidate_atime(dir); + + dprintk("%s: returns %d\n", __func__, status); + return status; +} + +static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_readdir(dentry, cred, cookie, + pages, count, plus); + trace_nfs4_readdir(d_inode(dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *label, dev_t rdev) +{ + struct nfs4_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); + if (data == NULL) + goto out; + + if (S_ISFIFO(mode)) + data->arg.ftype = NF4FIFO; + else if (S_ISBLK(mode)) { + data->arg.ftype = NF4BLK; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); + } + else if (S_ISCHR(mode)) { + data->arg.ftype = NF4CHR; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); + } else if (!S_ISSOCK(mode)) { + status = -EINVAL; + goto out_free; + } + + data->arg.label = label; + status = nfs4_do_create(dir, dentry, data); +out_free: + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, dev_t rdev) +{ + struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; + int err; + + label = nfs4_label_init_security(dir, dentry, sattr, &l); + + sattr->ia_mode &= ~current_umask(); + do { + err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev); + trace_nfs4_mknod(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + + nfs4_label_release_security(label); + + return err; +} + +static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *fsstat) +{ + struct nfs4_statfs_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_statfs_res res = { + .fsstat = fsstat, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(fsstat->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_proc_statfs(server, fhandle, fsstat), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *fsinfo) +{ + struct nfs4_fsinfo_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_fsinfo_res res = { + .fsinfo = fsinfo, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) +{ + struct nfs4_exception exception = { }; + unsigned long now = jiffies; + int err; + + do { + err = _nfs4_do_fsinfo(server, fhandle, fsinfo); + trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); + if (err == 0) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo->lease_time * HZ; + clp->cl_last_renewal = now; + spin_unlock(&clp->cl_lock); + break; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) +{ + int error; + + nfs_fattr_init(fsinfo->fattr); + error = nfs4_do_fsinfo(server, fhandle, fsinfo); + if (error == 0) { + /* block layout checks this! */ + server->pnfs_blksize = fsinfo->blksize; + set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + } + + return error; +} + +static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *pathconf) +{ + struct nfs4_pathconf_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_pathconf_res res = { + .pathconf = pathconf, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + /* None of the pathconf attributes are mandatory to implement */ + if ((args.bitmask[0] & nfs4_pathconf_bitmap[0]) == 0) { + memset(pathconf, 0, sizeof(*pathconf)); + return 0; + } + + nfs_fattr_init(pathconf->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *pathconf) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_pathconf(server, fhandle, pathconf), + &exception); + } while (exception.retry); + return err; +} + +int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + const struct nfs_lockowner *lockowner = NULL; + + if (l_ctx != NULL) + lockowner = &l_ctx->lockowner; + return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); +} +EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); + +static bool nfs4_stateid_is_current(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + nfs4_stateid current_stateid; + + /* If the current stateid represents a lost lock, then exit */ + if (nfs4_set_rw_stateid(¤t_stateid, ctx, l_ctx, fmode) == -EIO) + return true; + return nfs4_stateid_match(stateid, ¤t_stateid); +} + +static bool nfs4_error_stateid_expired(int err) +{ + switch (err) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_OPENMODE: + case -NFS4ERR_EXPIRED: + return true; + } + return false; +} + +void __nfs4_read_done_cb(struct nfs_pgio_header *hdr) +{ + nfs_invalidate_atime(hdr->inode); +} + +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ + struct nfs_server *server = NFS_SERVER(hdr->inode); + + trace_nfs4_read(hdr, task->tk_status); + if (nfs4_async_handle_error(task, server, + hdr->args.context->state, + NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + __nfs4_read_done_cb(hdr); + if (task->tk_status > 0) + renew_lease(server, hdr->timestamp); + return 0; +} + +static bool nfs4_read_stateid_changed(struct rpc_task *task, + struct nfs_pgio_args *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_READ)) + return false; + rpc_restart_call_prepare(task); + return true; +} + +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &hdr->res.seq_res)) + return -EAGAIN; + if (nfs4_read_stateid_changed(task, &hdr->args)) + return -EAGAIN; + return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : + nfs4_read_done_cb(task, hdr); +} + +static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) +{ + hdr->timestamp = jiffies; + hdr->pgio_done_cb = nfs4_read_done_cb; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); +} + +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (nfs4_setup_sequence(NFS_SERVER(hdr->inode), + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return 0; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, + hdr->rw_ops->rw_mode) == -EIO) + return -EIO; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) + return -EIO; + return 0; +} + +static int nfs4_write_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct inode *inode = hdr->inode; + + trace_nfs4_write(hdr, task->tk_status); + if (nfs4_async_handle_error(task, NFS_SERVER(inode), + hdr->args.context->state, + NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } + if (task->tk_status >= 0) { + renew_lease(NFS_SERVER(inode), hdr->timestamp); + nfs_writeback_update_inode(hdr); + } + return 0; +} + +static bool nfs4_write_stateid_changed(struct rpc_task *task, + struct nfs_pgio_args *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_WRITE)) + return false; + rpc_restart_call_prepare(task); + return true; +} + +static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ + if (!nfs4_sequence_done(task, &hdr->res.seq_res)) + return -EAGAIN; + if (nfs4_write_stateid_changed(task, &hdr->args)) + return -EAGAIN; + return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : + nfs4_write_done_cb(task, hdr); +} + +static +bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) +{ + /* Don't request attributes for pNFS or O_DIRECT writes */ + if (hdr->ds_clp != NULL || hdr->dreq != NULL) + return false; + /* Otherwise, request attributes if and only if we don't hold + * a delegation + */ + return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; +} + +static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) +{ + struct nfs_server *server = NFS_SERVER(hdr->inode); + + if (!nfs4_write_need_cache_consistency_data(hdr)) { + hdr->args.bitmask = NULL; + hdr->res.fattr = NULL; + } else + hdr->args.bitmask = server->cache_consistency_bitmask; + + if (!hdr->pgio_done_cb) + hdr->pgio_done_cb = nfs4_write_done_cb; + hdr->res.server = server; + hdr->timestamp = jiffies; + + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1); +} + +static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ + nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) +{ + struct inode *inode = data->inode; + + trace_nfs4_commit(data, task->tk_status); + if (nfs4_async_handle_error(task, NFS_SERVER(inode), + NULL, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } + return 0; +} + +static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->commit_done_cb(task, data); +} + +static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +{ + struct nfs_server *server = NFS_SERVER(data->inode); + + if (data->commit_done_cb == NULL) + data->commit_done_cb = nfs4_commit_done_cb; + data->res.server = server; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); +} + +struct nfs4_renewdata { + struct nfs_client *client; + unsigned long timestamp; +}; + +/* + * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special + * standalone procedure for queueing an asynchronous RENEW. + */ +static void nfs4_renew_release(void *calldata) +{ + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); + kfree(data); +} + +static void nfs4_renew_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; + unsigned long timestamp = data->timestamp; + + trace_nfs4_renew_async(clp, task->tk_status); + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + break; + default: + /* Unless we're shutting down, schedule state recovery! */ + if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) + return; + if (task->tk_status != NFS4ERR_CB_PATH_DOWN) { + nfs4_schedule_lease_recovery(clp); + return; + } + nfs4_schedule_path_down_recovery(clp); + } + do_renew_lease(clp, timestamp); +} + +static const struct rpc_call_ops nfs4_renew_ops = { + .rpc_call_done = nfs4_renew_done, + .rpc_release = nfs4_renew_release, +}; + +static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], + .rpc_argp = clp, + .rpc_cred = cred, + }; + struct nfs4_renewdata *data; + + if (renew_flags == 0) + return 0; + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; + data = kmalloc(sizeof(*data), GFP_NOFS); + if (data == NULL) + return -ENOMEM; + data->client = clp; + data->timestamp = jiffies; + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, + &nfs4_renew_ops, data); +} + +static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], + .rpc_argp = clp, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (status < 0) + return status; + do_renew_lease(clp, now); + return 0; +} + +static inline int nfs4_server_supports_acls(struct nfs_server *server) +{ + return server->caps & NFS_CAP_ACLS; +} + +/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that + * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_SIZE) bytes on + * the stack. + */ +#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) + +static int buf_to_pages_noslab(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + struct page *newpage, **spages; + int rc = 0; + size_t len; + spages = pages; + + do { + len = min_t(size_t, PAGE_SIZE, buflen); + newpage = alloc_page(GFP_KERNEL); + + if (newpage == NULL) + goto unwind; + memcpy(page_address(newpage), buf, len); + buf += len; + buflen -= len; + *pages++ = newpage; + rc++; + } while (buflen != 0); + + return rc; + +unwind: + for(; rc > 0; rc--) + __free_page(spages[rc-1]); + return -ENOMEM; +} + +struct nfs4_cached_acl { + int cached; + size_t len; + char data[0]; +}; + +static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + kfree(nfsi->nfs4_acl); + nfsi->nfs4_acl = acl; + spin_unlock(&inode->i_lock); +} + +static void nfs4_zap_acl_attr(struct inode *inode) +{ + nfs4_set_cached_acl(inode, NULL); +} + +static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs4_cached_acl *acl; + int ret = -ENOENT; + + spin_lock(&inode->i_lock); + acl = nfsi->nfs4_acl; + if (acl == NULL) + goto out; + if (buf == NULL) /* user is just asking for length */ + goto out_len; + if (acl->cached == 0) + goto out; + ret = -ERANGE; /* see getxattr(2) man page */ + if (acl->len > buflen) + goto out; + memcpy(buf, acl->data, acl->len); +out_len: + ret = acl->len; +out: + spin_unlock(&inode->i_lock); + return ret; +} + +static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len) +{ + struct nfs4_cached_acl *acl; + size_t buflen = sizeof(*acl) + acl_len; + + if (buflen <= PAGE_SIZE) { + acl = kmalloc(buflen, GFP_KERNEL); + if (acl == NULL) + goto out; + acl->cached = 1; + _copy_from_pages(acl->data, pages, pgbase, acl_len); + } else { + acl = kmalloc(sizeof(*acl), GFP_KERNEL); + if (acl == NULL) + goto out; + acl->cached = 0; + } + acl->len = acl_len; +out: + nfs4_set_cached_acl(inode, acl); +} + +/* + * The getxattr API returns the required buffer length when called with a + * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating + * the required buf. On a NULL buf, we send a page of data to the server + * guessing that the ACL request can be serviced by a page. If so, we cache + * up to the page of ACL data, and the 2nd call to getxattr is serviced by + * the cache. If not so, we throw away the page, and cache the required + * length. The next getxattr call will then produce another round trip to + * the server, this time with the input buf of the required size. + */ +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; + struct nfs_getaclargs args = { + .fh = NFS_FH(inode), + .acl_pages = pages, + .acl_len = buflen, + }; + struct nfs_getaclres res = { + .acl_len = buflen, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], + .rpc_argp = &args, + .rpc_resp = &res, + }; + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + int ret = -ENOMEM, i; + + /* As long as we're doing a round trip to the server anyway, + * let's be prepared for a page of acl data. */ + if (npages == 0) + npages = 1; + if (npages > ARRAY_SIZE(pages)) + return -ERANGE; + + for (i = 0; i < npages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; + } + + /* for decoding across pages */ + res.acl_scratch = alloc_page(GFP_KERNEL); + if (!res.acl_scratch) + goto out_free; + + args.acl_len = npages * PAGE_SIZE; + args.acl_pgbase = 0; + + dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", + __func__, buf, buflen, npages, args.acl_len); + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), + &msg, &args.seq_args, &res.seq_res, 0); + if (ret) + goto out_free; + + /* Handle the case where the passed-in buffer is too short */ + if (res.acl_flags & NFS4_ACL_TRUNC) { + /* Did the user only issue a request for the acl length? */ + if (buf == NULL) + goto out_ok; + ret = -ERANGE; + goto out_free; + } + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + if (buf) { + if (res.acl_len > buflen) { + ret = -ERANGE; + goto out_free; + } + _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len); + } +out_ok: + ret = res.acl_len; +out_free: + for (i = 0; i < npages; i++) + if (pages[i]) + __free_page(pages[i]); + if (res.acl_scratch) + __free_page(res.acl_scratch); + return ret; +} + +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + ssize_t ret; + do { + ret = __nfs4_get_acl_uncached(inode, buf, buflen); + trace_nfs4_get_acl(inode, ret); + if (ret >= 0) + break; + ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); + } while (exception.retry); + return ret; +} + +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (!nfs4_server_supports_acls(server)) + return -EOPNOTSUPP; + ret = nfs_revalidate_inode(server, inode); + if (ret < 0) + return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + ret = nfs4_read_cached_acl(inode, buf, buflen); + if (ret != -ENOENT) + /* -ENOENT is returned if there is no ACL or if there is an ACL + * but no cached acl data, just the acl length */ + return ret; + return nfs4_get_acl_uncached(inode, buf, buflen); +} + +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFS4ACL_MAXPAGES]; + struct nfs_setaclargs arg = { + .fh = NFS_FH(inode), + .acl_pages = pages, + .acl_len = buflen, + }; + struct nfs_setaclres res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + int ret, i; + + if (!nfs4_server_supports_acls(server)) + return -EOPNOTSUPP; + if (npages > ARRAY_SIZE(pages)) + return -ERANGE; + i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + if (i < 0) + return i; + nfs4_inode_return_delegation(inode); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + + /* + * Free each page after tx, so the only ref left is + * held by the network stack + */ + for (; i > 0; i--) + put_page(pages[i-1]); + + /* + * Acl update can result in inode attribute update. + * so mark the attribute cache invalid. + */ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); + return ret; +} + +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = __nfs4_proc_set_acl(inode, buf, buflen); + trace_nfs4_set_acl(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + return err; +} + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static int _nfs4_get_security_label(struct inode *inode, void *buf, + size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr fattr; + struct nfs4_label label = {0, 0, buflen, buf}; + + u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs4_getattr_arg arg = { + .fh = NFS_FH(inode), + .bitmask = bitmask, + }; + struct nfs4_getattr_res res = { + .fattr = &fattr, + .label = &label, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int ret; + + nfs_fattr_init(&fattr); + + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + if (ret) + return ret; + if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) + return -ENOENT; + if (buflen < label.len) + return -ERANGE; + return 0; +} + +static int nfs4_get_security_label(struct inode *inode, void *buf, + size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + + if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) + return -EOPNOTSUPP; + + do { + err = _nfs4_get_security_label(inode, buf, buflen); + trace_nfs4_get_security_label(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_do_set_security_label(struct inode *inode, + struct nfs4_label *ilabel, + struct nfs_fattr *fattr, + struct nfs4_label *olabel) +{ + + struct iattr sattr = {0}; + struct nfs_server *server = NFS_SERVER(inode); + const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = &sattr, + .server = server, + .bitmask = bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + + nfs4_stateid_copy(&arg.stateid, &zero_stateid); + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (status) + dprintk("%s failed: %d\n", __func__, status); + + return status; +} + +static int nfs4_do_set_security_label(struct inode *inode, + struct nfs4_label *ilabel, + struct nfs_fattr *fattr, + struct nfs4_label *olabel) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs4_do_set_security_label(inode, ilabel, + fattr, olabel); + trace_nfs4_set_security_label(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + return err; +} + +static int +nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +{ + struct nfs4_label ilabel, *olabel = NULL; + struct nfs_fattr fattr; + struct rpc_cred *cred; + struct inode *inode = d_inode(dentry); + int status; + + if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) + return -EOPNOTSUPP; + + nfs_fattr_init(&fattr); + + ilabel.pi = 0; + ilabel.lfs = 0; + ilabel.label = (char *)buf; + ilabel.len = buflen; + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + + olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(olabel)) { + status = -PTR_ERR(olabel); + goto out; + } + + status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel); + if (status == 0) + nfs_setsecurity(inode, &fattr, olabel); + + nfs4_label_free(olabel); +out: + put_rpccred(cred); + return status; +} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ + + +static int +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, + struct nfs4_state *state, long *timeout) +{ + struct nfs_client *clp = server->nfs_client; + + if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto recovery_failed; + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto recovery_failed; + } + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; + case -NFS4ERR_MOVED: + if (nfs4_schedule_migration_recovery(server) < 0) + goto recovery_failed; + goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; +#if defined(CONFIG_NFS_V4_1) + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session\n", __func__, + task->tk_status); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + goto wait_on_recovery; +#endif /* CONFIG_NFS_V4_1 */ + case -NFS4ERR_DELAY: + nfs_inc_server_stats(server, NFSIOS_DELAY); + rpc_delay(task, nfs4_update_delay(timeout)); + goto restart_call; + case -NFS4ERR_GRACE: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_OLD_STATEID: + goto restart_call; + } + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; +recovery_failed: + task->tk_status = -EIO; + return 0; +wait_on_recovery: + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + goto recovery_failed; +restart_call: + task->tk_status = 0; + return -EAGAIN; +} + +static void nfs4_init_boot_verifier(const struct nfs_client *clp, + nfs4_verifier *bootverf) +{ + __be32 verf[2]; + + if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { + /* An impossible timestamp guarantees this value + * will never match a generated boot time. */ + verf[0] = 0; + verf[1] = cpu_to_be32(NSEC_PER_SEC + 1); + } else { + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + verf[0] = cpu_to_be32(nn->boot_time.tv_sec); + verf[1] = cpu_to_be32(nn->boot_time.tv_nsec); + } + memcpy(bootverf->data, verf, sizeof(bootverf->data)); +} + +static unsigned int +nfs4_init_nonuniform_client_string(struct nfs_client *clp, + char *buf, size_t len) +{ + unsigned int result; + + if (clp->cl_owner_id != NULL) + return strlcpy(buf, clp->cl_owner_id, len); + + rcu_read_lock(); + result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_PROTO)); + rcu_read_unlock(); + clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); + return result; +} + +static unsigned int +nfs4_init_uniform_client_string(struct nfs_client *clp, + char *buf, size_t len) +{ + const char *nodename = clp->cl_rpcclient->cl_nodename; + unsigned int result; + + if (clp->cl_owner_id != NULL) + return strlcpy(buf, clp->cl_owner_id, len); + + if (nfs4_client_id_uniquifier[0] != '\0') + result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, + clp->cl_minorversion, + nfs4_client_id_uniquifier, + nodename); + else + result = scnprintf(buf, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + nodename); + clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); + return result; +} + +/* + * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback + * services. Advertise one based on the address family of the + * clientaddr. + */ +static unsigned int +nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len) +{ + if (strchr(clp->cl_ipaddr, ':') != NULL) + return scnprintf(buf, len, "tcp6"); + else + return scnprintf(buf, len, "tcp"); +} + +static void nfs4_setclientid_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_setclientid *sc = calldata; + + if (task->tk_status == 0) + sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred); +} + +static const struct rpc_call_ops nfs4_setclientid_ops = { + .rpc_call_done = nfs4_setclientid_done, +}; + +/** + * nfs4_proc_setclientid - Negotiate client ID + * @clp: state data structure + * @program: RPC program for NFSv4 callback service + * @port: IP port number for NFS4 callback service + * @cred: RPC credential to use for this call + * @res: where to place the result + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + */ +int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +{ + nfs4_verifier sc_verifier; + struct nfs4_setclientid setclientid = { + .sc_verifier = &sc_verifier, + .sc_prog = program, + .sc_cb_ident = clp->cl_cb_ident, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], + .rpc_argp = &setclientid, + .rpc_resp = res, + .rpc_cred = cred, + }; + struct rpc_task *task; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_setclientid_ops, + .callback_data = &setclientid, + .flags = RPC_TASK_TIMEOUT, + }; + int status; + + /* nfs_client_id4 */ + nfs4_init_boot_verifier(clp, &sc_verifier); + if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) + setclientid.sc_name_len = + nfs4_init_uniform_client_string(clp, + setclientid.sc_name, + sizeof(setclientid.sc_name)); + else + setclientid.sc_name_len = + nfs4_init_nonuniform_client_string(clp, + setclientid.sc_name, + sizeof(setclientid.sc_name)); + /* cb_client4 */ + setclientid.sc_netid_len = + nfs4_init_callback_netid(clp, + setclientid.sc_netid, + sizeof(setclientid.sc_netid)); + setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, + sizeof(setclientid.sc_uaddr), "%s.%u.%u", + clp->cl_ipaddr, port >> 8, port & 255); + + dprintk("NFS call setclientid auth=%s, '%.*s'\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + setclientid.sc_name_len, setclientid.sc_name); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out; + } + status = task->tk_status; + if (setclientid.sc_cred) { + clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); + put_rpccred(setclientid.sc_cred); + } + rpc_put_task(task); +out: + trace_nfs4_setclientid(clp, status); + dprintk("NFS reply setclientid: %d\n", status); + return status; +} + +/** + * nfs4_proc_setclientid_confirm - Confirm client ID + * @clp: state data structure + * @res: result of a previous SETCLIENTID + * @cred: RPC credential to use for this call + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + */ +int nfs4_proc_setclientid_confirm(struct nfs_client *clp, + struct nfs4_setclientid_res *arg, + struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], + .rpc_argp = arg, + .rpc_cred = cred, + }; + int status; + + dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + clp->cl_clientid); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_setclientid_confirm(clp, status); + dprintk("NFS reply setclientid_confirm: %d\n", status); + return status; +} + +struct nfs4_delegreturndata { + struct nfs4_delegreturnargs args; + struct nfs4_delegreturnres res; + struct nfs_fh fh; + nfs4_stateid stateid; + unsigned long timestamp; + struct nfs_fattr fattr; + int rpc_status; + struct inode *inode; + bool roc; + u32 roc_barrier; +}; + +static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegreturndata *data = calldata; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); + switch (task->tk_status) { + case 0: + renew_lease(data->res.server, data->timestamp); + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + task->tk_status = 0; + if (data->roc) + pnfs_roc_set_barrier(data->inode, data->roc_barrier); + break; + default: + if (nfs4_async_handle_error(task, data->res.server, + NULL, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + data->rpc_status = task->tk_status; +} + +static void nfs4_delegreturn_release(void *calldata) +{ + struct nfs4_delegreturndata *data = calldata; + struct inode *inode = data->inode; + + if (inode) { + if (data->roc) + pnfs_roc_release(inode); + nfs_iput_and_deactive(inode); + } + kfree(calldata); +} + +static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_delegreturndata *d_data; + + d_data = (struct nfs4_delegreturndata *)data; + + if (d_data->roc && + pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task)) + return; + + nfs4_setup_sequence(d_data->res.server, + &d_data->args.seq_args, + &d_data->res.seq_res, + task); +} + +static const struct rpc_call_ops nfs4_delegreturn_ops = { + .rpc_call_prepare = nfs4_delegreturn_prepare, + .rpc_call_done = nfs4_delegreturn_done, + .rpc_release = nfs4_delegreturn_release, +}; + +static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) +{ + struct nfs4_delegreturndata *data; + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_delegreturn_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + data = kzalloc(sizeof(*data), GFP_NOFS); + if (data == NULL) + return -ENOMEM; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + data->args.fhandle = &data->fh; + data->args.stateid = &data->stateid; + data->args.bitmask = server->cache_consistency_bitmask; + nfs_copy_fh(&data->fh, NFS_FH(inode)); + nfs4_stateid_copy(&data->stateid, stateid); + data->res.fattr = &data->fattr; + data->res.server = server; + nfs_fattr_init(data->res.fattr); + data->timestamp = jiffies; + data->rpc_status = 0; + data->inode = nfs_igrab_and_active(inode); + if (data->inode) + data->roc = nfs4_roc(inode); + + task_setup_data.callback_data = data; + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (!issync) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = data->rpc_status; + if (status == 0) + nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + else + nfs_refresh_inode(inode, &data->fattr); +out: + rpc_put_task(task); + return status; +} + +int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); + trace_nfs4_delegreturn(inode, err); + switch (err) { + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + case 0: + return 0; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +#define NFS4_LOCK_MINTIMEOUT (1 * HZ) +#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) + +/* + * sleep, with exponential backoff, and retry the LOCK operation. + */ +static unsigned long +nfs4_set_lock_task_retry(unsigned long timeout) +{ + freezable_schedule_timeout_killable_unsafe(timeout); + timeout <<= 1; + if (timeout > NFS4_LOCK_MAXTIMEOUT) + return NFS4_LOCK_MAXTIMEOUT; + return timeout; +} + +static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct inode *inode = state->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_lockt_args arg = { + .fh = NFS_FH(inode), + .fl = request, + }; + struct nfs_lockt_res res = { + .denied = request, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = state->owner->so_cred, + }; + struct nfs4_lock_state *lsp; + int status; + + arg.lock_owner.clientid = clp->cl_clientid; + status = nfs4_set_lock_state(state, request); + if (status != 0) + goto out; + lsp = request->fl_u.nfs4_fl.owner; + arg.lock_owner.id = lsp->ls_seqid.owner_id; + arg.lock_owner.s_dev = server->s_dev; + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + switch (status) { + case 0: + request->fl_type = F_UNLCK; + break; + case -NFS4ERR_DENIED: + status = 0; + } + request->fl_ops->fl_release_private(request); + request->fl_ops = NULL; +out: + return status; +} + +static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs4_proc_getlk(state, cmd, request); + trace_nfs4_get_lock(request, state, cmd, err); + err = nfs4_handle_exception(NFS_SERVER(state->inode), err, + &exception); + } while (exception.retry); + return err; +} + +static int do_vfs_lock(struct file *file, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(file, fl); + break; + default: + BUG(); + } + return res; +} + +struct nfs4_unlockdata { + struct nfs_locku_args arg; + struct nfs_locku_res res; + struct nfs4_lock_state *lsp; + struct nfs_open_context *ctx; + struct file_lock fl; + const struct nfs_server *server; + unsigned long timestamp; +}; + +static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *p; + struct inode *inode = lsp->ls_state->inode; + + p = kzalloc(sizeof(*p), GFP_NOFS); + if (p == NULL) + return NULL; + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; + p->arg.seqid = seqid; + p->res.seqid = seqid; + p->lsp = lsp; + atomic_inc(&lsp->ls_count); + /* Ensure we don't close file until we're done freeing locks! */ + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + p->server = NFS_SERVER(inode); + return p; +} + +static void nfs4_locku_release_calldata(void *data) +{ + struct nfs4_unlockdata *calldata = data; + nfs_free_seqid(calldata->arg.seqid); + nfs4_put_lock_state(calldata->lsp); + put_nfs_open_context(calldata->ctx); + kfree(calldata); +} + +static void nfs4_locku_done(struct rpc_task *task, void *data) +{ + struct nfs4_unlockdata *calldata = data; + + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) + return; + switch (task->tk_status) { + case 0: + renew_lease(calldata->server, calldata->timestamp); + do_vfs_lock(calldata->fl.fl_file, &calldata->fl); + if (nfs4_update_lock_stateid(calldata->lsp, + &calldata->res.stateid)) + break; + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + if (!nfs4_stateid_match(&calldata->arg.stateid, + &calldata->lsp->ls_stateid)) + rpc_restart_call_prepare(task); + break; + default: + if (nfs4_async_handle_error(task, calldata->server, + NULL, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +} + +static void nfs4_locku_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_unlockdata *calldata = data; + + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) + goto out_wait; + nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); + if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { + /* Note: exit _without_ running nfs4_locku_done */ + goto out_no_action; + } + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(calldata->server, + &calldata->arg.seq_args, + &calldata->res.seq_res, + task) != 0) + nfs_release_seqid(calldata->arg.seqid); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); +} + +static const struct rpc_call_ops nfs4_locku_ops = { + .rpc_call_prepare = nfs4_locku_prepare, + .rpc_call_done = nfs4_locku_done, + .rpc_release = nfs4_locku_release_calldata, +}; + +static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(lsp->ls_state->inode), + .rpc_message = &msg, + .callback_ops = &nfs4_locku_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, + NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); + + /* Ensure this is an unlock - when canceling a lock, the + * canceled lock is passed in, and it won't be an unlock. + */ + fl->fl_type = F_UNLCK; + + data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); + if (data == NULL) { + nfs_free_seqid(seqid); + return ERR_PTR(-ENOMEM); + } + + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + task_setup_data.callback_data = data; + return rpc_run_task(&task_setup_data); +} + +static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct inode *inode = state->inode; + struct nfs4_state_owner *sp = state->owner; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_seqid *seqid; + struct nfs4_lock_state *lsp; + struct rpc_task *task; + struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + int status = 0; + unsigned char fl_flags = request->fl_flags; + + status = nfs4_set_lock_state(state, request); + /* Unlock _before_ we do the RPC call */ + request->fl_flags |= FL_EXISTS; + /* Exclude nfs_delegation_claim_locks() */ + mutex_lock(&sp->so_delegreturn_mutex); + /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ + down_read(&nfsi->rwsem); + if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); + goto out; + } + up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); + if (status != 0) + goto out; + /* Is this a delegated lock? */ + lsp = request->fl_u.nfs4_fl.owner; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) + goto out; + alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid; + seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); + status = -ENOMEM; + if (IS_ERR(seqid)) + goto out; + task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); + status = PTR_ERR(task); + if (IS_ERR(task)) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + rpc_put_task(task); +out: + request->fl_flags = fl_flags; + trace_nfs4_unlock(request, state, F_SETLK, status); + return status; +} + +struct nfs4_lockdata { + struct nfs_lock_args arg; + struct nfs_lock_res res; + struct nfs4_lock_state *lsp; + struct nfs_open_context *ctx; + struct file_lock fl; + unsigned long timestamp; + int rpc_status; + int cancelled; + struct nfs_server *server; +}; + +static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, + struct nfs_open_context *ctx, struct nfs4_lock_state *lsp, + gfp_t gfp_mask) +{ + struct nfs4_lockdata *p; + struct inode *inode = lsp->ls_state->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + + p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + return NULL; + + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; + p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); + if (IS_ERR(p->arg.open_seqid)) + goto out_free; + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; + p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask); + if (IS_ERR(p->arg.lock_seqid)) + goto out_free_seqid; + p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; + p->arg.lock_owner.id = lsp->ls_seqid.owner_id; + p->arg.lock_owner.s_dev = server->s_dev; + p->res.lock_seqid = p->arg.lock_seqid; + p->lsp = lsp; + p->server = server; + atomic_inc(&lsp->ls_count); + p->ctx = get_nfs_open_context(ctx); + get_file(fl->fl_file); + memcpy(&p->fl, fl, sizeof(p->fl)); + return p; +out_free_seqid: + nfs_free_seqid(p->arg.open_seqid); +out_free: + kfree(p); + return NULL; +} + +static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + struct nfs4_state *state = data->lsp->ls_state; + + dprintk("%s: begin!\n", __func__); + if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) + goto out_wait; + /* Do we need to do an open_to_lock_owner? */ + if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) { + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { + goto out_release_lock_seqid; + } + nfs4_stateid_copy(&data->arg.open_stateid, + &state->open_stateid); + data->arg.new_lock_owner = 1; + data->res.open_seqid = data->arg.open_seqid; + } else { + data->arg.new_lock_owner = 0; + nfs4_stateid_copy(&data->arg.lock_stateid, + &data->lsp->ls_stateid); + } + if (!nfs4_valid_open_stateid(state)) { + data->rpc_status = -EBADF; + task->tk_action = NULL; + goto out_release_open_seqid; + } + data->timestamp = jiffies; + if (nfs4_setup_sequence(data->server, + &data->arg.seq_args, + &data->res.seq_res, + task) == 0) + return; +out_release_open_seqid: + nfs_release_seqid(data->arg.open_seqid); +out_release_lock_seqid: + nfs_release_seqid(data->arg.lock_seqid); +out_wait: + nfs4_sequence_done(task, &data->res.seq_res); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); +} + +static void nfs4_lock_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + struct nfs4_lock_state *lsp = data->lsp; + + dprintk("%s: begin!\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + data->rpc_status = task->tk_status; + switch (task->tk_status) { + case 0: + renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), + data->timestamp); + if (data->arg.new_lock) { + data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); + if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) { + rpc_restart_call_prepare(task); + break; + } + } + if (data->arg.new_lock_owner != 0) { + nfs_confirm_seqid(&lsp->ls_seqid, 0); + nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid); + set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid)) + rpc_restart_call_prepare(task); + break; + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + if (data->arg.new_lock_owner != 0) { + if (!nfs4_stateid_match(&data->arg.open_stateid, + &lsp->ls_state->open_stateid)) + rpc_restart_call_prepare(task); + } else if (!nfs4_stateid_match(&data->arg.lock_stateid, + &lsp->ls_stateid)) + rpc_restart_call_prepare(task); + } + dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); +} + +static void nfs4_lock_release(void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __func__); + nfs_free_seqid(data->arg.open_seqid); + if (data->cancelled != 0) { + struct rpc_task *task; + task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, + data->arg.lock_seqid); + if (!IS_ERR(task)) + rpc_put_task_async(task); + dprintk("%s: cancelling lock!\n", __func__); + } else + nfs_free_seqid(data->arg.lock_seqid); + nfs4_put_lock_state(data->lsp); + put_nfs_open_context(data->ctx); + fput(data->fl.fl_file); + kfree(data); + dprintk("%s: done!\n", __func__); +} + +static const struct rpc_call_ops nfs4_lock_ops = { + .rpc_call_prepare = nfs4_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) +{ + switch (error) { + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + if (new_lock_owner != 0 || + test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) + nfs4_schedule_stateid_recovery(server, lsp->ls_state); + break; + case -NFS4ERR_STALE_STATEID: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); + }; +} + +static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) +{ + struct nfs4_lockdata *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], + .rpc_cred = state->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(state->inode), + .rpc_message = &msg, + .callback_ops = &nfs4_lock_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int ret; + + dprintk("%s: begin!\n", __func__); + data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), + fl->fl_u.nfs4_fl.owner, + recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS); + if (data == NULL) + return -ENOMEM; + if (IS_SETLKW(cmd)) + data->arg.block = 1; + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + task_setup_data.callback_data = data; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + nfs4_set_sequence_privileged(&data->arg.seq_args); + } else + data->arg.new_lock = 1; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + ret = nfs4_wait_for_completion_rpc_task(task); + if (ret == 0) { + ret = data->rpc_status; + if (ret) + nfs4_handle_setlk_error(data->server, data->lsp, + data->arg.new_lock_owner, ret); + } else + data->cancelled = 1; + rpc_put_task(task); + dprintk("%s: done, ret = %d!\n", __func__, ret); + return ret; +} + +static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { + .inode = state->inode, + }; + int err; + + do { + /* Cache the lock if possible... */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) + return 0; + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); + trace_nfs4_lock_reclaim(request, state, F_SETLK, err); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { + .inode = state->inode, + }; + int err; + + err = nfs4_set_lock_state(state, request); + if (err != 0) + return err; + if (!recover_lost_locks) { + set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags); + return 0; + } + do { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) + return 0; + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); + trace_nfs4_lock_expired(request, state, F_SETLK, err); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } + } while (exception.retry); +out: + return err; +} + +#if defined(CONFIG_NFS_V4_1) +/** + * nfs41_check_expired_locks - possibly free a lock stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. + */ +static int nfs41_check_expired_locks(struct nfs4_state *state) +{ + int status, ret = -NFS4ERR_BAD_STATEID; + struct nfs4_lock_state *lsp; + struct nfs_server *server = NFS_SERVER(state->inode); + + list_for_each_entry(lsp, &state->lock_states, ls_locks) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + status = nfs41_test_stateid(server, + &lsp->ls_stateid, + cred); + trace_nfs4_test_lock_stateid(state, lsp, status); + if (status != NFS_OK) { + /* Free the stateid unless the server + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) + nfs41_free_stateid(server, + &lsp->ls_stateid, + cred); + clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + ret = status; + } + } + }; + + return ret; +} + +static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + int status = NFS_OK; + + if (test_bit(LK_STATE_IN_USE, &state->flags)) + status = nfs41_check_expired_locks(state); + if (status != NFS_OK) + status = nfs4_lock_expired(state, request); + return status; +} +#endif + +static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + unsigned char fl_flags = request->fl_flags; + int status = -ENOLCK; + + if ((fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + goto out; + /* Is this a delegated open? */ + status = nfs4_set_lock_state(state, request); + if (status != 0) + goto out; + request->fl_flags |= FL_ACCESS; + status = do_vfs_lock(request->fl_file, request); + if (status < 0) + goto out; + down_read(&nfsi->rwsem); + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { + /* Yes: cache locks! */ + /* ...but avoid races with delegation recall... */ + request->fl_flags = fl_flags & ~FL_SLEEP; + status = do_vfs_lock(request->fl_file, request); + up_read(&nfsi->rwsem); + goto out; + } + up_read(&nfsi->rwsem); + status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); +out: + request->fl_flags = fl_flags; + return status; +} + +static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs4_exception exception = { + .state = state, + .inode = state->inode, + }; + int err; + + do { + err = _nfs4_proc_setlk(state, cmd, request); + trace_nfs4_set_lock(request, state, cmd, err); + if (err == -NFS4ERR_DENIED) + err = -EAGAIN; + err = nfs4_handle_exception(NFS_SERVER(state->inode), + err, &exception); + } while (exception.retry); + return err; +} + +static int +nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) +{ + struct nfs_open_context *ctx; + struct nfs4_state *state; + unsigned long timeout = NFS4_LOCK_MINTIMEOUT; + int status; + + /* verify open state */ + ctx = nfs_file_open_context(filp); + state = ctx->state; + + if (request->fl_start < 0 || request->fl_end < 0) + return -EINVAL; + + if (IS_GETLK(cmd)) { + if (state != NULL) + return nfs4_proc_getlk(state, F_GETLK, request); + return 0; + } + + if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) + return -EINVAL; + + if (request->fl_type == F_UNLCK) { + if (state != NULL) + return nfs4_proc_unlck(state, cmd, request); + return 0; + } + + if (state == NULL) + return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + + do { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + timeout = nfs4_set_lock_task_retry(timeout); + status = -ERESTARTSYS; + if (signalled()) + break; + } while(status < 0); + return status; +} + +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + int err; + + err = nfs4_set_lock_state(state, fl); + if (err != 0) + return err; + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); +} + +struct nfs_release_lockowner_data { + struct nfs4_lock_state *lsp; + struct nfs_server *server; + struct nfs_release_lockowner_args args; + struct nfs_release_lockowner_res res; + unsigned long timestamp; +}; + +static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_release_lockowner_data *data = calldata; + struct nfs_server *server = data->server; + nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, + &data->args.seq_args, &data->res.seq_res, task); + data->args.lock_owner.clientid = server->nfs_client->cl_clientid; + data->timestamp = jiffies; +} + +static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) +{ + struct nfs_release_lockowner_data *data = calldata; + struct nfs_server *server = data->server; + + nfs40_sequence_done(task, &data->res.seq_res); + + switch (task->tk_status) { + case 0: + renew_lease(server, data->timestamp); + break; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); + break; + case -NFS4ERR_LEASE_MOVED: + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, + NULL, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } +} + +static void nfs4_release_lockowner_release(void *calldata) +{ + struct nfs_release_lockowner_data *data = calldata; + nfs4_free_lock_state(data->server, data->lsp); + kfree(calldata); +} + +static const struct rpc_call_ops nfs4_release_lockowner_ops = { + .rpc_call_prepare = nfs4_release_lockowner_prepare, + .rpc_call_done = nfs4_release_lockowner_done, + .rpc_release = nfs4_release_lockowner_release, +}; + +static void +nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ + struct nfs_release_lockowner_data *data; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], + }; + + if (server->nfs_client->cl_mvops->minor_version != 0) + return; + + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return; + data->lsp = lsp; + data->server = server; + data->args.lock_owner.clientid = server->nfs_client->cl_clientid; + data->args.lock_owner.id = lsp->ls_seqid.owner_id; + data->args.lock_owner.s_dev = server->s_dev; + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); +} + +#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + +static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) +{ + if (strcmp(key, "") != 0) + return -EINVAL; + + return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); +} + +static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) +{ + if (strcmp(key, "") != 0) + return -EINVAL; + + return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); +} + +static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) +{ + size_t len = sizeof(XATTR_NAME_NFSV4_ACL); + + if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)))) + return 0; + + if (list && len <= list_len) + memcpy(list, XATTR_NAME_NFSV4_ACL, len); + return len; +} + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline int nfs4_server_supports_labels(struct nfs_server *server) +{ + return server->caps & NFS_CAP_SECURITY_LABEL; +} + +static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) +{ + if (security_ismaclabel(key)) + return nfs4_set_security_label(dentry, buf, buflen); + + return -EOPNOTSUPP; +} + +static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) +{ + if (security_ismaclabel(key)) + return nfs4_get_security_label(d_inode(dentry), buf, buflen); + return -EOPNOTSUPP; +} + +static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) +{ + size_t len = 0; + + if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(d_inode(dentry), NULL, 0); + if (list && len <= list_len) + security_inode_listsecurity(d_inode(dentry), list, len); + } + return len; +} + +static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = nfs4_xattr_list_nfs4_label, + .get = nfs4_xattr_get_nfs4_label, + .set = nfs4_xattr_set_nfs4_label, +}; +#endif + + +/* + * nfs_fhget will use either the mounted_on_fileid or the fileid + */ +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) +{ + if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || + (fattr->valid & NFS_ATTR_FATTR_FILEID)) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS))) + return; + + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, + struct nfs4_fs_locations *fs_locations, + struct page *page) +{ + struct nfs_server *server = NFS_SERVER(dir); + u32 bitmask[3] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + .page = page, + .bitmask = bitmask, + }; + struct nfs4_fs_locations_res res = { + .fs_locations = fs_locations, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("%s: start\n", __func__); + + /* Ask for the fileid of the absent filesystem if mounted_on_fileid + * is not supported */ + if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + else + bitmask[0] |= FATTR4_WORD0_FILEID; + + nfs_fattr_init(&fs_locations->fattr); + fs_locations->server = server; + fs_locations->nlocations = 0; + status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("%s: returned status = %d\n", __func__, status); + return status; +} + +int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, + struct nfs4_fs_locations *fs_locations, + struct page *page) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_fs_locations(client, dir, name, + fs_locations, page); + trace_nfs4_get_fs_locations(dir, name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop returning + * NFS4ERR_LEASE_MOVED to this client. A RENEW operation is + * appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .clientid = server->nfs_client->cl_clientid, + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + .renew = 1, /* append RENEW */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status) + return status; + + renew_lease(server, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID + * performing this operation is identified in the SEQUENCE + * operation in this compound. + * + * When the client supports GETATTR(fs_locations_info), it can + * be plumbed in here. + */ +static int _nfs41_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_get_locations - discover locations for a migrated FSID + * @inode: inode on FSID that is migrating + * @locations: result of query + * @page: buffer + * @cred: credential to use for this operation + * + * Returns NFS4_OK on success, a negative NFS4ERR status code if the + * operation failed, or a negative errno if a local error occurred. + * + * On success, "locations" is filled in, but if the server has + * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not + * asserted. + * + * -NFS4ERR_LEASE_MOVED is returned if the server still has leases + * from this client that require migration recovery. + */ +int nfs4_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->get_locations(inode, locations, page, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop + * returning NFS4ERR_LEASE_MOVED to this client. A RENEW operation + * is appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + .clientid = clp->cl_clientid, + .renew = 1, /* append RENEW */ + }; + struct nfs4_fsid_present_res res = { + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status) + return status; + + do_renew_lease(clp, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID performing + * this operation is identified in the SEQUENCE operation in this + * compound. + */ +static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + }; + struct nfs4_fsid_present_res res = { + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_fsid_present - Is this FSID present or absent on server? + * @inode: inode on FSID to check + * @cred: credential to use for this operation + * + * Server indicates whether the FSID is present, moved, or not + * recognized. This operation is necessary to clear a LEASE_MOVED + * condition for this client ID. + * + * Returns NFS4_OK if the FSID is present on this server, + * -NFS4ERR_MOVED if the FSID is no longer present, a negative + * NFS4ERR code if some error occurred on the server, or a + * negative errno if a local failure occurred. + */ +int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->fsid_present(inode, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + +/** + * If 'use_integrity' is true and the state managment nfs_client + * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient + * and the machine credential as per RFC3530bis and RFC5661 Security + * Considerations sections. Otherwise, just use the user cred with the + * filesystem's rpc_client. + */ +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) +{ + int status; + struct nfs4_secinfo_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct rpc_cred *cred = NULL; + + if (use_integrity) { + clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; + cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + msg.rpc_cred = cred; + } + + dprintk("NFS call secinfo %s\n", name->name); + + nfs4_state_protect(NFS_SERVER(dir)->nfs_client, + NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + + status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, + &res.seq_res, 0); + dprintk("NFS reply secinfo: %d\n", status); + + if (cred) + put_rpccred(cred); + + return status; +} + +int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, + struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = -NFS4ERR_WRONGSEC; + + /* try to use integrity protection with machine cred */ + if (_nfs4_is_integrity_protected(NFS_SERVER(dir)->nfs_client)) + err = _nfs4_proc_secinfo(dir, name, flavors, true); + + /* + * if unable to use integrity protection, or SECINFO with + * integrity protection returns NFS4ERR_WRONGSEC (which is + * disallowed by spec, but exists in deployed servers) use + * the current filesystem's rpc_client and the user cred. + */ + if (err == -NFS4ERR_WRONGSEC) + err = _nfs4_proc_secinfo(dir, name, flavors, false); + + trace_nfs4_secinfo(dir, name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Check the exchange flags returned by the server for invalid flags, having + * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or + * DS flags set. + */ +static int nfs4_check_cl_exchange_flags(u32 flags) +{ + if (flags & ~EXCHGID4_FLAG_MASK_R) + goto out_inval; + if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && + (flags & EXCHGID4_FLAG_USE_NON_PNFS)) + goto out_inval; + if (!(flags & (EXCHGID4_FLAG_MASK_PNFS))) + goto out_inval; + return NFS_OK; +out_inval: + return -NFS4ERR_INVAL; +} + +static bool +nfs41_same_server_scope(struct nfs41_server_scope *a, + struct nfs41_server_scope *b) +{ + if (a->server_scope_sz == b->server_scope_sz && + memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) + return true; + + return false; +} + +/* + * nfs4_proc_bind_conn_to_session() + * + * The 4.1 client currently uses the same TCP connection for the + * fore and backchannel. + */ +int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + struct nfs41_bind_conn_to_session_args args = { + .client = clp, + .dir = NFS4_CDFC4_FORE_OR_BOTH, + }; + struct nfs41_bind_conn_to_session_res res; + struct rpc_message msg = { + .rpc_proc = + &nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + + dprintk("--> %s\n", __func__); + + nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id); + if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) + args.dir = NFS4_CDFC4_FORE; + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_bind_conn_to_session(clp, status); + if (status == 0) { + if (memcmp(res.sessionid.data, + clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { + dprintk("NFS: %s: Session ID mismatch\n", __func__); + status = -EIO; + goto out; + } + if ((res.dir & args.dir) != res.dir || res.dir == 0) { + dprintk("NFS: %s: Unexpected direction from server\n", + __func__); + status = -EIO; + goto out; + } + if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) { + dprintk("NFS: %s: Server returned RDMA mode = true\n", + __func__); + status = -EIO; + goto out; + } + } +out: + dprintk("<-- %s status= %d\n", __func__, status); + return status; +} + +/* + * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map + * and operations we'd like to see to enable certain features in the allow map + */ +static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { + .how = SP4_MACH_CRED, + .enforce.u.words = { + [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32) + }, + .allow.u.words = { + [0] = 1 << (OP_CLOSE) | + 1 << (OP_LOCKU) | + 1 << (OP_COMMIT), + [1] = 1 << (OP_SECINFO - 32) | + 1 << (OP_SECINFO_NO_NAME - 32) | + 1 << (OP_TEST_STATEID - 32) | + 1 << (OP_FREE_STATEID - 32) | + 1 << (OP_WRITE - 32) + } +}; + +/* + * Select the state protection mode for client `clp' given the server results + * from exchange_id in `sp'. + * + * Returns 0 on success, negative errno otherwise. + */ +static int nfs4_sp4_select_mode(struct nfs_client *clp, + struct nfs41_state_protection *sp) +{ + static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = { + [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32) + }; + unsigned int i; + + if (sp->how == SP4_MACH_CRED) { + /* Print state protect result */ + dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n"); + for (i = 0; i <= LAST_NFS4_OP; i++) { + if (test_bit(i, sp->enforce.u.longs)) + dfprintk(MOUNT, " enforce op %d\n", i); + if (test_bit(i, sp->allow.u.longs)) + dfprintk(MOUNT, " allow op %d\n", i); + } + + /* make sure nothing is on enforce list that isn't supported */ + for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) { + if (sp->enforce.u.words[i] & ~supported_enforce[i]) { + dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); + return -EINVAL; + } + } + + /* + * Minimal mode - state operations are allowed to use machine + * credential. Note this already happens by default, so the + * client doesn't have to do anything more than the negotiation. + * + * NOTE: we don't care if EXCHANGE_ID is in the list - + * we're already using the machine cred for exchange_id + * and will never use a different cred. + */ + if (test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) && + test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) && + test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) && + test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) { + dfprintk(MOUNT, "sp4_mach_cred:\n"); + dfprintk(MOUNT, " minimal mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags); + } else { + dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); + return -EINVAL; + } + + if (test_bit(OP_CLOSE, sp->allow.u.longs) && + test_bit(OP_LOCKU, sp->allow.u.longs)) { + dfprintk(MOUNT, " cleanup mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); + } + + if (test_bit(OP_SECINFO, sp->allow.u.longs) && + test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { + dfprintk(MOUNT, " secinfo mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags); + } + + if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) && + test_bit(OP_FREE_STATEID, sp->allow.u.longs)) { + dfprintk(MOUNT, " stateid mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags); + } + + if (test_bit(OP_WRITE, sp->allow.u.longs)) { + dfprintk(MOUNT, " write mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags); + } + + if (test_bit(OP_COMMIT, sp->allow.u.longs)) { + dfprintk(MOUNT, " commit mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags); + } + } + + return 0; +} + +/* + * _nfs4_proc_exchange_id() + * + * Wrapper for EXCHANGE_ID operation. + */ +static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, + u32 sp4_how) +{ + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .verifier = &verifier, + .client = clp, +#ifdef CONFIG_NFS_V4_1_MIGRATION + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif + }; + struct nfs41_exchange_id_res res = { + 0 + }; + int status; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + + nfs4_init_boot_verifier(clp, &verifier); + args.id_len = nfs4_init_uniform_client_string(clp, args.id, + sizeof(args.id)); + dprintk("NFS call exchange_id auth=%s, '%.*s'\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + args.id_len, args.id); + + res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), + GFP_NOFS); + if (unlikely(res.server_owner == NULL)) { + status = -ENOMEM; + goto out; + } + + res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), + GFP_NOFS); + if (unlikely(res.server_scope == NULL)) { + status = -ENOMEM; + goto out_server_owner; + } + + res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); + if (unlikely(res.impl_id == NULL)) { + status = -ENOMEM; + goto out_server_scope; + } + + switch (sp4_how) { + case SP4_NONE: + args.state_protect.how = SP4_NONE; + break; + + case SP4_MACH_CRED: + args.state_protect = nfs4_sp4_mach_cred_request; + break; + + default: + /* unsupported! */ + WARN_ON_ONCE(1); + status = -EINVAL; + goto out_server_scope; + } + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_exchange_id(clp, status); + if (status == 0) + status = nfs4_check_cl_exchange_flags(res.flags); + + if (status == 0) + status = nfs4_sp4_select_mode(clp, &res.state_protect); + + if (status == 0) { + clp->cl_clientid = res.clientid; + clp->cl_exchange_flags = res.flags; + /* Client ID is not confirmed */ + if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); + clp->cl_seqid = res.seqid; + } + + kfree(clp->cl_serverowner); + clp->cl_serverowner = res.server_owner; + res.server_owner = NULL; + + /* use the most recent implementation id */ + kfree(clp->cl_implid); + clp->cl_implid = res.impl_id; + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->cl_serverscope); + clp->cl_serverscope = NULL; + } + + if (clp->cl_serverscope == NULL) { + clp->cl_serverscope = res.server_scope; + goto out; + } + } else + kfree(res.impl_id); + +out_server_owner: + kfree(res.server_owner); +out_server_scope: + kfree(res.server_scope); +out: + if (clp->cl_implid != NULL) + dprintk("NFS reply exchange_id: Server Implementation ID: " + "domain: %s, name: %s, date: %llu,%u\n", + clp->cl_implid->domain, clp->cl_implid->name, + clp->cl_implid->date.seconds, + clp->cl_implid->date.nseconds); + dprintk("NFS reply exchange_id: %d\n", status); + return status; +} + +/* + * nfs4_proc_exchange_id() + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + * + * Since the clientid has expired, all compounds using sessions + * associated with the stale clientid will be returning + * NFS4ERR_BADSESSION in the sequence operation, and will therefore + * be in some phase of session reset. + * + * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used. + */ +int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +{ + rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor; + int status; + + /* try SP4_MACH_CRED if krb5i/p */ + if (authflavor == RPC_AUTH_GSS_KRB5I || + authflavor == RPC_AUTH_GSS_KRB5P) { + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); + if (!status) + return 0; + } + + /* try SP4_NONE */ + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); +} + +static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, + struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID], + .rpc_argp = clp, + .rpc_cred = cred, + }; + int status; + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_destroy_clientid(clp, status); + if (status) + dprintk("NFS: Got error %d from the server %s on " + "DESTROY_CLIENTID.", status, clp->cl_hostname); + return status; +} + +static int nfs4_proc_destroy_clientid(struct nfs_client *clp, + struct rpc_cred *cred) +{ + unsigned int loop; + int ret; + + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = _nfs4_proc_destroy_clientid(clp, cred); + switch (ret) { + case -NFS4ERR_DELAY: + case -NFS4ERR_CLIENTID_BUSY: + ssleep(1); + break; + default: + return ret; + } + } + return 0; +} + +int nfs4_destroy_clientid(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int ret = 0; + + if (clp->cl_mvops->minor_version < 1) + goto out; + if (clp->cl_exchange_flags == 0) + goto out; + if (clp->cl_preserve_clid) + goto out; + cred = nfs4_get_clid_cred(clp); + ret = nfs4_proc_destroy_clientid(clp, cred); + if (cred) + put_rpccred(cred); + switch (ret) { + case 0: + case -NFS4ERR_STALE_CLIENTID: + clp->cl_exchange_flags = 0; + } +out: + return ret; +} + +struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; + struct nfs4_get_lease_time_res *res; + struct nfs_client *clp; +}; + +static void nfs4_get_lease_time_prepare(struct rpc_task *task, + void *calldata) +{ + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + /* just setup sequence, do not trigger session recovery + since we're invoked within one */ + nfs41_setup_sequence(data->clp->cl_session, + &data->args->la_seq_args, + &data->res->lr_seq_res, + task); + dprintk("<-- %s\n", __func__); +} + +/* + * Called from nfs4_state_manager thread for session setup, so don't recover + * from sequence operation or clientid errors. + */ +static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); + rpc_delay(task, NFS4_POLL_RETRY_MIN); + task->tk_status = 0; + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: + rpc_restart_call_prepare(task); + return; + } + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_get_lease_time_ops = { + .rpc_call_prepare = nfs4_get_lease_time_prepare, + .rpc_call_done = nfs4_get_lease_time_done, +}; + +int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) +{ + struct rpc_task *task; + struct nfs4_get_lease_time_args args; + struct nfs4_get_lease_time_res res = { + .lr_fsinfo = fsinfo, + }; + struct nfs4_get_lease_time_data data = { + .args = &args, + .res = &res, + .clp = clp, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_get_lease_time_ops, + .callback_data = &data, + .flags = RPC_TASK_TIMEOUT, + }; + int status; + + nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); + nfs4_set_sequence_privileged(&args.la_seq_args); + dprintk("--> %s\n", __func__); + task = rpc_run_task(&task_setup); + + if (IS_ERR(task)) + status = PTR_ERR(task); + else { + status = task->tk_status; + rpc_put_task(task); + } + dprintk("<-- %s return %d\n", __func__, status); + + return status; +} + +/* + * Initialize the values to be used by the client in CREATE_SESSION + * If nfs4_init_session set the fore channel request and response sizes, + * use them. + * + * Set the back channel max_resp_sz_cached to zero to force the client to + * always set csa_cachethis to FALSE because the current implementation + * of the back channel DRC only supports caching the CB_SEQUENCE operation. + */ +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +{ + unsigned int max_rqst_sz, max_resp_sz; + + max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; + max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; + + /* Fore channel attributes */ + args->fc_attrs.max_rqst_sz = max_rqst_sz; + args->fc_attrs.max_resp_sz = max_resp_sz; + args->fc_attrs.max_ops = NFS4_MAX_OPS; + args->fc_attrs.max_reqs = max_session_slots; + + dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_ops=%u max_reqs=%u\n", + __func__, + args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, + args->fc_attrs.max_ops, args->fc_attrs.max_reqs); + + /* Back channel attributes */ + args->bc_attrs.max_rqst_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz_cached = 0; + args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; + args->bc_attrs.max_reqs = 1; + + dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", + __func__, + args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz, + args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops, + args->bc_attrs.max_reqs); +} + +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) +{ + struct nfs4_channel_attrs *sent = &args->fc_attrs; + struct nfs4_channel_attrs *rcvd = &res->fc_attrs; + + if (rcvd->max_resp_sz > sent->max_resp_sz) + return -EINVAL; + /* + * Our requested max_ops is the minimum we need; we're not + * prepared to break up compounds into smaller pieces than that. + * So, no point even trying to continue if the server won't + * cooperate: + */ + if (rcvd->max_ops < sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE) + rcvd->max_reqs = NFS4_MAX_SLOT_TABLE; + return 0; +} + +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) +{ + struct nfs4_channel_attrs *sent = &args->bc_attrs; + struct nfs4_channel_attrs *rcvd = &res->bc_attrs; + + if (!(res->flags & SESSION4_BACK_CHAN)) + goto out; + if (rcvd->max_rqst_sz > sent->max_rqst_sz) + return -EINVAL; + if (rcvd->max_resp_sz < sent->max_resp_sz) + return -EINVAL; + if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) + return -EINVAL; + /* These would render the backchannel useless: */ + if (rcvd->max_ops != sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs != sent->max_reqs) + return -EINVAL; +out: + return 0; +} + +static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) +{ + int ret; + + ret = nfs4_verify_fore_channel_attrs(args, res); + if (ret) + return ret; + return nfs4_verify_back_channel_attrs(args, res); +} + +static void nfs4_update_session(struct nfs4_session *session, + struct nfs41_create_session_res *res) +{ + nfs4_copy_sessionid(&session->sess_id, &res->sessionid); + /* Mark client id and session as being confirmed */ + session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state); + session->flags = res->flags; + memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs)); + if (res->flags & SESSION4_BACK_CHAN) + memcpy(&session->bc_attrs, &res->bc_attrs, + sizeof(session->bc_attrs)); +} + +static int _nfs4_proc_create_session(struct nfs_client *clp, + struct rpc_cred *cred) +{ + struct nfs4_session *session = clp->cl_session; + struct nfs41_create_session_args args = { + .client = clp, + .clientid = clp->cl_clientid, + .seqid = clp->cl_seqid, + .cb_program = NFS4_CALLBACK, + }; + struct nfs41_create_session_res res; + + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + nfs4_init_channel_attrs(&args); + args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); + + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_create_session(clp, status); + + if (!status) { + /* Verify the session's negotiated channel_attrs values */ + status = nfs4_verify_channel_attrs(&args, &res); + /* Increment the clientid slot sequence id */ + if (clp->cl_seqid == res.seqid) + clp->cl_seqid++; + if (status) + goto out; + nfs4_update_session(session, &res); + } +out: + return status; +} + +/* + * Issues a CREATE_SESSION operation to the server. + * It is the responsibility of the caller to verify the session is + * expired before calling this routine. + */ +int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + unsigned *ptr; + struct nfs4_session *session = clp->cl_session; + + dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); + + status = _nfs4_proc_create_session(clp, cred); + if (status) + goto out; + + /* Init or reset the session slot tables */ + status = nfs4_setup_session_slot_tables(session); + dprintk("slot table setup returned %d\n", status); + if (status) + goto out; + + ptr = (unsigned *)&session->sess_id.data[0]; + dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, + clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); +out: + dprintk("<-- %s\n", __func__); + return status; +} + +/* + * Issue the over-the-wire RPC DESTROY_SESSION. + * The caller must serialize access to this routine. + */ +int nfs4_proc_destroy_session(struct nfs4_session *session, + struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION], + .rpc_argp = session, + .rpc_cred = cred, + }; + int status = 0; + + dprintk("--> nfs4_proc_destroy_session\n"); + + /* session is still being setup */ + if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state)) + return 0; + + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_destroy_session(session->clp, status); + + if (status) + dprintk("NFS: Got error %d from the server on DESTROY_SESSION. " + "Session has been destroyed regardless...\n", status); + + dprintk("<-- nfs4_proc_destroy_session\n"); + return status; +} + +/* + * Renew the cl_session lease. + */ +struct nfs4_sequence_data { + struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +}; + +static void nfs41_sequence_release(void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); + kfree(calldata); +} + +static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_lease_recovery(clp); + } + return 0; +} + +static void nfs41_sequence_call_done(struct rpc_task *task, void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + + if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) + return; + + trace_nfs4_sequence(clp, task->tk_status); + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + + if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); +out: + dprintk("<-- %s\n", __func__); +} + +static void nfs41_sequence_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + + nfs41_setup_sequence(clp->cl_session, args, res, task); +} + +static const struct rpc_call_ops nfs41_sequence_ops = { + .rpc_call_done = nfs41_sequence_call_done, + .rpc_call_prepare = nfs41_sequence_prepare, + .rpc_release = nfs41_sequence_release, +}; + +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + struct rpc_cred *cred, + bool is_privileged) +{ + struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs41_sequence_ops, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + }; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return ERR_PTR(-EIO); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) { + nfs_put_client(clp); + return ERR_PTR(-ENOMEM); + } + nfs4_init_sequence(&calldata->args, &calldata->res, 0); + if (is_privileged) + nfs4_set_sequence_privileged(&calldata->args); + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + calldata->clp = clp; + task_setup_data.callback_data = calldata; + + return rpc_run_task(&task_setup_data); +} + +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) +{ + struct rpc_task *task; + int ret = 0; + + if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) + return -EAGAIN; + task = _nfs41_proc_sequence(clp, cred, false); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else + rpc_put_task_async(task); + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_proc_sequence(clp, cred, true); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + ret = rpc_wait_for_completion_task(task); + if (!ret) { + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + + if (task->tk_status == 0) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + ret = task->tk_status; + } + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +struct nfs4_reclaim_complete_data { + struct nfs_client *clp; + struct nfs41_reclaim_complete_args arg; + struct nfs41_reclaim_complete_res res; +}; + +static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, + &calldata->res.seq_res, + task); +} + +static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + case -NFS4ERR_WRONG_CRED: /* What to do here? */ + break; + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: + return -EAGAIN; + default: + nfs4_schedule_lease_recovery(clp); + } + return 0; +} + +static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); + if (!nfs41_sequence_done(task, res)) + return; + + trace_nfs4_reclaim_complete(clp, task->tk_status); + if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_free_reclaim_complete_data(void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + kfree(calldata); +} + +static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { + .rpc_call_prepare = nfs4_reclaim_complete_prepare, + .rpc_call_done = nfs4_reclaim_complete_done, + .rpc_release = nfs4_free_reclaim_complete_data, +}; + +/* + * Issue a global reclaim complete. + */ +static int nfs41_proc_reclaim_complete(struct nfs_client *clp, + struct rpc_cred *cred) +{ + struct nfs4_reclaim_complete_data *calldata; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_reclaim_complete_call_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) + goto out; + calldata->clp = clp; + calldata->arg.one_fs = 0; + + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); + nfs4_set_sequence_privileged(&calldata->arg.seq_args); + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out; + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; + rpc_put_task(task); + return 0; +out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static void +nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct nfs4_session *session = nfs4_get_session(server); + + dprintk("--> %s\n", __func__); + /* Note the is a race here, where a CB_LAYOUTRECALL can come in + * right now covering the LAYOUTGET we are about to send. + * However, that is not so catastrophic, and there seems + * to be no way to prevent it completely. + */ + if (nfs41_setup_sequence(session, &lgp->args.seq_args, + &lgp->res.seq_res, task)) + return; + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + NFS_I(lgp->args.inode)->layout, + &lgp->args.range, + lgp->args.ctx->state)) { + rpc_exit(task, NFS4_OK); + } +} + +static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; + struct nfs4_state *state = NULL; + unsigned long timeo, now, giveup; + + dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); + + if (!nfs41_sequence_done(task, &lgp->res.seq_res)) + goto out; + + switch (task->tk_status) { + case 0: + goto out; + /* + * NFS4ERR_LAYOUTTRYLATER is a conflict with another client + * (or clients) writing to the same RAID stripe + */ + case -NFS4ERR_LAYOUTTRYLATER: + /* + * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall + * existing layout before getting a new one). + */ + case -NFS4ERR_RECALLCONFLICT: + timeo = rpc_get_timeout(task->tk_client); + giveup = lgp->args.timestamp + timeo; + now = jiffies; + if (time_after(giveup, now)) { + unsigned long delay; + + /* Delay for: + * - Not less then NFS4_POLL_RETRY_MIN. + * - One last time a jiffie before we give up + * - exponential backoff (time_now minus start_attempt) + */ + delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, + min((giveup - now - 1), + now - lgp->args.timestamp)); + + dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", + __func__, delay); + rpc_delay(task, delay); + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out; /* Do not call nfs4_async_handle_error() */ + } + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (!lo || list_empty(&lo->plh_segs)) { + spin_unlock(&inode->i_lock); + /* If the open stateid was bad, then recover it. */ + state = lgp->args.ctx->state; + } else { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + + task->tk_status = 0; + rpc_restart_call_prepare(task); + } + } + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); +out: + dprintk("<-- %s\n", __func__); +} + +static size_t max_response_pages(struct nfs_server *server) +{ + u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + return nfs_page_array_len(0, max_resp_sz); +} + +static void nfs4_free_pages(struct page **pages, size_t size) +{ + int i; + + if (!pages) + return; + + for (i = 0; i < size; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); +} + +static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) +{ + struct page **pages; + int i; + + pages = kcalloc(size, sizeof(struct page *), gfp_flags); + if (!pages) { + dprintk("%s: can't alloc array of %zu pages\n", __func__, size); + return NULL; + } + + for (i = 0; i < size; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) { + dprintk("%s: failed to allocate page\n", __func__); + nfs4_free_pages(pages, size); + return NULL; + } + } + + return pages; +} + +static void nfs4_layoutget_release(void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); + size_t max_pages = max_response_pages(server); + + dprintk("--> %s\n", __func__); + nfs4_free_pages(lgp->args.layout.pages, max_pages); + pnfs_put_layout_hdr(NFS_I(inode)->layout); + put_nfs_open_context(lgp->args.ctx); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutget_call_ops = { + .rpc_call_prepare = nfs4_layoutget_prepare, + .rpc_call_done = nfs4_layoutget_done, + .rpc_release = nfs4_layoutget_release, +}; + +struct pnfs_layout_segment * +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +{ + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); + size_t max_pages = max_response_pages(server); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], + .rpc_argp = &lgp->args, + .rpc_resp = &lgp->res, + .rpc_cred = lgp->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutget_call_ops, + .callback_data = lgp, + .flags = RPC_TASK_ASYNC, + }; + struct pnfs_layout_segment *lseg = NULL; + int status = 0; + + dprintk("--> %s\n", __func__); + + /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ + pnfs_get_layout_hdr(NFS_I(inode)->layout); + + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); + if (!lgp->args.layout.pages) { + nfs4_layoutget_release(lgp); + return ERR_PTR(-ENOMEM); + } + lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->args.timestamp = jiffies; + + lgp->res.layoutp = &lgp->args.layout; + lgp->res.seq_res.sr_slot = NULL; + nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return ERR_CAST(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; + trace_nfs4_layoutget(lgp->args.ctx, + &lgp->args.range, + &lgp->res.range, + status); + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (status == 0 && lgp->res.layoutp->len) + lseg = pnfs_layout_process(lgp); + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); + if (status) + return ERR_PTR(status); + return lseg; +} + +static void +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + + dprintk("--> %s\n", __func__); + nfs41_setup_sequence(lrp->clp->cl_session, + &lrp->args.seq_args, + &lrp->res.seq_res, + task); +} + +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + struct nfs_server *server; + + dprintk("--> %s\n", __func__); + + if (!nfs41_sequence_done(task, &lrp->res.seq_res)) + return; + + server = NFS_SERVER(lrp->args.inode); + switch (task->tk_status) { + default: + task->tk_status = 0; + case 0: + break; + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) + break; + rpc_restart_call_prepare(task); + return; + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutreturn_release(void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + struct pnfs_layout_hdr *lo = lrp->args.layout; + + dprintk("--> %s\n", __func__); + spin_lock(&lo->plh_inode->i_lock); + if (lrp->res.lrs_present) + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + pnfs_clear_layoutreturn_waitbit(lo); + clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); + pnfs_put_layout_hdr(lrp->args.layout); + nfs_iput_and_deactive(lrp->inode); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { + .rpc_call_prepare = nfs4_layoutreturn_prepare, + .rpc_call_done = nfs4_layoutreturn_done, + .rpc_release = nfs4_layoutreturn_release, +}; + +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], + .rpc_argp = &lrp->args, + .rpc_resp = &lrp->res, + .rpc_cred = lrp->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_SERVER(lrp->args.inode)->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutreturn_call_ops, + .callback_data = lrp, + }; + int status = 0; + + dprintk("--> %s\n", __func__); + if (!sync) { + lrp->inode = nfs_igrab_and_active(lrp->args.inode); + if (!lrp->inode) { + nfs4_layoutreturn_release(lrp); + return -EAGAIN; + } + task_setup_data.flags |= RPC_TASK_ASYNC; + } + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (sync) + status = task->tk_status; + trace_nfs4_layoutreturn(lrp->args.inode, status); + dprintk("<-- %s status=%d\n", __func__, status); + rpc_put_task(task); + return status; +} + +static int +_nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *pdev, + struct rpc_cred *cred) +{ + struct nfs4_getdeviceinfo_args args = { + .pdev = pdev, + .notify_types = NOTIFY_DEVICEID4_CHANGE | + NOTIFY_DEVICEID4_DELETE, + }; + struct nfs4_getdeviceinfo_res res = { + .pdev = pdev, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (res.notification & ~args.notify_types) + dprintk("%s: unsupported notification\n", __func__); + if (res.notification != args.notify_types) + pdev->nocache = 1; + + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +} + +int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *pdev, + struct rpc_cred *cred) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getdeviceinfo(server, pdev, cred), + &exception); + } while (exception.retry); + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); + +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + struct nfs4_session *session = nfs4_get_session(server); + + nfs41_setup_sequence(session, + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (!nfs41_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { /* Just ignore these failures */ + case -NFS4ERR_DELEG_REVOKED: /* layout was recalled */ + case -NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ + case -NFS4ERR_BADLAYOUT: /* no layout */ + case -NFS4ERR_GRACE: /* loca_recalim always false */ + task->tk_status = 0; + case 0: + break; + default: + if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } +} + +static void nfs4_layoutcommit_release(void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + + pnfs_cleanup_layoutcommit(data); + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); + put_rpccred(data->cred); + nfs_iput_and_deactive(data->inode); + kfree(data); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { + .rpc_call_prepare = nfs4_layoutcommit_prepare, + .rpc_call_done = nfs4_layoutcommit_done, + .rpc_release = nfs4_layoutcommit_release, +}; + +int +nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(data->args.inode), + .rpc_message = &msg, + .callback_ops = &nfs4_layoutcommit_ops, + .callback_data = data, + }; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: %4d initiating layoutcommit call. sync %d " + "lbw: %llu inode %lu\n", + data->task.tk_pid, sync, + data->args.lastbytewritten, + data->args.inode->i_ino); + + if (!sync) { + data->inode = nfs_igrab_and_active(data->args.inode); + if (data->inode == NULL) { + nfs4_layoutcommit_release(data); + return -EAGAIN; + } + task_setup_data.flags = RPC_TASK_ASYNC; + } + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (sync) + status = task->tk_status; + trace_nfs4_layoutcommit(data->args.inode, status); + dprintk("%s: status %d\n", __func__, status); + rpc_put_task(task); + return status; +} + +/** + * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if + * possible) as per RFC3530bis and RFC5661 Security Considerations sections + */ +static int +_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, + struct nfs4_secinfo_flavors *flavors, bool use_integrity) +{ + struct nfs41_secinfo_no_name_args args = { + .style = SECINFO_STYLE_CURRENT_FH, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_clnt *clnt = server->client; + struct rpc_cred *cred = NULL; + int status; + + if (use_integrity) { + clnt = server->nfs_client->cl_rpcclient; + cred = nfs4_get_clid_cred(server->nfs_client); + msg.rpc_cred = cred; + } + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + if (cred) + put_rpccred(cred); + + return status; +} + +static int +nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + /* first try using integrity protection */ + err = -NFS4ERR_WRONGSEC; + + /* try to use integrity protection with machine cred */ + if (_nfs4_is_integrity_protected(server->nfs_client)) + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + flavors, true); + + /* + * if unable to use integrity protection, or SECINFO with + * integrity protection returns NFS4ERR_WRONGSEC (which is + * disallowed by spec, but exists in deployed servers) use + * the current filesystem's rpc_client and the user cred. + */ + if (err == -NFS4ERR_WRONGSEC) + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + flavors, false); + + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + case -ENOTSUPP: + goto out; + default: + err = nfs4_handle_exception(server, err, &exception); + } + } while (exception.retry); +out: + return err; +} + +static int +nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int err; + struct page *page; + rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; + struct nfs4_secinfo_flavors *flavors; + struct nfs4_secinfo4 *secinfo; + int i; + + page = alloc_page(GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + flavors = page_address(page); + err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + + /* + * Fall back on "guess and check" method if + * the server doesn't support SECINFO_NO_NAME + */ + if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { + err = nfs4_find_root_sec(server, fhandle, info); + goto out_freepage; + } + if (err) + goto out_freepage; + + for (i = 0; i < flavors->num_flavors; i++) { + secinfo = &flavors->flavors[i]; + + switch (secinfo->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + case RPC_AUTH_GSS: + flavor = rpcauth_get_pseudoflavor(secinfo->flavor, + &secinfo->flavor_info); + break; + default: + flavor = RPC_AUTH_MAXFLAVOR; + break; + } + + if (!nfs_auth_info_match(&server->auth_info, flavor)) + flavor = RPC_AUTH_MAXFLAVOR; + + if (flavor != RPC_AUTH_MAXFLAVOR) { + err = nfs4_lookup_root_sec(server, fhandle, + info, flavor); + if (!err) + break; + } + } + + if (flavor == RPC_AUTH_MAXFLAVOR) + err = -EPERM; + +out_freepage: + put_page(page); + if (err == -EACCES) + return -EPERM; +out: + return err; +} + +static int _nfs41_test_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + int status; + struct nfs41_test_stateid_args args = { + .stateid = stateid, + }; + struct nfs41_test_stateid_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + struct rpc_clnt *rpc_client = server->client; + + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, + &rpc_client, &msg); + + dprintk("NFS call test_stateid %p\n", stateid); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(rpc_client, server, &msg, + &args.seq_args, &res.seq_res); + if (status != NFS_OK) { + dprintk("NFS reply test_stateid: failed, %d\n", status); + return status; + } + dprintk("NFS reply test_stateid: succeeded, %d\n", -res.status); + return -res.status; +} + +/** + * nfs41_test_stateid - perform a TEST_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to test + * @cred: credential + * + * Returns NFS_OK if the server recognizes that "stateid" is valid. + * Otherwise a negative NFS4ERR value is returned if the operation + * failed or the state ID is not currently valid. + */ +static int nfs41_test_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs41_test_stateid(server, stateid, cred); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +struct nfs_free_stateid_data { + struct nfs_server *server; + struct nfs41_free_stateid_args args; + struct nfs41_free_stateid_res res; +}; + +static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + nfs41_setup_sequence(nfs4_get_session(data->server), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + + nfs41_sequence_done(task, &data->res.seq_res); + + switch (task->tk_status) { + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } +} + +static void nfs41_free_stateid_release(void *calldata) +{ + kfree(calldata); +} + +static const struct rpc_call_ops nfs41_free_stateid_ops = { + .rpc_call_prepare = nfs41_free_stateid_prepare, + .rpc_call_done = nfs41_free_stateid_done, + .rpc_release = nfs41_free_stateid_release, +}; + +static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred, + bool privileged) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs41_free_stateid_ops, + .flags = RPC_TASK_ASYNC, + }; + struct nfs_free_stateid_data *data; + + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, + &task_setup.rpc_client, &msg); + + dprintk("NFS call free_stateid %p\n", stateid); + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + data->server = server; + nfs4_stateid_copy(&data->args.stateid, stateid); + + task_setup.callback_data = data; + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + if (privileged) + nfs4_set_sequence_privileged(&data->args.seq_args); + + return rpc_run_task(&task_setup); +} + +/** + * nfs41_free_stateid - perform a FREE_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to release + * @cred: credential + * + * Returns NFS_OK if the server freed "stateid". Otherwise a + * negative NFS4ERR value is returned. + */ +static int nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_free_stateid(server, stateid, cred, true); + if (IS_ERR(task)) + return PTR_ERR(task); + ret = rpc_wait_for_completion_task(task); + if (!ret) + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + +static void +nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ + struct rpc_task *task; + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); + nfs4_free_lock_state(server, lsp); + if (IS_ERR(task)) + return; + rpc_put_task(task); +} + +static bool nfs41_match_stateid(const nfs4_stateid *s1, + const nfs4_stateid *s2) +{ + if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) + return false; + + if (s1->seqid == s2->seqid) + return true; + if (s1->seqid == 0 || s2->seqid == 0) + return true; + + return false; +} + +#endif /* CONFIG_NFS_V4_1 */ + +static bool nfs4_match_stateid(const nfs4_stateid *s1, + const nfs4_stateid *s2) +{ + return nfs4_stateid_match(s1, s2); +} + + +static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, + .recover_open = nfs4_open_reclaim, + .recover_lock = nfs4_lock_reclaim, + .establish_clid = nfs4_init_clientid, + .detect_trunking = nfs40_discover_server_trunking, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, + .recover_open = nfs4_open_reclaim, + .recover_lock = nfs4_lock_reclaim, + .establish_clid = nfs41_init_clientid, + .reclaim_complete = nfs41_proc_reclaim_complete, + .detect_trunking = nfs41_discover_server_trunking, +}; +#endif /* CONFIG_NFS_V4_1 */ + +static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, + .recover_open = nfs40_open_expired, + .recover_lock = nfs4_lock_expired, + .establish_clid = nfs4_init_clientid, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, + .recover_open = nfs41_open_expired, + .recover_lock = nfs41_lock_expired, + .establish_clid = nfs41_init_clientid, +}; +#endif /* CONFIG_NFS_V4_1 */ + +static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { + .sched_state_renewal = nfs4_proc_async_renew, + .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked, + .renew_lease = nfs4_proc_renew, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { + .sched_state_renewal = nfs41_proc_async_sequence, + .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked, + .renew_lease = nfs4_proc_sequence, +}; +#endif + +static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = { + .get_locations = _nfs40_proc_get_locations, + .fsid_present = _nfs40_proc_fsid_present, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { + .get_locations = _nfs41_proc_get_locations, + .fsid_present = _nfs41_proc_fsid_present, +}; +#endif /* CONFIG_NFS_V4_1 */ + +static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { + .minor_version = 0, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK, + .init_client = nfs40_init_client, + .shutdown_client = nfs40_shutdown_client, + .match_stateid = nfs4_match_stateid, + .find_root_sec = nfs4_find_root_sec, + .free_lock_state = nfs4_release_lockowner, + .alloc_seqid = nfs_alloc_seqid, + .call_sync_ops = &nfs40_call_sync_ops, + .reboot_recovery_ops = &nfs40_reboot_recovery_ops, + .nograce_recovery_ops = &nfs40_nograce_recovery_ops, + .state_renewal_ops = &nfs40_state_renewal_ops, + .mig_recovery_ops = &nfs40_mig_recovery_ops, +}; + +#if defined(CONFIG_NFS_V4_1) +static struct nfs_seqid * +nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2) +{ + return NULL; +} + +static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { + .minor_version = 1, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1, + .init_client = nfs41_init_client, + .shutdown_client = nfs41_shutdown_client, + .match_stateid = nfs41_match_stateid, + .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, + .alloc_seqid = nfs_alloc_no_seqid, + .call_sync_ops = &nfs41_call_sync_ops, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, + .mig_recovery_ops = &nfs41_mig_recovery_ops, +}; +#endif + +#if defined(CONFIG_NFS_V4_2) +static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { + .minor_version = 2, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1 + | NFS_CAP_ALLOCATE + | NFS_CAP_DEALLOCATE + | NFS_CAP_SEEK, + .init_client = nfs41_init_client, + .shutdown_client = nfs41_shutdown_client, + .match_stateid = nfs41_match_stateid, + .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, + .call_sync_ops = &nfs41_call_sync_ops, + .alloc_seqid = nfs_alloc_no_seqid, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, +}; +#endif + +const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { + [0] = &nfs_v4_0_minor_ops, +#if defined(CONFIG_NFS_V4_1) + [1] = &nfs_v4_1_minor_ops, +#endif +#if defined(CONFIG_NFS_V4_2) + [2] = &nfs_v4_2_minor_ops, +#endif +}; + +static const struct inode_operations nfs4_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .atomic_open = nfs_atomic_open, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +}; + +static const struct inode_operations nfs4_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +}; + +const struct nfs_rpc_ops nfs_v4_clientops = { + .version = 4, /* protocol version */ + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, + .file_ops = &nfs4_file_operations, + .getroot = nfs4_proc_get_root, + .submount = nfs4_submount, + .try_mount = nfs4_try_mount, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, + .lookup = nfs4_proc_lookup, + .access = nfs4_proc_access, + .readlink = nfs4_proc_readlink, + .create = nfs4_proc_create, + .remove = nfs4_proc_remove, + .unlink_setup = nfs4_proc_unlink_setup, + .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, + .unlink_done = nfs4_proc_unlink_done, + .rename_setup = nfs4_proc_rename_setup, + .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, + .rename_done = nfs4_proc_rename_done, + .link = nfs4_proc_link, + .symlink = nfs4_proc_symlink, + .mkdir = nfs4_proc_mkdir, + .rmdir = nfs4_proc_remove, + .readdir = nfs4_proc_readdir, + .mknod = nfs4_proc_mknod, + .statfs = nfs4_proc_statfs, + .fsinfo = nfs4_proc_fsinfo, + .pathconf = nfs4_proc_pathconf, + .set_capabilities = nfs4_server_capabilities, + .decode_dirent = nfs4_decode_dirent, + .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare, + .read_setup = nfs4_proc_read_setup, + .read_done = nfs4_read_done, + .write_setup = nfs4_proc_write_setup, + .write_done = nfs4_write_done, + .commit_setup = nfs4_proc_commit_setup, + .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare, + .commit_done = nfs4_commit_done, + .lock = nfs4_proc_lock, + .clear_acl_cache = nfs4_zap_acl_attr, + .close_context = nfs4_close_context, + .open_context = nfs4_atomic_open, + .have_delegation = nfs4_have_delegation, + .return_delegation = nfs4_inode_return_delegation, + .alloc_client = nfs4_alloc_client, + .init_client = nfs4_init_client, + .free_client = nfs4_free_client, + .create_server = nfs4_create_server, + .clone_server = nfs_clone_server, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { + .prefix = XATTR_NAME_NFSV4_ACL, + .list = nfs4_xattr_list_nfs4_acl, + .get = nfs4_xattr_get_nfs4_acl, + .set = nfs4_xattr_set_nfs4_acl, +}; + +const struct xattr_handler *nfs4_xattr_handlers[] = { + &nfs4_xattr_nfs4_acl_handler, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + &nfs4_xattr_nfs4_label_handler, +#endif + NULL +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/kernel/fs/nfs/nfs4renewd.c b/kernel/fs/nfs/nfs4renewd.c new file mode 100644 index 000000000..e1ba58c3d --- /dev/null +++ b/kernel/fs/nfs/nfs4renewd.c @@ -0,0 +1,143 @@ +/* + * fs/nfs/nfs4renewd.c + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Implementation of the NFSv4 "renew daemon", which wakes up periodically to + * send a RENEW, to keep state alive on the server. The daemon is implemented + * as an rpc_task, not a real kernel thread, so it always runs in rpciod's + * context. There is one renewd per nfs_server. + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include "nfs4_fs.h" +#include "delegation.h" + +#define NFSDBG_FACILITY NFSDBG_STATE + +void +nfs4_renew_state(struct work_struct *work) +{ + const struct nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + unsigned renew_flags = 0; + + ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + + if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state)) + goto out; + + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; + last = clp->cl_last_renewal; + now = jiffies; + /* Are we close to a lease timeout? */ + if (time_after(now, last + lease/3)) + renew_flags |= NFS4_RENEW_TIMEOUT; + if (nfs_delegations_present(clp)) + renew_flags |= NFS4_RENEW_DELEGATION_CB; + + if (renew_flags != 0) { + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) { + if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + goto out; + } + nfs_expire_all_delegations(clp); + } else { + int ret; + + /* Queue an asynchronous RENEW. */ + ret = ops->sched_state_renewal(clp, cred, renew_flags); + put_rpccred(cred); + switch (ret) { + default: + goto out_exp; + case -EAGAIN: + case -ENOMEM: + break; + } + } + } else { + dprintk("%s: failed to call renewd. Reason: lease not expired \n", + __func__); + spin_unlock(&clp->cl_lock); + } + nfs4_schedule_state_renewal(clp); +out_exp: + nfs_expire_unreferenced_delegations(clp); +out: + dprintk("%s: done\n", __func__); +} + +void +nfs4_schedule_state_renewal(struct nfs_client *clp) +{ + long timeout; + + spin_lock(&clp->cl_lock); + timeout = (2 * clp->cl_lease_time) / 3 + (long)clp->cl_last_renewal + - (long)jiffies; + if (timeout < 5 * HZ) + timeout = 5 * HZ; + dprintk("%s: requeueing work. Lease period = %ld\n", + __func__, (timeout + HZ - 1) / HZ); + mod_delayed_work(system_wq, &clp->cl_renewd, timeout); + set_bit(NFS_CS_RENEWD, &clp->cl_res_state); + spin_unlock(&clp->cl_lock); +} + +void +nfs4_kill_renewd(struct nfs_client *clp) +{ + cancel_delayed_work_sync(&clp->cl_renewd); +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/kernel/fs/nfs/nfs4session.c b/kernel/fs/nfs/nfs4session.c new file mode 100644 index 000000000..e23366eff --- /dev/null +++ b/kernel/fs/nfs/nfs4session.c @@ -0,0 +1,565 @@ +/* + * fs/nfs/nfs4session.c + * + * Copyright (c) 2012 Trond Myklebust + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define NFSDBG_FACILITY NFSDBG_STATE + +static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue) +{ + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); + init_completion(&tbl->complete); +} + +/* + * nfs4_shrink_slot_table - free retired slots from the slot table + */ +static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) +{ + struct nfs4_slot **p; + if (newsize >= tbl->max_slots) + return; + + p = &tbl->slots; + while (newsize--) + p = &(*p)->next; + while (*p) { + struct nfs4_slot *slot = *p; + + *p = slot->next; + kfree(slot); + tbl->max_slots--; + } +} + +/** + * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete + * @tbl - controlling slot table + * + */ +void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) +{ + if (nfs4_slot_tbl_draining(tbl)) + complete(&tbl->complete); +} + +/* + * nfs4_free_slot - free a slot and efficiently update slot table. + * + * freeing a slot is trivially done by clearing its respective bit + * in the bitmap. + * If the freed slotid equals highest_used_slotid we want to update it + * so that the server would be able to size down the slot table if needed, + * otherwise we know that the highest_used_slotid is still in use. + * When updating highest_used_slotid there may be "holes" in the bitmap + * so we need to scan down from highest_used_slotid to 0 looking for the now + * highest slotid in use. + * If none found, highest_used_slotid is set to NFS4_NO_SLOT. + * + * Must be called while holding tbl->slot_tbl_lock + */ +void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) +{ + u32 slotid = slot->slot_nr; + + /* clear used bit in bitmap */ + __clear_bit(slotid, tbl->used_slots); + + /* update highest_used_slotid when it is freed */ + if (slotid == tbl->highest_used_slotid) { + u32 new_max = find_last_bit(tbl->used_slots, slotid); + if (new_max < slotid) + tbl->highest_used_slotid = new_max; + else { + tbl->highest_used_slotid = NFS4_NO_SLOT; + nfs4_slot_tbl_drain_complete(tbl); + } + } + dprintk("%s: slotid %u highest_used_slotid %u\n", __func__, + slotid, tbl->highest_used_slotid); +} + +static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot *slot; + + slot = kzalloc(sizeof(*slot), gfp_mask); + if (slot) { + slot->table = tbl; + slot->slot_nr = slotid; + slot->seq_nr = seq_init; + } + return slot; +} + +static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot **p, *slot; + + p = &tbl->slots; + for (;;) { + if (*p == NULL) { + *p = nfs4_new_slot(tbl, tbl->max_slots, + seq_init, gfp_mask); + if (*p == NULL) + break; + tbl->max_slots++; + } + slot = *p; + if (slot->slot_nr == slotid) + return slot; + p = &slot->next; + } + return ERR_PTR(-ENOMEM); +} + +/* + * nfs4_alloc_slot - efficiently look for a free slot + * + * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap. + * If found, we mark the slot as used, update the highest_used_slotid, + * and respectively set up the sequence operation args. + * + * Note: must be called with under the slot_tbl_lock. + */ +struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *ret = ERR_PTR(-EBUSY); + u32 slotid; + + dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + tbl->max_slotid + 1); + slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); + if (slotid > tbl->max_slotid) + goto out; + ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + if (IS_ERR(ret)) + goto out; + __set_bit(slotid, tbl->used_slots); + if (slotid > tbl->highest_used_slotid || + tbl->highest_used_slotid == NFS4_NO_SLOT) + tbl->highest_used_slotid = slotid; + ret->generation = tbl->generation; + +out: + dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT); + return ret; +} + +static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + if (max_reqs <= tbl->max_slots) + return 0; + if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS))) + return 0; + return -ENOMEM; +} + +static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, + u32 server_highest_slotid, + u32 ivalue) +{ + struct nfs4_slot **p; + + nfs4_shrink_slot_table(tbl, server_highest_slotid + 1); + p = &tbl->slots; + while (*p) { + (*p)->seq_nr = ivalue; + (*p)->interrupted = 0; + p = &(*p)->next; + } + tbl->highest_used_slotid = NFS4_NO_SLOT; + tbl->target_highest_slotid = server_highest_slotid; + tbl->server_highest_slotid = server_highest_slotid; + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; + tbl->max_slotid = server_highest_slotid; +} + +/* + * (re)Initialise a slot table + */ +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + int ret; + + dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__, + max_reqs, tbl->max_slots); + + if (max_reqs > NFS4_MAX_SLOT_TABLE) + max_reqs = NFS4_MAX_SLOT_TABLE; + + ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); + if (ret) + goto out; + + spin_lock(&tbl->slot_tbl_lock); + nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); + spin_unlock(&tbl->slot_tbl_lock); + + dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* + * nfs4_release_slot_table - release all slot table entries + */ +static void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +{ + nfs4_shrink_slot_table(tbl, 0); +} + +/** + * nfs4_shutdown_slot_table - release resources attached to a slot table + * @tbl: slot table to shut down + * + */ +void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl) +{ + nfs4_release_slot_table(tbl); + rpc_destroy_wait_queue(&tbl->slot_tbl_waitq); +} + +/** + * nfs4_setup_slot_table - prepare a stand-alone slot table for use + * @tbl: slot table to set up + * @max_reqs: maximum number of requests allowed + * @queue: name to give RPC wait queue + * + * Returns zero on success, or a negative errno. + */ +int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs, + const char *queue) +{ + nfs4_init_slot_table(tbl, queue); + return nfs4_realloc_slot_table(tbl, max_reqs, 0); +} + +static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) +{ + struct nfs4_sequence_args *args = task->tk_msg.rpc_argp; + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + struct nfs4_slot *slot = pslot; + struct nfs4_slot_table *tbl = slot->table; + + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + return false; + slot->generation = tbl->generation; + args->sa_slot = slot; + res->sr_timestamp = jiffies; + res->sr_slot = slot; + res->sr_status_flags = 0; + res->sr_status = 1; + return true; +} + +static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot)) + return true; + return false; +} + +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (slot->slot_nr > tbl->max_slotid) + return false; + return __nfs41_wake_and_assign_slot(tbl, slot); +} + +static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *slot = nfs4_alloc_slot(tbl); + if (!IS_ERR(slot)) { + bool ret = __nfs41_wake_and_assign_slot(tbl, slot); + if (ret) + return ret; + nfs4_free_slot(tbl, slot); + } + return false; +} + +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl) +{ + for (;;) { + if (!nfs41_try_wake_next_slot_table_entry(tbl)) + break; + } +} + +#if defined(CONFIG_NFS_V4_1) + +static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + u32 max_slotid; + + max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid); + if (max_slotid > tbl->server_highest_slotid) + max_slotid = tbl->server_highest_slotid; + if (max_slotid > tbl->target_highest_slotid) + max_slotid = tbl->target_highest_slotid; + tbl->max_slotid = max_slotid; + nfs41_wake_slot_table(tbl); +} + +/* Update the client's idea of target_highest_slotid */ +static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + if (tbl->target_highest_slotid == target_highest_slotid) + return; + tbl->target_highest_slotid = target_highest_slotid; + tbl->generation++; +} + +void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + spin_lock(&tbl->slot_tbl_lock); + nfs41_set_target_slotid_locked(tbl, target_highest_slotid); + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; + nfs41_set_max_slotid_locked(tbl, target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, + u32 highest_slotid) +{ + if (tbl->server_highest_slotid == highest_slotid) + return; + if (tbl->highest_used_slotid > highest_slotid) + return; + /* Deallocate slots */ + nfs4_shrink_slot_table(tbl, highest_slotid + 1); + tbl->server_highest_slotid = highest_slotid; +} + +static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2) +{ + s1 -= s2; + if (s1 == 0) + return 0; + if (s1 < 0) + return (s1 - 1) >> 1; + return (s1 + 1) >> 1; +} + +static int nfs41_sign_s32(s32 s1) +{ + if (s1 > 0) + return 1; + if (s1 < 0) + return -1; + return 0; +} + +static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2) +{ + if (!s1 || !s2) + return true; + return nfs41_sign_s32(s1) == nfs41_sign_s32(s2); +} + +/* Try to eliminate outliers by checking for sharp changes in the + * derivatives and second derivatives + */ +static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl, + u32 new_target) +{ + s32 d_target, d2_target; + bool ret = true; + + d_target = nfs41_derivative_target_slotid(new_target, + tbl->target_highest_slotid); + d2_target = nfs41_derivative_target_slotid(d_target, + tbl->d_target_highest_slotid); + /* Is first derivative same sign? */ + if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid)) + ret = false; + /* Is second derivative same sign? */ + if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid)) + ret = false; + tbl->d_target_highest_slotid = d_target; + tbl->d2_target_highest_slotid = d2_target; + return ret; +} + +void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res) +{ + spin_lock(&tbl->slot_tbl_lock); + if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid)) + nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); + if (tbl->generation == slot->generation) + nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); + nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_release_session_slot_tables(struct nfs4_session *session) +{ + nfs4_release_slot_table(&session->fc_slot_table); + nfs4_release_slot_table(&session->bc_slot_table); +} + +/* + * Initialize or reset the forechannel and backchannel tables + */ +int nfs4_setup_session_slot_tables(struct nfs4_session *ses) +{ + struct nfs4_slot_table *tbl; + int status; + + dprintk("--> %s\n", __func__); + /* Fore channel */ + tbl = &ses->fc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */ + return status; + /* Back channel */ + tbl = &ses->bc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + if (status && tbl->slots == NULL) + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_release_session_slot_tables(ses); + return status; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ + struct nfs4_session *session; + + session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + + nfs4_init_slot_table(&session->fc_slot_table, "ForeChannel Slot table"); + nfs4_init_slot_table(&session->bc_slot_table, "BackChannel Slot table"); + session->session_state = 1<clp = clp; + return session; +} + +static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +{ + nfs4_shutdown_slot_table(&session->fc_slot_table); + nfs4_shutdown_slot_table(&session->bc_slot_table); +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ + struct rpc_xprt *xprt; + struct rpc_cred *cred; + + cred = nfs4_get_clid_cred(session->clp); + nfs4_proc_destroy_session(session, cred); + if (cred) + put_rpccred(cred); + + rcu_read_lock(); + xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); + rcu_read_unlock(); + dprintk("%s Destroy backchannel for xprt %p\n", + __func__, xprt); + xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); + nfs4_destroy_session_slot_tables(session); + kfree(session); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. + */ +static int nfs41_check_session_ready(struct nfs_client *clp) +{ + int ret; + + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { + ret = nfs4_client_recover_expired_lease(clp); + if (ret) + return ret; + } + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + smp_rmb(); + return 0; +} + +int nfs4_init_session(struct nfs_client *clp) +{ + if (!nfs4_has_session(clp)) + return 0; + + clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state); + return nfs41_check_session_ready(clp); +} + +int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the + * DS lease to be equal to the MDS lease. + */ + clp->cl_lease_time = lease_time; + clp->cl_last_renewal = jiffies; + } + spin_unlock(&clp->cl_lock); + + ret = nfs41_check_session_ready(clp); + if (ret) + return ret; + /* Test for the DS role */ + if (!is_ds_client(clp)) + return -ENODEV; + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + +#endif /* defined(CONFIG_NFS_V4_1) */ diff --git a/kernel/fs/nfs/nfs4session.h b/kernel/fs/nfs/nfs4session.h new file mode 100644 index 000000000..e3ea2c532 --- /dev/null +++ b/kernel/fs/nfs/nfs4session.h @@ -0,0 +1,160 @@ +/* + * fs/nfs/nfs4session.h + * + * Copyright (c) 2012 Trond Myklebust + * + */ +#ifndef __LINUX_FS_NFS_NFS4SESSION_H +#define __LINUX_FS_NFS_NFS4SESSION_H + +/* maximum number of slots to use */ +#define NFS4_DEF_SLOT_TABLE_SIZE (64U) +#define NFS4_MAX_SLOT_TABLE (1024U) +#define NFS4_NO_SLOT ((u32)-1) + +#if IS_ENABLED(CONFIG_NFS_V4) + +/* Sessions slot seqid */ +struct nfs4_slot { + struct nfs4_slot_table *table; + struct nfs4_slot *next; + unsigned long generation; + u32 slot_nr; + u32 seq_nr; + unsigned int interrupted : 1; +}; + +/* Sessions */ +enum nfs4_slot_tbl_state { + NFS4_SLOT_TBL_DRAINING, +}; + +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +struct nfs4_slot_table { + struct nfs4_session *session; /* Parent session */ + struct nfs4_slot *slots; /* seqid per slot */ + unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ + spinlock_t slot_tbl_lock; + struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + u32 max_slots; /* # slots in table */ + u32 max_slotid; /* Max allowed slotid value */ + u32 highest_used_slotid; /* sent to server on each SEQ. + * op for dynamic resizing */ + u32 target_highest_slotid; /* Server max_slot target */ + u32 server_highest_slotid; /* Server highest slotid */ + s32 d_target_highest_slotid; /* Derivative */ + s32 d2_target_highest_slotid; /* 2nd derivative */ + unsigned long generation; /* Generation counter for + target_highest_slotid */ + struct completion complete; + unsigned long slot_tbl_state; +}; + +/* + * Session related parameters + */ +struct nfs4_session { + struct nfs4_sessionid sess_id; + u32 flags; + unsigned long session_state; + u32 hash_alg; + u32 ssv_len; + + /* The fore and back channel */ + struct nfs4_channel_attrs fc_attrs; + struct nfs4_slot_table fc_slot_table; + struct nfs4_channel_attrs bc_attrs; + struct nfs4_slot_table bc_slot_table; + struct nfs_client *clp; +}; + +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_ESTABLISHED, +}; + +extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, + unsigned int max_reqs, const char *queue); +extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); +extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); +extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); +extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot); +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl); + +static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl) +{ + return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); +} + +#if defined(CONFIG_NFS_V4_1) +extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid); +extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res); + +extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses); + +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern void nfs4_destroy_session(struct nfs4_session *session); +extern int nfs4_init_session(struct nfs_client *clp); +extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + if (clp->cl_session) + return 1; + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); + return 0; +} + +static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, + const struct nfs4_sessionid *src) +{ + memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); +} + +#ifdef CONFIG_CRC32 +/* + * nfs_session_id_hash - calculate the crc32 hash for the session id + * @session - pointer to session + */ +#define nfs_session_id_hash(sess_id) \ + (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) +#else +#define nfs_session_id_hash(session) (0) +#endif +#else /* defined(CONFIG_NFS_V4_1) */ + +static inline int nfs4_init_session(struct nfs_client *clp) +{ + return 0; +} + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + return 0; +} + +#endif /* defined(CONFIG_NFS_V4_1) */ +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ +#endif /* __LINUX_FS_NFS_NFS4SESSION_H */ diff --git a/kernel/fs/nfs/nfs4state.c b/kernel/fs/nfs/nfs4state.c new file mode 100644 index 000000000..2782cfca2 --- /dev/null +++ b/kernel/fs/nfs/nfs4state.c @@ -0,0 +1,2455 @@ +/* + * fs/nfs/nfs4state.c + * + * Client-side XDR for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Implementation of the NFSv4 state model. For the time being, + * this is minimal, but will be made much more complex in a + * subsequent patch. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "internal.h" +#include "nfs4idmap.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY NFSDBG_STATE + +#define OPENOWNER_POOL_SIZE 8 + +const nfs4_stateid zero_stateid; +static DEFINE_MUTEX(nfs_clid_init_mutex); + +int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; + unsigned short port; + int status; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; + port = nn->nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nn->nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: + status = nfs4_proc_setclientid_confirm(clp, &clid, cred); + if (status != 0) + goto out; + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_renewal(clp); +out: + return status; +} + +/** + * nfs40_discover_server_trunking - Detect server IP address trunking (mv0) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs40_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + unsigned short port; + int status; + + port = nn->nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nn->nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + + status = nfs40_walk_client_list(clp, result, cred); + if (status == 0) { + /* Sustain the lease, even if it's empty. If the clientid4 + * goes stale it's of no use for trunking discovery. */ + nfs4_schedule_state_renewal(*result); + } +out: + return status; +} + +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + + if (clp->cl_machine_cred != NULL) + cred = get_rpccred(clp->cl_machine_cred); + return cred; +} + +static void nfs4_root_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred, *new; + + new = rpc_lookup_machine_cred(NULL); + spin_lock(&clp->cl_lock); + cred = clp->cl_machine_cred; + clp->cl_machine_cred = new; + spin_unlock(&clp->cl_lock); + if (cred != NULL) + put_rpccred(cred); +} + +static struct rpc_cred * +nfs4_get_renew_cred_server_locked(struct nfs_server *server) +{ + struct rpc_cred *cred = NULL; + struct nfs4_state_owner *sp; + struct rb_node *pos; + + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + if (list_empty(&sp->so_states)) + continue; + cred = get_rpccred(sp->so_cred); + break; + } + return cred; +} + +/** + * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + * Caller must hold clp->cl_lock. + */ +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + struct nfs_server *server; + + /* Use machine credentials if available */ + cred = nfs4_get_machine_cred_locked(clp); + if (cred != NULL) + goto out; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_renew_cred_server_locked(server); + if (cred != NULL) + break; + } + rcu_read_unlock(); + +out: + return cred; +} + +static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl) +{ + if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { + spin_lock(&tbl->slot_tbl_lock); + nfs41_wake_slot_table(tbl); + spin_unlock(&tbl->slot_tbl_lock); + } +} + +static void nfs4_end_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + + if (clp->cl_slot_tbl) { + nfs4_end_drain_slot_table(clp->cl_slot_tbl); + return; + } + + if (ses != NULL) { + nfs4_end_drain_slot_table(&ses->bc_slot_table); + nfs4_end_drain_slot_table(&ses->fc_slot_table); + } +} + +static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) +{ + set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); + spin_lock(&tbl->slot_tbl_lock); + if (tbl->highest_used_slotid != NFS4_NO_SLOT) { + reinit_completion(&tbl->complete); + spin_unlock(&tbl->slot_tbl_lock); + return wait_for_completion_interruptible(&tbl->complete); + } + spin_unlock(&tbl->slot_tbl_lock); + return 0; +} + +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int ret = 0; + + if (clp->cl_slot_tbl) + return nfs4_drain_slot_tbl(clp->cl_slot_tbl); + + /* back channel */ + ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); + if (ret) + return ret; + /* fore channel */ + return nfs4_drain_slot_tbl(&ses->fc_slot_table); +} + +#if defined(CONFIG_NFS_V4_1) + +static int nfs41_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = jiffies; + spin_unlock(&clp->cl_lock); + + nfs4_schedule_state_renewal(clp); + } + + return status; +} + +static void nfs41_finish_session_reset(struct nfs_client *clp) +{ + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* create_session negotiated new slot table */ + clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + nfs41_setup_state_renewal(clp); +} + +int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; + nfs4_begin_drain_session(clp); + status = nfs4_proc_exchange_id(clp, cred); + if (status != 0) + goto out; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: + status = nfs4_proc_create_session(clp, cred); + if (status != 0) + goto out; + nfs41_finish_session_reset(clp); + nfs_mark_client_ready(clp, NFS_CS_READY); +out: + return status; +} + +/** + * nfs41_discover_server_trunking - Detect server IP address trunking (mv1) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status. + * If NFS4_OK is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs41_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred) +{ + int status; + + status = nfs4_proc_exchange_id(clp, cred); + if (status != NFS4_OK) + return status; + + status = nfs41_walk_client_list(clp, result, cred); + if (status < 0) + return status; + if (clp != *result) + return 0; + + /* Purge state if the client id was established in a prior instance */ + if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R) + set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + else + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_manager(clp); + status = nfs_wait_client_init_complete(clp); + if (status < 0) + nfs_put_client(clp); + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_get_clid_cred - Acquire credential for a setclientid operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + */ +struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); + return cred; +} + +static struct nfs4_state_owner * +nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) +{ + struct rb_node **p = &server->state_owners.rb_node, + *parent = NULL; + struct nfs4_state_owner *sp; + + while (*p != NULL) { + parent = *p; + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); + + if (cred < sp->so_cred) + p = &parent->rb_left; + else if (cred > sp->so_cred) + p = &parent->rb_right; + else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); + atomic_inc(&sp->so_count); + return sp; + } + } + return NULL; +} + +static struct nfs4_state_owner * +nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) +{ + struct nfs_server *server = new->so_server; + struct rb_node **p = &server->state_owners.rb_node, + *parent = NULL; + struct nfs4_state_owner *sp; + int err; + + while (*p != NULL) { + parent = *p; + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); + + if (new->so_cred < sp->so_cred) + p = &parent->rb_left; + else if (new->so_cred > sp->so_cred) + p = &parent->rb_right; + else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); + atomic_inc(&sp->so_count); + return sp; + } + } + err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); + if (err) + return ERR_PTR(err); + rb_link_node(&new->so_server_node, parent, p); + rb_insert_color(&new->so_server_node, &server->state_owners); + return new; +} + +static void +nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) +{ + struct nfs_server *server = sp->so_server; + + if (!RB_EMPTY_NODE(&sp->so_server_node)) + rb_erase(&sp->so_server_node, &server->state_owners); + ida_remove(&server->openowner_id, sp->so_seqid.owner_id); +} + +static void +nfs4_init_seqid_counter(struct nfs_seqid_counter *sc) +{ + sc->create_time = ktime_get(); + sc->flags = 0; + sc->counter = 0; + spin_lock_init(&sc->lock); + INIT_LIST_HEAD(&sc->list); + rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue"); +} + +static void +nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc) +{ + rpc_destroy_wait_queue(&sc->wait); +} + +/* + * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to + * create a new state_owner. + * + */ +static struct nfs4_state_owner * +nfs4_alloc_state_owner(struct nfs_server *server, + struct rpc_cred *cred, + gfp_t gfp_flags) +{ + struct nfs4_state_owner *sp; + + sp = kzalloc(sizeof(*sp), gfp_flags); + if (!sp) + return NULL; + sp->so_server = server; + sp->so_cred = get_rpccred(cred); + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); + nfs4_init_seqid_counter(&sp->so_seqid); + atomic_set(&sp->so_count, 1); + INIT_LIST_HEAD(&sp->so_lru); + seqcount_init(&sp->so_reclaim_seqcount); + mutex_init(&sp->so_delegreturn_mutex); + return sp; +} + +static void +nfs4_drop_state_owner(struct nfs4_state_owner *sp) +{ + struct rb_node *rb_node = &sp->so_server_node; + + if (!RB_EMPTY_NODE(rb_node)) { + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + if (!RB_EMPTY_NODE(rb_node)) { + rb_erase(rb_node, &server->state_owners); + RB_CLEAR_NODE(rb_node); + } + spin_unlock(&clp->cl_lock); + } +} + +static void nfs4_free_state_owner(struct nfs4_state_owner *sp) +{ + nfs4_destroy_seqid_counter(&sp->so_seqid); + put_rpccred(sp->so_cred); + kfree(sp); +} + +static void nfs4_gc_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + unsigned long time_min, time_max; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + time_max = jiffies; + time_min = (long)time_max - (long)clp->cl_lease_time; + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + /* NB: LRU is sorted so that oldest is at the head */ + if (time_in_range(sp->so_expires, time_min, time_max)) + break; + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } + spin_unlock(&clp->cl_lock); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } +} + +/** + * nfs4_get_state_owner - Look up a state owner given a credential + * @server: nfs_server to search + * @cred: RPC credential to match + * + * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. + */ +struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, + struct rpc_cred *cred, + gfp_t gfp_flags) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *new; + + spin_lock(&clp->cl_lock); + sp = nfs4_find_state_owner_locked(server, cred); + spin_unlock(&clp->cl_lock); + if (sp != NULL) + goto out; + new = nfs4_alloc_state_owner(server, cred, gfp_flags); + if (new == NULL) + goto out; + do { + if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) + break; + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); + } while (sp == ERR_PTR(-EAGAIN)); + if (sp != new) + nfs4_free_state_owner(new); +out: + nfs4_gc_state_owners(server); + return sp; +} + +/** + * nfs4_put_state_owner - Release a nfs4_state_owner + * @sp: state owner data to release + * + * Note that we keep released state owners on an LRU + * list. + * This caches valid state owners so that they can be + * reused, to avoid the OPEN_CONFIRM on minor version 0. + * It also pins the uniquifier of dropped state owners for + * a while, to ensure that those state owner names are + * never reused. + */ +void nfs4_put_state_owner(struct nfs4_state_owner *sp) +{ + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) + return; + + sp->so_expires = jiffies; + list_add_tail(&sp->so_lru, &server->state_owners_lru); + spin_unlock(&clp->cl_lock); +} + +/** + * nfs4_purge_state_owners - Release all cached state owners + * @server: nfs_server with cached state owners to release + * + * Called at umount time. Remaining state owners will be on + * the LRU with ref count of zero. + */ +void nfs4_purge_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } + spin_unlock(&clp->cl_lock); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } +} + +static struct nfs4_state * +nfs4_alloc_open_state(void) +{ + struct nfs4_state *state; + + state = kzalloc(sizeof(*state), GFP_NOFS); + if (!state) + return NULL; + atomic_set(&state->count, 1); + INIT_LIST_HEAD(&state->lock_states); + spin_lock_init(&state->state_lock); + seqlock_init(&state->seqlock); + return state; +} + +void +nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode) +{ + if (state->state == fmode) + return; + /* NB! List reordering - see the reclaim code for why. */ + if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { + if (fmode & FMODE_WRITE) + list_move(&state->open_states, &state->owner->so_states); + else + list_move_tail(&state->open_states, &state->owner->so_states); + } + state->state = fmode; +} + +static struct nfs4_state * +__nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs4_state *state; + + list_for_each_entry(state, &nfsi->open_states, inode_states) { + if (state->owner != owner) + continue; + if (!nfs4_valid_open_stateid(state)) + continue; + if (atomic_inc_not_zero(&state->count)) + return state; + } + return NULL; +} + +static void +nfs4_free_open_state(struct nfs4_state *state) +{ + kfree(state); +} + +struct nfs4_state * +nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) +{ + struct nfs4_state *state, *new; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + state = __nfs4_find_state_byowner(inode, owner); + spin_unlock(&inode->i_lock); + if (state) + goto out; + new = nfs4_alloc_open_state(); + spin_lock(&owner->so_lock); + spin_lock(&inode->i_lock); + state = __nfs4_find_state_byowner(inode, owner); + if (state == NULL && new != NULL) { + state = new; + state->owner = owner; + atomic_inc(&owner->so_count); + list_add(&state->inode_states, &nfsi->open_states); + ihold(inode); + state->inode = inode; + spin_unlock(&inode->i_lock); + /* Note: The reclaim code dictates that we add stateless + * and read-only stateids to the end of the list */ + list_add_tail(&state->open_states, &owner->so_states); + spin_unlock(&owner->so_lock); + } else { + spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); + if (new) + nfs4_free_open_state(new); + } +out: + return state; +} + +void nfs4_put_open_state(struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct nfs4_state_owner *owner = state->owner; + + if (!atomic_dec_and_lock(&state->count, &owner->so_lock)) + return; + spin_lock(&inode->i_lock); + list_del(&state->inode_states); + list_del(&state->open_states); + spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); + iput(inode); + nfs4_free_open_state(state); + nfs4_put_state_owner(owner); +} + +/* + * Close the current file. + */ +static void __nfs4_close(struct nfs4_state *state, + fmode_t fmode, gfp_t gfp_mask, int wait) +{ + struct nfs4_state_owner *owner = state->owner; + int call_close = 0; + fmode_t newstate; + + atomic_inc(&owner->so_count); + /* Protect against nfs4_find_state() */ + spin_lock(&owner->so_lock); + switch (fmode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + state->n_rdonly--; + break; + case FMODE_WRITE: + state->n_wronly--; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr--; + } + newstate = FMODE_READ|FMODE_WRITE; + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) { + newstate &= ~FMODE_READ; + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (state->n_wronly == 0) { + newstate &= ~FMODE_WRITE; + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (newstate == 0) + clear_bit(NFS_DELEGATED_STATE, &state->flags); + } + nfs4_state_set_mode_locked(state, newstate); + spin_unlock(&owner->so_lock); + + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); + } else + nfs4_do_close(state, gfp_mask, wait); +} + +void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) +{ + __nfs4_close(state, fmode, GFP_NOFS, 0); +} + +void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) +{ + __nfs4_close(state, fmode, GFP_KERNEL, 1); +} + +/* + * Search the state->lock_states for an existing lock_owner + * that is compatible with current->files + */ +static struct nfs4_lock_state * +__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +{ + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { + if (pos->ls_owner != fl_owner) + continue; + atomic_inc(&pos->ls_count); + return pos; + } + return NULL; +} + +/* + * Return a compatible lock_state. If no initialized lock_state structure + * exists, return an uninitialized one. + * + */ +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +{ + struct nfs4_lock_state *lsp; + struct nfs_server *server = state->owner->so_server; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) + return NULL; + nfs4_init_seqid_counter(&lsp->ls_seqid); + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; + lsp->ls_owner = fl_owner; + lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); + if (lsp->ls_seqid.owner_id < 0) + goto out_free; + INIT_LIST_HEAD(&lsp->ls_locks); + return lsp; +out_free: + kfree(lsp); + return NULL; +} + +void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ + ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id); + nfs4_destroy_seqid_counter(&lsp->ls_seqid); + kfree(lsp); +} + +/* + * Return a compatible lock_state. If no initialized lock_state structure + * exists, return an uninitialized one. + * + */ +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) +{ + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, owner); + if (lsp != NULL) + break; + if (new != NULL) { + list_add(&new->ls_locks, &state->lock_states); + set_bit(LK_STATE_IN_USE, &state->flags); + lsp = new; + new = NULL; + break; + } + spin_unlock(&state->state_lock); + new = nfs4_alloc_lock_state(state, owner); + if (new == NULL) + return NULL; + } + spin_unlock(&state->state_lock); + if (new != NULL) + nfs4_free_lock_state(state->owner->so_server, new); + return lsp; +} + +/* + * Release reference to lock_state, and free it if we see that + * it is no longer in use + */ +void nfs4_put_lock_state(struct nfs4_lock_state *lsp) +{ + struct nfs_server *server; + struct nfs4_state *state; + + if (lsp == NULL) + return; + state = lsp->ls_state; + if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock)) + return; + list_del(&lsp->ls_locks); + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); + server = state->owner->so_server; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct nfs_client *clp = server->nfs_client; + + clp->cl_mvops->free_lock_state(server, lsp); + } else + nfs4_free_lock_state(server, lsp); +} + +static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner; + + dst->fl_u.nfs4_fl.owner = lsp; + atomic_inc(&lsp->ls_count); +} + +static void nfs4_fl_release_lock(struct file_lock *fl) +{ + nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); +} + +static const struct file_lock_operations nfs4_fl_lock_ops = { + .fl_copy_lock = nfs4_fl_copy_lock, + .fl_release_private = nfs4_fl_release_lock, +}; + +int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) +{ + struct nfs4_lock_state *lsp; + + if (fl->fl_ops != NULL) + return 0; + lsp = nfs4_get_lock_state(state, fl->fl_owner); + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; + fl->fl_ops = &nfs4_fl_lock_ops; + return 0; +} + +static int nfs4_copy_lock_stateid(nfs4_stateid *dst, + struct nfs4_state *state, + const struct nfs_lockowner *lockowner) +{ + struct nfs4_lock_state *lsp; + fl_owner_t fl_owner; + int ret = -ENOENT; + + + if (lockowner == NULL) + goto out; + + if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) + goto out; + + fl_owner = lockowner->l_owner; + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, fl_owner); + if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) + ret = -EIO; + else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { + nfs4_stateid_copy(dst, &lsp->ls_stateid); + ret = 0; + } + spin_unlock(&state->state_lock); + nfs4_put_lock_state(lsp); +out: + return ret; +} + +static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +{ + const nfs4_stateid *src; + int seq; + + do { + src = &zero_stateid; + seq = read_seqbegin(&state->seqlock); + if (test_bit(NFS_OPEN_STATE, &state->flags)) + src = &state->open_stateid; + nfs4_stateid_copy(dst, src); + } while (read_seqretry(&state->seqlock, seq)); +} + +/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, + fmode_t fmode, const struct nfs_lockowner *lockowner) +{ + int ret = nfs4_copy_lock_stateid(dst, state, lockowner); + if (ret == -EIO) + /* A lost lock - don't even consider delegations */ + goto out; + /* returns true if delegation stateid found and copied */ + if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { + ret = 0; + goto out; + } + if (ret != -ENOENT) + /* nfs4_copy_delegation_stateid() didn't over-write + * dst, so it still has the lock stateid which we now + * choose to use. + */ + goto out; + nfs4_copy_open_stateid(dst, state); + ret = 0; +out: + if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) + dst->seqid = 0; + return ret; +} + +struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) +{ + struct nfs_seqid *new; + + new = kmalloc(sizeof(*new), gfp_mask); + if (new == NULL) + return ERR_PTR(-ENOMEM); + new->sequence = counter; + INIT_LIST_HEAD(&new->list); + new->task = NULL; + return new; +} + +void nfs_release_seqid(struct nfs_seqid *seqid) +{ + struct nfs_seqid_counter *sequence; + + if (seqid == NULL || list_empty(&seqid->list)) + return; + sequence = seqid->sequence; + spin_lock(&sequence->lock); + list_del_init(&seqid->list); + if (!list_empty(&sequence->list)) { + struct nfs_seqid *next; + + next = list_first_entry(&sequence->list, + struct nfs_seqid, list); + rpc_wake_up_queued_task(&sequence->wait, next->task); + } + spin_unlock(&sequence->lock); +} + +void nfs_free_seqid(struct nfs_seqid *seqid) +{ + nfs_release_seqid(seqid); + kfree(seqid); +} + +/* + * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or + * failed with a seqid incrementing error - + * see comments nfs4.h:seqid_mutating_error() + */ +static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) +{ + switch (status) { + case 0: + break; + case -NFS4ERR_BAD_SEQID: + if (seqid->sequence->flags & NFS_SEQID_CONFIRMED) + return; + pr_warn_ratelimited("NFS: v4 server returned a bad" + " sequence-id error on an" + " unconfirmed sequence %p!\n", + seqid->sequence); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_BADXDR: + case -NFS4ERR_RESOURCE: + case -NFS4ERR_NOFILEHANDLE: + /* Non-seqid mutating errors */ + return; + }; + /* + * Note: no locking needed as we are guaranteed to be first + * on the sequence list + */ + seqid->sequence->counter++; +} + +void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) +{ + struct nfs4_state_owner *sp; + + if (seqid == NULL) + return; + + sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid); + if (status == -NFS4ERR_BAD_SEQID) + nfs4_drop_state_owner(sp); + if (!nfs4_has_session(sp->so_server->nfs_client)) + nfs_increment_seqid(status, seqid); +} + +/* + * Increment the seqid if the LOCK/LOCKU succeeded, or + * failed with a seqid incrementing error - + * see comments nfs4.h:seqid_mutating_error() + */ +void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) +{ + if (seqid != NULL) + nfs_increment_seqid(status, seqid); +} + +int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) +{ + struct nfs_seqid_counter *sequence; + int status = 0; + + if (seqid == NULL) + goto out; + sequence = seqid->sequence; + spin_lock(&sequence->lock); + seqid->task = task; + if (list_empty(&seqid->list)) + list_add_tail(&seqid->list, &sequence->list); + if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) + goto unlock; + rpc_sleep_on(&sequence->wait, task, NULL); + status = -EAGAIN; +unlock: + spin_unlock(&sequence->lock); +out: + return status; +} + +static int nfs4_run_state_manager(void *); + +static void nfs4_clear_state_manager_bit(struct nfs_client *clp) +{ + smp_mb__before_atomic(); + clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + smp_mb__after_atomic(); + wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); + rpc_wake_up(&clp->cl_rpcwaitq); +} + +/* + * Schedule the nfs_client asynchronous state management routine + */ +void nfs4_schedule_state_manager(struct nfs_client *clp) +{ + struct task_struct *task; + char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); + atomic_inc(&clp->cl_count); + + /* The rcu_read_lock() is not strictly necessary, as the state + * manager is the only thread that ever changes the rpc_xprt + * after it's initialized. At this point, we're single threaded. */ + rcu_read_lock(); + snprintf(buf, sizeof(buf), "%s-manager", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + task = kthread_run(nfs4_run_state_manager, clp, "%s", buf); + if (IS_ERR(task)) { + printk(KERN_ERR "%s: kthread_run: %ld\n", + __func__, PTR_ERR(task)); + nfs4_clear_state_manager_bit(clp); + nfs_put_client(clp); + module_put(THIS_MODULE); + } +} + +/* + * Schedule a lease recovery attempt + */ +void nfs4_schedule_lease_recovery(struct nfs_client *clp) +{ + if (!clp) + return; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + dprintk("%s: scheduling lease recovery for server %s\n", __func__, + clp->cl_hostname); + nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); + +/** + * nfs4_schedule_migration_recovery - trigger migration recovery + * + * @server: FSID that is migrating + * + * Returns zero if recovery has started, otherwise a negative NFS4ERR + * value is returned. + */ +int nfs4_schedule_migration_recovery(const struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + if (server->fh_expire_type != NFS4_FH_PERSISTENT) { + pr_err("NFS: volatile file handles not supported (server %s)\n", + clp->cl_hostname); + return -NFS4ERR_IO; + } + + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -NFS4ERR_IO; + + dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n", + __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + set_bit(NFS_MIG_IN_TRANSITION, + &((struct nfs_server *)server)->mig_status); + set_bit(NFS4CLNT_MOVED, &clp->cl_state); + + nfs4_schedule_state_manager(clp); + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery); + +/** + * nfs4_schedule_lease_moved_recovery - start lease-moved recovery + * + * @clp: server to check for moved leases + * + */ +void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp) +{ + dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n", + __func__, clp->cl_clientid, clp->cl_hostname); + + set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery); + +int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ + int res; + + might_sleep(); + + atomic_inc(&clp->cl_count); + res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (res) + goto out; + if (clp->cl_cons_state < 0) + res = clp->cl_cons_state; +out: + nfs_put_client(clp); + return res; +} + +int nfs4_client_recover_expired_lease(struct nfs_client *clp) +{ + unsigned int loop; + int ret; + + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = nfs4_wait_clnt_recover(clp); + if (ret != 0) + break; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) + break; + nfs4_schedule_state_manager(clp); + ret = -EIO; + } + return ret; +} + +/* + * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * + * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a + * resend of the SETCLIENTID and hence re-establish the + * callback channel. Then return all existing delegations. + */ +static void nfs40_handle_cb_pathdown(struct nfs_client *clp) +{ + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs_expire_all_delegations(clp); + dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__, + clp->cl_hostname); +} + +void nfs4_schedule_path_down_recovery(struct nfs_client *clp) +{ + nfs40_handle_cb_pathdown(clp); + nfs4_schedule_state_manager(clp); +} + +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +{ + + set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + /* Don't recover state that expired before the reboot */ + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + return 0; + } + set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + return 1; +} + +int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +{ + set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); + return 1; +} + +int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + if (!nfs4_valid_open_stateid(state)) + return -EBADF; + nfs4_state_mark_reclaim_nograce(clp, state); + dprintk("%s: scheduling stateid recovery for server %s\n", __func__, + clp->cl_hostname); + nfs4_schedule_state_manager(clp); + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); + +void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + bool found = false; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (!nfs4_stateid_match(&state->stateid, stateid)) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + found = true; + } + spin_unlock(&inode->i_lock); + if (found) + nfs4_schedule_state_manager(clp); +} + +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + set_bit(NFS_CONTEXT_BAD, &ctx->flags); + } + spin_unlock(&inode->i_lock); +} + +static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) +{ + set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); + nfs4_state_mark_open_context_bad(state); +} + + +static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct file_lock *fl; + int status = 0; + struct file_lock_context *flctx = inode->i_flctx; + struct list_head *list; + + if (flctx == NULL) + return 0; + + list = &flctx->flc_posix; + + /* Guard against delegation returns and new lock/unlock calls */ + down_write(&nfsi->rwsem); + spin_lock(&flctx->flc_lock); +restart: + list_for_each_entry(fl, list, fl_list) { + if (nfs_file_open_context(fl->fl_file)->state != state) + continue; + spin_unlock(&flctx->flc_lock); + status = ops->recover_lock(state, fl); + switch (status) { + case 0: + break; + case -ESTALE: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out; + default: + pr_err("NFS: %s: unhandled error %d\n", + __func__, status); + case -ENOMEM: + case -NFS4ERR_DENIED: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + status = 0; + } + spin_lock(&flctx->flc_lock); + } + if (list == &flctx->flc_posix) { + list = &flctx->flc_flock; + goto restart; + } + spin_unlock(&flctx->flc_lock); +out: + up_write(&nfsi->rwsem); + return status; +} + +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_state *state; + struct nfs4_lock_state *lock; + int status = 0; + + /* Note: we rely on the sp->so_states list being ordered + * so that we always reclaim open(O_RDWR) and/or open(O_WRITE) + * states first. + * This is needed to ensure that the server won't give us any + * read delegations that we have to return if, say, we are + * recovering after a network partition or a reboot from a + * server that doesn't support a grace period. + */ + spin_lock(&sp->so_lock); + raw_write_seqcount_begin(&sp->so_reclaim_seqcount); +restart: + list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) + continue; + if (!nfs4_valid_open_stateid(state)) + continue; + if (state->state == 0) + continue; + atomic_inc(&state->count); + spin_unlock(&sp->so_lock); + status = ops->recover_open(sp, state); + if (status >= 0) { + status = nfs4_reclaim_locks(state, ops); + if (status >= 0) { + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + pr_warn_ratelimited("NFS: " + "%s: Lock reclaim " + "failed!\n", __func__); + } + spin_unlock(&state->state_lock); + } + nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + goto restart; + } + } + switch (status) { + default: + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); + case -ENOENT: + case -ENOMEM: + case -ESTALE: + /* Open state on this file cannot be recovered */ + nfs4_state_mark_recovery_failed(state, status); + break; + case -EAGAIN: + ssleep(1); + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out_err; + } + nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + goto restart; + } + raw_write_seqcount_end(&sp->so_reclaim_seqcount); + spin_unlock(&sp->so_lock); + return 0; +out_err: + nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + raw_write_seqcount_end(&sp->so_reclaim_seqcount); + spin_unlock(&sp->so_lock); + return status; +} + +static void nfs4_clear_open_state(struct nfs4_state *state) +{ + struct nfs4_lock_state *lock; + + clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + lock->ls_seqid.flags = 0; + clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags); + } + spin_unlock(&state->state_lock); +} + +static void nfs4_reset_seqids(struct nfs_server *server, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp; + struct rb_node *pos; + struct nfs4_state *state; + + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + sp->so_seqid.flags = 0; + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (mark_reclaim(clp, state)) + nfs4_clear_open_state(state); + } + spin_unlock(&sp->so_lock); + } + spin_unlock(&clp->cl_lock); +} + +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_reset_seqids(server, mark_reclaim); + rcu_read_unlock(); +} + +static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) +{ + /* Mark all delegations for reclaim */ + nfs_delegation_mark_reclaim(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); +} + +static void nfs4_reclaim_complete(struct nfs_client *clp, + const struct nfs4_state_recovery_ops *ops, + struct rpc_cred *cred) +{ + /* Notify the server we're done reclaiming our state */ + if (ops->reclaim_complete) + (void)ops->reclaim_complete(clp, cred); +} + +static void nfs4_clear_reclaim_server(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp; + struct rb_node *pos; + struct nfs4_state *state; + + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, + &state->flags)) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + } + spin_unlock(&sp->so_lock); + } + spin_unlock(&clp->cl_lock); +} + +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +{ + struct nfs_server *server; + + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_clear_reclaim_server(server); + rcu_read_unlock(); + + nfs_delegation_reap_unclaimed(clp); + return 1; +} + +static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +{ + const struct nfs4_state_recovery_ops *ops; + struct rpc_cred *cred; + + if (!nfs4_state_clear_reclaim_reboot(clp)) + return; + ops = clp->cl_mvops->reboot_recovery_ops; + cred = nfs4_get_clid_cred(clp); + nfs4_reclaim_complete(clp, ops, cred); + put_rpccred(cred); +} + +static void nfs_delegation_clear_all(struct nfs_client *clp) +{ + nfs_delegation_mark_reclaim(clp); + nfs_delegation_reap_unclaimed(clp); +} + +static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) +{ + nfs_delegation_clear_all(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); +} + +static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) +{ + switch (error) { + case 0: + break; + case -NFS4ERR_CB_PATH_DOWN: + nfs40_handle_cb_pathdown(clp); + break; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + break; + case -NFS4ERR_STALE_CLIENTID: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_clear_reclaim_reboot(clp); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; + default: + dprintk("%s: failed to handle error %d for server %s\n", + __func__, error, clp->cl_hostname); + return error; + } + dprintk("%s: handled error %d for server %s\n", __func__, error, + clp->cl_hostname); + return 0; +} + +static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_state_owner *sp; + struct nfs_server *server; + struct rb_node *pos; + int status = 0; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + nfs4_purge_state_owners(server); + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, + struct nfs4_state_owner, so_server_node); + if (!test_and_clear_bit(ops->owner_flag_bit, + &sp->so_flags)) + continue; + atomic_inc(&sp->so_count); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + + status = nfs4_reclaim_open_state(sp, ops); + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); + status = nfs4_recovery_handle_error(clp, status); + return (status != 0) ? status : -EAGAIN; + } + + nfs4_put_state_owner(sp); + goto restart; + } + spin_unlock(&clp->cl_lock); + } + rcu_read_unlock(); + return 0; +} + +static int nfs4_check_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + int status; + + /* Is the client already known to have an expired lease? */ + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + return 0; + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) { + cred = nfs4_get_clid_cred(clp); + status = -ENOKEY; + if (cred == NULL) + goto out; + } + status = ops->renew_lease(clp, cred); + put_rpccred(cred); + if (status == -ETIMEDOUT) { + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + return 0; + } +out: + return nfs4_recovery_handle_error(clp, status); +} + +/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors + * and for recoverable errors on EXCHANGE_ID for v4.1 + */ +static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) +{ + switch (status) { + case -NFS4ERR_SEQ_MISORDERED: + if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) + return -ESERVERFAULT; + /* Lease confirmation error: retry after purging the lease */ + ssleep(1); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + break; + case -NFS4ERR_STALE_CLIENTID: + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_CLID_INUSE: + pr_err("NFS: Server %s reports our clientid is in use\n", + clp->cl_hostname); + nfs_mark_client_ready(clp, -EPERM); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + return -EPERM; + case -EACCES: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EAGAIN: + ssleep(1); + break; + + case -NFS4ERR_MINOR_VERS_MISMATCH: + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, -EPROTONOSUPPORT); + dprintk("%s: exit with error %d for server %s\n", + __func__, -EPROTONOSUPPORT, clp->cl_hostname); + return -EPROTONOSUPPORT; + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + default: + dprintk("%s: exit with error %d for server %s\n", __func__, + status, clp->cl_hostname); + return status; + } + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + dprintk("%s: handled error %d for server %s\n", __func__, status, + clp->cl_hostname); + return 0; +} + +static int nfs4_establish_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; + int status; + + cred = nfs4_get_clid_cred(clp); + if (cred == NULL) + return -ENOENT; + status = ops->establish_clid(clp, cred); + put_rpccred(cred); + if (status != 0) + return status; + pnfs_destroy_all_layouts(clp); + return 0; +} + +/* + * Returns zero or a negative errno. NFS4ERR values are converted + * to local errno values. + */ +static int nfs4_reclaim_lease(struct nfs_client *clp) +{ + int status; + + status = nfs4_establish_lease(clp); + if (status < 0) + return nfs4_handle_reclaim_lease_error(clp, status); + if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state)) + nfs4_state_start_reclaim_nograce(clp); + if (!test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + return 0; +} + +static int nfs4_purge_lease(struct nfs_client *clp) +{ + int status; + + status = nfs4_establish_lease(clp); + if (status < 0) + return nfs4_handle_reclaim_lease_error(clp, status); + clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + return 0; +} + +/* + * Try remote migration of one FSID from a source server to a + * destination server. The source server provides a list of + * potential destinations. + * + * Returns zero or a negative NFS4ERR status code. + */ +static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_fs_locations *locations = NULL; + struct inode *inode; + struct page *page; + int status, result; + + dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + result = 0; + page = alloc_page(GFP_KERNEL); + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (page == NULL || locations == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } + + inode = d_inode(server->super->s_root); + result = nfs4_proc_get_locations(inode, locations, page, cred); + if (result) { + dprintk("<-- %s: failed to retrieve fs_locations: %d\n", + __func__, result); + goto out; + } + + result = -NFS4ERR_NXIO; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + dprintk("<-- %s: No fs_locations data, migration skipped\n", + __func__); + goto out; + } + + nfs4_begin_drain_session(clp); + + status = nfs4_replace_transport(server, locations); + if (status != 0) { + dprintk("<-- %s: failed to replace transport: %d\n", + __func__, status); + goto out; + } + + result = 0; + dprintk("<-- %s: migration succeeded\n", __func__); + +out: + if (page != NULL) + __free_page(page); + kfree(locations); + if (result) { + pr_err("NFS: migration recovery failed (server %s)\n", + clp->cl_hostname); + set_bit(NFS_MIG_FAILED, &server->mig_status); + } + return result; +} + +/* + * Returns zero or a negative NFS4ERR status code. + */ +static int nfs4_handle_migration(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: migration reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION, + &server->mig_status)) + continue; + + rcu_read_unlock(); + status = nfs4_try_migration(server, cred); + if (status < 0) { + put_rpccred(cred); + return status; + } + goto restart; + } + rcu_read_unlock(); + put_rpccred(cred); + return 0; +} + +/* + * Test each nfs_server on the clp's cl_superblocks list to see + * if it's moved to another server. Stop when the server no longer + * returns NFS4ERR_LEASE_MOVED. + */ +static int nfs4_handle_lease_moved(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: lease moved reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + struct inode *inode; + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + rcu_read_unlock(); + + inode = d_inode(server->super->s_root); + status = nfs4_proc_fsid_present(inode, cred); + if (status != -NFS4ERR_MOVED) + goto restart; /* wasn't this one */ + if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED) + goto restart; /* there are more */ + goto out; + } + rcu_read_unlock(); + +out: + put_rpccred(cred); + return 0; +} + +/** + * nfs4_discover_server_trunking - Detect server IP address trunking + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * + * Returns zero or a negative errno. If zero is returned, + * an nfs_client pointer is planted in "result". + * + * Note: since we are invoked in process context, and + * not from inside the state manager, we cannot use + * nfs4_handle_reclaim_lease_error(). + */ +int nfs4_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result) +{ + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; + struct rpc_clnt *clnt; + struct rpc_cred *cred; + int i, status; + + dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); + + clnt = clp->cl_rpcclient; + i = 0; + + mutex_lock(&nfs_clid_init_mutex); +again: + status = -ENOENT; + cred = nfs4_get_clid_cred(clp); + if (cred == NULL) + goto out_unlock; + + status = ops->detect_trunking(clp, result, cred); + put_rpccred(cred); + switch (status) { + case 0: + break; + case -ETIMEDOUT: + if (clnt->cl_softrtry) + break; + case -NFS4ERR_DELAY: + case -EAGAIN: + ssleep(1); + case -NFS4ERR_STALE_CLIENTID: + dprintk("NFS: %s after status %d, retrying\n", + __func__, status); + goto again; + case -EACCES: + if (i++ == 0) { + nfs4_root_machine_cred(clp); + goto again; + } + if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) + break; + case -NFS4ERR_CLID_INUSE: + case -NFS4ERR_WRONGSEC: + /* No point in retrying if we already used RPC_AUTH_UNIX */ + if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) { + status = -EPERM; + break; + } + clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + break; + } + /* Note: this is safe because we haven't yet marked the + * client as ready, so we are the only user of + * clp->cl_rpcclient + */ + clnt = xchg(&clp->cl_rpcclient, clnt); + rpc_shutdown_client(clnt); + clnt = clp->cl_rpcclient; + goto again; + + case -NFS4ERR_MINOR_VERS_MISMATCH: + status = -EPROTONOSUPPORT; + break; + + case -EKEYEXPIRED: + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + status = -EKEYEXPIRED; + break; + default: + pr_warn("NFS: %s unhandled error %d. Exiting with error EIO\n", + __func__, status); + status = -EIO; + } + +out_unlock: + mutex_unlock(&nfs_clid_init_mutex); + dprintk("NFS: %s: status = %d\n", __func__, status); + return status; +} + +#ifdef CONFIG_NFS_V4_1 +void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) +{ + struct nfs_client *clp = session->clp; + + switch (err) { + default: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + } + nfs4_schedule_lease_recovery(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); + +static void nfs41_ping_server(struct nfs_client *clp) +{ + /* Use CHECK_LEASE to ping the server with a SEQUENCE */ + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} + +void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + +void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + +static void nfs4_reset_all_state(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + dprintk("%s: scheduling reset of all state for server %s!\n", + __func__, clp->cl_hostname); + nfs4_schedule_state_manager(clp); + } +} + +static void nfs41_handle_server_reboot(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + nfs4_state_start_reclaim_reboot(clp); + dprintk("%s: server %s rebooted!\n", __func__, + clp->cl_hostname); + nfs4_schedule_state_manager(clp); + } +} + +static void nfs41_handle_state_revoked(struct nfs_client *clp) +{ + nfs4_reset_all_state(clp); + dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); +} + +static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) +{ + /* This will need to handle layouts too */ + nfs_expire_all_delegations(clp); + dprintk("%s: Recallable state revoked on server %s!\n", __func__, + clp->cl_hostname); +} + +static void nfs41_handle_backchannel_fault(struct nfs_client *clp) +{ + nfs_expire_all_delegations(clp); + if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) + nfs4_schedule_state_manager(clp); + dprintk("%s: server %s declared a backchannel fault\n", __func__, + clp->cl_hostname); +} + +static void nfs41_handle_cb_path_down(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, + &clp->cl_state) == 0) + nfs4_schedule_state_manager(clp); +} + +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) +{ + if (!flags) + return; + + dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n", + __func__, clp->cl_hostname, clp->cl_clientid, flags); + + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + nfs41_handle_server_reboot(clp); + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + SEQ4_STATUS_ADMIN_STATE_REVOKED)) + nfs41_handle_state_revoked(clp); + if (flags & SEQ4_STATUS_LEASE_MOVED) + nfs4_schedule_lease_moved_recovery(clp); + if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + nfs41_handle_recallable_state_revoked(clp); + if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) + nfs41_handle_backchannel_fault(clp); + else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_CB_PATH_DOWN_SESSION)) + nfs41_handle_cb_path_down(clp); +} + +static int nfs4_reset_session(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int status; + + if (!nfs4_has_session(clp)) + return 0; + nfs4_begin_drain_session(clp); + cred = nfs4_get_clid_cred(clp); + status = nfs4_proc_destroy_session(clp->cl_session, cred); + switch (status) { + case 0: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + break; + case -NFS4ERR_BACK_CHAN_BUSY: + case -NFS4ERR_DELAY: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + status = 0; + ssleep(1); + goto out; + default: + status = nfs4_recovery_handle_error(clp, status); + goto out; + } + + memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); + status = nfs4_proc_create_session(clp, cred); + if (status) { + dprintk("%s: session reset failed with status %d for server %s!\n", + __func__, status, clp->cl_hostname); + status = nfs4_handle_reclaim_lease_error(clp, status); + goto out; + } + nfs41_finish_session_reset(clp); + dprintk("%s: session reset was successful for server %s!\n", + __func__, clp->cl_hostname); +out: + if (cred) + put_rpccred(cred); + return status; +} + +static int nfs4_bind_conn_to_session(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int ret; + + if (!nfs4_has_session(clp)) + return 0; + nfs4_begin_drain_session(clp); + cred = nfs4_get_clid_cred(clp); + ret = nfs4_proc_bind_conn_to_session(clp, cred); + if (cred) + put_rpccred(cred); + clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + switch (ret) { + case 0: + dprintk("%s: bind_conn_to_session was successful for server %s!\n", + __func__, clp->cl_hostname); + break; + case -NFS4ERR_DELAY: + ssleep(1); + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; + default: + return nfs4_recovery_handle_error(clp, ret); + } + return 0; +} +#else /* CONFIG_NFS_V4_1 */ +static int nfs4_reset_session(struct nfs_client *clp) { return 0; } + +static int nfs4_bind_conn_to_session(struct nfs_client *clp) +{ + return 0; +} +#endif /* CONFIG_NFS_V4_1 */ + +static void nfs4_state_manager(struct nfs_client *clp) +{ + int status = 0; + const char *section = "", *section_sep = ""; + + /* Ensure exclusive access to NFSv4 state */ + do { + if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { + section = "purge state"; + status = nfs4_purge_lease(clp); + if (status < 0) + goto out_error; + continue; + } + + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { + section = "lease expired"; + /* We're going to have to re-establish a clientid */ + status = nfs4_reclaim_lease(clp); + if (status < 0) + goto out_error; + continue; + } + + /* Initialize or reset the session */ + if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { + section = "reset session"; + status = nfs4_reset_session(clp); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + /* Send BIND_CONN_TO_SESSION */ + if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, + &clp->cl_state)) { + section = "bind conn to session"; + status = nfs4_bind_conn_to_session(clp); + if (status < 0) + goto out_error; + continue; + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + section = "check lease"; + status = nfs4_check_lease(clp); + if (status < 0) + goto out_error; + continue; + } + + if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { + section = "migration"; + status = nfs4_handle_migration(clp); + if (status < 0) + goto out_error; + } + + if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) { + section = "lease moved"; + status = nfs4_handle_lease_moved(clp); + if (status < 0) + goto out_error; + } + + /* First recover reboot state... */ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + section = "reclaim reboot"; + status = nfs4_do_reclaim(clp, + clp->cl_mvops->reboot_recovery_ops); + if (status == -EAGAIN) + continue; + if (status < 0) + goto out_error; + nfs4_state_end_reclaim_reboot(clp); + } + + /* Now recover expired state... */ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + section = "reclaim nograce"; + status = nfs4_do_reclaim(clp, + clp->cl_mvops->nograce_recovery_ops); + if (status == -EAGAIN) + continue; + if (status < 0) + goto out_error; + } + + nfs4_end_drain_session(clp); + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + continue; + } + + nfs4_clear_state_manager_bit(clp); + /* Did we race with an attempt to give us more work? */ + if (clp->cl_state == 0) + break; + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + break; + } while (atomic_read(&clp->cl_count) > 1); + return; +out_error: + if (strlen(section)) + section_sep = ": "; + pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" + " with error %d\n", section_sep, section, + clp->cl_hostname, -status); + ssleep(1); + nfs4_end_drain_session(clp); + nfs4_clear_state_manager_bit(clp); +} + +static int nfs4_run_state_manager(void *ptr) +{ + struct nfs_client *clp = ptr; + + allow_signal(SIGKILL); + nfs4_state_manager(clp); + nfs_put_client(clp); + module_put_and_exit(0); + return 0; +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/kernel/fs/nfs/nfs4super.c b/kernel/fs/nfs/nfs4super.c new file mode 100644 index 000000000..6fb7cb6b3 --- /dev/null +++ b/kernel/fs/nfs/nfs4super.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2012 Bryan Schumaker + */ +#include +#include +#include +#include +#include "delegation.h" +#include "internal.h" +#include "nfs4_fs.h" +#include "nfs4idmap.h" +#include "dns_resolve.h" +#include "pnfs.h" +#include "nfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc); +static void nfs4_evict_inode(struct inode *inode); +static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); + +static struct file_system_type nfs4_remote_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_referral_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs4_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_referral_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +static const struct super_operations nfs4_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = nfs4_write_inode, + .drop_inode = nfs_drop_inode, + .statfs = nfs_statfs, + .evict_inode = nfs4_evict_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, + .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, +}; + +struct nfs_subversion nfs_v4 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs4_fs_type, + .rpc_vers = &nfs_version4, + .rpc_ops = &nfs_v4_clientops, + .sops = &nfs4_sops, + .xattr = nfs4_xattr_handlers, +}; + +static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret = nfs_write_inode(inode, wbc); + + if (ret == 0) + ret = pnfs_layoutcommit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL); + return ret; +} + +/* + * Clean out any remaining NFSv4 state that might be left over due + * to open() calls that passed nfs_atomic_lookup, but failed to call + * nfs_open(). + */ +static void nfs4_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); + /* Note that above delegreturn would trigger pnfs return-on-close */ + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); + /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); +} + +/* + * Get the superblock for the NFS4 root partition + */ +static struct dentry * +nfs4_remote_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *info) +{ + struct nfs_mount_info *mount_info = info; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + + mount_info->set_security = nfs_set_sb_security; + + /* Get a volume representation */ + server = nfs4_create_server(mount_info, &nfs_v4); + if (IS_ERR(server)) { + mntroot = ERR_CAST(server); + goto out; + } + + mntroot = nfs_fs_mount_common(server, flags, dev_name, mount_info, &nfs_v4); + +out: + return mntroot; +} + +static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, + int flags, void *data, const char *hostname) +{ + struct vfsmount *root_mnt; + char *root_devname; + size_t len; + + len = strlen(hostname) + 5; + root_devname = kmalloc(len, GFP_KERNEL); + if (root_devname == NULL) + return ERR_PTR(-ENOMEM); + /* Does hostname needs to be enclosed in brackets? */ + if (strchr(hostname, ':')) + snprintf(root_devname, len, "[%s]:/", hostname); + else + snprintf(root_devname, len, "%s:/", hostname); + root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); + kfree(root_devname); + return root_mnt; +} + +struct nfs_referral_count { + struct list_head list; + const struct task_struct *task; + unsigned int referral_count; +}; + +static LIST_HEAD(nfs_referral_count_list); +static DEFINE_SPINLOCK(nfs_referral_count_list_lock); + +static struct nfs_referral_count *nfs_find_referral_count(void) +{ + struct nfs_referral_count *p; + + list_for_each_entry(p, &nfs_referral_count_list, list) { + if (p->task == current) + return p; + } + return NULL; +} + +#define NFS_MAX_NESTED_REFERRALS 2 + +static int nfs_referral_loop_protect(void) +{ + struct nfs_referral_count *p, *new; + int ret = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + new->task = current; + new->referral_count = 1; + + ret = 0; + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + if (p != NULL) { + if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) + ret = -ELOOP; + else + p->referral_count++; + } else { + list_add(&new->list, &nfs_referral_count_list); + new = NULL; + } + spin_unlock(&nfs_referral_count_list_lock); + kfree(new); +out: + return ret; +} + +static void nfs_referral_loop_unprotect(void) +{ + struct nfs_referral_count *p; + + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + p->referral_count--; + if (p->referral_count == 0) + list_del(&p->list); + else + p = NULL; + spin_unlock(&nfs_referral_count_list_lock); + kfree(p); +} + +static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path) +{ + struct dentry *dentry; + int err; + + if (IS_ERR(root_mnt)) + return ERR_CAST(root_mnt); + + err = nfs_referral_loop_protect(); + if (err) { + mntput(root_mnt); + return ERR_PTR(err); + } + + dentry = mount_subtree(root_mnt, export_path); + nfs_referral_loop_unprotect(); + + return dentry; +} + +struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + struct nfs_parsed_mount_data *data = mount_info->parsed; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + res = nfs_follow_remote_path(root_mnt, export_path); + + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); + return res; +} + +static struct dentry * +nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_mount_info mount_info = { + .fill_super = nfs_fill_super, + .set_security = nfs_clone_sb_security, + .cloned = raw_data, + }; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + + dprintk("--> nfs4_referral_get_sb()\n"); + + mount_info.mntfh = nfs_alloc_fhandle(); + if (mount_info.cloned == NULL || mount_info.mntfh == NULL) + goto out; + + /* create a new volume representation */ + server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh); + if (IS_ERR(server)) { + mntroot = ERR_CAST(server); + goto out; + } + + mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, &nfs_v4); +out: + nfs_free_fhandle(mount_info.mntfh); + return mntroot; +} + +/* + * Create an NFS4 server record on referral traversal + */ +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + + dprintk("--> nfs4_referral_mount()\n"); + + export_path = data->mnt_path; + data->mnt_path = "/"; + + root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, + flags, data, data->hostname); + data->mnt_path = export_path; + + res = nfs_follow_remote_path(root_mnt, export_path); + dprintk("<-- nfs4_referral_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); + return res; +} + + +static int __init init_nfs_v4(void) +{ + int err; + + err = nfs_dns_resolver_init(); + if (err) + goto out; + + err = nfs_idmap_init(); + if (err) + goto out1; + + err = nfs4_register_sysctl(); + if (err) + goto out2; + + register_nfs_version(&nfs_v4); + return 0; +out2: + nfs_idmap_quit(); +out1: + nfs_dns_resolver_destroy(); +out: + return err; +} + +static void __exit exit_nfs_v4(void) +{ + /* Not called in the _init(), conditionally loaded */ + nfs4_pnfs_v3_ds_connect_unload(); + + unregister_nfs_version(&nfs_v4); + nfs4_unregister_sysctl(); + nfs_idmap_quit(); + nfs_dns_resolver_destroy(); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v4); +module_exit(exit_nfs_v4); diff --git a/kernel/fs/nfs/nfs4sysctl.c b/kernel/fs/nfs/nfs4sysctl.c new file mode 100644 index 000000000..0fbd3ab1b --- /dev/null +++ b/kernel/fs/nfs/nfs4sysctl.c @@ -0,0 +1,69 @@ +/* + * linux/fs/nfs/nfs4sysctl.c + * + * Sysctl interface to NFS v4 parameters + * + * Copyright (c) 2006 Trond Myklebust + */ +#include +#include + +#include "nfs4_fs.h" +#include "nfs4idmap.h" +#include "callback.h" + +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +static struct ctl_table_header *nfs4_callback_sysctl_table; + +static struct ctl_table nfs4_cb_sysctls[] = { + { + .procname = "nfs_callback_tcpport", + .data = &nfs_callback_set_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&nfs_set_port_min, + .extra2 = (int *)&nfs_set_port_max, + }, + { + .procname = "idmap_cache_timeout", + .data = &nfs_idmap_cache_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +static struct ctl_table nfs4_cb_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nfs4_cb_sysctls, + }, + { } +}; + +static struct ctl_table nfs4_cb_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nfs4_cb_sysctl_dir, + }, + { } +}; + +int nfs4_register_sysctl(void) +{ + nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root); + if (nfs4_callback_sysctl_table == NULL) + return -ENOMEM; + return 0; +} + +void nfs4_unregister_sysctl(void) +{ + unregister_sysctl_table(nfs4_callback_sysctl_table); + nfs4_callback_sysctl_table = NULL; +} diff --git a/kernel/fs/nfs/nfs4trace.c b/kernel/fs/nfs/nfs4trace.c new file mode 100644 index 000000000..d774335cc --- /dev/null +++ b/kernel/fs/nfs/nfs4trace.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2013 Trond Myklebust + */ +#include +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define CREATE_TRACE_POINTS +#include "nfs4trace.h" + +#ifdef CONFIG_NFS_V4_1 +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds); +#endif diff --git a/kernel/fs/nfs/nfs4trace.h b/kernel/fs/nfs/nfs4trace.h new file mode 100644 index 000000000..470af1a78 --- /dev/null +++ b/kernel/fs/nfs/nfs4trace.h @@ -0,0 +1,1148 @@ +/* + * Copyright (c) 2013 Trond Myklebust + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs4 + +#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS4_H + +#include + +#define show_nfsv4_errors(error) \ + __print_symbolic(error, \ + { NFS4_OK, "OK" }, \ + /* Mapped by nfs4_stat_to_errno() */ \ + { -EPERM, "EPERM" }, \ + { -ENOENT, "ENOENT" }, \ + { -EIO, "EIO" }, \ + { -ENXIO, "ENXIO" }, \ + { -EACCES, "EACCES" }, \ + { -EEXIST, "EEXIST" }, \ + { -EXDEV, "EXDEV" }, \ + { -ENOTDIR, "ENOTDIR" }, \ + { -EISDIR, "EISDIR" }, \ + { -EFBIG, "EFBIG" }, \ + { -ENOSPC, "ENOSPC" }, \ + { -EROFS, "EROFS" }, \ + { -EMLINK, "EMLINK" }, \ + { -ENAMETOOLONG, "ENAMETOOLONG" }, \ + { -ENOTEMPTY, "ENOTEMPTY" }, \ + { -EDQUOT, "EDQUOT" }, \ + { -ESTALE, "ESTALE" }, \ + { -EBADHANDLE, "EBADHANDLE" }, \ + { -EBADCOOKIE, "EBADCOOKIE" }, \ + { -ENOTSUPP, "ENOTSUPP" }, \ + { -ETOOSMALL, "ETOOSMALL" }, \ + { -EREMOTEIO, "EREMOTEIO" }, \ + { -EBADTYPE, "EBADTYPE" }, \ + { -EAGAIN, "EAGAIN" }, \ + { -ELOOP, "ELOOP" }, \ + { -EOPNOTSUPP, "EOPNOTSUPP" }, \ + { -EDEADLK, "EDEADLK" }, \ + /* RPC errors */ \ + { -ENOMEM, "ENOMEM" }, \ + { -EKEYEXPIRED, "EKEYEXPIRED" }, \ + { -ETIMEDOUT, "ETIMEDOUT" }, \ + { -ERESTARTSYS, "ERESTARTSYS" }, \ + { -ECONNREFUSED, "ECONNREFUSED" }, \ + { -ECONNRESET, "ECONNRESET" }, \ + { -ENETUNREACH, "ENETUNREACH" }, \ + { -EHOSTUNREACH, "EHOSTUNREACH" }, \ + { -EHOSTDOWN, "EHOSTDOWN" }, \ + { -EPIPE, "EPIPE" }, \ + { -EPFNOSUPPORT, "EPFNOSUPPORT" }, \ + { -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \ + /* NFSv4 native errors */ \ + { -NFS4ERR_ACCESS, "ACCESS" }, \ + { -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \ + { -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \ + { -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \ + { -NFS4ERR_BADCHAR, "BADCHAR" }, \ + { -NFS4ERR_BADHANDLE, "BADHANDLE" }, \ + { -NFS4ERR_BADIOMODE, "BADIOMODE" }, \ + { -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \ + { -NFS4ERR_BADLABEL, "BADLABEL" }, \ + { -NFS4ERR_BADNAME, "BADNAME" }, \ + { -NFS4ERR_BADOWNER, "BADOWNER" }, \ + { -NFS4ERR_BADSESSION, "BADSESSION" }, \ + { -NFS4ERR_BADSLOT, "BADSLOT" }, \ + { -NFS4ERR_BADTYPE, "BADTYPE" }, \ + { -NFS4ERR_BADXDR, "BADXDR" }, \ + { -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \ + { -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \ + { -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \ + { -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \ + { -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \ + { -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \ + { -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ + { -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \ + { -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \ + { -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \ + { -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \ + "CONN_NOT_BOUND_TO_SESSION" }, \ + { -NFS4ERR_DEADLOCK, "DEADLOCK" }, \ + { -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \ + { -NFS4ERR_DELAY, "DELAY" }, \ + { -NFS4ERR_DELEG_ALREADY_WANTED, \ + "DELEG_ALREADY_WANTED" }, \ + { -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \ + { -NFS4ERR_DENIED, "DENIED" }, \ + { -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \ + { -NFS4ERR_DQUOT, "DQUOT" }, \ + { -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \ + { -NFS4ERR_EXIST, "EXIST" }, \ + { -NFS4ERR_EXPIRED, "EXPIRED" }, \ + { -NFS4ERR_FBIG, "FBIG" }, \ + { -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \ + { -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \ + { -NFS4ERR_GRACE, "GRACE" }, \ + { -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \ + { -NFS4ERR_INVAL, "INVAL" }, \ + { -NFS4ERR_IO, "IO" }, \ + { -NFS4ERR_ISDIR, "ISDIR" }, \ + { -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \ + { -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \ + { -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \ + { -NFS4ERR_LOCKED, "LOCKED" }, \ + { -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \ + { -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \ + { -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \ + { -NFS4ERR_MLINK, "MLINK" }, \ + { -NFS4ERR_MOVED, "MOVED" }, \ + { -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \ + { -NFS4ERR_NOENT, "NOENT" }, \ + { -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \ + { -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \ + { -NFS4ERR_NOSPC, "NOSPC" }, \ + { -NFS4ERR_NOTDIR, "NOTDIR" }, \ + { -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \ + { -NFS4ERR_NOTSUPP, "NOTSUPP" }, \ + { -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \ + { -NFS4ERR_NOT_SAME, "NOT_SAME" }, \ + { -NFS4ERR_NO_GRACE, "NO_GRACE" }, \ + { -NFS4ERR_NXIO, "NXIO" }, \ + { -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \ + { -NFS4ERR_OPENMODE, "OPENMODE" }, \ + { -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \ + { -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \ + { -NFS4ERR_PERM, "PERM" }, \ + { -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \ + { -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \ + { -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \ + { -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \ + { -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \ + { -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \ + { -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \ + { -NFS4ERR_REP_TOO_BIG_TO_CACHE, \ + "REP_TOO_BIG_TO_CACHE" }, \ + { -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \ + { -NFS4ERR_RESOURCE, "RESOURCE" }, \ + { -NFS4ERR_RESTOREFH, "RESTOREFH" }, \ + { -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \ + { -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \ + { -NFS4ERR_ROFS, "ROFS" }, \ + { -NFS4ERR_SAME, "SAME" }, \ + { -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \ + { -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \ + { -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \ + { -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \ + { -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \ + { -NFS4ERR_STALE, "STALE" }, \ + { -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \ + { -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \ + { -NFS4ERR_SYMLINK, "SYMLINK" }, \ + { -NFS4ERR_TOOSMALL, "TOOSMALL" }, \ + { -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \ + { -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \ + { -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \ + { -NFS4ERR_WRONGSEC, "WRONGSEC" }, \ + { -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \ + { -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \ + { -NFS4ERR_XDEV, "XDEV" }) + +#define show_open_flags(flags) \ + __print_flags(flags, "|", \ + { O_CREAT, "O_CREAT" }, \ + { O_EXCL, "O_EXCL" }, \ + { O_TRUNC, "O_TRUNC" }, \ + { O_DIRECT, "O_DIRECT" }) + +#define show_fmode_flags(mode) \ + __print_flags(mode, "|", \ + { ((__force unsigned long)FMODE_READ), "READ" }, \ + { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ + { ((__force unsigned long)FMODE_EXEC), "EXEC" }) + +#define show_nfs_fattr_flags(valid) \ + __print_flags((unsigned long)valid, "|", \ + { NFS_ATTR_FATTR_TYPE, "TYPE" }, \ + { NFS_ATTR_FATTR_MODE, "MODE" }, \ + { NFS_ATTR_FATTR_NLINK, "NLINK" }, \ + { NFS_ATTR_FATTR_OWNER, "OWNER" }, \ + { NFS_ATTR_FATTR_GROUP, "GROUP" }, \ + { NFS_ATTR_FATTR_RDEV, "RDEV" }, \ + { NFS_ATTR_FATTR_SIZE, "SIZE" }, \ + { NFS_ATTR_FATTR_FSID, "FSID" }, \ + { NFS_ATTR_FATTR_FILEID, "FILEID" }, \ + { NFS_ATTR_FATTR_ATIME, "ATIME" }, \ + { NFS_ATTR_FATTR_MTIME, "MTIME" }, \ + { NFS_ATTR_FATTR_CTIME, "CTIME" }, \ + { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \ + { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \ + { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }) + +DECLARE_EVENT_CLASS(nfs4_clientid_event, + TP_PROTO( + const struct nfs_client *clp, + int error + ), + + TP_ARGS(clp, error), + + TP_STRUCT__entry( + __string(dstaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)) + __field(int, error) + ), + + TP_fast_assign( + __entry->error = error; + __assign_str(dstaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); + ), + + TP_printk( + "error=%d (%s) dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + __get_str(dstaddr) + ) +); +#define DEFINE_NFS4_CLIENTID_EVENT(name) \ + DEFINE_EVENT(nfs4_clientid_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + int error \ + ), \ + TP_ARGS(clp, error)) +DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete); + +TRACE_EVENT(nfs4_setup_sequence, + TP_PROTO( + const struct nfs4_session *session, + const struct nfs4_sequence_args *args + ), + TP_ARGS(session, args), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_used_slotid) + ), + + TP_fast_assign( + const struct nfs4_slot *sa_slot = args->sa_slot; + __entry->session = nfs_session_id_hash(&session->sess_id); + __entry->slot_nr = sa_slot->slot_nr; + __entry->seq_nr = sa_slot->seq_nr; + __entry->highest_used_slotid = + sa_slot->table->highest_used_slotid; + ), + TP_printk( + "session=0x%08x slot_nr=%u seq_nr=%u " + "highest_used_slotid=%u", + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_used_slotid + ) +); + +#define show_nfs4_sequence_status_flags(status) \ + __print_flags((unsigned long)status, "|", \ + { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ + { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \ + "CB_GSS_CONTEXTS_EXPIRING" }, \ + { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \ + "CB_GSS_CONTEXTS_EXPIRED" }, \ + { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \ + "EXPIRED_ALL_STATE_REVOKED" }, \ + { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \ + "EXPIRED_SOME_STATE_REVOKED" }, \ + { SEQ4_STATUS_ADMIN_STATE_REVOKED, \ + "ADMIN_STATE_REVOKED" }, \ + { SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \ + "RECALLABLE_STATE_REVOKED" }, \ + { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \ + { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \ + "RESTART_RECLAIM_NEEDED" }, \ + { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \ + "CB_PATH_DOWN_SESSION" }, \ + { SEQ4_STATUS_BACKCHANNEL_FAULT, \ + "BACKCHANNEL_FAULT" }) + +TRACE_EVENT(nfs4_sequence_done, + TP_PROTO( + const struct nfs4_session *session, + const struct nfs4_sequence_res *res + ), + TP_ARGS(session, res), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_slotid) + __field(unsigned int, target_highest_slotid) + __field(unsigned int, status_flags) + __field(int, error) + ), + + TP_fast_assign( + const struct nfs4_slot *sr_slot = res->sr_slot; + __entry->session = nfs_session_id_hash(&session->sess_id); + __entry->slot_nr = sr_slot->slot_nr; + __entry->seq_nr = sr_slot->seq_nr; + __entry->highest_slotid = res->sr_highest_slotid; + __entry->target_highest_slotid = + res->sr_target_highest_slotid; + __entry->error = res->sr_status; + ), + TP_printk( + "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "highest_slotid=%u target_highest_slotid=%u " + "status_flags=%u (%s)", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_slotid, + __entry->target_highest_slotid, + __entry->status_flags, + show_nfs4_sequence_status_flags(__entry->status_flags) + ) +); + +struct cb_sequenceargs; +struct cb_sequenceres; + +TRACE_EVENT(nfs4_cb_sequence, + TP_PROTO( + const struct cb_sequenceargs *args, + const struct cb_sequenceres *res, + __be32 status + ), + TP_ARGS(args, res, status), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_slotid) + __field(unsigned int, cachethis) + __field(int, error) + ), + + TP_fast_assign( + __entry->session = nfs_session_id_hash(&args->csa_sessionid); + __entry->slot_nr = args->csa_slotid; + __entry->seq_nr = args->csa_sequenceid; + __entry->highest_slotid = args->csa_highestslotid; + __entry->cachethis = args->csa_cachethis; + __entry->error = -be32_to_cpu(status); + ), + + TP_printk( + "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "highest_slotid=%u", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_slotid + ) +); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_open_event, + TP_PROTO( + const struct nfs_open_context *ctx, + int flags, + int error + ), + + TP_ARGS(ctx, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(unsigned int, fmode) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + const struct nfs4_state *state = ctx->state; + const struct inode *inode = NULL; + + __entry->error = error; + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __entry->dev = ctx->dentry->d_sb->s_dev; + if (!IS_ERR(state)) + inode = state->inode; + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + } else { + __entry->fileid = 0; + __entry->fhandle = 0; + } + __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + "error=%d (%s) flags=%d (%s) fmode=%s " + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "name=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->flags, + show_open_flags(__entry->flags), + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS4_OPEN_EVENT(name) \ + DEFINE_EVENT(nfs4_open_event, name, \ + TP_PROTO( \ + const struct nfs_open_context *ctx, \ + int flags, \ + int error \ + ), \ + TP_ARGS(ctx, flags, error)) +DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim); +DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired); +DEFINE_NFS4_OPEN_EVENT(nfs4_open_file); + +TRACE_EVENT(nfs4_close, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs_closeargs *args, + const struct nfs_closeres *res, + int error + ), + + TP_ARGS(state, args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)state->state; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " + "fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->fmode ? show_fmode_flags(__entry->fmode) : + "closed", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define show_lock_cmd(type) \ + __print_symbolic((int)type, \ + { F_GETLK, "GETLK" }, \ + { F_SETLK, "SETLK" }, \ + { F_SETLKW, "SETLKW" }) +#define show_lock_type(type) \ + __print_symbolic((int)type, \ + { F_RDLCK, "RDLCK" }, \ + { F_WRLCK, "WRLCK" }, \ + { F_UNLCK, "UNLCK" }) + +DECLARE_EVENT_CLASS(nfs4_lock_event, + TP_PROTO( + const struct file_lock *request, + const struct nfs4_state *state, + int cmd, + int error + ), + + TP_ARGS(request, state, cmd, error), + + TP_STRUCT__entry( + __field(int, error) + __field(int, cmd) + __field(char, type) + __field(loff_t, start) + __field(loff_t, end) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->cmd = cmd; + __entry->type = request->fl_type; + __entry->start = request->fl_start; + __entry->end = request->fl_end; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + ), + + TP_printk( + "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + show_lock_cmd(__entry->cmd), + show_lock_type(__entry->type), + (long long)__entry->start, + (long long)__entry->end, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define DEFINE_NFS4_LOCK_EVENT(name) \ + DEFINE_EVENT(nfs4_lock_event, name, \ + TP_PROTO( \ + const struct file_lock *request, \ + const struct nfs4_state *state, \ + int cmd, \ + int error \ + ), \ + TP_ARGS(request, state, cmd, error)) +DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock); +DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock); +DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim); +DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired); +DEFINE_NFS4_LOCK_EVENT(nfs4_unlock); + +DECLARE_EVENT_CLASS(nfs4_set_delegation_event, + TP_PROTO( + const struct inode *inode, + fmode_t fmode + ), + + TP_ARGS(inode, fmode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)fmode; + ), + + TP_printk( + "fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x", + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); +#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \ + DEFINE_EVENT(nfs4_set_delegation_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + fmode_t fmode \ + ), \ + TP_ARGS(inode, fmode)) +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation); +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation); + +TRACE_EVENT(nfs4_delegreturn_exit, + TP_PROTO( + const struct nfs4_delegreturnargs *args, + const struct nfs4_delegreturnres *res, + int error + ), + + TP_ARGS(args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = res->server->s_dev; + __entry->fhandle = nfs_fhandle_hash(args->fhandle); + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) dev=%02x:%02x fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->fhandle + ) +); + +#ifdef CONFIG_NFS_V4_1 +DECLARE_EVENT_CLASS(nfs4_test_stateid_event, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs4_lock_state *lsp, + int error + ), + + TP_ARGS(state, lsp, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_test_stateid_event, name, \ + TP_PROTO( \ + const struct nfs4_state *state, \ + const struct nfs4_lock_state *lsp, \ + int error \ + ), \ + TP_ARGS(state, lsp, error)) +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid); +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid); +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_lookup_event, + TP_PROTO( + const struct inode *dir, + const struct qstr *name, + int error + ), + + TP_ARGS(dir, name, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, dir) + __string(name, name->name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, name->name); + ), + + TP_printk( + "error=%d (%s) name=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS4_LOOKUP_EVENT(name) \ + DEFINE_EVENT(nfs4_lookup_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct qstr *name, \ + int error \ + ), \ + TP_ARGS(dir, name, error)) + +DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo); + +TRACE_EVENT(nfs4_rename, + TP_PROTO( + const struct inode *olddir, + const struct qstr *oldname, + const struct inode *newdir, + const struct qstr *newname, + int error + ), + + TP_ARGS(olddir, oldname, newdir, newname, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, olddir) + __string(oldname, oldname->name) + __field(u64, newdir) + __string(newname, newname->name) + ), + + TP_fast_assign( + __entry->dev = olddir->i_sb->s_dev; + __entry->olddir = NFS_FILEID(olddir); + __entry->newdir = NFS_FILEID(newdir); + __entry->error = error; + __assign_str(oldname, oldname->name); + __assign_str(newname, newname->name); + ), + + TP_printk( + "error=%d (%s) oldname=%02x:%02x:%llu/%s " + "newname=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->olddir, + __get_str(oldname), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->newdir, + __get_str(newname) + ) +); + +DECLARE_EVENT_CLASS(nfs4_inode_event, + TP_PROTO( + const struct inode *inode, + int error + ), + + TP_ARGS(inode, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define DEFINE_NFS4_INODE_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(inode, error)) + +DEFINE_NFS4_INODE_EVENT(nfs4_setattr); +DEFINE_NFS4_INODE_EVENT(nfs4_access); +DEFINE_NFS4_INODE_EVENT(nfs4_readlink); +DEFINE_NFS4_INODE_EVENT(nfs4_readdir); +DEFINE_NFS4_INODE_EVENT(nfs4_get_acl); +DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); +DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ +DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); +DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn); + +DECLARE_EVENT_CLASS(nfs4_getattr_event, + TP_PROTO( + const struct nfs_server *server, + const struct nfs_fh *fhandle, + const struct nfs_fattr *fattr, + int error + ), + + TP_ARGS(server, fhandle, fattr, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, valid) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = server->s_dev; + __entry->valid = fattr->valid; + __entry->fhandle = nfs_fhandle_hash(fhandle); + __entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? fattr->fileid : 0; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "valid=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_nfs_fattr_flags(__entry->valid) + ) +); + +#define DEFINE_NFS4_GETATTR_EVENT(name) \ + DEFINE_EVENT(nfs4_getattr_event, name, \ + TP_PROTO( \ + const struct nfs_server *server, \ + const struct nfs_fh *fhandle, \ + const struct nfs_fattr *fattr, \ + int error \ + ), \ + TP_ARGS(server, fhandle, fattr, error)) +DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr); +DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root); +DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo); + +DECLARE_EVENT_CLASS(nfs4_idmap_event, + TP_PROTO( + const char *name, + int len, + u32 id, + int error + ), + + TP_ARGS(name, len, id, error), + + TP_STRUCT__entry( + __field(int, error) + __field(u32, id) + __dynamic_array(char, name, len > 0 ? len + 1 : 1) + ), + + TP_fast_assign( + if (len < 0) + len = 0; + __entry->error = error < 0 ? error : 0; + __entry->id = id; + memcpy(__get_dynamic_array(name), name, len); + ((char *)__get_dynamic_array(name))[len] = 0; + ), + + TP_printk( + "error=%d id=%u name=%s", + __entry->error, + __entry->id, + __get_str(name) + ) +); +#define DEFINE_NFS4_IDMAP_EVENT(name) \ + DEFINE_EVENT(nfs4_idmap_event, name, \ + TP_PROTO( \ + const char *name, \ + int len, \ + u32 id, \ + int error \ + ), \ + TP_ARGS(name, len, id, error)) +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); + +DECLARE_EVENT_CLASS(nfs4_read_event, + TP_PROTO( + const struct nfs_pgio_header *hdr, + int error + ), + + TP_ARGS(hdr, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = hdr->args.offset; + __entry->count = hdr->args.count; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + __entry->count + ) +); +#define DEFINE_NFS4_READ_EVENT(name) \ + DEFINE_EVENT(nfs4_read_event, name, \ + TP_PROTO( \ + const struct nfs_pgio_header *hdr, \ + int error \ + ), \ + TP_ARGS(hdr, error)) +DEFINE_NFS4_READ_EVENT(nfs4_read); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_write_event, + TP_PROTO( + const struct nfs_pgio_header *hdr, + int error + ), + + TP_ARGS(hdr, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = hdr->args.offset; + __entry->count = hdr->args.count; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + __entry->count + ) +); + +#define DEFINE_NFS4_WRITE_EVENT(name) \ + DEFINE_EVENT(nfs4_write_event, name, \ + TP_PROTO( \ + const struct nfs_pgio_header *hdr, \ + int error \ + ), \ + TP_ARGS(hdr, error)) +DEFINE_NFS4_WRITE_EVENT(nfs4_write); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_commit_event, + TP_PROTO( + const struct nfs_commit_data *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + __entry->count + ) +); +#define DEFINE_NFS4_COMMIT_EVENT(name) \ + DEFINE_EVENT(nfs4_commit_event, name, \ + TP_PROTO( \ + const struct nfs_commit_data *data, \ + int error \ + ), \ + TP_ARGS(data, error)) +DEFINE_NFS4_COMMIT_EVENT(nfs4_commit); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds); + +#define show_pnfs_iomode(iomode) \ + __print_symbolic(iomode, \ + { IOMODE_READ, "READ" }, \ + { IOMODE_RW, "RW" }, \ + { IOMODE_ANY, "ANY" }) + +TRACE_EVENT(nfs4_layoutget, + TP_PROTO( + const struct nfs_open_context *ctx, + const struct pnfs_layout_range *args, + const struct pnfs_layout_range *res, + int error + ), + + TP_ARGS(ctx, args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u32, iomode) + __field(u64, offset) + __field(u64, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = d_inode(ctx->dentry); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->iomode = args->iomode; + __entry->offset = args->offset; + __entry->count = args->length; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s offset=%llu count=%llu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->offset, + (unsigned long long)__entry->count + ) +); + +DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); +DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* _TRACE_NFS4_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE nfs4trace +/* This part must be outside protection */ +#include diff --git a/kernel/fs/nfs/nfs4xdr.c b/kernel/fs/nfs/nfs4xdr.c new file mode 100644 index 000000000..0aea97841 --- /dev/null +++ b/kernel/fs/nfs/nfs4xdr.c @@ -0,0 +1,7443 @@ +/* + * fs/nfs/nfs4xdr.c + * + * Client-side XDR for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4idmap.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +static int nfs4_stat_to_errno(int); + +/* NFSv4 COMPOUND tags are only wanted for debugging purposes */ +#ifdef DEBUG +#define NFS4_MAXTAGLEN 20 +#else +#define NFS4_MAXTAGLEN 0 +#endif + +/* lock,open owner id: + * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) + */ +#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2) +#define lock_owner_id_maxsz (1 + 1 + 4) +#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) +#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) +#define op_encode_hdr_maxsz (1) +#define op_decode_hdr_maxsz (2) +#define encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define decode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define encode_putfh_maxsz (op_encode_hdr_maxsz + 1 + \ + (NFS4_FHSIZE >> 2)) +#define decode_putfh_maxsz (op_decode_hdr_maxsz) +#define encode_putrootfh_maxsz (op_encode_hdr_maxsz) +#define decode_putrootfh_maxsz (op_decode_hdr_maxsz) +#define encode_getfh_maxsz (op_encode_hdr_maxsz) +#define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +#define nfs4_fattr_bitmap_maxsz 4 +#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) +#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */ +#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) +#else +#define nfs4_label_maxsz 0 +#endif +/* We support only one layout type per file system */ +#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) +/* This is based on getfattr, which uses the most attributes: */ +#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ + 3 + 3 + 3 + nfs4_owner_maxsz + \ + nfs4_group_maxsz + nfs4_label_maxsz + \ + decode_mdsthreshold_maxsz)) +#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ + nfs4_fattr_value_maxsz) +#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) +#define encode_attrs_maxsz (nfs4_fattr_bitmap_maxsz + \ + 1 + 2 + 1 + \ + nfs4_owner_maxsz + \ + nfs4_group_maxsz + \ + nfs4_label_maxsz + \ + 4 + 4) +#define encode_savefh_maxsz (op_encode_hdr_maxsz) +#define decode_savefh_maxsz (op_decode_hdr_maxsz) +#define encode_restorefh_maxsz (op_encode_hdr_maxsz) +#define decode_restorefh_maxsz (op_decode_hdr_maxsz) +#define encode_fsinfo_maxsz (encode_getattr_maxsz) +/* The 5 accounts for the PNFS attributes, and assumes that at most three + * layout types will be returned. + */ +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 4 + 8 + 5) +#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) +#define decode_renew_maxsz (op_decode_hdr_maxsz) +#define encode_setclientid_maxsz \ + (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ + XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \ + 1 /* sc_prog */ + \ + 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ + 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ + 1) /* sc_cb_ident */ +#define decode_setclientid_maxsz \ + (op_decode_hdr_maxsz + \ + 2 /* clientid */ + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ + 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ + 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN)) +#define encode_setclientid_confirm_maxsz \ + (op_encode_hdr_maxsz + \ + 3 + (NFS4_VERIFIER_SIZE >> 2)) +#define decode_setclientid_confirm_maxsz \ + (op_decode_hdr_maxsz) +#define encode_lookup_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_lookup_maxsz (op_decode_hdr_maxsz) +#define encode_share_access_maxsz \ + (2) +#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz) +#define encode_opentype_maxsz (1 + encode_createmode_maxsz) +#define encode_claim_null_maxsz (1 + nfs4_name_maxsz) +#define encode_open_maxsz (op_encode_hdr_maxsz + \ + 2 + encode_share_access_maxsz + 2 + \ + open_owner_id_maxsz + \ + encode_opentype_maxsz + \ + encode_claim_null_maxsz) +#define decode_ace_maxsz (3 + nfs4_owner_maxsz) +#define decode_delegation_maxsz (1 + decode_stateid_maxsz + 1 + \ + decode_ace_maxsz) +#define decode_change_info_maxsz (5) +#define decode_open_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz + \ + decode_change_info_maxsz + 1 + \ + nfs4_fattr_bitmap_maxsz + \ + decode_delegation_maxsz) +#define encode_open_confirm_maxsz \ + (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 1) +#define decode_open_confirm_maxsz \ + (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_open_downgrade_maxsz \ + (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 1 + \ + encode_share_access_maxsz) +#define decode_open_downgrade_maxsz \ + (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_close_maxsz (op_encode_hdr_maxsz + \ + 1 + encode_stateid_maxsz) +#define decode_close_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_setattr_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + encode_attrs_maxsz) +#define decode_setattr_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz) +#define encode_read_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 3) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2) +#define encode_readdir_maxsz (op_encode_hdr_maxsz + \ + 2 + encode_verifier_maxsz + 5 + \ + nfs4_label_maxsz) +#define decode_readdir_maxsz (op_decode_hdr_maxsz + \ + decode_verifier_maxsz) +#define encode_readlink_maxsz (op_encode_hdr_maxsz) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) +#define encode_write_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 4) +#define decode_write_maxsz (op_decode_hdr_maxsz + \ + 2 + decode_verifier_maxsz) +#define encode_commit_maxsz (op_encode_hdr_maxsz + 3) +#define decode_commit_maxsz (op_decode_hdr_maxsz + \ + decode_verifier_maxsz) +#define encode_remove_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) +#define decode_remove_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz) +#define encode_rename_maxsz (op_encode_hdr_maxsz + \ + 2 * nfs4_name_maxsz) +#define decode_rename_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz + \ + decode_change_info_maxsz) +#define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) +#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_lockowner_maxsz (7) +#define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ + 1 + encode_stateid_maxsz + 1 + \ + encode_lockowner_maxsz) +#define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) +#define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ + encode_lockowner_maxsz) +#define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ + encode_stateid_maxsz + \ + 4) +#define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_release_lockowner_maxsz \ + (op_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define decode_release_lockowner_maxsz \ + (op_decode_hdr_maxsz) +#define encode_access_maxsz (op_encode_hdr_maxsz + 1) +#define decode_access_maxsz (op_decode_hdr_maxsz + 2) +#define encode_symlink_maxsz (op_encode_hdr_maxsz + \ + 1 + nfs4_name_maxsz + \ + 1 + \ + nfs4_fattr_maxsz) +#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) +#define encode_create_maxsz (op_encode_hdr_maxsz + \ + 1 + 2 + nfs4_name_maxsz + \ + encode_attrs_maxsz) +#define decode_create_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz + \ + nfs4_fattr_bitmap_maxsz) +#define encode_statfs_maxsz (encode_getattr_maxsz) +#define decode_statfs_maxsz (decode_getattr_maxsz) +#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) +#define decode_delegreturn_maxsz (op_decode_hdr_maxsz) +#define encode_getacl_maxsz (encode_getattr_maxsz) +#define decode_getacl_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 1) +#define encode_setacl_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 3) +#define decode_setacl_maxsz (decode_setattr_maxsz) +#define encode_fs_locations_maxsz \ + (encode_getattr_maxsz) +#define decode_fs_locations_maxsz \ + (0) +#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) + +#if defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MACHINE_NAME_LEN (64) +#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \ + sizeof(utsname()->version) + sizeof(utsname()->machine) + 8) + +#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ + encode_verifier_maxsz + \ + 1 /* co_ownerid.len */ + \ + XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + 1 /* flags */ + \ + 1 /* spa_how */ + \ + /* max is SP4_MACH_CRED (for now) */ + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 1 /* implementation id array of size 1 */ + \ + 1 /* nii_domain */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 1 /* nii_name */ + \ + XDR_QUADLEN(IMPL_NAME_LIMIT) + \ + 3 /* nii_date */) +#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ + 2 /* eir_clientid */ + \ + 1 /* eir_sequenceid */ + \ + 1 /* eir_flags */ + \ + 1 /* spr_how */ + \ + /* max is SP4_MACH_CRED (for now) */ + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 2 /* eir_server_owner.so_minor_id */ + \ + /* eir_server_owner.so_major_id<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + /* eir_server_scope<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + 1 /* eir_server_impl_id array length */ + \ + 1 /* nii_domain */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 1 /* nii_name */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 3 /* nii_date */) +#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */) +#define decode_channel_attrs_maxsz (6 + \ + 1 /* ca_rdma_ird.len */ + \ + 1 /* ca_rdma_ird */) +#define encode_create_session_maxsz (op_encode_hdr_maxsz + \ + 2 /* csa_clientid */ + \ + 1 /* csa_sequence */ + \ + 1 /* csa_flags */ + \ + encode_channel_attrs_maxsz + \ + encode_channel_attrs_maxsz + \ + 1 /* csa_cb_program */ + \ + 1 /* csa_sec_parms.len (1) */ + \ + 1 /* cb_secflavor (AUTH_SYS) */ + \ + 1 /* stamp */ + \ + 1 /* machinename.len */ + \ + XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \ + 1 /* uid */ + \ + 1 /* gid */ + \ + 1 /* gids.len (0) */) +#define decode_create_session_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* csr_sequence */ + \ + 1 /* csr_flags */ + \ + decode_channel_attrs_maxsz + \ + decode_channel_attrs_maxsz) +#define encode_bind_conn_to_session_maxsz (op_encode_hdr_maxsz + \ + /* bctsa_sessid */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* bctsa_dir */ + \ + 1 /* bctsa_use_conn_in_rdma_mode */) +#define decode_bind_conn_to_session_maxsz (op_decode_hdr_maxsz + \ + /* bctsr_sessid */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* bctsr_dir */ + \ + 1 /* bctsr_use_conn_in_rdma_mode */) +#define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4) +#define decode_destroy_session_maxsz (op_decode_hdr_maxsz) +#define encode_destroy_clientid_maxsz (op_encode_hdr_maxsz + 2) +#define decode_destroy_clientid_maxsz (op_decode_hdr_maxsz) +#define encode_sequence_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) +#define decode_sequence_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) +#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) +#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ + 1 /* layout type */ + \ + 1 /* maxcount */ + \ + 1 /* bitmap size */ + \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap, word 0 */) +#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ + 1 /* layout type */ + \ + 1 /* opaque devaddr4 length */ + \ + /* devaddr4 payload is read into page */ \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap, word 0 */) +#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ + encode_stateid_maxsz) +#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + 1 /* reclaim */ + \ + encode_stateid_maxsz + \ + 1 /* new offset (true) */ + \ + 2 /* last byte written */ + \ + 1 /* nt_timechanged (false) */ + \ + 1 /* layoutupdate4 layout type */ + \ + 1 /* layoutupdate4 opaqueue len */) + /* the actual content of layoutupdate4 should + be allocated by drivers and spliced in + using xdr_write_pages */ +#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + 1 /* FIXME: opaque lrf_body always empty at the moment */) +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ + 1 + decode_stateid_maxsz) +#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) +#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz +#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1) +#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_free_stateid_maxsz (op_decode_hdr_maxsz) +#else /* CONFIG_NFS_V4_1 */ +#define encode_sequence_maxsz 0 +#define decode_sequence_maxsz 0 +#endif /* CONFIG_NFS_V4_1 */ + +#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ +#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ +#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_read_maxsz) +#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_read_maxsz) +#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_readlink_maxsz) +#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_readlink_maxsz) +#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_readdir_maxsz) +#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_readdir_maxsz) +#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_write_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_write_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_commit_maxsz) +#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_commit_maxsz) +#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_open_maxsz + \ + encode_access_maxsz + \ + encode_getfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_open_maxsz + \ + decode_access_maxsz + \ + decode_getfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_open_confirm_sz \ + (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_open_confirm_maxsz) +#define NFS4_dec_open_confirm_sz \ + (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_open_confirm_maxsz) +#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_open_maxsz + \ + encode_access_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_open_maxsz + \ + decode_access_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_open_downgrade_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_open_downgrade_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_downgrade_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_open_downgrade_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_close_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_close_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setattr_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setattr_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_renew_sz (compound_decode_hdr_maxsz + \ + decode_renew_maxsz) +#define NFS4_enc_setclientid_sz (compound_encode_hdr_maxsz + \ + encode_setclientid_maxsz) +#define NFS4_dec_setclientid_sz (compound_decode_hdr_maxsz + \ + decode_setclientid_maxsz) +#define NFS4_enc_setclientid_confirm_sz \ + (compound_encode_hdr_maxsz + \ + encode_setclientid_confirm_maxsz) +#define NFS4_dec_setclientid_confirm_sz \ + (compound_decode_hdr_maxsz + \ + decode_setclientid_confirm_maxsz) +#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lock_maxsz) +#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lock_maxsz) +#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lockt_maxsz) +#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lockt_maxsz) +#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_locku_maxsz) +#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) +#define NFS4_enc_release_lockowner_sz \ + (compound_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define NFS4_dec_release_lockowner_sz \ + (compound_decode_hdr_maxsz + \ + decode_lockowner_maxsz) +#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_access_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_access_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz + \ + decode_renew_maxsz) +#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lookup_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lookup_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_remove_maxsz) +#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_remove_maxsz) +#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_rename_maxsz) +#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_rename_maxsz) +#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_link_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_link_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_symlink_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_symlink_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_create_maxsz + \ + encode_getfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_create_maxsz + \ + decode_getfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_statfs_maxsz) +#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_statfs_maxsz) +#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_delegreturn_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_delegreturn_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getacl_maxsz) +#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getacl_maxsz) +#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setacl_maxsz) +#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setacl_maxsz) +#define NFS4_enc_fs_locations_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lookup_maxsz + \ + encode_fs_locations_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_fs_locations_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lookup_maxsz + \ + decode_fs_locations_maxsz + \ + decode_renew_maxsz) +#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_secinfo_maxsz) +#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_secinfo_maxsz) +#define NFS4_enc_fsid_present_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getfh_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_fsid_present_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getfh_maxsz + \ + decode_renew_maxsz) +#if defined(CONFIG_NFS_V4_1) +#define NFS4_enc_bind_conn_to_session_sz \ + (compound_encode_hdr_maxsz + \ + encode_bind_conn_to_session_maxsz) +#define NFS4_dec_bind_conn_to_session_sz \ + (compound_decode_hdr_maxsz + \ + decode_bind_conn_to_session_maxsz) +#define NFS4_enc_exchange_id_sz \ + (compound_encode_hdr_maxsz + \ + encode_exchange_id_maxsz) +#define NFS4_dec_exchange_id_sz \ + (compound_decode_hdr_maxsz + \ + decode_exchange_id_maxsz) +#define NFS4_enc_create_session_sz \ + (compound_encode_hdr_maxsz + \ + encode_create_session_maxsz) +#define NFS4_dec_create_session_sz \ + (compound_decode_hdr_maxsz + \ + decode_create_session_maxsz) +#define NFS4_enc_destroy_session_sz (compound_encode_hdr_maxsz + \ + encode_destroy_session_maxsz) +#define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \ + decode_destroy_session_maxsz) +#define NFS4_enc_destroy_clientid_sz (compound_encode_hdr_maxsz + \ + encode_destroy_clientid_maxsz) +#define NFS4_dec_destroy_clientid_sz (compound_decode_hdr_maxsz + \ + decode_destroy_clientid_maxsz) +#define NFS4_enc_sequence_sz \ + (compound_decode_hdr_maxsz + \ + encode_sequence_maxsz) +#define NFS4_dec_sequence_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz) +#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_get_lease_time_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_reclaim_complete_maxsz) +#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) +#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_getdeviceinfo_maxsz) +#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutget_maxsz) +#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_layoutcommit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutcommit_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutreturn_maxsz) +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz) +#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz +\ + encode_secinfo_no_name_maxsz) +#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_secinfo_no_name_maxsz) +#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_test_stateid_maxsz) +#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_test_stateid_maxsz) +#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_free_stateid_maxsz) +#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_free_stateid_maxsz) + +const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + + encode_sequence_maxsz + + encode_putfh_maxsz + + encode_getattr_maxsz) * + XDR_UNIT); + +const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz) * + XDR_UNIT); + +const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz) * + XDR_UNIT); +EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead); +#endif /* CONFIG_NFS_V4_1 */ + +static const umode_t nfs_type2fmt[] = { + [NF4BAD] = 0, + [NF4REG] = S_IFREG, + [NF4DIR] = S_IFDIR, + [NF4BLK] = S_IFBLK, + [NF4CHR] = S_IFCHR, + [NF4LNK] = S_IFLNK, + [NF4SOCK] = S_IFSOCK, + [NF4FIFO] = S_IFIFO, + [NF4ATTRDIR] = 0, + [NF4NAMEDATTR] = 0, +}; + +struct compound_hdr { + int32_t status; + uint32_t nops; + __be32 * nops_p; + uint32_t taglen; + char * tag; + uint32_t replen; /* expected reply words */ + u32 minorversion; +}; + +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} + +static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, len); + xdr_encode_opaque_fixed(p, buf, len); +} + +static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) +{ + __be32 *p; + + p = reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, str, len); +} + +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(n); +} + +static void encode_uint64(struct xdr_stream *xdr, u64 n) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + xdr_encode_hyper(p, n); +} + +static void encode_nfs4_seqid(struct xdr_stream *xdr, + const struct nfs_seqid *seqid) +{ + if (seqid != NULL) + encode_uint32(xdr, seqid->sequence->counter); + else + encode_uint32(xdr, 0); +} + +static void encode_compound_hdr(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct compound_hdr *hdr) +{ + __be32 *p; + struct rpc_auth *auth = req->rq_cred->cr_auth; + + /* initialize running count of expected bytes in reply. + * NOTE: the replied tag SHOULD be the same is the one sent, + * but this is not required as a MUST for the server to do so. */ + hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; + + WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); + encode_string(xdr, hdr->taglen, hdr->tag); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(hdr->minorversion); + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); +} + +static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op, + uint32_t replen, + struct compound_hdr *hdr) +{ + encode_uint32(xdr, op); + hdr->nops++; + hdr->replen += replen; +} + +static void encode_nops(struct compound_hdr *hdr) +{ + WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS); + *hdr->nops_p = htonl(hdr->nops); +} + +static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid) +{ + encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); +} + +static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ + encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); +} + +static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, + const struct nfs4_label *label, + const struct nfs_server *server) +{ + char owner_name[IDMAP_NAMESZ]; + char owner_group[IDMAP_NAMESZ]; + int owner_namelen = 0; + int owner_grouplen = 0; + __be32 *p; + unsigned i; + uint32_t len = 0; + uint32_t bmval_len; + uint32_t bmval[3] = { 0 }; + + /* + * We reserve enough space to write the entire attribute buffer at once. + * In the worst-case, this would be + * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 40 bytes, plus any contribution from variable-length fields + * such as owner/group. + */ + if (iap->ia_valid & ATTR_SIZE) { + bmval[0] |= FATTR4_WORD0_SIZE; + len += 8; + } + if (iap->ia_valid & ATTR_MODE) { + bmval[1] |= FATTR4_WORD1_MODE; + len += 4; + } + if (iap->ia_valid & ATTR_UID) { + owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); + if (owner_namelen < 0) { + dprintk("nfs: couldn't resolve uid %d to string\n", + from_kuid(&init_user_ns, iap->ia_uid)); + /* XXX */ + strcpy(owner_name, "nobody"); + owner_namelen = sizeof("nobody") - 1; + /* goto out; */ + } + bmval[1] |= FATTR4_WORD1_OWNER; + len += 4 + (XDR_QUADLEN(owner_namelen) << 2); + } + if (iap->ia_valid & ATTR_GID) { + owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); + if (owner_grouplen < 0) { + dprintk("nfs: couldn't resolve gid %d to string\n", + from_kgid(&init_user_ns, iap->ia_gid)); + strcpy(owner_group, "nobody"); + owner_grouplen = sizeof("nobody") - 1; + /* goto out; */ + } + bmval[1] |= FATTR4_WORD1_OWNER_GROUP; + len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); + } + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 16; + } else if (iap->ia_valid & ATTR_ATIME) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 4; + } + if (iap->ia_valid & ATTR_MTIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 16; + } else if (iap->ia_valid & ATTR_MTIME) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 4; + } + if (label) { + len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); + bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; + } + + if (bmval[2] != 0) + bmval_len = 3; + else if (bmval[1] != 0) + bmval_len = 2; + else + bmval_len = 1; + + p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len); + + *p++ = cpu_to_be32(bmval_len); + for (i = 0; i < bmval_len; i++) + *p++ = cpu_to_be32(bmval[i]); + *p++ = cpu_to_be32(len); + + if (bmval[0] & FATTR4_WORD0_SIZE) + p = xdr_encode_hyper(p, iap->ia_size); + if (bmval[1] & FATTR4_WORD1_MODE) + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); + if (bmval[1] & FATTR4_WORD1_OWNER) + p = xdr_encode_opaque(p, owner_name, owner_namelen); + if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) + p = xdr_encode_opaque(p, owner_group, owner_grouplen); + if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + if (iap->ia_valid & ATTR_ATIME_SET) { + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); + *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + } else + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); + } + if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + if (iap->ia_valid & ATTR_MTIME_SET) { + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + } else + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); + } + if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + *p++ = cpu_to_be32(label->lfs); + *p++ = cpu_to_be32(label->pi); + *p++ = cpu_to_be32(label->len); + p = xdr_encode_opaque_fixed(p, label->label, label->len); + } + +/* out: */ +} + +static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr); + encode_uint32(xdr, access); +} + +static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); + encode_nfs4_seqid(xdr, arg->seqid); + encode_nfs4_stateid(xdr, &arg->stateid); +} + +static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr); + encode_uint32(xdr, create->ftype); + + switch (create->ftype) { + case NF4LNK: + p = reserve_space(xdr, 4); + *p = cpu_to_be32(create->u.symlink.len); + xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); + break; + + case NF4BLK: case NF4CHR: + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p = cpu_to_be32(create->u.device.specdata2); + break; + + default: + break; + } + + encode_string(xdr, create->name->len, create->name->name); + encode_attrs(xdr, create->attrs, create->label, create->server); +} + +static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bitmap); +} + +static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); +} + +static void +encode_getattr_three(struct xdr_stream *xdr, + uint32_t bm0, uint32_t bm1, uint32_t bm2, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); + if (bm2) { + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bm0); + *p++ = cpu_to_be32(bm1); + *p = cpu_to_be32(bm2); + } else if (bm1) { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); + } else { + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bm0); + } +} + +static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], + bitmask[1] & nfs4_fattr_bitmap[1], + bitmask[2] & nfs4_fattr_bitmap[2], + hdr); +} + +static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, + const u32 *open_bitmap, + struct compound_hdr *hdr) +{ + encode_getattr_three(xdr, + bitmask[0] & open_bitmap[0], + bitmask[1] & open_bitmap[1], + bitmask[2] & open_bitmap[2], + hdr); +} + +static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_three(xdr, + bitmask[0] & nfs4_fsinfo_bitmap[0], + bitmask[1] & nfs4_fsinfo_bitmap[1], + bitmask[2] & nfs4_fsinfo_bitmap[2], + hdr); +} + +static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], + bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); +} + +static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr); +} + +static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr); + encode_string(xdr, name->len, name->name); +} + +static inline int nfs4_lock_type(struct file_lock *fl, int block) +{ + if (fl->fl_type == F_RDLCK) + return block ? NFS4_READW_LT : NFS4_READ_LT; + return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT; +} + +static inline uint64_t nfs4_lock_length(struct file_lock *fl) +{ + if (fl->fl_end == OFFSET_MAX) + return ~(uint64_t)0; + return fl->fl_end - fl->fl_start + 1; +} + +static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) +{ + __be32 *p; + + p = reserve_space(xdr, 32); + p = xdr_encode_hyper(p, lowner->clientid); + *p++ = cpu_to_be32(20); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + *p++ = cpu_to_be32(lowner->s_dev); + xdr_encode_hyper(p, lowner->id); +} + +/* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 + */ +static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr); + p = reserve_space(xdr, 28); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ + encode_nfs4_seqid(xdr, args->open_seqid); + encode_nfs4_stateid(xdr, &args->open_stateid); + encode_nfs4_seqid(xdr, args->lock_seqid); + encode_lockowner(xdr, &args->lock_owner); + } + else { + encode_nfs4_stateid(xdr, &args->lock_stateid); + encode_nfs4_seqid(xdr, args->lock_seqid); + } +} + +static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr); + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + encode_lockowner(xdr, &args->lock_owner); +} + +static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); + encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); + encode_nfs4_seqid(xdr, args->seqid); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); +} + +static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr); + encode_lockowner(xdr, lowner); +} + +static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr); + encode_string(xdr, name->len, name->name); +} + +static void encode_share_access(struct xdr_stream *xdr, u32 share_access) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(share_access); + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ +} + +static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + __be32 *p; + /* + * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, + * owner 4 = 32 + */ + encode_nfs4_seqid(xdr, arg->seqid); + encode_share_access(xdr, arg->share_access); + p = reserve_space(xdr, 36); + p = xdr_encode_hyper(p, arg->clientid); + *p++ = cpu_to_be32(24); + p = xdr_encode_opaque_fixed(p, "open id:", 8); + *p++ = cpu_to_be32(arg->server->s_dev); + *p++ = cpu_to_be32(arg->id.uniquifier); + xdr_encode_hyper(p, arg->id.create_time); +} + +static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + struct iattr dummy; + __be32 *p; + + p = reserve_space(xdr, 4); + switch(arg->createmode) { + case NFS4_CREATE_UNCHECKED: + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + break; + case NFS4_CREATE_GUARDED: + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + break; + case NFS4_CREATE_EXCLUSIVE: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->label, arg->server); + } +} + +static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + switch (arg->open_flags & O_CREAT) { + case 0: + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); + break; + default: + *p = cpu_to_be32(NFS4_OPEN_CREATE); + encode_createmode(xdr, arg); + } +} + +static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + switch (delegation_type) { + case 0: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); + break; + case FMODE_READ: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); + break; + case FMODE_WRITE|FMODE_READ: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); + break; + default: + BUG(); + } +} + +static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); + encode_string(xdr, name->len, name->name); +} + +static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); + encode_delegation_type(xdr, type); +} + +static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + encode_nfs4_stateid(xdr, stateid); + encode_string(xdr, name->len, name->name); +} + +static inline void encode_claim_fh(struct xdr_stream *xdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_FH); +} + +static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH); + encode_nfs4_stateid(xdr, stateid); +} + +static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr); + encode_openhdr(xdr, arg); + encode_opentype(xdr, arg); + switch (arg->claim) { + case NFS4_OPEN_CLAIM_NULL: + encode_claim_null(xdr, arg->name); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + encode_claim_previous(xdr, arg->u.delegation_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); + break; + case NFS4_OPEN_CLAIM_FH: + encode_claim_fh(xdr); + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + encode_claim_delegate_cur_fh(xdr, &arg->u.delegation); + break; + default: + BUG(); + } +} + +static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr); + encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_seqid(xdr, arg->seqid); +} + +static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); + encode_nfs4_stateid(xdr, &arg->stateid); + encode_nfs4_seqid(xdr, arg->seqid); + encode_share_access(xdr, arg->share_access); +} + +static void +encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr); + encode_string(xdr, fh->size, fh->data); +} + +static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); +} + +static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->stateid); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) +{ + uint32_t attrs[3] = { + FATTR4_WORD0_RDATTR_ERROR, + FATTR4_WORD1_MOUNTED_ON_FILEID, + }; + uint32_t dircount = readdir->count >> 1; + __be32 *p, verf[2]; + uint32_t attrlen = 0; + unsigned int i; + + if (readdir->plus) { + attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| + FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| + FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| + FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| + FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; + dircount >>= 1; + } + /* Use mounted_on_fileid only if the server supports it */ + if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) + attrs[0] |= FATTR4_WORD0_FILEID; + for (i = 0; i < ARRAY_SIZE(attrs); i++) { + attrs[i] &= readdir->bitmask[i]; + if (attrs[i] != 0) + attrlen = i+1; + } + + encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); + encode_uint64(xdr, readdir->cookie); + encode_nfs4_verifier(xdr, &readdir->verifier); + p = reserve_space(xdr, 12 + (attrlen << 2)); + *p++ = cpu_to_be32(dircount); + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(attrlen); + for (i = 0; i < attrlen; i++) + *p++ = cpu_to_be32(attrs[i]); + memcpy(verf, readdir->verifier.data, sizeof(verf)); + + dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n", + __func__, + (unsigned long long)readdir->cookie, + verf[0], verf[1], + attrs[0] & readdir->bitmask[0], + attrs[1] & readdir->bitmask[1], + attrs[2] & readdir->bitmask[2]); +} + +static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr); +} + +static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr); + encode_string(xdr, name->len, name->name); +} + +static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); +} + +static void encode_renew(struct xdr_stream *xdr, clientid4 clid, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr); + encode_uint64(xdr, clid); +} + +static void +encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr); +} + +static void +encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); + encode_nfs4_stateid(xdr, &zero_stateid); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->acl_len); + xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); +} + +static void +encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr); +} + +static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); + encode_nfs4_stateid(xdr, &arg->stateid); + encode_attrs(xdr, arg->iap, arg->label, server); +} + +static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); + encode_nfs4_verifier(xdr, setclientid->sc_verifier); + + encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_prog); + encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); + encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_cb_ident); +} + +static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM, + decode_setclientid_confirm_maxsz, hdr); + encode_uint64(xdr, arg->clientid); + encode_nfs4_verifier(xdr, &arg->confirm); +} + +static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->stateid); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr); + encode_nfs4_stateid(xdr, stateid); +} + +static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr); + encode_string(xdr, name->len, name->name); +} + +#if defined(CONFIG_NFS_V4_1) +/* NFSv4.1 operations */ +static void encode_bind_conn_to_session(struct xdr_stream *xdr, + struct nfs41_bind_conn_to_session_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION, + decode_bind_conn_to_session_maxsz, hdr); + encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN); + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(args->dir); + *p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0); +} + +static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) +{ + unsigned int i; + encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS); + for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) + encode_uint32(xdr, op_map->u.words[i]); +} + +static void encode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + char impl_name[IMPL_NAME_LIMIT]; + int len = 0; + + encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); + encode_nfs4_verifier(xdr, args->verifier); + + encode_string(xdr, args->id_len, args->id); + + encode_uint32(xdr, args->flags); + encode_uint32(xdr, args->state_protect.how); + + switch (args->state_protect.how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + encode_op_map(xdr, &args->state_protect.enforce); + encode_op_map(xdr, &args->state_protect.allow); + break; + default: + WARN_ON_ONCE(1); + break; + } + + if (send_implementation_id && + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) + <= sizeof(impl_name) + 1) + len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", + utsname()->sysname, utsname()->release, + utsname()->version, utsname()->machine); + + if (len > 0) { + encode_uint32(xdr, 1); /* implementation id array length=1 */ + + encode_string(xdr, + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1, + CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN); + encode_string(xdr, len, impl_name); + /* just send zeros for nii_date - the date is in nii_name */ + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, 0); + *p = cpu_to_be32(0); + } else + encode_uint32(xdr, 0); /* implementation id array length=0 */ +} + +static void encode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + struct nfs_client *clp = args->client; + struct rpc_clnt *clnt = clp->cl_rpcclient; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + u32 max_resp_sz_cached; + + /* + * Assumes OPEN is the biggest non-idempotent compound. + * 2 is the verifier. + */ + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + + RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; + + encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); + p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12); + p = xdr_encode_hyper(p, args->clientid); + *p++ = cpu_to_be32(args->seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ + + /* Fore Channel */ + *p++ = cpu_to_be32(0); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + /* Back Channel */ + *p++ = cpu_to_be32(0); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + *p++ = cpu_to_be32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ + + /* authsys_parms rfc1831 */ + *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ + p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p = cpu_to_be32(0); /* No more gids */ +} + +static void encode_destroy_session(struct xdr_stream *xdr, + struct nfs4_session *session, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr); + encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); +} + +static void encode_destroy_clientid(struct xdr_stream *xdr, + uint64_t clientid, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr); + encode_uint64(xdr, clientid); +} + +static void encode_reclaim_complete(struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr); + encode_uint32(xdr, args->one_fs); +} +#endif /* CONFIG_NFS_V4_1 */ + +static void encode_sequence(struct xdr_stream *xdr, + const struct nfs4_sequence_args *args, + struct compound_hdr *hdr) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session; + struct nfs4_slot_table *tp; + struct nfs4_slot *slot = args->sa_slot; + __be32 *p; + + tp = slot->table; + session = tp->session; + if (!session) + return; + + encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); + + /* + * Sessionid + seqid + slotid + max slotid + cache_this + */ + dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d " + "max_slotid=%d cache_this=%d\n", + __func__, + ((u32 *)session->sess_id.data)[0], + ((u32 *)session->sess_id.data)[1], + ((u32 *)session->sess_id.data)[2], + ((u32 *)session->sess_id.data)[3], + slot->seq_nr, slot->slot_nr, + tp->highest_used_slotid, args->sa_cache_this); + p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(slot->slot_nr); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p = cpu_to_be32(args->sa_cache_this); +#endif /* CONFIG_NFS_V4_1 */ +} + +#ifdef CONFIG_NFS_V4_1 +static void +encode_getdeviceinfo(struct xdr_stream *xdr, + const struct nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); + p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4); + p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(args->pdev->layout_type); + *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ + + p = reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(args->notify_types); +} + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr); + p = reserve_space(xdr, 36); + *p++ = cpu_to_be32(0); /* Signal layout available */ + *p++ = cpu_to_be32(args->type); + *p++ = cpu_to_be32(args->range.iomode); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + p = xdr_encode_hyper(p, args->minlength); + encode_nfs4_stateid(xdr, &args->stateid); + encode_uint32(xdr, args->maxcount); + + dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", + __func__, + args->type, + args->range.iomode, + (unsigned long)args->range.offset, + (unsigned long)args->range.length, + args->maxcount); +} + +static int +encode_layoutcommit(struct xdr_stream *xdr, + struct inode *inode, + struct nfs4_layoutcommit_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, + NFS_SERVER(args->inode)->pnfs_curr_ld->id); + + encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr); + p = reserve_space(xdr, 20); + /* Only whole file layouts */ + p = xdr_encode_hyper(p, 0); /* offset */ + p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ + *p = cpu_to_be32(0); /* reclaim */ + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ + *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ + + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { + NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( + NFS_I(inode)->layout, xdr, args); + } else { + encode_uint32(xdr, args->layoutupdate_len); + if (args->layoutupdate_pages) { + xdr_write_pages(xdr, args->layoutupdate_pages, 0, + args->layoutupdate_len); + } + } + + return 0; +} + +static void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ + *p++ = cpu_to_be32(args->layout_type); + *p++ = cpu_to_be32(args->range.iomode); + *p = cpu_to_be32(RETURN_FILE); + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + spin_lock(&args->inode->i_lock); + encode_nfs4_stateid(xdr, &args->stateid); + spin_unlock(&args->inode->i_lock); + if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { + NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( + NFS_I(args->inode)->layout, xdr, args); + } else + encode_uint32(xdr, 0); +} + +static int +encode_secinfo_no_name(struct xdr_stream *xdr, + const struct nfs41_secinfo_no_name_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr); + encode_uint32(xdr, args->style); + return 0; +} + +static void encode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr); + encode_uint32(xdr, 1); + encode_nfs4_stateid(xdr, args->stateid); +} + +static void encode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->stateid); +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * END OF "GENERIC" ENCODE ROUTINES. + */ + +static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session = args->sa_slot->table->session; + if (session) + return session->clp->cl_mvops->minor_version; +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +/* + * Encode an ACCESS request + */ +static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_accessargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_access(xdr, args->access, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LOOKUP request + */ +static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_lookup_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LOOKUP_ROOT request + */ +static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_lookup_root_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode REMOVE request + */ +static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_remove(xdr, &args->name, &hdr); + encode_nops(&hdr); +} + +/* + * Encode RENAME request + */ +static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->old_dir, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->new_dir, &hdr); + encode_rename(xdr, args->old_name, args->new_name, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LINK request + */ +static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_link_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_link(xdr, args->name, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode CREATE request + */ +static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_create(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode SYMLINK request + */ +static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) +{ + nfs4_xdr_enc_create(req, xdr, args); +} + +/* + * Encode GETATTR request + */ +static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_getattr_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a CLOSE request + */ +static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_closeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_close(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN request + */ +static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_openargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + if (args->access) + encode_access(xdr, args->access, &hdr); + encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN_CONFIRM request + */ +static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_open_confirmargs *args) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_confirm(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN request with no attributes. + */ +static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_openargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open(xdr, args, &hdr); + if (args->access) + encode_access(xdr, args->access, &hdr); + encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN_DOWNGRADE request + */ +static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_closeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_downgrade(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCK request + */ +static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lock_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lock(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCKT request + */ +static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lockt_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lockt(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCKU request + */ +static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_locku_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_locku(xdr, args, &hdr); + encode_nops(&hdr); +} + +static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_release_lockowner_args *args) +{ + struct compound_hdr hdr = { + .minorversion = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_release_lockowner(xdr, &args->lock_owner, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a READLINK request + */ +static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readlink *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readlink(xdr, args, req, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, + args->pgbase, args->pglen); + encode_nops(&hdr); +} + +/* + * Encode a READDIR request + */ +static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readdir_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readdir(xdr, args, req, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, + args->pgbase, args->count); + dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", + __func__, hdr.replen << 2, args->pages, + args->pgbase, args->count); + encode_nops(&hdr); +} + +/* + * Encode a READ request + */ +static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_read(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->pages, args->pgbase, args->count); + req->rq_rcv_buf.flags |= XDRBUF_READ; + encode_nops(&hdr); +} + +/* + * Encode an SETATTR request + */ +static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setattrargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setattr(xdr, args, args->server, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a GETACL request + */ +static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_getaclargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 1; + encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, + args->acl_pages, args->acl_pgbase, args->acl_len); + + encode_nops(&hdr); +} + +/* + * Encode a WRITE request + */ +static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_write(xdr, args, &hdr); + req->rq_snd_buf.flags |= XDRBUF_WRITE; + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * a COMMIT request + */ +static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_commitargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_commit(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * FSINFO request + */ +static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_fsinfo(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * a PATHCONF request + */ +static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_pathconf_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], + &hdr); + encode_nops(&hdr); +} + +/* + * a STATFS request + */ +static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_statfs_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], + args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_nops(&hdr); +} + +/* + * GETATTR_BITMAP request + */ +static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| + FATTR4_WORD0_FH_EXPIRE_TYPE| + FATTR4_WORD0_LINK_SUPPORT| + FATTR4_WORD0_SYMLINK_SUPPORT| + FATTR4_WORD0_ACLSUPPORT, &hdr); + encode_nops(&hdr); +} + +/* + * a RENEW request + */ +static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_client *clp) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_renew(xdr, clp->cl_clientid, &hdr); + encode_nops(&hdr); +} + +/* + * a SETCLIENTID request + */ +static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid *sc) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid(xdr, sc, &hdr); + encode_nops(&hdr); +} + +/* + * a SETCLIENTID_CONFIRM request + */ +static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *arg) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid_confirm(xdr, arg, &hdr); + encode_nops(&hdr); +} + +/* + * DELEGRETURN request + */ +static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_delegreturnargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_delegreturn(xdr, args->stateid, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FS_LOCATIONS request + */ +static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + if (args->migration) { + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + } else { + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + } + + /* Set up reply kvec to capture returned fs_locations array. */ + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, + 0, PAGE_SIZE); + encode_nops(&hdr); +} + +/* + * Encode SECINFO request + */ +static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_secinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_secinfo(xdr, args->name, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FSID_PRESENT request + */ +static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fsid_present_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfh(xdr, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + encode_nops(&hdr); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * BIND_CONN_TO_SESSION request + */ +static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_bind_conn_to_session_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_bind_conn_to_session(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * EXCHANGE_ID request + */ +static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_exchange_id(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a CREATE_SESSION request + */ +static void nfs4_xdr_enc_create_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_create_session_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_create_session(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a DESTROY_SESSION request + */ +static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_session *session) +{ + struct compound_hdr hdr = { + .minorversion = session->clp->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_destroy_session(xdr, session, &hdr); + encode_nops(&hdr); +} + +/* + * a DESTROY_CLIENTID request + */ +static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_client *clp) +{ + struct compound_hdr hdr = { + .minorversion = clp->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_destroy_clientid(xdr, clp->cl_clientid, &hdr); + encode_nops(&hdr); +} + +/* + * a SEQUENCE request + */ +static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_sequence_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a GET_LEASE_TIME request + */ +static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_get_lease_time_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->la_seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +} + +/* + * a RECLAIM_COMPLETE request + */ +static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args) + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_reclaim_complete(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode GETDEVICEINFO request + */ +static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(xdr, args, &hdr); + + /* set up reply kvec. Subtract notification bitmap max size (2) + * so that notification bitmap is put in xdr_buf tail */ + xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, + args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen); + + encode_nops(&hdr); +} + +/* + * Encode LAYOUTGET request + */ +static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutget_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->layout.pages, 0, args->layout.pglen); + + encode_nops(&hdr); +} + +/* + * Encode LAYOUTCOMMIT request + */ +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_args *args) +{ + struct nfs4_layoutcommit_data *data = + container_of(args, struct nfs4_layoutcommit_data, args); + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutcommit(xdr, data->args.inode, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LAYOUTRETURN request + */ +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutreturn(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode SECINFO_NO_NAME request + */ +static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_secinfo_no_name_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_secinfo_no_name(xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode TEST_STATEID request + */ +static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_test_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FREE_STATEID request + */ +static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_free_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} +#endif /* CONFIG_NFS_V4_1 */ + +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + +static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; + *string = (char *)p; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; + hdr->tag = (char *)p; + p += XDR_QUADLEN(hdr->taglen); + hdr->nops = be32_to_cpup(p); + if (unlikely(hdr->nops < 1)) + return nfs4_stat_to_errno(hdr->status); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, + int *nfs_retval) +{ + __be32 *p; + uint32_t opnum; + int32_t nfserr; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); + if (unlikely(opnum != expected)) + goto out_bad_operation; + nfserr = be32_to_cpup(p); + if (nfserr == NFS_OK) + *nfs_retval = 0; + else + *nfs_retval = nfs4_stat_to_errno(nfserr); + return true; +out_bad_operation: + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + *nfs_retval = -EREMOTEIO; + return false; +out_overflow: + print_overflow_msg(__func__, xdr); + *nfs_retval = -EIO; + return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + int retval; + + __decode_op_hdr(xdr, expected, &retval); + return retval; +} + +/* Dummy routine */ +static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) +{ + __be32 *p; + unsigned int strlen; + char *str; + + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + uint32_t bmlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + + bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); + if (bmlen > 1) { + bitmap[1] = be32_to_cpup(p++); + if (bmlen > 2) + bitmap[2] = be32_to_cpup(p); + } + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *attrlen = be32_to_cpup(p); + *savep = xdr_stream_pos(xdr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) +{ + if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else + bitmask[0] = bitmask[1] = bitmask[2] = 0; + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, + bitmask[0], bitmask[1], bitmask[2]); + return 0; +} + +static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) +{ + __be32 *p; + int ret = 0; + + *type = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); + if (*type < NF4REG || *type > NF4NAMEDATTR) { + dprintk("%s: bad type %d\n", __func__, *type); + return -EIO; + } + bitmap[0] &= ~FATTR4_WORD0_TYPE; + ret = NFS_ATTR_FATTR_TYPE; + } + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fh_expire_type(struct xdr_stream *xdr, + uint32_t *bitmap, uint32_t *type) +{ + __be32 *p; + + *type = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; + } + dprintk("%s: expire type=0x%x\n", __func__, *type); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) +{ + __be32 *p; + int ret = 0; + + *change = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, change); + bitmap[0] &= ~FATTR4_WORD0_CHANGE; + ret = NFS_ATTR_FATTR_CHANGE; + } + dprintk("%s: change attribute=%Lu\n", __func__, + (unsigned long long)*change); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) +{ + __be32 *p; + int ret = 0; + + *size = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, size); + bitmap[0] &= ~FATTR4_WORD0_SIZE; + ret = NFS_ATTR_FATTR_SIZE; + } + dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; + } + dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; + } + dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) +{ + __be32 *p; + int ret = 0; + + fsid->major = 0; + fsid->minor = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &fsid->major); + xdr_decode_hyper(p, &fsid->minor); + bitmap[0] &= ~FATTR4_WORD0_FSID; + ret = NFS_ATTR_FATTR_FSID; + } + dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, + (unsigned long long)fsid->major, + (unsigned long long)fsid->minor); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 60; + if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; + } + dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) +{ + __be32 *p; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + *res = -be32_to_cpup(p); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) +{ + __be32 *p; + int len; + + if (fh != NULL) + memset(fh, 0, sizeof(*fh)); + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (fh != NULL) { + memcpy(fh->data, p, len); + fh->size = len; + } + bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; + } + dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) +{ + __be32 *p; + int ret = 0; + + *fileid = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); + bitmap[0] &= ~FATTR4_WORD0_FILEID; + ret = NFS_ATTR_FATTR_FILEID; + } + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) +{ + __be32 *p; + int ret = 0; + + *fileid = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); + bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; + } + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; + } + dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; + } + dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; + } + dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) +{ + u32 n; + __be32 *p; + int status = 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); + if (n == 0) + goto root_path; + dprintk("pathname4: "); + if (n > NFS4_PATHNAME_MAXCOMPONENTS) { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) { + struct nfs4_string *component = &path->components[path->ncomponents]; + status = decode_opaque_inline(xdr, &component->len, &component->data); + if (unlikely(status != 0)) + goto out_eio; + ifdebug (XDR) + pr_cont("%s%.*s ", + (path->ncomponents != n ? "/ " : ""), + component->len, component->data); + } +out: + return status; +root_path: +/* a root pathname is sent as a zero component4 */ + path->ncomponents = 1; + path->components[0].len=0; + path->components[0].data=NULL; + dprintk("pathname4: /\n"); + goto out; +out_eio: + dprintk(" status %d", status); + status = -EIO; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) +{ + int n; + __be32 *p; + int status = -EIO; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U))) + goto out; + status = 0; + if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) + goto out; + status = -EIO; + /* Ignore borken servers that return unrequested attrs */ + if (unlikely(res == NULL)) + goto out; + dprintk("%s: fsroot:\n", __func__); + status = decode_pathname(xdr, &res->fs_path); + if (unlikely(status != 0)) + goto out; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); + if (n <= 0) + goto out_eio; + for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { + u32 m; + struct nfs4_fs_location *loc; + + if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES) + break; + loc = &res->locations[res->nlocations]; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + m = be32_to_cpup(p); + + dprintk("%s: servers:\n", __func__); + for (loc->nservers = 0; loc->nservers < m; loc->nservers++) { + struct nfs4_string *server; + + if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) { + unsigned int i; + dprintk("%s: using first %u of %u servers " + "returned for location %u\n", + __func__, + NFS4_FS_LOCATION_MAXSERVERS, + m, res->nlocations); + for (i = loc->nservers; i < m; i++) { + unsigned int len; + char *data; + status = decode_opaque_inline(xdr, &len, &data); + if (unlikely(status != 0)) + goto out_eio; + } + break; + } + server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); + } + status = decode_pathname(xdr, &loc->rootpath); + if (unlikely(status != 0)) + goto out_eio; + } + if (res->nlocations != 0) + status = NFS_ATTR_FATTR_V4_LOCATIONS; +out: + dprintk("%s: fs_locations done, error = %d\n", __func__, status); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); +out_eio: + status = -EIO; + goto out; +} + +static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; + } + dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) +{ + __be32 *p; + int status = 0; + + *maxlink = 1; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxlink = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_MAXLINK; + } + dprintk("%s: maxlink=%u\n", __func__, *maxlink); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) +{ + __be32 *p; + int status = 0; + + *maxname = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxname = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_MAXNAME; + } + dprintk("%s: maxname=%u\n", __func__, *maxname); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + int status = 0; + + *res = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXREAD - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { + uint64_t maxread; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxread); + if (maxread > 0x7FFFFFFF) + maxread = 0x7FFFFFFF; + *res = (uint32_t)maxread; + bitmap[0] &= ~FATTR4_WORD0_MAXREAD; + } + dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + int status = 0; + + *res = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXWRITE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { + uint64_t maxwrite; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxwrite); + if (maxwrite > 0x7FFFFFFF) + maxwrite = 0x7FFFFFFF; + *res = (uint32_t)maxwrite; + bitmap[0] &= ~FATTR4_WORD0_MAXWRITE; + } + dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) +{ + uint32_t tmp; + __be32 *p; + int ret = 0; + + *mode = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + tmp = be32_to_cpup(p); + *mode = tmp & ~S_IFMT; + bitmap[1] &= ~FATTR4_WORD1_MODE; + ret = NFS_ATTR_FATTR_MODE; + } + dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) +{ + __be32 *p; + int ret = 0; + + *nlink = 1; + if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *nlink = be32_to_cpup(p); + bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; + ret = NFS_ATTR_FATTR_NLINK; + } + dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, + const struct nfs_server *server, kuid_t *uid, + struct nfs4_string *owner_name) +{ + uint32_t len; + __be32 *p; + int ret = 0; + + *uid = make_kuid(&init_user_ns, -2); + if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (owner_name != NULL) { + owner_name->data = kmemdup(p, len, GFP_NOWAIT); + if (owner_name->data != NULL) { + owner_name->len = len; + ret = NFS_ATTR_FATTR_OWNER_NAME; + } + } else if (len < XDR_MAX_NETOBJ) { + if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) + ret = NFS_ATTR_FATTR_OWNER; + else + dprintk("%s: nfs_map_name_to_uid failed!\n", + __func__); + } else + dprintk("%s: name too long (%u)!\n", + __func__, len); + bitmap[1] &= ~FATTR4_WORD1_OWNER; + } + dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid)); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, + const struct nfs_server *server, kgid_t *gid, + struct nfs4_string *group_name) +{ + uint32_t len; + __be32 *p; + int ret = 0; + + *gid = make_kgid(&init_user_ns, -2); + if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (group_name != NULL) { + group_name->data = kmemdup(p, len, GFP_NOWAIT); + if (group_name->data != NULL) { + group_name->len = len; + ret = NFS_ATTR_FATTR_GROUP_NAME; + } + } else if (len < XDR_MAX_NETOBJ) { + if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) + ret = NFS_ATTR_FATTR_GROUP; + else + dprintk("%s: nfs_map_group_to_gid failed!\n", + __func__); + } else + dprintk("%s: name too long (%u)!\n", + __func__, len); + bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; + } + dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid)); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) +{ + uint32_t major = 0, minor = 0; + __be32 *p; + int ret = 0; + + *rdev = MKDEV(0,0); + if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { + dev_t tmp; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + major = be32_to_cpup(p++); + minor = be32_to_cpup(p); + tmp = MKDEV(major, minor); + if (MAJOR(tmp) == major && MINOR(tmp) == minor) + *rdev = tmp; + bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; + ret = NFS_ATTR_FATTR_RDEV; + } + dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; + } + dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; + } + dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; + } + dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) +{ + __be32 *p; + int ret = 0; + + *used = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, used); + bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; + ret = NFS_ATTR_FATTR_SPACE_USED; + } + dprintk("%s: space used=%Lu\n", __func__, + (unsigned long long)*used); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) +{ + __be32 *p; + uint64_t sec; + uint32_t nsec; + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &sec); + nsec = be32_to_cpup(p); + time->tv_sec = (time_t)sec; + time->tv_nsec = (long)nsec; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_ACCESS - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_ATIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; + } + dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_METADATA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_CTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; + } + dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, + struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) { + status = decode_attr_time(xdr, time); + bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; + } + dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, + (long)time->tv_nsec); + return status; +} + +static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_label *label) +{ + uint32_t pi = 0; + uint32_t lfs = 0; + __u32 len; + __be32 *p; + int status = 0; + + if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + lfs = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pi = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (len < NFS4_MAXLABELLEN) { + if (label) { + memcpy(label->label, p, len); + label->len = len; + label->pi = pi; + label->lfs = lfs; + status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; + } + bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + } else + printk(KERN_WARNING "%s: label too long (%u)!\n", + __func__, len); + } + if (label && label->label) + dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, + (char *)label->label, label->len, label->pi, label->lfs); + return status; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_MODIFY - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_MTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; + } + dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen) +{ + unsigned int attrwords = XDR_QUADLEN(attrlen); + unsigned int nwords = (xdr_stream_pos(xdr) - savep) >> 2; + + if (unlikely(attrwords != nwords)) { + dprintk("%s: server returned incorrect attribute length: " + "%u %c %u\n", + __func__, + attrwords << 2, + (attrwords < nwords) ? '<' : '>', + nwords << 2); + return -EIO; + } + return 0; +} + +static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + cinfo->atomic = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &cinfo->before); + xdr_decode_hyper(p, &cinfo->after); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) +{ + __be32 *p; + uint32_t supp, acc; + int status; + + status = decode_op_hdr(xdr, OP_ACCESS); + if (status) + return status; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p); + *supported = supp; + *access = acc; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_CLOSE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE); +} + +static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifier *verifier) +{ + return decode_opaque_fixed(xdr, verifier->data, NFS4_VERIFIER_SIZE); +} + +static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_COMMIT); + if (!status) + status = decode_write_verifier(xdr, &res->verf->verifier); + return status; +} + +static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + __be32 *p; + uint32_t bmlen; + int status; + + status = decode_op_hdr(xdr, OP_CREATE); + if (status) + return status; + if ((status = decode_change_info(xdr, cinfo))) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) +{ + unsigned int savep; + uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0) + goto xdr_error; + if ((status = decode_attr_fh_expire_type(xdr, bitmap, + &res->fh_expire_type)) != 0) + goto xdr_error; + if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0) + goto xdr_error; + if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) + goto xdr_error; + if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) + goto xdr_error; + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) +{ + unsigned int savep; + uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + if ((status = decode_attr_files_avail(xdr, bitmap, &fsstat->afiles)) != 0) + goto xdr_error; + if ((status = decode_attr_files_free(xdr, bitmap, &fsstat->ffiles)) != 0) + goto xdr_error; + if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0) + goto xdr_error; + if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0) + goto xdr_error; + if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0) + goto xdr_error; + if ((status = decode_attr_space_total(xdr, bitmap, &fsstat->tbytes)) != 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) +{ + unsigned int savep; + uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + if ((status = decode_attr_maxlink(xdr, bitmap, &pathconf->max_link)) != 0) + goto xdr_error; + if ((status = decode_attr_maxname(xdr, bitmap, &pathconf->max_namelen)) != 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_threshold_hint(struct xdr_stream *xdr, + uint32_t *bitmap, + uint64_t *res, + uint32_t hint_bit) +{ + __be32 *p; + + *res = 0; + if (likely(bitmap[0] & hint_bit)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_first_threshold_item4(struct xdr_stream *xdr, + struct nfs4_threshold *res) +{ + __be32 *p; + unsigned int savep; + uint32_t bitmap[3] = {0,}, attrlen; + int status; + + /* layout type */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) { + print_overflow_msg(__func__, xdr); + return -EIO; + } + res->l_type = be32_to_cpup(p); + + /* thi_hintset bitmap */ + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; + + /* thi_hintlist length */ + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) + goto xdr_error; + /* thi_hintlist */ + status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz, + THRESHOLD_RD_IO); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz, + THRESHOLD_WR_IO); + if (status < 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); + res->bm = bitmap[0]; + + dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", + __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz, + res->wr_io_sz); +xdr_error: + dprintk("%s ret=%d!\n", __func__, status); + return status; +} + +/* + * Thresholds on pNFS direct I/O vrs MDS I/O + */ +static int decode_attr_mdsthreshold(struct xdr_stream *xdr, + uint32_t *bitmap, + struct nfs4_threshold *res) +{ + __be32 *p; + int status = 0; + uint32_t num; + + if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U))) + return -EIO; + if (bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD) { + /* Did the server return an unrequested attribute? */ + if (unlikely(res == NULL)) + return -EREMOTEIO; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + if (num == 0) + return 0; + if (num > 1) + printk(KERN_INFO "%s: Warning: Multiple pNFS layout " + "drivers per filesystem not supported\n", + __func__); + + status = decode_first_threshold_item4(xdr, res); + bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD; + } + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_fattr *fattr, struct nfs_fh *fh, + struct nfs4_fs_locations *fs_loc, struct nfs4_label *label, + const struct nfs_server *server) +{ + int status; + umode_t fmode = 0; + uint32_t type; + int32_t err; + + status = decode_attr_type(xdr, bitmap, &type); + if (status < 0) + goto xdr_error; + fattr->mode = 0; + if (status != 0) { + fattr->mode |= nfs_type2fmt[type]; + fattr->valid |= status; + } + + status = decode_attr_change(xdr, bitmap, &fattr->change_attr); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_size(xdr, bitmap, &fattr->size); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + err = 0; + status = decode_attr_error(xdr, bitmap, &err); + if (status < 0) + goto xdr_error; + + status = decode_attr_filehandle(xdr, bitmap, fh); + if (status < 0) + goto xdr_error; + + status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_fs_locations(xdr, bitmap, fs_loc); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_mode(xdr, bitmap, &fmode); + if (status < 0) + goto xdr_error; + if (status != 0) { + fattr->mode |= fmode; + fattr->valid |= status; + } + + status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_access(xdr, bitmap, &fattr->atime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold); + if (status < 0) + goto xdr_error; + + if (label) { + status = decode_attr_security_label(xdr, bitmap, label); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + } + +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, + struct nfs4_label *label, const struct nfs_server *server) +{ + unsigned int savep; + uint32_t attrlen, + bitmap[3] = {0}; + int status; + + status = decode_op_hdr(xdr, OP_GETATTR); + if (status < 0) + goto xdr_error; + + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; + + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) + goto xdr_error; + + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, + label, server); + if (status < 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs4_label *label, const struct nfs_server *server) +{ + return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server); +} + +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server) +{ + return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server); +} + +/* + * Decode potentially multiple layout types. Currently we only support + * one layout driver per file system. + */ +static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, + uint32_t *layouttype) +{ + __be32 *p; + int num; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + + /* pNFS is not supported by the underlying file system */ + if (num == 0) { + *layouttype = 0; + return 0; + } + if (num > 1) + printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " + "drivers per filesystem not supported\n", __func__); + + /* Decode and set first layout type, move xdr->p past unused types */ + p = xdr_inline_decode(xdr, num * 4); + if (unlikely(!p)) + goto out_overflow; + *layouttype = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * The type of file system exported. + * Note we must ensure that layouttype is set in any non-error case. + */ +static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *layouttype) +{ + int status = 0; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); + if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) + return -EIO; + if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = decode_first_pnfs_layout_type(xdr, layouttype); + bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; + } else + *layouttype = 0; + return status; +} + +/* + * The prefered block size for layout directed io + */ +static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *res) +{ + __be32 *p; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); + *res = 0; + if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) { + print_overflow_msg(__func__, xdr); + return -EIO; + } + *res = be32_to_cpup(p); + bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; + } + return 0; +} + +static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) +{ + unsigned int savep; + uint32_t attrlen, bitmap[3]; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ + + if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) + goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) + goto xdr_error; + fsinfo->rtpref = fsinfo->dtpref = fsinfo->rtmax; + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; + status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); + if (status != 0) + goto xdr_error; + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status != 0) + goto xdr_error; + status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); + if (status) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + uint32_t len; + int status; + + /* Zero handle first to allow comparisons */ + memset(fh, 0, sizeof(*fh)); + + status = decode_op_hdr(xdr, OP_GETFH); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + fh->size = len; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_LINK); + if (status) + return status; + return decode_change_info(xdr, cinfo); +} + +/* + * We create the owner, so we know a proper owner.id length is 4. + */ +static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) +{ + uint64_t offset, length, clientid; + __be32 *p; + uint32_t namelen, type; + + p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ + p = xdr_decode_hyper(p, &length); + type = be32_to_cpup(p++); /* 4 byte read */ + if (fl != NULL) { /* manipulate file lock */ + fl->fl_start = (loff_t)offset; + fl->fl_end = fl->fl_start + (loff_t)length - 1; + if (length == ~(uint64_t)0) + fl->fl_end = OFFSET_MAX; + fl->fl_type = F_WRLCK; + if (type & 1) + fl->fl_type = F_RDLCK; + fl->fl_pid = 0; + } + p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ + namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ + p = xdr_inline_decode(xdr, namelen); /* variable size field */ + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_LOCK); + if (status == -EIO) + goto out; + if (status == 0) { + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; + } else if (status == -NFS4ERR_DENIED) + status = decode_lock_denied(xdr, NULL); + if (res->open_seqid != NULL) + nfs_increment_open_seqid(status, res->open_seqid); + nfs_increment_lock_seqid(status, res->lock_seqid); +out: + return status; +} + +static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) +{ + int status; + status = decode_op_hdr(xdr, OP_LOCKT); + if (status == -NFS4ERR_DENIED) + return decode_lock_denied(xdr, res->denied); + return status; +} + +static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_LOCKU); + if (status != -EIO) + nfs_increment_lock_seqid(status, res->seqid); + if (status == 0) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_release_lockowner(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); +} + +static int decode_lookup(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LOOKUP); +} + +/* This is too sick! */ +static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) +{ + __be32 *p; + uint32_t limit_type, nblocks, blocksize; + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + limit_type = be32_to_cpup(p++); + switch (limit_type) { + case 1: + xdr_decode_hyper(p, maxsize); + break; + case 2: + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); + *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_rw_delegation(struct xdr_stream *xdr, + uint32_t delegation_type, + struct nfs_openres *res) +{ + __be32 *p; + int status; + + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->do_recall = be32_to_cpup(p); + + switch (delegation_type) { + case NFS4_OPEN_DELEGATE_READ: + res->delegation_type = FMODE_READ; + break; + case NFS4_OPEN_DELEGATE_WRITE: + res->delegation_type = FMODE_WRITE|FMODE_READ; + if (decode_space_limit(xdr, &res->maxsize) < 0) + return -EIO; + } + return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t why_no_delegation; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + why_no_delegation = be32_to_cpup(p); + switch (why_no_delegation) { + case WND4_CONTENTION: + case WND4_RESOURCE: + xdr_inline_decode(xdr, 4); + /* Ignore for now */ + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t delegation_type; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); + res->delegation_type = 0; + switch (delegation_type) { + case NFS4_OPEN_DELEGATE_NONE: + return 0; + case NFS4_OPEN_DELEGATE_READ: + case NFS4_OPEN_DELEGATE_WRITE: + return decode_rw_delegation(xdr, delegation_type, res); + case NFS4_OPEN_DELEGATE_NONE_EXT: + return decode_no_delegation(xdr, res); + } + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t savewords, bmlen, i; + int status; + + if (!__decode_op_hdr(xdr, OP_OPEN, &status)) + return status; + nfs_increment_open_seqid(status, res->seqid); + if (status) + return status; + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + return status; + + decode_change_info(xdr, &res->cinfo); + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); + if (bmlen > 10) + goto xdr_error; + + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; + savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); + for (i = 0; i < savewords; ++i) + res->attrset[i] = be32_to_cpup(p++); + for (; i < NFS4_BITMAP_SIZE; i++) + res->attrset[i] = 0; + + return decode_delegation(xdr, res); +xdr_error: + dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_putfh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_PUTFH); +} + +static int decode_putrootfh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_PUTROOTFH); +} + +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs_pgio_res *res) +{ + __be32 *p; + uint32_t count, eof, recvd; + int status; + + status = decode_op_hdr(xdr, OP_READ); + if (status) + return status; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + eof = be32_to_cpup(p++); + count = be32_to_cpup(p); + recvd = xdr_read_pages(xdr, count); + if (count > recvd) { + dprintk("NFS: server cheating in read reply: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + } + res->eof = eof; + res->count = count; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) +{ + int status; + __be32 verf[2]; + + status = decode_op_hdr(xdr, OP_READDIR); + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) + return status; + memcpy(verf, readdir->verifier.data, sizeof(verf)); + dprintk("%s: verifier = %08x:%08x\n", + __func__, verf[0], verf[1]); + return xdr_read_pages(xdr, xdr->buf->page_len); +} + +static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) +{ + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + u32 len, recvd; + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_READLINK); + if (status) + return status; + + /* Convert length of symlink */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len >= rcvbuf->page_len || len <= 0) { + dprintk("nfs: server returned giant symlink!\n"); + return -ENAMETOOLONG; + } + recvd = xdr_read_pages(xdr, len); + if (recvd < len) { + dprintk("NFS: server cheating in readlink reply: " + "count %u > recvd %u\n", len, recvd); + return -EIO; + } + /* + * The XDR encode routine has set things up so that + * the link text will be copied directly into the + * buffer. We just have to do overflow-checking, + * and and null-terminate the text (the VFS expects + * null-termination). + */ + xdr_terminate_string(rcvbuf, len); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_REMOVE); + if (status) + goto out; + status = decode_change_info(xdr, cinfo); +out: + return status; +} + +static int decode_rename(struct xdr_stream *xdr, struct nfs4_change_info *old_cinfo, + struct nfs4_change_info *new_cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_RENAME); + if (status) + goto out; + if ((status = decode_change_info(xdr, old_cinfo))) + goto out; + status = decode_change_info(xdr, new_cinfo); +out: + return status; +} + +static int decode_renew(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RENEW); +} + +static int +decode_restorefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RESTOREFH); +} + +static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs_getaclres *res) +{ + unsigned int savep; + uint32_t attrlen, + bitmap[3] = {0}; + int status; + unsigned int pg_offset; + + res->acl_len = 0; + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto out; + + xdr_enter_page(xdr, xdr->buf->page_len); + + /* Calculate the offset of the page data */ + pg_offset = xdr->buf->head[0].iov_len; + + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto out; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto out; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { + + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset; + res->acl_len = attrlen; + + /* Check for receive buffer overflow */ + if (res->acl_len > (xdr->nwords << 2) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; + dprintk("NFS: acl reply: attrlen %u > page_len %u\n", + attrlen, xdr->nwords << 2); + } + } else + status = -EOPNOTSUPP; + +out: + return status; +} + +static int +decode_savefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_SAVEFH); +} + +static int decode_setattr(struct xdr_stream *xdr) +{ + __be32 *p; + uint32_t bmlen; + int status; + + status = decode_op_hdr(xdr, OP_SETATTR); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) +{ + __be32 *p; + uint32_t opnum; + int32_t nfserr; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); + if (opnum != OP_SETCLIENTID) { + dprintk("nfs: decode_setclientid: Server returned operation" + " %d\n", opnum); + return -EIO; + } + nfserr = be32_to_cpup(p); + if (nfserr == NFS_OK) { + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->clientid); + memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); + } else if (nfserr == NFSERR_CLID_INUSE) { + uint32_t len; + + /* skip netid string */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + + /* skip uaddr string */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + return -NFSERR_CLID_INUSE; + } else + return nfs4_stat_to_errno(nfserr); + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_setclientid_confirm(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); +} + +static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_WRITE); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); + return decode_write_verifier(xdr, &res->verf->verifier); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_delegreturn(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_DELEGRETURN); +} + +static int decode_secinfo_gss(struct xdr_stream *xdr, + struct nfs4_secinfo4 *flavor) +{ + u32 oid_len; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + oid_len = be32_to_cpup(p); + if (oid_len > GSS_OID_MAX_LEN) + goto out_err; + + p = xdr_inline_decode(xdr, oid_len); + if (unlikely(!p)) + goto out_overflow; + memcpy(flavor->flavor_info.oid.data, p, oid_len); + flavor->flavor_info.oid.len = oid_len; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + flavor->flavor_info.qop = be32_to_cpup(p++); + flavor->flavor_info.service = be32_to_cpup(p); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_err: + return -EINVAL; +} + +static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + struct nfs4_secinfo4 *sec_flavor; + unsigned int i, num_flavors; + int status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + + res->flavors->num_flavors = 0; + num_flavors = be32_to_cpup(p); + + for (i = 0; i < num_flavors; i++) { + sec_flavor = &res->flavors->flavors[i]; + if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE) + break; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sec_flavor->flavor = be32_to_cpup(p); + + if (sec_flavor->flavor == RPC_AUTH_GSS) { + status = decode_secinfo_gss(xdr, sec_flavor); + if (status) + goto out; + } + res->flavors->num_flavors++; + } + + status = 0; +out: + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + int status = decode_op_hdr(xdr, OP_SECINFO); + if (status) + return status; + return decode_secinfo_common(xdr, res); +} + +#if defined(CONFIG_NFS_V4_1) +static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + int status = decode_op_hdr(xdr, OP_SECINFO_NO_NAME); + if (status) + return status; + return decode_secinfo_common(xdr, res); +} + +static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) +{ + __be32 *p; + uint32_t bitmap_words; + unsigned int i; + + p = xdr_inline_decode(xdr, 4); + bitmap_words = be32_to_cpup(p++); + if (bitmap_words > NFS4_OP_MAP_NUM_WORDS) + return -EIO; + p = xdr_inline_decode(xdr, 4 * bitmap_words); + for (i = 0; i < bitmap_words; i++) + op_map->u.words[i] = be32_to_cpup(p++); + + return 0; +} + +static int decode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_res *res) +{ + __be32 *p; + uint32_t dummy; + char *dummy_str; + int status; + uint32_t impl_id_count; + + status = decode_op_hdr(xdr, OP_EXCHANGE_ID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &res->clientid); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + res->seqid = be32_to_cpup(p++); + res->flags = be32_to_cpup(p++); + + res->state_protect.how = be32_to_cpup(p); + switch (res->state_protect.how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + status = decode_op_map(xdr, &res->state_protect.enforce); + if (status) + return status; + status = decode_op_map(xdr, &res->state_protect.allow); + if (status) + return status; + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + /* server_owner4.so_minor_id */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->server_owner->minor_id); + + /* server_owner4.so_major_id */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->server_owner->major_id, dummy_str, dummy); + res->server_owner->major_id_sz = dummy; + + /* server_scope4 */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->server_scope->server_scope, dummy_str, dummy); + res->server_scope->server_scope_sz = dummy; + + /* Implementation Id */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + impl_id_count = be32_to_cpup(p++); + + if (impl_id_count) { + /* nii_domain */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->impl_id->domain, dummy_str, dummy); + + /* nii_name */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->impl_id->name, dummy_str, dummy); + + /* nii_date */ + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->impl_id->date.seconds); + res->impl_id->date.nseconds = be32_to_cpup(p); + + /* if there's more than one entry, ignore the rest */ + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_chan_attrs(struct xdr_stream *xdr, + struct nfs4_channel_attrs *attrs) +{ + __be32 *p; + u32 nr_attrs, val; + + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + val = be32_to_cpup(p++); /* headerpadsz */ + if (val) + return -EINVAL; /* no support for header padding yet */ + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); + if (unlikely(nr_attrs > 1)) { + printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs " + "count %u\n", __func__, nr_attrs); + return -EINVAL; + } + if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); +} + +static int decode_bind_conn_to_session(struct xdr_stream *xdr, + struct nfs41_bind_conn_to_session_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION); + if (!status) + status = decode_sessionid(xdr, &res->sessionid); + if (unlikely(status)) + return status; + + /* dir flags, rdma mode bool */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + + res->dir = be32_to_cpup(p++); + if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH) + return -EIO; + if (be32_to_cpup(p) == 0) + res->use_conn_in_rdma_mode = false; + else + res->use_conn_in_rdma_mode = true; + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_CREATE_SESSION); + if (!status) + status = decode_sessionid(xdr, &res->sessionid); + if (unlikely(status)) + return status; + + /* seqid, flags */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->seqid = be32_to_cpup(p++); + res->flags = be32_to_cpup(p); + + /* Channel attributes */ + status = decode_chan_attrs(xdr, &res->fc_attrs); + if (!status) + status = decode_chan_attrs(xdr, &res->bc_attrs); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_DESTROY_SESSION); +} + +static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_DESTROY_CLIENTID); +} + +static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE); +} +#endif /* CONFIG_NFS_V4_1 */ + +static int decode_sequence(struct xdr_stream *xdr, + struct nfs4_sequence_res *res, + struct rpc_rqst *rqstp) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session; + struct nfs4_sessionid id; + u32 dummy; + int status; + __be32 *p; + + if (res->sr_slot == NULL) + return 0; + if (!res->sr_slot->table->session) + return 0; + + status = decode_op_hdr(xdr, OP_SEQUENCE); + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) + goto out_err; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. + */ + status = -EREMOTEIO; + session = res->sr_slot->table->session; + + if (memcmp(id.data, session->sess_id.data, + NFS4_MAX_SESSIONID_LEN)) { + dprintk("%s Invalid session id\n", __func__); + goto out_err; + } + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + + /* seqid */ + dummy = be32_to_cpup(p++); + if (dummy != res->sr_slot->seq_nr) { + dprintk("%s Invalid sequence number\n", __func__); + goto out_err; + } + /* slot id */ + dummy = be32_to_cpup(p++); + if (dummy != res->sr_slot->slot_nr) { + dprintk("%s Invalid slot id\n", __func__); + goto out_err; + } + /* highest slot id */ + res->sr_highest_slotid = be32_to_cpup(p++); + /* target highest slot id */ + res->sr_target_highest_slotid = be32_to_cpup(p++); + /* result flags */ + res->sr_status_flags = be32_to_cpup(p); + status = 0; +out_err: + res->sr_status = status; + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; +#else /* CONFIG_NFS_V4_1 */ + return 0; +#endif /* CONFIG_NFS_V4_1 */ +} + +#if defined(CONFIG_NFS_V4_1) +static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_res *res) +{ + struct pnfs_device *pdev = res->pdev; + __be32 *p; + uint32_t len, type; + int status; + + status = decode_op_hdr(xdr, OP_GETDEVICEINFO); + if (status) { + if (status == -ETOOSMALL) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pdev->mincount = be32_to_cpup(p); + dprintk("%s: Min count too small. mincnt = %u\n", + __func__, pdev->mincount); + } + return status; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + type = be32_to_cpup(p++); + if (type != pdev->layout_type) { + dprintk("%s: layout mismatch req: %u pdev: %u\n", + __func__, pdev->layout_type, type); + return -EINVAL; + } + /* + * Get the length of the opaque device_addr4. xdr_read_pages places + * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) + * and places the remaining xdr data in xdr_buf->tail + */ + pdev->mincount = be32_to_cpup(p); + if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) + goto out_overflow; + + /* Parse notification bitmap, verifying that it is zero. */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len) { + uint32_t i; + + p = xdr_inline_decode(xdr, 4 * len); + if (unlikely(!p)) + goto out_overflow; + + res->notification = be32_to_cpup(p++); + for (i = 1; i < len; i++) { + if (be32_to_cpup(p++)) { + dprintk("%s: unsupported notification\n", + __func__); + return -EIO; + } + } + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + __be32 *p; + int status; + u32 layout_count; + u32 recvd; + + status = decode_op_hdr(xdr, OP_LAYOUTGET); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->return_on_close = be32_to_cpup(p); + decode_stateid(xdr, &res->stateid); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + layout_count = be32_to_cpup(p); + if (!layout_count) { + dprintk("%s: server responded with empty layout array\n", + __func__); + return -EINVAL; + } + + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->range.offset); + p = xdr_decode_hyper(p, &res->range.length); + res->range.iomode = be32_to_cpup(p++); + res->type = be32_to_cpup(p++); + res->layoutp->len = be32_to_cpup(p); + + dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", + __func__, + (unsigned long)res->range.offset, + (unsigned long)res->range.length, + res->range.iomode, + res->type, + res->layoutp->len); + + recvd = xdr_read_pages(xdr, res->layoutp->len); + if (res->layoutp->len > recvd) { + dprintk("NFS: server cheating in layoutget reply: " + "layout len %u > recvd %u\n", + res->layoutp->len, recvd); + return -EINVAL; + } + + if (layout_count > 1) { + /* We only handle a length one array at the moment. Any + * further entries are just ignored. Note that this means + * the client may see a response that is less than the + * minimum it requested. + */ + dprintk("%s: server responded with %d layouts, dropping tail\n", + __func__, layout_count); + } + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutreturn(struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTRETURN); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->lrs_present = be32_to_cpup(p); + if (res->lrs_present) + status = decode_stateid(xdr, &res->stateid); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutcommit(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct nfs4_layoutcommit_res *res) +{ + __be32 *p; + __u32 sizechanged; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + res->status = status; + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sizechanged = be32_to_cpup(p); + + if (sizechanged) { + /* throw away new size */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + __be32 *p; + int status; + int num_res; + + status = decode_op_hdr(xdr, OP_TEST_STATEID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num_res = be32_to_cpup(p++); + if (num_res != 1) + goto out; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->status = be32_to_cpup(p++); + + return status; +out_overflow: + print_overflow_msg(__func__, xdr); +out: + return -EIO; +} + +static int decode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + res->status = decode_op_hdr(xdr, OP_FREE_STATEID); + return res->status; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * END OF "GENERIC" DECODE ROUTINES. + */ + +/* + * Decode OPEN_DOWNGRADE response + */ +static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_closeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open_downgrade(xdr, res); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server); +out: + return status; +} + +/* + * Decode ACCESS response + */ +static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_accessres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_access(xdr, &res->supported, &res->access); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server); +out: + return status; +} + +/* + * Decode LOOKUP response + */ +static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_lookup_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lookup(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); +out: + return status; +} + +/* + * Decode LOOKUP_ROOT response + */ +static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_lookup_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status == 0) + status = decode_getfattr_label(xdr, res->fattr, + res->label, res->server); +out: + return status; +} + +/* + * Decode REMOVE response + */ +static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_removeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_remove(xdr, &res->cinfo); +out: + return status; +} + +/* + * Decode RENAME response + */ +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_renameres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo); +out: + return status; +} + +/* + * Decode LINK response + */ +static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_link_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_link(xdr, &res->cinfo); + if (status) + goto out; + /* + * Note order: OP_LINK leaves the directory as the current + * filehandle. + */ + status = decode_restorefh(xdr); + if (status) + goto out; + decode_getfattr_label(xdr, res->fattr, res->label, res->server); +out: + return status; +} + +/* + * Decode CREATE response + */ +static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_create(xdr, &res->dir_cinfo); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + decode_getfattr_label(xdr, res->fattr, res->label, res->server); +out: + return status; +} + +/* + * Decode SYMLINK response + */ +static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) +{ + return nfs4_xdr_dec_create(rqstp, xdr, res); +} + +/* + * Decode GETATTR response + */ +static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_getattr_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); +out: + return status; +} + +/* + * Encode an SETACL request + */ +static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setaclargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setacl(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Decode SETACL response + */ +static int +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_setaclres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setattr(xdr); +out: + return status; +} + +/* + * Decode GETACL response + */ +static int +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_getaclres *res) +{ + struct compound_hdr hdr; + int status; + + if (res->acl_scratch != NULL) { + void *p = page_address(res->acl_scratch); + xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); + } + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getacl(xdr, rqstp, res); + +out: + return status; +} + +/* + * Decode CLOSE response + */ +static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_closeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_close(xdr, res); + if (status != 0) + goto out; + /* + * Note: Server may do delete on close for this file + * in which case the getattr call will fail with + * an ESTALE error. Shouldn't be a problem, + * though, since fattr->valid will remain unset. + */ + decode_getfattr(xdr, res->fattr, res->server); +out: + return status; +} + +/* + * Decode OPEN response + */ +static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_openres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open(xdr, res); + if (status) + goto out; + status = decode_getfh(xdr, &res->fh); + if (status) + goto out; + if (res->access_request) + decode_access(xdr, &res->access_supported, &res->access_result); + decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server); +out: + return status; +} + +/* + * Decode OPEN_CONFIRM response + */ +static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_open_confirmres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open_confirm(xdr, res); +out: + return status; +} + +/* + * Decode OPEN response + */ +static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_openres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open(xdr, res); + if (status) + goto out; + if (res->access_request) + decode_access(xdr, &res->access_supported, &res->access_result); + decode_getfattr(xdr, res->f_attr, res->server); +out: + return status; +} + +/* + * Decode SETATTR response + */ +static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_setattrres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setattr(xdr); + if (status) + goto out; + decode_getfattr_label(xdr, res->fattr, res->label, res->server); +out: + return status; +} + +/* + * Decode LOCK response + */ +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lock_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lock(xdr, res); +out: + return status; +} + +/* + * Decode LOCKT response + */ +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lockt_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lockt(xdr, res); +out: + return status; +} + +/* + * Decode LOCKU response + */ +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_locku_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_locku(xdr, res); +out: + return status; +} + +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *dummy) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_release_lockowner(xdr); + return status; +} + +/* + * Decode READLINK response + */ +static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_readlink_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_readlink(xdr, rqstp); +out: + return status; +} + +/* + * Decode READDIR response + */ +static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_readdir_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_readdir(xdr, rqstp, res); +out: + return status; +} + +/* + * Decode Read response + */ +static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_pgio_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_read(xdr, rqstp, res); + if (!status) + status = res->count; +out: + return status; +} + +/* + * Decode WRITE response + */ +static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_pgio_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_write(xdr, res); + if (status) + goto out; + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server); + if (!status) + status = res->count; +out: + return status; +} + +/* + * Decode COMMIT response + */ +static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_commitres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_commit(xdr, res); +out: + return status; +} + +/* + * Decode FSINFO response + */ +static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_fsinfo(xdr, res->fsinfo); + return status; +} + +/* + * Decode PATHCONF response + */ +static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_pathconf_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_pathconf(xdr, res->pathconf); + return status; +} + +/* + * Decode STATFS response + */ +static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_statfs_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_statfs(xdr, res->fsstat); + return status; +} + +/* + * Decode GETATTR_BITMAP response + */ +static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_server_caps(xdr, res); +out: + return status; +} + +/* + * Decode RENEW response + */ +static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *__unused) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_renew(xdr); + return status; +} + +/* + * Decode SETCLIENTID response + */ +static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_setclientid(xdr, res); + return status; +} + +/* + * Decode SETCLIENTID_CONFIRM response + */ +static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_setclientid_confirm(xdr); + return status; +} + +/* + * Decode DELEGRETURN response + */ +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_delegreturnres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server); + if (status != 0) + goto out; + status = decode_delegreturn(xdr); +out: + return status; +} + +/* + * Decode FS_LOCATIONS response + */ +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + if (res->migration) { + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, + NULL, res->fs_locations, + NULL, res->fs_locations->server); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); + } else { + status = decode_lookup(xdr); + if (status) + goto out; + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, + NULL, res->fs_locations, + NULL, res->fs_locations->server); + } +out: + return status; +} + +/* + * Decode SECINFO response + */ +static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); +out: + return status; +} + +/* + * Decode FSID_PRESENT response + */ +static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_fsid_present_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); +out: + return status; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * Decode BIND_CONN_TO_SESSION response + */ +static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_bind_conn_to_session(xdr, res); + return status; +} + +/* + * Decode EXCHANGE_ID response + */ +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_exchange_id(xdr, res); + return status; +} + +/* + * Decode CREATE_SESSION response + */ +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_create_session(xdr, res); + return status; +} + +/* + * Decode DESTROY_SESSION response + */ +static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_destroy_session(xdr, res); + return status; +} + +/* + * Decode DESTROY_CLIENTID response + */ +static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_destroy_clientid(xdr, res); + return status; +} + +/* + * Decode SEQUENCE response + */ +static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_sequence_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, res, rqstp); + return status; +} + +/* + * Decode GET_LEASE_TIME response + */ +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_get_lease_time_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->lr_seq_res, rqstp); + if (!status) + status = decode_putrootfh(xdr); + if (!status) + status = decode_fsinfo(xdr, res->lr_fsinfo); + return status; +} + +/* + * Decode RECLAIM_COMPLETE response + */ +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (!status) + status = decode_reclaim_complete(xdr, NULL); + return status; +} + +/* + * Decode GETDEVINFO response + */ +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_getdeviceinfo(xdr, res); +out: + return status; +} + +/* + * Decode LAYOUTGET response + */ +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutget_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutget(xdr, rqstp, res); +out: + return status; +} + +/* + * Decode LAYOUTRETURN response + */ +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutreturn(xdr, res); +out: + return status; +} + +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutcommit(xdr, rqstp, res); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server); +out: + return status; +} + +/* + * Decode SECINFO_NO_NAME response + */ +static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_secinfo_no_name(xdr, res); +out: + return status; +} + +/* + * Decode TEST_STATEID response + */ +static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_test_stateid(xdr, res); +out: + return status; +} + +/* + * Decode FREE_STATEID response + */ +static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_free_stateid(xdr, res); +out: + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + */ +int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + unsigned int savep; + uint32_t bitmap[3] = {0}; + uint32_t len; + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + entry->prev_cookie = entry->cookie; + p = xdr_decode_hyper(p, &entry->cookie); + entry->len = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, entry->len); + if (unlikely(!p)) + goto out_overflow; + entry->name = (const char *) p; + + /* + * In case the server doesn't return an inode number, + * we fake one here. (We don't use inode number 0, + * since glibc seems to choke on it...) + */ + entry->ino = 1; + entry->fattr->valid = 0; + + if (decode_attr_bitmap(xdr, bitmap) < 0) + goto out_overflow; + + if (decode_attr_length(xdr, &len, &savep) < 0) + goto out_overflow; + + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, + NULL, entry->label, entry->server) < 0) + goto out_overflow; + if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + entry->ino = entry->fattr->mounted_on_fileid; + else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) + entry->ino = entry->fattr->fileid; + + entry->d_type = DT_UNKNOWN; + if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +} + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -errno_NFSERR_IO}, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -EREMOTEIO }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; + +/* + * Convert an NFS error code to a local one. + * This one is used jointly by NFSv2 and NFSv3. + */ +static int +nfs4_stat_to_errno(int stat) +{ + int i; + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == stat) + return nfs_errtbl[i].errno; + } + if (stat <= 10000 || stat > 10100) { + /* The server is looney tunes. */ + return -EREMOTEIO; + } + /* If we cannot translate the error, the recovery routines should + * handle it. + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes. + */ + return -stat; +} + +#ifdef CONFIG_NFS_V4_2 +#include "nfs42xdr.c" +#endif /* CONFIG_NFS_V4_2 */ + +#define PROC(proc, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_COMPOUND, \ + .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \ + .p_arglen = NFS4_##argtype##_sz, \ + .p_replen = NFS4_##restype##_sz, \ + .p_statidx = NFSPROC4_CLNT_##proc, \ + .p_name = #proc, \ +} + +#define STUB(proc) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_name = #proc, \ +} + +struct rpc_procinfo nfs4_procedures[] = { + PROC(READ, enc_read, dec_read), + PROC(WRITE, enc_write, dec_write), + PROC(COMMIT, enc_commit, dec_commit), + PROC(OPEN, enc_open, dec_open), + PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), + PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), + PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), + PROC(CLOSE, enc_close, dec_close), + PROC(SETATTR, enc_setattr, dec_setattr), + PROC(FSINFO, enc_fsinfo, dec_fsinfo), + PROC(RENEW, enc_renew, dec_renew), + PROC(SETCLIENTID, enc_setclientid, dec_setclientid), + PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), + PROC(LOCK, enc_lock, dec_lock), + PROC(LOCKT, enc_lockt, dec_lockt), + PROC(LOCKU, enc_locku, dec_locku), + PROC(ACCESS, enc_access, dec_access), + PROC(GETATTR, enc_getattr, dec_getattr), + PROC(LOOKUP, enc_lookup, dec_lookup), + PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), + PROC(REMOVE, enc_remove, dec_remove), + PROC(RENAME, enc_rename, dec_rename), + PROC(LINK, enc_link, dec_link), + PROC(SYMLINK, enc_symlink, dec_symlink), + PROC(CREATE, enc_create, dec_create), + PROC(PATHCONF, enc_pathconf, dec_pathconf), + PROC(STATFS, enc_statfs, dec_statfs), + PROC(READLINK, enc_readlink, dec_readlink), + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), + PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(SECINFO, enc_secinfo, dec_secinfo), + PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), +#if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), + PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + STUB(GETDEVICELIST), + PROC(BIND_CONN_TO_SESSION, + enc_bind_conn_to_session, dec_bind_conn_to_session), + PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), +#endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 + PROC(SEEK, enc_seek, dec_seek), + PROC(ALLOCATE, enc_allocate, dec_allocate), + PROC(DEALLOCATE, enc_deallocate, dec_deallocate), +#endif /* CONFIG_NFS_V4_2 */ +}; + +const struct rpc_version nfs_version4 = { + .number = 4, + .nrprocs = ARRAY_SIZE(nfs4_procedures), + .procs = nfs4_procedures +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/kernel/fs/nfs/nfsroot.c b/kernel/fs/nfs/nfsroot.c new file mode 100644 index 000000000..9bc9f04fb --- /dev/null +++ b/kernel/fs/nfs/nfsroot.c @@ -0,0 +1,309 @@ +/* + * Copyright (C) 1995, 1996 Gero Kuhlmann + * + * Allow an NFS filesystem to be mounted as root. The way this works is: + * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. + * (2) Construct the device string and the options string using DHCP + * option 17 and/or kernel command line options. + * (3) When mount_root() sets up the root file system, pass these strings + * to the NFS client's regular mount interface via sys_mount(). + * + * + * Changes: + * + * Alan Cox : Removed get_address name clash with FPU. + * Alan Cox : Reformatted a bit. + * Gero Kuhlmann : Code cleanup + * Michael Rausch : Fixed recognition of an incoming RARP answer. + * Martin Mares : (2.0) Auto-configuration via BOOTP supported. + * Martin Mares : Manual selection of interface & BOOTP/RARP. + * Martin Mares : Using network routes instead of host routes, + * allowing the default configuration to be used + * for normal operation of the host. + * Martin Mares : Randomized timer with exponential backoff + * installed to minimize network congestion. + * Martin Mares : Code cleanup. + * Martin Mares : (2.1) BOOTP and RARP made configuration options. + * Martin Mares : Server hostname generation fixed. + * Gerd Knorr : Fixed wired inode handling + * Martin Mares : (2.2) "0.0.0.0" addresses from command line ignored. + * Martin Mares : RARP replies not tested for server address. + * Gero Kuhlmann : (2.3) Some bug fixes and code cleanup again (please + * send me your new patches _before_ bothering + * Linus so that I don' always have to cleanup + * _afterwards_ - thanks) + * Gero Kuhlmann : Last changes of Martin Mares undone. + * Gero Kuhlmann : RARP replies are tested for specified server + * again. However, it's now possible to have + * different RARP and NFS servers. + * Gero Kuhlmann : "0.0.0.0" addresses from command line are + * now mapped to INADDR_NONE. + * Gero Kuhlmann : Fixed a bug which prevented BOOTP path name + * from being used (thanks to Leo Spiekman) + * Andy Walker : Allow to specify the NFS server in nfs_root + * without giving a path name + * Swen Thümmler : Allow to specify the NFS options in nfs_root + * without giving a path name. Fix BOOTP request + * for domainname (domainname is NIS domain, not + * DNS domain!). Skip dummy devices for BOOTP. + * Jacek Zapala : Fixed a bug which prevented server-ip address + * from nfsroot parameter from being used. + * Olaf Kirch : Adapted to new NFS code. + * Jakub Jelinek : Free used code segment. + * Marko Kohtala : Fixed some bugs. + * Martin Mares : Debug message cleanup + * Martin Mares : Changed to use the new generic IP layer autoconfig + * code. BOOTP and RARP moved there. + * Martin Mares : Default path now contains host name instead of + * host IP address (but host name defaults to IP + * address anyway). + * Martin Mares : Use root_server_addr appropriately during setup. + * Martin Mares : Rewrote parameter parsing, now hopefully giving + * correct overriding. + * Trond Myklebust : Add in preliminary support for NFSv3 and TCP. + * Fix bug in root_nfs_addr(). nfs_data.namlen + * is NOT for the length of the hostname. + * Hua Qin : Support for mounting root file system via + * NFS over TCP. + * Fabian Frederick: Option parser rebuilt (using parser lib) + * Chuck Lever : Use super.c's text-based mount option parsing + * Chuck Lever : Add "nfsrootdebug". + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_ROOT + +/* Default path we try to mount. "%s" gets replaced by our IP address */ +#define NFS_ROOT "/tftpboot/%s" + +/* Default NFSROOT mount options. */ +#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" + +/* Parameters passed from the kernel command line */ +static char nfs_root_parms[256] __initdata = ""; + +/* Text-based mount options passed to super.c */ +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; + +/* Address of NFS server */ +static __be32 servaddr __initdata = htonl(INADDR_NONE); + +/* Name of directory to mount */ +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; + +/* server:export path string passed to super.c */ +static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; + +#ifdef NFS_DEBUG +/* + * When the "nfsrootdebug" kernel command line option is specified, + * enable debugging messages for NFSROOT. + */ +static int __init nfs_root_debug(char *__unused) +{ + nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; + return 1; +} + +__setup("nfsrootdebug", nfs_root_debug); +#endif + +/* + * Parse NFS server and directory information passed on the kernel + * command line. + * + * nfsroot=[:][,] + * + * If there is a "%s" token in the string, it is replaced + * by the ASCII-representation of the client's IP address. + */ +static int __init nfs_root_setup(char *line) +{ + ROOT_DEV = Root_NFS; + + if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { + strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + } else { + size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; + if (n >= sizeof(nfs_root_parms)) + line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0'; + sprintf(nfs_root_parms, NFS_ROOT, line); + } + + /* + * Extract the IP address of the NFS server containing our + * root file system, if one was specified. + * + * Note: root_nfs_parse_addr() removes the server-ip from + * nfs_root_parms, if it exists. + */ + root_server_addr = root_nfs_parse_addr(nfs_root_parms); + + return 1; +} + +__setup("nfsroot=", nfs_root_setup); + +static int __init root_nfs_copy(char *dest, const char *src, + const size_t destlen) +{ + if (strlcpy(dest, src, destlen) > destlen) + return -1; + return 0; +} + +static int __init root_nfs_cat(char *dest, const char *src, + const size_t destlen) +{ + size_t len = strlen(dest); + + if (len && dest[len - 1] != ',') + if (strlcat(dest, ",", destlen) > destlen) + return -1; + + if (strlcat(dest, src, destlen) > destlen) + return -1; + return 0; +} + +/* + * Parse out root export path and mount options from + * passed-in string @incoming. + * + * Copy the export path into @exppath. + */ +static int __init root_nfs_parse_options(char *incoming, char *exppath, + const size_t exppathlen) +{ + char *p; + + /* + * Set the NFS remote path + */ + p = strsep(&incoming, ","); + if (*p != '\0' && strcmp(p, "default") != 0) + if (root_nfs_copy(exppath, p, exppathlen)) + return -1; + + /* + * @incoming now points to the rest of the string; if it + * contains something, append it to our root options buffer + */ + if (incoming != NULL && *incoming != '\0') + if (root_nfs_cat(nfs_root_options, incoming, + sizeof(nfs_root_options))) + return -1; + return 0; +} + +/* + * Decode the export directory path name and NFS options from + * the kernel command line. This has to be done late in order to + * use a dynamically acquired client IP address for the remote + * root directory path. + * + * Returns zero if successful; otherwise -1 is returned. + */ +static int __init root_nfs_data(char *cmdline) +{ + char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + int len, retval = -1; + char *tmp = NULL; + const size_t tmplen = sizeof(nfs_export_path); + + tmp = kzalloc(tmplen, GFP_KERNEL); + if (tmp == NULL) + goto out_nomem; + strcpy(tmp, NFS_ROOT); + + if (root_server_path[0] != '\0') { + dprintk("Root-NFS: DHCPv4 option 17: %s\n", + root_server_path); + if (root_nfs_parse_options(root_server_path, tmp, tmplen)) + goto out_optionstoolong; + } + + if (cmdline[0] != '\0') { + dprintk("Root-NFS: nfsroot=%s\n", cmdline); + if (root_nfs_parse_options(cmdline, tmp, tmplen)) + goto out_optionstoolong; + } + + /* + * Append mandatory options for nfsroot so they override + * what has come before + */ + snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4", + &servaddr); + if (root_nfs_cat(nfs_root_options, mand_options, + sizeof(nfs_root_options))) + goto out_optionstoolong; + + /* + * Set up nfs_root_device. For NFS mounts, this looks like + * + * server:/path + * + * At this point, utsname()->nodename contains our local + * IP address or hostname, set by ipconfig. If "%s" exists + * in tmp, substitute the nodename, then shovel the whole + * mess into nfs_root_device. + */ + len = snprintf(nfs_export_path, sizeof(nfs_export_path), + tmp, utsname()->nodename); + if (len >= (int)sizeof(nfs_export_path)) + goto out_devnametoolong; + len = snprintf(nfs_root_device, sizeof(nfs_root_device), + "%pI4:%s", &servaddr, nfs_export_path); + if (len >= (int)sizeof(nfs_root_device)) + goto out_devnametoolong; + + retval = 0; + +out: + kfree(tmp); + return retval; +out_nomem: + printk(KERN_ERR "Root-NFS: could not allocate memory\n"); + goto out; +out_optionstoolong: + printk(KERN_ERR "Root-NFS: mount options string too long\n"); + goto out; +out_devnametoolong: + printk(KERN_ERR "Root-NFS: root device name too long.\n"); + goto out; +} + +/** + * nfs_root_data - Return prepared 'data' for NFSROOT mount + * @root_device: OUT: address of string containing NFSROOT device + * @root_data: OUT: address of string containing NFSROOT mount options + * + * Returns zero and sets @root_device and @root_data if successful, + * otherwise -1 is returned. + */ +int __init nfs_root_data(char **root_device, char **root_data) +{ + servaddr = root_server_addr; + if (servaddr == htonl(INADDR_NONE)) { + printk(KERN_ERR "Root-NFS: no NFS server address\n"); + return -1; + } + + if (root_nfs_data(nfs_root_parms) < 0) + return -1; + + *root_device = nfs_root_device; + *root_data = nfs_root_options; + return 0; +} diff --git a/kernel/fs/nfs/nfstrace.c b/kernel/fs/nfs/nfstrace.c new file mode 100644 index 000000000..c74f7af23 --- /dev/null +++ b/kernel/fs/nfs/nfstrace.c @@ -0,0 +1,12 @@ +/* + * Copyright (c) 2013 Trond Myklebust + */ +#include +#include +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include "nfstrace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); diff --git a/kernel/fs/nfs/nfstrace.h b/kernel/fs/nfs/nfstrace.h new file mode 100644 index 000000000..59f838cdc --- /dev/null +++ b/kernel/fs/nfs/nfstrace.h @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2013 Trond Myklebust + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs + +#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS_H + +#include + +#define nfs_show_file_type(ftype) \ + __print_symbolic(ftype, \ + { DT_UNKNOWN, "UNKNOWN" }, \ + { DT_FIFO, "FIFO" }, \ + { DT_CHR, "CHR" }, \ + { DT_DIR, "DIR" }, \ + { DT_BLK, "BLK" }, \ + { DT_REG, "REG" }, \ + { DT_LNK, "LNK" }, \ + { DT_SOCK, "SOCK" }, \ + { DT_WHT, "WHT" }) + +#define nfs_show_cache_validity(v) \ + __print_flags(v, "|", \ + { NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \ + { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \ + { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ + { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ + { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ + { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ + { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ + { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }) + +#define nfs_show_nfsi_flags(v) \ + __print_flags(v, "|", \ + { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ + { 1 << NFS_INO_STALE, "STALE" }, \ + { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ + { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ + { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ + { 1 << NFS_INO_COMMIT, "COMMIT" }, \ + { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ + { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) + +DECLARE_EVENT_CLASS(nfs_inode_event, + TP_PROTO( + const struct inode *inode + ), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode->i_version; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (unsigned long long)__entry->version + ) +); + +DECLARE_EVENT_CLASS(nfs_inode_event_done, + TP_PROTO( + const struct inode *inode, + int error + ), + + TP_ARGS(inode, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(unsigned char, type) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, size) + __field(unsigned long, nfsi_flags) + __field(unsigned long, cache_validity) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->error = error; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->type = nfs_umode_to_dtype(inode->i_mode); + __entry->version = inode->i_version; + __entry->size = i_size_read(inode); + __entry->nfsi_flags = nfsi->flags; + __entry->cache_validity = nfsi->cache_validity; + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "type=%u (%s) version=%llu size=%lld " + "cache_validity=%lu (%s) nfs_flags=%ld (%s)", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->type, + nfs_show_file_type(__entry->type), + (unsigned long long)__entry->version, + (long long)__entry->size, + __entry->cache_validity, + nfs_show_cache_validity(__entry->cache_validity), + __entry->nfsi_flags, + nfs_show_nfsi_flags(__entry->nfsi_flags) + ) +); + +#define DEFINE_NFS_INODE_EVENT(name) \ + DEFINE_EVENT(nfs_inode_event, name, \ + TP_PROTO( \ + const struct inode *inode \ + ), \ + TP_ARGS(inode)) +#define DEFINE_NFS_INODE_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_inode_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(inode, error)) +DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit); +DEFINE_NFS_INODE_EVENT(nfs_getattr_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit); +DEFINE_NFS_INODE_EVENT(nfs_setattr_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit); +DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit); +DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); +DEFINE_NFS_INODE_EVENT(nfs_access_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit); + +#define show_lookup_flags(flags) \ + __print_flags((unsigned long)flags, "|", \ + { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \ + { LOOKUP_DIRECTORY, "DIRECTORY" }, \ + { LOOKUP_OPEN, "OPEN" }, \ + { LOOKUP_CREATE, "CREATE" }, \ + { LOOKUP_EXCL, "EXCL" }) + +DECLARE_EVENT_CLASS(nfs_lookup_event, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags + ), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->flags, + show_lookup_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_LOOKUP_EVENT(name) \ + DEFINE_EVENT(nfs_lookup_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry, \ + unsigned int flags \ + ), \ + TP_ARGS(dir, dentry, flags)) + +DECLARE_EVENT_CLASS(nfs_lookup_event_done, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags, + int error + ), + + TP_ARGS(dir, dentry, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_lookup_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_lookup_event_done, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry, \ + unsigned int flags, \ + int error \ + ), \ + TP_ARGS(dir, dentry, flags, error)) + +DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit); +DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit); + +#define show_open_flags(flags) \ + __print_flags((unsigned long)flags, "|", \ + { O_CREAT, "O_CREAT" }, \ + { O_EXCL, "O_EXCL" }, \ + { O_TRUNC, "O_TRUNC" }, \ + { O_APPEND, "O_APPEND" }, \ + { O_DSYNC, "O_DSYNC" }, \ + { O_DIRECT, "O_DIRECT" }, \ + { O_DIRECTORY, "O_DIRECTORY" }) + +#define show_fmode_flags(mode) \ + __print_flags(mode, "|", \ + { ((__force unsigned long)FMODE_READ), "READ" }, \ + { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ + { ((__force unsigned long)FMODE_EXEC), "EXEC" }) + +TRACE_EVENT(nfs_atomic_open_enter, + TP_PROTO( + const struct inode *dir, + const struct nfs_open_context *ctx, + unsigned int flags + ), + + TP_ARGS(dir, ctx, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __field(unsigned int, fmode) + __field(dev_t, dev) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s", + __entry->flags, + show_open_flags(__entry->flags), + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_atomic_open_exit, + TP_PROTO( + const struct inode *dir, + const struct nfs_open_context *ctx, + unsigned int flags, + int error + ), + + TP_ARGS(dir, ctx, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(unsigned int, fmode) + __field(dev_t, dev) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) fmode=%s " + "name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_open_flags(__entry->flags), + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_create_enter, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags + ), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->flags, + show_open_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_create_exit, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags, + int error + ), + + TP_ARGS(dir, dentry, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_open_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +DECLARE_EVENT_CLASS(nfs_directory_event, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry + ), + + TP_ARGS(dir, dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_DIRECTORY_EVENT(name) \ + DEFINE_EVENT(nfs_directory_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry \ + ), \ + TP_ARGS(dir, dentry)) + +DECLARE_EVENT_CLASS(nfs_directory_event_done, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + int error + ), + + TP_ARGS(dir, dentry, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_directory_event_done, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry, \ + int error \ + ), \ + TP_ARGS(dir, dentry, error)) + +DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit); + +TRACE_EVENT(nfs_link_enter, + TP_PROTO( + const struct inode *inode, + const struct inode *dir, + const struct dentry *dentry + ), + + TP_ARGS(inode, dir, dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->dir = NFS_FILEID(dir); + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->fileid, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_link_exit, + TP_PROTO( + const struct inode *inode, + const struct inode *dir, + const struct dentry *dentry, + int error + ), + + TP_ARGS(inode, dir, dentry, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u64, fileid) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->fileid, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +DECLARE_EVENT_CLASS(nfs_rename_event, + TP_PROTO( + const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry + ), + + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, old_dir) + __field(u64, new_dir) + __string(old_name, old_dentry->d_name.name) + __string(new_name, new_dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = old_dir->i_sb->s_dev; + __entry->old_dir = NFS_FILEID(old_dir); + __entry->new_dir = NFS_FILEID(new_dir); + __assign_str(old_name, old_dentry->d_name.name); + __assign_str(new_name, new_dentry->d_name.name); + ), + + TP_printk( + "old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->old_dir, + __get_str(old_name), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->new_dir, + __get_str(new_name) + ) +); +#define DEFINE_NFS_RENAME_EVENT(name) \ + DEFINE_EVENT(nfs_rename_event, name, \ + TP_PROTO( \ + const struct inode *old_dir, \ + const struct dentry *old_dentry, \ + const struct inode *new_dir, \ + const struct dentry *new_dentry \ + ), \ + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry)) + +DECLARE_EVENT_CLASS(nfs_rename_event_done, + TP_PROTO( + const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, + int error + ), + + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, old_dir) + __string(old_name, old_dentry->d_name.name) + __field(u64, new_dir) + __string(new_name, new_dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = old_dir->i_sb->s_dev; + __entry->old_dir = NFS_FILEID(old_dir); + __entry->new_dir = NFS_FILEID(new_dir); + __entry->error = error; + __assign_str(old_name, old_dentry->d_name.name); + __assign_str(new_name, new_dentry->d_name.name); + ), + + TP_printk( + "error=%d old_name=%02x:%02x:%llu/%s " + "new_name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->old_dir, + __get_str(old_name), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->new_dir, + __get_str(new_name) + ) +); +#define DEFINE_NFS_RENAME_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_rename_event_done, name, \ + TP_PROTO( \ + const struct inode *old_dir, \ + const struct dentry *old_dentry, \ + const struct inode *new_dir, \ + const struct dentry *new_dentry, \ + int error \ + ), \ + TP_ARGS(old_dir, old_dentry, new_dir, \ + new_dentry, error)) + +DEFINE_NFS_RENAME_EVENT(nfs_rename_enter); +DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit); + +DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename); + +TRACE_EVENT(nfs_sillyrename_unlink, + TP_PROTO( + const struct nfs_unlinkdata *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, dir) + __dynamic_array(char, name, data->args.name.len + 1) + ), + + TP_fast_assign( + struct inode *dir = data->dir; + size_t len = data->args.name.len; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + memcpy(__get_dynamic_array(name), + data->args.name.name, len); + ((char *)__get_dynamic_array(name))[len] = 0; + ), + + TP_printk( + "error=%d name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); +#endif /* _TRACE_NFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE nfstrace +/* This part must be outside protection */ +#include diff --git a/kernel/fs/nfs/objlayout/Kbuild b/kernel/fs/nfs/objlayout/Kbuild new file mode 100644 index 000000000..ed30ea072 --- /dev/null +++ b/kernel/fs/nfs/objlayout/Kbuild @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module +# +objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/kernel/fs/nfs/objlayout/objio_osd.c b/kernel/fs/nfs/objlayout/objio_osd.c new file mode 100644 index 000000000..5aaed3635 --- /dev/null +++ b/kernel/fs/nfs/objlayout/objio_osd.c @@ -0,0 +1,678 @@ +/* + * pNFS Objects layout implementation over open-osd initiator library + * + * Copyright (C) 2009 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "objlayout.h" +#include "../internal.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +struct objio_dev_ent { + struct nfs4_deviceid_node id_node; + struct ore_dev od; +}; + +static void +objio_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); + + dprintk("%s: free od=%p\n", __func__, de->od.od); + osduld_put_device(de->od.od); + kfree_rcu(d, rcu); +} + +struct objio_segment { + struct pnfs_layout_segment lseg; + + struct ore_layout layout; + struct ore_components oc; +}; + +static inline struct objio_segment * +OBJIO_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, struct objio_segment, lseg); +} + +struct objio_state { + /* Generic layer */ + struct objlayout_io_res oir; + + bool sync; + /*FIXME: Support for extra_bytes at ore_get_rw_state() */ + struct ore_io_state *ios; +}; + +/* Send and wait for a get_device_info of devices in the layout, + then look them up with the osd_initiator library */ +struct nfs4_deviceid_node * +objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) +{ + struct pnfs_osd_deviceaddr *deviceaddr; + struct objio_dev_ent *ode = NULL; + struct osd_dev *od; + struct osd_dev_info odi; + bool retry_flag = true; + __be32 *p; + int err; + + deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags); + if (!deviceaddr) + return NULL; + + p = page_address(pdev->pages[0]); + pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p); + + odi.systemid_len = deviceaddr->oda_systemid.len; + if (odi.systemid_len > sizeof(odi.systemid)) { + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", + __func__, sizeof(odi.systemid)); + err = -EINVAL; + goto out; + } else if (odi.systemid_len) + memcpy(odi.systemid, deviceaddr->oda_systemid.data, + odi.systemid_len); + odi.osdname_len = deviceaddr->oda_osdname.len; + odi.osdname = (u8 *)deviceaddr->oda_osdname.data; + + if (!odi.osdname_len && !odi.systemid_len) { + dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", + __func__); + err = -ENODEV; + goto out; + } + +retry_lookup: + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + err = PTR_ERR(od); + dprintk("%s: osduld_info_lookup => %d\n", __func__, err); + if (err == -ENODEV && retry_flag) { + err = objlayout_autologin(deviceaddr); + if (likely(!err)) { + retry_flag = false; + goto retry_lookup; + } + } + goto out; + } + + dprintk("Adding new dev_id(%llx:%llx)\n", + _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id)); + + ode = kzalloc(sizeof(*ode), gfp_flags); + if (!ode) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + goto out; + } + + nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id); + kfree(deviceaddr); + + ode->od.od = od; + return &ode->id_node; + +out: + kfree(deviceaddr); + return NULL; +} + +static void copy_single_comp(struct ore_components *oc, unsigned c, + struct pnfs_osd_object_cred *src_comp) +{ + struct ore_comp *ocomp = &oc->comps[c]; + + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); + + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; + ocomp->obj.id = src_comp->oc_object_id.oid_object_id; + + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); +} + +static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, + struct objio_segment **pseg) +{ +/* This is the in memory structure of the objio_segment + * + * struct __alloc_objio_segment { + * struct objio_segment olseg; + * struct ore_dev *ods[numdevs]; + * struct ore_comp comps[numdevs]; + * } *aolseg; + * NOTE: The code as above compiles and runs perfectly. It is elegant, + * type safe and compact. At some Past time Linus has decided he does not + * like variable length arrays, For the sake of this principal we uglify + * the code as below. + */ + struct objio_segment *lseg; + size_t lseg_size = sizeof(*lseg) + + numdevs * sizeof(lseg->oc.ods[0]) + + numdevs * sizeof(*lseg->oc.comps); + + lseg = kzalloc(lseg_size, gfp_flags); + if (unlikely(!lseg)) { + dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, + numdevs, lseg_size); + return -ENOMEM; + } + + lseg->oc.numdevs = numdevs; + lseg->oc.single_comp = EC_MULTPLE_COMPS; + lseg->oc.ods = (void *)(lseg + 1); + lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); + + *pseg = lseg; + return 0; +} + +int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags) +{ + struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode); + struct objio_segment *objio_seg; + struct pnfs_osd_xdr_decode_layout_iter iter; + struct pnfs_osd_layout layout; + struct pnfs_osd_object_cred src_comp; + unsigned cur_comp; + int err; + + err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); + if (unlikely(err)) + return err; + + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); + if (unlikely(err)) + return err; + + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; + objio_seg->layout.group_width = layout.olo_map.odm_group_width; + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; + + err = ore_verify_layout(layout.olo_map.odm_num_comps, + &objio_seg->layout); + if (unlikely(err)) + goto err; + + objio_seg->oc.first_dev = layout.olo_comps_index; + cur_comp = 0; + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + struct nfs4_deviceid_node *d; + struct objio_dev_ent *ode; + + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); + + d = nfs4_find_get_deviceid(server, + &src_comp.oc_object_id.oid_device_id, + pnfslay->plh_lc_cred, gfp_flags); + if (!d) { + err = -ENXIO; + goto err; + } + + ode = container_of(d, struct objio_dev_ent, id_node); + objio_seg->oc.ods[cur_comp++] = &ode->od; + } + /* pnfs_osd_xdr_decode_layout_comp returns false on error */ + if (unlikely(err)) + goto err; + + *outp = &objio_seg->lseg; + return 0; + +err: + kfree(objio_seg); + dprintk("%s: Error: return %d\n", __func__, err); + *outp = NULL; + return err; +} + +void objio_free_lseg(struct pnfs_layout_segment *lseg) +{ + int i; + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + + for (i = 0; i < objio_seg->oc.numdevs; i++) { + struct ore_dev *od = objio_seg->oc.ods[i]; + struct objio_dev_ent *ode; + + if (!od) + break; + ode = container_of(od, typeof(*ode), od); + nfs4_put_deviceid_node(&ode->id_node); + } + kfree(objio_seg); +} + +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, + struct objio_state **outp) +{ + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + struct ore_io_state *ios; + int ret; + struct __alloc_objio_state { + struct objio_state objios; + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; + } *aos; + + aos = kzalloc(sizeof(*aos), gfp_flags); + if (unlikely(!aos)) + return -ENOMEM; + + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, + aos->ioerrs, rpcdata, pnfs_layout_type); + + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, + offset, count, &ios); + if (unlikely(ret)) { + kfree(aos); + return ret; + } + + ios->pages = pages; + ios->pgbase = pgbase; + ios->private = aos; + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + + aos->objios.sync = 0; + aos->objios.ios = ios; + *outp = &aos->objios; + return 0; +} + +void objio_free_result(struct objlayout_io_res *oir) +{ + struct objio_state *objios = container_of(oir, struct objio_state, oir); + + ore_put_io_state(objios->ios); + kfree(objios); +} + +static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) +{ + switch (oep) { + case OSD_ERR_PRI_NO_ERROR: + return (enum pnfs_osd_errno)0; + + case OSD_ERR_PRI_CLEAR_PAGES: + BUG_ON(1); + return 0; + + case OSD_ERR_PRI_RESOURCE: + return PNFS_OSD_ERR_RESOURCE; + case OSD_ERR_PRI_BAD_CRED: + return PNFS_OSD_ERR_BAD_CRED; + case OSD_ERR_PRI_NO_ACCESS: + return PNFS_OSD_ERR_NO_ACCESS; + case OSD_ERR_PRI_UNREACHABLE: + return PNFS_OSD_ERR_UNREACHABLE; + case OSD_ERR_PRI_NOT_FOUND: + return PNFS_OSD_ERR_NOT_FOUND; + case OSD_ERR_PRI_NO_SPACE: + return PNFS_OSD_ERR_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case OSD_ERR_PRI_EIO: + return PNFS_OSD_ERR_EIO; + } +} + +static void __on_dev_error(struct ore_io_state *ios, + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len) +{ + struct objio_state *objios = ios->private; + struct pnfs_osd_objid pooid; + struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); + /* FIXME: what to do with more-then-one-group layouts. We need to + * translate from ore_io_state index to oc->comps index + */ + unsigned comp = dev_index; + + pooid.oid_device_id = ode->id_node.deviceid; + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; + pooid.oid_object_id = ios->oc->comps[comp].obj.id; + + objlayout_io_set_result(&objios->oir, comp, + &pooid, osd_pri_2_pnfs_err(oep), + dev_offset, dev_len, !ios->reading); +} + +/* + * read + */ +static void _read_done(struct ore_io_state *ios, void *private) +{ + struct objio_state *objios = private; + ssize_t status; + int ret = ore_check_io(ios, &__on_dev_error); + + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ + + if (likely(!ret)) + status = ios->length; + else + status = ret; + + objlayout_read_done(&objios->oir, status, objios->sync); +} + +int objio_read_pagelist(struct nfs_pgio_header *hdr) +{ + struct objio_state *objios; + int ret; + + ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, + hdr->lseg, hdr->args.pages, hdr->args.pgbase, + hdr->args.offset, hdr->args.count, hdr, + GFP_KERNEL, &objios); + if (unlikely(ret)) + return ret; + + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + hdr->args.offset, hdr->args.count); + ret = ore_read(objios->ios); + if (unlikely(ret)) + objio_free_result(&objios->oir); + return ret; +} + +/* + * write + */ +static void _write_done(struct ore_io_state *ios, void *private) +{ + struct objio_state *objios = private; + ssize_t status; + int ret = ore_check_io(ios, &__on_dev_error); + + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ + + if (likely(!ret)) { + /* FIXME: should be based on the OSD's persistence model + * See OSD2r05 Section 4.13 Data persistence model */ + objios->oir.committed = NFS_FILE_SYNC; + status = ios->length; + } else { + status = ret; + } + + objlayout_write_done(&objios->oir, status, objios->sync); +} + +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct objio_state *objios = priv; + struct nfs_pgio_header *hdr = objios->oir.rpcdata; + struct address_space *mapping = hdr->inode->i_mapping; + pgoff_t index = offset / PAGE_SIZE; + struct page *page; + loff_t i_size = i_size_read(hdr->inode); + + if (offset >= i_size) { + *uptodate = true; + dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); + return ZERO_PAGE(0); + } + + page = find_get_page(mapping, index); + if (!page) { + page = find_or_create_page(mapping, index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s: grab_cache_page Failed index=0x%lx\n", + __func__, index); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); + return page; +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + dprintk("%s: index=0x%lx\n", __func__, + (page == ZERO_PAGE(0)) ? -1UL : page->index); + if (ZERO_PAGE(0) != page) + page_cache_release(page); + return; +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + +int objio_write_pagelist(struct nfs_pgio_header *hdr, int how) +{ + struct objio_state *objios; + int ret; + + ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, + hdr->lseg, hdr->args.pages, hdr->args.pgbase, + hdr->args.offset, hdr->args.count, hdr, GFP_NOFS, + &objios); + if (unlikely(ret)) + return ret; + + objios->sync = 0 != (how & FLUSH_SYNC); + objios->ios->r4w = &_r4w_op; + + if (!objios->sync) + objios->ios->done = _write_done; + + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + hdr->args.offset, hdr->args.count); + ret = ore_write(objios->ios); + if (unlikely(ret)) { + objio_free_result(&objios->oir); + return ret; + } + + if (objios->sync) + _write_done(objios->ios, objios); + + return 0; +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio); + unsigned int size; + + size = pnfs_generic_pg_test(pgio, prev, req); + + if (!size || mirror->pg_count + req->wb_bytes > + (unsigned long)pgio->pg_layout_private) + return 0; + + return min(size, req->wb_bytes); +} + +static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + pnfs_generic_pg_init_read(pgio, req); + if (unlikely(pgio->pg_lseg == NULL)) + return; /* Not pNFS */ + + pgio->pg_layout_private = (void *) + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; +} + +static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, + unsigned long *stripe_end) +{ + u32 stripe_off; + unsigned stripe_size; + + if (layout->raid_algorithm == PNFS_OSD_RAID_0) + return true; + + stripe_size = layout->stripe_unit * + (layout->group_width - layout->parity); + + div_u64_rem(offset, stripe_size, &stripe_off); + if (!stripe_off) + return true; + + *stripe_end = stripe_size - stripe_off; + return false; +} + +static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + unsigned long stripe_end = 0; + u64 wb_size; + + if (pgio->pg_dreq == NULL) + wb_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); + if (unlikely(pgio->pg_lseg == NULL)) + return; /* Not pNFS */ + + if (req->wb_offset || + !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE, + &OBJIO_LSEG(pgio->pg_lseg)->layout, + &stripe_end)) { + pgio->pg_layout_private = (void *)stripe_end; + } else { + pgio->pg_layout_private = (void *) + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; + } +} + +static const struct nfs_pageio_ops objio_pg_read_ops = { + .pg_init = objio_init_read, + .pg_test = objio_pg_test, + .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static const struct nfs_pageio_ops objio_pg_write_ops = { + .pg_init = objio_init_write, + .pg_test = objio_pg_test, + .pg_doio = pnfs_generic_pg_writepages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static struct pnfs_layoutdriver_type objlayout_type = { + .id = LAYOUT_OSD2_OBJECTS, + .name = "LAYOUT_OSD2_OBJECTS", + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, + + .max_deviceinfo_size = PAGE_SIZE, + .owner = THIS_MODULE, + .alloc_layout_hdr = objlayout_alloc_layout_hdr, + .free_layout_hdr = objlayout_free_layout_hdr, + + .alloc_lseg = objlayout_alloc_lseg, + .free_lseg = objlayout_free_lseg, + + .read_pagelist = objlayout_read_pagelist, + .write_pagelist = objlayout_write_pagelist, + .pg_read_ops = &objio_pg_read_ops, + .pg_write_ops = &objio_pg_write_ops, + + .sync = pnfs_generic_sync, + + .free_deviceid_node = objio_free_deviceid_node, + + .encode_layoutcommit = objlayout_encode_layoutcommit, + .encode_layoutreturn = objlayout_encode_layoutreturn, +}; + +MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); +MODULE_AUTHOR("Benny Halevy "); +MODULE_LICENSE("GPL"); + +static int __init +objlayout_init(void) +{ + int ret = pnfs_register_layoutdriver(&objlayout_type); + + if (ret) + printk(KERN_INFO + "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n", + __func__, ret); + else + printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n", + __func__); + return ret; +} + +static void __exit +objlayout_exit(void) +{ + pnfs_unregister_layoutdriver(&objlayout_type); + printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n", + __func__); +} + +MODULE_ALIAS("nfs-layouttype4-2"); + +module_init(objlayout_init); +module_exit(objlayout_exit); diff --git a/kernel/fs/nfs/objlayout/objlayout.c b/kernel/fs/nfs/objlayout/objlayout.c new file mode 100644 index 000000000..919efd4a1 --- /dev/null +++ b/kernel/fs/nfs/objlayout/objlayout.c @@ -0,0 +1,706 @@ +/* + * pNFS Objects layout driver high level definitions + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* + * Create a objlayout layout structure for the given inode and return it. + */ +struct pnfs_layout_hdr * +objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct objlayout *objlay; + + objlay = kzalloc(sizeof(struct objlayout), gfp_flags); + if (!objlay) + return NULL; + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); + dprintk("%s: Return %p\n", __func__, objlay); + return &objlay->pnfs_layout; +} + +/* + * Free an objlayout layout structure + */ +void +objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct objlayout *objlay = OBJLAYOUT(lo); + + dprintk("%s: objlay %p\n", __func__, objlay); + + WARN_ON(!list_empty(&objlay->err_list)); + kfree(objlay); +} + +/* + * Unmarshall layout and store it in pnfslay. + */ +struct pnfs_layout_segment * +objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + int status = -ENOMEM; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + struct pnfs_layout_segment *lseg; + + dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); + + scratch = alloc_page(gfp_flags); + if (!scratch) + goto err_nofree; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); + if (unlikely(status)) { + dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, + status); + goto err; + } + + __free_page(scratch); + + dprintk("%s: Return %p\n", __func__, lseg); + return lseg; + +err: + __free_page(scratch); +err_nofree: + dprintk("%s: Err Return=>%d\n", __func__, status); + return ERR_PTR(status); +} + +/* + * Free a layout segement + */ +void +objlayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s: freeing layout segment %p\n", __func__, lseg); + + if (unlikely(!lseg)) + return; + + objio_free_lseg(lseg); +} + +/* + * I/O Operations + */ +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, + struct page ***p_pages, unsigned *p_pgbase, + u64 offset, unsigned long count) +{ + u64 lseg_end_offset; + + BUG_ON(offset < lseg->pls_range.offset); + lseg_end_offset = end_offset(lseg->pls_range.offset, + lseg->pls_range.length); + BUG_ON(offset >= lseg_end_offset); + WARN_ON(offset + count > lseg_end_offset); + + if (*p_pgbase > PAGE_SIZE) { + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); + *p_pages += *p_pgbase >> PAGE_SHIFT; + *p_pgbase &= ~PAGE_MASK; + } +} + +/* + * I/O done common code + */ +static void +objlayout_iodone(struct objlayout_io_res *oir) +{ + if (likely(oir->status >= 0)) { + objio_free_result(oir); + } else { + struct objlayout *objlay = oir->objlay; + + spin_lock(&objlay->lock); + objlay->delta_space_valid = OBJ_DSU_INVALID; + list_add(&objlay->err_list, &oir->err_list); + spin_unlock(&objlay->lock); + } +} + +/* + * objlayout_io_set_result - Set an osd_error code on a specific osd comp. + * + * The @index component IO failed (error returned from target). Register + * the error for later reporting at layout-return. + */ +void +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, + struct pnfs_osd_objid *pooid, int osd_error, + u64 offset, u64 length, bool is_write) +{ + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; + + BUG_ON(index >= oir->num_comps); + if (osd_error) { + ioerr->oer_component = *pooid; + ioerr->oer_comp_offset = offset; + ioerr->oer_comp_length = length; + ioerr->oer_iswrite = is_write; + ioerr->oer_errno = osd_error; + + dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " + "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, index, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + } else { + /* User need not call if no error is reported */ + ioerr->oer_errno = 0; + } +} + +/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_read_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_pgio_header *hdr; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + hdr = container_of(task, struct nfs_pgio_header, task); + + pnfs_ld_read_done(hdr); +} + +void +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) +{ + struct nfs_pgio_header *hdr = oir->rpcdata; + + oir->status = hdr->task.tk_status = status; + if (status >= 0) + hdr->res.count = status; + else + hdr->pnfs_error = status; + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, + status, hdr->res.eof, sync); + + if (sync) + pnfs_ld_read_done(hdr); + else { + INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete); + schedule_work(&hdr->task.u.tk_work); + } +} + +/* + * Perform sync or async reads. + */ +enum pnfs_try_status +objlayout_read_pagelist(struct nfs_pgio_header *hdr) +{ + struct inode *inode = hdr->inode; + loff_t offset = hdr->args.offset; + size_t count = hdr->args.count; + int err; + loff_t eof; + + eof = i_size_read(inode); + if (unlikely(offset + count > eof)) { + if (offset >= eof) { + err = 0; + hdr->res.count = 0; + hdr->res.eof = 1; + /*FIXME: do we need to call pnfs_ld_read_done() */ + goto out; + } + count = eof - offset; + } + + hdr->res.eof = (offset + count) >= eof; + _fix_verify_io_params(hdr->lseg, &hdr->args.pages, + &hdr->args.pgbase, + hdr->args.offset, hdr->args.count); + + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + __func__, inode->i_ino, offset, count, hdr->res.eof); + + err = objio_read_pagelist(hdr); + out: + if (unlikely(err)) { + hdr->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } + return PNFS_ATTEMPTED; +} + +/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_write_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_pgio_header *hdr; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + hdr = container_of(task, struct nfs_pgio_header, task); + + pnfs_ld_write_done(hdr); +} + +void +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) +{ + struct nfs_pgio_header *hdr = oir->rpcdata; + + oir->status = hdr->task.tk_status = status; + if (status >= 0) { + hdr->res.count = status; + hdr->verf.committed = oir->committed; + } else { + hdr->pnfs_error = status; + } + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, + status, hdr->verf.committed, sync); + + if (sync) + pnfs_ld_write_done(hdr); + else { + INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete); + schedule_work(&hdr->task.u.tk_work); + } +} + +/* + * Perform sync or async writes. + */ +enum pnfs_try_status +objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how) +{ + int err; + + _fix_verify_io_params(hdr->lseg, &hdr->args.pages, + &hdr->args.pgbase, + hdr->args.offset, hdr->args.count); + + err = objio_write_pagelist(hdr, how); + if (unlikely(err)) { + hdr->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } + return PNFS_ATTEMPTED; +} + +void +objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct pnfs_osd_layoutupdate lou; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + + spin_lock(&objlay->lock); + lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); + lou.dsu_delta = objlay->delta_space_used; + objlay->delta_space_used = 0; + objlay->delta_space_valid = OBJ_DSU_INIT; + lou.olu_ioerr_flag = !list_empty(&objlay->err_list); + spin_unlock(&objlay->lock); + + start = xdr_reserve_space(xdr, 4); + + BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + + dprintk("%s: Return delta_space_used %lld err %d\n", __func__, + lou.dsu_delta, lou.olu_ioerr_flag); +} + +static int +err_prio(u32 oer_errno) +{ + switch (oer_errno) { + case 0: + return 0; + + case PNFS_OSD_ERR_RESOURCE: + return OSD_ERR_PRI_RESOURCE; + case PNFS_OSD_ERR_BAD_CRED: + return OSD_ERR_PRI_BAD_CRED; + case PNFS_OSD_ERR_NO_ACCESS: + return OSD_ERR_PRI_NO_ACCESS; + case PNFS_OSD_ERR_UNREACHABLE: + return OSD_ERR_PRI_UNREACHABLE; + case PNFS_OSD_ERR_NOT_FOUND: + return OSD_ERR_PRI_NOT_FOUND; + case PNFS_OSD_ERR_NO_SPACE: + return OSD_ERR_PRI_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case PNFS_OSD_ERR_EIO: + return OSD_ERR_PRI_EIO; + } +} + +static void +merge_ioerr(struct pnfs_osd_ioerr *dest_err, + const struct pnfs_osd_ioerr *src_err) +{ + u64 dest_end, src_end; + + if (!dest_err->oer_errno) { + *dest_err = *src_err; + /* accumulated device must be blank */ + memset(&dest_err->oer_component.oid_device_id, 0, + sizeof(dest_err->oer_component.oid_device_id)); + + return; + } + + if (dest_err->oer_component.oid_partition_id != + src_err->oer_component.oid_partition_id) + dest_err->oer_component.oid_partition_id = 0; + + if (dest_err->oer_component.oid_object_id != + src_err->oer_component.oid_object_id) + dest_err->oer_component.oid_object_id = 0; + + if (dest_err->oer_comp_offset > src_err->oer_comp_offset) + dest_err->oer_comp_offset = src_err->oer_comp_offset; + + dest_end = end_offset(dest_err->oer_comp_offset, + dest_err->oer_comp_length); + src_end = end_offset(src_err->oer_comp_offset, + src_err->oer_comp_length); + if (dest_end < src_end) + dest_end = src_end; + + dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; + + if ((src_err->oer_iswrite == dest_err->oer_iswrite) && + (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { + dest_err->oer_errno = src_err->oer_errno; + } else if (src_err->oer_iswrite) { + dest_err->oer_iswrite = true; + dest_err->oer_errno = src_err->oer_errno; + } +} + +static void +encode_accumulated_error(struct objlayout *objlay, __be32 *p) +{ + struct objlayout_io_res *oir, *tmp; + struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; + + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { + unsigned i; + + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " + "is_write=%d dev(%llx:%llx) par=0x%llx " + "obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + merge_ioerr(&accumulated_err, ioerr); + } + list_del(&oir->err_list); + objio_free_result(oir); + } + + pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); +} + +void +objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct objlayout_io_res *oir, *tmp; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + spin_lock(&objlay->lock); + + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { + __be32 *last_xdr = NULL, *p; + unsigned i; + int res = 0; + + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + dprintk("%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + p = pnfs_osd_xdr_ioerr_reserve_space(xdr); + if (unlikely(!p)) { + res = -E2BIG; + break; /* accumulated_error */ + } + + last_xdr = p; + pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); + } + + /* TODO: use xdr_write_pages */ + if (unlikely(res)) { + /* no space for even one error descriptor */ + BUG_ON(!last_xdr); + + /* we've encountered a situation with lots and lots of + * errors and no space to encode them all. Use the last + * available slot to report the union of all the + * remaining errors. + */ + encode_accumulated_error(objlay, last_xdr); + goto loop_done; + } + list_del(&oir->err_list); + objio_free_result(oir); + } +loop_done: + spin_unlock(&objlay->lock); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + +enum { + OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, + OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, + OSD_LOGIN_UPCALL_PATHLEN = 256 +}; + +static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; + +module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), + 0600); +MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); + +struct __auto_login { + char uri[OBJLAYOUT_MAX_URI_LEN]; + char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; + char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; +}; + +static int __objlayout_upcall(struct __auto_login *login) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[8]; + int ret; + + if (unlikely(!osd_login_prog[0])) { + dprintk("%s: osd_login_prog is disabled\n", __func__); + return -EACCES; + } + + dprintk("%s uri: %s\n", __func__, login->uri); + dprintk("%s osdname %s\n", __func__, login->osdname); + dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); + + argv[0] = (char *)osd_login_prog; + argv[1] = "-u"; + argv[2] = login->uri; + argv[3] = "-o"; + argv[4] = login->osdname; + argv[5] = "-s"; + argv[6] = login->systemid_hex; + argv[7] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the objlayoutdriver.osd_login_prog module parameter once + * the problem has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) { + printk(KERN_ERR "PNFS-OBJ: %s was not found please set " + "objlayoutdriver.osd_login_prog kernel parameter!\n", + osd_login_prog); + osd_login_prog[0] = '\0'; + } + dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); + + return ret; +} + +/* Assume dest is all zeros */ +static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, + char *dest, int max_len, + const char *var_name) +{ + if (!s.len) + return; + + if (s.len >= max_len) { + pr_warn_ratelimited( + "objlayout_autologin: %s: s.len(%d) >= max_len(%d)", + var_name, s.len, max_len); + s.len = max_len - 1; /* space for null terminator */ + } + + memcpy(dest, s.data, s.len); +} + +/* Assume sysid is all zeros */ +static void _sysid_2_hex(struct nfs4_string s, + char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) +{ + int i; + char *cur; + + if (!s.len) + return; + + if (s.len != OSD_SYSTEMID_LEN) { + pr_warn_ratelimited( + "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN", + s.len); + if (s.len > OSD_SYSTEMID_LEN) + s.len = OSD_SYSTEMID_LEN; + } + + cur = sysid; + for (i = 0; i < s.len; i++) + cur = hex_byte_pack(cur, s.data[i]); +} + +int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) +{ + int rc; + struct __auto_login login; + + if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) + return -ENODEV; + + memset(&login, 0, sizeof(login)); + __copy_nfsS_and_zero_terminate( + deviceaddr->oda_targetaddr.ota_netaddr.r_addr, + login.uri, sizeof(login.uri), "URI"); + + __copy_nfsS_and_zero_terminate( + deviceaddr->oda_osdname, + login.osdname, sizeof(login.osdname), "OSDNAME"); + + _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); + + rc = __objlayout_upcall(&login); + if (rc > 0) /* script returns positive values */ + rc = -ENODEV; + + return rc; +} diff --git a/kernel/fs/nfs/objlayout/objlayout.h b/kernel/fs/nfs/objlayout/objlayout.h new file mode 100644 index 000000000..2641dbad3 --- /dev/null +++ b/kernel/fs/nfs/objlayout/objlayout.h @@ -0,0 +1,184 @@ +/* + * Data types and function declerations for interfacing with the + * pNFS standard object layout driver. + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _OBJLAYOUT_H +#define _OBJLAYOUT_H + +#include +#include +#include "../pnfs.h" + +/* + * per-inode layout + */ +struct objlayout { + struct pnfs_layout_hdr pnfs_layout; + + /* for layout_commit */ + enum osd_delta_space_valid_enum { + OBJ_DSU_INIT = 0, + OBJ_DSU_VALID, + OBJ_DSU_INVALID, + } delta_space_valid; + s64 delta_space_used; /* consumed by write ops */ + + /* for layout_return */ + spinlock_t lock; + struct list_head err_list; +}; + +static inline struct objlayout * +OBJLAYOUT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct objlayout, pnfs_layout); +} + +/* + * per-I/O operation state + * embedded in objects provider io_state data structure + */ +struct objlayout_io_res { + struct objlayout *objlay; + + void *rpcdata; + int status; /* res */ + int committed; /* res */ + + /* Error reporting (layout_return) */ + struct list_head err_list; + unsigned num_comps; + /* Pointer to array of error descriptors of size num_comps. + * It should contain as many entries as devices in the osd_layout + * that participate in the I/O. It is up to the io_engine to allocate + * needed space and set num_comps. + */ + struct pnfs_osd_ioerr *ioerrs; +}; + +static inline +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, + struct pnfs_osd_ioerr *ioerrs, void *rpcdata, + struct pnfs_layout_hdr *pnfs_layout_type) +{ + oir->objlay = OBJLAYOUT(pnfs_layout_type); + oir->rpcdata = rpcdata; + INIT_LIST_HEAD(&oir->err_list); + oir->num_comps = num_comps; + oir->ioerrs = ioerrs; +} + +/* + * Raid engine I/O API + */ +extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags); +extern void objio_free_lseg(struct pnfs_layout_segment *lseg); + +/* objio_free_result will free these @oir structs received from + * objlayout_{read,write}_done + */ +extern void objio_free_result(struct objlayout_io_res *oir); + +extern int objio_read_pagelist(struct nfs_pgio_header *rdata); +extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how); + +/* + * callback API + */ +extern void objlayout_io_set_result(struct objlayout_io_res *oir, + unsigned index, struct pnfs_osd_objid *pooid, + int osd_error, u64 offset, u64 length, bool is_write); + +static inline void +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) +{ + /* If one of the I/Os errored out and the delta_space_used was + * invalid we render the complete report as invalid. Protocol mandate + * the DSU be accurate or not reported. + */ + spin_lock(&objlay->lock); + if (objlay->delta_space_valid != OBJ_DSU_INVALID) { + objlay->delta_space_valid = OBJ_DSU_VALID; + objlay->delta_space_used += space_used; + } + spin_unlock(&objlay->lock); +} + +extern void objlayout_read_done(struct objlayout_io_res *oir, + ssize_t status, bool sync); +extern void objlayout_write_done(struct objlayout_io_res *oir, + ssize_t status, bool sync); + +/* + * exported generic objects function vectors + */ + +extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); +extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); + +extern struct pnfs_layout_segment *objlayout_alloc_lseg( + struct pnfs_layout_hdr *, + struct nfs4_layoutget_res *, + gfp_t gfp_flags); +extern void objlayout_free_lseg(struct pnfs_layout_segment *); + +extern enum pnfs_try_status objlayout_read_pagelist( + struct nfs_pgio_header *); + +extern enum pnfs_try_status objlayout_write_pagelist( + struct nfs_pgio_header *, + int how); + +extern void objlayout_encode_layoutcommit( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutcommit_args *); + +extern void objlayout_encode_layoutreturn( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutreturn_args *); + +extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); + +#endif /* _OBJLAYOUT_H */ diff --git a/kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c new file mode 100644 index 000000000..f093c7ec9 --- /dev/null +++ b/kernel/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -0,0 +1,415 @@ +/* + * Object-Based pNFS Layout XDR layer + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * The following implementation is based on RFC5664 + */ + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static __be32 * +_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) +{ + p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, + sizeof(objid->oid_device_id.data)); + + p = xdr_decode_hyper(p, &objid->oid_partition_id); + p = xdr_decode_hyper(p, &objid->oid_object_id); + return p; +} +/* + * struct pnfs_osd_opaque_cred { + * u32 cred_len; + * void *cred; + * }; // xdr size [variable] + * The return pointers are from the xdr buffer + */ +static int +_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 1); + + if (!p) + return -EINVAL; + + opaque_cred->cred_len = be32_to_cpu(*p++); + + p = xdr_inline_decode(xdr, opaque_cred->cred_len); + if (!p) + return -EINVAL; + + opaque_cred->cred = p; + return 0; +} + +/* + * struct pnfs_osd_object_cred { + * struct pnfs_osd_objid oc_object_id; + * u32 oc_osd_version; + * u32 oc_cap_key_sec; + * struct pnfs_osd_opaque_cred oc_cap_key + * struct pnfs_osd_opaque_cred oc_cap; + * }; // xdr size 32 + 4 + 4 + [variable] + [variable] + */ +static int +_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); + int ret; + + if (!p) + return -EIO; + + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p); + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); + if (unlikely(ret)) + return ret; + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); + return ret; +} + +/* + * struct pnfs_osd_data_map { + * u32 odm_num_comps; + * u64 odm_stripe_unit; + * u32 odm_group_width; + * u32 odm_group_depth; + * u32 odm_mirror_cnt; + * u32 odm_raid_algorithm; + * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 + */ +static inline int +_osd_data_map_xdr_sz(void) +{ + return 4 + 8 + 4 + 4 + 4 + 4; +} + +static __be32 * +_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) +{ + data_map->odm_num_comps = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); + data_map->odm_group_width = be32_to_cpup(p++); + data_map->odm_group_depth = be32_to_cpup(p++); + data_map->odm_mirror_cnt = be32_to_cpup(p++); + data_map->odm_raid_algorithm = be32_to_cpup(p++); + dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " + "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", + __func__, + data_map->odm_num_comps, + (unsigned long long)data_map->odm_stripe_unit, + data_map->odm_group_width, + data_map->odm_group_depth, + data_map->odm_mirror_cnt, + data_map->odm_raid_algorithm); + return p; +} + +int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) +{ + __be32 *p; + + memset(iter, 0, sizeof(*iter)); + + p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); + if (unlikely(!p)) + return -EINVAL; + + p = _osd_xdr_decode_data_map(p, &layout->olo_map); + layout->olo_comps_index = be32_to_cpup(p++); + layout->olo_num_comps = be32_to_cpup(p++); + dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, + layout->olo_comps_index, layout->olo_num_comps); + + iter->total_comps = layout->olo_num_comps; + return 0; +} + +bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, + int *err) +{ + BUG_ON(iter->decoded_comps > iter->total_comps); + if (iter->decoded_comps == iter->total_comps) + return false; + + *err = _osd_xdr_decode_object_cred(comp, xdr); + if (unlikely(*err)) { + dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " + "total_comps=%d\n", __func__, *err, + iter->decoded_comps, iter->total_comps); + return false; /* stop the loop */ + } + dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " + "key_len=%u cap_len=%u\n", + __func__, + _DEVID_LO(&comp->oc_object_id.oid_device_id), + _DEVID_HI(&comp->oc_object_id.oid_device_id), + comp->oc_object_id.oid_partition_id, + comp->oc_object_id.oid_object_id, + comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); + + iter->decoded_comps++; + return true; +} + +/* + * Get Device Information Decoding + * + * Note: since Device Information is currently done synchronously, all + * variable strings fields are left inside the rpc buffer and are only + * pointed to by the pnfs_osd_deviceaddr members. So the read buffer + * should not be freed while the returned information is in use. + */ +/* + *struct nfs4_string { + * unsigned int len; + * char *data; + *}; // size [variable] + * NOTE: Returned string points to inside the XDR buffer + */ +static __be32 * +__read_u8_opaque(__be32 *p, struct nfs4_string *str) +{ + str->len = be32_to_cpup(p++); + str->data = (char *)p; + + p += XDR_QUADLEN(str->len); + return p; +} + +/* + * struct pnfs_osd_targetid { + * u32 oti_type; + * struct nfs4_string oti_scsi_device_id; + * };// size 4 + [variable] + */ +static __be32 * +__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) +{ + u32 oti_type; + + oti_type = be32_to_cpup(p++); + targetid->oti_type = oti_type; + + switch (oti_type) { + case OBJ_TARGET_SCSI_NAME: + case OBJ_TARGET_SCSI_DEVICE_ID: + p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); + } + + return p; +} + +/* + * struct pnfs_osd_net_addr { + * struct nfs4_string r_netid; + * struct nfs4_string r_addr; + * }; + */ +static __be32 * +__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) +{ + p = __read_u8_opaque(p, &netaddr->r_netid); + p = __read_u8_opaque(p, &netaddr->r_addr); + + return p; +} + +/* + * struct pnfs_osd_targetaddr { + * u32 ota_available; + * struct pnfs_osd_net_addr ota_netaddr; + * }; + */ +static __be32 * +__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) +{ + u32 ota_available; + + ota_available = be32_to_cpup(p++); + targetaddr->ota_available = ota_available; + + if (ota_available) + p = __read_net_addr(p, &targetaddr->ota_netaddr); + + + return p; +} + +/* + * struct pnfs_osd_deviceaddr { + * struct pnfs_osd_targetid oda_targetid; + * struct pnfs_osd_targetaddr oda_targetaddr; + * u8 oda_lun[8]; + * struct nfs4_string oda_systemid; + * struct pnfs_osd_object_cred oda_root_obj_cred; + * struct nfs4_string oda_osdname; + * }; + */ + +/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does + * not have an xdr_stream + */ +static __be32 * +__read_opaque_cred(__be32 *p, + struct pnfs_osd_opaque_cred *opaque_cred) +{ + opaque_cred->cred_len = be32_to_cpu(*p++); + opaque_cred->cred = p; + return p + XDR_QUADLEN(opaque_cred->cred_len); +} + +static __be32 * +__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) +{ + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p++); + + p = __read_opaque_cred(p, &comp->oc_cap_key); + p = __read_opaque_cred(p, &comp->oc_cap); + return p; +} + +void pnfs_osd_xdr_decode_deviceaddr( + struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) +{ + p = __read_targetid(p, &deviceaddr->oda_targetid); + + p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); + + p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, + sizeof(deviceaddr->oda_lun)); + + p = __read_u8_opaque(p, &deviceaddr->oda_systemid); + + p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); + + p = __read_u8_opaque(p, &deviceaddr->oda_osdname); + + /* libosd likes this terminated in dbg. It's last, so no problems */ + deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; +} + +/* + * struct pnfs_osd_layoutupdate { + * u32 dsu_valid; + * s64 dsu_delta; + * u32 olu_ioerr_flag; + * }; xdr size 4 + 8 + 4 + */ +int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, + struct pnfs_osd_layoutupdate *lou) +{ + __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); + + if (!p) + return -E2BIG; + + *p++ = cpu_to_be32(lou->dsu_valid); + if (lou->dsu_valid) + p = xdr_encode_hyper(p, lou->dsu_delta); + *p++ = cpu_to_be32(lou->olu_ioerr_flag); + return 0; +} + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static inline __be32 * +pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) +{ + p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, + sizeof(object_id->oid_device_id.data)); + p = xdr_encode_hyper(p, object_id->oid_partition_id); + p = xdr_encode_hyper(p, object_id->oid_object_id); + + return p; +} + +/* + * struct pnfs_osd_ioerr { + * struct pnfs_osd_objid oer_component; + * u64 oer_comp_offset; + * u64 oer_comp_length; + * u32 oer_iswrite; + * u32 oer_errno; + * }; // xdr size 32 + 24 bytes + */ +void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) +{ + p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); + p = xdr_encode_hyper(p, ioerr->oer_comp_offset); + p = xdr_encode_hyper(p, ioerr->oer_comp_length); + *p++ = cpu_to_be32(ioerr->oer_iswrite); + *p = cpu_to_be32(ioerr->oer_errno); +} + +__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 32 + 24); + if (unlikely(!p)) + dprintk("%s: out of xdr space\n", __func__); + + return p; +} diff --git a/kernel/fs/nfs/pagelist.c b/kernel/fs/nfs/pagelist.c new file mode 100644 index 000000000..282b39369 --- /dev/null +++ b/kernel/fs/nfs/pagelist.c @@ -0,0 +1,1309 @@ +/* + * linux/fs/nfs/pagelist.c + * + * A set of helper functions for managing NFS read and write requests. + * The main purpose of these routines is to provide support for the + * coalescing of several requests into a single RPC call. + * + * Copyright 2000, 2001 (c) Trond Myklebust + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + +static struct kmem_cache *nfs_page_cachep; +static const struct rpc_call_ops nfs_pgio_common_ops; + +static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +{ + p->npages = pagecount; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + if (!p->pagevec) + p->npages = 0; + } + return p->pagevec != NULL; +} + +struct nfs_pgio_mirror * +nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) +{ + return nfs_pgio_has_mirroring(desc) ? + &desc->pg_mirrors[desc->pg_mirror_idx] : + &desc->pg_mirrors[0]; +} +EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror); + +void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, + void (*release)(struct nfs_pgio_header *hdr)) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + + hdr->req = nfs_list_entry(mirror->pg_list.next); + hdr->inode = desc->pg_inode; + hdr->cred = hdr->req->wb_context->cred; + hdr->io_start = req_offset(hdr->req); + hdr->good_bytes = mirror->pg_count; + hdr->dreq = desc->pg_dreq; + hdr->layout_private = desc->pg_layout_private; + hdr->release = release; + hdr->completion_ops = desc->pg_completion_ops; + if (hdr->completion_ops->init_hdr) + hdr->completion_ops->init_hdr(hdr); + + hdr->pgio_mirror_idx = desc->pg_mirror_idx; +} +EXPORT_SYMBOL_GPL(nfs_pgheader_init); + +void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) +{ + spin_lock(&hdr->lock); + if (pos < hdr->io_start + hdr->good_bytes) { + set_bit(NFS_IOHDR_ERROR, &hdr->flags); + clear_bit(NFS_IOHDR_EOF, &hdr->flags); + hdr->good_bytes = pos - hdr->io_start; + hdr->error = error; + } + spin_unlock(&hdr->lock); +} + +static inline struct nfs_page * +nfs_page_alloc(void) +{ + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); + if (p) + INIT_LIST_HEAD(&p->wb_list); + return p; +} + +static inline void +nfs_page_free(struct nfs_page *p) +{ + kmem_cache_free(nfs_page_cachep, p); +} + +static void +nfs_iocounter_inc(struct nfs_io_counter *c) +{ + atomic_inc(&c->io_count); +} + +static void +nfs_iocounter_dec(struct nfs_io_counter *c) +{ + if (atomic_dec_and_test(&c->io_count)) { + clear_bit(NFS_IO_INPROGRESS, &c->flags); + smp_mb__after_atomic(); + wake_up_bit(&c->flags, NFS_IO_INPROGRESS); + } +} + +static int +__nfs_iocounter_wait(struct nfs_io_counter *c) +{ + wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); + DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); + int ret = 0; + + do { + prepare_to_wait(wq, &q.wait, TASK_KILLABLE); + set_bit(NFS_IO_INPROGRESS, &c->flags); + if (atomic_read(&c->io_count) == 0) + break; + ret = nfs_wait_bit_killable(&q.key); + } while (atomic_read(&c->io_count) != 0 && !ret); + finish_wait(wq, &q.wait); + return ret; +} + +/** + * nfs_iocounter_wait - wait for i/o to complete + * @c: nfs_io_counter to use + * + * returns -ERESTARTSYS if interrupted by a fatal signal. + * Otherwise returns 0 once the io_count hits 0. + */ +int +nfs_iocounter_wait(struct nfs_io_counter *c) +{ + if (atomic_read(&c->io_count) == 0) + return 0; + return __nfs_iocounter_wait(c); +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req - request in group that is to be locked + * @nonblock - if true don't block waiting for lock + * + * this lock must be held if modifying the page group list + * + * return 0 on success, < 0 on error: -EDELAY if nonblocking or the + * result from wait_on_bit_lock + * + * NOTE: calling with nonblock=false should always have set the + * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock + * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. + */ +int +nfs_page_group_lock(struct nfs_page *req, bool nonblock) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) + return 0; + + if (!nonblock) + return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + TASK_UNINTERRUPTIBLE); + + return -EAGAIN; +} + +/* + * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it + * @req - a request in the group + * + * This is a blocking call to wait for the group lock to be cleared. + */ +void +nfs_page_group_lock_wait(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + wait_on_bit(&head->wb_flags, PG_HEADLOCK, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req - request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + smp_mb__before_atomic(); + clear_bit(PG_HEADLOCK, &head->wb_flags); + smp_mb__after_atomic(); + wake_up_bit(&head->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_sync_on_bit_locked + * + * must be called with page group lock held + */ +static bool +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +{ + struct nfs_page *head = req->wb_head; + struct nfs_page *tmp; + + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); + + tmp = req->wb_this_page; + while (tmp != req) { + if (!test_bit(bit, &tmp->wb_flags)) + return false; + tmp = tmp->wb_this_page; + } + + /* true! reset all bits */ + tmp = req; + do { + clear_bit(bit, &tmp->wb_flags); + tmp = tmp->wb_this_page; + } while (tmp != req); + + return true; +} + +/* + * nfs_page_group_sync_on_bit - set bit on current request, but only + * return true if the bit is set for all requests in page group + * @req - request in page group + * @bit - PG_* bit that is used to sync page group + */ +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) +{ + bool ret; + + nfs_page_group_lock(req, false); + ret = nfs_page_group_sync_on_bit_locked(req, bit); + nfs_page_group_unlock(req); + + return ret; +} + +/* + * nfs_page_group_init - Initialize the page group linkage for @req + * @req - a new nfs request + * @prev - the previous request in page group, or NULL if @req is the first + * or only request in the group (the head). + */ +static inline void +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) +{ + struct inode *inode; + WARN_ON_ONCE(prev == req); + + if (!prev) { + /* a head request */ + req->wb_head = req; + req->wb_this_page = req; + } else { + /* a subrequest */ + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); + req->wb_head = prev->wb_head; + req->wb_this_page = prev->wb_this_page; + prev->wb_this_page = req; + + /* All subrequests take a ref on the head request until + * nfs_page_group_destroy is called */ + kref_get(&req->wb_head->wb_kref); + + /* grab extra ref and bump the request count if head request + * has extra ref from the write/commit path to handle handoff + * between write and commit lists. */ + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { + inode = page_file_mapping(req->wb_page)->host; + set_bit(PG_INODE_REF, &req->wb_flags); + kref_get(&req->wb_kref); + spin_lock(&inode->i_lock); + NFS_I(inode)->nrequests++; + spin_unlock(&inode->i_lock); + } + } +} + +/* + * nfs_page_group_destroy - sync the destruction of page groups + * @req - request that no longer needs the page group + * + * releases the page group reference from each member once all + * members have called this function. + */ +static void +nfs_page_group_destroy(struct kref *kref) +{ + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *tmp, *next; + + /* subrequests must release the ref on the head request */ + if (req->wb_head != req) + nfs_release_request(req->wb_head); + + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) + return; + + tmp = req; + do { + next = tmp->wb_this_page; + /* unlink and free */ + tmp->wb_this_page = tmp; + tmp->wb_head = tmp; + nfs_free_request(tmp); + tmp = next; + } while (tmp != req); +} + +/** + * nfs_create_request - Create an NFS read/write request. + * @ctx: open context to use + * @page: page to write + * @last: last nfs request created for this page group or NULL if head + * @offset: starting offset within the page for the write + * @count: number of bytes to read/write + * + * The page must be locked by the caller. This makes sure we never + * create two different requests for the same page. + * User should ensure it is safe to sleep in this function. + */ +struct nfs_page * +nfs_create_request(struct nfs_open_context *ctx, struct page *page, + struct nfs_page *last, unsigned int offset, + unsigned int count) +{ + struct nfs_page *req; + struct nfs_lock_context *l_ctx; + + if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) + return ERR_PTR(-EBADF); + /* try to allocate the request struct */ + req = nfs_page_alloc(); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + /* get lock context early so we can deal with alloc failures */ + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) { + nfs_page_free(req); + return ERR_CAST(l_ctx); + } + req->wb_lock_context = l_ctx; + nfs_iocounter_inc(&l_ctx->io_count); + + /* Initialize the request struct. Initially, we assume a + * long write-back delay. This will be adjusted in + * update_nfs_request below if the region is not locked. */ + req->wb_page = page; + req->wb_index = page_file_index(page); + page_cache_get(page); + req->wb_offset = offset; + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); + kref_init(&req->wb_kref); + nfs_page_group_init(req, last); + return req; +} + +/** + * nfs_unlock_request - Unlock request and wake up sleepers. + * @req: + */ +void nfs_unlock_request(struct nfs_page *req) +{ + if (!NFS_WBACK_BUSY(req)) { + printk(KERN_ERR "NFS: Invalid unlock attempted\n"); + BUG(); + } + smp_mb__before_atomic(); + clear_bit(PG_BUSY, &req->wb_flags); + smp_mb__after_atomic(); + wake_up_bit(&req->wb_flags, PG_BUSY); +} + +/** + * nfs_unlock_and_release_request - Unlock request and release the nfs_page + * @req: + */ +void nfs_unlock_and_release_request(struct nfs_page *req) +{ + nfs_unlock_request(req); + nfs_release_request(req); +} + +/* + * nfs_clear_request - Free up all resources allocated to the request + * @req: + * + * Release page and open context resources associated with a read/write + * request after it has completed. + */ +static void nfs_clear_request(struct nfs_page *req) +{ + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; + struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } + if (l_ctx != NULL) { + nfs_iocounter_dec(&l_ctx->io_count); + nfs_put_lock_context(l_ctx); + req->wb_lock_context = NULL; + } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } +} + +/** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release + * + * Note: Should never be called with the spinlock held! + */ +void nfs_free_request(struct nfs_page *req) +{ + WARN_ON_ONCE(req->wb_this_page != req); + + /* extra debug: make sure no sync bits are still set */ + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags)); + + /* Release struct file and open context */ + nfs_clear_request(req); + nfs_page_free(req); +} + +void nfs_release_request(struct nfs_page *req) +{ + kref_put(&req->wb_kref, nfs_page_group_destroy); +} + +/** + * nfs_wait_on_request - Wait for a request to complete. + * @req: request to wait upon. + * + * Interruptible by fatal signals only. + * The user is responsible for holding a count on the request. + */ +int +nfs_wait_on_request(struct nfs_page *req) +{ + return wait_on_bit_io(&req->wb_flags, PG_BUSY, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_generic_pg_test - determine if requests can be coalesced + * @desc: pointer to descriptor + * @prev: previous request in desc, or NULL + * @req: this request + * + * Returns zero if @req can be coalesced into @desc, otherwise it returns + * the size of the request. + */ +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, + struct nfs_page *prev, struct nfs_page *req) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + + if (mirror->pg_count > mirror->pg_bsize) { + /* should never happen */ + WARN_ON_ONCE(1); + return 0; + } + + /* + * Limit the request size so that we can still allocate a page array + * for it without upsetting the slab allocator. + */ + if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) * + sizeof(struct page) > PAGE_SIZE) + return 0; + + return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes); +} +EXPORT_SYMBOL_GPL(nfs_generic_pg_test); + +struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) +{ + struct nfs_pgio_header *hdr = ops->rw_alloc_header(); + + if (hdr) { + INIT_LIST_HEAD(&hdr->pages); + spin_lock_init(&hdr->lock); + hdr->rw_ops = ops; + } + return hdr; +} +EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); + +/* + * nfs_pgio_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_pgio_header_free(struct nfs_pgio_header *hdr) +{ + hdr->rw_ops->rw_free_header(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgio_header_free); + +/** + * nfs_pgio_data_destroy - make @hdr suitable for reuse + * + * Frees memory and releases refs from nfs_generic_pgio, so that it may + * be called again. + * + * @hdr: A header that has had nfs_generic_pgio called + */ +void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) +{ + if (hdr->args.context) + put_nfs_open_context(hdr->args.context); + if (hdr->page_array.pagevec != hdr->page_array.page_array) + kfree(hdr->page_array.pagevec); +} +EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); + +/** + * nfs_pgio_rpcsetup - Set up arguments for a pageio call + * @hdr: The pageio hdr + * @count: Number of bytes to read + * @offset: Initial offset + * @how: How to commit data (writes only) + * @cinfo: Commit information for the call (writes only) + */ +static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, + unsigned int count, unsigned int offset, + int how, struct nfs_commit_info *cinfo) +{ + struct nfs_page *req = hdr->req; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with hdr->commit et al. */ + + hdr->args.fh = NFS_FH(hdr->inode); + hdr->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + hdr->mds_offset = hdr->args.offset; + hdr->args.pgbase = req->wb_pgbase + offset; + hdr->args.pages = hdr->page_array.pagevec; + hdr->args.count = count; + hdr->args.context = get_nfs_open_context(req->wb_context); + hdr->args.lock_context = req->wb_lock_context; + hdr->args.stable = NFS_UNSTABLE; + switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + case 0: + break; + case FLUSH_COND_STABLE: + if (nfs_reqs_to_commit(cinfo)) + break; + default: + hdr->args.stable = NFS_FILE_SYNC; + } + + hdr->res.fattr = &hdr->fattr; + hdr->res.count = count; + hdr->res.eof = 0; + hdr->res.verf = &hdr->verf; + nfs_fattr_init(&hdr->fattr); +} + +/** + * nfs_pgio_prepare - Prepare pageio hdr to go over the wire + * @task: The current task + * @calldata: pageio header to prepare + */ +static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_header *hdr = calldata; + int err; + err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr); + if (err) + rpc_exit(task, err); +} + +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, + struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, + const struct rpc_call_ops *call_ops, int how, int flags) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &hdr->args, + .rpc_resp = &hdr->res, + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .task = &hdr->task, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = hdr, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | flags, + }; + int ret = 0; + + hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); + + dprintk("NFS: %5u initiated pgio call " + "(req %s/%llu, %u bytes @ offset %llu)\n", + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_pgio); + +/** + * nfs_pgio_error - Clean up from a pageio error + * @desc: IO descriptor + * @hdr: pageio header + */ +static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + struct nfs_pgio_mirror *mirror; + u32 midx; + + set_bit(NFS_IOHDR_REDO, &hdr->flags); + nfs_pgio_data_destroy(hdr); + hdr->completion_ops->completion(hdr); + /* TODO: Make sure it's right to clean up all mirrors here + * and not just hdr->pgio_mirror_idx */ + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + } + return -ENOMEM; +} + +/** + * nfs_pgio_release - Release pageio data + * @calldata: The pageio header to release + */ +static void nfs_pgio_release(void *calldata) +{ + struct nfs_pgio_header *hdr = calldata; + if (hdr->rw_ops->rw_release) + hdr->rw_ops->rw_release(hdr); + nfs_pgio_data_destroy(hdr); + hdr->completion_ops->completion(hdr); +} + +static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror, + unsigned int bsize) +{ + INIT_LIST_HEAD(&mirror->pg_list); + mirror->pg_bytes_written = 0; + mirror->pg_count = 0; + mirror->pg_bsize = bsize; + mirror->pg_base = 0; + mirror->pg_recoalesce = 0; +} + +/** + * nfs_pageio_init - initialise a page io descriptor + * @desc: pointer to descriptor + * @inode: pointer to inode + * @doio: pointer to io function + * @bsize: io block size + * @io_flags: extra parameters for the io function + */ +void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + const struct nfs_pageio_ops *pg_ops, + const struct nfs_pgio_completion_ops *compl_ops, + const struct nfs_rw_ops *rw_ops, + size_t bsize, + int io_flags) +{ + struct nfs_pgio_mirror *new; + int i; + + desc->pg_moreio = 0; + desc->pg_inode = inode; + desc->pg_ops = pg_ops; + desc->pg_completion_ops = compl_ops; + desc->pg_rw_ops = rw_ops; + desc->pg_ioflags = io_flags; + desc->pg_error = 0; + desc->pg_lseg = NULL; + desc->pg_dreq = NULL; + desc->pg_layout_private = NULL; + desc->pg_bsize = bsize; + + desc->pg_mirror_count = 1; + desc->pg_mirror_idx = 0; + + if (pg_ops->pg_get_mirror_count) { + /* until we have a request, we don't have an lseg and no + * idea how many mirrors there will be */ + new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, + sizeof(struct nfs_pgio_mirror), GFP_KERNEL); + desc->pg_mirrors_dynamic = new; + desc->pg_mirrors = new; + + for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++) + nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize); + } else { + desc->pg_mirrors_dynamic = NULL; + desc->pg_mirrors = desc->pg_mirrors_static; + nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); + } +} +EXPORT_SYMBOL_GPL(nfs_pageio_init); + +/** + * nfs_pgio_result - Basic pageio error handling + * @task: The task that ran + * @calldata: Pageio header to check + */ +static void nfs_pgio_result(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_header *hdr = calldata; + struct inode *inode = hdr->inode; + + dprintk("NFS: %s: %5u, (status %d)\n", __func__, + task->tk_pid, task->tk_status); + + if (hdr->rw_ops->rw_done(task, hdr, inode) != 0) + return; + if (task->tk_status < 0) + nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset); + else + hdr->rw_ops->rw_result(task, hdr); +} + +/* + * Create an RPC task for the given read or write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. + */ +int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + struct nfs_page *req; + struct page **pages, + *last_page; + struct list_head *head = &mirror->pg_list; + struct nfs_commit_info cinfo; + unsigned int pagecount, pageused; + + pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); + if (!nfs_pgarray_set(&hdr->page_array, pagecount)) + return nfs_pgio_error(desc, hdr); + + nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); + pages = hdr->page_array.pagevec; + last_page = NULL; + pageused = 0; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &hdr->pages); + + if (!last_page || last_page != req->wb_page) { + pageused++; + if (pageused > pagecount) + break; + *pages++ = last_page = req->wb_page; + } + } + if (WARN_ON_ONCE(pageused != pagecount)) + return nfs_pgio_error(desc, hdr); + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + /* Set up the argument struct */ + nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo); + desc->pg_rpc_callops = &nfs_pgio_common_ops; + return 0; +} +EXPORT_SYMBOL_GPL(nfs_generic_pgio); + +static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) +{ + struct nfs_pgio_mirror *mirror; + struct nfs_pgio_header *hdr; + int ret; + + mirror = nfs_pgio_current_mirror(desc); + + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); + if (!hdr) { + /* TODO: make sure this is right with mirroring - or + * should it back out all mirrors? */ + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + return -ENOMEM; + } + nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); + ret = nfs_generic_pgio(desc, hdr); + if (ret == 0) + ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), + hdr, + hdr->cred, + NFS_PROTO(hdr->inode), + desc->pg_rpc_callops, + desc->pg_ioflags, 0); + return ret; +} + +/* + * nfs_pageio_setup_mirroring - determine if mirroring is to be used + * by calling the pg_get_mirror_count op + */ +static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + int mirror_count = 1; + + if (!pgio->pg_ops->pg_get_mirror_count) + return 0; + + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) + return -EINVAL; + + if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic)) + return -EINVAL; + + pgio->pg_mirror_count = mirror_count; + + return 0; +} + +/* + * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) + */ +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_mirror_count = 1; + pgio->pg_mirror_idx = 0; +} + +static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_mirror_count = 1; + pgio->pg_mirror_idx = 0; + pgio->pg_mirrors = pgio->pg_mirrors_static; + kfree(pgio->pg_mirrors_dynamic); + pgio->pg_mirrors_dynamic = NULL; +} + +static bool nfs_match_open_context(const struct nfs_open_context *ctx1, + const struct nfs_open_context *ctx2) +{ + return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + +static bool nfs_match_lock_context(const struct nfs_lock_context *l1, + const struct nfs_lock_context *l2) +{ + return l1->lockowner.l_owner == l2->lockowner.l_owner + && l1->lockowner.l_pid == l2->lockowner.l_pid; +} + +/** + * nfs_can_coalesce_requests - test two requests for compatibility + * @prev: pointer to nfs_page + * @req: pointer to nfs_page + * + * The nfs_page structures 'prev' and 'req' are compared to ensure that the + * page data area they describe is contiguous, and that their RPC + * credentials, NFSv4 open state, and lockowners are the same. + * + * Return 'true' if this is the case, else return 'false'. + */ +static bool nfs_can_coalesce_requests(struct nfs_page *prev, + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) +{ + size_t size; + struct file_lock_context *flctx; + + if (prev) { + if (!nfs_match_open_context(req->wb_context, prev->wb_context)) + return false; + flctx = d_inode(req->wb_context->dentry)->i_flctx; + if (flctx != NULL && + !(list_empty_careful(&flctx->flc_posix) && + list_empty_careful(&flctx->flc_flock)) && + !nfs_match_lock_context(req->wb_lock_context, + prev->wb_lock_context)) + return false; + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; + if (req->wb_page == prev->wb_page) { + if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) + return false; + } else { + if (req->wb_pgbase != 0 || + prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return false; + } + } + size = pgio->pg_ops->pg_test(pgio, prev, req); + WARN_ON_ONCE(size > req->wb_bytes); + if (size && size < req->wb_bytes) + req->wb_bytes = size; + return size > 0; +} + +/** + * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list. + * @desc: destination io descriptor + * @req: request + * + * Returns true if the request 'req' was successfully coalesced into the + * existing list of pages 'desc'. + */ +static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + struct nfs_page *prev = NULL; + + if (mirror->pg_count != 0) { + prev = nfs_list_entry(mirror->pg_list.prev); + } else { + if (desc->pg_ops->pg_init) + desc->pg_ops->pg_init(desc, req); + mirror->pg_base = req->wb_pgbase; + } + if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + nfs_list_remove_request(req); + nfs_list_add_request(req, &mirror->pg_list); + mirror->pg_count += req->wb_bytes; + return 1; +} + +/* + * Helper for nfs_pageio_add_request and nfs_pageio_complete + */ +static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + + if (!list_empty(&mirror->pg_list)) { + int error = desc->pg_ops->pg_doio(desc); + if (error < 0) + desc->pg_error = error; + else + mirror->pg_bytes_written += mirror->pg_count; + } + if (list_empty(&mirror->pg_list)) { + mirror->pg_count = 0; + mirror->pg_base = 0; + } +} + +/** + * nfs_pageio_add_request - Attempt to coalesce a request into a page list. + * @desc: destination io descriptor + * @req: request + * + * This may split a request into subrequests which are all part of the + * same page group. + * + * Returns true if the request 'req' was successfully coalesced into the + * existing list of pages 'desc'. + */ +static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + struct nfs_page *subreq; + unsigned int bytes_left = 0; + unsigned int offset, pgbase; + + nfs_page_group_lock(req, false); + + subreq = req; + bytes_left = subreq->wb_bytes; + offset = subreq->wb_offset; + pgbase = subreq->wb_pgbase; + + do { + if (!nfs_pageio_do_add_request(desc, subreq)) { + /* make sure pg_test call(s) did nothing */ + WARN_ON_ONCE(subreq->wb_bytes != bytes_left); + WARN_ON_ONCE(subreq->wb_offset != offset); + WARN_ON_ONCE(subreq->wb_pgbase != pgbase); + + nfs_page_group_unlock(req); + desc->pg_moreio = 1; + nfs_pageio_doio(desc); + if (desc->pg_error < 0) + return 0; + if (mirror->pg_recoalesce) + return 0; + /* retry add_request for this subreq */ + nfs_page_group_lock(req, false); + continue; + } + + /* check for buggy pg_test call(s) */ + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); + WARN_ON_ONCE(subreq->wb_bytes > bytes_left); + WARN_ON_ONCE(subreq->wb_bytes == 0); + + bytes_left -= subreq->wb_bytes; + offset += subreq->wb_bytes; + pgbase += subreq->wb_bytes; + + if (bytes_left) { + subreq = nfs_create_request(req->wb_context, + req->wb_page, + subreq, pgbase, bytes_left); + if (IS_ERR(subreq)) + goto err_ptr; + nfs_lock_request(subreq); + subreq->wb_offset = offset; + subreq->wb_index = req->wb_index; + } + } while (bytes_left > 0); + + nfs_page_group_unlock(req); + return 1; +err_ptr: + desc->pg_error = PTR_ERR(subreq); + nfs_page_group_unlock(req); + return 0; +} + +static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + LIST_HEAD(head); + + do { + list_splice_init(&mirror->pg_list, &head); + mirror->pg_bytes_written -= mirror->pg_count; + mirror->pg_count = 0; + mirror->pg_base = 0; + mirror->pg_recoalesce = 0; + + desc->pg_moreio = 0; + + while (!list_empty(&head)) { + struct nfs_page *req; + + req = list_first_entry(&head, struct nfs_page, wb_list); + nfs_list_remove_request(req); + if (__nfs_pageio_add_request(desc, req)) + continue; + if (desc->pg_error < 0) + return 0; + break; + } + } while (mirror->pg_recoalesce); + return 1; +} + +static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + int ret; + + do { + ret = __nfs_pageio_add_request(desc, req); + if (ret) + break; + if (desc->pg_error < 0) + break; + ret = nfs_do_recoalesce(desc); + } while (ret); + + return ret; +} + +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + u32 midx; + unsigned int pgbase, offset, bytes; + struct nfs_page *dupreq, *lastreq; + + pgbase = req->wb_pgbase; + offset = req->wb_offset; + bytes = req->wb_bytes; + + nfs_pageio_setup_mirroring(desc, req); + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + if (midx) { + nfs_page_group_lock(req, false); + + /* find the last request */ + for (lastreq = req->wb_head; + lastreq->wb_this_page != req->wb_head; + lastreq = lastreq->wb_this_page) + ; + + dupreq = nfs_create_request(req->wb_context, + req->wb_page, lastreq, pgbase, bytes); + + if (IS_ERR(dupreq)) { + nfs_page_group_unlock(req); + return 0; + } + + nfs_lock_request(dupreq); + nfs_page_group_unlock(req); + dupreq->wb_offset = offset; + dupreq->wb_index = req->wb_index; + } else + dupreq = req; + + if (nfs_pgio_has_mirroring(desc)) + desc->pg_mirror_idx = midx; + if (!nfs_pageio_add_request_mirror(desc, dupreq)) + return 0; + } + + return 1; +} + +/* + * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an + * nfs_pageio_descriptor + * @desc: pointer to io descriptor + */ +static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, + u32 mirror_idx) +{ + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx]; + u32 restore_idx = desc->pg_mirror_idx; + + if (nfs_pgio_has_mirroring(desc)) + desc->pg_mirror_idx = mirror_idx; + for (;;) { + nfs_pageio_doio(desc); + if (!mirror->pg_recoalesce) + break; + if (!nfs_do_recoalesce(desc)) + break; + } + desc->pg_mirror_idx = restore_idx; +} + +/* + * nfs_pageio_resend - Transfer requests to new descriptor and resend + * @hdr - the pgio header to move request from + * @desc - the pageio descriptor to add requests to + * + * Try to move each request (nfs_page) from @hdr to @desc then attempt + * to send them. + * + * Returns 0 on success and < 0 on error. + */ +int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + LIST_HEAD(failed); + + desc->pg_dreq = hdr->dreq; + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + + nfs_list_remove_request(req); + if (!nfs_pageio_add_request(desc, req)) + nfs_list_add_request(req, &failed); + } + nfs_pageio_complete(desc); + if (!list_empty(&failed)) { + list_move(&failed, &hdr->pages); + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(nfs_pageio_resend); + +/** + * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor + * @desc: pointer to io descriptor + */ +void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) +{ + u32 midx; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) + nfs_pageio_complete_mirror(desc, midx); + + if (desc->pg_ops->pg_cleanup) + desc->pg_ops->pg_cleanup(desc); + nfs_pageio_cleanup_mirroring(desc); +} + +/** + * nfs_pageio_cond_complete - Conditional I/O completion + * @desc: pointer to io descriptor + * @index: page index + * + * It is important to ensure that processes don't try to take locks + * on non-contiguous ranges of pages as that might deadlock. This + * function should be called before attempting to wait on a locked + * nfs_page. It will complete the I/O if the page index 'index' + * is not contiguous with the existing list of pages in 'desc'. + */ +void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) +{ + struct nfs_pgio_mirror *mirror; + struct nfs_page *prev; + u32 midx; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + if (!list_empty(&mirror->pg_list)) { + prev = nfs_list_entry(mirror->pg_list.prev); + if (index != prev->wb_index + 1) + nfs_pageio_complete_mirror(desc, midx); + } + } +} + +int __init nfs_init_nfspagecache(void) +{ + nfs_page_cachep = kmem_cache_create("nfs_page", + sizeof(struct nfs_page), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_page_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_nfspagecache(void) +{ + kmem_cache_destroy(nfs_page_cachep); +} + +static const struct rpc_call_ops nfs_pgio_common_ops = { + .rpc_call_prepare = nfs_pgio_prepare, + .rpc_call_done = nfs_pgio_result, + .rpc_release = nfs_pgio_release, +}; + +const struct nfs_pageio_ops nfs_pgio_rw_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_pgios, +}; diff --git a/kernel/fs/nfs/pnfs.c b/kernel/fs/nfs/pnfs.c new file mode 100644 index 000000000..230606243 --- /dev/null +++ b/kernel/fs/nfs/pnfs.c @@ -0,0 +1,2249 @@ +/* + * pNFS functions to call and manage layout drivers. + * + * Copyright (c) 2002 [year of first publication] + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include +#include +#include "internal.h" +#include "pnfs.h" +#include "iostat.h" +#include "nfs4trace.h" +#include "delegation.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS +#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) + +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. + */ +static DEFINE_SPINLOCK(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static LIST_HEAD(pnfs_modules_tbl); + +static int +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, + enum pnfs_iomode iomode, bool sync); + +/* Return the registered pnfs layout driver module matching given id */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver_locked(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) + if (local->id == id) + goto out; + local = NULL; +out: + dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); + return local; +} + +static struct pnfs_layoutdriver_type * +find_pnfs_driver(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + spin_lock(&pnfs_spinlock); + local = find_pnfs_driver_locked(id); + if (local != NULL && !try_module_get(local->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + local = NULL; + } + spin_unlock(&pnfs_spinlock); + return local; +} + +void +unset_pnfs_layoutdriver(struct nfs_server *nfss) +{ + if (nfss->pnfs_curr_ld) { + if (nfss->pnfs_curr_ld->clear_layoutdriver) + nfss->pnfs_curr_ld->clear_layoutdriver(nfss); + /* Decrement the MDS count. Purge the deviceid cache if zero */ + if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count)) + nfs4_deviceid_purge_client(nfss->nfs_client); + module_put(nfss->pnfs_curr_ld->owner); + } + nfss->pnfs_curr_ld = NULL; +} + +/* + * Try to set the server's pnfs module to the pnfs layout type specified by id. + * Currently only one pNFS layout driver per filesystem is supported. + * + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + */ +void +set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, + u32 id) +{ + struct pnfs_layoutdriver_type *ld_type = NULL; + + if (id == 0) + goto out_no_driver; + if (!(server->nfs_client->cl_exchange_flags & + (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { + printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", + __func__, id, server->nfs_client->cl_exchange_flags); + goto out_no_driver; + } + ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + ld_type = find_pnfs_driver(id); + if (!ld_type) { + dprintk("%s: No pNFS module found for %u.\n", + __func__, id); + goto out_no_driver; + } + } + server->pnfs_curr_ld = ld_type; + if (ld_type->set_layoutdriver + && ld_type->set_layoutdriver(server, mntfh)) { + printk(KERN_ERR "NFS: %s: Error initializing pNFS layout " + "driver %u.\n", __func__, id); + module_put(ld_type->owner); + goto out_no_driver; + } + /* Bump the MDS count */ + atomic_inc(&server->nfs_client->cl_mds_count); + + dprintk("%s: pNFS module for %u set\n", __func__, id); + return; + +out_no_driver: + dprintk("%s: Using NFSv4 I/O\n", __func__); + server->pnfs_curr_ld = NULL; +} + +int +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + int status = -EINVAL; + struct pnfs_layoutdriver_type *tmp; + + if (ld_type->id == 0) { + printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__); + return status; + } + if (!ld_type->alloc_lseg || !ld_type->free_lseg) { + printk(KERN_ERR "NFS: %s Layout driver must provide " + "alloc_lseg and free_lseg.\n", __func__); + return status; + } + + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { + list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); + status = 0; + dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, + ld_type->name); + } else { + printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n", + __func__, ld_type->id); + } + spin_unlock(&pnfs_spinlock); + + return status; +} +EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); + +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&ld_type->pnfs_tblid); + spin_unlock(&pnfs_spinlock); +} +EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); + +/* + * pNFS client layout cache + */ + +/* Need to hold i_lock if caller does not already hold reference */ +void +pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) +{ + atomic_inc(&lo->plh_refcount); +} + +static struct pnfs_layout_hdr * +pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + return ld->alloc_layout_hdr(ino, gfp_flags); +} + +static void +pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct nfs_server *server = NFS_SERVER(lo->plh_inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + + if (!list_empty(&lo->plh_layouts)) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } + put_rpccred(lo->plh_lc_cred); + return ld->free_layout_hdr(lo); +} + +static void +pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct nfs_inode *nfsi = NFS_I(lo->plh_inode); + dprintk("%s: freeing layout cache %p\n", __func__, lo); + nfsi->layout = NULL; + /* Reset MDS Threshold I/O counters */ + nfsi->write_io = 0; + nfsi->read_io = 0; +} + +void +pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = lo->plh_inode; + + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + if (!list_empty(&lo->plh_segs)) + WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); + pnfs_detach_layout_hdr(lo); + spin_unlock(&inode->i_lock); + pnfs_free_layout_hdr(lo); + } +} + +static int +pnfs_iomode_to_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +static void +pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) +{ + lo->plh_retry_timestamp = jiffies; + if (!test_and_set_bit(fail_bit, &lo->plh_flags)) + atomic_inc(&lo->plh_refcount); +} + +static void +pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) +{ + if (test_and_clear_bit(fail_bit, &lo->plh_flags)) + atomic_dec(&lo->plh_refcount); +} + +static void +pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ + struct inode *inode = lo->plh_inode; + struct pnfs_layout_range range = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(head); + + spin_lock(&inode->i_lock); + pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + pnfs_mark_matching_lsegs_invalid(lo, &head, &range); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, + iomode == IOMODE_RW ? "RW" : "READ"); +} + +static bool +pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ + unsigned long start, end; + int fail_bit = pnfs_iomode_to_fail_bit(iomode); + + if (test_bit(fail_bit, &lo->plh_flags) == 0) + return false; + end = jiffies; + start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; + if (!time_in_range(lo->plh_retry_timestamp, start, end)) { + /* It is time to retry the failed layoutgets */ + pnfs_layout_clear_fail_bit(lo, fail_bit); + return false; + } + return true; +} + +static void +init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +{ + INIT_LIST_HEAD(&lseg->pls_list); + INIT_LIST_HEAD(&lseg->pls_lc_list); + atomic_set(&lseg->pls_refcount, 1); + smp_mb(); + set_bit(NFS_LSEG_VALID, &lseg->pls_flags); + lseg->pls_layout = lo; +} + +static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct inode *ino = lseg->pls_layout->plh_inode; + + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); +} + +static void +pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = lo->plh_inode; + + WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del_init(&lseg->pls_list); + /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ + atomic_dec(&lo->plh_refcount); + if (list_empty(&lo->plh_segs)) + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +/* Return true if layoutreturn is needed */ +static bool +pnfs_layout_need_return(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *s; + + if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + return false; + + list_for_each_entry(s, &lo->plh_segs, pls_list) + if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) + return false; + + return true; +} + +static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, + struct pnfs_layout_hdr *lo, struct inode *inode) +{ + lo = lseg->pls_layout; + inode = lo->plh_inode; + + spin_lock(&inode->i_lock); + if (pnfs_layout_need_return(lo, lseg)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode; + + stateid = lo->plh_stateid; + iomode = lo->plh_return_iomode; + /* decreased in pnfs_send_layoutreturn() */ + lo->plh_block_lgets++; + lo->plh_return_iomode = 0; + spin_unlock(&inode->i_lock); + pnfs_get_layout_hdr(lo); + + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, stateid, iomode, false); + } else + spin_unlock(&inode->i_lock); +} + +void +pnfs_put_lseg(struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + + /* Handle the case where refcount != 1 */ + if (atomic_add_unless(&lseg->pls_refcount, -1, 1)) + return; + + lo = lseg->pls_layout; + inode = lo->plh_inode; + /* Do we need a layoutreturn? */ + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); + + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + pnfs_get_layout_hdr(lo); + pnfs_layout_remove_lseg(lo, lseg); + spin_unlock(&inode->i_lock); + pnfs_free_lseg(lseg); + pnfs_put_layout_hdr(lo); + } +} +EXPORT_SYMBOL_GPL(pnfs_put_lseg); + +static void pnfs_free_lseg_async_work(struct work_struct *work) +{ + struct pnfs_layout_segment *lseg; + struct pnfs_layout_hdr *lo; + + lseg = container_of(work, struct pnfs_layout_segment, pls_work); + lo = lseg->pls_layout; + + pnfs_free_lseg(lseg); + pnfs_put_layout_hdr(lo); +} + +static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg) +{ + INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work); + schedule_work(&lseg->pls_work); +} + +void +pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) +{ + if (!lseg) + return; + + assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + struct pnfs_layout_hdr *lo = lseg->pls_layout; + pnfs_get_layout_hdr(lo); + pnfs_layout_remove_lseg(lo, lseg); + pnfs_free_lseg_async(lseg); + } +} +EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); + +static u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* + * is l2 fully contained in l1? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static bool +pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (start1 <= start2) && (end1 >= end2); +} + +/* + * is l1 and l2 intersecting? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (end1 == NFS4_MAX_UINT64 || end1 > start2) && + (end2 == NFS4_MAX_UINT64 || end2 > start1); +} + +static bool +should_free_lseg(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) +{ + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + pnfs_lseg_range_intersecting(lseg_range, recall_range); +} + +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + if (!atomic_dec_and_test(&lseg->pls_refcount)) + return false; + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); + list_add(&lseg->pls_list, tmp_list); + return true; +} + +/* Returns 1 if lseg is removed from list, 0 otherwise */ +static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + int rv = 0; + + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + /* Remove the reference keeping the lseg in the + * list. It will now be removed when all + * outstanding io is finished. + */ + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount)); + if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) + rv = 1; + } + return rv; +} + +/* Returns count of number of matching invalid lsegs remaining in list + * after call. + */ +int +pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range) +{ + struct pnfs_layout_segment *lseg, *next; + int invalid = 0, removed = 0; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + if (list_empty(&lo->plh_segs)) + return 0; + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (!recall_range || + should_free_lseg(&lseg->pls_range, recall_range)) { + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, lseg->pls_range.offset, + lseg->pls_range.length); + invalid++; + removed += mark_lseg_invalid(lseg, tmp_list); + } + dprintk("%s:Return %i\n", __func__, invalid - removed); + return invalid - removed; +} + +/* note free_me must contain lsegs from a single layout_hdr */ +void +pnfs_free_lseg_list(struct list_head *free_me) +{ + struct pnfs_layout_segment *lseg, *tmp; + + if (list_empty(free_me)) + return; + + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { + list_del(&lseg->pls_list); + pnfs_free_lseg(lseg); + } +} + +void +pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_get_layout_hdr(lo); + pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); + pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); + pnfs_clear_retry_layoutget(lo); + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); + pnfs_put_layout_hdr(lo); + } else + spin_unlock(&nfsi->vfs_inode.i_lock); +} +EXPORT_SYMBOL_GPL(pnfs_destroy_layout); + +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo; + bool ret = false; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { + pnfs_get_layout_hdr(lo); + list_add(&lo->plh_bulk_destroy, layout_list); + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, + struct nfs_server *server, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo, *next; + struct inode *inode; + + list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + inode = igrab(lo->plh_inode); + if (inode == NULL) + continue; + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + return -EAGAIN; + } + return 0; +} + +static int +pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, + bool is_bulk_recall) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(lseg_list); + int ret = 0; + + while (!list_empty(layout_list)) { + lo = list_entry(layout_list->next, struct pnfs_layout_hdr, + plh_bulk_destroy); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + inode = lo->plh_inode; + + pnfs_layoutcommit_inode(inode, false); + + spin_lock(&inode->i_lock); + list_del_init(&lo->plh_bulk_destroy); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + ret = -EAGAIN; + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&lseg_list); + pnfs_put_layout_hdr(lo); + iput(inode); + } + return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + + pnfs_destroy_layouts_byclid(clp, false); +} + +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +/* update lo->plh_stateid with new if is more recent */ +void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + bool update_barrier) +{ + u32 oldseq, newseq, new_barrier; + int empty = list_empty(&lo->plh_segs); + + oldseq = be32_to_cpu(lo->plh_stateid.seqid); + newseq = be32_to_cpu(new->seqid); + if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { + nfs4_stateid_copy(&lo->plh_stateid, new); + if (update_barrier) { + new_barrier = be32_to_cpu(new->seqid); + } else { + /* Because of wraparound, we want to keep the barrier + * "close" to the current seqids. + */ + new_barrier = newseq - atomic_read(&lo->plh_outstanding); + } + if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + lo->plh_barrier = new_barrier; + } +} + +static bool +pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid) +{ + u32 seqid = be32_to_cpu(stateid->seqid); + + return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); +} + +static bool +pnfs_layout_returning(const struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && + (lo->plh_return_iomode == IOMODE_ANY || + lo->plh_return_iomode == range->iomode); +} + +/* lget is set to 1 if called from inside send_layoutget call chain */ +static bool +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range, int lget) +{ + return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + (list_empty(&lo->plh_segs) && + (atomic_read(&lo->plh_outstanding) > lget)) || + pnfs_layout_returning(lo, range); +} + +int +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range, + struct nfs4_state *open_state) +{ + int status = 0; + + dprintk("--> %s\n", __func__); + spin_lock(&lo->plh_inode->i_lock); + if (pnfs_layoutgets_blocked(lo, range, 1)) { + status = -EAGAIN; + } else if (!nfs4_valid_open_stateid(open_state)) { + status = -EBADF; + } else if (list_empty(&lo->plh_segs) || + test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + int seq; + + do { + seq = read_seqbegin(&open_state->seqlock); + nfs4_stateid_copy(dst, &open_state->stateid); + } while (read_seqretry(&open_state->seqlock, seq)); + } else + nfs4_stateid_copy(dst, &lo->plh_stateid); + spin_unlock(&lo->plh_inode->i_lock); + dprintk("<-- %s\n", __func__); + return status; +} + +/* +* Get layout from server. +* for now, assume that whole file layouts are requested. +* arg->offset: 0 +* arg->length: all ones +*/ +static struct pnfs_layout_segment * +send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, + struct pnfs_layout_range *range, + gfp_t gfp_flags) +{ + struct inode *ino = lo->plh_inode; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + struct pnfs_layout_segment *lseg; + + dprintk("--> %s\n", __func__); + + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return NULL; + + lgp->args.minlength = PAGE_CACHE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range = *range; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; + + /* Synchronously retrieve layout information from server and + * store in lseg. + */ + lseg = nfs4_proc_layoutget(lgp, gfp_flags); + if (IS_ERR(lseg)) { + switch (PTR_ERR(lseg)) { + case -ENOMEM: + case -ERESTARTSYS: + break; + default: + /* remember that LAYOUTGET failed and suspend trying */ + pnfs_layout_io_set_failed(lo, range->iomode); + } + return NULL; + } else + pnfs_layout_clear_fail_bit(lo, + pnfs_iomode_to_fail_bit(range->iomode)); + + return lseg; +} + +static void pnfs_clear_layoutcommit(struct inode *inode, + struct list_head *head) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg, *tmp; + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return; + list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { + if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + continue; + pnfs_lseg_dec_and_remove_zero(lseg, head); + } +} + +void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +{ + clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); +} + +static int +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, + enum pnfs_iomode iomode, bool sync) +{ + struct inode *ino = lo->plh_inode; + struct nfs4_layoutreturn *lrp; + int status = 0; + + lrp = kzalloc(sizeof(*lrp), GFP_NOFS); + if (unlikely(lrp == NULL)) { + status = -ENOMEM; + spin_lock(&ino->i_lock); + lo->plh_block_lgets--; + pnfs_clear_layoutreturn_waitbit(lo); + rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); + goto out; + } + + lrp->args.stateid = stateid; + lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; + lrp->args.inode = ino; + lrp->args.range.iomode = iomode; + lrp->args.range.offset = 0; + lrp->args.range.length = NFS4_MAX_UINT64; + lrp->args.layout = lo; + lrp->clp = NFS_SERVER(ino)->nfs_client; + lrp->cred = lo->plh_lc_cred; + + status = nfs4_proc_layoutreturn(lrp, sync); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} + +/* + * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr + * when the layout segment list is empty. + * + * Note that a pnfs_layout_hdr can exist with an empty layout segment + * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the + * deviceid is marked invalid. + */ +int +_pnfs_return_layout(struct inode *ino) +{ + struct pnfs_layout_hdr *lo = NULL; + struct nfs_inode *nfsi = NFS_I(ino); + LIST_HEAD(tmp_list); + nfs4_stateid stateid; + int status = 0, empty; + + dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); + + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (!lo) { + spin_unlock(&ino->i_lock); + dprintk("NFS: %s no layout to return\n", __func__); + goto out; + } + stateid = nfsi->layout->plh_stateid; + /* Reference matched in nfs4_layoutreturn_release */ + pnfs_get_layout_hdr(lo); + empty = list_empty(&lo->plh_segs); + pnfs_clear_layoutcommit(ino, &tmp_list); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); + } + + /* Don't send a LAYOUTRETURN if list was initially empty */ + if (empty) { + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); + dprintk("NFS: %s no layout segments to return\n", __func__); + goto out; + } + + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + lo->plh_block_lgets++; + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + + status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} +EXPORT_SYMBOL_GPL(_pnfs_return_layout); + +int +pnfs_commit_and_return_layout(struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + int ret; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo == NULL) { + spin_unlock(&inode->i_lock); + return 0; + } + pnfs_get_layout_hdr(lo); + /* Block new layoutgets and read/write to ds */ + lo->plh_block_lgets++; + spin_unlock(&inode->i_lock); + filemap_fdatawait(inode->i_mapping); + ret = pnfs_layoutcommit_inode(inode, true); + if (ret == 0) + ret = _pnfs_return_layout(inode); + spin_lock(&inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&inode->i_lock); + pnfs_put_layout_hdr(lo); + return ret; +} + +bool pnfs_roc(struct inode *ino) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_open_context *ctx; + struct nfs4_state *state; + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg, *tmp; + nfs4_stateid stateid; + LIST_HEAD(tmp_list); + bool found = false, layoutreturn = false; + + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + goto out_noroc; + + /* Don't return layout if we hold a delegation */ + if (nfs4_check_delegation(ino, FMODE_READ)) + goto out_noroc; + + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + /* Don't return layout if there is open file state */ + if (state != NULL && state->state != 0) + goto out_noroc; + } + + pnfs_clear_retry_layoutget(lo); + list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + mark_lseg_invalid(lseg, &tmp_list); + found = true; + } + if (!found) + goto out_noroc; + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); + return true; + +out_noroc: + if (lo) { + stateid = lo->plh_stateid; + layoutreturn = + test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags); + if (layoutreturn) { + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); + } + } + spin_unlock(&ino->i_lock); + if (layoutreturn) { + pnfs_layoutcommit_inode(ino, true); + pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + } + return false; +} + +void pnfs_roc_release(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + lo->plh_block_lgets--; + if (atomic_dec_and_test(&lo->plh_refcount)) { + pnfs_detach_layout_hdr(lo); + spin_unlock(&ino->i_lock); + pnfs_free_layout_hdr(lo); + } else + spin_unlock(&ino->i_lock); +} + +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) + lo->plh_barrier = barrier; + spin_unlock(&ino->i_lock); +} + +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg; + nfs4_stateid stateid; + u32 current_seqid; + bool found = false, layoutreturn = false; + + spin_lock(&ino->i_lock); + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + found = true; + goto out; + } + lo = nfsi->layout; + current_seqid = be32_to_cpu(lo->plh_stateid.seqid); + + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); +out: + if (!found) { + stateid = lo->plh_stateid; + layoutreturn = + test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags); + if (layoutreturn) { + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); + } + } + spin_unlock(&ino->i_lock); + if (layoutreturn) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); + } + return found; +} + +/* + * Compare two layout segments for sorting into layout cache. + * We want to preferentially return RW over RO layouts, so ensure those + * are seen first. + */ +static s64 +pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + s64 d; + + /* high offset > low offset */ + d = l1->offset - l2->offset; + if (d) + return d; + + /* short length > long length */ + d = l2->length - l1->length; + if (d) + return d; + + /* read > read/write */ + return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); +} + +static void +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *lp; + + dprintk("%s:Begin\n", __func__); + + list_for_each_entry(lp, &lo->plh_segs, pls_list) { + if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) + continue; + list_add_tail(&lseg->pls_list, &lp->pls_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length, + lp, lp->pls_range.iomode, lp->pls_range.offset, + lp->pls_range.length); + goto out; + } + list_add_tail(&lseg->pls_list, &lo->plh_segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length); +out: + pnfs_get_layout_hdr(lo); + + dprintk("%s:Return\n", __func__); +} + +static struct pnfs_layout_hdr * +alloc_init_layout_hdr(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) +{ + struct pnfs_layout_hdr *lo; + + lo = pnfs_alloc_layout_hdr(ino, gfp_flags); + if (!lo) + return NULL; + atomic_set(&lo->plh_refcount, 1); + INIT_LIST_HEAD(&lo->plh_layouts); + INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_bulk_destroy); + lo->plh_inode = ino; + lo->plh_lc_cred = get_rpccred(ctx->cred); + return lo; +} + +static struct pnfs_layout_hdr * +pnfs_find_alloc_layout(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *new = NULL; + + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + if (nfsi->layout != NULL) + goto out_existing; + spin_unlock(&ino->i_lock); + new = alloc_init_layout_hdr(ino, ctx, gfp_flags); + spin_lock(&ino->i_lock); + + if (likely(nfsi->layout == NULL)) { /* Won the race? */ + nfsi->layout = new; + return new; + } else if (new != NULL) + pnfs_free_layout_hdr(new); +out_existing: + pnfs_get_layout_hdr(nfsi->layout); + return nfsi->layout; +} + +/* + * iomode matching rules: + * iomode lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW true + */ +static bool +pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, + const struct pnfs_layout_range *range) +{ + struct pnfs_layout_range range1; + + if ((range->iomode == IOMODE_RW && + ls_range->iomode != IOMODE_RW) || + !pnfs_lseg_range_intersecting(ls_range, range)) + return 0; + + /* range1 covers only the first byte in the range */ + range1 = *range; + range1.length = 1; + return pnfs_lseg_range_contained(ls_range, &range1); +} + +/* + * lookup range in layout + */ +static struct pnfs_layout_segment * +pnfs_find_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", __func__); + + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && + !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && + pnfs_lseg_range_match(&lseg->pls_range, range)) { + ret = pnfs_get_lseg(lseg); + break; + } + if (lseg->pls_range.offset > range->offset) + break; + } + + dprintk("%s:Return lseg %p ref %d\n", + __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); + return ret; +} + +/* + * Use mdsthreshold hints set at each OPEN to determine if I/O should go + * to the MDS or over pNFS + * + * The nfs_inode read_io and write_io fields are cumulative counters reset + * when there are no layout segments. Note that in pnfs_update_layout iomode + * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a + * WRITE request. + * + * A return of true means use MDS I/O. + * + * From rfc 5661: + * If a file's size is smaller than the file size threshold, data accesses + * SHOULD be sent to the metadata server. If an I/O request has a length that + * is below the I/O size threshold, the I/O SHOULD be sent to the metadata + * server. If both file size and I/O size are provided, the client SHOULD + * reach or exceed both thresholds before sending its read or write + * requests to the data server. + */ +static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, + struct inode *ino, int iomode) +{ + struct nfs4_threshold *t = ctx->mdsthreshold; + struct nfs_inode *nfsi = NFS_I(ino); + loff_t fsize = i_size_read(ino); + bool size = false, size_set = false, io = false, io_set = false, ret = false; + + if (t == NULL) + return ret; + + dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", + __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz); + + switch (iomode) { + case IOMODE_READ: + if (t->bm & THRESHOLD_RD) { + dprintk("%s fsize %llu\n", __func__, fsize); + size_set = true; + if (fsize < t->rd_sz) + size = true; + } + if (t->bm & THRESHOLD_RD_IO) { + dprintk("%s nfsi->read_io %llu\n", __func__, + nfsi->read_io); + io_set = true; + if (nfsi->read_io < t->rd_io_sz) + io = true; + } + break; + case IOMODE_RW: + if (t->bm & THRESHOLD_WR) { + dprintk("%s fsize %llu\n", __func__, fsize); + size_set = true; + if (fsize < t->wr_sz) + size = true; + } + if (t->bm & THRESHOLD_WR_IO) { + dprintk("%s nfsi->write_io %llu\n", __func__, + nfsi->write_io); + io_set = true; + if (nfsi->write_io < t->wr_io_sz) + io = true; + } + break; + } + if (size_set && io_set) { + if (size && io) + ret = true; + } else if (size || io) + ret = true; + + dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret); + return ret; +} + +/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ +static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) +{ + if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) + return 1; + return nfs_wait_bit_killable(key); +} + +static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + /* + * send layoutcommit as it can hold up layoutreturn due to lseg + * reference + */ + pnfs_layoutcommit_inode(lo->plh_inode, false); + return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, + pnfs_layoutget_retry_bit_wait, + TASK_UNINTERRUPTIBLE); +} + +static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) +{ + unsigned long *bitlock = &lo->plh_flags; + + clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); +} + +/* + * Layout segment is retreived from the server if not cached. + * The appropriate layout segment is referenced and returned to the caller. + */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags) +{ + struct pnfs_layout_range arg = { + .iomode = iomode, + .offset = pos, + .length = count, + }; + unsigned pg_offset; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs_client *clp = server->nfs_client; + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; + bool first; + + if (!pnfs_enabled_sb(NFS_SERVER(ino))) + goto out; + + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) + goto out; + +lookup_again: + first = false; + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); + if (lo == NULL) { + spin_unlock(&ino->i_lock); + goto out; + } + + /* Do we even need to bother with this? */ + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s matches recall, use MDS\n", __func__); + goto out_unlock; + } + + /* if LAYOUTGET already failed once we don't try again */ + if (pnfs_layout_io_test_failed(lo, iomode) && + !pnfs_should_retry_layoutget(lo)) + goto out_unlock; + + first = list_empty(&lo->plh_segs); + if (first) { + /* The first layoutget for the file. Need to serialize per + * RFC 5661 Errata 3208. + */ + if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, + &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, + TASK_UNINTERRUPTIBLE); + pnfs_put_layout_hdr(lo); + goto lookup_again; + } + } else { + /* Check to see if the layout for the given range + * already exists + */ + lseg = pnfs_find_lseg(lo, &arg); + if (lseg) + goto out_unlock; + } + + /* + * Because we free lsegs before sending LAYOUTRETURN, we need to wait + * for LAYOUTRETURN even if first is true. + */ + if (!lseg && pnfs_should_retry_layoutget(lo) && + test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + dprintk("%s wait for layoutreturn\n", __func__); + if (pnfs_prepare_to_retry_layoutget(lo)) { + if (first) + pnfs_clear_first_layoutget(lo); + pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); + goto lookup_again; + } + goto out_put_layout_hdr; + } + + if (pnfs_layoutgets_blocked(lo, &arg, 0)) + goto out_unlock; + atomic_inc(&lo->plh_outstanding); + spin_unlock(&ino->i_lock); + + if (list_empty(&lo->plh_layouts)) { + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. + */ + spin_lock(&clp->cl_lock); + if (list_empty(&lo->plh_layouts)) + list_add_tail(&lo->plh_layouts, &server->layouts); + spin_unlock(&clp->cl_lock); + } + + pg_offset = arg.offset & ~PAGE_CACHE_MASK; + if (pg_offset) { + arg.offset -= pg_offset; + arg.length += pg_offset; + } + if (arg.length != NFS4_MAX_UINT64) + arg.length = PAGE_CACHE_ALIGN(arg.length); + + lseg = send_layoutget(lo, ctx, &arg, gfp_flags); + pnfs_clear_retry_layoutget(lo); + atomic_dec(&lo->plh_outstanding); +out_put_layout_hdr: + if (first) + pnfs_clear_first_layoutget(lo); + pnfs_put_layout_hdr(lo); +out: + dprintk("%s: inode %s/%llu pNFS layout segment %s for " + "(%s, offset: %llu, length: %llu)\n", + __func__, ino->i_sb->s_id, + (unsigned long long)NFS_FILEID(ino), + lseg == NULL ? "not found" : "found", + iomode==IOMODE_RW ? "read/write" : "read-only", + (unsigned long long)pos, + (unsigned long long)count); + return lseg; +out_unlock: + spin_unlock(&ino->i_lock); + goto out_put_layout_hdr; +} +EXPORT_SYMBOL_GPL(pnfs_update_layout); + +struct pnfs_layout_segment * +pnfs_layout_process(struct nfs4_layoutget *lgp) +{ + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; + struct inode *ino = lo->plh_inode; + LIST_HEAD(free_me); + int status = 0; + + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); + if (!lseg || IS_ERR(lseg)) { + if (!lseg) + status = -ENOMEM; + else + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); + goto out; + } + + init_lseg(lo, lseg); + lseg->pls_range = res->range; + + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s forget reply due to recall\n", __func__); + goto out_forget_reply; + } + + if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) { + dprintk("%s forget reply due to state\n", __func__); + goto out_forget_reply; + } + + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + /* existing state ID, make sure the sequence number matches. */ + if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { + dprintk("%s forget reply due to sequence\n", __func__); + goto out_forget_reply; + } + pnfs_set_layout_stateid(lo, &res->stateid, false); + } else { + /* + * We got an entirely new state ID. Mark all segments for the + * inode invalid, and don't bother validating the stateid + * sequence number. + */ + pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + + nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); + lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + } + + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + + pnfs_get_lseg(lseg); + pnfs_layout_insert_lseg(lo, lseg); + + if (res->return_on_close) { + set_bit(NFS_LSEG_ROC, &lseg->pls_flags); + set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); + } + + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me); + return lseg; +out: + return ERR_PTR(status); + +out_forget_reply: + spin_unlock(&ino->i_lock); + lseg->pls_layout = lo; + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + goto out; +} + +static void +pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *return_range) +{ + struct pnfs_layout_segment *lseg, *next; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + if (list_empty(&lo->plh_segs)) + return; + + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (should_free_lseg(&lseg->pls_range, return_range)) { + dprintk("%s: marking lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, + lseg->pls_range.length); + set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + mark_lseg_invalid(lseg, tmp_list); + } +} + +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; + int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode); + struct pnfs_layout_range range = { + .iomode = lseg->pls_range.iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(free_me); + + spin_lock(&inode->i_lock); + /* set failure bit so that pnfs path will be retried later */ + pnfs_layout_set_fail_bit(lo, iomode); + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); + if (lo->plh_return_iomode == 0) + lo->plh_return_iomode = range.iomode; + else if (lo->plh_return_iomode != range.iomode) + lo->plh_return_iomode = IOMODE_ANY; + /* + * mark all matching lsegs so that we are sure to have no live + * segments at hand when sending layoutreturn. See pnfs_put_lseg() + * for how it works. + */ + pnfs_mark_matching_lsegs_return(lo, &free_me, &range); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&free_me); +} +EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); + +void +pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + u64 rd_size = req->wb_bytes; + + if (pgio->pg_lseg == NULL) { + if (pgio->pg_dreq == NULL) + rd_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + rd_size, + IOMODE_READ, + GFP_KERNEL); + } + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_read_mds(pgio); + +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); + +void +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, u64 wb_size) +{ + if (pgio->pg_lseg == NULL) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + wb_size, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_write_mds(pgio); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); + +void +pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc) +{ + if (desc->pg_lseg) { + pnfs_put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + } +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup); + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +size_t +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) +{ + unsigned int size; + u64 seg_end, req_start, seg_left; + + size = nfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + + /* + * 'size' contains the number of bytes left in the current page (up + * to the original size asked for in @req->wb_bytes). + * + * Calculate how many bytes are left in the layout segment + * and if there are less bytes than 'size', return that instead. + * + * Please also note that 'end_offset' is actually the offset of the + * first byte that lies outside the pnfs_layout_range. FIXME? + * + */ + if (pgio->pg_lseg) { + seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); + req_start = req_offset(req); + WARN_ON_ONCE(req_start >= seg_end); + /* start of request is past the last byte of this segment */ + if (req_start >= seg_end) { + /* reference the new lseg */ + if (pgio->pg_ops->pg_cleanup) + pgio->pg_ops->pg_cleanup(pgio); + if (pgio->pg_ops->pg_init) + pgio->pg_ops->pg_init(pgio, req); + return 0; + } + + /* adjust 'size' iff there are fewer bytes left in the + * segment than what nfs_generic_pg_test returned */ + seg_left = seg_end - req_start; + if (seg_left < size) + size = (unsigned int)seg_left; + } + + return size; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); + +int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) +{ + struct nfs_pageio_descriptor pgio; + + /* Resend all requests through the MDS */ + nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, + hdr->completion_ops); + return nfs_pageio_resend(&pgio, hdr); +} +EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); + +static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr) +{ + + dprintk("pnfs write error = %d\n", hdr->pnfs_error); + if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + pnfs_return_layout(hdr->inode); + } + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) + hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr); +} + +/* + * Called by non rpc-based layout drivers + */ +void pnfs_ld_write_done(struct nfs_pgio_header *hdr) +{ + trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); + if (!hdr->pnfs_error) { + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); + hdr->mds_ops->rpc_call_done(&hdr->task, hdr); + } else + pnfs_ld_handle_write_error(hdr); + hdr->mds_ops->rpc_release(hdr); +} +EXPORT_SYMBOL_GPL(pnfs_ld_write_done); + +static void +pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + list_splice_tail_init(&hdr->pages, &mirror->pg_list); + nfs_pageio_reset_write_mds(desc); + mirror->pg_recoalesce = 1; + } + nfs_pgio_data_destroy(hdr); +} + +static enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg, + int how) +{ + struct inode *inode = hdr->inode; + enum pnfs_try_status trypnfs; + struct nfs_server *nfss = NFS_SERVER(inode); + + hdr->mds_ops = call_ops; + + dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + inode->i_ino, hdr->args.count, hdr->args.offset, how); + trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how); + if (trypnfs != PNFS_NOT_ATTEMPTED) + nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +static void +pnfs_do_write(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, int how) +{ + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; + + trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, hdr); +} + +static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) +{ + pnfs_put_lseg(hdr->lseg); + nfs_pgio_header_free(hdr); +} + +int +pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + struct nfs_pgio_header *hdr; + int ret; + + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); + if (!hdr) { + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + return -ENOMEM; + } + nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); + + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); + ret = nfs_generic_pgio(desc, hdr); + if (!ret) + pnfs_do_write(desc, hdr, desc->pg_ioflags); + + return ret; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); + +int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr) +{ + struct nfs_pageio_descriptor pgio; + + /* Resend all requests through the MDS */ + nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops); + return nfs_pageio_resend(&pgio, hdr); +} +EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); + +static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) +{ + dprintk("pnfs read error = %d\n", hdr->pnfs_error); + if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + pnfs_return_layout(hdr->inode); + } + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) + hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr); +} + +/* + * Called by non rpc-based layout drivers + */ +void pnfs_ld_read_done(struct nfs_pgio_header *hdr) +{ + trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); + if (likely(!hdr->pnfs_error)) { + __nfs4_read_done_cb(hdr); + hdr->mds_ops->rpc_call_done(&hdr->task, hdr); + } else + pnfs_ld_handle_read_error(hdr); + hdr->mds_ops->rpc_release(hdr); +} +EXPORT_SYMBOL_GPL(pnfs_ld_read_done); + +static void +pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + list_splice_tail_init(&hdr->pages, &mirror->pg_list); + nfs_pageio_reset_read_mds(desc); + mirror->pg_recoalesce = 1; + } + nfs_pgio_data_destroy(hdr); +} + +/* + * Call the appropriate parallel I/O subsystem read function. + */ +static enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = hdr->inode; + struct nfs_server *nfss = NFS_SERVER(inode); + enum pnfs_try_status trypnfs; + + hdr->mds_ops = call_ops; + + dprintk("%s: Reading ino:%lu %u@%llu\n", + __func__, inode->i_ino, hdr->args.count, hdr->args.offset); + + trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr); + if (trypnfs != PNFS_NOT_ATTEMPTED) + nfs_inc_stats(inode, NFSIOS_PNFS_READ); + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +/* Resend all requests through pnfs. */ +int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +{ + struct nfs_pageio_descriptor pgio; + + nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); + return nfs_pageio_resend(&pgio, hdr); +} +EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); + +static void +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) +{ + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; + int err = 0; + + trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); + if (trypnfs == PNFS_TRY_AGAIN) + err = pnfs_read_resend_pnfs(hdr); + if (trypnfs == PNFS_NOT_ATTEMPTED || err) + pnfs_read_through_mds(desc, hdr); +} + +static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) +{ + pnfs_put_lseg(hdr->lseg); + nfs_pgio_header_free(hdr); +} + +int +pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + struct nfs_pgio_header *hdr; + int ret; + + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); + if (!hdr) { + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + return -ENOMEM; + } + nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); + ret = nfs_generic_pgio(desc, hdr); + if (!ret) + pnfs_do_read(desc, hdr); + return ret; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); + +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ + unsigned long *bitlock = &NFS_I(inode)->flags; + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + +/* + * There can be multiple RW segments. + */ +static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { + if (lseg->pls_range.iomode == IOMODE_RW && + test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + list_add(&lseg->pls_lc_list, listp); + } +} + +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg, *tmp; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + pnfs_put_lseg(lseg); + } + + pnfs_clear_layoutcommitting(inode); +} + +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); +} +EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); + +void +pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, + loff_t end_pos) +{ + struct nfs_inode *nfsi = NFS_I(inode); + bool mark_as_dirty = false; + + spin_lock(&inode->i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + nfsi->layout->plh_lwb = end_pos; + mark_as_dirty = true; + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, inode->i_ino); + } else if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + pnfs_get_lseg(lseg); + } + spin_unlock(&inode->i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, lseg, nfsi->layout->plh_lwb); + + /* if pnfs_layoutcommit_inode() runs between inode locks, the next one + * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ + if (mark_as_dirty) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); + +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) +{ + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + if (nfss->pnfs_curr_ld->cleanup_layoutcommit) + nfss->pnfs_curr_ld->cleanup_layoutcommit(data); + pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); +} + +/* + * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and + * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough + * data to disk to allow the server to recover the data if it crashes. + * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag + * is off, and a COMMIT is sent to a data server, or + * if WRITEs to a data server return NFS_DATA_SYNC. + */ +int +pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct nfs4_layoutcommit_data *data; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t end_pos; + int status; + + if (!pnfs_layoutcommit_outstanding(inode)) + return 0; + + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + + status = -EAGAIN; + if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { + if (!sync) + goto out; + status = wait_on_bit_lock_action(&nfsi->flags, + NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (status) + goto out; + } + + status = -ENOMEM; + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + goto clear_layoutcommitting; + + status = 0; + spin_lock(&inode->i_lock); + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_unlock; + + INIT_LIST_HEAD(&data->lseg_list); + pnfs_list_write_lseg(inode, &data->lseg_list); + + end_pos = nfsi->layout->plh_lwb; + + nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); + spin_unlock(&inode->i_lock); + + data->args.inode = inode; + data->cred = get_rpccred(nfsi->layout->plh_lc_cred); + nfs_fattr_init(&data->fattr); + data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + data->res.fattr = &data->fattr; + data->args.lastbytewritten = end_pos - 1; + data->res.server = NFS_SERVER(inode); + + if (ld->prepare_layoutcommit) { + status = ld->prepare_layoutcommit(&data->args); + if (status) { + spin_lock(&inode->i_lock); + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + spin_unlock(&inode->i_lock); + put_rpccred(data->cred); + goto clear_layoutcommitting; + } + } + + + status = nfs4_proc_layoutcommit(data, sync); +out: + if (status) + mark_inode_dirty_sync(inode); + dprintk("<-- %s status %d\n", __func__, status); + return status; +out_unlock: + spin_unlock(&inode->i_lock); + kfree(data); +clear_layoutcommitting: + pnfs_clear_layoutcommitting(inode); + goto out; +} +EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); + +int +pnfs_generic_sync(struct inode *inode, bool datasync) +{ + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_generic_sync); + +struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) +{ + struct nfs4_threshold *thp; + + thp = kzalloc(sizeof(*thp), GFP_NOFS); + if (!thp) { + dprintk("%s mdsthreshold allocation failed\n", __func__); + return NULL; + } + return thp; +} diff --git a/kernel/fs/nfs/pnfs.h b/kernel/fs/nfs/pnfs.h new file mode 100644 index 000000000..1e6308f82 --- /dev/null +++ b/kernel/fs/nfs/pnfs.h @@ -0,0 +1,692 @@ +/* + * pNFS client data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_PNFS_H +#define FS_NFS_PNFS_H + +#include +#include +#include + +enum { + NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ + NFS_LSEG_ROC, /* roc bit received from server */ + NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ + NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */ +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds_addr { + struct sockaddr_storage da_addr; + size_t da_addrlen; + struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *da_remotestr; /* human readable addr+port */ +}; + +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *ds_remotestr; /* comma sep list of addrs */ + struct list_head ds_addrs; + struct nfs_client *ds_clp; + atomic_t ds_count; + unsigned long ds_state; +#define NFS4DS_CONNECTING 0 /* ds is establishing connection */ +}; + +struct pnfs_layout_segment { + struct list_head pls_list; + struct list_head pls_lc_list; + struct pnfs_layout_range pls_range; + atomic_t pls_refcount; + unsigned long pls_flags; + struct pnfs_layout_hdr *pls_layout; + struct work_struct pls_work; +}; + +enum pnfs_try_status { + PNFS_ATTEMPTED = 0, + PNFS_NOT_ATTEMPTED = 1, + PNFS_TRY_AGAIN = 2, +}; + +#ifdef CONFIG_NFS_V4_1 + +#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" + +/* + * Default data server connection timeout and retrans vaules. + * Set by module parameters dataserver_timeo and dataserver_retrans. + */ +#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ +#define NFS4_DEF_DS_RETRANS 5 + +/* error codes for internal use */ +#define NFS4ERR_RESET_TO_MDS 12001 +#define NFS4ERR_RESET_TO_PNFS 12002 + +enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ + NFS_LAYOUT_ROC, /* some lseg had roc bit set */ + NFS_LAYOUT_RETURN, /* Return this layout ASAP */ + NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ + NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ + NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ + NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */ +}; + +enum layoutdriver_policy_flags { + /* Should the pNFS client commit and return the layout upon truncate to + * a smaller size */ + PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, + PNFS_LAYOUTRET_ON_ERROR = 1 << 1, + PNFS_READ_WHOLE_PAGE = 1 << 2, +}; + +struct nfs4_deviceid_node; + +/* Per-layout driver specific registration structure */ +struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; + unsigned flags; + unsigned max_deviceinfo_size; + + int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); + int (*clear_layoutdriver) (struct nfs_server *); + + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); + void (*free_layout_hdr) (struct pnfs_layout_hdr *); + + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + void (*free_lseg) (struct pnfs_layout_segment *lseg); + + void (*return_range) (struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range); + + /* test for nfs page cache coalescing */ + const struct nfs_pageio_ops *pg_read_ops; + const struct nfs_pageio_ops *pg_write_ops; + + struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); + void (*mark_request_commit) (struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); + void (*clear_request_commit) (struct nfs_page *req, + struct nfs_commit_info *cinfo); + int (*scan_commit_lists) (struct nfs_commit_info *cinfo, + int max); + void (*recover_commit_reqs) (struct list_head *list, + struct nfs_commit_info *cinfo); + struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, + struct page *page); + int (*commit_pagelist)(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo); + + int (*sync)(struct inode *inode, bool datasync); + + /* + * Return PNFS_ATTEMPTED to indicate the layout code has attempted + * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS + */ + enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *); + enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int); + + void (*free_deviceid_node) (struct nfs4_deviceid_node *); + struct nfs4_deviceid_node * (*alloc_deviceid_node) + (struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags); + + void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args); + + void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); + int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); + void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args); +}; + +struct pnfs_layout_hdr { + atomic_t plh_refcount; + struct list_head plh_layouts; /* other client layouts */ + struct list_head plh_bulk_destroy; + struct list_head plh_segs; /* layout segments list */ + nfs4_stateid plh_stateid; + atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ + u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_retry_timestamp; + unsigned long plh_flags; + enum pnfs_iomode plh_return_iomode; + loff_t plh_lwb; /* last write byte for layoutcommit */ + struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ + struct inode *plh_inode; +}; + +struct pnfs_device { + struct nfs4_deviceid dev_id; + unsigned int layout_type; + unsigned int mincount; + unsigned int maxcount; /* gdia_maxcount */ + struct page **pages; + unsigned int pgbase; + unsigned int pglen; /* reply buffer length */ + unsigned char nocache : 1;/* May not be cached */ +}; + +#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 + +struct pnfs_devicelist { + unsigned int eof; + unsigned int num_devs; + struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; +}; + +extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + +/* nfs4proc.c */ +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev, + struct rpc_cred *cred); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); + +/* pnfs.c */ +void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_put_lseg(struct pnfs_layout_segment *lseg); +void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); + +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); +void unset_pnfs_layoutdriver(struct nfs_server *); +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, u64 wb_size); +void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *); +int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req); +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); +struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_free_lseg_list(struct list_head *tmp_list); +void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_all_layouts(struct nfs_client *); +int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall); +int pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall); +void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + bool update_barrier); +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, + struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range, + struct nfs4_state *open_state); +int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range); +bool pnfs_roc(struct inode *ino); +void pnfs_roc_release(struct inode *ino); +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); +void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); +int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int pnfs_generic_sync(struct inode *inode, bool datasync); +int pnfs_nfs_generic_sync(struct inode *inode, bool datasync); +int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); +void pnfs_ld_write_done(struct nfs_pgio_header *); +void pnfs_ld_read_done(struct nfs_pgio_header *); +int pnfs_read_resend_pnfs(struct nfs_pgio_header *); +struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags); +void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); + +void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); +int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); +int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); +struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg); + +/* nfs4_deviceid_flags */ +enum { + NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ + NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ + NFS_DEVICEID_NOCACHE, /* device may not be cached */ +}; + +/* pnfs_dev.c */ +struct nfs4_deviceid_node { + struct hlist_node node; + struct hlist_node tmpnode; + const struct pnfs_layoutdriver_type *ld; + const struct nfs_client *nfs_client; + unsigned long flags; + unsigned long timestamp_unavailable; + struct nfs4_deviceid deviceid; + struct rcu_head rcu; + atomic_t ref; +}; + +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask); +void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *, + const struct nfs4_deviceid *); +bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); +bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); +void nfs4_deviceid_purge_client(const struct nfs_client *); + +/* pnfs_nfs.c */ +void pnfs_generic_clear_request_commit(struct nfs_page *req, + struct nfs_commit_info *cinfo); +void pnfs_generic_commit_release(void *calldata); +void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data); +void pnfs_generic_rw_release(void *data); +void pnfs_generic_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo); +int pnfs_generic_commit_pagelist(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo, + int (*initiate_commit)(struct nfs_commit_data *data, + int how)); +int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); +void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); +void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, + gfp_t gfp_flags); +void nfs4_pnfs_v3_ds_connect_unload(void); +void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, + struct nfs4_deviceid_node *devid, unsigned int timeo, + unsigned int retrans, u32 version, u32 minor_version, + rpc_authflavor_t au_flavor); +struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net, + struct xdr_stream *xdr, + gfp_t gfp_flags); +void pnfs_layout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); + +static inline bool nfs_have_layout(struct inode *inode) +{ + return NFS_I(inode)->layout != NULL; +} + +static inline struct nfs4_deviceid_node * +nfs4_get_deviceid(struct nfs4_deviceid_node *d) +{ + atomic_inc(&d->ref); + return d; +} + +static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) + atomic_inc(&lo->plh_refcount); +} + +static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) { + atomic_dec(&lo->plh_refcount); + /* wake up waiters for LAYOUTRETURN as that is not needed */ + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + } +} + +static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags); +} + +static inline struct pnfs_layout_segment * +pnfs_get_lseg(struct pnfs_layout_segment *lseg) +{ + if (lseg) { + atomic_inc(&lseg->pls_refcount); + smp_mb__after_atomic(); + } + return lseg; +} + +/* Return true if a layout driver is being used for this mountpoint */ +static inline int pnfs_enabled_sb(struct nfs_server *nfss) +{ + return nfss->pnfs_curr_ld != NULL; +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, + struct nfs_commit_info *cinfo) +{ + if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0) + return PNFS_NOT_ATTEMPTED; + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo); +} + +static inline struct pnfs_ds_commit_info * +pnfs_get_ds_info(struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld == NULL || ld->get_ds_info == NULL) + return NULL; + return ld->get_ds_info(inode); +} + +static inline void +pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node) +{ + set_bit(NFS_DEVICEID_INVALID, &node->flags); +} + +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, u32 ds_commit_idx) +{ + struct inode *inode = d_inode(req->wb_context->dentry); + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (lseg == NULL || ld->mark_request_commit == NULL) + return false; + ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx); + return true; +} + +static inline bool +pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) +{ + struct inode *inode = d_inode(req->wb_context->dentry); + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld == NULL || ld->clear_request_commit == NULL) + return false; + ld->clear_request_commit(req, cinfo); + return true; +} + +static inline int +pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, + int max) +{ + if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) + return 0; + else + return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); +} + +static inline struct nfs_page * +pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, + struct page *page) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld == NULL || ld->search_commit_reqs == NULL) + return NULL; + return ld->search_commit_reqs(cinfo, page); +} + +/* Should the pNFS client commit and return the layout upon a setattr */ +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_SETATTR; +} + +static inline bool +pnfs_ld_read_whole_page(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; +} + +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return 0; + return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync); +} + +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 || + test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0; +} + +static inline int pnfs_return_layout(struct inode *ino) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_server *nfss = NFS_SERVER(ino); + + if (pnfs_enabled_sb(nfss) && nfsi->layout) + return _pnfs_return_layout(ino); + + return 0; +} + +static inline bool +pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, + struct nfs_server *nfss) +{ + return (dst && src && src->bm != 0 && nfss->pnfs_curr_ld && + nfss->pnfs_curr_ld->id == src->l_type); +} + +#ifdef NFS_DEBUG +void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); +#else +static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) +{ +} +#endif /* NFS_DEBUG */ +#else /* CONFIG_NFS_V4_1 */ + +static inline bool nfs_have_layout(struct inode *inode) +{ + return false; +} + +static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +} + +static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ +} + +static inline struct pnfs_layout_segment * +pnfs_get_lseg(struct pnfs_layout_segment *lseg) +{ + return NULL; +} + +static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline int pnfs_return_layout(struct inode *ino) +{ + return 0; +} + +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ + return 0; +} + +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + return false; +} + +static inline bool +pnfs_ld_read_whole_page(struct inode *inode) +{ + return false; +} + +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + return 0; +} + +static inline bool +pnfs_roc(struct inode *ino) +{ + return false; +} + +static inline void +pnfs_roc_release(struct inode *ino) +{ +} + +static inline void +pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +} + +static inline bool +pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +{ + return false; +} + +static inline void set_pnfs_layoutdriver(struct nfs_server *s, + const struct nfs_fh *mntfh, u32 id) +{ +} + +static inline void unset_pnfs_layoutdriver(struct nfs_server *s) +{ +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, + struct nfs_commit_info *cinfo) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline struct pnfs_ds_commit_info * +pnfs_get_ds_info(struct inode *inode) +{ + return NULL; +} + +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, u32 ds_commit_idx) +{ + return false; +} + +static inline bool +pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) +{ + return false; +} + +static inline int +pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, + int max) +{ + return 0; +} + +static inline struct nfs_page * +pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, + struct page *page) +{ + return NULL; +} + +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + return 0; +} + +static inline bool +pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, + struct nfs_server *nfss) +{ + return false; +} + +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + +static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) +{ + return NULL; +} + +static inline void nfs4_pnfs_v3_ds_connect_unload(void) +{ +} + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* FS_NFS_PNFS_H */ diff --git a/kernel/fs/nfs/pnfs_dev.c b/kernel/fs/nfs/pnfs_dev.c new file mode 100644 index 000000000..2961fcd7a --- /dev/null +++ b/kernel/fs/nfs/pnfs_dev.c @@ -0,0 +1,365 @@ +/* + * Device operations for the pnfs client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * Garth Goodson + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include +#include "nfs4session.h" +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* + * Device ID RCU cache. A device ID is unique per server and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) + +static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfs4_deviceid_lock); + +#ifdef NFS_DEBUG +void +nfs4_print_deviceid(const struct nfs4_deviceid *id) +{ + u32 *p = (u32 *)id; + + dprintk("%s: device id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); +} +EXPORT_SYMBOL_GPL(nfs4_print_deviceid); +#endif + +static inline u32 +nfs4_deviceid_hash(const struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +static struct nfs4_deviceid_node * +_lookup_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) + if (d->ld == ld && d->nfs_client == clp && + !memcmp(&d->deviceid, id, sizeof(*id))) { + if (atomic_read(&d->ref)) + return d; + else + continue; + } + return NULL; +} + +static struct nfs4_deviceid_node * +nfs4_get_device_info(struct nfs_server *server, + const struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d = NULL; + struct pnfs_device *pdev = NULL; + struct page **pages = NULL; + u32 max_resp_sz; + int max_pages; + int rc, i; + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + if (server->pnfs_curr_ld->max_deviceinfo_size && + server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz) + max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size; + max_pages = nfs_page_array_len(0, max_resp_sz); + dprintk("%s: server %p max_resp_sz %u max_pages %d\n", + __func__, server, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(*pdev), gfp_flags); + if (!pdev) + return NULL; + + pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags); + if (!pages) + goto out_free_pdev; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_free_pages; + } + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = server->pnfs_curr_ld->id; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = max_resp_sz; + pdev->mincount = 0; + pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; + + rc = nfs4_proc_getdeviceinfo(server, pdev, cred); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free_pages; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, + gfp_flags); + if (d && pdev->nocache) + set_bit(NFS_DEVICEID_NOCACHE, &d->flags); + +out_free_pages: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); +out_free_pdev: + kfree(pdev); + dprintk("<-- %s d %p\n", __func__, d); + return d; +} + +/* + * Lookup a deviceid in cache and get a reference count on it if found + * + * @clp nfs_client associated with deviceid + * @id deviceid to look up + */ +static struct nfs4_deviceid_node * +__nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, long hash) +{ + struct nfs4_deviceid_node *d; + + rcu_read_lock(); + d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, + hash); + if (d != NULL && !atomic_inc_not_zero(&d->ref)) + d = NULL; + rcu_read_unlock(); + return d; +} + +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask) +{ + long hash = nfs4_deviceid_hash(id); + struct nfs4_deviceid_node *d, *new; + + d = __nfs4_find_get_deviceid(server, id, hash); + if (d) + return d; + + new = nfs4_get_device_info(server, id, cred, gfp_mask); + if (!new) + return new; + + spin_lock(&nfs4_deviceid_lock); + d = __nfs4_find_get_deviceid(server, id, hash); + if (d) { + spin_unlock(&nfs4_deviceid_lock); + server->pnfs_curr_ld->free_deviceid_node(new); + return d; + } + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + atomic_inc(&new->ref); + spin_unlock(&nfs4_deviceid_lock); + + return new; +} +EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); + +/* + * Remove a deviceid from cache + * + * @clp nfs_client associated with deviceid + * @id the deviceid to unhash + * + * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. + */ +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + struct nfs4_deviceid_node *d; + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); + rcu_read_unlock(); + if (!d) { + spin_unlock(&nfs4_deviceid_lock); + return; + } + hlist_del_init_rcu(&d->node); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); + spin_unlock(&nfs4_deviceid_lock); + + /* balance the initial ref set in pnfs_insert_deviceid */ + nfs4_put_deviceid_node(d); +} +EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); + +void +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server, + const struct nfs4_deviceid *id) +{ + INIT_HLIST_NODE(&d->node); + INIT_HLIST_NODE(&d->tmpnode); + d->ld = server->pnfs_curr_ld; + d->nfs_client = server->nfs_client; + d->flags = 0; + d->deviceid = *id; + atomic_set(&d->ref, 1); +} +EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); + +/* + * Dereference a deviceid node and delete it when its reference count drops + * to zero. + * + * @d deviceid node to put + * + * return true iff the node was deleted + * Note that since the test for d->ref == 0 is sufficient to establish + * that the node is no longer hashed in the global device id cache. + */ +bool +nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) +{ + if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) { + if (atomic_add_unless(&d->ref, -1, 2)) + return false; + nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid); + } + if (!atomic_dec_and_test(&d->ref)) + return false; + d->ld->free_deviceid_node(d); + return true; +} +EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); + +void +nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ + node->timestamp_unavailable = jiffies; + set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); + +bool +nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + unsigned long start, end; + + end = jiffies; + start = end - PNFS_DEVICE_RETRY_TIMEOUT; + if (time_in_range(node->timestamp_unavailable, start, end)) + return true; + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + } + return false; +} +EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable); + +static void +_deviceid_purge_client(const struct nfs_client *clp, long hash) +{ + struct nfs4_deviceid_node *d; + HLIST_HEAD(tmp); + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) + if (d->nfs_client == clp && atomic_read(&d->ref)) { + hlist_del_init_rcu(&d->node); + hlist_add_head(&d->tmpnode, &tmp); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); + } + rcu_read_unlock(); + spin_unlock(&nfs4_deviceid_lock); + + if (hlist_empty(&tmp)) + return; + + while (!hlist_empty(&tmp)) { + d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); + hlist_del(&d->tmpnode); + nfs4_put_deviceid_node(d); + } +} + +void +nfs4_deviceid_purge_client(const struct nfs_client *clp) +{ + long h; + + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + return; + for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) + _deviceid_purge_client(clp, h); +} + +/* + * Stop use of all deviceids associated with an nfs_client + */ +void +nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) +{ + struct nfs4_deviceid_node *d; + int i; + + rcu_read_lock(); + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node) + if (d->nfs_client == clp) + set_bit(NFS_DEVICEID_INVALID, &d->flags); + } + rcu_read_unlock(); +} diff --git a/kernel/fs/nfs/pnfs_nfs.c b/kernel/fs/nfs/pnfs_nfs.c new file mode 100644 index 000000000..f37e25b63 --- /dev/null +++ b/kernel/fs/nfs/pnfs_nfs.c @@ -0,0 +1,880 @@ +/* + * Common NFS I/O operations for the pnfs file based + * layout drivers. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tom Haynes + */ + +#include +#include +#include +#include + +#include "nfs4session.h" +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +void pnfs_generic_rw_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + nfs_put_client(hdr->ds_clp); + hdr->mds_ops->rpc_release(data); +} +EXPORT_SYMBOL_GPL(pnfs_generic_rw_release); + +/* Fake up some data that will cause nfs_commit_release to retry the writes. */ +void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data) +{ + struct nfs_page *first = nfs_list_entry(data->pages.next); + + data->task.tk_status = 0; + memcpy(&data->verf.verifier, &first->wb_verf, + sizeof(data->verf.verifier)); + data->verf.verifier.data[0]++; /* ensure verifier mismatch */ +} +EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes); + +void pnfs_generic_write_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} +EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done); + +void pnfs_generic_commit_release(void *calldata) +{ + struct nfs_commit_data *data = calldata; + + data->completion_ops->completion(data); + pnfs_put_lseg(data->lseg); + nfs_put_client(data->ds_clp); + nfs_commitdata_release(data); +} +EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); + +/* The generic layer is about to remove the req from the commit list. + * If this will make the bucket empty, it will need to put the lseg reference. + * Note this must be called holding the inode (/cinfo) lock + */ +void +pnfs_generic_clear_request_commit(struct nfs_page *req, + struct nfs_commit_info *cinfo) +{ + struct pnfs_layout_segment *freeme = NULL; + + if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) + goto out; + cinfo->ds->nwritten--; + if (list_is_singular(&req->wb_list)) { + struct pnfs_commit_bucket *bucket; + + bucket = list_first_entry(&req->wb_list, + struct pnfs_commit_bucket, + written); + freeme = bucket->wlseg; + bucket->wlseg = NULL; + } +out: + nfs_request_remove_commit_list(req, cinfo); + pnfs_put_lseg_locked(freeme); +} +EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); + +static int +pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max) +{ + struct nfs_page *req, *tmp; + int ret = 0; + + list_for_each_entry_safe(req, tmp, src, wb_list) { + if (!nfs_lock_request(req)) + continue; + kref_get(&req->wb_kref); + if (cond_resched_lock(cinfo->lock)) + list_safe_reset_next(req, tmp, wb_list); + nfs_request_remove_commit_list(req, cinfo); + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); + nfs_list_add_request(req, dst); + ret++; + if ((ret == max) && !cinfo->dreq) + break; + } + return ret; +} + +static int +pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo, + int max) +{ + struct list_head *src = &bucket->written; + struct list_head *dst = &bucket->committing; + int ret; + + lockdep_assert_held(cinfo->lock); + ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); + if (ret) { + cinfo->ds->nwritten -= ret; + cinfo->ds->ncommitting += ret; + bucket->clseg = bucket->wlseg; + if (list_empty(src)) + bucket->wlseg = NULL; + else + pnfs_get_lseg(bucket->clseg); + } + return ret; +} + +/* Move reqs from written to committing lists, returning count + * of number moved. + */ +int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, + int max) +{ + int i, rv = 0, cnt; + + lockdep_assert_held(cinfo->lock); + for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { + cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], + cinfo, max); + max -= cnt; + rv += cnt; + } + return rv; +} +EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists); + +/* Pull everything off the committing lists and dump into @dst. */ +void pnfs_generic_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + struct pnfs_commit_bucket *b; + struct pnfs_layout_segment *freeme; + int i; + + lockdep_assert_held(cinfo->lock); +restart: + for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + if (pnfs_generic_transfer_commit_list(&b->written, dst, + cinfo, 0)) { + freeme = b->wlseg; + b->wlseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + spin_lock(cinfo->lock); + goto restart; + } + } + cinfo->ds->nwritten = 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); + +static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) +{ + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_bucket *bucket; + struct pnfs_layout_segment *freeme; + int i; + + for (i = idx; i < fl_cinfo->nbuckets; i++) { + bucket = &fl_cinfo->buckets[i]; + if (list_empty(&bucket->committing)) + continue; + nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i); + spin_lock(cinfo->lock); + freeme = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + } +} + +static unsigned int +pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, + struct list_head *list) +{ + struct pnfs_ds_commit_info *fl_cinfo; + struct pnfs_commit_bucket *bucket; + struct nfs_commit_data *data; + int i; + unsigned int nreq = 0; + + fl_cinfo = cinfo->ds; + bucket = fl_cinfo->buckets; + for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { + if (list_empty(&bucket->committing)) + continue; + data = nfs_commitdata_alloc(); + if (!data) + break; + data->ds_commit_index = i; + spin_lock(cinfo->lock); + data->lseg = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + list_add(&data->pages, list); + nreq++; + } + + /* Clean up on error */ + pnfs_generic_retry_commit(cinfo, i); + return nreq; +} + +/* This follows nfs_commit_list pretty closely */ +int +pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how, struct nfs_commit_info *cinfo, + int (*initiate_commit)(struct nfs_commit_data *data, + int how)) +{ + struct nfs_commit_data *data, *tmp; + LIST_HEAD(list); + unsigned int nreq = 0; + + if (!list_empty(mds_pages)) { + data = nfs_commitdata_alloc(); + if (data != NULL) { + data->lseg = NULL; + list_add(&data->pages, &list); + nreq++; + } else { + nfs_retry_commit(mds_pages, NULL, cinfo, 0); + pnfs_generic_retry_commit(cinfo, 0); + cinfo->completion_ops->error_cleanup(NFS_I(inode)); + return -ENOMEM; + } + } + + nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); + + if (nreq == 0) { + cinfo->completion_ops->error_cleanup(NFS_I(inode)); + goto out; + } + + atomic_add(nreq, &cinfo->mds->rpcs_out); + + list_for_each_entry_safe(data, tmp, &list, pages) { + list_del_init(&data->pages); + if (!data->lseg) { + nfs_init_commit(data, mds_pages, NULL, cinfo); + nfs_initiate_commit(NFS_CLIENT(inode), data, + NFS_PROTO(data->inode), + data->mds_ops, how, 0); + } else { + struct pnfs_commit_bucket *buckets; + + buckets = cinfo->ds->buckets; + nfs_init_commit(data, + &buckets[data->ds_commit_index].committing, + data->lseg, + cinfo); + initiate_commit(data, how); + } + } +out: + cinfo->ds->ncommitting = 0; + return PNFS_ATTEMPTED; +} +EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + * - set to 1 on allocation + * - incremented when a device id maps a data server already in the cache. + * - decremented when deviceid is removed from the cache. + */ +static DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +static void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + printk(KERN_WARNING "%s NULL device\n", __func__); + return; + } + printk(KERN_WARNING " ds %s\n" + " ref count %d\n" + " client %p\n" + " cl_exchange_flags %x\n", + ds->ds_remotestr, + atomic_read(&ds->ds_count), ds->ds_clp, + ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); +} + +static bool +same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) +{ + struct sockaddr_in *a, *b; + struct sockaddr_in6 *a6, *b6; + + if (addr1->sa_family != addr2->sa_family) + return false; + + switch (addr1->sa_family) { + case AF_INET: + a = (struct sockaddr_in *)addr1; + b = (struct sockaddr_in *)addr2; + + if (a->sin_addr.s_addr == b->sin_addr.s_addr && + a->sin_port == b->sin_port) + return true; + break; + + case AF_INET6: + a6 = (struct sockaddr_in6 *)addr1; + b6 = (struct sockaddr_in6 *)addr2; + + /* LINKLOCAL addresses must have matching scope_id */ + if (ipv6_addr_src_scope(&a6->sin6_addr) == + IPV6_ADDR_SCOPE_LINKLOCAL && + a6->sin6_scope_id != b6->sin6_scope_id) + return false; + + if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && + a6->sin6_port == b6->sin6_port) + return true; + break; + + default: + dprintk("%s: unhandled address family: %u\n", + __func__, addr1->sa_family); + return false; + } + + return false; +} + +static bool +_same_data_server_addrs_locked(const struct list_head *dsaddrs1, + const struct list_head *dsaddrs2) +{ + struct nfs4_pnfs_ds_addr *da1, *da2; + + /* step through both lists, comparing as we go */ + for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), + da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); + da1 != NULL && da2 != NULL; + da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), + da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { + if (!same_sockaddr((struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) + return false; + } + if (da1 == NULL && da2 == NULL) + return true; + + return false; +} + +/* + * Lookup DS by addresses. nfs4_ds_cache_lock is held + */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(const struct list_head *dsaddrs) +{ + struct nfs4_pnfs_ds *ds; + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + return ds; + return NULL; +} + +static void destroy_ds(struct nfs4_pnfs_ds *ds) +{ + struct nfs4_pnfs_ds_addr *da; + + dprintk("--> %s\n", __func__); + ifdebug(FACILITY) + print_ds(ds); + + nfs_put_client(ds->ds_clp); + + while (!list_empty(&ds->ds_addrs)) { + da = list_first_entry(&ds->ds_addrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + kfree(ds->ds_remotestr); + kfree(ds); +} + +void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) +{ + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } +} +EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put); + +/* + * Create a string with a human readable address and port to avoid + * complicated setup around many dprinks. + */ +static char * +nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da; + char *remotestr; + size_t len; + char *p; + + len = 3; /* '{', '}' and eol */ + list_for_each_entry(da, dsaddrs, da_node) { + len += strlen(da->da_remotestr) + 1; /* string plus comma */ + } + + remotestr = kzalloc(len, gfp_flags); + if (!remotestr) + return NULL; + + p = remotestr; + *(p++) = '{'; + len--; + list_for_each_entry(da, dsaddrs, da_node) { + size_t ll = strlen(da->da_remotestr); + + if (ll > len) + goto out_err; + + memcpy(p, da->da_remotestr, ll); + p += ll; + len -= ll; + + if (len < 1) + goto out_err; + (*p++) = ','; + len--; + } + if (len < 2) + goto out_err; + *(p++) = '}'; + *p = '\0'; + return remotestr; +out_err: + kfree(remotestr); + return NULL; +} + +/* + * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if + * uncached and return cached struct nfs4_pnfs_ds. + */ +struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; + char *remotestr; + + if (list_empty(dsaddrs)) { + dprintk("%s: no addresses defined\n", __func__); + goto out; + } + + ds = kzalloc(sizeof(*ds), gfp_flags); + if (!ds) + goto out; + + /* this is only used for debugging, so it's ok if its NULL */ + remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); + + spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup_locked(dsaddrs); + if (tmp_ds == NULL) { + INIT_LIST_HEAD(&ds->ds_addrs); + list_splice_init(dsaddrs, &ds->ds_addrs); + ds->ds_remotestr = remotestr; + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + dprintk("%s add new data server %s\n", __func__, + ds->ds_remotestr); + } else { + kfree(remotestr); + kfree(ds); + atomic_inc(&tmp_ds->ds_count); + dprintk("%s data server %s found, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_remotestr, + atomic_read(&tmp_ds->ds_count)); + ds = tmp_ds; + } + spin_unlock(&nfs4_ds_cache_lock); +out: + return ds; +} +EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add); + +static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +{ + might_sleep(); + wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, + TASK_KILLABLE); +} + +static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) +{ + smp_mb__before_atomic(); + clear_bit(NFS4DS_CONNECTING, &ds->ds_state); + smp_mb__after_atomic(); + wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); +} + +static struct nfs_client *(*get_v3_ds_connect)( + struct nfs_client *mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, + int ds_proto, + unsigned int ds_timeo, + unsigned int ds_retrans, + rpc_authflavor_t au_flavor); + +static bool load_v3_ds_connect(void) +{ + if (!get_v3_ds_connect) { + get_v3_ds_connect = symbol_request(nfs3_set_ds_client); + WARN_ON_ONCE(!get_v3_ds_connect); + } + + return(get_v3_ds_connect != NULL); +} + +void nfs4_pnfs_v3_ds_connect_unload(void) +{ + if (get_v3_ds_connect) { + symbol_put(nfs3_set_ds_client); + get_v3_ds_connect = NULL; + } +} +EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); + +static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, + struct nfs4_pnfs_ds *ds, + unsigned int timeo, + unsigned int retrans, + rpc_authflavor_t au_flavor) +{ + struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs4_pnfs_ds_addr *da; + int status = 0; + + dprintk("--> %s DS %s au_flavor %d\n", __func__, + ds->ds_remotestr, au_flavor); + + if (!load_v3_ds_connect()) + goto out; + + list_for_each_entry(da, &ds->ds_addrs, da_node) { + dprintk("%s: DS %s: trying address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + + clp = get_v3_ds_connect(mds_srv->nfs_client, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans, au_flavor); + if (!IS_ERR(clp)) + break; + } + + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + smp_wmb(); + ds->ds_clp = clp; + dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); +out: + return status; +} + +static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, + struct nfs4_pnfs_ds *ds, + unsigned int timeo, + unsigned int retrans, + u32 minor_version, + rpc_authflavor_t au_flavor) +{ + struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs4_pnfs_ds_addr *da; + int status = 0; + + dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, + au_flavor); + + list_for_each_entry(da, &ds->ds_addrs, da_node) { + dprintk("%s: DS %s: trying address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + + clp = nfs4_set_ds_client(mds_srv->nfs_client, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans, minor_version, + au_flavor); + if (!IS_ERR(clp)) + break; + } + + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); + if (status) + goto out_put; + + smp_wmb(); + ds->ds_clp = clp; + dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); +out: + return status; +out_put: + nfs_put_client(clp); + goto out; +} + +/* + * Create an rpc connection to the nfs4_pnfs_ds data server. + * Currently only supports IPv4 and IPv6 addresses. + * If connection fails, make devid unavailable. + */ +void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, + struct nfs4_deviceid_node *devid, unsigned int timeo, + unsigned int retrans, u32 version, + u32 minor_version, rpc_authflavor_t au_flavor) +{ + if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { + int err = 0; + + if (version == 3) { + err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, + retrans, au_flavor); + } else if (version == 4) { + err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, + retrans, minor_version, + au_flavor); + } else { + dprintk("%s: unsupported DS version %d\n", __func__, + version); + err = -EPROTONOSUPPORT; + } + + if (err) + nfs4_mark_deviceid_unavailable(devid); + nfs4_clear_ds_conn_bit(ds); + } else { + nfs4_wait_ds_connect(ds); + } +} +EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); + +/* + * Currently only supports ipv4, ipv6 and one multi-path address. + */ +struct nfs4_pnfs_ds_addr * +nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da = NULL; + char *buf, *portstr; + __be16 port; + int nlen, rlen; + int tmp[2]; + __be32 *p; + char *netid, *match_netid; + size_t len, match_netid_len; + char *startsep = ""; + char *endsep = ""; + + + /* r_netid */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_err; + nlen = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, nlen); + if (unlikely(!p)) + goto out_err; + + netid = kmalloc(nlen+1, gfp_flags); + if (unlikely(!netid)) + goto out_err; + + netid[nlen] = '\0'; + memcpy(netid, p, nlen); + + /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_free_netid; + rlen = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, rlen); + if (unlikely(!p)) + goto out_free_netid; + + /* port is ".ABC.DEF", 8 chars max */ + if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { + dprintk("%s: Invalid address, length %d\n", __func__, + rlen); + goto out_free_netid; + } + buf = kmalloc(rlen + 1, gfp_flags); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_free_netid; + } + buf[rlen] = '\0'; + memcpy(buf, p, rlen); + + /* replace port '.' with '-' */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot in port\n", + __func__); + goto out_free_buf; + } + *portstr = '-'; + + /* find '.' between address and port */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot between address and " + "port\n", __func__); + goto out_free_buf; + } + *portstr = '\0'; + + da = kzalloc(sizeof(*da), gfp_flags); + if (unlikely(!da)) + goto out_free_buf; + + INIT_LIST_HEAD(&da->da_node); + + if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, + sizeof(da->da_addr))) { + dprintk("%s: error parsing address %s\n", __func__, buf); + goto out_free_da; + } + + portstr++; + sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); + port = htons((tmp[0] << 8) | (tmp[1])); + + switch (da->da_addr.ss_family) { + case AF_INET: + ((struct sockaddr_in *)&da->da_addr)->sin_port = port; + da->da_addrlen = sizeof(struct sockaddr_in); + match_netid = "tcp"; + match_netid_len = 3; + break; + + case AF_INET6: + ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; + da->da_addrlen = sizeof(struct sockaddr_in6); + match_netid = "tcp6"; + match_netid_len = 4; + startsep = "["; + endsep = "]"; + break; + + default: + dprintk("%s: unsupported address family: %u\n", + __func__, da->da_addr.ss_family); + goto out_free_da; + } + + if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { + dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", + __func__, netid, match_netid); + goto out_free_da; + } + + /* save human readable address */ + len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; + da->da_remotestr = kzalloc(len, gfp_flags); + + /* NULL is ok, only used for dprintk */ + if (da->da_remotestr) + snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, + buf, endsep, ntohs(port)); + + dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); + kfree(buf); + kfree(netid); + return da; + +out_free_da: + kfree(da); +out_free_buf: + dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); + kfree(buf); +out_free_netid: + kfree(netid); +out_err: + return NULL; +} +EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr); + +void +pnfs_layout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx) +{ + struct list_head *list; + struct pnfs_commit_bucket *buckets; + + spin_lock(cinfo->lock); + buckets = cinfo->ds->buckets; + list = &buckets[ds_commit_idx].written; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg. That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * pnfs_common_clear_request_commit + */ + WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); + buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); + } + set_bit(PG_COMMIT_TO_DS, &req->wb_flags); + cinfo->ds->nwritten++; + spin_unlock(cinfo->lock); + + nfs_request_add_commit_list(req, list, cinfo); +} +EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); + +int +pnfs_nfs_generic_sync(struct inode *inode, bool datasync) +{ + if (datasync) + return 0; + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync); + diff --git a/kernel/fs/nfs/proc.c b/kernel/fs/nfs/proc.c new file mode 100644 index 000000000..b417bbcd9 --- /dev/null +++ b/kernel/fs/nfs/proc.c @@ -0,0 +1,749 @@ +/* + * linux/fs/nfs/proc.c + * + * Copyright (C) 1992, 1993, 1994 Rick Sladkey + * + * OS-independent nfs remote procedure call functions + * + * Tuned by Alan Cox for >3K buffers + * so at last we can have decent(ish) throughput off a + * Sun server. + * + * Coding optimized and cleaned up by Florian La Roche. + * Note: Error returns are optimized for NFS_OK, which isn't translated via + * nfs_stat_to_errno(), but happens to be already the right return code. + * + * Also, the code currently doesn't check the size of the packet, when + * it decodes the packet. + * + * Feel free to fix it and mail me the diffs if it worries you. + * + * Completely rewritten to support the new RPC call interface; + * rewrote and moved the entire XDR stuff to xdr.c + * --Olaf Kirch June 1996 + * + * The code below initializes all auto variables explicitly, otherwise + * it will fail to work as a module (gcc generates a memset call for an + * incomplete struct). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +/* + * Bare-bones access to getattr: this is for nfs_read_super. + */ +static int +nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs_fattr *fattr = info->fattr; + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("%s: call getattr\n", __func__); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + dprintk("%s: reply getattr: %d\n", __func__, status); + if (status) + return status; + dprintk("%s: call statfs\n", __func__); + msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; + msg.rpc_resp = &fsinfo; + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + dprintk("%s: reply statfs: %d\n", __func__, status); + if (status) + return status; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->lease_time = 0; + return 0; +} + +/* + * One function for each procedure in the NFS protocol. + */ +static int +nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) +{ + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call getattr\n"); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply getattr: %d\n", status); + return status; +} + +static int +nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = d_inode(dentry); + struct nfs_sattrargs arg = { + .fh = NFS_FH(inode), + .sattr = sattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = fattr, + }; + int status; + + /* Mask out the non-modebit related stuff from attr->ia_mode */ + sattr->ia_mode &= S_IALLUGO; + + dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr, fattr); + dprintk("NFS reply setattr: %d\n", status); + return status; +} + +static int +nfs_proc_lookup(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + struct nfs_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs_diropok res = { + .fh = fhandle, + .fattr = fattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + + dprintk("NFS call lookup %s\n", name->name); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +static int nfs_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_READLINK], + .rpc_argp = &args, + }; + int status; + + dprintk("NFS call readlink\n"); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + dprintk("NFS reply readlink: %d\n", status); + return status; +} + +struct nfs_createdata { + struct nfs_createargs arg; + struct nfs_diropok res; + struct nfs_fh fhandle; + struct nfs_fattr fattr; +}; + +static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir, + struct dentry *dentry, struct iattr *sattr) +{ + struct nfs_createdata *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data != NULL) { + data->arg.fh = NFS_FH(dir); + data->arg.name = dentry->d_name.name; + data->arg.len = dentry->d_name.len; + data->arg.sattr = sattr; + nfs_fattr_init(&data->fattr); + data->fhandle.size = 0; + data->res.fh = &data->fhandle; + data->res.fattr = &data->fattr; + } + return data; +}; + +static void nfs_free_createdata(const struct nfs_createdata *data) +{ + kfree(data); +} + +static int +nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], + }; + int status = -ENOMEM; + + dprintk("NFS call create %pd\n", dentry); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + nfs_free_createdata(data); +out: + dprintk("NFS reply create: %d\n", status); + return status; +} + +/* + * In NFSv2, mknod is grafted onto the create call. + */ +static int +nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], + }; + umode_t mode; + int status = -ENOMEM; + + dprintk("NFS call mknod %pd\n", dentry); + + mode = sattr->ia_mode; + if (S_ISFIFO(mode)) { + sattr->ia_mode = (mode & ~S_IFMT) | S_IFCHR; + sattr->ia_valid &= ~ATTR_SIZE; + } else if (S_ISCHR(mode) || S_ISBLK(mode)) { + sattr->ia_valid |= ATTR_SIZE; + sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ + } + + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + if (status == -EINVAL && S_ISFIFO(mode)) { + sattr->ia_mode = mode; + nfs_fattr_init(data->res.fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + nfs_free_createdata(data); +out: + dprintk("NFS reply mknod: %d\n", status); + return status; +} + +static int +nfs_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_removeargs arg = { + .fh = NFS_FH(dir), + .name = *name, + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call remove %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + dprintk("NFS reply remove: %d\n", status); + return status; +} + +static void +nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; +} + +static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + rpc_call_start(task); +} + +static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + nfs_mark_for_revalidate(dir); + return 1; +} + +static void +nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; +} + +static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ + rpc_call_start(task); +} + +static int +nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + nfs_mark_for_revalidate(old_dir); + nfs_mark_for_revalidate(new_dir); + return 1; +} + +static int +nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs_linkargs arg = { + .fromfh = NFS_FH(inode), + .tofh = NFS_FH(dir), + .toname = name->name, + .tolen = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_LINK], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call link %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_mark_for_revalidate(inode); + nfs_mark_for_revalidate(dir); + dprintk("NFS reply link: %d\n", status); + return status; +} + +static int +nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) +{ + struct nfs_fh *fh; + struct nfs_fattr *fattr; + struct nfs_symlinkargs arg = { + .fromfh = NFS_FH(dir), + .fromname = dentry->d_name.name, + .fromlen = dentry->d_name.len, + .pages = &page, + .pathlen = len, + .sattr = sattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], + .rpc_argp = &arg, + }; + int status = -ENAMETOOLONG; + + dprintk("NFS call symlink %pd\n", dentry); + + if (len > NFS2_MAXPATHLEN) + goto out; + + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) + goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + /* + * V2 SYMLINK requests don't return any attributes. Setting the + * filehandle size to zero indicates to nfs_instantiate that it + * should fill in the data with a LOOKUP call on the wire. + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr, NULL); + +out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out: + dprintk("NFS reply symlink: %d\n", status); + return status; +} + +static int +nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], + }; + int status = -ENOMEM; + + dprintk("NFS call mkdir %pd\n", dentry); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + nfs_free_createdata(data); +out: + dprintk("NFS reply mkdir: %d\n", status); + return status; +} + +static int +nfs_proc_rmdir(struct inode *dir, struct qstr *name) +{ + struct nfs_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_RMDIR], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call rmdir %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + dprintk("NFS reply rmdir: %d\n", status); + return status; +} + +/* + * The READDIR implementation is somewhat hackish - we pass a temporary + * buffer to the encode function, which installs it in the receive + * the receive iovec. The decode function just parses the reply to make + * sure it is syntactically correct; the entries itself are decoded + * from nfs_readdir by calling the decode_entry function directly. + */ +static int +nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = d_inode(dentry); + struct nfs_readdirargs arg = { + .fh = NFS_FH(dir), + .cookie = cookie, + .count = count, + .pages = pages, + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_READDIR], + .rpc_argp = &arg, + .rpc_cred = cred, + }; + int status; + + dprintk("NFS call readdir %d\n", (unsigned int)cookie); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); + + dprintk("NFS reply readdir: %d\n", status); + return status; +} + +static int +nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *stat) +{ + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_STATFS], + .rpc_argp = fhandle, + .rpc_resp = &fsinfo, + }; + int status; + + dprintk("NFS call statfs\n"); + nfs_fattr_init(stat->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply statfs: %d\n", status); + if (status) + goto out; + stat->tbytes = (u64)fsinfo.blocks * fsinfo.bsize; + stat->fbytes = (u64)fsinfo.bfree * fsinfo.bsize; + stat->abytes = (u64)fsinfo.bavail * fsinfo.bsize; + stat->tfiles = 0; + stat->ffiles = 0; + stat->afiles = 0; +out: + return status; +} + +static int +nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_STATFS], + .rpc_argp = fhandle, + .rpc_resp = &fsinfo, + }; + int status; + + dprintk("NFS call fsinfo\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply fsinfo: %d\n", status); + if (status) + goto out; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->lease_time = 0; +out: + return status; +} + +static int +nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + info->max_link = 0; + info->max_namelen = NFS2_MAXNAMLEN; + return 0; +} + +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ + struct inode *inode = hdr->inode; + + nfs_invalidate_atime(inode); + if (task->tk_status >= 0) { + nfs_refresh_inode(inode, hdr->res.fattr); + /* Emulate the eof flag, which isn't normally needed in NFSv2 + * as it is guaranteed to always return the file attributes + */ + if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) + hdr->res.eof = 1; + } + return 0; +} + +static void nfs_proc_read_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; +} + +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + rpc_call_start(task); + return 0; +} + +static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; +} + +static void nfs_proc_write_setup(struct nfs_pgio_header *hdr, + struct rpc_message *msg) +{ + /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ + hdr->args.stable = NFS_FILE_SYNC; + msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; +} + +static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ + BUG(); +} + +static void +nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +{ + BUG(); +} + +static int +nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(filp); + + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); +} + +/* Helper functions for NFS lock bounds checking */ +#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL) +static int nfs_lock_check_bounds(const struct file_lock *fl) +{ + __s32 start, end; + + start = (__s32)fl->fl_start; + if ((loff_t)start != fl->fl_start) + goto out_einval; + + if (fl->fl_end != OFFSET_MAX) { + end = (__s32)fl->fl_end; + if ((loff_t)end != fl->fl_end) + goto out_einval; + } else + end = NFS_LOCK32_OFFSET_MAX; + + if (start < 0 || start > end) + goto out_einval; + return 0; +out_einval: + return -EINVAL; +} + +static int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static int nfs_return_delegation(struct inode *inode) +{ + nfs_wb_all(inode); + return 0; +} + +static const struct inode_operations nfs_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +static const struct inode_operations nfs_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +const struct nfs_rpc_ops nfs_v2_clientops = { + .version = 2, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, + .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .submount = nfs_submount, + .try_mount = nfs_try_mount, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, + .lookup = nfs_proc_lookup, + .access = NULL, /* access */ + .readlink = nfs_proc_readlink, + .create = nfs_proc_create, + .remove = nfs_proc_remove, + .unlink_setup = nfs_proc_unlink_setup, + .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, + .unlink_done = nfs_proc_unlink_done, + .rename_setup = nfs_proc_rename_setup, + .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, + .rename_done = nfs_proc_rename_done, + .link = nfs_proc_link, + .symlink = nfs_proc_symlink, + .mkdir = nfs_proc_mkdir, + .rmdir = nfs_proc_rmdir, + .readdir = nfs_proc_readdir, + .mknod = nfs_proc_mknod, + .statfs = nfs_proc_statfs, + .fsinfo = nfs_proc_fsinfo, + .pathconf = nfs_proc_pathconf, + .decode_dirent = nfs2_decode_dirent, + .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare, + .read_setup = nfs_proc_read_setup, + .read_done = nfs_read_done, + .write_setup = nfs_proc_write_setup, + .write_done = nfs_write_done, + .commit_setup = nfs_proc_commit_setup, + .commit_rpc_prepare = nfs_proc_commit_rpc_prepare, + .lock = nfs_proc_lock, + .lock_check_bounds = nfs_lock_check_bounds, + .close_context = nfs_close_context, + .have_delegation = nfs_have_delegation, + .return_delegation = nfs_return_delegation, + .alloc_client = nfs_alloc_client, + .init_client = nfs_init_client, + .free_client = nfs_free_client, + .create_server = nfs_create_server, + .clone_server = nfs_clone_server, +}; diff --git a/kernel/fs/nfs/read.c b/kernel/fs/nfs/read.c new file mode 100644 index 000000000..ae0ff7a11 --- /dev/null +++ b/kernel/fs/nfs/read.c @@ -0,0 +1,445 @@ +/* + * linux/fs/nfs/read.c + * + * Block I/O for NFS + * + * Partial copy of Linus' read cache modifications to fs/nfs/file.c + * modified for async RPC by okir@monad.swb.de + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "internal.h" +#include "iostat.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + +static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +static const struct nfs_rw_ops nfs_rw_read_ops; + +static struct kmem_cache *nfs_rdata_cachep; + +static struct nfs_pgio_header *nfs_readhdr_alloc(void) +{ + return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); +} + +static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) +{ + kmem_cache_free(nfs_rdata_cachep, rhdr); +} + +static +int nfs_return_empty_page(struct page *page) +{ + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + unlock_page(page); + return 0; +} + +void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode, bool force_mds, + const struct nfs_pgio_completion_ops *compl_ops) +{ + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_read_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, + server->rsize, 0); +} +EXPORT_SYMBOL_GPL(nfs_pageio_init_read); + +void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) +{ + struct nfs_pgio_mirror *mirror; + + pgio->pg_ops = &nfs_pgio_rw_ops; + + /* read path should never have more than one mirror */ + WARN_ON_ONCE(pgio->pg_mirror_count != 1); + + mirror = &pgio->pg_mirrors[0]; + mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); + +int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, + struct page *page) +{ + struct nfs_page *new; + unsigned int len; + struct nfs_pageio_descriptor pgio; + struct nfs_pgio_mirror *pgm; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + new = nfs_create_request(ctx, page, NULL, 0, len); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); + } + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + nfs_pageio_add_request(&pgio, new); + nfs_pageio_complete(&pgio); + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio.pg_mirror_count != 1); + + pgm = &pgio.pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + + return 0; +} + +static void nfs_readpage_release(struct nfs_page *req) +{ + struct inode *inode = d_inode(req->wb_context->dentry); + + dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), req->wb_bytes, + (long long)req_offset(req)); + + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(inode, req->wb_page, 0); + + unlock_page(req->wb_page); + } + nfs_release_request(req); +} + +static void nfs_page_group_set_uptodate(struct nfs_page *req) +{ + if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) + SetPageUptodate(req->wb_page); +} + +static void nfs_read_completion(struct nfs_pgio_header *hdr) +{ + unsigned long bytes = 0; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out; + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct page *page = req->wb_page; + unsigned long start = req->wb_pgbase; + unsigned long end = req->wb_pgbase + req->wb_bytes; + + if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { + /* note: regions of the page not covered by a + * request are zeroed in nfs_readpage_async / + * readpage_async_filler */ + if (bytes > hdr->good_bytes) { + /* nothing in this request was good, so zero + * the full extent of the request */ + zero_user_segment(page, start, end); + + } else if (hdr->good_bytes - bytes < req->wb_bytes) { + /* part of this request has good bytes, but + * not all. zero the bad bytes */ + start += hdr->good_bytes - bytes; + WARN_ON(start < req->wb_pgbase); + zero_user_segment(page, start, end); + } + } + bytes += req->wb_bytes; + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { + if (bytes <= hdr->good_bytes) + nfs_page_group_set_uptodate(req); + } else + nfs_page_group_set_uptodate(req); + nfs_list_remove_request(req); + nfs_readpage_release(req); + } +out: + hdr->release(hdr); +} + +static void nfs_initiate_read(struct nfs_pgio_header *hdr, + struct rpc_message *msg, + const struct nfs_rpc_ops *rpc_ops, + struct rpc_task_setup *task_setup_data, int how) +{ + struct inode *inode = hdr->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + + task_setup_data->flags |= swap_flags; + rpc_ops->read_setup(hdr, msg); +} + +static void +nfs_async_read_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_readpage_release(req); + } +} + +static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { + .error_cleanup = nfs_async_read_error, + .completion = nfs_read_completion, +}; + +/* + * This is the callback from RPC telling us whether a reply was + * received or some error occurred (timeout or socket shutdown). + */ +static int nfs_readpage_done(struct rpc_task *task, + struct nfs_pgio_header *hdr, + struct inode *inode) +{ + int status = NFS_PROTO(inode)->read_done(task, hdr); + if (status != 0) + return status; + + nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count); + + if (task->tk_status == -ESTALE) { + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_mark_for_revalidate(inode); + } + return 0; +} + +static void nfs_readpage_retry(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; + + /* This is a short read! */ + nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD); + /* Has the server at least made some progress? */ + if (resp->count == 0) { + nfs_set_pgio_error(hdr, -EIO, argp->offset); + return; + } + /* Yes, so retry the read at the end of the hdr */ + hdr->mds_offset += resp->count; + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + rpc_restart_call_prepare(task); +} + +static void nfs_readpage_result(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (hdr->res.eof) { + loff_t bound; + + bound = hdr->args.offset + hdr->res.count; + spin_lock(&hdr->lock); + if (bound < hdr->io_start + hdr->good_bytes) { + set_bit(NFS_IOHDR_EOF, &hdr->flags); + clear_bit(NFS_IOHDR_ERROR, &hdr->flags); + hdr->good_bytes = bound - hdr->io_start; + } + spin_unlock(&hdr->lock); + } else if (hdr->res.count != hdr->args.count) + nfs_readpage_retry(task, hdr); +} + +/* + * Read a page over NFS. + * We read the page synchronously in the following case: + * - The error flag is set for this page. This happens only when a + * previous async read operation failed. + */ +int nfs_readpage(struct file *file, struct page *page) +{ + struct nfs_open_context *ctx; + struct inode *inode = page_file_mapping(page)->host; + int error; + + dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", + page, PAGE_CACHE_SIZE, page_file_index(page)); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); + + /* + * Try to flush any pending writes to the file.. + * + * NOTE! Because we own the page lock, there cannot + * be any new pending writes generated at this point + * for this page (other pages can be written to). + */ + error = nfs_wb_page(inode, page); + if (error) + goto out_unlock; + if (PageUptodate(page)) + goto out_unlock; + + error = -ESTALE; + if (NFS_STALE(inode)) + goto out_unlock; + + if (file == NULL) { + error = -EBADF; + ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (ctx == NULL) + goto out_unlock; + } else + ctx = get_nfs_open_context(nfs_file_open_context(file)); + + if (!IS_SYNC(inode)) { + error = nfs_readpage_from_fscache(ctx, inode, page); + if (error == 0) + goto out; + } + + error = nfs_readpage_async(ctx, inode, page); + +out: + put_nfs_open_context(ctx); + return error; +out_unlock: + unlock_page(page); + return error; +} + +struct nfs_readdesc { + struct nfs_pageio_descriptor *pgio; + struct nfs_open_context *ctx; +}; + +static int +readpage_async_filler(void *data, struct page *page) +{ + struct nfs_readdesc *desc = (struct nfs_readdesc *)data; + struct nfs_page *new; + unsigned int len; + int error; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + + new = nfs_create_request(desc->ctx, page, NULL, 0, len); + if (IS_ERR(new)) + goto out_error; + + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + if (!nfs_pageio_add_request(desc->pgio, new)) { + error = desc->pgio->pg_error; + goto out_unlock; + } + return 0; +out_error: + error = PTR_ERR(new); +out_unlock: + unlock_page(page); + return error; +} + +int nfs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct nfs_pageio_descriptor pgio; + struct nfs_pgio_mirror *pgm; + struct nfs_readdesc desc = { + .pgio = &pgio, + }; + struct inode *inode = mapping->host; + unsigned long npages; + int ret = -ESTALE; + + dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), + nr_pages); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + + if (NFS_STALE(inode)) + goto out; + + if (filp == NULL) { + desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (desc.ctx == NULL) + return -EBADF; + } else + desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + + /* attempt to read as many of the pages as possible from the cache + * - this returns -ENOBUFS immediately if the cookie is negative + */ + ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, + pages, &nr_pages); + if (ret == 0) + goto read_complete; /* all pages were read */ + + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + nfs_pageio_complete(&pgio); + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio.pg_mirror_count != 1); + + pgm = &pgio.pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); +read_complete: + put_nfs_open_context(desc.ctx); +out: + return ret; +} + +int __init nfs_init_readpagecache(void) +{ + nfs_rdata_cachep = kmem_cache_create("nfs_read_data", + sizeof(struct nfs_pgio_header), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_rdata_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_readpagecache(void) +{ + kmem_cache_destroy(nfs_rdata_cachep); +} + +static const struct nfs_rw_ops nfs_rw_read_ops = { + .rw_mode = FMODE_READ, + .rw_alloc_header = nfs_readhdr_alloc, + .rw_free_header = nfs_readhdr_free, + .rw_done = nfs_readpage_done, + .rw_result = nfs_readpage_result, + .rw_initiate = nfs_initiate_read, +}; diff --git a/kernel/fs/nfs/super.c b/kernel/fs/nfs/super.c new file mode 100644 index 000000000..f175b833b --- /dev/null +++ b/kernel/fs/nfs/super.c @@ -0,0 +1,2877 @@ +/* + * linux/fs/nfs/super.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs superblock handling functions + * + * Modularised by Alan Cox , while hacking some + * experimental NFS changes. Modularisation taken straight from SYS5 fs. + * + * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. + * J.S.Peatfield@damtp.cam.ac.uk + * + * Split from inode.c by David Howells + * + * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a + * particular server are held in the same superblock + * - NFS superblocks can have several effective roots to the dentry tree + * - directory type roots are spliced into the tree when a path from one root reaches the root + * of another (see nfs_lookup()) + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "nfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS +#define NFS_TEXT_DATA 1 + +#if IS_ENABLED(CONFIG_NFS_V3) +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif + +enum { + /* Mount options that take no arguments */ + Opt_soft, Opt_hard, + Opt_posix, Opt_noposix, + Opt_cto, Opt_nocto, + Opt_ac, Opt_noac, + Opt_lock, Opt_nolock, + Opt_udp, Opt_tcp, Opt_rdma, + Opt_acl, Opt_noacl, + Opt_rdirplus, Opt_nordirplus, + Opt_sharecache, Opt_nosharecache, + Opt_resvport, Opt_noresvport, + Opt_fscache, Opt_nofscache, + Opt_migration, Opt_nomigration, + + /* Mount options that take integer arguments */ + Opt_port, + Opt_rsize, Opt_wsize, Opt_bsize, + Opt_timeo, Opt_retrans, + Opt_acregmin, Opt_acregmax, + Opt_acdirmin, Opt_acdirmax, + Opt_actimeo, + Opt_namelen, + Opt_mountport, + Opt_mountvers, + Opt_minorversion, + + /* Mount options that take string arguments */ + Opt_nfsvers, + Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, + Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, + Opt_fscache_uniq, + Opt_local_lock, + + /* Special mount options */ + Opt_userspace, Opt_deprecated, Opt_sloppy, + + Opt_err +}; + +static const match_table_t nfs_mount_option_tokens = { + { Opt_userspace, "bg" }, + { Opt_userspace, "fg" }, + { Opt_userspace, "retry=%s" }, + + { Opt_sloppy, "sloppy" }, + + { Opt_soft, "soft" }, + { Opt_hard, "hard" }, + { Opt_deprecated, "intr" }, + { Opt_deprecated, "nointr" }, + { Opt_posix, "posix" }, + { Opt_noposix, "noposix" }, + { Opt_cto, "cto" }, + { Opt_nocto, "nocto" }, + { Opt_ac, "ac" }, + { Opt_noac, "noac" }, + { Opt_lock, "lock" }, + { Opt_nolock, "nolock" }, + { Opt_udp, "udp" }, + { Opt_tcp, "tcp" }, + { Opt_rdma, "rdma" }, + { Opt_acl, "acl" }, + { Opt_noacl, "noacl" }, + { Opt_rdirplus, "rdirplus" }, + { Opt_nordirplus, "nordirplus" }, + { Opt_sharecache, "sharecache" }, + { Opt_nosharecache, "nosharecache" }, + { Opt_resvport, "resvport" }, + { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, + { Opt_nofscache, "nofsc" }, + { Opt_migration, "migration" }, + { Opt_nomigration, "nomigration" }, + + { Opt_port, "port=%s" }, + { Opt_rsize, "rsize=%s" }, + { Opt_wsize, "wsize=%s" }, + { Opt_bsize, "bsize=%s" }, + { Opt_timeo, "timeo=%s" }, + { Opt_retrans, "retrans=%s" }, + { Opt_acregmin, "acregmin=%s" }, + { Opt_acregmax, "acregmax=%s" }, + { Opt_acdirmin, "acdirmin=%s" }, + { Opt_acdirmax, "acdirmax=%s" }, + { Opt_actimeo, "actimeo=%s" }, + { Opt_namelen, "namlen=%s" }, + { Opt_mountport, "mountport=%s" }, + { Opt_mountvers, "mountvers=%s" }, + { Opt_minorversion, "minorversion=%s" }, + + { Opt_nfsvers, "nfsvers=%s" }, + { Opt_nfsvers, "vers=%s" }, + + { Opt_sec, "sec=%s" }, + { Opt_proto, "proto=%s" }, + { Opt_mountproto, "mountproto=%s" }, + { Opt_addr, "addr=%s" }, + { Opt_clientaddr, "clientaddr=%s" }, + { Opt_mounthost, "mounthost=%s" }, + { Opt_mountaddr, "mountaddr=%s" }, + + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_fscache_uniq, "fsc=%s" }, + { Opt_local_lock, "local_lock=%s" }, + + /* The following needs to be listed after all other options */ + { Opt_nfsvers, "v%s" }, + + { Opt_err, NULL } +}; + +enum { + Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, + + Opt_xprt_err +}; + +static const match_table_t nfs_xprt_protocol_tokens = { + { Opt_xprt_udp, "udp" }, + { Opt_xprt_udp6, "udp6" }, + { Opt_xprt_tcp, "tcp" }, + { Opt_xprt_tcp6, "tcp6" }, + { Opt_xprt_rdma, "rdma" }, + + { Opt_xprt_err, NULL } +}; + +enum { + Opt_sec_none, Opt_sec_sys, + Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, + Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp, + Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp, + + Opt_sec_err +}; + +static const match_table_t nfs_secflavor_tokens = { + { Opt_sec_none, "none" }, + { Opt_sec_none, "null" }, + { Opt_sec_sys, "sys" }, + + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + + { Opt_sec_lkey, "lkey" }, + { Opt_sec_lkeyi, "lkeyi" }, + { Opt_sec_lkeyp, "lkeyp" }, + + { Opt_sec_spkm, "spkm3" }, + { Opt_sec_spkmi, "spkm3i" }, + { Opt_sec_spkmp, "spkm3p" }, + + { Opt_sec_err, NULL } +}; + +enum { + Opt_lookupcache_all, Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + +enum { + Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix, + Opt_local_lock_none, + + Opt_local_lock_err +}; + +static match_table_t nfs_local_lock_tokens = { + { Opt_local_lock_all, "all" }, + { Opt_local_lock_flock, "flock" }, + { Opt_local_lock_posix, "posix" }, + { Opt_local_lock_none, "none" }, + + { Opt_local_lock_err, NULL } +}; + +enum { + Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, + Opt_vers_4_1, Opt_vers_4_2, + + Opt_vers_err +}; + +static match_table_t nfs_vers_tokens = { + { Opt_vers_2, "2" }, + { Opt_vers_3, "3" }, + { Opt_vers_4, "4" }, + { Opt_vers_4_0, "4.0" }, + { Opt_vers_4_1, "4.1" }, + { Opt_vers_4_2, "4.2" }, + + { Opt_vers_err, NULL } +}; + +static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); + +struct file_system_type nfs_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .mount = nfs_fs_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; +MODULE_ALIAS_FS("nfs"); +EXPORT_SYMBOL_GPL(nfs_fs_type); + +struct file_system_type nfs_xdev_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .mount = nfs_xdev_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +const struct super_operations nfs_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = nfs_write_inode, + .drop_inode = nfs_drop_inode, + .statfs = nfs_statfs, + .evict_inode = nfs_evict_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, + .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, +}; +EXPORT_SYMBOL_GPL(nfs_sops); + +#if IS_ENABLED(CONFIG_NFS_V4) +static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *); +static int nfs4_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, const char *dev_name); + +struct file_system_type nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs_fs_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; +MODULE_ALIAS_FS("nfs4"); +MODULE_ALIAS("nfs4"); +EXPORT_SYMBOL_GPL(nfs4_fs_type); + +static int __init register_nfs4_fs(void) +{ + return register_filesystem(&nfs4_fs_type); +} + +static void unregister_nfs4_fs(void) +{ + unregister_filesystem(&nfs4_fs_type); +} +#else +static int __init register_nfs4_fs(void) +{ + return 0; +} + +static void unregister_nfs4_fs(void) +{ +} +#endif + +static struct shrinker acl_shrinker = { + .count_objects = nfs_access_cache_count, + .scan_objects = nfs_access_cache_scan, + .seeks = DEFAULT_SEEKS, +}; + +/* + * Register the NFS filesystems + */ +int __init register_nfs_fs(void) +{ + int ret; + + ret = register_filesystem(&nfs_fs_type); + if (ret < 0) + goto error_0; + + ret = register_nfs4_fs(); + if (ret < 0) + goto error_1; + + ret = nfs_register_sysctl(); + if (ret < 0) + goto error_2; + register_shrinker(&acl_shrinker); + return 0; + +error_2: + unregister_nfs4_fs(); +error_1: + unregister_filesystem(&nfs_fs_type); +error_0: + return ret; +} + +/* + * Unregister the NFS filesystems + */ +void __exit unregister_nfs_fs(void) +{ + unregister_shrinker(&acl_shrinker); + nfs_unregister_sysctl(); + unregister_nfs4_fs(); + unregister_filesystem(&nfs_fs_type); +} + +bool nfs_sb_active(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + if (!atomic_inc_not_zero(&sb->s_active)) + return false; + if (atomic_inc_return(&server->active) != 1) + atomic_dec(&sb->s_active); + return true; +} +EXPORT_SYMBOL_GPL(nfs_sb_active); + +void nfs_sb_deactive(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} +EXPORT_SYMBOL_GPL(nfs_sb_deactive); + +/* + * Deliver file system statistics to userspace + */ +int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct nfs_server *server = NFS_SB(dentry->d_sb); + unsigned char blockbits; + unsigned long blockres; + struct nfs_fh *fh = NFS_FH(d_inode(dentry)); + struct nfs_fsstat res; + int error = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out_err; + + error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + if (unlikely(error == -ESTALE)) { + struct dentry *pd_dentry; + + pd_dentry = dget_parent(dentry); + if (pd_dentry != NULL) { + nfs_zap_caches(d_inode(pd_dentry)); + dput(pd_dentry); + } + } + nfs_free_fattr(res.fattr); + if (error < 0) + goto out_err; + + buf->f_type = NFS_SUPER_MAGIC; + + /* + * Current versions of glibc do not correctly handle the + * case where f_frsize != f_bsize. Eventually we want to + * report the value of wtmult in this field. + */ + buf->f_frsize = dentry->d_sb->s_blocksize; + + /* + * On most *nix systems, f_blocks, f_bfree, and f_bavail + * are reported in units of f_frsize. Linux hasn't had + * an f_frsize field in its statfs struct until recently, + * thus historically Linux's sys_statfs reports these + * fields in units of f_bsize. + */ + buf->f_bsize = dentry->d_sb->s_blocksize; + blockbits = dentry->d_sb->s_blocksize_bits; + blockres = (1 << blockbits) - 1; + buf->f_blocks = (res.tbytes + blockres) >> blockbits; + buf->f_bfree = (res.fbytes + blockres) >> blockbits; + buf->f_bavail = (res.abytes + blockres) >> blockbits; + + buf->f_files = res.tfiles; + buf->f_ffree = res.afiles; + + buf->f_namelen = server->namelen; + + return 0; + + out_err: + dprintk("%s: statfs error = %d\n", __func__, -error); + return error; +} +EXPORT_SYMBOL_GPL(nfs_statfs); + +/* + * Map the security flavour number to a name + */ +static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) +{ + static const struct { + rpc_authflavor_t flavour; + const char *str; + } sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = { + /* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */ + { RPC_AUTH_NULL, "null" }, + { RPC_AUTH_UNIX, "sys" }, + { RPC_AUTH_GSS_KRB5, "krb5" }, + { RPC_AUTH_GSS_KRB5I, "krb5i" }, + { RPC_AUTH_GSS_KRB5P, "krb5p" }, + { RPC_AUTH_GSS_LKEY, "lkey" }, + { RPC_AUTH_GSS_LKEYI, "lkeyi" }, + { RPC_AUTH_GSS_LKEYP, "lkeyp" }, + { RPC_AUTH_GSS_SPKM, "spkm" }, + { RPC_AUTH_GSS_SPKMI, "spkmi" }, + { RPC_AUTH_GSS_SPKMP, "spkmp" }, + { UINT_MAX, "unknown" } + }; + int i; + + for (i = 0; sec_flavours[i].flavour != UINT_MAX; i++) { + if (sec_flavours[i].flavour == flavour) + break; + } + return sec_flavours[i].str; +} + +static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + + seq_printf(m, ",mountproto="); + switch (sap->sa_family) { + case AF_INET: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + case AF_INET6: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP6); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP6); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } +} + +static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + + if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE) + return; + + switch (sap->sa_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + seq_printf(m, ",mountaddr=%pI4", &sin->sin_addr.s_addr); + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr); + break; + } + default: + if (showdefaults) + seq_printf(m, ",mountaddr=unspecified"); + } + + if (nfss->mountd_version || showdefaults) + seq_printf(m, ",mountvers=%u", nfss->mountd_version); + if ((nfss->mountd_port && + nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) || + showdefaults) + seq_printf(m, ",mountport=%u", nfss->mountd_port); + + nfs_show_mountd_netid(m, nfss, showdefaults); +} + +#if IS_ENABLED(CONFIG_NFS_V4) +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct nfs_client *clp = nfss->nfs_client; + + seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); +} +#else +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ +} +#endif + +static void nfs_show_nfs_version(struct seq_file *m, + unsigned int version, + unsigned int minorversion) +{ + seq_printf(m, ",vers=%u", version); + if (version == 4) + seq_printf(m, ".%u", minorversion); +} + +/* + * Describe the mount options in force on this server representation + */ +static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + static const struct proc_nfs_info { + int flag; + const char *str; + const char *nostr; + } nfs_info[] = { + { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_POSIX, ",posix", "" }, + { NFS_MOUNT_NOCTO, ",nocto", "" }, + { NFS_MOUNT_NOAC, ",noac", "" }, + { NFS_MOUNT_NONLM, ",nolock", "" }, + { NFS_MOUNT_NOACL, ",noacl", "" }, + { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, + { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, + { 0, NULL, NULL } + }; + const struct proc_nfs_info *nfs_infop; + struct nfs_client *clp = nfss->nfs_client; + u32 version = clp->rpc_ops->version; + int local_flock, local_fcntl; + + nfs_show_nfs_version(m, version, clp->cl_minorversion); + seq_printf(m, ",rsize=%u", nfss->rsize); + seq_printf(m, ",wsize=%u", nfss->wsize); + if (nfss->bsize != 0) + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); + if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults) + seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); + if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults) + seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); + if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults) + seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); + if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) + seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); + for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { + if (nfss->flags & nfs_infop->flag) + seq_puts(m, nfs_infop->str); + else + seq_puts(m, nfs_infop->nostr); + } + rcu_read_lock(); + seq_printf(m, ",proto=%s", + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); + rcu_read_unlock(); + if (version == 4) { + if (nfss->port != NFS_PORT) + seq_printf(m, ",port=%u", nfss->port); + } else + if (nfss->port) + seq_printf(m, ",port=%u", nfss->port); + + seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ); + seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries); + seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); + + if (version != 4) + nfs_show_mountd_options(m, nfss, showdefaults); + else + nfs_show_nfsv4_options(m, nfss, showdefaults); + + if (nfss->options & NFS_OPTION_FSCACHE) + seq_printf(m, ",fsc"); + + if (nfss->options & NFS_OPTION_MIGRATION) + seq_printf(m, ",migration"); + + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + seq_printf(m, ",lookupcache=none"); + else + seq_printf(m, ",lookupcache=pos"); + } + + local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK; + local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL; + + if (!local_flock && !local_fcntl) + seq_printf(m, ",local_lock=none"); + else if (local_flock && local_fcntl) + seq_printf(m, ",local_lock=all"); + else if (local_flock) + seq_printf(m, ",local_lock=flock"); + else + seq_printf(m, ",local_lock=posix"); +} + +/* + * Describe the mount options on this VFS mountpoint + */ +int nfs_show_options(struct seq_file *m, struct dentry *root) +{ + struct nfs_server *nfss = NFS_SB(root->d_sb); + + nfs_show_mount_options(m, nfss, 0); + + rcu_read_lock(); + seq_printf(m, ",addr=%s", + rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient, + RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + + return 0; +} +EXPORT_SYMBOL_GPL(nfs_show_options); + +#if IS_ENABLED(CONFIG_NFS_V4) +#ifdef CONFIG_NFS_V4_1 +static void show_sessions(struct seq_file *m, struct nfs_server *server) +{ + if (nfs4_has_session(server->nfs_client)) + seq_printf(m, ",sessions"); +} +#else +static void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif +#endif + +#ifdef CONFIG_NFS_V4_1 +static void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ + seq_printf(m, ",pnfs="); + if (server->pnfs_curr_ld) + seq_printf(m, "%s", server->pnfs_curr_ld->name); + else + seq_printf(m, "not configured"); +} + +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ + if (nfss->nfs_client && nfss->nfs_client->cl_implid) { + struct nfs41_impl_id *impl_id = nfss->nfs_client->cl_implid; + seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s'," + "date='%llu,%u'", + impl_id->name, impl_id->domain, + impl_id->date.seconds, impl_id->date.nseconds); + } +} +#else +#if IS_ENABLED(CONFIG_NFS_V4) +static void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ +} +#endif +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ +} +#endif + +int nfs_show_devname(struct seq_file *m, struct dentry *root) +{ + char *page = (char *) __get_free_page(GFP_KERNEL); + char *devname, *dummy; + int err = 0; + if (!page) + return -ENOMEM; + devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0); + if (IS_ERR(devname)) + err = PTR_ERR(devname); + else + seq_escape(m, devname, " \t\n\\"); + free_page((unsigned long)page); + return err; +} +EXPORT_SYMBOL_GPL(nfs_show_devname); + +int nfs_show_path(struct seq_file *m, struct dentry *dentry) +{ + seq_puts(m, "/"); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_show_path); + +/* + * Present statistical information for this VFS mountpoint + */ +int nfs_show_stats(struct seq_file *m, struct dentry *root) +{ + int i, cpu; + struct nfs_server *nfss = NFS_SB(root->d_sb); + struct rpc_auth *auth = nfss->client->cl_auth; + struct nfs_iostats totals = { }; + + seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS); + + /* + * Display all mount option settings + */ + seq_printf(m, "\n\topts:\t"); + seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + nfs_show_mount_options(m, nfss, 1); + + seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); + + show_implementation_id(m, nfss); + + seq_printf(m, "\n\tcaps:\t"); + seq_printf(m, "caps=0x%x", nfss->caps); + seq_printf(m, ",wtmult=%u", nfss->wtmult); + seq_printf(m, ",dtsize=%u", nfss->dtsize); + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); + +#if IS_ENABLED(CONFIG_NFS_V4) + if (nfss->nfs_client->rpc_ops->version == 4) { + seq_printf(m, "\n\tnfsv4:\t"); + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + show_sessions(m, nfss); + show_pnfs(m, nfss); + } +#endif + + /* + * Display security flavor in effect for this mount + */ + seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor); + if (auth->au_flavor) + seq_printf(m, ",pseudoflavor=%u", auth->au_flavor); + + /* + * Display superblock I/O counters + */ + for_each_possible_cpu(cpu) { + struct nfs_iostats *stats; + + preempt_disable(); + stats = per_cpu_ptr(nfss->io_stats, cpu); + + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + totals.events[i] += stats->events[i]; + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + totals.bytes[i] += stats->bytes[i]; +#ifdef CONFIG_NFS_FSCACHE + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + totals.fscache[i] += stats->fscache[i]; +#endif + + preempt_enable(); + } + + seq_printf(m, "\n\tevents:\t"); + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + seq_printf(m, "%lu ", totals.events[i]); + seq_printf(m, "\n\tbytes:\t"); + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); +#ifdef CONFIG_NFS_FSCACHE + if (nfss->options & NFS_OPTION_FSCACHE) { + seq_printf(m, "\n\tfsc:\t"); + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + } +#endif + seq_printf(m, "\n"); + + rpc_print_iostats(m, nfss->client); + + return 0; +} +EXPORT_SYMBOL_GPL(nfs_show_stats); + +/* + * Begin unmount by attempting to remove all automounted mountpoints we added + * in response to xdev traversals and referrals + */ +void nfs_umount_begin(struct super_block *sb) +{ + struct nfs_server *server; + struct rpc_clnt *rpc; + + server = NFS_SB(sb); + /* -EIO all pending I/O */ + rpc = server->client_acl; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); + rpc = server->client; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); +} +EXPORT_SYMBOL_GPL(nfs_umount_begin); + +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) +{ + struct nfs_parsed_mount_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data) { + data->acregmin = NFS_DEF_ACREGMIN; + data->acregmax = NFS_DEF_ACREGMAX; + data->acdirmin = NFS_DEF_ACDIRMIN; + data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; + data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; + data->selected_flavor = RPC_AUTH_MAXFLAVOR; + data->minorversion = 0; + data->need_mount = true; + data->net = current->nsproxy->net_ns; + security_init_mnt_opts(&data->lsm_opts); + } + return data; +} + +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ + if (data) { + kfree(data->client_address); + kfree(data->mount_server.hostname); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + kfree(data); + } +} + +/* + * Sanity-check a server address provided by the mount command. + * + * Address family must be initialized, and address must not be + * the ANY address for that family. + */ +static int nfs_verify_server_address(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *sa = (struct sockaddr_in *)addr; + return sa->sin_addr.s_addr != htonl(INADDR_ANY); + } + case AF_INET6: { + struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr; + return !ipv6_addr_any(sa); + } + } + + dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); + return 0; +} + +/* + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. + */ +static void nfs_set_port(struct sockaddr *sap, int *port, + const unsigned short default_port) +{ + if (*port == NFS_UNSPEC_PORT) + *port = default_port; + + rpc_set_port(sap, *port); +} + +/* + * Sanity check the NFS transport protocol. + * + */ +static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + break; + default: + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * For text based NFSv2/v3 mounts, the mount protocol transport default + * settings should depend upon the specified NFS transport. + */ +static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + nfs_validate_transport_protocol(mnt); + + if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || + mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) + return; + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * Add 'flavor' to 'auth_info' if not already present. + * Returns true if 'flavor' ends up in the list, false otherwise + */ +static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, + rpc_authflavor_t flavor) +{ + unsigned int i; + unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors); + + /* make sure this flavor isn't already in the list */ + for (i = 0; i < auth_info->flavor_len; i++) { + if (flavor == auth_info->flavors[i]) + return true; + } + + if (auth_info->flavor_len + 1 >= max_flavor_len) { + dfprintk(MOUNT, "NFS: too many sec= flavors\n"); + return false; + } + + auth_info->flavors[auth_info->flavor_len++] = flavor; + return true; +} + +/* + * Return true if 'match' is in auth_info or auth_info is empty. + * Return false otherwise. + */ +bool nfs_auth_info_match(const struct nfs_auth_info *auth_info, + rpc_authflavor_t match) +{ + int i; + + if (!auth_info->flavor_len) + return true; + + for (i = 0; i < auth_info->flavor_len; i++) { + if (auth_info->flavors[i] == match) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(nfs_auth_info_match); + +/* + * Parse the value of the 'sec=' option. + */ +static int nfs_parse_security_flavors(char *value, + struct nfs_parsed_mount_data *mnt) +{ + substring_t args[MAX_OPT_ARGS]; + rpc_authflavor_t pseudoflavor; + char *p; + + dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); + + while ((p = strsep(&value, ":")) != NULL) { + switch (match_token(p, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + pseudoflavor = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + pseudoflavor = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + pseudoflavor = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + pseudoflavor = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + pseudoflavor = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + pseudoflavor = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + pseudoflavor = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + pseudoflavor = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + pseudoflavor = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + pseudoflavor = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + pseudoflavor = RPC_AUTH_GSS_SPKMP; + break; + default: + dfprintk(MOUNT, + "NFS: sec= option '%s' not recognized\n", p); + return 0; + } + + if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor)) + return 0; + } + + return 1; +} + +static int nfs_parse_version_string(char *string, + struct nfs_parsed_mount_data *mnt, + substring_t *args) +{ + mnt->flags &= ~NFS_MOUNT_VER3; + switch (match_token(string, nfs_vers_tokens, args)) { + case Opt_vers_2: + mnt->version = 2; + break; + case Opt_vers_3: + mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; + case Opt_vers_4: + /* Backward compatibility option. In future, + * the mount program should always supply + * a NFSv4 minor version number. + */ + mnt->version = 4; + break; + case Opt_vers_4_0: + mnt->version = 4; + mnt->minorversion = 0; + break; + case Opt_vers_4_1: + mnt->version = 4; + mnt->minorversion = 1; + break; + case Opt_vers_4_2: + mnt->version = 4; + mnt->minorversion = 2; + break; + default: + return 0; + } + return 1; +} + +static int nfs_get_option_str(substring_t args[], char **option) +{ + kfree(*option); + *option = match_strdup(args); + return !*option; +} + +static int nfs_get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = kstrtoul(string, 10, option); + kfree(string); + + return rc; +} + +/* + * Error-check and convert a string of mount options from user space into + * a data structure. The whole mount string is processed; bad options are + * skipped as they are encountered. If there were no errors, return 1; + * otherwise return 0 (zero). + */ +static int nfs_parse_mount_options(char *raw, + struct nfs_parsed_mount_data *mnt) +{ + char *p, *string, *secdata; + int rc, sloppy = 0, invalid_option = 0; + unsigned short protofamily = AF_UNSPEC; + unsigned short mountfamily = AF_UNSPEC; + + if (!raw) { + dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); + return 1; + } + dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw); + + secdata = alloc_secdata(); + if (!secdata) + goto out_nomem; + + rc = security_sb_copy_data(raw, secdata); + if (rc) + goto out_security_failure; + + rc = security_sb_parse_opts_str(secdata, &mnt->lsm_opts); + if (rc) + goto out_security_failure; + + free_secdata(secdata); + + while ((p = strsep(&raw, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + unsigned long option; + int token; + + if (!*p) + continue; + + dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", p); + + token = match_token(p, nfs_mount_option_tokens, args); + switch (token) { + + /* + * boolean options: foo/nofoo + */ + case Opt_soft: + mnt->flags |= NFS_MOUNT_SOFT; + break; + case Opt_hard: + mnt->flags &= ~NFS_MOUNT_SOFT; + break; + case Opt_posix: + mnt->flags |= NFS_MOUNT_POSIX; + break; + case Opt_noposix: + mnt->flags &= ~NFS_MOUNT_POSIX; + break; + case Opt_cto: + mnt->flags &= ~NFS_MOUNT_NOCTO; + break; + case Opt_nocto: + mnt->flags |= NFS_MOUNT_NOCTO; + break; + case Opt_ac: + mnt->flags &= ~NFS_MOUNT_NOAC; + break; + case Opt_noac: + mnt->flags |= NFS_MOUNT_NOAC; + break; + case Opt_lock: + mnt->flags &= ~NFS_MOUNT_NONLM; + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_nolock: + mnt->flags |= NFS_MOUNT_NONLM; + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_udp: + mnt->flags &= ~NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_tcp: + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_rdma: + mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(p); + break; + case Opt_acl: + mnt->flags &= ~NFS_MOUNT_NOACL; + break; + case Opt_noacl: + mnt->flags |= NFS_MOUNT_NOACL; + break; + case Opt_rdirplus: + mnt->flags &= ~NFS_MOUNT_NORDIRPLUS; + break; + case Opt_nordirplus: + mnt->flags |= NFS_MOUNT_NORDIRPLUS; + break; + case Opt_sharecache: + mnt->flags &= ~NFS_MOUNT_UNSHARED; + break; + case Opt_nosharecache: + mnt->flags |= NFS_MOUNT_UNSHARED; + break; + case Opt_resvport: + mnt->flags &= ~NFS_MOUNT_NORESVPORT; + break; + case Opt_noresvport: + mnt->flags |= NFS_MOUNT_NORESVPORT; + break; + case Opt_fscache: + mnt->options |= NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_nofscache: + mnt->options &= ~NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_migration: + mnt->options |= NFS_OPTION_MIGRATION; + break; + case Opt_nomigration: + mnt->options &= NFS_OPTION_MIGRATION; + break; + + /* + * options that take numeric values + */ + case Opt_port: + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) + goto out_invalid_value; + mnt->nfs_server.port = option; + break; + case Opt_rsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->rsize = option; + break; + case Opt_wsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->wsize = option; + break; + case Opt_bsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->bsize = option; + break; + case Opt_timeo: + if (nfs_get_option_ul(args, &option) || option == 0) + goto out_invalid_value; + mnt->timeo = option; + break; + case Opt_retrans: + if (nfs_get_option_ul(args, &option) || option == 0) + goto out_invalid_value; + mnt->retrans = option; + break; + case Opt_acregmin: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmin = option; + break; + case Opt_acregmax: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmax = option; + break; + case Opt_acdirmin: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acdirmin = option; + break; + case Opt_acdirmax: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acdirmax = option; + break; + case Opt_actimeo: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmin = mnt->acregmax = + mnt->acdirmin = mnt->acdirmax = option; + break; + case Opt_namelen: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->namlen = option; + break; + case Opt_mountport: + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) + goto out_invalid_value; + mnt->mount_server.port = option; + break; + case Opt_mountvers: + if (nfs_get_option_ul(args, &option) || + option < NFS_MNT_VERSION || + option > NFS_MNT3_VERSION) + goto out_invalid_value; + mnt->mount_server.version = option; + break; + case Opt_minorversion: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + if (option > NFS4_MAX_MINOR_VERSION) + goto out_invalid_value; + mnt->minorversion = option; + break; + + /* + * options that take text values + */ + case Opt_nfsvers: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = nfs_parse_version_string(string, mnt, args); + kfree(string); + if (!rc) + goto out_invalid_value; + break; + case Opt_sec: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = nfs_parse_security_flavors(string, mnt); + kfree(string); + if (!rc) { + dfprintk(MOUNT, "NFS: unrecognized " + "security flavor\n"); + return 0; + } + break; + case Opt_proto: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_xprt_protocol_tokens, args); + + protofamily = AF_INET; + switch (token) { + case Opt_xprt_udp6: + protofamily = AF_INET6; + case Opt_xprt_udp: + mnt->flags &= ~NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_xprt_tcp6: + protofamily = AF_INET6; + case Opt_xprt_tcp: + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_xprt_rdma: + /* vector side protocols to TCP */ + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(string); + break; + default: + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); + kfree(string); + return 0; + } + kfree(string); + break; + case Opt_mountproto: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_xprt_protocol_tokens, args); + kfree(string); + + mountfamily = AF_INET; + switch (token) { + case Opt_xprt_udp6: + mountfamily = AF_INET6; + case Opt_xprt_udp: + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_xprt_tcp6: + mountfamily = AF_INET6; + case Opt_xprt_tcp: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_xprt_rdma: /* not used for side protocols */ + default: + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); + return 0; + } + break; + case Opt_addr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->nfs_server.addrlen = + rpc_pton(mnt->net, string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + sizeof(mnt->nfs_server.address)); + kfree(string); + if (mnt->nfs_server.addrlen == 0) + goto out_invalid_address; + break; + case Opt_clientaddr: + if (nfs_get_option_str(args, &mnt->client_address)) + goto out_nomem; + break; + case Opt_mounthost: + if (nfs_get_option_str(args, + &mnt->mount_server.hostname)) + goto out_nomem; + break; + case Opt_mountaddr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->mount_server.addrlen = + rpc_pton(mnt->net, string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + sizeof(mnt->mount_server.address)); + kfree(string); + if (mnt->mount_server.addrlen == 0) + goto out_invalid_address; + break; + case Opt_lookupcache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + return 0; + }; + break; + case Opt_fscache_uniq: + if (nfs_get_option_str(args, &mnt->fscache_uniq)) + goto out_nomem; + mnt->options |= NFS_OPTION_FSCACHE; + break; + case Opt_local_lock: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, nfs_local_lock_tokens, + args); + kfree(string); + switch (token) { + case Opt_local_lock_all: + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_local_lock_flock: + mnt->flags |= NFS_MOUNT_LOCAL_FLOCK; + break; + case Opt_local_lock_posix: + mnt->flags |= NFS_MOUNT_LOCAL_FCNTL; + break; + case Opt_local_lock_none: + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "local_lock argument\n"); + return 0; + }; + break; + + /* + * Special options + */ + case Opt_sloppy: + sloppy = 1; + dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); + break; + case Opt_userspace: + case Opt_deprecated: + dfprintk(MOUNT, "NFS: ignoring mount option " + "'%s'\n", p); + break; + + default: + invalid_option = 1; + dfprintk(MOUNT, "NFS: unrecognized mount option " + "'%s'\n", p); + } + } + + if (!sloppy && invalid_option) + return 0; + + if (mnt->minorversion && mnt->version != 4) + goto out_minorversion_mismatch; + + if (mnt->options & NFS_OPTION_MIGRATION && + (mnt->version != 4 || mnt->minorversion != 0)) + goto out_migration_misuse; + + /* + * verify that any proto=/mountproto= options match the address + * families in the addr=/mountaddr= options. + */ + if (protofamily != AF_UNSPEC && + protofamily != mnt->nfs_server.address.ss_family) + goto out_proto_mismatch; + + if (mountfamily != AF_UNSPEC) { + if (mnt->mount_server.addrlen) { + if (mountfamily != mnt->mount_server.address.ss_family) + goto out_mountproto_mismatch; + } else { + if (mountfamily != mnt->nfs_server.address.ss_family) + goto out_mountproto_mismatch; + } + } + + return 1; + +out_mountproto_mismatch: + printk(KERN_INFO "NFS: mount server address does not match mountproto= " + "option\n"); + return 0; +out_proto_mismatch: + printk(KERN_INFO "NFS: server address does not match proto= option\n"); + return 0; +out_invalid_address: + printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); + return 0; +out_invalid_value: + printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); + return 0; +out_minorversion_mismatch: + printk(KERN_INFO "NFS: mount option vers=%u does not support " + "minorversion=%u\n", mnt->version, mnt->minorversion); + return 0; +out_migration_misuse: + printk(KERN_INFO + "NFS: 'migration' not supported for this NFS version\n"); + return 0; +out_nomem: + printk(KERN_INFO "NFS: not enough memory to parse option\n"); + return 0; +out_security_failure: + free_secdata(secdata); + printk(KERN_INFO "NFS: security options invalid: %d\n", rc); + return 0; +} + +/* + * Ensure that a specified authtype in args->auth_info is supported by + * the server. Returns 0 and sets args->selected_flavor if it's ok, and + * -EACCES if not. + */ +static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, + rpc_authflavor_t *server_authlist, unsigned int count) +{ + rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; + unsigned int i; + + /* + * If the sec= mount option is used, the specified flavor or AUTH_NULL + * must be in the list returned by the server. + * + * AUTH_NULL has a special meaning when it's in the server list - it + * means that the server will ignore the rpc creds, so any flavor + * can be used. + */ + for (i = 0; i < count; i++) { + flavor = server_authlist[i]; + + if (nfs_auth_info_match(&args->auth_info, flavor) || + flavor == RPC_AUTH_NULL) + goto out; + } + + dfprintk(MOUNT, + "NFS: specified auth flavors not supported by server\n"); + return -EACCES; + +out: + args->selected_flavor = flavor; + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); + return 0; +} + +/* + * Use the remote server's MOUNT service to request the NFS file handle + * corresponding to the provided path. + */ +static int nfs_request_mount(struct nfs_parsed_mount_data *args, + struct nfs_fh *root_fh, + rpc_authflavor_t *server_authlist, + unsigned int *server_authlist_len) +{ + struct nfs_mount_request request = { + .sap = (struct sockaddr *) + &args->mount_server.address, + .dirpath = args->nfs_server.export_path, + .protocol = args->mount_server.protocol, + .fh = root_fh, + .noresvport = args->flags & NFS_MOUNT_NORESVPORT, + .auth_flav_len = server_authlist_len, + .auth_flavs = server_authlist, + .net = args->net, + }; + int status; + + if (args->mount_server.version == 0) { + switch (args->version) { + default: + args->mount_server.version = NFS_MNT3_VERSION; + break; + case 2: + args->mount_server.version = NFS_MNT_VERSION; + } + } + request.version = args->mount_server.version; + + if (args->mount_server.hostname) + request.hostname = args->mount_server.hostname; + else + request.hostname = args->nfs_server.hostname; + + /* + * Construct the mount server's address. + */ + if (args->mount_server.address.ss_family == AF_UNSPEC) { + memcpy(request.sap, &args->nfs_server.address, + args->nfs_server.addrlen); + args->mount_server.addrlen = args->nfs_server.addrlen; + } + request.salen = args->mount_server.addrlen; + nfs_set_port(request.sap, &args->mount_server.port, 0); + + /* + * Now ask the mount server to map our export path + * to a file handle. + */ + status = nfs_mount(&request); + if (status != 0) { + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", + request.hostname, status); + return status; + } + + return 0; +} + +static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + int status; + unsigned int i; + bool tried_auth_unix = false; + bool auth_null_in_list = false; + struct nfs_server *server = ERR_PTR(-EACCES); + struct nfs_parsed_mount_data *args = mount_info->parsed; + rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS]; + unsigned int authlist_len = ARRAY_SIZE(authlist); + + status = nfs_request_mount(args, mount_info->mntfh, authlist, + &authlist_len); + if (status) + return ERR_PTR(status); + + /* + * Was a sec= authflavor specified in the options? First, verify + * whether the server supports it, and then just try to use it if so. + */ + if (args->auth_info.flavor_len > 0) { + status = nfs_verify_authflavors(args, authlist, authlist_len); + dfprintk(MOUNT, "NFS: using auth flavor %u\n", + args->selected_flavor); + if (status) + return ERR_PTR(status); + return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + } + + /* + * No sec= option was provided. RFC 2623, section 2.7 suggests we + * SHOULD prefer the flavor listed first. However, some servers list + * AUTH_NULL first. Avoid ever choosing AUTH_NULL. + */ + for (i = 0; i < authlist_len; ++i) { + rpc_authflavor_t flavor; + struct rpcsec_gss_info info; + + flavor = authlist[i]; + switch (flavor) { + case RPC_AUTH_UNIX: + tried_auth_unix = true; + break; + case RPC_AUTH_NULL: + auth_null_in_list = true; + continue; + default: + if (rpcauth_get_gssinfo(flavor, &info) != 0) + continue; + /* Fallthrough */ + } + dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); + args->selected_flavor = flavor; + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + if (!IS_ERR(server)) + return server; + } + + /* + * Nothing we tried so far worked. At this point, give up if we've + * already tried AUTH_UNIX or if the server's list doesn't contain + * AUTH_NULL + */ + if (tried_auth_unix || !auth_null_in_list) + return server; + + /* Last chance! Try AUTH_UNIX */ + dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; + return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); +} + +struct dentry *nfs_try_mount(int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server; + + if (mount_info->parsed->need_mount) + server = nfs_try_mount_request(mount_info, nfs_mod); + else + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + + if (IS_ERR(server)) + return ERR_CAST(server); + + return nfs_fs_mount_common(server, flags, dev_name, mount_info, nfs_mod); +} +EXPORT_SYMBOL_GPL(nfs_try_mount); + +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *end; + + /* Is the host name protected with square brakcets? */ + if (*dev_name == '[') { + end = strchr(++dev_name, ']'); + if (end == NULL || end[1] != ':') + goto out_bad_devname; + + len = end - dev_name; + end++; + } else { + char *comma; + + end = strchr(dev_name, ':'); + if (end == NULL) + goto out_bad_devname; + len = end - dev_name; + + /* kill possible hostname list: not supported */ + comma = strchr(dev_name, ','); + if (comma != NULL && comma < end) + *comma = 0; + } + + if (len > maxnamlen) + goto out_hostname; + + /* N.B. caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (*hostname == NULL) + goto out_nomem; + len = strlen(++end); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(end, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + +/* + * Validate the NFS2/NFS3 mount data + * - fills in the mount root filehandle + * + * For option strings, user space handles the following behaviors: + * + * + DNS: mapping server host name to IP address ("addr=" option) + * + * + failure mode: how to behave if a mount request can't be handled + * immediately ("fg/bg" option) + * + * + retry: how often to retry a mount request ("retry=" option) + * + * + breaking back: trying proto=udp after proto=tcp, v2 after v3, + * mountproto=tcp after mountproto=udp, and so on + */ +static int nfs23_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + struct nfs_mount_data *data = (struct nfs_mount_data *)options; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; + + if (data == NULL) + goto out_no_data; + + args->version = NFS_DEFAULT_VERSION; + switch (data->version) { + case 1: + data->namlen = 0; + case 2: + data->bsize = 0; + case 3: + if (data->flags & NFS_MOUNT_VER3) + goto out_no_v3; + data->root.size = NFS2_FHSIZE; + memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + /* Turn off security negotiation */ + extra_flags |= NFS_MOUNT_SECFLAVOUR; + case 4: + if (data->flags & NFS_MOUNT_SECFLAVOUR) + goto out_no_sec; + case 5: + memset(data->context, 0, sizeof(data->context)); + case 6: + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; + mntfh->size = data->root.size; + args->version = 3; + } else { + mntfh->size = NFS2_FHSIZE; + args->version = 2; + } + + + memcpy(mntfh->data, data->root.data, mntfh->size); + if (mntfh->size < sizeof(mntfh->data)) + memset(mntfh->data + mntfh->size, 0, + sizeof(mntfh->data) - mntfh->size); + + /* + * Translate to nfs_parsed_mount_data, which nfs_fill_super + * can deal with. + */ + args->flags = data->flags & NFS_MOUNT_FLAGMASK; + args->flags |= extra_flags; + args->rsize = data->rsize; + args->wsize = data->wsize; + args->timeo = data->timeo; + args->retrans = data->retrans; + args->acregmin = data->acregmin; + args->acregmax = data->acregmax; + args->acdirmin = data->acdirmin; + args->acdirmax = data->acdirmax; + args->need_mount = false; + + memcpy(sap, &data->addr, sizeof(data->addr)); + args->nfs_server.addrlen = sizeof(data->addr); + args->nfs_server.port = ntohs(data->addr.sin_port); + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if (!(data->flags & NFS_MOUNT_TCP)) + args->nfs_server.protocol = XPRT_TRANSPORT_UDP; + /* N.B. caller will free nfs_server.hostname in all cases */ + args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); + args->namlen = data->namlen; + args->bsize = data->bsize; + + if (data->flags & NFS_MOUNT_SECFLAVOUR) + args->selected_flavor = data->pseudoflavor; + else + args->selected_flavor = RPC_AUTH_UNIX; + if (!args->nfs_server.hostname) + goto out_nomem; + + if (!(data->flags & NFS_MOUNT_NONLM)) + args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + else + args->flags |= (NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + /* + * The legacy version 6 binary mount data from userspace has a + * field used only to transport selinux information into the + * the kernel. To continue to support that functionality we + * have a touch of selinux knowledge here in the NFS code. The + * userspace code converted context=blah to just blah so we are + * converting back to the full string selinux understands. + */ + if (data->context[0]){ +#ifdef CONFIG_SECURITY_SELINUX + int rc; + char *opts_str = kmalloc(sizeof(data->context) + 8, GFP_KERNEL); + if (!opts_str) + return -ENOMEM; + strcpy(opts_str, "context="); + data->context[NFS_MAX_CONTEXT_LEN] = '\0'; + strcat(opts_str, &data->context[0]); + rc = security_sb_parse_opts_str(opts_str, &args->lsm_opts); + kfree(opts_str); + if (rc) + return rc; +#else + return -EINVAL; +#endif + } + + break; + default: + return NFS_TEXT_DATA; + } + + return 0; + +out_no_data: + dfprintk(MOUNT, "NFS: mount program didn't pass any mount data\n"); + return -EINVAL; + +out_no_v3: + dfprintk(MOUNT, "NFS: nfs_mount_data version %d does not support v3\n", + data->version); + return -EINVAL; + +out_no_sec: + dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); + return -ENOMEM; + +out_no_address: + dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); + return -EINVAL; + +out_invalid_fh: + dfprintk(MOUNT, "NFS: invalid root filehandle\n"); + return -EINVAL; +} + +#if IS_ENABLED(CONFIG_NFS_V4) +static int nfs_validate_mount_data(struct file_system_type *fs_type, + void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + if (fs_type == &nfs_fs_type) + return nfs23_validate_mount_data(options, args, mntfh, dev_name); + return nfs4_validate_mount_data(options, args, dev_name); +} +#else +static int nfs_validate_mount_data(struct file_system_type *fs_type, + void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + return nfs23_validate_mount_data(options, args, mntfh, dev_name); +} +#endif + +static int nfs_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + int port = 0; + int max_namelen = PAGE_SIZE; + int max_pathlen = NFS_MAXPATHLEN; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + if (nfs_parse_mount_options((char *)options, args) == 0) + return -EINVAL; + + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if (args->version == 4) { +#if IS_ENABLED(CONFIG_NFS_V4) + port = NFS_PORT; + max_namelen = NFS4_MAXNAMLEN; + max_pathlen = NFS4_MAXPATHLEN; + nfs_validate_transport_protocol(args); + if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; + nfs4_validate_mount_flags(args); +#else + goto out_v4_not_compiled; +#endif /* CONFIG_NFS_V4 */ + } else + nfs_set_mount_transport_protocol(args); + + nfs_set_port(sap, &args->nfs_server.port, port); + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + max_namelen, + &args->nfs_server.export_path, + max_pathlen); + +#if !IS_ENABLED(CONFIG_NFS_V4) +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#else +out_invalid_transport_udp: + dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); + return -EINVAL; +#endif /* !CONFIG_NFS_V4 */ + +out_no_address: + dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); + return -EINVAL; +} + +#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ + | NFS_MOUNT_SECURE \ + | NFS_MOUNT_TCP \ + | NFS_MOUNT_VER3 \ + | NFS_MOUNT_KERBEROS \ + | NFS_MOUNT_NONLM \ + | NFS_MOUNT_BROKEN_SUID \ + | NFS_MOUNT_STRICTLOCK \ + | NFS_MOUNT_LEGACY_INTERFACE) + +#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \ + ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT)) + +static int +nfs_compare_remount_data(struct nfs_server *nfss, + struct nfs_parsed_mount_data *data) +{ + if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK || + data->rsize != nfss->rsize || + data->wsize != nfss->wsize || + data->version != nfss->nfs_client->rpc_ops->version || + data->minorversion != nfss->nfs_client->cl_minorversion || + data->retrans != nfss->client->cl_timeout->to_retries || + !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) || + data->acregmin != nfss->acregmin / HZ || + data->acregmax != nfss->acregmax / HZ || + data->acdirmin != nfss->acdirmin / HZ || + data->acdirmax != nfss->acdirmax / HZ || + data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || + data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || + !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + (struct sockaddr *)&nfss->nfs_client->cl_addr)) + return -EINVAL; + + return 0; +} + +int +nfs_remount(struct super_block *sb, int *flags, char *raw_data) +{ + int error; + struct nfs_server *nfss = sb->s_fs_info; + struct nfs_parsed_mount_data *data; + struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; + struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + u32 nfsvers = nfss->nfs_client->rpc_ops->version; + + sync_filesystem(sb); + + /* + * Userspace mount programs that send binary options generally send + * them populated with default values. We have no way to know which + * ones were explicitly specified. Fall back to legacy behavior and + * just return success. + */ + if ((nfsvers == 4 && (!options4 || options4->version == 1)) || + (nfsvers <= 3 && (!options || (options->version >= 1 && + options->version <= 6)))) + return 0; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + /* fill out struct with values from existing mount */ + data->flags = nfss->flags; + data->rsize = nfss->rsize; + data->wsize = nfss->wsize; + data->retrans = nfss->client->cl_timeout->to_retries; + data->selected_flavor = nfss->client->cl_auth->au_flavor; + data->acregmin = nfss->acregmin / HZ; + data->acregmax = nfss->acregmax / HZ; + data->acdirmin = nfss->acdirmin / HZ; + data->acdirmax = nfss->acdirmax / HZ; + data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; + data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + data->version = nfsvers; + data->minorversion = nfss->nfs_client->cl_minorversion; + data->net = current->nsproxy->net_ns; + memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen); + + /* overwrite those values with any that were specified */ + error = -EINVAL; + if (!nfs_parse_mount_options((char *)options, data)) + goto out; + + /* + * noac is a special case. It implies -o sync, but that's not + * necessarily reflected in the mtab options. do_remount_sb + * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * remount options, so we have to explicitly reset it. + */ + if (data->flags & NFS_MOUNT_NOAC) + *flags |= MS_SYNCHRONOUS; + + /* compare new mount options with old ones */ + error = nfs_compare_remount_data(nfss, data); +out: + kfree(data); + return error; +} +EXPORT_SYMBOL_GPL(nfs_remount); + +/* + * Initialise the common bits of the superblock + */ +inline void nfs_initialise_sb(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + sb->s_magic = NFS_SUPER_MAGIC; + + /* We probably want something more informative here */ + snprintf(sb->s_id, sizeof(sb->s_id), + "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev)); + + if (sb->s_blocksize == 0) + sb->s_blocksize = nfs_block_bits(server->wsize, + &sb->s_blocksize_bits); + + sb->s_bdi = &server->backing_dev_info; + + nfs_super_set_maxbytes(sb, server->maxfilesize); +} + +/* + * Finish setting up an NFS2/3 superblock + */ +void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) +{ + struct nfs_parsed_mount_data *data = mount_info->parsed; + struct nfs_server *server = NFS_SB(sb); + + sb->s_blocksize_bits = 0; + sb->s_blocksize = 0; + sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr; + sb->s_op = server->nfs_client->cl_nfs_mod->sops; + if (data && data->bsize) + sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); + + if (server->nfs_client->rpc_ops->version != 2) { + /* The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_time_gran = 1; + } + + nfs_initialise_sb(sb); +} +EXPORT_SYMBOL_GPL(nfs_fill_super); + +/* + * Finish setting up a cloned NFS2/3/4 superblock + */ +void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) +{ + const struct super_block *old_sb = mount_info->cloned->sb; + struct nfs_server *server = NFS_SB(sb); + + sb->s_blocksize_bits = old_sb->s_blocksize_bits; + sb->s_blocksize = old_sb->s_blocksize; + sb->s_maxbytes = old_sb->s_maxbytes; + sb->s_xattr = old_sb->s_xattr; + sb->s_op = old_sb->s_op; + sb->s_time_gran = 1; + + if (server->nfs_client->rpc_ops->version != 2) { + /* The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + } + + nfs_initialise_sb(sb); +} + +static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) +{ + const struct nfs_server *a = s->s_fs_info; + const struct rpc_clnt *clnt_a = a->client; + const struct rpc_clnt *clnt_b = b->client; + + if ((s->s_flags & NFS_MS_MASK) != (flags & NFS_MS_MASK)) + goto Ebusy; + if (a->nfs_client != b->nfs_client) + goto Ebusy; + if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK) + goto Ebusy; + if (a->wsize != b->wsize) + goto Ebusy; + if (a->rsize != b->rsize) + goto Ebusy; + if (a->acregmin != b->acregmin) + goto Ebusy; + if (a->acregmax != b->acregmax) + goto Ebusy; + if (a->acdirmin != b->acdirmin) + goto Ebusy; + if (a->acdirmax != b->acdirmax) + goto Ebusy; + if (b->auth_info.flavor_len > 0 && + clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + goto Ebusy; + return 1; +Ebusy: + return 0; +} + +struct nfs_sb_mountdata { + struct nfs_server *server; + int mntflags; +}; + +static int nfs_set_super(struct super_block *s, void *data) +{ + struct nfs_sb_mountdata *sb_mntdata = data; + struct nfs_server *server = sb_mntdata->server; + int ret; + + s->s_flags = sb_mntdata->mntflags; + s->s_fs_info = server; + s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; + ret = set_anon_super(s, server); + if (ret == 0) + server->s_dev = s->s_dev; + return ret; +} + +static int nfs_compare_super_address(struct nfs_server *server1, + struct nfs_server *server2) +{ + struct sockaddr *sap1, *sap2; + + sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; + sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; + + if (sap1->sa_family != sap2->sa_family) + return 0; + + switch (sap1->sa_family) { + case AF_INET: { + struct sockaddr_in *sin1 = (struct sockaddr_in *)sap1; + struct sockaddr_in *sin2 = (struct sockaddr_in *)sap2; + if (sin1->sin_addr.s_addr != sin2->sin_addr.s_addr) + return 0; + if (sin1->sin_port != sin2->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin1 = (struct sockaddr_in6 *)sap1; + struct sockaddr_in6 *sin2 = (struct sockaddr_in6 *)sap2; + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) + return 0; + if (sin1->sin6_port != sin2->sin6_port) + return 0; + break; + } + default: + return 0; + } + + return 1; +} + +static int nfs_compare_super(struct super_block *sb, void *data) +{ + struct nfs_sb_mountdata *sb_mntdata = data; + struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb); + int mntflags = sb_mntdata->mntflags; + + if (!nfs_compare_super_address(old, server)) + return 0; + /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */ + if (old->flags & NFS_MOUNT_UNSHARED) + return 0; + if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0) + return 0; + return nfs_compare_mount_options(sb, server, mntflags); +} + +#ifdef CONFIG_NFS_FSCACHE +static void nfs_get_cache_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *parsed, + struct nfs_clone_mount *cloned) +{ + struct nfs_server *nfss = NFS_SB(sb); + char *uniq = NULL; + int ulen = 0; + + nfss->fscache_key = NULL; + nfss->fscache = NULL; + + if (parsed) { + if (!(parsed->options & NFS_OPTION_FSCACHE)) + return; + if (parsed->fscache_uniq) { + uniq = parsed->fscache_uniq; + ulen = strlen(parsed->fscache_uniq); + } + } else if (cloned) { + struct nfs_server *mnt_s = NFS_SB(cloned->sb); + if (!(mnt_s->options & NFS_OPTION_FSCACHE)) + return; + if (mnt_s->fscache_key) { + uniq = mnt_s->fscache_key->key.uniquifier; + ulen = mnt_s->fscache_key->key.uniq_len; + }; + } else + return; + + nfs_fscache_get_super_cookie(sb, uniq, ulen); +} +#else +static void nfs_get_cache_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *parsed, + struct nfs_clone_mount *cloned) +{ +} +#endif + +static int nfs_bdi_register(struct nfs_server *server) +{ + return bdi_register_dev(&server->backing_dev_info, server->s_dev); +} + +int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, + struct nfs_mount_info *mount_info) +{ + int error; + unsigned long kflags = 0, kflags_out = 0; + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + + error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts, + kflags, &kflags_out); + if (error) + goto err; + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; +err: + return error; +} +EXPORT_SYMBOL_GPL(nfs_set_sb_security); + +int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, + struct nfs_mount_info *mount_info) +{ + /* clone any lsm security options from the parent to the new sb */ + if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) + return -ESTALE; + return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); +} +EXPORT_SYMBOL_GPL(nfs_clone_sb_security); + +struct dentry *nfs_fs_mount_common(struct nfs_server *server, + int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct super_block *s; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + .server = server, + }; + int error; + + if (server->flags & NFS_MOUNT_UNSHARED) + compare_super = NULL; + + /* -o noac implies -o sync */ + if (server->flags & NFS_MOUNT_NOAC) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + + if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) + if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); + if (IS_ERR(s)) { + mntroot = ERR_CAST(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) { + mntroot = ERR_PTR(error); + goto error_splat_super; + } + server->super = s; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + mount_info->fill_super(s, mount_info); + nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); + } + + mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); + if (IS_ERR(mntroot)) + goto error_splat_super; + + error = mount_info->set_security(s, mntroot, mount_info); + if (error) + goto error_splat_root; + + s->s_flags |= MS_ACTIVE; + +out: + return mntroot; + +out_err_nosb: + nfs_free_server(server); + goto out; + +error_splat_root: + dput(mntroot); + mntroot = ERR_PTR(error); +error_splat_super: + deactivate_locked_super(s); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_fs_mount_common); + +struct dentry *nfs_fs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_mount_info mount_info = { + .fill_super = nfs_fill_super, + .set_security = nfs_set_sb_security, + }; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + struct nfs_subversion *nfs_mod; + int error; + + mount_info.parsed = nfs_alloc_parsed_mount_data(); + mount_info.mntfh = nfs_alloc_fhandle(); + if (mount_info.parsed == NULL || mount_info.mntfh == NULL) + goto out; + + /* Validate the mount data */ + error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name); + if (error == NFS_TEXT_DATA) + error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name); + if (error < 0) { + mntroot = ERR_PTR(error); + goto out; + } + + nfs_mod = get_nfs_version(mount_info.parsed->version); + if (IS_ERR(nfs_mod)) { + mntroot = ERR_CAST(nfs_mod); + goto out; + } + + mntroot = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod); + + put_nfs_version(nfs_mod); +out: + nfs_free_parsed_mount_data(mount_info.parsed); + nfs_free_fhandle(mount_info.mntfh); + return mntroot; +} +EXPORT_SYMBOL_GPL(nfs_fs_mount); + +/* + * Destroy an NFS2/3 superblock + */ +void nfs_kill_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + dev_t dev = s->s_dev; + + generic_shutdown_super(s); + + nfs_fscache_release_super_cookie(s); + + nfs_free_server(server); + free_anon_bdev(dev); +} +EXPORT_SYMBOL_GPL(nfs_kill_super); + +/* + * Clone an NFS2/3/4 server record on xdev traversal (FSID-change) + */ +static struct dentry * +nfs_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + struct nfs_mount_info mount_info = { + .fill_super = nfs_clone_super, + .set_security = nfs_clone_sb_security, + .cloned = data, + }; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; + + dprintk("--> nfs_xdev_mount()\n"); + + mount_info.mntfh = mount_info.cloned->fh; + + /* create a new volume representation */ + server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); + + if (IS_ERR(server)) + mntroot = ERR_CAST(server); + else + mntroot = nfs_fs_mount_common(server, flags, + dev_name, &mount_info, nfs_mod); + + dprintk("<-- nfs_xdev_mount() = %ld\n", + IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); + return mntroot; +} + +#if IS_ENABLED(CONFIG_NFS_V4) + +static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) +{ + args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3| + NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL); +} + +/* + * Validate NFSv4 mount options + */ +static int nfs4_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; + char *c; + + if (data == NULL) + goto out_no_data; + + args->version = 4; + + switch (data->version) { + case 1: + if (data->host_addrlen > sizeof(args->nfs_server.address)) + goto out_no_address; + if (data->host_addrlen == 0) + goto out_no_address; + args->nfs_server.addrlen = data->host_addrlen; + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) + return -EFAULT; + if (!nfs_verify_server_address(sap)) + goto out_no_address; + args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); + + if (data->auth_flavourlen) { + rpc_authflavor_t pseudoflavor; + if (data->auth_flavourlen > 1) + goto out_inval_auth; + if (copy_from_user(&pseudoflavor, + data->auth_flavours, + sizeof(pseudoflavor))) + return -EFAULT; + args->selected_flavor = pseudoflavor; + } else + args->selected_flavor = RPC_AUTH_UNIX; + + c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + args->nfs_server.hostname = c; + + c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + args->nfs_server.export_path = c; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + + c = strndup_user(data->client_addr.data, 16); + if (IS_ERR(c)) + return PTR_ERR(c); + args->client_address = c; + + /* + * Translate to nfs_parsed_mount_data, which nfs4_fill_super + * can deal with. + */ + + args->flags = data->flags & NFS4_MOUNT_FLAGMASK; + args->rsize = data->rsize; + args->wsize = data->wsize; + args->timeo = data->timeo; + args->retrans = data->retrans; + args->acregmin = data->acregmin; + args->acregmax = data->acregmax; + args->acdirmin = data->acdirmin; + args->acdirmax = data->acdirmax; + args->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(args); + if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; + + break; + default: + return NFS_TEXT_DATA; + } + + return 0; + +out_no_data: + dfprintk(MOUNT, "NFS4: mount program didn't pass any mount data\n"); + return -EINVAL; + +out_inval_auth: + dfprintk(MOUNT, "NFS4: Invalid number of RPC auth flavours %d\n", + data->auth_flavourlen); + return -EINVAL; + +out_no_address: + dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); + return -EINVAL; + +out_invalid_transport_udp: + dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); + return -EINVAL; +} + +/* + * NFS v4 module parameters need to stay in the + * NFS client for backwards compatibility + */ +unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_tcpport; +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600; +/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ +bool nfs4_disable_idmapping = true; +unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +unsigned short send_implementation_id = 1; +char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; +bool recover_lost_locks = false; + +EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); +EXPORT_SYMBOL_GPL(nfs_callback_tcpport); +EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); +EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); +EXPORT_SYMBOL_GPL(max_session_slots); +EXPORT_SYMBOL_GPL(send_implementation_id); +EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); +EXPORT_SYMBOL_GPL(recover_lost_locks); + +#define NFS_CALLBACK_MAXPORTNR (65535U) + +static int param_set_portnr(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +static struct kernel_param_ops param_ops_portnr = { + .set = param_set_portnr, + .get = param_get_uint, +}; +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +module_param(nfs_idmap_cache_timeout, int, 0644); +module_param(nfs4_disable_idmapping, bool, 0644); +module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, + NFS4_CLIENT_ID_UNIQ_LEN, 0600); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); +module_param(max_session_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " + "requests the client will negotiate"); +module_param(send_implementation_id, ushort, 0644); +MODULE_PARM_DESC(send_implementation_id, + "Send implementation ID with NFSv4.1 exchange_id"); +MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); + +module_param(recover_lost_locks, bool, 0644); +MODULE_PARM_DESC(recover_lost_locks, + "If the server reports that a lock might be lost, " + "try to recover it risking data corruption."); + + +#endif /* CONFIG_NFS_V4 */ diff --git a/kernel/fs/nfs/symlink.c b/kernel/fs/nfs/symlink.c new file mode 100644 index 000000000..2d5620065 --- /dev/null +++ b/kernel/fs/nfs/symlink.c @@ -0,0 +1,78 @@ +/* + * linux/fs/nfs/symlink.c + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * nfs symlink handling code + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Symlink caching in the page cache is even more simplistic + * and straight-forward than readdir caching. + */ + +static int nfs_symlink_filler(struct inode *inode, struct page *page) +{ + int error; + + error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE); + if (error < 0) + goto error; + SetPageUptodate(page); + unlock_page(page); + return 0; + +error: + SetPageError(page); + unlock_page(page); + return -EIO; +} + +static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = d_inode(dentry); + struct page *page; + void *err; + + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); + if (err) + goto read_failed; + page = read_cache_page(&inode->i_data, 0, + (filler_t *)nfs_symlink_filler, inode); + if (IS_ERR(page)) { + err = page; + goto read_failed; + } + nd_set_link(nd, kmap(page)); + return page; + +read_failed: + nd_set_link(nd, err); + return NULL; +} + +/* + * symlinks can't do much... + */ +const struct inode_operations nfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = nfs_follow_link, + .put_link = page_put_link, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; diff --git a/kernel/fs/nfs/sysctl.c b/kernel/fs/nfs/sysctl.c new file mode 100644 index 000000000..bb6ed810f --- /dev/null +++ b/kernel/fs/nfs/sysctl.c @@ -0,0 +1,64 @@ +/* + * linux/fs/nfs/sysctl.c + * + * Sysctl interface to NFS parameters + */ +#include +#include +#include +#include +#include +#include +#include + +static struct ctl_table_header *nfs_callback_sysctl_table; + +static struct ctl_table nfs_cb_sysctls[] = { + { + .procname = "nfs_mountpoint_timeout", + .data = &nfs_mountpoint_expiry_timeout, + .maxlen = sizeof(nfs_mountpoint_expiry_timeout), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nfs_congestion_kb", + .data = &nfs_congestion_kb, + .maxlen = sizeof(nfs_congestion_kb), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table nfs_cb_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nfs_cb_sysctls, + }, + { } +}; + +static struct ctl_table nfs_cb_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nfs_cb_sysctl_dir, + }, + { } +}; + +int nfs_register_sysctl(void) +{ + nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root); + if (nfs_callback_sysctl_table == NULL) + return -ENOMEM; + return 0; +} + +void nfs_unregister_sysctl(void) +{ + unregister_sysctl_table(nfs_callback_sysctl_table); + nfs_callback_sysctl_table = NULL; +} diff --git a/kernel/fs/nfs/unlink.c b/kernel/fs/nfs/unlink.c new file mode 100644 index 000000000..fa538b2ba --- /dev/null +++ b/kernel/fs/nfs/unlink.c @@ -0,0 +1,604 @@ +/* + * linux/fs/nfs/unlink.c + * + * nfs sillydelete handling + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "nfs4_fs.h" +#include "iostat.h" +#include "delegation.h" + +#include "nfstrace.h" + +/** + * nfs_free_unlinkdata - release data from a sillydelete operation. + * @data: pointer to unlink structure. + */ +static void +nfs_free_unlinkdata(struct nfs_unlinkdata *data) +{ + iput(data->dir); + put_rpccred(data->cred); + kfree(data->args.name.name); + kfree(data); +} + +#define NAME_ALLOC_LEN(len) ((len+16) & ~15) +/** + * nfs_copy_dname - copy dentry name to data structure + * @dentry: pointer to dentry + * @data: nfs_unlinkdata + */ +static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data) +{ + char *str; + int len = dentry->d_name.len; + + str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL); + if (!str) + return -ENOMEM; + data->args.name.len = len; + data->args.name.name = str; + return 0; +} + +static void nfs_free_dname(struct nfs_unlinkdata *data) +{ + kfree(data->args.name.name); + data->args.name.name = NULL; + data->args.name.len = 0; +} + +static void nfs_dec_sillycount(struct inode *dir) +{ + struct nfs_inode *nfsi = NFS_I(dir); + if (atomic_dec_return(&nfsi->silly_count) == 1) + wake_up(&nfsi->waitqueue); +} + +/** + * nfs_async_unlink_done - Sillydelete post-processing + * @task: rpc_task of the sillydelete + * + * Do the directory attribute update. + */ +static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + struct inode *dir = data->dir; + + trace_nfs_sillyrename_unlink(data, task->tk_status); + if (!NFS_PROTO(dir)->unlink_done(task, dir)) + rpc_restart_call_prepare(task); +} + +/** + * nfs_async_unlink_release - Release the sillydelete data. + * @task: rpc_task of the sillydelete + * + * We need to call nfs_put_unlinkdata as a 'tk_release' task since the + * rpc_task would be freed too. + */ +static void nfs_async_unlink_release(void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + struct super_block *sb = data->dir->i_sb; + + nfs_dec_sillycount(data->dir); + nfs_free_unlinkdata(data); + nfs_sb_deactive(sb); +} + +static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data); +} + +static const struct rpc_call_ops nfs_unlink_ops = { + .rpc_call_done = nfs_async_unlink_done, + .rpc_release = nfs_async_unlink_release, + .rpc_call_prepare = nfs_unlink_prepare, +}; + +static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) +{ + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_unlink_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + struct dentry *alias; + + alias = d_lookup(parent, &data->args.name); + if (alias != NULL) { + int ret; + void *devname_garbage = NULL; + + /* + * Hey, we raced with lookup... See if we need to transfer + * the sillyrename information to the aliased dentry. + */ + nfs_free_dname(data); + ret = nfs_copy_dname(alias, data); + spin_lock(&alias->d_lock); + if (ret == 0 && d_really_is_positive(alias) && + !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { + devname_garbage = alias->d_fsdata; + alias->d_fsdata = data; + alias->d_flags |= DCACHE_NFSFS_RENAMED; + ret = 1; + } else + ret = 0; + spin_unlock(&alias->d_lock); + nfs_dec_sillycount(dir); + dput(alias); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. + */ + kfree(devname_garbage); + return ret; + } + data->dir = igrab(dir); + if (!data->dir) { + nfs_dec_sillycount(dir); + return 0; + } + nfs_sb_active(dir->i_sb); + data->args.fh = NFS_FH(dir); + nfs_fattr_init(data->res.dir_attr); + + NFS_PROTO(dir)->unlink_setup(&msg, dir); + + task_setup_data.rpc_client = NFS_CLIENT(dir); + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task_async(task); + return 1; +} + +static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) +{ + struct dentry *parent; + struct inode *dir; + int ret = 0; + + + parent = dget_parent(dentry); + if (parent == NULL) + goto out_free; + dir = d_inode(parent); + /* Non-exclusive lock protects against concurrent lookup() calls */ + spin_lock(&dir->i_lock); + if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { + /* Deferred delete */ + hlist_add_head(&data->list, &NFS_I(dir)->silly_list); + spin_unlock(&dir->i_lock); + ret = 1; + goto out_dput; + } + spin_unlock(&dir->i_lock); + ret = nfs_do_call_unlink(parent, dir, data); +out_dput: + dput(parent); +out_free: + return ret; +} + +void nfs_wait_on_sillyrename(struct dentry *dentry) +{ + struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); + + wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1); +} + +void nfs_block_sillyrename(struct dentry *dentry) +{ + struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); + + wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1); +} + +void nfs_unblock_sillyrename(struct dentry *dentry) +{ + struct inode *dir = d_inode(dentry); + struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_unlinkdata *data; + + atomic_inc(&nfsi->silly_count); + spin_lock(&dir->i_lock); + while (!hlist_empty(&nfsi->silly_list)) { + if (!atomic_inc_not_zero(&nfsi->silly_count)) + break; + data = hlist_entry(nfsi->silly_list.first, struct nfs_unlinkdata, list); + hlist_del(&data->list); + spin_unlock(&dir->i_lock); + if (nfs_do_call_unlink(dentry, dir, data) == 0) + nfs_free_unlinkdata(data); + spin_lock(&dir->i_lock); + } + spin_unlock(&dir->i_lock); +} + +/** + * nfs_async_unlink - asynchronous unlinking of a file + * @dir: parent directory of dentry + * @dentry: dentry to unlink + */ +static int +nfs_async_unlink(struct inode *dir, struct dentry *dentry) +{ + struct nfs_unlinkdata *data; + int status = -ENOMEM; + void *devname_garbage = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + goto out; + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + status = PTR_ERR(data->cred); + goto out_free; + } + data->res.dir_attr = &data->dir_attr; + + status = -EBUSY; + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out_unlock; + dentry->d_flags |= DCACHE_NFSFS_RENAMED; + devname_garbage = dentry->d_fsdata; + dentry->d_fsdata = data; + spin_unlock(&dentry->d_lock); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. + */ + kfree(devname_garbage); + return 0; +out_unlock: + spin_unlock(&dentry->d_lock); + put_rpccred(data->cred); +out_free: + kfree(data); +out: + return status; +} + +/** + * nfs_complete_unlink - Initialize completion of the sillydelete + * @dentry: dentry to delete + * @inode: inode + * + * Since we're most likely to be called by dentry_iput(), we + * only use the dentry to find the sillydelete. We then copy the name + * into the qstr. + */ +void +nfs_complete_unlink(struct dentry *dentry, struct inode *inode) +{ + struct nfs_unlinkdata *data = NULL; + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + data = dentry->d_fsdata; + dentry->d_fsdata = NULL; + } + spin_unlock(&dentry->d_lock); + + if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) + nfs_free_unlinkdata(data); +} + +/* Cancel a queued async unlink. Called when a sillyrename run fails. */ +static void +nfs_cancel_async_unlink(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + struct nfs_unlinkdata *data = dentry->d_fsdata; + + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + dentry->d_fsdata = NULL; + spin_unlock(&dentry->d_lock); + nfs_free_unlinkdata(data); + return; + } + spin_unlock(&dentry->d_lock); +} + +/** + * nfs_async_rename_done - Sillyrename post-processing + * @task: rpc_task of the sillyrename + * @calldata: nfs_renamedata for the sillyrename + * + * Do the directory attribute updates and the d_move + */ +static void nfs_async_rename_done(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct inode *old_dir = data->old_dir; + struct inode *new_dir = data->new_dir; + struct dentry *old_dentry = data->old_dentry; + + trace_nfs_sillyrename_rename(old_dir, old_dentry, + new_dir, data->new_dentry, task->tk_status); + if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { + rpc_restart_call_prepare(task); + return; + } + + if (data->complete) + data->complete(task, data); +} + +/** + * nfs_async_rename_release - Release the sillyrename data. + * @calldata: the struct nfs_renamedata to be released + */ +static void nfs_async_rename_release(void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct super_block *sb = data->old_dir->i_sb; + + if (d_really_is_positive(data->old_dentry)) + nfs_mark_for_revalidate(d_inode(data->old_dentry)); + + dput(data->old_dentry); + dput(data->new_dentry); + iput(data->old_dir); + iput(data->new_dir); + nfs_sb_deactive(sb); + put_rpccred(data->cred); + kfree(data); +} + +static void nfs_rename_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data); +} + +static const struct rpc_call_ops nfs_rename_ops = { + .rpc_call_done = nfs_async_rename_done, + .rpc_release = nfs_async_rename_release, + .rpc_call_prepare = nfs_rename_prepare, +}; + +/** + * nfs_async_rename - perform an asynchronous rename operation + * @old_dir: directory that currently holds the dentry to be renamed + * @new_dir: target directory for the rename + * @old_dentry: original dentry to be renamed + * @new_dentry: dentry to which the old_dentry should be renamed + * + * It's expected that valid references to the dentries and inodes are held + */ +struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)) +{ + struct nfs_renamedata *data; + struct rpc_message msg = { }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_rename_ops, + .workqueue = nfsiod_workqueue, + .rpc_client = NFS_CLIENT(old_dir), + .flags = RPC_TASK_ASYNC, + }; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return ERR_PTR(-ENOMEM); + task_setup_data.callback_data = data; + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + struct rpc_task *task = ERR_CAST(data->cred); + kfree(data); + return task; + } + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + msg.rpc_cred = data->cred; + + /* set up nfs_renamedata */ + data->old_dir = old_dir; + ihold(old_dir); + data->new_dir = new_dir; + ihold(new_dir); + data->old_dentry = dget(old_dentry); + data->new_dentry = dget(new_dentry); + nfs_fattr_init(&data->old_fattr); + nfs_fattr_init(&data->new_fattr); + data->complete = complete; + + /* set up nfs_renameargs */ + data->args.old_dir = NFS_FH(old_dir); + data->args.old_name = &old_dentry->d_name; + data->args.new_dir = NFS_FH(new_dir); + data->args.new_name = &new_dentry->d_name; + + /* set up nfs_renameres */ + data->res.old_fattr = &data->old_fattr; + data->res.new_fattr = &data->new_fattr; + + nfs_sb_active(old_dir->i_sb); + + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + + return rpc_run_task(&task_setup_data); +} + +/* + * Perform tasks needed when a sillyrename is done such as cancelling the + * queued async unlink if it failed. + */ +static void +nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *dentry = data->old_dentry; + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(dentry); + return; + } + + /* + * vfs_unlink and the like do not issue this when a file is + * sillyrenamed, so do it here. + */ + fsnotify_nameremove(dentry, 0); +} + +#define SILLYNAME_PREFIX ".nfs" +#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) +#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) +#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1) +#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \ + SILLYNAME_FILEID_LEN + \ + SILLYNAME_COUNTER_LEN) + +/** + * nfs_sillyrename - Perform a silly-rename of a dentry + * @dir: inode of directory that contains dentry + * @dentry: dentry to be sillyrenamed + * + * NFSv2/3 is stateless and the server doesn't know when the client is + * holding a file open. To prevent application problems when a file is + * unlinked while it's still open, the client performs a "silly-rename". + * That is, it renames the file to a hidden file in the same directory, + * and only performs the unlink once the last reference to it is put. + * + * The final cleanup is done during dentry_iput. + * + * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server + * could take responsibility for keeping open files referenced. The server + * would also need to ensure that opened-but-deleted files were kept over + * reboots. However, we may not assume a server does so. (RFC 5661 + * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can + * use to advertise that it does this; some day we may take advantage of + * it.)) + */ +int +nfs_sillyrename(struct inode *dir, struct dentry *dentry) +{ + static unsigned int sillycounter; + unsigned char silly[SILLYNAME_LEN + 1]; + unsigned long long fileid; + struct dentry *sdentry; + struct rpc_task *task; + int error = -EBUSY; + + dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n", + dentry, d_count(dentry)); + nfs_inc_stats(dir, NFSIOS_SILLYRENAME); + + /* + * We don't allow a dentry to be silly-renamed twice. + */ + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out; + + fileid = NFS_FILEID(d_inode(dentry)); + + /* Return delegation in anticipation of the rename */ + NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry)); + + sdentry = NULL; + do { + int slen; + dput(sdentry); + sillycounter++; + slen = scnprintf(silly, sizeof(silly), + SILLYNAME_PREFIX "%0*llx%0*x", + SILLYNAME_FILEID_LEN, fileid, + SILLYNAME_COUNTER_LEN, sillycounter); + + dfprintk(VFS, "NFS: trying to rename %pd to %s\n", + dentry, silly); + + sdentry = lookup_one_len(silly, dentry->d_parent, slen); + /* + * N.B. Better to return EBUSY here ... it could be + * dangerous to delete the file while it's in use. + */ + if (IS_ERR(sdentry)) + goto out; + } while (d_inode(sdentry) != NULL); /* need negative lookup */ + + /* queue unlink first. Can't do this from rpc_release as it + * has to allocate memory + */ + error = nfs_async_unlink(dir, dentry); + if (error) + goto out_dput; + + /* populate unlinkdata with the right dname */ + error = nfs_copy_dname(sdentry, + (struct nfs_unlinkdata *)dentry->d_fsdata); + if (error) { + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + + /* run the rename task, undo unlink if it fails */ + task = nfs_async_rename(dir, dir, dentry, sdentry, + nfs_complete_sillyrename); + if (IS_ERR(task)) { + error = -EBUSY; + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + + /* wait for the RPC task to complete, unless a SIGKILL intervenes */ + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + switch (error) { + case 0: + /* The rename succeeded */ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + d_move(dentry, sdentry); + break; + case -ERESTARTSYS: + /* The result of the rename is unknown. Play it safe by + * forcing a new lookup */ + d_drop(dentry); + d_drop(sdentry); + } + rpc_put_task(task); +out_dput: + dput(sdentry); +out: + return error; +} diff --git a/kernel/fs/nfs/write.c b/kernel/fs/nfs/write.c new file mode 100644 index 000000000..dfc19f157 --- /dev/null +++ b/kernel/fs/nfs/write.c @@ -0,0 +1,2019 @@ +/* + * linux/fs/nfs/write.c + * + * Write file data over NFS. + * + * Copyright (C) 1996, 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "nfs4_fs.h" +#include "fscache.h" +#include "pnfs.h" + +#include "nfstrace.h" + +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + +#define MIN_POOL_WRITE (32) +#define MIN_POOL_COMMIT (4) + +/* + * Local function declarations + */ +static void nfs_redirty_request(struct nfs_page *req); +static const struct rpc_call_ops nfs_commit_ops; +static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; +static const struct nfs_commit_completion_ops nfs_commit_completion_ops; +static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, + struct inode *inode); +static struct nfs_page * +nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, + struct page *page); + +static struct kmem_cache *nfs_wdata_cachep; +static mempool_t *nfs_wdata_mempool; +static struct kmem_cache *nfs_cdata_cachep; +static mempool_t *nfs_commit_mempool; + +struct nfs_commit_data *nfs_commitdata_alloc(void) +{ + struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + } + return p; +} +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); + +void nfs_commit_free(struct nfs_commit_data *p) +{ + mempool_free(p, nfs_commit_mempool); +} +EXPORT_SYMBOL_GPL(nfs_commit_free); + +static struct nfs_pgio_header *nfs_writehdr_alloc(void) +{ + struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); + + if (p) + memset(p, 0, sizeof(*p)); + return p; +} + +static void nfs_writehdr_free(struct nfs_pgio_header *hdr) +{ + mempool_free(hdr, nfs_wdata_mempool); +} + +static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) +{ + ctx->error = error; + smp_wmb(); + set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); +} + +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found. + */ +static struct nfs_page * +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) +{ + struct nfs_page *req = NULL; + + if (PagePrivate(page)) + req = (struct nfs_page *)page_private(page); + else if (unlikely(PageSwapCache(page))) + req = nfs_page_search_commits_for_head_request_locked(nfsi, + page); + + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } + + return req; +} + +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. + */ +static struct nfs_page *nfs_page_find_head_request(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *req = NULL; + + spin_lock(&inode->i_lock); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); + spin_unlock(&inode->i_lock); + return req; +} + +/* Adjust the file length if we're writing beyond the end */ +static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) +{ + struct inode *inode = page_file_mapping(page)->host; + loff_t end, i_size; + pgoff_t end_index; + + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (i_size > 0 && page_file_index(page) < end_index) + goto out; + end = page_file_offset(page) + ((loff_t)offset+count); + if (i_size >= end) + goto out; + i_size_write(inode, end); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); +out: + spin_unlock(&inode->i_lock); +} + +/* A writeback failed: mark the page as bad, and invalidate the page cache */ +static void nfs_set_pageerror(struct page *page) +{ + nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); +} + +/* + * nfs_page_group_search_locked + * @head - head request of page group + * @page_offset - offset into page + * + * Search page group with head @head to find a request that contains the + * page offset @page_offset. + * + * Returns a pointer to the first matching nfs request, or NULL if no + * match is found. + * + * Must be called with the page group lock held + */ +static struct nfs_page * +nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) +{ + struct nfs_page *req; + + WARN_ON_ONCE(head != head->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); + + req = head; + do { + if (page_offset >= req->wb_pgbase && + page_offset < (req->wb_pgbase + req->wb_bytes)) + return req; + + req = req->wb_this_page; + } while (req != head); + + return NULL; +} + +/* + * nfs_page_group_covers_page + * @head - head request of page group + * + * Return true if the page group with head @head covers the whole page, + * returns false otherwise + */ +static bool nfs_page_group_covers_page(struct nfs_page *req) +{ + struct nfs_page *tmp; + unsigned int pos = 0; + unsigned int len = nfs_page_length(req->wb_page); + + nfs_page_group_lock(req, false); + + do { + tmp = nfs_page_group_search_locked(req->wb_head, pos); + if (tmp) { + /* no way this should happen */ + WARN_ON_ONCE(tmp->wb_pgbase != pos); + pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); + } + } while (tmp && pos < len); + + nfs_page_group_unlock(req); + WARN_ON_ONCE(pos > len); + return pos == len; +} + +/* We can set the PG_uptodate flag if we see that a write request + * covers the full page. + */ +static void nfs_mark_uptodate(struct nfs_page *req) +{ + if (PageUptodate(req->wb_page)) + return; + if (!nfs_page_group_covers_page(req)) + return; + SetPageUptodate(req->wb_page); +} + +static int wb_priority(struct writeback_control *wbc) +{ + int ret = 0; + if (wbc->for_reclaim) + return FLUSH_HIGHPRI | FLUSH_STABLE; + if (wbc->sync_mode == WB_SYNC_ALL) + ret = FLUSH_COND_STABLE; + if (wbc->for_kupdate || wbc->for_background) + ret |= FLUSH_LOWPRI; + return ret; +} + +/* + * NFS congestion control + */ + +int nfs_congestion_kb; + +#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) +#define NFS_CONGESTION_OFF_THRESH \ + (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) + +static void nfs_set_page_writeback(struct page *page) +{ + struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); + int ret = test_set_page_writeback(page); + + WARN_ON_ONCE(ret != 0); + + if (atomic_long_inc_return(&nfss->writeback) > + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } +} + +static void nfs_end_page_writeback(struct nfs_page *req) +{ + struct inode *inode = page_file_mapping(req->wb_page)->host; + struct nfs_server *nfss = NFS_SERVER(inode); + + if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + return; + + end_page_writeback(req->wb_page); + if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); +} + + +/* nfs_page_group_clear_bits + * @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req) +{ + clear_bit(PG_TEARDOWN, &req->wb_flags); + clear_bit(PG_UNLOCKPAGE, &req->wb_flags); + clear_bit(PG_UPTODATE, &req->wb_flags); + clear_bit(PG_WB_END, &req->wb_flags); + clear_bit(PG_REMOVE, &req->wb_flags); +} + + +/* + * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head - head request of page group, must be holding head lock + * @req - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + * and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. + */ +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, + struct nfs_page *req, bool nonblock) + __releases(&inode->i_lock) +{ + struct nfs_page *tmp; + int ret; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) + nfs_unlock_request(tmp); + + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + + /* grab a ref on the request that will be waited on */ + kref_get(&req->wb_kref); + + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + + /* release ref from nfs_page_find_head_request_locked */ + nfs_release_request(head); + + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; + nfs_release_request(req); + + return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. + */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, + struct nfs_page *old_head) +{ + while (destroy_list) { + struct nfs_page *subreq = destroy_list; + + destroy_list = (subreq->wb_this_page == old_head) ? + NULL : subreq->wb_this_page; + + WARN_ON_ONCE(old_head != subreq->wb_head); + + /* make sure old group is not used */ + subreq->wb_head = subreq; + subreq->wb_this_page = subreq; + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_request(subreq); + + if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { + /* release ref on old head request */ + nfs_release_request(old_head); + + nfs_page_group_clear_bits(subreq); + + /* release the PG_INODE_REF reference */ + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) + nfs_release_request(subreq); + else + WARN_ON_ONCE(1); + } else { + WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); + /* zombie requests have already released the last + * reference and were waiting on the rest of the + * group to complete. Since it's no longer part of a + * group, simply free the request */ + nfs_page_group_clear_bits(subreq); + nfs_free_request(subreq); + } + } +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + * a locked reference, cancelling any pending + * operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *head, *subreq; + struct nfs_page *destroy_list = NULL; + unsigned int total_bytes; + int ret; + +try_again: + total_bytes = 0; + + WARN_ON_ONCE(destroy_list); + + spin_lock(&inode->i_lock); + + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_page_find_head_request_locked(NFS_I(inode), page); + + if (!head) { + spin_unlock(&inode->i_lock); + return NULL; + } + + /* holding inode lock, so always make a non-blocking call to try the + * page group lock */ + ret = nfs_page_group_lock(head, true); + if (ret < 0) { + spin_unlock(&inode->i_lock); + + if (!nonblock && ret == -EAGAIN) { + nfs_page_group_lock_wait(head); + nfs_release_request(head); + goto try_again; + } + + nfs_release_request(head); + return ERR_PTR(ret); + } + + /* lock each request in the page group */ + subreq = head; + do { + /* + * Subrequests are always contiguous, non overlapping + * and in order - but may be repeated (mirrored writes). + */ + if (subreq->wb_offset == (head->wb_offset + total_bytes)) { + /* keep track of how many bytes this group covers */ + total_bytes += subreq->wb_bytes; + } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || + ((subreq->wb_offset + subreq->wb_bytes) > + (head->wb_offset + total_bytes)))) { + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + return ERR_PTR(-EIO); + } + + if (!nfs_lock_request(subreq)) { + /* releases page group bit lock and + * inode spin lock and all references */ + ret = nfs_unroll_locks_and_wait(inode, head, + subreq, nonblock); + + if (ret == 0) + goto try_again; + + return ERR_PTR(ret); + } + + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* Now that all requests are locked, make sure they aren't on any list. + * Commit list removal accounting is done after locks are dropped */ + subreq = head; + do { + nfs_clear_request_commit(subreq); + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* unlink subrequests from head, destroy them later */ + if (head->wb_this_page != head) { + /* destroy list will be terminated by head */ + destroy_list = head->wb_this_page; + head->wb_this_page = head; + + /* change head request to cover whole range that + * the former page group covered */ + head->wb_bytes = total_bytes; + } + + /* + * prepare head request to be added to new pgio descriptor + */ + nfs_page_group_clear_bits(head); + + /* + * some part of the group was still on the inode list - otherwise + * the group wouldn't be involved in async write. + * grab a reference for the head request, iff it needs one. + */ + if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + kref_get(&head->wb_kref); + + nfs_page_group_unlock(head); + + /* drop lock to clean uprequests on destroy list */ + spin_unlock(&inode->i_lock); + + nfs_destroy_unlinked_subrequests(destroy_list, head); + + /* still holds ref on head from nfs_page_find_head_request_locked + * and still has lock on head from lock loop */ + return head; +} + +/* + * Find an associated nfs write request, and prepare to flush it out + * May return an error if the user signalled nfs_wait_on_request(). + */ +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page, bool nonblock) +{ + struct nfs_page *req; + int ret = 0; + + req = nfs_lock_and_join_requests(page, nonblock); + if (!req) + goto out; + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + nfs_set_page_writeback(page); + WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); + + ret = 0; + if (!nfs_pageio_add_request(pgio, req)) { + nfs_redirty_request(req); + ret = pgio->pg_error; + } +out: + return ret; +} + +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) +{ + struct inode *inode = page_file_mapping(page)->host; + int ret; + + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); + + nfs_pageio_cond_complete(pgio, page_file_index(page)); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + ret = 0; + } + return ret; +} + +/* + * Write an mmapped page to the server. + */ +static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +{ + struct nfs_pageio_descriptor pgio; + int err; + + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), + false, &nfs_async_write_completion_ops); + err = nfs_do_writepage(page, wbc, &pgio); + nfs_pageio_complete(&pgio); + if (err < 0) + return err; + if (pgio.pg_error < 0) + return pgio.pg_error; + return 0; +} + +int nfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + + ret = nfs_writepage_locked(page, wbc); + unlock_page(page); + return ret; +} + +static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +{ + int ret; + + ret = nfs_do_writepage(page, wbc, data); + unlock_page(page); + return ret; +} + +int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + unsigned long *bitlock = &NFS_I(inode)->flags; + struct nfs_pageio_descriptor pgio; + int err; + + /* Stop dirtying of new pages while we sync */ + err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (err) + goto out_err; + + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + &nfs_async_write_completion_ops); + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); + nfs_pageio_complete(&pgio); + + clear_bit_unlock(NFS_INO_FLUSHING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_FLUSHING); + + if (err < 0) + goto out_err; + err = pgio.pg_error; + if (err < 0) + goto out_err; + return 0; +out_err: + return err; +} + +/* + * Insert a write request into an inode + */ +static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + WARN_ON_ONCE(req->wb_this_page != req); + + /* Lock the request! */ + nfs_lock_request(req); + + spin_lock(&inode->i_lock); + if (!nfsi->nrequests && + NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + inode->i_version++; + /* + * Swap-space should not get truncated. Hence no need to plug the race + * with invalidate/truncate. + */ + if (likely(!PageSwapCache(req->wb_page))) { + set_bit(PG_MAPPED, &req->wb_flags); + SetPagePrivate(req->wb_page); + set_page_private(req->wb_page, (unsigned long)req); + } + nfsi->nrequests++; + /* this a head request for a page group - mark it as having an + * extra reference so sub groups can follow suit. + * This flag also informs pgio layer when to bump nrequests when + * adding subrequests. */ + WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); + kref_get(&req->wb_kref); + spin_unlock(&inode->i_lock); +} + +/* + * Remove a write request from an inode + */ +static void nfs_inode_remove_request(struct nfs_page *req) +{ + struct inode *inode = d_inode(req->wb_context->dentry); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *head; + + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + head = req->wb_head; + + spin_lock(&inode->i_lock); + if (likely(!PageSwapCache(head->wb_page))) { + set_page_private(head->wb_page, 0); + ClearPagePrivate(head->wb_page); + smp_mb__after_atomic(); + wake_up_page(head->wb_page, PG_private); + clear_bit(PG_MAPPED, &head->wb_flags); + } + nfsi->nrequests--; + spin_unlock(&inode->i_lock); + } else { + spin_lock(&inode->i_lock); + nfsi->nrequests--; + spin_unlock(&inode->i_lock); + } + + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + nfs_release_request(req); +} + +static void +nfs_mark_request_dirty(struct nfs_page *req) +{ + __set_page_dirty_nobuffers(req->wb_page); +} + +/* + * nfs_page_search_commits_for_head_request_locked + * + * Search through commit lists on @inode for the head request for @page. + * Must be called while holding the inode (which is cinfo) lock. + * + * Returns the head request if found, or NULL if not found. + */ +static struct nfs_page * +nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, + struct page *page) +{ + struct nfs_page *freq, *t; + struct nfs_commit_info cinfo; + struct inode *inode = &nfsi->vfs_inode; + + nfs_init_cinfo_from_inode(&cinfo, inode); + + /* search through pnfs commit lists */ + freq = pnfs_search_commit_reqs(inode, &cinfo, page); + if (freq) + return freq->wb_head; + + /* Linearly search the commit list for the correct request */ + list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { + if (freq->wb_page == page) + return freq->wb_head; + } + + return NULL; +} + +/** + * nfs_request_add_commit_list - add request to a commit list + * @req: pointer to a struct nfs_page + * @dst: commit list head + * @cinfo: holds list lock and accounting info + * + * This sets the PG_CLEAN bit, updates the cinfo count of + * number of outstanding requests requiring a commit as well as + * the MM page stats. + * + * The caller must _not_ hold the cinfo->lock, but must be + * holding the nfs_page lock. + */ +void +nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + set_bit(PG_CLEAN, &(req)->wb_flags); + spin_lock(cinfo->lock); + nfs_list_add_request(req, dst); + cinfo->mds->ncommit++; + spin_unlock(cinfo->lock); + if (!cinfo->dreq) + nfs_mark_page_unstable(req->wb_page); +} +EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); + +/** + * nfs_request_remove_commit_list - Remove request from a commit list + * @req: pointer to a nfs_page + * @cinfo: holds list lock and accounting info + * + * This clears the PG_CLEAN bit, and updates the cinfo's count of + * number of outstanding requests requiring a commit + * It does not update the MM page stats. + * + * The caller _must_ hold the cinfo->lock and the nfs_page lock. + */ +void +nfs_request_remove_commit_list(struct nfs_page *req, + struct nfs_commit_info *cinfo) +{ + if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) + return; + nfs_list_remove_request(req); + cinfo->mds->ncommit--; +} +EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); + +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, + struct inode *inode) +{ + cinfo->lock = &inode->i_lock; + cinfo->mds = &NFS_I(inode)->commit_info; + cinfo->ds = pnfs_get_ds_info(inode); + cinfo->dreq = NULL; + cinfo->completion_ops = &nfs_commit_completion_ops; +} + +void nfs_init_cinfo(struct nfs_commit_info *cinfo, + struct inode *inode, + struct nfs_direct_req *dreq) +{ + if (dreq) + nfs_init_cinfo_from_dreq(cinfo, dreq); + else + nfs_init_cinfo_from_inode(cinfo, inode); +} +EXPORT_SYMBOL_GPL(nfs_init_cinfo); + +/* + * Add a request to the inode's commit list. + */ +void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, u32 ds_commit_idx) +{ + if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx)) + return; + nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); +} + +static void +nfs_clear_page_commit(struct page *page) +{ + dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); +} + +/* Called holding inode (/cinfo) lock */ +static void +nfs_clear_request_commit(struct nfs_page *req) +{ + if (test_bit(PG_CLEAN, &req->wb_flags)) { + struct inode *inode = d_inode(req->wb_context->dentry); + struct nfs_commit_info cinfo; + + nfs_init_cinfo_from_inode(&cinfo, inode); + if (!pnfs_clear_request_commit(req, &cinfo)) { + nfs_request_remove_commit_list(req, &cinfo); + } + nfs_clear_page_commit(req->wb_page); + } +} + +int nfs_write_need_commit(struct nfs_pgio_header *hdr) +{ + if (hdr->verf.committed == NFS_DATA_SYNC) + return hdr->lseg == NULL; + return hdr->verf.committed != NFS_FILE_SYNC; +} + +static void nfs_write_completion(struct nfs_pgio_header *hdr) +{ + struct nfs_commit_info cinfo; + unsigned long bytes = 0; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out; + nfs_init_cinfo_from_inode(&cinfo, hdr->inode); + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + + bytes += req->wb_bytes; + nfs_list_remove_request(req); + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && + (hdr->good_bytes < bytes)) { + nfs_set_pageerror(req->wb_page); + nfs_context_set_write_error(req->wb_context, hdr->error); + goto remove_req; + } + if (nfs_write_need_commit(hdr)) { + memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); + nfs_mark_request_commit(req, hdr->lseg, &cinfo, + hdr->pgio_mirror_idx); + goto next; + } +remove_req: + nfs_inode_remove_request(req); +next: + nfs_unlock_request(req); + nfs_end_page_writeback(req); + nfs_release_request(req); + } +out: + hdr->release(hdr); +} + +unsigned long +nfs_reqs_to_commit(struct nfs_commit_info *cinfo) +{ + return cinfo->mds->ncommit; +} + +/* cinfo->lock held by caller */ +int +nfs_scan_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max) +{ + struct nfs_page *req, *tmp; + int ret = 0; + + list_for_each_entry_safe(req, tmp, src, wb_list) { + if (!nfs_lock_request(req)) + continue; + kref_get(&req->wb_kref); + if (cond_resched_lock(cinfo->lock)) + list_safe_reset_next(req, tmp, wb_list); + nfs_request_remove_commit_list(req, cinfo); + nfs_list_add_request(req, dst); + ret++; + if ((ret == max) && !cinfo->dreq) + break; + } + return ret; +} + +/* + * nfs_scan_commit - Scan an inode for commit requests + * @inode: NFS inode to scan + * @dst: mds destination list + * @cinfo: mds and ds lists of reqs ready to commit + * + * Moves requests from the inode's 'commit' request list. + * The requests are *not* checked to ensure that they form a contiguous set. + */ +int +nfs_scan_commit(struct inode *inode, struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + int ret = 0; + + spin_lock(cinfo->lock); + if (cinfo->mds->ncommit > 0) { + const int max = INT_MAX; + + ret = nfs_scan_commit_list(&cinfo->mds->list, dst, + cinfo, max); + ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); + } + spin_unlock(cinfo->lock); + return ret; +} + +/* + * Search for an existing write request, and attempt to update + * it to reflect a new dirty region on a given page. + * + * If the attempt fails, then the existing request is flushed out + * to disk. + */ +static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, + unsigned int bytes) +{ + struct nfs_page *req; + unsigned int rqend; + unsigned int end; + int error; + + if (!PagePrivate(page)) + return NULL; + + end = offset + bytes; + spin_lock(&inode->i_lock); + + for (;;) { + req = nfs_page_find_head_request_locked(NFS_I(inode), page); + if (req == NULL) + goto out_unlock; + + /* should be handled by nfs_flush_incompatible */ + WARN_ON_ONCE(req->wb_head != req); + WARN_ON_ONCE(req->wb_this_page != req); + + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (offset > rqend + || end < req->wb_offset) + goto out_flushme; + + if (nfs_lock_request(req)) + break; + + /* The request is locked, so wait and then retry */ + spin_unlock(&inode->i_lock); + error = nfs_wait_on_request(req); + nfs_release_request(req); + if (error != 0) + goto out_err; + spin_lock(&inode->i_lock); + } + + /* Okay, the request matches. Update the region */ + if (offset < req->wb_offset) { + req->wb_offset = offset; + req->wb_pgbase = offset; + } + if (end > rqend) + req->wb_bytes = end - req->wb_offset; + else + req->wb_bytes = rqend - req->wb_offset; +out_unlock: + if (req) + nfs_clear_request_commit(req); + spin_unlock(&inode->i_lock); + return req; +out_flushme: + spin_unlock(&inode->i_lock); + nfs_release_request(req); + error = nfs_wb_page(inode, page); +out_err: + return ERR_PTR(error); +} + +/* + * Try to update an existing write request, or create one if there is none. + * + * Note: Should always be called with the Page Lock held to prevent races + * if we have to add a new request. Also assumes that the caller has + * already called nfs_flush_incompatible() if necessary. + */ +static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, + struct page *page, unsigned int offset, unsigned int bytes) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *req; + + req = nfs_try_to_update_request(inode, page, offset, bytes); + if (req != NULL) + goto out; + req = nfs_create_request(ctx, page, NULL, offset, bytes); + if (IS_ERR(req)) + goto out; + nfs_inode_add_request(inode, req); +out: + return req; +} + +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + req = nfs_setup_write_request(ctx, page, offset, count); + if (IS_ERR(req)) + return PTR_ERR(req); + /* Update file length */ + nfs_grow_file(page, offset, count); + nfs_mark_uptodate(req); + nfs_mark_request_dirty(req); + nfs_unlock_and_release_request(req); + return 0; +} + +int nfs_flush_incompatible(struct file *file, struct page *page) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_lock_context *l_ctx; + struct file_lock_context *flctx = file_inode(file)->i_flctx; + struct nfs_page *req; + int do_flush, status; + /* + * Look for a request corresponding to this page. If there + * is one, and it belongs to another file, we flush it out + * before we try to copy anything into the page. Do this + * due to the lack of an ACCESS-type call in NFSv2. + * Also do the same if we find a request from an existing + * dropped page. + */ + do { + req = nfs_page_find_head_request(page); + if (req == NULL) + return 0; + l_ctx = req->wb_lock_context; + do_flush = req->wb_page != page || req->wb_context != ctx; + /* for now, flush if more than 1 request in page_group */ + do_flush |= req->wb_this_page != req; + if (l_ctx && flctx && + !(list_empty_careful(&flctx->flc_posix) && + list_empty_careful(&flctx->flc_flock))) { + do_flush |= l_ctx->lockowner.l_owner != current->files + || l_ctx->lockowner.l_pid != current->tgid; + } + nfs_release_request(req); + if (!do_flush) + return 0; + status = nfs_wb_page(page_file_mapping(page)->host, page); + } while (status == 0); + return status; +} + +/* + * Avoid buffered writes when a open context credential's key would + * expire soon. + * + * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL. + * + * Return 0 and set a credential flag which triggers the inode to flush + * and performs NFS_FILE_SYNC writes if the key will expired within + * RPC_KEY_EXPIRE_TIMEO. + */ +int +nfs_key_timeout_notify(struct file *filp, struct inode *inode) +{ + struct nfs_open_context *ctx = nfs_file_open_context(filp); + struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + + return rpcauth_key_timeout_notify(auth, ctx->cred); +} + +/* + * Test if the open context credential key is marked to expire soon. + */ +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) +{ + return rpcauth_cred_key_to_expire(ctx->cred); +} + +/* + * If the page cache is marked as unsafe or invalid, then we can't rely on + * the PageUptodate() flag. In this case, we will need to turn off + * write optimisations that depend on the page contents being correct. + */ +static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfs_have_delegated_attributes(inode)) + goto out; + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + return false; + smp_rmb(); + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) + return false; +out: + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return false; + return PageUptodate(page) != 0; +} + +static bool +is_whole_file_wrlock(struct file_lock *fl) +{ + return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX && + fl->fl_type == F_WRLCK; +} + +/* If we know the page is up to date, and we're not using byte range locks (or + * if we have the whole file locked for writing), it may be more efficient to + * extend the write to cover the entire page in order to avoid fragmentation + * inefficiencies. + * + * If the file is opened for synchronous writes then we can just skip the rest + * of the checks. + */ +static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) +{ + int ret; + struct file_lock_context *flctx = inode->i_flctx; + struct file_lock *fl; + + if (file->f_flags & O_DSYNC) + return 0; + if (!nfs_write_pageuptodate(page, inode)) + return 0; + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + return 1; + if (!flctx || (list_empty_careful(&flctx->flc_flock) && + list_empty_careful(&flctx->flc_posix))) + return 0; + + /* Check to see if there are whole file write locks */ + ret = 0; + spin_lock(&flctx->flc_lock); + if (!list_empty(&flctx->flc_posix)) { + fl = list_first_entry(&flctx->flc_posix, struct file_lock, + fl_list); + if (is_whole_file_wrlock(fl)) + ret = 1; + } else if (!list_empty(&flctx->flc_flock)) { + fl = list_first_entry(&flctx->flc_flock, struct file_lock, + fl_list); + if (fl->fl_type == F_WRLCK) + ret = 1; + } + spin_unlock(&flctx->flc_lock); + return ret; +} + +/* + * Update and possibly write a cached page of an NFS file. + * + * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ +int nfs_updatepage(struct file *file, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page_file_mapping(page)->host; + int status = 0; + + nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + + dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", + file, count, (long long)(page_file_offset(page) + offset)); + + if (nfs_can_extend_write(file, page, inode)) { + count = max(count + offset, nfs_page_length(page)); + offset = 0; + } + + status = nfs_writepage_setup(ctx, page, offset, count); + if (status < 0) + nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); + + dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", + status, (long long)i_size_read(inode)); + return status; +} + +static int flush_task_priority(int how) +{ + switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { + case FLUSH_HIGHPRI: + return RPC_PRIORITY_HIGH; + case FLUSH_LOWPRI: + return RPC_PRIORITY_LOW; + } + return RPC_PRIORITY_NORMAL; +} + +static void nfs_initiate_write(struct nfs_pgio_header *hdr, + struct rpc_message *msg, + const struct nfs_rpc_ops *rpc_ops, + struct rpc_task_setup *task_setup_data, int how) +{ + int priority = flush_task_priority(how); + + task_setup_data->priority = priority; + rpc_ops->write_setup(hdr, msg); + + nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client, + &task_setup_data->rpc_client, msg, hdr); +} + +/* If a nfs_flush_* function fails, it should remove reqs from @head and + * call this on each, which will prepare them to be retried on next + * writeback using standard nfs. + */ +static void nfs_redirty_request(struct nfs_page *req) +{ + nfs_mark_request_dirty(req); + nfs_unlock_request(req); + nfs_end_page_writeback(req); + nfs_release_request(req); +} + +static void nfs_async_write_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_redirty_request(req); + } +} + +static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { + .error_cleanup = nfs_async_write_error, + .completion = nfs_write_completion, +}; + +void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags, bool force_mds, + const struct nfs_pgio_completion_ops *compl_ops) +{ + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_write_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, + server->wsize, ioflags); +} +EXPORT_SYMBOL_GPL(nfs_pageio_init_write); + +void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) +{ + struct nfs_pgio_mirror *mirror; + + pgio->pg_ops = &nfs_pgio_rw_ops; + + nfs_pageio_stop_mirroring(pgio); + + mirror = &pgio->pg_mirrors[0]; + mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); + + +void nfs_commit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_commit_data *data = calldata; + + NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); +} + +static void nfs_writeback_release_common(struct nfs_pgio_header *hdr) +{ + /* do nothing! */ +} + +/* + * Special version of should_remove_suid() that ignores capabilities. + */ +static int nfs_should_remove_suid(const struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && S_ISREG(mode))) + return kill; + + return 0; +} + +static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, + struct nfs_fattr *fattr) +{ + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; + + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return; + if (argp->offset + resp->count != fattr->size) + return; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + return; + /* Set attribute barrier */ + nfs_fattr_set_barrier(fattr); +} + +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) +{ + struct nfs_fattr *fattr = hdr->res.fattr; + struct inode *inode = hdr->inode; + + if (fattr == NULL) + return; + spin_lock(&inode->i_lock); + nfs_writeback_check_extend(hdr, fattr); + nfs_post_op_update_inode_force_wcc_locked(inode, fattr); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_writeback_update_inode); + +/* + * This function is called when the WRITE call is complete. + */ +static int nfs_writeback_done(struct rpc_task *task, + struct nfs_pgio_header *hdr, + struct inode *inode) +{ + int status; + + /* + * ->write_done will attempt to use post-op attributes to detect + * conflicting writes by other clients. A strict interpretation + * of close-to-open would allow us to continue caching even if + * another writer had changed the file, but some applications + * depend on tighter cache coherency when writing. + */ + status = NFS_PROTO(inode)->write_done(task, hdr); + if (status != 0) + return status; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); + + if (hdr->res.verf->committed < hdr->args.stable && + task->tk_status >= 0) { + /* We tried a write call, but the server did not + * commit data to stable storage even though we + * requested it. + * Note: There is a known bug in Tru64 < 5.0 in which + * the server reports NFS_DATA_SYNC, but performs + * NFS_FILE_SYNC. We therefore implement this checking + * as a dprintk() in order to avoid filling syslog. + */ + static unsigned long complain; + + /* Note this will print the MDS for a DS write */ + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, + hdr->res.verf->committed, hdr->args.stable); + complain = jiffies + 300 * HZ; + } + } + + /* Deal with the suid/sgid bit corner case */ + if (nfs_should_remove_suid(inode)) + nfs_mark_for_revalidate(inode); + return 0; +} + +/* + * This function is called when the WRITE call is complete. + */ +static void nfs_writeback_result(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; + + if (resp->count < argp->count) { + static unsigned long complain; + + /* This a short write! */ + nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ + if (resp->count == 0) { + if (time_before(complain, jiffies)) { + printk(KERN_WARNING + "NFS: Server wrote zero bytes, expected %u.\n", + argp->count); + complain = jiffies + 300 * HZ; + } + nfs_set_pgio_error(hdr, -EIO, argp->offset); + task->tk_status = -EIO; + return; + } + /* Was this an NFSv2 write or an NFSv3 stable write? */ + if (resp->verf->committed != NFS_UNSTABLE) { + /* Resend from where the server left off */ + hdr->mds_offset += resp->count; + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + } else { + /* Resend as a stable write in order to avoid + * headaches in the case of a server crash. + */ + argp->stable = NFS_FILE_SYNC; + } + rpc_restart_call_prepare(task); + } +} + + +static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +{ + int ret; + + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) + return 1; + if (!may_wait) + return 0; + ret = out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + return (ret < 0) ? ret : 1; +} + +static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +{ + clear_bit(NFS_INO_COMMIT, &nfsi->flags); + smp_mb__after_atomic(); + wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); +} + +void nfs_commitdata_release(struct nfs_commit_data *data) +{ + put_nfs_open_context(data->context); + nfs_commit_free(data); +} +EXPORT_SYMBOL_GPL(nfs_commitdata_release); + +int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, + const struct nfs_rpc_ops *nfs_ops, + const struct rpc_call_ops *call_ops, + int how, int flags) +{ + struct rpc_task *task; + int priority = flush_task_priority(how); + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | flags, + .priority = priority, + }; + /* Set up the initial task struct. */ + nfs_ops->commit_setup(data, &msg); + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, + NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_commit); + +static loff_t nfs_get_lwb(struct list_head *head) +{ + loff_t lwb = 0; + struct nfs_page *req; + + list_for_each_entry(req, head, wb_list) + if (lwb < (req_offset(req) + req->wb_bytes)) + lwb = req_offset(req) + req->wb_bytes; + + return lwb; +} + +/* + * Set up the argument/result storage required for the RPC call. + */ +void nfs_init_commit(struct nfs_commit_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ + struct nfs_page *first = nfs_list_entry(head->next); + struct inode *inode = d_inode(first->wb_context->dentry); + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; + data->cred = first->wb_context->cred; + data->lseg = lseg; /* reference transferred */ + /* only set lwb for pnfs commit */ + if (lseg) + data->lwb = nfs_get_lwb(&data->pages); + data->mds_ops = &nfs_commit_ops; + data->completion_ops = cinfo->completion_ops; + data->dreq = cinfo->dreq; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ + data->args.offset = 0; + data->args.count = 0; + data->context = get_nfs_open_context(first->wb_context); + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); +} +EXPORT_SYMBOL_GPL(nfs_init_commit); + +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx) +{ + struct nfs_page *req; + + while (!list_empty(page_list)) { + req = nfs_list_entry(page_list->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx); + if (!cinfo->dreq) + nfs_clear_page_commit(req->wb_page); + nfs_unlock_and_release_request(req); + } +} +EXPORT_SYMBOL_GPL(nfs_retry_commit); + +/* + * Commit dirty pages + */ +static int +nfs_commit_list(struct inode *inode, struct list_head *head, int how, + struct nfs_commit_info *cinfo) +{ + struct nfs_commit_data *data; + + data = nfs_commitdata_alloc(); + + if (!data) + goto out_bad; + + /* Set up the argument struct */ + nfs_init_commit(data, head, NULL, cinfo); + atomic_inc(&cinfo->mds->rpcs_out); + return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), + data->mds_ops, how, 0); + out_bad: + nfs_retry_commit(head, NULL, cinfo, 0); + cinfo->completion_ops->error_cleanup(NFS_I(inode)); + return -ENOMEM; +} + +/* + * COMMIT call returned + */ +static void nfs_commit_done(struct rpc_task *task, void *calldata) +{ + struct nfs_commit_data *data = calldata; + + dprintk("NFS: %5u nfs_commit_done (status %d)\n", + task->tk_pid, task->tk_status); + + /* Call the NFS version-specific code */ + NFS_PROTO(data->inode)->commit_done(task, data); +} + +static void nfs_commit_release_pages(struct nfs_commit_data *data) +{ + struct nfs_page *req; + int status = data->task.tk_status; + struct nfs_commit_info cinfo; + struct nfs_server *nfss; + + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + nfs_clear_page_commit(req->wb_page); + + dprintk("NFS: commit (%s/%llu %d@%lld)", + req->wb_context->dentry->d_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); + goto next; + } + + /* Okay, COMMIT succeeded, apparently. Check the verifier + * returned by the server against all stored verfs. */ + if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { + /* We have a match */ + nfs_inode_remove_request(req); + dprintk(" OK\n"); + goto next; + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); + nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); + next: + nfs_unlock_and_release_request(req); + } + nfss = NFS_SERVER(data->inode); + if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + + nfs_init_cinfo(&cinfo, data->inode, data->dreq); + if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) + nfs_commit_clear_lock(NFS_I(data->inode)); +} + +static void nfs_commit_release(void *calldata) +{ + struct nfs_commit_data *data = calldata; + + data->completion_ops->completion(data); + nfs_commitdata_release(calldata); +} + +static const struct rpc_call_ops nfs_commit_ops = { + .rpc_call_prepare = nfs_commit_prepare, + .rpc_call_done = nfs_commit_done, + .rpc_release = nfs_commit_release, +}; + +static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { + .completion = nfs_commit_release_pages, + .error_cleanup = nfs_commit_clear_lock, +}; + +int nfs_generic_commit_list(struct inode *inode, struct list_head *head, + int how, struct nfs_commit_info *cinfo) +{ + int status; + + status = pnfs_commit_list(inode, head, how, cinfo); + if (status == PNFS_NOT_ATTEMPTED) + status = nfs_commit_list(inode, head, how, cinfo); + return status; +} + +int nfs_commit_inode(struct inode *inode, int how) +{ + LIST_HEAD(head); + struct nfs_commit_info cinfo; + int may_wait = how & FLUSH_SYNC; + int res; + + res = nfs_commit_set_lock(NFS_I(inode), may_wait); + if (res <= 0) + goto out_mark_dirty; + nfs_init_cinfo_from_inode(&cinfo, inode); + res = nfs_scan_commit(inode, &head, &cinfo); + if (res) { + int error; + + error = nfs_generic_commit_list(inode, &head, how, &cinfo); + if (error < 0) + return error; + if (!may_wait) + goto out_mark_dirty; + error = wait_on_bit_action(&NFS_I(inode)->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (error < 0) + return error; + } else + nfs_commit_clear_lock(NFS_I(inode)); + return res; + /* Note: If we exit without ensuring that the commit is complete, + * we must mark the inode as dirty. Otherwise, future calls to + * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure + * that the data is on the disk. + */ +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return res; +} + +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int flags = FLUSH_SYNC; + int ret = 0; + + /* no commits means nothing needs to be done */ + if (!nfsi->commit_info.ncommit) + return ret; + + if (wbc->sync_mode == WB_SYNC_NONE) { + /* Don't commit yet if this is a non-blocking flush and there + * are a lot of outstanding writes for this mapping. + */ + if (nfsi->commit_info.ncommit <= (nfsi->nrequests >> 1)) + goto out_mark_dirty; + + /* don't wait for the COMMIT response */ + flags = 0; + } + + ret = nfs_commit_inode(inode, flags); + if (ret >= 0) { + if (wbc->sync_mode == WB_SYNC_NONE) { + if (ret < wbc->nr_to_write) + wbc->nr_to_write -= ret; + else + wbc->nr_to_write = 0; + } + return 0; + } +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; +} + +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return nfs_commit_unstable_pages(inode, wbc); +} +EXPORT_SYMBOL_GPL(nfs_write_inode); + +/* + * flush the inode to disk. + */ +int nfs_wb_all(struct inode *inode) +{ + int ret; + + trace_nfs_writeback_inode_enter(inode); + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + goto out; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out; + pnfs_sync_inode(inode, true); + ret = 0; + +out: + trace_nfs_writeback_inode_exit(inode, ret); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_wb_all); + +int nfs_wb_page_cancel(struct inode *inode, struct page *page) +{ + struct nfs_page *req; + int ret = 0; + + wait_on_page_writeback(page); + + /* blocking call to cancel all requests and join to a single (head) + * request */ + req = nfs_lock_and_join_requests(page, false); + + if (IS_ERR(req)) { + ret = PTR_ERR(req); + } else if (req) { + /* all requests from this page have been cancelled by + * nfs_lock_and_join_requests, so just remove the head + * request from the inode / page_private pointer and + * release it */ + nfs_inode_remove_request(req); + nfs_unlock_and_release_request(req); + } + + return ret; +} + +/* + * Write back all requests on one page - we do this before reading it. + */ +int nfs_wb_page(struct inode *inode, struct page *page) +{ + loff_t range_start = page_file_offset(page); + loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_end, + }; + int ret; + + trace_nfs_writeback_page_enter(inode); + + for (;;) { + wait_on_page_writeback(page); + if (clear_page_dirty_for_io(page)) { + ret = nfs_writepage_locked(page, &wbc); + if (ret < 0) + goto out_error; + continue; + } + ret = 0; + if (!PagePrivate(page)) + break; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out_error; + } +out_error: + trace_nfs_writeback_page_exit(inode, ret); + return ret; +} + +#ifdef CONFIG_MIGRATION +int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. + */ + if (PagePrivate(page)) + return -EBUSY; + + if (!nfs_fscache_release_page(page, GFP_KERNEL)) + return -EBUSY; + + return migrate_page(mapping, newpage, page, mode); +} +#endif + +int __init nfs_init_writepagecache(void) +{ + nfs_wdata_cachep = kmem_cache_create("nfs_write_data", + sizeof(struct nfs_pgio_header), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_wdata_cachep == NULL) + return -ENOMEM; + + nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, + nfs_wdata_cachep); + if (nfs_wdata_mempool == NULL) + goto out_destroy_write_cache; + + nfs_cdata_cachep = kmem_cache_create("nfs_commit_data", + sizeof(struct nfs_commit_data), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_cdata_cachep == NULL) + goto out_destroy_write_mempool; + + nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, + nfs_cdata_cachep); + if (nfs_commit_mempool == NULL) + goto out_destroy_commit_cache; + + /* + * NFS congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (nfs_congestion_kb > 256*1024) + nfs_congestion_kb = 256*1024; + + return 0; + +out_destroy_commit_cache: + kmem_cache_destroy(nfs_cdata_cachep); +out_destroy_write_mempool: + mempool_destroy(nfs_wdata_mempool); +out_destroy_write_cache: + kmem_cache_destroy(nfs_wdata_cachep); + return -ENOMEM; +} + +void nfs_destroy_writepagecache(void) +{ + mempool_destroy(nfs_commit_mempool); + kmem_cache_destroy(nfs_cdata_cachep); + mempool_destroy(nfs_wdata_mempool); + kmem_cache_destroy(nfs_wdata_cachep); +} + +static const struct nfs_rw_ops nfs_rw_write_ops = { + .rw_mode = FMODE_WRITE, + .rw_alloc_header = nfs_writehdr_alloc, + .rw_free_header = nfs_writehdr_free, + .rw_release = nfs_writeback_release_common, + .rw_done = nfs_writeback_done, + .rw_result = nfs_writeback_result, + .rw_initiate = nfs_initiate_write, +}; diff --git a/kernel/fs/nfs_common/Makefile b/kernel/fs/nfs_common/Makefile new file mode 100644 index 000000000..d153ca3ea --- /dev/null +++ b/kernel/fs/nfs_common/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for Linux filesystem routines that are shared by client and server. +# + +obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o +nfs_acl-objs := nfsacl.o + +obj-$(CONFIG_GRACE_PERIOD) += grace.o diff --git a/kernel/fs/nfs_common/grace.c b/kernel/fs/nfs_common/grace.c new file mode 100644 index 000000000..ae6e58ea4 --- /dev/null +++ b/kernel/fs/nfs_common/grace.c @@ -0,0 +1,113 @@ +/* + * Common code for control of lockd and nfsv4 grace periods. + * + * Transplanted from lockd code + */ + +#include +#include +#include +#include + +static int grace_net_id; +static DEFINE_SPINLOCK(grace_lock); + +/** + * locks_start_grace + * @net: net namespace that this lock manager belongs to + * @lm: who this grace period is for + * + * A grace period is a period during which locks should not be given + * out. Currently grace periods are only enforced by the two lock + * managers (lockd and nfsd), using the locks_in_grace() function to + * check when they are in a grace period. + * + * This function is called to start a grace period. + */ +void +locks_start_grace(struct net *net, struct lock_manager *lm) +{ + struct list_head *grace_list = net_generic(net, grace_net_id); + + spin_lock(&grace_lock); + list_add(&lm->list, grace_list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_start_grace); + +/** + * locks_end_grace + * @net: net namespace that this lock manager belongs to + * @lm: who this grace period is for + * + * Call this function to state that the given lock manager is ready to + * resume regular locking. The grace period will not end until all lock + * managers that called locks_start_grace() also call locks_end_grace(). + * Note that callers count on it being safe to call this more than once, + * and the second call should be a no-op. + */ +void +locks_end_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_del_init(&lm->list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_end_grace); + +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +int +locks_in_grace(struct net *net) +{ + struct list_head *grace_list = net_generic(net, grace_net_id); + + return !list_empty(grace_list); +} +EXPORT_SYMBOL_GPL(locks_in_grace); + +static int __net_init +grace_init_net(struct net *net) +{ + struct list_head *grace_list = net_generic(net, grace_net_id); + + INIT_LIST_HEAD(grace_list); + return 0; +} + +static void __net_exit +grace_exit_net(struct net *net) +{ + struct list_head *grace_list = net_generic(net, grace_net_id); + + BUG_ON(!list_empty(grace_list)); +} + +static struct pernet_operations grace_net_ops = { + .init = grace_init_net, + .exit = grace_exit_net, + .id = &grace_net_id, + .size = sizeof(struct list_head), +}; + +static int __init +init_grace(void) +{ + return register_pernet_subsys(&grace_net_ops); +} + +static void __exit +exit_grace(void) +{ + unregister_pernet_subsys(&grace_net_ops); +} + +MODULE_AUTHOR("Jeff Layton "); +MODULE_LICENSE("GPL"); +module_init(init_grace) +module_exit(exit_grace) diff --git a/kernel/fs/nfs_common/nfsacl.c b/kernel/fs/nfs_common/nfsacl.c new file mode 100644 index 000000000..538f14293 --- /dev/null +++ b/kernel/fs/nfs_common/nfsacl.c @@ -0,0 +1,296 @@ +/* + * fs/nfs_common/nfsacl.c + * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +/* + * The Solaris nfsacl protocol represents some ACLs slightly differently + * than POSIX 1003.1e draft 17 does (and we do): + * + * - Minimal ACLs always have an ACL_MASK entry, so they have + * four instead of three entries. + * - The ACL_MASK entry in such minimal ACLs always has the same + * permissions as the ACL_GROUP_OBJ entry. (In extended ACLs + * the ACL_MASK and ACL_GROUP_OBJ entries may differ.) + * - The identifier fields of the ACL_USER_OBJ and ACL_GROUP_OBJ + * entries contain the identifiers of the owner and owning group. + * (In POSIX ACLs we always set them to ACL_UNDEFINED_ID). + * - ACL entries in the kernel are kept sorted in ascending order + * of (e_tag, e_id). Solaris ACLs are unsorted. + */ + +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +struct nfsacl_encode_desc { + struct xdr_array2_desc desc; + unsigned int count; + struct posix_acl *acl; + int typeflag; + kuid_t uid; + kgid_t gid; +}; + +struct nfsacl_simple_acl { + struct posix_acl acl; + struct posix_acl_entry ace[4]; +}; + +static int +xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) +{ + struct nfsacl_encode_desc *nfsacl_desc = + (struct nfsacl_encode_desc *) desc; + __be32 *p = elem; + + struct posix_acl_entry *entry = + &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; + + *p++ = htonl(entry->e_tag | nfsacl_desc->typeflag); + switch(entry->e_tag) { + case ACL_USER_OBJ: + *p++ = htonl(from_kuid(&init_user_ns, nfsacl_desc->uid)); + break; + case ACL_GROUP_OBJ: + *p++ = htonl(from_kgid(&init_user_ns, nfsacl_desc->gid)); + break; + case ACL_USER: + *p++ = htonl(from_kuid(&init_user_ns, entry->e_uid)); + break; + case ACL_GROUP: + *p++ = htonl(from_kgid(&init_user_ns, entry->e_gid)); + break; + default: /* Solaris depends on that! */ + *p++ = 0; + break; + } + *p++ = htonl(entry->e_perm & S_IRWXO); + return 0; +} + +/** + * nfsacl_encode - Encode an NFSv3 ACL + * + * @buf: destination xdr_buf to contain XDR encoded ACL + * @base: byte offset in xdr_buf where XDR'd ACL begins + * @inode: inode of file whose ACL this is + * @acl: posix_acl to encode + * @encode_entries: whether to encode ACEs as well + * @typeflag: ACL type: NFS_ACL_DEFAULT or zero + * + * Returns size of encoded ACL in bytes or a negative errno value. + */ +int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, + struct posix_acl *acl, int encode_entries, int typeflag) +{ + int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; + struct nfsacl_encode_desc nfsacl_desc = { + .desc = { + .elem_size = 12, + .array_len = encode_entries ? entries : 0, + .xcode = xdr_nfsace_encode, + }, + .acl = acl, + .typeflag = typeflag, + .uid = inode->i_uid, + .gid = inode->i_gid, + }; + struct nfsacl_simple_acl aclbuf; + int err; + + if (entries > NFS_ACL_MAX_ENTRIES || + xdr_encode_word(buf, base, entries)) + return -EINVAL; + if (encode_entries && acl && acl->a_count == 3) { + struct posix_acl *acl2 = &aclbuf.acl; + + /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is + * invoked in contexts where a memory allocation failure is + * fatal. Fortunately this fake ACL is small enough to + * construct on the stack. */ + posix_acl_init(acl2, 4); + + /* Insert entries in canonical order: other orders seem + to confuse Solaris VxFS. */ + acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ + acl2->a_entries[1] = acl->a_entries[1]; /* ACL_GROUP_OBJ */ + acl2->a_entries[2] = acl->a_entries[1]; /* ACL_MASK */ + acl2->a_entries[2].e_tag = ACL_MASK; + acl2->a_entries[3] = acl->a_entries[2]; /* ACL_OTHER */ + nfsacl_desc.acl = acl2; + } + err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); + if (!err) + err = 8 + nfsacl_desc.desc.elem_size * + nfsacl_desc.desc.array_len; + return err; +} +EXPORT_SYMBOL_GPL(nfsacl_encode); + +struct nfsacl_decode_desc { + struct xdr_array2_desc desc; + unsigned int count; + struct posix_acl *acl; +}; + +static int +xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem) +{ + struct nfsacl_decode_desc *nfsacl_desc = + (struct nfsacl_decode_desc *) desc; + __be32 *p = elem; + struct posix_acl_entry *entry; + unsigned int id; + + if (!nfsacl_desc->acl) { + if (desc->array_len > NFS_ACL_MAX_ENTRIES) + return -EINVAL; + nfsacl_desc->acl = posix_acl_alloc(desc->array_len, GFP_KERNEL); + if (!nfsacl_desc->acl) + return -ENOMEM; + nfsacl_desc->count = 0; + } + + entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; + entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT; + id = ntohl(*p++); + entry->e_perm = ntohl(*p++); + + switch(entry->e_tag) { + case ACL_USER: + entry->e_uid = make_kuid(&init_user_ns, id); + if (!uid_valid(entry->e_uid)) + return -EINVAL; + break; + case ACL_GROUP: + entry->e_gid = make_kgid(&init_user_ns, id); + if (!gid_valid(entry->e_gid)) + return -EINVAL; + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_OTHER: + if (entry->e_perm & ~S_IRWXO) + return -EINVAL; + break; + case ACL_MASK: + /* Solaris sometimes sets additional bits in the mask */ + entry->e_perm &= S_IRWXO; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +cmp_acl_entry(const void *x, const void *y) +{ + const struct posix_acl_entry *a = x, *b = y; + + if (a->e_tag != b->e_tag) + return a->e_tag - b->e_tag; + else if ((a->e_tag == ACL_USER) && uid_gt(a->e_uid, b->e_uid)) + return 1; + else if ((a->e_tag == ACL_USER) && uid_lt(a->e_uid, b->e_uid)) + return -1; + else if ((a->e_tag == ACL_GROUP) && gid_gt(a->e_gid, b->e_gid)) + return 1; + else if ((a->e_tag == ACL_GROUP) && gid_lt(a->e_gid, b->e_gid)) + return -1; + else + return 0; +} + +/* + * Convert from a Solaris ACL to a POSIX 1003.1e draft 17 ACL. + */ +static int +posix_acl_from_nfsacl(struct posix_acl *acl) +{ + struct posix_acl_entry *pa, *pe, + *group_obj = NULL, *mask = NULL; + + if (!acl) + return 0; + + sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry), + cmp_acl_entry, NULL); + + /* Find the ACL_GROUP_OBJ and ACL_MASK entries. */ + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + break; + case ACL_GROUP_OBJ: + group_obj = pa; + break; + case ACL_MASK: + mask = pa; + /* fall through */ + case ACL_OTHER: + break; + } + } + if (acl->a_count == 4 && group_obj && mask && + mask->e_perm == group_obj->e_perm) { + /* remove bogus ACL_MASK entry */ + memmove(mask, mask+1, (3 - (mask - acl->a_entries)) * + sizeof(struct posix_acl_entry)); + acl->a_count = 3; + } + return 0; +} + +/** + * nfsacl_decode - Decode an NFSv3 ACL + * + * @buf: xdr_buf containing XDR'd ACL data to decode + * @base: byte offset in xdr_buf where XDR'd ACL begins + * @aclcnt: count of ACEs in decoded posix_acl + * @pacl: buffer in which to place decoded posix_acl + * + * Returns the length of the decoded ACL in bytes, or a negative errno value. + */ +int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, + struct posix_acl **pacl) +{ + struct nfsacl_decode_desc nfsacl_desc = { + .desc = { + .elem_size = 12, + .xcode = pacl ? xdr_nfsace_decode : NULL, + }, + }; + u32 entries; + int err; + + if (xdr_decode_word(buf, base, &entries) || + entries > NFS_ACL_MAX_ENTRIES) + return -EINVAL; + nfsacl_desc.desc.array_maxlen = entries; + err = xdr_decode_array2(buf, base + 4, &nfsacl_desc.desc); + if (err) + return err; + if (pacl) { + if (entries != nfsacl_desc.desc.array_len || + posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) { + posix_acl_release(nfsacl_desc.acl); + return -EINVAL; + } + *pacl = nfsacl_desc.acl; + } + if (aclcnt) + *aclcnt = entries; + return 8 + nfsacl_desc.desc.elem_size * + nfsacl_desc.desc.array_len; +} +EXPORT_SYMBOL_GPL(nfsacl_decode); diff --git a/kernel/fs/nfsd/Kconfig b/kernel/fs/nfsd/Kconfig new file mode 100644 index 000000000..a0b77fc1b --- /dev/null +++ b/kernel/fs/nfsd/Kconfig @@ -0,0 +1,117 @@ +config NFSD + tristate "NFS server support" + depends on INET + depends on FILE_LOCKING + select LOCKD + select SUNRPC + select EXPORTFS + select NFS_ACL_SUPPORT if NFSD_V2_ACL + depends on MULTIUSER + help + Choose Y here if you want to allow other computers to access + files residing on this system using Sun's Network File System + protocol. To compile the NFS server support as a module, + choose M here: the module will be called nfsd. + + You may choose to use a user-space NFS server instead, in which + case you can choose N here. + + To export local file systems using NFS, you also need to install + user space programs which can be found in the Linux nfs-utils + package, available from http://linux-nfs.org/. More detail about + the Linux NFS server implementation is available via the + exports(5) man page. + + Below you can choose which versions of the NFS protocol are + available to clients mounting the NFS server on this system. + Support for NFS version 2 (RFC 1094) is always available when + CONFIG_NFSD is selected. + + If unsure, say N. + +config NFSD_V2_ACL + bool + depends on NFSD + +config NFSD_V3 + bool "NFS server support for NFS version 3" + depends on NFSD + help + This option enables support in your system's NFS server for + version 3 of the NFS protocol (RFC 1813). + + If unsure, say Y. + +config NFSD_V3_ACL + bool "NFS server support for the NFSv3 ACL protocol extension" + depends on NFSD_V3 + select NFSD_V2_ACL + help + Solaris NFS servers support an auxiliary NFSv3 ACL protocol that + never became an official part of the NFS version 3 protocol. + This protocol extension allows applications on NFS clients to + manipulate POSIX Access Control Lists on files residing on NFS + servers. NFS servers enforce POSIX ACLs on local files whether + this protocol is available or not. + + This option enables support in your system's NFS server for the + NFSv3 ACL protocol extension allowing NFS clients to manipulate + POSIX ACLs on files exported by your system's NFS server. NFS + clients which support the Solaris NFSv3 ACL protocol can then + access and modify ACLs on your NFS server. + + To store ACLs on your NFS server, you also need to enable ACL- + related CONFIG options for your local file systems of choice. + + If unsure, say N. + +config NFSD_V4 + bool "NFS server support for NFS version 4" + depends on NFSD && PROC_FS + select NFSD_V3 + select FS_POSIX_ACL + select SUNRPC_GSS + select CRYPTO + select GRACE_PERIOD + help + This option enables support in your system's NFS server for + version 4 of the NFS protocol (RFC 3530). + + To export files using NFSv4, you need to install additional user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say N. + +config NFSD_PNFS + bool "NFSv4.1 server support for Parallel NFS (pNFS)" + depends on NFSD_V4 + help + This option enables support for the parallel NFS features of the + minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS + server. + + If unsure, say N. + +config NFSD_V4_SECURITY_LABEL + bool "Provide Security Label support for NFSv4 server" + depends on NFSD_V4 && SECURITY + help + + Say Y here if you want enable fine-grained security label attribute + support for NFS version 4. Security labels allow security modules like + SELinux and Smack to label files to facilitate enforcement of their policies. + Without this an NFSv4 mount will have the same label on each file. + + If you do not wish to enable fine-grained security labels SELinux or + Smack policies on NFSv4 files, say N. + +config NFSD_FAULT_INJECTION + bool "NFS server manual fault injection" + depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS + help + This option enables support for manually injecting faults + into the NFS server. This is intended to be used for + testing error recovery on the NFS client. + + If unsure, say N. diff --git a/kernel/fs/nfsd/Makefile b/kernel/fs/nfsd/Makefile new file mode 100644 index 000000000..9a6028e12 --- /dev/null +++ b/kernel/fs/nfsd/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for the Linux nfs server +# + +ccflags-y += -I$(src) # needed for trace events + +obj-$(CONFIG_NFSD) += nfsd.o + +# this one should be compiled first, as the tracing macros can easily blow up +nfsd-y += trace.o + +nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o +nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o +nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o +nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o +nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o +nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o diff --git a/kernel/fs/nfsd/acl.h b/kernel/fs/nfsd/acl.h new file mode 100644 index 000000000..4cd7c69a6 --- /dev/null +++ b/kernel/fs/nfsd/acl.h @@ -0,0 +1,59 @@ +/* + * Common NFSv4 ACL handling definitions. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFS4_ACL_H +#define LINUX_NFS4_ACL_H + +struct nfs4_acl; +struct svc_fh; +struct svc_rqst; + +/* + * Maximum ACL we'll accept from a client; chosen (somewhat + * arbitrarily) so that kmalloc'ing the ACL shouldn't require a + * high-order allocation. This allows 204 ACEs on x86_64: + */ +#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ + / sizeof(struct nfs4_ace)) + +int nfs4_acl_bytes(int entries); +int nfs4_acl_get_whotype(char *, u32); +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); + +int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl); +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl); + +#endif /* LINUX_NFS4_ACL_H */ diff --git a/kernel/fs/nfsd/auth.c b/kernel/fs/nfsd/auth.c new file mode 100644 index 000000000..9d46a0bdd --- /dev/null +++ b/kernel/fs/nfsd/auth.c @@ -0,0 +1,90 @@ +/* Copyright (C) 1995, 1996 Olaf Kirch */ + +#include +#include "nfsd.h" +#include "auth.h" + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) + return f->flags; + } + return exp->ex_flags; + +} + +int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct group_info *rqgi; + struct group_info *gi; + struct cred *new; + int i; + int flags = nfsexp_flags(rqstp, exp); + + validate_process_creds(); + + /* discard any old override before preparing the new set */ + revert_creds(get_cred(current_real_cred())); + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = rqstp->rq_cred.cr_uid; + new->fsgid = rqstp->rq_cred.cr_gid; + + rqgi = rqstp->rq_cred.cr_group_info; + + if (flags & NFSEXP_ALLSQUASH) { + new->fsuid = exp->ex_anon_uid; + new->fsgid = exp->ex_anon_gid; + gi = groups_alloc(0); + if (!gi) + goto oom; + } else if (flags & NFSEXP_ROOTSQUASH) { + if (uid_eq(new->fsuid, GLOBAL_ROOT_UID)) + new->fsuid = exp->ex_anon_uid; + if (gid_eq(new->fsgid, GLOBAL_ROOT_GID)) + new->fsgid = exp->ex_anon_gid; + + gi = groups_alloc(rqgi->ngroups); + if (!gi) + goto oom; + + for (i = 0; i < rqgi->ngroups; i++) { + if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) + GROUP_AT(gi, i) = exp->ex_anon_gid; + else + GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + } + } else { + gi = get_group_info(rqgi); + } + + if (uid_eq(new->fsuid, INVALID_UID)) + new->fsuid = exp->ex_anon_uid; + if (gid_eq(new->fsgid, INVALID_GID)) + new->fsgid = exp->ex_anon_gid; + + set_groups(new, gi); + put_group_info(gi); + + if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) + new->cap_effective = cap_drop_nfsd_set(new->cap_effective); + else + new->cap_effective = cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + validate_process_creds(); + put_cred(override_creds(new)); + put_cred(new); + validate_process_creds(); + return 0; + +oom: + abort_creds(new); + return -ENOMEM; +} + diff --git a/kernel/fs/nfsd/auth.h b/kernel/fs/nfsd/auth.h new file mode 100644 index 000000000..53325a12b --- /dev/null +++ b/kernel/fs/nfsd/auth.h @@ -0,0 +1,16 @@ +/* + * nfsd-specific authentication stuff. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef LINUX_NFSD_AUTH_H +#define LINUX_NFSD_AUTH_H + +/* + * Set the current process's fsuid/fsgid etc to those of the NFS + * client user + */ +int nfsd_setuser(struct svc_rqst *, struct svc_export *); + +#endif /* LINUX_NFSD_AUTH_H */ diff --git a/kernel/fs/nfsd/blocklayout.c b/kernel/fs/nfsd/blocklayout.c new file mode 100644 index 000000000..cdefaa331 --- /dev/null +++ b/kernel/fs/nfsd/blocklayout.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include +#include +#include + +#include + +#include "blocklayoutxdr.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + + +static int +nfsd4_block_get_device_info_simple(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev; + struct pnfs_block_volume *b; + + dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + + sizeof(struct pnfs_block_volume), GFP_KERNEL); + if (!dev) + return -ENOMEM; + gdp->gd_device = dev; + + dev->nr_volumes = 1; + b = &dev->volumes[0]; + + b->type = PNFS_BLOCK_VOLUME_SIMPLE; + b->simple.sig_len = PNFS_BLOCK_UUID_LEN; + return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len, + &b->simple.offset); +} + +static __be32 +nfsd4_block_proc_getdeviceinfo(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdp) +{ + if (sb->s_bdev != sb->s_bdev->bd_contains) + return nfserr_inval; + return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); +} + +static __be32 +nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, + struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + struct super_block *sb = inode->i_sb; + u32 block_size = (1 << inode->i_blkbits); + struct pnfs_block_extent *bex; + struct iomap iomap; + u32 device_generation = 0; + int error; + + /* + * We do not attempt to support I/O smaller than the fs block size, + * or not aligned to it. + */ + if (args->lg_minlength < block_size) { + dprintk("pnfsd: I/O too small\n"); + goto out_layoutunavailable; + } + if (seg->offset & (block_size - 1)) { + dprintk("pnfsd: I/O misaligned\n"); + goto out_layoutunavailable; + } + + /* + * Some clients barf on non-zero block numbers for NONE or INVALID + * layouts, so make sure to zero the whole structure. + */ + error = -ENOMEM; + bex = kzalloc(sizeof(*bex), GFP_KERNEL); + if (!bex) + goto out_error; + args->lg_content = bex; + + error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length, + &iomap, seg->iomode != IOMODE_READ, + &device_generation); + if (error) { + if (error == -ENXIO) + goto out_layoutunavailable; + goto out_error; + } + + if (iomap.length < args->lg_minlength) { + dprintk("pnfsd: extent smaller than minlength\n"); + goto out_layoutunavailable; + } + + switch (iomap.type) { + case IOMAP_MAPPED: + if (seg->iomode == IOMODE_READ) + bex->es = PNFS_BLOCK_READ_DATA; + else + bex->es = PNFS_BLOCK_READWRITE_DATA; + bex->soff = (iomap.blkno << 9); + break; + case IOMAP_UNWRITTEN: + if (seg->iomode & IOMODE_RW) { + /* + * Crack monkey special case from section 2.3.1. + */ + if (args->lg_minlength == 0) { + dprintk("pnfsd: no soup for you!\n"); + goto out_layoutunavailable; + } + + bex->es = PNFS_BLOCK_INVALID_DATA; + bex->soff = (iomap.blkno << 9); + break; + } + /*FALLTHRU*/ + case IOMAP_HOLE: + if (seg->iomode == IOMODE_READ) { + bex->es = PNFS_BLOCK_NONE_DATA; + break; + } + /*FALLTHRU*/ + case IOMAP_DELALLOC: + default: + WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); + goto out_layoutunavailable; + } + + error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation); + if (error) + goto out_error; + bex->foff = iomap.offset; + bex->len = iomap.length; + + seg->offset = iomap.offset; + seg->length = iomap.length; + + dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es); + return 0; + +out_error: + seg->length = 0; + return nfserrno(error); +out_layoutunavailable: + seg->length = 0; + return nfserr_layoutunavailable; +} + +static __be32 +nfsd4_block_proc_layoutcommit(struct inode *inode, + struct nfsd4_layoutcommit *lcp) +{ + loff_t new_size = lcp->lc_last_wr + 1; + struct iattr iattr = { .ia_valid = 0 }; + struct iomap *iomaps; + int nr_iomaps; + int error; + + nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + if (nr_iomaps < 0) + return nfserrno(nr_iomaps); + + if (lcp->lc_mtime.tv_nsec == UTIME_NOW || + timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) + lcp->lc_mtime = current_fs_time(inode->i_sb); + iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; + iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; + + if (new_size > i_size_read(inode)) { + iattr.ia_valid |= ATTR_SIZE; + iattr.ia_size = new_size; + } + + error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps, + nr_iomaps, &iattr); + kfree(iomaps); + return nfserrno(error); +} + +const struct nfsd4_layout_ops bl_layout_ops = { + /* + * Pretend that we send notification to the client. This is a blatant + * lie to force recent Linux clients to cache our device IDs. + * We rarely ever change the device ID, so the harm of leaking deviceids + * for a while isn't too bad. Unfortunately RFC5661 is a complete mess + * in this regard, but I filed errata 4119 for this a while ago, and + * hopefully the Linux client will eventually start caching deviceids + * without this again. + */ + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, + .proc_layoutget = nfsd4_block_proc_layoutget, + .encode_layoutget = nfsd4_block_encode_layoutget, + .proc_layoutcommit = nfsd4_block_proc_layoutcommit, +}; diff --git a/kernel/fs/nfsd/blocklayoutxdr.c b/kernel/fs/nfsd/blocklayoutxdr.c new file mode 100644 index 000000000..9aa2796da --- /dev/null +++ b/kernel/fs/nfsd/blocklayoutxdr.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include +#include +#include + +#include "nfsd.h" +#include "blocklayoutxdr.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + + +__be32 +nfsd4_block_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp) +{ + struct pnfs_block_extent *b = lgp->lg_content; + int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(__be32) + len); + if (!p) + return nfserr_toosmall; + + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(1); /* we always return a single extent */ + + p = xdr_encode_opaque_fixed(p, &b->vol_id, + sizeof(struct nfsd4_deviceid)); + p = xdr_encode_hyper(p, b->foff); + p = xdr_encode_hyper(p, b->len); + p = xdr_encode_hyper(p, b->soff); + *p++ = cpu_to_be32(b->es); + return 0; +} + +static int +nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) +{ + __be32 *p; + int len; + + switch (b->type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + len = 4 + 4 + 8 + 4 + b->simple.sig_len; + p = xdr_reserve_space(xdr, len); + if (!p) + return -ETOOSMALL; + + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(1); /* single signature */ + p = xdr_encode_hyper(p, b->simple.offset); + p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); + break; + default: + return -ENOTSUPP; + } + + return len; +} + +__be32 +nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev = gdp->gd_device; + int len = sizeof(__be32), ret, i; + __be32 *p; + + p = xdr_reserve_space(xdr, len + sizeof(__be32)); + if (!p) + return nfserr_resource; + + for (i = 0; i < dev->nr_volumes; i++) { + ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]); + if (ret < 0) + return nfserrno(ret); + len += ret; + } + + /* + * Fill in the overall length and number of volumes at the beginning + * of the layout. + */ + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(dev->nr_volumes); + return 0; +} + +int +nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size) +{ + struct iomap *iomaps; + u32 nr_iomaps, expected, i; + + if (len < sizeof(u32)) { + dprintk("%s: extent array too small: %u\n", __func__, len); + return -EINVAL; + } + + nr_iomaps = be32_to_cpup(p++); + expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE; + if (len != expected) { + dprintk("%s: extent array size mismatch: %u/%u\n", + __func__, len, expected); + return -EINVAL; + } + + iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); + if (!iomaps) { + dprintk("%s: failed to allocate extent array\n", __func__); + return -ENOMEM; + } + + for (i = 0; i < nr_iomaps; i++) { + struct pnfs_block_extent bex; + + memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); + p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); + + p = xdr_decode_hyper(p, &bex.foff); + if (bex.foff & (block_size - 1)) { + dprintk("%s: unaligned offset 0x%llx\n", + __func__, bex.foff); + goto fail; + } + p = xdr_decode_hyper(p, &bex.len); + if (bex.len & (block_size - 1)) { + dprintk("%s: unaligned length 0x%llx\n", + __func__, bex.foff); + goto fail; + } + p = xdr_decode_hyper(p, &bex.soff); + if (bex.soff & (block_size - 1)) { + dprintk("%s: unaligned disk offset 0x%llx\n", + __func__, bex.soff); + goto fail; + } + bex.es = be32_to_cpup(p++); + if (bex.es != PNFS_BLOCK_READWRITE_DATA) { + dprintk("%s: incorrect extent state %d\n", + __func__, bex.es); + goto fail; + } + + iomaps[i].offset = bex.foff; + iomaps[i].length = bex.len; + } + + *iomapp = iomaps; + return nr_iomaps; +fail: + kfree(iomaps); + return -EINVAL; +} diff --git a/kernel/fs/nfsd/blocklayoutxdr.h b/kernel/fs/nfsd/blocklayoutxdr.h new file mode 100644 index 000000000..fdc79037c --- /dev/null +++ b/kernel/fs/nfsd/blocklayoutxdr.h @@ -0,0 +1,62 @@ +#ifndef _NFSD_BLOCKLAYOUTXDR_H +#define _NFSD_BLOCKLAYOUTXDR_H 1 + +#include +#include "xdr4.h" + +struct iomap; +struct xdr_stream; + +enum pnfs_block_extent_state { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, + PNFS_BLOCK_NONE_DATA = 3, +}; + +struct pnfs_block_extent { + struct nfsd4_deviceid vol_id; + u64 foff; + u64 len; + u64 soff; + enum pnfs_block_extent_state es; +}; +#define NFS4_BLOCK_EXTENT_SIZE 44 + +enum pnfs_block_volume_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + PNFS_BLOCK_VOLUME_STRIPE = 3, +}; + +/* + * Random upper cap for the uuid length to avoid unbounded allocation. + * Not actually limited by the protocol. + */ +#define PNFS_BLOCK_UUID_LEN 128 + +struct pnfs_block_volume { + enum pnfs_block_volume_type type; + union { + struct { + u64 offset; + u32 sig_len; + u8 sig[PNFS_BLOCK_UUID_LEN]; + } simple; + }; +}; + +struct pnfs_block_deviceaddr { + u32 nr_volumes; + struct pnfs_block_volume volumes[]; +}; + +__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp); +__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp); +int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size); + +#endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/kernel/fs/nfsd/cache.h b/kernel/fs/nfsd/cache.h new file mode 100644 index 000000000..dd96a3830 --- /dev/null +++ b/kernel/fs/nfsd/cache.h @@ -0,0 +1,86 @@ +/* + * Request reply cache. This was heavily inspired by the + * implementation in 4.3BSD/4.4BSD. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef NFSCACHE_H +#define NFSCACHE_H + +#include + +/* + * Representation of a reply cache entry. + * + * Note that we use a sockaddr_in6 to hold the address instead of the more + * typical sockaddr_storage. This is for space reasons, since sockaddr_storage + * is much larger than a sockaddr_in6. + */ +struct svc_cacherep { + struct list_head c_lru; + + unsigned char c_state, /* unused, inprog, done */ + c_type, /* status, buffer */ + c_secure : 1; /* req came from port < 1024 */ + struct sockaddr_in6 c_addr; + __be32 c_xid; + u32 c_prot; + u32 c_proc; + u32 c_vers; + unsigned int c_len; + __wsum c_csum; + unsigned long c_timestamp; + union { + struct kvec u_vec; + __be32 u_status; + } c_u; +}; + +#define c_replvec c_u.u_vec +#define c_replstat c_u.u_status + +/* cache entry states */ +enum { + RC_UNUSED, + RC_INPROG, + RC_DONE +}; + +/* return values */ +enum { + RC_DROPIT, + RC_REPLY, + RC_DOIT +}; + +/* + * Cache types. + * We may want to add more types one day, e.g. for diropres and + * attrstat replies. Using cache entries with fixed length instead + * of buffer pointers may be more efficient. + */ +enum { + RC_NOCACHE, + RC_REPLSTAT, + RC_REPLBUFF, +}; + +/* + * If requests are retransmitted within this interval, they're dropped. + */ +#define RC_DELAY (HZ/5) + +/* Cache entries expire after this time period */ +#define RC_EXPIRE (120 * HZ) + +/* Checksum this amount of the request */ +#define RC_CSUMLEN (256U) + +int nfsd_reply_cache_init(void); +void nfsd_reply_cache_shutdown(void); +int nfsd_cache_lookup(struct svc_rqst *); +void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +int nfsd_reply_cache_stats_open(struct inode *, struct file *); + +#endif /* NFSCACHE_H */ diff --git a/kernel/fs/nfsd/current_stateid.h b/kernel/fs/nfsd/current_stateid.h new file mode 100644 index 000000000..412355120 --- /dev/null +++ b/kernel/fs/nfsd/current_stateid.h @@ -0,0 +1,28 @@ +#ifndef _NFSD4_CURRENT_STATE_H +#define _NFSD4_CURRENT_STATE_H + +#include "state.h" +#include "xdr4.h" + +extern void clear_current_stateid(struct nfsd4_compound_state *cstate); +/* + * functions to set current state id + */ +extern void nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *); +extern void nfsd4_set_openstateid(struct nfsd4_compound_state *, struct nfsd4_open *); +extern void nfsd4_set_lockstateid(struct nfsd4_compound_state *, struct nfsd4_lock *); +extern void nfsd4_set_closestateid(struct nfsd4_compound_state *, struct nfsd4_close *); + +/* + * functions to consume current state id + */ +extern void nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *); +extern void nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *, struct nfsd4_delegreturn *); +extern void nfsd4_get_freestateid(struct nfsd4_compound_state *, struct nfsd4_free_stateid *); +extern void nfsd4_get_setattrstateid(struct nfsd4_compound_state *, struct nfsd4_setattr *); +extern void nfsd4_get_closestateid(struct nfsd4_compound_state *, struct nfsd4_close *); +extern void nfsd4_get_lockustateid(struct nfsd4_compound_state *, struct nfsd4_locku *); +extern void nfsd4_get_readstateid(struct nfsd4_compound_state *, struct nfsd4_read *); +extern void nfsd4_get_writestateid(struct nfsd4_compound_state *, struct nfsd4_write *); + +#endif /* _NFSD4_CURRENT_STATE_H */ diff --git a/kernel/fs/nfsd/export.c b/kernel/fs/nfsd/export.c new file mode 100644 index 000000000..f79521a59 --- /dev/null +++ b/kernel/fs/nfsd/export.c @@ -0,0 +1,1345 @@ +/* + * NFS exporting and validation. + * + * We maintain a list of clients, each of which has a list of + * exports. To export an fs to a given client, you first have + * to create the client entry with NFSCTL_ADDCLIENT, which + * creates a client control block and adds it to the hash + * table. Then, you call NFSCTL_EXPORT for each fs. + * + * + * Copyright (C) 1995, 1996 Olaf Kirch, + */ + +#include +#include +#include +#include +#include + +#include "nfsd.h" +#include "nfsfh.h" +#include "netns.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_EXPORT + +/* + * We have two caches. + * One maps client+vfsmnt+dentry to export options - the export map + * The other maps client+filehandle-fragment to export options. - the expkey map + * + * The export options are actually stored in the first map, and the + * second map contains a reference to the entry in the first map. + */ + +#define EXPKEY_HASHBITS 8 +#define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) +#define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) + +static void expkey_put(struct kref *ref) +{ + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); + + if (test_bit(CACHE_VALID, &key->h.flags) && + !test_bit(CACHE_NEGATIVE, &key->h.flags)) + path_put(&key->ek_path); + auth_domain_put(key->ek_client); + kfree(key); +} + +static void expkey_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + /* client fsidtype \xfsid */ + struct svc_expkey *ek = container_of(h, struct svc_expkey, h); + char type[5]; + + qword_add(bpp, blen, ek->ek_client->name); + snprintf(type, 5, "%d", ek->ek_fsidtype); + qword_add(bpp, blen, type); + qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype)); + (*bpp)[-1] = '\n'; +} + +static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, + struct svc_expkey *old); +static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); + +static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + /* client fsidtype fsid expiry [path] */ + char *buf; + int len; + struct auth_domain *dom = NULL; + int err; + int fsidtype; + char *ep; + struct svc_expkey key; + struct svc_expkey *ek = NULL; + + if (mesg[mlen - 1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + err = -ENOMEM; + if (!buf) + goto out; + + err = -EINVAL; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + + err = -ENOENT; + dom = auth_domain_find(buf); + if (!dom) + goto out; + dprintk("found domain %s\n", buf); + + err = -EINVAL; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + fsidtype = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + dprintk("found fsidtype %d\n", fsidtype); + if (key_len(fsidtype)==0) /* invalid type */ + goto out; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + dprintk("found fsid length %d\n", len); + if (len != key_len(fsidtype)) + goto out; + + /* OK, we seem to have a valid key */ + key.h.flags = 0; + key.h.expiry_time = get_expiry(&mesg); + if (key.h.expiry_time == 0) + goto out; + + key.ek_client = dom; + key.ek_fsidtype = fsidtype; + memcpy(key.ek_fsid, buf, len); + + ek = svc_expkey_lookup(cd, &key); + err = -ENOMEM; + if (!ek) + goto out; + + /* now we want a pathname, or empty meaning NEGATIVE */ + err = -EINVAL; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len < 0) + goto out; + dprintk("Path seems to be <%s>\n", buf); + err = 0; + if (len == 0) { + set_bit(CACHE_NEGATIVE, &key.h.flags); + ek = svc_expkey_update(cd, &key, ek); + if (!ek) + err = -ENOMEM; + } else { + err = kern_path(buf, 0, &key.ek_path); + if (err) + goto out; + + dprintk("Found the path %s\n", buf); + + ek = svc_expkey_update(cd, &key, ek); + if (!ek) + err = -ENOMEM; + path_put(&key.ek_path); + } + cache_flush(); + out: + if (ek) + cache_put(&ek->h, cd); + if (dom) + auth_domain_put(dom); + kfree(buf); + return err; +} + +static int expkey_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct svc_expkey *ek ; + int i; + + if (h ==NULL) { + seq_puts(m, "#domain fsidtype fsid [path]\n"); + return 0; + } + ek = container_of(h, struct svc_expkey, h); + seq_printf(m, "%s %d 0x", ek->ek_client->name, + ek->ek_fsidtype); + for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) + seq_printf(m, "%08x", ek->ek_fsid[i]); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) { + seq_printf(m, " "); + seq_path(m, &ek->ek_path, "\\ \t\n"); + } + seq_printf(m, "\n"); + return 0; +} + +static inline int expkey_match (struct cache_head *a, struct cache_head *b) +{ + struct svc_expkey *orig = container_of(a, struct svc_expkey, h); + struct svc_expkey *new = container_of(b, struct svc_expkey, h); + + if (orig->ek_fsidtype != new->ek_fsidtype || + orig->ek_client != new->ek_client || + memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) + return 0; + return 1; +} + +static inline void expkey_init(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + kref_get(&item->ek_client->ref); + new->ek_client = item->ek_client; + new->ek_fsidtype = item->ek_fsidtype; + + memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); +} + +static inline void expkey_update(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + new->ek_path = item->ek_path; + path_get(&item->ek_path); +} + +static struct cache_head *expkey_alloc(void) +{ + struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +static struct cache_detail svc_expkey_cache_template = { + .owner = THIS_MODULE, + .hash_size = EXPKEY_HASHMAX, + .name = "nfsd.fh", + .cache_put = expkey_put, + .cache_request = expkey_request, + .cache_parse = expkey_parse, + .cache_show = expkey_show, + .match = expkey_match, + .init = expkey_init, + .update = expkey_update, + .alloc = expkey_alloc, +}; + +static int +svc_expkey_hash(struct svc_expkey *item) +{ + int hash = item->ek_fsidtype; + char * cp = (char*)item->ek_fsid; + int len = key_len(item->ek_fsidtype); + + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; + return hash; +} + +static struct svc_expkey * +svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(item); + + ch = sunrpc_cache_lookup(cd, &item->h, hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; +} + +static struct svc_expkey * +svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, + struct svc_expkey *old) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(new); + + ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; +} + + +#define EXPORT_HASHBITS 8 +#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) + +static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) +{ + struct nfsd4_fs_location *locations = fsloc->locations; + int i; + + if (!locations) + return; + + for (i = 0; i < fsloc->locations_count; i++) { + kfree(locations[i].path); + kfree(locations[i].hosts); + } + + kfree(locations); + fsloc->locations = NULL; +} + +static void svc_export_put(struct kref *ref) +{ + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); + path_put(&exp->ex_path); + auth_domain_put(exp->ex_client); + nfsd4_fslocs_free(&exp->ex_fslocs); + kfree(exp->ex_uuid); + kfree(exp); +} + +static void svc_export_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + /* client path */ + struct svc_export *exp = container_of(h, struct svc_export, h); + char *pth; + + qword_add(bpp, blen, exp->ex_client->name); + pth = d_path(&exp->ex_path, *bpp, *blen); + if (IS_ERR(pth)) { + /* is this correct? */ + (*bpp)[0] = '\n'; + return; + } + qword_add(bpp, blen, pth); + (*bpp)[-1] = '\n'; +} + +static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); +static struct svc_export *svc_export_lookup(struct svc_export *); + +static int check_export(struct inode *inode, int *flags, unsigned char *uuid) +{ + + /* + * We currently export only dirs, regular files, and (for v4 + * pseudoroot) symlinks. + */ + if (!S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode) && + !S_ISREG(inode->i_mode)) + return -ENOTDIR; + + /* + * Mountd should never pass down a writeable V4ROOT export, but, + * just to make sure: + */ + if (*flags & NFSEXP_V4ROOT) + *flags |= NFSEXP_READONLY; + + /* There are two requirements on a filesystem to be exportable. + * 1: We must be able to identify the filesystem from a number. + * either a device number (so FS_REQUIRES_DEV needed) + * or an FSID number (so NFSEXP_FSID or ->uuid is needed). + * 2: We must be able to find an inode from a filehandle. + * This means that s_export_op must be set. + */ + if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && + !(*flags & NFSEXP_FSID) && + uuid == NULL) { + dprintk("exp_export: export of non-dev fs without fsid\n"); + return -EINVAL; + } + + if (!inode->i_sb->s_export_op || + !inode->i_sb->s_export_op->fh_to_dentry) { + dprintk("exp_export: export of invalid fs type.\n"); + return -EINVAL; + } + + return 0; + +} + +#ifdef CONFIG_NFSD_V4 + +static int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) +{ + int len; + int migrated, i, err; + + /* more than one fsloc */ + if (fsloc->locations) + return -EINVAL; + + /* listsize */ + err = get_uint(mesg, &fsloc->locations_count); + if (err) + return err; + if (fsloc->locations_count > MAX_FS_LOCATIONS) + return -EINVAL; + if (fsloc->locations_count == 0) + return 0; + + fsloc->locations = kzalloc(fsloc->locations_count + * sizeof(struct nfsd4_fs_location), GFP_KERNEL); + if (!fsloc->locations) + return -ENOMEM; + for (i=0; i < fsloc->locations_count; i++) { + /* colon separated host list */ + err = -EINVAL; + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].hosts) + goto out_free_all; + err = -EINVAL; + /* slash separated path component list */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].path) + goto out_free_all; + } + /* migrated */ + err = get_int(mesg, &migrated); + if (err) + goto out_free_all; + err = -EINVAL; + if (migrated < 0 || migrated > 1) + goto out_free_all; + fsloc->migrated = migrated; + return 0; +out_free_all: + nfsd4_fslocs_free(fsloc); + return err; +} + +static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) +{ + struct exp_flavor_info *f; + u32 listsize; + int err; + + /* more than one secinfo */ + if (exp->ex_nflavors) + return -EINVAL; + + err = get_uint(mesg, &listsize); + if (err) + return err; + if (listsize > MAX_SECINFO_LIST) + return -EINVAL; + + for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { + err = get_uint(mesg, &f->pseudoflavor); + if (err) + return err; + /* + * XXX: It would be nice to also check whether this + * pseudoflavor is supported, so we can discover the + * problem at export time instead of when a client fails + * to authenticate. + */ + err = get_uint(mesg, &f->flags); + if (err) + return err; + /* Only some flags are allowed to differ between flavors: */ + if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) + return -EINVAL; + } + exp->ex_nflavors = listsize; + return 0; +} + +#else /* CONFIG_NFSD_V4 */ +static inline int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} +static inline int +secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } +#endif + +static inline int +uuid_parse(char **mesg, char *buf, unsigned char **puuid) +{ + int len; + + /* more than one uuid */ + if (*puuid) + return -EINVAL; + + /* expect a 16 byte uuid encoded as \xXXXX... */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len != EX_UUID_LEN) + return -EINVAL; + + *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); + if (*puuid == NULL) + return -ENOMEM; + + return 0; +} + +static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + /* client path expiry [flags anonuid anongid fsid] */ + char *buf; + int len; + int err; + struct auth_domain *dom = NULL; + struct svc_export exp = {}, *expp; + int an_int; + + if (mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* client */ + err = -EINVAL; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out; + + err = -ENOENT; + dom = auth_domain_find(buf); + if (!dom) + goto out; + + /* path */ + err = -EINVAL; + if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out1; + + err = kern_path(buf, 0, &exp.ex_path); + if (err) + goto out1; + + exp.ex_client = dom; + exp.cd = cd; + exp.ex_devid_map = NULL; + + /* expiry */ + err = -EINVAL; + exp.h.expiry_time = get_expiry(&mesg); + if (exp.h.expiry_time == 0) + goto out3; + + /* flags */ + err = get_int(&mesg, &an_int); + if (err == -ENOENT) { + err = 0; + set_bit(CACHE_NEGATIVE, &exp.h.flags); + } else { + if (err || an_int < 0) + goto out3; + exp.ex_flags= an_int; + + /* anon uid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_anon_uid= make_kuid(&init_user_ns, an_int); + + /* anon gid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_anon_gid= make_kgid(&init_user_ns, an_int); + + /* fsid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_fsid = an_int; + + while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + if (strcmp(buf, "fsloc") == 0) + err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); + else if (strcmp(buf, "uuid") == 0) + err = uuid_parse(&mesg, buf, &exp.ex_uuid); + else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else + /* quietly ignore unknown words and anything + * following. Newer user-space can try to set + * new values, then see what the result was. + */ + break; + if (err) + goto out4; + } + + err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags, + exp.ex_uuid); + if (err) + goto out4; + /* + * No point caching this if it would immediately expire. + * Also, this protects exportfs's dummy export from the + * anon_uid/anon_gid checks: + */ + if (exp.h.expiry_time < seconds_since_boot()) + goto out4; + /* + * For some reason exportfs has been passing down an + * invalid (-1) uid & gid on the "dummy" export which it + * uses to test export support. To make sure exportfs + * sees errors from check_export we therefore need to + * delay these checks till after check_export: + */ + err = -EINVAL; + if (!uid_valid(exp.ex_anon_uid)) + goto out4; + if (!gid_valid(exp.ex_anon_gid)) + goto out4; + err = 0; + + nfsd4_setup_layout_type(&exp); + } + + expp = svc_export_lookup(&exp); + if (expp) + expp = svc_export_update(&exp, expp); + else + err = -ENOMEM; + cache_flush(); + if (expp == NULL) + err = -ENOMEM; + else + exp_put(expp); +out4: + nfsd4_fslocs_free(&exp.ex_fslocs); + kfree(exp.ex_uuid); +out3: + path_put(&exp.ex_path); +out1: + auth_domain_put(dom); +out: + kfree(buf); + return err; +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); +static void show_secinfo(struct seq_file *m, struct svc_export *exp); + +static int svc_export_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct svc_export *exp ; + + if (h ==NULL) { + seq_puts(m, "#path domain(flags)\n"); + return 0; + } + exp = container_of(h, struct svc_export, h); + seq_path(m, &exp->ex_path, " \t\n\\"); + seq_putc(m, '\t'); + seq_escape(m, exp->ex_client->name, " \t\n\\"); + seq_putc(m, '('); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) { + exp_flags(m, exp->ex_flags, exp->ex_fsid, + exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); + if (exp->ex_uuid) { + int i; + seq_puts(m, ",uuid="); + for (i = 0; i < EX_UUID_LEN; i++) { + if ((i&3) == 0 && i) + seq_putc(m, ':'); + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); + return 0; +} +static int svc_export_match(struct cache_head *a, struct cache_head *b) +{ + struct svc_export *orig = container_of(a, struct svc_export, h); + struct svc_export *new = container_of(b, struct svc_export, h); + return orig->ex_client == new->ex_client && + path_equal(&orig->ex_path, &new->ex_path); +} + +static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + + kref_get(&item->ex_client->ref); + new->ex_client = item->ex_client; + new->ex_path = item->ex_path; + path_get(&item->ex_path); + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; + new->ex_layout_type = 0; + new->ex_uuid = NULL; + new->cd = item->cd; +} + +static void export_update(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + int i; + + new->ex_flags = item->ex_flags; + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; + new->ex_devid_map = item->ex_devid_map; + item->ex_devid_map = NULL; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_fslocs.locations = item->ex_fslocs.locations; + item->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; + item->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = item->ex_fslocs.migrated; + item->ex_fslocs.migrated = 0; + new->ex_layout_type = item->ex_layout_type; + new->ex_nflavors = item->ex_nflavors; + for (i = 0; i < MAX_SECINFO_LIST; i++) { + new->ex_flavors[i] = item->ex_flavors[i]; + } +} + +static struct cache_head *svc_export_alloc(void) +{ + struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +static struct cache_detail svc_export_cache_template = { + .owner = THIS_MODULE, + .hash_size = EXPORT_HASHMAX, + .name = "nfsd.export", + .cache_put = svc_export_put, + .cache_request = svc_export_request, + .cache_parse = svc_export_parse, + .cache_show = svc_export_show, + .match = svc_export_match, + .init = svc_export_init, + .update = export_update, + .alloc = svc_export_alloc, +}; + +static int +svc_export_hash(struct svc_export *exp) +{ + int hash; + + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); + return hash; +} + +static struct svc_export * +svc_export_lookup(struct svc_export *exp) +{ + struct cache_head *ch; + int hash = svc_export_hash(exp); + + ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} + +static struct svc_export * +svc_export_update(struct svc_export *new, struct svc_export *old) +{ + struct cache_head *ch; + int hash = svc_export_hash(old); + + ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} + + +static struct svc_expkey * +exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, + u32 *fsidv, struct cache_req *reqp) +{ + struct svc_expkey key, *ek; + int err; + + if (!clp) + return ERR_PTR(-ENOENT); + + key.ek_client = clp; + key.ek_fsidtype = fsid_type; + memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); + + ek = svc_expkey_lookup(cd, &key); + if (ek == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(cd, &ek->h, reqp); + if (err) + return ERR_PTR(err); + return ek; +} + +static struct svc_export * +exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, + const struct path *path, struct cache_req *reqp) +{ + struct svc_export *exp, key; + int err; + + if (!clp) + return ERR_PTR(-ENOENT); + + key.ex_client = clp; + key.ex_path = *path; + key.cd = cd; + + exp = svc_export_lookup(&key); + if (exp == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(cd, &exp->h, reqp); + if (err) + return ERR_PTR(err); + return exp; +} + +/* + * Find the export entry for a given dentry. + */ +static struct svc_export * +exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) +{ + struct dentry *saved = dget(path->dentry); + struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = exp_get_by_name(cd, clp, path, NULL); + } + dput(path->dentry); + path->dentry = saved; + return exp; +} + + + +/* + * Obtain the root fh on behalf of a client. + * This could be done in user space, but I feel that it adds some safety + * since its harder to fool a kernel module than a user space program. + */ +int +exp_rootfh(struct net *net, struct auth_domain *clp, char *name, + struct knfsd_fh *f, int maxsize) +{ + struct svc_export *exp; + struct path path; + struct inode *inode; + struct svc_fh fh; + int err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cache_detail *cd = nn->svc_export_cache; + + err = -EPERM; + /* NB: we probably ought to check that it's NUL-terminated */ + if (kern_path(name, 0, &path)) { + printk("nfsd: exp_rootfh path not found %s", name); + return err; + } + inode = d_inode(path.dentry); + + dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", + name, path.dentry, clp->name, + inode->i_sb->s_id, inode->i_ino); + exp = exp_parent(cd, clp, &path); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto out; + } + + /* + * fh must be initialized before calling fh_compose + */ + fh_init(&fh, maxsize); + if (fh_compose(&fh, exp, path.dentry, NULL)) + err = -EINVAL; + else + err = 0; + memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh)); + fh_put(&fh); + exp_put(exp); +out: + path_put(&path); + return err; +} + +static struct svc_export *exp_find(struct cache_detail *cd, + struct auth_domain *clp, int fsid_type, + u32 *fsidv, struct cache_req *reqp) +{ + struct svc_export *exp; + struct nfsd_net *nn = net_generic(cd->net, nfsd_net_id); + struct svc_expkey *ek = exp_find_key(nn->svc_expkey_cache, clp, fsid_type, fsidv, reqp); + if (IS_ERR(ek)) + return ERR_CAST(ek); + + exp = exp_get_by_name(cd, clp, &ek->ek_path, reqp); + cache_put(&ek->h, nn->svc_expkey_cache); + + if (IS_ERR(exp)) + return ERR_CAST(exp); + return exp; +} + +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + /* legacy gss-only clients are always OK: */ + if (exp->ex_client == rqstp->rq_gssclient) + return 0; + /* ip-address based client; check sec= export option: */ + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) + return 0; + } + /* defaults in absence of sec= options: */ + if (exp->ex_nflavors == 0) { + if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || + rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) + return 0; + } + return nfserr_wrongsec; +} + +/* + * Uses rq_client and rq_gssclient to find an export; uses rq_client (an + * auth_unix client) if it's available and has secinfo information; + * otherwise, will try to use rq_gssclient. + * + * Called from functions that handle requests; functions that do work on + * behalf of mountd are passed a single client name to use, and should + * use exp_get_by_name() or exp_find(). + */ +struct svc_export * +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +{ + struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct cache_detail *cd = nn->svc_export_cache; + + if (rqstp->rq_client == NULL) + goto gss; + + /* First try the auth_unix client: */ + exp = exp_get_by_name(cd, rqstp->rq_client, path, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_get_by_name(cd, rqstp->rq_gssclient, path, &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (!IS_ERR(exp)) + exp_put(exp); + return gssexp; +} + +struct svc_export * +rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) +{ + struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct cache_detail *cd = nn->svc_export_cache; + + if (rqstp->rq_client == NULL) + goto gss; + + /* First try the auth_unix client: */ + exp = exp_find(cd, rqstp->rq_client, fsid_type, + fsidv, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_find(cd, rqstp->rq_gssclient, fsid_type, fsidv, + &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (!IS_ERR(exp)) + exp_put(exp); + return gssexp; +} + +struct svc_export * +rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) +{ + struct dentry *saved = dget(path->dentry); + struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = rqst_exp_get_by_name(rqstp, path); + } + dput(path->dentry); + path->dentry = saved; + return exp; +} + +struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) +{ + u32 fsidv[2]; + + mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); + + return rqst_exp_find(rqstp, FSID_NUM, fsidv); +} + +/* + * Called when we need the filehandle for the root of the pseudofs, + * for a given NFSv4 client. The root is defined to be the + * export point with fsid==0 + */ +__be32 +exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct svc_export *exp; + __be32 rv; + + exp = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); + rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); + exp_put(exp); + return rv; +} + +/* Iterator */ + +static void *e_start(struct seq_file *m, loff_t *pos) + __acquires(((struct cache_detail *)m->private)->hash_lock) +{ + loff_t n = *pos; + unsigned hash, export; + struct cache_head *ch; + struct cache_detail *cd = m->private; + struct cache_head **export_table = cd->hash_table; + + read_lock(&cd->hash_lock); + if (!n--) + return SEQ_START_TOKEN; + hash = n >> 32; + export = n & ((1LL<<32) - 1); + + + for (ch=export_table[hash]; ch; ch=ch->next) + if (!export--) + return ch; + n &= ~((1LL<<32) - 1); + do { + hash++; + n += 1LL<<32; + } while(hash < EXPORT_HASHMAX && export_table[hash]==NULL); + if (hash >= EXPORT_HASHMAX) + return NULL; + *pos = n+1; + return export_table[hash]; +} + +static void *e_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cache_head *ch = p; + int hash = (*pos >> 32); + struct cache_detail *cd = m->private; + struct cache_head **export_table = cd->hash_table; + + if (p == SEQ_START_TOKEN) + hash = 0; + else if (ch->next == NULL) { + hash++; + *pos += 1LL<<32; + } else { + ++*pos; + return ch->next; + } + *pos &= ~((1LL<<32) - 1); + while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) { + hash++; + *pos += 1LL<<32; + } + if (hash >= EXPORT_HASHMAX) + return NULL; + ++*pos; + return export_table[hash]; +} + +static void e_stop(struct seq_file *m, void *p) + __releases(((struct cache_detail *)m->private)->hash_lock) +{ + struct cache_detail *cd = m->private; + + read_unlock(&cd->hash_lock); +} + +static struct flags { + int flag; + char *name[2]; +} expflags[] = { + { NFSEXP_READONLY, {"ro", "rw"}}, + { NFSEXP_INSECURE_PORT, {"insecure", ""}}, + { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}}, + { NFSEXP_ALLSQUASH, {"all_squash", ""}}, + { NFSEXP_ASYNC, {"async", "sync"}}, + { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, + { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, + { NFSEXP_NOHIDE, {"nohide", ""}}, + { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, + { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, + { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_V4ROOT, {"v4root", ""}}, + { NFSEXP_PNFS, {"pnfs", ""}}, + { 0, {"", ""}} +}; + +static void show_expflags(struct seq_file *m, int flags, int mask) +{ + struct flags *flg; + int state, first = 0; + + for (flg = expflags; flg->flag; flg++) { + if (flg->flag & ~mask) + continue; + state = (flg->flag & flags) ? 0 : 1; + if (*flg->name[state]) + seq_printf(m, "%s%s", first++?",":"", flg->name[state]); + } +} + +static void show_secinfo_flags(struct seq_file *m, int flags) +{ + seq_printf(m, ","); + show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); +} + +static bool secinfo_flags_equal(int f, int g) +{ + f &= NFSEXP_SECINFO_FLAGS; + g &= NFSEXP_SECINFO_FLAGS; + return f == g; +} + +static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) +{ + int flags; + + flags = (*fp)->flags; + seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); + (*fp)++; + while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { + seq_printf(m, ":%d", (*fp)->pseudoflavor); + (*fp)++; + } + return flags; +} + +static void show_secinfo(struct seq_file *m, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + int flags; + + if (exp->ex_nflavors == 0) + return; + f = exp->ex_flavors; + flags = show_secinfo_run(m, &f, end); + if (!secinfo_flags_equal(flags, exp->ex_flags)) + show_secinfo_flags(m, flags); + while (f != end) { + flags = show_secinfo_run(m, &f, end); + show_secinfo_flags(m, flags); + } +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) +{ + show_expflags(m, flag, NFSEXP_ALLFLAGS); + if (flag & NFSEXP_FSID) + seq_printf(m, ",fsid=%d", fsid); + if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) && + !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2))) + seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu)); + if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) && + !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2))) + seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong)); + if (fsloc && fsloc->locations_count > 0) { + char *loctype = (fsloc->migrated) ? "refer" : "replicas"; + int i; + + seq_printf(m, ",%s=", loctype); + seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); + for (i = 1; i < fsloc->locations_count; i++) { + seq_putc(m, ';'); + seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); + } + } +} + +static int e_show(struct seq_file *m, void *p) +{ + struct cache_head *cp = p; + struct svc_export *exp = container_of(cp, struct svc_export, h); + struct cache_detail *cd = m->private; + + if (p == SEQ_START_TOKEN) { + seq_puts(m, "# Version 1.1\n"); + seq_puts(m, "# Path Client(Flags) # IPs\n"); + return 0; + } + + exp_get(exp); + if (cache_check(cd, &exp->h, NULL)) + return 0; + exp_put(exp); + return svc_export_show(m, cd, cp); +} + +const struct seq_operations nfs_exports_op = { + .start = e_start, + .next = e_next, + .stop = e_stop, + .show = e_show, +}; + +/* + * Initialize the exports module. + */ +int +nfsd_export_init(struct net *net) +{ + int rv; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("nfsd: initializing export module (net: %p).\n", net); + + nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); + if (IS_ERR(nn->svc_export_cache)) + return PTR_ERR(nn->svc_export_cache); + rv = cache_register_net(nn->svc_export_cache, net); + if (rv) + goto destroy_export_cache; + + nn->svc_expkey_cache = cache_create_net(&svc_expkey_cache_template, net); + if (IS_ERR(nn->svc_expkey_cache)) { + rv = PTR_ERR(nn->svc_expkey_cache); + goto unregister_export_cache; + } + rv = cache_register_net(nn->svc_expkey_cache, net); + if (rv) + goto destroy_expkey_cache; + return 0; + +destroy_expkey_cache: + cache_destroy_net(nn->svc_expkey_cache, net); +unregister_export_cache: + cache_unregister_net(nn->svc_export_cache, net); +destroy_export_cache: + cache_destroy_net(nn->svc_export_cache, net); + return rv; +} + +/* + * Flush exports table - called when last nfsd thread is killed + */ +void +nfsd_export_flush(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + cache_purge(nn->svc_expkey_cache); + cache_purge(nn->svc_export_cache); +} + +/* + * Shutdown the exports module. + */ +void +nfsd_export_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("nfsd: shutting down export module (net: %p).\n", net); + + cache_unregister_net(nn->svc_expkey_cache, net); + cache_unregister_net(nn->svc_export_cache, net); + cache_destroy_net(nn->svc_expkey_cache, net); + cache_destroy_net(nn->svc_export_cache, net); + svcauth_unix_purge(net); + + dprintk("nfsd: export shutdown complete (net: %p).\n", net); +} diff --git a/kernel/fs/nfsd/export.h b/kernel/fs/nfsd/export.h new file mode 100644 index 000000000..1f52bfcc4 --- /dev/null +++ b/kernel/fs/nfsd/export.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch + */ +#ifndef NFSD_EXPORT_H +#define NFSD_EXPORT_H + +#include +#include + +struct knfsd_fh; +struct svc_fh; +struct svc_rqst; + +/* + * FS Locations + */ + +#define MAX_FS_LOCATIONS 128 + +struct nfsd4_fs_location { + char *hosts; /* colon separated list of hosts */ + char *path; /* slash separated list of path components */ +}; + +struct nfsd4_fs_locations { + uint32_t locations_count; + struct nfsd4_fs_location *locations; +/* If we're not actually serving this data ourselves (only providing a + * list of replicas that do serve it) then we set "migrated": */ + int migrated; +}; + +/* + * We keep an array of pseudoflavors with the export, in order from most + * to least preferred. For the foreseeable future, we don't expect more + * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3, + * spkm3i, and spkm3p (and using all 8 at once should be rare). + */ +#define MAX_SECINFO_LIST 8 +#define EX_UUID_LEN 16 + +struct exp_flavor_info { + u32 pseudoflavor; + u32 flags; +}; + +struct svc_export { + struct cache_head h; + struct auth_domain * ex_client; + int ex_flags; + struct path ex_path; + kuid_t ex_anon_uid; + kgid_t ex_anon_gid; + int ex_fsid; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + uint32_t ex_nflavors; + struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + enum pnfs_layouttype ex_layout_type; + struct nfsd4_deviceid_map *ex_devid_map; + struct cache_detail *cd; +}; + +/* an "export key" (expkey) maps a filehandlefragement to an + * svc_export for a given client. There can be several per export, + * for the different fsid types. + */ +struct svc_expkey { + struct cache_head h; + + struct auth_domain * ek_client; + int ek_fsidtype; + u32 ek_fsid[6]; + + struct path ek_path; +}; + +#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) +#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE) +#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp); +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); + +/* + * Function declarations + */ +int nfsd_export_init(struct net *); +void nfsd_export_shutdown(struct net *); +void nfsd_export_flush(struct net *); +struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, + struct path *); +struct svc_export * rqst_exp_parent(struct svc_rqst *, + struct path *); +struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); +int exp_rootfh(struct net *, struct auth_domain *, + char *path, struct knfsd_fh *, int maxsize); +__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); +__be32 nfserrno(int errno); + +static inline void exp_put(struct svc_export *exp) +{ + cache_put(&exp->h, exp->cd); +} + +static inline struct svc_export *exp_get(struct svc_export *exp) +{ + cache_get(&exp->h); + return exp; +} +struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); + +#endif /* NFSD_EXPORT_H */ diff --git a/kernel/fs/nfsd/fault_inject.c b/kernel/fs/nfsd/fault_inject.c new file mode 100644 index 000000000..c16bf5af6 --- /dev/null +++ b/kernel/fs/nfsd/fault_inject.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2011 Bryan Schumaker + * + * Uses debugfs to create fault injection points for client testing + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "state.h" +#include "netns.h" + +struct nfsd_fault_inject_op { + char *file; + u64 (*get)(void); + u64 (*set_val)(u64); + u64 (*set_clnt)(struct sockaddr_storage *, size_t); +}; + +static struct dentry *debug_dir; + +static ssize_t fault_inject_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + static u64 val; + char read_buf[25]; + size_t size; + loff_t pos = *ppos; + struct nfsd_fault_inject_op *op = file_inode(file)->i_private; + + if (!pos) + val = op->get(); + size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); + + return simple_read_from_buffer(buf, len, ppos, read_buf, size); +} + +static ssize_t fault_inject_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + char write_buf[INET6_ADDRSTRLEN]; + size_t size = min(sizeof(write_buf) - 1, len); + struct net *net = current->nsproxy->net_ns; + struct sockaddr_storage sa; + struct nfsd_fault_inject_op *op = file_inode(file)->i_private; + u64 val; + char *nl; + + if (copy_from_user(write_buf, buf, size)) + return -EFAULT; + write_buf[size] = '\0'; + + /* Deal with any embedded newlines in the string */ + nl = strchr(write_buf, '\n'); + if (nl) { + size = nl - write_buf; + *nl = '\0'; + } + + size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); + if (size > 0) { + val = op->set_clnt(&sa, size); + if (val) + pr_info("NFSD [%s]: Client %s had %llu state object(s)\n", + op->file, write_buf, val); + } else { + val = simple_strtoll(write_buf, NULL, 0); + if (val == 0) + pr_info("NFSD Fault Injection: %s (all)", op->file); + else + pr_info("NFSD Fault Injection: %s (n = %llu)", + op->file, val); + val = op->set_val(val); + pr_info("NFSD: %s: found %llu", op->file, val); + } + return len; /* on success, claim we got the whole input */ +} + +static const struct file_operations fops_nfsd = { + .owner = THIS_MODULE, + .read = fault_inject_read, + .write = fault_inject_write, +}; + +void nfsd_fault_inject_cleanup(void) +{ + debugfs_remove_recursive(debug_dir); +} + +static struct nfsd_fault_inject_op inject_ops[] = { + { + .file = "forget_clients", + .get = nfsd_inject_print_clients, + .set_val = nfsd_inject_forget_clients, + .set_clnt = nfsd_inject_forget_client, + }, + { + .file = "forget_locks", + .get = nfsd_inject_print_locks, + .set_val = nfsd_inject_forget_locks, + .set_clnt = nfsd_inject_forget_client_locks, + }, + { + .file = "forget_openowners", + .get = nfsd_inject_print_openowners, + .set_val = nfsd_inject_forget_openowners, + .set_clnt = nfsd_inject_forget_client_openowners, + }, + { + .file = "forget_delegations", + .get = nfsd_inject_print_delegations, + .set_val = nfsd_inject_forget_delegations, + .set_clnt = nfsd_inject_forget_client_delegations, + }, + { + .file = "recall_delegations", + .get = nfsd_inject_print_delegations, + .set_val = nfsd_inject_recall_delegations, + .set_clnt = nfsd_inject_recall_client_delegations, + }, +}; + +#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op)) + +int nfsd_fault_inject_init(void) +{ + unsigned int i; + struct nfsd_fault_inject_op *op; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + + debug_dir = debugfs_create_dir("nfsd", NULL); + if (!debug_dir) + goto fail; + + for (i = 0; i < NUM_INJECT_OPS; i++) { + op = &inject_ops[i]; + if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd)) + goto fail; + } + return 0; + +fail: + nfsd_fault_inject_cleanup(); + return -ENOMEM; +} diff --git a/kernel/fs/nfsd/idmap.h b/kernel/fs/nfsd/idmap.h new file mode 100644 index 000000000..a3f349000 --- /dev/null +++ b/kernel/fs/nfsd/idmap.h @@ -0,0 +1,62 @@ +/* + * Mapping of UID to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. +> * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFSD_IDMAP_H +#define LINUX_NFSD_IDMAP_H + +#include +#include + +/* XXX from linux/nfs_idmap.h */ +#define IDMAP_NAMESZ 128 + +#ifdef CONFIG_NFSD_V4 +int nfsd_idmap_init(struct net *); +void nfsd_idmap_shutdown(struct net *); +#else +static inline int nfsd_idmap_init(struct net *net) +{ + return 0; +} +static inline void nfsd_idmap_shutdown(struct net *net) +{ +} +#endif + +__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); +__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); +__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t); +__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t); + +#endif /* LINUX_NFSD_IDMAP_H */ diff --git a/kernel/fs/nfsd/lockd.c b/kernel/fs/nfsd/lockd.c new file mode 100644 index 000000000..77e7a5cca --- /dev/null +++ b/kernel/fs/nfsd/lockd.c @@ -0,0 +1,77 @@ +/* + * This file contains all the stubs needed when communicating with lockd. + * This level of indirection is necessary so we can run nfsd+lockd without + * requiring the nfs client to be compiled in/loaded, and vice versa. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include "nfsd.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_LOCKD + +#ifdef CONFIG_LOCKD_V4 +#define nlm_stale_fh nlm4_stale_fh +#define nlm_failed nlm4_failed +#else +#define nlm_stale_fh nlm_lck_denied_nolocks +#define nlm_failed nlm_lck_denied_nolocks +#endif +/* + * Note: we hold the dentry use count while the file is open. + */ +static __be32 +nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) +{ + __be32 nfserr; + struct svc_fh fh; + + /* must initialize before using! but maxsize doesn't matter */ + fh_init(&fh,0); + fh.fh_handle.fh_size = f->size; + memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); + fh.fh_export = NULL; + + nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); + fh_put(&fh); + /* We return nlm error codes as nlm doesn't know + * about nfsd, but nfsd does know about nlm.. + */ + switch (nfserr) { + case nfs_ok: + return 0; + case nfserr_dropit: + return nlm_drop_reply; + case nfserr_stale: + return nlm_stale_fh; + default: + return nlm_failed; + } +} + +static void +nlm_fclose(struct file *filp) +{ + fput(filp); +} + +static struct nlmsvc_binding nfsd_nlm_ops = { + .fopen = nlm_fopen, /* open file for locking */ + .fclose = nlm_fclose, /* close file */ +}; + +void +nfsd_lockd_init(void) +{ + dprintk("nfsd: initializing lockd\n"); + nlmsvc_ops = &nfsd_nlm_ops; +} + +void +nfsd_lockd_shutdown(void) +{ + nlmsvc_ops = NULL; +} diff --git a/kernel/fs/nfsd/netns.h b/kernel/fs/nfsd/netns.h new file mode 100644 index 000000000..ea6749a32 --- /dev/null +++ b/kernel/fs/nfsd/netns.h @@ -0,0 +1,121 @@ +/* + * per net namespace data structures for nfsd + * + * Copyright (C) 2012, Jeff Layton + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __NFSD_NETNS_H__ +#define __NFSD_NETNS_H__ + +#include +#include + +/* Hash tables for nfs4_clientid state */ +#define CLIENT_HASH_BITS 4 +#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) + +#define SESSION_HASH_SIZE 512 + +struct cld_net; +struct nfsd4_client_tracking_ops; + +/* + * Represents a nfsd "container". With respect to nfsv4 state tracking, the + * fields of interest are the *_id_hashtbls and the *_name_tree. These track + * the nfs4_client objects by either short or long form clientid. + * + * Each nfsd_net runs a nfs4_laundromat workqueue job when necessary to clean + * up expired clients and delegations within the container. + */ +struct nfsd_net { + struct cld_net *cld_net; + + struct cache_detail *svc_expkey_cache; + struct cache_detail *svc_export_cache; + + struct cache_detail *idtoname_cache; + struct cache_detail *nametoid_cache; + + struct lock_manager nfsd4_manager; + bool grace_ended; + time_t boot_time; + + /* + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * + * conf_id_hashtbl[], and conf_name_tree hold confirmed + * setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_name_tree hold unconfirmed + * setclientid info. + */ + struct list_head *reclaim_str_hashtbl; + int reclaim_str_hashtbl_size; + struct list_head *conf_id_hashtbl; + struct rb_root conf_name_tree; + struct list_head *unconf_id_hashtbl; + struct rb_root unconf_name_tree; + struct list_head *sessionid_hashtbl; + /* + * client_lru holds client queue ordered by nfs4_client.cl_time + * for lease renewal. + * + * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time + * for last close replay. + * + * All of the above fields are protected by the client_mutex. + */ + struct list_head client_lru; + struct list_head close_lru; + struct list_head del_recall_lru; + + struct delayed_work laundromat_work; + + /* client_lock protects the client lru list and session hash table */ + spinlock_t client_lock; + + struct file *rec_file; + bool in_grace; + struct nfsd4_client_tracking_ops *client_tracking_ops; + + time_t nfsd4_lease; + time_t nfsd4_grace; + + bool nfsd_net_up; + bool lockd_up; + + /* Time of server startup */ + struct timeval nfssvc_boot; + + /* + * Max number of connections this nfsd container will allow. Defaults + * to '0' which is means that it bases this on the number of threads. + */ + unsigned int max_connections; + + u32 clientid_counter; + + struct svc_serv *nfsd_serv; +}; + +/* Simple check to find out if a given net was properly initialized */ +#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) + +extern int nfsd_net_id; +#endif /* __NFSD_NETNS_H__ */ diff --git a/kernel/fs/nfsd/nfs2acl.c b/kernel/fs/nfsd/nfs2acl.c new file mode 100644 index 000000000..d54701f6d --- /dev/null +++ b/kernel/fs/nfsd/nfs2acl.c @@ -0,0 +1,382 @@ +/* + * Process version 2 NFSACL requests. + * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ +#include +#include +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +/* + * NULL call. + */ +static __be32 +nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get the Access and/or Default ACL of a file. + */ +static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, + struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) +{ + struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; + __be32 nfserr = 0; + + dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + inode = d_inode(fh->fh_dentry); + + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + RETURN_STATUS(nfserr_inval); + resp->mask = argp->mask; + + nfserr = fh_getattr(fh, &resp->stat); + if (nfserr) + goto fail; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + if (IS_ERR(acl)) { + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + acl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfssvc_release_getacl. */ + RETURN_STATUS(0); + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * Set the Access and/or Default ACL of a file. + */ +static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, + struct nfsd3_setaclargs *argp, + struct nfsd_attrstat *resp) +{ + struct inode *inode; + svc_fh *fh; + __be32 nfserr = 0; + int error; + + dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; + + inode = d_inode(fh->fh_dentry); + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; + } + + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + if (error) + goto out_drop_write; + + fh_drop_write(fh); + + nfserr = fh_getattr(fh, &resp->stat); + +out: + /* argp->acl_{access,default} may have been allocated in + nfssvc_decode_setaclargs. */ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + return nfserr; +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); + goto out; +} + +/* + * Check file attributes + */ +static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp, + struct nfsd_fhandle *argp, struct nfsd_attrstat *resp) +{ + __be32 nfserr; + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) + return nfserr; + nfserr = fh_getattr(&resp->fh, &resp->stat); + return nfserr; +} + +/* + * Check file access + */ +static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp, + struct nfsd3_accessres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: ACCESS(2acl) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + if (nfserr) + return nfserr; + nfserr = fh_getattr(&resp->fh, &resp->stat); + return nfserr; +} + +/* + * XDR decode functions + */ +static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclargs *argp) +{ + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) + return 0; + argp->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_setaclargs *argp) +{ + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) + return 0; + argp->mask = ntohl(*p++); + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL); + return (n > 0); +} + +static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_fhandle *argp) +{ + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) + return 0; + return xdr_argsize_check(rqstp, p); +} + +static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessargs *argp) +{ + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) + return 0; + argp->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ + +/* + * There must be an encoding function for void results so svc_process + * will work properly. + */ +static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +/* GETACL */ +static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + struct dentry *dentry = resp->fh.fh_dentry; + struct inode *inode; + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + int w; + + /* + * Since this is version 2, the check for nfserr in + * nfsd_dispatch actually ensures the following cannot happen. + * However, it seems fragile to depend on that. + */ + if (dentry == NULL || d_really_is_negative(dentry)) + return 0; + inode = d_inode(dentry); + + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); + while (w > 0) { + if (!*(rqstp->rq_next_page++)) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + return 1; +} + +static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); + *p++ = htonl(resp->access); + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + return 1; +} + +static int nfsaclsvc_release_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + fh_put(&resp->fh); + return 1; +} + +static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + fh_put(&resp->fh); + return 1; +} + +#define nfsaclsvc_decode_voidargs NULL +#define nfsaclsvc_release_void NULL +#define nfsd3_fhandleargs nfsd_fhandle +#define nfsd3_attrstatres nfsd_attrstat +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsacld_proc_##name, \ + (kxdrproc_t) nfsaclsvc_decode_##argt##args, \ + (kxdrproc_t) nfsaclsvc_encode_##rest##res, \ + (kxdrproc_t) nfsaclsvc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static struct svc_procedure nfsd_acl_procedures2[] = { + PROC(null, void, void, void, RC_NOCACHE, ST), + PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), + PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), +}; + +struct svc_version nfsd_acl_version2 = { + .vs_vers = 2, + .vs_nproc = 5, + .vs_proc = nfsd_acl_procedures2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, + .vs_hidden = 0, +}; diff --git a/kernel/fs/nfsd/nfs3acl.c b/kernel/fs/nfsd/nfs3acl.c new file mode 100644 index 000000000..882b1a14b --- /dev/null +++ b/kernel/fs/nfsd/nfs3acl.c @@ -0,0 +1,273 @@ +/* + * Process version 3 NFSACL requests. + * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ +#include +#include +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +/* + * NULL call. + */ +static __be32 +nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get the Access and/or Default ACL of a file. + */ +static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, + struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) +{ + struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; + __be32 nfserr = 0; + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + inode = d_inode(fh->fh_dentry); + + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + RETURN_STATUS(nfserr_inval); + resp->mask = argp->mask; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + if (IS_ERR(acl)) { + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + acl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfs3svc_release_getacl. */ + RETURN_STATUS(0); + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * Set the Access and/or Default ACL of a file. + */ +static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, + struct nfsd3_setaclargs *argp, + struct nfsd3_attrstat *resp) +{ + struct inode *inode; + svc_fh *fh; + __be32 nfserr = 0; + int error; + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; + + inode = d_inode(fh->fh_dentry); + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; + } + + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); +out: + /* argp->acl_{access,default} may have been allocated in + nfs3svc_decode_setaclargs. */ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * XDR decode functions + */ +static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclargs *args) +{ + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) + return 0; + args->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_setaclargs *args) +{ + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) + return 0; + args->mask = ntohl(*p++); + if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (args->mask & NFS_ACL) ? + &args->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (args->mask & NFS_DFACL) ? + &args->acl_default : NULL); + return (n > 0); +} + +/* + * XDR encode functions + */ + +/* GETACL */ +static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + struct dentry *dentry = resp->fh.fh_dentry; + + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0 && dentry && d_really_is_positive(dentry)) { + struct inode *inode = d_inode(dentry); + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + int w; + + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); + while (w > 0) { + if (!*(rqstp->rq_next_page++)) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + } else + if (!xdr_ressize_check(rqstp, p)) + return 0; + + return 1; +} + +/* SETACL */ +static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static int nfs3svc_release_getacl(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + return 1; +} + +#define nfs3svc_decode_voidargs NULL +#define nfs3svc_release_void NULL +#define nfsd3_setaclres nfsd3_attrstat +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsd3_proc_##name, \ + (kxdrproc_t) nfs3svc_decode_##argt##args, \ + (kxdrproc_t) nfs3svc_encode_##rest##res, \ + (kxdrproc_t) nfs3svc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static struct svc_procedure nfsd_acl_procedures3[] = { + PROC(null, void, void, void, RC_NOCACHE, ST), + PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), + PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT), +}; + +struct svc_version nfsd_acl_version3 = { + .vs_vers = 3, + .vs_nproc = 3, + .vs_proc = nfsd_acl_procedures3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, + .vs_hidden = 0, +}; + diff --git a/kernel/fs/nfsd/nfs3proc.c b/kernel/fs/nfsd/nfs3proc.c new file mode 100644 index 000000000..7b755b7f7 --- /dev/null +++ b/kernel/fs/nfsd/nfs3proc.c @@ -0,0 +1,891 @@ +/* + * Process version 3 NFS requests. + * + * Copyright (C) 1996, 1997, 1998 Olaf Kirch + */ + +#include +#include +#include + +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +static int nfs3_ftypes[] = { + 0, /* NF3NON */ + S_IFREG, /* NF3REG */ + S_IFDIR, /* NF3DIR */ + S_IFBLK, /* NF3BLK */ + S_IFCHR, /* NF3CHR */ + S_IFLNK, /* NF3LNK */ + S_IFSOCK, /* NF3SOCK */ + S_IFIFO, /* NF3FIFO */ +}; + +/* + * NULL call. + */ +static __be32 +nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get a file's attributes + */ +static __be32 +nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: GETATTR(3) %s\n", + SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (nfserr) + RETURN_STATUS(nfserr); + + nfserr = fh_getattr(&resp->fh, &resp->stat); + + RETURN_STATUS(nfserr); +} + +/* + * Set a file's attributes + */ +static __be32 +nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: SETATTR(3) %s\n", + SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs, + argp->check_guard, argp->guardtime); + RETURN_STATUS(nfserr); +} + +/* + * Look up a path name component + */ +static __be32 +nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LOOKUP(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + + nfserr = nfsd_lookup(rqstp, &resp->dirfh, + argp->name, + argp->len, + &resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Check file access + */ +static __be32 +nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp, + struct nfsd3_accessres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: ACCESS(3) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + RETURN_STATUS(nfserr); +} + +/* + * Read a symlink. + */ +static __be32 +nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp, + struct nfsd3_readlinkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); + + /* Read the symlink. */ + fh_copy(&resp->fh, &argp->fh); + resp->len = NFS3_MAXPATHLEN; + nfserr = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); + RETURN_STATUS(nfserr); +} + +/* + * Read a portion of a file. + */ +static __be32 +nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, + struct nfsd3_readres *resp) +{ + __be32 nfserr; + u32 max_blocksize = svc_max_payload(rqstp); + + dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", + SVCFH_fmt(&argp->fh), + (unsigned long) argp->count, + (unsigned long long) argp->offset); + + /* Obtain buffer pointer for payload. + * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) + * + 1 (xdr opaque byte count) = 26 + */ + resp->count = min(argp->count, max_blocksize); + svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_read(rqstp, &resp->fh, + argp->offset, + rqstp->rq_vec, argp->vlen, + &resp->count); + if (nfserr == 0) { + struct inode *inode = d_inode(resp->fh.fh_dentry); + + resp->eof = (argp->offset + resp->count) >= inode->i_size; + } + + RETURN_STATUS(nfserr); +} + +/* + * Write data to a file + */ +static __be32 +nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, + struct nfsd3_writeres *resp) +{ + __be32 nfserr; + unsigned long cnt = argp->len; + + dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", + SVCFH_fmt(&argp->fh), + argp->len, + (unsigned long long) argp->offset, + argp->stable? " stable" : ""); + + fh_copy(&resp->fh, &argp->fh); + resp->committed = argp->stable; + nfserr = nfsd_write(rqstp, &resp->fh, NULL, + argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, + &resp->committed); + resp->count = cnt; + RETURN_STATUS(nfserr); +} + +/* + * With NFSv3, CREATE processing is a lot easier than with NFSv2. + * At least in theory; we'll see how it fares in practice when the + * first reports about SunOS compatibility problems start to pour in... + */ +static __be32 +nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, + struct nfsd3_diropres *resp) +{ + svc_fh *dirfhp, *newfhp = NULL; + struct iattr *attr; + __be32 nfserr; + + dprintk("nfsd: CREATE(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + dirfhp = fh_copy(&resp->dirfh, &argp->fh); + newfhp = fh_init(&resp->fh, NFS3_FHSIZE); + attr = &argp->attrs; + + /* Unfudge the mode bits */ + attr->ia_mode &= ~S_IFMT; + if (!(attr->ia_valid & ATTR_MODE)) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode = S_IFREG; + } else { + attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG; + } + + /* Now create the file and set attributes */ + nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, + attr, newfhp, + argp->createmode, (u32 *)argp->verf, NULL, NULL); + + RETURN_STATUS(nfserr); +} + +/* + * Make directory. This operation is not idempotent. + */ +static __be32 +nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: MKDIR(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + argp->attrs.ia_valid &= ~ATTR_SIZE; + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); + fh_unlock(&resp->dirfh); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", + SVCFH_fmt(&argp->ffh), + argp->flen, argp->fname, + argp->tlen, argp->tname); + + fh_copy(&resp->dirfh, &argp->ffh); + fh_init(&resp->fh, NFS3_FHSIZE); + nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, + argp->tname, &resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Make socket/fifo/device. + */ +static __be32 +nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + int type; + dev_t rdev = 0; + + dprintk("nfsd: MKNOD(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + + if (argp->ftype == 0 || argp->ftype >= NF3BAD) + RETURN_STATUS(nfserr_inval); + if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { + rdev = MKDEV(argp->major, argp->minor); + if (MAJOR(rdev) != argp->major || + MINOR(rdev) != argp->minor) + RETURN_STATUS(nfserr_inval); + } else + if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) + RETURN_STATUS(nfserr_inval); + + type = nfs3_ftypes[argp->ftype]; + nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, type, rdev, &resp->fh); + fh_unlock(&resp->dirfh); + RETURN_STATUS(nfserr); +} + +/* + * Remove file/fifo/socket etc. + */ +static __be32 +nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: REMOVE(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + /* Unlink. -S_IFDIR means file must not be a directory */ + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Remove a directory + */ +static __be32 +nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RMDIR(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp, + struct nfsd3_renameres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RENAME(3) %s %.*s ->\n", + SVCFH_fmt(&argp->ffh), + argp->flen, + argp->fname); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + fh_copy(&resp->ffh, &argp->ffh); + fh_copy(&resp->tfh, &argp->tfh); + nfserr = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, + &resp->tfh, argp->tname, argp->tlen); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp, + struct nfsd3_linkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LINK(3) %s ->\n", + SVCFH_fmt(&argp->ffh)); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + fh_copy(&resp->fh, &argp->ffh); + fh_copy(&resp->tfh, &argp->tfh); + nfserr = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, + &resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Read a portion of a directory. + */ +static __be32 +nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, + struct nfsd3_readdirres *resp) +{ + __be32 nfserr; + int count; + + dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, (u32) argp->cookie); + + /* Make sure we've room for the NULL ptr & eof flag, and shrink to + * client read size */ + count = (argp->count >> 2) - 2; + + /* Read directory and encode entries on the fly */ + fh_copy(&resp->fh, &argp->fh); + + resp->buflen = count; + resp->common.err = nfs_ok; + resp->buffer = argp->buffer; + resp->rqstp = rqstp; + nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, + &resp->common, nfs3svc_encode_entry); + memcpy(resp->verf, argp->verf, 8); + resp->count = resp->buffer - argp->buffer; + if (resp->offset) + xdr_encode_hyper(resp->offset, argp->cookie); + + RETURN_STATUS(nfserr); +} + +/* + * Read a portion of a directory, including file handles and attrs. + * For now, we choose to ignore the dircount parameter. + */ +static __be32 +nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, + struct nfsd3_readdirres *resp) +{ + __be32 nfserr; + int count = 0; + loff_t offset; + struct page **p; + caddr_t page_addr = NULL; + + dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, (u32) argp->cookie); + + /* Convert byte count to number of words (i.e. >> 2), + * and reserve room for the NULL ptr & eof flag (-2 words) */ + resp->count = (argp->count >> 2) - 2; + + /* Read directory and encode entries on the fly */ + fh_copy(&resp->fh, &argp->fh); + + resp->common.err = nfs_ok; + resp->buffer = argp->buffer; + resp->buflen = resp->count; + resp->rqstp = rqstp; + offset = argp->cookie; + + nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) + RETURN_STATUS(nfserr_notsupp); + + nfserr = nfsd_readdir(rqstp, &resp->fh, + &offset, + &resp->common, + nfs3svc_encode_entry_plus); + memcpy(resp->verf, argp->verf, 8); + for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { + page_addr = page_address(*p); + + if (((caddr_t)resp->buffer >= page_addr) && + ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { + count += (caddr_t)resp->buffer - page_addr; + break; + } + count += PAGE_SIZE; + } + resp->count = count >> 2; + if (resp->offset) { + if (unlikely(resp->offset1)) { + /* we ended up with offset on a page boundary */ + *resp->offset = htonl(offset >> 32); + *resp->offset1 = htonl(offset & 0xffffffff); + resp->offset1 = NULL; + } else { + xdr_encode_hyper(resp->offset, offset); + } + } + + RETURN_STATUS(nfserr); +} + +/* + * Get file system stats + */ +static __be32 +nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, + struct nfsd3_fsstatres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: FSSTAT(3) %s\n", + SVCFH_fmt(&argp->fh)); + + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); + fh_put(&argp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Get file system info + */ +static __be32 +nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, + struct nfsd3_fsinfores *resp) +{ + __be32 nfserr; + u32 max_blocksize = svc_max_payload(rqstp); + + dprintk("nfsd: FSINFO(3) %s\n", + SVCFH_fmt(&argp->fh)); + + resp->f_rtmax = max_blocksize; + resp->f_rtpref = max_blocksize; + resp->f_rtmult = PAGE_SIZE; + resp->f_wtmax = max_blocksize; + resp->f_wtpref = max_blocksize; + resp->f_wtmult = PAGE_SIZE; + resp->f_dtpref = PAGE_SIZE; + resp->f_maxfilesize = ~(u32) 0; + resp->f_properties = NFS3_FSF_DEFAULT; + + nfserr = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + + /* Check special features of the file system. May request + * different read/write sizes for file systems known to have + * problems with large blocks */ + if (nfserr == 0) { + struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; + + /* Note that we don't care for remote fs's here */ + if (sb->s_magic == MSDOS_SUPER_MAGIC) { + resp->f_properties = NFS3_FSF_BILLYBOY; + } + resp->f_maxfilesize = sb->s_maxbytes; + } + + fh_put(&argp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Get pathconf info for the specified file + */ +static __be32 +nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, + struct nfsd3_pathconfres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: PATHCONF(3) %s\n", + SVCFH_fmt(&argp->fh)); + + /* Set default pathconf */ + resp->p_link_max = 255; /* at least */ + resp->p_name_max = 255; /* at least */ + resp->p_no_trunc = 0; + resp->p_chown_restricted = 1; + resp->p_case_insensitive = 0; + resp->p_case_preserving = 1; + + nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + + if (nfserr == 0) { + struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; + + /* Note that we don't care for remote fs's here */ + switch (sb->s_magic) { + case EXT2_SUPER_MAGIC: + resp->p_link_max = EXT2_LINK_MAX; + resp->p_name_max = EXT2_NAME_LEN; + break; + case MSDOS_SUPER_MAGIC: + resp->p_case_insensitive = 1; + resp->p_case_preserving = 0; + break; + } + } + + fh_put(&argp->fh); + RETURN_STATUS(nfserr); +} + + +/* + * Commit a file (range) to stable storage. + */ +static __be32 +nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp, + struct nfsd3_commitres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", + SVCFH_fmt(&argp->fh), + argp->count, + (unsigned long long) argp->offset); + + if (argp->offset > NFS_OFFSET_MAX) + RETURN_STATUS(nfserr_inval); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count); + + RETURN_STATUS(nfserr); +} + + +/* + * NFSv3 Server procedures. + * Only the results of non-idempotent operations are cached. + */ +#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle +#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat +#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat +#define nfsd3_mkdirargs nfsd3_createargs +#define nfsd3_readdirplusargs nfsd3_readdirargs +#define nfsd3_fhandleargs nfsd_fhandle +#define nfsd3_fhandleres nfsd3_attrstat +#define nfsd3_attrstatres nfsd3_attrstat +#define nfsd3_wccstatres nfsd3_attrstat +#define nfsd3_createres nfsd3_diropres +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsd3_proc_##name, \ + (kxdrproc_t) nfs3svc_decode_##argt##args, \ + (kxdrproc_t) nfs3svc_encode_##rest##res, \ + (kxdrproc_t) nfs3svc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define FH 17 /* filehandle with length */ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define WC (7+pAT) /* WCC attributes */ + +static struct svc_procedure nfsd_procedures3[22] = { + [NFS3PROC_NULL] = { + .pc_func = (svc_procfunc) nfsd3_proc_null, + .pc_encode = (kxdrproc_t) nfs3svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFS3PROC_GETATTR] = { + .pc_func = (svc_procfunc) nfsd3_proc_getattr, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_attrstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_attrstatres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [NFS3PROC_SETATTR] = { + .pc_func = (svc_procfunc) nfsd3_proc_setattr, + .pc_decode = (kxdrproc_t) nfs3svc_decode_sattrargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_sattrargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_LOOKUP] = { + .pc_func = (svc_procfunc) nfsd3_proc_lookup, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_diropres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_diropres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+FH+pAT+pAT, + }, + [NFS3PROC_ACCESS] = { + .pc_func = (svc_procfunc) nfsd3_proc_access, + .pc_decode = (kxdrproc_t) nfs3svc_decode_accessargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_accessres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+1, + }, + [NFS3PROC_READLINK] = { + .pc_func = (svc_procfunc) nfsd3_proc_readlink, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readlinkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readlinkres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readlinkargs), + .pc_ressize = sizeof(struct nfsd3_readlinkres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, + }, + [NFS3PROC_READ] = { + .pc_func = (svc_procfunc) nfsd3_proc_read, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readargs), + .pc_ressize = sizeof(struct nfsd3_readres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, + }, + [NFS3PROC_WRITE] = { + .pc_func = (svc_procfunc) nfsd3_proc_write, + .pc_decode = (kxdrproc_t) nfs3svc_decode_writeargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_writeres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_writeargs), + .pc_ressize = sizeof(struct nfsd3_writeres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC+4, + }, + [NFS3PROC_CREATE] = { + .pc_func = (svc_procfunc) nfsd3_proc_create, + .pc_decode = (kxdrproc_t) nfs3svc_decode_createargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_createargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_MKDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_mkdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_mkdirargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_mkdirargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_SYMLINK] = { + .pc_func = (svc_procfunc) nfsd3_proc_symlink, + .pc_decode = (kxdrproc_t) nfs3svc_decode_symlinkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_symlinkargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_MKNOD] = { + .pc_func = (svc_procfunc) nfsd3_proc_mknod, + .pc_decode = (kxdrproc_t) nfs3svc_decode_mknodargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_mknodargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_REMOVE] = { + .pc_func = (svc_procfunc) nfsd3_proc_remove, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_RMDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_rmdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_RENAME] = { + .pc_func = (svc_procfunc) nfsd3_proc_rename, + .pc_decode = (kxdrproc_t) nfs3svc_decode_renameargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_renameres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_renameargs), + .pc_ressize = sizeof(struct nfsd3_renameres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC+WC, + }, + [NFS3PROC_LINK] = { + .pc_func = (svc_procfunc) nfsd3_proc_link, + .pc_decode = (kxdrproc_t) nfs3svc_decode_linkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_linkres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_linkargs), + .pc_ressize = sizeof(struct nfsd3_linkres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+pAT+WC, + }, + [NFS3PROC_READDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_readdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readdirargs), + .pc_ressize = sizeof(struct nfsd3_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFS3PROC_READDIRPLUS] = { + .pc_func = (svc_procfunc) nfsd3_proc_readdirplus, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirplusargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readdirplusargs), + .pc_ressize = sizeof(struct nfsd3_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFS3PROC_FSSTAT] = { + .pc_func = (svc_procfunc) nfsd3_proc_fsstat, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_fsstatres, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_fsstatres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+2*6+1, + }, + [NFS3PROC_FSINFO] = { + .pc_func = (svc_procfunc) nfsd3_proc_fsinfo, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_fsinfores, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_fsinfores), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+12, + }, + [NFS3PROC_PATHCONF] = { + .pc_func = (svc_procfunc) nfsd3_proc_pathconf, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_pathconfres, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_pathconfres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+6, + }, + [NFS3PROC_COMMIT] = { + .pc_func = (svc_procfunc) nfsd3_proc_commit, + .pc_decode = (kxdrproc_t) nfs3svc_decode_commitargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_commitres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_commitargs), + .pc_ressize = sizeof(struct nfsd3_commitres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+WC+2, + }, +}; + +struct svc_version nfsd_version3 = { + .vs_vers = 3, + .vs_nproc = 22, + .vs_proc = nfsd_procedures3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, +}; diff --git a/kernel/fs/nfsd/nfs3xdr.c b/kernel/fs/nfsd/nfs3xdr.c new file mode 100644 index 000000000..e4b2b4322 --- /dev/null +++ b/kernel/fs/nfsd/nfs3xdr.c @@ -0,0 +1,1114 @@ +/* + * XDR support for nfsd/protocol version 3. + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + * + * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()! + */ + +#include +#include +#include "xdr3.h" +#include "auth.h" +#include "netns.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_XDR + + +/* + * Mapping of S_IF* types to NFS file types + */ +static u32 nfs3_ftypes[] = { + NF3NON, NF3FIFO, NF3CHR, NF3BAD, + NF3DIR, NF3BAD, NF3BLK, NF3BAD, + NF3REG, NF3BAD, NF3LNK, NF3BAD, + NF3SOCK, NF3BAD, NF3LNK, NF3BAD, +}; + +/* + * XDR functions for basic NFS types + */ +static __be32 * +encode_time3(__be32 *p, struct timespec *time) +{ + *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec); + return p; +} + +static __be32 * +decode_time3(__be32 *p, struct timespec *time) +{ + time->tv_sec = ntohl(*p++); + time->tv_nsec = ntohl(*p++); + return p; +} + +static __be32 * +decode_fh(__be32 *p, struct svc_fh *fhp) +{ + unsigned int size; + fh_init(fhp, NFS3_FHSIZE); + size = ntohl(*p++); + if (size > NFS3_FHSIZE) + return NULL; + + memcpy(&fhp->fh_handle.fh_base, p, size); + fhp->fh_handle.fh_size = size; + return p + XDR_QUADLEN(size); +} + +/* Helper function for NFSv3 ACL code */ +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + +static __be32 * +encode_fh(__be32 *p, struct svc_fh *fhp) +{ + unsigned int size = fhp->fh_handle.fh_size; + *p++ = htonl(size); + if (size) p[XDR_QUADLEN(size)-1]=0; + memcpy(p, &fhp->fh_handle.fh_base, size); + return p + XDR_QUADLEN(size); +} + +/* + * Decode a file name and make sure that the path contains + * no slashes or null bytes. + */ +static __be32 * +decode_filename(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0' || *name == '/') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_sattr3(__be32 *p, struct iattr *iap) +{ + u32 tmp; + + iap->ia_valid = 0; + + if (*p++) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = ntohl(*p++); + } + if (*p++) { + iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++)); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; + } + if (*p++) { + iap->ia_gid = make_kgid(&init_user_ns, ntohl(*p++)); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; + } + if (*p++) { + u64 newsize; + + iap->ia_valid |= ATTR_SIZE; + p = xdr_decode_hyper(p, &newsize); + iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); + } + if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + iap->ia_valid |= ATTR_ATIME; + } else if (tmp == 2) { /* set to client time */ + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = ntohl(*p++); + iap->ia_atime.tv_nsec = ntohl(*p++); + } + if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + iap->ia_valid |= ATTR_MTIME; + } else if (tmp == 2) { /* set to client time */ + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = ntohl(*p++); + iap->ia_mtime.tv_nsec = ntohl(*p++); + } + return p; +} + +static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) +{ + u64 f; + switch(fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + p = xdr_encode_hyper(p, (u64)huge_encode_dev + (d_inode(fhp->fh_dentry)->i_sb->s_dev)); + break; + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u64*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u64*)fhp->fh_export->ex_uuid)[1]; + p = xdr_encode_hyper(p, f); + break; + } + return p; +} + +static __be32 * +encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, + struct kstat *stat) +{ + *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = htonl((u32) (stat->mode & S_IALLUGO)); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { + p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); + } else { + p = xdr_encode_hyper(p, (u64) stat->size); + } + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + *p++ = htonl((u32) MAJOR(stat->rdev)); + *p++ = htonl((u32) MINOR(stat->rdev)); + p = encode_fsid(p, fhp); + p = xdr_encode_hyper(p, stat->ino); + p = encode_time3(p, &stat->atime); + p = encode_time3(p, &stat->mtime); + p = encode_time3(p, &stat->ctime); + + return p; +} + +static __be32 * +encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + /* Attributes to follow */ + *p++ = xdr_one; + return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); +} + +/* + * Encode post-operation attributes. + * The inode may be NULL if the call failed because of a stale file + * handle. In this case, no attributes are returned. + */ +static __be32 * +encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct dentry *dentry = fhp->fh_dentry; + if (dentry && d_really_is_positive(dentry)) { + __be32 err; + struct kstat stat; + + err = fh_getattr(fhp, &stat); + if (!err) { + *p++ = xdr_one; /* attributes follow */ + lease_get_mtime(d_inode(dentry), &stat.mtime); + return encode_fattr3(rqstp, p, fhp, &stat); + } + } + *p++ = xdr_zero; + return p; +} + +/* Helper for NFSv3 ACLs */ +__be32 * +nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + return encode_post_op_attr(rqstp, p, fhp); +} + +/* + * Enocde weak cache consistency data + */ +static __be32 * +encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct dentry *dentry = fhp->fh_dentry; + + if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) { + if (fhp->fh_pre_saved) { + *p++ = xdr_one; + p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); + p = encode_time3(p, &fhp->fh_pre_mtime); + p = encode_time3(p, &fhp->fh_pre_ctime); + } else { + *p++ = xdr_zero; + } + return encode_saved_post_attr(rqstp, p, fhp); + } + /* no pre- or post-attrs */ + *p++ = xdr_zero; + return encode_post_op_attr(rqstp, p, fhp); +} + +/* + * Fill in the post_op attr for the wcc data + */ +void fill_post_wcc(struct svc_fh *fhp) +{ + __be32 err; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = fh_getattr(fhp, &fhp->fh_post_attr); + fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version; + if (err) { + fhp->fh_post_saved = 0; + /* Grab the ctime anyway - set_change_info might use it */ + fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; + } else + fhp->fh_post_saved = 1; +} + +/* + * XDR decode functions + */ +int +nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_sattrargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = decode_sattr3(p, &args->attrs); + + if ((args->check_guard = ntohl(*p++)) != 0) { + struct timespec time; + p = decode_time3(p, &time); + args->guardtime = time.tv_sec; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + args->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readargs *args) +{ + unsigned int len; + int v; + u32 max_blocksize = svc_max_payload(rqstp); + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->offset); + + args->count = ntohl(*p++); + len = min(args->count, max_blocksize); + + /* set up the kvec */ + v=0; + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(p); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + args->vlen = v; + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_writeargs *args) +{ + unsigned int len, v, hdr, dlen; + u32 max_blocksize = svc_max_payload(rqstp); + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->offset); + + args->count = ntohl(*p++); + args->stable = ntohl(*p++); + len = args->len = ntohl(*p++); + /* + * The count must equal the amount of data passed. + */ + if (args->count != args->len) + return 0; + + /* + * Check to make sure that we got the right number of + * bytes. + */ + hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; + dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + - hdr; + /* + * Round the length of the data which was specified up to + * the next multiple of XDR units and then compare that + * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. + */ + if (dlen < XDR_QUADLEN(len)*4) + return 0; + + if (args->count > max_blocksize) { + args->count = max_blocksize; + len = args->len = max_blocksize; + } + rqstp->rq_vec[0].iov_base = (void*)p; + rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + v = 0; + while (len > rqstp->rq_vec[v].iov_len) { + len -= rqstp->rq_vec[v].iov_len; + v++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); + rqstp->rq_vec[v].iov_len = PAGE_SIZE; + } + rqstp->rq_vec[v].iov_len = len; + args->vlen = v + 1; + return 1; +} + +int +nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_createargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + switch (args->createmode = ntohl(*p++)) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + p = decode_sattr3(p, &args->attrs); + break; + case NFS3_CREATE_EXCLUSIVE: + args->verf = p; + p += 2; + break; + default: + return 0; + } + + return xdr_argsize_check(rqstp, p); +} +int +nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_createargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) || + !(p = decode_filename(p, &args->name, &args->len))) + return 0; + p = decode_sattr3(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_symlinkargs *args) +{ + unsigned int len, avail; + char *old, *new; + struct kvec *vec; + + if (!(p = decode_fh(p, &args->ffh)) || + !(p = decode_filename(p, &args->fname, &args->flen)) + ) + return 0; + p = decode_sattr3(p, &args->attrs); + + /* now decode the pathname, which might be larger than the first page. + * As we have to check for nul's anyway, we copy it into a new page + * This page appears in the rq_res.pages list, but as pages_len is always + * 0, it won't get in the way + */ + len = ntohl(*p++); + if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) + return 0; + args->tname = new = page_address(*(rqstp->rq_next_page++)); + args->tlen = len; + /* first copy and check from the first page */ + old = (char*)p; + vec = &rqstp->rq_arg.head[0]; + avail = vec->iov_len - (old - (char*)vec->iov_base); + while (len && avail && *old) { + *new++ = *old++; + len--; + avail--; + } + /* now copy next page if there is one */ + if (len && !avail && rqstp->rq_arg.page_len) { + avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE); + old = page_address(rqstp->rq_arg.pages[0]); + } + while (len && avail && *old) { + *new++ = *old++; + len--; + avail--; + } + *new = '\0'; + if (len) + return 0; + + return 1; +} + +int +nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_mknodargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + args->ftype = ntohl(*p++); + + if (args->ftype == NF3BLK || args->ftype == NF3CHR + || args->ftype == NF3SOCK || args->ftype == NF3FIFO) + p = decode_sattr3(p, &args->attrs); + + if (args->ftype == NF3BLK || args->ftype == NF3CHR) { + args->major = ntohl(*p++); + args->minor = ntohl(*p++); + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_renameargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readlinkargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + args->buffer = page_address(*(rqstp->rq_next_page++)); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_linkargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->cookie); + args->verf = p; p += 2; + args->dircount = ~0; + args->count = ntohl(*p++); + args->count = min_t(u32, args->count, PAGE_SIZE); + args->buffer = page_address(*(rqstp->rq_next_page++)); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirargs *args) +{ + int len; + u32 max_blocksize = svc_max_payload(rqstp); + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->cookie); + args->verf = p; p += 2; + args->dircount = ntohl(*p++); + args->count = ntohl(*p++); + + len = args->count = min(args->count, max_blocksize); + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + if (!args->buffer) + args->buffer = page_address(p); + len -= PAGE_SIZE; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_commitargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->offset); + args->count = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ +/* + * There must be an encoding function for void results so svc_process + * will work properly. + */ +int +nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +/* GETATTR */ +int +nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + if (resp->status == 0) { + lease_get_mtime(d_inode(resp->fh.fh_dentry), + &resp->stat.mtime); + p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + } + return xdr_ressize_check(rqstp, p); +} + +/* SETATTR, REMOVE, RMDIR */ +int +nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->fh); + return xdr_ressize_check(rqstp, p); +} + +/* LOOKUP */ +int +nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropres *resp) +{ + if (resp->status == 0) { + p = encode_fh(p, &resp->fh); + p = encode_post_op_attr(rqstp, p, &resp->fh); + } + p = encode_post_op_attr(rqstp, p, &resp->dirfh); + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +int +nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) + *p++ = htonl(resp->access); + return xdr_ressize_check(rqstp, p); +} + +/* READLINK */ +int +nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readlinkres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->len); + xdr_ressize_check(rqstp, p); + rqstp->rq_res.page_len = resp->len; + if (resp->len & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + } + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +/* READ */ +int +nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->count); + *p++ = htonl(resp->eof); + *p++ = htonl(resp->count); /* xdr opaque count */ + xdr_ressize_check(rqstp, p); + /* now update rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); + } + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +/* WRITE */ +int +nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_writeres *resp) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + p = encode_wcc_data(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->count); + *p++ = htonl(resp->committed); + *p++ = htonl(nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_usec); + } + return xdr_ressize_check(rqstp, p); +} + +/* CREATE, MKDIR, SYMLINK, MKNOD */ +int +nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropres *resp) +{ + if (resp->status == 0) { + *p++ = xdr_one; + p = encode_fh(p, &resp->fh); + p = encode_post_op_attr(rqstp, p, &resp->fh); + } + p = encode_wcc_data(rqstp, p, &resp->dirfh); + return xdr_ressize_check(rqstp, p); +} + +/* RENAME */ +int +nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_renameres *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->ffh); + p = encode_wcc_data(rqstp, p, &resp->tfh); + return xdr_ressize_check(rqstp, p); +} + +/* LINK */ +int +nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_linkres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + p = encode_wcc_data(rqstp, p, &resp->tfh); + return xdr_ressize_check(rqstp, p); +} + +/* READDIR */ +int +nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + + if (resp->status == 0) { + /* stupid readdir cookie */ + memcpy(p, resp->verf, 8); p += 2; + xdr_ressize_check(rqstp, p); + if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE) + return 1; /*No room for trailer */ + rqstp->rq_res.page_len = (resp->count) << 2; + + /* add the 'tail' to the end of the 'head' page - page 0. */ + rqstp->rq_res.tail[0].iov_base = p; + *p++ = 0; /* no more entries */ + *p++ = htonl(resp->common.err == nfserr_eof); + rqstp->rq_res.tail[0].iov_len = 2<<2; + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +static __be32 * +encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, + int namlen, u64 ino) +{ + *p++ = xdr_one; /* mark entry present */ + p = xdr_encode_hyper(p, ino); /* file id */ + p = xdr_encode_array(p, name, namlen);/* name length & name */ + + cd->offset = p; /* remember pointer */ + p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */ + + return p; +} + +static __be32 +compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, + const char *name, int namlen) +{ + struct svc_export *exp; + struct dentry *dparent, *dchild; + __be32 rv = nfserr_noent; + + dparent = cd->fh.fh_dentry; + exp = cd->fh.fh_export; + + if (isdotent(name, namlen)) { + if (namlen == 2) { + dchild = dget_parent(dparent); + /* filesystem root - cannot return filehandle for ".." */ + if (dchild == dparent) + goto out; + } else + dchild = dget(dparent); + } else + dchild = lookup_one_len(name, dparent, namlen); + if (IS_ERR(dchild)) + return rv; + if (d_mountpoint(dchild)) + goto out; + if (d_really_is_negative(dchild)) + goto out; + rv = fh_compose(fhp, exp, dchild, &cd->fh); +out: + dput(dchild); + return rv; +} + +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +{ + struct svc_fh *fh = &cd->scratch; + __be32 err; + + fh_init(fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, fh, name, namlen); + if (err) { + *p++ = 0; + *p++ = 0; + goto out; + } + p = encode_post_op_attr(cd->rqstp, p, fh); + *p++ = xdr_one; /* yes, a file handle follows */ + p = encode_fh(p, fh); +out: + fh_put(fh); + return p; +} + +/* + * Encode a directory entry. This one works for both normal readdir + * and readdirplus. + * The normal readdir reply requires 2 (fileid) + 1 (stringlen) + * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen. + * + * The readdirplus baggage is 1+21 words for post_op_attr, plus the + * file handle. + */ + +#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1) +#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) +static int +encode_entry(struct readdir_cd *ccd, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type, int plus) +{ + struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, + common); + __be32 *p = cd->buffer; + caddr_t curr_page_addr = NULL; + struct page ** page; + int slen; /* string (name) length */ + int elen; /* estimated entry length in words */ + int num_entry_words = 0; /* actual number of words */ + + if (cd->offset) { + u64 offset64 = offset; + + if (unlikely(cd->offset1)) { + /* we ended up with offset on a page boundary */ + *cd->offset = htonl(offset64 >> 32); + *cd->offset1 = htonl(offset64 & 0xffffffff); + cd->offset1 = NULL; + } else { + xdr_encode_hyper(cd->offset, offset64); + } + } + + /* + dprintk("encode_entry(%.*s @%ld%s)\n", + namlen, name, (long) offset, plus? " plus" : ""); + */ + + /* truncate filename if too long */ + namlen = min(namlen, NFS3_MAXNAMLEN); + + slen = XDR_QUADLEN(namlen); + elen = slen + NFS3_ENTRY_BAGGAGE + + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0); + + if (cd->buflen < elen) { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + + /* determine which page in rq_respages[] we are currently filling */ + for (page = cd->rqstp->rq_respages + 1; + page < cd->rqstp->rq_next_page; page++) { + curr_page_addr = page_address(*page); + + if (((caddr_t)cd->buffer >= curr_page_addr) && + ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) + break; + } + + if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) { + /* encode entry in current page */ + + p = encode_entry_baggage(cd, p, name, namlen, ino); + + if (plus) + p = encode_entryplus_baggage(cd, p, name, namlen); + num_entry_words = p - cd->buffer; + } else if (*(page+1) != NULL) { + /* temporarily encode entry into next page, then move back to + * current and next page in rq_respages[] */ + __be32 *p1, *tmp; + int len1, len2; + + /* grab next page for temporary storage of entry */ + p1 = tmp = page_address(*(page+1)); + + p1 = encode_entry_baggage(cd, p1, name, namlen, ino); + + if (plus) + p1 = encode_entryplus_baggage(cd, p1, name, namlen); + + /* determine entry word length and lengths to go in pages */ + num_entry_words = p1 - tmp; + len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer; + if ((num_entry_words << 2) < len1) { + /* the actual number of words in the entry is less + * than elen and can still fit in the current page + */ + memmove(p, tmp, num_entry_words << 2); + p += num_entry_words; + + /* update offset */ + cd->offset = cd->buffer + (cd->offset - tmp); + } else { + unsigned int offset_r = (cd->offset - tmp) << 2; + + /* update pointer to offset location. + * This is a 64bit quantity, so we need to + * deal with 3 cases: + * - entirely in first page + * - entirely in second page + * - 4 bytes in each page + */ + if (offset_r + 8 <= len1) { + cd->offset = p + (cd->offset - tmp); + } else if (offset_r >= len1) { + cd->offset -= len1 >> 2; + } else { + /* sitting on the fence */ + BUG_ON(offset_r != len1 - 4); + cd->offset = p + (cd->offset - tmp); + cd->offset1 = tmp; + } + + len2 = (num_entry_words << 2) - len1; + + /* move from temp page to current and next pages */ + memmove(p, tmp, len1); + memmove(tmp, (caddr_t)tmp+len1, len2); + + p = tmp + (len2 >> 2); + } + } + else { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + + cd->buflen -= num_entry_words; + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; + +} + +int +nfs3svc_encode_entry(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) +{ + return encode_entry(cd, name, namlen, offset, ino, d_type, 0); +} + +int +nfs3svc_encode_entry_plus(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + return encode_entry(cd, name, namlen, offset, ino, d_type, 1); +} + +/* FSSTAT */ +int +nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fsstatres *resp) +{ + struct kstatfs *s = &resp->stats; + u64 bs = s->f_bsize; + + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ + p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ + p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ + p = xdr_encode_hyper(p, s->f_files); /* total inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ + *p++ = htonl(resp->invarsec); /* mean unchanged time */ + } + return xdr_ressize_check(rqstp, p); +} + +/* FSINFO */ +int +nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fsinfores *resp) +{ + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + *p++ = htonl(resp->f_rtmax); + *p++ = htonl(resp->f_rtpref); + *p++ = htonl(resp->f_rtmult); + *p++ = htonl(resp->f_wtmax); + *p++ = htonl(resp->f_wtpref); + *p++ = htonl(resp->f_wtmult); + *p++ = htonl(resp->f_dtpref); + p = xdr_encode_hyper(p, resp->f_maxfilesize); + *p++ = xdr_one; + *p++ = xdr_zero; + *p++ = htonl(resp->f_properties); + } + + return xdr_ressize_check(rqstp, p); +} + +/* PATHCONF */ +int +nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_pathconfres *resp) +{ + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + *p++ = htonl(resp->p_link_max); + *p++ = htonl(resp->p_name_max); + *p++ = htonl(resp->p_no_trunc); + *p++ = htonl(resp->p_chown_restricted); + *p++ = htonl(resp->p_case_insensitive); + *p++ = htonl(resp->p_case_preserving); + } + + return xdr_ressize_check(rqstp, p); +} + +/* COMMIT */ +int +nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_commitres *resp) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + p = encode_wcc_data(rqstp, p, &resp->fh); + /* Write verifier */ + if (resp->status == 0) { + *p++ = htonl(nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_usec); + } + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +int +nfs3svc_release_fhandle(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + fh_put(&resp->fh); + return 1; +} + +int +nfs3svc_release_fhandle2(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fhandle_pair *resp) +{ + fh_put(&resp->fh1); + fh_put(&resp->fh2); + return 1; +} diff --git a/kernel/fs/nfsd/nfs4acl.c b/kernel/fs/nfsd/nfs4acl.c new file mode 100644 index 000000000..67242bf7c --- /dev/null +++ b/kernel/fs/nfsd/nfs4acl.c @@ -0,0 +1,896 @@ +/* + * Common NFSv4 ACL handling code. + * + * Copyright (c) 2002, 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * Jeff Sedlak + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include "nfsfh.h" +#include "nfsd.h" +#include "acl.h" +#include "vfs.h" + +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 + +/* mode bit translations: */ +#define NFS4_READ_MODE (NFS4_ACE_READ_DATA) +#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_APPEND_DATA) +#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE +#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE) +#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL) + +/* We don't support these bits; insist they be neither allowed nor denied */ +#define NFS4_MASK_UNSUPP (NFS4_ACE_DELETE | NFS4_ACE_WRITE_OWNER \ + | NFS4_ACE_READ_NAMED_ATTRS | NFS4_ACE_WRITE_NAMED_ATTRS) + +/* flags used to simulate posix default ACLs */ +#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ + | NFS4_ACE_DIRECTORY_INHERIT_ACE) + +#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \ + | NFS4_ACE_INHERIT_ONLY_ACE \ + | NFS4_ACE_IDENTIFIER_GROUP) + +#define MASK_EQUAL(mask1, mask2) \ + ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) + +static u32 +mask_from_posix(unsigned short perm, unsigned int flags) +{ + int mask = NFS4_ANYONE_MODE; + + if (flags & NFS4_ACL_OWNER) + mask |= NFS4_OWNER_MODE; + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; +} + +static u32 +deny_mask_from_posix(unsigned short perm, u32 flags) +{ + u32 mask = 0; + + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; +} + +/* XXX: modify functions to return NFS errors; they're only ever + * used by nfs code, after all.... */ + +/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the + * side of being more restrictive, so the mode bit mapping below is + * pessimistic. An optimistic version would be needed to handle DENY's, + * but we espect to coalesce all ALLOWs and DENYs before mapping to mode + * bits. */ + +static void +low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) +{ + u32 write_mode = NFS4_WRITE_MODE; + + if (flags & NFS4_ACL_DIR) + write_mode |= NFS4_ACE_DELETE_CHILD; + *mode = 0; + if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE) + *mode |= ACL_READ; + if ((perm & write_mode) == write_mode) + *mode |= ACL_WRITE; + if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE) + *mode |= ACL_EXECUTE; +} + +struct ace_container { + struct nfs4_ace *ace; + struct list_head ace_l; +}; + +static short ace2type(struct nfs4_ace *); +static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, + unsigned int); + +int +nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl) +{ + struct inode *inode = d_inode(dentry); + int error = 0; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + int size = 0; + + pacl = get_acl(inode, ACL_TYPE_ACCESS); + if (!pacl) + pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + + if (IS_ERR(pacl)) + return PTR_ERR(pacl); + + /* allocate for worst case: one (deny, allow) pair each: */ + size += 2 * pacl->a_count; + + if (S_ISDIR(inode->i_mode)) { + flags = NFS4_ACL_DIR; + dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(dpacl)) { + error = PTR_ERR(dpacl); + goto rel_pacl; + } + + if (dpacl) + size += 2 * dpacl->a_count; + } + + *acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL); + if (*acl == NULL) { + error = -ENOMEM; + goto out; + } + (*acl)->naces = 0; + + _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + + if (dpacl) + _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); + +out: + posix_acl_release(dpacl); +rel_pacl: + posix_acl_release(pacl); + return error; +} + +struct posix_acl_summary { + unsigned short owner; + unsigned short users; + unsigned short group; + unsigned short groups; + unsigned short other; + unsigned short mask; +}; + +static void +summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) +{ + struct posix_acl_entry *pa, *pe; + + /* + * Only pas.users and pas.groups need initialization; previous + * posix_acl_valid() calls ensure that the other fields will be + * initialized in the following loop. But, just to placate gcc: + */ + memset(pas, 0, sizeof(*pas)); + pas->mask = 07; + + pe = acl->a_entries + acl->a_count; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + pas->owner = pa->e_perm; + break; + case ACL_GROUP_OBJ: + pas->group = pa->e_perm; + break; + case ACL_USER: + pas->users |= pa->e_perm; + break; + case ACL_GROUP: + pas->groups |= pa->e_perm; + break; + case ACL_OTHER: + pas->other = pa->e_perm; + break; + case ACL_MASK: + pas->mask = pa->e_perm; + break; + } + } + /* We'll only care about effective permissions: */ + pas->users &= pas->mask; + pas->group &= pas->mask; + pas->groups &= pas->mask; +} + +/* We assume the acl has been verified with posix_acl_valid. */ +static void +_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, + unsigned int flags) +{ + struct posix_acl_entry *pa, *group_owner_entry; + struct nfs4_ace *ace; + struct posix_acl_summary pas; + unsigned short deny; + int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ? + NFS4_INHERITANCE_FLAGS | NFS4_ACE_INHERIT_ONLY_ACE : 0); + + BUG_ON(pacl->a_count < 3); + summarize_posix_acl(pacl, &pas); + + pa = pacl->a_entries; + ace = acl->aces + acl->naces; + + /* We could deny everything not granted by the owner: */ + deny = ~pas.owner; + /* + * but it is equivalent (and simpler) to deny only what is not + * granted by later entries: + */ + deny &= pas.users | pas.group | pas.groups | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + } + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + pa++; + + while (pa->e_tag == ACL_USER) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.groups | pas.group | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who_uid = pa->e_uid; + ace++; + acl->naces++; + } + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who_uid = pa->e_uid; + ace++; + acl->naces++; + pa++; + } + + /* In the case of groups, we apply allow ACEs first, then deny ACEs, + * since a user can be in more than one group. */ + + /* allow ACEs */ + + group_owner_entry = pa; + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pas.group, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + pa++; + + while (pa->e_tag == ACL_GROUP) { + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who_gid = pa->e_gid; + ace++; + acl->naces++; + pa++; + } + + /* deny ACEs */ + + pa = group_owner_entry; + + deny = ~pas.group & pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + } + pa++; + + while (pa->e_tag == ACL_GROUP) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who_gid = pa->e_gid; + ace++; + acl->naces++; + } + pa++; + } + + if (pa->e_tag == ACL_MASK) + pa++; + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags); + ace->whotype = NFS4_ACL_WHO_EVERYONE; + acl->naces++; +} + +static bool +pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2) +{ + if (pace1->e_tag != pace2->e_tag) + return pace1->e_tag > pace2->e_tag; + if (pace1->e_tag == ACL_USER) + return uid_gt(pace1->e_uid, pace2->e_uid); + if (pace1->e_tag == ACL_GROUP) + return gid_gt(pace1->e_gid, pace2->e_gid); + return false; +} + +static void +sort_pacl_range(struct posix_acl *pacl, int start, int end) { + int sorted = 0, i; + struct posix_acl_entry tmp; + + /* We just do a bubble sort; easy to do in place, and we're not + * expecting acl's to be long enough to justify anything more. */ + while (!sorted) { + sorted = 1; + for (i = start; i < end; i++) { + if (pace_gt(&pacl->a_entries[i], + &pacl->a_entries[i+1])) { + sorted = 0; + tmp = pacl->a_entries[i]; + pacl->a_entries[i] = pacl->a_entries[i+1]; + pacl->a_entries[i+1] = tmp; + } + } + } +} + +static void +sort_pacl(struct posix_acl *pacl) +{ + /* posix_acl_valid requires that users and groups be in order + * by uid/gid. */ + int i, j; + + /* no users or groups */ + if (!pacl || pacl->a_count <= 4) + return; + + i = 1; + while (pacl->a_entries[i].e_tag == ACL_USER) + i++; + sort_pacl_range(pacl, 1, i-1); + + BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); + j = ++i; + while (pacl->a_entries[j].e_tag == ACL_GROUP) + j++; + sort_pacl_range(pacl, i, j-1); + return; +} + +/* + * While processing the NFSv4 ACE, this maintains bitmasks representing + * which permission bits have been allowed and which denied to a given + * entity: */ +struct posix_ace_state { + u32 allow; + u32 deny; +}; + +struct posix_user_ace_state { + union { + kuid_t uid; + kgid_t gid; + }; + struct posix_ace_state perms; +}; + +struct posix_ace_state_array { + int n; + struct posix_user_ace_state aces[]; +}; + +/* + * While processing the NFSv4 ACE, this maintains the partial permissions + * calculated so far: */ + +struct posix_acl_state { + int empty; + struct posix_ace_state owner; + struct posix_ace_state group; + struct posix_ace_state other; + struct posix_ace_state everyone; + struct posix_ace_state mask; /* Deny unused in this case */ + struct posix_ace_state_array *users; + struct posix_ace_state_array *groups; +}; + +static int +init_state(struct posix_acl_state *state, int cnt) +{ + int alloc; + + memset(state, 0, sizeof(struct posix_acl_state)); + state->empty = 1; + /* + * In the worst case, each individual acl could be for a distinct + * named user or group, but we don't no which, so we allocate + * enough space for either: + */ + alloc = sizeof(struct posix_ace_state_array) + + cnt*sizeof(struct posix_user_ace_state); + state->users = kzalloc(alloc, GFP_KERNEL); + if (!state->users) + return -ENOMEM; + state->groups = kzalloc(alloc, GFP_KERNEL); + if (!state->groups) { + kfree(state->users); + return -ENOMEM; + } + return 0; +} + +static void +free_state(struct posix_acl_state *state) { + kfree(state->users); + kfree(state->groups); +} + +static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate) +{ + state->mask.allow |= astate->allow; +} + +static struct posix_acl * +posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) +{ + struct posix_acl_entry *pace; + struct posix_acl *pacl; + int nace; + int i; + + /* + * ACLs with no ACEs are treated differently in the inheritable + * and effective cases: when there are no inheritable ACEs, + * calls ->set_acl with a NULL ACL structure. + */ + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + return NULL; + + /* + * When there are no effective ACEs, the following will end + * up setting a 3-element effective posix ACL with all + * permissions zero. + */ + if (!state->users->n && !state->groups->n) + nace = 3; + else /* Note we also include a MASK ACE in this case: */ + nace = 4 + state->users->n + state->groups->n; + pacl = posix_acl_alloc(nace, GFP_KERNEL); + if (!pacl) + return ERR_PTR(-ENOMEM); + + pace = pacl->a_entries; + pace->e_tag = ACL_USER_OBJ; + low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); + + for (i=0; i < state->users->n; i++) { + pace++; + pace->e_tag = ACL_USER; + low_mode_from_nfs4(state->users->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_uid = state->users->aces[i].uid; + add_to_mask(state, &state->users->aces[i].perms); + } + + pace++; + pace->e_tag = ACL_GROUP_OBJ; + low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); + add_to_mask(state, &state->group); + + for (i=0; i < state->groups->n; i++) { + pace++; + pace->e_tag = ACL_GROUP; + low_mode_from_nfs4(state->groups->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_gid = state->groups->aces[i].gid; + add_to_mask(state, &state->groups->aces[i].perms); + } + + if (state->users->n || state->groups->n) { + pace++; + pace->e_tag = ACL_MASK; + low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + } + + pace++; + pace->e_tag = ACL_OTHER; + low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); + + return pacl; +} + +static inline void allow_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Allow all bits in the mask not already denied: */ + astate->allow |= mask & ~astate->deny; +} + +static inline void deny_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Deny all bits in the mask not already allowed: */ + astate->deny |= mask & ~astate->allow; +} + +static int find_uid(struct posix_acl_state *state, kuid_t uid) +{ + struct posix_ace_state_array *a = state->users; + int i; + + for (i = 0; i < a->n; i++) + if (uid_eq(a->aces[i].uid, uid)) + return i; + /* Not found: */ + a->n++; + a->aces[i].uid = uid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; + + return i; +} + +static int find_gid(struct posix_acl_state *state, kgid_t gid) +{ + struct posix_ace_state_array *a = state->groups; + int i; + + for (i = 0; i < a->n; i++) + if (gid_eq(a->aces[i].gid, gid)) + return i; + /* Not found: */ + a->n++; + a->aces[i].gid = gid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; + + return i; +} + +static void deny_bits_array(struct posix_ace_state_array *a, u32 mask) +{ + int i; + + for (i=0; i < a->n; i++) + deny_bits(&a->aces[i].perms, mask); +} + +static void allow_bits_array(struct posix_ace_state_array *a, u32 mask) +{ + int i; + + for (i=0; i < a->n; i++) + allow_bits(&a->aces[i].perms, mask); +} + +static void process_one_v4_ace(struct posix_acl_state *state, + struct nfs4_ace *ace) +{ + u32 mask = ace->access_mask; + int i; + + state->empty = 0; + + switch (ace2type(ace)) { + case ACL_USER_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + } else { + deny_bits(&state->owner, mask); + } + break; + case ACL_USER: + i = find_uid(state, ace->who_uid); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->users->aces[i].perms, mask); + } else { + deny_bits(&state->users->aces[i].perms, mask); + mask = state->users->aces[i].perms.deny; + deny_bits(&state->owner, mask); + } + break; + case ACL_GROUP_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->group, mask); + } else { + deny_bits(&state->group, mask); + mask = state->group.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_GROUP: + i = find_gid(state, ace->who_gid); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->groups->aces[i].perms, mask); + } else { + deny_bits(&state->groups->aces[i].perms, mask); + mask = state->groups->aces[i].perms.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_OTHER: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + allow_bits(&state->group, mask); + allow_bits(&state->other, mask); + allow_bits(&state->everyone, mask); + allow_bits_array(state->users, mask); + allow_bits_array(state->groups, mask); + } else { + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->other, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + } +} + +static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, + struct posix_acl **pacl, struct posix_acl **dpacl, + unsigned int flags) +{ + struct posix_acl_state effective_acl_state, default_acl_state; + struct nfs4_ace *ace; + int ret; + + ret = init_state(&effective_acl_state, acl->naces); + if (ret) + return ret; + ret = init_state(&default_acl_state, acl->naces); + if (ret) + goto out_estate; + ret = -EINVAL; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && + ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) + goto out_dstate; + if (ace->flag & ~NFS4_SUPPORTED_FLAGS) + goto out_dstate; + if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) { + process_one_v4_ace(&effective_acl_state, ace); + continue; + } + if (!(flags & NFS4_ACL_DIR)) + goto out_dstate; + /* + * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT + * is set, we're effectively turning on the other. That's OK, + * according to rfc 3530. + */ + process_one_v4_ace(&default_acl_state, ace); + + if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) + process_one_v4_ace(&effective_acl_state, ace); + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); + if (IS_ERR(*pacl)) { + ret = PTR_ERR(*pacl); + *pacl = NULL; + goto out_dstate; + } + *dpacl = posix_state_to_acl(&default_acl_state, + flags | NFS4_ACL_TYPE_DEFAULT); + if (IS_ERR(*dpacl)) { + ret = PTR_ERR(*dpacl); + *dpacl = NULL; + posix_acl_release(*pacl); + *pacl = NULL; + goto out_dstate; + } + sort_pacl(*pacl); + sort_pacl(*dpacl); + ret = 0; +out_dstate: + free_state(&default_acl_state); +out_estate: + free_state(&effective_acl_state); + return ret; +} + +__be32 +nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl) +{ + __be32 error; + int host_error; + struct dentry *dentry; + struct inode *inode; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + /* Get inode */ + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + + if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) + return nfserr_attrnotsupp; + + if (S_ISDIR(inode->i_mode)) + flags = NFS4_ACL_DIR; + + host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + if (host_error == -EINVAL) + return nfserr_attrnotsupp; + if (host_error < 0) + goto out_nfserr; + + host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS); + if (host_error < 0) + goto out_release; + + if (S_ISDIR(inode->i_mode)) { + host_error = inode->i_op->set_acl(inode, dpacl, + ACL_TYPE_DEFAULT); + } + +out_release: + posix_acl_release(pacl); + posix_acl_release(dpacl); +out_nfserr: + if (host_error == -EOPNOTSUPP) + return nfserr_attrnotsupp; + else + return nfserrno(host_error); +} + + +static short +ace2type(struct nfs4_ace *ace) +{ + switch (ace->whotype) { + case NFS4_ACL_WHO_NAMED: + return (ace->flag & NFS4_ACE_IDENTIFIER_GROUP ? + ACL_GROUP : ACL_USER); + case NFS4_ACL_WHO_OWNER: + return ACL_USER_OBJ; + case NFS4_ACL_WHO_GROUP: + return ACL_GROUP_OBJ; + case NFS4_ACL_WHO_EVERYONE: + return ACL_OTHER; + } + BUG(); + return -1; +} + +/* + * return the size of the struct nfs4_acl required to represent an acl + * with @entries entries. + */ +int nfs4_acl_bytes(int entries) +{ + return sizeof(struct nfs4_acl) + entries * sizeof(struct nfs4_ace); +} + +static struct { + char *string; + int stringlen; + int type; +} s2t_map[] = { + { + .string = "OWNER@", + .stringlen = sizeof("OWNER@") - 1, + .type = NFS4_ACL_WHO_OWNER, + }, + { + .string = "GROUP@", + .stringlen = sizeof("GROUP@") - 1, + .type = NFS4_ACL_WHO_GROUP, + }, + { + .string = "EVERYONE@", + .stringlen = sizeof("EVERYONE@") - 1, + .type = NFS4_ACL_WHO_EVERYONE, + }, +}; + +int +nfs4_acl_get_whotype(char *p, u32 len) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { + if (s2t_map[i].stringlen == len && + 0 == memcmp(s2t_map[i].string, p, len)) + return s2t_map[i].type; + } + return NFS4_ACL_WHO_NAMED; +} + +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who) +{ + __be32 *p; + int i; + + for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { + if (s2t_map[i].type != who) + continue; + p = xdr_reserve_space(xdr, s2t_map[i].stringlen + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, s2t_map[i].string, + s2t_map[i].stringlen); + return 0; + } + WARN_ON_ONCE(1); + return nfserr_serverfault; +} diff --git a/kernel/fs/nfsd/nfs4callback.c b/kernel/fs/nfsd/nfs4callback.c new file mode 100644 index 000000000..5694cfb7a --- /dev/null +++ b/kernel/fs/nfsd/nfs4callback.c @@ -0,0 +1,1099 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include "nfsd.h" +#include "state.h" +#include "netns.h" +#include "xdr4cb.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); + +#define NFSPROC4_CB_NULL 0 +#define NFSPROC4_CB_COMPOUND 1 + +/* Index of predefined Linux callback client operations */ + +struct nfs4_cb_compound_hdr { + /* args */ + u32 ident; /* minorversion 0 only */ + u32 nops; + __be32 *nops_p; + u32 minorversion; + /* res */ + int status; +}; + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + +static __be32 *xdr_encode_empty_array(__be32 *p) +{ + *p++ = xdr_zero; + return p; +} + +/* + * Encode/decode NFSv4 CB basic data types + * + * Basic NFSv4 callback data types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section + * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version + * 1 Protocol" + */ + +/* + * nfs_cb_opnum4 + * + * enum nfs_cb_opnum4 { + * OP_CB_GETATTR = 3, + * ... + * }; + */ +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_ILLEGAL = 10044 +}; + +static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(op); +} + +/* + * nfs_fh4 + * + * typedef opaque nfs_fh4; + */ +static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh) +{ + u32 length = fh->fh_size; + __be32 *p; + + BUG_ON(length > NFS4_FHSIZE); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, &fh->fh_base, length); +} + +/* + * stateid4 + * + * struct stateid4 { + * uint32_t seqid; + * opaque other[12]; + * }; + */ +static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(sid->si_generation); + xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE); +} + +/* + * sessionid4 + * + * typedef opaque sessionid4[NFS4_SESSIONID_SIZE]; + */ +static void encode_sessionid4(struct xdr_stream *xdr, + const struct nfsd4_session *session) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); +} + +/* + * nfsstat4 + */ +static const struct { + int stat; + int errno; +} nfs_cb_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -EIO }, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_RESOURCE, -EREMOTEIO }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; + +/* + * If we cannot translate the error, the recovery routines should + * handle it. + * + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes. + */ +static int nfs_cb_stat_to_errno(int status) +{ + int i; + + for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { + if (nfs_cb_errtbl[i].stat == status) + return nfs_cb_errtbl[i].errno; + } + + dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status); + return -status; +} + +static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, + int *status) +{ + __be32 *p; + u32 op; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + op = be32_to_cpup(p++); + if (unlikely(op != expected)) + goto out_unexpected; + *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_unexpected: + dprintk("NFSD: Callback server returned operation %d but " + "we issued a request for %d\n", op, expected); + return -EIO; +} + +/* + * CB_COMPOUND4args + * + * struct CB_COMPOUND4args { + * utf8str_cs tag; + * uint32_t minorversion; + * uint32_t callback_ident; + * nfs_cb_argop4 argarray<>; + * }; +*/ +static void encode_cb_compound4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 * p; + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + p = xdr_encode_empty_array(p); /* empty tag */ + *p++ = cpu_to_be32(hdr->minorversion); + *p++ = cpu_to_be32(hdr->ident); + + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); /* argarray element count */ +} + +/* + * Update argarray element count + */ +static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) +{ + BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS); + *hdr->nops_p = cpu_to_be32(hdr->nops); +} + +/* + * CB_COMPOUND4res + * + * struct CB_COMPOUND4res { + * nfsstat4 status; + * utf8str_cs tag; + * nfs_cb_resop4 resarray<>; + * }; + */ +static int decode_cb_compound4res(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + /* Ignore the tag */ + length = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, length + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->nops = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * CB_RECALL4args + * + * struct CB_RECALL4args { + * stateid4 stateid; + * bool truncate; + * nfs_fh4 fh; + * }; + */ +static void encode_cb_recall4args(struct xdr_stream *xdr, + const struct nfs4_delegation *dp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); + encode_stateid4(xdr, &dp->dl_stid.sc_stateid); + + p = xdr_reserve_space(xdr, 4); + *p++ = xdr_zero; /* truncate */ + + encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle); + + hdr->nops++; +} + +/* + * CB_SEQUENCE4args + * + * struct CB_SEQUENCE4args { + * sessionid4 csa_sessionid; + * sequenceid4 csa_sequenceid; + * slotid4 csa_slotid; + * slotid4 csa_highest_slotid; + * bool csa_cachethis; + * referring_call_list4 csa_referring_call_lists<>; + * }; + */ +static void encode_cb_sequence4args(struct xdr_stream *xdr, + const struct nfsd4_callback *cb, + struct nfs4_cb_compound_hdr *hdr) +{ + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + __be32 *p; + + if (hdr->minorversion == 0) + return; + + encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); + encode_sessionid4(xdr, session); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */ + *p++ = xdr_zero; /* csa_slotid */ + *p++ = xdr_zero; /* csa_highest_slotid */ + *p++ = xdr_zero; /* csa_cachethis */ + xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + hdr->nops++; +} + +/* + * CB_SEQUENCE4resok + * + * struct CB_SEQUENCE4resok { + * sessionid4 csr_sessionid; + * sequenceid4 csr_sequenceid; + * slotid4 csr_slotid; + * slotid4 csr_highest_slotid; + * slotid4 csr_target_highest_slotid; + * }; + * + * union CB_SEQUENCE4res switch (nfsstat4 csr_status) { + * case NFS4_OK: + * CB_SEQUENCE4resok csr_resok4; + * default: + * void; + * }; + * + * Our current back channel implmentation supports a single backchannel + * with a single slot. + */ +static int decode_cb_sequence4resok(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + struct nfs4_sessionid id; + int status; + __be32 *p; + u32 dummy; + + status = -ESERVERFAULT; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. + */ + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); + if (memcmp(id.data, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) { + dprintk("NFS: %s Invalid session id\n", __func__); + goto out; + } + p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); + + dummy = be32_to_cpup(p++); + if (dummy != session->se_cb_seq_nr) { + dprintk("NFS: %s Invalid sequence number\n", __func__); + goto out; + } + + dummy = be32_to_cpup(p++); + if (dummy != 0) { + dprintk("NFS: %s Invalid slotid\n", __func__); + goto out; + } + + /* + * FIXME: process highest slotid and target highest slotid + */ + status = 0; +out: + if (status) + nfsd4_mark_cb_fault(cb->cb_clp, status); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_cb_sequence4res(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + int status; + + if (cb->cb_minorversion == 0) + return 0; + + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status); + if (unlikely(status || cb->cb_status)) + return status; + + return decode_cb_sequence4resok(xdr, cb); +} + +/* + * NFSv4.0 and NFSv4.1 XDR encode functions + * + * NFSv4.0 callback argument types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +/* + * NB: Without this zero space reservation, callbacks over krb5p fail + */ +static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + xdr_reserve_space(xdr, 0); +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfs4_delegation *dp = cb_to_delegation(cb); + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recall4args(xdr, dp, &hdr); + encode_cb_nops(&hdr); +} + + +/* + * NFSv4.0 and NFSv4.1 XDR decode functions + * + * NFSv4.0 callback result types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + return 0; +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + if (cb != NULL) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_status)) + return status; + } + + return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); +} + +#ifdef CONFIG_NFSD_PNFS +/* + * CB_LAYOUTRECALL4args + * + * struct layoutrecall_file4 { + * nfs_fh4 lor_fh; + * offset4 lor_offset; + * length4 lor_length; + * stateid4 lor_stateid; + * }; + * + * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) { + * case LAYOUTRECALL4_FILE: + * layoutrecall_file4 lor_layout; + * case LAYOUTRECALL4_FSID: + * fsid4 lor_fsid; + * case LAYOUTRECALL4_ALL: + * void; + * }; + * + * struct CB_LAYOUTRECALL4args { + * layouttype4 clora_type; + * layoutiomode4 clora_iomode; + * bool clora_changed; + * layoutrecall4 clora_recall; + * }; + */ +static void encode_cb_layout4args(struct xdr_stream *xdr, + const struct nfs4_layout_stateid *ls, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + BUG_ON(hdr->minorversion == 0); + + p = xdr_reserve_space(xdr, 5 * 4); + *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL); + *p++ = cpu_to_be32(ls->ls_layout_type); + *p++ = cpu_to_be32(IOMODE_ANY); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(RETURN_FILE); + + encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle); + + p = xdr_reserve_space(xdr, 2 * 8); + p = xdr_encode_hyper(p, 0); + xdr_encode_hyper(p, NFS4_MAX_UINT64); + + encode_stateid4(xdr, &ls->ls_recall_sid); + + hdr->nops++; +} + +static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_layout4args(xdr, ls, &hdr); + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_status)) + return status; + } + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); +} +#endif /* CONFIG_NFSD_PNFS */ + +/* + * RPC procedure tables + */ +#define PROC(proc, call, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_CB_##call, \ + .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \ + .p_arglen = NFS4_enc_##argtype##_sz, \ + .p_replen = NFS4_dec_##restype##_sz, \ + .p_statidx = NFSPROC4_CB_##call, \ + .p_name = #proc, \ +} + +static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, cb_null, cb_null), + PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), +#ifdef CONFIG_NFSD_PNFS + PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), +#endif +}; + +static struct rpc_version nfs_cb_version4 = { +/* + * Note on the callback rpc program version number: despite language in rfc + * 5661 section 18.36.3 requiring servers to use 4 in this field, the + * official xdr descriptions for both 4.0 and 4.1 specify version 1, and + * in practice that appears to be what implementations use. The section + * 18.36.3 language is expected to be fixed in an erratum. + */ + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures +}; + +static const struct rpc_version *nfs_cb_version[] = { + &nfs_cb_version4, +}; + +static const struct rpc_program cb_program; + +static struct rpc_stat cb_stats = { + .program = &cb_program +}; + +#define NFS4_CALLBACK 0x40000000 +static const struct rpc_program cb_program = { + .name = "nfs4_cb", + .number = NFS4_CALLBACK, + .nrvers = ARRAY_SIZE(nfs_cb_version), + .version = nfs_cb_version, + .stats = &cb_stats, + .pipe_dir_name = "nfsd4_cb", +}; + +static int max_cb_time(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + return max(nn->nfsd4_lease/10, (time_t)1) * HZ; +} + +static struct rpc_cred *callback_cred; + +int set_callback_cred(void) +{ + if (callback_cred) + return 0; + callback_cred = rpc_lookup_machine_cred("nfs"); + if (!callback_cred) + return -ENOMEM; + return 0; +} + +static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) +{ + if (clp->cl_minorversion == 0) { + return get_rpccred(callback_cred); + } else { + struct rpc_auth *auth = client->cl_auth; + struct auth_cred acred = {}; + + acred.uid = ses->se_cb_sec.uid; + acred.gid = ses->se_cb_sec.gid; + return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0); + } +} + +static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) +{ + struct rpc_xprt *xprt; + + if (args->protocol != XPRT_TRANSPORT_BC_TCP) + return rpc_create(args); + + xprt = args->bc_xprt->xpt_bc_xprt; + if (xprt) { + xprt_get(xprt); + return rpc_create_xprt(args, xprt); + } + + return rpc_create(args); +} + +static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) +{ + int maxtime = max_cb_time(clp->net); + struct rpc_timeout timeparms = { + .to_initval = maxtime, + .to_retries = 0, + .to_maxval = maxtime, + }; + struct rpc_create_args args = { + .net = clp->net, + .address = (struct sockaddr *) &conn->cb_addr, + .addrsize = conn->cb_addrlen, + .saddress = (struct sockaddr *) &conn->cb_saddr, + .timeout = &timeparms, + .program = &cb_program, + .version = 0, + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + }; + struct rpc_clnt *client; + struct rpc_cred *cred; + + if (clp->cl_minorversion == 0) { + if (!clp->cl_cred.cr_principal && + (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) + return -EINVAL; + args.client_name = clp->cl_cred.cr_principal; + args.prognumber = conn->cb_prog; + args.protocol = XPRT_TRANSPORT_TCP; + args.authflavor = clp->cl_cred.cr_flavor; + clp->cl_cb_ident = conn->cb_ident; + } else { + if (!conn->cb_xprt) + return -EINVAL; + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; + clp->cl_cb_session = ses; + args.bc_xprt = conn->cb_xprt; + args.prognumber = clp->cl_cb_session->se_cb_prog; + args.protocol = conn->cb_xprt->xpt_class->xcl_ident | + XPRT_TRANSPORT_BC; + args.authflavor = ses->se_cb_sec.flavor; + } + /* Create RPC client */ + client = create_backchannel_client(&args); + if (IS_ERR(client)) { + dprintk("NFSD: couldn't create callback client: %ld\n", + PTR_ERR(client)); + return PTR_ERR(client); + } + cred = get_backchannel_cred(clp, client, ses); + if (IS_ERR(cred)) { + rpc_shutdown_client(client); + return PTR_ERR(cred); + } + clp->cl_cb_client = client; + clp->cl_cb_cred = cred; + return 0; +} + +static void warn_no_callback_path(struct nfs4_client *clp, int reason) +{ + dprintk("NFSD: warning: no callback path to client %.*s: error %d\n", + (int)clp->cl_name.len, clp->cl_name.data, reason); +} + +static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_DOWN; + warn_no_callback_path(clp, reason); +} + +static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_FAULT; + warn_no_callback_path(clp, reason); +} + +static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); + + if (task->tk_status) + nfsd4_mark_cb_down(clp, task->tk_status); + else + clp->cl_cb_state = NFSD4_CB_UP; +} + +static const struct rpc_call_ops nfsd4_cb_probe_ops = { + /* XXX: release method to ensure we set the cb channel down if + * necessary on early failure? */ + .rpc_call_done = nfsd4_cb_probe_done, +}; + +static struct workqueue_struct *callback_wq; + +/* + * Poke the callback thread to process any updates to the callback + * parameters, and send a null probe. + */ +void nfsd4_probe_callback(struct nfs4_client *clp) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + nfsd4_run_cb(&clp->cl_cb_null); +} + +void nfsd4_probe_callback_sync(struct nfs4_client *clp) +{ + nfsd4_probe_callback(clp); + flush_workqueue(callback_wq); +} + +void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + spin_lock(&clp->cl_lock); + memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); + spin_unlock(&clp->cl_lock); +} + +/* + * There's currently a single callback channel slot. + * If the slot is available, then mark it busy. Otherwise, set the + * thread for sleeping on the callback RPC wait queue. + */ +static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) +{ + if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); + /* Race breaker */ + if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + dprintk("%s slot is busy\n", __func__); + return false; + } + rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); + } + return true; +} + +/* + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. + */ +static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; + u32 minorversion = clp->cl_minorversion; + + cb->cb_minorversion = minorversion; + if (minorversion) { + if (!nfsd41_cb_get_slot(clp, task)) + return; + } + rpc_call_start(task); +} + +static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; + + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_minorversion); + + if (clp->cl_minorversion) { + /* No need for lock, access serialized in nfsd4_cb_prepare */ + if (!task->tk_status) + ++clp->cl_cb_session->se_cb_seq_nr; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_session->se_cb_seq_nr); + } + + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. + */ + if (task->tk_flags & RPC_TASK_KILLED) { + task->tk_status = 0; + cb->cb_need_restart = true; + return; + } + + if (cb->cb_status) { + WARN_ON_ONCE(task->tk_status); + task->tk_status = cb->cb_status; + } + + switch (cb->cb_ops->done(cb, task)) { + case 0: + task->tk_status = 0; + rpc_restart_call_prepare(task); + return; + case 1: + break; + case -1: + /* Network partition? */ + nfsd4_mark_cb_down(clp, task->tk_status); + break; + default: + BUG(); + } +} + +static void nfsd4_cb_release(void *calldata) +{ + struct nfsd4_callback *cb = calldata; + + if (cb->cb_need_restart) + nfsd4_run_cb(cb); + else + cb->cb_ops->release(cb); + +} + +static const struct rpc_call_ops nfsd4_cb_ops = { + .rpc_call_prepare = nfsd4_cb_prepare, + .rpc_call_done = nfsd4_cb_done, + .rpc_release = nfsd4_cb_release, +}; + +int nfsd4_create_callback_queue(void) +{ + callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); + if (!callback_wq) + return -ENOMEM; + return 0; +} + +void nfsd4_destroy_callback_queue(void) +{ + destroy_workqueue(callback_wq); +} + +/* must be called under the state lock */ +void nfsd4_shutdown_callback(struct nfs4_client *clp) +{ + set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); + /* + * Note this won't actually result in a null callback; + * instead, nfsd4_run_cb_null() will detect the killed + * client, destroy the rpc client, and stop: + */ + nfsd4_run_cb(&clp->cl_cb_null); + flush_workqueue(callback_wq); +} + +/* requires cl_lock: */ +static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) +{ + struct nfsd4_session *s; + struct nfsd4_conn *c; + + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_flags & NFS4_CDFC4_BACK) + return c; + } + } + return NULL; +} + +static void nfsd4_process_cb_update(struct nfsd4_callback *cb) +{ + struct nfs4_cb_conn conn; + struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = NULL; + struct nfsd4_conn *c; + int err; + + /* + * This is either an update, or the client dying; in either case, + * kill the old client: + */ + if (clp->cl_cb_client) { + rpc_shutdown_client(clp->cl_cb_client); + clp->cl_cb_client = NULL; + put_rpccred(clp->cl_cb_cred); + clp->cl_cb_cred = NULL; + } + if (clp->cl_cb_conn.cb_xprt) { + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + clp->cl_cb_conn.cb_xprt = NULL; + } + if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) + return; + spin_lock(&clp->cl_lock); + /* + * Only serialized callback code is allowed to clear these + * flags; main nfsd code can only set them: + */ + BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); + clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); + c = __nfsd4_find_backchannel(clp); + if (c) { + svc_xprt_get(c->cn_xprt); + conn.cb_xprt = c->cn_xprt; + ses = c->cn_session; + } + spin_unlock(&clp->cl_lock); + + err = setup_callback_client(clp, &conn, ses); + if (err) { + nfsd4_mark_cb_down(clp, err); + return; + } +} + +static void +nfsd4_run_cb_work(struct work_struct *work) +{ + struct nfsd4_callback *cb = + container_of(work, struct nfsd4_callback, cb_work); + struct nfs4_client *clp = cb->cb_clp; + struct rpc_clnt *clnt; + + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } + + if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) + nfsd4_process_cb_update(cb); + + clnt = clp->cl_cb_client; + if (!clnt) { + /* Callback channel broken, or client killed; give up: */ + if (cb->cb_ops && cb->cb_ops->release) + cb->cb_ops->release(cb); + return; + } + + /* + * Don't send probe messages for 4.1 or later. + */ + if (!cb->cb_ops && clp->cl_minorversion) { + clp->cl_cb_state = NFSD4_CB_UP; + return; + } + + cb->cb_msg.rpc_cred = clp->cl_cb_cred; + rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); +} + +void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, + struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) +{ + cb->cb_clp = clp; + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; + cb->cb_msg.rpc_argp = cb; + cb->cb_msg.rpc_resp = cb; + cb->cb_ops = ops; + INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); + cb->cb_status = 0; + cb->cb_need_restart = false; +} + +void nfsd4_run_cb(struct nfsd4_callback *cb) +{ + queue_work(callback_wq, &cb->cb_work); +} diff --git a/kernel/fs/nfsd/nfs4idmap.c b/kernel/fs/nfsd/nfs4idmap.c new file mode 100644 index 000000000..e1b3d3d47 --- /dev/null +++ b/kernel/fs/nfsd/nfs4idmap.c @@ -0,0 +1,666 @@ +/* + * Mapping of UID/GIDs to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include "idmap.h" +#include "nfsd.h" +#include "netns.h" + +/* + * Turn off idmapping when using AUTH_SYS. + */ +static bool nfs4_disable_idmapping = true; +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off server's NFSv4 idmapping when using 'sec=sys'"); + +/* + * Cache entry + */ + +/* + * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on + * that. + */ + +#define IDMAP_TYPE_USER 0 +#define IDMAP_TYPE_GROUP 1 + +struct ent { + struct cache_head h; + int type; /* User / Group */ + u32 id; + char name[IDMAP_NAMESZ]; + char authname[IDMAP_NAMESZ]; +}; + +/* Common entry handling */ + +#define ENT_HASHBITS 8 +#define ENT_HASHMAX (1 << ENT_HASHBITS) + +static void +ent_init(struct cache_head *cnew, struct cache_head *citm) +{ + struct ent *new = container_of(cnew, struct ent, h); + struct ent *itm = container_of(citm, struct ent, h); + + new->id = itm->id; + new->type = itm->type; + + strlcpy(new->name, itm->name, sizeof(new->name)); + strlcpy(new->authname, itm->authname, sizeof(new->name)); +} + +static void +ent_put(struct kref *ref) +{ + struct ent *map = container_of(ref, struct ent, h.ref); + kfree(map); +} + +static struct cache_head * +ent_alloc(void) +{ + struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL); + if (e) + return &e->h; + else + return NULL; +} + +/* + * ID -> Name cache + */ + +static uint32_t +idtoname_hash(struct ent *ent) +{ + uint32_t hash; + + hash = hash_str(ent->authname, ENT_HASHBITS); + hash = hash_long(hash ^ ent->id, ENT_HASHBITS); + + /* Flip LSB for user/group */ + if (ent->type == IDMAP_TYPE_GROUP) + hash ^= 1; + + return hash; +} + +static void +idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, + int *blen) +{ + struct ent *ent = container_of(ch, struct ent, h); + char idstr[11]; + + qword_add(bpp, blen, ent->authname); + snprintf(idstr, sizeof(idstr), "%u", ent->id); + qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); + qword_add(bpp, blen, idstr); + + (*bpp)[-1] = '\n'; +} + +static int +idtoname_match(struct cache_head *ca, struct cache_head *cb) +{ + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + + return (a->id == b->id && a->type == b->type && + strcmp(a->authname, b->authname) == 0); +} + +static int +idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) +{ + struct ent *ent; + + if (h == NULL) { + seq_puts(m, "#domain type id [name]\n"); + return 0; + } + ent = container_of(h, struct ent, h); + seq_printf(m, "%s %s %u", ent->authname, + ent->type == IDMAP_TYPE_GROUP ? "group" : "user", + ent->id); + if (test_bit(CACHE_VALID, &h->flags)) + seq_printf(m, " %s", ent->name); + seq_printf(m, "\n"); + return 0; +} + +static void +warn_no_idmapd(struct cache_detail *detail, int has_died) +{ + printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", + has_died ? "died" : "not been started"); +} + + +static int idtoname_parse(struct cache_detail *, char *, int); +static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); +static struct ent *idtoname_update(struct cache_detail *, struct ent *, + struct ent *); + +static struct cache_detail idtoname_cache_template = { + .owner = THIS_MODULE, + .hash_size = ENT_HASHMAX, + .name = "nfs4.idtoname", + .cache_put = ent_put, + .cache_request = idtoname_request, + .cache_parse = idtoname_parse, + .cache_show = idtoname_show, + .warn_no_listener = warn_no_idmapd, + .match = idtoname_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, +}; + +static int +idtoname_parse(struct cache_detail *cd, char *buf, int buflen) +{ + struct ent ent, *res; + char *buf1, *bp; + int len; + int error = -EINVAL; + + if (buf[buflen - 1] != '\n') + return (-EINVAL); + buf[buflen - 1]= '\0'; + + buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf1 == NULL) + return (-ENOMEM); + + memset(&ent, 0, sizeof(ent)); + + /* Authentication name */ + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) + goto out; + memcpy(ent.authname, buf1, sizeof(ent.authname)); + + /* Type */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.type = strcmp(buf1, "user") == 0 ? + IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; + + /* ID */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.id = simple_strtoul(buf1, &bp, 10); + if (bp == buf1) + goto out; + + /* expiry */ + ent.h.expiry_time = get_expiry(&buf); + if (ent.h.expiry_time == 0) + goto out; + + error = -ENOMEM; + res = idtoname_lookup(cd, &ent); + if (!res) + goto out; + + /* Name */ + error = -EINVAL; + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len < 0 || len >= IDMAP_NAMESZ) + goto out; + if (len == 0) + set_bit(CACHE_NEGATIVE, &ent.h.flags); + else + memcpy(ent.name, buf1, sizeof(ent.name)); + error = -ENOMEM; + res = idtoname_update(cd, &ent, res); + if (res == NULL) + goto out; + + cache_put(&res->h, cd); + error = 0; +out: + kfree(buf1); + return error; +} + +static struct ent * +idtoname_lookup(struct cache_detail *cd, struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h, + idtoname_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +idtoname_update(struct cache_detail *cd, struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, + idtoname_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + + +/* + * Name -> ID cache + */ + +static inline int +nametoid_hash(struct ent *ent) +{ + return hash_str(ent->name, ENT_HASHBITS); +} + +static void +nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, + int *blen) +{ + struct ent *ent = container_of(ch, struct ent, h); + + qword_add(bpp, blen, ent->authname); + qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); + qword_add(bpp, blen, ent->name); + + (*bpp)[-1] = '\n'; +} + +static int +nametoid_match(struct cache_head *ca, struct cache_head *cb) +{ + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + + return (a->type == b->type && strcmp(a->name, b->name) == 0 && + strcmp(a->authname, b->authname) == 0); +} + +static int +nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) +{ + struct ent *ent; + + if (h == NULL) { + seq_puts(m, "#domain type name [id]\n"); + return 0; + } + ent = container_of(h, struct ent, h); + seq_printf(m, "%s %s %s", ent->authname, + ent->type == IDMAP_TYPE_GROUP ? "group" : "user", + ent->name); + if (test_bit(CACHE_VALID, &h->flags)) + seq_printf(m, " %u", ent->id); + seq_printf(m, "\n"); + return 0; +} + +static struct ent *nametoid_lookup(struct cache_detail *, struct ent *); +static struct ent *nametoid_update(struct cache_detail *, struct ent *, + struct ent *); +static int nametoid_parse(struct cache_detail *, char *, int); + +static struct cache_detail nametoid_cache_template = { + .owner = THIS_MODULE, + .hash_size = ENT_HASHMAX, + .name = "nfs4.nametoid", + .cache_put = ent_put, + .cache_request = nametoid_request, + .cache_parse = nametoid_parse, + .cache_show = nametoid_show, + .warn_no_listener = warn_no_idmapd, + .match = nametoid_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, +}; + +static int +nametoid_parse(struct cache_detail *cd, char *buf, int buflen) +{ + struct ent ent, *res; + char *buf1; + int len, error = -EINVAL; + + if (buf[buflen - 1] != '\n') + return (-EINVAL); + buf[buflen - 1]= '\0'; + + buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf1 == NULL) + return (-ENOMEM); + + memset(&ent, 0, sizeof(ent)); + + /* Authentication name */ + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) + goto out; + memcpy(ent.authname, buf1, sizeof(ent.authname)); + + /* Type */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.type = strcmp(buf1, "user") == 0 ? + IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; + + /* Name */ + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) + goto out; + memcpy(ent.name, buf1, sizeof(ent.name)); + + /* expiry */ + ent.h.expiry_time = get_expiry(&buf); + if (ent.h.expiry_time == 0) + goto out; + + /* ID */ + error = get_int(&buf, &ent.id); + if (error == -EINVAL) + goto out; + if (error == -ENOENT) + set_bit(CACHE_NEGATIVE, &ent.h.flags); + + error = -ENOMEM; + res = nametoid_lookup(cd, &ent); + if (res == NULL) + goto out; + res = nametoid_update(cd, &ent, res); + if (res == NULL) + goto out; + + cache_put(&res->h, cd); + error = 0; +out: + kfree(buf1); + return (error); +} + + +static struct ent * +nametoid_lookup(struct cache_detail *cd, struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h, + nametoid_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +nametoid_update(struct cache_detail *cd, struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, + nametoid_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +/* + * Exported API + */ + +int +nfsd_idmap_init(struct net *net) +{ + int rv; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nn->idtoname_cache = cache_create_net(&idtoname_cache_template, net); + if (IS_ERR(nn->idtoname_cache)) + return PTR_ERR(nn->idtoname_cache); + rv = cache_register_net(nn->idtoname_cache, net); + if (rv) + goto destroy_idtoname_cache; + nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net); + if (IS_ERR(nn->nametoid_cache)) { + rv = PTR_ERR(nn->nametoid_cache); + goto unregister_idtoname_cache; + } + rv = cache_register_net(nn->nametoid_cache, net); + if (rv) + goto destroy_nametoid_cache; + return 0; + +destroy_nametoid_cache: + cache_destroy_net(nn->nametoid_cache, net); +unregister_idtoname_cache: + cache_unregister_net(nn->idtoname_cache, net); +destroy_idtoname_cache: + cache_destroy_net(nn->idtoname_cache, net); + return rv; +} + +void +nfsd_idmap_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + cache_unregister_net(nn->idtoname_cache, net); + cache_unregister_net(nn->nametoid_cache, net); + cache_destroy_net(nn->idtoname_cache, net); + cache_destroy_net(nn->nametoid_cache, net); +} + +static int +idmap_lookup(struct svc_rqst *rqstp, + struct ent *(*lookup_fn)(struct cache_detail *, struct ent *), + struct ent *key, struct cache_detail *detail, struct ent **item) +{ + int ret; + + *item = lookup_fn(detail, key); + if (!*item) + return -ENOMEM; + retry: + ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle); + + if (ret == -ETIMEDOUT) { + struct ent *prev_item = *item; + *item = lookup_fn(detail, key); + if (*item != prev_item) + goto retry; + cache_put(&(*item)->h, detail); + } + return ret; +} + +static char * +rqst_authname(struct svc_rqst *rqstp) +{ + struct auth_domain *clp; + + clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; + return clp->name; +} + +static __be32 +idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, + u32 *id) +{ + struct ent *item, key = { + .type = type, + }; + int ret; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if (namelen + 1 > sizeof(key.name)) + return nfserr_badowner; + memcpy(key.name, name, namelen); + key.name[namelen] = '\0'; + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item); + if (ret == -ENOENT) + return nfserr_badowner; + if (ret) + return nfserrno(ret); + *id = item->id; + cache_put(&item->h, nn->nametoid_cache); + return 0; +} + +static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id) +{ + char buf[11]; + int len; + __be32 *p; + + len = sprintf(buf, "%u", id); + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, buf, len); + return 0; +} + +static __be32 idmap_id_to_name(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) +{ + struct ent *item, key = { + .id = id, + .type = type, + }; + __be32 *p; + int ret; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); + if (ret == -ENOENT) + return encode_ascii_id(xdr, id); + if (ret) + return nfserrno(ret); + ret = strlen(item->name); + WARN_ON_ONCE(ret > IDMAP_NAMESZ); + p = xdr_reserve_space(xdr, ret + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, item->name, ret); + cache_put(&item->h, nn->idtoname_cache); + return 0; +} + +static bool +numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) +{ + int ret; + char buf[11]; + + if (namelen + 1 > sizeof(buf)) + /* too long to represent a 32-bit id: */ + return false; + /* Just to make sure it's null-terminated: */ + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + ret = kstrtouint(buf, 10, id); + return ret == 0; +} + +static __be32 +do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) +{ + if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) + if (numeric_name_to_id(rqstp, type, name, namelen, id)) + return 0; + /* + * otherwise, fall through and try idmapping, for + * backwards compatibility with clients sending names: + */ + return idmap_name_to_id(rqstp, type, name, namelen, id); +} + +static __be32 encode_name_from_id(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) +{ + if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) + return encode_ascii_id(xdr, id); + return idmap_id_to_name(xdr, rqstp, type, id); +} + +__be32 +nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, + kuid_t *uid) +{ + __be32 status; + u32 id = -1; + status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); + *uid = make_kuid(&init_user_ns, id); + if (!uid_valid(*uid)) + status = nfserr_badowner; + return status; +} + +__be32 +nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, + kgid_t *gid) +{ + __be32 status; + u32 id = -1; + status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); + *gid = make_kgid(&init_user_ns, id); + if (!gid_valid(*gid)) + status = nfserr_badowner; + return status; +} + +__be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kuid_t uid) +{ + u32 id = from_kuid(&init_user_ns, uid); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); +} + +__be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kgid_t gid) +{ + u32 id = from_kgid(&init_user_ns, gid); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); +} diff --git a/kernel/fs/nfsd/nfs4layouts.c b/kernel/fs/nfsd/nfs4layouts.c new file mode 100644 index 000000000..6904213a4 --- /dev/null +++ b/kernel/fs/nfsd/nfs4layouts.c @@ -0,0 +1,723 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include +#include +#include +#include +#include + +#include "pnfs.h" +#include "netns.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct nfs4_layout { + struct list_head lo_perstate; + struct nfs4_layout_stateid *lo_state; + struct nfsd4_layout_seg lo_seg; +}; + +static struct kmem_cache *nfs4_layout_cache; +static struct kmem_cache *nfs4_layout_stateid_cache; + +static struct nfsd4_callback_ops nfsd4_cb_layout_ops; +static const struct lock_manager_operations nfsd4_layouts_lm_ops; + +const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { + [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, +}; + +/* pNFS device ID to export fsid mapping */ +#define DEVID_HASH_BITS 8 +#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS) +#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1) +static u64 nfsd_devid_seq = 1; +static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfsd_devid_lock); + +static inline u32 devid_hashfn(u64 idx) +{ + return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK; +} + +static void +nfsd4_alloc_devid_map(const struct svc_fh *fhp) +{ + const struct knfsd_fh *fh = &fhp->fh_handle; + size_t fsid_len = key_len(fh->fh_fsid_type); + struct nfsd4_deviceid_map *map, *old; + int i; + + map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL); + if (!map) + return; + + map->fsid_type = fh->fh_fsid_type; + memcpy(&map->fsid, fh->fh_fsid, fsid_len); + + spin_lock(&nfsd_devid_lock); + if (fhp->fh_export->ex_devid_map) + goto out_unlock; + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + list_for_each_entry(old, &nfsd_devid_hash[i], hash) { + if (old->fsid_type != fh->fh_fsid_type) + continue; + if (memcmp(old->fsid, fh->fh_fsid, + key_len(old->fsid_type))) + continue; + + fhp->fh_export->ex_devid_map = old; + goto out_unlock; + } + } + + map->idx = nfsd_devid_seq++; + list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]); + fhp->fh_export->ex_devid_map = map; + map = NULL; + +out_unlock: + spin_unlock(&nfsd_devid_lock); + kfree(map); +} + +struct nfsd4_deviceid_map * +nfsd4_find_devid_map(int idx) +{ + struct nfsd4_deviceid_map *map, *ret = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash) + if (map->idx == idx) + ret = map; + rcu_read_unlock(); + + return ret; +} + +int +nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation) +{ + if (!fhp->fh_export->ex_devid_map) { + nfsd4_alloc_devid_map(fhp); + if (!fhp->fh_export->ex_devid_map) + return -ENOMEM; + } + + id->fsid_idx = fhp->fh_export->ex_devid_map->idx; + id->generation = device_generation; + id->pad = 0; + return 0; +} + +void nfsd4_setup_layout_type(struct svc_export *exp) +{ + struct super_block *sb = exp->ex_path.mnt->mnt_sb; + + if (!(exp->ex_flags & NFSEXP_PNFS)) + return; + + if (sb->s_export_op->get_uuid && + sb->s_export_op->map_blocks && + sb->s_export_op->commit_blocks) + exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; +} + +static void +nfsd4_free_layout_stateid(struct nfs4_stid *stid) +{ + struct nfs4_layout_stateid *ls = layoutstateid(stid); + struct nfs4_client *clp = ls->ls_stid.sc_client; + struct nfs4_file *fp = ls->ls_stid.sc_file; + + trace_layoutstate_free(&ls->ls_stid.sc_stateid); + + spin_lock(&clp->cl_lock); + list_del_init(&ls->ls_perclnt); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_del_init(&ls->ls_perfile); + spin_unlock(&fp->fi_lock); + + vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); + fput(ls->ls_file); + + if (ls->ls_recalled) + atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); + + kmem_cache_free(nfs4_layout_stateid_cache, ls); +} + +static int +nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) +{ + struct file_lock *fl; + int status; + + fl = locks_alloc_lock(); + if (!fl) + return -ENOMEM; + locks_init_lock(fl); + fl->fl_lmops = &nfsd4_layouts_lm_ops; + fl->fl_flags = FL_LAYOUT; + fl->fl_type = F_RDLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = ls; + fl->fl_pid = current->tgid; + fl->fl_file = ls->ls_file; + + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); + if (status) { + locks_free_lock(fl); + return status; + } + BUG_ON(fl != NULL); + return 0; +} + +static struct nfs4_layout_stateid * +nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, + struct nfs4_stid *parent, u32 layout_type) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_file *fp = parent->sc_file; + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stp; + + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); + if (!stp) + return NULL; + stp->sc_free = nfsd4_free_layout_stateid; + get_nfs4_file(fp); + stp->sc_file = fp; + + ls = layoutstateid(stp); + INIT_LIST_HEAD(&ls->ls_perclnt); + INIT_LIST_HEAD(&ls->ls_perfile); + spin_lock_init(&ls->ls_lock); + INIT_LIST_HEAD(&ls->ls_layouts); + ls->ls_layout_type = layout_type; + nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, + NFSPROC4_CLNT_CB_LAYOUT); + + if (parent->sc_type == NFS4_DELEG_STID) + ls->ls_file = get_file(fp->fi_deleg_file); + else + ls->ls_file = find_any_file(fp); + BUG_ON(!ls->ls_file); + + if (nfsd4_layout_setlease(ls)) { + put_nfs4_file(fp); + kmem_cache_free(nfs4_layout_stateid_cache, ls); + return NULL; + } + + spin_lock(&clp->cl_lock); + stp->sc_type = NFS4_LAYOUT_STID; + list_add(&ls->ls_perclnt, &clp->cl_lo_states); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_add(&ls->ls_perfile, &fp->fi_lo_states); + spin_unlock(&fp->fi_lock); + + trace_layoutstate_alloc(&ls->ls_stid.sc_stateid); + return ls; +} + +__be32 +nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stid; + unsigned char typemask = NFS4_LAYOUT_STID; + __be32 status; + + if (create) + typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + net_generic(SVC_NET(rqstp), nfsd_net_id)); + if (status) + goto out; + + if (!fh_match(&cstate->current_fh.fh_handle, + &stid->sc_file->fi_fhandle)) { + status = nfserr_bad_stateid; + goto out_put_stid; + } + + if (stid->sc_type != NFS4_LAYOUT_STID) { + ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); + nfs4_put_stid(stid); + + status = nfserr_jukebox; + if (!ls) + goto out; + } else { + ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); + + status = nfserr_bad_stateid; + if (stateid->si_generation > stid->sc_stateid.si_generation) + goto out_put_stid; + if (layout_type != ls->ls_layout_type) + goto out_put_stid; + } + + *lsp = ls; + return 0; + +out_put_stid: + nfs4_put_stid(stid); +out: + return status; +} + +static void +nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) +{ + spin_lock(&ls->ls_lock); + if (ls->ls_recalled) + goto out_unlock; + + ls->ls_recalled = true; + atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); + if (list_empty(&ls->ls_layouts)) + goto out_unlock; + + trace_layout_recall(&ls->ls_stid.sc_stateid); + + atomic_inc(&ls->ls_stid.sc_count); + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + nfsd4_run_cb(&ls->ls_recall); + +out_unlock: + spin_unlock(&ls->ls_lock); +} + +static inline u64 +layout_end(struct nfsd4_layout_seg *seg) +{ + u64 end = seg->offset + seg->length; + return end >= seg->offset ? end : NFS4_MAX_UINT64; +} + +static void +layout_update_len(struct nfsd4_layout_seg *lo, u64 end) +{ + if (end == NFS4_MAX_UINT64) + lo->length = NFS4_MAX_UINT64; + else + lo->length = end - lo->offset; +} + +static bool +layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s) +{ + if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode) + return false; + if (layout_end(&lo->lo_seg) <= s->offset) + return false; + if (layout_end(s) <= lo->lo_seg.offset) + return false; + return true; +} + +static bool +layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new) +{ + if (lo->iomode != new->iomode) + return false; + if (layout_end(new) < lo->offset) + return false; + if (layout_end(lo) < new->offset) + return false; + + lo->offset = min(lo->offset, new->offset); + layout_update_len(lo, max(layout_end(lo), layout_end(new))); + return true; +} + +static __be32 +nfsd4_recall_conflict(struct nfs4_layout_stateid *ls) +{ + struct nfs4_file *fp = ls->ls_stid.sc_file; + struct nfs4_layout_stateid *l, *n; + __be32 nfserr = nfs_ok; + + assert_spin_locked(&fp->fi_lock); + + list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) { + if (l != ls) { + nfsd4_recall_file_layout(l); + nfserr = nfserr_recallconflict; + } + } + + return nfserr; +} + +__be32 +nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) +{ + struct nfsd4_layout_seg *seg = &lgp->lg_seg; + struct nfs4_file *fp = ls->ls_stid.sc_file; + struct nfs4_layout *lp, *new = NULL; + __be32 nfserr; + + spin_lock(&fp->fi_lock); + nfserr = nfsd4_recall_conflict(ls); + if (nfserr) + goto out; + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + spin_unlock(&ls->ls_lock); + spin_unlock(&fp->fi_lock); + + new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); + if (!new) + return nfserr_jukebox; + memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); + new->lo_state = ls; + + spin_lock(&fp->fi_lock); + nfserr = nfsd4_recall_conflict(ls); + if (nfserr) + goto out; + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + + atomic_inc(&ls->ls_stid.sc_count); + list_add_tail(&new->lo_perstate, &ls->ls_layouts); + new = NULL; +done: + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + spin_unlock(&ls->ls_lock); +out: + spin_unlock(&fp->fi_lock); + if (new) + kmem_cache_free(nfs4_layout_cache, new); + return nfserr; +} + +static void +nfsd4_free_layouts(struct list_head *reaplist) +{ + while (!list_empty(reaplist)) { + struct nfs4_layout *lp = list_first_entry(reaplist, + struct nfs4_layout, lo_perstate); + + list_del(&lp->lo_perstate); + nfs4_put_stid(&lp->lo_state->ls_stid); + kmem_cache_free(nfs4_layout_cache, lp); + } +} + +static void +nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, + struct list_head *reaplist) +{ + struct nfsd4_layout_seg *lo = &lp->lo_seg; + u64 end = layout_end(lo); + + if (seg->offset <= lo->offset) { + if (layout_end(seg) >= end) { + list_move_tail(&lp->lo_perstate, reaplist); + return; + } + lo->offset = layout_end(seg); + } else { + /* retain the whole layout segment on a split. */ + if (layout_end(seg) < end) { + dprintk("%s: split not supported\n", __func__); + return; + } + end = seg->offset; + } + + layout_update_len(lo, end); +} + +__be32 +nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_layout *lp, *n; + LIST_HEAD(reaplist); + __be32 nfserr; + int found = 0; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid, + false, lrp->lr_layout_type, + &ls); + if (nfserr) { + trace_layout_return_lookup_fail(&lrp->lr_sid); + return nfserr; + } + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) { + if (layouts_overlapping(lp, &lrp->lr_seg)) { + nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist); + found++; + } + } + if (!list_empty(&ls->ls_layouts)) { + if (found) { + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, + sizeof(stateid_t)); + } + lrp->lrs_present = 1; + } else { + trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); + nfs4_unhash_stid(&ls->ls_stid); + lrp->lrs_present = 0; + } + spin_unlock(&ls->ls_lock); + + nfs4_put_stid(&ls->ls_stid); + nfsd4_free_layouts(&reaplist); + return nfs_ok; +} + +__be32 +nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls, *n; + struct nfs4_client *clp = cstate->clp; + struct nfs4_layout *lp, *t; + LIST_HEAD(reaplist); + + lrp->lrs_present = 0; + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { + if (ls->ls_layout_type != lrp->lr_layout_type) + continue; + + if (lrp->lr_return_type == RETURN_FSID && + !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, + &cstate->current_fh.fh_handle)) + continue; + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) { + if (lrp->lr_seg.iomode == IOMODE_ANY || + lrp->lr_seg.iomode == lp->lo_seg.iomode) + list_move_tail(&lp->lo_perstate, &reaplist); + } + spin_unlock(&ls->ls_lock); + } + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); + return 0; +} + +static void +nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls, + struct list_head *reaplist) +{ + spin_lock(&ls->ls_lock); + list_splice_init(&ls->ls_layouts, reaplist); + spin_unlock(&ls->ls_lock); +} + +void +nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) + nfsd4_return_all_layouts(ls, &reaplist); + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); +} + +void +nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&fp->fi_lock); + list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) { + if (ls->ls_stid.sc_client == clp) + nfsd4_return_all_layouts(ls, &reaplist); + } + spin_unlock(&fp->fi_lock); + + nfsd4_free_layouts(&reaplist); +} + +static void +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +{ + struct nfs4_client *clp = ls->ls_stid.sc_client; + char addr_str[INET6_ADDRSTRLEN]; + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[8]; + int error; + + rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); + + trace_layout_recall_fail(&ls->ls_stid.sc_stateid); + + printk(KERN_WARNING + "nfsd: client %s failed to respond to layout recall. " + " Fencing..\n", addr_str); + + argv[0] = "/sbin/nfsd-recall-failed"; + argv[1] = addr_str; + argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; + argv[3] = NULL; + + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (error) { + printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n", + addr_str, error); + } +} + +static int +nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + LIST_HEAD(reaplist); + + switch (task->tk_status) { + case 0: + return 1; + case -NFS4ERR_NOMATCHING_LAYOUT: + trace_layout_recall_done(&ls->ls_stid.sc_stateid); + task->tk_status = 0; + return 1; + case -NFS4ERR_DELAY: + /* Poll the client until it's done with the layout */ + /* FIXME: cap number of retries. + * The pnfs standard states that we need to only expire + * the client after at-least "lease time" .eg lease-time * 2 + * when failing to communicate a recall + */ + rpc_delay(task, HZ/100); /* 10 mili-seconds */ + return 0; + default: + /* + * Unknown error or non-responding client, we'll need to fence. + */ + nfsd4_cb_layout_fail(ls); + return -1; + } +} + +static void +nfsd4_cb_layout_release(struct nfsd4_callback *cb) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + LIST_HEAD(reaplist); + + trace_layout_recall_release(&ls->ls_stid.sc_stateid); + + nfsd4_return_all_layouts(ls, &reaplist); + nfsd4_free_layouts(&reaplist); + nfs4_put_stid(&ls->ls_stid); +} + +static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { + .done = nfsd4_cb_layout_done, + .release = nfsd4_cb_layout_release, +}; + +static bool +nfsd4_layout_lm_break(struct file_lock *fl) +{ + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if a layout isn't returned + * in time: + */ + fl->fl_break_time = 0; + nfsd4_recall_file_layout(fl->fl_owner); + return false; +} + +static int +nfsd4_layout_lm_change(struct file_lock *onlist, int arg, + struct list_head *dispose) +{ + BUG_ON(!(arg & F_UNLCK)); + return lease_modify(onlist, arg, dispose); +} + +static const struct lock_manager_operations nfsd4_layouts_lm_ops = { + .lm_break = nfsd4_layout_lm_break, + .lm_change = nfsd4_layout_lm_change, +}; + +int +nfsd4_init_pnfs(void) +{ + int i; + + for (i = 0; i < DEVID_HASH_SIZE; i++) + INIT_LIST_HEAD(&nfsd_devid_hash[i]); + + nfs4_layout_cache = kmem_cache_create("nfs4_layout", + sizeof(struct nfs4_layout), 0, 0, NULL); + if (!nfs4_layout_cache) + return -ENOMEM; + + nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", + sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + if (!nfs4_layout_stateid_cache) { + kmem_cache_destroy(nfs4_layout_cache); + return -ENOMEM; + } + return 0; +} + +void +nfsd4_exit_pnfs(void) +{ + int i; + + kmem_cache_destroy(nfs4_layout_cache); + kmem_cache_destroy(nfs4_layout_stateid_cache); + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + struct nfsd4_deviceid_map *map, *n; + + list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash) + kfree(map); + } +} diff --git a/kernel/fs/nfsd/nfs4proc.c b/kernel/fs/nfsd/nfs4proc.c new file mode 100644 index 000000000..864e2003e --- /dev/null +++ b/kernel/fs/nfsd/nfs4proc.c @@ -0,0 +1,2368 @@ +/* + * Server-side procedures for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include + +#include "idmap.h" +#include "cache.h" +#include "xdr4.h" +#include "vfs.h" +#include "current_stateid.h" +#include "netns.h" +#include "acl.h" +#include "pnfs.h" +#include "trace.h" + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#include + +static inline void +nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) +{ + struct inode *inode = d_inode(resfh->fh_dentry); + int status; + + mutex_lock(&inode->i_mutex); + status = security_inode_setsecctx(resfh->fh_dentry, + label->data, label->len); + mutex_unlock(&inode->i_mutex); + + if (status) + /* + * XXX: We should really fail the whole open, but we may + * already have created a new file, so it may be too + * late. For now this seems the least of evils: + */ + bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + + return; +} +#else +static inline void +nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) +{ } +#endif + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static u32 nfsd_attrmask[] = { + NFSD_WRITEABLE_ATTRS_WORD0, + NFSD_WRITEABLE_ATTRS_WORD1, + NFSD_WRITEABLE_ATTRS_WORD2 +}; + +static u32 nfsd41_ex_attrmask[] = { + NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2 +}; + +static __be32 +check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + u32 *bmval, u32 *writable) +{ + struct dentry *dentry = cstate->current_fh.fh_dentry; + + /* + * Check about attributes are supported by the NFSv4 server or not. + * According to spec, unsupported attributes return ERR_ATTRNOTSUPP. + */ + if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) || + (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) || + (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) + return nfserr_attrnotsupp; + + /* + * Check FATTR4_WORD0_ACL can be supported + * in current environment or not. + */ + if (bmval[0] & FATTR4_WORD0_ACL) { + if (!IS_POSIXACL(d_inode(dentry))) + return nfserr_attrnotsupp; + } + + /* + * According to spec, read-only attributes return ERR_INVAL. + */ + if (writable) { + if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || + (bmval[2] & ~writable[2])) + return nfserr_inval; + } + + return nfs_ok; +} + +static __be32 +nfsd4_check_open_attributes(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +{ + __be32 status = nfs_ok; + + if (open->op_create == NFS4_OPEN_CREATE) { + if (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED) + status = check_attr_support(rqstp, cstate, + open->op_bmval, nfsd_attrmask); + else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1) + status = check_attr_support(rqstp, cstate, + open->op_bmval, nfsd41_ex_attrmask); + } + + return status; +} + +static int +is_create_with_attrs(struct nfsd4_open *open) +{ + return open->op_create == NFS4_OPEN_CREATE + && (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED + || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1); +} + +/* + * if error occurs when setting the acl, just clear the acl bit + * in the returned attr bitmap. + */ +static void +do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl, u32 *bmval) +{ + __be32 status; + + status = nfsd4_set_nfs4_acl(rqstp, fhp, acl); + if (status) + /* + * We should probably fail the whole open at this point, + * but we've already created the file, so it's too late; + * So this seems the least of evils: + */ + bmval[0] &= ~FATTR4_WORD0_ACL; +} + +static inline void +fh_dup2(struct svc_fh *dst, struct svc_fh *src) +{ + fh_put(dst); + dget(src->fh_dentry); + if (src->fh_export) + exp_get(src->fh_export); + *dst = *src; +} + +static __be32 +do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode) +{ + __be32 status; + + if (open->op_truncate && + !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) + return nfserr_inval; + + accmode |= NFSD_MAY_READ_IF_EXEC; + + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) + accmode |= NFSD_MAY_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); + if (open->op_share_deny & NFS4_SHARE_DENY_READ) + accmode |= NFSD_MAY_WRITE; + + status = fh_verify(rqstp, current_fh, S_IFREG, accmode); + + return status; +} + +static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) +{ + umode_t mode = d_inode(fh->fh_dentry)->i_mode; + + if (S_ISREG(mode)) + return nfs_ok; + if (S_ISDIR(mode)) + return nfserr_isdir; + /* + * Using err_symlink as our catch-all case may look odd; but + * there's no other obvious error for this case in 4.0, and we + * happen to know that it will cause the linux v4 client to do + * the right thing on attempts to open something other than a + * regular file. + */ + return nfserr_symlink; +} + +static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh) +{ + if (nfsd4_has_session(cstate)) + return; + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, + &resfh->fh_handle); +} + +static __be32 +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) +{ + struct svc_fh *current_fh = &cstate->current_fh; + int accmode; + __be32 status; + + *resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + if (!*resfh) + return nfserr_jukebox; + fh_init(*resfh, NFS4_FHSIZE); + open->op_truncate = 0; + + if (open->op_create) { + /* FIXME: check session persistence and pnfs flags. + * The nfsv4.1 spec requires the following semantics: + * + * Persistent | pNFS | Server REQUIRED | Client Allowed + * Reply Cache | server | | + * -------------+--------+-----------------+-------------------- + * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * | | | (SHOULD) + * | | and EXCLUSIVE4 | or EXCLUSIVE4 + * | | | (SHOULD NOT) + * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * yes | no | GUARDED4 | GUARDED4 + * yes | yes | GUARDED4 | GUARDED4 + */ + + /* + * Note: create modes (UNCHECKED,GUARDED...) are the same + * in NFSv4 as in v3 except EXCLUSIVE4_1. + */ + status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, + open->op_fname.len, &open->op_iattr, + *resfh, open->op_createmode, + (u32 *)open->op_verf.data, + &open->op_truncate, &open->op_created); + + if (!status && open->op_label.len) + nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); + + /* + * Following rfc 3530 14.2.16, use the returned bitmask + * to indicate which attributes we used to store the + * verifier: + */ + if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) + open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_MODIFY); + } else + /* + * Note this may exit with the parent still locked. + * We will hold the lock until nfsd4_open's final + * lookup, to prevent renames or unlinks until we've had + * a chance to an acquire a delegation if appropriate. + */ + status = nfsd_lookup(rqstp, current_fh, + open->op_fname.data, open->op_fname.len, *resfh); + if (status) + goto out; + status = nfsd_check_obj_isreg(*resfh); + if (status) + goto out; + + if (is_create_with_attrs(open) && open->op_acl != NULL) + do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval); + + nfsd4_set_open_owner_reply_cache(cstate, open, *resfh); + accmode = NFSD_MAY_NOP; + if (open->op_created || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + accmode |= NFSD_MAY_OWNER_OVERRIDE; + status = do_open_permission(rqstp, *resfh, open, accmode); + set_change_info(&open->op_cinfo, current_fh); +out: + return status; +} + +static __be32 +do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +{ + struct svc_fh *current_fh = &cstate->current_fh; + __be32 status; + int accmode = 0; + + /* We don't know the target directory, and therefore can not + * set the change info + */ + + memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); + + nfsd4_set_open_owner_reply_cache(cstate, open, current_fh); + + open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && + (open->op_iattr.ia_size == 0); + /* + * In the delegation case, the client is telling us about an + * open that it *already* performed locally, some time ago. We + * should let it succeed now if possible. + * + * In the case of a CLAIM_FH open, on the other hand, the client + * may be counting on us to enforce permissions (the Linux 4.1 + * client uses this for normal opens, for example). + */ + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH) + accmode = NFSD_MAY_OWNER_OVERRIDE; + + status = do_open_permission(rqstp, current_fh, open, accmode); + + return status; +} + +static void +copy_clientid(clientid_t *clid, struct nfsd4_session *session) +{ + struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)session->se_sessionid.data; + + clid->cl_boot = sid->clientid.cl_boot; + clid->cl_id = sid->clientid.cl_id; +} + +static __be32 +nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) +{ + __be32 status; + struct svc_fh *resfh = NULL; + struct nfsd4_compoundres *resp; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", + (int)open->op_fname.len, open->op_fname.data, + open->op_openowner); + + /* This check required by spec. */ + if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) + return nfserr_inval; + + open->op_created = 0; + /* + * RFC5661 18.51.3 + * Before RECLAIM_COMPLETE done, server should deny new lock + */ + if (nfsd4_has_session(cstate) && + !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, + &cstate->session->se_client->cl_flags) && + open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + return nfserr_grace; + + if (nfsd4_has_session(cstate)) + copy_clientid(&open->op_clientid, cstate->session); + + /* check seqid for replay. set nfs4_owner */ + resp = rqstp->rq_resp; + status = nfsd4_process_open1(&resp->cstate, open, nn); + if (status == nfserr_replay_me) { + struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; + fh_put(&cstate->current_fh); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + dprintk("nfsd4_open: replay failed" + " restoring previous filehandle\n"); + else + status = nfserr_replay_me; + } + if (status) + goto out; + if (open->op_xdr_error) { + status = open->op_xdr_error; + goto out; + } + + status = nfsd4_check_open_attributes(rqstp, cstate, open); + if (status) + goto out; + + /* Openowner is now set, so sequence id will get bumped. Now we need + * these checks before we do any creates: */ + status = nfserr_grace; + if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + goto out; + status = nfserr_no_grace; + if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + goto out; + + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_NULL: + status = do_open_lookup(rqstp, cstate, open, &resfh); + if (status) + goto out; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + status = nfs4_check_open_reclaim(&open->op_clientid, + cstate, nn); + if (status) + goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + status = do_open_fhandle(rqstp, cstate, open); + if (status) + goto out; + resfh = &cstate->current_fh; + break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + dprintk("NFSD: unsupported OPEN claim type %d\n", + open->op_claim_type); + status = nfserr_notsupp; + goto out; + default: + dprintk("NFSD: Invalid OPEN claim type %d\n", + open->op_claim_type); + status = nfserr_inval; + goto out; + } + /* + * nfsd4_process_open2() does the actual opening of the file. If + * successful, it (1) truncates the file if open->op_truncate was + * set, (2) sets open->op_stateid, (3) sets open->op_delegation. + */ + status = nfsd4_process_open2(rqstp, resfh, open); + WARN(status && open->op_created, + "nfsd4_process_open2 failed to open newly-created file! status=%u\n", + be32_to_cpu(status)); +out: + if (resfh && resfh != &cstate->current_fh) { + fh_dup2(&cstate->current_fh, resfh); + fh_put(resfh); + kfree(resfh); + } + nfsd4_cleanup_open_state(cstate, open); + nfsd4_bump_seqid(cstate, status); + return status; +} + +/* + * OPEN is the only seqid-mutating operation whose decoding can fail + * with a seqid-mutating error (specifically, decoding of user names in + * the attributes). Therefore we have to do some processing to look up + * the stateowner so that we can bump the seqid. + */ +static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op) +{ + struct nfsd4_open *open = (struct nfsd4_open *)&op->u; + + if (!seqid_mutating_err(ntohl(op->status))) + return op->status; + if (nfsd4_has_session(cstate)) + return op->status; + open->op_xdr_error = op->status; + return nfsd4_open(rqstp, cstate, open); +} + +/* + * filehandle-manipulating ops. + */ +static __be32 +nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct svc_fh **getfh) +{ + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + + *getfh = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_putfh *putfh) +{ + fh_put(&cstate->current_fh); + cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; + memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, + putfh->pf_fhlen); + return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS); +} + +static __be32 +nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + __be32 status; + + fh_put(&cstate->current_fh); + status = exp_pseudoroot(rqstp, &cstate->current_fh); + return status; +} + +static __be32 +nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + if (!cstate->save_fh.fh_dentry) + return nfserr_restorefh; + + fh_dup2(&cstate->current_fh, &cstate->save_fh); + if (HAS_STATE_ID(cstate, SAVED_STATE_ID_FLAG)) { + memcpy(&cstate->current_stateid, &cstate->save_stateid, sizeof(stateid_t)); + SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); + } + return nfs_ok; +} + +static __be32 +nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + + fh_dup2(&cstate->save_fh, &cstate->current_fh); + if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) { + memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t)); + SET_STATE_ID(cstate, SAVED_STATE_ID_FLAG); + } + return nfs_ok; +} + +/* + * misc nfsv4 ops + */ +static __be32 +nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_access *access) +{ + if (access->ac_req_access & ~NFS3_ACCESS_FULL) + return nfserr_inval; + + access->ac_resp_access = access->ac_req_access; + return nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access, + &access->ac_supported); +} + +static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) +{ + __be32 verf[2]; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy + */ + verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec; + memcpy(verifier->data, verf, sizeof(verifier->data)); +} + +static __be32 +nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) +{ + gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp)); + return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); +} + +static __be32 +nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_create *create) +{ + struct svc_fh resfh; + __be32 status; + dev_t rdev; + + fh_init(&resfh, NFS4_FHSIZE); + + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, + NFSD_MAY_CREATE); + if (status) + return status; + + status = check_attr_support(rqstp, cstate, create->cr_bmval, + nfsd_attrmask); + if (status) + return status; + + switch (create->cr_type) { + case NF4LNK: + status = nfsd_symlink(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + create->cr_data, &resfh); + break; + + case NF4BLK: + rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); + if (MAJOR(rdev) != create->cr_specdata1 || + MINOR(rdev) != create->cr_specdata2) + return nfserr_inval; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFBLK, rdev, &resfh); + break; + + case NF4CHR: + rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); + if (MAJOR(rdev) != create->cr_specdata1 || + MINOR(rdev) != create->cr_specdata2) + return nfserr_inval; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr,S_IFCHR, rdev, &resfh); + break; + + case NF4SOCK: + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFSOCK, 0, &resfh); + break; + + case NF4FIFO: + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFIFO, 0, &resfh); + break; + + case NF4DIR: + create->cr_iattr.ia_valid &= ~ATTR_SIZE; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFDIR, 0, &resfh); + break; + + default: + status = nfserr_badtype; + } + + if (status) + goto out; + + if (create->cr_label.len) + nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval); + + if (create->cr_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, + create->cr_bmval); + + fh_unlock(&cstate->current_fh); + set_change_info(&create->cr_cinfo, &cstate->current_fh); + fh_dup2(&cstate->current_fh, &resfh); +out: + fh_put(&resfh); + return status; +} + +static __be32 +nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_getattr *getattr) +{ + __be32 status; + + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + return status; + + if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) + return nfserr_inval; + + getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + + getattr->ga_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_link *link) +{ + __be32 status = nfserr_nofilehandle; + + if (!cstate->save_fh.fh_dentry) + return status; + status = nfsd_link(rqstp, &cstate->current_fh, + link->li_name, link->li_namelen, &cstate->save_fh); + if (!status) + set_change_info(&link->li_cinfo, &cstate->current_fh); + return status; +} + +static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) +{ + struct svc_fh tmp_fh; + __be32 ret; + + fh_init(&tmp_fh, NFS4_FHSIZE); + ret = exp_pseudoroot(rqstp, &tmp_fh); + if (ret) + return ret; + if (tmp_fh.fh_dentry == fh->fh_dentry) { + fh_put(&tmp_fh); + return nfserr_noent; + } + fh_put(&tmp_fh); + return nfsd_lookup(rqstp, fh, "..", 2, fh); +} + +static __be32 +nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + return nfsd4_do_lookupp(rqstp, &cstate->current_fh); +} + +static __be32 +nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lookup *lookup) +{ + return nfsd_lookup(rqstp, &cstate->current_fh, + lookup->lo_name, lookup->lo_len, + &cstate->current_fh); +} + +static __be32 +nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_read *read) +{ + __be32 status; + + /* no need to check permission - this will be done in nfsd_read() */ + + read->rd_filp = NULL; + if (read->rd_offset >= OFFSET_MAX) + return nfserr_inval; + + /* + * If we do a zero copy read, then a client will see read data + * that reflects the state of the file *after* performing the + * following compound. + * + * To ensure proper ordering, we therefore turn off zero copy if + * the client wants us to do more in this compound: + */ + if (!nfsd4_last_compound_op(rqstp)) + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + + /* check stateid */ + if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), + cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp))) { + dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); + goto out; + } + status = nfs_ok; +out: + read->rd_rqstp = rqstp; + read->rd_fhp = &cstate->current_fh; + return status; +} + +static __be32 +nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_readdir *readdir) +{ + u64 cookie = readdir->rd_cookie; + static const nfs4_verifier zeroverf; + + /* no need to check permission - this will be done in nfsd_readdir() */ + + if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) + return nfserr_inval; + + readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + + if ((cookie == 1) || (cookie == 2) || + (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) + return nfserr_bad_cookie; + + readdir->rd_rqstp = rqstp; + readdir->rd_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_readlink(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_readlink *readlink) +{ + readlink->rl_rqstp = rqstp; + readlink->rl_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_remove *remove) +{ + __be32 status; + + if (locks_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + status = nfsd_unlink(rqstp, &cstate->current_fh, 0, + remove->rm_name, remove->rm_namelen); + if (!status) { + fh_unlock(&cstate->current_fh); + set_change_info(&remove->rm_cinfo, &cstate->current_fh); + } + return status; +} + +static __be32 +nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_rename *rename) +{ + __be32 status = nfserr_nofilehandle; + + if (!cstate->save_fh.fh_dentry) + return status; + if (locks_in_grace(SVC_NET(rqstp)) && + !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) + return nfserr_grace; + status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, + rename->rn_snamelen, &cstate->current_fh, + rename->rn_tname, rename->rn_tnamelen); + if (status) + return status; + set_change_info(&rename->rn_sinfo, &cstate->current_fh); + set_change_info(&rename->rn_tinfo, &cstate->save_fh); + return nfs_ok; +} + +static __be32 +nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo *secinfo) +{ + struct svc_fh resfh; + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + fh_init(&resfh, NFS4_FHSIZE); + err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; + err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, + secinfo->si_name, secinfo->si_namelen, + &exp, &dentry); + if (err) + return err; + if (d_really_is_negative(dentry)) { + exp_put(exp); + err = nfserr_noent; + } else + secinfo->si_exp = exp; + dput(dentry); + if (cstate->minorversion) + /* See rfc 5661 section 2.6.3.1.1.8 */ + fh_put(&cstate->current_fh); + return err; +} + +static __be32 +nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo_no_name *sin) +{ + __be32 err; + + switch (sin->sin_style) { + case NFS4_SECINFO_STYLE4_CURRENT_FH: + break; + case NFS4_SECINFO_STYLE4_PARENT: + err = nfsd4_do_lookupp(rqstp, &cstate->current_fh); + if (err) + return err; + break; + default: + return nfserr_inval; + } + + sin->sin_exp = exp_get(cstate->current_fh.fh_export); + fh_put(&cstate->current_fh); + return nfs_ok; +} + +static __be32 +nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setattr *setattr) +{ + __be32 status = nfs_ok; + int err; + + if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + &setattr->sa_stateid, WR_STATE, NULL); + if (status) { + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + return status; + } + } + err = fh_want_write(&cstate->current_fh); + if (err) + return nfserrno(err); + status = nfs_ok; + + status = check_attr_support(rqstp, cstate, setattr->sa_bmval, + nfsd_attrmask); + if (status) + goto out; + + if (setattr->sa_acl != NULL) + status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, + setattr->sa_acl); + if (status) + goto out; + if (setattr->sa_label.len) + status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh, + &setattr->sa_label); + if (status) + goto out; + status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, + 0, (time_t)0); +out: + fh_drop_write(&cstate->current_fh); + return status; +} + +static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write) +{ + int i = 1; + int buflen = write->wr_buflen; + + vec[0].iov_base = write->wr_head.iov_base; + vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len); + buflen -= vec[0].iov_len; + + while (buflen) { + vec[i].iov_base = page_address(write->wr_pagelist[i - 1]); + vec[i].iov_len = min_t(int, PAGE_SIZE, buflen); + buflen -= vec[i].iov_len; + i++; + } + return i; +} + +static __be32 +nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_write *write) +{ + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; + __be32 status = nfs_ok; + unsigned long cnt; + int nvecs; + + /* no need to check permission - this will be done in nfsd_write() */ + + if (write->wr_offset >= OFFSET_MAX) + return nfserr_inval; + + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), + cstate, stateid, WR_STATE, &filp); + if (status) { + dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + return status; + } + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; + gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp)); + + nvecs = fill_in_write_vector(rqstp->rq_vec, write); + WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); + + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, nvecs, + &cnt, &write->wr_how_written); + if (filp) + fput(filp); + + write->wr_bytes_written = cnt; + + return status; +} + +static __be32 +nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate, int flags) +{ + __be32 status = nfserr_notsupp; + struct file *file; + + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + &fallocate->falloc_stateid, + WR_STATE, &file); + if (status != nfs_ok) { + dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + return status; + } + if (!file) + return nfserr_bad_stateid; + + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + fallocate->falloc_offset, + fallocate->falloc_length, + flags); + fput(file); + return status; +} + +static __be32 +nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate) +{ + return nfsd4_fallocate(rqstp, cstate, fallocate, 0); +} + +static __be32 +nfsd4_deallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate) +{ + return nfsd4_fallocate(rqstp, cstate, fallocate, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE); +} + +static __be32 +nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_seek *seek) +{ + int whence; + __be32 status; + struct file *file; + + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + &seek->seek_stateid, + RD_STATE, &file); + if (status) { + dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + return status; + } + if (!file) + return nfserr_bad_stateid; + + switch (seek->seek_whence) { + case NFS4_CONTENT_DATA: + whence = SEEK_DATA; + break; + case NFS4_CONTENT_HOLE: + whence = SEEK_HOLE; + break; + default: + status = nfserr_union_notsupp; + goto out; + } + + /* + * Note: This call does change file->f_pos, but nothing in NFSD + * should ever file->f_pos. + */ + seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence); + if (seek->seek_pos < 0) + status = nfserrno(seek->seek_pos); + else if (seek->seek_pos >= i_size_read(file_inode(file))) + seek->seek_eof = true; + +out: + fput(file); + return status; +} + +/* This routine never returns NFS_OK! If there are no other errors, it + * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the + * attributes matched. VERIFY is implemented by mapping NFSERR_SAME + * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK. + */ +static __be32 +_nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 *buf, *p; + int count; + __be32 status; + + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + return status; + + status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL); + if (status) + return status; + + if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) + || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) + return nfserr_inval; + if (verify->ve_attrlen & 3) + return nfserr_inval; + + /* count in words: + * bitmap_len(1) + bitmap(2) + attr_len(1) = 4 + */ + count = 4 + (verify->ve_attrlen >> 2); + buf = kmalloc(count << 2, GFP_KERNEL); + if (!buf) + return nfserr_jukebox; + + p = buf; + status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh, + cstate->current_fh.fh_export, + cstate->current_fh.fh_dentry, + verify->ve_bmval, + rqstp, 0); + /* + * If nfsd4_encode_fattr() ran out of space, assume that's because + * the attributes are longer (hence different) than those given: + */ + if (status == nfserr_resource) + status = nfserr_not_same; + if (status) + goto out_kfree; + + /* skip bitmap */ + p = buf + 1 + ntohl(buf[0]); + status = nfserr_not_same; + if (ntohl(*p++) != verify->ve_attrlen) + goto out_kfree; + if (!memcmp(p, verify->ve_attrval, verify->ve_attrlen)) + status = nfserr_same; + +out_kfree: + kfree(buf); + return status; +} + +static __be32 +nfsd4_nverify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, verify); + return status == nfserr_not_same ? nfs_ok : status; +} + +static __be32 +nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, verify); + return status == nfserr_same ? nfs_ok : status; +} + +#ifdef CONFIG_NFSD_PNFS +static const struct nfsd4_layout_ops * +nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) +{ + if (!exp->ex_layout_type) { + dprintk("%s: export does not support pNFS\n", __func__); + return NULL; + } + + if (exp->ex_layout_type != layout_type) { + dprintk("%s: layout type %d not supported\n", + __func__, layout_type); + return NULL; + } + + return nfsd4_layout_ops[layout_type]; +} + +static __be32 +nfsd4_getdeviceinfo(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_getdeviceinfo *gdp) +{ + const struct nfsd4_layout_ops *ops; + struct nfsd4_deviceid_map *map; + struct svc_export *exp; + __be32 nfserr; + + dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n", + __func__, + gdp->gd_layout_type, + gdp->gd_devid.fsid_idx, gdp->gd_devid.generation, + gdp->gd_maxcount); + + map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx); + if (!map) { + dprintk("%s: couldn't find device ID to export mapping!\n", + __func__); + return nfserr_noent; + } + + exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid); + if (IS_ERR(exp)) { + dprintk("%s: could not find device id\n", __func__); + return nfserr_noent; + } + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(exp, gdp->gd_layout_type); + if (!ops) + goto out; + + nfserr = nfs_ok; + if (gdp->gd_maxcount != 0) + nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); + + gdp->gd_notify_types &= ops->notify_types; +out: + exp_put(exp); + return nfserr; +} + +static __be32 +nfsd4_layoutget(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutget *lgp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + int accmode; + + switch (lgp->lg_seg.iomode) { + case IOMODE_READ: + accmode = NFSD_MAY_READ; + break; + case IOMODE_RW: + accmode = NFSD_MAY_READ | NFSD_MAY_WRITE; + break; + default: + dprintk("%s: invalid iomode %d\n", + __func__, lgp->lg_seg.iomode); + nfserr = nfserr_badiomode; + goto out; + } + + nfserr = fh_verify(rqstp, current_fh, 0, accmode); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type); + if (!ops) + goto out; + + /* + * Verify minlength and range as per RFC5661: + * o If loga_length is less than loga_minlength, + * the metadata server MUST return NFS4ERR_INVAL. + * o If the sum of loga_offset and loga_minlength exceeds + * NFS4_UINT64_MAX, and loga_minlength is not + * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result. + * o If the sum of loga_offset and loga_length exceeds + * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX, + * the error NFS4ERR_INVAL MUST result. + */ + nfserr = nfserr_inval; + if (lgp->lg_seg.length < lgp->lg_minlength || + (lgp->lg_minlength != NFS4_MAX_UINT64 && + lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) || + (lgp->lg_seg.length != NFS4_MAX_UINT64 && + lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset)) + goto out; + if (lgp->lg_seg.length == 0) + goto out; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, + true, lgp->lg_layout_type, &ls); + if (nfserr) { + trace_layout_get_lookup_fail(&lgp->lg_sid); + goto out; + } + + nfserr = nfserr_recallconflict; + if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) + goto out_put_stid; + + nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry), + current_fh, lgp); + if (nfserr) + goto out_put_stid; + + nfserr = nfsd4_insert_layout(lgp, ls); + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutcommit(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutcommit *lcp) +{ + const struct nfsd4_layout_seg *seg = &lcp->lc_seg; + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + loff_t new_size = lcp->lc_last_wr + 1; + struct inode *inode; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); + if (!ops) + goto out; + inode = d_inode(current_fh->fh_dentry); + + nfserr = nfserr_inval; + if (new_size <= seg->offset) { + dprintk("pnfsd: last write before layout segment\n"); + goto out; + } + if (new_size > seg->offset + seg->length) { + dprintk("pnfsd: last write beyond layout segment\n"); + goto out; + } + if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { + dprintk("pnfsd: layoutcommit beyond EOF\n"); + goto out; + } + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, + false, lcp->lc_layout_type, + &ls); + if (nfserr) { + trace_layout_commit_lookup_fail(&lcp->lc_sid); + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + nfserr = ops->proc_layoutcommit(inode, lcp); + if (nfserr) + goto out_put_stid; + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = 1; + lcp->lc_newsize = new_size; + } else { + lcp->lc_size_chg = 0; + } + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type)) + goto out; + + switch (lrp->lr_seg.iomode) { + case IOMODE_READ: + case IOMODE_RW: + case IOMODE_ANY: + break; + default: + dprintk("%s: invalid iomode %d\n", __func__, + lrp->lr_seg.iomode); + nfserr = nfserr_inval; + goto out; + } + + switch (lrp->lr_return_type) { + case RETURN_FILE: + nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp); + break; + case RETURN_FSID: + case RETURN_ALL: + nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp); + break; + default: + dprintk("%s: invalid return_type %d\n", __func__, + lrp->lr_return_type); + nfserr = nfserr_inval; + break; + } +out: + return nfserr; +} +#endif /* CONFIG_NFSD_PNFS */ + +/* + * NULL call. + */ +static __be32 +nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +static inline void nfsd4_increment_op_stats(u32 opnum) +{ + if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) + nfsdstats.nfs4_opcount[opnum]++; +} + +typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, + void *); +typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op); +typedef void(*stateid_setter)(struct nfsd4_compound_state *, void *); +typedef void(*stateid_getter)(struct nfsd4_compound_state *, void *); + +enum nfsd4_op_flags { + ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ + ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ + /* For rfc 5661 section 2.6.3.1.1: */ + OP_HANDLES_WRONGSEC = 1 << 3, + OP_IS_PUTFH_LIKE = 1 << 4, + /* + * These are the ops whose result size we estimate before + * encoding, to avoid performing an op then not being able to + * respond or cache a response. This includes writes and setattrs + * as well as the operations usually called "nonidempotent": + */ + OP_MODIFIES_SOMETHING = 1 << 5, + /* + * Cache compounds containing these ops in the xid-based drc: + * We use the DRC for compounds containing non-idempotent + * operations, *except* those that are 4.1-specific (since + * sessions provide their own EOS), and except for stateful + * operations other than setclientid and setclientid_confirm + * (since sequence numbers provide EOS for open, lock, etc in + * the v4.0 case). + */ + OP_CACHEME = 1 << 6, + /* + * These are ops which clear current state id. + */ + OP_CLEAR_STATEID = 1 << 7, +}; + +struct nfsd4_operation { + nfsd4op_func op_func; + u32 op_flags; + char *op_name; + /* Try to get response size before operation */ + nfsd4op_rsize op_rsize_bop; + stateid_getter op_get_currentstateid; + stateid_setter op_set_currentstateid; +}; + +static struct nfsd4_operation nfsd4_ops[]; + +static const char *nfsd4_op_name(unsigned opnum); + +/* + * Enforce NFSv4.1 COMPOUND ordering rules: + * + * Also note, enforced elsewhere: + * - SEQUENCE other than as first op results in + * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) + * - BIND_CONN_TO_SESSION must be the only op in its compound. + * (Enforced in nfsd4_bind_conn_to_session().) + * - DESTROY_SESSION must be the final operation in a compound, if + * sessionid's in SEQUENCE and DESTROY_SESSION are the same. + * (Enforced in nfsd4_destroy_session().) + */ +static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) +{ + struct nfsd4_op *op = &args->ops[0]; + + /* These ordering requirements don't apply to NFSv4.0: */ + if (args->minorversion == 0) + return nfs_ok; + /* This is weird, but OK, not our problem: */ + if (args->opcnt == 0) + return nfs_ok; + if (op->status == nfserr_op_illegal) + return nfs_ok; + if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) + return nfserr_op_not_in_session; + if (op->opnum == OP_SEQUENCE) + return nfs_ok; + if (args->opcnt != 1) + return nfserr_not_only_op; + return nfs_ok; +} + +static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) +{ + return &nfsd4_ops[op->opnum]; +} + +bool nfsd4_cache_this_op(struct nfsd4_op *op) +{ + if (op->opnum == OP_ILLEGAL) + return false; + return OPDESC(op)->op_flags & OP_CACHEME; +} + +static bool need_wrongsec_check(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *next = &argp->ops[resp->opcnt]; + struct nfsd4_operation *thisd; + struct nfsd4_operation *nextd; + + thisd = OPDESC(this); + /* + * Most ops check wronsec on our own; only the putfh-like ops + * have special rules. + */ + if (!(thisd->op_flags & OP_IS_PUTFH_LIKE)) + return false; + /* + * rfc 5661 2.6.3.1.1.6: don't bother erroring out a + * put-filehandle operation if we're not going to use the + * result: + */ + if (argp->opcnt == resp->opcnt) + return false; + if (next->opnum == OP_ILLEGAL) + return false; + nextd = OPDESC(next); + /* + * Rest of 2.6.3.1.1: certain operations will return WRONGSEC + * errors themselves as necessary; others should check for them + * now: + */ + return !(nextd->op_flags & OP_HANDLES_WRONGSEC); +} + +static void svcxdr_init_encode(struct svc_rqst *rqstp, + struct nfsd4_compoundres *resp) +{ + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = &rqstp->rq_res; + struct kvec *head = buf->head; + + xdr->buf = buf; + xdr->iov = head; + xdr->p = head->iov_base + head->iov_len; + xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; + /* Tail and page_len should be zero at this point: */ + buf->len = buf->head[0].iov_len; + xdr->scratch.iov_len = 0; + xdr->page_ptr = buf->pages - 1; + buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) + - rqstp->rq_auth_slack; +} + +/* + * COMPOUND call. + */ +static __be32 +nfsd4_proc_compound(struct svc_rqst *rqstp, + struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + struct nfsd4_operation *opdesc; + struct nfsd4_compound_state *cstate = &resp->cstate; + struct svc_fh *current_fh = &cstate->current_fh; + struct svc_fh *save_fh = &cstate->save_fh; + __be32 status; + + svcxdr_init_encode(rqstp, resp); + resp->tagp = resp->xdr.p; + /* reserve space for: taglen, tag, and opcnt */ + xdr_reserve_space(&resp->xdr, 8 + args->taglen); + resp->taglen = args->taglen; + resp->tag = args->tag; + resp->rqstp = rqstp; + cstate->minorversion = args->minorversion; + fh_init(current_fh, NFS4_FHSIZE); + fh_init(save_fh, NFS4_FHSIZE); + /* + * Don't use the deferral mechanism for NFSv4; compounds make it + * too hard to avoid non-idempotency problems. + */ + clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + + /* + * According to RFC3010, this takes precedence over all other errors. + */ + status = nfserr_minor_vers_mismatch; + if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0) + goto out; + + status = nfs41_check_op_ordering(args); + if (status) { + op = &args->ops[0]; + op->status = status; + goto encode_op; + } + + while (!status && resp->opcnt < args->opcnt) { + op = &args->ops[resp->opcnt++]; + + dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", + resp->opcnt, args->opcnt, op->opnum, + nfsd4_op_name(op->opnum)); + /* + * The XDR decode routines may have pre-set op->status; + * for example, if there is a miscellaneous XDR error + * it will be set to nfserr_bad_xdr. + */ + if (op->status) { + if (op->opnum == OP_OPEN) + op->status = nfsd4_open_omfg(rqstp, cstate, op); + goto encode_op; + } + + opdesc = OPDESC(op); + + if (!current_fh->fh_dentry) { + if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { + op->status = nfserr_nofilehandle; + goto encode_op; + } + } else if (current_fh->fh_export->ex_fslocs.migrated && + !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { + op->status = nfserr_moved; + goto encode_op; + } + + fh_clear_wcc(current_fh); + + /* If op is non-idempotent */ + if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { + /* + * Don't execute this op if we couldn't encode a + * succesful reply: + */ + u32 plen = opdesc->op_rsize_bop(rqstp, op); + /* + * Plus if there's another operation, make sure + * we'll have space to at least encode an error: + */ + if (resp->opcnt < args->opcnt) + plen += COMPOUND_ERR_SLACK_SPACE; + op->status = nfsd4_check_resp_size(resp, plen); + } + + if (op->status) + goto encode_op; + + if (opdesc->op_get_currentstateid) + opdesc->op_get_currentstateid(cstate, &op->u); + op->status = opdesc->op_func(rqstp, cstate, &op->u); + + if (!op->status) { + if (opdesc->op_set_currentstateid) + opdesc->op_set_currentstateid(cstate, &op->u); + + if (opdesc->op_flags & OP_CLEAR_STATEID) + clear_current_stateid(cstate); + + if (need_wrongsec_check(rqstp)) + op->status = check_nfsd_access(current_fh->fh_export, rqstp); + } + +encode_op: + /* Only from SEQUENCE */ + if (cstate->status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } + if (op->status == nfserr_replay_me) { + op->replay = &cstate->replay_owner->so_replay; + nfsd4_encode_replay(&resp->xdr, op); + status = op->status = op->replay->rp_status; + } else { + nfsd4_encode_operation(resp, op); + status = op->status; + } + + dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", + args->ops, args->opcnt, resp->opcnt, op->opnum, + be32_to_cpu(status)); + + nfsd4_cstate_clear_replay(cstate); + /* XXX Ugh, we need to get rid of this kind of special case: */ + if (op->opnum == OP_READ && op->u.read.rd_filp) + fput(op->u.read.rd_filp); + + nfsd4_increment_op_stats(op->opnum); + } + + cstate->status = status; + fh_put(current_fh); + fh_put(save_fh); + BUG_ON(cstate->replay_owner); +out: + /* Reset deferral mechanism for RPC deferrals */ + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + dprintk("nfsv4 compound returned %d\n", ntohl(status)); + return status; +} + +#define op_encode_hdr_size (2) +#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define op_encode_change_info_maxsz (5) +#define nfs4_fattr_bitmap_maxsz (4) + +/* We'll fall back on returning no lockowner if run out of space: */ +#define op_encode_lockowner_maxsz (0) +#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) + +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) + +#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz) +#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \ + op_encode_ace_maxsz) + +#define op_encode_channel_attrs_maxsz (6 + 1 + 1) + +static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size) * sizeof(__be32); +} + +static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); +} + +static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +/* + * Note since this is an idempotent operation we won't insist on failing + * the op prematurely if the estimate is too large. We may turn off splice + * reads unnecessarily. + */ +static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 *bmap = op->u.getattr.ga_bmval; + u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2]; + u32 ret = 0; + + if (bmap0 & FATTR4_WORD0_ACL) + return svc_max_payload(rqstp); + if (bmap0 & FATTR4_WORD0_FS_LOCATIONS) + return svc_max_payload(rqstp); + + if (bmap1 & FATTR4_WORD1_OWNER) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER; + } + if (bmap1 & FATTR4_WORD1_OWNER_GROUP) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER_GROUP; + } + if (bmap0 & FATTR4_WORD0_FILEHANDLE) { + ret += NFS4_FHSIZE + 4; + bmap0 &= ~FATTR4_WORD0_FILEHANDLE; + } + if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { + ret += NFS4_MAXLABELLEN + 12; + bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; + } + /* + * Largest of remaining attributes are 16 bytes (e.g., + * supported_attributes) + */ + ret += 16 * (hweight32(bmap0) + hweight32(bmap1) + hweight32(bmap2)); + /* bitmask, length */ + ret += 20; + return ret; +} + +static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_lock_denied_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz + + op_encode_change_info_maxsz + 1 + + nfs4_fattr_bitmap_maxsz + + op_encode_delegation_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.read.rd_length, maxcount); + + return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.readdir.rd_maxcount, maxcount); + + return (op_encode_hdr_size + op_encode_verifier_maxsz + + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + op_encode_change_info_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); +} + +static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * + sizeof(__be32); +} + +static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ + 1 + 1 + /* eir_flags, spr_how */\ + 4 + /* spo_must_enforce & _allow with bitmap */\ + 2 + /*eir_server_owner.so_minor_id */\ + /* eir_server_owner.so_major_id<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + /* eir_server_scope<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + 1 + /* eir_server_impl_id array length */\ + 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); +} + +static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ + 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); +} + +static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ + 2 + /* csr_sequence, csr_flags */\ + op_encode_channel_attrs_maxsz + \ + op_encode_channel_attrs_maxsz) * sizeof(__be32); +} + +#ifdef CONFIG_NFSD_PNFS +/* + * At this stage we don't really know what layout driver will handle the request, + * so we need to define an arbitrary upper bound here. + */ +#define MAX_LAYOUT_SIZE 128 +static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* logr_return_on_close */ + + op_encode_stateid_maxsz + + 1 /* nr of layouts */ + + MAX_LAYOUT_SIZE) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* locr_newsize */ + + 2 /* ns_size */) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* lrs_stateid */ + + op_encode_stateid_maxsz) * sizeof(__be32); +} +#endif /* CONFIG_NFSD_PNFS */ + +static struct nfsd4_operation nfsd4_ops[] = { + [OP_ACCESS] = { + .op_func = (nfsd4op_func)nfsd4_access, + .op_name = "OP_ACCESS", + }, + [OP_CLOSE] = { + .op_func = (nfsd4op_func)nfsd4_close, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_CLOSE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_closestateid, + .op_set_currentstateid = (stateid_setter)nfsd4_set_closestateid, + }, + [OP_COMMIT] = { + .op_func = (nfsd4op_func)nfsd4_commit, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_COMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize, + }, + [OP_CREATE] = { + .op_func = (nfsd4op_func)nfsd4_create, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME | OP_CLEAR_STATEID, + .op_name = "OP_CREATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize, + }, + [OP_DELEGRETURN] = { + .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_DELEGRETURN", + .op_rsize_bop = nfsd4_only_status_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_delegreturnstateid, + }, + [OP_GETATTR] = { + .op_func = (nfsd4op_func)nfsd4_getattr, + .op_flags = ALLOWED_ON_ABSENT_FS, + .op_rsize_bop = nfsd4_getattr_rsize, + .op_name = "OP_GETATTR", + }, + [OP_GETFH] = { + .op_func = (nfsd4op_func)nfsd4_getfh, + .op_name = "OP_GETFH", + }, + [OP_LINK] = { + .op_func = (nfsd4op_func)nfsd4_link, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING + | OP_CACHEME, + .op_name = "OP_LINK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, + }, + [OP_LOCK] = { + .op_func = (nfsd4op_func)nfsd4_lock, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LOCK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, + .op_set_currentstateid = (stateid_setter)nfsd4_set_lockstateid, + }, + [OP_LOCKT] = { + .op_func = (nfsd4op_func)nfsd4_lockt, + .op_name = "OP_LOCKT", + }, + [OP_LOCKU] = { + .op_func = (nfsd4op_func)nfsd4_locku, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LOCKU", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_lockustateid, + }, + [OP_LOOKUP] = { + .op_func = (nfsd4op_func)nfsd4_lookup, + .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, + .op_name = "OP_LOOKUP", + }, + [OP_LOOKUPP] = { + .op_func = (nfsd4op_func)nfsd4_lookupp, + .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, + .op_name = "OP_LOOKUPP", + }, + [OP_NVERIFY] = { + .op_func = (nfsd4op_func)nfsd4_nverify, + .op_name = "OP_NVERIFY", + }, + [OP_OPEN] = { + .op_func = (nfsd4op_func)nfsd4_open, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, + .op_name = "OP_OPEN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize, + .op_set_currentstateid = (stateid_setter)nfsd4_set_openstateid, + }, + [OP_OPEN_CONFIRM] = { + .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_OPEN_CONFIRM", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, + }, + [OP_OPEN_DOWNGRADE] = { + .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_OPEN_DOWNGRADE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_opendowngradestateid, + .op_set_currentstateid = (stateid_setter)nfsd4_set_opendowngradestateid, + }, + [OP_PUTFH] = { + .op_func = (nfsd4op_func)nfsd4_putfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, + .op_name = "OP_PUTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_PUTPUBFH] = { + .op_func = (nfsd4op_func)nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, + .op_name = "OP_PUTPUBFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_PUTROOTFH] = { + .op_func = (nfsd4op_func)nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, + .op_name = "OP_PUTROOTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_READ] = { + .op_func = (nfsd4op_func)nfsd4_read, + .op_name = "OP_READ", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid, + }, + [OP_READDIR] = { + .op_func = (nfsd4op_func)nfsd4_readdir, + .op_name = "OP_READDIR", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, + }, + [OP_READLINK] = { + .op_func = (nfsd4op_func)nfsd4_readlink, + .op_name = "OP_READLINK", + }, + [OP_REMOVE] = { + .op_func = (nfsd4op_func)nfsd4_remove, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_REMOVE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, + }, + [OP_RENAME] = { + .op_func = (nfsd4op_func)nfsd4_rename, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_RENAME", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, + }, + [OP_RENEW] = { + .op_func = (nfsd4op_func)nfsd4_renew, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, + .op_name = "OP_RENEW", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + + }, + [OP_RESTOREFH] = { + .op_func = (nfsd4op_func)nfsd4_restorefh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, + .op_name = "OP_RESTOREFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_SAVEFH] = { + .op_func = (nfsd4op_func)nfsd4_savefh, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, + .op_name = "OP_SAVEFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_SECINFO] = { + .op_func = (nfsd4op_func)nfsd4_secinfo, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SECINFO", + }, + [OP_SETATTR] = { + .op_func = (nfsd4op_func)nfsd4_setattr, + .op_name = "OP_SETATTR", + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_setattrstateid, + }, + [OP_SETCLIENTID] = { + .op_func = (nfsd4op_func)nfsd4_setclientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_SETCLIENTID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, + }, + [OP_SETCLIENTID_CONFIRM] = { + .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_SETCLIENTID_CONFIRM", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_VERIFY] = { + .op_func = (nfsd4op_func)nfsd4_verify, + .op_name = "OP_VERIFY", + }, + [OP_WRITE] = { + .op_func = (nfsd4op_func)nfsd4_write, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_WRITE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid, + }, + [OP_RELEASE_LOCKOWNER] = { + .op_func = (nfsd4op_func)nfsd4_release_lockowner, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, + .op_name = "OP_RELEASE_LOCKOWNER", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + + /* NFSv4.1 operations */ + [OP_EXCHANGE_ID] = { + .op_func = (nfsd4op_func)nfsd4_exchange_id, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_EXCHANGE_ID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, + }, + [OP_BACKCHANNEL_CTL] = { + .op_func = (nfsd4op_func)nfsd4_backchannel_ctl, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, + .op_name = "OP_BACKCHANNEL_CTL", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_BIND_CONN_TO_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_BIND_CONN_TO_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize, + }, + [OP_CREATE_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_create_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_CREATE_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize, + }, + [OP_DESTROY_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_destroy_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_DESTROY_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_SEQUENCE] = { + .op_func = (nfsd4op_func)nfsd4_sequence, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_sequence_rsize, + }, + [OP_DESTROY_CLIENTID] = { + .op_func = (nfsd4op_func)nfsd4_destroy_clientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_DESTROY_CLIENTID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_RECLAIM_COMPLETE] = { + .op_func = (nfsd4op_func)nfsd4_reclaim_complete, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, + .op_name = "OP_RECLAIM_COMPLETE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_SECINFO_NO_NAME] = { + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SECINFO_NO_NAME", + }, + [OP_TEST_STATEID] = { + .op_func = (nfsd4op_func)nfsd4_test_stateid, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_TEST_STATEID", + }, + [OP_FREE_STATEID] = { + .op_func = (nfsd4op_func)nfsd4_free_stateid, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, + .op_name = "OP_FREE_STATEID", + .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = { + .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_GETDEVICEINFO", + }, + [OP_LAYOUTGET] = { + .op_func = (nfsd4op_func)nfsd4_layoutget, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTGET", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize, + }, + [OP_LAYOUTCOMMIT] = { + .op_func = (nfsd4op_func)nfsd4_layoutcommit, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTCOMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize, + }, + [OP_LAYOUTRETURN] = { + .op_func = (nfsd4op_func)nfsd4_layoutreturn, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTRETURN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize, + }, +#endif /* CONFIG_NFSD_PNFS */ + + /* NFSv4.2 operations */ + [OP_ALLOCATE] = { + .op_func = (nfsd4op_func)nfsd4_allocate, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_ALLOCATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_DEALLOCATE] = { + .op_func = (nfsd4op_func)nfsd4_deallocate, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_DEALLOCATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, + [OP_SEEK] = { + .op_func = (nfsd4op_func)nfsd4_seek, + .op_name = "OP_SEEK", + }, +}; + +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + struct nfsd4_operation *opdesc; + nfsd4op_rsize estimator; + + if (op->opnum == OP_ILLEGAL) + return op_encode_hdr_size * sizeof(__be32); + opdesc = OPDESC(op); + estimator = opdesc->op_rsize_bop; + return estimator ? estimator(rqstp, op) : PAGE_SIZE; +} + +void warn_on_nonidempotent_op(struct nfsd4_op *op) +{ + if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) { + pr_err("unable to encode reply to nonidempotent op %d (%s)\n", + op->opnum, nfsd4_op_name(op->opnum)); + WARN_ON_ONCE(1); + } +} + +static const char *nfsd4_op_name(unsigned opnum) +{ + if (opnum < ARRAY_SIZE(nfsd4_ops)) + return nfsd4_ops[opnum].op_name; + return "unknown_operation"; +} + +#define nfsd4_voidres nfsd4_voidargs +struct nfsd4_voidargs { int dummy; }; + +static struct svc_procedure nfsd_procedures4[2] = { + [NFSPROC4_NULL] = { + .pc_func = (svc_procfunc) nfsd4_proc_null, + .pc_encode = (kxdrproc_t) nfs4svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd4_voidargs), + .pc_ressize = sizeof(struct nfsd4_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = 1, + }, + [NFSPROC4_COMPOUND] = { + .pc_func = (svc_procfunc) nfsd4_proc_compound, + .pc_decode = (kxdrproc_t) nfs4svc_decode_compoundargs, + .pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres, + .pc_argsize = sizeof(struct nfsd4_compoundargs), + .pc_ressize = sizeof(struct nfsd4_compoundres), + .pc_release = nfsd4_release_compoundargs, + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = NFSD_BUFSIZE/4, + }, +}; + +struct svc_version nfsd_version4 = { + .vs_vers = 4, + .vs_nproc = 2, + .vs_proc = nfsd_procedures4, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = 1, +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/kernel/fs/nfsd/nfs4recover.c b/kernel/fs/nfsd/nfs4recover.c new file mode 100644 index 000000000..d88ea7b9a --- /dev/null +++ b/kernel/fs/nfsd/nfs4recover.c @@ -0,0 +1,1551 @@ +/* +* Copyright (c) 2004 The Regents of the University of Michigan. +* Copyright (c) 2012 Jeff Layton +* All rights reserved. +* +* Andy Adamson +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfsd.h" +#include "state.h" +#include "vfs.h" +#include "netns.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Declarations */ +struct nfsd4_client_tracking_ops { + int (*init)(struct net *); + void (*exit)(struct net *); + void (*create)(struct nfs4_client *); + void (*remove)(struct nfs4_client *); + int (*check)(struct nfs4_client *); + void (*grace_done)(struct nfsd_net *); +}; + +/* Globals */ +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + +static int +nfs4_save_creds(const struct cred **original_creds) +{ + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = GLOBAL_ROOT_UID; + new->fsgid = GLOBAL_ROOT_GID; + *original_creds = override_creds(new); + put_cred(new); + return 0; +} + +static void +nfs4_reset_creds(const struct cred *original) +{ + revert_creds(original); +} + +static void +md5_to_hex(char *out, char *md5) +{ + int i; + + for (i=0; i<16; i++) { + unsigned char c = md5[i]; + + *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); + *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + } + *out = '\0'; +} + +static int +nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) +{ + struct xdr_netobj cksum; + struct hash_desc desc; + struct scatterlist sg; + int status; + + dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", + clname->len, clname->data); + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) { + status = PTR_ERR(desc.tfm); + goto out_no_tfm; + } + + cksum.len = crypto_hash_digestsize(desc.tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) { + status = -ENOMEM; + goto out; + } + + sg_init_one(&sg, clname->data, clname->len); + + status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data); + if (status) + goto out; + + md5_to_hex(dname, cksum.data); + + status = 0; +out: + kfree(cksum.data); + crypto_free_hash(desc.tfm); +out_no_tfm: + return status; +} + +/* + * If we had an error generating the recdir name for the legacy tracker + * then warn the admin. If the error doesn't appear to be transient, + * then disable recovery tracking. + */ +static void +legacy_recdir_name_error(struct nfs4_client *clp, int error) +{ + printk(KERN_ERR "NFSD: unable to generate recoverydir " + "name (%d).\n", error); + + /* + * if the algorithm just doesn't exist, then disable the recovery + * tracker altogether. The crypto libs will generally return this if + * FIPS is enabled as well. + */ + if (error == -ENOENT) { + printk(KERN_ERR "NFSD: disabling legacy clientid tracking. " + "Reboot recovery will not function correctly!\n"); + nfsd4_client_tracking_exit(clp->net); + } +} + +static void +nfsd4_create_clid_dir(struct nfs4_client *clp) +{ + const struct cred *original_cred; + char dname[HEXDIR_LEN]; + struct dentry *dir, *dentry; + struct nfs4_client_reclaim *crp; + int status; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + if (!nn->rec_file) + return; + + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return legacy_recdir_name_error(clp, status); + + status = nfs4_save_creds(&original_cred); + if (status < 0) + return; + + status = mnt_want_write_file(nn->rec_file); + if (status) + goto out_creds; + + dir = nn->rec_file->f_path.dentry; + /* lock the parent */ + mutex_lock(&d_inode(dir)->i_mutex); + + dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out_unlock; + } + if (d_really_is_positive(dentry)) + /* + * In the 4.1 case, where we're called from + * reclaim_complete(), records from the previous reboot + * may still be left, so this is OK. + * + * In the 4.0 case, we should never get here; but we may + * as well be forgiving and just succeed silently. + */ + goto out_put; + status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU); +out_put: + dput(dentry); +out_unlock: + mutex_unlock(&d_inode(dir)->i_mutex); + if (status == 0) { + if (nn->in_grace) { + crp = nfs4_client_to_reclaim(dname, nn); + if (crp) + crp->cr_clp = clp; + } + vfs_fsync(nn->rec_file, 0); + } else { + printk(KERN_ERR "NFSD: failed to write recovery record" + " (err %d); please check that %s exists" + " and is writeable", status, + user_recovery_dirname); + } + mnt_drop_write_file(nn->rec_file); +out_creds: + nfs4_reset_creds(original_cred); +} + +typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *); + +struct name_list { + char name[HEXDIR_LEN]; + struct list_head list; +}; + +struct nfs4_dir_ctx { + struct dir_context ctx; + struct list_head names; +}; + +static int +nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct nfs4_dir_ctx *ctx = + container_of(__ctx, struct nfs4_dir_ctx, ctx); + struct name_list *entry; + + if (namlen != HEXDIR_LEN - 1) + return 0; + entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); + if (entry == NULL) + return -ENOMEM; + memcpy(entry->name, name, HEXDIR_LEN - 1); + entry->name[HEXDIR_LEN - 1] = '\0'; + list_add(&entry->list, &ctx->names); + return 0; +} + +static int +nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) +{ + const struct cred *original_cred; + struct dentry *dir = nn->rec_file->f_path.dentry; + struct nfs4_dir_ctx ctx = { + .ctx.actor = nfsd4_build_namelist, + .names = LIST_HEAD_INIT(ctx.names) + }; + int status; + + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; + + status = vfs_llseek(nn->rec_file, 0, SEEK_SET); + if (status < 0) { + nfs4_reset_creds(original_cred); + return status; + } + + status = iterate_dir(nn->rec_file, &ctx.ctx); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + while (!list_empty(&ctx.names)) { + struct name_list *entry; + entry = list_entry(ctx.names.next, struct name_list, list); + if (!status) { + struct dentry *dentry; + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + break; + } + status = f(dir, dentry, nn); + dput(dentry); + } + list_del(&entry->list); + kfree(entry); + } + mutex_unlock(&d_inode(dir)->i_mutex); + nfs4_reset_creds(original_cred); + return status; +} + +static int +nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) +{ + struct dentry *dir, *dentry; + int status; + + dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); + + dir = nn->rec_file->f_path.dentry; + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(name, dir, namlen); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out_unlock; + } + status = -ENOENT; + if (d_really_is_negative(dentry)) + goto out; + status = vfs_rmdir(d_inode(dir), dentry); +out: + dput(dentry); +out_unlock: + mutex_unlock(&d_inode(dir)->i_mutex); + return status; +} + +static void +nfsd4_remove_clid_dir(struct nfs4_client *clp) +{ + const struct cred *original_cred; + struct nfs4_client_reclaim *crp; + char dname[HEXDIR_LEN]; + int status; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return legacy_recdir_name_error(clp, status); + + status = mnt_want_write_file(nn->rec_file); + if (status) + goto out; + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + + status = nfs4_save_creds(&original_cred); + if (status < 0) + goto out_drop_write; + + status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn); + nfs4_reset_creds(original_cred); + if (status == 0) { + vfs_fsync(nn->rec_file, 0); + if (nn->in_grace) { + /* remove reclaim record */ + crp = nfsd4_find_reclaim_client(dname, nn); + if (crp) + nfs4_remove_reclaim_record(crp, nn); + } + } +out_drop_write: + mnt_drop_write_file(nn->rec_file); +out: + if (status) + printk("NFSD: Failed to remove expired client state directory" + " %.*s\n", HEXDIR_LEN, dname); +} + +static int +purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) +{ + int status; + + if (nfs4_has_reclaimed_state(child->d_name.name, nn)) + return 0; + + status = vfs_rmdir(d_inode(parent), child); + if (status) + printk("failed to remove client recovery directory %pd\n", + child); + /* Keep trying, success or failure: */ + return 0; +} + +static void +nfsd4_recdir_purge_old(struct nfsd_net *nn) +{ + int status; + + nn->in_grace = false; + if (!nn->rec_file) + return; + status = mnt_want_write_file(nn->rec_file); + if (status) + goto out; + status = nfsd4_list_rec_dir(purge_old, nn); + if (status == 0) + vfs_fsync(nn->rec_file, 0); + mnt_drop_write_file(nn->rec_file); +out: + nfs4_release_reclaim(nn); + if (status) + printk("nfsd4: failed to purge old clients from recovery" + " directory %pD\n", nn->rec_file); +} + +static int +load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) +{ + if (child->d_name.len != HEXDIR_LEN - 1) { + printk("nfsd4: illegal name %pd in recovery directory\n", + child); + /* Keep trying; maybe the others are OK: */ + return 0; + } + nfs4_client_to_reclaim(child->d_name.name, nn); + return 0; +} + +static int +nfsd4_recdir_load(struct net *net) { + int status; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nn->rec_file) + return 0; + + status = nfsd4_list_rec_dir(load_recdir, nn); + if (status) + printk("nfsd4: failed loading clients from recovery" + " directory %pD\n", nn->rec_file); + return status; +} + +/* + * Hold reference to the recovery directory. + */ + +static int +nfsd4_init_recdir(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + const struct cred *original_cred; + int status; + + printk("NFSD: Using %s as the NFSv4 state recovery directory\n", + user_recovery_dirname); + + BUG_ON(nn->rec_file); + + status = nfs4_save_creds(&original_cred); + if (status < 0) { + printk("NFSD: Unable to change credentials to find recovery" + " directory: error %d\n", + status); + return status; + } + + nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); + if (IS_ERR(nn->rec_file)) { + printk("NFSD: unable to find recovery directory %s\n", + user_recovery_dirname); + status = PTR_ERR(nn->rec_file); + nn->rec_file = NULL; + } + + nfs4_reset_creds(original_cred); + if (!status) + nn->in_grace = true; + return status; +} + +static void +nfsd4_shutdown_recdir(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nn->rec_file) + return; + fput(nn->rec_file); + nn->rec_file = NULL; +} + +static int +nfs4_legacy_state_init(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int i; + + nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) * + CLIENT_HASH_SIZE, GFP_KERNEL); + if (!nn->reclaim_str_hashtbl) + return -ENOMEM; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); + nn->reclaim_str_hashtbl_size = 0; + + return 0; +} + +static void +nfs4_legacy_state_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + kfree(nn->reclaim_str_hashtbl); +} + +static int +nfsd4_load_reboot_recovery_data(struct net *net) +{ + int status; + + status = nfsd4_init_recdir(net); + if (status) + return status; + + status = nfsd4_recdir_load(net); + if (status) + nfsd4_shutdown_recdir(net); + + return status; +} + +static int +nfsd4_legacy_tracking_init(struct net *net) +{ + int status; + + /* XXX: The legacy code won't work in a container */ + if (net != &init_net) { + WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client " + "tracking in a container!\n"); + return -EINVAL; + } + + status = nfs4_legacy_state_init(net); + if (status) + return status; + + status = nfsd4_load_reboot_recovery_data(net); + if (status) + goto err; + return 0; + +err: + nfs4_legacy_state_shutdown(net); + return status; +} + +static void +nfsd4_legacy_tracking_exit(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs4_release_reclaim(nn); + nfsd4_shutdown_recdir(net); + nfs4_legacy_state_shutdown(net); +} + +/* + * Change the NFSv4 recovery directory to recdir. + */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (d_is_dir(path.dentry)) { + strcpy(user_recovery_dirname, recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} + +static int +nfsd4_check_legacy_client(struct nfs4_client *clp) +{ + int status; + char dname[HEXDIR_LEN]; + struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) { + legacy_recdir_name_error(clp, status); + return status; + } + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(dname, nn); + if (crp) { + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + crp->cr_clp = clp; + return 0; + } + + return -ENOENT; +} + +static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { + .init = nfsd4_legacy_tracking_init, + .exit = nfsd4_legacy_tracking_exit, + .create = nfsd4_create_clid_dir, + .remove = nfsd4_remove_clid_dir, + .check = nfsd4_check_legacy_client, + .grace_done = nfsd4_recdir_purge_old, +}; + +/* Globals */ +#define NFSD_PIPE_DIR "nfsd" +#define NFSD_CLD_PIPE "cld" + +/* per-net-ns structure for holding cld upcall info */ +struct cld_net { + struct rpc_pipe *cn_pipe; + spinlock_t cn_lock; + struct list_head cn_list; + unsigned int cn_xid; +}; + +struct cld_upcall { + struct list_head cu_list; + struct cld_net *cu_net; + struct task_struct *cu_task; + struct cld_msg cu_msg; +}; + +static int +__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +{ + int ret; + struct rpc_pipe_msg msg; + + memset(&msg, 0, sizeof(msg)); + msg.data = cmsg; + msg.len = sizeof(*cmsg); + + /* + * Set task state before we queue the upcall. That prevents + * wake_up_process in the downcall from racing with schedule. + */ + set_current_state(TASK_UNINTERRUPTIBLE); + ret = rpc_queue_upcall(pipe, &msg); + if (ret < 0) { + set_current_state(TASK_RUNNING); + goto out; + } + + schedule(); + + if (msg.errno < 0) + ret = msg.errno; +out: + return ret; +} + +static int +cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +{ + int ret; + + /* + * -EAGAIN occurs when pipe is closed and reopened while there are + * upcalls queued. + */ + do { + ret = __cld_pipe_upcall(pipe, cmsg); + } while (ret == -EAGAIN); + + return ret; +} + +static ssize_t +cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct cld_upcall *tmp, *cup; + struct cld_msg __user *cmsg = (struct cld_msg __user *)src; + uint32_t xid; + struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, + nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + if (mlen != sizeof(*cmsg)) { + dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, + sizeof(*cmsg)); + return -EINVAL; + } + + /* copy just the xid so we can try to find that */ + if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) { + dprintk("%s: error when copying xid from userspace", __func__); + return -EFAULT; + } + + /* walk the list and find corresponding xid */ + cup = NULL; + spin_lock(&cn->cn_lock); + list_for_each_entry(tmp, &cn->cn_list, cu_list) { + if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) { + cup = tmp; + list_del_init(&cup->cu_list); + break; + } + } + spin_unlock(&cn->cn_lock); + + /* couldn't find upcall? */ + if (!cup) { + dprintk("%s: couldn't find upcall -- xid=%u\n", __func__, xid); + return -EINVAL; + } + + if (copy_from_user(&cup->cu_msg, src, mlen) != 0) + return -EFAULT; + + wake_up_process(cup->cu_task); + return mlen; +} + +static void +cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct cld_msg *cmsg = msg->data; + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, + cu_msg); + + /* errno >= 0 means we got a downcall */ + if (msg->errno >= 0) + return; + + wake_up_process(cup->cu_task); +} + +static const struct rpc_pipe_ops cld_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = cld_pipe_downcall, + .destroy_msg = cld_pipe_destroy_msg, +}; + +static struct dentry * +nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); + dput(dir); + return dentry; +} + +static void +nfsd4_cld_unregister_sb(struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static struct dentry * +nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe) +{ + struct super_block *sb; + struct dentry *dentry; + + sb = rpc_get_sb_net(net); + if (!sb) + return NULL; + dentry = nfsd4_cld_register_sb(sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void +nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe) +{ + struct super_block *sb; + + sb = rpc_get_sb_net(net); + if (sb) { + nfsd4_cld_unregister_sb(pipe); + rpc_put_sb_net(net); + } +} + +/* Initialize rpc_pipefs pipe for communication with client tracking daemon */ +static int +nfsd4_init_cld_pipe(struct net *net) +{ + int ret; + struct dentry *dentry; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn; + + if (nn->cld_net) + return 0; + + cn = kzalloc(sizeof(*cn), GFP_KERNEL); + if (!cn) { + ret = -ENOMEM; + goto err; + } + + cn->cn_pipe = rpc_mkpipe_data(&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(cn->cn_pipe)) { + ret = PTR_ERR(cn->cn_pipe); + goto err; + } + spin_lock_init(&cn->cn_lock); + INIT_LIST_HEAD(&cn->cn_list); + + dentry = nfsd4_cld_register_net(net, cn->cn_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto err_destroy_data; + } + + cn->cn_pipe->dentry = dentry; + nn->cld_net = cn; + return 0; + +err_destroy_data: + rpc_destroy_pipe_data(cn->cn_pipe); +err: + kfree(cn); + printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n", + ret); + return ret; +} + +static void +nfsd4_remove_cld_pipe(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + nfsd4_cld_unregister_net(net, cn->cn_pipe); + rpc_destroy_pipe_data(cn->cn_pipe); + kfree(nn->cld_net); + nn->cld_net = NULL; +} + +static struct cld_upcall * +alloc_cld_upcall(struct cld_net *cn) +{ + struct cld_upcall *new, *tmp; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return new; + + /* FIXME: hard cap on number in flight? */ +restart_search: + spin_lock(&cn->cn_lock); + list_for_each_entry(tmp, &cn->cn_list, cu_list) { + if (tmp->cu_msg.cm_xid == cn->cn_xid) { + cn->cn_xid++; + spin_unlock(&cn->cn_lock); + goto restart_search; + } + } + new->cu_task = current; + new->cu_msg.cm_vers = CLD_UPCALL_VERSION; + put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid); + new->cu_net = cn; + list_add(&new->cu_list, &cn->cn_list); + spin_unlock(&cn->cn_lock); + + dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid); + + return new; +} + +static void +free_cld_upcall(struct cld_upcall *victim) +{ + struct cld_net *cn = victim->cu_net; + + spin_lock(&cn->cn_lock); + list_del(&victim->cu_list); + spin_unlock(&cn->cn_lock); + kfree(victim); +} + +/* Ask daemon to create a new record */ +static void +nfsd4_cld_create(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(cn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_msg.cm_cmd = Cld_Create; + cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + if (!ret) { + ret = cup->cu_msg.cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to create client " + "record on stable storage: %d\n", ret); +} + +/* Ask daemon to create a new record */ +static void +nfsd4_cld_remove(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if it's already removed */ + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(cn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_msg.cm_cmd = Cld_Remove; + cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + if (!ret) { + ret = cup->cu_msg.cm_status; + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to remove client " + "record from stable storage: %d\n", ret); +} + +/* Check for presence of a record, and update its timestamp */ +static int +nfsd4_cld_check(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if one was already stored during this grace pd */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + cup = alloc_cld_upcall(cn); + if (!cup) { + printk(KERN_ERR "NFSD: Unable to check client record on " + "stable storage: %d\n", -ENOMEM); + return -ENOMEM; + } + + cup->cu_msg.cm_cmd = Cld_Check; + cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + if (!ret) { + ret = cup->cu_msg.cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); + return ret; +} + +static void +nfsd4_cld_grace_done(struct nfsd_net *nn) +{ + int ret; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + + cup = alloc_cld_upcall(cn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_msg.cm_cmd = Cld_GraceDone; + cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + if (!ret) + ret = cup->cu_msg.cm_status; + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); +} + +static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { + .init = nfsd4_init_cld_pipe, + .exit = nfsd4_remove_cld_pipe, + .create = nfsd4_cld_create, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check, + .grace_done = nfsd4_cld_grace_done, +}; + +/* upcall via usermodehelper */ +static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; +module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), + S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program"); + +static bool cltrack_legacy_disable; +module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cltrack_legacy_disable, + "Disable legacy recoverydir conversion. Default: false"); + +#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" +#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" +#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION=" +#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START=" + +static char * +nfsd4_cltrack_legacy_topdir(void) +{ + int copied; + size_t len; + char *result; + + if (cltrack_legacy_disable) + return NULL; + + len = strlen(LEGACY_TOPDIR_ENV_PREFIX) + + strlen(nfs4_recoverydir()) + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s", + nfs4_recoverydir()); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) +{ + int copied; + size_t len; + char *result; + + if (cltrack_legacy_disable) + return NULL; + + /* +1 is for '/' between "topdir" and "recdir" */ + len = strlen(LEGACY_RECDIR_ENV_PREFIX) + + strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/", + nfs4_recoverydir()); + if (copied > (len - HEXDIR_LEN)) { + /* just return nothing if output will be truncated */ + kfree(result); + return NULL; + } + + copied = nfs4_make_rec_clidname(result + copied, name); + if (copied) { + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_client_has_session(struct nfs4_client *clp) +{ + int copied; + size_t len; + char *result; + + /* prefix + Y/N character + terminating NULL */ + len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c", + clp->cl_minorversion ? 'Y' : 'N'); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_grace_start(time_t grace_start) +{ + int copied; + size_t len; + char *result; + + /* prefix + max width of int64_t string + terminating NULL */ + len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld", + grace_start); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static int +nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) +{ + char *envp[3]; + char *argv[4]; + int ret; + + if (unlikely(!cltrack_prog[0])) { + dprintk("%s: cltrack_prog is disabled\n", __func__); + return -EACCES; + } + + dprintk("%s: cmd: %s\n", __func__, cmd); + dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); + dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)"); + dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)"); + + envp[0] = env0; + envp[1] = env1; + envp[2] = NULL; + + argv[0] = (char *)cltrack_prog; + argv[1] = cmd; + argv[2] = arg; + argv[3] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or EACCES + * error. The admin can re-enable it on the fly by using sysfs + * once the problem has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) { + dprintk("NFSD: %s was not found or isn't executable (%d). " + "Setting cltrack_prog to blank string!", + cltrack_prog, ret); + cltrack_prog[0] = '\0'; + } + dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret); + + return ret; +} + +static char * +bin_to_hex_dup(const unsigned char *src, int srclen) +{ + int i; + char *buf, *hex; + + /* +1 for terminating NULL */ + buf = kmalloc((srclen * 2) + 1, GFP_KERNEL); + if (!buf) + return buf; + + hex = buf; + for (i = 0; i < srclen; i++) { + sprintf(hex, "%2.2x", *src++); + hex += 2; + } + return buf; +} + +static int +nfsd4_umh_cltrack_init(struct net *net) +{ + int ret; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time); + + /* XXX: The usermode helper s not working in container yet. */ + if (net != &init_net) { + WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " + "tracking in a container!\n"); + return -EINVAL; + } + + ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); + kfree(grace_start); + return ret; +} + +static void +nfsd4_cltrack_upcall_lock(struct nfs4_client *clp) +{ + wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK, + TASK_UNINTERRUPTIBLE); +} + +static void +nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp) +{ + smp_mb__before_atomic(); + clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); + smp_mb__after_atomic(); + wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK); +} + +static void +nfsd4_umh_cltrack_create(struct nfs4_client *clp) +{ + char *hexid, *has_session, *grace_start; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + /* + * With v4.0 clients, there's little difference in outcome between a + * create and check operation, and we can end up calling into this + * function multiple times per client (once for each openowner). So, + * for v4.0 clients skip upcalling once the client has been recorded + * on stable storage. + * + * For v4.1+ clients, the outcome of the two operations is different, + * so we must ensure that we upcall for the create operation. v4.1+ + * clients call this on RECLAIM_COMPLETE though, so we should only end + * up doing a single create upcall per client. + */ + if (clp->cl_minorversion == 0 && + test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return; + } + + has_session = nfsd4_cltrack_client_has_session(clp); + grace_start = nfsd4_cltrack_grace_start(nn->boot_time); + + nfsd4_cltrack_upcall_lock(clp); + if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start)) + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + nfsd4_cltrack_upcall_unlock(clp); + + kfree(has_session); + kfree(grace_start); + kfree(hexid); +} + +static void +nfsd4_umh_cltrack_remove(struct nfs4_client *clp) +{ + char *hexid; + + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return; + } + + nfsd4_cltrack_upcall_lock(clp); + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) && + nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0) + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + nfsd4_cltrack_upcall_unlock(clp); + + kfree(hexid); +} + +static int +nfsd4_umh_cltrack_check(struct nfs4_client *clp) +{ + int ret; + char *hexid, *has_session, *legacy; + + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return -ENOMEM; + } + + has_session = nfsd4_cltrack_client_has_session(clp); + legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); + + nfsd4_cltrack_upcall_lock(clp); + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) { + ret = 0; + } else { + ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy); + if (ret == 0) + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + nfsd4_cltrack_upcall_unlock(clp); + kfree(has_session); + kfree(legacy); + kfree(hexid); + + return ret; +} + +static void +nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) +{ + char *legacy; + char timestr[22]; /* FIXME: better way to determine max size? */ + + sprintf(timestr, "%ld", nn->boot_time); + legacy = nfsd4_cltrack_legacy_topdir(); + nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL); + kfree(legacy); +} + +static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { + .init = nfsd4_umh_cltrack_init, + .exit = NULL, + .create = nfsd4_umh_cltrack_create, + .remove = nfsd4_umh_cltrack_remove, + .check = nfsd4_umh_cltrack_check, + .grace_done = nfsd4_umh_cltrack_grace_done, +}; + +int +nfsd4_client_tracking_init(struct net *net) +{ + int status; + struct path path; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + /* just run the init if it the method is already decided */ + if (nn->client_tracking_ops) + goto do_init; + + /* + * First, try a UMH upcall. It should succeed or fail quickly, so + * there's little harm in trying that first. + */ + nn->client_tracking_ops = &nfsd4_umh_tracking_ops; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; + + /* + * See if the recoverydir exists and is a directory. If it is, + * then use the legacy ops. + */ + nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; + status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); + if (!status) { + status = d_is_dir(path.dentry); + path_put(&path); + if (status) + goto do_init; + } + + /* Finally, try to use nfsdcld */ + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be " + "removed in 3.10. Please transition to using " + "nfsdcltrack.\n"); +do_init: + status = nn->client_tracking_ops->init(net); + if (status) { + printk(KERN_WARNING "NFSD: Unable to initialize client " + "recovery tracking! (%d)\n", status); + nn->client_tracking_ops = NULL; + } + return status; +} + +void +nfsd4_client_tracking_exit(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->client_tracking_ops) { + if (nn->client_tracking_ops->exit) + nn->client_tracking_ops->exit(net); + nn->client_tracking_ops = NULL; + } +} + +void +nfsd4_client_record_create(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + nn->client_tracking_ops->create(clp); +} + +void +nfsd4_client_record_remove(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + nn->client_tracking_ops->remove(clp); +} + +int +nfsd4_client_record_check(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + return nn->client_tracking_ops->check(clp); + + return -EOPNOTSUPP; +} + +void +nfsd4_record_grace_done(struct nfsd_net *nn) +{ + if (nn->client_tracking_ops) + nn->client_tracking_ops->grace_done(nn); +} + +static int +rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (!cn) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + cn->cn_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (cn->cn_pipe->dentry) + nfsd4_cld_unregister_sb(cn->cn_pipe); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfsd4_cld_block = { + .notifier_call = rpc_pipefs_event, +}; + +int +register_cld_notifier(void) +{ + return rpc_pipefs_notifier_register(&nfsd4_cld_block); +} + +void +unregister_cld_notifier(void) +{ + rpc_pipefs_notifier_unregister(&nfsd4_cld_block); +} diff --git a/kernel/fs/nfsd/nfs4state.c b/kernel/fs/nfsd/nfs4state.c new file mode 100644 index 000000000..039f9c8a9 --- /dev/null +++ b/kernel/fs/nfsd/nfs4state.c @@ -0,0 +1,6716 @@ +/* +* Copyright (c) 2001 The Regents of the University of Michigan. +* All rights reserved. +* +* Kendrick Smith +* Andy Adamson +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xdr4.h" +#include "xdr4cb.h" +#include "vfs.h" +#include "current_stateid.h" + +#include "netns.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +#define all_ones {{~0,~0},~0} +static const stateid_t one_stateid = { + .si_generation = ~0, + .si_opaque = all_ones, +}; +static const stateid_t zero_stateid = { + /* all fields zero */ +}; +static const stateid_t currentstateid = { + .si_generation = 1, +}; + +static u64 current_sessionid = 1; + +#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) +#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) +#define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) + +/* forward declarations */ +static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); +static void nfs4_free_ol_stateid(struct nfs4_stid *stid); + +/* Locking: */ + +/* + * Currently used for the del_recall_lru and file hash table. In an + * effort to decrease the scope of the client_mutex, this spinlock may + * eventually cover more: + */ +static DEFINE_SPINLOCK(state_lock); + +/* + * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for + * the refcount on the open stateid to drop. + */ +static DECLARE_WAIT_QUEUE_HEAD(close_wq); + +static struct kmem_cache *openowner_slab; +static struct kmem_cache *lockowner_slab; +static struct kmem_cache *file_slab; +static struct kmem_cache *stateid_slab; +static struct kmem_cache *deleg_slab; +static struct kmem_cache *odstate_slab; + +static void free_session(struct nfsd4_session *); + +static struct nfsd4_callback_ops nfsd4_cb_recall_ops; + +static bool is_session_dead(struct nfsd4_session *ses) +{ + return ses->se_flags & NFS4_SESSION_DEAD; +} + +static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) +{ + if (atomic_read(&ses->se_ref) > ref_held_by_me) + return nfserr_jukebox; + ses->se_flags |= NFS4_SESSION_DEAD; + return nfs_ok; +} + +static bool is_client_expired(struct nfs4_client *clp) +{ + return clp->cl_time == 0; +} + +static __be32 get_client_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + if (is_client_expired(clp)) + return nfserr_expired; + atomic_inc(&clp->cl_refcount); + return nfs_ok; +} + +/* must be called under the client_lock */ +static inline void +renew_client_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (is_client_expired(clp)) { + WARN_ON(1); + printk("%s: client (clientid %08x/%08x) already expired\n", + __func__, + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + return; + } + + dprintk("renewing client (clientid %08x/%08x)\n", + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + list_move_tail(&clp->cl_lru, &nn->client_lru); + clp->cl_time = get_seconds(); +} + +static void put_client_renew_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + if (!atomic_dec_and_test(&clp->cl_refcount)) + return; + if (!is_client_expired(clp)) + renew_client_locked(clp); +} + +static void put_client_renew(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock)) + return; + if (!is_client_expired(clp)) + renew_client_locked(clp); + spin_unlock(&nn->client_lock); +} + +static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) +{ + __be32 status; + + if (is_session_dead(ses)) + return nfserr_badsession; + status = get_client_locked(ses->se_client); + if (status) + return status; + atomic_inc(&ses->se_ref); + return nfs_ok; +} + +static void nfsd4_put_session_locked(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) + free_session(ses); + put_client_renew_locked(clp); +} + +static void nfsd4_put_session(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + nfsd4_put_session_locked(ses); + spin_unlock(&nn->client_lock); +} + +static inline struct nfs4_stateowner * +nfs4_get_stateowner(struct nfs4_stateowner *sop) +{ + atomic_inc(&sop->so_count); + return sop; +} + +static int +same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) +{ + return (sop->so_owner.len == owner->len) && + 0 == memcmp(sop->so_owner.data, owner->data, owner->len); +} + +static struct nfs4_openowner * +find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, + struct nfs4_client *clp) +{ + struct nfs4_stateowner *so; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval], + so_strhash) { + if (!so->so_is_open_owner) + continue; + if (same_owner_str(so, &open->op_owner)) + return openowner(nfs4_get_stateowner(so)); + } + return NULL; +} + +static struct nfs4_openowner * +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, + struct nfs4_client *clp) +{ + struct nfs4_openowner *oo; + + spin_lock(&clp->cl_lock); + oo = find_openstateowner_str_locked(hashval, open, clp); + spin_unlock(&clp->cl_lock); + return oo; +} + +static inline u32 +opaque_hashval(const void *ptr, int nbytes) +{ + unsigned char *cptr = (unsigned char *) ptr; + + u32 x = 0; + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x; +} + +static void nfsd4_free_file_rcu(struct rcu_head *rcu) +{ + struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu); + + kmem_cache_free(file_slab, fp); +} + +void +put_nfs4_file(struct nfs4_file *fi) +{ + might_lock(&state_lock); + + if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { + hlist_del_rcu(&fi->fi_hash); + spin_unlock(&state_lock); + WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); + WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); + call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); + } +} + +static struct file * +__nfs4_get_fd(struct nfs4_file *f, int oflag) +{ + if (f->fi_fds[oflag]) + return get_file(f->fi_fds[oflag]); + return NULL; +} + +static struct file * +find_writeable_file_locked(struct nfs4_file *f) +{ + struct file *ret; + + lockdep_assert_held(&f->fi_lock); + + ret = __nfs4_get_fd(f, O_WRONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDWR); + return ret; +} + +static struct file * +find_writeable_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = find_writeable_file_locked(f); + spin_unlock(&f->fi_lock); + + return ret; +} + +static struct file *find_readable_file_locked(struct nfs4_file *f) +{ + struct file *ret; + + lockdep_assert_held(&f->fi_lock); + + ret = __nfs4_get_fd(f, O_RDONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDWR); + return ret; +} + +static struct file * +find_readable_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = find_readable_file_locked(f); + spin_unlock(&f->fi_lock); + + return ret; +} + +struct file * +find_any_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = __nfs4_get_fd(f, O_RDWR); + if (!ret) { + ret = __nfs4_get_fd(f, O_WRONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDONLY); + } + spin_unlock(&f->fi_lock); + return ret; +} + +static atomic_long_t num_delegations; +unsigned long max_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for lock and open owners */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) +{ + unsigned int ret; + + ret = opaque_hashval(ownername->data, ownername->len); + return ret & OWNER_HASH_MASK; +} + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) + +static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh) +{ + return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0); +} + +static unsigned int file_hashval(struct knfsd_fh *fh) +{ + return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); +} + +static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; + +static void +__nfs4_file_get_access(struct nfs4_file *fp, u32 access) +{ + lockdep_assert_held(&fp->fi_lock); + + if (access & NFS4_SHARE_ACCESS_WRITE) + atomic_inc(&fp->fi_access[O_WRONLY]); + if (access & NFS4_SHARE_ACCESS_READ) + atomic_inc(&fp->fi_access[O_RDONLY]); +} + +static __be32 +nfs4_file_get_access(struct nfs4_file *fp, u32 access) +{ + lockdep_assert_held(&fp->fi_lock); + + /* Does this access mode make sense? */ + if (access & ~NFS4_SHARE_ACCESS_BOTH) + return nfserr_inval; + + /* Does it conflict with a deny mode already set? */ + if ((access & fp->fi_share_deny) != 0) + return nfserr_share_denied; + + __nfs4_file_get_access(fp, access); + return nfs_ok; +} + +static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny) +{ + /* Common case is that there is no deny mode. */ + if (deny) { + /* Does this deny mode make sense? */ + if (deny & ~NFS4_SHARE_DENY_BOTH) + return nfserr_inval; + + if ((deny & NFS4_SHARE_DENY_READ) && + atomic_read(&fp->fi_access[O_RDONLY])) + return nfserr_share_denied; + + if ((deny & NFS4_SHARE_DENY_WRITE) && + atomic_read(&fp->fi_access[O_WRONLY])) + return nfserr_share_denied; + } + return nfs_ok; +} + +static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + might_lock(&fp->fi_lock); + + if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { + struct file *f1 = NULL; + struct file *f2 = NULL; + + swap(f1, fp->fi_fds[oflag]); + if (atomic_read(&fp->fi_access[1 - oflag]) == 0) + swap(f2, fp->fi_fds[O_RDWR]); + spin_unlock(&fp->fi_lock); + if (f1) + fput(f1); + if (f2) + fput(f2); + } +} + +static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) +{ + WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH); + + if (access & NFS4_SHARE_ACCESS_WRITE) + __nfs4_file_put_access(fp, O_WRONLY); + if (access & NFS4_SHARE_ACCESS_READ) + __nfs4_file_put_access(fp, O_RDONLY); +} + +/* + * Allocate a new open/delegation state counter. This is needed for + * pNFS for proper return on close semantics. + * + * Note that we only allocate it for pNFS-enabled exports, otherwise + * all pointers to struct nfs4_clnt_odstate are always NULL. + */ +static struct nfs4_clnt_odstate * +alloc_clnt_odstate(struct nfs4_client *clp) +{ + struct nfs4_clnt_odstate *co; + + co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); + if (co) { + co->co_client = clp; + atomic_set(&co->co_odcount, 1); + } + return co; +} + +static void +hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp = co->co_file; + + lockdep_assert_held(&fp->fi_lock); + list_add(&co->co_perfile, &fp->fi_clnt_odstate); +} + +static inline void +get_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + if (co) + atomic_inc(&co->co_odcount); +} + +static void +put_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp; + + if (!co) + return; + + fp = co->co_file; + if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + list_del(&co->co_perfile); + spin_unlock(&fp->fi_lock); + + nfsd4_return_all_file_layouts(co->co_client, fp); + kmem_cache_free(odstate_slab, co); + } +} + +static struct nfs4_clnt_odstate * +find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new) +{ + struct nfs4_clnt_odstate *co; + struct nfs4_client *cl; + + if (!new) + return NULL; + + cl = new->co_client; + + spin_lock(&fp->fi_lock); + list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { + if (co->co_client == cl) { + get_clnt_odstate(co); + goto out; + } + } + co = new; + co->co_file = fp; + hash_clnt_odstate_locked(new); +out: + spin_unlock(&fp->fi_lock); + return co; +} + +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, + struct kmem_cache *slab) +{ + struct nfs4_stid *stid; + int new_id; + + stid = kmem_cache_zalloc(slab, GFP_KERNEL); + if (!stid) + return NULL; + + idr_preload(GFP_KERNEL); + spin_lock(&cl->cl_lock); + new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT); + spin_unlock(&cl->cl_lock); + idr_preload_end(); + if (new_id < 0) + goto out_free; + stid->sc_client = cl; + stid->sc_stateid.si_opaque.so_id = new_id; + stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; + /* Will be incremented before return to client: */ + atomic_set(&stid->sc_count, 1); + + /* + * It shouldn't be a problem to reuse an opaque stateid value. + * I don't think it is for 4.1. But with 4.0 I worry that, for + * example, a stray write retransmission could be accepted by + * the server when it should have been rejected. Therefore, + * adopt a trick from the sctp code to attempt to maximize the + * amount of time until an id is reused, by ensuring they always + * "increase" (mod INT_MAX): + */ + return stid; +out_free: + kmem_cache_free(slab, stid); + return NULL; +} + +static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) +{ + struct nfs4_stid *stid; + struct nfs4_ol_stateid *stp; + + stid = nfs4_alloc_stid(clp, stateid_slab); + if (!stid) + return NULL; + + stp = openlockstateid(stid); + stp->st_stid.sc_free = nfs4_free_ol_stateid; + return stp; +} + +static void nfs4_free_deleg(struct nfs4_stid *stid) +{ + kmem_cache_free(deleg_slab, stid); + atomic_long_dec(&num_delegations); +} + +/* + * When we recall a delegation, we should be careful not to hand it + * out again straight away. + * To ensure this we keep a pair of bloom filters ('new' and 'old') + * in which the filehandles of recalled delegations are "stored". + * If a filehandle appear in either filter, a delegation is blocked. + * When a delegation is recalled, the filehandle is stored in the "new" + * filter. + * Every 30 seconds we swap the filters and clear the "new" one, + * unless both are empty of course. + * + * Each filter is 256 bits. We hash the filehandle to 32bit and use the + * low 3 bytes as hash-table indices. + * + * 'blocked_delegations_lock', which is always taken in block_delegations(), + * is used to manage concurrent access. Testing does not need the lock + * except when swapping the two filters. + */ +static DEFINE_SPINLOCK(blocked_delegations_lock); +static struct bloom_pair { + int entries, old_entries; + time_t swap_time; + int new; /* index into 'set' */ + DECLARE_BITMAP(set[2], 256); +} blocked_delegations; + +static int delegation_blocked(struct knfsd_fh *fh) +{ + u32 hash; + struct bloom_pair *bd = &blocked_delegations; + + if (bd->entries == 0) + return 0; + if (seconds_since_boot() - bd->swap_time > 30) { + spin_lock(&blocked_delegations_lock); + if (seconds_since_boot() - bd->swap_time > 30) { + bd->entries -= bd->old_entries; + bd->old_entries = bd->entries; + memset(bd->set[bd->new], 0, + sizeof(bd->set[0])); + bd->new = 1-bd->new; + bd->swap_time = seconds_since_boot(); + } + spin_unlock(&blocked_delegations_lock); + } + hash = jhash(&fh->fh_base, fh->fh_size, 0); + if (test_bit(hash&255, bd->set[0]) && + test_bit((hash>>8)&255, bd->set[0]) && + test_bit((hash>>16)&255, bd->set[0])) + return 1; + + if (test_bit(hash&255, bd->set[1]) && + test_bit((hash>>8)&255, bd->set[1]) && + test_bit((hash>>16)&255, bd->set[1])) + return 1; + + return 0; +} + +static void block_delegations(struct knfsd_fh *fh) +{ + u32 hash; + struct bloom_pair *bd = &blocked_delegations; + + hash = jhash(&fh->fh_base, fh->fh_size, 0); + + spin_lock(&blocked_delegations_lock); + __set_bit(hash&255, bd->set[bd->new]); + __set_bit((hash>>8)&255, bd->set[bd->new]); + __set_bit((hash>>16)&255, bd->set[bd->new]); + if (bd->entries == 0) + bd->swap_time = seconds_since_boot(); + bd->entries += 1; + spin_unlock(&blocked_delegations_lock); +} + +static struct nfs4_delegation * +alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, + struct nfs4_clnt_odstate *odstate) +{ + struct nfs4_delegation *dp; + long n; + + dprintk("NFSD alloc_init_deleg\n"); + n = atomic_long_inc_return(&num_delegations); + if (n < 0 || n > max_delegations) + goto out_dec; + if (delegation_blocked(¤t_fh->fh_handle)) + goto out_dec; + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); + if (dp == NULL) + goto out_dec; + + dp->dl_stid.sc_free = nfs4_free_deleg; + /* + * delegation seqid's are never incremented. The 4.1 special + * meaning of seqid 0 isn't meaningful, really, but let's avoid + * 0 anyway just for consistency and use 1: + */ + dp->dl_stid.sc_stateid.si_generation = 1; + INIT_LIST_HEAD(&dp->dl_perfile); + INIT_LIST_HEAD(&dp->dl_perclnt); + INIT_LIST_HEAD(&dp->dl_recall_lru); + dp->dl_clnt_odstate = odstate; + get_clnt_odstate(odstate); + dp->dl_type = NFS4_OPEN_DELEGATE_READ; + dp->dl_retries = 1; + nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, + &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + return dp; +out_dec: + atomic_long_dec(&num_delegations); + return NULL; +} + +void +nfs4_put_stid(struct nfs4_stid *s) +{ + struct nfs4_file *fp = s->sc_file; + struct nfs4_client *clp = s->sc_client; + + might_lock(&clp->cl_lock); + + if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) { + wake_up_all(&close_wq); + return; + } + idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + spin_unlock(&clp->cl_lock); + s->sc_free(s); + if (fp) + put_nfs4_file(fp); +} + +static void nfs4_put_deleg_lease(struct nfs4_file *fp) +{ + struct file *filp = NULL; + + spin_lock(&fp->fi_lock); + if (fp->fi_deleg_file && --fp->fi_delegees == 0) + swap(filp, fp->fi_deleg_file); + spin_unlock(&fp->fi_lock); + + if (filp) { + vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp); + fput(filp); + } +} + +void nfs4_unhash_stid(struct nfs4_stid *s) +{ + s->sc_type = 0; +} + +static void +hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) +{ + lockdep_assert_held(&state_lock); + lockdep_assert_held(&fp->fi_lock); + + atomic_inc(&dp->dl_stid.sc_count); + dp->dl_stid.sc_type = NFS4_DELEG_STID; + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); +} + +static void +unhash_delegation_locked(struct nfs4_delegation *dp) +{ + struct nfs4_file *fp = dp->dl_stid.sc_file; + + lockdep_assert_held(&state_lock); + + dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; + /* Ensure that deleg break won't try to requeue it */ + ++dp->dl_time; + spin_lock(&fp->fi_lock); + list_del_init(&dp->dl_perclnt); + list_del_init(&dp->dl_recall_lru); + list_del_init(&dp->dl_perfile); + spin_unlock(&fp->fi_lock); +} + +static void destroy_delegation(struct nfs4_delegation *dp) +{ + spin_lock(&state_lock); + unhash_delegation_locked(dp); + spin_unlock(&state_lock); + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); + nfs4_put_stid(&dp->dl_stid); +} + +static void revoke_delegation(struct nfs4_delegation *dp) +{ + struct nfs4_client *clp = dp->dl_stid.sc_client; + + WARN_ON(!list_empty(&dp->dl_recall_lru)); + + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); + + if (clp->cl_minorversion == 0) + nfs4_put_stid(&dp->dl_stid); + else { + dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; + spin_lock(&clp->cl_lock); + list_add(&dp->dl_recall_lru, &clp->cl_revoked); + spin_unlock(&clp->cl_lock); + } +} + +/* + * SETCLIENTID state + */ + +static unsigned int clientid_hashval(u32 id) +{ + return id & CLIENT_HASH_MASK; +} + +static unsigned int clientstr_hashval(const char *name) +{ + return opaque_hashval(name, 8) & CLIENT_HASH_MASK; +} + +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ +static unsigned int +bmap_to_share_mode(unsigned long bmap) { + int i; + unsigned int access = 0; + + for (i = 1; i < 4; i++) { + if (test_bit(i, &bmap)) + access |= i; + } + return access; +} + +/* set share access for a given stateid */ +static inline void +set_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap |= mask; +} + +/* clear share access for a given stateid */ +static inline void +clear_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap &= ~mask; +} + +/* test whether a given stateid has access */ +static inline bool +test_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + return (bool)(stp->st_access_bmap & mask); +} + +/* set share deny for a given stateid */ +static inline void +set_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap |= mask; +} + +/* clear share deny for a given stateid */ +static inline void +clear_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap &= ~mask; +} + +/* test whether a given stateid is denying specific access */ +static inline bool +test_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + return (bool)(stp->st_deny_bmap & mask); +} + +static int nfs4_access_to_omode(u32 access) +{ + switch (access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_READ: + return O_RDONLY; + case NFS4_SHARE_ACCESS_WRITE: + return O_WRONLY; + case NFS4_SHARE_ACCESS_BOTH: + return O_RDWR; + } + WARN_ON_ONCE(1); + return O_RDONLY; +} + +/* + * A stateid that had a deny mode associated with it is being released + * or downgraded. Recalculate the deny mode on the file. + */ +static void +recalculate_deny_mode(struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *stp; + + spin_lock(&fp->fi_lock); + fp->fi_share_deny = 0; + list_for_each_entry(stp, &fp->fi_stateids, st_perfile) + fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap); + spin_unlock(&fp->fi_lock); +} + +static void +reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + int i; + bool change = false; + + for (i = 1; i < 4; i++) { + if ((i & deny) != i) { + change = true; + clear_deny(i, stp); + } + } + + /* Recalculate per-file deny mode if there was a change */ + if (change) + recalculate_deny_mode(stp->st_stid.sc_file); +} + +/* release all access and file references for a given stateid */ +static void +release_all_access(struct nfs4_ol_stateid *stp) +{ + int i; + struct nfs4_file *fp = stp->st_stid.sc_file; + + if (fp && stp->st_deny_bmap != 0) + recalculate_deny_mode(fp); + + for (i = 1; i < 4; i++) { + if (test_access(i, stp)) + nfs4_file_put_access(stp->st_stid.sc_file, i); + clear_access(i, stp); + } +} + +static void nfs4_put_stateowner(struct nfs4_stateowner *sop) +{ + struct nfs4_client *clp = sop->so_client; + + might_lock(&clp->cl_lock); + + if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock)) + return; + sop->so_ops->so_unhash(sop); + spin_unlock(&clp->cl_lock); + kfree(sop->so_owner.data); + sop->so_ops->so_free(sop); +} + +static void unhash_ol_stateid(struct nfs4_ol_stateid *stp) +{ + struct nfs4_file *fp = stp->st_stid.sc_file; + + lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock); + + spin_lock(&fp->fi_lock); + list_del(&stp->st_perfile); + spin_unlock(&fp->fi_lock); + list_del(&stp->st_perstateowner); +} + +static void nfs4_free_ol_stateid(struct nfs4_stid *stid) +{ + struct nfs4_ol_stateid *stp = openlockstateid(stid); + + put_clnt_odstate(stp->st_clnt_odstate); + release_all_access(stp); + if (stp->st_stateowner) + nfs4_put_stateowner(stp->st_stateowner); + kmem_cache_free(stateid_slab, stid); +} + +static void nfs4_free_lock_stateid(struct nfs4_stid *stid) +{ + struct nfs4_ol_stateid *stp = openlockstateid(stid); + struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); + struct file *file; + + file = find_any_file(stp->st_stid.sc_file); + if (file) + filp_close(file, (fl_owner_t)lo); + nfs4_free_ol_stateid(stid); +} + +/* + * Put the persistent reference to an already unhashed generic stateid, while + * holding the cl_lock. If it's the last reference, then put it onto the + * reaplist for later destruction. + */ +static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, + struct list_head *reaplist) +{ + struct nfs4_stid *s = &stp->st_stid; + struct nfs4_client *clp = s->sc_client; + + lockdep_assert_held(&clp->cl_lock); + + WARN_ON_ONCE(!list_empty(&stp->st_locks)); + + if (!atomic_dec_and_test(&s->sc_count)) { + wake_up_all(&close_wq); + return; + } + + idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + list_add(&stp->st_locks, reaplist); +} + +static void unhash_lock_stateid(struct nfs4_ol_stateid *stp) +{ + struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); + + lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); + + list_del_init(&stp->st_locks); + unhash_ol_stateid(stp); + nfs4_unhash_stid(&stp->st_stid); +} + +static void release_lock_stateid(struct nfs4_ol_stateid *stp) +{ + struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); + + spin_lock(&oo->oo_owner.so_client->cl_lock); + unhash_lock_stateid(stp); + spin_unlock(&oo->oo_owner.so_client->cl_lock); + nfs4_put_stid(&stp->st_stid); +} + +static void unhash_lockowner_locked(struct nfs4_lockowner *lo) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_del_init(&lo->lo_owner.so_strhash); +} + +/* + * Free a list of generic stateids that were collected earlier after being + * fully unhashed. + */ +static void +free_ol_stateid_reaplist(struct list_head *reaplist) +{ + struct nfs4_ol_stateid *stp; + struct nfs4_file *fp; + + might_sleep(); + + while (!list_empty(reaplist)) { + stp = list_first_entry(reaplist, struct nfs4_ol_stateid, + st_locks); + list_del(&stp->st_locks); + fp = stp->st_stid.sc_file; + stp->st_stid.sc_free(&stp->st_stid); + if (fp) + put_nfs4_file(fp); + } +} + +static void release_lockowner(struct nfs4_lockowner *lo) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfs4_ol_stateid *stp; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + + spin_lock(&clp->cl_lock); + unhash_lockowner_locked(lo); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + unhash_lock_stateid(stp); + put_ol_stateid_locked(stp, &reaplist); + } + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + nfs4_put_stateowner(&lo->lo_owner); +} + +static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, + struct list_head *reaplist) +{ + struct nfs4_ol_stateid *stp; + + while (!list_empty(&open_stp->st_locks)) { + stp = list_entry(open_stp->st_locks.next, + struct nfs4_ol_stateid, st_locks); + unhash_lock_stateid(stp); + put_ol_stateid_locked(stp, reaplist); + } +} + +static void unhash_open_stateid(struct nfs4_ol_stateid *stp, + struct list_head *reaplist) +{ + lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); + + unhash_ol_stateid(stp); + release_open_stateid_locks(stp, reaplist); +} + +static void release_open_stateid(struct nfs4_ol_stateid *stp) +{ + LIST_HEAD(reaplist); + + spin_lock(&stp->st_stid.sc_client->cl_lock); + unhash_open_stateid(stp, &reaplist); + put_ol_stateid_locked(stp, &reaplist); + spin_unlock(&stp->st_stid.sc_client->cl_lock); + free_ol_stateid_reaplist(&reaplist); +} + +static void unhash_openowner_locked(struct nfs4_openowner *oo) +{ + struct nfs4_client *clp = oo->oo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_del_init(&oo->oo_owner.so_strhash); + list_del_init(&oo->oo_perclient); +} + +static void release_last_closed_stateid(struct nfs4_openowner *oo) +{ + struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net, + nfsd_net_id); + struct nfs4_ol_stateid *s; + + spin_lock(&nn->client_lock); + s = oo->oo_last_closed_stid; + if (s) { + list_del_init(&oo->oo_close_lru); + oo->oo_last_closed_stid = NULL; + } + spin_unlock(&nn->client_lock); + if (s) + nfs4_put_stid(&s->st_stid); +} + +static void release_openowner(struct nfs4_openowner *oo) +{ + struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = oo->oo_owner.so_client; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + + spin_lock(&clp->cl_lock); + unhash_openowner_locked(oo); + while (!list_empty(&oo->oo_owner.so_stateids)) { + stp = list_first_entry(&oo->oo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + unhash_open_stateid(stp, &reaplist); + put_ol_stateid_locked(stp, &reaplist); + } + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + release_last_closed_stateid(oo); + nfs4_put_stateowner(&oo->oo_owner); +} + +static inline int +hash_sessionid(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; + + return sid->sequence % SESSION_HASH_SIZE; +} + +#ifdef CONFIG_SUNRPC_DEBUG +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ + u32 *ptr = (u32 *)(&sessionid->data[0]); + dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); +} +#else +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ +} +#endif + +/* + * Bump the seqid on cstate->replay_owner, and clear replay_owner if it + * won't be used for replay. + */ +void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr) +{ + struct nfs4_stateowner *so = cstate->replay_owner; + + if (nfserr == nfserr_replay_me) + return; + + if (!seqid_mutating_err(ntohl(nfserr))) { + nfsd4_cstate_clear_replay(cstate); + return; + } + if (!so) + return; + if (so->so_is_open_owner) + release_last_closed_stateid(openowner(so)); + so->so_seqid++; + return; +} + +static void +gen_sessionid(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_sessionid *sid; + + sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; + sid->clientid = clp->cl_clientid; + sid->sequence = current_sessionid++; + sid->reserved = 0; +} + +/* + * The protocol defines ca_maxresponssize_cached to include the size of + * the rpc header, but all we need to cache is the data starting after + * the end of the initial SEQUENCE operation--the rest we regenerate + * each time. Therefore we can advertise a ca_maxresponssize_cached + * value that is the number of bytes in our cache plus a few additional + * bytes. In order to stay on the safe side, and not promise more than + * we can cache, those additional bytes must be the minimum possible: 24 + * bytes of rpc header (xid through accept state, with AUTH_NULL + * verifier), 12 for the compound header (with zero-length tag), and 44 + * for the SEQUENCE op response: + */ +#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) + +static void +free_session_slots(struct nfsd4_session *ses) +{ + int i; + + for (i = 0; i < ses->se_fchannel.maxreqs; i++) + kfree(ses->se_slots[i]); +} + +/* + * We don't actually need to cache the rpc and session headers, so we + * can allocate a little less for each slot: + */ +static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) +{ + u32 size; + + if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) + size = 0; + else + size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + return size + sizeof(struct nfsd4_slot); +} + +/* + * XXX: If we run out of reserved DRC memory we could (up to a point) + * re-negotiate active sessions and reduce their slot usage to make + * room for new connections. For now we just fail the create session. + */ +static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) +{ + u32 slotsize = slot_bytes(ca); + u32 num = ca->maxreqs; + int avail; + + spin_lock(&nfsd_drc_lock); + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, + nfsd_drc_max_mem - nfsd_drc_mem_used); + num = min_t(int, num, avail / slotsize); + nfsd_drc_mem_used += num * slotsize; + spin_unlock(&nfsd_drc_lock); + + return num; +} + +static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) +{ + int slotsize = slot_bytes(ca); + + spin_lock(&nfsd_drc_lock); + nfsd_drc_mem_used -= slotsize * ca->maxreqs; + spin_unlock(&nfsd_drc_lock); +} + +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, + struct nfsd4_channel_attrs *battrs) +{ + int numslots = fattrs->maxreqs; + int slotsize = slot_bytes(fattrs); + struct nfsd4_session *new; + int mem, i; + + BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) + + sizeof(struct nfsd4_session) > PAGE_SIZE); + mem = numslots * sizeof(struct nfsd4_slot *); + + new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + if (!new) + return NULL; + /* allocate each struct nfsd4_slot and data cache in one piece */ + for (i = 0; i < numslots; i++) { + new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL); + if (!new->se_slots[i]) + goto out_free; + } + + memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); + memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs)); + + return new; +out_free: + while (i--) + kfree(new->se_slots[i]); + kfree(new); + return NULL; +} + +static void free_conn(struct nfsd4_conn *c) +{ + svc_xprt_put(c->cn_xprt); + kfree(c); +} + +static void nfsd4_conn_lost(struct svc_xpt_user *u) +{ + struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); + struct nfs4_client *clp = c->cn_session->se_client; + + spin_lock(&clp->cl_lock); + if (!list_empty(&c->cn_persession)) { + list_del(&c->cn_persession); + free_conn(c); + } + nfsd4_probe_callback(clp); + spin_unlock(&clp->cl_lock); +} + +static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) +{ + struct nfsd4_conn *conn; + + conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL); + if (!conn) + return NULL; + svc_xprt_get(rqstp->rq_xprt); + conn->cn_xprt = rqstp->rq_xprt; + conn->cn_flags = flags; + INIT_LIST_HEAD(&conn->cn_xpt_user.list); + return conn; +} + +static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + conn->cn_session = ses; + list_add(&conn->cn_persession, &ses->se_conns); +} + +static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + + spin_lock(&clp->cl_lock); + __nfsd4_hash_conn(conn, ses); + spin_unlock(&clp->cl_lock); +} + +static int nfsd4_register_conn(struct nfsd4_conn *conn) +{ + conn->cn_xpt_user.callback = nfsd4_conn_lost; + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); +} + +static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + int ret; + + nfsd4_hash_conn(conn, ses); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); + /* We may have gained or lost a callback channel: */ + nfsd4_probe_callback_sync(ses->se_client); +} + +static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses) +{ + u32 dir = NFS4_CDFC4_FORE; + + if (cses->flags & SESSION4_BACK_CHAN) + dir |= NFS4_CDFC4_BACK; + return alloc_conn(rqstp, dir); +} + +/* must be called under client_lock */ +static void nfsd4_del_conns(struct nfsd4_session *s) +{ + struct nfs4_client *clp = s->se_client; + struct nfsd4_conn *c; + + spin_lock(&clp->cl_lock); + while (!list_empty(&s->se_conns)) { + c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession); + list_del_init(&c->cn_persession); + spin_unlock(&clp->cl_lock); + + unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user); + free_conn(c); + + spin_lock(&clp->cl_lock); + } + spin_unlock(&clp->cl_lock); +} + +static void __free_session(struct nfsd4_session *ses) +{ + free_session_slots(ses); + kfree(ses); +} + +static void free_session(struct nfsd4_session *ses) +{ + nfsd4_del_conns(ses); + nfsd4_put_drc_mem(&ses->se_fchannel); + __free_session(ses); +} + +static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) +{ + int idx; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + new->se_client = clp; + gen_sessionid(new); + + INIT_LIST_HEAD(&new->se_conns); + + new->se_cb_seq_nr = 1; + new->se_flags = cses->flags; + new->se_cb_prog = cses->callback_prog; + new->se_cb_sec = cses->cb_sec; + atomic_set(&new->se_ref, 0); + idx = hash_sessionid(&new->se_sessionid); + list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); + spin_lock(&clp->cl_lock); + list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&clp->cl_lock); + + { + struct sockaddr *sa = svc_addr(rqstp); + /* + * This is a little silly; with sessions there's no real + * use for the callback address. Use the peer address + * as a reasonable default for now, but consider fixing + * the rpc client not to require an address in the + * future: + */ + rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); + clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); + } +} + +/* caller must hold client_lock */ +static struct nfsd4_session * +__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) +{ + struct nfsd4_session *elem; + int idx; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + dump_sessionid(__func__, sessionid); + idx = hash_sessionid(sessionid); + /* Search in the appropriate list */ + list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) { + if (!memcmp(elem->se_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + return elem; + } + } + + dprintk("%s: session not found\n", __func__); + return NULL; +} + +static struct nfsd4_session * +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net, + __be32 *ret) +{ + struct nfsd4_session *session; + __be32 status = nfserr_badsession; + + session = __find_in_sessionid_hashtbl(sessionid, net); + if (!session) + goto out; + status = nfsd4_get_session_locked(session); + if (status) + session = NULL; +out: + *ret = status; + return session; +} + +/* caller must hold client_lock */ +static void +unhash_session(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + list_del(&ses->se_hash); + spin_lock(&ses->se_client->cl_lock); + list_del(&ses->se_perclnt); + spin_unlock(&ses->se_client->cl_lock); +} + +/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ +static int +STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) +{ + /* + * We're assuming the clid was not given out from a boot + * precisely 2^32 (about 136 years) before this one. That seems + * a safe assumption: + */ + if (clid->cl_boot == (u32)nn->boot_time) + return 0; + dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", + clid->cl_boot, clid->cl_id, nn->boot_time); + return 1; +} + +/* + * XXX Should we use a slab cache ? + * This type of memory management is somewhat inefficient, but we use it + * anyway since SETCLIENTID is not a common operation. + */ +static struct nfs4_client *alloc_client(struct xdr_netobj name) +{ + struct nfs4_client *clp; + int i; + + clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); + if (clp == NULL) + return NULL; + clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); + if (clp->cl_name.data == NULL) + goto err_no_name; + clp->cl_ownerstr_hashtbl = kmalloc(sizeof(struct list_head) * + OWNER_HASH_SIZE, GFP_KERNEL); + if (!clp->cl_ownerstr_hashtbl) + goto err_no_hashtbl; + for (i = 0; i < OWNER_HASH_SIZE; i++) + INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]); + clp->cl_name.len = name.len; + INIT_LIST_HEAD(&clp->cl_sessions); + idr_init(&clp->cl_stateids); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_revoked); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&clp->cl_lo_states); +#endif + spin_lock_init(&clp->cl_lock); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + return clp; +err_no_hashtbl: + kfree(clp->cl_name.data); +err_no_name: + kfree(clp); + return NULL; +} + +static void +free_client(struct nfs4_client *clp) +{ + while (!list_empty(&clp->cl_sessions)) { + struct nfsd4_session *ses; + ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, + se_perclnt); + list_del(&ses->se_perclnt); + WARN_ON_ONCE(atomic_read(&ses->se_ref)); + free_session(ses); + } + rpc_destroy_wait_queue(&clp->cl_cb_waitq); + free_svc_cred(&clp->cl_cred); + kfree(clp->cl_ownerstr_hashtbl); + kfree(clp->cl_name.data); + idr_destroy(&clp->cl_stateids); + kfree(clp); +} + +/* must be called under the client_lock */ +static void +unhash_client_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd4_session *ses; + + lockdep_assert_held(&nn->client_lock); + + /* Mark the client as expired! */ + clp->cl_time = 0; + /* Make it invisible */ + if (!list_empty(&clp->cl_idhash)) { + list_del_init(&clp->cl_idhash); + if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + rb_erase(&clp->cl_namenode, &nn->conf_name_tree); + else + rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); + } + list_del_init(&clp->cl_lru); + spin_lock(&clp->cl_lock); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + list_del_init(&ses->se_hash); + spin_unlock(&clp->cl_lock); +} + +static void +unhash_client(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + unhash_client_locked(clp); + spin_unlock(&nn->client_lock); +} + +static __be32 mark_client_expired_locked(struct nfs4_client *clp) +{ + if (atomic_read(&clp->cl_refcount)) + return nfserr_jukebox; + unhash_client_locked(clp); + return nfs_ok; +} + +static void +__destroy_client(struct nfs4_client *clp) +{ + struct nfs4_openowner *oo; + struct nfs4_delegation *dp; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + spin_lock(&state_lock); + while (!list_empty(&clp->cl_delegations)) { + dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&state_lock); + while (!list_empty(&reaplist)) { + dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); + nfs4_put_stid(&dp->dl_stid); + } + while (!list_empty(&clp->cl_revoked)) { + dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + nfs4_put_stid(&dp->dl_stid); + } + while (!list_empty(&clp->cl_openowners)) { + oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); + nfs4_get_stateowner(&oo->oo_owner); + release_openowner(oo); + } + nfsd4_return_all_client_layouts(clp); + nfsd4_shutdown_callback(clp); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + free_client(clp); +} + +static void +destroy_client(struct nfs4_client *clp) +{ + unhash_client(clp); + __destroy_client(clp); +} + +static void expire_client(struct nfs4_client *clp) +{ + unhash_client(clp); + nfsd4_client_record_remove(clp); + __destroy_client(clp); +} + +static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) +{ + memcpy(target->cl_verifier.data, source->data, + sizeof(target->cl_verifier.data)); +} + +static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) +{ + target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; + target->cl_clientid.cl_id = source->cl_clientid.cl_id; +} + +static int copy_cred(struct svc_cred *target, struct svc_cred *source) +{ + if (source->cr_principal) { + target->cr_principal = + kstrdup(source->cr_principal, GFP_KERNEL); + if (target->cr_principal == NULL) + return -ENOMEM; + } else + target->cr_principal = NULL; + target->cr_flavor = source->cr_flavor; + target->cr_uid = source->cr_uid; + target->cr_gid = source->cr_gid; + target->cr_group_info = source->cr_group_info; + get_group_info(target->cr_group_info); + target->cr_gss_mech = source->cr_gss_mech; + if (source->cr_gss_mech) + gss_mech_get(source->cr_gss_mech); + return 0; +} + +static int +compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) +{ + if (o1->len < o2->len) + return -1; + if (o1->len > o2->len) + return 1; + return memcmp(o1->data, o2->data, o1->len); +} + +static int same_name(const char *n1, const char *n2) +{ + return 0 == memcmp(n1, n2, HEXDIR_LEN); +} + +static int +same_verf(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return 0 == memcmp(v1->data, v2->data, sizeof(v1->data)); +} + +static int +same_clid(clientid_t *cl1, clientid_t *cl2) +{ + return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); +} + +static bool groups_equal(struct group_info *g1, struct group_info *g2) +{ + int i; + + if (g1->ngroups != g2->ngroups) + return false; + for (i=0; ingroups; i++) + if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i))) + return false; + return true; +} + +/* + * RFC 3530 language requires clid_inuse be returned when the + * "principal" associated with a requests differs from that previously + * used. We use uid, gid's, and gss principal string as our best + * approximation. We also don't want to allow non-gss use of a client + * established using gss: in theory cr_principal should catch that + * change, but in practice cr_principal can be null even in the gss case + * since gssd doesn't always pass down a principal string. + */ +static bool is_gss_cred(struct svc_cred *cr) +{ + /* Is cr_flavor one of the gss "pseudoflavors"?: */ + return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR); +} + + +static bool +same_creds(struct svc_cred *cr1, struct svc_cred *cr2) +{ + if ((is_gss_cred(cr1) != is_gss_cred(cr2)) + || (!uid_eq(cr1->cr_uid, cr2->cr_uid)) + || (!gid_eq(cr1->cr_gid, cr2->cr_gid)) + || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) + return false; + if (cr1->cr_principal == cr2->cr_principal) + return true; + if (!cr1->cr_principal || !cr2->cr_principal) + return false; + return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); +} + +static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) +{ + struct svc_cred *cr = &rqstp->rq_cred; + u32 service; + + if (!cr->cr_gss_mech) + return false; + service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor); + return service == RPC_GSS_SVC_INTEGRITY || + service == RPC_GSS_SVC_PRIVACY; +} + +static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) +{ + struct svc_cred *cr = &rqstp->rq_cred; + + if (!cl->cl_mach_cred) + return true; + if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech) + return false; + if (!svc_rqst_integrity_protected(rqstp)) + return false; + if (!cr->cr_principal) + return false; + return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); +} + +static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn) +{ + __be32 verf[2]; + + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy + */ + verf[0] = (__force __be32)get_seconds(); + verf[1] = (__force __be32)nn->clientid_counter; + memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); +} + +static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) +{ + clp->cl_clientid.cl_boot = nn->boot_time; + clp->cl_clientid.cl_id = nn->clientid_counter++; + gen_confirm(clp, nn); +} + +static struct nfs4_stid * +find_stateid_locked(struct nfs4_client *cl, stateid_t *t) +{ + struct nfs4_stid *ret; + + ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id); + if (!ret || !ret->sc_type) + return NULL; + return ret; +} + +static struct nfs4_stid * +find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +{ + struct nfs4_stid *s; + + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, t); + if (s != NULL) { + if (typemask & s->sc_type) + atomic_inc(&s->sc_count); + else + s = NULL; + } + spin_unlock(&cl->cl_lock); + return s; +} + +static struct nfs4_client *create_client(struct xdr_netobj name, + struct svc_rqst *rqstp, nfs4_verifier *verf) +{ + struct nfs4_client *clp; + struct sockaddr *sa = svc_addr(rqstp); + int ret; + struct net *net = SVC_NET(rqstp); + + clp = alloc_client(name); + if (clp == NULL) + return NULL; + + ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); + if (ret) { + free_client(clp); + return NULL; + } + nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL); + clp->cl_time = get_seconds(); + clear_bit(0, &clp->cl_cb_slot_busy); + copy_verf(clp, verf); + rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); + clp->cl_cb_session = NULL; + clp->net = net; + return clp; +} + +static void +add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct nfs4_client *clp; + + while (*new) { + clp = rb_entry(*new, struct nfs4_client, cl_namenode); + parent = *new; + + if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_clp->cl_namenode, parent, new); + rb_insert_color(&new_clp->cl_namenode, root); +} + +static struct nfs4_client * +find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) +{ + int cmp; + struct rb_node *node = root->rb_node; + struct nfs4_client *clp; + + while (node) { + clp = rb_entry(node, struct nfs4_client, cl_namenode); + cmp = compare_blob(&clp->cl_name, name); + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + return clp; + } + return NULL; +} + +static void +add_to_unconfirmed(struct nfs4_client *clp) +{ + unsigned int idhashval; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + add_clp_to_name_tree(clp, &nn->unconf_name_tree); + idhashval = clientid_hashval(clp->cl_clientid.cl_id); + list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); + renew_client_locked(clp); +} + +static void +move_to_confirmed(struct nfs4_client *clp) +{ + unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); + list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); + rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); + add_clp_to_name_tree(clp, &nn->conf_name_tree); + set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + renew_client_locked(clp); +} + +static struct nfs4_client * +find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions) +{ + struct nfs4_client *clp; + unsigned int idhashval = clientid_hashval(clid->cl_id); + + list_for_each_entry(clp, &tbl[idhashval], cl_idhash) { + if (same_clid(&clp->cl_clientid, clid)) { + if ((bool)clp->cl_minorversion != sessions) + return NULL; + renew_client_locked(clp); + return clp; + } + } + return NULL; +} + +static struct nfs4_client * +find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) +{ + struct list_head *tbl = nn->conf_id_hashtbl; + + lockdep_assert_held(&nn->client_lock); + return find_client_in_id_table(tbl, clid, sessions); +} + +static struct nfs4_client * +find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) +{ + struct list_head *tbl = nn->unconf_id_hashtbl; + + lockdep_assert_held(&nn->client_lock); + return find_client_in_id_table(tbl, clid, sessions); +} + +static bool clp_used_exchangeid(struct nfs4_client *clp) +{ + return clp->cl_exchange_flags != 0; +} + +static struct nfs4_client * +find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) +{ + lockdep_assert_held(&nn->client_lock); + return find_clp_in_name_tree(name, &nn->conf_name_tree); +} + +static struct nfs4_client * +find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) +{ + lockdep_assert_held(&nn->client_lock); + return find_clp_in_name_tree(name, &nn->unconf_name_tree); +} + +static void +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) +{ + struct nfs4_cb_conn *conn = &clp->cl_cb_conn; + struct sockaddr *sa = svc_addr(rqstp); + u32 scopeid = rpc_get_scope_id(sa); + unsigned short expected_family; + + /* Currently, we only support tcp and tcp6 for the callback channel */ + if (se->se_callback_netid_len == 3 && + !memcmp(se->se_callback_netid_val, "tcp", 3)) + expected_family = AF_INET; + else if (se->se_callback_netid_len == 4 && + !memcmp(se->se_callback_netid_val, "tcp6", 4)) + expected_family = AF_INET6; + else + goto out_err; + + conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val, + se->se_callback_addr_len, + (struct sockaddr *)&conn->cb_addr, + sizeof(conn->cb_addr)); + + if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family) + goto out_err; + + if (conn->cb_addr.ss_family == AF_INET6) + ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid; + + conn->cb_prog = se->se_callback_prog; + conn->cb_ident = se->se_callback_ident; + memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); + return; +out_err: + conn->cb_addr.ss_family = AF_UNSPEC; + conn->cb_addrlen = 0; + dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " + "will not receive delegations\n", + clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); + + return; +} + +/* + * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. + */ +static void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +{ + struct xdr_buf *buf = resp->xdr.buf; + struct nfsd4_slot *slot = resp->cstate.slot; + unsigned int base; + + dprintk("--> %s slot %p\n", __func__, slot); + + slot->sl_opcnt = resp->opcnt; + slot->sl_status = resp->cstate.status; + + slot->sl_flags |= NFSD4_SLOT_INITIALIZED; + if (nfsd4_not_cached(resp)) { + slot->sl_datalen = 0; + return; + } + base = resp->cstate.data_offset; + slot->sl_datalen = buf->len - base; + if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) + WARN("%s: sessions DRC could not cache compound\n", __func__); + return; +} + +/* + * Encode the replay sequence operation from the slot values. + * If cachethis is FALSE encode the uncached rep error on the next + * operation which sets resp->p and increments resp->opcnt for + * nfs4svc_encode_compoundres. + * + */ +static __be32 +nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + struct nfsd4_slot *slot = resp->cstate.slot; + + /* Encode the replayed sequence operation */ + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); + + /* Return nfserr_retry_uncached_rep in next operation. */ + if (args->opcnt > 1 && !(slot->sl_flags & NFSD4_SLOT_CACHETHIS)) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } + return op->status; +} + +/* + * The sequence operation is not cached because we can use the slot and + * session values. + */ +static __be32 +nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq) +{ + struct nfsd4_slot *slot = resp->cstate.slot; + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + __be32 status; + + dprintk("--> %s slot %p\n", __func__, slot); + + status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); + if (status) + return status; + + p = xdr_reserve_space(xdr, slot->sl_datalen); + if (!p) { + WARN_ON_ONCE(1); + return nfserr_serverfault; + } + xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen); + xdr_commit_encode(xdr); + + resp->opcnt = slot->sl_opcnt; + return slot->sl_status; +} + +/* + * Set the exchange_id flags returned by the server. + */ +static void +nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) +{ +#ifdef CONFIG_NFSD_PNFS + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; +#else + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; +#endif + + /* Referrals are supported, Migration is not. */ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; + + /* set the wire flags to return to client. */ + clid->flags = new->cl_exchange_flags; +} + +static bool client_has_state(struct nfs4_client *clp) +{ + /* + * Note clp->cl_openowners check isn't quite right: there's no + * need to count owners without stateid's. + * + * Also note we should probably be using this in 4.0 case too. + */ + return !list_empty(&clp->cl_openowners) + || !list_empty(&clp->cl_delegations) + || !list_empty(&clp->cl_sessions); +} + +__be32 +nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_exchange_id *exid) +{ + struct nfs4_client *conf, *new; + struct nfs4_client *unconf = NULL; + __be32 status; + char addr_str[INET6_ADDRSTRLEN]; + nfs4_verifier verf = exid->verifier; + struct sockaddr *sa = svc_addr(rqstp); + bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + rpc_ntop(sa, addr_str, sizeof(addr_str)); + dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " + "ip_addr=%s flags %x, spa_how %d\n", + __func__, rqstp, exid, exid->clname.len, exid->clname.data, + addr_str, exid->flags, exid->spa_how); + + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) + return nfserr_inval; + + switch (exid->spa_how) { + case SP4_MACH_CRED: + if (!svc_rqst_integrity_protected(rqstp)) + return nfserr_inval; + case SP4_NONE: + break; + default: /* checked by xdr code */ + WARN_ON_ONCE(1); + case SP4_SSV: + return nfserr_encr_alg_unsupp; + } + + new = create_client(exid->clname, rqstp, &verf); + if (new == NULL) + return nfserr_jukebox; + + /* Cases below refer to rfc 5661 section 18.35.4: */ + spin_lock(&nn->client_lock); + conf = find_confirmed_client_by_name(&exid->clname, nn); + if (conf) { + bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); + bool verfs_match = same_verf(&verf, &conf->cl_verifier); + + if (update) { + if (!clp_used_exchangeid(conf)) { /* buggy client */ + status = nfserr_inval; + goto out; + } + if (!mach_creds_match(conf, rqstp)) { + status = nfserr_wrong_cred; + goto out; + } + if (!creds_match) { /* case 9 */ + status = nfserr_perm; + goto out; + } + if (!verfs_match) { /* case 8 */ + status = nfserr_not_same; + goto out; + } + /* case 6 */ + exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + goto out_copy; + } + if (!creds_match) { /* case 3 */ + if (client_has_state(conf)) { + status = nfserr_clid_inuse; + goto out; + } + goto out_new; + } + if (verfs_match) { /* case 2 */ + conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + goto out_copy; + } + /* case 5, client reboot */ + conf = NULL; + goto out_new; + } + + if (update) { /* case 7 */ + status = nfserr_noent; + goto out; + } + + unconf = find_unconfirmed_client_by_name(&exid->clname, nn); + if (unconf) /* case 4, possible retry or client restart */ + unhash_client_locked(unconf); + + /* case 1 (normal case) */ +out_new: + if (conf) { + status = mark_client_expired_locked(conf); + if (status) + goto out; + } + new->cl_minorversion = cstate->minorversion; + new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED); + + gen_clid(new, nn); + add_to_unconfirmed(new); + swap(new, conf); +out_copy: + exid->clientid.cl_boot = conf->cl_clientid.cl_boot; + exid->clientid.cl_id = conf->cl_clientid.cl_id; + + exid->seqid = conf->cl_cs_slot.sl_seqid + 1; + nfsd4_set_ex_flags(conf, exid); + + dprintk("nfsd4_exchange_id seqid %d flags %x\n", + conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags); + status = nfs_ok; + +out: + spin_unlock(&nn->client_lock); + if (new) + expire_client(new); + if (unconf) + expire_client(unconf); + return status; +} + +static __be32 +check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) +{ + dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, + slot_seqid); + + /* The slot is in use, and no response has been sent. */ + if (slot_inuse) { + if (seqid == slot_seqid) + return nfserr_jukebox; + else + return nfserr_seq_misordered; + } + /* Note unsigned 32-bit arithmetic handles wraparound: */ + if (likely(seqid == slot_seqid + 1)) + return nfs_ok; + if (seqid == slot_seqid) + return nfserr_replay_cache; + return nfserr_seq_misordered; +} + +/* + * Cache the create session result into the create session single DRC + * slot cache by saving the xdr structure. sl_seqid has been set. + * Do this for solo or embedded create session operations. + */ +static void +nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot, __be32 nfserr) +{ + slot->sl_status = nfserr; + memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); +} + +static __be32 +nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot) +{ + memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); + return slot->sl_status; +} + +#define NFSD_MIN_REQ_HDR_SEQ_SZ ((\ + 2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \ + 1 + /* MIN tag is length with zero, only length */ \ + 3 + /* version, opcount, opcode */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + /* seqid, slotID, slotID, cache */ \ + 4 ) * sizeof(__be32)) + +#define NFSD_MIN_RESP_HDR_SEQ_SZ ((\ + 2 + /* verifier: AUTH_NULL, length 0 */\ + 1 + /* status */ \ + 1 + /* MIN tag is length with zero, only length */ \ + 3 + /* opcount, opcode, opstatus*/ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + /* seqid, slotID, slotID, slotID, status */ \ + 5 ) * sizeof(__be32)) + +static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) +{ + u32 maxrpc = nn->nfsd_serv->sv_max_mesg; + + if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ) + return nfserr_toosmall; + if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ) + return nfserr_toosmall; + ca->headerpadsz = 0; + ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); + ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); + ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); + ca->maxresp_cached = min_t(u32, ca->maxresp_cached, + NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); + ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); + /* + * Note decreasing slot size below client's request may make it + * difficult for client to function correctly, whereas + * decreasing the number of slots will (just?) affect + * performance. When short on memory we therefore prefer to + * decrease number of slots instead of their size. Clients that + * request larger slots than they need will get poor results: + */ + ca->maxreqs = nfsd4_get_drc_mem(ca); + if (!ca->maxreqs) + return nfserr_jukebox; + + return nfs_ok; +} + +#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ + RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32)) +#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ + RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32)) + +static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) +{ + ca->headerpadsz = 0; + + /* + * These RPC_MAX_HEADER macros are overkill, especially since we + * don't even do gss on the backchannel yet. But this is still + * less than 1k. Tighten up this estimate in the unlikely event + * it turns out to be a problem for some client: + */ + if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) + return nfserr_toosmall; + if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) + return nfserr_toosmall; + ca->maxresp_cached = 0; + if (ca->maxops < 2) + return nfserr_toosmall; + + return nfs_ok; +} + +static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs) +{ + switch (cbs->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + return nfs_ok; + default: + /* + * GSS case: the spec doesn't allow us to return this + * error. But it also doesn't allow us not to support + * GSS. + * I'd rather this fail hard than return some error the + * client might think it can already handle: + */ + return nfserr_encr_alg_unsupp; + } +} + +__be32 +nfsd4_create_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_create_session *cr_ses) +{ + struct sockaddr *sa = svc_addr(rqstp); + struct nfs4_client *conf, *unconf; + struct nfs4_client *old = NULL; + struct nfsd4_session *new; + struct nfsd4_conn *conn; + struct nfsd4_clid_slot *cs_slot = NULL; + __be32 status = 0; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) + return nfserr_inval; + status = nfsd4_check_cb_sec(&cr_ses->cb_sec); + if (status) + return status; + status = check_forechannel_attrs(&cr_ses->fore_channel, nn); + if (status) + return status; + status = check_backchannel_attrs(&cr_ses->back_channel); + if (status) + goto out_release_drc_mem; + status = nfserr_jukebox; + new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); + if (!new) + goto out_release_drc_mem; + conn = alloc_conn_from_crses(rqstp, cr_ses); + if (!conn) + goto out_free_session; + + spin_lock(&nn->client_lock); + unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); + conf = find_confirmed_client(&cr_ses->clientid, true, nn); + WARN_ON_ONCE(conf && unconf); + + if (conf) { + status = nfserr_wrong_cred; + if (!mach_creds_match(conf, rqstp)) + goto out_free_conn; + cs_slot = &conf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status == nfserr_replay_cache) { + status = nfsd4_replay_create_session(cr_ses, cs_slot); + goto out_free_conn; + } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { + status = nfserr_seq_misordered; + goto out_free_conn; + } + } else if (unconf) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || + !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { + status = nfserr_clid_inuse; + goto out_free_conn; + } + status = nfserr_wrong_cred; + if (!mach_creds_match(unconf, rqstp)) + goto out_free_conn; + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status) { + /* an unconfirmed replay returns misordered */ + status = nfserr_seq_misordered; + goto out_free_conn; + } + old = find_confirmed_client_by_name(&unconf->cl_name, nn); + if (old) { + status = mark_client_expired_locked(old); + if (status) { + old = NULL; + goto out_free_conn; + } + } + move_to_confirmed(unconf); + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out_free_conn; + } + status = nfs_ok; + /* + * We do not support RDMA or persistent sessions + */ + cr_ses->flags &= ~SESSION4_PERSIST; + cr_ses->flags &= ~SESSION4_RDMA; + + init_session(rqstp, new, conf, cr_ses); + nfsd4_get_session_locked(new); + + memcpy(cr_ses->sessionid.data, new->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + + /* cache solo and embedded create sessions under the client_lock */ + nfsd4_cache_create_session(cr_ses, cs_slot, status); + spin_unlock(&nn->client_lock); + /* init connection and backchannel */ + nfsd4_init_conn(rqstp, conn, new); + nfsd4_put_session(new); + if (old) + expire_client(old); + return status; +out_free_conn: + spin_unlock(&nn->client_lock); + free_conn(conn); + if (old) + expire_client(old); +out_free_session: + __free_session(new); +out_release_drc_mem: + nfsd4_put_drc_mem(&cr_ses->fore_channel); + return status; +} + +static __be32 nfsd4_map_bcts_dir(u32 *dir) +{ + switch (*dir) { + case NFS4_CDFC4_FORE: + case NFS4_CDFC4_BACK: + return nfs_ok; + case NFS4_CDFC4_FORE_OR_BOTH: + case NFS4_CDFC4_BACK_OR_BOTH: + *dir = NFS4_CDFC4_BOTH; + return nfs_ok; + }; + return nfserr_inval; +} + +__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc) +{ + struct nfsd4_session *session = cstate->session; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 status; + + status = nfsd4_check_cb_sec(&bc->bc_cb_sec); + if (status) + return status; + spin_lock(&nn->client_lock); + session->se_cb_prog = bc->bc_cb_program; + session->se_cb_sec = bc->bc_cb_sec; + spin_unlock(&nn->client_lock); + + nfsd4_probe_callback(session->se_client); + + return nfs_ok; +} + +__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 status; + struct nfsd4_conn *conn; + struct nfsd4_session *session; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nfsd4_last_compound_op(rqstp)) + return nfserr_not_only_op; + spin_lock(&nn->client_lock); + session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status); + spin_unlock(&nn->client_lock); + if (!session) + goto out_no_session; + status = nfserr_wrong_cred; + if (!mach_creds_match(session->se_client, rqstp)) + goto out; + status = nfsd4_map_bcts_dir(&bcts->dir); + if (status) + goto out; + conn = alloc_conn(rqstp, bcts->dir); + status = nfserr_jukebox; + if (!conn) + goto out; + nfsd4_init_conn(rqstp, conn, session); + status = nfs_ok; +out: + nfsd4_put_session(session); +out_no_session: + return status; +} + +static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) +{ + if (!session) + return 0; + return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); +} + +__be32 +nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, + struct nfsd4_destroy_session *sessionid) +{ + struct nfsd4_session *ses; + __be32 status; + int ref_held_by_me = 0; + struct net *net = SVC_NET(r); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + status = nfserr_not_only_op; + if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { + if (!nfsd4_last_compound_op(r)) + goto out; + ref_held_by_me++; + } + dump_sessionid(__func__, &sessionid->sessionid); + spin_lock(&nn->client_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid, net, &status); + if (!ses) + goto out_client_lock; + status = nfserr_wrong_cred; + if (!mach_creds_match(ses->se_client, r)) + goto out_put_session; + status = mark_session_dead_locked(ses, 1 + ref_held_by_me); + if (status) + goto out_put_session; + unhash_session(ses); + spin_unlock(&nn->client_lock); + + nfsd4_probe_callback_sync(ses->se_client); + + spin_lock(&nn->client_lock); + status = nfs_ok; +out_put_session: + nfsd4_put_session_locked(ses); +out_client_lock: + spin_unlock(&nn->client_lock); +out: + return status; +} + +static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s) +{ + struct nfsd4_conn *c; + + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_xprt == xpt) { + return c; + } + } + return NULL; +} + +static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_conn *c; + __be32 status = nfs_ok; + int ret; + + spin_lock(&clp->cl_lock); + c = __nfsd4_find_conn(new->cn_xprt, ses); + if (c) + goto out_free; + status = nfserr_conn_not_bound_to_session; + if (clp->cl_mach_cred) + goto out_free; + __nfsd4_hash_conn(new, ses); + spin_unlock(&clp->cl_lock); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); + return nfs_ok; +out_free: + spin_unlock(&clp->cl_lock); + free_conn(new); + return status; +} + +static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + return args->opcnt > session->se_fchannel.maxops; +} + +static bool nfsd4_request_too_big(struct svc_rqst *rqstp, + struct nfsd4_session *session) +{ + struct xdr_buf *xb = &rqstp->rq_arg; + + return xb->len > session->se_fchannel.maxreq_sz; +} + +__be32 +nfsd4_sequence(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_sequence *seq) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_session *session; + struct nfs4_client *clp; + struct nfsd4_slot *slot; + struct nfsd4_conn *conn; + __be32 status; + int buflen; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (resp->opcnt != 1) + return nfserr_sequence_pos; + + /* + * Will be either used or freed by nfsd4_sequence_check_conn + * below. + */ + conn = alloc_conn(rqstp, NFS4_CDFC4_FORE); + if (!conn) + return nfserr_jukebox; + + spin_lock(&nn->client_lock); + session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status); + if (!session) + goto out_no_session; + clp = session->se_client; + + status = nfserr_too_many_ops; + if (nfsd4_session_too_many_ops(rqstp, session)) + goto out_put_session; + + status = nfserr_req_too_big; + if (nfsd4_request_too_big(rqstp, session)) + goto out_put_session; + + status = nfserr_badslot; + if (seq->slotid >= session->se_fchannel.maxreqs) + goto out_put_session; + + slot = session->se_slots[seq->slotid]; + dprintk("%s: slotid %d\n", __func__, seq->slotid); + + /* We do not negotiate the number of slots yet, so set the + * maxslots to the session maxreqs which is used to encode + * sr_highest_slotid and the sr_target_slot id to maxslots */ + seq->maxslots = session->se_fchannel.maxreqs; + + status = check_slot_seqid(seq->seqid, slot->sl_seqid, + slot->sl_flags & NFSD4_SLOT_INUSE); + if (status == nfserr_replay_cache) { + status = nfserr_seq_misordered; + if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) + goto out_put_session; + cstate->slot = slot; + cstate->session = session; + cstate->clp = clp; + /* Return the cached reply status and set cstate->status + * for nfsd4_proc_compound processing */ + status = nfsd4_replay_cache_entry(resp, seq); + cstate->status = nfserr_replay_cache; + goto out; + } + if (status) + goto out_put_session; + + status = nfsd4_sequence_check_conn(conn, session); + conn = NULL; + if (status) + goto out_put_session; + + buflen = (seq->cachethis) ? + session->se_fchannel.maxresp_cached : + session->se_fchannel.maxresp_sz; + status = (seq->cachethis) ? nfserr_rep_too_big_to_cache : + nfserr_rep_too_big; + if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) + goto out_put_session; + svc_reserve(rqstp, buflen); + + status = nfs_ok; + /* Success! bump slot seqid */ + slot->sl_seqid = seq->seqid; + slot->sl_flags |= NFSD4_SLOT_INUSE; + if (seq->cachethis) + slot->sl_flags |= NFSD4_SLOT_CACHETHIS; + else + slot->sl_flags &= ~NFSD4_SLOT_CACHETHIS; + + cstate->slot = slot; + cstate->session = session; + cstate->clp = clp; + +out: + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } + if (!list_empty(&clp->cl_revoked)) + seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; +out_no_session: + if (conn) + free_conn(conn); + spin_unlock(&nn->client_lock); + return status; +out_put_session: + nfsd4_put_session_locked(session); + goto out_no_session; +} + +void +nfsd4_sequence_done(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compound_state *cs = &resp->cstate; + + if (nfsd4_has_session(cs)) { + if (cs->status != nfserr_replay_cache) { + nfsd4_store_cache_entry(resp); + cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; + } + /* Drop session reference that was taken in nfsd4_sequence() */ + nfsd4_put_session(cs->session); + } else if (cs->clp) + put_client_renew(cs->clp); +} + +__be32 +nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) +{ + struct nfs4_client *conf, *unconf; + struct nfs4_client *clp = NULL; + __be32 status = 0; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + spin_lock(&nn->client_lock); + unconf = find_unconfirmed_client(&dc->clientid, true, nn); + conf = find_confirmed_client(&dc->clientid, true, nn); + WARN_ON_ONCE(conf && unconf); + + if (conf) { + if (client_has_state(conf)) { + status = nfserr_clientid_busy; + goto out; + } + status = mark_client_expired_locked(conf); + if (status) + goto out; + clp = conf; + } else if (unconf) + clp = unconf; + else { + status = nfserr_stale_clientid; + goto out; + } + if (!mach_creds_match(clp, rqstp)) { + clp = NULL; + status = nfserr_wrong_cred; + goto out; + } + unhash_client_locked(clp); +out: + spin_unlock(&nn->client_lock); + if (clp) + expire_client(clp); + return status; +} + +__be32 +nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) +{ + __be32 status = 0; + + if (rc->rca_one_fs) { + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + /* + * We don't take advantage of the rca_one_fs case. + * That's OK, it's optional, we can safely ignore it. + */ + return nfs_ok; + } + + status = nfserr_complete_already; + if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, + &cstate->session->se_client->cl_flags)) + goto out; + + status = nfserr_stale_clientid; + if (is_client_expired(cstate->session->se_client)) + /* + * The following error isn't really legal. + * But we only get here if the client just explicitly + * destroyed the client. Surely it no longer cares what + * error it gets back on an operation for the dead + * client. + */ + goto out; + + status = nfs_ok; + nfsd4_client_record_create(cstate->session->se_client); +out: + return status; +} + +__be32 +nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid *setclid) +{ + struct xdr_netobj clname = setclid->se_name; + nfs4_verifier clverifier = setclid->se_verf; + struct nfs4_client *conf, *new; + struct nfs4_client *unconf = NULL; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + new = create_client(clname, rqstp, &clverifier); + if (new == NULL) + return nfserr_jukebox; + /* Cases below refer to rfc 3530 section 14.2.33: */ + spin_lock(&nn->client_lock); + conf = find_confirmed_client_by_name(&clname, nn); + if (conf) { + /* case 0: */ + status = nfserr_clid_inuse; + if (clp_used_exchangeid(conf)) + goto out; + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, + sizeof(addr_str)); + dprintk("NFSD: setclientid: string in use by client " + "at %s\n", addr_str); + goto out; + } + } + unconf = find_unconfirmed_client_by_name(&clname, nn); + if (unconf) + unhash_client_locked(unconf); + if (conf && same_verf(&conf->cl_verifier, &clverifier)) + /* case 1: probable callback update */ + copy_clid(new, conf); + else /* case 4 (new client) or cases 2, 3 (client reboot): */ + gen_clid(new, nn); + new->cl_minorversion = 0; + gen_callback(new, setclid, rqstp); + add_to_unconfirmed(new); + setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; + setclid->se_clientid.cl_id = new->cl_clientid.cl_id; + memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); + new = NULL; + status = nfs_ok; +out: + spin_unlock(&nn->client_lock); + if (new) + free_client(new); + if (unconf) + expire_client(unconf); + return status; +} + + +__be32 +nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid_confirm *setclientid_confirm) +{ + struct nfs4_client *conf, *unconf; + struct nfs4_client *old = NULL; + nfs4_verifier confirm = setclientid_confirm->sc_confirm; + clientid_t * clid = &setclientid_confirm->sc_clientid; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if (STALE_CLIENTID(clid, nn)) + return nfserr_stale_clientid; + + spin_lock(&nn->client_lock); + conf = find_confirmed_client(clid, false, nn); + unconf = find_unconfirmed_client(clid, false, nn); + /* + * We try hard to give out unique clientid's, so if we get an + * attempt to confirm the same clientid with a different cred, + * there's a bug somewhere. Let's charitably assume it's our + * bug. + */ + status = nfserr_serverfault; + if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) + goto out; + if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) + goto out; + /* cases below refer to rfc 3530 section 14.2.34: */ + if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { + if (conf && !unconf) /* case 2: probable retransmit */ + status = nfs_ok; + else /* case 4: client hasn't noticed we rebooted yet? */ + status = nfserr_stale_clientid; + goto out; + } + status = nfs_ok; + if (conf) { /* case 1: callback update */ + old = unconf; + unhash_client_locked(old); + nfsd4_change_callback(conf, &unconf->cl_cb_conn); + } else { /* case 3: normal case; new or rebooted client */ + old = find_confirmed_client_by_name(&unconf->cl_name, nn); + if (old) { + status = mark_client_expired_locked(old); + if (status) { + old = NULL; + goto out; + } + } + move_to_confirmed(unconf); + conf = unconf; + } + get_client_locked(conf); + spin_unlock(&nn->client_lock); + nfsd4_probe_callback(conf); + spin_lock(&nn->client_lock); + put_client_renew_locked(conf); +out: + spin_unlock(&nn->client_lock); + if (old) + expire_client(old); + return status; +} + +static struct nfs4_file *nfsd4_alloc_file(void) +{ + return kmem_cache_alloc(file_slab, GFP_KERNEL); +} + +/* OPEN Share state helper functions */ +static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, + struct nfs4_file *fp) +{ + lockdep_assert_held(&state_lock); + + atomic_set(&fp->fi_ref, 1); + spin_lock_init(&fp->fi_lock); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + INIT_LIST_HEAD(&fp->fi_clnt_odstate); + fh_copy_shallow(&fp->fi_fhandle, fh); + fp->fi_deleg_file = NULL; + fp->fi_had_conflict = false; + fp->fi_share_deny = 0; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&fp->fi_lo_states); + atomic_set(&fp->fi_lo_recalls, 0); +#endif + hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); +} + +void +nfsd4_free_slabs(void) +{ + kmem_cache_destroy(odstate_slab); + kmem_cache_destroy(openowner_slab); + kmem_cache_destroy(lockowner_slab); + kmem_cache_destroy(file_slab); + kmem_cache_destroy(stateid_slab); + kmem_cache_destroy(deleg_slab); +} + +int +nfsd4_init_slabs(void) +{ + openowner_slab = kmem_cache_create("nfsd4_openowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (openowner_slab == NULL) + goto out; + lockowner_slab = kmem_cache_create("nfsd4_lockowners", + sizeof(struct nfs4_lockowner), 0, 0, NULL); + if (lockowner_slab == NULL) + goto out_free_openowner_slab; + file_slab = kmem_cache_create("nfsd4_files", + sizeof(struct nfs4_file), 0, 0, NULL); + if (file_slab == NULL) + goto out_free_lockowner_slab; + stateid_slab = kmem_cache_create("nfsd4_stateids", + sizeof(struct nfs4_ol_stateid), 0, 0, NULL); + if (stateid_slab == NULL) + goto out_free_file_slab; + deleg_slab = kmem_cache_create("nfsd4_delegations", + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_free_stateid_slab; + odstate_slab = kmem_cache_create("nfsd4_odstate", + sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + if (odstate_slab == NULL) + goto out_free_deleg_slab; + return 0; + +out_free_deleg_slab: + kmem_cache_destroy(deleg_slab); +out_free_stateid_slab: + kmem_cache_destroy(stateid_slab); +out_free_file_slab: + kmem_cache_destroy(file_slab); +out_free_lockowner_slab: + kmem_cache_destroy(lockowner_slab); +out_free_openowner_slab: + kmem_cache_destroy(openowner_slab); +out: + dprintk("nfsd4: out of memory while initializing nfsv4\n"); + return -ENOMEM; +} + +static void init_nfs4_replay(struct nfs4_replay *rp) +{ + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; + mutex_init(&rp->rp_mutex); +} + +static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, + struct nfs4_stateowner *so) +{ + if (!nfsd4_has_session(cstate)) { + mutex_lock(&so->so_replay.rp_mutex); + cstate->replay_owner = nfs4_get_stateowner(so); + } +} + +void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) +{ + struct nfs4_stateowner *so = cstate->replay_owner; + + if (so != NULL) { + cstate->replay_owner = NULL; + mutex_unlock(&so->so_replay.rp_mutex); + nfs4_put_stateowner(so); + } +} + +static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) +{ + struct nfs4_stateowner *sop; + + sop = kmem_cache_alloc(slab, GFP_KERNEL); + if (!sop) + return NULL; + + sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); + if (!sop->so_owner.data) { + kmem_cache_free(slab, sop); + return NULL; + } + sop->so_owner.len = owner->len; + + INIT_LIST_HEAD(&sop->so_stateids); + sop->so_client = clp; + init_nfs4_replay(&sop->so_replay); + atomic_set(&sop->so_count, 1); + return sop; +} + +static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) +{ + lockdep_assert_held(&clp->cl_lock); + + list_add(&oo->oo_owner.so_strhash, + &clp->cl_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_perclient, &clp->cl_openowners); +} + +static void nfs4_unhash_openowner(struct nfs4_stateowner *so) +{ + unhash_openowner_locked(openowner(so)); +} + +static void nfs4_free_openowner(struct nfs4_stateowner *so) +{ + struct nfs4_openowner *oo = openowner(so); + + kmem_cache_free(openowner_slab, oo); +} + +static const struct nfs4_stateowner_operations openowner_ops = { + .so_unhash = nfs4_unhash_openowner, + .so_free = nfs4_free_openowner, +}; + +static struct nfs4_openowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, + struct nfsd4_compound_state *cstate) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_openowner *oo, *ret; + + oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!oo) + return NULL; + oo->oo_owner.so_ops = &openowner_ops; + oo->oo_owner.so_is_open_owner = 1; + oo->oo_owner.so_seqid = open->op_seqid; + oo->oo_flags = 0; + if (nfsd4_has_session(cstate)) + oo->oo_flags |= NFS4_OO_CONFIRMED; + oo->oo_time = 0; + oo->oo_last_closed_stid = NULL; + INIT_LIST_HEAD(&oo->oo_close_lru); + spin_lock(&clp->cl_lock); + ret = find_openstateowner_str_locked(strhashval, open, clp); + if (ret == NULL) { + hash_openowner(oo, clp, strhashval); + ret = oo; + } else + nfs4_free_openowner(&oo->oo_owner); + spin_unlock(&clp->cl_lock); + return ret; +} + +static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { + struct nfs4_openowner *oo = open->op_openowner; + + atomic_inc(&stp->st_stid.sc_count); + stp->st_stid.sc_type = NFS4_OPEN_STID; + INIT_LIST_HEAD(&stp->st_locks); + stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); + get_nfs4_file(fp); + stp->st_stid.sc_file = fp; + stp->st_access_bmap = 0; + stp->st_deny_bmap = 0; + stp->st_openstp = NULL; + spin_lock(&oo->oo_owner.so_client->cl_lock); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); + spin_lock(&fp->fi_lock); + list_add(&stp->st_perfile, &fp->fi_stateids); + spin_unlock(&fp->fi_lock); + spin_unlock(&oo->oo_owner.so_client->cl_lock); +} + +/* + * In the 4.0 case we need to keep the owners around a little while to handle + * CLOSE replay. We still do need to release any file access that is held by + * them before returning however. + */ +static void +move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) +{ + struct nfs4_ol_stateid *last; + struct nfs4_openowner *oo = openowner(s->st_stateowner); + struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net, + nfsd_net_id); + + dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); + + /* + * We know that we hold one reference via nfsd4_close, and another + * "persistent" reference for the client. If the refcount is higher + * than 2, then there are still calls in progress that are using this + * stateid. We can't put the sc_file reference until they are finished. + * Wait for the refcount to drop to 2. Since it has been unhashed, + * there should be no danger of the refcount going back up again at + * this point. + */ + wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2); + + release_all_access(s); + if (s->st_stid.sc_file) { + put_nfs4_file(s->st_stid.sc_file); + s->st_stid.sc_file = NULL; + } + + spin_lock(&nn->client_lock); + last = oo->oo_last_closed_stid; + oo->oo_last_closed_stid = s; + list_move_tail(&oo->oo_close_lru, &nn->close_lru); + oo->oo_time = get_seconds(); + spin_unlock(&nn->client_lock); + if (last) + nfs4_put_stid(&last->st_stid); +} + +/* search file_hashtbl[] for file */ +static struct nfs4_file * +find_file_locked(struct knfsd_fh *fh, unsigned int hashval) +{ + struct nfs4_file *fp; + + hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { + if (fh_match(&fp->fi_fhandle, fh)) { + if (atomic_inc_not_zero(&fp->fi_ref)) + return fp; + } + } + return NULL; +} + +struct nfs4_file * +find_file(struct knfsd_fh *fh) +{ + struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + return fp; +} + +static struct nfs4_file * +find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) +{ + struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + if (fp) + return fp; + + spin_lock(&state_lock); + fp = find_file_locked(fh, hashval); + if (likely(fp == NULL)) { + nfsd4_init_file(fh, hashval, new); + fp = new; + } + spin_unlock(&state_lock); + + return fp; +} + +/* + * Called to check deny when READ with all zero stateid or + * WRITE with all zero or all one stateid + */ +static __be32 +nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) +{ + struct nfs4_file *fp; + __be32 ret = nfs_ok; + + fp = find_file(¤t_fh->fh_handle); + if (!fp) + return ret; + /* Check for conflicting share reservations */ + spin_lock(&fp->fi_lock); + if (fp->fi_share_deny & deny_type) + ret = nfserr_locked; + spin_unlock(&fp->fi_lock); + put_nfs4_file(fp); + return ret; +} + +static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, + nfsd_net_id); + + block_delegations(&dp->dl_stid.sc_file->fi_fhandle); + + /* + * We can't do this in nfsd_break_deleg_cb because it is + * already holding inode->i_lock. + * + * If the dl_time != 0, then we know that it has already been + * queued for a lease break. Don't queue it again. + */ + spin_lock(&state_lock); + if (dp->dl_time == 0) { + dp->dl_time = get_seconds(); + list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); + } + spin_unlock(&state_lock); +} + +static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + + switch (task->tk_status) { + case 0: + return 1; + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* + * Race: client probably got cb_recall before open reply + * granting delegation. + */ + if (dp->dl_retries--) { + rpc_delay(task, 2 * HZ); + return 0; + } + /*FALLTHRU*/ + default: + return -1; + } +} + +static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + + nfs4_put_stid(&dp->dl_stid); +} + +static struct nfsd4_callback_ops nfsd4_cb_recall_ops = { + .prepare = nfsd4_cb_recall_prepare, + .done = nfsd4_cb_recall_done, + .release = nfsd4_cb_recall_release, +}; + +static void nfsd_break_one_deleg(struct nfs4_delegation *dp) +{ + /* + * We're assuming the state code never drops its reference + * without first removing the lease. Since we're in this lease + * callback (and since the lease code is serialized by the kernel + * lock) we know the server hasn't removed the lease yet, we know + * it's safe to take a reference. + */ + atomic_inc(&dp->dl_stid.sc_count); + nfsd4_run_cb(&dp->dl_recall); +} + +/* Called from break_lease() with i_lock held. */ +static bool +nfsd_break_deleg_cb(struct file_lock *fl) +{ + bool ret = false; + struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; + struct nfs4_delegation *dp; + + if (!fp) { + WARN(1, "(%p)->fl_owner NULL\n", fl); + return ret; + } + if (fp->fi_had_conflict) { + WARN(1, "duplicate break on %p\n", fp); + return ret; + } + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if a delegation isn't returned + * in time: + */ + fl->fl_break_time = 0; + + spin_lock(&fp->fi_lock); + fp->fi_had_conflict = true; + /* + * If there are no delegations on the list, then return true + * so that the lease code will go ahead and delete it. + */ + if (list_empty(&fp->fi_delegations)) + ret = true; + else + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + nfsd_break_one_deleg(dp); + spin_unlock(&fp->fi_lock); + return ret; +} + +static int +nfsd_change_deleg_cb(struct file_lock *onlist, int arg, + struct list_head *dispose) +{ + if (arg & F_UNLCK) + return lease_modify(onlist, arg, dispose); + else + return -EAGAIN; +} + +static const struct lock_manager_operations nfsd_lease_mng_ops = { + .lm_break = nfsd_break_deleg_cb, + .lm_change = nfsd_change_deleg_cb, +}; + +static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) +{ + if (nfsd4_has_session(cstate)) + return nfs_ok; + if (seqid == so->so_seqid - 1) + return nfserr_replay_me; + if (seqid == so->so_seqid) + return nfs_ok; + return nfserr_bad_seqid; +} + +static __be32 lookup_clientid(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) +{ + struct nfs4_client *found; + + if (cstate->clp) { + found = cstate->clp; + if (!same_clid(&found->cl_clientid, clid)) + return nfserr_stale_clientid; + return nfs_ok; + } + + if (STALE_CLIENTID(clid, nn)) + return nfserr_stale_clientid; + + /* + * For v4.1+ we get the client in the SEQUENCE op. If we don't have one + * cached already then we know this is for is for v4.0 and "sessions" + * will be false. + */ + WARN_ON_ONCE(cstate->session); + spin_lock(&nn->client_lock); + found = find_confirmed_client(clid, false, nn); + if (!found) { + spin_unlock(&nn->client_lock); + return nfserr_expired; + } + atomic_inc(&found->cl_refcount); + spin_unlock(&nn->client_lock); + + /* Cache the nfs4_client in cstate! */ + cstate->clp = found; + return nfs_ok; +} + +__be32 +nfsd4_process_open1(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open, struct nfsd_net *nn) +{ + clientid_t *clientid = &open->op_clientid; + struct nfs4_client *clp = NULL; + unsigned int strhashval; + struct nfs4_openowner *oo = NULL; + __be32 status; + + if (STALE_CLIENTID(&open->op_clientid, nn)) + return nfserr_stale_clientid; + /* + * In case we need it later, after we've already created the + * file and don't want to risk a further failure: + */ + open->op_file = nfsd4_alloc_file(); + if (open->op_file == NULL) + return nfserr_jukebox; + + status = lookup_clientid(clientid, cstate, nn); + if (status) + return status; + clp = cstate->clp; + + strhashval = ownerstr_hashval(&open->op_owner); + oo = find_openstateowner_str(strhashval, open, clp); + open->op_openowner = oo; + if (!oo) { + goto new_owner; + } + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { + /* Replace unconfirmed owners without checking for replay. */ + release_openowner(oo); + open->op_openowner = NULL; + goto new_owner; + } + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + if (status) + return status; + goto alloc_stateid; +new_owner: + oo = alloc_init_open_stateowner(strhashval, open, cstate); + if (oo == NULL) + return nfserr_jukebox; + open->op_openowner = oo; +alloc_stateid: + open->op_stp = nfs4_alloc_open_stateid(clp); + if (!open->op_stp) + return nfserr_jukebox; + + if (nfsd4_has_session(cstate) && + (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) { + open->op_odstate = alloc_clnt_odstate(clp); + if (!open->op_odstate) + return nfserr_jukebox; + } + + return nfs_ok; +} + +static inline __be32 +nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) +{ + if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) + return nfserr_openmode; + else + return nfs_ok; +} + +static int share_access_to_flags(u32 share_access) +{ + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; +} + +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) +{ + struct nfs4_stid *ret; + + ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + if (!ret) + return NULL; + return delegstateid(ret); +} + +static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) +{ + return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; +} + +static __be32 +nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, + struct nfs4_delegation **dp) +{ + int flags; + __be32 status = nfserr_bad_stateid; + struct nfs4_delegation *deleg; + + deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); + if (deleg == NULL) + goto out; + flags = share_access_to_flags(open->op_share_access); + status = nfs4_check_delegmode(deleg, flags); + if (status) { + nfs4_put_stid(&deleg->dl_stid); + goto out; + } + *dp = deleg; +out: + if (!nfsd4_is_deleg_cur(open)) + return nfs_ok; + if (status) + return status; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + return nfs_ok; +} + +static struct nfs4_ol_stateid * +nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *local, *ret = NULL; + struct nfs4_openowner *oo = open->op_openowner; + + spin_lock(&fp->fi_lock); + list_for_each_entry(local, &fp->fi_stateids, st_perfile) { + /* ignore lock owners */ + if (local->st_stateowner->so_is_open_owner == 0) + continue; + if (local->st_stateowner == &oo->oo_owner) { + ret = local; + atomic_inc(&ret->st_stid.sc_count); + break; + } + } + spin_unlock(&fp->fi_lock); + return ret; +} + +static inline int nfs4_access_to_access(u32 nfs4_access) +{ + int flags = 0; + + if (nfs4_access & NFS4_SHARE_ACCESS_READ) + flags |= NFSD_MAY_READ; + if (nfs4_access & NFS4_SHARE_ACCESS_WRITE) + flags |= NFSD_MAY_WRITE; + return flags; +} + +static inline __be32 +nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, + struct nfsd4_open *open) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = 0, + }; + if (!open->op_truncate) + return 0; + if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) + return nfserr_inval; + return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); +} + +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, + struct nfsd4_open *open) +{ + struct file *filp = NULL; + __be32 status; + int oflag = nfs4_access_to_omode(open->op_share_access); + int access = nfs4_access_to_access(open->op_share_access); + unsigned char old_access_bmap, old_deny_bmap; + + spin_lock(&fp->fi_lock); + + /* + * Are we trying to set a deny mode that would conflict with + * current access? + */ + status = nfs4_file_check_deny(fp, open->op_share_deny); + if (status != nfs_ok) { + spin_unlock(&fp->fi_lock); + goto out; + } + + /* set access to the file */ + status = nfs4_file_get_access(fp, open->op_share_access); + if (status != nfs_ok) { + spin_unlock(&fp->fi_lock); + goto out; + } + + /* Set access bits in stateid */ + old_access_bmap = stp->st_access_bmap; + set_access(open->op_share_access, stp); + + /* Set new deny mask */ + old_deny_bmap = stp->st_deny_bmap; + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + + if (!fp->fi_fds[oflag]) { + spin_unlock(&fp->fi_lock); + status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp); + if (status) + goto out_put_access; + spin_lock(&fp->fi_lock); + if (!fp->fi_fds[oflag]) { + fp->fi_fds[oflag] = filp; + filp = NULL; + } + } + spin_unlock(&fp->fi_lock); + if (filp) + fput(filp); + + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status) + goto out_put_access; +out: + return status; +out_put_access: + stp->st_access_bmap = old_access_bmap; + nfs4_file_put_access(fp, open->op_share_access); + reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp); + goto out; +} + +static __be32 +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +{ + __be32 status; + unsigned char old_deny_bmap; + + if (!test_access(open->op_share_access, stp)) + return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); + + /* test and set deny mode */ + spin_lock(&fp->fi_lock); + status = nfs4_file_check_deny(fp, open->op_share_deny); + if (status == nfs_ok) { + old_deny_bmap = stp->st_deny_bmap; + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= + (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + } + spin_unlock(&fp->fi_lock); + + if (status != nfs_ok) + return status; + + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status != nfs_ok) + reset_union_bmap_deny(old_deny_bmap, stp); + return status; +} + +static void +nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session) +{ + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; +} + +/* Should we give out recallable state?: */ +static bool nfsd4_cb_channel_good(struct nfs4_client *clp) +{ + if (clp->cl_cb_state == NFSD4_CB_UP) + return true; + /* + * In the sessions case, since we don't have to establish a + * separate connection for callbacks, we assume it's OK + * until we hear otherwise: + */ + return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; +} + +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) +{ + struct file_lock *fl; + + fl = locks_alloc_lock(); + if (!fl) + return NULL; + fl->fl_lmops = &nfsd_lease_mng_ops; + fl->fl_flags = FL_DELEG; + fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = (fl_owner_t)fp; + fl->fl_pid = current->tgid; + return fl; +} + +static int nfs4_setlease(struct nfs4_delegation *dp) +{ + struct nfs4_file *fp = dp->dl_stid.sc_file; + struct file_lock *fl, *ret; + struct file *filp; + int status = 0; + + fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ); + if (!fl) + return -ENOMEM; + filp = find_readable_file(fp); + if (!filp) { + /* We should always have a readable file here */ + WARN_ON_ONCE(1); + return -EBADF; + } + fl->fl_file = filp; + ret = fl; + status = vfs_setlease(filp, fl->fl_type, &fl, NULL); + if (fl) + locks_free_lock(fl); + if (status) + goto out_fput; + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + /* Did the lease get broken before we took the lock? */ + status = -EAGAIN; + if (fp->fi_had_conflict) + goto out_unlock; + /* Race breaker */ + if (fp->fi_deleg_file) { + status = 0; + ++fp->fi_delegees; + hash_delegation_locked(dp, fp); + goto out_unlock; + } + fp->fi_deleg_file = filp; + fp->fi_delegees = 1; + hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + return 0; +out_unlock: + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); +out_fput: + fput(filp); + return status; +} + +static struct nfs4_delegation * +nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, + struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) +{ + int status; + struct nfs4_delegation *dp; + + if (fp->fi_had_conflict) + return ERR_PTR(-EAGAIN); + + dp = alloc_init_deleg(clp, fh, odstate); + if (!dp) + return ERR_PTR(-ENOMEM); + + get_nfs4_file(fp); + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + dp->dl_stid.sc_file = fp; + if (!fp->fi_deleg_file) { + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + status = nfs4_setlease(dp); + goto out; + } + if (fp->fi_had_conflict) { + status = -EAGAIN; + goto out_unlock; + } + ++fp->fi_delegees; + hash_delegation_locked(dp, fp); + status = 0; +out_unlock: + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); +out: + if (status) { + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_put_stid(&dp->dl_stid); + return ERR_PTR(status); + } + return dp; +} + +static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) +{ + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + if (status == -EAGAIN) + open->op_why_no_deleg = WND4_CONTENTION; + else { + open->op_why_no_deleg = WND4_RESOURCE; + switch (open->op_deleg_want) { + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + break; + case NFS4_SHARE_WANT_CANCEL: + open->op_why_no_deleg = WND4_CANCELLED; + break; + case NFS4_SHARE_WANT_NO_DELEG: + WARN_ON_ONCE(1); + } + } +} + +/* + * Attempt to hand out a delegation. + * + * Note we don't support write delegations, and won't until the vfs has + * proper support for them. + */ +static void +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, + struct nfs4_ol_stateid *stp) +{ + struct nfs4_delegation *dp; + struct nfs4_openowner *oo = openowner(stp->st_stateowner); + struct nfs4_client *clp = stp->st_stid.sc_client; + int cb_up; + int status = 0; + + cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); + open->op_recall = 0; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_PREVIOUS: + if (!cb_up) + open->op_recall = 1; + if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ) + goto out_no_deleg; + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + /* + * Let's not give out any delegations till everyone's + * had the chance to reclaim theirs.... + */ + if (locks_in_grace(clp->net)) + goto out_no_deleg; + if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) + goto out_no_deleg; + /* + * Also, if the file was opened for write or + * create, there's a good chance the client's + * about to write to it, resulting in an + * immediate recall (since we don't support + * write delegations): + */ + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + goto out_no_deleg; + if (open->op_create == NFS4_OPEN_CREATE) + goto out_no_deleg; + break; + default: + goto out_no_deleg; + } + dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate); + if (IS_ERR(dp)) + goto out_no_deleg; + + memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); + + dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", + STATEID_VAL(&dp->dl_stid.sc_stateid)); + open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + nfs4_put_stid(&dp->dl_stid); + return; +out_no_deleg: + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; + if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && + open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) { + dprintk("NFSD: WARNING: refusing delegation reclaim\n"); + open->op_recall = 1; + } + + /* 4.1 client asking for a delegation? */ + if (open->op_deleg_want) + nfsd4_open_deleg_none_ext(open, status); + return; +} + +static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, + struct nfs4_delegation *dp) +{ + if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG && + dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; + } else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG && + dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; + } + /* Otherwise the client must be confused wanting a delegation + * it already has, therefore we don't return + * NFS4_OPEN_DELEGATE_NONE_EXT and reason. + */ +} + +__be32 +nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; + struct nfs4_file *fp = NULL; + struct nfs4_ol_stateid *stp = NULL; + struct nfs4_delegation *dp = NULL; + __be32 status; + + /* + * Lookup file; if found, lookup stateid and check open request, + * and check for delegations in the process of being recalled. + * If not found, create the nfs4_file struct + */ + fp = find_or_add_file(open->op_file, ¤t_fh->fh_handle); + if (fp != open->op_file) { + status = nfs4_check_deleg(cl, open, &dp); + if (status) + goto out; + stp = nfsd4_find_existing_open(fp, open); + } else { + open->op_file = NULL; + status = nfserr_bad_stateid; + if (nfsd4_is_deleg_cur(open)) + goto out; + } + + /* + * OPEN the file, or upgrade an existing OPEN. + * If truncate fails, the OPEN fails. + */ + if (stp) { + /* Stateid was found, this is an OPEN upgrade */ + status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); + if (status) + goto out; + } else { + stp = open->op_stp; + open->op_stp = NULL; + init_open_stateid(stp, fp, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + if (status) { + release_open_stateid(stp); + goto out; + } + + stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp, + open->op_odstate); + if (stp->st_clnt_odstate == open->op_odstate) + open->op_odstate = NULL; + } + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + + if (nfsd4_has_session(&resp->cstate)) { + if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_WANTED; + goto nodeleg; + } + } + + /* + * Attempt to hand out a delegation. No error return, because the + * OPEN succeeds even if we fail. + */ + nfs4_open_delegation(current_fh, open, stp); +nodeleg: + status = nfs_ok; + + dprintk("%s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(&stp->st_stid.sc_stateid)); +out: + /* 4.1 client trying to upgrade/downgrade delegation? */ + if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp && + open->op_deleg_want) + nfsd4_deleg_xgrade_none_ext(open, dp); + + if (fp) + put_nfs4_file(fp); + if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + nfs4_set_claim_prev(open, nfsd4_has_session(&resp->cstate)); + /* + * To finish the open response, we just need to set the rflags. + */ + open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; + if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && + !nfsd4_has_session(&resp->cstate)) + open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + if (dp) + nfs4_put_stid(&dp->dl_stid); + if (stp) + nfs4_put_stid(&stp->st_stid); + + return status; +} + +void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) +{ + if (open->op_openowner) { + struct nfs4_stateowner *so = &open->op_openowner->oo_owner; + + nfsd4_cstate_assign_replay(cstate, so); + nfs4_put_stateowner(so); + } + if (open->op_file) + kmem_cache_free(file_slab, open->op_file); + if (open->op_stp) + nfs4_put_stid(&open->op_stp->st_stid); + if (open->op_odstate) + kmem_cache_free(odstate_slab, open->op_odstate); +} + +__be32 +nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + clientid_t *clid) +{ + struct nfs4_client *clp; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + dprintk("process_renew(%08x/%08x): starting\n", + clid->cl_boot, clid->cl_id); + status = lookup_clientid(clid, cstate, nn); + if (status) + goto out; + clp = cstate->clp; + status = nfserr_cb_path_down; + if (!list_empty(&clp->cl_delegations) + && clp->cl_cb_state != NFSD4_CB_UP) + goto out; + status = nfs_ok; +out: + return status; +} + +void +nfsd4_end_grace(struct nfsd_net *nn) +{ + /* do nothing if grace period already ended */ + if (nn->grace_ended) + return; + + dprintk("NFSD: end of grace period\n"); + nn->grace_ended = true; + /* + * If the server goes down again right now, an NFSv4 + * client will still be allowed to reclaim after it comes back up, + * even if it hasn't yet had a chance to reclaim state this time. + * + */ + nfsd4_record_grace_done(nn); + /* + * At this point, NFSv4 clients can still reclaim. But if the + * server crashes, any that have not yet reclaimed will be out + * of luck on the next boot. + * + * (NFSv4.1+ clients are considered to have reclaimed once they + * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to + * have reclaimed after their first OPEN.) + */ + locks_end_grace(&nn->nfsd4_manager); + /* + * At this point, and once lockd and/or any other containers + * exit their grace period, further reclaims will fail and + * regular locking can resume. + */ +} + +static time_t +nfs4_laundromat(struct nfsd_net *nn) +{ + struct nfs4_client *clp; + struct nfs4_openowner *oo; + struct nfs4_delegation *dp; + struct nfs4_ol_stateid *stp; + struct list_head *pos, *next, reaplist; + time_t cutoff = get_seconds() - nn->nfsd4_lease; + time_t t, new_timeo = nn->nfsd4_lease; + + dprintk("NFSD: laundromat service - starting\n"); + nfsd4_end_grace(nn); + INIT_LIST_HEAD(&reaplist); + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { + t = clp->cl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + if (mark_client_expired_locked(clp)) { + dprintk("NFSD: client in use (clientid %08x)\n", + clp->cl_clientid.cl_id); + continue; + } + list_add(&clp->cl_lru, &reaplist); + } + spin_unlock(&nn->client_lock); + list_for_each_safe(pos, next, &reaplist) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + dprintk("NFSD: purging unused client (clientid %08x)\n", + clp->cl_clientid.cl_id); + list_del_init(&clp->cl_lru); + expire_client(clp); + } + spin_lock(&state_lock); + list_for_each_safe(pos, next, &nn->del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) + continue; + if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { + t = dp->dl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&state_lock); + while (!list_empty(&reaplist)) { + dp = list_first_entry(&reaplist, struct nfs4_delegation, + dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + revoke_delegation(dp); + } + + spin_lock(&nn->client_lock); + while (!list_empty(&nn->close_lru)) { + oo = list_first_entry(&nn->close_lru, struct nfs4_openowner, + oo_close_lru); + if (time_after((unsigned long)oo->oo_time, + (unsigned long)cutoff)) { + t = oo->oo_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + list_del_init(&oo->oo_close_lru); + stp = oo->oo_last_closed_stid; + oo->oo_last_closed_stid = NULL; + spin_unlock(&nn->client_lock); + nfs4_put_stid(&stp->st_stid); + spin_lock(&nn->client_lock); + } + spin_unlock(&nn->client_lock); + + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); + return new_timeo; +} + +static struct workqueue_struct *laundry_wq; +static void laundromat_main(struct work_struct *); + +static void +laundromat_main(struct work_struct *laundry) +{ + time_t t; + struct delayed_work *dwork = container_of(laundry, struct delayed_work, + work); + struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + laundromat_work); + + t = nfs4_laundromat(nn); + dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); + queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); +} + +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) +{ + if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) + return nfserr_bad_stateid; + return nfs_ok; +} + +static inline int +access_permit_read(struct nfs4_ol_stateid *stp) +{ + return test_access(NFS4_SHARE_ACCESS_READ, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp) || + test_access(NFS4_SHARE_ACCESS_WRITE, stp); +} + +static inline int +access_permit_write(struct nfs4_ol_stateid *stp) +{ + return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp); +} + +static +__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) +{ + __be32 status = nfserr_openmode; + + /* For lock stateid's, we test the parent open, not the lock: */ + if (stp->st_openstp) + stp = stp->st_openstp; + if ((flags & WR_STATE) && !access_permit_write(stp)) + goto out; + if ((flags & RD_STATE) && !access_permit_read(stp)) + goto out; + status = nfs_ok; +out: + return status; +} + +static inline __be32 +check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, int flags) +{ + if (ONE_STATEID(stateid) && (flags & RD_STATE)) + return nfs_ok; + else if (locks_in_grace(net)) { + /* Answer in remaining cases depends on existence of + * conflicting state; so we must wait out the grace period. */ + return nfserr_grace; + } else if (flags & WR_STATE) + return nfs4_share_conflict(current_fh, + NFS4_SHARE_DENY_WRITE); + else /* (flags & RD_STATE) && ZERO_STATEID(stateid) */ + return nfs4_share_conflict(current_fh, + NFS4_SHARE_DENY_READ); +} + +/* + * Allow READ/WRITE during grace period on recovered state only for files + * that are not able to provide mandatory locking. + */ +static inline int +grace_disallows_io(struct net *net, struct inode *inode) +{ + return locks_in_grace(net) && mandatory_lock(inode); +} + +/* Returns true iff a is later than b: */ +static bool stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)(a->si_generation - b->si_generation) > 0; +} + +static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) +{ + /* + * When sessions are used the stateid generation number is ignored + * when it is zero. + */ + if (has_session && in->si_generation == 0) + return nfs_ok; + + if (in->si_generation == ref->si_generation) + return nfs_ok; + + /* If the client sends us a stateid from the future, it's buggy: */ + if (stateid_generation_after(in, ref)) + return nfserr_bad_stateid; + /* + * However, we could see a stateid from the past, even from a + * non-buggy client. For example, if the client sends a lock + * while some IO is outstanding, the lock may bump si_generation + * while the IO is still in flight. The client could avoid that + * situation by waiting for responses on all the IO requests, + * but better performance may result in retrying IO that + * receives an old_stateid error if requests are rarely + * reordered in flight: + */ + return nfserr_old_stateid; +} + +static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) +{ + if (ols->st_stateowner->so_is_open_owner && + !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; + return nfs_ok; +} + +static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) +{ + struct nfs4_stid *s; + __be32 status = nfserr_bad_stateid; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return status; + /* Client debugging aid. */ + if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str, + sizeof(addr_str)); + pr_warn_ratelimited("NFSD: client %s testing state ID " + "with incorrect client ID\n", addr_str); + return status; + } + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, stateid); + if (!s) + goto out_unlock; + status = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (status) + goto out_unlock; + switch (s->sc_type) { + case NFS4_DELEG_STID: + status = nfs_ok; + break; + case NFS4_REVOKED_DELEG_STID: + status = nfserr_deleg_revoked; + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + status = nfsd4_check_openowner_confirmed(openlockstateid(s)); + break; + default: + printk("unknown stateid type %x\n", s->sc_type); + /* Fallthrough */ + case NFS4_CLOSED_STID: + case NFS4_CLOSED_DELEG_STID: + status = nfserr_bad_stateid; + } +out_unlock: + spin_unlock(&cl->cl_lock); + return status; +} + +__be32 +nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, + stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, struct nfsd_net *nn) +{ + __be32 status; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); + if (status == nfserr_stale_clientid) { + if (cstate->session) + return nfserr_bad_stateid; + return nfserr_stale_stateid; + } + if (status) + return status; + *s = find_stateid_by_type(cstate->clp, stateid, typemask); + if (!*s) + return nfserr_bad_stateid; + return nfs_ok; +} + +/* +* Checks for stateid operations +*/ +__be32 +nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filpp) +{ + struct nfs4_stid *s; + struct nfs4_ol_stateid *stp = NULL; + struct nfs4_delegation *dp = NULL; + struct svc_fh *current_fh = &cstate->current_fh; + struct inode *ino = d_inode(current_fh->fh_dentry); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct file *file = NULL; + __be32 status; + + if (filpp) + *filpp = NULL; + + if (grace_disallows_io(net, ino)) + return nfserr_grace; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return check_special_stateids(net, current_fh, stateid, flags); + + status = nfsd4_lookup_stateid(cstate, stateid, + NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, + &s, nn); + if (status) + return status; + status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); + if (status) + goto out; + switch (s->sc_type) { + case NFS4_DELEG_STID: + dp = delegstateid(s); + status = nfs4_check_delegmode(dp, flags); + if (status) + goto out; + if (filpp) { + file = dp->dl_stid.sc_file->fi_deleg_file; + if (!file) { + WARN_ON_ONCE(1); + status = nfserr_serverfault; + goto out; + } + get_file(file); + } + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + stp = openlockstateid(s); + status = nfs4_check_fh(current_fh, stp); + if (status) + goto out; + status = nfsd4_check_openowner_confirmed(stp); + if (status) + goto out; + status = nfs4_check_openmode(stp, flags); + if (status) + goto out; + if (filpp) { + struct nfs4_file *fp = stp->st_stid.sc_file; + + if (flags & RD_STATE) + file = find_readable_file(fp); + else + file = find_writeable_file(fp); + } + break; + default: + status = nfserr_bad_stateid; + goto out; + } + status = nfs_ok; + if (file) + *filpp = file; +out: + nfs4_put_stid(s); + return status; +} + +/* + * Test if the stateid is valid + */ +__be32 +nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_test_stateid *test_stateid) +{ + struct nfsd4_test_stateid_id *stateid; + struct nfs4_client *cl = cstate->session->se_client; + + list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) + stateid->ts_id_status = + nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); + + return nfs_ok; +} + +__be32 +nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_free_stateid *free_stateid) +{ + stateid_t *stateid = &free_stateid->fr_stateid; + struct nfs4_stid *s; + struct nfs4_delegation *dp; + struct nfs4_ol_stateid *stp; + struct nfs4_client *cl = cstate->session->se_client; + __be32 ret = nfserr_bad_stateid; + + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, stateid); + if (!s) + goto out_unlock; + switch (s->sc_type) { + case NFS4_DELEG_STID: + ret = nfserr_locks_held; + break; + case NFS4_OPEN_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + break; + ret = nfserr_locks_held; + break; + case NFS4_LOCK_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + break; + stp = openlockstateid(s); + ret = nfserr_locks_held; + if (check_for_locks(stp->st_stid.sc_file, + lockowner(stp->st_stateowner))) + break; + unhash_lock_stateid(stp); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; + case NFS4_REVOKED_DELEG_STID: + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; + /* Default falls through and returns nfserr_bad_stateid */ + } +out_unlock: + spin_unlock(&cl->cl_lock); +out: + return ret; +} + +static inline int +setlkflg (int type) +{ + return (type == NFS4_READW_LT || type == NFS4_READ_LT) ? + RD_STATE : WR_STATE; +} + +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + struct nfs4_stateowner *sop = stp->st_stateowner; + __be32 status; + + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + if (stp->st_stid.sc_type == NFS4_CLOSED_STID + || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) + /* + * "Closed" stateid's exist *only* to return + * nfserr_replay_me from the previous step, and + * revoked delegations are kept only for free_stateid. + */ + return nfserr_bad_stateid; + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (status) + return status; + return nfs4_check_fh(current_fh, stp); +} + +/* + * Checks for sequence id mutating operations. + */ +static __be32 +nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, char typemask, + struct nfs4_ol_stateid **stpp, + struct nfsd_net *nn) +{ + __be32 status; + struct nfs4_stid *s; + struct nfs4_ol_stateid *stp = NULL; + + dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, + seqid, STATEID_VAL(stateid)); + + *stpp = NULL; + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); + if (status) + return status; + stp = openlockstateid(s); + nfsd4_cstate_assign_replay(cstate, stp->st_stateowner); + + status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp); + if (!status) + *stpp = stp; + else + nfs4_put_stid(&stp->st_stid); + return status; +} + +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) +{ + __be32 status; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; + + status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, + NFS4_OPEN_STID, &stp, nn); + if (status) + return status; + oo = openowner(stp->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { + nfs4_put_stid(&stp->st_stid); + return nfserr_bad_stateid; + } + *stpp = stp; + return nfs_ok; +} + +__be32 +nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open_confirm *oc) +{ + __be32 status; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + dprintk("NFSD: nfsd4_open_confirm on file %pd\n", + cstate->current_fh.fh_dentry); + + status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); + if (status) + return status; + + status = nfs4_preprocess_seqid_op(cstate, + oc->oc_seqid, &oc->oc_req_stateid, + NFS4_OPEN_STID, &stp, nn); + if (status) + goto out; + oo = openowner(stp->st_stateowner); + status = nfserr_bad_stateid; + if (oo->oo_flags & NFS4_OO_CONFIRMED) + goto put_stateid; + oo->oo_flags |= NFS4_OO_CONFIRMED; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); + + nfsd4_client_record_create(oo->oo_owner.so_client); + status = nfs_ok; +put_stateid: + nfs4_put_stid(&stp->st_stid); +out: + nfsd4_bump_seqid(cstate, status); + return status; +} + +static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) +{ + if (!test_access(access, stp)) + return; + nfs4_file_put_access(stp->st_stid.sc_file, access); + clear_access(access, stp); +} + +static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) +{ + switch (to_access) { + case NFS4_SHARE_ACCESS_READ: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_WRITE: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + WARN_ON_ONCE(1); + } +} + +__be32 +nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_open_downgrade *od) +{ + __be32 status; + struct nfs4_ol_stateid *stp; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + dprintk("NFSD: nfsd4_open_downgrade on file %pd\n", + cstate->current_fh.fh_dentry); + + /* We don't yet support WANT bits: */ + if (od->od_deleg_want) + dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__, + od->od_deleg_want); + + status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, + &od->od_stateid, &stp, nn); + if (status) + goto out; + status = nfserr_inval; + if (!test_access(od->od_share_access, stp)) { + dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n", + stp->st_access_bmap, od->od_share_access); + goto put_stateid; + } + if (!test_deny(od->od_share_deny, stp)) { + dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n", + stp->st_deny_bmap, od->od_share_deny); + goto put_stateid; + } + nfs4_stateid_downgrade(stp, od->od_share_access); + + reset_union_bmap_deny(od->od_share_deny, stp); + + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + status = nfs_ok; +put_stateid: + nfs4_put_stid(&stp->st_stid); +out: + nfsd4_bump_seqid(cstate, status); + return status; +} + +static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +{ + struct nfs4_client *clp = s->st_stid.sc_client; + LIST_HEAD(reaplist); + + s->st_stid.sc_type = NFS4_CLOSED_STID; + spin_lock(&clp->cl_lock); + unhash_open_stateid(s, &reaplist); + + if (clp->cl_minorversion) { + put_ol_stateid_locked(s, &reaplist); + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + } else { + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + move_to_close_lru(s, clp->net); + } +} + +/* + * nfs4_unlock_state() called after encode + */ +__be32 +nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_close *close) +{ + __be32 status; + struct nfs4_ol_stateid *stp; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("NFSD: nfsd4_close on file %pd\n", + cstate->current_fh.fh_dentry); + + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, + NFS4_OPEN_STID|NFS4_CLOSED_STID, + &stp, nn); + nfsd4_bump_seqid(cstate, status); + if (status) + goto out; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + + nfsd4_close_open_stateid(stp); + + /* put reference from nfs4_preprocess_seqid_op */ + nfs4_put_stid(&stp->st_stid); +out: + return status; +} + +__be32 +nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_delegreturn *dr) +{ + struct nfs4_delegation *dp; + stateid_t *stateid = &dr->dr_stateid; + struct nfs4_stid *s; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + return status; + + status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); + if (status) + goto out; + dp = delegstateid(s); + status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); + if (status) + goto put_stateid; + + destroy_delegation(dp); +put_stateid: + nfs4_put_stid(&dp->dl_stid); +out: + return status; +} + + +#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) + +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end: NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + WARN_ON_ONCE(!len); + end = start + len; + return end > start ? end - 1: NFS4_MAX_UINT64; +} + +/* + * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that + * we can't properly handle lock requests that go beyond the (2^63 - 1)-th + * byte, because of sign extension problems. Since NFSv4 calls for 64-bit + * locking, this prevents us from being completely protocol-compliant. The + * real solution to this problem is to start using unsigned file offsets in + * the VFS, but this is a very deep change! + */ +static inline void +nfs4_transform_lock_offset(struct file_lock *lock) +{ + if (lock->fl_start < 0) + lock->fl_start = OFFSET_MAX; + if (lock->fl_end < 0) + lock->fl_end = OFFSET_MAX; +} + +static fl_owner_t +nfsd4_fl_get_owner(fl_owner_t owner) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; + + nfs4_get_stateowner(&lo->lo_owner); + return owner; +} + +static void +nfsd4_fl_put_owner(fl_owner_t owner) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; + + if (lo) + nfs4_put_stateowner(&lo->lo_owner); +} + +static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_get_owner = nfsd4_fl_get_owner, + .lm_put_owner = nfsd4_fl_put_owner, +}; + +static inline void +nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) +{ + struct nfs4_lockowner *lo; + + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + lo = (struct nfs4_lockowner *) fl->fl_owner; + deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data, + lo->lo_owner.so_owner.len, GFP_KERNEL); + if (!deny->ld_owner.data) + /* We just don't care that much */ + goto nevermind; + deny->ld_owner.len = lo->lo_owner.so_owner.len; + deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; + } else { +nevermind: + deny->ld_owner.len = 0; + deny->ld_owner.data = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; + } + deny->ld_start = fl->fl_start; + deny->ld_length = NFS4_MAX_UINT64; + if (fl->fl_end != NFS4_MAX_UINT64) + deny->ld_length = fl->fl_end - fl->fl_start + 1; + deny->ld_type = NFS4_READ_LT; + if (fl->fl_type != F_RDLCK) + deny->ld_type = NFS4_WRITE_LT; +} + +static struct nfs4_lockowner * +find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner, + struct nfs4_client *clp) +{ + unsigned int strhashval = ownerstr_hashval(owner); + struct nfs4_stateowner *so; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval], + so_strhash) { + if (so->so_is_open_owner) + continue; + if (same_owner_str(so, owner)) + return lockowner(nfs4_get_stateowner(so)); + } + return NULL; +} + +static struct nfs4_lockowner * +find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner, + struct nfs4_client *clp) +{ + struct nfs4_lockowner *lo; + + spin_lock(&clp->cl_lock); + lo = find_lockowner_str_locked(clid, owner, clp); + spin_unlock(&clp->cl_lock); + return lo; +} + +static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop) +{ + unhash_lockowner_locked(lockowner(sop)); +} + +static void nfs4_free_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_lockowner *lo = lockowner(sop); + + kmem_cache_free(lockowner_slab, lo); +} + +static const struct nfs4_stateowner_operations lockowner_ops = { + .so_unhash = nfs4_unhash_lockowner, + .so_free = nfs4_free_lockowner, +}; + +/* + * Alloc a lock owner structure. + * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has + * occurred. + * + * strhashval = ownerstr_hashval + */ +static struct nfs4_lockowner * +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, + struct nfs4_ol_stateid *open_stp, + struct nfsd4_lock *lock) +{ + struct nfs4_lockowner *lo, *ret; + + lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); + if (!lo) + return NULL; + INIT_LIST_HEAD(&lo->lo_owner.so_stateids); + lo->lo_owner.so_is_open_owner = 0; + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; + lo->lo_owner.so_ops = &lockowner_ops; + spin_lock(&clp->cl_lock); + ret = find_lockowner_str_locked(&clp->cl_clientid, + &lock->lk_new_owner, clp); + if (ret == NULL) { + list_add(&lo->lo_owner.so_strhash, + &clp->cl_ownerstr_hashtbl[strhashval]); + ret = lo; + } else + nfs4_free_lockowner(&lo->lo_owner); + spin_unlock(&clp->cl_lock); + return ret; +} + +static void +init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, + struct nfs4_file *fp, struct inode *inode, + struct nfs4_ol_stateid *open_stp) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + atomic_inc(&stp->st_stid.sc_count); + stp->st_stid.sc_type = NFS4_LOCK_STID; + stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); + get_nfs4_file(fp); + stp->st_stid.sc_file = fp; + stp->st_stid.sc_free = nfs4_free_lock_stateid; + stp->st_access_bmap = 0; + stp->st_deny_bmap = open_stp->st_deny_bmap; + stp->st_openstp = open_stp; + list_add(&stp->st_locks, &open_stp->st_locks); + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + spin_lock(&fp->fi_lock); + list_add(&stp->st_perfile, &fp->fi_stateids); + spin_unlock(&fp->fi_lock); +} + +static struct nfs4_ol_stateid * +find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *lst; + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { + if (lst->st_stid.sc_file == fp) { + atomic_inc(&lst->st_stid.sc_count); + return lst; + } + } + return NULL; +} + +static struct nfs4_ol_stateid * +find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, + struct inode *inode, struct nfs4_ol_stateid *ost, + bool *new) +{ + struct nfs4_stid *ns = NULL; + struct nfs4_ol_stateid *lst; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *clp = oo->oo_owner.so_client; + + spin_lock(&clp->cl_lock); + lst = find_lock_stateid(lo, fi); + if (lst == NULL) { + spin_unlock(&clp->cl_lock); + ns = nfs4_alloc_stid(clp, stateid_slab); + if (ns == NULL) + return NULL; + + spin_lock(&clp->cl_lock); + lst = find_lock_stateid(lo, fi); + if (likely(!lst)) { + lst = openlockstateid(ns); + init_lock_stateid(lst, lo, fi, inode, ost); + ns = NULL; + *new = true; + } + } + spin_unlock(&clp->cl_lock); + if (ns) + nfs4_put_stid(ns); + return lst; +} + +static int +check_lock_length(u64 offset, u64 length) +{ + return ((length == 0) || ((length != NFS4_MAX_UINT64) && + LOFF_OVERFLOW(offset, length))); +} + +static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) +{ + struct nfs4_file *fp = lock_stp->st_stid.sc_file; + + lockdep_assert_held(&fp->fi_lock); + + if (test_access(access, lock_stp)) + return; + __nfs4_file_get_access(fp, access); + set_access(access, lock_stp); +} + +static __be32 +lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, + struct nfs4_ol_stateid *ost, + struct nfsd4_lock *lock, + struct nfs4_ol_stateid **lst, bool *new) +{ + __be32 status; + struct nfs4_file *fi = ost->st_stid.sc_file; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *cl = oo->oo_owner.so_client; + struct inode *inode = d_inode(cstate->current_fh.fh_dentry); + struct nfs4_lockowner *lo; + unsigned int strhashval; + + lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl); + if (!lo) { + strhashval = ownerstr_hashval(&lock->v.new.owner); + lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); + if (lo == NULL) + return nfserr_jukebox; + } else { + /* with an existing lockowner, seqids must be the same */ + status = nfserr_bad_seqid; + if (!cstate->minorversion && + lock->lk_new_lock_seqid != lo->lo_owner.so_seqid) + goto out; + } + + *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); + if (*lst == NULL) { + status = nfserr_jukebox; + goto out; + } + status = nfs_ok; +out: + nfs4_put_stateowner(&lo->lo_owner); + return status; +} + +/* + * LOCK operation + */ +__be32 +nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lock *lock) +{ + struct nfs4_openowner *open_sop = NULL; + struct nfs4_lockowner *lock_sop = NULL; + struct nfs4_ol_stateid *lock_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; + struct nfs4_file *fp; + struct file *filp = NULL; + struct file_lock *file_lock = NULL; + struct file_lock *conflock = NULL; + __be32 status = 0; + int lkflg; + int err; + bool new = false; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", + (long long) lock->lk_offset, + (long long) lock->lk_length); + + if (check_lock_length(lock->lk_offset, lock->lk_length)) + return nfserr_inval; + + if ((status = fh_verify(rqstp, &cstate->current_fh, + S_IFREG, NFSD_MAY_LOCK))) { + dprintk("NFSD: nfsd4_lock: permission denied!\n"); + return status; + } + + if (lock->lk_is_new) { + if (nfsd4_has_session(cstate)) + /* See rfc 5661 18.10.3: given clientid is ignored: */ + memcpy(&lock->v.new.clientid, + &cstate->session->se_client->cl_clientid, + sizeof(clientid_t)); + + status = nfserr_stale_clientid; + if (STALE_CLIENTID(&lock->lk_new_clientid, nn)) + goto out; + + /* validate and update open stateid and open seqid */ + status = nfs4_preprocess_confirmed_seqid_op(cstate, + lock->lk_new_open_seqid, + &lock->lk_new_open_stateid, + &open_stp, nn); + if (status) + goto out; + open_sop = openowner(open_stp->st_stateowner); + status = nfserr_bad_stateid; + if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, + &lock->v.new.clientid)) + goto out; + status = lookup_or_create_lock_state(cstate, open_stp, lock, + &lock_stp, &new); + } else { + status = nfs4_preprocess_seqid_op(cstate, + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + NFS4_LOCK_STID, &lock_stp, nn); + } + if (status) + goto out; + lock_sop = lockowner(lock_stp->st_stateowner); + + lkflg = setlkflg(lock->lk_type); + status = nfs4_check_openmode(lock_stp, lkflg); + if (status) + goto out; + + status = nfserr_grace; + if (locks_in_grace(net) && !lock->lk_reclaim) + goto out; + status = nfserr_no_grace; + if (!locks_in_grace(net) && lock->lk_reclaim) + goto out; + + file_lock = locks_alloc_lock(); + if (!file_lock) { + dprintk("NFSD: %s: unable to allocate lock!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + fp = lock_stp->st_stid.sc_file; + switch (lock->lk_type) { + case NFS4_READ_LT: + case NFS4_READW_LT: + spin_lock(&fp->fi_lock); + filp = find_readable_file_locked(fp); + if (filp) + get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); + spin_unlock(&fp->fi_lock); + file_lock->fl_type = F_RDLCK; + break; + case NFS4_WRITE_LT: + case NFS4_WRITEW_LT: + spin_lock(&fp->fi_lock); + filp = find_writeable_file_locked(fp); + if (filp) + get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); + spin_unlock(&fp->fi_lock); + file_lock->fl_type = F_WRLCK; + break; + default: + status = nfserr_inval; + goto out; + } + if (!filp) { + status = nfserr_openmode; + goto out; + } + + file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); + file_lock->fl_pid = current->tgid; + file_lock->fl_file = filp; + file_lock->fl_flags = FL_POSIX; + file_lock->fl_lmops = &nfsd_posix_mng_ops; + file_lock->fl_start = lock->lk_offset; + file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); + nfs4_transform_lock_offset(file_lock); + + conflock = locks_alloc_lock(); + if (!conflock) { + dprintk("NFSD: %s: unable to allocate lock!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); + switch (-err) { + case 0: /* success! */ + update_stateid(&lock_stp->st_stid.sc_stateid); + memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, + sizeof(stateid_t)); + status = 0; + break; + case (EAGAIN): /* conflock holds conflicting lock */ + status = nfserr_denied; + dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); + nfs4_set_lock_denied(conflock, &lock->lk_denied); + break; + case (EDEADLK): + status = nfserr_deadlock; + break; + default: + dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); + status = nfserrno(err); + break; + } +out: + if (filp) + fput(filp); + if (lock_stp) { + /* Bump seqid manually if the 4.0 replay owner is openowner */ + if (cstate->replay_owner && + cstate->replay_owner != &lock_sop->lo_owner && + seqid_mutating_err(ntohl(status))) + lock_sop->lo_owner.so_seqid++; + + /* + * If this is a new, never-before-used stateid, and we are + * returning an error, then just go ahead and release it. + */ + if (status && new) + release_lock_stateid(lock_stp); + + nfs4_put_stid(&lock_stp->st_stid); + } + if (open_stp) + nfs4_put_stid(&open_stp->st_stid); + nfsd4_bump_seqid(cstate, status); + if (file_lock) + locks_free_lock(file_lock); + if (conflock) + locks_free_lock(conflock); + return status; +} + +/* + * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, + * so we do a temporary open here just to get an open file to pass to + * vfs_test_lock. (Arguably perhaps test_lock should be done with an + * inode operation.) + */ +static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) +{ + struct file *file; + __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (!err) { + err = nfserrno(vfs_test_lock(file, lock)); + nfsd_close(file); + } + return err; +} + +/* + * LOCKT operation + */ +__be32 +nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lockt *lockt) +{ + struct file_lock *file_lock = NULL; + struct nfs4_lockowner *lo = NULL; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if (locks_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + if (check_lock_length(lockt->lt_offset, lockt->lt_length)) + return nfserr_inval; + + if (!nfsd4_has_session(cstate)) { + status = lookup_clientid(&lockt->lt_clientid, cstate, nn); + if (status) + goto out; + } + + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + goto out; + + file_lock = locks_alloc_lock(); + if (!file_lock) { + dprintk("NFSD: %s: unable to allocate lock!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + switch (lockt->lt_type) { + case NFS4_READ_LT: + case NFS4_READW_LT: + file_lock->fl_type = F_RDLCK; + break; + case NFS4_WRITE_LT: + case NFS4_WRITEW_LT: + file_lock->fl_type = F_WRLCK; + break; + default: + dprintk("NFSD: nfs4_lockt: bad lock type!\n"); + status = nfserr_inval; + goto out; + } + + lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner, + cstate->clp); + if (lo) + file_lock->fl_owner = (fl_owner_t)lo; + file_lock->fl_pid = current->tgid; + file_lock->fl_flags = FL_POSIX; + + file_lock->fl_start = lockt->lt_offset; + file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); + + nfs4_transform_lock_offset(file_lock); + + status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock); + if (status) + goto out; + + if (file_lock->fl_type != F_UNLCK) { + status = nfserr_denied; + nfs4_set_lock_denied(file_lock, &lockt->lt_denied); + } +out: + if (lo) + nfs4_put_stateowner(&lo->lo_owner); + if (file_lock) + locks_free_lock(file_lock); + return status; +} + +__be32 +nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_locku *locku) +{ + struct nfs4_ol_stateid *stp; + struct file *filp = NULL; + struct file_lock *file_lock = NULL; + __be32 status; + int err; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", + (long long) locku->lu_offset, + (long long) locku->lu_length); + + if (check_lock_length(locku->lu_offset, locku->lu_length)) + return nfserr_inval; + + status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, + &locku->lu_stateid, NFS4_LOCK_STID, + &stp, nn); + if (status) + goto out; + filp = find_any_file(stp->st_stid.sc_file); + if (!filp) { + status = nfserr_lock_range; + goto put_stateid; + } + file_lock = locks_alloc_lock(); + if (!file_lock) { + dprintk("NFSD: %s: unable to allocate lock!\n", __func__); + status = nfserr_jukebox; + goto fput; + } + + file_lock->fl_type = F_UNLCK; + file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); + file_lock->fl_pid = current->tgid; + file_lock->fl_file = filp; + file_lock->fl_flags = FL_POSIX; + file_lock->fl_lmops = &nfsd_posix_mng_ops; + file_lock->fl_start = locku->lu_offset; + + file_lock->fl_end = last_byte_offset(locku->lu_offset, + locku->lu_length); + nfs4_transform_lock_offset(file_lock); + + err = vfs_lock_file(filp, F_SETLK, file_lock, NULL); + if (err) { + dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); + goto out_nfserr; + } + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); +fput: + fput(filp); +put_stateid: + nfs4_put_stid(&stp->st_stid); +out: + nfsd4_bump_seqid(cstate, status); + if (file_lock) + locks_free_lock(file_lock); + return status; + +out_nfserr: + status = nfserrno(err); + goto fput; +} + +/* + * returns + * true: locks held by lockowner + * false: no locks held by lockowner + */ +static bool +check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) +{ + struct file_lock *fl; + int status = false; + struct file *filp = find_any_file(fp); + struct inode *inode; + struct file_lock_context *flctx; + + if (!filp) { + /* Any valid lock stateid should have some sort of access */ + WARN_ON_ONCE(1); + return status; + } + + inode = file_inode(filp); + flctx = inode->i_flctx; + + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner == (fl_owner_t)lowner) { + status = true; + break; + } + } + spin_unlock(&flctx->flc_lock); + } + fput(filp); + return status; +} + +__be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_release_lockowner *rlockowner) +{ + clientid_t *clid = &rlockowner->rl_clientid; + struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo = NULL; + struct nfs4_ol_stateid *stp; + struct xdr_netobj *owner = &rlockowner->rl_owner; + unsigned int hashval = ownerstr_hashval(owner); + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_client *clp; + + dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", + clid->cl_boot, clid->cl_id); + + status = lookup_clientid(clid, cstate, nn); + if (status) + return status; + + clp = cstate->clp; + /* Find the matching lock stateowner */ + spin_lock(&clp->cl_lock); + list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval], + so_strhash) { + + if (sop->so_is_open_owner || !same_owner_str(sop, owner)) + continue; + + /* see if there are still any locks associated with it */ + lo = lockowner(sop); + list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + status = nfserr_locks_held; + spin_unlock(&clp->cl_lock); + return status; + } + } + + nfs4_get_stateowner(sop); + break; + } + spin_unlock(&clp->cl_lock); + if (lo) + release_lockowner(lo); + return status; +} + +static inline struct nfs4_client_reclaim * +alloc_reclaim(void) +{ + return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); +} + +bool +nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn) +{ + struct nfs4_client_reclaim *crp; + + crp = nfsd4_find_reclaim_client(name, nn); + return (crp && crp->cr_clp); +} + +/* + * failure => all reset bets are off, nfserr_no_grace... + */ +struct nfs4_client_reclaim * +nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn) +{ + unsigned int strhashval; + struct nfs4_client_reclaim *crp; + + dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name); + crp = alloc_reclaim(); + if (crp) { + strhashval = clientstr_hashval(name); + INIT_LIST_HEAD(&crp->cr_strhash); + list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); + memcpy(crp->cr_recdir, name, HEXDIR_LEN); + crp->cr_clp = NULL; + nn->reclaim_str_hashtbl_size++; + } + return crp; +} + +void +nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) +{ + list_del(&crp->cr_strhash); + kfree(crp); + nn->reclaim_str_hashtbl_size--; +} + +void +nfs4_release_reclaim(struct nfsd_net *nn) +{ + struct nfs4_client_reclaim *crp = NULL; + int i; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->reclaim_str_hashtbl[i])) { + crp = list_entry(nn->reclaim_str_hashtbl[i].next, + struct nfs4_client_reclaim, cr_strhash); + nfs4_remove_reclaim_record(crp, nn); + } + } + WARN_ON_ONCE(nn->reclaim_str_hashtbl_size); +} + +/* + * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ +struct nfs4_client_reclaim * +nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn) +{ + unsigned int strhashval; + struct nfs4_client_reclaim *crp = NULL; + + dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir); + + strhashval = clientstr_hashval(recdir); + list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { + if (same_name(crp->cr_recdir, recdir)) { + return crp; + } + } + return NULL; +} + +/* +* Called from OPEN. Look for clientid in reclaim list. +*/ +__be32 +nfs4_check_open_reclaim(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) +{ + __be32 status; + + /* find clientid in conf_id_hashtbl */ + status = lookup_clientid(clid, cstate, nn); + if (status) + return nfserr_reclaim_bad; + + if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags)) + return nfserr_no_grace; + + if (nfsd4_client_record_check(cstate->clp)) + return nfserr_reclaim_bad; + + return nfs_ok; +} + +#ifdef CONFIG_NFSD_FAULT_INJECTION +static inline void +put_client(struct nfs4_client *clp) +{ + atomic_dec(&clp->cl_refcount); +} + +static struct nfs4_client * +nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) +{ + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return NULL; + + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + if (memcmp(&clp->cl_addr, addr, addr_size) == 0) + return clp; + } + return NULL; +} + +u64 +nfsd_inject_print_clients(void) +{ + struct nfs4_client *clp; + u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + char buf[INET6_ADDRSTRLEN]; + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); + pr_info("NFS Client: %s\n", buf); + ++count; + } + spin_unlock(&nn->client_lock); + + return count; +} + +u64 +nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) { + if (mark_client_expired_locked(clp) == nfs_ok) + ++count; + else + clp = NULL; + } + spin_unlock(&nn->client_lock); + + if (clp) + expire_client(clp); + + return count; +} + +u64 +nfsd_inject_forget_clients(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { + if (mark_client_expired_locked(clp) == nfs_ok) { + list_add(&clp->cl_lru, &reaplist); + if (max != 0 && ++count >= max) + break; + } + } + spin_unlock(&nn->client_lock); + + list_for_each_entry_safe(clp, next, &reaplist, cl_lru) + expire_client(clp); + + return count; +} + +static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, + const char *type) +{ + char buf[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); + printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); +} + +static void +nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst, + struct list_head *collect) +{ + struct nfs4_client *clp = lst->st_stid.sc_client; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!collect) + return; + + lockdep_assert_held(&nn->client_lock); + atomic_inc(&clp->cl_refcount); + list_add(&lst->st_locks, collect); +} + +static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, + struct list_head *collect, + void (*func)(struct nfs4_ol_stateid *)) +{ + struct nfs4_openowner *oop; + struct nfs4_ol_stateid *stp, *st_next; + struct nfs4_ol_stateid *lst, *lst_next; + u64 count = 0; + + spin_lock(&clp->cl_lock); + list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { + list_for_each_entry_safe(stp, st_next, + &oop->oo_owner.so_stateids, st_perstateowner) { + list_for_each_entry_safe(lst, lst_next, + &stp->st_locks, st_locks) { + if (func) { + func(lst); + nfsd_inject_add_lock_to_list(lst, + collect); + } + ++count; + /* + * Despite the fact that these functions deal + * with 64-bit integers for "count", we must + * ensure that it doesn't blow up the + * clp->cl_refcount. Throw a warning if we + * start to approach INT_MAX here. + */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) + goto out; + } + } + } +out: + spin_unlock(&clp->cl_lock); + + return count; +} + +static u64 +nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect, + u64 max) +{ + return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid); +} + +static u64 +nfsd_print_client_locks(struct nfs4_client *clp) +{ + u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL); + nfsd_print_count(clp, count, "locked files"); + return count; +} + +u64 +nfsd_inject_print_locks(void) +{ + struct nfs4_client *clp; + u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_locks(clp); + spin_unlock(&nn->client_lock); + + return count; +} + +static void +nfsd_reap_locks(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_ol_stateid *stp, *next; + + list_for_each_entry_safe(stp, next, reaplist, st_locks) { + list_del_init(&stp->st_locks); + clp = stp->st_stid.sc_client; + nfs4_put_stid(&stp->st_stid); + put_client(clp); + } +} + +u64 +nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size) +{ + unsigned int count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_collect_client_locks(clp, &reaplist, 0); + spin_unlock(&nn->client_lock); + nfsd_reap_locks(&reaplist); + return count; +} + +u64 +nfsd_inject_forget_locks(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + count += nfsd_collect_client_locks(clp, &reaplist, max - count); + if (max != 0 && count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_reap_locks(&reaplist); + return count; +} + +static u64 +nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max, + struct list_head *collect, + void (*func)(struct nfs4_openowner *)) +{ + struct nfs4_openowner *oop, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + u64 count = 0; + + lockdep_assert_held(&nn->client_lock); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { + if (func) { + func(oop); + if (collect) { + atomic_inc(&clp->cl_refcount); + list_add(&oop->oo_perclient, collect); + } + } + ++count; + /* + * Despite the fact that these functions deal with + * 64-bit integers for "count", we must ensure that + * it doesn't blow up the clp->cl_refcount. Throw a + * warning if we start to approach INT_MAX here. + */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) + break; + } + spin_unlock(&clp->cl_lock); + + return count; +} + +static u64 +nfsd_print_client_openowners(struct nfs4_client *clp) +{ + u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL); + + nfsd_print_count(clp, count, "openowners"); + return count; +} + +static u64 +nfsd_collect_client_openowners(struct nfs4_client *clp, + struct list_head *collect, u64 max) +{ + return nfsd_foreach_client_openowner(clp, max, collect, + unhash_openowner_locked); +} + +u64 +nfsd_inject_print_openowners(void) +{ + struct nfs4_client *clp; + u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_openowners(clp); + spin_unlock(&nn->client_lock); + + return count; +} + +static void +nfsd_reap_openowners(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_openowner *oop, *next; + + list_for_each_entry_safe(oop, next, reaplist, oo_perclient) { + list_del_init(&oop->oo_perclient); + clp = oop->oo_owner.so_client; + release_openowner(oop); + put_client(clp); + } +} + +u64 +nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr, + size_t addr_size) +{ + unsigned int count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_collect_client_openowners(clp, &reaplist, 0); + spin_unlock(&nn->client_lock); + nfsd_reap_openowners(&reaplist); + return count; +} + +u64 +nfsd_inject_forget_openowners(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + count += nfsd_collect_client_openowners(clp, &reaplist, + max - count); + if (max != 0 && count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_reap_openowners(&reaplist); + return count; +} + +static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, + struct list_head *victims) +{ + struct nfs4_delegation *dp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + u64 count = 0; + + lockdep_assert_held(&nn->client_lock); + + spin_lock(&state_lock); + list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { + if (victims) { + /* + * It's not safe to mess with delegations that have a + * non-zero dl_time. They might have already been broken + * and could be processed by the laundromat outside of + * the state_lock. Just leave them be. + */ + if (dp->dl_time != 0) + continue; + + atomic_inc(&clp->cl_refcount); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, victims); + } + ++count; + /* + * Despite the fact that these functions deal with + * 64-bit integers for "count", we must ensure that + * it doesn't blow up the clp->cl_refcount. Throw a + * warning if we start to approach INT_MAX here. + */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) + break; + } + spin_unlock(&state_lock); + return count; +} + +static u64 +nfsd_print_client_delegations(struct nfs4_client *clp) +{ + u64 count = nfsd_find_all_delegations(clp, 0, NULL); + + nfsd_print_count(clp, count, "delegations"); + return count; +} + +u64 +nfsd_inject_print_delegations(void) +{ + struct nfs4_client *clp; + u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_delegations(clp); + spin_unlock(&nn->client_lock); + + return count; +} + +static void +nfsd_forget_delegations(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_delegation *dp, *next; + + list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { + list_del_init(&dp->dl_recall_lru); + clp = dp->dl_stid.sc_client; + revoke_delegation(dp); + put_client(clp); + } +} + +u64 +nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr, + size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_find_all_delegations(clp, 0, &reaplist); + spin_unlock(&nn->client_lock); + + nfsd_forget_delegations(&reaplist); + return count; +} + +u64 +nfsd_inject_forget_delegations(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + count += nfsd_find_all_delegations(clp, max - count, &reaplist); + if (max != 0 && count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_forget_delegations(&reaplist); + return count; +} + +static void +nfsd_recall_delegations(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_delegation *dp, *next; + + list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { + list_del_init(&dp->dl_recall_lru); + clp = dp->dl_stid.sc_client; + /* + * We skipped all entries that had a zero dl_time before, + * so we can now reset the dl_time back to 0. If a delegation + * break comes in now, then it won't make any difference since + * we're recalling it either way. + */ + spin_lock(&state_lock); + dp->dl_time = 0; + spin_unlock(&state_lock); + nfsd_break_one_deleg(dp); + put_client(clp); + } +} + +u64 +nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr, + size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_find_all_delegations(clp, 0, &reaplist); + spin_unlock(&nn->client_lock); + + nfsd_recall_delegations(&reaplist); + return count; +} + +u64 +nfsd_inject_recall_delegations(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { + count += nfsd_find_all_delegations(clp, max - count, &reaplist); + if (max != 0 && ++count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_recall_delegations(&reaplist); + return count; +} +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + +/* + * Since the lifetime of a delegation isn't limited to that of an open, a + * client may quite reasonably hang on to a delegation as long as it has + * the inode cached. This becomes an obvious problem the first time a + * client's inode cache approaches the size of the server's total memory. + * + * For now we avoid this problem by imposing a hard limit on the number + * of delegations, which varies according to the server's memory size. + */ +static void +set_max_delegations(void) +{ + /* + * Allow at most 4 delegations per megabyte of RAM. Quick + * estimates suggest that in the worst case (where every delegation + * is for a different inode), a delegation could take about 1.5K, + * giving a worst case usage of about 6% of memory. + */ + max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); +} + +static int nfs4_state_create_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int i; + + nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) * + CLIENT_HASH_SIZE, GFP_KERNEL); + if (!nn->conf_id_hashtbl) + goto err; + nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) * + CLIENT_HASH_SIZE, GFP_KERNEL); + if (!nn->unconf_id_hashtbl) + goto err_unconf_id; + nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) * + SESSION_HASH_SIZE, GFP_KERNEL); + if (!nn->sessionid_hashtbl) + goto err_sessionid; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); + INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); + } + for (i = 0; i < SESSION_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); + nn->conf_name_tree = RB_ROOT; + nn->unconf_name_tree = RB_ROOT; + INIT_LIST_HEAD(&nn->client_lru); + INIT_LIST_HEAD(&nn->close_lru); + INIT_LIST_HEAD(&nn->del_recall_lru); + spin_lock_init(&nn->client_lock); + + INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); + get_net(net); + + return 0; + +err_sessionid: + kfree(nn->unconf_id_hashtbl); +err_unconf_id: + kfree(nn->conf_id_hashtbl); +err: + return -ENOMEM; +} + +static void +nfs4_state_destroy_net(struct net *net) +{ + int i; + struct nfs4_client *clp = NULL; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->conf_id_hashtbl[i])) { + clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + destroy_client(clp); + } + } + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->unconf_id_hashtbl[i])) { + clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + destroy_client(clp); + } + } + + kfree(nn->sessionid_hashtbl); + kfree(nn->unconf_id_hashtbl); + kfree(nn->conf_id_hashtbl); + put_net(net); +} + +int +nfs4_state_start_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int ret; + + ret = nfs4_state_create_net(net); + if (ret) + return ret; + nn->boot_time = get_seconds(); + nn->grace_ended = false; + locks_start_grace(net, &nn->nfsd4_manager); + nfsd4_client_tracking_init(net); + printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", + nn->nfsd4_grace, net); + queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); + return 0; +} + +/* initialization to perform when the nfsd service is started: */ + +int +nfs4_state_start(void) +{ + int ret; + + ret = set_callback_cred(); + if (ret) + return -ENOMEM; + laundry_wq = create_singlethread_workqueue("nfsd4"); + if (laundry_wq == NULL) { + ret = -ENOMEM; + goto out_recovery; + } + ret = nfsd4_create_callback_queue(); + if (ret) + goto out_free_laundry; + + set_max_delegations(); + + return 0; + +out_free_laundry: + destroy_workqueue(laundry_wq); +out_recovery: + return ret; +} + +void +nfs4_state_shutdown_net(struct net *net) +{ + struct nfs4_delegation *dp = NULL; + struct list_head *pos, *next, reaplist; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + cancel_delayed_work_sync(&nn->laundromat_work); + locks_end_grace(&nn->nfsd4_manager); + + INIT_LIST_HEAD(&reaplist); + spin_lock(&state_lock); + list_for_each_safe(pos, next, &nn->del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&state_lock); + list_for_each_safe(pos, next, &reaplist) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); + nfs4_put_stid(&dp->dl_stid); + } + + nfsd4_client_tracking_exit(net); + nfs4_state_destroy_net(net); +} + +void +nfs4_state_shutdown(void) +{ + destroy_workqueue(laundry_wq); + nfsd4_destroy_callback_queue(); +} + +static void +get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) +{ + if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG) && CURRENT_STATEID(stateid)) + memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t)); +} + +static void +put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) +{ + if (cstate->minorversion) { + memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t)); + SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); + } +} + +void +clear_current_stateid(struct nfsd4_compound_state *cstate) +{ + CLEAR_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); +} + +/* + * functions to set current state id + */ +void +nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *odp) +{ + put_stateid(cstate, &odp->od_stateid); +} + +void +nfsd4_set_openstateid(struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +{ + put_stateid(cstate, &open->op_stateid); +} + +void +nfsd4_set_closestateid(struct nfsd4_compound_state *cstate, struct nfsd4_close *close) +{ + put_stateid(cstate, &close->cl_stateid); +} + +void +nfsd4_set_lockstateid(struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock) +{ + put_stateid(cstate, &lock->lk_resp_stateid); +} + +/* + * functions to consume current state id + */ + +void +nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *odp) +{ + get_stateid(cstate, &odp->od_stateid); +} + +void +nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *cstate, struct nfsd4_delegreturn *drp) +{ + get_stateid(cstate, &drp->dr_stateid); +} + +void +nfsd4_get_freestateid(struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *fsp) +{ + get_stateid(cstate, &fsp->fr_stateid); +} + +void +nfsd4_get_setattrstateid(struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) +{ + get_stateid(cstate, &setattr->sa_stateid); +} + +void +nfsd4_get_closestateid(struct nfsd4_compound_state *cstate, struct nfsd4_close *close) +{ + get_stateid(cstate, &close->cl_stateid); +} + +void +nfsd4_get_lockustateid(struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku) +{ + get_stateid(cstate, &locku->lu_stateid); +} + +void +nfsd4_get_readstateid(struct nfsd4_compound_state *cstate, struct nfsd4_read *read) +{ + get_stateid(cstate, &read->rd_stateid); +} + +void +nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, struct nfsd4_write *write) +{ + get_stateid(cstate, &write->wr_stateid); +} diff --git a/kernel/fs/nfsd/nfs4xdr.c b/kernel/fs/nfsd/nfs4xdr.c new file mode 100644 index 000000000..158badf94 --- /dev/null +++ b/kernel/fs/nfsd/nfs4xdr.c @@ -0,0 +1,4453 @@ +/* + * Server-side XDR for NFSv4 + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "idmap.h" +#include "acl.h" +#include "xdr4.h" +#include "vfs.h" +#include "state.h" +#include "cache.h" +#include "netns.h" +#include "pnfs.h" + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#include +#endif + + +#define NFSDDBG_FACILITY NFSDDBG_XDR + +/* + * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing + * directory in order to indicate to the client that a filesystem boundary is present + * We use a fixed fsid for a referral + */ +#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL +#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL + +static __be32 +check_filename(char *str, int len) +{ + int i; + + if (len == 0) + return nfserr_inval; + if (isdotent(str, len)) + return nfserr_badname; + for (i = 0; i < len; i++) + if (str[i] == '/') + return nfserr_badname; + return 0; +} + +#define DECODE_HEAD \ + __be32 *p; \ + __be32 status +#define DECODE_TAIL \ + status = 0; \ +out: \ + return status; \ +xdr_error: \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + status = nfserr_bad_xdr; \ + goto out + +#define READMEM(x,nbytes) do { \ + x = (char *)p; \ + p += XDR_QUADLEN(nbytes); \ +} while (0) +#define SAVEMEM(x,nbytes) do { \ + if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ + savemem(argp, p, nbytes) : \ + (char *)p)) { \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + goto xdr_error; \ + } \ + p += XDR_QUADLEN(nbytes); \ +} while (0) +#define COPYMEM(x,nbytes) do { \ + memcpy((x), p, nbytes); \ + p += XDR_QUADLEN(nbytes); \ +} while (0) + +/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */ +#define READ_BUF(nbytes) do { \ + if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \ + p = argp->p; \ + argp->p += XDR_QUADLEN(nbytes); \ + } else if (!(p = read_buf(argp, nbytes))) { \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + goto xdr_error; \ + } \ +} while (0) + +static void next_decode_page(struct nfsd4_compoundargs *argp) +{ + argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; + if (argp->pagelen < PAGE_SIZE) { + argp->end = argp->p + (argp->pagelen>>2); + argp->pagelen = 0; + } else { + argp->end = argp->p + (PAGE_SIZE>>2); + argp->pagelen -= PAGE_SIZE; + } +} + +static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) +{ + /* We want more bytes than seem to be available. + * Maybe we need a new page, maybe we have just run out + */ + unsigned int avail = (char *)argp->end - (char *)argp->p; + __be32 *p; + if (avail + argp->pagelen < nbytes) + return NULL; + if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ + return NULL; + /* ok, we can do it with the current plus the next page */ + if (nbytes <= sizeof(argp->tmp)) + p = argp->tmp; + else { + kfree(argp->tmpp); + p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL); + if (!p) + return NULL; + + } + /* + * The following memcpy is safe because read_buf is always + * called with nbytes > avail, and the two cases above both + * guarantee p points to at least nbytes bytes. + */ + memcpy(p, argp->p, avail); + next_decode_page(argp); + memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); + argp->p += XDR_QUADLEN(nbytes - avail); + return p; +} + +static int zero_clientid(clientid_t *clid) +{ + return (clid->cl_boot == 0) && (clid->cl_id == 0); +} + +/** + * svcxdr_tmpalloc - allocate memory to be freed after compound processing + * @argp: NFSv4 compound argument structure + * @p: pointer to be freed (with kfree()) + * + * Marks @p to be freed when processing the compound operation + * described in @argp finishes. + */ +static void * +svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) +{ + struct svcxdr_tmpbuf *tb; + + tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL); + if (!tb) + return NULL; + tb->next = argp->to_free; + argp->to_free = tb; + return tb->buf; +} + +/* + * For xdr strings that need to be passed to other kernel api's + * as null-terminated strings. + * + * Note null-terminating in place usually isn't safe since the + * buffer might end on a page boundary. + */ +static char * +svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) +{ + char *p = svcxdr_tmpalloc(argp, len + 1); + + if (!p) + return NULL; + memcpy(p, buf, len); + p[len] = '\0'; + return p; +} + +/** + * savemem - duplicate a chunk of memory for later processing + * @argp: NFSv4 compound argument structure to be freed with + * @p: pointer to be duplicated + * @nbytes: length to be duplicated + * + * Returns a pointer to a copy of @nbytes bytes of memory at @p + * that are preserved until processing of the NFSv4 compound + * operation described by @argp finishes. + */ +static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) +{ + void *ret; + + ret = svcxdr_tmpalloc(argp, nbytes); + if (!ret) + return NULL; + memcpy(ret, p, nbytes); + return ret; +} + +/* + * We require the high 32 bits of 'seconds' to be 0, and + * we ignore all 32 bits of 'nseconds'. + */ +static __be32 +nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv) +{ + DECODE_HEAD; + u64 sec; + + READ_BUF(12); + p = xdr_decode_hyper(p, &sec); + tv->tv_sec = sec; + tv->tv_nsec = be32_to_cpup(p++); + if (tv->tv_nsec >= (u32)1000000000) + return nfserr_inval; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) +{ + u32 bmlen; + DECODE_HEAD; + + bmval[0] = 0; + bmval[1] = 0; + bmval[2] = 0; + + READ_BUF(4); + bmlen = be32_to_cpup(p++); + if (bmlen > 1000) + goto xdr_error; + + READ_BUF(bmlen << 2); + if (bmlen > 0) + bmval[0] = be32_to_cpup(p++); + if (bmlen > 1) + bmval[1] = be32_to_cpup(p++); + if (bmlen > 2) + bmval[2] = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, + struct iattr *iattr, struct nfs4_acl **acl, + struct xdr_netobj *label) +{ + int expected_len, len = 0; + u32 dummy32; + char *buf; + + DECODE_HEAD; + iattr->ia_valid = 0; + if ((status = nfsd4_decode_bitmap(argp, bmval))) + return status; + + READ_BUF(4); + expected_len = be32_to_cpup(p++); + + if (bmval[0] & FATTR4_WORD0_SIZE) { + READ_BUF(8); + len += 8; + p = xdr_decode_hyper(p, &iattr->ia_size); + iattr->ia_valid |= ATTR_SIZE; + } + if (bmval[0] & FATTR4_WORD0_ACL) { + u32 nace; + struct nfs4_ace *ace; + + READ_BUF(4); len += 4; + nace = be32_to_cpup(p++); + + if (nace > NFS4_ACL_MAX) + return nfserr_fbig; + + *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); + if (*acl == NULL) + return nfserr_jukebox; + + (*acl)->naces = nace; + for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { + READ_BUF(16); len += 16; + ace->type = be32_to_cpup(p++); + ace->flag = be32_to_cpup(p++); + ace->access_mask = be32_to_cpup(p++); + dummy32 = be32_to_cpup(p++); + READ_BUF(dummy32); + len += XDR_QUADLEN(dummy32) << 2; + READMEM(buf, dummy32); + ace->whotype = nfs4_acl_get_whotype(buf, dummy32); + status = nfs_ok; + if (ace->whotype != NFS4_ACL_WHO_NAMED) + ; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + status = nfsd_map_name_to_gid(argp->rqstp, + buf, dummy32, &ace->who_gid); + else + status = nfsd_map_name_to_uid(argp->rqstp, + buf, dummy32, &ace->who_uid); + if (status) + return status; + } + } else + *acl = NULL; + if (bmval[1] & FATTR4_WORD1_MODE) { + READ_BUF(4); + len += 4; + iattr->ia_mode = be32_to_cpup(p++); + iattr->ia_mode &= (S_IFMT | S_IALLUGO); + iattr->ia_valid |= ATTR_MODE; + } + if (bmval[1] & FATTR4_WORD1_OWNER) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + return status; + iattr->ia_valid |= ATTR_UID; + } + if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + return status; + iattr->ia_valid |= ATTR_GID; + } + if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + switch (dummy32) { + case NFS4_SET_TO_CLIENT_TIME: + len += 12; + status = nfsd4_decode_time(argp, &iattr->ia_atime); + if (status) + return status; + iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); + break; + case NFS4_SET_TO_SERVER_TIME: + iattr->ia_valid |= ATTR_ATIME; + break; + default: + goto xdr_error; + } + } + if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + switch (dummy32) { + case NFS4_SET_TO_CLIENT_TIME: + len += 12; + status = nfsd4_decode_time(argp, &iattr->ia_mtime); + if (status) + return status; + iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); + break; + case NFS4_SET_TO_SERVER_TIME: + iattr->ia_valid |= ATTR_MTIME; + break; + default: + goto xdr_error; + } + } + + label->len = 0; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */ + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */ + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + READ_BUF(dummy32); + if (dummy32 > NFS4_MAXLABELLEN) + return nfserr_badlabel; + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + label->len = dummy32; + label->data = svcxdr_dupstr(argp, buf, dummy32); + if (!label->data) + return nfserr_jukebox; + } +#endif + + if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 + || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 + || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) + READ_BUF(expected_len - len); + else if (len != expected_len) + goto xdr_error; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + sid->si_generation = be32_to_cpup(p++); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) +{ + DECODE_HEAD; + + READ_BUF(4); + access->ac_req_access = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +{ + DECODE_HEAD; + u32 dummy, uid, gid; + char *machine_name; + int i; + int nr_secflavs; + + /* callback_sec_params4 */ + READ_BUF(4); + nr_secflavs = be32_to_cpup(p++); + if (nr_secflavs) + cbs->flavor = (u32)(-1); + else + /* Is this legal? Be generous, take it to mean AUTH_NONE: */ + cbs->flavor = 0; + for (i = 0; i < nr_secflavs; ++i) { + READ_BUF(4); + dummy = be32_to_cpup(p++); + switch (dummy) { + case RPC_AUTH_NULL: + /* Nothing to read */ + if (cbs->flavor == (u32)(-1)) + cbs->flavor = RPC_AUTH_NULL; + break; + case RPC_AUTH_UNIX: + READ_BUF(8); + /* stamp */ + dummy = be32_to_cpup(p++); + + /* machine name */ + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + SAVEMEM(machine_name, dummy); + + /* uid, gid */ + READ_BUF(8); + uid = be32_to_cpup(p++); + gid = be32_to_cpup(p++); + + /* more gids */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy * 4); + if (cbs->flavor == (u32)(-1)) { + kuid_t kuid = make_kuid(&init_user_ns, uid); + kgid_t kgid = make_kgid(&init_user_ns, gid); + if (uid_valid(kuid) && gid_valid(kgid)) { + cbs->uid = kuid; + cbs->gid = kgid; + cbs->flavor = RPC_AUTH_UNIX; + } else { + dprintk("RPC_AUTH_UNIX with invalid" + "uid or gid ignoring!\n"); + } + } + break; + case RPC_AUTH_GSS: + dprintk("RPC_AUTH_GSS callback secflavor " + "not supported!\n"); + READ_BUF(8); + /* gcbp_service */ + dummy = be32_to_cpup(p++); + /* gcbp_handle_from_server */ + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + /* gcbp_handle_from_client */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + break; + default: + dprintk("Illegal callback secflavor\n"); + return nfserr_inval; + } + } + DECODE_TAIL; +} + +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +{ + DECODE_HEAD; + + READ_BUF(4); + bc->bc_cb_program = be32_to_cpup(p++); + nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); + COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + bcts->dir = be32_to_cpup(p++); + /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker + * could help us figure out we should be using it. */ + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +{ + DECODE_HEAD; + + READ_BUF(4); + close->cl_seqid = be32_to_cpup(p++); + return nfsd4_decode_stateid(argp, &close->cl_stateid); + + DECODE_TAIL; +} + + +static __be32 +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +{ + DECODE_HEAD; + + READ_BUF(12); + p = xdr_decode_hyper(p, &commit->co_offset); + commit->co_count = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +{ + DECODE_HEAD; + + READ_BUF(4); + create->cr_type = be32_to_cpup(p++); + switch (create->cr_type) { + case NF4LNK: + READ_BUF(4); + create->cr_datalen = be32_to_cpup(p++); + READ_BUF(create->cr_datalen); + create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen); + if (!create->cr_data) + return nfserr_jukebox; + break; + case NF4BLK: + case NF4CHR: + READ_BUF(8); + create->cr_specdata1 = be32_to_cpup(p++); + create->cr_specdata2 = be32_to_cpup(p++); + break; + case NF4SOCK: + case NF4FIFO: + case NF4DIR: + default: + break; + } + + READ_BUF(4); + create->cr_namelen = be32_to_cpup(p++); + READ_BUF(create->cr_namelen); + SAVEMEM(create->cr_name, create->cr_namelen); + if ((status = check_filename(create->cr_name, create->cr_namelen))) + return status; + + status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, + &create->cr_acl, &create->cr_label); + if (status) + goto out; + + DECODE_TAIL; +} + +static inline __be32 +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +{ + return nfsd4_decode_stateid(argp, &dr->dr_stateid); +} + +static inline __be32 +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +{ + return nfsd4_decode_bitmap(argp, getattr->ga_bmval); +} + +static __be32 +nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +{ + DECODE_HEAD; + + READ_BUF(4); + link->li_namelen = be32_to_cpup(p++); + READ_BUF(link->li_namelen); + SAVEMEM(link->li_name, link->li_namelen); + if ((status = check_filename(link->li_name, link->li_namelen))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +{ + DECODE_HEAD; + + /* + * type, reclaim(boolean), offset, length, new_lock_owner(boolean) + */ + READ_BUF(28); + lock->lk_type = be32_to_cpup(p++); + if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) + goto xdr_error; + lock->lk_reclaim = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lock->lk_offset); + p = xdr_decode_hyper(p, &lock->lk_length); + lock->lk_is_new = be32_to_cpup(p++); + + if (lock->lk_is_new) { + READ_BUF(4); + lock->lk_new_open_seqid = be32_to_cpup(p++); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); + lock->lk_new_lock_seqid = be32_to_cpup(p++); + COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); + lock->lk_new_owner.len = be32_to_cpup(p++); + READ_BUF(lock->lk_new_owner.len); + READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); + } else { + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); + lock->lk_old_lock_seqid = be32_to_cpup(p++); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +{ + DECODE_HEAD; + + READ_BUF(32); + lockt->lt_type = be32_to_cpup(p++); + if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) + goto xdr_error; + p = xdr_decode_hyper(p, &lockt->lt_offset); + p = xdr_decode_hyper(p, &lockt->lt_length); + COPYMEM(&lockt->lt_clientid, 8); + lockt->lt_owner.len = be32_to_cpup(p++); + READ_BUF(lockt->lt_owner.len); + READMEM(lockt->lt_owner.data, lockt->lt_owner.len); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +{ + DECODE_HEAD; + + READ_BUF(8); + locku->lu_type = be32_to_cpup(p++); + if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) + goto xdr_error; + locku->lu_seqid = be32_to_cpup(p++); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); + p = xdr_decode_hyper(p, &locku->lu_offset); + p = xdr_decode_hyper(p, &locku->lu_length); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +{ + DECODE_HEAD; + + READ_BUF(4); + lookup->lo_len = be32_to_cpup(p++); + READ_BUF(lookup->lo_len); + SAVEMEM(lookup->lo_name, lookup->lo_len); + if ((status = check_filename(lookup->lo_name, lookup->lo_len))) + return status; + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when) +{ + __be32 *p; + u32 w; + + READ_BUF(4); + w = be32_to_cpup(p++); + *share_access = w & NFS4_SHARE_ACCESS_MASK; + *deleg_want = w & NFS4_SHARE_WANT_MASK; + if (deleg_when) + *deleg_when = w & NFS4_SHARE_WHEN_MASK; + + switch (w & NFS4_SHARE_ACCESS_MASK) { + case NFS4_SHARE_ACCESS_READ: + case NFS4_SHARE_ACCESS_WRITE: + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_ACCESS_MASK; + if (!w) + return nfs_ok; + if (!argp->minorversion) + return nfserr_bad_xdr; + switch (w & NFS4_SHARE_WANT_MASK) { + case NFS4_SHARE_WANT_NO_PREFERENCE: + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + case NFS4_SHARE_WANT_NO_DELEG: + case NFS4_SHARE_WANT_CANCEL: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_WANT_MASK; + if (!w) + return nfs_ok; + + if (!deleg_when) /* open_downgrade */ + return nfserr_inval; + switch (w) { + case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: + case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | + NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): + return nfs_ok; + } +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + + READ_BUF(4); + *x = be32_to_cpup(p++); + /* Note: unlinke access bits, deny bits may be zero. */ + if (*x & ~NFS4_SHARE_DENY_BOTH) + return nfserr_bad_xdr; + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + + READ_BUF(4); + o->len = be32_to_cpup(p++); + + if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + + READ_BUF(o->len); + SAVEMEM(o->data, o->len); + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 +nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + DECODE_HEAD; + u32 dummy; + + memset(open->op_bmval, 0, sizeof(open->op_bmval)); + open->op_iattr.ia_valid = 0; + open->op_openowner = NULL; + + open->op_xdr_error = 0; + /* seqid, share_access, share_deny, clientid, ownerlen */ + READ_BUF(4); + open->op_seqid = be32_to_cpup(p++); + /* decode, yet ignore deleg_when until supported */ + status = nfsd4_decode_share_access(argp, &open->op_share_access, + &open->op_deleg_want, &dummy); + if (status) + goto xdr_error; + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); + if (status) + goto xdr_error; + READ_BUF(sizeof(clientid_t)); + COPYMEM(&open->op_clientid, sizeof(clientid_t)); + status = nfsd4_decode_opaque(argp, &open->op_owner); + if (status) + goto xdr_error; + READ_BUF(4); + open->op_create = be32_to_cpup(p++); + switch (open->op_create) { + case NFS4_OPEN_NOCREATE: + break; + case NFS4_OPEN_CREATE: + READ_BUF(4); + open->op_createmode = be32_to_cpup(p++); + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + case NFS4_CREATE_GUARDED: + status = nfsd4_decode_fattr(argp, open->op_bmval, + &open->op_iattr, &open->op_acl, &open->op_label); + if (status) + goto out; + break; + case NFS4_CREATE_EXCLUSIVE: + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + goto xdr_error; + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); + status = nfsd4_decode_fattr(argp, open->op_bmval, + &open->op_iattr, &open->op_acl, &open->op_label); + if (status) + goto out; + break; + default: + goto xdr_error; + } + break; + default: + goto xdr_error; + } + + /* open_claim */ + READ_BUF(4); + open->op_claim_type = be32_to_cpup(p++); + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + READ_BUF(4); + open->op_fname.len = be32_to_cpup(p++); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); + if ((status = check_filename(open->op_fname.data, open->op_fname.len))) + return status; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + READ_BUF(4); + open->op_delegate_type = be32_to_cpup(p++); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); + open->op_fname.len = be32_to_cpup(p++); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); + if ((status = check_filename(open->op_fname.data, open->op_fname.len))) + return status; + break; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + if (argp->minorversion < 1) + goto xdr_error; + /* void */ + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + if (argp->minorversion < 1) + goto xdr_error; + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + break; + default: + goto xdr_error; + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + READ_BUF(4); + open_conf->oc_seqid = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(4); + open_down->od_seqid = be32_to_cpup(p++); + status = nfsd4_decode_share_access(argp, &open_down->od_share_access, + &open_down->od_deleg_want, NULL); + if (status) + return status; + status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); + if (status) + return status; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +{ + DECODE_HEAD; + + READ_BUF(4); + putfh->pf_fhlen = be32_to_cpup(p++); + if (putfh->pf_fhlen > NFS4_FHSIZE) + goto xdr_error; + READ_BUF(putfh->pf_fhlen); + SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) +{ + if (argp->minorversion == 0) + return nfs_ok; + return nfserr_notsupp; +} + +static __be32 +nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); + p = xdr_decode_hyper(p, &read->rd_offset); + read->rd_length = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +{ + DECODE_HEAD; + + READ_BUF(24); + p = xdr_decode_hyper(p, &readdir->rd_cookie); + COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); + readdir->rd_dircount = be32_to_cpup(p++); + readdir->rd_maxcount = be32_to_cpup(p++); + if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) + goto out; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +{ + DECODE_HEAD; + + READ_BUF(4); + remove->rm_namelen = be32_to_cpup(p++); + READ_BUF(remove->rm_namelen); + SAVEMEM(remove->rm_name, remove->rm_namelen); + if ((status = check_filename(remove->rm_name, remove->rm_namelen))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +{ + DECODE_HEAD; + + READ_BUF(4); + rename->rn_snamelen = be32_to_cpup(p++); + READ_BUF(rename->rn_snamelen + 4); + SAVEMEM(rename->rn_sname, rename->rn_snamelen); + rename->rn_tnamelen = be32_to_cpup(p++); + READ_BUF(rename->rn_tnamelen); + SAVEMEM(rename->rn_tname, rename->rn_tnamelen); + if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) + return status; + if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + READ_BUF(sizeof(clientid_t)); + COPYMEM(clientid, sizeof(clientid_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo *secinfo) +{ + DECODE_HEAD; + + READ_BUF(4); + secinfo->si_namelen = be32_to_cpup(p++); + READ_BUF(secinfo->si_namelen); + SAVEMEM(secinfo->si_name, secinfo->si_namelen); + status = check_filename(secinfo->si_name, secinfo->si_namelen); + if (status) + return status; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + DECODE_HEAD; + + READ_BUF(4); + sin->sin_style = be32_to_cpup(p++); + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +{ + __be32 status; + + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, + &setattr->sa_acl, &setattr->sa_label); +} + +static __be32 +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE); + + status = nfsd4_decode_opaque(argp, &setclientid->se_name); + if (status) + return nfserr_bad_xdr; + READ_BUF(8); + setclientid->se_callback_prog = be32_to_cpup(p++); + setclientid->se_callback_netid_len = be32_to_cpup(p++); + + READ_BUF(setclientid->se_callback_netid_len + 4); + SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); + setclientid->se_callback_addr_len = be32_to_cpup(p++); + + READ_BUF(setclientid->se_callback_addr_len + 4); + SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); + setclientid->se_callback_ident = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + READ_BUF(8 + NFS4_VERIFIER_SIZE); + COPYMEM(&scd_c->sc_clientid, 8); + COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE); + + DECODE_TAIL; +} + +/* Also used for NVERIFY */ +static __be32 +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +{ + DECODE_HEAD; + + if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) + goto out; + + /* For convenience's sake, we compare raw xdr'd attributes in + * nfsd4_proc_verify */ + + READ_BUF(4); + verify->ve_attrlen = be32_to_cpup(p++); + READ_BUF(verify->ve_attrlen); + SAVEMEM(verify->ve_attrval, verify->ve_attrlen); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +{ + int avail; + int len; + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); + p = xdr_decode_hyper(p, &write->wr_offset); + write->wr_stable_how = be32_to_cpup(p++); + if (write->wr_stable_how > 2) + goto xdr_error; + write->wr_buflen = be32_to_cpup(p++); + + /* Sorry .. no magic macros for this.. * + * READ_BUF(write->wr_buflen); + * SAVEMEM(write->wr_buf, write->wr_buflen); + */ + avail = (char*)argp->end - (char*)argp->p; + if (avail + argp->pagelen < write->wr_buflen) { + dprintk("NFSD: xdr error (%s:%d)\n", + __FILE__, __LINE__); + goto xdr_error; + } + write->wr_head.iov_base = p; + write->wr_head.iov_len = avail; + write->wr_pagelist = argp->pagelist; + + len = XDR_QUADLEN(write->wr_buflen) << 2; + if (len >= avail) { + int pages; + + len -= avail; + + pages = len >> PAGE_SHIFT; + argp->pagelist += pages; + argp->pagelen -= pages * PAGE_SIZE; + len -= pages * PAGE_SIZE; + + argp->p = (__be32 *)page_address(argp->pagelist[0]); + argp->pagelist++; + argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); + } + argp->p += XDR_QUADLEN(len); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + READ_BUF(12); + COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); + rlockowner->rl_owner.len = be32_to_cpup(p++); + READ_BUF(rlockowner->rl_owner.len); + READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + + if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) + return nfserr_inval; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + int dummy, tmp; + DECODE_HEAD; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return nfserr_bad_xdr; + + READ_BUF(4); + exid->flags = be32_to_cpup(p++); + + /* Ignore state_protect4_a */ + READ_BUF(4); + exid->spa_how = be32_to_cpup(p++); + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy * 4); + p += dummy; + + /* spo_must_allow */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy * 4); + p += dummy; + break; + case SP4_SSV: + /* ssp_ops */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy * 4); + p += dummy; + + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy * 4); + p += dummy; + + /* ssp_hash_algs<> */ + READ_BUF(4); + tmp = be32_to_cpup(p++); + while (tmp--) { + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } + + /* ssp_encr_algs<> */ + READ_BUF(4); + tmp = be32_to_cpup(p++); + while (tmp--) { + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } + + /* ssp_window and ssp_num_gss_handles */ + READ_BUF(8); + dummy = be32_to_cpup(p++); + dummy = be32_to_cpup(p++); + break; + default: + goto xdr_error; + } + + /* Ignore Implementation ID */ + READ_BUF(4); /* nfs_impl_id4 array length */ + dummy = be32_to_cpup(p++); + + if (dummy > 1) + goto xdr_error; + + if (dummy == 1) { + /* nii_domain */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_name */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_date */ + READ_BUF(12); + p += 3; + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + struct nfsd4_create_session *sess) +{ + DECODE_HEAD; + u32 dummy; + + READ_BUF(16); + COPYMEM(&sess->clientid, 8); + sess->seqid = be32_to_cpup(p++); + sess->flags = be32_to_cpup(p++); + + /* Fore channel attrs */ + READ_BUF(28); + dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + sess->fore_channel.maxreq_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_cached = be32_to_cpup(p++); + sess->fore_channel.maxops = be32_to_cpup(p++); + sess->fore_channel.maxreqs = be32_to_cpup(p++); + sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++); + if (sess->fore_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + sess->fore_channel.rdma_attrs = be32_to_cpup(p++); + } else if (sess->fore_channel.nr_rdma_attrs > 1) { + dprintk("Too many fore channel attr bitmaps!\n"); + goto xdr_error; + } + + /* Back channel attrs */ + READ_BUF(28); + dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + sess->back_channel.maxreq_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_cached = be32_to_cpup(p++); + sess->back_channel.maxops = be32_to_cpup(p++); + sess->back_channel.maxreqs = be32_to_cpup(p++); + sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++); + if (sess->back_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + sess->back_channel.rdma_attrs = be32_to_cpup(p++); + } else if (sess->back_channel.nr_rdma_attrs > 1) { + dprintk("Too many back channel attr bitmaps!\n"); + goto xdr_error; + } + + READ_BUF(4); + sess->callback_prog = be32_to_cpup(p++); + nfsd4_decode_cb_sec(argp, &sess->cb_sec); + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) +{ + DECODE_HEAD; + READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, + struct nfsd4_free_stateid *free_stateid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + free_stateid->fr_stateid.si_generation = be32_to_cpup(p++); + COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + seq->seqid = be32_to_cpup(p++); + seq->slotid = be32_to_cpup(p++); + seq->maxslots = be32_to_cpup(p++); + seq->cachethis = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +{ + int i; + __be32 *p, status; + struct nfsd4_test_stateid_id *stateid; + + READ_BUF(4); + test_stateid->ts_num_ids = ntohl(*p++); + + INIT_LIST_HEAD(&test_stateid->ts_stateid_list); + + for (i = 0; i < test_stateid->ts_num_ids; i++) { + stateid = svcxdr_tmpalloc(argp, sizeof(*stateid)); + if (!stateid) { + status = nfserrno(-ENOMEM); + goto out; + } + + INIT_LIST_HEAD(&stateid->ts_id_list); + list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); + + status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid); + if (status) + goto out; + } + + status = 0; +out: + return status; +xdr_error: + dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__); + status = nfserr_bad_xdr; + goto out; +} + +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +{ + DECODE_HEAD; + + READ_BUF(8); + COPYMEM(&dc->clientid, 8); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) +{ + DECODE_HEAD; + + READ_BUF(4); + rc->rca_one_fs = be32_to_cpup(p++); + + DECODE_TAIL; +} + +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_getdeviceinfo *gdev) +{ + DECODE_HEAD; + u32 num, i; + + READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); + COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); + gdev->gd_layout_type = be32_to_cpup(p++); + gdev->gd_maxcount = be32_to_cpup(p++); + num = be32_to_cpup(p++); + if (num) { + READ_BUF(4 * num); + gdev->gd_notify_types = be32_to_cpup(p++); + for (i = 1; i < num; i++) { + if (be32_to_cpup(p++)) { + status = nfserr_inval; + goto out; + } + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutget *lgp) +{ + DECODE_HEAD; + + READ_BUF(36); + lgp->lg_signal = be32_to_cpup(p++); + lgp->lg_layout_type = be32_to_cpup(p++); + lgp->lg_seg.iomode = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lgp->lg_seg.offset); + p = xdr_decode_hyper(p, &lgp->lg_seg.length); + p = xdr_decode_hyper(p, &lgp->lg_minlength); + + status = nfsd4_decode_stateid(argp, &lgp->lg_sid); + if (status) + return status; + + READ_BUF(4); + lgp->lg_maxcount = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) +{ + DECODE_HEAD; + u32 timechange; + + READ_BUF(20); + p = xdr_decode_hyper(p, &lcp->lc_seg.offset); + p = xdr_decode_hyper(p, &lcp->lc_seg.length); + lcp->lc_reclaim = be32_to_cpup(p++); + + status = nfsd4_decode_stateid(argp, &lcp->lc_sid); + if (status) + return status; + + READ_BUF(4); + lcp->lc_newoffset = be32_to_cpup(p++); + if (lcp->lc_newoffset) { + READ_BUF(8); + p = xdr_decode_hyper(p, &lcp->lc_last_wr); + } else + lcp->lc_last_wr = 0; + READ_BUF(4); + timechange = be32_to_cpup(p++); + if (timechange) { + status = nfsd4_decode_time(argp, &lcp->lc_mtime); + if (status) + return status; + } else { + lcp->lc_mtime.tv_nsec = UTIME_NOW; + } + READ_BUF(8); + lcp->lc_layout_type = be32_to_cpup(p++); + + /* + * Save the layout update in XDR format and let the layout driver deal + * with it later. + */ + lcp->lc_up_len = be32_to_cpup(p++); + if (lcp->lc_up_len > 0) { + READ_BUF(lcp->lc_up_len); + READMEM(lcp->lc_up_layout, lcp->lc_up_len); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) +{ + DECODE_HEAD; + + READ_BUF(16); + lrp->lr_reclaim = be32_to_cpup(p++); + lrp->lr_layout_type = be32_to_cpup(p++); + lrp->lr_seg.iomode = be32_to_cpup(p++); + lrp->lr_return_type = be32_to_cpup(p++); + if (lrp->lr_return_type == RETURN_FILE) { + READ_BUF(16); + p = xdr_decode_hyper(p, &lrp->lr_seg.offset); + p = xdr_decode_hyper(p, &lrp->lr_seg.length); + + status = nfsd4_decode_stateid(argp, &lrp->lr_sid); + if (status) + return status; + + READ_BUF(4); + lrp->lrf_body_len = be32_to_cpup(p++); + if (lrp->lrf_body_len > 0) { + READ_BUF(lrp->lrf_body_len); + READMEM(lrp->lrf_body, lrp->lrf_body_len); + } + } else { + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + } + + DECODE_TAIL; +} +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, + struct nfsd4_fallocate *fallocate) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid); + if (status) + return status; + + READ_BUF(16); + p = xdr_decode_hyper(p, &fallocate->falloc_offset); + xdr_decode_hyper(p, &fallocate->falloc_length); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &seek->seek_stateid); + if (status) + return status; + + READ_BUF(8 + 4); + p = xdr_decode_hyper(p, &seek->seek_offset); + seek->seek_whence = be32_to_cpup(p); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +{ + return nfs_ok; +} + +static __be32 +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +{ + return nfserr_notsupp; +} + +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); + +static nfsd4_dec nfsd4_dec_ops[] = { + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, + + /* new operations for NFSv4.1 */ + [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, + [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, +#else + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, +#endif + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, + [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, + [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + + /* new operations for NFSv4.2 */ + [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, + [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, + [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, +}; + +static inline bool +nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) +{ + if (op->opnum < FIRST_NFS4_OP) + return false; + else if (argp->minorversion == 0 && op->opnum > LAST_NFS40_OP) + return false; + else if (argp->minorversion == 1 && op->opnum > LAST_NFS41_OP) + return false; + else if (argp->minorversion == 2 && op->opnum > LAST_NFS42_OP) + return false; + return true; +} + +static __be32 +nfsd4_decode_compound(struct nfsd4_compoundargs *argp) +{ + DECODE_HEAD; + struct nfsd4_op *op; + bool cachethis = false; + int auth_slack= argp->rqstp->rq_auth_slack; + int max_reply = auth_slack + 8; /* opcnt, status */ + int readcount = 0; + int readbytes = 0; + int i; + + READ_BUF(4); + argp->taglen = be32_to_cpup(p++); + READ_BUF(argp->taglen + 8); + SAVEMEM(argp->tag, argp->taglen); + argp->minorversion = be32_to_cpup(p++); + argp->opcnt = be32_to_cpup(p++); + max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); + + if (argp->taglen > NFSD4_MAX_TAGLEN) + goto xdr_error; + if (argp->opcnt > 100) + goto xdr_error; + + if (argp->opcnt > ARRAY_SIZE(argp->iops)) { + argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + if (!argp->ops) { + argp->ops = argp->iops; + dprintk("nfsd: couldn't allocate room for COMPOUND\n"); + goto xdr_error; + } + } + + if (argp->minorversion > NFSD_SUPPORTED_MINOR_VERSION) + argp->opcnt = 0; + + for (i = 0; i < argp->opcnt; i++) { + op = &argp->ops[i]; + op->replay = NULL; + + READ_BUF(4); + op->opnum = be32_to_cpup(p++); + + if (nfsd4_opnum_in_range(argp, op)) + op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); + else { + op->opnum = OP_ILLEGAL; + op->status = nfserr_op_illegal; + } + /* + * We'll try to cache the result in the DRC if any one + * op in the compound wants to be cached: + */ + cachethis |= nfsd4_cache_this_op(op); + + if (op->opnum == OP_READ) { + readcount++; + readbytes += nfsd4_max_reply(argp->rqstp, op); + } else + max_reply += nfsd4_max_reply(argp->rqstp, op); + /* + * OP_LOCK may return a conflicting lock. (Special case + * because it will just skip encoding this if it runs + * out of xdr buffer space, and it is the only operation + * that behaves this way.) + */ + if (op->opnum == OP_LOCK) + max_reply += NFS4_OPAQUE_LIMIT; + + if (op->status) { + argp->opcnt = i+1; + break; + } + } + /* Sessions make the DRC unnecessary: */ + if (argp->minorversion) + cachethis = false; + svc_reserve(argp->rqstp, max_reply + readbytes); + argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; + + if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) + clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + + DECODE_TAIL; +} + +static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode) +{ + if (IS_I_VERSION(inode)) { + p = xdr_encode_hyper(p, inode->i_version); + } else { + *p++ = cpu_to_be32(stat->ctime.tv_sec); + *p++ = cpu_to_be32(stat->ctime.tv_nsec); + } + return p; +} + +static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) +{ + *p++ = cpu_to_be32(c->atomic); + if (c->change_supported) { + p = xdr_encode_hyper(p, c->before_change); + p = xdr_encode_hyper(p, c->after_change); + } else { + *p++ = cpu_to_be32(c->before_ctime_sec); + *p++ = cpu_to_be32(c->before_ctime_nsec); + *p++ = cpu_to_be32(c->after_ctime_sec); + *p++ = cpu_to_be32(c->after_ctime_nsec); + } + return p; +} + +/* Encode as an array of strings the string given with components + * separated @sep, escaped with esc_enter and esc_exit. + */ +static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, + char *components, char esc_enter, + char esc_exit) +{ + __be32 *p; + __be32 pathlen; + int pathlen_offset; + int strlen, count=0; + char *str, *end, *next; + + dprintk("nfsd4_encode_components(%s)\n", components); + + pathlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + p++; /* We will fill this in with @count later */ + + end = str = components; + while (*end) { + bool found_esc = false; + + /* try to parse as esc_start, ..., esc_end, sep */ + if (*str == esc_enter) { + for (; *end && (*end != esc_exit); end++) + /* find esc_exit or end of string */; + next = end + 1; + if (*end && (!*next || *next == sep)) { + str++; + found_esc = true; + } + } + + if (!found_esc) + for (; *end && (*end != sep); end++) + /* find sep or end of string */; + + strlen = end - str; + if (strlen) { + p = xdr_reserve_space(xdr, strlen + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, str, strlen); + count++; + } + else + end++; + if (found_esc) + end = next; + + str = end; + } + pathlen = htonl(count); + write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4); + return 0; +} + +/* Encode as an array of strings the string given with components + * separated @sep. + */ +static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep, + char *components) +{ + return nfsd4_encode_components_esc(xdr, sep, components, 0, 0); +} + +/* + * encode a location element of a fs_locations structure + */ +static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, + struct nfsd4_fs_location *location) +{ + __be32 status; + + status = nfsd4_encode_components_esc(xdr, ':', location->hosts, + '[', ']'); + if (status) + return status; + status = nfsd4_encode_components(xdr, '/', location->path); + if (status) + return status; + return 0; +} + +/* + * Encode a path in RFC3530 'pathname4' format + */ +static __be32 nfsd4_encode_path(struct xdr_stream *xdr, + const struct path *root, + const struct path *path) +{ + struct path cur = *path; + __be32 *p; + struct dentry **components = NULL; + unsigned int ncomponents = 0; + __be32 err = nfserr_jukebox; + + dprintk("nfsd4_encode_components("); + + path_get(&cur); + /* First walk the path up to the nfsd root, and store the + * dentries/path components in an array. + */ + for (;;) { + if (path_equal(&cur, root)) + break; + if (cur.dentry == cur.mnt->mnt_root) { + if (follow_up(&cur)) + continue; + goto out_free; + } + if ((ncomponents & 15) == 0) { + struct dentry **new; + new = krealloc(components, + sizeof(*new) * (ncomponents + 16), + GFP_KERNEL); + if (!new) + goto out_free; + components = new; + } + components[ncomponents++] = cur.dentry; + cur.dentry = dget_parent(cur.dentry); + } + err = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_free; + *p++ = cpu_to_be32(ncomponents); + + while (ncomponents) { + struct dentry *dentry = components[ncomponents - 1]; + unsigned int len; + + spin_lock(&dentry->d_lock); + len = dentry->d_name.len; + p = xdr_reserve_space(xdr, len + 4); + if (!p) { + spin_unlock(&dentry->d_lock); + goto out_free; + } + p = xdr_encode_opaque(p, dentry->d_name.name, len); + dprintk("/%pd", dentry); + spin_unlock(&dentry->d_lock); + dput(dentry); + ncomponents--; + } + + err = 0; +out_free: + dprintk(")\n"); + while (ncomponents) + dput(components[--ncomponents]); + kfree(components); + path_put(&cur); + return err; +} + +static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr, + struct svc_rqst *rqstp, const struct path *path) +{ + struct svc_export *exp_ps; + __be32 res; + + exp_ps = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp_ps)) + return nfserrno(PTR_ERR(exp_ps)); + res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path); + exp_put(exp_ps); + return res; +} + +/* + * encode a fs_locations structure + */ +static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr, + struct svc_rqst *rqstp, struct svc_export *exp) +{ + __be32 status; + int i; + __be32 *p; + struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; + + status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path); + if (status) + return status; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(fslocs->locations_count); + for (i=0; ilocations_count; i++) { + status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]); + if (status) + return status; + } + return 0; +} + +static u32 nfs4_file_type(umode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFIFO: return NF4FIFO; + case S_IFCHR: return NF4CHR; + case S_IFDIR: return NF4DIR; + case S_IFBLK: return NF4BLK; + case S_IFLNK: return NF4LNK; + case S_IFREG: return NF4REG; + case S_IFSOCK: return NF4SOCK; + default: return NF4BAD; + }; +} + +static inline __be32 +nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, + struct nfs4_ace *ace) +{ + if (ace->whotype != NFS4_ACL_WHO_NAMED) + return nfs4_acl_write_who(xdr, ace->whotype); + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + return nfsd4_encode_group(xdr, rqstp, ace->who_gid); + else + return nfsd4_encode_user(xdr, rqstp, ace->who_uid); +} + +#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ + FATTR4_WORD0_RDATTR_ERROR) +#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +static inline __be32 +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, len + 4 + 4 + 4); + if (!p) + return nfserr_resource; + + /* + * For now we use a 0 here to indicate the null translation; in + * the future we may place a call to translation code here. + */ + *p++ = cpu_to_be32(0); /* lfs */ + *p++ = cpu_to_be32(0); /* pi */ + p = xdr_encode_opaque(p, context, len); + return 0; +} +#else +static inline __be32 +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) +{ return 0; } +#endif + +static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) +{ + /* As per referral draft: */ + if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || + *bmval1 & ~WORD1_ABSENT_FS_ATTRS) { + if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR || + *bmval0 & FATTR4_WORD0_FS_LOCATIONS) + *rdattr_err = NFSERR_MOVED; + else + return nfserr_moved; + } + *bmval0 &= WORD0_ABSENT_FS_ATTRS; + *bmval1 &= WORD1_ABSENT_FS_ATTRS; + return 0; +} + + +static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) +{ + struct path path = exp->ex_path; + int err; + + path_get(&path); + while (follow_up(&path)) { + if (path.dentry != path.mnt->mnt_root) + break; + } + err = vfs_getattr(&path, stat); + path_put(&path); + return err; +} + +/* + * Note: @fhp can be NULL; in this case, we might have to compose the filehandle + * ourselves. + */ +static __be32 +nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, + struct svc_export *exp, + struct dentry *dentry, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + u32 bmval0 = bmval[0]; + u32 bmval1 = bmval[1]; + u32 bmval2 = bmval[2]; + struct kstat stat; + struct svc_fh *tempfh = NULL; + struct kstatfs statfs; + __be32 *p; + int starting_len = xdr->buf->len; + int attrlen_offset; + __be32 attrlen; + u32 dummy; + u64 dummy64; + u32 rdattr_err = 0; + __be32 status; + int err; + int aclsupport = 0; + struct nfs4_acl *acl = NULL; + void *context = NULL; + int contextlen; + bool contextsupport = false; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; + struct path path = { + .mnt = exp->ex_path.mnt, + .dentry = dentry, + }; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); + BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); + BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); + BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); + + if (exp->ex_fslocs.migrated) { + BUG_ON(bmval[2]); + status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); + if (status) + goto out; + } + + err = vfs_getattr(&path, &stat); + if (err) + goto out_nfserr; + if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || + (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + FATTR4_WORD1_SPACE_TOTAL))) { + err = vfs_statfs(&path, &statfs); + if (err) + goto out_nfserr; + } + if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + status = nfserr_jukebox; + if (!tempfh) + goto out; + fh_init(tempfh, NFS4_FHSIZE); + status = fh_compose(tempfh, exp, dentry, NULL); + if (status) + goto out; + fhp = tempfh; + } + if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT + | FATTR4_WORD0_SUPPORTED_ATTRS)) { + err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); + aclsupport = (err == 0); + if (bmval0 & FATTR4_WORD0_ACL) { + if (err == -EOPNOTSUPP) + bmval0 &= ~FATTR4_WORD0_ACL; + else if (err == -EINVAL) { + status = nfserr_attrnotsupp; + goto out; + } else if (err != 0) + goto out_nfserr; + } + } + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) || + bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + err = security_inode_getsecctx(d_inode(dentry), + &context, &contextlen); + contextsupport = (err == 0); + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + if (err == -EOPNOTSUPP) + bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL; + else if (err) + goto out_nfserr; + } + } +#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ + + if (bmval2) { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + *p++ = cpu_to_be32(bmval2); + } else if (bmval1) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + } else { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(bmval0); + } + + attrlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + p++; /* to be backfilled later */ + + if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + u32 word0 = nfsd_suppattrs0(minorversion); + u32 word1 = nfsd_suppattrs1(minorversion); + u32 word2 = nfsd_suppattrs2(minorversion); + + if (!aclsupport) + word0 &= ~FATTR4_WORD0_ACL; + if (!contextsupport) + word2 &= ~FATTR4_WORD2_SECURITY_LABEL; + if (!word2) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(word0); + *p++ = cpu_to_be32(word1); + } else { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(word0); + *p++ = cpu_to_be32(word1); + *p++ = cpu_to_be32(word2); + } + } + if (bmval0 & FATTR4_WORD0_TYPE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + dummy = nfs4_file_type(stat.mode); + if (dummy == NF4BAD) { + status = nfserr_serverfault; + goto out; + } + *p++ = cpu_to_be32(dummy); + } + if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT); + else + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT| + NFS4_FH_VOL_RENAME); + } + if (bmval0 & FATTR4_WORD0_CHANGE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = encode_change(p, &stat, d_inode(dentry)); + } + if (bmval0 & FATTR4_WORD0_SIZE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, stat.size); + } + if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + if (bmval0 & FATTR4_WORD0_FSID) { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + if (exp->ex_fslocs.migrated) { + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); + } else switch(fsid_source(fhp)) { + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64)exp->ex_fsid); + p = xdr_encode_hyper(p, (u64)0); + break; + case FSIDSOURCE_DEV: + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MAJOR(stat.dev)); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MINOR(stat.dev)); + break; + case FSIDSOURCE_UUID: + p = xdr_encode_opaque_fixed(p, exp->ex_uuid, + EX_UUID_LEN); + break; + } + } + if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + if (bmval0 & FATTR4_WORD0_LEASE_TIME) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(nn->nfsd4_lease); + } + if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(rdattr_err); + } + if (bmval0 & FATTR4_WORD0_ACL) { + struct nfs4_ace *ace; + + if (acl == NULL) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + + *p++ = cpu_to_be32(0); + goto out_acl; + } + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(acl->naces); + + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + p = xdr_reserve_space(xdr, 4*3); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(ace->type); + *p++ = cpu_to_be32(ace->flag); + *p++ = cpu_to_be32(ace->access_mask & + NFS4_ACE_MASK_ALL); + status = nfsd4_encode_aclname(xdr, rqstp, ace); + if (status) + goto out; + } + } +out_acl: + if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(aclsupport ? + ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); + } + if (bmval0 & FATTR4_WORD0_CANSETTIME) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_FILEHANDLE) { + p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4); + if (!p) + goto out_resource; + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, + fhp->fh_handle.fh_size); + } + if (bmval0 & FATTR4_WORD0_FILEID) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, stat.ino); + } + if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); + } + if (bmval0 & FATTR4_WORD0_FILES_FREE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); + } + if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) statfs.f_files); + } + if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { + status = nfsd4_encode_fs_locations(xdr, rqstp, exp); + if (status) + goto out; + } + if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes); + } + if (bmval0 & FATTR4_WORD0_MAXLINK) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(255); + } + if (bmval0 & FATTR4_WORD0_MAXNAME) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(statfs.f_namelen); + } + if (bmval0 & FATTR4_WORD0_MAXREAD) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); + } + if (bmval0 & FATTR4_WORD0_MAXWRITE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); + } + if (bmval1 & FATTR4_WORD1_MODE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.mode & S_IALLUGO); + } + if (bmval1 & FATTR4_WORD1_NO_TRUNC) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval1 & FATTR4_WORD1_NUMLINKS) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.nlink); + } + if (bmval1 & FATTR4_WORD1_OWNER) { + status = nfsd4_encode_user(xdr, rqstp, stat.uid); + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { + status = nfsd4_encode_group(xdr, rqstp, stat.gid); + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_RAWDEV) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32((u32) MAJOR(stat.rdev)); + *p++ = cpu_to_be32((u32) MINOR(stat.rdev)); + } + if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; + p = xdr_encode_hyper(p, dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_FREE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; + p = xdr_encode_hyper(p, dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; + p = xdr_encode_hyper(p, dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_USED) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + dummy64 = (u64)stat.blocks << 9; + p = xdr_encode_hyper(p, dummy64); + } + if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); + *p++ = cpu_to_be32(stat.atime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_TIME_DELTA) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(0); + } + if (bmval1 & FATTR4_WORD1_TIME_METADATA) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); + *p++ = cpu_to_be32(stat.ctime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); + *p++ = cpu_to_be32(stat.mtime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + /* + * Get parent's attributes if not ignoring crossmount + * and this is the root of a cross-mounted filesystem. + */ + if (ignore_crossmnt == 0 && + dentry == exp->ex_path.mnt->mnt_root) + get_parent_attributes(exp, &stat); + p = xdr_encode_hyper(p, stat.ino); + } +#ifdef CONFIG_NFSD_PNFS + if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) || + (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) { + if (exp->ex_layout_type) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(exp->ex_layout_type); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.blksize); + } +#endif /* CONFIG_NFSD_PNFS */ + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + status = nfsd4_encode_security_label(xdr, rqstp, context, + contextlen); + if (status) + goto out; + } + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + } + + attrlen = htonl(xdr->buf->len - attrlen_offset - 4); + write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); + status = nfs_ok; + +out: +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if (context) + security_release_secctx(context, contextlen); +#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ + kfree(acl); + if (tempfh) { + fh_put(tempfh); + kfree(tempfh); + } + if (status) + xdr_truncate_encode(xdr, starting_len); + return status; +out_nfserr: + status = nfserrno(err); + goto out; +out_resource: + status = nfserr_resource; + goto out; +} + +static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr, + struct xdr_buf *buf, __be32 *p, int bytes) +{ + xdr->scratch.iov_len = 0; + memset(buf, 0, sizeof(struct xdr_buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = 0; + buf->len = 0; + xdr->buf = buf; + xdr->iov = buf->head; + xdr->p = p; + xdr->end = (void *)p + bytes; + buf->buflen = bytes; +} + +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + struct xdr_buf dummy; + struct xdr_stream xdr; + __be32 ret; + + svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); + ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, + ignore_crossmnt); + *p = xdr.p; + return ret; +} + +static inline int attributes_need_mount(u32 *bmval) +{ + if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) + return 1; + if (bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) + return 1; + return 0; +} + +static __be32 +nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, + const char *name, int namlen) +{ + struct svc_export *exp = cd->rd_fhp->fh_export; + struct dentry *dentry; + __be32 nfserr; + int ignore_crossmnt = 0; + + dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); + if (IS_ERR(dentry)) + return nfserrno(PTR_ERR(dentry)); + if (d_really_is_negative(dentry)) { + /* + * nfsd_buffered_readdir drops the i_mutex between + * readdir and calling this callback, leaving a window + * where this directory entry could have gone away. + */ + dput(dentry); + return nfserr_noent; + } + + exp_get(exp); + /* + * In the case of a mountpoint, the client may be asking for + * attributes that are only properties of the underlying filesystem + * as opposed to the cross-mounted file system. In such a case, + * we will not follow the cross mount and will fill the attribtutes + * directly from the mountpoint dentry. + */ + if (nfsd_mountpoint(dentry, exp)) { + int err; + + if (!(exp->ex_flags & NFSEXP_V4ROOT) + && !attributes_need_mount(cd->rd_bmval)) { + ignore_crossmnt = 1; + goto out_encode; + } + /* + * Why the heck aren't we just using nfsd_lookup?? + * Different "."/".." handling? Something else? + * At least, add a comment here to explain.... + */ + err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); + if (err) { + nfserr = nfserrno(err); + goto out_put; + } + nfserr = check_nfsd_access(exp, cd->rd_rqstp); + if (nfserr) + goto out_put; + + } +out_encode: + nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval, + cd->rd_rqstp, ignore_crossmnt); +out_put: + dput(dentry); + exp_put(exp); + return nfserr; +} + +static __be32 * +nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return NULL; + *p++ = htonl(2); + *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ + *p++ = htonl(0); /* bmval1 */ + + *p++ = htonl(4); /* attribute length */ + *p++ = nfserr; /* no htonl */ + return p; +} + +static int +nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = ccdv; + struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); + struct xdr_stream *xdr = cd->xdr; + int start_offset = xdr->buf->len; + int cookie_offset; + u32 name_and_cookie; + int entry_bytes; + __be32 nfserr = nfserr_toosmall; + __be64 wire_offset; + __be32 *p; + + /* In nfsv4, "." and ".." never make it onto the wire.. */ + if (name && isdotent(name, namlen)) { + cd->common.err = nfs_ok; + return 0; + } + + if (cd->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset, + &wire_offset, 8); + } + + p = xdr_reserve_space(xdr, 4); + if (!p) + goto fail; + *p++ = xdr_one; /* mark entry present */ + cookie_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 3*4 + namlen); + if (!p) + goto fail; + p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_array(p, name, namlen); /* name length & name */ + + nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); + switch (nfserr) { + case nfs_ok: + break; + case nfserr_resource: + nfserr = nfserr_toosmall; + goto fail; + case nfserr_noent: + xdr_truncate_encode(xdr, start_offset); + goto skip_entry; + default: + /* + * If the client requested the RDATTR_ERROR attribute, + * we stuff the error code into this attribute + * and continue. If this attribute was not requested, + * then in accordance with the spec, we fail the + * entire READDIR operation(!) + */ + if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) + goto fail; + p = nfsd4_encode_rdattr_error(xdr, nfserr); + if (p == NULL) { + nfserr = nfserr_toosmall; + goto fail; + } + } + nfserr = nfserr_toosmall; + entry_bytes = xdr->buf->len - start_offset; + if (entry_bytes > cd->rd_maxcount) + goto fail; + cd->rd_maxcount -= entry_bytes; + /* + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so + * let's always let through the first entry, at least: + */ + if (!cd->rd_dircount) + goto fail; + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + + cd->cookie_offset = cookie_offset; +skip_entry: + cd->common.err = nfs_ok; + return 0; +fail: + xdr_truncate_encode(xdr, start_offset); + cd->common.err = nfserr; + return -EINVAL; +} + +static __be32 +nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(stateid_t)); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sid->si_generation); + p = xdr_encode_opaque_fixed(p, &sid->si_opaque, + sizeof(stateid_opaque_t)); + return 0; +} + +static __be32 +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(access->ac_supported); + *p++ = cpu_to_be32(access->ac_resp_access); + } + return nfserr; +} + +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(bcts->dir); + /* Sorry, we do not yet support RDMA over 4.1: */ + *p++ = cpu_to_be32(0); + } + return nfserr; +} + +static __be32 +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +{ + struct xdr_stream *xdr = &resp->xdr; + + if (!nfserr) + nfserr = nfsd4_encode_stateid(xdr, &close->cl_stateid); + + return nfserr; +} + + +static __be32 +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, commit->co_verf.data, + NFS4_VERIFIER_SIZE); + } + return nfserr; +} + +static __be32 +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &create->cr_cinfo); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(create->cr_bmval[0]); + *p++ = cpu_to_be32(create->cr_bmval[1]); + } + return nfserr; +} + +static __be32 +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +{ + struct svc_fh *fhp = getattr->ga_fhp; + struct xdr_stream *xdr = &resp->xdr; + + if (nfserr) + return nfserr; + + nfserr = nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, + getattr->ga_bmval, + resp->rqstp, 0); + return nfserr; +} + +static __be32 +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +{ + struct xdr_stream *xdr = &resp->xdr; + struct svc_fh *fhp = *fhpp; + unsigned int len; + __be32 *p; + + if (!nfserr) { + len = fhp->fh_handle.fh_size; + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len); + } + return nfserr; +} + +/* +* Including all fields other than the name, a LOCK4denied structure requires +* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. +*/ +static __be32 +nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld) +{ + struct xdr_netobj *conf = &ld->ld_owner; + __be32 *p; + +again: + p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len)); + if (!p) { + /* + * Don't fail to return the result just because we can't + * return the conflicting open: + */ + if (conf->len) { + kfree(conf->data); + conf->len = 0; + conf->data = NULL; + goto again; + } + return nfserr_resource; + } + p = xdr_encode_hyper(p, ld->ld_start); + p = xdr_encode_hyper(p, ld->ld_length); + *p++ = cpu_to_be32(ld->ld_type); + if (conf->len) { + p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); + p = xdr_encode_opaque(p, conf->data, conf->len); + kfree(conf->data); + } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ + p = xdr_encode_hyper(p, (u64)0); /* clientid */ + *p++ = cpu_to_be32(0); /* length of owner name */ + } + return nfserr_denied; +} + +static __be32 +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) +{ + struct xdr_stream *xdr = &resp->xdr; + + if (!nfserr) + nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) + nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); + + return nfserr; +} + +static __be32 +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +{ + struct xdr_stream *xdr = &resp->xdr; + + if (nfserr == nfserr_denied) + nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); + return nfserr; +} + +static __be32 +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +{ + struct xdr_stream *xdr = &resp->xdr; + + if (!nfserr) + nfserr = nfsd4_encode_stateid(xdr, &locku->lu_stateid); + + return nfserr; +} + + +static __be32 +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &link->li_cinfo); + } + return nfserr; +} + + +static __be32 +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + goto out; + + nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); + if (nfserr) + goto out; + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &open->op_cinfo); + *p++ = cpu_to_be32(open->op_rflags); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(open->op_bmval[0]); + *p++ = cpu_to_be32(open->op_bmval[1]); + *p++ = cpu_to_be32(open->op_delegate_type); + + switch (open->op_delegate_type) { + case NFS4_OPEN_DELEGATE_NONE: + break; + case NFS4_OPEN_DELEGATE_READ: + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_recall); + + /* + * TODO: ACE's in delegations + */ + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + break; + case NFS4_OPEN_DELEGATE_WRITE: + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); + + /* + * TODO: space_limit's in delegations + */ + *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); + *p++ = cpu_to_be32(~(u32)0); + *p++ = cpu_to_be32(~(u32)0); + + /* + * TODO: ACE's in delegations + */ + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + break; + case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ + switch (open->op_why_no_deleg) { + case WND4_CONTENTION: + case WND4_RESOURCE: + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); + /* deleg signaling not supported yet: */ + *p++ = cpu_to_be32(0); + break; + default: + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); + } + break; + default: + BUG(); + } + /* XXX save filehandle here */ +out: + return nfserr; +} + +static __be32 +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +{ + struct xdr_stream *xdr = &resp->xdr; + + if (!nfserr) + nfserr = nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); + + return nfserr; +} + +static __be32 +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +{ + struct xdr_stream *xdr = &resp->xdr; + + if (!nfserr) + nfserr = nfsd4_encode_stateid(xdr, &od->od_stateid); + + return nfserr; +} + +static __be32 nfsd4_encode_splice_read( + struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) +{ + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = xdr->buf; + u32 eof; + int space_left; + __be32 nfserr; + __be32 *p = xdr->p - 2; + + /* Make sure there will be room for padding if needed */ + if (xdr->end - xdr->p < 1) + return nfserr_resource; + + nfserr = nfsd_splice_read(read->rd_rqstp, file, + read->rd_offset, &maxcount); + if (nfserr) { + /* + * nfsd_splice_actor may have already messed with the + * page length; reset it so as not to confuse + * xdr_truncate_encode: + */ + buf->page_len = 0; + return nfserr; + } + + eof = (read->rd_offset + maxcount >= + d_inode(read->rd_fhp->fh_dentry)->i_size); + + *(p++) = htonl(eof); + *(p++) = htonl(maxcount); + + buf->page_len = maxcount; + buf->len += maxcount; + xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1) + / PAGE_SIZE; + + /* Use rest of head for padding and remaining ops: */ + buf->tail[0].iov_base = xdr->p; + buf->tail[0].iov_len = 0; + xdr->iov = buf->tail; + if (maxcount&3) { + int pad = 4 - (maxcount&3); + + *(xdr->p++) = 0; + + buf->tail[0].iov_base += maxcount&3; + buf->tail[0].iov_len = pad; + buf->len += pad; + } + + space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, + buf->buflen - buf->len); + buf->buflen = buf->len + space_left; + xdr->end = (__be32 *)((void *)xdr->end + space_left); + + return 0; +} + +static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) +{ + struct xdr_stream *xdr = &resp->xdr; + u32 eof; + int v; + int starting_len = xdr->buf->len - 8; + long len; + int thislen; + __be32 nfserr; + __be32 tmp; + __be32 *p; + u32 zzz = 0; + int pad; + + len = maxcount; + v = 0; + + thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p)); + p = xdr_reserve_space(xdr, (thislen+3)&~3); + WARN_ON_ONCE(!p); + resp->rqstp->rq_vec[v].iov_base = p; + resp->rqstp->rq_vec[v].iov_len = thislen; + v++; + len -= thislen; + + while (len) { + thislen = min_t(long, len, PAGE_SIZE); + p = xdr_reserve_space(xdr, (thislen+3)&~3); + WARN_ON_ONCE(!p); + resp->rqstp->rq_vec[v].iov_base = p; + resp->rqstp->rq_vec[v].iov_len = thislen; + v++; + len -= thislen; + } + read->rd_vlen = v; + + nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, + read->rd_vlen, &maxcount); + if (nfserr) + return nfserr; + xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); + + eof = (read->rd_offset + maxcount >= + d_inode(read->rd_fhp->fh_dentry)->i_size); + + tmp = htonl(eof); + write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); + tmp = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + + pad = (maxcount&3) ? 4 - (maxcount&3) : 0; + write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount, + &zzz, pad); + return 0; + +} + +static __be32 +nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) +{ + unsigned long maxcount; + struct xdr_stream *xdr = &resp->xdr; + struct file *file = read->rd_filp; + struct svc_fh *fhp = read->rd_fhp; + int starting_len = xdr->buf->len; + struct raparms *ra; + __be32 *p; + __be32 err; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ + if (!p) { + WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); + return nfserr_resource; + } + if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { + WARN_ON_ONCE(1); + return nfserr_resource; + } + xdr_commit_encode(xdr); + + maxcount = svc_max_payload(resp->rqstp); + maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); + maxcount = min_t(unsigned long, maxcount, read->rd_length); + + if (read->rd_filp) + err = nfsd_permission(resp->rqstp, fhp->fh_export, + fhp->fh_dentry, + NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); + else + err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, + &file, &ra); + if (err) + goto err_truncate; + + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) + err = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + err = nfsd4_encode_readv(resp, read, file, maxcount); + + if (!read->rd_filp) + nfsd_put_tmp_read_open(file, ra); + +err_truncate: + if (err) + xdr_truncate_encode(xdr, starting_len); + return err; +} + +static __be32 +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +{ + int maxcount; + __be32 wire_count; + int zero = 0; + struct xdr_stream *xdr = &resp->xdr; + int length_offset = xdr->buf->len; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + maxcount = PAGE_SIZE; + + p = xdr_reserve_space(xdr, maxcount); + if (!p) + return nfserr_resource; + /* + * XXX: By default, the ->readlink() VFS op will truncate symlinks + * if they would overflow the buffer. Is this kosher in NFSv4? If + * not, one easy fix is: if ->readlink() precisely fills the buffer, + * assume that truncation occurred, and return NFS4ERR_RESOURCE. + */ + nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, + (char *)p, &maxcount); + if (nfserr == nfserr_isdir) + nfserr = nfserr_inval; + if (nfserr) { + xdr_truncate_encode(xdr, length_offset); + return nfserr; + } + + wire_count = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); + if (maxcount & 3) + write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, + &zero, 4 - (maxcount&3)); + return 0; +} + +static __be32 +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +{ + int maxcount; + int bytes_left; + loff_t offset; + __be64 wire_offset; + struct xdr_stream *xdr = &resp->xdr; + int starting_len = xdr->buf->len; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + + /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p) + - (char *)resp->xdr.buf->head[0].iov_base; + + /* + * Number of bytes left for directory entries allowing for the + * final 8 bytes of the readdir and a following failed op: + */ + bytes_left = xdr->buf->buflen - xdr->buf->len + - COMPOUND_ERR_SLACK_SPACE - 8; + if (bytes_left < 0) { + nfserr = nfserr_resource; + goto err_no_verf; + } + maxcount = min_t(u32, readdir->rd_maxcount, INT_MAX); + /* + * Note the rfc defines rd_maxcount as the size of the + * READDIR4resok structure, which includes the verifier above + * and the 8 bytes encoded at the end of this function: + */ + if (maxcount < 16) { + nfserr = nfserr_toosmall; + goto err_no_verf; + } + maxcount = min_t(int, maxcount-16, bytes_left); + + /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ + if (!readdir->rd_dircount) + readdir->rd_dircount = INT_MAX; + + readdir->xdr = xdr; + readdir->rd_maxcount = maxcount; + readdir->common.err = 0; + readdir->cookie_offset = 0; + + offset = readdir->rd_cookie; + nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, + &offset, + &readdir->common, nfsd4_encode_dirent); + if (nfserr == nfs_ok && + readdir->common.err == nfserr_toosmall && + xdr->buf->len == starting_len + 8) { + /* nothing encoded; which limit did we hit?: */ + if (maxcount - 16 < bytes_left) + /* It was the fault of rd_maxcount: */ + nfserr = nfserr_toosmall; + else + /* We ran out of buffer space: */ + nfserr = nfserr_resource; + } + if (nfserr) + goto err_no_verf; + + if (readdir->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, + &wire_offset, 8); + } + + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + goto err_no_verf; + } + *p++ = 0; /* no more entries */ + *p++ = htonl(readdir->common.err == nfserr_eof); + + return 0; +err_no_verf: + xdr_truncate_encode(xdr, starting_len); + return nfserr; +} + +static __be32 +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &remove->rm_cinfo); + } + return nfserr; +} + +static __be32 +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &rename->rn_sinfo); + p = encode_cinfo(p, &rename->rn_tinfo); + } + return nfserr; +} + +static __be32 +nfsd4_do_encode_secinfo(struct xdr_stream *xdr, + __be32 nfserr, struct svc_export *exp) +{ + u32 i, nflavs, supported; + struct exp_flavor_info *flavs; + struct exp_flavor_info def_flavs[2]; + __be32 *p, *flavorsp; + static bool report = true; + + if (nfserr) + goto out; + nfserr = nfserr_resource; + if (exp->ex_nflavors) { + flavs = exp->ex_flavors; + nflavs = exp->ex_nflavors; + } else { /* Handling of some defaults in absence of real secinfo: */ + flavs = def_flavs; + if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) { + nflavs = 2; + flavs[0].pseudoflavor = RPC_AUTH_UNIX; + flavs[1].pseudoflavor = RPC_AUTH_NULL; + } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) { + nflavs = 1; + flavs[0].pseudoflavor + = svcauth_gss_flavor(exp->ex_client); + } else { + nflavs = 1; + flavs[0].pseudoflavor + = exp->ex_client->flavour->flavour; + } + } + + supported = 0; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + flavorsp = p++; /* to be backfilled later */ + + for (i = 0; i < nflavs; i++) { + rpc_authflavor_t pf = flavs[i].pseudoflavor; + struct rpcsec_gss_info info; + + if (rpcauth_get_gssinfo(pf, &info) == 0) { + supported++; + p = xdr_reserve_space(xdr, 4 + 4 + + XDR_LEN(info.oid.len) + 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(RPC_AUTH_GSS); + p = xdr_encode_opaque(p, info.oid.data, info.oid.len); + *p++ = cpu_to_be32(info.qop); + *p++ = cpu_to_be32(info.service); + } else if (pf < RPC_AUTH_MAXFLAVOR) { + supported++; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = cpu_to_be32(pf); + } else { + if (report) + pr_warn("NFS: SECINFO: security flavor %u " + "is not supported\n", pf); + } + } + + if (nflavs != supported) + report = false; + *flavorsp = htonl(supported); + nfserr = 0; +out: + if (exp) + exp_put(exp); + return nfserr; +} + +static __be32 +nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo *secinfo) +{ + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->si_exp); +} + +static __be32 +nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo_no_name *secinfo) +{ + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->sin_exp); +} + +/* + * The SETATTR encode routine is special -- it always encodes a bitmap, + * regardless of the error status. + */ +static __be32 +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + if (nfserr) { + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + } + else { + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(setattr->sa_bmval[0]); + *p++ = cpu_to_be32(setattr->sa_bmval[1]); + *p++ = cpu_to_be32(setattr->sa_bmval[2]); + } + return nfserr; +} + +static __be32 +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8); + p = xdr_encode_opaque_fixed(p, &scd->se_confirm, + NFS4_VERIFIER_SIZE); + } + else if (nfserr == nfserr_clid_inuse) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + } + return nfserr; +} + +static __be32 +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_how_written); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); + } + return nfserr; +} + +static const u32 nfs4_minimal_spo_must_enforce[2] = { + [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32) +}; + +static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_exchange_id *exid) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + char *major_id; + char *server_scope; + int major_id_sz; + int server_scope_sz; + uint64_t minor_id = 0; + + if (nfserr) + return nfserr; + + major_id = utsname()->nodename; + major_id_sz = strlen(major_id); + server_scope = utsname()->nodename; + server_scope_sz = strlen(server_scope); + + p = xdr_reserve_space(xdr, + 8 /* eir_clientid */ + + 4 /* eir_sequenceid */ + + 4 /* eir_flags */ + + 4 /* spr_how */); + if (!p) + return nfserr_resource; + + p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); + *p++ = cpu_to_be32(exid->seqid); + *p++ = cpu_to_be32(exid->flags); + + *p++ = cpu_to_be32(exid->spa_how); + + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce, spo_must_allow */ + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + + /* spo_must_enforce bitmap: */ + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]); + *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]); + /* empty spo_must_allow bitmap: */ + *p++ = cpu_to_be32(0); + + break; + default: + WARN_ON_ONCE(1); + } + + p = xdr_reserve_space(xdr, + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + if (!p) + return nfserr_resource; + + /* The server_owner struct */ + p = xdr_encode_hyper(p, minor_id); /* Minor id */ + /* major id */ + p = xdr_encode_opaque(p, major_id, major_id_sz); + + /* Server scope */ + p = xdr_encode_opaque(p, server_scope, server_scope_sz); + + /* Implementation id */ + *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ + return 0; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_create_session *sess) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 24); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, sess->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(sess->seqid); + *p++ = cpu_to_be32(sess->flags); + + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->fore_channel.maxops); + *p++ = cpu_to_be32(sess->fore_channel.maxreqs); + *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs); + + if (sess->fore_channel.nr_rdma_attrs) { + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs); + } + + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->back_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->back_channel.maxops); + *p++ = cpu_to_be32(sess->back_channel.maxreqs); + *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs); + + if (sess->back_channel.nr_rdma_attrs) { + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->back_channel.rdma_attrs); + } + return 0; +} + +static __be32 +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_sequence *seq) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, seq->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(seq->seqid); + *p++ = cpu_to_be32(seq->slotid); + /* Note slotid's are numbered from zero: */ + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */ + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */ + *p++ = cpu_to_be32(seq->status_flags); + + resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */ + return 0; +} + +static __be32 +nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_test_stateid *test_stateid) +{ + struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_test_stateid_id *stateid, *next; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); + if (!p) + return nfserr_resource; + *p++ = htonl(test_stateid->ts_num_ids); + + list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { + *p++ = stateid->ts_id_status; + } + + return nfserr; +} + +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_getdeviceinfo *gdev) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[gdev->gd_layout_type]; + u32 starting_len = xdr->buf->len, needed_len; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + + *p++ = cpu_to_be32(gdev->gd_layout_type); + + /* If maxcount is 0 then just update notifications */ + if (gdev->gd_maxcount != 0) { + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + goto out; + } + } + + nfserr = nfserr_resource; + if (gdev->gd_notify_types) { + p = xdr_reserve_space(xdr, 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(gdev->gd_notify_types); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = 0; + } + + nfserr = 0; +out: + kfree(gdev->gd_device); + dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr)); + return nfserr; + +toosmall: + dprintk("%s: maxcount too small\n", __func__); + needed_len = xdr->buf->len + 4 /* notifications */; + xdr_truncate_encode(xdr, starting_len); + p = xdr_reserve_space(xdr, 4); + if (!p) { + nfserr = nfserr_resource; + } else { + *p++ = cpu_to_be32(needed_len); + nfserr = nfserr_toosmall; + } + goto out; +} + +static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutget *lgp) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[lgp->lg_layout_type]; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); + if (!p) + goto out; + + *p++ = cpu_to_be32(1); /* we always set return-on-close */ + *p++ = cpu_to_be32(lgp->lg_sid.si_generation); + p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* we always return a single layout */ + p = xdr_encode_hyper(p, lgp->lg_seg.offset); + p = xdr_encode_hyper(p, lgp->lg_seg.length); + *p++ = cpu_to_be32(lgp->lg_seg.iomode); + *p++ = cpu_to_be32(lgp->lg_layout_type); + + nfserr = ops->encode_layoutget(xdr, lgp); +out: + kfree(lgp->lg_content); + return nfserr; +} + +static __be32 +nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutcommit *lcp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lcp->lc_size_chg); + if (lcp->lc_size_chg) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, lcp->lc_newsize); + } + + return nfs_ok; +} + +static __be32 +nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutreturn *lrp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lrp->lrs_present); + if (lrp->lrs_present) + return nfsd4_encode_stateid(xdr, &lrp->lr_sid); + return nfs_ok; +} +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_seek *seek) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(&resp->xdr, 4 + 8); + *p++ = cpu_to_be32(seek->seek_eof); + p = xdr_encode_hyper(p, seek->seek_pos); + + return nfserr; +} + +static __be32 +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +{ + return nfserr; +} + +typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); + +/* + * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 + * since we don't need to filter out obsolete ops as this is + * done in the decoding phase. + */ +static nfsd4_enc nfsd4_enc_ops[] = { + [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, + [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, + [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, + [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, + [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, + [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, + [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, + [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, + [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, + [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, + [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, + [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ] = (nfsd4_enc)nfsd4_encode_read, + [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, + [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, + [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, + [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, + [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, + [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, + [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, + [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.1 operations */ + [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, +#else + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, +#endif + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, + [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.2 operations */ + [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, + [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, +}; + +/* + * Calculate whether we still have space to encode repsize bytes. + * There are two considerations: + * - For NFS versions >=4.1, the size of the reply must stay within + * session limits + * - For all NFS versions, we must stay within limited preallocated + * buffer space. + * + * This is called before the operation is processed, so can only provide + * an upper estimate. For some nonidempotent operations (such as + * getattr), it's not necessarily a problem if that estimate is wrong, + * as we can fail it after processing without significant side effects. + */ +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize) +{ + struct xdr_buf *buf = &resp->rqstp->rq_res; + struct nfsd4_slot *slot = resp->cstate.slot; + + if (buf->len + respsize <= buf->buflen) + return nfs_ok; + if (!nfsd4_has_session(&resp->cstate)) + return nfserr_resource; + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) { + WARN_ON_ONCE(1); + return nfserr_rep_too_big_to_cache; + } + return nfserr_rep_too_big; +} + +void +nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +{ + struct xdr_stream *xdr = &resp->xdr; + struct nfs4_stateowner *so = resp->cstate.replay_owner; + struct svc_rqst *rqstp = resp->rqstp; + int post_err_offset; + nfsd4_enc encoder; + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); + post_err_offset = xdr->buf->len; + + if (op->opnum == OP_ILLEGAL) + goto status; + BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + !nfsd4_enc_ops[op->opnum]); + encoder = nfsd4_enc_ops[op->opnum]; + op->status = encoder(resp, op->status, &op->u); + xdr_commit_encode(xdr); + + /* nfsd4_check_resp_size guarantees enough room for error status */ + if (!op->status) { + int space_needed = 0; + if (!nfsd4_last_compound_op(rqstp)) + space_needed = COMPOUND_ERR_SLACK_SPACE; + op->status = nfsd4_check_resp_size(resp, space_needed); + } + if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { + struct nfsd4_slot *slot = resp->cstate.slot; + + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) + op->status = nfserr_rep_too_big_to_cache; + else + op->status = nfserr_rep_too_big; + } + if (op->status == nfserr_resource || + op->status == nfserr_rep_too_big || + op->status == nfserr_rep_too_big_to_cache) { + /* + * The operation may have already been encoded or + * partially encoded. No op returns anything additional + * in the case of one of these three errors, so we can + * just truncate back to after the status. But it's a + * bug if we had to do this on a non-idempotent op: + */ + warn_on_nonidempotent_op(op); + xdr_truncate_encode(xdr, post_err_offset); + } + if (so) { + int len = xdr->buf->len - post_err_offset; + + so->so_replay.rp_status = op->status; + so->so_replay.rp_buflen = len; + read_bytes_from_xdr_buf(xdr->buf, post_err_offset, + so->so_replay.rp_buf, len); + } +status: + /* Note that op->status is already in network byte order: */ + write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4); +} + +/* + * Encode the reply stored in the stateowner reply cache + * + * XDR note: do not encode rp->rp_buflen: the buffer contains the + * previously sent already encoded operation. + */ +void +nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) +{ + __be32 *p; + struct nfs4_replay *rp = op->replay; + + BUG_ON(!rp); + + p = xdr_reserve_space(xdr, 8 + rp->rp_buflen); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); + *p++ = rp->rp_status; /* already xdr'ed */ + + p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); +} + +int +nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp) +{ + struct svc_rqst *rqstp = rq; + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + if (args->ops != args->iops) { + kfree(args->ops); + args->ops = args->iops; + } + kfree(args->tmpp); + args->tmpp = NULL; + while (args->to_free) { + struct svcxdr_tmpbuf *tb = args->to_free; + args->to_free = tb->next; + kfree(tb); + } + return 1; +} + +int +nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) +{ + if (rqstp->rq_arg.head[0].iov_len % 4) { + /* client is nuts */ + dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)", + __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); + return 0; + } + args->p = p; + args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; + args->pagelist = rqstp->rq_arg.pages; + args->pagelen = rqstp->rq_arg.page_len; + args->tmpp = NULL; + args->to_free = NULL; + args->ops = args->iops; + args->rqstp = rqstp; + + return !nfsd4_decode_compound(args); +} + +int +nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundres *resp) +{ + /* + * All that remains is to write the tag and operation count... + */ + struct xdr_buf *buf = resp->xdr.buf; + + WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + + buf->tail[0].iov_len); + + rqstp->rq_next_page = resp->xdr.page_ptr + 1; + + p = resp->tagp; + *p++ = htonl(resp->taglen); + memcpy(p, resp->tag, resp->taglen); + p += XDR_QUADLEN(resp->taglen); + *p++ = htonl(resp->opcnt); + + nfsd4_sequence_done(resp); + return 1; +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/kernel/fs/nfsd/nfscache.c b/kernel/fs/nfsd/nfscache.c new file mode 100644 index 000000000..46ec934f5 --- /dev/null +++ b/kernel/fs/nfsd/nfscache.c @@ -0,0 +1,637 @@ +/* + * Request reply cache. This is currently a global cache, but this may + * change in the future and be a per-client cache. + * + * This code is heavily inspired by the 44BSD implementation, although + * it does things a bit differently. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include + +#include "nfsd.h" +#include "cache.h" + +#define NFSDDBG_FACILITY NFSDDBG_REPCACHE + +/* + * We use this value to determine the number of hash buckets from the max + * cache size, the idea being that when the cache is at its maximum number + * of entries, then this should be the average number of entries per bucket. + */ +#define TARGET_BUCKET_SIZE 64 + +struct nfsd_drc_bucket { + struct list_head lru_head; + spinlock_t cache_lock; +}; + +static struct nfsd_drc_bucket *drc_hashtbl; +static struct kmem_cache *drc_slab; + +/* max number of entries allowed in the cache */ +static unsigned int max_drc_entries; + +/* number of significant bits in the hash value */ +static unsigned int maskbits; +static unsigned int drc_hashsize; + +/* + * Stats and other tracking of on the duplicate reply cache. All of these and + * the "rc" fields in nfsdstats are protected by the cache_lock + */ + +/* total number of entries */ +static atomic_t num_drc_entries; + +/* cache misses due only to checksum comparison failures */ +static unsigned int payload_misses; + +/* amount of memory (in bytes) currently consumed by the DRC */ +static unsigned int drc_mem_usage; + +/* longest hash chain seen */ +static unsigned int longest_chain; + +/* size of cache when we saw the longest hash chain */ +static unsigned int longest_chain_cachesize; + +static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); +static void cache_cleaner_func(struct work_struct *unused); +static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); + +static struct shrinker nfsd_reply_cache_shrinker = { + .scan_objects = nfsd_reply_cache_scan, + .count_objects = nfsd_reply_cache_count, + .seeks = 1, +}; + +/* + * locking for the reply cache: + * A cache entry is "single use" if c_state == RC_INPROG + * Otherwise, it when accessing _prev or _next, the lock must be held. + */ +static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); + +/* + * Put a cap on the size of the DRC based on the amount of available + * low memory in the machine. + * + * 64MB: 8192 + * 128MB: 11585 + * 256MB: 16384 + * 512MB: 23170 + * 1GB: 32768 + * 2GB: 46340 + * 4GB: 65536 + * 8GB: 92681 + * 16GB: 131072 + * + * ...with a hard cap of 256k entries. In the worst case, each entry will be + * ~1k, so the above numbers should give a rough max of the amount of memory + * used in k. + */ +static unsigned int +nfsd_cache_size_limit(void) +{ + unsigned int limit; + unsigned long low_pages = totalram_pages - totalhigh_pages; + + limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10); + return min_t(unsigned int, limit, 256*1024); +} + +/* + * Compute the number of hash buckets we need. Divide the max cachesize by + * the "target" max bucket size, and round up to next power of two. + */ +static unsigned int +nfsd_hashsize(unsigned int limit) +{ + return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); +} + +static u32 +nfsd_cache_hash(__be32 xid) +{ + return hash_32(be32_to_cpu(xid), maskbits); +} + +static struct svc_cacherep * +nfsd_reply_cache_alloc(void) +{ + struct svc_cacherep *rp; + + rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); + if (rp) { + rp->c_state = RC_UNUSED; + rp->c_type = RC_NOCACHE; + INIT_LIST_HEAD(&rp->c_lru); + } + return rp; +} + +static void +nfsd_reply_cache_free_locked(struct svc_cacherep *rp) +{ + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { + drc_mem_usage -= rp->c_replvec.iov_len; + kfree(rp->c_replvec.iov_base); + } + list_del(&rp->c_lru); + atomic_dec(&num_drc_entries); + drc_mem_usage -= sizeof(*rp); + kmem_cache_free(drc_slab, rp); +} + +static void +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +{ + spin_lock(&b->cache_lock); + nfsd_reply_cache_free_locked(rp); + spin_unlock(&b->cache_lock); +} + +int nfsd_reply_cache_init(void) +{ + unsigned int hashsize; + unsigned int i; + int status = 0; + + max_drc_entries = nfsd_cache_size_limit(); + atomic_set(&num_drc_entries, 0); + hashsize = nfsd_hashsize(max_drc_entries); + maskbits = ilog2(hashsize); + + status = register_shrinker(&nfsd_reply_cache_shrinker); + if (status) + return status; + + drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), + 0, 0, NULL); + if (!drc_slab) + goto out_nomem; + + drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL); + if (!drc_hashtbl) + goto out_nomem; + for (i = 0; i < hashsize; i++) { + INIT_LIST_HEAD(&drc_hashtbl[i].lru_head); + spin_lock_init(&drc_hashtbl[i].cache_lock); + } + drc_hashsize = hashsize; + + return 0; +out_nomem: + printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); + nfsd_reply_cache_shutdown(); + return -ENOMEM; +} + +void nfsd_reply_cache_shutdown(void) +{ + struct svc_cacherep *rp; + unsigned int i; + + unregister_shrinker(&nfsd_reply_cache_shrinker); + cancel_delayed_work_sync(&cache_cleaner); + + for (i = 0; i < drc_hashsize; i++) { + struct list_head *head = &drc_hashtbl[i].lru_head; + while (!list_empty(head)) { + rp = list_first_entry(head, struct svc_cacherep, c_lru); + nfsd_reply_cache_free_locked(rp); + } + } + + kfree (drc_hashtbl); + drc_hashtbl = NULL; + drc_hashsize = 0; + + if (drc_slab) { + kmem_cache_destroy(drc_slab); + drc_slab = NULL; + } +} + +/* + * Move cache entry to end of LRU list, and queue the cleaner to run if it's + * not already scheduled. + */ +static void +lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +{ + rp->c_timestamp = jiffies; + list_move_tail(&rp->c_lru, &b->lru_head); + schedule_delayed_work(&cache_cleaner, RC_EXPIRE); +} + +static long +prune_bucket(struct nfsd_drc_bucket *b) +{ + struct svc_cacherep *rp, *tmp; + long freed = 0; + + list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { + /* + * Don't free entries attached to calls that are still + * in-progress, but do keep scanning the list. + */ + if (rp->c_state == RC_INPROG) + continue; + if (atomic_read(&num_drc_entries) <= max_drc_entries && + time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) + break; + nfsd_reply_cache_free_locked(rp); + freed++; + } + return freed; +} + +/* + * Walk the LRU list and prune off entries that are older than RC_EXPIRE. + * Also prune the oldest ones when the total exceeds the max number of entries. + */ +static long +prune_cache_entries(void) +{ + unsigned int i; + long freed = 0; + bool cancel = true; + + for (i = 0; i < drc_hashsize; i++) { + struct nfsd_drc_bucket *b = &drc_hashtbl[i]; + + if (list_empty(&b->lru_head)) + continue; + spin_lock(&b->cache_lock); + freed += prune_bucket(b); + if (!list_empty(&b->lru_head)) + cancel = false; + spin_unlock(&b->cache_lock); + } + + /* + * Conditionally rearm the job to run in RC_EXPIRE since we just + * ran the pruner. + */ + if (!cancel) + mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); + return freed; +} + +static void +cache_cleaner_func(struct work_struct *unused) +{ + prune_cache_entries(); +} + +static unsigned long +nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + return atomic_read(&num_drc_entries); +} + +static unsigned long +nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + return prune_cache_entries(); +} +/* + * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes + */ +static __wsum +nfsd_cache_csum(struct svc_rqst *rqstp) +{ + int idx; + unsigned int base; + __wsum csum; + struct xdr_buf *buf = &rqstp->rq_arg; + const unsigned char *p = buf->head[0].iov_base; + size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, + RC_CSUMLEN); + size_t len = min(buf->head[0].iov_len, csum_len); + + /* rq_arg.head first */ + csum = csum_partial(p, len, 0); + csum_len -= len; + + /* Continue into page array */ + idx = buf->page_base / PAGE_SIZE; + base = buf->page_base & ~PAGE_MASK; + while (csum_len) { + p = page_address(buf->pages[idx]) + base; + len = min_t(size_t, PAGE_SIZE - base, csum_len); + csum = csum_partial(p, len, csum); + csum_len -= len; + base = 0; + ++idx; + } + return csum; +} + +static bool +nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) +{ + /* Check RPC XID first */ + if (rqstp->rq_xid != rp->c_xid) + return false; + /* compare checksum of NFS data */ + if (csum != rp->c_csum) { + ++payload_misses; + return false; + } + + /* Other discriminators */ + if (rqstp->rq_proc != rp->c_proc || + rqstp->rq_prot != rp->c_prot || + rqstp->rq_vers != rp->c_vers || + rqstp->rq_arg.len != rp->c_len || + !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || + rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) + return false; + + return true; +} + +/* + * Search the request hash for an entry that matches the given rqstp. + * Must be called with cache_lock held. Returns the found entry or + * NULL on failure. + */ +static struct svc_cacherep * +nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, + __wsum csum) +{ + struct svc_cacherep *rp, *ret = NULL; + struct list_head *rh = &b->lru_head; + unsigned int entries = 0; + + list_for_each_entry(rp, rh, c_lru) { + ++entries; + if (nfsd_cache_match(rqstp, csum, rp)) { + ret = rp; + break; + } + } + + /* tally hash chain length stats */ + if (entries > longest_chain) { + longest_chain = entries; + longest_chain_cachesize = atomic_read(&num_drc_entries); + } else if (entries == longest_chain) { + /* prefer to keep the smallest cachesize possible here */ + longest_chain_cachesize = min_t(unsigned int, + longest_chain_cachesize, + atomic_read(&num_drc_entries)); + } + + return ret; +} + +/* + * Try to find an entry matching the current call in the cache. When none + * is found, we try to grab the oldest expired entry off the LRU list. If + * a suitable one isn't there, then drop the cache_lock and allocate a + * new one, then search again in case one got inserted while this thread + * didn't hold the lock. + */ +int +nfsd_cache_lookup(struct svc_rqst *rqstp) +{ + struct svc_cacherep *rp, *found; + __be32 xid = rqstp->rq_xid; + u32 proto = rqstp->rq_prot, + vers = rqstp->rq_vers, + proc = rqstp->rq_proc; + __wsum csum; + u32 hash = nfsd_cache_hash(xid); + struct nfsd_drc_bucket *b = &drc_hashtbl[hash]; + unsigned long age; + int type = rqstp->rq_cachetype; + int rtn = RC_DOIT; + + rqstp->rq_cacherep = NULL; + if (type == RC_NOCACHE) { + nfsdstats.rcnocache++; + return rtn; + } + + csum = nfsd_cache_csum(rqstp); + + /* + * Since the common case is a cache miss followed by an insert, + * preallocate an entry. + */ + rp = nfsd_reply_cache_alloc(); + spin_lock(&b->cache_lock); + if (likely(rp)) { + atomic_inc(&num_drc_entries); + drc_mem_usage += sizeof(*rp); + } + + /* go ahead and prune the cache */ + prune_bucket(b); + + found = nfsd_cache_search(b, rqstp, csum); + if (found) { + if (likely(rp)) + nfsd_reply_cache_free_locked(rp); + rp = found; + goto found_entry; + } + + if (!rp) { + dprintk("nfsd: unable to allocate DRC entry!\n"); + goto out; + } + + nfsdstats.rcmisses++; + rqstp->rq_cacherep = rp; + rp->c_state = RC_INPROG; + rp->c_xid = xid; + rp->c_proc = proc; + rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); + rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp))); + rp->c_prot = proto; + rp->c_vers = vers; + rp->c_len = rqstp->rq_arg.len; + rp->c_csum = csum; + + lru_put_end(b, rp); + + /* release any buffer */ + if (rp->c_type == RC_REPLBUFF) { + drc_mem_usage -= rp->c_replvec.iov_len; + kfree(rp->c_replvec.iov_base); + rp->c_replvec.iov_base = NULL; + } + rp->c_type = RC_NOCACHE; + out: + spin_unlock(&b->cache_lock); + return rtn; + +found_entry: + nfsdstats.rchits++; + /* We found a matching entry which is either in progress or done. */ + age = jiffies - rp->c_timestamp; + lru_put_end(b, rp); + + rtn = RC_DROPIT; + /* Request being processed or excessive rexmits */ + if (rp->c_state == RC_INPROG || age < RC_DELAY) + goto out; + + /* From the hall of fame of impractical attacks: + * Is this a user who tries to snoop on the cache? */ + rtn = RC_DOIT; + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure) + goto out; + + /* Compose RPC reply header */ + switch (rp->c_type) { + case RC_NOCACHE: + break; + case RC_REPLSTAT: + svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); + rtn = RC_REPLY; + break; + case RC_REPLBUFF: + if (!nfsd_cache_append(rqstp, &rp->c_replvec)) + goto out; /* should not happen */ + rtn = RC_REPLY; + break; + default: + printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); + nfsd_reply_cache_free_locked(rp); + } + + goto out; +} + +/* + * Update a cache entry. This is called from nfsd_dispatch when + * the procedure has been executed and the complete reply is in + * rqstp->rq_res. + * + * We're copying around data here rather than swapping buffers because + * the toplevel loop requires max-sized buffers, which would be a waste + * of memory for a cache with a max reply size of 100 bytes (diropokres). + * + * If we should start to use different types of cache entries tailored + * specifically for attrstat and fh's, we may save even more space. + * + * Also note that a cachetype of RC_NOCACHE can legally be passed when + * nfsd failed to encode a reply that otherwise would have been cached. + * In this case, nfsd_cache_update is called with statp == NULL. + */ +void +nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +{ + struct svc_cacherep *rp = rqstp->rq_cacherep; + struct kvec *resv = &rqstp->rq_res.head[0], *cachv; + u32 hash; + struct nfsd_drc_bucket *b; + int len; + size_t bufsize = 0; + + if (!rp) + return; + + hash = nfsd_cache_hash(rp->c_xid); + b = &drc_hashtbl[hash]; + + len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); + len >>= 2; + + /* Don't cache excessive amounts of data and XDR failures */ + if (!statp || len > (256 >> 2)) { + nfsd_reply_cache_free(b, rp); + return; + } + + switch (cachetype) { + case RC_REPLSTAT: + if (len != 1) + printk("nfsd: RC_REPLSTAT/reply len %d!\n",len); + rp->c_replstat = *statp; + break; + case RC_REPLBUFF: + cachv = &rp->c_replvec; + bufsize = len << 2; + cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); + if (!cachv->iov_base) { + nfsd_reply_cache_free(b, rp); + return; + } + cachv->iov_len = bufsize; + memcpy(cachv->iov_base, statp, bufsize); + break; + case RC_NOCACHE: + nfsd_reply_cache_free(b, rp); + return; + } + spin_lock(&b->cache_lock); + drc_mem_usage += bufsize; + lru_put_end(b, rp); + rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); + rp->c_type = cachetype; + rp->c_state = RC_DONE; + spin_unlock(&b->cache_lock); + return; +} + +/* + * Copy cached reply to current reply buffer. Should always fit. + * FIXME as reply is in a page, we should just attach the page, and + * keep a refcount.... + */ +static int +nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) +{ + struct kvec *vec = &rqstp->rq_res.head[0]; + + if (vec->iov_len + data->iov_len > PAGE_SIZE) { + printk(KERN_WARNING "nfsd: cached reply too large (%Zd).\n", + data->iov_len); + return 0; + } + memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); + vec->iov_len += data->iov_len; + return 1; +} + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) +{ + seq_printf(m, "max entries: %u\n", max_drc_entries); + seq_printf(m, "num entries: %u\n", + atomic_read(&num_drc_entries)); + seq_printf(m, "hash buckets: %u\n", 1 << maskbits); + seq_printf(m, "mem usage: %u\n", drc_mem_usage); + seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); + seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses); + seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache); + seq_printf(m, "payload misses: %u\n", payload_misses); + seq_printf(m, "longest chain len: %u\n", longest_chain); + seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize); + return 0; +} + +int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_reply_cache_stats_show, NULL); +} diff --git a/kernel/fs/nfsd/nfsctl.c b/kernel/fs/nfsd/nfsctl.c new file mode 100644 index 000000000..9690cb4dd --- /dev/null +++ b/kernel/fs/nfsd/nfsctl.c @@ -0,0 +1,1318 @@ +/* + * Syscall interface to knfsd. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "idmap.h" +#include "nfsd.h" +#include "cache.h" +#include "state.h" +#include "netns.h" +#include "pnfs.h" + +/* + * We have a single directory with several nodes in it. + */ +enum { + NFSD_Root = 1, + NFSD_List, + NFSD_Export_features, + NFSD_Fh, + NFSD_FO_UnlockIP, + NFSD_FO_UnlockFS, + NFSD_Threads, + NFSD_Pool_Threads, + NFSD_Pool_Stats, + NFSD_Reply_Cache_Stats, + NFSD_Versions, + NFSD_Ports, + NFSD_MaxBlkSize, + NFSD_MaxConnections, + NFSD_SupportedEnctypes, + /* + * The below MUST come last. Otherwise we leave a hole in nfsd_files[] + * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops + */ +#ifdef CONFIG_NFSD_V4 + NFSD_Leasetime, + NFSD_Gracetime, + NFSD_RecoveryDir, + NFSD_V4EndGrace, +#endif +}; + +/* + * write() for these nodes. + */ +static ssize_t write_filehandle(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); +static ssize_t write_threads(struct file *file, char *buf, size_t size); +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); +static ssize_t write_versions(struct file *file, char *buf, size_t size); +static ssize_t write_ports(struct file *file, char *buf, size_t size); +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); +static ssize_t write_maxconn(struct file *file, char *buf, size_t size); +#ifdef CONFIG_NFSD_V4 +static ssize_t write_leasetime(struct file *file, char *buf, size_t size); +static ssize_t write_gracetime(struct file *file, char *buf, size_t size); +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); +#endif + +static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Fh] = write_filehandle, + [NFSD_FO_UnlockIP] = write_unlock_ip, + [NFSD_FO_UnlockFS] = write_unlock_fs, + [NFSD_Threads] = write_threads, + [NFSD_Pool_Threads] = write_pool_threads, + [NFSD_Versions] = write_versions, + [NFSD_Ports] = write_ports, + [NFSD_MaxBlkSize] = write_maxblksize, + [NFSD_MaxConnections] = write_maxconn, +#ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = write_leasetime, + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + [NFSD_V4EndGrace] = write_v4_end_grace, +#endif +}; + +static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +{ + ino_t ino = file_inode(file)->i_ino; + char *data; + ssize_t rv; + + if (ino >= ARRAY_SIZE(write_op) || !write_op[ino]) + return -EINVAL; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + + rv = write_op[ino](file, data, size); + if (rv >= 0) { + simple_transaction_set(file, rv); + rv = size; + } + return rv; +} + +static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) +{ + if (! file->private_data) { + /* An attempt to read a transaction file without writing + * causes a 0-byte write so that the file can return + * state information + */ + ssize_t rv = nfsctl_transaction_write(file, buf, 0, pos); + if (rv < 0) + return rv; + } + return simple_transaction_read(file, buf, size, pos); +} + +static const struct file_operations transaction_ops = { + .write = nfsctl_transaction_write, + .read = nfsctl_transaction_read, + .release = simple_transaction_release, + .llseek = default_llseek, +}; + +static int exports_net_open(struct net *net, struct file *file) +{ + int err; + struct seq_file *seq; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + err = seq_open(file, &nfs_exports_op); + if (err) + return err; + + seq = file->private_data; + seq->private = nn->svc_export_cache; + return 0; +} + +static int exports_proc_open(struct inode *inode, struct file *file) +{ + return exports_net_open(current->nsproxy->net_ns, file); +} + +static const struct file_operations exports_proc_operations = { + .open = exports_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int exports_nfsd_open(struct inode *inode, struct file *file) +{ + return exports_net_open(inode->i_sb->s_fs_info, file); +} + +static const struct file_operations exports_nfsd_operations = { + .open = exports_nfsd_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int export_features_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); + return 0; +} + +static int export_features_open(struct inode *inode, struct file *file) +{ + return single_open(file, export_features_show, NULL); +} + +static const struct file_operations export_features_operations = { + .open = export_features_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) +static int supported_enctypes_show(struct seq_file *m, void *v) +{ + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); + return 0; +} + +static int supported_enctypes_open(struct inode *inode, struct file *file) +{ + return single_open(file, supported_enctypes_show, NULL); +} + +static const struct file_operations supported_enctypes_ops = { + .open = supported_enctypes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ + +static const struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = nfsd_pool_stats_release, + .owner = THIS_MODULE, +}; + +static struct file_operations reply_cache_stats_operations = { + .open = nfsd_reply_cache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/*----------------------------------------------------------------------------*/ +/* + * payload - write methods + */ + +static inline struct net *netns(struct file *file) +{ + return file_inode(file)->i_sb->s_fs_info; +} + +/** + * write_unlock_ip - Release all locks used by a client + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing a + * presentation format IP address + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) +{ + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + size_t salen = sizeof(address); + char *fo_path; + struct net *net = netns(file); + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + if (rpc_pton(net, fo_path, size, sap, salen) == 0) + return -EINVAL; + + return nlmsvc_unlock_all_by_ip(sap); +} + +/** + * write_unlock_fs - Release all locks on a local file system + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing the + * absolute pathname of a local file system + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) +{ + struct path path; + char *fo_path; + int error; + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + error = kern_path(fo_path, 0, &path); + if (error) + return error; + + /* + * XXX: Needs better sanity checking. Otherwise we could end up + * releasing locks on the wrong file system. + * + * For example: + * 1. Does the path refer to a directory? + * 2. Is that directory a mount point, or + * 3. Is that directory the root of an exported file system? + */ + error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); + + path_put(&path); + return error; +} + +/** + * write_filehandle - Get a variable-length NFS file handle by path + * + * On input, the buffer contains a '\n'-terminated C string comprised of + * three alphanumeric words separated by whitespace. The string may + * contain escape sequences. + * + * Input: + * buf: + * domain: client domain name + * path: export pathname + * maxsize: numeric maximum size of + * @buf + * size: length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing a ASCII hex text version + * of the NFS file handle; + * return code is the size in bytes of the string + * On error: return code is negative errno value + */ +static ssize_t write_filehandle(struct file *file, char *buf, size_t size) +{ + char *dname, *path; + int uninitialized_var(maxsize); + char *mesg = buf; + int len; + struct auth_domain *dom; + struct knfsd_fh fh; + + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + dname = mesg; + len = qword_get(&mesg, dname, size); + if (len <= 0) + return -EINVAL; + + path = dname+len+1; + len = qword_get(&mesg, path, size); + if (len <= 0) + return -EINVAL; + + len = get_int(&mesg, &maxsize); + if (len) + return len; + + if (maxsize < NFS_FHSIZE) + return -EINVAL; + maxsize = min(maxsize, NFS3_FHSIZE); + + if (qword_get(&mesg, mesg, size)>0) + return -EINVAL; + + /* we have all the words, they are in buf.. */ + dom = unix_domain_find(dname); + if (!dom) + return -ENOMEM; + + len = exp_rootfh(netns(file), dom, path, &fh, maxsize); + auth_domain_put(dom); + if (len) + return len; + + mesg = buf; + len = SIMPLE_TRANSACTION_LIMIT; + qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); + mesg[-1] = '\n'; + return mesg - buf; +} + +/** + * write_threads - Start NFSD, or report the current number of running threads + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the + * number of NFSD threads to start + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_threads(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + int rv; + struct net *net = netns(file); + + if (size > 0) { + int newthreads; + rv = get_int(&mesg, &newthreads); + if (rv) + return rv; + if (newthreads < 0) + return -EINVAL; + rv = nfsd_svc(newthreads, net); + if (rv < 0) + return rv; + } else + rv = nfsd_nrthreads(net); + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); +} + +/** + * write_pool_threads - Set or report the current number of threads per pool + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated unsigned integer values + * representing the number of NFSD + * threads to start in each pool + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing integer values representing the + * number of NFSD threads in each pool; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) +{ + /* if size > 0, look for an array of number of threads per node + * and apply them then write out number of threads per node as reply + */ + char *mesg = buf; + int i; + int rv; + int len; + int npools; + int *nthreads; + struct net *net = netns(file); + + mutex_lock(&nfsd_mutex); + npools = nfsd_nrpools(net); + if (npools == 0) { + /* + * NFS is shut down. The admin can start it by + * writing to the threads file but NOT the pool_threads + * file, sorry. Report zero threads. + */ + mutex_unlock(&nfsd_mutex); + strcpy(buf, "0\n"); + return strlen(buf); + } + + nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + rv = -ENOMEM; + if (nthreads == NULL) + goto out_free; + + if (size > 0) { + for (i = 0; i < npools; i++) { + rv = get_int(&mesg, &nthreads[i]); + if (rv == -ENOENT) + break; /* fewer numbers than pools */ + if (rv) + goto out_free; /* syntax error */ + rv = -EINVAL; + if (nthreads[i] < 0) + goto out_free; + } + rv = nfsd_set_nrthreads(i, nthreads, net); + if (rv) + goto out_free; + } + + rv = nfsd_get_nrthreads(npools, nthreads, net); + if (rv) + goto out_free; + + mesg = buf; + size = SIMPLE_TRANSACTION_LIMIT; + for (i = 0; i < npools && size > 0; i++) { + snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' ')); + len = strlen(mesg); + size -= len; + mesg += len; + } + rv = mesg - buf; +out_free: + kfree(nthreads); + mutex_unlock(&nfsd_mutex); + return rv; +} + +static ssize_t __write_versions(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + char *vers, *minorp, sign; + int len, num, remaining; + unsigned minor; + ssize_t tlen = 0; + char *sep; + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + + if (size>0) { + if (nn->nfsd_serv) + /* Cannot change versions without updating + * nn->nfsd_serv->sv_xdrsize, and reallocing + * rq_argp and rq_resp + */ + return -EBUSY; + if (buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + vers = mesg; + len = qword_get(&mesg, vers, size); + if (len <= 0) return -EINVAL; + do { + sign = *vers; + if (sign == '+' || sign == '-') + num = simple_strtol((vers+1), &minorp, 0); + else + num = simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num != 4) + return -EINVAL; + minor = simple_strtoul(minorp+1, NULL, 0); + if (minor == 0) + return -EINVAL; + if (nfsd_minorversion(minor, sign == '-' ? + NFSD_CLEAR : NFSD_SET) < 0) + return -EINVAL; + goto next; + } + switch(num) { + case 2: + case 3: + case 4: + nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET); + break; + default: + return -EINVAL; + } + next: + vers += len + 1; + } while ((len = qword_get(&mesg, vers, size)) > 0); + /* If all get turned off, turn them back on, as + * having no versions is BAD + */ + nfsd_reset_versions(); + } + + /* Now write current state into reply buffer */ + len = 0; + sep = ""; + remaining = SIMPLE_TRANSACTION_LIMIT; + for (num=2 ; num <= 4 ; num++) + if (nfsd_vers(num, NFSD_AVAIL)) { + len = snprintf(buf, remaining, "%s%c%d", sep, + nfsd_vers(num, NFSD_TEST)?'+':'-', + num); + sep = " "; + + if (len >= remaining) + break; + remaining -= len; + buf += len; + tlen += len; + } + if (nfsd_vers(4, NFSD_AVAIL)) + for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; + minor++) { + len = snprintf(buf, remaining, " %c4.%u", + (nfsd_vers(4, NFSD_TEST) && + nfsd_minorversion(minor, NFSD_TEST)) ? + '+' : '-', + minor); + + if (len >= remaining) + break; + remaining -= len; + buf += len; + tlen += len; + } + + len = snprintf(buf, remaining, "\n"); + if (len >= remaining) + return -EINVAL; + return tlen + len; +} + +/** + * write_versions - Set or report the available NFS protocol versions + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing positive or negative integer + * values representing the current status of each + * protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") + * size: non-zero length of C string in @buf + * Output: + * On success: status of zero or more protocol versions has + * been updated; passed-in buffer filled with + * '\n'-terminated C string containing positive + * or negative integer values representing the + * current status of each protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_versions(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_versions(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/* + * Zero-length write. Return a list of NFSD's current listener + * transports. + */ +static ssize_t __write_ports_names(char *buf, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->nfsd_serv == NULL) + return 0; + return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); +} + +/* + * A single 'fd' number was written, in which case it must be for + * a socket of a supported family/protocol, and we use it as an + * nfsd listener. + */ +static ssize_t __write_ports_addfd(char *buf, struct net *net) +{ + char *mesg = buf; + int fd, err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + err = get_int(&mesg, &fd); + if (err != 0 || fd < 0) + return -EINVAL; + + if (svc_alien_sock(net, fd)) { + printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__); + return -EINVAL; + } + + err = nfsd_create_serv(net); + if (err != 0) + return err; + + err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); + if (err < 0) { + nfsd_destroy(net); + return err; + } + + /* Decrease the count, but don't shut down the service */ + nn->nfsd_serv->sv_nrthreads--; + return err; +} + +/* + * A transport listener is added by writing it's transport name and + * a port number. + */ +static ssize_t __write_ports_addxprt(char *buf, struct net *net) +{ + char transport[16]; + struct svc_xprt *xprt; + int port, err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (sscanf(buf, "%15s %5u", transport, &port) != 2) + return -EINVAL; + + if (port < 1 || port > USHRT_MAX) + return -EINVAL; + + err = nfsd_create_serv(net); + if (err != 0) + return err; + + err = svc_create_xprt(nn->nfsd_serv, transport, net, + PF_INET, port, SVC_SOCK_ANONYMOUS); + if (err < 0) + goto out_err; + + err = svc_create_xprt(nn->nfsd_serv, transport, net, + PF_INET6, port, SVC_SOCK_ANONYMOUS); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_close; + + /* Decrease the count, but don't shut down the service */ + nn->nfsd_serv->sv_nrthreads--; + return 0; +out_close: + xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port); + if (xprt != NULL) { + svc_close_xprt(xprt); + svc_xprt_put(xprt); + } +out_err: + nfsd_destroy(net); + return err; +} + +static ssize_t __write_ports(struct file *file, char *buf, size_t size, + struct net *net) +{ + if (size == 0) + return __write_ports_names(buf, net); + + if (isdigit(buf[0])) + return __write_ports_addfd(buf, net); + + if (isalpha(buf[0])) + return __write_ports_addxprt(buf, net); + + return -EINVAL; +} + +/** + * write_ports - Pass a socket file descriptor or transport name to listen on + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with a '\n'-terminated C + * string containing a whitespace-separated list of + * named NFSD listeners; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing a bound + * but unconnected socket that is to be + * used as an NFSD listener; listen(3) + * must be called for a SOCK_STREAM + * socket, otherwise it is ignored + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique alphanumeric name of + * the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a transport + * name and an unsigned integer value + * representing the port to listen on, + * separated by whitespace + * size: non-zero length of C string in @buf + * Output: + * On success: returns zero; NFS service is started + * On error: return code is a negative errno value + */ +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_ports(file, buf, size, netns(file)); + mutex_unlock(&nfsd_mutex); + return rv; +} + + +int nfsd_max_blksize; + +/** + * write_maxblksize - Set or report the current NFS blksize + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of the current NFS blksize + * setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + + if (size > 0) { + int bsize; + int rv = get_int(&mesg, &bsize); + if (rv) + return rv; + /* force bsize into allowed range and + * required alignment. + */ + bsize = max_t(int, bsize, 1024); + bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE); + bsize &= ~(1024-1); + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv) { + mutex_unlock(&nfsd_mutex); + return -EBUSY; + } + nfsd_max_blksize = bsize; + mutex_unlock(&nfsd_mutex); + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", + nfsd_max_blksize); +} + +/** + * write_maxconn - Set or report the current max number of connections + * + * Input: + * buf: ignored + * size: zero + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * number of max connections + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of max_connections setting + * for this net namespace; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_maxconn(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + unsigned int maxconn = nn->max_connections; + + if (size > 0) { + int rv = get_uint(&mesg, &maxconn); + + if (rv) + return rv; + nn->max_connections = maxconn; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn); +} + +#ifdef CONFIG_NFSD_V4 +static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, + time_t *time, struct nfsd_net *nn) +{ + char *mesg = buf; + int rv, i; + + if (size > 0) { + if (nn->nfsd_serv) + return -EBUSY; + rv = get_int(&mesg, &i); + if (rv) + return rv; + /* + * Some sanity checking. We don't have a reason for + * these particular numbers, but problems with the + * extremes are: + * - Too short: the briefest network outage may + * cause clients to lose all their locks. Also, + * the frequent polling may be wasteful. + * - Too long: do you really want reboot recovery + * to take more than an hour? Or to make other + * clients wait an hour before being able to + * revoke a dead client's locks? + */ + if (i < 10 || i > 3600) + return -EINVAL; + *time = i; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); +} + +static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, + time_t *time, struct nfsd_net *nn) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __nfsd4_write_time(file, buf, size, time, nn); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/** + * write_leasetime - Set or report the current NFSv4 lease time + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFSv4 lease expiry time + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing unsigned integer value of the + * current lease expiry time; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +{ + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); +} + +/** + * write_gracetime - Set or report current NFSv4 grace period time + * + * As above, but sets the time of the NFSv4 grace period. + * + * Note this should never be set to less than the *previous* + * lease-period time, but we don't try to enforce this. (In the common + * case (a new boot), we don't know what the previous lease time was + * anyway.) + */ +static ssize_t write_gracetime(struct file *file, char *buf, size_t size) +{ + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); +} + +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, + struct nfsd_net *nn) +{ + char *mesg = buf; + char *recdir; + int len, status; + + if (size > 0) { + if (nn->nfsd_serv) + return -EBUSY; + if (size > PATH_MAX || buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + recdir = mesg; + len = qword_get(&mesg, recdir, size); + if (len <= 0) + return -EINVAL; + + status = nfs4_reset_recoverydir(recdir); + if (status) + return status; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", + nfs4_recoverydir()); +} + +/** + * write_recoverydir - Set or report the pathname of the recovery directory + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing the pathname + * of the directory on a local file + * system containing permanent NFSv4 + * recovery data + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing the current recovery pathname setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + + mutex_lock(&nfsd_mutex); + rv = __write_recoverydir(file, buf, size, nn); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/** + * write_v4_end_grace - release grace period for nfsd's v4.x lock manager + * + * Input: + * buf: ignored + * size: zero + * OR + * + * Input: + * buf: any value + * size: non-zero length of C string in @buf + * Output: + * passed-in buffer filled with "Y" or "N" with a newline + * and NULL-terminated C string. This indicates whether + * the grace period has ended in the current net + * namespace. Return code is the size in bytes of the + * string. Writing a string that starts with 'Y', 'y', or + * '1' to the file will end the grace period for nfsd's v4 + * lock manager. + */ +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) +{ + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + + if (size > 0) { + switch(buf[0]) { + case 'Y': + case 'y': + case '1': + nfsd4_end_grace(nn); + break; + default: + return -EINVAL; + } + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n", + nn->grace_ended ? 'Y' : 'N'); +} + +#endif + +/*----------------------------------------------------------------------------*/ +/* + * populating the filesystem. + */ + +static int nfsd_fill_super(struct super_block * sb, void * data, int silent) +{ + static struct tree_descr nfsd_files[] = { + [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, + [NFSD_Export_features] = {"export_features", + &export_features_operations, S_IRUGO}, + [NFSD_FO_UnlockIP] = {"unlock_ip", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_FO_UnlockFS] = {"unlock_filesystem", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, + [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO}, + [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) + [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ +#ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO}, +#endif + /* last one */ {""} + }; + struct net *net = data; + int ret; + + ret = simple_fill_super(sb, 0x6e667364, nfsd_files); + if (ret) + return ret; + sb->s_fs_info = get_net(net); + return 0; +} + +static struct dentry *nfsd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super); +} + +static void nfsd_umount(struct super_block *sb) +{ + struct net *net = sb->s_fs_info; + + kill_litter_super(sb); + put_net(net); +} + +static struct file_system_type nfsd_fs_type = { + .owner = THIS_MODULE, + .name = "nfsd", + .mount = nfsd_mount, + .kill_sb = nfsd_umount, +}; +MODULE_ALIAS_FS("nfsd"); + +#ifdef CONFIG_PROC_FS +static int create_proc_exports_entry(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/nfs", NULL); + if (!entry) + return -ENOMEM; + entry = proc_create("exports", 0, entry, + &exports_proc_operations); + if (!entry) { + remove_proc_entry("fs/nfs", NULL); + return -ENOMEM; + } + return 0; +} +#else /* CONFIG_PROC_FS */ +static int create_proc_exports_entry(void) +{ + return 0; +} +#endif + +int nfsd_net_id; + +static __net_init int nfsd_init_net(struct net *net) +{ + int retval; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + retval = nfsd_export_init(net); + if (retval) + goto out_export_error; + retval = nfsd_idmap_init(net); + if (retval) + goto out_idmap_error; + nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; + return 0; + +out_idmap_error: + nfsd_export_shutdown(net); +out_export_error: + return retval; +} + +static __net_exit void nfsd_exit_net(struct net *net) +{ + nfsd_idmap_shutdown(net); + nfsd_export_shutdown(net); +} + +static struct pernet_operations nfsd_net_ops = { + .init = nfsd_init_net, + .exit = nfsd_exit_net, + .id = &nfsd_net_id, + .size = sizeof(struct nfsd_net), +}; + +static int __init init_nfsd(void) +{ + int retval; + printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); + + retval = register_pernet_subsys(&nfsd_net_ops); + if (retval < 0) + return retval; + retval = register_cld_notifier(); + if (retval) + goto out_unregister_pernet; + retval = nfsd4_init_slabs(); + if (retval) + goto out_unregister_notifier; + retval = nfsd4_init_pnfs(); + if (retval) + goto out_free_slabs; + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + if (retval) + goto out_exit_pnfs; + nfsd_stat_init(); /* Statistics */ + retval = nfsd_reply_cache_init(); + if (retval) + goto out_free_stat; + nfsd_lockd_init(); /* lockd->nfsd callbacks */ + retval = create_proc_exports_entry(); + if (retval) + goto out_free_lockd; + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; + return 0; +out_free_all: + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); +out_free_lockd: + nfsd_lockd_shutdown(); + nfsd_reply_cache_shutdown(); +out_free_stat: + nfsd_stat_shutdown(); + nfsd_fault_inject_cleanup(); +out_exit_pnfs: + nfsd4_exit_pnfs(); +out_free_slabs: + nfsd4_free_slabs(); +out_unregister_notifier: + unregister_cld_notifier(); +out_unregister_pernet: + unregister_pernet_subsys(&nfsd_net_ops); + return retval; +} + +static void __exit exit_nfsd(void) +{ + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); + nfsd_stat_shutdown(); + nfsd_lockd_shutdown(); + nfsd4_free_slabs(); + nfsd4_exit_pnfs(); + nfsd_fault_inject_cleanup(); + unregister_filesystem(&nfsd_fs_type); + unregister_cld_notifier(); + unregister_pernet_subsys(&nfsd_net_ops); +} + +MODULE_AUTHOR("Olaf Kirch "); +MODULE_LICENSE("GPL"); +module_init(init_nfsd) +module_exit(exit_nfsd) diff --git a/kernel/fs/nfsd/nfsd.h b/kernel/fs/nfsd/nfsd.h new file mode 100644 index 000000000..cf9805238 --- /dev/null +++ b/kernel/fs/nfsd/nfsd.h @@ -0,0 +1,422 @@ +/* + * Hodge-podge collection of knfsd-related stuff. + * I will sort this out later. + * + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "stats.h" +#include "export.h" + +#undef ifdebug +#ifdef CONFIG_SUNRPC_DEBUG +# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) +#else +# define ifdebug(flag) if (0) +#endif + +/* + * nfsd version + */ +#define NFSD_SUPPORTED_MINOR_VERSION 2 +/* + * Maximum blocksizes supported by daemon under various circumstances. + */ +#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD +/* NFSv2 is limited by the protocol specification, see RFC 1094 */ +#define NFSSVC_MAXBLKSIZE_V2 (8*1024) + + +/* + * Largest number of bytes we need to allocate for an NFS + * call or reply. Used to control buffer sizes. We use + * the length of v3 WRITE, READDIR and READDIR replies + * which are an RPC header, up to 26 XDR units of reply + * data, and some page data. + * + * Note that accuracy here doesn't matter too much as the + * size is rounded up to a page size when allocating space. + */ +#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) + +struct readdir_cd { + __be32 err; /* 0, nfserr, or nfserr_eof */ +}; + + +extern struct svc_program nfsd_program; +extern struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; +extern struct mutex nfsd_mutex; +extern spinlock_t nfsd_drc_lock; +extern unsigned long nfsd_drc_max_mem; +extern unsigned long nfsd_drc_mem_used; + +extern const struct seq_operations nfs_exports_op; + +/* + * Function prototypes. + */ +int nfsd_svc(int nrservs, struct net *net); +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); + +int nfsd_nrthreads(struct net *); +int nfsd_nrpools(struct net *); +int nfsd_get_nrthreads(int n, int *, struct net *); +int nfsd_set_nrthreads(int n, int *, struct net *); +int nfsd_pool_stats_open(struct inode *, struct file *); +int nfsd_pool_stats_release(struct inode *, struct file *); + +void nfsd_destroy(struct net *net); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +#ifdef CONFIG_NFSD_V2_ACL +extern struct svc_version nfsd_acl_version2; +#else +#define nfsd_acl_version2 NULL +#endif +#ifdef CONFIG_NFSD_V3_ACL +extern struct svc_version nfsd_acl_version3; +#else +#define nfsd_acl_version3 NULL +#endif +#endif + +enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; +int nfsd_vers(int vers, enum vers_op change); +int nfsd_minorversion(u32 minorversion, enum vers_op change); +void nfsd_reset_versions(void); +int nfsd_create_serv(struct net *net); + +extern int nfsd_max_blksize; + +static inline int nfsd_v4client(struct svc_rqst *rq) +{ + return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +extern unsigned long max_delegations; +int nfsd4_init_slabs(void); +void nfsd4_free_slabs(void); +int nfs4_state_start(void); +int nfs4_state_start_net(struct net *net); +void nfs4_state_shutdown(void); +void nfs4_state_shutdown_net(struct net *net); +void nfs4_reset_lease(time_t leasetime); +int nfs4_reset_recoverydir(char *recdir); +char * nfs4_recoverydir(void); +#else +static inline int nfsd4_init_slabs(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline int nfs4_state_start(void) { return 0; } +static inline int nfs4_state_start_net(struct net *net) { return 0; } +static inline void nfs4_state_shutdown(void) { } +static inline void nfs4_state_shutdown_net(struct net *net) { } +static inline void nfs4_reset_lease(time_t leasetime) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } +static inline char * nfs4_recoverydir(void) {return NULL; } +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. + */ +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_lock_range cpu_to_be32(NFSERR_LOCK_RANGE) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) +#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) +#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) +#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) +#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) +#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) +#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) +#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) +#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) +#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) +#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) +#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) +#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) +#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) +#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) +#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) +#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) +#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) +#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) +#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) +#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) +#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) +#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) +#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) +#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) +#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) +#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) +#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) +#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) +#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) +#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) +#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) +#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) +#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) +#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) +#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) +#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) +#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) +#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP) +#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH) +#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP) +#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) +#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) +#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit cpu_to_be32(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof cpu_to_be32(30001) +/* replay detected */ +#define nfserr_replay_me cpu_to_be32(11001) +/* nfs41 replay detected */ +#define nfserr_replay_cache cpu_to_be32(11002) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +#ifdef CONFIG_NFSD_V4 + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an "ordinary" _successful_ operation. (GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. + */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ + +#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ARCHIVE (deprecated anyway) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD4_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) + +#define NFSD4_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) + +#define NFSD4_SUPPORTED_ATTRS_WORD2 0 + +/* 4.1 */ +#ifdef CONFIG_NFSD_PNFS +#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES +#define PNFSD_SUPPORTED_ATTRS_WORD2 \ +(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES) +#else +#define PNFSD_SUPPORTED_ATTRS_WORD1 0 +#define PNFSD_SUPPORTED_ATTRS_WORD2 0 +#endif /* CONFIG_NFSD_PNFS */ + +#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1) + +#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_SUPPATTR_EXCLCREAT) + +/* 4.2 */ +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL +#else +#define NFSD4_2_SECURITY_ATTRS 0 +#endif + +#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + NFSD4_2_SECURITY_ATTRS) + +static inline u32 nfsd_suppattrs0(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 + : NFSD4_SUPPORTED_ATTRS_WORD0; +} + +static inline u32 nfsd_suppattrs1(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 + : NFSD4_SUPPORTED_ATTRS_WORD1; +} + +static inline u32 nfsd_suppattrs2(u32 minorversion) +{ + switch (minorversion) { + default: return NFSD4_2_SUPPORTED_ATTRS_WORD2; + case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2; + case 0: return NFSD4_SUPPORTED_ATTRS_WORD2; + } +} + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ + (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ +#define NFSD_WRITEABLE_ATTRS_WORD0 \ + (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ + (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL +#else +#define NFSD_WRITEABLE_ATTRS_WORD2 0 +#endif + +#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ + NFSD_WRITEABLE_ATTRS_WORD0 +/* + * we currently store the exclusive create verifier in the v_{a,m}time + * attributes so the client can't set these at create time using EXCLUSIVE4_1 + */ +#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ + (NFSD_WRITEABLE_ATTRS_WORD1 & \ + ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) +#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ + NFSD_WRITEABLE_ATTRS_WORD2 + +extern int nfsd4_is_junction(struct dentry *dentry); +extern int register_cld_notifier(void); +extern void unregister_cld_notifier(void); +#else /* CONFIG_NFSD_V4 */ +static inline int nfsd4_is_junction(struct dentry *dentry) +{ + return 0; +} + +#define register_cld_notifier() 0 +#define unregister_cld_notifier() do { } while(0) + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/kernel/fs/nfsd/nfsfh.c b/kernel/fs/nfsd/nfsfh.c new file mode 100644 index 000000000..350041a40 --- /dev/null +++ b/kernel/fs/nfsd/nfsfh.c @@ -0,0 +1,692 @@ +/* + * NFS server file handle treatment. + * + * Copyright (C) 1995, 1996 Olaf Kirch + * Portions Copyright (C) 1999 G. Allen Morris III + * Extensive rewrite by Neil Brown Southern-Spring 1999 + * ... and again Southern-Winter 2001 to support export_operations + */ + +#include + +#include +#include "nfsd.h" +#include "vfs.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + + +/* + * our acceptability function. + * if NOSUBTREECHECK, accept anything + * if not, require that we can walk up to exp->ex_dentry + * doing some checks on the 'x' bits + */ +static int nfsd_acceptable(void *expv, struct dentry *dentry) +{ + struct svc_export *exp = expv; + int rv; + struct dentry *tdentry; + struct dentry *parent; + + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + return 1; + + tdentry = dget(dentry); + while (tdentry != exp->ex_path.dentry && !IS_ROOT(tdentry)) { + /* make sure parents give x permission to user */ + int err; + parent = dget_parent(tdentry); + err = inode_permission(d_inode(parent), MAY_EXEC); + if (err < 0) { + dput(parent); + break; + } + dput(tdentry); + tdentry = parent; + } + if (tdentry != exp->ex_path.dentry) + dprintk("nfsd_acceptable failed at %p %pd\n", tdentry, tdentry); + rv = (tdentry == exp->ex_path.dentry); + dput(tdentry); + return rv; +} + +/* Type check. The correct error return for type mismatches does not seem to be + * generally agreed upon. SunOS seems to use EISDIR if file isn't S_IFREG; a + * comment in the NFSv3 spec says this is incorrect (implementation notes for + * the write call). + */ +static inline __be32 +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested) +{ + mode &= S_IFMT; + + if (requested == 0) /* the caller doesn't care */ + return nfs_ok; + if (mode == requested) + return nfs_ok; + /* + * v4 has an error more specific than err_notdir which we should + * return in preference to err_notdir: + */ + if (rqstp->rq_vers == 4 && mode == S_IFLNK) + return nfserr_symlink; + if (requested == S_IFDIR) + return nfserr_notdir; + if (mode == S_IFDIR) + return nfserr_isdir; + return nfserr_inval; +} + +static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, + struct svc_export *exp) +{ + int flags = nfsexp_flags(rqstp, exp); + + /* Check if the request originated from a secure port. */ + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) { + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + dprintk("nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return nfserr_perm; + } + + /* Set user creds for this exportpoint */ + return nfserrno(nfsd_setuser(rqstp, exp)); +} + +static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, + struct dentry *dentry, struct svc_export *exp) +{ + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return nfs_ok; + /* + * v2/v3 clients have no need for the V4ROOT export--they use + * the mount protocl instead; also, further V4ROOT checks may be + * in v4-specific code, in which case v2/v3 clients could bypass + * them. + */ + if (!nfsd_v4client(rqstp)) + return nfserr_stale; + /* + * We're exposing only the directories and symlinks that have to be + * traversed on the way to real exports: + */ + if (unlikely(!d_is_dir(dentry) && + !d_is_symlink(dentry))) + return nfserr_stale; + /* + * A pseudoroot export gives permission to access only one + * single directory; the kernel has to make another upcall + * before granting access to anything else under it: + */ + if (unlikely(dentry != exp->ex_path.dentry)) + return nfserr_stale; + return nfs_ok; +} + +/* + * Use the given filehandle to look up the corresponding export and + * dentry. On success, the results are used to set fh_export and + * fh_dentry. + */ +static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; + int fileid_type; + int data_left = fh->fh_size/4; + __be32 error; + + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + if (rqstp->rq_vers == 4 && fh->fh_size == 0) + return nfserr_nofilehandle; + + if (fh->fh_version == 1) { + int len; + + if (--data_left < 0) + return error; + if (fh->fh_auth_type != 0) + return error; + len = key_len(fh->fh_fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { + /* deprecated, convert to type 3 */ + len = key_len(FSID_ENCODE_DEV)/4; + fh->fh_fsid_type = FSID_ENCODE_DEV; + /* + * struct knfsd_fh uses host-endian fields, which are + * sometimes used to hold net-endian values. This + * confuses sparse, so we must use __force here to + * keep it from complaining. + */ + fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]), + ntohl((__force __be32)fh->fh_fsid[1]))); + fh->fh_fsid[1] = fh->fh_fsid[2]; + } + data_left -= len; + if (data_left < 0) + return error; + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid); + fid = (struct fid *)(fh->fh_fsid + len); + } else { + __u32 tfh[2]; + dev_t xdev; + ino_t xino; + + if (fh->fh_size != NFS_FHSIZE) + return error; + /* assume old filehandle format */ + xdev = old_decode_dev(fh->ofh_xdev); + xino = u32_to_ino_t(fh->ofh_xino); + mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); + exp = rqst_exp_find(rqstp, FSID_DEV, tfh); + } + + error = nfserr_stale; + if (PTR_ERR(exp) == -ENOENT) + return error; + + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); + + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + /* Elevate privileges so that the lack of 'r' or 'x' + * permission on some parent directory will + * not stop exportfs_decode_fh from being able + * to reconnect a directory into the dentry cache. + * The same problem can affect "SUBTREECHECK" exports, + * but as nfsd_acceptable depends on correct + * access control settings being in effect, we cannot + * fix that case easily. + */ + struct cred *new = prepare_creds(); + if (!new) { + error = nfserrno(-ENOMEM); + goto out; + } + new->cap_effective = + cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + put_cred(override_creds(new)); + put_cred(new); + } else { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + } + + /* + * Look up the dentry using the NFS file handle. + */ + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + + if (fh->fh_version != 1) { + sfid.i32.ino = fh->ofh_ino; + sfid.i32.gen = fh->ofh_generation; + sfid.i32.parent_ino = fh->ofh_dirino; + fid = &sfid; + data_left = 3; + if (fh->ofh_dirino == 0) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + } else + fileid_type = fh->fh_fileid_type; + + if (fileid_type == FILEID_ROOT) + dentry = dget(exp->ex_path.dentry); + else { + dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, + data_left, fileid_type, + nfsd_acceptable, exp); + } + if (dentry == NULL) + goto out; + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) != -EINVAL) + error = nfserrno(PTR_ERR(dentry)); + goto out; + } + + if (d_is_dir(dentry) && + (dentry->d_flags & DCACHE_DISCONNECTED)) { + printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n", + dentry); + } + + fhp->fh_dentry = dentry; + fhp->fh_export = exp; + return 0; +out: + exp_put(exp); + return error; +} + +/** + * fh_verify - filehandle lookup and access checking + * @rqstp: pointer to current rpc request + * @fhp: filehandle to be verified + * @type: expected type of object pointed to by filehandle + * @access: type of access needed to object + * + * Look up a dentry from the on-the-wire filehandle, check the client's + * access to the export, and set the current task's credentials. + * + * Regardless of success or failure of fh_verify(), fh_put() should be + * called on @fhp when the caller is finished with the filehandle. + * + * fh_verify() may be called multiple times on a given filehandle, for + * example, when processing an NFSv4 compound. The first call will look + * up a dentry using the on-the-wire filehandle. Subsequent calls will + * skip the lookup and just perform the other checks and possibly change + * the current task's credentials. + * + * @type specifies the type of object expected using one of the S_IF* + * constants defined in include/linux/stat.h. The caller may use zero + * to indicate that it doesn't care, or a negative integer to indicate + * that it expects something not of the given type. + * + * @access is formed from the NFSD_MAY_* constants defined in + * include/linux/nfsd/nfsd.h. + */ +__be32 +fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 error; + + dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); + + if (!fhp->fh_dentry) { + error = nfsd_set_fh_dentry(rqstp, fhp); + if (error) + goto out; + } + dentry = fhp->fh_dentry; + exp = fhp->fh_export; + /* + * We still have to do all these permission checks, even when + * fh_dentry is already set: + * - fh_verify may be called multiple times with different + * "access" arguments (e.g. nfsd_proc_create calls + * fh_verify(...,NFSD_MAY_EXEC) first, then later (in + * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE). + * - in the NFSv4 case, the filehandle may have been filled + * in by fh_compose, and given a dentry, but further + * compound operations performed with that filehandle + * still need permissions checks. In the worst case, a + * mountpoint crossing may have changed the export + * options, and we may now need to use a different uid + * (for example, if different id-squashing options are in + * effect on the new filesystem). + */ + error = check_pseudo_root(rqstp, dentry, exp); + if (error) + goto out; + + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + + error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type); + if (error) + goto out; + + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; + +skip_pseudoflavor_check: + /* Finally, check access permissions. */ + error = nfsd_permission(rqstp, exp, dentry, access); + + if (error) { + dprintk("fh_verify: %pd2 permission failure, " + "acc=%x, error=%d\n", + dentry, + access, ntohl(error)); + } +out: + if (error == nfserr_stale) + nfsdstats.fh_stale++; + return error; +} + + +/* + * Compose a file handle for an NFS reply. + * + * Note that when first composed, the dentry may not yet have + * an inode. In this case a call to fh_update should be made + * before the fh goes out on the wire ... + */ +static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry) +{ + if (dentry != exp->ex_path.dentry) { + struct fid *fid = (struct fid *) + (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1); + int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; + int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); + + fhp->fh_handle.fh_fileid_type = + exportfs_encode_fh(dentry, fid, &maxsize, subtreecheck); + fhp->fh_handle.fh_size += maxsize * 4; + } else { + fhp->fh_handle.fh_fileid_type = FILEID_ROOT; + } +} + +/* + * for composing old style file handles + */ +static inline void _fh_update_old(struct dentry *dentry, + struct svc_export *exp, + struct knfsd_fh *fh) +{ + fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino); + fh->ofh_generation = d_inode(dentry)->i_generation; + if (d_is_dir(dentry) || + (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) + fh->ofh_dirino = 0; +} + +static bool is_root_export(struct svc_export *exp) +{ + return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root; +} + +static struct super_block *exp_sb(struct svc_export *exp) +{ + return d_inode(exp->ex_path.dentry)->i_sb; +} + +static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) +{ + switch (fsid_type) { + case FSID_DEV: + if (!old_valid_dev(exp_sb(exp)->s_dev)) + return 0; + /* FALL THROUGH */ + case FSID_MAJOR_MINOR: + case FSID_ENCODE_DEV: + return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; + case FSID_NUM: + return exp->ex_flags & NFSEXP_FSID; + case FSID_UUID8: + case FSID_UUID16: + if (!is_root_export(exp)) + return 0; + /* fall through */ + case FSID_UUID4_INUM: + case FSID_UUID16_INUM: + return exp->ex_uuid != NULL; + } + return 1; +} + + +static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh) +{ + u8 version; + u8 fsid_type; +retry: + version = 1; + if (ref_fh && ref_fh->fh_export == exp) { + version = ref_fh->fh_handle.fh_version; + fsid_type = ref_fh->fh_handle.fh_fsid_type; + + ref_fh = NULL; + + switch (version) { + case 0xca: + fsid_type = FSID_DEV; + break; + case 1: + break; + default: + goto retry; + } + + /* + * As the fsid -> filesystem mapping was guided by + * user-space, there is no guarantee that the filesystem + * actually supports that fsid type. If it doesn't we + * loop around again without ref_fh set. + */ + if (!fsid_type_ok_for_exp(fsid_type, exp)) + goto retry; + } else if (exp->ex_flags & NFSEXP_FSID) { + fsid_type = FSID_NUM; + } else if (exp->ex_uuid) { + if (fhp->fh_maxsize >= 64) { + if (is_root_export(exp)) + fsid_type = FSID_UUID16; + else + fsid_type = FSID_UUID16_INUM; + } else { + if (is_root_export(exp)) + fsid_type = FSID_UUID8; + else + fsid_type = FSID_UUID4_INUM; + } + } else if (!old_valid_dev(exp_sb(exp)->s_dev)) + /* for newer device numbers, we must use a newer fsid format */ + fsid_type = FSID_ENCODE_DEV; + else + fsid_type = FSID_DEV; + fhp->fh_handle.fh_version = version; + if (version) + fhp->fh_handle.fh_fsid_type = fsid_type; +} + +__be32 +fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, + struct svc_fh *ref_fh) +{ + /* ref_fh is a reference file handle. + * if it is non-null and for the same filesystem, then we should compose + * a filehandle which is of the same version, where possible. + * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca + * Then create a 32byte filehandle using nfs_fhbase_old + * + */ + + struct inode * inode = d_inode(dentry); + dev_t ex_dev = exp_sb(exp)->s_dev; + + dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", + MAJOR(ex_dev), MINOR(ex_dev), + (long) d_inode(exp->ex_path.dentry)->i_ino, + dentry, + (inode ? inode->i_ino : 0)); + + /* Choose filehandle version and fsid type based on + * the reference filehandle (if it is in the same export) + * or the export options. + */ + set_version_and_fsid_type(fhp, exp, ref_fh); + + if (ref_fh == fhp) + fh_put(ref_fh); + + if (fhp->fh_locked || fhp->fh_dentry) { + printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n", + dentry); + } + if (fhp->fh_maxsize < NFS_FHSIZE) + printk(KERN_ERR "fh_compose: called with maxsize %d! %pd2\n", + fhp->fh_maxsize, + dentry); + + fhp->fh_dentry = dget(dentry); /* our internal copy */ + fhp->fh_export = exp_get(exp); + + if (fhp->fh_handle.fh_version == 0xca) { + /* old style filehandle please */ + memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + fhp->fh_handle.ofh_dcookie = 0xfeebbaca; + fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); + fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; + fhp->fh_handle.ofh_xino = + ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino); + fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); + if (inode) + _fh_update_old(dentry, exp, &fhp->fh_handle); + } else { + fhp->fh_handle.fh_size = + key_len(fhp->fh_handle.fh_fsid_type) + 4; + fhp->fh_handle.fh_auth_type = 0; + + mk_fsid(fhp->fh_handle.fh_fsid_type, + fhp->fh_handle.fh_fsid, + ex_dev, + d_inode(exp->ex_path.dentry)->i_ino, + exp->ex_fsid, exp->ex_uuid); + + if (inode) + _fh_update(fhp, exp, dentry); + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { + fh_put(fhp); + return nfserr_opnotsupp; + } + } + + return 0; +} + +/* + * Update file handle information after changing a dentry. + * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create + */ +__be32 +fh_update(struct svc_fh *fhp) +{ + struct dentry *dentry; + + if (!fhp->fh_dentry) + goto out_bad; + + dentry = fhp->fh_dentry; + if (d_really_is_negative(dentry)) + goto out_negative; + if (fhp->fh_handle.fh_version != 1) { + _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); + } else { + if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT) + return 0; + + _fh_update(fhp, fhp->fh_export, dentry); + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) + return nfserr_opnotsupp; + } + return 0; +out_bad: + printk(KERN_ERR "fh_update: fh not verified!\n"); + return nfserr_serverfault; +out_negative: + printk(KERN_ERR "fh_update: %pd2 still negative!\n", + dentry); + return nfserr_serverfault; +} + +/* + * Release a file handle. + */ +void +fh_put(struct svc_fh *fhp) +{ + struct dentry * dentry = fhp->fh_dentry; + struct svc_export * exp = fhp->fh_export; + if (dentry) { + fh_unlock(fhp); + fhp->fh_dentry = NULL; + dput(dentry); +#ifdef CONFIG_NFSD_V3 + fhp->fh_pre_saved = 0; + fhp->fh_post_saved = 0; +#endif + } + fh_drop_write(fhp); + if (exp) { + exp_put(exp); + fhp->fh_export = NULL; + } + return; +} + +/* + * Shorthand for dprintk()'s + */ +char * SVCFH_fmt(struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + + static char buf[80]; + sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x", + fh->fh_size, + fh->fh_base.fh_pad[0], + fh->fh_base.fh_pad[1], + fh->fh_base.fh_pad[2], + fh->fh_base.fh_pad[3], + fh->fh_base.fh_pad[4], + fh->fh_base.fh_pad[5]); + return buf; +} + +enum fsid_source fsid_source(struct svc_fh *fhp) +{ + if (fhp->fh_handle.fh_version != 1) + return FSIDSOURCE_DEV; + switch(fhp->fh_handle.fh_fsid_type) { + case FSID_DEV: + case FSID_ENCODE_DEV: + case FSID_MAJOR_MINOR: + if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV) + return FSIDSOURCE_DEV; + break; + case FSID_NUM: + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + break; + default: + break; + } + /* either a UUID type filehandle, or the filehandle doesn't + * match the export. + */ + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + if (fhp->fh_export->ex_uuid) + return FSIDSOURCE_UUID; + return FSIDSOURCE_DEV; +} diff --git a/kernel/fs/nfsd/nfsfh.h b/kernel/fs/nfsd/nfsfh.h new file mode 100644 index 000000000..1e90dad49 --- /dev/null +++ b/kernel/fs/nfsd/nfsfh.h @@ -0,0 +1,292 @@ +/* + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + * + * This file describes the layout of the file handles as passed + * over the wire. + */ +#ifndef _LINUX_NFSD_NFSFH_H +#define _LINUX_NFSD_NFSFH_H + +#include +#include + +static inline __u32 ino_t_to_u32(ino_t ino) +{ + return (__u32) ino; +} + +static inline ino_t u32_to_ino_t(__u32 uino) +{ + return (ino_t) uino; +} + +/* + * This is the internal representation of an NFS handle used in knfsd. + * pre_mtime/post_version will be used to support wcc_attr's in NFSv3. + */ +typedef struct svc_fh { + struct knfsd_fh fh_handle; /* FH data */ + struct dentry * fh_dentry; /* validated dentry */ + struct svc_export * fh_export; /* export pointer */ + int fh_maxsize; /* max size for fh_handle */ + + unsigned char fh_locked; /* inode locked by us */ + unsigned char fh_want_write; /* remount protection taken */ + +#ifdef CONFIG_NFSD_V3 + unsigned char fh_post_saved; /* post-op attrs saved */ + unsigned char fh_pre_saved; /* pre-op attrs saved */ + + /* Pre-op attributes saved during fh_lock */ + __u64 fh_pre_size; /* size before operation */ + struct timespec fh_pre_mtime; /* mtime before oper */ + struct timespec fh_pre_ctime; /* ctime before oper */ + /* + * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) + * to find out if it is valid. + */ + u64 fh_pre_change; + + /* Post-op attributes saved in fh_unlock */ + struct kstat fh_post_attr; /* full attrs after operation */ + u64 fh_post_change; /* nfsv4 change; see above */ +#endif /* CONFIG_NFSD_V3 */ + +} svc_fh; + +enum nfsd_fsid { + FSID_DEV = 0, + FSID_NUM, + FSID_MAJOR_MINOR, + FSID_ENCODE_DEV, + FSID_UUID4_INUM, + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, +}; + +enum fsid_source { + FSIDSOURCE_DEV, + FSIDSOURCE_FSID, + FSIDSOURCE_UUID, +}; +extern enum fsid_source fsid_source(struct svc_fh *fhp); + + +/* + * This might look a little large to "inline" but in all calls except + * one, 'vers' is constant so moste of the function disappears. + * + * In some cases the values are considered to be host endian and in + * others, net endian. fsidv is always considered to be u32 as the + * callers don't know which it will be. So we must use __force to keep + * sparse from complaining. Since these values are opaque to the + * client, that shouldn't be a problem. + */ +static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, + u32 fsid, unsigned char *uuid) +{ + u32 *up; + switch(vers) { + case FSID_DEV: + fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) | + MINOR(dev)); + fsidv[1] = ino_t_to_u32(ino); + break; + case FSID_NUM: + fsidv[0] = fsid; + break; + case FSID_MAJOR_MINOR: + fsidv[0] = (__force __u32)htonl(MAJOR(dev)); + fsidv[1] = (__force __u32)htonl(MINOR(dev)); + fsidv[2] = ino_t_to_u32(ino); + break; + + case FSID_ENCODE_DEV: + fsidv[0] = new_encode_dev(dev); + fsidv[1] = ino_t_to_u32(ino); + break; + + case FSID_UUID4_INUM: + /* 4 byte fsid and inode number */ + up = (u32*)uuid; + fsidv[0] = ino_t_to_u32(ino); + fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3]; + break; + + case FSID_UUID8: + /* 8 byte fsid */ + up = (u32*)uuid; + fsidv[0] = up[0] ^ up[2]; + fsidv[1] = up[1] ^ up[3]; + break; + + case FSID_UUID16: + /* 16 byte fsid - NFSv3+ only */ + memcpy(fsidv, uuid, 16); + break; + + case FSID_UUID16_INUM: + /* 8 byte inode and 16 byte fsid */ + *(u64*)fsidv = (u64)ino; + memcpy(fsidv+2, uuid, 16); + break; + default: BUG(); + } +} + +static inline int key_len(int type) +{ + switch(type) { + case FSID_DEV: return 8; + case FSID_NUM: return 4; + case FSID_MAJOR_MINOR: return 12; + case FSID_ENCODE_DEV: return 8; + case FSID_UUID4_INUM: return 8; + case FSID_UUID8: return 8; + case FSID_UUID16: return 16; + case FSID_UUID16_INUM: return 24; + default: return 0; + } +} + +/* + * Shorthand for dprintk()'s + */ +extern char * SVCFH_fmt(struct svc_fh *fhp); + +/* + * Function prototypes + */ +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int); +__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); +__be32 fh_update(struct svc_fh *); +void fh_put(struct svc_fh *); + +static __inline__ struct svc_fh * +fh_copy(struct svc_fh *dst, struct svc_fh *src) +{ + WARN_ON(src->fh_dentry || src->fh_locked); + + *dst = *src; + return dst; +} + +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + +static __inline__ struct svc_fh * +fh_init(struct svc_fh *fhp, int maxsize) +{ + memset(fhp, 0, sizeof(*fhp)); + fhp->fh_maxsize = maxsize; + return fhp; +} + +static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +{ + if (fh1->fh_size != fh2->fh_size) + return false; + if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0) + return false; + return true; +} + +static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +{ + if (fh1->fh_fsid_type != fh2->fh_fsid_type) + return false; + if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0) + return false; + return true; +} + +#ifdef CONFIG_NFSD_V3 +/* + * The wcc data stored in current_fh should be cleared + * between compound ops. + */ +static inline void +fh_clear_wcc(struct svc_fh *fhp) +{ + fhp->fh_post_saved = 0; + fhp->fh_pre_saved = 0; +} + +/* + * Fill in the pre_op attr for the wcc data + */ +static inline void +fill_pre_wcc(struct svc_fh *fhp) +{ + struct inode *inode; + + inode = d_inode(fhp->fh_dentry); + if (!fhp->fh_pre_saved) { + fhp->fh_pre_mtime = inode->i_mtime; + fhp->fh_pre_ctime = inode->i_ctime; + fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_change = inode->i_version; + fhp->fh_pre_saved = 1; + } +} + +extern void fill_post_wcc(struct svc_fh *); +#else +#define fh_clear_wcc(ignored) +#define fill_pre_wcc(ignored) +#define fill_post_wcc(notused) +#endif /* CONFIG_NFSD_V3 */ + + +/* + * Lock a file handle/inode + * NOTE: both fh_lock and fh_unlock are done "by hand" in + * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once + * so, any changes here should be reflected there. + */ + +static inline void +fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) +{ + struct dentry *dentry = fhp->fh_dentry; + struct inode *inode; + + BUG_ON(!dentry); + + if (fhp->fh_locked) { + printk(KERN_WARNING "fh_lock: %pd2 already locked!\n", + dentry); + return; + } + + inode = d_inode(dentry); + mutex_lock_nested(&inode->i_mutex, subclass); + fill_pre_wcc(fhp); + fhp->fh_locked = 1; +} + +static inline void +fh_lock(struct svc_fh *fhp) +{ + fh_lock_nested(fhp, I_MUTEX_NORMAL); +} + +/* + * Unlock a file handle/inode + */ +static inline void +fh_unlock(struct svc_fh *fhp) +{ + if (fhp->fh_locked) { + fill_post_wcc(fhp); + mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); + fhp->fh_locked = 0; + } +} + +#endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/kernel/fs/nfsd/nfsproc.c b/kernel/fs/nfsd/nfsproc.c new file mode 100644 index 000000000..aecbcd34d --- /dev/null +++ b/kernel/fs/nfsd/nfsproc.c @@ -0,0 +1,759 @@ +/* + * Process version 2 NFS requests. + * + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#include + +#include "cache.h" +#include "xdr.h" +#include "vfs.h" + +typedef struct svc_rqst svc_rqst; +typedef struct svc_buf svc_buf; + +#define NFSDDBG_FACILITY NFSDDBG_PROC + + +static __be32 +nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +static __be32 +nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp) +{ + if (err) return err; + return fh_getattr(&resp->fh, &resp->stat); +} +static __be32 +nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp) +{ + if (err) return err; + return fh_getattr(&resp->fh, &resp->stat); +} +/* + * Get a file's attributes + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * Set a file's attributes + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n", + SVCFH_fmt(&argp->fh), + argp->attrs.ia_valid, (long) argp->attrs.ia_size); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * Look up a path name component + * Note: the dentry in the resp->fh may be negative if the file + * doesn't exist yet. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + struct nfsd_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LOOKUP %s %.*s\n", + SVCFH_fmt(&argp->fh), argp->len, argp->name); + + fh_init(&resp->fh, NFS_FHSIZE); + nfserr = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, + &resp->fh); + + fh_put(&argp->fh); + return nfsd_return_dirop(nfserr, resp); +} + +/* + * Read a symlink. + */ +static __be32 +nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp, + struct nfsd_readlinkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); + + /* Read the symlink. */ + resp->len = NFS_MAXPATHLEN; + nfserr = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + + fh_put(&argp->fh); + return nfserr; +} + +/* + * Read a portion of a file. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, + struct nfsd_readres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READ %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, argp->offset); + + /* Obtain buffer pointer for payload. 19 is 1 word for + * status, 17 words for fattr, and 1 word for the byte count. + */ + + if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_NOTICE + "oversized read request from %s (%d bytes)\n", + svc_print_addr(rqstp, buf, sizeof(buf)), + argp->count); + argp->count = NFSSVC_MAXBLKSIZE_V2; + } + svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); + + resp->count = argp->count; + nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, + rqstp->rq_vec, argp->vlen, + &resp->count); + + if (nfserr) return nfserr; + return fh_getattr(&resp->fh, &resp->stat); +} + +/* + * Write data to a file + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + int stable = 1; + unsigned long cnt = argp->len; + + dprintk("nfsd: WRITE %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->len, argp->offset); + + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, + argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, + &stable); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * CREATE processing is complicated. The keyword here is `overloaded.' + * The parent directory is kept locked between the check for existence + * and the actual create() call in compliance with VFS protocols. + * N.B. After this call _both_ argp->fh and resp->fh need an fh_put + */ +static __be32 +nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, + struct nfsd_diropres *resp) +{ + svc_fh *dirfhp = &argp->fh; + svc_fh *newfhp = &resp->fh; + struct iattr *attr = &argp->attrs; + struct inode *inode; + struct dentry *dchild; + int type, mode; + __be32 nfserr; + int hosterr; + dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); + + dprintk("nfsd: CREATE %s %.*s\n", + SVCFH_fmt(dirfhp), argp->len, argp->name); + + /* First verify the parent file handle */ + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); + if (nfserr) + goto done; /* must fh_put dirfhp even on error */ + + /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ + + nfserr = nfserr_acces; + if (!argp->len) + goto done; + nfserr = nfserr_exist; + if (isdotent(argp->name, argp->len)) + goto done; + hosterr = fh_want_write(dirfhp); + if (hosterr) { + nfserr = nfserrno(hosterr); + goto done; + } + + fh_lock_nested(dirfhp, I_MUTEX_PARENT); + dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); + if (IS_ERR(dchild)) { + nfserr = nfserrno(PTR_ERR(dchild)); + goto out_unlock; + } + fh_init(newfhp, NFS_FHSIZE); + nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); + if (!nfserr && d_really_is_negative(dchild)) + nfserr = nfserr_noent; + dput(dchild); + if (nfserr) { + if (nfserr != nfserr_noent) + goto out_unlock; + /* + * If the new file handle wasn't verified, we can't tell + * whether the file exists or not. Time to bail ... + */ + nfserr = nfserr_acces; + if (!newfhp->fh_dentry) { + printk(KERN_WARNING + "nfsd_proc_create: file handle not verified\n"); + goto out_unlock; + } + } + + inode = d_inode(newfhp->fh_dentry); + + /* Unfudge the mode bits */ + if (attr->ia_valid & ATTR_MODE) { + type = attr->ia_mode & S_IFMT; + mode = attr->ia_mode & ~S_IFMT; + if (!type) { + /* no type, so if target exists, assume same as that, + * else assume a file */ + if (inode) { + type = inode->i_mode & S_IFMT; + switch(type) { + case S_IFCHR: + case S_IFBLK: + /* reserve rdev for later checking */ + rdev = inode->i_rdev; + attr->ia_valid |= ATTR_SIZE; + + /* FALLTHROUGH */ + case S_IFIFO: + /* this is probably a permission check.. + * at least IRIX implements perm checking on + * echo thing > device-special-file-or-pipe + * by doing a CREATE with type==0 + */ + nfserr = nfsd_permission(rqstp, + newfhp->fh_export, + newfhp->fh_dentry, + NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); + if (nfserr && nfserr != nfserr_rofs) + goto out_unlock; + } + } else + type = S_IFREG; + } + } else if (inode) { + type = inode->i_mode & S_IFMT; + mode = inode->i_mode & ~S_IFMT; + } else { + type = S_IFREG; + mode = 0; /* ??? */ + } + + attr->ia_valid |= ATTR_MODE; + attr->ia_mode = mode; + + /* Special treatment for non-regular files according to the + * gospel of sun micro + */ + if (type != S_IFREG) { + if (type != S_IFBLK && type != S_IFCHR) { + rdev = 0; + } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) { + /* If you think you've seen the worst, grok this. */ + type = S_IFIFO; + } else { + /* Okay, char or block special */ + if (!rdev) + rdev = wanted; + } + + /* we've used the SIZE information, so discard it */ + attr->ia_valid &= ~ATTR_SIZE; + + /* Make sure the type and device matches */ + nfserr = nfserr_exist; + if (inode && type != (inode->i_mode & S_IFMT)) + goto out_unlock; + } + + nfserr = 0; + if (!inode) { + /* File doesn't exist. Create it and set attrs */ + nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len, + attr, type, rdev, newfhp); + } else if (type == S_IFREG) { + dprintk("nfsd: existing %s, valid=%x, size=%ld\n", + argp->name, attr->ia_valid, (long) attr->ia_size); + /* File already exists. We ignore all attributes except + * size, so that creat() behaves exactly like + * open(..., O_CREAT|O_TRUNC|O_WRONLY). + */ + attr->ia_valid &= ATTR_SIZE; + if (attr->ia_valid) + nfserr = nfsd_setattr(rqstp, newfhp, attr, 0, (time_t)0); + } + +out_unlock: + /* We don't really need to unlock, as fh_put does it. */ + fh_unlock(dirfhp); + fh_drop_write(dirfhp); +done: + fh_put(dirfhp); + return nfsd_return_dirop(nfserr, resp); +} + +static __be32 +nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), + argp->len, argp->name); + + /* Unlink. -SIFDIR means file must not be a directory */ + nfserr = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len); + fh_put(&argp->fh); + return nfserr; +} + +static __be32 +nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RENAME %s %.*s -> \n", + SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); + + nfserr = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, + &argp->tfh, argp->tname, argp->tlen); + fh_put(&argp->ffh); + fh_put(&argp->tfh); + return nfserr; +} + +static __be32 +nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LINK %s ->\n", + SVCFH_fmt(&argp->ffh)); + dprintk("nfsd: %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + nfserr = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, + &argp->ffh); + fh_put(&argp->ffh); + fh_put(&argp->tfh); + return nfserr; +} + +static __be32 +nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp, + void *resp) +{ + struct svc_fh newfh; + __be32 nfserr; + + dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", + SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, + argp->tlen, argp->tname); + + fh_init(&newfh, NFS_FHSIZE); + /* + * Crazy hack: the request fits in a page, and already-decoded + * attributes follow argp->tname, so it's safe to just write a + * null to ensure it's null-terminated: + */ + argp->tname[argp->tlen] = '\0'; + nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, + argp->tname, &newfh); + + fh_put(&argp->ffh); + fh_put(&newfh); + return nfserr; +} + +/* + * Make directory. This operation is not idempotent. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp, + struct nfsd_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + + if (resp->fh.fh_dentry) { + printk(KERN_WARNING + "nfsd_proc_mkdir: response already verified??\n"); + } + + argp->attrs.ia_valid &= ~ATTR_SIZE; + fh_init(&resp->fh, NFS_FHSIZE); + nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); + fh_put(&argp->fh); + return nfsd_return_dirop(nfserr, resp); +} + +/* + * Remove a directory + */ +static __be32 +nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + + nfserr = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len); + fh_put(&argp->fh); + return nfserr; +} + +/* + * Read a portion of a directory. + */ +static __be32 +nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp, + struct nfsd_readdirres *resp) +{ + int count; + __be32 nfserr; + loff_t offset; + + dprintk("nfsd: READDIR %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, argp->cookie); + + /* Shrink to the client read size */ + count = (argp->count >> 2) - 2; + + /* Make sure we've room for the NULL ptr & eof flag */ + count -= 2; + if (count < 0) + count = 0; + + resp->buffer = argp->buffer; + resp->offset = NULL; + resp->buflen = count; + resp->common.err = nfs_ok; + /* Read directory and encode entries on the fly */ + offset = argp->cookie; + nfserr = nfsd_readdir(rqstp, &argp->fh, &offset, + &resp->common, nfssvc_encode_entry); + + resp->count = resp->buffer - argp->buffer; + if (resp->offset) + *resp->offset = htonl(offset); + + fh_put(&argp->fh); + return nfserr; +} + +/* + * Get file system info + */ +static __be32 +nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, + struct nfsd_statfsres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); + + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); + fh_put(&argp->fh); + return nfserr; +} + +/* + * NFSv2 Server procedures. + * Only the results of non-idempotent operations are cached. + */ +struct nfsd_void { int dummy; }; + +#define ST 1 /* status */ +#define FH 8 /* filehandle */ +#define AT 18 /* attributes */ + +static struct svc_procedure nfsd_procedures2[18] = { + [NFSPROC_NULL] = { + .pc_func = (svc_procfunc) nfsd_proc_null, + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_GETATTR] = { + .pc_func = (svc_procfunc) nfsd_proc_getattr, + .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_SETATTR] = { + .pc_func = (svc_procfunc) nfsd_proc_setattr, + .pc_decode = (kxdrproc_t) nfssvc_decode_sattrargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_sattrargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_ROOT] = { + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_LOOKUP] = { + .pc_func = (svc_procfunc) nfsd_proc_lookup, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_READLINK] = { + .pc_func = (svc_procfunc) nfsd_proc_readlink, + .pc_decode = (kxdrproc_t) nfssvc_decode_readlinkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readlinkres, + .pc_argsize = sizeof(struct nfsd_readlinkargs), + .pc_ressize = sizeof(struct nfsd_readlinkres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, + }, + [NFSPROC_READ] = { + .pc_func = (svc_procfunc) nfsd_proc_read, + .pc_decode = (kxdrproc_t) nfssvc_decode_readargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_readargs), + .pc_ressize = sizeof(struct nfsd_readres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + }, + [NFSPROC_WRITECACHE] = { + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_WRITE] = { + .pc_func = (svc_procfunc) nfsd_proc_write, + .pc_decode = (kxdrproc_t) nfssvc_decode_writeargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_writeargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_CREATE] = { + .pc_func = (svc_procfunc) nfsd_proc_create, + .pc_decode = (kxdrproc_t) nfssvc_decode_createargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_createargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_REMOVE] = { + .pc_func = (svc_procfunc) nfsd_proc_remove, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_RENAME] = { + .pc_func = (svc_procfunc) nfsd_proc_rename, + .pc_decode = (kxdrproc_t) nfssvc_decode_renameargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_renameargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_LINK] = { + .pc_func = (svc_procfunc) nfsd_proc_link, + .pc_decode = (kxdrproc_t) nfssvc_decode_linkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_linkargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_SYMLINK] = { + .pc_func = (svc_procfunc) nfsd_proc_symlink, + .pc_decode = (kxdrproc_t) nfssvc_decode_symlinkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_symlinkargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_MKDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_mkdir, + .pc_decode = (kxdrproc_t) nfssvc_decode_createargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_createargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_RMDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_rmdir, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_READDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_readdir, + .pc_decode = (kxdrproc_t) nfssvc_decode_readdirargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readdirres, + .pc_argsize = sizeof(struct nfsd_readdirargs), + .pc_ressize = sizeof(struct nfsd_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFSPROC_STATFS] = { + .pc_func = (svc_procfunc) nfsd_proc_statfs, + .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle, + .pc_encode = (kxdrproc_t) nfssvc_encode_statfsres, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_statfsres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+5, + }, +}; + + +struct svc_version nfsd_version2 = { + .vs_vers = 2, + .vs_nproc = 18, + .vs_proc = nfsd_procedures2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS2_SVC_XDRSIZE, +}; + +/* + * Map errnos to NFS errnos. + */ +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_mlink, -EMLINK }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, +#ifdef EDQUOT + { nfserr_dquot, -EDQUOT }, +#endif + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + WARN(1, "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} + diff --git a/kernel/fs/nfsd/nfssvc.c b/kernel/fs/nfsd/nfssvc.c new file mode 100644 index 000000000..9277cc91c --- /dev/null +++ b/kernel/fs/nfsd/nfssvc.c @@ -0,0 +1,752 @@ +/* + * Central processing for nfsd. + * + * Authors: Olaf Kirch (okir@monad.swb.de) + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "nfsd.h" +#include "cache.h" +#include "vfs.h" +#include "netns.h" + +#define NFSDDBG_FACILITY NFSDDBG_SVC + +extern struct svc_program nfsd_program; +static int nfsd(void *vrqstp); + +/* + * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members + * of the svc_serv struct. In particular, ->sv_nrthreads but also to some + * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * + * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number + * of nfsd threads must exist and each must listed in ->sp_all_threads in each + * entry of ->sv_pools[]. + * + * Transitions of the thread count between zero and non-zero are of particular + * interest since the svc_serv needs to be created and initialized at that + * point, or freed. + * + * Finally, the nfsd_mutex also protects some of the global variables that are + * accessed when nfsd starts and that are settable via the write_* routines in + * nfsctl.c. In particular: + * + * user_recovery_dirname + * user_lease_time + * nfsd_versions + */ +DEFINE_MUTEX(nfsd_mutex); + +/* + * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. + * nfsd_drc_max_pages limits the total amount of memory available for + * version 4.1 DRC caches. + * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. + */ +spinlock_t nfsd_drc_lock; +unsigned long nfsd_drc_max_mem; +unsigned long nfsd_drc_mem_used; + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static struct svc_stat nfsd_acl_svcstats; +static struct svc_version * nfsd_acl_version[] = { + [2] = &nfsd_acl_version2, + [3] = &nfsd_acl_version3, +}; + +#define NFSD_ACL_MINVERS 2 +#define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) +static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; + +static struct svc_program nfsd_acl_program = { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_versions, + .pg_name = "nfsacl", + .pg_class = "nfsd", + .pg_stats = &nfsd_acl_svcstats, + .pg_authenticate = &svc_set_client, +}; + +static struct svc_stat nfsd_acl_svcstats = { + .program = &nfsd_acl_program, +}; +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ + +static struct svc_version * nfsd_version[] = { + [2] = &nfsd_version2, +#if defined(CONFIG_NFSD_V3) + [3] = &nfsd_version3, +#endif +#if defined(CONFIG_NFSD_V4) + [4] = &nfsd_version4, +#endif +}; + +#define NFSD_MINVERS 2 +#define NFSD_NRVERS ARRAY_SIZE(nfsd_version) +static struct svc_version *nfsd_versions[NFSD_NRVERS]; + +struct svc_program nfsd_program = { +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + .pg_next = &nfsd_acl_program, +#endif + .pg_prog = NFS_PROGRAM, /* program number */ + .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ + .pg_vers = nfsd_versions, /* version table */ + .pg_name = "nfsd", /* program name */ + .pg_class = "nfsd", /* authentication class */ + .pg_stats = &nfsd_svcstats, /* version table */ + .pg_authenticate = &svc_set_client, /* export authentication */ + +}; + +static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { + [0] = 1, + [1] = 1, + [2] = 1, +}; + +int nfsd_vers(int vers, enum vers_op change) +{ + if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) + return 0; + switch(change) { + case NFSD_SET: + nfsd_versions[vers] = nfsd_version[vers]; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_versions[vers] = nfsd_acl_version[vers]; +#endif + break; + case NFSD_CLEAR: + nfsd_versions[vers] = NULL; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_versions[vers] = NULL; +#endif + break; + case NFSD_TEST: + return nfsd_versions[vers] != NULL; + case NFSD_AVAIL: + return nfsd_version[vers] != NULL; + } + return 0; +} + +int nfsd_minorversion(u32 minorversion, enum vers_op change) +{ + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + return -1; + switch(change) { + case NFSD_SET: + nfsd_supported_minorversions[minorversion] = true; + break; + case NFSD_CLEAR: + nfsd_supported_minorversions[minorversion] = false; + break; + case NFSD_TEST: + return nfsd_supported_minorversions[minorversion]; + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + } + return 0; +} + +/* + * Maximum number of nfsd processes + */ +#define NFSD_MAXSERVS 8192 + +int nfsd_nrthreads(struct net *net) +{ + int rv = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv) + rv = nn->nfsd_serv->sv_nrthreads; + mutex_unlock(&nfsd_mutex); + return rv; +} + +static int nfsd_init_socks(struct net *net) +{ + int error; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) + return 0; + + error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + return 0; +} + +static int nfsd_users = 0; + +static int nfsd_startup_generic(int nrservs) +{ + int ret; + + if (nfsd_users++) + return 0; + + /* + * Readahead param cache - will no-op if it already exists. + * (Note therefore results will be suboptimal if number of + * threads is modified after nfsd start.) + */ + ret = nfsd_racache_init(2*nrservs); + if (ret) + goto dec_users; + + ret = nfs4_state_start(); + if (ret) + goto out_racache; + return 0; + +out_racache: + nfsd_racache_shutdown(); +dec_users: + nfsd_users--; + return ret; +} + +static void nfsd_shutdown_generic(void) +{ + if (--nfsd_users) + return; + + nfs4_state_shutdown(); + nfsd_racache_shutdown(); +} + +static bool nfsd_needs_lockd(void) +{ +#if defined(CONFIG_NFSD_V3) + return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL); +#else + return (nfsd_versions[2] != NULL); +#endif +} + +static int nfsd_startup_net(int nrservs, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int ret; + + if (nn->nfsd_net_up) + return 0; + + ret = nfsd_startup_generic(nrservs); + if (ret) + return ret; + ret = nfsd_init_socks(net); + if (ret) + goto out_socks; + + if (nfsd_needs_lockd() && !nn->lockd_up) { + ret = lockd_up(net); + if (ret) + goto out_socks; + nn->lockd_up = 1; + } + + ret = nfs4_state_start_net(net); + if (ret) + goto out_lockd; + + nn->nfsd_net_up = true; + return 0; + +out_lockd: + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } +out_socks: + nfsd_shutdown_generic(); + return ret; +} + +static void nfsd_shutdown_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs4_state_shutdown_net(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } + nn->nfsd_net_up = false; + nfsd_shutdown_generic(); +} + +static void nfsd_last_thread(struct svc_serv *serv, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + /* + * write_ports can create the server without actually starting + * any threads--if we get shut down before any threads are + * started, then nfsd_last_thread will be run before any of this + * other initialization has been done. + */ + if (!nn->nfsd_net_up) + return; + nfsd_shutdown_net(net); + + svc_rpcb_cleanup(serv, net); + + printk(KERN_WARNING "nfsd: last server has exited, flushing export " + "cache\n"); + nfsd_export_flush(net); +} + +void nfsd_reset_versions(void) +{ + int found_one = 0; + int i; + + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { + if (nfsd_program.pg_vers[i]) + found_one = 1; + } + + if (!found_one) { + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) + nfsd_program.pg_vers[i] = nfsd_version[i]; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; +#endif + } +} + +/* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with mutiple services. + * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machines free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. + */ +static void set_max_drc(void) +{ + #define NFSD_DRC_SIZE_SHIFT 10 + nfsd_drc_max_mem = (nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; + nfsd_drc_mem_used = 0; + spin_lock_init(&nfsd_drc_lock); + dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); +} + +static int nfsd_get_default_max_blksize(void) +{ + struct sysinfo i; + unsigned long long target; + unsigned long ret; + + si_meminfo(&i); + target = (i.totalram - i.totalhigh) << PAGE_SHIFT; + /* + * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig + * machines, but only uses 32K on 128M machines. Bottom out at + * 8K on 32M and smaller. Of course, this is only a default. + */ + target >>= 12; + + ret = NFSSVC_MAXBLKSIZE; + while (ret > target && ret >= 8*1024*2) + ret /= 2; + return ret; +} + +int nfsd_create_serv(struct net *net) +{ + int error; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + if (nn->nfsd_serv) { + svc_get(nn->nfsd_serv); + return 0; + } + if (nfsd_max_blksize == 0) + nfsd_max_blksize = nfsd_get_default_max_blksize(); + nfsd_reset_versions(); + nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + nfsd_last_thread, nfsd, THIS_MODULE); + if (nn->nfsd_serv == NULL) + return -ENOMEM; + + nn->nfsd_serv->sv_maxconn = nn->max_connections; + error = svc_bind(nn->nfsd_serv, net); + if (error < 0) { + svc_destroy(nn->nfsd_serv); + return error; + } + + set_max_drc(); + do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ + return 0; +} + +int nfsd_nrpools(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->nfsd_serv == NULL) + return 0; + else + return nn->nfsd_serv->sv_nrpools; +} + +int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) +{ + int i = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->nfsd_serv != NULL) { + for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++) + nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads; + } + + return 0; +} + +void nfsd_destroy(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int destroy = (nn->nfsd_serv->sv_nrthreads == 1); + + if (destroy) + svc_shutdown_net(nn->nfsd_serv, net); + svc_destroy(nn->nfsd_serv); + if (destroy) + nn->nfsd_serv = NULL; +} + +int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) +{ + int i = 0; + int tot = 0; + int err = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + + if (nn->nfsd_serv == NULL || n <= 0) + return 0; + + if (n > nn->nfsd_serv->sv_nrpools) + n = nn->nfsd_serv->sv_nrpools; + + /* enforce a global maximum number of threads */ + tot = 0; + for (i = 0; i < n; i++) { + nthreads[i] = min(nthreads[i], NFSD_MAXSERVS); + tot += nthreads[i]; + } + if (tot > NFSD_MAXSERVS) { + /* total too large: scale down requested numbers */ + for (i = 0; i < n && tot > 0; i++) { + int new = nthreads[i] * NFSD_MAXSERVS / tot; + tot -= (nthreads[i] - new); + nthreads[i] = new; + } + for (i = 0; i < n && tot > 0; i++) { + nthreads[i]--; + tot--; + } + } + + /* + * There must always be a thread in pool 0; the admin + * can't shut down NFS completely using pool_threads. + */ + if (nthreads[0] == 0) + nthreads[0] = 1; + + /* apply the new numbers */ + svc_get(nn->nfsd_serv); + for (i = 0; i < n; i++) { + err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], + nthreads[i]); + if (err) + break; + } + nfsd_destroy(net); + return err; +} + +/* + * Adjust the number of threads and return the new number of threads. + * This is also the function that starts the server if necessary, if + * this is the first time nrservs is nonzero. + */ +int +nfsd_svc(int nrservs, struct net *net) +{ + int error; + bool nfsd_up_before; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + mutex_lock(&nfsd_mutex); + dprintk("nfsd: creating service\n"); + + nrservs = max(nrservs, 0); + nrservs = min(nrservs, NFSD_MAXSERVS); + error = 0; + + if (nrservs == 0 && nn->nfsd_serv == NULL) + goto out; + + error = nfsd_create_serv(net); + if (error) + goto out; + + nfsd_up_before = nn->nfsd_net_up; + + error = nfsd_startup_net(nrservs, net); + if (error) + goto out_destroy; + error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); + if (error) + goto out_shutdown; + /* We are holding a reference to nn->nfsd_serv which + * we don't want to count in the return value, + * so subtract 1 + */ + error = nn->nfsd_serv->sv_nrthreads - 1; +out_shutdown: + if (error < 0 && !nfsd_up_before) + nfsd_shutdown_net(net); +out_destroy: + nfsd_destroy(net); /* Release server */ +out: + mutex_unlock(&nfsd_mutex); + return error; +} + + +/* + * This is the NFS server kernel thread + */ +static int +nfsd(void *vrqstp) +{ + struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; + struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); + struct net *net = perm_sock->xpt_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int err; + + /* Lock module and set up kernel thread */ + mutex_lock(&nfsd_mutex); + + /* At this point, the thread shares current->fs + * with the init process. We need to create files with a + * umask of 0 instead of init's umask. */ + if (unshare_fs_struct() < 0) { + printk("Unable to start nfsd thread: out of memory\n"); + goto out; + } + + current->fs->umask = 0; + + /* + * thread is spawned with all signals set to SIG_IGN, re-enable + * the ones that will bring down the thread + */ + allow_signal(SIGKILL); + allow_signal(SIGHUP); + allow_signal(SIGINT); + allow_signal(SIGQUIT); + + nfsdstats.th_cnt++; + mutex_unlock(&nfsd_mutex); + + set_freezable(); + + /* + * The main request loop + */ + for (;;) { + /* Update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nn->max_connections; + + /* + * Find a socket with data available and call its + * recvfrom routine. + */ + while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) + ; + if (err == -EINTR) + break; + validate_process_creds(); + svc_process(rqstp); + validate_process_creds(); + } + + /* Clear signals before calling svc_exit_thread() */ + flush_signals(current); + + mutex_lock(&nfsd_mutex); + nfsdstats.th_cnt --; + +out: + rqstp->rq_server = NULL; + + /* Release the thread */ + svc_exit_thread(rqstp); + + nfsd_destroy(net); + + /* Release module */ + mutex_unlock(&nfsd_mutex); + module_put_and_exit(0); + return 0; +} + +static __be32 map_new_errors(u32 vers, __be32 nfserr) +{ + if (nfserr == nfserr_jukebox && vers == 2) + return nfserr_dropit; + if (nfserr == nfserr_wrongsec && vers < 4) + return nfserr_acces; + return nfserr; +} + +int +nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) +{ + struct svc_procedure *proc; + kxdrproc_t xdr; + __be32 nfserr; + __be32 *nfserrp; + + dprintk("nfsd_dispatch: vers %d proc %d\n", + rqstp->rq_vers, rqstp->rq_proc); + proc = rqstp->rq_procinfo; + + /* + * Give the xdr decoder a chance to change this if it wants + * (necessary in the NFSv4.0 compound case) + */ + rqstp->rq_cachetype = proc->pc_cachetype; + /* Decode arguments */ + xdr = proc->pc_decode; + if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base, + rqstp->rq_argp)) { + dprintk("nfsd: failed to decode arguments!\n"); + *statp = rpc_garbage_args; + return 1; + } + + /* Check whether we have this call in the cache. */ + switch (nfsd_cache_lookup(rqstp)) { + case RC_DROPIT: + return 0; + case RC_REPLY: + return 1; + case RC_DOIT:; + /* do it */ + } + + /* need to grab the location to store the status, as + * nfsv4 does some encoding while processing + */ + nfserrp = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; + rqstp->rq_res.head[0].iov_len += sizeof(__be32); + + /* Now call the procedure handler, and encode NFS status. */ + nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); + nfserr = map_new_errors(rqstp->rq_vers, nfserr); + if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) { + dprintk("nfsd: Dropping request; may be revisited later\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + return 0; + } + + if (rqstp->rq_proc != 0) + *nfserrp++ = nfserr; + + /* Encode result. + * For NFSv2, additional info is never returned in case of an error. + */ + if (!(nfserr && rqstp->rq_vers == 2)) { + xdr = proc->pc_encode; + if (xdr && !xdr(rqstp, nfserrp, + rqstp->rq_resp)) { + /* Failed to encode result. Release cache entry */ + dprintk("nfsd: failed to encode result!\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + *statp = rpc_system_err; + return 1; + } + } + + /* Store reply in cache. */ + nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); + return 1; +} + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + int ret; + struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); + + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv == NULL) { + mutex_unlock(&nfsd_mutex); + return -ENODEV; + } + /* bump up the psudo refcount while traversing */ + svc_get(nn->nfsd_serv); + ret = svc_pool_stats_open(nn->nfsd_serv, file); + mutex_unlock(&nfsd_mutex); + return ret; +} + +int nfsd_pool_stats_release(struct inode *inode, struct file *file) +{ + int ret = seq_release(inode, file); + struct net *net = inode->i_sb->s_fs_info; + + mutex_lock(&nfsd_mutex); + /* this function really, really should have been called svc_put() */ + nfsd_destroy(net); + mutex_unlock(&nfsd_mutex); + return ret; +} diff --git a/kernel/fs/nfsd/nfsxdr.c b/kernel/fs/nfsd/nfsxdr.c new file mode 100644 index 000000000..79d964aa8 --- /dev/null +++ b/kernel/fs/nfsd/nfsxdr.c @@ -0,0 +1,550 @@ +/* + * XDR support for nfsd + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include "vfs.h" +#include "xdr.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_XDR + +/* + * Mapping of S_IF* types to NFS file types + */ +static u32 nfs_ftypes[] = { + NFNON, NFCHR, NFCHR, NFBAD, + NFDIR, NFBAD, NFBLK, NFBAD, + NFREG, NFBAD, NFLNK, NFBAD, + NFSOCK, NFBAD, NFLNK, NFBAD, +}; + + +/* + * XDR functions for basic NFS types + */ +static __be32 * +decode_fh(__be32 *p, struct svc_fh *fhp) +{ + fh_init(fhp, NFS_FHSIZE); + memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + + /* FIXME: Look up export pointer here and verify + * Sun Secure RPC if requested */ + return p + (NFS_FHSIZE >> 2); +} + +/* Helper function for NFSv2 ACL code */ +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + +static __be32 * +encode_fh(__be32 *p, struct svc_fh *fhp) +{ + memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE); + return p + (NFS_FHSIZE>> 2); +} + +/* + * Decode a file name and make sure that the path contains + * no slashes or null bytes. + */ +static __be32 * +decode_filename(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0' || *name == '/') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_pathname(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_sattr(__be32 *p, struct iattr *iap) +{ + u32 tmp, tmp1; + + iap->ia_valid = 0; + + /* Sun client bug compatibility check: some sun clients seem to + * put 0xffff in the mode field when they mean 0xffffffff. + * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah. + */ + if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_uid = make_kuid(&init_user_ns, tmp); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_gid = make_kgid(&init_user_ns, tmp); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_SIZE; + iap->ia_size = tmp; + } + tmp = ntohl(*p++); tmp1 = ntohl(*p++); + if (tmp != (u32)-1 && tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = tmp; + iap->ia_atime.tv_nsec = tmp1 * 1000; + } + tmp = ntohl(*p++); tmp1 = ntohl(*p++); + if (tmp != (u32)-1 && tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = tmp; + iap->ia_mtime.tv_nsec = tmp1 * 1000; + /* + * Passing the invalid value useconds=1000000 for mtime + * is a Sun convention for "set both mtime and atime to + * current server time". It's needed to make permissions + * checks for the "touch" program across v2 mounts to + * Solaris and Irix boxes work correctly. See description of + * sattr in section 6.1 of "NFS Illustrated" by + * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 + */ + if (tmp1 == 1000000) + iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); + } + return p; +} + +static __be32 * +encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, + struct kstat *stat) +{ + struct dentry *dentry = fhp->fh_dentry; + int type; + struct timespec time; + u32 f; + + type = (stat->mode & S_IFMT); + + *p++ = htonl(nfs_ftypes[type >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); + + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { + *p++ = htonl(NFS_MAXPATHLEN); + } else { + *p++ = htonl((u32) stat->size); + } + *p++ = htonl((u32) stat->blksize); + if (S_ISCHR(type) || S_ISBLK(type)) + *p++ = htonl(new_encode_dev(stat->rdev)); + else + *p++ = htonl(0xffffffff); + *p++ = htonl((u32) stat->blocks); + switch (fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + *p++ = htonl(new_encode_dev(stat->dev)); + break; + case FSIDSOURCE_FSID: + *p++ = htonl((u32) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u32*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[1]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[2]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[3]; + *p++ = htonl(f); + break; + } + *p++ = htonl((u32) stat->ino); + *p++ = htonl((u32) stat->atime.tv_sec); + *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); + lease_get_mtime(d_inode(dentry), &time); + *p++ = htonl((u32) time.tv_sec); + *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); + *p++ = htonl((u32) stat->ctime.tv_sec); + *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0); + + return p; +} + +/* Helper function for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) +{ + return encode_fattr(rqstp, p, fhp, stat); +} + +/* + * XDR decode functions + */ +int +nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_sattrargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_diropargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readargs *args) +{ + unsigned int len; + int v; + p = decode_fh(p, &args->fh); + if (!p) + return 0; + + args->offset = ntohl(*p++); + len = args->count = ntohl(*p++); + p++; /* totalcount - unused */ + + len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); + + /* set up somewhere to store response. + * We take pages, put them on reslist and include in iovec + */ + v=0; + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(p); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + args->vlen = v; + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_writeargs *args) +{ + unsigned int len, hdr, dlen; + int v; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + + p++; /* beginoffset */ + args->offset = ntohl(*p++); /* offset */ + p++; /* totalcount */ + len = args->len = ntohl(*p++); + /* + * The protocol specifies a maximum of 8192 bytes. + */ + if (len > NFSSVC_MAXBLKSIZE_V2) + return 0; + + /* + * Check to make sure that we got the right number of + * bytes. + */ + hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; + dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + - hdr; + + /* + * Round the length of the data which was specified up to + * the next multiple of XDR units and then compare that + * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. + */ + if (dlen < XDR_QUADLEN(len)*4) + return 0; + + rqstp->rq_vec[0].iov_base = (void*)p; + rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + v = 0; + while (len > rqstp->rq_vec[v].iov_len) { + len -= rqstp->rq_vec[v].iov_len; + v++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); + rqstp->rq_vec[v].iov_len = PAGE_SIZE; + } + rqstp->rq_vec[v].iov_len = len; + args->vlen = v + 1; + return 1; +} + +int +nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_createargs *args) +{ + if ( !(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_renameargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + args->buffer = page_address(*(rqstp->rq_next_page++)); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_linkargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_symlinkargs *args) +{ + if ( !(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_pathname(p, &args->tname, &args->tlen))) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readdirargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + args->cookie = ntohl(*p++); + args->count = ntohl(*p++); + args->count = min_t(u32, args->count, PAGE_SIZE); + args->buffer = page_address(*(rqstp->rq_next_page++)); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ +int +nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_diropres *resp) +{ + p = encode_fh(p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readlinkres *resp) +{ + *p++ = htonl(resp->len); + xdr_ressize_check(rqstp, p); + rqstp->rq_res.page_len = resp->len; + if (resp->len & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + } + return 1; +} + +int +nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readres *resp) +{ + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + *p++ = htonl(resp->count); + xdr_ressize_check(rqstp, p); + + /* now update rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); + } + return 1; +} + +int +nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readdirres *resp) +{ + xdr_ressize_check(rqstp, p); + p = resp->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl((resp->common.err == nfserr_eof)); + rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1; + + return 1; +} + +int +nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_statfsres *resp) +{ + struct kstatfs *stat = &resp->stats; + + *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ + *p++ = htonl(stat->f_bsize); + *p++ = htonl(stat->f_blocks); + *p++ = htonl(stat->f_bfree); + *p++ = htonl(stat->f_bavail); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_entry(void *ccdv, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = ccdv; + struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common); + __be32 *p = cd->buffer; + int buflen, slen; + + /* + dprintk("nfsd: entry(%.*s off %ld ino %ld)\n", + namlen, name, offset, ino); + */ + + if (offset > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } + if (cd->offset) + *cd->offset = htonl(offset); + + /* truncate filename */ + namlen = min(namlen, NFS2_MAXNAMLEN); + slen = XDR_QUADLEN(namlen); + + if ((buflen = cd->buflen - slen - 4) < 0) { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + if (ino > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } + *p++ = xdr_one; /* mark entry present */ + *p++ = htonl((u32) ino); /* file id */ + p = xdr_encode_array(p, name, namlen);/* name length & name */ + cd->offset = p; /* remember pointer */ + *p++ = htonl(~0U); /* offset of next entry */ + + cd->buflen = buflen; + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; +} + +/* + * XDR release functions + */ +int +nfssvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_fhandle *resp) +{ + fh_put(&resp->fh); + return 1; +} diff --git a/kernel/fs/nfsd/pnfs.h b/kernel/fs/nfsd/pnfs.h new file mode 100644 index 000000000..d4c445367 --- /dev/null +++ b/kernel/fs/nfsd/pnfs.h @@ -0,0 +1,86 @@ +#ifndef _FS_NFSD_PNFS_H +#define _FS_NFSD_PNFS_H 1 + +#ifdef CONFIG_NFSD_V4 +#include +#include + +#include "state.h" +#include "xdr4.h" + +struct xdr_stream; + +struct nfsd4_deviceid_map { + struct list_head hash; + u64 idx; + int fsid_type; + u32 fsid[]; +}; + +struct nfsd4_layout_ops { + u32 notify_types; + + __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdevp); + __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdevp); + + __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, + struct nfsd4_layoutget *lgp); + __be32 (*encode_layoutget)(struct xdr_stream *, + struct nfsd4_layoutget *lgp); + + __be32 (*proc_layoutcommit)(struct inode *inode, + struct nfsd4_layoutcommit *lcp); +}; + +extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; +extern const struct nfsd4_layout_ops bl_layout_ops; + +__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp); +__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp, + struct nfs4_layout_stateid *ls); +__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation); +struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx); +#endif /* CONFIG_NFSD_V4 */ + +#ifdef CONFIG_NFSD_PNFS +void nfsd4_setup_layout_type(struct svc_export *exp); +void nfsd4_return_all_client_layouts(struct nfs4_client *); +void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp); +int nfsd4_init_pnfs(void); +void nfsd4_exit_pnfs(void); +#else +struct nfs4_client; +struct nfs4_file; + +static inline void nfsd4_setup_layout_type(struct svc_export *exp) +{ +} + +static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ +} +static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp) +{ +} +static inline void nfsd4_exit_pnfs(void) +{ +} +static inline int nfsd4_init_pnfs(void) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _FS_NFSD_PNFS_H */ diff --git a/kernel/fs/nfsd/state.h b/kernel/fs/nfsd/state.h new file mode 100644 index 000000000..dbc4f85a5 --- /dev/null +++ b/kernel/fs/nfsd/state.h @@ -0,0 +1,662 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NFSD4_STATE_H +#define _NFSD4_STATE_H + +#include +#include +#include "nfsfh.h" + +typedef struct { + u32 cl_boot; + u32 cl_id; +} clientid_t; + +typedef struct { + clientid_t so_clid; + u32 so_id; +} stateid_opaque_t; + +typedef struct { + u32 si_generation; + stateid_opaque_t si_opaque; +} stateid_t; + +#define STATEID_FMT "(%08x/%08x/%08x/%08x)" +#define STATEID_VAL(s) \ + (s)->si_opaque.so_clid.cl_boot, \ + (s)->si_opaque.so_clid.cl_id, \ + (s)->si_opaque.so_id, \ + (s)->si_generation + +struct nfsd4_callback { + struct nfs4_client *cb_clp; + u32 cb_minorversion; + struct rpc_message cb_msg; + struct nfsd4_callback_ops *cb_ops; + struct work_struct cb_work; + int cb_status; + bool cb_need_restart; +}; + +struct nfsd4_callback_ops { + void (*prepare)(struct nfsd4_callback *); + int (*done)(struct nfsd4_callback *, struct rpc_task *); + void (*release)(struct nfsd4_callback *); +}; + +/* + * A core object that represents a "common" stateid. These are generally + * embedded within the different (more specific) stateid objects and contain + * fields that are of general use to any stateid. + */ +struct nfs4_stid { + atomic_t sc_count; +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 +#define NFS4_DELEG_STID 4 +/* For an open stateid kept around *only* to process close replays: */ +#define NFS4_CLOSED_STID 8 +/* For a deleg stateid kept around only to process free_stateid's: */ +#define NFS4_REVOKED_DELEG_STID 16 +#define NFS4_CLOSED_DELEG_STID 32 +#define NFS4_LAYOUT_STID 64 + unsigned char sc_type; + stateid_t sc_stateid; + struct nfs4_client *sc_client; + struct nfs4_file *sc_file; + void (*sc_free)(struct nfs4_stid *); +}; + +/* + * Represents a delegation stateid. The nfs4_client holds references to these + * and they are put when it is being destroyed or when the delegation is + * returned by the client: + * + * o 1 reference as long as a delegation is still in force (taken when it's + * alloc'd, put when it's returned or revoked) + * + * o 1 reference as long as a recall rpc is in progress (taken when the lease + * is broken, put when the rpc exits) + * + * o 1 more ephemeral reference for each nfsd thread currently doing something + * with that delegation without holding the cl_lock + * + * If the server attempts to recall a delegation and the client doesn't do so + * before a timeout, the server may also revoke the delegation. In that case, + * the object will either be destroyed (v4.0) or moved to a per-client list of + * revoked delegations (v4.1+). + * + * This object is a superset of the nfs4_stid. + */ +struct nfs4_delegation { + struct nfs4_stid dl_stid; /* must be first field */ + struct list_head dl_perfile; + struct list_head dl_perclnt; + struct list_head dl_recall_lru; /* delegation recalled */ + struct nfs4_clnt_odstate *dl_clnt_odstate; + u32 dl_type; + time_t dl_time; +/* For recall: */ + int dl_retries; + struct nfsd4_callback dl_recall; +}; + +#define cb_to_delegation(cb) \ + container_of(cb, struct nfs4_delegation, dl_recall) + +/* client delegation callback info */ +struct nfs4_cb_conn { + /* SETCLIENTID info */ + struct sockaddr_storage cb_addr; + struct sockaddr_storage cb_saddr; + size_t cb_addrlen; + u32 cb_prog; /* used only in 4.0 case; + per-session otherwise */ + u32 cb_ident; /* minorversion 0 only */ + struct svc_xprt *cb_xprt; /* minorversion 1 only */ +}; + +static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_delegation, dl_stid); +} + +/* Maximum number of slots per session. 160 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 160 +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 16 +/* Maximum session per slot cache size */ +#define NFSD_SLOT_CACHE_SIZE 2048 +/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ +#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 +#define NFSD_MAX_MEM_PER_SESSION \ + (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) + +struct nfsd4_slot { + u32 sl_seqid; + __be32 sl_status; + u32 sl_datalen; + u16 sl_opcnt; +#define NFSD4_SLOT_INUSE (1 << 0) +#define NFSD4_SLOT_CACHETHIS (1 << 1) +#define NFSD4_SLOT_INITIALIZED (1 << 2) + u8 sl_flags; + char sl_data[]; +}; + +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + +struct nfsd4_cb_sec { + u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */ + kuid_t uid; + kgid_t gid; +}; + +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + struct nfsd4_cb_sec cb_sec; +}; + +struct nfsd4_backchannel_ctl { + u32 bc_cb_program; + struct nfsd4_cb_sec bc_cb_sec; +}; + +struct nfsd4_bind_conn_to_session { + struct nfs4_sessionid sessionid; + u32 dir; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + +struct nfsd4_conn { + struct list_head cn_persession; + struct svc_xprt *cn_xprt; + struct svc_xpt_user cn_xpt_user; + struct nfsd4_session *cn_session; +/* CDFC4_FORE, CDFC4_BACK: */ + unsigned char cn_flags; +}; + +/* + * Representation of a v4.1+ session. These are refcounted in a similar fashion + * to the nfs4_client. References are only taken when the server is actively + * working on the object (primarily during the processing of compounds). + */ +struct nfsd4_session { + atomic_t se_ref; + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; +/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */ +#define NFS4_SESSION_DEAD 0x010 + u32 se_flags; + struct nfs4_client *se_client; + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; + struct nfsd4_cb_sec se_cb_sec; + struct list_head se_conns; + u32 se_cb_prog; + u32 se_cb_seq_nr; + struct nfsd4_slot *se_slots[]; /* forward channel slots */ +}; + +/* formatted contents of nfs4_sessionid */ +struct nfsd4_sessionid { + clientid_t clientid; + u32 sequence; + u32 reserved; +}; + +#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ + +/* + * struct nfs4_client - one per client. Clientids live here. + * + * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0) + * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped. + * Each nfsd_net_ns object contains a set of these and they are tracked via + * short and long form clientid. They are hashed and searched for under the + * per-nfsd_net client_lock spinlock. + * + * References to it are only held during the processing of compounds, and in + * certain other operations. In their "resting state" they have a refcount of + * 0. If they are not renewed within a lease period, they become eligible for + * destruction by the laundromat. + * + * These objects can also be destroyed prematurely by the fault injection code, + * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates. + * Care is taken *not* to do this however when the objects have an elevated + * refcount. + * + * o Each nfs4_client is hashed by clientid + * + * o Each nfs4_clients is also hashed by name (the opaque quantity initially + * sent by the client to identify itself). + * + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client + */ +struct nfs4_client { + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct rb_node cl_namenode; /* link into by-name trees */ + struct list_head *cl_ownerstr_hashtbl; + struct list_head cl_openowners; + struct idr cl_stateids; /* stateid lookup */ + struct list_head cl_delegations; + struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ + struct list_head cl_lru; /* tail queue */ +#ifdef CONFIG_NFSD_PNFS + struct list_head cl_lo_states; /* outstanding layout states */ +#endif + struct xdr_netobj cl_name; /* id generated by client */ + nfs4_verifier cl_verifier; /* generated by client */ + time_t cl_time; /* time of last lease renewal */ + struct sockaddr_storage cl_addr; /* client ipaddress */ + bool cl_mach_cred; /* SP4_MACH_CRED in force */ + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + u32 cl_minorversion; + + /* for v4.0 and v4.1 callbacks: */ + struct nfs4_cb_conn cl_cb_conn; +#define NFSD4_CLIENT_CB_UPDATE (0) +#define NFSD4_CLIENT_CB_KILL (1) +#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ +#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ +#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */ +#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ +#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ + 1 << NFSD4_CLIENT_CB_KILL) + unsigned long cl_flags; + struct rpc_cred *cl_cb_cred; + struct rpc_clnt *cl_cb_client; + u32 cl_cb_ident; +#define NFSD4_CB_UP 0 +#define NFSD4_CB_UNKNOWN 1 +#define NFSD4_CB_DOWN 2 +#define NFSD4_CB_FAULT 3 + int cl_cb_state; + struct nfsd4_callback cl_cb_null; + struct nfsd4_session *cl_cb_session; + + /* for all client information that callback code might need: */ + spinlock_t cl_lock; + + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + /* number of rpc's in progress over an associated session: */ + atomic_t cl_refcount; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ + struct net *net; +}; + +/* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state + * from non-volitile storage) upon reboot. + */ +struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + struct nfs4_client *cr_clp; /* pointer to associated clp */ + char cr_recdir[HEXDIR_LEN]; /* recover dir */ +}; + +static inline void +update_stateid(stateid_t *stateid) +{ + stateid->si_generation++; + /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ + if (stateid->si_generation == 0) + stateid->si_generation = 1; +} + +/* A reasonable value for REPLAY_ISIZE was estimated as follows: + * The OPEN response, typically the largest, requires + * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + + * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + + * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes + */ + +#define NFSD4_REPLAY_ISIZE 112 + +/* + * Replay buffer, where the result of the last seqid-mutating operation + * is cached. + */ +struct nfs4_replay { + __be32 rp_status; + unsigned int rp_buflen; + char *rp_buf; + struct knfsd_fh rp_openfh; + struct mutex rp_mutex; + char rp_ibuf[NFSD4_REPLAY_ISIZE]; +}; + +struct nfs4_stateowner; + +struct nfs4_stateowner_operations { + void (*so_unhash)(struct nfs4_stateowner *); + void (*so_free)(struct nfs4_stateowner *); +}; + +/* + * A core object that represents either an open or lock owner. The object and + * lock owner objects have one of these embedded within them. Refcounts and + * other fields common to both owner types are contained within these + * structures. + */ +struct nfs4_stateowner { + struct list_head so_strhash; + struct list_head so_stateids; + struct nfs4_client *so_client; + const struct nfs4_stateowner_operations *so_ops; + /* after increment in nfsd4_bump_seqid, represents the next + * sequence id expected from the client: */ + atomic_t so_count; + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + struct nfs4_replay so_replay; + bool so_is_open_owner; +}; + +/* + * When a file is opened, the client provides an open state owner opaque string + * that indicates the "owner" of that open. These objects are refcounted. + * References to it are held by each open state associated with it. This object + * is a superset of the nfs4_stateowner struct. + */ +struct nfs4_openowner { + struct nfs4_stateowner oo_owner; /* must be first field */ + struct list_head oo_perclient; + /* + * We keep around openowners a little while after last close, + * which saves clients from having to confirm, and allows us to + * handle close replays if they come soon enough. The close_lru + * is a list of such openowners, to be reaped by the laundromat + * thread eventually if they remain unused: + */ + struct list_head oo_close_lru; + struct nfs4_ol_stateid *oo_last_closed_stid; + time_t oo_time; /* time of placement on so_close_lru */ +#define NFS4_OO_CONFIRMED 1 + unsigned char oo_flags; +}; + +/* + * Represents a generic "lockowner". Similar to an openowner. References to it + * are held by the lock stateids that are created on its behalf. This object is + * a superset of the nfs4_stateowner struct (or would be if it needed any extra + * fields). + */ +struct nfs4_lockowner { + struct nfs4_stateowner lo_owner; /* must be first element */ +}; + +static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_openowner, oo_owner); +} + +static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_lockowner, lo_owner); +} + +/* + * Per-client state indicating no. of opens and outstanding delegations + * on a file from a particular client.'od' stands for 'open & delegation' + */ +struct nfs4_clnt_odstate { + struct nfs4_client *co_client; + struct nfs4_file *co_file; + struct list_head co_perfile; + atomic_t co_odcount; +}; + +/* + * nfs4_file: a file opened by some number of (open) nfs4_stateowners. + * + * These objects are global. nfsd keeps one instance of a nfs4_file per + * filehandle (though it may keep multiple file descriptors for each). Each + * inode can have multiple filehandles associated with it, so there is + * (potentially) a many to one relationship between this struct and struct + * inode. + * + * These are hashed by filehandle in the file_hashtbl, which is protected by + * the global state_lock spinlock. + */ +struct nfs4_file { + atomic_t fi_ref; + spinlock_t fi_lock; + struct hlist_node fi_hash; /* hash on fi_fhandle */ + struct list_head fi_stateids; + union { + struct list_head fi_delegations; + struct rcu_head fi_rcu; + }; + struct list_head fi_clnt_odstate; + /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ + struct file * fi_fds[3]; + /* + * Each open or lock stateid contributes 0-4 to the counts + * below depending on which bits are set in st_access_bitmap: + * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set + * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set + * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. + */ + atomic_t fi_access[2]; + u32 fi_share_deny; + struct file *fi_deleg_file; + int fi_delegees; + struct knfsd_fh fi_fhandle; + bool fi_had_conflict; +#ifdef CONFIG_NFSD_PNFS + struct list_head fi_lo_states; + atomic_t fi_lo_recalls; +#endif +}; + +/* + * A generic struct representing either a open or lock stateid. The nfs4_client + * holds a reference to each of these objects, and they in turn hold a + * reference to their respective stateowners. The client's reference is + * released in response to a close or unlock (depending on whether it's an open + * or lock stateid) or when the client is being destroyed. + * + * In the case of v4.0 open stateids, these objects are preserved for a little + * while after close in order to handle CLOSE replays. Those are eventually + * reclaimed via a LRU scheme by the laundromat. + * + * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock". + * Better suggestions welcome. + */ +struct nfs4_ol_stateid { + struct nfs4_stid st_stid; /* must be first field */ + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_locks; + struct nfs4_stateowner * st_stateowner; + struct nfs4_clnt_odstate * st_clnt_odstate; + unsigned char st_access_bmap; + unsigned char st_deny_bmap; + struct nfs4_ol_stateid * st_openstp; +}; + +static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_ol_stateid, st_stid); +} + +struct nfs4_layout_stateid { + struct nfs4_stid ls_stid; + struct list_head ls_perclnt; + struct list_head ls_perfile; + spinlock_t ls_lock; + struct list_head ls_layouts; + u32 ls_layout_type; + struct file *ls_file; + struct nfsd4_callback ls_recall; + stateid_t ls_recall_sid; + bool ls_recalled; +}; + +static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_layout_stateid, ls_stid); +} + +/* flags for preprocess_seqid_op() */ +#define RD_STATE 0x00000010 +#define WR_STATE 0x00000020 + +enum nfsd4_cb_op { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_LAYOUT, + NFSPROC4_CLNT_CB_SEQUENCE, +}; + + +struct nfsd4_compound_state; +struct nfsd_net; + +extern __be32 nfs4_preprocess_stateid_op(struct net *net, + struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filp); +__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, + stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, struct nfsd_net *nn); +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, + struct kmem_cache *slab); +void nfs4_unhash_stid(struct nfs4_stid *s); +void nfs4_put_stid(struct nfs4_stid *s); +void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); +extern void nfs4_release_reclaim(struct nfsd_net *); +extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, + struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid, + struct nfsd4_compound_state *cstate, struct nfsd_net *nn); +extern int set_callback_cred(void); +extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); +extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, + struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); +extern void nfsd4_run_cb(struct nfsd4_callback *cb); +extern int nfsd4_create_callback_queue(void); +extern void nfsd4_destroy_callback_queue(void); +extern void nfsd4_shutdown_callback(struct nfs4_client *); +extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); +extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, + struct nfsd_net *nn); +extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); + +struct nfs4_file *find_file(struct knfsd_fh *fh); +void put_nfs4_file(struct nfs4_file *fi); +static inline void get_nfs4_file(struct nfs4_file *fi) +{ + atomic_inc(&fi->fi_ref); +} +struct file *find_any_file(struct nfs4_file *f); + +/* grace period management */ +void nfsd4_end_grace(struct nfsd_net *nn); + +/* nfs4recover operations */ +extern int nfsd4_client_tracking_init(struct net *net); +extern void nfsd4_client_tracking_exit(struct net *net); +extern void nfsd4_client_record_create(struct nfs4_client *clp); +extern void nfsd4_client_record_remove(struct nfs4_client *clp); +extern int nfsd4_client_record_check(struct nfs4_client *clp); +extern void nfsd4_record_grace_done(struct nfsd_net *nn); + +/* nfs fault injection functions */ +#ifdef CONFIG_NFSD_FAULT_INJECTION +int nfsd_fault_inject_init(void); +void nfsd_fault_inject_cleanup(void); + +u64 nfsd_inject_print_clients(void); +u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_clients(u64); + +u64 nfsd_inject_print_locks(void); +u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_locks(u64); + +u64 nfsd_inject_print_openowners(void); +u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_openowners(u64); + +u64 nfsd_inject_print_delegations(void); +u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_delegations(u64); +u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t); +u64 nfsd_inject_recall_delegations(u64); +#else /* CONFIG_NFSD_FAULT_INJECTION */ +static inline int nfsd_fault_inject_init(void) { return 0; } +static inline void nfsd_fault_inject_cleanup(void) {} +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + +#endif /* NFSD4_STATE_H */ diff --git a/kernel/fs/nfsd/stats.c b/kernel/fs/nfsd/stats.c new file mode 100644 index 000000000..cd90878a7 --- /dev/null +++ b/kernel/fs/nfsd/stats.c @@ -0,0 +1,104 @@ +/* + * procfs-based user access to knfsd statistics + * + * /proc/net/rpc/nfsd + * + * Format: + * rc + * Statistsics for the reply cache + * fh + * statistics for filehandle lookup + * io + * statistics for IO throughput + * th <10%-20%> <20%-30%> ... <90%-100%> <100%> + * time (seconds) when nfsd thread usage above thresholds + * and number of times that all threads were in use + * ra cache-size <10% <20% <30% ... <100% not-found + * number of times that read-ahead entry was found that deep in + * the cache. + * plus generic RPC stats (see net/sunrpc/stats.c) + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include + +#include "nfsd.h" + +struct nfsd_stats nfsdstats; +struct svc_stat nfsd_svcstats = { + .program = &nfsd_program, +}; + +static int nfsd_proc_show(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n", + nfsdstats.rchits, + nfsdstats.rcmisses, + nfsdstats.rcnocache, + nfsdstats.fh_stale, + nfsdstats.fh_lookup, + nfsdstats.fh_anon, + nfsdstats.fh_nocache_dir, + nfsdstats.fh_nocache_nondir, + nfsdstats.io_read, + nfsdstats.io_write); + /* thread usage: */ + seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt); + for (i=0; i<10; i++) { + unsigned int jifs = nfsdstats.th_usage[i]; + unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; + seq_printf(seq, " %u.%03u", sec, msec); + } + + /* newline and ra-cache */ + seq_printf(seq, "\nra %u", nfsdstats.ra_size); + for (i=0; i<11; i++) + seq_printf(seq, " %u", nfsdstats.ra_depth[i]); + seq_putc(seq, '\n'); + + /* show my rpc info */ + svc_seq_show(seq, &nfsd_svcstats); + +#ifdef CONFIG_NFSD_V4 + /* Show count for individual nfsv4 operations */ + /* Writing operation numbers 0 1 2 also for maintaining uniformity */ + seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + for (i = 0; i <= LAST_NFS4_OP; i++) + seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + + seq_putc(seq, '\n'); +#endif + + return 0; +} + +static int nfsd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_proc_show, NULL); +} + +static const struct file_operations nfsd_proc_fops = { + .owner = THIS_MODULE, + .open = nfsd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void +nfsd_stat_init(void) +{ + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops); +} + +void +nfsd_stat_shutdown(void) +{ + svc_proc_unregister(&init_net, "nfsd"); +} diff --git a/kernel/fs/nfsd/stats.h b/kernel/fs/nfsd/stats.h new file mode 100644 index 000000000..a5c944b77 --- /dev/null +++ b/kernel/fs/nfsd/stats.h @@ -0,0 +1,43 @@ +/* + * Statistics for NFS server. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ +#ifndef _NFSD_STATS_H +#define _NFSD_STATS_H + +#include + + +struct nfsd_stats { + unsigned int rchits; /* repcache hits */ + unsigned int rcmisses; /* repcache hits */ + unsigned int rcnocache; /* uncached reqs */ + unsigned int fh_stale; /* FH stale error */ + unsigned int fh_lookup; /* dentry cached */ + unsigned int fh_anon; /* anon file dentry returned */ + unsigned int fh_nocache_dir; /* filehandle not found in dcache */ + unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ + unsigned int io_read; /* bytes returned to read requests */ + unsigned int io_write; /* bytes passed in write requests */ + unsigned int th_cnt; /* number of available threads */ + unsigned int th_usage[10]; /* number of ticks during which n perdeciles + * of available threads were in use */ + unsigned int th_fullcnt; /* number of times last free thread was used */ + unsigned int ra_size; /* size of ra cache */ + unsigned int ra_depth[11]; /* number of times ra entry was found that deep + * in the cache (10percentiles). [10] = not found */ +#ifdef CONFIG_NFSD_V4 + unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ +#endif + +}; + + +extern struct nfsd_stats nfsdstats; +extern struct svc_stat nfsd_svcstats; + +void nfsd_stat_init(void); +void nfsd_stat_shutdown(void); + +#endif /* _NFSD_STATS_H */ diff --git a/kernel/fs/nfsd/trace.c b/kernel/fs/nfsd/trace.c new file mode 100644 index 000000000..82f890705 --- /dev/null +++ b/kernel/fs/nfsd/trace.c @@ -0,0 +1,5 @@ + +#include "state.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/kernel/fs/nfsd/trace.h b/kernel/fs/nfsd/trace.h new file mode 100644 index 000000000..c668520c3 --- /dev/null +++ b/kernel/fs/nfsd/trace.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfsd + +#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _NFSD_TRACE_H + +#include + +DECLARE_EVENT_CLASS(nfsd_stateid_class, + TP_PROTO(stateid_t *stp), + TP_ARGS(stp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x", + __entry->cl_boot, + __entry->cl_id, + __entry->si_id, + __entry->si_generation) +) + +#define DEFINE_STATEID_EVENT(name) \ +DEFINE_EVENT(nfsd_stateid_class, name, \ + TP_PROTO(stateid_t *stp), \ + TP_ARGS(stp)) +DEFINE_STATEID_EVENT(layoutstate_alloc); +DEFINE_STATEID_EVENT(layoutstate_unhash); +DEFINE_STATEID_EVENT(layoutstate_free); +DEFINE_STATEID_EVENT(layout_get_lookup_fail); +DEFINE_STATEID_EVENT(layout_commit_lookup_fail); +DEFINE_STATEID_EVENT(layout_return_lookup_fail); +DEFINE_STATEID_EVENT(layout_recall); +DEFINE_STATEID_EVENT(layout_recall_done); +DEFINE_STATEID_EVENT(layout_recall_fail); +DEFINE_STATEID_EVENT(layout_recall_release); + +#endif /* _NFSD_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include diff --git a/kernel/fs/nfsd/vfs.c b/kernel/fs/nfsd/vfs.c new file mode 100644 index 000000000..84d770be0 --- /dev/null +++ b/kernel/fs/nfsd/vfs.c @@ -0,0 +1,2154 @@ +/* + * File operations used by nfsd. Some of these have been ripped from + * other parts of the kernel because they weren't exported, others + * are partial duplicates with added or changed functionality. + * + * Note that several functions dget() the dentry upon which they want + * to act, most notably those that create directory entries. Response + * dentry's are dput()'d if necessary in the release callback. + * So if you notice code paths that apparently fail to dput() the + * dentry, don't worry--they have been taken care of. + * + * Copyright (C) 1995-1999 Olaf Kirch + * Zerocpy NFS support (C) 2002 Hirokazu Takahashi + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NFSD_V3 +#include "xdr3.h" +#endif /* CONFIG_NFSD_V3 */ + +#ifdef CONFIG_NFSD_V4 +#include "acl.h" +#include "idmap.h" +#endif /* CONFIG_NFSD_V4 */ + +#include "nfsd.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_FILEOP + + +/* + * This is a cache of readahead params that help us choose the proper + * readahead strategy. Initially, we set all readahead parameters to 0 + * and let the VFS handle things. + * If you increase the number of cached files very much, you'll need to + * add a hash table here. + */ +struct raparms { + struct raparms *p_next; + unsigned int p_count; + ino_t p_ino; + dev_t p_dev; + int p_set; + struct file_ra_state p_ra; + unsigned int p_hindex; +}; + +struct raparm_hbucket { + struct raparms *pb_head; + spinlock_t pb_lock; +} ____cacheline_aligned_in_smp; + +#define RAPARM_HASH_BITS 4 +#define RAPARM_HASH_SIZE (1<ex_path.mnt), + .dentry = dget(dentry)}; + int err = 0; + + err = follow_down(&path); + if (err < 0) + goto out; + + exp2 = rqst_exp_get_by_name(rqstp, &path); + if (IS_ERR(exp2)) { + err = PTR_ERR(exp2); + /* + * We normally allow NFS clients to continue + * "underneath" a mountpoint that is not exported. + * The exception is V4ROOT, where no traversal is ever + * allowed without an explicit export of the new + * directory. + */ + if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) + err = 0; + path_put(&path); + goto out; + } + if (nfsd_v4client(rqstp) || + (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { + /* successfully crossed mount point */ + /* + * This is subtle: path.dentry is *not* on path.mnt + * at this point. The only reason we are safe is that + * original mnt is pinned down by exp, so we should + * put path *before* putting exp + */ + *dpp = path.dentry; + path.dentry = dentry; + *expp = exp2; + exp2 = exp; + } + path_put(&path); + exp_put(exp2); +out: + return err; +} + +static void follow_to_parent(struct path *path) +{ + struct dentry *dp; + + while (path->dentry == path->mnt->mnt_root && follow_up(path)) + ; + dp = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = dp; +} + +static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp) +{ + struct svc_export *exp2; + struct path path = {.mnt = mntget((*exp)->ex_path.mnt), + .dentry = dget(dparent)}; + + follow_to_parent(&path); + + exp2 = rqst_exp_parent(rqstp, &path); + if (PTR_ERR(exp2) == -ENOENT) { + *dentryp = dget(dparent); + } else if (IS_ERR(exp2)) { + path_put(&path); + return PTR_ERR(exp2); + } else { + *dentryp = dget(path.dentry); + exp_put(*exp); + *exp = exp2; + } + path_put(&path); + return 0; +} + +/* + * For nfsd purposes, we treat V4ROOT exports as though there was an + * export at *every* directory. + */ +int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) +{ + if (d_mountpoint(dentry)) + return 1; + if (nfsd4_is_junction(dentry)) + return 1; + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return 0; + return d_inode(dentry) != NULL; +} + +__be32 +nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, + const char *name, unsigned int len, + struct svc_export **exp_ret, struct dentry **dentry_ret) +{ + struct svc_export *exp; + struct dentry *dparent; + struct dentry *dentry; + int host_err; + + dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + + dparent = fhp->fh_dentry; + exp = exp_get(fhp->fh_export); + + /* Lookup the name, but don't follow links */ + if (isdotent(name, len)) { + if (len==1) + dentry = dget(dparent); + else if (dparent != exp->ex_path.dentry) + dentry = dget_parent(dparent); + else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp)) + dentry = dget(dparent); /* .. == . just like at / */ + else { + /* checking mountpoint crossing is very different when stepping up */ + host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry); + if (host_err) + goto out_nfserr; + } + } else { + /* + * In the nfsd4_open() case, this may be held across + * subsequent open and delegation acquisition which may + * need to take the child's i_mutex: + */ + fh_lock_nested(fhp, I_MUTEX_PARENT); + dentry = lookup_one_len(name, dparent, len); + host_err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_nfserr; + /* + * check if we have crossed a mount point ... + */ + if (nfsd_mountpoint(dentry, exp)) { + if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { + dput(dentry); + goto out_nfserr; + } + } + } + *dentry_ret = dentry; + *exp_ret = exp; + return 0; + +out_nfserr: + exp_put(exp); + return nfserrno(host_err); +} + +/* + * Look up one component of a pathname. + * N.B. After this call _both_ fhp and resfh need an fh_put + * + * If the lookup would cross a mountpoint, and the mounted filesystem + * is exported to the client with NFSEXP_NOHIDE, then the lookup is + * accepted as it stands and the mounted directory is + * returned. Otherwise the covered directory is returned. + * NOTE: this mountpoint crossing is not supported properly by all + * clients and is explicitly disallowed for NFSv3 + * NeilBrown + */ +__be32 +nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, + unsigned int len, struct svc_fh *resfh) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; + err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); + if (err) + return err; + err = check_nfsd_access(exp, rqstp); + if (err) + goto out; + /* + * Note: we compose the file handle now, but as the + * dentry may be negative, it may need to be updated. + */ + err = fh_compose(resfh, exp, dentry, fhp); + if (!err && d_really_is_negative(dentry)) + err = nfserr_noent; +out: + dput(dentry); + exp_put(exp); + return err; +} + +/* + * Commit metadata changes to stable storage. + */ +static int +commit_metadata(struct svc_fh *fhp) +{ + struct inode *inode = d_inode(fhp->fh_dentry); + const struct export_operations *export_ops = inode->i_sb->s_export_op; + + if (!EX_ISSYNC(fhp->fh_export)) + return 0; + + if (export_ops->commit_metadata) + return export_ops->commit_metadata(inode); + return sync_inode_metadata(inode, 1); +} + +/* + * Go over the attributes and take care of the small differences between + * NFS semantics and what Linux expects. + */ +static void +nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) +{ + /* + * NFSv2 does not differentiate between "set-[ac]time-to-now" + * which only requires access, and "set-[ac]time-to-X" which + * requires ownership. + * So if it looks like it might be "set both to the same time which + * is close to now", and if inode_change_ok fails, then we + * convert to "set to now" instead of "set to explicit time" + * + * We only call inode_change_ok as the last test as technically + * it is not an interface that we should be using. + */ +#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) +#define MAX_TOUCH_TIME_ERROR (30*60) + if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && + iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { + /* + * Looks probable. + * + * Now just make sure time is in the right ballpark. + * Solaris, at least, doesn't seem to care what the time + * request is. We require it be within 30 minutes of now. + */ + time_t delta = iap->ia_atime.tv_sec - get_seconds(); + if (delta < 0) + delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR && + inode_change_ok(inode, iap) != 0) { + /* + * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. + * This will cause notify_change to set these times + * to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + } + } + + /* sanitize the mode change */ + if (iap->ia_valid & ATTR_MODE) { + iap->ia_mode &= S_IALLUGO; + iap->ia_mode |= (inode->i_mode & ~S_IALLUGO); + } + + /* Revoke setuid/setgid on chown */ + if (!S_ISDIR(inode->i_mode) && + ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { + iap->ia_valid |= ATTR_KILL_PRIV; + if (iap->ia_valid & ATTR_MODE) { + /* we're setting mode too, just clear the s*id bits */ + iap->ia_mode &= ~S_ISUID; + if (iap->ia_mode & S_IXGRP) + iap->ia_mode &= ~S_ISGID; + } else { + /* set ATTR_KILL_* bits and let VFS handle it */ + iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); + } + } +} + +static __be32 +nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct iattr *iap) +{ + struct inode *inode = d_inode(fhp->fh_dentry); + int host_err; + + if (iap->ia_size < inode->i_size) { + __be32 err; + + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); + if (err) + return err; + } + + host_err = get_write_access(inode); + if (host_err) + goto out_nfserrno; + + host_err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (host_err) + goto out_put_write_access; + return 0; + +out_put_write_access: + put_write_access(inode); +out_nfserrno: + return nfserrno(host_err); +} + +/* + * Set various file attributes. After this call fhp needs an fh_put. + */ +__be32 +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time_t guardtime) +{ + struct dentry *dentry; + struct inode *inode; + int accmode = NFSD_MAY_SATTR; + umode_t ftype = 0; + __be32 err; + int host_err; + bool get_write_count; + int size_change = 0; + + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; + if (iap->ia_valid & ATTR_SIZE) + ftype = S_IFREG; + + /* Callers that do fh_verify should do the fh_want_write: */ + get_write_count = !fhp->fh_dentry; + + /* Get inode */ + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err) + goto out; + if (get_write_count) { + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + } + + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + + /* Ignore any mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + + if (!iap->ia_valid) + goto out; + + nfsd_sanitize_attrs(inode, iap); + + /* + * The size case is special, it changes the file in addition to the + * attributes. + */ + if (iap->ia_valid & ATTR_SIZE) { + err = nfsd_get_write_access(rqstp, fhp, iap); + if (err) + goto out; + size_change = 1; + + /* + * RFC5661, Section 18.30.4: + * Changing the size of a file with SETATTR indirectly + * changes the time_modify and change attributes. + * + * (and similar for the older RFCs) + */ + if (iap->ia_size != i_size_read(inode)) + iap->ia_valid |= ATTR_MTIME; + } + + iap->ia_valid |= ATTR_CTIME; + + if (check_guard && guardtime != inode->i_ctime.tv_sec) { + err = nfserr_notsync; + goto out_put_write_access; + } + + fh_lock(fhp); + host_err = notify_change(dentry, iap, NULL); + fh_unlock(fhp); + err = nfserrno(host_err); + +out_put_write_access: + if (size_change) + put_write_access(inode); + if (!err) + err = nfserrno(commit_metadata(fhp)); +out: + return err; +} + +#if defined(CONFIG_NFSD_V4) +/* + * NFS junction information is stored in an extended attribute. + */ +#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs" + +/** + * nfsd4_is_junction - Test if an object could be an NFS junction + * + * @dentry: object to test + * + * Returns 1 if "dentry" appears to contain NFS junction information. + * Otherwise 0 is returned. + */ +int nfsd4_is_junction(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode == NULL) + return 0; + if (inode->i_mode & S_IXUGO) + return 0; + if (!(inode->i_mode & S_ISVTX)) + return 0; + if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) + return 0; + return 1; +} +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct xdr_netobj *label) +{ + __be32 error; + int host_error; + struct dentry *dentry; + + error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + + mutex_lock(&d_inode(dentry)->i_mutex); + host_error = security_inode_setsecctx(dentry, label->data, label->len); + mutex_unlock(&d_inode(dentry)->i_mutex); + return nfserrno(host_error); +} +#else +__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct xdr_netobj *label) +{ + return nfserr_notsupp; +} +#endif + +__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, loff_t len, + int flags) +{ + __be32 err; + int error; + + if (!S_ISREG(file_inode(file)->i_mode)) + return nfserr_inval; + + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE); + if (err) + return err; + + error = vfs_fallocate(file, flags, offset, len); + if (!error) + error = commit_metadata(fhp); + + return nfserrno(error); +} +#endif /* defined(CONFIG_NFSD_V4) */ + +#ifdef CONFIG_NFSD_V3 +/* + * Check server access rights to a file system object + */ +struct accessmap { + u32 access; + int how; +}; +static struct accessmap nfs3_regaccess[] = { + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, + + { 0, 0 } +}; + +static struct accessmap nfs3_diraccess[] = { + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC}, + { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, + { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, + + { 0, 0 } +}; + +static struct accessmap nfs3_anyaccess[] = { + /* Some clients - Solaris 2.6 at least, make an access call + * to the server to check for access for things like /dev/null + * (which really, the server doesn't care about). So + * We provide simple access checking for them, looking + * mainly at mode bits, and we make sure to ignore read-only + * filesystem checks + */ + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + + { 0, 0 } +}; + +__be32 +nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) +{ + struct accessmap *map; + struct svc_export *export; + struct dentry *dentry; + u32 query, result = 0, sresult = 0; + __be32 error; + + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (error) + goto out; + + export = fhp->fh_export; + dentry = fhp->fh_dentry; + + if (d_is_reg(dentry)) + map = nfs3_regaccess; + else if (d_is_dir(dentry)) + map = nfs3_diraccess; + else + map = nfs3_anyaccess; + + + query = *access; + for (; map->access; map++) { + if (map->access & query) { + __be32 err2; + + sresult |= map->access; + + err2 = nfsd_permission(rqstp, export, dentry, map->how); + switch (err2) { + case nfs_ok: + result |= map->access; + break; + + /* the following error codes just mean the access was not allowed, + * rather than an error occurred */ + case nfserr_rofs: + case nfserr_acces: + case nfserr_perm: + /* simply don't "or" in the access bit. */ + break; + default: + error = err2; + goto out; + } + } + } + *access = result; + if (supported) + *supported = sresult; + + out: + return error; +} +#endif /* CONFIG_NFSD_V3 */ + +static int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} + +/* + * Open an existing file or directory. + * The may_flags argument indicates the type of open (read/write/lock) + * and additional flags. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + struct path path; + struct inode *inode; + struct file *file; + int flags = O_RDONLY|O_LARGEFILE; + __be32 err; + int host_err = 0; + + validate_process_creds(); + + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_len that doesn't check permissions. + */ + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; + err = fh_verify(rqstp, fhp, type, may_flags); + if (err) + goto out; + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = fhp->fh_dentry; + inode = d_inode(path.dentry); + + /* Disallow write access to files with the append-only bit set + * or any access when mandatory locking enabled + */ + err = nfserr_perm; + if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) + goto out; + /* + * We must ignore files (but only files) which might have mandatory + * locks on them because there is no way to know if the accesser has + * the lock. + */ + if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) + goto out; + + if (!inode->i_fop) + goto out; + + host_err = nfsd_open_break_lease(inode, may_flags); + if (host_err) /* NOMEM or WOULDBLOCK */ + goto out_nfserr; + + if (may_flags & NFSD_MAY_WRITE) { + if (may_flags & NFSD_MAY_READ) + flags = O_RDWR|O_LARGEFILE; + else + flags = O_WRONLY|O_LARGEFILE; + } + + file = dentry_open(&path, flags, current_cred()); + if (IS_ERR(file)) { + host_err = PTR_ERR(file); + goto out_nfserr; + } + + host_err = ima_file_check(file, may_flags, 0); + if (host_err) { + nfsd_close(file); + goto out_nfserr; + } + + if (may_flags & NFSD_MAY_64BIT_COOKIE) + file->f_mode |= FMODE_64BITHASH; + else + file->f_mode |= FMODE_32BITHASH; + + *filp = file; +out_nfserr: + err = nfserrno(host_err); +out: + validate_process_creds(); + return err; +} + +/* + * Close a file. + */ +void +nfsd_close(struct file *filp) +{ + fput(filp); +} + +/* + * Obtain the readahead parameters for the file + * specified by (dev, ino). + */ + +static inline struct raparms * +nfsd_get_raparms(dev_t dev, ino_t ino) +{ + struct raparms *ra, **rap, **frap = NULL; + int depth = 0; + unsigned int hash; + struct raparm_hbucket *rab; + + hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; + rab = &raparm_hash[hash]; + + spin_lock(&rab->pb_lock); + for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { + if (ra->p_ino == ino && ra->p_dev == dev) + goto found; + depth++; + if (ra->p_count == 0) + frap = rap; + } + depth = nfsdstats.ra_size; + if (!frap) { + spin_unlock(&rab->pb_lock); + return NULL; + } + rap = frap; + ra = *frap; + ra->p_dev = dev; + ra->p_ino = ino; + ra->p_set = 0; + ra->p_hindex = hash; +found: + if (rap != &rab->pb_head) { + *rap = ra->p_next; + ra->p_next = rab->pb_head; + rab->pb_head = ra; + } + ra->p_count++; + nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; + spin_unlock(&rab->pb_lock); + return ra; +} + +/* + * Grab and keep cached pages associated with a file in the svc_rqst + * so that they can be passed to the network sendmsg/sendpage routines + * directly. They will be released after the sending has completed. + */ +static int +nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct svc_rqst *rqstp = sd->u.data; + struct page **pp = rqstp->rq_next_page; + struct page *page = buf->page; + size_t size; + + size = sd->len; + + if (rqstp->rq_res.page_len == 0) { + get_page(page); + put_page(*rqstp->rq_next_page); + *(rqstp->rq_next_page++) = page; + rqstp->rq_res.page_base = buf->offset; + rqstp->rq_res.page_len = size; + } else if (page != pp[-1]) { + get_page(page); + if (*rqstp->rq_next_page) + put_page(*rqstp->rq_next_page); + *(rqstp->rq_next_page++) = page; + rqstp->rq_res.page_len += size; + } else + rqstp->rq_res.page_len += size; + + return size; +} + +static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + return __splice_from_pipe(pipe, sd, nfsd_splice_actor); +} + +static __be32 +nfsd_finish_read(struct file *file, unsigned long *count, int host_err) +{ + if (host_err >= 0) { + nfsdstats.io_read += host_err; + *count = host_err; + fsnotify_access(file); + return 0; + } else + return nfserrno(host_err); +} + +__be32 nfsd_splice_read(struct svc_rqst *rqstp, + struct file *file, loff_t offset, unsigned long *count) +{ + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + int host_err; + + rqstp->rq_next_page = rqstp->rq_respages + 1; + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + return nfsd_finish_read(file, count, host_err); +} + +__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, + unsigned long *count) +{ + mm_segment_t oldfs; + int host_err; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + return nfsd_finish_read(file, count, host_err); +} + +static __be32 +nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) + return nfsd_splice_read(rqstp, file, offset, count); + else + return nfsd_readv(file, offset, vec, vlen, count); +} + +/* + * Gathered writes: If another process is currently writing to the file, + * there's a high chance this is another nfsd (triggered by a bulk write + * from a client's biod). Rather than syncing the file with each write + * request, we sleep for 10 msec. + * + * I don't know if this roughly approximates C. Juszak's idea of + * gathered writes, but it's a nice and simple solution (IMHO), and it + * seems to work:-) + * + * Note: we do this only in the NFSv2 case, since v3 and higher have a + * better tool (separate unstable writes and commits) for solving this + * problem. + */ +static int wait_for_concurrent_writes(struct file *file) +{ + struct inode *inode = file_inode(file); + static ino_t last_ino; + static dev_t last_dev; + int err = 0; + + if (atomic_read(&inode->i_writecount) > 1 + || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { + dprintk("nfsd: write defer %d\n", task_pid_nr(current)); + msleep(10); + dprintk("nfsd: write resume %d\n", task_pid_nr(current)); + } + + if (inode->i_state & I_DIRTY) { + dprintk("nfsd: write sync %d\n", task_pid_nr(current)); + err = vfs_fsync(file, 0); + } + last_ino = inode->i_ino; + last_dev = inode->i_sb->s_dev; + return err; +} + +static __be32 +nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, + unsigned long *cnt, int *stablep) +{ + struct svc_export *exp; + struct inode *inode; + mm_segment_t oldfs; + __be32 err = 0; + int host_err; + int stable = *stablep; + int use_wgather; + loff_t pos = offset; + loff_t end = LLONG_MAX; + unsigned int pflags = current->flags; + + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + /* + * We want less throttling in balance_dirty_pages() + * and shrink_inactive_list() so that nfs to + * localhost doesn't cause nfsd to lock up due to all + * the client's dirty pages or its congested queue. + */ + current->flags |= PF_LESS_THROTTLE; + + inode = file_inode(file); + exp = fhp->fh_export; + + use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); + + if (!EX_ISSYNC(exp)) + stable = 0; + + /* Write the data. */ + oldfs = get_fs(); set_fs(KERNEL_DS); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); + set_fs(oldfs); + if (host_err < 0) + goto out_nfserr; + *cnt = host_err; + nfsdstats.io_write += host_err; + fsnotify_modify(file); + + if (stable) { + if (use_wgather) { + host_err = wait_for_concurrent_writes(file); + } else { + if (*cnt) + end = offset + *cnt - 1; + host_err = vfs_fsync_range(file, offset, end, 0); + } + } + +out_nfserr: + dprintk("nfsd: write complete host_err=%d\n", host_err); + if (host_err >= 0) + err = 0; + else + err = nfserrno(host_err); + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); + return err; +} + +__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file **file, struct raparms **ra) +{ + struct inode *inode; + __be32 err; + + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file); + if (err) + return err; + + inode = file_inode(*file); + + /* Get readahead parameters */ + *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + + if (*ra && (*ra)->p_set) + (*file)->f_ra = (*ra)->p_ra; + return nfs_ok; +} + +void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra) +{ + /* Write back readahead params */ + if (ra) { + struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + spin_lock(&rab->pb_lock); + ra->p_ra = file->f_ra; + ra->p_set = 1; + ra->p_count--; + spin_unlock(&rab->pb_lock); + } + nfsd_close(file); +} + +/* + * Read data from a file. count must contain the requested read count + * on entry. On return, *count contains the number of bytes actually read. + * N.B. After this call fhp needs an fh_put + */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + struct file *file; + struct raparms *ra; + __be32 err; + + err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra); + if (err) + return err; + + err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + + nfsd_put_tmp_read_open(file, ra); + + return err; +} + +/* + * Write data to a file. + * The stable flag requests synchronous writes. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, + int *stablep) +{ + __be32 err = 0; + + if (file) { + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, + stablep); + } else { + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) + goto out; + + if (cnt) + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, + cnt, stablep); + nfsd_close(file); + } +out: + return err; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Commit all pending writes to stable storage. + * + * Note: we only guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. + * + * Unfortunately we cannot lock the file to make sure we return full WCC + * data to the client, as locking happens lower down in the filesystem. + */ +__be32 +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, unsigned long count) +{ + struct file *file; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; + + if (offset < 0) + goto out; + if (count != 0) { + end = offset + (loff_t)count - 1; + if (end < offset) + goto out; + } + + err = nfsd_open(rqstp, fhp, S_IFREG, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); + if (err) + goto out; + if (EX_ISSYNC(fhp->fh_export)) { + int err2 = vfs_fsync_range(file, offset, end, 0); + + if (err2 != -EINVAL) + err = nfserrno(err2); + else + err = nfserr_notsupp; + } + + nfsd_close(file); +out: + return err; +} +#endif /* CONFIG_NFSD_V3 */ + +static __be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, + struct iattr *iap) +{ + /* + * Mode has already been set earlier in create: + */ + iap->ia_valid &= ~ATTR_MODE; + /* + * Setting uid/gid works only for root. Irix appears to + * send along the gid on create when it tries to implement + * setgid directories via NFS: + */ + if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) + iap->ia_valid &= ~(ATTR_UID|ATTR_GID); + if (iap->ia_valid) + return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); + /* Callers expect file metadata to be committed here */ + return nfserrno(commit_metadata(resfhp)); +} + +/* HPUX client sometimes creates a file in mode 000, and sets size to 0. + * setting size to 0 may fail for some specific file systems by the permission + * checking which requires WRITE permission but the mode is 000. + * we ignore the resizing(to 0) on the just new created file, since the size is + * 0 after file created. + * + * call this only after vfs_create() is called. + * */ +static void +nfsd_check_ignore_resizing(struct iattr *iap) +{ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; +} + +/* + * Create a file (regular, directory, device, fifo); UNIX sockets + * not yet implemented. + * If the response fh has been verified, the parent directory should + * already be locked. Note that the parent directory is left locked. + * + * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp + */ +__be32 +nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + __be32 err2; + int host_err; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + err = nfserr_notdir; + if (!dirp->i_op->lookup) + goto out; + /* + * Check whether the response file handle has been verified yet. + * If it has, the parent directory should already be locked. + */ + if (!resfhp->fh_dentry) { + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + + /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ + fh_lock_nested(fhp, I_MUTEX_PARENT); + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + } else { + /* called from nfsd_proc_create */ + dchild = dget(resfhp->fh_dentry); + if (!fhp->fh_locked) { + /* not actually possible */ + printk(KERN_ERR + "nfsd_create: parent %pd2 not locked!\n", + dentry); + err = nfserr_io; + goto out; + } + } + /* + * Make sure the child dentry is still negative ... + */ + err = nfserr_exist; + if (d_really_is_positive(dchild)) { + dprintk("nfsd_create: dentry %pd/%pd not negative!\n", + dentry, dchild); + goto out; + } + + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + + err = nfserr_inval; + if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + goto out; + } + + /* + * Get the dir op function pointer. + */ + err = 0; + host_err = 0; + switch (type) { + case S_IFREG: + host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + if (!host_err) + nfsd_check_ignore_resizing(iap); + break; + case S_IFDIR: + host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + break; + } + if (host_err < 0) + goto out_nfserr; + + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_create_setattr already committed the child. Transactional + * filesystems had a chance to commit changes for both parent and + * child * simultaneously making the following commit_metadata a + * noop. + */ + err2 = nfserrno(commit_metadata(fhp)); + if (err2) + err = err2; + /* + * Update the file handle to get the new inode info. + */ + if (!err) + err = fh_update(resfhp); +out: + if (dchild && !IS_ERR(dchild)) + dput(dchild); + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +#ifdef CONFIG_NFSD_V3 + +static inline int nfsd_create_is_exclusive(int createmode) +{ + return createmode == NFS3_CREATE_EXCLUSIVE + || createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + +/* + * NFSv3 and NFSv4 version of nfsd_create + */ +__be32 +do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + struct svc_fh *resfhp, int createmode, u32 *verifier, + bool *truncp, bool *created) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + int host_err; + __u32 v_mtime=0, v_atime=0; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + /* Get all the sanity checks out of the way before + * we lock the parent. */ + err = nfserr_notdir; + if (!dirp->i_op->lookup) + goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + + fh_lock_nested(fhp, I_MUTEX_PARENT); + + /* + * Compose the response file handle. + */ + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + + /* If file doesn't exist, check for permissions to create one */ + if (d_really_is_negative(dchild)) { + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + } + + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + + if (nfsd_create_is_exclusive(createmode)) { + /* solaris7 gets confused (bugid 4218508) if these have + * the high bit set, so just clear the high bits. If this is + * ever changed to use different attrs for storing the + * verifier, then do_open_lookup() will also need to be fixed + * accordingly. + */ + v_mtime = verifier[0]&0x7fffffff; + v_atime = verifier[1]&0x7fffffff; + } + + if (d_really_is_positive(dchild)) { + err = 0; + + switch (createmode) { + case NFS3_CREATE_UNCHECKED: + if (! d_is_reg(dchild)) + goto out; + else if (truncp) { + /* in nfsv4, we need to treat this case a little + * differently. we don't want to truncate the + * file now; this would be wrong if the OPEN + * fails for some other reason. furthermore, + * if the size is nonzero, we should ignore it + * according to spec! + */ + *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; + } + else { + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + } + break; + case NFS3_CREATE_EXCLUSIVE: + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { + if (created) + *created = 1; + break; + } + case NFS4_CREATE_EXCLUSIVE4_1: + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { + if (created) + *created = 1; + goto set_attr; + } + /* fallthru */ + case NFS3_CREATE_GUARDED: + err = nfserr_exist; + } + fh_drop_write(fhp); + goto out; + } + + host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + if (host_err < 0) { + fh_drop_write(fhp); + goto out_nfserr; + } + if (created) + *created = 1; + + nfsd_check_ignore_resizing(iap); + + if (nfsd_create_is_exclusive(createmode)) { + /* Cram the verifier into atime/mtime */ + iap->ia_valid = ATTR_MTIME|ATTR_ATIME + | ATTR_MTIME_SET|ATTR_ATIME_SET; + /* XXX someone who knows this better please fix it for nsec */ + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + + set_attr: + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_create_setattr already committed the child + * (and possibly also the parent). + */ + if (!err) + err = nfserrno(commit_metadata(fhp)); + + /* + * Update the filehandle to get the new inode info. + */ + if (!err) + err = fh_update(resfhp); + + out: + fh_unlock(fhp); + if (dchild && !IS_ERR(dchild)) + dput(dchild); + fh_drop_write(fhp); + return err; + + out_nfserr: + err = nfserrno(host_err); + goto out; +} +#endif /* CONFIG_NFSD_V3 */ + +/* + * Read a symlink. On entry, *lenp must contain the maximum path length that + * fits into the buffer. On return, it contains the true length. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) +{ + struct inode *inode; + mm_segment_t oldfs; + __be32 err; + int host_err; + struct path path; + + err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); + if (err) + goto out; + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = fhp->fh_dentry; + inode = d_inode(path.dentry); + + err = nfserr_inval; + if (!inode->i_op->readlink) + goto out; + + touch_atime(&path); + /* N.B. Why does this call need a get_fs()?? + * Remove the set_fs and watch the fireworks:-) --okir + */ + + oldfs = get_fs(); set_fs(KERNEL_DS); + host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp); + set_fs(oldfs); + + if (host_err < 0) + goto out_nfserr; + *lenp = host_err; + err = 0; +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +/* + * Create a symlink and look up its inode + * N.B. After this call _both_ fhp and resfhp need an fh_put + */ +__be32 +nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, + char *path, + struct svc_fh *resfhp) +{ + struct dentry *dentry, *dnew; + __be32 err, cerr; + int host_err; + + err = nfserr_noent; + if (!flen || path[0] == '\0') + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + + fh_lock(fhp); + dentry = fhp->fh_dentry; + dnew = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + host_err = vfs_symlink(d_inode(dentry), dnew, path); + err = nfserrno(host_err); + if (!err) + err = nfserrno(commit_metadata(fhp)); + fh_unlock(fhp); + + fh_drop_write(fhp); + + cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); + dput(dnew); + if (err==0) err = cerr; +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +/* + * Create a hardlink + * N.B. After this call _both_ ffhp and tfhp need an fh_put + */ +__be32 +nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, + char *name, int len, struct svc_fh *tfhp) +{ + struct dentry *ddir, *dnew, *dold; + struct inode *dirp; + __be32 err; + int host_err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); + if (err) + goto out; + err = nfserr_isdir; + if (d_is_dir(tfhp->fh_dentry)) + goto out; + err = nfserr_perm; + if (!len) + goto out; + err = nfserr_exist; + if (isdotent(name, len)) + goto out; + + host_err = fh_want_write(tfhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + + fh_lock_nested(ffhp, I_MUTEX_PARENT); + ddir = ffhp->fh_dentry; + dirp = d_inode(ddir); + + dnew = lookup_one_len(name, ddir, len); + host_err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + dold = tfhp->fh_dentry; + + err = nfserr_noent; + if (d_really_is_negative(dold)) + goto out_dput; + host_err = vfs_link(dold, dirp, dnew, NULL); + if (!host_err) { + err = nfserrno(commit_metadata(ffhp)); + if (!err) + err = nfserrno(commit_metadata(tfhp)); + } else { + if (host_err == -EXDEV && rqstp->rq_vers == 2) + err = nfserr_acces; + else + err = nfserrno(host_err); + } +out_dput: + dput(dnew); +out_unlock: + fh_unlock(ffhp); + fh_drop_write(tfhp); +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out_unlock; +} + +/* + * Rename a file + * N.B. After this call _both_ ffhp and tfhp need an fh_put + */ +__be32 +nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, + struct svc_fh *tfhp, char *tname, int tlen) +{ + struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; + struct inode *fdir, *tdir; + __be32 err; + int host_err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + fdentry = ffhp->fh_dentry; + fdir = d_inode(fdentry); + + tdentry = tfhp->fh_dentry; + tdir = d_inode(tdentry); + + err = nfserr_perm; + if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) + goto out; + + host_err = fh_want_write(ffhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + + /* cannot use fh_lock as we need deadlock protective ordering + * so do it by hand */ + trap = lock_rename(tdentry, fdentry); + ffhp->fh_locked = tfhp->fh_locked = 1; + fill_pre_wcc(ffhp); + fill_pre_wcc(tfhp); + + odentry = lookup_one_len(fname, fdentry, flen); + host_err = PTR_ERR(odentry); + if (IS_ERR(odentry)) + goto out_nfserr; + + host_err = -ENOENT; + if (d_really_is_negative(odentry)) + goto out_dput_old; + host_err = -EINVAL; + if (odentry == trap) + goto out_dput_old; + + ndentry = lookup_one_len(tname, tdentry, tlen); + host_err = PTR_ERR(ndentry); + if (IS_ERR(ndentry)) + goto out_dput_old; + host_err = -ENOTEMPTY; + if (ndentry == trap) + goto out_dput_new; + + host_err = -EXDEV; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out_dput_new; + if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) + goto out_dput_new; + + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } + out_dput_new: + dput(ndentry); + out_dput_old: + dput(odentry); + out_nfserr: + err = nfserrno(host_err); + /* + * We cannot rely on fh_unlock on the two filehandles, + * as that would do the wrong thing if the two directories + * were the same, so again we do it by hand. + */ + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + unlock_rename(tdentry, fdentry); + ffhp->fh_locked = tfhp->fh_locked = 0; + fh_drop_write(ffhp); + +out: + return err; +} + +/* + * Unlink a file or directory + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, + char *fname, int flen) +{ + struct dentry *dentry, *rdentry; + struct inode *dirp; + __be32 err; + int host_err; + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) + goto out; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) + goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + + fh_lock_nested(fhp, I_MUTEX_PARENT); + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + rdentry = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(rdentry); + if (IS_ERR(rdentry)) + goto out_nfserr; + + if (d_really_is_negative(rdentry)) { + dput(rdentry); + err = nfserr_noent; + goto out; + } + + if (!type) + type = d_inode(rdentry)->i_mode & S_IFMT; + + if (type != S_IFDIR) + host_err = vfs_unlink(dirp, rdentry, NULL); + else + host_err = vfs_rmdir(dirp, rdentry); + if (!host_err) + host_err = commit_metadata(fhp); + dput(rdentry); + +out_nfserr: + err = nfserrno(host_err); +out: + return err; +} + +/* + * We do this buffering because we must not call back into the file + * system's ->lookup() method from the filldir callback. That may well + * deadlock a number of file systems. + * + * This is based heavily on the implementation of same in XFS. + */ +struct buffered_dirent { + u64 ino; + loff_t offset; + int namlen; + unsigned int d_type; + char name[]; +}; + +struct readdir_data { + struct dir_context ctx; + char *dirent; + size_t used; + int full; +}; + +static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct readdir_data *buf = + container_of(ctx, struct readdir_data, ctx); + struct buffered_dirent *de = (void *)(buf->dirent + buf->used); + unsigned int reclen; + + reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); + if (buf->used + reclen > PAGE_SIZE) { + buf->full = 1; + return -EINVAL; + } + + de->namlen = namlen; + de->offset = offset; + de->ino = ino; + de->d_type = d_type; + memcpy(de->name, name, namlen); + buf->used += reclen; + + return 0; +} + +static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, + struct readdir_cd *cdp, loff_t *offsetp) +{ + struct buffered_dirent *de; + int host_err; + int size; + loff_t offset; + struct readdir_data buf = { + .ctx.actor = nfsd_buffered_filldir, + .dirent = (void *)__get_free_page(GFP_KERNEL) + }; + + if (!buf.dirent) + return nfserrno(-ENOMEM); + + offset = *offsetp; + + while (1) { + struct inode *dir_inode = file_inode(file); + unsigned int reclen; + + cdp->err = nfserr_eof; /* will be cleared on successful read */ + buf.used = 0; + buf.full = 0; + + host_err = iterate_dir(file, &buf.ctx); + if (buf.full) + host_err = 0; + + if (host_err < 0) + break; + + size = buf.used; + + if (!size) + break; + + /* + * Various filldir functions may end up calling back into + * lookup_one_len() and the file system's ->lookup() method. + * These expect i_mutex to be held, as it would within readdir. + */ + host_err = mutex_lock_killable(&dir_inode->i_mutex); + if (host_err) + break; + + de = (struct buffered_dirent *)buf.dirent; + while (size > 0) { + offset = de->offset; + + if (func(cdp, de->name, de->namlen, de->offset, + de->ino, de->d_type)) + break; + + if (cdp->err != nfs_ok) + break; + + reclen = ALIGN(sizeof(*de) + de->namlen, + sizeof(u64)); + size -= reclen; + de = (struct buffered_dirent *)((char *)de + reclen); + } + mutex_unlock(&dir_inode->i_mutex); + if (size > 0) /* We bailed out early */ + break; + + offset = vfs_llseek(file, 0, SEEK_CUR); + } + + free_page((unsigned long)(buf.dirent)); + + if (host_err) + return nfserrno(host_err); + + *offsetp = offset; + return cdp->err; +} + +/* + * Read entries from a directory. + * The NFSv3/4 verifier we ignore for now. + */ +__be32 +nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, + struct readdir_cd *cdp, nfsd_filldir_t func) +{ + __be32 err; + struct file *file; + loff_t offset = *offsetp; + int may_flags = NFSD_MAY_READ; + + /* NFSv2 only supports 32 bit cookies */ + if (rqstp->rq_vers > 2) + may_flags |= NFSD_MAY_64BIT_COOKIE; + + err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); + if (err) + goto out; + + offset = vfs_llseek(file, offset, SEEK_SET); + if (offset < 0) { + err = nfserrno((int)offset); + goto out_close; + } + + err = nfsd_buffered_readdir(file, func, cdp, offsetp); + + if (err == nfserr_eof || err == nfserr_toosmall) + err = nfs_ok; /* can still be found in ->err */ +out_close: + nfsd_close(file); +out: + return err; +} + +/* + * Get file system stats + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) +{ + __be32 err; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); + if (!err) { + struct path path = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + if (vfs_statfs(&path, stat)) + err = nfserr_io; + } + return err; +} + +static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp) +{ + return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY; +} + +/* + * Check for a user's access permissions to this inode. + */ +__be32 +nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, + struct dentry *dentry, int acc) +{ + struct inode *inode = d_inode(dentry); + int err; + + if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) + return 0; +#if 0 + dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", + acc, + (acc & NFSD_MAY_READ)? " read" : "", + (acc & NFSD_MAY_WRITE)? " write" : "", + (acc & NFSD_MAY_EXEC)? " exec" : "", + (acc & NFSD_MAY_SATTR)? " sattr" : "", + (acc & NFSD_MAY_TRUNC)? " trunc" : "", + (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", + inode->i_mode, + IS_IMMUTABLE(inode)? " immut" : "", + IS_APPEND(inode)? " append" : "", + __mnt_is_readonly(exp->ex_path.mnt)? " ro" : ""); + dprintk(" owner %d/%d user %d/%d\n", + inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid()); +#endif + + /* Normally we reject any write/sattr etc access on a read-only file + * system. But if it is IRIX doing check on write-access for a + * device special file, we ignore rofs. + */ + if (!(acc & NFSD_MAY_LOCAL_ACCESS)) + if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { + if (exp_rdonly(rqstp, exp) || + __mnt_is_readonly(exp->ex_path.mnt)) + return nfserr_rofs; + if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) + return nfserr_perm; + } + if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) + return nfserr_perm; + + if (acc & NFSD_MAY_LOCK) { + /* If we cannot rely on authentication in NLM requests, + * just allow locks, otherwise require read permission, or + * ownership + */ + if (exp->ex_flags & NFSEXP_NOAUTHNLM) + return 0; + else + acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; + } + /* + * The file owner always gets access permission for accesses that + * would normally be checked at open time. This is to make + * file access work even when the client has done a fchmod(fd, 0). + * + * However, `cp foo bar' should fail nevertheless when bar is + * readonly. A sensible way to do this might be to reject all + * attempts to truncate a read-only file, because a creat() call + * always implies file truncation. + * ... but this isn't really fair. A process may reasonably call + * ftruncate on an open file descriptor on a file with perm 000. + * We must trust the client to do permission checking - using "ACCESS" + * with NFSv3. + */ + if ((acc & NFSD_MAY_OWNER_OVERRIDE) && + uid_eq(inode->i_uid, current_fsuid())) + return 0; + + /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ + err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + + /* Allow read access to binaries even when mode 111 */ + if (err == -EACCES && S_ISREG(inode->i_mode) && + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) + err = inode_permission(inode, MAY_EXEC); + + return err? nfserrno(err) : 0; +} + +void +nfsd_racache_shutdown(void) +{ + struct raparms *raparm, *last_raparm; + unsigned int i; + + dprintk("nfsd: freeing readahead buffers.\n"); + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + raparm = raparm_hash[i].pb_head; + while(raparm) { + last_raparm = raparm; + raparm = raparm->p_next; + kfree(last_raparm); + } + raparm_hash[i].pb_head = NULL; + } +} +/* + * Initialize readahead param cache + */ +int +nfsd_racache_init(int cache_size) +{ + int i; + int j = 0; + int nperbucket; + struct raparms **raparm = NULL; + + + if (raparm_hash[0].pb_head) + return 0; + nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + nperbucket = max(2, nperbucket); + cache_size = nperbucket * RAPARM_HASH_SIZE; + + dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + spin_lock_init(&raparm_hash[i].pb_lock); + + raparm = &raparm_hash[i].pb_head; + for (j = 0; j < nperbucket; j++) { + *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); + if (!*raparm) + goto out_nomem; + raparm = &(*raparm)->p_next; + } + *raparm = NULL; + } + + nfsdstats.ra_size = cache_size; + return 0; + +out_nomem: + dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); + nfsd_racache_shutdown(); + return -ENOMEM; +} diff --git a/kernel/fs/nfsd/vfs.h b/kernel/fs/nfsd/vfs.h new file mode 100644 index 000000000..2050cb016 --- /dev/null +++ b/kernel/fs/nfsd/vfs.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_VFS_H +#define LINUX_NFSD_VFS_H + +#include "nfsfh.h" +#include "nfsd.h" + +/* + * Flags for nfsd_permission + */ +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */ +#define NFSD_MAY_READ 0x004 /* == MAY_READ */ +#define NFSD_MAY_SATTR 0x008 +#define NFSD_MAY_TRUNC 0x010 +#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_MASK 0x03f + +/* extra hints to permission and open routines: */ +#define NFSD_MAY_OWNER_OVERRIDE 0x040 +#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100 +#define NFSD_MAY_NOT_BREAK_LEASE 0x200 +#define NFSD_MAY_BYPASS_GSS 0x400 +#define NFSD_MAY_READ_IF_EXEC 0x800 + +#define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */ + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) + +/* + * Callback function for readdir + */ +typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); + +/* nfsd/vfs.c */ +int nfsd_racache_init(int); +void nfsd_racache_shutdown(void); +int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, + struct svc_export **expp); +__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, struct svc_fh *); +__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, + struct svc_export **, struct dentry **); +__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); +int nfsd_mountpoint(struct dentry *, struct svc_export *); +#ifdef CONFIG_NFSD_V4 +__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, + struct xdr_netobj *); +__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, + struct file *, loff_t, loff_t, int); +#endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, bool *truncp, bool *created); +__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, + loff_t, unsigned long); +#endif /* CONFIG_NFSD_V3 */ +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, + int, struct file **); +void nfsd_close(struct file *); +struct raparms; +__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, + struct file **, struct raparms **); +void nfsd_put_tmp_read_open(struct file *, struct raparms *); +__be32 nfsd_splice_read(struct svc_rqst *, + struct file *, loff_t, unsigned long *); +__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, + unsigned long *); +__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, + loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, + struct svc_fh *res); +__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +__be32 nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, nfsd_filldir_t); +__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct kstatfs *, int access); + +__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, + struct dentry *, int); + +static inline int fh_want_write(struct svc_fh *fh) +{ + int ret = mnt_want_write(fh->fh_export->ex_path.mnt); + + if (!ret) + fh->fh_want_write = 1; + return ret; +} + +static inline void fh_drop_write(struct svc_fh *fh) +{ + if (fh->fh_want_write) { + fh->fh_want_write = 0; + mnt_drop_write(fh->fh_export->ex_path.mnt); + } +} + +static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) +{ + struct path p = {.mnt = fh->fh_export->ex_path.mnt, + .dentry = fh->fh_dentry}; + return nfserrno(vfs_getattr(&p, stat)); +} + +#endif /* LINUX_NFSD_VFS_H */ diff --git a/kernel/fs/nfsd/xdr.h b/kernel/fs/nfsd/xdr.h new file mode 100644 index 000000000..4f0481d63 --- /dev/null +++ b/kernel/fs/nfsd/xdr.h @@ -0,0 +1,173 @@ +/* XDR types for nfsd. This is mainly a typing exercise. */ + +#ifndef LINUX_NFSD_H +#define LINUX_NFSD_H + +#include +#include "nfsd.h" +#include "nfsfh.h" + +struct nfsd_fhandle { + struct svc_fh fh; +}; + +struct nfsd_sattrargs { + struct svc_fh fh; + struct iattr attrs; +}; + +struct nfsd_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd_readargs { + struct svc_fh fh; + __u32 offset; + __u32 count; + int vlen; +}; + +struct nfsd_writeargs { + svc_fh fh; + __u32 offset; + int len; + int vlen; +}; + +struct nfsd_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + struct iattr attrs; +}; + +struct nfsd_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd_readdirargs { + struct svc_fh fh; + __u32 cookie; + __u32 count; + __be32 * buffer; +}; + +struct nfsd_attrstat { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_diropres { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_readlinkres { + int len; +}; + +struct nfsd_readres { + struct svc_fh fh; + unsigned long count; + struct kstat stat; +}; + +struct nfsd_readdirres { + int count; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd_statfsres { + struct kstatfs stats; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd_xdrstore { + struct nfsd_sattrargs sattr; + struct nfsd_diropargs dirop; + struct nfsd_readargs read; + struct nfsd_writeargs write; + struct nfsd_createargs create; + struct nfsd_renameargs rename; + struct nfsd_linkargs link; + struct nfsd_symlinkargs symlink; + struct nfsd_readdirargs readdir; +}; + +#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) + + +int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd_sattrargs *); +int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd_diropargs *); +int nfssvc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd_readargs *); +int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd_writeargs *); +int nfssvc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd_createargs *); +int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd_renameargs *); +int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_readlinkargs *); +int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd_linkargs *); +int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_symlinkargs *); +int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd_readdirargs *); +int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *); +int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *); +int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *); +int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *); +int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *); +int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *); + +int nfssvc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int); + +int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); + +/* Helper functions for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); + +#endif /* LINUX_NFSD_H */ diff --git a/kernel/fs/nfsd/xdr3.h b/kernel/fs/nfsd/xdr3.h new file mode 100644 index 000000000..335e04aaf --- /dev/null +++ b/kernel/fs/nfsd/xdr3.h @@ -0,0 +1,349 @@ +/* + * XDR types for NFSv3 in nfsd. + * + * Copyright (C) 1996-1998, Olaf Kirch + */ + +#ifndef _LINUX_NFSD_XDR3_H +#define _LINUX_NFSD_XDR3_H + +#include "xdr.h" + +struct nfsd3_sattrargs { + struct svc_fh fh; + struct iattr attrs; + int check_guard; + time_t guardtime; +}; + +struct nfsd3_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd3_accessargs { + struct svc_fh fh; + unsigned int access; +}; + +struct nfsd3_readargs { + struct svc_fh fh; + __u64 offset; + __u32 count; + int vlen; +}; + +struct nfsd3_writeargs { + svc_fh fh; + __u64 offset; + __u32 count; + int stable; + __u32 len; + int vlen; +}; + +struct nfsd3_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + int createmode; + struct iattr attrs; + __be32 * verf; +}; + +struct nfsd3_mknodargs { + struct svc_fh fh; + char * name; + unsigned int len; + __u32 ftype; + __u32 major, minor; + struct iattr attrs; +}; + +struct nfsd3_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd3_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd3_readdirargs { + struct svc_fh fh; + __u64 cookie; + __u32 dircount; + __u32 count; + __be32 * verf; + __be32 * buffer; +}; + +struct nfsd3_commitargs { + struct svc_fh fh; + __u64 offset; + __u32 count; +}; + +struct nfsd3_getaclargs { + struct svc_fh fh; + int mask; +}; + +struct posix_acl; +struct nfsd3_setaclargs { + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +struct nfsd3_attrstat { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ +struct nfsd3_diropres { + __be32 status; + struct svc_fh dirfh; + struct svc_fh fh; +}; + +struct nfsd3_accessres { + __be32 status; + struct svc_fh fh; + __u32 access; + struct kstat stat; +}; + +struct nfsd3_readlinkres { + __be32 status; + struct svc_fh fh; + __u32 len; +}; + +struct nfsd3_readres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int eof; +}; + +struct nfsd3_writeres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int committed; +}; + +struct nfsd3_renameres { + __be32 status; + struct svc_fh ffh; + struct svc_fh tfh; +}; + +struct nfsd3_linkres { + __be32 status; + struct svc_fh tfh; + struct svc_fh fh; +}; + +struct nfsd3_readdirres { + __be32 status; + struct svc_fh fh; + /* Just to save kmalloc on every readdirplus entry (svc_fh is a + * little large for the stack): */ + struct svc_fh scratch; + int count; + __be32 verf[2]; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; + __be32 * offset1; + struct svc_rqst * rqstp; + +}; + +struct nfsd3_fsstatres { + __be32 status; + struct kstatfs stats; + __u32 invarsec; +}; + +struct nfsd3_fsinfores { + __be32 status; + __u32 f_rtmax; + __u32 f_rtpref; + __u32 f_rtmult; + __u32 f_wtmax; + __u32 f_wtpref; + __u32 f_wtmult; + __u32 f_dtpref; + __u64 f_maxfilesize; + __u32 f_properties; +}; + +struct nfsd3_pathconfres { + __be32 status; + __u32 p_link_max; + __u32 p_name_max; + __u32 p_no_trunc; + __u32 p_chown_restricted; + __u32 p_case_insensitive; + __u32 p_case_preserving; +}; + +struct nfsd3_commitres { + __be32 status; + struct svc_fh fh; +}; + +struct nfsd3_getaclres { + __be32 status; + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; + struct kstat stat; +}; + +/* dummy type for release */ +struct nfsd3_fhandle_pair { + __u32 dummy; + struct svc_fh fh1; + struct svc_fh fh2; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd3_xdrstore { + struct nfsd3_sattrargs sattrargs; + struct nfsd3_diropargs diropargs; + struct nfsd3_readargs readargs; + struct nfsd3_writeargs writeargs; + struct nfsd3_createargs createargs; + struct nfsd3_renameargs renameargs; + struct nfsd3_linkargs linkargs; + struct nfsd3_symlinkargs symlinkargs; + struct nfsd3_readdirargs readdirargs; + struct nfsd3_diropres diropres; + struct nfsd3_accessres accessres; + struct nfsd3_readlinkres readlinkres; + struct nfsd3_readres readres; + struct nfsd3_writeres writeres; + struct nfsd3_renameres renameres; + struct nfsd3_linkres linkres; + struct nfsd3_readdirres readdirres; + struct nfsd3_fsstatres fsstatres; + struct nfsd3_fsinfores fsinfores; + struct nfsd3_pathconfres pathconfres; + struct nfsd3_commitres commitres; + struct nfsd3_getaclres getaclres; +}; + +#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) + +int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd3_sattrargs *); +int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd3_diropargs *); +int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *, + struct nfsd3_accessargs *); +int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd3_readargs *); +int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd3_writeargs *); +int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *, + struct nfsd3_mknodargs *); +int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd3_renameargs *); +int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkargs *); +int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd3_linkargs *); +int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_symlinkargs *); +int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *, + struct nfsd3_commitargs *); +int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *, + struct nfsd3_accessres *); +int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkres *); +int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *); +int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *); +int nfs3svc_encode_createres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *, + struct nfsd3_renameres *); +int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *, + struct nfsd3_linkres *); +int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *, + struct nfsd3_readdirres *); +int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *, + struct nfsd3_fsstatres *); +int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *, + struct nfsd3_fsinfores *); +int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *, + struct nfsd3_pathconfres *); +int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *, + struct nfsd3_commitres *); + +int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *, + struct nfsd3_fhandle_pair *); +int nfs3svc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +int nfs3svc_encode_entry_plus(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +/* Helper functions for NFSv3 ACL code */ +__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, + struct svc_fh *fhp); +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); + + +#endif /* _LINUX_NFSD_XDR3_H */ diff --git a/kernel/fs/nfsd/xdr4.h b/kernel/fs/nfsd/xdr4.h new file mode 100644 index 000000000..2f8c092be --- /dev/null +++ b/kernel/fs/nfsd/xdr4.h @@ -0,0 +1,724 @@ +/* + * Server-side types for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _LINUX_NFSD_XDR4_H +#define _LINUX_NFSD_XDR4_H + +#include "state.h" +#include "nfsd.h" + +#define NFSD4_MAX_TAGLEN 128 +#define XDR_LEN(n) (((n) + 3) & ~3) + +#define CURRENT_STATE_ID_FLAG (1<<0) +#define SAVED_STATE_ID_FLAG (1<<1) + +#define SET_STATE_ID(c, f) ((c)->sid_flags |= (f)) +#define HAS_STATE_ID(c, f) ((c)->sid_flags & (f)) +#define CLEAR_STATE_ID(c, f) ((c)->sid_flags &= ~(f)) + +struct nfsd4_compound_state { + struct svc_fh current_fh; + struct svc_fh save_fh; + struct nfs4_stateowner *replay_owner; + struct nfs4_client *clp; + /* For sessions DRC */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; + int data_offset; + size_t iovlen; + u32 minorversion; + __be32 status; + stateid_t current_stateid; + stateid_t save_stateid; + /* to indicate current and saved state id presents */ + u32 sid_flags; +}; + +static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) +{ + return cs->slot != NULL; +} + +struct nfsd4_change_info { + u32 atomic; + bool change_supported; + u32 before_ctime_sec; + u32 before_ctime_nsec; + u64 before_change; + u32 after_ctime_sec; + u32 after_ctime_nsec; + u64 after_change; +}; + +struct nfsd4_access { + u32 ac_req_access; /* request */ + u32 ac_supported; /* response */ + u32 ac_resp_access; /* response */ +}; + +struct nfsd4_close { + u32 cl_seqid; /* request */ + stateid_t cl_stateid; /* request+response */ +}; + +struct nfsd4_commit { + u64 co_offset; /* request */ + u32 co_count; /* request */ + nfs4_verifier co_verf; /* response */ +}; + +struct nfsd4_create { + u32 cr_namelen; /* request */ + char * cr_name; /* request */ + u32 cr_type; /* request */ + union { /* request */ + struct { + u32 datalen; + char *data; + } link; /* NF4LNK */ + struct { + u32 specdata1; + u32 specdata2; + } dev; /* NF4BLK, NF4CHR */ + } u; + u32 cr_bmval[3]; /* request */ + struct iattr cr_iattr; /* request */ + struct nfsd4_change_info cr_cinfo; /* response */ + struct nfs4_acl *cr_acl; + struct xdr_netobj cr_label; +}; +#define cr_datalen u.link.datalen +#define cr_data u.link.data +#define cr_specdata1 u.dev.specdata1 +#define cr_specdata2 u.dev.specdata2 + +struct nfsd4_delegreturn { + stateid_t dr_stateid; +}; + +struct nfsd4_getattr { + u32 ga_bmval[3]; /* request */ + struct svc_fh *ga_fhp; /* response */ +}; + +struct nfsd4_link { + u32 li_namelen; /* request */ + char * li_name; /* request */ + struct nfsd4_change_info li_cinfo; /* response */ +}; + +struct nfsd4_lock_denied { + clientid_t ld_clientid; + struct xdr_netobj ld_owner; + u64 ld_start; + u64 ld_length; + u32 ld_type; +}; + +struct nfsd4_lock { + /* request */ + u32 lk_type; + u32 lk_reclaim; /* boolean */ + u64 lk_offset; + u64 lk_length; + u32 lk_is_new; + union { + struct { + u32 open_seqid; + stateid_t open_stateid; + u32 lock_seqid; + clientid_t clientid; + struct xdr_netobj owner; + } new; + struct { + stateid_t lock_stateid; + u32 lock_seqid; + } old; + } v; + + /* response */ + union { + struct { + stateid_t stateid; + } ok; + struct nfsd4_lock_denied denied; + } u; +}; +#define lk_new_open_seqid v.new.open_seqid +#define lk_new_open_stateid v.new.open_stateid +#define lk_new_lock_seqid v.new.lock_seqid +#define lk_new_clientid v.new.clientid +#define lk_new_owner v.new.owner +#define lk_old_lock_stateid v.old.lock_stateid +#define lk_old_lock_seqid v.old.lock_seqid + +#define lk_resp_stateid u.ok.stateid +#define lk_denied u.denied + + +struct nfsd4_lockt { + u32 lt_type; + clientid_t lt_clientid; + struct xdr_netobj lt_owner; + u64 lt_offset; + u64 lt_length; + struct nfsd4_lock_denied lt_denied; +}; + + +struct nfsd4_locku { + u32 lu_type; + u32 lu_seqid; + stateid_t lu_stateid; + u64 lu_offset; + u64 lu_length; +}; + + +struct nfsd4_lookup { + u32 lo_len; /* request */ + char * lo_name; /* request */ +}; + +struct nfsd4_putfh { + u32 pf_fhlen; /* request */ + char *pf_fhval; /* request */ +}; + +struct nfsd4_open { + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ + stateid_t op_delegate_stateid; /* request - response */ + u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + u32 op_bmval[3]; /* request */ + struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + nfs4_verifier op_verf __attribute__((aligned(32))); + /* EXCLUSIVE4 */ + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ + u32 op_seqid; /* request */ + u32 op_share_access; /* request */ + u32 op_share_deny; /* request */ + u32 op_deleg_want; /* request */ + stateid_t op_stateid; /* response */ + __be32 op_xdr_error; /* see nfsd4_open_omfg() */ + u32 op_recall; /* recall */ + struct nfsd4_change_info op_cinfo; /* response */ + u32 op_rflags; /* response */ + bool op_truncate; /* used during processing */ + bool op_created; /* used during processing */ + struct nfs4_openowner *op_openowner; /* used during processing */ + struct nfs4_file *op_file; /* used during processing */ + struct nfs4_ol_stateid *op_stp; /* used during processing */ + struct nfs4_clnt_odstate *op_odstate; /* used during processing */ + struct nfs4_acl *op_acl; + struct xdr_netobj op_label; +}; + +struct nfsd4_open_confirm { + stateid_t oc_req_stateid /* request */; + u32 oc_seqid /* request */; + stateid_t oc_resp_stateid /* response */; +}; + +struct nfsd4_open_downgrade { + stateid_t od_stateid; + u32 od_seqid; + u32 od_share_access; /* request */ + u32 od_deleg_want; /* request */ + u32 od_share_deny; /* request */ +}; + + +struct nfsd4_read { + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct file *rd_filp; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ +}; + +struct nfsd4_readdir { + u64 rd_cookie; /* request */ + nfs4_verifier rd_verf; /* request */ + u32 rd_dircount; /* request */ + u32 rd_maxcount; /* request */ + u32 rd_bmval[3]; /* request */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ + + struct readdir_cd common; + struct xdr_stream *xdr; + int cookie_offset; +}; + +struct nfsd4_release_lockowner { + clientid_t rl_clientid; + struct xdr_netobj rl_owner; +}; +struct nfsd4_readlink { + struct svc_rqst *rl_rqstp; /* request */ + struct svc_fh * rl_fhp; /* request */ +}; + +struct nfsd4_remove { + u32 rm_namelen; /* request */ + char * rm_name; /* request */ + struct nfsd4_change_info rm_cinfo; /* response */ +}; + +struct nfsd4_rename { + u32 rn_snamelen; /* request */ + char * rn_sname; /* request */ + u32 rn_tnamelen; /* request */ + char * rn_tname; /* request */ + struct nfsd4_change_info rn_sinfo; /* response */ + struct nfsd4_change_info rn_tinfo; /* response */ +}; + +struct nfsd4_secinfo { + u32 si_namelen; /* request */ + char *si_name; /* request */ + struct svc_export *si_exp; /* response */ +}; + +struct nfsd4_secinfo_no_name { + u32 sin_style; /* request */ + struct svc_export *sin_exp; /* response */ +}; + +struct nfsd4_setattr { + stateid_t sa_stateid; /* request */ + u32 sa_bmval[3]; /* request */ + struct iattr sa_iattr; /* request */ + struct nfs4_acl *sa_acl; + struct xdr_netobj sa_label; +}; + +struct nfsd4_setclientid { + nfs4_verifier se_verf; /* request */ + struct xdr_netobj se_name; + u32 se_callback_prog; /* request */ + u32 se_callback_netid_len; /* request */ + char * se_callback_netid_val; /* request */ + u32 se_callback_addr_len; /* request */ + char * se_callback_addr_val; /* request */ + u32 se_callback_ident; /* request */ + clientid_t se_clientid; /* response */ + nfs4_verifier se_confirm; /* response */ +}; + +struct nfsd4_setclientid_confirm { + clientid_t sc_clientid; + nfs4_verifier sc_confirm; +}; + +struct nfsd4_saved_compoundargs { + __be32 *p; + __be32 *end; + int pagelen; + struct page **pagelist; +}; + +struct nfsd4_test_stateid_id { + __be32 ts_id_status; + stateid_t ts_id_stateid; + struct list_head ts_id_list; +}; + +struct nfsd4_test_stateid { + u32 ts_num_ids; + struct list_head ts_stateid_list; +}; + +struct nfsd4_free_stateid { + stateid_t fr_stateid; /* request */ +}; + +/* also used for NVERIFY */ +struct nfsd4_verify { + u32 ve_bmval[3]; /* request */ + u32 ve_attrlen; /* request */ + char * ve_attrval; /* request */ +}; + +struct nfsd4_write { + stateid_t wr_stateid; /* request */ + u64 wr_offset; /* request */ + u32 wr_stable_how; /* request */ + u32 wr_buflen; /* request */ + struct kvec wr_head; + struct page ** wr_pagelist; /* request */ + + u32 wr_bytes_written; /* response */ + u32 wr_how_written; /* response */ + nfs4_verifier wr_verifier; /* response */ +}; + +struct nfsd4_exchange_id { + nfs4_verifier verifier; + struct xdr_netobj clname; + u32 flags; + clientid_t clientid; + u32 seqid; + int spa_how; +}; + +struct nfsd4_sequence { + struct nfs4_sessionid sessionid; /* request/response */ + u32 seqid; /* request/response */ + u32 slotid; /* request/response */ + u32 maxslots; /* request/response */ + u32 cachethis; /* request */ +#if 0 + u32 target_maxslots; /* response */ +#endif /* not yet */ + u32 status_flags; /* response */ +}; + +struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; +}; + +struct nfsd4_destroy_clientid { + clientid_t clientid; +}; + +struct nfsd4_reclaim_complete { + u32 rca_one_fs; +}; + +struct nfsd4_deviceid { + u64 fsid_idx; + u32 generation; + u32 pad; +}; + +struct nfsd4_layout_seg { + u32 iomode; + u64 offset; + u64 length; +}; + +struct nfsd4_getdeviceinfo { + struct nfsd4_deviceid gd_devid; /* request */ + u32 gd_layout_type; /* request */ + u32 gd_maxcount; /* request */ + u32 gd_notify_types;/* request - response */ + void *gd_device; /* response */ +}; + +struct nfsd4_layoutget { + u64 lg_minlength; /* request */ + u32 lg_signal; /* request */ + u32 lg_layout_type; /* request */ + u32 lg_maxcount; /* request */ + stateid_t lg_sid; /* request/response */ + struct nfsd4_layout_seg lg_seg; /* request/response */ + void *lg_content; /* response */ +}; + +struct nfsd4_layoutcommit { + stateid_t lc_sid; /* request */ + struct nfsd4_layout_seg lc_seg; /* request */ + u32 lc_reclaim; /* request */ + u32 lc_newoffset; /* request */ + u64 lc_last_wr; /* request */ + struct timespec lc_mtime; /* request */ + u32 lc_layout_type; /* request */ + u32 lc_up_len; /* layout length */ + void *lc_up_layout; /* decoded by callback */ + u32 lc_size_chg; /* boolean for response */ + u64 lc_newsize; /* response */ +}; + +struct nfsd4_layoutreturn { + u32 lr_return_type; /* request */ + u32 lr_layout_type; /* request */ + struct nfsd4_layout_seg lr_seg; /* request */ + u32 lr_reclaim; /* request */ + u32 lrf_body_len; /* request */ + void *lrf_body; /* request */ + stateid_t lr_sid; /* request/response */ + u32 lrs_present; /* response */ +}; + +struct nfsd4_fallocate { + /* request */ + stateid_t falloc_stateid; + loff_t falloc_offset; + u64 falloc_length; +}; + +struct nfsd4_seek { + /* request */ + stateid_t seek_stateid; + loff_t seek_offset; + u32 seek_whence; + + /* response */ + u32 seek_eof; + loff_t seek_pos; +}; + +struct nfsd4_op { + int opnum; + __be32 status; + union { + struct nfsd4_access access; + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; + struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; + struct nfsd4_lock lock; + struct nfsd4_lockt lockt; + struct nfsd4_locku locku; + struct nfsd4_lookup lookup; + struct nfsd4_verify nverify; + struct nfsd4_open open; + struct nfsd4_open_confirm open_confirm; + struct nfsd4_open_downgrade open_downgrade; + struct nfsd4_putfh putfh; + struct nfsd4_read read; + struct nfsd4_readdir readdir; + struct nfsd4_readlink readlink; + struct nfsd4_remove remove; + struct nfsd4_rename rename; + clientid_t renew; + struct nfsd4_secinfo secinfo; + struct nfsd4_setattr setattr; + struct nfsd4_setclientid setclientid; + struct nfsd4_setclientid_confirm setclientid_confirm; + struct nfsd4_verify verify; + struct nfsd4_write write; + struct nfsd4_release_lockowner release_lockowner; + + /* NFSv4.1 */ + struct nfsd4_exchange_id exchange_id; + struct nfsd4_backchannel_ctl backchannel_ctl; + struct nfsd4_bind_conn_to_session bind_conn_to_session; + struct nfsd4_create_session create_session; + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; + struct nfsd4_test_stateid test_stateid; + struct nfsd4_free_stateid free_stateid; + struct nfsd4_getdeviceinfo getdeviceinfo; + struct nfsd4_layoutget layoutget; + struct nfsd4_layoutcommit layoutcommit; + struct nfsd4_layoutreturn layoutreturn; + + /* NFSv4.2 */ + struct nfsd4_fallocate allocate; + struct nfsd4_fallocate deallocate; + struct nfsd4_seek seek; + } u; + struct nfs4_replay * replay; +}; + +bool nfsd4_cache_this_op(struct nfsd4_op *); + +/* + * Memory needed just for the duration of processing one compound: + */ +struct svcxdr_tmpbuf { + struct svcxdr_tmpbuf *next; + char buf[]; +}; + +struct nfsd4_compoundargs { + /* scratch variables for XDR decode */ + __be32 * p; + __be32 * end; + struct page ** pagelist; + int pagelen; + __be32 tmp[8]; + __be32 * tmpp; + struct svcxdr_tmpbuf *to_free; + + struct svc_rqst *rqstp; + + u32 taglen; + char * tag; + u32 minorversion; + u32 opcnt; + struct nfsd4_op *ops; + struct nfsd4_op iops[8]; + int cachetype; +}; + +struct nfsd4_compoundres { + /* scratch variables for XDR encode */ + struct xdr_stream xdr; + struct svc_rqst * rqstp; + + u32 taglen; + char * tag; + u32 opcnt; + __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; +}; + +static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; +} + +static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +{ + return !(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) + || nfsd4_is_solo_sequence(resp); +} + +static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + return argp->opcnt == resp->opcnt; +} + +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op); +void warn_on_nonidempotent_op(struct nfsd4_op *op); + +#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) + +static inline void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = fhp->fh_post_saved; + cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + +} + +int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, + struct nfsd4_compoundargs *); +int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, + struct nfsd4_compoundres *); +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); +void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); +void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid *setclid); +extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid_confirm *setclientid_confirm); +extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); +extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); +extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); +extern __be32 nfsd4_sequence(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_sequence *); +extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); +extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *); +__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); +extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open, struct nfsd_net *nn); +extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, + struct svc_fh *current_fh, struct nfsd4_open *open); +extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); +extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open); +extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); +extern __be32 nfsd4_close(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_close *close); +extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_open_downgrade *od); +extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + struct nfsd4_lock *lock); +extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_lockt *lockt); +extern __be32 nfsd4_locku(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_locku *locku); +extern __be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_release_lockowner *rlockowner); +extern int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp); +extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); +extern __be32 nfsd4_renew(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, clientid_t *clid); +extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid); +extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid); +extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr); + +#endif + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/kernel/fs/nfsd/xdr4cb.h b/kernel/fs/nfsd/xdr4cb.h new file mode 100644 index 000000000..c47f6fdb1 --- /dev/null +++ b/kernel/fs/nfsd/xdr4cb.h @@ -0,0 +1,30 @@ +#define NFS4_MAXTAGLEN 20 + +#define NFS4_enc_cb_null_sz 0 +#define NFS4_dec_cb_null_sz 0 +#define cb_compound_enc_hdr_sz 4 +#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) +#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define cb_sequence_enc_sz (sessionid_sz + 4 + \ + 1 /* no referring calls list yet */) +#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) + +#define op_enc_sz 1 +#define op_dec_sz 2 +#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) +#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) +#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_stateid_sz + \ + enc_nfs4_fh_sz) + +#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) +#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 3 + \ + enc_nfs4_fh_sz + 4) +#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/kernel/fs/nilfs2/Kconfig b/kernel/fs/nilfs2/Kconfig new file mode 100644 index 000000000..80da8eb27 --- /dev/null +++ b/kernel/fs/nilfs2/Kconfig @@ -0,0 +1,24 @@ +config NILFS2_FS + tristate "NILFS2 file system support" + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. diff --git a/kernel/fs/nilfs2/Makefile b/kernel/fs/nilfs2/Makefile new file mode 100644 index 000000000..fc603e043 --- /dev/null +++ b/kernel/fs/nilfs2/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_NILFS2_FS) += nilfs2.o +nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ + btnode.o bmap.o btree.o direct.o dat.o recovery.o \ + the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ + ifile.o alloc.o gcinode.o ioctl.o sysfs.o diff --git a/kernel/fs/nilfs2/alloc.c b/kernel/fs/nilfs2/alloc.c new file mode 100644 index 000000000..8df0f3b78 --- /dev/null +++ b/kernel/fs/nilfs2/alloc.c @@ -0,0 +1,785 @@ +/* + * alloc.c - NILFS dat/inode allocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato . + * Two allocators were unified by Ryusuke Konishi , + * Amagai Yoshiji . + */ + +#include +#include +#include +#include +#include +#include "mdt.h" +#include "alloc.h" + + +/** + * nilfs_palloc_groups_per_desc_block - get the number of groups that a group + * descriptor block can maintain + * @inode: inode of metadata file using this allocator + */ +static inline unsigned long +nilfs_palloc_groups_per_desc_block(const struct inode *inode) +{ + return (1UL << inode->i_blkbits) / + sizeof(struct nilfs_palloc_group_desc); +} + +/** + * nilfs_palloc_groups_count - get maximum number of groups + * @inode: inode of metadata file using this allocator + */ +static inline unsigned long +nilfs_palloc_groups_count(const struct inode *inode) +{ + return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); +} + +/** + * nilfs_palloc_init_blockgroup - initialize private variables for allocator + * @inode: inode of metadata file using this allocator + * @entry_size: size of the persistent object + */ +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); + if (!mi->mi_bgl) + return -ENOMEM; + + bgl_lock_init(mi->mi_bgl); + + nilfs_mdt_set_entry_size(inode, entry_size, 0); + + mi->mi_blocks_per_group = + DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), + mi->mi_entries_per_block) + 1; + /* Number of blocks in a group including entry blocks and + a bitmap block */ + mi->mi_blocks_per_desc_block = + nilfs_palloc_groups_per_desc_block(inode) * + mi->mi_blocks_per_group + 1; + /* Number of blocks per descriptor including the + descriptor block */ + return 0; +} + +/** + * nilfs_palloc_group - get group number and offset from an entry number + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @offset: pointer to store offset number in the group + */ +static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, + unsigned long *offset) +{ + __u64 group = nr; + + *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); + return group; +} + +/** + * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_desc_blkoff() returns block offset of the descriptor + * block which contains a descriptor of the specified group. + */ +static unsigned long +nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_block = + group / nilfs_palloc_groups_per_desc_block(inode); + return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; +} + +/** + * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap + * block used to allocate/deallocate entries in the specified group. + */ +static unsigned long +nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_offset = + group % nilfs_palloc_groups_per_desc_block(inode); + return nilfs_palloc_desc_blkoff(inode, group) + 1 + + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; +} + +/** + * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + */ +static unsigned long +nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, + const struct nilfs_palloc_group_desc *desc) +{ + unsigned long nfree; + + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + return nfree; +} + +/** + * nilfs_palloc_group_desc_add_entries - adjust count of free entries + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + * @n: delta to be added + */ +static void +nilfs_palloc_group_desc_add_entries(struct inode *inode, + unsigned long group, + struct nilfs_palloc_group_desc *desc, + u32 n) +{ + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + le32_add_cpu(&desc->pg_nfrees, n); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); +} + +/** + * nilfs_palloc_entry_blkoff - get block offset of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + */ +static unsigned long +nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) +{ + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, nr, &group_offset); + + return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + + group_offset / NILFS_MDT(inode)->mi_entries_per_block; +} + +/** + * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block + * @inode: inode of metadata file + * @bh: buffer head of the buffer to be initialized + * @kaddr: kernel address mapped for the page including the buffer + */ +static void nilfs_palloc_desc_block_init(struct inode *inode, + struct buffer_head *bh, void *kaddr) +{ + struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); + unsigned long n = nilfs_palloc_groups_per_desc_block(inode); + __le32 nfrees; + + nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); + while (n-- > 0) { + desc->pg_nfrees = nfrees; + desc++; + } +} + +static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, + int create, + void (*init_block)(struct inode *, + struct buffer_head *, + void *), + struct buffer_head **bhp, + struct nilfs_bh_assoc *prev, + spinlock_t *lock) +{ + int ret; + + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + get_bh(prev->bh); + *bhp = prev->bh; + spin_unlock(lock); + return 0; + } + spin_unlock(lock); + + ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp); + if (!ret) { + spin_lock(lock); + /* + * The following code must be safe for change of the + * cache contents during the get block call. + */ + brelse(prev->bh); + get_bh(*bhp); + prev->bh = *bhp; + prev->blkoff = blkoff; + spin_unlock(lock); + } + return ret; +} + +/** + * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +static int nilfs_palloc_get_desc_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, + bhp, &cache->prev_desc, &cache->lock); +} + +/** + * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +static int nilfs_palloc_get_bitmap_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp, + &cache->prev_bitmap, &cache->lock); +} + +/** + * nilfs_palloc_get_entry_block - get buffer head of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp, + &cache->prev_entry, &cache->lock); +} + +/** + * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor + * @inode: inode of metadata file using this allocator + * @group: group number + * @bh: buffer head of the buffer storing the group descriptor block + * @kaddr: kernel address mapped for the page including the buffer + */ +static struct nilfs_palloc_group_desc * +nilfs_palloc_block_get_group_desc(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh, void *kaddr) +{ + return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + + group % nilfs_palloc_groups_per_desc_block(inode); +} + +/** + * nilfs_palloc_block_get_entry - get kernel address of an entry + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @bh: buffer head of the buffer storing the entry block + * @kaddr: kernel address mapped for the page including the buffer + */ +void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, + const struct buffer_head *bh, void *kaddr) +{ + unsigned long entry_offset, group_offset; + + nilfs_palloc_group(inode, nr, &group_offset); + entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; + + return kaddr + bh_offset(bh) + + entry_offset * NILFS_MDT(inode)->mi_entry_size; +} + +/** + * nilfs_palloc_find_available_slot - find available slot in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @target: offset number of an entry in the group (start point) + * @bitmap: bitmap of the group + * @bsize: size in bits + */ +static int nilfs_palloc_find_available_slot(struct inode *inode, + unsigned long group, + unsigned long target, + unsigned char *bitmap, + int bsize) +{ + int curr, pos, end, i; + + if (target > 0) { + end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, target); + if (pos < end && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) + return pos; + } else + end = 0; + + for (i = 0, curr = end; + i < bsize; + i += BITS_PER_LONG, curr += BITS_PER_LONG) { + /* wrap around */ + if (curr >= bsize) + curr = 0; + while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) + != ~0UL) { + end = curr + BITS_PER_LONG; + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, curr); + if ((pos < end) && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, + bitmap)) + return pos; + } + } + return -ENOSPC; +} + +/** + * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups + * in a group descriptor block + * @inode: inode of metadata file using this allocator + * @curr: current group number + * @max: maximum number of groups + */ +static unsigned long +nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, + unsigned long curr, unsigned long max) +{ + return min_t(unsigned long, + nilfs_palloc_groups_per_desc_block(inode) - + curr % nilfs_palloc_groups_per_desc_block(inode), + max - curr + 1); +} + +/** + * nilfs_palloc_count_desc_blocks - count descriptor blocks number + * @inode: inode of metadata file using this allocator + * @desc_blocks: descriptor blocks number [out] + */ +static int nilfs_palloc_count_desc_blocks(struct inode *inode, + unsigned long *desc_blocks) +{ + __u64 blknum; + int ret; + + ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum); + if (likely(!ret)) + *desc_blocks = DIV_ROUND_UP( + (unsigned long)blknum, + NILFS_MDT(inode)->mi_blocks_per_desc_block); + return ret; +} + +/** + * nilfs_palloc_mdt_file_can_grow - check potential opportunity for + * MDT file growing + * @inode: inode of metadata file using this allocator + * @desc_blocks: known current descriptor blocks count + */ +static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, + unsigned long desc_blocks) +{ + return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) < + nilfs_palloc_groups_count(inode); +} + +/** + * nilfs_palloc_count_max_entries - count max number of entries that can be + * described by descriptor blocks count + * @inode: inode of metadata file using this allocator + * @nused: current number of used entries + * @nmaxp: max number of entries [out] + */ +int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) +{ + unsigned long desc_blocks = 0; + u64 entries_per_desc_block, nmax; + int err; + + err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks); + if (unlikely(err)) + return err; + + entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) * + nilfs_palloc_groups_per_desc_block(inode); + nmax = entries_per_desc_block * desc_blocks; + + if (nused == nmax && + nilfs_palloc_mdt_file_can_grow(inode, desc_blocks)) + nmax += entries_per_desc_block; + + if (nused > nmax) + return -ERANGE; + + *nmaxp = nmax; + return 0; +} + +/** + * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, maxgroup, ngroups; + unsigned long group_offset, maxgroup_offset; + unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long i, j; + int pos, ret; + + ngroups = nilfs_palloc_groups_count(inode); + maxgroup = ngroups - 1; + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + entries_per_group = nilfs_palloc_entries_per_group(inode); + groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); + + for (i = 0; i < ngroups; i += n) { + if (group >= ngroups) { + /* wrap around */ + group = 0; + maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, + &maxgroup_offset) - 1; + } + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + n = nilfs_palloc_rest_groups_in_desc_block(inode, group, + maxgroup); + for (j = 0; j < n; j++, desc++, group++) { + if (nilfs_palloc_group_desc_nfrees(inode, group, desc) + > 0) { + ret = nilfs_palloc_get_bitmap_block( + inode, group, 1, &bitmap_bh); + if (ret < 0) + goto out_desc; + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + pos = nilfs_palloc_find_available_slot( + inode, group, group_offset, bitmap, + entries_per_group); + if (pos >= 0) { + /* found a free entry */ + nilfs_palloc_group_desc_add_entries( + inode, group, desc, -1); + req->pr_entry_nr = + entries_per_group * group + pos; + kunmap(desc_bh->b_page); + kunmap(bitmap_bh->b_page); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; + } + kunmap(bitmap_bh->b_page); + brelse(bitmap_bh); + } + + group_offset = 0; + } + + kunmap(desc_bh->b_page); + brelse(desc_bh); + } + + /* no entries left */ + return -ENOSPC; + + out_desc: + kunmap(desc_bh->b_page); + brelse(desc_bh); + return ret; +} + +/** + * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +void nilfs_palloc_commit_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + mark_buffer_dirty(req->pr_bitmap_bh); + mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +/** + * nilfs_palloc_commit_free_entry - finish deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +void nilfs_palloc_commit_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + unsigned long group, group_offset; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + else + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + mark_buffer_dirty(req->pr_desc_bh); + mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +/** + * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +void nilfs_palloc_abort_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + void *desc_kaddr, *bitmap_kaddr; + unsigned char *bitmap; + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + else + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +/** + * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +int nilfs_palloc_prepare_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + unsigned long group, group_offset; + int ret; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; +} + +/** + * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +void nilfs_palloc_abort_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +/** + * nilfs_palloc_group_is_in - judge if an entry is in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @nr: serial number of the entry (e.g. inode number) + */ +static int +nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) +{ + __u64 first, last; + + first = group * nilfs_palloc_entries_per_group(inode); + last = first + nilfs_palloc_entries_per_group(inode) - 1; + return (nr >= first) && (nr <= last); +} + +/** + * nilfs_palloc_freev - deallocate a set of persistent objects + * @inode: inode of metadata file using this allocator + * @entry_nrs: array of entry numbers to be deallocated + * @nitems: number of entries stored in @entry_nrs + */ +int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, group_offset; + int i, j, n, ret; + + for (i = 0; i < nitems; i = j) { + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 0, + &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + for (j = i, n = 0; + (j < nitems) && nilfs_palloc_group_is_in(inode, group, + entry_nrs[j]); + j++) { + nilfs_palloc_group(inode, entry_nrs[j], &group_offset); + if (!nilfs_clear_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) { + printk(KERN_WARNING + "%s: entry number %llu already freed\n", + __func__, + (unsigned long long)entry_nrs[j]); + } else { + n++; + } + } + nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + kunmap(bitmap_bh->b_page); + kunmap(desc_bh->b_page); + + mark_buffer_dirty(desc_bh); + mark_buffer_dirty(bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(bitmap_bh); + brelse(desc_bh); + } + return 0; +} + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache) +{ + NILFS_MDT(inode)->mi_palloc_cache = cache; + spin_lock_init(&cache->lock); +} + +void nilfs_palloc_clear_cache(struct inode *inode) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + spin_lock(&cache->lock); + brelse(cache->prev_desc.bh); + brelse(cache->prev_bitmap.bh); + brelse(cache->prev_entry.bh); + cache->prev_desc.bh = NULL; + cache->prev_bitmap.bh = NULL; + cache->prev_entry.bh = NULL; + spin_unlock(&cache->lock); +} + +void nilfs_palloc_destroy_cache(struct inode *inode) +{ + nilfs_palloc_clear_cache(inode); + NILFS_MDT(inode)->mi_palloc_cache = NULL; +} diff --git a/kernel/fs/nilfs2/alloc.h b/kernel/fs/nilfs2/alloc.h new file mode 100644 index 000000000..4bd6451b5 --- /dev/null +++ b/kernel/fs/nilfs2/alloc.h @@ -0,0 +1,110 @@ +/* + * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato . + * Two allocators were unified by Ryusuke Konishi , + * Amagai Yoshiji . + */ + +#ifndef _NILFS_ALLOC_H +#define _NILFS_ALLOC_H + +#include +#include +#include + +/** + * nilfs_palloc_entries_per_group - get the number of entries per group + * @inode: inode of metadata file using this allocator + * + * The number of entries per group is defined by the number of bits + * that a bitmap block can maintain. + */ +static inline unsigned long +nilfs_palloc_entries_per_group(const struct inode *inode) +{ + return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); +} + +int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_get_entry_block(struct inode *, __u64, int, + struct buffer_head **); +void *nilfs_palloc_block_get_entry(const struct inode *, __u64, + const struct buffer_head *, void *); + +int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *); + +/** + * nilfs_palloc_req - persistent allocator request and reply + * @pr_entry_nr: entry number (vblocknr or inode number) + * @pr_desc_bh: buffer head of the buffer containing block group descriptors + * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap + * @pr_entry_bh: buffer head of the buffer containing translation entries + */ +struct nilfs_palloc_req { + __u64 pr_entry_nr; + struct buffer_head *pr_desc_bh; + struct buffer_head *pr_bitmap_bh; + struct buffer_head *pr_entry_bh; +}; + +int nilfs_palloc_prepare_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_commit_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_freev(struct inode *, __u64 *, size_t); + +#define nilfs_set_bit_atomic ext2_set_bit_atomic +#define nilfs_clear_bit_atomic ext2_clear_bit_atomic +#define nilfs_find_next_zero_bit find_next_zero_bit_le + +/** + * struct nilfs_bh_assoc - block offset and buffer head association + * @blkoff: block offset + * @bh: buffer head + */ +struct nilfs_bh_assoc { + unsigned long blkoff; + struct buffer_head *bh; +}; + +/** + * struct nilfs_palloc_cache - persistent object allocator cache + * @lock: cache protecting lock + * @prev_desc: blockgroup descriptors cache + * @prev_bitmap: blockgroup bitmap cache + * @prev_entry: translation entries cache + */ +struct nilfs_palloc_cache { + spinlock_t lock; + struct nilfs_bh_assoc prev_desc; + struct nilfs_bh_assoc prev_bitmap; + struct nilfs_bh_assoc prev_entry; +}; + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache); +void nilfs_palloc_clear_cache(struct inode *inode); +void nilfs_palloc_destroy_cache(struct inode *inode); + +#endif /* _NILFS_ALLOC_H */ diff --git a/kernel/fs/nilfs2/bmap.c b/kernel/fs/nilfs2/bmap.c new file mode 100644 index 000000000..27f75bcbe --- /dev/null +++ b/kernel/fs/nilfs2/bmap.c @@ -0,0 +1,593 @@ +/* + * bmap.c - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include +#include +#include "nilfs.h" +#include "bmap.h" +#include "btree.h" +#include "direct.h" +#include "btnode.h" +#include "mdt.h" +#include "dat.h" +#include "alloc.h" + +struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) +{ + struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info; + + return nilfs->ns_dat; +} + +static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, + const char *fname, int err) +{ + struct inode *inode = bmap->b_inode; + + if (err == -EINVAL) { + nilfs_error(inode->i_sb, fname, + "broken bmap (inode number=%lu)\n", inode->i_ino); + err = -EIO; + } + return err; +} + +/** + * nilfs_bmap_lookup_at_level - find a data block or node block + * @bmap: bmap + * @key: key + * @level: level + * @ptrp: place to store the value associated to @key + * + * Description: nilfs_bmap_lookup_at_level() finds a record whose key + * matches @key in the block at @level of the bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @ptrp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, + __u64 *ptrp) +{ + sector_t blocknr; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); + if (ret < 0) { + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + goto out; + } + if (NILFS_BMAP_USE_VBN(bmap)) { + ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, + &blocknr); + if (!ret) + *ptrp = blocknr; + } + + out: + up_read(&bmap->b_sem); + return ret; +} + +int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, + unsigned maxblocks) +{ + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); + up_read(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; + __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_insert != NULL) { + ret = bmap->b_ops->bop_check_insert(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1); + if (n < 0) + return n; + ret = nilfs_btree_convert_and_insert( + bmap, key, ptr, keys, ptrs, n); + if (ret == 0) + bmap->b_u.u_flags |= NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_insert(bmap, key, ptr); +} + +/** + * nilfs_bmap_insert - insert a new key-record pair into a bmap + * @bmap: bmap + * @key: key + * @rec: record + * + * Description: nilfs_bmap_insert() inserts the new key-record pair specified + * by @key and @rec into @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EEXIST - A record associated with @key already exist. + */ +int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_insert(bmap, key, rec); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) +{ + __u64 keys[NILFS_BMAP_LARGE_LOW + 1]; + __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_delete != NULL) { + ret = bmap->b_ops->bop_check_delete(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1); + if (n < 0) + return n; + ret = nilfs_direct_delete_and_convert( + bmap, key, keys, ptrs, n); + if (ret == 0) + bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_delete(bmap, key); +} + +/** + * nilfs_bmap_seek_key - seek a valid entry and return its key + * @bmap: bmap struct + * @start: start key number + * @keyp: place to store valid key + * + * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap + * starting from @start, and stores it to @keyp if found. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No valid entry was found + */ +int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp) +{ + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_seek_key(bmap, start, keyp); + up_read(&bmap->b_sem); + + if (ret < 0) + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + return ret; +} + +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp) +{ + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_last_key(bmap, keyp); + up_read(&bmap->b_sem); + + if (ret < 0) + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + return ret; +} + +/** + * nilfs_bmap_delete - delete a key-record pair from a bmap + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_delete() deletes the key-record pair specified by + * @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_delete(bmap, key); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key) +{ + __u64 lastkey; + int ret; + + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + + while (key <= lastkey) { + ret = nilfs_bmap_do_delete(bmap, lastkey); + if (ret < 0) + return ret; + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + } + return 0; +} + +/** + * nilfs_bmap_truncate - truncate a bmap to a specified key + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are + * greater than or equal to @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_truncate(bmap, key); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_clear - free resources a bmap holds + * @bmap: bmap + * + * Description: nilfs_bmap_clear() frees resources associated with @bmap. + */ +void nilfs_bmap_clear(struct nilfs_bmap *bmap) +{ + down_write(&bmap->b_sem); + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + up_write(&bmap->b_sem); +} + +/** + * nilfs_bmap_propagate - propagate dirty state + * @bmap: bmap + * @bh: buffer head + * + * Description: nilfs_bmap_propagate() marks the buffers that directly or + * indirectly refer to the block specified by @bh dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_propagate(bmap, bh); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_lookup_dirty_buffers - + * @bmap: bmap + * @listp: pointer to buffer head list + */ +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + if (bmap->b_ops->bop_lookup_dirty_buffers != NULL) + bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp); +} + +/** + * nilfs_bmap_assign - assign a new block number to a block + * @bmap: bmap + * @bhp: pointer to buffer head + * @blocknr: block number + * @binfo: block information + * + * Description: nilfs_bmap_assign() assigns the block number @blocknr to the + * buffer specified by @bh. + * + * Return Value: On success, 0 is returned and the buffer head of a newly + * create buffer and the block information associated with the buffer are + * stored in the place pointed by @bh and @binfo, respectively. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + unsigned long blocknr, + union nilfs_binfo *binfo) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_mark - mark block dirty + * @bmap: bmap + * @key: key + * @level: level + * + * Description: nilfs_bmap_mark() marks the block specified by @key and @level + * as dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + int ret; + + if (bmap->b_ops->bop_mark == NULL) + return 0; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_mark(bmap, key, level); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state + * @bmap: bmap + * + * Description: nilfs_test_and_clear() is the atomic operation to test and + * clear the dirty state of @bmap. + * + * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. + */ +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_dirty(bmap); + nilfs_bmap_clear_dirty(bmap); + up_write(&bmap->b_sem); + return ret; +} + + +/* + * Internal use only + */ +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, + const struct buffer_head *bh) +{ + struct buffer_head *pbh; + __u64 key; + + key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - + bmap->b_inode->i_blkbits); + for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page) + key++; + + return key; +} + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) +{ + __s64 diff; + + diff = key - bmap->b_last_allocated_key; + if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) && + (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) && + (bmap->b_last_allocated_ptr + diff > 0)) + return bmap->b_last_allocated_ptr + diff; + else + return NILFS_BMAP_INVALID_PTR; +} + +#define NILFS_BMAP_GROUP_DIV 8 +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) +{ + struct inode *dat = nilfs_bmap_get_dat(bmap); + unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat); + unsigned long group = bmap->b_inode->i_ino / entries_per_group; + + return group * entries_per_group + + (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) * + (entries_per_group / NILFS_BMAP_GROUP_DIV); +} + +static struct lock_class_key nilfs_bmap_dat_lock_key; +static struct lock_class_key nilfs_bmap_mdt_lock_key; + +/** + * nilfs_bmap_read - read a bmap from an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_read() initializes the bmap @bmap. + * + * Return Value: On success, 0 is returned. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + if (raw_inode == NULL) + memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE); + else + memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE); + + init_rwsem(&bmap->b_sem); + bmap->b_state = 0; + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + bmap->b_ptr_type = NILFS_BMAP_PTR_P; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); + break; + case NILFS_CPFILE_INO: + case NILFS_SUFILE_INO: + bmap->b_ptr_type = NILFS_BMAP_PTR_VS; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); + break; + case NILFS_IFILE_INO: + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); + /* Fall through */ + default: + bmap->b_ptr_type = NILFS_BMAP_PTR_VM; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + } + + return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? + nilfs_btree_init(bmap) : nilfs_direct_init(bmap); +} + +/** + * nilfs_bmap_write - write back a bmap to an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_write() stores @bmap in @raw_inode. + */ +void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + down_write(&bmap->b_sem); + memcpy(raw_inode->i_bmap, bmap->b_u.u_data, + NILFS_INODE_BMAP_SIZE * sizeof(__le64)); + if (bmap->b_inode->i_ino == NILFS_DAT_INO) + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + + up_write(&bmap->b_sem); +} + +void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) +{ + memset(&bmap->b_u, 0, NILFS_BMAP_SIZE); + init_rwsem(&bmap->b_sem); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + bmap->b_ptr_type = NILFS_BMAP_PTR_U; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + bmap->b_state = 0; + nilfs_btree_init_gc(bmap); +} + +void nilfs_bmap_save(const struct nilfs_bmap *bmap, + struct nilfs_bmap_store *store) +{ + memcpy(store->data, bmap->b_u.u_data, sizeof(store->data)); + store->last_allocated_key = bmap->b_last_allocated_key; + store->last_allocated_ptr = bmap->b_last_allocated_ptr; + store->state = bmap->b_state; +} + +void nilfs_bmap_restore(struct nilfs_bmap *bmap, + const struct nilfs_bmap_store *store) +{ + memcpy(bmap->b_u.u_data, store->data, sizeof(store->data)); + bmap->b_last_allocated_key = store->last_allocated_key; + bmap->b_last_allocated_ptr = store->last_allocated_ptr; + bmap->b_state = store->state; +} diff --git a/kernel/fs/nilfs2/bmap.h b/kernel/fs/nilfs2/bmap.h new file mode 100644 index 000000000..bfa817ce4 --- /dev/null +++ b/kernel/fs/nilfs2/bmap.h @@ -0,0 +1,280 @@ +/* + * bmap.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_BMAP_H +#define _NILFS_BMAP_H + +#include +#include +#include +#include +#include "alloc.h" +#include "dat.h" + +#define NILFS_BMAP_INVALID_PTR 0 + +#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) + + +struct nilfs_bmap; + +/** + * union nilfs_bmap_ptr_req - request for bmap ptr + * @bpr_ptr: bmap pointer + * @bpr_req: request for persistent allocator + */ +union nilfs_bmap_ptr_req { + __u64 bpr_ptr; + struct nilfs_palloc_req bpr_req; +}; + +/** + * struct nilfs_bmap_stats - bmap statistics + * @bs_nblocks: number of blocks created or deleted + */ +struct nilfs_bmap_stats { + unsigned int bs_nblocks; +}; + +/** + * struct nilfs_bmap_operations - bmap operation table + */ +struct nilfs_bmap_operations { + int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); + int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *, + unsigned); + int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); + int (*bop_delete)(struct nilfs_bmap *, __u64); + void (*bop_clear)(struct nilfs_bmap *); + + int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *); + void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, + struct list_head *); + + int (*bop_assign)(struct nilfs_bmap *, + struct buffer_head **, + sector_t, + union nilfs_binfo *); + int (*bop_mark)(struct nilfs_bmap *, __u64, int); + + int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *); + int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); + + /* The following functions are internal use only. */ + int (*bop_check_insert)(const struct nilfs_bmap *, __u64); + int (*bop_check_delete)(struct nilfs_bmap *, __u64); + int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); +}; + + +#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) +#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */) +#define NILFS_BMAP_NEW_PTR_INIT \ + (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1)) + +static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) +{ + return !!(ptr & NILFS_BMAP_NEW_PTR_INIT); +} + + +/** + * struct nilfs_bmap - bmap structure + * @b_u: raw data + * @b_sem: semaphore + * @b_inode: owner of bmap + * @b_ops: bmap operation table + * @b_last_allocated_key: last allocated key for data block + * @b_last_allocated_ptr: last allocated ptr for data block + * @b_ptr_type: pointer type + * @b_state: state + * @b_nchildren_per_block: maximum number of child nodes for non-root nodes + */ +struct nilfs_bmap { + union { + __u8 u_flags; + __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)]; + } b_u; + struct rw_semaphore b_sem; + struct inode *b_inode; + const struct nilfs_bmap_operations *b_ops; + __u64 b_last_allocated_key; + __u64 b_last_allocated_ptr; + int b_ptr_type; + int b_state; + __u16 b_nchildren_per_block; +}; + +/* pointer type */ +#define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. LBN) */ +#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single + version) */ +#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple + versions) */ +#define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */ + +#define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0) + +/* state */ +#define NILFS_BMAP_DIRTY 0x00000001 + +/** + * struct nilfs_bmap_store - shadow copy of bmap state + * @data: cached raw block mapping of on-disk inode + * @last_allocated_key: cached value of last allocated key for data block + * @last_allocated_ptr: cached value of last allocated ptr for data block + * @state: cached value of state field of bmap structure + */ +struct nilfs_bmap_store { + __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)]; + __u64 last_allocated_key; + __u64 last_allocated_ptr; + int state; +}; + +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); +int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); +void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); +int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); +int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec); +int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key); +int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp); +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp); +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key); +void nilfs_bmap_clear(struct nilfs_bmap *); +int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); +int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, + unsigned long, union nilfs_binfo *); +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *); +int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); + +void nilfs_bmap_init_gc(struct nilfs_bmap *); + +void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *); +void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *); + +static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, + __u64 *ptr) +{ + return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); +} + +/* + * Internal use only + */ +struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); + +static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + return nilfs_dat_prepare_alloc(dat, &req->bpr_req); + /* ignore target ptr */ + req->bpr_ptr = bmap->b_last_allocated_ptr++; + return 0; +} + +static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_commit_alloc(dat, &req->bpr_req); +} + +static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_abort_alloc(dat, &req->bpr_req); + else + bmap->b_last_allocated_ptr--; +} + +static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; +} + +static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_commit_end(dat, &req->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); +} + +static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_abort_end(dat, &req->bpr_req); +} + +static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key, + __u64 ptr) +{ + bmap->b_last_allocated_key = key; + bmap->b_last_allocated_ptr = ptr; +} + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, + const struct buffer_head *); + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); + + +/* Assume that bmap semaphore is locked. */ +static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) +{ + return !!(bmap->b_state & NILFS_BMAP_DIRTY); +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state |= NILFS_BMAP_DIRTY; +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state &= ~NILFS_BMAP_DIRTY; +} + + +#define NILFS_BMAP_LARGE 0x1 + +#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN +#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX +#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX +#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX + +#endif /* _NILFS_BMAP_H */ diff --git a/kernel/fs/nilfs2/btnode.c b/kernel/fs/nilfs2/btnode.c new file mode 100644 index 000000000..a35ae35e6 --- /dev/null +++ b/kernel/fs/nilfs2/btnode.c @@ -0,0 +1,297 @@ +/* + * btnode.c - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * This file was originally written by Seiji Kihara + * and fully revised by Ryusuke Konishi for + * stabilization and simplification. + * + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "dat.h" +#include "page.h" +#include "btnode.h" + +void nilfs_btnode_cache_clear(struct address_space *btnc) +{ + invalidate_mapping_pages(btnc, 0, -1); + truncate_inode_pages(btnc, 0); +} + +struct buffer_head * +nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) +{ + struct inode *inode = NILFS_BTNC_I(btnc); + struct buffer_head *bh; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return NULL; + + if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || + buffer_dirty(bh))) { + brelse(bh); + BUG(); + } + memset(bh->b_data, 0, 1 << inode->i_blkbits); + bh->b_bdev = inode->i_sb->s_bdev; + bh->b_blocknr = blocknr; + set_buffer_mapped(bh); + set_buffer_uptodate(bh); + + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return bh; +} + +int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, int mode, + struct buffer_head **pbh, sector_t *submit_ptr) +{ + struct buffer_head *bh; + struct inode *inode = NILFS_BTNC_I(btnc); + struct page *page; + int err; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return -ENOMEM; + + err = -EEXIST; /* internal code */ + page = bh->b_page; + + if (buffer_uptodate(bh) || buffer_dirty(bh)) + goto found; + + if (pblocknr == 0) { + pblocknr = blocknr; + if (inode->i_ino != NILFS_DAT_INO) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + /* blocknr is a virtual block number */ + err = nilfs_dat_translate(nilfs->ns_dat, blocknr, + &pblocknr); + if (unlikely(err)) { + brelse(bh); + goto out_locked; + } + } + } + + if (mode == READA) { + if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { + err = -EBUSY; /* internal code */ + brelse(bh); + goto out_locked; + } + } else { /* mode == READ */ + lock_buffer(bh); + } + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + err = -EEXIST; /* internal code */ + goto found; + } + set_buffer_mapped(bh); + bh->b_bdev = inode->i_sb->s_bdev; + bh->b_blocknr = pblocknr; /* set block address for read */ + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + bh->b_blocknr = blocknr; /* set back to the given block address */ + *submit_ptr = pblocknr; + err = 0; +found: + *pbh = bh; + +out_locked: + unlock_page(page); + page_cache_release(page); + return err; +} + +/** + * nilfs_btnode_delete - delete B-tree node buffer + * @bh: buffer to be deleted + * + * nilfs_btnode_delete() invalidates the specified buffer and delete the page + * including the buffer if the page gets unbusy. + */ +void nilfs_btnode_delete(struct buffer_head *bh) +{ + struct address_space *mapping; + struct page *page = bh->b_page; + pgoff_t index = page_index(page); + int still_dirty; + + page_cache_get(page); + lock_page(page); + wait_on_page_writeback(page); + + nilfs_forget_buffer(bh); + still_dirty = PageDirty(page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); + + if (!still_dirty && mapping) + invalidate_inode_pages2_range(mapping, index, index); +} + +/** + * nilfs_btnode_prepare_change_key + * prepare to move contents of the block for old key to one of new key. + * the old buffer will not be removed, but might be reused for new buffer. + * it might return -ENOMEM because of memory allocation errors, + * and might return -EIO because of disk read errors. + */ +int nilfs_btnode_prepare_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh, *nbh; + struct inode *inode = NILFS_BTNC_I(btnc); + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + int err; + + if (oldkey == newkey) + return 0; + + obh = ctxt->bh; + ctxt->newbh = NULL; + + if (inode->i_blkbits == PAGE_CACHE_SHIFT) { + lock_page(obh->b_page); + /* + * We cannot call radix_tree_preload for the kernels older + * than 2.6.23, because it is not exported for modules. + */ +retry: + err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (err) + goto failed_unlock; + /* BUG_ON(oldkey != obh->b_page->index); */ + if (unlikely(oldkey != obh->b_page->index)) + NILFS_PAGE_BUG(obh->b_page, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + + spin_lock_irq(&btnc->tree_lock); + err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); + spin_unlock_irq(&btnc->tree_lock); + /* + * Note: page->index will not change to newkey until + * nilfs_btnode_commit_change_key() will be called. + * To protect the page in intermediate state, the page lock + * is held. + */ + radix_tree_preload_end(); + if (!err) + return 0; + else if (err != -EEXIST) + goto failed_unlock; + + err = invalidate_inode_pages2_range(btnc, newkey, newkey); + if (!err) + goto retry; + /* fallback to copy mode */ + unlock_page(obh->b_page); + } + + nbh = nilfs_btnode_create_block(btnc, newkey); + if (!nbh) + return -ENOMEM; + + BUG_ON(nbh == obh); + ctxt->newbh = nbh; + return 0; + + failed_unlock: + unlock_page(obh->b_page); + return err; +} + +/** + * nilfs_btnode_commit_change_key + * commit the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_commit_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + struct page *opage; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + opage = obh->b_page; + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + mark_buffer_dirty(obh); + + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, oldkey); + radix_tree_tag_set(&btnc->page_tree, newkey, + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&btnc->tree_lock); + + opage->index = obh->b_blocknr = newkey; + unlock_page(opage); + } else { + nilfs_copy_buffer(nbh, obh); + mark_buffer_dirty(nbh); + + nbh->b_blocknr = newkey; + ctxt->bh = nbh; + nilfs_btnode_delete(obh); /* will decrement bh->b_count */ + } +} + +/** + * nilfs_btnode_abort_change_key + * abort the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_abort_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, newkey); + spin_unlock_irq(&btnc->tree_lock); + unlock_page(ctxt->bh->b_page); + } else + brelse(nbh); +} diff --git a/kernel/fs/nilfs2/btnode.h b/kernel/fs/nilfs2/btnode.h new file mode 100644 index 000000000..d876b565c --- /dev/null +++ b/kernel/fs/nilfs2/btnode.h @@ -0,0 +1,59 @@ +/* + * btnode.h - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara + * Revised by Ryusuke Konishi + */ + +#ifndef _NILFS_BTNODE_H +#define _NILFS_BTNODE_H + +#include +#include +#include +#include + +/** + * struct nilfs_btnode_chkey_ctxt - change key context + * @oldkey: old key of block's moving content + * @newkey: new key for block's content + * @bh: buffer head of old buffer + * @newbh: buffer head of new buffer + */ +struct nilfs_btnode_chkey_ctxt { + __u64 oldkey; + __u64 newkey; + struct buffer_head *bh; + struct buffer_head *newbh; +}; + +void nilfs_btnode_cache_clear(struct address_space *); +struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, + __u64 blocknr); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, + struct buffer_head **, sector_t *); +void nilfs_btnode_delete(struct buffer_head *); +int nilfs_btnode_prepare_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_commit_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_abort_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); + +#endif /* _NILFS_BTNODE_H */ diff --git a/kernel/fs/nilfs2/btree.c b/kernel/fs/nilfs2/btree.c new file mode 100644 index 000000000..919fd5bb1 --- /dev/null +++ b/kernel/fs/nilfs2/btree.c @@ -0,0 +1,2414 @@ +/* + * btree.c - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include +#include +#include +#include "nilfs.h" +#include "page.h" +#include "btnode.h" +#include "btree.h" +#include "alloc.h" +#include "dat.h" + +static void __nilfs_btree_init(struct nilfs_bmap *bmap); + +static struct nilfs_btree_path *nilfs_btree_alloc_path(void) +{ + struct nilfs_btree_path *path; + int level = NILFS_BTREE_LEVEL_DATA; + + path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + if (path == NULL) + goto out; + + for (; level < NILFS_BTREE_LEVEL_MAX; level++) { + path[level].bp_bh = NULL; + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } + +out: + return path; +} + +static void nilfs_btree_free_path(struct nilfs_btree_path *path) +{ + int level = NILFS_BTREE_LEVEL_DATA; + + for (; level < NILFS_BTREE_LEVEL_MAX; level++) + brelse(path[level].bp_bh); + + kmem_cache_free(nilfs_btree_path_cache, path); +} + +/* + * B-tree node operations + */ +static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, + __u64 ptr, struct buffer_head **bhp) +{ + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct buffer_head *bh; + + bh = nilfs_btnode_create_block(btnc, ptr); + if (!bh) + return -ENOMEM; + + set_buffer_nilfs_volatile(bh); + *bhp = bh; + return 0; +} + +static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) +{ + return node->bn_flags; +} + +static void +nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) +{ + node->bn_flags = flags; +} + +static int nilfs_btree_node_root(const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; +} + +static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node) +{ + return node->bn_level; +} + +static void +nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) +{ + node->bn_level = level; +} + +static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) +{ + return le16_to_cpu(node->bn_nchildren); +} + +static void +nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) +{ + node->bn_nchildren = cpu_to_le16(nchildren); +} + +static int nilfs_btree_node_size(const struct nilfs_bmap *btree) +{ + return 1 << btree->b_inode->i_blkbits; +} + +static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree) +{ + return btree->b_nchildren_per_block; +} + +static __le64 * +nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) +{ + return (__le64 *)((char *)(node + 1) + + (nilfs_btree_node_root(node) ? + 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); +} + +static __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax) +{ + return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax); +} + +static __u64 +nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) +{ + return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index)); +} + +static void +nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) +{ + *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key); +} + +static __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index, + int ncmax) +{ + return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index)); +} + +static void +nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr, + int ncmax) +{ + *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr); +} + +static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags, + int level, int nchildren, int ncmax, + const __u64 *keys, const __u64 *ptrs) +{ + __le64 *dkeys; + __le64 *dptrs; + int i; + + nilfs_btree_node_set_flags(node, flags); + nilfs_btree_node_set_level(node, level); + nilfs_btree_node_set_nchildren(node, nchildren); + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + for (i = 0; i < nchildren; i++) { + dkeys[i] = cpu_to_le64(keys[i]); + dptrs[i] = cpu_to_le64(ptrs[i]); + } +} + +/* Assume the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_left(struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n, int lncmax, int rncmax) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); + lnchildren = nilfs_btree_node_get_nchildren(left); + + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); + rnchildren = nilfs_btree_node_get_nchildren(right); + + memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); + memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); + memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys)); + memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs)); + + lnchildren += n; + rnchildren -= n; + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); +} + +/* Assume that the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_right(struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n, int lncmax, int rncmax) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); + lnchildren = nilfs_btree_node_get_nchildren(left); + + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); + rnchildren = nilfs_btree_node_get_nchildren(right); + + memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); + memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); + memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys)); + memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs)); + + lnchildren -= n; + rnchildren += n; + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index, + __u64 key, __u64 ptr, int ncmax) +{ + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + nchildren = nilfs_btree_node_get_nchildren(node); + if (index < nchildren) { + memmove(dkeys + index + 1, dkeys + index, + (nchildren - index) * sizeof(*dkeys)); + memmove(dptrs + index + 1, dptrs + index, + (nchildren - index) * sizeof(*dptrs)); + } + dkeys[index] = cpu_to_le64(key); + dptrs[index] = cpu_to_le64(ptr); + nchildren++; + nilfs_btree_node_set_nchildren(node, nchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index, + __u64 *keyp, __u64 *ptrp, int ncmax) +{ + __u64 key; + __u64 ptr; + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + key = le64_to_cpu(dkeys[index]); + ptr = le64_to_cpu(dptrs[index]); + nchildren = nilfs_btree_node_get_nchildren(node); + if (keyp != NULL) + *keyp = key; + if (ptrp != NULL) + *ptrp = ptr; + + if (index < nchildren - 1) { + memmove(dkeys + index, dkeys + index + 1, + (nchildren - index - 1) * sizeof(*dkeys)); + memmove(dptrs + index, dptrs + index + 1, + (nchildren - index - 1) * sizeof(*dptrs)); + } + nchildren--; + nilfs_btree_node_set_nchildren(node, nchildren); +} + +static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, + __u64 key, int *indexp) +{ + __u64 nkey; + int index, low, high, s; + + /* binary search */ + low = 0; + high = nilfs_btree_node_get_nchildren(node) - 1; + index = 0; + s = 0; + while (low <= high) { + index = (low + high) / 2; + nkey = nilfs_btree_node_get_key(node, index); + if (nkey == key) { + s = 0; + goto out; + } else if (nkey < key) { + low = index + 1; + s = -1; + } else { + high = index - 1; + s = 1; + } + } + + /* adjust index */ + if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { + if (s > 0 && index > 0) + index--; + } else if (s < 0) + index++; + + out: + *indexp = index; + + return s == 0; +} + +/** + * nilfs_btree_node_broken - verify consistency of btree node + * @node: btree node block to be examined + * @size: node size (in bytes) + * @blocknr: block number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + */ +static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, + size_t size, sector_t blocknr) +{ + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX || + (flags & NILFS_BTREE_NODE_ROOT) || + nchildren < 0 || + nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { + printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): " + "level = %d, flags = 0x%x, nchildren = %d\n", + (unsigned long long)blocknr, level, flags, nchildren); + ret = 1; + } + return ret; +} + +/** + * nilfs_btree_root_broken - verify consistency of btree root node + * @node: btree root node to be examined + * @ino: inode number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + */ +static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, + unsigned long ino) +{ + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX || + nchildren < 0 || + nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { + pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", + ino, level, flags, nchildren); + ret = 1; + } + return ret; +} + +int nilfs_btree_broken_node_block(struct buffer_head *bh) +{ + int ret; + + if (buffer_nilfs_checked(bh)) + return 0; + + ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, + bh->b_size, bh->b_blocknr); + if (likely(!ret)) + set_buffer_nilfs_checked(bh); + return ret; +} + +static struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_bmap *btree) +{ + return (struct nilfs_btree_node *)btree->b_u.u_data; +} + +static struct nilfs_btree_node * +nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) +{ + return (struct nilfs_btree_node *)path[level].bp_bh->b_data; +} + +static struct nilfs_btree_node * +nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) +{ + return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; +} + +static int nilfs_btree_height(const struct nilfs_bmap *btree) +{ + return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; +} + +static struct nilfs_btree_node * +nilfs_btree_get_node(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path, + int level, int *ncmaxp) +{ + struct nilfs_btree_node *node; + + if (level == nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_root(btree); + *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX; + } else { + node = nilfs_btree_get_nonroot_node(path, level); + *ncmaxp = nilfs_btree_nchildren_per_block(btree); + } + return node; +} + +static int +nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) +{ + if (unlikely(nilfs_btree_node_get_level(node) != level)) { + dump_stack(); + printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n", + nilfs_btree_node_get_level(node), level); + return 1; + } + return 0; +} + +struct nilfs_btree_readahead_info { + struct nilfs_btree_node *node; /* parent node */ + int max_ra_blocks; /* max nof blocks to read ahead */ + int index; /* current index on the parent node */ + int ncmax; /* nof children in the parent node */ +}; + +static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp, + const struct nilfs_btree_readahead_info *ra) +{ + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct buffer_head *bh, *ra_bh; + sector_t submit_ptr = 0; + int ret; + + ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr); + if (ret) { + if (ret != -EEXIST) + return ret; + goto out_check; + } + + if (ra) { + int i, n; + __u64 ptr2; + + /* read ahead sibling nodes */ + for (n = ra->max_ra_blocks, i = ra->index + 1; + n > 0 && i < ra->ncmax; n--, i++) { + ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); + + ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA, + &ra_bh, &submit_ptr); + if (likely(!ret || ret == -EEXIST)) + brelse(ra_bh); + else if (ret != -EBUSY) + break; + if (!buffer_locked(bh)) + goto out_no_wait; + } + } + + wait_on_buffer(bh); + + out_no_wait: + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + + out_check: + if (nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + brelse(bh); + return -EINVAL; + } + + *bhp = bh; + return 0; +} + +static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp) +{ + return __nilfs_btree_get_block(btree, ptr, bhp, NULL); +} + +static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + __u64 key, __u64 *ptrp, int minlevel, + int readahead) +{ + struct nilfs_btree_node *node; + struct nilfs_btree_readahead_info p, *ra; + __u64 ptr; + int level, index, found, ncmax, ret; + + node = nilfs_btree_get_root(btree); + level = nilfs_btree_node_get_level(node); + if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) + return -ENOENT; + + found = nilfs_btree_node_lookup(node, key, &index); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + ncmax = nilfs_btree_nchildren_per_block(btree); + + while (--level >= minlevel) { + ra = NULL; + if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) { + p.node = nilfs_btree_get_node(btree, path, level + 1, + &p.ncmax); + p.index = index; + p.max_ra_blocks = 7; + ra = &p; + } + ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh, + ra); + if (ret < 0) + return ret; + + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; + if (!found) + found = nilfs_btree_node_lookup(node, key, &index); + else + index = 0; + if (index < ncmax) { + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); + } else { + WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); + /* insert */ + ptr = NILFS_BMAP_INVALID_PTR; + } + path[level].bp_index = index; + } + if (!found) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int index, level, ncmax, ret; + + node = nilfs_btree_get_root(btree); + index = nilfs_btree_node_get_nchildren(node) - 1; + if (index < 0) + return -ENOENT; + level = nilfs_btree_node_get_level(node); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + path[level].bp_bh = NULL; + path[level].bp_index = index; + ncmax = nilfs_btree_nchildren_per_block(btree); + + for (level--; level > 0; level--) { + ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; + index = nilfs_btree_node_get_nchildren(node) - 1; + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); + path[level].bp_index = index; + } + + if (keyp != NULL) + *keyp = nilfs_btree_node_get_key(node, index); + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +/** + * nilfs_btree_get_next_key - get next valid key from btree path array + * @btree: bmap struct of btree + * @path: array of nilfs_btree_path struct + * @minlevel: start level + * @nextkey: place to store the next valid key + * + * Return Value: If a next key was found, 0 is returned. Otherwise, + * -ENOENT is returned. + */ +static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path, + int minlevel, __u64 *nextkey) +{ + struct nilfs_btree_node *node; + int maxlevel = nilfs_btree_height(btree) - 1; + int index, next_adj, level; + + /* Next index is already set to bp_index for leaf nodes. */ + next_adj = 0; + for (level = minlevel; level <= maxlevel; level++) { + if (level == maxlevel) + node = nilfs_btree_get_root(btree); + else + node = nilfs_btree_get_nonroot_node(path, level); + + index = path[level].bp_index + next_adj; + if (index < nilfs_btree_node_get_nchildren(node)) { + /* Next key is in this node */ + *nextkey = nilfs_btree_node_get_key(node, index); + return 0; + } + /* For non-leaf nodes, next index is stored at bp_index + 1. */ + next_adj = 1; + } + return -ENOENT; +} + +static int nilfs_btree_lookup(const struct nilfs_bmap *btree, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_btree_path *path; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0); + + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, + __u64 key, __u64 *ptrp, unsigned maxblocks) +{ + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + struct inode *dat = NULL; + __u64 ptr, ptr2; + sector_t blocknr; + int level = NILFS_BTREE_LEVEL_NODE_MIN; + int ret, cnt, index, maxlevel, ncmax; + struct nilfs_btree_readahead_info p; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1); + if (ret < 0) + goto out; + + if (NILFS_BMAP_USE_VBN(btree)) { + dat = nilfs_bmap_get_dat(btree); + ret = nilfs_dat_translate(dat, ptr, &blocknr); + if (ret < 0) + goto out; + ptr = blocknr; + } + cnt = 1; + if (cnt == maxblocks) + goto end; + + maxlevel = nilfs_btree_height(btree) - 1; + node = nilfs_btree_get_node(btree, path, level, &ncmax); + index = path[level].bp_index + 1; + for (;;) { + while (index < nilfs_btree_node_get_nchildren(node)) { + if (nilfs_btree_node_get_key(node, index) != + key + cnt) + goto end; + ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax); + if (dat) { + ret = nilfs_dat_translate(dat, ptr2, &blocknr); + if (ret < 0) + goto out; + ptr2 = blocknr; + } + if (ptr2 != ptr + cnt || ++cnt == maxblocks) + goto end; + index++; + continue; + } + if (level == maxlevel) + break; + + /* look-up right sibling node */ + p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax); + p.index = path[level + 1].bp_index + 1; + p.max_ra_blocks = 7; + if (p.index >= nilfs_btree_node_get_nchildren(p.node) || + nilfs_btree_node_get_key(p.node, p.index) != key + cnt) + break; + ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax); + path[level + 1].bp_index = p.index; + + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + + ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh, + &p); + if (ret < 0) + goto out; + node = nilfs_btree_get_nonroot_node(path, level); + ncmax = nilfs_btree_nchildren_per_block(btree); + index = 0; + path[level].bp_index = index; + } + end: + *ptrp = ptr; + ret = cnt; + out: + nilfs_btree_free_path(path); + return ret; +} + +static void nilfs_btree_promote_key(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 key) +{ + if (level < nilfs_btree_height(btree) - 1) { + do { + nilfs_btree_node_set_key( + nilfs_btree_get_nonroot_node(path, level), + path[level].bp_index, key); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + } while ((path[level].bp_index == 0) && + (++level < nilfs_btree_height(btree) - 1)); + } + + /* root */ + if (level == nilfs_btree_height(btree) - 1) { + nilfs_btree_node_set_key(nilfs_btree_get_root(btree), + path[level].bp_index, key); + } +} + +static void nilfs_btree_do_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + int ncblk; + + if (level < nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, ncblk); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, + 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + } +} + +static void nilfs_btree_carry_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + lnchildren + 1) / 2 - lnchildren; + if (n > path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + + if (move) { + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += lnchildren; + path[level + 1].bp_index--; + } else { + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index -= n; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_carry_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + rnchildren + 1) / 2 - rnchildren; + if (n > nchildren - path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(right, 0)); + path[level + 1].bp_index--; + + if (move) { + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); + path[level + 1].bp_index++; + } else { + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_split(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + __u64 newkey; + __u64 newptr; + int nchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + 1) / 2; + if (n > nchildren - path[level].bp_index) { + n--; + move = 1; + } + + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + newkey = nilfs_btree_node_get_key(right, 0); + newptr = path[level].bp_newreq.bpr_ptr; + + if (move) { + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); + nilfs_btree_node_insert(right, path[level].bp_index, + *keyp, *ptrp, ncblk); + + *keyp = nilfs_btree_node_get_key(right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + } else { + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + path[level + 1].bp_index++; +} + +static void nilfs_btree_grow(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n, ncblk; + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(root); + + nilfs_btree_node_move_right(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); + nilfs_btree_node_set_level(root, level + 1); + + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(child, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; +} + +static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path) +{ + struct nilfs_btree_node *node; + int level, ncmax; + + if (path == NULL) + return NILFS_BMAP_INVALID_PTR; + + /* left sibling */ + level = NILFS_BTREE_LEVEL_NODE_MIN; + if (path[level].bp_index > 0) { + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, + path[level].bp_index - 1, + ncmax); + } + + /* parent */ + level = NILFS_BTREE_LEVEL_NODE_MIN + 1; + if (level <= nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, path[level].bp_index, + ncmax); + } + + return NILFS_BMAP_INVALID_PTR; +} + +static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path, + __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(btree, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else { + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + } + /* block group */ + return nilfs_bmap_find_target_in_group(btree); +} + +static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int *levelp, __u64 key, __u64 ptr, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ncmax, ncblk, ret; + struct inode *dat = NULL; + + stats->bs_nblocks = 0; + level = NILFS_BTREE_LEVEL_DATA; + + /* allocate a new ptr for data block */ + if (NILFS_BMAP_USE_VBN(btree)) { + path[level].bp_newreq.bpr_ptr = + nilfs_btree_find_target_v(btree, path, key); + dat = nilfs_bmap_get_dat(btree); + } + + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_data; + + ncblk = nilfs_btree_nchildren_per_block(btree); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_node_get_nchildren(node) < ncblk) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + pindex = path[level + 1].bp_index; + + /* left sibling */ + if (pindex > 0) { + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_left; + stats->bs_nblocks++; + goto out; + } else { + brelse(bh); + } + } + + /* right sibling */ + if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_right; + stats->bs_nblocks++; + goto out; + } else { + brelse(bh); + } + } + + /* split */ + path[level].bp_newreq.bpr_ptr = + path[level - 1].bp_newreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, + &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_btree_get_new_block(btree, + path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + stats->bs_nblocks++; + + sib = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_split; + } + + /* root */ + node = nilfs_btree_get_root(btree); + if (nilfs_btree_node_get_nchildren(node) < + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + /* grow */ + path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data, + 0, level, 0, ncblk, NULL, NULL); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_grow; + + level++; + path[level].bp_op = nilfs_btree_do_insert; + + /* a newly-created node block and a data block are added */ + stats->bs_nblocks += 2; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + err_out_child_node: + for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { + nilfs_btnode_delete(path[level].bp_sib_bh); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + + } + + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + err_out_data: + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int maxlevel, __u64 key, __u64 ptr) +{ + struct inode *dat = NULL; + int level; + + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; + if (NILFS_BMAP_USE_VBN(btree)) { + nilfs_bmap_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(btree); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + nilfs_bmap_commit_alloc_ptr(btree, + &path[level - 1].bp_newreq, dat); + path[level].bp_op(btree, path, level, &key, &ptr); + } + + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); +} + +static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr) +{ + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN, 0); + if (ret != -ENOENT) { + if (ret == 0) + ret = -EEXIST; + goto out; + } + + ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_insert(btree, path, level, key, ptr); + nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks); + + out: + nilfs_btree_free_path(path); + return ret; +} + +static void nilfs_btree_do_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + int ncblk; + + if (level < nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, ncblk); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + } +} + +static void nilfs_btree_borrow_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = (nchildren + lnchildren) / 2 - nchildren; + + nilfs_btree_node_move_right(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index += n; +} + +static void nilfs_btree_borrow_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = (nchildren + rnchildren) / 2 - nchildren; + + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(right, 0)); + path[level + 1].bp_index--; + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; +} + +static void nilfs_btree_concat_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(node); + + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btnode_delete(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += nilfs_btree_node_get_nchildren(left); +} + +static void nilfs_btree_concat_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(right); + + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + nilfs_btnode_delete(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level + 1].bp_index++; +} + +static void nilfs_btree_shrink(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + nilfs_btree_node_delete(root, 0, NULL, NULL, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + nilfs_btree_node_set_level(root, level); + n = nilfs_btree_node_get_nchildren(child); + nilfs_btree_node_move_left(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); + + nilfs_btnode_delete(path[level].bp_bh); + path[level].bp_bh = NULL; +} + +static void nilfs_btree_nop(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ +} + +static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int *levelp, + struct nilfs_bmap_stats *stats, + struct inode *dat) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, dindex, level, ncmin, ncmax, ncblk, ret; + + ret = 0; + stats->bs_nblocks = 0; + ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); + ncblk = nilfs_btree_nchildren_per_block(btree); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(path, level); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(node, dindex, ncblk); + ret = nilfs_bmap_prepare_end_ptr(btree, + &path[level].bp_oldreq, dat); + if (ret < 0) + goto err_out_child_node; + + if (nilfs_btree_node_get_nchildren(node) > ncmin) { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + pindex = path[level + 1].bp_index; + dindex = pindex; + + if (pindex > 0) { + /* left sibling */ + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_left; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_left; + stats->bs_nblocks++; + /* continue; */ + } + } else if (pindex < + nilfs_btree_node_get_nchildren(parent) - 1) { + /* right sibling */ + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_right; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_right; + stats->bs_nblocks++; + /* + * When merging right sibling node + * into the current node, pointer to + * the right sibling node must be + * terminated instead. The adjustment + * below is required for that. + */ + dindex = pindex + 1; + /* continue; */ + } + } else { + /* no siblings */ + /* the only child of the root node */ + WARN_ON(level != nilfs_btree_height(btree) - 2); + if (nilfs_btree_node_get_nchildren(node) - 1 <= + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_shrink; + stats->bs_nblocks += 2; + level++; + path[level].bp_op = nilfs_btree_nop; + goto shrink_root_child; + } else { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + } + } + + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + +shrink_root_child: + node = nilfs_btree_get_root(btree); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(node, dindex, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + + ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); + if (ret < 0) + goto err_out_child_node; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); + err_out_child_node: + for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { + brelse(path[level].bp_sib_bh); + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); + } + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int maxlevel, struct inode *dat) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat); + path[level].bp_op(btree, path, level, NULL, NULL); + } + + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); +} + +static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key) + +{ + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + struct inode *dat; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN, 0); + if (ret < 0) + goto out; + + + dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); + if (ret < 0) + goto out; + nilfs_btree_commit_delete(btree, path, level, dat); + nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks); + +out: + nilfs_btree_free_path(path); + return ret; +} + +static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start, + __u64 *keyp) +{ + struct nilfs_btree_path *path; + const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN; + int ret; + + path = nilfs_btree_alloc_path(); + if (!path) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, start, NULL, minlevel, 0); + if (!ret) + *keyp = start; + else if (ret == -ENOENT) + ret = nilfs_btree_get_next_key(btree, path, minlevel, keyp); + + nilfs_btree_free_path(path); + return ret; +} + +static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp) +{ + struct nilfs_btree_path *path; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); + + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) +{ + struct buffer_head *bh; + struct nilfs_btree_node *root, *node; + __u64 maxkey, nextmaxkey; + __u64 ptr; + int nchildren, ret; + + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(root); + if (nchildren > 1) + return 0; + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + return 0; + } + + nchildren = nilfs_btree_node_get_nchildren(node); + maxkey = nilfs_btree_node_get_key(node, nchildren - 1); + nextmaxkey = (nchildren > 1) ? + nilfs_btree_node_get_key(node, nchildren - 2) : 0; + if (bh != NULL) + brelse(bh); + + return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); +} + +static int nilfs_btree_gather_data(struct nilfs_bmap *btree, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *root; + __le64 *dkeys; + __le64 *dptrs; + __u64 ptr; + int nchildren, ncmax, i, ret; + + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(root); + WARN_ON(nchildren > 1); + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + ncmax = nilfs_btree_nchildren_per_block(btree); + break; + default: + node = NULL; + return -EINVAL; + } + + nchildren = nilfs_btree_node_get_nchildren(node); + if (nchildren < nitems) + nitems = nchildren; + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + for (i = 0; i < nitems; i++) { + keys[i] = le64_to_cpu(dkeys[i]); + ptrs[i] = le64_to_cpu(dptrs[i]); + } + + if (bh != NULL) + brelse(bh); + + return nitems; +} + +static int +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head **bhp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct inode *dat = NULL; + int ret; + + stats->bs_nblocks = 0; + + /* for data */ + /* cannot find near ptr */ + if (NILFS_BMAP_USE_VBN(btree)) { + dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); + dat = nilfs_bmap_get_dat(btree); + } + + ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); + if (ret < 0) + return ret; + + *bhp = NULL; + stats->bs_nblocks++; + if (nreq != NULL) { + nreq->bpr_ptr = dreq->bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat); + if (ret < 0) + goto err_out_dreq; + + ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh); + if (ret < 0) + goto err_out_nreq; + + *bhp = bh; + stats->bs_nblocks++; + } + + /* success */ + return 0; + + /* error */ + err_out_nreq: + nilfs_bmap_abort_alloc_ptr(btree, nreq, dat); + err_out_dreq: + nilfs_bmap_abort_alloc_ptr(btree, dreq, dat); + stats->bs_nblocks = 0; + return ret; + +} + +static void +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head *bh) +{ + struct nilfs_btree_node *node; + struct inode *dat; + __u64 tmpptr; + int ncblk; + + /* free resources */ + if (btree->b_ops->bop_clear != NULL) + btree->b_ops->bop_clear(btree); + + /* ptr must be a pointer to a buffer head. */ + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + + /* convert and insert */ + dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; + __nilfs_btree_init(btree); + if (nreq != NULL) { + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); + + /* create child node at level 1 */ + node = (struct nilfs_btree_node *)bh->b_data; + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk); + if (!buffer_dirty(bh)) + mark_buffer_dirty(bh); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + + brelse(bh); + + /* create root node at level 2 */ + node = nilfs_btree_get_root(btree); + tmpptr = nreq->bpr_ptr; + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + &keys[0], &tmpptr); + } else { + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); + + /* create root node at level 1 */ + node = nilfs_btree_get_root(btree); + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + } + + if (NILFS_BMAP_USE_VBN(btree)) + nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr); +} + +/** + * nilfs_btree_convert_and_insert - + * @bmap: + * @key: + * @ptr: + * @keys: + * @ptrs: + * @n: + */ +int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, int n) +{ + struct buffer_head *bh; + union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; + struct nilfs_bmap_stats stats; + int ret; + + if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { + di = &dreq; + ni = NULL; + } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( + 1 << btree->b_inode->i_blkbits)) { + di = &dreq; + ni = &nreq; + } else { + di = NULL; + ni = NULL; + BUG(); + } + + ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh, + &stats); + if (ret < 0) + return ret; + nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, + di, ni, bh); + nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks); + return 0; +} + +static int nilfs_btree_propagate_p(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + return 0; +} + +static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct inode *dat) +{ + struct nilfs_btree_node *parent; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; + ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(path[level].bp_bh)) { + path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr; + path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; + path[level].bp_ctxt.bh = path[level].bp_bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) { + nilfs_dat_abort_update(dat, + &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + return ret; + } + } + + return 0; +} + +static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct inode *dat) +{ + struct nilfs_btree_node *parent; + int ncmax; + + nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req, + btree->b_ptr_type == NILFS_BMAP_PTR_VS); + + if (buffer_nilfs_node(path[level].bp_bh)) { + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + path[level].bp_bh = path[level].bp_ctxt.bh; + } + set_buffer_nilfs_volatile(path[level].bp_bh); + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr, ncmax); +} + +static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct inode *dat) +{ + nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + if (buffer_nilfs_node(path[level].bp_bh)) + nilfs_btnode_abort_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); +} + +static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int minlevel, int *maxlevelp, + struct inode *dat) +{ + int level, ret; + + level = minlevel; + if (!buffer_nilfs_volatile(path[level].bp_bh)) { + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); + if (ret < 0) + return ret; + } + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) { + + WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); + if (ret < 0) + goto out; + } + + /* success */ + *maxlevelp = level - 1; + return 0; + + /* error */ + out: + while (--level > minlevel) + nilfs_btree_abort_update_v(btree, path, level, dat); + if (!buffer_nilfs_volatile(path[level].bp_bh)) + nilfs_btree_abort_update_v(btree, path, level, dat); + return ret; +} + +static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int minlevel, int maxlevel, + struct buffer_head *bh, + struct inode *dat) +{ + int level; + + if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) + nilfs_btree_commit_update_v(btree, path, minlevel, dat); + + for (level = minlevel + 1; level <= maxlevel; level++) + nilfs_btree_commit_update_v(btree, path, level, dat); +} + +static int nilfs_btree_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct buffer_head *bh) +{ + int maxlevel = 0, ret; + struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(btree); + __u64 ptr; + int ncmax; + + get_bh(bh); + path[level].bp_bh = bh; + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, + dat); + if (ret < 0) + goto out; + + if (buffer_nilfs_volatile(path[level].bp_bh)) { + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, + path[level + 1].bp_index, + ncmax); + ret = nilfs_dat_mark_dirty(dat, ptr); + if (ret < 0) + goto out; + } + + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); + + out: + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + return ret; +} + +static int nilfs_btree_propagate(struct nilfs_bmap *btree, + struct buffer_head *bh) +{ + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + WARN_ON(!buffer_dirty(bh)); + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + if (buffer_nilfs_node(bh)) { + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + } else { + key = nilfs_bmap_data_get_key(btree, bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); + if (ret < 0) { + if (unlikely(ret == -ENOENT)) + printk(KERN_CRIT "%s: key = %llu, level == %d\n", + __func__, (unsigned long long)key, level); + goto out; + } + + ret = NILFS_BMAP_USE_VBN(btree) ? + nilfs_btree_propagate_v(btree, path, level, bh) : + nilfs_btree_propagate_p(btree, path, level, bh); + + out: + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree, + struct buffer_head *bh) +{ + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr); +} + +static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, + struct list_head *lists, + struct buffer_head *bh) +{ + struct list_head *head; + struct buffer_head *cbh; + struct nilfs_btree_node *node, *cnode; + __u64 key, ckey; + int level; + + get_bh(bh); + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + if (level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX) { + dump_stack(); + printk(KERN_WARNING + "%s: invalid btree level: %d (key=%llu, ino=%lu, " + "blocknr=%llu)\n", + __func__, level, (unsigned long long)key, + NILFS_BMAP_I(btree)->vfs_inode.i_ino, + (unsigned long long)bh->b_blocknr); + return; + } + + list_for_each(head, &lists[level]) { + cbh = list_entry(head, struct buffer_head, b_assoc_buffers); + cnode = (struct nilfs_btree_node *)cbh->b_data; + ckey = nilfs_btree_node_get_key(cnode, 0); + if (key < ckey) + break; + } + list_add_tail(&bh->b_assoc_buffers, head); +} + +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, + struct list_head *listp) +{ + struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct list_head lists[NILFS_BTREE_LEVEL_MAX]; + struct pagevec pvec; + struct buffer_head *bh, *head; + pgoff_t index = 0; + int level, i; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + INIT_LIST_HEAD(&lists[level]); + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) + nilfs_btree_add_dirty_buffer(btree, + lists, bh); + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + cond_resched(); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + list_splice_tail(&lists[level], listp); +} + +static int nilfs_btree_assign_p(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + if (buffer_nilfs_node(*bh)) { + path[level].bp_ctxt.oldkey = ptr; + path[level].bp_ctxt.newkey = blocknr; + path[level].bp_ctxt.bh = *bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) + return ret; + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + *bh = path[level].bp_ctxt.bh; + } + + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr, + ncmax); + + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); + binfo->bi_dat.bi_level = level; + + return 0; +} + +static int nilfs_btree_assign_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(btree); + __u64 key; + __u64 ptr; + union nilfs_bmap_ptr_req req; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + req.bpr_ptr = ptr; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (ret < 0) + return ret; + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + + return 0; +} + +static int nilfs_btree_assign(struct nilfs_bmap *btree, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + } else { + key = nilfs_bmap_data_get_key(btree, *bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + ret = NILFS_BMAP_USE_VBN(btree) ? + nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : + nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); + + out: + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_assign_gc(struct nilfs_bmap *btree, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *node; + __u64 key; + int ret; + + ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr, + blocknr); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(node, 0); + } else + key = nilfs_bmap_data_get_key(btree, *bh); + + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + + return 0; +} + +static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) +{ + struct buffer_head *bh; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + if (!buffer_dirty(bh)) + mark_buffer_dirty(bh); + brelse(bh); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + + out: + nilfs_btree_free_path(path); + return ret; +} + +static const struct nilfs_bmap_operations nilfs_btree_ops = { + .bop_lookup = nilfs_btree_lookup, + .bop_lookup_contig = nilfs_btree_lookup_contig, + .bop_insert = nilfs_btree_insert, + .bop_delete = nilfs_btree_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign, + .bop_mark = nilfs_btree_mark, + + .bop_seek_key = nilfs_btree_seek_key, + .bop_last_key = nilfs_btree_last_key, + + .bop_check_insert = NULL, + .bop_check_delete = nilfs_btree_check_delete, + .bop_gather_data = nilfs_btree_gather_data, +}; + +static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { + .bop_lookup = NULL, + .bop_lookup_contig = NULL, + .bop_insert = NULL, + .bop_delete = NULL, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate_gc, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign_gc, + .bop_mark = NULL, + + .bop_seek_key = NULL, + .bop_last_key = NULL, + + .bop_check_insert = NULL, + .bop_check_delete = NULL, + .bop_gather_data = NULL, +}; + +static void __nilfs_btree_init(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_btree_ops; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); +} + +int nilfs_btree_init(struct nilfs_bmap *bmap) +{ + int ret = 0; + + __nilfs_btree_init(bmap); + + if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), + bmap->b_inode->i_ino)) + ret = -EIO; + return ret; +} + +void nilfs_btree_init_gc(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_btree_ops_gc; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); +} diff --git a/kernel/fs/nilfs2/btree.h b/kernel/fs/nilfs2/btree.h new file mode 100644 index 000000000..22c02e35b --- /dev/null +++ b/kernel/fs/nilfs2/btree.h @@ -0,0 +1,77 @@ +/* + * btree.h - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_BTREE_H +#define _NILFS_BTREE_H + +#include +#include +#include +#include +#include "btnode.h" +#include "bmap.h" + +/** + * struct nilfs_btree_path - A path on which B-tree operations are executed + * @bp_bh: buffer head of node block + * @bp_sib_bh: buffer head of sibling node block + * @bp_index: index of child node + * @bp_oldreq: ptr end request for old ptr + * @bp_newreq: ptr alloc request for new ptr + * @bp_op: rebalance operation + */ +struct nilfs_btree_path { + struct buffer_head *bp_bh; + struct buffer_head *bp_sib_bh; + int bp_index; + union nilfs_bmap_ptr_req bp_oldreq; + union nilfs_bmap_ptr_req bp_newreq; + struct nilfs_btnode_chkey_ctxt bp_ctxt; + void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *, + int, __u64 *, __u64 *); +}; + +#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE +#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ + ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0 +#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64)) +#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \ + (((nodesize) - sizeof(struct nilfs_btree_node) - \ + NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \ + ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1) +#define NILFS_BTREE_KEY_MIN ((__u64)0) +#define NILFS_BTREE_KEY_MAX (~(__u64)0) + +extern struct kmem_cache *nilfs_btree_path_cache; + +int nilfs_btree_init(struct nilfs_bmap *); +int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, + const __u64 *, const __u64 *, int); +void nilfs_btree_init_gc(struct nilfs_bmap *); + +int nilfs_btree_broken_node_block(struct buffer_head *bh); + +#endif /* _NILFS_BTREE_H */ diff --git a/kernel/fs/nilfs2/cpfile.c b/kernel/fs/nilfs2/cpfile.c new file mode 100644 index 000000000..b6596cab9 --- /dev/null +++ b/kernel/fs/nilfs2/cpfile.c @@ -0,0 +1,1027 @@ +/* + * cpfile.c - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include +#include +#include +#include +#include +#include "mdt.h" +#include "cpfile.h" + + +static inline unsigned long +nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) +{ + return NILFS_MDT(cpfile)->mi_entries_per_block; +} + +/* block number from the beginning of the file */ +static unsigned long +nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); + return (unsigned long)tcno; +} + +/* offset in block */ +static unsigned long +nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); +} + +static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile, + unsigned long blkoff) +{ + return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff + + 1 - NILFS_MDT(cpfile)->mi_first_entry_offset; +} + +static unsigned long +nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, + __u64 curr, + __u64 max) +{ + return min_t(__u64, + nilfs_cpfile_checkpoints_per_block(cpfile) - + nilfs_cpfile_get_offset(cpfile, curr), + max - curr); +} + +static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, + __u64 cno) +{ + return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; +} + +static unsigned int +nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + count = le32_to_cpu(cp->cp_checkpoints_count) + n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static unsigned int +nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); + count = le32_to_cpu(cp->cp_checkpoints_count) - n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static inline struct nilfs_cpfile_header * +nilfs_cpfile_block_get_header(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_checkpoint * +nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} + +static void nilfs_cpfile_block_init(struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + int n = nilfs_cpfile_checkpoints_per_block(cpfile); + + while (n-- > 0) { + nilfs_checkpoint_set_invalid(cp); + cp = (void *)cp + cpsz; + } +} + +static inline int nilfs_cpfile_get_header_block(struct inode *cpfile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); +} + +static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, + __u64 cno, + int create, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno), + create, nilfs_cpfile_block_init, bhp); +} + +/** + * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile + * @cpfile: inode of cpfile + * @start_cno: start checkpoint number (inclusive) + * @end_cno: end checkpoint number (inclusive) + * @cnop: place to store the next checkpoint number + * @bhp: place to store a pointer to buffer_head struct + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - no block exists in the range. + */ +static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile, + __u64 start_cno, __u64 end_cno, + __u64 *cnop, + struct buffer_head **bhp) +{ + unsigned long start, end, blkoff; + int ret; + + if (unlikely(start_cno > end_cno)) + return -ENOENT; + + start = nilfs_cpfile_get_blkoff(cpfile, start_cno); + end = nilfs_cpfile_get_blkoff(cpfile, end_cno); + + ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp); + if (!ret) + *cnop = (blkoff == start) ? start_cno : + nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff); + return ret; +} + +static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, + __u64 cno) +{ + return nilfs_mdt_delete_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno)); +} + +/** + * nilfs_cpfile_get_checkpoint - get a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @create: create flag + * @cpp: pointer to a checkpoint + * @bhp: pointer to a buffer head + * + * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint + * specified by @cno. A new checkpoint will be created if @cno is the current + * checkpoint number and @create is nonzero. + * + * Return Value: On success, 0 is returned, and the checkpoint and the + * buffer head of the buffer on which the checkpoint is located are stored in + * the place pointed by @cpp and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + * + * %-EINVAL - invalid checkpoint. + */ +int nilfs_cpfile_get_checkpoint(struct inode *cpfile, + __u64 cno, + int create, + struct nilfs_checkpoint **cpp, + struct buffer_head **bhp) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || + (cno < nilfs_mdt_cno(cpfile) && create))) + return -EINVAL; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); + if (ret < 0) + goto out_header; + kaddr = kmap(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + if (!create) { + kunmap(cp_bh->b_page); + brelse(cp_bh); + ret = -ENOENT; + goto out_header; + } + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + mark_buffer_dirty(cp_bh); + + kaddr = kmap_atomic(header_bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_atomic(kaddr); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + } + + if (cpp != NULL) + *cpp = cp; + *bhp = cp_bh; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_put_checkpoint - put a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @bh: buffer head + * + * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint + * specified by @cno. @bh must be the buffer head which has been returned by + * a previous call to nilfs_cpfile_get_checkpoint() with @cno. + */ +void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_cpfile_delete_checkpoints - delete checkpoints + * @cpfile: inode of checkpoint file + * @start: start checkpoint number + * @end: end checkpoint numer + * + * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in + * the period from @start to @end, excluding @end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ +int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, + __u64 start, + __u64 end) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cno; + void *kaddr; + unsigned long tnicps; + int ret, ncps, nicps, nss, count, i; + + if (unlikely(start == 0 || start > end)) { + printk(KERN_ERR "%s: invalid range of checkpoint numbers: " + "[%llu, %llu)\n", __func__, + (unsigned long long)start, (unsigned long long)end); + return -EINVAL; + } + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + tnicps = 0; + nss = 0; + + for (cno = start; cno < end; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) { + if (ret != -ENOENT) + break; + /* skip hole */ + ret = 0; + continue; + } + + kaddr = kmap_atomic(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, cno, cp_bh, kaddr); + nicps = 0; + for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { + if (nilfs_checkpoint_snapshot(cp)) { + nss++; + } else if (!nilfs_checkpoint_invalid(cp)) { + nilfs_checkpoint_set_invalid(cp); + nicps++; + } + } + if (nicps > 0) { + tnicps += nicps; + mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) { + count = + nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps); + if (count == 0) { + /* make hole */ + kunmap_atomic(kaddr); + brelse(cp_bh); + ret = + nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR + "%s: cannot delete block\n", + __func__); + break; + } + } + } + + kunmap_atomic(kaddr); + brelse(cp_bh); + } + + if (tnicps > 0) { + kaddr = kmap_atomic(header_bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + kunmap_atomic(kaddr); + } + + brelse(header_bh); + if (nss > 0) + ret = -EBUSY; + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, + struct nilfs_checkpoint *cp, + struct nilfs_cpinfo *ci) +{ + ci->ci_flags = le32_to_cpu(cp->cp_flags); + ci->ci_cno = le64_to_cpu(cp->cp_cno); + ci->ci_create = le64_to_cpu(cp->cp_create); + ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); + ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); + ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); + ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); +} + +static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, + void *buf, unsigned cisz, size_t nci) +{ + struct nilfs_checkpoint *cp; + struct nilfs_cpinfo *ci = buf; + struct buffer_head *bh; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + void *kaddr; + int n, ret; + int ncps, i; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + for (n = 0; n < nci; cno += ncps) { + ret = nilfs_cpfile_find_checkpoint_block( + cpfile, cno, cur_cno - 1, &cno, &bh); + if (ret < 0) { + if (likely(ret == -ENOENT)) + break; + goto out; + } + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); + + kaddr = kmap_atomic(bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, + ci); + ci = (void *)ci + cisz; + n++; + } + } + kunmap_atomic(kaddr); + brelse(bh); + } + + ret = n; + if (n > 0) { + ci = (void *)ci - cisz; + *cnop = ci->ci_cno + 1; + } + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, + void *buf, unsigned cisz, size_t nci) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_cpinfo *ci = buf; + __u64 curr = *cnop, next; + unsigned long curr_blkoff, next_blkoff; + void *kaddr; + int n = 0, ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + if (curr == 0) { + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); + kunmap_atomic(kaddr); + brelse(bh); + if (curr == 0) { + ret = 0; + goto out; + } + } else if (unlikely(curr == ~(__u64)0)) { + ret = 0; + goto out; + } + + curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = 0; /* No snapshots (started from a hole block) */ + goto out; + } + kaddr = kmap_atomic(bh->b_page); + while (n < nci) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); + curr = ~(__u64)0; /* Terminator */ + if (unlikely(nilfs_checkpoint_invalid(cp) || + !nilfs_checkpoint_snapshot(cp))) + break; + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci); + ci = (void *)ci + cisz; + n++; + next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); + if (next == 0) + break; /* reach end of the snapshot list */ + + next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); + if (curr_blkoff != next_blkoff) { + kunmap_atomic(kaddr); + brelse(bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, + 0, &bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -ENOENT); + goto out; + } + kaddr = kmap_atomic(bh->b_page); + } + curr = next; + curr_blkoff = next_blkoff; + } + kunmap_atomic(kaddr); + brelse(bh); + *cnop = curr; + ret = n; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_get_cpinfo - + * @cpfile: + * @cno: + * @ci: + * @nci: + */ + +ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, + void *buf, unsigned cisz, size_t nci) +{ + switch (mode) { + case NILFS_CHECKPOINT: + return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci); + case NILFS_SNAPSHOT: + return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_delete_checkpoint - + * @cpfile: + * @cno: + */ +int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct nilfs_cpinfo ci; + __u64 tcno = cno; + ssize_t nci; + + nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1); + if (nci < 0) + return nci; + else if (nci == 0 || ci.ci_cno != cno) + return -ENOENT; + else if (nilfs_cpinfo_snapshot(&ci)) + return -EBUSY; + + return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); +} + +static struct nilfs_snapshot_list * +nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + + if (cno != 0) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + list = &cp->cp_snapshot_list; + } else { + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + list = &header->ch_snapshot_list; + } + return list; +} + +static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 curr, prev; + unsigned long curr_blkoff, prev_blkoff; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr); + goto out_cp; + } + if (nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr); + goto out_cp; + } + kunmap_atomic(kaddr); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + kaddr = kmap_atomic(header_bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + list = &header->ch_snapshot_list; + curr_bh = header_bh; + get_bh(curr_bh); + curr = 0; + curr_blkoff = 0; + prev = le64_to_cpu(list->ssl_prev); + while (prev > cno) { + prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); + curr = prev; + if (curr_blkoff != prev_blkoff) { + kunmap_atomic(kaddr); + brelse(curr_bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, + 0, &curr_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(curr_bh->b_page); + } + curr_blkoff = prev_blkoff; + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, curr, curr_bh, kaddr); + list = &cp->cp_snapshot_list; + prev = le64_to_cpu(list->ssl_prev); + } + kunmap_atomic(kaddr); + + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_curr; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(curr_bh->b_page); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, curr, curr_bh, kaddr); + list->ssl_prev = cpu_to_le64(cno); + kunmap_atomic(kaddr); + + kaddr = kmap_atomic(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); + nilfs_checkpoint_set_snapshot(cp); + kunmap_atomic(kaddr); + + kaddr = kmap_atomic(prev_bh->b_page); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(cno); + kunmap_atomic(kaddr); + + kaddr = kmap_atomic(header_bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, 1); + kunmap_atomic(kaddr); + + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(curr_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_curr: + brelse(curr_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 next, prev; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr); + goto out_cp; + } + if (!nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr); + goto out_cp; + } + + list = &cp->cp_snapshot_list; + next = le64_to_cpu(list->ssl_next); + prev = le64_to_cpu(list->ssl_prev); + kunmap_atomic(kaddr); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + if (next != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, + &next_bh); + if (ret < 0) + goto out_header; + } else { + next_bh = header_bh; + get_bh(next_bh); + } + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_next; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(next_bh->b_page); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, next, next_bh, kaddr); + list->ssl_prev = cpu_to_le64(prev); + kunmap_atomic(kaddr); + + kaddr = kmap_atomic(prev_bh->b_page); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(next); + kunmap_atomic(kaddr); + + kaddr = kmap_atomic(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); + nilfs_checkpoint_clear_snapshot(cp); + kunmap_atomic(kaddr); + + kaddr = kmap_atomic(header_bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, -1); + kunmap_atomic(kaddr); + + mark_buffer_dirty(next_bh); + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_next: + brelse(next_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_is_snapshot - + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * + * Description: + * + * Return Value: On success, 1 is returned if the checkpoint specified by + * @cno is a snapshot, or 0 if not. On error, one of the following negative + * error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + /* CP number is invalid if it's zero or larger than the + largest exist one.*/ + if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) + return -ENOENT; + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) + ret = -ENOENT; + else + ret = nilfs_checkpoint_snapshot(cp); + kunmap_atomic(kaddr); + brelse(bh); + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_change_cpmode - change checkpoint mode + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @status: mode of checkpoint + * + * Description: nilfs_change_cpmode() changes the mode of the checkpoint + * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) +{ + int ret; + + switch (mode) { + case NILFS_CHECKPOINT: + if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno)) + /* + * Current implementation does not have to protect + * plain read-only mounts since they are exclusive + * with a read/write mount and are protected from the + * cleaner. + */ + ret = -EBUSY; + else + ret = nilfs_cpfile_clear_snapshot(cpfile, cno); + return ret; + case NILFS_SNAPSHOT: + return nilfs_cpfile_set_snapshot(cpfile, cno); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_get_stat - get checkpoint statistics + * @cpfile: inode of checkpoint file + * @stat: pointer to a structure of checkpoint statistics + * + * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * stored in the place pointed by @stat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + cpstat->cs_cno = nilfs_mdt_cno(cpfile); + cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); + cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); + kunmap_atomic(kaddr); + brelse(bh); + + out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_read - read or get cpfile inode + * @sb: super block instance + * @cpsize: size of a checkpoint entry + * @raw_inode: on-disk cpfile inode + * @inodep: buffer to store the inode + */ +int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + struct inode *cpfile; + int err; + + if (cpsize > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large checkpoint size: %zu bytes.\n", + cpsize); + return -EINVAL; + } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) { + printk(KERN_ERR + "NILFS: too small checkpoint size: %zu bytes.\n", + cpsize); + return -EINVAL; + } + + cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); + if (unlikely(!cpfile)) + return -ENOMEM; + if (!(cpfile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0); + if (err) + goto failed; + + nilfs_mdt_set_entry_size(cpfile, cpsize, + sizeof(struct nilfs_cpfile_header)); + + err = nilfs_read_inode_common(cpfile, raw_inode); + if (err) + goto failed; + + unlock_new_inode(cpfile); + out: + *inodep = cpfile; + return 0; + failed: + iget_failed(cpfile); + return err; +} diff --git a/kernel/fs/nilfs2/cpfile.h b/kernel/fs/nilfs2/cpfile.h new file mode 100644 index 000000000..a242b9a31 --- /dev/null +++ b/kernel/fs/nilfs2/cpfile.h @@ -0,0 +1,46 @@ +/* + * cpfile.h - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_CPFILE_H +#define _NILFS_CPFILE_H + +#include +#include +#include + + +int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, + struct nilfs_checkpoint **, + struct buffer_head **); +void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); +int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); +int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); +int nilfs_cpfile_is_snapshot(struct inode *, __u64); +int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, + size_t); + +int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, + struct nilfs_inode *raw_inode, struct inode **inodep); + +#endif /* _NILFS_CPFILE_H */ diff --git a/kernel/fs/nilfs2/dat.c b/kernel/fs/nilfs2/dat.c new file mode 100644 index 000000000..0d5fada91 --- /dev/null +++ b/kernel/fs/nilfs2/dat.c @@ -0,0 +1,529 @@ +/* + * dat.c - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "dat.h" + + +#define NILFS_CNO_MIN ((__u64)1) +#define NILFS_CNO_MAX (~(__u64)0) + +/** + * struct nilfs_dat_info - on-memory private data of DAT file + * @mi: on-memory private data of metadata file + * @palloc_cache: persistent object allocator cache of DAT file + * @shadow: shadow map of DAT file + */ +struct nilfs_dat_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; + struct nilfs_shadow_map shadow; +}; + +static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) +{ + return (struct nilfs_dat_info *)NILFS_MDT(dat); +} + +static int nilfs_dat_prepare_entry(struct inode *dat, + struct nilfs_palloc_req *req, int create) +{ + return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); +} + +static void nilfs_dat_commit_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + mark_buffer_dirty(req->pr_entry_bh); + nilfs_mdt_mark_dirty(dat); + brelse(req->pr_entry_bh); +} + +static void nilfs_dat_abort_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_entry_bh); +} + +int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_alloc_entry(dat, req); + if (ret < 0) + return ret; + + ret = nilfs_dat_prepare_entry(dat, req, 1); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(dat, req); + + return ret; +} + +void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MAX); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr); + + nilfs_palloc_commit_alloc_entry(dat, req); + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_alloc_entry(dat, req); +} + +static void nilfs_dat_commit_free(struct inode *dat, + struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MIN); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr); + + nilfs_dat_commit_entry(dat, req); + nilfs_palloc_commit_free_entry(dat, req); +} + +int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + WARN_ON(ret == -ENOENT); + return ret; +} + +void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, + sector_t blocknr) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr); + + nilfs_dat_commit_entry(dat, req); +} + +int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + + kaddr = kmap_atomic(req->pr_entry_bh->b_page); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr); + + if (blocknr == 0) { + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) { + nilfs_dat_abort_entry(dat, req); + return ret; + } + } + + return 0; +} + +void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, + int dead) +{ + struct nilfs_dat_entry *entry; + __u64 start, end; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + end = start = le64_to_cpu(entry->de_start); + if (!dead) { + end = nilfs_mdt_cno(dat); + WARN_ON(start > end); + } + entry->de_end = cpu_to_le64(end); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr); + + if (blocknr == 0) + nilfs_dat_commit_free(dat, req); + else + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr); + + if (start == nilfs_mdt_cno(dat) && blocknr == 0) + nilfs_palloc_abort_free_entry(dat, req); + nilfs_dat_abort_entry(dat, req); +} + +int nilfs_dat_prepare_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + int ret; + + ret = nilfs_dat_prepare_end(dat, oldreq); + if (!ret) { + ret = nilfs_dat_prepare_alloc(dat, newreq); + if (ret < 0) + nilfs_dat_abort_end(dat, oldreq); + } + return ret; +} + +void nilfs_dat_commit_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq, int dead) +{ + nilfs_dat_commit_end(dat, oldreq, dead); + nilfs_dat_commit_alloc(dat, newreq); +} + +void nilfs_dat_abort_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + nilfs_dat_abort_end(dat, oldreq); + nilfs_dat_abort_alloc(dat, newreq); +} + +/** + * nilfs_dat_mark_dirty - + * @dat: DAT file inode + * @vblocknr: virtual block number + * + * Description: + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = vblocknr; + ret = nilfs_dat_prepare_entry(dat, &req, 0); + if (ret == 0) + nilfs_dat_commit_entry(dat, &req); + return ret; +} + +/** + * nilfs_dat_freev - free virtual block numbers + * @dat: DAT file inode + * @vblocknrs: array of virtual block numbers + * @nitems: number of virtual block numbers + * + * Description: nilfs_dat_freev() frees the virtual block numbers specified by + * @vblocknrs and @nitems. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. + */ +int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) +{ + return nilfs_palloc_freev(dat, vblocknrs, nitems); +} + +/** + * nilfs_dat_move - change a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknr: block number + * + * Description: nilfs_dat_move() changes the block number associated with + * @vblocknr to @blocknr. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + /* + * The given disk block number (blocknr) is not yet written to + * the device at this point. + * + * To prevent nilfs_dat_translate() from returning the + * uncommitted block number, this makes a copy of the entry + * buffer and redirects nilfs_dat_translate() to the copy. + */ + if (!buffer_nilfs_redirected(entry_bh)) { + ret = nilfs_mdt_freeze_buffer(dat, entry_bh); + if (ret) { + brelse(entry_bh); + return ret; + } + } + + kaddr = kmap_atomic(entry_bh->b_page); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { + printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, + (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); + kunmap_atomic(kaddr); + brelse(entry_bh); + return -EINVAL; + } + WARN_ON(blocknr == 0); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr); + + mark_buffer_dirty(entry_bh); + nilfs_mdt_mark_dirty(dat); + + brelse(entry_bh); + + return 0; +} + +/** + * nilfs_dat_translate - translate a virtual block number to a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknrp: pointer to a block number + * + * Description: nilfs_dat_translate() maps the virtual block number @vblocknr + * to the corresponding block number. + * + * Return Value: On success, 0 is returned and the block number associated + * with @vblocknr is stored in the place pointed by @blocknrp. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A block number associated with @vblocknr does not exist. + */ +int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) +{ + struct buffer_head *entry_bh, *bh; + struct nilfs_dat_entry *entry; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) { + bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh); + if (bh) { + WARN_ON(!buffer_uptodate(bh)); + brelse(entry_bh); + entry_bh = bh; + } + } + + kaddr = kmap_atomic(entry_bh->b_page); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + blocknr = le64_to_cpu(entry->de_blocknr); + if (blocknr == 0) { + ret = -ENOENT; + goto out; + } + *blocknrp = blocknr; + + out: + kunmap_atomic(kaddr); + brelse(entry_bh); + return ret; +} + +ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, + size_t nvi) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + struct nilfs_vinfo *vinfo = buf; + __u64 first, last; + void *kaddr; + unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + int i, j, n, ret; + + for (i = 0; i < nvi; i += n) { + ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr, + 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page); + /* last virtual block number in this block */ + first = vinfo->vi_vblocknr; + do_div(first, entries_per_block); + first *= entries_per_block; + last = first + entries_per_block - 1; + for (j = i, n = 0; + j < nvi && vinfo->vi_vblocknr >= first && + vinfo->vi_vblocknr <= last; + j++, n++, vinfo = (void *)vinfo + visz) { + entry = nilfs_palloc_block_get_entry( + dat, vinfo->vi_vblocknr, entry_bh, kaddr); + vinfo->vi_start = le64_to_cpu(entry->de_start); + vinfo->vi_end = le64_to_cpu(entry->de_end); + vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); + } + kunmap_atomic(kaddr); + brelse(entry_bh); + } + + return nvi; +} + +/** + * nilfs_dat_read - read or get dat inode + * @sb: super block instance + * @entry_size: size of a dat entry + * @raw_inode: on-disk dat inode + * @inodep: buffer to store the inode + */ +int nilfs_dat_read(struct super_block *sb, size_t entry_size, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + static struct lock_class_key dat_lock_key; + struct inode *dat; + struct nilfs_dat_info *di; + int err; + + if (entry_size > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large DAT entry size: %zu bytes.\n", + entry_size); + return -EINVAL; + } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) { + printk(KERN_ERR + "NILFS: too small DAT entry size: %zu bytes.\n", + entry_size); + return -EINVAL; + } + + dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); + if (unlikely(!dat)) + return -ENOMEM; + if (!(dat->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di)); + if (err) + goto failed; + + err = nilfs_palloc_init_blockgroup(dat, entry_size); + if (err) + goto failed; + + di = NILFS_DAT_I(dat); + lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); + nilfs_palloc_setup_cache(dat, &di->palloc_cache); + nilfs_mdt_setup_shadow_map(dat, &di->shadow); + + err = nilfs_read_inode_common(dat, raw_inode); + if (err) + goto failed; + + unlock_new_inode(dat); + out: + *inodep = dat; + return 0; + failed: + iget_failed(dat); + return err; +} diff --git a/kernel/fs/nilfs2/dat.h b/kernel/fs/nilfs2/dat.h new file mode 100644 index 000000000..cbd8e9732 --- /dev/null +++ b/kernel/fs/nilfs2/dat.h @@ -0,0 +1,59 @@ +/* + * dat.h - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_DAT_H +#define _NILFS_DAT_H + +#include +#include +#include + + +struct nilfs_palloc_req; + +int nilfs_dat_translate(struct inode *, __u64, sector_t *); + +int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, + sector_t); +int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); +void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); +void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *, int); +void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); + +int nilfs_dat_mark_dirty(struct inode *, __u64); +int nilfs_dat_freev(struct inode *, __u64 *, size_t); +int nilfs_dat_move(struct inode *, __u64, sector_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); + +int nilfs_dat_read(struct super_block *sb, size_t entry_size, + struct nilfs_inode *raw_inode, struct inode **inodep); + +#endif /* _NILFS_DAT_H */ diff --git a/kernel/fs/nilfs2/dir.c b/kernel/fs/nilfs2/dir.c new file mode 100644 index 000000000..0ee0bed36 --- /dev/null +++ b/kernel/fs/nilfs2/dir.c @@ -0,0 +1,676 @@ +/* + * dir.c - NILFS directory entry operations + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji + */ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include +#include "nilfs.h" +#include "page.h" + +/* + * nilfs uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned nilfs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void nilfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return __block_write_begin(page, pos, to - from, nilfs_get_block); +} + +static void nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + struct inode *dir = mapping->host; + loff_t pos = page_offset(page) + from; + unsigned len = to - from; + unsigned nr_dirty, copied; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos + copied > dir->i_size) + i_size_write(dir, pos + copied); + if (IS_DIRSYNC(dir)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + err = nilfs_set_file_dirty(dir, nr_dirty); + WARN_ON(err); /* do not happen */ + unlock_page(page); +} + +static void nilfs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = nilfs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct nilfs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct nilfs_dir_entry *)(kaddr + offs); + rec_len = nilfs_rec_len_from_disk(p->rec_len); + + if (rec_len < NILFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < NILFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + nilfs_error(sb, "nilfs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; +bad_entry: + nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<inode), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct nilfs_dir_entry *)(kaddr + offs); + nilfs_error(sb, "nilfs_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<inode)); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *nilfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + nilfs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + nilfs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. + * + * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. + */ +static int +nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) +{ + return (struct nilfs_dir_entry *)((char *)p + + nilfs_rec_len_from_disk(p->rec_len)); +} + +static unsigned char +nilfs_filetype_table[NILFS_FT_MAX] = { + [NILFS_FT_UNKNOWN] = DT_UNKNOWN, + [NILFS_FT_REG_FILE] = DT_REG, + [NILFS_FT_DIR] = DT_DIR, + [NILFS_FT_CHRDEV] = DT_CHR, + [NILFS_FT_BLKDEV] = DT_BLK, + [NILFS_FT_FIFO] = DT_FIFO, + [NILFS_FT_SOCK] = DT_SOCK, + [NILFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char +nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK, +}; + +static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) +{ + umode_t mode = inode->i_mode; + + de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static int nilfs_readdir(struct file *file, struct dir_context *ctx) +{ + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); +/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ + + if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) + return 0; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct nilfs_dir_entry *de; + struct page *page = nilfs_get_page(inode, n); + + if (IS_ERR(page)) { + nilfs_error(sb, __func__, "bad page in #%lu", + inode->i_ino); + ctx->pos += PAGE_CACHE_SIZE - offset; + return -EIO; + } + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)(kaddr + offset); + limit = kaddr + nilfs_last_byte(inode, n) - + NILFS_DIR_REC_LEN(1); + for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { + if (de->rec_len == 0) { + nilfs_error(sb, __func__, + "zero-length directory entry"); + nilfs_put_page(page); + return -EIO; + } + if (de->inode) { + unsigned char t; + + if (de->file_type < NILFS_FT_MAX) + t = nilfs_filetype_table[de->file_type]; + else + t = DT_UNKNOWN; + + if (!dir_emit(ctx, de->name, de->name_len, + le64_to_cpu(de->inode), t)) { + nilfs_put_page(page); + return 0; + } + } + ctx->pos += nilfs_rec_len_from_disk(de->rec_len); + } + nilfs_put_page(page); + } + return 0; +} + +/* + * nilfs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct nilfs_dir_entry * +nilfs_find_entry(struct inode *dir, const struct qstr *qstr, + struct page **res_page) +{ + const unsigned char *name = qstr->name; + int namelen = qstr->len; + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct nilfs_inode_info *ei = NILFS_I(dir); + struct nilfs_dir_entry *de; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = nilfs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + nilfs_put_page(page); + goto out; + } + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + nilfs_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block count %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = nilfs_get_page(dir, 0); + struct nilfs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = nilfs_next_entry( + (struct nilfs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) +{ + ino_t res = 0; + struct nilfs_dir_entry *de; + struct page *page; + + de = nilfs_find_entry(dir, qstr, &page); + if (de) { + res = le64_to_cpu(de->inode); + kunmap(page); + page_cache_release(page); + } + return res; +} + +/* Releases the page */ +void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct page *page, struct inode *inode) +{ + unsigned from = (char *) de - (char *) page_address(page); + unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); + struct address_space *mapping = page->mapping; + int err; + + lock_page(page); + err = nilfs_prepare_chunk(page, from, to); + BUG_ON(err); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + nilfs_commit_chunk(page, mapping, from, to); + nilfs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +} + +/* + * Parent is locked. + */ +int nilfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = nilfs_chunk_size(dir); + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct nilfs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + unsigned from, to; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = nilfs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + nilfs_last_byte(dir, n); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = nilfs_rec_len_to_disk(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (nilfs_match(namelen, name, de)) + goto out_unlock; + name_len = NILFS_DIR_REC_LEN(de->name_len); + rec_len = nilfs_rec_len_from_disk(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct nilfs_dir_entry *)((char *)de + rec_len); + } + unlock_page(page); + nilfs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + from = (char *)de - (char *)page_address(page); + to = from + rec_len; + err = nilfs_prepare_chunk(page, from, to); + if (err) + goto out_unlock; + if (de->inode) { + struct nilfs_dir_entry *de1; + + de1 = (struct nilfs_dir_entry *)((char *)de + name_len); + de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len); + de->rec_len = nilfs_rec_len_to_disk(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + nilfs_commit_chunk(page, page->mapping, from, to); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + nilfs_mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + nilfs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * nilfs_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. + */ +int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + unsigned to = ((char *)dir - kaddr) + + nilfs_rec_len_from_disk(dir->rec_len); + struct nilfs_dir_entry *pde = NULL; + struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + int err; + + while ((char *)de < (char *)dir) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = nilfs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + lock_page(page); + err = nilfs_prepare_chunk(page, from, to); + BUG_ON(err); + if (pde) + pde->rec_len = nilfs_rec_len_to_disk(to - from); + dir->inode = 0; + nilfs_commit_chunk(page, mapping, from, to); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; +out: + nilfs_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int nilfs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = nilfs_chunk_size(inode); + struct nilfs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = nilfs_prepare_chunk(page, 0, chunk_size); + if (unlikely(err)) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page); + memset(kaddr, 0, chunk_size); + de = (struct nilfs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1)); + memcpy(de->name, ".\0\0", 4); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + + de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1)); + de->inode = cpu_to_le64(parent->i_ino); + memcpy(de->name, "..\0", 4); + nilfs_set_de_type(de, inode); + kunmap_atomic(kaddr); + nilfs_commit_chunk(page, mapping, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int nilfs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct nilfs_dir_entry *de; + + page = nilfs_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry " + "(kaddr=%p, de=%p)\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le64(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + return 1; + +not_empty: + nilfs_put_page(page); + return 0; +} + +const struct file_operations nilfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = nilfs_readdir, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_compat_ioctl, +#endif /* CONFIG_COMPAT */ + .fsync = nilfs_sync_file, + +}; diff --git a/kernel/fs/nilfs2/direct.c b/kernel/fs/nilfs2/direct.c new file mode 100644 index 000000000..ebf89fd8a --- /dev/null +++ b/kernel/fs/nilfs2/direct.c @@ -0,0 +1,386 @@ +/* + * direct.c - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include "nilfs.h" +#include "page.h" +#include "direct.h" +#include "alloc.h" +#include "dat.h" + +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct) +{ + return (__le64 *) + ((struct nilfs_direct_node *)direct->b_u.u_data + 1); +} + +static inline __u64 +nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key) +{ + return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key)); +} + +static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct, + __u64 key, __u64 ptr) +{ + *(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr); +} + +static int nilfs_direct_lookup(const struct nilfs_bmap *direct, + __u64 key, int level, __u64 *ptrp) +{ + __u64 ptr; + + if (key > NILFS_DIRECT_KEY_MAX || level != 1) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + *ptrp = ptr; + return 0; +} + +static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, + __u64 key, __u64 *ptrp, + unsigned maxblocks) +{ + struct inode *dat = NULL; + __u64 ptr, ptr2; + sector_t blocknr; + int ret, cnt; + + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + if (NILFS_BMAP_USE_VBN(direct)) { + dat = nilfs_bmap_get_dat(direct); + ret = nilfs_dat_translate(dat, ptr, &blocknr); + if (ret < 0) + return ret; + ptr = blocknr; + } + + maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1); + for (cnt = 1; cnt < maxblocks && + (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) != + NILFS_BMAP_INVALID_PTR; + cnt++) { + if (dat) { + ret = nilfs_dat_translate(dat, ptr2, &blocknr); + if (ret < 0) + return ret; + ptr2 = blocknr; + } + if (ptr2 != ptr + cnt) + break; + } + *ptrp = ptr; + return cnt; +} + +static __u64 +nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else + /* block group */ + return nilfs_bmap_find_target_in_group(direct); +} + +static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + union nilfs_bmap_ptr_req req; + struct inode *dat = NULL; + struct buffer_head *bh; + int ret; + + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR) + return -EEXIST; + + if (NILFS_BMAP_USE_VBN(bmap)) { + req.bpr_ptr = nilfs_direct_find_target_v(bmap, key); + dat = nilfs_bmap_get_dat(bmap); + } + ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); + if (!ret) { + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); + + nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(bmap, key, req.bpr_ptr); + + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); + + nilfs_inode_add_blocks(bmap->b_inode, 1); + } + return ret; +} + +static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) +{ + union nilfs_bmap_ptr_req req; + struct inode *dat; + int ret; + + if (key > NILFS_DIRECT_KEY_MAX || + nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; + req.bpr_ptr = nilfs_direct_get_ptr(bmap, key); + + ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); + if (!ret) { + nilfs_bmap_commit_end_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); + nilfs_inode_sub_blocks(bmap->b_inode, 1); + } + return ret; +} + +static int nilfs_direct_seek_key(const struct nilfs_bmap *direct, __u64 start, + __u64 *keyp) +{ + __u64 key; + + for (key = start; key <= NILFS_DIRECT_KEY_MAX; key++) { + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) { + *keyp = key; + return 0; + } + } + return -ENOENT; +} + +static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp) +{ + __u64 key, lastkey; + + lastkey = NILFS_DIRECT_KEY_MAX + 1; + for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) + lastkey = key; + + if (lastkey == NILFS_DIRECT_KEY_MAX + 1) + return -ENOENT; + + *keyp = lastkey; + + return 0; +} + +static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) +{ + return key > NILFS_DIRECT_KEY_MAX; +} + +static int nilfs_direct_gather_data(struct nilfs_bmap *direct, + __u64 *keys, __u64 *ptrs, int nitems) +{ + __u64 key; + __u64 ptr; + int n; + + if (nitems > NILFS_DIRECT_NBLOCKS) + nitems = NILFS_DIRECT_NBLOCKS; + n = 0; + for (key = 0; key < nitems; key++) { + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) { + keys[n] = key; + ptrs[n] = ptr; + n++; + } + } + return n; +} + +int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, + __u64 key, __u64 *keys, __u64 *ptrs, int n) +{ + __le64 *dptrs; + int ret, i, j; + + /* no need to allocate any resource for conversion */ + + /* delete */ + ret = bmap->b_ops->bop_delete(bmap, key); + if (ret < 0) + return ret; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* convert */ + dptrs = nilfs_direct_dptrs(bmap); + for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { + if ((j < n) && (i == keys[j])) { + dptrs[i] = (i != key) ? + cpu_to_le64(ptrs[j]) : + NILFS_BMAP_INVALID_PTR; + j++; + } else + dptrs[i] = NILFS_BMAP_INVALID_PTR; + } + + nilfs_direct_init(bmap); + return 0; +} + +static int nilfs_direct_propagate(struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_palloc_req oldreq, newreq; + struct inode *dat; + __u64 key; + __u64 ptr; + int ret; + + if (!NILFS_BMAP_USE_VBN(bmap)) + return 0; + + dat = nilfs_bmap_get_dat(bmap); + key = nilfs_bmap_data_get_key(bmap, bh); + ptr = nilfs_direct_get_ptr(bmap, key); + if (!buffer_nilfs_volatile(bh)) { + oldreq.pr_entry_nr = ptr; + newreq.pr_entry_nr = ptr; + ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); + if (ret < 0) + return ret; + nilfs_dat_commit_update(dat, &oldreq, &newreq, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); + set_buffer_nilfs_volatile(bh); + nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr); + } else + ret = nilfs_dat_mark_dirty(dat, ptr); + + return ret; +} + +static int nilfs_direct_assign_v(struct nilfs_bmap *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct inode *dat = nilfs_bmap_get_dat(direct); + union nilfs_bmap_ptr_req req; + int ret; + + req.bpr_ptr = ptr; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (!ret) { + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + } + return ret; +} + +static int nilfs_direct_assign_p(struct nilfs_bmap *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + nilfs_direct_set_ptr(direct, key, blocknr); + + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); + binfo->bi_dat.bi_level = 0; + + return 0; +} + +static int nilfs_direct_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + __u64 key; + __u64 ptr; + + key = nilfs_bmap_data_get_key(bmap, *bh); + if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { + printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, + (unsigned long long)key); + return -EINVAL; + } + ptr = nilfs_direct_get_ptr(bmap, key); + if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { + printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, + (unsigned long long)ptr); + return -EINVAL; + } + + return NILFS_BMAP_USE_VBN(bmap) ? + nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) : + nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo); +} + +static const struct nilfs_bmap_operations nilfs_direct_ops = { + .bop_lookup = nilfs_direct_lookup, + .bop_lookup_contig = nilfs_direct_lookup_contig, + .bop_insert = nilfs_direct_insert, + .bop_delete = nilfs_direct_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_direct_propagate, + + .bop_lookup_dirty_buffers = NULL, + + .bop_assign = nilfs_direct_assign, + .bop_mark = NULL, + + .bop_seek_key = nilfs_direct_seek_key, + .bop_last_key = nilfs_direct_last_key, + + .bop_check_insert = nilfs_direct_check_insert, + .bop_check_delete = NULL, + .bop_gather_data = nilfs_direct_gather_data, +}; + + +int nilfs_direct_init(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_direct_ops; + return 0; +} diff --git a/kernel/fs/nilfs2/direct.h b/kernel/fs/nilfs2/direct.h new file mode 100644 index 000000000..dc643de20 --- /dev/null +++ b/kernel/fs/nilfs2/direct.h @@ -0,0 +1,51 @@ +/* + * direct.h - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_DIRECT_H +#define _NILFS_DIRECT_H + +#include +#include +#include "bmap.h" + + +/** + * struct nilfs_direct_node - direct node + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) +#define NILFS_DIRECT_KEY_MIN 0 +#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) + + +int nilfs_direct_init(struct nilfs_bmap *); +int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *, + __u64 *, int); + + +#endif /* _NILFS_DIRECT_H */ diff --git a/kernel/fs/nilfs2/export.h b/kernel/fs/nilfs2/export.h new file mode 100644 index 000000000..19ccbf952 --- /dev/null +++ b/kernel/fs/nilfs2/export.h @@ -0,0 +1,25 @@ +#ifndef NILFS_EXPORT_H +#define NILFS_EXPORT_H + +#include + +extern const struct export_operations nilfs_export_ops; + +/** + * struct nilfs_fid - NILFS file id type + * @cno: checkpoint number + * @ino: inode number + * @gen: file generation (version) for NFS + * @parent_gen: parent generation (version) for NFS + * @parent_ino: parent inode number + */ +struct nilfs_fid { + u64 cno; + u64 ino; + u32 gen; + + u32 parent_gen; + u64 parent_ino; +} __attribute__ ((packed)); + +#endif diff --git a/kernel/fs/nilfs2/file.c b/kernel/fs/nilfs2/file.c new file mode 100644 index 000000000..54575e3cc --- /dev/null +++ b/kernel/fs/nilfs2/file.c @@ -0,0 +1,165 @@ +/* + * file.c - NILFS regular file handling primitives including fsync(). + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji , + * Ryusuke Konishi + */ + +#include +#include +#include +#include "nilfs.h" +#include "segment.h" + +int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + /* + * Called from fsync() system call + * This is the only entry point that can catch write and synch + * timing for both data blocks and intermediate blocks. + * + * This function should be implemented when the writeback function + * will be implemented. + */ + struct the_nilfs *nilfs; + struct inode *inode = file->f_mapping->host; + int err = 0; + + if (nilfs_inode_dirty(inode)) { + if (datasync) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, + start, end); + else + err = nilfs_construct_segment(inode->i_sb); + } + + nilfs = inode->i_sb->s_fs_info; + if (!err) + err = nilfs_flush_device(nilfs); + + return err; +} + +static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct nilfs_transaction_info ti; + int ret = 0; + + if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) + return VM_FAULT_SIGBUS; /* -ENOSPC */ + + sb_start_pagefault(inode->i_sb); + lock_page(page); + if (page->mapping != inode->i_mapping || + page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { + unlock_page(page); + ret = -EFAULT; /* make the VM retry the fault */ + goto out; + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) + goto mapped; + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int fully_mapped = 1; + + bh = head = page_buffers(page); + do { + if (!buffer_mapped(bh)) { + fully_mapped = 0; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (fully_mapped) { + SetPageMappedToDisk(page); + goto mapped; + } + } + unlock_page(page); + + /* + * fill hole blocks + */ + ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); + /* never returns -ENOMEM, but may return -ENOSPC */ + if (unlikely(ret)) + goto out; + + file_update_time(vma->vm_file); + ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); + if (ret) { + nilfs_transaction_abort(inode->i_sb); + goto out; + } + nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); + nilfs_transaction_commit(inode->i_sb); + + mapped: + wait_for_stable_page(page); + out: + sb_end_pagefault(inode->i_sb); + return block_page_mkwrite_return(ret); +} + +static const struct vm_operations_struct nilfs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = nilfs_page_mkwrite, +}; + +static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &nilfs_file_vm_ops; + return 0; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the nilfs filesystem. + */ +const struct file_operations nilfs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_compat_ioctl, +#endif /* CONFIG_COMPAT */ + .mmap = nilfs_file_mmap, + .open = generic_file_open, + /* .release = nilfs_release_file, */ + .fsync = nilfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations nilfs_file_inode_operations = { + .setattr = nilfs_setattr, + .permission = nilfs_permission, + .fiemap = nilfs_fiemap, +}; + +/* end of file */ diff --git a/kernel/fs/nilfs2/gcinode.c b/kernel/fs/nilfs2/gcinode.c new file mode 100644 index 000000000..748ca2389 --- /dev/null +++ b/kernel/fs/nilfs2/gcinode.c @@ -0,0 +1,197 @@ +/* + * gcinode.c - dummy inodes to buffer blocks for garbage collection + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara , Amagai Yoshiji , + * and Ryusuke Konishi . + * Revised by Ryusuke Konishi . + * + */ +/* + * This file adds the cache of on-disk blocks to be moved in garbage + * collection. The disk blocks are held with dummy inodes (called + * gcinodes), and this file provides lookup function of the dummy + * inodes and their buffer read function. + * + * Buffers and pages held by the dummy inodes will be released each + * time after they are copied to a new log. Dirty blocks made on the + * current generation and the blocks to be moved by GC never overlap + * because the dirty blocks make a new generation; they rather must be + * written individually. + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btree.h" +#include "btnode.h" +#include "page.h" +#include "mdt.h" +#include "dat.h" +#include "ifile.h" + +/* + * nilfs_gccache_submit_read_data() - add data buffer and submit read request + * @inode - gc inode + * @blkoff - dummy offset treated as the key for the page cache + * @pbn - physical block number of the block + * @vbn - virtual block number of the block, 0 for non-virtual block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_data() registers the data buffer + * specified by @pbn to the GC pagecache with the key @blkoff. + * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The block specified with @pbn does not exist. + */ +int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, + sector_t pbn, __u64 vbn, + struct buffer_head **out_bh) +{ + struct buffer_head *bh; + int err; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + return -ENOMEM; + + if (buffer_uptodate(bh)) + goto out; + + if (pbn == 0) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); + if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ + brelse(bh); + goto failed; + } + } + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + if (!buffer_mapped(bh)) { + bh->b_bdev = inode->i_sb->s_bdev; + set_buffer_mapped(bh); + } + bh->b_blocknr = pbn; + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + if (vbn) + bh->b_blocknr = vbn; + out: + err = 0; + *out_bh = bh; + + failed: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +/* + * nilfs_gccache_submit_read_node() - add node buffer and submit read request + * @inode - gc inode + * @pbn - physical block number for the block + * @vbn - virtual block number for the block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_node() registers the node buffer + * specified by @vbn to the GC pagecache. @pbn can be supplied by the + * caller to avoid translation of the disk block address. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, + __u64 vbn, struct buffer_head **out_bh) +{ + int ret; + + ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? : pbn, pbn, READ, out_bh, &pbn); + if (ret == -EEXIST) /* internal code (cache hit) */ + ret = 0; + return ret; +} + +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) +{ + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + if (buffer_dirty(bh)) + return -EEXIST; + + if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + return -EIO; + } + mark_buffer_dirty(bh); + return 0; +} + +int nilfs_init_gcinode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + inode->i_mode = S_IFREG; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_mapping->a_ops = &empty_aops; + + ii->i_flags = 0; + nilfs_bmap_init_gc(ii->i_bmap); + + return 0; +} + +/** + * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes + */ +void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) +{ + struct list_head *head = &nilfs->ns_gc_inodes; + struct nilfs_inode_info *ii; + + while (!list_empty(head)) { + ii = list_first_entry(head, struct nilfs_inode_info, i_dirty); + list_del_init(&ii->i_dirty); + truncate_inode_pages(&ii->vfs_inode.i_data, 0); + nilfs_btnode_cache_clear(&ii->i_btnode_cache); + iput(&ii->vfs_inode); + } +} diff --git a/kernel/fs/nilfs2/ifile.c b/kernel/fs/nilfs2/ifile.c new file mode 100644 index 000000000..6548c7851 --- /dev/null +++ b/kernel/fs/nilfs2/ifile.c @@ -0,0 +1,227 @@ +/* + * ifile.c - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji . + * Revised by Ryusuke Konishi . + * + */ + +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "ifile.h" + +/** + * struct nilfs_ifile_info - on-memory private data of ifile + * @mi: on-memory private data of metadata file + * @palloc_cache: persistent object allocator cache of ifile + */ +struct nilfs_ifile_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; +}; + +static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) +{ + return (struct nilfs_ifile_info *)NILFS_MDT(ifile); +} + +/** + * nilfs_ifile_create_inode - create a new disk inode + * @ifile: ifile inode + * @out_ino: pointer to a variable to store inode number + * @out_bh: buffer_head contains newly allocated disk inode + * + * Return Value: On success, 0 is returned and the newly allocated inode + * number is stored in the place pointed by @ino, and buffer_head pointer + * that contains newly allocated disk inode structure is stored in the + * place pointed by @out_bh + * On error, one of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No inode left. + */ +int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, + struct buffer_head **out_bh) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = 0; /* 0 says find free inode from beginning of + a group. dull code!! */ + req.pr_entry_bh = NULL; + + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + nilfs_palloc_commit_alloc_entry(ifile, &req); + mark_buffer_dirty(req.pr_entry_bh); + nilfs_mdt_mark_dirty(ifile); + *out_ino = (ino_t)req.pr_entry_nr; + *out_bh = req.pr_entry_bh; + return 0; +} + +/** + * nilfs_ifile_delete_inode - delete a disk inode + * @ifile: ifile inode + * @ino: inode number + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The inode number @ino have not been allocated. + */ +int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) +{ + struct nilfs_palloc_req req = { + .pr_entry_nr = ino, .pr_entry_bh = NULL + }; + struct nilfs_inode *raw_inode; + void *kaddr; + int ret; + + ret = nilfs_palloc_prepare_free_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_free_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + + kaddr = kmap_atomic(req.pr_entry_bh->b_page); + raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, + req.pr_entry_bh, kaddr); + raw_inode->i_flags = 0; + kunmap_atomic(kaddr); + + mark_buffer_dirty(req.pr_entry_bh); + brelse(req.pr_entry_bh); + + nilfs_palloc_commit_free_entry(ifile, &req); + + return 0; +} + +int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, + struct buffer_head **out_bh) +{ + struct super_block *sb = ifile->i_sb; + int err; + + if (unlikely(!NILFS_VALID_INODE(sb, ino))) { + nilfs_error(sb, __func__, "bad inode number: %lu", + (unsigned long) ino); + return -EINVAL; + } + + err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); + if (unlikely(err)) + nilfs_warning(sb, __func__, "unable to read inode: %lu", + (unsigned long) ino); + return err; +} + +/** + * nilfs_ifile_count_free_inodes - calculate free inodes count + * @ifile: ifile inode + * @nmaxinodes: current maximum of available inodes count [out] + * @nfreeinodes: free inodes count [out] + */ +int nilfs_ifile_count_free_inodes(struct inode *ifile, + u64 *nmaxinodes, u64 *nfreeinodes) +{ + u64 nused; + int err; + + *nmaxinodes = 0; + *nfreeinodes = 0; + + nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count); + err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes); + if (likely(!err)) + *nfreeinodes = *nmaxinodes - nused; + return err; +} + +/** + * nilfs_ifile_read - read or get ifile inode + * @sb: super block instance + * @root: root object + * @inode_size: size of an inode + * @raw_inode: on-disk ifile inode + * @inodep: buffer to store the inode + */ +int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, + size_t inode_size, struct nilfs_inode *raw_inode, + struct inode **inodep) +{ + struct inode *ifile; + int err; + + ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO); + if (unlikely(!ifile)) + return -ENOMEM; + if (!(ifile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(ifile, NILFS_MDT_GFP, + sizeof(struct nilfs_ifile_info)); + if (err) + goto failed; + + err = nilfs_palloc_init_blockgroup(ifile, inode_size); + if (err) + goto failed; + + nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache); + + err = nilfs_read_inode_common(ifile, raw_inode); + if (err) + goto failed; + + unlock_new_inode(ifile); + out: + *inodep = ifile; + return 0; + failed: + iget_failed(ifile); + return err; +} diff --git a/kernel/fs/nilfs2/ifile.h b/kernel/fs/nilfs2/ifile.h new file mode 100644 index 000000000..679674d13 --- /dev/null +++ b/kernel/fs/nilfs2/ifile.h @@ -0,0 +1,58 @@ +/* + * ifile.h - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji + * Revised by Ryusuke Konishi + * + */ + +#ifndef _NILFS_IFILE_H +#define _NILFS_IFILE_H + +#include +#include +#include +#include "mdt.h" +#include "alloc.h" + + +static inline struct nilfs_inode * +nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) +{ + void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); +} + +static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, + struct buffer_head *ibh) +{ + kunmap(ibh->b_page); +} + +int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); +int nilfs_ifile_delete_inode(struct inode *, ino_t); +int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); + +int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *); + +int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, + size_t inode_size, struct nilfs_inode *raw_inode, + struct inode **inodep); + +#endif /* _NILFS_IFILE_H */ diff --git a/kernel/fs/nilfs2/inode.c b/kernel/fs/nilfs2/inode.c new file mode 100644 index 000000000..258d9fe25 --- /dev/null +++ b/kernel/fs/nilfs2/inode.c @@ -0,0 +1,1135 @@ +/* + * inode.c - NILFS inode operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" +#include "cpfile.h" +#include "ifile.h" + +/** + * struct nilfs_iget_args - arguments used during comparison between inodes + * @ino: inode number + * @cno: checkpoint number + * @root: pointer on NILFS root object (mounted checkpoint) + * @for_gc: inode for GC flag + */ +struct nilfs_iget_args { + u64 ino; + __u64 cno; + struct nilfs_root *root; + int for_gc; +}; + +static int nilfs_iget_test(struct inode *inode, void *opaque); + +void nilfs_inode_add_blocks(struct inode *inode, int n) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + + inode_add_bytes(inode, (1 << inode->i_blkbits) * n); + if (root) + atomic64_add(n, &root->blocks_count); +} + +void nilfs_inode_sub_blocks(struct inode *inode, int n) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + + inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); + if (root) + atomic64_sub(n, &root->blocks_count); +} + +/** + * nilfs_get_block() - get a file block on the filesystem (callback function) + * @inode - inode struct of the target file + * @blkoff - file block number + * @bh_result - buffer head to be mapped on + * @create - indicate whether allocating the block or not when it has not + * been allocated yet. + * + * This function does not issue actual read request of the specified data + * block. It is done by VFS. + */ +int nilfs_get_block(struct inode *inode, sector_t blkoff, + struct buffer_head *bh_result, int create) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 blknum = 0; + int err = 0, ret; + unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + if (ret >= 0) { /* found */ + map_bh(bh_result, inode->i_sb, blknum); + if (ret > 0) + bh_result->b_size = (ret << inode->i_blkbits); + goto out; + } + /* data block was not found */ + if (ret == -ENOENT && create) { + struct nilfs_transaction_info ti; + + bh_result->b_blocknr = 0; + err = nilfs_transaction_begin(inode->i_sb, &ti, 1); + if (unlikely(err)) + goto out; + err = nilfs_bmap_insert(ii->i_bmap, blkoff, + (unsigned long)bh_result); + if (unlikely(err != 0)) { + if (err == -EEXIST) { + /* + * The get_block() function could be called + * from multiple callers for an inode. + * However, the page having this block must + * be locked in this case. + */ + printk(KERN_WARNING + "nilfs_get_block: a race condition " + "while inserting a data block. " + "(inode number=%lu, file block " + "offset=%llu)\n", + inode->i_ino, + (unsigned long long)blkoff); + err = 0; + } + nilfs_transaction_abort(inode->i_sb); + goto out; + } + nilfs_mark_inode_dirty_sync(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ + /* Error handling should be detailed */ + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed + to proper value */ + } else if (ret == -ENOENT) { + /* not found is not error (e.g. hole); must return without + the mapped state flag. */ + ; + } else { + err = ret; + } + + out: + return err; +} + +/** + * nilfs_readpage() - implement readpage() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @page - the page to be read + */ +static int nilfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, nilfs_get_block); +} + +/** + * nilfs_readpages() - implement readpages() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @mapping - address_space struct used for reading multiple pages + * @pages - the pages to be read + * @nr_pages - number of pages to be read + */ +static int nilfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); +} + +static int nilfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int err = 0; + + if (inode->i_sb->s_flags & MS_RDONLY) { + nilfs_clear_dirty_pages(mapping, false); + return -EROFS; + } + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, + wbc->range_start, + wbc->range_end); + return err; +} + +static int nilfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int err; + + if (inode->i_sb->s_flags & MS_RDONLY) { + /* + * It means that filesystem was remounted in read-only + * mode because of error or metadata corruption. But we + * have dirty pages that try to be flushed in background. + * So, here we simply discard this dirty page. + */ + nilfs_clear_dirty_page(page, false); + unlock_page(page); + return -EROFS; + } + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (wbc->sync_mode == WB_SYNC_ALL) { + err = nilfs_construct_segment(inode->i_sb); + if (unlikely(err)) + return err; + } else if (wbc->for_reclaim) + nilfs_flush_segment(inode->i_sb, inode->i_ino); + + return 0; +} + +static int nilfs_set_page_dirty(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = __set_page_dirty_nobuffers(page); + + if (page_has_buffers(page)) { + unsigned nr_dirty = 0; + struct buffer_head *bh, *head; + + /* + * This page is locked by callers, and no other thread + * concurrently marks its buffers dirty since they are + * only dirtied through routines in fs/buffer.c in + * which call sites of mark_buffer_dirty are protected + * by page lock. + */ + bh = head = page_buffers(page); + do { + /* Do not mark hole blocks dirty */ + if (buffer_dirty(bh) || !buffer_mapped(bh)) + continue; + + set_buffer_dirty(bh); + nr_dirty++; + } while (bh = bh->b_this_page, bh != head); + + if (nr_dirty) + nilfs_set_file_dirty(inode, nr_dirty); + } else if (ret) { + unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + nilfs_set_file_dirty(inode, nr_dirty); + } + return ret; +} + +void nilfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + nilfs_truncate(inode); + } +} + +static int nilfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + +{ + struct inode *inode = mapping->host; + int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); + + if (unlikely(err)) + return err; + + err = block_write_begin(mapping, pos, len, flags, pagep, + nilfs_get_block); + if (unlikely(err)) { + nilfs_write_failed(mapping, pos + len); + nilfs_transaction_abort(inode->i_sb); + } + return err; +} + +static int nilfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned nr_dirty; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, start, + start + copied); + copied = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + nilfs_set_file_dirty(inode, nr_dirty); + err = nilfs_transaction_commit(inode->i_sb); + return err ? : copied; +} + +static ssize_t +nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); + ssize_t size; + + if (iov_iter_rw(iter) == WRITE) + return 0; + + /* Needs synchronization with the cleaner */ + size = blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely(iov_iter_rw(iter) == WRITE && size < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if (end > isize) + nilfs_write_failed(mapping, end); + } + + return size; +} + +const struct address_space_operations nilfs_aops = { + .writepage = nilfs_writepage, + .readpage = nilfs_readpage, + .writepages = nilfs_writepages, + .set_page_dirty = nilfs_set_page_dirty, + .readpages = nilfs_readpages, + .write_begin = nilfs_write_begin, + .write_end = nilfs_write_end, + /* .releasepage = nilfs_releasepage, */ + .invalidatepage = block_invalidatepage, + .direct_IO = nilfs_direct_IO, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static int nilfs_insert_inode_locked(struct inode *inode, + struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); +} + +struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct the_nilfs *nilfs = sb->s_fs_info; + struct inode *inode; + struct nilfs_inode_info *ii; + struct nilfs_root *root; + int err = -ENOMEM; + ino_t ino; + + inode = new_inode(sb); + if (unlikely(!inode)) + goto failed; + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + + root = NILFS_I(dir)->i_root; + ii = NILFS_I(inode); + ii->i_state = 1 << NILFS_I_NEW; + ii->i_root = root; + + err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + + atomic64_inc(&root->inodes_count); + inode_init_owner(inode, dir, mode); + inode->i_ino = ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + err = nilfs_bmap_read(ii->i_bmap, NULL); + if (err < 0) + goto failed_after_creation; + + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + + ii->i_flags = nilfs_mask_flags( + mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); + + /* ii->i_file_acl = 0; */ + /* ii->i_dir_acl = 0; */ + ii->i_dir_start_lookup = 0; + nilfs_set_inode_flags(inode); + spin_lock(&nilfs->ns_next_gen_lock); + inode->i_generation = nilfs->ns_next_generation++; + spin_unlock(&nilfs->ns_next_gen_lock); + if (nilfs_insert_inode_locked(inode, root, ino) < 0) { + err = -EIO; + goto failed_after_creation; + } + + err = nilfs_init_acl(inode, dir); + if (unlikely(err)) + goto failed_after_creation; /* never occur. When supporting + nilfs_init_acl(), proper cancellation of + above jobs should be considered */ + + return inode; + + failed_after_creation: + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); /* raw_inode will be deleted through + nilfs_evict_inode() */ + goto failed; + + failed_ifile_create_inode: + make_bad_inode(inode); + iput(inode); /* if i_nlink == 1, generic_forget_inode() will be + called */ + failed: + return ERR_PTR(err); +} + +void nilfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = NILFS_I(inode)->i_flags; + unsigned int new_fl = 0; + + if (flags & FS_SYNC_FL) + new_fl |= S_SYNC; + if (flags & FS_APPEND_FL) + new_fl |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & FS_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE | + S_NOATIME | S_DIRSYNC); +} + +int nilfs_read_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + i_uid_write(inode, le32_to_cpu(raw_inode->i_uid)); + i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + inode->i_size = le64_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (inode->i_nlink == 0) + return -ESTALE; /* this inode is deleted */ + + inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); + ii->i_flags = le32_to_cpu(raw_inode->i_flags); +#if 0 + ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ii->i_dir_acl = S_ISREG(inode->i_mode) ? + 0 : le32_to_cpu(raw_inode->i_dir_acl); +#endif + ii->i_dir_start_lookup = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + err = nilfs_bmap_read(ii->i_bmap, raw_inode); + if (err < 0) + return err; + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + return 0; +} + +static int __nilfs_read_inode(struct super_block *sb, + struct nilfs_root *root, unsigned long ino, + struct inode *inode) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct buffer_head *bh; + struct nilfs_inode *raw_inode; + int err; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); + if (unlikely(err)) + goto bad_inode; + + raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); + + err = nilfs_read_inode_common(inode, raw_inode); + if (err) + goto failed_unmap; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else { + inode->i_op = &nilfs_special_inode_operations; + init_special_inode( + inode, inode->i_mode, + huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); + } + nilfs_ifile_unmap_inode(root->ifile, ino, bh); + brelse(bh); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + nilfs_set_inode_flags(inode); + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + return 0; + + failed_unmap: + nilfs_ifile_unmap_inode(root->ifile, ino, bh); + brelse(bh); + + bad_inode: + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + return err; +} + +static int nilfs_iget_test(struct inode *inode, void *opaque) +{ + struct nilfs_iget_args *args = opaque; + struct nilfs_inode_info *ii; + + if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) + return 0; + + ii = NILFS_I(inode); + if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) + return !args->for_gc; + + return args->for_gc && args->cno == ii->i_cno; +} + +static int nilfs_iget_set(struct inode *inode, void *opaque) +{ + struct nilfs_iget_args *args = opaque; + + inode->i_ino = args->ino; + if (args->for_gc) { + NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; + NILFS_I(inode)->i_cno = args->cno; + NILFS_I(inode)->i_root = NULL; + } else { + if (args->root && args->ino == NILFS_ROOT_INO) + nilfs_get_root(args->root); + NILFS_I(inode)->i_root = args->root; + } + return 0; +} + +struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return ilookup5(sb, ino, nilfs_iget_test, &args); +} + +struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); +} + +struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct inode *inode; + int err; + + inode = nilfs_iget_locked(sb, root, ino); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = __nilfs_read_inode(sb, root, ino, inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, + __u64 cno) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 + }; + struct inode *inode; + int err; + + inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = nilfs_init_gcinode(inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode, int has_bmap) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + raw_inode->i_uid = cpu_to_le32(i_uid_read(inode)); + raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le64(inode->i_size); + raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); + + raw_inode->i_flags = cpu_to_le32(ii->i_flags); + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + + if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + /* zero-fill unused portion in the case of super root block */ + raw_inode->i_xattr = 0; + raw_inode->i_pad = 0; + memset((void *)raw_inode + sizeof(*raw_inode), 0, + nilfs->ns_inode_size - sizeof(*raw_inode)); + } + + if (has_bmap) + nilfs_bmap_write(ii->i_bmap, raw_inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(huge_encode_dev(inode->i_rdev)); + /* When extending inode, nilfs->ns_inode_size should be checked + for substitutions of appended fields */ +} + +void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) +{ + ino_t ino = inode->i_ino; + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *ifile = ii->i_root->ifile; + struct nilfs_inode *raw_inode; + + raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); + + if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) + memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); + if (flags & I_DIRTY_DATASYNC) + set_bit(NILFS_I_INODE_SYNC, &ii->i_state); + + nilfs_write_inode_common(inode, raw_inode, 0); + /* XXX: call with has_bmap = 0 is a workaround to avoid + deadlock of bmap. This delays update of i_bmap to just + before writing */ + nilfs_ifile_unmap_inode(ifile, ino, ibh); +} + +#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ + +static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, + unsigned long from) +{ + __u64 b; + int ret; + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; +repeat: + ret = nilfs_bmap_last_key(ii->i_bmap, &b); + if (ret == -ENOENT) + return; + else if (ret < 0) + goto failed; + + if (b < from) + return; + + b -= min_t(__u64, NILFS_MAX_TRUNCATE_BLOCKS, b - from); + ret = nilfs_bmap_truncate(ii->i_bmap, b); + nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); + if (!ret || (ret == -ENOMEM && + nilfs_bmap_truncate(ii->i_bmap, b) == 0)) + goto repeat; + +failed: + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); +} + +void nilfs_truncate(struct inode *inode) +{ + unsigned long blkoff; + unsigned int blocksize; + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + blocksize = sb->s_blocksize; + blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); + + nilfs_truncate_bmap(ii, blkoff); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_mark_inode_dirty(inode); + nilfs_set_file_dirty(inode, 0); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But truncate has no return value. */ +} + +static void nilfs_clear_inode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + /* + * Free resources allocated in nilfs_read_inode(), here. + */ + BUG_ON(!list_empty(&ii->i_dirty)); + brelse(ii->i_bh); + ii->i_bh = NULL; + + if (mdi && mdi->mi_palloc_cache) + nilfs_palloc_destroy_cache(inode); + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); + + nilfs_btnode_cache_clear(&ii->i_btnode_cache); + + if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) + nilfs_put_root(ii->i_root); +} + +void nilfs_evict_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + int ret; + + if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + nilfs_clear_inode(inode); + return; + } + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + truncate_inode_pages_final(&inode->i_data); + + /* TODO: some of the following operations may fail. */ + nilfs_truncate_bmap(ii, 0); + nilfs_mark_inode_dirty(inode); + clear_inode(inode); + + ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); + if (!ret) + atomic64_dec(&ii->i_root->inodes_count); + + nilfs_clear_inode(inode); + + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But delete_inode has no return value. */ +} + +int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct nilfs_transaction_info ti; + struct inode *inode = d_inode(dentry); + struct super_block *sb = inode->i_sb; + int err; + + err = inode_change_ok(inode, iattr); + if (err) + return err; + + err = nilfs_transaction_begin(sb, &ti, 0); + if (unlikely(err)) + return err; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + truncate_setsize(inode, iattr->ia_size); + nilfs_truncate(inode); + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + if (iattr->ia_valid & ATTR_MODE) { + err = nilfs_acl_chmod(inode); + if (unlikely(err)) + goto out_err; + } + + return nilfs_transaction_commit(sb); + +out_err: + nilfs_transaction_abort(sb); + return err; +} + +int nilfs_permission(struct inode *inode, int mask) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + if ((mask & MAY_WRITE) && root && + root->cno != NILFS_CPTREE_CURRENT_CNO) + return -EROFS; /* snapshot is not writable */ + + return generic_permission(inode, mask); +} + +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + spin_lock(&nilfs->ns_inode_lock); + if (ii->i_bh == NULL) { + spin_unlock(&nilfs->ns_inode_lock); + err = nilfs_ifile_get_inode_block(ii->i_root->ifile, + inode->i_ino, pbh); + if (unlikely(err)) + return err; + spin_lock(&nilfs->ns_inode_lock); + if (ii->i_bh == NULL) + ii->i_bh = *pbh; + else { + brelse(*pbh); + *pbh = ii->i_bh; + } + } else + *pbh = ii->i_bh; + + get_bh(*pbh); + spin_unlock(&nilfs->ns_inode_lock); + return 0; +} + +int nilfs_inode_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + int ret = 0; + + if (!list_empty(&ii->i_dirty)) { + spin_lock(&nilfs->ns_inode_lock); + ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || + test_bit(NILFS_I_BUSY, &ii->i_state); + spin_unlock(&nilfs->ns_inode_lock); + } + return ret; +} + +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); + + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) + return 0; + + spin_lock(&nilfs->ns_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + /* Because this routine may race with nilfs_dispose_list(), + we have to check NILFS_I_QUEUED here, too. */ + if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { + /* This will happen when somebody is freeing + this inode. */ + nilfs_warning(inode->i_sb, __func__, + "cannot get inode (ino=%lu)\n", + inode->i_ino); + spin_unlock(&nilfs->ns_inode_lock); + return -EINVAL; /* NILFS_I_DIRTY may remain for + freeing inode */ + } + list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); + set_bit(NILFS_I_QUEUED, &ii->i_state); + } + spin_unlock(&nilfs->ns_inode_lock); + return 0; +} + +int __nilfs_mark_inode_dirty(struct inode *inode, int flags) +{ + struct buffer_head *ibh; + int err; + + err = nilfs_load_inode_block(inode, &ibh); + if (unlikely(err)) { + nilfs_warning(inode->i_sb, __func__, + "failed to reget inode block.\n"); + return err; + } + nilfs_update_inode(inode, ibh, flags); + mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); + brelse(ibh); + return 0; +} + +/** + * nilfs_dirty_inode - reflect changes on given inode to an inode block. + * @inode: inode of the file to be registered. + * + * nilfs_dirty_inode() loads a inode block containing the specified + * @inode and copies data from a nilfs_inode to a corresponding inode + * entry in the inode block. This operation is excluded from the segment + * construction. This function can be called both as a single operation + * and as a part of indivisible file operations. + */ +void nilfs_dirty_inode(struct inode *inode, int flags) +{ + struct nilfs_transaction_info ti; + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + if (is_bad_inode(inode)) { + nilfs_warning(inode->i_sb, __func__, + "tried to mark bad_inode dirty. ignored.\n"); + dump_stack(); + return; + } + if (mdi) { + nilfs_mdt_mark_dirty(inode); + return; + } + nilfs_transaction_begin(inode->i_sb, &ti, 0); + __nilfs_mark_inode_dirty(inode, flags); + nilfs_transaction_commit(inode->i_sb); /* never fails */ +} + +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 logical = 0, phys = 0, size = 0; + __u32 flags = 0; + loff_t isize; + sector_t blkoff, end_blkoff; + sector_t delalloc_blkoff; + unsigned long delalloc_blklen; + unsigned int blkbits = inode->i_blkbits; + int ret, n; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + + blkoff = start >> blkbits; + end_blkoff = (start + len - 1) >> blkbits; + + delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, + &delalloc_blkoff); + + do { + __u64 blkphy; + unsigned int maxblocks; + + if (delalloc_blklen && blkoff == delalloc_blkoff) { + if (size) { + /* End of the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + } + if (blkoff > end_blkoff) + break; + + flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; + logical = blkoff << blkbits; + phys = 0; + size = delalloc_blklen << blkbits; + + blkoff = delalloc_blkoff + delalloc_blklen; + delalloc_blklen = nilfs_find_uncommitted_extent( + inode, blkoff, &delalloc_blkoff); + continue; + } + + /* + * Limit the number of blocks that we look up so as + * not to get into the next delayed allocation extent. + */ + maxblocks = INT_MAX; + if (delalloc_blklen) + maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, + maxblocks); + blkphy = 0; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + n = nilfs_bmap_lookup_contig( + NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + if (n < 0) { + int past_eof; + + if (unlikely(n != -ENOENT)) + break; /* error */ + + /* HOLE */ + blkoff++; + past_eof = ((blkoff << blkbits) >= isize); + + if (size) { + /* End of the current extent */ + + if (past_eof) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + size = 0; + } + if (blkoff > end_blkoff || past_eof) + break; + } else { + if (size) { + if (phys && blkphy << blkbits == phys + size) { + /* The current extent goes on */ + size += n << blkbits; + } else { + /* Terminate the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, + flags); + if (ret || blkoff > end_blkoff) + break; + + /* Start another extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + } else { + /* Start a new extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + blkoff += n; + } + cond_resched(); + } while (true); + + /* If ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/kernel/fs/nilfs2/ioctl.c b/kernel/fs/nilfs2/ioctl.c new file mode 100644 index 000000000..9a20e513d --- /dev/null +++ b/kernel/fs/nilfs2/ioctl.c @@ -0,0 +1,1379 @@ +/* + * ioctl.c - NILFS ioctl operations. + * + * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include +#include +#include /* capable() */ +#include /* copy_from_user(), copy_to_user() */ +#include +#include /* compat_ptr() */ +#include /* mnt_want_write_file(), mnt_drop_write_file() */ +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "bmap.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" + +/** + * nilfs_ioctl_wrap_copy - wrapping function of get/set metadata info + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @dir: set of direction flags + * @dofunc: concrete function of get/set metadata info + * + * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of + * calling dofunc() function on the basis of @argv argument. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ +static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, + struct nilfs_argv *argv, int dir, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) +{ + void *buf; + void __user *base = (void __user *)(unsigned long)argv->v_base; + size_t maxmembs, total, n; + ssize_t nr; + int ret, i; + __u64 pos, ppos; + + if (argv->v_nmembs == 0) + return 0; + + if (argv->v_size > PAGE_SIZE) + return -EINVAL; + + /* + * Reject pairs of a start item position (argv->v_index) and a + * total count (argv->v_nmembs) which leads position 'pos' to + * overflow by the increment at the end of the loop. + */ + if (argv->v_index > ~(__u64)0 - argv->v_nmembs) + return -EINVAL; + + buf = (void *)__get_free_pages(GFP_NOFS, 0); + if (unlikely(!buf)) + return -ENOMEM; + maxmembs = PAGE_SIZE / argv->v_size; + + ret = 0; + total = 0; + pos = argv->v_index; + for (i = 0; i < argv->v_nmembs; i += n) { + n = (argv->v_nmembs - i < maxmembs) ? + argv->v_nmembs - i : maxmembs; + if ((dir & _IOC_WRITE) && + copy_from_user(buf, base + argv->v_size * i, + argv->v_size * n)) { + ret = -EFAULT; + break; + } + ppos = pos; + nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size, + n); + if (nr < 0) { + ret = nr; + break; + } + if ((dir & _IOC_READ) && + copy_to_user(base + argv->v_size * i, buf, + argv->v_size * nr)) { + ret = -EFAULT; + break; + } + total += nr; + if ((size_t)nr < n) + break; + if (pos == ppos) + pos += n; + } + argv->v_nmembs = total; + + free_pages((unsigned long)buf, 0); + return ret; +} + +/** + * nilfs_ioctl_getflags - ioctl to support lsattr + */ +static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) +{ + unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; + + return put_user(flags, (int __user *)argp); +} + +/** + * nilfs_ioctl_setflags - ioctl to support chattr + */ +static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, + void __user *argp) +{ + struct nilfs_transaction_info ti; + unsigned int flags, oldflags; + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)argp)) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + flags = nilfs_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + + oldflags = NILFS_I(inode)->i_flags; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by the + * relevant capability. + */ + ret = -EPERM; + if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + + ret = nilfs_transaction_begin(inode->i_sb, &ti, 0); + if (ret) + goto out; + + NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) | + (flags & FS_FL_USER_MODIFIABLE); + + nilfs_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_mark_inode_dirty(inode); + ret = nilfs_transaction_commit(inode->i_sb); +out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(filp); + return ret; +} + +/** + * nilfs_ioctl_getversion - get info about a file's version (generation number) + */ +static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) +{ + return put_user(inode->i_generation, (int __user *)argp); +} + +/** + * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot) + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_change_cpmode() function changes mode of + * given checkpoint between checkpoint and snapshot state. This ioctl + * is used in chcp and mkcp utilities. + * + * Return Value: On success, 0 is returned and mode of a checkpoint is + * changed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint mode changing. + */ +static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + struct nilfs_cpmode cpmode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&cpmode, argp, sizeof(cpmode))) + goto out; + + mutex_lock(&nilfs->ns_snapshot_mount_mutex); + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_change_cpmode( + nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + + mutex_unlock(&nilfs->ns_snapshot_mount_mutex); +out: + mnt_drop_write_file(filp); + return ret; +} + +/** + * nilfs_ioctl_delete_checkpoint - remove checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_delete_checkpoint() function removes + * checkpoint from NILFS2 file system. This ioctl is used in rmcp + * utility. + * + * Return Value: On success, 0 is returned and a checkpoint is + * removed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint removing. + */ +static int +nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + __u64 cno; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&cno, argp, sizeof(cno))) + goto out; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ +out: + mnt_drop_write_file(filp); + return ret; +} + +/** + * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints + * @nilfs: nilfs object + * @posp: pointer on array of checkpoint's numbers + * @flags: checkpoint mode (checkpoint or snapshot) + * @buf: buffer for storing checkponts' info + * @size: size in bytes of one checkpoint info item in array + * @nmembs: number of checkpoints in array (numbers and infos) + * + * Description: nilfs_ioctl_do_get_cpinfo() function returns info about + * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in + * lscp utility and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_cpinfo structures in output buffer. + */ +static ssize_t +nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf, + size, nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +/** + * nilfs_ioctl_get_cpstat - get checkpoints statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints. + * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting checkpoints statistics. + */ +static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_cpstat cpstat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &cpstat, sizeof(cpstat))) + ret = -EFAULT; + return ret; +} + +/** + * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info + * @nilfs: nilfs object + * @posp: pointer on array of segment numbers + * @flags: *not used* + * @buf: buffer for storing suinfo array + * @size: size in bytes of one suinfo item in array + * @nmembs: count of segment numbers and suinfos in array + * + * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage + * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used + * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_suinfo structures in output buffer. + */ +static ssize_t +nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size, + nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +/** + * nilfs_ioctl_get_sustat - get segment usage statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_sustat() returns segment usage statistics. + * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and segment usage information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting segment usage statistics. + */ +static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_sustat sustat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &sustat, sizeof(sustat))) + ret = -EFAULT; + return ret; +} + +/** + * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_vinfo structures + * @size: size in bytes of one vinfo item in array + * @nmembs: count of vinfos in array + * + * Description: nilfs_ioctl_do_get_vinfo() function returns information + * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used + * by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_vinfo structures in output buffer. + */ +static ssize_t +nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +/** + * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_bdesc structures + * @size: size in bytes of one bdesc item in array + * @nmembs: count of bdescs in array + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_bdescs structures in output buffer. + */ +static ssize_t +nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + down_read(&nilfs->ns_segctor_sem); + for (i = 0; i < nmembs; i++) { + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) { + up_read(&nilfs->ns_segctor_sem); + return ret; + } + bdescs[i].bd_blocknr = 0; + } + } + up_read(&nilfs->ns_segctor_sem); + return nmembs; +} + +/** + * nilfs_ioctl_get_bdescs - get disk block descriptors + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and disk block descriptors are + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting disk block descriptors. + */ +static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + if (argv.v_size != sizeof(struct nilfs_bdesc)) + return -EINVAL; + + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_bdescs); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +/** + * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC + * @inode: inode object + * @vdesc: descriptor of virtual block number + * @buffers: list of moving buffers + * + * Description: nilfs_ioctl_move_inode_block() function registers data/node + * buffer in the GC pagecache and submit read request. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Requested block doesn't exist. + * + * %-EEXIST - Blocks conflict is detected. + */ +static int nilfs_ioctl_move_inode_block(struct inode *inode, + struct nilfs_vdesc *vdesc, + struct list_head *buffers) +{ + struct buffer_head *bh; + int ret; + + if (vdesc->vd_flags == 0) + ret = nilfs_gccache_submit_read_data( + inode, vdesc->vd_offset, vdesc->vd_blocknr, + vdesc->vd_vblocknr, &bh); + else + ret = nilfs_gccache_submit_read_node( + inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh); + + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + printk(KERN_CRIT + "%s: invalid virtual block address (%s): " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + return ret; + } + if (unlikely(!list_empty(&bh->b_assoc_buffers))) { + printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, " + "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + brelse(bh); + return -EEXIST; + } + list_add_tail(&bh->b_assoc_buffers, buffers); + return 0; +} + +/** + * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection + * @sb: superblock object + * @argv: vector of arguments from userspace + * @buf: array of nilfs_vdesc structures + * + * Description: nilfs_ioctl_move_blocks() function reads valid data/node + * blocks that garbage collector specified with the array of nilfs_vdesc + * structures and stores them into page caches of GC inodes. + * + * Return Value: Number of processed nilfs_vdesc structures or + * error code, otherwise. + */ +static int nilfs_ioctl_move_blocks(struct super_block *sb, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct the_nilfs *nilfs = sb->s_fs_info; + struct inode *inode; + struct nilfs_vdesc *vdesc; + struct buffer_head *bh, *n; + LIST_HEAD(buffers); + ino_t ino; + __u64 cno; + int i, ret; + + for (i = 0, vdesc = buf; i < nmembs; ) { + ino = vdesc->vd_ino; + cno = vdesc->vd_cno; + inode = nilfs_iget_for_gc(sb, ino, cno); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto failed; + } + if (list_empty(&NILFS_I(inode)->i_dirty)) { + /* + * Add the inode to GC inode list. Garbage Collection + * is serialized and no two processes manipulate the + * list simultaneously. + */ + igrab(inode); + list_add(&NILFS_I(inode)->i_dirty, + &nilfs->ns_gc_inodes); + } + + do { + ret = nilfs_ioctl_move_inode_block(inode, vdesc, + &buffers); + if (unlikely(ret < 0)) { + iput(inode); + goto failed; + } + vdesc++; + } while (++i < nmembs && + vdesc->vd_ino == ino && vdesc->vd_cno == cno); + + iput(inode); /* The inode still remains in GC inode list */ + } + + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + ret = nilfs_gccache_wait_and_mark_dirty(bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -EEXIST); + goto failed; + } + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return nmembs; + + failed: + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return ret; +} + +/** + * nilfs_ioctl_delete_checkpoints - delete checkpoints + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of periods of checkpoints numbers + * + * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints + * in the period from p_start to p_end, excluding p_end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: Number of processed nilfs_period structures or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ +static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct inode *cpfile = nilfs->ns_cpfile; + struct nilfs_period *periods = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_cpfile_delete_checkpoints( + cpfile, periods[i].p_start, periods[i].p_end); + if (ret < 0) + return ret; + } + return nmembs; +} + +/** + * nilfs_ioctl_free_vblocknrs - free virtual block numbers + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of virtual block numbers + * + * Description: nilfs_ioctl_free_vblocknrs() function frees + * the virtual block numbers specified by @buf and @argv->v_nmembs. + * + * Return Value: Number of processed virtual block numbers or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. + */ +static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + int ret; + + ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs); + + return (ret < 0) ? ret : nmembs; +} + +/** + * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of block descriptors + * + * Description: nilfs_ioctl_mark_blocks_dirty() function marks + * metadata file or data blocks as dirty. + * + * Return Value: Number of processed block descriptors or + * error code, otherwise. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + */ +static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + /* XXX: use macro or inline func to check liveness */ + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr) + /* skip dead block */ + continue; + if (bdescs[i].bd_level == 0) { + ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat, + bdescs[i].bd_offset); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } else { + ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, + bdescs[i].bd_level); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } + } + return nmembs; +} + +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void **kbufs) +{ + const char *msg; + int ret; + + ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); + if (ret < 0) { + /* + * can safely abort because checkpoints can be removed + * independently. + */ + msg = "cannot delete checkpoints"; + goto failed; + } + ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], kbufs[2]); + if (ret < 0) { + /* + * can safely abort because DAT file is updated atomically + * using a copy-on-write technique. + */ + msg = "cannot delete virtual blocks from DAT file"; + goto failed; + } + ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], kbufs[3]); + if (ret < 0) { + /* + * can safely abort because the operation is nondestructive. + */ + msg = "cannot mark copying blocks dirty"; + goto failed; + } + return 0; + + failed: + printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", + msg, ret); + return ret; +} + +/** + * nilfs_ioctl_clean_segments - clean segments + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_clean_segments() function makes garbage + * collection operation in the environment of requested parameters + * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by + * nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ +static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct nilfs_argv argv[5]; + static const size_t argsz[5] = { + sizeof(struct nilfs_vdesc), + sizeof(struct nilfs_period), + sizeof(__u64), + sizeof(struct nilfs_bdesc), + sizeof(__u64), + }; + void __user *base; + void *kbufs[5]; + struct the_nilfs *nilfs; + size_t len, nsegs; + int n, ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(argv, argp, sizeof(argv))) + goto out; + + ret = -EINVAL; + nsegs = argv[4].v_nmembs; + if (argv[4].v_size != argsz[4]) + goto out; + if (nsegs > UINT_MAX / sizeof(__u64)) + goto out; + + /* + * argv[4] points to segment numbers this ioctl cleans. We + * use kmalloc() for its buffer because memory used for the + * segment numbers is enough small. + */ + kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, + nsegs * sizeof(__u64)); + if (IS_ERR(kbufs[4])) { + ret = PTR_ERR(kbufs[4]); + goto out; + } + nilfs = inode->i_sb->s_fs_info; + + for (n = 0; n < 4; n++) { + ret = -EINVAL; + if (argv[n].v_size != argsz[n]) + goto out_free; + + if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment) + goto out_free; + + if (argv[n].v_nmembs >= UINT_MAX / argv[n].v_size) + goto out_free; + + len = argv[n].v_size * argv[n].v_nmembs; + base = (void __user *)(unsigned long)argv[n].v_base; + if (len == 0) { + kbufs[n] = NULL; + continue; + } + + kbufs[n] = vmalloc(len); + if (!kbufs[n]) { + ret = -ENOMEM; + goto out_free; + } + if (copy_from_user(kbufs[n], base, len)) { + ret = -EFAULT; + vfree(kbufs[n]); + goto out_free; + } + } + + /* + * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(), + * which will operates an inode list without blocking. + * To protect the list from concurrent operations, + * nilfs_ioctl_move_blocks should be atomic operation. + */ + if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) { + ret = -EBUSY; + goto out_free; + } + + ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); + if (ret < 0) + printk(KERN_ERR "NILFS: GC failed during preparation: " + "cannot read source blocks: err=%d\n", ret); + else { + if (nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); + ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + } + + nilfs_remove_all_gcinodes(nilfs); + clear_nilfs_gc_running(nilfs); + +out_free: + while (--n >= 0) + vfree(kbufs[n]); + kfree(kbufs[4]); +out: + mnt_drop_write_file(filp); + return ret; +} + +/** + * nilfs_ioctl_sync - make a checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_sync() function constructs a logical segment + * for checkpointing. This function guarantees that all modified data + * and metadata are written out to the device when it successfully + * returned. + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ +static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + __u64 cno; + int ret; + struct the_nilfs *nilfs; + + ret = nilfs_construct_segment(inode->i_sb); + if (ret < 0) + return ret; + + nilfs = inode->i_sb->s_fs_info; + ret = nilfs_flush_device(nilfs); + if (ret < 0) + return ret; + + if (argp != NULL) { + down_read(&nilfs->ns_segctor_sem); + cno = nilfs->ns_cno - 1; + up_read(&nilfs->ns_segctor_sem); + if (copy_to_user(argp, &cno, sizeof(cno))) + return -EFAULT; + } + return 0; +} + +/** + * nilfs_ioctl_resize - resize NILFS2 volume + * @inode: inode object + * @filp: file object + * @argp: pointer on argument from userspace + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ +static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, + void __user *argp) +{ + __u64 newsize; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = mnt_want_write_file(filp); + if (ret) + goto out; + + ret = -EFAULT; + if (copy_from_user(&newsize, argp, sizeof(newsize))) + goto out_drop_write; + + ret = nilfs_resize_fs(inode->i_sb, newsize); + +out_drop_write: + mnt_drop_write_file(filp); +out: + return ret; +} + +/** + * nilfs_ioctl_trim_fs() - trim ioctl handle function + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It + * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which + * performs the actual trim operation. + * + * Return Value: On success, 0 is returned or negative error code, otherwise. + */ +static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct request_queue *q = bdev_get_queue(nilfs->ns_bdev); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, argp, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity); + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range); + up_read(&nilfs->ns_segctor_sem); + + if (ret < 0) + return ret; + + if (copy_to_user(argp, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +/** + * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit + * of segments in bytes and upper limit of segments in bytes. + * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ +static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 range[2]; + __u64 minseg, maxseg; + unsigned long segbytes; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EFAULT; + if (copy_from_user(range, argp, sizeof(__u64[2]))) + goto out; + + ret = -ERANGE; + if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode)) + goto out; + + segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; + + minseg = range[0] + segbytes - 1; + do_div(minseg, segbytes); + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + do_div(maxseg, segbytes); + maxseg--; + + ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg); +out: + return ret; +} + +/** + * nilfs_ioctl_get_info - wrapping function of get metadata info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * @membsz: size of an item in bytes + * @dofunc: concrete function of getting metadata info + * + * Description: nilfs_ioctl_get_info() gets metadata info by means of + * calling dofunc() function. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ +static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp, + size_t membsz, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) + +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + if (argv.v_size < membsz) + return -EINVAL; + + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +/** + * nilfs_ioctl_set_suinfo - set segment usage info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: Expects an array of nilfs_suinfo_update structures + * encapsulated in nilfs_argv and updates the segment usage info + * according to the flags in nilfs_suinfo_update. + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EPERM - Not enough permissions + * + * %-EFAULT - Error copying input data + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + */ +static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + struct nilfs_argv argv; + size_t len; + void __user *base; + void *kbuf; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&argv, argp, sizeof(argv))) + goto out; + + ret = -EINVAL; + if (argv.v_size < sizeof(struct nilfs_suinfo_update)) + goto out; + + if (argv.v_nmembs > nilfs->ns_nsegments) + goto out; + + if (argv.v_nmembs >= UINT_MAX / argv.v_size) + goto out; + + len = argv.v_size * argv.v_nmembs; + if (!len) { + ret = 0; + goto out; + } + + base = (void __user *)(unsigned long)argv.v_base; + kbuf = vmalloc(len); + if (!kbuf) { + ret = -ENOMEM; + goto out; + } + + if (copy_from_user(kbuf, base, len)) { + ret = -EFAULT; + goto out_free; + } + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size, + argv.v_nmembs); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + +out_free: + vfree(kbuf); +out: + mnt_drop_write_file(filp); + return ret; +} + +long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return nilfs_ioctl_getflags(inode, argp); + case FS_IOC_SETFLAGS: + return nilfs_ioctl_setflags(inode, filp, argp); + case FS_IOC_GETVERSION: + return nilfs_ioctl_getversion(inode, argp); + case NILFS_IOCTL_CHANGE_CPMODE: + return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); + case NILFS_IOCTL_DELETE_CHECKPOINT: + return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_cpinfo), + nilfs_ioctl_do_get_cpinfo); + case NILFS_IOCTL_GET_CPSTAT: + return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_suinfo), + nilfs_ioctl_do_get_suinfo); + case NILFS_IOCTL_SET_SUINFO: + return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUSTAT: + return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_VINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_vinfo), + nilfs_ioctl_do_get_vinfo); + case NILFS_IOCTL_GET_BDESCS: + return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); + case NILFS_IOCTL_CLEAN_SEGMENTS: + return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); + case NILFS_IOCTL_SYNC: + return nilfs_ioctl_sync(inode, filp, cmd, argp); + case NILFS_IOCTL_RESIZE: + return nilfs_ioctl_resize(inode, filp, argp); + case NILFS_IOCTL_SET_ALLOC_RANGE: + return nilfs_ioctl_set_alloc_range(inode, argp); + case FITRIM: + return nilfs_ioctl_trim_fs(inode, argp); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + case NILFS_IOCTL_CHANGE_CPMODE: + case NILFS_IOCTL_DELETE_CHECKPOINT: + case NILFS_IOCTL_GET_CPINFO: + case NILFS_IOCTL_GET_CPSTAT: + case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_SET_SUINFO: + case NILFS_IOCTL_GET_SUSTAT: + case NILFS_IOCTL_GET_VINFO: + case NILFS_IOCTL_GET_BDESCS: + case NILFS_IOCTL_CLEAN_SEGMENTS: + case NILFS_IOCTL_SYNC: + case NILFS_IOCTL_RESIZE: + case NILFS_IOCTL_SET_ALLOC_RANGE: + case FITRIM: + break; + default: + return -ENOIOCTLCMD; + } + return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff --git a/kernel/fs/nilfs2/mdt.c b/kernel/fs/nilfs2/mdt.c new file mode 100644 index 000000000..dee34d990 --- /dev/null +++ b/kernel/fs/nilfs2/mdt.c @@ -0,0 +1,652 @@ +/* + * mdt.c - meta data file for NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) + + +static int +nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, + struct buffer_head *bh, + void (*init_block)(struct inode *, + struct buffer_head *, void *)) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + void *kaddr; + int ret; + + /* Caller exclude read accesses using page lock */ + + /* set_buffer_new(bh); */ + bh->b_blocknr = 0; + + ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); + if (unlikely(ret)) + return ret; + + set_buffer_mapped(bh); + + kaddr = kmap_atomic(bh->b_page); + memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + if (init_block) + init_block(inode, bh, kaddr); + flush_dcache_page(bh->b_page); + kunmap_atomic(kaddr); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + return 0; +} + +static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh, + void (*init_block)(struct inode *, + struct buffer_head *, + void *)) +{ + struct super_block *sb = inode->i_sb; + struct nilfs_transaction_info ti; + struct buffer_head *bh; + int err; + + nilfs_transaction_begin(sb, &ti, 0); + + err = -ENOMEM; + bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); + if (unlikely(!bh)) + goto failed_unlock; + + err = -EEXIST; + if (buffer_uptodate(bh)) + goto failed_bh; + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + goto failed_bh; + + bh->b_bdev = sb->s_bdev; + err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); + if (likely(!err)) { + get_bh(bh); + *out_bh = bh; + } + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + + failed_unlock: + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + + return err; +} + +static int +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, + int mode, struct buffer_head **out_bh) +{ + struct buffer_head *bh; + __u64 blknum = 0; + int ret = -ENOMEM; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + goto failed; + + ret = -EEXIST; /* internal code */ + if (buffer_uptodate(bh)) + goto out; + + if (mode == READA) { + if (!trylock_buffer(bh)) { + ret = -EBUSY; + goto failed_bh; + } + } else /* mode == READ */ + lock_buffer(bh); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; + } + map_bh(bh, inode->i_sb, (sector_t)blknum); + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + ret = 0; + out: + get_bh(bh); + *out_bh = bh; + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + failed: + return ret; +} + +static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, + int readahead, struct buffer_head **out_bh) +{ + struct buffer_head *first_bh, *bh; + unsigned long blkoff; + int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; + int err; + + err = nilfs_mdt_submit_block(inode, block, READ, &first_bh); + if (err == -EEXIST) /* internal code */ + goto out; + + if (unlikely(err)) + goto failed; + + if (readahead) { + blkoff = block + 1; + for (i = 0; i < nr_ra_blocks; i++, blkoff++) { + err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + if (likely(!err || err == -EEXIST)) + brelse(bh); + else if (err != -EBUSY) + break; + /* abort readahead if bmap lookup failed */ + if (!buffer_locked(first_bh)) + goto out_no_wait; + } + } + + wait_on_buffer(first_bh); + + out_no_wait: + err = -EIO; + if (!buffer_uptodate(first_bh)) + goto failed_bh; + out: + *out_bh = first_bh; + return 0; + + failed_bh: + brelse(first_bh); + failed: + return err; +} + +/** + * nilfs_mdt_get_block - read or create a buffer on meta data file. + * @inode: inode of the meta data file + * @blkoff: block offset + * @create: create flag + * @init_block: initializer used for newly allocated block + * @out_bh: output of a pointer to the buffer_head + * + * nilfs_mdt_get_block() looks up the specified buffer and tries to create + * a new buffer if @create is not zero. On success, the returned buffer is + * assured to be either existing or formatted using a buffer lock on success. + * @out_bh is substituted only when zero is returned. + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EROFS - Read only filesystem (for create mode) + */ +int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **out_bh) +{ + int ret; + + /* Should be rewritten with merging nilfs_mdt_read_block() */ + retry: + ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh); + if (!create || ret != -ENOENT) + return ret; + + ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block); + if (unlikely(ret == -EEXIST)) { + /* create = 0; */ /* limit read-create loop retries */ + goto retry; + } + return ret; +} + +/** + * nilfs_mdt_find_block - find and get a buffer on meta data file. + * @inode: inode of the meta data file + * @start: start block offset (inclusive) + * @end: end block offset (inclusive) + * @blkoff: block offset + * @out_bh: place to store a pointer to buffer_head struct + * + * nilfs_mdt_find_block() looks up an existing block in range of + * [@start, @end] and stores pointer to a buffer head of the block to + * @out_bh, and block offset to @blkoff, respectively. @out_bh and + * @blkoff are substituted only when zero is returned. + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - no block was found in the range + */ +int nilfs_mdt_find_block(struct inode *inode, unsigned long start, + unsigned long end, unsigned long *blkoff, + struct buffer_head **out_bh) +{ + __u64 next; + int ret; + + if (unlikely(start > end)) + return -ENOENT; + + ret = nilfs_mdt_read_block(inode, start, true, out_bh); + if (!ret) { + *blkoff = start; + goto out; + } + if (unlikely(ret != -ENOENT || start == ULONG_MAX)) + goto out; + + ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next); + if (!ret) { + if (next <= end) { + ret = nilfs_mdt_read_block(inode, next, true, out_bh); + if (!ret) + *blkoff = next; + } else { + ret = -ENOENT; + } + } +out: + return ret; +} + +/** + * nilfs_mdt_delete_block - make a hole on the meta data file. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, zero is returned. + * On error, one of the following negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + */ +int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + err = nilfs_bmap_delete(ii->i_bmap, block); + if (!err || err == -ENOENT) { + nilfs_mdt_mark_dirty(inode); + nilfs_mdt_forget_block(inode, block); + } + return err; +} + +/** + * nilfs_mdt_forget_block - discard dirty state and try to remove the page + * @inode: inode of the meta data file + * @block: block offset + * + * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and + * tries to release the page including the buffer from a page cache. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EBUSY - page has an active buffer. + * + * %-ENOENT - page cache has no page addressed by the offset. + */ +int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) +{ + pgoff_t index = (pgoff_t)block >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + struct page *page; + unsigned long first_block; + int ret = 0; + int still_dirty; + + page = find_lock_page(inode->i_mapping, index); + if (!page) + return -ENOENT; + + wait_on_page_writeback(page); + + first_block = (unsigned long)index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (page_has_buffers(page)) { + struct buffer_head *bh; + + bh = nilfs_page_get_nth_block(page, block - first_block); + nilfs_forget_buffer(bh); + } + still_dirty = PageDirty(page); + unlock_page(page); + page_cache_release(page); + + if (still_dirty || + invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) + ret = -EBUSY; + return ret; +} + +/** + * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + */ +int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) +{ + struct buffer_head *bh; + int err; + + err = nilfs_mdt_read_block(inode, block, 0, &bh); + if (unlikely(err)) + return err; + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + brelse(bh); + return 0; +} + +int nilfs_mdt_fetch_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { + set_bit(NILFS_I_DIRTY, &ii->i_state); + return 1; + } + return test_bit(NILFS_I_DIRTY, &ii->i_state); +} + +static int +nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb; + int err = 0; + + if (inode && (inode->i_sb->s_flags & MS_RDONLY)) { + /* + * It means that filesystem was remounted in read-only + * mode because of error or metadata corruption. But we + * have dirty pages that try to be flushed in background. + * So, here we simply discard this dirty page. + */ + nilfs_clear_dirty_page(page, false); + unlock_page(page); + return -EROFS; + } + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (!inode) + return 0; + + sb = inode->i_sb; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_segment(sb); + else if (wbc->for_reclaim) + nilfs_flush_segment(sb, inode->i_ino); + + return err; +} + + +static const struct address_space_operations def_mdt_aops = { + .writepage = nilfs_mdt_write_page, +}; + +static const struct inode_operations def_mdt_iops; +static const struct file_operations def_mdt_fops; + + +int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) +{ + struct nilfs_mdt_info *mi; + + mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS); + if (!mi) + return -ENOMEM; + + init_rwsem(&mi->mi_sem); + inode->i_private = mi; + + inode->i_mode = S_IFREG; + mapping_set_gfp_mask(inode->i_mapping, gfp_mask); + + inode->i_op = &def_mdt_iops; + inode->i_fop = &def_mdt_fops; + inode->i_mapping->a_ops = &def_mdt_aops; + + return 0; +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, + unsigned header_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_entry_size = entry_size; + mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); +} + +/** + * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file + * @inode: inode of the metadata file + * @shadow: shadow mapping + */ +int nilfs_mdt_setup_shadow_map(struct inode *inode, + struct nilfs_shadow_map *shadow) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + INIT_LIST_HEAD(&shadow->frozen_buffers); + address_space_init_once(&shadow->frozen_data); + nilfs_mapping_init(&shadow->frozen_data, inode); + address_space_init_once(&shadow->frozen_btnodes); + nilfs_mapping_init(&shadow->frozen_btnodes, inode); + mi->mi_shadow = shadow; + return 0; +} + +/** + * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map + * @inode: inode of the metadata file + */ +int nilfs_mdt_save_to_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + int ret; + + ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping); + if (ret) + goto out; + + ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes, + &ii->i_btnode_cache); + if (ret) + goto out; + + nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store); + out: + return ret; +} + +int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) +{ + struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; + struct buffer_head *bh_frozen; + struct page *page; + int blkbits = inode->i_blkbits; + + page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); + if (!page) + return -ENOMEM; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, 0); + + bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); + + if (!buffer_uptodate(bh_frozen)) + nilfs_copy_buffer(bh_frozen, bh); + if (list_empty(&bh_frozen->b_assoc_buffers)) { + list_add_tail(&bh_frozen->b_assoc_buffers, + &shadow->frozen_buffers); + set_buffer_nilfs_redirected(bh); + } else { + brelse(bh_frozen); /* already frozen */ + } + + unlock_page(page); + page_cache_release(page); + return 0; +} + +struct buffer_head * +nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) +{ + struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; + struct buffer_head *bh_frozen = NULL; + struct page *page; + int n; + + page = find_lock_page(&shadow->frozen_data, bh->b_page->index); + if (page) { + if (page_has_buffers(page)) { + n = bh_offset(bh) >> inode->i_blkbits; + bh_frozen = nilfs_page_get_nth_block(page, n); + } + unlock_page(page); + page_cache_release(page); + } + return bh_frozen; +} + +static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow) +{ + struct list_head *head = &shadow->frozen_buffers; + struct buffer_head *bh; + + while (!list_empty(head)) { + bh = list_first_entry(head, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); /* drop ref-count to make it releasable */ + } +} + +/** + * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state + * @inode: inode of the metadata file + */ +void nilfs_mdt_restore_from_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + + down_write(&mi->mi_sem); + + if (mi->mi_palloc_cache) + nilfs_palloc_clear_cache(inode); + + nilfs_clear_dirty_pages(inode->i_mapping, true); + nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); + + nilfs_clear_dirty_pages(&ii->i_btnode_cache, true); + nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); + + nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); + + up_write(&mi->mi_sem); +} + +/** + * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches + * @inode: inode of the metadata file + */ +void nilfs_mdt_clear_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + + down_write(&mi->mi_sem); + nilfs_release_frozen_buffers(shadow); + truncate_inode_pages(&shadow->frozen_data, 0); + truncate_inode_pages(&shadow->frozen_btnodes, 0); + up_write(&mi->mi_sem); +} diff --git a/kernel/fs/nilfs2/mdt.h b/kernel/fs/nilfs2/mdt.h new file mode 100644 index 000000000..fe529a87a --- /dev/null +++ b/kernel/fs/nilfs2/mdt.h @@ -0,0 +1,123 @@ +/* + * mdt.h - NILFS meta data file prototype and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#ifndef _NILFS_MDT_H +#define _NILFS_MDT_H + +#include +#include +#include "nilfs.h" +#include "page.h" + +/** + * struct nilfs_shadow_map - shadow mapping of meta data file + * @bmap_store: shadow copy of bmap state + * @frozen_data: shadowed dirty data pages + * @frozen_btnodes: shadowed dirty b-tree nodes' pages + * @frozen_buffers: list of frozen buffers + */ +struct nilfs_shadow_map { + struct nilfs_bmap_store bmap_store; + struct address_space frozen_data; + struct address_space frozen_btnodes; + struct list_head frozen_buffers; +}; + +/** + * struct nilfs_mdt_info - on-memory private data of meta data files + * @mi_sem: reader/writer semaphore for meta data operations + * @mi_bgl: per-blockgroup locking + * @mi_entry_size: size of an entry + * @mi_first_entry_offset: offset to the first entry + * @mi_entries_per_block: number of entries in a block + * @mi_palloc_cache: persistent object allocator cache + * @mi_shadow: shadow of bmap and page caches + * @mi_blocks_per_group: number of blocks in a group + * @mi_blocks_per_desc_block: number of blocks per descriptor block + */ +struct nilfs_mdt_info { + struct rw_semaphore mi_sem; + struct blockgroup_lock *mi_bgl; + unsigned mi_entry_size; + unsigned mi_first_entry_offset; + unsigned long mi_entries_per_block; + struct nilfs_palloc_cache *mi_palloc_cache; + struct nilfs_shadow_map *mi_shadow; + unsigned long mi_blocks_per_group; + unsigned long mi_blocks_per_desc_block; +}; + +static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) +{ + return inode->i_private; +} + +/* Default GFP flags using highmem */ +#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) + +int nilfs_mdt_get_block(struct inode *, unsigned long, int, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **); +int nilfs_mdt_find_block(struct inode *inode, unsigned long start, + unsigned long end, unsigned long *blkoff, + struct buffer_head **out_bh); +int nilfs_mdt_delete_block(struct inode *, unsigned long); +int nilfs_mdt_forget_block(struct inode *, unsigned long); +int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); +int nilfs_mdt_fetch_dirty(struct inode *); + +int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz); +void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); + +int nilfs_mdt_setup_shadow_map(struct inode *inode, + struct nilfs_shadow_map *shadow); +int nilfs_mdt_save_to_shadow_map(struct inode *inode); +void nilfs_mdt_restore_from_shadow_map(struct inode *inode); +void nilfs_mdt_clear_shadow_map(struct inode *inode); +int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh); +struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode, + struct buffer_head *bh); + +static inline void nilfs_mdt_mark_dirty(struct inode *inode) +{ + if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) + set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline void nilfs_mdt_clear_dirty(struct inode *inode) +{ + clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline __u64 nilfs_mdt_cno(struct inode *inode) +{ + return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno; +} + +static inline spinlock_t * +nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group) +{ + return bgl_lock_ptr(NILFS_MDT(inode)->mi_bgl, block_group); +} + +#endif /* _NILFS_MDT_H */ diff --git a/kernel/fs/nilfs2/namei.c b/kernel/fs/nilfs2/namei.c new file mode 100644 index 000000000..22180836e --- /dev/null +++ b/kernel/fs/nilfs2/namei.c @@ -0,0 +1,585 @@ +/* + * namei.c - NILFS pathname lookup operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji , + * Ryusuke Konishi + */ +/* + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include "nilfs.h" +#include "export.h" + +#define NILFS_FID_SIZE_NON_CONNECTABLE \ + (offsetof(struct nilfs_fid, parent_gen) / 4) +#define NILFS_FID_SIZE_CONNECTABLE (sizeof(struct nilfs_fid) / 4) + +static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + } + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. + */ + +static struct dentry * +nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > NILFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = nilfs_inode_by_name(dir, &dentry->d_name); + inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL; + return d_splice_alias(inode, dentry); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + nilfs_mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int +nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + nilfs_mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = dir->i_sb; + unsigned l = strlen(symname)+1; + struct inode *inode; + int err; + + if (l > sb->s_blocksize) + return -ENAMETOOLONG; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + /* slow symlink */ + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + /* mark_inode_dirty(inode); */ + /* page_symlink() do this */ + + err = nilfs_add_nondir(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + unlock_new_inode(inode); + iput(inode); + goto out; +} + +static int nilfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + ihold(inode); + + err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + err = nilfs_transaction_commit(dir->i_sb); + } else { + inode_dec_link_count(inode); + iput(inode); + nilfs_transaction_abort(dir->i_sb); + } + + return err; +} + +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inc_nlink(dir); + + inode = nilfs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + + inc_nlink(inode); + + err = nilfs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = nilfs_add_link(dentry, inode); + if (err) + goto out_fail; + + nilfs_mark_inode_dirty(inode); + d_instantiate(dentry, inode); + unlock_new_inode(inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + drop_nlink(inode); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + unlock_new_inode(inode); + iput(inode); +out_dir: + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); + goto out; +} + +static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct nilfs_dir_entry *de; + struct page *page; + int err; + + err = -ENOENT; + de = nilfs_find_entry(dir, &dentry->d_name, &page); + if (!de) + goto out; + + inode = d_inode(dentry); + err = -EIO; + if (le64_to_cpu(de->inode) != inode->i_ino) + goto out; + + if (!inode->i_nlink) { + nilfs_warning(inode->i_sb, __func__, + "deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + set_nlink(inode, 1); + } + err = nilfs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + drop_nlink(inode); + err = 0; +out: + return err; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = nilfs_do_unlink(dir, dentry); + + if (!err) { + nilfs_mark_inode_dirty(dir); + nilfs_mark_inode_dirty(d_inode(dentry)); + err = nilfs_transaction_commit(dir->i_sb); + } else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOTEMPTY; + if (nilfs_empty_dir(inode)) { + err = nilfs_do_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); + } + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct page *dir_page = NULL; + struct nilfs_dir_entry *dir_de = NULL; + struct page *old_page; + struct nilfs_dir_entry *old_de; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); + if (unlikely(err)) + return err; + + err = -ENOENT; + old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = nilfs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct nilfs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !nilfs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + nilfs_set_link(new_dir, new_de, new_page, old_inode); + nilfs_mark_inode_dirty(new_dir); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + drop_nlink(new_inode); + nilfs_mark_inode_dirty(new_inode); + } else { + err = nilfs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) { + inc_nlink(new_dir); + nilfs_mark_inode_dirty(new_dir); + } + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME; + + nilfs_delete_entry(old_de, old_page); + + if (dir_de) { + nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + drop_nlink(old_dir); + } + nilfs_mark_inode_dirty(old_dir); + nilfs_mark_inode_dirty(old_inode); + + err = nilfs_transaction_commit(old_dir->i_sb); + return err; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + nilfs_transaction_abort(old_dir->i_sb); + return err; +} + +/* + * Export operations + */ +static struct dentry *nilfs_get_parent(struct dentry *child) +{ + unsigned long ino; + struct inode *inode; + struct qstr dotdot = QSTR_INIT("..", 2); + struct nilfs_root *root; + + ino = nilfs_inode_by_name(d_inode(child), &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + + root = NILFS_I(d_inode(child))->i_root; + + inode = nilfs_iget(d_inode(child)->i_sb, root, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(inode); +} + +static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno, + u64 ino, u32 gen) +{ + struct nilfs_root *root; + struct inode *inode; + + if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO) + return ERR_PTR(-ESTALE); + + root = nilfs_lookup_root(sb->s_fs_info, cno); + if (!root) + return ERR_PTR(-ESTALE); + + inode = nilfs_iget(sb, root, ino); + nilfs_put_root(root); + + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (gen && inode->i_generation != gen) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + + if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE && + fh_len != NILFS_FID_SIZE_CONNECTABLE) || + (fh_type != FILEID_NILFS_WITH_PARENT && + fh_type != FILEID_NILFS_WITHOUT_PARENT)) + return NULL; + + return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen); +} + +static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + + if (fh_len != NILFS_FID_SIZE_CONNECTABLE || + fh_type != FILEID_NILFS_WITH_PARENT) + return NULL; + + return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen); +} + +static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp, + struct inode *parent) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + struct nilfs_root *root = NILFS_I(inode)->i_root; + int type; + + if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) { + *lenp = NILFS_FID_SIZE_CONNECTABLE; + return FILEID_INVALID; + } + if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) { + *lenp = NILFS_FID_SIZE_NON_CONNECTABLE; + return FILEID_INVALID; + } + + fid->cno = root->cno; + fid->ino = inode->i_ino; + fid->gen = inode->i_generation; + + if (parent) { + fid->parent_ino = parent->i_ino; + fid->parent_gen = parent->i_generation; + type = FILEID_NILFS_WITH_PARENT; + *lenp = NILFS_FID_SIZE_CONNECTABLE; + } else { + type = FILEID_NILFS_WITHOUT_PARENT; + *lenp = NILFS_FID_SIZE_NON_CONNECTABLE; + } + + return type; +} + +const struct inode_operations nilfs_dir_inode_operations = { + .create = nilfs_create, + .lookup = nilfs_lookup, + .link = nilfs_link, + .unlink = nilfs_unlink, + .symlink = nilfs_symlink, + .mkdir = nilfs_mkdir, + .rmdir = nilfs_rmdir, + .mknod = nilfs_mknod, + .rename = nilfs_rename, + .setattr = nilfs_setattr, + .permission = nilfs_permission, + .fiemap = nilfs_fiemap, +}; + +const struct inode_operations nilfs_special_inode_operations = { + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +const struct inode_operations nilfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .permission = nilfs_permission, +}; + +const struct export_operations nilfs_export_ops = { + .encode_fh = nilfs_encode_fh, + .fh_to_dentry = nilfs_fh_to_dentry, + .fh_to_parent = nilfs_fh_to_parent, + .get_parent = nilfs_get_parent, +}; diff --git a/kernel/fs/nilfs2/nilfs.h b/kernel/fs/nilfs2/nilfs.h new file mode 100644 index 000000000..385704027 --- /dev/null +++ b/kernel/fs/nilfs2/nilfs.h @@ -0,0 +1,354 @@ +/* + * nilfs.h - NILFS local header file. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato + * Ryusuke Konishi + */ + +#ifndef _NILFS_H +#define _NILFS_H + +#include +#include +#include +#include +#include +#include "the_nilfs.h" +#include "bmap.h" + +/** + * struct nilfs_inode_info - nilfs inode data in memory + * @i_flags: inode flags + * @i_state: dynamic state flags + * @i_bmap: pointer on i_bmap_data + * @i_bmap_data: raw block mapping + * @i_xattr: + * @i_dir_start_lookup: page index of last successful search + * @i_cno: checkpoint number for GC inode + * @i_btnode_cache: cached pages of b-tree nodes + * @i_dirty: list for connecting dirty files + * @xattr_sem: semaphore for extended attributes processing + * @i_bh: buffer contains disk inode + * @i_root: root object of the current filesystem tree + * @vfs_inode: VFS inode object + */ +struct nilfs_inode_info { + __u32 i_flags; + unsigned long i_state; /* Dynamic state flags */ + struct nilfs_bmap *i_bmap; + struct nilfs_bmap i_bmap_data; + __u64 i_xattr; /* sector_t ??? */ + __u32 i_dir_start_lookup; + __u64 i_cno; /* check point number for GC inode */ + struct address_space i_btnode_cache; + struct list_head i_dirty; /* List for connecting dirty files */ + +#ifdef CONFIG_NILFS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + struct buffer_head *i_bh; /* i_bh contains a new or dirty + disk inode */ + struct nilfs_root *i_root; + struct inode vfs_inode; +}; + +static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) +{ + return container_of(inode, struct nilfs_inode_info, vfs_inode); +} + +static inline struct nilfs_inode_info * +NILFS_BMAP_I(const struct nilfs_bmap *bmap) +{ + return container_of(bmap, struct nilfs_inode_info, i_bmap_data); +} + +static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) +{ + struct nilfs_inode_info *ii = + container_of(btnc, struct nilfs_inode_info, i_btnode_cache); + return &ii->vfs_inode; +} + +/* + * Dynamic state flags of NILFS on-memory inode (i_state) + */ +enum { + NILFS_I_NEW = 0, /* Inode is newly created */ + NILFS_I_DIRTY, /* The file is dirty */ + NILFS_I_QUEUED, /* inode is in dirty_files list */ + NILFS_I_BUSY, /* inode is grabbed by a segment + constructor */ + NILFS_I_COLLECTED, /* All dirty blocks are collected */ + NILFS_I_UPDATED, /* The file has been written back */ + NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ + NILFS_I_BMAP, /* has bmap and btnode_cache */ + NILFS_I_GCINODE, /* inode for GC, on memory only */ +}; + +/* + * commit flags for nilfs_commit_super and nilfs_sync_super + */ +enum { + NILFS_SB_COMMIT = 0, /* Commit a super block alternately */ + NILFS_SB_COMMIT_ALL /* Commit both super blocks */ +}; + +/* + * Macros to check inode numbers + */ +#define NILFS_MDT_INO_BITS \ + ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ + 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ + 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) + +#define NILFS_SYS_INO_BITS \ + ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) + +#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) + +#define NILFS_MDT_INODE(sb, ino) \ + ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) +#define NILFS_VALID_INODE(sb, ino) \ + ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) + +/** + * struct nilfs_transaction_info: context information for synchronization + * @ti_magic: Magic number + * @ti_save: Backup of journal_info field of task_struct + * @ti_flags: Flags + * @ti_count: Nest level + */ +struct nilfs_transaction_info { + u32 ti_magic; + void *ti_save; + /* This should never used. If this happens, + one of other filesystems has a bug. */ + unsigned short ti_flags; + unsigned short ti_count; +}; + +/* ti_magic */ +#define NILFS_TI_MAGIC 0xd9e392fb + +/* ti_flags */ +#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ +#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the + end of transaction. */ +#define NILFS_TI_GC 0x0004 /* GC context */ +#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ +#define NILFS_TI_WRITER 0x0010 /* Constructor context */ + + +int nilfs_transaction_begin(struct super_block *, + struct nilfs_transaction_info *, int); +int nilfs_transaction_commit(struct super_block *); +void nilfs_transaction_abort(struct super_block *); + +static inline void nilfs_set_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= flag; +} + +static inline int nilfs_test_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC) + return 0; + return !!(ti->ti_flags & flag); +} + +static inline int nilfs_doing_gc(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_GC); +} + +static inline int nilfs_doing_construction(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_WRITER); +} + +/* + * function prototype + */ +#ifdef CONFIG_NILFS_POSIX_ACL +#error "NILFS: not yet supported POSIX ACL" +extern int nilfs_acl_chmod(struct inode *); +extern int nilfs_init_acl(struct inode *, struct inode *); +#else +static inline int nilfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) +{ + inode->i_mode &= ~current_umask(); + return 0; +} +#endif + +#define NILFS_ATIME_DISABLE + +/* Flags that should be inherited by new inodes from their parent. */ +#define NILFS_FL_INHERITED \ + (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \ + FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\ + FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL); + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* dir.c */ +extern int nilfs_add_link(struct dentry *, struct inode *); +extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +extern int nilfs_make_empty(struct inode *, struct inode *); +extern struct nilfs_dir_entry * +nilfs_find_entry(struct inode *, const struct qstr *, struct page **); +extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); +extern int nilfs_empty_dir(struct inode *); +extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); +extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, + struct page *, struct inode *); + +/* file.c */ +extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); + +/* ioctl.c */ +long nilfs_ioctl(struct file *, unsigned int, unsigned long); +long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, + void **); + +/* inode.c */ +void nilfs_inode_add_blocks(struct inode *inode, int n); +void nilfs_inode_sub_blocks(struct inode *inode, int n); +extern struct inode *nilfs_new_inode(struct inode *, umode_t); +extern void nilfs_free_inode(struct inode *); +extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern void nilfs_set_inode_flags(struct inode *); +extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); +extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +extern struct inode *nilfs_iget_for_gc(struct super_block *sb, + unsigned long ino, __u64 cno); +extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); +extern void nilfs_truncate(struct inode *); +extern void nilfs_evict_inode(struct inode *); +extern int nilfs_setattr(struct dentry *, struct iattr *); +extern void nilfs_write_failed(struct address_space *mapping, loff_t to); +int nilfs_permission(struct inode *inode, int mask); +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); +extern int nilfs_inode_dirty(struct inode *); +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); +extern int __nilfs_mark_inode_dirty(struct inode *, int); +extern void nilfs_dirty_inode(struct inode *, int flags); +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +static inline int nilfs_mark_inode_dirty(struct inode *inode) +{ + return __nilfs_mark_inode_dirty(inode, I_DIRTY); +} +static inline int nilfs_mark_inode_dirty_sync(struct inode *inode) +{ + return __nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC); +} + +/* super.c */ +extern struct inode *nilfs_alloc_inode(struct super_block *); +extern void nilfs_destroy_inode(struct inode *); +extern __printf(3, 4) +void nilfs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void nilfs_warning(struct super_block *, const char *, const char *, ...); +extern struct nilfs_super_block * +nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); +extern int nilfs_store_magic_and_option(struct super_block *, + struct nilfs_super_block *, char *); +extern int nilfs_check_feature_compatibility(struct super_block *, + struct nilfs_super_block *); +extern void nilfs_set_log_cursor(struct nilfs_super_block *, + struct the_nilfs *); +struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, + int flip); +int nilfs_commit_super(struct super_block *sb, int flag); +int nilfs_cleanup_super(struct super_block *sb); +int nilfs_resize_fs(struct super_block *sb, __u64 newsize); +int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, + struct nilfs_root **root); +int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); + +/* gcinode.c */ +int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); +int nilfs_init_gcinode(struct inode *inode); +void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs); + +/* sysfs.c */ +int __init nilfs_sysfs_init(void); +void nilfs_sysfs_exit(void); +int nilfs_sysfs_create_device_group(struct super_block *); +void nilfs_sysfs_delete_device_group(struct the_nilfs *); +int nilfs_sysfs_create_snapshot_group(struct nilfs_root *); +void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *); + +/* + * Inodes and files operations + */ +extern const struct file_operations nilfs_dir_operations; +extern const struct inode_operations nilfs_file_inode_operations; +extern const struct file_operations nilfs_file_operations; +extern const struct address_space_operations nilfs_aops; +extern const struct inode_operations nilfs_dir_inode_operations; +extern const struct inode_operations nilfs_special_inode_operations; +extern const struct inode_operations nilfs_symlink_inode_operations; + +/* + * filesystem type + */ +extern struct file_system_type nilfs_fs_type; + + +#endif /* _NILFS_H */ diff --git a/kernel/fs/nilfs2/page.c b/kernel/fs/nilfs2/page.c new file mode 100644 index 000000000..45d650add --- /dev/null +++ b/kernel/fs/nilfs2/page.c @@ -0,0 +1,581 @@ +/* + * page.c - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi , + * Seiji Kihara . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_BUFFER_INHERENT_BITS \ + ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked)) + +static struct buffer_head * +__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, + int blkbits, unsigned long b_state) + +{ + unsigned long first_block; + struct buffer_head *bh; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, b_state); + + first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits); + bh = nilfs_page_get_nth_block(page, block - first_block); + + touch_buffer(bh); + wait_on_buffer(bh); + return bh; +} + +struct buffer_head *nilfs_grab_buffer(struct inode *inode, + struct address_space *mapping, + unsigned long blkoff, + unsigned long b_state) +{ + int blkbits = inode->i_blkbits; + pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); + struct page *page; + struct buffer_head *bh; + + page = grab_cache_page(mapping, index); + if (unlikely(!page)) + return NULL; + + bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + if (unlikely(!bh)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + return bh; +} + +/** + * nilfs_forget_buffer - discard dirty state + * @inode: owner inode of the buffer + * @bh: buffer head of the buffer to be discarded + */ +void nilfs_forget_buffer(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + const unsigned long clear_bits = + (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped | + 1 << BH_Async_Write | 1 << BH_NILFS_Volatile | + 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected); + + lock_buffer(bh); + set_mask_bits(&bh->b_state, clear_bits, 0); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + + bh->b_blocknr = -1; + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + unlock_buffer(bh); + brelse(bh); +} + +/** + * nilfs_copy_buffer -- copy buffer data and flags + * @dbh: destination buffer + * @sbh: source buffer + */ +void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) +{ + void *kaddr0, *kaddr1; + unsigned long bits; + struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct buffer_head *bh; + + kaddr0 = kmap_atomic(spage); + kaddr1 = kmap_atomic(dpage); + memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); + kunmap_atomic(kaddr1); + kunmap_atomic(kaddr0); + + dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + + bh = dbh; + bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped)); + while ((bh = bh->b_this_page) != dbh) { + lock_buffer(bh); + bits &= bh->b_state; + unlock_buffer(bh); + } + if (bits & (1UL << BH_Uptodate)) + SetPageUptodate(dpage); + else + ClearPageUptodate(dpage); + if (bits & (1UL << BH_Mapped)) + SetPageMappedToDisk(dpage); + else + ClearPageMappedToDisk(dpage); +} + +/** + * nilfs_page_buffers_clean - check if a page has dirty buffers or not. + * @page: page to be checked + * + * nilfs_page_buffers_clean() returns zero if the page has dirty buffers. + * Otherwise, it returns non-zero value. + */ +int nilfs_page_buffers_clean(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh)) + return 0; + bh = bh->b_this_page; + } while (bh != head); + return 1; +} + +void nilfs_page_bug(struct page *page) +{ + struct address_space *m; + unsigned long ino; + + if (unlikely(!page)) { + printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); + return; + } + + m = page->mapping; + ino = m ? m->host->i_ino : 0; + + printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " + "mapping=%p ino=%lu\n", + page, atomic_read(&page->_count), + (unsigned long long)page->index, page->flags, m, ino); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int i = 0; + + bh = head = page_buffers(page); + do { + printk(KERN_CRIT + " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", + i++, bh, atomic_read(&bh->b_count), + (unsigned long long)bh->b_blocknr, bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + } +} + +/** + * nilfs_copy_page -- copy the page with buffers + * @dst: destination page + * @src: source page + * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * + * This function is for both data pages and btnode pages. The dirty flag + * should be treated by caller. The page must not be under i/o. + * Both src and dst page must be locked + */ +static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +{ + struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + unsigned long mask = NILFS_BUFFER_INHERENT_BITS; + + BUG_ON(PageWriteback(dst)); + + sbh = sbufs = page_buffers(src); + if (!page_has_buffers(dst)) + create_empty_buffers(dst, sbh->b_size, 0); + + if (copy_dirty) + mask |= (1UL << BH_Dirty); + + dbh = dbufs = page_buffers(dst); + do { + lock_buffer(sbh); + lock_buffer(dbh); + dbh->b_state = sbh->b_state & mask; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); + + copy_highpage(dst, src); + + if (PageUptodate(src) && !PageUptodate(dst)) + SetPageUptodate(dst); + else if (!PageUptodate(src) && PageUptodate(dst)) + ClearPageUptodate(dst); + if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) + SetPageMappedToDisk(dst); + else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) + ClearPageMappedToDisk(dst); + + do { + unlock_buffer(sbh); + unlock_buffer(dbh); + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); +} + +int nilfs_copy_dirty_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + int err = 0; + + pagevec_init(&pvec, 0); +repeat: + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) + return 0; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + + lock_page(page); + if (unlikely(!PageDirty(page))) + NILFS_PAGE_BUG(page, "inconsistent dirty state"); + + dpage = grab_cache_page(dmap, page->index); + if (unlikely(!dpage)) { + /* No empty page is added to the page cache */ + err = -ENOMEM; + unlock_page(page); + break; + } + if (unlikely(!page_has_buffers(page))) + NILFS_PAGE_BUG(page, + "found empty page in dat page cache"); + + nilfs_copy_page(dpage, page, 1); + __set_page_dirty_nobuffers(dpage); + + unlock_page(dpage); + page_cache_release(dpage); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + if (likely(!err)) + goto repeat; + return err; +} + +/** + * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache + * @dmap: destination page cache + * @smap: source page cache + * + * No pages must no be added to the cache during this process. + * This must be ensured by the caller. + */ +void nilfs_copy_back_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i, n; + pgoff_t index = 0; + int err; + + pagevec_init(&pvec, 0); +repeat: + n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + if (!n) + return; + index = pvec.pages[n - 1]->index + 1; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + pgoff_t offset = page->index; + + lock_page(page); + dpage = find_lock_page(dmap, offset); + if (dpage) { + /* override existing page on the destination cache */ + WARN_ON(PageDirty(dpage)); + nilfs_copy_page(dpage, page, 0); + unlock_page(dpage); + page_cache_release(dpage); + } else { + struct page *page2; + + /* move the page to the destination cache */ + spin_lock_irq(&smap->tree_lock); + page2 = radix_tree_delete(&smap->page_tree, offset); + WARN_ON(page2 != page); + + smap->nrpages--; + spin_unlock_irq(&smap->tree_lock); + + spin_lock_irq(&dmap->tree_lock); + err = radix_tree_insert(&dmap->page_tree, offset, page); + if (unlikely(err < 0)) { + WARN_ON(err == -EEXIST); + page->mapping = NULL; + page_cache_release(page); /* for cache */ + } else { + page->mapping = dmap; + dmap->nrpages++; + if (PageDirty(page)) + radix_tree_tag_set(&dmap->page_tree, + offset, + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&dmap->tree_lock); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + goto repeat; +} + +/** + * nilfs_clear_dirty_pages - discard dirty pages in address space + * @mapping: address space with dirty pages for discarding + * @silent: suppress [true] or print [false] warning messages + */ +void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + nilfs_clear_dirty_page(page, silent); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +/** + * nilfs_clear_dirty_page - discard dirty page + * @page: dirty page that will be discarded + * @silent: suppress [true] or print [false] warning messages + */ +void nilfs_clear_dirty_page(struct page *page, bool silent) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + + BUG_ON(!PageLocked(page)); + + if (!silent) { + nilfs_warning(sb, __func__, + "discard page: offset %lld, ino %lu", + page_offset(page), inode->i_ino); + } + + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + const unsigned long clear_bits = + (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped | + 1 << BH_Async_Write | 1 << BH_NILFS_Volatile | + 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected); + + bh = head = page_buffers(page); + do { + lock_buffer(bh); + if (!silent) { + nilfs_warning(sb, __func__, + "discard block %llu, size %zu", + (u64)bh->b_blocknr, bh->b_size); + } + set_mask_bits(&bh->b_state, clear_bits, 0); + unlock_buffer(bh); + } while (bh = bh->b_this_page, bh != head); + } + + __nilfs_clear_page_dirty(page); +} + +unsigned nilfs_page_count_clean_buffers(struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + struct buffer_head *bh, *head; + unsigned nc = 0; + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + bh->b_size; + if (block_end > from && block_start < to && !buffer_dirty(bh)) + nc++; + } + return nc; +} + +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) +{ + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->a_ops = &empty_aops; +} + +/* + * NILFS2 needs clear_page_dirty() in the following two cases: + * + * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears + * page dirty flags when it copies back pages from the shadow cache + * (gcdat->{i_mapping,i_btnode_cache}) to its original cache + * (dat->{i_mapping,i_btnode_cache}). + * + * 2) Some B-tree operations like insertion or deletion may dispose buffers + * in dirty state, and this needs to cancel the dirty state of their pages. + */ +int __nilfs_clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock_irq(&mapping->tree_lock); + if (test_bit(PG_dirty, &page->flags)) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&mapping->tree_lock); + return clear_page_dirty_for_io(page); + } + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + return TestClearPageDirty(page); +} + +/** + * nilfs_find_uncommitted_extent - find extent of uncommitted data + * @inode: inode + * @start_blk: start block offset (in) + * @blkoff: start offset of the found extent (out) + * + * This function searches an extent of buffers marked "delayed" which + * starts from a block offset equal to or larger than @start_blk. If + * such an extent was found, this will store the start offset in + * @blkoff and return its length in blocks. Otherwise, zero is + * returned. + */ +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff) +{ + unsigned int i; + pgoff_t index; + unsigned int nblocks_in_page; + unsigned long length = 0; + sector_t b; + struct pagevec pvec; + struct page *page; + + if (inode->i_mapping->nrpages == 0) + return 0; + + index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + +repeat: + pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, + pvec.pages); + if (pvec.nr == 0) + return length; + + if (length > 0 && pvec.pages[0]->index > index) + goto out; + + b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + i = 0; + do { + page = pvec.pages[i]; + + lock_page(page); + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (b < start_blk) + continue; + if (buffer_delay(bh)) { + if (length == 0) + *blkoff = b; + length++; + } else if (length > 0) { + goto out_locked; + } + } while (++b, bh = bh->b_this_page, bh != head); + } else { + if (length > 0) + goto out_locked; + + b += nblocks_in_page; + } + unlock_page(page); + + } while (++i < pagevec_count(&pvec)); + + index = page->index + 1; + pagevec_release(&pvec); + cond_resched(); + goto repeat; + +out_locked: + unlock_page(page); +out: + pagevec_release(&pvec); + return length; +} diff --git a/kernel/fs/nilfs2/page.h b/kernel/fs/nilfs2/page.h new file mode 100644 index 000000000..a43b8287d --- /dev/null +++ b/kernel/fs/nilfs2/page.h @@ -0,0 +1,80 @@ +/* + * page.h - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi , + * Seiji Kihara . + */ + +#ifndef _NILFS_PAGE_H +#define _NILFS_PAGE_H + +#include +#include "nilfs.h" + +/* + * Extended buffer state bits + */ +enum { + BH_NILFS_Allocated = BH_PrivateStart, + BH_NILFS_Node, + BH_NILFS_Volatile, + BH_NILFS_Checked, + BH_NILFS_Redirected, +}; + +BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ +BUFFER_FNS(NILFS_Volatile, nilfs_volatile) +BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ +BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ + + +int __nilfs_clear_page_dirty(struct page *); + +struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, + unsigned long, unsigned long); +void nilfs_forget_buffer(struct buffer_head *); +void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); +int nilfs_page_buffers_clean(struct page *); +void nilfs_page_bug(struct page *); + +int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); +void nilfs_copy_back_pages(struct address_space *, struct address_space *); +void nilfs_clear_dirty_page(struct page *, bool); +void nilfs_clear_dirty_pages(struct address_space *, bool); +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); +unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff); + +#define NILFS_PAGE_BUG(page, m, a...) \ + do { nilfs_page_bug(page); BUG(); } while (0) + +static inline struct buffer_head * +nilfs_page_get_nth_block(struct page *page, unsigned int count) +{ + struct buffer_head *bh = page_buffers(page); + + while (count-- > 0) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + +#endif /* _NILFS_PAGE_H */ diff --git a/kernel/fs/nilfs2/recovery.c b/kernel/fs/nilfs2/recovery.c new file mode 100644 index 000000000..ff00a0b7a --- /dev/null +++ b/kernel/fs/nilfs2/recovery.c @@ -0,0 +1,964 @@ +/* + * recovery.c - NILFS recovery logic + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "sufile.h" +#include "page.h" +#include "segbuf.h" + +/* + * Segment check result + */ +enum { + NILFS_SEG_VALID, + NILFS_SEG_NO_SUPER_ROOT, + NILFS_SEG_FAIL_IO, + NILFS_SEG_FAIL_MAGIC, + NILFS_SEG_FAIL_SEQ, + NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, + NILFS_SEG_FAIL_CHECKSUM_FULL, + NILFS_SEG_FAIL_CONSISTENCY, +}; + +/* work structure for recovery */ +struct nilfs_recovery_block { + ino_t ino; /* Inode number of the file that this block + belongs to */ + sector_t blocknr; /* block number */ + __u64 vblocknr; /* virtual block number */ + unsigned long blkoff; /* File offset of the data block (per block) */ + struct list_head list; +}; + + +static int nilfs_warn_segment_error(int err) +{ + switch (err) { + case NILFS_SEG_FAIL_IO: + printk(KERN_WARNING + "NILFS warning: I/O error on loading last segment\n"); + return -EIO; + case NILFS_SEG_FAIL_MAGIC: + printk(KERN_WARNING + "NILFS warning: Segment magic number invalid\n"); + break; + case NILFS_SEG_FAIL_SEQ: + printk(KERN_WARNING + "NILFS warning: Sequence number mismatch\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: Checksum error in super root\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_FULL: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment payload\n"); + break; + case NILFS_SEG_FAIL_CONSISTENCY: + printk(KERN_WARNING + "NILFS warning: Inconsistent segment\n"); + break; + case NILFS_SEG_NO_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: No super root in the last segment\n"); + break; + } + return -EINVAL; +} + +/** + * nilfs_compute_checksum - compute checksum of blocks continuously + * @nilfs: nilfs object + * @bhs: buffer head of start block + * @sum: place to store result + * @offset: offset bytes in the first block + * @check_bytes: number of bytes to be checked + * @start: DBN of start block + * @nblock: number of blocks to be checked + */ +static int nilfs_compute_checksum(struct the_nilfs *nilfs, + struct buffer_head *bhs, u32 *sum, + unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) +{ + unsigned int blocksize = nilfs->ns_blocksize; + unsigned long size; + u32 crc; + + BUG_ON(offset >= blocksize); + check_bytes -= offset; + size = min_t(u64, check_bytes, blocksize - offset); + crc = crc32_le(nilfs->ns_crc_seed, + (unsigned char *)bhs->b_data + offset, size); + if (--nblock > 0) { + do { + struct buffer_head *bh; + + bh = __bread(nilfs->ns_bdev, ++start, blocksize); + if (!bh) + return -EIO; + check_bytes -= size; + size = min_t(u64, check_bytes, blocksize); + crc = crc32_le(crc, bh->b_data, size); + brelse(bh); + } while (--nblock > 0); + } + *sum = crc; + return 0; +} + +/** + * nilfs_read_super_root_block - read super root block + * @nilfs: nilfs object + * @sr_block: disk block number of the super root block + * @pbh: address of a buffer_head pointer to return super root buffer + * @check: CRC check flag + */ +int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, + struct buffer_head **pbh, int check) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *sr; + u32 crc; + int ret; + + *pbh = NULL; + bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize); + if (unlikely(!bh_sr)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + + sr = (struct nilfs_super_root *)bh_sr->b_data; + if (check) { + unsigned bytes = le16_to_cpu(sr->sr_bytes); + + if (bytes == 0 || bytes > nilfs->ns_blocksize) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + if (nilfs_compute_checksum( + nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, + sr_block, 1)) { + ret = NILFS_SEG_FAIL_IO; + goto failed_bh; + } + if (crc != le32_to_cpu(sr->sr_sum)) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + } + *pbh = bh_sr; + return 0; + + failed_bh: + brelse(bh_sr); + + failed: + return nilfs_warn_segment_error(ret); +} + +/** + * nilfs_read_log_header - read summary header of the specified log + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: pointer to return segment summary structure + */ +static struct buffer_head * +nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary **sum) +{ + struct buffer_head *bh_sum; + + bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); + if (bh_sum) + *sum = (struct nilfs_segment_summary *)bh_sum->b_data; + return bh_sum; +} + +/** + * nilfs_validate_log - verify consistency of log + * @nilfs: nilfs object + * @seg_seq: sequence number of segment + * @bh_sum: buffer head of summary block + * @sum: segment summary struct + */ +static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, + struct buffer_head *bh_sum, + struct nilfs_segment_summary *sum) +{ + unsigned long nblock; + u32 crc; + int ret; + + ret = NILFS_SEG_FAIL_MAGIC; + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) + goto out; + + ret = NILFS_SEG_FAIL_SEQ; + if (le64_to_cpu(sum->ss_seq) != seg_seq) + goto out; + + nblock = le32_to_cpu(sum->ss_nblocks); + ret = NILFS_SEG_FAIL_CONSISTENCY; + if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) + /* This limits the number of blocks read in the CRC check */ + goto out; + + ret = NILFS_SEG_FAIL_IO; + if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), + ((u64)nblock << nilfs->ns_blocksize_bits), + bh_sum->b_blocknr, nblock)) + goto out; + + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + if (crc != le32_to_cpu(sum->ss_datasum)) + goto out; + ret = 0; +out: + return ret; +} + +/** + * nilfs_read_summary_info - read an item on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be read + */ +static void *nilfs_read_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) +{ + void *ptr; + sector_t blocknr; + + BUG_ON((*pbh)->b_size < *offset); + if (bytes > (*pbh)->b_size - *offset) { + blocknr = (*pbh)->b_blocknr; + brelse(*pbh); + *pbh = __bread(nilfs->ns_bdev, blocknr + 1, + nilfs->ns_blocksize); + if (unlikely(!*pbh)) + return NULL; + *offset = 0; + } + ptr = (*pbh)->b_data + *offset; + *offset += bytes; + return ptr; +} + +/** + * nilfs_skip_summary_info - skip items on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be skipped + * @count: number of items to be skipped + */ +static void nilfs_skip_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes, + unsigned long count) +{ + unsigned int rest_item_in_current_block + = ((*pbh)->b_size - *offset) / bytes; + + if (count <= rest_item_in_current_block) { + *offset += bytes * count; + } else { + sector_t blocknr = (*pbh)->b_blocknr; + unsigned int nitem_per_block = (*pbh)->b_size / bytes; + unsigned int bcnt; + + count -= rest_item_in_current_block; + bcnt = DIV_ROUND_UP(count, nitem_per_block); + *offset = bytes * (count - (bcnt - 1) * nitem_per_block); + + brelse(*pbh); + *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, + nilfs->ns_blocksize); + } +} + +/** + * nilfs_scan_dsync_log - get block information of a log written for data sync + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: log summary information + * @head: list head to add nilfs_recovery_block struct + */ +static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary *sum, + struct list_head *head) +{ + struct buffer_head *bh; + unsigned int offset; + u32 nfinfo, sumbytes; + sector_t blocknr; + ino_t ino; + int err = -EIO; + + nfinfo = le32_to_cpu(sum->ss_nfinfo); + if (!nfinfo) + return 0; + + sumbytes = le32_to_cpu(sum->ss_sumbytes); + blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); + bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); + if (unlikely(!bh)) + goto out; + + offset = le16_to_cpu(sum->ss_bytes); + for (;;) { + unsigned long nblocks, ndatablk, nnodeblk; + struct nilfs_finfo *finfo; + + finfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*finfo)); + if (unlikely(!finfo)) + goto out; + + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + nnodeblk = nblocks - ndatablk; + + while (ndatablk-- > 0) { + struct nilfs_recovery_block *rb; + struct nilfs_binfo_v *binfo; + + binfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*binfo)); + if (unlikely(!binfo)) + goto out; + + rb = kmalloc(sizeof(*rb), GFP_NOFS); + if (unlikely(!rb)) { + err = -ENOMEM; + goto out; + } + rb->ino = ino; + rb->blocknr = blocknr++; + rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); + rb->blkoff = le64_to_cpu(binfo->bi_blkoff); + /* INIT_LIST_HEAD(&rb->list); */ + list_add_tail(&rb->list, head); + } + if (--nfinfo == 0) + break; + blocknr += nnodeblk; /* always 0 for data sync logs */ + nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), + nnodeblk); + if (unlikely(!bh)) + goto out; + } + err = 0; + out: + brelse(bh); /* brelse(NULL) is just ignored */ + return err; +} + +static void dispose_recovery_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_recovery_block *rb; + + rb = list_first_entry(head, struct nilfs_recovery_block, list); + list_del(&rb->list); + kfree(rb); + } +} + +struct nilfs_segment_entry { + struct list_head list; + __u64 segnum; +}; + +static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) +{ + struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); + + if (unlikely(!ent)) + return -ENOMEM; + + ent->segnum = segnum; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, head); + return 0; +} + +void nilfs_dispose_segment_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_segment_entry *ent; + + ent = list_first_entry(head, struct nilfs_segment_entry, list); + list_del(&ent->list); + kfree(ent); + } +} + +static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_recovery_info *ri) +{ + struct list_head *head = &ri->ri_used_segments; + struct nilfs_segment_entry *ent, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 segnum[4]; + int err; + int i; + + segnum[0] = nilfs->ns_segnum; + segnum[1] = nilfs->ns_nextnum; + segnum[2] = ri->ri_segnum; + segnum[3] = ri->ri_nextnum; + + /* + * Releasing the next segment of the latest super root. + * The next segment is invalidated by this recovery. + */ + err = nilfs_sufile_free(sufile, segnum[1]); + if (unlikely(err)) + goto failed; + + for (i = 1; i < 4; i++) { + err = nilfs_segment_list_add(head, segnum[i]); + if (unlikely(err)) + goto failed; + } + + /* + * Collecting segments written after the latest super root. + * These are marked dirty to avoid being reallocated in the next write. + */ + list_for_each_entry_safe(ent, n, head, list) { + if (ent->segnum != segnum[0]) { + err = nilfs_sufile_scrap(sufile, ent->segnum); + if (unlikely(err)) + goto failed; + } + list_del(&ent->list); + kfree(ent); + } + + /* Allocate new segments for recovery */ + err = nilfs_sufile_alloc(sufile, &segnum[0]); + if (unlikely(err)) + goto failed; + + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq = ri->ri_seq + 2; + nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; + + failed: + /* No need to recover sufile because it will be destroyed on error */ + return err; +} + +static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, + struct nilfs_recovery_block *rb, + struct page *page) +{ + struct buffer_head *bh_org; + void *kaddr; + + bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); + if (unlikely(!bh_org)) + return -EIO; + + kaddr = kmap_atomic(page); + memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + kunmap_atomic(kaddr); + brelse(bh_org); + return 0; +} + +static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_root *root, + struct list_head *head, + unsigned long *nr_salvaged_blocks) +{ + struct inode *inode; + struct nilfs_recovery_block *rb, *n; + unsigned blocksize = nilfs->ns_blocksize; + struct page *page; + loff_t pos; + int err = 0, err2 = 0; + + list_for_each_entry_safe(rb, n, head, list) { + inode = nilfs_iget(sb, root, rb->ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto failed_inode; + } + + pos = rb->blkoff << inode->i_blkbits; + err = block_write_begin(inode->i_mapping, pos, blocksize, + 0, &page, nilfs_get_block); + if (unlikely(err)) { + loff_t isize = inode->i_size; + if (pos + blocksize > isize) + nilfs_write_failed(inode->i_mapping, + pos + blocksize); + goto failed_inode; + } + + err = nilfs_recovery_copy_block(nilfs, rb, page); + if (unlikely(err)) + goto failed_page; + + err = nilfs_set_file_dirty(inode, 1); + if (unlikely(err)) + goto failed_page; + + block_write_end(NULL, inode->i_mapping, pos, blocksize, + blocksize, page, NULL); + + unlock_page(page); + page_cache_release(page); + + (*nr_salvaged_blocks)++; + goto next; + + failed_page: + unlock_page(page); + page_cache_release(page); + + failed_inode: + printk(KERN_WARNING + "NILFS warning: error recovering data block " + "(err=%d, ino=%lu, block-offset=%llu)\n", + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); + if (!err2) + err2 = err; + next: + iput(inode); /* iput(NULL) is just ignored */ + list_del_init(&rb->list); + kfree(rb); + } + return err2; +} + +/** + * nilfs_do_roll_forward - salvage logical segments newer than the latest + * checkpoint + * @nilfs: nilfs object + * @sb: super block instance + * @ri: pointer to a nilfs_recovery_info + */ +static int nilfs_do_roll_forward(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_root *root, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; + sector_t pseg_start; + sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ + unsigned long nsalvaged_blocks = 0; + unsigned int flags; + u64 seg_seq; + __u64 segnum, nextnum = 0; + int empty_seg = 0; + int err = 0, ret; + LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ + enum { + RF_INIT_ST, + RF_DSYNC_ST, /* scanning data-sync segments */ + }; + int state = RF_INIT_ST; + + pseg_start = ri->ri_lsegs_start; + seg_seq = ri->ri_lsegs_start_seq; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + brelse(bh_sum); + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) { + err = -EIO; + goto failed; + } + + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) { + err = -EIO; + goto failed; + } + goto strayed; + } + + flags = le16_to_cpu(sum->ss_flags); + if (flags & NILFS_SS_SR) + goto confused; + + /* Found a valid partial segment; do recovery actions */ + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); + empty_seg = 0; + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); + if (!(flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = nilfs->ns_ctime; + + switch (state) { + case RF_INIT_ST: + if (!(flags & NILFS_SS_LOGBGN) || + !(flags & NILFS_SS_SYNDT)) + goto try_next_pseg; + state = RF_DSYNC_ST; + /* Fall through */ + case RF_DSYNC_ST: + if (!(flags & NILFS_SS_SYNDT)) + goto confused; + + err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, + &dsync_blocks); + if (unlikely(err)) + goto failed; + if (flags & NILFS_SS_LOGEND) { + err = nilfs_recover_dsync_blocks( + nilfs, sb, root, &dsync_blocks, + &nsalvaged_blocks); + if (unlikely(err)) + goto failed; + state = RF_INIT_ST; + } + break; /* Fall through to try_next_pseg */ + } + + try_next_pseg: + if (pseg_start == ri->ri_lsegs_end) + break; + pseg_start += le32_to_cpu(sum->ss_nblocks); + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + if (pseg_start == ri->ri_lsegs_end) + break; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + break; + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + if (nsalvaged_blocks) { + printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", + sb->s_id, nsalvaged_blocks); + ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; + } + out: + brelse(bh_sum); + dispose_recovery_list(&dsync_blocks); + return err; + + confused: + err = -EINVAL; + failed: + printk(KERN_ERR + "NILFS (device %s): Error roll-forwarding " + "(err=%d, pseg block=%llu). ", + sb->s_id, err, (unsigned long long)pseg_start); + goto out; +} + +static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh; + int err; + + if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != + nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) + return; + + bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); + BUG_ON(!bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_dirty(bh); + err = sync_dirty_buffer(bh); + if (unlikely(err)) + printk(KERN_WARNING + "NILFS warning: buffer sync write failed during " + "post-cleaning of recovery.\n"); + brelse(bh); +} + +/** + * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint + * @nilfs: nilfs object + * @sb: super block instance + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - Inconsistent filesystem state. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_recovery_info *ri) +{ + struct nilfs_root *root; + int err; + + if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) + return 0; + + err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: error loading the latest checkpoint.\n"); + return err; + } + + err = nilfs_do_roll_forward(nilfs, sb, root, ri); + if (unlikely(err)) + goto failed; + + if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { + err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Error preparing segments for " + "recovery.\n"); + goto failed; + } + + err = nilfs_attach_log_writer(sb, root); + if (unlikely(err)) + goto failed; + + set_nilfs_discontinued(nilfs); + err = nilfs_construct_segment(sb); + nilfs_detach_log_writer(sb); + + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Oops! recovery failed. " + "(err=%d)\n", err); + goto failed; + } + + nilfs_finish_roll_forward(nilfs, ri); + } + + failed: + nilfs_put_root(root); + return err; +} + +/** + * nilfs_search_super_root - search the latest valid super root + * @nilfs: the_nilfs + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * nilfs_search_super_root() looks for the latest super-root from a partial + * segment pointed by the superblock. It sets up struct the_nilfs through + * this search. It fills nilfs_recovery_info (ri) required for recovery. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - No valid segment found + * + * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_search_super_root(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; + sector_t pseg_start, pseg_end, sr_pseg_start = 0; + sector_t seg_start, seg_end; /* range of full segment (block number) */ + sector_t b, end; + unsigned long nblocks; + unsigned int flags; + u64 seg_seq; + __u64 segnum, nextnum = 0; + __u64 cno; + LIST_HEAD(segments); + int empty_seg = 0, scan_newer = 0; + int ret; + + pseg_start = nilfs->ns_last_pseg; + seg_seq = nilfs->ns_last_seq; + cno = nilfs->ns_last_cno; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + + /* Calculate range of segment */ + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + /* Read ahead segment */ + b = seg_start; + while (b <= seg_end) + __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); + + for (;;) { + brelse(bh_sum); + ret = NILFS_SEG_FAIL_IO; + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) + goto failed; + + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) + goto failed; + goto strayed; + } + + nblocks = le32_to_cpu(sum->ss_nblocks); + pseg_end = pseg_start + nblocks - 1; + if (unlikely(pseg_end > seg_end)) { + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto strayed; + } + + /* A valid partial segment */ + ri->ri_pseg_start = pseg_start; + ri->ri_seq = seg_seq; + ri->ri_segnum = segnum; + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); + ri->ri_nextnum = nextnum; + empty_seg = 0; + + flags = le16_to_cpu(sum->ss_flags); + if (!(flags & NILFS_SS_SR) && !scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + + if (pseg_start == seg_start) { + nilfs_get_segment_range(nilfs, nextnum, &b, &end); + while (b <= end) + __breadahead(nilfs->ns_bdev, b++, + nilfs->ns_blocksize); + } + if (!(flags & NILFS_SS_SR)) { + if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { + ri->ri_lsegs_start = pseg_start; + ri->ri_lsegs_start_seq = seg_seq; + } + if (flags & NILFS_SS_LOGEND) + ri->ri_lsegs_end = pseg_start; + goto try_next_pseg; + } + + /* A valid super root was found. */ + ri->ri_cno = cno++; + ri->ri_super_root = pseg_end; + ri->ri_lsegs_start = ri->ri_lsegs_end = 0; + + nilfs_dispose_segment_list(&segments); + sr_pseg_start = pseg_start; + nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; + nilfs->ns_seg_seq = seg_seq; + nilfs->ns_segnum = segnum; + nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); + nilfs->ns_nextnum = nextnum; + + if (scan_newer) + ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; + else { + if (nilfs->ns_mount_state & NILFS_VALID_FS) + goto super_root_found; + scan_newer = 1; + } + + try_next_pseg: + /* Standing on a course, or met an inconsistent state */ + pseg_start += nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + /* Off the trail */ + if (!scan_newer) + /* + * This can happen if a checkpoint was written without + * barriers, or as a result of an I/O failure. + */ + goto failed; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + goto super_root_found; /* found a valid super root */ + + ret = nilfs_segment_list_add(&segments, segnum); + if (unlikely(ret)) + goto failed; + + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + super_root_found: + /* Updating pointers relating to the latest checkpoint */ + brelse(bh_sum); + list_splice_tail(&segments, &ri->ri_used_segments); + nilfs->ns_last_pseg = sr_pseg_start; + nilfs->ns_last_seq = nilfs->ns_seg_seq; + nilfs->ns_last_cno = ri->ri_cno; + return 0; + + failed: + brelse(bh_sum); + nilfs_dispose_segment_list(&segments); + return (ret < 0) ? ret : nilfs_warn_segment_error(ret); +} diff --git a/kernel/fs/nilfs2/segbuf.c b/kernel/fs/nilfs2/segbuf.c new file mode 100644 index 000000000..dc3a9efda --- /dev/null +++ b/kernel/fs/nilfs2/segbuf.c @@ -0,0 +1,536 @@ +/* + * segbuf.c - NILFS segment buffer + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include "page.h" +#include "segbuf.h" + + +struct nilfs_write_info { + struct the_nilfs *nilfs; + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; +}; + +static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs); +static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) +{ + struct nilfs_segment_buffer *segbuf; + + segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); + if (unlikely(!segbuf)) + return NULL; + + segbuf->sb_super = sb; + INIT_LIST_HEAD(&segbuf->sb_list); + INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); + INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; + + init_completion(&segbuf->sb_bio_event); + atomic_set(&segbuf->sb_err, 0); + segbuf->sb_nbio = 0; + + return segbuf; +} + +void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) +{ + kmem_cache_free(nilfs_segbuf_cachep, segbuf); +} + +void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, + unsigned long offset, struct the_nilfs *nilfs) +{ + segbuf->sb_segnum = segnum; + nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, + &segbuf->sb_fseg_end); + + segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +/** + * nilfs_segbuf_map_cont - map a new log behind a given log + * @segbuf: new segment buffer + * @prev: segment buffer containing a log to be continued + */ +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, + struct nilfs_segment_buffer *prev) +{ + segbuf->sb_segnum = prev->sb_segnum; + segbuf->sb_fseg_start = prev->sb_fseg_start; + segbuf->sb_fseg_end = prev->sb_fseg_end; + segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, + __u64 nextnum, struct the_nilfs *nilfs) +{ + segbuf->sb_nextnum = nextnum; + segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); +} + +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_segsum_buffer(segbuf, bh); + return 0; +} + +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_payload_buffer(segbuf, bh); + *bhp = bh; + return 0; +} + +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, + time_t ctime, __u64 cno) +{ + int err; + + segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + return err; + + segbuf->sb_sum.flags = flags; + segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); + segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; + segbuf->sb_sum.ctime = ctime; + segbuf->sb_sum.cno = cno; + return 0; +} + +/* + * Setup segment summary + */ +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct nilfs_segment_summary *raw_sum; + struct buffer_head *bh_sum; + + bh_sum = list_entry(segbuf->sb_segsum_buffers.next, + struct buffer_head, b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); + raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); + raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); + raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); + raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime); + raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); + raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); + raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); + raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); + raw_sum->ss_pad = 0; + raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno); +} + +/* + * CRC calculation routines + */ +static void +nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + unsigned long size, bytes = segbuf->sb_sum.sumbytes; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(seed, + (unsigned char *)raw_sum + + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), + size - (sizeof(raw_sum->ss_datasum) + + sizeof(raw_sum->ss_sumsum))); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + bytes -= size; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(crc, bh->b_data, size); + } + raw_sum->ss_sumsum = cpu_to_le32(crc); +} + +static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + void *kaddr; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), + bh->b_size - sizeof(raw_sum->ss_datasum)); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + crc = crc32_le(crc, bh->b_data, bh->b_size); + } + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + kaddr = kmap_atomic(bh->b_page); + crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); + kunmap_atomic(kaddr); + } + raw_sum->ss_datasum = cpu_to_le32(crc); +} + +static void +nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct nilfs_super_root *raw_sr; + struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; + unsigned srsize; + u32 crc; + + raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; + srsize = NILFS_SR_BYTES(nilfs->ns_inode_size); + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + srsize - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + +static void nilfs_release_buffers(struct list_head *list) +{ + struct buffer_head *bh, *n; + + list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } +} + +static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; +} + +/* + * Iterators for segment buffers + */ +void nilfs_clear_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) + nilfs_segbuf_clear(segbuf); +} + +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last) +{ + struct nilfs_segment_buffer *n, *segbuf; + + segbuf = list_prepare_entry(last, logs, sb_list); + list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) { + list_del_init(&segbuf->sb_list); + nilfs_segbuf_clear(segbuf); + nilfs_segbuf_free(segbuf); + } +} + +int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf; + int ret = 0; + + list_for_each_entry(segbuf, logs, sb_list) { + ret = nilfs_segbuf_write(segbuf, nilfs); + if (ret) + break; + } + return ret; +} + +int nilfs_wait_on_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + int err, ret = 0; + + list_for_each_entry(segbuf, logs, sb_list) { + err = nilfs_segbuf_wait(segbuf); + if (err && !ret) + ret = err; + } + return ret; +} + +/** + * nilfs_add_checksums_on_logs - add checksums on the logs + * @logs: list of segment buffers storing target logs + * @seed: checksum seed value + */ +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) { + if (segbuf->sb_super_root) + nilfs_segbuf_fill_in_super_root_crc(segbuf, seed); + nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + +/* + * BIO operations + */ +static void nilfs_end_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nilfs_segment_buffer *segbuf = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + /* to be detected by nilfs_segbuf_submit_bio() */ + } + + if (!uptodate) + atomic_inc(&segbuf->sb_err); + + bio_put(bio); + complete(&segbuf->sb_bio_event); +} + +static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, int mode) +{ + struct bio *bio = wi->bio; + int err; + + if (segbuf->sb_nbio > 0 && + bdi_write_congested(segbuf->sb_super->s_bdi)) { + wait_for_completion(&segbuf->sb_bio_event); + segbuf->sb_nbio--; + if (unlikely(atomic_read(&segbuf->sb_err))) { + bio_put(bio); + err = -EIO; + goto failed; + } + } + + bio->bi_end_io = nilfs_end_bio_write; + bio->bi_private = segbuf; + bio_get(bio); + submit_bio(mode, bio); + segbuf->sb_nbio++; + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + bio_put(bio); + err = -EOPNOTSUPP; + goto failed; + } + bio_put(bio); + + wi->bio = NULL; + wi->rest_blocks -= wi->end - wi->start; + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end; + return 0; + + failed: + wi->bio = NULL; + return err; +} + +/** + * nilfs_alloc_seg_bio - allocate a new bio for writing log + * @nilfs: nilfs object + * @start: start block number of the bio + * @nr_vecs: request size of page vector. + * + * Return Value: On success, pointer to the struct bio is returned. + * On error, NULL is returned. + */ +static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, + int nr_vecs) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, nr_vecs); + if (bio == NULL) { + while (!bio && (nr_vecs >>= 1)) + bio = bio_alloc(GFP_NOIO, nr_vecs); + } + if (likely(bio)) { + bio->bi_bdev = nilfs->ns_bdev; + bio->bi_iter.bi_sector = + start << (nilfs->ns_blocksize_bits - 9); + } + return bio; +} + +static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + wi->bio = NULL; + wi->rest_blocks = segbuf->sb_sum.nblocks; + wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev); + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end = 0; + wi->blocknr = segbuf->sb_pseg_start; +} + +static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, + struct buffer_head *bh, int mode) +{ + int len, err; + + BUG_ON(wi->nr_vecs <= 0); + repeat: + if (!wi->bio) { + wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end, + wi->nr_vecs); + if (unlikely(!wi->bio)) + return -ENOMEM; + } + + len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (len == bh->b_size) { + wi->end++; + return 0; + } + /* bio is FULL */ + err = nilfs_segbuf_submit_bio(segbuf, wi, mode); + /* never submit current bh */ + if (likely(!err)) + goto repeat; + return err; +} + +/** + * nilfs_segbuf_write - submit write requests of a log + * @segbuf: buffer storing a log to be written + * @nilfs: nilfs object + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. + */ +static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs) +{ + struct nilfs_write_info wi; + struct buffer_head *bh; + int res = 0, rw = WRITE; + + wi.nilfs = nilfs; + nilfs_segbuf_prepare_write(segbuf, &wi); + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + if (wi.bio) { + /* + * Last BIO is always sent through the following + * submission. + */ + rw |= REQ_SYNC; + res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); + } + + failed_bio: + return res; +} + +/** + * nilfs_segbuf_wait - wait for completion of requested BIOs + * @segbuf: segment buffer + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + */ +static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) +{ + int err = 0; + + if (!segbuf->sb_nbio) + return 0; + + do { + wait_for_completion(&segbuf->sb_bio_event); + } while (--segbuf->sb_nbio > 0); + + if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { + printk(KERN_ERR "NILFS: IO error writing segment\n"); + err = -EIO; + } + return err; +} diff --git a/kernel/fs/nilfs2/segbuf.h b/kernel/fs/nilfs2/segbuf.h new file mode 100644 index 000000000..b04f08cc2 --- /dev/null +++ b/kernel/fs/nilfs2/segbuf.h @@ -0,0 +1,184 @@ +/* + * segbuf.h - NILFS Segment buffer prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ +#ifndef _NILFS_SEGBUF_H +#define _NILFS_SEGBUF_H + +#include +#include +#include +#include + +/** + * struct nilfs_segsum_info - On-memory segment summary + * @flags: Flags + * @nfinfo: Number of file information structures + * @nblocks: Number of blocks included in the partial segment + * @nsumblk: Number of summary blocks + * @sumbytes: Byte count of segment summary + * @nfileblk: Total number of file blocks + * @seg_seq: Segment sequence number + * @cno: Checkpoint number + * @ctime: Creation time + * @next: Block number of the next full segment + */ +struct nilfs_segsum_info { + unsigned int flags; + unsigned long nfinfo; + unsigned long nblocks; + unsigned long nsumblk; + unsigned long sumbytes; + unsigned long nfileblk; + u64 seg_seq; + __u64 cno; + time_t ctime; + sector_t next; +}; + +/** + * struct nilfs_segment_buffer - Segment buffer + * @sb_super: back pointer to a superblock struct + * @sb_list: List head to chain this structure + * @sb_sum: On-memory segment summary + * @sb_segnum: Index number of the full segment + * @sb_nextnum: Index number of the next full segment + * @sb_fseg_start: Start block number of the full segment + * @sb_fseg_end: End block number of the full segment + * @sb_pseg_start: Disk block number of partial segment + * @sb_rest_blocks: Number of residual blocks in the current segment + * @sb_segsum_buffers: List of buffers for segment summaries + * @sb_payload_buffers: List of buffers for segment payload + * @sb_super_root: Pointer to buffer storing a super root block (if exists) + * @sb_nbio: Number of flying bio requests + * @sb_err: I/O error status + * @sb_bio_event: Completion event of log writing + */ +struct nilfs_segment_buffer { + struct super_block *sb_super; + struct list_head sb_list; + + /* Segment information */ + struct nilfs_segsum_info sb_sum; + __u64 sb_segnum; + __u64 sb_nextnum; + sector_t sb_fseg_start, sb_fseg_end; + sector_t sb_pseg_start; + unsigned sb_rest_blocks; + + /* Buffers */ + struct list_head sb_segsum_buffers; + struct list_head sb_payload_buffers; /* including super root */ + struct buffer_head *sb_super_root; + + /* io status */ + int sb_nbio; + atomic_t sb_err; + struct completion sb_bio_event; +}; + +#define NILFS_LIST_SEGBUF(head) \ + list_entry((head), struct nilfs_segment_buffer, sb_list) +#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next) +#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev) +#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev) +#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next) +#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head)) + +#define nilfs_for_each_segbuf_before(s, t, h) \ + for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \ + (s) = NILFS_NEXT_SEGBUF(s)) + +#define NILFS_SEGBUF_FIRST_BH(head) \ + (list_entry((head)->next, struct buffer_head, b_assoc_buffers)) +#define NILFS_SEGBUF_NEXT_BH(bh) \ + (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \ + b_assoc_buffers)) +#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) + +extern struct kmem_cache *nilfs_segbuf_cachep; + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); +void nilfs_segbuf_free(struct nilfs_segment_buffer *); +void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, + struct the_nilfs *); +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, + struct nilfs_segment_buffer *prev); +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, + struct the_nilfs *); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64); +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, + struct buffer_head **); +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); + +static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf) +{ + unsigned int flags = segbuf->sb_sum.flags; + + return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND); +} + +static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf) +{ + return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk; +} + +static inline void +nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers); + segbuf->sb_sum.nblocks++; + segbuf->sb_sum.nsumblk++; +} + +static inline void +nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers); + segbuf->sb_sum.nblocks++; +} + +static inline void +nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + get_bh(bh); + nilfs_segbuf_add_payload_buffer(segbuf, bh); + segbuf->sb_sum.nfileblk++; +} + +void nilfs_clear_logs(struct list_head *logs); +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last); +int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); +int nilfs_wait_on_logs(struct list_head *logs); +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed); + +static inline void nilfs_destroy_logs(struct list_head *logs) +{ + nilfs_truncate_logs(logs, NULL); +} + +#endif /* _NILFS_SEGBUF_H */ diff --git a/kernel/fs/nilfs2/segment.c b/kernel/fs/nilfs2/segment.c new file mode 100644 index 000000000..c6abbad9b --- /dev/null +++ b/kernel/fs/nilfs2/segment.c @@ -0,0 +1,2758 @@ +/* + * segment.c - NILFS segment constructor. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "page.h" +#include "segment.h" +#include "sufile.h" +#include "cpfile.h" +#include "ifile.h" +#include "segbuf.h" + + +/* + * Segment constructor + */ +#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ + +#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments + appended in collection retry loop */ + +/* Construction mode */ +enum { + SC_LSEG_SR = 1, /* Make a logical segment having a super root */ + SC_LSEG_DSYNC, /* Flush data blocks of a given file and make + a logical segment without a super root */ + SC_FLUSH_FILE, /* Flush data files, leads to segment writes without + creating a checkpoint */ + SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without + a checkpoint */ +}; + +/* Stage numbers of dirty block collection */ +enum { + NILFS_ST_INIT = 0, + NILFS_ST_GC, /* Collecting dirty blocks for GC */ + NILFS_ST_FILE, + NILFS_ST_IFILE, + NILFS_ST_CPFILE, + NILFS_ST_SUFILE, + NILFS_ST_DAT, + NILFS_ST_SR, /* Super root */ + NILFS_ST_DSYNC, /* Data sync blocks */ + NILFS_ST_DONE, +}; + +/* State flags of collection */ +#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ +#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ +#define NILFS_CF_SUFREED 0x0004 /* segment usages has been freed */ +#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED) + +/* Operations depending on the construction mode and file type */ +struct nilfs_sc_operations { + int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + void (*write_data_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); + void (*write_node_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); +}; + +/* + * Other definitions + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *); +static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); +static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int); + +#define nilfs_cnt32_gt(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(b) - (__s32)(a) < 0)) +#define nilfs_cnt32_ge(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(a) - (__s32)(b) >= 0)) +#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) +#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) + +static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + void *save = NULL; + + if (cur_ti) { + if (cur_ti->ti_magic == NILFS_TI_MAGIC) + return ++cur_ti->ti_count; + else { + /* + * If journal_info field is occupied by other FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). + */ + printk(KERN_WARNING + "NILFS warning: journal info from a different " + "FS\n"); + save = current->journal_info; + } + } + if (!ti) { + ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); + if (!ti) + return -ENOMEM; + ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; + } else { + ti->ti_flags = 0; + } + ti->ti_count = 0; + ti->ti_save = save; + ti->ti_magic = NILFS_TI_MAGIC; + current->journal_info = ti; + return 0; +} + +/** + * nilfs_transaction_begin - start indivisible file operations. + * @sb: super block + * @ti: nilfs_transaction_info + * @vacancy_check: flags for vacancy rate checks + * + * nilfs_transaction_begin() acquires a reader/writer semaphore, called + * the segment semaphore, to make a segment construction and write tasks + * exclusive. The function is used with nilfs_transaction_commit() in pairs. + * The region enclosed by these two functions can be nested. To avoid a + * deadlock, the semaphore is only acquired or released in the outermost call. + * + * This function allocates a nilfs_transaction_info struct to keep context + * information on it. It is initialized and hooked onto the current task in + * the outermost call. If a pre-allocated struct is given to @ti, it is used + * instead; otherwise a new struct is assigned from a slab. + * + * When @vacancy_check flag is set, this function will check the amount of + * free space, and will wait for the GC to reclaim disk space if low capacity. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-ENOSPC - No space left on device + */ +int nilfs_transaction_begin(struct super_block *sb, + struct nilfs_transaction_info *ti, + int vacancy_check) +{ + struct the_nilfs *nilfs; + int ret = nilfs_prepare_segment_lock(ti); + + if (unlikely(ret < 0)) + return ret; + if (ret > 0) + return 0; + + sb_start_intwrite(sb); + + nilfs = sb->s_fs_info; + down_read(&nilfs->ns_segctor_sem); + if (vacancy_check && nilfs_near_disk_full(nilfs)) { + up_read(&nilfs->ns_segctor_sem); + ret = -ENOSPC; + goto failed; + } + return 0; + + failed: + ti = current->journal_info; + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); + return ret; +} + +/** + * nilfs_transaction_commit - commit indivisible file operations. + * @sb: super block + * + * nilfs_transaction_commit() releases the read semaphore which is + * acquired by nilfs_transaction_begin(). This is only performed + * in outermost call of this function. If a commit flag is set, + * nilfs_transaction_commit() sets a timer to start the segment + * constructor. If a sync flag is set, it starts construction + * directly. + */ +int nilfs_transaction_commit(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + int err = 0; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + ti->ti_flags |= NILFS_TI_COMMIT; + if (ti->ti_count > 0) { + ti->ti_count--; + return 0; + } + if (nilfs->ns_writer) { + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (ti->ti_flags & NILFS_TI_COMMIT) + nilfs_segctor_start_timer(sci); + if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark) + nilfs_segctor_do_flush(sci, 0); + } + up_read(&nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + + if (ti->ti_flags & NILFS_TI_SYNC) + err = nilfs_construct_segment(sb); + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); + return err; +} + +void nilfs_transaction_abort(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + if (ti->ti_count > 0) { + ti->ti_count--; + return; + } + up_read(&nilfs->ns_segctor_sem); + + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); +} + +void nilfs_relax_pressure_in_lock(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (!sci || !sci->sc_flush_request) + return; + + set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); + up_read(&nilfs->ns_segctor_sem); + + down_write(&nilfs->ns_segctor_sem); + if (sci->sc_flush_request && + test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= NILFS_TI_WRITER; + nilfs_segctor_do_immediate_flush(sci); + ti->ti_flags &= ~NILFS_TI_WRITER; + } + downgrade_write(&nilfs->ns_segctor_sem); +} + +static void nilfs_transaction_lock(struct super_block *sb, + struct nilfs_transaction_info *ti, + int gcflag) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + WARN_ON(cur_ti); + ti->ti_flags = NILFS_TI_WRITER; + ti->ti_count = 0; + ti->ti_save = cur_ti; + ti->ti_magic = NILFS_TI_MAGIC; + current->journal_info = ti; + + for (;;) { + down_write(&nilfs->ns_segctor_sem); + if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) + break; + + nilfs_segctor_do_immediate_flush(sci); + + up_write(&nilfs->ns_segctor_sem); + yield(); + } + if (gcflag) + ti->ti_flags |= NILFS_TI_GC; +} + +static void nilfs_transaction_unlock(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + BUG_ON(ti->ti_count > 0); + + up_write(&nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; +} + +static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + unsigned bytes) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + unsigned blocksize = sci->sc_super->s_blocksize; + void *p; + + if (unlikely(ssp->offset + bytes > blocksize)) { + ssp->offset = 0; + BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, + &segbuf->sb_segsum_buffers)); + ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); + } + p = ssp->bh->b_data + ssp->offset; + ssp->offset += bytes; + return p; +} + +/** + * nilfs_segctor_reset_segment_buffer - reset the current segment buffer + * @sci: nilfs_sc_info + */ +static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + struct buffer_head *sumbh; + unsigned sumbytes; + unsigned flags = 0; + int err; + + if (nilfs_doing_gc()) + flags = NILFS_SS_GC; + err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno); + if (unlikely(err)) + return err; + + sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + sumbytes = segbuf->sb_sum.sumbytes; + sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; + sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; + return 0; +} + +static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) +{ + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) + return -E2BIG; /* The current segment is filled up + (internal code) */ + sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); + return nilfs_segctor_reset_segment_buffer(sci); +} + +static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + int err; + + if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + segbuf = sci->sc_curseg; + } + err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root); + if (likely(!err)) + segbuf->sb_sum.flags |= NILFS_SS_SR; + return err; +} + +/* + * Functions for making segment summary and payloads + */ +static int nilfs_segctor_segsum_block_required( + struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, + unsigned binfo_size) +{ + unsigned blocksize = sci->sc_super->s_blocksize; + /* Size of finfo and binfo is enough small against blocksize */ + + return ssp->offset + binfo_size + + (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) > + blocksize; +} + +static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + sci->sc_curseg->sb_sum.nfinfo++; + sci->sc_binfo_ptr = sci->sc_finfo_ptr; + nilfs_segctor_map_segsum_entry( + sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); + + if (NILFS_I(inode)->i_root && + !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + /* skip finfo */ +} + +static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + struct nilfs_finfo *finfo; + struct nilfs_inode_info *ii; + struct nilfs_segment_buffer *segbuf; + __u64 cno; + + if (sci->sc_blk_cnt == 0) + return; + + ii = NILFS_I(inode); + + if (test_bit(NILFS_I_GCINODE, &ii->i_state)) + cno = ii->i_cno; + else if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) + cno = 0; + else + cno = sci->sc_cno; + + finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, + sizeof(*finfo)); + finfo->fi_ino = cpu_to_le64(inode->i_ino); + finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); + finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); + finfo->fi_cno = cpu_to_le64(cno); + + segbuf = sci->sc_curseg; + segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); + sci->sc_finfo_ptr = sci->sc_binfo_ptr; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; +} + +static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode, + unsigned binfo_size) +{ + struct nilfs_segment_buffer *segbuf; + int required, err = 0; + + retry: + segbuf = sci->sc_curseg; + required = nilfs_segctor_segsum_block_required( + sci, &sci->sc_binfo_ptr, binfo_size); + if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { + nilfs_segctor_end_finfo(sci, inode); + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + goto retry; + } + if (unlikely(required)) { + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + goto failed; + } + if (sci->sc_blk_cnt == 0) + nilfs_segctor_begin_finfo(sci, inode); + + nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size); + /* Substitution to vblocknr is delayed until update_blocknr() */ + nilfs_segbuf_add_file_buffer(segbuf, bh); + sci->sc_blk_cnt++; + failed: + return err; +} + +/* + * Callback functions that enumerate, mark, and collect dirty blocks + */ +static int nilfs_collect_file_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (err < 0) + return err; + + err = nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_v)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_file_node(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); +} + +static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); +} + +static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*binfo_v)); + *binfo_v = binfo->bi_v; +} + +static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *vblocknr = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*vblocknr)); + *vblocknr = binfo->bi_v.bi_vblocknr; +} + +static struct nilfs_sc_operations nilfs_sc_file_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_file_bmap, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = nilfs_write_file_node_binfo, +}; + +static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (err < 0) + return err; + + err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_dat)); +} + +static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, + sizeof(*blkoff)); + *blkoff = binfo->bi_dat.bi_blkoff; +} + +static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_dat *binfo_dat = + nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); + *binfo_dat = binfo->bi_dat; +} + +static struct nilfs_sc_operations nilfs_sc_dat_ops = { + .collect_data = nilfs_collect_dat_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_dat_bmap, + .write_data_binfo = nilfs_write_dat_data_binfo, + .write_node_binfo = nilfs_write_dat_node_binfo, +}; + +static struct nilfs_sc_operations nilfs_sc_dsync_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = NULL, + .collect_bmap = NULL, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = NULL, +}; + +static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, + struct list_head *listp, + size_t nlimit, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0, last = ULONG_MAX; + size_t ndirties = 0; + int i; + + if (unlikely(start != 0 || end != LLONG_MAX)) { + /* + * A valid range is given for sync-ing data pages. The + * range is rounded to per-page; extra dirty buffers + * may be included if blocksize < pagesize. + */ + index = start >> PAGE_SHIFT; + last = end >> PAGE_SHIFT; + } + pagevec_init(&pvec, 0); + repeat: + if (unlikely(index > last) || + !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + min_t(pgoff_t, last - index, + PAGEVEC_SIZE - 1) + 1)) + return ndirties; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct buffer_head *bh, *head; + struct page *page = pvec.pages[i]; + + if (unlikely(page->index > last)) + break; + + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + unlock_page(page); + + bh = head = page_buffers(page); + do { + if (!buffer_dirty(bh) || buffer_async_write(bh)) + continue; + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, listp); + ndirties++; + if (unlikely(ndirties >= nlimit)) { + pagevec_release(&pvec); + cond_resched(); + return ndirties; + } + } while (bh = bh->b_this_page, bh != head); + } + pagevec_release(&pvec); + cond_resched(); + goto repeat; +} + +static void nilfs_lookup_dirty_node_buffers(struct inode *inode, + struct list_head *listp) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct address_space *mapping = &ii->i_btnode_cache; + struct pagevec pvec; + struct buffer_head *bh, *head; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh) && + !buffer_async_write(bh)) { + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, + listp); + } + bh = bh->b_this_page; + } while (bh != head); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +static void nilfs_dispose_list(struct the_nilfs *nilfs, + struct list_head *head, int force) +{ + struct nilfs_inode_info *ii, *n; + struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; + unsigned nv = 0; + + while (!list_empty(head)) { + spin_lock(&nilfs->ns_inode_lock); + list_for_each_entry_safe(ii, n, head, i_dirty) { + list_del_init(&ii->i_dirty); + if (force) { + if (unlikely(ii->i_bh)) { + brelse(ii->i_bh); + ii->i_bh = NULL; + } + } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { + set_bit(NILFS_I_QUEUED, &ii->i_state); + list_add_tail(&ii->i_dirty, + &nilfs->ns_dirty_files); + continue; + } + ivec[nv++] = ii; + if (nv == SC_N_INODEVEC) + break; + } + spin_unlock(&nilfs->ns_inode_lock); + + for (pii = ivec; nv > 0; pii++, nv--) + iput(&(*pii)->vfs_inode); + } +} + +static void nilfs_iput_work_func(struct work_struct *work) +{ + struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info, + sc_iput_work); + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0); +} + +static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, + struct nilfs_root *root) +{ + int ret = 0; + + if (nilfs_mdt_fetch_dirty(root->ifile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) + ret++; + if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat)) + ret++; + return ret; +} + +static int nilfs_segctor_clean(struct nilfs_sc_info *sci) +{ + return list_empty(&sci->sc_dirty_files) && + !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && + sci->sc_nfreesegs == 0 && + (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); +} + +static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int ret = 0; + + if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + spin_lock(&nilfs->ns_inode_lock); + if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci)) + ret++; + + spin_unlock(&nilfs->ns_inode_lock); + return ret; +} + +static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + + nilfs_mdt_clear_dirty(sci->sc_root->ifile); + nilfs_mdt_clear_dirty(nilfs->ns_cpfile); + nilfs_mdt_clear_dirty(nilfs->ns_sufile); + nilfs_mdt_clear_dirty(nilfs->ns_dat); +} + +static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + /* XXX: this interface will be changed */ + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, + &raw_cp, &bh_cp); + if (likely(!err)) { + /* The following code is duplicated with cpfile. But, it is + needed to collect the checkpoint even if it was not newly + created */ + mark_buffer_dirty(bh_cp); + nilfs_mdt_mark_dirty(nilfs->ns_cpfile); + nilfs_cpfile_put_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + } else + WARN_ON(err == -EINVAL || err == -ENOENT); + + return err; +} + +static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, + &raw_cp, &bh_cp); + if (unlikely(err)) { + WARN_ON(err == -EINVAL || err == -ENOENT); + goto failed_ibh; + } + raw_cp->cp_snapshot_list.ssl_next = 0; + raw_cp->cp_snapshot_list.ssl_prev = 0; + raw_cp->cp_inodes_count = + cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count)); + raw_cp->cp_blocks_count = + cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count)); + raw_cp->cp_nblk_inc = + cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); + raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); + raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); + + if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + nilfs_checkpoint_clear_minor(raw_cp); + else + nilfs_checkpoint_set_minor(raw_cp); + + nilfs_write_inode_common(sci->sc_root->ifile, + &raw_cp->cp_ifile_inode, 1); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + return 0; + + failed_ibh: + return err; +} + +static void nilfs_fill_in_file_bmap(struct inode *ifile, + struct nilfs_inode_info *ii) + +{ + struct buffer_head *ibh; + struct nilfs_inode *raw_inode; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) { + ibh = ii->i_bh; + BUG_ON(!ibh); + raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, + ibh); + nilfs_bmap_write(ii->i_bmap, raw_inode); + nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + } +} + +static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { + nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii); + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + unsigned isz, srsz; + + bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + isz = nilfs->ns_inode_size; + srsz = NILFS_SR_BYTES(isz); + + raw_sr->sr_bytes = cpu_to_le16(srsz); + raw_sr->sr_nongc_ctime + = cpu_to_le64(nilfs_doing_gc() ? + nilfs->ns_nongc_ctime : sci->sc_seg_ctime); + raw_sr->sr_flags = 0; + + nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr + + NILFS_SR_DAT_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + + NILFS_SR_CPFILE_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + + NILFS_SR_SUFILE_OFFSET(isz), 1); + memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); +} + +static void nilfs_redirty_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (test_bit(NILFS_I_COLLECTED, &ii->i_state)) + clear_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_drop_collected_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) + continue; + + clear_bit(NILFS_I_INODE_SYNC, &ii->i_state); + set_bit(NILFS_I_UPDATED, &ii->i_state); + } +} + +static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, + struct inode *inode, + struct list_head *listp, + int (*collect)(struct nilfs_sc_info *, + struct buffer_head *, + struct inode *)) +{ + struct buffer_head *bh, *n; + int err = 0; + + if (collect) { + list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + err = collect(sci, bh, inode); + brelse(bh); + if (unlikely(err)) + goto dispose_buffers; + } + return 0; + } + + dispose_buffers: + while (!list_empty(listp)) { + bh = list_first_entry(listp, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return err; +} + +static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) +{ + /* Remaining number of blocks within segment buffer */ + return sci->sc_segbuf_nblocks - + (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks); +} + +static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, + struct inode *inode, + struct nilfs_sc_operations *sc_ops) +{ + LIST_HEAD(data_buffers); + LIST_HEAD(node_buffers); + int err; + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + size_t n, rest = nilfs_segctor_buffer_rest(sci); + + n = nilfs_lookup_dirty_data_buffers( + inode, &data_buffers, rest + 1, 0, LLONG_MAX); + if (n > rest) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, + sc_ops->collect_data); + BUG_ON(!err); /* always receive -E2BIG or true error */ + goto break_or_fail; + } + } + nilfs_lookup_dirty_node_buffers(inode, &node_buffers); + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, sc_ops->collect_data); + if (unlikely(err)) { + /* dispose node list */ + nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, NULL); + goto break_or_fail; + } + sci->sc_stage.flags |= NILFS_CF_NODE; + } + /* Collect node */ + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_node); + if (unlikely(err)) + goto break_or_fail; + + nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers); + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_bmap); + if (unlikely(err)) + goto break_or_fail; + + nilfs_segctor_end_finfo(sci, inode); + sci->sc_stage.flags &= ~NILFS_CF_NODE; + + break_or_fail: + return err; +} + +static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, + struct inode *inode) +{ + LIST_HEAD(data_buffers); + size_t n, rest = nilfs_segctor_buffer_rest(sci); + int err; + + n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1, + sci->sc_dsync_start, + sci->sc_dsync_end); + + err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers, + nilfs_collect_file_data); + if (!err) { + nilfs_segctor_end_finfo(sci, inode); + BUG_ON(n > rest); + /* always receive -E2BIG or true error if n > rest */ + } + return err; +} + +static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct list_head *head; + struct nilfs_inode_info *ii; + size_t ndone; + int err = 0; + + switch (sci->sc_stage.scnt) { + case NILFS_ST_INIT: + /* Pre-processes */ + sci->sc_stage.flags = 0; + + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) { + sci->sc_nblk_inc = 0; + sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; + if (mode == SC_LSEG_DSYNC) { + sci->sc_stage.scnt = NILFS_ST_DSYNC; + goto dsync_mode; + } + } + + sci->sc_stage.dirty_file_ptr = NULL; + sci->sc_stage.gc_inode_ptr = NULL; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DAT; + goto dat_stage; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_GC: + if (nilfs_doing_gc()) { + head = &sci->sc_gc_inodes; + ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr, + head, i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + err = nilfs_segctor_scan_file( + sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.gc_inode_ptr = list_entry( + ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } + sci->sc_stage.gc_inode_ptr = NULL; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_FILE: + head = &sci->sc_dirty_files; + ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, + i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + clear_bit(NILFS_I_DIRTY, &ii->i_state); + + err = nilfs_segctor_scan_file(sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.dirty_file_ptr = + list_entry(ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */ + /* XXX: required ? */ + } + sci->sc_stage.dirty_file_ptr = NULL; + if (mode == SC_FLUSH_FILE) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; + sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; + /* Fall through */ + case NILFS_ST_IFILE: + err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; + /* Creating a checkpoint */ + err = nilfs_segctor_create_checkpoint(sci); + if (unlikely(err)) + break; + /* Fall through */ + case NILFS_ST_CPFILE: + err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SUFILE: + err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, + sci->sc_nfreesegs, &ndone); + if (unlikely(err)) { + nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, ndone, + NULL); + break; + } + sci->sc_stage.flags |= NILFS_CF_SUFREED; + + err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_DAT: + dat_stage: + err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, + &nilfs_sc_dat_ops); + if (unlikely(err)) + break; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SR: + if (mode == SC_LSEG_SR) { + /* Appending a super root */ + err = nilfs_segctor_add_super_root(sci); + if (unlikely(err)) + break; + } + /* End of a logical segment */ + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DSYNC: + dsync_mode: + sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT; + ii = sci->sc_dsync_inode; + if (!test_bit(NILFS_I_BUSY, &ii->i_state)) + break; + + err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode); + if (unlikely(err)) + break; + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DONE: + return 0; + default: + BUG(); + } + + break_or_fail: + return err; +} + +/** + * nilfs_segctor_begin_construction - setup segment buffer to make a new log + * @sci: nilfs_sc_info + * @nilfs: nilfs object + */ +static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *prev; + __u64 nextnum; + int err, alloc = 0; + + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + return -ENOMEM; + + if (list_empty(&sci->sc_write_logs)) { + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, + nilfs->ns_pseg_offset, nilfs); + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_shift_to_next_segment(nilfs); + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + } + + segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + nextnum = nilfs->ns_nextnum; + + if (nilfs->ns_segnum == nilfs->ns_nextnum) + /* Start from the head of a new full segment */ + alloc++; + } else { + /* Continue logs */ + prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs); + nilfs_segbuf_map_cont(segbuf, prev); + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq; + nextnum = prev->sb_nextnum; + + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + segbuf->sb_sum.seg_seq++; + alloc++; + } + } + + err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum); + if (err) + goto failed; + + if (alloc) { + err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); + if (err) + goto failed; + } + nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); + + BUG_ON(!list_empty(&sci->sc_segbufs)); + list_add_tail(&segbuf->sb_list, &sci->sc_segbufs); + sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; + return 0; + + failed: + nilfs_segbuf_free(segbuf); + return err; +} + +static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int nadd) +{ + struct nilfs_segment_buffer *segbuf, *prev; + struct inode *sufile = nilfs->ns_sufile; + __u64 nextnextnum; + LIST_HEAD(list); + int err, ret, i; + + prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + /* + * Since the segment specified with nextnum might be allocated during + * the previous construction, the buffer including its segusage may + * not be dirty. The following call ensures that the buffer is dirty + * and will pin the buffer on memory until the sufile is written. + */ + err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum); + if (unlikely(err)) + return err; + + for (i = 0; i < nadd; i++) { + /* extend segment info */ + err = -ENOMEM; + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + goto failed; + + /* map this buffer to region of segment on-disk */ + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks; + + /* allocate the next next full segment */ + err = nilfs_sufile_alloc(sufile, &nextnextnum); + if (unlikely(err)) + goto failed_segbuf; + + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1; + nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs); + + list_add_tail(&segbuf->sb_list, &list); + prev = segbuf; + } + list_splice_tail(&list, &sci->sc_segbufs); + return 0; + + failed_segbuf: + nilfs_segbuf_free(segbuf); + failed: + list_for_each_entry(segbuf, &list, sb_list) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + nilfs_destroy_logs(&list); + return err; +} + +static void nilfs_free_incomplete_logs(struct list_head *logs, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *prev; + struct inode *sufile = nilfs->ns_sufile; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(logs); + if (nilfs->ns_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (atomic_read(&segbuf->sb_err)) { + /* Case 1: The first segment failed */ + if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) + /* Case 1a: Partial segment appended into an existing + segment */ + nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, + segbuf->sb_fseg_end); + else /* Case 1b: New full segment */ + set_nilfs_discontinued(nilfs); + } + + prev = segbuf; + list_for_each_entry_continue(segbuf, logs, sb_list) { + if (prev->sb_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (atomic_read(&segbuf->sb_err) && + segbuf->sb_segnum != nilfs->ns_nextnum) + /* Case 2: extended segment (!= next) failed */ + nilfs_sufile_set_error(sufile, segbuf->sb_segnum); + prev = segbuf; + } +} + +static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + unsigned long live_blocks; + int ret; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + live_blocks = segbuf->sb_sum.nblocks + + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + live_blocks, + sci->sc_seg_ctime); + WARN_ON(ret); /* always succeed because the segusage is dirty */ + } +} + +static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(logs); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + segbuf->sb_pseg_start - + segbuf->sb_fseg_start, 0); + WARN_ON(ret); /* always succeed because the segusage is dirty */ + + list_for_each_entry_continue(segbuf, logs, sb_list) { + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + 0, 0); + WARN_ON(ret); /* always succeed */ + } +} + +static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *last, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf = last; + int ret; + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); + } + nilfs_truncate_logs(&sci->sc_segbufs, last); +} + + +static int nilfs_segctor_collect(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int mode) +{ + struct nilfs_cstage prev_stage = sci->sc_stage; + int err, nadd = 1; + + /* Collection retry loop */ + for (;;) { + sci->sc_nblk_this_inc = 0; + sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + err = nilfs_segctor_reset_segment_buffer(sci); + if (unlikely(err)) + goto failed; + + err = nilfs_segctor_collect_blocks(sci, mode); + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (!err) + break; + + if (unlikely(err != -E2BIG)) + goto failed; + + /* The current segment is filled up */ + if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + break; + + nilfs_clear_logs(&sci->sc_segbufs); + + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(err); /* do not happen */ + sci->sc_stage.flags &= ~NILFS_CF_SUFREED; + } + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); + sci->sc_stage = prev_stage; + } + nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); + return 0; + + failed: + return err; +} + +static void nilfs_list_replace_buffer(struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + BUG_ON(!list_empty(&new_bh->b_assoc_buffers)); + + list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers); + /* The caller must release old_bh */ +} + +static int +nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *segbuf, + int mode) +{ + struct inode *inode = NULL; + sector_t blocknr; + unsigned long nfinfo = segbuf->sb_sum.nfinfo; + unsigned long nblocks = 0, ndatablk = 0; + struct nilfs_sc_operations *sc_op = NULL; + struct nilfs_segsum_pointer ssp; + struct nilfs_finfo *finfo = NULL; + union nilfs_binfo binfo; + struct buffer_head *bh, *bh_org; + ino_t ino = 0; + int err = 0; + + if (!nfinfo) + goto out; + + blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk; + ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + ssp.offset = sizeof(struct nilfs_segment_summary); + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + if (bh == segbuf->sb_super_root) + break; + if (!finfo) { + finfo = nilfs_segctor_map_segsum_entry( + sci, &ssp, sizeof(*finfo)); + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + + inode = bh->b_page->mapping->host; + + if (mode == SC_LSEG_DSYNC) + sc_op = &nilfs_sc_dsync_ops; + else if (ino == NILFS_DAT_INO) + sc_op = &nilfs_sc_dat_ops; + else /* file blocks */ + sc_op = &nilfs_sc_file_ops; + } + bh_org = bh; + get_bh(bh_org); + err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr, + &binfo); + if (bh != bh_org) + nilfs_list_replace_buffer(bh_org, bh); + brelse(bh_org); + if (unlikely(err)) + goto failed_bmap; + + if (ndatablk > 0) + sc_op->write_data_binfo(sci, &ssp, &binfo); + else + sc_op->write_node_binfo(sci, &ssp, &binfo); + + blocknr++; + if (--nblocks == 0) { + finfo = NULL; + if (--nfinfo == 0) + break; + } else if (ndatablk > 0) + ndatablk--; + } + out: + return 0; + + failed_bmap: + return err; +} + +static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_segment_buffer *segbuf; + int err; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode); + if (unlikely(err)) + return err; + nilfs_segbuf_fill_in_segsum(segbuf); + } + return 0; +} + +static void nilfs_begin_page_io(struct page *page) +{ + if (!page || PageWriteback(page)) + /* For split b-tree node pages, this function may be called + twice. We ignore the 2nd or later calls by this check. */ + return; + + lock_page(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); +} + +static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + set_buffer_async_write(bh); + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_begin_page_io(fs_page); + fs_page = bh->b_page; + } + } + } + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + nilfs_begin_page_io(fs_page); +} + +static int nilfs_segctor_write(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + int ret; + + ret = nilfs_write_logs(&sci->sc_segbufs, nilfs); + list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); + return ret; +} + +static void nilfs_end_page_io(struct page *page, int err) +{ + if (!page) + return; + + if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { + /* + * For b-tree node pages, this function may be called twice + * or more because they might be split in a segment. + */ + if (PageDirty(page)) { + /* + * For pages holding split b-tree node buffers, dirty + * flag on the buffers may be cleared discretely. + * In that case, the page is once redirtied for + * remaining buffers, and it must be cancelled if + * all the buffers get cleaned later. + */ + lock_page(page); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + unlock_page(page); + } + return; + } + + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); + } + + end_page_writeback(page); +} + +static void nilfs_abort_logs(struct list_head *logs, int err) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct buffer_head *bh; + + if (list_empty(logs)) + return; + + list_for_each_entry(segbuf, logs, sb_list) { + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + clear_buffer_async_write(bh); + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, err); + fs_page = bh->b_page; + } + } + } + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, err); +} + +static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int err) +{ + LIST_HEAD(logs); + int ret; + + list_splice_tail_init(&sci->sc_write_logs, &logs); + ret = nilfs_wait_on_logs(&logs); + nilfs_abort_logs(&logs, ret ? : err); + + list_splice_tail_init(&sci->sc_segbufs, &logs); + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); + nilfs_free_incomplete_logs(&logs, nilfs); + + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(ret); /* do not happen */ + } + + nilfs_destroy_logs(&logs); +} + +static void nilfs_set_next_segment(struct the_nilfs *nilfs, + struct nilfs_segment_buffer *segbuf) +{ + nilfs->ns_segnum = segbuf->sb_segnum; + nilfs->ns_nextnum = segbuf->sb_nextnum; + nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start + + segbuf->sb_sum.nblocks; + nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq; + nilfs->ns_ctime = segbuf->sb_sum.ctime; +} + +static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int update_sr = false; + + list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + /* + * We assume that the buffers which belong to the same page + * continue over the buffer list. + * Under this assumption, the last BHs of pages is + * identifiable by the discontinuity of bh->b_page + * (page != fs_page). + * + * For B-tree node blocks, however, this assumption is not + * guaranteed. The cleanup code of B-tree node pages needs + * special care. + */ + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + const unsigned long set_bits = (1 << BH_Uptodate); + const unsigned long clear_bits = + (1 << BH_Dirty | 1 << BH_Async_Write | + 1 << BH_Delay | 1 << BH_NILFS_Volatile | + 1 << BH_NILFS_Redirected); + + set_mask_bits(&bh->b_state, clear_bits, set_bits); + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + update_sr = true; + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, 0); + fs_page = bh->b_page; + } + } + + if (!nilfs_segbuf_simplex(segbuf)) { + if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) { + set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + sci->sc_lseg_stime = jiffies; + } + if (segbuf->sb_sum.flags & NILFS_SS_LOGEND) + clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + } + } + /* + * Since pages may continue over multiple segment buffers, + * end of the last page must be checked outside of the loop. + */ + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, 0); + + nilfs_drop_collected_inodes(&sci->sc_dirty_files); + + if (nilfs_doing_gc()) + nilfs_drop_collected_inodes(&sci->sc_gc_inodes); + else + nilfs->ns_nongc_ctime = sci->sc_seg_ctime; + + sci->sc_nblk_inc += sci->sc_nblk_this_inc; + + segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs); + nilfs_set_next_segment(nilfs, segbuf); + + if (update_sr) { + nilfs->ns_flushed_device = 0; + nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, + segbuf->sb_sum.seg_seq, nilfs->ns_cno++); + + clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); + set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); + nilfs_segctor_clear_metadata_dirty(sci); + } else + clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); +} + +static int nilfs_segctor_wait(struct nilfs_sc_info *sci) +{ + int ret; + + ret = nilfs_wait_on_logs(&sci->sc_write_logs); + if (!ret) { + nilfs_segctor_complete_write(sci); + nilfs_destroy_logs(&sci->sc_write_logs); + } + return ret; +} + +static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_inode_info *ii, *n; + struct inode *ifile = sci->sc_root->ifile; + + spin_lock(&nilfs->ns_inode_lock); + retry: + list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) { + if (!ii->i_bh) { + struct buffer_head *ibh; + int err; + + spin_unlock(&nilfs->ns_inode_lock); + err = nilfs_ifile_get_inode_block( + ifile, ii->vfs_inode.i_ino, &ibh); + if (unlikely(err)) { + nilfs_warning(sci->sc_super, __func__, + "failed to get inode block.\n"); + return err; + } + mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(ifile); + spin_lock(&nilfs->ns_inode_lock); + if (likely(!ii->i_bh)) + ii->i_bh = ibh; + else + brelse(ibh); + goto retry; + } + + clear_bit(NILFS_I_QUEUED, &ii->i_state); + set_bit(NILFS_I_BUSY, &ii->i_state); + list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); + } + spin_unlock(&nilfs->ns_inode_lock); + + return 0; +} + +static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_inode_info *ii, *n; + int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); + int defer_iput = false; + + spin_lock(&nilfs->ns_inode_lock); + list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { + if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || + test_bit(NILFS_I_DIRTY, &ii->i_state)) + continue; + + clear_bit(NILFS_I_BUSY, &ii->i_state); + brelse(ii->i_bh); + ii->i_bh = NULL; + list_del_init(&ii->i_dirty); + if (!ii->vfs_inode.i_nlink || during_mount) { + /* + * Defer calling iput() to avoid deadlocks if + * i_nlink == 0 or mount is not yet finished. + */ + list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); + defer_iput = true; + } else { + spin_unlock(&nilfs->ns_inode_lock); + iput(&ii->vfs_inode); + spin_lock(&nilfs->ns_inode_lock); + } + } + spin_unlock(&nilfs->ns_inode_lock); + + if (defer_iput) + schedule_work(&sci->sc_iput_work); +} + +/* + * Main procedure of segment constructor + */ +static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int err; + + sci->sc_stage.scnt = NILFS_ST_INIT; + sci->sc_cno = nilfs->ns_cno; + + err = nilfs_segctor_collect_dirty_files(sci, nilfs); + if (unlikely(err)) + goto out; + + if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + if (nilfs_segctor_clean(sci)) + goto out; + + do { + sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; + + err = nilfs_segctor_begin_construction(sci, nilfs); + if (unlikely(err)) + goto out; + + /* Update time stamp */ + sci->sc_seg_ctime = get_seconds(); + + err = nilfs_segctor_collect(sci, nilfs, mode); + if (unlikely(err)) + goto failed; + + /* Avoid empty segment */ + if (sci->sc_stage.scnt == NILFS_ST_DONE && + nilfs_segbuf_empty(sci->sc_curseg)) { + nilfs_segctor_abort_construction(sci, nilfs, 1); + goto out; + } + + err = nilfs_segctor_assign(sci, mode); + if (unlikely(err)) + goto failed; + + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_segctor_fill_in_file_bmap(sci); + + if (mode == SC_LSEG_SR && + sci->sc_stage.scnt >= NILFS_ST_CPFILE) { + err = nilfs_segctor_fill_in_checkpoint(sci); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_fill_in_super_root(sci, nilfs); + } + nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); + + /* Write partial segments */ + nilfs_segctor_prepare_write(sci); + + nilfs_add_checksums_on_logs(&sci->sc_segbufs, + nilfs->ns_crc_seed); + + err = nilfs_segctor_write(sci, nilfs); + if (unlikely(err)) + goto failed_to_write; + + if (sci->sc_stage.scnt == NILFS_ST_DONE || + nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { + /* + * At this point, we avoid double buffering + * for blocksize < pagesize because page dirty + * flag is turned off during write and dirty + * buffers are not properly collected for + * pages crossing over segments. + */ + err = nilfs_segctor_wait(sci); + if (err) + goto failed_to_write; + } + } while (sci->sc_stage.scnt != NILFS_ST_DONE); + + out: + nilfs_segctor_drop_written_files(sci, nilfs); + return err; + + failed_to_write: + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_redirty_inodes(&sci->sc_dirty_files); + + failed: + if (nilfs_doing_gc()) + nilfs_redirty_inodes(&sci->sc_gc_inodes); + nilfs_segctor_abort_construction(sci, nilfs, err); + goto out; +} + +/** + * nilfs_segctor_start_timer - set timer of background write + * @sci: nilfs_sc_info + * + * If the timer has already been set, it ignores the new request. + * This function MUST be called within a section locking the segment + * semaphore. + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer.expires = jiffies + sci->sc_interval; + add_timer(&sci->sc_timer); + sci->sc_state |= NILFS_SEGCTOR_COMMIT; + } + spin_unlock(&sci->sc_state_lock); +} + +static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_flush_request & (1 << bn))) { + unsigned long prev_req = sci->sc_flush_request; + + sci->sc_flush_request |= (1 << bn); + if (!prev_req) + wake_up(&sci->sc_wait_daemon); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_flush_segment - trigger a segment construction for resource control + * @sb: super block + * @ino: inode number of the file to be flushed out. + */ +void nilfs_flush_segment(struct super_block *sb, ino_t ino) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (!sci || nilfs_doing_construction()) + return; + nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); + /* assign bit 0 to data files */ +} + +struct nilfs_segctor_wait_request { + wait_queue_t wq; + __u32 seq; + int err; + atomic_t done; +}; + +static int nilfs_segctor_sync(struct nilfs_sc_info *sci) +{ + struct nilfs_segctor_wait_request wait_req; + int err = 0; + + spin_lock(&sci->sc_state_lock); + init_wait(&wait_req.wq); + wait_req.err = 0; + atomic_set(&wait_req.done, 0); + wait_req.seq = ++sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + init_waitqueue_entry(&wait_req.wq, current); + add_wait_queue(&sci->sc_wait_request, &wait_req.wq); + set_current_state(TASK_INTERRUPTIBLE); + wake_up(&sci->sc_wait_daemon); + + for (;;) { + if (atomic_read(&wait_req.done)) { + err = wait_req.err; + break; + } + if (!signal_pending(current)) { + schedule(); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(&sci->sc_wait_request, &wait_req.wq); + return err; +} + +static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) +{ + struct nilfs_segctor_wait_request *wrq, *n; + unsigned long flags; + + spin_lock_irqsave(&sci->sc_wait_request.lock, flags); + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, + wq.task_list) { + if (!atomic_read(&wrq->done) && + nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { + wrq->err = err; + atomic_set(&wrq->done, 1); + } + if (atomic_read(&wrq->done)) { + wrq->wq.func(&wrq->wq, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); +} + +/** + * nilfs_construct_segment - construct a logical segment + * @sb: super block + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_segment(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_transaction_info *ti; + int err; + + if (!sci) + return -EROFS; + + /* A call inside transactions causes a deadlock. */ + BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); + + err = nilfs_segctor_sync(sci); + return err; +} + +/** + * nilfs_construct_dsync_segment - construct a data-only logical segment + * @sb: super block + * @inode: inode whose data blocks should be written out + * @start: start byte offset + * @end: end byte offset (inclusive) + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, + loff_t start, loff_t end) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_inode_info *ii; + struct nilfs_transaction_info ti; + int err = 0; + + if (!sci) + return -EROFS; + + nilfs_transaction_lock(sb, &ti, 0); + + ii = NILFS_I(inode); + if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) || + nilfs_test_opt(nilfs, STRICT_ORDER) || + test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + nilfs_discontinued(nilfs)) { + nilfs_transaction_unlock(sb); + err = nilfs_segctor_sync(sci); + return err; + } + + spin_lock(&nilfs->ns_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + spin_unlock(&nilfs->ns_inode_lock); + nilfs_transaction_unlock(sb); + return 0; + } + spin_unlock(&nilfs->ns_inode_lock); + sci->sc_dsync_inode = ii; + sci->sc_dsync_start = start; + sci->sc_dsync_end = end; + + err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); + if (!err) + nilfs->ns_flushed_device = 0; + + nilfs_transaction_unlock(sb); + return err; +} + +#define FLUSH_FILE_BIT (0x1) /* data file only */ +#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ + +/** + * nilfs_segctor_accept - record accepted sequence count of log-write requests + * @sci: segment constructor object + */ +static void nilfs_segctor_accept(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + sci->sc_seq_accepted = sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + del_timer_sync(&sci->sc_timer); +} + +/** + * nilfs_segctor_notify - notify the result of request to caller threads + * @sci: segment constructor object + * @mode: mode of log forming + * @err: error code to be notified + */ +static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) +{ + /* Clear requests (even when the construction failed) */ + spin_lock(&sci->sc_state_lock); + + if (mode == SC_LSEG_SR) { + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; + sci->sc_seq_done = sci->sc_seq_accepted; + nilfs_segctor_wakeup(sci, err); + sci->sc_flush_request = 0; + } else { + if (mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + + /* re-enable timer if checkpoint creation was not done */ + if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_before(jiffies, sci->sc_timer.expires)) + add_timer(&sci->sc_timer); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_segctor_construct - form logs and write them to disk + * @sci: segment constructor object + * @mode: mode of log forming + */ +static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct nilfs_super_block **sbp; + int err = 0; + + nilfs_segctor_accept(sci); + + if (nilfs_discontinued(nilfs)) + mode = SC_LSEG_SR; + if (!nilfs_segctor_confirm(sci)) + err = nilfs_segctor_do_construct(sci, mode); + + if (likely(!err)) { + if (mode != SC_FLUSH_DAT) + atomic_set(&nilfs->ns_ndirtyblks, 0); + if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && + nilfs_discontinued(nilfs)) { + down_write(&nilfs->ns_sem); + err = -EIO; + sbp = nilfs_prepare_super(sci->sc_super, + nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + err = nilfs_commit_super(sci->sc_super, + NILFS_SB_COMMIT); + } + up_write(&nilfs->ns_sem); + } + } + + nilfs_segctor_notify(sci, mode, err); + return err; +} + +static void nilfs_construction_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +static void +nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) +{ + struct nilfs_inode_info *ii, *n; + + list_for_each_entry_safe(ii, n, head, i_dirty) { + if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) + continue; + list_del_init(&ii->i_dirty); + truncate_inode_pages(&ii->vfs_inode.i_data, 0); + nilfs_btnode_cache_clear(&ii->i_btnode_cache); + iput(&ii->vfs_inode); + } +} + +int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, + void **kbufs) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_transaction_info ti; + int err; + + if (unlikely(!sci)) + return -EROFS; + + nilfs_transaction_lock(sb, &ti, 1); + + err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); + if (unlikely(err)) + goto out_unlock; + + err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); + if (unlikely(err)) { + nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat); + goto out_unlock; + } + + sci->sc_freesegs = kbufs[4]; + sci->sc_nfreesegs = argv[4].v_nmembs; + list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); + + for (;;) { + err = nilfs_segctor_construct(sci, SC_LSEG_SR); + nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); + + if (likely(!err)) + break; + + nilfs_warning(sb, __func__, + "segment construction failed. (err=%d)", err); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(sci->sc_interval); + } + if (nilfs_test_opt(nilfs, DISCARD)) { + int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, + sci->sc_nfreesegs); + if (ret) { + printk(KERN_WARNING + "NILFS warning: error %d on discard request, " + "turning discards off for the device\n", ret); + nilfs_clear_opt(nilfs, DISCARD); + } + } + + out_unlock: + sci->sc_freesegs = NULL; + sci->sc_nfreesegs = 0; + nilfs_mdt_clear_shadow_map(nilfs->ns_dat); + nilfs_transaction_unlock(sb); + return err; +} + +static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_transaction_info ti; + + nilfs_transaction_lock(sci->sc_super, &ti, 0); + nilfs_segctor_construct(sci, mode); + + /* + * Unclosed segment should be retried. We do this using sc_timer. + * Timeout of sc_timer will invoke complete construction which leads + * to close the current logical segment. + */ + if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) + nilfs_segctor_start_timer(sci); + + nilfs_transaction_unlock(sci->sc_super); +} + +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) +{ + int mode = 0; + int err; + + spin_lock(&sci->sc_state_lock); + mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? + SC_FLUSH_DAT : SC_FLUSH_FILE; + spin_unlock(&sci->sc_state_lock); + + if (mode) { + err = nilfs_segctor_do_construct(sci, mode); + + spin_lock(&sci->sc_state_lock); + sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? + ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; + spin_unlock(&sci->sc_state_lock); + } + clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); +} + +static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) +{ + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { + if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) + return SC_FLUSH_FILE; + else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) + return SC_FLUSH_DAT; + } + return SC_LSEG_SR; +} + +/** + * nilfs_segctor_thread - main loop of the segment constructor thread. + * @arg: pointer to a struct nilfs_sc_info. + * + * nilfs_segctor_thread() initializes a timer and serves as a daemon + * to execute segment constructions. + */ +static int nilfs_segctor_thread(void *arg) +{ + struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int timeout = 0; + + sci->sc_timer.data = (unsigned long)current; + sci->sc_timer.function = nilfs_construction_timeout; + + /* start sync. */ + sci->sc_task = current; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ + printk(KERN_INFO + "segctord starting. Construction interval = %lu seconds, " + "CP frequency < %lu seconds\n", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + + spin_lock(&sci->sc_state_lock); + loop: + for (;;) { + int mode; + + if (sci->sc_state & NILFS_SEGCTOR_QUIT) + goto end_thread; + + if (timeout || sci->sc_seq_request != sci->sc_seq_done) + mode = SC_LSEG_SR; + else if (!sci->sc_flush_request) + break; + else + mode = nilfs_segctor_flush_mode(sci); + + spin_unlock(&sci->sc_state_lock); + nilfs_segctor_thread_construct(sci, mode); + spin_lock(&sci->sc_state_lock); + timeout = 0; + } + + + if (freezing(current)) { + spin_unlock(&sci->sc_state_lock); + try_to_freeze(); + spin_lock(&sci->sc_state_lock); + } else { + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&sci->sc_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + + if (sci->sc_seq_request != sci->sc_seq_done) + should_sleep = 0; + else if (sci->sc_flush_request) + should_sleep = 0; + else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) + should_sleep = time_before(jiffies, + sci->sc_timer.expires); + + if (should_sleep) { + spin_unlock(&sci->sc_state_lock); + schedule(); + spin_lock(&sci->sc_state_lock); + } + finish_wait(&sci->sc_wait_daemon, &wait); + timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_after_eq(jiffies, sci->sc_timer.expires)); + + if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); + } + goto loop; + + end_thread: + spin_unlock(&sci->sc_state_lock); + + /* end sync. */ + sci->sc_task = NULL; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + return 0; +} + +static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) +{ + struct task_struct *t; + + t = kthread_run(nilfs_segctor_thread, sci, "segctord"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + + printk(KERN_ERR "NILFS: error %d creating segctord thread\n", + err); + return err; + } + wait_event(sci->sc_wait_task, sci->sc_task != NULL); + return 0; +} + +static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) + __acquires(&sci->sc_state_lock) + __releases(&sci->sc_state_lock) +{ + sci->sc_state |= NILFS_SEGCTOR_QUIT; + + while (sci->sc_task) { + wake_up(&sci->sc_wait_daemon); + spin_unlock(&sci->sc_state_lock); + wait_event(sci->sc_wait_task, sci->sc_task == NULL); + spin_lock(&sci->sc_state_lock); + } +} + +/* + * Setup & clean-up functions + */ +static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, + struct nilfs_root *root) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci; + + sci = kzalloc(sizeof(*sci), GFP_KERNEL); + if (!sci) + return NULL; + + sci->sc_super = sb; + + nilfs_get_root(root); + sci->sc_root = root; + + init_waitqueue_head(&sci->sc_wait_request); + init_waitqueue_head(&sci->sc_wait_daemon); + init_waitqueue_head(&sci->sc_wait_task); + spin_lock_init(&sci->sc_state_lock); + INIT_LIST_HEAD(&sci->sc_dirty_files); + INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_write_logs); + INIT_LIST_HEAD(&sci->sc_gc_inodes); + INIT_LIST_HEAD(&sci->sc_iput_queue); + INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); + init_timer(&sci->sc_timer); + + sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; + sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; + sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; + + if (nilfs->ns_interval) + sci->sc_interval = HZ * nilfs->ns_interval; + if (nilfs->ns_watermark) + sci->sc_watermark = nilfs->ns_watermark; + return sci; +} + +static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) +{ + int ret, retrycount = NILFS_SC_CLEANUP_RETRY; + + /* The segctord thread was stopped and its timer was removed. + But some tasks remain. */ + do { + struct nilfs_transaction_info ti; + + nilfs_transaction_lock(sci->sc_super, &ti, 0); + ret = nilfs_segctor_construct(sci, SC_LSEG_SR); + nilfs_transaction_unlock(sci->sc_super); + + flush_work(&sci->sc_iput_work); + + } while (ret && retrycount-- > 0); +} + +/** + * nilfs_segctor_destroy - destroy the segment constructor. + * @sci: nilfs_sc_info + * + * nilfs_segctor_destroy() kills the segctord thread and frees + * the nilfs_sc_info struct. + * Caller must hold the segment semaphore. + */ +static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int flag; + + up_write(&nilfs->ns_segctor_sem); + + spin_lock(&sci->sc_state_lock); + nilfs_segctor_kill_thread(sci); + flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request + || sci->sc_seq_request != sci->sc_seq_done); + spin_unlock(&sci->sc_state_lock); + + if (flush_work(&sci->sc_iput_work)) + flag = true; + + if (flag || !nilfs_segctor_confirm(sci)) + nilfs_segctor_write_out(sci); + + if (!list_empty(&sci->sc_dirty_files)) { + nilfs_warning(sci->sc_super, __func__, + "dirty file(s) after the final construction\n"); + nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); + } + + if (!list_empty(&sci->sc_iput_queue)) { + nilfs_warning(sci->sc_super, __func__, + "iput queue is not empty\n"); + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); + } + + WARN_ON(!list_empty(&sci->sc_segbufs)); + WARN_ON(!list_empty(&sci->sc_write_logs)); + + nilfs_put_root(sci->sc_root); + + down_write(&nilfs->ns_segctor_sem); + + del_timer_sync(&sci->sc_timer); + kfree(sci); +} + +/** + * nilfs_attach_log_writer - attach log writer + * @sb: super block instance + * @root: root object of the current filesystem tree + * + * This allocates a log writer object, initializes it, and starts the + * log writer. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + if (nilfs->ns_writer) { + /* + * This happens if the filesystem was remounted + * read/write after nilfs_error degenerated it into a + * read-only mount. + */ + nilfs_detach_log_writer(sb); + } + + nilfs->ns_writer = nilfs_segctor_new(sb, root); + if (!nilfs->ns_writer) + return -ENOMEM; + + err = nilfs_segctor_start_thread(nilfs->ns_writer); + if (err) { + kfree(nilfs->ns_writer); + nilfs->ns_writer = NULL; + } + return err; +} + +/** + * nilfs_detach_log_writer - destroy log writer + * @sb: super block instance + * + * This kills log writer daemon, frees the log writer object, and + * destroys list of dirty files. + */ +void nilfs_detach_log_writer(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + LIST_HEAD(garbage_list); + + down_write(&nilfs->ns_segctor_sem); + if (nilfs->ns_writer) { + nilfs_segctor_destroy(nilfs->ns_writer); + nilfs->ns_writer = NULL; + } + + /* Force to free the list of dirty files */ + spin_lock(&nilfs->ns_inode_lock); + if (!list_empty(&nilfs->ns_dirty_files)) { + list_splice_init(&nilfs->ns_dirty_files, &garbage_list); + nilfs_warning(sb, __func__, + "Hit dirty file after stopped log writer\n"); + } + spin_unlock(&nilfs->ns_inode_lock); + up_write(&nilfs->ns_segctor_sem); + + nilfs_dispose_list(nilfs, &garbage_list, 1); +} diff --git a/kernel/fs/nilfs2/segment.h b/kernel/fs/nilfs2/segment.h new file mode 100644 index 000000000..a48d6de1e --- /dev/null +++ b/kernel/fs/nilfs2/segment.h @@ -0,0 +1,251 @@ +/* + * segment.h - NILFS Segment constructor prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ +#ifndef _NILFS_SEGMENT_H +#define _NILFS_SEGMENT_H + +#include +#include +#include +#include +#include +#include "nilfs.h" + +struct nilfs_root; + +/** + * struct nilfs_recovery_info - Recovery information + * @ri_need_recovery: Recovery status + * @ri_super_root: Block number of the last super root + * @ri_ri_cno: Number of the last checkpoint + * @ri_lsegs_start: Region for roll-forwarding (start block number) + * @ri_lsegs_end: Region for roll-forwarding (end block number) + * @ri_lseg_start_seq: Sequence value of the segment at ri_lsegs_start + * @ri_used_segments: List of segments to be mark active + * @ri_pseg_start: Block number of the last partial segment + * @ri_seq: Sequence number on the last partial segment + * @ri_segnum: Segment number on the last partial segment + * @ri_nextnum: Next segment number on the last partial segment + */ +struct nilfs_recovery_info { + int ri_need_recovery; + sector_t ri_super_root; + __u64 ri_cno; + + sector_t ri_lsegs_start; + sector_t ri_lsegs_end; + u64 ri_lsegs_start_seq; + struct list_head ri_used_segments; + sector_t ri_pseg_start; + u64 ri_seq; + __u64 ri_segnum; + __u64 ri_nextnum; +}; + +/* ri_need_recovery */ +#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */ +#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */ + +/** + * struct nilfs_cstage - Context of collection stage + * @scnt: Stage count + * @flags: State flags + * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file + * @gc_inode_ptr: Pointer on the list of gc-inodes + */ +struct nilfs_cstage { + int scnt; + unsigned flags; + struct nilfs_inode_info *dirty_file_ptr; + struct nilfs_inode_info *gc_inode_ptr; +}; + +struct nilfs_segment_buffer; + +struct nilfs_segsum_pointer { + struct buffer_head *bh; + unsigned offset; /* offset in bytes */ +}; + +/** + * struct nilfs_sc_info - Segment constructor information + * @sc_super: Back pointer to super_block struct + * @sc_root: root object of the current filesystem tree + * @sc_nblk_inc: Block count of current generation + * @sc_dirty_files: List of files to be written + * @sc_gc_inodes: List of GC inodes having blocks to be written + * @sc_iput_queue: list of inodes for which iput should be done + * @sc_iput_work: work struct to defer iput call + * @sc_freesegs: array of segment numbers to be freed + * @sc_nfreesegs: number of segments on @sc_freesegs + * @sc_dsync_inode: inode whose data pages are written for a sync operation + * @sc_dsync_start: start byte offset of data pages + * @sc_dsync_end: end byte offset of data pages (inclusive) + * @sc_segbufs: List of segment buffers + * @sc_write_logs: List of segment buffers to hold logs under writing + * @sc_segbuf_nblocks: Number of available blocks in segment buffers. + * @sc_curseg: Current segment buffer + * @sc_stage: Collection stage + * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary + * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary + * @sc_blk_cnt: Block count of a file + * @sc_datablk_cnt: Data block count of a file + * @sc_nblk_this_inc: Number of blocks included in the current logical segment + * @sc_seg_ctime: Creation time + * @sc_cno: checkpoint number of current log + * @sc_flags: Internal flags + * @sc_state_lock: spinlock for sc_state and so on + * @sc_state: Segctord state flags + * @sc_flush_request: inode bitmap of metadata files to be flushed + * @sc_wait_request: Client request queue + * @sc_wait_daemon: Daemon wait queue + * @sc_wait_task: Start/end wait queue to control segctord task + * @sc_seq_request: Request counter + * @sc_seq_accept: Accepted request count + * @sc_seq_done: Completion counter + * @sc_sync: Request of explicit sync operation + * @sc_interval: Timeout value of background construction + * @sc_mjcp_freq: Frequency of creating checkpoints + * @sc_lseg_stime: Start time of the latest logical segment + * @sc_watermark: Watermark for the number of dirty buffers + * @sc_timer: Timer for segctord + * @sc_task: current thread of segctord + */ +struct nilfs_sc_info { + struct super_block *sc_super; + struct nilfs_root *sc_root; + + unsigned long sc_nblk_inc; + + struct list_head sc_dirty_files; + struct list_head sc_gc_inodes; + struct list_head sc_iput_queue; + struct work_struct sc_iput_work; + + __u64 *sc_freesegs; + size_t sc_nfreesegs; + + struct nilfs_inode_info *sc_dsync_inode; + loff_t sc_dsync_start; + loff_t sc_dsync_end; + + /* Segment buffers */ + struct list_head sc_segbufs; + struct list_head sc_write_logs; + unsigned long sc_segbuf_nblocks; + struct nilfs_segment_buffer *sc_curseg; + + struct nilfs_cstage sc_stage; + + struct nilfs_segsum_pointer sc_finfo_ptr; + struct nilfs_segsum_pointer sc_binfo_ptr; + unsigned long sc_blk_cnt; + unsigned long sc_datablk_cnt; + unsigned long sc_nblk_this_inc; + time_t sc_seg_ctime; + __u64 sc_cno; + unsigned long sc_flags; + + spinlock_t sc_state_lock; + unsigned long sc_state; + unsigned long sc_flush_request; + + wait_queue_head_t sc_wait_request; + wait_queue_head_t sc_wait_daemon; + wait_queue_head_t sc_wait_task; + + __u32 sc_seq_request; + __u32 sc_seq_accepted; + __u32 sc_seq_done; + + int sc_sync; + unsigned long sc_interval; + unsigned long sc_mjcp_freq; + unsigned long sc_lseg_stime; /* in 1/HZ seconds */ + unsigned long sc_watermark; + + struct timer_list sc_timer; + struct task_struct *sc_task; +}; + +/* sc_flags */ +enum { + NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ + NILFS_SC_UNCLOSED, /* Logical segment is not closed */ + NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ + NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a + checkpoint */ + NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files + other than DAT, cpfile, sufile, or files + moved by GC */ +}; + +/* sc_state */ +#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */ +#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */ + +/* + * Constant parameters + */ +#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when + destroying segctord */ + +/* + * Default values of timeout, in seconds. + */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. + It triggers construction of a + logical segment with a super root */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root + creation */ + +/* + * The default threshold amount of data, in block counts. + */ +#define NILFS_SC_DEFAULT_WATERMARK 3600 + +/* super.c */ +extern struct kmem_cache *nilfs_transaction_cachep; + +/* segment.c */ +extern void nilfs_relax_pressure_in_lock(struct super_block *); + +extern int nilfs_construct_segment(struct super_block *); +extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, + loff_t, loff_t); +extern void nilfs_flush_segment(struct super_block *, ino_t); +extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, + void **); + +int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root); +void nilfs_detach_log_writer(struct super_block *sb); + +/* recovery.c */ +extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, + struct buffer_head **, int); +extern int nilfs_search_super_root(struct the_nilfs *, + struct nilfs_recovery_info *); +int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, + struct nilfs_recovery_info *ri); +extern void nilfs_dispose_segment_list(struct list_head *); + +#endif /* _NILFS_SEGMENT_H */ diff --git a/kernel/fs/nilfs2/sufile.c b/kernel/fs/nilfs2/sufile.c new file mode 100644 index 000000000..2a869c35c --- /dev/null +++ b/kernel/fs/nilfs2/sufile.c @@ -0,0 +1,1222 @@ +/* + * sufile.c - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + * Revised by Ryusuke Konishi . + */ + +#include +#include +#include +#include +#include +#include +#include "mdt.h" +#include "sufile.h" + +/** + * struct nilfs_sufile_info - on-memory private data of sufile + * @mi: on-memory private data of metadata file + * @ncleansegs: number of clean segments + * @allocmin: lower limit of allocatable segment range + * @allocmax: upper limit of allocatable segment range + */ +struct nilfs_sufile_info { + struct nilfs_mdt_info mi; + unsigned long ncleansegs;/* number of clean segments */ + __u64 allocmin; /* lower limit of allocatable segment range */ + __u64 allocmax; /* upper limit of allocatable segment range */ +}; + +static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) +{ + return (struct nilfs_sufile_info *)NILFS_MDT(sufile); +} + +static inline unsigned long +nilfs_sufile_segment_usages_per_block(const struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_entries_per_block; +} + +static unsigned long +nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); + return (unsigned long)t; +} + +static unsigned long +nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); +} + +static unsigned long +nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, + __u64 max) +{ + return min_t(unsigned long, + nilfs_sufile_segment_usages_per_block(sufile) - + nilfs_sufile_get_offset(sufile, curr), + max - curr + 1); +} + +static struct nilfs_segment_usage * +nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, + struct buffer_head *bh, void *kaddr) +{ + return kaddr + bh_offset(bh) + + nilfs_sufile_get_offset(sufile, segnum) * + NILFS_MDT(sufile)->mi_entry_size; +} + +static inline int nilfs_sufile_get_header_block(struct inode *sufile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); +} + +static inline int +nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum), + create, NULL, bhp); +} + +static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile, + __u64 segnum) +{ + return nilfs_mdt_delete_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum)); +} + +static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, + u64 ncleanadd, u64 ndirtyadd) +{ + struct nilfs_sufile_header *header; + void *kaddr; + + kaddr = kmap_atomic(header_bh->b_page); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, ncleanadd); + le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); + kunmap_atomic(kaddr); + + mark_buffer_dirty(header_bh); +} + +/** + * nilfs_sufile_get_ncleansegs - return the number of clean segments + * @sufile: inode of segment usage file + */ +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) +{ + return NILFS_SUI(sufile)->ncleansegs; +} + +/** + * nilfs_sufile_updatev - modify multiple segment usages at a time + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @create: creation flag + * @ndone: place to store number of modified segments on @segnumv + * @dofunc: primitive operation for the update + * + * Description: nilfs_sufile_updatev() repeatedly calls @dofunc + * against the given array of segments. The @dofunc is called with + * buffers of a header block and the sufile block in which the target + * segment usage entry is contained. If @ndone is given, the number + * of successfully modified segments from the head is stored in the + * place @ndone points to. + * + * Return Value: On success, zero is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Given segment usage is in hole block (may be returned if + * @create is zero) + * + * %-EINVAL - Invalid segment usage number + */ +int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, + int create, size_t *ndone, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + unsigned long blkoff, prev_blkoff; + __u64 *seg; + size_t nerr = 0, n = 0; + int ret = 0; + + if (unlikely(nsegs == 0)) + goto out; + + down_write(&NILFS_MDT(sufile)->mi_sem); + for (seg = segnumv; seg < segnumv + nsegs; seg++) { + if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING + "%s: invalid segment number: %llu\n", __func__, + (unsigned long long)*seg); + nerr++; + } + } + if (nerr > 0) { + ret = -EINVAL; + goto out_sem; + } + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + seg = segnumv; + blkoff = nilfs_sufile_get_blkoff(sufile, *seg); + ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); + if (ret < 0) + goto out_header; + + for (;;) { + dofunc(sufile, *seg, header_bh, bh); + + if (++seg >= segnumv + nsegs) + break; + prev_blkoff = blkoff; + blkoff = nilfs_sufile_get_blkoff(sufile, *seg); + if (blkoff == prev_blkoff) + continue; + + /* get different block */ + brelse(bh); + ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); + if (unlikely(ret < 0)) + goto out_header; + } + brelse(bh); + + out_header: + n = seg - segnumv; + brelse(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + out: + if (ndone) + *ndone = n; + return ret; +} + +int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + int ret; + + if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING "%s: invalid segment number: %llu\n", + __func__, (unsigned long long)segnum); + return -EINVAL; + } + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); + if (!ret) { + dofunc(sufile, segnum, header_bh, bh); + brelse(bh); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_set_alloc_range - limit range of segment to be allocated + * @sufile: inode of segment usage file + * @start: minimum segment number of allocatable region (inclusive) + * @end: maximum segment number of allocatable region (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-ERANGE - invalid segment region + */ +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) +{ + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + __u64 nsegs; + int ret = -ERANGE; + + down_write(&NILFS_MDT(sufile)->mi_sem); + nsegs = nilfs_sufile_get_nsegments(sufile); + + if (start <= end && end < nsegs) { + sui->allocmin = start; + sui->allocmax = end; + ret = 0; + } + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_alloc - allocate a segment + * @sufile: inode of segment usage file + * @segnump: pointer to segment number + * + * Description: nilfs_sufile_alloc() allocates a clean segment. + * + * Return Value: On success, 0 is returned and the segment number of the + * allocated segment is stored in the place pointed by @segnump. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No clean segment left. + */ +int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) +{ + struct buffer_head *header_bh, *su_bh; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + __u64 segnum, maxsegnum, last_alloc; + void *kaddr; + unsigned long nsegments, ncleansegs, nsus, cnt; + int ret, j; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(header_bh->b_page); + header = kaddr + bh_offset(header_bh); + ncleansegs = le64_to_cpu(header->sh_ncleansegs); + last_alloc = le64_to_cpu(header->sh_last_alloc); + kunmap_atomic(kaddr); + + nsegments = nilfs_sufile_get_nsegments(sufile); + maxsegnum = sui->allocmax; + segnum = last_alloc + 1; + if (segnum < sui->allocmin || segnum > sui->allocmax) + segnum = sui->allocmin; + + for (cnt = 0; cnt < nsegments; cnt += nsus) { + if (segnum > maxsegnum) { + if (cnt < sui->allocmax - sui->allocmin + 1) { + /* + * wrap around in the limited region. + * if allocation started from + * sui->allocmin, this never happens. + */ + segnum = sui->allocmin; + maxsegnum = last_alloc; + } else if (segnum > sui->allocmin && + sui->allocmax + 1 < nsegments) { + segnum = sui->allocmax + 1; + maxsegnum = nsegments - 1; + } else if (sui->allocmin > 0) { + segnum = 0; + maxsegnum = sui->allocmin - 1; + } else { + break; /* never happens */ + } + } + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, + &su_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + + nsus = nilfs_sufile_segment_usages_in_block( + sufile, segnum, maxsegnum); + for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { + if (!nilfs_segment_usage_clean(su)) + continue; + /* found a clean segment */ + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr); + + kaddr = kmap_atomic(header_bh->b_page); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + header->sh_last_alloc = cpu_to_le64(segnum); + kunmap_atomic(kaddr); + + sui->ncleansegs--; + mark_buffer_dirty(header_bh); + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + *segnump = segnum; + goto out_header; + } + + kunmap_atomic(kaddr); + brelse(su_bh); + } + + /* no segments left */ + ret = -ENOSPC; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (unlikely(!nilfs_segment_usage_clean(su))) { + printk(KERN_WARNING "%s: segment %llu must be clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr); + return; + } + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr); + + nilfs_sufile_mod_counter(header_bh, -1, 1); + NILFS_SUI(sufile)->ncleansegs--; + + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int clean, dirty; + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) && + su->su_nblocks == cpu_to_le32(0)) { + kunmap_atomic(kaddr); + return; + } + clean = nilfs_segment_usage_clean(su); + dirty = nilfs_segment_usage_dirty(su); + + /* make the segment garbage */ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); + kunmap_atomic(kaddr); + + nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); + NILFS_SUI(sufile)->ncleansegs -= clean; + + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int sudirty; + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_clean(su)) { + printk(KERN_WARNING "%s: segment %llu is already clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr); + return; + } + WARN_ON(nilfs_segment_usage_error(su)); + WARN_ON(!nilfs_segment_usage_dirty(su)); + + sudirty = nilfs_segment_usage_dirty(su); + nilfs_segment_usage_set_clean(su); + kunmap_atomic(kaddr); + mark_buffer_dirty(su_bh); + + nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); + NILFS_SUI(sufile)->ncleansegs++; + + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty + * @sufile: inode of segment usage file + * @segnum: segment number + */ +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *bh; + int ret; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (!ret) { + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); + brelse(bh); + } + return ret; +} + +/** + * nilfs_sufile_set_segment_usage - set usage of a segment + * @sufile: inode of segment usage file + * @segnum: segment number + * @nblocks: number of live blocks in the segment + * @modtime: modification time (option) + */ +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime) +{ + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + WARN_ON(nilfs_segment_usage_error(su)); + if (modtime) + su->su_lastmod = cpu_to_le64(modtime); + su->su_nblocks = cpu_to_le32(nblocks); + kunmap_atomic(kaddr); + + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); + brelse(bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_stat - get segment usage statistics + * @sufile: inode of segment usage file + * @stat: pointer to a structure of segment usage statistics + * + * Description: nilfs_sufile_get_stat() returns information about segment + * usage. + * + * Return Value: On success, 0 is returned, and segment usage information is + * stored in the place pointed by @stat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) +{ + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(header_bh->b_page); + header = kaddr + bh_offset(header_bh); + sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); + sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); + sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); + sustat->ss_ctime = nilfs->ns_ctime; + sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; + spin_lock(&nilfs->ns_last_segment_lock); + sustat->ss_prot_seq = nilfs->ns_prot_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + kunmap_atomic(kaddr); + brelse(header_bh); + + out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int suclean; + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap_atomic(kaddr); + return; + } + suclean = nilfs_segment_usage_clean(su); + nilfs_segment_usage_set_error(su); + kunmap_atomic(kaddr); + + if (suclean) { + nilfs_sufile_mod_counter(header_bh, -1, 0); + NILFS_SUI(sufile)->ncleansegs--; + } + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ +static int nilfs_sufile_truncate_range(struct inode *sufile, + __u64 start, __u64 end) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su, *su2; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + unsigned long segusages_per_block; + unsigned long nsegs, ncleaned; + __u64 segnum; + void *kaddr; + ssize_t n, nc; + int ret; + int j; + + nsegs = nilfs_sufile_get_nsegments(sufile); + + ret = -EINVAL; + if (start > end || start >= nsegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + ncleaned = 0; + + for (segnum = start; segnum <= end; segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + end - segnum + 1); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_header; + /* hole */ + continue; + } + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + su2 = su; + for (j = 0; j < n; j++, su = (void *)su + susz) { + if ((le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) || + nilfs_segment_is_active(nilfs, segnum + j)) { + ret = -EBUSY; + kunmap_atomic(kaddr); + brelse(su_bh); + goto out_header; + } + } + nc = 0; + for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) { + if (nilfs_segment_usage_error(su)) { + nilfs_segment_usage_set_clean(su); + nc++; + } + } + kunmap_atomic(kaddr); + if (nc > 0) { + mark_buffer_dirty(su_bh); + ncleaned += nc; + } + brelse(su_bh); + + if (n == segusages_per_block) { + /* make hole */ + nilfs_sufile_delete_segment_usage_block(sufile, segnum); + } + } + ret = 0; + +out_header: + if (ncleaned > 0) { + NILFS_SUI(sufile)->ncleansegs += ncleaned; + nilfs_sufile_mod_counter(header_bh, ncleaned, 0); + nilfs_mdt_mark_dirty(sufile); + } + brelse(header_bh); +out: + return ret; +} + +/** + * nilfs_sufile_resize - resize segment array + * @sufile: inode of segment usage file + * @newnsegs: new number of segments + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - Enough free space is not left for shrinking + * + * %-EBUSY - Dirty or active segments exist in the region to be truncated + */ +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + void *kaddr; + unsigned long nsegs, nrsvsegs; + int ret = 0; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nsegs = nilfs_sufile_get_nsegments(sufile); + if (nsegs == newnsegs) + goto out; + + ret = -ENOSPC; + nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs); + if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + if (newnsegs > nsegs) { + sui->ncleansegs += newnsegs - nsegs; + } else /* newnsegs < nsegs */ { + ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1); + if (ret < 0) + goto out_header; + + sui->ncleansegs -= nsegs - newnsegs; + } + + kaddr = kmap_atomic(header_bh->b_page); + header = kaddr + bh_offset(header_bh); + header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); + kunmap_atomic(kaddr); + + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(sufile); + nilfs_set_nsegments(nilfs, newnsegs); + +out_header: + brelse(header_bh); +out: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_suinfo - + * @sufile: inode of segment usage file + * @segnum: segment number to start looking + * @buf: array of suinfo + * @sisz: byte size of suinfo + * @nsi: size of suinfo array + * + * Description: + * + * Return Value: On success, 0 is returned and .... On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, + unsigned sisz, size_t nsi) +{ + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + struct nilfs_suinfo *si = buf; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + void *kaddr; + unsigned long nsegs, segusages_per_block; + ssize_t n; + int ret, i, j; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + nsegs = min_t(unsigned long, + nilfs_sufile_get_nsegments(sufile) - segnum, + nsi); + for (i = 0; i < nsegs; i += n, segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + nsegs - i); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + /* hole */ + memset(si, 0, sisz * n); + si = (void *)si + sisz * n; + continue; + } + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + for (j = 0; j < n; + j++, su = (void *)su + susz, si = (void *)si + sisz) { + si->sui_lastmod = le64_to_cpu(su->su_lastmod); + si->sui_nblocks = le32_to_cpu(su->su_nblocks); + si->sui_flags = le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + if (nilfs_segment_is_active(nilfs, segnum + j)) + si->sui_flags |= + (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + } + kunmap_atomic(kaddr); + brelse(su_bh); + } + ret = nsegs; + + out: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_set_suinfo - sets segment usage info + * @sufile: inode of segment usage file + * @buf: array of suinfo_update + * @supsz: byte size of suinfo_update + * @nsup: size of suinfo_update array + * + * Description: Takes an array of nilfs_suinfo_update structs and updates + * segment usage accordingly. Only the fields indicated by the sup_flags + * are updated. + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + */ +ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, + unsigned supsz, size_t nsup) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh, *bh; + struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup; + struct nilfs_segment_usage *su; + void *kaddr; + unsigned long blkoff, prev_blkoff; + int cleansi, cleansu, dirtysi, dirtysu; + long ncleaned = 0, ndirtied = 0; + int ret = 0; + + if (unlikely(nsup == 0)) + return ret; + + for (sup = buf; sup < supend; sup = (void *)sup + supsz) { + if (sup->sup_segnum >= nilfs->ns_nsegments + || (sup->sup_flags & + (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS)) + || (nilfs_suinfo_update_nblocks(sup) && + sup->sup_sui.sui_nblocks > + nilfs->ns_blocks_per_segment)) + return -EINVAL; + } + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + sup = buf; + blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); + ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); + if (ret < 0) + goto out_header; + + for (;;) { + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, sup->sup_segnum, bh, kaddr); + + if (nilfs_suinfo_update_lastmod(sup)) + su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod); + + if (nilfs_suinfo_update_nblocks(sup)) + su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks); + + if (nilfs_suinfo_update_flags(sup)) { + /* + * Active flag is a virtual flag projected by running + * nilfs kernel code - drop it not to write it to + * disk. + */ + sup->sup_sui.sui_flags &= + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + + cleansi = nilfs_suinfo_clean(&sup->sup_sui); + cleansu = nilfs_segment_usage_clean(su); + dirtysi = nilfs_suinfo_dirty(&sup->sup_sui); + dirtysu = nilfs_segment_usage_dirty(su); + + if (cleansi && !cleansu) + ++ncleaned; + else if (!cleansi && cleansu) + --ncleaned; + + if (dirtysi && !dirtysu) + ++ndirtied; + else if (!dirtysi && dirtysu) + --ndirtied; + + su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); + } + + kunmap_atomic(kaddr); + + sup = (void *)sup + supsz; + if (sup >= supend) + break; + + prev_blkoff = blkoff; + blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); + if (blkoff == prev_blkoff) + continue; + + /* get different block */ + mark_buffer_dirty(bh); + put_bh(bh); + ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); + if (unlikely(ret < 0)) + goto out_mark; + } + mark_buffer_dirty(bh); + put_bh(bh); + + out_mark: + if (ncleaned || ndirtied) { + nilfs_sufile_mod_counter(header_bh, (u64)ncleaned, + (u64)ndirtied); + NILFS_SUI(sufile)->ncleansegs += ncleaned; + } + nilfs_mdt_mark_dirty(sufile); + out_header: + put_bh(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_trim_fs() - trim ioctl handle function + * @sufile: inode of segment usage file + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * + * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes + * from start to start+len. start is rounded up to the next block boundary + * and start+len is rounded down. For each clean segment blkdev_issue_discard + * function is invoked. + * + * Return Value: On success, 0 is returned or negative error code, otherwise. + */ +int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + void *kaddr; + size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; + sector_t seg_start, seg_end, start_block, end_block; + sector_t start = 0, nblocks = 0; + u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0; + int ret = 0; + unsigned int sects_per_block; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + len = range->len >> nilfs->ns_blocksize_bits; + minlen = range->minlen >> nilfs->ns_blocksize_bits; + max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment); + + if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits) + return -EINVAL; + + start_block = (range->start + nilfs->ns_blocksize - 1) >> + nilfs->ns_blocksize_bits; + + /* + * range->len can be very large (actually, it is set to + * ULLONG_MAX by default) - truncate upper end of the range + * carefully so as not to overflow. + */ + if (max_blocks - start_block < len) + end_block = max_blocks - 1; + else + end_block = start_block + len - 1; + + segnum = nilfs_get_segnum_of_block(nilfs, start_block); + segnum_end = nilfs_get_segnum_of_block(nilfs, end_block); + + down_read(&NILFS_MDT(sufile)->mi_sem); + + while (segnum <= segnum_end) { + n = nilfs_sufile_segment_usages_in_block(sufile, segnum, + segnum_end); + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_sem; + /* hole */ + segnum += n; + continue; + } + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, + su_bh, kaddr); + for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { + if (!nilfs_segment_usage_clean(su)) + continue; + + nilfs_get_segment_range(nilfs, segnum, &seg_start, + &seg_end); + + if (!nblocks) { + /* start new extent */ + start = seg_start; + nblocks = seg_end - seg_start + 1; + continue; + } + + if (start + nblocks == seg_start) { + /* add to previous extent */ + nblocks += seg_end - seg_start + 1; + continue; + } + + /* discard previous extent */ + if (start < start_block) { + nblocks -= start_block - start; + start = start_block; + } + + if (nblocks >= minlen) { + kunmap_atomic(kaddr); + + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (ret < 0) { + put_bh(su_bh); + goto out_sem; + } + + ndiscarded += nblocks; + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + } + + /* start new extent */ + start = seg_start; + nblocks = seg_end - seg_start + 1; + } + kunmap_atomic(kaddr); + put_bh(su_bh); + } + + + if (nblocks) { + /* discard last extent */ + if (start < start_block) { + nblocks -= start_block - start; + start = start_block; + } + if (start + nblocks > end_block + 1) + nblocks = end_block - start + 1; + + if (nblocks >= minlen) { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (!ret) + ndiscarded += nblocks; + } + } + +out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + + range->len = ndiscarded << nilfs->ns_blocksize_bits; + return ret; +} + +/** + * nilfs_sufile_read - read or get sufile inode + * @sb: super block instance + * @susize: size of a segment usage entry + * @raw_inode: on-disk sufile inode + * @inodep: buffer to store the inode + */ +int nilfs_sufile_read(struct super_block *sb, size_t susize, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + struct inode *sufile; + struct nilfs_sufile_info *sui; + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + void *kaddr; + int err; + + if (susize > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large segment usage size: %zu bytes.\n", + susize); + return -EINVAL; + } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) { + printk(KERN_ERR + "NILFS: too small segment usage size: %zu bytes.\n", + susize); + return -EINVAL; + } + + sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); + if (unlikely(!sufile)) + return -ENOMEM; + if (!(sufile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui)); + if (err) + goto failed; + + nilfs_mdt_set_entry_size(sufile, susize, + sizeof(struct nilfs_sufile_header)); + + err = nilfs_read_inode_common(sufile, raw_inode); + if (err) + goto failed; + + err = nilfs_sufile_get_header_block(sufile, &header_bh); + if (err) + goto failed; + + sui = NILFS_SUI(sufile); + kaddr = kmap_atomic(header_bh->b_page); + header = kaddr + bh_offset(header_bh); + sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); + kunmap_atomic(kaddr); + brelse(header_bh); + + sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; + sui->allocmin = 0; + + unlock_new_inode(sufile); + out: + *inodep = sufile; + return 0; + failed: + iget_failed(sufile); + return err; +} diff --git a/kernel/fs/nilfs2/sufile.h b/kernel/fs/nilfs2/sufile.h new file mode 100644 index 000000000..b8afd72f2 --- /dev/null +++ b/kernel/fs/nilfs2/sufile.h @@ -0,0 +1,146 @@ +/* + * sufile.h - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_SUFILE_H +#define _NILFS_SUFILE_H + +#include +#include +#include +#include "mdt.h" + + +static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) +{ + return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments; +} + +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); + +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); +int nilfs_sufile_alloc(struct inode *, __u64 *); +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime); +int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, + size_t); +ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t); + +int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +int nilfs_sufile_update(struct inode *, __u64, int, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); + +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); +int nilfs_sufile_read(struct super_block *sb, size_t susize, + struct nilfs_inode *raw_inode, struct inode **inodep); +int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range); + +/** + * nilfs_sufile_scrap - make a segment garbage + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap); +} + +/** + * nilfs_sufile_free - free segment + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_freev - free segments + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @ndone: place to store the number of freed segments + */ +static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv, + size_t nsegs, size_t *ndone) +{ + return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone, + nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_cancel_freev - reallocate freeing segments + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @ndone: place to store the number of cancelled segments + * + * Return Value: On success, 0 is returned. On error, a negative error codes + * is returned. + */ +static inline int nilfs_sufile_cancel_freev(struct inode *sufile, + __u64 *segnumv, size_t nsegs, + size_t *ndone) +{ + return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone, + nilfs_sufile_do_cancel_free); +} + +/** + * nilfs_sufile_set_error - mark a segment as erroneous + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_set_error() marks the segment specified by + * @segnum as erroneous. The error segment will never be used again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_set_error); +} + +#endif /* _NILFS_SUFILE_H */ diff --git a/kernel/fs/nilfs2/super.c b/kernel/fs/nilfs2/super.c new file mode 100644 index 000000000..f47585bfe --- /dev/null +++ b/kernel/fs/nilfs2/super.c @@ -0,0 +1,1486 @@ +/* + * super.c - NILFS module and super block management. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "export.h" +#include "mdt.h" +#include "alloc.h" +#include "btree.h" +#include "btnode.h" +#include "page.h" +#include "cpfile.h" +#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */ +#include "ifile.h" +#include "dat.h" +#include "segment.h" +#include "segbuf.h" + +MODULE_AUTHOR("NTT Corp."); +MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " + "(NILFS)"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *nilfs_inode_cachep; +struct kmem_cache *nilfs_transaction_cachep; +struct kmem_cache *nilfs_segbuf_cachep; +struct kmem_cache *nilfs_btree_path_cache; + +static int nilfs_setup_super(struct super_block *sb, int is_mount); +static int nilfs_remount(struct super_block *sb, int *flags, char *data); + +static void nilfs_set_error(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + sbp = nilfs_prepare_super(sb, 0); + if (likely(sbp)) { + sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + if (sbp[1]) + sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); + } + } + up_write(&nilfs->ns_sem); +} + +/** + * nilfs_error() - report failure condition on a filesystem + * + * nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. It should be called when NILFS detects + * incoherences or defects of meta data on disk. As for sustainable + * errors such as a single-shot I/O error, nilfs_warning() or the printk() + * function should be used instead. + * + * The segment constructor must not call this function because it can + * kill itself. + */ +void nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (!(sb->s_flags & MS_RDONLY)) { + nilfs_set_error(sb); + + if (nilfs_test_opt(nilfs, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + } + + if (nilfs_test_opt(nilfs, ERRORS_PANIC)) + panic("NILFS (device %s): panic forced after error\n", + sb->s_id); +} + +void nilfs_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); +} + + +struct inode *nilfs_alloc_inode(struct super_block *sb) +{ + struct nilfs_inode_info *ii; + + ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + if (!ii) + return NULL; + ii->i_bh = NULL; + ii->i_state = 0; + ii->i_cno = 0; + ii->vfs_inode.i_version = 1; + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); + return &ii->vfs_inode; +} + +static void nilfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + if (mdi) { + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); + } + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); +} + +void nilfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nilfs_i_callback); +} + +static int nilfs_sync_super(struct super_block *sb, int flag) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + retry: + set_buffer_dirty(nilfs->ns_sbh[0]); + if (nilfs_test_opt(nilfs, BARRIER)) { + err = __sync_dirty_buffer(nilfs->ns_sbh[0], + WRITE_SYNC | WRITE_FLUSH_FUA); + } else { + err = sync_dirty_buffer(nilfs->ns_sbh[0]); + } + + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: unable to write superblock (err=%d)\n", err); + if (err == -EIO && nilfs->ns_sbh[1]) { + /* + * sbp[0] points to newer log than sbp[1], + * so copy sbp[0] to sbp[1] to take over sbp[0]. + */ + memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], + nilfs->ns_sbsize); + nilfs_fall_back_super_block(nilfs); + goto retry; + } + } else { + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbwcount++; + + /* + * The latest segment becomes trailable from the position + * written in superblock. + */ + clear_nilfs_discontinued(nilfs); + + /* update GC protection for recent segments */ + if (nilfs->ns_sbh[1]) { + if (flag == NILFS_SB_COMMIT_ALL) { + set_buffer_dirty(nilfs->ns_sbh[1]); + if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) + goto out; + } + if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < + le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) + sbp = nilfs->ns_sbp[1]; + } + + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + out: + return err; +} + +void nilfs_set_log_cursor(struct nilfs_super_block *sbp, + struct the_nilfs *nilfs) +{ + sector_t nfreeblocks; + + /* nilfs->ns_sem must be locked by the caller. */ + nilfs_count_free_blocks(nilfs, &nfreeblocks); + sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); + + spin_lock(&nilfs->ns_last_segment_lock); + sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); +} + +struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, + int flip) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + /* nilfs->ns_sem must be locked by the caller. */ + if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + if (sbp[1] && + sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + } else { + printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", + sb->s_id); + return NULL; + } + } else if (sbp[1] && + sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + } + + if (flip && sbp[1]) + nilfs_swap_super_block(nilfs); + + return sbp; +} + +int nilfs_commit_super(struct super_block *sb, int flag) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + time_t t; + + /* nilfs->ns_sem must be locked by the caller. */ + t = get_seconds(); + nilfs->ns_sbwtime = t; + sbp[0]->s_wtime = cpu_to_le64(t); + sbp[0]->s_sum = 0; + sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[0], + nilfs->ns_sbsize)); + if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { + sbp[1]->s_wtime = sbp[0]->s_wtime; + sbp[1]->s_sum = 0; + sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[1], + nilfs->ns_sbsize)); + } + clear_nilfs_sb_dirty(nilfs); + nilfs->ns_flushed_device = 1; + /* make sure store to ns_flushed_device cannot be reordered */ + smp_wmb(); + return nilfs_sync_super(sb, flag); +} + +/** + * nilfs_cleanup_super() - write filesystem state for cleanup + * @sb: super block instance to be unmounted or degraded to read-only + * + * This function restores state flags in the on-disk super block. + * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the + * filesystem was not clean previously. + */ +int nilfs_cleanup_super(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int flag = NILFS_SB_COMMIT; + int ret = -EIO; + + sbp = nilfs_prepare_super(sb, 0); + if (sbp) { + sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + nilfs_set_log_cursor(sbp[0], nilfs); + if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { + /* + * make the "clean" flag also to the opposite + * super block if both super blocks point to + * the same checkpoint. + */ + sbp[1]->s_state = sbp[0]->s_state; + flag = NILFS_SB_COMMIT_ALL; + } + ret = nilfs_commit_super(sb, flag); + } + return ret; +} + +/** + * nilfs_move_2nd_super - relocate secondary super block + * @sb: super block instance + * @sb2off: new offset of the secondary super block (in bytes) + */ +static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct buffer_head *nsbh; + struct nilfs_super_block *nsbp; + sector_t blocknr, newblocknr; + unsigned long offset; + int sb2i = -1; /* array index of the secondary superblock */ + int ret = 0; + + /* nilfs->ns_sem must be locked by the caller. */ + if (nilfs->ns_sbh[1] && + nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 1; + blocknr = nilfs->ns_sbh[1]->b_blocknr; + } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 0; + blocknr = nilfs->ns_sbh[0]->b_blocknr; + } + if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) + goto out; /* super block location is unchanged */ + + /* Get new super block buffer */ + newblocknr = sb2off >> nilfs->ns_blocksize_bits; + offset = sb2off & (nilfs->ns_blocksize - 1); + nsbh = sb_getblk(sb, newblocknr); + if (!nsbh) { + printk(KERN_WARNING + "NILFS warning: unable to move secondary superblock " + "to block %llu\n", (unsigned long long)newblocknr); + ret = -EIO; + goto out; + } + nsbp = (void *)nsbh->b_data + offset; + memset(nsbp, 0, nilfs->ns_blocksize); + + if (sb2i >= 0) { + memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + brelse(nilfs->ns_sbh[sb2i]); + nilfs->ns_sbh[sb2i] = nsbh; + nilfs->ns_sbp[sb2i] = nsbp; + } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) { + /* secondary super block will be restored to index 1 */ + nilfs->ns_sbh[1] = nsbh; + nilfs->ns_sbp[1] = nsbp; + } else { + brelse(nsbh); + } +out: + return ret; +} + +/** + * nilfs_resize_fs - resize the filesystem + * @sb: super block instance + * @newsize: new size of the filesystem (in bytes) + */ +int nilfs_resize_fs(struct super_block *sb, __u64 newsize) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + __u64 devsize, newnsegs; + loff_t sb2off; + int ret; + + ret = -ERANGE; + devsize = i_size_read(sb->s_bdev->bd_inode); + if (newsize > devsize) + goto out; + + /* + * Write lock is required to protect some functions depending + * on the number of segments, the number of reserved segments, + * and so forth. + */ + down_write(&nilfs->ns_segctor_sem); + + sb2off = NILFS_SB2_OFFSET_BYTES(newsize); + newnsegs = sb2off >> nilfs->ns_blocksize_bits; + do_div(newnsegs, nilfs->ns_blocks_per_segment); + + ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs); + up_write(&nilfs->ns_segctor_sem); + if (ret < 0) + goto out; + + ret = nilfs_construct_segment(sb); + if (ret < 0) + goto out; + + down_write(&nilfs->ns_sem); + nilfs_move_2nd_super(sb, sb2off); + ret = -EIO; + sbp = nilfs_prepare_super(sb, 0); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + /* + * Drop NILFS_RESIZE_FS flag for compatibility with + * mount-time resize which may be implemented in a + * future release. + */ + sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & + ~NILFS_RESIZE_FS); + sbp[0]->s_dev_size = cpu_to_le64(newsize); + sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments); + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); + } + up_write(&nilfs->ns_sem); + + /* + * Reset the range of allocatable segments last. This order + * is important in the case of expansion because the secondary + * superblock must be protected from log write until migration + * completes. + */ + if (!ret) + nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1); +out: + return ret; +} + +static void nilfs_put_super(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + nilfs_detach_log_writer(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + } + + iput(nilfs->ns_sufile); + iput(nilfs->ns_cpfile); + iput(nilfs->ns_dat); + + destroy_nilfs(nilfs); + sb->s_fs_info = NULL; +} + +static int nilfs_sync_fs(struct super_block *sb, int wait) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int err = 0; + + /* This function is called when super block should be written back */ + if (wait) + err = nilfs_construct_segment(sb); + + down_write(&nilfs->ns_sem); + if (nilfs_sb_dirty(nilfs)) { + sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + nilfs_commit_super(sb, NILFS_SB_COMMIT); + } + } + up_write(&nilfs->ns_sem); + + if (!err) + err = nilfs_flush_device(nilfs); + + return err; +} + +int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, + struct nilfs_root **rootp) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root; + struct nilfs_checkpoint *raw_cp; + struct buffer_head *bh_cp; + int err = -ENOMEM; + + root = nilfs_find_or_create_root( + nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno); + if (!root) + return err; + + if (root->ifile) + goto reuse; /* already attached checkpoint */ + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, + &bh_cp); + up_read(&nilfs->ns_segctor_sem); + if (unlikely(err)) { + if (err == -ENOENT || err == -EINVAL) { + printk(KERN_ERR + "NILFS: Invalid checkpoint " + "(checkpoint number=%llu)\n", + (unsigned long long)cno); + err = -EINVAL; + } + goto failed; + } + + err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size, + &raw_cp->cp_ifile_inode, &root->ifile); + if (err) + goto failed_bh; + + atomic64_set(&root->inodes_count, + le64_to_cpu(raw_cp->cp_inodes_count)); + atomic64_set(&root->blocks_count, + le64_to_cpu(raw_cp->cp_blocks_count)); + + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + + reuse: + *rootp = root; + return 0; + + failed_bh: + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + failed: + nilfs_put_root(root); + + return err; +} + +static int nilfs_freeze(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + if (sb->s_flags & MS_RDONLY) + return 0; + + /* Mark super block clean */ + down_write(&nilfs->ns_sem); + err = nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + return err; +} + +static int nilfs_unfreeze(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + if (sb->s_flags & MS_RDONLY) + return 0; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, false); + up_write(&nilfs->ns_sem); + return 0; +} + +static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; + struct the_nilfs *nilfs = root->nilfs; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + unsigned long long blocks; + unsigned long overhead; + unsigned long nrsvblocks; + sector_t nfreeblocks; + u64 nmaxinodes, nfreeinodes; + int err; + + /* + * Compute all of the segment blocks + * + * The blocks before first segment and after last segment + * are excluded. + */ + blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments + - nilfs->ns_first_data_block; + nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; + + /* + * Compute the overhead + * + * When distributing meta data blocks outside segment structure, + * We must count them as the overhead. + */ + overhead = 0; + + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) + return err; + + err = nilfs_ifile_count_free_inodes(root->ifile, + &nmaxinodes, &nfreeinodes); + if (unlikely(err)) { + printk(KERN_WARNING + "NILFS warning: fail to count free inodes: err %d.\n", + err); + if (err == -ERANGE) { + /* + * If nilfs_palloc_count_max_entries() returns + * -ERANGE error code then we simply treat + * curent inodes count as maximum possible and + * zero as free inodes value. + */ + nmaxinodes = atomic64_read(&root->inodes_count); + nfreeinodes = 0; + err = 0; + } else + return err; + } + + buf->f_type = NILFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = blocks - overhead; + buf->f_bfree = nfreeblocks; + buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? + (buf->f_bfree - nrsvblocks) : 0; + buf->f_files = nmaxinodes; + buf->f_ffree = nfreeinodes; + buf->f_namelen = NILFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; + + if (!nilfs_test_opt(nilfs, BARRIER)) + seq_puts(seq, ",nobarrier"); + if (root->cno != NILFS_CPTREE_CURRENT_CNO) + seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); + if (nilfs_test_opt(nilfs, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (nilfs_test_opt(nilfs, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (nilfs_test_opt(nilfs, STRICT_ORDER)) + seq_puts(seq, ",order=strict"); + if (nilfs_test_opt(nilfs, NORECOVERY)) + seq_puts(seq, ",norecovery"); + if (nilfs_test_opt(nilfs, DISCARD)) + seq_puts(seq, ",discard"); + + return 0; +} + +static const struct super_operations nilfs_sops = { + .alloc_inode = nilfs_alloc_inode, + .destroy_inode = nilfs_destroy_inode, + .dirty_inode = nilfs_dirty_inode, + .evict_inode = nilfs_evict_inode, + .put_super = nilfs_put_super, + .sync_fs = nilfs_sync_fs, + .freeze_fs = nilfs_freeze, + .unfreeze_fs = nilfs_unfreeze, + .statfs = nilfs_statfs, + .remount_fs = nilfs_remount, + .show_options = nilfs_show_options +}; + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, + Opt_discard, Opt_nodiscard, Opt_err, +}; + +static match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_snapshot, "cp=%u"}, + {Opt_order, "order=%s"}, + {Opt_norecovery, "norecovery"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb, int is_remount) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + nilfs_set_opt(nilfs, BARRIER); + break; + case Opt_nobarrier: + nilfs_clear_opt(nilfs, BARRIER); + break; + case Opt_order: + if (strcmp(args[0].from, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(nilfs, STRICT_ORDER); + else if (strcmp(args[0].from, "strict") == 0) + /* Strict in-order semantics */ + nilfs_set_opt(nilfs, STRICT_ORDER); + else + return 0; + break; + case Opt_err_panic: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC); + break; + case Opt_err_ro: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO); + break; + case Opt_err_cont: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT); + break; + case Opt_snapshot: + if (is_remount) { + printk(KERN_ERR + "NILFS: \"%s\" option is invalid " + "for remount.\n", p); + return 0; + } + break; + case Opt_norecovery: + nilfs_set_opt(nilfs, NORECOVERY); + break; + case Opt_discard: + nilfs_set_opt(nilfs, DISCARD); + break; + case Opt_nodiscard: + nilfs_clear_opt(nilfs, DISCARD); + break; + default: + printk(KERN_ERR + "NILFS: Unrecognized mount option \"%s\"\n", p); + return 0; + } + } + return 1; +} + +static inline void +nilfs_set_default_options(struct super_block *sb, + struct nilfs_super_block *sbp) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + nilfs->ns_mount_opt = + NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; +} + +static int nilfs_setup_super(struct super_block *sb, int is_mount) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int max_mnt_count; + int mnt_count; + + /* nilfs->ns_sem must be locked by the caller. */ + sbp = nilfs_prepare_super(sb, 0); + if (!sbp) + return -EIO; + + if (!is_mount) + goto skip_mount_setup; + + max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); + mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); + + if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + printk(KERN_WARNING + "NILFS warning: mounting fs with errors\n"); +#if 0 + } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { + printk(KERN_WARNING + "NILFS warning: maximal mount count reached\n"); +#endif + } + if (!max_mnt_count) + sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp[0]->s_mtime = cpu_to_le64(get_seconds()); + +skip_mount_setup: + sbp[0]->s_state = + cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); + /* synchronize sbp[1] with sbp[0] */ + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); +} + +struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, + u64 pos, int blocksize, + struct buffer_head **pbh) +{ + unsigned long long sb_index = pos; + unsigned long offset; + + offset = do_div(sb_index, blocksize); + *pbh = sb_bread(sb, sb_index); + if (!*pbh) + return NULL; + return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); +} + +int nilfs_store_magic_and_option(struct super_block *sb, + struct nilfs_super_block *sbp, + char *data) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + sb->s_magic = le16_to_cpu(sbp->s_magic); + + /* FS independent flags */ +#ifdef NILFS_ATIME_DISABLE + sb->s_flags |= MS_NOATIME; +#endif + + nilfs_set_default_options(sb, sbp); + + nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid); + nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid); + nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); + nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); + + return !parse_options(data, sb, 0) ? -EINVAL : 0 ; +} + +int nilfs_check_feature_compatibility(struct super_block *sb, + struct nilfs_super_block *sbp) +{ + __u64 features; + + features = le64_to_cpu(sbp->s_feature_incompat) & + ~NILFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + features = le64_to_cpu(sbp->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "NILFS: couldn't mount RDWR because of " + "unsupported optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + return 0; +} + +static int nilfs_get_root_dentry(struct super_block *sb, + struct nilfs_root *root, + struct dentry **root_dentry) +{ + struct inode *inode; + struct dentry *dentry; + int ret = 0; + + inode = nilfs_iget(sb, root, NILFS_ROOT_INO); + if (IS_ERR(inode)) { + printk(KERN_ERR "NILFS: get root inode failed\n"); + ret = PTR_ERR(inode); + goto out; + } + if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) { + iput(inode); + printk(KERN_ERR "NILFS: corrupt root inode.\n"); + ret = -EINVAL; + goto out; + } + + if (root->cno == NILFS_CPTREE_CURRENT_CNO) { + dentry = d_find_alias(inode); + if (!dentry) { + dentry = d_make_root(inode); + if (!dentry) { + ret = -ENOMEM; + goto failed_dentry; + } + } else { + iput(inode); + } + } else { + dentry = d_obtain_root(inode); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto failed_dentry; + } + } + *root_dentry = dentry; + out: + return ret; + + failed_dentry: + printk(KERN_ERR "NILFS: get root dentry failed\n"); + goto out; +} + +static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, + struct dentry **root_dentry) +{ + struct the_nilfs *nilfs = s->s_fs_info; + struct nilfs_root *root; + int ret; + + mutex_lock(&nilfs->ns_snapshot_mount_mutex); + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) { + ret = (ret == -ENOENT) ? -EINVAL : ret; + goto out; + } else if (!ret) { + printk(KERN_ERR "NILFS: The specified checkpoint is " + "not a snapshot (checkpoint number=%llu).\n", + (unsigned long long)cno); + ret = -EINVAL; + goto out; + } + + ret = nilfs_attach_checkpoint(s, cno, false, &root); + if (ret) { + printk(KERN_ERR "NILFS: error loading snapshot " + "(checkpoint number=%llu).\n", + (unsigned long long)cno); + goto out; + } + ret = nilfs_get_root_dentry(s, root, root_dentry); + nilfs_put_root(root); + out: + mutex_unlock(&nilfs->ns_snapshot_mount_mutex); + return ret; +} + +/** + * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint + * @root_dentry: root dentry of the tree to be shrunk + * + * This function returns true if the tree was in-use. + */ +static bool nilfs_tree_is_busy(struct dentry *root_dentry) +{ + shrink_dcache_parent(root_dentry); + return d_count(root_dentry) > 1; +} + +int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root; + struct inode *inode; + struct dentry *dentry; + int ret; + + if (cno > nilfs->ns_cno) + return false; + + if (cno >= nilfs_last_cno(nilfs)) + return true; /* protect recent checkpoints */ + + ret = false; + root = nilfs_lookup_root(nilfs, cno); + if (root) { + inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); + if (inode) { + dentry = d_find_alias(inode); + if (dentry) { + ret = nilfs_tree_is_busy(dentry); + dput(dentry); + } + iput(inode); + } + nilfs_put_root(root); + } + return ret; +} + +/** + * nilfs_fill_super() - initialize a super block instance + * @sb: super_block + * @data: mount options + * @silent: silent mode flag + * + * This function is called exclusively by nilfs->ns_mount_mutex. + * So, the recovery process is protected from other simultaneous mounts. + */ +static int +nilfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct the_nilfs *nilfs; + struct nilfs_root *fsroot; + __u64 cno; + int err; + + nilfs = alloc_nilfs(sb->s_bdev); + if (!nilfs) + return -ENOMEM; + + sb->s_fs_info = nilfs; + + err = init_nilfs(nilfs, sb, (char *)data); + if (err) + goto failed_nilfs; + + sb->s_op = &nilfs_sops; + sb->s_export_op = &nilfs_export_ops; + sb->s_root = NULL; + sb->s_time_gran = 1; + sb->s_max_links = NILFS_LINK_MAX; + + sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; + + err = load_nilfs(nilfs, sb); + if (err) + goto failed_nilfs; + + cno = nilfs_last_cno(nilfs); + err = nilfs_attach_checkpoint(sb, cno, true, &fsroot); + if (err) { + printk(KERN_ERR "NILFS: error loading last checkpoint " + "(checkpoint number=%llu).\n", (unsigned long long)cno); + goto failed_unload; + } + + if (!(sb->s_flags & MS_RDONLY)) { + err = nilfs_attach_log_writer(sb, fsroot); + if (err) + goto failed_checkpoint; + } + + err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root); + if (err) + goto failed_segctor; + + nilfs_put_root(fsroot); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, true); + up_write(&nilfs->ns_sem); + } + + return 0; + + failed_segctor: + nilfs_detach_log_writer(sb); + + failed_checkpoint: + nilfs_put_root(fsroot); + + failed_unload: + iput(nilfs->ns_sufile); + iput(nilfs->ns_cpfile); + iput(nilfs->ns_dat); + + failed_nilfs: + destroy_nilfs(nilfs); + return err; +} + +static int nilfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + unsigned long old_sb_flags; + unsigned long old_mount_opt; + int err; + + sync_filesystem(sb); + old_sb_flags = sb->s_flags; + old_mount_opt = nilfs->ns_mount_opt; + + if (!parse_options(data, sb, 1)) { + err = -EINVAL; + goto restore_opts; + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + + err = -EINVAL; + + if (!nilfs_valid_fs(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the filesystem is in an " + "incomplete recovery state.\n", sb->s_id); + goto restore_opts; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out; + if (*flags & MS_RDONLY) { + /* Shutting down log writer */ + nilfs_detach_log_writer(sb); + sb->s_flags |= MS_RDONLY; + + /* + * Remounting a valid RW partition RDONLY, so set + * the RDONLY flag and then mark the partition as valid again. + */ + down_write(&nilfs->ns_sem); + nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + } else { + __u64 features; + struct nilfs_root *root; + + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by fsck since we originally mounted the partition.) + */ + down_read(&nilfs->ns_sem); + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + up_read(&nilfs->ns_sem); + if (features) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount RDWR because of unsupported optional " + "features (%llx)\n", + sb->s_id, (unsigned long long)features); + err = -EROFS; + goto restore_opts; + } + + sb->s_flags &= ~MS_RDONLY; + + root = NILFS_I(d_inode(sb->s_root))->i_root; + err = nilfs_attach_log_writer(sb, root); + if (err) + goto restore_opts; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, true); + up_write(&nilfs->ns_sem); + } + out: + return 0; + + restore_opts: + sb->s_flags = old_sb_flags; + nilfs->ns_mount_opt = old_mount_opt; + return err; +} + +struct nilfs_super_data { + struct block_device *bdev; + __u64 cno; + int flags; +}; + +/** + * nilfs_identify - pre-read mount options needed to identify mount instance + * @data: mount options + * @sd: nilfs_super_data + */ +static int nilfs_identify(char *data, struct nilfs_super_data *sd) +{ + char *p, *options = data; + substring_t args[MAX_OPT_ARGS]; + int token; + int ret = 0; + + do { + p = strsep(&options, ","); + if (p != NULL && *p) { + token = match_token(p, tokens, args); + if (token == Opt_snapshot) { + if (!(sd->flags & MS_RDONLY)) { + ret++; + } else { + sd->cno = simple_strtoull(args[0].from, + NULL, 0); + /* + * No need to see the end pointer; + * match_token() has done syntax + * checking. + */ + if (sd->cno == 0) + ret++; + } + } + if (ret) + printk(KERN_ERR + "NILFS: invalid mount option: %s\n", p); + } + if (!options) + break; + BUG_ON(options == data); + *(options - 1) = ','; + } while (!ret); + return ret; +} + +static int nilfs_set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int nilfs_test_bdev_super(struct super_block *s, void *data) +{ + return (void *)s->s_bdev == data; +} + +static struct dentry * +nilfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct nilfs_super_data sd; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + struct dentry *root_dentry; + int err, s_new = false; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(sd.bdev)) + return ERR_CAST(sd.bdev); + + sd.cno = 0; + sd.flags = flags; + if (nilfs_identify((char *)data, &sd)) { + err = -EINVAL; + goto failed; + } + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&sd.bdev->bd_fsfreeze_mutex); + if (sd.bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); + err = -EBUSY; + goto failed; + } + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, + sd.bdev); + mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); + if (IS_ERR(s)) { + err = PTR_ERR(s); + goto failed; + } + + if (!s->s_root) { + char b[BDEVNAME_SIZE]; + + s_new = true; + + /* New superblock instance created */ + s->s_mode = mode; + strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(sd.bdev)); + + err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto failed_super; + + s->s_flags |= MS_ACTIVE; + } else if (!sd.cno) { + if (nilfs_tree_is_busy(s->s_root)) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + printk(KERN_ERR "NILFS: the device already " + "has a %s mount.\n", + (s->s_flags & MS_RDONLY) ? + "read-only" : "read/write"); + err = -EBUSY; + goto failed_super; + } + } else { + /* + * Try remount to setup mount states if the current + * tree is not mounted and only snapshots use this sb. + */ + err = nilfs_remount(s, &flags, data); + if (err) + goto failed_super; + } + } + + if (sd.cno) { + err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); + if (err) + goto failed_super; + } else { + root_dentry = dget(s->s_root); + } + + if (!s_new) + blkdev_put(sd.bdev, mode); + + return root_dentry; + + failed_super: + deactivate_locked_super(s); + + failed: + if (!s_new) + blkdev_put(sd.bdev, mode); + return ERR_PTR(err); +} + +struct file_system_type nilfs_fs_type = { + .owner = THIS_MODULE, + .name = "nilfs2", + .mount = nilfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("nilfs2"); + +static void nilfs_inode_init_once(void *obj) +{ + struct nilfs_inode_info *ii = obj; + + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + address_space_init_once(&ii->i_btnode_cache); + ii->i_bmap = &ii->i_bmap_data; + inode_init_once(&ii->vfs_inode); +} + +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} + +static void nilfs_destroy_cachep(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + if (nilfs_inode_cachep) + kmem_cache_destroy(nilfs_inode_cachep); + if (nilfs_transaction_cachep) + kmem_cache_destroy(nilfs_transaction_cachep); + if (nilfs_segbuf_cachep) + kmem_cache_destroy(nilfs_segbuf_cachep); + if (nilfs_btree_path_cache) + kmem_cache_destroy(nilfs_btree_path_cache); +} + +static int __init nilfs_init_cachep(void) +{ + nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + if (!nilfs_inode_cachep) + goto fail; + + nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!nilfs_transaction_cachep) + goto fail; + + nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once); + if (!nilfs_segbuf_cachep) + goto fail; + + nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache", + sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX, + 0, 0, NULL); + if (!nilfs_btree_path_cache) + goto fail; + + return 0; + +fail: + nilfs_destroy_cachep(); + return -ENOMEM; +} + +static int __init init_nilfs_fs(void) +{ + int err; + + err = nilfs_init_cachep(); + if (err) + goto fail; + + err = nilfs_sysfs_init(); + if (err) + goto free_cachep; + + err = register_filesystem(&nilfs_fs_type); + if (err) + goto deinit_sysfs_entry; + + printk(KERN_INFO "NILFS version 2 loaded\n"); + return 0; + +deinit_sysfs_entry: + nilfs_sysfs_exit(); +free_cachep: + nilfs_destroy_cachep(); +fail: + return err; +} + +static void __exit exit_nilfs_fs(void) +{ + nilfs_destroy_cachep(); + nilfs_sysfs_exit(); + unregister_filesystem(&nilfs_fs_type); +} + +module_init(init_nilfs_fs) +module_exit(exit_nilfs_fs) diff --git a/kernel/fs/nilfs2/sysfs.c b/kernel/fs/nilfs2/sysfs.c new file mode 100644 index 000000000..bbb0dcc35 --- /dev/null +++ b/kernel/fs/nilfs2/sysfs.c @@ -0,0 +1,1137 @@ +/* + * sysfs.c - sysfs support implementation. + * + * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2014 HGST, Inc., a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by Vyacheslav Dubeyko + */ + +#include + +#include "nilfs.h" +#include "mdt.h" +#include "sufile.h" +#include "cpfile.h" +#include "sysfs.h" + +/* /sys/fs// */ +static struct kset *nilfs_kset; + +#define NILFS_SHOW_TIME(time_t_val, buf) ({ \ + struct tm res; \ + int count = 0; \ + time_to_tm(time_t_val, 0, &res); \ + res.tm_year += 1900; \ + res.tm_mon += 1; \ + count = scnprintf(buf, PAGE_SIZE, \ + "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \ + res.tm_year, res.tm_mon, res.tm_mday, \ + res.tm_hour, res.tm_min, res.tm_sec);\ + count; \ +}) + +#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \ +static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \ + struct attribute *attr, char *buf) \ +{ \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + struct nilfs_##name##_attr *a = container_of(attr, \ + struct nilfs_##name##_attr, \ + attr); \ + return a->show ? a->show(a, nilfs, buf) : 0; \ +} \ +static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + struct nilfs_##name##_attr *a = container_of(attr, \ + struct nilfs_##name##_attr, \ + attr); \ + return a->store ? a->store(a, nilfs, buf, len) : 0; \ +} \ +static const struct sysfs_ops nilfs_##name##_attr_ops = { \ + .show = nilfs_##name##_attr_show, \ + .store = nilfs_##name##_attr_store, \ +}; + +#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ +static void nilfs_##name##_attr_release(struct kobject *kobj) \ +{ \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + subgroups = nilfs->ns_##parent_name##_subgroups; \ + complete(&subgroups->sg_##name##_kobj_unregister); \ +} \ +static struct kobj_type nilfs_##name##_ktype = { \ + .default_attrs = nilfs_##name##_attrs, \ + .sysfs_ops = &nilfs_##name##_attr_ops, \ + .release = nilfs_##name##_attr_release, \ +}; + +#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \ +static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ +{ \ + struct kobject *parent; \ + struct kobject *kobj; \ + struct completion *kobj_unregister; \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ + int err; \ + subgroups = nilfs->ns_##parent_name##_subgroups; \ + kobj = &subgroups->sg_##name##_kobj; \ + kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \ + parent = &nilfs->ns_##parent_name##_kobj; \ + kobj->kset = nilfs_kset; \ + init_completion(kobj_unregister); \ + err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \ + #name); \ + if (err) \ + return err; \ + return 0; \ +} \ +static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ +{ \ + kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ +} + +/************************************************************************ + * NILFS snapshot attrs * + ************************************************************************/ + +static ssize_t +nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)atomic64_read(&root->inodes_count)); +} + +static ssize_t +nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)atomic64_read(&root->blocks_count)); +} + +static const char snapshot_readme_str[] = + "The group contains details about mounted snapshot.\n\n" + "(1) inodes_count\n\tshow number of inodes for snapshot.\n\n" + "(2) blocks_count\n\tshow number of blocks for snapshot.\n\n"; + +static ssize_t +nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, snapshot_readme_str); +} + +NILFS_SNAPSHOT_RO_ATTR(inodes_count); +NILFS_SNAPSHOT_RO_ATTR(blocks_count); +NILFS_SNAPSHOT_RO_ATTR(README); + +static struct attribute *nilfs_snapshot_attrs[] = { + NILFS_SNAPSHOT_ATTR_LIST(inodes_count), + NILFS_SNAPSHOT_ATTR_LIST(blocks_count), + NILFS_SNAPSHOT_ATTR_LIST(README), + NULL, +}; + +static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct nilfs_root *root = + container_of(kobj, struct nilfs_root, snapshot_kobj); + struct nilfs_snapshot_attr *a = + container_of(attr, struct nilfs_snapshot_attr, attr); + + return a->show ? a->show(a, root, buf) : 0; +} + +static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct nilfs_root *root = + container_of(kobj, struct nilfs_root, snapshot_kobj); + struct nilfs_snapshot_attr *a = + container_of(attr, struct nilfs_snapshot_attr, attr); + + return a->store ? a->store(a, root, buf, len) : 0; +} + +static void nilfs_snapshot_attr_release(struct kobject *kobj) +{ + struct nilfs_root *root = container_of(kobj, struct nilfs_root, + snapshot_kobj); + complete(&root->snapshot_kobj_unregister); +} + +static const struct sysfs_ops nilfs_snapshot_attr_ops = { + .show = nilfs_snapshot_attr_show, + .store = nilfs_snapshot_attr_store, +}; + +static struct kobj_type nilfs_snapshot_ktype = { + .default_attrs = nilfs_snapshot_attrs, + .sysfs_ops = &nilfs_snapshot_attr_ops, + .release = nilfs_snapshot_attr_release, +}; + +int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) +{ + struct the_nilfs *nilfs; + struct kobject *parent; + int err; + + nilfs = root->nilfs; + parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj; + root->snapshot_kobj.kset = nilfs_kset; + init_completion(&root->snapshot_kobj_unregister); + + if (root->cno == NILFS_CPTREE_CURRENT_CNO) { + err = kobject_init_and_add(&root->snapshot_kobj, + &nilfs_snapshot_ktype, + &nilfs->ns_dev_kobj, + "current_checkpoint"); + } else { + err = kobject_init_and_add(&root->snapshot_kobj, + &nilfs_snapshot_ktype, + parent, + "%llu", root->cno); + } + + if (err) + return err; + + return 0; +} + +void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) +{ + kobject_del(&root->snapshot_kobj); +} + +/************************************************************************ + * NILFS mounted snapshots attrs * + ************************************************************************/ + +static const char mounted_snapshots_readme_str[] = + "The mounted_snapshots group contains group for\n" + "every mounted snapshot.\n"; + +static ssize_t +nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str); +} + +NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README); + +static struct attribute *nilfs_mounted_snapshots_attrs[] = { + NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev); +NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev); +NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev); + +/************************************************************************ + * NILFS checkpoints attrs * + ************************************************************************/ + +static ssize_t +nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 ncheckpoints; + struct nilfs_cpstat cpstat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", + err); + return err; + } + + ncheckpoints = cpstat.cs_ncps; + + return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints); +} + +static ssize_t +nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 nsnapshots; + struct nilfs_cpstat cpstat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", + err); + return err; + } + + nsnapshots = cpstat.cs_nsss; + + return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots); +} + +static ssize_t +nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 last_cno; + + spin_lock(&nilfs->ns_last_segment_lock); + last_cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); +} + +static ssize_t +nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 cno; + + down_read(&nilfs->ns_sem); + cno = nilfs->ns_cno; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", cno); +} + +static const char checkpoints_readme_str[] = + "The checkpoints group contains attributes that describe\n" + "details about volume's checkpoints.\n\n" + "(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n" + "(2) snapshots_number\n\tshow number of snapshots on volume.\n\n" + "(3) last_seg_checkpoint\n" + "\tshow checkpoint number of the latest segment.\n\n" + "(4) next_checkpoint\n\tshow next checkpoint number.\n\n"; + +static ssize_t +nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, checkpoints_readme_str); +} + +NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number); +NILFS_CHECKPOINTS_RO_ATTR(snapshots_number); +NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint); +NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint); +NILFS_CHECKPOINTS_RO_ATTR(README); + +static struct attribute *nilfs_checkpoints_attrs[] = { + NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number), + NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number), + NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint), + NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint), + NILFS_CHECKPOINTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(checkpoints, dev); +NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev); +NILFS_DEV_INT_GROUP_FNS(checkpoints, dev); + +/************************************************************************ + * NILFS segments attrs * + ************************************************************************/ + +static ssize_t +nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments); +} + +static ssize_t +nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment); +} + +static ssize_t +nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned long ncleansegs; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs); +} + +static ssize_t +nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_sustat sustat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n", + err); + return err; + } + + return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs); +} + +static const char segments_readme_str[] = + "The segments group contains attributes that describe\n" + "details about volume's segments.\n\n" + "(1) segments_number\n\tshow number of segments on volume.\n\n" + "(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n" + "(3) clean_segments\n\tshow count of clean segments.\n\n" + "(4) dirty_segments\n\tshow count of dirty segments.\n\n"; + +static ssize_t +nilfs_segments_README_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, segments_readme_str); +} + +NILFS_SEGMENTS_RO_ATTR(segments_number); +NILFS_SEGMENTS_RO_ATTR(blocks_per_segment); +NILFS_SEGMENTS_RO_ATTR(clean_segments); +NILFS_SEGMENTS_RO_ATTR(dirty_segments); +NILFS_SEGMENTS_RO_ATTR(README); + +static struct attribute *nilfs_segments_attrs[] = { + NILFS_SEGMENTS_ATTR_LIST(segments_number), + NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment), + NILFS_SEGMENTS_ATTR_LIST(clean_segments), + NILFS_SEGMENTS_ATTR_LIST(dirty_segments), + NILFS_SEGMENTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(segments, dev); +NILFS_DEV_INT_GROUP_TYPE(segments, dev); +NILFS_DEV_INT_GROUP_FNS(segments, dev); + +/************************************************************************ + * NILFS segctor attrs * + ************************************************************************/ + +static ssize_t +nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + sector_t last_pseg; + + spin_lock(&nilfs->ns_last_segment_lock); + last_pseg = nilfs->ns_last_pseg; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)last_pseg); +} + +static ssize_t +nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u64 last_seq; + + spin_lock(&nilfs->ns_last_segment_lock); + last_seq = nilfs->ns_last_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq); +} + +static ssize_t +nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 last_cno; + + spin_lock(&nilfs->ns_last_segment_lock); + last_cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); +} + +static ssize_t +nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u64 seg_seq; + + down_read(&nilfs->ns_sem); + seg_seq = nilfs->ns_seg_seq; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq); +} + +static ssize_t +nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 segnum; + + down_read(&nilfs->ns_sem); + segnum = nilfs->ns_segnum; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", segnum); +} + +static ssize_t +nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 nextnum; + + down_read(&nilfs->ns_sem); + nextnum = nilfs->ns_nextnum; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum); +} + +static ssize_t +nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned long pseg_offset; + + down_read(&nilfs->ns_sem); + pseg_offset = nilfs->ns_pseg_offset; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset); +} + +static ssize_t +nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 cno; + + down_read(&nilfs->ns_sem); + cno = nilfs->ns_cno; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", cno); +} + +static ssize_t +nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t ctime; + + down_read(&nilfs->ns_sem); + ctime = nilfs->ns_ctime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(ctime, buf); +} + +static ssize_t +nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t ctime; + + down_read(&nilfs->ns_sem); + ctime = nilfs->ns_ctime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); +} + +static ssize_t +nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t nongc_ctime; + + down_read(&nilfs->ns_sem); + nongc_ctime = nilfs->ns_nongc_ctime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(nongc_ctime, buf); +} + +static ssize_t +nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t nongc_ctime; + + down_read(&nilfs->ns_sem); + nongc_ctime = nilfs->ns_nongc_ctime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)nongc_ctime); +} + +static ssize_t +nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u32 ndirtyblks; + + down_read(&nilfs->ns_sem); + ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks); + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks); +} + +static const char segctor_readme_str[] = + "The segctor group contains attributes that describe\n" + "segctor thread activity details.\n\n" + "(1) last_pseg_block\n" + "\tshow start block number of the latest segment.\n\n" + "(2) last_seg_sequence\n" + "\tshow sequence value of the latest segment.\n\n" + "(3) last_seg_checkpoint\n" + "\tshow checkpoint number of the latest segment.\n\n" + "(4) current_seg_sequence\n\tshow segment sequence counter.\n\n" + "(5) current_last_full_seg\n" + "\tshow index number of the latest full segment.\n\n" + "(6) next_full_seg\n" + "\tshow index number of the full segment index to be used next.\n\n" + "(7) next_pseg_offset\n" + "\tshow offset of next partial segment in the current full segment.\n\n" + "(8) next_checkpoint\n\tshow next checkpoint number.\n\n" + "(9) last_seg_write_time\n" + "\tshow write time of the last segment in human-readable format.\n\n" + "(10) last_seg_write_time_secs\n" + "\tshow write time of the last segment in seconds.\n\n" + "(11) last_nongc_write_time\n" + "\tshow write time of the last segment not for cleaner operation " + "in human-readable format.\n\n" + "(12) last_nongc_write_time_secs\n" + "\tshow write time of the last segment not for cleaner operation " + "in seconds.\n\n" + "(13) dirty_data_blocks_count\n" + "\tshow number of dirty data blocks.\n\n"; + +static ssize_t +nilfs_segctor_README_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, segctor_readme_str); +} + +NILFS_SEGCTOR_RO_ATTR(last_pseg_block); +NILFS_SEGCTOR_RO_ATTR(last_seg_sequence); +NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint); +NILFS_SEGCTOR_RO_ATTR(current_seg_sequence); +NILFS_SEGCTOR_RO_ATTR(current_last_full_seg); +NILFS_SEGCTOR_RO_ATTR(next_full_seg); +NILFS_SEGCTOR_RO_ATTR(next_pseg_offset); +NILFS_SEGCTOR_RO_ATTR(next_checkpoint); +NILFS_SEGCTOR_RO_ATTR(last_seg_write_time); +NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs); +NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time); +NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs); +NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count); +NILFS_SEGCTOR_RO_ATTR(README); + +static struct attribute *nilfs_segctor_attrs[] = { + NILFS_SEGCTOR_ATTR_LIST(last_pseg_block), + NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence), + NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint), + NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence), + NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg), + NILFS_SEGCTOR_ATTR_LIST(next_full_seg), + NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset), + NILFS_SEGCTOR_ATTR_LIST(next_checkpoint), + NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time), + NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs), + NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time), + NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs), + NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count), + NILFS_SEGCTOR_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(segctor, dev); +NILFS_DEV_INT_GROUP_TYPE(segctor, dev); +NILFS_DEV_INT_GROUP_FNS(segctor, dev); + +/************************************************************************ + * NILFS superblock attrs * + ************************************************************************/ + +static ssize_t +nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t sbwtime; + + down_read(&nilfs->ns_sem); + sbwtime = nilfs->ns_sbwtime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(sbwtime, buf); +} + +static ssize_t +nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t sbwtime; + + down_read(&nilfs->ns_sem); + sbwtime = nilfs->ns_sbwtime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime); +} + +static ssize_t +nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned sbwcount; + + down_read(&nilfs->ns_sem); + sbwcount = nilfs->ns_sbwcount; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount); +} + +static ssize_t +nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned sb_update_freq; + + down_read(&nilfs->ns_sem); + sb_update_freq = nilfs->ns_sb_update_freq; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq); +} + +static ssize_t +nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + const char *buf, size_t count) +{ + unsigned val; + int err; + + err = kstrtouint(skip_spaces(buf), 0, &val); + if (err) { + printk(KERN_ERR "NILFS: unable to convert string: err=%d\n", + err); + return err; + } + + if (val < NILFS_SB_FREQ) { + val = NILFS_SB_FREQ; + printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n"); + } + + down_write(&nilfs->ns_sem); + nilfs->ns_sb_update_freq = val; + up_write(&nilfs->ns_sem); + + return count; +} + +static const char sb_readme_str[] = + "The superblock group contains attributes that describe\n" + "superblock's details.\n\n" + "(1) sb_write_time\n\tshow previous write time of super block " + "in human-readable format.\n\n" + "(2) sb_write_time_secs\n\tshow previous write time of super block " + "in seconds.\n\n" + "(3) sb_write_count\n\tshow write count of super block.\n\n" + "(4) sb_update_frequency\n" + "\tshow/set interval of periodical update of superblock (in seconds).\n\n" + "\tYou can set preferable frequency of superblock update by command:\n\n" + "\t'echo > /sys/fs///superblock/sb_update_frequency'\n"; + +static ssize_t +nilfs_superblock_README_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, sb_readme_str); +} + +NILFS_SUPERBLOCK_RO_ATTR(sb_write_time); +NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs); +NILFS_SUPERBLOCK_RO_ATTR(sb_write_count); +NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency); +NILFS_SUPERBLOCK_RO_ATTR(README); + +static struct attribute *nilfs_superblock_attrs[] = { + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time), + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs), + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count), + NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency), + NILFS_SUPERBLOCK_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(superblock, dev); +NILFS_DEV_INT_GROUP_TYPE(superblock, dev); +NILFS_DEV_INT_GROUP_FNS(superblock, dev); + +/************************************************************************ + * NILFS device attrs * + ************************************************************************/ + +static +ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u32 major = le32_to_cpu(sbp[0]->s_rev_level); + u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); + + return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor); +} + +static +ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize); +} + +static +ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); + + return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size); +} + +static +ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + sector_t free_blocks = 0; + + nilfs_count_free_blocks(nilfs, &free_blocks); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)free_blocks); +} + +static +ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid); +} + +static +ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n", + sbp[0]->s_volume_name); +} + +static const char dev_readme_str[] = + "The group contains attributes that describe file system\n" + "partition's details.\n\n" + "(1) revision\n\tshow NILFS file system revision.\n\n" + "(2) blocksize\n\tshow volume block size in bytes.\n\n" + "(3) device_size\n\tshow volume size in bytes.\n\n" + "(4) free_blocks\n\tshow count of free blocks on volume.\n\n" + "(5) uuid\n\tshow volume's UUID.\n\n" + "(6) volume_name\n\tshow volume's name.\n\n"; + +static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, dev_readme_str); +} + +NILFS_DEV_RO_ATTR(revision); +NILFS_DEV_RO_ATTR(blocksize); +NILFS_DEV_RO_ATTR(device_size); +NILFS_DEV_RO_ATTR(free_blocks); +NILFS_DEV_RO_ATTR(uuid); +NILFS_DEV_RO_ATTR(volume_name); +NILFS_DEV_RO_ATTR(README); + +static struct attribute *nilfs_dev_attrs[] = { + NILFS_DEV_ATTR_LIST(revision), + NILFS_DEV_ATTR_LIST(blocksize), + NILFS_DEV_ATTR_LIST(device_size), + NILFS_DEV_ATTR_LIST(free_blocks), + NILFS_DEV_ATTR_LIST(uuid), + NILFS_DEV_ATTR_LIST(volume_name), + NILFS_DEV_ATTR_LIST(README), + NULL, +}; + +static ssize_t nilfs_dev_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, + attr); + + return a->show ? a->show(a, nilfs, buf) : 0; +} + +static ssize_t nilfs_dev_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, + attr); + + return a->store ? a->store(a, nilfs, buf, len) : 0; +} + +static void nilfs_dev_attr_release(struct kobject *kobj) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + complete(&nilfs->ns_dev_kobj_unregister); +} + +static const struct sysfs_ops nilfs_dev_attr_ops = { + .show = nilfs_dev_attr_show, + .store = nilfs_dev_attr_store, +}; + +static struct kobj_type nilfs_dev_ktype = { + .default_attrs = nilfs_dev_attrs, + .sysfs_ops = &nilfs_dev_attr_ops, + .release = nilfs_dev_attr_release, +}; + +int nilfs_sysfs_create_device_group(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups); + int err; + + nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL); + if (unlikely(!nilfs->ns_dev_subgroups)) { + err = -ENOMEM; + printk(KERN_ERR "NILFS: unable to allocate memory for device group\n"); + goto failed_create_device_group; + } + + nilfs->ns_dev_kobj.kset = nilfs_kset; + init_completion(&nilfs->ns_dev_kobj_unregister); + err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL, + "%s", sb->s_id); + if (err) + goto free_dev_subgroups; + + err = nilfs_sysfs_create_mounted_snapshots_group(nilfs); + if (err) + goto cleanup_dev_kobject; + + err = nilfs_sysfs_create_checkpoints_group(nilfs); + if (err) + goto delete_mounted_snapshots_group; + + err = nilfs_sysfs_create_segments_group(nilfs); + if (err) + goto delete_checkpoints_group; + + err = nilfs_sysfs_create_superblock_group(nilfs); + if (err) + goto delete_segments_group; + + err = nilfs_sysfs_create_segctor_group(nilfs); + if (err) + goto delete_superblock_group; + + return 0; + +delete_superblock_group: + nilfs_sysfs_delete_superblock_group(nilfs); + +delete_segments_group: + nilfs_sysfs_delete_segments_group(nilfs); + +delete_checkpoints_group: + nilfs_sysfs_delete_checkpoints_group(nilfs); + +delete_mounted_snapshots_group: + nilfs_sysfs_delete_mounted_snapshots_group(nilfs); + +cleanup_dev_kobject: + kobject_del(&nilfs->ns_dev_kobj); + +free_dev_subgroups: + kfree(nilfs->ns_dev_subgroups); + +failed_create_device_group: + return err; +} + +void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) +{ + nilfs_sysfs_delete_mounted_snapshots_group(nilfs); + nilfs_sysfs_delete_checkpoints_group(nilfs); + nilfs_sysfs_delete_segments_group(nilfs); + nilfs_sysfs_delete_superblock_group(nilfs); + nilfs_sysfs_delete_segctor_group(nilfs); + kobject_del(&nilfs->ns_dev_kobj); + kfree(nilfs->ns_dev_subgroups); +} + +/************************************************************************ + * NILFS feature attrs * + ************************************************************************/ + +static ssize_t nilfs_feature_revision_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d.%d\n", + NILFS_CURRENT_REV, NILFS_MINOR_REV); +} + +static const char features_readme_str[] = + "The features group contains attributes that describe NILFS file\n" + "system driver features.\n\n" + "(1) revision\n\tshow current revision of NILFS file system driver.\n"; + +static ssize_t nilfs_feature_README_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, features_readme_str); +} + +NILFS_FEATURE_RO_ATTR(revision); +NILFS_FEATURE_RO_ATTR(README); + +static struct attribute *nilfs_feature_attrs[] = { + NILFS_FEATURE_ATTR_LIST(revision), + NILFS_FEATURE_ATTR_LIST(README), + NULL, +}; + +static const struct attribute_group nilfs_feature_attr_group = { + .name = "features", + .attrs = nilfs_feature_attrs, +}; + +int __init nilfs_sysfs_init(void) +{ + int err; + + nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj); + if (!nilfs_kset) { + err = -ENOMEM; + printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n", + err); + goto failed_sysfs_init; + } + + err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: unable to create feature group: err %d\n", + err); + goto cleanup_sysfs_init; + } + + return 0; + +cleanup_sysfs_init: + kset_unregister(nilfs_kset); + +failed_sysfs_init: + return err; +} + +void nilfs_sysfs_exit(void) +{ + sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); + kset_unregister(nilfs_kset); +} diff --git a/kernel/fs/nilfs2/sysfs.h b/kernel/fs/nilfs2/sysfs.h new file mode 100644 index 000000000..677e3a1a8 --- /dev/null +++ b/kernel/fs/nilfs2/sysfs.h @@ -0,0 +1,176 @@ +/* + * sysfs.h - sysfs support declarations. + * + * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2014 HGST, Inc., a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by Vyacheslav Dubeyko + */ + +#ifndef _NILFS_SYSFS_H +#define _NILFS_SYSFS_H + +#include + +#define NILFS_ROOT_GROUP_NAME "nilfs2" + +/* + * struct nilfs_sysfs_dev_subgroups - device subgroup kernel objects + * @sg_superblock_kobj: /sys/fs///superblock + * @sg_superblock_kobj_unregister: completion state + * @sg_segctor_kobj: /sys/fs///segctor + * @sg_segctor_kobj_unregister: completion state + * @sg_mounted_snapshots_kobj: /sys/fs///mounted_snapshots + * @sg_mounted_snapshots_kobj_unregister: completion state + * @sg_checkpoints_kobj: /sys/fs///checkpoints + * @sg_checkpoints_kobj_unregister: completion state + * @sg_segments_kobj: /sys/fs///segments + * @sg_segments_kobj_unregister: completion state + */ +struct nilfs_sysfs_dev_subgroups { + /* /sys/fs///superblock */ + struct kobject sg_superblock_kobj; + struct completion sg_superblock_kobj_unregister; + + /* /sys/fs///segctor */ + struct kobject sg_segctor_kobj; + struct completion sg_segctor_kobj_unregister; + + /* /sys/fs///mounted_snapshots */ + struct kobject sg_mounted_snapshots_kobj; + struct completion sg_mounted_snapshots_kobj_unregister; + + /* /sys/fs///checkpoints */ + struct kobject sg_checkpoints_kobj; + struct completion sg_checkpoints_kobj_unregister; + + /* /sys/fs///segments */ + struct kobject sg_segments_kobj; + struct completion sg_segments_kobj_unregister; +}; + +#define NILFS_COMMON_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct kobject *, struct attribute *, \ + char *); \ + ssize_t (*store)(struct kobject *, struct attribute *, \ + const char *, size_t); \ +}; + +NILFS_COMMON_ATTR_STRUCT(feature); + +#define NILFS_DEV_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct nilfs_##name##_attr *, struct the_nilfs *, \ + char *); \ + ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \ + const char *, size_t); \ +}; + +NILFS_DEV_ATTR_STRUCT(dev); +NILFS_DEV_ATTR_STRUCT(segments); +NILFS_DEV_ATTR_STRUCT(mounted_snapshots); +NILFS_DEV_ATTR_STRUCT(checkpoints); +NILFS_DEV_ATTR_STRUCT(superblock); +NILFS_DEV_ATTR_STRUCT(segctor); + +#define NILFS_CP_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct nilfs_##name##_attr *, struct nilfs_root *, \ + char *); \ + ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \ + const char *, size_t); \ +}; + +NILFS_CP_ATTR_STRUCT(snapshot); + +#define NILFS_ATTR(type, name, mode, show, store) \ + static struct nilfs_##type##_attr nilfs_##type##_attr_##name = \ + __ATTR(name, mode, show, store) + +#define NILFS_INFO_ATTR(type, name) \ + NILFS_ATTR(type, name, 0444, NULL, NULL) +#define NILFS_RO_ATTR(type, name) \ + NILFS_ATTR(type, name, 0444, nilfs_##type##_##name##_show, NULL) +#define NILFS_RW_ATTR(type, name) \ + NILFS_ATTR(type, name, 0644, \ + nilfs_##type##_##name##_show, \ + nilfs_##type##_##name##_store) + +#define NILFS_FEATURE_INFO_ATTR(name) \ + NILFS_INFO_ATTR(feature, name) +#define NILFS_FEATURE_RO_ATTR(name) \ + NILFS_RO_ATTR(feature, name) +#define NILFS_FEATURE_RW_ATTR(name) \ + NILFS_RW_ATTR(feature, name) + +#define NILFS_DEV_INFO_ATTR(name) \ + NILFS_INFO_ATTR(dev, name) +#define NILFS_DEV_RO_ATTR(name) \ + NILFS_RO_ATTR(dev, name) +#define NILFS_DEV_RW_ATTR(name) \ + NILFS_RW_ATTR(dev, name) + +#define NILFS_SEGMENTS_RO_ATTR(name) \ + NILFS_RO_ATTR(segments, name) +#define NILFS_SEGMENTS_RW_ATTR(name) \ + NILFS_RW_ATTR(segs_info, name) + +#define NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(name) \ + NILFS_RO_ATTR(mounted_snapshots, name) + +#define NILFS_CHECKPOINTS_RO_ATTR(name) \ + NILFS_RO_ATTR(checkpoints, name) +#define NILFS_CHECKPOINTS_RW_ATTR(name) \ + NILFS_RW_ATTR(checkpoints, name) + +#define NILFS_SNAPSHOT_INFO_ATTR(name) \ + NILFS_INFO_ATTR(snapshot, name) +#define NILFS_SNAPSHOT_RO_ATTR(name) \ + NILFS_RO_ATTR(snapshot, name) +#define NILFS_SNAPSHOT_RW_ATTR(name) \ + NILFS_RW_ATTR(snapshot, name) + +#define NILFS_SUPERBLOCK_RO_ATTR(name) \ + NILFS_RO_ATTR(superblock, name) +#define NILFS_SUPERBLOCK_RW_ATTR(name) \ + NILFS_RW_ATTR(superblock, name) + +#define NILFS_SEGCTOR_INFO_ATTR(name) \ + NILFS_INFO_ATTR(segctor, name) +#define NILFS_SEGCTOR_RO_ATTR(name) \ + NILFS_RO_ATTR(segctor, name) +#define NILFS_SEGCTOR_RW_ATTR(name) \ + NILFS_RW_ATTR(segctor, name) + +#define NILFS_FEATURE_ATTR_LIST(name) \ + (&nilfs_feature_attr_##name.attr) +#define NILFS_DEV_ATTR_LIST(name) \ + (&nilfs_dev_attr_##name.attr) +#define NILFS_SEGMENTS_ATTR_LIST(name) \ + (&nilfs_segments_attr_##name.attr) +#define NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(name) \ + (&nilfs_mounted_snapshots_attr_##name.attr) +#define NILFS_CHECKPOINTS_ATTR_LIST(name) \ + (&nilfs_checkpoints_attr_##name.attr) +#define NILFS_SNAPSHOT_ATTR_LIST(name) \ + (&nilfs_snapshot_attr_##name.attr) +#define NILFS_SUPERBLOCK_ATTR_LIST(name) \ + (&nilfs_superblock_attr_##name.attr) +#define NILFS_SEGCTOR_ATTR_LIST(name) \ + (&nilfs_segctor_attr_##name.attr) + +#endif /* _NILFS_SYSFS_H */ diff --git a/kernel/fs/nilfs2/the_nilfs.c b/kernel/fs/nilfs2/the_nilfs.c new file mode 100644 index 000000000..69bd801af --- /dev/null +++ b/kernel/fs/nilfs2/the_nilfs.c @@ -0,0 +1,815 @@ +/* + * the_nilfs.c - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "alloc.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" +#include "segbuf.h" + + +static int nilfs_valid_sb(struct nilfs_super_block *sbp); + +void nilfs_set_last_segment(struct the_nilfs *nilfs, + sector_t start_blocknr, u64 seq, __u64 cno) +{ + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_last_pseg = start_blocknr; + nilfs->ns_last_seq = seq; + nilfs->ns_last_cno = cno; + + if (!nilfs_sb_dirty(nilfs)) { + if (nilfs->ns_prev_seq == nilfs->ns_last_seq) + goto stay_cursor; + + set_nilfs_sb_dirty(nilfs); + } + nilfs->ns_prev_seq = nilfs->ns_last_seq; + + stay_cursor: + spin_unlock(&nilfs->ns_last_segment_lock); +} + +/** + * alloc_nilfs - allocate a nilfs object + * @bdev: block device to which the_nilfs is related + * + * Return Value: On success, pointer to the_nilfs is returned. + * On error, NULL is returned. + */ +struct the_nilfs *alloc_nilfs(struct block_device *bdev) +{ + struct the_nilfs *nilfs; + + nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL); + if (!nilfs) + return NULL; + + nilfs->ns_bdev = bdev; + atomic_set(&nilfs->ns_ndirtyblks, 0); + init_rwsem(&nilfs->ns_sem); + mutex_init(&nilfs->ns_snapshot_mount_mutex); + INIT_LIST_HEAD(&nilfs->ns_dirty_files); + INIT_LIST_HEAD(&nilfs->ns_gc_inodes); + spin_lock_init(&nilfs->ns_inode_lock); + spin_lock_init(&nilfs->ns_next_gen_lock); + spin_lock_init(&nilfs->ns_last_segment_lock); + nilfs->ns_cptree = RB_ROOT; + spin_lock_init(&nilfs->ns_cptree_lock); + init_rwsem(&nilfs->ns_segctor_sem); + nilfs->ns_sb_update_freq = NILFS_SB_FREQ; + + return nilfs; +} + +/** + * destroy_nilfs - destroy nilfs object + * @nilfs: nilfs object to be released + */ +void destroy_nilfs(struct the_nilfs *nilfs) +{ + might_sleep(); + if (nilfs_init(nilfs)) { + nilfs_sysfs_delete_device_group(nilfs); + brelse(nilfs->ns_sbh[0]); + brelse(nilfs->ns_sbh[1]); + } + kfree(nilfs); +} + +static int nilfs_load_super_root(struct the_nilfs *nilfs, + struct super_block *sb, sector_t sr_block) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_inode *rawi; + unsigned dat_entry_size, segment_usage_size, checkpoint_size; + unsigned inode_size; + int err; + + err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); + if (unlikely(err)) + return err; + + down_read(&nilfs->ns_sem); + dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); + checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); + segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); + up_read(&nilfs->ns_sem); + + inode_size = nilfs->ns_inode_size; + + rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size); + err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat); + if (err) + goto failed; + + rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size); + err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile); + if (err) + goto failed_dat; + + rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size); + err = nilfs_sufile_read(sb, segment_usage_size, rawi, + &nilfs->ns_sufile); + if (err) + goto failed_cpfile; + + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); + + failed: + brelse(bh_sr); + return err; + + failed_cpfile: + iput(nilfs->ns_cpfile); + + failed_dat: + iput(nilfs->ns_dat); + goto failed; +} + +static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) +{ + memset(ri, 0, sizeof(*ri)); + INIT_LIST_HEAD(&ri->ri_used_segments); +} + +static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) +{ + nilfs_dispose_segment_list(&ri->ri_used_segments); +} + +/** + * nilfs_store_log_cursor - load log cursor from a super block + * @nilfs: nilfs object + * @sbp: buffer storing super block to be read + * + * nilfs_store_log_cursor() reads the last position of the log + * containing a super root from a given super block, and initializes + * relevant information on the nilfs object preparatory for log + * scanning and recovery. + */ +static int nilfs_store_log_cursor(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + int ret = 0; + + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_prev_seq = nilfs->ns_last_seq; + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS invalid last segment number.\n"); + ret = -EINVAL; + } + return ret; +} + +/** + * load_nilfs - load and recover the nilfs + * @nilfs: the_nilfs structure to be released + * @sb: super block isntance used to recover past segment + * + * load_nilfs() searches and load the latest super root, + * attaches the last segment, and does recovery if needed. + * The caller must call this exclusively for simultaneous mounts. + */ +int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) +{ + struct nilfs_recovery_info ri; + unsigned int s_flags = sb->s_flags; + int really_read_only = bdev_read_only(nilfs->ns_bdev); + int valid_fs = nilfs_valid_fs(nilfs); + int err; + + if (!valid_fs) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); + } + } + + nilfs_init_recovery_info(&ri); + + err = nilfs_search_super_root(nilfs, &ri); + if (unlikely(err)) { + struct nilfs_super_block **sbp = nilfs->ns_sbp; + int blocksize; + + if (err != -EINVAL) + goto scan_error; + + if (!nilfs_valid_sb(sbp[1])) { + printk(KERN_WARNING + "NILFS warning: unable to fall back to spare" + "super block\n"); + goto scan_error; + } + printk(KERN_INFO + "NILFS: try rollback from an earlier position\n"); + + /* + * restore super block with its spare and reconfigure + * relevant states of the nilfs object. + */ + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed); + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); + + /* verify consistency between two super blocks */ + blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + if (blocksize != nilfs->ns_blocksize) { + printk(KERN_WARNING + "NILFS warning: blocksize differs between " + "two super blocks (%d != %d)\n", + blocksize, nilfs->ns_blocksize); + goto scan_error; + } + + err = nilfs_store_log_cursor(nilfs, sbp[0]); + if (err) + goto scan_error; + + /* drop clean flag to allow roll-forward and recovery */ + nilfs->ns_mount_state &= ~NILFS_VALID_FS; + valid_fs = 0; + + err = nilfs_search_super_root(nilfs, &ri); + if (err) + goto scan_error; + } + + err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error loading super root.\n"); + goto failed; + } + + if (valid_fs) + goto skip_recovery; + + if (s_flags & MS_RDONLY) { + __u64 features; + + if (nilfs_test_opt(nilfs, NORECOVERY)) { + printk(KERN_INFO "NILFS: norecovery option specified. " + "skipping roll-forward recovery\n"); + goto skip_recovery; + } + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't proceed with " + "recovery because of unsupported optional " + "features (%llx)\n", + (unsigned long long)features); + err = -EROFS; + goto failed_unload; + } + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed_unload; + } + sb->s_flags &= ~MS_RDONLY; + } else if (nilfs_test_opt(nilfs, NORECOVERY)) { + printk(KERN_ERR "NILFS: recovery cancelled because norecovery " + "option was specified for a read/write mount\n"); + err = -EINVAL; + goto failed_unload; + } + + err = nilfs_salvage_orphan_logs(nilfs, sb, &ri); + if (err) + goto failed_unload; + + down_write(&nilfs->ns_sem); + nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ + err = nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + + if (err) { + printk(KERN_ERR "NILFS: failed to update super block. " + "recovery unfinished.\n"); + goto failed_unload; + } + printk(KERN_INFO "NILFS: recovery complete.\n"); + + skip_recovery: + nilfs_clear_recovery_info(&ri); + sb->s_flags = s_flags; + return 0; + + scan_error: + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + + failed_unload: + iput(nilfs->ns_cpfile); + iput(nilfs->ns_sufile); + iput(nilfs->ns_dat); + + failed: + nilfs_clear_recovery_info(&ri); + sb->s_flags = s_flags; + return err; +} + +static unsigned long long nilfs_max_size(unsigned int blkbits) +{ + unsigned int max_bits; + unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ + + max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ + if (max_bits < 64) + res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); + return res; +} + +/** + * nilfs_nrsvsegs - calculate the number of reserved segments + * @nilfs: nilfs object + * @nsegs: total number of segments + */ +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) +{ + return max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage, + 100)); +} + +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) +{ + nilfs->ns_nsegments = nsegs; + nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs); +} + +static int nilfs_store_disk_layout(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { + printk(KERN_ERR "NILFS: unsupported revision " + "(superblock rev.=%d.%d, current rev.=%d.%d). " + "Please check the version of mkfs.nilfs.\n", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); + return -EINVAL; + } + nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); + if (nilfs->ns_sbsize > BLOCK_SIZE) + return -EINVAL; + + nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + if (nilfs->ns_inode_size > nilfs->ns_blocksize) { + printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n", + nilfs->ns_inode_size); + return -EINVAL; + } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) { + printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n", + nilfs->ns_inode_size); + return -EINVAL; + } + + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + + nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { + printk(KERN_ERR "NILFS: too short segment.\n"); + return -EINVAL; + } + + nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); + nilfs->ns_r_segments_percentage = + le32_to_cpu(sbp->s_r_segments_percentage); + if (nilfs->ns_r_segments_percentage < 1 || + nilfs->ns_r_segments_percentage > 99) { + printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n"); + return -EINVAL; + } + + nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); + nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); + return 0; +} + +static int nilfs_valid_sb(struct nilfs_super_block *sbp) +{ + static unsigned char sum[4]; + const int sumoff = offsetof(struct nilfs_super_block, s_sum); + size_t bytes; + u32 crc; + + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); + if (bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); + crc = crc32_le(crc, sum, 4); + crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, + bytes - sumoff - 4); + return crc == le32_to_cpu(sbp->s_sum); +} + +static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +{ + return offset < ((le64_to_cpu(sbp->s_nsegments) * + le32_to_cpu(sbp->s_blocks_per_segment)) << + (le32_to_cpu(sbp->s_log_block_size) + 10)); +} + +static void nilfs_release_super_block(struct the_nilfs *nilfs) +{ + int i; + + for (i = 0; i < 2; i++) { + if (nilfs->ns_sbp[i]) { + brelse(nilfs->ns_sbh[i]); + nilfs->ns_sbh[i] = NULL; + nilfs->ns_sbp[i] = NULL; + } + } +} + +void nilfs_fall_back_super_block(struct the_nilfs *nilfs) +{ + brelse(nilfs->ns_sbh[0]); + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = NULL; + nilfs->ns_sbp[1] = NULL; +} + +void nilfs_swap_super_block(struct the_nilfs *nilfs) +{ + struct buffer_head *tsbh = nilfs->ns_sbh[0]; + struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = tsbh; + nilfs->ns_sbp[1] = tsbp; +} + +static int nilfs_load_super_block(struct the_nilfs *nilfs, + struct super_block *sb, int blocksize, + struct nilfs_super_block **sbpp) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct buffer_head **sbh = nilfs->ns_sbh; + u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + int valid[2], swp = 0; + + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, + &sbh[0]); + sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); + + if (!sbp[0]) { + if (!sbp[1]) { + printk(KERN_ERR "NILFS: unable to read superblock\n"); + return -EIO; + } + printk(KERN_WARNING + "NILFS warning: unable to read primary superblock " + "(blocksize = %d)\n", blocksize); + } else if (!sbp[1]) { + printk(KERN_WARNING + "NILFS warning: unable to read secondary superblock " + "(blocksize = %d)\n", blocksize); + } + + /* + * Compare two super blocks and set 1 in swp if the secondary + * super block is valid and newer. Otherwise, set 0 in swp. + */ + valid[0] = nilfs_valid_sb(sbp[0]); + valid[1] = nilfs_valid_sb(sbp[1]); + swp = valid[1] && (!valid[0] || + le64_to_cpu(sbp[1]->s_last_cno) > + le64_to_cpu(sbp[0]->s_last_cno)); + + if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { + brelse(sbh[1]); + sbh[1] = NULL; + sbp[1] = NULL; + valid[1] = 0; + swp = 0; + } + if (!valid[swp]) { + nilfs_release_super_block(nilfs); + printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", + sb->s_id); + return -EINVAL; + } + + if (!valid[!swp]) + printk(KERN_WARNING "NILFS warning: broken superblock. " + "using spare superblock (blocksize = %d).\n", blocksize); + if (swp) + nilfs_swap_super_block(nilfs); + + nilfs->ns_sbwcount = 0; + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); + nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + *sbpp = sbp[0]; + return 0; +} + +/** + * init_nilfs - initialize a NILFS instance. + * @nilfs: the_nilfs structure + * @sb: super block + * @data: mount options + * + * init_nilfs() performs common initialization per block device (e.g. + * reading the super block, getting disk layout information, initializing + * shared fields in the_nilfs). + * + * Return Value: On success, 0 is returned. On error, a negative error + * code is returned. + */ +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) +{ + struct nilfs_super_block *sbp; + int blocksize; + int err; + + down_write(&nilfs->ns_sem); + + blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "NILFS: unable to set blocksize\n"); + err = -EINVAL; + goto out; + } + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto failed_sbh; + + err = nilfs_check_feature_compatibility(sb, sbp); + if (err) + goto failed_sbh; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (blocksize < NILFS_MIN_BLOCK_SIZE || + blocksize > NILFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "filesystem blocksize %d\n", blocksize); + err = -EINVAL; + goto failed_sbh; + } + if (sb->s_blocksize != blocksize) { + int hw_blocksize = bdev_logical_block_size(sb->s_bdev); + + if (blocksize < hw_blocksize) { + printk(KERN_ERR + "NILFS: blocksize %d too small for device " + "(sector-size = %d).\n", + blocksize, hw_blocksize); + err = -EINVAL; + goto failed_sbh; + } + nilfs_release_super_block(nilfs); + sb_set_blocksize(sb, blocksize); + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + /* not failed_sbh; sbh is released automatically + when reloading fails. */ + } + nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + nilfs->ns_blocksize = blocksize; + + get_random_bytes(&nilfs->ns_next_generation, + sizeof(nilfs->ns_next_generation)); + + err = nilfs_store_disk_layout(nilfs, sbp); + if (err) + goto failed_sbh; + + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + + nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); + + err = nilfs_store_log_cursor(nilfs, sbp); + if (err) + goto failed_sbh; + + err = nilfs_sysfs_create_device_group(sb); + if (err) + goto failed_sbh; + + set_nilfs_init(nilfs); + err = 0; + out: + up_write(&nilfs->ns_sem); + return err; + + failed_sbh: + nilfs_release_super_block(nilfs); + goto out; +} + +int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, + size_t nsegs) +{ + sector_t seg_start, seg_end; + sector_t start = 0, nblocks = 0; + unsigned int sects_per_block; + __u64 *sn; + int ret = 0; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + for (sn = segnump; sn < segnump + nsegs; sn++) { + nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end); + + if (!nblocks) { + start = seg_start; + nblocks = seg_end - seg_start + 1; + } else if (start + nblocks == seg_start) { + nblocks += seg_end - seg_start + 1; + } else { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (ret < 0) + return ret; + nblocks = 0; + } + } + if (nblocks) + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + return ret; +} + +int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) +{ + unsigned long ncleansegs; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return 0; +} + +int nilfs_near_disk_full(struct the_nilfs *nilfs) +{ + unsigned long ncleansegs, nincsegs; + + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / + nilfs->ns_blocks_per_segment + 1; + + return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; +} + +struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) +{ + struct rb_node *n; + struct nilfs_root *root; + + spin_lock(&nilfs->ns_cptree_lock); + n = nilfs->ns_cptree.rb_node; + while (n) { + root = rb_entry(n, struct nilfs_root, rb_node); + + if (cno < root->cno) { + n = n->rb_left; + } else if (cno > root->cno) { + n = n->rb_right; + } else { + atomic_inc(&root->count); + spin_unlock(&nilfs->ns_cptree_lock); + return root; + } + } + spin_unlock(&nilfs->ns_cptree_lock); + + return NULL; +} + +struct nilfs_root * +nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) +{ + struct rb_node **p, *parent; + struct nilfs_root *root, *new; + int err; + + root = nilfs_lookup_root(nilfs, cno); + if (root) + return root; + + new = kzalloc(sizeof(*root), GFP_KERNEL); + if (!new) + return NULL; + + spin_lock(&nilfs->ns_cptree_lock); + + p = &nilfs->ns_cptree.rb_node; + parent = NULL; + + while (*p) { + parent = *p; + root = rb_entry(parent, struct nilfs_root, rb_node); + + if (cno < root->cno) { + p = &(*p)->rb_left; + } else if (cno > root->cno) { + p = &(*p)->rb_right; + } else { + atomic_inc(&root->count); + spin_unlock(&nilfs->ns_cptree_lock); + kfree(new); + return root; + } + } + + new->cno = cno; + new->ifile = NULL; + new->nilfs = nilfs; + atomic_set(&new->count, 1); + atomic64_set(&new->inodes_count, 0); + atomic64_set(&new->blocks_count, 0); + + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &nilfs->ns_cptree); + + spin_unlock(&nilfs->ns_cptree_lock); + + err = nilfs_sysfs_create_snapshot_group(new); + if (err) { + kfree(new); + new = NULL; + } + + return new; +} + +void nilfs_put_root(struct nilfs_root *root) +{ + if (atomic_dec_and_test(&root->count)) { + struct the_nilfs *nilfs = root->nilfs; + + nilfs_sysfs_delete_snapshot_group(root); + + spin_lock(&nilfs->ns_cptree_lock); + rb_erase(&root->rb_node, &nilfs->ns_cptree); + spin_unlock(&nilfs->ns_cptree_lock); + iput(root->ifile); + + kfree(root); + } +} diff --git a/kernel/fs/nilfs2/the_nilfs.h b/kernel/fs/nilfs2/the_nilfs.h new file mode 100644 index 000000000..23778d385 --- /dev/null +++ b/kernel/fs/nilfs2/the_nilfs.h @@ -0,0 +1,396 @@ +/* + * the_nilfs.h - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#ifndef _THE_NILFS_H +#define _THE_NILFS_H + +#include +#include +#include +#include +#include +#include +#include + +struct nilfs_sc_info; +struct nilfs_sysfs_dev_subgroups; + +/* the_nilfs struct */ +enum { + THE_NILFS_INIT = 0, /* Information from super_block is set */ + THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ + THE_NILFS_GC_RUNNING, /* gc process is running */ + THE_NILFS_SB_DIRTY, /* super block is dirty */ +}; + +/** + * struct the_nilfs - struct to supervise multiple nilfs mount points + * @ns_flags: flags + * @ns_flushed_device: flag indicating if all volatile data was flushed + * @ns_bdev: block device + * @ns_sem: semaphore for shared states + * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super block + * @ns_sbwcount: write count of super block + * @ns_sbsize: size of valid data in super block + * @ns_mount_state: file system state + * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds) + * @ns_seg_seq: segment sequence counter + * @ns_segnum: index number of the latest full segment. + * @ns_nextnum: index number of the full segment index to be used next + * @ns_pseg_offset: offset of next partial segment in the current full segment + * @ns_cno: next checkpoint number + * @ns_ctime: write time of the last segment + * @ns_nongc_ctime: write time of the last segment not for cleaner operation + * @ns_ndirtyblks: Number of dirty data blocks + * @ns_last_segment_lock: lock protecting fields for the latest segment + * @ns_last_pseg: start block number of the latest segment + * @ns_last_seq: sequence value of the latest segment + * @ns_last_cno: checkpoint number of the latest segment + * @ns_prot_seq: least sequence number of segments which must not be reclaimed + * @ns_prev_seq: base sequence number used to decide if advance log cursor + * @ns_writer: log writer + * @ns_segctor_sem: semaphore protecting log write + * @ns_dat: DAT file inode + * @ns_cpfile: checkpoint file inode + * @ns_sufile: segusage file inode + * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root) + * @ns_cptree_lock: lock protecting @ns_cptree + * @ns_dirty_files: list of dirty files + * @ns_inode_lock: lock protecting @ns_dirty_files + * @ns_gc_inodes: dummy inodes to keep live blocks + * @ns_next_generation: next generation number for inodes + * @ns_next_gen_lock: lock protecting @ns_next_generation + * @ns_mount_opt: mount options + * @ns_resuid: uid for reserved blocks + * @ns_resgid: gid for reserved blocks + * @ns_interval: checkpoint creation interval + * @ns_watermark: watermark for the number of dirty buffers + * @ns_blocksize_bits: bit length of block size + * @ns_blocksize: block size + * @ns_nsegments: number of segments in filesystem + * @ns_blocks_per_segment: number of blocks per segment + * @ns_r_segments_percentage: reserved segments percentage + * @ns_nrsvsegs: number of reserved segments + * @ns_first_data_block: block number of first data block + * @ns_inode_size: size of on-disk inode + * @ns_first_ino: first not-special inode number + * @ns_crc_seed: seed value of CRC32 calculation + * @ns_dev_kobj: /sys/fs// + * @ns_dev_kobj_unregister: completion state + * @ns_dev_subgroups: subgroups pointer + */ +struct the_nilfs { + unsigned long ns_flags; + int ns_flushed_device; + + struct block_device *ns_bdev; + struct rw_semaphore ns_sem; + struct mutex ns_snapshot_mount_mutex; + + /* + * used for + * - loading the latest checkpoint exclusively. + * - allocating a new full segment. + */ + struct buffer_head *ns_sbh[2]; + struct nilfs_super_block *ns_sbp[2]; + time_t ns_sbwtime; + unsigned ns_sbwcount; + unsigned ns_sbsize; + unsigned ns_mount_state; + unsigned ns_sb_update_freq; + + /* + * Following fields are dedicated to a writable FS-instance. + * Except for the period seeking checkpoint, code outside the segment + * constructor must lock a segment semaphore while accessing these + * fields. + * The writable FS-instance is sole during a lifetime of the_nilfs. + */ + u64 ns_seg_seq; + __u64 ns_segnum; + __u64 ns_nextnum; + unsigned long ns_pseg_offset; + __u64 ns_cno; + time_t ns_ctime; + time_t ns_nongc_ctime; + atomic_t ns_ndirtyblks; + + /* + * The following fields hold information on the latest partial segment + * written to disk with a super root. These fields are protected by + * ns_last_segment_lock. + */ + spinlock_t ns_last_segment_lock; + sector_t ns_last_pseg; + u64 ns_last_seq; + __u64 ns_last_cno; + u64 ns_prot_seq; + u64 ns_prev_seq; + + struct nilfs_sc_info *ns_writer; + struct rw_semaphore ns_segctor_sem; + + /* + * Following fields are lock free except for the period before + * the_nilfs is initialized. + */ + struct inode *ns_dat; + struct inode *ns_cpfile; + struct inode *ns_sufile; + + /* Checkpoint tree */ + struct rb_root ns_cptree; + spinlock_t ns_cptree_lock; + + /* Dirty inode list */ + struct list_head ns_dirty_files; + spinlock_t ns_inode_lock; + + /* GC inode list */ + struct list_head ns_gc_inodes; + + /* Inode allocator */ + u32 ns_next_generation; + spinlock_t ns_next_gen_lock; + + /* Mount options */ + unsigned long ns_mount_opt; + + uid_t ns_resuid; + gid_t ns_resgid; + unsigned long ns_interval; + unsigned long ns_watermark; + + /* Disk layout information (static) */ + unsigned int ns_blocksize_bits; + unsigned int ns_blocksize; + unsigned long ns_nsegments; + unsigned long ns_blocks_per_segment; + unsigned long ns_r_segments_percentage; + unsigned long ns_nrsvsegs; + unsigned long ns_first_data_block; + int ns_inode_size; + int ns_first_ino; + u32 ns_crc_seed; + + /* /sys/fs// */ + struct kobject ns_dev_kobj; + struct completion ns_dev_kobj_unregister; + struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups; +}; + +#define THE_NILFS_FNS(bit, name) \ +static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline int nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} + +THE_NILFS_FNS(INIT, init) +THE_NILFS_FNS(DISCONTINUED, discontinued) +THE_NILFS_FNS(GC_RUNNING, gc_running) +THE_NILFS_FNS(SB_DIRTY, sb_dirty) + +/* + * Mount option operations + */ +#define nilfs_clear_opt(nilfs, opt) \ + do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) +#define nilfs_set_opt(nilfs, opt) \ + do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0) +#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt) +#define nilfs_write_opt(nilfs, mask, opt) \ + do { (nilfs)->ns_mount_opt = \ + (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \ + NILFS_MOUNT_##opt); \ + } while (0) + +/** + * struct nilfs_root - nilfs root object + * @cno: checkpoint number + * @rb_node: red-black tree node + * @count: refcount of this structure + * @nilfs: nilfs object + * @ifile: inode file + * @inodes_count: number of inodes + * @blocks_count: number of blocks + * @snapshot_kobj: /sys/fs///mounted_snapshots/ + * @snapshot_kobj_unregister: completion state for kernel object + */ +struct nilfs_root { + __u64 cno; + struct rb_node rb_node; + + atomic_t count; + struct the_nilfs *nilfs; + struct inode *ifile; + + atomic64_t inodes_count; + atomic64_t blocks_count; + + /* /sys/fs///mounted_snapshots/ */ + struct kobject snapshot_kobj; + struct completion snapshot_kobj_unregister; +}; + +/* Special checkpoint number */ +#define NILFS_CPTREE_CURRENT_CNO 0 + +/* Minimum interval of periodical update of superblocks (in seconds) */ +#define NILFS_SB_FREQ 10 + +static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + return t < nilfs->ns_sbwtime || + t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; +} + +static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) +{ + int flip_bits = nilfs->ns_sbwcount & 0x0FL; + return (flip_bits != 0x08 && flip_bits != 0x0F); +} + +void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); +struct the_nilfs *alloc_nilfs(struct block_device *bdev); +void destroy_nilfs(struct the_nilfs *nilfs); +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); +int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs); +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs); +int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); +int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); +struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); +struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs, + __u64 cno); +void nilfs_put_root(struct nilfs_root *root); +int nilfs_near_disk_full(struct the_nilfs *); +void nilfs_fall_back_super_block(struct the_nilfs *); +void nilfs_swap_super_block(struct the_nilfs *); + + +static inline void nilfs_get_root(struct nilfs_root *root) +{ + atomic_inc(&root->count); +} + +static inline int nilfs_valid_fs(struct the_nilfs *nilfs) +{ + unsigned valid_fs; + + down_read(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_read(&nilfs->ns_sem); + return valid_fs; +} + +static inline void +nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, + sector_t *seg_start, sector_t *seg_end) +{ + *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum; + *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1; + if (segnum == 0) + *seg_start = nilfs->ns_first_data_block; +} + +static inline sector_t +nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum) +{ + return (segnum == 0) ? nilfs->ns_first_data_block : + (sector_t)nilfs->ns_blocks_per_segment * segnum; +} + +static inline __u64 +nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr) +{ + sector_t segnum = blocknr; + + sector_div(segnum, nilfs->ns_blocks_per_segment); + return segnum; +} + +static inline void +nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start, + sector_t seg_end) +{ + /* terminate the current full segment (used in case of I/O-error) */ + nilfs->ns_pseg_offset = seg_end - seg_start + 1; +} + +static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs) +{ + /* move forward with a full segment */ + nilfs->ns_segnum = nilfs->ns_nextnum; + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq++; +} + +static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs) +{ + __u64 cno; + + spin_lock(&nilfs->ns_last_segment_lock); + cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + return cno; +} + +static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) +{ + return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; +} + +static inline int nilfs_flush_device(struct the_nilfs *nilfs) +{ + int err; + + if (!nilfs_test_opt(nilfs, BARRIER) || nilfs->ns_flushed_device) + return 0; + + nilfs->ns_flushed_device = 1; + /* + * the store to ns_flushed_device must not be reordered after + * blkdev_issue_flush(). + */ + smp_wmb(); + + err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL); + if (err != -EIO) + err = 0; + return err; +} + +#endif /* _THE_NILFS_H */ diff --git a/kernel/fs/nls/Kconfig b/kernel/fs/nls/Kconfig new file mode 100644 index 000000000..e2ce79ef4 --- /dev/null +++ b/kernel/fs/nls/Kconfig @@ -0,0 +1,619 @@ +# +# Native language support configuration +# + +menuconfig NLS + tristate "Native language support" + ---help--- + The base Native Language Support. A number of filesystems + depend on it (e.g. FAT, JOLIET, NT, BEOS filesystems), as well + as the ability of some filesystems to use native languages + (NCP, SMB). + + If unsure, say Y. + + To compile this code as a module, choose M here: the module + will be called nls_base. + +if NLS + +config NLS_DEFAULT + string "Default NLS Option" + default "iso8859-1" + ---help--- + The default NLS used when mounting file system. Note, that this is + the NLS used by your console, not the NLS used by a specific file + system (if different) to store data (filenames) on a disk. + Currently, the valid values are: + big5, cp437, cp737, cp775, cp850, cp852, cp855, cp857, cp860, cp861, + cp862, cp863, cp864, cp865, cp866, cp869, cp874, cp932, cp936, + cp949, cp950, cp1251, cp1255, euc-jp, euc-kr, gb2312, iso8859-1, + iso8859-2, iso8859-3, iso8859-4, iso8859-5, iso8859-6, iso8859-7, + iso8859-8, iso8859-9, iso8859-13, iso8859-14, iso8859-15, + koi8-r, koi8-ru, koi8-u, sjis, tis-620, macroman, utf8. + If you specify a wrong value, it will use the built-in NLS; + compatible with iso8859-1. + + If unsure, specify it as "iso8859-1". + +config NLS_CODEPAGE_437 + tristate "Codepage 437 (United States, Canada)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used in + the United States and parts of Canada. This is recommended. + +config NLS_CODEPAGE_737 + tristate "Codepage 737 (Greek)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used for + Greek. If unsure, say N. + +config NLS_CODEPAGE_775 + tristate "Codepage 775 (Baltic Rim)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used + for the Baltic Rim Languages (Latvian and Lithuanian). If unsure, + say N. + +config NLS_CODEPAGE_850 + tristate "Codepage 850 (Europe)" + ---help--- + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used for + much of Europe -- United Kingdom, Germany, Spain, Italy, and [add + more countries here]. It has some characters useful to many European + languages that are not part of the US codepage 437. + + If unsure, say Y. + +config NLS_CODEPAGE_852 + tristate "Codepage 852 (Central/Eastern Europe)" + ---help--- + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Latin 2 codepage used by DOS + for much of Central and Eastern Europe. It has all the required + characters for these languages: Albanian, Croatian, Czech, English, + Finnish, Hungarian, Irish, German, Polish, Romanian, Serbian (Latin + transcription), Slovak, Slovenian, and Sorbian. + +config NLS_CODEPAGE_855 + tristate "Codepage 855 (Cyrillic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Cyrillic. + +config NLS_CODEPAGE_857 + tristate "Codepage 857 (Turkish)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Turkish. + +config NLS_CODEPAGE_860 + tristate "Codepage 860 (Portuguese)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Portuguese. + +config NLS_CODEPAGE_861 + tristate "Codepage 861 (Icelandic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Icelandic. + +config NLS_CODEPAGE_862 + tristate "Codepage 862 (Hebrew)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Hebrew. + +config NLS_CODEPAGE_863 + tristate "Codepage 863 (Canadian French)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Canadian + French. + +config NLS_CODEPAGE_864 + tristate "Codepage 864 (Arabic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Arabic. + +config NLS_CODEPAGE_865 + tristate "Codepage 865 (Norwegian, Danish)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for the Nordic + European countries. + +config NLS_CODEPAGE_866 + tristate "Codepage 866 (Cyrillic/Russian)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for + Cyrillic/Russian. + +config NLS_CODEPAGE_869 + tristate "Codepage 869 (Greek)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Greek. + +config NLS_CODEPAGE_936 + tristate "Simplified Chinese charset (CP936, GB2312)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Simplified + Chinese(GBK). + +config NLS_CODEPAGE_950 + tristate "Traditional Chinese charset (Big5)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Traditional + Chinese(Big5). + +config NLS_CODEPAGE_932 + tristate "Japanese charsets (Shift-JIS, EUC-JP)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Shift-JIS + or EUC-JP. To use EUC-JP, you can use 'euc-jp' as mount option or + NLS Default value during kernel configuration, instead of 'cp932'. + +config NLS_CODEPAGE_949 + tristate "Korean charset (CP949, EUC-KR)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for UHC. + +config NLS_CODEPAGE_874 + tristate "Thai charset (CP874, TIS-620)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Thai. + +config NLS_ISO8859_8 + tristate "Hebrew charsets (ISO-8859-8, CP1255)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-8, the Hebrew + character set. + +config NLS_CODEPAGE_1250 + tristate "Windows CP1250 (Slavic/Central European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CDROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Windows CP-1250 + character set, which works for most Latin-written Slavic and Central + European languages: Czech, German, Hungarian, Polish, Rumanian, Croatian, + Slovak, Slovene. + +config NLS_CODEPAGE_1251 + tristate "Windows CP1251 (Bulgarian, Belarusian)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Russian and + Bulgarian and Belarusian. + +config NLS_ASCII + tristate "ASCII (United States)" + help + An ASCII NLS module is needed if you want to override the + DEFAULT NLS with this very basic charset and don't want any + non-ASCII characters to be translated. + +config NLS_ISO8859_1 + tristate "NLS ISO 8859-1 (Latin 1; Western European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 1 character + set, which covers most West European languages such as Albanian, + Catalan, Danish, Dutch, English, Faeroese, Finnish, French, German, + Galician, Irish, Icelandic, Italian, Norwegian, Portuguese, Spanish, + and Swedish. It is also the default for the US. If unsure, say Y. + +config NLS_ISO8859_2 + tristate "NLS ISO 8859-2 (Latin 2; Slavic/Central European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 2 character + set, which works for most Latin-written Slavic and Central European + languages: Czech, German, Hungarian, Polish, Rumanian, Croatian, + Slovak, Slovene. + +config NLS_ISO8859_3 + tristate "NLS ISO 8859-3 (Latin 3; Esperanto, Galician, Maltese, Turkish)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 3 character + set, which is popular with authors of Esperanto, Galician, Maltese, + and Turkish. + +config NLS_ISO8859_4 + tristate "NLS ISO 8859-4 (Latin 4; old Baltic charset)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 4 character + set which introduces letters for Estonian, Latvian, and + Lithuanian. It is an incomplete predecessor of Latin 7. + +config NLS_ISO8859_5 + tristate "NLS ISO 8859-5 (Cyrillic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-5, a Cyrillic + character set with which you can type Bulgarian, Belarusian, + Macedonian, Russian, Serbian, and Ukrainian. Note that the charset + KOI8-R is preferred in Russia. + +config NLS_ISO8859_6 + tristate "NLS ISO 8859-6 (Arabic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-6, the Arabic + character set. + +config NLS_ISO8859_7 + tristate "NLS ISO 8859-7 (Modern Greek)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-7, the Modern + Greek character set. + +config NLS_ISO8859_9 + tristate "NLS ISO 8859-9 (Latin 5; Turkish)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 5 character + set, and it replaces the rarely needed Icelandic letters in Latin 1 + with the Turkish ones. Useful in Turkey. + +config NLS_ISO8859_13 + tristate "NLS ISO 8859-13 (Latin 7; Baltic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 7 character + set, which supports modern Baltic languages including Latvian + and Lithuanian. + +config NLS_ISO8859_14 + tristate "NLS ISO 8859-14 (Latin 8; Celtic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 8 character + set, which adds the last accented vowels for Welsh (aka Cymraeg) + (and Manx Gaelic) that were missing in Latin 1. + has further information. + +config NLS_ISO8859_15 + tristate "NLS ISO 8859-15 (Latin 9; Western European Languages with Euro)" + ---help--- + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 9 character + set, which covers most West European languages such as Albanian, + Catalan, Danish, Dutch, English, Estonian, Faeroese, Finnish, + French, German, Galician, Irish, Icelandic, Italian, Norwegian, + Portuguese, Spanish, and Swedish. Latin 9 is an update to + Latin 1 (ISO 8859-1) that removes a handful of rarely used + characters and instead adds support for Estonian, corrects the + support for French and Finnish, and adds the new Euro character. + If unsure, say Y. + +config NLS_KOI8_R + tristate "NLS KOI8-R (Russian)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the preferred Russian + character set. + +config NLS_KOI8_U + tristate "NLS KOI8-U/RU (Ukrainian, Belarusian)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the preferred Ukrainian + (koi8-u) and Belarusian (koi8-ru) character sets. + +config NLS_MAC_ROMAN + tristate "Codepage macroman" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + much of Europe -- United Kingdom, Germany, Spain, Italy, and [add + more countries here]. + + If unsure, say Y. + +config NLS_MAC_CELTIC + tristate "Codepage macceltic" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Celtic. + + If unsure, say Y. + +config NLS_MAC_CENTEURO + tristate "Codepage maccenteuro" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Central Europe. + + If unsure, say Y. + +config NLS_MAC_CROATIAN + tristate "Codepage maccroatian" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Croatian. + + If unsure, say Y. + +config NLS_MAC_CYRILLIC + tristate "Codepage maccyrillic" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Cyrillic. + + If unsure, say Y. + +config NLS_MAC_GAELIC + tristate "Codepage macgaelic" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Gaelic. + + If unsure, say Y. + +config NLS_MAC_GREEK + tristate "Codepage macgreek" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Greek. + + If unsure, say Y. + +config NLS_MAC_ICELAND + tristate "Codepage maciceland" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Iceland. + + If unsure, say Y. + +config NLS_MAC_INUIT + tristate "Codepage macinuit" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Inuit. + + If unsure, say Y. + +config NLS_MAC_ROMANIAN + tristate "Codepage macromanian" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Romanian. + + If unsure, say Y. + +config NLS_MAC_TURKISH + tristate "Codepage macturkish" + ---help--- + The Apple HFS file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called MAC codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + Mac partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Mac codepage that is used for + Turkish. + + If unsure, say Y. + +config NLS_UTF8 + tristate "NLS UTF-8" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the UTF-8 encoding of + the Unicode/ISO9646 universal character set. + +endif # NLS diff --git a/kernel/fs/nls/Makefile b/kernel/fs/nls/Makefile new file mode 100644 index 000000000..8ae37c1b5 --- /dev/null +++ b/kernel/fs/nls/Makefile @@ -0,0 +1,55 @@ +# +# Makefile for native language support +# + +obj-$(CONFIG_NLS) += nls_base.o + +obj-$(CONFIG_NLS_CODEPAGE_437) += nls_cp437.o +obj-$(CONFIG_NLS_CODEPAGE_737) += nls_cp737.o +obj-$(CONFIG_NLS_CODEPAGE_775) += nls_cp775.o +obj-$(CONFIG_NLS_CODEPAGE_850) += nls_cp850.o +obj-$(CONFIG_NLS_CODEPAGE_852) += nls_cp852.o +obj-$(CONFIG_NLS_CODEPAGE_855) += nls_cp855.o +obj-$(CONFIG_NLS_CODEPAGE_857) += nls_cp857.o +obj-$(CONFIG_NLS_CODEPAGE_860) += nls_cp860.o +obj-$(CONFIG_NLS_CODEPAGE_861) += nls_cp861.o +obj-$(CONFIG_NLS_CODEPAGE_862) += nls_cp862.o +obj-$(CONFIG_NLS_CODEPAGE_863) += nls_cp863.o +obj-$(CONFIG_NLS_CODEPAGE_864) += nls_cp864.o +obj-$(CONFIG_NLS_CODEPAGE_865) += nls_cp865.o +obj-$(CONFIG_NLS_CODEPAGE_866) += nls_cp866.o +obj-$(CONFIG_NLS_CODEPAGE_869) += nls_cp869.o +obj-$(CONFIG_NLS_CODEPAGE_874) += nls_cp874.o +obj-$(CONFIG_NLS_CODEPAGE_932) += nls_cp932.o nls_euc-jp.o +obj-$(CONFIG_NLS_CODEPAGE_936) += nls_cp936.o +obj-$(CONFIG_NLS_CODEPAGE_949) += nls_cp949.o +obj-$(CONFIG_NLS_CODEPAGE_950) += nls_cp950.o +obj-$(CONFIG_NLS_CODEPAGE_1250) += nls_cp1250.o +obj-$(CONFIG_NLS_CODEPAGE_1251) += nls_cp1251.o +obj-$(CONFIG_NLS_ASCII) += nls_ascii.o +obj-$(CONFIG_NLS_ISO8859_1) += nls_iso8859-1.o +obj-$(CONFIG_NLS_ISO8859_2) += nls_iso8859-2.o +obj-$(CONFIG_NLS_ISO8859_3) += nls_iso8859-3.o +obj-$(CONFIG_NLS_ISO8859_4) += nls_iso8859-4.o +obj-$(CONFIG_NLS_ISO8859_5) += nls_iso8859-5.o +obj-$(CONFIG_NLS_ISO8859_6) += nls_iso8859-6.o +obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o +obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o +obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o +obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o +obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o +obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o +obj-$(CONFIG_NLS_KOI8_R) += nls_koi8-r.o +obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o +obj-$(CONFIG_NLS_UTF8) += nls_utf8.o +obj-$(CONFIG_NLS_MAC_CELTIC) += mac-celtic.o +obj-$(CONFIG_NLS_MAC_CENTEURO) += mac-centeuro.o +obj-$(CONFIG_NLS_MAC_CROATIAN) += mac-croatian.o +obj-$(CONFIG_NLS_MAC_CYRILLIC) += mac-cyrillic.o +obj-$(CONFIG_NLS_MAC_GAELIC) += mac-gaelic.o +obj-$(CONFIG_NLS_MAC_GREEK) += mac-greek.o +obj-$(CONFIG_NLS_MAC_ICELAND) += mac-iceland.o +obj-$(CONFIG_NLS_MAC_INUIT) += mac-inuit.o +obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o +obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o +obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o diff --git a/kernel/fs/nls/mac-celtic.c b/kernel/fs/nls/mac-celtic.c new file mode 100644 index 000000000..266c2d7d5 --- /dev/null +++ b/kernel/fs/nls/mac-celtic.c @@ -0,0 +1,601 @@ +/* + * linux/fs/nls/mac-celtic.c + * + * Charset macceltic translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x00c5, 0x00c7, 0x00c9, + 0x00d1, 0x00d6, 0x00dc, 0x00e1, + 0x00e0, 0x00e2, 0x00e4, 0x00e3, + 0x00e5, 0x00e7, 0x00e9, 0x00e8, + /* 0x90 */ + 0x00ea, 0x00eb, 0x00ed, 0x00ec, + 0x00ee, 0x00ef, 0x00f1, 0x00f3, + 0x00f2, 0x00f4, 0x00f6, 0x00f5, + 0x00fa, 0x00f9, 0x00fb, 0x00fc, + /* 0xa0 */ + 0x2020, 0x00b0, 0x00a2, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x00df, + 0x00ae, 0x00a9, 0x2122, 0x00b4, + 0x00a8, 0x2260, 0x00c6, 0x00d8, + /* 0xb0 */ + 0x221e, 0x00b1, 0x2264, 0x2265, + 0x00a5, 0x00b5, 0x2202, 0x2211, + 0x220f, 0x03c0, 0x222b, 0x00aa, + 0x00ba, 0x03a9, 0x00e6, 0x00f8, + /* 0xc0 */ + 0x00bf, 0x00a1, 0x00ac, 0x221a, + 0x0192, 0x2248, 0x2206, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x00c0, + 0x00c3, 0x00d5, 0x0152, 0x0153, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x25ca, + 0x00ff, 0x0178, 0x2044, 0x20ac, + 0x2039, 0x203a, 0x0176, 0x0177, + /* 0xe0 */ + 0x2021, 0x00b7, 0x1ef2, 0x1ef3, + 0x2030, 0x00c2, 0x00ca, 0x00c1, + 0x00cb, 0x00c8, 0x00cd, 0x00ce, + 0x00cf, 0x00cc, 0x00d3, 0x00d4, + /* 0xf0 */ + 0x2663, 0x00d2, 0x00da, 0x00db, + 0x00d9, 0x0131, 0x00dd, 0x00fd, + 0x0174, 0x0175, 0x1e84, 0x1e85, + 0x1e80, 0x1e81, 0x1e82, 0x1e83, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */ + 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ + 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ + 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ + 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ + 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0xf6, 0x00, 0xa7, /* 0xd8-0xdf */ + 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ + 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ + 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ + 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0xf7, 0x00, 0xd8, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xde, 0xdf, /* 0x70-0x77 */ + 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page1e[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0xfc, 0xfd, 0xfe, 0xff, 0xfa, 0xfb, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0xe2, 0xe3, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */ + 0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ + 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page25[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page26[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, page1e, NULL, + page20, page21, page22, NULL, NULL, page25, page26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x00-0x07 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x08-0x0f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x10-0x17 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x18-0x1f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x20-0x27 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x28-0x2f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x30-0x37 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x38-0x3f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x40-0x47 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x48-0x4f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x50-0x57 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x58-0x5f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x60-0x67 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x68-0x6f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x70-0x77 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x78-0x7f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x80-0x87 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x88-0x8f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x90-0x97 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x98-0x9f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa0-0xa7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa8-0xaf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb0-0xb7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb8-0xbf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc0-0xc7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc8-0xcf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd0-0xd7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd8-0xdf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe0-0xe7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe8-0xef */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0-0xf7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "macceltic", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_macceltic(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_macceltic(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_macceltic) +module_exit(exit_nls_macceltic) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-centeuro.c b/kernel/fs/nls/mac-centeuro.c new file mode 100644 index 000000000..9789c6057 --- /dev/null +++ b/kernel/fs/nls/mac-centeuro.c @@ -0,0 +1,531 @@ +/* + * linux/fs/nls/mac-centeuro.c + * + * Charset maccenteuro translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x0100, 0x0101, 0x00c9, + 0x0104, 0x00d6, 0x00dc, 0x00e1, + 0x0105, 0x010c, 0x00e4, 0x010d, + 0x0106, 0x0107, 0x00e9, 0x0179, + /* 0x90 */ + 0x017a, 0x010e, 0x00ed, 0x010f, + 0x0112, 0x0113, 0x0116, 0x00f3, + 0x0117, 0x00f4, 0x00f6, 0x00f5, + 0x00fa, 0x011a, 0x011b, 0x00fc, + /* 0xa0 */ + 0x2020, 0x00b0, 0x0118, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x00df, + 0x00ae, 0x00a9, 0x2122, 0x0119, + 0x00a8, 0x2260, 0x0123, 0x012e, + /* 0xb0 */ + 0x012f, 0x012a, 0x2264, 0x2265, + 0x012b, 0x0136, 0x2202, 0x2211, + 0x0142, 0x013b, 0x013c, 0x013d, + 0x013e, 0x0139, 0x013a, 0x0145, + /* 0xc0 */ + 0x0146, 0x0143, 0x00ac, 0x221a, + 0x0144, 0x0147, 0x2206, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x0148, + 0x0150, 0x00d5, 0x0151, 0x014c, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x25ca, + 0x014d, 0x0154, 0x0155, 0x0158, + 0x2039, 0x203a, 0x0159, 0x0156, + /* 0xe0 */ + 0x0157, 0x0160, 0x201a, 0x201e, + 0x0161, 0x015a, 0x015b, 0x00c1, + 0x0164, 0x0165, 0x00cd, 0x017d, + 0x017e, 0x016a, 0x00d3, 0x00d4, + /* 0xf0 */ + 0x016b, 0x016e, 0x00da, 0x016f, + 0x0170, 0x0171, 0x0172, 0x0173, + 0x00dd, 0x00fd, 0x0137, 0x017b, + 0x0141, 0x017c, 0x0122, 0x02c7, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0xac, 0xa9, 0x00, 0xc7, 0xc2, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */ + 0xa1, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xe7, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x83, 0x00, 0x00, 0x00, 0xea, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xf2, 0x00, 0x86, 0xf8, 0x00, 0xa7, /* 0xd8-0xdf */ + 0x00, 0x87, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x8e, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x9c, 0x00, 0x9f, 0xf9, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x81, 0x82, 0x00, 0x00, 0x84, 0x88, 0x8c, 0x8d, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x8b, 0x91, 0x93, /* 0x08-0x0f */ + 0x00, 0x00, 0x94, 0x95, 0x00, 0x00, 0x96, 0x98, /* 0x10-0x17 */ + 0xa2, 0xab, 0x9d, 0x9e, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xfe, 0xae, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0xb1, 0xb4, 0x00, 0x00, 0xaf, 0xb0, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb5, 0xfa, /* 0x30-0x37 */ + 0x00, 0xbd, 0xbe, 0xb9, 0xba, 0xbb, 0xbc, 0x00, /* 0x38-0x3f */ + 0x00, 0xfc, 0xb8, 0xc1, 0xc4, 0xbf, 0xc0, 0xc5, /* 0x40-0x47 */ + 0xcb, 0x00, 0x00, 0x00, 0xcf, 0xd8, 0x00, 0x00, /* 0x48-0x4f */ + 0xcc, 0xce, 0x00, 0x00, 0xd9, 0xda, 0xdf, 0xe0, /* 0x50-0x57 */ + 0xdb, 0xde, 0xe5, 0xe6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xe1, 0xe4, 0x00, 0x00, 0xe8, 0xe9, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xed, 0xf0, 0x00, 0x00, 0xf1, 0xf3, /* 0x68-0x6f */ + 0xf4, 0xf5, 0xf6, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8f, 0x90, 0xfb, 0xfd, 0xeb, 0xec, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ + 0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page25[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "maccenteuro", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_maccenteuro(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_maccenteuro(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_maccenteuro) +module_exit(exit_nls_maccenteuro) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-croatian.c b/kernel/fs/nls/mac-croatian.c new file mode 100644 index 000000000..bb19e7a07 --- /dev/null +++ b/kernel/fs/nls/mac-croatian.c @@ -0,0 +1,601 @@ +/* + * linux/fs/nls/mac-croatian.c + * + * Charset maccroatian translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x00c5, 0x00c7, 0x00c9, + 0x00d1, 0x00d6, 0x00dc, 0x00e1, + 0x00e0, 0x00e2, 0x00e4, 0x00e3, + 0x00e5, 0x00e7, 0x00e9, 0x00e8, + /* 0x90 */ + 0x00ea, 0x00eb, 0x00ed, 0x00ec, + 0x00ee, 0x00ef, 0x00f1, 0x00f3, + 0x00f2, 0x00f4, 0x00f6, 0x00f5, + 0x00fa, 0x00f9, 0x00fb, 0x00fc, + /* 0xa0 */ + 0x2020, 0x00b0, 0x00a2, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x00df, + 0x00ae, 0x0160, 0x2122, 0x00b4, + 0x00a8, 0x2260, 0x017d, 0x00d8, + /* 0xb0 */ + 0x221e, 0x00b1, 0x2264, 0x2265, + 0x2206, 0x00b5, 0x2202, 0x2211, + 0x220f, 0x0161, 0x222b, 0x00aa, + 0x00ba, 0x03a9, 0x017e, 0x00f8, + /* 0xc0 */ + 0x00bf, 0x00a1, 0x00ac, 0x221a, + 0x0192, 0x2248, 0x0106, 0x00ab, + 0x010c, 0x2026, 0x00a0, 0x00c0, + 0x00c3, 0x00d5, 0x0152, 0x0153, + /* 0xd0 */ + 0x0110, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x25ca, + 0xf8ff, 0x00a9, 0x2044, 0x20ac, + 0x2039, 0x203a, 0x00c6, 0x00bb, + /* 0xe0 */ + 0x2013, 0x00b7, 0x201a, 0x201e, + 0x2030, 0x00c2, 0x0107, 0x00c1, + 0x010d, 0x00c8, 0x00cd, 0x00ce, + 0x00cf, 0x00cc, 0x00d3, 0x00d4, + /* 0xf0 */ + 0x0111, 0x00d2, 0x00da, 0x00db, + 0x00d9, 0x0131, 0x02c6, 0x02dc, + 0x00af, 0x03c0, 0x00cb, 0x02da, + 0x00b8, 0x00ca, 0x00e6, 0x02c7, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0xac, 0xd9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ + 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ + 0xfc, 0x00, 0xbc, 0xdf, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ + 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xde, 0x82, /* 0xc0-0xc7 */ + 0xe9, 0x83, 0xfd, 0xfa, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ + 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ + 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ + 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xfe, 0x8d, /* 0xe0-0xe7 */ + 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ + 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ + 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xe6, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xa9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xfb, 0x00, 0xf7, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xe0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ + 0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xb4, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ + 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page25[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char pagef8[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + pagef8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "maccroatian", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_maccroatian(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_maccroatian(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_maccroatian) +module_exit(exit_nls_maccroatian) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-cyrillic.c b/kernel/fs/nls/mac-cyrillic.c new file mode 100644 index 000000000..2a7dea36a --- /dev/null +++ b/kernel/fs/nls/mac-cyrillic.c @@ -0,0 +1,496 @@ +/* + * linux/fs/nls/mac-cyrillic.c + * + * Charset maccyrillic translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0x90 */ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xa0 */ + 0x2020, 0x00b0, 0x0490, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x0406, + 0x00ae, 0x00a9, 0x2122, 0x0402, + 0x0452, 0x2260, 0x0403, 0x0453, + /* 0xb0 */ + 0x221e, 0x00b1, 0x2264, 0x2265, + 0x0456, 0x00b5, 0x0491, 0x0408, + 0x0404, 0x0454, 0x0407, 0x0457, + 0x0409, 0x0459, 0x040a, 0x045a, + /* 0xc0 */ + 0x0458, 0x0405, 0x00ac, 0x221a, + 0x0192, 0x2248, 0x2206, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x040b, + 0x045b, 0x040c, 0x045c, 0x0455, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x201e, + 0x040e, 0x045e, 0x040f, 0x045f, + 0x2116, 0x0401, 0x0451, 0x044f, + /* 0xe0 */ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xf0 */ + 0x0440, 0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x20ac, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0xc7, 0xc2, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */ + 0xa1, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xa6, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd6, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xdd, 0xab, 0xae, 0xb8, 0xc1, 0xa7, 0xba, /* 0x00-0x07 */ + 0xb7, 0xbc, 0xbe, 0xcb, 0xcd, 0x00, 0xd8, 0xda, /* 0x08-0x0f */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x10-0x17 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x18-0x1f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x20-0x27 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x28-0x2f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0x48-0x4f */ + 0x00, 0xde, 0xac, 0xaf, 0xb9, 0xcf, 0xb4, 0xbb, /* 0x50-0x57 */ + 0xc0, 0xbd, 0xbf, 0xcc, 0xce, 0x00, 0xd9, 0xdb, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xa2, 0xb6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0xd7, 0x00, /* 0x18-0x1f */ + 0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, page22, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "maccyrillic", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_maccyrillic(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_maccyrillic(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_maccyrillic) +module_exit(exit_nls_maccyrillic) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-gaelic.c b/kernel/fs/nls/mac-gaelic.c new file mode 100644 index 000000000..77b001653 --- /dev/null +++ b/kernel/fs/nls/mac-gaelic.c @@ -0,0 +1,566 @@ +/* + * linux/fs/nls/mac-gaelic.c + * + * Charset macgaelic translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x00c5, 0x00c7, 0x00c9, + 0x00d1, 0x00d6, 0x00dc, 0x00e1, + 0x00e0, 0x00e2, 0x00e4, 0x00e3, + 0x00e5, 0x00e7, 0x00e9, 0x00e8, + /* 0x90 */ + 0x00ea, 0x00eb, 0x00ed, 0x00ec, + 0x00ee, 0x00ef, 0x00f1, 0x00f3, + 0x00f2, 0x00f4, 0x00f6, 0x00f5, + 0x00fa, 0x00f9, 0x00fb, 0x00fc, + /* 0xa0 */ + 0x2020, 0x00b0, 0x00a2, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x00df, + 0x00ae, 0x00a9, 0x2122, 0x00b4, + 0x00a8, 0x2260, 0x00c6, 0x00d8, + /* 0xb0 */ + 0x1e02, 0x00b1, 0x2264, 0x2265, + 0x1e03, 0x010a, 0x010b, 0x1e0a, + 0x1e0b, 0x1e1e, 0x1e1f, 0x0120, + 0x0121, 0x1e40, 0x00e6, 0x00f8, + /* 0xc0 */ + 0x1e41, 0x1e56, 0x1e57, 0x027c, + 0x0192, 0x017f, 0x1e60, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x00c0, + 0x00c3, 0x00d5, 0x0152, 0x0153, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x1e61, 0x1e9b, + 0x00ff, 0x0178, 0x1e6a, 0x20ac, + 0x2039, 0x203a, 0x0176, 0x0177, + /* 0xe0 */ + 0x1e6b, 0x00b7, 0x1ef2, 0x1ef3, + 0x204a, 0x00c2, 0x00ca, 0x00c1, + 0x00cb, 0x00c8, 0x00cd, 0x00ce, + 0x00cf, 0x00cc, 0x00d3, 0x00d4, + /* 0xf0 */ + 0x2663, 0x00d2, 0x00da, 0x00db, + 0x00d9, 0x0131, 0x00dd, 0x00fd, + 0x0174, 0x0175, 0x1e84, 0x1e85, + 0x1e80, 0x1e81, 0x1e82, 0x1e83, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0x00, 0xa2, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0xac, 0xa9, 0x00, 0xc7, 0x00, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */ + 0xa1, 0xb1, 0x00, 0x00, 0xab, 0x00, 0xa6, 0xe1, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ + 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ + 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ + 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0xf6, 0x00, 0xa7, /* 0xd8-0xdf */ + 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ + 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ + 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0x00, /* 0xf0-0xf7 */ + 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0xf7, 0x00, 0xd8, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0xb5, 0xb6, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xbb, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xde, 0xdf, /* 0x70-0x77 */ + 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc5, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page1e[256] = { + 0x00, 0x00, 0xb0, 0xb4, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0xb7, 0xb8, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0xba, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xbd, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc1, 0xc2, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xda, 0xe0, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0xfc, 0xfd, 0xfe, 0xff, 0xfa, 0xfb, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0xe2, 0xe3, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */ + 0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page26[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, page1e, NULL, + page20, page21, page22, NULL, NULL, NULL, page26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x00-0x07 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x08-0x0f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x10-0x17 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x18-0x1f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x20-0x27 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x28-0x2f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x30-0x37 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x38-0x3f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x40-0x47 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x48-0x4f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x50-0x57 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x58-0x5f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x60-0x67 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x68-0x6f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x70-0x77 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x78-0x7f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x80-0x87 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x88-0x8f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x90-0x97 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x98-0x9f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa0-0xa7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa8-0xaf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb0-0xb7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb8-0xbf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc0-0xc7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc8-0xcf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd0-0xd7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd8-0xdf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe0-0xe7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe8-0xef */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0-0xf7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "macgaelic", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_macgaelic(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_macgaelic(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_macgaelic) +module_exit(exit_nls_macgaelic) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-greek.c b/kernel/fs/nls/mac-greek.c new file mode 100644 index 000000000..1eccf499e --- /dev/null +++ b/kernel/fs/nls/mac-greek.c @@ -0,0 +1,496 @@ +/* + * linux/fs/nls/mac-greek.c + * + * Charset macgreek translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x00b9, 0x00b2, 0x00c9, + 0x00b3, 0x00d6, 0x00dc, 0x0385, + 0x00e0, 0x00e2, 0x00e4, 0x0384, + 0x00a8, 0x00e7, 0x00e9, 0x00e8, + /* 0x90 */ + 0x00ea, 0x00eb, 0x00a3, 0x2122, + 0x00ee, 0x00ef, 0x2022, 0x00bd, + 0x2030, 0x00f4, 0x00f6, 0x00a6, + 0x20ac, 0x00f9, 0x00fb, 0x00fc, + /* 0xa0 */ + 0x2020, 0x0393, 0x0394, 0x0398, + 0x039b, 0x039e, 0x03a0, 0x00df, + 0x00ae, 0x00a9, 0x03a3, 0x03aa, + 0x00a7, 0x2260, 0x00b0, 0x00b7, + /* 0xb0 */ + 0x0391, 0x00b1, 0x2264, 0x2265, + 0x00a5, 0x0392, 0x0395, 0x0396, + 0x0397, 0x0399, 0x039a, 0x039c, + 0x03a6, 0x03ab, 0x03a8, 0x03a9, + /* 0xc0 */ + 0x03ac, 0x039d, 0x00ac, 0x039f, + 0x03a1, 0x2248, 0x03a4, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x03a5, + 0x03a7, 0x0386, 0x0388, 0x0153, + /* 0xd0 */ + 0x2013, 0x2015, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x0389, + 0x038a, 0x038c, 0x038e, 0x03ad, + 0x03ae, 0x03af, 0x03cc, 0x038f, + /* 0xe0 */ + 0x03cd, 0x03b1, 0x03b2, 0x03c8, + 0x03b4, 0x03b5, 0x03c6, 0x03b3, + 0x03b7, 0x03b9, 0x03be, 0x03ba, + 0x03bb, 0x03bc, 0x03bd, 0x03bf, + /* 0xf0 */ + 0x03c0, 0x03ce, 0x03c1, 0x03c3, + 0x03c4, 0x03b8, 0x03c9, 0x03c2, + 0x03c7, 0x03c5, 0x03b6, 0x03ca, + 0x03cb, 0x0390, 0x03b0, 0x00ad, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0x00, 0x00, 0x92, 0x00, 0xb4, 0x9b, 0xac, /* 0xa0-0xa7 */ + 0x8c, 0xa9, 0x00, 0xc7, 0xc2, 0xff, 0xa8, 0x00, /* 0xa8-0xaf */ + 0xae, 0xb1, 0x82, 0x84, 0x00, 0x00, 0x00, 0xaf, /* 0xb0-0xb7 */ + 0x00, 0x81, 0x00, 0xc8, 0x00, 0x97, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ + 0x88, 0x00, 0x89, 0x00, 0x8a, 0x00, 0x00, 0x8d, /* 0xe0-0xe7 */ + 0x8f, 0x8e, 0x90, 0x91, 0x00, 0x00, 0x94, 0x95, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x9a, 0xd6, /* 0xf0-0xf7 */ + 0x00, 0x9d, 0x00, 0x9e, 0x9f, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x8b, 0x87, 0xcd, 0x00, /* 0x80-0x87 */ + 0xce, 0xd7, 0xd8, 0x00, 0xd9, 0x00, 0xda, 0xdf, /* 0x88-0x8f */ + 0xfd, 0xb0, 0xb5, 0xa1, 0xa2, 0xb6, 0xb7, 0xb8, /* 0x90-0x97 */ + 0xa3, 0xb9, 0xba, 0xa4, 0xbb, 0xc1, 0xa5, 0xc3, /* 0x98-0x9f */ + 0xa6, 0xc4, 0x00, 0xaa, 0xc6, 0xcb, 0xbc, 0xcc, /* 0xa0-0xa7 */ + 0xbe, 0xbf, 0xab, 0xbd, 0xc0, 0xdb, 0xdc, 0xdd, /* 0xa8-0xaf */ + 0xfe, 0xe1, 0xe2, 0xe7, 0xe4, 0xe5, 0xfa, 0xe8, /* 0xb0-0xb7 */ + 0xf5, 0xe9, 0xeb, 0xec, 0xed, 0xee, 0xea, 0xef, /* 0xb8-0xbf */ + 0xf0, 0xf2, 0xf7, 0xf3, 0xf4, 0xf9, 0xe6, 0xf8, /* 0xc0-0xc7 */ + 0xe3, 0xf6, 0xfb, 0xfc, 0xde, 0xe0, 0xf1, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */ + 0xa0, 0x00, 0x96, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, page22, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "macgreek", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_macgreek(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_macgreek(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_macgreek) +module_exit(exit_nls_macgreek) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-iceland.c b/kernel/fs/nls/mac-iceland.c new file mode 100644 index 000000000..cbd0875c6 --- /dev/null +++ b/kernel/fs/nls/mac-iceland.c @@ -0,0 +1,601 @@ +/* + * linux/fs/nls/mac-iceland.c + * + * Charset maciceland translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x00c5, 0x00c7, 0x00c9, + 0x00d1, 0x00d6, 0x00dc, 0x00e1, + 0x00e0, 0x00e2, 0x00e4, 0x00e3, + 0x00e5, 0x00e7, 0x00e9, 0x00e8, + /* 0x90 */ + 0x00ea, 0x00eb, 0x00ed, 0x00ec, + 0x00ee, 0x00ef, 0x00f1, 0x00f3, + 0x00f2, 0x00f4, 0x00f6, 0x00f5, + 0x00fa, 0x00f9, 0x00fb, 0x00fc, + /* 0xa0 */ + 0x00dd, 0x00b0, 0x00a2, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x00df, + 0x00ae, 0x00a9, 0x2122, 0x00b4, + 0x00a8, 0x2260, 0x00c6, 0x00d8, + /* 0xb0 */ + 0x221e, 0x00b1, 0x2264, 0x2265, + 0x00a5, 0x00b5, 0x2202, 0x2211, + 0x220f, 0x03c0, 0x222b, 0x00aa, + 0x00ba, 0x03a9, 0x00e6, 0x00f8, + /* 0xc0 */ + 0x00bf, 0x00a1, 0x00ac, 0x221a, + 0x0192, 0x2248, 0x2206, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x00c0, + 0x00c3, 0x00d5, 0x0152, 0x0153, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x25ca, + 0x00ff, 0x0178, 0x2044, 0x20ac, + 0x00d0, 0x00f0, 0x00de, 0x00fe, + /* 0xe0 */ + 0x00fd, 0x00b7, 0x201a, 0x201e, + 0x2030, 0x00c2, 0x00ca, 0x00c1, + 0x00cb, 0x00c8, 0x00cd, 0x00ce, + 0x00cf, 0x00cc, 0x00d3, 0x00d4, + /* 0xf0 */ + 0xf8ff, 0x00d2, 0x00da, 0x00db, + 0x00d9, 0x0131, 0x02c6, 0x02dc, + 0x00af, 0x02d8, 0x02d9, 0x02da, + 0x00b8, 0x02dd, 0x02db, 0x02c7, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ + 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ + 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ + 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ + 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ + 0xdc, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ + 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0xa0, 0xde, 0xa7, /* 0xd8-0xdf */ + 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ + 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ + 0xdd, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ + 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0xe0, 0xdf, 0xd8, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ + 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page25[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char pagef8[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + pagef8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "maciceland", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_maciceland(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_maciceland(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_maciceland) +module_exit(exit_nls_maciceland) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-inuit.c b/kernel/fs/nls/mac-inuit.c new file mode 100644 index 000000000..fba8357aa --- /dev/null +++ b/kernel/fs/nls/mac-inuit.c @@ -0,0 +1,531 @@ +/* + * linux/fs/nls/mac-inuit.c + * + * Charset macinuit translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x1403, 0x1404, 0x1405, 0x1406, + 0x140a, 0x140b, 0x1431, 0x1432, + 0x1433, 0x1434, 0x1438, 0x1439, + 0x1449, 0x144e, 0x144f, 0x1450, + /* 0x90 */ + 0x1451, 0x1455, 0x1456, 0x1466, + 0x146d, 0x146e, 0x146f, 0x1470, + 0x1472, 0x1473, 0x1483, 0x148b, + 0x148c, 0x148d, 0x148e, 0x1490, + /* 0xa0 */ + 0x1491, 0x00b0, 0x14a1, 0x14a5, + 0x14a6, 0x2022, 0x00b6, 0x14a7, + 0x00ae, 0x00a9, 0x2122, 0x14a8, + 0x14aa, 0x14ab, 0x14bb, 0x14c2, + /* 0xb0 */ + 0x14c3, 0x14c4, 0x14c5, 0x14c7, + 0x14c8, 0x14d0, 0x14ef, 0x14f0, + 0x14f1, 0x14f2, 0x14f4, 0x14f5, + 0x1505, 0x14d5, 0x14d6, 0x14d7, + /* 0xc0 */ + 0x14d8, 0x14da, 0x14db, 0x14ea, + 0x1528, 0x1529, 0x152a, 0x152b, + 0x152d, 0x2026, 0x00a0, 0x152e, + 0x153e, 0x1555, 0x1556, 0x1557, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x1558, 0x1559, + 0x155a, 0x155d, 0x1546, 0x1547, + 0x1548, 0x1549, 0x154b, 0x154c, + /* 0xe0 */ + 0x1550, 0x157f, 0x1580, 0x1581, + 0x1582, 0x1583, 0x1584, 0x1585, + 0x158f, 0x1590, 0x1591, 0x1592, + 0x1593, 0x1594, 0x1595, 0x1671, + /* 0xf0 */ + 0x1672, 0x1673, 0x1674, 0x1675, + 0x1676, 0x1596, 0x15a0, 0x15a1, + 0x15a2, 0x15a3, 0x15a4, 0x15a5, + 0x15a6, 0x157c, 0x0141, 0x0142, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0x00, 0x00, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */ + 0xa1, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0xfe, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page14[256] = { + 0x00, 0x00, 0x00, 0x80, 0x81, 0x82, 0x83, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x84, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x86, 0x87, 0x88, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x8a, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x8d, 0x8e, /* 0x48-0x4f */ + 0x8f, 0x90, 0x00, 0x00, 0x00, 0x91, 0x92, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x95, 0x96, /* 0x68-0x6f */ + 0x97, 0x00, 0x98, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x88-0x8f */ + 0x9f, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0xa2, 0x00, 0x00, 0x00, 0xa3, 0xa4, 0xa7, /* 0xa0-0xa7 */ + 0xab, 0x00, 0xac, 0xad, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0xaf, 0xb0, 0xb1, 0xb2, 0x00, 0xb3, /* 0xc0-0xc7 */ + 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0xb5, 0x00, 0x00, 0x00, 0x00, 0xbd, 0xbe, 0xbf, /* 0xd0-0xd7 */ + 0xc0, 0x00, 0xc1, 0xc2, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0x00, 0xb6, /* 0xe8-0xef */ + 0xb7, 0xb8, 0xb9, 0x00, 0xba, 0xbb, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page15[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0xc4, 0xc5, 0xc6, 0xc7, 0x00, 0xc8, 0xcb, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xcc, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xda, 0xdb, /* 0x40-0x47 */ + 0xdc, 0xdd, 0x00, 0xde, 0xdf, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xe0, 0x00, 0x00, 0x00, 0x00, 0xcd, 0xce, 0xcf, /* 0x50-0x57 */ + 0xd6, 0xd7, 0xd8, 0x00, 0x00, 0xd9, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x00, 0xe1, /* 0x78-0x7f */ + 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, /* 0x88-0x8f */ + 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xf5, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page16[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, page14, page15, page16, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x00-0x07 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x08-0x0f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x10-0x17 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x18-0x1f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x20-0x27 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x28-0x2f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x30-0x37 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x38-0x3f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x40-0x47 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x48-0x4f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x50-0x57 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x58-0x5f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x60-0x67 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x68-0x6f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x70-0x77 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x78-0x7f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x80-0x87 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x88-0x8f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x90-0x97 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x98-0x9f */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa0-0xa7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa8-0xaf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb0-0xb7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb8-0xbf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc0-0xc7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc8-0xcf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd0-0xd7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd8-0xdf */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe0-0xe7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe8-0xef */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0-0xf7 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "macinuit", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_macinuit(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_macinuit(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_macinuit) +module_exit(exit_nls_macinuit) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-roman.c b/kernel/fs/nls/mac-roman.c new file mode 100644 index 000000000..b6a98a520 --- /dev/null +++ b/kernel/fs/nls/mac-roman.c @@ -0,0 +1,636 @@ +/* + * linux/fs/nls/mac-roman.c + * + * Charset macroman translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x00c5, 0x00c7, 0x00c9, + 0x00d1, 0x00d6, 0x00dc, 0x00e1, + 0x00e0, 0x00e2, 0x00e4, 0x00e3, + 0x00e5, 0x00e7, 0x00e9, 0x00e8, + /* 0x90 */ + 0x00ea, 0x00eb, 0x00ed, 0x00ec, + 0x00ee, 0x00ef, 0x00f1, 0x00f3, + 0x00f2, 0x00f4, 0x00f6, 0x00f5, + 0x00fa, 0x00f9, 0x00fb, 0x00fc, + /* 0xa0 */ + 0x2020, 0x00b0, 0x00a2, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x00df, + 0x00ae, 0x00a9, 0x2122, 0x00b4, + 0x00a8, 0x2260, 0x00c6, 0x00d8, + /* 0xb0 */ + 0x221e, 0x00b1, 0x2264, 0x2265, + 0x00a5, 0x00b5, 0x2202, 0x2211, + 0x220f, 0x03c0, 0x222b, 0x00aa, + 0x00ba, 0x03a9, 0x00e6, 0x00f8, + /* 0xc0 */ + 0x00bf, 0x00a1, 0x00ac, 0x221a, + 0x0192, 0x2248, 0x2206, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x00c0, + 0x00c3, 0x00d5, 0x0152, 0x0153, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x25ca, + 0x00ff, 0x0178, 0x2044, 0x20ac, + 0x2039, 0x203a, 0xfb01, 0xfb02, + /* 0xe0 */ + 0x2021, 0x00b7, 0x201a, 0x201e, + 0x2030, 0x00c2, 0x00ca, 0x00c1, + 0x00cb, 0x00c8, 0x00cd, 0x00ce, + 0x00cf, 0x00cc, 0x00d3, 0x00d4, + /* 0xf0 */ + 0xf8ff, 0x00d2, 0x00da, 0x00db, + 0x00d9, 0x0131, 0x02c6, 0x02dc, + 0x00af, 0x02d8, 0x02d9, 0x02da, + 0x00b8, 0x02dd, 0x02db, 0x02c7, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ + 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ + 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ + 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ + 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ + 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ + 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ + 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ + 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ + 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ + 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ + 0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ + 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page25[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char pagef8[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ +}; + +static const unsigned char pagefb[256] = { + 0x00, 0xde, 0xdf, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + pagef8, NULL, NULL, pagefb, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "macroman", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_macroman(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_macroman(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_macroman) +module_exit(exit_nls_macroman) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-romanian.c b/kernel/fs/nls/mac-romanian.c new file mode 100644 index 000000000..25547f023 --- /dev/null +++ b/kernel/fs/nls/mac-romanian.c @@ -0,0 +1,601 @@ +/* + * linux/fs/nls/mac-romanian.c + * + * Charset macromanian translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x00c5, 0x00c7, 0x00c9, + 0x00d1, 0x00d6, 0x00dc, 0x00e1, + 0x00e0, 0x00e2, 0x00e4, 0x00e3, + 0x00e5, 0x00e7, 0x00e9, 0x00e8, + /* 0x90 */ + 0x00ea, 0x00eb, 0x00ed, 0x00ec, + 0x00ee, 0x00ef, 0x00f1, 0x00f3, + 0x00f2, 0x00f4, 0x00f6, 0x00f5, + 0x00fa, 0x00f9, 0x00fb, 0x00fc, + /* 0xa0 */ + 0x2020, 0x00b0, 0x00a2, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x00df, + 0x00ae, 0x00a9, 0x2122, 0x00b4, + 0x00a8, 0x2260, 0x0102, 0x0218, + /* 0xb0 */ + 0x221e, 0x00b1, 0x2264, 0x2265, + 0x00a5, 0x00b5, 0x2202, 0x2211, + 0x220f, 0x03c0, 0x222b, 0x00aa, + 0x00ba, 0x03a9, 0x0103, 0x0219, + /* 0xc0 */ + 0x00bf, 0x00a1, 0x00ac, 0x221a, + 0x0192, 0x2248, 0x2206, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x00c0, + 0x00c3, 0x00d5, 0x0152, 0x0153, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x25ca, + 0x00ff, 0x0178, 0x2044, 0x20ac, + 0x2039, 0x203a, 0x021a, 0x021b, + /* 0xe0 */ + 0x2021, 0x00b7, 0x201a, 0x201e, + 0x2030, 0x00c2, 0x00ca, 0x00c1, + 0x00cb, 0x00c8, 0x00cd, 0x00ce, + 0x00cf, 0x00cc, 0x00d3, 0x00d4, + /* 0xf0 */ + 0xf8ff, 0x00d2, 0x00da, 0x00db, + 0x00d9, 0x0131, 0x02c6, 0x02dc, + 0x00af, 0x02d8, 0x02d9, 0x02da, + 0x00b8, 0x02dd, 0x02db, 0x02c7, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ + 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ + 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ + 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0x00, 0x82, /* 0xc0-0xc7 */ + 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ + 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ + 0x00, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ + 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0x00, 0x8d, /* 0xe0-0xe7 */ + 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ + 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ + 0x00, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xae, 0xbe, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xaf, 0xbf, 0xde, 0xdf, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ + 0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ + 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page25[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char pagef8[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + pagef8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "macromanian", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_macromanian(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_macromanian(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_macromanian) +module_exit(exit_nls_macromanian) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/mac-turkish.c b/kernel/fs/nls/mac-turkish.c new file mode 100644 index 000000000..b5454bc7b --- /dev/null +++ b/kernel/fs/nls/mac-turkish.c @@ -0,0 +1,601 @@ +/* + * linux/fs/nls/mac-turkish.c + * + * Charset macturkish translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of the Unicode data files and any associated documentation (the "Data + * Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do + * so, provided that (a) the above copyright notice(s) and this permission + * notice appear with all copies of the Data Files or Software, (b) both the + * above copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00 */ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10 */ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20 */ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30 */ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40 */ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50 */ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60 */ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70 */ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80 */ + 0x00c4, 0x00c5, 0x00c7, 0x00c9, + 0x00d1, 0x00d6, 0x00dc, 0x00e1, + 0x00e0, 0x00e2, 0x00e4, 0x00e3, + 0x00e5, 0x00e7, 0x00e9, 0x00e8, + /* 0x90 */ + 0x00ea, 0x00eb, 0x00ed, 0x00ec, + 0x00ee, 0x00ef, 0x00f1, 0x00f3, + 0x00f2, 0x00f4, 0x00f6, 0x00f5, + 0x00fa, 0x00f9, 0x00fb, 0x00fc, + /* 0xa0 */ + 0x2020, 0x00b0, 0x00a2, 0x00a3, + 0x00a7, 0x2022, 0x00b6, 0x00df, + 0x00ae, 0x00a9, 0x2122, 0x00b4, + 0x00a8, 0x2260, 0x00c6, 0x00d8, + /* 0xb0 */ + 0x221e, 0x00b1, 0x2264, 0x2265, + 0x00a5, 0x00b5, 0x2202, 0x2211, + 0x220f, 0x03c0, 0x222b, 0x00aa, + 0x00ba, 0x03a9, 0x00e6, 0x00f8, + /* 0xc0 */ + 0x00bf, 0x00a1, 0x00ac, 0x221a, + 0x0192, 0x2248, 0x2206, 0x00ab, + 0x00bb, 0x2026, 0x00a0, 0x00c0, + 0x00c3, 0x00d5, 0x0152, 0x0153, + /* 0xd0 */ + 0x2013, 0x2014, 0x201c, 0x201d, + 0x2018, 0x2019, 0x00f7, 0x25ca, + 0x00ff, 0x0178, 0x011e, 0x011f, + 0x0130, 0x0131, 0x015e, 0x015f, + /* 0xe0 */ + 0x2021, 0x00b7, 0x201a, 0x201e, + 0x2030, 0x00c2, 0x00ca, 0x00c1, + 0x00cb, 0x00c8, 0x00cd, 0x00ce, + 0x00cf, 0x00cc, 0x00d3, 0x00d4, + /* 0xf0 */ + 0xf8ff, 0x00d2, 0x00da, 0x00db, + 0x00d9, 0xf8a0, 0x02c6, 0x02dc, + 0x00af, 0x02d8, 0x02d9, 0x02da, + 0x00b8, 0x02dd, 0x02db, 0x02c7, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ + 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ + 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ + 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ + 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ + 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ + 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ + 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ + 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ + 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ + 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ + 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xda, 0xdb, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xdf, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ + 0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ + 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page25[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char pagef8[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + pagef8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "macturkish", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_macturkish(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_macturkish(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_macturkish) +module_exit(exit_nls_macturkish) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_ascii.c b/kernel/fs/nls/nls_ascii.c new file mode 100644 index 000000000..a2620650d --- /dev/null +++ b/kernel/fs/nls/nls_ascii.c @@ -0,0 +1,166 @@ +/* + * linux/fs/nls/nls_ascii.c + * + * Charset ascii translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "ascii", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_ascii(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_ascii(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_ascii) +module_exit(exit_nls_ascii) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_base.c b/kernel/fs/nls/nls_base.c new file mode 100644 index 000000000..52ccd34b1 --- /dev/null +++ b/kernel/fs/nls/nls_base.c @@ -0,0 +1,548 @@ +/* + * linux/fs/nls/nls_base.c + * + * Native language support--charsets and unicode translations. + * By Gordon Chaffee 1996, 1997 + * + * Unicode based case conversion 1999 by Wolfram Pienkoss + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct nls_table default_table; +static struct nls_table *tables = &default_table; +static DEFINE_SPINLOCK(nls_lock); + +/* + * Sample implementation from Unicode home page. + * http://www.stonehand.com/unicode/standard/fss-utf.html + */ +struct utf8_table { + int cmask; + int cval; + int shift; + long lmask; + long lval; +}; + +static const struct utf8_table utf8_table[] = +{ + {0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */}, + {0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */}, + {0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */}, + {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, + {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, + {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, + {0, /* end of table */} +}; + +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu) +{ + unsigned long l; + int c0, c, nc; + const struct utf8_table *t; + + nc = 0; + c0 = *s; + l = c0; + for (t = utf8_table; t->cmask; t++) { + nc++; + if ((c0 & t->cmask) == t->cval) { + l &= t->lmask; + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + *pu = (unicode_t) l; + return nc; + } + if (inlen <= nc) + return -1; + s++; + c = (*s ^ 0x80) & 0xFF; + if (c & 0xC0) + return -1; + l = (l << 6) | c; + } + return -1; +} +EXPORT_SYMBOL(utf8_to_utf32); + +int utf32_to_utf8(unicode_t u, u8 *s, int maxout) +{ + unsigned long l; + int c, nc; + const struct utf8_table *t; + + if (!s) + return 0; + + l = u; + if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + + nc = 0; + for (t = utf8_table; t->cmask && maxout; t++, maxout--) { + nc++; + if (l <= t->lmask) { + c = t->shift; + *s = (u8) (t->cval | (l >> c)); + while (c > 0) { + c -= 6; + s++; + *s = (u8) (0x80 | ((l >> c) & 0x3F)); + } + return nc; + } + } + return -1; +} +EXPORT_SYMBOL(utf32_to_utf8); + +static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + *s = (wchar_t) c; + break; + case UTF16_LITTLE_ENDIAN: + *s = __cpu_to_le16(c); + break; + case UTF16_BIG_ENDIAN: + *s = __cpu_to_be16(c); + break; + } +} + +int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, + wchar_t *pwcs, int maxout) +{ + u16 *op; + int size; + unicode_t u; + + op = pwcs; + while (inlen > 0 && maxout > 0 && *s) { + if (*s & 0x80) { + size = utf8_to_utf32(s, inlen, &u); + if (size < 0) + return -EINVAL; + s += size; + inlen -= size; + + if (u >= PLANE_SIZE) { + if (maxout < 2) + break; + u -= PLANE_SIZE; + put_utf16(op++, SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS), + endian); + put_utf16(op++, SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS), + endian); + maxout -= 2; + } else { + put_utf16(op++, u, endian); + maxout--; + } + } else { + put_utf16(op++, *s++, endian); + inlen--; + maxout--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); + +static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + return c; + case UTF16_LITTLE_ENDIAN: + return __le16_to_cpu(c); + case UTF16_BIG_ENDIAN: + return __be16_to_cpu(c); + } +} + +int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, + u8 *s, int maxout) +{ + u8 *op; + int size; + unsigned long u, v; + + op = s; + while (inlen > 0 && maxout > 0) { + u = get_utf16(*pwcs, endian); + if (!u) + break; + pwcs++; + inlen--; + if (u > 0x7f) { + if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { + if (u & SURROGATE_LOW) { + /* Ignore character and move on */ + continue; + } + if (inlen <= 0) + break; + v = get_utf16(*pwcs, endian); + if ((v & SURROGATE_MASK) != SURROGATE_PAIR || + !(v & SURROGATE_LOW)) { + /* Ignore character and move on */ + continue; + } + u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + + (v & SURROGATE_BITS); + pwcs++; + inlen--; + } + size = utf32_to_utf8(u, op, maxout); + if (size == -1) { + /* Ignore character and move on */ + } else { + op += size; + maxout -= size; + } + } else { + *op++ = (u8) u; + maxout--; + } + } + return op - s; +} +EXPORT_SYMBOL(utf16s_to_utf8s); + +int __register_nls(struct nls_table *nls, struct module *owner) +{ + struct nls_table ** tmp = &tables; + + if (nls->next) + return -EBUSY; + + nls->owner = owner; + spin_lock(&nls_lock); + while (*tmp) { + if (nls == *tmp) { + spin_unlock(&nls_lock); + return -EBUSY; + } + tmp = &(*tmp)->next; + } + nls->next = tables; + tables = nls; + spin_unlock(&nls_lock); + return 0; +} +EXPORT_SYMBOL(__register_nls); + +int unregister_nls(struct nls_table * nls) +{ + struct nls_table ** tmp = &tables; + + spin_lock(&nls_lock); + while (*tmp) { + if (nls == *tmp) { + *tmp = nls->next; + spin_unlock(&nls_lock); + return 0; + } + tmp = &(*tmp)->next; + } + spin_unlock(&nls_lock); + return -EINVAL; +} + +static struct nls_table *find_nls(char *charset) +{ + struct nls_table *nls; + spin_lock(&nls_lock); + for (nls = tables; nls; nls = nls->next) { + if (!strcmp(nls->charset, charset)) + break; + if (nls->alias && !strcmp(nls->alias, charset)) + break; + } + if (nls && !try_module_get(nls->owner)) + nls = NULL; + spin_unlock(&nls_lock); + return nls; +} + +struct nls_table *load_nls(char *charset) +{ + return try_then_request_module(find_nls(charset), "nls_%s", charset); +} + +void unload_nls(struct nls_table *nls) +{ + if (nls) + module_put(nls->owner); +} + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00 +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table default_table = { + .charset = "default", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +/* Returns a simple default translation table */ +struct nls_table *load_nls_default(void) +{ + struct nls_table *default_nls; + + default_nls = load_nls(CONFIG_NLS_DEFAULT); + if (default_nls != NULL) + return default_nls; + else + return &default_table; +} + +EXPORT_SYMBOL(unregister_nls); +EXPORT_SYMBOL(unload_nls); +EXPORT_SYMBOL(load_nls); +EXPORT_SYMBOL(load_nls_default); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp1250.c b/kernel/fs/nls/nls_cp1250.c new file mode 100644 index 000000000..ace3e19d3 --- /dev/null +++ b/kernel/fs/nls/nls_cp1250.c @@ -0,0 +1,346 @@ +/* + * linux/fs/nls/nls_cp1250.c + * + * Charset cp1250 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x20ac, 0x0000, 0x201a, 0x0000, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x0000, 0x2030, 0x0160, 0x2039, + 0x015a, 0x0164, 0x017d, 0x0179, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x2122, 0x0161, 0x203a, + 0x015b, 0x0165, 0x017e, 0x017a, + /* 0xa0*/ + 0x00a0, 0x02c7, 0x02d8, 0x0141, + 0x00a4, 0x0104, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x015e, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x017b, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x02db, 0x0142, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x0105, 0x015f, 0x00bb, + 0x013d, 0x02dd, 0x013e, 0x017c, + /* 0xc0*/ + 0x0154, 0x00c1, 0x00c2, 0x0102, + 0x00c4, 0x0139, 0x0106, 0x00c7, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x011a, 0x00cd, 0x00ce, 0x010e, + /* 0xd0*/ + 0x0110, 0x0143, 0x0147, 0x00d3, + 0x00d4, 0x0150, 0x00d6, 0x00d7, + 0x0158, 0x016e, 0x00da, 0x0170, + 0x00dc, 0x00dd, 0x0162, 0x00df, + /* 0xe0*/ + 0x0155, 0x00e1, 0x00e2, 0x0103, + 0x00e4, 0x013a, 0x0107, 0x00e7, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x011b, 0x00ed, 0x00ee, 0x010f, + /* 0xf0*/ + 0x0111, 0x0144, 0x0148, 0x00f3, + 0x00f4, 0x0151, 0x00f6, 0x00f7, + 0x0159, 0x016f, 0x00fa, 0x0171, + 0x00fc, 0x00fd, 0x0163, 0x02d9, + }; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */ + }; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc3, 0xe3, 0xa5, 0xb9, 0xc6, 0xe6, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xc5, 0xe5, 0x00, 0x00, 0xbc, 0xbe, 0x00, /* 0x38-0x3f */ + 0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */ + 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */ + 0xd8, 0xf8, 0x8c, 0x9c, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */ + 0x8a, 0x9a, 0xde, 0xfe, 0x8d, 0x9d, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */ + 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8f, 0x9f, 0xaf, 0xbf, 0x8e, 0x9e, 0x00, /* 0x78-0x7f */ + + }; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */ + }; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + }; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + }; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, + }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x00, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xb9, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbe, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ + }; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x00, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa5, 0xaa, 0xbb, 0xbc, 0xbd, 0xbc, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ + }; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1250", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp1250(void) +{ + return register_nls(&table); +} +static void __exit exit_nls_cp1250(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1250) +module_exit(exit_nls_cp1250) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp1251.c b/kernel/fs/nls/nls_cp1251.c new file mode 100644 index 000000000..9273ddfd0 --- /dev/null +++ b/kernel/fs/nls/nls_cp1251.c @@ -0,0 +1,301 @@ +/* + * linux/fs/nls/nls_cp1251.c + * + * Charset cp1251 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0402, 0x0403, 0x201a, 0x0453, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x20ac, 0x2030, 0x0409, 0x2039, + 0x040a, 0x040c, 0x040b, 0x040f, + /* 0x90*/ + 0x0452, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x2122, 0x0459, 0x203a, + 0x045a, 0x045c, 0x045b, 0x045f, + /* 0xa0*/ + 0x00a0, 0x040e, 0x045e, 0x0408, + 0x00a4, 0x0490, 0x00a6, 0x00a7, + 0x0401, 0x00a9, 0x0404, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x0407, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x0406, 0x0456, + 0x0491, 0x00b5, 0x00b6, 0x00b7, + 0x0451, 0x2116, 0x0454, 0x00bb, + 0x0458, 0x0405, 0x0455, 0x0457, + /* 0xc0*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0xd0*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xe0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xf0*/ + 0x0440, 0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xa8, 0x80, 0x81, 0xaa, 0xbd, 0xb2, 0xaf, /* 0x00-0x07 */ + 0xa3, 0x8a, 0x8c, 0x8e, 0x8d, 0x00, 0xa1, 0x8f, /* 0x08-0x0f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x10-0x17 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x18-0x1f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x20-0x27 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x28-0x2f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0x48-0x4f */ + 0x00, 0xb8, 0x90, 0x83, 0xba, 0xbe, 0xb3, 0xbf, /* 0x50-0x57 */ + 0xbc, 0x9a, 0x9c, 0x9e, 0x9d, 0x00, 0xa2, 0x9f, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xa5, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x90, 0x83, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa2, 0xa2, 0xbc, 0xa4, 0xb4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xb8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb3, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x80, 0x81, 0x82, 0x81, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x80, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa1, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb2, 0xa5, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xa8, 0xb9, 0xaa, 0xbb, 0xa3, 0xbd, 0xbd, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1251", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp1251(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp1251(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1251) +module_exit(exit_nls_cp1251) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp1255.c b/kernel/fs/nls/nls_cp1255.c new file mode 100644 index 000000000..1caf5dfed --- /dev/null +++ b/kernel/fs/nls/nls_cp1255.c @@ -0,0 +1,384 @@ +/* + * linux/fs/nls/nls_cp1255.c + * + * Charset cp1255 translation tables. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x20ac, 0x0000, 0x201a, 0x0192, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x02c6, 0x2030, 0x0000, 0x2039, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x02dc, 0x2122, 0x0000, 0x203a, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x20aa, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00d7, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x203e, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00f7, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x05b0, 0x05b1, 0x05b2, 0x05b3, + 0x05b4, 0x05b5, 0x05b6, 0x05b7, + 0x05b8, 0x05b9, 0x0000, 0x05bb, + 0x05bc, 0x05bd, 0x05be, 0x05bf, + /* 0xd0*/ + 0x05c0, 0x05c1, 0x05c2, 0x05c3, + 0x05f0, 0x05f1, 0x05f2, 0x05f3, + 0x05f4, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2017, + /* 0xe0*/ + 0x05d0, 0x05d1, 0x05d2, 0x05d3, + 0x05d4, 0x05d5, 0x05d6, 0x05d7, + 0x05d8, 0x05d9, 0x05da, 0x05db, + 0x05dc, 0x05dd, 0x05de, 0x05df, + /* 0xf0*/ + 0x05e0, 0x05e1, 0x05e2, 0x05e3, + 0x05e4, 0x05e5, 0x05e6, 0x05e7, + 0x05e8, 0x05e9, 0x05ea, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0x00, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xba, /* 0xf0-0xf7 */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char page05[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xb0-0xb7 */ + 0xc8, 0xc9, 0x00, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xb8-0xbf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xd0-0xd7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xd8-0xdf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xe0-0xe7 */ + 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0xfe, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa4, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, page05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1255", + .alias = "iso8859-8", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp1255(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp1255(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1255) +module_exit(exit_nls_cp1255) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(iso8859-8); diff --git a/kernel/fs/nls/nls_cp437.c b/kernel/fs/nls/nls_cp437.c new file mode 100644 index 000000000..7ddb830da --- /dev/null +++ b/kernel/fs/nls/nls_cp437.c @@ -0,0 +1,387 @@ +/* + * linux/fs/nls/nls_cp437.c + * + * Charset cp437 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00a2, + 0x00a3, 0x00a5, 0x20a7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp437", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp437(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp437(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp437) +module_exit(exit_nls_cp437) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp737.c b/kernel/fs/nls/nls_cp737.c new file mode 100644 index 000000000..c593f683a --- /dev/null +++ b/kernel/fs/nls/nls_cp737.c @@ -0,0 +1,350 @@ +/* + * linux/fs/nls/nls_cp737.c + * + * Charset cp737 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0391, 0x0392, 0x0393, 0x0394, + 0x0395, 0x0396, 0x0397, 0x0398, + 0x0399, 0x039a, 0x039b, 0x039c, + 0x039d, 0x039e, 0x039f, 0x03a0, + /* 0x90*/ + 0x03a1, 0x03a3, 0x03a4, 0x03a5, + 0x03a6, 0x03a7, 0x03a8, 0x03a9, + 0x03b1, 0x03b2, 0x03b3, 0x03b4, + 0x03b5, 0x03b6, 0x03b7, 0x03b8, + /* 0xa0*/ + 0x03b9, 0x03ba, 0x03bb, 0x03bc, + 0x03bd, 0x03be, 0x03bf, 0x03c0, + 0x03c1, 0x03c3, 0x03c2, 0x03c4, + 0x03c5, 0x03c6, 0x03c7, 0x03c8, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03c9, 0x03ac, 0x03ad, 0x03ae, + 0x03ca, 0x03af, 0x03cc, 0x03cd, + 0x03cb, 0x03ce, 0x0386, 0x0388, + 0x0389, 0x038a, 0x038c, 0x038e, + /* 0xf0*/ + 0x038f, 0x00b1, 0x2265, 0x2264, + 0x03aa, 0x03ab, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xea, 0x00, /* 0x80-0x87 */ + 0xeb, 0xec, 0xed, 0x00, 0xee, 0x00, 0xef, 0xf0, /* 0x88-0x8f */ + 0x00, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, /* 0x90-0x97 */ + 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, /* 0x98-0x9f */ + 0x8f, 0x90, 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, /* 0xa0-0xa7 */ + 0x96, 0x97, 0xf4, 0xf5, 0xe1, 0xe2, 0xe3, 0xe5, /* 0xa8-0xaf */ + 0x00, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, /* 0xb0-0xb7 */ + 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, /* 0xb8-0xbf */ + 0xa7, 0xa8, 0xaa, 0xa9, 0xab, 0xac, 0xad, 0xae, /* 0xc0-0xc7 */ + 0xaf, 0xe0, 0xe4, 0xe8, 0xe6, 0xe7, 0xe9, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x80-0x87 */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x88-0x8f */ + 0xa8, 0xa9, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xe0, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xe1, 0xe2, 0xe3, 0xe5, 0xe6, 0xe7, /* 0xe8-0xef */ + 0xe9, 0xf1, 0xf2, 0xf3, 0xe4, 0xe8, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x98-0x9f */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa0-0xa7 */ + 0x90, 0x91, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x97, 0xea, 0xeb, 0xec, 0xf4, 0xed, 0xee, 0xef, /* 0xe0-0xe7 */ + 0xf5, 0xf0, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp737", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp737(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp737(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp737) +module_exit(exit_nls_cp737) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp775.c b/kernel/fs/nls/nls_cp775.c new file mode 100644 index 000000000..554c86374 --- /dev/null +++ b/kernel/fs/nls/nls_cp775.c @@ -0,0 +1,319 @@ +/* + * linux/fs/nls/nls_cp775.c + * + * Charset cp775 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0106, 0x00fc, 0x00e9, 0x0101, + 0x00e4, 0x0123, 0x00e5, 0x0107, + 0x0142, 0x0113, 0x0156, 0x0157, + 0x012b, 0x0179, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x014d, + 0x00f6, 0x0122, 0x00a2, 0x015a, + 0x015b, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x00d7, 0x00a4, + /* 0xa0*/ + 0x0100, 0x012a, 0x00f3, 0x017b, + 0x017c, 0x017a, 0x201d, 0x00a6, + 0x00a9, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x0141, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x0104, 0x010c, 0x0118, + 0x0116, 0x2563, 0x2551, 0x2557, + 0x255d, 0x012e, 0x0160, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x0172, 0x016a, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x017d, + /* 0xd0*/ + 0x0105, 0x010d, 0x0119, 0x0117, + 0x012f, 0x0161, 0x0173, 0x016b, + 0x017e, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x014c, 0x0143, + 0x00f5, 0x00d5, 0x00b5, 0x0144, + 0x0136, 0x0137, 0x013b, 0x013c, + 0x0146, 0x0112, 0x0145, 0x2019, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x201c, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x201e, + 0x00b0, 0x2219, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x96, 0x9c, 0x9f, 0x00, 0xa7, 0xf5, /* 0xa0-0xa7 */ + 0x00, 0xa8, 0x00, 0xae, 0xaa, 0xf0, 0xa9, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0x00, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0xfb, 0x00, 0xaf, 0xac, 0xab, 0xf3, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xe0, 0x00, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x84, 0x86, 0x91, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x00, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xa0, 0x83, 0x00, 0x00, 0xb5, 0xd0, 0x80, 0x87, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd1, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0xed, 0x89, 0x00, 0x00, 0xb8, 0xd3, /* 0x10-0x17 */ + 0xb7, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x95, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0xa1, 0x8c, 0x00, 0x00, 0xbd, 0xd4, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xe9, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0xea, 0xeb, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0xad, 0x88, 0xe3, 0xe7, 0xee, 0xec, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xe2, 0x93, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x8b, /* 0x50-0x57 */ + 0x00, 0x00, 0x97, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbe, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xc7, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8d, 0xa5, 0xa3, 0xa4, 0xcf, 0xd8, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xef, 0x00, 0x00, 0xf2, 0xa6, 0xf7, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xa5, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x85, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x83, 0x8c, 0xa2, 0xa4, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x88, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xd0, 0xd1, 0xd2, /* 0xb0-0xb7 */ + 0xd3, 0xb9, 0xba, 0xbb, 0xbc, 0xd4, 0xd5, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xd6, 0xd7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xd8, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0xe7, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe9, 0xe9, 0xeb, 0xeb, 0xec, 0x89, 0xec, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xa0, 0x8e, 0x95, 0x8f, 0x80, /* 0x80-0x87 */ + 0xad, 0xed, 0x8a, 0x8a, 0xa1, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0xe2, 0x99, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xe0, 0xa3, 0xa3, 0x8d, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, 0xc6, 0xc7, /* 0xd0-0xd7 */ + 0xcf, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe3, /* 0xe0-0xe7 */ + 0xe8, 0xe8, 0xea, 0xea, 0xee, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp775", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp775(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp775(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp775) +module_exit(exit_nls_cp775) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp850.c b/kernel/fs/nls/nls_cp850.c new file mode 100644 index 000000000..56cccd14b --- /dev/null +++ b/kernel/fs/nls/nls_cp850.c @@ -0,0 +1,315 @@ +/* + * linux/fs/nls/nls_cp850.c + * + * Charset cp850 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x00d7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x00c0, + 0x00a9, 0x2563, 0x2551, 0x2557, + 0x255d, 0x00a2, 0x00a5, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x00e3, 0x00c3, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x00f0, 0x00d0, 0x00ca, 0x00cb, + 0x00c8, 0x0131, 0x00cd, 0x00ce, + 0x00cf, 0x2518, 0x250c, 0x2588, + 0x2584, 0x00a6, 0x00cc, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x00d2, + 0x00f5, 0x00d5, 0x00b5, 0x00fe, + 0x00de, 0x00da, 0x00db, 0x00d9, + 0x00fd, 0x00dd, 0x00af, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x2017, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0xb8, 0xa6, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0xf7, 0xfb, 0xa7, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */ + 0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */ + 0xd1, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0xed, 0xe8, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0xd0, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0xec, 0xe7, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, /* 0x10-0x17 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0x88, 0x89, 0x8a, 0xd5, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0x8d, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe7, 0xa3, 0x96, 0x97, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */ + 0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0xde, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd2, 0xd3, 0xd4, 0x49, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe8, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xed, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp850", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp850(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp850(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp850) +module_exit(exit_nls_cp850) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp852.c b/kernel/fs/nls/nls_cp852.c new file mode 100644 index 000000000..7cdc05ac1 --- /dev/null +++ b/kernel/fs/nls/nls_cp852.c @@ -0,0 +1,337 @@ +/* + * linux/fs/nls/nls_cp852.c + * + * Charset cp852 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x016f, 0x0107, 0x00e7, + 0x0142, 0x00eb, 0x0150, 0x0151, + 0x00ee, 0x0179, 0x00c4, 0x0106, + /* 0x90*/ + 0x00c9, 0x0139, 0x013a, 0x00f4, + 0x00f6, 0x013d, 0x013e, 0x015a, + 0x015b, 0x00d6, 0x00dc, 0x0164, + 0x0165, 0x0141, 0x00d7, 0x010d, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x0104, 0x0105, 0x017d, 0x017e, + 0x0118, 0x0119, 0x00ac, 0x017a, + 0x010c, 0x015f, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x011a, + 0x015e, 0x2563, 0x2551, 0x2557, + 0x255d, 0x017b, 0x017c, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x0102, 0x0103, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x0111, 0x0110, 0x010e, 0x00cb, + 0x010f, 0x0147, 0x00cd, 0x00ce, + 0x011b, 0x2518, 0x250c, 0x2588, + 0x2584, 0x0162, 0x016e, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x0143, + 0x0144, 0x0148, 0x0160, 0x0161, + 0x0154, 0x00da, 0x0155, 0x0170, + 0x00fd, 0x00dd, 0x0163, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x02dd, 0x02db, 0x02c7, + 0x02d8, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x02d9, 0x0171, + 0x0158, 0x0159, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0x00, 0x00, 0xae, 0xaa, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xf7, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xb5, 0xb6, 0x00, 0x8e, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0xd3, 0x00, 0xd6, 0xd7, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xe0, 0xe2, 0x00, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xe9, 0x00, 0x9a, 0xed, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0xa0, 0x83, 0x00, 0x84, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x00, 0x82, 0x00, 0x89, 0x00, 0xa1, 0x8c, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xa3, 0x00, 0x81, 0xec, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc6, 0xc7, 0xa4, 0xa5, 0x8f, 0x86, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x9f, 0xd2, 0xd4, /* 0x08-0x0f */ + 0xd1, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xa8, 0xa9, 0xb7, 0xd8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x00, /* 0x38-0x3f */ + 0x00, 0x9d, 0x88, 0xe3, 0xe4, 0x00, 0x00, 0xd5, /* 0x40-0x47 */ + 0xe5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x8a, 0x8b, 0x00, 0x00, 0xe8, 0xea, 0x00, 0x00, /* 0x50-0x57 */ + 0xfc, 0xfd, 0x97, 0x98, 0x00, 0x00, 0xb8, 0xad, /* 0x58-0x5f */ + 0xe6, 0xe7, 0xdd, 0xee, 0x9b, 0x9c, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x85, /* 0x68-0x6f */ + 0xeb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8d, 0xab, 0xbd, 0xbe, 0xa6, 0xa7, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf3, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xf4, 0xfa, 0x00, 0xf2, 0x00, 0xf1, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xab, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x92, 0x92, 0x93, 0x94, 0x96, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9c, 0x9c, 0x88, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa9, 0xa9, 0xaa, 0xab, 0x9f, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0xd8, /* 0xb0-0xb7 */ + 0xad, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0xd4, 0x89, 0xd4, 0xe5, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xee, 0x85, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0xe4, 0xe4, 0xe5, 0xe7, 0xe7, /* 0xe0-0xe7 */ + 0xea, 0xa3, 0xea, 0xfb, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfd, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xde, 0x8f, 0x80, /* 0x80-0x87 */ + 0x9d, 0xd3, 0x8a, 0x8a, 0xd7, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x91, 0xe2, 0x99, 0x95, 0x95, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9b, 0x9b, 0x9d, 0x9e, 0xac, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa8, 0xaa, 0x8d, 0xac, 0xb8, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd2, 0xd3, 0xd2, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xb7, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe3, 0xd5, 0xe6, 0xe6, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xe8, 0xeb, 0xed, 0xed, 0xdd, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xeb, 0xfc, 0xfc, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp852", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp852(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp852(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp852) +module_exit(exit_nls_cp852) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp855.c b/kernel/fs/nls/nls_cp855.c new file mode 100644 index 000000000..7426eea05 --- /dev/null +++ b/kernel/fs/nls/nls_cp855.c @@ -0,0 +1,299 @@ +/* + * linux/fs/nls/nls_cp855.c + * + * Charset cp855 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0452, 0x0402, 0x0453, 0x0403, + 0x0451, 0x0401, 0x0454, 0x0404, + 0x0455, 0x0405, 0x0456, 0x0406, + 0x0457, 0x0407, 0x0458, 0x0408, + /* 0x90*/ + 0x0459, 0x0409, 0x045a, 0x040a, + 0x045b, 0x040b, 0x045c, 0x040c, + 0x045e, 0x040e, 0x045f, 0x040f, + 0x044e, 0x042e, 0x044a, 0x042a, + /* 0xa0*/ + 0x0430, 0x0410, 0x0431, 0x0411, + 0x0446, 0x0426, 0x0434, 0x0414, + 0x0435, 0x0415, 0x0444, 0x0424, + 0x0433, 0x0413, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x0445, 0x0425, 0x0438, + 0x0418, 0x2563, 0x2551, 0x2557, + 0x255d, 0x0439, 0x0419, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x043a, 0x041a, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x043b, 0x041b, 0x043c, 0x041c, + 0x043d, 0x041d, 0x043e, 0x041e, + 0x043f, 0x2518, 0x250c, 0x2588, + 0x2584, 0x041f, 0x044f, 0x2580, + /* 0xe0*/ + 0x042f, 0x0440, 0x0420, 0x0441, + 0x0421, 0x0442, 0x0422, 0x0443, + 0x0423, 0x0436, 0x0416, 0x0432, + 0x0412, 0x044c, 0x042c, 0x2116, + /* 0xf0*/ + 0x00ad, 0x044b, 0x042b, 0x0437, + 0x0417, 0x0448, 0x0428, 0x044d, + 0x042d, 0x0449, 0x0429, 0x0447, + 0x0427, 0x00a7, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0xae, 0x00, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page04[256] = { + 0x00, 0x85, 0x81, 0x83, 0x87, 0x89, 0x8b, 0x8d, /* 0x00-0x07 */ + 0x8f, 0x91, 0x93, 0x95, 0x97, 0x00, 0x99, 0x9b, /* 0x08-0x0f */ + 0xa1, 0xa3, 0xec, 0xad, 0xa7, 0xa9, 0xea, 0xf4, /* 0x10-0x17 */ + 0xb8, 0xbe, 0xc7, 0xd1, 0xd3, 0xd5, 0xd7, 0xdd, /* 0x18-0x1f */ + 0xe2, 0xe4, 0xe6, 0xe8, 0xab, 0xb6, 0xa5, 0xfc, /* 0x20-0x27 */ + 0xf6, 0xfa, 0x9f, 0xf2, 0xee, 0xf8, 0x9d, 0xe0, /* 0x28-0x2f */ + 0xa0, 0xa2, 0xeb, 0xac, 0xa6, 0xa8, 0xe9, 0xf3, /* 0x30-0x37 */ + 0xb7, 0xbd, 0xc6, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, /* 0x38-0x3f */ + 0xe1, 0xe3, 0xe5, 0xe7, 0xaa, 0xb5, 0xa4, 0xfb, /* 0x40-0x47 */ + 0xf5, 0xf9, 0x9e, 0xf1, 0xed, 0xf7, 0x9c, 0xde, /* 0x48-0x4f */ + 0x00, 0x84, 0x80, 0x82, 0x86, 0x88, 0x8a, 0x8c, /* 0x50-0x57 */ + 0x8e, 0x90, 0x92, 0x94, 0x96, 0x00, 0x98, 0x9a, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x80, 0x82, 0x82, 0x84, 0x84, 0x86, 0x86, /* 0x80-0x87 */ + 0x88, 0x88, 0x8a, 0x8a, 0x8c, 0x8c, 0x8e, 0x8e, /* 0x88-0x8f */ + 0x90, 0x90, 0x92, 0x92, 0x94, 0x94, 0x96, 0x96, /* 0x90-0x97 */ + 0x98, 0x98, 0x9a, 0x9a, 0x9c, 0x9c, 0x9e, 0x9e, /* 0x98-0x9f */ + 0xa0, 0xa0, 0xa2, 0xa2, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa8, 0xaa, 0xaa, 0xac, 0xac, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb5, 0xb7, /* 0xb0-0xb7 */ + 0xb7, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0xd2, 0xd2, 0xd4, 0xd4, 0xd6, 0xd6, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xd8, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xde, 0xe1, 0xe1, 0xe3, 0xe3, 0xe5, 0xe5, 0xe7, /* 0xe0-0xe7 */ + 0xe7, 0xe9, 0xe9, 0xeb, 0xeb, 0xed, 0xed, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, /* 0xf0-0xf7 */ + 0xf7, 0xf9, 0xf9, 0xfb, 0xfb, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x81, 0x81, 0x83, 0x83, 0x85, 0x85, 0x87, 0x87, /* 0x80-0x87 */ + 0x89, 0x89, 0x8b, 0x8b, 0x8d, 0x8d, 0x8f, 0x8f, /* 0x88-0x8f */ + 0x91, 0x91, 0x93, 0x93, 0x95, 0x95, 0x97, 0x97, /* 0x90-0x97 */ + 0x99, 0x99, 0x9b, 0x9b, 0x9d, 0x9d, 0x9f, 0x9f, /* 0x98-0x9f */ + 0xa1, 0xa1, 0xa3, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa9, 0xa9, 0xab, 0xab, 0xad, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb6, 0xb8, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd3, 0xd3, 0xd5, 0xd5, 0xd7, 0xd7, /* 0xd0-0xd7 */ + 0xdd, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xe0, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe2, 0xe2, 0xe4, 0xe4, 0xe6, 0xe6, 0xe8, /* 0xe0-0xe7 */ + 0xe8, 0xea, 0xea, 0xec, 0xec, 0xee, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, 0xf8, /* 0xf0-0xf7 */ + 0xf8, 0xfa, 0xfa, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp855", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp855(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp855(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp855) +module_exit(exit_nls_cp855) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp857.c b/kernel/fs/nls/nls_cp857.c new file mode 100644 index 000000000..098309733 --- /dev/null +++ b/kernel/fs/nls/nls_cp857.c @@ -0,0 +1,301 @@ +/* + * linux/fs/nls/nls_cp857.c + * + * Charset cp857 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x0131, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x0130, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x015e, 0x015f, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x011e, 0x011f, + 0x00bf, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x00c0, + 0x00a9, 0x2563, 0x2551, 0x2557, + 0x255d, 0x00a2, 0x00a5, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x00e3, 0x00c3, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x00ba, 0x00aa, 0x00ca, 0x00cb, + 0x00c8, 0x0000, 0x00cd, 0x00ce, + 0x00cf, 0x2518, 0x250c, 0x2588, + 0x2584, 0x00a6, 0x00cc, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x00d2, + 0x00f5, 0x00d5, 0x00b5, 0x0000, + 0x00d7, 0x00da, 0x00db, 0x00d9, + 0x00ec, 0x00ff, 0x00af, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x0000, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0xb8, 0xd1, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0xf7, 0xfb, 0xd0, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */ + 0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */ + 0x00, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0xe8, /* 0xd0-0xd7 */ + 0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0xec, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0xed, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0xa7, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x98, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, 0x9f, /* 0x58-0x5f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x69, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9f, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0x88, 0x89, 0x8a, 0x00, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xec, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xa3, 0x96, 0x97, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */ + 0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0x49, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9e, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xde, 0x00, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp857", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp857(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp857(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp857) +module_exit(exit_nls_cp857) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp860.c b/kernel/fs/nls/nls_cp860.c new file mode 100644 index 000000000..84224478e --- /dev/null +++ b/kernel/fs/nls/nls_cp860.c @@ -0,0 +1,364 @@ +/* + * linux/fs/nls/nls_cp860.c + * + * Charset cp860 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e3, 0x00e0, 0x00c1, 0x00e7, + 0x00ea, 0x00ca, 0x00e8, 0x00cd, + 0x00d4, 0x00ec, 0x00c3, 0x00c2, + /* 0x90*/ + 0x00c9, 0x00c0, 0x00c8, 0x00f4, + 0x00f5, 0x00f2, 0x00da, 0x00f9, + 0x00cc, 0x00d5, 0x00dc, 0x00a2, + 0x00a3, 0x00d9, 0x20a7, 0x00d3, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x00d2, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x91, 0x86, 0x8f, 0x8e, 0x00, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x92, 0x90, 0x89, 0x00, 0x98, 0x8b, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0xa9, 0x9f, 0x8c, 0x99, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x9d, 0x96, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x84, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x00, 0x8d, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x94, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0xa0, 0x87, /* 0x80-0x87 */ + 0x88, 0x88, 0x8a, 0xa1, 0x93, 0x8d, 0x84, 0x83, /* 0x88-0x8f */ + 0x82, 0x85, 0x8a, 0x93, 0x94, 0x95, 0xa3, 0x97, /* 0x90-0x97 */ + 0x8d, 0x94, 0x81, 0x9b, 0x9c, 0x97, 0x9e, 0xa2, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x95, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x8f, 0x8e, 0x91, 0x86, 0x80, /* 0x80-0x87 */ + 0x89, 0x89, 0x92, 0x8b, 0x8c, 0x98, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x8c, 0x99, 0xa9, 0x96, 0x9d, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x86, 0x8b, 0x9f, 0x96, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp860", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp860(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp860(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp860) +module_exit(exit_nls_cp860) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp861.c b/kernel/fs/nls/nls_cp861.c new file mode 100644 index 000000000..dc873e4be --- /dev/null +++ b/kernel/fs/nls/nls_cp861.c @@ -0,0 +1,387 @@ +/* + * linux/fs/nls/nls_cp861.c + * + * Charset cp861 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00d0, + 0x00f0, 0x00de, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00fe, 0x00fb, 0x00dd, + 0x00fd, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x20a7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00c1, 0x00cd, 0x00d3, 0x00da, + 0x00bf, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0xa4, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, /* 0xc8-0xcf */ + 0x8b, 0x00, 0x00, 0xa6, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0xa7, 0x00, 0x9a, 0x97, 0x8d, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x00, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x8c, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x00, 0xa3, 0x96, 0x81, 0x98, 0x95, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8c, 0x8c, 0x95, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa0, 0xa1, 0xa2, 0xa3, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x8b, 0x8b, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x8d, 0x00, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa4, 0xa5, 0xa6, 0xa7, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp861", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp861(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp861(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp861) +module_exit(exit_nls_cp861) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp862.c b/kernel/fs/nls/nls_cp862.c new file mode 100644 index 000000000..d5263e3c5 --- /dev/null +++ b/kernel/fs/nls/nls_cp862.c @@ -0,0 +1,421 @@ +/* + * linux/fs/nls/nls_cp862.c + * + * Charset cp862 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x05d0, 0x05d1, 0x05d2, 0x05d3, + 0x05d4, 0x05d5, 0x05d6, 0x05d7, + 0x05d8, 0x05d9, 0x05da, 0x05db, + 0x05dc, 0x05dd, 0x05de, 0x05df, + /* 0x90*/ + 0x05e0, 0x05e1, 0x05e2, 0x05e3, + 0x05e4, 0x05e5, 0x05e6, 0x05e7, + 0x05e8, 0x05e9, 0x05ea, 0x00a2, + 0x00a3, 0x00a5, 0x20a7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0xa4, 0x00, 0xa2, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page05[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0xd0-0xd7 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xd8-0xdf */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0xe0-0xe7 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, page05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp862", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp862(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp862(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp862) +module_exit(exit_nls_cp862) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp863.c b/kernel/fs/nls/nls_cp863.c new file mode 100644 index 000000000..051c9832e --- /dev/null +++ b/kernel/fs/nls/nls_cp863.c @@ -0,0 +1,381 @@ +/* + * linux/fs/nls/nls_cp863.c + * + * Charset cp863 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00c2, 0x00e0, 0x00b6, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x2017, 0x00c0, 0x00a7, + /* 0x90*/ + 0x00c9, 0x00c8, 0x00ca, 0x00f4, + 0x00cb, 0x00cf, 0x00fb, 0x00f9, + 0x00a4, 0x00d4, 0x00dc, 0x00a2, + 0x00a3, 0x00d9, 0x00db, 0x0192, + /* 0xa0*/ + 0x00a6, 0x00b4, 0x00f3, 0x00fa, + 0x00a8, 0x00b8, 0x00b3, 0x00af, + 0x00ce, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00be, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x9b, 0x9c, 0x98, 0x00, 0xa0, 0x8f, /* 0xa0-0xa7 */ + 0xa4, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0xa7, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xa6, 0xa1, 0xe6, 0x86, 0xfa, /* 0xb0-0xb7 */ + 0xa5, 0x00, 0x00, 0xaf, 0xac, 0xab, 0xad, 0x00, /* 0xb8-0xbf */ + 0x8e, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x91, 0x90, 0x92, 0x94, 0x00, 0x00, 0xa8, 0x95, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x9d, 0x00, 0x9e, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x00, 0x00, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8d, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x83, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x85, 0x8f, /* 0x88-0x8f */ + 0x82, 0x8a, 0x88, 0x93, 0x89, 0x8b, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x93, 0x81, 0x9b, 0x9c, 0x97, 0x96, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x8c, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x84, 0x84, 0x8e, 0x86, 0x80, /* 0x80-0x87 */ + 0x92, 0x94, 0x91, 0x95, 0xa8, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x99, 0x94, 0x95, 0x9e, 0x9d, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0x00, 0x00, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp863", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp863(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp863(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp863) +module_exit(exit_nls_cp863) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp864.c b/kernel/fs/nls/nls_cp864.c new file mode 100644 index 000000000..97eb1273b --- /dev/null +++ b/kernel/fs/nls/nls_cp864.c @@ -0,0 +1,407 @@ +/* + * linux/fs/nls/nls_cp864.c + * + * Charset cp864 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x066a, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00b0, 0x00b7, 0x2219, 0x221a, + 0x2592, 0x2500, 0x2502, 0x253c, + 0x2524, 0x252c, 0x251c, 0x2534, + 0x2510, 0x250c, 0x2514, 0x2518, + /* 0x90*/ + 0x03b2, 0x221e, 0x03c6, 0x00b1, + 0x00bd, 0x00bc, 0x2248, 0x00ab, + 0x00bb, 0xfef7, 0xfef8, 0x0000, + 0x0000, 0xfefb, 0xfefc, 0x0000, + /* 0xa0*/ + 0x00a0, 0x00ad, 0xfe82, 0x00a3, + 0x00a4, 0xfe84, 0x0000, 0x0000, + 0xfe8e, 0xfe8f, 0xfe95, 0xfe99, + 0x060c, 0xfe9d, 0xfea1, 0xfea5, + /* 0xb0*/ + 0x0660, 0x0661, 0x0662, 0x0663, + 0x0664, 0x0665, 0x0666, 0x0667, + 0x0668, 0x0669, 0xfed1, 0x061b, + 0xfeb1, 0xfeb5, 0xfeb9, 0x061f, + /* 0xc0*/ + 0x00a2, 0xfe80, 0xfe81, 0xfe83, + 0xfe85, 0xfeca, 0xfe8b, 0xfe8d, + 0xfe91, 0xfe93, 0xfe97, 0xfe9b, + 0xfe9f, 0xfea3, 0xfea7, 0xfea9, + /* 0xd0*/ + 0xfeab, 0xfead, 0xfeaf, 0xfeb3, + 0xfeb7, 0xfebb, 0xfebf, 0xfec1, + 0xfec5, 0xfecb, 0xfecf, 0x00a6, + 0x00ac, 0x00f7, 0x00d7, 0xfec9, + /* 0xe0*/ + 0x0640, 0xfed3, 0xfed7, 0xfedb, + 0xfedf, 0xfee3, 0xfee7, 0xfeeb, + 0xfeed, 0xfeef, 0xfef3, 0xfebd, + 0xfecc, 0xfece, 0xfecd, 0xfee1, + /* 0xf0*/ + 0xfe7d, 0x0651, 0xfee5, 0xfee9, + 0xfeec, 0xfef0, 0xfef2, 0xfed0, + 0xfed5, 0xfef5, 0xfef6, 0xfedd, + 0xfed9, 0xfef1, 0x25a0, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x00, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xc0, 0xa3, 0xa4, 0x00, 0xdb, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x97, 0xdc, 0xa1, 0x00, 0x00, /* 0xa8-0xaf */ + 0x80, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x98, 0x95, 0x94, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdd, /* 0xf0-0xf7 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page06[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x60-0x67 */ + 0xb8, 0xb9, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x82, 0x83, 0x00, 0x00, 0x00, 0x91, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ +}; + +static const unsigned char page25[256] = { + 0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x8c, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x8f, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char pagefe[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, /* 0x78-0x7f */ + + 0xc1, 0xc2, 0xa2, 0xc3, 0xa5, 0xc4, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0xc6, 0x00, 0xc7, 0xa8, 0xa9, /* 0x88-0x8f */ + 0x00, 0xc8, 0x00, 0xc9, 0x00, 0xaa, 0x00, 0xca, /* 0x90-0x97 */ + 0x00, 0xab, 0x00, 0xcb, 0x00, 0xad, 0x00, 0xcc, /* 0x98-0x9f */ + 0x00, 0xae, 0x00, 0xcd, 0x00, 0xaf, 0x00, 0xce, /* 0xa0-0xa7 */ + 0x00, 0xcf, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0xd2, /* 0xa8-0xaf */ + 0x00, 0xbc, 0x00, 0xd3, 0x00, 0xbd, 0x00, 0xd4, /* 0xb0-0xb7 */ + 0x00, 0xbe, 0x00, 0xd5, 0x00, 0xeb, 0x00, 0xd6, /* 0xb8-0xbf */ + 0x00, 0xd7, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xdf, 0xc5, 0xd9, 0xec, 0xee, 0xed, 0xda, /* 0xc8-0xcf */ + 0xf7, 0xba, 0x00, 0xe1, 0x00, 0xf8, 0x00, 0xe2, /* 0xd0-0xd7 */ + 0x00, 0xfc, 0x00, 0xe3, 0x00, 0xfb, 0x00, 0xe4, /* 0xd8-0xdf */ + 0x00, 0xef, 0x00, 0xe5, 0x00, 0xf2, 0x00, 0xe6, /* 0xe0-0xe7 */ + 0x00, 0xf3, 0x00, 0xe7, 0xf4, 0xe8, 0x00, 0xe9, /* 0xe8-0xef */ + 0xf5, 0xfd, 0xf6, 0xea, 0x00, 0xf9, 0xfa, 0x99, /* 0xf0-0xf7 */ + 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, page06, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, pagefe, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x00, 0x91, 0x00, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp864", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp864(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp864(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp864) +module_exit(exit_nls_cp864) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp865.c b/kernel/fs/nls/nls_cp865.c new file mode 100644 index 000000000..111214228 --- /dev/null +++ b/kernel/fs/nls/nls_cp865.c @@ -0,0 +1,387 @@ +/* + * linux/fs/nls/nls_cp865.c + * + * Charset cp865 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x20a7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00a4, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x00, 0x9c, 0xaf, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0x00, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp865", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp865(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp865(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp865) +module_exit(exit_nls_cp865) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp866.c b/kernel/fs/nls/nls_cp866.c new file mode 100644 index 000000000..ffdcbc3fc --- /dev/null +++ b/kernel/fs/nls/nls_cp866.c @@ -0,0 +1,305 @@ +/* + * linux/fs/nls/nls_cp866.c + * + * Charset cp866 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0x90*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xa0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x0440, 0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, + /* 0xf0*/ + 0x0401, 0x0451, 0x0404, 0x0454, + 0x0407, 0x0457, 0x040e, 0x045e, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x2116, 0x00a4, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xf0, 0x00, 0x00, 0xf2, 0x00, 0x00, 0xf4, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0x00, /* 0x08-0x0f */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x10-0x17 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x18-0x1f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x20-0x27 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x28-0x2f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x30-0x37 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0x00, 0xf1, 0x00, 0x00, 0xf3, 0x00, 0x00, 0xf5, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf7, 0x00, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x80-0x87 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x88-0x8f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x90-0x97 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0xa0-0xa7 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0xe0-0xe7 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0xe8-0xef */ + 0xf0, 0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp866", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp866(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp866(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp866) +module_exit(exit_nls_cp866) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp869.c b/kernel/fs/nls/nls_cp869.c new file mode 100644 index 000000000..3b5a34589 --- /dev/null +++ b/kernel/fs/nls/nls_cp869.c @@ -0,0 +1,315 @@ +/* + * linux/fs/nls/nls_cp869.c + * + * Charset cp869 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0386, 0x0000, + 0x00b7, 0x00ac, 0x00a6, 0x2018, + 0x2019, 0x0388, 0x2015, 0x0389, + /* 0x90*/ + 0x038a, 0x03aa, 0x038c, 0x0000, + 0x0000, 0x038e, 0x03ab, 0x00a9, + 0x038f, 0x00b2, 0x00b3, 0x03ac, + 0x00a3, 0x03ad, 0x03ae, 0x03af, + /* 0xa0*/ + 0x03ca, 0x0390, 0x03cc, 0x03cd, + 0x0391, 0x0392, 0x0393, 0x0394, + 0x0395, 0x0396, 0x0397, 0x00bd, + 0x0398, 0x0399, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x039a, 0x039b, 0x039c, + 0x039d, 0x2563, 0x2551, 0x2557, + 0x255d, 0x039e, 0x039f, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x03a0, 0x03a1, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x03a3, + /* 0xd0*/ + 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03b1, 0x03b2, + 0x03b3, 0x2518, 0x250c, 0x2588, + 0x2584, 0x03b4, 0x03b5, 0x2580, + /* 0xe0*/ + 0x03b6, 0x03b7, 0x03b8, 0x03b9, + 0x03ba, 0x03bb, 0x03bc, 0x03bd, + 0x03be, 0x03bf, 0x03c0, 0x03c1, + 0x03c3, 0x03c2, 0x03c4, 0x0384, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x03c5, 0x03c6, + 0x03c7, 0x00a7, 0x03c8, 0x0385, + 0x00b0, 0x00a8, 0x03c9, 0x03cb, + 0x03b0, 0x03ce, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x8a, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0x97, 0x00, 0xae, 0x89, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x88, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0x00, 0xab, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0xef, 0xf7, 0x86, 0x00, /* 0x80-0x87 */ + 0x8d, 0x8f, 0x90, 0x00, 0x92, 0x00, 0x95, 0x98, /* 0x88-0x8f */ + 0xa1, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, /* 0x90-0x97 */ + 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, /* 0x98-0x9f */ + 0xc6, 0xc7, 0x00, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, /* 0xa0-0xa7 */ + 0xd4, 0xd5, 0x91, 0x96, 0x9b, 0x9d, 0x9e, 0x9f, /* 0xa8-0xaf */ + 0xfc, 0xd6, 0xd7, 0xd8, 0xdd, 0xde, 0xe0, 0xe1, /* 0xb0-0xb7 */ + 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, /* 0xb8-0xbf */ + 0xea, 0xeb, 0xed, 0xec, 0xee, 0xf2, 0xf3, 0xf4, /* 0xc0-0xc7 */ + 0xf6, 0xfa, 0xa0, 0xfb, 0xa2, 0xa3, 0xfd, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, /* 0x10-0x17 */ + 0x8b, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9b, 0x00, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x9d, 0x8e, 0x9e, /* 0x88-0x8f */ + 0x9f, 0xa0, 0xa2, 0x00, 0x00, 0xa3, 0xfb, 0x97, /* 0x90-0x97 */ + 0xfd, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xd6, 0xd7, 0xd8, 0xdd, /* 0xa0-0xa7 */ + 0xde, 0xe0, 0xe1, 0xab, 0xe2, 0xe3, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xe4, 0xe5, 0xe6, /* 0xb0-0xb7 */ + 0xe7, 0xb9, 0xba, 0xbb, 0xbc, 0xe8, 0xe9, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xea, 0xeb, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xec, /* 0xc8-0xcf */ + 0xee, 0xf2, 0xf3, 0xf4, 0xf6, 0xfa, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x86, 0x9c, 0x8d, 0x8f, 0x90, /* 0x98-0x9f */ + 0x91, 0xa1, 0x92, 0x95, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xa4, 0xa5, /* 0xd0-0xd7 */ + 0xa6, 0xd9, 0xda, 0xdb, 0xdc, 0xa7, 0xa8, 0xdf, /* 0xd8-0xdf */ + 0xa9, 0xaa, 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, /* 0xe0-0xe7 */ + 0xbd, 0xbe, 0xc6, 0xc7, 0xcf, 0xcf, 0xd0, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xd1, 0xd2, 0xd3, 0xf5, 0xd4, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xd5, 0x96, 0xfc, 0x98, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp869", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp869(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp869(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp869) +module_exit(exit_nls_cp869) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_cp874.c b/kernel/fs/nls/nls_cp874.c new file mode 100644 index 000000000..8dfaa1071 --- /dev/null +++ b/kernel/fs/nls/nls_cp874.c @@ -0,0 +1,275 @@ +/* + * linux/fs/nls/nls_cp874.c + * + * Charset cp874 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2026, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xa0*/ + 0x00a0, 0x0e01, 0x0e02, 0x0e03, + 0x0e04, 0x0e05, 0x0e06, 0x0e07, + 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, + 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, + /* 0xb0*/ + 0x0e10, 0x0e11, 0x0e12, 0x0e13, + 0x0e14, 0x0e15, 0x0e16, 0x0e17, + 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, + 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, + /* 0xc0*/ + 0x0e20, 0x0e21, 0x0e22, 0x0e23, + 0x0e24, 0x0e25, 0x0e26, 0x0e27, + 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, + 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, + /* 0xd0*/ + 0x0e30, 0x0e31, 0x0e32, 0x0e33, + 0x0e34, 0x0e35, 0x0e36, 0x0e37, + 0x0e38, 0x0e39, 0x0e3a, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0e3f, + /* 0xe0*/ + 0x0e40, 0x0e41, 0x0e42, 0x0e43, + 0x0e44, 0x0e45, 0x0e46, 0x0e47, + 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, + 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, + /* 0xf0*/ + 0x0e50, 0x0e51, 0x0e52, 0x0e53, + 0x0e54, 0x0e55, 0x0e56, 0x0e57, + 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char page0e[256] = { + 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x08-0x0f */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x00, 0x00, 0x93, 0x94, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, page0e, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp874", + .alias = "tis-620", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp874(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp874(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp874) +module_exit(exit_nls_cp874) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(tis-620); diff --git a/kernel/fs/nls/nls_cp932.c b/kernel/fs/nls/nls_cp932.c new file mode 100644 index 000000000..67b7398e8 --- /dev/null +++ b/kernel/fs/nls/nls_cp932.c @@ -0,0 +1,7933 @@ +/* + * linux/fs/nls/nls_cp932.c + * + * Charset cp932 translation tables. + * This translation table was generated automatically, the + * original table can be download from the Microsoft website. + * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include +#include +#include +#include +#include + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3000,0x3001,0x3002,0xFF0C,0xFF0E,0x30FB,0xFF1A,0xFF1B,/* 0x40-0x47 */ + 0xFF1F,0xFF01,0x309B,0x309C,0x00B4,0xFF40,0x00A8,0xFF3E,/* 0x48-0x4F */ + 0xFFE3,0xFF3F,0x30FD,0x30FE,0x309D,0x309E,0x3003,0x4EDD,/* 0x50-0x57 */ + 0x3005,0x3006,0x3007,0x30FC,0x2015,0x2010,0xFF0F,0xFF3C,/* 0x58-0x5F */ + 0xFF5E,0x2225,0xFF5C,0x2026,0x2025,0x2018,0x2019,0x201C,/* 0x60-0x67 */ + 0x201D,0xFF08,0xFF09,0x3014,0x3015,0xFF3B,0xFF3D,0xFF5B,/* 0x68-0x6F */ + 0xFF5D,0x3008,0x3009,0x300A,0x300B,0x300C,0x300D,0x300E,/* 0x70-0x77 */ + 0x300F,0x3010,0x3011,0xFF0B,0xFF0D,0x00B1,0x00D7,0x0000,/* 0x78-0x7F */ + + 0x00F7,0xFF1D,0x2260,0xFF1C,0xFF1E,0x2266,0x2267,0x221E,/* 0x80-0x87 */ + 0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFFE5,/* 0x88-0x8F */ + 0xFF04,0xFFE0,0xFFE1,0xFF05,0xFF03,0xFF06,0xFF0A,0xFF20,/* 0x90-0x97 */ + 0x00A7,0x2606,0x2605,0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,/* 0x98-0x9F */ + 0x25A1,0x25A0,0x25B3,0x25B2,0x25BD,0x25BC,0x203B,0x3012,/* 0xA0-0xA7 */ + 0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB0-0xB7 */ + 0x2208,0x220B,0x2286,0x2287,0x2282,0x2283,0x222A,0x2229,/* 0xB8-0xBF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x2227,0x2228,0xFFE2,0x21D2,0x21D4,0x2200,0x2203,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD0-0xD7 */ + 0x0000,0x0000,0x2220,0x22A5,0x2312,0x2202,0x2207,0x2261,/* 0xD8-0xDF */ + 0x2252,0x226A,0x226B,0x221A,0x223D,0x221D,0x2235,0x222B,/* 0xE0-0xE7 */ + 0x222C,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */ + 0x212B,0x2030,0x266F,0x266D,0x266A,0x2020,0x2021,0x00B6,/* 0xF0-0xF7 */ + 0x0000,0x0000,0x0000,0x0000,0x25EF,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xFF10,/* 0x48-0x4F */ + 0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,0xFF18,/* 0x50-0x57 */ + 0xFF19,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,0xFF28,/* 0x60-0x67 */ + 0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,0xFF30,/* 0x68-0x6F */ + 0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,0xFF38,/* 0x70-0x77 */ + 0xFF39,0xFF3A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0x80-0x87 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0x88-0x8F */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0x90-0x97 */ + 0xFF58,0xFF59,0xFF5A,0x0000,0x0000,0x0000,0x0000,0x3041,/* 0x98-0x9F */ + 0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,0x3048,0x3049,/* 0xA0-0xA7 */ + 0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,0x3050,0x3051,/* 0xA8-0xAF */ + 0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,0x3058,0x3059,/* 0xB0-0xB7 */ + 0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,0x3060,0x3061,/* 0xB8-0xBF */ + 0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,0x3068,0x3069,/* 0xC0-0xC7 */ + 0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,0x3070,0x3071,/* 0xC8-0xCF */ + 0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,0x3078,0x3079,/* 0xD0-0xD7 */ + 0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,0x3080,0x3081,/* 0xD8-0xDF */ + 0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,0x3088,0x3089,/* 0xE0-0xE7 */ + 0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,0x3090,0x3091,/* 0xE8-0xEF */ + 0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,0x30A8,/* 0x40-0x47 */ + 0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,0x30B0,/* 0x48-0x4F */ + 0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,0x30B8,/* 0x50-0x57 */ + 0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,0x30C0,/* 0x58-0x5F */ + 0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,0x30C8,/* 0x60-0x67 */ + 0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,0x30D0,/* 0x68-0x6F */ + 0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,0x30D8,/* 0x70-0x77 */ + 0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,0x0000,/* 0x78-0x7F */ + + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0x80-0x87 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0x88-0x8F */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0391,/* 0x98-0x9F */ + 0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,0x0398,0x0399,/* 0xA0-0xA7 */ + 0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,0x03A0,0x03A1,/* 0xA8-0xAF */ + 0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,0x03A9,0x0000,/* 0xB0-0xB7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x03B1,/* 0xB8-0xBF */ + 0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,0x03B8,0x03B9,/* 0xC0-0xC7 */ + 0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,0x03C0,0x03C1,/* 0xC8-0xCF */ + 0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,0x03C9,0x0000,/* 0xD0-0xD7 */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,0x0416,/* 0x40-0x47 */ + 0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,/* 0x48-0x4F */ + 0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,/* 0x50-0x57 */ + 0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,/* 0x58-0x5F */ + 0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,0x0436,/* 0x70-0x77 */ + 0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x0000,/* 0x78-0x7F */ + + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0x80-0x87 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0x88-0x8F */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x2500,/* 0x98-0x9F */ + 0x2502,0x250C,0x2510,0x2518,0x2514,0x251C,0x252C,0x2524,/* 0xA0-0xA7 */ + 0x2534,0x253C,0x2501,0x2503,0x250F,0x2513,0x251B,0x2517,/* 0xA8-0xAF */ + 0x2523,0x2533,0x252B,0x253B,0x254B,0x2520,0x252F,0x2528,/* 0xB0-0xB7 */ + 0x2537,0x253F,0x251D,0x2530,0x2525,0x2538,0x2542,0x0000,/* 0xB8-0xBF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,0x2467,/* 0x40-0x47 */ + 0x2468,0x2469,0x246A,0x246B,0x246C,0x246D,0x246E,0x246F,/* 0x48-0x4F */ + 0x2470,0x2471,0x2472,0x2473,0x2160,0x2161,0x2162,0x2163,/* 0x50-0x57 */ + 0x2164,0x2165,0x2166,0x2167,0x2168,0x2169,0x0000,0x3349,/* 0x58-0x5F */ + 0x3314,0x3322,0x334D,0x3318,0x3327,0x3303,0x3336,0x3351,/* 0x60-0x67 */ + 0x3357,0x330D,0x3326,0x3323,0x332B,0x334A,0x333B,0x339C,/* 0x68-0x6F */ + 0x339D,0x339E,0x338E,0x338F,0x33C4,0x33A1,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x337B,0x0000,/* 0x78-0x7F */ + + 0x301D,0x301F,0x2116,0x33CD,0x2121,0x32A4,0x32A5,0x32A6,/* 0x80-0x87 */ + 0x32A7,0x32A8,0x3231,0x3232,0x3239,0x337E,0x337D,0x337C,/* 0x88-0x8F */ + 0x2252,0x2261,0x222B,0x222E,0x2211,0x221A,0x22A5,0x2220,/* 0x90-0x97 */ + 0x221F,0x22BF,0x2235,0x2229,0x222A,0x0000,0x0000,0x0000,/* 0x98-0x9F */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x4E9C,/* 0x98-0x9F */ + 0x5516,0x5A03,0x963F,0x54C0,0x611B,0x6328,0x59F6,0x9022,/* 0xA0-0xA7 */ + 0x8475,0x831C,0x7A50,0x60AA,0x63E1,0x6E25,0x65ED,0x8466,/* 0xA8-0xAF */ + 0x82A6,0x9BF5,0x6893,0x5727,0x65A1,0x6271,0x5B9B,0x59D0,/* 0xB0-0xB7 */ + 0x867B,0x98F4,0x7D62,0x7DBE,0x9B8E,0x6216,0x7C9F,0x88B7,/* 0xB8-0xBF */ + 0x5B89,0x5EB5,0x6309,0x6697,0x6848,0x95C7,0x978D,0x674F,/* 0xC0-0xC7 */ + 0x4EE5,0x4F0A,0x4F4D,0x4F9D,0x5049,0x56F2,0x5937,0x59D4,/* 0xC8-0xCF */ + 0x5A01,0x5C09,0x60DF,0x610F,0x6170,0x6613,0x6905,0x70BA,/* 0xD0-0xD7 */ + 0x754F,0x7570,0x79FB,0x7DAD,0x7DEF,0x80C3,0x840E,0x8863,/* 0xD8-0xDF */ + 0x8B02,0x9055,0x907A,0x533B,0x4E95,0x4EA5,0x57DF,0x80B2,/* 0xE0-0xE7 */ + 0x90C1,0x78EF,0x4E00,0x58F1,0x6EA2,0x9038,0x7A32,0x8328,/* 0xE8-0xEF */ + 0x828B,0x9C2F,0x5141,0x5370,0x54BD,0x54E1,0x56E0,0x59FB,/* 0xF0-0xF7 */ + 0x5F15,0x98F2,0x6DEB,0x80E4,0x852D,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9662,0x9670,0x96A0,0x97FB,0x540B,0x53F3,0x5B87,0x70CF,/* 0x40-0x47 */ + 0x7FBD,0x8FC2,0x96E8,0x536F,0x9D5C,0x7ABA,0x4E11,0x7893,/* 0x48-0x4F */ + 0x81FC,0x6E26,0x5618,0x5504,0x6B1D,0x851A,0x9C3B,0x59E5,/* 0x50-0x57 */ + 0x53A9,0x6D66,0x74DC,0x958F,0x5642,0x4E91,0x904B,0x96F2,/* 0x58-0x5F */ + 0x834F,0x990C,0x53E1,0x55B6,0x5B30,0x5F71,0x6620,0x66F3,/* 0x60-0x67 */ + 0x6804,0x6C38,0x6CF3,0x6D29,0x745B,0x76C8,0x7A4E,0x9834,/* 0x68-0x6F */ + 0x82F1,0x885B,0x8A60,0x92ED,0x6DB2,0x75AB,0x76CA,0x99C5,/* 0x70-0x77 */ + 0x60A6,0x8B01,0x8D8A,0x95B2,0x698E,0x53AD,0x5186,0x0000,/* 0x78-0x7F */ + + 0x5712,0x5830,0x5944,0x5BB4,0x5EF6,0x6028,0x63A9,0x63F4,/* 0x80-0x87 */ + 0x6CBF,0x6F14,0x708E,0x7114,0x7159,0x71D5,0x733F,0x7E01,/* 0x88-0x8F */ + 0x8276,0x82D1,0x8597,0x9060,0x925B,0x9D1B,0x5869,0x65BC,/* 0x90-0x97 */ + 0x6C5A,0x7525,0x51F9,0x592E,0x5965,0x5F80,0x5FDC,0x62BC,/* 0x98-0x9F */ + 0x65FA,0x6A2A,0x6B27,0x6BB4,0x738B,0x7FC1,0x8956,0x9D2C,/* 0xA0-0xA7 */ + 0x9D0E,0x9EC4,0x5CA1,0x6C96,0x837B,0x5104,0x5C4B,0x61B6,/* 0xA8-0xAF */ + 0x81C6,0x6876,0x7261,0x4E59,0x4FFA,0x5378,0x6069,0x6E29,/* 0xB0-0xB7 */ + 0x7A4F,0x97F3,0x4E0B,0x5316,0x4EEE,0x4F55,0x4F3D,0x4FA1,/* 0xB8-0xBF */ + 0x4F73,0x52A0,0x53EF,0x5609,0x590F,0x5AC1,0x5BB6,0x5BE1,/* 0xC0-0xC7 */ + 0x79D1,0x6687,0x679C,0x67B6,0x6B4C,0x6CB3,0x706B,0x73C2,/* 0xC8-0xCF */ + 0x798D,0x79BE,0x7A3C,0x7B87,0x82B1,0x82DB,0x8304,0x8377,/* 0xD0-0xD7 */ + 0x83EF,0x83D3,0x8766,0x8AB2,0x5629,0x8CA8,0x8FE6,0x904E,/* 0xD8-0xDF */ + 0x971E,0x868A,0x4FC4,0x5CE8,0x6211,0x7259,0x753B,0x81E5,/* 0xE0-0xE7 */ + 0x82BD,0x86FE,0x8CC0,0x96C5,0x9913,0x99D5,0x4ECB,0x4F1A,/* 0xE8-0xEF */ + 0x89E3,0x56DE,0x584A,0x58CA,0x5EFB,0x5FEB,0x602A,0x6094,/* 0xF0-0xF7 */ + 0x6062,0x61D0,0x6212,0x62D0,0x6539,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9B41,0x6666,0x68B0,0x6D77,0x7070,0x754C,0x7686,0x7D75,/* 0x40-0x47 */ + 0x82A5,0x87F9,0x958B,0x968E,0x8C9D,0x51F1,0x52BE,0x5916,/* 0x48-0x4F */ + 0x54B3,0x5BB3,0x5D16,0x6168,0x6982,0x6DAF,0x788D,0x84CB,/* 0x50-0x57 */ + 0x8857,0x8A72,0x93A7,0x9AB8,0x6D6C,0x99A8,0x86D9,0x57A3,/* 0x58-0x5F */ + 0x67FF,0x86CE,0x920E,0x5283,0x5687,0x5404,0x5ED3,0x62E1,/* 0x60-0x67 */ + 0x64B9,0x683C,0x6838,0x6BBB,0x7372,0x78BA,0x7A6B,0x899A,/* 0x68-0x6F */ + 0x89D2,0x8D6B,0x8F03,0x90ED,0x95A3,0x9694,0x9769,0x5B66,/* 0x70-0x77 */ + 0x5CB3,0x697D,0x984D,0x984E,0x639B,0x7B20,0x6A2B,0x0000,/* 0x78-0x7F */ + + 0x6A7F,0x68B6,0x9C0D,0x6F5F,0x5272,0x559D,0x6070,0x62EC,/* 0x80-0x87 */ + 0x6D3B,0x6E07,0x6ED1,0x845B,0x8910,0x8F44,0x4E14,0x9C39,/* 0x88-0x8F */ + 0x53F6,0x691B,0x6A3A,0x9784,0x682A,0x515C,0x7AC3,0x84B2,/* 0x90-0x97 */ + 0x91DC,0x938C,0x565B,0x9D28,0x6822,0x8305,0x8431,0x7CA5,/* 0x98-0x9F */ + 0x5208,0x82C5,0x74E6,0x4E7E,0x4F83,0x51A0,0x5BD2,0x520A,/* 0xA0-0xA7 */ + 0x52D8,0x52E7,0x5DFB,0x559A,0x582A,0x59E6,0x5B8C,0x5B98,/* 0xA8-0xAF */ + 0x5BDB,0x5E72,0x5E79,0x60A3,0x611F,0x6163,0x61BE,0x63DB,/* 0xB0-0xB7 */ + 0x6562,0x67D1,0x6853,0x68FA,0x6B3E,0x6B53,0x6C57,0x6F22,/* 0xB8-0xBF */ + 0x6F97,0x6F45,0x74B0,0x7518,0x76E3,0x770B,0x7AFF,0x7BA1,/* 0xC0-0xC7 */ + 0x7C21,0x7DE9,0x7F36,0x7FF0,0x809D,0x8266,0x839E,0x89B3,/* 0xC8-0xCF */ + 0x8ACC,0x8CAB,0x9084,0x9451,0x9593,0x9591,0x95A2,0x9665,/* 0xD0-0xD7 */ + 0x97D3,0x9928,0x8218,0x4E38,0x542B,0x5CB8,0x5DCC,0x73A9,/* 0xD8-0xDF */ + 0x764C,0x773C,0x5CA9,0x7FEB,0x8D0B,0x96C1,0x9811,0x9854,/* 0xE0-0xE7 */ + 0x9858,0x4F01,0x4F0E,0x5371,0x559C,0x5668,0x57FA,0x5947,/* 0xE8-0xEF */ + 0x5B09,0x5BC4,0x5C90,0x5E0C,0x5E7E,0x5FCC,0x63EE,0x673A,/* 0xF0-0xF7 */ + 0x65D7,0x65E2,0x671F,0x68CB,0x68C4,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A5F,0x5E30,0x6BC5,0x6C17,0x6C7D,0x757F,0x7948,0x5B63,/* 0x40-0x47 */ + 0x7A00,0x7D00,0x5FBD,0x898F,0x8A18,0x8CB4,0x8D77,0x8ECC,/* 0x48-0x4F */ + 0x8F1D,0x98E2,0x9A0E,0x9B3C,0x4E80,0x507D,0x5100,0x5993,/* 0x50-0x57 */ + 0x5B9C,0x622F,0x6280,0x64EC,0x6B3A,0x72A0,0x7591,0x7947,/* 0x58-0x5F */ + 0x7FA9,0x87FB,0x8ABC,0x8B70,0x63AC,0x83CA,0x97A0,0x5409,/* 0x60-0x67 */ + 0x5403,0x55AB,0x6854,0x6A58,0x8A70,0x7827,0x6775,0x9ECD,/* 0x68-0x6F */ + 0x5374,0x5BA2,0x811A,0x8650,0x9006,0x4E18,0x4E45,0x4EC7,/* 0x70-0x77 */ + 0x4F11,0x53CA,0x5438,0x5BAE,0x5F13,0x6025,0x6551,0x0000,/* 0x78-0x7F */ + + 0x673D,0x6C42,0x6C72,0x6CE3,0x7078,0x7403,0x7A76,0x7AAE,/* 0x80-0x87 */ + 0x7B08,0x7D1A,0x7CFE,0x7D66,0x65E7,0x725B,0x53BB,0x5C45,/* 0x88-0x8F */ + 0x5DE8,0x62D2,0x62E0,0x6319,0x6E20,0x865A,0x8A31,0x8DDD,/* 0x90-0x97 */ + 0x92F8,0x6F01,0x79A6,0x9B5A,0x4EA8,0x4EAB,0x4EAC,0x4F9B,/* 0x98-0x9F */ + 0x4FA0,0x50D1,0x5147,0x7AF6,0x5171,0x51F6,0x5354,0x5321,/* 0xA0-0xA7 */ + 0x537F,0x53EB,0x55AC,0x5883,0x5CE1,0x5F37,0x5F4A,0x602F,/* 0xA8-0xAF */ + 0x6050,0x606D,0x631F,0x6559,0x6A4B,0x6CC1,0x72C2,0x72ED,/* 0xB0-0xB7 */ + 0x77EF,0x80F8,0x8105,0x8208,0x854E,0x90F7,0x93E1,0x97FF,/* 0xB8-0xBF */ + 0x9957,0x9A5A,0x4EF0,0x51DD,0x5C2D,0x6681,0x696D,0x5C40,/* 0xC0-0xC7 */ + 0x66F2,0x6975,0x7389,0x6850,0x7C81,0x50C5,0x52E4,0x5747,/* 0xC8-0xCF */ + 0x5DFE,0x9326,0x65A4,0x6B23,0x6B3D,0x7434,0x7981,0x79BD,/* 0xD0-0xD7 */ + 0x7B4B,0x7DCA,0x82B9,0x83CC,0x887F,0x895F,0x8B39,0x8FD1,/* 0xD8-0xDF */ + 0x91D1,0x541F,0x9280,0x4E5D,0x5036,0x53E5,0x533A,0x72D7,/* 0xE0-0xE7 */ + 0x7396,0x77E9,0x82E6,0x8EAF,0x99C6,0x99C8,0x99D2,0x5177,/* 0xE8-0xEF */ + 0x611A,0x865E,0x55B0,0x7A7A,0x5076,0x5BD3,0x9047,0x9685,/* 0xF0-0xF7 */ + 0x4E32,0x6ADB,0x91E7,0x5C51,0x5C48,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6398,0x7A9F,0x6C93,0x9774,0x8F61,0x7AAA,0x718A,0x9688,/* 0x40-0x47 */ + 0x7C82,0x6817,0x7E70,0x6851,0x936C,0x52F2,0x541B,0x85AB,/* 0x48-0x4F */ + 0x8A13,0x7FA4,0x8ECD,0x90E1,0x5366,0x8888,0x7941,0x4FC2,/* 0x50-0x57 */ + 0x50BE,0x5211,0x5144,0x5553,0x572D,0x73EA,0x578B,0x5951,/* 0x58-0x5F */ + 0x5F62,0x5F84,0x6075,0x6176,0x6167,0x61A9,0x63B2,0x643A,/* 0x60-0x67 */ + 0x656C,0x666F,0x6842,0x6E13,0x7566,0x7A3D,0x7CFB,0x7D4C,/* 0x68-0x6F */ + 0x7D99,0x7E4B,0x7F6B,0x830E,0x834A,0x86CD,0x8A08,0x8A63,/* 0x70-0x77 */ + 0x8B66,0x8EFD,0x981A,0x9D8F,0x82B8,0x8FCE,0x9BE8,0x0000,/* 0x78-0x7F */ + + 0x5287,0x621F,0x6483,0x6FC0,0x9699,0x6841,0x5091,0x6B20,/* 0x80-0x87 */ + 0x6C7A,0x6F54,0x7A74,0x7D50,0x8840,0x8A23,0x6708,0x4EF6,/* 0x88-0x8F */ + 0x5039,0x5026,0x5065,0x517C,0x5238,0x5263,0x55A7,0x570F,/* 0x90-0x97 */ + 0x5805,0x5ACC,0x5EFA,0x61B2,0x61F8,0x62F3,0x6372,0x691C,/* 0x98-0x9F */ + 0x6A29,0x727D,0x72AC,0x732E,0x7814,0x786F,0x7D79,0x770C,/* 0xA0-0xA7 */ + 0x80A9,0x898B,0x8B19,0x8CE2,0x8ED2,0x9063,0x9375,0x967A,/* 0xA8-0xAF */ + 0x9855,0x9A13,0x9E78,0x5143,0x539F,0x53B3,0x5E7B,0x5F26,/* 0xB0-0xB7 */ + 0x6E1B,0x6E90,0x7384,0x73FE,0x7D43,0x8237,0x8A00,0x8AFA,/* 0xB8-0xBF */ + 0x9650,0x4E4E,0x500B,0x53E4,0x547C,0x56FA,0x59D1,0x5B64,/* 0xC0-0xC7 */ + 0x5DF1,0x5EAB,0x5F27,0x6238,0x6545,0x67AF,0x6E56,0x72D0,/* 0xC8-0xCF */ + 0x7CCA,0x88B4,0x80A1,0x80E1,0x83F0,0x864E,0x8A87,0x8DE8,/* 0xD0-0xD7 */ + 0x9237,0x96C7,0x9867,0x9F13,0x4E94,0x4E92,0x4F0D,0x5348,/* 0xD8-0xDF */ + 0x5449,0x543E,0x5A2F,0x5F8C,0x5FA1,0x609F,0x68A7,0x6A8E,/* 0xE0-0xE7 */ + 0x745A,0x7881,0x8A9E,0x8AA4,0x8B77,0x9190,0x4E5E,0x9BC9,/* 0xE8-0xEF */ + 0x4EA4,0x4F7C,0x4FAF,0x5019,0x5016,0x5149,0x516C,0x529F,/* 0xF0-0xF7 */ + 0x52B9,0x52FE,0x539A,0x53E3,0x5411,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x540E,0x5589,0x5751,0x57A2,0x597D,0x5B54,0x5B5D,0x5B8F,/* 0x40-0x47 */ + 0x5DE5,0x5DE7,0x5DF7,0x5E78,0x5E83,0x5E9A,0x5EB7,0x5F18,/* 0x48-0x4F */ + 0x6052,0x614C,0x6297,0x62D8,0x63A7,0x653B,0x6602,0x6643,/* 0x50-0x57 */ + 0x66F4,0x676D,0x6821,0x6897,0x69CB,0x6C5F,0x6D2A,0x6D69,/* 0x58-0x5F */ + 0x6E2F,0x6E9D,0x7532,0x7687,0x786C,0x7A3F,0x7CE0,0x7D05,/* 0x60-0x67 */ + 0x7D18,0x7D5E,0x7DB1,0x8015,0x8003,0x80AF,0x80B1,0x8154,/* 0x68-0x6F */ + 0x818F,0x822A,0x8352,0x884C,0x8861,0x8B1B,0x8CA2,0x8CFC,/* 0x70-0x77 */ + 0x90CA,0x9175,0x9271,0x783F,0x92FC,0x95A4,0x964D,0x0000,/* 0x78-0x7F */ + + 0x9805,0x9999,0x9AD8,0x9D3B,0x525B,0x52AB,0x53F7,0x5408,/* 0x80-0x87 */ + 0x58D5,0x62F7,0x6FE0,0x8C6A,0x8F5F,0x9EB9,0x514B,0x523B,/* 0x88-0x8F */ + 0x544A,0x56FD,0x7A40,0x9177,0x9D60,0x9ED2,0x7344,0x6F09,/* 0x90-0x97 */ + 0x8170,0x7511,0x5FFD,0x60DA,0x9AA8,0x72DB,0x8FBC,0x6B64,/* 0x98-0x9F */ + 0x9803,0x4ECA,0x56F0,0x5764,0x58BE,0x5A5A,0x6068,0x61C7,/* 0xA0-0xA7 */ + 0x660F,0x6606,0x6839,0x68B1,0x6DF7,0x75D5,0x7D3A,0x826E,/* 0xA8-0xAF */ + 0x9B42,0x4E9B,0x4F50,0x53C9,0x5506,0x5D6F,0x5DE6,0x5DEE,/* 0xB0-0xB7 */ + 0x67FB,0x6C99,0x7473,0x7802,0x8A50,0x9396,0x88DF,0x5750,/* 0xB8-0xBF */ + 0x5EA7,0x632B,0x50B5,0x50AC,0x518D,0x6700,0x54C9,0x585E,/* 0xC0-0xC7 */ + 0x59BB,0x5BB0,0x5F69,0x624D,0x63A1,0x683D,0x6B73,0x6E08,/* 0xC8-0xCF */ + 0x707D,0x91C7,0x7280,0x7815,0x7826,0x796D,0x658E,0x7D30,/* 0xD0-0xD7 */ + 0x83DC,0x88C1,0x8F09,0x969B,0x5264,0x5728,0x6750,0x7F6A,/* 0xD8-0xDF */ + 0x8CA1,0x51B4,0x5742,0x962A,0x583A,0x698A,0x80B4,0x54B2,/* 0xE0-0xE7 */ + 0x5D0E,0x57FC,0x7895,0x9DFA,0x4F5C,0x524A,0x548B,0x643E,/* 0xE8-0xEF */ + 0x6628,0x6714,0x67F5,0x7A84,0x7B56,0x7D22,0x932F,0x685C,/* 0xF0-0xF7 */ + 0x9BAD,0x7B39,0x5319,0x518A,0x5237,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5BDF,0x62F6,0x64AE,0x64E6,0x672D,0x6BBA,0x85A9,0x96D1,/* 0x40-0x47 */ + 0x7690,0x9BD6,0x634C,0x9306,0x9BAB,0x76BF,0x6652,0x4E09,/* 0x48-0x4F */ + 0x5098,0x53C2,0x5C71,0x60E8,0x6492,0x6563,0x685F,0x71E6,/* 0x50-0x57 */ + 0x73CA,0x7523,0x7B97,0x7E82,0x8695,0x8B83,0x8CDB,0x9178,/* 0x58-0x5F */ + 0x9910,0x65AC,0x66AB,0x6B8B,0x4ED5,0x4ED4,0x4F3A,0x4F7F,/* 0x60-0x67 */ + 0x523A,0x53F8,0x53F2,0x55E3,0x56DB,0x58EB,0x59CB,0x59C9,/* 0x68-0x6F */ + 0x59FF,0x5B50,0x5C4D,0x5E02,0x5E2B,0x5FD7,0x601D,0x6307,/* 0x70-0x77 */ + 0x652F,0x5B5C,0x65AF,0x65BD,0x65E8,0x679D,0x6B62,0x0000,/* 0x78-0x7F */ + + 0x6B7B,0x6C0F,0x7345,0x7949,0x79C1,0x7CF8,0x7D19,0x7D2B,/* 0x80-0x87 */ + 0x80A2,0x8102,0x81F3,0x8996,0x8A5E,0x8A69,0x8A66,0x8A8C,/* 0x88-0x8F */ + 0x8AEE,0x8CC7,0x8CDC,0x96CC,0x98FC,0x6B6F,0x4E8B,0x4F3C,/* 0x90-0x97 */ + 0x4F8D,0x5150,0x5B57,0x5BFA,0x6148,0x6301,0x6642,0x6B21,/* 0x98-0x9F */ + 0x6ECB,0x6CBB,0x723E,0x74BD,0x75D4,0x78C1,0x793A,0x800C,/* 0xA0-0xA7 */ + 0x8033,0x81EA,0x8494,0x8F9E,0x6C50,0x9E7F,0x5F0F,0x8B58,/* 0xA8-0xAF */ + 0x9D2B,0x7AFA,0x8EF8,0x5B8D,0x96EB,0x4E03,0x53F1,0x57F7,/* 0xB0-0xB7 */ + 0x5931,0x5AC9,0x5BA4,0x6089,0x6E7F,0x6F06,0x75BE,0x8CEA,/* 0xB8-0xBF */ + 0x5B9F,0x8500,0x7BE0,0x5072,0x67F4,0x829D,0x5C61,0x854A,/* 0xC0-0xC7 */ + 0x7E1E,0x820E,0x5199,0x5C04,0x6368,0x8D66,0x659C,0x716E,/* 0xC8-0xCF */ + 0x793E,0x7D17,0x8005,0x8B1D,0x8ECA,0x906E,0x86C7,0x90AA,/* 0xD0-0xD7 */ + 0x501F,0x52FA,0x5C3A,0x6753,0x707C,0x7235,0x914C,0x91C8,/* 0xD8-0xDF */ + 0x932B,0x82E5,0x5BC2,0x5F31,0x60F9,0x4E3B,0x53D6,0x5B88,/* 0xE0-0xE7 */ + 0x624B,0x6731,0x6B8A,0x72E9,0x73E0,0x7A2E,0x816B,0x8DA3,/* 0xE8-0xEF */ + 0x9152,0x9996,0x5112,0x53D7,0x546A,0x5BFF,0x6388,0x6A39,/* 0xF0-0xF7 */ + 0x7DAC,0x9700,0x56DA,0x53CE,0x5468,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5B97,0x5C31,0x5DDE,0x4FEE,0x6101,0x62FE,0x6D32,0x79C0,/* 0x40-0x47 */ + 0x79CB,0x7D42,0x7E4D,0x7FD2,0x81ED,0x821F,0x8490,0x8846,/* 0x48-0x4F */ + 0x8972,0x8B90,0x8E74,0x8F2F,0x9031,0x914B,0x916C,0x96C6,/* 0x50-0x57 */ + 0x919C,0x4EC0,0x4F4F,0x5145,0x5341,0x5F93,0x620E,0x67D4,/* 0x58-0x5F */ + 0x6C41,0x6E0B,0x7363,0x7E26,0x91CD,0x9283,0x53D4,0x5919,/* 0x60-0x67 */ + 0x5BBF,0x6DD1,0x795D,0x7E2E,0x7C9B,0x587E,0x719F,0x51FA,/* 0x68-0x6F */ + 0x8853,0x8FF0,0x4FCA,0x5CFB,0x6625,0x77AC,0x7AE3,0x821C,/* 0x70-0x77 */ + 0x99FF,0x51C6,0x5FAA,0x65EC,0x696F,0x6B89,0x6DF3,0x0000,/* 0x78-0x7F */ + + 0x6E96,0x6F64,0x76FE,0x7D14,0x5DE1,0x9075,0x9187,0x9806,/* 0x80-0x87 */ + 0x51E6,0x521D,0x6240,0x6691,0x66D9,0x6E1A,0x5EB6,0x7DD2,/* 0x88-0x8F */ + 0x7F72,0x66F8,0x85AF,0x85F7,0x8AF8,0x52A9,0x53D9,0x5973,/* 0x90-0x97 */ + 0x5E8F,0x5F90,0x6055,0x92E4,0x9664,0x50B7,0x511F,0x52DD,/* 0x98-0x9F */ + 0x5320,0x5347,0x53EC,0x54E8,0x5546,0x5531,0x5617,0x5968,/* 0xA0-0xA7 */ + 0x59BE,0x5A3C,0x5BB5,0x5C06,0x5C0F,0x5C11,0x5C1A,0x5E84,/* 0xA8-0xAF */ + 0x5E8A,0x5EE0,0x5F70,0x627F,0x6284,0x62DB,0x638C,0x6377,/* 0xB0-0xB7 */ + 0x6607,0x660C,0x662D,0x6676,0x677E,0x68A2,0x6A1F,0x6A35,/* 0xB8-0xBF */ + 0x6CBC,0x6D88,0x6E09,0x6E58,0x713C,0x7126,0x7167,0x75C7,/* 0xC0-0xC7 */ + 0x7701,0x785D,0x7901,0x7965,0x79F0,0x7AE0,0x7B11,0x7CA7,/* 0xC8-0xCF */ + 0x7D39,0x8096,0x83D6,0x848B,0x8549,0x885D,0x88F3,0x8A1F,/* 0xD0-0xD7 */ + 0x8A3C,0x8A54,0x8A73,0x8C61,0x8CDE,0x91A4,0x9266,0x937E,/* 0xD8-0xDF */ + 0x9418,0x969C,0x9798,0x4E0A,0x4E08,0x4E1E,0x4E57,0x5197,/* 0xE0-0xE7 */ + 0x5270,0x57CE,0x5834,0x58CC,0x5B22,0x5E38,0x60C5,0x64FE,/* 0xE8-0xEF */ + 0x6761,0x6756,0x6D44,0x72B6,0x7573,0x7A63,0x84B8,0x8B72,/* 0xF0-0xF7 */ + 0x91B8,0x9320,0x5631,0x57F4,0x98FE,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x62ED,0x690D,0x6B96,0x71ED,0x7E54,0x8077,0x8272,0x89E6,/* 0x40-0x47 */ + 0x98DF,0x8755,0x8FB1,0x5C3B,0x4F38,0x4FE1,0x4FB5,0x5507,/* 0x48-0x4F */ + 0x5A20,0x5BDD,0x5BE9,0x5FC3,0x614E,0x632F,0x65B0,0x664B,/* 0x50-0x57 */ + 0x68EE,0x699B,0x6D78,0x6DF1,0x7533,0x75B9,0x771F,0x795E,/* 0x58-0x5F */ + 0x79E6,0x7D33,0x81E3,0x82AF,0x85AA,0x89AA,0x8A3A,0x8EAB,/* 0x60-0x67 */ + 0x8F9B,0x9032,0x91DD,0x9707,0x4EBA,0x4EC1,0x5203,0x5875,/* 0x68-0x6F */ + 0x58EC,0x5C0B,0x751A,0x5C3D,0x814E,0x8A0A,0x8FC5,0x9663,/* 0x70-0x77 */ + 0x976D,0x7B25,0x8ACF,0x9808,0x9162,0x56F3,0x53A8,0x0000,/* 0x78-0x7F */ + + 0x9017,0x5439,0x5782,0x5E25,0x63A8,0x6C34,0x708A,0x7761,/* 0x80-0x87 */ + 0x7C8B,0x7FE0,0x8870,0x9042,0x9154,0x9310,0x9318,0x968F,/* 0x88-0x8F */ + 0x745E,0x9AC4,0x5D07,0x5D69,0x6570,0x67A2,0x8DA8,0x96DB,/* 0x90-0x97 */ + 0x636E,0x6749,0x6919,0x83C5,0x9817,0x96C0,0x88FE,0x6F84,/* 0x98-0x9F */ + 0x647A,0x5BF8,0x4E16,0x702C,0x755D,0x662F,0x51C4,0x5236,/* 0xA0-0xA7 */ + 0x52E2,0x59D3,0x5F81,0x6027,0x6210,0x653F,0x6574,0x661F,/* 0xA8-0xAF */ + 0x6674,0x68F2,0x6816,0x6B63,0x6E05,0x7272,0x751F,0x76DB,/* 0xB0-0xB7 */ + 0x7CBE,0x8056,0x58F0,0x88FD,0x897F,0x8AA0,0x8A93,0x8ACB,/* 0xB8-0xBF */ + 0x901D,0x9192,0x9752,0x9759,0x6589,0x7A0E,0x8106,0x96BB,/* 0xC0-0xC7 */ + 0x5E2D,0x60DC,0x621A,0x65A5,0x6614,0x6790,0x77F3,0x7A4D,/* 0xC8-0xCF */ + 0x7C4D,0x7E3E,0x810A,0x8CAC,0x8D64,0x8DE1,0x8E5F,0x78A9,/* 0xD0-0xD7 */ + 0x5207,0x62D9,0x63A5,0x6442,0x6298,0x8A2D,0x7A83,0x7BC0,/* 0xD8-0xDF */ + 0x8AAC,0x96EA,0x7D76,0x820C,0x8749,0x4ED9,0x5148,0x5343,/* 0xE0-0xE7 */ + 0x5360,0x5BA3,0x5C02,0x5C16,0x5DDD,0x6226,0x6247,0x64B0,/* 0xE8-0xEF */ + 0x6813,0x6834,0x6CC9,0x6D45,0x6D17,0x67D3,0x6F5C,0x714E,/* 0xF0-0xF7 */ + 0x717D,0x65CB,0x7A7F,0x7BAD,0x7DDA,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E4A,0x7FA8,0x817A,0x821B,0x8239,0x85A6,0x8A6E,0x8CCE,/* 0x40-0x47 */ + 0x8DF5,0x9078,0x9077,0x92AD,0x9291,0x9583,0x9BAE,0x524D,/* 0x48-0x4F */ + 0x5584,0x6F38,0x7136,0x5168,0x7985,0x7E55,0x81B3,0x7CCE,/* 0x50-0x57 */ + 0x564C,0x5851,0x5CA8,0x63AA,0x66FE,0x66FD,0x695A,0x72D9,/* 0x58-0x5F */ + 0x758F,0x758E,0x790E,0x7956,0x79DF,0x7C97,0x7D20,0x7D44,/* 0x60-0x67 */ + 0x8607,0x8A34,0x963B,0x9061,0x9F20,0x50E7,0x5275,0x53CC,/* 0x68-0x6F */ + 0x53E2,0x5009,0x55AA,0x58EE,0x594F,0x723D,0x5B8B,0x5C64,/* 0x70-0x77 */ + 0x531D,0x60E3,0x60F3,0x635C,0x6383,0x633F,0x63BB,0x0000,/* 0x78-0x7F */ + + 0x64CD,0x65E9,0x66F9,0x5DE3,0x69CD,0x69FD,0x6F15,0x71E5,/* 0x80-0x87 */ + 0x4E89,0x75E9,0x76F8,0x7A93,0x7CDF,0x7DCF,0x7D9C,0x8061,/* 0x88-0x8F */ + 0x8349,0x8358,0x846C,0x84BC,0x85FB,0x88C5,0x8D70,0x9001,/* 0x90-0x97 */ + 0x906D,0x9397,0x971C,0x9A12,0x50CF,0x5897,0x618E,0x81D3,/* 0x98-0x9F */ + 0x8535,0x8D08,0x9020,0x4FC3,0x5074,0x5247,0x5373,0x606F,/* 0xA0-0xA7 */ + 0x6349,0x675F,0x6E2C,0x8DB3,0x901F,0x4FD7,0x5C5E,0x8CCA,/* 0xA8-0xAF */ + 0x65CF,0x7D9A,0x5352,0x8896,0x5176,0x63C3,0x5B58,0x5B6B,/* 0xB0-0xB7 */ + 0x5C0A,0x640D,0x6751,0x905C,0x4ED6,0x591A,0x592A,0x6C70,/* 0xB8-0xBF */ + 0x8A51,0x553E,0x5815,0x59A5,0x60F0,0x6253,0x67C1,0x8235,/* 0xC0-0xC7 */ + 0x6955,0x9640,0x99C4,0x9A28,0x4F53,0x5806,0x5BFE,0x8010,/* 0xC8-0xCF */ + 0x5CB1,0x5E2F,0x5F85,0x6020,0x614B,0x6234,0x66FF,0x6CF0,/* 0xD0-0xD7 */ + 0x6EDE,0x80CE,0x817F,0x82D4,0x888B,0x8CB8,0x9000,0x902E,/* 0xD8-0xDF */ + 0x968A,0x9EDB,0x9BDB,0x4EE3,0x53F0,0x5927,0x7B2C,0x918D,/* 0xE0-0xE7 */ + 0x984C,0x9DF9,0x6EDD,0x7027,0x5353,0x5544,0x5B85,0x6258,/* 0xE8-0xEF */ + 0x629E,0x62D3,0x6CA2,0x6FEF,0x7422,0x8A17,0x9438,0x6FC1,/* 0xF0-0xF7 */ + 0x8AFE,0x8338,0x51E7,0x86F8,0x53EA,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x53E9,0x4F46,0x9054,0x8FB0,0x596A,0x8131,0x5DFD,0x7AEA,/* 0x40-0x47 */ + 0x8FBF,0x68DA,0x8C37,0x72F8,0x9C48,0x6A3D,0x8AB0,0x4E39,/* 0x48-0x4F */ + 0x5358,0x5606,0x5766,0x62C5,0x63A2,0x65E6,0x6B4E,0x6DE1,/* 0x50-0x57 */ + 0x6E5B,0x70AD,0x77ED,0x7AEF,0x7BAA,0x7DBB,0x803D,0x80C6,/* 0x58-0x5F */ + 0x86CB,0x8A95,0x935B,0x56E3,0x58C7,0x5F3E,0x65AD,0x6696,/* 0x60-0x67 */ + 0x6A80,0x6BB5,0x7537,0x8AC7,0x5024,0x77E5,0x5730,0x5F1B,/* 0x68-0x6F */ + 0x6065,0x667A,0x6C60,0x75F4,0x7A1A,0x7F6E,0x81F4,0x8718,/* 0x70-0x77 */ + 0x9045,0x99B3,0x7BC9,0x755C,0x7AF9,0x7B51,0x84C4,0x0000,/* 0x78-0x7F */ + + 0x9010,0x79E9,0x7A92,0x8336,0x5AE1,0x7740,0x4E2D,0x4EF2,/* 0x80-0x87 */ + 0x5B99,0x5FE0,0x62BD,0x663C,0x67F1,0x6CE8,0x866B,0x8877,/* 0x88-0x8F */ + 0x8A3B,0x914E,0x92F3,0x99D0,0x6A17,0x7026,0x732A,0x82E7,/* 0x90-0x97 */ + 0x8457,0x8CAF,0x4E01,0x5146,0x51CB,0x558B,0x5BF5,0x5E16,/* 0x98-0x9F */ + 0x5E33,0x5E81,0x5F14,0x5F35,0x5F6B,0x5FB4,0x61F2,0x6311,/* 0xA0-0xA7 */ + 0x66A2,0x671D,0x6F6E,0x7252,0x753A,0x773A,0x8074,0x8139,/* 0xA8-0xAF */ + 0x8178,0x8776,0x8ABF,0x8ADC,0x8D85,0x8DF3,0x929A,0x9577,/* 0xB0-0xB7 */ + 0x9802,0x9CE5,0x52C5,0x6357,0x76F4,0x6715,0x6C88,0x73CD,/* 0xB8-0xBF */ + 0x8CC3,0x93AE,0x9673,0x6D25,0x589C,0x690E,0x69CC,0x8FFD,/* 0xC0-0xC7 */ + 0x939A,0x75DB,0x901A,0x585A,0x6802,0x63B4,0x69FB,0x4F43,/* 0xC8-0xCF */ + 0x6F2C,0x67D8,0x8FBB,0x8526,0x7DB4,0x9354,0x693F,0x6F70,/* 0xD0-0xD7 */ + 0x576A,0x58F7,0x5B2C,0x7D2C,0x722A,0x540A,0x91E3,0x9DB4,/* 0xD8-0xDF */ + 0x4EAD,0x4F4E,0x505C,0x5075,0x5243,0x8C9E,0x5448,0x5824,/* 0xE0-0xE7 */ + 0x5B9A,0x5E1D,0x5E95,0x5EAD,0x5EF7,0x5F1F,0x608C,0x62B5,/* 0xE8-0xEF */ + 0x633A,0x63D0,0x68AF,0x6C40,0x7887,0x798E,0x7A0B,0x7DE0,/* 0xF0-0xF7 */ + 0x8247,0x8A02,0x8AE6,0x8E44,0x9013,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x90B8,0x912D,0x91D8,0x9F0E,0x6CE5,0x6458,0x64E2,0x6575,/* 0x40-0x47 */ + 0x6EF4,0x7684,0x7B1B,0x9069,0x93D1,0x6EBA,0x54F2,0x5FB9,/* 0x48-0x4F */ + 0x64A4,0x8F4D,0x8FED,0x9244,0x5178,0x586B,0x5929,0x5C55,/* 0x50-0x57 */ + 0x5E97,0x6DFB,0x7E8F,0x751C,0x8CBC,0x8EE2,0x985B,0x70B9,/* 0x58-0x5F */ + 0x4F1D,0x6BBF,0x6FB1,0x7530,0x96FB,0x514E,0x5410,0x5835,/* 0x60-0x67 */ + 0x5857,0x59AC,0x5C60,0x5F92,0x6597,0x675C,0x6E21,0x767B,/* 0x68-0x6F */ + 0x83DF,0x8CED,0x9014,0x90FD,0x934D,0x7825,0x783A,0x52AA,/* 0x70-0x77 */ + 0x5EA6,0x571F,0x5974,0x6012,0x5012,0x515A,0x51AC,0x0000,/* 0x78-0x7F */ + + 0x51CD,0x5200,0x5510,0x5854,0x5858,0x5957,0x5B95,0x5CF6,/* 0x80-0x87 */ + 0x5D8B,0x60BC,0x6295,0x642D,0x6771,0x6843,0x68BC,0x68DF,/* 0x88-0x8F */ + 0x76D7,0x6DD8,0x6E6F,0x6D9B,0x706F,0x71C8,0x5F53,0x75D8,/* 0x90-0x97 */ + 0x7977,0x7B49,0x7B54,0x7B52,0x7CD6,0x7D71,0x5230,0x8463,/* 0x98-0x9F */ + 0x8569,0x85E4,0x8A0E,0x8B04,0x8C46,0x8E0F,0x9003,0x900F,/* 0xA0-0xA7 */ + 0x9419,0x9676,0x982D,0x9A30,0x95D8,0x50CD,0x52D5,0x540C,/* 0xA8-0xAF */ + 0x5802,0x5C0E,0x61A7,0x649E,0x6D1E,0x77B3,0x7AE5,0x80F4,/* 0xB0-0xB7 */ + 0x8404,0x9053,0x9285,0x5CE0,0x9D07,0x533F,0x5F97,0x5FB3,/* 0xB8-0xBF */ + 0x6D9C,0x7279,0x7763,0x79BF,0x7BE4,0x6BD2,0x72EC,0x8AAD,/* 0xC0-0xC7 */ + 0x6803,0x6A61,0x51F8,0x7A81,0x6934,0x5C4A,0x9CF6,0x82EB,/* 0xC8-0xCF */ + 0x5BC5,0x9149,0x701E,0x5678,0x5C6F,0x60C7,0x6566,0x6C8C,/* 0xD0-0xD7 */ + 0x8C5A,0x9041,0x9813,0x5451,0x66C7,0x920D,0x5948,0x90A3,/* 0xD8-0xDF */ + 0x5185,0x4E4D,0x51EA,0x8599,0x8B0E,0x7058,0x637A,0x934B,/* 0xE0-0xE7 */ + 0x6962,0x99B4,0x7E04,0x7577,0x5357,0x6960,0x8EDF,0x96E3,/* 0xE8-0xEF */ + 0x6C5D,0x4E8C,0x5C3C,0x5F10,0x8FE9,0x5302,0x8CD1,0x8089,/* 0xF0-0xF7 */ + 0x8679,0x5EFF,0x65E5,0x4E73,0x5165,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5982,0x5C3F,0x97EE,0x4EFB,0x598A,0x5FCD,0x8A8D,0x6FE1,/* 0x40-0x47 */ + 0x79B0,0x7962,0x5BE7,0x8471,0x732B,0x71B1,0x5E74,0x5FF5,/* 0x48-0x4F */ + 0x637B,0x649A,0x71C3,0x7C98,0x4E43,0x5EFC,0x4E4B,0x57DC,/* 0x50-0x57 */ + 0x56A2,0x60A9,0x6FC3,0x7D0D,0x80FD,0x8133,0x81BF,0x8FB2,/* 0x58-0x5F */ + 0x8997,0x86A4,0x5DF4,0x628A,0x64AD,0x8987,0x6777,0x6CE2,/* 0x60-0x67 */ + 0x6D3E,0x7436,0x7834,0x5A46,0x7F75,0x82AD,0x99AC,0x4FF3,/* 0x68-0x6F */ + 0x5EC3,0x62DD,0x6392,0x6557,0x676F,0x76C3,0x724C,0x80CC,/* 0x70-0x77 */ + 0x80BA,0x8F29,0x914D,0x500D,0x57F9,0x5A92,0x6885,0x0000,/* 0x78-0x7F */ + + 0x6973,0x7164,0x72FD,0x8CB7,0x58F2,0x8CE0,0x966A,0x9019,/* 0x80-0x87 */ + 0x877F,0x79E4,0x77E7,0x8429,0x4F2F,0x5265,0x535A,0x62CD,/* 0x88-0x8F */ + 0x67CF,0x6CCA,0x767D,0x7B94,0x7C95,0x8236,0x8584,0x8FEB,/* 0x90-0x97 */ + 0x66DD,0x6F20,0x7206,0x7E1B,0x83AB,0x99C1,0x9EA6,0x51FD,/* 0x98-0x9F */ + 0x7BB1,0x7872,0x7BB8,0x8087,0x7B48,0x6AE8,0x5E61,0x808C,/* 0xA0-0xA7 */ + 0x7551,0x7560,0x516B,0x9262,0x6E8C,0x767A,0x9197,0x9AEA,/* 0xA8-0xAF */ + 0x4F10,0x7F70,0x629C,0x7B4F,0x95A5,0x9CE9,0x567A,0x5859,/* 0xB0-0xB7 */ + 0x86E4,0x96BC,0x4F34,0x5224,0x534A,0x53CD,0x53DB,0x5E06,/* 0xB8-0xBF */ + 0x642C,0x6591,0x677F,0x6C3E,0x6C4E,0x7248,0x72AF,0x73ED,/* 0xC0-0xC7 */ + 0x7554,0x7E41,0x822C,0x85E9,0x8CA9,0x7BC4,0x91C6,0x7169,/* 0xC8-0xCF */ + 0x9812,0x98EF,0x633D,0x6669,0x756A,0x76E4,0x78D0,0x8543,/* 0xD0-0xD7 */ + 0x86EE,0x532A,0x5351,0x5426,0x5983,0x5E87,0x5F7C,0x60B2,/* 0xD8-0xDF */ + 0x6249,0x6279,0x62AB,0x6590,0x6BD4,0x6CCC,0x75B2,0x76AE,/* 0xE0-0xE7 */ + 0x7891,0x79D8,0x7DCB,0x7F77,0x80A5,0x88AB,0x8AB9,0x8CBB,/* 0xE8-0xEF */ + 0x907F,0x975E,0x98DB,0x6A0B,0x7C38,0x5099,0x5C3E,0x5FAE,/* 0xF0-0xF7 */ + 0x6787,0x6BD8,0x7435,0x7709,0x7F8E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9F3B,0x67CA,0x7A17,0x5339,0x758B,0x9AED,0x5F66,0x819D,/* 0x40-0x47 */ + 0x83F1,0x8098,0x5F3C,0x5FC5,0x7562,0x7B46,0x903C,0x6867,/* 0x48-0x4F */ + 0x59EB,0x5A9B,0x7D10,0x767E,0x8B2C,0x4FF5,0x5F6A,0x6A19,/* 0x50-0x57 */ + 0x6C37,0x6F02,0x74E2,0x7968,0x8868,0x8A55,0x8C79,0x5EDF,/* 0x58-0x5F */ + 0x63CF,0x75C5,0x79D2,0x82D7,0x9328,0x92F2,0x849C,0x86ED,/* 0x60-0x67 */ + 0x9C2D,0x54C1,0x5F6C,0x658C,0x6D5C,0x7015,0x8CA7,0x8CD3,/* 0x68-0x6F */ + 0x983B,0x654F,0x74F6,0x4E0D,0x4ED8,0x57E0,0x592B,0x5A66,/* 0x70-0x77 */ + 0x5BCC,0x51A8,0x5E03,0x5E9C,0x6016,0x6276,0x6577,0x0000,/* 0x78-0x7F */ + + 0x65A7,0x666E,0x6D6E,0x7236,0x7B26,0x8150,0x819A,0x8299,/* 0x80-0x87 */ + 0x8B5C,0x8CA0,0x8CE6,0x8D74,0x961C,0x9644,0x4FAE,0x64AB,/* 0x88-0x8F */ + 0x6B66,0x821E,0x8461,0x856A,0x90E8,0x5C01,0x6953,0x98A8,/* 0x90-0x97 */ + 0x847A,0x8557,0x4F0F,0x526F,0x5FA9,0x5E45,0x670D,0x798F,/* 0x98-0x9F */ + 0x8179,0x8907,0x8986,0x6DF5,0x5F17,0x6255,0x6CB8,0x4ECF,/* 0xA0-0xA7 */ + 0x7269,0x9B92,0x5206,0x543B,0x5674,0x58B3,0x61A4,0x626E,/* 0xA8-0xAF */ + 0x711A,0x596E,0x7C89,0x7CDE,0x7D1B,0x96F0,0x6587,0x805E,/* 0xB0-0xB7 */ + 0x4E19,0x4F75,0x5175,0x5840,0x5E63,0x5E73,0x5F0A,0x67C4,/* 0xB8-0xBF */ + 0x4E26,0x853D,0x9589,0x965B,0x7C73,0x9801,0x50FB,0x58C1,/* 0xC0-0xC7 */ + 0x7656,0x78A7,0x5225,0x77A5,0x8511,0x7B86,0x504F,0x5909,/* 0xC8-0xCF */ + 0x7247,0x7BC7,0x7DE8,0x8FBA,0x8FD4,0x904D,0x4FBF,0x52C9,/* 0xD0-0xD7 */ + 0x5A29,0x5F01,0x97AD,0x4FDD,0x8217,0x92EA,0x5703,0x6355,/* 0xD8-0xDF */ + 0x6B69,0x752B,0x88DC,0x8F14,0x7A42,0x52DF,0x5893,0x6155,/* 0xE0-0xE7 */ + 0x620A,0x66AE,0x6BCD,0x7C3F,0x83E9,0x5023,0x4FF8,0x5305,/* 0xE8-0xEF */ + 0x5446,0x5831,0x5949,0x5B9D,0x5CF0,0x5CEF,0x5D29,0x5E96,/* 0xF0-0xF7 */ + 0x62B1,0x6367,0x653E,0x65B9,0x670B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6CD5,0x6CE1,0x70F9,0x7832,0x7E2B,0x80DE,0x82B3,0x840C,/* 0x40-0x47 */ + 0x84EC,0x8702,0x8912,0x8A2A,0x8C4A,0x90A6,0x92D2,0x98FD,/* 0x48-0x4F */ + 0x9CF3,0x9D6C,0x4E4F,0x4EA1,0x508D,0x5256,0x574A,0x59A8,/* 0x50-0x57 */ + 0x5E3D,0x5FD8,0x5FD9,0x623F,0x66B4,0x671B,0x67D0,0x68D2,/* 0x58-0x5F */ + 0x5192,0x7D21,0x80AA,0x81A8,0x8B00,0x8C8C,0x8CBF,0x927E,/* 0x60-0x67 */ + 0x9632,0x5420,0x982C,0x5317,0x50D5,0x535C,0x58A8,0x64B2,/* 0x68-0x6F */ + 0x6734,0x7267,0x7766,0x7A46,0x91E6,0x52C3,0x6CA1,0x6B86,/* 0x70-0x77 */ + 0x5800,0x5E4C,0x5954,0x672C,0x7FFB,0x51E1,0x76C6,0x0000,/* 0x78-0x7F */ + + 0x6469,0x78E8,0x9B54,0x9EBB,0x57CB,0x59B9,0x6627,0x679A,/* 0x80-0x87 */ + 0x6BCE,0x54E9,0x69D9,0x5E55,0x819C,0x6795,0x9BAA,0x67FE,/* 0x88-0x8F */ + 0x9C52,0x685D,0x4EA6,0x4FE3,0x53C8,0x62B9,0x672B,0x6CAB,/* 0x90-0x97 */ + 0x8FC4,0x4FAD,0x7E6D,0x9EBF,0x4E07,0x6162,0x6E80,0x6F2B,/* 0x98-0x9F */ + 0x8513,0x5473,0x672A,0x9B45,0x5DF3,0x7B95,0x5CAC,0x5BC6,/* 0xA0-0xA7 */ + 0x871C,0x6E4A,0x84D1,0x7A14,0x8108,0x5999,0x7C8D,0x6C11,/* 0xA8-0xAF */ + 0x7720,0x52D9,0x5922,0x7121,0x725F,0x77DB,0x9727,0x9D61,/* 0xB0-0xB7 */ + 0x690B,0x5A7F,0x5A18,0x51A5,0x540D,0x547D,0x660E,0x76DF,/* 0xB8-0xBF */ + 0x8FF7,0x9298,0x9CF4,0x59EA,0x725D,0x6EC5,0x514D,0x68C9,/* 0xC0-0xC7 */ + 0x7DBF,0x7DEC,0x9762,0x9EBA,0x6478,0x6A21,0x8302,0x5984,/* 0xC8-0xCF */ + 0x5B5F,0x6BDB,0x731B,0x76F2,0x7DB2,0x8017,0x8499,0x5132,/* 0xD0-0xD7 */ + 0x6728,0x9ED9,0x76EE,0x6762,0x52FF,0x9905,0x5C24,0x623B,/* 0xD8-0xDF */ + 0x7C7E,0x8CB0,0x554F,0x60B6,0x7D0B,0x9580,0x5301,0x4E5F,/* 0xE0-0xE7 */ + 0x51B6,0x591C,0x723A,0x8036,0x91CE,0x5F25,0x77E2,0x5384,/* 0xE8-0xEF */ + 0x5F79,0x7D04,0x85AC,0x8A33,0x8E8D,0x9756,0x67F3,0x85AE,/* 0xF0-0xF7 */ + 0x9453,0x6109,0x6108,0x6CB9,0x7652,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8AED,0x8F38,0x552F,0x4F51,0x512A,0x52C7,0x53CB,0x5BA5,/* 0x40-0x47 */ + 0x5E7D,0x60A0,0x6182,0x63D6,0x6709,0x67DA,0x6E67,0x6D8C,/* 0x48-0x4F */ + 0x7336,0x7337,0x7531,0x7950,0x88D5,0x8A98,0x904A,0x9091,/* 0x50-0x57 */ + 0x90F5,0x96C4,0x878D,0x5915,0x4E88,0x4F59,0x4E0E,0x8A89,/* 0x58-0x5F */ + 0x8F3F,0x9810,0x50AD,0x5E7C,0x5996,0x5BB9,0x5EB8,0x63DA,/* 0x60-0x67 */ + 0x63FA,0x64C1,0x66DC,0x694A,0x69D8,0x6D0B,0x6EB6,0x7194,/* 0x68-0x6F */ + 0x7528,0x7AAF,0x7F8A,0x8000,0x8449,0x84C9,0x8981,0x8B21,/* 0x70-0x77 */ + 0x8E0A,0x9065,0x967D,0x990A,0x617E,0x6291,0x6B32,0x0000,/* 0x78-0x7F */ + + 0x6C83,0x6D74,0x7FCC,0x7FFC,0x6DC0,0x7F85,0x87BA,0x88F8,/* 0x80-0x87 */ + 0x6765,0x83B1,0x983C,0x96F7,0x6D1B,0x7D61,0x843D,0x916A,/* 0x88-0x8F */ + 0x4E71,0x5375,0x5D50,0x6B04,0x6FEB,0x85CD,0x862D,0x89A7,/* 0x90-0x97 */ + 0x5229,0x540F,0x5C65,0x674E,0x68A8,0x7406,0x7483,0x75E2,/* 0x98-0x9F */ + 0x88CF,0x88E1,0x91CC,0x96E2,0x9678,0x5F8B,0x7387,0x7ACB,/* 0xA0-0xA7 */ + 0x844E,0x63A0,0x7565,0x5289,0x6D41,0x6E9C,0x7409,0x7559,/* 0xA8-0xAF */ + 0x786B,0x7C92,0x9686,0x7ADC,0x9F8D,0x4FB6,0x616E,0x65C5,/* 0xB0-0xB7 */ + 0x865C,0x4E86,0x4EAE,0x50DA,0x4E21,0x51CC,0x5BEE,0x6599,/* 0xB8-0xBF */ + 0x6881,0x6DBC,0x731F,0x7642,0x77AD,0x7A1C,0x7CE7,0x826F,/* 0xC0-0xC7 */ + 0x8AD2,0x907C,0x91CF,0x9675,0x9818,0x529B,0x7DD1,0x502B,/* 0xC8-0xCF */ + 0x5398,0x6797,0x6DCB,0x71D0,0x7433,0x81E8,0x8F2A,0x96A3,/* 0xD0-0xD7 */ + 0x9C57,0x9E9F,0x7460,0x5841,0x6D99,0x7D2F,0x985E,0x4EE4,/* 0xD8-0xDF */ + 0x4F36,0x4F8B,0x51B7,0x52B1,0x5DBA,0x601C,0x73B2,0x793C,/* 0xE0-0xE7 */ + 0x82D3,0x9234,0x96B7,0x96F6,0x970A,0x9E97,0x9F62,0x66A6,/* 0xE8-0xEF */ + 0x6B74,0x5217,0x52A3,0x70C8,0x88C2,0x5EC9,0x604B,0x6190,/* 0xF0-0xF7 */ + 0x6F23,0x7149,0x7C3E,0x7DF4,0x806F,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x84EE,0x9023,0x932C,0x5442,0x9B6F,0x6AD3,0x7089,0x8CC2,/* 0x40-0x47 */ + 0x8DEF,0x9732,0x52B4,0x5A41,0x5ECA,0x5F04,0x6717,0x697C,/* 0x48-0x4F */ + 0x6994,0x6D6A,0x6F0F,0x7262,0x72FC,0x7BED,0x8001,0x807E,/* 0x50-0x57 */ + 0x874B,0x90CE,0x516D,0x9E93,0x7984,0x808B,0x9332,0x8AD6,/* 0x58-0x5F */ + 0x502D,0x548C,0x8A71,0x6B6A,0x8CC4,0x8107,0x60D1,0x67A0,/* 0x60-0x67 */ + 0x9DF2,0x4E99,0x4E98,0x9C10,0x8A6B,0x85C1,0x8568,0x6900,/* 0x68-0x6F */ + 0x6E7E,0x7897,0x8155,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x5F0C,/* 0x98-0x9F */ + 0x4E10,0x4E15,0x4E2A,0x4E31,0x4E36,0x4E3C,0x4E3F,0x4E42,/* 0xA0-0xA7 */ + 0x4E56,0x4E58,0x4E82,0x4E85,0x8C6B,0x4E8A,0x8212,0x5F0D,/* 0xA8-0xAF */ + 0x4E8E,0x4E9E,0x4E9F,0x4EA0,0x4EA2,0x4EB0,0x4EB3,0x4EB6,/* 0xB0-0xB7 */ + 0x4ECE,0x4ECD,0x4EC4,0x4EC6,0x4EC2,0x4ED7,0x4EDE,0x4EED,/* 0xB8-0xBF */ + 0x4EDF,0x4EF7,0x4F09,0x4F5A,0x4F30,0x4F5B,0x4F5D,0x4F57,/* 0xC0-0xC7 */ + 0x4F47,0x4F76,0x4F88,0x4F8F,0x4F98,0x4F7B,0x4F69,0x4F70,/* 0xC8-0xCF */ + 0x4F91,0x4F6F,0x4F86,0x4F96,0x5118,0x4FD4,0x4FDF,0x4FCE,/* 0xD0-0xD7 */ + 0x4FD8,0x4FDB,0x4FD1,0x4FDA,0x4FD0,0x4FE4,0x4FE5,0x501A,/* 0xD8-0xDF */ + 0x5028,0x5014,0x502A,0x5025,0x5005,0x4F1C,0x4FF6,0x5021,/* 0xE0-0xE7 */ + 0x5029,0x502C,0x4FFE,0x4FEF,0x5011,0x5006,0x5043,0x5047,/* 0xE8-0xEF */ + 0x6703,0x5055,0x5050,0x5048,0x505A,0x5056,0x506C,0x5078,/* 0xF0-0xF7 */ + 0x5080,0x509A,0x5085,0x50B4,0x50B2,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x50C9,0x50CA,0x50B3,0x50C2,0x50D6,0x50DE,0x50E5,0x50ED,/* 0x40-0x47 */ + 0x50E3,0x50EE,0x50F9,0x50F5,0x5109,0x5101,0x5102,0x5116,/* 0x48-0x4F */ + 0x5115,0x5114,0x511A,0x5121,0x513A,0x5137,0x513C,0x513B,/* 0x50-0x57 */ + 0x513F,0x5140,0x5152,0x514C,0x5154,0x5162,0x7AF8,0x5169,/* 0x58-0x5F */ + 0x516A,0x516E,0x5180,0x5182,0x56D8,0x518C,0x5189,0x518F,/* 0x60-0x67 */ + 0x5191,0x5193,0x5195,0x5196,0x51A4,0x51A6,0x51A2,0x51A9,/* 0x68-0x6F */ + 0x51AA,0x51AB,0x51B3,0x51B1,0x51B2,0x51B0,0x51B5,0x51BD,/* 0x70-0x77 */ + 0x51C5,0x51C9,0x51DB,0x51E0,0x8655,0x51E9,0x51ED,0x0000,/* 0x78-0x7F */ + + 0x51F0,0x51F5,0x51FE,0x5204,0x520B,0x5214,0x520E,0x5227,/* 0x80-0x87 */ + 0x522A,0x522E,0x5233,0x5239,0x524F,0x5244,0x524B,0x524C,/* 0x88-0x8F */ + 0x525E,0x5254,0x526A,0x5274,0x5269,0x5273,0x527F,0x527D,/* 0x90-0x97 */ + 0x528D,0x5294,0x5292,0x5271,0x5288,0x5291,0x8FA8,0x8FA7,/* 0x98-0x9F */ + 0x52AC,0x52AD,0x52BC,0x52B5,0x52C1,0x52CD,0x52D7,0x52DE,/* 0xA0-0xA7 */ + 0x52E3,0x52E6,0x98ED,0x52E0,0x52F3,0x52F5,0x52F8,0x52F9,/* 0xA8-0xAF */ + 0x5306,0x5308,0x7538,0x530D,0x5310,0x530F,0x5315,0x531A,/* 0xB0-0xB7 */ + 0x5323,0x532F,0x5331,0x5333,0x5338,0x5340,0x5346,0x5345,/* 0xB8-0xBF */ + 0x4E17,0x5349,0x534D,0x51D6,0x535E,0x5369,0x536E,0x5918,/* 0xC0-0xC7 */ + 0x537B,0x5377,0x5382,0x5396,0x53A0,0x53A6,0x53A5,0x53AE,/* 0xC8-0xCF */ + 0x53B0,0x53B6,0x53C3,0x7C12,0x96D9,0x53DF,0x66FC,0x71EE,/* 0xD0-0xD7 */ + 0x53EE,0x53E8,0x53ED,0x53FA,0x5401,0x543D,0x5440,0x542C,/* 0xD8-0xDF */ + 0x542D,0x543C,0x542E,0x5436,0x5429,0x541D,0x544E,0x548F,/* 0xE0-0xE7 */ + 0x5475,0x548E,0x545F,0x5471,0x5477,0x5470,0x5492,0x547B,/* 0xE8-0xEF */ + 0x5480,0x5476,0x5484,0x5490,0x5486,0x54C7,0x54A2,0x54B8,/* 0xF0-0xF7 */ + 0x54A5,0x54AC,0x54C4,0x54C8,0x54A8,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54AB,0x54C2,0x54A4,0x54BE,0x54BC,0x54D8,0x54E5,0x54E6,/* 0x40-0x47 */ + 0x550F,0x5514,0x54FD,0x54EE,0x54ED,0x54FA,0x54E2,0x5539,/* 0x48-0x4F */ + 0x5540,0x5563,0x554C,0x552E,0x555C,0x5545,0x5556,0x5557,/* 0x50-0x57 */ + 0x5538,0x5533,0x555D,0x5599,0x5580,0x54AF,0x558A,0x559F,/* 0x58-0x5F */ + 0x557B,0x557E,0x5598,0x559E,0x55AE,0x557C,0x5583,0x55A9,/* 0x60-0x67 */ + 0x5587,0x55A8,0x55DA,0x55C5,0x55DF,0x55C4,0x55DC,0x55E4,/* 0x68-0x6F */ + 0x55D4,0x5614,0x55F7,0x5616,0x55FE,0x55FD,0x561B,0x55F9,/* 0x70-0x77 */ + 0x564E,0x5650,0x71DF,0x5634,0x5636,0x5632,0x5638,0x0000,/* 0x78-0x7F */ + + 0x566B,0x5664,0x562F,0x566C,0x566A,0x5686,0x5680,0x568A,/* 0x80-0x87 */ + 0x56A0,0x5694,0x568F,0x56A5,0x56AE,0x56B6,0x56B4,0x56C2,/* 0x88-0x8F */ + 0x56BC,0x56C1,0x56C3,0x56C0,0x56C8,0x56CE,0x56D1,0x56D3,/* 0x90-0x97 */ + 0x56D7,0x56EE,0x56F9,0x5700,0x56FF,0x5704,0x5709,0x5708,/* 0x98-0x9F */ + 0x570B,0x570D,0x5713,0x5718,0x5716,0x55C7,0x571C,0x5726,/* 0xA0-0xA7 */ + 0x5737,0x5738,0x574E,0x573B,0x5740,0x574F,0x5769,0x57C0,/* 0xA8-0xAF */ + 0x5788,0x5761,0x577F,0x5789,0x5793,0x57A0,0x57B3,0x57A4,/* 0xB0-0xB7 */ + 0x57AA,0x57B0,0x57C3,0x57C6,0x57D4,0x57D2,0x57D3,0x580A,/* 0xB8-0xBF */ + 0x57D6,0x57E3,0x580B,0x5819,0x581D,0x5872,0x5821,0x5862,/* 0xC0-0xC7 */ + 0x584B,0x5870,0x6BC0,0x5852,0x583D,0x5879,0x5885,0x58B9,/* 0xC8-0xCF */ + 0x589F,0x58AB,0x58BA,0x58DE,0x58BB,0x58B8,0x58AE,0x58C5,/* 0xD0-0xD7 */ + 0x58D3,0x58D1,0x58D7,0x58D9,0x58D8,0x58E5,0x58DC,0x58E4,/* 0xD8-0xDF */ + 0x58DF,0x58EF,0x58FA,0x58F9,0x58FB,0x58FC,0x58FD,0x5902,/* 0xE0-0xE7 */ + 0x590A,0x5910,0x591B,0x68A6,0x5925,0x592C,0x592D,0x5932,/* 0xE8-0xEF */ + 0x5938,0x593E,0x7AD2,0x5955,0x5950,0x594E,0x595A,0x5958,/* 0xF0-0xF7 */ + 0x5962,0x5960,0x5967,0x596C,0x5969,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5978,0x5981,0x599D,0x4F5E,0x4FAB,0x59A3,0x59B2,0x59C6,/* 0x40-0x47 */ + 0x59E8,0x59DC,0x598D,0x59D9,0x59DA,0x5A25,0x5A1F,0x5A11,/* 0x48-0x4F */ + 0x5A1C,0x5A09,0x5A1A,0x5A40,0x5A6C,0x5A49,0x5A35,0x5A36,/* 0x50-0x57 */ + 0x5A62,0x5A6A,0x5A9A,0x5ABC,0x5ABE,0x5ACB,0x5AC2,0x5ABD,/* 0x58-0x5F */ + 0x5AE3,0x5AD7,0x5AE6,0x5AE9,0x5AD6,0x5AFA,0x5AFB,0x5B0C,/* 0x60-0x67 */ + 0x5B0B,0x5B16,0x5B32,0x5AD0,0x5B2A,0x5B36,0x5B3E,0x5B43,/* 0x68-0x6F */ + 0x5B45,0x5B40,0x5B51,0x5B55,0x5B5A,0x5B5B,0x5B65,0x5B69,/* 0x70-0x77 */ + 0x5B70,0x5B73,0x5B75,0x5B78,0x6588,0x5B7A,0x5B80,0x0000,/* 0x78-0x7F */ + + 0x5B83,0x5BA6,0x5BB8,0x5BC3,0x5BC7,0x5BC9,0x5BD4,0x5BD0,/* 0x80-0x87 */ + 0x5BE4,0x5BE6,0x5BE2,0x5BDE,0x5BE5,0x5BEB,0x5BF0,0x5BF6,/* 0x88-0x8F */ + 0x5BF3,0x5C05,0x5C07,0x5C08,0x5C0D,0x5C13,0x5C20,0x5C22,/* 0x90-0x97 */ + 0x5C28,0x5C38,0x5C39,0x5C41,0x5C46,0x5C4E,0x5C53,0x5C50,/* 0x98-0x9F */ + 0x5C4F,0x5B71,0x5C6C,0x5C6E,0x4E62,0x5C76,0x5C79,0x5C8C,/* 0xA0-0xA7 */ + 0x5C91,0x5C94,0x599B,0x5CAB,0x5CBB,0x5CB6,0x5CBC,0x5CB7,/* 0xA8-0xAF */ + 0x5CC5,0x5CBE,0x5CC7,0x5CD9,0x5CE9,0x5CFD,0x5CFA,0x5CED,/* 0xB0-0xB7 */ + 0x5D8C,0x5CEA,0x5D0B,0x5D15,0x5D17,0x5D5C,0x5D1F,0x5D1B,/* 0xB8-0xBF */ + 0x5D11,0x5D14,0x5D22,0x5D1A,0x5D19,0x5D18,0x5D4C,0x5D52,/* 0xC0-0xC7 */ + 0x5D4E,0x5D4B,0x5D6C,0x5D73,0x5D76,0x5D87,0x5D84,0x5D82,/* 0xC8-0xCF */ + 0x5DA2,0x5D9D,0x5DAC,0x5DAE,0x5DBD,0x5D90,0x5DB7,0x5DBC,/* 0xD0-0xD7 */ + 0x5DC9,0x5DCD,0x5DD3,0x5DD2,0x5DD6,0x5DDB,0x5DEB,0x5DF2,/* 0xD8-0xDF */ + 0x5DF5,0x5E0B,0x5E1A,0x5E19,0x5E11,0x5E1B,0x5E36,0x5E37,/* 0xE0-0xE7 */ + 0x5E44,0x5E43,0x5E40,0x5E4E,0x5E57,0x5E54,0x5E5F,0x5E62,/* 0xE8-0xEF */ + 0x5E64,0x5E47,0x5E75,0x5E76,0x5E7A,0x9EBC,0x5E7F,0x5EA0,/* 0xF0-0xF7 */ + 0x5EC1,0x5EC2,0x5EC8,0x5ED0,0x5ECF,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5ED6,0x5EE3,0x5EDD,0x5EDA,0x5EDB,0x5EE2,0x5EE1,0x5EE8,/* 0x40-0x47 */ + 0x5EE9,0x5EEC,0x5EF1,0x5EF3,0x5EF0,0x5EF4,0x5EF8,0x5EFE,/* 0x48-0x4F */ + 0x5F03,0x5F09,0x5F5D,0x5F5C,0x5F0B,0x5F11,0x5F16,0x5F29,/* 0x50-0x57 */ + 0x5F2D,0x5F38,0x5F41,0x5F48,0x5F4C,0x5F4E,0x5F2F,0x5F51,/* 0x58-0x5F */ + 0x5F56,0x5F57,0x5F59,0x5F61,0x5F6D,0x5F73,0x5F77,0x5F83,/* 0x60-0x67 */ + 0x5F82,0x5F7F,0x5F8A,0x5F88,0x5F91,0x5F87,0x5F9E,0x5F99,/* 0x68-0x6F */ + 0x5F98,0x5FA0,0x5FA8,0x5FAD,0x5FBC,0x5FD6,0x5FFB,0x5FE4,/* 0x70-0x77 */ + 0x5FF8,0x5FF1,0x5FDD,0x60B3,0x5FFF,0x6021,0x6060,0x0000,/* 0x78-0x7F */ + + 0x6019,0x6010,0x6029,0x600E,0x6031,0x601B,0x6015,0x602B,/* 0x80-0x87 */ + 0x6026,0x600F,0x603A,0x605A,0x6041,0x606A,0x6077,0x605F,/* 0x88-0x8F */ + 0x604A,0x6046,0x604D,0x6063,0x6043,0x6064,0x6042,0x606C,/* 0x90-0x97 */ + 0x606B,0x6059,0x6081,0x608D,0x60E7,0x6083,0x609A,0x6084,/* 0x98-0x9F */ + 0x609B,0x6096,0x6097,0x6092,0x60A7,0x608B,0x60E1,0x60B8,/* 0xA0-0xA7 */ + 0x60E0,0x60D3,0x60B4,0x5FF0,0x60BD,0x60C6,0x60B5,0x60D8,/* 0xA8-0xAF */ + 0x614D,0x6115,0x6106,0x60F6,0x60F7,0x6100,0x60F4,0x60FA,/* 0xB0-0xB7 */ + 0x6103,0x6121,0x60FB,0x60F1,0x610D,0x610E,0x6147,0x613E,/* 0xB8-0xBF */ + 0x6128,0x6127,0x614A,0x613F,0x613C,0x612C,0x6134,0x613D,/* 0xC0-0xC7 */ + 0x6142,0x6144,0x6173,0x6177,0x6158,0x6159,0x615A,0x616B,/* 0xC8-0xCF */ + 0x6174,0x616F,0x6165,0x6171,0x615F,0x615D,0x6153,0x6175,/* 0xD0-0xD7 */ + 0x6199,0x6196,0x6187,0x61AC,0x6194,0x619A,0x618A,0x6191,/* 0xD8-0xDF */ + 0x61AB,0x61AE,0x61CC,0x61CA,0x61C9,0x61F7,0x61C8,0x61C3,/* 0xE0-0xE7 */ + 0x61C6,0x61BA,0x61CB,0x7F79,0x61CD,0x61E6,0x61E3,0x61F6,/* 0xE8-0xEF */ + 0x61FA,0x61F4,0x61FF,0x61FD,0x61FC,0x61FE,0x6200,0x6208,/* 0xF0-0xF7 */ + 0x6209,0x620D,0x620C,0x6214,0x621B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x621E,0x6221,0x622A,0x622E,0x6230,0x6232,0x6233,0x6241,/* 0x40-0x47 */ + 0x624E,0x625E,0x6263,0x625B,0x6260,0x6268,0x627C,0x6282,/* 0x48-0x4F */ + 0x6289,0x627E,0x6292,0x6293,0x6296,0x62D4,0x6283,0x6294,/* 0x50-0x57 */ + 0x62D7,0x62D1,0x62BB,0x62CF,0x62FF,0x62C6,0x64D4,0x62C8,/* 0x58-0x5F */ + 0x62DC,0x62CC,0x62CA,0x62C2,0x62C7,0x629B,0x62C9,0x630C,/* 0x60-0x67 */ + 0x62EE,0x62F1,0x6327,0x6302,0x6308,0x62EF,0x62F5,0x6350,/* 0x68-0x6F */ + 0x633E,0x634D,0x641C,0x634F,0x6396,0x638E,0x6380,0x63AB,/* 0x70-0x77 */ + 0x6376,0x63A3,0x638F,0x6389,0x639F,0x63B5,0x636B,0x0000,/* 0x78-0x7F */ + + 0x6369,0x63BE,0x63E9,0x63C0,0x63C6,0x63E3,0x63C9,0x63D2,/* 0x80-0x87 */ + 0x63F6,0x63C4,0x6416,0x6434,0x6406,0x6413,0x6426,0x6436,/* 0x88-0x8F */ + 0x651D,0x6417,0x6428,0x640F,0x6467,0x646F,0x6476,0x644E,/* 0x90-0x97 */ + 0x652A,0x6495,0x6493,0x64A5,0x64A9,0x6488,0x64BC,0x64DA,/* 0x98-0x9F */ + 0x64D2,0x64C5,0x64C7,0x64BB,0x64D8,0x64C2,0x64F1,0x64E7,/* 0xA0-0xA7 */ + 0x8209,0x64E0,0x64E1,0x62AC,0x64E3,0x64EF,0x652C,0x64F6,/* 0xA8-0xAF */ + 0x64F4,0x64F2,0x64FA,0x6500,0x64FD,0x6518,0x651C,0x6505,/* 0xB0-0xB7 */ + 0x6524,0x6523,0x652B,0x6534,0x6535,0x6537,0x6536,0x6538,/* 0xB8-0xBF */ + 0x754B,0x6548,0x6556,0x6555,0x654D,0x6558,0x655E,0x655D,/* 0xC0-0xC7 */ + 0x6572,0x6578,0x6582,0x6583,0x8B8A,0x659B,0x659F,0x65AB,/* 0xC8-0xCF */ + 0x65B7,0x65C3,0x65C6,0x65C1,0x65C4,0x65CC,0x65D2,0x65DB,/* 0xD0-0xD7 */ + 0x65D9,0x65E0,0x65E1,0x65F1,0x6772,0x660A,0x6603,0x65FB,/* 0xD8-0xDF */ + 0x6773,0x6635,0x6636,0x6634,0x661C,0x664F,0x6644,0x6649,/* 0xE0-0xE7 */ + 0x6641,0x665E,0x665D,0x6664,0x6667,0x6668,0x665F,0x6662,/* 0xE8-0xEF */ + 0x6670,0x6683,0x6688,0x668E,0x6689,0x6684,0x6698,0x669D,/* 0xF0-0xF7 */ + 0x66C1,0x66B9,0x66C9,0x66BE,0x66BC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x66C4,0x66B8,0x66D6,0x66DA,0x66E0,0x663F,0x66E6,0x66E9,/* 0x40-0x47 */ + 0x66F0,0x66F5,0x66F7,0x670F,0x6716,0x671E,0x6726,0x6727,/* 0x48-0x4F */ + 0x9738,0x672E,0x673F,0x6736,0x6741,0x6738,0x6737,0x6746,/* 0x50-0x57 */ + 0x675E,0x6760,0x6759,0x6763,0x6764,0x6789,0x6770,0x67A9,/* 0x58-0x5F */ + 0x677C,0x676A,0x678C,0x678B,0x67A6,0x67A1,0x6785,0x67B7,/* 0x60-0x67 */ + 0x67EF,0x67B4,0x67EC,0x67B3,0x67E9,0x67B8,0x67E4,0x67DE,/* 0x68-0x6F */ + 0x67DD,0x67E2,0x67EE,0x67B9,0x67CE,0x67C6,0x67E7,0x6A9C,/* 0x70-0x77 */ + 0x681E,0x6846,0x6829,0x6840,0x684D,0x6832,0x684E,0x0000,/* 0x78-0x7F */ + + 0x68B3,0x682B,0x6859,0x6863,0x6877,0x687F,0x689F,0x688F,/* 0x80-0x87 */ + 0x68AD,0x6894,0x689D,0x689B,0x6883,0x6AAE,0x68B9,0x6874,/* 0x88-0x8F */ + 0x68B5,0x68A0,0x68BA,0x690F,0x688D,0x687E,0x6901,0x68CA,/* 0x90-0x97 */ + 0x6908,0x68D8,0x6922,0x6926,0x68E1,0x690C,0x68CD,0x68D4,/* 0x98-0x9F */ + 0x68E7,0x68D5,0x6936,0x6912,0x6904,0x68D7,0x68E3,0x6925,/* 0xA0-0xA7 */ + 0x68F9,0x68E0,0x68EF,0x6928,0x692A,0x691A,0x6923,0x6921,/* 0xA8-0xAF */ + 0x68C6,0x6979,0x6977,0x695C,0x6978,0x696B,0x6954,0x697E,/* 0xB0-0xB7 */ + 0x696E,0x6939,0x6974,0x693D,0x6959,0x6930,0x6961,0x695E,/* 0xB8-0xBF */ + 0x695D,0x6981,0x696A,0x69B2,0x69AE,0x69D0,0x69BF,0x69C1,/* 0xC0-0xC7 */ + 0x69D3,0x69BE,0x69CE,0x5BE8,0x69CA,0x69DD,0x69BB,0x69C3,/* 0xC8-0xCF */ + 0x69A7,0x6A2E,0x6991,0x69A0,0x699C,0x6995,0x69B4,0x69DE,/* 0xD0-0xD7 */ + 0x69E8,0x6A02,0x6A1B,0x69FF,0x6B0A,0x69F9,0x69F2,0x69E7,/* 0xD8-0xDF */ + 0x6A05,0x69B1,0x6A1E,0x69ED,0x6A14,0x69EB,0x6A0A,0x6A12,/* 0xE0-0xE7 */ + 0x6AC1,0x6A23,0x6A13,0x6A44,0x6A0C,0x6A72,0x6A36,0x6A78,/* 0xE8-0xEF */ + 0x6A47,0x6A62,0x6A59,0x6A66,0x6A48,0x6A38,0x6A22,0x6A90,/* 0xF0-0xF7 */ + 0x6A8D,0x6AA0,0x6A84,0x6AA2,0x6AA3,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A97,0x8617,0x6ABB,0x6AC3,0x6AC2,0x6AB8,0x6AB3,0x6AAC,/* 0x40-0x47 */ + 0x6ADE,0x6AD1,0x6ADF,0x6AAA,0x6ADA,0x6AEA,0x6AFB,0x6B05,/* 0x48-0x4F */ + 0x8616,0x6AFA,0x6B12,0x6B16,0x9B31,0x6B1F,0x6B38,0x6B37,/* 0x50-0x57 */ + 0x76DC,0x6B39,0x98EE,0x6B47,0x6B43,0x6B49,0x6B50,0x6B59,/* 0x58-0x5F */ + 0x6B54,0x6B5B,0x6B5F,0x6B61,0x6B78,0x6B79,0x6B7F,0x6B80,/* 0x60-0x67 */ + 0x6B84,0x6B83,0x6B8D,0x6B98,0x6B95,0x6B9E,0x6BA4,0x6BAA,/* 0x68-0x6F */ + 0x6BAB,0x6BAF,0x6BB2,0x6BB1,0x6BB3,0x6BB7,0x6BBC,0x6BC6,/* 0x70-0x77 */ + 0x6BCB,0x6BD3,0x6BDF,0x6BEC,0x6BEB,0x6BF3,0x6BEF,0x0000,/* 0x78-0x7F */ + + 0x9EBE,0x6C08,0x6C13,0x6C14,0x6C1B,0x6C24,0x6C23,0x6C5E,/* 0x80-0x87 */ + 0x6C55,0x6C62,0x6C6A,0x6C82,0x6C8D,0x6C9A,0x6C81,0x6C9B,/* 0x88-0x8F */ + 0x6C7E,0x6C68,0x6C73,0x6C92,0x6C90,0x6CC4,0x6CF1,0x6CD3,/* 0x90-0x97 */ + 0x6CBD,0x6CD7,0x6CC5,0x6CDD,0x6CAE,0x6CB1,0x6CBE,0x6CBA,/* 0x98-0x9F */ + 0x6CDB,0x6CEF,0x6CD9,0x6CEA,0x6D1F,0x884D,0x6D36,0x6D2B,/* 0xA0-0xA7 */ + 0x6D3D,0x6D38,0x6D19,0x6D35,0x6D33,0x6D12,0x6D0C,0x6D63,/* 0xA8-0xAF */ + 0x6D93,0x6D64,0x6D5A,0x6D79,0x6D59,0x6D8E,0x6D95,0x6FE4,/* 0xB0-0xB7 */ + 0x6D85,0x6DF9,0x6E15,0x6E0A,0x6DB5,0x6DC7,0x6DE6,0x6DB8,/* 0xB8-0xBF */ + 0x6DC6,0x6DEC,0x6DDE,0x6DCC,0x6DE8,0x6DD2,0x6DC5,0x6DFA,/* 0xC0-0xC7 */ + 0x6DD9,0x6DE4,0x6DD5,0x6DEA,0x6DEE,0x6E2D,0x6E6E,0x6E2E,/* 0xC8-0xCF */ + 0x6E19,0x6E72,0x6E5F,0x6E3E,0x6E23,0x6E6B,0x6E2B,0x6E76,/* 0xD0-0xD7 */ + 0x6E4D,0x6E1F,0x6E43,0x6E3A,0x6E4E,0x6E24,0x6EFF,0x6E1D,/* 0xD8-0xDF */ + 0x6E38,0x6E82,0x6EAA,0x6E98,0x6EC9,0x6EB7,0x6ED3,0x6EBD,/* 0xE0-0xE7 */ + 0x6EAF,0x6EC4,0x6EB2,0x6ED4,0x6ED5,0x6E8F,0x6EA5,0x6EC2,/* 0xE8-0xEF */ + 0x6E9F,0x6F41,0x6F11,0x704C,0x6EEC,0x6EF8,0x6EFE,0x6F3F,/* 0xF0-0xF7 */ + 0x6EF2,0x6F31,0x6EEF,0x6F32,0x6ECC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6F3E,0x6F13,0x6EF7,0x6F86,0x6F7A,0x6F78,0x6F81,0x6F80,/* 0x40-0x47 */ + 0x6F6F,0x6F5B,0x6FF3,0x6F6D,0x6F82,0x6F7C,0x6F58,0x6F8E,/* 0x48-0x4F */ + 0x6F91,0x6FC2,0x6F66,0x6FB3,0x6FA3,0x6FA1,0x6FA4,0x6FB9,/* 0x50-0x57 */ + 0x6FC6,0x6FAA,0x6FDF,0x6FD5,0x6FEC,0x6FD4,0x6FD8,0x6FF1,/* 0x58-0x5F */ + 0x6FEE,0x6FDB,0x7009,0x700B,0x6FFA,0x7011,0x7001,0x700F,/* 0x60-0x67 */ + 0x6FFE,0x701B,0x701A,0x6F74,0x701D,0x7018,0x701F,0x7030,/* 0x68-0x6F */ + 0x703E,0x7032,0x7051,0x7063,0x7099,0x7092,0x70AF,0x70F1,/* 0x70-0x77 */ + 0x70AC,0x70B8,0x70B3,0x70AE,0x70DF,0x70CB,0x70DD,0x0000,/* 0x78-0x7F */ + + 0x70D9,0x7109,0x70FD,0x711C,0x7119,0x7165,0x7155,0x7188,/* 0x80-0x87 */ + 0x7166,0x7162,0x714C,0x7156,0x716C,0x718F,0x71FB,0x7184,/* 0x88-0x8F */ + 0x7195,0x71A8,0x71AC,0x71D7,0x71B9,0x71BE,0x71D2,0x71C9,/* 0x90-0x97 */ + 0x71D4,0x71CE,0x71E0,0x71EC,0x71E7,0x71F5,0x71FC,0x71F9,/* 0x98-0x9F */ + 0x71FF,0x720D,0x7210,0x721B,0x7228,0x722D,0x722C,0x7230,/* 0xA0-0xA7 */ + 0x7232,0x723B,0x723C,0x723F,0x7240,0x7246,0x724B,0x7258,/* 0xA8-0xAF */ + 0x7274,0x727E,0x7282,0x7281,0x7287,0x7292,0x7296,0x72A2,/* 0xB0-0xB7 */ + 0x72A7,0x72B9,0x72B2,0x72C3,0x72C6,0x72C4,0x72CE,0x72D2,/* 0xB8-0xBF */ + 0x72E2,0x72E0,0x72E1,0x72F9,0x72F7,0x500F,0x7317,0x730A,/* 0xC0-0xC7 */ + 0x731C,0x7316,0x731D,0x7334,0x732F,0x7329,0x7325,0x733E,/* 0xC8-0xCF */ + 0x734E,0x734F,0x9ED8,0x7357,0x736A,0x7368,0x7370,0x7378,/* 0xD0-0xD7 */ + 0x7375,0x737B,0x737A,0x73C8,0x73B3,0x73CE,0x73BB,0x73C0,/* 0xD8-0xDF */ + 0x73E5,0x73EE,0x73DE,0x74A2,0x7405,0x746F,0x7425,0x73F8,/* 0xE0-0xE7 */ + 0x7432,0x743A,0x7455,0x743F,0x745F,0x7459,0x7441,0x745C,/* 0xE8-0xEF */ + 0x7469,0x7470,0x7463,0x746A,0x7476,0x747E,0x748B,0x749E,/* 0xF0-0xF7 */ + 0x74A7,0x74CA,0x74CF,0x74D4,0x73F1,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74E0,0x74E3,0x74E7,0x74E9,0x74EE,0x74F2,0x74F0,0x74F1,/* 0x40-0x47 */ + 0x74F8,0x74F7,0x7504,0x7503,0x7505,0x750C,0x750E,0x750D,/* 0x48-0x4F */ + 0x7515,0x7513,0x751E,0x7526,0x752C,0x753C,0x7544,0x754D,/* 0x50-0x57 */ + 0x754A,0x7549,0x755B,0x7546,0x755A,0x7569,0x7564,0x7567,/* 0x58-0x5F */ + 0x756B,0x756D,0x7578,0x7576,0x7586,0x7587,0x7574,0x758A,/* 0x60-0x67 */ + 0x7589,0x7582,0x7594,0x759A,0x759D,0x75A5,0x75A3,0x75C2,/* 0x68-0x6F */ + 0x75B3,0x75C3,0x75B5,0x75BD,0x75B8,0x75BC,0x75B1,0x75CD,/* 0x70-0x77 */ + 0x75CA,0x75D2,0x75D9,0x75E3,0x75DE,0x75FE,0x75FF,0x0000,/* 0x78-0x7F */ + + 0x75FC,0x7601,0x75F0,0x75FA,0x75F2,0x75F3,0x760B,0x760D,/* 0x80-0x87 */ + 0x7609,0x761F,0x7627,0x7620,0x7621,0x7622,0x7624,0x7634,/* 0x88-0x8F */ + 0x7630,0x763B,0x7647,0x7648,0x7646,0x765C,0x7658,0x7661,/* 0x90-0x97 */ + 0x7662,0x7668,0x7669,0x766A,0x7667,0x766C,0x7670,0x7672,/* 0x98-0x9F */ + 0x7676,0x7678,0x767C,0x7680,0x7683,0x7688,0x768B,0x768E,/* 0xA0-0xA7 */ + 0x7696,0x7693,0x7699,0x769A,0x76B0,0x76B4,0x76B8,0x76B9,/* 0xA8-0xAF */ + 0x76BA,0x76C2,0x76CD,0x76D6,0x76D2,0x76DE,0x76E1,0x76E5,/* 0xB0-0xB7 */ + 0x76E7,0x76EA,0x862F,0x76FB,0x7708,0x7707,0x7704,0x7729,/* 0xB8-0xBF */ + 0x7724,0x771E,0x7725,0x7726,0x771B,0x7737,0x7738,0x7747,/* 0xC0-0xC7 */ + 0x775A,0x7768,0x776B,0x775B,0x7765,0x777F,0x777E,0x7779,/* 0xC8-0xCF */ + 0x778E,0x778B,0x7791,0x77A0,0x779E,0x77B0,0x77B6,0x77B9,/* 0xD0-0xD7 */ + 0x77BF,0x77BC,0x77BD,0x77BB,0x77C7,0x77CD,0x77D7,0x77DA,/* 0xD8-0xDF */ + 0x77DC,0x77E3,0x77EE,0x77FC,0x780C,0x7812,0x7926,0x7820,/* 0xE0-0xE7 */ + 0x792A,0x7845,0x788E,0x7874,0x7886,0x787C,0x789A,0x788C,/* 0xE8-0xEF */ + 0x78A3,0x78B5,0x78AA,0x78AF,0x78D1,0x78C6,0x78CB,0x78D4,/* 0xF0-0xF7 */ + 0x78BE,0x78BC,0x78C5,0x78CA,0x78EC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x78E7,0x78DA,0x78FD,0x78F4,0x7907,0x7912,0x7911,0x7919,/* 0x40-0x47 */ + 0x792C,0x792B,0x7940,0x7960,0x7957,0x795F,0x795A,0x7955,/* 0x48-0x4F */ + 0x7953,0x797A,0x797F,0x798A,0x799D,0x79A7,0x9F4B,0x79AA,/* 0x50-0x57 */ + 0x79AE,0x79B3,0x79B9,0x79BA,0x79C9,0x79D5,0x79E7,0x79EC,/* 0x58-0x5F */ + 0x79E1,0x79E3,0x7A08,0x7A0D,0x7A18,0x7A19,0x7A20,0x7A1F,/* 0x60-0x67 */ + 0x7980,0x7A31,0x7A3B,0x7A3E,0x7A37,0x7A43,0x7A57,0x7A49,/* 0x68-0x6F */ + 0x7A61,0x7A62,0x7A69,0x9F9D,0x7A70,0x7A79,0x7A7D,0x7A88,/* 0x70-0x77 */ + 0x7A97,0x7A95,0x7A98,0x7A96,0x7AA9,0x7AC8,0x7AB0,0x0000,/* 0x78-0x7F */ + + 0x7AB6,0x7AC5,0x7AC4,0x7ABF,0x9083,0x7AC7,0x7ACA,0x7ACD,/* 0x80-0x87 */ + 0x7ACF,0x7AD5,0x7AD3,0x7AD9,0x7ADA,0x7ADD,0x7AE1,0x7AE2,/* 0x88-0x8F */ + 0x7AE6,0x7AED,0x7AF0,0x7B02,0x7B0F,0x7B0A,0x7B06,0x7B33,/* 0x90-0x97 */ + 0x7B18,0x7B19,0x7B1E,0x7B35,0x7B28,0x7B36,0x7B50,0x7B7A,/* 0x98-0x9F */ + 0x7B04,0x7B4D,0x7B0B,0x7B4C,0x7B45,0x7B75,0x7B65,0x7B74,/* 0xA0-0xA7 */ + 0x7B67,0x7B70,0x7B71,0x7B6C,0x7B6E,0x7B9D,0x7B98,0x7B9F,/* 0xA8-0xAF */ + 0x7B8D,0x7B9C,0x7B9A,0x7B8B,0x7B92,0x7B8F,0x7B5D,0x7B99,/* 0xB0-0xB7 */ + 0x7BCB,0x7BC1,0x7BCC,0x7BCF,0x7BB4,0x7BC6,0x7BDD,0x7BE9,/* 0xB8-0xBF */ + 0x7C11,0x7C14,0x7BE6,0x7BE5,0x7C60,0x7C00,0x7C07,0x7C13,/* 0xC0-0xC7 */ + 0x7BF3,0x7BF7,0x7C17,0x7C0D,0x7BF6,0x7C23,0x7C27,0x7C2A,/* 0xC8-0xCF */ + 0x7C1F,0x7C37,0x7C2B,0x7C3D,0x7C4C,0x7C43,0x7C54,0x7C4F,/* 0xD0-0xD7 */ + 0x7C40,0x7C50,0x7C58,0x7C5F,0x7C64,0x7C56,0x7C65,0x7C6C,/* 0xD8-0xDF */ + 0x7C75,0x7C83,0x7C90,0x7CA4,0x7CAD,0x7CA2,0x7CAB,0x7CA1,/* 0xE0-0xE7 */ + 0x7CA8,0x7CB3,0x7CB2,0x7CB1,0x7CAE,0x7CB9,0x7CBD,0x7CC0,/* 0xE8-0xEF */ + 0x7CC5,0x7CC2,0x7CD8,0x7CD2,0x7CDC,0x7CE2,0x9B3B,0x7CEF,/* 0xF0-0xF7 */ + 0x7CF2,0x7CF4,0x7CF6,0x7CFA,0x7D06,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D02,0x7D1C,0x7D15,0x7D0A,0x7D45,0x7D4B,0x7D2E,0x7D32,/* 0x40-0x47 */ + 0x7D3F,0x7D35,0x7D46,0x7D73,0x7D56,0x7D4E,0x7D72,0x7D68,/* 0x48-0x4F */ + 0x7D6E,0x7D4F,0x7D63,0x7D93,0x7D89,0x7D5B,0x7D8F,0x7D7D,/* 0x50-0x57 */ + 0x7D9B,0x7DBA,0x7DAE,0x7DA3,0x7DB5,0x7DC7,0x7DBD,0x7DAB,/* 0x58-0x5F */ + 0x7E3D,0x7DA2,0x7DAF,0x7DDC,0x7DB8,0x7D9F,0x7DB0,0x7DD8,/* 0x60-0x67 */ + 0x7DDD,0x7DE4,0x7DDE,0x7DFB,0x7DF2,0x7DE1,0x7E05,0x7E0A,/* 0x68-0x6F */ + 0x7E23,0x7E21,0x7E12,0x7E31,0x7E1F,0x7E09,0x7E0B,0x7E22,/* 0x70-0x77 */ + 0x7E46,0x7E66,0x7E3B,0x7E35,0x7E39,0x7E43,0x7E37,0x0000,/* 0x78-0x7F */ + + 0x7E32,0x7E3A,0x7E67,0x7E5D,0x7E56,0x7E5E,0x7E59,0x7E5A,/* 0x80-0x87 */ + 0x7E79,0x7E6A,0x7E69,0x7E7C,0x7E7B,0x7E83,0x7DD5,0x7E7D,/* 0x88-0x8F */ + 0x8FAE,0x7E7F,0x7E88,0x7E89,0x7E8C,0x7E92,0x7E90,0x7E93,/* 0x90-0x97 */ + 0x7E94,0x7E96,0x7E8E,0x7E9B,0x7E9C,0x7F38,0x7F3A,0x7F45,/* 0x98-0x9F */ + 0x7F4C,0x7F4D,0x7F4E,0x7F50,0x7F51,0x7F55,0x7F54,0x7F58,/* 0xA0-0xA7 */ + 0x7F5F,0x7F60,0x7F68,0x7F69,0x7F67,0x7F78,0x7F82,0x7F86,/* 0xA8-0xAF */ + 0x7F83,0x7F88,0x7F87,0x7F8C,0x7F94,0x7F9E,0x7F9D,0x7F9A,/* 0xB0-0xB7 */ + 0x7FA3,0x7FAF,0x7FB2,0x7FB9,0x7FAE,0x7FB6,0x7FB8,0x8B71,/* 0xB8-0xBF */ + 0x7FC5,0x7FC6,0x7FCA,0x7FD5,0x7FD4,0x7FE1,0x7FE6,0x7FE9,/* 0xC0-0xC7 */ + 0x7FF3,0x7FF9,0x98DC,0x8006,0x8004,0x800B,0x8012,0x8018,/* 0xC8-0xCF */ + 0x8019,0x801C,0x8021,0x8028,0x803F,0x803B,0x804A,0x8046,/* 0xD0-0xD7 */ + 0x8052,0x8058,0x805A,0x805F,0x8062,0x8068,0x8073,0x8072,/* 0xD8-0xDF */ + 0x8070,0x8076,0x8079,0x807D,0x807F,0x8084,0x8086,0x8085,/* 0xE0-0xE7 */ + 0x809B,0x8093,0x809A,0x80AD,0x5190,0x80AC,0x80DB,0x80E5,/* 0xE8-0xEF */ + 0x80D9,0x80DD,0x80C4,0x80DA,0x80D6,0x8109,0x80EF,0x80F1,/* 0xF0-0xF7 */ + 0x811B,0x8129,0x8123,0x812F,0x814B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x968B,0x8146,0x813E,0x8153,0x8151,0x80FC,0x8171,0x816E,/* 0x40-0x47 */ + 0x8165,0x8166,0x8174,0x8183,0x8188,0x818A,0x8180,0x8182,/* 0x48-0x4F */ + 0x81A0,0x8195,0x81A4,0x81A3,0x815F,0x8193,0x81A9,0x81B0,/* 0x50-0x57 */ + 0x81B5,0x81BE,0x81B8,0x81BD,0x81C0,0x81C2,0x81BA,0x81C9,/* 0x58-0x5F */ + 0x81CD,0x81D1,0x81D9,0x81D8,0x81C8,0x81DA,0x81DF,0x81E0,/* 0x60-0x67 */ + 0x81E7,0x81FA,0x81FB,0x81FE,0x8201,0x8202,0x8205,0x8207,/* 0x68-0x6F */ + 0x820A,0x820D,0x8210,0x8216,0x8229,0x822B,0x8238,0x8233,/* 0x70-0x77 */ + 0x8240,0x8259,0x8258,0x825D,0x825A,0x825F,0x8264,0x0000,/* 0x78-0x7F */ + + 0x8262,0x8268,0x826A,0x826B,0x822E,0x8271,0x8277,0x8278,/* 0x80-0x87 */ + 0x827E,0x828D,0x8292,0x82AB,0x829F,0x82BB,0x82AC,0x82E1,/* 0x88-0x8F */ + 0x82E3,0x82DF,0x82D2,0x82F4,0x82F3,0x82FA,0x8393,0x8303,/* 0x90-0x97 */ + 0x82FB,0x82F9,0x82DE,0x8306,0x82DC,0x8309,0x82D9,0x8335,/* 0x98-0x9F */ + 0x8334,0x8316,0x8332,0x8331,0x8340,0x8339,0x8350,0x8345,/* 0xA0-0xA7 */ + 0x832F,0x832B,0x8317,0x8318,0x8385,0x839A,0x83AA,0x839F,/* 0xA8-0xAF */ + 0x83A2,0x8396,0x8323,0x838E,0x8387,0x838A,0x837C,0x83B5,/* 0xB0-0xB7 */ + 0x8373,0x8375,0x83A0,0x8389,0x83A8,0x83F4,0x8413,0x83EB,/* 0xB8-0xBF */ + 0x83CE,0x83FD,0x8403,0x83D8,0x840B,0x83C1,0x83F7,0x8407,/* 0xC0-0xC7 */ + 0x83E0,0x83F2,0x840D,0x8422,0x8420,0x83BD,0x8438,0x8506,/* 0xC8-0xCF */ + 0x83FB,0x846D,0x842A,0x843C,0x855A,0x8484,0x8477,0x846B,/* 0xD0-0xD7 */ + 0x84AD,0x846E,0x8482,0x8469,0x8446,0x842C,0x846F,0x8479,/* 0xD8-0xDF */ + 0x8435,0x84CA,0x8462,0x84B9,0x84BF,0x849F,0x84D9,0x84CD,/* 0xE0-0xE7 */ + 0x84BB,0x84DA,0x84D0,0x84C1,0x84C6,0x84D6,0x84A1,0x8521,/* 0xE8-0xEF */ + 0x84FF,0x84F4,0x8517,0x8518,0x852C,0x851F,0x8515,0x8514,/* 0xF0-0xF7 */ + 0x84FC,0x8540,0x8563,0x8558,0x8548,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8541,0x8602,0x854B,0x8555,0x8580,0x85A4,0x8588,0x8591,/* 0x40-0x47 */ + 0x858A,0x85A8,0x856D,0x8594,0x859B,0x85EA,0x8587,0x859C,/* 0x48-0x4F */ + 0x8577,0x857E,0x8590,0x85C9,0x85BA,0x85CF,0x85B9,0x85D0,/* 0x50-0x57 */ + 0x85D5,0x85DD,0x85E5,0x85DC,0x85F9,0x860A,0x8613,0x860B,/* 0x58-0x5F */ + 0x85FE,0x85FA,0x8606,0x8622,0x861A,0x8630,0x863F,0x864D,/* 0x60-0x67 */ + 0x4E55,0x8654,0x865F,0x8667,0x8671,0x8693,0x86A3,0x86A9,/* 0x68-0x6F */ + 0x86AA,0x868B,0x868C,0x86B6,0x86AF,0x86C4,0x86C6,0x86B0,/* 0x70-0x77 */ + 0x86C9,0x8823,0x86AB,0x86D4,0x86DE,0x86E9,0x86EC,0x0000,/* 0x78-0x7F */ + + 0x86DF,0x86DB,0x86EF,0x8712,0x8706,0x8708,0x8700,0x8703,/* 0x80-0x87 */ + 0x86FB,0x8711,0x8709,0x870D,0x86F9,0x870A,0x8734,0x873F,/* 0x88-0x8F */ + 0x8737,0x873B,0x8725,0x8729,0x871A,0x8760,0x875F,0x8778,/* 0x90-0x97 */ + 0x874C,0x874E,0x8774,0x8757,0x8768,0x876E,0x8759,0x8753,/* 0x98-0x9F */ + 0x8763,0x876A,0x8805,0x87A2,0x879F,0x8782,0x87AF,0x87CB,/* 0xA0-0xA7 */ + 0x87BD,0x87C0,0x87D0,0x96D6,0x87AB,0x87C4,0x87B3,0x87C7,/* 0xA8-0xAF */ + 0x87C6,0x87BB,0x87EF,0x87F2,0x87E0,0x880F,0x880D,0x87FE,/* 0xB0-0xB7 */ + 0x87F6,0x87F7,0x880E,0x87D2,0x8811,0x8816,0x8815,0x8822,/* 0xB8-0xBF */ + 0x8821,0x8831,0x8836,0x8839,0x8827,0x883B,0x8844,0x8842,/* 0xC0-0xC7 */ + 0x8852,0x8859,0x885E,0x8862,0x886B,0x8881,0x887E,0x889E,/* 0xC8-0xCF */ + 0x8875,0x887D,0x88B5,0x8872,0x8882,0x8897,0x8892,0x88AE,/* 0xD0-0xD7 */ + 0x8899,0x88A2,0x888D,0x88A4,0x88B0,0x88BF,0x88B1,0x88C3,/* 0xD8-0xDF */ + 0x88C4,0x88D4,0x88D8,0x88D9,0x88DD,0x88F9,0x8902,0x88FC,/* 0xE0-0xE7 */ + 0x88F4,0x88E8,0x88F2,0x8904,0x890C,0x890A,0x8913,0x8943,/* 0xE8-0xEF */ + 0x891E,0x8925,0x892A,0x892B,0x8941,0x8944,0x893B,0x8936,/* 0xF0-0xF7 */ + 0x8938,0x894C,0x891D,0x8960,0x895E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8966,0x8964,0x896D,0x896A,0x896F,0x8974,0x8977,0x897E,/* 0x40-0x47 */ + 0x8983,0x8988,0x898A,0x8993,0x8998,0x89A1,0x89A9,0x89A6,/* 0x48-0x4F */ + 0x89AC,0x89AF,0x89B2,0x89BA,0x89BD,0x89BF,0x89C0,0x89DA,/* 0x50-0x57 */ + 0x89DC,0x89DD,0x89E7,0x89F4,0x89F8,0x8A03,0x8A16,0x8A10,/* 0x58-0x5F */ + 0x8A0C,0x8A1B,0x8A1D,0x8A25,0x8A36,0x8A41,0x8A5B,0x8A52,/* 0x60-0x67 */ + 0x8A46,0x8A48,0x8A7C,0x8A6D,0x8A6C,0x8A62,0x8A85,0x8A82,/* 0x68-0x6F */ + 0x8A84,0x8AA8,0x8AA1,0x8A91,0x8AA5,0x8AA6,0x8A9A,0x8AA3,/* 0x70-0x77 */ + 0x8AC4,0x8ACD,0x8AC2,0x8ADA,0x8AEB,0x8AF3,0x8AE7,0x0000,/* 0x78-0x7F */ + + 0x8AE4,0x8AF1,0x8B14,0x8AE0,0x8AE2,0x8AF7,0x8ADE,0x8ADB,/* 0x80-0x87 */ + 0x8B0C,0x8B07,0x8B1A,0x8AE1,0x8B16,0x8B10,0x8B17,0x8B20,/* 0x88-0x8F */ + 0x8B33,0x97AB,0x8B26,0x8B2B,0x8B3E,0x8B28,0x8B41,0x8B4C,/* 0x90-0x97 */ + 0x8B4F,0x8B4E,0x8B49,0x8B56,0x8B5B,0x8B5A,0x8B6B,0x8B5F,/* 0x98-0x9F */ + 0x8B6C,0x8B6F,0x8B74,0x8B7D,0x8B80,0x8B8C,0x8B8E,0x8B92,/* 0xA0-0xA7 */ + 0x8B93,0x8B96,0x8B99,0x8B9A,0x8C3A,0x8C41,0x8C3F,0x8C48,/* 0xA8-0xAF */ + 0x8C4C,0x8C4E,0x8C50,0x8C55,0x8C62,0x8C6C,0x8C78,0x8C7A,/* 0xB0-0xB7 */ + 0x8C82,0x8C89,0x8C85,0x8C8A,0x8C8D,0x8C8E,0x8C94,0x8C7C,/* 0xB8-0xBF */ + 0x8C98,0x621D,0x8CAD,0x8CAA,0x8CBD,0x8CB2,0x8CB3,0x8CAE,/* 0xC0-0xC7 */ + 0x8CB6,0x8CC8,0x8CC1,0x8CE4,0x8CE3,0x8CDA,0x8CFD,0x8CFA,/* 0xC8-0xCF */ + 0x8CFB,0x8D04,0x8D05,0x8D0A,0x8D07,0x8D0F,0x8D0D,0x8D10,/* 0xD0-0xD7 */ + 0x9F4E,0x8D13,0x8CCD,0x8D14,0x8D16,0x8D67,0x8D6D,0x8D71,/* 0xD8-0xDF */ + 0x8D73,0x8D81,0x8D99,0x8DC2,0x8DBE,0x8DBA,0x8DCF,0x8DDA,/* 0xE0-0xE7 */ + 0x8DD6,0x8DCC,0x8DDB,0x8DCB,0x8DEA,0x8DEB,0x8DDF,0x8DE3,/* 0xE8-0xEF */ + 0x8DFC,0x8E08,0x8E09,0x8DFF,0x8E1D,0x8E1E,0x8E10,0x8E1F,/* 0xF0-0xF7 */ + 0x8E42,0x8E35,0x8E30,0x8E34,0x8E4A,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E47,0x8E49,0x8E4C,0x8E50,0x8E48,0x8E59,0x8E64,0x8E60,/* 0x40-0x47 */ + 0x8E2A,0x8E63,0x8E55,0x8E76,0x8E72,0x8E7C,0x8E81,0x8E87,/* 0x48-0x4F */ + 0x8E85,0x8E84,0x8E8B,0x8E8A,0x8E93,0x8E91,0x8E94,0x8E99,/* 0x50-0x57 */ + 0x8EAA,0x8EA1,0x8EAC,0x8EB0,0x8EC6,0x8EB1,0x8EBE,0x8EC5,/* 0x58-0x5F */ + 0x8EC8,0x8ECB,0x8EDB,0x8EE3,0x8EFC,0x8EFB,0x8EEB,0x8EFE,/* 0x60-0x67 */ + 0x8F0A,0x8F05,0x8F15,0x8F12,0x8F19,0x8F13,0x8F1C,0x8F1F,/* 0x68-0x6F */ + 0x8F1B,0x8F0C,0x8F26,0x8F33,0x8F3B,0x8F39,0x8F45,0x8F42,/* 0x70-0x77 */ + 0x8F3E,0x8F4C,0x8F49,0x8F46,0x8F4E,0x8F57,0x8F5C,0x0000,/* 0x78-0x7F */ + + 0x8F62,0x8F63,0x8F64,0x8F9C,0x8F9F,0x8FA3,0x8FAD,0x8FAF,/* 0x80-0x87 */ + 0x8FB7,0x8FDA,0x8FE5,0x8FE2,0x8FEA,0x8FEF,0x9087,0x8FF4,/* 0x88-0x8F */ + 0x9005,0x8FF9,0x8FFA,0x9011,0x9015,0x9021,0x900D,0x901E,/* 0x90-0x97 */ + 0x9016,0x900B,0x9027,0x9036,0x9035,0x9039,0x8FF8,0x904F,/* 0x98-0x9F */ + 0x9050,0x9051,0x9052,0x900E,0x9049,0x903E,0x9056,0x9058,/* 0xA0-0xA7 */ + 0x905E,0x9068,0x906F,0x9076,0x96A8,0x9072,0x9082,0x907D,/* 0xA8-0xAF */ + 0x9081,0x9080,0x908A,0x9089,0x908F,0x90A8,0x90AF,0x90B1,/* 0xB0-0xB7 */ + 0x90B5,0x90E2,0x90E4,0x6248,0x90DB,0x9102,0x9112,0x9119,/* 0xB8-0xBF */ + 0x9132,0x9130,0x914A,0x9156,0x9158,0x9163,0x9165,0x9169,/* 0xC0-0xC7 */ + 0x9173,0x9172,0x918B,0x9189,0x9182,0x91A2,0x91AB,0x91AF,/* 0xC8-0xCF */ + 0x91AA,0x91B5,0x91B4,0x91BA,0x91C0,0x91C1,0x91C9,0x91CB,/* 0xD0-0xD7 */ + 0x91D0,0x91D6,0x91DF,0x91E1,0x91DB,0x91FC,0x91F5,0x91F6,/* 0xD8-0xDF */ + 0x921E,0x91FF,0x9214,0x922C,0x9215,0x9211,0x925E,0x9257,/* 0xE0-0xE7 */ + 0x9245,0x9249,0x9264,0x9248,0x9295,0x923F,0x924B,0x9250,/* 0xE8-0xEF */ + 0x929C,0x9296,0x9293,0x929B,0x925A,0x92CF,0x92B9,0x92B7,/* 0xF0-0xF7 */ + 0x92E9,0x930F,0x92FA,0x9344,0x932E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9319,0x9322,0x931A,0x9323,0x933A,0x9335,0x933B,0x935C,/* 0x40-0x47 */ + 0x9360,0x937C,0x936E,0x9356,0x93B0,0x93AC,0x93AD,0x9394,/* 0x48-0x4F */ + 0x93B9,0x93D6,0x93D7,0x93E8,0x93E5,0x93D8,0x93C3,0x93DD,/* 0x50-0x57 */ + 0x93D0,0x93C8,0x93E4,0x941A,0x9414,0x9413,0x9403,0x9407,/* 0x58-0x5F */ + 0x9410,0x9436,0x942B,0x9435,0x9421,0x943A,0x9441,0x9452,/* 0x60-0x67 */ + 0x9444,0x945B,0x9460,0x9462,0x945E,0x946A,0x9229,0x9470,/* 0x68-0x6F */ + 0x9475,0x9477,0x947D,0x945A,0x947C,0x947E,0x9481,0x947F,/* 0x70-0x77 */ + 0x9582,0x9587,0x958A,0x9594,0x9596,0x9598,0x9599,0x0000,/* 0x78-0x7F */ + + 0x95A0,0x95A8,0x95A7,0x95AD,0x95BC,0x95BB,0x95B9,0x95BE,/* 0x80-0x87 */ + 0x95CA,0x6FF6,0x95C3,0x95CD,0x95CC,0x95D5,0x95D4,0x95D6,/* 0x88-0x8F */ + 0x95DC,0x95E1,0x95E5,0x95E2,0x9621,0x9628,0x962E,0x962F,/* 0x90-0x97 */ + 0x9642,0x964C,0x964F,0x964B,0x9677,0x965C,0x965E,0x965D,/* 0x98-0x9F */ + 0x965F,0x9666,0x9672,0x966C,0x968D,0x9698,0x9695,0x9697,/* 0xA0-0xA7 */ + 0x96AA,0x96A7,0x96B1,0x96B2,0x96B0,0x96B4,0x96B6,0x96B8,/* 0xA8-0xAF */ + 0x96B9,0x96CE,0x96CB,0x96C9,0x96CD,0x894D,0x96DC,0x970D,/* 0xB0-0xB7 */ + 0x96D5,0x96F9,0x9704,0x9706,0x9708,0x9713,0x970E,0x9711,/* 0xB8-0xBF */ + 0x970F,0x9716,0x9719,0x9724,0x972A,0x9730,0x9739,0x973D,/* 0xC0-0xC7 */ + 0x973E,0x9744,0x9746,0x9748,0x9742,0x9749,0x975C,0x9760,/* 0xC8-0xCF */ + 0x9764,0x9766,0x9768,0x52D2,0x976B,0x9771,0x9779,0x9785,/* 0xD0-0xD7 */ + 0x977C,0x9781,0x977A,0x9786,0x978B,0x978F,0x9790,0x979C,/* 0xD8-0xDF */ + 0x97A8,0x97A6,0x97A3,0x97B3,0x97B4,0x97C3,0x97C6,0x97C8,/* 0xE0-0xE7 */ + 0x97CB,0x97DC,0x97ED,0x9F4F,0x97F2,0x7ADF,0x97F6,0x97F5,/* 0xE8-0xEF */ + 0x980F,0x980C,0x9838,0x9824,0x9821,0x9837,0x983D,0x9846,/* 0xF0-0xF7 */ + 0x984F,0x984B,0x986B,0x986F,0x9870,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9871,0x9874,0x9873,0x98AA,0x98AF,0x98B1,0x98B6,0x98C4,/* 0x40-0x47 */ + 0x98C3,0x98C6,0x98E9,0x98EB,0x9903,0x9909,0x9912,0x9914,/* 0x48-0x4F */ + 0x9918,0x9921,0x991D,0x991E,0x9924,0x9920,0x992C,0x992E,/* 0x50-0x57 */ + 0x993D,0x993E,0x9942,0x9949,0x9945,0x9950,0x994B,0x9951,/* 0x58-0x5F */ + 0x9952,0x994C,0x9955,0x9997,0x9998,0x99A5,0x99AD,0x99AE,/* 0x60-0x67 */ + 0x99BC,0x99DF,0x99DB,0x99DD,0x99D8,0x99D1,0x99ED,0x99EE,/* 0x68-0x6F */ + 0x99F1,0x99F2,0x99FB,0x99F8,0x9A01,0x9A0F,0x9A05,0x99E2,/* 0x70-0x77 */ + 0x9A19,0x9A2B,0x9A37,0x9A45,0x9A42,0x9A40,0x9A43,0x0000,/* 0x78-0x7F */ + + 0x9A3E,0x9A55,0x9A4D,0x9A5B,0x9A57,0x9A5F,0x9A62,0x9A65,/* 0x80-0x87 */ + 0x9A64,0x9A69,0x9A6B,0x9A6A,0x9AAD,0x9AB0,0x9ABC,0x9AC0,/* 0x88-0x8F */ + 0x9ACF,0x9AD1,0x9AD3,0x9AD4,0x9ADE,0x9ADF,0x9AE2,0x9AE3,/* 0x90-0x97 */ + 0x9AE6,0x9AEF,0x9AEB,0x9AEE,0x9AF4,0x9AF1,0x9AF7,0x9AFB,/* 0x98-0x9F */ + 0x9B06,0x9B18,0x9B1A,0x9B1F,0x9B22,0x9B23,0x9B25,0x9B27,/* 0xA0-0xA7 */ + 0x9B28,0x9B29,0x9B2A,0x9B2E,0x9B2F,0x9B32,0x9B44,0x9B43,/* 0xA8-0xAF */ + 0x9B4F,0x9B4D,0x9B4E,0x9B51,0x9B58,0x9B74,0x9B93,0x9B83,/* 0xB0-0xB7 */ + 0x9B91,0x9B96,0x9B97,0x9B9F,0x9BA0,0x9BA8,0x9BB4,0x9BC0,/* 0xB8-0xBF */ + 0x9BCA,0x9BB9,0x9BC6,0x9BCF,0x9BD1,0x9BD2,0x9BE3,0x9BE2,/* 0xC0-0xC7 */ + 0x9BE4,0x9BD4,0x9BE1,0x9C3A,0x9BF2,0x9BF1,0x9BF0,0x9C15,/* 0xC8-0xCF */ + 0x9C14,0x9C09,0x9C13,0x9C0C,0x9C06,0x9C08,0x9C12,0x9C0A,/* 0xD0-0xD7 */ + 0x9C04,0x9C2E,0x9C1B,0x9C25,0x9C24,0x9C21,0x9C30,0x9C47,/* 0xD8-0xDF */ + 0x9C32,0x9C46,0x9C3E,0x9C5A,0x9C60,0x9C67,0x9C76,0x9C78,/* 0xE0-0xE7 */ + 0x9CE7,0x9CEC,0x9CF0,0x9D09,0x9D08,0x9CEB,0x9D03,0x9D06,/* 0xE8-0xEF */ + 0x9D2A,0x9D26,0x9DAF,0x9D23,0x9D1F,0x9D44,0x9D15,0x9D12,/* 0xF0-0xF7 */ + 0x9D41,0x9D3F,0x9D3E,0x9D46,0x9D48,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9D5D,0x9D5E,0x9D64,0x9D51,0x9D50,0x9D59,0x9D72,0x9D89,/* 0x40-0x47 */ + 0x9D87,0x9DAB,0x9D6F,0x9D7A,0x9D9A,0x9DA4,0x9DA9,0x9DB2,/* 0x48-0x4F */ + 0x9DC4,0x9DC1,0x9DBB,0x9DB8,0x9DBA,0x9DC6,0x9DCF,0x9DC2,/* 0x50-0x57 */ + 0x9DD9,0x9DD3,0x9DF8,0x9DE6,0x9DED,0x9DEF,0x9DFD,0x9E1A,/* 0x58-0x5F */ + 0x9E1B,0x9E1E,0x9E75,0x9E79,0x9E7D,0x9E81,0x9E88,0x9E8B,/* 0x60-0x67 */ + 0x9E8C,0x9E92,0x9E95,0x9E91,0x9E9D,0x9EA5,0x9EA9,0x9EB8,/* 0x68-0x6F */ + 0x9EAA,0x9EAD,0x9761,0x9ECC,0x9ECE,0x9ECF,0x9ED0,0x9ED4,/* 0x70-0x77 */ + 0x9EDC,0x9EDE,0x9EDD,0x9EE0,0x9EE5,0x9EE8,0x9EEF,0x0000,/* 0x78-0x7F */ + + 0x9EF4,0x9EF6,0x9EF7,0x9EF9,0x9EFB,0x9EFC,0x9EFD,0x9F07,/* 0x80-0x87 */ + 0x9F08,0x76B7,0x9F15,0x9F21,0x9F2C,0x9F3E,0x9F4A,0x9F52,/* 0x88-0x8F */ + 0x9F54,0x9F63,0x9F5F,0x9F60,0x9F61,0x9F66,0x9F67,0x9F6C,/* 0x90-0x97 */ + 0x9F6A,0x9F77,0x9F72,0x9F76,0x9F95,0x9F9C,0x9FA0,0x582F,/* 0x98-0x9F */ + 0x69C7,0x9059,0x7464,0x51DC,0x7199,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E8A,0x891C,0x9348,0x9288,0x84DC,0x4FC9,0x70BB,0x6631,/* 0x40-0x47 */ + 0x68C8,0x92F9,0x66FB,0x5F45,0x4E28,0x4EE1,0x4EFC,0x4F00,/* 0x48-0x4F */ + 0x4F03,0x4F39,0x4F56,0x4F92,0x4F8A,0x4F9A,0x4F94,0x4FCD,/* 0x50-0x57 */ + 0x5040,0x5022,0x4FFF,0x501E,0x5046,0x5070,0x5042,0x5094,/* 0x58-0x5F */ + 0x50F4,0x50D8,0x514A,0x5164,0x519D,0x51BE,0x51EC,0x5215,/* 0x60-0x67 */ + 0x529C,0x52A6,0x52C0,0x52DB,0x5300,0x5307,0x5324,0x5372,/* 0x68-0x6F */ + 0x5393,0x53B2,0x53DD,0xFA0E,0x549C,0x548A,0x54A9,0x54FF,/* 0x70-0x77 */ + 0x5586,0x5759,0x5765,0x57AC,0x57C8,0x57C7,0xFA0F,0x0000,/* 0x78-0x7F */ + + 0xFA10,0x589E,0x58B2,0x590B,0x5953,0x595B,0x595D,0x5963,/* 0x80-0x87 */ + 0x59A4,0x59BA,0x5B56,0x5BC0,0x752F,0x5BD8,0x5BEC,0x5C1E,/* 0x88-0x8F */ + 0x5CA6,0x5CBA,0x5CF5,0x5D27,0x5D53,0xFA11,0x5D42,0x5D6D,/* 0x90-0x97 */ + 0x5DB8,0x5DB9,0x5DD0,0x5F21,0x5F34,0x5F67,0x5FB7,0x5FDE,/* 0x98-0x9F */ + 0x605D,0x6085,0x608A,0x60DE,0x60D5,0x6120,0x60F2,0x6111,/* 0xA0-0xA7 */ + 0x6137,0x6130,0x6198,0x6213,0x62A6,0x63F5,0x6460,0x649D,/* 0xA8-0xAF */ + 0x64CE,0x654E,0x6600,0x6615,0x663B,0x6609,0x662E,0x661E,/* 0xB0-0xB7 */ + 0x6624,0x6665,0x6657,0x6659,0xFA12,0x6673,0x6699,0x66A0,/* 0xB8-0xBF */ + 0x66B2,0x66BF,0x66FA,0x670E,0xF929,0x6766,0x67BB,0x6852,/* 0xC0-0xC7 */ + 0x67C0,0x6801,0x6844,0x68CF,0xFA13,0x6968,0xFA14,0x6998,/* 0xC8-0xCF */ + 0x69E2,0x6A30,0x6A6B,0x6A46,0x6A73,0x6A7E,0x6AE2,0x6AE4,/* 0xD0-0xD7 */ + 0x6BD6,0x6C3F,0x6C5C,0x6C86,0x6C6F,0x6CDA,0x6D04,0x6D87,/* 0xD8-0xDF */ + 0x6D6F,0x6D96,0x6DAC,0x6DCF,0x6DF8,0x6DF2,0x6DFC,0x6E39,/* 0xE0-0xE7 */ + 0x6E5C,0x6E27,0x6E3C,0x6EBF,0x6F88,0x6FB5,0x6FF5,0x7005,/* 0xE8-0xEF */ + 0x7007,0x7028,0x7085,0x70AB,0x710F,0x7104,0x715C,0x7146,/* 0xF0-0xF7 */ + 0x7147,0xFA15,0x71C1,0x71FE,0x72B1,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x72BE,0x7324,0xFA16,0x7377,0x73BD,0x73C9,0x73D6,0x73E3,/* 0x40-0x47 */ + 0x73D2,0x7407,0x73F5,0x7426,0x742A,0x7429,0x742E,0x7462,/* 0x48-0x4F */ + 0x7489,0x749F,0x7501,0x756F,0x7682,0x769C,0x769E,0x769B,/* 0x50-0x57 */ + 0x76A6,0xFA17,0x7746,0x52AF,0x7821,0x784E,0x7864,0x787A,/* 0x58-0x5F */ + 0x7930,0xFA18,0xFA19,0xFA1A,0x7994,0xFA1B,0x799B,0x7AD1,/* 0x60-0x67 */ + 0x7AE7,0xFA1C,0x7AEB,0x7B9E,0xFA1D,0x7D48,0x7D5C,0x7DB7,/* 0x68-0x6F */ + 0x7DA0,0x7DD6,0x7E52,0x7F47,0x7FA1,0xFA1E,0x8301,0x8362,/* 0x70-0x77 */ + 0x837F,0x83C7,0x83F6,0x8448,0x84B4,0x8553,0x8559,0x0000,/* 0x78-0x7F */ + + 0x856B,0xFA1F,0x85B0,0xFA20,0xFA21,0x8807,0x88F5,0x8A12,/* 0x80-0x87 */ + 0x8A37,0x8A79,0x8AA7,0x8ABE,0x8ADF,0xFA22,0x8AF6,0x8B53,/* 0x88-0x8F */ + 0x8B7F,0x8CF0,0x8CF4,0x8D12,0x8D76,0xFA23,0x8ECF,0xFA24,/* 0x90-0x97 */ + 0xFA25,0x9067,0x90DE,0xFA26,0x9115,0x9127,0x91DA,0x91D7,/* 0x98-0x9F */ + 0x91DE,0x91ED,0x91EE,0x91E4,0x91E5,0x9206,0x9210,0x920A,/* 0xA0-0xA7 */ + 0x923A,0x9240,0x923C,0x924E,0x9259,0x9251,0x9239,0x9267,/* 0xA8-0xAF */ + 0x92A7,0x9277,0x9278,0x92E7,0x92D7,0x92D9,0x92D0,0xFA27,/* 0xB0-0xB7 */ + 0x92D5,0x92E0,0x92D3,0x9325,0x9321,0x92FB,0xFA28,0x931E,/* 0xB8-0xBF */ + 0x92FF,0x931D,0x9302,0x9370,0x9357,0x93A4,0x93C6,0x93DE,/* 0xC0-0xC7 */ + 0x93F8,0x9431,0x9445,0x9448,0x9592,0xF9DC,0xFA29,0x969D,/* 0xC8-0xCF */ + 0x96AF,0x9733,0x973B,0x9743,0x974D,0x974F,0x9751,0x9755,/* 0xD0-0xD7 */ + 0x9857,0x9865,0xFA2A,0xFA2B,0x9927,0xFA2C,0x999E,0x9A4E,/* 0xD8-0xDF */ + 0x9AD9,0x9ADC,0x9B75,0x9B72,0x9B8F,0x9BB1,0x9BBB,0x9C00,/* 0xE0-0xE7 */ + 0x9D70,0x9D6B,0xFA2D,0x9E19,0x9ED1,0x0000,0x0000,0x2170,/* 0xE8-0xEF */ + 0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,0x2177,0x2178,/* 0xF0-0xF7 */ + 0x2179,0xFFE2,0xFFE4,0xFF07,0xFF02,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,0x2177,/* 0x40-0x47 */ + 0x2178,0x2179,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,/* 0x48-0x4F */ + 0x2166,0x2167,0x2168,0x2169,0xFFE2,0xFFE4,0xFF07,0xFF02,/* 0x50-0x57 */ + 0x3231,0x2116,0x2121,0x2235,0x7E8A,0x891C,0x9348,0x9288,/* 0x58-0x5F */ + 0x84DC,0x4FC9,0x70BB,0x6631,0x68C8,0x92F9,0x66FB,0x5F45,/* 0x60-0x67 */ + 0x4E28,0x4EE1,0x4EFC,0x4F00,0x4F03,0x4F39,0x4F56,0x4F92,/* 0x68-0x6F */ + 0x4F8A,0x4F9A,0x4F94,0x4FCD,0x5040,0x5022,0x4FFF,0x501E,/* 0x70-0x77 */ + 0x5046,0x5070,0x5042,0x5094,0x50F4,0x50D8,0x514A,0x0000,/* 0x78-0x7F */ + + 0x5164,0x519D,0x51BE,0x51EC,0x5215,0x529C,0x52A6,0x52C0,/* 0x80-0x87 */ + 0x52DB,0x5300,0x5307,0x5324,0x5372,0x5393,0x53B2,0x53DD,/* 0x88-0x8F */ + 0xFA0E,0x549C,0x548A,0x54A9,0x54FF,0x5586,0x5759,0x5765,/* 0x90-0x97 */ + 0x57AC,0x57C8,0x57C7,0xFA0F,0xFA10,0x589E,0x58B2,0x590B,/* 0x98-0x9F */ + 0x5953,0x595B,0x595D,0x5963,0x59A4,0x59BA,0x5B56,0x5BC0,/* 0xA0-0xA7 */ + 0x752F,0x5BD8,0x5BEC,0x5C1E,0x5CA6,0x5CBA,0x5CF5,0x5D27,/* 0xA8-0xAF */ + 0x5D53,0xFA11,0x5D42,0x5D6D,0x5DB8,0x5DB9,0x5DD0,0x5F21,/* 0xB0-0xB7 */ + 0x5F34,0x5F67,0x5FB7,0x5FDE,0x605D,0x6085,0x608A,0x60DE,/* 0xB8-0xBF */ + 0x60D5,0x6120,0x60F2,0x6111,0x6137,0x6130,0x6198,0x6213,/* 0xC0-0xC7 */ + 0x62A6,0x63F5,0x6460,0x649D,0x64CE,0x654E,0x6600,0x6615,/* 0xC8-0xCF */ + 0x663B,0x6609,0x662E,0x661E,0x6624,0x6665,0x6657,0x6659,/* 0xD0-0xD7 */ + 0xFA12,0x6673,0x6699,0x66A0,0x66B2,0x66BF,0x66FA,0x670E,/* 0xD8-0xDF */ + 0xF929,0x6766,0x67BB,0x6852,0x67C0,0x6801,0x6844,0x68CF,/* 0xE0-0xE7 */ + 0xFA13,0x6968,0xFA14,0x6998,0x69E2,0x6A30,0x6A6B,0x6A46,/* 0xE8-0xEF */ + 0x6A73,0x6A7E,0x6AE2,0x6AE4,0x6BD6,0x6C3F,0x6C5C,0x6C86,/* 0xF0-0xF7 */ + 0x6C6F,0x6CDA,0x6D04,0x6D87,0x6D6F,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6D96,0x6DAC,0x6DCF,0x6DF8,0x6DF2,0x6DFC,0x6E39,0x6E5C,/* 0x40-0x47 */ + 0x6E27,0x6E3C,0x6EBF,0x6F88,0x6FB5,0x6FF5,0x7005,0x7007,/* 0x48-0x4F */ + 0x7028,0x7085,0x70AB,0x710F,0x7104,0x715C,0x7146,0x7147,/* 0x50-0x57 */ + 0xFA15,0x71C1,0x71FE,0x72B1,0x72BE,0x7324,0xFA16,0x7377,/* 0x58-0x5F */ + 0x73BD,0x73C9,0x73D6,0x73E3,0x73D2,0x7407,0x73F5,0x7426,/* 0x60-0x67 */ + 0x742A,0x7429,0x742E,0x7462,0x7489,0x749F,0x7501,0x756F,/* 0x68-0x6F */ + 0x7682,0x769C,0x769E,0x769B,0x76A6,0xFA17,0x7746,0x52AF,/* 0x70-0x77 */ + 0x7821,0x784E,0x7864,0x787A,0x7930,0xFA18,0xFA19,0x0000,/* 0x78-0x7F */ + + 0xFA1A,0x7994,0xFA1B,0x799B,0x7AD1,0x7AE7,0xFA1C,0x7AEB,/* 0x80-0x87 */ + 0x7B9E,0xFA1D,0x7D48,0x7D5C,0x7DB7,0x7DA0,0x7DD6,0x7E52,/* 0x88-0x8F */ + 0x7F47,0x7FA1,0xFA1E,0x8301,0x8362,0x837F,0x83C7,0x83F6,/* 0x90-0x97 */ + 0x8448,0x84B4,0x8553,0x8559,0x856B,0xFA1F,0x85B0,0xFA20,/* 0x98-0x9F */ + 0xFA21,0x8807,0x88F5,0x8A12,0x8A37,0x8A79,0x8AA7,0x8ABE,/* 0xA0-0xA7 */ + 0x8ADF,0xFA22,0x8AF6,0x8B53,0x8B7F,0x8CF0,0x8CF4,0x8D12,/* 0xA8-0xAF */ + 0x8D76,0xFA23,0x8ECF,0xFA24,0xFA25,0x9067,0x90DE,0xFA26,/* 0xB0-0xB7 */ + 0x9115,0x9127,0x91DA,0x91D7,0x91DE,0x91ED,0x91EE,0x91E4,/* 0xB8-0xBF */ + 0x91E5,0x9206,0x9210,0x920A,0x923A,0x9240,0x923C,0x924E,/* 0xC0-0xC7 */ + 0x9259,0x9251,0x9239,0x9267,0x92A7,0x9277,0x9278,0x92E7,/* 0xC8-0xCF */ + 0x92D7,0x92D9,0x92D0,0xFA27,0x92D5,0x92E0,0x92D3,0x9325,/* 0xD0-0xD7 */ + 0x9321,0x92FB,0xFA28,0x931E,0x92FF,0x931D,0x9302,0x9370,/* 0xD8-0xDF */ + 0x9357,0x93A4,0x93C6,0x93DE,0x93F8,0x9431,0x9445,0x9448,/* 0xE0-0xE7 */ + 0x9592,0xF9DC,0xFA29,0x969D,0x96AF,0x9733,0x973B,0x9743,/* 0xE8-0xEF */ + 0x974D,0x974F,0x9751,0x9755,0x9857,0x9865,0xFA2A,0xFA2B,/* 0xF0-0xF7 */ + 0x9927,0xFA2C,0x999E,0x9A4E,0x9AD9,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9ADC,0x9B75,0x9B72,0x9B8F,0x9BB1,0x9BBB,0x9C00,0x9D70,/* 0x40-0x47 */ + 0x9D6B,0xFA2D,0x9E19,0x9ED1,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, NULL, NULL, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, NULL, NULL, c2u_ED, c2u_EE, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, c2u_FA, c2u_FB, c2u_FC, NULL, NULL, NULL, +}; + +static const unsigned char u2c_00hi[256 - 0xA0][2] = { + {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x91}, {0x81, 0x92},/* 0xA0-0xA3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x98},/* 0xA4-0xA7 */ + {0x81, 0x4E}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xA8-0xAB */ + {0x81, 0xCA}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xAC-0xAF */ + {0x81, 0x8B}, {0x81, 0x7D}, {0x00, 0x00}, {0x00, 0x00},/* 0xB0-0xB3 */ + {0x81, 0x4C}, {0x00, 0x00}, {0x81, 0xF7}, {0x00, 0x00},/* 0xB4-0xB7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xB8-0xBB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xBC-0xBF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC0-0xC3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC4-0xC7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC8-0xCB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xCC-0xCF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xD0-0xD3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x7E},/* 0xD4-0xD7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xD8-0xDB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xDC-0xDF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE0-0xE3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE4-0xE7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE8-0xEB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xEC-0xEF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xF0-0xF3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x80},/* 0xF4-0xF7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xF8-0xFB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xFC-0xFF */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x83, 0x9F, 0x83, 0xA0, 0x83, 0xA1, /* 0x90-0x93 */ + 0x83, 0xA2, 0x83, 0xA3, 0x83, 0xA4, 0x83, 0xA5, /* 0x94-0x97 */ + 0x83, 0xA6, 0x83, 0xA7, 0x83, 0xA8, 0x83, 0xA9, /* 0x98-0x9B */ + 0x83, 0xAA, 0x83, 0xAB, 0x83, 0xAC, 0x83, 0xAD, /* 0x9C-0x9F */ + 0x83, 0xAE, 0x83, 0xAF, 0x00, 0x00, 0x83, 0xB0, /* 0xA0-0xA3 */ + 0x83, 0xB1, 0x83, 0xB2, 0x83, 0xB3, 0x83, 0xB4, /* 0xA4-0xA7 */ + 0x83, 0xB5, 0x83, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x83, 0xBF, 0x83, 0xC0, 0x83, 0xC1, /* 0xB0-0xB3 */ + 0x83, 0xC2, 0x83, 0xC3, 0x83, 0xC4, 0x83, 0xC5, /* 0xB4-0xB7 */ + 0x83, 0xC6, 0x83, 0xC7, 0x83, 0xC8, 0x83, 0xC9, /* 0xB8-0xBB */ + 0x83, 0xCA, 0x83, 0xCB, 0x83, 0xCC, 0x83, 0xCD, /* 0xBC-0xBF */ + 0x83, 0xCE, 0x83, 0xCF, 0x00, 0x00, 0x83, 0xD0, /* 0xC0-0xC3 */ + 0x83, 0xD1, 0x83, 0xD2, 0x83, 0xD3, 0x83, 0xD4, /* 0xC4-0xC7 */ + 0x83, 0xD5, 0x83, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0x84, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0x84, 0x43, /* 0x10-0x13 */ + 0x84, 0x44, 0x84, 0x45, 0x84, 0x47, 0x84, 0x48, /* 0x14-0x17 */ + 0x84, 0x49, 0x84, 0x4A, 0x84, 0x4B, 0x84, 0x4C, /* 0x18-0x1B */ + 0x84, 0x4D, 0x84, 0x4E, 0x84, 0x4F, 0x84, 0x50, /* 0x1C-0x1F */ + 0x84, 0x51, 0x84, 0x52, 0x84, 0x53, 0x84, 0x54, /* 0x20-0x23 */ + 0x84, 0x55, 0x84, 0x56, 0x84, 0x57, 0x84, 0x58, /* 0x24-0x27 */ + 0x84, 0x59, 0x84, 0x5A, 0x84, 0x5B, 0x84, 0x5C, /* 0x28-0x2B */ + 0x84, 0x5D, 0x84, 0x5E, 0x84, 0x5F, 0x84, 0x60, /* 0x2C-0x2F */ + 0x84, 0x70, 0x84, 0x71, 0x84, 0x72, 0x84, 0x73, /* 0x30-0x33 */ + 0x84, 0x74, 0x84, 0x75, 0x84, 0x77, 0x84, 0x78, /* 0x34-0x37 */ + 0x84, 0x79, 0x84, 0x7A, 0x84, 0x7B, 0x84, 0x7C, /* 0x38-0x3B */ + 0x84, 0x7D, 0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, /* 0x3C-0x3F */ + 0x84, 0x82, 0x84, 0x83, 0x84, 0x84, 0x84, 0x85, /* 0x40-0x43 */ + 0x84, 0x86, 0x84, 0x87, 0x84, 0x88, 0x84, 0x89, /* 0x44-0x47 */ + 0x84, 0x8A, 0x84, 0x8B, 0x84, 0x8C, 0x84, 0x8D, /* 0x48-0x4B */ + 0x84, 0x8E, 0x84, 0x8F, 0x84, 0x90, 0x84, 0x91, /* 0x4C-0x4F */ + 0x00, 0x00, 0x84, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x81, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x81, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x81, 0x65, 0x81, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x81, 0x67, 0x81, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x81, 0xF5, 0x81, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x81, 0x64, 0x81, 0x63, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x81, 0xF1, 0x00, 0x00, 0x81, 0x8C, 0x81, 0x8D, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xA6, /* 0x38-0x3B */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x8E, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0x59, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xFA, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xF0, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, 0xFA, 0x4D, /* 0x60-0x63 */ + 0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, 0xFA, 0x51, /* 0x64-0x67 */ + 0xFA, 0x52, 0xFA, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEE, 0xEF, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0x70-0x73 */ + 0xEE, 0xF3, 0xEE, 0xF4, 0xEE, 0xF5, 0xEE, 0xF6, /* 0x74-0x77 */ + 0xEE, 0xF7, 0xEE, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x81, 0xA9, 0x81, 0xAA, 0x81, 0xA8, 0x81, 0xAB, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xCB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x81, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_22[512] = { + 0x81, 0xCD, 0x00, 0x00, 0x81, 0xDD, 0x81, 0xCE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xDE, /* 0x04-0x07 */ + 0x81, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x81, 0xB9, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x87, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x95, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x81, 0xE5, 0x81, 0x87, 0x87, 0x98, /* 0x1C-0x1F */ + 0x87, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x81, 0x61, 0x00, 0x00, 0x81, 0xC8, /* 0x24-0x27 */ + 0x81, 0xC9, 0x87, 0x9B, 0x87, 0x9C, 0x87, 0x92, /* 0x28-0x2B */ + 0x81, 0xE8, 0x00, 0x00, 0x87, 0x93, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x81, 0x88, 0xFA, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x81, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x90, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x81, 0x82, 0x87, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x85, 0x81, 0x86, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xE1, 0x81, 0xE2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x81, 0xBC, 0x81, 0xBD, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xBA, 0x81, 0xBB, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x87, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x99, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xDC, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x87, 0x40, 0x87, 0x41, 0x87, 0x42, 0x87, 0x43, /* 0x60-0x63 */ + 0x87, 0x44, 0x87, 0x45, 0x87, 0x46, 0x87, 0x47, /* 0x64-0x67 */ + 0x87, 0x48, 0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, /* 0x68-0x6B */ + 0x87, 0x4C, 0x87, 0x4D, 0x87, 0x4E, 0x87, 0x4F, /* 0x6C-0x6F */ + 0x87, 0x50, 0x87, 0x51, 0x87, 0x52, 0x87, 0x53, /* 0x70-0x73 */ +}; + +static const unsigned char u2c_25[512] = { + 0x84, 0x9F, 0x84, 0xAA, 0x84, 0xA0, 0x84, 0xAB, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x84, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAC, /* 0x0C-0x0F */ + 0x84, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAD, /* 0x10-0x13 */ + 0x84, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAF, /* 0x14-0x17 */ + 0x84, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAE, /* 0x18-0x1B */ + 0x84, 0xA5, 0x84, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x84, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB0, /* 0x20-0x23 */ + 0x84, 0xA7, 0x84, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x84, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB2, /* 0x28-0x2B */ + 0x84, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB6, /* 0x2C-0x2F */ + 0x84, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB1, /* 0x30-0x33 */ + 0x84, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB8, /* 0x34-0x37 */ + 0x84, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB3, /* 0x38-0x3B */ + 0x84, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB9, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x84, 0xBE, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x81, 0xA1, 0x81, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xA3, 0x81, 0xA2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x81, 0xA5, 0x81, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x9F, 0x81, 0x9E, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9B, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x9D, 0x81, 0x9C, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xFC, /* 0xEC-0xEF */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x81, 0x9A, 0x81, 0x99, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x81, 0x8A, 0x00, 0x00, 0x81, 0x89, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xF4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x81, 0xF3, 0x00, 0x00, 0x81, 0xF2, /* 0x6C-0x6F */ +}; + +static const unsigned char u2c_30[512] = { + 0x81, 0x40, 0x81, 0x41, 0x81, 0x42, 0x81, 0x56, /* 0x00-0x03 */ + 0x00, 0x00, 0x81, 0x58, 0x81, 0x59, 0x81, 0x5A, /* 0x04-0x07 */ + 0x81, 0x71, 0x81, 0x72, 0x81, 0x73, 0x81, 0x74, /* 0x08-0x0B */ + 0x81, 0x75, 0x81, 0x76, 0x81, 0x77, 0x81, 0x78, /* 0x0C-0x0F */ + 0x81, 0x79, 0x81, 0x7A, 0x81, 0xA7, 0x81, 0xAC, /* 0x10-0x13 */ + 0x81, 0x6B, 0x81, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x87, 0x80, 0x00, 0x00, 0x87, 0x81, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, /* 0x40-0x43 */ + 0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0x82, 0xA5, /* 0x44-0x47 */ + 0x82, 0xA6, 0x82, 0xA7, 0x82, 0xA8, 0x82, 0xA9, /* 0x48-0x4B */ + 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, 0x82, 0xAD, /* 0x4C-0x4F */ + 0x82, 0xAE, 0x82, 0xAF, 0x82, 0xB0, 0x82, 0xB1, /* 0x50-0x53 */ + 0x82, 0xB2, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x54-0x57 */ + 0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x58-0x5B */ + 0x82, 0xBA, 0x82, 0xBB, 0x82, 0xBC, 0x82, 0xBD, /* 0x5C-0x5F */ + 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, 0x82, 0xC1, /* 0x60-0x63 */ + 0x82, 0xC2, 0x82, 0xC3, 0x82, 0xC4, 0x82, 0xC5, /* 0x64-0x67 */ + 0x82, 0xC6, 0x82, 0xC7, 0x82, 0xC8, 0x82, 0xC9, /* 0x68-0x6B */ + 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0x82, 0xCD, /* 0x6C-0x6F */ + 0x82, 0xCE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x70-0x73 */ + 0x82, 0xD2, 0x82, 0xD3, 0x82, 0xD4, 0x82, 0xD5, /* 0x74-0x77 */ + 0x82, 0xD6, 0x82, 0xD7, 0x82, 0xD8, 0x82, 0xD9, /* 0x78-0x7B */ + 0x82, 0xDA, 0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, /* 0x7C-0x7F */ + + 0x82, 0xDE, 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, /* 0x80-0x83 */ + 0x82, 0xE2, 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, /* 0x84-0x87 */ + 0x82, 0xE6, 0x82, 0xE7, 0x82, 0xE8, 0x82, 0xE9, /* 0x88-0x8B */ + 0x82, 0xEA, 0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, /* 0x8C-0x8F */ + 0x82, 0xEE, 0x82, 0xEF, 0x82, 0xF0, 0x82, 0xF1, /* 0x90-0x93 */ + 0x83, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x4A, /* 0x98-0x9B */ + 0x81, 0x4B, 0x81, 0x54, 0x81, 0x55, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xA0-0xA3 */ + 0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xA4-0xA7 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xA8-0xAB */ + 0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xAC-0xAF */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0x83, 0x52, /* 0xB0-0xB3 */ + 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, 0x83, 0x56, /* 0xB4-0xB7 */ + 0x83, 0x57, 0x83, 0x58, 0x83, 0x59, 0x83, 0x5A, /* 0xB8-0xBB */ + 0x83, 0x5B, 0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, /* 0xBC-0xBF */ + 0x83, 0x5F, 0x83, 0x60, 0x83, 0x61, 0x83, 0x62, /* 0xC0-0xC3 */ + 0x83, 0x63, 0x83, 0x64, 0x83, 0x65, 0x83, 0x66, /* 0xC4-0xC7 */ + 0x83, 0x67, 0x83, 0x68, 0x83, 0x69, 0x83, 0x6A, /* 0xC8-0xCB */ + 0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0x83, 0x6E, /* 0xCC-0xCF */ + 0x83, 0x6F, 0x83, 0x70, 0x83, 0x71, 0x83, 0x72, /* 0xD0-0xD3 */ + 0x83, 0x73, 0x83, 0x74, 0x83, 0x75, 0x83, 0x76, /* 0xD4-0xD7 */ + 0x83, 0x77, 0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, /* 0xD8-0xDB */ + 0x83, 0x7B, 0x83, 0x7C, 0x83, 0x7D, 0x83, 0x7E, /* 0xDC-0xDF */ + 0x83, 0x80, 0x83, 0x81, 0x83, 0x82, 0x83, 0x83, /* 0xE0-0xE3 */ + 0x83, 0x84, 0x83, 0x85, 0x83, 0x86, 0x83, 0x87, /* 0xE4-0xE7 */ + 0x83, 0x88, 0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, /* 0xE8-0xEB */ + 0x83, 0x8C, 0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, /* 0xEC-0xEF */ + 0x83, 0x90, 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, /* 0xF0-0xF3 */ + 0x83, 0x94, 0x83, 0x95, 0x83, 0x96, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x45, /* 0xF8-0xFB */ + 0x81, 0x5B, 0x81, 0x52, 0x81, 0x53, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFA, 0x58, 0x87, 0x8B, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x87, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x87, 0x85, 0x87, 0x86, 0x87, 0x87, 0x87, 0x88, /* 0xA4-0xA7 */ + 0x87, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x65, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x87, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x87, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x87, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x61, 0x87, 0x6B, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x6A, 0x87, 0x64, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x6C, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x66, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x6E, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x87, 0x5F, 0x87, 0x6D, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x87, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x87, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x68, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x7E, /* 0x78-0x7B */ + 0x87, 0x8F, 0x87, 0x8E, 0x87, 0x8D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x72, 0x87, 0x73, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x87, 0x6F, 0x87, 0x70, 0x87, 0x71, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x87, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x87, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x87, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ +}; + +static const unsigned char u2c_4E[512] = { + 0x88, 0xEA, 0x92, 0x9A, 0x00, 0x00, 0x8E, 0xB5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x9C, /* 0x04-0x07 */ + 0x8F, 0xE4, 0x8E, 0x4F, 0x8F, 0xE3, 0x89, 0xBA, /* 0x08-0x0B */ + 0x00, 0x00, 0x95, 0x73, 0x97, 0x5E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x98, 0xA0, 0x89, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8A, 0x8E, 0x98, 0xA1, 0x90, 0xA2, 0x99, 0xC0, /* 0x14-0x17 */ + 0x8B, 0x75, 0x95, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xE5, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x97, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xC0, 0x00, 0x00, /* 0x24-0x27 */ + 0xED, 0x4C, 0x00, 0x00, 0x98, 0xA2, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x92, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x98, 0xA3, 0x8B, 0xF8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xA4, 0x00, 0x00, /* 0x34-0x37 */ + 0x8A, 0xDB, 0x92, 0x4F, 0x00, 0x00, 0x8E, 0xE5, /* 0x38-0x3B */ + 0x98, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xA6, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xA7, 0x94, 0x54, /* 0x40-0x43 */ + 0x00, 0x00, 0x8B, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x56, /* 0x48-0x4B */ + 0x00, 0x00, 0x93, 0xE1, 0x8C, 0xC1, 0x96, 0x52, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE5, 0x68, 0x98, 0xA8, 0x8F, 0xE6, /* 0x54-0x57 */ + 0x98, 0xA9, 0x89, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8B, 0xE3, 0x8C, 0xEE, 0x96, 0xE7, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xA4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x97, 0x90, 0x00, 0x00, 0x93, 0xFB, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8B, 0x54, 0x00, 0x00, 0x98, 0xAA, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x98, 0xAB, 0x97, 0xB9, 0x00, 0x00, /* 0x84-0x87 */ + 0x97, 0x5C, 0x91, 0x88, 0x98, 0xAD, 0x8E, 0x96, /* 0x88-0x8B */ + 0x93, 0xF1, 0x00, 0x00, 0x98, 0xB0, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x89, 0x5D, 0x8C, 0xDD, 0x00, 0x00, /* 0x90-0x93 */ + 0x8C, 0xDC, 0x88, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x98, 0x6A, 0x98, 0x69, 0x00, 0x00, 0x8D, 0xB1, /* 0x98-0x9B */ + 0x88, 0x9F, 0x00, 0x00, 0x98, 0xB1, 0x98, 0xB2, /* 0x9C-0x9F */ + 0x98, 0xB3, 0x96, 0x53, 0x98, 0xB4, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8C, 0xF0, 0x88, 0xE5, 0x96, 0x92, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8B, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x9D, /* 0xA8-0xAB */ + 0x8B, 0x9E, 0x92, 0xE0, 0x97, 0xBA, 0x00, 0x00, /* 0xAC-0xAF */ + 0x98, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xB6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xB7, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x6C, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x8F, 0x59, 0x90, 0x6D, 0x98, 0xBC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x98, 0xBA, 0x00, 0x00, 0x98, 0xBB, 0x8B, 0x77, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA1, 0x89, 0xEE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x98, 0xB9, 0x98, 0xB8, 0x95, 0xA7, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8E, 0x65, 0x8E, 0x64, 0x91, 0xBC, 0x98, 0xBD, /* 0xD4-0xD7 */ + 0x95, 0x74, 0x90, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x81, 0x57, 0x98, 0xBE, 0x98, 0xC0, /* 0xDC-0xDF */ + 0x00, 0x00, 0xED, 0x4D, 0x00, 0x00, 0x91, 0xE3, /* 0xE0-0xE3 */ + 0x97, 0xDF, 0x88, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x98, 0xBF, 0x89, 0xBC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8B, 0xC2, 0x00, 0x00, 0x92, 0x87, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x8F, 0x98, 0xC1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x43, /* 0xF8-0xFB */ + 0xED, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0xED, 0x4F, 0x8A, 0xE9, 0x00, 0x00, 0xED, 0x50, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x98, 0xC2, 0x88, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x8C, 0xDE, 0x8A, 0xEA, 0x95, 0x9A, /* 0x0C-0x0F */ + 0x94, 0xB0, 0x8B, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xEF, 0x00, 0x00, /* 0x18-0x1B */ + 0x98, 0xE5, 0x93, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x8C, /* 0x2C-0x2F */ + 0x98, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x94, 0xBA, 0x00, 0x00, 0x97, 0xE0, 0x00, 0x00, /* 0x34-0x37 */ + 0x90, 0x4C, 0xED, 0x51, 0x8E, 0x66, 0x00, 0x00, /* 0x38-0x3B */ + 0x8E, 0x97, 0x89, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xCF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x41, 0x98, 0xC8, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x88, 0xCA, 0x92, 0xE1, 0x8F, 0x5A, /* 0x4C-0x4F */ + 0x8D, 0xB2, 0x97, 0x43, 0x00, 0x00, 0x91, 0xCC, /* 0x50-0x53 */ + 0x00, 0x00, 0x89, 0xBD, 0xED, 0x52, 0x98, 0xC7, /* 0x54-0x57 */ + 0x00, 0x00, 0x97, 0x5D, 0x98, 0xC3, 0x98, 0xC5, /* 0x58-0x5B */ + 0x8D, 0xEC, 0x98, 0xC6, 0x9B, 0x43, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x98, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xD1, /* 0x6C-0x6F */ + 0x98, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC0, /* 0x70-0x73 */ + 0x00, 0x00, 0x95, 0xB9, 0x98, 0xC9, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xCD, /* 0x78-0x7B */ + 0x8C, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x67, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA4, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xD2, 0x00, 0x00, /* 0x84-0x87 */ + 0x98, 0xCA, 0x00, 0x00, 0xED, 0x54, 0x97, 0xE1, /* 0x88-0x8B */ + 0x00, 0x00, 0x8E, 0x98, 0x00, 0x00, 0x98, 0xCB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x98, 0xD0, 0xED, 0x53, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0x56, 0x00, 0x00, 0x98, 0xD3, 0x00, 0x00, /* 0x94-0x97 */ + 0x98, 0xCC, 0x00, 0x00, 0xED, 0x55, 0x8B, 0x9F, /* 0x98-0x9B */ + 0x00, 0x00, 0x88, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0xA0, 0x89, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x44, /* 0xA8-0xAB */ + 0x00, 0x00, 0x96, 0x99, 0x95, 0x8E, 0x8C, 0xF2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x90, 0x4E, 0x97, 0xB5, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xD6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x57, 0x91, 0xA3, /* 0xC0-0xC3 */ + 0x89, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0x45, 0x8F, 0x72, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xED, 0x57, 0x98, 0xD7, 0x00, 0x00, /* 0xCC-0xCF */ + 0x98, 0xDC, 0x98, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x98, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x91, 0xAD, /* 0xD4-0xD7 */ + 0x98, 0xD8, 0x00, 0x00, 0x98, 0xDB, 0x98, 0xD9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x95, 0xDB, 0x00, 0x00, 0x98, 0xD6, /* 0xDC-0xDF */ + 0x00, 0x00, 0x90, 0x4D, 0x00, 0x00, 0x96, 0x93, /* 0xE0-0xE3 */ + 0x98, 0xDD, 0x98, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x43, 0x98, 0xEB, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x6F, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x95, 0x55, 0x98, 0xE6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x95, 0xEE, 0x00, 0x00, 0x89, 0xB4, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xEA, 0xED, 0x5A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x98, 0xE4, 0x98, 0xED, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x91, 0x71, 0x00, 0x00, 0x8C, 0xC2, /* 0x08-0x0B */ + 0x00, 0x00, 0x94, 0x7B, 0x00, 0x00, 0xE0, 0xC5, /* 0x0C-0x0F */ + 0x00, 0x00, 0x98, 0xEC, 0x93, 0x7C, 0x00, 0x00, /* 0x10-0x13 */ + 0x98, 0xE1, 0x00, 0x00, 0x8C, 0xF4, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x8C, 0xF3, 0x98, 0xDF, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x5B, 0x8E, 0xD8, /* 0x1C-0x1F */ + 0x00, 0x00, 0x98, 0xE7, 0xED, 0x59, 0x95, 0xED, /* 0x20-0x23 */ + 0x92, 0x6C, 0x98, 0xE3, 0x8C, 0x91, 0x00, 0x00, /* 0x24-0x27 */ + 0x98, 0xE0, 0x98, 0xE8, 0x98, 0xE2, 0x97, 0xCF, /* 0x28-0x2B */ + 0x98, 0xE9, 0x98, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8C, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xED, 0x58, 0x00, 0x00, 0xED, 0x5E, 0x98, 0xEE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x5C, 0x98, 0xEF, /* 0x44-0x47 */ + 0x98, 0xF3, 0x88, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xCE, /* 0x4C-0x4F */ + 0x98, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x98, 0xF1, 0x98, 0xF5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xF4, 0x00, 0x00, /* 0x58-0x5B */ + 0x92, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x8C, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x98, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xED, 0x5D, 0x00, 0x00, 0x8E, 0xC3, 0x00, 0x00, /* 0x70-0x73 */ + 0x91, 0xA4, 0x92, 0xE3, 0x8B, 0xF4, 0x00, 0x00, /* 0x74-0x77 */ + 0x98, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8B, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x98, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x98, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8C, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x8E, 0x50, 0x94, 0xF5, 0x98, 0xF9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8D, 0xC3, 0x97, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xFC, 0x99, 0x42, /* 0xB0-0xB3 */ + 0x98, 0xFB, 0x8D, 0xC2, 0x00, 0x00, 0x8F, 0x9D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x58, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x43, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8B, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x99, 0x40, 0x99, 0x41, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x93, 0xAD, 0x00, 0x00, 0x91, 0x9C, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8B, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x96, 0x6C, 0x99, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xED, 0x61, 0x00, 0x00, 0x97, 0xBB, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x45, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x48, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x99, 0x46, 0x00, 0x00, 0x91, 0x6D, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x99, 0x47, 0x99, 0x49, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xED, 0x60, 0x99, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x99, 0x4A, 0x00, 0x00, 0x95, 0xC6, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_51[512] = { + 0x8B, 0x56, 0x99, 0x4D, 0x99, 0x4E, 0x00, 0x00, /* 0x00-0x03 */ + 0x89, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x99, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF2, 0x00, 0x00, /* 0x10-0x13 */ + 0x99, 0x51, 0x99, 0x50, 0x99, 0x4F, 0x00, 0x00, /* 0x14-0x17 */ + 0x98, 0xD4, 0x00, 0x00, 0x99, 0x52, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x9E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x99, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x44, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xD7, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x55, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x54, 0x99, 0x57, /* 0x38-0x3B */ + 0x99, 0x56, 0x00, 0x00, 0x00, 0x00, 0x99, 0x58, /* 0x3C-0x3F */ + 0x99, 0x59, 0x88, 0xF2, 0x00, 0x00, 0x8C, 0xB3, /* 0x40-0x43 */ + 0x8C, 0x5A, 0x8F, 0x5B, 0x92, 0x9B, 0x8B, 0xA2, /* 0x44-0x47 */ + 0x90, 0xE6, 0x8C, 0xF5, 0xED, 0x62, 0x8D, 0x8E, /* 0x48-0x4B */ + 0x99, 0x5B, 0x96, 0xC6, 0x93, 0x65, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0x99, 0x00, 0x00, 0x99, 0x5A, 0x00, 0x00, /* 0x50-0x53 */ + 0x99, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x7D, 0x00, 0x00, /* 0x58-0x5B */ + 0x8A, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x5D, 0x00, 0x00, /* 0x60-0x63 */ + 0xED, 0x63, 0x93, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x91, 0x53, 0x99, 0x5F, 0x99, 0x60, 0x94, 0xAA, /* 0x68-0x6B */ + 0x8C, 0xF6, 0x98, 0x5A, 0x99, 0x61, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8B, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x95, 0xBA, 0x91, 0xB4, 0x8B, 0xEF, /* 0x74-0x77 */ + 0x93, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x8C, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x99, 0x62, 0x00, 0x00, 0x99, 0x63, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x93, 0xE0, 0x89, 0x7E, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x99, 0x66, 0x8D, 0xFB, 0x00, 0x00, /* 0x88-0x8B */ + 0x99, 0x65, 0x8D, 0xC4, 0x00, 0x00, 0x99, 0x67, /* 0x8C-0x8F */ + 0xE3, 0xEC, 0x99, 0x68, 0x96, 0x60, 0x99, 0x69, /* 0x90-0x93 */ + 0x00, 0x00, 0x99, 0x6A, 0x99, 0x6B, 0x8F, 0xE7, /* 0x94-0x97 */ + 0x00, 0x00, 0x8E, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xED, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8A, 0xA5, 0x00, 0x00, 0x99, 0x6E, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x99, 0x6C, 0x96, 0xBB, 0x99, 0x6D, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x95, 0x79, 0x99, 0x6F, 0x99, 0x70, 0x99, 0x71, /* 0xA8-0xAB */ + 0x93, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x99, 0x75, 0x99, 0x73, 0x99, 0x74, 0x99, 0x72, /* 0xB0-0xB3 */ + 0x8D, 0xE1, 0x99, 0x76, 0x96, 0xE8, 0x97, 0xE2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x99, 0x77, 0xED, 0x65, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x90, 0xA6, 0x99, 0x78, 0x8F, 0x79, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x99, 0x79, 0x00, 0x00, 0x92, 0x9C, /* 0xC8-0xCB */ + 0x97, 0xBD, 0x93, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xC3, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x7A, /* 0xD8-0xDB */ + 0xEA, 0xA3, 0x8B, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x99, 0x7B, 0x96, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x88, 0x91, 0xFA, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x99, 0x7D, 0x93, 0xE2, 0x00, 0x00, /* 0xE8-0xEB */ + 0xED, 0x66, 0x99, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x99, 0x80, 0x8A, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x99, 0x81, 0x8B, 0xA5, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x93, 0xCA, 0x89, 0x9A, 0x8F, 0x6F, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x94, 0x9F, 0x99, 0x82, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0x93, 0x81, 0x00, 0x00, 0x00, 0x00, 0x90, 0x6E, /* 0x00-0x03 */ + 0x99, 0x83, 0x00, 0x00, 0x95, 0xAA, 0x90, 0xD8, /* 0x04-0x07 */ + 0x8A, 0xA0, 0x00, 0x00, 0x8A, 0xA7, 0x99, 0x84, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x86, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8C, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x99, 0x85, 0xED, 0x67, 0x00, 0x00, 0x97, 0xF1, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x8F, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x94, 0xBB, 0x95, 0xCA, 0x00, 0x00, 0x99, 0x87, /* 0x24-0x27 */ + 0x00, 0x00, 0x97, 0x98, 0x99, 0x88, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x89, 0x00, 0x00, /* 0x2C-0x2F */ + 0x93, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x99, 0x8A, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xA7, 0x8D, 0xFC, /* 0x34-0x37 */ + 0x8C, 0x94, 0x99, 0x8B, 0x8E, 0x68, 0x8D, 0x8F, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xE4, /* 0x40-0x43 */ + 0x99, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x91, 0xA5, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xED, 0x99, 0x8E, /* 0x48-0x4B */ + 0x99, 0x8F, 0x91, 0x4F, 0x00, 0x00, 0x99, 0x8C, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x99, 0x91, 0x00, 0x00, 0x96, 0x55, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x84, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x90, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x95, /* 0x60-0x63 */ + 0x8D, 0xDC, 0x94, 0x8D, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x99, 0x94, 0x99, 0x92, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x9B, /* 0x6C-0x6F */ + 0x8F, 0xE8, 0x99, 0x9B, 0x8A, 0x84, 0x99, 0x95, /* 0x70-0x73 */ + 0x99, 0x93, 0x91, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x99, 0x97, 0x00, 0x00, 0x99, 0x96, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x63, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x80, /* 0x84-0x87 */ + 0x99, 0x9C, 0x97, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x99, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x99, 0x9D, 0x99, 0x9A, 0x00, 0x00, /* 0x90-0x93 */ + 0x99, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xCD, /* 0x98-0x9B */ + 0xED, 0x68, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xF7, /* 0x9C-0x9F */ + 0x89, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x97, 0xF2, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x69, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8F, 0x95, 0x93, 0x77, 0x8D, 0x85, /* 0xA8-0xAB */ + 0x99, 0xA0, 0x99, 0xA1, 0x00, 0x00, 0xEE, 0x5B, /* 0xAC-0xAF */ + 0x00, 0x00, 0x97, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x98, 0x4A, 0x99, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x8C, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x99, 0xA2, 0x00, 0x00, 0x8A, 0x4E, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0x6A, 0x99, 0xA4, 0x00, 0x00, 0x96, 0x75, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x92, 0xBA, 0x00, 0x00, 0x97, 0x45, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x95, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x99, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x93, 0xAE, 0x00, 0x00, 0x99, 0xA6, /* 0xD4-0xD7 */ + 0x8A, 0xA8, 0x96, 0xB1, 0x00, 0x00, 0xED, 0x6B, /* 0xD8-0xDB */ + 0x00, 0x00, 0x8F, 0x9F, 0x99, 0xA7, 0x95, 0xE5, /* 0xDC-0xDF */ + 0x99, 0xAB, 0x00, 0x00, 0x90, 0xA8, 0x99, 0xA8, /* 0xE0-0xE3 */ + 0x8B, 0xCE, 0x00, 0x00, 0x99, 0xA9, 0x8A, 0xA9, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x4D, 0x99, 0xAC, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x99, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x99, 0xAE, 0x99, 0xAF, 0x8E, 0xD9, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xF9, 0x96, 0xDC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0xED, 0x6C, 0x96, 0xE6, 0x93, 0xF5, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x95, 0xEF, 0x99, 0xB0, 0xED, 0x6D, /* 0x04-0x07 */ + 0x99, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x99, 0xB3, 0x00, 0x00, 0x99, 0xB5, /* 0x0C-0x0F */ + 0x99, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x99, 0xB6, 0x89, 0xBB, 0x96, 0x6B, /* 0x14-0x17 */ + 0x00, 0x00, 0x8D, 0xFA, 0x99, 0xB7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x91, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8F, 0xA0, 0x8B, 0xA7, 0x00, 0x00, 0x99, 0xB8, /* 0x20-0x23 */ + 0xED, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xD9, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xB9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x99, 0xBA, 0x00, 0x00, 0x99, 0xBB, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x99, 0xBC, 0x95, 0x43, 0x8B, 0xE6, 0x88, 0xE3, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBD, /* 0x3C-0x3F */ + 0x99, 0xBD, 0x8F, 0x5C, 0x00, 0x00, 0x90, 0xE7, /* 0x40-0x43 */ + 0x00, 0x00, 0x99, 0xBF, 0x99, 0xBE, 0x8F, 0xA1, /* 0x44-0x47 */ + 0x8C, 0xDF, 0x99, 0xC1, 0x94, 0xBC, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x99, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x94, 0xDA, 0x91, 0xB2, 0x91, 0xEC, /* 0x50-0x53 */ + 0x8B, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x93, 0xEC, /* 0x54-0x57 */ + 0x92, 0x50, 0x00, 0x00, 0x94, 0x8E, 0x00, 0x00, /* 0x58-0x5B */ + 0x96, 0x6D, 0x00, 0x00, 0x99, 0xC4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x90, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x54, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x99, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xC6, 0x89, 0x4B, /* 0x6C-0x6F */ + 0x88, 0xF3, 0x8A, 0xEB, 0xED, 0x6F, 0x91, 0xA6, /* 0x70-0x73 */ + 0x8B, 0x70, 0x97, 0x91, 0x00, 0x00, 0x99, 0xC9, /* 0x74-0x77 */ + 0x89, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x99, 0xC8, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xA8, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x99, 0xCA, 0x00, 0x00, /* 0x80-0x83 */ + 0x96, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0x70, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xCB, 0x00, 0x00, /* 0x94-0x97 */ + 0x97, 0xD0, 0x00, 0x00, 0x8C, 0xFA, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xB4, /* 0x9C-0x9F */ + 0x99, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x99, 0xCE, 0x99, 0xCD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x90, 0x7E, 0x89, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x89, 0x7D, 0x99, 0xCF, 0x00, 0x00, /* 0xAC-0xAF */ + 0x99, 0xD0, 0x00, 0x00, 0xED, 0x71, 0x8C, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xD1, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8E, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x51, 0x99, 0xD2, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x96, 0x94, 0x8D, 0xB3, 0x8B, 0x79, 0x97, 0x46, /* 0xC8-0xCB */ + 0x91, 0x6F, 0x94, 0xBD, 0x8E, 0xFB, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8F, 0x66, 0x00, 0x00, 0x8E, 0xE6, 0x8E, 0xF3, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8F, 0x96, 0x00, 0x00, 0x94, 0xBE, /* 0xD8-0xDB */ + 0x00, 0x00, 0xED, 0x72, 0x00, 0x00, 0x99, 0xD5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x89, 0x62, 0x91, 0x70, 0x8C, 0xFB, /* 0xE0-0xE3 */ + 0x8C, 0xC3, 0x8B, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x99, 0xD9, 0x92, 0x40, 0x91, 0xFC, 0x8B, 0xA9, /* 0xE8-0xEB */ + 0x8F, 0xA2, 0x99, 0xDA, 0x99, 0xD8, 0x89, 0xC2, /* 0xEC-0xEF */ + 0x91, 0xE4, 0x8E, 0xB6, 0x8E, 0x6A, 0x89, 0x45, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x90, 0x8D, 0x86, /* 0xF4-0xF7 */ + 0x8E, 0x69, 0x00, 0x00, 0x99, 0xDB, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0x99, 0xDC, 0x00, 0x00, 0x8B, 0x68, /* 0x00-0x03 */ + 0x8A, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8D, 0x87, 0x8B, 0x67, 0x92, 0xDD, 0x89, 0x44, /* 0x08-0x0B */ + 0x93, 0xAF, 0x96, 0xBC, 0x8D, 0x40, 0x97, 0x99, /* 0x0C-0x0F */ + 0x93, 0x66, 0x8C, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x4E, /* 0x18-0x1B */ + 0x00, 0x00, 0x99, 0xE5, 0x00, 0x00, 0x8B, 0xE1, /* 0x1C-0x1F */ + 0x96, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xDB, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x99, 0xE4, 0x00, 0x00, 0x8A, 0xDC, /* 0x28-0x2B */ + 0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE2, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xE3, 0x00, 0x00, /* 0x34-0x37 */ + 0x8B, 0x7A, 0x90, 0x81, 0x00, 0x00, 0x95, 0xAB, /* 0x38-0x3B */ + 0x99, 0xE1, 0x99, 0xDD, 0x8C, 0xE1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x99, 0xDE, 0x00, 0x00, 0x98, 0x43, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xF0, 0x00, 0x00, /* 0x44-0x47 */ + 0x92, 0xE6, 0x8C, 0xE0, 0x8D, 0x90, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xE6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x93, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xEA, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8E, 0xFC, 0x00, 0x00, 0x8E, 0xF4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x99, 0xED, 0x99, 0xEB, 0x00, 0x00, 0x96, 0xA1, /* 0x70-0x73 */ + 0x00, 0x00, 0x99, 0xE8, 0x99, 0xF1, 0x99, 0xEC, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xEF, /* 0x78-0x7B */ + 0x8C, 0xC4, 0x96, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x99, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x99, 0xF2, 0x00, 0x00, 0x99, 0xF4, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x75, 0x8D, 0xEE, /* 0x88-0x8B */ + 0x98, 0x61, 0x00, 0x00, 0x99, 0xE9, 0x99, 0xE7, /* 0x8C-0x8F */ + 0x99, 0xF3, 0x00, 0x00, 0x99, 0xEE, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xED, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xF6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x9A, 0x42, 0x99, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x99, 0xFC, 0xED, 0x76, 0x00, 0x00, 0x9A, 0x40, /* 0xA8-0xAB */ + 0x99, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x5D, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE7, 0x8A, 0x50, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x99, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9A, 0x44, 0x88, 0xF4, 0x9A, 0x43, 0x00, 0x00, /* 0xBC-0xBF */ + 0x88, 0xA3, 0x95, 0x69, 0x9A, 0x41, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x99, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x99, 0xF5, /* 0xC4-0xC7 */ + 0x99, 0xFB, 0x8D, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9A, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x88, 0xF5, 0x9A, 0x4E, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x9A, 0x46, 0x9A, 0x47, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8F, 0xA3, 0x96, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9A, 0x4C, 0x9A, 0x4B, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x4E, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x4D, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9A, 0x4A, 0x00, 0x00, 0xED, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x89, 0x53, 0x00, 0x00, 0x8D, 0xB4, 0x90, 0x4F, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x48, /* 0x0C-0x0F */ + 0x93, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9A, 0x49, 0x00, 0x00, 0x88, 0xA0, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x53, 0x97, 0x42, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0xA5, 0x00, 0x00, 0x9A, 0x59, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9A, 0x58, 0x9A, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xC1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9A, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x91, 0xED, 0x9A, 0x55, 0x8F, 0xA4, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x9A, 0x52, 0x00, 0x00, 0x00, 0x00, 0x96, 0xE2, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x5B, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x56, 0x9A, 0x57, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9A, 0x54, 0x9A, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x51, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x60, /* 0x78-0x7B */ + 0x9A, 0x65, 0x00, 0x00, 0x9A, 0x61, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9A, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x66, /* 0x80-0x83 */ + 0x91, 0x50, 0x00, 0x00, 0xED, 0x78, 0x9A, 0x68, /* 0x84-0x87 */ + 0x00, 0x00, 0x8D, 0x41, 0x9A, 0x5E, 0x92, 0x9D, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x9A, 0x62, 0x9A, 0x5B, 0x8A, 0xAB, 0x00, 0x00, /* 0x98-0x9B */ + 0x8A, 0xEC, 0x8A, 0x85, 0x9A, 0x63, 0x9A, 0x5F, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x96, /* 0xA4-0xA7 */ + 0x9A, 0x69, 0x9A, 0x67, 0x91, 0x72, 0x8B, 0x69, /* 0xA8-0xAB */ + 0x8B, 0xAA, 0x00, 0x00, 0x9A, 0x64, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8B, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x63, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9A, 0x6D, 0x9A, 0x6B, 0x00, 0x00, 0x9A, 0xA5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9A, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x6A, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9A, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x6C, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x6B, /* 0xE0-0xE3 */ + 0x9A, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x72, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9A, 0x75, 0x9A, 0x74, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x51, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x89, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9A, 0x71, 0x00, 0x00, 0x9A, 0x73, 0x8F, 0xA6, /* 0x14-0x17 */ + 0x89, 0x52, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x76, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x89, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x82, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0xFA, 0x9A, 0x7D, 0x00, 0x00, /* 0x30-0x33 */ + 0x9A, 0x7B, 0x00, 0x00, 0x9A, 0x7C, 0x00, 0x00, /* 0x34-0x37 */ + 0x9A, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x5C, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x91, 0x58, 0x00, 0x00, 0x9A, 0x78, 0x00, 0x00, /* 0x4C-0x4F */ + 0x9A, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x9A, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x9A, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8A, 0xED, 0x00, 0x00, 0x9A, 0x84, 0x9A, 0x80, /* 0x68-0x6B */ + 0x9A, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x95, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x93, 0xD3, 0x00, 0x00, 0x94, 0xB6, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9A, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x85, 0x8A, 0x64, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x87, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x8A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9A, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9A, 0x88, 0x00, 0x00, 0x94, 0x58, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x9A, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x8C, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x9A, 0x8E, 0x00, 0x00, 0x9A, 0x8D, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9A, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9A, 0x93, 0x9A, 0x91, 0x9A, 0x8F, 0x9A, 0x92, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x9A, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x95, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9A, 0x96, 0x00, 0x00, 0x9A, 0x97, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x98, /* 0xD4-0xD7 */ + 0x99, 0x64, 0x00, 0x00, 0x8E, 0xFA, 0x8E, 0x6C, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF1, 0x00, 0x00, /* 0xDC-0xDF */ + 0x88, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x92, 0x63, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x99, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8D, 0xA2, 0x00, 0x00, 0x88, 0xCD, 0x90, 0x7D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0x9A, 0x8C, 0xC5, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x8D, 0x91, 0x00, 0x00, 0x9A, 0x9C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x9A, 0x9B, 0x00, 0x00, 0x00, 0x00, 0x95, 0xDE, /* 0x00-0x03 */ + 0x9A, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9A, 0x9F, 0x9A, 0x9E, 0x00, 0x00, 0x9A, 0xA0, /* 0x08-0x0B */ + 0x00, 0x00, 0x9A, 0xA1, 0x00, 0x00, 0x8C, 0x97, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x80, 0x9A, 0xA2, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA4, 0x00, 0x00, /* 0x14-0x17 */ + 0x9A, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9A, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x93, 0x79, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA7, 0x88, 0xB3, /* 0x24-0x27 */ + 0x8D, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8C, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x92, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA8, /* 0x34-0x37 */ + 0x9A, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xAB, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9A, 0xAC, 0x00, 0x00, 0x8D, 0xE2, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xCF, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x56, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xAA, 0x9A, 0xAD, /* 0x4C-0x4F */ + 0x8D, 0xBF, 0x8D, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xED, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9A, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x8D, 0xA3, 0xED, 0x7A, 0x92, 0x52, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x9A, 0xAE, 0x92, 0xD8, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x90, 0x82, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x9A, 0xB0, 0x9A, 0xB3, 0x00, 0x00, 0x8C, 0x5E, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB4, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9A, 0xB5, 0x00, 0x00, 0x8D, 0x43, 0x8A, 0x5F, /* 0xA0-0xA3 */ + 0x9A, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0xED, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x9A, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9A, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBA, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBB, 0xED, 0x7D, /* 0xC4-0xC7 */ + 0xED, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x96, 0x84, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xE9, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBD, 0x9A, 0xBE, /* 0xD0-0xD3 */ + 0x9A, 0xBC, 0x00, 0x00, 0x9A, 0xC0, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x94, 0x57, 0x00, 0x00, 0x00, 0x00, 0x88, 0xE6, /* 0xDC-0xDF */ + 0x95, 0x75, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xC1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x8F, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xB7, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x94, 0x7C, 0x8A, 0xEE, 0x00, 0x00, /* 0xF8-0xFB */ + 0x8D, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0x96, 0x78, 0x00, 0x00, 0x93, 0xB0, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x8C, 0x98, 0x91, 0xCD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBF, 0x9A, 0xC2, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x91, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9A, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x9A, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9A, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x92, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xAC, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9F, /* 0x2C-0x2F */ + 0x89, 0x81, 0x95, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x8F, 0xEA, 0x93, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE4, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x9A, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x95, 0xBB, 0x97, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF2, 0x9A, 0xC8, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x91, 0x59, 0x9A, 0xCB, 0x00, 0x00, /* 0x50-0x53 */ + 0x93, 0x83, 0x00, 0x00, 0x00, 0x00, 0x93, 0x68, /* 0x54-0x57 */ + 0x93, 0x84, 0x94, 0xB7, 0x92, 0xCB, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xC7, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xC7, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x89, 0x96, 0x00, 0x00, 0x93, 0x55, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x9A, 0xC9, 0x00, 0x00, 0x9A, 0xC5, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x90, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x9A, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xAB, /* 0x80-0x83 */ + 0x00, 0x00, 0x9A, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xE6, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x9D, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x92, 0xC4, 0x00, 0x00, 0xED, 0x81, 0x9A, 0xD0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xD1, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xD6, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x82, 0x95, 0xAD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9A, 0xD5, 0x9A, 0xCF, 0x9A, 0xD2, 0x9A, 0xD4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA4, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x95, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9A, 0xD7, 0x00, 0x00, 0x92, 0x64, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF3, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8F, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9A, 0xD9, 0x00, 0x00, 0x9A, 0xD8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x8D, 0x88, 0x00, 0x00, 0x9A, 0xDA, /* 0xD4-0xD7 */ + 0x9A, 0xDC, 0x9A, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9A, 0xDE, 0x00, 0x00, 0x9A, 0xD3, 0x9A, 0xE0, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9A, 0xDF, 0x9A, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x6D, /* 0xE8-0xEB */ + 0x90, 0x70, 0x00, 0x00, 0x91, 0x73, 0x9A, 0xE1, /* 0xEC-0xEF */ + 0x90, 0xBA, 0x88, 0xEB, 0x94, 0x84, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xD9, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0xE3, 0x9A, 0xE2, 0x9A, 0xE4, /* 0xF8-0xFB */ + 0x9A, 0xE5, 0x9A, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xE7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x95, 0xCF, 0x9A, 0xE8, 0xED, 0x83, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC4, /* 0x0C-0x0F */ + 0x9A, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x97, 0x5B, 0x8A, 0x4F, 0x00, 0x00, /* 0x14-0x17 */ + 0x99, 0xC7, 0x8F, 0x67, 0x91, 0xBD, 0x9A, 0xEA, /* 0x18-0x1B */ + 0x96, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xB2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x9A, 0xEC, 0x00, 0x00, 0x91, 0xE5, /* 0x24-0x27 */ + 0x00, 0x00, 0x93, 0x56, 0x91, 0xBE, 0x95, 0x76, /* 0x28-0x2B */ + 0x9A, 0xED, 0x9A, 0xEE, 0x89, 0x9B, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xB8, 0x9A, 0xEF, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xCE, /* 0x34-0x37 */ + 0x9A, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xF1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x89, 0x82, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xEF, /* 0x44-0x47 */ + 0x93, 0xDE, 0x95, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xF5, 0x91, 0x74, /* 0x4C-0x4F */ + 0x9A, 0xF4, 0x8C, 0x5F, 0x00, 0x00, 0xED, 0x84, /* 0x50-0x53 */ + 0x96, 0x7A, 0x9A, 0xF3, 0x00, 0x00, 0x93, 0x85, /* 0x54-0x57 */ + 0x9A, 0xF7, 0x00, 0x00, 0x9A, 0xF6, 0xED, 0x85, /* 0x58-0x5B */ + 0x00, 0x00, 0xED, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x9A, 0xF9, 0x00, 0x00, 0x9A, 0xF8, 0xED, 0x87, /* 0x60-0x63 */ + 0x00, 0x00, 0x89, 0x9C, 0x00, 0x00, 0x9A, 0xFA, /* 0x64-0x67 */ + 0x8F, 0xA7, 0x9A, 0xFC, 0x92, 0x44, 0x00, 0x00, /* 0x68-0x6B */ + 0x9A, 0xFB, 0x00, 0x00, 0x95, 0xB1, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x97, /* 0x70-0x73 */ + 0x93, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9B, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8D, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9B, 0x41, 0x94, 0x40, 0x94, 0xDC, /* 0x80-0x83 */ + 0x96, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x44, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9B, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x57, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x64, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x96, 0xAD, 0x00, 0x00, 0x9B, 0xAA, /* 0x98-0x9B */ + 0x00, 0x00, 0x9B, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x45, /* 0xA0-0xA3 */ + 0xED, 0x88, 0x91, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x93, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x46, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x96, 0x85, 0xED, 0x89, 0x8D, 0xC8, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xA8, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x47, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8E, 0x6F, 0x00, 0x00, 0x8E, 0x6E, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x88, 0xB7, 0x8C, 0xC6, 0x00, 0x00, 0x90, 0xA9, /* 0xD0-0xD3 */ + 0x88, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9B, 0x4B, 0x9B, 0x4C, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9B, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x89, 0x57, 0x8A, 0xAD, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9B, 0x48, 0x00, 0x00, 0x96, 0xC3, 0x95, 0x50, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xA6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xF7, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x70, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x00, 0x00, 0x88, 0xD0, 0x00, 0x00, 0x88, 0xA1, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x9B, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x96, 0xBA, 0x00, 0x00, 0x9B, 0x52, 0x00, 0x00, /* 0x18-0x1B */ + 0x9B, 0x50, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x4E, /* 0x1C-0x1F */ + 0x90, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x9B, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x95, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xE2, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x9B, 0x56, 0x9B, 0x57, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8F, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0x53, 0x98, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x6B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x9B, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA5, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x58, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x77, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x59, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x7D, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x5A, 0x95, 0x51, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9B, 0x5B, 0x9B, 0x5F, 0x9B, 0x5C, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x89, 0xC5, 0x9B, 0x5E, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8E, 0xB9, 0x00, 0x00, 0x9B, 0x5D, /* 0xC8-0xCB */ + 0x8C, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9B, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x64, 0x9B, 0x61, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x92, 0x84, 0x00, 0x00, 0x9B, 0x60, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x62, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9B, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x65, 0x9B, 0x66, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8A, 0xF0, 0x00, 0x00, 0x9B, 0x68, /* 0x08-0x0B */ + 0x9B, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x69, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xEC, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6C, 0x00, 0x00, /* 0x28-0x2B */ + 0x92, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x89, 0x64, 0x00, 0x00, 0x9B, 0x6A, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6E, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0x71, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6F, /* 0x40-0x43 */ + 0x00, 0x00, 0x9B, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0x71, 0x9B, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8D, 0x45, 0x9B, 0x73, 0xED, 0x8A, 0x8E, 0x9A, /* 0x54-0x57 */ + 0x91, 0xB6, 0x00, 0x00, 0x9B, 0x74, 0x9B, 0x75, /* 0x58-0x5B */ + 0x8E, 0x79, 0x8D, 0x46, 0x00, 0x00, 0x96, 0xD0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x47, /* 0x60-0x63 */ + 0x8C, 0xC7, 0x9B, 0x76, 0x8A, 0x77, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x9B, 0x77, 0x00, 0x00, 0x91, 0xB7, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x9B, 0x78, 0x9B, 0xA1, 0x00, 0x00, 0x9B, 0x79, /* 0x70-0x73 */ + 0x00, 0x00, 0x9B, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9B, 0x7B, 0x00, 0x00, 0x9B, 0x7D, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9B, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x80, /* 0x80-0x83 */ + 0x00, 0x00, 0x91, 0xEE, 0x00, 0x00, 0x89, 0x46, /* 0x84-0x87 */ + 0x8E, 0xE7, 0x88, 0xC0, 0x00, 0x00, 0x91, 0x76, /* 0x88-0x8B */ + 0x8A, 0xAE, 0x8E, 0xB3, 0x00, 0x00, 0x8D, 0x47, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x93, 0x86, 0x00, 0x00, 0x8F, 0x40, /* 0x94-0x97 */ + 0x8A, 0xAF, 0x92, 0x88, 0x92, 0xE8, 0x88, 0xB6, /* 0x98-0x9B */ + 0x8B, 0x58, 0x95, 0xF3, 0x00, 0x00, 0x8E, 0xC0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x71, 0x90, 0xE9, /* 0xA0-0xA3 */ + 0x8E, 0xBA, 0x97, 0x47, 0x9B, 0x81, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x7B, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8D, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x51, /* 0xB0-0xB3 */ + 0x89, 0x83, 0x8F, 0xAA, 0x89, 0xC6, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9B, 0x82, 0x97, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x68, /* 0xBC-0xBF */ + 0xED, 0x8B, 0x00, 0x00, 0x8E, 0xE2, 0x9B, 0x83, /* 0xC0-0xC3 */ + 0x8A, 0xF1, 0x93, 0xD0, 0x96, 0xA7, 0x9B, 0x84, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9B, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x95, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9B, 0x87, 0x00, 0x00, 0x8A, 0xA6, 0x8B, 0xF5, /* 0xD0-0xD3 */ + 0x9B, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xED, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB0, /* 0xD8-0xDB */ + 0x00, 0x00, 0x90, 0x51, 0x9B, 0x8B, 0x8E, 0x40, /* 0xDC-0xDF */ + 0x00, 0x00, 0x89, 0xC7, 0x9B, 0x8A, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9B, 0x88, 0x9B, 0x8C, 0x9B, 0x89, 0x94, 0x4A, /* 0xE4-0xE7 */ + 0x9E, 0xCB, 0x90, 0x52, 0x00, 0x00, 0x9B, 0x8D, /* 0xE8-0xEB */ + 0xED, 0x8E, 0x00, 0x00, 0x97, 0xBE, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9B, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x90, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x92, 0x9E, 0x9B, 0x8F, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x90, 0xA1, 0x00, 0x00, 0x8E, 0x9B, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xCE, 0x8E, 0xF5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0x95, 0x95, 0x90, 0xEA, 0x00, 0x00, /* 0x00-0x03 */ + 0x8E, 0xCB, 0x9B, 0x91, 0x8F, 0xAB, 0x9B, 0x92, /* 0x04-0x07 */ + 0x9B, 0x93, 0x88, 0xD1, 0x91, 0xB8, 0x90, 0x71, /* 0x08-0x0B */ + 0x00, 0x00, 0x9B, 0x94, 0x93, 0xB1, 0x8F, 0xAC, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8F, 0xAD, 0x00, 0x00, 0x9B, 0x95, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xEB, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xAE, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x8F, 0x00, 0x00, /* 0x1C-0x1F */ + 0x9B, 0x96, 0x00, 0x00, 0x9B, 0x97, 0x00, 0x00, /* 0x20-0x23 */ + 0x96, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x9B, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8B, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9B, 0x99, 0x9B, 0x9A, 0x8E, 0xDA, 0x90, 0x4B, /* 0x38-0x3B */ + 0x93, 0xF2, 0x90, 0x73, 0x94, 0xF6, 0x94, 0x41, /* 0x3C-0x3F */ + 0x8B, 0xC7, 0x9B, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8B, 0x8F, 0x9B, 0x9C, 0x00, 0x00, /* 0x44-0x47 */ + 0x8B, 0xFC, 0x00, 0x00, 0x93, 0xCD, 0x89, 0xAE, /* 0x48-0x4B */ + 0x00, 0x00, 0x8E, 0x72, 0x9B, 0x9D, 0x9B, 0xA0, /* 0x4C-0x4F */ + 0x9B, 0x9F, 0x8B, 0xFB, 0x00, 0x00, 0x9B, 0x9E, /* 0x50-0x53 */ + 0x00, 0x00, 0x93, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xAE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x93, 0x6A, 0x8E, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x91, 0x77, 0x97, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0xA2, 0x00, 0x00, 0x9B, 0xA3, 0x93, 0xD4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8E, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xA5, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x9B, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x9B, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8A, 0xF2, 0x9B, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9B, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x89, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x90, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x91, 0x5A, 0x8A, 0xE2, 0x00, 0x00, 0x9B, 0xAB, /* 0xA8-0xAB */ + 0x96, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x91, 0xD0, 0x00, 0x00, 0x8A, 0x78, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xAD, 0x9B, 0xAF, /* 0xB4-0xB7 */ + 0x8A, 0xDD, 0x00, 0x00, 0xED, 0x91, 0x9B, 0xAC, /* 0xB8-0xBB */ + 0x9B, 0xAE, 0x00, 0x00, 0x9B, 0xB1, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9B, 0xB0, 0x00, 0x00, 0x9B, 0xB2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9B, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x93, 0xBB, 0x8B, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x89, 0xE3, 0x9B, 0xB4, 0x9B, 0xB9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9B, 0xB7, 0x00, 0x00, 0x95, 0xF5, /* 0xEC-0xEF */ + 0x95, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xED, 0x92, 0x93, 0x87, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xB6, 0x8F, 0x73, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9B, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x92, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xBA, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9B, 0xC1, 0x9B, 0xBB, 0x8A, 0x52, 0x9B, 0xBC, /* 0x14-0x17 */ + 0x9B, 0xC5, 0x9B, 0xC4, 0x9B, 0xC3, 0x9B, 0xBF, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xC2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0x93, /* 0x24-0x27 */ + 0x00, 0x00, 0x95, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x96, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xC9, /* 0x48-0x4B */ + 0x9B, 0xC6, 0x00, 0x00, 0x9B, 0xC8, 0x00, 0x00, /* 0x4C-0x4F */ + 0x97, 0x92, 0x00, 0x00, 0x9B, 0xC7, 0xED, 0x94, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9B, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x90, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0xCA, 0xED, 0x97, 0x00, 0x00, 0x8D, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCB, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCC, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCF, 0x00, 0x00, /* 0x80-0x83 */ + 0x9B, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x88, /* 0x88-0x8B */ + 0x9B, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9B, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x9B, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xD0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x9B, 0xD2, 0x00, 0x00, 0x9B, 0xD3, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xD6, /* 0xB4-0xB7 */ + 0xED, 0x98, 0xED, 0x99, 0x97, 0xE4, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9B, 0xD7, 0x9B, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9B, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8A, 0xDE, 0x9B, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xED, 0x9A, 0x00, 0x00, 0x9B, 0xDB, 0x9B, 0xDA, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDD, /* 0xD8-0xDB */ + 0x00, 0x00, 0x90, 0xEC, 0x8F, 0x42, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8F, 0x84, 0x00, 0x00, 0x91, 0x83, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x8D, 0x48, 0x8D, 0xB6, 0x8D, 0x49, /* 0xE4-0xE7 */ + 0x8B, 0x90, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDE, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xB7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x8C, 0xC8, 0x9B, 0xDF, 0x96, 0xA4, /* 0xF0-0xF3 */ + 0x94, 0x62, 0x9B, 0xE0, 0x00, 0x00, 0x8D, 0x4A, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xAA, /* 0xF8-0xFB */ + 0x00, 0x00, 0x92, 0x46, 0x8B, 0xD0, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x73, 0x95, 0x7A, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xBF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE1, /* 0x08-0x0B */ + 0x8A, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x9F, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9B, 0xE3, 0x9B, 0xE2, 0x9B, 0xE5, /* 0x18-0x1B */ + 0x00, 0x00, 0x92, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x90, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x74, /* 0x28-0x2B */ + 0x00, 0x00, 0x90, 0xC8, 0x00, 0x00, 0x91, 0xD1, /* 0x2C-0x2F */ + 0x8B, 0x41, 0x00, 0x00, 0x00, 0x00, 0x92, 0xA0, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE6, 0x9B, 0xE7, /* 0x34-0x37 */ + 0x8F, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x96, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE9, /* 0x40-0x43 */ + 0x9B, 0xE8, 0x95, 0x9D, 0x00, 0x00, 0x9B, 0xF1, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x96, 0x79, 0x00, 0x00, 0x9B, 0xEB, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x9B, 0xED, 0x96, 0x8B, 0x00, 0x00, 0x9B, 0xEC, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xEE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x94, 0xA6, 0x9B, 0xEF, 0x95, 0xBC, /* 0x60-0x63 */ + 0x9B, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB1, 0x95, 0xBD, /* 0x70-0x73 */ + 0x94, 0x4E, 0x9B, 0xF2, 0x9B, 0xF3, 0x00, 0x00, /* 0x74-0x77 */ + 0x8D, 0x4B, 0x8A, 0xB2, 0x9B, 0xF4, 0x8C, 0xB6, /* 0x78-0x7B */ + 0x97, 0x63, 0x97, 0x48, 0x8A, 0xF4, 0x9B, 0xF6, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x92, 0xA1, 0x00, 0x00, 0x8D, 0x4C, /* 0x80-0x83 */ + 0x8F, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x94, 0xDD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xB0, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x98, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x92, 0xEA, 0x95, 0xF7, 0x93, 0x58, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x4D, 0x00, 0x00, /* 0x98-0x9B */ + 0x95, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9B, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x78, 0x8D, 0xC0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xC9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x92, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x88, 0xC1, 0x8F, 0x8E, 0x8D, 0x4E, /* 0xB4-0xB7 */ + 0x97, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9B, 0xF8, 0x9B, 0xF9, 0x94, 0x70, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x9B, 0xFA, 0x97, 0xF5, 0x98, 0x4C, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xFC, /* 0xCC-0xCF */ + 0x9B, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x66, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x40, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x43, 0x9C, 0x44, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9C, 0x42, 0x00, 0x00, 0x95, 0x5F, /* 0xDC-0xDF */ + 0x8F, 0xB1, 0x9C, 0x46, 0x9C, 0x45, 0x9C, 0x41, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9C, 0x47, 0x9C, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9C, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9C, 0x4C, 0x9C, 0x4A, 0x00, 0x00, 0x9C, 0x4B, /* 0xF0-0xF3 */ + 0x9C, 0x4D, 0x00, 0x00, 0x89, 0x84, 0x92, 0xEC, /* 0xF4-0xF7 */ + 0x9C, 0x4E, 0x00, 0x00, 0x8C, 0x9A, 0x89, 0xF4, /* 0xF8-0xFB */ + 0x94, 0x55, 0x00, 0x00, 0x9C, 0x4F, 0x93, 0xF9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0x95, 0xD9, 0x00, 0x00, 0x9C, 0x50, /* 0x00-0x03 */ + 0x98, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x9C, 0x51, 0x95, 0xBE, 0x9C, 0x54, /* 0x08-0x0B */ + 0x98, 0x9F, 0x98, 0xAF, 0x00, 0x00, 0x8E, 0xAE, /* 0x0C-0x0F */ + 0x93, 0xF3, 0x9C, 0x55, 0x00, 0x00, 0x8B, 0x7C, /* 0x10-0x13 */ + 0x92, 0xA2, 0x88, 0xF8, 0x9C, 0x56, 0x95, 0xA4, /* 0x14-0x17 */ + 0x8D, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6F, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xED, /* 0x1C-0x1F */ + 0x00, 0x00, 0xED, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x96, 0xED, 0x8C, 0xB7, 0x8C, 0xCA, /* 0x24-0x27 */ + 0x00, 0x00, 0x9C, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x9C, 0x58, 0x00, 0x00, 0x9C, 0x5E, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xED, 0x9C, 0x92, 0xA3, 0x00, 0x00, 0x8B, 0xAD, /* 0x34-0x37 */ + 0x9C, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x95, 0x4A, 0x00, 0x00, 0x92, 0x65, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9C, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xED, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x9C, 0x5B, 0x00, 0x00, 0x8B, 0xAE, 0x00, 0x00, /* 0x48-0x4B */ + 0x9C, 0x5C, 0x00, 0x00, 0x9C, 0x5D, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x9C, 0x5F, 0x00, 0x00, 0x93, 0x96, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x60, 0x9C, 0x61, /* 0x54-0x57 */ + 0x00, 0x00, 0x9C, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9C, 0x53, 0x9C, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9C, 0x63, 0x8C, 0x60, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x46, 0xED, 0x9D, /* 0x64-0x67 */ + 0x00, 0x00, 0x8D, 0xCA, 0x95, 0x56, 0x92, 0xA4, /* 0x68-0x6B */ + 0x95, 0x6A, 0x9C, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8F, 0xB2, 0x89, 0x65, 0x00, 0x00, 0x9C, 0x65, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x66, /* 0x74-0x77 */ + 0x00, 0x00, 0x96, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x94, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x69, /* 0x7C-0x7F */ + + 0x89, 0x9D, 0x90, 0xAA, 0x9C, 0x68, 0x9C, 0x67, /* 0x80-0x83 */ + 0x8C, 0x61, 0x91, 0xD2, 0x00, 0x00, 0x9C, 0x6D, /* 0x84-0x87 */ + 0x9C, 0x6B, 0x00, 0x00, 0x9C, 0x6A, 0x97, 0xA5, /* 0x88-0x8B */ + 0x8C, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x99, 0x9C, 0x6C, 0x93, 0x6B, 0x8F, 0x5D, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBE, /* 0x94-0x97 */ + 0x9C, 0x70, 0x9C, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x6E, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9C, 0x71, 0x8C, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x9C, 0x72, 0x95, 0x9C, 0x8F, 0x7A, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x9C, 0x73, 0x94, 0xF7, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBF, /* 0xB0-0xB3 */ + 0x92, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xED, 0x9E, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x93, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9C, 0x74, 0x8B, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x53, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x95, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8A, 0xF5, 0x94, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x75, 0x8E, 0x75, /* 0xD4-0xD7 */ + 0x96, 0x59, 0x96, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x89, 0x9E, 0x9C, 0x7A, 0xED, 0x9F, 0x00, 0x00, /* 0xDC-0xDF */ + 0x92, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9C, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xF5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9C, 0xAB, 0x9C, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x94, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x9C, 0x78, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x76, /* 0xF8-0xFB */ + 0x00, 0x00, 0x8D, 0x9A, 0x00, 0x00, 0x9C, 0x7C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x83, 0x9C, 0x89, /* 0x0C-0x0F */ + 0x9C, 0x81, 0x00, 0x00, 0x93, 0x7B, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x9C, 0x86, 0x95, 0x7C, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9C, 0x80, 0x00, 0x00, 0x9C, 0x85, /* 0x18-0x1B */ + 0x97, 0xE5, 0x8E, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0xD3, 0x9C, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x8B, 0x7D, 0x9C, 0x88, 0x90, 0xAB, /* 0x24-0x27 */ + 0x89, 0x85, 0x9C, 0x82, 0x89, 0xF6, 0x9C, 0x87, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xAF, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9C, 0x84, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x8A, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9C, 0x8C, 0x9C, 0x96, 0x9C, 0x94, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x91, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x90, 0x97, 0xF6, /* 0x48-0x4B */ + 0x00, 0x00, 0x9C, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0xB0, 0x00, 0x00, 0x8D, 0x50, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x8F, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9C, 0x99, 0x9C, 0x8B, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xED, 0xA0, 0x00, 0x00, 0x9C, 0x8F, /* 0x5C-0x5F */ + 0x9C, 0x7E, 0x00, 0x00, 0x89, 0xF8, 0x9C, 0x93, /* 0x60-0x63 */ + 0x9C, 0x95, 0x92, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8D, 0xA6, 0x89, 0xB6, 0x9C, 0x8D, 0x9C, 0x98, /* 0x68-0x6B */ + 0x9C, 0x97, 0x8B, 0xB1, 0x00, 0x00, 0x91, 0xA7, /* 0x6C-0x6F */ + 0x8A, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8C, 0x62, 0x00, 0x00, 0x9C, 0x8E, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9C, 0x9A, 0x00, 0x00, 0x9C, 0x9D, /* 0x80-0x83 */ + 0x9C, 0x9F, 0xED, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x8E, 0xBB, 0xED, 0xA2, 0x9C, 0xA5, /* 0x88-0x8B */ + 0x92, 0xEE, 0x9C, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xA3, 0x00, 0x00, /* 0x90-0x93 */ + 0x89, 0xF7, 0x00, 0x00, 0x9C, 0xA1, 0x9C, 0xA2, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x9E, 0x9C, 0xA0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xE5, /* 0x9C-0x9F */ + 0x97, 0x49, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB3, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x78, 0x9C, 0xA4, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x94, 0x59, 0x88, 0xAB, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xDF, 0x9C, 0x7B, /* 0xB0-0xB3 */ + 0x9C, 0xAA, 0x9C, 0xAE, 0x96, 0xE3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9C, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x93, 0x89, 0x9C, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8F, 0xEE, 0x9C, 0xAD, 0x93, 0xD5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x98, 0x66, 0x00, 0x00, 0x9C, 0xA9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xED, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9C, 0xAF, 0x00, 0x00, 0x8D, 0x9B, 0x00, 0x00, /* 0xD8-0xDB */ + 0x90, 0xC9, 0x00, 0x00, 0xED, 0xA3, 0x88, 0xD2, /* 0xDC-0xDF */ + 0x9C, 0xA8, 0x9C, 0xA6, 0x00, 0x00, 0x91, 0x79, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x9C, /* 0xE4-0xE7 */ + 0x8E, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x91, 0xC4, 0x9C, 0xBB, 0xED, 0xA6, 0x91, 0x7A, /* 0xF0-0xF3 */ + 0x9C, 0xB6, 0x00, 0x00, 0x9C, 0xB3, 0x9C, 0xB4, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x8E, 0xE4, 0x9C, 0xB7, 0x9C, 0xBA, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_61[512] = { + 0x9C, 0xB5, 0x8F, 0x44, 0x00, 0x00, 0x9C, 0xB8, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xB2, 0x00, 0x00, /* 0x04-0x07 */ + 0x96, 0xFA, 0x96, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x9C, 0xBC, 0x9C, 0xBD, 0x88, 0xD3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xED, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x9C, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xF0, 0x88, 0xA4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB4, /* 0x1C-0x1F */ + 0xED, 0xA5, 0x9C, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xC1, /* 0x24-0x27 */ + 0x9C, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x9C, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xED, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9C, 0xC6, 0x00, 0x00, 0x00, 0x00, 0xED, 0xA8, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x9C, 0xC4, 0x9C, 0xC7, 0x9C, 0xBF, 0x9C, 0xC3, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xC8, 0x00, 0x00, /* 0x40-0x43 */ + 0x9C, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xBE, /* 0x44-0x47 */ + 0x8E, 0x9C, 0x00, 0x00, 0x9C, 0xC2, 0x91, 0xD4, /* 0x48-0x4B */ + 0x8D, 0x51, 0x9C, 0xB0, 0x90, 0x54, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xD6, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9C, 0xD5, 0x00, 0x00, 0x9C, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x9D, 0x8A, 0xB5, /* 0x60-0x63 */ + 0x00, 0x00, 0x9C, 0xD2, 0x00, 0x00, 0x8C, 0x64, /* 0x64-0x67 */ + 0x8A, 0x53, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xCF, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xB6, 0x9C, 0xD1, /* 0x6C-0x6F */ + 0x88, 0xD4, 0x9C, 0xD3, 0x00, 0x00, 0x9C, 0xCA, /* 0x70-0x73 */ + 0x9C, 0xD0, 0x9C, 0xD7, 0x8C, 0x63, 0x9C, 0xCB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x7C, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x97, 0x4A, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xDA, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xDE, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x9E, 0x00, 0x00, /* 0x8C-0x8F */ + 0x97, 0xF7, 0x9C, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9C, 0xDC, 0x00, 0x00, 0x9C, 0xD9, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xAA, 0x9C, 0xD8, 0x9C, 0xDD, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x95, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x93, 0xB2, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8C, 0x65, 0x00, 0x00, 0x9C, 0xE0, /* 0xA8-0xAB */ + 0x9C, 0xDB, 0x00, 0x00, 0x9C, 0xE1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x9B, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xAF, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE7, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE8, 0x8D, 0xA7, /* 0xC4-0xC7 */ + 0x9C, 0xE6, 0x9C, 0xE4, 0x9C, 0xE3, 0x9C, 0xEA, /* 0xC8-0xCB */ + 0x9C, 0xE2, 0x9C, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x89, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xEE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xED, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x9C, 0xF1, 0x00, 0x00, 0x9C, 0xEF, 0x9C, 0xE5, /* 0xF4-0xF7 */ + 0x8C, 0x9C, 0x00, 0x00, 0x9C, 0xF0, 0x00, 0x00, /* 0xF8-0xFB */ + 0x9C, 0xF4, 0x9C, 0xF3, 0x9C, 0xF5, 0x9C, 0xF2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0x9C, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9C, 0xF7, 0x9C, 0xF8, 0x95, 0xE8, 0x00, 0x00, /* 0x08-0x0B */ + 0x9C, 0xFA, 0x9C, 0xF9, 0x8F, 0x5E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x90, 0xAC, 0x89, 0xE4, 0x89, 0xFA, 0xED, 0xAB, /* 0x10-0x13 */ + 0x9C, 0xFB, 0x00, 0x00, 0x88, 0xBD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xCA, 0x9C, 0xFC, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0xC1, 0x9D, 0x40, 0x8C, 0x81, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9D, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xED, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x42, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x43, 0x8B, 0x59, /* 0x2C-0x2F */ + 0x9D, 0x44, 0x00, 0x00, 0x9D, 0x45, 0x9D, 0x46, /* 0x30-0x33 */ + 0x91, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x8C, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x96, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x5B, /* 0x3C-0x3F */ + 0x8F, 0x8A, 0x9D, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xEE, /* 0x44-0x47 */ + 0xE7, 0xBB, 0x94, 0xE0, 0x00, 0x00, 0x8E, 0xE8, /* 0x48-0x4B */ + 0x00, 0x00, 0x8D, 0xCB, 0x9D, 0x48, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xC5, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x91, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x4B, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x49, 0x00, 0x00, /* 0x5C-0x5F */ + 0x9D, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x4A, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x9D, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xAF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x88, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x7D, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x94, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x9D, 0x4E, 0x00, 0x00, 0x9D, 0x51, 0x8F, 0xB3, /* 0x7C-0x7F */ + + 0x8B, 0x5A, 0x00, 0x00, 0x9D, 0x4F, 0x9D, 0x56, /* 0x80-0x83 */ + 0x8F, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x9D, 0x50, 0x94, 0x63, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x97, 0x7D, 0x9D, 0x52, 0x9D, 0x53, /* 0x90-0x93 */ + 0x9D, 0x57, 0x93, 0x8A, 0x9D, 0x54, 0x8D, 0x52, /* 0x94-0x97 */ + 0x90, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x65, /* 0x98-0x9B */ + 0x94, 0xB2, 0x00, 0x00, 0x91, 0xF0, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xAC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xE2, /* 0xA8-0xAB */ + 0x9D, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x95, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x92, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x96, 0x95, 0x00, 0x00, 0x9D, 0x5A, /* 0xB8-0xBB */ + 0x89, 0x9F, 0x92, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x63, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x92, 0x53, 0x9D, 0x5D, 0x9D, 0x64, /* 0xC4-0xC7 */ + 0x9D, 0x5F, 0x9D, 0x66, 0x9D, 0x62, 0x00, 0x00, /* 0xC8-0xCB */ + 0x9D, 0x61, 0x94, 0x8F, 0x00, 0x00, 0x9D, 0x5B, /* 0xCC-0xCF */ + 0x89, 0xFB, 0x9D, 0x59, 0x8B, 0x91, 0x91, 0xF1, /* 0xD0-0xD3 */ + 0x9D, 0x55, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x58, /* 0xD4-0xD7 */ + 0x8D, 0x53, 0x90, 0xD9, 0x00, 0x00, 0x8F, 0xB5, /* 0xD8-0xDB */ + 0x9D, 0x60, 0x94, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8B, 0x92, 0x8A, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8A, 0x87, 0x90, 0x40, 0x9D, 0x68, 0x9D, 0x6D, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0x69, 0x00, 0x00, 0x8C, 0x9D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x9D, 0x6E, 0x8E, 0x41, 0x8D, 0x89, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x45, 0x9D, 0x5C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x00, 0x00, 0x8E, 0x9D, 0x9D, 0x6B, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x77, /* 0x04-0x07 */ + 0x9D, 0x6C, 0x88, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x9D, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x92, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x8B, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x6A, /* 0x24-0x27 */ + 0x88, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xC1, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x55, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xF0, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x94, 0xD2, 0x9D, 0x70, 0x91, 0x7D, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x91, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8E, 0x4A, 0x9D, 0x71, 0x00, 0x00, 0x9D, 0x73, /* 0x4C-0x4F */ + 0x9D, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xDF, 0x00, 0x00, 0x92, 0xBB, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x91, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xF9, /* 0x64-0x67 */ + 0x8E, 0xCC, 0x9D, 0x80, 0x00, 0x00, 0x9D, 0x7E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x98, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x9E, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x78, 0x8F, 0xB7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xE6, 0x94, 0x50, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9D, 0x76, 0x00, 0x00, 0x00, 0x00, 0x91, 0x7C, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x8E, 0xF6, 0x9D, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8F, 0xB6, 0x00, 0x00, 0x9D, 0x75, 0x9D, 0x7A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x72, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x74, 0x00, 0x00, /* 0x94-0x97 */ + 0x8C, 0x40, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x7C, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x7C, /* 0x9C-0x9F */ + 0x97, 0xA9, 0x8D, 0xCC, 0x92, 0x54, 0x9D, 0x79, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x90, 0xDA, 0x00, 0x00, 0x8D, 0x54, /* 0xA4-0xA7 */ + 0x90, 0x84, 0x89, 0x86, 0x91, 0x5B, 0x9D, 0x77, /* 0xA8-0xAB */ + 0x8B, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x66, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xCD, 0x9D, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x7E, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x81, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9D, 0x83, 0x00, 0x00, 0x00, 0x00, 0x91, 0xB5, /* 0xC0-0xC3 */ + 0x9D, 0x89, 0x00, 0x00, 0x9D, 0x84, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9D, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x60, /* 0xCC-0xCF */ + 0x92, 0xF1, 0x00, 0x00, 0x9D, 0x87, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x4B, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x67, 0x8A, 0xB7, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x88, 0xAC, 0x00, 0x00, 0x9D, 0x85, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9D, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xF6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x89, 0x87, 0xED, 0xAD, 0x9D, 0x88, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x68, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8C, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x91, 0xB9, 0x00, 0x00, 0x9D, 0x93, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8D, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8A, 0x9D, 0x91, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8E, 0x00, 0x00, /* 0x24-0x27 */ + 0x9D, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x94, 0xC0, 0x93, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0x8B, 0x00, 0x00, 0x9D, 0x8F, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x67, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xEF, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xDB, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x97, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x93, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xED, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x94, /* 0x64-0x67 */ + 0x00, 0x00, 0x96, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x95, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x96, 0x00, 0x00, /* 0x74-0x77 */ + 0x96, 0xCC, 0x00, 0x00, 0x90, 0xA0, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x82, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x9D, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x54, 0x9D, 0x9A, /* 0x90-0x93 */ + 0x00, 0x00, 0x9D, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x51, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xED, 0xAF, 0x93, 0xB3, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x93, 0x50, 0x9D, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x9D, 0x9C, 0x00, 0x00, 0x95, 0x8F, /* 0xA8-0xAB */ + 0x00, 0x00, 0x94, 0x64, 0x8E, 0x42, 0x00, 0x00, /* 0xAC-0xAF */ + 0x90, 0xEF, 0x00, 0x00, 0x96, 0x6F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x8A, 0x68, 0x00, 0x00, 0x9D, 0xA3, /* 0xB8-0xBB */ + 0x9D, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x97, 0x69, 0x9D, 0xA5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9D, 0xA1, 0x00, 0x00, 0x9D, 0xA2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x91, 0x80, 0xED, 0xB0, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xA0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9D, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9D, 0xA4, 0x00, 0x00, 0x9D, 0x9F, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9D, 0xA9, 0x9D, 0xAA, 0x93, 0x46, 0x9D, 0xAC, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x43, 0x9D, 0xA7, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8B, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xAD, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0xA6, 0x9D, 0xB1, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x9D, 0xB0, 0x00, 0x00, 0x9D, 0xAF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9D, 0xB4, 0x8F, 0xEF, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0x9D, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x9D, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x9D, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0xB6, 0x9D, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xB9, /* 0x20-0x23 */ + 0x9D, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x98, 0x9D, 0xBA, /* 0x28-0x2B */ + 0x9D, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x78, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBE, 0x9D, 0xBD, /* 0x34-0x37 */ + 0x9D, 0xBF, 0x89, 0xFC, 0x00, 0x00, 0x8D, 0x55, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xFA, 0x90, 0xAD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8C, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x9D, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9D, 0xC4, 0xED, 0xB1, 0x95, 0x71, /* 0x4C-0x4F */ + 0x00, 0x00, 0x8B, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x9D, 0xC3, 0x9D, 0xC2, 0x94, 0x73, /* 0x54-0x57 */ + 0x9D, 0xC5, 0x8B, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9D, 0xC7, 0x9D, 0xC6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB8, 0x8E, 0x55, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xD6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x8C, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x90, 0x94, 0x00, 0x00, 0x9D, 0xC8, 0x00, 0x00, /* 0x70-0x73 */ + 0x90, 0xAE, 0x93, 0x47, 0x00, 0x00, 0x95, 0x7E, /* 0x74-0x77 */ + 0x9D, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCA, 0x9D, 0xCB, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xB6, /* 0x84-0x87 */ + 0x9B, 0x7C, 0x90, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x95, 0x6B, 0x00, 0x00, 0x8D, 0xD6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x94, 0xE3, 0x94, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x6C, /* 0x94-0x97 */ + 0x00, 0x00, 0x97, 0xBF, 0x00, 0x00, 0x9D, 0xCD, /* 0x98-0x9B */ + 0x8E, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCE, /* 0x9C-0x9F */ + 0x00, 0x00, 0x88, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8B, 0xD2, 0x90, 0xCB, 0x00, 0x00, 0x95, 0x80, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCF, /* 0xA8-0xAB */ + 0x8E, 0x61, 0x92, 0x66, 0x00, 0x00, 0x8E, 0x7A, /* 0xAC-0xAF */ + 0x90, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xD0, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x95, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x89, 0x97, 0x8E, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9D, 0xD3, 0x00, 0x00, 0x9D, 0xD1, /* 0xC0-0xC3 */ + 0x9D, 0xD4, 0x97, 0xB7, 0x9D, 0xD2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF9, /* 0xC8-0xCB */ + 0x9D, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x91, 0xB0, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xD6, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xF8, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9D, 0xD8, 0x00, 0x00, 0x9D, 0xD7, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9D, 0xD9, 0x9D, 0xDA, 0x8A, 0xF9, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0xFA, 0x92, 0x55, 0x8B, 0x8C, /* 0xE4-0xE7 */ + 0x8E, 0x7C, 0x91, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8F, 0x7B, 0x88, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA0, 0x9D, 0xDF, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_66[512] = { + 0xED, 0xB2, 0x00, 0x00, 0x8D, 0x56, 0x9D, 0xDE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA9, 0x8F, 0xB8, /* 0x04-0x07 */ + 0x00, 0x00, 0xED, 0xB5, 0x9D, 0xDD, 0x00, 0x00, /* 0x08-0x0B */ + 0x8F, 0xB9, 0x00, 0x00, 0x96, 0xBE, 0x8D, 0xA8, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xD5, /* 0x10-0x13 */ + 0x90, 0xCC, 0xED, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0xE4, 0x00, 0x00, 0xED, 0xB7, 0x90, 0xAF, /* 0x1C-0x1F */ + 0x89, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xED, 0xB8, 0x8F, 0x74, 0x00, 0x00, 0x96, 0x86, /* 0x24-0x27 */ + 0x8D, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8F, 0xBA, 0xED, 0xB6, 0x90, 0xA5, /* 0x2C-0x2F */ + 0x00, 0x00, 0xED, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0xE3, 0x9D, 0xE1, 0x9D, 0xE2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB4, /* 0x38-0x3B */ + 0x92, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x45, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9D, 0xE8, 0x8E, 0x9E, 0x8D, 0x57, /* 0x40-0x43 */ + 0x9D, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x9D, 0xE7, 0x00, 0x00, 0x90, 0x57, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xE5, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4E, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBA, /* 0x54-0x57 */ + 0x00, 0x00, 0xED, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9D, 0xEA, 0x9D, 0xE9, 0x9D, 0xEE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xEF, 0x00, 0x00, /* 0x60-0x63 */ + 0x9D, 0xEB, 0xED, 0xB9, 0x8A, 0x41, 0x9D, 0xEC, /* 0x64-0x67 */ + 0x9D, 0xED, 0x94, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x81, 0x8C, 0x69, /* 0x6C-0x6F */ + 0x9D, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBD, /* 0x70-0x73 */ + 0x90, 0xB0, 0x00, 0x00, 0x8F, 0xBB, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x71, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8B, 0xC5, 0x00, 0x00, 0x9D, 0xF1, /* 0x80-0x83 */ + 0x9D, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC9, /* 0x84-0x87 */ + 0x9D, 0xF2, 0x9D, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xF3, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8F, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x67, 0x88, 0xC3, /* 0x94-0x97 */ + 0x9D, 0xF6, 0xED, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x9D, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xED, 0xBF, 0x00, 0x00, 0x92, 0xA8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xEF, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x62, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xE9, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xC0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x96, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9E, 0x41, 0x9D, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9D, 0xFC, 0x00, 0x00, 0x9D, 0xFB, 0xED, 0xC1, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9D, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9E, 0x40, 0x00, 0x00, 0x00, 0x00, 0x93, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9D, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x42, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8F, 0x8C, 0x9E, 0x43, 0x00, 0x00, /* 0xD8-0xDB */ + 0x97, 0x6A, 0x94, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9E, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x46, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9E, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9E, 0x48, 0x00, 0x00, 0x8B, 0xC8, 0x89, 0x67, /* 0xF0-0xF3 */ + 0x8D, 0x58, 0x9E, 0x49, 0x00, 0x00, 0x9E, 0x4A, /* 0xF4-0xF7 */ + 0x8F, 0x91, 0x91, 0x82, 0xED, 0xC2, 0xED, 0x4A, /* 0xF8-0xFB */ + 0x99, 0xD6, 0x91, 0x5D, 0x91, 0x5C, 0x91, 0xD6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0x8D, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xF0, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8C, 0x8E, 0x97, 0x4C, 0x00, 0x00, 0x95, 0xFC, /* 0x08-0x0B */ + 0x00, 0x00, 0x95, 0x9E, 0xED, 0xC3, 0x9E, 0x4B, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8D, 0xF1, 0x92, 0xBD, 0x9E, 0x4C, 0x98, 0x4E, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x5D, /* 0x18-0x1B */ + 0x00, 0x00, 0x92, 0xA9, 0x9E, 0x4D, 0x8A, 0xFA, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x4E, 0x9E, 0x4F, /* 0x24-0x27 */ + 0x96, 0xD8, 0x00, 0x00, 0x96, 0xA2, 0x96, 0x96, /* 0x28-0x2B */ + 0x96, 0x7B, 0x8E, 0x44, 0x9E, 0x51, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x96, 0x70, 0x00, 0x00, 0x9E, 0x53, 0x9E, 0x56, /* 0x34-0x37 */ + 0x9E, 0x55, 0x00, 0x00, 0x8A, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x8B, 0x80, 0x00, 0x00, 0x9E, 0x52, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9E, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x57, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x90, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x9B, 0x88, 0xC7, /* 0x4C-0x4F */ + 0x8D, 0xDE, 0x91, 0xBA, 0x00, 0x00, 0x8E, 0xDB, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF1, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x93, 0x6D, 0x00, 0x00, 0x9E, 0x58, 0x91, 0xA9, /* 0x5C-0x5F */ + 0x9E, 0x59, 0x8F, 0xF0, 0x96, 0xDB, 0x9E, 0x5B, /* 0x60-0x63 */ + 0x9E, 0x5C, 0x97, 0x88, 0xED, 0xC5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x61, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x8D, 0x59, 0x00, 0x00, 0x94, 0x74, /* 0x6C-0x6F */ + 0x9E, 0x5E, 0x93, 0x8C, 0x9D, 0xDC, 0x9D, 0xE0, /* 0x70-0x73 */ + 0x00, 0x00, 0x8B, 0x6E, 0x00, 0x00, 0x94, 0x66, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x9E, 0x60, 0x00, 0x00, 0x8F, 0xBC, 0x94, 0xC2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x9E, 0x66, 0x00, 0x00, 0x94, 0xF8, /* 0x84-0x87 */ + 0x00, 0x00, 0x9E, 0x5D, 0x00, 0x00, 0x9E, 0x63, /* 0x88-0x8B */ + 0x9E, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x90, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x96, 0x8D, 0x00, 0x00, 0x97, 0xD1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x87, 0x00, 0x00, /* 0x98-0x9B */ + 0x89, 0xCA, 0x8E, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x98, 0x67, 0x9E, 0x65, 0x90, 0x95, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x64, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x9E, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xCD, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x6B, /* 0xB0-0xB3 */ + 0x9E, 0x69, 0x00, 0x00, 0x89, 0xCB, 0x9E, 0x67, /* 0xB4-0xB7 */ + 0x9E, 0x6D, 0x9E, 0x73, 0x00, 0x00, 0xED, 0xC6, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0xC8, 0x91, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x95, 0xBF, 0x00, 0x00, 0x9E, 0x75, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x41, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x74, 0x94, 0x90, /* 0xCC-0xCF */ + 0x96, 0x5E, 0x8A, 0xB9, 0x00, 0x00, 0x90, 0xF5, /* 0xD0-0xD3 */ + 0x8F, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x92, 0xD1, 0x00, 0x00, 0x97, 0x4D, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9E, 0x70, 0x9E, 0x6F, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x71, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9E, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x76, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9E, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9E, 0x6A, 0x00, 0x00, 0x9E, 0x72, 0x9E, 0x68, /* 0xEC-0xEF */ + 0x00, 0x00, 0x92, 0x8C, 0x00, 0x00, 0x96, 0xF6, /* 0xF0-0xF3 */ + 0x8E, 0xC4, 0x8D, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xB8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x8F, 0x8A, 0x60, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0xED, 0xC9, 0x92, 0xCC, 0x93, 0xC8, /* 0x00-0x03 */ + 0x89, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF0, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB2, 0x8C, 0x49, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x78, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x8D, 0x5A, 0x8A, 0x9C, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x9E, 0x7A, 0x8A, 0x94, 0x9E, 0x81, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x7D, 0x00, 0x00, /* 0x30-0x33 */ + 0x90, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x8A, 0x6A, 0x8D, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8A, 0x69, 0x8D, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9E, 0x7B, 0x8C, 0x85, 0x8C, 0x6A, 0x93, 0x8D, /* 0x40-0x43 */ + 0xED, 0xCA, 0x00, 0x00, 0x9E, 0x79, 0x00, 0x00, /* 0x44-0x47 */ + 0x88, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9E, 0x7C, 0x9E, 0x7E, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0xCB, 0x8C, 0x4B, 0xED, 0xC7, 0x8A, 0xBA, /* 0x50-0x53 */ + 0x8B, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x8D, 0xF7, 0x96, 0x91, 0x00, 0x00, 0x8E, 0x56, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x83, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x4F, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x9E, 0x8F, 0x00, 0x00, 0x89, 0xB1, 0x9E, 0x84, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x95, 0x9E, 0x85, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x97, 0xC0, 0x00, 0x00, 0x9E, 0x8C, /* 0x80-0x83 */ + 0x00, 0x00, 0x94, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9E, 0x94, 0x00, 0x00, 0x9E, 0x87, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xB2, /* 0x90-0x93 */ + 0x9E, 0x89, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x5B, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x8B, /* 0x98-0x9B */ + 0x00, 0x00, 0x9E, 0x8A, 0x00, 0x00, 0x9E, 0x86, /* 0x9C-0x9F */ + 0x9E, 0x91, 0x00, 0x00, 0x8F, 0xBD, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xEB, 0x8C, 0xE6, /* 0xA4-0xA7 */ + 0x97, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x9E, 0x88, 0x00, 0x00, 0x92, 0xF2, /* 0xAC-0xAF */ + 0x8A, 0x42, 0x8D, 0xAB, 0x00, 0x00, 0x9E, 0x80, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x9E, 0x90, 0x8A, 0x81, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x9E, 0x8E, 0x9E, 0x92, 0x00, 0x00, /* 0xB8-0xBB */ + 0x93, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x8A, 0xFC, 0x00, 0x00, 0x9E, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xED, 0x48, 0x96, 0xC7, 0x9E, 0x97, 0x8A, 0xFB, /* 0xC8-0xCB */ + 0x00, 0x00, 0x9E, 0x9E, 0x00, 0x00, 0xED, 0xCB, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x5F, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9E, 0x9F, 0x9E, 0xA1, 0x00, 0x00, 0x9E, 0xA5, /* 0xD4-0xD7 */ + 0x9E, 0x99, 0x00, 0x00, 0x92, 0x49, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x8F, /* 0xDC-0xDF */ + 0x9E, 0xA9, 0x9E, 0x9C, 0x00, 0x00, 0x9E, 0xA6, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xA0, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x58, 0x9E, 0xAA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB1, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9E, 0xA8, 0x8A, 0xBB, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_69[512] = { + 0x98, 0x6F, 0x9E, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x9E, 0xA4, 0x88, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9E, 0x98, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB8, /* 0x08-0x0B */ + 0x9E, 0x9D, 0x90, 0x41, 0x92, 0xC5, 0x9E, 0x93, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xA3, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x90, 0x9A, 0x9E, 0xAD, 0x8A, 0x91, /* 0x18-0x1B */ + 0x8C, 0x9F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9E, 0xAF, 0x9E, 0x9A, 0x9E, 0xAE, /* 0x20-0x23 */ + 0x00, 0x00, 0x9E, 0xA7, 0x9E, 0x9B, 0x00, 0x00, /* 0x24-0x27 */ + 0x9E, 0xAB, 0x00, 0x00, 0x9E, 0xAC, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x9E, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x93, 0xCC, 0x00, 0x00, 0x9E, 0xA2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x9E, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x9E, 0xBB, 0x00, 0x00, 0x92, 0xD6, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x6B, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x96, /* 0x50-0x53 */ + 0x9E, 0xB6, 0x91, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0xBC, 0x91, 0x5E, 0x00, 0x00, /* 0x58-0x5B */ + 0x9E, 0xB3, 0x9E, 0xC0, 0x9E, 0xBF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x93, 0xED, 0x9E, 0xBE, 0x93, 0xE8, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xED, 0xCD, 0x00, 0x00, 0x9E, 0xC2, 0x9E, 0xB5, /* 0x68-0x6B */ + 0x00, 0x00, 0x8B, 0xC6, 0x9E, 0xB8, 0x8F, 0x7C, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x80, /* 0x70-0x73 */ + 0x9E, 0xBA, 0x8B, 0xC9, 0x00, 0x00, 0x9E, 0xB2, /* 0x74-0x77 */ + 0x9E, 0xB4, 0x9E, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x98, 0x4F, 0x8A, 0x79, 0x9E, 0xB7, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9E, 0xC1, 0x8A, 0x54, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7C, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x9E, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x98, 0x50, 0x9E, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x90, 0x59, /* 0x98-0x9B */ + 0x9E, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9E, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9E, 0xE1, 0x9E, 0xC3, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x9E, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xCE, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC9, 0x9E, 0xC6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9E, 0xC7, 0x00, 0x00, 0x9E, 0xCF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xA0, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xCC, 0x8D, 0x5C, /* 0xC8-0xCB */ + 0x92, 0xC6, 0x91, 0x84, 0x9E, 0xCA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9E, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x97, 0x6C, 0x96, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9E, 0xCD, 0x9E, 0xD7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD0, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDF, /* 0xE4-0xE7 */ + 0x9E, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9E, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDE, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9E, 0xDD, 0x00, 0x00, 0x92, 0xCE, /* 0xF8-0xFB */ + 0x00, 0x00, 0x91, 0x85, 0x00, 0x00, 0x9E, 0xDB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD9, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x9E, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE6, 0x94, 0xF3, /* 0x08-0x0B */ + 0x9E, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE7, 0x9E, 0xEA, /* 0x10-0x13 */ + 0x9E, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x92, 0x94, /* 0x14-0x17 */ + 0x00, 0x00, 0x95, 0x57, 0x00, 0x00, 0x9E, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE2, 0x8F, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x96, 0xCD, 0x9E, 0xF6, 0x9E, 0xE9, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x8C, 0xA0, 0x89, 0xA1, 0x8A, 0x7E, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD1, 0x00, 0x00, /* 0x2C-0x2F */ + 0xED, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x8F, 0xBF, 0x9E, 0xEE, 0x00, 0x00, /* 0x34-0x37 */ + 0x9E, 0xF5, 0x8E, 0xF7, 0x8A, 0x92, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x92, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x9E, 0xEB, 0x00, 0x00, 0xED, 0xD3, 0x9E, 0xF0, /* 0x44-0x47 */ + 0x9E, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x8B, 0x6B, 0x9E, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x40, /* 0x5C-0x5F */ + 0x00, 0x00, 0x93, 0xC9, 0x9E, 0xF1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xF3, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xED, 0xED, 0xD4, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9E, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD5, 0x8A, 0x80, /* 0x7C-0x7F */ + + 0x92, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x9E, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9E, 0xF8, 0x8C, 0xE7, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9E, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x40, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x9E, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9E, 0xF9, 0x00, 0x00, 0x9E, 0xFB, 0x9E, 0xFC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x4B, 0x00, 0x00, /* 0xA8-0xAB */ + 0x9F, 0x47, 0x00, 0x00, 0x9E, 0x8D, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x46, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9F, 0x45, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x42, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9E, 0xE8, 0x9F, 0x44, 0x9F, 0x43, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9F, 0x49, 0x00, 0x00, 0x98, 0x45, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x4C, 0x8B, 0xF9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x48, 0x9F, 0x4A, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x94, 0xA5, 0x00, 0x00, 0x9F, 0x4D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x51, 0x9F, 0x4E, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x97, 0x93, 0x9F, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x52, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x53, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x89, 0x54, 0x00, 0x00, 0x9F, 0x55, /* 0x1C-0x1F */ + 0x8C, 0x87, 0x8E, 0x9F, 0x00, 0x00, 0x8B, 0xD3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xA2, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x7E, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x57, /* 0x34-0x37 */ + 0x9F, 0x56, 0x9F, 0x59, 0x8B, 0x5C, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x8B, 0xD4, 0x8A, 0xBC, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x5C, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x5B, /* 0x44-0x47 */ + 0x00, 0x00, 0x9F, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x89, 0xCC, 0x00, 0x00, 0x92, 0x56, 0x00, 0x00, /* 0x4C-0x4F */ + 0x9F, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xBD, /* 0x50-0x53 */ + 0x9F, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9F, 0x5F, 0x00, 0x00, 0x9F, 0x61, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x62, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9F, 0x63, 0x8E, 0x7E, 0x90, 0xB3, /* 0x60-0x63 */ + 0x8D, 0x9F, 0x00, 0x00, 0x95, 0x90, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x95, 0xE0, 0x98, 0x63, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x95, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xCE, /* 0x70-0x73 */ + 0x97, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9F, 0x64, 0x9F, 0x65, 0x00, 0x00, 0x8E, 0x80, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x66, /* 0x7C-0x7F */ + + 0x9F, 0x67, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x69, /* 0x80-0x83 */ + 0x9F, 0x68, 0x00, 0x00, 0x96, 0x77, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x8F, 0x7D, 0x8E, 0xEA, 0x8E, 0x63, /* 0x88-0x8B */ + 0x00, 0x00, 0x9F, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x9F, 0x6C, 0x90, 0x42, 0x00, 0x00, /* 0x94-0x97 */ + 0x9F, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x6D, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x9F, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x6F, 0x9F, 0x70, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x71, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9F, 0x73, 0x9F, 0x72, 0x9F, 0x74, /* 0xB0-0xB3 */ + 0x89, 0xA3, 0x92, 0x69, 0x00, 0x00, 0x9F, 0x75, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x45, 0x8A, 0x6B, /* 0xB8-0xBB */ + 0x9F, 0x76, 0x00, 0x00, 0x00, 0x00, 0x93, 0x61, /* 0xBC-0xBF */ + 0x9A, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8B, 0x42, 0x9F, 0x77, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x78, /* 0xC8-0xCB */ + 0x00, 0x00, 0x95, 0xEA, 0x96, 0x88, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xC5, 0x9F, 0x79, /* 0xD0-0xD3 */ + 0x94, 0xE4, 0x00, 0x00, 0xED, 0xD8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x94, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x96, 0xD1, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7A, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7C, /* 0xE8-0xEB */ + 0x9F, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7E, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7D, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x81, /* 0x0C-0x0F */ + 0x00, 0x00, 0x96, 0xAF, 0x00, 0x00, 0x9F, 0x82, /* 0x10-0x13 */ + 0x9F, 0x83, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x43, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x84, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x86, /* 0x20-0x23 */ + 0x9F, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x90, 0x85, 0x00, 0x00, 0x00, 0x00, 0x95, 0x58, /* 0x34-0x37 */ + 0x89, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xC3, 0xED, 0xD9, /* 0x3C-0x3F */ + 0x92, 0xF3, 0x8F, 0x60, 0x8B, 0x81, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xC4, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x9F, 0x88, 0x00, 0x00, 0x8A, 0xBE, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x98, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xDA, 0x93, 0xF0, 0x9F, 0x87, 0x8D, 0x5D, /* 0x5C-0x5F */ + 0x92, 0x72, 0x00, 0x00, 0x9F, 0x89, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x9F, 0x91, 0x00, 0x00, 0x9F, 0x8A, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xDC, /* 0x6C-0x6F */ + 0x91, 0xBF, 0x00, 0x00, 0x8B, 0x82, 0x9F, 0x92, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x88, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8B, 0x44, 0x9F, 0x90, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9F, 0x8E, 0x9F, 0x8B, 0x97, 0x80, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xDB, 0x00, 0x00, /* 0x84-0x87 */ + 0x92, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x93, 0xD7, 0x9F, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9F, 0x94, 0x00, 0x00, 0x9F, 0x93, 0x8C, 0x42, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xAB, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x8D, 0xB9, 0x9F, 0x8D, 0x9F, 0x8F, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x96, 0x76, 0x91, 0xF2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x97, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x9C, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9F, 0x9D, 0x00, 0x00, 0x89, 0xCD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x95, 0xA6, 0x96, 0xFB, 0x9F, 0x9F, 0x8E, 0xA1, /* 0xB8-0xBB */ + 0x8F, 0xC0, 0x9F, 0x98, 0x9F, 0x9E, 0x89, 0x88, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8B, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9F, 0x95, 0x9F, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x90, 0xF2, 0x94, 0x91, 0x00, 0x00, /* 0xC8-0xCB */ + 0x94, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x97, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x96, 0x40, 0x00, 0x00, 0x9F, 0x99, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9F, 0xA2, 0xED, 0xDD, 0x9F, 0xA0, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9F, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x96, 0x41, 0x94, 0x67, 0x8B, 0x83, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x92, 0x8D, 0x00, 0x00, 0x9F, 0xA3, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xA1, /* 0xEC-0xEF */ + 0x91, 0xD7, 0x9F, 0x96, 0x00, 0x00, 0x89, 0x6A, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x6D, /* 0x08-0x0B */ + 0x9F, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xAD, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF4, /* 0x14-0x17 */ + 0x00, 0x00, 0x9F, 0xAA, 0x00, 0x00, 0x97, 0x8C, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xB4, 0x9F, 0xA4, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x92, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x89, 0x6B, 0x8D, 0x5E, 0x9F, 0xA7, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x46, 0x9F, 0xAC, /* 0x30-0x33 */ + 0x00, 0x00, 0x9F, 0xAB, 0x9F, 0xA6, 0x00, 0x00, /* 0x34-0x37 */ + 0x9F, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x88, /* 0x38-0x3B */ + 0x00, 0x00, 0x9F, 0xA8, 0x94, 0x68, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x97, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x8F, 0xF2, 0x90, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9F, 0xB4, 0x9F, 0xB2, 0x00, 0x00, /* 0x58-0x5B */ + 0x95, 0x6C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xAF, /* 0x60-0x63 */ + 0x9F, 0xB1, 0x00, 0x00, 0x89, 0x59, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x8D, 0x5F, 0x98, 0x51, 0x00, 0x00, /* 0x68-0x6B */ + 0x8A, 0x5C, 0x00, 0x00, 0x95, 0x82, 0xED, 0xE0, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x97, 0x81, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x43, /* 0x74-0x77 */ + 0x90, 0x5A, 0x9F, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x9F, 0xB8, 0x00, 0x00, 0xED, 0xDF, /* 0x84-0x87 */ + 0x8F, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x97, 0x4F, 0x00, 0x00, 0x9F, 0xB5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x9F, 0xB6, 0xED, 0xE1, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x97, 0xDC, 0x00, 0x00, 0x93, 0x93, /* 0x98-0x9B */ + 0x93, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xED, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x55, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x74, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x9F, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9F, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x97, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x97, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9F, 0xC6, 0x9F, 0xC0, 0x9F, 0xBD, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD2, /* 0xC8-0xCB */ + 0x9F, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE3, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8F, 0x69, 0x9F, 0xC5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x9F, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0x91, 0x9F, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xC2, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x92, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9F, 0xC9, 0x00, 0x00, 0x9F, 0xBE, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9F, 0xC4, 0x00, 0x00, 0x9F, 0xCB, 0x88, 0xFA, /* 0xE8-0xEB */ + 0x9F, 0xC1, 0x00, 0x00, 0x9F, 0xCC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x90, 0x5B, 0xED, 0xE5, 0x8F, 0x7E, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x95, 0xA3, 0x00, 0x00, 0x8D, 0xAC, /* 0xF4-0xF7 */ + 0xED, 0xE4, 0x9F, 0xB9, 0x9F, 0xC7, 0x93, 0x59, /* 0xF8-0xFB */ + 0xED, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x90, 0xB4, 0x00, 0x00, 0x8A, 0x89, /* 0x04-0x07 */ + 0x8D, 0xCF, 0x8F, 0xC2, 0x9F, 0xBB, 0x8F, 0x61, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x6B, /* 0x10-0x13 */ + 0x00, 0x00, 0x9F, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9F, 0xD0, 0x8F, 0x8D, 0x8C, 0xB8, /* 0x18-0x1B */ + 0x00, 0x00, 0x9F, 0xDF, 0x00, 0x00, 0x9F, 0xD9, /* 0x1C-0x1F */ + 0x8B, 0x94, 0x93, 0x6E, 0x00, 0x00, 0x9F, 0xD4, /* 0x20-0x23 */ + 0x9F, 0xDD, 0x88, 0xAD, 0x89, 0x51, 0xED, 0xE9, /* 0x24-0x27 */ + 0x00, 0x00, 0x89, 0xB7, 0x00, 0x00, 0x9F, 0xD6, /* 0x28-0x2B */ + 0x91, 0xAA, 0x9F, 0xCD, 0x9F, 0xCF, 0x8D, 0x60, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9F, 0xE0, 0xED, 0xE7, 0x9F, 0xDB, 0x00, 0x00, /* 0x38-0x3B */ + 0xED, 0xEA, 0x00, 0x00, 0x9F, 0xD3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xDA, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xA9, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9F, 0xD8, 0x9F, 0xDC, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xCE, 0x00, 0x00, /* 0x54-0x57 */ + 0x8F, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x92, 0x58, /* 0x58-0x5B */ + 0xED, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x4E, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xCE, 0x93, 0x92, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD1, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD7, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x70, 0x8E, 0xBC, /* 0x7C-0x7F */ + + 0x96, 0x9E, 0x00, 0x00, 0x9F, 0xE1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x94, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xED, /* 0x8C-0x8F */ + 0x8C, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x80, 0x00, 0x00, /* 0x94-0x97 */ + 0x9F, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x97, 0xAD, 0x8D, 0x61, 0x00, 0x00, 0x9F, 0xF0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xEC, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x9F, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xE2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xE8, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xEA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x6E, 0x9F, 0xE5, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x4D, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x9F, 0xE7, 0x00, 0x00, 0xED, 0xEB, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xEF, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9F, 0xE9, 0x96, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9F, 0xE4, 0x00, 0x00, 0x8E, 0xA0, /* 0xC8-0xCB */ + 0x9F, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8A, 0x8A, 0x00, 0x00, 0x9F, 0xE6, /* 0xD0-0xD3 */ + 0x9F, 0xEB, 0x9F, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x91, 0xEA, 0x91, 0xD8, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9F, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xFA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x93, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x42, /* 0xF4-0xF7 */ + 0x9F, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xF6, 0x9F, 0xDE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0x00, 0x00, 0x8B, 0x99, 0x95, 0x59, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8D, 0x97, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x52, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9F, 0xF2, 0x00, 0x00, 0xE0, 0x41, /* 0x10-0x13 */ + 0x89, 0x89, 0x91, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x94, 0x99, 0x00, 0x00, 0x8A, 0xBF, 0x97, 0xF8, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x9F, /* 0x28-0x2B */ + 0x92, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9F, 0xF9, 0x9F, 0xFB, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x40, 0x9F, 0xF7, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9F, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8A, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8C, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE0, 0x4E, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x49, /* 0x58-0x5B */ + 0x90, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x83, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x8F, 0x81, 0x00, 0x00, 0xE0, 0x52, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE0, 0x4B, 0x92, 0xAA, 0xE0, 0x48, /* 0x6C-0x6F */ + 0x92, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE0, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE0, 0x45, 0x00, 0x00, 0xE0, 0x44, 0x00, 0x00, /* 0x78-0x7B */ + 0xE0, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE0, 0x47, 0xE0, 0x46, 0xE0, 0x4C, 0x00, 0x00, /* 0x80-0x83 */ + 0x90, 0x9F, 0x00, 0x00, 0xE0, 0x43, 0x00, 0x00, /* 0x84-0x87 */ + 0xED, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x4F, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xC0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE0, 0x55, 0x00, 0x00, 0xE0, 0x54, /* 0xA0-0xA3 */ + 0xE0, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x59, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x93, 0x62, 0x00, 0x00, 0xE0, 0x53, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xED, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x8C, 0x83, 0x91, 0xF7, 0xE0, 0x51, 0x94, 0x5A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x58, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE0, 0x5D, 0xE0, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE0, 0x5E, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x61, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x5A, /* 0xDC-0xDF */ + 0x8D, 0x8A, 0x94, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9F, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x94, /* 0xE8-0xEB */ + 0xE0, 0x5C, 0x00, 0x00, 0xE0, 0x60, 0x91, 0xF3, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0x5F, 0x00, 0x00, 0xE0, 0x4A, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xED, 0xEE, 0xE8, 0x89, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x64, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x68, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x00, 0x00, 0xE0, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xEF, 0x00, 0x00, 0xED, 0xF0, /* 0x04-0x07 */ + 0x00, 0x00, 0xE0, 0x62, 0x00, 0x00, 0xE0, 0x63, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x67, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE0, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x95, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE0, 0x6D, 0x00, 0x00, 0xE0, 0x6A, 0xE0, 0x69, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0x6C, 0x93, 0xD2, 0xE0, 0x6E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x95, 0x91, 0xEB, /* 0x24-0x27 */ + 0xED, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x90, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0x6F, 0x00, 0x00, 0xE0, 0x71, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x70, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x9F, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE0, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x93, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x73, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xCE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x94, /* 0x6C-0x6F */ + 0x8A, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x8B, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x8E, 0xDC, 0x8D, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xED, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x98, 0x46, 0x90, 0x86, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x8A, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x75, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE0, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF3, /* 0xA8-0xAB */ + 0xE0, 0x78, 0x92, 0x59, 0xE0, 0x7B, 0xE0, 0x76, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x7A, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE0, 0x79, 0x93, 0x5F, 0x88, 0xD7, 0xED, 0x46, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x97, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x7D, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x47, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE0, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE0, 0x7E, 0x00, 0x00, 0xE0, 0x7C, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x96, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE0, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE0, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF4, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x89, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE0, 0x84, 0x95, 0xB0, 0x00, 0x00, /* 0x18-0x1B */ + 0xE0, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x96, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xC5, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x52, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8F, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xF7, 0xED, 0xF8, /* 0x44-0x47 */ + 0x00, 0x00, 0x97, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE0, 0x8A, 0x00, 0x00, 0x90, 0xF7, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE0, 0x86, 0xE0, 0x8B, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x89, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x89, 0x00, 0x00, /* 0x60-0x63 */ + 0x94, 0x81, 0xE0, 0x85, 0xE0, 0x88, 0x8F, 0xC6, /* 0x64-0x67 */ + 0x00, 0x00, 0x94, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0x8C, 0x00, 0x00, 0x8E, 0xCF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x90, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE0, 0x87, 0x00, 0x00, 0x8C, 0x46, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x8D, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x97, 0x6F, 0xE0, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xEA, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6E, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE0, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x94, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x95, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xED, 0xFA, 0x00, 0x00, 0x94, 0x52, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x93, 0x95, 0xE0, 0x97, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x99, 0x00, 0x00, /* 0xCC-0xCF */ + 0x97, 0xD3, 0x00, 0x00, 0xE0, 0x96, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE0, 0x98, 0x89, 0x8D, 0x00, 0x00, 0xE0, 0x93, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x7A, /* 0xDC-0xDF */ + 0xE0, 0x9A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x91, 0x87, 0x8E, 0x57, 0xE0, 0x9C, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0x9B, 0x90, 0x43, 0x99, 0xD7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE0, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0x9F, 0x00, 0x00, 0xE0, 0x8E, /* 0xF8-0xFB */ + 0xE0, 0x9E, 0x00, 0x00, 0xED, 0xFB, 0xE0, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x9A, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE0, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA3, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE0, 0xA4, 0x00, 0x00, 0x92, 0xDC, 0x00, 0x00, /* 0x28-0x2B */ + 0xE0, 0xA6, 0xE0, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0xA7, 0x00, 0x00, 0xE0, 0xA8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x8E, 0xDD, 0x95, 0x83, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEA, 0xE0, 0xA9, /* 0x38-0x3B */ + 0xE0, 0xAA, 0x91, 0x75, 0x8E, 0xA2, 0xE0, 0xAB, /* 0x3C-0x3F */ + 0xE0, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAD, 0x95, 0xD0, /* 0x44-0x47 */ + 0x94, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAE, /* 0x48-0x4B */ + 0x94, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xAB, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE0, 0xAF, 0x89, 0xE5, 0x00, 0x00, 0x8B, 0x8D, /* 0x58-0x5B */ + 0x00, 0x00, 0x96, 0xC4, 0x00, 0x00, 0x96, 0xB4, /* 0x5C-0x5F */ + 0x00, 0x00, 0x89, 0xB2, 0x98, 0x53, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x71, /* 0x64-0x67 */ + 0x00, 0x00, 0x95, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB5, 0x00, 0x00, /* 0x70-0x73 */ + 0xE0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x93, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8C, 0xA1, 0xE0, 0xB1, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8D, 0xD2, 0xE0, 0xB3, 0xE0, 0xB2, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB4, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB6, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0x5D, 0x00, 0x00, 0xE0, 0xB7, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8C, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x94, 0xC6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xED, 0xFC, 0xE0, 0xBA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x40, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB6, 0xE0, 0xBB, /* 0xC0-0xC3 */ + 0xE0, 0xBD, 0x00, 0x00, 0xE0, 0xBC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xBE, 0x00, 0x00, /* 0xCC-0xCF */ + 0x8C, 0xCF, 0x00, 0x00, 0xE0, 0xBF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE7, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x91, 0x5F, 0x00, 0x00, 0x8D, 0x9D, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE0, 0xC1, 0xE0, 0xC2, 0xE0, 0xC0, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x8E, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x93, 0xC6, 0x8B, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC4, /* 0xF4-0xF7 */ + 0x92, 0x4B, 0xE0, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x98, 0x54, 0x94, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC9, 0xE0, 0xC6, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0xD2, /* 0x18-0x1B */ + 0xE0, 0xC8, 0xE0, 0xCA, 0x00, 0x00, 0x97, 0xC2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xEE, 0x41, 0xE0, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xCD, 0x92, 0x96, 0x94, 0x4C, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xA3, 0xE0, 0xCC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE0, 0xCB, 0x00, 0x00, 0x97, 0x50, 0x97, 0x51, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xCF, 0x89, 0x8E, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x8D, 0x96, 0x8E, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, 0xE0, 0xD1, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD3, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x62, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xE0, 0xD5, 0x00, 0x00, 0xE0, 0xD4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE0, 0xD6, 0x00, 0x00, 0x8A, 0x6C, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE0, 0xD8, 0x00, 0x00, 0xEE, 0x43, /* 0x74-0x77 */ + 0xE0, 0xD7, 0x00, 0x00, 0xE0, 0xDA, 0xE0, 0xD9, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x8C, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x97, 0xA6, /* 0x84-0x87 */ + 0x00, 0x00, 0x8B, 0xCA, 0x00, 0x00, 0x89, 0xA4, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8A, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xE6, 0xE0, 0xDC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xDE, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEE, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE0, 0xDF, 0x00, 0x00, 0x89, 0xCF, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE0, 0xDB, 0xEE, 0x45, 0x8E, 0x58, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x92, 0xBF, 0xE0, 0xDD, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x48, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x46, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8E, 0xEC, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x47, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE0, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x5D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x94, 0xC7, 0xE0, 0xE1, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xEE, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE0, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xBB, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x85, /* 0x00-0x03 */ + 0x00, 0x00, 0xE0, 0xE4, 0x97, 0x9D, 0xEE, 0x49, /* 0x04-0x07 */ + 0x00, 0x00, 0x97, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xF4, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE0, 0xE6, 0xEE, 0x4B, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xEE, 0x4D, 0xEE, 0x4C, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x4E, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE8, 0x97, 0xD4, /* 0x30-0x33 */ + 0x8B, 0xD5, 0x94, 0xFA, 0x94, 0x69, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEB, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE0, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE0, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE0, 0xED, 0x8C, 0xE8, 0x89, 0x6C, /* 0x58-0x5B */ + 0xE0, 0xEF, 0x00, 0x00, 0x90, 0x90, 0xE0, 0xEC, /* 0x5C-0x5F */ + 0x97, 0xDA, 0x00, 0x00, 0xEE, 0x4F, 0xE0, 0xF2, /* 0x60-0x63 */ + 0xEA, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE0, 0xF0, 0xE0, 0xF3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE5, /* 0x6C-0x6F */ + 0xE0, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xBA, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF4, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF5, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x9E, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xEE, 0x50, 0x00, 0x00, 0xE0, 0xF6, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF7, 0xEE, 0x51, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE3, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8A, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x8E, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF9, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE0, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x89, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE1, 0x40, 0x00, 0x00, 0x95, 0x5A, 0xE1, 0x41, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA2, 0xE1, 0x42, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE1, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x44, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE1, 0x46, 0xE1, 0x47, 0xE1, 0x45, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x72, 0xE1, 0x49, /* 0xF4-0xF7 */ + 0xE1, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_75[512] = { + 0x00, 0x00, 0xEE, 0x52, 0x00, 0x00, 0xE1, 0x4B, /* 0x00-0x03 */ + 0xE1, 0x4A, 0xE1, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0x4D, 0xE1, 0x4F, 0xE1, 0x4E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8D, 0x99, 0x00, 0x00, 0xE1, 0x51, /* 0x10-0x13 */ + 0x00, 0x00, 0xE1, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x8A, 0xC3, 0x00, 0x00, 0x90, 0x72, 0x00, 0x00, /* 0x18-0x1B */ + 0x93, 0x5B, 0x00, 0x00, 0xE1, 0x52, 0x90, 0xB6, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x59, /* 0x20-0x23 */ + 0x00, 0x00, 0x89, 0x99, 0xE1, 0x53, 0x00, 0x00, /* 0x24-0x27 */ + 0x97, 0x70, 0x00, 0x00, 0x00, 0x00, 0x95, 0xE1, /* 0x28-0x2B */ + 0xE1, 0x54, 0x00, 0x00, 0x00, 0x00, 0xED, 0x8C, /* 0x2C-0x2F */ + 0x93, 0x63, 0x97, 0x52, 0x8D, 0x62, 0x90, 0x5C, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6A, /* 0x34-0x37 */ + 0x99, 0xB2, 0x00, 0x00, 0x92, 0xAC, 0x89, 0xE6, /* 0x38-0x3B */ + 0xE1, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE1, 0x56, 0x00, 0x00, 0xE1, 0x5B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE1, 0x59, 0xE1, 0x58, 0x9D, 0xC0, /* 0x48-0x4B */ + 0x8A, 0x45, 0xE1, 0x57, 0x00, 0x00, 0x88, 0xD8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x94, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x94, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x97, 0xAF, 0xE1, 0x5C, 0xE1, 0x5A, /* 0x58-0x5B */ + 0x92, 0x7B, 0x90, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x94, 0xA9, 0x00, 0x00, 0x95, 0x4C, 0x00, 0x00, /* 0x60-0x63 */ + 0xE1, 0x5E, 0x97, 0xAA, 0x8C, 0x6C, 0xE1, 0x5F, /* 0x64-0x67 */ + 0x00, 0x00, 0xE1, 0x5D, 0x94, 0xD4, 0xE1, 0x60, /* 0x68-0x6B */ + 0x00, 0x00, 0xE1, 0x61, 0x00, 0x00, 0xEE, 0x53, /* 0x6C-0x6F */ + 0x88, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF4, /* 0x70-0x73 */ + 0xE1, 0x66, 0x00, 0x00, 0xE1, 0x63, 0x93, 0xEB, /* 0x74-0x77 */ + 0xE1, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x45, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x69, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x64, 0xE1, 0x65, /* 0x84-0x87 */ + 0x00, 0x00, 0xE1, 0x68, 0xE1, 0x67, 0x95, 0x44, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x61, 0x91, 0x60, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8B, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE1, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6B, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xE1, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6E, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE1, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x75, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE1, 0x76, 0x94, 0xE6, 0xE1, 0x70, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE1, 0x74, 0x90, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE1, 0x75, 0xE1, 0x73, 0x8E, 0xBE, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6F, 0xE1, 0x71, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x95, 0x61, 0x00, 0x00, 0x8F, 0xC7, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x78, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x79, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8E, 0xA4, 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0x97, 0xE1, 0x7A, 0x00, 0x00, 0x92, 0xC9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x7C, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x9F, 0xE1, 0x7B, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x91, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE1, 0x82, 0x00, 0x00, 0xE1, 0x84, 0xE1, 0x85, /* 0xF0-0xF3 */ + 0x92, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x83, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0x80, 0x00, 0x00, 0xE1, 0x7D, 0xE1, 0x7E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0x00, 0x00, 0xE1, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE1, 0x88, 0x00, 0x00, 0xE1, 0x86, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x89, /* 0x1C-0x1F */ + 0xE1, 0x8B, 0xE1, 0x8C, 0xE1, 0x8D, 0x00, 0x00, /* 0x20-0x23 */ + 0xE1, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x8A, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE1, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE1, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x91, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xC3, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x94, 0xE1, 0x92, /* 0x44-0x47 */ + 0xE1, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8A, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xFC, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xC8, 0x00, 0x00, /* 0x54-0x57 */ + 0xE1, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE1, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE1, 0x97, 0xE1, 0x98, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x9C, /* 0x64-0x67 */ + 0xE1, 0x99, 0xE1, 0x9A, 0xE1, 0x9B, 0x00, 0x00, /* 0x68-0x6B */ + 0xE1, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE1, 0x9E, 0x00, 0x00, 0xE1, 0x9F, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA0, 0x00, 0x00, /* 0x74-0x77 */ + 0xE1, 0xA1, 0x00, 0x00, 0x94, 0xAD, 0x93, 0x6F, /* 0x78-0x7B */ + 0xE1, 0xA2, 0x94, 0x92, 0x95, 0x53, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE1, 0xA3, 0x00, 0x00, 0xEE, 0x54, 0xE1, 0xA4, /* 0x80-0x83 */ + 0x93, 0x49, 0x00, 0x00, 0x8A, 0x46, 0x8D, 0x63, /* 0x84-0x87 */ + 0xE1, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA6, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA7, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8E, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE1, 0xAA, 0xE1, 0xAB, 0xEE, 0x57, /* 0x98-0x9B */ + 0xEE, 0x55, 0x00, 0x00, 0xEE, 0x56, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x58, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xE7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE1, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x89, /* 0xB4-0xB7 */ + 0xE1, 0xAE, 0xE1, 0xAF, 0xE1, 0xB0, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4D, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB1, 0x94, 0x75, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x7E, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x89, 0x6D, 0x00, 0x00, 0x89, 0x76, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB4, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB3, 0x93, 0x90, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xB7, /* 0xD8-0xDB */ + 0x9F, 0x58, 0x00, 0x00, 0xE1, 0xB5, 0x96, 0xBF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE1, 0xB6, 0x00, 0x00, 0x8A, 0xC4, /* 0xE0-0xE3 */ + 0x94, 0xD5, 0xE1, 0xB7, 0x00, 0x00, 0xE1, 0xB8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xDA, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xD3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x92, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x91, 0x8A, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x82, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0x8F, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE1, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBD, /* 0x04-0x07 */ + 0xE1, 0xBC, 0x94, 0xFB, 0x00, 0x00, 0x8A, 0xC5, /* 0x08-0x0B */ + 0x8C, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC1, 0x90, 0x5E, /* 0x1C-0x1F */ + 0x96, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE1, 0xC0, 0xE1, 0xC2, 0xE1, 0xC3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE1, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC5, /* 0x34-0x37 */ + 0xE1, 0xC6, 0x00, 0x00, 0x92, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0x8A, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x92, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5A, 0xE1, 0xC7, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC8, 0xE1, 0xCB, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x90, 0x87, 0x00, 0x00, 0x93, 0xC2, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xCC, 0x96, 0x72, 0x00, 0x00, /* 0x64-0x67 */ + 0xE1, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xCA, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE1, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xCE, 0xE1, 0xCD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD1, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD0, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE1, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD4, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE1, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x95, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8F, 0x75, 0x97, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x93, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD6, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE1, 0xD7, 0x00, 0x00, 0xE1, 0xDB, /* 0xB8-0xBB */ + 0xE1, 0xD9, 0xE1, 0xDA, 0x00, 0x00, 0xE1, 0xD8, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDF, 0x96, 0xB5, /* 0xD8-0xDB */ + 0xE1, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEE, 0xE1, 0xE1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x92, 0x6D, 0x00, 0x00, 0x94, 0x8A, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x8B, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x92, 0x5A, 0xE1, 0xE2, 0x8B, 0xB8, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xCE, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xBB, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE5, 0x00, 0x00, /* 0x10-0x13 */ + 0x8C, 0xA4, 0x8D, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE1, 0xE7, 0xEE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x93, 0x75, 0x8D, 0xD4, 0x8B, 0x6D, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x43, 0x00, 0x00, /* 0x30-0x33 */ + 0x94, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x76, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x7B, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE1, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5D, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEE, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xB0, /* 0x68-0x6B */ + 0x8D, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xA5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xA1, 0x00, 0x00, /* 0x70-0x73 */ + 0xE1, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5F, 0x00, 0x00, /* 0x78-0x7B */ + 0xE1, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8C, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xEC, 0x92, 0xF4, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE1, 0xEF, 0x8A, 0x56, 0xE1, 0xEA, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x94, 0xE8, 0x00, 0x00, 0x89, 0x4F, /* 0x90-0x93 */ + 0x00, 0x00, 0x8D, 0xEA, 0x00, 0x00, 0x98, 0x71, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xEE, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF0, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC9, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x90, 0xD7, 0xE1, 0xF2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x6D, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE1, 0xF9, 0x00, 0x00, 0xE1, 0xF8, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8E, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE1, 0xFA, 0xE1, 0xF5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xFB, 0xE1, 0xF6, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x94, 0xD6, 0xE1, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE1, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x41, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x40, /* 0xE4-0xE7 */ + 0x96, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE1, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x88, 0xE9, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE2, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0x8F, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x44, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x62, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE2, 0x46, 0xE2, 0x45, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE8, 0xE2, 0x49, /* 0x28-0x2B */ + 0xE2, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEE, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xA6, 0x00, 0x00, /* 0x38-0x3B */ + 0x97, 0xE7, 0x00, 0x00, 0x8E, 0xD0, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE2, 0x4A, 0x8C, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x5F, /* 0x44-0x47 */ + 0x8B, 0x46, 0x8E, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x97, 0x53, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x50, /* 0x50-0x53 */ + 0x00, 0x00, 0xE2, 0x4F, 0x91, 0x63, 0xE2, 0x4C, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x4E, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0x6A, 0x90, 0x5F, 0xE2, 0x4D, /* 0x5C-0x5F */ + 0xE2, 0x4B, 0x00, 0x00, 0x94, 0x49, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x8F, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x95, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x8D, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x98, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x51, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x52, /* 0x7C-0x7F */ + + 0xE2, 0x68, 0x8B, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x98, 0x5C, 0x91, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x53, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x89, 0xD0, 0x92, 0xF5, 0x95, 0x9F, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xEE, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x66, /* 0x98-0x9B */ + 0x00, 0x00, 0xE2, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x9A, 0xE2, 0x55, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x57, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x58, 0x00, 0x00, /* 0xAC-0xAF */ + 0x94, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x59, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0x5A, 0xE2, 0x5B, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x8B, 0xD7, 0x89, 0xD1, 0x93, 0xC3, /* 0xBC-0xBF */ + 0x8F, 0x47, 0x8E, 0x84, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE2, 0x5C, 0x00, 0x00, 0x8F, 0x48, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x89, 0xC8, 0x95, 0x62, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE2, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x94, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x64, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE2, 0x60, 0x00, 0x00, 0xE2, 0x61, /* 0xE0-0xE3 */ + 0x94, 0x89, 0x00, 0x00, 0x90, 0x60, 0xE2, 0x5E, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x92, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE2, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8F, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDA, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7A[512] = { + 0x8B, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0x62, 0x00, 0x00, 0x00, 0x00, 0x92, 0xF6, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0x63, 0x90, 0xC5, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x96, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x95, 0x42, /* 0x14-0x17 */ + 0xE2, 0x64, 0xE2, 0x65, 0x92, 0x74, 0x00, 0x00, /* 0x18-0x1B */ + 0x97, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x67, /* 0x1C-0x1F */ + 0xE2, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0x69, 0x88, 0xEE, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6C, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6A, /* 0x38-0x3B */ + 0x89, 0xD2, 0x8C, 0x6D, 0xE2, 0x6B, 0x8D, 0x65, /* 0x3C-0x3F */ + 0x8D, 0x92, 0x00, 0x00, 0x95, 0xE4, 0xE2, 0x6D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x73, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE2, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x90, 0xCF, 0x89, 0x6E, 0x89, 0xB8, /* 0x4C-0x4F */ + 0x88, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6E, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0x70, 0xE2, 0x71, 0x8F, 0xF5, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE2, 0x72, 0x00, 0x00, 0x8A, 0x6E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x8C, 0x8A, 0x00, 0x00, 0x8B, 0x86, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE2, 0x75, 0x8B, 0xF3, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE2, 0x76, 0x00, 0x00, 0x90, 0xFA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x93, 0xCB, 0x00, 0x00, 0x90, 0xDE, /* 0x80-0x83 */ + 0x8D, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE2, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x82, 0x91, 0x8B, /* 0x90-0x93 */ + 0x00, 0x00, 0xE2, 0x79, 0xE2, 0x7B, 0xE2, 0x78, /* 0x94-0x97 */ + 0xE2, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x41, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE2, 0x7C, 0x8C, 0x45, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x87, 0x97, 0x71, /* 0xAC-0xAF */ + 0xE2, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x80, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x4D, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x83, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x96, /* 0xC0-0xC3 */ + 0xE2, 0x82, 0xE2, 0x81, 0x00, 0x00, 0xE2, 0x85, /* 0xC4-0xC7 */ + 0xE2, 0x7D, 0x00, 0x00, 0xE2, 0x86, 0x97, 0xA7, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE2, 0x87, 0x00, 0x00, 0xE2, 0x88, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEE, 0x67, 0x9A, 0xF2, 0xE2, 0x8A, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE2, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE2, 0x8B, 0xE2, 0x8C, 0x00, 0x00, /* 0xD8-0xDB */ + 0x97, 0xB3, 0xE2, 0x8D, 0x00, 0x00, 0xE8, 0xED, /* 0xDC-0xDF */ + 0x8F, 0xCD, 0xE2, 0x8E, 0xE2, 0x8F, 0x8F, 0x76, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0xB6, 0xE2, 0x90, 0xEE, 0x68, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x47, 0xEE, 0x6A, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE2, 0x91, 0x00, 0x00, 0x92, 0x5B, /* 0xEC-0xEF */ + 0xE2, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xA3, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x99, 0x5E, 0x92, 0x7C, 0x8E, 0xB1, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xC6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x93, 0x00, 0x00, /* 0x00-0x03 */ + 0xE2, 0xA0, 0x00, 0x00, 0xE2, 0x96, 0x00, 0x00, /* 0x04-0x07 */ + 0x8B, 0x88, 0x00, 0x00, 0xE2, 0x95, 0xE2, 0xA2, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x94, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8F, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE2, 0x98, 0xE2, 0x99, 0x00, 0x00, 0x93, 0x4A, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x9A, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8A, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x90, 0x79, 0x95, 0x84, 0x00, 0x00, /* 0x24-0x27 */ + 0xE2, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x91, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x97, /* 0x30-0x33 */ + 0x00, 0x00, 0xE2, 0x9B, 0xE2, 0x9D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8D, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE2, 0xA4, 0x95, 0x4D, 0x00, 0x00, /* 0x44-0x47 */ + 0x94, 0xA4, 0x93, 0x99, 0x00, 0x00, 0x8B, 0xD8, /* 0x48-0x4B */ + 0xE2, 0xA3, 0xE2, 0xA1, 0x00, 0x00, 0x94, 0xB3, /* 0x4C-0x4F */ + 0xE2, 0x9E, 0x92, 0x7D, 0x93, 0x9B, 0x00, 0x00, /* 0x50-0x53 */ + 0x93, 0x9A, 0x00, 0x00, 0x8D, 0xF4, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE2, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE2, 0xA6, 0x00, 0x00, 0xE2, 0xA8, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xAB, 0x00, 0x00, 0xE2, 0xAC, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0xA9, 0xE2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE2, 0xA7, 0xE2, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x9F, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xCD, 0x89, 0xD3, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0xE2, 0xB0, 0x00, 0x00, 0xE2, 0xB5, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB4, 0x00, 0x00, /* 0x90-0x93 */ + 0x94, 0x93, 0x96, 0xA5, 0x00, 0x00, 0x8E, 0x5A, /* 0x94-0x97 */ + 0xE2, 0xAE, 0xE2, 0xB7, 0xE2, 0xB2, 0x00, 0x00, /* 0x98-0x9B */ + 0xE2, 0xB1, 0xE2, 0xAD, 0xEE, 0x6B, 0xE2, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x8A, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x5C, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x90, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x94, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE2, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x94, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x90, 0xDF, 0xE2, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x94, 0xCD, 0x00, 0x00, 0xE2, 0xBD, 0x95, 0xD1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x92, 0x7A, 0x00, 0x00, 0xE2, 0xB8, /* 0xC8-0xCB */ + 0xE2, 0xBA, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBB, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE2, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8E, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x93, 0xC4, 0xE2, 0xC3, 0xE2, 0xC2, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE2, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x98, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCC, 0xE2, 0xC9, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_7C[512] = { + 0xE2, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE2, 0xC0, 0x99, 0xD3, 0xE2, 0xC7, /* 0x10-0x13 */ + 0xE2, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD0, /* 0x1C-0x1F */ + 0x00, 0x00, 0x8A, 0xC8, 0x00, 0x00, 0xE2, 0xCD, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCE, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCF, 0xE2, 0xD2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD1, /* 0x34-0x37 */ + 0x94, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE2, 0xD3, 0x97, 0xFA, 0x95, 0xEB, /* 0x3C-0x3F */ + 0xE2, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD5, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE2, 0xD4, 0x90, 0xD0, 0x00, 0x00, 0xE2, 0xD7, /* 0x4C-0x4F */ + 0xE2, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE2, 0xD6, 0x00, 0x00, 0xE2, 0xDD, 0x00, 0x00, /* 0x54-0x57 */ + 0xE2, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xDB, /* 0x5C-0x5F */ + 0xE2, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE2, 0xDC, 0xE2, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC4, /* 0x70-0x73 */ + 0x00, 0x00, 0xE2, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xE0, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8B, 0xCC, 0x8C, 0x48, 0xE2, 0xE1, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x95, 0xB2, 0x00, 0x00, 0x90, 0x88, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xE2, 0x00, 0x00, 0x97, 0xB1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x94, 0x94, 0x00, 0x00, 0x91, 0x65, /* 0x94-0x97 */ + 0x94, 0x53, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6C, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xBE, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE2, 0xE7, 0xE2, 0xE5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE2, 0xE3, 0x8A, 0x9F, 0x00, 0x00, 0x8F, 0xCF, /* 0xA4-0xA7 */ + 0xE2, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE6, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE2, 0xE4, 0xE2, 0xEC, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE2, 0xEB, 0xE2, 0xEA, 0xE2, 0xE9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE2, 0xEE, 0x90, 0xB8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE2, 0xEF, 0x00, 0x00, 0xE2, 0xF1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE2, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xD0, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x57, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x9C, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE2, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE2, 0xF4, 0x00, 0x00, 0x95, 0xB3, 0x91, 0x8C, /* 0xDC-0xDF */ + 0x8D, 0x66, 0x00, 0x00, 0xE2, 0xF5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xC6, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF7, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0xF9, 0x00, 0x00, 0xE2, 0xFA, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8E, 0x85, 0x00, 0x00, 0xE2, 0xFB, 0x8C, 0x6E, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8A, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0x8B, 0x49, 0x00, 0x00, 0xE3, 0x40, 0x00, 0x00, /* 0x00-0x03 */ + 0x96, 0xF1, 0x8D, 0x67, 0xE2, 0xFC, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x43, 0x96, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0x94, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x95, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8F, 0x83, 0xE3, 0x42, 0x00, 0x00, 0x8E, 0xD1, /* 0x14-0x17 */ + 0x8D, 0x68, 0x8E, 0x86, 0x8B, 0x89, 0x95, 0xB4, /* 0x18-0x1B */ + 0xE3, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0x66, 0x96, 0x61, 0x8D, 0xF5, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x87, /* 0x28-0x2B */ + 0x92, 0xDB, 0x00, 0x00, 0xE3, 0x46, 0x97, 0xDD, /* 0x2C-0x2F */ + 0x8D, 0xD7, 0x00, 0x00, 0xE3, 0x47, 0x90, 0x61, /* 0x30-0x33 */ + 0x00, 0x00, 0xE3, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8F, 0xD0, 0x8D, 0xAE, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x48, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x49, 0x8C, 0xBC, /* 0x40-0x43 */ + 0x91, 0x67, 0xE3, 0x44, 0xE3, 0x4A, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0x6D, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x45, /* 0x48-0x4B */ + 0x8C, 0x6F, 0x00, 0x00, 0xE3, 0x4D, 0xE3, 0x51, /* 0x4C-0x4F */ + 0x8C, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x4C, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x55, /* 0x58-0x5B */ + 0xEE, 0x6E, 0x00, 0x00, 0x8D, 0x69, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x97, 0x8D, 0x88, 0xBA, 0xE3, 0x52, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8B, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x50, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x93, 0x9D, 0xE3, 0x4E, 0xE3, 0x4B, /* 0x70-0x73 */ + 0x00, 0x00, 0x8A, 0x47, 0x90, 0xE2, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x8C, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE3, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE3, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x56, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x53, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x8C, 0x70, 0x91, 0xB1, 0xE3, 0x58, /* 0x98-0x9B */ + 0x91, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x65, /* 0x9C-0x9F */ + 0xEE, 0x70, 0x00, 0x00, 0xE3, 0x61, 0xE3, 0x5B, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5F, /* 0xA8-0xAB */ + 0x8E, 0xF8, 0x88, 0xDB, 0xE3, 0x5A, 0xE3, 0x62, /* 0xAC-0xAF */ + 0xE3, 0x66, 0x8D, 0x6A, 0x96, 0xD4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xD4, 0xE3, 0x5C, 0x00, 0x00, 0xEE, 0x6F, /* 0xB4-0xB7 */ + 0xE3, 0x64, 0x00, 0x00, 0xE3, 0x59, 0x92, 0x5D, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE3, 0x5E, 0x88, 0xBB, 0x96, 0xC8, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5D, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xD9, 0x94, 0xEA, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x8D, /* 0xCC-0xCF */ + 0x00, 0x00, 0x97, 0xCE, 0x8F, 0x8F, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE3, 0x8E, 0xEE, 0x71, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE3, 0x67, 0x00, 0x00, 0x90, 0xFC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE3, 0x63, 0xE3, 0x68, 0xE3, 0x6A, 0x00, 0x00, /* 0xDC-0xDF */ + 0x92, 0xF7, 0xE3, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE3, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x95, 0xD2, 0x8A, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x96, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x6C, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x97, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x6B, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x89, 0x8F, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x93, 0xEA, 0xE3, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE3, 0x75, 0xE3, 0x6F, 0xE3, 0x76, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x72, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x9B, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xC8, 0xE3, 0x74, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE3, 0x71, 0xE3, 0x77, 0xE3, 0x70, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x63, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x44, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6B, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE3, 0x73, 0xE3, 0x80, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE3, 0x7B, 0x00, 0x00, 0xE3, 0x7E, /* 0x34-0x37 */ + 0x00, 0x00, 0xE3, 0x7C, 0xE3, 0x81, 0xE3, 0x7A, /* 0x38-0x3B */ + 0x00, 0x00, 0xE3, 0x60, 0x90, 0xD1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x94, 0xC9, 0x00, 0x00, 0xE3, 0x7D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x78, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x40, 0x8C, 0x71, /* 0x48-0x4B */ + 0x00, 0x00, 0x8F, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x72, 0x00, 0x00, /* 0x50-0x53 */ + 0x90, 0x44, 0x91, 0x55, 0xE3, 0x84, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE3, 0x86, 0xE3, 0x87, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE3, 0x83, 0xE3, 0x85, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x79, 0xE3, 0x82, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0x8A, 0xE3, 0x89, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x96, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8C, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE3, 0x88, 0x00, 0x00, 0xE3, 0x8C, /* 0x78-0x7B */ + 0xE3, 0x8B, 0xE3, 0x8F, 0x00, 0x00, 0xE3, 0x91, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x5B, 0xE3, 0x8D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE3, 0x92, 0xE3, 0x93, 0xED, 0x40, 0x00, 0x00, /* 0x88-0x8B */ + 0xE3, 0x94, 0x00, 0x00, 0xE3, 0x9A, 0x93, 0x5A, /* 0x8C-0x8F */ + 0xE3, 0x96, 0x00, 0x00, 0xE3, 0x95, 0xE3, 0x97, /* 0x90-0x93 */ + 0xE3, 0x98, 0x00, 0x00, 0xE3, 0x99, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x9B, /* 0x98-0x9B */ + 0xE3, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xCA, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 0x9D, 0x00, 0x00, 0xE3, 0x9E, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE3, 0x9F, 0x00, 0x00, 0xEE, 0x73, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE3, 0xA0, 0xE3, 0xA1, 0xE3, 0xA2, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE3, 0xA3, 0xE3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE3, 0xA6, 0xE3, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA8, /* 0x5C-0x5F */ + 0xE3, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAC, /* 0x64-0x67 */ + 0xE3, 0xAA, 0xE3, 0xAB, 0x8D, 0xDF, 0x8C, 0x72, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x75, 0x00, 0x00, /* 0x6C-0x6F */ + 0x94, 0xB1, 0x00, 0x00, 0x8F, 0x90, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x94, 0x6C, 0x00, 0x00, 0x94, 0xEB, /* 0x74-0x77 */ + 0xE3, 0xAD, 0x9C, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAE, 0xE3, 0xB0, /* 0x80-0x83 */ + 0x00, 0x00, 0x97, 0x85, 0xE3, 0xAF, 0xE3, 0xB2, /* 0x84-0x87 */ + 0xE3, 0xB1, 0x00, 0x00, 0x97, 0x72, 0x00, 0x00, /* 0x88-0x8B */ + 0xE3, 0xB3, 0x00, 0x00, 0x94, 0xFC, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xB7, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xE3, 0xB6, 0xE3, 0xB5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEE, 0x74, 0x00, 0x00, 0xE3, 0xB8, /* 0xA0-0xA3 */ + 0x8C, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x91, 0x41, 0x8B, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBC, 0xE3, 0xB9, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBD, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE3, 0xBE, 0xE3, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x89, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x89, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xC0, 0xE3, 0xC1, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC2, 0x00, 0x00, /* 0xC8-0xCB */ + 0x97, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x4B, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE3, 0xC4, 0xE3, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x90, 0x89, 0xE3, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC6, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE3, 0xC7, 0x00, 0x00, 0x8A, 0xE3, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8A, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE3, 0xC9, 0x00, 0x00, 0x96, 0x7C, /* 0xF8-0xFB */ + 0x97, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0x97, 0x73, 0x98, 0x56, 0x00, 0x00, 0x8D, 0x6C, /* 0x00-0x03 */ + 0xE3, 0xCC, 0x8E, 0xD2, 0xE3, 0xCB, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCD, /* 0x08-0x0B */ + 0x8E, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x91, 0xCF, 0x00, 0x00, 0xE3, 0xCE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x8D, 0x6B, 0x00, 0x00, 0x96, 0xD5, /* 0x14-0x17 */ + 0xE3, 0xCF, 0xE3, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xE3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE3, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE3, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xA8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD5, /* 0x38-0x3B */ + 0x00, 0x00, 0x92, 0x5E, 0x00, 0x00, 0xE3, 0xD4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD7, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD6, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB9, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xD9, 0x00, 0x00, 0xE3, 0xDA, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xB7, 0xE3, 0xDB, /* 0x5C-0x5F */ + 0x00, 0x00, 0x91, 0x8F, 0xE3, 0xDC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xFC, /* 0x6C-0x6F */ + 0xE3, 0xE0, 0x00, 0x00, 0xE3, 0xDF, 0xE3, 0xDE, /* 0x70-0x73 */ + 0x92, 0xAE, 0x00, 0x00, 0xE3, 0xE1, 0x90, 0x45, /* 0x74-0x77 */ + 0x00, 0x00, 0xE3, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE3, 0xE3, 0x98, 0x57, 0xE3, 0xE4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE3, 0xE5, 0xE3, 0xE7, 0xE3, 0xE6, 0x94, 0xA3, /* 0x84-0x87 */ + 0x00, 0x00, 0x93, 0xF7, 0x00, 0x00, 0x98, 0x5D, /* 0x88-0x8B */ + 0x94, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE9, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD1, 0x00, 0x00, /* 0x94-0x97 */ + 0x95, 0x49, 0x00, 0x00, 0xE3, 0xEA, 0xE3, 0xE8, /* 0x98-0x9B */ + 0x00, 0x00, 0x8A, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x8C, 0xD2, 0x8E, 0x88, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x94, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8C, 0xA8, 0x96, 0x62, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE3, 0xED, 0xE3, 0xEB, 0x00, 0x00, 0x8D, 0x6D, /* 0xAC-0xAF */ + 0x00, 0x00, 0x8D, 0x6E, 0x88, 0xE7, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x8D, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x78, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDD, /* 0xC0-0xC3 */ + 0xE3, 0xF2, 0x00, 0x00, 0x92, 0x5F, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x94, 0x77, 0x00, 0x00, 0x91, 0xD9, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF4, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE3, 0xF0, 0xE3, 0xF3, 0xE3, 0xEE, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE3, 0xF1, 0x96, 0x45, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8C, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x88, 0xFB, 0xE3, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF6, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x93, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8B, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE4, 0x45, 0x94, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x89, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x8B, 0xBA, 0x90, 0xC6, 0x98, 0x65, /* 0x04-0x07 */ + 0x96, 0xAC, 0xE3, 0xF5, 0x90, 0xD2, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x72, 0xE3, 0xF8, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFA, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE3, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x92, 0x45, 0x00, 0x00, 0x94, 0x5D, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x92, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x42, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x41, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFC, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x74, 0x00, 0x00, /* 0x4C-0x4F */ + 0x95, 0x85, 0xE4, 0x44, 0x00, 0x00, 0xE4, 0x43, /* 0x50-0x53 */ + 0x8D, 0x6F, 0x98, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x54, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE4, 0x48, 0xE4, 0x49, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xEE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x47, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8D, 0x98, 0xE4, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE4, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x92, 0xB0, 0x95, 0xA0, 0x91, 0x42, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xDA, /* 0x7C-0x7F */ + + 0xE4, 0x4E, 0x00, 0x00, 0xE4, 0x4F, 0xE4, 0x4B, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE4, 0x4C, 0x00, 0x00, 0xE4, 0x4D, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x70, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x55, /* 0x90-0x93 */ + 0x00, 0x00, 0xE4, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x86, 0x00, 0x00, /* 0x98-0x9B */ + 0x96, 0x8C, 0x95, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE4, 0x50, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x53, /* 0xA0-0xA3 */ + 0xE4, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x63, 0xE4, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE4, 0x57, 0x00, 0x00, 0x00, 0x00, 0x91, 0x56, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE4, 0x5A, 0x00, 0x00, 0xE4, 0x5E, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0x5B, 0xE4, 0x59, 0x94, 0x5E, /* 0xBC-0xBF */ + 0xE4, 0x5C, 0x00, 0x00, 0xE4, 0x5D, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE4, 0x64, 0xE4, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE4, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE4, 0x61, 0x00, 0x00, 0x91, 0x9F, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE4, 0x63, 0xE4, 0x62, 0xE4, 0x65, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x66, /* 0xDC-0xDF */ + 0xE4, 0x67, 0x00, 0x00, 0x00, 0x00, 0x90, 0x62, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x89, 0xE7, 0x00, 0x00, 0xE4, 0x68, /* 0xE4-0xE7 */ + 0x97, 0xD5, 0x00, 0x00, 0x8E, 0xA9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x8F, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8A, /* 0xF0-0xF3 */ + 0x92, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x69, 0xE4, 0x6A, /* 0xF8-0xFB */ + 0x89, 0x50, 0x00, 0x00, 0xE4, 0x6B, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0x00, 0x00, 0xE4, 0x6C, 0xE4, 0x6D, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE4, 0x6E, 0x00, 0x00, 0xE4, 0x6F, /* 0x04-0x07 */ + 0x8B, 0xBB, 0x9D, 0xA8, 0xE4, 0x70, 0x00, 0x00, /* 0x08-0x0B */ + 0x90, 0xE3, 0xE4, 0x71, 0x8E, 0xC9, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0x72, 0x00, 0x00, 0x98, 0xAE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x73, 0x95, 0xDC, /* 0x14-0x17 */ + 0x8A, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x91, 0x43, /* 0x18-0x1B */ + 0x8F, 0x77, 0x00, 0x00, 0x95, 0x91, 0x8F, 0x4D, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE4, 0x74, 0x8D, 0x71, 0xE4, 0x75, /* 0x28-0x2B */ + 0x94, 0xCA, 0x00, 0x00, 0xE4, 0x84, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x77, /* 0x30-0x33 */ + 0x00, 0x00, 0x91, 0xC7, 0x94, 0x95, 0x8C, 0xBD, /* 0x34-0x37 */ + 0xE4, 0x76, 0x91, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xF8, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE4, 0x7A, 0xE4, 0x79, 0xE4, 0x7C, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE4, 0x7B, 0x00, 0x00, 0xE4, 0x7D, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x80, 0x00, 0x00, /* 0x60-0x63 */ + 0xE4, 0x7E, 0x00, 0x00, 0x8A, 0xCD, 0x00, 0x00, /* 0x64-0x67 */ + 0xE4, 0x81, 0x00, 0x00, 0xE4, 0x82, 0xE4, 0x83, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xAF, 0x97, 0xC7, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE4, 0x85, 0x90, 0x46, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x90, 0xE4, 0x86, /* 0x74-0x77 */ + 0xE4, 0x87, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x88, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xF0, /* 0x88-0x8B */ + 0x00, 0x00, 0xE4, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x8A, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x95, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x8E, 0xC5, 0x00, 0x00, 0xE4, 0x8C, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x8A, 0x48, 0x88, 0xB0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x8B, /* 0xA8-0xAB */ + 0xE4, 0x8E, 0x94, 0x6D, 0x00, 0x00, 0x90, 0x63, /* 0xAC-0xAF */ + 0x00, 0x00, 0x89, 0xD4, 0x00, 0x00, 0x96, 0x46, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8C, 0x7C, 0x8B, 0xDA, 0x00, 0x00, 0xE4, 0x8D, /* 0xB8-0xBB */ + 0x00, 0x00, 0x89, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8A, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x89, 0x91, 0xE4, 0x92, 0x97, 0xE8, /* 0xD0-0xD3 */ + 0x91, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x95, 0x63, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0x9E, 0x00, 0x00, 0x89, 0xD5, /* 0xD8-0xDB */ + 0xE4, 0x9C, 0x00, 0x00, 0xE4, 0x9A, 0xE4, 0x91, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE4, 0x8F, 0x00, 0x00, 0xE4, 0x90, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x8E, 0xE1, 0x8B, 0xEA, 0x92, 0x97, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x89, 0x70, 0x00, 0x00, 0xE4, 0x94, /* 0xF0-0xF3 */ + 0xE4, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE4, 0x99, 0xE4, 0x95, 0xE4, 0x98, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_83[512] = { + 0x00, 0x00, 0xEE, 0x76, 0x96, 0xCE, 0xE4, 0x97, /* 0x00-0x03 */ + 0x89, 0xD6, 0x8A, 0x9D, 0xE4, 0x9B, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x73, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA1, 0xE4, 0xAA, /* 0x14-0x17 */ + 0xE4, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x88, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB2, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x88, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA8, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE4, 0xA3, 0xE4, 0xA2, 0x00, 0x00, /* 0x30-0x33 */ + 0xE4, 0xA0, 0xE4, 0x9F, 0x92, 0x83, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0xF9, 0xE4, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE4, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x91, 0x90, 0x8C, 0x74, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x60, /* 0x4C-0x4F */ + 0xE4, 0xA6, 0x00, 0x00, 0x8D, 0x72, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x91, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x77, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB8, /* 0x70-0x73 */ + 0x00, 0x00, 0xE4, 0xB9, 0x00, 0x00, 0x89, 0xD7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xAC, /* 0x78-0x7B */ + 0xE4, 0xB6, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x78, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE4, 0xAC, 0x00, 0x00, 0xE4, 0xB4, /* 0x84-0x87 */ + 0x00, 0x00, 0xE4, 0xBB, 0xE4, 0xB5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB3, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x96, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB1, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAD, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xCE, 0xE4, 0xAF, /* 0x9C-0x9F */ + 0xE4, 0xBA, 0x00, 0x00, 0xE4, 0xB0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE4, 0xBC, 0x00, 0x00, 0xE4, 0xAE, 0x94, 0x9C, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x97, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE4, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x90, 0x9B, 0x00, 0x00, 0xEE, 0x79, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x65, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8B, 0xDB, 0x00, 0x00, 0xE4, 0xC0, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xD9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD2, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE4, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x8D, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x93, 0x70, /* 0xDC-0xDF */ + 0xE4, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x95, 0xEC, 0x00, 0x00, 0xE4, 0xBF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xD8, /* 0xEC-0xEF */ + 0x8C, 0xD4, 0x95, 0x48, 0xE4, 0xC9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xBD, 0x00, 0x00, 0xEE, 0x7A, 0xE4, 0xC6, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD0, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE4, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC2, /* 0x00-0x03 */ + 0x93, 0xB8, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC7, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC4, /* 0x08-0x0B */ + 0x96, 0x47, 0xE4, 0xCA, 0x88, 0xDE, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xBE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE4, 0xCC, 0x00, 0x00, 0xE4, 0xCB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x94, 0x8B, 0xE4, 0xD2, 0x00, 0x00, /* 0x28-0x2B */ + 0xE4, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8A, 0x9E, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE4, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xE4, 0xD3, 0x97, 0x8E, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDC, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0x7B, 0x97, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xA8, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x98, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x8B, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x95, 0x92, 0xE4, 0xE2, 0x93, 0x9F, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xAF, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE4, 0xDB, 0x00, 0x00, 0xE4, 0xD7, /* 0x68-0x6B */ + 0x91, 0x92, 0xE4, 0xD1, 0xE4, 0xD9, 0xE4, 0xDE, /* 0x6C-0x6F */ + 0x00, 0x00, 0x94, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x88, 0xA8, 0x00, 0x00, 0xE4, 0xD6, /* 0x74-0x77 */ + 0x00, 0x00, 0xE4, 0xDF, 0x95, 0x98, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDA, 0x00, 0x00, /* 0x80-0x83 */ + 0xE4, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x8E, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x96, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x95, 0x66, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE5, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE4, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x97, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xEE, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8F, 0xF6, 0xE4, 0xE3, 0x00, 0x00, 0xE4, 0xE8, /* 0xB8-0xBB */ + 0x91, 0x93, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE4, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE4, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x92, 0x7E, 0x00, 0x00, 0xE4, 0xEC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x97, 0x75, 0xE4, 0xE1, 0x8A, 0x57, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE4, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE4, 0xEA, 0x96, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0xE6, 0xE4, 0xE9, 0x00, 0x00, /* 0xD8-0xDB */ + 0xED, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x96, 0x48, 0x00, 0x00, 0x98, 0x40, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE4, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0x8E, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x95, 0xCC, 0x00, 0x00, 0x96, 0xA0, /* 0x10-0x13 */ + 0xE4, 0xF7, 0xE4, 0xF6, 0x00, 0x00, 0xE4, 0xF2, /* 0x14-0x17 */ + 0xE4, 0xF3, 0x00, 0x00, 0x89, 0x55, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE4, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xD3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE4, 0xF4, 0x88, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x91, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x95, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0xF9, 0xE5, 0x40, 0x00, 0x00, 0x94, 0xD7, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xFC, 0x8F, 0xD4, 0x8E, 0xC7, 0xE5, 0x42, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xBC, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x7D, /* 0x50-0x53 */ + 0x00, 0x00, 0xE5, 0x43, 0x00, 0x00, 0x95, 0x99, /* 0x54-0x57 */ + 0xE4, 0xFB, 0xEE, 0x7E, 0xE4, 0xD4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x98, 0x6E, 0x93, 0xA0, 0x95, 0x93, 0xEE, 0x80, /* 0x68-0x6B */ + 0x00, 0x00, 0xE5, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x50, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x51, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE5, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x94, 0x96, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x4E, /* 0x84-0x87 */ + 0xE5, 0x46, 0x00, 0x00, 0xE5, 0x48, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE5, 0x52, 0xE5, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE5, 0x4B, 0x00, 0x00, 0x00, 0x00, 0x89, 0x92, /* 0x94-0x97 */ + 0x00, 0x00, 0x93, 0xE3, 0x00, 0x00, 0xE5, 0x4C, /* 0x98-0x9B */ + 0xE5, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE5, 0x45, 0x00, 0x00, 0x91, 0x45, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE5, 0x49, 0x8E, 0x46, 0x90, 0x64, 0x8C, 0x4F, /* 0xA8-0xAB */ + 0x96, 0xF2, 0x00, 0x00, 0x96, 0xF7, 0x8F, 0x92, /* 0xAC-0xAF */ + 0xEE, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE5, 0x56, 0xE5, 0x54, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x98, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE5, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x97, 0x95, 0x00, 0x00, 0xE5, 0x55, /* 0xCC-0xCF */ + 0xE5, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE5, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE5, 0x5B, 0xE5, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x93, 0xA1, 0xE5, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x94, 0xCB, 0xE5, 0x4D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x93, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE5, 0x5C, 0xE5, 0x61, 0x91, 0x94, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x60, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x41, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x62, 0x91, 0x68, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x5D, 0xE5, 0x5F, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x5E, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x50, 0x9F, 0x41, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x64, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x63, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x97, 0x96, 0x00, 0x00, 0xE1, 0xBA, /* 0x2C-0x2F */ + 0xE5, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x66, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE5, 0x67, 0x8C, 0xD5, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE5, 0x69, 0x99, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x95, 0x00, 0x00, /* 0x58-0x5B */ + 0x97, 0xB8, 0x00, 0x00, 0x8B, 0xF1, 0xE5, 0x6A, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6B, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x8E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE5, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x93, 0xF8, 0x00, 0x00, 0x88, 0xB8, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xE1, 0xE5, 0x71, /* 0x88-0x8B */ + 0xE5, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6D, /* 0x90-0x93 */ + 0x00, 0x00, 0x8E, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6E, /* 0xA0-0xA3 */ + 0x94, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x7A, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x74, /* 0xAC-0xAF */ + 0xE5, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x73, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE5, 0x75, 0x00, 0x00, 0xE5, 0x76, 0x8E, 0xD6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE5, 0x78, 0x00, 0x00, 0x92, 0x60, /* 0xC8-0xCB */ + 0x00, 0x00, 0x8C, 0x75, 0x8A, 0x61, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE5, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8A, 0x5E, 0x00, 0x00, 0xE5, 0x81, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x7C, 0xE5, 0x80, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x94, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE5, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE5, 0x7E, 0x95, 0x67, 0x94, 0xD8, 0xE5, 0x82, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x91, 0xFB, 0xE5, 0x8C, 0x00, 0x00, 0xE5, 0x88, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xE9, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xE5, 0x86, 0x00, 0x00, 0x96, 0x49, 0xE5, 0x87, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x84, 0x00, 0x00, /* 0x04-0x07 */ + 0xE5, 0x85, 0xE5, 0x8A, 0xE5, 0x8D, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE5, 0x89, 0xE5, 0x83, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x92, 0x77, 0x00, 0x00, 0xE5, 0x94, 0x00, 0x00, /* 0x18-0x1B */ + 0x96, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE5, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE5, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x90, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x91, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x8F, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x90, 0xE4, 0x00, 0x00, 0x98, 0x58, /* 0x48-0x4B */ + 0xE5, 0x98, 0x00, 0x00, 0xE5, 0x99, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x9F, /* 0x50-0x53 */ + 0x00, 0x00, 0x90, 0x49, 0x00, 0x00, 0xE5, 0x9B, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0x9E, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x96, /* 0x5C-0x5F */ + 0xE5, 0x95, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA0, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xDA, 0x00, 0x00, /* 0x64-0x67 */ + 0xE5, 0x9C, 0x00, 0x00, 0xE5, 0xA1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x9D, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE5, 0x9A, 0x00, 0x00, 0x92, 0xB1, 0x00, 0x00, /* 0x74-0x77 */ + 0xE5, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x88, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA5, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x97, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA4, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA3, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA6, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x86, 0xE5, 0xB1, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE5, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE5, 0xAD, 0x00, 0x00, 0xE5, 0xB0, 0xE5, 0xAF, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA7, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE5, 0xAA, 0x00, 0x00, 0xE5, 0xBB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE5, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB8, 0xE5, 0xB9, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x8A, 0x49, 0x00, 0x00, 0x8B, 0x61, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE5, 0xA2, 0x00, 0x00, 0xEE, 0x85, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xB6, 0xE5, 0xBA, 0xE5, 0xB5, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE5, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE5, 0xBE, 0xE5, 0xBD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE5, 0xC0, 0xE5, 0xBF, 0xE5, 0x79, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC4, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE5, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE5, 0xC3, 0x00, 0x00, 0xE5, 0xC5, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x8C, 0x8C, 0x00, 0x00, 0xE5, 0xC7, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xC6, 0x00, 0x00, 0x8F, 0x4F, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8D, 0x73, 0x9F, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC8, 0x8F, 0x70, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x58, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xC9, 0x00, 0x00, 0x89, 0x71, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0xD5, 0xE5, 0xCA, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x8D, 0x74, 0xE5, 0xCB, 0x88, 0xDF, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x95, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCC, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x90, 0x8A, 0x00, 0x00, 0xE5, 0xD3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE5, 0xD0, 0x00, 0x00, 0x92, 0x8F, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE5, 0xD1, 0xE5, 0xCE, 0x8B, 0xDC, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE5, 0xCD, 0xE5, 0xD4, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x8C, 0x55, 0x00, 0x00, 0x00, 0x00, 0x91, 0xDC, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD6, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xB3, 0xE5, 0xD5, /* 0x94-0x97 */ + 0x00, 0x00, 0xE5, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCF, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD9, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE5, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xED, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE5, 0xDC, 0xE5, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x8C, 0xD1, 0xE5, 0xD2, 0x00, 0x00, 0x88, 0xBF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDD, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8D, 0xD9, 0x97, 0xF4, 0xE5, 0xDF, /* 0xC0-0xC3 */ + 0xE5, 0xE0, 0x91, 0x95, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xA0, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE5, 0xE1, 0x97, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE5, 0xE2, 0xE5, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x95, 0xE2, 0xE5, 0xE4, 0x00, 0x00, 0x8D, 0xBE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x97, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE5, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xEA, 0x8F, 0xD6, /* 0xF0-0xF3 */ + 0xE5, 0xE8, 0xEE, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x97, 0x87, 0xE5, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE5, 0xE7, 0x90, 0xBB, 0x90, 0x9E, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE6, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x95, 0xA1, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xED, 0x00, 0x00, /* 0x08-0x0B */ + 0xE5, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x8A, 0x8C, 0x00, 0x00, 0x96, 0x4A, 0xE5, 0xEE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xED, 0x41, 0xE5, 0xFA, 0xE5, 0xF0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF2, 0xE5, 0xF3, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF7, 0x00, 0x00, /* 0x34-0x37 */ + 0xE5, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE5, 0xF4, 0x00, 0x00, 0xE5, 0xEF, /* 0x40-0x43 */ + 0xE5, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE5, 0xF9, 0xE8, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xFC, 0x8B, 0xDD, /* 0x5C-0x5F */ + 0xE5, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE6, 0x41, 0x00, 0x00, 0xE6, 0x40, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x43, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE6, 0x42, 0x00, 0x00, 0xE6, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x50, 0x00, 0x00, /* 0x70-0x73 */ + 0xE6, 0x45, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x46, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x47, 0x90, 0xBC, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x97, 0x76, 0x00, 0x00, 0xE6, 0x48, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xA2, 0x94, 0x65, /* 0x84-0x87 */ + 0xE6, 0x49, 0x00, 0x00, 0xE6, 0x4A, 0x8C, 0xA9, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x4B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x4B, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8B, 0x94, 0x60, /* 0x94-0x97 */ + 0xE6, 0x4C, 0x00, 0x00, 0x8A, 0x6F, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE6, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x4F, 0x97, 0x97, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE6, 0x4E, 0x90, 0x65, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE6, 0x50, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x51, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x52, 0x8A, 0xCF, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x53, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE6, 0x54, 0x00, 0x00, 0xE6, 0x55, /* 0xBC-0xBF */ + 0xE6, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x70, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x57, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE6, 0x58, 0xE6, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xF0, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x47, 0xE6, 0x5A, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE6, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE6, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_8A[512] = { + 0x8C, 0xBE, 0x00, 0x00, 0x92, 0xF9, 0xE6, 0x5D, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8C, 0x76, 0x00, 0x00, 0x90, 0x75, 0x00, 0x00, /* 0x08-0x0B */ + 0xE6, 0x60, 0x00, 0x00, 0x93, 0xA2, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE6, 0x5F, 0x00, 0x00, 0xEE, 0x87, 0x8C, 0x50, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x5E, 0x91, 0xF5, /* 0x14-0x17 */ + 0x8B, 0x4C, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x61, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0x62, 0x00, 0x00, 0x8F, 0xD7, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x8D, /* 0x20-0x23 */ + 0x00, 0x00, 0xE6, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x4B, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x90, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8B, 0x96, 0x00, 0x00, 0x96, 0xF3, /* 0x30-0x33 */ + 0x91, 0x69, 0x00, 0x00, 0xE6, 0x64, 0xEE, 0x88, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x66, 0x92, 0x90, /* 0x38-0x3B */ + 0x8F, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x68, 0x00, 0x00, /* 0x44-0x47 */ + 0xE6, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8D, 0xBC, 0x91, 0xC0, 0xE6, 0x67, 0x00, 0x00, /* 0x50-0x53 */ + 0x8F, 0xD9, 0x95, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x66, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8C, 0x00, 0x00, /* 0x5C-0x5F */ + 0x89, 0x72, 0x00, 0x00, 0xE6, 0x6D, 0x8C, 0x77, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8E, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x8E, 0x8D, 0x00, 0x00, 0x98, 0x6C, /* 0x68-0x6B */ + 0xE6, 0x6C, 0xE6, 0x6B, 0x91, 0x46, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8B, 0x6C, 0x98, 0x62, 0x8A, 0x59, 0x8F, 0xDA, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xEE, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x6F, 0x00, 0x00, /* 0x80-0x83 */ + 0xE6, 0x70, 0xE6, 0x6E, 0x00, 0x00, 0x8C, 0xD6, /* 0x84-0x87 */ + 0x00, 0x00, 0x97, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8E, 0x8F, 0x94, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE6, 0x73, 0x00, 0x00, 0x90, 0xBE, /* 0x90-0x93 */ + 0x00, 0x00, 0x92, 0x61, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x97, 0x55, 0x00, 0x00, 0xE6, 0x76, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xEA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x90, 0xBD, 0xE6, 0x72, 0x00, 0x00, 0xE6, 0x77, /* 0xA0-0xA3 */ + 0x8C, 0xEB, 0xE6, 0x74, 0xE6, 0x75, 0xEE, 0x8A, /* 0xA4-0xA7 */ + 0xE6, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x90, 0xE0, 0x93, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x92, 0x4E, 0x00, 0x00, 0x89, 0xDB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x94, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x8B, 0x62, 0x00, 0x00, 0xEE, 0x8B, 0x92, 0xB2, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7A, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE6, 0x78, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6B, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xBF, /* 0xC8-0xCB */ + 0x8A, 0xD0, 0xE6, 0x79, 0x00, 0x00, 0x90, 0x7A, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xC8, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x5F, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7B, 0xE6, 0x87, /* 0xD8-0xDB */ + 0x92, 0xB3, 0x00, 0x00, 0xE6, 0x86, 0xEE, 0x8C, /* 0xDC-0xDF */ + 0xE6, 0x83, 0xE6, 0x8B, 0xE6, 0x84, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE6, 0x80, 0x00, 0x00, 0x92, 0xFA, 0xE6, 0x7E, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7C, /* 0xE8-0xEB */ + 0x00, 0x00, 0x97, 0x40, 0x8E, 0x90, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE6, 0x81, 0x00, 0x00, 0xE6, 0x7D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x8E, 0xE6, 0x85, /* 0xF4-0xF7 */ + 0x8F, 0x94, 0x00, 0x00, 0x8C, 0xBF, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xF8, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0x96, 0x64, 0x89, 0x79, 0x88, 0xE0, 0x00, 0x00, /* 0x00-0x03 */ + 0x93, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x89, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE6, 0x88, 0x00, 0x00, 0x93, 0xE4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE6, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE6, 0x82, 0x00, 0x00, 0xE6, 0x8C, 0xE6, 0x8E, /* 0x14-0x17 */ + 0x00, 0x00, 0x8C, 0xAA, 0xE6, 0x8A, 0x8D, 0x75, /* 0x18-0x1B */ + 0x00, 0x00, 0x8E, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE6, 0x8F, 0x97, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x92, 0x00, 0x00, /* 0x24-0x27 */ + 0xE6, 0x95, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x93, /* 0x28-0x2B */ + 0x95, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x90, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8B, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x94, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE6, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE6, 0x97, 0x00, 0x00, 0xE6, 0x99, 0xE6, 0x98, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x8F, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9B, 0x00, 0x00, /* 0x54-0x57 */ + 0x8E, 0xAF, 0x00, 0x00, 0xE6, 0x9D, 0xE6, 0x9C, /* 0x58-0x5B */ + 0x95, 0x88, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9F, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x78, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9E, /* 0x68-0x6B */ + 0xE6, 0xA0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA1, /* 0x6C-0x6F */ + 0x8B, 0x63, 0xE3, 0xBF, 0x8F, 0xF7, 0x00, 0x00, /* 0x70-0x73 */ + 0xE6, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xEC, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE6, 0xA3, 0x00, 0x00, 0xEE, 0x90, /* 0x7C-0x7F */ + + 0xE6, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x5D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCC, 0x00, 0x00, /* 0x88-0x8B */ + 0xE6, 0xA5, 0x00, 0x00, 0xE6, 0xA6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x51, 0x00, 0x00, 0xE6, 0xA7, 0xE6, 0xA8, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA9, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE6, 0xAA, 0xE6, 0xAB, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x4A, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAE, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xA4, 0x00, 0x00, /* 0x44-0x47 */ + 0xE6, 0xAF, 0x00, 0x00, 0x96, 0x4C, 0x00, 0x00, /* 0x48-0x4B */ + 0xE6, 0xB0, 0x00, 0x00, 0xE6, 0xB1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE6, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE6, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xD8, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x8F, 0xDB, 0xE6, 0xB4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x8B, 0x98, 0xAC, /* 0x68-0x6B */ + 0xE6, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE6, 0xB6, 0x95, 0x5E, 0xE6, 0xB7, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB8, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE6, 0xB9, 0xE6, 0xBB, 0x00, 0x00, /* 0x88-0x8B */ + 0x96, 0x65, 0xE6, 0xBC, 0xE6, 0xBD, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE6, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE6, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x8A, 0x4C, 0x92, 0xE5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x95, 0x89, 0x8D, 0xE0, 0x8D, 0x76, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x6E, /* 0xA4-0xA7 */ + 0x89, 0xDD, 0x94, 0xCC, 0xE6, 0xC3, 0x8A, 0xD1, /* 0xA8-0xAB */ + 0x90, 0xD3, 0xE6, 0xC2, 0xE6, 0xC7, 0x92, 0x99, /* 0xAC-0xAF */ + 0x96, 0xE1, 0x00, 0x00, 0xE6, 0xC5, 0xE6, 0xC6, /* 0xB0-0xB3 */ + 0x8B, 0x4D, 0x00, 0x00, 0xE6, 0xC8, 0x94, 0x83, /* 0xB4-0xB7 */ + 0x91, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x94, 0xEF, /* 0xB8-0xBB */ + 0x93, 0x5C, 0xE6, 0xC4, 0x00, 0x00, 0x96, 0x66, /* 0xBC-0xBF */ + 0x89, 0xEA, 0xE6, 0xCA, 0x98, 0x47, 0x92, 0xC0, /* 0xC0-0xC3 */ + 0x98, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x91, /* 0xC4-0xC7 */ + 0xE6, 0xC9, 0x00, 0x00, 0x91, 0xAF, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE6, 0xDA, 0x91, 0x47, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x93, 0xF6, 0x00, 0x00, 0x95, 0x6F, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xCD, 0x8E, 0x5E, /* 0xD8-0xDB */ + 0x8E, 0x92, 0x00, 0x00, 0x8F, 0xDC, 0x00, 0x00, /* 0xDC-0xDF */ + 0x94, 0x85, 0x00, 0x00, 0x8C, 0xAB, 0xE6, 0xCC, /* 0xE0-0xE3 */ + 0xE6, 0xCB, 0x00, 0x00, 0x95, 0x8A, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x93, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEE, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEE, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xCF, 0xE6, 0xD0, /* 0xF8-0xFB */ + 0x8D, 0x77, 0xE6, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE6, 0xD1, 0xE6, 0xD2, 0x00, 0x00, 0xE6, 0xD4, /* 0x04-0x07 */ + 0x91, 0xA1, 0x00, 0x00, 0xE6, 0xD3, 0x8A, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0xE6, 0xD6, 0x00, 0x00, 0xE6, 0xD5, /* 0x0C-0x0F */ + 0xE6, 0xD7, 0x00, 0x00, 0xEE, 0x93, 0xE6, 0xD9, /* 0x10-0x13 */ + 0xE6, 0xDB, 0x00, 0x00, 0xE6, 0xDC, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x90, 0xD4, 0x00, 0x00, 0x8E, 0xCD, 0xE6, 0xDD, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x71, /* 0x68-0x6B */ + 0x00, 0x00, 0xE6, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x91, 0x96, 0xE6, 0xDF, 0x00, 0x00, 0xE6, 0xE0, /* 0x70-0x73 */ + 0x95, 0x8B, 0x00, 0x00, 0xEE, 0x94, 0x8B, 0x4E, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE6, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x92, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7A, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE6, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xEF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x90, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xAB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE5, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE4, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE3, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xEB, /* 0xC8-0xCB */ + 0xE6, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE6, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, 0xE6, 0xEA, /* 0xD8-0xDB */ + 0x00, 0x00, 0x8B, 0x97, 0x00, 0x00, 0xE6, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x90, 0xD5, 0x00, 0x00, 0xE6, 0xEF, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8C, 0xD7, 0x00, 0x00, 0xE6, 0xEC, 0xE6, 0xED, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x48, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xB5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x91, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE6, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE6, 0xF1, 0xE6, 0xF2, 0x97, 0x78, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xA5, /* 0x0C-0x0F */ + 0xE6, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0xF4, 0xE6, 0xF5, 0xE6, 0xF7, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x48, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE6, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE6, 0xFB, 0xE6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xF8, 0x00, 0x00, /* 0x40-0x43 */ + 0x92, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x40, /* 0x44-0x47 */ + 0xE7, 0x44, 0xE7, 0x41, 0xE6, 0xFC, 0x00, 0x00, /* 0x48-0x4B */ + 0xE7, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE7, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE7, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xD6, /* 0x5C-0x5F */ + 0xE7, 0x47, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x49, /* 0x60-0x63 */ + 0xE7, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4C, 0x00, 0x00, /* 0x70-0x73 */ + 0x8F, 0x52, 0x00, 0x00, 0xE7, 0x4B, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE7, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE7, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE7, 0x51, 0xE7, 0x50, 0x00, 0x00, 0xE7, 0x4F, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x53, 0xE7, 0x52, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE7, 0x55, 0x00, 0x00, 0xE7, 0x54, /* 0x90-0x93 */ + 0xE7, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE7, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE7, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x58, 0x90, 0x67, /* 0xA8-0xAB */ + 0xE7, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xEB, /* 0xAC-0xAF */ + 0xE7, 0x5B, 0xE7, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x5E, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE7, 0x5F, 0xE7, 0x5C, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE7, 0x60, 0x00, 0x00, 0x8E, 0xD4, 0xE7, 0x61, /* 0xC8-0xCB */ + 0x8B, 0x4F, 0x8C, 0x52, 0x00, 0x00, 0xEE, 0x96, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xAC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x62, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x5D, 0xE7, 0x63, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x66, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8E, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x65, /* 0xF8-0xFB */ + 0xE7, 0x64, 0x8C, 0x79, 0xE7, 0x67, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x72, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8D, 0xDA, 0xE7, 0x68, 0x00, 0x00, /* 0x08-0x0B */ + 0xE7, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x6B, 0xE7, 0x6D, /* 0x10-0x13 */ + 0x95, 0xE3, 0xE7, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0x6C, 0x00, 0x00, 0xE7, 0x70, /* 0x18-0x1B */ + 0xE7, 0x6E, 0x8B, 0x50, 0x00, 0x00, 0xE7, 0x6F, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x72, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x94, 0x79, 0x97, 0xD6, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x53, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x73, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x97, 0x41, 0xE7, 0x75, 0x00, 0x00, 0xE7, 0x74, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x78, 0x97, 0x60, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x77, 0x00, 0x00, /* 0x40-0x43 */ + 0x8A, 0x8D, 0xE7, 0x76, 0xE7, 0x7B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE7, 0x79, 0x93, 0x51, 0xE7, 0x7C, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x7D, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE7, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x8C, /* 0x5C-0x5F */ + 0x00, 0x00, 0x8C, 0x44, 0xE7, 0x80, 0xE7, 0x81, /* 0x60-0x63 */ + 0xE7, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x68, /* 0x98-0x9B */ + 0xE7, 0x83, 0x00, 0x00, 0x8E, 0xAB, 0xE7, 0x84, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x85, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x9F, /* 0xA4-0xA7 */ + 0x99, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE7, 0x86, 0xE3, 0x90, 0xE7, 0x87, /* 0xAC-0xAF */ + 0x92, 0x43, 0x90, 0x4A, 0x94, 0x5F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x88, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xD3, 0x92, 0xD2, /* 0xB8-0xBB */ + 0x8D, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x92, 0x48, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x49, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x96, 0x98, 0x90, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x7D, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8B, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x95, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x89, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x8B, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE7, 0x8A, 0x89, 0xDE, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x93, 0xF4, 0xE7, 0x8C, 0x94, 0x97, /* 0xE8-0xEB */ + 0x00, 0x00, 0x93, 0x52, 0x00, 0x00, 0xE7, 0x8D, /* 0xEC-0xEF */ + 0x8F, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE7, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x96, 0xC0, /* 0xF4-0xF7 */ + 0xE7, 0x9E, 0xE7, 0x91, 0xE7, 0x92, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x92, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0x91, 0xDE, 0x91, 0x97, 0x00, 0x00, 0x93, 0xA6, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0x90, 0x8B, 0x74, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x99, /* 0x08-0x0B */ + 0x00, 0x00, 0xE7, 0x96, 0xE7, 0xA3, 0x93, 0xA7, /* 0x0C-0x0F */ + 0x92, 0x80, 0xE7, 0x93, 0x00, 0x00, 0x92, 0xFC, /* 0x10-0x13 */ + 0x93, 0x72, 0xE7, 0x94, 0xE7, 0x98, 0x90, 0x80, /* 0x14-0x17 */ + 0x00, 0x00, 0x94, 0x87, 0x92, 0xCA, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x90, 0xC0, 0xE7, 0x97, 0x91, 0xAC, /* 0x1C-0x1F */ + 0x91, 0xA2, 0xE7, 0x95, 0x88, 0xA7, 0x98, 0x41, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x9A, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xDF, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0x54, 0x90, 0x69, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE7, 0x9C, 0xE7, 0x9B, 0x00, 0x00, /* 0x34-0x37 */ + 0x88, 0xED, 0xE7, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x95, 0x4E, 0x00, 0x00, 0xE7, 0xA5, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x93, 0xD9, 0x90, 0x8B, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x92, 0x78, 0x00, 0x00, 0x8B, 0xF6, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0xA4, 0x97, 0x56, 0x89, 0x5E, /* 0x48-0x4B */ + 0x00, 0x00, 0x95, 0xD5, 0x89, 0xDF, 0xE7, 0x9F, /* 0x4C-0x4F */ + 0xE7, 0xA0, 0xE7, 0xA1, 0xE7, 0xA2, 0x93, 0xB9, /* 0x50-0x53 */ + 0x92, 0x42, 0x88, 0xE1, 0xE7, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0xE7, 0xA7, 0xEA, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x91, 0xBB, 0x00, 0x00, 0xE7, 0xA8, 0x00, 0x00, /* 0x5C-0x5F */ + 0x89, 0x93, 0x91, 0x6B, 0x00, 0x00, 0x8C, 0xAD, /* 0x60-0x63 */ + 0x00, 0x00, 0x97, 0x79, 0x00, 0x00, 0xEE, 0x99, /* 0x64-0x67 */ + 0xE7, 0xA9, 0x93, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x91, 0x98, 0x8E, 0xD5, 0xE7, 0xAA, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xAD, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8F, 0x85, 0xE7, 0xAB, 0x91, 0x4A, /* 0x74-0x77 */ + 0x91, 0x49, 0x00, 0x00, 0x88, 0xE2, 0x00, 0x00, /* 0x78-0x7B */ + 0x97, 0xC9, 0xE7, 0xAF, 0x00, 0x00, 0x94, 0xF0, /* 0x7C-0x7F */ + + 0xE7, 0xB1, 0xE7, 0xB0, 0xE7, 0xAE, 0xE2, 0x84, /* 0x80-0x83 */ + 0x8A, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x8E, /* 0x84-0x87 */ + 0x00, 0x00, 0xE7, 0xB3, 0xE7, 0xB2, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB4, /* 0x8C-0x8F */ + 0x00, 0x00, 0x97, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xDF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x4D, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE7, 0xB5, 0x00, 0x00, 0x8E, 0xD7, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE7, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x93, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x88, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x78, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x59, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBC, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x9A, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8C, 0x53, 0xE7, 0xB9, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE7, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x95, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x8A, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x97, 0x58, 0x00, 0x00, 0x8B, 0xBD, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x93, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBD, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xEE, 0x9C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x9D, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x93, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE7, 0xC1, 0x00, 0x00, 0xE7, 0xC0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x93, 0xD1, 0xE7, 0xC2, 0x8F, 0x55, /* 0x48-0x4B */ + 0x8E, 0xDE, 0x94, 0x7A, 0x92, 0x91, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF0, 0x00, 0x00, /* 0x50-0x53 */ + 0x90, 0x8C, 0x00, 0x00, 0xE7, 0xC3, 0x00, 0x00, /* 0x54-0x57 */ + 0xE7, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x7C, 0xE7, 0xC5, /* 0x60-0x63 */ + 0x00, 0x00, 0xE7, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE7, 0xC7, 0x97, 0x8F, 0x00, 0x00, /* 0x68-0x6B */ + 0x8F, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xC9, 0xE7, 0xC8, /* 0x70-0x73 */ + 0x00, 0x00, 0x8D, 0x79, 0x00, 0x00, 0x8D, 0x93, /* 0x74-0x77 */ + 0x8E, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCC, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x86, /* 0x84-0x87 */ + 0x00, 0x00, 0xE7, 0xCB, 0x00, 0x00, 0xE7, 0xCA, /* 0x88-0x8B */ + 0x00, 0x00, 0x91, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8C, 0xED, 0x00, 0x00, 0x90, 0xC1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xAE, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x8F, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCD, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8F, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD0, 0xE7, 0xCE, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCF, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE7, 0xD2, 0xE7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8F, 0xF8, 0x00, 0x00, 0xE7, 0xD3, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE7, 0xD4, 0xE7, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xCE, 0x8D, 0xD1, /* 0xC4-0xC7 */ + 0x8E, 0xDF, 0xE7, 0xD6, 0x00, 0x00, 0xE7, 0xD7, /* 0xC8-0xCB */ + 0x97, 0xA2, 0x8F, 0x64, 0x96, 0xEC, 0x97, 0xCA, /* 0xCC-0xCF */ + 0xE7, 0xD8, 0x8B, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD9, 0xEE, 0x9F, /* 0xD4-0xD7 */ + 0x93, 0x42, 0x00, 0x00, 0xEE, 0x9E, 0xE7, 0xDC, /* 0xD8-0xDB */ + 0x8A, 0x98, 0x90, 0x6A, 0xEE, 0xA0, 0xE7, 0xDA, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE7, 0xDB, 0x00, 0x00, 0x92, 0xDE, /* 0xE0-0xE3 */ + 0xEE, 0xA3, 0xEE, 0xA4, 0x96, 0x74, 0x8B, 0xFA, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEE, 0xA1, 0xEE, 0xA2, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE7, 0xDE, 0xE7, 0xDF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE7, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA5, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x93, 0xDD, 0x8A, 0x62, 0x00, 0x00, /* 0x0C-0x0F */ + 0xEE, 0xA6, 0xE7, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE7, 0xE2, 0xE7, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE8, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE7, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x97, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xD8, /* 0x34-0x37 */ + 0x00, 0x00, 0xEE, 0xAE, 0xEE, 0xA8, 0x00, 0x00, /* 0x38-0x3B */ + 0xEE, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xED, /* 0x3C-0x3F */ + 0xEE, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x93, 0x53, 0xE7, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE7, 0xEB, 0xE7, 0xE9, 0x00, 0x00, 0xE7, 0xEE, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xAB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0xEF, 0xEE, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE7, /* 0x54-0x57 */ + 0x00, 0x00, 0xEE, 0xAC, 0xE7, 0xF4, 0x89, 0x94, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xAB, 0x00, 0x00, /* 0x60-0x63 */ + 0xE7, 0xEA, 0x00, 0x00, 0x8F, 0xDE, 0xEE, 0xAF, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8D, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB1, /* 0x74-0x77 */ + 0xEE, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x67, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8B, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x65, /* 0x80-0x83 */ + 0x00, 0x00, 0x93, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xED, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x91, 0x4C, 0x00, 0x00, 0xE7, 0xF2, /* 0x90-0x93 */ + 0x00, 0x00, 0xE7, 0xEC, 0xE7, 0xF1, 0x00, 0x00, /* 0x94-0x97 */ + 0x96, 0xC1, 0x00, 0x00, 0x92, 0xB6, 0xE7, 0xF3, /* 0x98-0x9B */ + 0xE7, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x91, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF7, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE7, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF5, /* 0xCC-0xCF */ + 0xEE, 0xB6, 0x00, 0x00, 0x96, 0x4E, 0xEE, 0xBA, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xEE, 0xB8, 0x00, 0x00, 0xEE, 0xB4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEE, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEE, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x8F, 0x9B, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB3, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE7, 0xF8, 0x95, 0xDD, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x89, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x65, 0x92, 0x92, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8B, 0x98, 0xED, 0x49, 0xE7, 0xFA, 0xEE, 0xBD, /* 0xF8-0xFB */ + 0x8D, 0x7C, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC2, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4B, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF9, /* 0x0C-0x0F */ + 0x90, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x90, 0x8E, 0xE8, 0x40, 0xE8, 0x42, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xEE, 0xC1, 0xEE, 0xBF, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8F, 0xF9, 0xEE, 0xBC, 0xE8, 0x41, 0xE8, 0x43, /* 0x20-0x23 */ + 0x00, 0x00, 0xEE, 0xBB, 0x8B, 0xD1, 0x00, 0x00, /* 0x24-0x27 */ + 0x95, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xE0, /* 0x28-0x2B */ + 0x98, 0x42, 0x00, 0x00, 0xE7, 0xFC, 0x8D, 0xF6, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x5E, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE8, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x44, 0xE8, 0x46, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE7, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xED, 0x42, 0x00, 0x00, 0x00, 0x00, 0x93, 0xE7, /* 0x48-0x4B */ + 0x00, 0x00, 0x93, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x92, 0xD5, 0x00, 0x00, 0xE8, 0x4B, 0xEE, 0xC4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x62, /* 0x58-0x5B */ + 0xE8, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x8C, 0x4C, 0x00, 0x00, 0xE8, 0x4A, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEE, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8C, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0x49, 0x00, 0x00, 0x8F, 0xDF, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8A, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE8, 0x4F, 0x00, 0x00, 0x8D, 0xBD, 0x91, 0x99, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xC8, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEE, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x5A, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE8, 0x4D, 0xE8, 0x4E, 0x92, 0xC1, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE8, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE8, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x56, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE8, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE8, 0x58, 0x93, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x51, 0xE8, 0x52, /* 0xD4-0xD7 */ + 0xE8, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE8, 0x57, 0xEE, 0xC7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8B, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE8, 0x5A, 0xE8, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE8, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEE, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_94[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5E, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5F, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE8, 0x60, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5D, /* 0x10-0x13 */ + 0xE8, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x8F, 0xE0, 0x93, 0xA8, 0xE8, 0x5B, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x62, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xEE, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE8, 0x63, 0xE8, 0x61, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0xF6, 0x00, 0x00, 0xE8, 0x65, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE8, 0x68, 0xEE, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x8A, 0xD3, 0xE8, 0x67, 0x96, 0xF8, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x73, 0xE8, 0x69, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x6C, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0x6A, 0x00, 0x00, 0xE8, 0x6B, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x6D, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE8, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE8, 0x70, 0x00, 0x00, 0xE8, 0x71, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0x74, 0xE8, 0x72, 0xE8, 0x75, 0xE8, 0x77, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE8, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ +}; + +static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xB7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x96, 0xE5, 0x00, 0x00, 0xE8, 0x78, 0x91, 0x4D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x79, /* 0x84-0x87 */ + 0x00, 0x00, 0x95, 0xC2, 0xE8, 0x7A, 0x8A, 0x4A, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x5B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8A, 0xD5, 0xEE, 0xCC, 0x8A, 0xD4, /* 0x90-0x93 */ + 0xE8, 0x7B, 0x00, 0x00, 0xE8, 0x7C, 0x00, 0x00, /* 0x94-0x97 */ + 0xE8, 0x7D, 0xE8, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE8, 0x80, 0x00, 0x00, 0x8A, 0xD6, 0x8A, 0x74, /* 0xA0-0xA3 */ + 0x8D, 0x7D, 0x94, 0xB4, 0x00, 0x00, 0xE8, 0x82, /* 0xA4-0xA7 */ + 0xE8, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE8, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7B, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE8, 0x86, 0x00, 0x00, 0xE8, 0x85, /* 0xB8-0xBB */ + 0xE8, 0x84, 0x00, 0x00, 0xE8, 0x87, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x8A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xC5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x88, 0x00, 0x00, /* 0xC8-0xCB */ + 0xE8, 0x8C, 0xE8, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE8, 0x8E, 0xE8, 0x8D, 0xE8, 0x8F, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE8, 0x91, 0xE8, 0x93, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x95, 0x8C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE8, 0x95, 0x00, 0x00, 0x8D, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x96, 0xE8, 0x97, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x68, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x6A, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xA2, /* 0x3C-0x3F */ + 0x91, 0xC9, 0x00, 0x00, 0xE8, 0x98, 0x00, 0x00, /* 0x40-0x43 */ + 0x95, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x9B, /* 0x48-0x4B */ + 0xE8, 0x99, 0x8D, 0x7E, 0x00, 0x00, 0xE8, 0x9A, /* 0x4C-0x4F */ + 0x8C, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC3, /* 0x58-0x5B */ + 0xE8, 0x9D, 0xE8, 0x9F, 0xE8, 0x9E, 0xE8, 0xA0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x40, 0x90, 0x77, /* 0x60-0x63 */ + 0x8F, 0x9C, 0x8A, 0xD7, 0xE8, 0xA1, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x86, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x89, 0x41, 0x00, 0x00, 0xE8, 0xA2, 0x92, 0xC2, /* 0x70-0x73 */ + 0x00, 0x00, 0x97, 0xCB, 0x93, 0xA9, 0xE8, 0x9C, /* 0x74-0x77 */ + 0x97, 0xA4, 0x00, 0x00, 0x8C, 0xAF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x97, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x8B, 0xF7, 0x97, 0xB2, 0x00, 0x00, /* 0x84-0x87 */ + 0x8C, 0x47, 0x00, 0x00, 0x91, 0xE0, 0xE4, 0x40, /* 0x88-0x8B */ + 0x00, 0x00, 0xE8, 0xA4, 0x8A, 0x4B, 0x90, 0x8F, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x8A, 0x75, 0xE8, 0xA6, 0x00, 0x00, 0xE8, 0xA7, /* 0x94-0x97 */ + 0xE8, 0xA5, 0x8C, 0x84, 0x00, 0x00, 0x8D, 0xDB, /* 0x98-0x9B */ + 0x8F, 0xE1, 0xEE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x89, 0x42, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD7, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA9, /* 0xA4-0xA7 */ + 0xE7, 0xAC, 0x00, 0x00, 0xE8, 0xA8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xD0, /* 0xAC-0xAF */ + 0xE8, 0xAC, 0xE8, 0xAA, 0xE8, 0xAB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE8, 0xAD, 0x00, 0x00, 0xE8, 0xAE, 0x97, 0xEA, /* 0xB4-0xB7 */ + 0xE8, 0xAF, 0xE8, 0xB0, 0x00, 0x00, 0x90, 0xC7, /* 0xB8-0xBB */ + 0x94, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x90, 0x9D, 0x8A, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x97, 0x59, 0x89, 0xEB, 0x8F, 0x57, 0x8C, 0xD9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE8, 0xB3, 0x00, 0x00, 0xE8, 0xB2, /* 0xC8-0xCB */ + 0x8E, 0x93, 0xE8, 0xB4, 0xE8, 0xB1, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8E, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE8, 0xB8, 0xE5, 0xAB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x99, 0xD4, 0x00, 0x00, 0x90, 0x97, /* 0xD8-0xDB */ + 0xE8, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xA3, 0x93, 0xEF, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x89, 0x4A, 0x00, 0x00, 0x90, 0xE1, 0x8E, 0xB4, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x95, 0xB5, 0x00, 0x00, 0x89, 0x5F, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xEB, 0x97, 0x8B, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE8, 0xB9, 0x00, 0x00, 0x93, 0x64, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_97[512] = { + 0x8E, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE8, 0xBA, 0x00, 0x00, 0xE8, 0xBB, 0x90, 0x6B, /* 0x04-0x07 */ + 0xE8, 0xBC, 0x00, 0x00, 0x97, 0xEC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE8, 0xB7, 0xE8, 0xBE, 0xE8, 0xC0, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE8, 0xBF, 0x00, 0x00, 0xE8, 0xBD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC1, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE8, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x91, 0x9A, 0x00, 0x00, 0x89, 0xE0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE8, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB6, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC4, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE8, 0xC5, 0x00, 0x00, 0x98, 0x49, 0xEE, 0xD1, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9E, 0x50, 0xE8, 0xC6, 0x00, 0x00, 0xEE, 0xD2, /* 0x38-0x3B */ + 0x00, 0x00, 0xE8, 0xC7, 0xE8, 0xC8, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCC, 0xEE, 0xD3, /* 0x40-0x43 */ + 0xE8, 0xC9, 0x00, 0x00, 0xE8, 0xCA, 0x00, 0x00, /* 0x44-0x47 */ + 0xE8, 0xCB, 0xE8, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xD4, 0x00, 0x00, 0xEE, 0xD5, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEE, 0xD6, 0x90, 0xC2, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xEE, 0xD7, 0x96, 0xF5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x90, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE8, 0xCE, 0x00, 0x00, 0x94, 0xF1, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0xCF, 0xEA, 0x72, 0x96, 0xCA, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xD0, 0x00, 0x00, 0xE8, 0xD1, 0x00, 0x00, /* 0x64-0x67 */ + 0xE8, 0xD2, 0x8A, 0x76, 0x00, 0x00, 0xE8, 0xD4, /* 0x68-0x6B */ + 0x00, 0x00, 0x90, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE8, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x8C, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE8, 0xD6, 0xE8, 0xDA, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE8, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x8A, 0x93, 0xE8, 0xD7, 0xE8, 0xDB, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xDC, /* 0x88-0x8B */ + 0x00, 0x00, 0x88, 0xC6, 0x00, 0x00, 0xE8, 0xDD, /* 0x8C-0x8F */ + 0xE8, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x8F, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xE8, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0x66, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE2, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE1, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE8, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x91, /* 0xA8-0xAB */ + 0x00, 0x00, 0x95, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE3, /* 0xB0-0xB3 */ + 0xE8, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE5, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE8, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE8, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xD8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE8, 0xEA, 0x94, 0x42, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xEC, 0x89, 0xB9, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE8, 0xEF, 0xE8, 0xEE, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x43, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xBF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0x00, 0x00, 0x95, 0xC5, 0x92, 0xB8, 0x8D, 0xA0, /* 0x00-0x03 */ + 0x00, 0x00, 0x8D, 0x80, 0x8F, 0x87, 0x00, 0x00, /* 0x04-0x07 */ + 0x90, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE8, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF0, /* 0x0C-0x0F */ + 0x97, 0x61, 0x8A, 0xE6, 0x94, 0xD0, 0x93, 0xDA, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x9C, /* 0x14-0x17 */ + 0x97, 0xCC, 0x00, 0x00, 0x8C, 0x7A, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE8, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x96, 0x6A, 0x93, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x89, 0x6F, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF5, /* 0x34-0x37 */ + 0xE8, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x95, 0x70, /* 0x38-0x3B */ + 0x97, 0x8A, 0xE8, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF7, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF9, /* 0x48-0x4B */ + 0x91, 0xE8, 0x8A, 0x7A, 0x8A, 0x7B, 0xE8, 0xF8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8A, 0xE7, 0x8C, 0xB0, 0x00, 0x00, 0xEE, 0xD8, /* 0x54-0x57 */ + 0x8A, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x93, 0x5E, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xDE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEE, 0xD9, 0x00, 0x00, 0x8C, 0xDA, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xFA, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xFB, /* 0x6C-0x6F */ + 0xE8, 0xFC, 0xE9, 0x40, 0x00, 0x00, 0xE9, 0x42, /* 0x70-0x73 */ + 0xE9, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x95, 0x97, 0x00, 0x00, 0xE9, 0x43, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x44, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE9, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x46, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x48, /* 0xC0-0xC3 */ + 0xE9, 0x47, 0x00, 0x00, 0xE9, 0x49, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xF2, /* 0xD8-0xDB */ + 0xE3, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x90, 0x48, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x51, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE9, 0x4A, 0x00, 0x00, 0xE9, 0x4B, /* 0xE8-0xEB */ + 0x00, 0x00, 0x99, 0xAA, 0x9F, 0x5A, 0x94, 0xD1, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xF9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x88, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x8E, 0x94, 0x96, 0x4F, 0x8F, 0xFC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x4C, /* 0x00-0x03 */ + 0x00, 0x00, 0x96, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE9, 0x4D, 0x97, 0x7B, 0x00, 0x00, /* 0x08-0x0B */ + 0x89, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x8E, 0x60, 0x00, 0x00, 0xE9, 0x4E, 0x89, 0xEC, /* 0x10-0x13 */ + 0xE9, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE9, 0x52, 0xE9, 0x53, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0x55, 0xE9, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0x54, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDC, /* 0x24-0x27 */ + 0x8A, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE9, 0x56, 0x00, 0x00, 0xE9, 0x57, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE9, 0x58, 0xE9, 0x59, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x5A, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE9, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE9, 0x5B, 0x00, 0x00, 0xE9, 0x5E, /* 0x48-0x4B */ + 0xE9, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE9, 0x5D, 0xE9, 0x5F, 0xE9, 0x60, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE9, 0x62, 0x00, 0x00, 0x8B, 0xC0, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF1, 0xE9, 0x63, /* 0x94-0x97 */ + 0xE9, 0x64, 0x8D, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE9, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8A, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x94, 0x6E, 0xE9, 0x66, 0xE9, 0x67, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x79, /* 0xB0-0xB3 */ + 0x93, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x94, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x91, 0xCA, 0x89, 0x77, 0x8B, 0xEC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x8B, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x92, 0x93, 0xE9, 0x6D, 0x8B, 0xEE, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x89, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE9, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x6A, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE9, 0x6B, 0x00, 0x00, 0xE9, 0x69, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x77, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE9, 0x6E, 0xE9, 0x6F, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE9, 0x70, 0xE9, 0x71, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE9, 0x73, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x72, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x78, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xE9, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE9, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x52, 0xE9, 0x75, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x9B, 0x8C, 0xB1, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE9, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x91, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x79, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x93, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x7A, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x80, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE9, 0x7D, 0x00, 0x00, 0xE9, 0x7C, 0xE9, 0x7E, /* 0x40-0x43 */ + 0x00, 0x00, 0xE9, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0x82, 0xEE, 0xDF, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE9, 0x81, 0x00, 0x00, 0xE9, 0x84, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xC1, 0xE9, 0x83, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x85, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x86, 0x00, 0x00, /* 0x60-0x63 */ + 0xE9, 0x88, 0xE9, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE9, 0x89, 0xE9, 0x8B, 0xE9, 0x8A, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8D, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE9, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE9, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8A, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x90, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x90, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0x91, 0x00, 0x00, 0xE9, 0x92, /* 0xD0-0xD3 */ + 0xE9, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x8D, 0x82, 0xEE, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xEE, 0xE1, 0x00, 0x00, 0xE9, 0x94, 0xE9, 0x95, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x96, 0xE9, 0x97, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x98, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xAF, 0xE9, 0x9A, /* 0xE8-0xEB */ + 0x00, 0x00, 0x95, 0x45, 0xE9, 0x9B, 0xE9, 0x99, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE9, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE9, 0x9C, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x9E, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x9F, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA0, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE9, 0xA1, 0x00, 0x00, 0xE9, 0xA2, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA3, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA4, 0xE9, 0xA5, /* 0x20-0x23 */ + 0x00, 0x00, 0xE9, 0xA6, 0x00, 0x00, 0xE9, 0xA7, /* 0x24-0x27 */ + 0xE9, 0xA8, 0xE9, 0xA9, 0xE9, 0xAA, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xAB, 0xE9, 0xAC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9F, 0x54, 0xE9, 0xAD, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF6, /* 0x38-0x3B */ + 0x8B, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x8A, 0x40, 0x8D, 0xB0, 0xE9, 0xAF, /* 0x40-0x43 */ + 0xE9, 0xAE, 0x96, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0xB1, 0xE9, 0xB2, 0xE9, 0xB0, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE9, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x96, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE9, 0xB4, 0x00, 0x00, 0x8B, 0x9B, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE3, 0x00, 0x00, /* 0x70-0x73 */ + 0xE9, 0xB5, 0xEE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xBC, 0xEE, 0xE4, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE9, 0xB8, 0x95, 0xA9, 0xE9, 0xB6, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB9, 0xE9, 0xBA, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xBB, /* 0x9C-0x9F */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE9, 0xBD, 0x00, 0x00, 0x96, 0x8E, 0x8E, 0x4C, /* 0xA8-0xAB */ + 0x00, 0x00, 0x8D, 0xF8, 0x91, 0x4E, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEE, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE9, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE9, 0xC1, 0x00, 0x00, 0xEE, 0xE6, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8C, 0xEF, 0xE9, 0xC0, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC3, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0xC4, 0xE9, 0xC5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE9, 0xC9, 0x00, 0x00, 0x8E, 0x49, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xE2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE9, 0xCA, 0xE9, 0xC7, 0xE9, 0xC6, /* 0xE0-0xE3 */ + 0xE9, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8C, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE9, 0xCE, 0xE9, 0xCD, 0xE9, 0xCC, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x88, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9C[512] = { + 0xEE, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE9, 0xD8, 0x00, 0x00, 0xE9, 0xD4, 0x00, 0x00, /* 0x04-0x07 */ + 0xE9, 0xD5, 0xE9, 0xD1, 0xE9, 0xD7, 0x00, 0x00, /* 0x08-0x0B */ + 0xE9, 0xD3, 0x8A, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x98, 0x6B, 0x00, 0x00, 0xE9, 0xD6, 0xE9, 0xD2, /* 0x10-0x13 */ + 0xE9, 0xD0, 0xE9, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE9, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0xDC, 0xE9, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x95, 0x68, 0xE9, 0xD9, 0x88, 0xF1, /* 0x2C-0x2F */ + 0xE9, 0xDE, 0x00, 0x00, 0xE9, 0xE0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8A, 0x8F, 0xE9, 0xCB, 0x89, 0x56, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE2, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE1, 0xE9, 0xDF, /* 0x44-0x47 */ + 0x92, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x90, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE3, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE9, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE5, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE6, 0x00, 0x00, /* 0x74-0x77 */ + 0xE9, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x92, 0xB9, 0x00, 0x00, 0xE9, 0xE8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x94, 0xB5, 0x00, 0x00, 0xE9, 0xED, /* 0xE8-0xEB */ + 0xE9, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE9, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x96, 0x50, /* 0xF0-0xF3 */ + 0x96, 0xC2, 0x00, 0x00, 0x93, 0xCE, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xEE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xEF, 0x93, 0xBC, /* 0x04-0x07 */ + 0xE9, 0xEC, 0xE9, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE9, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x95, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF4, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF1, 0x00, 0x00, /* 0x24-0x27 */ + 0x8A, 0x9B, 0x00, 0x00, 0xE9, 0xF0, 0x8E, 0xB0, /* 0x28-0x2B */ + 0x89, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x83, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xFA, 0xE9, 0xF9, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE9, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE9, 0xF5, 0x00, 0x00, 0xE9, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xE9, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xEA, 0x44, 0xEA, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xEA, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x89, 0x4C, 0xEA, 0x40, 0xEA, 0x41, 0x00, 0x00, /* 0x5C-0x5F */ + 0x8D, 0x94, 0x96, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEA, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE9, /* 0x68-0x6B */ + 0x96, 0x51, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4A, /* 0x6C-0x6F */ + 0xEE, 0xE8, 0x00, 0x00, 0xEA, 0x46, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4B, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x48, /* 0x84-0x87 */ + 0x00, 0x00, 0xEA, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x7B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4C, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEA, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0x4E, 0x00, 0x00, 0xEA, 0x49, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x53, 0x00, 0x00, 0xEA, 0x54, 0xEA, 0x52, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xEA, 0x51, 0xEA, 0x57, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0x50, 0x00, 0x00, 0xEA, 0x55, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x56, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x59, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEA, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x5B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEA, 0x5C, 0x00, 0x00, 0xEA, 0x5D, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x68, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEA, 0x5A, 0x91, 0xE9, 0x8D, 0xEB, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xEA, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xEE, 0xEB, 0xEA, 0x5F, 0xEA, 0x60, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x61, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xEA, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x8C, 0xB2, 0xEA, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xEA, 0x64, 0x00, 0x00, 0x8E, 0xAD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEA, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xEA, 0x66, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x67, /* 0x88-0x8B */ + 0xEA, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEA, 0x6B, 0xEA, 0x69, 0x98, 0x5B, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x6A, 0x00, 0x00, 0x97, 0xED, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xEA, 0x6C, 0x00, 0x00, 0x97, 0xD9, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEA, 0x6D, 0x94, 0x9E, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0x6E, 0xEA, 0x70, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEA, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x6F, 0x8D, 0x8D, 0x96, 0xCB, 0x96, 0x83, /* 0xB8-0xBB */ + 0x9B, 0xF5, 0x00, 0x00, 0x9F, 0x80, 0x96, 0x9B, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x89, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEA, 0x73, 0x8B, 0x6F, 0xEA, 0x74, 0xEA, 0x75, /* 0xCC-0xCF */ + 0xEA, 0x76, 0xEE, 0xEC, 0x8D, 0x95, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE0, 0xD2, 0x96, 0xD9, 0x00, 0x00, 0x91, 0xE1, /* 0xD8-0xDB */ + 0xEA, 0x78, 0xEA, 0x7A, 0xEA, 0x79, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEA, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEA, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEA, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x7E, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEA, 0x80, 0x00, 0x00, 0xEA, 0x81, 0xEA, 0x82, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEA, 0x83, 0x00, 0x00, 0xEA, 0x84, /* 0xF8-0xFB */ + 0xEA, 0x85, 0xEA, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x87, /* 0x04-0x07 */ + 0xEA, 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x43, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xDB, /* 0x10-0x13 */ + 0x00, 0x00, 0xEA, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0x6C, 0xEA, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xEA, 0x8C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x40, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8D, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8E, 0xE2, 0x56, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD8, 0xE8, 0xEB, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8F, 0x00, 0x00, /* 0x50-0x53 */ + 0xEA, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x92, /* 0x5C-0x5F */ + 0xEA, 0x93, 0xEA, 0x94, 0x97, 0xEE, 0xEA, 0x91, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x95, 0xEA, 0x96, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x98, 0x00, 0x00, /* 0x68-0x6B */ + 0xEA, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9A, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9B, 0xEA, 0x99, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x97, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x9C, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xEA, 0x9D, 0xE2, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEA, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xED, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xEE, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_FA[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x73, 0xED, 0x7E, /* 0x0C-0x0F */ + 0xED, 0x80, 0xED, 0x95, 0xED, 0xBC, 0xED, 0xCC, /* 0x10-0x13 */ + 0xED, 0xCE, 0xED, 0xF9, 0xEE, 0x42, 0xEE, 0x59, /* 0x14-0x17 */ + 0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x65, /* 0x18-0x1B */ + 0xEE, 0x69, 0xEE, 0x6C, 0xEE, 0x75, 0xEE, 0x81, /* 0x1C-0x1F */ + 0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x8D, 0xEE, 0x95, /* 0x20-0x23 */ + 0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x9B, 0xEE, 0xB7, /* 0x24-0x27 */ + 0xEE, 0xBE, 0xEE, 0xCE, 0xEE, 0xDA, 0xEE, 0xDB, /* 0x28-0x2B */ + 0xEE, 0xDD, 0xEE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0x81, 0x49, 0xEE, 0xFC, 0x81, 0x94, /* 0x00-0x03 */ + 0x81, 0x90, 0x81, 0x93, 0x81, 0x95, 0xEE, 0xFB, /* 0x04-0x07 */ + 0x81, 0x69, 0x81, 0x6A, 0x81, 0x96, 0x81, 0x7B, /* 0x08-0x0B */ + 0x81, 0x43, 0x81, 0x7C, 0x81, 0x44, 0x81, 0x5E, /* 0x0C-0x0F */ + 0x82, 0x4F, 0x82, 0x50, 0x82, 0x51, 0x82, 0x52, /* 0x10-0x13 */ + 0x82, 0x53, 0x82, 0x54, 0x82, 0x55, 0x82, 0x56, /* 0x14-0x17 */ + 0x82, 0x57, 0x82, 0x58, 0x81, 0x46, 0x81, 0x47, /* 0x18-0x1B */ + 0x81, 0x83, 0x81, 0x81, 0x81, 0x84, 0x81, 0x48, /* 0x1C-0x1F */ + 0x81, 0x97, 0x82, 0x60, 0x82, 0x61, 0x82, 0x62, /* 0x20-0x23 */ + 0x82, 0x63, 0x82, 0x64, 0x82, 0x65, 0x82, 0x66, /* 0x24-0x27 */ + 0x82, 0x67, 0x82, 0x68, 0x82, 0x69, 0x82, 0x6A, /* 0x28-0x2B */ + 0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0x2C-0x2F */ + 0x82, 0x6F, 0x82, 0x70, 0x82, 0x71, 0x82, 0x72, /* 0x30-0x33 */ + 0x82, 0x73, 0x82, 0x74, 0x82, 0x75, 0x82, 0x76, /* 0x34-0x37 */ + 0x82, 0x77, 0x82, 0x78, 0x82, 0x79, 0x81, 0x6D, /* 0x38-0x3B */ + 0x81, 0x5F, 0x81, 0x6E, 0x81, 0x4F, 0x81, 0x51, /* 0x3C-0x3F */ + 0x81, 0x4D, 0x82, 0x81, 0x82, 0x82, 0x82, 0x83, /* 0x40-0x43 */ + 0x82, 0x84, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x44-0x47 */ + 0x82, 0x88, 0x82, 0x89, 0x82, 0x8A, 0x82, 0x8B, /* 0x48-0x4B */ + 0x82, 0x8C, 0x82, 0x8D, 0x82, 0x8E, 0x82, 0x8F, /* 0x4C-0x4F */ + 0x82, 0x90, 0x82, 0x91, 0x82, 0x92, 0x82, 0x93, /* 0x50-0x53 */ + 0x82, 0x94, 0x82, 0x95, 0x82, 0x96, 0x82, 0x97, /* 0x54-0x57 */ + 0x82, 0x98, 0x82, 0x99, 0x82, 0x9A, 0x81, 0x6F, /* 0x58-0x5B */ + 0x81, 0x62, 0x81, 0x70, 0x81, 0x60, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0xA1, 0x00, 0xA2, 0x00, 0xA3, /* 0x60-0x63 */ + 0x00, 0xA4, 0x00, 0xA5, 0x00, 0xA6, 0x00, 0xA7, /* 0x64-0x67 */ + 0x00, 0xA8, 0x00, 0xA9, 0x00, 0xAA, 0x00, 0xAB, /* 0x68-0x6B */ + 0x00, 0xAC, 0x00, 0xAD, 0x00, 0xAE, 0x00, 0xAF, /* 0x6C-0x6F */ + 0x00, 0xB0, 0x00, 0xB1, 0x00, 0xB2, 0x00, 0xB3, /* 0x70-0x73 */ + 0x00, 0xB4, 0x00, 0xB5, 0x00, 0xB6, 0x00, 0xB7, /* 0x74-0x77 */ + 0x00, 0xB8, 0x00, 0xB9, 0x00, 0xBA, 0x00, 0xBB, /* 0x78-0x7B */ + 0x00, 0xBC, 0x00, 0xBD, 0x00, 0xBE, 0x00, 0xBF, /* 0x7C-0x7F */ + + 0x00, 0xC0, 0x00, 0xC1, 0x00, 0xC2, 0x00, 0xC3, /* 0x80-0x83 */ + 0x00, 0xC4, 0x00, 0xC5, 0x00, 0xC6, 0x00, 0xC7, /* 0x84-0x87 */ + 0x00, 0xC8, 0x00, 0xC9, 0x00, 0xCA, 0x00, 0xCB, /* 0x88-0x8B */ + 0x00, 0xCC, 0x00, 0xCD, 0x00, 0xCE, 0x00, 0xCF, /* 0x8C-0x8F */ + 0x00, 0xD0, 0x00, 0xD1, 0x00, 0xD2, 0x00, 0xD3, /* 0x90-0x93 */ + 0x00, 0xD4, 0x00, 0xD5, 0x00, 0xD6, 0x00, 0xD7, /* 0x94-0x97 */ + 0x00, 0xD8, 0x00, 0xD9, 0x00, 0xDA, 0x00, 0xDB, /* 0x98-0x9B */ + 0x00, 0xDC, 0x00, 0xDD, 0x00, 0xDE, 0x00, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0xCA, 0x81, 0x50, /* 0xE0-0xE3 */ + 0xEE, 0xFA, 0x81, 0x8F, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, NULL, NULL, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, NULL, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, NULL, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (ch == 0xFF && 0x61 <= cl && cl <= 0x9F) { + out[0] = cl + 0x40; + return 1; + } + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen < 2) + return -ENAMETOOLONG; + + out[0] = uni2charset[cl*2]; + out[1] = uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + return 2; + } else if (ch == 0) { + if (cl <= 0x7F) { + out[0] = cl; + return 1; + } else if (0xA0 <= cl) { + out[0] = u2c_00hi[cl - 0xA0][0]; + out[1] = u2c_00hi[cl - 0xA0][1]; + if (out[0] && out[1]) + return 2; + } + return -EINVAL; + } + else + return -EINVAL; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (rawstring[0] <= 0x7F) { + *uni = rawstring[0]; + return 1; + } + if (0xA1 <= rawstring[0] && rawstring[0] <= 0xDF) { + *uni = 0xFF00 | (rawstring[0] - 0x40); + return 1; + } + + if (boundlen < 2) + return -ENAMETOOLONG; + ch = rawstring[0]; + cl = rawstring[1]; + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + return 2; + } + else + return -EINVAL; +} + +static struct nls_table table = { + .charset = "cp932", + .alias = "sjis", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp932(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp932(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp932) +module_exit(exit_nls_cp932) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(sjis); diff --git a/kernel/fs/nls/nls_cp936.c b/kernel/fs/nls/nls_cp936.c new file mode 100644 index 000000000..c96546cfe --- /dev/null +++ b/kernel/fs/nls/nls_cp936.c @@ -0,0 +1,11111 @@ +/* + * linux/fs/nls/nls_cp936.c + * + * Charset cp936 translation tables. + * This translation table was generated automatically, the + * original table can be download from the Microsoft website. + * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include +#include +#include +#include +#include + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E02,0x4E04,0x4E05,0x4E06,0x4E0F,0x4E12,0x4E17,0x4E1F,/* 0x40-0x47 */ + 0x4E20,0x4E21,0x4E23,0x4E26,0x4E29,0x4E2E,0x4E2F,0x4E31,/* 0x48-0x4F */ + 0x4E33,0x4E35,0x4E37,0x4E3C,0x4E40,0x4E41,0x4E42,0x4E44,/* 0x50-0x57 */ + 0x4E46,0x4E4A,0x4E51,0x4E55,0x4E57,0x4E5A,0x4E5B,0x4E62,/* 0x58-0x5F */ + 0x4E63,0x4E64,0x4E65,0x4E67,0x4E68,0x4E6A,0x4E6B,0x4E6C,/* 0x60-0x67 */ + 0x4E6D,0x4E6E,0x4E6F,0x4E72,0x4E74,0x4E75,0x4E76,0x4E77,/* 0x68-0x6F */ + 0x4E78,0x4E79,0x4E7A,0x4E7B,0x4E7C,0x4E7D,0x4E7F,0x4E80,/* 0x70-0x77 */ + 0x4E81,0x4E82,0x4E83,0x4E84,0x4E85,0x4E87,0x4E8A,0x0000,/* 0x78-0x7F */ + + 0x4E90,0x4E96,0x4E97,0x4E99,0x4E9C,0x4E9D,0x4E9E,0x4EA3,/* 0x80-0x87 */ + 0x4EAA,0x4EAF,0x4EB0,0x4EB1,0x4EB4,0x4EB6,0x4EB7,0x4EB8,/* 0x88-0x8F */ + 0x4EB9,0x4EBC,0x4EBD,0x4EBE,0x4EC8,0x4ECC,0x4ECF,0x4ED0,/* 0x90-0x97 */ + 0x4ED2,0x4EDA,0x4EDB,0x4EDC,0x4EE0,0x4EE2,0x4EE6,0x4EE7,/* 0x98-0x9F */ + 0x4EE9,0x4EED,0x4EEE,0x4EEF,0x4EF1,0x4EF4,0x4EF8,0x4EF9,/* 0xA0-0xA7 */ + 0x4EFA,0x4EFC,0x4EFE,0x4F00,0x4F02,0x4F03,0x4F04,0x4F05,/* 0xA8-0xAF */ + 0x4F06,0x4F07,0x4F08,0x4F0B,0x4F0C,0x4F12,0x4F13,0x4F14,/* 0xB0-0xB7 */ + 0x4F15,0x4F16,0x4F1C,0x4F1D,0x4F21,0x4F23,0x4F28,0x4F29,/* 0xB8-0xBF */ + 0x4F2C,0x4F2D,0x4F2E,0x4F31,0x4F33,0x4F35,0x4F37,0x4F39,/* 0xC0-0xC7 */ + 0x4F3B,0x4F3E,0x4F3F,0x4F40,0x4F41,0x4F42,0x4F44,0x4F45,/* 0xC8-0xCF */ + 0x4F47,0x4F48,0x4F49,0x4F4A,0x4F4B,0x4F4C,0x4F52,0x4F54,/* 0xD0-0xD7 */ + 0x4F56,0x4F61,0x4F62,0x4F66,0x4F68,0x4F6A,0x4F6B,0x4F6D,/* 0xD8-0xDF */ + 0x4F6E,0x4F71,0x4F72,0x4F75,0x4F77,0x4F78,0x4F79,0x4F7A,/* 0xE0-0xE7 */ + 0x4F7D,0x4F80,0x4F81,0x4F82,0x4F85,0x4F86,0x4F87,0x4F8A,/* 0xE8-0xEF */ + 0x4F8C,0x4F8E,0x4F90,0x4F92,0x4F93,0x4F95,0x4F96,0x4F98,/* 0xF0-0xF7 */ + 0x4F99,0x4F9A,0x4F9C,0x4F9E,0x4F9F,0x4FA1,0x4FA2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4FA4,0x4FAB,0x4FAD,0x4FB0,0x4FB1,0x4FB2,0x4FB3,0x4FB4,/* 0x40-0x47 */ + 0x4FB6,0x4FB7,0x4FB8,0x4FB9,0x4FBA,0x4FBB,0x4FBC,0x4FBD,/* 0x48-0x4F */ + 0x4FBE,0x4FC0,0x4FC1,0x4FC2,0x4FC6,0x4FC7,0x4FC8,0x4FC9,/* 0x50-0x57 */ + 0x4FCB,0x4FCC,0x4FCD,0x4FD2,0x4FD3,0x4FD4,0x4FD5,0x4FD6,/* 0x58-0x5F */ + 0x4FD9,0x4FDB,0x4FE0,0x4FE2,0x4FE4,0x4FE5,0x4FE7,0x4FEB,/* 0x60-0x67 */ + 0x4FEC,0x4FF0,0x4FF2,0x4FF4,0x4FF5,0x4FF6,0x4FF7,0x4FF9,/* 0x68-0x6F */ + 0x4FFB,0x4FFC,0x4FFD,0x4FFF,0x5000,0x5001,0x5002,0x5003,/* 0x70-0x77 */ + 0x5004,0x5005,0x5006,0x5007,0x5008,0x5009,0x500A,0x0000,/* 0x78-0x7F */ + + 0x500B,0x500E,0x5010,0x5011,0x5013,0x5015,0x5016,0x5017,/* 0x80-0x87 */ + 0x501B,0x501D,0x501E,0x5020,0x5022,0x5023,0x5024,0x5027,/* 0x88-0x8F */ + 0x502B,0x502F,0x5030,0x5031,0x5032,0x5033,0x5034,0x5035,/* 0x90-0x97 */ + 0x5036,0x5037,0x5038,0x5039,0x503B,0x503D,0x503F,0x5040,/* 0x98-0x9F */ + 0x5041,0x5042,0x5044,0x5045,0x5046,0x5049,0x504A,0x504B,/* 0xA0-0xA7 */ + 0x504D,0x5050,0x5051,0x5052,0x5053,0x5054,0x5056,0x5057,/* 0xA8-0xAF */ + 0x5058,0x5059,0x505B,0x505D,0x505E,0x505F,0x5060,0x5061,/* 0xB0-0xB7 */ + 0x5062,0x5063,0x5064,0x5066,0x5067,0x5068,0x5069,0x506A,/* 0xB8-0xBF */ + 0x506B,0x506D,0x506E,0x506F,0x5070,0x5071,0x5072,0x5073,/* 0xC0-0xC7 */ + 0x5074,0x5075,0x5078,0x5079,0x507A,0x507C,0x507D,0x5081,/* 0xC8-0xCF */ + 0x5082,0x5083,0x5084,0x5086,0x5087,0x5089,0x508A,0x508B,/* 0xD0-0xD7 */ + 0x508C,0x508E,0x508F,0x5090,0x5091,0x5092,0x5093,0x5094,/* 0xD8-0xDF */ + 0x5095,0x5096,0x5097,0x5098,0x5099,0x509A,0x509B,0x509C,/* 0xE0-0xE7 */ + 0x509D,0x509E,0x509F,0x50A0,0x50A1,0x50A2,0x50A4,0x50A6,/* 0xE8-0xEF */ + 0x50AA,0x50AB,0x50AD,0x50AE,0x50AF,0x50B0,0x50B1,0x50B3,/* 0xF0-0xF7 */ + 0x50B4,0x50B5,0x50B6,0x50B7,0x50B8,0x50B9,0x50BC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x50BD,0x50BE,0x50BF,0x50C0,0x50C1,0x50C2,0x50C3,0x50C4,/* 0x40-0x47 */ + 0x50C5,0x50C6,0x50C7,0x50C8,0x50C9,0x50CA,0x50CB,0x50CC,/* 0x48-0x4F */ + 0x50CD,0x50CE,0x50D0,0x50D1,0x50D2,0x50D3,0x50D4,0x50D5,/* 0x50-0x57 */ + 0x50D7,0x50D8,0x50D9,0x50DB,0x50DC,0x50DD,0x50DE,0x50DF,/* 0x58-0x5F */ + 0x50E0,0x50E1,0x50E2,0x50E3,0x50E4,0x50E5,0x50E8,0x50E9,/* 0x60-0x67 */ + 0x50EA,0x50EB,0x50EF,0x50F0,0x50F1,0x50F2,0x50F4,0x50F6,/* 0x68-0x6F */ + 0x50F7,0x50F8,0x50F9,0x50FA,0x50FC,0x50FD,0x50FE,0x50FF,/* 0x70-0x77 */ + 0x5100,0x5101,0x5102,0x5103,0x5104,0x5105,0x5108,0x0000,/* 0x78-0x7F */ + + 0x5109,0x510A,0x510C,0x510D,0x510E,0x510F,0x5110,0x5111,/* 0x80-0x87 */ + 0x5113,0x5114,0x5115,0x5116,0x5117,0x5118,0x5119,0x511A,/* 0x88-0x8F */ + 0x511B,0x511C,0x511D,0x511E,0x511F,0x5120,0x5122,0x5123,/* 0x90-0x97 */ + 0x5124,0x5125,0x5126,0x5127,0x5128,0x5129,0x512A,0x512B,/* 0x98-0x9F */ + 0x512C,0x512D,0x512E,0x512F,0x5130,0x5131,0x5132,0x5133,/* 0xA0-0xA7 */ + 0x5134,0x5135,0x5136,0x5137,0x5138,0x5139,0x513A,0x513B,/* 0xA8-0xAF */ + 0x513C,0x513D,0x513E,0x5142,0x5147,0x514A,0x514C,0x514E,/* 0xB0-0xB7 */ + 0x514F,0x5150,0x5152,0x5153,0x5157,0x5158,0x5159,0x515B,/* 0xB8-0xBF */ + 0x515D,0x515E,0x515F,0x5160,0x5161,0x5163,0x5164,0x5166,/* 0xC0-0xC7 */ + 0x5167,0x5169,0x516A,0x516F,0x5172,0x517A,0x517E,0x517F,/* 0xC8-0xCF */ + 0x5183,0x5184,0x5186,0x5187,0x518A,0x518B,0x518E,0x518F,/* 0xD0-0xD7 */ + 0x5190,0x5191,0x5193,0x5194,0x5198,0x519A,0x519D,0x519E,/* 0xD8-0xDF */ + 0x519F,0x51A1,0x51A3,0x51A6,0x51A7,0x51A8,0x51A9,0x51AA,/* 0xE0-0xE7 */ + 0x51AD,0x51AE,0x51B4,0x51B8,0x51B9,0x51BA,0x51BE,0x51BF,/* 0xE8-0xEF */ + 0x51C1,0x51C2,0x51C3,0x51C5,0x51C8,0x51CA,0x51CD,0x51CE,/* 0xF0-0xF7 */ + 0x51D0,0x51D2,0x51D3,0x51D4,0x51D5,0x51D6,0x51D7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x51D8,0x51D9,0x51DA,0x51DC,0x51DE,0x51DF,0x51E2,0x51E3,/* 0x40-0x47 */ + 0x51E5,0x51E6,0x51E7,0x51E8,0x51E9,0x51EA,0x51EC,0x51EE,/* 0x48-0x4F */ + 0x51F1,0x51F2,0x51F4,0x51F7,0x51FE,0x5204,0x5205,0x5209,/* 0x50-0x57 */ + 0x520B,0x520C,0x520F,0x5210,0x5213,0x5214,0x5215,0x521C,/* 0x58-0x5F */ + 0x521E,0x521F,0x5221,0x5222,0x5223,0x5225,0x5226,0x5227,/* 0x60-0x67 */ + 0x522A,0x522C,0x522F,0x5231,0x5232,0x5234,0x5235,0x523C,/* 0x68-0x6F */ + 0x523E,0x5244,0x5245,0x5246,0x5247,0x5248,0x5249,0x524B,/* 0x70-0x77 */ + 0x524E,0x524F,0x5252,0x5253,0x5255,0x5257,0x5258,0x0000,/* 0x78-0x7F */ + + 0x5259,0x525A,0x525B,0x525D,0x525F,0x5260,0x5262,0x5263,/* 0x80-0x87 */ + 0x5264,0x5266,0x5268,0x526B,0x526C,0x526D,0x526E,0x5270,/* 0x88-0x8F */ + 0x5271,0x5273,0x5274,0x5275,0x5276,0x5277,0x5278,0x5279,/* 0x90-0x97 */ + 0x527A,0x527B,0x527C,0x527E,0x5280,0x5283,0x5284,0x5285,/* 0x98-0x9F */ + 0x5286,0x5287,0x5289,0x528A,0x528B,0x528C,0x528D,0x528E,/* 0xA0-0xA7 */ + 0x528F,0x5291,0x5292,0x5294,0x5295,0x5296,0x5297,0x5298,/* 0xA8-0xAF */ + 0x5299,0x529A,0x529C,0x52A4,0x52A5,0x52A6,0x52A7,0x52AE,/* 0xB0-0xB7 */ + 0x52AF,0x52B0,0x52B4,0x52B5,0x52B6,0x52B7,0x52B8,0x52B9,/* 0xB8-0xBF */ + 0x52BA,0x52BB,0x52BC,0x52BD,0x52C0,0x52C1,0x52C2,0x52C4,/* 0xC0-0xC7 */ + 0x52C5,0x52C6,0x52C8,0x52CA,0x52CC,0x52CD,0x52CE,0x52CF,/* 0xC8-0xCF */ + 0x52D1,0x52D3,0x52D4,0x52D5,0x52D7,0x52D9,0x52DA,0x52DB,/* 0xD0-0xD7 */ + 0x52DC,0x52DD,0x52DE,0x52E0,0x52E1,0x52E2,0x52E3,0x52E5,/* 0xD8-0xDF */ + 0x52E6,0x52E7,0x52E8,0x52E9,0x52EA,0x52EB,0x52EC,0x52ED,/* 0xE0-0xE7 */ + 0x52EE,0x52EF,0x52F1,0x52F2,0x52F3,0x52F4,0x52F5,0x52F6,/* 0xE8-0xEF */ + 0x52F7,0x52F8,0x52FB,0x52FC,0x52FD,0x5301,0x5302,0x5303,/* 0xF0-0xF7 */ + 0x5304,0x5307,0x5309,0x530A,0x530B,0x530C,0x530E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_85[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5311,0x5312,0x5313,0x5314,0x5318,0x531B,0x531C,0x531E,/* 0x40-0x47 */ + 0x531F,0x5322,0x5324,0x5325,0x5327,0x5328,0x5329,0x532B,/* 0x48-0x4F */ + 0x532C,0x532D,0x532F,0x5330,0x5331,0x5332,0x5333,0x5334,/* 0x50-0x57 */ + 0x5335,0x5336,0x5337,0x5338,0x533C,0x533D,0x5340,0x5342,/* 0x58-0x5F */ + 0x5344,0x5346,0x534B,0x534C,0x534D,0x5350,0x5354,0x5358,/* 0x60-0x67 */ + 0x5359,0x535B,0x535D,0x5365,0x5368,0x536A,0x536C,0x536D,/* 0x68-0x6F */ + 0x5372,0x5376,0x5379,0x537B,0x537C,0x537D,0x537E,0x5380,/* 0x70-0x77 */ + 0x5381,0x5383,0x5387,0x5388,0x538A,0x538E,0x538F,0x0000,/* 0x78-0x7F */ + + 0x5390,0x5391,0x5392,0x5393,0x5394,0x5396,0x5397,0x5399,/* 0x80-0x87 */ + 0x539B,0x539C,0x539E,0x53A0,0x53A1,0x53A4,0x53A7,0x53AA,/* 0x88-0x8F */ + 0x53AB,0x53AC,0x53AD,0x53AF,0x53B0,0x53B1,0x53B2,0x53B3,/* 0x90-0x97 */ + 0x53B4,0x53B5,0x53B7,0x53B8,0x53B9,0x53BA,0x53BC,0x53BD,/* 0x98-0x9F */ + 0x53BE,0x53C0,0x53C3,0x53C4,0x53C5,0x53C6,0x53C7,0x53CE,/* 0xA0-0xA7 */ + 0x53CF,0x53D0,0x53D2,0x53D3,0x53D5,0x53DA,0x53DC,0x53DD,/* 0xA8-0xAF */ + 0x53DE,0x53E1,0x53E2,0x53E7,0x53F4,0x53FA,0x53FE,0x53FF,/* 0xB0-0xB7 */ + 0x5400,0x5402,0x5405,0x5407,0x540B,0x5414,0x5418,0x5419,/* 0xB8-0xBF */ + 0x541A,0x541C,0x5422,0x5424,0x5425,0x542A,0x5430,0x5433,/* 0xC0-0xC7 */ + 0x5436,0x5437,0x543A,0x543D,0x543F,0x5441,0x5442,0x5444,/* 0xC8-0xCF */ + 0x5445,0x5447,0x5449,0x544C,0x544D,0x544E,0x544F,0x5451,/* 0xD0-0xD7 */ + 0x545A,0x545D,0x545E,0x545F,0x5460,0x5461,0x5463,0x5465,/* 0xD8-0xDF */ + 0x5467,0x5469,0x546A,0x546B,0x546C,0x546D,0x546E,0x546F,/* 0xE0-0xE7 */ + 0x5470,0x5474,0x5479,0x547A,0x547E,0x547F,0x5481,0x5483,/* 0xE8-0xEF */ + 0x5485,0x5487,0x5488,0x5489,0x548A,0x548D,0x5491,0x5493,/* 0xF0-0xF7 */ + 0x5497,0x5498,0x549C,0x549E,0x549F,0x54A0,0x54A1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_86[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54A2,0x54A5,0x54AE,0x54B0,0x54B2,0x54B5,0x54B6,0x54B7,/* 0x40-0x47 */ + 0x54B9,0x54BA,0x54BC,0x54BE,0x54C3,0x54C5,0x54CA,0x54CB,/* 0x48-0x4F */ + 0x54D6,0x54D8,0x54DB,0x54E0,0x54E1,0x54E2,0x54E3,0x54E4,/* 0x50-0x57 */ + 0x54EB,0x54EC,0x54EF,0x54F0,0x54F1,0x54F4,0x54F5,0x54F6,/* 0x58-0x5F */ + 0x54F7,0x54F8,0x54F9,0x54FB,0x54FE,0x5500,0x5502,0x5503,/* 0x60-0x67 */ + 0x5504,0x5505,0x5508,0x550A,0x550B,0x550C,0x550D,0x550E,/* 0x68-0x6F */ + 0x5512,0x5513,0x5515,0x5516,0x5517,0x5518,0x5519,0x551A,/* 0x70-0x77 */ + 0x551C,0x551D,0x551E,0x551F,0x5521,0x5525,0x5526,0x0000,/* 0x78-0x7F */ + + 0x5528,0x5529,0x552B,0x552D,0x5532,0x5534,0x5535,0x5536,/* 0x80-0x87 */ + 0x5538,0x5539,0x553A,0x553B,0x553D,0x5540,0x5542,0x5545,/* 0x88-0x8F */ + 0x5547,0x5548,0x554B,0x554C,0x554D,0x554E,0x554F,0x5551,/* 0x90-0x97 */ + 0x5552,0x5553,0x5554,0x5557,0x5558,0x5559,0x555A,0x555B,/* 0x98-0x9F */ + 0x555D,0x555E,0x555F,0x5560,0x5562,0x5563,0x5568,0x5569,/* 0xA0-0xA7 */ + 0x556B,0x556F,0x5570,0x5571,0x5572,0x5573,0x5574,0x5579,/* 0xA8-0xAF */ + 0x557A,0x557D,0x557F,0x5585,0x5586,0x558C,0x558D,0x558E,/* 0xB0-0xB7 */ + 0x5590,0x5592,0x5593,0x5595,0x5596,0x5597,0x559A,0x559B,/* 0xB8-0xBF */ + 0x559E,0x55A0,0x55A1,0x55A2,0x55A3,0x55A4,0x55A5,0x55A6,/* 0xC0-0xC7 */ + 0x55A8,0x55A9,0x55AA,0x55AB,0x55AC,0x55AD,0x55AE,0x55AF,/* 0xC8-0xCF */ + 0x55B0,0x55B2,0x55B4,0x55B6,0x55B8,0x55BA,0x55BC,0x55BF,/* 0xD0-0xD7 */ + 0x55C0,0x55C1,0x55C2,0x55C3,0x55C6,0x55C7,0x55C8,0x55CA,/* 0xD8-0xDF */ + 0x55CB,0x55CE,0x55CF,0x55D0,0x55D5,0x55D7,0x55D8,0x55D9,/* 0xE0-0xE7 */ + 0x55DA,0x55DB,0x55DE,0x55E0,0x55E2,0x55E7,0x55E9,0x55ED,/* 0xE8-0xEF */ + 0x55EE,0x55F0,0x55F1,0x55F4,0x55F6,0x55F8,0x55F9,0x55FA,/* 0xF0-0xF7 */ + 0x55FB,0x55FC,0x55FF,0x5602,0x5603,0x5604,0x5605,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5606,0x5607,0x560A,0x560B,0x560D,0x5610,0x5611,0x5612,/* 0x40-0x47 */ + 0x5613,0x5614,0x5615,0x5616,0x5617,0x5619,0x561A,0x561C,/* 0x48-0x4F */ + 0x561D,0x5620,0x5621,0x5622,0x5625,0x5626,0x5628,0x5629,/* 0x50-0x57 */ + 0x562A,0x562B,0x562E,0x562F,0x5630,0x5633,0x5635,0x5637,/* 0x58-0x5F */ + 0x5638,0x563A,0x563C,0x563D,0x563E,0x5640,0x5641,0x5642,/* 0x60-0x67 */ + 0x5643,0x5644,0x5645,0x5646,0x5647,0x5648,0x5649,0x564A,/* 0x68-0x6F */ + 0x564B,0x564F,0x5650,0x5651,0x5652,0x5653,0x5655,0x5656,/* 0x70-0x77 */ + 0x565A,0x565B,0x565D,0x565E,0x565F,0x5660,0x5661,0x0000,/* 0x78-0x7F */ + + 0x5663,0x5665,0x5666,0x5667,0x566D,0x566E,0x566F,0x5670,/* 0x80-0x87 */ + 0x5672,0x5673,0x5674,0x5675,0x5677,0x5678,0x5679,0x567A,/* 0x88-0x8F */ + 0x567D,0x567E,0x567F,0x5680,0x5681,0x5682,0x5683,0x5684,/* 0x90-0x97 */ + 0x5687,0x5688,0x5689,0x568A,0x568B,0x568C,0x568D,0x5690,/* 0x98-0x9F */ + 0x5691,0x5692,0x5694,0x5695,0x5696,0x5697,0x5698,0x5699,/* 0xA0-0xA7 */ + 0x569A,0x569B,0x569C,0x569D,0x569E,0x569F,0x56A0,0x56A1,/* 0xA8-0xAF */ + 0x56A2,0x56A4,0x56A5,0x56A6,0x56A7,0x56A8,0x56A9,0x56AA,/* 0xB0-0xB7 */ + 0x56AB,0x56AC,0x56AD,0x56AE,0x56B0,0x56B1,0x56B2,0x56B3,/* 0xB8-0xBF */ + 0x56B4,0x56B5,0x56B6,0x56B8,0x56B9,0x56BA,0x56BB,0x56BD,/* 0xC0-0xC7 */ + 0x56BE,0x56BF,0x56C0,0x56C1,0x56C2,0x56C3,0x56C4,0x56C5,/* 0xC8-0xCF */ + 0x56C6,0x56C7,0x56C8,0x56C9,0x56CB,0x56CC,0x56CD,0x56CE,/* 0xD0-0xD7 */ + 0x56CF,0x56D0,0x56D1,0x56D2,0x56D3,0x56D5,0x56D6,0x56D8,/* 0xD8-0xDF */ + 0x56D9,0x56DC,0x56E3,0x56E5,0x56E6,0x56E7,0x56E8,0x56E9,/* 0xE0-0xE7 */ + 0x56EA,0x56EC,0x56EE,0x56EF,0x56F2,0x56F3,0x56F6,0x56F7,/* 0xE8-0xEF */ + 0x56F8,0x56FB,0x56FC,0x5700,0x5701,0x5702,0x5705,0x5707,/* 0xF0-0xF7 */ + 0x570B,0x570C,0x570D,0x570E,0x570F,0x5710,0x5711,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5712,0x5713,0x5714,0x5715,0x5716,0x5717,0x5718,0x5719,/* 0x40-0x47 */ + 0x571A,0x571B,0x571D,0x571E,0x5720,0x5721,0x5722,0x5724,/* 0x48-0x4F */ + 0x5725,0x5726,0x5727,0x572B,0x5731,0x5732,0x5734,0x5735,/* 0x50-0x57 */ + 0x5736,0x5737,0x5738,0x573C,0x573D,0x573F,0x5741,0x5743,/* 0x58-0x5F */ + 0x5744,0x5745,0x5746,0x5748,0x5749,0x574B,0x5752,0x5753,/* 0x60-0x67 */ + 0x5754,0x5755,0x5756,0x5758,0x5759,0x5762,0x5763,0x5765,/* 0x68-0x6F */ + 0x5767,0x576C,0x576E,0x5770,0x5771,0x5772,0x5774,0x5775,/* 0x70-0x77 */ + 0x5778,0x5779,0x577A,0x577D,0x577E,0x577F,0x5780,0x0000,/* 0x78-0x7F */ + + 0x5781,0x5787,0x5788,0x5789,0x578A,0x578D,0x578E,0x578F,/* 0x80-0x87 */ + 0x5790,0x5791,0x5794,0x5795,0x5796,0x5797,0x5798,0x5799,/* 0x88-0x8F */ + 0x579A,0x579C,0x579D,0x579E,0x579F,0x57A5,0x57A8,0x57AA,/* 0x90-0x97 */ + 0x57AC,0x57AF,0x57B0,0x57B1,0x57B3,0x57B5,0x57B6,0x57B7,/* 0x98-0x9F */ + 0x57B9,0x57BA,0x57BB,0x57BC,0x57BD,0x57BE,0x57BF,0x57C0,/* 0xA0-0xA7 */ + 0x57C1,0x57C4,0x57C5,0x57C6,0x57C7,0x57C8,0x57C9,0x57CA,/* 0xA8-0xAF */ + 0x57CC,0x57CD,0x57D0,0x57D1,0x57D3,0x57D6,0x57D7,0x57DB,/* 0xB0-0xB7 */ + 0x57DC,0x57DE,0x57E1,0x57E2,0x57E3,0x57E5,0x57E6,0x57E7,/* 0xB8-0xBF */ + 0x57E8,0x57E9,0x57EA,0x57EB,0x57EC,0x57EE,0x57F0,0x57F1,/* 0xC0-0xC7 */ + 0x57F2,0x57F3,0x57F5,0x57F6,0x57F7,0x57FB,0x57FC,0x57FE,/* 0xC8-0xCF */ + 0x57FF,0x5801,0x5803,0x5804,0x5805,0x5808,0x5809,0x580A,/* 0xD0-0xD7 */ + 0x580C,0x580E,0x580F,0x5810,0x5812,0x5813,0x5814,0x5816,/* 0xD8-0xDF */ + 0x5817,0x5818,0x581A,0x581B,0x581C,0x581D,0x581F,0x5822,/* 0xE0-0xE7 */ + 0x5823,0x5825,0x5826,0x5827,0x5828,0x5829,0x582B,0x582C,/* 0xE8-0xEF */ + 0x582D,0x582E,0x582F,0x5831,0x5832,0x5833,0x5834,0x5836,/* 0xF0-0xF7 */ + 0x5837,0x5838,0x5839,0x583A,0x583B,0x583C,0x583D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x583E,0x583F,0x5840,0x5841,0x5842,0x5843,0x5845,0x5846,/* 0x40-0x47 */ + 0x5847,0x5848,0x5849,0x584A,0x584B,0x584E,0x584F,0x5850,/* 0x48-0x4F */ + 0x5852,0x5853,0x5855,0x5856,0x5857,0x5859,0x585A,0x585B,/* 0x50-0x57 */ + 0x585C,0x585D,0x585F,0x5860,0x5861,0x5862,0x5863,0x5864,/* 0x58-0x5F */ + 0x5866,0x5867,0x5868,0x5869,0x586A,0x586D,0x586E,0x586F,/* 0x60-0x67 */ + 0x5870,0x5871,0x5872,0x5873,0x5874,0x5875,0x5876,0x5877,/* 0x68-0x6F */ + 0x5878,0x5879,0x587A,0x587B,0x587C,0x587D,0x587F,0x5882,/* 0x70-0x77 */ + 0x5884,0x5886,0x5887,0x5888,0x588A,0x588B,0x588C,0x0000,/* 0x78-0x7F */ + + 0x588D,0x588E,0x588F,0x5890,0x5891,0x5894,0x5895,0x5896,/* 0x80-0x87 */ + 0x5897,0x5898,0x589B,0x589C,0x589D,0x58A0,0x58A1,0x58A2,/* 0x88-0x8F */ + 0x58A3,0x58A4,0x58A5,0x58A6,0x58A7,0x58AA,0x58AB,0x58AC,/* 0x90-0x97 */ + 0x58AD,0x58AE,0x58AF,0x58B0,0x58B1,0x58B2,0x58B3,0x58B4,/* 0x98-0x9F */ + 0x58B5,0x58B6,0x58B7,0x58B8,0x58B9,0x58BA,0x58BB,0x58BD,/* 0xA0-0xA7 */ + 0x58BE,0x58BF,0x58C0,0x58C2,0x58C3,0x58C4,0x58C6,0x58C7,/* 0xA8-0xAF */ + 0x58C8,0x58C9,0x58CA,0x58CB,0x58CC,0x58CD,0x58CE,0x58CF,/* 0xB0-0xB7 */ + 0x58D0,0x58D2,0x58D3,0x58D4,0x58D6,0x58D7,0x58D8,0x58D9,/* 0xB8-0xBF */ + 0x58DA,0x58DB,0x58DC,0x58DD,0x58DE,0x58DF,0x58E0,0x58E1,/* 0xC0-0xC7 */ + 0x58E2,0x58E3,0x58E5,0x58E6,0x58E7,0x58E8,0x58E9,0x58EA,/* 0xC8-0xCF */ + 0x58ED,0x58EF,0x58F1,0x58F2,0x58F4,0x58F5,0x58F7,0x58F8,/* 0xD0-0xD7 */ + 0x58FA,0x58FB,0x58FC,0x58FD,0x58FE,0x58FF,0x5900,0x5901,/* 0xD8-0xDF */ + 0x5903,0x5905,0x5906,0x5908,0x5909,0x590A,0x590B,0x590C,/* 0xE0-0xE7 */ + 0x590E,0x5910,0x5911,0x5912,0x5913,0x5917,0x5918,0x591B,/* 0xE8-0xEF */ + 0x591D,0x591E,0x5920,0x5921,0x5922,0x5923,0x5926,0x5928,/* 0xF0-0xF7 */ + 0x592C,0x5930,0x5932,0x5933,0x5935,0x5936,0x593B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x593D,0x593E,0x593F,0x5940,0x5943,0x5945,0x5946,0x594A,/* 0x40-0x47 */ + 0x594C,0x594D,0x5950,0x5952,0x5953,0x5959,0x595B,0x595C,/* 0x48-0x4F */ + 0x595D,0x595E,0x595F,0x5961,0x5963,0x5964,0x5966,0x5967,/* 0x50-0x57 */ + 0x5968,0x5969,0x596A,0x596B,0x596C,0x596D,0x596E,0x596F,/* 0x58-0x5F */ + 0x5970,0x5971,0x5972,0x5975,0x5977,0x597A,0x597B,0x597C,/* 0x60-0x67 */ + 0x597E,0x597F,0x5980,0x5985,0x5989,0x598B,0x598C,0x598E,/* 0x68-0x6F */ + 0x598F,0x5990,0x5991,0x5994,0x5995,0x5998,0x599A,0x599B,/* 0x70-0x77 */ + 0x599C,0x599D,0x599F,0x59A0,0x59A1,0x59A2,0x59A6,0x0000,/* 0x78-0x7F */ + + 0x59A7,0x59AC,0x59AD,0x59B0,0x59B1,0x59B3,0x59B4,0x59B5,/* 0x80-0x87 */ + 0x59B6,0x59B7,0x59B8,0x59BA,0x59BC,0x59BD,0x59BF,0x59C0,/* 0x88-0x8F */ + 0x59C1,0x59C2,0x59C3,0x59C4,0x59C5,0x59C7,0x59C8,0x59C9,/* 0x90-0x97 */ + 0x59CC,0x59CD,0x59CE,0x59CF,0x59D5,0x59D6,0x59D9,0x59DB,/* 0x98-0x9F */ + 0x59DE,0x59DF,0x59E0,0x59E1,0x59E2,0x59E4,0x59E6,0x59E7,/* 0xA0-0xA7 */ + 0x59E9,0x59EA,0x59EB,0x59ED,0x59EE,0x59EF,0x59F0,0x59F1,/* 0xA8-0xAF */ + 0x59F2,0x59F3,0x59F4,0x59F5,0x59F6,0x59F7,0x59F8,0x59FA,/* 0xB0-0xB7 */ + 0x59FC,0x59FD,0x59FE,0x5A00,0x5A02,0x5A0A,0x5A0B,0x5A0D,/* 0xB8-0xBF */ + 0x5A0E,0x5A0F,0x5A10,0x5A12,0x5A14,0x5A15,0x5A16,0x5A17,/* 0xC0-0xC7 */ + 0x5A19,0x5A1A,0x5A1B,0x5A1D,0x5A1E,0x5A21,0x5A22,0x5A24,/* 0xC8-0xCF */ + 0x5A26,0x5A27,0x5A28,0x5A2A,0x5A2B,0x5A2C,0x5A2D,0x5A2E,/* 0xD0-0xD7 */ + 0x5A2F,0x5A30,0x5A33,0x5A35,0x5A37,0x5A38,0x5A39,0x5A3A,/* 0xD8-0xDF */ + 0x5A3B,0x5A3D,0x5A3E,0x5A3F,0x5A41,0x5A42,0x5A43,0x5A44,/* 0xE0-0xE7 */ + 0x5A45,0x5A47,0x5A48,0x5A4B,0x5A4C,0x5A4D,0x5A4E,0x5A4F,/* 0xE8-0xEF */ + 0x5A50,0x5A51,0x5A52,0x5A53,0x5A54,0x5A56,0x5A57,0x5A58,/* 0xF0-0xF7 */ + 0x5A59,0x5A5B,0x5A5C,0x5A5D,0x5A5E,0x5A5F,0x5A60,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A61,0x5A63,0x5A64,0x5A65,0x5A66,0x5A68,0x5A69,0x5A6B,/* 0x40-0x47 */ + 0x5A6C,0x5A6D,0x5A6E,0x5A6F,0x5A70,0x5A71,0x5A72,0x5A73,/* 0x48-0x4F */ + 0x5A78,0x5A79,0x5A7B,0x5A7C,0x5A7D,0x5A7E,0x5A80,0x5A81,/* 0x50-0x57 */ + 0x5A82,0x5A83,0x5A84,0x5A85,0x5A86,0x5A87,0x5A88,0x5A89,/* 0x58-0x5F */ + 0x5A8A,0x5A8B,0x5A8C,0x5A8D,0x5A8E,0x5A8F,0x5A90,0x5A91,/* 0x60-0x67 */ + 0x5A93,0x5A94,0x5A95,0x5A96,0x5A97,0x5A98,0x5A99,0x5A9C,/* 0x68-0x6F */ + 0x5A9D,0x5A9E,0x5A9F,0x5AA0,0x5AA1,0x5AA2,0x5AA3,0x5AA4,/* 0x70-0x77 */ + 0x5AA5,0x5AA6,0x5AA7,0x5AA8,0x5AA9,0x5AAB,0x5AAC,0x0000,/* 0x78-0x7F */ + + 0x5AAD,0x5AAE,0x5AAF,0x5AB0,0x5AB1,0x5AB4,0x5AB6,0x5AB7,/* 0x80-0x87 */ + 0x5AB9,0x5ABA,0x5ABB,0x5ABC,0x5ABD,0x5ABF,0x5AC0,0x5AC3,/* 0x88-0x8F */ + 0x5AC4,0x5AC5,0x5AC6,0x5AC7,0x5AC8,0x5ACA,0x5ACB,0x5ACD,/* 0x90-0x97 */ + 0x5ACE,0x5ACF,0x5AD0,0x5AD1,0x5AD3,0x5AD5,0x5AD7,0x5AD9,/* 0x98-0x9F */ + 0x5ADA,0x5ADB,0x5ADD,0x5ADE,0x5ADF,0x5AE2,0x5AE4,0x5AE5,/* 0xA0-0xA7 */ + 0x5AE7,0x5AE8,0x5AEA,0x5AEC,0x5AED,0x5AEE,0x5AEF,0x5AF0,/* 0xA8-0xAF */ + 0x5AF2,0x5AF3,0x5AF4,0x5AF5,0x5AF6,0x5AF7,0x5AF8,0x5AF9,/* 0xB0-0xB7 */ + 0x5AFA,0x5AFB,0x5AFC,0x5AFD,0x5AFE,0x5AFF,0x5B00,0x5B01,/* 0xB8-0xBF */ + 0x5B02,0x5B03,0x5B04,0x5B05,0x5B06,0x5B07,0x5B08,0x5B0A,/* 0xC0-0xC7 */ + 0x5B0B,0x5B0C,0x5B0D,0x5B0E,0x5B0F,0x5B10,0x5B11,0x5B12,/* 0xC8-0xCF */ + 0x5B13,0x5B14,0x5B15,0x5B18,0x5B19,0x5B1A,0x5B1B,0x5B1C,/* 0xD0-0xD7 */ + 0x5B1D,0x5B1E,0x5B1F,0x5B20,0x5B21,0x5B22,0x5B23,0x5B24,/* 0xD8-0xDF */ + 0x5B25,0x5B26,0x5B27,0x5B28,0x5B29,0x5B2A,0x5B2B,0x5B2C,/* 0xE0-0xE7 */ + 0x5B2D,0x5B2E,0x5B2F,0x5B30,0x5B31,0x5B33,0x5B35,0x5B36,/* 0xE8-0xEF */ + 0x5B38,0x5B39,0x5B3A,0x5B3B,0x5B3C,0x5B3D,0x5B3E,0x5B3F,/* 0xF0-0xF7 */ + 0x5B41,0x5B42,0x5B43,0x5B44,0x5B45,0x5B46,0x5B47,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5B48,0x5B49,0x5B4A,0x5B4B,0x5B4C,0x5B4D,0x5B4E,0x5B4F,/* 0x40-0x47 */ + 0x5B52,0x5B56,0x5B5E,0x5B60,0x5B61,0x5B67,0x5B68,0x5B6B,/* 0x48-0x4F */ + 0x5B6D,0x5B6E,0x5B6F,0x5B72,0x5B74,0x5B76,0x5B77,0x5B78,/* 0x50-0x57 */ + 0x5B79,0x5B7B,0x5B7C,0x5B7E,0x5B7F,0x5B82,0x5B86,0x5B8A,/* 0x58-0x5F */ + 0x5B8D,0x5B8E,0x5B90,0x5B91,0x5B92,0x5B94,0x5B96,0x5B9F,/* 0x60-0x67 */ + 0x5BA7,0x5BA8,0x5BA9,0x5BAC,0x5BAD,0x5BAE,0x5BAF,0x5BB1,/* 0x68-0x6F */ + 0x5BB2,0x5BB7,0x5BBA,0x5BBB,0x5BBC,0x5BC0,0x5BC1,0x5BC3,/* 0x70-0x77 */ + 0x5BC8,0x5BC9,0x5BCA,0x5BCB,0x5BCD,0x5BCE,0x5BCF,0x0000,/* 0x78-0x7F */ + + 0x5BD1,0x5BD4,0x5BD5,0x5BD6,0x5BD7,0x5BD8,0x5BD9,0x5BDA,/* 0x80-0x87 */ + 0x5BDB,0x5BDC,0x5BE0,0x5BE2,0x5BE3,0x5BE6,0x5BE7,0x5BE9,/* 0x88-0x8F */ + 0x5BEA,0x5BEB,0x5BEC,0x5BED,0x5BEF,0x5BF1,0x5BF2,0x5BF3,/* 0x90-0x97 */ + 0x5BF4,0x5BF5,0x5BF6,0x5BF7,0x5BFD,0x5BFE,0x5C00,0x5C02,/* 0x98-0x9F */ + 0x5C03,0x5C05,0x5C07,0x5C08,0x5C0B,0x5C0C,0x5C0D,0x5C0E,/* 0xA0-0xA7 */ + 0x5C10,0x5C12,0x5C13,0x5C17,0x5C19,0x5C1B,0x5C1E,0x5C1F,/* 0xA8-0xAF */ + 0x5C20,0x5C21,0x5C23,0x5C26,0x5C28,0x5C29,0x5C2A,0x5C2B,/* 0xB0-0xB7 */ + 0x5C2D,0x5C2E,0x5C2F,0x5C30,0x5C32,0x5C33,0x5C35,0x5C36,/* 0xB8-0xBF */ + 0x5C37,0x5C43,0x5C44,0x5C46,0x5C47,0x5C4C,0x5C4D,0x5C52,/* 0xC0-0xC7 */ + 0x5C53,0x5C54,0x5C56,0x5C57,0x5C58,0x5C5A,0x5C5B,0x5C5C,/* 0xC8-0xCF */ + 0x5C5D,0x5C5F,0x5C62,0x5C64,0x5C67,0x5C68,0x5C69,0x5C6A,/* 0xD0-0xD7 */ + 0x5C6B,0x5C6C,0x5C6D,0x5C70,0x5C72,0x5C73,0x5C74,0x5C75,/* 0xD8-0xDF */ + 0x5C76,0x5C77,0x5C78,0x5C7B,0x5C7C,0x5C7D,0x5C7E,0x5C80,/* 0xE0-0xE7 */ + 0x5C83,0x5C84,0x5C85,0x5C86,0x5C87,0x5C89,0x5C8A,0x5C8B,/* 0xE8-0xEF */ + 0x5C8E,0x5C8F,0x5C92,0x5C93,0x5C95,0x5C9D,0x5C9E,0x5C9F,/* 0xF0-0xF7 */ + 0x5CA0,0x5CA1,0x5CA4,0x5CA5,0x5CA6,0x5CA7,0x5CA8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5CAA,0x5CAE,0x5CAF,0x5CB0,0x5CB2,0x5CB4,0x5CB6,0x5CB9,/* 0x40-0x47 */ + 0x5CBA,0x5CBB,0x5CBC,0x5CBE,0x5CC0,0x5CC2,0x5CC3,0x5CC5,/* 0x48-0x4F */ + 0x5CC6,0x5CC7,0x5CC8,0x5CC9,0x5CCA,0x5CCC,0x5CCD,0x5CCE,/* 0x50-0x57 */ + 0x5CCF,0x5CD0,0x5CD1,0x5CD3,0x5CD4,0x5CD5,0x5CD6,0x5CD7,/* 0x58-0x5F */ + 0x5CD8,0x5CDA,0x5CDB,0x5CDC,0x5CDD,0x5CDE,0x5CDF,0x5CE0,/* 0x60-0x67 */ + 0x5CE2,0x5CE3,0x5CE7,0x5CE9,0x5CEB,0x5CEC,0x5CEE,0x5CEF,/* 0x68-0x6F */ + 0x5CF1,0x5CF2,0x5CF3,0x5CF4,0x5CF5,0x5CF6,0x5CF7,0x5CF8,/* 0x70-0x77 */ + 0x5CF9,0x5CFA,0x5CFC,0x5CFD,0x5CFE,0x5CFF,0x5D00,0x0000,/* 0x78-0x7F */ + + 0x5D01,0x5D04,0x5D05,0x5D08,0x5D09,0x5D0A,0x5D0B,0x5D0C,/* 0x80-0x87 */ + 0x5D0D,0x5D0F,0x5D10,0x5D11,0x5D12,0x5D13,0x5D15,0x5D17,/* 0x88-0x8F */ + 0x5D18,0x5D19,0x5D1A,0x5D1C,0x5D1D,0x5D1F,0x5D20,0x5D21,/* 0x90-0x97 */ + 0x5D22,0x5D23,0x5D25,0x5D28,0x5D2A,0x5D2B,0x5D2C,0x5D2F,/* 0x98-0x9F */ + 0x5D30,0x5D31,0x5D32,0x5D33,0x5D35,0x5D36,0x5D37,0x5D38,/* 0xA0-0xA7 */ + 0x5D39,0x5D3A,0x5D3B,0x5D3C,0x5D3F,0x5D40,0x5D41,0x5D42,/* 0xA8-0xAF */ + 0x5D43,0x5D44,0x5D45,0x5D46,0x5D48,0x5D49,0x5D4D,0x5D4E,/* 0xB0-0xB7 */ + 0x5D4F,0x5D50,0x5D51,0x5D52,0x5D53,0x5D54,0x5D55,0x5D56,/* 0xB8-0xBF */ + 0x5D57,0x5D59,0x5D5A,0x5D5C,0x5D5E,0x5D5F,0x5D60,0x5D61,/* 0xC0-0xC7 */ + 0x5D62,0x5D63,0x5D64,0x5D65,0x5D66,0x5D67,0x5D68,0x5D6A,/* 0xC8-0xCF */ + 0x5D6D,0x5D6E,0x5D70,0x5D71,0x5D72,0x5D73,0x5D75,0x5D76,/* 0xD0-0xD7 */ + 0x5D77,0x5D78,0x5D79,0x5D7A,0x5D7B,0x5D7C,0x5D7D,0x5D7E,/* 0xD8-0xDF */ + 0x5D7F,0x5D80,0x5D81,0x5D83,0x5D84,0x5D85,0x5D86,0x5D87,/* 0xE0-0xE7 */ + 0x5D88,0x5D89,0x5D8A,0x5D8B,0x5D8C,0x5D8D,0x5D8E,0x5D8F,/* 0xE8-0xEF */ + 0x5D90,0x5D91,0x5D92,0x5D93,0x5D94,0x5D95,0x5D96,0x5D97,/* 0xF0-0xF7 */ + 0x5D98,0x5D9A,0x5D9B,0x5D9C,0x5D9E,0x5D9F,0x5DA0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5DA1,0x5DA2,0x5DA3,0x5DA4,0x5DA5,0x5DA6,0x5DA7,0x5DA8,/* 0x40-0x47 */ + 0x5DA9,0x5DAA,0x5DAB,0x5DAC,0x5DAD,0x5DAE,0x5DAF,0x5DB0,/* 0x48-0x4F */ + 0x5DB1,0x5DB2,0x5DB3,0x5DB4,0x5DB5,0x5DB6,0x5DB8,0x5DB9,/* 0x50-0x57 */ + 0x5DBA,0x5DBB,0x5DBC,0x5DBD,0x5DBE,0x5DBF,0x5DC0,0x5DC1,/* 0x58-0x5F */ + 0x5DC2,0x5DC3,0x5DC4,0x5DC6,0x5DC7,0x5DC8,0x5DC9,0x5DCA,/* 0x60-0x67 */ + 0x5DCB,0x5DCC,0x5DCE,0x5DCF,0x5DD0,0x5DD1,0x5DD2,0x5DD3,/* 0x68-0x6F */ + 0x5DD4,0x5DD5,0x5DD6,0x5DD7,0x5DD8,0x5DD9,0x5DDA,0x5DDC,/* 0x70-0x77 */ + 0x5DDF,0x5DE0,0x5DE3,0x5DE4,0x5DEA,0x5DEC,0x5DED,0x0000,/* 0x78-0x7F */ + + 0x5DF0,0x5DF5,0x5DF6,0x5DF8,0x5DF9,0x5DFA,0x5DFB,0x5DFC,/* 0x80-0x87 */ + 0x5DFF,0x5E00,0x5E04,0x5E07,0x5E09,0x5E0A,0x5E0B,0x5E0D,/* 0x88-0x8F */ + 0x5E0E,0x5E12,0x5E13,0x5E17,0x5E1E,0x5E1F,0x5E20,0x5E21,/* 0x90-0x97 */ + 0x5E22,0x5E23,0x5E24,0x5E25,0x5E28,0x5E29,0x5E2A,0x5E2B,/* 0x98-0x9F */ + 0x5E2C,0x5E2F,0x5E30,0x5E32,0x5E33,0x5E34,0x5E35,0x5E36,/* 0xA0-0xA7 */ + 0x5E39,0x5E3A,0x5E3E,0x5E3F,0x5E40,0x5E41,0x5E43,0x5E46,/* 0xA8-0xAF */ + 0x5E47,0x5E48,0x5E49,0x5E4A,0x5E4B,0x5E4D,0x5E4E,0x5E4F,/* 0xB0-0xB7 */ + 0x5E50,0x5E51,0x5E52,0x5E53,0x5E56,0x5E57,0x5E58,0x5E59,/* 0xB8-0xBF */ + 0x5E5A,0x5E5C,0x5E5D,0x5E5F,0x5E60,0x5E63,0x5E64,0x5E65,/* 0xC0-0xC7 */ + 0x5E66,0x5E67,0x5E68,0x5E69,0x5E6A,0x5E6B,0x5E6C,0x5E6D,/* 0xC8-0xCF */ + 0x5E6E,0x5E6F,0x5E70,0x5E71,0x5E75,0x5E77,0x5E79,0x5E7E,/* 0xD0-0xD7 */ + 0x5E81,0x5E82,0x5E83,0x5E85,0x5E88,0x5E89,0x5E8C,0x5E8D,/* 0xD8-0xDF */ + 0x5E8E,0x5E92,0x5E98,0x5E9B,0x5E9D,0x5EA1,0x5EA2,0x5EA3,/* 0xE0-0xE7 */ + 0x5EA4,0x5EA8,0x5EA9,0x5EAA,0x5EAB,0x5EAC,0x5EAE,0x5EAF,/* 0xE8-0xEF */ + 0x5EB0,0x5EB1,0x5EB2,0x5EB4,0x5EBA,0x5EBB,0x5EBC,0x5EBD,/* 0xF0-0xF7 */ + 0x5EBF,0x5EC0,0x5EC1,0x5EC2,0x5EC3,0x5EC4,0x5EC5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5EC6,0x5EC7,0x5EC8,0x5ECB,0x5ECC,0x5ECD,0x5ECE,0x5ECF,/* 0x40-0x47 */ + 0x5ED0,0x5ED4,0x5ED5,0x5ED7,0x5ED8,0x5ED9,0x5EDA,0x5EDC,/* 0x48-0x4F */ + 0x5EDD,0x5EDE,0x5EDF,0x5EE0,0x5EE1,0x5EE2,0x5EE3,0x5EE4,/* 0x50-0x57 */ + 0x5EE5,0x5EE6,0x5EE7,0x5EE9,0x5EEB,0x5EEC,0x5EED,0x5EEE,/* 0x58-0x5F */ + 0x5EEF,0x5EF0,0x5EF1,0x5EF2,0x5EF3,0x5EF5,0x5EF8,0x5EF9,/* 0x60-0x67 */ + 0x5EFB,0x5EFC,0x5EFD,0x5F05,0x5F06,0x5F07,0x5F09,0x5F0C,/* 0x68-0x6F */ + 0x5F0D,0x5F0E,0x5F10,0x5F12,0x5F14,0x5F16,0x5F19,0x5F1A,/* 0x70-0x77 */ + 0x5F1C,0x5F1D,0x5F1E,0x5F21,0x5F22,0x5F23,0x5F24,0x0000,/* 0x78-0x7F */ + + 0x5F28,0x5F2B,0x5F2C,0x5F2E,0x5F30,0x5F32,0x5F33,0x5F34,/* 0x80-0x87 */ + 0x5F35,0x5F36,0x5F37,0x5F38,0x5F3B,0x5F3D,0x5F3E,0x5F3F,/* 0x88-0x8F */ + 0x5F41,0x5F42,0x5F43,0x5F44,0x5F45,0x5F46,0x5F47,0x5F48,/* 0x90-0x97 */ + 0x5F49,0x5F4A,0x5F4B,0x5F4C,0x5F4D,0x5F4E,0x5F4F,0x5F51,/* 0x98-0x9F */ + 0x5F54,0x5F59,0x5F5A,0x5F5B,0x5F5C,0x5F5E,0x5F5F,0x5F60,/* 0xA0-0xA7 */ + 0x5F63,0x5F65,0x5F67,0x5F68,0x5F6B,0x5F6E,0x5F6F,0x5F72,/* 0xA8-0xAF */ + 0x5F74,0x5F75,0x5F76,0x5F78,0x5F7A,0x5F7D,0x5F7E,0x5F7F,/* 0xB0-0xB7 */ + 0x5F83,0x5F86,0x5F8D,0x5F8E,0x5F8F,0x5F91,0x5F93,0x5F94,/* 0xB8-0xBF */ + 0x5F96,0x5F9A,0x5F9B,0x5F9D,0x5F9E,0x5F9F,0x5FA0,0x5FA2,/* 0xC0-0xC7 */ + 0x5FA3,0x5FA4,0x5FA5,0x5FA6,0x5FA7,0x5FA9,0x5FAB,0x5FAC,/* 0xC8-0xCF */ + 0x5FAF,0x5FB0,0x5FB1,0x5FB2,0x5FB3,0x5FB4,0x5FB6,0x5FB8,/* 0xD0-0xD7 */ + 0x5FB9,0x5FBA,0x5FBB,0x5FBE,0x5FBF,0x5FC0,0x5FC1,0x5FC2,/* 0xD8-0xDF */ + 0x5FC7,0x5FC8,0x5FCA,0x5FCB,0x5FCE,0x5FD3,0x5FD4,0x5FD5,/* 0xE0-0xE7 */ + 0x5FDA,0x5FDB,0x5FDC,0x5FDE,0x5FDF,0x5FE2,0x5FE3,0x5FE5,/* 0xE8-0xEF */ + 0x5FE6,0x5FE8,0x5FE9,0x5FEC,0x5FEF,0x5FF0,0x5FF2,0x5FF3,/* 0xF0-0xF7 */ + 0x5FF4,0x5FF6,0x5FF7,0x5FF9,0x5FFA,0x5FFC,0x6007,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6008,0x6009,0x600B,0x600C,0x6010,0x6011,0x6013,0x6017,/* 0x40-0x47 */ + 0x6018,0x601A,0x601E,0x601F,0x6022,0x6023,0x6024,0x602C,/* 0x48-0x4F */ + 0x602D,0x602E,0x6030,0x6031,0x6032,0x6033,0x6034,0x6036,/* 0x50-0x57 */ + 0x6037,0x6038,0x6039,0x603A,0x603D,0x603E,0x6040,0x6044,/* 0x58-0x5F */ + 0x6045,0x6046,0x6047,0x6048,0x6049,0x604A,0x604C,0x604E,/* 0x60-0x67 */ + 0x604F,0x6051,0x6053,0x6054,0x6056,0x6057,0x6058,0x605B,/* 0x68-0x6F */ + 0x605C,0x605E,0x605F,0x6060,0x6061,0x6065,0x6066,0x606E,/* 0x70-0x77 */ + 0x6071,0x6072,0x6074,0x6075,0x6077,0x607E,0x6080,0x0000,/* 0x78-0x7F */ + + 0x6081,0x6082,0x6085,0x6086,0x6087,0x6088,0x608A,0x608B,/* 0x80-0x87 */ + 0x608E,0x608F,0x6090,0x6091,0x6093,0x6095,0x6097,0x6098,/* 0x88-0x8F */ + 0x6099,0x609C,0x609E,0x60A1,0x60A2,0x60A4,0x60A5,0x60A7,/* 0x90-0x97 */ + 0x60A9,0x60AA,0x60AE,0x60B0,0x60B3,0x60B5,0x60B6,0x60B7,/* 0x98-0x9F */ + 0x60B9,0x60BA,0x60BD,0x60BE,0x60BF,0x60C0,0x60C1,0x60C2,/* 0xA0-0xA7 */ + 0x60C3,0x60C4,0x60C7,0x60C8,0x60C9,0x60CC,0x60CD,0x60CE,/* 0xA8-0xAF */ + 0x60CF,0x60D0,0x60D2,0x60D3,0x60D4,0x60D6,0x60D7,0x60D9,/* 0xB0-0xB7 */ + 0x60DB,0x60DE,0x60E1,0x60E2,0x60E3,0x60E4,0x60E5,0x60EA,/* 0xB8-0xBF */ + 0x60F1,0x60F2,0x60F5,0x60F7,0x60F8,0x60FB,0x60FC,0x60FD,/* 0xC0-0xC7 */ + 0x60FE,0x60FF,0x6102,0x6103,0x6104,0x6105,0x6107,0x610A,/* 0xC8-0xCF */ + 0x610B,0x610C,0x6110,0x6111,0x6112,0x6113,0x6114,0x6116,/* 0xD0-0xD7 */ + 0x6117,0x6118,0x6119,0x611B,0x611C,0x611D,0x611E,0x6121,/* 0xD8-0xDF */ + 0x6122,0x6125,0x6128,0x6129,0x612A,0x612C,0x612D,0x612E,/* 0xE0-0xE7 */ + 0x612F,0x6130,0x6131,0x6132,0x6133,0x6134,0x6135,0x6136,/* 0xE8-0xEF */ + 0x6137,0x6138,0x6139,0x613A,0x613B,0x613C,0x613D,0x613E,/* 0xF0-0xF7 */ + 0x6140,0x6141,0x6142,0x6143,0x6144,0x6145,0x6146,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6147,0x6149,0x614B,0x614D,0x614F,0x6150,0x6152,0x6153,/* 0x40-0x47 */ + 0x6154,0x6156,0x6157,0x6158,0x6159,0x615A,0x615B,0x615C,/* 0x48-0x4F */ + 0x615E,0x615F,0x6160,0x6161,0x6163,0x6164,0x6165,0x6166,/* 0x50-0x57 */ + 0x6169,0x616A,0x616B,0x616C,0x616D,0x616E,0x616F,0x6171,/* 0x58-0x5F */ + 0x6172,0x6173,0x6174,0x6176,0x6178,0x6179,0x617A,0x617B,/* 0x60-0x67 */ + 0x617C,0x617D,0x617E,0x617F,0x6180,0x6181,0x6182,0x6183,/* 0x68-0x6F */ + 0x6184,0x6185,0x6186,0x6187,0x6188,0x6189,0x618A,0x618C,/* 0x70-0x77 */ + 0x618D,0x618F,0x6190,0x6191,0x6192,0x6193,0x6195,0x0000,/* 0x78-0x7F */ + + 0x6196,0x6197,0x6198,0x6199,0x619A,0x619B,0x619C,0x619E,/* 0x80-0x87 */ + 0x619F,0x61A0,0x61A1,0x61A2,0x61A3,0x61A4,0x61A5,0x61A6,/* 0x88-0x8F */ + 0x61AA,0x61AB,0x61AD,0x61AE,0x61AF,0x61B0,0x61B1,0x61B2,/* 0x90-0x97 */ + 0x61B3,0x61B4,0x61B5,0x61B6,0x61B8,0x61B9,0x61BA,0x61BB,/* 0x98-0x9F */ + 0x61BC,0x61BD,0x61BF,0x61C0,0x61C1,0x61C3,0x61C4,0x61C5,/* 0xA0-0xA7 */ + 0x61C6,0x61C7,0x61C9,0x61CC,0x61CD,0x61CE,0x61CF,0x61D0,/* 0xA8-0xAF */ + 0x61D3,0x61D5,0x61D6,0x61D7,0x61D8,0x61D9,0x61DA,0x61DB,/* 0xB0-0xB7 */ + 0x61DC,0x61DD,0x61DE,0x61DF,0x61E0,0x61E1,0x61E2,0x61E3,/* 0xB8-0xBF */ + 0x61E4,0x61E5,0x61E7,0x61E8,0x61E9,0x61EA,0x61EB,0x61EC,/* 0xC0-0xC7 */ + 0x61ED,0x61EE,0x61EF,0x61F0,0x61F1,0x61F2,0x61F3,0x61F4,/* 0xC8-0xCF */ + 0x61F6,0x61F7,0x61F8,0x61F9,0x61FA,0x61FB,0x61FC,0x61FD,/* 0xD0-0xD7 */ + 0x61FE,0x6200,0x6201,0x6202,0x6203,0x6204,0x6205,0x6207,/* 0xD8-0xDF */ + 0x6209,0x6213,0x6214,0x6219,0x621C,0x621D,0x621E,0x6220,/* 0xE0-0xE7 */ + 0x6223,0x6226,0x6227,0x6228,0x6229,0x622B,0x622D,0x622F,/* 0xE8-0xEF */ + 0x6230,0x6231,0x6232,0x6235,0x6236,0x6238,0x6239,0x623A,/* 0xF0-0xF7 */ + 0x623B,0x623C,0x6242,0x6244,0x6245,0x6246,0x624A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x624F,0x6250,0x6255,0x6256,0x6257,0x6259,0x625A,0x625C,/* 0x40-0x47 */ + 0x625D,0x625E,0x625F,0x6260,0x6261,0x6262,0x6264,0x6265,/* 0x48-0x4F */ + 0x6268,0x6271,0x6272,0x6274,0x6275,0x6277,0x6278,0x627A,/* 0x50-0x57 */ + 0x627B,0x627D,0x6281,0x6282,0x6283,0x6285,0x6286,0x6287,/* 0x58-0x5F */ + 0x6288,0x628B,0x628C,0x628D,0x628E,0x628F,0x6290,0x6294,/* 0x60-0x67 */ + 0x6299,0x629C,0x629D,0x629E,0x62A3,0x62A6,0x62A7,0x62A9,/* 0x68-0x6F */ + 0x62AA,0x62AD,0x62AE,0x62AF,0x62B0,0x62B2,0x62B3,0x62B4,/* 0x70-0x77 */ + 0x62B6,0x62B7,0x62B8,0x62BA,0x62BE,0x62C0,0x62C1,0x0000,/* 0x78-0x7F */ + + 0x62C3,0x62CB,0x62CF,0x62D1,0x62D5,0x62DD,0x62DE,0x62E0,/* 0x80-0x87 */ + 0x62E1,0x62E4,0x62EA,0x62EB,0x62F0,0x62F2,0x62F5,0x62F8,/* 0x88-0x8F */ + 0x62F9,0x62FA,0x62FB,0x6300,0x6303,0x6304,0x6305,0x6306,/* 0x90-0x97 */ + 0x630A,0x630B,0x630C,0x630D,0x630F,0x6310,0x6312,0x6313,/* 0x98-0x9F */ + 0x6314,0x6315,0x6317,0x6318,0x6319,0x631C,0x6326,0x6327,/* 0xA0-0xA7 */ + 0x6329,0x632C,0x632D,0x632E,0x6330,0x6331,0x6333,0x6334,/* 0xA8-0xAF */ + 0x6335,0x6336,0x6337,0x6338,0x633B,0x633C,0x633E,0x633F,/* 0xB0-0xB7 */ + 0x6340,0x6341,0x6344,0x6347,0x6348,0x634A,0x6351,0x6352,/* 0xB8-0xBF */ + 0x6353,0x6354,0x6356,0x6357,0x6358,0x6359,0x635A,0x635B,/* 0xC0-0xC7 */ + 0x635C,0x635D,0x6360,0x6364,0x6365,0x6366,0x6368,0x636A,/* 0xC8-0xCF */ + 0x636B,0x636C,0x636F,0x6370,0x6372,0x6373,0x6374,0x6375,/* 0xD0-0xD7 */ + 0x6378,0x6379,0x637C,0x637D,0x637E,0x637F,0x6381,0x6383,/* 0xD8-0xDF */ + 0x6384,0x6385,0x6386,0x638B,0x638D,0x6391,0x6393,0x6394,/* 0xE0-0xE7 */ + 0x6395,0x6397,0x6399,0x639A,0x639B,0x639C,0x639D,0x639E,/* 0xE8-0xEF */ + 0x639F,0x63A1,0x63A4,0x63A6,0x63AB,0x63AF,0x63B1,0x63B2,/* 0xF0-0xF7 */ + 0x63B5,0x63B6,0x63B9,0x63BB,0x63BD,0x63BF,0x63C0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x63C1,0x63C2,0x63C3,0x63C5,0x63C7,0x63C8,0x63CA,0x63CB,/* 0x40-0x47 */ + 0x63CC,0x63D1,0x63D3,0x63D4,0x63D5,0x63D7,0x63D8,0x63D9,/* 0x48-0x4F */ + 0x63DA,0x63DB,0x63DC,0x63DD,0x63DF,0x63E2,0x63E4,0x63E5,/* 0x50-0x57 */ + 0x63E6,0x63E7,0x63E8,0x63EB,0x63EC,0x63EE,0x63EF,0x63F0,/* 0x58-0x5F */ + 0x63F1,0x63F3,0x63F5,0x63F7,0x63F9,0x63FA,0x63FB,0x63FC,/* 0x60-0x67 */ + 0x63FE,0x6403,0x6404,0x6406,0x6407,0x6408,0x6409,0x640A,/* 0x68-0x6F */ + 0x640D,0x640E,0x6411,0x6412,0x6415,0x6416,0x6417,0x6418,/* 0x70-0x77 */ + 0x6419,0x641A,0x641D,0x641F,0x6422,0x6423,0x6424,0x0000,/* 0x78-0x7F */ + + 0x6425,0x6427,0x6428,0x6429,0x642B,0x642E,0x642F,0x6430,/* 0x80-0x87 */ + 0x6431,0x6432,0x6433,0x6435,0x6436,0x6437,0x6438,0x6439,/* 0x88-0x8F */ + 0x643B,0x643C,0x643E,0x6440,0x6442,0x6443,0x6449,0x644B,/* 0x90-0x97 */ + 0x644C,0x644D,0x644E,0x644F,0x6450,0x6451,0x6453,0x6455,/* 0x98-0x9F */ + 0x6456,0x6457,0x6459,0x645A,0x645B,0x645C,0x645D,0x645F,/* 0xA0-0xA7 */ + 0x6460,0x6461,0x6462,0x6463,0x6464,0x6465,0x6466,0x6468,/* 0xA8-0xAF */ + 0x646A,0x646B,0x646C,0x646E,0x646F,0x6470,0x6471,0x6472,/* 0xB0-0xB7 */ + 0x6473,0x6474,0x6475,0x6476,0x6477,0x647B,0x647C,0x647D,/* 0xB8-0xBF */ + 0x647E,0x647F,0x6480,0x6481,0x6483,0x6486,0x6488,0x6489,/* 0xC0-0xC7 */ + 0x648A,0x648B,0x648C,0x648D,0x648E,0x648F,0x6490,0x6493,/* 0xC8-0xCF */ + 0x6494,0x6497,0x6498,0x649A,0x649B,0x649C,0x649D,0x649F,/* 0xD0-0xD7 */ + 0x64A0,0x64A1,0x64A2,0x64A3,0x64A5,0x64A6,0x64A7,0x64A8,/* 0xD8-0xDF */ + 0x64AA,0x64AB,0x64AF,0x64B1,0x64B2,0x64B3,0x64B4,0x64B6,/* 0xE0-0xE7 */ + 0x64B9,0x64BB,0x64BD,0x64BE,0x64BF,0x64C1,0x64C3,0x64C4,/* 0xE8-0xEF */ + 0x64C6,0x64C7,0x64C8,0x64C9,0x64CA,0x64CB,0x64CC,0x64CF,/* 0xF0-0xF7 */ + 0x64D1,0x64D3,0x64D4,0x64D5,0x64D6,0x64D9,0x64DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x64DB,0x64DC,0x64DD,0x64DF,0x64E0,0x64E1,0x64E3,0x64E5,/* 0x40-0x47 */ + 0x64E7,0x64E8,0x64E9,0x64EA,0x64EB,0x64EC,0x64ED,0x64EE,/* 0x48-0x4F */ + 0x64EF,0x64F0,0x64F1,0x64F2,0x64F3,0x64F4,0x64F5,0x64F6,/* 0x50-0x57 */ + 0x64F7,0x64F8,0x64F9,0x64FA,0x64FB,0x64FC,0x64FD,0x64FE,/* 0x58-0x5F */ + 0x64FF,0x6501,0x6502,0x6503,0x6504,0x6505,0x6506,0x6507,/* 0x60-0x67 */ + 0x6508,0x650A,0x650B,0x650C,0x650D,0x650E,0x650F,0x6510,/* 0x68-0x6F */ + 0x6511,0x6513,0x6514,0x6515,0x6516,0x6517,0x6519,0x651A,/* 0x70-0x77 */ + 0x651B,0x651C,0x651D,0x651E,0x651F,0x6520,0x6521,0x0000,/* 0x78-0x7F */ + + 0x6522,0x6523,0x6524,0x6526,0x6527,0x6528,0x6529,0x652A,/* 0x80-0x87 */ + 0x652C,0x652D,0x6530,0x6531,0x6532,0x6533,0x6537,0x653A,/* 0x88-0x8F */ + 0x653C,0x653D,0x6540,0x6541,0x6542,0x6543,0x6544,0x6546,/* 0x90-0x97 */ + 0x6547,0x654A,0x654B,0x654D,0x654E,0x6550,0x6552,0x6553,/* 0x98-0x9F */ + 0x6554,0x6557,0x6558,0x655A,0x655C,0x655F,0x6560,0x6561,/* 0xA0-0xA7 */ + 0x6564,0x6565,0x6567,0x6568,0x6569,0x656A,0x656D,0x656E,/* 0xA8-0xAF */ + 0x656F,0x6571,0x6573,0x6575,0x6576,0x6578,0x6579,0x657A,/* 0xB0-0xB7 */ + 0x657B,0x657C,0x657D,0x657E,0x657F,0x6580,0x6581,0x6582,/* 0xB8-0xBF */ + 0x6583,0x6584,0x6585,0x6586,0x6588,0x6589,0x658A,0x658D,/* 0xC0-0xC7 */ + 0x658E,0x658F,0x6592,0x6594,0x6595,0x6596,0x6598,0x659A,/* 0xC8-0xCF */ + 0x659D,0x659E,0x65A0,0x65A2,0x65A3,0x65A6,0x65A8,0x65AA,/* 0xD0-0xD7 */ + 0x65AC,0x65AE,0x65B1,0x65B2,0x65B3,0x65B4,0x65B5,0x65B6,/* 0xD8-0xDF */ + 0x65B7,0x65B8,0x65BA,0x65BB,0x65BE,0x65BF,0x65C0,0x65C2,/* 0xE0-0xE7 */ + 0x65C7,0x65C8,0x65C9,0x65CA,0x65CD,0x65D0,0x65D1,0x65D3,/* 0xE8-0xEF */ + 0x65D4,0x65D5,0x65D8,0x65D9,0x65DA,0x65DB,0x65DC,0x65DD,/* 0xF0-0xF7 */ + 0x65DE,0x65DF,0x65E1,0x65E3,0x65E4,0x65EA,0x65EB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x65F2,0x65F3,0x65F4,0x65F5,0x65F8,0x65F9,0x65FB,0x65FC,/* 0x40-0x47 */ + 0x65FD,0x65FE,0x65FF,0x6601,0x6604,0x6605,0x6607,0x6608,/* 0x48-0x4F */ + 0x6609,0x660B,0x660D,0x6610,0x6611,0x6612,0x6616,0x6617,/* 0x50-0x57 */ + 0x6618,0x661A,0x661B,0x661C,0x661E,0x6621,0x6622,0x6623,/* 0x58-0x5F */ + 0x6624,0x6626,0x6629,0x662A,0x662B,0x662C,0x662E,0x6630,/* 0x60-0x67 */ + 0x6632,0x6633,0x6637,0x6638,0x6639,0x663A,0x663B,0x663D,/* 0x68-0x6F */ + 0x663F,0x6640,0x6642,0x6644,0x6645,0x6646,0x6647,0x6648,/* 0x70-0x77 */ + 0x6649,0x664A,0x664D,0x664E,0x6650,0x6651,0x6658,0x0000,/* 0x78-0x7F */ + + 0x6659,0x665B,0x665C,0x665D,0x665E,0x6660,0x6662,0x6663,/* 0x80-0x87 */ + 0x6665,0x6667,0x6669,0x666A,0x666B,0x666C,0x666D,0x6671,/* 0x88-0x8F */ + 0x6672,0x6673,0x6675,0x6678,0x6679,0x667B,0x667C,0x667D,/* 0x90-0x97 */ + 0x667F,0x6680,0x6681,0x6683,0x6685,0x6686,0x6688,0x6689,/* 0x98-0x9F */ + 0x668A,0x668B,0x668D,0x668E,0x668F,0x6690,0x6692,0x6693,/* 0xA0-0xA7 */ + 0x6694,0x6695,0x6698,0x6699,0x669A,0x669B,0x669C,0x669E,/* 0xA8-0xAF */ + 0x669F,0x66A0,0x66A1,0x66A2,0x66A3,0x66A4,0x66A5,0x66A6,/* 0xB0-0xB7 */ + 0x66A9,0x66AA,0x66AB,0x66AC,0x66AD,0x66AF,0x66B0,0x66B1,/* 0xB8-0xBF */ + 0x66B2,0x66B3,0x66B5,0x66B6,0x66B7,0x66B8,0x66BA,0x66BB,/* 0xC0-0xC7 */ + 0x66BC,0x66BD,0x66BF,0x66C0,0x66C1,0x66C2,0x66C3,0x66C4,/* 0xC8-0xCF */ + 0x66C5,0x66C6,0x66C7,0x66C8,0x66C9,0x66CA,0x66CB,0x66CC,/* 0xD0-0xD7 */ + 0x66CD,0x66CE,0x66CF,0x66D0,0x66D1,0x66D2,0x66D3,0x66D4,/* 0xD8-0xDF */ + 0x66D5,0x66D6,0x66D7,0x66D8,0x66DA,0x66DE,0x66DF,0x66E0,/* 0xE0-0xE7 */ + 0x66E1,0x66E2,0x66E3,0x66E4,0x66E5,0x66E7,0x66E8,0x66EA,/* 0xE8-0xEF */ + 0x66EB,0x66EC,0x66ED,0x66EE,0x66EF,0x66F1,0x66F5,0x66F6,/* 0xF0-0xF7 */ + 0x66F8,0x66FA,0x66FB,0x66FD,0x6701,0x6702,0x6703,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6704,0x6705,0x6706,0x6707,0x670C,0x670E,0x670F,0x6711,/* 0x40-0x47 */ + 0x6712,0x6713,0x6716,0x6718,0x6719,0x671A,0x671C,0x671E,/* 0x48-0x4F */ + 0x6720,0x6721,0x6722,0x6723,0x6724,0x6725,0x6727,0x6729,/* 0x50-0x57 */ + 0x672E,0x6730,0x6732,0x6733,0x6736,0x6737,0x6738,0x6739,/* 0x58-0x5F */ + 0x673B,0x673C,0x673E,0x673F,0x6741,0x6744,0x6745,0x6747,/* 0x60-0x67 */ + 0x674A,0x674B,0x674D,0x6752,0x6754,0x6755,0x6757,0x6758,/* 0x68-0x6F */ + 0x6759,0x675A,0x675B,0x675D,0x6762,0x6763,0x6764,0x6766,/* 0x70-0x77 */ + 0x6767,0x676B,0x676C,0x676E,0x6771,0x6774,0x6776,0x0000,/* 0x78-0x7F */ + + 0x6778,0x6779,0x677A,0x677B,0x677D,0x6780,0x6782,0x6783,/* 0x80-0x87 */ + 0x6785,0x6786,0x6788,0x678A,0x678C,0x678D,0x678E,0x678F,/* 0x88-0x8F */ + 0x6791,0x6792,0x6793,0x6794,0x6796,0x6799,0x679B,0x679F,/* 0x90-0x97 */ + 0x67A0,0x67A1,0x67A4,0x67A6,0x67A9,0x67AC,0x67AE,0x67B1,/* 0x98-0x9F */ + 0x67B2,0x67B4,0x67B9,0x67BA,0x67BB,0x67BC,0x67BD,0x67BE,/* 0xA0-0xA7 */ + 0x67BF,0x67C0,0x67C2,0x67C5,0x67C6,0x67C7,0x67C8,0x67C9,/* 0xA8-0xAF */ + 0x67CA,0x67CB,0x67CC,0x67CD,0x67CE,0x67D5,0x67D6,0x67D7,/* 0xB0-0xB7 */ + 0x67DB,0x67DF,0x67E1,0x67E3,0x67E4,0x67E6,0x67E7,0x67E8,/* 0xB8-0xBF */ + 0x67EA,0x67EB,0x67ED,0x67EE,0x67F2,0x67F5,0x67F6,0x67F7,/* 0xC0-0xC7 */ + 0x67F8,0x67F9,0x67FA,0x67FB,0x67FC,0x67FE,0x6801,0x6802,/* 0xC8-0xCF */ + 0x6803,0x6804,0x6806,0x680D,0x6810,0x6812,0x6814,0x6815,/* 0xD0-0xD7 */ + 0x6818,0x6819,0x681A,0x681B,0x681C,0x681E,0x681F,0x6820,/* 0xD8-0xDF */ + 0x6822,0x6823,0x6824,0x6825,0x6826,0x6827,0x6828,0x682B,/* 0xE0-0xE7 */ + 0x682C,0x682D,0x682E,0x682F,0x6830,0x6831,0x6834,0x6835,/* 0xE8-0xEF */ + 0x6836,0x683A,0x683B,0x683F,0x6847,0x684B,0x684D,0x684F,/* 0xF0-0xF7 */ + 0x6852,0x6856,0x6857,0x6858,0x6859,0x685A,0x685B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x685C,0x685D,0x685E,0x685F,0x686A,0x686C,0x686D,0x686E,/* 0x40-0x47 */ + 0x686F,0x6870,0x6871,0x6872,0x6873,0x6875,0x6878,0x6879,/* 0x48-0x4F */ + 0x687A,0x687B,0x687C,0x687D,0x687E,0x687F,0x6880,0x6882,/* 0x50-0x57 */ + 0x6884,0x6887,0x6888,0x6889,0x688A,0x688B,0x688C,0x688D,/* 0x58-0x5F */ + 0x688E,0x6890,0x6891,0x6892,0x6894,0x6895,0x6896,0x6898,/* 0x60-0x67 */ + 0x6899,0x689A,0x689B,0x689C,0x689D,0x689E,0x689F,0x68A0,/* 0x68-0x6F */ + 0x68A1,0x68A3,0x68A4,0x68A5,0x68A9,0x68AA,0x68AB,0x68AC,/* 0x70-0x77 */ + 0x68AE,0x68B1,0x68B2,0x68B4,0x68B6,0x68B7,0x68B8,0x0000,/* 0x78-0x7F */ + + 0x68B9,0x68BA,0x68BB,0x68BC,0x68BD,0x68BE,0x68BF,0x68C1,/* 0x80-0x87 */ + 0x68C3,0x68C4,0x68C5,0x68C6,0x68C7,0x68C8,0x68CA,0x68CC,/* 0x88-0x8F */ + 0x68CE,0x68CF,0x68D0,0x68D1,0x68D3,0x68D4,0x68D6,0x68D7,/* 0x90-0x97 */ + 0x68D9,0x68DB,0x68DC,0x68DD,0x68DE,0x68DF,0x68E1,0x68E2,/* 0x98-0x9F */ + 0x68E4,0x68E5,0x68E6,0x68E7,0x68E8,0x68E9,0x68EA,0x68EB,/* 0xA0-0xA7 */ + 0x68EC,0x68ED,0x68EF,0x68F2,0x68F3,0x68F4,0x68F6,0x68F7,/* 0xA8-0xAF */ + 0x68F8,0x68FB,0x68FD,0x68FE,0x68FF,0x6900,0x6902,0x6903,/* 0xB0-0xB7 */ + 0x6904,0x6906,0x6907,0x6908,0x6909,0x690A,0x690C,0x690F,/* 0xB8-0xBF */ + 0x6911,0x6913,0x6914,0x6915,0x6916,0x6917,0x6918,0x6919,/* 0xC0-0xC7 */ + 0x691A,0x691B,0x691C,0x691D,0x691E,0x6921,0x6922,0x6923,/* 0xC8-0xCF */ + 0x6925,0x6926,0x6927,0x6928,0x6929,0x692A,0x692B,0x692C,/* 0xD0-0xD7 */ + 0x692E,0x692F,0x6931,0x6932,0x6933,0x6935,0x6936,0x6937,/* 0xD8-0xDF */ + 0x6938,0x693A,0x693B,0x693C,0x693E,0x6940,0x6941,0x6943,/* 0xE0-0xE7 */ + 0x6944,0x6945,0x6946,0x6947,0x6948,0x6949,0x694A,0x694B,/* 0xE8-0xEF */ + 0x694C,0x694D,0x694E,0x694F,0x6950,0x6951,0x6952,0x6953,/* 0xF0-0xF7 */ + 0x6955,0x6956,0x6958,0x6959,0x695B,0x695C,0x695F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6961,0x6962,0x6964,0x6965,0x6967,0x6968,0x6969,0x696A,/* 0x40-0x47 */ + 0x696C,0x696D,0x696F,0x6970,0x6972,0x6973,0x6974,0x6975,/* 0x48-0x4F */ + 0x6976,0x697A,0x697B,0x697D,0x697E,0x697F,0x6981,0x6983,/* 0x50-0x57 */ + 0x6985,0x698A,0x698B,0x698C,0x698E,0x698F,0x6990,0x6991,/* 0x58-0x5F */ + 0x6992,0x6993,0x6996,0x6997,0x6999,0x699A,0x699D,0x699E,/* 0x60-0x67 */ + 0x699F,0x69A0,0x69A1,0x69A2,0x69A3,0x69A4,0x69A5,0x69A6,/* 0x68-0x6F */ + 0x69A9,0x69AA,0x69AC,0x69AE,0x69AF,0x69B0,0x69B2,0x69B3,/* 0x70-0x77 */ + 0x69B5,0x69B6,0x69B8,0x69B9,0x69BA,0x69BC,0x69BD,0x0000,/* 0x78-0x7F */ + + 0x69BE,0x69BF,0x69C0,0x69C2,0x69C3,0x69C4,0x69C5,0x69C6,/* 0x80-0x87 */ + 0x69C7,0x69C8,0x69C9,0x69CB,0x69CD,0x69CF,0x69D1,0x69D2,/* 0x88-0x8F */ + 0x69D3,0x69D5,0x69D6,0x69D7,0x69D8,0x69D9,0x69DA,0x69DC,/* 0x90-0x97 */ + 0x69DD,0x69DE,0x69E1,0x69E2,0x69E3,0x69E4,0x69E5,0x69E6,/* 0x98-0x9F */ + 0x69E7,0x69E8,0x69E9,0x69EA,0x69EB,0x69EC,0x69EE,0x69EF,/* 0xA0-0xA7 */ + 0x69F0,0x69F1,0x69F3,0x69F4,0x69F5,0x69F6,0x69F7,0x69F8,/* 0xA8-0xAF */ + 0x69F9,0x69FA,0x69FB,0x69FC,0x69FE,0x6A00,0x6A01,0x6A02,/* 0xB0-0xB7 */ + 0x6A03,0x6A04,0x6A05,0x6A06,0x6A07,0x6A08,0x6A09,0x6A0B,/* 0xB8-0xBF */ + 0x6A0C,0x6A0D,0x6A0E,0x6A0F,0x6A10,0x6A11,0x6A12,0x6A13,/* 0xC0-0xC7 */ + 0x6A14,0x6A15,0x6A16,0x6A19,0x6A1A,0x6A1B,0x6A1C,0x6A1D,/* 0xC8-0xCF */ + 0x6A1E,0x6A20,0x6A22,0x6A23,0x6A24,0x6A25,0x6A26,0x6A27,/* 0xD0-0xD7 */ + 0x6A29,0x6A2B,0x6A2C,0x6A2D,0x6A2E,0x6A30,0x6A32,0x6A33,/* 0xD8-0xDF */ + 0x6A34,0x6A36,0x6A37,0x6A38,0x6A39,0x6A3A,0x6A3B,0x6A3C,/* 0xE0-0xE7 */ + 0x6A3F,0x6A40,0x6A41,0x6A42,0x6A43,0x6A45,0x6A46,0x6A48,/* 0xE8-0xEF */ + 0x6A49,0x6A4A,0x6A4B,0x6A4C,0x6A4D,0x6A4E,0x6A4F,0x6A51,/* 0xF0-0xF7 */ + 0x6A52,0x6A53,0x6A54,0x6A55,0x6A56,0x6A57,0x6A5A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A5C,0x6A5D,0x6A5E,0x6A5F,0x6A60,0x6A62,0x6A63,0x6A64,/* 0x40-0x47 */ + 0x6A66,0x6A67,0x6A68,0x6A69,0x6A6A,0x6A6B,0x6A6C,0x6A6D,/* 0x48-0x4F */ + 0x6A6E,0x6A6F,0x6A70,0x6A72,0x6A73,0x6A74,0x6A75,0x6A76,/* 0x50-0x57 */ + 0x6A77,0x6A78,0x6A7A,0x6A7B,0x6A7D,0x6A7E,0x6A7F,0x6A81,/* 0x58-0x5F */ + 0x6A82,0x6A83,0x6A85,0x6A86,0x6A87,0x6A88,0x6A89,0x6A8A,/* 0x60-0x67 */ + 0x6A8B,0x6A8C,0x6A8D,0x6A8F,0x6A92,0x6A93,0x6A94,0x6A95,/* 0x68-0x6F */ + 0x6A96,0x6A98,0x6A99,0x6A9A,0x6A9B,0x6A9C,0x6A9D,0x6A9E,/* 0x70-0x77 */ + 0x6A9F,0x6AA1,0x6AA2,0x6AA3,0x6AA4,0x6AA5,0x6AA6,0x0000,/* 0x78-0x7F */ + + 0x6AA7,0x6AA8,0x6AAA,0x6AAD,0x6AAE,0x6AAF,0x6AB0,0x6AB1,/* 0x80-0x87 */ + 0x6AB2,0x6AB3,0x6AB4,0x6AB5,0x6AB6,0x6AB7,0x6AB8,0x6AB9,/* 0x88-0x8F */ + 0x6ABA,0x6ABB,0x6ABC,0x6ABD,0x6ABE,0x6ABF,0x6AC0,0x6AC1,/* 0x90-0x97 */ + 0x6AC2,0x6AC3,0x6AC4,0x6AC5,0x6AC6,0x6AC7,0x6AC8,0x6AC9,/* 0x98-0x9F */ + 0x6ACA,0x6ACB,0x6ACC,0x6ACD,0x6ACE,0x6ACF,0x6AD0,0x6AD1,/* 0xA0-0xA7 */ + 0x6AD2,0x6AD3,0x6AD4,0x6AD5,0x6AD6,0x6AD7,0x6AD8,0x6AD9,/* 0xA8-0xAF */ + 0x6ADA,0x6ADB,0x6ADC,0x6ADD,0x6ADE,0x6ADF,0x6AE0,0x6AE1,/* 0xB0-0xB7 */ + 0x6AE2,0x6AE3,0x6AE4,0x6AE5,0x6AE6,0x6AE7,0x6AE8,0x6AE9,/* 0xB8-0xBF */ + 0x6AEA,0x6AEB,0x6AEC,0x6AED,0x6AEE,0x6AEF,0x6AF0,0x6AF1,/* 0xC0-0xC7 */ + 0x6AF2,0x6AF3,0x6AF4,0x6AF5,0x6AF6,0x6AF7,0x6AF8,0x6AF9,/* 0xC8-0xCF */ + 0x6AFA,0x6AFB,0x6AFC,0x6AFD,0x6AFE,0x6AFF,0x6B00,0x6B01,/* 0xD0-0xD7 */ + 0x6B02,0x6B03,0x6B04,0x6B05,0x6B06,0x6B07,0x6B08,0x6B09,/* 0xD8-0xDF */ + 0x6B0A,0x6B0B,0x6B0C,0x6B0D,0x6B0E,0x6B0F,0x6B10,0x6B11,/* 0xE0-0xE7 */ + 0x6B12,0x6B13,0x6B14,0x6B15,0x6B16,0x6B17,0x6B18,0x6B19,/* 0xE8-0xEF */ + 0x6B1A,0x6B1B,0x6B1C,0x6B1D,0x6B1E,0x6B1F,0x6B25,0x6B26,/* 0xF0-0xF7 */ + 0x6B28,0x6B29,0x6B2A,0x6B2B,0x6B2C,0x6B2D,0x6B2E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6B2F,0x6B30,0x6B31,0x6B33,0x6B34,0x6B35,0x6B36,0x6B38,/* 0x40-0x47 */ + 0x6B3B,0x6B3C,0x6B3D,0x6B3F,0x6B40,0x6B41,0x6B42,0x6B44,/* 0x48-0x4F */ + 0x6B45,0x6B48,0x6B4A,0x6B4B,0x6B4D,0x6B4E,0x6B4F,0x6B50,/* 0x50-0x57 */ + 0x6B51,0x6B52,0x6B53,0x6B54,0x6B55,0x6B56,0x6B57,0x6B58,/* 0x58-0x5F */ + 0x6B5A,0x6B5B,0x6B5C,0x6B5D,0x6B5E,0x6B5F,0x6B60,0x6B61,/* 0x60-0x67 */ + 0x6B68,0x6B69,0x6B6B,0x6B6C,0x6B6D,0x6B6E,0x6B6F,0x6B70,/* 0x68-0x6F */ + 0x6B71,0x6B72,0x6B73,0x6B74,0x6B75,0x6B76,0x6B77,0x6B78,/* 0x70-0x77 */ + 0x6B7A,0x6B7D,0x6B7E,0x6B7F,0x6B80,0x6B85,0x6B88,0x0000,/* 0x78-0x7F */ + + 0x6B8C,0x6B8E,0x6B8F,0x6B90,0x6B91,0x6B94,0x6B95,0x6B97,/* 0x80-0x87 */ + 0x6B98,0x6B99,0x6B9C,0x6B9D,0x6B9E,0x6B9F,0x6BA0,0x6BA2,/* 0x88-0x8F */ + 0x6BA3,0x6BA4,0x6BA5,0x6BA6,0x6BA7,0x6BA8,0x6BA9,0x6BAB,/* 0x90-0x97 */ + 0x6BAC,0x6BAD,0x6BAE,0x6BAF,0x6BB0,0x6BB1,0x6BB2,0x6BB6,/* 0x98-0x9F */ + 0x6BB8,0x6BB9,0x6BBA,0x6BBB,0x6BBC,0x6BBD,0x6BBE,0x6BC0,/* 0xA0-0xA7 */ + 0x6BC3,0x6BC4,0x6BC6,0x6BC7,0x6BC8,0x6BC9,0x6BCA,0x6BCC,/* 0xA8-0xAF */ + 0x6BCE,0x6BD0,0x6BD1,0x6BD8,0x6BDA,0x6BDC,0x6BDD,0x6BDE,/* 0xB0-0xB7 */ + 0x6BDF,0x6BE0,0x6BE2,0x6BE3,0x6BE4,0x6BE5,0x6BE6,0x6BE7,/* 0xB8-0xBF */ + 0x6BE8,0x6BE9,0x6BEC,0x6BED,0x6BEE,0x6BF0,0x6BF1,0x6BF2,/* 0xC0-0xC7 */ + 0x6BF4,0x6BF6,0x6BF7,0x6BF8,0x6BFA,0x6BFB,0x6BFC,0x6BFE,/* 0xC8-0xCF */ + 0x6BFF,0x6C00,0x6C01,0x6C02,0x6C03,0x6C04,0x6C08,0x6C09,/* 0xD0-0xD7 */ + 0x6C0A,0x6C0B,0x6C0C,0x6C0E,0x6C12,0x6C17,0x6C1C,0x6C1D,/* 0xD8-0xDF */ + 0x6C1E,0x6C20,0x6C23,0x6C25,0x6C2B,0x6C2C,0x6C2D,0x6C31,/* 0xE0-0xE7 */ + 0x6C33,0x6C36,0x6C37,0x6C39,0x6C3A,0x6C3B,0x6C3C,0x6C3E,/* 0xE8-0xEF */ + 0x6C3F,0x6C43,0x6C44,0x6C45,0x6C48,0x6C4B,0x6C4C,0x6C4D,/* 0xF0-0xF7 */ + 0x6C4E,0x6C4F,0x6C51,0x6C52,0x6C53,0x6C56,0x6C58,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6C59,0x6C5A,0x6C62,0x6C63,0x6C65,0x6C66,0x6C67,0x6C6B,/* 0x40-0x47 */ + 0x6C6C,0x6C6D,0x6C6E,0x6C6F,0x6C71,0x6C73,0x6C75,0x6C77,/* 0x48-0x4F */ + 0x6C78,0x6C7A,0x6C7B,0x6C7C,0x6C7F,0x6C80,0x6C84,0x6C87,/* 0x50-0x57 */ + 0x6C8A,0x6C8B,0x6C8D,0x6C8E,0x6C91,0x6C92,0x6C95,0x6C96,/* 0x58-0x5F */ + 0x6C97,0x6C98,0x6C9A,0x6C9C,0x6C9D,0x6C9E,0x6CA0,0x6CA2,/* 0x60-0x67 */ + 0x6CA8,0x6CAC,0x6CAF,0x6CB0,0x6CB4,0x6CB5,0x6CB6,0x6CB7,/* 0x68-0x6F */ + 0x6CBA,0x6CC0,0x6CC1,0x6CC2,0x6CC3,0x6CC6,0x6CC7,0x6CC8,/* 0x70-0x77 */ + 0x6CCB,0x6CCD,0x6CCE,0x6CCF,0x6CD1,0x6CD2,0x6CD8,0x0000,/* 0x78-0x7F */ + + 0x6CD9,0x6CDA,0x6CDC,0x6CDD,0x6CDF,0x6CE4,0x6CE6,0x6CE7,/* 0x80-0x87 */ + 0x6CE9,0x6CEC,0x6CED,0x6CF2,0x6CF4,0x6CF9,0x6CFF,0x6D00,/* 0x88-0x8F */ + 0x6D02,0x6D03,0x6D05,0x6D06,0x6D08,0x6D09,0x6D0A,0x6D0D,/* 0x90-0x97 */ + 0x6D0F,0x6D10,0x6D11,0x6D13,0x6D14,0x6D15,0x6D16,0x6D18,/* 0x98-0x9F */ + 0x6D1C,0x6D1D,0x6D1F,0x6D20,0x6D21,0x6D22,0x6D23,0x6D24,/* 0xA0-0xA7 */ + 0x6D26,0x6D28,0x6D29,0x6D2C,0x6D2D,0x6D2F,0x6D30,0x6D34,/* 0xA8-0xAF */ + 0x6D36,0x6D37,0x6D38,0x6D3A,0x6D3F,0x6D40,0x6D42,0x6D44,/* 0xB0-0xB7 */ + 0x6D49,0x6D4C,0x6D50,0x6D55,0x6D56,0x6D57,0x6D58,0x6D5B,/* 0xB8-0xBF */ + 0x6D5D,0x6D5F,0x6D61,0x6D62,0x6D64,0x6D65,0x6D67,0x6D68,/* 0xC0-0xC7 */ + 0x6D6B,0x6D6C,0x6D6D,0x6D70,0x6D71,0x6D72,0x6D73,0x6D75,/* 0xC8-0xCF */ + 0x6D76,0x6D79,0x6D7A,0x6D7B,0x6D7D,0x6D7E,0x6D7F,0x6D80,/* 0xD0-0xD7 */ + 0x6D81,0x6D83,0x6D84,0x6D86,0x6D87,0x6D8A,0x6D8B,0x6D8D,/* 0xD8-0xDF */ + 0x6D8F,0x6D90,0x6D92,0x6D96,0x6D97,0x6D98,0x6D99,0x6D9A,/* 0xE0-0xE7 */ + 0x6D9C,0x6DA2,0x6DA5,0x6DAC,0x6DAD,0x6DB0,0x6DB1,0x6DB3,/* 0xE8-0xEF */ + 0x6DB4,0x6DB6,0x6DB7,0x6DB9,0x6DBA,0x6DBB,0x6DBC,0x6DBD,/* 0xF0-0xF7 */ + 0x6DBE,0x6DC1,0x6DC2,0x6DC3,0x6DC8,0x6DC9,0x6DCA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6DCD,0x6DCE,0x6DCF,0x6DD0,0x6DD2,0x6DD3,0x6DD4,0x6DD5,/* 0x40-0x47 */ + 0x6DD7,0x6DDA,0x6DDB,0x6DDC,0x6DDF,0x6DE2,0x6DE3,0x6DE5,/* 0x48-0x4F */ + 0x6DE7,0x6DE8,0x6DE9,0x6DEA,0x6DED,0x6DEF,0x6DF0,0x6DF2,/* 0x50-0x57 */ + 0x6DF4,0x6DF5,0x6DF6,0x6DF8,0x6DFA,0x6DFD,0x6DFE,0x6DFF,/* 0x58-0x5F */ + 0x6E00,0x6E01,0x6E02,0x6E03,0x6E04,0x6E06,0x6E07,0x6E08,/* 0x60-0x67 */ + 0x6E09,0x6E0B,0x6E0F,0x6E12,0x6E13,0x6E15,0x6E18,0x6E19,/* 0x68-0x6F */ + 0x6E1B,0x6E1C,0x6E1E,0x6E1F,0x6E22,0x6E26,0x6E27,0x6E28,/* 0x70-0x77 */ + 0x6E2A,0x6E2C,0x6E2E,0x6E30,0x6E31,0x6E33,0x6E35,0x0000,/* 0x78-0x7F */ + + 0x6E36,0x6E37,0x6E39,0x6E3B,0x6E3C,0x6E3D,0x6E3E,0x6E3F,/* 0x80-0x87 */ + 0x6E40,0x6E41,0x6E42,0x6E45,0x6E46,0x6E47,0x6E48,0x6E49,/* 0x88-0x8F */ + 0x6E4A,0x6E4B,0x6E4C,0x6E4F,0x6E50,0x6E51,0x6E52,0x6E55,/* 0x90-0x97 */ + 0x6E57,0x6E59,0x6E5A,0x6E5C,0x6E5D,0x6E5E,0x6E60,0x6E61,/* 0x98-0x9F */ + 0x6E62,0x6E63,0x6E64,0x6E65,0x6E66,0x6E67,0x6E68,0x6E69,/* 0xA0-0xA7 */ + 0x6E6A,0x6E6C,0x6E6D,0x6E6F,0x6E70,0x6E71,0x6E72,0x6E73,/* 0xA8-0xAF */ + 0x6E74,0x6E75,0x6E76,0x6E77,0x6E78,0x6E79,0x6E7A,0x6E7B,/* 0xB0-0xB7 */ + 0x6E7C,0x6E7D,0x6E80,0x6E81,0x6E82,0x6E84,0x6E87,0x6E88,/* 0xB8-0xBF */ + 0x6E8A,0x6E8B,0x6E8C,0x6E8D,0x6E8E,0x6E91,0x6E92,0x6E93,/* 0xC0-0xC7 */ + 0x6E94,0x6E95,0x6E96,0x6E97,0x6E99,0x6E9A,0x6E9B,0x6E9D,/* 0xC8-0xCF */ + 0x6E9E,0x6EA0,0x6EA1,0x6EA3,0x6EA4,0x6EA6,0x6EA8,0x6EA9,/* 0xD0-0xD7 */ + 0x6EAB,0x6EAC,0x6EAD,0x6EAE,0x6EB0,0x6EB3,0x6EB5,0x6EB8,/* 0xD8-0xDF */ + 0x6EB9,0x6EBC,0x6EBE,0x6EBF,0x6EC0,0x6EC3,0x6EC4,0x6EC5,/* 0xE0-0xE7 */ + 0x6EC6,0x6EC8,0x6EC9,0x6ECA,0x6ECC,0x6ECD,0x6ECE,0x6ED0,/* 0xE8-0xEF */ + 0x6ED2,0x6ED6,0x6ED8,0x6ED9,0x6EDB,0x6EDC,0x6EDD,0x6EE3,/* 0xF0-0xF7 */ + 0x6EE7,0x6EEA,0x6EEB,0x6EEC,0x6EED,0x6EEE,0x6EEF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6EF0,0x6EF1,0x6EF2,0x6EF3,0x6EF5,0x6EF6,0x6EF7,0x6EF8,/* 0x40-0x47 */ + 0x6EFA,0x6EFB,0x6EFC,0x6EFD,0x6EFE,0x6EFF,0x6F00,0x6F01,/* 0x48-0x4F */ + 0x6F03,0x6F04,0x6F05,0x6F07,0x6F08,0x6F0A,0x6F0B,0x6F0C,/* 0x50-0x57 */ + 0x6F0D,0x6F0E,0x6F10,0x6F11,0x6F12,0x6F16,0x6F17,0x6F18,/* 0x58-0x5F */ + 0x6F19,0x6F1A,0x6F1B,0x6F1C,0x6F1D,0x6F1E,0x6F1F,0x6F21,/* 0x60-0x67 */ + 0x6F22,0x6F23,0x6F25,0x6F26,0x6F27,0x6F28,0x6F2C,0x6F2E,/* 0x68-0x6F */ + 0x6F30,0x6F32,0x6F34,0x6F35,0x6F37,0x6F38,0x6F39,0x6F3A,/* 0x70-0x77 */ + 0x6F3B,0x6F3C,0x6F3D,0x6F3F,0x6F40,0x6F41,0x6F42,0x0000,/* 0x78-0x7F */ + + 0x6F43,0x6F44,0x6F45,0x6F48,0x6F49,0x6F4A,0x6F4C,0x6F4E,/* 0x80-0x87 */ + 0x6F4F,0x6F50,0x6F51,0x6F52,0x6F53,0x6F54,0x6F55,0x6F56,/* 0x88-0x8F */ + 0x6F57,0x6F59,0x6F5A,0x6F5B,0x6F5D,0x6F5F,0x6F60,0x6F61,/* 0x90-0x97 */ + 0x6F63,0x6F64,0x6F65,0x6F67,0x6F68,0x6F69,0x6F6A,0x6F6B,/* 0x98-0x9F */ + 0x6F6C,0x6F6F,0x6F70,0x6F71,0x6F73,0x6F75,0x6F76,0x6F77,/* 0xA0-0xA7 */ + 0x6F79,0x6F7B,0x6F7D,0x6F7E,0x6F7F,0x6F80,0x6F81,0x6F82,/* 0xA8-0xAF */ + 0x6F83,0x6F85,0x6F86,0x6F87,0x6F8A,0x6F8B,0x6F8F,0x6F90,/* 0xB0-0xB7 */ + 0x6F91,0x6F92,0x6F93,0x6F94,0x6F95,0x6F96,0x6F97,0x6F98,/* 0xB8-0xBF */ + 0x6F99,0x6F9A,0x6F9B,0x6F9D,0x6F9E,0x6F9F,0x6FA0,0x6FA2,/* 0xC0-0xC7 */ + 0x6FA3,0x6FA4,0x6FA5,0x6FA6,0x6FA8,0x6FA9,0x6FAA,0x6FAB,/* 0xC8-0xCF */ + 0x6FAC,0x6FAD,0x6FAE,0x6FAF,0x6FB0,0x6FB1,0x6FB2,0x6FB4,/* 0xD0-0xD7 */ + 0x6FB5,0x6FB7,0x6FB8,0x6FBA,0x6FBB,0x6FBC,0x6FBD,0x6FBE,/* 0xD8-0xDF */ + 0x6FBF,0x6FC1,0x6FC3,0x6FC4,0x6FC5,0x6FC6,0x6FC7,0x6FC8,/* 0xE0-0xE7 */ + 0x6FCA,0x6FCB,0x6FCC,0x6FCD,0x6FCE,0x6FCF,0x6FD0,0x6FD3,/* 0xE8-0xEF */ + 0x6FD4,0x6FD5,0x6FD6,0x6FD7,0x6FD8,0x6FD9,0x6FDA,0x6FDB,/* 0xF0-0xF7 */ + 0x6FDC,0x6FDD,0x6FDF,0x6FE2,0x6FE3,0x6FE4,0x6FE5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FE6,0x6FE7,0x6FE8,0x6FE9,0x6FEA,0x6FEB,0x6FEC,0x6FED,/* 0x40-0x47 */ + 0x6FF0,0x6FF1,0x6FF2,0x6FF3,0x6FF4,0x6FF5,0x6FF6,0x6FF7,/* 0x48-0x4F */ + 0x6FF8,0x6FF9,0x6FFA,0x6FFB,0x6FFC,0x6FFD,0x6FFE,0x6FFF,/* 0x50-0x57 */ + 0x7000,0x7001,0x7002,0x7003,0x7004,0x7005,0x7006,0x7007,/* 0x58-0x5F */ + 0x7008,0x7009,0x700A,0x700B,0x700C,0x700D,0x700E,0x700F,/* 0x60-0x67 */ + 0x7010,0x7012,0x7013,0x7014,0x7015,0x7016,0x7017,0x7018,/* 0x68-0x6F */ + 0x7019,0x701C,0x701D,0x701E,0x701F,0x7020,0x7021,0x7022,/* 0x70-0x77 */ + 0x7024,0x7025,0x7026,0x7027,0x7028,0x7029,0x702A,0x0000,/* 0x78-0x7F */ + + 0x702B,0x702C,0x702D,0x702E,0x702F,0x7030,0x7031,0x7032,/* 0x80-0x87 */ + 0x7033,0x7034,0x7036,0x7037,0x7038,0x703A,0x703B,0x703C,/* 0x88-0x8F */ + 0x703D,0x703E,0x703F,0x7040,0x7041,0x7042,0x7043,0x7044,/* 0x90-0x97 */ + 0x7045,0x7046,0x7047,0x7048,0x7049,0x704A,0x704B,0x704D,/* 0x98-0x9F */ + 0x704E,0x7050,0x7051,0x7052,0x7053,0x7054,0x7055,0x7056,/* 0xA0-0xA7 */ + 0x7057,0x7058,0x7059,0x705A,0x705B,0x705C,0x705D,0x705F,/* 0xA8-0xAF */ + 0x7060,0x7061,0x7062,0x7063,0x7064,0x7065,0x7066,0x7067,/* 0xB0-0xB7 */ + 0x7068,0x7069,0x706A,0x706E,0x7071,0x7072,0x7073,0x7074,/* 0xB8-0xBF */ + 0x7077,0x7079,0x707A,0x707B,0x707D,0x7081,0x7082,0x7083,/* 0xC0-0xC7 */ + 0x7084,0x7086,0x7087,0x7088,0x708B,0x708C,0x708D,0x708F,/* 0xC8-0xCF */ + 0x7090,0x7091,0x7093,0x7097,0x7098,0x709A,0x709B,0x709E,/* 0xD0-0xD7 */ + 0x709F,0x70A0,0x70A1,0x70A2,0x70A3,0x70A4,0x70A5,0x70A6,/* 0xD8-0xDF */ + 0x70A7,0x70A8,0x70A9,0x70AA,0x70B0,0x70B2,0x70B4,0x70B5,/* 0xE0-0xE7 */ + 0x70B6,0x70BA,0x70BE,0x70BF,0x70C4,0x70C5,0x70C6,0x70C7,/* 0xE8-0xEF */ + 0x70C9,0x70CB,0x70CC,0x70CD,0x70CE,0x70CF,0x70D0,0x70D1,/* 0xF0-0xF7 */ + 0x70D2,0x70D3,0x70D4,0x70D5,0x70D6,0x70D7,0x70DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x70DC,0x70DD,0x70DE,0x70E0,0x70E1,0x70E2,0x70E3,0x70E5,/* 0x40-0x47 */ + 0x70EA,0x70EE,0x70F0,0x70F1,0x70F2,0x70F3,0x70F4,0x70F5,/* 0x48-0x4F */ + 0x70F6,0x70F8,0x70FA,0x70FB,0x70FC,0x70FE,0x70FF,0x7100,/* 0x50-0x57 */ + 0x7101,0x7102,0x7103,0x7104,0x7105,0x7106,0x7107,0x7108,/* 0x58-0x5F */ + 0x710B,0x710C,0x710D,0x710E,0x710F,0x7111,0x7112,0x7114,/* 0x60-0x67 */ + 0x7117,0x711B,0x711C,0x711D,0x711E,0x711F,0x7120,0x7121,/* 0x68-0x6F */ + 0x7122,0x7123,0x7124,0x7125,0x7127,0x7128,0x7129,0x712A,/* 0x70-0x77 */ + 0x712B,0x712C,0x712D,0x712E,0x7132,0x7133,0x7134,0x0000,/* 0x78-0x7F */ + + 0x7135,0x7137,0x7138,0x7139,0x713A,0x713B,0x713C,0x713D,/* 0x80-0x87 */ + 0x713E,0x713F,0x7140,0x7141,0x7142,0x7143,0x7144,0x7146,/* 0x88-0x8F */ + 0x7147,0x7148,0x7149,0x714B,0x714D,0x714F,0x7150,0x7151,/* 0x90-0x97 */ + 0x7152,0x7153,0x7154,0x7155,0x7156,0x7157,0x7158,0x7159,/* 0x98-0x9F */ + 0x715A,0x715B,0x715D,0x715F,0x7160,0x7161,0x7162,0x7163,/* 0xA0-0xA7 */ + 0x7165,0x7169,0x716A,0x716B,0x716C,0x716D,0x716F,0x7170,/* 0xA8-0xAF */ + 0x7171,0x7174,0x7175,0x7176,0x7177,0x7179,0x717B,0x717C,/* 0xB0-0xB7 */ + 0x717E,0x717F,0x7180,0x7181,0x7182,0x7183,0x7185,0x7186,/* 0xB8-0xBF */ + 0x7187,0x7188,0x7189,0x718B,0x718C,0x718D,0x718E,0x7190,/* 0xC0-0xC7 */ + 0x7191,0x7192,0x7193,0x7195,0x7196,0x7197,0x719A,0x719B,/* 0xC8-0xCF */ + 0x719C,0x719D,0x719E,0x71A1,0x71A2,0x71A3,0x71A4,0x71A5,/* 0xD0-0xD7 */ + 0x71A6,0x71A7,0x71A9,0x71AA,0x71AB,0x71AD,0x71AE,0x71AF,/* 0xD8-0xDF */ + 0x71B0,0x71B1,0x71B2,0x71B4,0x71B6,0x71B7,0x71B8,0x71BA,/* 0xE0-0xE7 */ + 0x71BB,0x71BC,0x71BD,0x71BE,0x71BF,0x71C0,0x71C1,0x71C2,/* 0xE8-0xEF */ + 0x71C4,0x71C5,0x71C6,0x71C7,0x71C8,0x71C9,0x71CA,0x71CB,/* 0xF0-0xF7 */ + 0x71CC,0x71CD,0x71CF,0x71D0,0x71D1,0x71D2,0x71D3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x71D6,0x71D7,0x71D8,0x71D9,0x71DA,0x71DB,0x71DC,0x71DD,/* 0x40-0x47 */ + 0x71DE,0x71DF,0x71E1,0x71E2,0x71E3,0x71E4,0x71E6,0x71E8,/* 0x48-0x4F */ + 0x71E9,0x71EA,0x71EB,0x71EC,0x71ED,0x71EF,0x71F0,0x71F1,/* 0x50-0x57 */ + 0x71F2,0x71F3,0x71F4,0x71F5,0x71F6,0x71F7,0x71F8,0x71FA,/* 0x58-0x5F */ + 0x71FB,0x71FC,0x71FD,0x71FE,0x71FF,0x7200,0x7201,0x7202,/* 0x60-0x67 */ + 0x7203,0x7204,0x7205,0x7207,0x7208,0x7209,0x720A,0x720B,/* 0x68-0x6F */ + 0x720C,0x720D,0x720E,0x720F,0x7210,0x7211,0x7212,0x7213,/* 0x70-0x77 */ + 0x7214,0x7215,0x7216,0x7217,0x7218,0x7219,0x721A,0x0000,/* 0x78-0x7F */ + + 0x721B,0x721C,0x721E,0x721F,0x7220,0x7221,0x7222,0x7223,/* 0x80-0x87 */ + 0x7224,0x7225,0x7226,0x7227,0x7229,0x722B,0x722D,0x722E,/* 0x88-0x8F */ + 0x722F,0x7232,0x7233,0x7234,0x723A,0x723C,0x723E,0x7240,/* 0x90-0x97 */ + 0x7241,0x7242,0x7243,0x7244,0x7245,0x7246,0x7249,0x724A,/* 0x98-0x9F */ + 0x724B,0x724E,0x724F,0x7250,0x7251,0x7253,0x7254,0x7255,/* 0xA0-0xA7 */ + 0x7257,0x7258,0x725A,0x725C,0x725E,0x7260,0x7263,0x7264,/* 0xA8-0xAF */ + 0x7265,0x7268,0x726A,0x726B,0x726C,0x726D,0x7270,0x7271,/* 0xB0-0xB7 */ + 0x7273,0x7274,0x7276,0x7277,0x7278,0x727B,0x727C,0x727D,/* 0xB8-0xBF */ + 0x7282,0x7283,0x7285,0x7286,0x7287,0x7288,0x7289,0x728C,/* 0xC0-0xC7 */ + 0x728E,0x7290,0x7291,0x7293,0x7294,0x7295,0x7296,0x7297,/* 0xC8-0xCF */ + 0x7298,0x7299,0x729A,0x729B,0x729C,0x729D,0x729E,0x72A0,/* 0xD0-0xD7 */ + 0x72A1,0x72A2,0x72A3,0x72A4,0x72A5,0x72A6,0x72A7,0x72A8,/* 0xD8-0xDF */ + 0x72A9,0x72AA,0x72AB,0x72AE,0x72B1,0x72B2,0x72B3,0x72B5,/* 0xE0-0xE7 */ + 0x72BA,0x72BB,0x72BC,0x72BD,0x72BE,0x72BF,0x72C0,0x72C5,/* 0xE8-0xEF */ + 0x72C6,0x72C7,0x72C9,0x72CA,0x72CB,0x72CC,0x72CF,0x72D1,/* 0xF0-0xF7 */ + 0x72D3,0x72D4,0x72D5,0x72D6,0x72D8,0x72DA,0x72DB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3000,0x3001,0x3002,0x00B7,0x02C9,0x02C7,0x00A8,/* 0xA0-0xA7 */ + 0x3003,0x3005,0x2014,0xFF5E,0x2016,0x2026,0x2018,0x2019,/* 0xA8-0xAF */ + 0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */ + 0x300C,0x300D,0x300E,0x300F,0x3016,0x3017,0x3010,0x3011,/* 0xB8-0xBF */ + 0x00B1,0x00D7,0x00F7,0x2236,0x2227,0x2228,0x2211,0x220F,/* 0xC0-0xC7 */ + 0x222A,0x2229,0x2208,0x2237,0x221A,0x22A5,0x2225,0x2220,/* 0xC8-0xCF */ + 0x2312,0x2299,0x222B,0x222E,0x2261,0x224C,0x2248,0x223D,/* 0xD0-0xD7 */ + 0x221D,0x2260,0x226E,0x226F,0x2264,0x2265,0x221E,0x2235,/* 0xD8-0xDF */ + 0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFF04,/* 0xE0-0xE7 */ + 0x00A4,0xFFE0,0xFFE1,0x2030,0x00A7,0x2116,0x2606,0x2605,/* 0xE8-0xEF */ + 0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,0x25A1,0x25A0,0x25B3,/* 0xF0-0xF7 */ + 0x25B2,0x203B,0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */ + 0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x0000,0x2488,0x2489,0x248A,0x248B,0x248C,0x248D,0x248E,/* 0xB0-0xB7 */ + 0x248F,0x2490,0x2491,0x2492,0x2493,0x2494,0x2495,0x2496,/* 0xB8-0xBF */ + 0x2497,0x2498,0x2499,0x249A,0x249B,0x2474,0x2475,0x2476,/* 0xC0-0xC7 */ + 0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,0x247D,0x247E,/* 0xC8-0xCF */ + 0x247F,0x2480,0x2481,0x2482,0x2483,0x2484,0x2485,0x2486,/* 0xD0-0xD7 */ + 0x2487,0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,/* 0xD8-0xDF */ + 0x2467,0x2468,0x2469,0x0000,0x0000,0x3220,0x3221,0x3222,/* 0xE0-0xE7 */ + 0x3223,0x3224,0x3225,0x3226,0x3227,0x3228,0x3229,0x0000,/* 0xE8-0xEF */ + 0x0000,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xF0-0xF7 */ + 0x2167,0x2168,0x2169,0x216A,0x216B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFF01,0xFF02,0xFF03,0xFFE5,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */ + 0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */ + 0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */ + 0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */ + 0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */ + 0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */ + 0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */ + 0xFF38,0xFF39,0xFF3A,0xFF3B,0xFF3C,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */ + 0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */ + 0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */ + 0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */ + 0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */ + 0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */ + 0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */ + 0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */ + 0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */ + 0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */ + 0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */ + 0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */ + 0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */ + 0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */ + 0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */ + 0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */ + 0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */ + 0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */ + 0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */ + 0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */ + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xA0-0xA7 */ + 0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xA8-0xAF */ + 0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xB0-0xB7 */ + 0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */ + 0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xC0-0xC7 */ + 0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xC8-0xCF */ + 0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xD0-0xD7 */ + 0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0xFE35,0xFE36,0xFE39,0xFE3A,0xFE3F,0xFE40,0xFE3D,0xFE3E,/* 0xE0-0xE7 */ + 0xFE41,0xFE42,0xFE43,0xFE44,0x0000,0x0000,0xFE3B,0xFE3C,/* 0xE8-0xEF */ + 0xFE37,0xFE38,0xFE31,0x0000,0xFE33,0xFE34,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */ + 0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */ + 0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */ + 0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */ + 0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */ + 0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */ + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x02CA,0x02CB,0x02D9,0x2013,0x2015,0x2025,0x2035,0x2105,/* 0x40-0x47 */ + 0x2109,0x2196,0x2197,0x2198,0x2199,0x2215,0x221F,0x2223,/* 0x48-0x4F */ + 0x2252,0x2266,0x2267,0x22BF,0x2550,0x2551,0x2552,0x2553,/* 0x50-0x57 */ + 0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255A,0x255B,/* 0x58-0x5F */ + 0x255C,0x255D,0x255E,0x255F,0x2560,0x2561,0x2562,0x2563,/* 0x60-0x67 */ + 0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256A,0x256B,/* 0x68-0x6F */ + 0x256C,0x256D,0x256E,0x256F,0x2570,0x2571,0x2572,0x2573,/* 0x70-0x77 */ + 0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,0x2587,0x0000,/* 0x78-0x7F */ + + 0x2588,0x2589,0x258A,0x258B,0x258C,0x258D,0x258E,0x258F,/* 0x80-0x87 */ + 0x2593,0x2594,0x2595,0x25BC,0x25BD,0x25E2,0x25E3,0x25E4,/* 0x88-0x8F */ + 0x25E5,0x2609,0x2295,0x3012,0x301D,0x301E,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0101,0x00E1,0x01CE,0x00E0,0x0113,0x00E9,0x011B,/* 0xA0-0xA7 */ + 0x00E8,0x012B,0x00ED,0x01D0,0x00EC,0x014D,0x00F3,0x01D2,/* 0xA8-0xAF */ + 0x00F2,0x016B,0x00FA,0x01D4,0x00F9,0x01D6,0x01D8,0x01DA,/* 0xB0-0xB7 */ + 0x01DC,0x00FC,0x00EA,0x0251,0x0000,0x0144,0x0148,0x0000,/* 0xB8-0xBF */ + 0x0261,0x0000,0x0000,0x0000,0x0000,0x3105,0x3106,0x3107,/* 0xC0-0xC7 */ + 0x3108,0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,/* 0xC8-0xCF */ + 0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,0x3117,/* 0xD0-0xD7 */ + 0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,0x311F,/* 0xD8-0xDF */ + 0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,0x3127,/* 0xE0-0xE7 */ + 0x3128,0x3129,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3021,0x3022,0x3023,0x3024,0x3025,0x3026,0x3027,0x3028,/* 0x40-0x47 */ + 0x3029,0x32A3,0x338E,0x338F,0x339C,0x339D,0x339E,0x33A1,/* 0x48-0x4F */ + 0x33C4,0x33CE,0x33D1,0x33D2,0x33D5,0xFE30,0xFFE2,0xFFE4,/* 0x50-0x57 */ + 0x0000,0x2121,0x3231,0x0000,0x2010,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x30FC,0x309B,0x309C,0x30FD,0x30FE,0x3006,0x309D,0x309E,/* 0x60-0x67 */ + 0xFE49,0xFE4A,0xFE4B,0xFE4C,0xFE4D,0xFE4E,0xFE4F,0xFE50,/* 0x68-0x6F */ + 0xFE51,0xFE52,0xFE54,0xFE55,0xFE56,0xFE57,0xFE59,0xFE5A,/* 0x70-0x77 */ + 0xFE5B,0xFE5C,0xFE5D,0xFE5E,0xFE5F,0xFE60,0xFE61,0x0000,/* 0x78-0x7F */ + + 0xFE62,0xFE63,0xFE64,0xFE65,0xFE66,0xFE68,0xFE69,0xFE6A,/* 0x80-0x87 */ + 0xFE6B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x3007,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0000,0x0000,0x0000,0x2500,0x2501,0x2502,0x2503,/* 0xA0-0xA7 */ + 0x2504,0x2505,0x2506,0x2507,0x2508,0x2509,0x250A,0x250B,/* 0xA8-0xAF */ + 0x250C,0x250D,0x250E,0x250F,0x2510,0x2511,0x2512,0x2513,/* 0xB0-0xB7 */ + 0x2514,0x2515,0x2516,0x2517,0x2518,0x2519,0x251A,0x251B,/* 0xB8-0xBF */ + 0x251C,0x251D,0x251E,0x251F,0x2520,0x2521,0x2522,0x2523,/* 0xC0-0xC7 */ + 0x2524,0x2525,0x2526,0x2527,0x2528,0x2529,0x252A,0x252B,/* 0xC8-0xCF */ + 0x252C,0x252D,0x252E,0x252F,0x2530,0x2531,0x2532,0x2533,/* 0xD0-0xD7 */ + 0x2534,0x2535,0x2536,0x2537,0x2538,0x2539,0x253A,0x253B,/* 0xD8-0xDF */ + 0x253C,0x253D,0x253E,0x253F,0x2540,0x2541,0x2542,0x2543,/* 0xE0-0xE7 */ + 0x2544,0x2545,0x2546,0x2547,0x2548,0x2549,0x254A,0x254B,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x72DC,0x72DD,0x72DF,0x72E2,0x72E3,0x72E4,0x72E5,0x72E6,/* 0x40-0x47 */ + 0x72E7,0x72EA,0x72EB,0x72F5,0x72F6,0x72F9,0x72FD,0x72FE,/* 0x48-0x4F */ + 0x72FF,0x7300,0x7302,0x7304,0x7305,0x7306,0x7307,0x7308,/* 0x50-0x57 */ + 0x7309,0x730B,0x730C,0x730D,0x730F,0x7310,0x7311,0x7312,/* 0x58-0x5F */ + 0x7314,0x7318,0x7319,0x731A,0x731F,0x7320,0x7323,0x7324,/* 0x60-0x67 */ + 0x7326,0x7327,0x7328,0x732D,0x732F,0x7330,0x7332,0x7333,/* 0x68-0x6F */ + 0x7335,0x7336,0x733A,0x733B,0x733C,0x733D,0x7340,0x7341,/* 0x70-0x77 */ + 0x7342,0x7343,0x7344,0x7345,0x7346,0x7347,0x7348,0x0000,/* 0x78-0x7F */ + + 0x7349,0x734A,0x734B,0x734C,0x734E,0x734F,0x7351,0x7353,/* 0x80-0x87 */ + 0x7354,0x7355,0x7356,0x7358,0x7359,0x735A,0x735B,0x735C,/* 0x88-0x8F */ + 0x735D,0x735E,0x735F,0x7361,0x7362,0x7363,0x7364,0x7365,/* 0x90-0x97 */ + 0x7366,0x7367,0x7368,0x7369,0x736A,0x736B,0x736E,0x7370,/* 0x98-0x9F */ + 0x7371,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7372,0x7373,0x7374,0x7375,0x7376,0x7377,0x7378,0x7379,/* 0x40-0x47 */ + 0x737A,0x737B,0x737C,0x737D,0x737F,0x7380,0x7381,0x7382,/* 0x48-0x4F */ + 0x7383,0x7385,0x7386,0x7388,0x738A,0x738C,0x738D,0x738F,/* 0x50-0x57 */ + 0x7390,0x7392,0x7393,0x7394,0x7395,0x7397,0x7398,0x7399,/* 0x58-0x5F */ + 0x739A,0x739C,0x739D,0x739E,0x73A0,0x73A1,0x73A3,0x73A4,/* 0x60-0x67 */ + 0x73A5,0x73A6,0x73A7,0x73A8,0x73AA,0x73AC,0x73AD,0x73B1,/* 0x68-0x6F */ + 0x73B4,0x73B5,0x73B6,0x73B8,0x73B9,0x73BC,0x73BD,0x73BE,/* 0x70-0x77 */ + 0x73BF,0x73C1,0x73C3,0x73C4,0x73C5,0x73C6,0x73C7,0x0000,/* 0x78-0x7F */ + + 0x73CB,0x73CC,0x73CE,0x73D2,0x73D3,0x73D4,0x73D5,0x73D6,/* 0x80-0x87 */ + 0x73D7,0x73D8,0x73DA,0x73DB,0x73DC,0x73DD,0x73DF,0x73E1,/* 0x88-0x8F */ + 0x73E2,0x73E3,0x73E4,0x73E6,0x73E8,0x73EA,0x73EB,0x73EC,/* 0x90-0x97 */ + 0x73EE,0x73EF,0x73F0,0x73F1,0x73F3,0x73F4,0x73F5,0x73F6,/* 0x98-0x9F */ + 0x73F7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x73F8,0x73F9,0x73FA,0x73FB,0x73FC,0x73FD,0x73FE,0x73FF,/* 0x40-0x47 */ + 0x7400,0x7401,0x7402,0x7404,0x7407,0x7408,0x740B,0x740C,/* 0x48-0x4F */ + 0x740D,0x740E,0x7411,0x7412,0x7413,0x7414,0x7415,0x7416,/* 0x50-0x57 */ + 0x7417,0x7418,0x7419,0x741C,0x741D,0x741E,0x741F,0x7420,/* 0x58-0x5F */ + 0x7421,0x7423,0x7424,0x7427,0x7429,0x742B,0x742D,0x742F,/* 0x60-0x67 */ + 0x7431,0x7432,0x7437,0x7438,0x7439,0x743A,0x743B,0x743D,/* 0x68-0x6F */ + 0x743E,0x743F,0x7440,0x7442,0x7443,0x7444,0x7445,0x7446,/* 0x70-0x77 */ + 0x7447,0x7448,0x7449,0x744A,0x744B,0x744C,0x744D,0x0000,/* 0x78-0x7F */ + + 0x744E,0x744F,0x7450,0x7451,0x7452,0x7453,0x7454,0x7456,/* 0x80-0x87 */ + 0x7458,0x745D,0x7460,0x7461,0x7462,0x7463,0x7464,0x7465,/* 0x88-0x8F */ + 0x7466,0x7467,0x7468,0x7469,0x746A,0x746B,0x746C,0x746E,/* 0x90-0x97 */ + 0x746F,0x7471,0x7472,0x7473,0x7474,0x7475,0x7478,0x7479,/* 0x98-0x9F */ + 0x747A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x747B,0x747C,0x747D,0x747F,0x7482,0x7484,0x7485,0x7486,/* 0x40-0x47 */ + 0x7488,0x7489,0x748A,0x748C,0x748D,0x748F,0x7491,0x7492,/* 0x48-0x4F */ + 0x7493,0x7494,0x7495,0x7496,0x7497,0x7498,0x7499,0x749A,/* 0x50-0x57 */ + 0x749B,0x749D,0x749F,0x74A0,0x74A1,0x74A2,0x74A3,0x74A4,/* 0x58-0x5F */ + 0x74A5,0x74A6,0x74AA,0x74AB,0x74AC,0x74AD,0x74AE,0x74AF,/* 0x60-0x67 */ + 0x74B0,0x74B1,0x74B2,0x74B3,0x74B4,0x74B5,0x74B6,0x74B7,/* 0x68-0x6F */ + 0x74B8,0x74B9,0x74BB,0x74BC,0x74BD,0x74BE,0x74BF,0x74C0,/* 0x70-0x77 */ + 0x74C1,0x74C2,0x74C3,0x74C4,0x74C5,0x74C6,0x74C7,0x0000,/* 0x78-0x7F */ + + 0x74C8,0x74C9,0x74CA,0x74CB,0x74CC,0x74CD,0x74CE,0x74CF,/* 0x80-0x87 */ + 0x74D0,0x74D1,0x74D3,0x74D4,0x74D5,0x74D6,0x74D7,0x74D8,/* 0x88-0x8F */ + 0x74D9,0x74DA,0x74DB,0x74DD,0x74DF,0x74E1,0x74E5,0x74E7,/* 0x90-0x97 */ + 0x74E8,0x74E9,0x74EA,0x74EB,0x74EC,0x74ED,0x74F0,0x74F1,/* 0x98-0x9F */ + 0x74F2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74F3,0x74F5,0x74F8,0x74F9,0x74FA,0x74FB,0x74FC,0x74FD,/* 0x40-0x47 */ + 0x74FE,0x7500,0x7501,0x7502,0x7503,0x7505,0x7506,0x7507,/* 0x48-0x4F */ + 0x7508,0x7509,0x750A,0x750B,0x750C,0x750E,0x7510,0x7512,/* 0x50-0x57 */ + 0x7514,0x7515,0x7516,0x7517,0x751B,0x751D,0x751E,0x7520,/* 0x58-0x5F */ + 0x7521,0x7522,0x7523,0x7524,0x7526,0x7527,0x752A,0x752E,/* 0x60-0x67 */ + 0x7534,0x7536,0x7539,0x753C,0x753D,0x753F,0x7541,0x7542,/* 0x68-0x6F */ + 0x7543,0x7544,0x7546,0x7547,0x7549,0x754A,0x754D,0x7550,/* 0x70-0x77 */ + 0x7551,0x7552,0x7553,0x7555,0x7556,0x7557,0x7558,0x0000,/* 0x78-0x7F */ + + 0x755D,0x755E,0x755F,0x7560,0x7561,0x7562,0x7563,0x7564,/* 0x80-0x87 */ + 0x7567,0x7568,0x7569,0x756B,0x756C,0x756D,0x756E,0x756F,/* 0x88-0x8F */ + 0x7570,0x7571,0x7573,0x7575,0x7576,0x7577,0x757A,0x757B,/* 0x90-0x97 */ + 0x757C,0x757D,0x757E,0x7580,0x7581,0x7582,0x7584,0x7585,/* 0x98-0x9F */ + 0x7587,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7588,0x7589,0x758A,0x758C,0x758D,0x758E,0x7590,0x7593,/* 0x40-0x47 */ + 0x7595,0x7598,0x759B,0x759C,0x759E,0x75A2,0x75A6,0x75A7,/* 0x48-0x4F */ + 0x75A8,0x75A9,0x75AA,0x75AD,0x75B6,0x75B7,0x75BA,0x75BB,/* 0x50-0x57 */ + 0x75BF,0x75C0,0x75C1,0x75C6,0x75CB,0x75CC,0x75CE,0x75CF,/* 0x58-0x5F */ + 0x75D0,0x75D1,0x75D3,0x75D7,0x75D9,0x75DA,0x75DC,0x75DD,/* 0x60-0x67 */ + 0x75DF,0x75E0,0x75E1,0x75E5,0x75E9,0x75EC,0x75ED,0x75EE,/* 0x68-0x6F */ + 0x75EF,0x75F2,0x75F3,0x75F5,0x75F6,0x75F7,0x75F8,0x75FA,/* 0x70-0x77 */ + 0x75FB,0x75FD,0x75FE,0x7602,0x7604,0x7606,0x7607,0x0000,/* 0x78-0x7F */ + + 0x7608,0x7609,0x760B,0x760D,0x760E,0x760F,0x7611,0x7612,/* 0x80-0x87 */ + 0x7613,0x7614,0x7616,0x761A,0x761C,0x761D,0x761E,0x7621,/* 0x88-0x8F */ + 0x7623,0x7627,0x7628,0x762C,0x762E,0x762F,0x7631,0x7632,/* 0x90-0x97 */ + 0x7636,0x7637,0x7639,0x763A,0x763B,0x763D,0x7641,0x7642,/* 0x98-0x9F */ + 0x7644,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7645,0x7646,0x7647,0x7648,0x7649,0x764A,0x764B,0x764E,/* 0x40-0x47 */ + 0x764F,0x7650,0x7651,0x7652,0x7653,0x7655,0x7657,0x7658,/* 0x48-0x4F */ + 0x7659,0x765A,0x765B,0x765D,0x765F,0x7660,0x7661,0x7662,/* 0x50-0x57 */ + 0x7664,0x7665,0x7666,0x7667,0x7668,0x7669,0x766A,0x766C,/* 0x58-0x5F */ + 0x766D,0x766E,0x7670,0x7671,0x7672,0x7673,0x7674,0x7675,/* 0x60-0x67 */ + 0x7676,0x7677,0x7679,0x767A,0x767C,0x767F,0x7680,0x7681,/* 0x68-0x6F */ + 0x7683,0x7685,0x7689,0x768A,0x768C,0x768D,0x768F,0x7690,/* 0x70-0x77 */ + 0x7692,0x7694,0x7695,0x7697,0x7698,0x769A,0x769B,0x0000,/* 0x78-0x7F */ + + 0x769C,0x769D,0x769E,0x769F,0x76A0,0x76A1,0x76A2,0x76A3,/* 0x80-0x87 */ + 0x76A5,0x76A6,0x76A7,0x76A8,0x76A9,0x76AA,0x76AB,0x76AC,/* 0x88-0x8F */ + 0x76AD,0x76AF,0x76B0,0x76B3,0x76B5,0x76B6,0x76B7,0x76B8,/* 0x90-0x97 */ + 0x76B9,0x76BA,0x76BB,0x76BC,0x76BD,0x76BE,0x76C0,0x76C1,/* 0x98-0x9F */ + 0x76C3,0x554A,0x963F,0x57C3,0x6328,0x54CE,0x5509,0x54C0,/* 0xA0-0xA7 */ + 0x7691,0x764C,0x853C,0x77EE,0x827E,0x788D,0x7231,0x9698,/* 0xA8-0xAF */ + 0x978D,0x6C28,0x5B89,0x4FFA,0x6309,0x6697,0x5CB8,0x80FA,/* 0xB0-0xB7 */ + 0x6848,0x80AE,0x6602,0x76CE,0x51F9,0x6556,0x71AC,0x7FF1,/* 0xB8-0xBF */ + 0x8884,0x50B2,0x5965,0x61CA,0x6FB3,0x82AD,0x634C,0x6252,/* 0xC0-0xC7 */ + 0x53ED,0x5427,0x7B06,0x516B,0x75A4,0x5DF4,0x62D4,0x8DCB,/* 0xC8-0xCF */ + 0x9776,0x628A,0x8019,0x575D,0x9738,0x7F62,0x7238,0x767D,/* 0xD0-0xD7 */ + 0x67CF,0x767E,0x6446,0x4F70,0x8D25,0x62DC,0x7A17,0x6591,/* 0xD8-0xDF */ + 0x73ED,0x642C,0x6273,0x822C,0x9881,0x677F,0x7248,0x626E,/* 0xE0-0xE7 */ + 0x62CC,0x4F34,0x74E3,0x534A,0x529E,0x7ECA,0x90A6,0x5E2E,/* 0xE8-0xEF */ + 0x6886,0x699C,0x8180,0x7ED1,0x68D2,0x78C5,0x868C,0x9551,/* 0xF0-0xF7 */ + 0x508D,0x8C24,0x82DE,0x80DE,0x5305,0x8912,0x5265,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x76C4,0x76C7,0x76C9,0x76CB,0x76CC,0x76D3,0x76D5,0x76D9,/* 0x40-0x47 */ + 0x76DA,0x76DC,0x76DD,0x76DE,0x76E0,0x76E1,0x76E2,0x76E3,/* 0x48-0x4F */ + 0x76E4,0x76E6,0x76E7,0x76E8,0x76E9,0x76EA,0x76EB,0x76EC,/* 0x50-0x57 */ + 0x76ED,0x76F0,0x76F3,0x76F5,0x76F6,0x76F7,0x76FA,0x76FB,/* 0x58-0x5F */ + 0x76FD,0x76FF,0x7700,0x7702,0x7703,0x7705,0x7706,0x770A,/* 0x60-0x67 */ + 0x770C,0x770E,0x770F,0x7710,0x7711,0x7712,0x7713,0x7714,/* 0x68-0x6F */ + 0x7715,0x7716,0x7717,0x7718,0x771B,0x771C,0x771D,0x771E,/* 0x70-0x77 */ + 0x7721,0x7723,0x7724,0x7725,0x7727,0x772A,0x772B,0x0000,/* 0x78-0x7F */ + + 0x772C,0x772E,0x7730,0x7731,0x7732,0x7733,0x7734,0x7739,/* 0x80-0x87 */ + 0x773B,0x773D,0x773E,0x773F,0x7742,0x7744,0x7745,0x7746,/* 0x88-0x8F */ + 0x7748,0x7749,0x774A,0x774B,0x774C,0x774D,0x774E,0x774F,/* 0x90-0x97 */ + 0x7752,0x7753,0x7754,0x7755,0x7756,0x7757,0x7758,0x7759,/* 0x98-0x9F */ + 0x775C,0x8584,0x96F9,0x4FDD,0x5821,0x9971,0x5B9D,0x62B1,/* 0xA0-0xA7 */ + 0x62A5,0x66B4,0x8C79,0x9C8D,0x7206,0x676F,0x7891,0x60B2,/* 0xA8-0xAF */ + 0x5351,0x5317,0x8F88,0x80CC,0x8D1D,0x94A1,0x500D,0x72C8,/* 0xB0-0xB7 */ + 0x5907,0x60EB,0x7119,0x88AB,0x5954,0x82EF,0x672C,0x7B28,/* 0xB8-0xBF */ + 0x5D29,0x7EF7,0x752D,0x6CF5,0x8E66,0x8FF8,0x903C,0x9F3B,/* 0xC0-0xC7 */ + 0x6BD4,0x9119,0x7B14,0x5F7C,0x78A7,0x84D6,0x853D,0x6BD5,/* 0xC8-0xCF */ + 0x6BD9,0x6BD6,0x5E01,0x5E87,0x75F9,0x95ED,0x655D,0x5F0A,/* 0xD0-0xD7 */ + 0x5FC5,0x8F9F,0x58C1,0x81C2,0x907F,0x965B,0x97AD,0x8FB9,/* 0xD8-0xDF */ + 0x7F16,0x8D2C,0x6241,0x4FBF,0x53D8,0x535E,0x8FA8,0x8FA9,/* 0xE0-0xE7 */ + 0x8FAB,0x904D,0x6807,0x5F6A,0x8198,0x8868,0x9CD6,0x618B,/* 0xE8-0xEF */ + 0x522B,0x762A,0x5F6C,0x658C,0x6FD2,0x6EE8,0x5BBE,0x6448,/* 0xF0-0xF7 */ + 0x5175,0x51B0,0x67C4,0x4E19,0x79C9,0x997C,0x70B3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x775D,0x775E,0x775F,0x7760,0x7764,0x7767,0x7769,0x776A,/* 0x40-0x47 */ + 0x776D,0x776E,0x776F,0x7770,0x7771,0x7772,0x7773,0x7774,/* 0x48-0x4F */ + 0x7775,0x7776,0x7777,0x7778,0x777A,0x777B,0x777C,0x7781,/* 0x50-0x57 */ + 0x7782,0x7783,0x7786,0x7787,0x7788,0x7789,0x778A,0x778B,/* 0x58-0x5F */ + 0x778F,0x7790,0x7793,0x7794,0x7795,0x7796,0x7797,0x7798,/* 0x60-0x67 */ + 0x7799,0x779A,0x779B,0x779C,0x779D,0x779E,0x77A1,0x77A3,/* 0x68-0x6F */ + 0x77A4,0x77A6,0x77A8,0x77AB,0x77AD,0x77AE,0x77AF,0x77B1,/* 0x70-0x77 */ + 0x77B2,0x77B4,0x77B6,0x77B7,0x77B8,0x77B9,0x77BA,0x0000,/* 0x78-0x7F */ + + 0x77BC,0x77BE,0x77C0,0x77C1,0x77C2,0x77C3,0x77C4,0x77C5,/* 0x80-0x87 */ + 0x77C6,0x77C7,0x77C8,0x77C9,0x77CA,0x77CB,0x77CC,0x77CE,/* 0x88-0x8F */ + 0x77CF,0x77D0,0x77D1,0x77D2,0x77D3,0x77D4,0x77D5,0x77D6,/* 0x90-0x97 */ + 0x77D8,0x77D9,0x77DA,0x77DD,0x77DE,0x77DF,0x77E0,0x77E1,/* 0x98-0x9F */ + 0x77E4,0x75C5,0x5E76,0x73BB,0x83E0,0x64AD,0x62E8,0x94B5,/* 0xA0-0xA7 */ + 0x6CE2,0x535A,0x52C3,0x640F,0x94C2,0x7B94,0x4F2F,0x5E1B,/* 0xA8-0xAF */ + 0x8236,0x8116,0x818A,0x6E24,0x6CCA,0x9A73,0x6355,0x535C,/* 0xB0-0xB7 */ + 0x54FA,0x8865,0x57E0,0x4E0D,0x5E03,0x6B65,0x7C3F,0x90E8,/* 0xB8-0xBF */ + 0x6016,0x64E6,0x731C,0x88C1,0x6750,0x624D,0x8D22,0x776C,/* 0xC0-0xC7 */ + 0x8E29,0x91C7,0x5F69,0x83DC,0x8521,0x9910,0x53C2,0x8695,/* 0xC8-0xCF */ + 0x6B8B,0x60ED,0x60E8,0x707F,0x82CD,0x8231,0x4ED3,0x6CA7,/* 0xD0-0xD7 */ + 0x85CF,0x64CD,0x7CD9,0x69FD,0x66F9,0x8349,0x5395,0x7B56,/* 0xD8-0xDF */ + 0x4FA7,0x518C,0x6D4B,0x5C42,0x8E6D,0x63D2,0x53C9,0x832C,/* 0xE0-0xE7 */ + 0x8336,0x67E5,0x78B4,0x643D,0x5BDF,0x5C94,0x5DEE,0x8BE7,/* 0xE8-0xEF */ + 0x62C6,0x67F4,0x8C7A,0x6400,0x63BA,0x8749,0x998B,0x8C17,/* 0xF0-0xF7 */ + 0x7F20,0x94F2,0x4EA7,0x9610,0x98A4,0x660C,0x7316,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x77E6,0x77E8,0x77EA,0x77EF,0x77F0,0x77F1,0x77F2,0x77F4,/* 0x40-0x47 */ + 0x77F5,0x77F7,0x77F9,0x77FA,0x77FB,0x77FC,0x7803,0x7804,/* 0x48-0x4F */ + 0x7805,0x7806,0x7807,0x7808,0x780A,0x780B,0x780E,0x780F,/* 0x50-0x57 */ + 0x7810,0x7813,0x7815,0x7819,0x781B,0x781E,0x7820,0x7821,/* 0x58-0x5F */ + 0x7822,0x7824,0x7828,0x782A,0x782B,0x782E,0x782F,0x7831,/* 0x60-0x67 */ + 0x7832,0x7833,0x7835,0x7836,0x783D,0x783F,0x7841,0x7842,/* 0x68-0x6F */ + 0x7843,0x7844,0x7846,0x7848,0x7849,0x784A,0x784B,0x784D,/* 0x70-0x77 */ + 0x784F,0x7851,0x7853,0x7854,0x7858,0x7859,0x785A,0x0000,/* 0x78-0x7F */ + + 0x785B,0x785C,0x785E,0x785F,0x7860,0x7861,0x7862,0x7863,/* 0x80-0x87 */ + 0x7864,0x7865,0x7866,0x7867,0x7868,0x7869,0x786F,0x7870,/* 0x88-0x8F */ + 0x7871,0x7872,0x7873,0x7874,0x7875,0x7876,0x7878,0x7879,/* 0x90-0x97 */ + 0x787A,0x787B,0x787D,0x787E,0x787F,0x7880,0x7881,0x7882,/* 0x98-0x9F */ + 0x7883,0x573A,0x5C1D,0x5E38,0x957F,0x507F,0x80A0,0x5382,/* 0xA0-0xA7 */ + 0x655E,0x7545,0x5531,0x5021,0x8D85,0x6284,0x949E,0x671D,/* 0xA8-0xAF */ + 0x5632,0x6F6E,0x5DE2,0x5435,0x7092,0x8F66,0x626F,0x64A4,/* 0xB0-0xB7 */ + 0x63A3,0x5F7B,0x6F88,0x90F4,0x81E3,0x8FB0,0x5C18,0x6668,/* 0xB8-0xBF */ + 0x5FF1,0x6C89,0x9648,0x8D81,0x886C,0x6491,0x79F0,0x57CE,/* 0xC0-0xC7 */ + 0x6A59,0x6210,0x5448,0x4E58,0x7A0B,0x60E9,0x6F84,0x8BDA,/* 0xC8-0xCF */ + 0x627F,0x901E,0x9A8B,0x79E4,0x5403,0x75F4,0x6301,0x5319,/* 0xD0-0xD7 */ + 0x6C60,0x8FDF,0x5F1B,0x9A70,0x803B,0x9F7F,0x4F88,0x5C3A,/* 0xD8-0xDF */ + 0x8D64,0x7FC5,0x65A5,0x70BD,0x5145,0x51B2,0x866B,0x5D07,/* 0xE0-0xE7 */ + 0x5BA0,0x62BD,0x916C,0x7574,0x8E0C,0x7A20,0x6101,0x7B79,/* 0xE8-0xEF */ + 0x4EC7,0x7EF8,0x7785,0x4E11,0x81ED,0x521D,0x51FA,0x6A71,/* 0xF0-0xF7 */ + 0x53A8,0x8E87,0x9504,0x96CF,0x6EC1,0x9664,0x695A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7884,0x7885,0x7886,0x7888,0x788A,0x788B,0x788F,0x7890,/* 0x40-0x47 */ + 0x7892,0x7894,0x7895,0x7896,0x7899,0x789D,0x789E,0x78A0,/* 0x48-0x4F */ + 0x78A2,0x78A4,0x78A6,0x78A8,0x78A9,0x78AA,0x78AB,0x78AC,/* 0x50-0x57 */ + 0x78AD,0x78AE,0x78AF,0x78B5,0x78B6,0x78B7,0x78B8,0x78BA,/* 0x58-0x5F */ + 0x78BB,0x78BC,0x78BD,0x78BF,0x78C0,0x78C2,0x78C3,0x78C4,/* 0x60-0x67 */ + 0x78C6,0x78C7,0x78C8,0x78CC,0x78CD,0x78CE,0x78CF,0x78D1,/* 0x68-0x6F */ + 0x78D2,0x78D3,0x78D6,0x78D7,0x78D8,0x78DA,0x78DB,0x78DC,/* 0x70-0x77 */ + 0x78DD,0x78DE,0x78DF,0x78E0,0x78E1,0x78E2,0x78E3,0x0000,/* 0x78-0x7F */ + + 0x78E4,0x78E5,0x78E6,0x78E7,0x78E9,0x78EA,0x78EB,0x78ED,/* 0x80-0x87 */ + 0x78EE,0x78EF,0x78F0,0x78F1,0x78F3,0x78F5,0x78F6,0x78F8,/* 0x88-0x8F */ + 0x78F9,0x78FB,0x78FC,0x78FD,0x78FE,0x78FF,0x7900,0x7902,/* 0x90-0x97 */ + 0x7903,0x7904,0x7906,0x7907,0x7908,0x7909,0x790A,0x790B,/* 0x98-0x9F */ + 0x790C,0x7840,0x50A8,0x77D7,0x6410,0x89E6,0x5904,0x63E3,/* 0xA0-0xA7 */ + 0x5DDD,0x7A7F,0x693D,0x4F20,0x8239,0x5598,0x4E32,0x75AE,/* 0xA8-0xAF */ + 0x7A97,0x5E62,0x5E8A,0x95EF,0x521B,0x5439,0x708A,0x6376,/* 0xB0-0xB7 */ + 0x9524,0x5782,0x6625,0x693F,0x9187,0x5507,0x6DF3,0x7EAF,/* 0xB8-0xBF */ + 0x8822,0x6233,0x7EF0,0x75B5,0x8328,0x78C1,0x96CC,0x8F9E,/* 0xC0-0xC7 */ + 0x6148,0x74F7,0x8BCD,0x6B64,0x523A,0x8D50,0x6B21,0x806A,/* 0xC8-0xCF */ + 0x8471,0x56F1,0x5306,0x4ECE,0x4E1B,0x51D1,0x7C97,0x918B,/* 0xD0-0xD7 */ + 0x7C07,0x4FC3,0x8E7F,0x7BE1,0x7A9C,0x6467,0x5D14,0x50AC,/* 0xD8-0xDF */ + 0x8106,0x7601,0x7CB9,0x6DEC,0x7FE0,0x6751,0x5B58,0x5BF8,/* 0xE0-0xE7 */ + 0x78CB,0x64AE,0x6413,0x63AA,0x632B,0x9519,0x642D,0x8FBE,/* 0xE8-0xEF */ + 0x7B54,0x7629,0x6253,0x5927,0x5446,0x6B79,0x50A3,0x6234,/* 0xF0-0xF7 */ + 0x5E26,0x6B86,0x4EE3,0x8D37,0x888B,0x5F85,0x902E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x790D,0x790E,0x790F,0x7910,0x7911,0x7912,0x7914,0x7915,/* 0x40-0x47 */ + 0x7916,0x7917,0x7918,0x7919,0x791A,0x791B,0x791C,0x791D,/* 0x48-0x4F */ + 0x791F,0x7920,0x7921,0x7922,0x7923,0x7925,0x7926,0x7927,/* 0x50-0x57 */ + 0x7928,0x7929,0x792A,0x792B,0x792C,0x792D,0x792E,0x792F,/* 0x58-0x5F */ + 0x7930,0x7931,0x7932,0x7933,0x7935,0x7936,0x7937,0x7938,/* 0x60-0x67 */ + 0x7939,0x793D,0x793F,0x7942,0x7943,0x7944,0x7945,0x7947,/* 0x68-0x6F */ + 0x794A,0x794B,0x794C,0x794D,0x794E,0x794F,0x7950,0x7951,/* 0x70-0x77 */ + 0x7952,0x7954,0x7955,0x7958,0x7959,0x7961,0x7963,0x0000,/* 0x78-0x7F */ + + 0x7964,0x7966,0x7969,0x796A,0x796B,0x796C,0x796E,0x7970,/* 0x80-0x87 */ + 0x7971,0x7972,0x7973,0x7974,0x7975,0x7976,0x7979,0x797B,/* 0x88-0x8F */ + 0x797C,0x797D,0x797E,0x797F,0x7982,0x7983,0x7986,0x7987,/* 0x90-0x97 */ + 0x7988,0x7989,0x798B,0x798C,0x798D,0x798E,0x7990,0x7991,/* 0x98-0x9F */ + 0x7992,0x6020,0x803D,0x62C5,0x4E39,0x5355,0x90F8,0x63B8,/* 0xA0-0xA7 */ + 0x80C6,0x65E6,0x6C2E,0x4F46,0x60EE,0x6DE1,0x8BDE,0x5F39,/* 0xA8-0xAF */ + 0x86CB,0x5F53,0x6321,0x515A,0x8361,0x6863,0x5200,0x6363,/* 0xB0-0xB7 */ + 0x8E48,0x5012,0x5C9B,0x7977,0x5BFC,0x5230,0x7A3B,0x60BC,/* 0xB8-0xBF */ + 0x9053,0x76D7,0x5FB7,0x5F97,0x7684,0x8E6C,0x706F,0x767B,/* 0xC0-0xC7 */ + 0x7B49,0x77AA,0x51F3,0x9093,0x5824,0x4F4E,0x6EF4,0x8FEA,/* 0xC8-0xCF */ + 0x654C,0x7B1B,0x72C4,0x6DA4,0x7FDF,0x5AE1,0x62B5,0x5E95,/* 0xD0-0xD7 */ + 0x5730,0x8482,0x7B2C,0x5E1D,0x5F1F,0x9012,0x7F14,0x98A0,/* 0xD8-0xDF */ + 0x6382,0x6EC7,0x7898,0x70B9,0x5178,0x975B,0x57AB,0x7535,/* 0xE0-0xE7 */ + 0x4F43,0x7538,0x5E97,0x60E6,0x5960,0x6DC0,0x6BBF,0x7889,/* 0xE8-0xEF */ + 0x53FC,0x96D5,0x51CB,0x5201,0x6389,0x540A,0x9493,0x8C03,/* 0xF0-0xF7 */ + 0x8DCC,0x7239,0x789F,0x8776,0x8FED,0x8C0D,0x53E0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7993,0x7994,0x7995,0x7996,0x7997,0x7998,0x7999,0x799B,/* 0x40-0x47 */ + 0x799C,0x799D,0x799E,0x799F,0x79A0,0x79A1,0x79A2,0x79A3,/* 0x48-0x4F */ + 0x79A4,0x79A5,0x79A6,0x79A8,0x79A9,0x79AA,0x79AB,0x79AC,/* 0x50-0x57 */ + 0x79AD,0x79AE,0x79AF,0x79B0,0x79B1,0x79B2,0x79B4,0x79B5,/* 0x58-0x5F */ + 0x79B6,0x79B7,0x79B8,0x79BC,0x79BF,0x79C2,0x79C4,0x79C5,/* 0x60-0x67 */ + 0x79C7,0x79C8,0x79CA,0x79CC,0x79CE,0x79CF,0x79D0,0x79D3,/* 0x68-0x6F */ + 0x79D4,0x79D6,0x79D7,0x79D9,0x79DA,0x79DB,0x79DC,0x79DD,/* 0x70-0x77 */ + 0x79DE,0x79E0,0x79E1,0x79E2,0x79E5,0x79E8,0x79EA,0x0000,/* 0x78-0x7F */ + + 0x79EC,0x79EE,0x79F1,0x79F2,0x79F3,0x79F4,0x79F5,0x79F6,/* 0x80-0x87 */ + 0x79F7,0x79F9,0x79FA,0x79FC,0x79FE,0x79FF,0x7A01,0x7A04,/* 0x88-0x8F */ + 0x7A05,0x7A07,0x7A08,0x7A09,0x7A0A,0x7A0C,0x7A0F,0x7A10,/* 0x90-0x97 */ + 0x7A11,0x7A12,0x7A13,0x7A15,0x7A16,0x7A18,0x7A19,0x7A1B,/* 0x98-0x9F */ + 0x7A1C,0x4E01,0x76EF,0x53EE,0x9489,0x9876,0x9F0E,0x952D,/* 0xA0-0xA7 */ + 0x5B9A,0x8BA2,0x4E22,0x4E1C,0x51AC,0x8463,0x61C2,0x52A8,/* 0xA8-0xAF */ + 0x680B,0x4F97,0x606B,0x51BB,0x6D1E,0x515C,0x6296,0x6597,/* 0xB0-0xB7 */ + 0x9661,0x8C46,0x9017,0x75D8,0x90FD,0x7763,0x6BD2,0x728A,/* 0xB8-0xBF */ + 0x72EC,0x8BFB,0x5835,0x7779,0x8D4C,0x675C,0x9540,0x809A,/* 0xC0-0xC7 */ + 0x5EA6,0x6E21,0x5992,0x7AEF,0x77ED,0x953B,0x6BB5,0x65AD,/* 0xC8-0xCF */ + 0x7F0E,0x5806,0x5151,0x961F,0x5BF9,0x58A9,0x5428,0x8E72,/* 0xD0-0xD7 */ + 0x6566,0x987F,0x56E4,0x949D,0x76FE,0x9041,0x6387,0x54C6,/* 0xD8-0xDF */ + 0x591A,0x593A,0x579B,0x8EB2,0x6735,0x8DFA,0x8235,0x5241,/* 0xE0-0xE7 */ + 0x60F0,0x5815,0x86FE,0x5CE8,0x9E45,0x4FC4,0x989D,0x8BB9,/* 0xE8-0xEF */ + 0x5A25,0x6076,0x5384,0x627C,0x904F,0x9102,0x997F,0x6069,/* 0xF0-0xF7 */ + 0x800C,0x513F,0x8033,0x5C14,0x9975,0x6D31,0x4E8C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7A1D,0x7A1F,0x7A21,0x7A22,0x7A24,0x7A25,0x7A26,0x7A27,/* 0x40-0x47 */ + 0x7A28,0x7A29,0x7A2A,0x7A2B,0x7A2C,0x7A2D,0x7A2E,0x7A2F,/* 0x48-0x4F */ + 0x7A30,0x7A31,0x7A32,0x7A34,0x7A35,0x7A36,0x7A38,0x7A3A,/* 0x50-0x57 */ + 0x7A3E,0x7A40,0x7A41,0x7A42,0x7A43,0x7A44,0x7A45,0x7A47,/* 0x58-0x5F */ + 0x7A48,0x7A49,0x7A4A,0x7A4B,0x7A4C,0x7A4D,0x7A4E,0x7A4F,/* 0x60-0x67 */ + 0x7A50,0x7A52,0x7A53,0x7A54,0x7A55,0x7A56,0x7A58,0x7A59,/* 0x68-0x6F */ + 0x7A5A,0x7A5B,0x7A5C,0x7A5D,0x7A5E,0x7A5F,0x7A60,0x7A61,/* 0x70-0x77 */ + 0x7A62,0x7A63,0x7A64,0x7A65,0x7A66,0x7A67,0x7A68,0x0000,/* 0x78-0x7F */ + + 0x7A69,0x7A6A,0x7A6B,0x7A6C,0x7A6D,0x7A6E,0x7A6F,0x7A71,/* 0x80-0x87 */ + 0x7A72,0x7A73,0x7A75,0x7A7B,0x7A7C,0x7A7D,0x7A7E,0x7A82,/* 0x88-0x8F */ + 0x7A85,0x7A87,0x7A89,0x7A8A,0x7A8B,0x7A8C,0x7A8E,0x7A8F,/* 0x90-0x97 */ + 0x7A90,0x7A93,0x7A94,0x7A99,0x7A9A,0x7A9B,0x7A9E,0x7AA1,/* 0x98-0x9F */ + 0x7AA2,0x8D30,0x53D1,0x7F5A,0x7B4F,0x4F10,0x4E4F,0x9600,/* 0xA0-0xA7 */ + 0x6CD5,0x73D0,0x85E9,0x5E06,0x756A,0x7FFB,0x6A0A,0x77FE,/* 0xA8-0xAF */ + 0x9492,0x7E41,0x51E1,0x70E6,0x53CD,0x8FD4,0x8303,0x8D29,/* 0xB0-0xB7 */ + 0x72AF,0x996D,0x6CDB,0x574A,0x82B3,0x65B9,0x80AA,0x623F,/* 0xB8-0xBF */ + 0x9632,0x59A8,0x4EFF,0x8BBF,0x7EBA,0x653E,0x83F2,0x975E,/* 0xC0-0xC7 */ + 0x5561,0x98DE,0x80A5,0x532A,0x8BFD,0x5420,0x80BA,0x5E9F,/* 0xC8-0xCF */ + 0x6CB8,0x8D39,0x82AC,0x915A,0x5429,0x6C1B,0x5206,0x7EB7,/* 0xD0-0xD7 */ + 0x575F,0x711A,0x6C7E,0x7C89,0x594B,0x4EFD,0x5FFF,0x6124,/* 0xD8-0xDF */ + 0x7CAA,0x4E30,0x5C01,0x67AB,0x8702,0x5CF0,0x950B,0x98CE,/* 0xE0-0xE7 */ + 0x75AF,0x70FD,0x9022,0x51AF,0x7F1D,0x8BBD,0x5949,0x51E4,/* 0xE8-0xEF */ + 0x4F5B,0x5426,0x592B,0x6577,0x80A4,0x5B75,0x6276,0x62C2,/* 0xF0-0xF7 */ + 0x8F90,0x5E45,0x6C1F,0x7B26,0x4F0F,0x4FD8,0x670D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7AA3,0x7AA4,0x7AA7,0x7AA9,0x7AAA,0x7AAB,0x7AAE,0x7AAF,/* 0x40-0x47 */ + 0x7AB0,0x7AB1,0x7AB2,0x7AB4,0x7AB5,0x7AB6,0x7AB7,0x7AB8,/* 0x48-0x4F */ + 0x7AB9,0x7ABA,0x7ABB,0x7ABC,0x7ABD,0x7ABE,0x7AC0,0x7AC1,/* 0x50-0x57 */ + 0x7AC2,0x7AC3,0x7AC4,0x7AC5,0x7AC6,0x7AC7,0x7AC8,0x7AC9,/* 0x58-0x5F */ + 0x7ACA,0x7ACC,0x7ACD,0x7ACE,0x7ACF,0x7AD0,0x7AD1,0x7AD2,/* 0x60-0x67 */ + 0x7AD3,0x7AD4,0x7AD5,0x7AD7,0x7AD8,0x7ADA,0x7ADB,0x7ADC,/* 0x68-0x6F */ + 0x7ADD,0x7AE1,0x7AE2,0x7AE4,0x7AE7,0x7AE8,0x7AE9,0x7AEA,/* 0x70-0x77 */ + 0x7AEB,0x7AEC,0x7AEE,0x7AF0,0x7AF1,0x7AF2,0x7AF3,0x0000,/* 0x78-0x7F */ + + 0x7AF4,0x7AF5,0x7AF6,0x7AF7,0x7AF8,0x7AFB,0x7AFC,0x7AFE,/* 0x80-0x87 */ + 0x7B00,0x7B01,0x7B02,0x7B05,0x7B07,0x7B09,0x7B0C,0x7B0D,/* 0x88-0x8F */ + 0x7B0E,0x7B10,0x7B12,0x7B13,0x7B16,0x7B17,0x7B18,0x7B1A,/* 0x90-0x97 */ + 0x7B1C,0x7B1D,0x7B1F,0x7B21,0x7B22,0x7B23,0x7B27,0x7B29,/* 0x98-0x9F */ + 0x7B2D,0x6D6E,0x6DAA,0x798F,0x88B1,0x5F17,0x752B,0x629A,/* 0xA0-0xA7 */ + 0x8F85,0x4FEF,0x91DC,0x65A7,0x812F,0x8151,0x5E9C,0x8150,/* 0xA8-0xAF */ + 0x8D74,0x526F,0x8986,0x8D4B,0x590D,0x5085,0x4ED8,0x961C,/* 0xB0-0xB7 */ + 0x7236,0x8179,0x8D1F,0x5BCC,0x8BA3,0x9644,0x5987,0x7F1A,/* 0xB8-0xBF */ + 0x5490,0x5676,0x560E,0x8BE5,0x6539,0x6982,0x9499,0x76D6,/* 0xC0-0xC7 */ + 0x6E89,0x5E72,0x7518,0x6746,0x67D1,0x7AFF,0x809D,0x8D76,/* 0xC8-0xCF */ + 0x611F,0x79C6,0x6562,0x8D63,0x5188,0x521A,0x94A2,0x7F38,/* 0xD0-0xD7 */ + 0x809B,0x7EB2,0x5C97,0x6E2F,0x6760,0x7BD9,0x768B,0x9AD8,/* 0xD8-0xDF */ + 0x818F,0x7F94,0x7CD5,0x641E,0x9550,0x7A3F,0x544A,0x54E5,/* 0xE0-0xE7 */ + 0x6B4C,0x6401,0x6208,0x9E3D,0x80F3,0x7599,0x5272,0x9769,/* 0xE8-0xEF */ + 0x845B,0x683C,0x86E4,0x9601,0x9694,0x94EC,0x4E2A,0x5404,/* 0xF0-0xF7 */ + 0x7ED9,0x6839,0x8DDF,0x8015,0x66F4,0x5E9A,0x7FB9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7B2F,0x7B30,0x7B32,0x7B34,0x7B35,0x7B36,0x7B37,0x7B39,/* 0x40-0x47 */ + 0x7B3B,0x7B3D,0x7B3F,0x7B40,0x7B41,0x7B42,0x7B43,0x7B44,/* 0x48-0x4F */ + 0x7B46,0x7B48,0x7B4A,0x7B4D,0x7B4E,0x7B53,0x7B55,0x7B57,/* 0x50-0x57 */ + 0x7B59,0x7B5C,0x7B5E,0x7B5F,0x7B61,0x7B63,0x7B64,0x7B65,/* 0x58-0x5F */ + 0x7B66,0x7B67,0x7B68,0x7B69,0x7B6A,0x7B6B,0x7B6C,0x7B6D,/* 0x60-0x67 */ + 0x7B6F,0x7B70,0x7B73,0x7B74,0x7B76,0x7B78,0x7B7A,0x7B7C,/* 0x68-0x6F */ + 0x7B7D,0x7B7F,0x7B81,0x7B82,0x7B83,0x7B84,0x7B86,0x7B87,/* 0x70-0x77 */ + 0x7B88,0x7B89,0x7B8A,0x7B8B,0x7B8C,0x7B8E,0x7B8F,0x0000,/* 0x78-0x7F */ + + 0x7B91,0x7B92,0x7B93,0x7B96,0x7B98,0x7B99,0x7B9A,0x7B9B,/* 0x80-0x87 */ + 0x7B9E,0x7B9F,0x7BA0,0x7BA3,0x7BA4,0x7BA5,0x7BAE,0x7BAF,/* 0x88-0x8F */ + 0x7BB0,0x7BB2,0x7BB3,0x7BB5,0x7BB6,0x7BB7,0x7BB9,0x7BBA,/* 0x90-0x97 */ + 0x7BBB,0x7BBC,0x7BBD,0x7BBE,0x7BBF,0x7BC0,0x7BC2,0x7BC3,/* 0x98-0x9F */ + 0x7BC4,0x57C2,0x803F,0x6897,0x5DE5,0x653B,0x529F,0x606D,/* 0xA0-0xA7 */ + 0x9F9A,0x4F9B,0x8EAC,0x516C,0x5BAB,0x5F13,0x5DE9,0x6C5E,/* 0xA8-0xAF */ + 0x62F1,0x8D21,0x5171,0x94A9,0x52FE,0x6C9F,0x82DF,0x72D7,/* 0xB0-0xB7 */ + 0x57A2,0x6784,0x8D2D,0x591F,0x8F9C,0x83C7,0x5495,0x7B8D,/* 0xB8-0xBF */ + 0x4F30,0x6CBD,0x5B64,0x59D1,0x9F13,0x53E4,0x86CA,0x9AA8,/* 0xC0-0xC7 */ + 0x8C37,0x80A1,0x6545,0x987E,0x56FA,0x96C7,0x522E,0x74DC,/* 0xC8-0xCF */ + 0x5250,0x5BE1,0x6302,0x8902,0x4E56,0x62D0,0x602A,0x68FA,/* 0xD0-0xD7 */ + 0x5173,0x5B98,0x51A0,0x89C2,0x7BA1,0x9986,0x7F50,0x60EF,/* 0xD8-0xDF */ + 0x704C,0x8D2F,0x5149,0x5E7F,0x901B,0x7470,0x89C4,0x572D,/* 0xE0-0xE7 */ + 0x7845,0x5F52,0x9F9F,0x95FA,0x8F68,0x9B3C,0x8BE1,0x7678,/* 0xE8-0xEF */ + 0x6842,0x67DC,0x8DEA,0x8D35,0x523D,0x8F8A,0x6EDA,0x68CD,/* 0xF0-0xF7 */ + 0x9505,0x90ED,0x56FD,0x679C,0x88F9,0x8FC7,0x54C8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7BC5,0x7BC8,0x7BC9,0x7BCA,0x7BCB,0x7BCD,0x7BCE,0x7BCF,/* 0x40-0x47 */ + 0x7BD0,0x7BD2,0x7BD4,0x7BD5,0x7BD6,0x7BD7,0x7BD8,0x7BDB,/* 0x48-0x4F */ + 0x7BDC,0x7BDE,0x7BDF,0x7BE0,0x7BE2,0x7BE3,0x7BE4,0x7BE7,/* 0x50-0x57 */ + 0x7BE8,0x7BE9,0x7BEB,0x7BEC,0x7BED,0x7BEF,0x7BF0,0x7BF2,/* 0x58-0x5F */ + 0x7BF3,0x7BF4,0x7BF5,0x7BF6,0x7BF8,0x7BF9,0x7BFA,0x7BFB,/* 0x60-0x67 */ + 0x7BFD,0x7BFF,0x7C00,0x7C01,0x7C02,0x7C03,0x7C04,0x7C05,/* 0x68-0x6F */ + 0x7C06,0x7C08,0x7C09,0x7C0A,0x7C0D,0x7C0E,0x7C10,0x7C11,/* 0x70-0x77 */ + 0x7C12,0x7C13,0x7C14,0x7C15,0x7C17,0x7C18,0x7C19,0x0000,/* 0x78-0x7F */ + + 0x7C1A,0x7C1B,0x7C1C,0x7C1D,0x7C1E,0x7C20,0x7C21,0x7C22,/* 0x80-0x87 */ + 0x7C23,0x7C24,0x7C25,0x7C28,0x7C29,0x7C2B,0x7C2C,0x7C2D,/* 0x88-0x8F */ + 0x7C2E,0x7C2F,0x7C30,0x7C31,0x7C32,0x7C33,0x7C34,0x7C35,/* 0x90-0x97 */ + 0x7C36,0x7C37,0x7C39,0x7C3A,0x7C3B,0x7C3C,0x7C3D,0x7C3E,/* 0x98-0x9F */ + 0x7C42,0x9AB8,0x5B69,0x6D77,0x6C26,0x4EA5,0x5BB3,0x9A87,/* 0xA0-0xA7 */ + 0x9163,0x61A8,0x90AF,0x97E9,0x542B,0x6DB5,0x5BD2,0x51FD,/* 0xA8-0xAF */ + 0x558A,0x7F55,0x7FF0,0x64BC,0x634D,0x65F1,0x61BE,0x608D,/* 0xB0-0xB7 */ + 0x710A,0x6C57,0x6C49,0x592F,0x676D,0x822A,0x58D5,0x568E,/* 0xB8-0xBF */ + 0x8C6A,0x6BEB,0x90DD,0x597D,0x8017,0x53F7,0x6D69,0x5475,/* 0xC0-0xC7 */ + 0x559D,0x8377,0x83CF,0x6838,0x79BE,0x548C,0x4F55,0x5408,/* 0xC8-0xCF */ + 0x76D2,0x8C89,0x9602,0x6CB3,0x6DB8,0x8D6B,0x8910,0x9E64,/* 0xD0-0xD7 */ + 0x8D3A,0x563F,0x9ED1,0x75D5,0x5F88,0x72E0,0x6068,0x54FC,/* 0xD8-0xDF */ + 0x4EA8,0x6A2A,0x8861,0x6052,0x8F70,0x54C4,0x70D8,0x8679,/* 0xE0-0xE7 */ + 0x9E3F,0x6D2A,0x5B8F,0x5F18,0x7EA2,0x5589,0x4FAF,0x7334,/* 0xE8-0xEF */ + 0x543C,0x539A,0x5019,0x540E,0x547C,0x4E4E,0x5FFD,0x745A,/* 0xF0-0xF7 */ + 0x58F6,0x846B,0x80E1,0x8774,0x72D0,0x7CCA,0x6E56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7C43,0x7C44,0x7C45,0x7C46,0x7C47,0x7C48,0x7C49,0x7C4A,/* 0x40-0x47 */ + 0x7C4B,0x7C4C,0x7C4E,0x7C4F,0x7C50,0x7C51,0x7C52,0x7C53,/* 0x48-0x4F */ + 0x7C54,0x7C55,0x7C56,0x7C57,0x7C58,0x7C59,0x7C5A,0x7C5B,/* 0x50-0x57 */ + 0x7C5C,0x7C5D,0x7C5E,0x7C5F,0x7C60,0x7C61,0x7C62,0x7C63,/* 0x58-0x5F */ + 0x7C64,0x7C65,0x7C66,0x7C67,0x7C68,0x7C69,0x7C6A,0x7C6B,/* 0x60-0x67 */ + 0x7C6C,0x7C6D,0x7C6E,0x7C6F,0x7C70,0x7C71,0x7C72,0x7C75,/* 0x68-0x6F */ + 0x7C76,0x7C77,0x7C78,0x7C79,0x7C7A,0x7C7E,0x7C7F,0x7C80,/* 0x70-0x77 */ + 0x7C81,0x7C82,0x7C83,0x7C84,0x7C85,0x7C86,0x7C87,0x0000,/* 0x78-0x7F */ + + 0x7C88,0x7C8A,0x7C8B,0x7C8C,0x7C8D,0x7C8E,0x7C8F,0x7C90,/* 0x80-0x87 */ + 0x7C93,0x7C94,0x7C96,0x7C99,0x7C9A,0x7C9B,0x7CA0,0x7CA1,/* 0x88-0x8F */ + 0x7CA3,0x7CA6,0x7CA7,0x7CA8,0x7CA9,0x7CAB,0x7CAC,0x7CAD,/* 0x90-0x97 */ + 0x7CAF,0x7CB0,0x7CB4,0x7CB5,0x7CB6,0x7CB7,0x7CB8,0x7CBA,/* 0x98-0x9F */ + 0x7CBB,0x5F27,0x864E,0x552C,0x62A4,0x4E92,0x6CAA,0x6237,/* 0xA0-0xA7 */ + 0x82B1,0x54D7,0x534E,0x733E,0x6ED1,0x753B,0x5212,0x5316,/* 0xA8-0xAF */ + 0x8BDD,0x69D0,0x5F8A,0x6000,0x6DEE,0x574F,0x6B22,0x73AF,/* 0xB0-0xB7 */ + 0x6853,0x8FD8,0x7F13,0x6362,0x60A3,0x5524,0x75EA,0x8C62,/* 0xB8-0xBF */ + 0x7115,0x6DA3,0x5BA6,0x5E7B,0x8352,0x614C,0x9EC4,0x78FA,/* 0xC0-0xC7 */ + 0x8757,0x7C27,0x7687,0x51F0,0x60F6,0x714C,0x6643,0x5E4C,/* 0xC8-0xCF */ + 0x604D,0x8C0E,0x7070,0x6325,0x8F89,0x5FBD,0x6062,0x86D4,/* 0xD0-0xD7 */ + 0x56DE,0x6BC1,0x6094,0x6167,0x5349,0x60E0,0x6666,0x8D3F,/* 0xD8-0xDF */ + 0x79FD,0x4F1A,0x70E9,0x6C47,0x8BB3,0x8BF2,0x7ED8,0x8364,/* 0xE0-0xE7 */ + 0x660F,0x5A5A,0x9B42,0x6D51,0x6DF7,0x8C41,0x6D3B,0x4F19,/* 0xE8-0xEF */ + 0x706B,0x83B7,0x6216,0x60D1,0x970D,0x8D27,0x7978,0x51FB,/* 0xF0-0xF7 */ + 0x573E,0x57FA,0x673A,0x7578,0x7A3D,0x79EF,0x7B95,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7CBF,0x7CC0,0x7CC2,0x7CC3,0x7CC4,0x7CC6,0x7CC9,0x7CCB,/* 0x40-0x47 */ + 0x7CCE,0x7CCF,0x7CD0,0x7CD1,0x7CD2,0x7CD3,0x7CD4,0x7CD8,/* 0x48-0x4F */ + 0x7CDA,0x7CDB,0x7CDD,0x7CDE,0x7CE1,0x7CE2,0x7CE3,0x7CE4,/* 0x50-0x57 */ + 0x7CE5,0x7CE6,0x7CE7,0x7CE9,0x7CEA,0x7CEB,0x7CEC,0x7CED,/* 0x58-0x5F */ + 0x7CEE,0x7CF0,0x7CF1,0x7CF2,0x7CF3,0x7CF4,0x7CF5,0x7CF6,/* 0x60-0x67 */ + 0x7CF7,0x7CF9,0x7CFA,0x7CFC,0x7CFD,0x7CFE,0x7CFF,0x7D00,/* 0x68-0x6F */ + 0x7D01,0x7D02,0x7D03,0x7D04,0x7D05,0x7D06,0x7D07,0x7D08,/* 0x70-0x77 */ + 0x7D09,0x7D0B,0x7D0C,0x7D0D,0x7D0E,0x7D0F,0x7D10,0x0000,/* 0x78-0x7F */ + + 0x7D11,0x7D12,0x7D13,0x7D14,0x7D15,0x7D16,0x7D17,0x7D18,/* 0x80-0x87 */ + 0x7D19,0x7D1A,0x7D1B,0x7D1C,0x7D1D,0x7D1E,0x7D1F,0x7D21,/* 0x88-0x8F */ + 0x7D23,0x7D24,0x7D25,0x7D26,0x7D28,0x7D29,0x7D2A,0x7D2C,/* 0x90-0x97 */ + 0x7D2D,0x7D2E,0x7D30,0x7D31,0x7D32,0x7D33,0x7D34,0x7D35,/* 0x98-0x9F */ + 0x7D36,0x808C,0x9965,0x8FF9,0x6FC0,0x8BA5,0x9E21,0x59EC,/* 0xA0-0xA7 */ + 0x7EE9,0x7F09,0x5409,0x6781,0x68D8,0x8F91,0x7C4D,0x96C6,/* 0xA8-0xAF */ + 0x53CA,0x6025,0x75BE,0x6C72,0x5373,0x5AC9,0x7EA7,0x6324,/* 0xB0-0xB7 */ + 0x51E0,0x810A,0x5DF1,0x84DF,0x6280,0x5180,0x5B63,0x4F0E,/* 0xB8-0xBF */ + 0x796D,0x5242,0x60B8,0x6D4E,0x5BC4,0x5BC2,0x8BA1,0x8BB0,/* 0xC0-0xC7 */ + 0x65E2,0x5FCC,0x9645,0x5993,0x7EE7,0x7EAA,0x5609,0x67B7,/* 0xC8-0xCF */ + 0x5939,0x4F73,0x5BB6,0x52A0,0x835A,0x988A,0x8D3E,0x7532,/* 0xD0-0xD7 */ + 0x94BE,0x5047,0x7A3C,0x4EF7,0x67B6,0x9A7E,0x5AC1,0x6B7C,/* 0xD8-0xDF */ + 0x76D1,0x575A,0x5C16,0x7B3A,0x95F4,0x714E,0x517C,0x80A9,/* 0xE0-0xE7 */ + 0x8270,0x5978,0x7F04,0x8327,0x68C0,0x67EC,0x78B1,0x7877,/* 0xE8-0xEF */ + 0x62E3,0x6361,0x7B80,0x4FED,0x526A,0x51CF,0x8350,0x69DB,/* 0xF0-0xF7 */ + 0x9274,0x8DF5,0x8D31,0x89C1,0x952E,0x7BAD,0x4EF6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D37,0x7D38,0x7D39,0x7D3A,0x7D3B,0x7D3C,0x7D3D,0x7D3E,/* 0x40-0x47 */ + 0x7D3F,0x7D40,0x7D41,0x7D42,0x7D43,0x7D44,0x7D45,0x7D46,/* 0x48-0x4F */ + 0x7D47,0x7D48,0x7D49,0x7D4A,0x7D4B,0x7D4C,0x7D4D,0x7D4E,/* 0x50-0x57 */ + 0x7D4F,0x7D50,0x7D51,0x7D52,0x7D53,0x7D54,0x7D55,0x7D56,/* 0x58-0x5F */ + 0x7D57,0x7D58,0x7D59,0x7D5A,0x7D5B,0x7D5C,0x7D5D,0x7D5E,/* 0x60-0x67 */ + 0x7D5F,0x7D60,0x7D61,0x7D62,0x7D63,0x7D64,0x7D65,0x7D66,/* 0x68-0x6F */ + 0x7D67,0x7D68,0x7D69,0x7D6A,0x7D6B,0x7D6C,0x7D6D,0x7D6F,/* 0x70-0x77 */ + 0x7D70,0x7D71,0x7D72,0x7D73,0x7D74,0x7D75,0x7D76,0x0000,/* 0x78-0x7F */ + + 0x7D78,0x7D79,0x7D7A,0x7D7B,0x7D7C,0x7D7D,0x7D7E,0x7D7F,/* 0x80-0x87 */ + 0x7D80,0x7D81,0x7D82,0x7D83,0x7D84,0x7D85,0x7D86,0x7D87,/* 0x88-0x8F */ + 0x7D88,0x7D89,0x7D8A,0x7D8B,0x7D8C,0x7D8D,0x7D8E,0x7D8F,/* 0x90-0x97 */ + 0x7D90,0x7D91,0x7D92,0x7D93,0x7D94,0x7D95,0x7D96,0x7D97,/* 0x98-0x9F */ + 0x7D98,0x5065,0x8230,0x5251,0x996F,0x6E10,0x6E85,0x6DA7,/* 0xA0-0xA7 */ + 0x5EFA,0x50F5,0x59DC,0x5C06,0x6D46,0x6C5F,0x7586,0x848B,/* 0xA8-0xAF */ + 0x6868,0x5956,0x8BB2,0x5320,0x9171,0x964D,0x8549,0x6912,/* 0xB0-0xB7 */ + 0x7901,0x7126,0x80F6,0x4EA4,0x90CA,0x6D47,0x9A84,0x5A07,/* 0xB8-0xBF */ + 0x56BC,0x6405,0x94F0,0x77EB,0x4FA5,0x811A,0x72E1,0x89D2,/* 0xC0-0xC7 */ + 0x997A,0x7F34,0x7EDE,0x527F,0x6559,0x9175,0x8F7F,0x8F83,/* 0xC8-0xCF */ + 0x53EB,0x7A96,0x63ED,0x63A5,0x7686,0x79F8,0x8857,0x9636,/* 0xD0-0xD7 */ + 0x622A,0x52AB,0x8282,0x6854,0x6770,0x6377,0x776B,0x7AED,/* 0xD8-0xDF */ + 0x6D01,0x7ED3,0x89E3,0x59D0,0x6212,0x85C9,0x82A5,0x754C,/* 0xE0-0xE7 */ + 0x501F,0x4ECB,0x75A5,0x8BEB,0x5C4A,0x5DFE,0x7B4B,0x65A4,/* 0xE8-0xEF */ + 0x91D1,0x4ECA,0x6D25,0x895F,0x7D27,0x9526,0x4EC5,0x8C28,/* 0xF0-0xF7 */ + 0x8FDB,0x9773,0x664B,0x7981,0x8FD1,0x70EC,0x6D78,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D99,0x7D9A,0x7D9B,0x7D9C,0x7D9D,0x7D9E,0x7D9F,0x7DA0,/* 0x40-0x47 */ + 0x7DA1,0x7DA2,0x7DA3,0x7DA4,0x7DA5,0x7DA7,0x7DA8,0x7DA9,/* 0x48-0x4F */ + 0x7DAA,0x7DAB,0x7DAC,0x7DAD,0x7DAF,0x7DB0,0x7DB1,0x7DB2,/* 0x50-0x57 */ + 0x7DB3,0x7DB4,0x7DB5,0x7DB6,0x7DB7,0x7DB8,0x7DB9,0x7DBA,/* 0x58-0x5F */ + 0x7DBB,0x7DBC,0x7DBD,0x7DBE,0x7DBF,0x7DC0,0x7DC1,0x7DC2,/* 0x60-0x67 */ + 0x7DC3,0x7DC4,0x7DC5,0x7DC6,0x7DC7,0x7DC8,0x7DC9,0x7DCA,/* 0x68-0x6F */ + 0x7DCB,0x7DCC,0x7DCD,0x7DCE,0x7DCF,0x7DD0,0x7DD1,0x7DD2,/* 0x70-0x77 */ + 0x7DD3,0x7DD4,0x7DD5,0x7DD6,0x7DD7,0x7DD8,0x7DD9,0x0000,/* 0x78-0x7F */ + + 0x7DDA,0x7DDB,0x7DDC,0x7DDD,0x7DDE,0x7DDF,0x7DE0,0x7DE1,/* 0x80-0x87 */ + 0x7DE2,0x7DE3,0x7DE4,0x7DE5,0x7DE6,0x7DE7,0x7DE8,0x7DE9,/* 0x88-0x8F */ + 0x7DEA,0x7DEB,0x7DEC,0x7DED,0x7DEE,0x7DEF,0x7DF0,0x7DF1,/* 0x90-0x97 */ + 0x7DF2,0x7DF3,0x7DF4,0x7DF5,0x7DF6,0x7DF7,0x7DF8,0x7DF9,/* 0x98-0x9F */ + 0x7DFA,0x5C3D,0x52B2,0x8346,0x5162,0x830E,0x775B,0x6676,/* 0xA0-0xA7 */ + 0x9CB8,0x4EAC,0x60CA,0x7CBE,0x7CB3,0x7ECF,0x4E95,0x8B66,/* 0xA8-0xAF */ + 0x666F,0x9888,0x9759,0x5883,0x656C,0x955C,0x5F84,0x75C9,/* 0xB0-0xB7 */ + 0x9756,0x7ADF,0x7ADE,0x51C0,0x70AF,0x7A98,0x63EA,0x7A76,/* 0xB8-0xBF */ + 0x7EA0,0x7396,0x97ED,0x4E45,0x7078,0x4E5D,0x9152,0x53A9,/* 0xC0-0xC7 */ + 0x6551,0x65E7,0x81FC,0x8205,0x548E,0x5C31,0x759A,0x97A0,/* 0xC8-0xCF */ + 0x62D8,0x72D9,0x75BD,0x5C45,0x9A79,0x83CA,0x5C40,0x5480,/* 0xD0-0xD7 */ + 0x77E9,0x4E3E,0x6CAE,0x805A,0x62D2,0x636E,0x5DE8,0x5177,/* 0xD8-0xDF */ + 0x8DDD,0x8E1E,0x952F,0x4FF1,0x53E5,0x60E7,0x70AC,0x5267,/* 0xE0-0xE7 */ + 0x6350,0x9E43,0x5A1F,0x5026,0x7737,0x5377,0x7EE2,0x6485,/* 0xE8-0xEF */ + 0x652B,0x6289,0x6398,0x5014,0x7235,0x89C9,0x51B3,0x8BC0,/* 0xF0-0xF7 */ + 0x7EDD,0x5747,0x83CC,0x94A7,0x519B,0x541B,0x5CFB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7DFB,0x7DFC,0x7DFD,0x7DFE,0x7DFF,0x7E00,0x7E01,0x7E02,/* 0x40-0x47 */ + 0x7E03,0x7E04,0x7E05,0x7E06,0x7E07,0x7E08,0x7E09,0x7E0A,/* 0x48-0x4F */ + 0x7E0B,0x7E0C,0x7E0D,0x7E0E,0x7E0F,0x7E10,0x7E11,0x7E12,/* 0x50-0x57 */ + 0x7E13,0x7E14,0x7E15,0x7E16,0x7E17,0x7E18,0x7E19,0x7E1A,/* 0x58-0x5F */ + 0x7E1B,0x7E1C,0x7E1D,0x7E1E,0x7E1F,0x7E20,0x7E21,0x7E22,/* 0x60-0x67 */ + 0x7E23,0x7E24,0x7E25,0x7E26,0x7E27,0x7E28,0x7E29,0x7E2A,/* 0x68-0x6F */ + 0x7E2B,0x7E2C,0x7E2D,0x7E2E,0x7E2F,0x7E30,0x7E31,0x7E32,/* 0x70-0x77 */ + 0x7E33,0x7E34,0x7E35,0x7E36,0x7E37,0x7E38,0x7E39,0x0000,/* 0x78-0x7F */ + + 0x7E3A,0x7E3C,0x7E3D,0x7E3E,0x7E3F,0x7E40,0x7E42,0x7E43,/* 0x80-0x87 */ + 0x7E44,0x7E45,0x7E46,0x7E48,0x7E49,0x7E4A,0x7E4B,0x7E4C,/* 0x88-0x8F */ + 0x7E4D,0x7E4E,0x7E4F,0x7E50,0x7E51,0x7E52,0x7E53,0x7E54,/* 0x90-0x97 */ + 0x7E55,0x7E56,0x7E57,0x7E58,0x7E59,0x7E5A,0x7E5B,0x7E5C,/* 0x98-0x9F */ + 0x7E5D,0x4FCA,0x7AE3,0x6D5A,0x90E1,0x9A8F,0x5580,0x5496,/* 0xA0-0xA7 */ + 0x5361,0x54AF,0x5F00,0x63E9,0x6977,0x51EF,0x6168,0x520A,/* 0xA8-0xAF */ + 0x582A,0x52D8,0x574E,0x780D,0x770B,0x5EB7,0x6177,0x7CE0,/* 0xB0-0xB7 */ + 0x625B,0x6297,0x4EA2,0x7095,0x8003,0x62F7,0x70E4,0x9760,/* 0xB8-0xBF */ + 0x5777,0x82DB,0x67EF,0x68F5,0x78D5,0x9897,0x79D1,0x58F3,/* 0xC0-0xC7 */ + 0x54B3,0x53EF,0x6E34,0x514B,0x523B,0x5BA2,0x8BFE,0x80AF,/* 0xC8-0xCF */ + 0x5543,0x57A6,0x6073,0x5751,0x542D,0x7A7A,0x6050,0x5B54,/* 0xD0-0xD7 */ + 0x63A7,0x62A0,0x53E3,0x6263,0x5BC7,0x67AF,0x54ED,0x7A9F,/* 0xD8-0xDF */ + 0x82E6,0x9177,0x5E93,0x88E4,0x5938,0x57AE,0x630E,0x8DE8,/* 0xE0-0xE7 */ + 0x80EF,0x5757,0x7B77,0x4FA9,0x5FEB,0x5BBD,0x6B3E,0x5321,/* 0xE8-0xEF */ + 0x7B50,0x72C2,0x6846,0x77FF,0x7736,0x65F7,0x51B5,0x4E8F,/* 0xF0-0xF7 */ + 0x76D4,0x5CBF,0x7AA5,0x8475,0x594E,0x9B41,0x5080,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E5E,0x7E5F,0x7E60,0x7E61,0x7E62,0x7E63,0x7E64,0x7E65,/* 0x40-0x47 */ + 0x7E66,0x7E67,0x7E68,0x7E69,0x7E6A,0x7E6B,0x7E6C,0x7E6D,/* 0x48-0x4F */ + 0x7E6E,0x7E6F,0x7E70,0x7E71,0x7E72,0x7E73,0x7E74,0x7E75,/* 0x50-0x57 */ + 0x7E76,0x7E77,0x7E78,0x7E79,0x7E7A,0x7E7B,0x7E7C,0x7E7D,/* 0x58-0x5F */ + 0x7E7E,0x7E7F,0x7E80,0x7E81,0x7E83,0x7E84,0x7E85,0x7E86,/* 0x60-0x67 */ + 0x7E87,0x7E88,0x7E89,0x7E8A,0x7E8B,0x7E8C,0x7E8D,0x7E8E,/* 0x68-0x6F */ + 0x7E8F,0x7E90,0x7E91,0x7E92,0x7E93,0x7E94,0x7E95,0x7E96,/* 0x70-0x77 */ + 0x7E97,0x7E98,0x7E99,0x7E9A,0x7E9C,0x7E9D,0x7E9E,0x0000,/* 0x78-0x7F */ + + 0x7EAE,0x7EB4,0x7EBB,0x7EBC,0x7ED6,0x7EE4,0x7EEC,0x7EF9,/* 0x80-0x87 */ + 0x7F0A,0x7F10,0x7F1E,0x7F37,0x7F39,0x7F3B,0x7F3C,0x7F3D,/* 0x88-0x8F */ + 0x7F3E,0x7F3F,0x7F40,0x7F41,0x7F43,0x7F46,0x7F47,0x7F48,/* 0x90-0x97 */ + 0x7F49,0x7F4A,0x7F4B,0x7F4C,0x7F4D,0x7F4E,0x7F4F,0x7F52,/* 0x98-0x9F */ + 0x7F53,0x9988,0x6127,0x6E83,0x5764,0x6606,0x6346,0x56F0,/* 0xA0-0xA7 */ + 0x62EC,0x6269,0x5ED3,0x9614,0x5783,0x62C9,0x5587,0x8721,/* 0xA8-0xAF */ + 0x814A,0x8FA3,0x5566,0x83B1,0x6765,0x8D56,0x84DD,0x5A6A,/* 0xB0-0xB7 */ + 0x680F,0x62E6,0x7BEE,0x9611,0x5170,0x6F9C,0x8C30,0x63FD,/* 0xB8-0xBF */ + 0x89C8,0x61D2,0x7F06,0x70C2,0x6EE5,0x7405,0x6994,0x72FC,/* 0xC0-0xC7 */ + 0x5ECA,0x90CE,0x6717,0x6D6A,0x635E,0x52B3,0x7262,0x8001,/* 0xC8-0xCF */ + 0x4F6C,0x59E5,0x916A,0x70D9,0x6D9D,0x52D2,0x4E50,0x96F7,/* 0xD0-0xD7 */ + 0x956D,0x857E,0x78CA,0x7D2F,0x5121,0x5792,0x64C2,0x808B,/* 0xD8-0xDF */ + 0x7C7B,0x6CEA,0x68F1,0x695E,0x51B7,0x5398,0x68A8,0x7281,/* 0xE0-0xE7 */ + 0x9ECE,0x7BF1,0x72F8,0x79BB,0x6F13,0x7406,0x674E,0x91CC,/* 0xE8-0xEF */ + 0x9CA4,0x793C,0x8389,0x8354,0x540F,0x6817,0x4E3D,0x5389,/* 0xF0-0xF7 */ + 0x52B1,0x783E,0x5386,0x5229,0x5088,0x4F8B,0x4FD0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F56,0x7F59,0x7F5B,0x7F5C,0x7F5D,0x7F5E,0x7F60,0x7F63,/* 0x40-0x47 */ + 0x7F64,0x7F65,0x7F66,0x7F67,0x7F6B,0x7F6C,0x7F6D,0x7F6F,/* 0x48-0x4F */ + 0x7F70,0x7F73,0x7F75,0x7F76,0x7F77,0x7F78,0x7F7A,0x7F7B,/* 0x50-0x57 */ + 0x7F7C,0x7F7D,0x7F7F,0x7F80,0x7F82,0x7F83,0x7F84,0x7F85,/* 0x58-0x5F */ + 0x7F86,0x7F87,0x7F88,0x7F89,0x7F8B,0x7F8D,0x7F8F,0x7F90,/* 0x60-0x67 */ + 0x7F91,0x7F92,0x7F93,0x7F95,0x7F96,0x7F97,0x7F98,0x7F99,/* 0x68-0x6F */ + 0x7F9B,0x7F9C,0x7FA0,0x7FA2,0x7FA3,0x7FA5,0x7FA6,0x7FA8,/* 0x70-0x77 */ + 0x7FA9,0x7FAA,0x7FAB,0x7FAC,0x7FAD,0x7FAE,0x7FB1,0x0000,/* 0x78-0x7F */ + + 0x7FB3,0x7FB4,0x7FB5,0x7FB6,0x7FB7,0x7FBA,0x7FBB,0x7FBE,/* 0x80-0x87 */ + 0x7FC0,0x7FC2,0x7FC3,0x7FC4,0x7FC6,0x7FC7,0x7FC8,0x7FC9,/* 0x88-0x8F */ + 0x7FCB,0x7FCD,0x7FCF,0x7FD0,0x7FD1,0x7FD2,0x7FD3,0x7FD6,/* 0x90-0x97 */ + 0x7FD7,0x7FD9,0x7FDA,0x7FDB,0x7FDC,0x7FDD,0x7FDE,0x7FE2,/* 0x98-0x9F */ + 0x7FE3,0x75E2,0x7ACB,0x7C92,0x6CA5,0x96B6,0x529B,0x7483,/* 0xA0-0xA7 */ + 0x54E9,0x4FE9,0x8054,0x83B2,0x8FDE,0x9570,0x5EC9,0x601C,/* 0xA8-0xAF */ + 0x6D9F,0x5E18,0x655B,0x8138,0x94FE,0x604B,0x70BC,0x7EC3,/* 0xB0-0xB7 */ + 0x7CAE,0x51C9,0x6881,0x7CB1,0x826F,0x4E24,0x8F86,0x91CF,/* 0xB8-0xBF */ + 0x667E,0x4EAE,0x8C05,0x64A9,0x804A,0x50DA,0x7597,0x71CE,/* 0xC0-0xC7 */ + 0x5BE5,0x8FBD,0x6F66,0x4E86,0x6482,0x9563,0x5ED6,0x6599,/* 0xC8-0xCF */ + 0x5217,0x88C2,0x70C8,0x52A3,0x730E,0x7433,0x6797,0x78F7,/* 0xD0-0xD7 */ + 0x9716,0x4E34,0x90BB,0x9CDE,0x6DCB,0x51DB,0x8D41,0x541D,/* 0xD8-0xDF */ + 0x62CE,0x73B2,0x83F1,0x96F6,0x9F84,0x94C3,0x4F36,0x7F9A,/* 0xE0-0xE7 */ + 0x51CC,0x7075,0x9675,0x5CAD,0x9886,0x53E6,0x4EE4,0x6E9C,/* 0xE8-0xEF */ + 0x7409,0x69B4,0x786B,0x998F,0x7559,0x5218,0x7624,0x6D41,/* 0xF0-0xF7 */ + 0x67F3,0x516D,0x9F99,0x804B,0x5499,0x7B3C,0x7ABF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7FE4,0x7FE7,0x7FE8,0x7FEA,0x7FEB,0x7FEC,0x7FED,0x7FEF,/* 0x40-0x47 */ + 0x7FF2,0x7FF4,0x7FF5,0x7FF6,0x7FF7,0x7FF8,0x7FF9,0x7FFA,/* 0x48-0x4F */ + 0x7FFD,0x7FFE,0x7FFF,0x8002,0x8007,0x8008,0x8009,0x800A,/* 0x50-0x57 */ + 0x800E,0x800F,0x8011,0x8013,0x801A,0x801B,0x801D,0x801E,/* 0x58-0x5F */ + 0x801F,0x8021,0x8023,0x8024,0x802B,0x802C,0x802D,0x802E,/* 0x60-0x67 */ + 0x802F,0x8030,0x8032,0x8034,0x8039,0x803A,0x803C,0x803E,/* 0x68-0x6F */ + 0x8040,0x8041,0x8044,0x8045,0x8047,0x8048,0x8049,0x804E,/* 0x70-0x77 */ + 0x804F,0x8050,0x8051,0x8053,0x8055,0x8056,0x8057,0x0000,/* 0x78-0x7F */ + + 0x8059,0x805B,0x805C,0x805D,0x805E,0x805F,0x8060,0x8061,/* 0x80-0x87 */ + 0x8062,0x8063,0x8064,0x8065,0x8066,0x8067,0x8068,0x806B,/* 0x88-0x8F */ + 0x806C,0x806D,0x806E,0x806F,0x8070,0x8072,0x8073,0x8074,/* 0x90-0x97 */ + 0x8075,0x8076,0x8077,0x8078,0x8079,0x807A,0x807B,0x807C,/* 0x98-0x9F */ + 0x807D,0x9686,0x5784,0x62E2,0x9647,0x697C,0x5A04,0x6402,/* 0xA0-0xA7 */ + 0x7BD3,0x6F0F,0x964B,0x82A6,0x5362,0x9885,0x5E90,0x7089,/* 0xA8-0xAF */ + 0x63B3,0x5364,0x864F,0x9C81,0x9E93,0x788C,0x9732,0x8DEF,/* 0xB0-0xB7 */ + 0x8D42,0x9E7F,0x6F5E,0x7984,0x5F55,0x9646,0x622E,0x9A74,/* 0xB8-0xBF */ + 0x5415,0x94DD,0x4FA3,0x65C5,0x5C65,0x5C61,0x7F15,0x8651,/* 0xC0-0xC7 */ + 0x6C2F,0x5F8B,0x7387,0x6EE4,0x7EFF,0x5CE6,0x631B,0x5B6A,/* 0xC8-0xCF */ + 0x6EE6,0x5375,0x4E71,0x63A0,0x7565,0x62A1,0x8F6E,0x4F26,/* 0xD0-0xD7 */ + 0x4ED1,0x6CA6,0x7EB6,0x8BBA,0x841D,0x87BA,0x7F57,0x903B,/* 0xD8-0xDF */ + 0x9523,0x7BA9,0x9AA1,0x88F8,0x843D,0x6D1B,0x9A86,0x7EDC,/* 0xE0-0xE7 */ + 0x5988,0x9EBB,0x739B,0x7801,0x8682,0x9A6C,0x9A82,0x561B,/* 0xE8-0xEF */ + 0x5417,0x57CB,0x4E70,0x9EA6,0x5356,0x8FC8,0x8109,0x7792,/* 0xF0-0xF7 */ + 0x9992,0x86EE,0x6EE1,0x8513,0x66FC,0x6162,0x6F2B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x807E,0x8081,0x8082,0x8085,0x8088,0x808A,0x808D,0x808E,/* 0x40-0x47 */ + 0x808F,0x8090,0x8091,0x8092,0x8094,0x8095,0x8097,0x8099,/* 0x48-0x4F */ + 0x809E,0x80A3,0x80A6,0x80A7,0x80A8,0x80AC,0x80B0,0x80B3,/* 0x50-0x57 */ + 0x80B5,0x80B6,0x80B8,0x80B9,0x80BB,0x80C5,0x80C7,0x80C8,/* 0x58-0x5F */ + 0x80C9,0x80CA,0x80CB,0x80CF,0x80D0,0x80D1,0x80D2,0x80D3,/* 0x60-0x67 */ + 0x80D4,0x80D5,0x80D8,0x80DF,0x80E0,0x80E2,0x80E3,0x80E6,/* 0x68-0x6F */ + 0x80EE,0x80F5,0x80F7,0x80F9,0x80FB,0x80FE,0x80FF,0x8100,/* 0x70-0x77 */ + 0x8101,0x8103,0x8104,0x8105,0x8107,0x8108,0x810B,0x0000,/* 0x78-0x7F */ + + 0x810C,0x8115,0x8117,0x8119,0x811B,0x811C,0x811D,0x811F,/* 0x80-0x87 */ + 0x8120,0x8121,0x8122,0x8123,0x8124,0x8125,0x8126,0x8127,/* 0x88-0x8F */ + 0x8128,0x8129,0x812A,0x812B,0x812D,0x812E,0x8130,0x8133,/* 0x90-0x97 */ + 0x8134,0x8135,0x8137,0x8139,0x813A,0x813B,0x813C,0x813D,/* 0x98-0x9F */ + 0x813F,0x8C29,0x8292,0x832B,0x76F2,0x6C13,0x5FD9,0x83BD,/* 0xA0-0xA7 */ + 0x732B,0x8305,0x951A,0x6BDB,0x77DB,0x94C6,0x536F,0x8302,/* 0xA8-0xAF */ + 0x5192,0x5E3D,0x8C8C,0x8D38,0x4E48,0x73AB,0x679A,0x6885,/* 0xB0-0xB7 */ + 0x9176,0x9709,0x7164,0x6CA1,0x7709,0x5A92,0x9541,0x6BCF,/* 0xB8-0xBF */ + 0x7F8E,0x6627,0x5BD0,0x59B9,0x5A9A,0x95E8,0x95F7,0x4EEC,/* 0xC0-0xC7 */ + 0x840C,0x8499,0x6AAC,0x76DF,0x9530,0x731B,0x68A6,0x5B5F,/* 0xC8-0xCF */ + 0x772F,0x919A,0x9761,0x7CDC,0x8FF7,0x8C1C,0x5F25,0x7C73,/* 0xD0-0xD7 */ + 0x79D8,0x89C5,0x6CCC,0x871C,0x5BC6,0x5E42,0x68C9,0x7720,/* 0xD8-0xDF */ + 0x7EF5,0x5195,0x514D,0x52C9,0x5A29,0x7F05,0x9762,0x82D7,/* 0xE0-0xE7 */ + 0x63CF,0x7784,0x85D0,0x79D2,0x6E3A,0x5E99,0x5999,0x8511,/* 0xE8-0xEF */ + 0x706D,0x6C11,0x62BF,0x76BF,0x654F,0x60AF,0x95FD,0x660E,/* 0xF0-0xF7 */ + 0x879F,0x9E23,0x94ED,0x540D,0x547D,0x8C2C,0x6478,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8140,0x8141,0x8142,0x8143,0x8144,0x8145,0x8147,0x8149,/* 0x40-0x47 */ + 0x814D,0x814E,0x814F,0x8152,0x8156,0x8157,0x8158,0x815B,/* 0x48-0x4F */ + 0x815C,0x815D,0x815E,0x815F,0x8161,0x8162,0x8163,0x8164,/* 0x50-0x57 */ + 0x8166,0x8168,0x816A,0x816B,0x816C,0x816F,0x8172,0x8173,/* 0x58-0x5F */ + 0x8175,0x8176,0x8177,0x8178,0x8181,0x8183,0x8184,0x8185,/* 0x60-0x67 */ + 0x8186,0x8187,0x8189,0x818B,0x818C,0x818D,0x818E,0x8190,/* 0x68-0x6F */ + 0x8192,0x8193,0x8194,0x8195,0x8196,0x8197,0x8199,0x819A,/* 0x70-0x77 */ + 0x819E,0x819F,0x81A0,0x81A1,0x81A2,0x81A4,0x81A5,0x0000,/* 0x78-0x7F */ + + 0x81A7,0x81A9,0x81AB,0x81AC,0x81AD,0x81AE,0x81AF,0x81B0,/* 0x80-0x87 */ + 0x81B1,0x81B2,0x81B4,0x81B5,0x81B6,0x81B7,0x81B8,0x81B9,/* 0x88-0x8F */ + 0x81BC,0x81BD,0x81BE,0x81BF,0x81C4,0x81C5,0x81C7,0x81C8,/* 0x90-0x97 */ + 0x81C9,0x81CB,0x81CD,0x81CE,0x81CF,0x81D0,0x81D1,0x81D2,/* 0x98-0x9F */ + 0x81D3,0x6479,0x8611,0x6A21,0x819C,0x78E8,0x6469,0x9B54,/* 0xA0-0xA7 */ + 0x62B9,0x672B,0x83AB,0x58A8,0x9ED8,0x6CAB,0x6F20,0x5BDE,/* 0xA8-0xAF */ + 0x964C,0x8C0B,0x725F,0x67D0,0x62C7,0x7261,0x4EA9,0x59C6,/* 0xB0-0xB7 */ + 0x6BCD,0x5893,0x66AE,0x5E55,0x52DF,0x6155,0x6728,0x76EE,/* 0xB8-0xBF */ + 0x7766,0x7267,0x7A46,0x62FF,0x54EA,0x5450,0x94A0,0x90A3,/* 0xC0-0xC7 */ + 0x5A1C,0x7EB3,0x6C16,0x4E43,0x5976,0x8010,0x5948,0x5357,/* 0xC8-0xCF */ + 0x7537,0x96BE,0x56CA,0x6320,0x8111,0x607C,0x95F9,0x6DD6,/* 0xD0-0xD7 */ + 0x5462,0x9981,0x5185,0x5AE9,0x80FD,0x59AE,0x9713,0x502A,/* 0xD8-0xDF */ + 0x6CE5,0x5C3C,0x62DF,0x4F60,0x533F,0x817B,0x9006,0x6EBA,/* 0xE0-0xE7 */ + 0x852B,0x62C8,0x5E74,0x78BE,0x64B5,0x637B,0x5FF5,0x5A18,/* 0xE8-0xEF */ + 0x917F,0x9E1F,0x5C3F,0x634F,0x8042,0x5B7D,0x556E,0x954A,/* 0xF0-0xF7 */ + 0x954D,0x6D85,0x60A8,0x67E0,0x72DE,0x51DD,0x5B81,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x81D4,0x81D5,0x81D6,0x81D7,0x81D8,0x81D9,0x81DA,0x81DB,/* 0x40-0x47 */ + 0x81DC,0x81DD,0x81DE,0x81DF,0x81E0,0x81E1,0x81E2,0x81E4,/* 0x48-0x4F */ + 0x81E5,0x81E6,0x81E8,0x81E9,0x81EB,0x81EE,0x81EF,0x81F0,/* 0x50-0x57 */ + 0x81F1,0x81F2,0x81F5,0x81F6,0x81F7,0x81F8,0x81F9,0x81FA,/* 0x58-0x5F */ + 0x81FD,0x81FF,0x8203,0x8207,0x8208,0x8209,0x820A,0x820B,/* 0x60-0x67 */ + 0x820E,0x820F,0x8211,0x8213,0x8215,0x8216,0x8217,0x8218,/* 0x68-0x6F */ + 0x8219,0x821A,0x821D,0x8220,0x8224,0x8225,0x8226,0x8227,/* 0x70-0x77 */ + 0x8229,0x822E,0x8232,0x823A,0x823C,0x823D,0x823F,0x0000,/* 0x78-0x7F */ + + 0x8240,0x8241,0x8242,0x8243,0x8245,0x8246,0x8248,0x824A,/* 0x80-0x87 */ + 0x824C,0x824D,0x824E,0x8250,0x8251,0x8252,0x8253,0x8254,/* 0x88-0x8F */ + 0x8255,0x8256,0x8257,0x8259,0x825B,0x825C,0x825D,0x825E,/* 0x90-0x97 */ + 0x8260,0x8261,0x8262,0x8263,0x8264,0x8265,0x8266,0x8267,/* 0x98-0x9F */ + 0x8269,0x62E7,0x6CDE,0x725B,0x626D,0x94AE,0x7EBD,0x8113,/* 0xA0-0xA7 */ + 0x6D53,0x519C,0x5F04,0x5974,0x52AA,0x6012,0x5973,0x6696,/* 0xA8-0xAF */ + 0x8650,0x759F,0x632A,0x61E6,0x7CEF,0x8BFA,0x54E6,0x6B27,/* 0xB0-0xB7 */ + 0x9E25,0x6BB4,0x85D5,0x5455,0x5076,0x6CA4,0x556A,0x8DB4,/* 0xB8-0xBF */ + 0x722C,0x5E15,0x6015,0x7436,0x62CD,0x6392,0x724C,0x5F98,/* 0xC0-0xC7 */ + 0x6E43,0x6D3E,0x6500,0x6F58,0x76D8,0x78D0,0x76FC,0x7554,/* 0xC8-0xCF */ + 0x5224,0x53DB,0x4E53,0x5E9E,0x65C1,0x802A,0x80D6,0x629B,/* 0xD0-0xD7 */ + 0x5486,0x5228,0x70AE,0x888D,0x8DD1,0x6CE1,0x5478,0x80DA,/* 0xD8-0xDF */ + 0x57F9,0x88F4,0x8D54,0x966A,0x914D,0x4F69,0x6C9B,0x55B7,/* 0xE0-0xE7 */ + 0x76C6,0x7830,0x62A8,0x70F9,0x6F8E,0x5F6D,0x84EC,0x68DA,/* 0xE8-0xEF */ + 0x787C,0x7BF7,0x81A8,0x670B,0x9E4F,0x6367,0x78B0,0x576F,/* 0xF0-0xF7 */ + 0x7812,0x9739,0x6279,0x62AB,0x5288,0x7435,0x6BD7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x826A,0x826B,0x826C,0x826D,0x8271,0x8275,0x8276,0x8277,/* 0x40-0x47 */ + 0x8278,0x827B,0x827C,0x8280,0x8281,0x8283,0x8285,0x8286,/* 0x48-0x4F */ + 0x8287,0x8289,0x828C,0x8290,0x8293,0x8294,0x8295,0x8296,/* 0x50-0x57 */ + 0x829A,0x829B,0x829E,0x82A0,0x82A2,0x82A3,0x82A7,0x82B2,/* 0x58-0x5F */ + 0x82B5,0x82B6,0x82BA,0x82BB,0x82BC,0x82BF,0x82C0,0x82C2,/* 0x60-0x67 */ + 0x82C3,0x82C5,0x82C6,0x82C9,0x82D0,0x82D6,0x82D9,0x82DA,/* 0x68-0x6F */ + 0x82DD,0x82E2,0x82E7,0x82E8,0x82E9,0x82EA,0x82EC,0x82ED,/* 0x70-0x77 */ + 0x82EE,0x82F0,0x82F2,0x82F3,0x82F5,0x82F6,0x82F8,0x0000,/* 0x78-0x7F */ + + 0x82FA,0x82FC,0x82FD,0x82FE,0x82FF,0x8300,0x830A,0x830B,/* 0x80-0x87 */ + 0x830D,0x8310,0x8312,0x8313,0x8316,0x8318,0x8319,0x831D,/* 0x88-0x8F */ + 0x831E,0x831F,0x8320,0x8321,0x8322,0x8323,0x8324,0x8325,/* 0x90-0x97 */ + 0x8326,0x8329,0x832A,0x832E,0x8330,0x8332,0x8337,0x833B,/* 0x98-0x9F */ + 0x833D,0x5564,0x813E,0x75B2,0x76AE,0x5339,0x75DE,0x50FB,/* 0xA0-0xA7 */ + 0x5C41,0x8B6C,0x7BC7,0x504F,0x7247,0x9A97,0x98D8,0x6F02,/* 0xA8-0xAF */ + 0x74E2,0x7968,0x6487,0x77A5,0x62FC,0x9891,0x8D2B,0x54C1,/* 0xB0-0xB7 */ + 0x8058,0x4E52,0x576A,0x82F9,0x840D,0x5E73,0x51ED,0x74F6,/* 0xB8-0xBF */ + 0x8BC4,0x5C4F,0x5761,0x6CFC,0x9887,0x5A46,0x7834,0x9B44,/* 0xC0-0xC7 */ + 0x8FEB,0x7C95,0x5256,0x6251,0x94FA,0x4EC6,0x8386,0x8461,/* 0xC8-0xCF */ + 0x83E9,0x84B2,0x57D4,0x6734,0x5703,0x666E,0x6D66,0x8C31,/* 0xD0-0xD7 */ + 0x66DD,0x7011,0x671F,0x6B3A,0x6816,0x621A,0x59BB,0x4E03,/* 0xD8-0xDF */ + 0x51C4,0x6F06,0x67D2,0x6C8F,0x5176,0x68CB,0x5947,0x6B67,/* 0xE0-0xE7 */ + 0x7566,0x5D0E,0x8110,0x9F50,0x65D7,0x7948,0x7941,0x9A91,/* 0xE8-0xEF */ + 0x8D77,0x5C82,0x4E5E,0x4F01,0x542F,0x5951,0x780C,0x5668,/* 0xF0-0xF7 */ + 0x6C14,0x8FC4,0x5F03,0x6C7D,0x6CE3,0x8BAB,0x6390,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x833E,0x833F,0x8341,0x8342,0x8344,0x8345,0x8348,0x834A,/* 0x40-0x47 */ + 0x834B,0x834C,0x834D,0x834E,0x8353,0x8355,0x8356,0x8357,/* 0x48-0x4F */ + 0x8358,0x8359,0x835D,0x8362,0x8370,0x8371,0x8372,0x8373,/* 0x50-0x57 */ + 0x8374,0x8375,0x8376,0x8379,0x837A,0x837E,0x837F,0x8380,/* 0x58-0x5F */ + 0x8381,0x8382,0x8383,0x8384,0x8387,0x8388,0x838A,0x838B,/* 0x60-0x67 */ + 0x838C,0x838D,0x838F,0x8390,0x8391,0x8394,0x8395,0x8396,/* 0x68-0x6F */ + 0x8397,0x8399,0x839A,0x839D,0x839F,0x83A1,0x83A2,0x83A3,/* 0x70-0x77 */ + 0x83A4,0x83A5,0x83A6,0x83A7,0x83AC,0x83AD,0x83AE,0x0000,/* 0x78-0x7F */ + + 0x83AF,0x83B5,0x83BB,0x83BE,0x83BF,0x83C2,0x83C3,0x83C4,/* 0x80-0x87 */ + 0x83C6,0x83C8,0x83C9,0x83CB,0x83CD,0x83CE,0x83D0,0x83D1,/* 0x88-0x8F */ + 0x83D2,0x83D3,0x83D5,0x83D7,0x83D9,0x83DA,0x83DB,0x83DE,/* 0x90-0x97 */ + 0x83E2,0x83E3,0x83E4,0x83E6,0x83E7,0x83E8,0x83EB,0x83EC,/* 0x98-0x9F */ + 0x83ED,0x6070,0x6D3D,0x7275,0x6266,0x948E,0x94C5,0x5343,/* 0xA0-0xA7 */ + 0x8FC1,0x7B7E,0x4EDF,0x8C26,0x4E7E,0x9ED4,0x94B1,0x94B3,/* 0xA8-0xAF */ + 0x524D,0x6F5C,0x9063,0x6D45,0x8C34,0x5811,0x5D4C,0x6B20,/* 0xB0-0xB7 */ + 0x6B49,0x67AA,0x545B,0x8154,0x7F8C,0x5899,0x8537,0x5F3A,/* 0xB8-0xBF */ + 0x62A2,0x6A47,0x9539,0x6572,0x6084,0x6865,0x77A7,0x4E54,/* 0xC0-0xC7 */ + 0x4FA8,0x5DE7,0x9798,0x64AC,0x7FD8,0x5CED,0x4FCF,0x7A8D,/* 0xC8-0xCF */ + 0x5207,0x8304,0x4E14,0x602F,0x7A83,0x94A6,0x4FB5,0x4EB2,/* 0xD0-0xD7 */ + 0x79E6,0x7434,0x52E4,0x82B9,0x64D2,0x79BD,0x5BDD,0x6C81,/* 0xD8-0xDF */ + 0x9752,0x8F7B,0x6C22,0x503E,0x537F,0x6E05,0x64CE,0x6674,/* 0xE0-0xE7 */ + 0x6C30,0x60C5,0x9877,0x8BF7,0x5E86,0x743C,0x7A77,0x79CB,/* 0xE8-0xEF */ + 0x4E18,0x90B1,0x7403,0x6C42,0x56DA,0x914B,0x6CC5,0x8D8B,/* 0xF0-0xF7 */ + 0x533A,0x86C6,0x66F2,0x8EAF,0x5C48,0x9A71,0x6E20,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x83EE,0x83EF,0x83F3,0x83F4,0x83F5,0x83F6,0x83F7,0x83FA,/* 0x40-0x47 */ + 0x83FB,0x83FC,0x83FE,0x83FF,0x8400,0x8402,0x8405,0x8407,/* 0x48-0x4F */ + 0x8408,0x8409,0x840A,0x8410,0x8412,0x8413,0x8414,0x8415,/* 0x50-0x57 */ + 0x8416,0x8417,0x8419,0x841A,0x841B,0x841E,0x841F,0x8420,/* 0x58-0x5F */ + 0x8421,0x8422,0x8423,0x8429,0x842A,0x842B,0x842C,0x842D,/* 0x60-0x67 */ + 0x842E,0x842F,0x8430,0x8432,0x8433,0x8434,0x8435,0x8436,/* 0x68-0x6F */ + 0x8437,0x8439,0x843A,0x843B,0x843E,0x843F,0x8440,0x8441,/* 0x70-0x77 */ + 0x8442,0x8443,0x8444,0x8445,0x8447,0x8448,0x8449,0x0000,/* 0x78-0x7F */ + + 0x844A,0x844B,0x844C,0x844D,0x844E,0x844F,0x8450,0x8452,/* 0x80-0x87 */ + 0x8453,0x8454,0x8455,0x8456,0x8458,0x845D,0x845E,0x845F,/* 0x88-0x8F */ + 0x8460,0x8462,0x8464,0x8465,0x8466,0x8467,0x8468,0x846A,/* 0x90-0x97 */ + 0x846E,0x846F,0x8470,0x8472,0x8474,0x8477,0x8479,0x847B,/* 0x98-0x9F */ + 0x847C,0x53D6,0x5A36,0x9F8B,0x8DA3,0x53BB,0x5708,0x98A7,/* 0xA0-0xA7 */ + 0x6743,0x919B,0x6CC9,0x5168,0x75CA,0x62F3,0x72AC,0x5238,/* 0xA8-0xAF */ + 0x529D,0x7F3A,0x7094,0x7638,0x5374,0x9E4A,0x69B7,0x786E,/* 0xB0-0xB7 */ + 0x96C0,0x88D9,0x7FA4,0x7136,0x71C3,0x5189,0x67D3,0x74E4,/* 0xB8-0xBF */ + 0x58E4,0x6518,0x56B7,0x8BA9,0x9976,0x6270,0x7ED5,0x60F9,/* 0xC0-0xC7 */ + 0x70ED,0x58EC,0x4EC1,0x4EBA,0x5FCD,0x97E7,0x4EFB,0x8BA4,/* 0xC8-0xCF */ + 0x5203,0x598A,0x7EAB,0x6254,0x4ECD,0x65E5,0x620E,0x8338,/* 0xD0-0xD7 */ + 0x84C9,0x8363,0x878D,0x7194,0x6EB6,0x5BB9,0x7ED2,0x5197,/* 0xD8-0xDF */ + 0x63C9,0x67D4,0x8089,0x8339,0x8815,0x5112,0x5B7A,0x5982,/* 0xE0-0xE7 */ + 0x8FB1,0x4E73,0x6C5D,0x5165,0x8925,0x8F6F,0x962E,0x854A,/* 0xE8-0xEF */ + 0x745E,0x9510,0x95F0,0x6DA6,0x82E5,0x5F31,0x6492,0x6D12,/* 0xF0-0xF7 */ + 0x8428,0x816E,0x9CC3,0x585E,0x8D5B,0x4E09,0x53C1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x847D,0x847E,0x847F,0x8480,0x8481,0x8483,0x8484,0x8485,/* 0x40-0x47 */ + 0x8486,0x848A,0x848D,0x848F,0x8490,0x8491,0x8492,0x8493,/* 0x48-0x4F */ + 0x8494,0x8495,0x8496,0x8498,0x849A,0x849B,0x849D,0x849E,/* 0x50-0x57 */ + 0x849F,0x84A0,0x84A2,0x84A3,0x84A4,0x84A5,0x84A6,0x84A7,/* 0x58-0x5F */ + 0x84A8,0x84A9,0x84AA,0x84AB,0x84AC,0x84AD,0x84AE,0x84B0,/* 0x60-0x67 */ + 0x84B1,0x84B3,0x84B5,0x84B6,0x84B7,0x84BB,0x84BC,0x84BE,/* 0x68-0x6F */ + 0x84C0,0x84C2,0x84C3,0x84C5,0x84C6,0x84C7,0x84C8,0x84CB,/* 0x70-0x77 */ + 0x84CC,0x84CE,0x84CF,0x84D2,0x84D4,0x84D5,0x84D7,0x0000,/* 0x78-0x7F */ + + 0x84D8,0x84D9,0x84DA,0x84DB,0x84DC,0x84DE,0x84E1,0x84E2,/* 0x80-0x87 */ + 0x84E4,0x84E7,0x84E8,0x84E9,0x84EA,0x84EB,0x84ED,0x84EE,/* 0x88-0x8F */ + 0x84EF,0x84F1,0x84F2,0x84F3,0x84F4,0x84F5,0x84F6,0x84F7,/* 0x90-0x97 */ + 0x84F8,0x84F9,0x84FA,0x84FB,0x84FD,0x84FE,0x8500,0x8501,/* 0x98-0x9F */ + 0x8502,0x4F1E,0x6563,0x6851,0x55D3,0x4E27,0x6414,0x9A9A,/* 0xA0-0xA7 */ + 0x626B,0x5AC2,0x745F,0x8272,0x6DA9,0x68EE,0x50E7,0x838E,/* 0xA8-0xAF */ + 0x7802,0x6740,0x5239,0x6C99,0x7EB1,0x50BB,0x5565,0x715E,/* 0xB0-0xB7 */ + 0x7B5B,0x6652,0x73CA,0x82EB,0x6749,0x5C71,0x5220,0x717D,/* 0xB8-0xBF */ + 0x886B,0x95EA,0x9655,0x64C5,0x8D61,0x81B3,0x5584,0x6C55,/* 0xC0-0xC7 */ + 0x6247,0x7F2E,0x5892,0x4F24,0x5546,0x8D4F,0x664C,0x4E0A,/* 0xC8-0xCF */ + 0x5C1A,0x88F3,0x68A2,0x634E,0x7A0D,0x70E7,0x828D,0x52FA,/* 0xD0-0xD7 */ + 0x97F6,0x5C11,0x54E8,0x90B5,0x7ECD,0x5962,0x8D4A,0x86C7,/* 0xD8-0xDF */ + 0x820C,0x820D,0x8D66,0x6444,0x5C04,0x6151,0x6D89,0x793E,/* 0xE0-0xE7 */ + 0x8BBE,0x7837,0x7533,0x547B,0x4F38,0x8EAB,0x6DF1,0x5A20,/* 0xE8-0xEF */ + 0x7EC5,0x795E,0x6C88,0x5BA1,0x5A76,0x751A,0x80BE,0x614E,/* 0xF0-0xF7 */ + 0x6E17,0x58F0,0x751F,0x7525,0x7272,0x5347,0x7EF3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8503,0x8504,0x8505,0x8506,0x8507,0x8508,0x8509,0x850A,/* 0x40-0x47 */ + 0x850B,0x850D,0x850E,0x850F,0x8510,0x8512,0x8514,0x8515,/* 0x48-0x4F */ + 0x8516,0x8518,0x8519,0x851B,0x851C,0x851D,0x851E,0x8520,/* 0x50-0x57 */ + 0x8522,0x8523,0x8524,0x8525,0x8526,0x8527,0x8528,0x8529,/* 0x58-0x5F */ + 0x852A,0x852D,0x852E,0x852F,0x8530,0x8531,0x8532,0x8533,/* 0x60-0x67 */ + 0x8534,0x8535,0x8536,0x853E,0x853F,0x8540,0x8541,0x8542,/* 0x68-0x6F */ + 0x8544,0x8545,0x8546,0x8547,0x854B,0x854C,0x854D,0x854E,/* 0x70-0x77 */ + 0x854F,0x8550,0x8551,0x8552,0x8553,0x8554,0x8555,0x0000,/* 0x78-0x7F */ + + 0x8557,0x8558,0x855A,0x855B,0x855C,0x855D,0x855F,0x8560,/* 0x80-0x87 */ + 0x8561,0x8562,0x8563,0x8565,0x8566,0x8567,0x8569,0x856A,/* 0x88-0x8F */ + 0x856B,0x856C,0x856D,0x856E,0x856F,0x8570,0x8571,0x8573,/* 0x90-0x97 */ + 0x8575,0x8576,0x8577,0x8578,0x857C,0x857D,0x857F,0x8580,/* 0x98-0x9F */ + 0x8581,0x7701,0x76DB,0x5269,0x80DC,0x5723,0x5E08,0x5931,/* 0xA0-0xA7 */ + 0x72EE,0x65BD,0x6E7F,0x8BD7,0x5C38,0x8671,0x5341,0x77F3,/* 0xA8-0xAF */ + 0x62FE,0x65F6,0x4EC0,0x98DF,0x8680,0x5B9E,0x8BC6,0x53F2,/* 0xB0-0xB7 */ + 0x77E2,0x4F7F,0x5C4E,0x9A76,0x59CB,0x5F0F,0x793A,0x58EB,/* 0xB8-0xBF */ + 0x4E16,0x67FF,0x4E8B,0x62ED,0x8A93,0x901D,0x52BF,0x662F,/* 0xC0-0xC7 */ + 0x55DC,0x566C,0x9002,0x4ED5,0x4F8D,0x91CA,0x9970,0x6C0F,/* 0xC8-0xCF */ + 0x5E02,0x6043,0x5BA4,0x89C6,0x8BD5,0x6536,0x624B,0x9996,/* 0xD0-0xD7 */ + 0x5B88,0x5BFF,0x6388,0x552E,0x53D7,0x7626,0x517D,0x852C,/* 0xD8-0xDF */ + 0x67A2,0x68B3,0x6B8A,0x6292,0x8F93,0x53D4,0x8212,0x6DD1,/* 0xE0-0xE7 */ + 0x758F,0x4E66,0x8D4E,0x5B70,0x719F,0x85AF,0x6691,0x66D9,/* 0xE8-0xEF */ + 0x7F72,0x8700,0x9ECD,0x9F20,0x5C5E,0x672F,0x8FF0,0x6811,/* 0xF0-0xF7 */ + 0x675F,0x620D,0x7AD6,0x5885,0x5EB6,0x6570,0x6F31,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8582,0x8583,0x8586,0x8588,0x8589,0x858A,0x858B,0x858C,/* 0x40-0x47 */ + 0x858D,0x858E,0x8590,0x8591,0x8592,0x8593,0x8594,0x8595,/* 0x48-0x4F */ + 0x8596,0x8597,0x8598,0x8599,0x859A,0x859D,0x859E,0x859F,/* 0x50-0x57 */ + 0x85A0,0x85A1,0x85A2,0x85A3,0x85A5,0x85A6,0x85A7,0x85A9,/* 0x58-0x5F */ + 0x85AB,0x85AC,0x85AD,0x85B1,0x85B2,0x85B3,0x85B4,0x85B5,/* 0x60-0x67 */ + 0x85B6,0x85B8,0x85BA,0x85BB,0x85BC,0x85BD,0x85BE,0x85BF,/* 0x68-0x6F */ + 0x85C0,0x85C2,0x85C3,0x85C4,0x85C5,0x85C6,0x85C7,0x85C8,/* 0x70-0x77 */ + 0x85CA,0x85CB,0x85CC,0x85CD,0x85CE,0x85D1,0x85D2,0x0000,/* 0x78-0x7F */ + + 0x85D4,0x85D6,0x85D7,0x85D8,0x85D9,0x85DA,0x85DB,0x85DD,/* 0x80-0x87 */ + 0x85DE,0x85DF,0x85E0,0x85E1,0x85E2,0x85E3,0x85E5,0x85E6,/* 0x88-0x8F */ + 0x85E7,0x85E8,0x85EA,0x85EB,0x85EC,0x85ED,0x85EE,0x85EF,/* 0x90-0x97 */ + 0x85F0,0x85F1,0x85F2,0x85F3,0x85F4,0x85F5,0x85F6,0x85F7,/* 0x98-0x9F */ + 0x85F8,0x6055,0x5237,0x800D,0x6454,0x8870,0x7529,0x5E05,/* 0xA0-0xA7 */ + 0x6813,0x62F4,0x971C,0x53CC,0x723D,0x8C01,0x6C34,0x7761,/* 0xA8-0xAF */ + 0x7A0E,0x542E,0x77AC,0x987A,0x821C,0x8BF4,0x7855,0x6714,/* 0xB0-0xB7 */ + 0x70C1,0x65AF,0x6495,0x5636,0x601D,0x79C1,0x53F8,0x4E1D,/* 0xB8-0xBF */ + 0x6B7B,0x8086,0x5BFA,0x55E3,0x56DB,0x4F3A,0x4F3C,0x9972,/* 0xC0-0xC7 */ + 0x5DF3,0x677E,0x8038,0x6002,0x9882,0x9001,0x5B8B,0x8BBC,/* 0xC8-0xCF */ + 0x8BF5,0x641C,0x8258,0x64DE,0x55FD,0x82CF,0x9165,0x4FD7,/* 0xD0-0xD7 */ + 0x7D20,0x901F,0x7C9F,0x50F3,0x5851,0x6EAF,0x5BBF,0x8BC9,/* 0xD8-0xDF */ + 0x8083,0x9178,0x849C,0x7B97,0x867D,0x968B,0x968F,0x7EE5,/* 0xE0-0xE7 */ + 0x9AD3,0x788E,0x5C81,0x7A57,0x9042,0x96A7,0x795F,0x5B59,/* 0xE8-0xEF */ + 0x635F,0x7B0B,0x84D1,0x68AD,0x5506,0x7F29,0x7410,0x7D22,/* 0xF0-0xF7 */ + 0x9501,0x6240,0x584C,0x4ED6,0x5B83,0x5979,0x5854,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x85F9,0x85FA,0x85FC,0x85FD,0x85FE,0x8600,0x8601,0x8602,/* 0x40-0x47 */ + 0x8603,0x8604,0x8606,0x8607,0x8608,0x8609,0x860A,0x860B,/* 0x48-0x4F */ + 0x860C,0x860D,0x860E,0x860F,0x8610,0x8612,0x8613,0x8614,/* 0x50-0x57 */ + 0x8615,0x8617,0x8618,0x8619,0x861A,0x861B,0x861C,0x861D,/* 0x58-0x5F */ + 0x861E,0x861F,0x8620,0x8621,0x8622,0x8623,0x8624,0x8625,/* 0x60-0x67 */ + 0x8626,0x8628,0x862A,0x862B,0x862C,0x862D,0x862E,0x862F,/* 0x68-0x6F */ + 0x8630,0x8631,0x8632,0x8633,0x8634,0x8635,0x8636,0x8637,/* 0x70-0x77 */ + 0x8639,0x863A,0x863B,0x863D,0x863E,0x863F,0x8640,0x0000,/* 0x78-0x7F */ + + 0x8641,0x8642,0x8643,0x8644,0x8645,0x8646,0x8647,0x8648,/* 0x80-0x87 */ + 0x8649,0x864A,0x864B,0x864C,0x8652,0x8653,0x8655,0x8656,/* 0x88-0x8F */ + 0x8657,0x8658,0x8659,0x865B,0x865C,0x865D,0x865F,0x8660,/* 0x90-0x97 */ + 0x8661,0x8663,0x8664,0x8665,0x8666,0x8667,0x8668,0x8669,/* 0x98-0x9F */ + 0x866A,0x736D,0x631E,0x8E4B,0x8E0F,0x80CE,0x82D4,0x62AC,/* 0xA0-0xA7 */ + 0x53F0,0x6CF0,0x915E,0x592A,0x6001,0x6C70,0x574D,0x644A,/* 0xA8-0xAF */ + 0x8D2A,0x762B,0x6EE9,0x575B,0x6A80,0x75F0,0x6F6D,0x8C2D,/* 0xB0-0xB7 */ + 0x8C08,0x5766,0x6BEF,0x8892,0x78B3,0x63A2,0x53F9,0x70AD,/* 0xB8-0xBF */ + 0x6C64,0x5858,0x642A,0x5802,0x68E0,0x819B,0x5510,0x7CD6,/* 0xC0-0xC7 */ + 0x5018,0x8EBA,0x6DCC,0x8D9F,0x70EB,0x638F,0x6D9B,0x6ED4,/* 0xC8-0xCF */ + 0x7EE6,0x8404,0x6843,0x9003,0x6DD8,0x9676,0x8BA8,0x5957,/* 0xD0-0xD7 */ + 0x7279,0x85E4,0x817E,0x75BC,0x8A8A,0x68AF,0x5254,0x8E22,/* 0xD8-0xDF */ + 0x9511,0x63D0,0x9898,0x8E44,0x557C,0x4F53,0x66FF,0x568F,/* 0xE0-0xE7 */ + 0x60D5,0x6D95,0x5243,0x5C49,0x5929,0x6DFB,0x586B,0x7530,/* 0xE8-0xEF */ + 0x751C,0x606C,0x8214,0x8146,0x6311,0x6761,0x8FE2,0x773A,/* 0xF0-0xF7 */ + 0x8DF3,0x8D34,0x94C1,0x5E16,0x5385,0x542C,0x70C3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x866D,0x866F,0x8670,0x8672,0x8673,0x8674,0x8675,0x8676,/* 0x40-0x47 */ + 0x8677,0x8678,0x8683,0x8684,0x8685,0x8686,0x8687,0x8688,/* 0x48-0x4F */ + 0x8689,0x868E,0x868F,0x8690,0x8691,0x8692,0x8694,0x8696,/* 0x50-0x57 */ + 0x8697,0x8698,0x8699,0x869A,0x869B,0x869E,0x869F,0x86A0,/* 0x58-0x5F */ + 0x86A1,0x86A2,0x86A5,0x86A6,0x86AB,0x86AD,0x86AE,0x86B2,/* 0x60-0x67 */ + 0x86B3,0x86B7,0x86B8,0x86B9,0x86BB,0x86BC,0x86BD,0x86BE,/* 0x68-0x6F */ + 0x86BF,0x86C1,0x86C2,0x86C3,0x86C5,0x86C8,0x86CC,0x86CD,/* 0x70-0x77 */ + 0x86D2,0x86D3,0x86D5,0x86D6,0x86D7,0x86DA,0x86DC,0x0000,/* 0x78-0x7F */ + + 0x86DD,0x86E0,0x86E1,0x86E2,0x86E3,0x86E5,0x86E6,0x86E7,/* 0x80-0x87 */ + 0x86E8,0x86EA,0x86EB,0x86EC,0x86EF,0x86F5,0x86F6,0x86F7,/* 0x88-0x8F */ + 0x86FA,0x86FB,0x86FC,0x86FD,0x86FF,0x8701,0x8704,0x8705,/* 0x90-0x97 */ + 0x8706,0x870B,0x870C,0x870E,0x870F,0x8710,0x8711,0x8714,/* 0x98-0x9F */ + 0x8716,0x6C40,0x5EF7,0x505C,0x4EAD,0x5EAD,0x633A,0x8247,/* 0xA0-0xA7 */ + 0x901A,0x6850,0x916E,0x77B3,0x540C,0x94DC,0x5F64,0x7AE5,/* 0xA8-0xAF */ + 0x6876,0x6345,0x7B52,0x7EDF,0x75DB,0x5077,0x6295,0x5934,/* 0xB0-0xB7 */ + 0x900F,0x51F8,0x79C3,0x7A81,0x56FE,0x5F92,0x9014,0x6D82,/* 0xB8-0xBF */ + 0x5C60,0x571F,0x5410,0x5154,0x6E4D,0x56E2,0x63A8,0x9893,/* 0xC0-0xC7 */ + 0x817F,0x8715,0x892A,0x9000,0x541E,0x5C6F,0x81C0,0x62D6,/* 0xC8-0xCF */ + 0x6258,0x8131,0x9E35,0x9640,0x9A6E,0x9A7C,0x692D,0x59A5,/* 0xD0-0xD7 */ + 0x62D3,0x553E,0x6316,0x54C7,0x86D9,0x6D3C,0x5A03,0x74E6,/* 0xD8-0xDF */ + 0x889C,0x6B6A,0x5916,0x8C4C,0x5F2F,0x6E7E,0x73A9,0x987D,/* 0xE0-0xE7 */ + 0x4E38,0x70F7,0x5B8C,0x7897,0x633D,0x665A,0x7696,0x60CB,/* 0xE8-0xEF */ + 0x5B9B,0x5A49,0x4E07,0x8155,0x6C6A,0x738B,0x4EA1,0x6789,/* 0xF0-0xF7 */ + 0x7F51,0x5F80,0x65FA,0x671B,0x5FD8,0x5984,0x5A01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8719,0x871B,0x871D,0x871F,0x8720,0x8724,0x8726,0x8727,/* 0x40-0x47 */ + 0x8728,0x872A,0x872B,0x872C,0x872D,0x872F,0x8730,0x8732,/* 0x48-0x4F */ + 0x8733,0x8735,0x8736,0x8738,0x8739,0x873A,0x873C,0x873D,/* 0x50-0x57 */ + 0x8740,0x8741,0x8742,0x8743,0x8744,0x8745,0x8746,0x874A,/* 0x58-0x5F */ + 0x874B,0x874D,0x874F,0x8750,0x8751,0x8752,0x8754,0x8755,/* 0x60-0x67 */ + 0x8756,0x8758,0x875A,0x875B,0x875C,0x875D,0x875E,0x875F,/* 0x68-0x6F */ + 0x8761,0x8762,0x8766,0x8767,0x8768,0x8769,0x876A,0x876B,/* 0x70-0x77 */ + 0x876C,0x876D,0x876F,0x8771,0x8772,0x8773,0x8775,0x0000,/* 0x78-0x7F */ + + 0x8777,0x8778,0x8779,0x877A,0x877F,0x8780,0x8781,0x8784,/* 0x80-0x87 */ + 0x8786,0x8787,0x8789,0x878A,0x878C,0x878E,0x878F,0x8790,/* 0x88-0x8F */ + 0x8791,0x8792,0x8794,0x8795,0x8796,0x8798,0x8799,0x879A,/* 0x90-0x97 */ + 0x879B,0x879C,0x879D,0x879E,0x87A0,0x87A1,0x87A2,0x87A3,/* 0x98-0x9F */ + 0x87A4,0x5DCD,0x5FAE,0x5371,0x97E6,0x8FDD,0x6845,0x56F4,/* 0xA0-0xA7 */ + 0x552F,0x60DF,0x4E3A,0x6F4D,0x7EF4,0x82C7,0x840E,0x59D4,/* 0xA8-0xAF */ + 0x4F1F,0x4F2A,0x5C3E,0x7EAC,0x672A,0x851A,0x5473,0x754F,/* 0xB0-0xB7 */ + 0x80C3,0x5582,0x9B4F,0x4F4D,0x6E2D,0x8C13,0x5C09,0x6170,/* 0xB8-0xBF */ + 0x536B,0x761F,0x6E29,0x868A,0x6587,0x95FB,0x7EB9,0x543B,/* 0xC0-0xC7 */ + 0x7A33,0x7D0A,0x95EE,0x55E1,0x7FC1,0x74EE,0x631D,0x8717,/* 0xC8-0xCF */ + 0x6DA1,0x7A9D,0x6211,0x65A1,0x5367,0x63E1,0x6C83,0x5DEB,/* 0xD0-0xD7 */ + 0x545C,0x94A8,0x4E4C,0x6C61,0x8BEC,0x5C4B,0x65E0,0x829C,/* 0xD8-0xDF */ + 0x68A7,0x543E,0x5434,0x6BCB,0x6B66,0x4E94,0x6342,0x5348,/* 0xE0-0xE7 */ + 0x821E,0x4F0D,0x4FAE,0x575E,0x620A,0x96FE,0x6664,0x7269,/* 0xE8-0xEF */ + 0x52FF,0x52A1,0x609F,0x8BEF,0x6614,0x7199,0x6790,0x897F,/* 0xF0-0xF7 */ + 0x7852,0x77FD,0x6670,0x563B,0x5438,0x9521,0x727A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x87A5,0x87A6,0x87A7,0x87A9,0x87AA,0x87AE,0x87B0,0x87B1,/* 0x40-0x47 */ + 0x87B2,0x87B4,0x87B6,0x87B7,0x87B8,0x87B9,0x87BB,0x87BC,/* 0x48-0x4F */ + 0x87BE,0x87BF,0x87C1,0x87C2,0x87C3,0x87C4,0x87C5,0x87C7,/* 0x50-0x57 */ + 0x87C8,0x87C9,0x87CC,0x87CD,0x87CE,0x87CF,0x87D0,0x87D4,/* 0x58-0x5F */ + 0x87D5,0x87D6,0x87D7,0x87D8,0x87D9,0x87DA,0x87DC,0x87DD,/* 0x60-0x67 */ + 0x87DE,0x87DF,0x87E1,0x87E2,0x87E3,0x87E4,0x87E6,0x87E7,/* 0x68-0x6F */ + 0x87E8,0x87E9,0x87EB,0x87EC,0x87ED,0x87EF,0x87F0,0x87F1,/* 0x70-0x77 */ + 0x87F2,0x87F3,0x87F4,0x87F5,0x87F6,0x87F7,0x87F8,0x0000,/* 0x78-0x7F */ + + 0x87FA,0x87FB,0x87FC,0x87FD,0x87FF,0x8800,0x8801,0x8802,/* 0x80-0x87 */ + 0x8804,0x8805,0x8806,0x8807,0x8808,0x8809,0x880B,0x880C,/* 0x88-0x8F */ + 0x880D,0x880E,0x880F,0x8810,0x8811,0x8812,0x8814,0x8817,/* 0x90-0x97 */ + 0x8818,0x8819,0x881A,0x881C,0x881D,0x881E,0x881F,0x8820,/* 0x98-0x9F */ + 0x8823,0x7A00,0x606F,0x5E0C,0x6089,0x819D,0x5915,0x60DC,/* 0xA0-0xA7 */ + 0x7184,0x70EF,0x6EAA,0x6C50,0x7280,0x6A84,0x88AD,0x5E2D,/* 0xA8-0xAF */ + 0x4E60,0x5AB3,0x559C,0x94E3,0x6D17,0x7CFB,0x9699,0x620F,/* 0xB0-0xB7 */ + 0x7EC6,0x778E,0x867E,0x5323,0x971E,0x8F96,0x6687,0x5CE1,/* 0xB8-0xBF */ + 0x4FA0,0x72ED,0x4E0B,0x53A6,0x590F,0x5413,0x6380,0x9528,/* 0xC0-0xC7 */ + 0x5148,0x4ED9,0x9C9C,0x7EA4,0x54B8,0x8D24,0x8854,0x8237,/* 0xC8-0xCF */ + 0x95F2,0x6D8E,0x5F26,0x5ACC,0x663E,0x9669,0x73B0,0x732E,/* 0xD0-0xD7 */ + 0x53BF,0x817A,0x9985,0x7FA1,0x5BAA,0x9677,0x9650,0x7EBF,/* 0xD8-0xDF */ + 0x76F8,0x53A2,0x9576,0x9999,0x7BB1,0x8944,0x6E58,0x4E61,/* 0xE0-0xE7 */ + 0x7FD4,0x7965,0x8BE6,0x60F3,0x54CD,0x4EAB,0x9879,0x5DF7,/* 0xE8-0xEF */ + 0x6A61,0x50CF,0x5411,0x8C61,0x8427,0x785D,0x9704,0x524A,/* 0xF0-0xF7 */ + 0x54EE,0x56A3,0x9500,0x6D88,0x5BB5,0x6DC6,0x6653,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8824,0x8825,0x8826,0x8827,0x8828,0x8829,0x882A,0x882B,/* 0x40-0x47 */ + 0x882C,0x882D,0x882E,0x882F,0x8830,0x8831,0x8833,0x8834,/* 0x48-0x4F */ + 0x8835,0x8836,0x8837,0x8838,0x883A,0x883B,0x883D,0x883E,/* 0x50-0x57 */ + 0x883F,0x8841,0x8842,0x8843,0x8846,0x8847,0x8848,0x8849,/* 0x58-0x5F */ + 0x884A,0x884B,0x884E,0x884F,0x8850,0x8851,0x8852,0x8853,/* 0x60-0x67 */ + 0x8855,0x8856,0x8858,0x885A,0x885B,0x885C,0x885D,0x885E,/* 0x68-0x6F */ + 0x885F,0x8860,0x8866,0x8867,0x886A,0x886D,0x886F,0x8871,/* 0x70-0x77 */ + 0x8873,0x8874,0x8875,0x8876,0x8878,0x8879,0x887A,0x0000,/* 0x78-0x7F */ + + 0x887B,0x887C,0x8880,0x8883,0x8886,0x8887,0x8889,0x888A,/* 0x80-0x87 */ + 0x888C,0x888E,0x888F,0x8890,0x8891,0x8893,0x8894,0x8895,/* 0x88-0x8F */ + 0x8897,0x8898,0x8899,0x889A,0x889B,0x889D,0x889E,0x889F,/* 0x90-0x97 */ + 0x88A0,0x88A1,0x88A3,0x88A5,0x88A6,0x88A7,0x88A8,0x88A9,/* 0x98-0x9F */ + 0x88AA,0x5C0F,0x5B5D,0x6821,0x8096,0x5578,0x7B11,0x6548,/* 0xA0-0xA7 */ + 0x6954,0x4E9B,0x6B47,0x874E,0x978B,0x534F,0x631F,0x643A,/* 0xA8-0xAF */ + 0x90AA,0x659C,0x80C1,0x8C10,0x5199,0x68B0,0x5378,0x87F9,/* 0xB0-0xB7 */ + 0x61C8,0x6CC4,0x6CFB,0x8C22,0x5C51,0x85AA,0x82AF,0x950C,/* 0xB8-0xBF */ + 0x6B23,0x8F9B,0x65B0,0x5FFB,0x5FC3,0x4FE1,0x8845,0x661F,/* 0xC0-0xC7 */ + 0x8165,0x7329,0x60FA,0x5174,0x5211,0x578B,0x5F62,0x90A2,/* 0xC8-0xCF */ + 0x884C,0x9192,0x5E78,0x674F,0x6027,0x59D3,0x5144,0x51F6,/* 0xD0-0xD7 */ + 0x80F8,0x5308,0x6C79,0x96C4,0x718A,0x4F11,0x4FEE,0x7F9E,/* 0xD8-0xDF */ + 0x673D,0x55C5,0x9508,0x79C0,0x8896,0x7EE3,0x589F,0x620C,/* 0xE0-0xE7 */ + 0x9700,0x865A,0x5618,0x987B,0x5F90,0x8BB8,0x84C4,0x9157,/* 0xE8-0xEF */ + 0x53D9,0x65ED,0x5E8F,0x755C,0x6064,0x7D6E,0x5A7F,0x7EEA,/* 0xF0-0xF7 */ + 0x7EED,0x8F69,0x55A7,0x5BA3,0x60AC,0x65CB,0x7384,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x88AC,0x88AE,0x88AF,0x88B0,0x88B2,0x88B3,0x88B4,0x88B5,/* 0x40-0x47 */ + 0x88B6,0x88B8,0x88B9,0x88BA,0x88BB,0x88BD,0x88BE,0x88BF,/* 0x48-0x4F */ + 0x88C0,0x88C3,0x88C4,0x88C7,0x88C8,0x88CA,0x88CB,0x88CC,/* 0x50-0x57 */ + 0x88CD,0x88CF,0x88D0,0x88D1,0x88D3,0x88D6,0x88D7,0x88DA,/* 0x58-0x5F */ + 0x88DB,0x88DC,0x88DD,0x88DE,0x88E0,0x88E1,0x88E6,0x88E7,/* 0x60-0x67 */ + 0x88E9,0x88EA,0x88EB,0x88EC,0x88ED,0x88EE,0x88EF,0x88F2,/* 0x68-0x6F */ + 0x88F5,0x88F6,0x88F7,0x88FA,0x88FB,0x88FD,0x88FF,0x8900,/* 0x70-0x77 */ + 0x8901,0x8903,0x8904,0x8905,0x8906,0x8907,0x8908,0x0000,/* 0x78-0x7F */ + + 0x8909,0x890B,0x890C,0x890D,0x890E,0x890F,0x8911,0x8914,/* 0x80-0x87 */ + 0x8915,0x8916,0x8917,0x8918,0x891C,0x891D,0x891E,0x891F,/* 0x88-0x8F */ + 0x8920,0x8922,0x8923,0x8924,0x8926,0x8927,0x8928,0x8929,/* 0x90-0x97 */ + 0x892C,0x892D,0x892E,0x892F,0x8931,0x8932,0x8933,0x8935,/* 0x98-0x9F */ + 0x8937,0x9009,0x7663,0x7729,0x7EDA,0x9774,0x859B,0x5B66,/* 0xA0-0xA7 */ + 0x7A74,0x96EA,0x8840,0x52CB,0x718F,0x5FAA,0x65EC,0x8BE2,/* 0xA8-0xAF */ + 0x5BFB,0x9A6F,0x5DE1,0x6B89,0x6C5B,0x8BAD,0x8BAF,0x900A,/* 0xB0-0xB7 */ + 0x8FC5,0x538B,0x62BC,0x9E26,0x9E2D,0x5440,0x4E2B,0x82BD,/* 0xB8-0xBF */ + 0x7259,0x869C,0x5D16,0x8859,0x6DAF,0x96C5,0x54D1,0x4E9A,/* 0xC0-0xC7 */ + 0x8BB6,0x7109,0x54BD,0x9609,0x70DF,0x6DF9,0x76D0,0x4E25,/* 0xC8-0xCF */ + 0x7814,0x8712,0x5CA9,0x5EF6,0x8A00,0x989C,0x960E,0x708E,/* 0xD0-0xD7 */ + 0x6CBF,0x5944,0x63A9,0x773C,0x884D,0x6F14,0x8273,0x5830,/* 0xD8-0xDF */ + 0x71D5,0x538C,0x781A,0x96C1,0x5501,0x5F66,0x7130,0x5BB4,/* 0xE0-0xE7 */ + 0x8C1A,0x9A8C,0x6B83,0x592E,0x9E2F,0x79E7,0x6768,0x626C,/* 0xE8-0xEF */ + 0x4F6F,0x75A1,0x7F8A,0x6D0B,0x9633,0x6C27,0x4EF0,0x75D2,/* 0xF0-0xF7 */ + 0x517B,0x6837,0x6F3E,0x9080,0x8170,0x5996,0x7476,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8938,0x8939,0x893A,0x893B,0x893C,0x893D,0x893E,0x893F,/* 0x40-0x47 */ + 0x8940,0x8942,0x8943,0x8945,0x8946,0x8947,0x8948,0x8949,/* 0x48-0x4F */ + 0x894A,0x894B,0x894C,0x894D,0x894E,0x894F,0x8950,0x8951,/* 0x50-0x57 */ + 0x8952,0x8953,0x8954,0x8955,0x8956,0x8957,0x8958,0x8959,/* 0x58-0x5F */ + 0x895A,0x895B,0x895C,0x895D,0x8960,0x8961,0x8962,0x8963,/* 0x60-0x67 */ + 0x8964,0x8965,0x8967,0x8968,0x8969,0x896A,0x896B,0x896C,/* 0x68-0x6F */ + 0x896D,0x896E,0x896F,0x8970,0x8971,0x8972,0x8973,0x8974,/* 0x70-0x77 */ + 0x8975,0x8976,0x8977,0x8978,0x8979,0x897A,0x897C,0x0000,/* 0x78-0x7F */ + + 0x897D,0x897E,0x8980,0x8982,0x8984,0x8985,0x8987,0x8988,/* 0x80-0x87 */ + 0x8989,0x898A,0x898B,0x898C,0x898D,0x898E,0x898F,0x8990,/* 0x88-0x8F */ + 0x8991,0x8992,0x8993,0x8994,0x8995,0x8996,0x8997,0x8998,/* 0x90-0x97 */ + 0x8999,0x899A,0x899B,0x899C,0x899D,0x899E,0x899F,0x89A0,/* 0x98-0x9F */ + 0x89A1,0x6447,0x5C27,0x9065,0x7A91,0x8C23,0x59DA,0x54AC,/* 0xA0-0xA7 */ + 0x8200,0x836F,0x8981,0x8000,0x6930,0x564E,0x8036,0x7237,/* 0xA8-0xAF */ + 0x91CE,0x51B6,0x4E5F,0x9875,0x6396,0x4E1A,0x53F6,0x66F3,/* 0xB0-0xB7 */ + 0x814B,0x591C,0x6DB2,0x4E00,0x58F9,0x533B,0x63D6,0x94F1,/* 0xB8-0xBF */ + 0x4F9D,0x4F0A,0x8863,0x9890,0x5937,0x9057,0x79FB,0x4EEA,/* 0xC0-0xC7 */ + 0x80F0,0x7591,0x6C82,0x5B9C,0x59E8,0x5F5D,0x6905,0x8681,/* 0xC8-0xCF */ + 0x501A,0x5DF2,0x4E59,0x77E3,0x4EE5,0x827A,0x6291,0x6613,/* 0xD0-0xD7 */ + 0x9091,0x5C79,0x4EBF,0x5F79,0x81C6,0x9038,0x8084,0x75AB,/* 0xD8-0xDF */ + 0x4EA6,0x88D4,0x610F,0x6BC5,0x5FC6,0x4E49,0x76CA,0x6EA2,/* 0xE0-0xE7 */ + 0x8BE3,0x8BAE,0x8C0A,0x8BD1,0x5F02,0x7FFC,0x7FCC,0x7ECE,/* 0xE8-0xEF */ + 0x8335,0x836B,0x56E0,0x6BB7,0x97F3,0x9634,0x59FB,0x541F,/* 0xF0-0xF7 */ + 0x94F6,0x6DEB,0x5BC5,0x996E,0x5C39,0x5F15,0x9690,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x89A2,0x89A3,0x89A4,0x89A5,0x89A6,0x89A7,0x89A8,0x89A9,/* 0x40-0x47 */ + 0x89AA,0x89AB,0x89AC,0x89AD,0x89AE,0x89AF,0x89B0,0x89B1,/* 0x48-0x4F */ + 0x89B2,0x89B3,0x89B4,0x89B5,0x89B6,0x89B7,0x89B8,0x89B9,/* 0x50-0x57 */ + 0x89BA,0x89BB,0x89BC,0x89BD,0x89BE,0x89BF,0x89C0,0x89C3,/* 0x58-0x5F */ + 0x89CD,0x89D3,0x89D4,0x89D5,0x89D7,0x89D8,0x89D9,0x89DB,/* 0x60-0x67 */ + 0x89DD,0x89DF,0x89E0,0x89E1,0x89E2,0x89E4,0x89E7,0x89E8,/* 0x68-0x6F */ + 0x89E9,0x89EA,0x89EC,0x89ED,0x89EE,0x89F0,0x89F1,0x89F2,/* 0x70-0x77 */ + 0x89F4,0x89F5,0x89F6,0x89F7,0x89F8,0x89F9,0x89FA,0x0000,/* 0x78-0x7F */ + + 0x89FB,0x89FC,0x89FD,0x89FE,0x89FF,0x8A01,0x8A02,0x8A03,/* 0x80-0x87 */ + 0x8A04,0x8A05,0x8A06,0x8A08,0x8A09,0x8A0A,0x8A0B,0x8A0C,/* 0x88-0x8F */ + 0x8A0D,0x8A0E,0x8A0F,0x8A10,0x8A11,0x8A12,0x8A13,0x8A14,/* 0x90-0x97 */ + 0x8A15,0x8A16,0x8A17,0x8A18,0x8A19,0x8A1A,0x8A1B,0x8A1C,/* 0x98-0x9F */ + 0x8A1D,0x5370,0x82F1,0x6A31,0x5A74,0x9E70,0x5E94,0x7F28,/* 0xA0-0xA7 */ + 0x83B9,0x8424,0x8425,0x8367,0x8747,0x8FCE,0x8D62,0x76C8,/* 0xA8-0xAF */ + 0x5F71,0x9896,0x786C,0x6620,0x54DF,0x62E5,0x4F63,0x81C3,/* 0xB0-0xB7 */ + 0x75C8,0x5EB8,0x96CD,0x8E0A,0x86F9,0x548F,0x6CF3,0x6D8C,/* 0xB8-0xBF */ + 0x6C38,0x607F,0x52C7,0x7528,0x5E7D,0x4F18,0x60A0,0x5FE7,/* 0xC0-0xC7 */ + 0x5C24,0x7531,0x90AE,0x94C0,0x72B9,0x6CB9,0x6E38,0x9149,/* 0xC8-0xCF */ + 0x6709,0x53CB,0x53F3,0x4F51,0x91C9,0x8BF1,0x53C8,0x5E7C,/* 0xD0-0xD7 */ + 0x8FC2,0x6DE4,0x4E8E,0x76C2,0x6986,0x865E,0x611A,0x8206,/* 0xD8-0xDF */ + 0x4F59,0x4FDE,0x903E,0x9C7C,0x6109,0x6E1D,0x6E14,0x9685,/* 0xE0-0xE7 */ + 0x4E88,0x5A31,0x96E8,0x4E0E,0x5C7F,0x79B9,0x5B87,0x8BED,/* 0xE8-0xEF */ + 0x7FBD,0x7389,0x57DF,0x828B,0x90C1,0x5401,0x9047,0x55BB,/* 0xF0-0xF7 */ + 0x5CEA,0x5FA1,0x6108,0x6B32,0x72F1,0x80B2,0x8A89,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A1E,0x8A1F,0x8A20,0x8A21,0x8A22,0x8A23,0x8A24,0x8A25,/* 0x40-0x47 */ + 0x8A26,0x8A27,0x8A28,0x8A29,0x8A2A,0x8A2B,0x8A2C,0x8A2D,/* 0x48-0x4F */ + 0x8A2E,0x8A2F,0x8A30,0x8A31,0x8A32,0x8A33,0x8A34,0x8A35,/* 0x50-0x57 */ + 0x8A36,0x8A37,0x8A38,0x8A39,0x8A3A,0x8A3B,0x8A3C,0x8A3D,/* 0x58-0x5F */ + 0x8A3F,0x8A40,0x8A41,0x8A42,0x8A43,0x8A44,0x8A45,0x8A46,/* 0x60-0x67 */ + 0x8A47,0x8A49,0x8A4A,0x8A4B,0x8A4C,0x8A4D,0x8A4E,0x8A4F,/* 0x68-0x6F */ + 0x8A50,0x8A51,0x8A52,0x8A53,0x8A54,0x8A55,0x8A56,0x8A57,/* 0x70-0x77 */ + 0x8A58,0x8A59,0x8A5A,0x8A5B,0x8A5C,0x8A5D,0x8A5E,0x0000,/* 0x78-0x7F */ + + 0x8A5F,0x8A60,0x8A61,0x8A62,0x8A63,0x8A64,0x8A65,0x8A66,/* 0x80-0x87 */ + 0x8A67,0x8A68,0x8A69,0x8A6A,0x8A6B,0x8A6C,0x8A6D,0x8A6E,/* 0x88-0x8F */ + 0x8A6F,0x8A70,0x8A71,0x8A72,0x8A73,0x8A74,0x8A75,0x8A76,/* 0x90-0x97 */ + 0x8A77,0x8A78,0x8A7A,0x8A7B,0x8A7C,0x8A7D,0x8A7E,0x8A7F,/* 0x98-0x9F */ + 0x8A80,0x6D74,0x5BD3,0x88D5,0x9884,0x8C6B,0x9A6D,0x9E33,/* 0xA0-0xA7 */ + 0x6E0A,0x51A4,0x5143,0x57A3,0x8881,0x539F,0x63F4,0x8F95,/* 0xA8-0xAF */ + 0x56ED,0x5458,0x5706,0x733F,0x6E90,0x7F18,0x8FDC,0x82D1,/* 0xB0-0xB7 */ + 0x613F,0x6028,0x9662,0x66F0,0x7EA6,0x8D8A,0x8DC3,0x94A5,/* 0xB8-0xBF */ + 0x5CB3,0x7CA4,0x6708,0x60A6,0x9605,0x8018,0x4E91,0x90E7,/* 0xC0-0xC7 */ + 0x5300,0x9668,0x5141,0x8FD0,0x8574,0x915D,0x6655,0x97F5,/* 0xC8-0xCF */ + 0x5B55,0x531D,0x7838,0x6742,0x683D,0x54C9,0x707E,0x5BB0,/* 0xD0-0xD7 */ + 0x8F7D,0x518D,0x5728,0x54B1,0x6512,0x6682,0x8D5E,0x8D43,/* 0xD8-0xDF */ + 0x810F,0x846C,0x906D,0x7CDF,0x51FF,0x85FB,0x67A3,0x65E9,/* 0xE0-0xE7 */ + 0x6FA1,0x86A4,0x8E81,0x566A,0x9020,0x7682,0x7076,0x71E5,/* 0xE8-0xEF */ + 0x8D23,0x62E9,0x5219,0x6CFD,0x8D3C,0x600E,0x589E,0x618E,/* 0xF0-0xF7 */ + 0x66FE,0x8D60,0x624E,0x55B3,0x6E23,0x672D,0x8F67,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A81,0x8A82,0x8A83,0x8A84,0x8A85,0x8A86,0x8A87,0x8A88,/* 0x40-0x47 */ + 0x8A8B,0x8A8C,0x8A8D,0x8A8E,0x8A8F,0x8A90,0x8A91,0x8A92,/* 0x48-0x4F */ + 0x8A94,0x8A95,0x8A96,0x8A97,0x8A98,0x8A99,0x8A9A,0x8A9B,/* 0x50-0x57 */ + 0x8A9C,0x8A9D,0x8A9E,0x8A9F,0x8AA0,0x8AA1,0x8AA2,0x8AA3,/* 0x58-0x5F */ + 0x8AA4,0x8AA5,0x8AA6,0x8AA7,0x8AA8,0x8AA9,0x8AAA,0x8AAB,/* 0x60-0x67 */ + 0x8AAC,0x8AAD,0x8AAE,0x8AAF,0x8AB0,0x8AB1,0x8AB2,0x8AB3,/* 0x68-0x6F */ + 0x8AB4,0x8AB5,0x8AB6,0x8AB7,0x8AB8,0x8AB9,0x8ABA,0x8ABB,/* 0x70-0x77 */ + 0x8ABC,0x8ABD,0x8ABE,0x8ABF,0x8AC0,0x8AC1,0x8AC2,0x0000,/* 0x78-0x7F */ + + 0x8AC3,0x8AC4,0x8AC5,0x8AC6,0x8AC7,0x8AC8,0x8AC9,0x8ACA,/* 0x80-0x87 */ + 0x8ACB,0x8ACC,0x8ACD,0x8ACE,0x8ACF,0x8AD0,0x8AD1,0x8AD2,/* 0x88-0x8F */ + 0x8AD3,0x8AD4,0x8AD5,0x8AD6,0x8AD7,0x8AD8,0x8AD9,0x8ADA,/* 0x90-0x97 */ + 0x8ADB,0x8ADC,0x8ADD,0x8ADE,0x8ADF,0x8AE0,0x8AE1,0x8AE2,/* 0x98-0x9F */ + 0x8AE3,0x94E1,0x95F8,0x7728,0x6805,0x69A8,0x548B,0x4E4D,/* 0xA0-0xA7 */ + 0x70B8,0x8BC8,0x6458,0x658B,0x5B85,0x7A84,0x503A,0x5BE8,/* 0xA8-0xAF */ + 0x77BB,0x6BE1,0x8A79,0x7C98,0x6CBE,0x76CF,0x65A9,0x8F97,/* 0xB0-0xB7 */ + 0x5D2D,0x5C55,0x8638,0x6808,0x5360,0x6218,0x7AD9,0x6E5B,/* 0xB8-0xBF */ + 0x7EFD,0x6A1F,0x7AE0,0x5F70,0x6F33,0x5F20,0x638C,0x6DA8,/* 0xC0-0xC7 */ + 0x6756,0x4E08,0x5E10,0x8D26,0x4ED7,0x80C0,0x7634,0x969C,/* 0xC8-0xCF */ + 0x62DB,0x662D,0x627E,0x6CBC,0x8D75,0x7167,0x7F69,0x5146,/* 0xD0-0xD7 */ + 0x8087,0x53EC,0x906E,0x6298,0x54F2,0x86F0,0x8F99,0x8005,/* 0xD8-0xDF */ + 0x9517,0x8517,0x8FD9,0x6D59,0x73CD,0x659F,0x771F,0x7504,/* 0xE0-0xE7 */ + 0x7827,0x81FB,0x8D1E,0x9488,0x4FA6,0x6795,0x75B9,0x8BCA,/* 0xE8-0xEF */ + 0x9707,0x632F,0x9547,0x9635,0x84B8,0x6323,0x7741,0x5F81,/* 0xF0-0xF7 */ + 0x72F0,0x4E89,0x6014,0x6574,0x62EF,0x6B63,0x653F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8AE4,0x8AE5,0x8AE6,0x8AE7,0x8AE8,0x8AE9,0x8AEA,0x8AEB,/* 0x40-0x47 */ + 0x8AEC,0x8AED,0x8AEE,0x8AEF,0x8AF0,0x8AF1,0x8AF2,0x8AF3,/* 0x48-0x4F */ + 0x8AF4,0x8AF5,0x8AF6,0x8AF7,0x8AF8,0x8AF9,0x8AFA,0x8AFB,/* 0x50-0x57 */ + 0x8AFC,0x8AFD,0x8AFE,0x8AFF,0x8B00,0x8B01,0x8B02,0x8B03,/* 0x58-0x5F */ + 0x8B04,0x8B05,0x8B06,0x8B08,0x8B09,0x8B0A,0x8B0B,0x8B0C,/* 0x60-0x67 */ + 0x8B0D,0x8B0E,0x8B0F,0x8B10,0x8B11,0x8B12,0x8B13,0x8B14,/* 0x68-0x6F */ + 0x8B15,0x8B16,0x8B17,0x8B18,0x8B19,0x8B1A,0x8B1B,0x8B1C,/* 0x70-0x77 */ + 0x8B1D,0x8B1E,0x8B1F,0x8B20,0x8B21,0x8B22,0x8B23,0x0000,/* 0x78-0x7F */ + + 0x8B24,0x8B25,0x8B27,0x8B28,0x8B29,0x8B2A,0x8B2B,0x8B2C,/* 0x80-0x87 */ + 0x8B2D,0x8B2E,0x8B2F,0x8B30,0x8B31,0x8B32,0x8B33,0x8B34,/* 0x88-0x8F */ + 0x8B35,0x8B36,0x8B37,0x8B38,0x8B39,0x8B3A,0x8B3B,0x8B3C,/* 0x90-0x97 */ + 0x8B3D,0x8B3E,0x8B3F,0x8B40,0x8B41,0x8B42,0x8B43,0x8B44,/* 0x98-0x9F */ + 0x8B45,0x5E27,0x75C7,0x90D1,0x8BC1,0x829D,0x679D,0x652F,/* 0xA0-0xA7 */ + 0x5431,0x8718,0x77E5,0x80A2,0x8102,0x6C41,0x4E4B,0x7EC7,/* 0xA8-0xAF */ + 0x804C,0x76F4,0x690D,0x6B96,0x6267,0x503C,0x4F84,0x5740,/* 0xB0-0xB7 */ + 0x6307,0x6B62,0x8DBE,0x53EA,0x65E8,0x7EB8,0x5FD7,0x631A,/* 0xB8-0xBF */ + 0x63B7,0x81F3,0x81F4,0x7F6E,0x5E1C,0x5CD9,0x5236,0x667A,/* 0xC0-0xC7 */ + 0x79E9,0x7A1A,0x8D28,0x7099,0x75D4,0x6EDE,0x6CBB,0x7A92,/* 0xC8-0xCF */ + 0x4E2D,0x76C5,0x5FE0,0x949F,0x8877,0x7EC8,0x79CD,0x80BF,/* 0xD0-0xD7 */ + 0x91CD,0x4EF2,0x4F17,0x821F,0x5468,0x5DDE,0x6D32,0x8BCC,/* 0xD8-0xDF */ + 0x7CA5,0x8F74,0x8098,0x5E1A,0x5492,0x76B1,0x5B99,0x663C,/* 0xE0-0xE7 */ + 0x9AA4,0x73E0,0x682A,0x86DB,0x6731,0x732A,0x8BF8,0x8BDB,/* 0xE8-0xEF */ + 0x9010,0x7AF9,0x70DB,0x716E,0x62C4,0x77A9,0x5631,0x4E3B,/* 0xF0-0xF7 */ + 0x8457,0x67F1,0x52A9,0x86C0,0x8D2E,0x94F8,0x7B51,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B46,0x8B47,0x8B48,0x8B49,0x8B4A,0x8B4B,0x8B4C,0x8B4D,/* 0x40-0x47 */ + 0x8B4E,0x8B4F,0x8B50,0x8B51,0x8B52,0x8B53,0x8B54,0x8B55,/* 0x48-0x4F */ + 0x8B56,0x8B57,0x8B58,0x8B59,0x8B5A,0x8B5B,0x8B5C,0x8B5D,/* 0x50-0x57 */ + 0x8B5E,0x8B5F,0x8B60,0x8B61,0x8B62,0x8B63,0x8B64,0x8B65,/* 0x58-0x5F */ + 0x8B67,0x8B68,0x8B69,0x8B6A,0x8B6B,0x8B6D,0x8B6E,0x8B6F,/* 0x60-0x67 */ + 0x8B70,0x8B71,0x8B72,0x8B73,0x8B74,0x8B75,0x8B76,0x8B77,/* 0x68-0x6F */ + 0x8B78,0x8B79,0x8B7A,0x8B7B,0x8B7C,0x8B7D,0x8B7E,0x8B7F,/* 0x70-0x77 */ + 0x8B80,0x8B81,0x8B82,0x8B83,0x8B84,0x8B85,0x8B86,0x0000,/* 0x78-0x7F */ + + 0x8B87,0x8B88,0x8B89,0x8B8A,0x8B8B,0x8B8C,0x8B8D,0x8B8E,/* 0x80-0x87 */ + 0x8B8F,0x8B90,0x8B91,0x8B92,0x8B93,0x8B94,0x8B95,0x8B96,/* 0x88-0x8F */ + 0x8B97,0x8B98,0x8B99,0x8B9A,0x8B9B,0x8B9C,0x8B9D,0x8B9E,/* 0x90-0x97 */ + 0x8B9F,0x8BAC,0x8BB1,0x8BBB,0x8BC7,0x8BD0,0x8BEA,0x8C09,/* 0x98-0x9F */ + 0x8C1E,0x4F4F,0x6CE8,0x795D,0x9A7B,0x6293,0x722A,0x62FD,/* 0xA0-0xA7 */ + 0x4E13,0x7816,0x8F6C,0x64B0,0x8D5A,0x7BC6,0x6869,0x5E84,/* 0xA8-0xAF */ + 0x88C5,0x5986,0x649E,0x58EE,0x72B6,0x690E,0x9525,0x8FFD,/* 0xB0-0xB7 */ + 0x8D58,0x5760,0x7F00,0x8C06,0x51C6,0x6349,0x62D9,0x5353,/* 0xB8-0xBF */ + 0x684C,0x7422,0x8301,0x914C,0x5544,0x7740,0x707C,0x6D4A,/* 0xC0-0xC7 */ + 0x5179,0x54A8,0x8D44,0x59FF,0x6ECB,0x6DC4,0x5B5C,0x7D2B,/* 0xC8-0xCF */ + 0x4ED4,0x7C7D,0x6ED3,0x5B50,0x81EA,0x6E0D,0x5B57,0x9B03,/* 0xD0-0xD7 */ + 0x68D5,0x8E2A,0x5B97,0x7EFC,0x603B,0x7EB5,0x90B9,0x8D70,/* 0xD8-0xDF */ + 0x594F,0x63CD,0x79DF,0x8DB3,0x5352,0x65CF,0x7956,0x8BC5,/* 0xE0-0xE7 */ + 0x963B,0x7EC4,0x94BB,0x7E82,0x5634,0x9189,0x6700,0x7F6A,/* 0xE8-0xEF */ + 0x5C0A,0x9075,0x6628,0x5DE6,0x4F50,0x67DE,0x505A,0x4F5C,/* 0xF0-0xF7 */ + 0x5750,0x5EA7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8C38,0x8C39,0x8C3A,0x8C3B,0x8C3C,0x8C3D,0x8C3E,0x8C3F,/* 0x40-0x47 */ + 0x8C40,0x8C42,0x8C43,0x8C44,0x8C45,0x8C48,0x8C4A,0x8C4B,/* 0x48-0x4F */ + 0x8C4D,0x8C4E,0x8C4F,0x8C50,0x8C51,0x8C52,0x8C53,0x8C54,/* 0x50-0x57 */ + 0x8C56,0x8C57,0x8C58,0x8C59,0x8C5B,0x8C5C,0x8C5D,0x8C5E,/* 0x58-0x5F */ + 0x8C5F,0x8C60,0x8C63,0x8C64,0x8C65,0x8C66,0x8C67,0x8C68,/* 0x60-0x67 */ + 0x8C69,0x8C6C,0x8C6D,0x8C6E,0x8C6F,0x8C70,0x8C71,0x8C72,/* 0x68-0x6F */ + 0x8C74,0x8C75,0x8C76,0x8C77,0x8C7B,0x8C7C,0x8C7D,0x8C7E,/* 0x70-0x77 */ + 0x8C7F,0x8C80,0x8C81,0x8C83,0x8C84,0x8C86,0x8C87,0x0000,/* 0x78-0x7F */ + + 0x8C88,0x8C8B,0x8C8D,0x8C8E,0x8C8F,0x8C90,0x8C91,0x8C92,/* 0x80-0x87 */ + 0x8C93,0x8C95,0x8C96,0x8C97,0x8C99,0x8C9A,0x8C9B,0x8C9C,/* 0x88-0x8F */ + 0x8C9D,0x8C9E,0x8C9F,0x8CA0,0x8CA1,0x8CA2,0x8CA3,0x8CA4,/* 0x90-0x97 */ + 0x8CA5,0x8CA6,0x8CA7,0x8CA8,0x8CA9,0x8CAA,0x8CAB,0x8CAC,/* 0x98-0x9F */ + 0x8CAD,0x4E8D,0x4E0C,0x5140,0x4E10,0x5EFF,0x5345,0x4E15,/* 0xA0-0xA7 */ + 0x4E98,0x4E1E,0x9B32,0x5B6C,0x5669,0x4E28,0x79BA,0x4E3F,/* 0xA8-0xAF */ + 0x5315,0x4E47,0x592D,0x723B,0x536E,0x6C10,0x56DF,0x80E4,/* 0xB0-0xB7 */ + 0x9997,0x6BD3,0x777E,0x9F17,0x4E36,0x4E9F,0x9F10,0x4E5C,/* 0xB8-0xBF */ + 0x4E69,0x4E93,0x8288,0x5B5B,0x556C,0x560F,0x4EC4,0x538D,/* 0xC0-0xC7 */ + 0x539D,0x53A3,0x53A5,0x53AE,0x9765,0x8D5D,0x531A,0x53F5,/* 0xC8-0xCF */ + 0x5326,0x532E,0x533E,0x8D5C,0x5366,0x5363,0x5202,0x5208,/* 0xD0-0xD7 */ + 0x520E,0x522D,0x5233,0x523F,0x5240,0x524C,0x525E,0x5261,/* 0xD8-0xDF */ + 0x525C,0x84AF,0x527D,0x5282,0x5281,0x5290,0x5293,0x5182,/* 0xE0-0xE7 */ + 0x7F54,0x4EBB,0x4EC3,0x4EC9,0x4EC2,0x4EE8,0x4EE1,0x4EEB,/* 0xE8-0xEF */ + 0x4EDE,0x4F1B,0x4EF3,0x4F22,0x4F64,0x4EF5,0x4F25,0x4F27,/* 0xF0-0xF7 */ + 0x4F09,0x4F2B,0x4F5E,0x4F67,0x6538,0x4F5A,0x4F5D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8CAE,0x8CAF,0x8CB0,0x8CB1,0x8CB2,0x8CB3,0x8CB4,0x8CB5,/* 0x40-0x47 */ + 0x8CB6,0x8CB7,0x8CB8,0x8CB9,0x8CBA,0x8CBB,0x8CBC,0x8CBD,/* 0x48-0x4F */ + 0x8CBE,0x8CBF,0x8CC0,0x8CC1,0x8CC2,0x8CC3,0x8CC4,0x8CC5,/* 0x50-0x57 */ + 0x8CC6,0x8CC7,0x8CC8,0x8CC9,0x8CCA,0x8CCB,0x8CCC,0x8CCD,/* 0x58-0x5F */ + 0x8CCE,0x8CCF,0x8CD0,0x8CD1,0x8CD2,0x8CD3,0x8CD4,0x8CD5,/* 0x60-0x67 */ + 0x8CD6,0x8CD7,0x8CD8,0x8CD9,0x8CDA,0x8CDB,0x8CDC,0x8CDD,/* 0x68-0x6F */ + 0x8CDE,0x8CDF,0x8CE0,0x8CE1,0x8CE2,0x8CE3,0x8CE4,0x8CE5,/* 0x70-0x77 */ + 0x8CE6,0x8CE7,0x8CE8,0x8CE9,0x8CEA,0x8CEB,0x8CEC,0x0000,/* 0x78-0x7F */ + + 0x8CED,0x8CEE,0x8CEF,0x8CF0,0x8CF1,0x8CF2,0x8CF3,0x8CF4,/* 0x80-0x87 */ + 0x8CF5,0x8CF6,0x8CF7,0x8CF8,0x8CF9,0x8CFA,0x8CFB,0x8CFC,/* 0x88-0x8F */ + 0x8CFD,0x8CFE,0x8CFF,0x8D00,0x8D01,0x8D02,0x8D03,0x8D04,/* 0x90-0x97 */ + 0x8D05,0x8D06,0x8D07,0x8D08,0x8D09,0x8D0A,0x8D0B,0x8D0C,/* 0x98-0x9F */ + 0x8D0D,0x4F5F,0x4F57,0x4F32,0x4F3D,0x4F76,0x4F74,0x4F91,/* 0xA0-0xA7 */ + 0x4F89,0x4F83,0x4F8F,0x4F7E,0x4F7B,0x4FAA,0x4F7C,0x4FAC,/* 0xA8-0xAF */ + 0x4F94,0x4FE6,0x4FE8,0x4FEA,0x4FC5,0x4FDA,0x4FE3,0x4FDC,/* 0xB0-0xB7 */ + 0x4FD1,0x4FDF,0x4FF8,0x5029,0x504C,0x4FF3,0x502C,0x500F,/* 0xB8-0xBF */ + 0x502E,0x502D,0x4FFE,0x501C,0x500C,0x5025,0x5028,0x507E,/* 0xC0-0xC7 */ + 0x5043,0x5055,0x5048,0x504E,0x506C,0x507B,0x50A5,0x50A7,/* 0xC8-0xCF */ + 0x50A9,0x50BA,0x50D6,0x5106,0x50ED,0x50EC,0x50E6,0x50EE,/* 0xD0-0xD7 */ + 0x5107,0x510B,0x4EDD,0x6C3D,0x4F58,0x4F65,0x4FCE,0x9FA0,/* 0xD8-0xDF */ + 0x6C46,0x7C74,0x516E,0x5DFD,0x9EC9,0x9998,0x5181,0x5914,/* 0xE0-0xE7 */ + 0x52F9,0x530D,0x8A07,0x5310,0x51EB,0x5919,0x5155,0x4EA0,/* 0xE8-0xEF */ + 0x5156,0x4EB3,0x886E,0x88A4,0x4EB5,0x8114,0x88D2,0x7980,/* 0xF0-0xF7 */ + 0x5B34,0x8803,0x7FB8,0x51AB,0x51B1,0x51BD,0x51BC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8D0E,0x8D0F,0x8D10,0x8D11,0x8D12,0x8D13,0x8D14,0x8D15,/* 0x40-0x47 */ + 0x8D16,0x8D17,0x8D18,0x8D19,0x8D1A,0x8D1B,0x8D1C,0x8D20,/* 0x48-0x4F */ + 0x8D51,0x8D52,0x8D57,0x8D5F,0x8D65,0x8D68,0x8D69,0x8D6A,/* 0x50-0x57 */ + 0x8D6C,0x8D6E,0x8D6F,0x8D71,0x8D72,0x8D78,0x8D79,0x8D7A,/* 0x58-0x5F */ + 0x8D7B,0x8D7C,0x8D7D,0x8D7E,0x8D7F,0x8D80,0x8D82,0x8D83,/* 0x60-0x67 */ + 0x8D86,0x8D87,0x8D88,0x8D89,0x8D8C,0x8D8D,0x8D8E,0x8D8F,/* 0x68-0x6F */ + 0x8D90,0x8D92,0x8D93,0x8D95,0x8D96,0x8D97,0x8D98,0x8D99,/* 0x70-0x77 */ + 0x8D9A,0x8D9B,0x8D9C,0x8D9D,0x8D9E,0x8DA0,0x8DA1,0x0000,/* 0x78-0x7F */ + + 0x8DA2,0x8DA4,0x8DA5,0x8DA6,0x8DA7,0x8DA8,0x8DA9,0x8DAA,/* 0x80-0x87 */ + 0x8DAB,0x8DAC,0x8DAD,0x8DAE,0x8DAF,0x8DB0,0x8DB2,0x8DB6,/* 0x88-0x8F */ + 0x8DB7,0x8DB9,0x8DBB,0x8DBD,0x8DC0,0x8DC1,0x8DC2,0x8DC5,/* 0x90-0x97 */ + 0x8DC7,0x8DC8,0x8DC9,0x8DCA,0x8DCD,0x8DD0,0x8DD2,0x8DD3,/* 0x98-0x9F */ + 0x8DD4,0x51C7,0x5196,0x51A2,0x51A5,0x8BA0,0x8BA6,0x8BA7,/* 0xA0-0xA7 */ + 0x8BAA,0x8BB4,0x8BB5,0x8BB7,0x8BC2,0x8BC3,0x8BCB,0x8BCF,/* 0xA8-0xAF */ + 0x8BCE,0x8BD2,0x8BD3,0x8BD4,0x8BD6,0x8BD8,0x8BD9,0x8BDC,/* 0xB0-0xB7 */ + 0x8BDF,0x8BE0,0x8BE4,0x8BE8,0x8BE9,0x8BEE,0x8BF0,0x8BF3,/* 0xB8-0xBF */ + 0x8BF6,0x8BF9,0x8BFC,0x8BFF,0x8C00,0x8C02,0x8C04,0x8C07,/* 0xC0-0xC7 */ + 0x8C0C,0x8C0F,0x8C11,0x8C12,0x8C14,0x8C15,0x8C16,0x8C19,/* 0xC8-0xCF */ + 0x8C1B,0x8C18,0x8C1D,0x8C1F,0x8C20,0x8C21,0x8C25,0x8C27,/* 0xD0-0xD7 */ + 0x8C2A,0x8C2B,0x8C2E,0x8C2F,0x8C32,0x8C33,0x8C35,0x8C36,/* 0xD8-0xDF */ + 0x5369,0x537A,0x961D,0x9622,0x9621,0x9631,0x962A,0x963D,/* 0xE0-0xE7 */ + 0x963C,0x9642,0x9649,0x9654,0x965F,0x9667,0x966C,0x9672,/* 0xE8-0xEF */ + 0x9674,0x9688,0x968D,0x9697,0x96B0,0x9097,0x909B,0x909D,/* 0xF0-0xF7 */ + 0x9099,0x90AC,0x90A1,0x90B4,0x90B3,0x90B6,0x90BA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8DD5,0x8DD8,0x8DD9,0x8DDC,0x8DE0,0x8DE1,0x8DE2,0x8DE5,/* 0x40-0x47 */ + 0x8DE6,0x8DE7,0x8DE9,0x8DED,0x8DEE,0x8DF0,0x8DF1,0x8DF2,/* 0x48-0x4F */ + 0x8DF4,0x8DF6,0x8DFC,0x8DFE,0x8DFF,0x8E00,0x8E01,0x8E02,/* 0x50-0x57 */ + 0x8E03,0x8E04,0x8E06,0x8E07,0x8E08,0x8E0B,0x8E0D,0x8E0E,/* 0x58-0x5F */ + 0x8E10,0x8E11,0x8E12,0x8E13,0x8E15,0x8E16,0x8E17,0x8E18,/* 0x60-0x67 */ + 0x8E19,0x8E1A,0x8E1B,0x8E1C,0x8E20,0x8E21,0x8E24,0x8E25,/* 0x68-0x6F */ + 0x8E26,0x8E27,0x8E28,0x8E2B,0x8E2D,0x8E30,0x8E32,0x8E33,/* 0x70-0x77 */ + 0x8E34,0x8E36,0x8E37,0x8E38,0x8E3B,0x8E3C,0x8E3E,0x0000,/* 0x78-0x7F */ + + 0x8E3F,0x8E43,0x8E45,0x8E46,0x8E4C,0x8E4D,0x8E4E,0x8E4F,/* 0x80-0x87 */ + 0x8E50,0x8E53,0x8E54,0x8E55,0x8E56,0x8E57,0x8E58,0x8E5A,/* 0x88-0x8F */ + 0x8E5B,0x8E5C,0x8E5D,0x8E5E,0x8E5F,0x8E60,0x8E61,0x8E62,/* 0x90-0x97 */ + 0x8E63,0x8E64,0x8E65,0x8E67,0x8E68,0x8E6A,0x8E6B,0x8E6E,/* 0x98-0x9F */ + 0x8E71,0x90B8,0x90B0,0x90CF,0x90C5,0x90BE,0x90D0,0x90C4,/* 0xA0-0xA7 */ + 0x90C7,0x90D3,0x90E6,0x90E2,0x90DC,0x90D7,0x90DB,0x90EB,/* 0xA8-0xAF */ + 0x90EF,0x90FE,0x9104,0x9122,0x911E,0x9123,0x9131,0x912F,/* 0xB0-0xB7 */ + 0x9139,0x9143,0x9146,0x520D,0x5942,0x52A2,0x52AC,0x52AD,/* 0xB8-0xBF */ + 0x52BE,0x54FF,0x52D0,0x52D6,0x52F0,0x53DF,0x71EE,0x77CD,/* 0xC0-0xC7 */ + 0x5EF4,0x51F5,0x51FC,0x9B2F,0x53B6,0x5F01,0x755A,0x5DEF,/* 0xC8-0xCF */ + 0x574C,0x57A9,0x57A1,0x587E,0x58BC,0x58C5,0x58D1,0x5729,/* 0xD0-0xD7 */ + 0x572C,0x572A,0x5733,0x5739,0x572E,0x572F,0x575C,0x573B,/* 0xD8-0xDF */ + 0x5742,0x5769,0x5785,0x576B,0x5786,0x577C,0x577B,0x5768,/* 0xE0-0xE7 */ + 0x576D,0x5776,0x5773,0x57AD,0x57A4,0x578C,0x57B2,0x57CF,/* 0xE8-0xEF */ + 0x57A7,0x57B4,0x5793,0x57A0,0x57D5,0x57D8,0x57DA,0x57D9,/* 0xF0-0xF7 */ + 0x57D2,0x57B8,0x57F4,0x57EF,0x57F8,0x57E4,0x57DD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E73,0x8E75,0x8E77,0x8E78,0x8E79,0x8E7A,0x8E7B,0x8E7D,/* 0x40-0x47 */ + 0x8E7E,0x8E80,0x8E82,0x8E83,0x8E84,0x8E86,0x8E88,0x8E89,/* 0x48-0x4F */ + 0x8E8A,0x8E8B,0x8E8C,0x8E8D,0x8E8E,0x8E91,0x8E92,0x8E93,/* 0x50-0x57 */ + 0x8E95,0x8E96,0x8E97,0x8E98,0x8E99,0x8E9A,0x8E9B,0x8E9D,/* 0x58-0x5F */ + 0x8E9F,0x8EA0,0x8EA1,0x8EA2,0x8EA3,0x8EA4,0x8EA5,0x8EA6,/* 0x60-0x67 */ + 0x8EA7,0x8EA8,0x8EA9,0x8EAA,0x8EAD,0x8EAE,0x8EB0,0x8EB1,/* 0x68-0x6F */ + 0x8EB3,0x8EB4,0x8EB5,0x8EB6,0x8EB7,0x8EB8,0x8EB9,0x8EBB,/* 0x70-0x77 */ + 0x8EBC,0x8EBD,0x8EBE,0x8EBF,0x8EC0,0x8EC1,0x8EC2,0x0000,/* 0x78-0x7F */ + + 0x8EC3,0x8EC4,0x8EC5,0x8EC6,0x8EC7,0x8EC8,0x8EC9,0x8ECA,/* 0x80-0x87 */ + 0x8ECB,0x8ECC,0x8ECD,0x8ECF,0x8ED0,0x8ED1,0x8ED2,0x8ED3,/* 0x88-0x8F */ + 0x8ED4,0x8ED5,0x8ED6,0x8ED7,0x8ED8,0x8ED9,0x8EDA,0x8EDB,/* 0x90-0x97 */ + 0x8EDC,0x8EDD,0x8EDE,0x8EDF,0x8EE0,0x8EE1,0x8EE2,0x8EE3,/* 0x98-0x9F */ + 0x8EE4,0x580B,0x580D,0x57FD,0x57ED,0x5800,0x581E,0x5819,/* 0xA0-0xA7 */ + 0x5844,0x5820,0x5865,0x586C,0x5881,0x5889,0x589A,0x5880,/* 0xA8-0xAF */ + 0x99A8,0x9F19,0x61FF,0x8279,0x827D,0x827F,0x828F,0x828A,/* 0xB0-0xB7 */ + 0x82A8,0x8284,0x828E,0x8291,0x8297,0x8299,0x82AB,0x82B8,/* 0xB8-0xBF */ + 0x82BE,0x82B0,0x82C8,0x82CA,0x82E3,0x8298,0x82B7,0x82AE,/* 0xC0-0xC7 */ + 0x82CB,0x82CC,0x82C1,0x82A9,0x82B4,0x82A1,0x82AA,0x829F,/* 0xC8-0xCF */ + 0x82C4,0x82CE,0x82A4,0x82E1,0x8309,0x82F7,0x82E4,0x830F,/* 0xD0-0xD7 */ + 0x8307,0x82DC,0x82F4,0x82D2,0x82D8,0x830C,0x82FB,0x82D3,/* 0xD8-0xDF */ + 0x8311,0x831A,0x8306,0x8314,0x8315,0x82E0,0x82D5,0x831C,/* 0xE0-0xE7 */ + 0x8351,0x835B,0x835C,0x8308,0x8392,0x833C,0x8334,0x8331,/* 0xE8-0xEF */ + 0x839B,0x835E,0x832F,0x834F,0x8347,0x8343,0x835F,0x8340,/* 0xF0-0xF7 */ + 0x8317,0x8360,0x832D,0x833A,0x8333,0x8366,0x8365,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8EE5,0x8EE6,0x8EE7,0x8EE8,0x8EE9,0x8EEA,0x8EEB,0x8EEC,/* 0x40-0x47 */ + 0x8EED,0x8EEE,0x8EEF,0x8EF0,0x8EF1,0x8EF2,0x8EF3,0x8EF4,/* 0x48-0x4F */ + 0x8EF5,0x8EF6,0x8EF7,0x8EF8,0x8EF9,0x8EFA,0x8EFB,0x8EFC,/* 0x50-0x57 */ + 0x8EFD,0x8EFE,0x8EFF,0x8F00,0x8F01,0x8F02,0x8F03,0x8F04,/* 0x58-0x5F */ + 0x8F05,0x8F06,0x8F07,0x8F08,0x8F09,0x8F0A,0x8F0B,0x8F0C,/* 0x60-0x67 */ + 0x8F0D,0x8F0E,0x8F0F,0x8F10,0x8F11,0x8F12,0x8F13,0x8F14,/* 0x68-0x6F */ + 0x8F15,0x8F16,0x8F17,0x8F18,0x8F19,0x8F1A,0x8F1B,0x8F1C,/* 0x70-0x77 */ + 0x8F1D,0x8F1E,0x8F1F,0x8F20,0x8F21,0x8F22,0x8F23,0x0000,/* 0x78-0x7F */ + + 0x8F24,0x8F25,0x8F26,0x8F27,0x8F28,0x8F29,0x8F2A,0x8F2B,/* 0x80-0x87 */ + 0x8F2C,0x8F2D,0x8F2E,0x8F2F,0x8F30,0x8F31,0x8F32,0x8F33,/* 0x88-0x8F */ + 0x8F34,0x8F35,0x8F36,0x8F37,0x8F38,0x8F39,0x8F3A,0x8F3B,/* 0x90-0x97 */ + 0x8F3C,0x8F3D,0x8F3E,0x8F3F,0x8F40,0x8F41,0x8F42,0x8F43,/* 0x98-0x9F */ + 0x8F44,0x8368,0x831B,0x8369,0x836C,0x836A,0x836D,0x836E,/* 0xA0-0xA7 */ + 0x83B0,0x8378,0x83B3,0x83B4,0x83A0,0x83AA,0x8393,0x839C,/* 0xA8-0xAF */ + 0x8385,0x837C,0x83B6,0x83A9,0x837D,0x83B8,0x837B,0x8398,/* 0xB0-0xB7 */ + 0x839E,0x83A8,0x83BA,0x83BC,0x83C1,0x8401,0x83E5,0x83D8,/* 0xB8-0xBF */ + 0x5807,0x8418,0x840B,0x83DD,0x83FD,0x83D6,0x841C,0x8438,/* 0xC0-0xC7 */ + 0x8411,0x8406,0x83D4,0x83DF,0x840F,0x8403,0x83F8,0x83F9,/* 0xC8-0xCF */ + 0x83EA,0x83C5,0x83C0,0x8426,0x83F0,0x83E1,0x845C,0x8451,/* 0xD0-0xD7 */ + 0x845A,0x8459,0x8473,0x8487,0x8488,0x847A,0x8489,0x8478,/* 0xD8-0xDF */ + 0x843C,0x8446,0x8469,0x8476,0x848C,0x848E,0x8431,0x846D,/* 0xE0-0xE7 */ + 0x84C1,0x84CD,0x84D0,0x84E6,0x84BD,0x84D3,0x84CA,0x84BF,/* 0xE8-0xEF */ + 0x84BA,0x84E0,0x84A1,0x84B9,0x84B4,0x8497,0x84E5,0x84E3,/* 0xF0-0xF7 */ + 0x850C,0x750D,0x8538,0x84F0,0x8539,0x851F,0x853A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F45,0x8F46,0x8F47,0x8F48,0x8F49,0x8F4A,0x8F4B,0x8F4C,/* 0x40-0x47 */ + 0x8F4D,0x8F4E,0x8F4F,0x8F50,0x8F51,0x8F52,0x8F53,0x8F54,/* 0x48-0x4F */ + 0x8F55,0x8F56,0x8F57,0x8F58,0x8F59,0x8F5A,0x8F5B,0x8F5C,/* 0x50-0x57 */ + 0x8F5D,0x8F5E,0x8F5F,0x8F60,0x8F61,0x8F62,0x8F63,0x8F64,/* 0x58-0x5F */ + 0x8F65,0x8F6A,0x8F80,0x8F8C,0x8F92,0x8F9D,0x8FA0,0x8FA1,/* 0x60-0x67 */ + 0x8FA2,0x8FA4,0x8FA5,0x8FA6,0x8FA7,0x8FAA,0x8FAC,0x8FAD,/* 0x68-0x6F */ + 0x8FAE,0x8FAF,0x8FB2,0x8FB3,0x8FB4,0x8FB5,0x8FB7,0x8FB8,/* 0x70-0x77 */ + 0x8FBA,0x8FBB,0x8FBC,0x8FBF,0x8FC0,0x8FC3,0x8FC6,0x0000,/* 0x78-0x7F */ + + 0x8FC9,0x8FCA,0x8FCB,0x8FCC,0x8FCD,0x8FCF,0x8FD2,0x8FD6,/* 0x80-0x87 */ + 0x8FD7,0x8FDA,0x8FE0,0x8FE1,0x8FE3,0x8FE7,0x8FEC,0x8FEF,/* 0x88-0x8F */ + 0x8FF1,0x8FF2,0x8FF4,0x8FF5,0x8FF6,0x8FFA,0x8FFB,0x8FFC,/* 0x90-0x97 */ + 0x8FFE,0x8FFF,0x9007,0x9008,0x900C,0x900E,0x9013,0x9015,/* 0x98-0x9F */ + 0x9018,0x8556,0x853B,0x84FF,0x84FC,0x8559,0x8548,0x8568,/* 0xA0-0xA7 */ + 0x8564,0x855E,0x857A,0x77A2,0x8543,0x8572,0x857B,0x85A4,/* 0xA8-0xAF */ + 0x85A8,0x8587,0x858F,0x8579,0x85AE,0x859C,0x8585,0x85B9,/* 0xB0-0xB7 */ + 0x85B7,0x85B0,0x85D3,0x85C1,0x85DC,0x85FF,0x8627,0x8605,/* 0xB8-0xBF */ + 0x8629,0x8616,0x863C,0x5EFE,0x5F08,0x593C,0x5941,0x8037,/* 0xC0-0xC7 */ + 0x5955,0x595A,0x5958,0x530F,0x5C22,0x5C25,0x5C2C,0x5C34,/* 0xC8-0xCF */ + 0x624C,0x626A,0x629F,0x62BB,0x62CA,0x62DA,0x62D7,0x62EE,/* 0xD0-0xD7 */ + 0x6322,0x62F6,0x6339,0x634B,0x6343,0x63AD,0x63F6,0x6371,/* 0xD8-0xDF */ + 0x637A,0x638E,0x63B4,0x636D,0x63AC,0x638A,0x6369,0x63AE,/* 0xE0-0xE7 */ + 0x63BC,0x63F2,0x63F8,0x63E0,0x63FF,0x63C4,0x63DE,0x63CE,/* 0xE8-0xEF */ + 0x6452,0x63C6,0x63BE,0x6445,0x6441,0x640B,0x641B,0x6420,/* 0xF0-0xF7 */ + 0x640C,0x6426,0x6421,0x645E,0x6484,0x646D,0x6496,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9019,0x901C,0x9023,0x9024,0x9025,0x9027,0x9028,0x9029,/* 0x40-0x47 */ + 0x902A,0x902B,0x902C,0x9030,0x9031,0x9032,0x9033,0x9034,/* 0x48-0x4F */ + 0x9037,0x9039,0x903A,0x903D,0x903F,0x9040,0x9043,0x9045,/* 0x50-0x57 */ + 0x9046,0x9048,0x9049,0x904A,0x904B,0x904C,0x904E,0x9054,/* 0x58-0x5F */ + 0x9055,0x9056,0x9059,0x905A,0x905C,0x905D,0x905E,0x905F,/* 0x60-0x67 */ + 0x9060,0x9061,0x9064,0x9066,0x9067,0x9069,0x906A,0x906B,/* 0x68-0x6F */ + 0x906C,0x906F,0x9070,0x9071,0x9072,0x9073,0x9076,0x9077,/* 0x70-0x77 */ + 0x9078,0x9079,0x907A,0x907B,0x907C,0x907E,0x9081,0x0000,/* 0x78-0x7F */ + + 0x9084,0x9085,0x9086,0x9087,0x9089,0x908A,0x908C,0x908D,/* 0x80-0x87 */ + 0x908E,0x908F,0x9090,0x9092,0x9094,0x9096,0x9098,0x909A,/* 0x88-0x8F */ + 0x909C,0x909E,0x909F,0x90A0,0x90A4,0x90A5,0x90A7,0x90A8,/* 0x90-0x97 */ + 0x90A9,0x90AB,0x90AD,0x90B2,0x90B7,0x90BC,0x90BD,0x90BF,/* 0x98-0x9F */ + 0x90C0,0x647A,0x64B7,0x64B8,0x6499,0x64BA,0x64C0,0x64D0,/* 0xA0-0xA7 */ + 0x64D7,0x64E4,0x64E2,0x6509,0x6525,0x652E,0x5F0B,0x5FD2,/* 0xA8-0xAF */ + 0x7519,0x5F11,0x535F,0x53F1,0x53FD,0x53E9,0x53E8,0x53FB,/* 0xB0-0xB7 */ + 0x5412,0x5416,0x5406,0x544B,0x5452,0x5453,0x5454,0x5456,/* 0xB8-0xBF */ + 0x5443,0x5421,0x5457,0x5459,0x5423,0x5432,0x5482,0x5494,/* 0xC0-0xC7 */ + 0x5477,0x5471,0x5464,0x549A,0x549B,0x5484,0x5476,0x5466,/* 0xC8-0xCF */ + 0x549D,0x54D0,0x54AD,0x54C2,0x54B4,0x54D2,0x54A7,0x54A6,/* 0xD0-0xD7 */ + 0x54D3,0x54D4,0x5472,0x54A3,0x54D5,0x54BB,0x54BF,0x54CC,/* 0xD8-0xDF */ + 0x54D9,0x54DA,0x54DC,0x54A9,0x54AA,0x54A4,0x54DD,0x54CF,/* 0xE0-0xE7 */ + 0x54DE,0x551B,0x54E7,0x5520,0x54FD,0x5514,0x54F3,0x5522,/* 0xE8-0xEF */ + 0x5523,0x550F,0x5511,0x5527,0x552A,0x5567,0x558F,0x55B5,/* 0xF0-0xF7 */ + 0x5549,0x556D,0x5541,0x5555,0x553F,0x5550,0x553C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x90C2,0x90C3,0x90C6,0x90C8,0x90C9,0x90CB,0x90CC,0x90CD,/* 0x40-0x47 */ + 0x90D2,0x90D4,0x90D5,0x90D6,0x90D8,0x90D9,0x90DA,0x90DE,/* 0x48-0x4F */ + 0x90DF,0x90E0,0x90E3,0x90E4,0x90E5,0x90E9,0x90EA,0x90EC,/* 0x50-0x57 */ + 0x90EE,0x90F0,0x90F1,0x90F2,0x90F3,0x90F5,0x90F6,0x90F7,/* 0x58-0x5F */ + 0x90F9,0x90FA,0x90FB,0x90FC,0x90FF,0x9100,0x9101,0x9103,/* 0x60-0x67 */ + 0x9105,0x9106,0x9107,0x9108,0x9109,0x910A,0x910B,0x910C,/* 0x68-0x6F */ + 0x910D,0x910E,0x910F,0x9110,0x9111,0x9112,0x9113,0x9114,/* 0x70-0x77 */ + 0x9115,0x9116,0x9117,0x9118,0x911A,0x911B,0x911C,0x0000,/* 0x78-0x7F */ + + 0x911D,0x911F,0x9120,0x9121,0x9124,0x9125,0x9126,0x9127,/* 0x80-0x87 */ + 0x9128,0x9129,0x912A,0x912B,0x912C,0x912D,0x912E,0x9130,/* 0x88-0x8F */ + 0x9132,0x9133,0x9134,0x9135,0x9136,0x9137,0x9138,0x913A,/* 0x90-0x97 */ + 0x913B,0x913C,0x913D,0x913E,0x913F,0x9140,0x9141,0x9142,/* 0x98-0x9F */ + 0x9144,0x5537,0x5556,0x5575,0x5576,0x5577,0x5533,0x5530,/* 0xA0-0xA7 */ + 0x555C,0x558B,0x55D2,0x5583,0x55B1,0x55B9,0x5588,0x5581,/* 0xA8-0xAF */ + 0x559F,0x557E,0x55D6,0x5591,0x557B,0x55DF,0x55BD,0x55BE,/* 0xB0-0xB7 */ + 0x5594,0x5599,0x55EA,0x55F7,0x55C9,0x561F,0x55D1,0x55EB,/* 0xB8-0xBF */ + 0x55EC,0x55D4,0x55E6,0x55DD,0x55C4,0x55EF,0x55E5,0x55F2,/* 0xC0-0xC7 */ + 0x55F3,0x55CC,0x55CD,0x55E8,0x55F5,0x55E4,0x8F94,0x561E,/* 0xC8-0xCF */ + 0x5608,0x560C,0x5601,0x5624,0x5623,0x55FE,0x5600,0x5627,/* 0xD0-0xD7 */ + 0x562D,0x5658,0x5639,0x5657,0x562C,0x564D,0x5662,0x5659,/* 0xD8-0xDF */ + 0x565C,0x564C,0x5654,0x5686,0x5664,0x5671,0x566B,0x567B,/* 0xE0-0xE7 */ + 0x567C,0x5685,0x5693,0x56AF,0x56D4,0x56D7,0x56DD,0x56E1,/* 0xE8-0xEF */ + 0x56F5,0x56EB,0x56F9,0x56FF,0x5704,0x570A,0x5709,0x571C,/* 0xF0-0xF7 */ + 0x5E0F,0x5E19,0x5E14,0x5E11,0x5E31,0x5E3B,0x5E3C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9145,0x9147,0x9148,0x9151,0x9153,0x9154,0x9155,0x9156,/* 0x40-0x47 */ + 0x9158,0x9159,0x915B,0x915C,0x915F,0x9160,0x9166,0x9167,/* 0x48-0x4F */ + 0x9168,0x916B,0x916D,0x9173,0x917A,0x917B,0x917C,0x9180,/* 0x50-0x57 */ + 0x9181,0x9182,0x9183,0x9184,0x9186,0x9188,0x918A,0x918E,/* 0x58-0x5F */ + 0x918F,0x9193,0x9194,0x9195,0x9196,0x9197,0x9198,0x9199,/* 0x60-0x67 */ + 0x919C,0x919D,0x919E,0x919F,0x91A0,0x91A1,0x91A4,0x91A5,/* 0x68-0x6F */ + 0x91A6,0x91A7,0x91A8,0x91A9,0x91AB,0x91AC,0x91B0,0x91B1,/* 0x70-0x77 */ + 0x91B2,0x91B3,0x91B6,0x91B7,0x91B8,0x91B9,0x91BB,0x0000,/* 0x78-0x7F */ + + 0x91BC,0x91BD,0x91BE,0x91BF,0x91C0,0x91C1,0x91C2,0x91C3,/* 0x80-0x87 */ + 0x91C4,0x91C5,0x91C6,0x91C8,0x91CB,0x91D0,0x91D2,0x91D3,/* 0x88-0x8F */ + 0x91D4,0x91D5,0x91D6,0x91D7,0x91D8,0x91D9,0x91DA,0x91DB,/* 0x90-0x97 */ + 0x91DD,0x91DE,0x91DF,0x91E0,0x91E1,0x91E2,0x91E3,0x91E4,/* 0x98-0x9F */ + 0x91E5,0x5E37,0x5E44,0x5E54,0x5E5B,0x5E5E,0x5E61,0x5C8C,/* 0xA0-0xA7 */ + 0x5C7A,0x5C8D,0x5C90,0x5C96,0x5C88,0x5C98,0x5C99,0x5C91,/* 0xA8-0xAF */ + 0x5C9A,0x5C9C,0x5CB5,0x5CA2,0x5CBD,0x5CAC,0x5CAB,0x5CB1,/* 0xB0-0xB7 */ + 0x5CA3,0x5CC1,0x5CB7,0x5CC4,0x5CD2,0x5CE4,0x5CCB,0x5CE5,/* 0xB8-0xBF */ + 0x5D02,0x5D03,0x5D27,0x5D26,0x5D2E,0x5D24,0x5D1E,0x5D06,/* 0xC0-0xC7 */ + 0x5D1B,0x5D58,0x5D3E,0x5D34,0x5D3D,0x5D6C,0x5D5B,0x5D6F,/* 0xC8-0xCF */ + 0x5D5D,0x5D6B,0x5D4B,0x5D4A,0x5D69,0x5D74,0x5D82,0x5D99,/* 0xD0-0xD7 */ + 0x5D9D,0x8C73,0x5DB7,0x5DC5,0x5F73,0x5F77,0x5F82,0x5F87,/* 0xD8-0xDF */ + 0x5F89,0x5F8C,0x5F95,0x5F99,0x5F9C,0x5FA8,0x5FAD,0x5FB5,/* 0xE0-0xE7 */ + 0x5FBC,0x8862,0x5F61,0x72AD,0x72B0,0x72B4,0x72B7,0x72B8,/* 0xE8-0xEF */ + 0x72C3,0x72C1,0x72CE,0x72CD,0x72D2,0x72E8,0x72EF,0x72E9,/* 0xF0-0xF7 */ + 0x72F2,0x72F4,0x72F7,0x7301,0x72F3,0x7303,0x72FA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x91E6,0x91E7,0x91E8,0x91E9,0x91EA,0x91EB,0x91EC,0x91ED,/* 0x40-0x47 */ + 0x91EE,0x91EF,0x91F0,0x91F1,0x91F2,0x91F3,0x91F4,0x91F5,/* 0x48-0x4F */ + 0x91F6,0x91F7,0x91F8,0x91F9,0x91FA,0x91FB,0x91FC,0x91FD,/* 0x50-0x57 */ + 0x91FE,0x91FF,0x9200,0x9201,0x9202,0x9203,0x9204,0x9205,/* 0x58-0x5F */ + 0x9206,0x9207,0x9208,0x9209,0x920A,0x920B,0x920C,0x920D,/* 0x60-0x67 */ + 0x920E,0x920F,0x9210,0x9211,0x9212,0x9213,0x9214,0x9215,/* 0x68-0x6F */ + 0x9216,0x9217,0x9218,0x9219,0x921A,0x921B,0x921C,0x921D,/* 0x70-0x77 */ + 0x921E,0x921F,0x9220,0x9221,0x9222,0x9223,0x9224,0x0000,/* 0x78-0x7F */ + + 0x9225,0x9226,0x9227,0x9228,0x9229,0x922A,0x922B,0x922C,/* 0x80-0x87 */ + 0x922D,0x922E,0x922F,0x9230,0x9231,0x9232,0x9233,0x9234,/* 0x88-0x8F */ + 0x9235,0x9236,0x9237,0x9238,0x9239,0x923A,0x923B,0x923C,/* 0x90-0x97 */ + 0x923D,0x923E,0x923F,0x9240,0x9241,0x9242,0x9243,0x9244,/* 0x98-0x9F */ + 0x9245,0x72FB,0x7317,0x7313,0x7321,0x730A,0x731E,0x731D,/* 0xA0-0xA7 */ + 0x7315,0x7322,0x7339,0x7325,0x732C,0x7338,0x7331,0x7350,/* 0xA8-0xAF */ + 0x734D,0x7357,0x7360,0x736C,0x736F,0x737E,0x821B,0x5925,/* 0xB0-0xB7 */ + 0x98E7,0x5924,0x5902,0x9963,0x9967,0x9968,0x9969,0x996A,/* 0xB8-0xBF */ + 0x996B,0x996C,0x9974,0x9977,0x997D,0x9980,0x9984,0x9987,/* 0xC0-0xC7 */ + 0x998A,0x998D,0x9990,0x9991,0x9993,0x9994,0x9995,0x5E80,/* 0xC8-0xCF */ + 0x5E91,0x5E8B,0x5E96,0x5EA5,0x5EA0,0x5EB9,0x5EB5,0x5EBE,/* 0xD0-0xD7 */ + 0x5EB3,0x8D53,0x5ED2,0x5ED1,0x5EDB,0x5EE8,0x5EEA,0x81BA,/* 0xD8-0xDF */ + 0x5FC4,0x5FC9,0x5FD6,0x5FCF,0x6003,0x5FEE,0x6004,0x5FE1,/* 0xE0-0xE7 */ + 0x5FE4,0x5FFE,0x6005,0x6006,0x5FEA,0x5FED,0x5FF8,0x6019,/* 0xE8-0xEF */ + 0x6035,0x6026,0x601B,0x600F,0x600D,0x6029,0x602B,0x600A,/* 0xF0-0xF7 */ + 0x603F,0x6021,0x6078,0x6079,0x607B,0x607A,0x6042,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9246,0x9247,0x9248,0x9249,0x924A,0x924B,0x924C,0x924D,/* 0x40-0x47 */ + 0x924E,0x924F,0x9250,0x9251,0x9252,0x9253,0x9254,0x9255,/* 0x48-0x4F */ + 0x9256,0x9257,0x9258,0x9259,0x925A,0x925B,0x925C,0x925D,/* 0x50-0x57 */ + 0x925E,0x925F,0x9260,0x9261,0x9262,0x9263,0x9264,0x9265,/* 0x58-0x5F */ + 0x9266,0x9267,0x9268,0x9269,0x926A,0x926B,0x926C,0x926D,/* 0x60-0x67 */ + 0x926E,0x926F,0x9270,0x9271,0x9272,0x9273,0x9275,0x9276,/* 0x68-0x6F */ + 0x9277,0x9278,0x9279,0x927A,0x927B,0x927C,0x927D,0x927E,/* 0x70-0x77 */ + 0x927F,0x9280,0x9281,0x9282,0x9283,0x9284,0x9285,0x0000,/* 0x78-0x7F */ + + 0x9286,0x9287,0x9288,0x9289,0x928A,0x928B,0x928C,0x928D,/* 0x80-0x87 */ + 0x928F,0x9290,0x9291,0x9292,0x9293,0x9294,0x9295,0x9296,/* 0x88-0x8F */ + 0x9297,0x9298,0x9299,0x929A,0x929B,0x929C,0x929D,0x929E,/* 0x90-0x97 */ + 0x929F,0x92A0,0x92A1,0x92A2,0x92A3,0x92A4,0x92A5,0x92A6,/* 0x98-0x9F */ + 0x92A7,0x606A,0x607D,0x6096,0x609A,0x60AD,0x609D,0x6083,/* 0xA0-0xA7 */ + 0x6092,0x608C,0x609B,0x60EC,0x60BB,0x60B1,0x60DD,0x60D8,/* 0xA8-0xAF */ + 0x60C6,0x60DA,0x60B4,0x6120,0x6126,0x6115,0x6123,0x60F4,/* 0xB0-0xB7 */ + 0x6100,0x610E,0x612B,0x614A,0x6175,0x61AC,0x6194,0x61A7,/* 0xB8-0xBF */ + 0x61B7,0x61D4,0x61F5,0x5FDD,0x96B3,0x95E9,0x95EB,0x95F1,/* 0xC0-0xC7 */ + 0x95F3,0x95F5,0x95F6,0x95FC,0x95FE,0x9603,0x9604,0x9606,/* 0xC8-0xCF */ + 0x9608,0x960A,0x960B,0x960C,0x960D,0x960F,0x9612,0x9615,/* 0xD0-0xD7 */ + 0x9616,0x9617,0x9619,0x961A,0x4E2C,0x723F,0x6215,0x6C35,/* 0xD8-0xDF */ + 0x6C54,0x6C5C,0x6C4A,0x6CA3,0x6C85,0x6C90,0x6C94,0x6C8C,/* 0xE0-0xE7 */ + 0x6C68,0x6C69,0x6C74,0x6C76,0x6C86,0x6CA9,0x6CD0,0x6CD4,/* 0xE8-0xEF */ + 0x6CAD,0x6CF7,0x6CF8,0x6CF1,0x6CD7,0x6CB2,0x6CE0,0x6CD6,/* 0xF0-0xF7 */ + 0x6CFA,0x6CEB,0x6CEE,0x6CB1,0x6CD3,0x6CEF,0x6CFE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x92A8,0x92A9,0x92AA,0x92AB,0x92AC,0x92AD,0x92AF,0x92B0,/* 0x40-0x47 */ + 0x92B1,0x92B2,0x92B3,0x92B4,0x92B5,0x92B6,0x92B7,0x92B8,/* 0x48-0x4F */ + 0x92B9,0x92BA,0x92BB,0x92BC,0x92BD,0x92BE,0x92BF,0x92C0,/* 0x50-0x57 */ + 0x92C1,0x92C2,0x92C3,0x92C4,0x92C5,0x92C6,0x92C7,0x92C9,/* 0x58-0x5F */ + 0x92CA,0x92CB,0x92CC,0x92CD,0x92CE,0x92CF,0x92D0,0x92D1,/* 0x60-0x67 */ + 0x92D2,0x92D3,0x92D4,0x92D5,0x92D6,0x92D7,0x92D8,0x92D9,/* 0x68-0x6F */ + 0x92DA,0x92DB,0x92DC,0x92DD,0x92DE,0x92DF,0x92E0,0x92E1,/* 0x70-0x77 */ + 0x92E2,0x92E3,0x92E4,0x92E5,0x92E6,0x92E7,0x92E8,0x0000,/* 0x78-0x7F */ + + 0x92E9,0x92EA,0x92EB,0x92EC,0x92ED,0x92EE,0x92EF,0x92F0,/* 0x80-0x87 */ + 0x92F1,0x92F2,0x92F3,0x92F4,0x92F5,0x92F6,0x92F7,0x92F8,/* 0x88-0x8F */ + 0x92F9,0x92FA,0x92FB,0x92FC,0x92FD,0x92FE,0x92FF,0x9300,/* 0x90-0x97 */ + 0x9301,0x9302,0x9303,0x9304,0x9305,0x9306,0x9307,0x9308,/* 0x98-0x9F */ + 0x9309,0x6D39,0x6D27,0x6D0C,0x6D43,0x6D48,0x6D07,0x6D04,/* 0xA0-0xA7 */ + 0x6D19,0x6D0E,0x6D2B,0x6D4D,0x6D2E,0x6D35,0x6D1A,0x6D4F,/* 0xA8-0xAF */ + 0x6D52,0x6D54,0x6D33,0x6D91,0x6D6F,0x6D9E,0x6DA0,0x6D5E,/* 0xB0-0xB7 */ + 0x6D93,0x6D94,0x6D5C,0x6D60,0x6D7C,0x6D63,0x6E1A,0x6DC7,/* 0xB8-0xBF */ + 0x6DC5,0x6DDE,0x6E0E,0x6DBF,0x6DE0,0x6E11,0x6DE6,0x6DDD,/* 0xC0-0xC7 */ + 0x6DD9,0x6E16,0x6DAB,0x6E0C,0x6DAE,0x6E2B,0x6E6E,0x6E4E,/* 0xC8-0xCF */ + 0x6E6B,0x6EB2,0x6E5F,0x6E86,0x6E53,0x6E54,0x6E32,0x6E25,/* 0xD0-0xD7 */ + 0x6E44,0x6EDF,0x6EB1,0x6E98,0x6EE0,0x6F2D,0x6EE2,0x6EA5,/* 0xD8-0xDF */ + 0x6EA7,0x6EBD,0x6EBB,0x6EB7,0x6ED7,0x6EB4,0x6ECF,0x6E8F,/* 0xE0-0xE7 */ + 0x6EC2,0x6E9F,0x6F62,0x6F46,0x6F47,0x6F24,0x6F15,0x6EF9,/* 0xE8-0xEF */ + 0x6F2F,0x6F36,0x6F4B,0x6F74,0x6F2A,0x6F09,0x6F29,0x6F89,/* 0xF0-0xF7 */ + 0x6F8D,0x6F8C,0x6F78,0x6F72,0x6F7C,0x6F7A,0x6FD1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x930A,0x930B,0x930C,0x930D,0x930E,0x930F,0x9310,0x9311,/* 0x40-0x47 */ + 0x9312,0x9313,0x9314,0x9315,0x9316,0x9317,0x9318,0x9319,/* 0x48-0x4F */ + 0x931A,0x931B,0x931C,0x931D,0x931E,0x931F,0x9320,0x9321,/* 0x50-0x57 */ + 0x9322,0x9323,0x9324,0x9325,0x9326,0x9327,0x9328,0x9329,/* 0x58-0x5F */ + 0x932A,0x932B,0x932C,0x932D,0x932E,0x932F,0x9330,0x9331,/* 0x60-0x67 */ + 0x9332,0x9333,0x9334,0x9335,0x9336,0x9337,0x9338,0x9339,/* 0x68-0x6F */ + 0x933A,0x933B,0x933C,0x933D,0x933F,0x9340,0x9341,0x9342,/* 0x70-0x77 */ + 0x9343,0x9344,0x9345,0x9346,0x9347,0x9348,0x9349,0x0000,/* 0x78-0x7F */ + + 0x934A,0x934B,0x934C,0x934D,0x934E,0x934F,0x9350,0x9351,/* 0x80-0x87 */ + 0x9352,0x9353,0x9354,0x9355,0x9356,0x9357,0x9358,0x9359,/* 0x88-0x8F */ + 0x935A,0x935B,0x935C,0x935D,0x935E,0x935F,0x9360,0x9361,/* 0x90-0x97 */ + 0x9362,0x9363,0x9364,0x9365,0x9366,0x9367,0x9368,0x9369,/* 0x98-0x9F */ + 0x936B,0x6FC9,0x6FA7,0x6FB9,0x6FB6,0x6FC2,0x6FE1,0x6FEE,/* 0xA0-0xA7 */ + 0x6FDE,0x6FE0,0x6FEF,0x701A,0x7023,0x701B,0x7039,0x7035,/* 0xA8-0xAF */ + 0x704F,0x705E,0x5B80,0x5B84,0x5B95,0x5B93,0x5BA5,0x5BB8,/* 0xB0-0xB7 */ + 0x752F,0x9A9E,0x6434,0x5BE4,0x5BEE,0x8930,0x5BF0,0x8E47,/* 0xB8-0xBF */ + 0x8B07,0x8FB6,0x8FD3,0x8FD5,0x8FE5,0x8FEE,0x8FE4,0x8FE9,/* 0xC0-0xC7 */ + 0x8FE6,0x8FF3,0x8FE8,0x9005,0x9004,0x900B,0x9026,0x9011,/* 0xC8-0xCF */ + 0x900D,0x9016,0x9021,0x9035,0x9036,0x902D,0x902F,0x9044,/* 0xD0-0xD7 */ + 0x9051,0x9052,0x9050,0x9068,0x9058,0x9062,0x905B,0x66B9,/* 0xD8-0xDF */ + 0x9074,0x907D,0x9082,0x9088,0x9083,0x908B,0x5F50,0x5F57,/* 0xE0-0xE7 */ + 0x5F56,0x5F58,0x5C3B,0x54AB,0x5C50,0x5C59,0x5B71,0x5C63,/* 0xE8-0xEF */ + 0x5C66,0x7FBC,0x5F2A,0x5F29,0x5F2D,0x8274,0x5F3C,0x9B3B,/* 0xF0-0xF7 */ + 0x5C6E,0x5981,0x5983,0x598D,0x59A9,0x59AA,0x59A3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x936C,0x936D,0x936E,0x936F,0x9370,0x9371,0x9372,0x9373,/* 0x40-0x47 */ + 0x9374,0x9375,0x9376,0x9377,0x9378,0x9379,0x937A,0x937B,/* 0x48-0x4F */ + 0x937C,0x937D,0x937E,0x937F,0x9380,0x9381,0x9382,0x9383,/* 0x50-0x57 */ + 0x9384,0x9385,0x9386,0x9387,0x9388,0x9389,0x938A,0x938B,/* 0x58-0x5F */ + 0x938C,0x938D,0x938E,0x9390,0x9391,0x9392,0x9393,0x9394,/* 0x60-0x67 */ + 0x9395,0x9396,0x9397,0x9398,0x9399,0x939A,0x939B,0x939C,/* 0x68-0x6F */ + 0x939D,0x939E,0x939F,0x93A0,0x93A1,0x93A2,0x93A3,0x93A4,/* 0x70-0x77 */ + 0x93A5,0x93A6,0x93A7,0x93A8,0x93A9,0x93AA,0x93AB,0x0000,/* 0x78-0x7F */ + + 0x93AC,0x93AD,0x93AE,0x93AF,0x93B0,0x93B1,0x93B2,0x93B3,/* 0x80-0x87 */ + 0x93B4,0x93B5,0x93B6,0x93B7,0x93B8,0x93B9,0x93BA,0x93BB,/* 0x88-0x8F */ + 0x93BC,0x93BD,0x93BE,0x93BF,0x93C0,0x93C1,0x93C2,0x93C3,/* 0x90-0x97 */ + 0x93C4,0x93C5,0x93C6,0x93C7,0x93C8,0x93C9,0x93CB,0x93CC,/* 0x98-0x9F */ + 0x93CD,0x5997,0x59CA,0x59AB,0x599E,0x59A4,0x59D2,0x59B2,/* 0xA0-0xA7 */ + 0x59AF,0x59D7,0x59BE,0x5A05,0x5A06,0x59DD,0x5A08,0x59E3,/* 0xA8-0xAF */ + 0x59D8,0x59F9,0x5A0C,0x5A09,0x5A32,0x5A34,0x5A11,0x5A23,/* 0xB0-0xB7 */ + 0x5A13,0x5A40,0x5A67,0x5A4A,0x5A55,0x5A3C,0x5A62,0x5A75,/* 0xB8-0xBF */ + 0x80EC,0x5AAA,0x5A9B,0x5A77,0x5A7A,0x5ABE,0x5AEB,0x5AB2,/* 0xC0-0xC7 */ + 0x5AD2,0x5AD4,0x5AB8,0x5AE0,0x5AE3,0x5AF1,0x5AD6,0x5AE6,/* 0xC8-0xCF */ + 0x5AD8,0x5ADC,0x5B09,0x5B17,0x5B16,0x5B32,0x5B37,0x5B40,/* 0xD0-0xD7 */ + 0x5C15,0x5C1C,0x5B5A,0x5B65,0x5B73,0x5B51,0x5B53,0x5B62,/* 0xD8-0xDF */ + 0x9A75,0x9A77,0x9A78,0x9A7A,0x9A7F,0x9A7D,0x9A80,0x9A81,/* 0xE0-0xE7 */ + 0x9A85,0x9A88,0x9A8A,0x9A90,0x9A92,0x9A93,0x9A96,0x9A98,/* 0xE8-0xEF */ + 0x9A9B,0x9A9C,0x9A9D,0x9A9F,0x9AA0,0x9AA2,0x9AA3,0x9AA5,/* 0xF0-0xF7 */ + 0x9AA7,0x7E9F,0x7EA1,0x7EA3,0x7EA5,0x7EA8,0x7EA9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x93CE,0x93CF,0x93D0,0x93D1,0x93D2,0x93D3,0x93D4,0x93D5,/* 0x40-0x47 */ + 0x93D7,0x93D8,0x93D9,0x93DA,0x93DB,0x93DC,0x93DD,0x93DE,/* 0x48-0x4F */ + 0x93DF,0x93E0,0x93E1,0x93E2,0x93E3,0x93E4,0x93E5,0x93E6,/* 0x50-0x57 */ + 0x93E7,0x93E8,0x93E9,0x93EA,0x93EB,0x93EC,0x93ED,0x93EE,/* 0x58-0x5F */ + 0x93EF,0x93F0,0x93F1,0x93F2,0x93F3,0x93F4,0x93F5,0x93F6,/* 0x60-0x67 */ + 0x93F7,0x93F8,0x93F9,0x93FA,0x93FB,0x93FC,0x93FD,0x93FE,/* 0x68-0x6F */ + 0x93FF,0x9400,0x9401,0x9402,0x9403,0x9404,0x9405,0x9406,/* 0x70-0x77 */ + 0x9407,0x9408,0x9409,0x940A,0x940B,0x940C,0x940D,0x0000,/* 0x78-0x7F */ + + 0x940E,0x940F,0x9410,0x9411,0x9412,0x9413,0x9414,0x9415,/* 0x80-0x87 */ + 0x9416,0x9417,0x9418,0x9419,0x941A,0x941B,0x941C,0x941D,/* 0x88-0x8F */ + 0x941E,0x941F,0x9420,0x9421,0x9422,0x9423,0x9424,0x9425,/* 0x90-0x97 */ + 0x9426,0x9427,0x9428,0x9429,0x942A,0x942B,0x942C,0x942D,/* 0x98-0x9F */ + 0x942E,0x7EAD,0x7EB0,0x7EBE,0x7EC0,0x7EC1,0x7EC2,0x7EC9,/* 0xA0-0xA7 */ + 0x7ECB,0x7ECC,0x7ED0,0x7ED4,0x7ED7,0x7EDB,0x7EE0,0x7EE1,/* 0xA8-0xAF */ + 0x7EE8,0x7EEB,0x7EEE,0x7EEF,0x7EF1,0x7EF2,0x7F0D,0x7EF6,/* 0xB0-0xB7 */ + 0x7EFA,0x7EFB,0x7EFE,0x7F01,0x7F02,0x7F03,0x7F07,0x7F08,/* 0xB8-0xBF */ + 0x7F0B,0x7F0C,0x7F0F,0x7F11,0x7F12,0x7F17,0x7F19,0x7F1C,/* 0xC0-0xC7 */ + 0x7F1B,0x7F1F,0x7F21,0x7F22,0x7F23,0x7F24,0x7F25,0x7F26,/* 0xC8-0xCF */ + 0x7F27,0x7F2A,0x7F2B,0x7F2C,0x7F2D,0x7F2F,0x7F30,0x7F31,/* 0xD0-0xD7 */ + 0x7F32,0x7F33,0x7F35,0x5E7A,0x757F,0x5DDB,0x753E,0x9095,/* 0xD8-0xDF */ + 0x738E,0x7391,0x73AE,0x73A2,0x739F,0x73CF,0x73C2,0x73D1,/* 0xE0-0xE7 */ + 0x73B7,0x73B3,0x73C0,0x73C9,0x73C8,0x73E5,0x73D9,0x987C,/* 0xE8-0xEF */ + 0x740A,0x73E9,0x73E7,0x73DE,0x73BA,0x73F2,0x740F,0x742A,/* 0xF0-0xF7 */ + 0x745B,0x7426,0x7425,0x7428,0x7430,0x742E,0x742C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x942F,0x9430,0x9431,0x9432,0x9433,0x9434,0x9435,0x9436,/* 0x40-0x47 */ + 0x9437,0x9438,0x9439,0x943A,0x943B,0x943C,0x943D,0x943F,/* 0x48-0x4F */ + 0x9440,0x9441,0x9442,0x9443,0x9444,0x9445,0x9446,0x9447,/* 0x50-0x57 */ + 0x9448,0x9449,0x944A,0x944B,0x944C,0x944D,0x944E,0x944F,/* 0x58-0x5F */ + 0x9450,0x9451,0x9452,0x9453,0x9454,0x9455,0x9456,0x9457,/* 0x60-0x67 */ + 0x9458,0x9459,0x945A,0x945B,0x945C,0x945D,0x945E,0x945F,/* 0x68-0x6F */ + 0x9460,0x9461,0x9462,0x9463,0x9464,0x9465,0x9466,0x9467,/* 0x70-0x77 */ + 0x9468,0x9469,0x946A,0x946C,0x946D,0x946E,0x946F,0x0000,/* 0x78-0x7F */ + + 0x9470,0x9471,0x9472,0x9473,0x9474,0x9475,0x9476,0x9477,/* 0x80-0x87 */ + 0x9478,0x9479,0x947A,0x947B,0x947C,0x947D,0x947E,0x947F,/* 0x88-0x8F */ + 0x9480,0x9481,0x9482,0x9483,0x9484,0x9491,0x9496,0x9498,/* 0x90-0x97 */ + 0x94C7,0x94CF,0x94D3,0x94D4,0x94DA,0x94E6,0x94FB,0x951C,/* 0x98-0x9F */ + 0x9520,0x741B,0x741A,0x7441,0x745C,0x7457,0x7455,0x7459,/* 0xA0-0xA7 */ + 0x7477,0x746D,0x747E,0x749C,0x748E,0x7480,0x7481,0x7487,/* 0xA8-0xAF */ + 0x748B,0x749E,0x74A8,0x74A9,0x7490,0x74A7,0x74D2,0x74BA,/* 0xB0-0xB7 */ + 0x97EA,0x97EB,0x97EC,0x674C,0x6753,0x675E,0x6748,0x6769,/* 0xB8-0xBF */ + 0x67A5,0x6787,0x676A,0x6773,0x6798,0x67A7,0x6775,0x67A8,/* 0xC0-0xC7 */ + 0x679E,0x67AD,0x678B,0x6777,0x677C,0x67F0,0x6809,0x67D8,/* 0xC8-0xCF */ + 0x680A,0x67E9,0x67B0,0x680C,0x67D9,0x67B5,0x67DA,0x67B3,/* 0xD0-0xD7 */ + 0x67DD,0x6800,0x67C3,0x67B8,0x67E2,0x680E,0x67C1,0x67FD,/* 0xD8-0xDF */ + 0x6832,0x6833,0x6860,0x6861,0x684E,0x6862,0x6844,0x6864,/* 0xE0-0xE7 */ + 0x6883,0x681D,0x6855,0x6866,0x6841,0x6867,0x6840,0x683E,/* 0xE8-0xEF */ + 0x684A,0x6849,0x6829,0x68B5,0x688F,0x6874,0x6877,0x6893,/* 0xF0-0xF7 */ + 0x686B,0x68C2,0x696E,0x68FC,0x691F,0x6920,0x68F9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9527,0x9533,0x953D,0x9543,0x9548,0x954B,0x9555,0x955A,/* 0x40-0x47 */ + 0x9560,0x956E,0x9574,0x9575,0x9577,0x9578,0x9579,0x957A,/* 0x48-0x4F */ + 0x957B,0x957C,0x957D,0x957E,0x9580,0x9581,0x9582,0x9583,/* 0x50-0x57 */ + 0x9584,0x9585,0x9586,0x9587,0x9588,0x9589,0x958A,0x958B,/* 0x58-0x5F */ + 0x958C,0x958D,0x958E,0x958F,0x9590,0x9591,0x9592,0x9593,/* 0x60-0x67 */ + 0x9594,0x9595,0x9596,0x9597,0x9598,0x9599,0x959A,0x959B,/* 0x68-0x6F */ + 0x959C,0x959D,0x959E,0x959F,0x95A0,0x95A1,0x95A2,0x95A3,/* 0x70-0x77 */ + 0x95A4,0x95A5,0x95A6,0x95A7,0x95A8,0x95A9,0x95AA,0x0000,/* 0x78-0x7F */ + + 0x95AB,0x95AC,0x95AD,0x95AE,0x95AF,0x95B0,0x95B1,0x95B2,/* 0x80-0x87 */ + 0x95B3,0x95B4,0x95B5,0x95B6,0x95B7,0x95B8,0x95B9,0x95BA,/* 0x88-0x8F */ + 0x95BB,0x95BC,0x95BD,0x95BE,0x95BF,0x95C0,0x95C1,0x95C2,/* 0x90-0x97 */ + 0x95C3,0x95C4,0x95C5,0x95C6,0x95C7,0x95C8,0x95C9,0x95CA,/* 0x98-0x9F */ + 0x95CB,0x6924,0x68F0,0x690B,0x6901,0x6957,0x68E3,0x6910,/* 0xA0-0xA7 */ + 0x6971,0x6939,0x6960,0x6942,0x695D,0x6984,0x696B,0x6980,/* 0xA8-0xAF */ + 0x6998,0x6978,0x6934,0x69CC,0x6987,0x6988,0x69CE,0x6989,/* 0xB0-0xB7 */ + 0x6966,0x6963,0x6979,0x699B,0x69A7,0x69BB,0x69AB,0x69AD,/* 0xB8-0xBF */ + 0x69D4,0x69B1,0x69C1,0x69CA,0x69DF,0x6995,0x69E0,0x698D,/* 0xC0-0xC7 */ + 0x69FF,0x6A2F,0x69ED,0x6A17,0x6A18,0x6A65,0x69F2,0x6A44,/* 0xC8-0xCF */ + 0x6A3E,0x6AA0,0x6A50,0x6A5B,0x6A35,0x6A8E,0x6A79,0x6A3D,/* 0xD0-0xD7 */ + 0x6A28,0x6A58,0x6A7C,0x6A91,0x6A90,0x6AA9,0x6A97,0x6AAB,/* 0xD8-0xDF */ + 0x7337,0x7352,0x6B81,0x6B82,0x6B87,0x6B84,0x6B92,0x6B93,/* 0xE0-0xE7 */ + 0x6B8D,0x6B9A,0x6B9B,0x6BA1,0x6BAA,0x8F6B,0x8F6D,0x8F71,/* 0xE8-0xEF */ + 0x8F72,0x8F73,0x8F75,0x8F76,0x8F78,0x8F77,0x8F79,0x8F7A,/* 0xF0-0xF7 */ + 0x8F7C,0x8F7E,0x8F81,0x8F82,0x8F84,0x8F87,0x8F8B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x95CC,0x95CD,0x95CE,0x95CF,0x95D0,0x95D1,0x95D2,0x95D3,/* 0x40-0x47 */ + 0x95D4,0x95D5,0x95D6,0x95D7,0x95D8,0x95D9,0x95DA,0x95DB,/* 0x48-0x4F */ + 0x95DC,0x95DD,0x95DE,0x95DF,0x95E0,0x95E1,0x95E2,0x95E3,/* 0x50-0x57 */ + 0x95E4,0x95E5,0x95E6,0x95E7,0x95EC,0x95FF,0x9607,0x9613,/* 0x58-0x5F */ + 0x9618,0x961B,0x961E,0x9620,0x9623,0x9624,0x9625,0x9626,/* 0x60-0x67 */ + 0x9627,0x9628,0x9629,0x962B,0x962C,0x962D,0x962F,0x9630,/* 0x68-0x6F */ + 0x9637,0x9638,0x9639,0x963A,0x963E,0x9641,0x9643,0x964A,/* 0x70-0x77 */ + 0x964E,0x964F,0x9651,0x9652,0x9653,0x9656,0x9657,0x0000,/* 0x78-0x7F */ + + 0x9658,0x9659,0x965A,0x965C,0x965D,0x965E,0x9660,0x9663,/* 0x80-0x87 */ + 0x9665,0x9666,0x966B,0x966D,0x966E,0x966F,0x9670,0x9671,/* 0x88-0x8F */ + 0x9673,0x9678,0x9679,0x967A,0x967B,0x967C,0x967D,0x967E,/* 0x90-0x97 */ + 0x967F,0x9680,0x9681,0x9682,0x9683,0x9684,0x9687,0x9689,/* 0x98-0x9F */ + 0x968A,0x8F8D,0x8F8E,0x8F8F,0x8F98,0x8F9A,0x8ECE,0x620B,/* 0xA0-0xA7 */ + 0x6217,0x621B,0x621F,0x6222,0x6221,0x6225,0x6224,0x622C,/* 0xA8-0xAF */ + 0x81E7,0x74EF,0x74F4,0x74FF,0x750F,0x7511,0x7513,0x6534,/* 0xB0-0xB7 */ + 0x65EE,0x65EF,0x65F0,0x660A,0x6619,0x6772,0x6603,0x6615,/* 0xB8-0xBF */ + 0x6600,0x7085,0x66F7,0x661D,0x6634,0x6631,0x6636,0x6635,/* 0xC0-0xC7 */ + 0x8006,0x665F,0x6654,0x6641,0x664F,0x6656,0x6661,0x6657,/* 0xC8-0xCF */ + 0x6677,0x6684,0x668C,0x66A7,0x669D,0x66BE,0x66DB,0x66DC,/* 0xD0-0xD7 */ + 0x66E6,0x66E9,0x8D32,0x8D33,0x8D36,0x8D3B,0x8D3D,0x8D40,/* 0xD8-0xDF */ + 0x8D45,0x8D46,0x8D48,0x8D49,0x8D47,0x8D4D,0x8D55,0x8D59,/* 0xE0-0xE7 */ + 0x89C7,0x89CA,0x89CB,0x89CC,0x89CE,0x89CF,0x89D0,0x89D1,/* 0xE8-0xEF */ + 0x726E,0x729F,0x725D,0x7266,0x726F,0x727E,0x727F,0x7284,/* 0xF0-0xF7 */ + 0x728B,0x728D,0x728F,0x7292,0x6308,0x6332,0x63B0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x968C,0x968E,0x9691,0x9692,0x9693,0x9695,0x9696,0x969A,/* 0x40-0x47 */ + 0x969B,0x969D,0x969E,0x969F,0x96A0,0x96A1,0x96A2,0x96A3,/* 0x48-0x4F */ + 0x96A4,0x96A5,0x96A6,0x96A8,0x96A9,0x96AA,0x96AB,0x96AC,/* 0x50-0x57 */ + 0x96AD,0x96AE,0x96AF,0x96B1,0x96B2,0x96B4,0x96B5,0x96B7,/* 0x58-0x5F */ + 0x96B8,0x96BA,0x96BB,0x96BF,0x96C2,0x96C3,0x96C8,0x96CA,/* 0x60-0x67 */ + 0x96CB,0x96D0,0x96D1,0x96D3,0x96D4,0x96D6,0x96D7,0x96D8,/* 0x68-0x6F */ + 0x96D9,0x96DA,0x96DB,0x96DC,0x96DD,0x96DE,0x96DF,0x96E1,/* 0x70-0x77 */ + 0x96E2,0x96E3,0x96E4,0x96E5,0x96E6,0x96E7,0x96EB,0x0000,/* 0x78-0x7F */ + + 0x96EC,0x96ED,0x96EE,0x96F0,0x96F1,0x96F2,0x96F4,0x96F5,/* 0x80-0x87 */ + 0x96F8,0x96FA,0x96FB,0x96FC,0x96FD,0x96FF,0x9702,0x9703,/* 0x88-0x8F */ + 0x9705,0x970A,0x970B,0x970C,0x9710,0x9711,0x9712,0x9714,/* 0x90-0x97 */ + 0x9715,0x9717,0x9718,0x9719,0x971A,0x971B,0x971D,0x971F,/* 0x98-0x9F */ + 0x9720,0x643F,0x64D8,0x8004,0x6BEA,0x6BF3,0x6BFD,0x6BF5,/* 0xA0-0xA7 */ + 0x6BF9,0x6C05,0x6C07,0x6C06,0x6C0D,0x6C15,0x6C18,0x6C19,/* 0xA8-0xAF */ + 0x6C1A,0x6C21,0x6C29,0x6C24,0x6C2A,0x6C32,0x6535,0x6555,/* 0xB0-0xB7 */ + 0x656B,0x724D,0x7252,0x7256,0x7230,0x8662,0x5216,0x809F,/* 0xB8-0xBF */ + 0x809C,0x8093,0x80BC,0x670A,0x80BD,0x80B1,0x80AB,0x80AD,/* 0xC0-0xC7 */ + 0x80B4,0x80B7,0x80E7,0x80E8,0x80E9,0x80EA,0x80DB,0x80C2,/* 0xC8-0xCF */ + 0x80C4,0x80D9,0x80CD,0x80D7,0x6710,0x80DD,0x80EB,0x80F1,/* 0xD0-0xD7 */ + 0x80F4,0x80ED,0x810D,0x810E,0x80F2,0x80FC,0x6715,0x8112,/* 0xD8-0xDF */ + 0x8C5A,0x8136,0x811E,0x812C,0x8118,0x8132,0x8148,0x814C,/* 0xE0-0xE7 */ + 0x8153,0x8174,0x8159,0x815A,0x8171,0x8160,0x8169,0x817C,/* 0xE8-0xEF */ + 0x817D,0x816D,0x8167,0x584D,0x5AB5,0x8188,0x8182,0x8191,/* 0xF0-0xF7 */ + 0x6ED5,0x81A3,0x81AA,0x81CC,0x6726,0x81CA,0x81BB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9721,0x9722,0x9723,0x9724,0x9725,0x9726,0x9727,0x9728,/* 0x40-0x47 */ + 0x9729,0x972B,0x972C,0x972E,0x972F,0x9731,0x9733,0x9734,/* 0x48-0x4F */ + 0x9735,0x9736,0x9737,0x973A,0x973B,0x973C,0x973D,0x973F,/* 0x50-0x57 */ + 0x9740,0x9741,0x9742,0x9743,0x9744,0x9745,0x9746,0x9747,/* 0x58-0x5F */ + 0x9748,0x9749,0x974A,0x974B,0x974C,0x974D,0x974E,0x974F,/* 0x60-0x67 */ + 0x9750,0x9751,0x9754,0x9755,0x9757,0x9758,0x975A,0x975C,/* 0x68-0x6F */ + 0x975D,0x975F,0x9763,0x9764,0x9766,0x9767,0x9768,0x976A,/* 0x70-0x77 */ + 0x976B,0x976C,0x976D,0x976E,0x976F,0x9770,0x9771,0x0000,/* 0x78-0x7F */ + + 0x9772,0x9775,0x9777,0x9778,0x9779,0x977A,0x977B,0x977D,/* 0x80-0x87 */ + 0x977E,0x977F,0x9780,0x9781,0x9782,0x9783,0x9784,0x9786,/* 0x88-0x8F */ + 0x9787,0x9788,0x9789,0x978A,0x978C,0x978E,0x978F,0x9790,/* 0x90-0x97 */ + 0x9793,0x9795,0x9796,0x9797,0x9799,0x979A,0x979B,0x979C,/* 0x98-0x9F */ + 0x979D,0x81C1,0x81A6,0x6B24,0x6B37,0x6B39,0x6B43,0x6B46,/* 0xA0-0xA7 */ + 0x6B59,0x98D1,0x98D2,0x98D3,0x98D5,0x98D9,0x98DA,0x6BB3,/* 0xA8-0xAF */ + 0x5F40,0x6BC2,0x89F3,0x6590,0x9F51,0x6593,0x65BC,0x65C6,/* 0xB0-0xB7 */ + 0x65C4,0x65C3,0x65CC,0x65CE,0x65D2,0x65D6,0x7080,0x709C,/* 0xB8-0xBF */ + 0x7096,0x709D,0x70BB,0x70C0,0x70B7,0x70AB,0x70B1,0x70E8,/* 0xC0-0xC7 */ + 0x70CA,0x7110,0x7113,0x7116,0x712F,0x7131,0x7173,0x715C,/* 0xC8-0xCF */ + 0x7168,0x7145,0x7172,0x714A,0x7178,0x717A,0x7198,0x71B3,/* 0xD0-0xD7 */ + 0x71B5,0x71A8,0x71A0,0x71E0,0x71D4,0x71E7,0x71F9,0x721D,/* 0xD8-0xDF */ + 0x7228,0x706C,0x7118,0x7166,0x71B9,0x623E,0x623D,0x6243,/* 0xE0-0xE7 */ + 0x6248,0x6249,0x793B,0x7940,0x7946,0x7949,0x795B,0x795C,/* 0xE8-0xEF */ + 0x7953,0x795A,0x7962,0x7957,0x7960,0x796F,0x7967,0x797A,/* 0xF0-0xF7 */ + 0x7985,0x798A,0x799A,0x79A7,0x79B3,0x5FD1,0x5FD0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x979E,0x979F,0x97A1,0x97A2,0x97A4,0x97A5,0x97A6,0x97A7,/* 0x40-0x47 */ + 0x97A8,0x97A9,0x97AA,0x97AC,0x97AE,0x97B0,0x97B1,0x97B3,/* 0x48-0x4F */ + 0x97B5,0x97B6,0x97B7,0x97B8,0x97B9,0x97BA,0x97BB,0x97BC,/* 0x50-0x57 */ + 0x97BD,0x97BE,0x97BF,0x97C0,0x97C1,0x97C2,0x97C3,0x97C4,/* 0x58-0x5F */ + 0x97C5,0x97C6,0x97C7,0x97C8,0x97C9,0x97CA,0x97CB,0x97CC,/* 0x60-0x67 */ + 0x97CD,0x97CE,0x97CF,0x97D0,0x97D1,0x97D2,0x97D3,0x97D4,/* 0x68-0x6F */ + 0x97D5,0x97D6,0x97D7,0x97D8,0x97D9,0x97DA,0x97DB,0x97DC,/* 0x70-0x77 */ + 0x97DD,0x97DE,0x97DF,0x97E0,0x97E1,0x97E2,0x97E3,0x0000,/* 0x78-0x7F */ + + 0x97E4,0x97E5,0x97E8,0x97EE,0x97EF,0x97F0,0x97F1,0x97F2,/* 0x80-0x87 */ + 0x97F4,0x97F7,0x97F8,0x97F9,0x97FA,0x97FB,0x97FC,0x97FD,/* 0x88-0x8F */ + 0x97FE,0x97FF,0x9800,0x9801,0x9802,0x9803,0x9804,0x9805,/* 0x90-0x97 */ + 0x9806,0x9807,0x9808,0x9809,0x980A,0x980B,0x980C,0x980D,/* 0x98-0x9F */ + 0x980E,0x603C,0x605D,0x605A,0x6067,0x6041,0x6059,0x6063,/* 0xA0-0xA7 */ + 0x60AB,0x6106,0x610D,0x615D,0x61A9,0x619D,0x61CB,0x61D1,/* 0xA8-0xAF */ + 0x6206,0x8080,0x807F,0x6C93,0x6CF6,0x6DFC,0x77F6,0x77F8,/* 0xB0-0xB7 */ + 0x7800,0x7809,0x7817,0x7818,0x7811,0x65AB,0x782D,0x781C,/* 0xB8-0xBF */ + 0x781D,0x7839,0x783A,0x783B,0x781F,0x783C,0x7825,0x782C,/* 0xC0-0xC7 */ + 0x7823,0x7829,0x784E,0x786D,0x7856,0x7857,0x7826,0x7850,/* 0xC8-0xCF */ + 0x7847,0x784C,0x786A,0x789B,0x7893,0x789A,0x7887,0x789C,/* 0xD0-0xD7 */ + 0x78A1,0x78A3,0x78B2,0x78B9,0x78A5,0x78D4,0x78D9,0x78C9,/* 0xD8-0xDF */ + 0x78EC,0x78F2,0x7905,0x78F4,0x7913,0x7924,0x791E,0x7934,/* 0xE0-0xE7 */ + 0x9F9B,0x9EF9,0x9EFB,0x9EFC,0x76F1,0x7704,0x770D,0x76F9,/* 0xE8-0xEF */ + 0x7707,0x7708,0x771A,0x7722,0x7719,0x772D,0x7726,0x7735,/* 0xF0-0xF7 */ + 0x7738,0x7750,0x7751,0x7747,0x7743,0x775A,0x7768,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x980F,0x9810,0x9811,0x9812,0x9813,0x9814,0x9815,0x9816,/* 0x40-0x47 */ + 0x9817,0x9818,0x9819,0x981A,0x981B,0x981C,0x981D,0x981E,/* 0x48-0x4F */ + 0x981F,0x9820,0x9821,0x9822,0x9823,0x9824,0x9825,0x9826,/* 0x50-0x57 */ + 0x9827,0x9828,0x9829,0x982A,0x982B,0x982C,0x982D,0x982E,/* 0x58-0x5F */ + 0x982F,0x9830,0x9831,0x9832,0x9833,0x9834,0x9835,0x9836,/* 0x60-0x67 */ + 0x9837,0x9838,0x9839,0x983A,0x983B,0x983C,0x983D,0x983E,/* 0x68-0x6F */ + 0x983F,0x9840,0x9841,0x9842,0x9843,0x9844,0x9845,0x9846,/* 0x70-0x77 */ + 0x9847,0x9848,0x9849,0x984A,0x984B,0x984C,0x984D,0x0000,/* 0x78-0x7F */ + + 0x984E,0x984F,0x9850,0x9851,0x9852,0x9853,0x9854,0x9855,/* 0x80-0x87 */ + 0x9856,0x9857,0x9858,0x9859,0x985A,0x985B,0x985C,0x985D,/* 0x88-0x8F */ + 0x985E,0x985F,0x9860,0x9861,0x9862,0x9863,0x9864,0x9865,/* 0x90-0x97 */ + 0x9866,0x9867,0x9868,0x9869,0x986A,0x986B,0x986C,0x986D,/* 0x98-0x9F */ + 0x986E,0x7762,0x7765,0x777F,0x778D,0x777D,0x7780,0x778C,/* 0xA0-0xA7 */ + 0x7791,0x779F,0x77A0,0x77B0,0x77B5,0x77BD,0x753A,0x7540,/* 0xA8-0xAF */ + 0x754E,0x754B,0x7548,0x755B,0x7572,0x7579,0x7583,0x7F58,/* 0xB0-0xB7 */ + 0x7F61,0x7F5F,0x8A48,0x7F68,0x7F74,0x7F71,0x7F79,0x7F81,/* 0xB8-0xBF */ + 0x7F7E,0x76CD,0x76E5,0x8832,0x9485,0x9486,0x9487,0x948B,/* 0xC0-0xC7 */ + 0x948A,0x948C,0x948D,0x948F,0x9490,0x9494,0x9497,0x9495,/* 0xC8-0xCF */ + 0x949A,0x949B,0x949C,0x94A3,0x94A4,0x94AB,0x94AA,0x94AD,/* 0xD0-0xD7 */ + 0x94AC,0x94AF,0x94B0,0x94B2,0x94B4,0x94B6,0x94B7,0x94B8,/* 0xD8-0xDF */ + 0x94B9,0x94BA,0x94BC,0x94BD,0x94BF,0x94C4,0x94C8,0x94C9,/* 0xE0-0xE7 */ + 0x94CA,0x94CB,0x94CC,0x94CD,0x94CE,0x94D0,0x94D1,0x94D2,/* 0xE8-0xEF */ + 0x94D5,0x94D6,0x94D7,0x94D9,0x94D8,0x94DB,0x94DE,0x94DF,/* 0xF0-0xF7 */ + 0x94E0,0x94E2,0x94E4,0x94E5,0x94E7,0x94E8,0x94EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x986F,0x9870,0x9871,0x9872,0x9873,0x9874,0x988B,0x988E,/* 0x40-0x47 */ + 0x9892,0x9895,0x9899,0x98A3,0x98A8,0x98A9,0x98AA,0x98AB,/* 0x48-0x4F */ + 0x98AC,0x98AD,0x98AE,0x98AF,0x98B0,0x98B1,0x98B2,0x98B3,/* 0x50-0x57 */ + 0x98B4,0x98B5,0x98B6,0x98B7,0x98B8,0x98B9,0x98BA,0x98BB,/* 0x58-0x5F */ + 0x98BC,0x98BD,0x98BE,0x98BF,0x98C0,0x98C1,0x98C2,0x98C3,/* 0x60-0x67 */ + 0x98C4,0x98C5,0x98C6,0x98C7,0x98C8,0x98C9,0x98CA,0x98CB,/* 0x68-0x6F */ + 0x98CC,0x98CD,0x98CF,0x98D0,0x98D4,0x98D6,0x98D7,0x98DB,/* 0x70-0x77 */ + 0x98DC,0x98DD,0x98E0,0x98E1,0x98E2,0x98E3,0x98E4,0x0000,/* 0x78-0x7F */ + + 0x98E5,0x98E6,0x98E9,0x98EA,0x98EB,0x98EC,0x98ED,0x98EE,/* 0x80-0x87 */ + 0x98EF,0x98F0,0x98F1,0x98F2,0x98F3,0x98F4,0x98F5,0x98F6,/* 0x88-0x8F */ + 0x98F7,0x98F8,0x98F9,0x98FA,0x98FB,0x98FC,0x98FD,0x98FE,/* 0x90-0x97 */ + 0x98FF,0x9900,0x9901,0x9902,0x9903,0x9904,0x9905,0x9906,/* 0x98-0x9F */ + 0x9907,0x94E9,0x94EB,0x94EE,0x94EF,0x94F3,0x94F4,0x94F5,/* 0xA0-0xA7 */ + 0x94F7,0x94F9,0x94FC,0x94FD,0x94FF,0x9503,0x9502,0x9506,/* 0xA8-0xAF */ + 0x9507,0x9509,0x950A,0x950D,0x950E,0x950F,0x9512,0x9513,/* 0xB0-0xB7 */ + 0x9514,0x9515,0x9516,0x9518,0x951B,0x951D,0x951E,0x951F,/* 0xB8-0xBF */ + 0x9522,0x952A,0x952B,0x9529,0x952C,0x9531,0x9532,0x9534,/* 0xC0-0xC7 */ + 0x9536,0x9537,0x9538,0x953C,0x953E,0x953F,0x9542,0x9535,/* 0xC8-0xCF */ + 0x9544,0x9545,0x9546,0x9549,0x954C,0x954E,0x954F,0x9552,/* 0xD0-0xD7 */ + 0x9553,0x9554,0x9556,0x9557,0x9558,0x9559,0x955B,0x955E,/* 0xD8-0xDF */ + 0x955F,0x955D,0x9561,0x9562,0x9564,0x9565,0x9566,0x9567,/* 0xE0-0xE7 */ + 0x9568,0x9569,0x956A,0x956B,0x956C,0x956F,0x9571,0x9572,/* 0xE8-0xEF */ + 0x9573,0x953A,0x77E7,0x77EC,0x96C9,0x79D5,0x79ED,0x79E3,/* 0xF0-0xF7 */ + 0x79EB,0x7A06,0x5D47,0x7A03,0x7A02,0x7A1E,0x7A14,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9908,0x9909,0x990A,0x990B,0x990C,0x990E,0x990F,0x9911,/* 0x40-0x47 */ + 0x9912,0x9913,0x9914,0x9915,0x9916,0x9917,0x9918,0x9919,/* 0x48-0x4F */ + 0x991A,0x991B,0x991C,0x991D,0x991E,0x991F,0x9920,0x9921,/* 0x50-0x57 */ + 0x9922,0x9923,0x9924,0x9925,0x9926,0x9927,0x9928,0x9929,/* 0x58-0x5F */ + 0x992A,0x992B,0x992C,0x992D,0x992F,0x9930,0x9931,0x9932,/* 0x60-0x67 */ + 0x9933,0x9934,0x9935,0x9936,0x9937,0x9938,0x9939,0x993A,/* 0x68-0x6F */ + 0x993B,0x993C,0x993D,0x993E,0x993F,0x9940,0x9941,0x9942,/* 0x70-0x77 */ + 0x9943,0x9944,0x9945,0x9946,0x9947,0x9948,0x9949,0x0000,/* 0x78-0x7F */ + + 0x994A,0x994B,0x994C,0x994D,0x994E,0x994F,0x9950,0x9951,/* 0x80-0x87 */ + 0x9952,0x9953,0x9956,0x9957,0x9958,0x9959,0x995A,0x995B,/* 0x88-0x8F */ + 0x995C,0x995D,0x995E,0x995F,0x9960,0x9961,0x9962,0x9964,/* 0x90-0x97 */ + 0x9966,0x9973,0x9978,0x9979,0x997B,0x997E,0x9982,0x9983,/* 0x98-0x9F */ + 0x9989,0x7A39,0x7A37,0x7A51,0x9ECF,0x99A5,0x7A70,0x7688,/* 0xA0-0xA7 */ + 0x768E,0x7693,0x7699,0x76A4,0x74DE,0x74E0,0x752C,0x9E20,/* 0xA8-0xAF */ + 0x9E22,0x9E28,0x9E29,0x9E2A,0x9E2B,0x9E2C,0x9E32,0x9E31,/* 0xB0-0xB7 */ + 0x9E36,0x9E38,0x9E37,0x9E39,0x9E3A,0x9E3E,0x9E41,0x9E42,/* 0xB8-0xBF */ + 0x9E44,0x9E46,0x9E47,0x9E48,0x9E49,0x9E4B,0x9E4C,0x9E4E,/* 0xC0-0xC7 */ + 0x9E51,0x9E55,0x9E57,0x9E5A,0x9E5B,0x9E5C,0x9E5E,0x9E63,/* 0xC8-0xCF */ + 0x9E66,0x9E67,0x9E68,0x9E69,0x9E6A,0x9E6B,0x9E6C,0x9E71,/* 0xD0-0xD7 */ + 0x9E6D,0x9E73,0x7592,0x7594,0x7596,0x75A0,0x759D,0x75AC,/* 0xD8-0xDF */ + 0x75A3,0x75B3,0x75B4,0x75B8,0x75C4,0x75B1,0x75B0,0x75C3,/* 0xE0-0xE7 */ + 0x75C2,0x75D6,0x75CD,0x75E3,0x75E8,0x75E6,0x75E4,0x75EB,/* 0xE8-0xEF */ + 0x75E7,0x7603,0x75F1,0x75FC,0x75FF,0x7610,0x7600,0x7605,/* 0xF0-0xF7 */ + 0x760C,0x7617,0x760A,0x7625,0x7618,0x7615,0x7619,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x998C,0x998E,0x999A,0x999B,0x999C,0x999D,0x999E,0x999F,/* 0x40-0x47 */ + 0x99A0,0x99A1,0x99A2,0x99A3,0x99A4,0x99A6,0x99A7,0x99A9,/* 0x48-0x4F */ + 0x99AA,0x99AB,0x99AC,0x99AD,0x99AE,0x99AF,0x99B0,0x99B1,/* 0x50-0x57 */ + 0x99B2,0x99B3,0x99B4,0x99B5,0x99B6,0x99B7,0x99B8,0x99B9,/* 0x58-0x5F */ + 0x99BA,0x99BB,0x99BC,0x99BD,0x99BE,0x99BF,0x99C0,0x99C1,/* 0x60-0x67 */ + 0x99C2,0x99C3,0x99C4,0x99C5,0x99C6,0x99C7,0x99C8,0x99C9,/* 0x68-0x6F */ + 0x99CA,0x99CB,0x99CC,0x99CD,0x99CE,0x99CF,0x99D0,0x99D1,/* 0x70-0x77 */ + 0x99D2,0x99D3,0x99D4,0x99D5,0x99D6,0x99D7,0x99D8,0x0000,/* 0x78-0x7F */ + + 0x99D9,0x99DA,0x99DB,0x99DC,0x99DD,0x99DE,0x99DF,0x99E0,/* 0x80-0x87 */ + 0x99E1,0x99E2,0x99E3,0x99E4,0x99E5,0x99E6,0x99E7,0x99E8,/* 0x88-0x8F */ + 0x99E9,0x99EA,0x99EB,0x99EC,0x99ED,0x99EE,0x99EF,0x99F0,/* 0x90-0x97 */ + 0x99F1,0x99F2,0x99F3,0x99F4,0x99F5,0x99F6,0x99F7,0x99F8,/* 0x98-0x9F */ + 0x99F9,0x761B,0x763C,0x7622,0x7620,0x7640,0x762D,0x7630,/* 0xA0-0xA7 */ + 0x763F,0x7635,0x7643,0x763E,0x7633,0x764D,0x765E,0x7654,/* 0xA8-0xAF */ + 0x765C,0x7656,0x766B,0x766F,0x7FCA,0x7AE6,0x7A78,0x7A79,/* 0xB0-0xB7 */ + 0x7A80,0x7A86,0x7A88,0x7A95,0x7AA6,0x7AA0,0x7AAC,0x7AA8,/* 0xB8-0xBF */ + 0x7AAD,0x7AB3,0x8864,0x8869,0x8872,0x887D,0x887F,0x8882,/* 0xC0-0xC7 */ + 0x88A2,0x88C6,0x88B7,0x88BC,0x88C9,0x88E2,0x88CE,0x88E3,/* 0xC8-0xCF */ + 0x88E5,0x88F1,0x891A,0x88FC,0x88E8,0x88FE,0x88F0,0x8921,/* 0xD0-0xD7 */ + 0x8919,0x8913,0x891B,0x890A,0x8934,0x892B,0x8936,0x8941,/* 0xD8-0xDF */ + 0x8966,0x897B,0x758B,0x80E5,0x76B2,0x76B4,0x77DC,0x8012,/* 0xE0-0xE7 */ + 0x8014,0x8016,0x801C,0x8020,0x8022,0x8025,0x8026,0x8027,/* 0xE8-0xEF */ + 0x8029,0x8028,0x8031,0x800B,0x8035,0x8043,0x8046,0x804D,/* 0xF0-0xF7 */ + 0x8052,0x8069,0x8071,0x8983,0x9878,0x9880,0x9883,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x99FA,0x99FB,0x99FC,0x99FD,0x99FE,0x99FF,0x9A00,0x9A01,/* 0x40-0x47 */ + 0x9A02,0x9A03,0x9A04,0x9A05,0x9A06,0x9A07,0x9A08,0x9A09,/* 0x48-0x4F */ + 0x9A0A,0x9A0B,0x9A0C,0x9A0D,0x9A0E,0x9A0F,0x9A10,0x9A11,/* 0x50-0x57 */ + 0x9A12,0x9A13,0x9A14,0x9A15,0x9A16,0x9A17,0x9A18,0x9A19,/* 0x58-0x5F */ + 0x9A1A,0x9A1B,0x9A1C,0x9A1D,0x9A1E,0x9A1F,0x9A20,0x9A21,/* 0x60-0x67 */ + 0x9A22,0x9A23,0x9A24,0x9A25,0x9A26,0x9A27,0x9A28,0x9A29,/* 0x68-0x6F */ + 0x9A2A,0x9A2B,0x9A2C,0x9A2D,0x9A2E,0x9A2F,0x9A30,0x9A31,/* 0x70-0x77 */ + 0x9A32,0x9A33,0x9A34,0x9A35,0x9A36,0x9A37,0x9A38,0x0000,/* 0x78-0x7F */ + + 0x9A39,0x9A3A,0x9A3B,0x9A3C,0x9A3D,0x9A3E,0x9A3F,0x9A40,/* 0x80-0x87 */ + 0x9A41,0x9A42,0x9A43,0x9A44,0x9A45,0x9A46,0x9A47,0x9A48,/* 0x88-0x8F */ + 0x9A49,0x9A4A,0x9A4B,0x9A4C,0x9A4D,0x9A4E,0x9A4F,0x9A50,/* 0x90-0x97 */ + 0x9A51,0x9A52,0x9A53,0x9A54,0x9A55,0x9A56,0x9A57,0x9A58,/* 0x98-0x9F */ + 0x9A59,0x9889,0x988C,0x988D,0x988F,0x9894,0x989A,0x989B,/* 0xA0-0xA7 */ + 0x989E,0x989F,0x98A1,0x98A2,0x98A5,0x98A6,0x864D,0x8654,/* 0xA8-0xAF */ + 0x866C,0x866E,0x867F,0x867A,0x867C,0x867B,0x86A8,0x868D,/* 0xB0-0xB7 */ + 0x868B,0x86AC,0x869D,0x86A7,0x86A3,0x86AA,0x8693,0x86A9,/* 0xB8-0xBF */ + 0x86B6,0x86C4,0x86B5,0x86CE,0x86B0,0x86BA,0x86B1,0x86AF,/* 0xC0-0xC7 */ + 0x86C9,0x86CF,0x86B4,0x86E9,0x86F1,0x86F2,0x86ED,0x86F3,/* 0xC8-0xCF */ + 0x86D0,0x8713,0x86DE,0x86F4,0x86DF,0x86D8,0x86D1,0x8703,/* 0xD0-0xD7 */ + 0x8707,0x86F8,0x8708,0x870A,0x870D,0x8709,0x8723,0x873B,/* 0xD8-0xDF */ + 0x871E,0x8725,0x872E,0x871A,0x873E,0x8748,0x8734,0x8731,/* 0xE0-0xE7 */ + 0x8729,0x8737,0x873F,0x8782,0x8722,0x877D,0x877E,0x877B,/* 0xE8-0xEF */ + 0x8760,0x8770,0x874C,0x876E,0x878B,0x8753,0x8763,0x877C,/* 0xF0-0xF7 */ + 0x8764,0x8759,0x8765,0x8793,0x87AF,0x87A8,0x87D2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9A5A,0x9A5B,0x9A5C,0x9A5D,0x9A5E,0x9A5F,0x9A60,0x9A61,/* 0x40-0x47 */ + 0x9A62,0x9A63,0x9A64,0x9A65,0x9A66,0x9A67,0x9A68,0x9A69,/* 0x48-0x4F */ + 0x9A6A,0x9A6B,0x9A72,0x9A83,0x9A89,0x9A8D,0x9A8E,0x9A94,/* 0x50-0x57 */ + 0x9A95,0x9A99,0x9AA6,0x9AA9,0x9AAA,0x9AAB,0x9AAC,0x9AAD,/* 0x58-0x5F */ + 0x9AAE,0x9AAF,0x9AB2,0x9AB3,0x9AB4,0x9AB5,0x9AB9,0x9ABB,/* 0x60-0x67 */ + 0x9ABD,0x9ABE,0x9ABF,0x9AC3,0x9AC4,0x9AC6,0x9AC7,0x9AC8,/* 0x68-0x6F */ + 0x9AC9,0x9ACA,0x9ACD,0x9ACE,0x9ACF,0x9AD0,0x9AD2,0x9AD4,/* 0x70-0x77 */ + 0x9AD5,0x9AD6,0x9AD7,0x9AD9,0x9ADA,0x9ADB,0x9ADC,0x0000,/* 0x78-0x7F */ + + 0x9ADD,0x9ADE,0x9AE0,0x9AE2,0x9AE3,0x9AE4,0x9AE5,0x9AE7,/* 0x80-0x87 */ + 0x9AE8,0x9AE9,0x9AEA,0x9AEC,0x9AEE,0x9AF0,0x9AF1,0x9AF2,/* 0x88-0x8F */ + 0x9AF3,0x9AF4,0x9AF5,0x9AF6,0x9AF7,0x9AF8,0x9AFA,0x9AFC,/* 0x90-0x97 */ + 0x9AFD,0x9AFE,0x9AFF,0x9B00,0x9B01,0x9B02,0x9B04,0x9B05,/* 0x98-0x9F */ + 0x9B06,0x87C6,0x8788,0x8785,0x87AD,0x8797,0x8783,0x87AB,/* 0xA0-0xA7 */ + 0x87E5,0x87AC,0x87B5,0x87B3,0x87CB,0x87D3,0x87BD,0x87D1,/* 0xA8-0xAF */ + 0x87C0,0x87CA,0x87DB,0x87EA,0x87E0,0x87EE,0x8816,0x8813,/* 0xB0-0xB7 */ + 0x87FE,0x880A,0x881B,0x8821,0x8839,0x883C,0x7F36,0x7F42,/* 0xB8-0xBF */ + 0x7F44,0x7F45,0x8210,0x7AFA,0x7AFD,0x7B08,0x7B03,0x7B04,/* 0xC0-0xC7 */ + 0x7B15,0x7B0A,0x7B2B,0x7B0F,0x7B47,0x7B38,0x7B2A,0x7B19,/* 0xC8-0xCF */ + 0x7B2E,0x7B31,0x7B20,0x7B25,0x7B24,0x7B33,0x7B3E,0x7B1E,/* 0xD0-0xD7 */ + 0x7B58,0x7B5A,0x7B45,0x7B75,0x7B4C,0x7B5D,0x7B60,0x7B6E,/* 0xD8-0xDF */ + 0x7B7B,0x7B62,0x7B72,0x7B71,0x7B90,0x7BA6,0x7BA7,0x7BB8,/* 0xE0-0xE7 */ + 0x7BAC,0x7B9D,0x7BA8,0x7B85,0x7BAA,0x7B9C,0x7BA2,0x7BAB,/* 0xE8-0xEF */ + 0x7BB4,0x7BD1,0x7BC1,0x7BCC,0x7BDD,0x7BDA,0x7BE5,0x7BE6,/* 0xF0-0xF7 */ + 0x7BEA,0x7C0C,0x7BFE,0x7BFC,0x7C0F,0x7C16,0x7C0B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9B07,0x9B09,0x9B0A,0x9B0B,0x9B0C,0x9B0D,0x9B0E,0x9B10,/* 0x40-0x47 */ + 0x9B11,0x9B12,0x9B14,0x9B15,0x9B16,0x9B17,0x9B18,0x9B19,/* 0x48-0x4F */ + 0x9B1A,0x9B1B,0x9B1C,0x9B1D,0x9B1E,0x9B20,0x9B21,0x9B22,/* 0x50-0x57 */ + 0x9B24,0x9B25,0x9B26,0x9B27,0x9B28,0x9B29,0x9B2A,0x9B2B,/* 0x58-0x5F */ + 0x9B2C,0x9B2D,0x9B2E,0x9B30,0x9B31,0x9B33,0x9B34,0x9B35,/* 0x60-0x67 */ + 0x9B36,0x9B37,0x9B38,0x9B39,0x9B3A,0x9B3D,0x9B3E,0x9B3F,/* 0x68-0x6F */ + 0x9B40,0x9B46,0x9B4A,0x9B4B,0x9B4C,0x9B4E,0x9B50,0x9B52,/* 0x70-0x77 */ + 0x9B53,0x9B55,0x9B56,0x9B57,0x9B58,0x9B59,0x9B5A,0x0000,/* 0x78-0x7F */ + + 0x9B5B,0x9B5C,0x9B5D,0x9B5E,0x9B5F,0x9B60,0x9B61,0x9B62,/* 0x80-0x87 */ + 0x9B63,0x9B64,0x9B65,0x9B66,0x9B67,0x9B68,0x9B69,0x9B6A,/* 0x88-0x8F */ + 0x9B6B,0x9B6C,0x9B6D,0x9B6E,0x9B6F,0x9B70,0x9B71,0x9B72,/* 0x90-0x97 */ + 0x9B73,0x9B74,0x9B75,0x9B76,0x9B77,0x9B78,0x9B79,0x9B7A,/* 0x98-0x9F */ + 0x9B7B,0x7C1F,0x7C2A,0x7C26,0x7C38,0x7C41,0x7C40,0x81FE,/* 0xA0-0xA7 */ + 0x8201,0x8202,0x8204,0x81EC,0x8844,0x8221,0x8222,0x8223,/* 0xA8-0xAF */ + 0x822D,0x822F,0x8228,0x822B,0x8238,0x823B,0x8233,0x8234,/* 0xB0-0xB7 */ + 0x823E,0x8244,0x8249,0x824B,0x824F,0x825A,0x825F,0x8268,/* 0xB8-0xBF */ + 0x887E,0x8885,0x8888,0x88D8,0x88DF,0x895E,0x7F9D,0x7F9F,/* 0xC0-0xC7 */ + 0x7FA7,0x7FAF,0x7FB0,0x7FB2,0x7C7C,0x6549,0x7C91,0x7C9D,/* 0xC8-0xCF */ + 0x7C9C,0x7C9E,0x7CA2,0x7CB2,0x7CBC,0x7CBD,0x7CC1,0x7CC7,/* 0xD0-0xD7 */ + 0x7CCC,0x7CCD,0x7CC8,0x7CC5,0x7CD7,0x7CE8,0x826E,0x66A8,/* 0xD8-0xDF */ + 0x7FBF,0x7FCE,0x7FD5,0x7FE5,0x7FE1,0x7FE6,0x7FE9,0x7FEE,/* 0xE0-0xE7 */ + 0x7FF3,0x7CF8,0x7D77,0x7DA6,0x7DAE,0x7E47,0x7E9B,0x9EB8,/* 0xE8-0xEF */ + 0x9EB4,0x8D73,0x8D84,0x8D94,0x8D91,0x8DB1,0x8D67,0x8D6D,/* 0xF0-0xF7 */ + 0x8C47,0x8C49,0x914A,0x9150,0x914E,0x914F,0x9164,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9B7C,0x9B7D,0x9B7E,0x9B7F,0x9B80,0x9B81,0x9B82,0x9B83,/* 0x40-0x47 */ + 0x9B84,0x9B85,0x9B86,0x9B87,0x9B88,0x9B89,0x9B8A,0x9B8B,/* 0x48-0x4F */ + 0x9B8C,0x9B8D,0x9B8E,0x9B8F,0x9B90,0x9B91,0x9B92,0x9B93,/* 0x50-0x57 */ + 0x9B94,0x9B95,0x9B96,0x9B97,0x9B98,0x9B99,0x9B9A,0x9B9B,/* 0x58-0x5F */ + 0x9B9C,0x9B9D,0x9B9E,0x9B9F,0x9BA0,0x9BA1,0x9BA2,0x9BA3,/* 0x60-0x67 */ + 0x9BA4,0x9BA5,0x9BA6,0x9BA7,0x9BA8,0x9BA9,0x9BAA,0x9BAB,/* 0x68-0x6F */ + 0x9BAC,0x9BAD,0x9BAE,0x9BAF,0x9BB0,0x9BB1,0x9BB2,0x9BB3,/* 0x70-0x77 */ + 0x9BB4,0x9BB5,0x9BB6,0x9BB7,0x9BB8,0x9BB9,0x9BBA,0x0000,/* 0x78-0x7F */ + + 0x9BBB,0x9BBC,0x9BBD,0x9BBE,0x9BBF,0x9BC0,0x9BC1,0x9BC2,/* 0x80-0x87 */ + 0x9BC3,0x9BC4,0x9BC5,0x9BC6,0x9BC7,0x9BC8,0x9BC9,0x9BCA,/* 0x88-0x8F */ + 0x9BCB,0x9BCC,0x9BCD,0x9BCE,0x9BCF,0x9BD0,0x9BD1,0x9BD2,/* 0x90-0x97 */ + 0x9BD3,0x9BD4,0x9BD5,0x9BD6,0x9BD7,0x9BD8,0x9BD9,0x9BDA,/* 0x98-0x9F */ + 0x9BDB,0x9162,0x9161,0x9170,0x9169,0x916F,0x917D,0x917E,/* 0xA0-0xA7 */ + 0x9172,0x9174,0x9179,0x918C,0x9185,0x9190,0x918D,0x9191,/* 0xA8-0xAF */ + 0x91A2,0x91A3,0x91AA,0x91AD,0x91AE,0x91AF,0x91B5,0x91B4,/* 0xB0-0xB7 */ + 0x91BA,0x8C55,0x9E7E,0x8DB8,0x8DEB,0x8E05,0x8E59,0x8E69,/* 0xB8-0xBF */ + 0x8DB5,0x8DBF,0x8DBC,0x8DBA,0x8DC4,0x8DD6,0x8DD7,0x8DDA,/* 0xC0-0xC7 */ + 0x8DDE,0x8DCE,0x8DCF,0x8DDB,0x8DC6,0x8DEC,0x8DF7,0x8DF8,/* 0xC8-0xCF */ + 0x8DE3,0x8DF9,0x8DFB,0x8DE4,0x8E09,0x8DFD,0x8E14,0x8E1D,/* 0xD0-0xD7 */ + 0x8E1F,0x8E2C,0x8E2E,0x8E23,0x8E2F,0x8E3A,0x8E40,0x8E39,/* 0xD8-0xDF */ + 0x8E35,0x8E3D,0x8E31,0x8E49,0x8E41,0x8E42,0x8E51,0x8E52,/* 0xE0-0xE7 */ + 0x8E4A,0x8E70,0x8E76,0x8E7C,0x8E6F,0x8E74,0x8E85,0x8E8F,/* 0xE8-0xEF */ + 0x8E94,0x8E90,0x8E9C,0x8E9E,0x8C78,0x8C82,0x8C8A,0x8C85,/* 0xF0-0xF7 */ + 0x8C98,0x8C94,0x659B,0x89D6,0x89DE,0x89DA,0x89DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9BDC,0x9BDD,0x9BDE,0x9BDF,0x9BE0,0x9BE1,0x9BE2,0x9BE3,/* 0x40-0x47 */ + 0x9BE4,0x9BE5,0x9BE6,0x9BE7,0x9BE8,0x9BE9,0x9BEA,0x9BEB,/* 0x48-0x4F */ + 0x9BEC,0x9BED,0x9BEE,0x9BEF,0x9BF0,0x9BF1,0x9BF2,0x9BF3,/* 0x50-0x57 */ + 0x9BF4,0x9BF5,0x9BF6,0x9BF7,0x9BF8,0x9BF9,0x9BFA,0x9BFB,/* 0x58-0x5F */ + 0x9BFC,0x9BFD,0x9BFE,0x9BFF,0x9C00,0x9C01,0x9C02,0x9C03,/* 0x60-0x67 */ + 0x9C04,0x9C05,0x9C06,0x9C07,0x9C08,0x9C09,0x9C0A,0x9C0B,/* 0x68-0x6F */ + 0x9C0C,0x9C0D,0x9C0E,0x9C0F,0x9C10,0x9C11,0x9C12,0x9C13,/* 0x70-0x77 */ + 0x9C14,0x9C15,0x9C16,0x9C17,0x9C18,0x9C19,0x9C1A,0x0000,/* 0x78-0x7F */ + + 0x9C1B,0x9C1C,0x9C1D,0x9C1E,0x9C1F,0x9C20,0x9C21,0x9C22,/* 0x80-0x87 */ + 0x9C23,0x9C24,0x9C25,0x9C26,0x9C27,0x9C28,0x9C29,0x9C2A,/* 0x88-0x8F */ + 0x9C2B,0x9C2C,0x9C2D,0x9C2E,0x9C2F,0x9C30,0x9C31,0x9C32,/* 0x90-0x97 */ + 0x9C33,0x9C34,0x9C35,0x9C36,0x9C37,0x9C38,0x9C39,0x9C3A,/* 0x98-0x9F */ + 0x9C3B,0x89E5,0x89EB,0x89EF,0x8A3E,0x8B26,0x9753,0x96E9,/* 0xA0-0xA7 */ + 0x96F3,0x96EF,0x9706,0x9701,0x9708,0x970F,0x970E,0x972A,/* 0xA8-0xAF */ + 0x972D,0x9730,0x973E,0x9F80,0x9F83,0x9F85,0x9F86,0x9F87,/* 0xB0-0xB7 */ + 0x9F88,0x9F89,0x9F8A,0x9F8C,0x9EFE,0x9F0B,0x9F0D,0x96B9,/* 0xB8-0xBF */ + 0x96BC,0x96BD,0x96CE,0x96D2,0x77BF,0x96E0,0x928E,0x92AE,/* 0xC0-0xC7 */ + 0x92C8,0x933E,0x936A,0x93CA,0x938F,0x943E,0x946B,0x9C7F,/* 0xC8-0xCF */ + 0x9C82,0x9C85,0x9C86,0x9C87,0x9C88,0x7A23,0x9C8B,0x9C8E,/* 0xD0-0xD7 */ + 0x9C90,0x9C91,0x9C92,0x9C94,0x9C95,0x9C9A,0x9C9B,0x9C9E,/* 0xD8-0xDF */ + 0x9C9F,0x9CA0,0x9CA1,0x9CA2,0x9CA3,0x9CA5,0x9CA6,0x9CA7,/* 0xE0-0xE7 */ + 0x9CA8,0x9CA9,0x9CAB,0x9CAD,0x9CAE,0x9CB0,0x9CB1,0x9CB2,/* 0xE8-0xEF */ + 0x9CB3,0x9CB4,0x9CB5,0x9CB6,0x9CB7,0x9CBA,0x9CBB,0x9CBC,/* 0xF0-0xF7 */ + 0x9CBD,0x9CC4,0x9CC5,0x9CC6,0x9CC7,0x9CCA,0x9CCB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9C3C,0x9C3D,0x9C3E,0x9C3F,0x9C40,0x9C41,0x9C42,0x9C43,/* 0x40-0x47 */ + 0x9C44,0x9C45,0x9C46,0x9C47,0x9C48,0x9C49,0x9C4A,0x9C4B,/* 0x48-0x4F */ + 0x9C4C,0x9C4D,0x9C4E,0x9C4F,0x9C50,0x9C51,0x9C52,0x9C53,/* 0x50-0x57 */ + 0x9C54,0x9C55,0x9C56,0x9C57,0x9C58,0x9C59,0x9C5A,0x9C5B,/* 0x58-0x5F */ + 0x9C5C,0x9C5D,0x9C5E,0x9C5F,0x9C60,0x9C61,0x9C62,0x9C63,/* 0x60-0x67 */ + 0x9C64,0x9C65,0x9C66,0x9C67,0x9C68,0x9C69,0x9C6A,0x9C6B,/* 0x68-0x6F */ + 0x9C6C,0x9C6D,0x9C6E,0x9C6F,0x9C70,0x9C71,0x9C72,0x9C73,/* 0x70-0x77 */ + 0x9C74,0x9C75,0x9C76,0x9C77,0x9C78,0x9C79,0x9C7A,0x0000,/* 0x78-0x7F */ + + 0x9C7B,0x9C7D,0x9C7E,0x9C80,0x9C83,0x9C84,0x9C89,0x9C8A,/* 0x80-0x87 */ + 0x9C8C,0x9C8F,0x9C93,0x9C96,0x9C97,0x9C98,0x9C99,0x9C9D,/* 0x88-0x8F */ + 0x9CAA,0x9CAC,0x9CAF,0x9CB9,0x9CBE,0x9CBF,0x9CC0,0x9CC1,/* 0x90-0x97 */ + 0x9CC2,0x9CC8,0x9CC9,0x9CD1,0x9CD2,0x9CDA,0x9CDB,0x9CE0,/* 0x98-0x9F */ + 0x9CE1,0x9CCC,0x9CCD,0x9CCE,0x9CCF,0x9CD0,0x9CD3,0x9CD4,/* 0xA0-0xA7 */ + 0x9CD5,0x9CD7,0x9CD8,0x9CD9,0x9CDC,0x9CDD,0x9CDF,0x9CE2,/* 0xA8-0xAF */ + 0x977C,0x9785,0x9791,0x9792,0x9794,0x97AF,0x97AB,0x97A3,/* 0xB0-0xB7 */ + 0x97B2,0x97B4,0x9AB1,0x9AB0,0x9AB7,0x9E58,0x9AB6,0x9ABA,/* 0xB8-0xBF */ + 0x9ABC,0x9AC1,0x9AC0,0x9AC5,0x9AC2,0x9ACB,0x9ACC,0x9AD1,/* 0xC0-0xC7 */ + 0x9B45,0x9B43,0x9B47,0x9B49,0x9B48,0x9B4D,0x9B51,0x98E8,/* 0xC8-0xCF */ + 0x990D,0x992E,0x9955,0x9954,0x9ADF,0x9AE1,0x9AE6,0x9AEF,/* 0xD0-0xD7 */ + 0x9AEB,0x9AFB,0x9AED,0x9AF9,0x9B08,0x9B0F,0x9B13,0x9B1F,/* 0xD8-0xDF */ + 0x9B23,0x9EBD,0x9EBE,0x7E3B,0x9E82,0x9E87,0x9E88,0x9E8B,/* 0xE0-0xE7 */ + 0x9E92,0x93D6,0x9E9D,0x9E9F,0x9EDB,0x9EDC,0x9EDD,0x9EE0,/* 0xE8-0xEF */ + 0x9EDF,0x9EE2,0x9EE9,0x9EE7,0x9EE5,0x9EEA,0x9EEF,0x9F22,/* 0xF0-0xF7 */ + 0x9F2C,0x9F2F,0x9F39,0x9F37,0x9F3D,0x9F3E,0x9F44,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9CE3,0x9CE4,0x9CE5,0x9CE6,0x9CE7,0x9CE8,0x9CE9,0x9CEA,/* 0x40-0x47 */ + 0x9CEB,0x9CEC,0x9CED,0x9CEE,0x9CEF,0x9CF0,0x9CF1,0x9CF2,/* 0x48-0x4F */ + 0x9CF3,0x9CF4,0x9CF5,0x9CF6,0x9CF7,0x9CF8,0x9CF9,0x9CFA,/* 0x50-0x57 */ + 0x9CFB,0x9CFC,0x9CFD,0x9CFE,0x9CFF,0x9D00,0x9D01,0x9D02,/* 0x58-0x5F */ + 0x9D03,0x9D04,0x9D05,0x9D06,0x9D07,0x9D08,0x9D09,0x9D0A,/* 0x60-0x67 */ + 0x9D0B,0x9D0C,0x9D0D,0x9D0E,0x9D0F,0x9D10,0x9D11,0x9D12,/* 0x68-0x6F */ + 0x9D13,0x9D14,0x9D15,0x9D16,0x9D17,0x9D18,0x9D19,0x9D1A,/* 0x70-0x77 */ + 0x9D1B,0x9D1C,0x9D1D,0x9D1E,0x9D1F,0x9D20,0x9D21,0x0000,/* 0x78-0x7F */ + + 0x9D22,0x9D23,0x9D24,0x9D25,0x9D26,0x9D27,0x9D28,0x9D29,/* 0x80-0x87 */ + 0x9D2A,0x9D2B,0x9D2C,0x9D2D,0x9D2E,0x9D2F,0x9D30,0x9D31,/* 0x88-0x8F */ + 0x9D32,0x9D33,0x9D34,0x9D35,0x9D36,0x9D37,0x9D38,0x9D39,/* 0x90-0x97 */ + 0x9D3A,0x9D3B,0x9D3C,0x9D3D,0x9D3E,0x9D3F,0x9D40,0x9D41,/* 0x98-0x9F */ + 0x9D42,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9D43,0x9D44,0x9D45,0x9D46,0x9D47,0x9D48,0x9D49,0x9D4A,/* 0x40-0x47 */ + 0x9D4B,0x9D4C,0x9D4D,0x9D4E,0x9D4F,0x9D50,0x9D51,0x9D52,/* 0x48-0x4F */ + 0x9D53,0x9D54,0x9D55,0x9D56,0x9D57,0x9D58,0x9D59,0x9D5A,/* 0x50-0x57 */ + 0x9D5B,0x9D5C,0x9D5D,0x9D5E,0x9D5F,0x9D60,0x9D61,0x9D62,/* 0x58-0x5F */ + 0x9D63,0x9D64,0x9D65,0x9D66,0x9D67,0x9D68,0x9D69,0x9D6A,/* 0x60-0x67 */ + 0x9D6B,0x9D6C,0x9D6D,0x9D6E,0x9D6F,0x9D70,0x9D71,0x9D72,/* 0x68-0x6F */ + 0x9D73,0x9D74,0x9D75,0x9D76,0x9D77,0x9D78,0x9D79,0x9D7A,/* 0x70-0x77 */ + 0x9D7B,0x9D7C,0x9D7D,0x9D7E,0x9D7F,0x9D80,0x9D81,0x0000,/* 0x78-0x7F */ + + 0x9D82,0x9D83,0x9D84,0x9D85,0x9D86,0x9D87,0x9D88,0x9D89,/* 0x80-0x87 */ + 0x9D8A,0x9D8B,0x9D8C,0x9D8D,0x9D8E,0x9D8F,0x9D90,0x9D91,/* 0x88-0x8F */ + 0x9D92,0x9D93,0x9D94,0x9D95,0x9D96,0x9D97,0x9D98,0x9D99,/* 0x90-0x97 */ + 0x9D9A,0x9D9B,0x9D9C,0x9D9D,0x9D9E,0x9D9F,0x9DA0,0x9DA1,/* 0x98-0x9F */ + 0x9DA2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9DA3,0x9DA4,0x9DA5,0x9DA6,0x9DA7,0x9DA8,0x9DA9,0x9DAA,/* 0x40-0x47 */ + 0x9DAB,0x9DAC,0x9DAD,0x9DAE,0x9DAF,0x9DB0,0x9DB1,0x9DB2,/* 0x48-0x4F */ + 0x9DB3,0x9DB4,0x9DB5,0x9DB6,0x9DB7,0x9DB8,0x9DB9,0x9DBA,/* 0x50-0x57 */ + 0x9DBB,0x9DBC,0x9DBD,0x9DBE,0x9DBF,0x9DC0,0x9DC1,0x9DC2,/* 0x58-0x5F */ + 0x9DC3,0x9DC4,0x9DC5,0x9DC6,0x9DC7,0x9DC8,0x9DC9,0x9DCA,/* 0x60-0x67 */ + 0x9DCB,0x9DCC,0x9DCD,0x9DCE,0x9DCF,0x9DD0,0x9DD1,0x9DD2,/* 0x68-0x6F */ + 0x9DD3,0x9DD4,0x9DD5,0x9DD6,0x9DD7,0x9DD8,0x9DD9,0x9DDA,/* 0x70-0x77 */ + 0x9DDB,0x9DDC,0x9DDD,0x9DDE,0x9DDF,0x9DE0,0x9DE1,0x0000,/* 0x78-0x7F */ + + 0x9DE2,0x9DE3,0x9DE4,0x9DE5,0x9DE6,0x9DE7,0x9DE8,0x9DE9,/* 0x80-0x87 */ + 0x9DEA,0x9DEB,0x9DEC,0x9DED,0x9DEE,0x9DEF,0x9DF0,0x9DF1,/* 0x88-0x8F */ + 0x9DF2,0x9DF3,0x9DF4,0x9DF5,0x9DF6,0x9DF7,0x9DF8,0x9DF9,/* 0x90-0x97 */ + 0x9DFA,0x9DFB,0x9DFC,0x9DFD,0x9DFE,0x9DFF,0x9E00,0x9E01,/* 0x98-0x9F */ + 0x9E02,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9E03,0x9E04,0x9E05,0x9E06,0x9E07,0x9E08,0x9E09,0x9E0A,/* 0x40-0x47 */ + 0x9E0B,0x9E0C,0x9E0D,0x9E0E,0x9E0F,0x9E10,0x9E11,0x9E12,/* 0x48-0x4F */ + 0x9E13,0x9E14,0x9E15,0x9E16,0x9E17,0x9E18,0x9E19,0x9E1A,/* 0x50-0x57 */ + 0x9E1B,0x9E1C,0x9E1D,0x9E1E,0x9E24,0x9E27,0x9E2E,0x9E30,/* 0x58-0x5F */ + 0x9E34,0x9E3B,0x9E3C,0x9E40,0x9E4D,0x9E50,0x9E52,0x9E53,/* 0x60-0x67 */ + 0x9E54,0x9E56,0x9E59,0x9E5D,0x9E5F,0x9E60,0x9E61,0x9E62,/* 0x68-0x6F */ + 0x9E65,0x9E6E,0x9E6F,0x9E72,0x9E74,0x9E75,0x9E76,0x9E77,/* 0x70-0x77 */ + 0x9E78,0x9E79,0x9E7A,0x9E7B,0x9E7C,0x9E7D,0x9E80,0x0000,/* 0x78-0x7F */ + + 0x9E81,0x9E83,0x9E84,0x9E85,0x9E86,0x9E89,0x9E8A,0x9E8C,/* 0x80-0x87 */ + 0x9E8D,0x9E8E,0x9E8F,0x9E90,0x9E91,0x9E94,0x9E95,0x9E96,/* 0x88-0x8F */ + 0x9E97,0x9E98,0x9E99,0x9E9A,0x9E9B,0x9E9C,0x9E9E,0x9EA0,/* 0x90-0x97 */ + 0x9EA1,0x9EA2,0x9EA3,0x9EA4,0x9EA5,0x9EA7,0x9EA8,0x9EA9,/* 0x98-0x9F */ + 0x9EAA,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9EAB,0x9EAC,0x9EAD,0x9EAE,0x9EAF,0x9EB0,0x9EB1,0x9EB2,/* 0x40-0x47 */ + 0x9EB3,0x9EB5,0x9EB6,0x9EB7,0x9EB9,0x9EBA,0x9EBC,0x9EBF,/* 0x48-0x4F */ + 0x9EC0,0x9EC1,0x9EC2,0x9EC3,0x9EC5,0x9EC6,0x9EC7,0x9EC8,/* 0x50-0x57 */ + 0x9ECA,0x9ECB,0x9ECC,0x9ED0,0x9ED2,0x9ED3,0x9ED5,0x9ED6,/* 0x58-0x5F */ + 0x9ED7,0x9ED9,0x9EDA,0x9EDE,0x9EE1,0x9EE3,0x9EE4,0x9EE6,/* 0x60-0x67 */ + 0x9EE8,0x9EEB,0x9EEC,0x9EED,0x9EEE,0x9EF0,0x9EF1,0x9EF2,/* 0x68-0x6F */ + 0x9EF3,0x9EF4,0x9EF5,0x9EF6,0x9EF7,0x9EF8,0x9EFA,0x9EFD,/* 0x70-0x77 */ + 0x9EFF,0x9F00,0x9F01,0x9F02,0x9F03,0x9F04,0x9F05,0x0000,/* 0x78-0x7F */ + + 0x9F06,0x9F07,0x9F08,0x9F09,0x9F0A,0x9F0C,0x9F0F,0x9F11,/* 0x80-0x87 */ + 0x9F12,0x9F14,0x9F15,0x9F16,0x9F18,0x9F1A,0x9F1B,0x9F1C,/* 0x88-0x8F */ + 0x9F1D,0x9F1E,0x9F1F,0x9F21,0x9F23,0x9F24,0x9F25,0x9F26,/* 0x90-0x97 */ + 0x9F27,0x9F28,0x9F29,0x9F2A,0x9F2B,0x9F2D,0x9F2E,0x9F30,/* 0x98-0x9F */ + 0x9F31,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9F32,0x9F33,0x9F34,0x9F35,0x9F36,0x9F38,0x9F3A,0x9F3C,/* 0x40-0x47 */ + 0x9F3F,0x9F40,0x9F41,0x9F42,0x9F43,0x9F45,0x9F46,0x9F47,/* 0x48-0x4F */ + 0x9F48,0x9F49,0x9F4A,0x9F4B,0x9F4C,0x9F4D,0x9F4E,0x9F4F,/* 0x50-0x57 */ + 0x9F52,0x9F53,0x9F54,0x9F55,0x9F56,0x9F57,0x9F58,0x9F59,/* 0x58-0x5F */ + 0x9F5A,0x9F5B,0x9F5C,0x9F5D,0x9F5E,0x9F5F,0x9F60,0x9F61,/* 0x60-0x67 */ + 0x9F62,0x9F63,0x9F64,0x9F65,0x9F66,0x9F67,0x9F68,0x9F69,/* 0x68-0x6F */ + 0x9F6A,0x9F6B,0x9F6C,0x9F6D,0x9F6E,0x9F6F,0x9F70,0x9F71,/* 0x70-0x77 */ + 0x9F72,0x9F73,0x9F74,0x9F75,0x9F76,0x9F77,0x9F78,0x0000,/* 0x78-0x7F */ + + 0x9F79,0x9F7A,0x9F7B,0x9F7C,0x9F7D,0x9F7E,0x9F81,0x9F82,/* 0x80-0x87 */ + 0x9F8D,0x9F8E,0x9F8F,0x9F90,0x9F91,0x9F92,0x9F93,0x9F94,/* 0x88-0x8F */ + 0x9F95,0x9F96,0x9F97,0x9F98,0x9F9C,0x9F9D,0x9F9E,0x9FA1,/* 0x90-0x97 */ + 0x9FA2,0x9FA3,0x9FA4,0x9FA5,0xF92C,0xF979,0xF995,0xF9E7,/* 0x98-0x9F */ + 0xF9F1,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFA0C,0xFA0D,0xFA0E,0xFA0F,0xFA11,0xFA13,0xFA14,0xFA18,/* 0x40-0x47 */ + 0xFA1F,0xFA20,0xFA21,0xFA23,0xFA24,0xFA27,0xFA28,0xFA29,/* 0x48-0x4F */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, + c2u_C8, c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, c2u_FA, c2u_FB, c2u_FC, c2u_FD, c2u_FE, NULL, +}; + +static const unsigned char u2c_00[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA1, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, /* 0xA4-0xA7 */ + 0xA1, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xA1, 0xE3, 0xA1, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA4, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC1, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA8, 0xA4, 0xA8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA8, 0xA8, 0xA8, 0xA6, 0xA8, 0xBA, 0x00, 0x00, /* 0xE8-0xEB */ + 0xA8, 0xAC, 0xA8, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB0, 0xA8, 0xAE, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC2, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA8, 0xB4, 0xA8, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ + 0xA8, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_01[512] = { + 0xA8, 0xA1, 0xA8, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA5, 0xA8, 0xA5, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA7, 0xA8, 0xA7, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA9, 0xA8, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xA8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xA8, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA8, 0xAD, 0xA8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0xA8, 0xB1, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA8, 0xA3, 0xA8, 0xA3, 0xA8, 0xAB, /* 0xCC-0xCF */ + 0xA8, 0xAB, 0xA8, 0xAF, 0xA8, 0xAF, 0xA8, 0xB3, /* 0xD0-0xD3 */ + 0xA8, 0xB3, 0xA8, 0xB5, 0xA8, 0xB5, 0xA8, 0xB6, /* 0xD4-0xD7 */ + 0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB7, 0xA8, 0xB8, /* 0xD8-0xDB */ + 0xA8, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA8, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xA1, 0xA5, 0xA8, 0x40, 0xA8, 0x41, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA8, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA6, 0xA1, 0xA6, 0xA2, 0xA6, 0xA3, /* 0x90-0x93 */ + 0xA6, 0xA4, 0xA6, 0xA5, 0xA6, 0xA6, 0xA6, 0xA7, /* 0x94-0x97 */ + 0xA6, 0xA8, 0xA6, 0xA9, 0xA6, 0xAA, 0xA6, 0xAB, /* 0x98-0x9B */ + 0xA6, 0xAC, 0xA6, 0xAD, 0xA6, 0xAE, 0xA6, 0xAF, /* 0x9C-0x9F */ + 0xA6, 0xB0, 0xA6, 0xB1, 0x00, 0x00, 0xA6, 0xB2, /* 0xA0-0xA3 */ + 0xA6, 0xB3, 0xA6, 0xB4, 0xA6, 0xB5, 0xA6, 0xB6, /* 0xA4-0xA7 */ + 0xA6, 0xB7, 0xA6, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA6, 0xC1, 0xA6, 0xC2, 0xA6, 0xC3, /* 0xB0-0xB3 */ + 0xA6, 0xC4, 0xA6, 0xC5, 0xA6, 0xC6, 0xA6, 0xC7, /* 0xB4-0xB7 */ + 0xA6, 0xC8, 0xA6, 0xC9, 0xA6, 0xCA, 0xA6, 0xCB, /* 0xB8-0xBB */ + 0xA6, 0xCC, 0xA6, 0xCD, 0xA6, 0xCE, 0xA6, 0xCF, /* 0xBC-0xBF */ + 0xA6, 0xD0, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0xC0-0xC3 */ + 0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xD5, 0xA6, 0xD6, /* 0xC4-0xC7 */ + 0xA6, 0xD7, 0xA6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0xA7, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, 0xA7, 0xA4, /* 0x10-0x13 */ + 0xA7, 0xA5, 0xA7, 0xA6, 0xA7, 0xA8, 0xA7, 0xA9, /* 0x14-0x17 */ + 0xA7, 0xAA, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x18-0x1B */ + 0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x1C-0x1F */ + 0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xB5, /* 0x20-0x23 */ + 0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, 0xA7, 0xB9, /* 0x24-0x27 */ + 0xA7, 0xBA, 0xA7, 0xBB, 0xA7, 0xBC, 0xA7, 0xBD, /* 0x28-0x2B */ + 0xA7, 0xBE, 0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, /* 0x2C-0x2F */ + 0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, 0xA7, 0xD4, /* 0x30-0x33 */ + 0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD8, 0xA7, 0xD9, /* 0x34-0x37 */ + 0xA7, 0xDA, 0xA7, 0xDB, 0xA7, 0xDC, 0xA7, 0xDD, /* 0x38-0x3B */ + 0xA7, 0xDE, 0xA7, 0xDF, 0xA7, 0xE0, 0xA7, 0xE1, /* 0x3C-0x3F */ + 0xA7, 0xE2, 0xA7, 0xE3, 0xA7, 0xE4, 0xA7, 0xE5, /* 0x40-0x43 */ + 0xA7, 0xE6, 0xA7, 0xE7, 0xA7, 0xE8, 0xA7, 0xE9, /* 0x44-0x47 */ + 0xA7, 0xEA, 0xA7, 0xEB, 0xA7, 0xEC, 0xA7, 0xED, /* 0x48-0x4B */ + 0xA7, 0xEE, 0xA7, 0xEF, 0xA7, 0xF0, 0xA7, 0xF1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA9, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x43, /* 0x10-0x13 */ + 0xA1, 0xAA, 0xA8, 0x44, 0xA1, 0xAC, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA8, 0x45, 0xA1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0xEB, 0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, /* 0x30-0x33 */ + 0x00, 0x00, 0xA8, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA3, 0xFE, 0x00, 0x00, /* 0x3C-0x3F */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE6, /* 0x00-0x03 */ + 0x00, 0x00, 0xA8, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA8, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xED, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA9, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, 0xA2, 0xF4, /* 0x60-0x63 */ + 0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, 0xA2, 0xF8, /* 0x64-0x67 */ + 0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, 0xA2, 0xFC, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA2, 0xA1, 0xA2, 0xA2, 0xA2, 0xA3, 0xA2, 0xA4, /* 0x70-0x73 */ + 0xA2, 0xA5, 0xA2, 0xA6, 0xA2, 0xA7, 0xA2, 0xA8, /* 0x74-0x77 */ + 0xA2, 0xA9, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xFB, 0xA1, 0xFC, 0xA1, 0xFA, 0xA1, 0xFD, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x49, 0xA8, 0x4A, /* 0x94-0x97 */ + 0xA8, 0x4B, 0xA8, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_22[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC7, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA1, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xE3, 0x00, 0x00, 0xA1, 0xCC, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xDE, 0xA8, 0x4E, /* 0x1C-0x1F */ + 0xA1, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x4F, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0xA1, 0xC4, /* 0x24-0x27 */ + 0xA1, 0xC5, 0xA1, 0xC9, 0xA1, 0xC8, 0xA1, 0xD2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD3, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xE0, 0xA1, 0xDF, 0xA1, 0xC3, 0xA1, 0xCB, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA1, 0xAB, 0xA1, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xA1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x50, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xD9, 0xA1, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0xDC, 0xA1, 0xDD, 0xA8, 0x51, 0xA8, 0x52, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x53, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD0, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xD9, 0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, /* 0x60-0x63 */ + 0xA2, 0xDD, 0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, /* 0x64-0x67 */ + 0xA2, 0xE1, 0xA2, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA2, 0xC5, 0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, /* 0x74-0x77 */ + 0xA2, 0xC9, 0xA2, 0xCA, 0xA2, 0xCB, 0xA2, 0xCC, /* 0x78-0x7B */ + 0xA2, 0xCD, 0xA2, 0xCE, 0xA2, 0xCF, 0xA2, 0xD0, /* 0x7C-0x7F */ + + 0xA2, 0xD1, 0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, /* 0x80-0x83 */ + 0xA2, 0xD5, 0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, /* 0x84-0x87 */ + 0xA2, 0xB1, 0xA2, 0xB2, 0xA2, 0xB3, 0xA2, 0xB4, /* 0x88-0x8B */ + 0xA2, 0xB5, 0xA2, 0xB6, 0xA2, 0xB7, 0xA2, 0xB8, /* 0x8C-0x8F */ + 0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x90-0x93 */ + 0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x94-0x97 */ + 0xA2, 0xC1, 0xA2, 0xC2, 0xA2, 0xC3, 0xA2, 0xC4, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_25[512] = { + 0xA9, 0xA4, 0xA9, 0xA5, 0xA9, 0xA6, 0xA9, 0xA7, /* 0x00-0x03 */ + 0xA9, 0xA8, 0xA9, 0xA9, 0xA9, 0xAA, 0xA9, 0xAB, /* 0x04-0x07 */ + 0xA9, 0xAC, 0xA9, 0xAD, 0xA9, 0xAE, 0xA9, 0xAF, /* 0x08-0x0B */ + 0xA9, 0xB0, 0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, /* 0x0C-0x0F */ + 0xA9, 0xB4, 0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x10-0x13 */ + 0xA9, 0xB8, 0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, /* 0x14-0x17 */ + 0xA9, 0xBC, 0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, /* 0x18-0x1B */ + 0xA9, 0xC0, 0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, /* 0x1C-0x1F */ + 0xA9, 0xC4, 0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, /* 0x20-0x23 */ + 0xA9, 0xC8, 0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, /* 0x24-0x27 */ + 0xA9, 0xCC, 0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, /* 0x28-0x2B */ + 0xA9, 0xD0, 0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, /* 0x2C-0x2F */ + 0xA9, 0xD4, 0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, /* 0x30-0x33 */ + 0xA9, 0xD8, 0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, /* 0x34-0x37 */ + 0xA9, 0xDC, 0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, /* 0x38-0x3B */ + 0xA9, 0xE0, 0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, /* 0x3C-0x3F */ + 0xA9, 0xE4, 0xA9, 0xE5, 0xA9, 0xE6, 0xA9, 0xE7, /* 0x40-0x43 */ + 0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, 0xA9, 0xEB, /* 0x44-0x47 */ + 0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, 0xA9, 0xEF, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xA8, 0x54, 0xA8, 0x55, 0xA8, 0x56, 0xA8, 0x57, /* 0x50-0x53 */ + 0xA8, 0x58, 0xA8, 0x59, 0xA8, 0x5A, 0xA8, 0x5B, /* 0x54-0x57 */ + 0xA8, 0x5C, 0xA8, 0x5D, 0xA8, 0x5E, 0xA8, 0x5F, /* 0x58-0x5B */ + 0xA8, 0x60, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x5C-0x5F */ + 0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x60-0x63 */ + 0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x64-0x67 */ + 0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x68-0x6B */ + 0xA8, 0x70, 0xA8, 0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x6C-0x6F */ + 0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, 0xA8, 0x77, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0x80-0x83 */ + 0xA8, 0x7B, 0xA8, 0x7C, 0xA8, 0x7D, 0xA8, 0x7E, /* 0x84-0x87 */ + 0xA8, 0x80, 0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, /* 0x88-0x8B */ + 0xA8, 0x84, 0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x88, /* 0x90-0x93 */ + 0xA8, 0x89, 0xA8, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xF6, 0xA1, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA8, 0x8B, 0xA8, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF4, 0xA1, 0xF3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF0, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF2, 0xA1, 0xF1, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x8D, 0xA8, 0x8E, /* 0xE0-0xE3 */ + 0xA8, 0x8F, 0xA8, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA8, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xE2, 0x00, 0x00, 0xA1, 0xE1, 0x00, 0x00, /* 0x40-0x43 */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xA9, 0xA9, 0x65, 0xA9, 0x96, /* 0x04-0x07 */ + 0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */ + 0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */ + 0xA1, 0xBE, 0xA1, 0xBF, 0xA8, 0x93, 0xA1, 0xFE, /* 0x10-0x13 */ + 0xA1, 0xB2, 0xA1, 0xB3, 0xA1, 0xBC, 0xA1, 0xBD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA8, 0x94, 0xA8, 0x95, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA9, 0x40, 0xA9, 0x41, 0xA9, 0x42, /* 0x20-0x23 */ + 0xA9, 0x43, 0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, /* 0x24-0x27 */ + 0xA9, 0x47, 0xA9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x40-0x43 */ + 0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0x44-0x47 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x48-0x4B */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x4C-0x4F */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x50-0x53 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x54-0x57 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x58-0x5B */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x5C-0x5F */ + 0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x60-0x63 */ + 0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x64-0x67 */ + 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x68-0x6B */ + 0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x6C-0x6F */ + 0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x70-0x73 */ + 0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x74-0x77 */ + 0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x78-0x7B */ + 0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x7C-0x7F */ + + 0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x80-0x83 */ + 0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x84-0x87 */ + 0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x88-0x8B */ + 0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x8C-0x8F */ + 0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x61, /* 0x98-0x9B */ + 0xA9, 0x62, 0xA9, 0x66, 0xA9, 0x67, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, /* 0xA0-0xA3 */ + 0xA5, 0xA4, 0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, /* 0xA4-0xA7 */ + 0xA5, 0xA8, 0xA5, 0xA9, 0xA5, 0xAA, 0xA5, 0xAB, /* 0xA8-0xAB */ + 0xA5, 0xAC, 0xA5, 0xAD, 0xA5, 0xAE, 0xA5, 0xAF, /* 0xAC-0xAF */ + 0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0xB0-0xB3 */ + 0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0xB4-0xB7 */ + 0xA5, 0xB8, 0xA5, 0xB9, 0xA5, 0xBA, 0xA5, 0xBB, /* 0xB8-0xBB */ + 0xA5, 0xBC, 0xA5, 0xBD, 0xA5, 0xBE, 0xA5, 0xBF, /* 0xBC-0xBF */ + 0xA5, 0xC0, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0xC0-0xC3 */ + 0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0xC4-0xC7 */ + 0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0xC8-0xCB */ + 0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0xCC-0xCF */ + 0xA5, 0xD0, 0xA5, 0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0xD0-0xD3 */ + 0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, 0xA5, 0xD7, /* 0xD4-0xD7 */ + 0xA5, 0xD8, 0xA5, 0xD9, 0xA5, 0xDA, 0xA5, 0xDB, /* 0xD8-0xDB */ + 0xA5, 0xDC, 0xA5, 0xDD, 0xA5, 0xDE, 0xA5, 0xDF, /* 0xDC-0xDF */ + 0xA5, 0xE0, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xE0-0xE3 */ + 0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xE4-0xE7 */ + 0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xE8-0xEB */ + 0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xEC-0xEF */ + 0xA5, 0xF0, 0xA5, 0xF1, 0xA5, 0xF2, 0xA5, 0xF3, /* 0xF0-0xF3 */ + 0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xA9, 0x60, 0xA9, 0x63, 0xA9, 0x64, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, /* 0x04-0x07 */ + 0xA8, 0xC8, 0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, /* 0x08-0x0B */ + 0xA8, 0xCC, 0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, /* 0x0C-0x0F */ + 0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, /* 0x10-0x13 */ + 0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, /* 0x14-0x17 */ + 0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, /* 0x18-0x1B */ + 0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, /* 0x1C-0x1F */ + 0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, 0xA8, 0xE3, /* 0x20-0x23 */ + 0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, 0xA8, 0xE7, /* 0x24-0x27 */ + 0xA8, 0xE8, 0xA8, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xBB, 0xB6, 0xFE, /* 0x90-0x93 */ + 0xC8, 0xFD, 0xCB, 0xC4, 0xC9, 0xCF, 0xD6, 0xD0, /* 0x94-0x97 */ + 0xCF, 0xC2, 0xBC, 0xD7, 0xD2, 0xD2, 0xB1, 0xFB, /* 0x98-0x9B */ + 0xB6, 0xA1, 0xCC, 0xEC, 0xB5, 0xD8, 0xC8, 0xCB, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA2, 0xE5, 0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, /* 0x20-0x23 */ + 0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, 0xA2, 0xEC, /* 0x24-0x27 */ + 0xA2, 0xED, 0xA2, 0xEE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x28-0x2B */ + 0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x2C-0x2F */ + 0xC8, 0xD5, 0xA9, 0x5A, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x30-0x33 */ + 0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x34-0x37 */ + 0xC0, 0xCD, 0xB4, 0xFA, 0xBA, 0xF4, 0xD1, 0xA7, /* 0x38-0x3B */ + 0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0x3C-0x3F */ + 0xBC, 0xC0, 0xD0, 0xDD, 0xD7, 0xD4, 0xD6, 0xC1, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD2, 0xBB, 0xB6, 0xFE, 0xC8, 0xFD, 0xCB, 0xC4, /* 0x80-0x83 */ + 0xCE, 0xE5, 0xC1, 0xF9, 0xC6, 0xDF, 0xB0, 0xCB, /* 0x84-0x87 */ + 0xBE, 0xC5, 0xCA, 0xAE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x88-0x8B */ + 0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x8C-0x8F */ + 0xC8, 0xD5, 0xD6, 0xEA, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x90-0x93 */ + 0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x94-0x97 */ + 0xC0, 0xCD, 0xC3, 0xD8, 0xC4, 0xD0, 0xC5, 0xAE, /* 0x98-0x9B */ + 0xCA, 0xCA, 0xD3, 0xC5, 0x00, 0x00, 0xD7, 0xA2, /* 0x9C-0x9F */ + 0xCF, 0xEE, 0xD0, 0xDD, 0xD0, 0xB4, 0xA9, 0x49, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD2, 0xBD, 0xD7, 0xDA, 0xD1, 0xA7, /* 0xA8-0xAB */ + 0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0xAC-0xAF */ + 0xD2, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x4A, 0xA9, 0x4B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA9, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xA9, 0x52, 0xA9, 0x53, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xA9, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_4E[512] = { + 0xD2, 0xBB, 0xB6, 0xA1, 0x81, 0x40, 0xC6, 0xDF, /* 0x00-0x03 */ + 0x81, 0x41, 0x81, 0x42, 0x81, 0x43, 0xCD, 0xF2, /* 0x04-0x07 */ + 0xD5, 0xC9, 0xC8, 0xFD, 0xC9, 0xCF, 0xCF, 0xC2, /* 0x08-0x0B */ + 0xD8, 0xA2, 0xB2, 0xBB, 0xD3, 0xEB, 0x81, 0x44, /* 0x0C-0x0F */ + 0xD8, 0xA4, 0xB3, 0xF3, 0x81, 0x45, 0xD7, 0xA8, /* 0x10-0x13 */ + 0xC7, 0xD2, 0xD8, 0xA7, 0xCA, 0xC0, 0x81, 0x46, /* 0x14-0x17 */ + 0xC7, 0xF0, 0xB1, 0xFB, 0xD2, 0xB5, 0xB4, 0xD4, /* 0x18-0x1B */ + 0xB6, 0xAB, 0xCB, 0xBF, 0xD8, 0xA9, 0x81, 0x47, /* 0x1C-0x1F */ + 0x81, 0x48, 0x81, 0x49, 0xB6, 0xAA, 0x81, 0x4A, /* 0x20-0x23 */ + 0xC1, 0xBD, 0xD1, 0xCF, 0x81, 0x4B, 0xC9, 0xA5, /* 0x24-0x27 */ + 0xD8, 0xAD, 0x81, 0x4C, 0xB8, 0xF6, 0xD1, 0xBE, /* 0x28-0x2B */ + 0xE3, 0xDC, 0xD6, 0xD0, 0x81, 0x4D, 0x81, 0x4E, /* 0x2C-0x2F */ + 0xB7, 0xE1, 0x81, 0x4F, 0xB4, 0xAE, 0x81, 0x50, /* 0x30-0x33 */ + 0xC1, 0xD9, 0x81, 0x51, 0xD8, 0xBC, 0x81, 0x52, /* 0x34-0x37 */ + 0xCD, 0xE8, 0xB5, 0xA4, 0xCE, 0xAA, 0xD6, 0xF7, /* 0x38-0x3B */ + 0x81, 0x53, 0xC0, 0xF6, 0xBE, 0xD9, 0xD8, 0xAF, /* 0x3C-0x3F */ + 0x81, 0x54, 0x81, 0x55, 0x81, 0x56, 0xC4, 0xCB, /* 0x40-0x43 */ + 0x81, 0x57, 0xBE, 0xC3, 0x81, 0x58, 0xD8, 0xB1, /* 0x44-0x47 */ + 0xC3, 0xB4, 0xD2, 0xE5, 0x81, 0x59, 0xD6, 0xAE, /* 0x48-0x4B */ + 0xCE, 0xDA, 0xD5, 0xA7, 0xBA, 0xF5, 0xB7, 0xA6, /* 0x4C-0x4F */ + 0xC0, 0xD6, 0x81, 0x5A, 0xC6, 0xB9, 0xC5, 0xD2, /* 0x50-0x53 */ + 0xC7, 0xC7, 0x81, 0x5B, 0xB9, 0xD4, 0x81, 0x5C, /* 0x54-0x57 */ + 0xB3, 0xCB, 0xD2, 0xD2, 0x81, 0x5D, 0x81, 0x5E, /* 0x58-0x5B */ + 0xD8, 0xBF, 0xBE, 0xC5, 0xC6, 0xF2, 0xD2, 0xB2, /* 0x5C-0x5F */ + 0xCF, 0xB0, 0xCF, 0xE7, 0x81, 0x5F, 0x81, 0x60, /* 0x60-0x63 */ + 0x81, 0x61, 0x81, 0x62, 0xCA, 0xE9, 0x81, 0x63, /* 0x64-0x67 */ + 0x81, 0x64, 0xD8, 0xC0, 0x81, 0x65, 0x81, 0x66, /* 0x68-0x6B */ + 0x81, 0x67, 0x81, 0x68, 0x81, 0x69, 0x81, 0x6A, /* 0x6C-0x6F */ + 0xC2, 0xF2, 0xC2, 0xD2, 0x81, 0x6B, 0xC8, 0xE9, /* 0x70-0x73 */ + 0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x74-0x77 */ + 0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0x81, 0x73, /* 0x78-0x7B */ + 0x81, 0x74, 0x81, 0x75, 0xC7, 0xAC, 0x81, 0x76, /* 0x7C-0x7F */ + + 0x81, 0x77, 0x81, 0x78, 0x81, 0x79, 0x81, 0x7A, /* 0x80-0x83 */ + 0x81, 0x7B, 0x81, 0x7C, 0xC1, 0xCB, 0x81, 0x7D, /* 0x84-0x87 */ + 0xD3, 0xE8, 0xD5, 0xF9, 0x81, 0x7E, 0xCA, 0xC2, /* 0x88-0x8B */ + 0xB6, 0xFE, 0xD8, 0xA1, 0xD3, 0xDA, 0xBF, 0xF7, /* 0x8C-0x8F */ + 0x81, 0x80, 0xD4, 0xC6, 0xBB, 0xA5, 0xD8, 0xC1, /* 0x90-0x93 */ + 0xCE, 0xE5, 0xBE, 0xAE, 0x81, 0x81, 0x81, 0x82, /* 0x94-0x97 */ + 0xD8, 0xA8, 0x81, 0x83, 0xD1, 0xC7, 0xD0, 0xA9, /* 0x98-0x9B */ + 0x81, 0x84, 0x81, 0x85, 0x81, 0x86, 0xD8, 0xBD, /* 0x9C-0x9F */ + 0xD9, 0xEF, 0xCD, 0xF6, 0xBF, 0xBA, 0x81, 0x87, /* 0xA0-0xA3 */ + 0xBD, 0xBB, 0xBA, 0xA5, 0xD2, 0xE0, 0xB2, 0xFA, /* 0xA4-0xA7 */ + 0xBA, 0xE0, 0xC4, 0xB6, 0x81, 0x88, 0xCF, 0xED, /* 0xA8-0xAB */ + 0xBE, 0xA9, 0xCD, 0xA4, 0xC1, 0xC1, 0x81, 0x89, /* 0xAC-0xAF */ + 0x81, 0x8A, 0x81, 0x8B, 0xC7, 0xD7, 0xD9, 0xF1, /* 0xB0-0xB3 */ + 0x81, 0x8C, 0xD9, 0xF4, 0x81, 0x8D, 0x81, 0x8E, /* 0xB4-0xB7 */ + 0x81, 0x8F, 0x81, 0x90, 0xC8, 0xCB, 0xD8, 0xE9, /* 0xB8-0xBB */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0xD2, 0xDA, /* 0xBC-0xBF */ + 0xCA, 0xB2, 0xC8, 0xCA, 0xD8, 0xEC, 0xD8, 0xEA, /* 0xC0-0xC3 */ + 0xD8, 0xC6, 0xBD, 0xF6, 0xC6, 0xCD, 0xB3, 0xF0, /* 0xC4-0xC7 */ + 0x81, 0x94, 0xD8, 0xEB, 0xBD, 0xF1, 0xBD, 0xE9, /* 0xC8-0xCB */ + 0x81, 0x95, 0xC8, 0xD4, 0xB4, 0xD3, 0x81, 0x96, /* 0xCC-0xCF */ + 0x81, 0x97, 0xC2, 0xD8, 0x81, 0x98, 0xB2, 0xD6, /* 0xD0-0xD3 */ + 0xD7, 0xD0, 0xCA, 0xCB, 0xCB, 0xFB, 0xD5, 0xCC, /* 0xD4-0xD7 */ + 0xB8, 0xB6, 0xCF, 0xC9, 0x81, 0x99, 0x81, 0x9A, /* 0xD8-0xDB */ + 0x81, 0x9B, 0xD9, 0xDA, 0xD8, 0xF0, 0xC7, 0xAA, /* 0xDC-0xDF */ + 0x81, 0x9C, 0xD8, 0xEE, 0x81, 0x9D, 0xB4, 0xFA, /* 0xE0-0xE3 */ + 0xC1, 0xEE, 0xD2, 0xD4, 0x81, 0x9E, 0x81, 0x9F, /* 0xE4-0xE7 */ + 0xD8, 0xED, 0x81, 0xA0, 0xD2, 0xC7, 0xD8, 0xEF, /* 0xE8-0xEB */ + 0xC3, 0xC7, 0x81, 0xA1, 0x81, 0xA2, 0x81, 0xA3, /* 0xEC-0xEF */ + 0xD1, 0xF6, 0x81, 0xA4, 0xD6, 0xD9, 0xD8, 0xF2, /* 0xF0-0xF3 */ + 0x81, 0xA5, 0xD8, 0xF5, 0xBC, 0xFE, 0xBC, 0xDB, /* 0xF4-0xF7 */ + 0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, 0xC8, 0xCE, /* 0xF8-0xFB */ + 0x81, 0xA9, 0xB7, 0xDD, 0x81, 0xAA, 0xB7, 0xC2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0x81, 0xAB, 0xC6, 0xF3, 0x81, 0xAC, 0x81, 0xAD, /* 0x00-0x03 */ + 0x81, 0xAE, 0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, /* 0x04-0x07 */ + 0x81, 0xB2, 0xD8, 0xF8, 0xD2, 0xC1, 0x81, 0xB3, /* 0x08-0x0B */ + 0x81, 0xB4, 0xCE, 0xE9, 0xBC, 0xBF, 0xB7, 0xFC, /* 0x0C-0x0F */ + 0xB7, 0xA5, 0xD0, 0xDD, 0x81, 0xB5, 0x81, 0xB6, /* 0x10-0x13 */ + 0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, 0xD6, 0xDA, /* 0x14-0x17 */ + 0xD3, 0xC5, 0xBB, 0xEF, 0xBB, 0xE1, 0xD8, 0xF1, /* 0x18-0x1B */ + 0x81, 0xBA, 0x81, 0xBB, 0xC9, 0xA1, 0xCE, 0xB0, /* 0x1C-0x1F */ + 0xB4, 0xAB, 0x81, 0xBC, 0xD8, 0xF3, 0x81, 0xBD, /* 0x20-0x23 */ + 0xC9, 0xCB, 0xD8, 0xF6, 0xC2, 0xD7, 0xD8, 0xF7, /* 0x24-0x27 */ + 0x81, 0xBE, 0x81, 0xBF, 0xCE, 0xB1, 0xD8, 0xF9, /* 0x28-0x2B */ + 0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0xB2, 0xAE, /* 0x2C-0x2F */ + 0xB9, 0xC0, 0x81, 0xC3, 0xD9, 0xA3, 0x81, 0xC4, /* 0x30-0x33 */ + 0xB0, 0xE9, 0x81, 0xC5, 0xC1, 0xE6, 0x81, 0xC6, /* 0x34-0x37 */ + 0xC9, 0xEC, 0x81, 0xC7, 0xCB, 0xC5, 0x81, 0xC8, /* 0x38-0x3B */ + 0xCB, 0xC6, 0xD9, 0xA4, 0x81, 0xC9, 0x81, 0xCA, /* 0x3C-0x3F */ + 0x81, 0xCB, 0x81, 0xCC, 0x81, 0xCD, 0xB5, 0xE8, /* 0x40-0x43 */ + 0x81, 0xCE, 0x81, 0xCF, 0xB5, 0xAB, 0x81, 0xD0, /* 0x44-0x47 */ + 0x81, 0xD1, 0x81, 0xD2, 0x81, 0xD3, 0x81, 0xD4, /* 0x48-0x4B */ + 0x81, 0xD5, 0xCE, 0xBB, 0xB5, 0xCD, 0xD7, 0xA1, /* 0x4C-0x4F */ + 0xD7, 0xF4, 0xD3, 0xD3, 0x81, 0xD6, 0xCC, 0xE5, /* 0x50-0x53 */ + 0x81, 0xD7, 0xBA, 0xCE, 0x81, 0xD8, 0xD9, 0xA2, /* 0x54-0x57 */ + 0xD9, 0xDC, 0xD3, 0xE0, 0xD8, 0xFD, 0xB7, 0xF0, /* 0x58-0x5B */ + 0xD7, 0xF7, 0xD8, 0xFE, 0xD8, 0xFA, 0xD9, 0xA1, /* 0x5C-0x5F */ + 0xC4, 0xE3, 0x81, 0xD9, 0x81, 0xDA, 0xD3, 0xB6, /* 0x60-0x63 */ + 0xD8, 0xF4, 0xD9, 0xDD, 0x81, 0xDB, 0xD8, 0xFB, /* 0x64-0x67 */ + 0x81, 0xDC, 0xC5, 0xE5, 0x81, 0xDD, 0x81, 0xDE, /* 0x68-0x6B */ + 0xC0, 0xD0, 0x81, 0xDF, 0x81, 0xE0, 0xD1, 0xF0, /* 0x6C-0x6F */ + 0xB0, 0xDB, 0x81, 0xE1, 0x81, 0xE2, 0xBC, 0xD1, /* 0x70-0x73 */ + 0xD9, 0xA6, 0x81, 0xE3, 0xD9, 0xA5, 0x81, 0xE4, /* 0x74-0x77 */ + 0x81, 0xE5, 0x81, 0xE6, 0x81, 0xE7, 0xD9, 0xAC, /* 0x78-0x7B */ + 0xD9, 0xAE, 0x81, 0xE8, 0xD9, 0xAB, 0xCA, 0xB9, /* 0x7C-0x7F */ + + 0x81, 0xE9, 0x81, 0xEA, 0x81, 0xEB, 0xD9, 0xA9, /* 0x80-0x83 */ + 0xD6, 0xB6, 0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, /* 0x84-0x87 */ + 0xB3, 0xDE, 0xD9, 0xA8, 0x81, 0xEF, 0xC0, 0xFD, /* 0x88-0x8B */ + 0x81, 0xF0, 0xCA, 0xCC, 0x81, 0xF1, 0xD9, 0xAA, /* 0x8C-0x8F */ + 0x81, 0xF2, 0xD9, 0xA7, 0x81, 0xF3, 0x81, 0xF4, /* 0x90-0x93 */ + 0xD9, 0xB0, 0x81, 0xF5, 0x81, 0xF6, 0xB6, 0xB1, /* 0x94-0x97 */ + 0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0xB9, 0xA9, /* 0x98-0x9B */ + 0x81, 0xFA, 0xD2, 0xC0, 0x81, 0xFB, 0x81, 0xFC, /* 0x9C-0x9F */ + 0xCF, 0xC0, 0x81, 0xFD, 0x81, 0xFE, 0xC2, 0xC2, /* 0xA0-0xA3 */ + 0x82, 0x40, 0xBD, 0xC4, 0xD5, 0xEC, 0xB2, 0xE0, /* 0xA4-0xA7 */ + 0xC7, 0xC8, 0xBF, 0xEB, 0xD9, 0xAD, 0x82, 0x41, /* 0xA8-0xAB */ + 0xD9, 0xAF, 0x82, 0x42, 0xCE, 0xEA, 0xBA, 0xEE, /* 0xAC-0xAF */ + 0x82, 0x43, 0x82, 0x44, 0x82, 0x45, 0x82, 0x46, /* 0xB0-0xB3 */ + 0x82, 0x47, 0xC7, 0xD6, 0x82, 0x48, 0x82, 0x49, /* 0xB4-0xB7 */ + 0x82, 0x4A, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0xB8-0xBB */ + 0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0xB1, 0xE3, /* 0xBC-0xBF */ + 0x82, 0x51, 0x82, 0x52, 0x82, 0x53, 0xB4, 0xD9, /* 0xC0-0xC3 */ + 0xB6, 0xED, 0xD9, 0xB4, 0x82, 0x54, 0x82, 0x55, /* 0xC4-0xC7 */ + 0x82, 0x56, 0x82, 0x57, 0xBF, 0xA1, 0x82, 0x58, /* 0xC8-0xCB */ + 0x82, 0x59, 0x82, 0x5A, 0xD9, 0xDE, 0xC7, 0xCE, /* 0xCC-0xCF */ + 0xC0, 0xFE, 0xD9, 0xB8, 0x82, 0x5B, 0x82, 0x5C, /* 0xD0-0xD3 */ + 0x82, 0x5D, 0x82, 0x5E, 0x82, 0x5F, 0xCB, 0xD7, /* 0xD4-0xD7 */ + 0xB7, 0xFD, 0x82, 0x60, 0xD9, 0xB5, 0x82, 0x61, /* 0xD8-0xDB */ + 0xD9, 0xB7, 0xB1, 0xA3, 0xD3, 0xE1, 0xD9, 0xB9, /* 0xDC-0xDF */ + 0x82, 0x62, 0xD0, 0xC5, 0x82, 0x63, 0xD9, 0xB6, /* 0xE0-0xE3 */ + 0x82, 0x64, 0x82, 0x65, 0xD9, 0xB1, 0x82, 0x66, /* 0xE4-0xE7 */ + 0xD9, 0xB2, 0xC1, 0xA9, 0xD9, 0xB3, 0x82, 0x67, /* 0xE8-0xEB */ + 0x82, 0x68, 0xBC, 0xF3, 0xD0, 0xDE, 0xB8, 0xA9, /* 0xEC-0xEF */ + 0x82, 0x69, 0xBE, 0xE3, 0x82, 0x6A, 0xD9, 0xBD, /* 0xF0-0xF3 */ + 0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0xF4-0xF7 */ + 0xD9, 0xBA, 0x82, 0x6F, 0xB0, 0xB3, 0x82, 0x70, /* 0xF8-0xFB */ + 0x82, 0x71, 0x82, 0x72, 0xD9, 0xC2, 0x82, 0x73, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0x82, 0x74, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x00-0x03 */ + 0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, 0x82, 0x7B, /* 0x04-0x07 */ + 0x82, 0x7C, 0x82, 0x7D, 0x82, 0x7E, 0x82, 0x80, /* 0x08-0x0B */ + 0xD9, 0xC4, 0xB1, 0xB6, 0x82, 0x81, 0xD9, 0xBF, /* 0x0C-0x0F */ + 0x82, 0x82, 0x82, 0x83, 0xB5, 0xB9, 0x82, 0x84, /* 0x10-0x13 */ + 0xBE, 0xF3, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x14-0x17 */ + 0xCC, 0xC8, 0xBA, 0xF2, 0xD2, 0xD0, 0x82, 0x88, /* 0x18-0x1B */ + 0xD9, 0xC3, 0x82, 0x89, 0x82, 0x8A, 0xBD, 0xE8, /* 0x1C-0x1F */ + 0x82, 0x8B, 0xB3, 0xAB, 0x82, 0x8C, 0x82, 0x8D, /* 0x20-0x23 */ + 0x82, 0x8E, 0xD9, 0xC5, 0xBE, 0xEB, 0x82, 0x8F, /* 0x24-0x27 */ + 0xD9, 0xC6, 0xD9, 0xBB, 0xC4, 0xDF, 0x82, 0x90, /* 0x28-0x2B */ + 0xD9, 0xBE, 0xD9, 0xC1, 0xD9, 0xC0, 0x82, 0x91, /* 0x2C-0x2F */ + 0x82, 0x92, 0x82, 0x93, 0x82, 0x94, 0x82, 0x95, /* 0x30-0x33 */ + 0x82, 0x96, 0x82, 0x97, 0x82, 0x98, 0x82, 0x99, /* 0x34-0x37 */ + 0x82, 0x9A, 0x82, 0x9B, 0xD5, 0xAE, 0x82, 0x9C, /* 0x38-0x3B */ + 0xD6, 0xB5, 0x82, 0x9D, 0xC7, 0xE3, 0x82, 0x9E, /* 0x3C-0x3F */ + 0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, 0xD9, 0xC8, /* 0x40-0x43 */ + 0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0xBC, 0xD9, /* 0x44-0x47 */ + 0xD9, 0xCA, 0x82, 0xA5, 0x82, 0xA6, 0x82, 0xA7, /* 0x48-0x4B */ + 0xD9, 0xBC, 0x82, 0xA8, 0xD9, 0xCB, 0xC6, 0xAB, /* 0x4C-0x4F */ + 0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x50-0x53 */ + 0x82, 0xAD, 0xD9, 0xC9, 0x82, 0xAE, 0x82, 0xAF, /* 0x54-0x57 */ + 0x82, 0xB0, 0x82, 0xB1, 0xD7, 0xF6, 0x82, 0xB2, /* 0x58-0x5B */ + 0xCD, 0xA3, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x5C-0x5F */ + 0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x60-0x63 */ + 0x82, 0xBA, 0xBD, 0xA1, 0x82, 0xBB, 0x82, 0xBC, /* 0x64-0x67 */ + 0x82, 0xBD, 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, /* 0x68-0x6B */ + 0xD9, 0xCC, 0x82, 0xC1, 0x82, 0xC2, 0x82, 0xC3, /* 0x6C-0x6F */ + 0x82, 0xC4, 0x82, 0xC5, 0x82, 0xC6, 0x82, 0xC7, /* 0x70-0x73 */ + 0x82, 0xC8, 0x82, 0xC9, 0xC5, 0xBC, 0xCD, 0xB5, /* 0x74-0x77 */ + 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0xD9, 0xCD, /* 0x78-0x7B */ + 0x82, 0xCD, 0x82, 0xCE, 0xD9, 0xC7, 0xB3, 0xA5, /* 0x7C-0x7F */ + + 0xBF, 0xFE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x80-0x83 */ + 0x82, 0xD2, 0xB8, 0xB5, 0x82, 0xD3, 0x82, 0xD4, /* 0x84-0x87 */ + 0xC0, 0xFC, 0x82, 0xD5, 0x82, 0xD6, 0x82, 0xD7, /* 0x88-0x8B */ + 0x82, 0xD8, 0xB0, 0xF8, 0x82, 0xD9, 0x82, 0xDA, /* 0x8C-0x8F */ + 0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, 0x82, 0xDE, /* 0x90-0x93 */ + 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, 0x82, 0xE2, /* 0x94-0x97 */ + 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, 0x82, 0xE6, /* 0x98-0x9B */ + 0x82, 0xE7, 0x82, 0xE8, 0x82, 0xE9, 0x82, 0xEA, /* 0x9C-0x9F */ + 0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, 0xB4, 0xF6, /* 0xA0-0xA3 */ + 0x82, 0xEE, 0xD9, 0xCE, 0x82, 0xEF, 0xD9, 0xCF, /* 0xA4-0xA7 */ + 0xB4, 0xA2, 0xD9, 0xD0, 0x82, 0xF0, 0x82, 0xF1, /* 0xA8-0xAB */ + 0xB4, 0xDF, 0x82, 0xF2, 0x82, 0xF3, 0x82, 0xF4, /* 0xAC-0xAF */ + 0x82, 0xF5, 0x82, 0xF6, 0xB0, 0xC1, 0x82, 0xF7, /* 0xB0-0xB3 */ + 0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, 0x82, 0xFB, /* 0xB4-0xB7 */ + 0x82, 0xFC, 0x82, 0xFD, 0xD9, 0xD1, 0xC9, 0xB5, /* 0xB8-0xBB */ + 0x82, 0xFE, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xBC-0xBF */ + 0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xC0-0xC3 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xC4-0xC7 */ + 0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xC8-0xCB */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0xCF, 0xF1, /* 0xCC-0xCF */ + 0x83, 0x52, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0xD0-0xD3 */ + 0x83, 0x56, 0x83, 0x57, 0xD9, 0xD2, 0x83, 0x58, /* 0xD4-0xD7 */ + 0x83, 0x59, 0x83, 0x5A, 0xC1, 0xC5, 0x83, 0x5B, /* 0xD8-0xDB */ + 0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, 0x83, 0x5F, /* 0xDC-0xDF */ + 0x83, 0x60, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0xE0-0xE3 */ + 0x83, 0x64, 0x83, 0x65, 0xD9, 0xD6, 0xC9, 0xAE, /* 0xE4-0xE7 */ + 0x83, 0x66, 0x83, 0x67, 0x83, 0x68, 0x83, 0x69, /* 0xE8-0xEB */ + 0xD9, 0xD5, 0xD9, 0xD4, 0xD9, 0xD7, 0x83, 0x6A, /* 0xEC-0xEF */ + 0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0xCB, 0xDB, /* 0xF0-0xF3 */ + 0x83, 0x6E, 0xBD, 0xA9, 0x83, 0x6F, 0x83, 0x70, /* 0xF4-0xF7 */ + 0x83, 0x71, 0x83, 0x72, 0x83, 0x73, 0xC6, 0xA7, /* 0xF8-0xFB */ + 0x83, 0x74, 0x83, 0x75, 0x83, 0x76, 0x83, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, 0x83, 0x7B, /* 0x00-0x03 */ + 0x83, 0x7C, 0x83, 0x7D, 0xD9, 0xD3, 0xD9, 0xD8, /* 0x04-0x07 */ + 0x83, 0x7E, 0x83, 0x80, 0x83, 0x81, 0xD9, 0xD9, /* 0x08-0x0B */ + 0x83, 0x82, 0x83, 0x83, 0x83, 0x84, 0x83, 0x85, /* 0x0C-0x0F */ + 0x83, 0x86, 0x83, 0x87, 0xC8, 0xE5, 0x83, 0x88, /* 0x10-0x13 */ + 0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, 0x83, 0x8C, /* 0x14-0x17 */ + 0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, 0x83, 0x90, /* 0x18-0x1B */ + 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, 0x83, 0x94, /* 0x1C-0x1F */ + 0x83, 0x95, 0xC0, 0xDC, 0x83, 0x96, 0x83, 0x97, /* 0x20-0x23 */ + 0x83, 0x98, 0x83, 0x99, 0x83, 0x9A, 0x83, 0x9B, /* 0x24-0x27 */ + 0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, 0x83, 0x9F, /* 0x28-0x2B */ + 0x83, 0xA0, 0x83, 0xA1, 0x83, 0xA2, 0x83, 0xA3, /* 0x2C-0x2F */ + 0x83, 0xA4, 0x83, 0xA5, 0x83, 0xA6, 0x83, 0xA7, /* 0x30-0x33 */ + 0x83, 0xA8, 0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, /* 0x34-0x37 */ + 0x83, 0xAC, 0x83, 0xAD, 0x83, 0xAE, 0x83, 0xAF, /* 0x38-0x3B */ + 0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, 0xB6, 0xF9, /* 0x3C-0x3F */ + 0xD8, 0xA3, 0xD4, 0xCA, 0x83, 0xB3, 0xD4, 0xAA, /* 0x40-0x43 */ + 0xD0, 0xD6, 0xB3, 0xE4, 0xD5, 0xD7, 0x83, 0xB4, /* 0x44-0x47 */ + 0xCF, 0xC8, 0xB9, 0xE2, 0x83, 0xB5, 0xBF, 0xCB, /* 0x48-0x4B */ + 0x83, 0xB6, 0xC3, 0xE2, 0x83, 0xB7, 0x83, 0xB8, /* 0x4C-0x4F */ + 0x83, 0xB9, 0xB6, 0xD2, 0x83, 0xBA, 0x83, 0xBB, /* 0x50-0x53 */ + 0xCD, 0xC3, 0xD9, 0xEE, 0xD9, 0xF0, 0x83, 0xBC, /* 0x54-0x57 */ + 0x83, 0xBD, 0x83, 0xBE, 0xB5, 0xB3, 0x83, 0xBF, /* 0x58-0x5B */ + 0xB6, 0xB5, 0x83, 0xC0, 0x83, 0xC1, 0x83, 0xC2, /* 0x5C-0x5F */ + 0x83, 0xC3, 0x83, 0xC4, 0xBE, 0xA4, 0x83, 0xC5, /* 0x60-0x63 */ + 0x83, 0xC6, 0xC8, 0xEB, 0x83, 0xC7, 0x83, 0xC8, /* 0x64-0x67 */ + 0xC8, 0xAB, 0x83, 0xC9, 0x83, 0xCA, 0xB0, 0xCB, /* 0x68-0x6B */ + 0xB9, 0xAB, 0xC1, 0xF9, 0xD9, 0xE2, 0x83, 0xCB, /* 0x6C-0x6F */ + 0xC0, 0xBC, 0xB9, 0xB2, 0x83, 0xCC, 0xB9, 0xD8, /* 0x70-0x73 */ + 0xD0, 0xCB, 0xB1, 0xF8, 0xC6, 0xE4, 0xBE, 0xDF, /* 0x74-0x77 */ + 0xB5, 0xE4, 0xD7, 0xC8, 0x83, 0xCD, 0xD1, 0xF8, /* 0x78-0x7B */ + 0xBC, 0xE6, 0xCA, 0xDE, 0x83, 0xCE, 0x83, 0xCF, /* 0x7C-0x7F */ + + 0xBC, 0xBD, 0xD9, 0xE6, 0xD8, 0xE7, 0x83, 0xD0, /* 0x80-0x83 */ + 0x83, 0xD1, 0xC4, 0xDA, 0x83, 0xD2, 0x83, 0xD3, /* 0x84-0x87 */ + 0xB8, 0xD4, 0xC8, 0xBD, 0x83, 0xD4, 0x83, 0xD5, /* 0x88-0x8B */ + 0xB2, 0xE1, 0xD4, 0xD9, 0x83, 0xD6, 0x83, 0xD7, /* 0x8C-0x8F */ + 0x83, 0xD8, 0x83, 0xD9, 0xC3, 0xB0, 0x83, 0xDA, /* 0x90-0x93 */ + 0x83, 0xDB, 0xC3, 0xE1, 0xDA, 0xA2, 0xC8, 0xDF, /* 0x94-0x97 */ + 0x83, 0xDC, 0xD0, 0xB4, 0x83, 0xDD, 0xBE, 0xFC, /* 0x98-0x9B */ + 0xC5, 0xA9, 0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, /* 0x9C-0x9F */ + 0xB9, 0xDA, 0x83, 0xE1, 0xDA, 0xA3, 0x83, 0xE2, /* 0xA0-0xA3 */ + 0xD4, 0xA9, 0xDA, 0xA4, 0x83, 0xE3, 0x83, 0xE4, /* 0xA4-0xA7 */ + 0x83, 0xE5, 0x83, 0xE6, 0x83, 0xE7, 0xD9, 0xFB, /* 0xA8-0xAB */ + 0xB6, 0xAC, 0x83, 0xE8, 0x83, 0xE9, 0xB7, 0xEB, /* 0xAC-0xAF */ + 0xB1, 0xF9, 0xD9, 0xFC, 0xB3, 0xE5, 0xBE, 0xF6, /* 0xB0-0xB3 */ + 0x83, 0xEA, 0xBF, 0xF6, 0xD2, 0xB1, 0xC0, 0xE4, /* 0xB4-0xB7 */ + 0x83, 0xEB, 0x83, 0xEC, 0x83, 0xED, 0xB6, 0xB3, /* 0xB8-0xBB */ + 0xD9, 0xFE, 0xD9, 0xFD, 0x83, 0xEE, 0x83, 0xEF, /* 0xBC-0xBF */ + 0xBE, 0xBB, 0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, /* 0xC0-0xC3 */ + 0xC6, 0xE0, 0x83, 0xF3, 0xD7, 0xBC, 0xDA, 0xA1, /* 0xC4-0xC7 */ + 0x83, 0xF4, 0xC1, 0xB9, 0x83, 0xF5, 0xB5, 0xF2, /* 0xC8-0xCB */ + 0xC1, 0xE8, 0x83, 0xF6, 0x83, 0xF7, 0xBC, 0xF5, /* 0xCC-0xCF */ + 0x83, 0xF8, 0xB4, 0xD5, 0x83, 0xF9, 0x83, 0xFA, /* 0xD0-0xD3 */ + 0x83, 0xFB, 0x83, 0xFC, 0x83, 0xFD, 0x83, 0xFE, /* 0xD4-0xD7 */ + 0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0xC1, 0xDD, /* 0xD8-0xDB */ + 0x84, 0x43, 0xC4, 0xFD, 0x84, 0x44, 0x84, 0x45, /* 0xDC-0xDF */ + 0xBC, 0xB8, 0xB7, 0xB2, 0x84, 0x46, 0x84, 0x47, /* 0xE0-0xE3 */ + 0xB7, 0xEF, 0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, /* 0xE4-0xE7 */ + 0x84, 0x4B, 0x84, 0x4C, 0x84, 0x4D, 0xD9, 0xEC, /* 0xE8-0xEB */ + 0x84, 0x4E, 0xC6, 0xBE, 0x84, 0x4F, 0xBF, 0xAD, /* 0xEC-0xEF */ + 0xBB, 0xCB, 0x84, 0x50, 0x84, 0x51, 0xB5, 0xCA, /* 0xF0-0xF3 */ + 0x84, 0x52, 0xDB, 0xC9, 0xD0, 0xD7, 0x84, 0x53, /* 0xF4-0xF7 */ + 0xCD, 0xB9, 0xB0, 0xBC, 0xB3, 0xF6, 0xBB, 0xF7, /* 0xF8-0xFB */ + 0xDB, 0xCA, 0xBA, 0xAF, 0x84, 0x54, 0xD4, 0xE4, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xB5, 0xB6, 0xB5, 0xF3, 0xD8, 0xD6, 0xC8, 0xD0, /* 0x00-0x03 */ + 0x84, 0x55, 0x84, 0x56, 0xB7, 0xD6, 0xC7, 0xD0, /* 0x04-0x07 */ + 0xD8, 0xD7, 0x84, 0x57, 0xBF, 0xAF, 0x84, 0x58, /* 0x08-0x0B */ + 0x84, 0x59, 0xDB, 0xBB, 0xD8, 0xD8, 0x84, 0x5A, /* 0x0C-0x0F */ + 0x84, 0x5B, 0xD0, 0xCC, 0xBB, 0xAE, 0x84, 0x5C, /* 0x10-0x13 */ + 0x84, 0x5D, 0x84, 0x5E, 0xEB, 0xBE, 0xC1, 0xD0, /* 0x14-0x17 */ + 0xC1, 0xF5, 0xD4, 0xF2, 0xB8, 0xD5, 0xB4, 0xB4, /* 0x18-0x1B */ + 0x84, 0x5F, 0xB3, 0xF5, 0x84, 0x60, 0x84, 0x61, /* 0x1C-0x1F */ + 0xC9, 0xBE, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x20-0x23 */ + 0xC5, 0xD0, 0x84, 0x65, 0x84, 0x66, 0x84, 0x67, /* 0x24-0x27 */ + 0xC5, 0xD9, 0xC0, 0xFB, 0x84, 0x68, 0xB1, 0xF0, /* 0x28-0x2B */ + 0x84, 0x69, 0xD8, 0xD9, 0xB9, 0xCE, 0x84, 0x6A, /* 0x2C-0x2F */ + 0xB5, 0xBD, 0x84, 0x6B, 0x84, 0x6C, 0xD8, 0xDA, /* 0x30-0x33 */ + 0x84, 0x6D, 0x84, 0x6E, 0xD6, 0xC6, 0xCB, 0xA2, /* 0x34-0x37 */ + 0xC8, 0xAF, 0xC9, 0xB2, 0xB4, 0xCC, 0xBF, 0xCC, /* 0x38-0x3B */ + 0x84, 0x6F, 0xB9, 0xF4, 0x84, 0x70, 0xD8, 0xDB, /* 0x3C-0x3F */ + 0xD8, 0xDC, 0xB6, 0xE7, 0xBC, 0xC1, 0xCC, 0xEA, /* 0x40-0x43 */ + 0x84, 0x71, 0x84, 0x72, 0x84, 0x73, 0x84, 0x74, /* 0x44-0x47 */ + 0x84, 0x75, 0x84, 0x76, 0xCF, 0xF7, 0x84, 0x77, /* 0x48-0x4B */ + 0xD8, 0xDD, 0xC7, 0xB0, 0x84, 0x78, 0x84, 0x79, /* 0x4C-0x4F */ + 0xB9, 0xD0, 0xBD, 0xA3, 0x84, 0x7A, 0x84, 0x7B, /* 0x50-0x53 */ + 0xCC, 0xDE, 0x84, 0x7C, 0xC6, 0xCA, 0x84, 0x7D, /* 0x54-0x57 */ + 0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, 0x84, 0x82, /* 0x58-0x5B */ + 0xD8, 0xE0, 0x84, 0x83, 0xD8, 0xDE, 0x84, 0x84, /* 0x5C-0x5F */ + 0x84, 0x85, 0xD8, 0xDF, 0x84, 0x86, 0x84, 0x87, /* 0x60-0x63 */ + 0x84, 0x88, 0xB0, 0xFE, 0x84, 0x89, 0xBE, 0xE7, /* 0x64-0x67 */ + 0x84, 0x8A, 0xCA, 0xA3, 0xBC, 0xF4, 0x84, 0x8B, /* 0x68-0x6B */ + 0x84, 0x8C, 0x84, 0x8D, 0x84, 0x8E, 0xB8, 0xB1, /* 0x6C-0x6F */ + 0x84, 0x8F, 0x84, 0x90, 0xB8, 0xEE, 0x84, 0x91, /* 0x70-0x73 */ + 0x84, 0x92, 0x84, 0x93, 0x84, 0x94, 0x84, 0x95, /* 0x74-0x77 */ + 0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x78-0x7B */ + 0x84, 0x9A, 0xD8, 0xE2, 0x84, 0x9B, 0xBD, 0xCB, /* 0x7C-0x7F */ + + 0x84, 0x9C, 0xD8, 0xE4, 0xD8, 0xE3, 0x84, 0x9D, /* 0x80-0x83 */ + 0x84, 0x9E, 0x84, 0x9F, 0x84, 0xA0, 0x84, 0xA1, /* 0x84-0x87 */ + 0xC5, 0xFC, 0x84, 0xA2, 0x84, 0xA3, 0x84, 0xA4, /* 0x88-0x8B */ + 0x84, 0xA5, 0x84, 0xA6, 0x84, 0xA7, 0x84, 0xA8, /* 0x8C-0x8F */ + 0xD8, 0xE5, 0x84, 0xA9, 0x84, 0xAA, 0xD8, 0xE6, /* 0x90-0x93 */ + 0x84, 0xAB, 0x84, 0xAC, 0x84, 0xAD, 0x84, 0xAE, /* 0x94-0x97 */ + 0x84, 0xAF, 0x84, 0xB0, 0x84, 0xB1, 0xC1, 0xA6, /* 0x98-0x9B */ + 0x84, 0xB2, 0xC8, 0xB0, 0xB0, 0xEC, 0xB9, 0xA6, /* 0x9C-0x9F */ + 0xBC, 0xD3, 0xCE, 0xF1, 0xDB, 0xBD, 0xC1, 0xD3, /* 0xA0-0xA3 */ + 0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0xA4-0xA7 */ + 0xB6, 0xAF, 0xD6, 0xFA, 0xC5, 0xAC, 0xBD, 0xD9, /* 0xA8-0xAB */ + 0xDB, 0xBE, 0xDB, 0xBF, 0x84, 0xB7, 0x84, 0xB8, /* 0xAC-0xAF */ + 0x84, 0xB9, 0xC0, 0xF8, 0xBE, 0xA2, 0xC0, 0xCD, /* 0xB0-0xB3 */ + 0x84, 0xBA, 0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, /* 0xB4-0xB7 */ + 0x84, 0xBE, 0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, /* 0xB8-0xBB */ + 0x84, 0xC2, 0x84, 0xC3, 0xDB, 0xC0, 0xCA, 0xC6, /* 0xBC-0xBF */ + 0x84, 0xC4, 0x84, 0xC5, 0x84, 0xC6, 0xB2, 0xAA, /* 0xC0-0xC3 */ + 0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, 0xD3, 0xC2, /* 0xC4-0xC7 */ + 0x84, 0xCA, 0xC3, 0xE3, 0x84, 0xCB, 0xD1, 0xAB, /* 0xC8-0xCB */ + 0x84, 0xCC, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0xCC-0xCF */ + 0xDB, 0xC2, 0x84, 0xD0, 0xC0, 0xD5, 0x84, 0xD1, /* 0xD0-0xD3 */ + 0x84, 0xD2, 0x84, 0xD3, 0xDB, 0xC3, 0x84, 0xD4, /* 0xD4-0xD7 */ + 0xBF, 0xB1, 0x84, 0xD5, 0x84, 0xD6, 0x84, 0xD7, /* 0xD8-0xDB */ + 0x84, 0xD8, 0x84, 0xD9, 0x84, 0xDA, 0xC4, 0xBC, /* 0xDC-0xDF */ + 0x84, 0xDB, 0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, /* 0xE0-0xE3 */ + 0xC7, 0xDA, 0x84, 0xDF, 0x84, 0xE0, 0x84, 0xE1, /* 0xE4-0xE7 */ + 0x84, 0xE2, 0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, /* 0xE8-0xEB */ + 0x84, 0xE6, 0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, /* 0xEC-0xEF */ + 0xDB, 0xC4, 0x84, 0xEA, 0x84, 0xEB, 0x84, 0xEC, /* 0xF0-0xF3 */ + 0x84, 0xED, 0x84, 0xEE, 0x84, 0xEF, 0x84, 0xF0, /* 0xF4-0xF7 */ + 0x84, 0xF1, 0xD9, 0xE8, 0xC9, 0xD7, 0x84, 0xF2, /* 0xF8-0xFB */ + 0x84, 0xF3, 0x84, 0xF4, 0xB9, 0xB4, 0xCE, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0xD4, 0xC8, 0x84, 0xF5, 0x84, 0xF6, 0x84, 0xF7, /* 0x00-0x03 */ + 0x84, 0xF8, 0xB0, 0xFC, 0xB4, 0xD2, 0x84, 0xF9, /* 0x04-0x07 */ + 0xD0, 0xD9, 0x84, 0xFA, 0x84, 0xFB, 0x84, 0xFC, /* 0x08-0x0B */ + 0x84, 0xFD, 0xD9, 0xE9, 0x84, 0xFE, 0xDE, 0xCB, /* 0x0C-0x0F */ + 0xD9, 0xEB, 0x85, 0x40, 0x85, 0x41, 0x85, 0x42, /* 0x10-0x13 */ + 0x85, 0x43, 0xD8, 0xB0, 0xBB, 0xAF, 0xB1, 0xB1, /* 0x14-0x17 */ + 0x85, 0x44, 0xB3, 0xD7, 0xD8, 0xCE, 0x85, 0x45, /* 0x18-0x1B */ + 0x85, 0x46, 0xD4, 0xD1, 0x85, 0x47, 0x85, 0x48, /* 0x1C-0x1F */ + 0xBD, 0xB3, 0xBF, 0xEF, 0x85, 0x49, 0xCF, 0xBB, /* 0x20-0x23 */ + 0x85, 0x4A, 0x85, 0x4B, 0xD8, 0xD0, 0x85, 0x4C, /* 0x24-0x27 */ + 0x85, 0x4D, 0x85, 0x4E, 0xB7, 0xCB, 0x85, 0x4F, /* 0x28-0x2B */ + 0x85, 0x50, 0x85, 0x51, 0xD8, 0xD1, 0x85, 0x52, /* 0x2C-0x2F */ + 0x85, 0x53, 0x85, 0x54, 0x85, 0x55, 0x85, 0x56, /* 0x30-0x33 */ + 0x85, 0x57, 0x85, 0x58, 0x85, 0x59, 0x85, 0x5A, /* 0x34-0x37 */ + 0x85, 0x5B, 0xC6, 0xA5, 0xC7, 0xF8, 0xD2, 0xBD, /* 0x38-0x3B */ + 0x85, 0x5C, 0x85, 0x5D, 0xD8, 0xD2, 0xC4, 0xE4, /* 0x3C-0x3F */ + 0x85, 0x5E, 0xCA, 0xAE, 0x85, 0x5F, 0xC7, 0xA7, /* 0x40-0x43 */ + 0x85, 0x60, 0xD8, 0xA6, 0x85, 0x61, 0xC9, 0xFD, /* 0x44-0x47 */ + 0xCE, 0xE7, 0xBB, 0xDC, 0xB0, 0xEB, 0x85, 0x62, /* 0x48-0x4B */ + 0x85, 0x63, 0x85, 0x64, 0xBB, 0xAA, 0xD0, 0xAD, /* 0x4C-0x4F */ + 0x85, 0x65, 0xB1, 0xB0, 0xD7, 0xE4, 0xD7, 0xBF, /* 0x50-0x53 */ + 0x85, 0x66, 0xB5, 0xA5, 0xC2, 0xF4, 0xC4, 0xCF, /* 0x54-0x57 */ + 0x85, 0x67, 0x85, 0x68, 0xB2, 0xA9, 0x85, 0x69, /* 0x58-0x5B */ + 0xB2, 0xB7, 0x85, 0x6A, 0xB1, 0xE5, 0xDF, 0xB2, /* 0x5C-0x5F */ + 0xD5, 0xBC, 0xBF, 0xA8, 0xC2, 0xAC, 0xD8, 0xD5, /* 0x60-0x63 */ + 0xC2, 0xB1, 0x85, 0x6B, 0xD8, 0xD4, 0xCE, 0xD4, /* 0x64-0x67 */ + 0x85, 0x6C, 0xDA, 0xE0, 0x85, 0x6D, 0xCE, 0xC0, /* 0x68-0x6B */ + 0x85, 0x6E, 0x85, 0x6F, 0xD8, 0xB4, 0xC3, 0xAE, /* 0x6C-0x6F */ + 0xD3, 0xA1, 0xCE, 0xA3, 0x85, 0x70, 0xBC, 0xB4, /* 0x70-0x73 */ + 0xC8, 0xB4, 0xC2, 0xD1, 0x85, 0x71, 0xBE, 0xED, /* 0x74-0x77 */ + 0xD0, 0xB6, 0x85, 0x72, 0xDA, 0xE1, 0x85, 0x73, /* 0x78-0x7B */ + 0x85, 0x74, 0x85, 0x75, 0x85, 0x76, 0xC7, 0xE4, /* 0x7C-0x7F */ + + 0x85, 0x77, 0x85, 0x78, 0xB3, 0xA7, 0x85, 0x79, /* 0x80-0x83 */ + 0xB6, 0xF2, 0xCC, 0xFC, 0xC0, 0xFA, 0x85, 0x7A, /* 0x84-0x87 */ + 0x85, 0x7B, 0xC0, 0xF7, 0x85, 0x7C, 0xD1, 0xB9, /* 0x88-0x8B */ + 0xD1, 0xE1, 0xD8, 0xC7, 0x85, 0x7D, 0x85, 0x7E, /* 0x8C-0x8F */ + 0x85, 0x80, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x90-0x93 */ + 0x85, 0x84, 0xB2, 0xDE, 0x85, 0x85, 0x85, 0x86, /* 0x94-0x97 */ + 0xC0, 0xE5, 0x85, 0x87, 0xBA, 0xF1, 0x85, 0x88, /* 0x98-0x9B */ + 0x85, 0x89, 0xD8, 0xC8, 0x85, 0x8A, 0xD4, 0xAD, /* 0x9C-0x9F */ + 0x85, 0x8B, 0x85, 0x8C, 0xCF, 0xE1, 0xD8, 0xC9, /* 0xA0-0xA3 */ + 0x85, 0x8D, 0xD8, 0xCA, 0xCF, 0xC3, 0x85, 0x8E, /* 0xA4-0xA7 */ + 0xB3, 0xF8, 0xBE, 0xC7, 0x85, 0x8F, 0x85, 0x90, /* 0xA8-0xAB */ + 0x85, 0x91, 0x85, 0x92, 0xD8, 0xCB, 0x85, 0x93, /* 0xAC-0xAF */ + 0x85, 0x94, 0x85, 0x95, 0x85, 0x96, 0x85, 0x97, /* 0xB0-0xB3 */ + 0x85, 0x98, 0x85, 0x99, 0xDB, 0xCC, 0x85, 0x9A, /* 0xB4-0xB7 */ + 0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0xC8, 0xA5, /* 0xB8-0xBB */ + 0x85, 0x9E, 0x85, 0x9F, 0x85, 0xA0, 0xCF, 0xD8, /* 0xBC-0xBF */ + 0x85, 0xA1, 0xC8, 0xFE, 0xB2, 0xCE, 0x85, 0xA2, /* 0xC0-0xC3 */ + 0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, 0x85, 0xA6, /* 0xC4-0xC7 */ + 0xD3, 0xD6, 0xB2, 0xE6, 0xBC, 0xB0, 0xD3, 0xD1, /* 0xC8-0xCB */ + 0xCB, 0xAB, 0xB7, 0xB4, 0x85, 0xA7, 0x85, 0xA8, /* 0xCC-0xCF */ + 0x85, 0xA9, 0xB7, 0xA2, 0x85, 0xAA, 0x85, 0xAB, /* 0xD0-0xD3 */ + 0xCA, 0xE5, 0x85, 0xAC, 0xC8, 0xA1, 0xCA, 0xDC, /* 0xD4-0xD7 */ + 0xB1, 0xE4, 0xD0, 0xF0, 0x85, 0xAD, 0xC5, 0xD1, /* 0xD8-0xDB */ + 0x85, 0xAE, 0x85, 0xAF, 0x85, 0xB0, 0xDB, 0xC5, /* 0xDC-0xDF */ + 0xB5, 0xFE, 0x85, 0xB1, 0x85, 0xB2, 0xBF, 0xDA, /* 0xE0-0xE3 */ + 0xB9, 0xC5, 0xBE, 0xE4, 0xC1, 0xED, 0x85, 0xB3, /* 0xE4-0xE7 */ + 0xDF, 0xB6, 0xDF, 0xB5, 0xD6, 0xBB, 0xBD, 0xD0, /* 0xE8-0xEB */ + 0xD5, 0xD9, 0xB0, 0xC8, 0xB6, 0xA3, 0xBF, 0xC9, /* 0xEC-0xEF */ + 0xCC, 0xA8, 0xDF, 0xB3, 0xCA, 0xB7, 0xD3, 0xD2, /* 0xF0-0xF3 */ + 0x85, 0xB4, 0xD8, 0xCF, 0xD2, 0xB6, 0xBA, 0xC5, /* 0xF4-0xF7 */ + 0xCB, 0xBE, 0xCC, 0xBE, 0x85, 0xB5, 0xDF, 0xB7, /* 0xF8-0xFB */ + 0xB5, 0xF0, 0xDF, 0xB4, 0x85, 0xB6, 0x85, 0xB7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_54[512] = { + 0x85, 0xB8, 0xD3, 0xF5, 0x85, 0xB9, 0xB3, 0xD4, /* 0x00-0x03 */ + 0xB8, 0xF7, 0x85, 0xBA, 0xDF, 0xBA, 0x85, 0xBB, /* 0x04-0x07 */ + 0xBA, 0xCF, 0xBC, 0xAA, 0xB5, 0xF5, 0x85, 0xBC, /* 0x08-0x0B */ + 0xCD, 0xAC, 0xC3, 0xFB, 0xBA, 0xF3, 0xC0, 0xF4, /* 0x0C-0x0F */ + 0xCD, 0xC2, 0xCF, 0xF2, 0xDF, 0xB8, 0xCF, 0xC5, /* 0x10-0x13 */ + 0x85, 0xBD, 0xC2, 0xC0, 0xDF, 0xB9, 0xC2, 0xF0, /* 0x14-0x17 */ + 0x85, 0xBE, 0x85, 0xBF, 0x85, 0xC0, 0xBE, 0xFD, /* 0x18-0x1B */ + 0x85, 0xC1, 0xC1, 0xDF, 0xCD, 0xCC, 0xD2, 0xF7, /* 0x1C-0x1F */ + 0xB7, 0xCD, 0xDF, 0xC1, 0x85, 0xC2, 0xDF, 0xC4, /* 0x20-0x23 */ + 0x85, 0xC3, 0x85, 0xC4, 0xB7, 0xF1, 0xB0, 0xC9, /* 0x24-0x27 */ + 0xB6, 0xD6, 0xB7, 0xD4, 0x85, 0xC5, 0xBA, 0xAC, /* 0x28-0x2B */ + 0xCC, 0xFD, 0xBF, 0xD4, 0xCB, 0xB1, 0xC6, 0xF4, /* 0x2C-0x2F */ + 0x85, 0xC6, 0xD6, 0xA8, 0xDF, 0xC5, 0x85, 0xC7, /* 0x30-0x33 */ + 0xCE, 0xE2, 0xB3, 0xB3, 0x85, 0xC8, 0x85, 0xC9, /* 0x34-0x37 */ + 0xCE, 0xFC, 0xB4, 0xB5, 0x85, 0xCA, 0xCE, 0xC7, /* 0x38-0x3B */ + 0xBA, 0xF0, 0x85, 0xCB, 0xCE, 0xE1, 0x85, 0xCC, /* 0x3C-0x3F */ + 0xD1, 0xBD, 0x85, 0xCD, 0x85, 0xCE, 0xDF, 0xC0, /* 0x40-0x43 */ + 0x85, 0xCF, 0x85, 0xD0, 0xB4, 0xF4, 0x85, 0xD1, /* 0x44-0x47 */ + 0xB3, 0xCA, 0x85, 0xD2, 0xB8, 0xE6, 0xDF, 0xBB, /* 0x48-0x4B */ + 0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, 0x85, 0xD6, /* 0x4C-0x4F */ + 0xC4, 0xC5, 0x85, 0xD7, 0xDF, 0xBC, 0xDF, 0xBD, /* 0x50-0x53 */ + 0xDF, 0xBE, 0xC5, 0xBB, 0xDF, 0xBF, 0xDF, 0xC2, /* 0x54-0x57 */ + 0xD4, 0xB1, 0xDF, 0xC3, 0x85, 0xD8, 0xC7, 0xBA, /* 0x58-0x5B */ + 0xCE, 0xD8, 0x85, 0xD9, 0x85, 0xDA, 0x85, 0xDB, /* 0x5C-0x5F */ + 0x85, 0xDC, 0x85, 0xDD, 0xC4, 0xD8, 0x85, 0xDE, /* 0x60-0x63 */ + 0xDF, 0xCA, 0x85, 0xDF, 0xDF, 0xCF, 0x85, 0xE0, /* 0x64-0x67 */ + 0xD6, 0xDC, 0x85, 0xE1, 0x85, 0xE2, 0x85, 0xE3, /* 0x68-0x6B */ + 0x85, 0xE4, 0x85, 0xE5, 0x85, 0xE6, 0x85, 0xE7, /* 0x6C-0x6F */ + 0x85, 0xE8, 0xDF, 0xC9, 0xDF, 0xDA, 0xCE, 0xB6, /* 0x70-0x73 */ + 0x85, 0xE9, 0xBA, 0xC7, 0xDF, 0xCE, 0xDF, 0xC8, /* 0x74-0x77 */ + 0xC5, 0xDE, 0x85, 0xEA, 0x85, 0xEB, 0xC9, 0xEB, /* 0x78-0x7B */ + 0xBA, 0xF4, 0xC3, 0xFC, 0x85, 0xEC, 0x85, 0xED, /* 0x7C-0x7F */ + + 0xBE, 0xD7, 0x85, 0xEE, 0xDF, 0xC6, 0x85, 0xEF, /* 0x80-0x83 */ + 0xDF, 0xCD, 0x85, 0xF0, 0xC5, 0xD8, 0x85, 0xF1, /* 0x84-0x87 */ + 0x85, 0xF2, 0x85, 0xF3, 0x85, 0xF4, 0xD5, 0xA6, /* 0x88-0x8B */ + 0xBA, 0xCD, 0x85, 0xF5, 0xBE, 0xCC, 0xD3, 0xBD, /* 0x8C-0x8F */ + 0xB8, 0xC0, 0x85, 0xF6, 0xD6, 0xE4, 0x85, 0xF7, /* 0x90-0x93 */ + 0xDF, 0xC7, 0xB9, 0xBE, 0xBF, 0xA7, 0x85, 0xF8, /* 0x94-0x97 */ + 0x85, 0xF9, 0xC1, 0xFC, 0xDF, 0xCB, 0xDF, 0xCC, /* 0x98-0x9B */ + 0x85, 0xFA, 0xDF, 0xD0, 0x85, 0xFB, 0x85, 0xFC, /* 0x9C-0x9F */ + 0x85, 0xFD, 0x85, 0xFE, 0x86, 0x40, 0xDF, 0xDB, /* 0xA0-0xA3 */ + 0xDF, 0xE5, 0x86, 0x41, 0xDF, 0xD7, 0xDF, 0xD6, /* 0xA4-0xA7 */ + 0xD7, 0xC9, 0xDF, 0xE3, 0xDF, 0xE4, 0xE5, 0xEB, /* 0xA8-0xAB */ + 0xD2, 0xA7, 0xDF, 0xD2, 0x86, 0x42, 0xBF, 0xA9, /* 0xAC-0xAF */ + 0x86, 0x43, 0xD4, 0xDB, 0x86, 0x44, 0xBF, 0xC8, /* 0xB0-0xB3 */ + 0xDF, 0xD4, 0x86, 0x45, 0x86, 0x46, 0x86, 0x47, /* 0xB4-0xB7 */ + 0xCF, 0xCC, 0x86, 0x48, 0x86, 0x49, 0xDF, 0xDD, /* 0xB8-0xBB */ + 0x86, 0x4A, 0xD1, 0xCA, 0x86, 0x4B, 0xDF, 0xDE, /* 0xBC-0xBF */ + 0xB0, 0xA7, 0xC6, 0xB7, 0xDF, 0xD3, 0x86, 0x4C, /* 0xC0-0xC3 */ + 0xBA, 0xE5, 0x86, 0x4D, 0xB6, 0xDF, 0xCD, 0xDB, /* 0xC4-0xC7 */ + 0xB9, 0xFE, 0xD4, 0xD5, 0x86, 0x4E, 0x86, 0x4F, /* 0xC8-0xCB */ + 0xDF, 0xDF, 0xCF, 0xEC, 0xB0, 0xA5, 0xDF, 0xE7, /* 0xCC-0xCF */ + 0xDF, 0xD1, 0xD1, 0xC6, 0xDF, 0xD5, 0xDF, 0xD8, /* 0xD0-0xD3 */ + 0xDF, 0xD9, 0xDF, 0xDC, 0x86, 0x50, 0xBB, 0xA9, /* 0xD4-0xD7 */ + 0x86, 0x51, 0xDF, 0xE0, 0xDF, 0xE1, 0x86, 0x52, /* 0xD8-0xDB */ + 0xDF, 0xE2, 0xDF, 0xE6, 0xDF, 0xE8, 0xD3, 0xB4, /* 0xDC-0xDF */ + 0x86, 0x53, 0x86, 0x54, 0x86, 0x55, 0x86, 0x56, /* 0xE0-0xE3 */ + 0x86, 0x57, 0xB8, 0xE7, 0xC5, 0xB6, 0xDF, 0xEA, /* 0xE4-0xE7 */ + 0xC9, 0xDA, 0xC1, 0xA8, 0xC4, 0xC4, 0x86, 0x58, /* 0xE8-0xEB */ + 0x86, 0x59, 0xBF, 0xDE, 0xCF, 0xF8, 0x86, 0x5A, /* 0xEC-0xEF */ + 0x86, 0x5B, 0x86, 0x5C, 0xD5, 0xDC, 0xDF, 0xEE, /* 0xF0-0xF3 */ + 0x86, 0x5D, 0x86, 0x5E, 0x86, 0x5F, 0x86, 0x60, /* 0xF4-0xF7 */ + 0x86, 0x61, 0x86, 0x62, 0xB2, 0xB8, 0x86, 0x63, /* 0xF8-0xFB */ + 0xBA, 0xDF, 0xDF, 0xEC, 0x86, 0x64, 0xDB, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x86, 0x65, 0xD1, 0xE4, 0x86, 0x66, 0x86, 0x67, /* 0x00-0x03 */ + 0x86, 0x68, 0x86, 0x69, 0xCB, 0xF4, 0xB4, 0xBD, /* 0x04-0x07 */ + 0x86, 0x6A, 0xB0, 0xA6, 0x86, 0x6B, 0x86, 0x6C, /* 0x08-0x0B */ + 0x86, 0x6D, 0x86, 0x6E, 0x86, 0x6F, 0xDF, 0xF1, /* 0x0C-0x0F */ + 0xCC, 0xC6, 0xDF, 0xF2, 0x86, 0x70, 0x86, 0x71, /* 0x10-0x13 */ + 0xDF, 0xED, 0x86, 0x72, 0x86, 0x73, 0x86, 0x74, /* 0x14-0x17 */ + 0x86, 0x75, 0x86, 0x76, 0x86, 0x77, 0xDF, 0xE9, /* 0x18-0x1B */ + 0x86, 0x78, 0x86, 0x79, 0x86, 0x7A, 0x86, 0x7B, /* 0x1C-0x1F */ + 0xDF, 0xEB, 0x86, 0x7C, 0xDF, 0xEF, 0xDF, 0xF0, /* 0x20-0x23 */ + 0xBB, 0xBD, 0x86, 0x7D, 0x86, 0x7E, 0xDF, 0xF3, /* 0x24-0x27 */ + 0x86, 0x80, 0x86, 0x81, 0xDF, 0xF4, 0x86, 0x82, /* 0x28-0x2B */ + 0xBB, 0xA3, 0x86, 0x83, 0xCA, 0xDB, 0xCE, 0xA8, /* 0x2C-0x2F */ + 0xE0, 0xA7, 0xB3, 0xAA, 0x86, 0x84, 0xE0, 0xA6, /* 0x30-0x33 */ + 0x86, 0x85, 0x86, 0x86, 0x86, 0x87, 0xE0, 0xA1, /* 0x34-0x37 */ + 0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0x38-0x3B */ + 0xDF, 0xFE, 0x86, 0x8C, 0xCD, 0xD9, 0xDF, 0xFC, /* 0x3C-0x3F */ + 0x86, 0x8D, 0xDF, 0xFA, 0x86, 0x8E, 0xBF, 0xD0, /* 0x40-0x43 */ + 0xD7, 0xC4, 0x86, 0x8F, 0xC9, 0xCC, 0x86, 0x90, /* 0x44-0x47 */ + 0x86, 0x91, 0xDF, 0xF8, 0xB0, 0xA1, 0x86, 0x92, /* 0x48-0x4B */ + 0x86, 0x93, 0x86, 0x94, 0x86, 0x95, 0x86, 0x96, /* 0x4C-0x4F */ + 0xDF, 0xFD, 0x86, 0x97, 0x86, 0x98, 0x86, 0x99, /* 0x50-0x53 */ + 0x86, 0x9A, 0xDF, 0xFB, 0xE0, 0xA2, 0x86, 0x9B, /* 0x54-0x57 */ + 0x86, 0x9C, 0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, /* 0x58-0x5B */ + 0xE0, 0xA8, 0x86, 0xA0, 0x86, 0xA1, 0x86, 0xA2, /* 0x5C-0x5F */ + 0x86, 0xA3, 0xB7, 0xC8, 0x86, 0xA4, 0x86, 0xA5, /* 0x60-0x63 */ + 0xC6, 0xA1, 0xC9, 0xB6, 0xC0, 0xB2, 0xDF, 0xF5, /* 0x64-0x67 */ + 0x86, 0xA6, 0x86, 0xA7, 0xC5, 0xBE, 0x86, 0xA8, /* 0x68-0x6B */ + 0xD8, 0xC4, 0xDF, 0xF9, 0xC4, 0xF6, 0x86, 0xA9, /* 0x6C-0x6F */ + 0x86, 0xAA, 0x86, 0xAB, 0x86, 0xAC, 0x86, 0xAD, /* 0x70-0x73 */ + 0x86, 0xAE, 0xE0, 0xA3, 0xE0, 0xA4, 0xE0, 0xA5, /* 0x74-0x77 */ + 0xD0, 0xA5, 0x86, 0xAF, 0x86, 0xB0, 0xE0, 0xB4, /* 0x78-0x7B */ + 0xCC, 0xE4, 0x86, 0xB1, 0xE0, 0xB1, 0x86, 0xB2, /* 0x7C-0x7F */ + + 0xBF, 0xA6, 0xE0, 0xAF, 0xCE, 0xB9, 0xE0, 0xAB, /* 0x80-0x83 */ + 0xC9, 0xC6, 0x86, 0xB3, 0x86, 0xB4, 0xC0, 0xAE, /* 0x84-0x87 */ + 0xE0, 0xAE, 0xBA, 0xED, 0xBA, 0xB0, 0xE0, 0xA9, /* 0x88-0x8B */ + 0x86, 0xB5, 0x86, 0xB6, 0x86, 0xB7, 0xDF, 0xF6, /* 0x8C-0x8F */ + 0x86, 0xB8, 0xE0, 0xB3, 0x86, 0xB9, 0x86, 0xBA, /* 0x90-0x93 */ + 0xE0, 0xB8, 0x86, 0xBB, 0x86, 0xBC, 0x86, 0xBD, /* 0x94-0x97 */ + 0xB4, 0xAD, 0xE0, 0xB9, 0x86, 0xBE, 0x86, 0xBF, /* 0x98-0x9B */ + 0xCF, 0xB2, 0xBA, 0xC8, 0x86, 0xC0, 0xE0, 0xB0, /* 0x9C-0x9F */ + 0x86, 0xC1, 0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, /* 0xA0-0xA3 */ + 0x86, 0xC5, 0x86, 0xC6, 0x86, 0xC7, 0xD0, 0xFA, /* 0xA4-0xA7 */ + 0x86, 0xC8, 0x86, 0xC9, 0x86, 0xCA, 0x86, 0xCB, /* 0xA8-0xAB */ + 0x86, 0xCC, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0xAC-0xAF */ + 0x86, 0xD0, 0xE0, 0xAC, 0x86, 0xD1, 0xD4, 0xFB, /* 0xB0-0xB3 */ + 0x86, 0xD2, 0xDF, 0xF7, 0x86, 0xD3, 0xC5, 0xE7, /* 0xB4-0xB7 */ + 0x86, 0xD4, 0xE0, 0xAD, 0x86, 0xD5, 0xD3, 0xF7, /* 0xB8-0xBB */ + 0x86, 0xD6, 0xE0, 0xB6, 0xE0, 0xB7, 0x86, 0xD7, /* 0xBC-0xBF */ + 0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, 0x86, 0xDB, /* 0xC0-0xC3 */ + 0xE0, 0xC4, 0xD0, 0xE1, 0x86, 0xDC, 0x86, 0xDD, /* 0xC4-0xC7 */ + 0x86, 0xDE, 0xE0, 0xBC, 0x86, 0xDF, 0x86, 0xE0, /* 0xC8-0xCB */ + 0xE0, 0xC9, 0xE0, 0xCA, 0x86, 0xE1, 0x86, 0xE2, /* 0xCC-0xCF */ + 0x86, 0xE3, 0xE0, 0xBE, 0xE0, 0xAA, 0xC9, 0xA4, /* 0xD0-0xD3 */ + 0xE0, 0xC1, 0x86, 0xE4, 0xE0, 0xB2, 0x86, 0xE5, /* 0xD4-0xD7 */ + 0x86, 0xE6, 0x86, 0xE7, 0x86, 0xE8, 0x86, 0xE9, /* 0xD8-0xDB */ + 0xCA, 0xC8, 0xE0, 0xC3, 0x86, 0xEA, 0xE0, 0xB5, /* 0xDC-0xDF */ + 0x86, 0xEB, 0xCE, 0xCB, 0x86, 0xEC, 0xCB, 0xC3, /* 0xE0-0xE3 */ + 0xE0, 0xCD, 0xE0, 0xC6, 0xE0, 0xC2, 0x86, 0xED, /* 0xE4-0xE7 */ + 0xE0, 0xCB, 0x86, 0xEE, 0xE0, 0xBA, 0xE0, 0xBF, /* 0xE8-0xEB */ + 0xE0, 0xC0, 0x86, 0xEF, 0x86, 0xF0, 0xE0, 0xC5, /* 0xEC-0xEF */ + 0x86, 0xF1, 0x86, 0xF2, 0xE0, 0xC7, 0xE0, 0xC8, /* 0xF0-0xF3 */ + 0x86, 0xF3, 0xE0, 0xCC, 0x86, 0xF4, 0xE0, 0xBB, /* 0xF4-0xF7 */ + 0x86, 0xF5, 0x86, 0xF6, 0x86, 0xF7, 0x86, 0xF8, /* 0xF8-0xFB */ + 0x86, 0xF9, 0xCB, 0xD4, 0xE0, 0xD5, 0x86, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0xE0, 0xD6, 0xE0, 0xD2, 0x86, 0xFB, 0x86, 0xFC, /* 0x00-0x03 */ + 0x86, 0xFD, 0x86, 0xFE, 0x87, 0x40, 0x87, 0x41, /* 0x04-0x07 */ + 0xE0, 0xD0, 0xBC, 0xCE, 0x87, 0x42, 0x87, 0x43, /* 0x08-0x0B */ + 0xE0, 0xD1, 0x87, 0x44, 0xB8, 0xC2, 0xD8, 0xC5, /* 0x0C-0x0F */ + 0x87, 0x45, 0x87, 0x46, 0x87, 0x47, 0x87, 0x48, /* 0x10-0x13 */ + 0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, 0x87, 0x4C, /* 0x14-0x17 */ + 0xD0, 0xEA, 0x87, 0x4D, 0x87, 0x4E, 0xC2, 0xEF, /* 0x18-0x1B */ + 0x87, 0x4F, 0x87, 0x50, 0xE0, 0xCF, 0xE0, 0xBD, /* 0x1C-0x1F */ + 0x87, 0x51, 0x87, 0x52, 0x87, 0x53, 0xE0, 0xD4, /* 0x20-0x23 */ + 0xE0, 0xD3, 0x87, 0x54, 0x87, 0x55, 0xE0, 0xD7, /* 0x24-0x27 */ + 0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0x28-0x2B */ + 0xE0, 0xDC, 0xE0, 0xD8, 0x87, 0x5A, 0x87, 0x5B, /* 0x2C-0x2F */ + 0x87, 0x5C, 0xD6, 0xF6, 0xB3, 0xB0, 0x87, 0x5D, /* 0x30-0x33 */ + 0xD7, 0xEC, 0x87, 0x5E, 0xCB, 0xBB, 0x87, 0x5F, /* 0x34-0x37 */ + 0x87, 0x60, 0xE0, 0xDA, 0x87, 0x61, 0xCE, 0xFB, /* 0x38-0x3B */ + 0x87, 0x62, 0x87, 0x63, 0x87, 0x64, 0xBA, 0xD9, /* 0x3C-0x3F */ + 0x87, 0x65, 0x87, 0x66, 0x87, 0x67, 0x87, 0x68, /* 0x40-0x43 */ + 0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, 0x87, 0x6C, /* 0x44-0x47 */ + 0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, 0x87, 0x70, /* 0x48-0x4B */ + 0xE0, 0xE1, 0xE0, 0xDD, 0xD2, 0xAD, 0x87, 0x71, /* 0x4C-0x4F */ + 0x87, 0x72, 0x87, 0x73, 0x87, 0x74, 0x87, 0x75, /* 0x50-0x53 */ + 0xE0, 0xE2, 0x87, 0x76, 0x87, 0x77, 0xE0, 0xDB, /* 0x54-0x57 */ + 0xE0, 0xD9, 0xE0, 0xDF, 0x87, 0x78, 0x87, 0x79, /* 0x58-0x5B */ + 0xE0, 0xE0, 0x87, 0x7A, 0x87, 0x7B, 0x87, 0x7C, /* 0x5C-0x5F */ + 0x87, 0x7D, 0x87, 0x7E, 0xE0, 0xDE, 0x87, 0x80, /* 0x60-0x63 */ + 0xE0, 0xE4, 0x87, 0x81, 0x87, 0x82, 0x87, 0x83, /* 0x64-0x67 */ + 0xC6, 0xF7, 0xD8, 0xAC, 0xD4, 0xEB, 0xE0, 0xE6, /* 0x68-0x6B */ + 0xCA, 0xC9, 0x87, 0x84, 0x87, 0x85, 0x87, 0x86, /* 0x6C-0x6F */ + 0x87, 0x87, 0xE0, 0xE5, 0x87, 0x88, 0x87, 0x89, /* 0x70-0x73 */ + 0x87, 0x8A, 0x87, 0x8B, 0xB8, 0xC1, 0x87, 0x8C, /* 0x74-0x77 */ + 0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, 0xE0, 0xE7, /* 0x78-0x7B */ + 0xE0, 0xE8, 0x87, 0x90, 0x87, 0x91, 0x87, 0x92, /* 0x7C-0x7F */ + + 0x87, 0x93, 0x87, 0x94, 0x87, 0x95, 0x87, 0x96, /* 0x80-0x83 */ + 0x87, 0x97, 0xE0, 0xE9, 0xE0, 0xE3, 0x87, 0x98, /* 0x84-0x87 */ + 0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, 0x87, 0x9C, /* 0x88-0x8B */ + 0x87, 0x9D, 0x87, 0x9E, 0xBA, 0xBF, 0xCC, 0xE7, /* 0x8C-0x8F */ + 0x87, 0x9F, 0x87, 0xA0, 0x87, 0xA1, 0xE0, 0xEA, /* 0x90-0x93 */ + 0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, 0x87, 0xA5, /* 0x94-0x97 */ + 0x87, 0xA6, 0x87, 0xA7, 0x87, 0xA8, 0x87, 0xA9, /* 0x98-0x9B */ + 0x87, 0xAA, 0x87, 0xAB, 0x87, 0xAC, 0x87, 0xAD, /* 0x9C-0x9F */ + 0x87, 0xAE, 0x87, 0xAF, 0x87, 0xB0, 0xCF, 0xF9, /* 0xA0-0xA3 */ + 0x87, 0xB1, 0x87, 0xB2, 0x87, 0xB3, 0x87, 0xB4, /* 0xA4-0xA7 */ + 0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, 0x87, 0xB8, /* 0xA8-0xAB */ + 0x87, 0xB9, 0x87, 0xBA, 0x87, 0xBB, 0xE0, 0xEB, /* 0xAC-0xAF */ + 0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, 0x87, 0xBF, /* 0xB0-0xB3 */ + 0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, 0xC8, 0xC2, /* 0xB4-0xB7 */ + 0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0xB8-0xBB */ + 0xBD, 0xC0, 0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, /* 0xBC-0xBF */ + 0x87, 0xCA, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0xC0-0xC3 */ + 0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0xC4-0xC7 */ + 0x87, 0xD2, 0x87, 0xD3, 0xC4, 0xD2, 0x87, 0xD4, /* 0xC8-0xCB */ + 0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0xCC-0xCF */ + 0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0xD0-0xD3 */ + 0xE0, 0xEC, 0x87, 0xDD, 0x87, 0xDE, 0xE0, 0xED, /* 0xD4-0xD7 */ + 0x87, 0xDF, 0x87, 0xE0, 0xC7, 0xF4, 0xCB, 0xC4, /* 0xD8-0xDB */ + 0x87, 0xE1, 0xE0, 0xEE, 0xBB, 0xD8, 0xD8, 0xB6, /* 0xDC-0xDF */ + 0xD2, 0xF2, 0xE0, 0xEF, 0xCD, 0xC5, 0x87, 0xE2, /* 0xE0-0xE3 */ + 0xB6, 0xDA, 0x87, 0xE3, 0x87, 0xE4, 0x87, 0xE5, /* 0xE4-0xE7 */ + 0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, 0xE0, 0xF1, /* 0xE8-0xEB */ + 0x87, 0xE9, 0xD4, 0xB0, 0x87, 0xEA, 0x87, 0xEB, /* 0xEC-0xEF */ + 0xC0, 0xA7, 0xB4, 0xD1, 0x87, 0xEC, 0x87, 0xED, /* 0xF0-0xF3 */ + 0xCE, 0xA7, 0xE0, 0xF0, 0x87, 0xEE, 0x87, 0xEF, /* 0xF4-0xF7 */ + 0x87, 0xF0, 0xE0, 0xF2, 0xB9, 0xCC, 0x87, 0xF1, /* 0xF8-0xFB */ + 0x87, 0xF2, 0xB9, 0xFA, 0xCD, 0xBC, 0xE0, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, 0xC6, 0xD4, /* 0x00-0x03 */ + 0xE0, 0xF4, 0x87, 0xF6, 0xD4, 0xB2, 0x87, 0xF7, /* 0x04-0x07 */ + 0xC8, 0xA6, 0xE0, 0xF6, 0xE0, 0xF5, 0x87, 0xF8, /* 0x08-0x0B */ + 0x87, 0xF9, 0x87, 0xFA, 0x87, 0xFB, 0x87, 0xFC, /* 0x0C-0x0F */ + 0x87, 0xFD, 0x87, 0xFE, 0x88, 0x40, 0x88, 0x41, /* 0x10-0x13 */ + 0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x14-0x17 */ + 0x88, 0x46, 0x88, 0x47, 0x88, 0x48, 0x88, 0x49, /* 0x18-0x1B */ + 0xE0, 0xF7, 0x88, 0x4A, 0x88, 0x4B, 0xCD, 0xC1, /* 0x1C-0x1F */ + 0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, 0xCA, 0xA5, /* 0x20-0x23 */ + 0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x24-0x27 */ + 0xD4, 0xDA, 0xDB, 0xD7, 0xDB, 0xD9, 0x88, 0x53, /* 0x28-0x2B */ + 0xDB, 0xD8, 0xB9, 0xE7, 0xDB, 0xDC, 0xDB, 0xDD, /* 0x2C-0x2F */ + 0xB5, 0xD8, 0x88, 0x54, 0x88, 0x55, 0xDB, 0xDA, /* 0x30-0x33 */ + 0x88, 0x56, 0x88, 0x57, 0x88, 0x58, 0x88, 0x59, /* 0x34-0x37 */ + 0x88, 0x5A, 0xDB, 0xDB, 0xB3, 0xA1, 0xDB, 0xDF, /* 0x38-0x3B */ + 0x88, 0x5B, 0x88, 0x5C, 0xBB, 0xF8, 0x88, 0x5D, /* 0x3C-0x3F */ + 0xD6, 0xB7, 0x88, 0x5E, 0xDB, 0xE0, 0x88, 0x5F, /* 0x40-0x43 */ + 0x88, 0x60, 0x88, 0x61, 0x88, 0x62, 0xBE, 0xF9, /* 0x44-0x47 */ + 0x88, 0x63, 0x88, 0x64, 0xB7, 0xBB, 0x88, 0x65, /* 0x48-0x4B */ + 0xDB, 0xD0, 0xCC, 0xAE, 0xBF, 0xB2, 0xBB, 0xB5, /* 0x4C-0x4F */ + 0xD7, 0xF8, 0xBF, 0xD3, 0x88, 0x66, 0x88, 0x67, /* 0x50-0x53 */ + 0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0xBF, 0xE9, /* 0x54-0x57 */ + 0x88, 0x6B, 0x88, 0x6C, 0xBC, 0xE1, 0xCC, 0xB3, /* 0x58-0x5B */ + 0xDB, 0xDE, 0xB0, 0xD3, 0xCE, 0xEB, 0xB7, 0xD8, /* 0x5C-0x5F */ + 0xD7, 0xB9, 0xC6, 0xC2, 0x88, 0x6D, 0x88, 0x6E, /* 0x60-0x63 */ + 0xC0, 0xA4, 0x88, 0x6F, 0xCC, 0xB9, 0x88, 0x70, /* 0x64-0x67 */ + 0xDB, 0xE7, 0xDB, 0xE1, 0xC6, 0xBA, 0xDB, 0xE3, /* 0x68-0x6B */ + 0x88, 0x71, 0xDB, 0xE8, 0x88, 0x72, 0xC5, 0xF7, /* 0x6C-0x6F */ + 0x88, 0x73, 0x88, 0x74, 0x88, 0x75, 0xDB, 0xEA, /* 0x70-0x73 */ + 0x88, 0x76, 0x88, 0x77, 0xDB, 0xE9, 0xBF, 0xC0, /* 0x74-0x77 */ + 0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, 0xDB, 0xE6, /* 0x78-0x7B */ + 0xDB, 0xE5, 0x88, 0x7B, 0x88, 0x7C, 0x88, 0x7D, /* 0x7C-0x7F */ + + 0x88, 0x7E, 0x88, 0x80, 0xB4, 0xB9, 0xC0, 0xAC, /* 0x80-0x83 */ + 0xC2, 0xA2, 0xDB, 0xE2, 0xDB, 0xE4, 0x88, 0x81, /* 0x84-0x87 */ + 0x88, 0x82, 0x88, 0x83, 0x88, 0x84, 0xD0, 0xCD, /* 0x88-0x8B */ + 0xDB, 0xED, 0x88, 0x85, 0x88, 0x86, 0x88, 0x87, /* 0x8C-0x8F */ + 0x88, 0x88, 0x88, 0x89, 0xC0, 0xDD, 0xDB, 0xF2, /* 0x90-0x93 */ + 0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, 0x88, 0x8D, /* 0x94-0x97 */ + 0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, 0xB6, 0xE2, /* 0x98-0x9B */ + 0x88, 0x91, 0x88, 0x92, 0x88, 0x93, 0x88, 0x94, /* 0x9C-0x9F */ + 0xDB, 0xF3, 0xDB, 0xD2, 0xB9, 0xB8, 0xD4, 0xAB, /* 0xA0-0xA3 */ + 0xDB, 0xEC, 0x88, 0x95, 0xBF, 0xD1, 0xDB, 0xF0, /* 0xA4-0xA7 */ + 0x88, 0x96, 0xDB, 0xD1, 0x88, 0x97, 0xB5, 0xE6, /* 0xA8-0xAB */ + 0x88, 0x98, 0xDB, 0xEB, 0xBF, 0xE5, 0x88, 0x99, /* 0xAC-0xAF */ + 0x88, 0x9A, 0x88, 0x9B, 0xDB, 0xEE, 0x88, 0x9C, /* 0xB0-0xB3 */ + 0xDB, 0xF1, 0x88, 0x9D, 0x88, 0x9E, 0x88, 0x9F, /* 0xB4-0xB7 */ + 0xDB, 0xF9, 0x88, 0xA0, 0x88, 0xA1, 0x88, 0xA2, /* 0xB8-0xBB */ + 0x88, 0xA3, 0x88, 0xA4, 0x88, 0xA5, 0x88, 0xA6, /* 0xBC-0xBF */ + 0x88, 0xA7, 0x88, 0xA8, 0xB9, 0xA1, 0xB0, 0xA3, /* 0xC0-0xC3 */ + 0x88, 0xA9, 0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, /* 0xC4-0xC7 */ + 0x88, 0xAD, 0x88, 0xAE, 0x88, 0xAF, 0xC2, 0xF1, /* 0xC8-0xCB */ + 0x88, 0xB0, 0x88, 0xB1, 0xB3, 0xC7, 0xDB, 0xEF, /* 0xCC-0xCF */ + 0x88, 0xB2, 0x88, 0xB3, 0xDB, 0xF8, 0x88, 0xB4, /* 0xD0-0xD3 */ + 0xC6, 0xD2, 0xDB, 0xF4, 0x88, 0xB5, 0x88, 0xB6, /* 0xD4-0xD7 */ + 0xDB, 0xF5, 0xDB, 0xF7, 0xDB, 0xF6, 0x88, 0xB7, /* 0xD8-0xDB */ + 0x88, 0xB8, 0xDB, 0xFE, 0x88, 0xB9, 0xD3, 0xF2, /* 0xDC-0xDF */ + 0xB2, 0xBA, 0x88, 0xBA, 0x88, 0xBB, 0x88, 0xBC, /* 0xE0-0xE3 */ + 0xDB, 0xFD, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0xE4-0xE7 */ + 0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, 0x88, 0xC3, /* 0xE8-0xEB */ + 0x88, 0xC4, 0xDC, 0xA4, 0x88, 0xC5, 0xDB, 0xFB, /* 0xEC-0xEF */ + 0x88, 0xC6, 0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, /* 0xF0-0xF3 */ + 0xDB, 0xFA, 0x88, 0xCA, 0x88, 0xCB, 0x88, 0xCC, /* 0xF4-0xF7 */ + 0xDB, 0xFC, 0xC5, 0xE0, 0xBB, 0xF9, 0x88, 0xCD, /* 0xF8-0xFB */ + 0x88, 0xCE, 0xDC, 0xA3, 0x88, 0xCF, 0x88, 0xD0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xDC, 0xA5, 0x88, 0xD1, 0xCC, 0xC3, 0x88, 0xD2, /* 0x00-0x03 */ + 0x88, 0xD3, 0x88, 0xD4, 0xB6, 0xD1, 0xDD, 0xC0, /* 0x04-0x07 */ + 0x88, 0xD5, 0x88, 0xD6, 0x88, 0xD7, 0xDC, 0xA1, /* 0x08-0x0B */ + 0x88, 0xD8, 0xDC, 0xA2, 0x88, 0xD9, 0x88, 0xDA, /* 0x0C-0x0F */ + 0x88, 0xDB, 0xC7, 0xB5, 0x88, 0xDC, 0x88, 0xDD, /* 0x10-0x13 */ + 0x88, 0xDE, 0xB6, 0xE9, 0x88, 0xDF, 0x88, 0xE0, /* 0x14-0x17 */ + 0x88, 0xE1, 0xDC, 0xA7, 0x88, 0xE2, 0x88, 0xE3, /* 0x18-0x1B */ + 0x88, 0xE4, 0x88, 0xE5, 0xDC, 0xA6, 0x88, 0xE6, /* 0x1C-0x1F */ + 0xDC, 0xA9, 0xB1, 0xA4, 0x88, 0xE7, 0x88, 0xE8, /* 0x20-0x23 */ + 0xB5, 0xCC, 0x88, 0xE9, 0x88, 0xEA, 0x88, 0xEB, /* 0x24-0x27 */ + 0x88, 0xEC, 0x88, 0xED, 0xBF, 0xB0, 0x88, 0xEE, /* 0x28-0x2B */ + 0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x2C-0x2F */ + 0xD1, 0xDF, 0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, /* 0x30-0x33 */ + 0x88, 0xF6, 0xB6, 0xC2, 0x88, 0xF7, 0x88, 0xF8, /* 0x34-0x37 */ + 0x88, 0xF9, 0x88, 0xFA, 0x88, 0xFB, 0x88, 0xFC, /* 0x38-0x3B */ + 0x88, 0xFD, 0x88, 0xFE, 0x89, 0x40, 0x89, 0x41, /* 0x3C-0x3F */ + 0x89, 0x42, 0x89, 0x43, 0x89, 0x44, 0x89, 0x45, /* 0x40-0x43 */ + 0xDC, 0xA8, 0x89, 0x46, 0x89, 0x47, 0x89, 0x48, /* 0x44-0x47 */ + 0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, 0x89, 0x4C, /* 0x48-0x4B */ + 0xCB, 0xFA, 0xEB, 0xF3, 0x89, 0x4D, 0x89, 0x4E, /* 0x4C-0x4F */ + 0x89, 0x4F, 0xCB, 0xDC, 0x89, 0x50, 0x89, 0x51, /* 0x50-0x53 */ + 0xCB, 0xFE, 0x89, 0x52, 0x89, 0x53, 0x89, 0x54, /* 0x54-0x57 */ + 0xCC, 0xC1, 0x89, 0x55, 0x89, 0x56, 0x89, 0x57, /* 0x58-0x5B */ + 0x89, 0x58, 0x89, 0x59, 0xC8, 0xFB, 0x89, 0x5A, /* 0x5C-0x5F */ + 0x89, 0x5B, 0x89, 0x5C, 0x89, 0x5D, 0x89, 0x5E, /* 0x60-0x63 */ + 0x89, 0x5F, 0xDC, 0xAA, 0x89, 0x60, 0x89, 0x61, /* 0x64-0x67 */ + 0x89, 0x62, 0x89, 0x63, 0x89, 0x64, 0xCC, 0xEE, /* 0x68-0x6B */ + 0xDC, 0xAB, 0x89, 0x65, 0x89, 0x66, 0x89, 0x67, /* 0x6C-0x6F */ + 0x89, 0x68, 0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, /* 0x70-0x73 */ + 0x89, 0x6C, 0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, /* 0x74-0x77 */ + 0x89, 0x70, 0x89, 0x71, 0x89, 0x72, 0x89, 0x73, /* 0x78-0x7B */ + 0x89, 0x74, 0x89, 0x75, 0xDB, 0xD3, 0x89, 0x76, /* 0x7C-0x7F */ + + 0xDC, 0xAF, 0xDC, 0xAC, 0x89, 0x77, 0xBE, 0xB3, /* 0x80-0x83 */ + 0x89, 0x78, 0xCA, 0xFB, 0x89, 0x79, 0x89, 0x7A, /* 0x84-0x87 */ + 0x89, 0x7B, 0xDC, 0xAD, 0x89, 0x7C, 0x89, 0x7D, /* 0x88-0x8B */ + 0x89, 0x7E, 0x89, 0x80, 0x89, 0x81, 0x89, 0x82, /* 0x8C-0x8F */ + 0x89, 0x83, 0x89, 0x84, 0xC9, 0xCA, 0xC4, 0xB9, /* 0x90-0x93 */ + 0x89, 0x85, 0x89, 0x86, 0x89, 0x87, 0x89, 0x88, /* 0x94-0x97 */ + 0x89, 0x89, 0xC7, 0xBD, 0xDC, 0xAE, 0x89, 0x8A, /* 0x98-0x9B */ + 0x89, 0x8B, 0x89, 0x8C, 0xD4, 0xF6, 0xD0, 0xE6, /* 0x9C-0x9F */ + 0x89, 0x8D, 0x89, 0x8E, 0x89, 0x8F, 0x89, 0x90, /* 0xA0-0xA3 */ + 0x89, 0x91, 0x89, 0x92, 0x89, 0x93, 0x89, 0x94, /* 0xA4-0xA7 */ + 0xC4, 0xAB, 0xB6, 0xD5, 0x89, 0x95, 0x89, 0x96, /* 0xA8-0xAB */ + 0x89, 0x97, 0x89, 0x98, 0x89, 0x99, 0x89, 0x9A, /* 0xAC-0xAF */ + 0x89, 0x9B, 0x89, 0x9C, 0x89, 0x9D, 0x89, 0x9E, /* 0xB0-0xB3 */ + 0x89, 0x9F, 0x89, 0xA0, 0x89, 0xA1, 0x89, 0xA2, /* 0xB4-0xB7 */ + 0x89, 0xA3, 0x89, 0xA4, 0x89, 0xA5, 0x89, 0xA6, /* 0xB8-0xBB */ + 0xDB, 0xD4, 0x89, 0xA7, 0x89, 0xA8, 0x89, 0xA9, /* 0xBC-0xBF */ + 0x89, 0xAA, 0xB1, 0xDA, 0x89, 0xAB, 0x89, 0xAC, /* 0xC0-0xC3 */ + 0x89, 0xAD, 0xDB, 0xD5, 0x89, 0xAE, 0x89, 0xAF, /* 0xC4-0xC7 */ + 0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, 0x89, 0xB3, /* 0xC8-0xCB */ + 0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, 0x89, 0xB7, /* 0xCC-0xCF */ + 0x89, 0xB8, 0xDB, 0xD6, 0x89, 0xB9, 0x89, 0xBA, /* 0xD0-0xD3 */ + 0x89, 0xBB, 0xBA, 0xBE, 0x89, 0xBC, 0x89, 0xBD, /* 0xD4-0xD7 */ + 0x89, 0xBE, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xD8-0xDB */ + 0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0xDC-0xDF */ + 0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0xE0-0xE3 */ + 0xC8, 0xC0, 0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, /* 0xE4-0xE7 */ + 0x89, 0xCD, 0x89, 0xCE, 0x89, 0xCF, 0xCA, 0xBF, /* 0xE8-0xEB */ + 0xC8, 0xC9, 0x89, 0xD0, 0xD7, 0xB3, 0x89, 0xD1, /* 0xEC-0xEF */ + 0xC9, 0xF9, 0x89, 0xD2, 0x89, 0xD3, 0xBF, 0xC7, /* 0xF0-0xF3 */ + 0x89, 0xD4, 0x89, 0xD5, 0xBA, 0xF8, 0x89, 0xD6, /* 0xF4-0xF7 */ + 0x89, 0xD7, 0xD2, 0xBC, 0x89, 0xD8, 0x89, 0xD9, /* 0xF8-0xFB */ + 0x89, 0xDA, 0x89, 0xDB, 0x89, 0xDC, 0x89, 0xDD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x89, 0xDE, 0x89, 0xDF, 0xE2, 0xBA, 0x89, 0xE0, /* 0x00-0x03 */ + 0xB4, 0xA6, 0x89, 0xE1, 0x89, 0xE2, 0xB1, 0xB8, /* 0x04-0x07 */ + 0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, 0x89, 0xE6, /* 0x08-0x0B */ + 0x89, 0xE7, 0xB8, 0xB4, 0x89, 0xE8, 0xCF, 0xC4, /* 0x0C-0x0F */ + 0x89, 0xE9, 0x89, 0xEA, 0x89, 0xEB, 0x89, 0xEC, /* 0x10-0x13 */ + 0xD9, 0xE7, 0xCF, 0xA6, 0xCD, 0xE2, 0x89, 0xED, /* 0x14-0x17 */ + 0x89, 0xEE, 0xD9, 0xED, 0xB6, 0xE0, 0x89, 0xEF, /* 0x18-0x1B */ + 0xD2, 0xB9, 0x89, 0xF0, 0x89, 0xF1, 0xB9, 0xBB, /* 0x1C-0x1F */ + 0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x20-0x23 */ + 0xE2, 0xB9, 0xE2, 0xB7, 0x89, 0xF6, 0xB4, 0xF3, /* 0x24-0x27 */ + 0x89, 0xF7, 0xCC, 0xEC, 0xCC, 0xAB, 0xB7, 0xF2, /* 0x28-0x2B */ + 0x89, 0xF8, 0xD8, 0xB2, 0xD1, 0xEB, 0xBA, 0xBB, /* 0x2C-0x2F */ + 0x89, 0xF9, 0xCA, 0xA7, 0x89, 0xFA, 0x89, 0xFB, /* 0x30-0x33 */ + 0xCD, 0xB7, 0x89, 0xFC, 0x89, 0xFD, 0xD2, 0xC4, /* 0x34-0x37 */ + 0xBF, 0xE4, 0xBC, 0xD0, 0xB6, 0xE1, 0x89, 0xFE, /* 0x38-0x3B */ + 0xDE, 0xC5, 0x8A, 0x40, 0x8A, 0x41, 0x8A, 0x42, /* 0x3C-0x3F */ + 0x8A, 0x43, 0xDE, 0xC6, 0xDB, 0xBC, 0x8A, 0x44, /* 0x40-0x43 */ + 0xD1, 0xD9, 0x8A, 0x45, 0x8A, 0x46, 0xC6, 0xE6, /* 0x44-0x47 */ + 0xC4, 0xCE, 0xB7, 0xEE, 0x8A, 0x47, 0xB7, 0xDC, /* 0x48-0x4B */ + 0x8A, 0x48, 0x8A, 0x49, 0xBF, 0xFC, 0xD7, 0xE0, /* 0x4C-0x4F */ + 0x8A, 0x4A, 0xC6, 0xF5, 0x8A, 0x4B, 0x8A, 0x4C, /* 0x50-0x53 */ + 0xB1, 0xBC, 0xDE, 0xC8, 0xBD, 0xB1, 0xCC, 0xD7, /* 0x54-0x57 */ + 0xDE, 0xCA, 0x8A, 0x4D, 0xDE, 0xC9, 0x8A, 0x4E, /* 0x58-0x5B */ + 0x8A, 0x4F, 0x8A, 0x50, 0x8A, 0x51, 0x8A, 0x52, /* 0x5C-0x5F */ + 0xB5, 0xEC, 0x8A, 0x53, 0xC9, 0xDD, 0x8A, 0x54, /* 0x60-0x63 */ + 0x8A, 0x55, 0xB0, 0xC2, 0x8A, 0x56, 0x8A, 0x57, /* 0x64-0x67 */ + 0x8A, 0x58, 0x8A, 0x59, 0x8A, 0x5A, 0x8A, 0x5B, /* 0x68-0x6B */ + 0x8A, 0x5C, 0x8A, 0x5D, 0x8A, 0x5E, 0x8A, 0x5F, /* 0x6C-0x6F */ + 0x8A, 0x60, 0x8A, 0x61, 0x8A, 0x62, 0xC5, 0xAE, /* 0x70-0x73 */ + 0xC5, 0xAB, 0x8A, 0x63, 0xC4, 0xCC, 0x8A, 0x64, /* 0x74-0x77 */ + 0xBC, 0xE9, 0xCB, 0xFD, 0x8A, 0x65, 0x8A, 0x66, /* 0x78-0x7B */ + 0x8A, 0x67, 0xBA, 0xC3, 0x8A, 0x68, 0x8A, 0x69, /* 0x7C-0x7F */ + + 0x8A, 0x6A, 0xE5, 0xF9, 0xC8, 0xE7, 0xE5, 0xFA, /* 0x80-0x83 */ + 0xCD, 0xFD, 0x8A, 0x6B, 0xD7, 0xB1, 0xB8, 0xBE, /* 0x84-0x87 */ + 0xC2, 0xE8, 0x8A, 0x6C, 0xC8, 0xD1, 0x8A, 0x6D, /* 0x88-0x8B */ + 0x8A, 0x6E, 0xE5, 0xFB, 0x8A, 0x6F, 0x8A, 0x70, /* 0x8C-0x8F */ + 0x8A, 0x71, 0x8A, 0x72, 0xB6, 0xCA, 0xBC, 0xCB, /* 0x90-0x93 */ + 0x8A, 0x73, 0x8A, 0x74, 0xD1, 0xFD, 0xE6, 0xA1, /* 0x94-0x97 */ + 0x8A, 0x75, 0xC3, 0xEE, 0x8A, 0x76, 0x8A, 0x77, /* 0x98-0x9B */ + 0x8A, 0x78, 0x8A, 0x79, 0xE6, 0xA4, 0x8A, 0x7A, /* 0x9C-0x9F */ + 0x8A, 0x7B, 0x8A, 0x7C, 0x8A, 0x7D, 0xE5, 0xFE, /* 0xA0-0xA3 */ + 0xE6, 0xA5, 0xCD, 0xD7, 0x8A, 0x7E, 0x8A, 0x80, /* 0xA4-0xA7 */ + 0xB7, 0xC1, 0xE5, 0xFC, 0xE5, 0xFD, 0xE6, 0xA3, /* 0xA8-0xAB */ + 0x8A, 0x81, 0x8A, 0x82, 0xC4, 0xDD, 0xE6, 0xA8, /* 0xAC-0xAF */ + 0x8A, 0x83, 0x8A, 0x84, 0xE6, 0xA7, 0x8A, 0x85, /* 0xB0-0xB3 */ + 0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, 0x8A, 0x89, /* 0xB4-0xB7 */ + 0x8A, 0x8A, 0xC3, 0xC3, 0x8A, 0x8B, 0xC6, 0xDE, /* 0xB8-0xBB */ + 0x8A, 0x8C, 0x8A, 0x8D, 0xE6, 0xAA, 0x8A, 0x8E, /* 0xBC-0xBF */ + 0x8A, 0x8F, 0x8A, 0x90, 0x8A, 0x91, 0x8A, 0x92, /* 0xC0-0xC3 */ + 0x8A, 0x93, 0x8A, 0x94, 0xC4, 0xB7, 0x8A, 0x95, /* 0xC4-0xC7 */ + 0x8A, 0x96, 0x8A, 0x97, 0xE6, 0xA2, 0xCA, 0xBC, /* 0xC8-0xCB */ + 0x8A, 0x98, 0x8A, 0x99, 0x8A, 0x9A, 0x8A, 0x9B, /* 0xCC-0xCF */ + 0xBD, 0xE3, 0xB9, 0xC3, 0xE6, 0xA6, 0xD0, 0xD5, /* 0xD0-0xD3 */ + 0xCE, 0xAF, 0x8A, 0x9C, 0x8A, 0x9D, 0xE6, 0xA9, /* 0xD4-0xD7 */ + 0xE6, 0xB0, 0x8A, 0x9E, 0xD2, 0xA6, 0x8A, 0x9F, /* 0xD8-0xDB */ + 0xBD, 0xAA, 0xE6, 0xAD, 0x8A, 0xA0, 0x8A, 0xA1, /* 0xDC-0xDF */ + 0x8A, 0xA2, 0x8A, 0xA3, 0x8A, 0xA4, 0xE6, 0xAF, /* 0xE0-0xE3 */ + 0x8A, 0xA5, 0xC0, 0xD1, 0x8A, 0xA6, 0x8A, 0xA7, /* 0xE4-0xE7 */ + 0xD2, 0xCC, 0x8A, 0xA8, 0x8A, 0xA9, 0x8A, 0xAA, /* 0xE8-0xEB */ + 0xBC, 0xA7, 0x8A, 0xAB, 0x8A, 0xAC, 0x8A, 0xAD, /* 0xEC-0xEF */ + 0x8A, 0xAE, 0x8A, 0xAF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xF0-0xF3 */ + 0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xF4-0xF7 */ + 0x8A, 0xB6, 0xE6, 0xB1, 0x8A, 0xB7, 0xD2, 0xF6, /* 0xF8-0xFB */ + 0x8A, 0xB8, 0x8A, 0xB9, 0x8A, 0xBA, 0xD7, 0xCB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x8A, 0xBB, 0xCD, 0xFE, 0x8A, 0xBC, 0xCD, 0xDE, /* 0x00-0x03 */ + 0xC2, 0xA6, 0xE6, 0xAB, 0xE6, 0xAC, 0xBD, 0xBF, /* 0x04-0x07 */ + 0xE6, 0xAE, 0xE6, 0xB3, 0x8A, 0xBD, 0x8A, 0xBE, /* 0x08-0x0B */ + 0xE6, 0xB2, 0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, /* 0x0C-0x0F */ + 0x8A, 0xC2, 0xE6, 0xB6, 0x8A, 0xC3, 0xE6, 0xB8, /* 0x10-0x13 */ + 0x8A, 0xC4, 0x8A, 0xC5, 0x8A, 0xC6, 0x8A, 0xC7, /* 0x14-0x17 */ + 0xC4, 0xEF, 0x8A, 0xC8, 0x8A, 0xC9, 0x8A, 0xCA, /* 0x18-0x1B */ + 0xC4, 0xC8, 0x8A, 0xCB, 0x8A, 0xCC, 0xBE, 0xEA, /* 0x1C-0x1F */ + 0xC9, 0xEF, 0x8A, 0xCD, 0x8A, 0xCE, 0xE6, 0xB7, /* 0x20-0x23 */ + 0x8A, 0xCF, 0xB6, 0xF0, 0x8A, 0xD0, 0x8A, 0xD1, /* 0x24-0x27 */ + 0x8A, 0xD2, 0xC3, 0xE4, 0x8A, 0xD3, 0x8A, 0xD4, /* 0x28-0x2B */ + 0x8A, 0xD5, 0x8A, 0xD6, 0x8A, 0xD7, 0x8A, 0xD8, /* 0x2C-0x2F */ + 0x8A, 0xD9, 0xD3, 0xE9, 0xE6, 0xB4, 0x8A, 0xDA, /* 0x30-0x33 */ + 0xE6, 0xB5, 0x8A, 0xDB, 0xC8, 0xA2, 0x8A, 0xDC, /* 0x34-0x37 */ + 0x8A, 0xDD, 0x8A, 0xDE, 0x8A, 0xDF, 0x8A, 0xE0, /* 0x38-0x3B */ + 0xE6, 0xBD, 0x8A, 0xE1, 0x8A, 0xE2, 0x8A, 0xE3, /* 0x3C-0x3F */ + 0xE6, 0xB9, 0x8A, 0xE4, 0x8A, 0xE5, 0x8A, 0xE6, /* 0x40-0x43 */ + 0x8A, 0xE7, 0x8A, 0xE8, 0xC6, 0xC5, 0x8A, 0xE9, /* 0x44-0x47 */ + 0x8A, 0xEA, 0xCD, 0xF1, 0xE6, 0xBB, 0x8A, 0xEB, /* 0x48-0x4B */ + 0x8A, 0xEC, 0x8A, 0xED, 0x8A, 0xEE, 0x8A, 0xEF, /* 0x4C-0x4F */ + 0x8A, 0xF0, 0x8A, 0xF1, 0x8A, 0xF2, 0x8A, 0xF3, /* 0x50-0x53 */ + 0x8A, 0xF4, 0xE6, 0xBC, 0x8A, 0xF5, 0x8A, 0xF6, /* 0x54-0x57 */ + 0x8A, 0xF7, 0x8A, 0xF8, 0xBB, 0xE9, 0x8A, 0xF9, /* 0x58-0x5B */ + 0x8A, 0xFA, 0x8A, 0xFB, 0x8A, 0xFC, 0x8A, 0xFD, /* 0x5C-0x5F */ + 0x8A, 0xFE, 0x8B, 0x40, 0xE6, 0xBE, 0x8B, 0x41, /* 0x60-0x63 */ + 0x8B, 0x42, 0x8B, 0x43, 0x8B, 0x44, 0xE6, 0xBA, /* 0x64-0x67 */ + 0x8B, 0x45, 0x8B, 0x46, 0xC0, 0xB7, 0x8B, 0x47, /* 0x68-0x6B */ + 0x8B, 0x48, 0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, /* 0x6C-0x6F */ + 0x8B, 0x4C, 0x8B, 0x4D, 0x8B, 0x4E, 0x8B, 0x4F, /* 0x70-0x73 */ + 0xD3, 0xA4, 0xE6, 0xBF, 0xC9, 0xF4, 0xE6, 0xC3, /* 0x74-0x77 */ + 0x8B, 0x50, 0x8B, 0x51, 0xE6, 0xC4, 0x8B, 0x52, /* 0x78-0x7B */ + 0x8B, 0x53, 0x8B, 0x54, 0x8B, 0x55, 0xD0, 0xF6, /* 0x7C-0x7F */ + + 0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, 0x8B, 0x59, /* 0x80-0x83 */ + 0x8B, 0x5A, 0x8B, 0x5B, 0x8B, 0x5C, 0x8B, 0x5D, /* 0x84-0x87 */ + 0x8B, 0x5E, 0x8B, 0x5F, 0x8B, 0x60, 0x8B, 0x61, /* 0x88-0x8B */ + 0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0x8B, 0x65, /* 0x8C-0x8F */ + 0x8B, 0x66, 0x8B, 0x67, 0xC3, 0xBD, 0x8B, 0x68, /* 0x90-0x93 */ + 0x8B, 0x69, 0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x94-0x97 */ + 0x8B, 0x6D, 0x8B, 0x6E, 0xC3, 0xC4, 0xE6, 0xC2, /* 0x98-0x9B */ + 0x8B, 0x6F, 0x8B, 0x70, 0x8B, 0x71, 0x8B, 0x72, /* 0x9C-0x9F */ + 0x8B, 0x73, 0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, /* 0xA0-0xA3 */ + 0x8B, 0x77, 0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, /* 0xA4-0xA7 */ + 0x8B, 0x7B, 0x8B, 0x7C, 0xE6, 0xC1, 0x8B, 0x7D, /* 0xA8-0xAB */ + 0x8B, 0x7E, 0x8B, 0x80, 0x8B, 0x81, 0x8B, 0x82, /* 0xAC-0xAF */ + 0x8B, 0x83, 0x8B, 0x84, 0xE6, 0xC7, 0xCF, 0xB1, /* 0xB0-0xB3 */ + 0x8B, 0x85, 0xEB, 0xF4, 0x8B, 0x86, 0x8B, 0x87, /* 0xB4-0xB7 */ + 0xE6, 0xCA, 0x8B, 0x88, 0x8B, 0x89, 0x8B, 0x8A, /* 0xB8-0xBB */ + 0x8B, 0x8B, 0x8B, 0x8C, 0xE6, 0xC5, 0x8B, 0x8D, /* 0xBC-0xBF */ + 0x8B, 0x8E, 0xBC, 0xDE, 0xC9, 0xA9, 0x8B, 0x8F, /* 0xC0-0xC3 */ + 0x8B, 0x90, 0x8B, 0x91, 0x8B, 0x92, 0x8B, 0x93, /* 0xC4-0xC7 */ + 0x8B, 0x94, 0xBC, 0xB5, 0x8B, 0x95, 0x8B, 0x96, /* 0xC8-0xCB */ + 0xCF, 0xD3, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0xCC-0xCF */ + 0x8B, 0x9A, 0x8B, 0x9B, 0xE6, 0xC8, 0x8B, 0x9C, /* 0xD0-0xD3 */ + 0xE6, 0xC9, 0x8B, 0x9D, 0xE6, 0xCE, 0x8B, 0x9E, /* 0xD4-0xD7 */ + 0xE6, 0xD0, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0xD8-0xDB */ + 0xE6, 0xD1, 0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, /* 0xDC-0xDF */ + 0xE6, 0xCB, 0xB5, 0xD5, 0x8B, 0xA5, 0xE6, 0xCC, /* 0xE0-0xE3 */ + 0x8B, 0xA6, 0x8B, 0xA7, 0xE6, 0xCF, 0x8B, 0xA8, /* 0xE4-0xE7 */ + 0x8B, 0xA9, 0xC4, 0xDB, 0x8B, 0xAA, 0xE6, 0xC6, /* 0xE8-0xEB */ + 0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, 0x8B, 0xAE, /* 0xEC-0xEF */ + 0x8B, 0xAF, 0xE6, 0xCD, 0x8B, 0xB0, 0x8B, 0xB1, /* 0xF0-0xF3 */ + 0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0xF4-0xF7 */ + 0x8B, 0xB6, 0x8B, 0xB7, 0x8B, 0xB8, 0x8B, 0xB9, /* 0xF8-0xFB */ + 0x8B, 0xBA, 0x8B, 0xBB, 0x8B, 0xBC, 0x8B, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5B[512] = { + 0x8B, 0xBE, 0x8B, 0xBF, 0x8B, 0xC0, 0x8B, 0xC1, /* 0x00-0x03 */ + 0x8B, 0xC2, 0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, /* 0x04-0x07 */ + 0x8B, 0xC6, 0xE6, 0xD2, 0x8B, 0xC7, 0x8B, 0xC8, /* 0x08-0x0B */ + 0x8B, 0xC9, 0x8B, 0xCA, 0x8B, 0xCB, 0x8B, 0xCC, /* 0x0C-0x0F */ + 0x8B, 0xCD, 0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, /* 0x10-0x13 */ + 0x8B, 0xD1, 0x8B, 0xD2, 0xE6, 0xD4, 0xE6, 0xD3, /* 0x14-0x17 */ + 0x8B, 0xD3, 0x8B, 0xD4, 0x8B, 0xD5, 0x8B, 0xD6, /* 0x18-0x1B */ + 0x8B, 0xD7, 0x8B, 0xD8, 0x8B, 0xD9, 0x8B, 0xDA, /* 0x1C-0x1F */ + 0x8B, 0xDB, 0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, /* 0x20-0x23 */ + 0x8B, 0xDF, 0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, /* 0x24-0x27 */ + 0x8B, 0xE3, 0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, /* 0x28-0x2B */ + 0x8B, 0xE7, 0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, /* 0x2C-0x2F */ + 0x8B, 0xEB, 0x8B, 0xEC, 0xE6, 0xD5, 0x8B, 0xED, /* 0x30-0x33 */ + 0xD9, 0xF8, 0x8B, 0xEE, 0x8B, 0xEF, 0xE6, 0xD6, /* 0x34-0x37 */ + 0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, 0x8B, 0xF3, /* 0x38-0x3B */ + 0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, 0x8B, 0xF7, /* 0x3C-0x3F */ + 0xE6, 0xD7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0x40-0x43 */ + 0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0x44-0x47 */ + 0x8C, 0x40, 0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, /* 0x48-0x4B */ + 0x8C, 0x44, 0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, /* 0x4C-0x4F */ + 0xD7, 0xD3, 0xE6, 0xDD, 0x8C, 0x48, 0xE6, 0xDE, /* 0x50-0x53 */ + 0xBF, 0xD7, 0xD4, 0xD0, 0x8C, 0x49, 0xD7, 0xD6, /* 0x54-0x57 */ + 0xB4, 0xE6, 0xCB, 0xEF, 0xE6, 0xDA, 0xD8, 0xC3, /* 0x58-0x5B */ + 0xD7, 0xCE, 0xD0, 0xA2, 0x8C, 0x4A, 0xC3, 0xCF, /* 0x5C-0x5F */ + 0x8C, 0x4B, 0x8C, 0x4C, 0xE6, 0xDF, 0xBC, 0xBE, /* 0x60-0x63 */ + 0xB9, 0xC2, 0xE6, 0xDB, 0xD1, 0xA7, 0x8C, 0x4D, /* 0x64-0x67 */ + 0x8C, 0x4E, 0xBA, 0xA2, 0xC2, 0xCF, 0x8C, 0x4F, /* 0x68-0x6B */ + 0xD8, 0xAB, 0x8C, 0x50, 0x8C, 0x51, 0x8C, 0x52, /* 0x6C-0x6F */ + 0xCA, 0xEB, 0xE5, 0xEE, 0x8C, 0x53, 0xE6, 0xDC, /* 0x70-0x73 */ + 0x8C, 0x54, 0xB7, 0xF5, 0x8C, 0x55, 0x8C, 0x56, /* 0x74-0x77 */ + 0x8C, 0x57, 0x8C, 0x58, 0xC8, 0xE6, 0x8C, 0x59, /* 0x78-0x7B */ + 0x8C, 0x5A, 0xC4, 0xF5, 0x8C, 0x5B, 0x8C, 0x5C, /* 0x7C-0x7F */ + + 0xE5, 0xB2, 0xC4, 0xFE, 0x8C, 0x5D, 0xCB, 0xFC, /* 0x80-0x83 */ + 0xE5, 0xB3, 0xD5, 0xAC, 0x8C, 0x5E, 0xD3, 0xEE, /* 0x84-0x87 */ + 0xCA, 0xD8, 0xB0, 0xB2, 0x8C, 0x5F, 0xCB, 0xCE, /* 0x88-0x8B */ + 0xCD, 0xEA, 0x8C, 0x60, 0x8C, 0x61, 0xBA, 0xEA, /* 0x8C-0x8F */ + 0x8C, 0x62, 0x8C, 0x63, 0x8C, 0x64, 0xE5, 0xB5, /* 0x90-0x93 */ + 0x8C, 0x65, 0xE5, 0xB4, 0x8C, 0x66, 0xD7, 0xDA, /* 0x94-0x97 */ + 0xB9, 0xD9, 0xD6, 0xE6, 0xB6, 0xA8, 0xCD, 0xF0, /* 0x98-0x9B */ + 0xD2, 0xCB, 0xB1, 0xA6, 0xCA, 0xB5, 0x8C, 0x67, /* 0x9C-0x9F */ + 0xB3, 0xE8, 0xC9, 0xF3, 0xBF, 0xCD, 0xD0, 0xFB, /* 0xA0-0xA3 */ + 0xCA, 0xD2, 0xE5, 0xB6, 0xBB, 0xC2, 0x8C, 0x68, /* 0xA4-0xA7 */ + 0x8C, 0x69, 0x8C, 0x6A, 0xCF, 0xDC, 0xB9, 0xAC, /* 0xA8-0xAB */ + 0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, 0x8C, 0x6E, /* 0xAC-0xAF */ + 0xD4, 0xD7, 0x8C, 0x6F, 0x8C, 0x70, 0xBA, 0xA6, /* 0xB0-0xB3 */ + 0xD1, 0xE7, 0xCF, 0xFC, 0xBC, 0xD2, 0x8C, 0x71, /* 0xB4-0xB7 */ + 0xE5, 0xB7, 0xC8, 0xDD, 0x8C, 0x72, 0x8C, 0x73, /* 0xB8-0xBB */ + 0x8C, 0x74, 0xBF, 0xED, 0xB1, 0xF6, 0xCB, 0xDE, /* 0xBC-0xBF */ + 0x8C, 0x75, 0x8C, 0x76, 0xBC, 0xC5, 0x8C, 0x77, /* 0xC0-0xC3 */ + 0xBC, 0xC4, 0xD2, 0xFA, 0xC3, 0xDC, 0xBF, 0xDC, /* 0xC4-0xC7 */ + 0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x7B, /* 0xC8-0xCB */ + 0xB8, 0xBB, 0x8C, 0x7C, 0x8C, 0x7D, 0x8C, 0x7E, /* 0xCC-0xCF */ + 0xC3, 0xC2, 0x8C, 0x80, 0xBA, 0xAE, 0xD4, 0xA2, /* 0xD0-0xD3 */ + 0x8C, 0x81, 0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, /* 0xD4-0xD7 */ + 0x8C, 0x85, 0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, /* 0xD8-0xDB */ + 0x8C, 0x89, 0xC7, 0xDE, 0xC4, 0xAF, 0xB2, 0xEC, /* 0xDC-0xDF */ + 0x8C, 0x8A, 0xB9, 0xD1, 0x8C, 0x8B, 0x8C, 0x8C, /* 0xE0-0xE3 */ + 0xE5, 0xBB, 0xC1, 0xC8, 0x8C, 0x8D, 0x8C, 0x8E, /* 0xE4-0xE7 */ + 0xD5, 0xAF, 0x8C, 0x8F, 0x8C, 0x90, 0x8C, 0x91, /* 0xE8-0xEB */ + 0x8C, 0x92, 0x8C, 0x93, 0xE5, 0xBC, 0x8C, 0x94, /* 0xEC-0xEF */ + 0xE5, 0xBE, 0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, /* 0xF0-0xF3 */ + 0x8C, 0x98, 0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, /* 0xF4-0xF7 */ + 0xB4, 0xE7, 0xB6, 0xD4, 0xCB, 0xC2, 0xD1, 0xB0, /* 0xF8-0xFB */ + 0xB5, 0xBC, 0x8C, 0x9C, 0x8C, 0x9D, 0xCA, 0xD9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x8C, 0x9E, 0xB7, 0xE2, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x00-0x03 */ + 0xC9, 0xE4, 0x8C, 0xA1, 0xBD, 0xAB, 0x8C, 0xA2, /* 0x04-0x07 */ + 0x8C, 0xA3, 0xCE, 0xBE, 0xD7, 0xF0, 0x8C, 0xA4, /* 0x08-0x0B */ + 0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0xD0, 0xA1, /* 0x0C-0x0F */ + 0x8C, 0xA8, 0xC9, 0xD9, 0x8C, 0xA9, 0x8C, 0xAA, /* 0x10-0x13 */ + 0xB6, 0xFB, 0xE6, 0xD8, 0xBC, 0xE2, 0x8C, 0xAB, /* 0x14-0x17 */ + 0xB3, 0xBE, 0x8C, 0xAC, 0xC9, 0xD0, 0x8C, 0xAD, /* 0x18-0x1B */ + 0xE6, 0xD9, 0xB3, 0xA2, 0x8C, 0xAE, 0x8C, 0xAF, /* 0x1C-0x1F */ + 0x8C, 0xB0, 0x8C, 0xB1, 0xDE, 0xCC, 0x8C, 0xB2, /* 0x20-0x23 */ + 0xD3, 0xC8, 0xDE, 0xCD, 0x8C, 0xB3, 0xD2, 0xA2, /* 0x24-0x27 */ + 0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, 0x8C, 0xB7, /* 0x28-0x2B */ + 0xDE, 0xCE, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x2C-0x2F */ + 0x8C, 0xBB, 0xBE, 0xCD, 0x8C, 0xBC, 0x8C, 0xBD, /* 0x30-0x33 */ + 0xDE, 0xCF, 0x8C, 0xBE, 0x8C, 0xBF, 0x8C, 0xC0, /* 0x34-0x37 */ + 0xCA, 0xAC, 0xD2, 0xFC, 0xB3, 0xDF, 0xE5, 0xEA, /* 0x38-0x3B */ + 0xC4, 0xE1, 0xBE, 0xA1, 0xCE, 0xB2, 0xC4, 0xF2, /* 0x3C-0x3F */ + 0xBE, 0xD6, 0xC6, 0xA8, 0xB2, 0xE3, 0x8C, 0xC1, /* 0x40-0x43 */ + 0x8C, 0xC2, 0xBE, 0xD3, 0x8C, 0xC3, 0x8C, 0xC4, /* 0x44-0x47 */ + 0xC7, 0xFC, 0xCC, 0xEB, 0xBD, 0xEC, 0xCE, 0xDD, /* 0x48-0x4B */ + 0x8C, 0xC5, 0x8C, 0xC6, 0xCA, 0xBA, 0xC6, 0xC1, /* 0x4C-0x4F */ + 0xE5, 0xEC, 0xD0, 0xBC, 0x8C, 0xC7, 0x8C, 0xC8, /* 0x50-0x53 */ + 0x8C, 0xC9, 0xD5, 0xB9, 0x8C, 0xCA, 0x8C, 0xCB, /* 0x54-0x57 */ + 0x8C, 0xCC, 0xE5, 0xED, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x58-0x5B */ + 0x8C, 0xCF, 0x8C, 0xD0, 0xCA, 0xF4, 0x8C, 0xD1, /* 0x5C-0x5F */ + 0xCD, 0xC0, 0xC2, 0xC5, 0x8C, 0xD2, 0xE5, 0xEF, /* 0x60-0x63 */ + 0x8C, 0xD3, 0xC2, 0xC4, 0xE5, 0xF0, 0x8C, 0xD4, /* 0x64-0x67 */ + 0x8C, 0xD5, 0x8C, 0xD6, 0x8C, 0xD7, 0x8C, 0xD8, /* 0x68-0x6B */ + 0x8C, 0xD9, 0x8C, 0xDA, 0xE5, 0xF8, 0xCD, 0xCD, /* 0x6C-0x6F */ + 0x8C, 0xDB, 0xC9, 0xBD, 0x8C, 0xDC, 0x8C, 0xDD, /* 0x70-0x73 */ + 0x8C, 0xDE, 0x8C, 0xDF, 0x8C, 0xE0, 0x8C, 0xE1, /* 0x74-0x77 */ + 0x8C, 0xE2, 0xD2, 0xD9, 0xE1, 0xA8, 0x8C, 0xE3, /* 0x78-0x7B */ + 0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, 0xD3, 0xEC, /* 0x7C-0x7F */ + + 0x8C, 0xE7, 0xCB, 0xEA, 0xC6, 0xF1, 0x8C, 0xE8, /* 0x80-0x83 */ + 0x8C, 0xE9, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0x84-0x87 */ + 0xE1, 0xAC, 0x8C, 0xED, 0x8C, 0xEE, 0x8C, 0xEF, /* 0x88-0x8B */ + 0xE1, 0xA7, 0xE1, 0xA9, 0x8C, 0xF0, 0x8C, 0xF1, /* 0x8C-0x8F */ + 0xE1, 0xAA, 0xE1, 0xAF, 0x8C, 0xF2, 0x8C, 0xF3, /* 0x90-0x93 */ + 0xB2, 0xED, 0x8C, 0xF4, 0xE1, 0xAB, 0xB8, 0xDA, /* 0x94-0x97 */ + 0xE1, 0xAD, 0xE1, 0xAE, 0xE1, 0xB0, 0xB5, 0xBA, /* 0x98-0x9B */ + 0xE1, 0xB1, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0x9C-0x9F */ + 0x8C, 0xF8, 0x8C, 0xF9, 0xE1, 0xB3, 0xE1, 0xB8, /* 0xA0-0xA3 */ + 0x8C, 0xFA, 0x8C, 0xFB, 0x8C, 0xFC, 0x8C, 0xFD, /* 0xA4-0xA7 */ + 0x8C, 0xFE, 0xD1, 0xD2, 0x8D, 0x40, 0xE1, 0xB6, /* 0xA8-0xAB */ + 0xE1, 0xB5, 0xC1, 0xEB, 0x8D, 0x41, 0x8D, 0x42, /* 0xAC-0xAF */ + 0x8D, 0x43, 0xE1, 0xB7, 0x8D, 0x44, 0xD4, 0xC0, /* 0xB0-0xB3 */ + 0x8D, 0x45, 0xE1, 0xB2, 0x8D, 0x46, 0xE1, 0xBA, /* 0xB4-0xB7 */ + 0xB0, 0xB6, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xB8-0xBB */ + 0x8D, 0x4A, 0xE1, 0xB4, 0x8D, 0x4B, 0xBF, 0xF9, /* 0xBC-0xBF */ + 0x8D, 0x4C, 0xE1, 0xB9, 0x8D, 0x4D, 0x8D, 0x4E, /* 0xC0-0xC3 */ + 0xE1, 0xBB, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xC4-0xC7 */ + 0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, 0xE1, 0xBE, /* 0xC8-0xCB */ + 0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xCC-0xCF */ + 0x8D, 0x59, 0x8D, 0x5A, 0xE1, 0xBC, 0x8D, 0x5B, /* 0xD0-0xD3 */ + 0x8D, 0x5C, 0x8D, 0x5D, 0x8D, 0x5E, 0x8D, 0x5F, /* 0xD4-0xD7 */ + 0x8D, 0x60, 0xD6, 0xC5, 0x8D, 0x61, 0x8D, 0x62, /* 0xD8-0xDB */ + 0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xDC-0xDF */ + 0x8D, 0x67, 0xCF, 0xBF, 0x8D, 0x68, 0x8D, 0x69, /* 0xE0-0xE3 */ + 0xE1, 0xBD, 0xE1, 0xBF, 0xC2, 0xCD, 0x8D, 0x6A, /* 0xE4-0xE7 */ + 0xB6, 0xEB, 0x8D, 0x6B, 0xD3, 0xF8, 0x8D, 0x6C, /* 0xE8-0xEB */ + 0x8D, 0x6D, 0xC7, 0xCD, 0x8D, 0x6E, 0x8D, 0x6F, /* 0xEC-0xEF */ + 0xB7, 0xE5, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xF0-0xF3 */ + 0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, 0x8D, 0x76, /* 0xF4-0xF7 */ + 0x8D, 0x77, 0x8D, 0x78, 0x8D, 0x79, 0xBE, 0xFE, /* 0xF8-0xFB */ + 0x8D, 0x7A, 0x8D, 0x7B, 0x8D, 0x7C, 0x8D, 0x7D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x8D, 0x7E, 0x8D, 0x80, 0xE1, 0xC0, 0xE1, 0xC1, /* 0x00-0x03 */ + 0x8D, 0x81, 0x8D, 0x82, 0xE1, 0xC7, 0xB3, 0xE7, /* 0x04-0x07 */ + 0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, 0x8D, 0x86, /* 0x08-0x0B */ + 0x8D, 0x87, 0x8D, 0x88, 0xC6, 0xE9, 0x8D, 0x89, /* 0x0C-0x0F */ + 0x8D, 0x8A, 0x8D, 0x8B, 0x8D, 0x8C, 0x8D, 0x8D, /* 0x10-0x13 */ + 0xB4, 0xDE, 0x8D, 0x8E, 0xD1, 0xC2, 0x8D, 0x8F, /* 0x14-0x17 */ + 0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, 0xE1, 0xC8, /* 0x18-0x1B */ + 0x8D, 0x93, 0x8D, 0x94, 0xE1, 0xC6, 0x8D, 0x95, /* 0x1C-0x1F */ + 0x8D, 0x96, 0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, /* 0x20-0x23 */ + 0xE1, 0xC5, 0x8D, 0x9A, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x24-0x27 */ + 0x8D, 0x9B, 0xB1, 0xC0, 0x8D, 0x9C, 0x8D, 0x9D, /* 0x28-0x2B */ + 0x8D, 0x9E, 0xD5, 0xB8, 0xE1, 0xC4, 0x8D, 0x9F, /* 0x2C-0x2F */ + 0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, 0x8D, 0xA3, /* 0x30-0x33 */ + 0xE1, 0xCB, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x34-0x37 */ + 0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x38-0x3B */ + 0x8D, 0xAB, 0xE1, 0xCC, 0xE1, 0xCA, 0x8D, 0xAC, /* 0x3C-0x3F */ + 0x8D, 0xAD, 0x8D, 0xAE, 0x8D, 0xAF, 0x8D, 0xB0, /* 0x40-0x43 */ + 0x8D, 0xB1, 0x8D, 0xB2, 0x8D, 0xB3, 0xEF, 0xFA, /* 0x44-0x47 */ + 0x8D, 0xB4, 0x8D, 0xB5, 0xE1, 0xD3, 0xE1, 0xD2, /* 0x48-0x4B */ + 0xC7, 0xB6, 0x8D, 0xB6, 0x8D, 0xB7, 0x8D, 0xB8, /* 0x4C-0x4F */ + 0x8D, 0xB9, 0x8D, 0xBA, 0x8D, 0xBB, 0x8D, 0xBC, /* 0x50-0x53 */ + 0x8D, 0xBD, 0x8D, 0xBE, 0x8D, 0xBF, 0x8D, 0xC0, /* 0x54-0x57 */ + 0xE1, 0xC9, 0x8D, 0xC1, 0x8D, 0xC2, 0xE1, 0xCE, /* 0x58-0x5B */ + 0x8D, 0xC3, 0xE1, 0xD0, 0x8D, 0xC4, 0x8D, 0xC5, /* 0x5C-0x5F */ + 0x8D, 0xC6, 0x8D, 0xC7, 0x8D, 0xC8, 0x8D, 0xC9, /* 0x60-0x63 */ + 0x8D, 0xCA, 0x8D, 0xCB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x64-0x67 */ + 0x8D, 0xCE, 0xE1, 0xD4, 0x8D, 0xCF, 0xE1, 0xD1, /* 0x68-0x6B */ + 0xE1, 0xCD, 0x8D, 0xD0, 0x8D, 0xD1, 0xE1, 0xCF, /* 0x6C-0x6F */ + 0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x70-0x73 */ + 0xE1, 0xD5, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x74-0x77 */ + 0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, 0x8D, 0xDC, /* 0x78-0x7B */ + 0x8D, 0xDD, 0x8D, 0xDE, 0x8D, 0xDF, 0x8D, 0xE0, /* 0x7C-0x7F */ + + 0x8D, 0xE1, 0x8D, 0xE2, 0xE1, 0xD6, 0x8D, 0xE3, /* 0x80-0x83 */ + 0x8D, 0xE4, 0x8D, 0xE5, 0x8D, 0xE6, 0x8D, 0xE7, /* 0x84-0x87 */ + 0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, 0x8D, 0xEB, /* 0x88-0x8B */ + 0x8D, 0xEC, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x8C-0x8F */ + 0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, 0x8D, 0xF3, /* 0x90-0x93 */ + 0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, 0x8D, 0xF7, /* 0x94-0x97 */ + 0x8D, 0xF8, 0xE1, 0xD7, 0x8D, 0xF9, 0x8D, 0xFA, /* 0x98-0x9B */ + 0x8D, 0xFB, 0xE1, 0xD8, 0x8D, 0xFC, 0x8D, 0xFD, /* 0x9C-0x9F */ + 0x8D, 0xFE, 0x8E, 0x40, 0x8E, 0x41, 0x8E, 0x42, /* 0xA0-0xA3 */ + 0x8E, 0x43, 0x8E, 0x44, 0x8E, 0x45, 0x8E, 0x46, /* 0xA4-0xA7 */ + 0x8E, 0x47, 0x8E, 0x48, 0x8E, 0x49, 0x8E, 0x4A, /* 0xA8-0xAB */ + 0x8E, 0x4B, 0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, /* 0xAC-0xAF */ + 0x8E, 0x4F, 0x8E, 0x50, 0x8E, 0x51, 0x8E, 0x52, /* 0xB0-0xB3 */ + 0x8E, 0x53, 0x8E, 0x54, 0x8E, 0x55, 0xE1, 0xDA, /* 0xB4-0xB7 */ + 0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, 0x8E, 0x59, /* 0xB8-0xBB */ + 0x8E, 0x5A, 0x8E, 0x5B, 0x8E, 0x5C, 0x8E, 0x5D, /* 0xBC-0xBF */ + 0x8E, 0x5E, 0x8E, 0x5F, 0x8E, 0x60, 0x8E, 0x61, /* 0xC0-0xC3 */ + 0x8E, 0x62, 0xE1, 0xDB, 0x8E, 0x63, 0x8E, 0x64, /* 0xC4-0xC7 */ + 0x8E, 0x65, 0x8E, 0x66, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */ + 0x8E, 0x69, 0xCE, 0xA1, 0x8E, 0x6A, 0x8E, 0x6B, /* 0xCC-0xCF */ + 0x8E, 0x6C, 0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, /* 0xD0-0xD3 */ + 0x8E, 0x70, 0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, /* 0xD4-0xD7 */ + 0x8E, 0x74, 0x8E, 0x75, 0x8E, 0x76, 0xE7, 0xDD, /* 0xD8-0xDB */ + 0x8E, 0x77, 0xB4, 0xA8, 0xD6, 0xDD, 0x8E, 0x78, /* 0xDC-0xDF */ + 0x8E, 0x79, 0xD1, 0xB2, 0xB3, 0xB2, 0x8E, 0x7A, /* 0xE0-0xE3 */ + 0x8E, 0x7B, 0xB9, 0xA4, 0xD7, 0xF3, 0xC7, 0xC9, /* 0xE4-0xE7 */ + 0xBE, 0xDE, 0xB9, 0xAE, 0x8E, 0x7C, 0xCE, 0xD7, /* 0xE8-0xEB */ + 0x8E, 0x7D, 0x8E, 0x7E, 0xB2, 0xEE, 0xDB, 0xCF, /* 0xEC-0xEF */ + 0x8E, 0x80, 0xBC, 0xBA, 0xD2, 0xD1, 0xCB, 0xC8, /* 0xF0-0xF3 */ + 0xB0, 0xCD, 0x8E, 0x81, 0x8E, 0x82, 0xCF, 0xEF, /* 0xF4-0xF7 */ + 0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xF8-0xFB */ + 0x8E, 0x87, 0xD9, 0xE3, 0xBD, 0xED, 0x8E, 0x88, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x8E, 0x89, 0xB1, 0xD2, 0xCA, 0xD0, 0xB2, 0xBC, /* 0x00-0x03 */ + 0x8E, 0x8A, 0xCB, 0xA7, 0xB7, 0xAB, 0x8E, 0x8B, /* 0x04-0x07 */ + 0xCA, 0xA6, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0x08-0x0B */ + 0xCF, 0xA3, 0x8E, 0x8F, 0x8E, 0x90, 0xE0, 0xF8, /* 0x0C-0x0F */ + 0xD5, 0xCA, 0xE0, 0xFB, 0x8E, 0x91, 0x8E, 0x92, /* 0x10-0x13 */ + 0xE0, 0xFA, 0xC5, 0xC1, 0xCC, 0xFB, 0x8E, 0x93, /* 0x14-0x17 */ + 0xC1, 0xB1, 0xE0, 0xF9, 0xD6, 0xE3, 0xB2, 0xAF, /* 0x18-0x1B */ + 0xD6, 0xC4, 0xB5, 0xDB, 0x8E, 0x94, 0x8E, 0x95, /* 0x1C-0x1F */ + 0x8E, 0x96, 0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, /* 0x20-0x23 */ + 0x8E, 0x9A, 0x8E, 0x9B, 0xB4, 0xF8, 0xD6, 0xA1, /* 0x24-0x27 */ + 0x8E, 0x9C, 0x8E, 0x9D, 0x8E, 0x9E, 0x8E, 0x9F, /* 0x28-0x2B */ + 0x8E, 0xA0, 0xCF, 0xAF, 0xB0, 0xEF, 0x8E, 0xA1, /* 0x2C-0x2F */ + 0x8E, 0xA2, 0xE0, 0xFC, 0x8E, 0xA3, 0x8E, 0xA4, /* 0x30-0x33 */ + 0x8E, 0xA5, 0x8E, 0xA6, 0x8E, 0xA7, 0xE1, 0xA1, /* 0x34-0x37 */ + 0xB3, 0xA3, 0x8E, 0xA8, 0x8E, 0xA9, 0xE0, 0xFD, /* 0x38-0x3B */ + 0xE0, 0xFE, 0xC3, 0xB1, 0x8E, 0xAA, 0x8E, 0xAB, /* 0x3C-0x3F */ + 0x8E, 0xAC, 0x8E, 0xAD, 0xC3, 0xDD, 0x8E, 0xAE, /* 0x40-0x43 */ + 0xE1, 0xA2, 0xB7, 0xF9, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x44-0x47 */ + 0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x48-0x4B */ + 0xBB, 0xCF, 0x8E, 0xB5, 0x8E, 0xB6, 0x8E, 0xB7, /* 0x4C-0x4F */ + 0x8E, 0xB8, 0x8E, 0xB9, 0x8E, 0xBA, 0x8E, 0xBB, /* 0x50-0x53 */ + 0xE1, 0xA3, 0xC4, 0xBB, 0x8E, 0xBC, 0x8E, 0xBD, /* 0x54-0x57 */ + 0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, 0xE1, 0xA4, /* 0x58-0x5B */ + 0x8E, 0xC1, 0x8E, 0xC2, 0xE1, 0xA5, 0x8E, 0xC3, /* 0x5C-0x5F */ + 0x8E, 0xC4, 0xE1, 0xA6, 0xB4, 0xB1, 0x8E, 0xC5, /* 0x60-0x63 */ + 0x8E, 0xC6, 0x8E, 0xC7, 0x8E, 0xC8, 0x8E, 0xC9, /* 0x64-0x67 */ + 0x8E, 0xCA, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x68-0x6B */ + 0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x6C-0x6F */ + 0x8E, 0xD2, 0x8E, 0xD3, 0xB8, 0xC9, 0xC6, 0xBD, /* 0x70-0x73 */ + 0xC4, 0xEA, 0x8E, 0xD4, 0xB2, 0xA2, 0x8E, 0xD5, /* 0x74-0x77 */ + 0xD0, 0xD2, 0x8E, 0xD6, 0xE7, 0xDB, 0xBB, 0xC3, /* 0x78-0x7B */ + 0xD3, 0xD7, 0xD3, 0xC4, 0x8E, 0xD7, 0xB9, 0xE3, /* 0x7C-0x7F */ + + 0xE2, 0xCF, 0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, /* 0x80-0x83 */ + 0xD7, 0xAF, 0x8E, 0xDB, 0xC7, 0xEC, 0xB1, 0xD3, /* 0x84-0x87 */ + 0x8E, 0xDC, 0x8E, 0xDD, 0xB4, 0xB2, 0xE2, 0xD1, /* 0x88-0x8B */ + 0x8E, 0xDE, 0x8E, 0xDF, 0x8E, 0xE0, 0xD0, 0xF2, /* 0x8C-0x8F */ + 0xC2, 0xAE, 0xE2, 0xD0, 0x8E, 0xE1, 0xBF, 0xE2, /* 0x90-0x93 */ + 0xD3, 0xA6, 0xB5, 0xD7, 0xE2, 0xD2, 0xB5, 0xEA, /* 0x94-0x97 */ + 0x8E, 0xE2, 0xC3, 0xED, 0xB8, 0xFD, 0x8E, 0xE3, /* 0x98-0x9B */ + 0xB8, 0xAE, 0x8E, 0xE4, 0xC5, 0xD3, 0xB7, 0xCF, /* 0x9C-0x9F */ + 0xE2, 0xD4, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0xA0-0xA3 */ + 0x8E, 0xE8, 0xE2, 0xD3, 0xB6, 0xC8, 0xD7, 0xF9, /* 0xA4-0xA7 */ + 0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, 0x8E, 0xEC, /* 0xA8-0xAB */ + 0x8E, 0xED, 0xCD, 0xA5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0xAC-0xAF */ + 0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0xE2, 0xD8, /* 0xB0-0xB3 */ + 0x8E, 0xF3, 0xE2, 0xD6, 0xCA, 0xFC, 0xBF, 0xB5, /* 0xB4-0xB7 */ + 0xD3, 0xB9, 0xE2, 0xD5, 0x8E, 0xF4, 0x8E, 0xF5, /* 0xB8-0xBB */ + 0x8E, 0xF6, 0x8E, 0xF7, 0xE2, 0xD7, 0x8E, 0xF8, /* 0xBC-0xBF */ + 0x8E, 0xF9, 0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, /* 0xC0-0xC3 */ + 0x8E, 0xFD, 0x8E, 0xFE, 0x8F, 0x40, 0x8F, 0x41, /* 0xC4-0xC7 */ + 0x8F, 0x42, 0xC1, 0xAE, 0xC0, 0xC8, 0x8F, 0x43, /* 0xC8-0xCB */ + 0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0xCC-0xCF */ + 0x8F, 0x48, 0xE2, 0xDB, 0xE2, 0xDA, 0xC0, 0xAA, /* 0xD0-0xD3 */ + 0x8F, 0x49, 0x8F, 0x4A, 0xC1, 0xCE, 0x8F, 0x4B, /* 0xD4-0xD7 */ + 0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, 0xE2, 0xDC, /* 0xD8-0xDB */ + 0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0xDC-0xDF */ + 0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0xE0-0xE3 */ + 0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0xE4-0xE7 */ + 0xE2, 0xDD, 0x8F, 0x5B, 0xE2, 0xDE, 0x8F, 0x5C, /* 0xE8-0xEB */ + 0x8F, 0x5D, 0x8F, 0x5E, 0x8F, 0x5F, 0x8F, 0x60, /* 0xEC-0xEF */ + 0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xF0-0xF3 */ + 0xDB, 0xC8, 0x8F, 0x65, 0xD1, 0xD3, 0xCD, 0xA2, /* 0xF4-0xF7 */ + 0x8F, 0x66, 0x8F, 0x67, 0xBD, 0xA8, 0x8F, 0x68, /* 0xF8-0xFB */ + 0x8F, 0x69, 0x8F, 0x6A, 0xDE, 0xC3, 0xD8, 0xA5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0xBF, 0xAA, 0xDB, 0xCD, 0xD2, 0xEC, 0xC6, 0xFA, /* 0x00-0x03 */ + 0xC5, 0xAA, 0x8F, 0x6B, 0x8F, 0x6C, 0x8F, 0x6D, /* 0x04-0x07 */ + 0xDE, 0xC4, 0x8F, 0x6E, 0xB1, 0xD7, 0xDF, 0xAE, /* 0x08-0x0B */ + 0x8F, 0x6F, 0x8F, 0x70, 0x8F, 0x71, 0xCA, 0xBD, /* 0x0C-0x0F */ + 0x8F, 0x72, 0xDF, 0xB1, 0x8F, 0x73, 0xB9, 0xAD, /* 0x10-0x13 */ + 0x8F, 0x74, 0xD2, 0xFD, 0x8F, 0x75, 0xB8, 0xA5, /* 0x14-0x17 */ + 0xBA, 0xEB, 0x8F, 0x76, 0x8F, 0x77, 0xB3, 0xDA, /* 0x18-0x1B */ + 0x8F, 0x78, 0x8F, 0x79, 0x8F, 0x7A, 0xB5, 0xDC, /* 0x1C-0x1F */ + 0xD5, 0xC5, 0x8F, 0x7B, 0x8F, 0x7C, 0x8F, 0x7D, /* 0x20-0x23 */ + 0x8F, 0x7E, 0xC3, 0xD6, 0xCF, 0xD2, 0xBB, 0xA1, /* 0x24-0x27 */ + 0x8F, 0x80, 0xE5, 0xF3, 0xE5, 0xF2, 0x8F, 0x81, /* 0x28-0x2B */ + 0x8F, 0x82, 0xE5, 0xF4, 0x8F, 0x83, 0xCD, 0xE4, /* 0x2C-0x2F */ + 0x8F, 0x84, 0xC8, 0xF5, 0x8F, 0x85, 0x8F, 0x86, /* 0x30-0x33 */ + 0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0x34-0x37 */ + 0x8F, 0x8B, 0xB5, 0xAF, 0xC7, 0xBF, 0x8F, 0x8C, /* 0x38-0x3B */ + 0xE5, 0xF6, 0x8F, 0x8D, 0x8F, 0x8E, 0x8F, 0x8F, /* 0x3C-0x3F */ + 0xEC, 0xB0, 0x8F, 0x90, 0x8F, 0x91, 0x8F, 0x92, /* 0x40-0x43 */ + 0x8F, 0x93, 0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, /* 0x44-0x47 */ + 0x8F, 0x97, 0x8F, 0x98, 0x8F, 0x99, 0x8F, 0x9A, /* 0x48-0x4B */ + 0x8F, 0x9B, 0x8F, 0x9C, 0x8F, 0x9D, 0x8F, 0x9E, /* 0x4C-0x4F */ + 0xE5, 0xE6, 0x8F, 0x9F, 0xB9, 0xE9, 0xB5, 0xB1, /* 0x50-0x53 */ + 0x8F, 0xA0, 0xC2, 0xBC, 0xE5, 0xE8, 0xE5, 0xE7, /* 0x54-0x57 */ + 0xE5, 0xE9, 0x8F, 0xA1, 0x8F, 0xA2, 0x8F, 0xA3, /* 0x58-0x5B */ + 0x8F, 0xA4, 0xD2, 0xCD, 0x8F, 0xA5, 0x8F, 0xA6, /* 0x5C-0x5F */ + 0x8F, 0xA7, 0xE1, 0xEA, 0xD0, 0xCE, 0x8F, 0xA8, /* 0x60-0x63 */ + 0xCD, 0xAE, 0x8F, 0xA9, 0xD1, 0xE5, 0x8F, 0xAA, /* 0x64-0x67 */ + 0x8F, 0xAB, 0xB2, 0xCA, 0xB1, 0xEB, 0x8F, 0xAC, /* 0x68-0x6B */ + 0xB1, 0xF2, 0xC5, 0xED, 0x8F, 0xAD, 0x8F, 0xAE, /* 0x6C-0x6F */ + 0xD5, 0xC3, 0xD3, 0xB0, 0x8F, 0xAF, 0xE1, 0xDC, /* 0x70-0x73 */ + 0x8F, 0xB0, 0x8F, 0xB1, 0x8F, 0xB2, 0xE1, 0xDD, /* 0x74-0x77 */ + 0x8F, 0xB3, 0xD2, 0xDB, 0x8F, 0xB4, 0xB3, 0xB9, /* 0x78-0x7B */ + 0xB1, 0xCB, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x7C-0x7F */ + + 0xCD, 0xF9, 0xD5, 0xF7, 0xE1, 0xDE, 0x8F, 0xB8, /* 0x80-0x83 */ + 0xBE, 0xB6, 0xB4, 0xFD, 0x8F, 0xB9, 0xE1, 0xDF, /* 0x84-0x87 */ + 0xBA, 0xDC, 0xE1, 0xE0, 0xBB, 0xB2, 0xC2, 0xC9, /* 0x88-0x8B */ + 0xE1, 0xE1, 0x8F, 0xBA, 0x8F, 0xBB, 0x8F, 0xBC, /* 0x8C-0x8F */ + 0xD0, 0xEC, 0x8F, 0xBD, 0xCD, 0xBD, 0x8F, 0xBE, /* 0x90-0x93 */ + 0x8F, 0xBF, 0xE1, 0xE2, 0x8F, 0xC0, 0xB5, 0xC3, /* 0x94-0x97 */ + 0xC5, 0xC7, 0xE1, 0xE3, 0x8F, 0xC1, 0x8F, 0xC2, /* 0x98-0x9B */ + 0xE1, 0xE4, 0x8F, 0xC3, 0x8F, 0xC4, 0x8F, 0xC5, /* 0x9C-0x9F */ + 0x8F, 0xC6, 0xD3, 0xF9, 0x8F, 0xC7, 0x8F, 0xC8, /* 0xA0-0xA3 */ + 0x8F, 0xC9, 0x8F, 0xCA, 0x8F, 0xCB, 0x8F, 0xCC, /* 0xA4-0xA7 */ + 0xE1, 0xE5, 0x8F, 0xCD, 0xD1, 0xAD, 0x8F, 0xCE, /* 0xA8-0xAB */ + 0x8F, 0xCF, 0xE1, 0xE6, 0xCE, 0xA2, 0x8F, 0xD0, /* 0xAC-0xAF */ + 0x8F, 0xD1, 0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, /* 0xB0-0xB3 */ + 0x8F, 0xD5, 0xE1, 0xE7, 0x8F, 0xD6, 0xB5, 0xC2, /* 0xB4-0xB7 */ + 0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, 0x8F, 0xDA, /* 0xB8-0xBB */ + 0xE1, 0xE8, 0xBB, 0xD5, 0x8F, 0xDB, 0x8F, 0xDC, /* 0xBC-0xBF */ + 0x8F, 0xDD, 0x8F, 0xDE, 0x8F, 0xDF, 0xD0, 0xC4, /* 0xC0-0xC3 */ + 0xE2, 0xE0, 0xB1, 0xD8, 0xD2, 0xE4, 0x8F, 0xE0, /* 0xC4-0xC7 */ + 0x8F, 0xE1, 0xE2, 0xE1, 0x8F, 0xE2, 0x8F, 0xE3, /* 0xC8-0xCB */ + 0xBC, 0xC9, 0xC8, 0xCC, 0x8F, 0xE4, 0xE2, 0xE3, /* 0xCC-0xCF */ + 0xEC, 0xFE, 0xEC, 0xFD, 0xDF, 0xAF, 0x8F, 0xE5, /* 0xD0-0xD3 */ + 0x8F, 0xE6, 0x8F, 0xE7, 0xE2, 0xE2, 0xD6, 0xBE, /* 0xD4-0xD7 */ + 0xCD, 0xFC, 0xC3, 0xA6, 0x8F, 0xE8, 0x8F, 0xE9, /* 0xD8-0xDB */ + 0x8F, 0xEA, 0xE3, 0xC3, 0x8F, 0xEB, 0x8F, 0xEC, /* 0xDC-0xDF */ + 0xD6, 0xD2, 0xE2, 0xE7, 0x8F, 0xED, 0x8F, 0xEE, /* 0xE0-0xE3 */ + 0xE2, 0xE8, 0x8F, 0xEF, 0x8F, 0xF0, 0xD3, 0xC7, /* 0xE4-0xE7 */ + 0x8F, 0xF1, 0x8F, 0xF2, 0xE2, 0xEC, 0xBF, 0xEC, /* 0xE8-0xEB */ + 0x8F, 0xF3, 0xE2, 0xED, 0xE2, 0xE5, 0x8F, 0xF4, /* 0xEC-0xEF */ + 0x8F, 0xF5, 0xB3, 0xC0, 0x8F, 0xF6, 0x8F, 0xF7, /* 0xF0-0xF3 */ + 0x8F, 0xF8, 0xC4, 0xEE, 0x8F, 0xF9, 0x8F, 0xFA, /* 0xF4-0xF7 */ + 0xE2, 0xEE, 0x8F, 0xFB, 0x8F, 0xFC, 0xD0, 0xC3, /* 0xF8-0xFB */ + 0x8F, 0xFD, 0xBA, 0xF6, 0xE2, 0xE9, 0xB7, 0xDE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0xBB, 0xB3, 0xCC, 0xAC, 0xCB, 0xCB, 0xE2, 0xE4, /* 0x00-0x03 */ + 0xE2, 0xE6, 0xE2, 0xEA, 0xE2, 0xEB, 0x8F, 0xFE, /* 0x04-0x07 */ + 0x90, 0x40, 0x90, 0x41, 0xE2, 0xF7, 0x90, 0x42, /* 0x08-0x0B */ + 0x90, 0x43, 0xE2, 0xF4, 0xD4, 0xF5, 0xE2, 0xF3, /* 0x0C-0x0F */ + 0x90, 0x44, 0x90, 0x45, 0xC5, 0xAD, 0x90, 0x46, /* 0x10-0x13 */ + 0xD5, 0xFA, 0xC5, 0xC2, 0xB2, 0xC0, 0x90, 0x47, /* 0x14-0x17 */ + 0x90, 0x48, 0xE2, 0xEF, 0x90, 0x49, 0xE2, 0xF2, /* 0x18-0x1B */ + 0xC1, 0xAF, 0xCB, 0xBC, 0x90, 0x4A, 0x90, 0x4B, /* 0x1C-0x1F */ + 0xB5, 0xA1, 0xE2, 0xF9, 0x90, 0x4C, 0x90, 0x4D, /* 0x20-0x23 */ + 0x90, 0x4E, 0xBC, 0xB1, 0xE2, 0xF1, 0xD0, 0xD4, /* 0x24-0x27 */ + 0xD4, 0xB9, 0xE2, 0xF5, 0xB9, 0xD6, 0xE2, 0xF6, /* 0x28-0x2B */ + 0x90, 0x4F, 0x90, 0x50, 0x90, 0x51, 0xC7, 0xD3, /* 0x2C-0x2F */ + 0x90, 0x52, 0x90, 0x53, 0x90, 0x54, 0x90, 0x55, /* 0x30-0x33 */ + 0x90, 0x56, 0xE2, 0xF0, 0x90, 0x57, 0x90, 0x58, /* 0x34-0x37 */ + 0x90, 0x59, 0x90, 0x5A, 0x90, 0x5B, 0xD7, 0xDC, /* 0x38-0x3B */ + 0xED, 0xA1, 0x90, 0x5C, 0x90, 0x5D, 0xE2, 0xF8, /* 0x3C-0x3F */ + 0x90, 0x5E, 0xED, 0xA5, 0xE2, 0xFE, 0xCA, 0xD1, /* 0x40-0x43 */ + 0x90, 0x5F, 0x90, 0x60, 0x90, 0x61, 0x90, 0x62, /* 0x44-0x47 */ + 0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0xC1, 0xB5, /* 0x48-0x4B */ + 0x90, 0x66, 0xBB, 0xD0, 0x90, 0x67, 0x90, 0x68, /* 0x4C-0x4F */ + 0xBF, 0xD6, 0x90, 0x69, 0xBA, 0xE3, 0x90, 0x6A, /* 0x50-0x53 */ + 0x90, 0x6B, 0xCB, 0xA1, 0x90, 0x6C, 0x90, 0x6D, /* 0x54-0x57 */ + 0x90, 0x6E, 0xED, 0xA6, 0xED, 0xA3, 0x90, 0x6F, /* 0x58-0x5B */ + 0x90, 0x70, 0xED, 0xA2, 0x90, 0x71, 0x90, 0x72, /* 0x5C-0x5F */ + 0x90, 0x73, 0x90, 0x74, 0xBB, 0xD6, 0xED, 0xA7, /* 0x60-0x63 */ + 0xD0, 0xF4, 0x90, 0x75, 0x90, 0x76, 0xED, 0xA4, /* 0x64-0x67 */ + 0xBA, 0xDE, 0xB6, 0xF7, 0xE3, 0xA1, 0xB6, 0xB2, /* 0x68-0x6B */ + 0xCC, 0xF1, 0xB9, 0xA7, 0x90, 0x77, 0xCF, 0xA2, /* 0x6C-0x6F */ + 0xC7, 0xA1, 0x90, 0x78, 0x90, 0x79, 0xBF, 0xD2, /* 0x70-0x73 */ + 0x90, 0x7A, 0x90, 0x7B, 0xB6, 0xF1, 0x90, 0x7C, /* 0x74-0x77 */ + 0xE2, 0xFA, 0xE2, 0xFB, 0xE2, 0xFD, 0xE2, 0xFC, /* 0x78-0x7B */ + 0xC4, 0xD5, 0xE3, 0xA2, 0x90, 0x7D, 0xD3, 0xC1, /* 0x7C-0x7F */ + + 0x90, 0x7E, 0x90, 0x80, 0x90, 0x81, 0xE3, 0xA7, /* 0x80-0x83 */ + 0xC7, 0xC4, 0x90, 0x82, 0x90, 0x83, 0x90, 0x84, /* 0x84-0x87 */ + 0x90, 0x85, 0xCF, 0xA4, 0x90, 0x86, 0x90, 0x87, /* 0x88-0x8B */ + 0xE3, 0xA9, 0xBA, 0xB7, 0x90, 0x88, 0x90, 0x89, /* 0x8C-0x8F */ + 0x90, 0x8A, 0x90, 0x8B, 0xE3, 0xA8, 0x90, 0x8C, /* 0x90-0x93 */ + 0xBB, 0xDA, 0x90, 0x8D, 0xE3, 0xA3, 0x90, 0x8E, /* 0x94-0x97 */ + 0x90, 0x8F, 0x90, 0x90, 0xE3, 0xA4, 0xE3, 0xAA, /* 0x98-0x9B */ + 0x90, 0x91, 0xE3, 0xA6, 0x90, 0x92, 0xCE, 0xF2, /* 0x9C-0x9F */ + 0xD3, 0xC6, 0x90, 0x93, 0x90, 0x94, 0xBB, 0xBC, /* 0xA0-0xA3 */ + 0x90, 0x95, 0x90, 0x96, 0xD4, 0xC3, 0x90, 0x97, /* 0xA4-0xA7 */ + 0xC4, 0xFA, 0x90, 0x98, 0x90, 0x99, 0xED, 0xA8, /* 0xA8-0xAB */ + 0xD0, 0xFC, 0xE3, 0xA5, 0x90, 0x9A, 0xC3, 0xF5, /* 0xAC-0xAF */ + 0x90, 0x9B, 0xE3, 0xAD, 0xB1, 0xAF, 0x90, 0x9C, /* 0xB0-0xB3 */ + 0xE3, 0xB2, 0x90, 0x9D, 0x90, 0x9E, 0x90, 0x9F, /* 0xB4-0xB7 */ + 0xBC, 0xC2, 0x90, 0xA0, 0x90, 0xA1, 0xE3, 0xAC, /* 0xB8-0xBB */ + 0xB5, 0xBF, 0x90, 0xA2, 0x90, 0xA3, 0x90, 0xA4, /* 0xBC-0xBF */ + 0x90, 0xA5, 0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, /* 0xC0-0xC3 */ + 0x90, 0xA9, 0xC7, 0xE9, 0xE3, 0xB0, 0x90, 0xAA, /* 0xC4-0xC7 */ + 0x90, 0xAB, 0x90, 0xAC, 0xBE, 0xAA, 0xCD, 0xEF, /* 0xC8-0xCB */ + 0x90, 0xAD, 0x90, 0xAE, 0x90, 0xAF, 0x90, 0xB0, /* 0xCC-0xCF */ + 0x90, 0xB1, 0xBB, 0xF3, 0x90, 0xB2, 0x90, 0xB3, /* 0xD0-0xD3 */ + 0x90, 0xB4, 0xCC, 0xE8, 0x90, 0xB5, 0x90, 0xB6, /* 0xD4-0xD7 */ + 0xE3, 0xAF, 0x90, 0xB7, 0xE3, 0xB1, 0x90, 0xB8, /* 0xD8-0xDB */ + 0xCF, 0xA7, 0xE3, 0xAE, 0x90, 0xB9, 0xCE, 0xA9, /* 0xDC-0xDF */ + 0xBB, 0xDD, 0x90, 0xBA, 0x90, 0xBB, 0x90, 0xBC, /* 0xE0-0xE3 */ + 0x90, 0xBD, 0x90, 0xBE, 0xB5, 0xEB, 0xBE, 0xE5, /* 0xE4-0xE7 */ + 0xB2, 0xD2, 0xB3, 0xCD, 0x90, 0xBF, 0xB1, 0xB9, /* 0xE8-0xEB */ + 0xE3, 0xAB, 0xB2, 0xD1, 0xB5, 0xAC, 0xB9, 0xDF, /* 0xEC-0xEF */ + 0xB6, 0xE8, 0x90, 0xC0, 0x90, 0xC1, 0xCF, 0xEB, /* 0xF0-0xF3 */ + 0xE3, 0xB7, 0x90, 0xC2, 0xBB, 0xCC, 0x90, 0xC3, /* 0xF4-0xF7 */ + 0x90, 0xC4, 0xC8, 0xC7, 0xD0, 0xCA, 0x90, 0xC5, /* 0xF8-0xFB */ + 0x90, 0xC6, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_61[512] = { + 0xE3, 0xB8, 0xB3, 0xEE, 0x90, 0xCA, 0x90, 0xCB, /* 0x00-0x03 */ + 0x90, 0xCC, 0x90, 0xCD, 0xED, 0xA9, 0x90, 0xCE, /* 0x04-0x07 */ + 0xD3, 0xFA, 0xD3, 0xE4, 0x90, 0xCF, 0x90, 0xD0, /* 0x08-0x0B */ + 0x90, 0xD1, 0xED, 0xAA, 0xE3, 0xB9, 0xD2, 0xE2, /* 0x0C-0x0F */ + 0x90, 0xD2, 0x90, 0xD3, 0x90, 0xD4, 0x90, 0xD5, /* 0x10-0x13 */ + 0x90, 0xD6, 0xE3, 0xB5, 0x90, 0xD7, 0x90, 0xD8, /* 0x14-0x17 */ + 0x90, 0xD9, 0x90, 0xDA, 0xD3, 0xDE, 0x90, 0xDB, /* 0x18-0x1B */ + 0x90, 0xDC, 0x90, 0xDD, 0x90, 0xDE, 0xB8, 0xD0, /* 0x1C-0x1F */ + 0xE3, 0xB3, 0x90, 0xDF, 0x90, 0xE0, 0xE3, 0xB6, /* 0x20-0x23 */ + 0xB7, 0xDF, 0x90, 0xE1, 0xE3, 0xB4, 0xC0, 0xA2, /* 0x24-0x27 */ + 0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, 0xE3, 0xBA, /* 0x28-0x2B */ + 0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x2C-0x2F */ + 0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x30-0x33 */ + 0x90, 0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x34-0x37 */ + 0x90, 0xF1, 0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x38-0x3B */ + 0x90, 0xF5, 0x90, 0xF6, 0x90, 0xF7, 0xD4, 0xB8, /* 0x3C-0x3F */ + 0x90, 0xF8, 0x90, 0xF9, 0x90, 0xFA, 0x90, 0xFB, /* 0x40-0x43 */ + 0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x40, /* 0x44-0x47 */ + 0xB4, 0xC8, 0x91, 0x41, 0xE3, 0xBB, 0x91, 0x42, /* 0x48-0x4B */ + 0xBB, 0xC5, 0x91, 0x43, 0xC9, 0xF7, 0x91, 0x44, /* 0x4C-0x4F */ + 0x91, 0x45, 0xC9, 0xE5, 0x91, 0x46, 0x91, 0x47, /* 0x50-0x53 */ + 0x91, 0x48, 0xC4, 0xBD, 0x91, 0x49, 0x91, 0x4A, /* 0x54-0x57 */ + 0x91, 0x4B, 0x91, 0x4C, 0x91, 0x4D, 0x91, 0x4E, /* 0x58-0x5B */ + 0x91, 0x4F, 0xED, 0xAB, 0x91, 0x50, 0x91, 0x51, /* 0x5C-0x5F */ + 0x91, 0x52, 0x91, 0x53, 0xC2, 0xFD, 0x91, 0x54, /* 0x60-0x63 */ + 0x91, 0x55, 0x91, 0x56, 0x91, 0x57, 0xBB, 0xDB, /* 0x64-0x67 */ + 0xBF, 0xAE, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x68-0x6B */ + 0x91, 0x5B, 0x91, 0x5C, 0x91, 0x5D, 0x91, 0x5E, /* 0x6C-0x6F */ + 0xCE, 0xBF, 0x91, 0x5F, 0x91, 0x60, 0x91, 0x61, /* 0x70-0x73 */ + 0x91, 0x62, 0xE3, 0xBC, 0x91, 0x63, 0xBF, 0xB6, /* 0x74-0x77 */ + 0x91, 0x64, 0x91, 0x65, 0x91, 0x66, 0x91, 0x67, /* 0x78-0x7B */ + 0x91, 0x68, 0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, /* 0x7C-0x7F */ + + 0x91, 0x6C, 0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, /* 0x80-0x83 */ + 0x91, 0x70, 0x91, 0x71, 0x91, 0x72, 0x91, 0x73, /* 0x84-0x87 */ + 0x91, 0x74, 0x91, 0x75, 0x91, 0x76, 0xB1, 0xEF, /* 0x88-0x8B */ + 0x91, 0x77, 0x91, 0x78, 0xD4, 0xF7, 0x91, 0x79, /* 0x8C-0x8F */ + 0x91, 0x7A, 0x91, 0x7B, 0x91, 0x7C, 0x91, 0x7D, /* 0x90-0x93 */ + 0xE3, 0xBE, 0x91, 0x7E, 0x91, 0x80, 0x91, 0x81, /* 0x94-0x97 */ + 0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x98-0x9B */ + 0x91, 0x86, 0xED, 0xAD, 0x91, 0x87, 0x91, 0x88, /* 0x9C-0x9F */ + 0x91, 0x89, 0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, /* 0xA0-0xA3 */ + 0x91, 0x8D, 0x91, 0x8E, 0x91, 0x8F, 0xE3, 0xBF, /* 0xA4-0xA7 */ + 0xBA, 0xA9, 0xED, 0xAC, 0x91, 0x90, 0x91, 0x91, /* 0xA8-0xAB */ + 0xE3, 0xBD, 0x91, 0x92, 0x91, 0x93, 0x91, 0x94, /* 0xAC-0xAF */ + 0x91, 0x95, 0x91, 0x96, 0x91, 0x97, 0x91, 0x98, /* 0xB0-0xB3 */ + 0x91, 0x99, 0x91, 0x9A, 0x91, 0x9B, 0xE3, 0xC0, /* 0xB4-0xB7 */ + 0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB8-0xBB */ + 0x91, 0xA0, 0x91, 0xA1, 0xBA, 0xB6, 0x91, 0xA2, /* 0xBC-0xBF */ + 0x91, 0xA3, 0x91, 0xA4, 0xB6, 0xAE, 0x91, 0xA5, /* 0xC0-0xC3 */ + 0x91, 0xA6, 0x91, 0xA7, 0x91, 0xA8, 0x91, 0xA9, /* 0xC4-0xC7 */ + 0xD0, 0xB8, 0x91, 0xAA, 0xB0, 0xC3, 0xED, 0xAE, /* 0xC8-0xCB */ + 0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, 0x91, 0xAE, /* 0xCC-0xCF */ + 0x91, 0xAF, 0xED, 0xAF, 0xC0, 0xC1, 0x91, 0xB0, /* 0xD0-0xD3 */ + 0xE3, 0xC1, 0x91, 0xB1, 0x91, 0xB2, 0x91, 0xB3, /* 0xD4-0xD7 */ + 0x91, 0xB4, 0x91, 0xB5, 0x91, 0xB6, 0x91, 0xB7, /* 0xD8-0xDB */ + 0x91, 0xB8, 0x91, 0xB9, 0x91, 0xBA, 0x91, 0xBB, /* 0xDC-0xDF */ + 0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xE0-0xE3 */ + 0x91, 0xC0, 0x91, 0xC1, 0xC5, 0xB3, 0x91, 0xC2, /* 0xE4-0xE7 */ + 0x91, 0xC3, 0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, /* 0xE8-0xEB */ + 0x91, 0xC7, 0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, /* 0xEC-0xEF */ + 0x91, 0xCB, 0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, /* 0xF0-0xF3 */ + 0x91, 0xCF, 0xE3, 0xC2, 0x91, 0xD0, 0x91, 0xD1, /* 0xF4-0xF7 */ + 0x91, 0xD2, 0x91, 0xD3, 0x91, 0xD4, 0x91, 0xD5, /* 0xF8-0xFB */ + 0x91, 0xD6, 0x91, 0xD7, 0x91, 0xD8, 0xDC, 0xB2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, 0x91, 0xDC, /* 0x00-0x03 */ + 0x91, 0xDD, 0x91, 0xDE, 0xED, 0xB0, 0x91, 0xDF, /* 0x04-0x07 */ + 0xB8, 0xEA, 0x91, 0xE0, 0xCE, 0xEC, 0xEA, 0xA7, /* 0x08-0x0B */ + 0xD0, 0xE7, 0xCA, 0xF9, 0xC8, 0xD6, 0xCF, 0xB7, /* 0x0C-0x0F */ + 0xB3, 0xC9, 0xCE, 0xD2, 0xBD, 0xE4, 0x91, 0xE1, /* 0x10-0x13 */ + 0x91, 0xE2, 0xE3, 0xDE, 0xBB, 0xF2, 0xEA, 0xA8, /* 0x14-0x17 */ + 0xD5, 0xBD, 0x91, 0xE3, 0xC6, 0xDD, 0xEA, 0xA9, /* 0x18-0x1B */ + 0x91, 0xE4, 0x91, 0xE5, 0x91, 0xE6, 0xEA, 0xAA, /* 0x1C-0x1F */ + 0x91, 0xE7, 0xEA, 0xAC, 0xEA, 0xAB, 0x91, 0xE8, /* 0x20-0x23 */ + 0xEA, 0xAE, 0xEA, 0xAD, 0x91, 0xE9, 0x91, 0xEA, /* 0x24-0x27 */ + 0x91, 0xEB, 0x91, 0xEC, 0xBD, 0xD8, 0x91, 0xED, /* 0x28-0x2B */ + 0xEA, 0xAF, 0x91, 0xEE, 0xC2, 0xBE, 0x91, 0xEF, /* 0x2C-0x2F */ + 0x91, 0xF0, 0x91, 0xF1, 0x91, 0xF2, 0xB4, 0xC1, /* 0x30-0x33 */ + 0xB4, 0xF7, 0x91, 0xF3, 0x91, 0xF4, 0xBB, 0xA7, /* 0x34-0x37 */ + 0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, 0x91, 0xF8, /* 0x38-0x3B */ + 0x91, 0xF9, 0xEC, 0xE6, 0xEC, 0xE5, 0xB7, 0xBF, /* 0x3C-0x3F */ + 0xCB, 0xF9, 0xB1, 0xE2, 0x91, 0xFA, 0xEC, 0xE7, /* 0x40-0x43 */ + 0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0xC9, 0xC8, /* 0x44-0x47 */ + 0xEC, 0xE8, 0xEC, 0xE9, 0x91, 0xFE, 0xCA, 0xD6, /* 0x48-0x4B */ + 0xDE, 0xD0, 0xB2, 0xC5, 0xD4, 0xFA, 0x92, 0x40, /* 0x4C-0x4F */ + 0x92, 0x41, 0xC6, 0xCB, 0xB0, 0xC7, 0xB4, 0xF2, /* 0x50-0x53 */ + 0xC8, 0xD3, 0x92, 0x42, 0x92, 0x43, 0x92, 0x44, /* 0x54-0x57 */ + 0xCD, 0xD0, 0x92, 0x45, 0x92, 0x46, 0xBF, 0xB8, /* 0x58-0x5B */ + 0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x5C-0x5F */ + 0x92, 0x4B, 0x92, 0x4C, 0x92, 0x4D, 0xBF, 0xDB, /* 0x60-0x63 */ + 0x92, 0x4E, 0x92, 0x4F, 0xC7, 0xA4, 0xD6, 0xB4, /* 0x64-0x67 */ + 0x92, 0x50, 0xC0, 0xA9, 0xDE, 0xD1, 0xC9, 0xA8, /* 0x68-0x6B */ + 0xD1, 0xEF, 0xC5, 0xA4, 0xB0, 0xE7, 0xB3, 0xB6, /* 0x6C-0x6F */ + 0xC8, 0xC5, 0x92, 0x51, 0x92, 0x52, 0xB0, 0xE2, /* 0x70-0x73 */ + 0x92, 0x53, 0x92, 0x54, 0xB7, 0xF6, 0x92, 0x55, /* 0x74-0x77 */ + 0x92, 0x56, 0xC5, 0xFA, 0x92, 0x57, 0x92, 0x58, /* 0x78-0x7B */ + 0xB6, 0xF3, 0x92, 0x59, 0xD5, 0xD2, 0xB3, 0xD0, /* 0x7C-0x7F */ + + 0xBC, 0xBC, 0x92, 0x5A, 0x92, 0x5B, 0x92, 0x5C, /* 0x80-0x83 */ + 0xB3, 0xAD, 0x92, 0x5D, 0x92, 0x5E, 0x92, 0x5F, /* 0x84-0x87 */ + 0x92, 0x60, 0xBE, 0xF1, 0xB0, 0xD1, 0x92, 0x61, /* 0x88-0x8B */ + 0x92, 0x62, 0x92, 0x63, 0x92, 0x64, 0x92, 0x65, /* 0x8C-0x8F */ + 0x92, 0x66, 0xD2, 0xD6, 0xCA, 0xE3, 0xD7, 0xA5, /* 0x90-0x93 */ + 0x92, 0x67, 0xCD, 0xB6, 0xB6, 0xB6, 0xBF, 0xB9, /* 0x94-0x97 */ + 0xD5, 0xDB, 0x92, 0x68, 0xB8, 0xA7, 0xC5, 0xD7, /* 0x98-0x9B */ + 0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, 0xDE, 0xD2, /* 0x9C-0x9F */ + 0xBF, 0xD9, 0xC2, 0xD5, 0xC7, 0xC0, 0x92, 0x6C, /* 0xA0-0xA3 */ + 0xBB, 0xA4, 0xB1, 0xA8, 0x92, 0x6D, 0x92, 0x6E, /* 0xA4-0xA7 */ + 0xC5, 0xEA, 0x92, 0x6F, 0x92, 0x70, 0xC5, 0xFB, /* 0xA8-0xAB */ + 0xCC, 0xA7, 0x92, 0x71, 0x92, 0x72, 0x92, 0x73, /* 0xAC-0xAF */ + 0x92, 0x74, 0xB1, 0xA7, 0x92, 0x75, 0x92, 0x76, /* 0xB0-0xB3 */ + 0x92, 0x77, 0xB5, 0xD6, 0x92, 0x78, 0x92, 0x79, /* 0xB4-0xB7 */ + 0x92, 0x7A, 0xC4, 0xA8, 0x92, 0x7B, 0xDE, 0xD3, /* 0xB8-0xBB */ + 0xD1, 0xBA, 0xB3, 0xE9, 0x92, 0x7C, 0xC3, 0xF2, /* 0xBC-0xBF */ + 0x92, 0x7D, 0x92, 0x7E, 0xB7, 0xF7, 0x92, 0x80, /* 0xC0-0xC3 */ + 0xD6, 0xF4, 0xB5, 0xA3, 0xB2, 0xF0, 0xC4, 0xB4, /* 0xC4-0xC7 */ + 0xC4, 0xE9, 0xC0, 0xAD, 0xDE, 0xD4, 0x92, 0x81, /* 0xC8-0xCB */ + 0xB0, 0xE8, 0xC5, 0xC4, 0xC1, 0xE0, 0x92, 0x82, /* 0xCC-0xCF */ + 0xB9, 0xD5, 0x92, 0x83, 0xBE, 0xDC, 0xCD, 0xD8, /* 0xD0-0xD3 */ + 0xB0, 0xCE, 0x92, 0x84, 0xCD, 0xCF, 0xDE, 0xD6, /* 0xD4-0xD7 */ + 0xBE, 0xD0, 0xD7, 0xBE, 0xDE, 0xD5, 0xD5, 0xD0, /* 0xD8-0xDB */ + 0xB0, 0xDD, 0x92, 0x85, 0x92, 0x86, 0xC4, 0xE2, /* 0xDC-0xDF */ + 0x92, 0x87, 0x92, 0x88, 0xC2, 0xA3, 0xBC, 0xF0, /* 0xE0-0xE3 */ + 0x92, 0x89, 0xD3, 0xB5, 0xC0, 0xB9, 0xC5, 0xA1, /* 0xE4-0xE7 */ + 0xB2, 0xA6, 0xD4, 0xF1, 0x92, 0x8A, 0x92, 0x8B, /* 0xE8-0xEB */ + 0xC0, 0xA8, 0xCA, 0xC3, 0xDE, 0xD7, 0xD5, 0xFC, /* 0xEC-0xEF */ + 0x92, 0x8C, 0xB9, 0xB0, 0x92, 0x8D, 0xC8, 0xAD, /* 0xF0-0xF3 */ + 0xCB, 0xA9, 0x92, 0x8E, 0xDE, 0xD9, 0xBF, 0xBD, /* 0xF4-0xF7 */ + 0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0xF8-0xFB */ + 0xC6, 0xB4, 0xD7, 0xA7, 0xCA, 0xB0, 0xC4, 0xC3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x92, 0x93, 0xB3, 0xD6, 0xB9, 0xD2, 0x92, 0x94, /* 0x00-0x03 */ + 0x92, 0x95, 0x92, 0x96, 0x92, 0x97, 0xD6, 0xB8, /* 0x04-0x07 */ + 0xEA, 0xFC, 0xB0, 0xB4, 0x92, 0x98, 0x92, 0x99, /* 0x08-0x0B */ + 0x92, 0x9A, 0x92, 0x9B, 0xBF, 0xE6, 0x92, 0x9C, /* 0x0C-0x0F */ + 0x92, 0x9D, 0xCC, 0xF4, 0x92, 0x9E, 0x92, 0x9F, /* 0x10-0x13 */ + 0x92, 0xA0, 0x92, 0xA1, 0xCD, 0xDA, 0x92, 0xA2, /* 0x14-0x17 */ + 0x92, 0xA3, 0x92, 0xA4, 0xD6, 0xBF, 0xC2, 0xCE, /* 0x18-0x1B */ + 0x92, 0xA5, 0xCE, 0xCE, 0xCC, 0xA2, 0xD0, 0xAE, /* 0x1C-0x1F */ + 0xC4, 0xD3, 0xB5, 0xB2, 0xDE, 0xD8, 0xD5, 0xF5, /* 0x20-0x23 */ + 0xBC, 0xB7, 0xBB, 0xD3, 0x92, 0xA6, 0x92, 0xA7, /* 0x24-0x27 */ + 0xB0, 0xA4, 0x92, 0xA8, 0xC5, 0xB2, 0xB4, 0xEC, /* 0x28-0x2B */ + 0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, 0xD5, 0xF1, /* 0x2C-0x2F */ + 0x92, 0xAC, 0x92, 0xAD, 0xEA, 0xFD, 0x92, 0xAE, /* 0x30-0x33 */ + 0x92, 0xAF, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0x34-0x37 */ + 0x92, 0xB3, 0xDE, 0xDA, 0xCD, 0xA6, 0x92, 0xB4, /* 0x38-0x3B */ + 0x92, 0xB5, 0xCD, 0xEC, 0x92, 0xB6, 0x92, 0xB7, /* 0x3C-0x3F */ + 0x92, 0xB8, 0x92, 0xB9, 0xCE, 0xE6, 0xDE, 0xDC, /* 0x40-0x43 */ + 0x92, 0xBA, 0xCD, 0xB1, 0xC0, 0xA6, 0x92, 0xBB, /* 0x44-0x47 */ + 0x92, 0xBC, 0xD7, 0xBD, 0x92, 0xBD, 0xDE, 0xDB, /* 0x48-0x4B */ + 0xB0, 0xC6, 0xBA, 0xB4, 0xC9, 0xD3, 0xC4, 0xF3, /* 0x4C-0x4F */ + 0xBE, 0xE8, 0x92, 0xBE, 0x92, 0xBF, 0x92, 0xC0, /* 0x50-0x53 */ + 0x92, 0xC1, 0xB2, 0xB6, 0x92, 0xC2, 0x92, 0xC3, /* 0x54-0x57 */ + 0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, 0x92, 0xC7, /* 0x58-0x5B */ + 0x92, 0xC8, 0x92, 0xC9, 0xC0, 0xCC, 0xCB, 0xF0, /* 0x5C-0x5F */ + 0x92, 0xCA, 0xBC, 0xF1, 0xBB, 0xBB, 0xB5, 0xB7, /* 0x60-0x63 */ + 0x92, 0xCB, 0x92, 0xCC, 0x92, 0xCD, 0xC5, 0xF5, /* 0x64-0x67 */ + 0x92, 0xCE, 0xDE, 0xE6, 0x92, 0xCF, 0x92, 0xD0, /* 0x68-0x6B */ + 0x92, 0xD1, 0xDE, 0xE3, 0xBE, 0xDD, 0x92, 0xD2, /* 0x6C-0x6F */ + 0x92, 0xD3, 0xDE, 0xDF, 0x92, 0xD4, 0x92, 0xD5, /* 0x70-0x73 */ + 0x92, 0xD6, 0x92, 0xD7, 0xB4, 0xB7, 0xBD, 0xDD, /* 0x74-0x77 */ + 0x92, 0xD8, 0x92, 0xD9, 0xDE, 0xE0, 0xC4, 0xED, /* 0x78-0x7B */ + 0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0x7C-0x7F */ + + 0xCF, 0xC6, 0x92, 0xDE, 0xB5, 0xE0, 0x92, 0xDF, /* 0x80-0x83 */ + 0x92, 0xE0, 0x92, 0xE1, 0x92, 0xE2, 0xB6, 0xDE, /* 0x84-0x87 */ + 0xCA, 0xDA, 0xB5, 0xF4, 0xDE, 0xE5, 0x92, 0xE3, /* 0x88-0x8B */ + 0xD5, 0xC6, 0x92, 0xE4, 0xDE, 0xE1, 0xCC, 0xCD, /* 0x8C-0x8F */ + 0xC6, 0xFE, 0x92, 0xE5, 0xC5, 0xC5, 0x92, 0xE6, /* 0x90-0x93 */ + 0x92, 0xE7, 0x92, 0xE8, 0xD2, 0xB4, 0x92, 0xE9, /* 0x94-0x97 */ + 0xBE, 0xF2, 0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, /* 0x98-0x9B */ + 0x92, 0xED, 0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, /* 0x9C-0x9F */ + 0xC2, 0xD3, 0x92, 0xF1, 0xCC, 0xBD, 0xB3, 0xB8, /* 0xA0-0xA3 */ + 0x92, 0xF2, 0xBD, 0xD3, 0x92, 0xF3, 0xBF, 0xD8, /* 0xA4-0xA7 */ + 0xCD, 0xC6, 0xD1, 0xDA, 0xB4, 0xEB, 0x92, 0xF4, /* 0xA8-0xAB */ + 0xDE, 0xE4, 0xDE, 0xDD, 0xDE, 0xE7, 0x92, 0xF5, /* 0xAC-0xAF */ + 0xEA, 0xFE, 0x92, 0xF6, 0x92, 0xF7, 0xC2, 0xB0, /* 0xB0-0xB3 */ + 0xDE, 0xE2, 0x92, 0xF8, 0x92, 0xF9, 0xD6, 0xC0, /* 0xB4-0xB7 */ + 0xB5, 0xA7, 0x92, 0xFA, 0xB2, 0xF4, 0x92, 0xFB, /* 0xB8-0xBB */ + 0xDE, 0xE8, 0x92, 0xFC, 0xDE, 0xF2, 0x92, 0xFD, /* 0xBC-0xBF */ + 0x92, 0xFE, 0x93, 0x40, 0x93, 0x41, 0x93, 0x42, /* 0xC0-0xC3 */ + 0xDE, 0xED, 0x93, 0x43, 0xDE, 0xF1, 0x93, 0x44, /* 0xC4-0xC7 */ + 0x93, 0x45, 0xC8, 0xE0, 0x93, 0x46, 0x93, 0x47, /* 0xC8-0xCB */ + 0x93, 0x48, 0xD7, 0xE1, 0xDE, 0xEF, 0xC3, 0xE8, /* 0xCC-0xCF */ + 0xCC, 0xE1, 0x93, 0x49, 0xB2, 0xE5, 0x93, 0x4A, /* 0xD0-0xD3 */ + 0x93, 0x4B, 0x93, 0x4C, 0xD2, 0xBE, 0x93, 0x4D, /* 0xD4-0xD7 */ + 0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, 0x93, 0x51, /* 0xD8-0xDB */ + 0x93, 0x52, 0x93, 0x53, 0xDE, 0xEE, 0x93, 0x54, /* 0xDC-0xDF */ + 0xDE, 0xEB, 0xCE, 0xD5, 0x93, 0x55, 0xB4, 0xA7, /* 0xE0-0xE3 */ + 0x93, 0x56, 0x93, 0x57, 0x93, 0x58, 0x93, 0x59, /* 0xE4-0xE7 */ + 0x93, 0x5A, 0xBF, 0xAB, 0xBE, 0xBE, 0x93, 0x5B, /* 0xE8-0xEB */ + 0x93, 0x5C, 0xBD, 0xD2, 0x93, 0x5D, 0x93, 0x5E, /* 0xEC-0xEF */ + 0x93, 0x5F, 0x93, 0x60, 0xDE, 0xE9, 0x93, 0x61, /* 0xF0-0xF3 */ + 0xD4, 0xAE, 0x93, 0x62, 0xDE, 0xDE, 0x93, 0x63, /* 0xF4-0xF7 */ + 0xDE, 0xEA, 0x93, 0x64, 0x93, 0x65, 0x93, 0x66, /* 0xF8-0xFB */ + 0x93, 0x67, 0xC0, 0xBF, 0x93, 0x68, 0xDE, 0xEC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_64[512] = { + 0xB2, 0xF3, 0xB8, 0xE9, 0xC2, 0xA7, 0x93, 0x69, /* 0x00-0x03 */ + 0x93, 0x6A, 0xBD, 0xC1, 0x93, 0x6B, 0x93, 0x6C, /* 0x04-0x07 */ + 0x93, 0x6D, 0x93, 0x6E, 0x93, 0x6F, 0xDE, 0xF5, /* 0x08-0x0B */ + 0xDE, 0xF8, 0x93, 0x70, 0x93, 0x71, 0xB2, 0xAB, /* 0x0C-0x0F */ + 0xB4, 0xA4, 0x93, 0x72, 0x93, 0x73, 0xB4, 0xEA, /* 0x10-0x13 */ + 0xC9, 0xA6, 0x93, 0x74, 0x93, 0x75, 0x93, 0x76, /* 0x14-0x17 */ + 0x93, 0x77, 0x93, 0x78, 0x93, 0x79, 0xDE, 0xF6, /* 0x18-0x1B */ + 0xCB, 0xD1, 0x93, 0x7A, 0xB8, 0xE3, 0x93, 0x7B, /* 0x1C-0x1F */ + 0xDE, 0xF7, 0xDE, 0xFA, 0x93, 0x7C, 0x93, 0x7D, /* 0x20-0x23 */ + 0x93, 0x7E, 0x93, 0x80, 0xDE, 0xF9, 0x93, 0x81, /* 0x24-0x27 */ + 0x93, 0x82, 0x93, 0x83, 0xCC, 0xC2, 0x93, 0x84, /* 0x28-0x2B */ + 0xB0, 0xE1, 0xB4, 0xEE, 0x93, 0x85, 0x93, 0x86, /* 0x2C-0x2F */ + 0x93, 0x87, 0x93, 0x88, 0x93, 0x89, 0x93, 0x8A, /* 0x30-0x33 */ + 0xE5, 0xBA, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x34-0x37 */ + 0x93, 0x8E, 0x93, 0x8F, 0xD0, 0xAF, 0x93, 0x90, /* 0x38-0x3B */ + 0x93, 0x91, 0xB2, 0xEB, 0x93, 0x92, 0xEB, 0xA1, /* 0x3C-0x3F */ + 0x93, 0x93, 0xDE, 0xF4, 0x93, 0x94, 0x93, 0x95, /* 0x40-0x43 */ + 0xC9, 0xE3, 0xDE, 0xF3, 0xB0, 0xDA, 0xD2, 0xA1, /* 0x44-0x47 */ + 0xB1, 0xF7, 0x93, 0x96, 0xCC, 0xAF, 0x93, 0x97, /* 0x48-0x4B */ + 0x93, 0x98, 0x93, 0x99, 0x93, 0x9A, 0x93, 0x9B, /* 0x4C-0x4F */ + 0x93, 0x9C, 0x93, 0x9D, 0xDE, 0xF0, 0x93, 0x9E, /* 0x50-0x53 */ + 0xCB, 0xA4, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x54-0x57 */ + 0xD5, 0xAA, 0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, /* 0x58-0x5B */ + 0x93, 0xA5, 0x93, 0xA6, 0xDE, 0xFB, 0x93, 0xA7, /* 0x5C-0x5F */ + 0x93, 0xA8, 0x93, 0xA9, 0x93, 0xAA, 0x93, 0xAB, /* 0x60-0x63 */ + 0x93, 0xAC, 0x93, 0xAD, 0x93, 0xAE, 0xB4, 0xDD, /* 0x64-0x67 */ + 0x93, 0xAF, 0xC4, 0xA6, 0x93, 0xB0, 0x93, 0xB1, /* 0x68-0x6B */ + 0x93, 0xB2, 0xDE, 0xFD, 0x93, 0xB3, 0x93, 0xB4, /* 0x6C-0x6F */ + 0x93, 0xB5, 0x93, 0xB6, 0x93, 0xB7, 0x93, 0xB8, /* 0x70-0x73 */ + 0x93, 0xB9, 0x93, 0xBA, 0x93, 0xBB, 0x93, 0xBC, /* 0x74-0x77 */ + 0xC3, 0xFE, 0xC4, 0xA1, 0xDF, 0xA1, 0x93, 0xBD, /* 0x78-0x7B */ + 0x93, 0xBE, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0x7C-0x7F */ + + 0x93, 0xC2, 0x93, 0xC3, 0xC1, 0xCC, 0x93, 0xC4, /* 0x80-0x83 */ + 0xDE, 0xFC, 0xBE, 0xEF, 0x93, 0xC5, 0xC6, 0xB2, /* 0x84-0x87 */ + 0x93, 0xC6, 0x93, 0xC7, 0x93, 0xC8, 0x93, 0xC9, /* 0x88-0x8B */ + 0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, 0x93, 0xCD, /* 0x8C-0x8F */ + 0x93, 0xCE, 0xB3, 0xC5, 0xC8, 0xF6, 0x93, 0xCF, /* 0x90-0x93 */ + 0x93, 0xD0, 0xCB, 0xBA, 0xDE, 0xFE, 0x93, 0xD1, /* 0x94-0x97 */ + 0x93, 0xD2, 0xDF, 0xA4, 0x93, 0xD3, 0x93, 0xD4, /* 0x98-0x9B */ + 0x93, 0xD5, 0x93, 0xD6, 0xD7, 0xB2, 0x93, 0xD7, /* 0x9C-0x9F */ + 0x93, 0xD8, 0x93, 0xD9, 0x93, 0xDA, 0x93, 0xDB, /* 0xA0-0xA3 */ + 0xB3, 0xB7, 0x93, 0xDC, 0x93, 0xDD, 0x93, 0xDE, /* 0xA4-0xA7 */ + 0x93, 0xDF, 0xC1, 0xC3, 0x93, 0xE0, 0x93, 0xE1, /* 0xA8-0xAB */ + 0xC7, 0xCB, 0xB2, 0xA5, 0xB4, 0xE9, 0x93, 0xE2, /* 0xAC-0xAF */ + 0xD7, 0xAB, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xB0-0xB3 */ + 0x93, 0xE6, 0xC4, 0xEC, 0x93, 0xE7, 0xDF, 0xA2, /* 0xB4-0xB7 */ + 0xDF, 0xA3, 0x93, 0xE8, 0xDF, 0xA5, 0x93, 0xE9, /* 0xB8-0xBB */ + 0xBA, 0xB3, 0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, /* 0xBC-0xBF */ + 0xDF, 0xA6, 0x93, 0xED, 0xC0, 0xDE, 0x93, 0xEE, /* 0xC0-0xC3 */ + 0x93, 0xEF, 0xC9, 0xC3, 0x93, 0xF0, 0x93, 0xF1, /* 0xC4-0xC7 */ + 0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xC8-0xCB */ + 0x93, 0xF6, 0xB2, 0xD9, 0xC7, 0xE6, 0x93, 0xF7, /* 0xCC-0xCF */ + 0xDF, 0xA7, 0x93, 0xF8, 0xC7, 0xDC, 0x93, 0xF9, /* 0xD0-0xD3 */ + 0x93, 0xFA, 0x93, 0xFB, 0x93, 0xFC, 0xDF, 0xA8, /* 0xD4-0xD7 */ + 0xEB, 0xA2, 0x93, 0xFD, 0x93, 0xFE, 0x94, 0x40, /* 0xD8-0xDB */ + 0x94, 0x41, 0x94, 0x42, 0xCB, 0xD3, 0x94, 0x43, /* 0xDC-0xDF */ + 0x94, 0x44, 0x94, 0x45, 0xDF, 0xAA, 0x94, 0x46, /* 0xE0-0xE3 */ + 0xDF, 0xA9, 0x94, 0x47, 0xB2, 0xC1, 0x94, 0x48, /* 0xE4-0xE7 */ + 0x94, 0x49, 0x94, 0x4A, 0x94, 0x4B, 0x94, 0x4C, /* 0xE8-0xEB */ + 0x94, 0x4D, 0x94, 0x4E, 0x94, 0x4F, 0x94, 0x50, /* 0xEC-0xEF */ + 0x94, 0x51, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0xF0-0xF3 */ + 0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0xF4-0xF7 */ + 0x94, 0x59, 0x94, 0x5A, 0x94, 0x5B, 0x94, 0x5C, /* 0xF8-0xFB */ + 0x94, 0x5D, 0x94, 0x5E, 0x94, 0x5F, 0x94, 0x60, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xC5, 0xCA, 0x94, 0x61, 0x94, 0x62, 0x94, 0x63, /* 0x00-0x03 */ + 0x94, 0x64, 0x94, 0x65, 0x94, 0x66, 0x94, 0x67, /* 0x04-0x07 */ + 0x94, 0x68, 0xDF, 0xAB, 0x94, 0x69, 0x94, 0x6A, /* 0x08-0x0B */ + 0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, 0x94, 0x6E, /* 0x0C-0x0F */ + 0x94, 0x6F, 0x94, 0x70, 0xD4, 0xDC, 0x94, 0x71, /* 0x10-0x13 */ + 0x94, 0x72, 0x94, 0x73, 0x94, 0x74, 0x94, 0x75, /* 0x14-0x17 */ + 0xC8, 0xC1, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x18-0x1B */ + 0x94, 0x79, 0x94, 0x7A, 0x94, 0x7B, 0x94, 0x7C, /* 0x1C-0x1F */ + 0x94, 0x7D, 0x94, 0x7E, 0x94, 0x80, 0x94, 0x81, /* 0x20-0x23 */ + 0x94, 0x82, 0xDF, 0xAC, 0x94, 0x83, 0x94, 0x84, /* 0x24-0x27 */ + 0x94, 0x85, 0x94, 0x86, 0x94, 0x87, 0xBE, 0xF0, /* 0x28-0x2B */ + 0x94, 0x88, 0x94, 0x89, 0xDF, 0xAD, 0xD6, 0xA7, /* 0x2C-0x2F */ + 0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x30-0x33 */ + 0xEA, 0xB7, 0xEB, 0xB6, 0xCA, 0xD5, 0x94, 0x8E, /* 0x34-0x37 */ + 0xD8, 0xFC, 0xB8, 0xC4, 0x94, 0x8F, 0xB9, 0xA5, /* 0x38-0x3B */ + 0x94, 0x90, 0x94, 0x91, 0xB7, 0xC5, 0xD5, 0xFE, /* 0x3C-0x3F */ + 0x94, 0x92, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x40-0x43 */ + 0x94, 0x96, 0xB9, 0xCA, 0x94, 0x97, 0x94, 0x98, /* 0x44-0x47 */ + 0xD0, 0xA7, 0xF4, 0xCD, 0x94, 0x99, 0x94, 0x9A, /* 0x48-0x4B */ + 0xB5, 0xD0, 0x94, 0x9B, 0x94, 0x9C, 0xC3, 0xF4, /* 0x4C-0x4F */ + 0x94, 0x9D, 0xBE, 0xC8, 0x94, 0x9E, 0x94, 0x9F, /* 0x50-0x53 */ + 0x94, 0xA0, 0xEB, 0xB7, 0xB0, 0xBD, 0x94, 0xA1, /* 0x54-0x57 */ + 0x94, 0xA2, 0xBD, 0xCC, 0x94, 0xA3, 0xC1, 0xB2, /* 0x58-0x5B */ + 0x94, 0xA4, 0xB1, 0xD6, 0xB3, 0xA8, 0x94, 0xA5, /* 0x5C-0x5F */ + 0x94, 0xA6, 0x94, 0xA7, 0xB8, 0xD2, 0xC9, 0xA2, /* 0x60-0x63 */ + 0x94, 0xA8, 0x94, 0xA9, 0xB6, 0xD8, 0x94, 0xAA, /* 0x64-0x67 */ + 0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, 0xEB, 0xB8, /* 0x68-0x6B */ + 0xBE, 0xB4, 0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, /* 0x6C-0x6F */ + 0xCA, 0xFD, 0x94, 0xB1, 0xC7, 0xC3, 0x94, 0xB2, /* 0x70-0x73 */ + 0xD5, 0xFB, 0x94, 0xB3, 0x94, 0xB4, 0xB7, 0xF3, /* 0x74-0x77 */ + 0x94, 0xB5, 0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, /* 0x78-0x7B */ + 0x94, 0xB9, 0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, /* 0x7C-0x7F */ + + 0x94, 0xBD, 0x94, 0xBE, 0x94, 0xBF, 0x94, 0xC0, /* 0x80-0x83 */ + 0x94, 0xC1, 0x94, 0xC2, 0x94, 0xC3, 0xCE, 0xC4, /* 0x84-0x87 */ + 0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, 0xD5, 0xAB, /* 0x88-0x8B */ + 0xB1, 0xF3, 0x94, 0xC7, 0x94, 0xC8, 0x94, 0xC9, /* 0x8C-0x8F */ + 0xEC, 0xB3, 0xB0, 0xDF, 0x94, 0xCA, 0xEC, 0xB5, /* 0x90-0x93 */ + 0x94, 0xCB, 0x94, 0xCC, 0x94, 0xCD, 0xB6, 0xB7, /* 0x94-0x97 */ + 0x94, 0xCE, 0xC1, 0xCF, 0x94, 0xCF, 0xF5, 0xFA, /* 0x98-0x9B */ + 0xD0, 0xB1, 0x94, 0xD0, 0x94, 0xD1, 0xD5, 0xE5, /* 0x9C-0x9F */ + 0x94, 0xD2, 0xCE, 0xD3, 0x94, 0xD3, 0x94, 0xD4, /* 0xA0-0xA3 */ + 0xBD, 0xEF, 0xB3, 0xE2, 0x94, 0xD5, 0xB8, 0xAB, /* 0xA4-0xA7 */ + 0x94, 0xD6, 0xD5, 0xB6, 0x94, 0xD7, 0xED, 0xBD, /* 0xA8-0xAB */ + 0x94, 0xD8, 0xB6, 0xCF, 0x94, 0xD9, 0xCB, 0xB9, /* 0xAC-0xAF */ + 0xD0, 0xC2, 0x94, 0xDA, 0x94, 0xDB, 0x94, 0xDC, /* 0xB0-0xB3 */ + 0x94, 0xDD, 0x94, 0xDE, 0x94, 0xDF, 0x94, 0xE0, /* 0xB4-0xB7 */ + 0x94, 0xE1, 0xB7, 0xBD, 0x94, 0xE2, 0x94, 0xE3, /* 0xB8-0xBB */ + 0xEC, 0xB6, 0xCA, 0xA9, 0x94, 0xE4, 0x94, 0xE5, /* 0xBC-0xBF */ + 0x94, 0xE6, 0xC5, 0xD4, 0x94, 0xE7, 0xEC, 0xB9, /* 0xC0-0xC3 */ + 0xEC, 0xB8, 0xC2, 0xC3, 0xEC, 0xB7, 0x94, 0xE8, /* 0xC4-0xC7 */ + 0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0xD0, 0xFD, /* 0xC8-0xCB */ + 0xEC, 0xBA, 0x94, 0xEC, 0xEC, 0xBB, 0xD7, 0xE5, /* 0xCC-0xCF */ + 0x94, 0xED, 0x94, 0xEE, 0xEC, 0xBC, 0x94, 0xEF, /* 0xD0-0xD3 */ + 0x94, 0xF0, 0x94, 0xF1, 0xEC, 0xBD, 0xC6, 0xEC, /* 0xD4-0xD7 */ + 0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, 0x94, 0xF5, /* 0xD8-0xDB */ + 0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, 0x94, 0xF9, /* 0xDC-0xDF */ + 0xCE, 0xDE, 0x94, 0xFA, 0xBC, 0xC8, 0x94, 0xFB, /* 0xE0-0xE3 */ + 0x94, 0xFC, 0xC8, 0xD5, 0xB5, 0xA9, 0xBE, 0xC9, /* 0xE4-0xE7 */ + 0xD6, 0xBC, 0xD4, 0xE7, 0x94, 0xFD, 0x94, 0xFE, /* 0xE8-0xEB */ + 0xD1, 0xAE, 0xD0, 0xF1, 0xEA, 0xB8, 0xEA, 0xB9, /* 0xEC-0xEF */ + 0xEA, 0xBA, 0xBA, 0xB5, 0x95, 0x40, 0x95, 0x41, /* 0xF0-0xF3 */ + 0x95, 0x42, 0x95, 0x43, 0xCA, 0xB1, 0xBF, 0xF5, /* 0xF4-0xF7 */ + 0x95, 0x44, 0x95, 0x45, 0xCD, 0xFA, 0x95, 0x46, /* 0xF8-0xFB */ + 0x95, 0x47, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0xEA, 0xC0, 0x95, 0x4B, 0xB0, 0xBA, 0xEA, 0xBE, /* 0x00-0x03 */ + 0x95, 0x4C, 0x95, 0x4D, 0xC0, 0xA5, 0x95, 0x4E, /* 0x04-0x07 */ + 0x95, 0x4F, 0x95, 0x50, 0xEA, 0xBB, 0x95, 0x51, /* 0x08-0x0B */ + 0xB2, 0xFD, 0x95, 0x52, 0xC3, 0xF7, 0xBB, 0xE8, /* 0x0C-0x0F */ + 0x95, 0x53, 0x95, 0x54, 0x95, 0x55, 0xD2, 0xD7, /* 0x10-0x13 */ + 0xCE, 0xF4, 0xEA, 0xBF, 0x95, 0x56, 0x95, 0x57, /* 0x14-0x17 */ + 0x95, 0x58, 0xEA, 0xBC, 0x95, 0x59, 0x95, 0x5A, /* 0x18-0x1B */ + 0x95, 0x5B, 0xEA, 0xC3, 0x95, 0x5C, 0xD0, 0xC7, /* 0x1C-0x1F */ + 0xD3, 0xB3, 0x95, 0x5D, 0x95, 0x5E, 0x95, 0x5F, /* 0x20-0x23 */ + 0x95, 0x60, 0xB4, 0xBA, 0x95, 0x61, 0xC3, 0xC1, /* 0x24-0x27 */ + 0xD7, 0xF2, 0x95, 0x62, 0x95, 0x63, 0x95, 0x64, /* 0x28-0x2B */ + 0x95, 0x65, 0xD5, 0xD1, 0x95, 0x66, 0xCA, 0xC7, /* 0x2C-0x2F */ + 0x95, 0x67, 0xEA, 0xC5, 0x95, 0x68, 0x95, 0x69, /* 0x30-0x33 */ + 0xEA, 0xC4, 0xEA, 0xC7, 0xEA, 0xC6, 0x95, 0x6A, /* 0x34-0x37 */ + 0x95, 0x6B, 0x95, 0x6C, 0x95, 0x6D, 0x95, 0x6E, /* 0x38-0x3B */ + 0xD6, 0xE7, 0x95, 0x6F, 0xCF, 0xD4, 0x95, 0x70, /* 0x3C-0x3F */ + 0x95, 0x71, 0xEA, 0xCB, 0x95, 0x72, 0xBB, 0xCE, /* 0x40-0x43 */ + 0x95, 0x73, 0x95, 0x74, 0x95, 0x75, 0x95, 0x76, /* 0x44-0x47 */ + 0x95, 0x77, 0x95, 0x78, 0x95, 0x79, 0xBD, 0xFA, /* 0x48-0x4B */ + 0xC9, 0xCE, 0x95, 0x7A, 0x95, 0x7B, 0xEA, 0xCC, /* 0x4C-0x4F */ + 0x95, 0x7C, 0x95, 0x7D, 0xC9, 0xB9, 0xCF, 0xFE, /* 0x50-0x53 */ + 0xEA, 0xCA, 0xD4, 0xCE, 0xEA, 0xCD, 0xEA, 0xCF, /* 0x54-0x57 */ + 0x95, 0x7E, 0x95, 0x80, 0xCD, 0xED, 0x95, 0x81, /* 0x58-0x5B */ + 0x95, 0x82, 0x95, 0x83, 0x95, 0x84, 0xEA, 0xC9, /* 0x5C-0x5F */ + 0x95, 0x85, 0xEA, 0xCE, 0x95, 0x86, 0x95, 0x87, /* 0x60-0x63 */ + 0xCE, 0xEE, 0x95, 0x88, 0xBB, 0xDE, 0x95, 0x89, /* 0x64-0x67 */ + 0xB3, 0xBF, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x68-0x6B */ + 0x95, 0x8D, 0x95, 0x8E, 0xC6, 0xD5, 0xBE, 0xB0, /* 0x6C-0x6F */ + 0xCE, 0xFA, 0x95, 0x8F, 0x95, 0x90, 0x95, 0x91, /* 0x70-0x73 */ + 0xC7, 0xE7, 0x95, 0x92, 0xBE, 0xA7, 0xEA, 0xD0, /* 0x74-0x77 */ + 0x95, 0x93, 0x95, 0x94, 0xD6, 0xC7, 0x95, 0x95, /* 0x78-0x7B */ + 0x95, 0x96, 0x95, 0x97, 0xC1, 0xC0, 0x95, 0x98, /* 0x7C-0x7F */ + + 0x95, 0x99, 0x95, 0x9A, 0xD4, 0xDD, 0x95, 0x9B, /* 0x80-0x83 */ + 0xEA, 0xD1, 0x95, 0x9C, 0x95, 0x9D, 0xCF, 0xBE, /* 0x84-0x87 */ + 0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, 0x95, 0xA1, /* 0x88-0x8B */ + 0xEA, 0xD2, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x8C-0x8F */ + 0x95, 0xA5, 0xCA, 0xEE, 0x95, 0xA6, 0x95, 0xA7, /* 0x90-0x93 */ + 0x95, 0xA8, 0x95, 0xA9, 0xC5, 0xAF, 0xB0, 0xB5, /* 0x94-0x97 */ + 0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, 0x95, 0xAD, /* 0x98-0x9B */ + 0x95, 0xAE, 0xEA, 0xD4, 0x95, 0xAF, 0x95, 0xB0, /* 0x9C-0x9F */ + 0x95, 0xB1, 0x95, 0xB2, 0x95, 0xB3, 0x95, 0xB4, /* 0xA0-0xA3 */ + 0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, 0xEA, 0xD3, /* 0xA4-0xA7 */ + 0xF4, 0xDF, 0x95, 0xB8, 0x95, 0xB9, 0x95, 0xBA, /* 0xA8-0xAB */ + 0x95, 0xBB, 0x95, 0xBC, 0xC4, 0xBA, 0x95, 0xBD, /* 0xAC-0xAF */ + 0x95, 0xBE, 0x95, 0xBF, 0x95, 0xC0, 0x95, 0xC1, /* 0xB0-0xB3 */ + 0xB1, 0xA9, 0x95, 0xC2, 0x95, 0xC3, 0x95, 0xC4, /* 0xB4-0xB7 */ + 0x95, 0xC5, 0xE5, 0xDF, 0x95, 0xC6, 0x95, 0xC7, /* 0xB8-0xBB */ + 0x95, 0xC8, 0x95, 0xC9, 0xEA, 0xD5, 0x95, 0xCA, /* 0xBC-0xBF */ + 0x95, 0xCB, 0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, /* 0xC0-0xC3 */ + 0x95, 0xCF, 0x95, 0xD0, 0x95, 0xD1, 0x95, 0xD2, /* 0xC4-0xC7 */ + 0x95, 0xD3, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0xC8-0xCB */ + 0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0xCC-0xCF */ + 0x95, 0xDB, 0x95, 0xDC, 0x95, 0xDD, 0x95, 0xDE, /* 0xD0-0xD3 */ + 0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, 0x95, 0xE2, /* 0xD4-0xD7 */ + 0x95, 0xE3, 0xCA, 0xEF, 0x95, 0xE4, 0xEA, 0xD6, /* 0xD8-0xDB */ + 0xEA, 0xD7, 0xC6, 0xD8, 0x95, 0xE5, 0x95, 0xE6, /* 0xDC-0xDF */ + 0x95, 0xE7, 0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, /* 0xE0-0xE3 */ + 0x95, 0xEB, 0x95, 0xEC, 0xEA, 0xD8, 0x95, 0xED, /* 0xE4-0xE7 */ + 0x95, 0xEE, 0xEA, 0xD9, 0x95, 0xEF, 0x95, 0xF0, /* 0xE8-0xEB */ + 0x95, 0xF1, 0x95, 0xF2, 0x95, 0xF3, 0x95, 0xF4, /* 0xEC-0xEF */ + 0xD4, 0xBB, 0x95, 0xF5, 0xC7, 0xFA, 0xD2, 0xB7, /* 0xF0-0xF3 */ + 0xB8, 0xFC, 0x95, 0xF6, 0x95, 0xF7, 0xEA, 0xC2, /* 0xF4-0xF7 */ + 0x95, 0xF8, 0xB2, 0xDC, 0x95, 0xF9, 0x95, 0xFA, /* 0xF8-0xFB */ + 0xC2, 0xFC, 0x95, 0xFB, 0xD4, 0xF8, 0xCC, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xD7, 0xEE, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0x00-0x03 */ + 0x96, 0x40, 0x96, 0x41, 0x96, 0x42, 0x96, 0x43, /* 0x04-0x07 */ + 0xD4, 0xC2, 0xD3, 0xD0, 0xEB, 0xC3, 0xC5, 0xF3, /* 0x08-0x0B */ + 0x96, 0x44, 0xB7, 0xFE, 0x96, 0x45, 0x96, 0x46, /* 0x0C-0x0F */ + 0xEB, 0xD4, 0x96, 0x47, 0x96, 0x48, 0x96, 0x49, /* 0x10-0x13 */ + 0xCB, 0xB7, 0xEB, 0xDE, 0x96, 0x4A, 0xC0, 0xCA, /* 0x14-0x17 */ + 0x96, 0x4B, 0x96, 0x4C, 0x96, 0x4D, 0xCD, 0xFB, /* 0x18-0x1B */ + 0x96, 0x4E, 0xB3, 0xAF, 0x96, 0x4F, 0xC6, 0xDA, /* 0x1C-0x1F */ + 0x96, 0x50, 0x96, 0x51, 0x96, 0x52, 0x96, 0x53, /* 0x20-0x23 */ + 0x96, 0x54, 0x96, 0x55, 0xEB, 0xFC, 0x96, 0x56, /* 0x24-0x27 */ + 0xC4, 0xBE, 0x96, 0x57, 0xCE, 0xB4, 0xC4, 0xA9, /* 0x28-0x2B */ + 0xB1, 0xBE, 0xD4, 0xFD, 0x96, 0x58, 0xCA, 0xF5, /* 0x2C-0x2F */ + 0x96, 0x59, 0xD6, 0xEC, 0x96, 0x5A, 0x96, 0x5B, /* 0x30-0x33 */ + 0xC6, 0xD3, 0xB6, 0xE4, 0x96, 0x5C, 0x96, 0x5D, /* 0x34-0x37 */ + 0x96, 0x5E, 0x96, 0x5F, 0xBB, 0xFA, 0x96, 0x60, /* 0x38-0x3B */ + 0x96, 0x61, 0xD0, 0xE0, 0x96, 0x62, 0x96, 0x63, /* 0x3C-0x3F */ + 0xC9, 0xB1, 0x96, 0x64, 0xD4, 0xD3, 0xC8, 0xA8, /* 0x40-0x43 */ + 0x96, 0x65, 0x96, 0x66, 0xB8, 0xCB, 0x96, 0x67, /* 0x44-0x47 */ + 0xE8, 0xBE, 0xC9, 0xBC, 0x96, 0x68, 0x96, 0x69, /* 0x48-0x4B */ + 0xE8, 0xBB, 0x96, 0x6A, 0xC0, 0xEE, 0xD0, 0xD3, /* 0x4C-0x4F */ + 0xB2, 0xC4, 0xB4, 0xE5, 0x96, 0x6B, 0xE8, 0xBC, /* 0x50-0x53 */ + 0x96, 0x6C, 0x96, 0x6D, 0xD5, 0xC8, 0x96, 0x6E, /* 0x54-0x57 */ + 0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, 0x96, 0x72, /* 0x58-0x5B */ + 0xB6, 0xC5, 0x96, 0x73, 0xE8, 0xBD, 0xCA, 0xF8, /* 0x5C-0x5F */ + 0xB8, 0xDC, 0xCC, 0xF5, 0x96, 0x74, 0x96, 0x75, /* 0x60-0x63 */ + 0x96, 0x76, 0xC0, 0xB4, 0x96, 0x77, 0x96, 0x78, /* 0x64-0x67 */ + 0xD1, 0xEE, 0xE8, 0xBF, 0xE8, 0xC2, 0x96, 0x79, /* 0x68-0x6B */ + 0x96, 0x7A, 0xBA, 0xBC, 0x96, 0x7B, 0xB1, 0xAD, /* 0x6C-0x6F */ + 0xBD, 0xDC, 0x96, 0x7C, 0xEA, 0xBD, 0xE8, 0xC3, /* 0x70-0x73 */ + 0x96, 0x7D, 0xE8, 0xC6, 0x96, 0x7E, 0xE8, 0xCB, /* 0x74-0x77 */ + 0x96, 0x80, 0x96, 0x81, 0x96, 0x82, 0x96, 0x83, /* 0x78-0x7B */ + 0xE8, 0xCC, 0x96, 0x84, 0xCB, 0xC9, 0xB0, 0xE5, /* 0x7C-0x7F */ + + 0x96, 0x85, 0xBC, 0xAB, 0x96, 0x86, 0x96, 0x87, /* 0x80-0x83 */ + 0xB9, 0xB9, 0x96, 0x88, 0x96, 0x89, 0xE8, 0xC1, /* 0x84-0x87 */ + 0x96, 0x8A, 0xCD, 0xF7, 0x96, 0x8B, 0xE8, 0xCA, /* 0x88-0x8B */ + 0x96, 0x8C, 0x96, 0x8D, 0x96, 0x8E, 0x96, 0x8F, /* 0x8C-0x8F */ + 0xCE, 0xF6, 0x96, 0x90, 0x96, 0x91, 0x96, 0x92, /* 0x90-0x93 */ + 0x96, 0x93, 0xD5, 0xED, 0x96, 0x94, 0xC1, 0xD6, /* 0x94-0x97 */ + 0xE8, 0xC4, 0x96, 0x95, 0xC3, 0xB6, 0x96, 0x96, /* 0x98-0x9B */ + 0xB9, 0xFB, 0xD6, 0xA6, 0xE8, 0xC8, 0x96, 0x97, /* 0x9C-0x9F */ + 0x96, 0x98, 0x96, 0x99, 0xCA, 0xE0, 0xD4, 0xE6, /* 0xA0-0xA3 */ + 0x96, 0x9A, 0xE8, 0xC0, 0x96, 0x9B, 0xE8, 0xC5, /* 0xA4-0xA7 */ + 0xE8, 0xC7, 0x96, 0x9C, 0xC7, 0xB9, 0xB7, 0xE3, /* 0xA8-0xAB */ + 0x96, 0x9D, 0xE8, 0xC9, 0x96, 0x9E, 0xBF, 0xDD, /* 0xAC-0xAF */ + 0xE8, 0xD2, 0x96, 0x9F, 0x96, 0xA0, 0xE8, 0xD7, /* 0xB0-0xB3 */ + 0x96, 0xA1, 0xE8, 0xD5, 0xBC, 0xDC, 0xBC, 0xCF, /* 0xB4-0xB7 */ + 0xE8, 0xDB, 0x96, 0xA2, 0x96, 0xA3, 0x96, 0xA4, /* 0xB8-0xBB */ + 0x96, 0xA5, 0x96, 0xA6, 0x96, 0xA7, 0x96, 0xA8, /* 0xBC-0xBF */ + 0x96, 0xA9, 0xE8, 0xDE, 0x96, 0xAA, 0xE8, 0xDA, /* 0xC0-0xC3 */ + 0xB1, 0xFA, 0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, /* 0xC4-0xC7 */ + 0x96, 0xAE, 0x96, 0xAF, 0x96, 0xB0, 0x96, 0xB1, /* 0xC8-0xCB */ + 0x96, 0xB2, 0x96, 0xB3, 0x96, 0xB4, 0xB0, 0xD8, /* 0xCC-0xCF */ + 0xC4, 0xB3, 0xB8, 0xCC, 0xC6, 0xE2, 0xC8, 0xBE, /* 0xD0-0xD3 */ + 0xC8, 0xE1, 0x96, 0xB5, 0x96, 0xB6, 0x96, 0xB7, /* 0xD4-0xD7 */ + 0xE8, 0xCF, 0xE8, 0xD4, 0xE8, 0xD6, 0x96, 0xB8, /* 0xD8-0xDB */ + 0xB9, 0xF1, 0xE8, 0xD8, 0xD7, 0xF5, 0x96, 0xB9, /* 0xDC-0xDF */ + 0xC4, 0xFB, 0x96, 0xBA, 0xE8, 0xDC, 0x96, 0xBB, /* 0xE0-0xE3 */ + 0x96, 0xBC, 0xB2, 0xE9, 0x96, 0xBD, 0x96, 0xBE, /* 0xE4-0xE7 */ + 0x96, 0xBF, 0xE8, 0xD1, 0x96, 0xC0, 0x96, 0xC1, /* 0xE8-0xEB */ + 0xBC, 0xED, 0x96, 0xC2, 0x96, 0xC3, 0xBF, 0xC2, /* 0xEC-0xEF */ + 0xE8, 0xCD, 0xD6, 0xF9, 0x96, 0xC4, 0xC1, 0xF8, /* 0xF0-0xF3 */ + 0xB2, 0xF1, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0xF4-0xF7 */ + 0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, 0x96, 0xCB, /* 0xF8-0xFB */ + 0x96, 0xCC, 0xE8, 0xDF, 0x96, 0xCD, 0xCA, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0xE8, 0xD9, 0x96, 0xCE, 0x96, 0xCF, 0x96, 0xD0, /* 0x00-0x03 */ + 0x96, 0xD1, 0xD5, 0xA4, 0x96, 0xD2, 0xB1, 0xEA, /* 0x04-0x07 */ + 0xD5, 0xBB, 0xE8, 0xCE, 0xE8, 0xD0, 0xB6, 0xB0, /* 0x08-0x0B */ + 0xE8, 0xD3, 0x96, 0xD3, 0xE8, 0xDD, 0xC0, 0xB8, /* 0x0C-0x0F */ + 0x96, 0xD4, 0xCA, 0xF7, 0x96, 0xD5, 0xCB, 0xA8, /* 0x10-0x13 */ + 0x96, 0xD6, 0x96, 0xD7, 0xC6, 0xDC, 0xC0, 0xF5, /* 0x14-0x17 */ + 0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x18-0x1B */ + 0x96, 0xDC, 0xE8, 0xE9, 0x96, 0xDD, 0x96, 0xDE, /* 0x1C-0x1F */ + 0x96, 0xDF, 0xD0, 0xA3, 0x96, 0xE0, 0x96, 0xE1, /* 0x20-0x23 */ + 0x96, 0xE2, 0x96, 0xE3, 0x96, 0xE4, 0x96, 0xE5, /* 0x24-0x27 */ + 0x96, 0xE6, 0xE8, 0xF2, 0xD6, 0xEA, 0x96, 0xE7, /* 0x28-0x2B */ + 0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x2C-0x2F */ + 0x96, 0xEC, 0x96, 0xED, 0xE8, 0xE0, 0xE8, 0xE1, /* 0x30-0x33 */ + 0x96, 0xEE, 0x96, 0xEF, 0x96, 0xF0, 0xD1, 0xF9, /* 0x34-0x37 */ + 0xBA, 0xCB, 0xB8, 0xF9, 0x96, 0xF1, 0x96, 0xF2, /* 0x38-0x3B */ + 0xB8, 0xF1, 0xD4, 0xD4, 0xE8, 0xEF, 0x96, 0xF3, /* 0x3C-0x3F */ + 0xE8, 0xEE, 0xE8, 0xEC, 0xB9, 0xF0, 0xCC, 0xD2, /* 0x40-0x43 */ + 0xE8, 0xE6, 0xCE, 0xA6, 0xBF, 0xF2, 0x96, 0xF4, /* 0x44-0x47 */ + 0xB0, 0xB8, 0xE8, 0xF1, 0xE8, 0xF0, 0x96, 0xF5, /* 0x48-0x4B */ + 0xD7, 0xC0, 0x96, 0xF6, 0xE8, 0xE4, 0x96, 0xF7, /* 0x4C-0x4F */ + 0xCD, 0xA9, 0xC9, 0xA3, 0x96, 0xF8, 0xBB, 0xB8, /* 0x50-0x53 */ + 0xBD, 0xDB, 0xE8, 0xEA, 0x96, 0xF9, 0x96, 0xFA, /* 0x54-0x57 */ + 0x96, 0xFB, 0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, /* 0x58-0x5B */ + 0x97, 0x40, 0x97, 0x41, 0x97, 0x42, 0x97, 0x43, /* 0x5C-0x5F */ + 0xE8, 0xE2, 0xE8, 0xE3, 0xE8, 0xE5, 0xB5, 0xB5, /* 0x60-0x63 */ + 0xE8, 0xE7, 0xC7, 0xC5, 0xE8, 0xEB, 0xE8, 0xED, /* 0x64-0x67 */ + 0xBD, 0xB0, 0xD7, 0xAE, 0x97, 0x44, 0xE8, 0xF8, /* 0x68-0x6B */ + 0x97, 0x45, 0x97, 0x46, 0x97, 0x47, 0x97, 0x48, /* 0x6C-0x6F */ + 0x97, 0x49, 0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, /* 0x70-0x73 */ + 0xE8, 0xF5, 0x97, 0x4D, 0xCD, 0xB0, 0xE8, 0xF6, /* 0x74-0x77 */ + 0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x78-0x7B */ + 0x97, 0x52, 0x97, 0x53, 0x97, 0x54, 0x97, 0x55, /* 0x7C-0x7F */ + + 0x97, 0x56, 0xC1, 0xBA, 0x97, 0x57, 0xE8, 0xE8, /* 0x80-0x83 */ + 0x97, 0x58, 0xC3, 0xB7, 0xB0, 0xF0, 0x97, 0x59, /* 0x84-0x87 */ + 0x97, 0x5A, 0x97, 0x5B, 0x97, 0x5C, 0x97, 0x5D, /* 0x88-0x8B */ + 0x97, 0x5E, 0x97, 0x5F, 0x97, 0x60, 0xE8, 0xF4, /* 0x8C-0x8F */ + 0x97, 0x61, 0x97, 0x62, 0x97, 0x63, 0xE8, 0xF7, /* 0x90-0x93 */ + 0x97, 0x64, 0x97, 0x65, 0x97, 0x66, 0xB9, 0xA3, /* 0x94-0x97 */ + 0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0x98-0x9B */ + 0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0x9C-0x9F */ + 0x97, 0x6F, 0x97, 0x70, 0xC9, 0xD2, 0x97, 0x71, /* 0xA0-0xA3 */ + 0x97, 0x72, 0x97, 0x73, 0xC3, 0xCE, 0xCE, 0xE0, /* 0xA4-0xA7 */ + 0xC0, 0xE6, 0x97, 0x74, 0x97, 0x75, 0x97, 0x76, /* 0xA8-0xAB */ + 0x97, 0x77, 0xCB, 0xF3, 0x97, 0x78, 0xCC, 0xDD, /* 0xAC-0xAF */ + 0xD0, 0xB5, 0x97, 0x79, 0x97, 0x7A, 0xCA, 0xE1, /* 0xB0-0xB3 */ + 0x97, 0x7B, 0xE8, 0xF3, 0x97, 0x7C, 0x97, 0x7D, /* 0xB4-0xB7 */ + 0x97, 0x7E, 0x97, 0x80, 0x97, 0x81, 0x97, 0x82, /* 0xB8-0xBB */ + 0x97, 0x83, 0x97, 0x84, 0x97, 0x85, 0x97, 0x86, /* 0xBC-0xBF */ + 0xBC, 0xEC, 0x97, 0x87, 0xE8, 0xF9, 0x97, 0x88, /* 0xC0-0xC3 */ + 0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, 0x97, 0x8C, /* 0xC4-0xC7 */ + 0x97, 0x8D, 0xC3, 0xDE, 0x97, 0x8E, 0xC6, 0xE5, /* 0xC8-0xCB */ + 0x97, 0x8F, 0xB9, 0xF7, 0x97, 0x90, 0x97, 0x91, /* 0xCC-0xCF */ + 0x97, 0x92, 0x97, 0x93, 0xB0, 0xF4, 0x97, 0x94, /* 0xD0-0xD3 */ + 0x97, 0x95, 0xD7, 0xD8, 0x97, 0x96, 0x97, 0x97, /* 0xD4-0xD7 */ + 0xBC, 0xAC, 0x97, 0x98, 0xC5, 0xEF, 0x97, 0x99, /* 0xD8-0xDB */ + 0x97, 0x9A, 0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, /* 0xDC-0xDF */ + 0xCC, 0xC4, 0x97, 0x9E, 0x97, 0x9F, 0xE9, 0xA6, /* 0xE0-0xE3 */ + 0x97, 0xA0, 0x97, 0xA1, 0x97, 0xA2, 0x97, 0xA3, /* 0xE4-0xE7 */ + 0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE8-0xEB */ + 0x97, 0xA8, 0x97, 0xA9, 0xC9, 0xAD, 0x97, 0xAA, /* 0xEC-0xEF */ + 0xE9, 0xA2, 0xC0, 0xE2, 0x97, 0xAB, 0x97, 0xAC, /* 0xF0-0xF3 */ + 0x97, 0xAD, 0xBF, 0xC3, 0x97, 0xAE, 0x97, 0xAF, /* 0xF4-0xF7 */ + 0x97, 0xB0, 0xE8, 0xFE, 0xB9, 0xD7, 0x97, 0xB1, /* 0xF8-0xFB */ + 0xE8, 0xFB, 0x97, 0xB2, 0x97, 0xB3, 0x97, 0xB4, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_69[512] = { + 0x97, 0xB5, 0xE9, 0xA4, 0x97, 0xB6, 0x97, 0xB7, /* 0x00-0x03 */ + 0x97, 0xB8, 0xD2, 0xCE, 0x97, 0xB9, 0x97, 0xBA, /* 0x04-0x07 */ + 0x97, 0xBB, 0x97, 0xBC, 0x97, 0xBD, 0xE9, 0xA3, /* 0x08-0x0B */ + 0x97, 0xBE, 0xD6, 0xB2, 0xD7, 0xB5, 0x97, 0xBF, /* 0x0C-0x0F */ + 0xE9, 0xA7, 0x97, 0xC0, 0xBD, 0xB7, 0x97, 0xC1, /* 0x10-0x13 */ + 0x97, 0xC2, 0x97, 0xC3, 0x97, 0xC4, 0x97, 0xC5, /* 0x14-0x17 */ + 0x97, 0xC6, 0x97, 0xC7, 0x97, 0xC8, 0x97, 0xC9, /* 0x18-0x1B */ + 0x97, 0xCA, 0x97, 0xCB, 0x97, 0xCC, 0xE8, 0xFC, /* 0x1C-0x1F */ + 0xE8, 0xFD, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x20-0x23 */ + 0xE9, 0xA1, 0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, /* 0x24-0x27 */ + 0x97, 0xD3, 0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, /* 0x28-0x2B */ + 0x97, 0xD7, 0xCD, 0xD6, 0x97, 0xD8, 0x97, 0xD9, /* 0x2C-0x2F */ + 0xD2, 0xAC, 0x97, 0xDA, 0x97, 0xDB, 0x97, 0xDC, /* 0x30-0x33 */ + 0xE9, 0xB2, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x34-0x37 */ + 0x97, 0xE0, 0xE9, 0xA9, 0x97, 0xE1, 0x97, 0xE2, /* 0x38-0x3B */ + 0x97, 0xE3, 0xB4, 0xAA, 0x97, 0xE4, 0xB4, 0xBB, /* 0x3C-0x3F */ + 0x97, 0xE5, 0x97, 0xE6, 0xE9, 0xAB, 0x97, 0xE7, /* 0x40-0x43 */ + 0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x44-0x47 */ + 0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x48-0x4B */ + 0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x4C-0x4F */ + 0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x50-0x53 */ + 0xD0, 0xA8, 0x97, 0xF8, 0x97, 0xF9, 0xE9, 0xA5, /* 0x54-0x57 */ + 0x97, 0xFA, 0x97, 0xFB, 0xB3, 0xFE, 0x97, 0xFC, /* 0x58-0x5B */ + 0x97, 0xFD, 0xE9, 0xAC, 0xC0, 0xE3, 0x97, 0xFE, /* 0x5C-0x5F */ + 0xE9, 0xAA, 0x98, 0x40, 0x98, 0x41, 0xE9, 0xB9, /* 0x60-0x63 */ + 0x98, 0x42, 0x98, 0x43, 0xE9, 0xB8, 0x98, 0x44, /* 0x64-0x67 */ + 0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0xE9, 0xAE, /* 0x68-0x6B */ + 0x98, 0x48, 0x98, 0x49, 0xE8, 0xFA, 0x98, 0x4A, /* 0x6C-0x6F */ + 0x98, 0x4B, 0xE9, 0xA8, 0x98, 0x4C, 0x98, 0x4D, /* 0x70-0x73 */ + 0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, 0xBF, 0xAC, /* 0x74-0x77 */ + 0xE9, 0xB1, 0xE9, 0xBA, 0x98, 0x51, 0x98, 0x52, /* 0x78-0x7B */ + 0xC2, 0xA5, 0x98, 0x53, 0x98, 0x54, 0x98, 0x55, /* 0x7C-0x7F */ + + 0xE9, 0xAF, 0x98, 0x56, 0xB8, 0xC5, 0x98, 0x57, /* 0x80-0x83 */ + 0xE9, 0xAD, 0x98, 0x58, 0xD3, 0xDC, 0xE9, 0xB4, /* 0x84-0x87 */ + 0xE9, 0xB5, 0xE9, 0xB7, 0x98, 0x59, 0x98, 0x5A, /* 0x88-0x8B */ + 0x98, 0x5B, 0xE9, 0xC7, 0x98, 0x5C, 0x98, 0x5D, /* 0x8C-0x8F */ + 0x98, 0x5E, 0x98, 0x5F, 0x98, 0x60, 0x98, 0x61, /* 0x90-0x93 */ + 0xC0, 0xC6, 0xE9, 0xC5, 0x98, 0x62, 0x98, 0x63, /* 0x94-0x97 */ + 0xE9, 0xB0, 0x98, 0x64, 0x98, 0x65, 0xE9, 0xBB, /* 0x98-0x9B */ + 0xB0, 0xF1, 0x98, 0x66, 0x98, 0x67, 0x98, 0x68, /* 0x9C-0x9F */ + 0x98, 0x69, 0x98, 0x6A, 0x98, 0x6B, 0x98, 0x6C, /* 0xA0-0xA3 */ + 0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0xE9, 0xBC, /* 0xA4-0xA7 */ + 0xD5, 0xA5, 0x98, 0x70, 0x98, 0x71, 0xE9, 0xBE, /* 0xA8-0xAB */ + 0x98, 0x72, 0xE9, 0xBF, 0x98, 0x73, 0x98, 0x74, /* 0xAC-0xAF */ + 0x98, 0x75, 0xE9, 0xC1, 0x98, 0x76, 0x98, 0x77, /* 0xB0-0xB3 */ + 0xC1, 0xF1, 0x98, 0x78, 0x98, 0x79, 0xC8, 0xB6, /* 0xB4-0xB7 */ + 0x98, 0x7A, 0x98, 0x7B, 0x98, 0x7C, 0xE9, 0xBD, /* 0xB8-0xBB */ + 0x98, 0x7D, 0x98, 0x7E, 0x98, 0x80, 0x98, 0x81, /* 0xBC-0xBF */ + 0x98, 0x82, 0xE9, 0xC2, 0x98, 0x83, 0x98, 0x84, /* 0xC0-0xC3 */ + 0x98, 0x85, 0x98, 0x86, 0x98, 0x87, 0x98, 0x88, /* 0xC4-0xC7 */ + 0x98, 0x89, 0x98, 0x8A, 0xE9, 0xC3, 0x98, 0x8B, /* 0xC8-0xCB */ + 0xE9, 0xB3, 0x98, 0x8C, 0xE9, 0xB6, 0x98, 0x8D, /* 0xCC-0xCF */ + 0xBB, 0xB1, 0x98, 0x8E, 0x98, 0x8F, 0x98, 0x90, /* 0xD0-0xD3 */ + 0xE9, 0xC0, 0x98, 0x91, 0x98, 0x92, 0x98, 0x93, /* 0xD4-0xD7 */ + 0x98, 0x94, 0x98, 0x95, 0x98, 0x96, 0xBC, 0xF7, /* 0xD8-0xDB */ + 0x98, 0x97, 0x98, 0x98, 0x98, 0x99, 0xE9, 0xC4, /* 0xDC-0xDF */ + 0xE9, 0xC6, 0x98, 0x9A, 0x98, 0x9B, 0x98, 0x9C, /* 0xE0-0xE3 */ + 0x98, 0x9D, 0x98, 0x9E, 0x98, 0x9F, 0x98, 0xA0, /* 0xE4-0xE7 */ + 0x98, 0xA1, 0x98, 0xA2, 0x98, 0xA3, 0x98, 0xA4, /* 0xE8-0xEB */ + 0x98, 0xA5, 0xE9, 0xCA, 0x98, 0xA6, 0x98, 0xA7, /* 0xEC-0xEF */ + 0x98, 0xA8, 0x98, 0xA9, 0xE9, 0xCE, 0x98, 0xAA, /* 0xF0-0xF3 */ + 0x98, 0xAB, 0x98, 0xAC, 0x98, 0xAD, 0x98, 0xAE, /* 0xF4-0xF7 */ + 0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xF8-0xFB */ + 0x98, 0xB3, 0xB2, 0xDB, 0x98, 0xB4, 0xE9, 0xC8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x98, 0xB5, 0x98, 0xB6, 0x98, 0xB7, 0x98, 0xB8, /* 0x00-0x03 */ + 0x98, 0xB9, 0x98, 0xBA, 0x98, 0xBB, 0x98, 0xBC, /* 0x04-0x07 */ + 0x98, 0xBD, 0x98, 0xBE, 0xB7, 0xAE, 0x98, 0xBF, /* 0x08-0x0B */ + 0x98, 0xC0, 0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, /* 0x0C-0x0F */ + 0x98, 0xC4, 0x98, 0xC5, 0x98, 0xC6, 0x98, 0xC7, /* 0x10-0x13 */ + 0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0xE9, 0xCB, /* 0x14-0x17 */ + 0xE9, 0xCC, 0x98, 0xCB, 0x98, 0xCC, 0x98, 0xCD, /* 0x18-0x1B */ + 0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, 0xD5, 0xC1, /* 0x1C-0x1F */ + 0x98, 0xD1, 0xC4, 0xA3, 0x98, 0xD2, 0x98, 0xD3, /* 0x20-0x23 */ + 0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0x24-0x27 */ + 0xE9, 0xD8, 0x98, 0xD8, 0xBA, 0xE1, 0x98, 0xD9, /* 0x28-0x2B */ + 0x98, 0xDA, 0x98, 0xDB, 0x98, 0xDC, 0xE9, 0xC9, /* 0x2C-0x2F */ + 0x98, 0xDD, 0xD3, 0xA3, 0x98, 0xDE, 0x98, 0xDF, /* 0x30-0x33 */ + 0x98, 0xE0, 0xE9, 0xD4, 0x98, 0xE1, 0x98, 0xE2, /* 0x34-0x37 */ + 0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, 0x98, 0xE6, /* 0x38-0x3B */ + 0x98, 0xE7, 0xE9, 0xD7, 0xE9, 0xD0, 0x98, 0xE8, /* 0x3C-0x3F */ + 0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x40-0x43 */ + 0xE9, 0xCF, 0x98, 0xED, 0x98, 0xEE, 0xC7, 0xC1, /* 0x44-0x47 */ + 0x98, 0xEF, 0x98, 0xF0, 0x98, 0xF1, 0x98, 0xF2, /* 0x48-0x4B */ + 0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x4C-0x4F */ + 0xE9, 0xD2, 0x98, 0xF7, 0x98, 0xF8, 0x98, 0xF9, /* 0x50-0x53 */ + 0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x54-0x57 */ + 0xE9, 0xD9, 0xB3, 0xC8, 0x98, 0xFE, 0xE9, 0xD3, /* 0x58-0x5B */ + 0x99, 0x40, 0x99, 0x41, 0x99, 0x42, 0x99, 0x43, /* 0x5C-0x5F */ + 0x99, 0x44, 0xCF, 0xF0, 0x99, 0x45, 0x99, 0x46, /* 0x60-0x63 */ + 0x99, 0x47, 0xE9, 0xCD, 0x99, 0x48, 0x99, 0x49, /* 0x64-0x67 */ + 0x99, 0x4A, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x68-0x6B */ + 0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x6C-0x6F */ + 0x99, 0x52, 0xB3, 0xF7, 0x99, 0x53, 0x99, 0x54, /* 0x70-0x73 */ + 0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x74-0x77 */ + 0x99, 0x59, 0xE9, 0xD6, 0x99, 0x5A, 0x99, 0x5B, /* 0x78-0x7B */ + 0xE9, 0xDA, 0x99, 0x5C, 0x99, 0x5D, 0x99, 0x5E, /* 0x7C-0x7F */ + + 0xCC, 0xB4, 0x99, 0x5F, 0x99, 0x60, 0x99, 0x61, /* 0x80-0x83 */ + 0xCF, 0xAD, 0x99, 0x62, 0x99, 0x63, 0x99, 0x64, /* 0x84-0x87 */ + 0x99, 0x65, 0x99, 0x66, 0x99, 0x67, 0x99, 0x68, /* 0x88-0x8B */ + 0x99, 0x69, 0x99, 0x6A, 0xE9, 0xD5, 0x99, 0x6B, /* 0x8C-0x8F */ + 0xE9, 0xDC, 0xE9, 0xDB, 0x99, 0x6C, 0x99, 0x6D, /* 0x90-0x93 */ + 0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0xE9, 0xDE, /* 0x94-0x97 */ + 0x99, 0x71, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x98-0x9B */ + 0x99, 0x75, 0x99, 0x76, 0x99, 0x77, 0x99, 0x78, /* 0x9C-0x9F */ + 0xE9, 0xD1, 0x99, 0x79, 0x99, 0x7A, 0x99, 0x7B, /* 0xA0-0xA3 */ + 0x99, 0x7C, 0x99, 0x7D, 0x99, 0x7E, 0x99, 0x80, /* 0xA4-0xA7 */ + 0x99, 0x81, 0xE9, 0xDD, 0x99, 0x82, 0xE9, 0xDF, /* 0xA8-0xAB */ + 0xC3, 0xCA, 0x99, 0x83, 0x99, 0x84, 0x99, 0x85, /* 0xAC-0xAF */ + 0x99, 0x86, 0x99, 0x87, 0x99, 0x88, 0x99, 0x89, /* 0xB0-0xB3 */ + 0x99, 0x8A, 0x99, 0x8B, 0x99, 0x8C, 0x99, 0x8D, /* 0xB4-0xB7 */ + 0x99, 0x8E, 0x99, 0x8F, 0x99, 0x90, 0x99, 0x91, /* 0xB8-0xBB */ + 0x99, 0x92, 0x99, 0x93, 0x99, 0x94, 0x99, 0x95, /* 0xBC-0xBF */ + 0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0xC0-0xC3 */ + 0x99, 0x9A, 0x99, 0x9B, 0x99, 0x9C, 0x99, 0x9D, /* 0xC4-0xC7 */ + 0x99, 0x9E, 0x99, 0x9F, 0x99, 0xA0, 0x99, 0xA1, /* 0xC8-0xCB */ + 0x99, 0xA2, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xCC-0xCF */ + 0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, 0x99, 0xA9, /* 0xD0-0xD3 */ + 0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, 0x99, 0xAD, /* 0xD4-0xD7 */ + 0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, 0x99, 0xB1, /* 0xD8-0xDB */ + 0x99, 0xB2, 0x99, 0xB3, 0x99, 0xB4, 0x99, 0xB5, /* 0xDC-0xDF */ + 0x99, 0xB6, 0x99, 0xB7, 0x99, 0xB8, 0x99, 0xB9, /* 0xE0-0xE3 */ + 0x99, 0xBA, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xE4-0xE7 */ + 0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, 0x99, 0xC1, /* 0xE8-0xEB */ + 0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, 0x99, 0xC5, /* 0xEC-0xEF */ + 0x99, 0xC6, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xF0-0xF3 */ + 0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xF4-0xF7 */ + 0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, 0x99, 0xD1, /* 0xF8-0xFB */ + 0x99, 0xD2, 0x99, 0xD3, 0x99, 0xD4, 0x99, 0xD5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6B[512] = { + 0x99, 0xD6, 0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, /* 0x00-0x03 */ + 0x99, 0xDA, 0x99, 0xDB, 0x99, 0xDC, 0x99, 0xDD, /* 0x04-0x07 */ + 0x99, 0xDE, 0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE1, /* 0x08-0x0B */ + 0x99, 0xE2, 0x99, 0xE3, 0x99, 0xE4, 0x99, 0xE5, /* 0x0C-0x0F */ + 0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, 0x99, 0xE9, /* 0x10-0x13 */ + 0x99, 0xEA, 0x99, 0xEB, 0x99, 0xEC, 0x99, 0xED, /* 0x14-0x17 */ + 0x99, 0xEE, 0x99, 0xEF, 0x99, 0xF0, 0x99, 0xF1, /* 0x18-0x1B */ + 0x99, 0xF2, 0x99, 0xF3, 0x99, 0xF4, 0x99, 0xF5, /* 0x1C-0x1F */ + 0xC7, 0xB7, 0xB4, 0xCE, 0xBB, 0xB6, 0xD0, 0xC0, /* 0x20-0x23 */ + 0xEC, 0xA3, 0x99, 0xF6, 0x99, 0xF7, 0xC5, 0xB7, /* 0x24-0x27 */ + 0x99, 0xF8, 0x99, 0xF9, 0x99, 0xFA, 0x99, 0xFB, /* 0x28-0x2B */ + 0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, 0x9A, 0x40, /* 0x2C-0x2F */ + 0x9A, 0x41, 0x9A, 0x42, 0xD3, 0xFB, 0x9A, 0x43, /* 0x30-0x33 */ + 0x9A, 0x44, 0x9A, 0x45, 0x9A, 0x46, 0xEC, 0xA4, /* 0x34-0x37 */ + 0x9A, 0x47, 0xEC, 0xA5, 0xC6, 0xDB, 0x9A, 0x48, /* 0x38-0x3B */ + 0x9A, 0x49, 0x9A, 0x4A, 0xBF, 0xEE, 0x9A, 0x4B, /* 0x3C-0x3F */ + 0x9A, 0x4C, 0x9A, 0x4D, 0x9A, 0x4E, 0xEC, 0xA6, /* 0x40-0x43 */ + 0x9A, 0x4F, 0x9A, 0x50, 0xEC, 0xA7, 0xD0, 0xAA, /* 0x44-0x47 */ + 0x9A, 0x51, 0xC7, 0xB8, 0x9A, 0x52, 0x9A, 0x53, /* 0x48-0x4B */ + 0xB8, 0xE8, 0x9A, 0x54, 0x9A, 0x55, 0x9A, 0x56, /* 0x4C-0x4F */ + 0x9A, 0x57, 0x9A, 0x58, 0x9A, 0x59, 0x9A, 0x5A, /* 0x50-0x53 */ + 0x9A, 0x5B, 0x9A, 0x5C, 0x9A, 0x5D, 0x9A, 0x5E, /* 0x54-0x57 */ + 0x9A, 0x5F, 0xEC, 0xA8, 0x9A, 0x60, 0x9A, 0x61, /* 0x58-0x5B */ + 0x9A, 0x62, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x5C-0x5F */ + 0x9A, 0x66, 0x9A, 0x67, 0xD6, 0xB9, 0xD5, 0xFD, /* 0x60-0x63 */ + 0xB4, 0xCB, 0xB2, 0xBD, 0xCE, 0xE4, 0xC6, 0xE7, /* 0x64-0x67 */ + 0x9A, 0x68, 0x9A, 0x69, 0xCD, 0xE1, 0x9A, 0x6A, /* 0x68-0x6B */ + 0x9A, 0x6B, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x6C-0x6F */ + 0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, 0x9A, 0x72, /* 0x70-0x73 */ + 0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, 0x9A, 0x76, /* 0x74-0x77 */ + 0x9A, 0x77, 0xB4, 0xF5, 0x9A, 0x78, 0xCB, 0xC0, /* 0x78-0x7B */ + 0xBC, 0xDF, 0x9A, 0x79, 0x9A, 0x7A, 0x9A, 0x7B, /* 0x7C-0x7F */ + + 0x9A, 0x7C, 0xE9, 0xE2, 0xE9, 0xE3, 0xD1, 0xEA, /* 0x80-0x83 */ + 0xE9, 0xE5, 0x9A, 0x7D, 0xB4, 0xF9, 0xE9, 0xE4, /* 0x84-0x87 */ + 0x9A, 0x7E, 0xD1, 0xB3, 0xCA, 0xE2, 0xB2, 0xD0, /* 0x88-0x8B */ + 0x9A, 0x80, 0xE9, 0xE8, 0x9A, 0x81, 0x9A, 0x82, /* 0x8C-0x8F */ + 0x9A, 0x83, 0x9A, 0x84, 0xE9, 0xE6, 0xE9, 0xE7, /* 0x90-0x93 */ + 0x9A, 0x85, 0x9A, 0x86, 0xD6, 0xB3, 0x9A, 0x87, /* 0x94-0x97 */ + 0x9A, 0x88, 0x9A, 0x89, 0xE9, 0xE9, 0xE9, 0xEA, /* 0x98-0x9B */ + 0x9A, 0x8A, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x9C-0x9F */ + 0x9A, 0x8E, 0xE9, 0xEB, 0x9A, 0x8F, 0x9A, 0x90, /* 0xA0-0xA3 */ + 0x9A, 0x91, 0x9A, 0x92, 0x9A, 0x93, 0x9A, 0x94, /* 0xA4-0xA7 */ + 0x9A, 0x95, 0x9A, 0x96, 0xE9, 0xEC, 0x9A, 0x97, /* 0xA8-0xAB */ + 0x9A, 0x98, 0x9A, 0x99, 0x9A, 0x9A, 0x9A, 0x9B, /* 0xAC-0xAF */ + 0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0xEC, 0xAF, /* 0xB0-0xB3 */ + 0xC5, 0xB9, 0xB6, 0xCE, 0x9A, 0x9F, 0xD2, 0xF3, /* 0xB4-0xB7 */ + 0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, 0x9A, 0xA3, /* 0xB8-0xBB */ + 0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, 0xB5, 0xEE, /* 0xBC-0xBF */ + 0x9A, 0xA7, 0xBB, 0xD9, 0xEC, 0xB1, 0x9A, 0xA8, /* 0xC0-0xC3 */ + 0x9A, 0xA9, 0xD2, 0xE3, 0x9A, 0xAA, 0x9A, 0xAB, /* 0xC4-0xC7 */ + 0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0xCE, 0xE3, /* 0xC8-0xCB */ + 0x9A, 0xAF, 0xC4, 0xB8, 0x9A, 0xB0, 0xC3, 0xBF, /* 0xCC-0xCF */ + 0x9A, 0xB1, 0x9A, 0xB2, 0xB6, 0xBE, 0xD8, 0xB9, /* 0xD0-0xD3 */ + 0xB1, 0xC8, 0xB1, 0xCF, 0xB1, 0xD1, 0xC5, 0xFE, /* 0xD4-0xD7 */ + 0x9A, 0xB3, 0xB1, 0xD0, 0x9A, 0xB4, 0xC3, 0xAB, /* 0xD8-0xDB */ + 0x9A, 0xB5, 0x9A, 0xB6, 0x9A, 0xB7, 0x9A, 0xB8, /* 0xDC-0xDF */ + 0x9A, 0xB9, 0xD5, 0xB1, 0x9A, 0xBA, 0x9A, 0xBB, /* 0xE0-0xE3 */ + 0x9A, 0xBC, 0x9A, 0xBD, 0x9A, 0xBE, 0x9A, 0xBF, /* 0xE4-0xE7 */ + 0x9A, 0xC0, 0x9A, 0xC1, 0xEB, 0xA4, 0xBA, 0xC1, /* 0xE8-0xEB */ + 0x9A, 0xC2, 0x9A, 0xC3, 0x9A, 0xC4, 0xCC, 0xBA, /* 0xEC-0xEF */ + 0x9A, 0xC5, 0x9A, 0xC6, 0x9A, 0xC7, 0xEB, 0xA5, /* 0xF0-0xF3 */ + 0x9A, 0xC8, 0xEB, 0xA7, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xF4-0xF7 */ + 0x9A, 0xCB, 0xEB, 0xA8, 0x9A, 0xCC, 0x9A, 0xCD, /* 0xF8-0xFB */ + 0x9A, 0xCE, 0xEB, 0xA6, 0x9A, 0xCF, 0x9A, 0xD0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6C[512] = { + 0x9A, 0xD1, 0x9A, 0xD2, 0x9A, 0xD3, 0x9A, 0xD4, /* 0x00-0x03 */ + 0x9A, 0xD5, 0xEB, 0xA9, 0xEB, 0xAB, 0xEB, 0xAA, /* 0x04-0x07 */ + 0x9A, 0xD6, 0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, /* 0x08-0x0B */ + 0x9A, 0xDA, 0xEB, 0xAC, 0x9A, 0xDB, 0xCA, 0xCF, /* 0x0C-0x0F */ + 0xD8, 0xB5, 0xC3, 0xF1, 0x9A, 0xDC, 0xC3, 0xA5, /* 0x10-0x13 */ + 0xC6, 0xF8, 0xEB, 0xAD, 0xC4, 0xCA, 0x9A, 0xDD, /* 0x14-0x17 */ + 0xEB, 0xAE, 0xEB, 0xAF, 0xEB, 0xB0, 0xB7, 0xD5, /* 0x18-0x1B */ + 0x9A, 0xDE, 0x9A, 0xDF, 0x9A, 0xE0, 0xB7, 0xFA, /* 0x1C-0x1F */ + 0x9A, 0xE1, 0xEB, 0xB1, 0xC7, 0xE2, 0x9A, 0xE2, /* 0x20-0x23 */ + 0xEB, 0xB3, 0x9A, 0xE3, 0xBA, 0xA4, 0xD1, 0xF5, /* 0x24-0x27 */ + 0xB0, 0xB1, 0xEB, 0xB2, 0xEB, 0xB4, 0x9A, 0xE4, /* 0x28-0x2B */ + 0x9A, 0xE5, 0x9A, 0xE6, 0xB5, 0xAA, 0xC2, 0xC8, /* 0x2C-0x2F */ + 0xC7, 0xE8, 0x9A, 0xE7, 0xEB, 0xB5, 0x9A, 0xE8, /* 0x30-0x33 */ + 0xCB, 0xAE, 0xE3, 0xDF, 0x9A, 0xE9, 0x9A, 0xEA, /* 0x34-0x37 */ + 0xD3, 0xC0, 0x9A, 0xEB, 0x9A, 0xEC, 0x9A, 0xED, /* 0x38-0x3B */ + 0x9A, 0xEE, 0xD9, 0xDB, 0x9A, 0xEF, 0x9A, 0xF0, /* 0x3C-0x3F */ + 0xCD, 0xA1, 0xD6, 0xAD, 0xC7, 0xF3, 0x9A, 0xF1, /* 0x40-0x43 */ + 0x9A, 0xF2, 0x9A, 0xF3, 0xD9, 0xE0, 0xBB, 0xE3, /* 0x44-0x47 */ + 0x9A, 0xF4, 0xBA, 0xBA, 0xE3, 0xE2, 0x9A, 0xF5, /* 0x48-0x4B */ + 0x9A, 0xF6, 0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, /* 0x4C-0x4F */ + 0xCF, 0xAB, 0x9A, 0xFA, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x50-0x53 */ + 0xE3, 0xE0, 0xC9, 0xC7, 0x9A, 0xFD, 0xBA, 0xB9, /* 0x54-0x57 */ + 0x9A, 0xFE, 0x9B, 0x40, 0x9B, 0x41, 0xD1, 0xB4, /* 0x58-0x5B */ + 0xE3, 0xE1, 0xC8, 0xEA, 0xB9, 0xAF, 0xBD, 0xAD, /* 0x5C-0x5F */ + 0xB3, 0xD8, 0xCE, 0xDB, 0x9B, 0x42, 0x9B, 0x43, /* 0x60-0x63 */ + 0xCC, 0xC0, 0x9B, 0x44, 0x9B, 0x45, 0x9B, 0x46, /* 0x64-0x67 */ + 0xE3, 0xE8, 0xE3, 0xE9, 0xCD, 0xF4, 0x9B, 0x47, /* 0x68-0x6B */ + 0x9B, 0x48, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x6C-0x6F */ + 0xCC, 0xAD, 0x9B, 0x4C, 0xBC, 0xB3, 0x9B, 0x4D, /* 0x70-0x73 */ + 0xE3, 0xEA, 0x9B, 0x4E, 0xE3, 0xEB, 0x9B, 0x4F, /* 0x74-0x77 */ + 0x9B, 0x50, 0xD0, 0xDA, 0x9B, 0x51, 0x9B, 0x52, /* 0x78-0x7B */ + 0x9B, 0x53, 0xC6, 0xFB, 0xB7, 0xDA, 0x9B, 0x54, /* 0x7C-0x7F */ + + 0x9B, 0x55, 0xC7, 0xDF, 0xD2, 0xCA, 0xCE, 0xD6, /* 0x80-0x83 */ + 0x9B, 0x56, 0xE3, 0xE4, 0xE3, 0xEC, 0x9B, 0x57, /* 0x84-0x87 */ + 0xC9, 0xF2, 0xB3, 0xC1, 0x9B, 0x58, 0x9B, 0x59, /* 0x88-0x8B */ + 0xE3, 0xE7, 0x9B, 0x5A, 0x9B, 0x5B, 0xC6, 0xE3, /* 0x8C-0x8F */ + 0xE3, 0xE5, 0x9B, 0x5C, 0x9B, 0x5D, 0xED, 0xB3, /* 0x90-0x93 */ + 0xE3, 0xE6, 0x9B, 0x5E, 0x9B, 0x5F, 0x9B, 0x60, /* 0x94-0x97 */ + 0x9B, 0x61, 0xC9, 0xB3, 0x9B, 0x62, 0xC5, 0xE6, /* 0x98-0x9B */ + 0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, 0xB9, 0xB5, /* 0x9C-0x9F */ + 0x9B, 0x66, 0xC3, 0xBB, 0x9B, 0x67, 0xE3, 0xE3, /* 0xA0-0xA3 */ + 0xC5, 0xBD, 0xC1, 0xA4, 0xC2, 0xD9, 0xB2, 0xD7, /* 0xA4-0xA7 */ + 0x9B, 0x68, 0xE3, 0xED, 0xBB, 0xA6, 0xC4, 0xAD, /* 0xA8-0xAB */ + 0x9B, 0x69, 0xE3, 0xF0, 0xBE, 0xDA, 0x9B, 0x6A, /* 0xAC-0xAF */ + 0x9B, 0x6B, 0xE3, 0xFB, 0xE3, 0xF5, 0xBA, 0xD3, /* 0xB0-0xB3 */ + 0x9B, 0x6C, 0x9B, 0x6D, 0x9B, 0x6E, 0x9B, 0x6F, /* 0xB4-0xB7 */ + 0xB7, 0xD0, 0xD3, 0xCD, 0x9B, 0x70, 0xD6, 0xCE, /* 0xB8-0xBB */ + 0xD5, 0xD3, 0xB9, 0xC1, 0xD5, 0xB4, 0xD1, 0xD8, /* 0xBC-0xBF */ + 0x9B, 0x71, 0x9B, 0x72, 0x9B, 0x73, 0x9B, 0x74, /* 0xC0-0xC3 */ + 0xD0, 0xB9, 0xC7, 0xF6, 0x9B, 0x75, 0x9B, 0x76, /* 0xC4-0xC7 */ + 0x9B, 0x77, 0xC8, 0xAA, 0xB2, 0xB4, 0x9B, 0x78, /* 0xC8-0xCB */ + 0xC3, 0xDA, 0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x7B, /* 0xCC-0xCF */ + 0xE3, 0xEE, 0x9B, 0x7C, 0x9B, 0x7D, 0xE3, 0xFC, /* 0xD0-0xD3 */ + 0xE3, 0xEF, 0xB7, 0xA8, 0xE3, 0xF7, 0xE3, 0xF4, /* 0xD4-0xD7 */ + 0x9B, 0x7E, 0x9B, 0x80, 0x9B, 0x81, 0xB7, 0xBA, /* 0xD8-0xDB */ + 0x9B, 0x82, 0x9B, 0x83, 0xC5, 0xA2, 0x9B, 0x84, /* 0xDC-0xDF */ + 0xE3, 0xF6, 0xC5, 0xDD, 0xB2, 0xA8, 0xC6, 0xFC, /* 0xE0-0xE3 */ + 0x9B, 0x85, 0xC4, 0xE0, 0x9B, 0x86, 0x9B, 0x87, /* 0xE4-0xE7 */ + 0xD7, 0xA2, 0x9B, 0x88, 0xC0, 0xE1, 0xE3, 0xF9, /* 0xE8-0xEB */ + 0x9B, 0x89, 0x9B, 0x8A, 0xE3, 0xFA, 0xE3, 0xFD, /* 0xEC-0xEF */ + 0xCC, 0xA9, 0xE3, 0xF3, 0x9B, 0x8B, 0xD3, 0xBE, /* 0xF0-0xF3 */ + 0x9B, 0x8C, 0xB1, 0xC3, 0xED, 0xB4, 0xE3, 0xF1, /* 0xF4-0xF7 */ + 0xE3, 0xF2, 0x9B, 0x8D, 0xE3, 0xF8, 0xD0, 0xBA, /* 0xF8-0xFB */ + 0xC6, 0xC3, 0xD4, 0xF3, 0xE3, 0xFE, 0x9B, 0x8E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6D[512] = { + 0x9B, 0x8F, 0xBD, 0xE0, 0x9B, 0x90, 0x9B, 0x91, /* 0x00-0x03 */ + 0xE4, 0xA7, 0x9B, 0x92, 0x9B, 0x93, 0xE4, 0xA6, /* 0x04-0x07 */ + 0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, 0xD1, 0xF3, /* 0x08-0x0B */ + 0xE4, 0xA3, 0x9B, 0x97, 0xE4, 0xA9, 0x9B, 0x98, /* 0x0C-0x0F */ + 0x9B, 0x99, 0x9B, 0x9A, 0xC8, 0xF7, 0x9B, 0x9B, /* 0x10-0x13 */ + 0x9B, 0x9C, 0x9B, 0x9D, 0x9B, 0x9E, 0xCF, 0xB4, /* 0x14-0x17 */ + 0x9B, 0x9F, 0xE4, 0xA8, 0xE4, 0xAE, 0xC2, 0xE5, /* 0x18-0x1B */ + 0x9B, 0xA0, 0x9B, 0xA1, 0xB6, 0xB4, 0x9B, 0xA2, /* 0x1C-0x1F */ + 0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, 0x9B, 0xA6, /* 0x20-0x23 */ + 0x9B, 0xA7, 0xBD, 0xF2, 0x9B, 0xA8, 0xE4, 0xA2, /* 0x24-0x27 */ + 0x9B, 0xA9, 0x9B, 0xAA, 0xBA, 0xE9, 0xE4, 0xAA, /* 0x28-0x2B */ + 0x9B, 0xAB, 0x9B, 0xAC, 0xE4, 0xAC, 0x9B, 0xAD, /* 0x2C-0x2F */ + 0x9B, 0xAE, 0xB6, 0xFD, 0xD6, 0xDE, 0xE4, 0xB2, /* 0x30-0x33 */ + 0x9B, 0xAF, 0xE4, 0xAD, 0x9B, 0xB0, 0x9B, 0xB1, /* 0x34-0x37 */ + 0x9B, 0xB2, 0xE4, 0xA1, 0x9B, 0xB3, 0xBB, 0xEE, /* 0x38-0x3B */ + 0xCD, 0xDD, 0xC7, 0xA2, 0xC5, 0xC9, 0x9B, 0xB4, /* 0x3C-0x3F */ + 0x9B, 0xB5, 0xC1, 0xF7, 0x9B, 0xB6, 0xE4, 0xA4, /* 0x40-0x43 */ + 0x9B, 0xB7, 0xC7, 0xB3, 0xBD, 0xAC, 0xBD, 0xBD, /* 0x44-0x47 */ + 0xE4, 0xA5, 0x9B, 0xB8, 0xD7, 0xC7, 0xB2, 0xE2, /* 0x48-0x4B */ + 0x9B, 0xB9, 0xE4, 0xAB, 0xBC, 0xC3, 0xE4, 0xAF, /* 0x4C-0x4F */ + 0x9B, 0xBA, 0xBB, 0xEB, 0xE4, 0xB0, 0xC5, 0xA8, /* 0x50-0x53 */ + 0xE4, 0xB1, 0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, /* 0x54-0x57 */ + 0x9B, 0xBE, 0xD5, 0xE3, 0xBF, 0xA3, 0x9B, 0xBF, /* 0x58-0x5B */ + 0xE4, 0xBA, 0x9B, 0xC0, 0xE4, 0xB7, 0x9B, 0xC1, /* 0x5C-0x5F */ + 0xE4, 0xBB, 0x9B, 0xC2, 0x9B, 0xC3, 0xE4, 0xBD, /* 0x60-0x63 */ + 0x9B, 0xC4, 0x9B, 0xC5, 0xC6, 0xD6, 0x9B, 0xC6, /* 0x64-0x67 */ + 0x9B, 0xC7, 0xBA, 0xC6, 0xC0, 0xCB, 0x9B, 0xC8, /* 0x68-0x6B */ + 0x9B, 0xC9, 0x9B, 0xCA, 0xB8, 0xA1, 0xE4, 0xB4, /* 0x6C-0x6F */ + 0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0x70-0x73 */ + 0xD4, 0xA1, 0x9B, 0xCF, 0x9B, 0xD0, 0xBA, 0xA3, /* 0x74-0x77 */ + 0xBD, 0xFE, 0x9B, 0xD1, 0x9B, 0xD2, 0x9B, 0xD3, /* 0x78-0x7B */ + 0xE4, 0xBC, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0x7C-0x7F */ + + 0x9B, 0xD7, 0x9B, 0xD8, 0xCD, 0xBF, 0x9B, 0xD9, /* 0x80-0x83 */ + 0x9B, 0xDA, 0xC4, 0xF9, 0x9B, 0xDB, 0x9B, 0xDC, /* 0x84-0x87 */ + 0xCF, 0xFB, 0xC9, 0xE6, 0x9B, 0xDD, 0x9B, 0xDE, /* 0x88-0x8B */ + 0xD3, 0xBF, 0x9B, 0xDF, 0xCF, 0xD1, 0x9B, 0xE0, /* 0x8C-0x8F */ + 0x9B, 0xE1, 0xE4, 0xB3, 0x9B, 0xE2, 0xE4, 0xB8, /* 0x90-0x93 */ + 0xE4, 0xB9, 0xCC, 0xE9, 0x9B, 0xE3, 0x9B, 0xE4, /* 0x94-0x97 */ + 0x9B, 0xE5, 0x9B, 0xE6, 0x9B, 0xE7, 0xCC, 0xCE, /* 0x98-0x9B */ + 0x9B, 0xE8, 0xC0, 0xD4, 0xE4, 0xB5, 0xC1, 0xB0, /* 0x9C-0x9F */ + 0xE4, 0xB6, 0xCE, 0xD0, 0x9B, 0xE9, 0xBB, 0xC1, /* 0xA0-0xA3 */ + 0xB5, 0xD3, 0x9B, 0xEA, 0xC8, 0xF3, 0xBD, 0xA7, /* 0xA4-0xA7 */ + 0xD5, 0xC7, 0xC9, 0xAC, 0xB8, 0xA2, 0xE4, 0xCA, /* 0xA8-0xAB */ + 0x9B, 0xEB, 0x9B, 0xEC, 0xE4, 0xCC, 0xD1, 0xC4, /* 0xAC-0xAF */ + 0x9B, 0xED, 0x9B, 0xEE, 0xD2, 0xBA, 0x9B, 0xEF, /* 0xB0-0xB3 */ + 0x9B, 0xF0, 0xBA, 0xAD, 0x9B, 0xF1, 0x9B, 0xF2, /* 0xB4-0xB7 */ + 0xBA, 0xD4, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xB8-0xBB */ + 0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0xE4, 0xC3, /* 0xBC-0xBF */ + 0xB5, 0xED, 0x9B, 0xF9, 0x9B, 0xFA, 0x9B, 0xFB, /* 0xC0-0xC3 */ + 0xD7, 0xCD, 0xE4, 0xC0, 0xCF, 0xFD, 0xE4, 0xBF, /* 0xC4-0xC7 */ + 0x9B, 0xFC, 0x9B, 0xFD, 0x9B, 0xFE, 0xC1, 0xDC, /* 0xC8-0xCB */ + 0xCC, 0xCA, 0x9C, 0x40, 0x9C, 0x41, 0x9C, 0x42, /* 0xCC-0xCF */ + 0x9C, 0x43, 0xCA, 0xE7, 0x9C, 0x44, 0x9C, 0x45, /* 0xD0-0xD3 */ + 0x9C, 0x46, 0x9C, 0x47, 0xC4, 0xD7, 0x9C, 0x48, /* 0xD4-0xD7 */ + 0xCC, 0xD4, 0xE4, 0xC8, 0x9C, 0x49, 0x9C, 0x4A, /* 0xD8-0xDB */ + 0x9C, 0x4B, 0xE4, 0xC7, 0xE4, 0xC1, 0x9C, 0x4C, /* 0xDC-0xDF */ + 0xE4, 0xC4, 0xB5, 0xAD, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xE0-0xE3 */ + 0xD3, 0xD9, 0x9C, 0x4F, 0xE4, 0xC6, 0x9C, 0x50, /* 0xE4-0xE7 */ + 0x9C, 0x51, 0x9C, 0x52, 0x9C, 0x53, 0xD2, 0xF9, /* 0xE8-0xEB */ + 0xB4, 0xE3, 0x9C, 0x54, 0xBB, 0xB4, 0x9C, 0x55, /* 0xEC-0xEF */ + 0x9C, 0x56, 0xC9, 0xEE, 0x9C, 0x57, 0xB4, 0xBE, /* 0xF0-0xF3 */ + 0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0xBB, 0xEC, /* 0xF4-0xF7 */ + 0x9C, 0x5B, 0xD1, 0xCD, 0x9C, 0x5C, 0xCC, 0xED, /* 0xF8-0xFB */ + 0xED, 0xB5, 0x9C, 0x5D, 0x9C, 0x5E, 0x9C, 0x5F, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0x9C, 0x60, 0x9C, 0x61, 0x9C, 0x62, 0x9C, 0x63, /* 0x00-0x03 */ + 0x9C, 0x64, 0xC7, 0xE5, 0x9C, 0x65, 0x9C, 0x66, /* 0x04-0x07 */ + 0x9C, 0x67, 0x9C, 0x68, 0xD4, 0xA8, 0x9C, 0x69, /* 0x08-0x0B */ + 0xE4, 0xCB, 0xD7, 0xD5, 0xE4, 0xC2, 0x9C, 0x6A, /* 0x0C-0x0F */ + 0xBD, 0xA5, 0xE4, 0xC5, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x10-0x13 */ + 0xD3, 0xE6, 0x9C, 0x6D, 0xE4, 0xC9, 0xC9, 0xF8, /* 0x14-0x17 */ + 0x9C, 0x6E, 0x9C, 0x6F, 0xE4, 0xBE, 0x9C, 0x70, /* 0x18-0x1B */ + 0x9C, 0x71, 0xD3, 0xE5, 0x9C, 0x72, 0x9C, 0x73, /* 0x1C-0x1F */ + 0xC7, 0xFE, 0xB6, 0xC9, 0x9C, 0x74, 0xD4, 0xFC, /* 0x20-0x23 */ + 0xB2, 0xB3, 0xE4, 0xD7, 0x9C, 0x75, 0x9C, 0x76, /* 0x24-0x27 */ + 0x9C, 0x77, 0xCE, 0xC2, 0x9C, 0x78, 0xE4, 0xCD, /* 0x28-0x2B */ + 0x9C, 0x79, 0xCE, 0xBC, 0x9C, 0x7A, 0xB8, 0xDB, /* 0x2C-0x2F */ + 0x9C, 0x7B, 0x9C, 0x7C, 0xE4, 0xD6, 0x9C, 0x7D, /* 0x30-0x33 */ + 0xBF, 0xCA, 0x9C, 0x7E, 0x9C, 0x80, 0x9C, 0x81, /* 0x34-0x37 */ + 0xD3, 0xCE, 0x9C, 0x82, 0xC3, 0xEC, 0x9C, 0x83, /* 0x38-0x3B */ + 0x9C, 0x84, 0x9C, 0x85, 0x9C, 0x86, 0x9C, 0x87, /* 0x3C-0x3F */ + 0x9C, 0x88, 0x9C, 0x89, 0x9C, 0x8A, 0xC5, 0xC8, /* 0x40-0x43 */ + 0xE4, 0xD8, 0x9C, 0x8B, 0x9C, 0x8C, 0x9C, 0x8D, /* 0x44-0x47 */ + 0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, 0x9C, 0x91, /* 0x48-0x4B */ + 0x9C, 0x92, 0xCD, 0xC4, 0xE4, 0xCF, 0x9C, 0x93, /* 0x4C-0x4F */ + 0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, 0xE4, 0xD4, /* 0x50-0x53 */ + 0xE4, 0xD5, 0x9C, 0x97, 0xBA, 0xFE, 0x9C, 0x98, /* 0x54-0x57 */ + 0xCF, 0xE6, 0x9C, 0x99, 0x9C, 0x9A, 0xD5, 0xBF, /* 0x58-0x5B */ + 0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, 0xE4, 0xD2, /* 0x5C-0x5F */ + 0x9C, 0x9E, 0x9C, 0x9F, 0x9C, 0xA0, 0x9C, 0xA1, /* 0x60-0x63 */ + 0x9C, 0xA2, 0x9C, 0xA3, 0x9C, 0xA4, 0x9C, 0xA5, /* 0x64-0x67 */ + 0x9C, 0xA6, 0x9C, 0xA7, 0x9C, 0xA8, 0xE4, 0xD0, /* 0x68-0x6B */ + 0x9C, 0xA9, 0x9C, 0xAA, 0xE4, 0xCE, 0x9C, 0xAB, /* 0x6C-0x6F */ + 0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, 0x9C, 0xAF, /* 0x70-0x73 */ + 0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, 0x9C, 0xB3, /* 0x74-0x77 */ + 0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, 0x9C, 0xB7, /* 0x78-0x7B */ + 0x9C, 0xB8, 0x9C, 0xB9, 0xCD, 0xE5, 0xCA, 0xAA, /* 0x7C-0x7F */ + + 0x9C, 0xBA, 0x9C, 0xBB, 0x9C, 0xBC, 0xC0, 0xA3, /* 0x80-0x83 */ + 0x9C, 0xBD, 0xBD, 0xA6, 0xE4, 0xD3, 0x9C, 0xBE, /* 0x84-0x87 */ + 0x9C, 0xBF, 0xB8, 0xC8, 0x9C, 0xC0, 0x9C, 0xC1, /* 0x88-0x8B */ + 0x9C, 0xC2, 0x9C, 0xC3, 0x9C, 0xC4, 0xE4, 0xE7, /* 0x8C-0x8F */ + 0xD4, 0xB4, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x90-0x93 */ + 0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, 0x9C, 0xCB, /* 0x94-0x97 */ + 0xE4, 0xDB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x98-0x9B */ + 0xC1, 0xEF, 0x9C, 0xCF, 0x9C, 0xD0, 0xE4, 0xE9, /* 0x9C-0x9F */ + 0x9C, 0xD1, 0x9C, 0xD2, 0xD2, 0xE7, 0x9C, 0xD3, /* 0xA0-0xA3 */ + 0x9C, 0xD4, 0xE4, 0xDF, 0x9C, 0xD5, 0xE4, 0xE0, /* 0xA4-0xA7 */ + 0x9C, 0xD6, 0x9C, 0xD7, 0xCF, 0xAA, 0x9C, 0xD8, /* 0xA8-0xAB */ + 0x9C, 0xD9, 0x9C, 0xDA, 0x9C, 0xDB, 0xCB, 0xDD, /* 0xAC-0xAF */ + 0x9C, 0xDC, 0xE4, 0xDA, 0xE4, 0xD1, 0x9C, 0xDD, /* 0xB0-0xB3 */ + 0xE4, 0xE5, 0x9C, 0xDE, 0xC8, 0xDC, 0xE4, 0xE3, /* 0xB4-0xB7 */ + 0x9C, 0xDF, 0x9C, 0xE0, 0xC4, 0xE7, 0xE4, 0xE2, /* 0xB8-0xBB */ + 0x9C, 0xE1, 0xE4, 0xE1, 0x9C, 0xE2, 0x9C, 0xE3, /* 0xBC-0xBF */ + 0x9C, 0xE4, 0xB3, 0xFC, 0xE4, 0xE8, 0x9C, 0xE5, /* 0xC0-0xC3 */ + 0x9C, 0xE6, 0x9C, 0xE7, 0x9C, 0xE8, 0xB5, 0xE1, /* 0xC4-0xC7 */ + 0x9C, 0xE9, 0x9C, 0xEA, 0x9C, 0xEB, 0xD7, 0xCC, /* 0xC8-0xCB */ + 0x9C, 0xEC, 0x9C, 0xED, 0x9C, 0xEE, 0xE4, 0xE6, /* 0xCC-0xCF */ + 0x9C, 0xEF, 0xBB, 0xAC, 0x9C, 0xF0, 0xD7, 0xD2, /* 0xD0-0xD3 */ + 0xCC, 0xCF, 0xEB, 0xF8, 0x9C, 0xF1, 0xE4, 0xE4, /* 0xD4-0xD7 */ + 0x9C, 0xF2, 0x9C, 0xF3, 0xB9, 0xF6, 0x9C, 0xF4, /* 0xD8-0xDB */ + 0x9C, 0xF5, 0x9C, 0xF6, 0xD6, 0xCD, 0xE4, 0xD9, /* 0xDC-0xDF */ + 0xE4, 0xDC, 0xC2, 0xFA, 0xE4, 0xDE, 0x9C, 0xF7, /* 0xE0-0xE3 */ + 0xC2, 0xCB, 0xC0, 0xC4, 0xC2, 0xD0, 0x9C, 0xF8, /* 0xE4-0xE7 */ + 0xB1, 0xF5, 0xCC, 0xB2, 0x9C, 0xF9, 0x9C, 0xFA, /* 0xE8-0xEB */ + 0x9C, 0xFB, 0x9C, 0xFC, 0x9C, 0xFD, 0x9C, 0xFE, /* 0xEC-0xEF */ + 0x9D, 0x40, 0x9D, 0x41, 0x9D, 0x42, 0x9D, 0x43, /* 0xF0-0xF3 */ + 0xB5, 0xCE, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xF4-0xF7 */ + 0x9D, 0x47, 0xE4, 0xEF, 0x9D, 0x48, 0x9D, 0x49, /* 0xF8-0xFB */ + 0x9D, 0x4A, 0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0x9D, 0x4E, 0x9D, 0x4F, 0xC6, 0xAF, 0x9D, 0x50, /* 0x00-0x03 */ + 0x9D, 0x51, 0x9D, 0x52, 0xC6, 0xE1, 0x9D, 0x53, /* 0x04-0x07 */ + 0x9D, 0x54, 0xE4, 0xF5, 0x9D, 0x55, 0x9D, 0x56, /* 0x08-0x0B */ + 0x9D, 0x57, 0x9D, 0x58, 0x9D, 0x59, 0xC2, 0xA9, /* 0x0C-0x0F */ + 0x9D, 0x5A, 0x9D, 0x5B, 0x9D, 0x5C, 0xC0, 0xEC, /* 0x10-0x13 */ + 0xD1, 0xDD, 0xE4, 0xEE, 0x9D, 0x5D, 0x9D, 0x5E, /* 0x14-0x17 */ + 0x9D, 0x5F, 0x9D, 0x60, 0x9D, 0x61, 0x9D, 0x62, /* 0x18-0x1B */ + 0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0x1C-0x1F */ + 0xC4, 0xAE, 0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, /* 0x20-0x23 */ + 0xE4, 0xED, 0x9D, 0x6A, 0x9D, 0x6B, 0x9D, 0x6C, /* 0x24-0x27 */ + 0x9D, 0x6D, 0xE4, 0xF6, 0xE4, 0xF4, 0xC2, 0xFE, /* 0x28-0x2B */ + 0x9D, 0x6E, 0xE4, 0xDD, 0x9D, 0x6F, 0xE4, 0xF0, /* 0x2C-0x2F */ + 0x9D, 0x70, 0xCA, 0xFE, 0x9D, 0x71, 0xD5, 0xC4, /* 0x30-0x33 */ + 0x9D, 0x72, 0x9D, 0x73, 0xE4, 0xF1, 0x9D, 0x74, /* 0x34-0x37 */ + 0x9D, 0x75, 0x9D, 0x76, 0x9D, 0x77, 0x9D, 0x78, /* 0x38-0x3B */ + 0x9D, 0x79, 0x9D, 0x7A, 0xD1, 0xFA, 0x9D, 0x7B, /* 0x3C-0x3F */ + 0x9D, 0x7C, 0x9D, 0x7D, 0x9D, 0x7E, 0x9D, 0x80, /* 0x40-0x43 */ + 0x9D, 0x81, 0x9D, 0x82, 0xE4, 0xEB, 0xE4, 0xEC, /* 0x44-0x47 */ + 0x9D, 0x83, 0x9D, 0x84, 0x9D, 0x85, 0xE4, 0xF2, /* 0x48-0x4B */ + 0x9D, 0x86, 0xCE, 0xAB, 0x9D, 0x87, 0x9D, 0x88, /* 0x4C-0x4F */ + 0x9D, 0x89, 0x9D, 0x8A, 0x9D, 0x8B, 0x9D, 0x8C, /* 0x50-0x53 */ + 0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, 0x9D, 0x90, /* 0x54-0x57 */ + 0xC5, 0xCB, 0x9D, 0x91, 0x9D, 0x92, 0x9D, 0x93, /* 0x58-0x5B */ + 0xC7, 0xB1, 0x9D, 0x94, 0xC2, 0xBA, 0x9D, 0x95, /* 0x5C-0x5F */ + 0x9D, 0x96, 0x9D, 0x97, 0xE4, 0xEA, 0x9D, 0x98, /* 0x60-0x63 */ + 0x9D, 0x99, 0x9D, 0x9A, 0xC1, 0xCA, 0x9D, 0x9B, /* 0x64-0x67 */ + 0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x68-0x6B */ + 0x9D, 0xA0, 0xCC, 0xB6, 0xB3, 0xB1, 0x9D, 0xA1, /* 0x6C-0x6F */ + 0x9D, 0xA2, 0x9D, 0xA3, 0xE4, 0xFB, 0x9D, 0xA4, /* 0x70-0x73 */ + 0xE4, 0xF3, 0x9D, 0xA5, 0x9D, 0xA6, 0x9D, 0xA7, /* 0x74-0x77 */ + 0xE4, 0xFA, 0x9D, 0xA8, 0xE4, 0xFD, 0x9D, 0xA9, /* 0x78-0x7B */ + 0xE4, 0xFC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x7C-0x7F */ + + 0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x80-0x83 */ + 0xB3, 0xCE, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x84-0x87 */ + 0xB3, 0xBA, 0xE4, 0xF7, 0x9D, 0xB4, 0x9D, 0xB5, /* 0x88-0x8B */ + 0xE4, 0xF9, 0xE4, 0xF8, 0xC5, 0xEC, 0x9D, 0xB6, /* 0x8C-0x8F */ + 0x9D, 0xB7, 0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, /* 0x90-0x93 */ + 0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBD, 0x9D, 0xBE, /* 0x94-0x97 */ + 0x9D, 0xBF, 0x9D, 0xC0, 0x9D, 0xC1, 0x9D, 0xC2, /* 0x98-0x9B */ + 0xC0, 0xBD, 0x9D, 0xC3, 0x9D, 0xC4, 0x9D, 0xC5, /* 0x9C-0x9F */ + 0x9D, 0xC6, 0xD4, 0xE8, 0x9D, 0xC7, 0x9D, 0xC8, /* 0xA0-0xA3 */ + 0x9D, 0xC9, 0x9D, 0xCA, 0x9D, 0xCB, 0xE5, 0xA2, /* 0xA4-0xA7 */ + 0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0xA8-0xAB */ + 0x9D, 0xD0, 0x9D, 0xD1, 0x9D, 0xD2, 0x9D, 0xD3, /* 0xAC-0xAF */ + 0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xB0, 0xC4, /* 0xB0-0xB3 */ + 0x9D, 0xD7, 0x9D, 0xD8, 0xE5, 0xA4, 0x9D, 0xD9, /* 0xB4-0xB7 */ + 0x9D, 0xDA, 0xE5, 0xA3, 0x9D, 0xDB, 0x9D, 0xDC, /* 0xB8-0xBB */ + 0x9D, 0xDD, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0xBC-0xBF */ + 0xBC, 0xA4, 0x9D, 0xE1, 0xE5, 0xA5, 0x9D, 0xE2, /* 0xC0-0xC3 */ + 0x9D, 0xE3, 0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, /* 0xC4-0xC7 */ + 0x9D, 0xE7, 0xE5, 0xA1, 0x9D, 0xE8, 0x9D, 0xE9, /* 0xC8-0xCB */ + 0x9D, 0xEA, 0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, /* 0xCC-0xCF */ + 0x9D, 0xEE, 0xE4, 0xFE, 0xB1, 0xF4, 0x9D, 0xEF, /* 0xD0-0xD3 */ + 0x9D, 0xF0, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0xD4-0xD7 */ + 0x9D, 0xF4, 0x9D, 0xF5, 0x9D, 0xF6, 0x9D, 0xF7, /* 0xD8-0xDB */ + 0x9D, 0xF8, 0x9D, 0xF9, 0xE5, 0xA8, 0x9D, 0xFA, /* 0xDC-0xDF */ + 0xE5, 0xA9, 0xE5, 0xA6, 0x9D, 0xFB, 0x9D, 0xFC, /* 0xE0-0xE3 */ + 0x9D, 0xFD, 0x9D, 0xFE, 0x9E, 0x40, 0x9E, 0x41, /* 0xE4-0xE7 */ + 0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, 0x9E, 0x45, /* 0xE8-0xEB */ + 0x9E, 0x46, 0x9E, 0x47, 0xE5, 0xA7, 0xE5, 0xAA, /* 0xEC-0xEF */ + 0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, 0x9E, 0x4B, /* 0xF0-0xF3 */ + 0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, 0x9E, 0x4F, /* 0xF4-0xF7 */ + 0x9E, 0x50, 0x9E, 0x51, 0x9E, 0x52, 0x9E, 0x53, /* 0xF8-0xFB */ + 0x9E, 0x54, 0x9E, 0x55, 0x9E, 0x56, 0x9E, 0x57, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x9E, 0x58, 0x9E, 0x59, 0x9E, 0x5A, 0x9E, 0x5B, /* 0x00-0x03 */ + 0x9E, 0x5C, 0x9E, 0x5D, 0x9E, 0x5E, 0x9E, 0x5F, /* 0x04-0x07 */ + 0x9E, 0x60, 0x9E, 0x61, 0x9E, 0x62, 0x9E, 0x63, /* 0x08-0x0B */ + 0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0x0C-0x0F */ + 0x9E, 0x68, 0xC6, 0xD9, 0x9E, 0x69, 0x9E, 0x6A, /* 0x10-0x13 */ + 0x9E, 0x6B, 0x9E, 0x6C, 0x9E, 0x6D, 0x9E, 0x6E, /* 0x14-0x17 */ + 0x9E, 0x6F, 0x9E, 0x70, 0xE5, 0xAB, 0xE5, 0xAD, /* 0x18-0x1B */ + 0x9E, 0x71, 0x9E, 0x72, 0x9E, 0x73, 0x9E, 0x74, /* 0x1C-0x1F */ + 0x9E, 0x75, 0x9E, 0x76, 0x9E, 0x77, 0xE5, 0xAC, /* 0x20-0x23 */ + 0x9E, 0x78, 0x9E, 0x79, 0x9E, 0x7A, 0x9E, 0x7B, /* 0x24-0x27 */ + 0x9E, 0x7C, 0x9E, 0x7D, 0x9E, 0x7E, 0x9E, 0x80, /* 0x28-0x2B */ + 0x9E, 0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0x2C-0x2F */ + 0x9E, 0x85, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0x30-0x33 */ + 0x9E, 0x89, 0xE5, 0xAF, 0x9E, 0x8A, 0x9E, 0x8B, /* 0x34-0x37 */ + 0x9E, 0x8C, 0xE5, 0xAE, 0x9E, 0x8D, 0x9E, 0x8E, /* 0x38-0x3B */ + 0x9E, 0x8F, 0x9E, 0x90, 0x9E, 0x91, 0x9E, 0x92, /* 0x3C-0x3F */ + 0x9E, 0x93, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x40-0x43 */ + 0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, 0x9E, 0x9A, /* 0x44-0x47 */ + 0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, 0x9E, 0x9E, /* 0x48-0x4B */ + 0xB9, 0xE0, 0x9E, 0x9F, 0x9E, 0xA0, 0xE5, 0xB0, /* 0x4C-0x4F */ + 0x9E, 0xA1, 0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, /* 0x50-0x53 */ + 0x9E, 0xA5, 0x9E, 0xA6, 0x9E, 0xA7, 0x9E, 0xA8, /* 0x54-0x57 */ + 0x9E, 0xA9, 0x9E, 0xAA, 0x9E, 0xAB, 0x9E, 0xAC, /* 0x58-0x5B */ + 0x9E, 0xAD, 0x9E, 0xAE, 0xE5, 0xB1, 0x9E, 0xAF, /* 0x5C-0x5F */ + 0x9E, 0xB0, 0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, /* 0x60-0x63 */ + 0x9E, 0xB4, 0x9E, 0xB5, 0x9E, 0xB6, 0x9E, 0xB7, /* 0x64-0x67 */ + 0x9E, 0xB8, 0x9E, 0xB9, 0x9E, 0xBA, 0xBB, 0xF0, /* 0x68-0x6B */ + 0xEC, 0xE1, 0xC3, 0xF0, 0x9E, 0xBB, 0xB5, 0xC6, /* 0x6C-0x6F */ + 0xBB, 0xD2, 0x9E, 0xBC, 0x9E, 0xBD, 0x9E, 0xBE, /* 0x70-0x73 */ + 0x9E, 0xBF, 0xC1, 0xE9, 0xD4, 0xEE, 0x9E, 0xC0, /* 0x74-0x77 */ + 0xBE, 0xC4, 0x9E, 0xC1, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x78-0x7B */ + 0xD7, 0xC6, 0x9E, 0xC4, 0xD4, 0xD6, 0xB2, 0xD3, /* 0x7C-0x7F */ + + 0xEC, 0xBE, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x80-0x83 */ + 0x9E, 0xC8, 0xEA, 0xC1, 0x9E, 0xC9, 0x9E, 0xCA, /* 0x84-0x87 */ + 0x9E, 0xCB, 0xC2, 0xAF, 0xB4, 0xB6, 0x9E, 0xCC, /* 0x88-0x8B */ + 0x9E, 0xCD, 0x9E, 0xCE, 0xD1, 0xD7, 0x9E, 0xCF, /* 0x8C-0x8F */ + 0x9E, 0xD0, 0x9E, 0xD1, 0xB3, 0xB4, 0x9E, 0xD2, /* 0x90-0x93 */ + 0xC8, 0xB2, 0xBF, 0xBB, 0xEC, 0xC0, 0x9E, 0xD3, /* 0x94-0x97 */ + 0x9E, 0xD4, 0xD6, 0xCB, 0x9E, 0xD5, 0x9E, 0xD6, /* 0x98-0x9B */ + 0xEC, 0xBF, 0xEC, 0xC1, 0x9E, 0xD7, 0x9E, 0xD8, /* 0x9C-0x9F */ + 0x9E, 0xD9, 0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, /* 0xA0-0xA3 */ + 0x9E, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, 0x9E, 0xE0, /* 0xA4-0xA7 */ + 0x9E, 0xE1, 0x9E, 0xE2, 0x9E, 0xE3, 0xEC, 0xC5, /* 0xA8-0xAB */ + 0xBE, 0xE6, 0xCC, 0xBF, 0xC5, 0xDA, 0xBE, 0xBC, /* 0xAC-0xAF */ + 0x9E, 0xE4, 0xEC, 0xC6, 0x9E, 0xE5, 0xB1, 0xFE, /* 0xB0-0xB3 */ + 0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0xEC, 0xC4, /* 0xB4-0xB7 */ + 0xD5, 0xA8, 0xB5, 0xE3, 0x9E, 0xE9, 0xEC, 0xC2, /* 0xB8-0xBB */ + 0xC1, 0xB6, 0xB3, 0xE3, 0x9E, 0xEA, 0x9E, 0xEB, /* 0xBC-0xBF */ + 0xEC, 0xC3, 0xCB, 0xB8, 0xC0, 0xC3, 0xCC, 0xFE, /* 0xC0-0xC3 */ + 0x9E, 0xEC, 0x9E, 0xED, 0x9E, 0xEE, 0x9E, 0xEF, /* 0xC4-0xC7 */ + 0xC1, 0xD2, 0x9E, 0xF0, 0xEC, 0xC8, 0x9E, 0xF1, /* 0xC8-0xCB */ + 0x9E, 0xF2, 0x9E, 0xF3, 0x9E, 0xF4, 0x9E, 0xF5, /* 0xCC-0xCF */ + 0x9E, 0xF6, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0xD0-0xD3 */ + 0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xD4-0xD7 */ + 0xBA, 0xE6, 0xC0, 0xD3, 0x9E, 0xFE, 0xD6, 0xF2, /* 0xD8-0xDB */ + 0x9F, 0x40, 0x9F, 0x41, 0x9F, 0x42, 0xD1, 0xCC, /* 0xDC-0xDF */ + 0x9F, 0x43, 0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, /* 0xE0-0xE3 */ + 0xBF, 0xBE, 0x9F, 0x47, 0xB7, 0xB3, 0xC9, 0xD5, /* 0xE4-0xE7 */ + 0xEC, 0xC7, 0xBB, 0xE2, 0x9F, 0x48, 0xCC, 0xCC, /* 0xE8-0xEB */ + 0xBD, 0xFD, 0xC8, 0xC8, 0x9F, 0x49, 0xCF, 0xA9, /* 0xEC-0xEF */ + 0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, 0x9F, 0x4D, /* 0xF0-0xF3 */ + 0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0xCD, 0xE9, /* 0xF4-0xF7 */ + 0x9F, 0x51, 0xC5, 0xEB, 0x9F, 0x52, 0x9F, 0x53, /* 0xF8-0xFB */ + 0x9F, 0x54, 0xB7, 0xE9, 0x9F, 0x55, 0x9F, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x9F, 0x57, 0x9F, 0x58, 0x9F, 0x59, 0x9F, 0x5A, /* 0x00-0x03 */ + 0x9F, 0x5B, 0x9F, 0x5C, 0x9F, 0x5D, 0x9F, 0x5E, /* 0x04-0x07 */ + 0x9F, 0x5F, 0xD1, 0xC9, 0xBA, 0xB8, 0x9F, 0x60, /* 0x08-0x0B */ + 0x9F, 0x61, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0x0C-0x0F */ + 0xEC, 0xC9, 0x9F, 0x65, 0x9F, 0x66, 0xEC, 0xCA, /* 0x10-0x13 */ + 0x9F, 0x67, 0xBB, 0xC0, 0xEC, 0xCB, 0x9F, 0x68, /* 0x14-0x17 */ + 0xEC, 0xE2, 0xB1, 0xBA, 0xB7, 0xD9, 0x9F, 0x69, /* 0x18-0x1B */ + 0x9F, 0x6A, 0x9F, 0x6B, 0x9F, 0x6C, 0x9F, 0x6D, /* 0x1C-0x1F */ + 0x9F, 0x6E, 0x9F, 0x6F, 0x9F, 0x70, 0x9F, 0x71, /* 0x20-0x23 */ + 0x9F, 0x72, 0x9F, 0x73, 0xBD, 0xB9, 0x9F, 0x74, /* 0x24-0x27 */ + 0x9F, 0x75, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0x28-0x2B */ + 0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x7B, 0xEC, 0xCC, /* 0x2C-0x2F */ + 0xD1, 0xE6, 0xEC, 0xCD, 0x9F, 0x7C, 0x9F, 0x7D, /* 0x30-0x33 */ + 0x9F, 0x7E, 0x9F, 0x80, 0xC8, 0xBB, 0x9F, 0x81, /* 0x34-0x37 */ + 0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0x38-0x3B */ + 0x9F, 0x86, 0x9F, 0x87, 0x9F, 0x88, 0x9F, 0x89, /* 0x3C-0x3F */ + 0x9F, 0x8A, 0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, /* 0x40-0x43 */ + 0x9F, 0x8E, 0xEC, 0xD1, 0x9F, 0x8F, 0x9F, 0x90, /* 0x44-0x47 */ + 0x9F, 0x91, 0x9F, 0x92, 0xEC, 0xD3, 0x9F, 0x93, /* 0x48-0x4B */ + 0xBB, 0xCD, 0x9F, 0x94, 0xBC, 0xE5, 0x9F, 0x95, /* 0x4C-0x4F */ + 0x9F, 0x96, 0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, /* 0x50-0x53 */ + 0x9F, 0x9A, 0x9F, 0x9B, 0x9F, 0x9C, 0x9F, 0x9D, /* 0x54-0x57 */ + 0x9F, 0x9E, 0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, /* 0x58-0x5B */ + 0xEC, 0xCF, 0x9F, 0xA2, 0xC9, 0xB7, 0x9F, 0xA3, /* 0x5C-0x5F */ + 0x9F, 0xA4, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x60-0x63 */ + 0xC3, 0xBA, 0x9F, 0xA8, 0xEC, 0xE3, 0xD5, 0xD5, /* 0x64-0x67 */ + 0xEC, 0xD0, 0x9F, 0xA9, 0x9F, 0xAA, 0x9F, 0xAB, /* 0x68-0x6B */ + 0x9F, 0xAC, 0x9F, 0xAD, 0xD6, 0xF3, 0x9F, 0xAE, /* 0x6C-0x6F */ + 0x9F, 0xAF, 0x9F, 0xB0, 0xEC, 0xD2, 0xEC, 0xCE, /* 0x70-0x73 */ + 0x9F, 0xB1, 0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, /* 0x74-0x77 */ + 0xEC, 0xD4, 0x9F, 0xB5, 0xEC, 0xD5, 0x9F, 0xB6, /* 0x78-0x7B */ + 0x9F, 0xB7, 0xC9, 0xBF, 0x9F, 0xB8, 0x9F, 0xB9, /* 0x7C-0x7F */ + + 0x9F, 0xBA, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x80-0x83 */ + 0xCF, 0xA8, 0x9F, 0xBE, 0x9F, 0xBF, 0x9F, 0xC0, /* 0x84-0x87 */ + 0x9F, 0xC1, 0x9F, 0xC2, 0xD0, 0xDC, 0x9F, 0xC3, /* 0x88-0x8B */ + 0x9F, 0xC4, 0x9F, 0xC5, 0x9F, 0xC6, 0xD1, 0xAC, /* 0x8C-0x8F */ + 0x9F, 0xC7, 0x9F, 0xC8, 0x9F, 0xC9, 0x9F, 0xCA, /* 0x90-0x93 */ + 0xC8, 0xDB, 0x9F, 0xCB, 0x9F, 0xCC, 0x9F, 0xCD, /* 0x94-0x97 */ + 0xEC, 0xD6, 0xCE, 0xF5, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x98-0x9B */ + 0x9F, 0xD0, 0x9F, 0xD1, 0x9F, 0xD2, 0xCA, 0xEC, /* 0x9C-0x9F */ + 0xEC, 0xDA, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0xA0-0xA3 */ + 0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0xA4-0xA7 */ + 0xEC, 0xD9, 0x9F, 0xDA, 0x9F, 0xDB, 0x9F, 0xDC, /* 0xA8-0xAB */ + 0xB0, 0xBE, 0x9F, 0xDD, 0x9F, 0xDE, 0x9F, 0xDF, /* 0xAC-0xAF */ + 0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xEC, 0xD7, /* 0xB0-0xB3 */ + 0x9F, 0xE3, 0xEC, 0xD8, 0x9F, 0xE4, 0x9F, 0xE5, /* 0xB4-0xB7 */ + 0x9F, 0xE6, 0xEC, 0xE4, 0x9F, 0xE7, 0x9F, 0xE8, /* 0xB8-0xBB */ + 0x9F, 0xE9, 0x9F, 0xEA, 0x9F, 0xEB, 0x9F, 0xEC, /* 0xBC-0xBF */ + 0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0xC8, 0xBC, /* 0xC0-0xC3 */ + 0x9F, 0xF0, 0x9F, 0xF1, 0x9F, 0xF2, 0x9F, 0xF3, /* 0xC4-0xC7 */ + 0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, 0x9F, 0xF7, /* 0xC8-0xCB */ + 0x9F, 0xF8, 0x9F, 0xF9, 0xC1, 0xC7, 0x9F, 0xFA, /* 0xCC-0xCF */ + 0x9F, 0xFB, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xD0-0xD3 */ + 0xEC, 0xDC, 0xD1, 0xE0, 0xA0, 0x40, 0xA0, 0x41, /* 0xD4-0xD7 */ + 0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, 0xA0, 0x45, /* 0xD8-0xDB */ + 0xA0, 0x46, 0xA0, 0x47, 0xA0, 0x48, 0xA0, 0x49, /* 0xDC-0xDF */ + 0xEC, 0xDB, 0xA0, 0x4A, 0xA0, 0x4B, 0xA0, 0x4C, /* 0xE0-0xE3 */ + 0xA0, 0x4D, 0xD4, 0xEF, 0xA0, 0x4E, 0xEC, 0xDD, /* 0xE4-0xE7 */ + 0xA0, 0x4F, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xE8-0xEB */ + 0xA0, 0x53, 0xA0, 0x54, 0xDB, 0xC6, 0xA0, 0x55, /* 0xEC-0xEF */ + 0xA0, 0x56, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xF0-0xF3 */ + 0xA0, 0x5A, 0xA0, 0x5B, 0xA0, 0x5C, 0xA0, 0x5D, /* 0xF4-0xF7 */ + 0xA0, 0x5E, 0xEC, 0xDE, 0xA0, 0x5F, 0xA0, 0x60, /* 0xF8-0xFB */ + 0xA0, 0x61, 0xA0, 0x62, 0xA0, 0x63, 0xA0, 0x64, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0xA0, 0x65, 0xA0, 0x66, 0xA0, 0x67, 0xA0, 0x68, /* 0x00-0x03 */ + 0xA0, 0x69, 0xA0, 0x6A, 0xB1, 0xAC, 0xA0, 0x6B, /* 0x04-0x07 */ + 0xA0, 0x6C, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0x08-0x0B */ + 0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0x0C-0x0F */ + 0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0x10-0x13 */ + 0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x7B, /* 0x14-0x17 */ + 0xA0, 0x7C, 0xA0, 0x7D, 0xA0, 0x7E, 0xA0, 0x80, /* 0x18-0x1B */ + 0xA0, 0x81, 0xEC, 0xDF, 0xA0, 0x82, 0xA0, 0x83, /* 0x1C-0x1F */ + 0xA0, 0x84, 0xA0, 0x85, 0xA0, 0x86, 0xA0, 0x87, /* 0x20-0x23 */ + 0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, 0xA0, 0x8B, /* 0x24-0x27 */ + 0xEC, 0xE0, 0xA0, 0x8C, 0xD7, 0xA6, 0xA0, 0x8D, /* 0x28-0x2B */ + 0xC5, 0xC0, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x2C-0x2F */ + 0xEB, 0xBC, 0xB0, 0xAE, 0xA0, 0x91, 0xA0, 0x92, /* 0x30-0x33 */ + 0xA0, 0x93, 0xBE, 0xF4, 0xB8, 0xB8, 0xD2, 0xAF, /* 0x34-0x37 */ + 0xB0, 0xD6, 0xB5, 0xF9, 0xA0, 0x94, 0xD8, 0xB3, /* 0x38-0x3B */ + 0xA0, 0x95, 0xCB, 0xAC, 0xA0, 0x96, 0xE3, 0xDD, /* 0x3C-0x3F */ + 0xA0, 0x97, 0xA0, 0x98, 0xA0, 0x99, 0xA0, 0x9A, /* 0x40-0x43 */ + 0xA0, 0x9B, 0xA0, 0x9C, 0xA0, 0x9D, 0xC6, 0xAC, /* 0x44-0x47 */ + 0xB0, 0xE6, 0xA0, 0x9E, 0xA0, 0x9F, 0xA0, 0xA0, /* 0x48-0x4B */ + 0xC5, 0xC6, 0xEB, 0xB9, 0xA0, 0xA1, 0xA0, 0xA2, /* 0x4C-0x4F */ + 0xA0, 0xA3, 0xA0, 0xA4, 0xEB, 0xBA, 0xA0, 0xA5, /* 0x50-0x53 */ + 0xA0, 0xA6, 0xA0, 0xA7, 0xEB, 0xBB, 0xA0, 0xA8, /* 0x54-0x57 */ + 0xA0, 0xA9, 0xD1, 0xC0, 0xA0, 0xAA, 0xC5, 0xA3, /* 0x58-0x5B */ + 0xA0, 0xAB, 0xEA, 0xF2, 0xA0, 0xAC, 0xC4, 0xB2, /* 0x5C-0x5F */ + 0xA0, 0xAD, 0xC4, 0xB5, 0xC0, 0xCE, 0xA0, 0xAE, /* 0x60-0x63 */ + 0xA0, 0xAF, 0xA0, 0xB0, 0xEA, 0xF3, 0xC4, 0xC1, /* 0x64-0x67 */ + 0xA0, 0xB1, 0xCE, 0xEF, 0xA0, 0xB2, 0xA0, 0xB3, /* 0x68-0x6B */ + 0xA0, 0xB4, 0xA0, 0xB5, 0xEA, 0xF0, 0xEA, 0xF4, /* 0x6C-0x6F */ + 0xA0, 0xB6, 0xA0, 0xB7, 0xC9, 0xFC, 0xA0, 0xB8, /* 0x70-0x73 */ + 0xA0, 0xB9, 0xC7, 0xA3, 0xA0, 0xBA, 0xA0, 0xBB, /* 0x74-0x77 */ + 0xA0, 0xBC, 0xCC, 0xD8, 0xCE, 0xFE, 0xA0, 0xBD, /* 0x78-0x7B */ + 0xA0, 0xBE, 0xA0, 0xBF, 0xEA, 0xF5, 0xEA, 0xF6, /* 0x7C-0x7F */ + + 0xCF, 0xAC, 0xC0, 0xE7, 0xA0, 0xC0, 0xA0, 0xC1, /* 0x80-0x83 */ + 0xEA, 0xF7, 0xA0, 0xC2, 0xA0, 0xC3, 0xA0, 0xC4, /* 0x84-0x87 */ + 0xA0, 0xC5, 0xA0, 0xC6, 0xB6, 0xBF, 0xEA, 0xF8, /* 0x88-0x8B */ + 0xA0, 0xC7, 0xEA, 0xF9, 0xA0, 0xC8, 0xEA, 0xFA, /* 0x8C-0x8F */ + 0xA0, 0xC9, 0xA0, 0xCA, 0xEA, 0xFB, 0xA0, 0xCB, /* 0x90-0x93 */ + 0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x94-0x97 */ + 0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x98-0x9B */ + 0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xEA, 0xF1, /* 0x9C-0x9F */ + 0xA0, 0xD7, 0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, /* 0xA0-0xA3 */ + 0xA0, 0xDB, 0xA0, 0xDC, 0xA0, 0xDD, 0xA0, 0xDE, /* 0xA4-0xA7 */ + 0xA0, 0xDF, 0xA0, 0xE0, 0xA0, 0xE1, 0xA0, 0xE2, /* 0xA8-0xAB */ + 0xC8, 0xAE, 0xE1, 0xEB, 0xA0, 0xE3, 0xB7, 0xB8, /* 0xAC-0xAF */ + 0xE1, 0xEC, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0xB0-0xB3 */ + 0xE1, 0xED, 0xA0, 0xE7, 0xD7, 0xB4, 0xE1, 0xEE, /* 0xB4-0xB7 */ + 0xE1, 0xEF, 0xD3, 0xCC, 0xA0, 0xE8, 0xA0, 0xE9, /* 0xB8-0xBB */ + 0xA0, 0xEA, 0xA0, 0xEB, 0xA0, 0xEC, 0xA0, 0xED, /* 0xBC-0xBF */ + 0xA0, 0xEE, 0xE1, 0xF1, 0xBF, 0xF1, 0xE1, 0xF0, /* 0xC0-0xC3 */ + 0xB5, 0xD2, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0xC4-0xC7 */ + 0xB1, 0xB7, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0xC8-0xCB */ + 0xA0, 0xF5, 0xE1, 0xF3, 0xE1, 0xF2, 0xA0, 0xF6, /* 0xCC-0xCF */ + 0xBA, 0xFC, 0xA0, 0xF7, 0xE1, 0xF4, 0xA0, 0xF8, /* 0xD0-0xD3 */ + 0xA0, 0xF9, 0xA0, 0xFA, 0xA0, 0xFB, 0xB9, 0xB7, /* 0xD4-0xD7 */ + 0xA0, 0xFC, 0xBE, 0xD1, 0xA0, 0xFD, 0xA0, 0xFE, /* 0xD8-0xDB */ + 0xAA, 0x40, 0xAA, 0x41, 0xC4, 0xFC, 0xAA, 0x42, /* 0xDC-0xDF */ + 0xBA, 0xDD, 0xBD, 0xC6, 0xAA, 0x43, 0xAA, 0x44, /* 0xE0-0xE3 */ + 0xAA, 0x45, 0xAA, 0x46, 0xAA, 0x47, 0xAA, 0x48, /* 0xE4-0xE7 */ + 0xE1, 0xF5, 0xE1, 0xF7, 0xAA, 0x49, 0xAA, 0x4A, /* 0xE8-0xEB */ + 0xB6, 0xC0, 0xCF, 0xC1, 0xCA, 0xA8, 0xE1, 0xF6, /* 0xEC-0xEF */ + 0xD5, 0xF8, 0xD3, 0xFC, 0xE1, 0xF8, 0xE1, 0xFC, /* 0xF0-0xF3 */ + 0xE1, 0xF9, 0xAA, 0x4B, 0xAA, 0x4C, 0xE1, 0xFA, /* 0xF4-0xF7 */ + 0xC0, 0xEA, 0xAA, 0x4D, 0xE1, 0xFE, 0xE2, 0xA1, /* 0xF8-0xFB */ + 0xC0, 0xC7, 0xAA, 0x4E, 0xAA, 0x4F, 0xAA, 0x50, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0xAA, 0x51, 0xE1, 0xFB, 0xAA, 0x52, 0xE1, 0xFD, /* 0x00-0x03 */ + 0xAA, 0x53, 0xAA, 0x54, 0xAA, 0x55, 0xAA, 0x56, /* 0x04-0x07 */ + 0xAA, 0x57, 0xAA, 0x58, 0xE2, 0xA5, 0xAA, 0x59, /* 0x08-0x0B */ + 0xAA, 0x5A, 0xAA, 0x5B, 0xC1, 0xD4, 0xAA, 0x5C, /* 0x0C-0x0F */ + 0xAA, 0x5D, 0xAA, 0x5E, 0xAA, 0x5F, 0xE2, 0xA3, /* 0x10-0x13 */ + 0xAA, 0x60, 0xE2, 0xA8, 0xB2, 0xFE, 0xE2, 0xA2, /* 0x14-0x17 */ + 0xAA, 0x61, 0xAA, 0x62, 0xAA, 0x63, 0xC3, 0xCD, /* 0x18-0x1B */ + 0xB2, 0xC2, 0xE2, 0xA7, 0xE2, 0xA6, 0xAA, 0x64, /* 0x1C-0x1F */ + 0xAA, 0x65, 0xE2, 0xA4, 0xE2, 0xA9, 0xAA, 0x66, /* 0x20-0x23 */ + 0xAA, 0x67, 0xE2, 0xAB, 0xAA, 0x68, 0xAA, 0x69, /* 0x24-0x27 */ + 0xAA, 0x6A, 0xD0, 0xC9, 0xD6, 0xED, 0xC3, 0xA8, /* 0x28-0x2B */ + 0xE2, 0xAC, 0xAA, 0x6B, 0xCF, 0xD7, 0xAA, 0x6C, /* 0x2C-0x2F */ + 0xAA, 0x6D, 0xE2, 0xAE, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x30-0x33 */ + 0xBA, 0xEF, 0xAA, 0x70, 0xAA, 0x71, 0xE9, 0xE0, /* 0x34-0x37 */ + 0xE2, 0xAD, 0xE2, 0xAA, 0xAA, 0x72, 0xAA, 0x73, /* 0x38-0x3B */ + 0xAA, 0x74, 0xAA, 0x75, 0xBB, 0xAB, 0xD4, 0xB3, /* 0x3C-0x3F */ + 0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, 0xAA, 0x79, /* 0x40-0x43 */ + 0xAA, 0x7A, 0xAA, 0x7B, 0xAA, 0x7C, 0xAA, 0x7D, /* 0x44-0x47 */ + 0xAA, 0x7E, 0xAA, 0x80, 0xAA, 0x81, 0xAA, 0x82, /* 0x48-0x4B */ + 0xAA, 0x83, 0xE2, 0xB0, 0xAA, 0x84, 0xAA, 0x85, /* 0x4C-0x4F */ + 0xE2, 0xAF, 0xAA, 0x86, 0xE9, 0xE1, 0xAA, 0x87, /* 0x50-0x53 */ + 0xAA, 0x88, 0xAA, 0x89, 0xAA, 0x8A, 0xE2, 0xB1, /* 0x54-0x57 */ + 0xAA, 0x8B, 0xAA, 0x8C, 0xAA, 0x8D, 0xAA, 0x8E, /* 0x58-0x5B */ + 0xAA, 0x8F, 0xAA, 0x90, 0xAA, 0x91, 0xAA, 0x92, /* 0x5C-0x5F */ + 0xE2, 0xB2, 0xAA, 0x93, 0xAA, 0x94, 0xAA, 0x95, /* 0x60-0x63 */ + 0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, 0xAA, 0x99, /* 0x64-0x67 */ + 0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, 0xAA, 0x9D, /* 0x68-0x6B */ + 0xE2, 0xB3, 0xCC, 0xA1, 0xAA, 0x9E, 0xE2, 0xB4, /* 0x6C-0x6F */ + 0xAA, 0x9F, 0xAA, 0xA0, 0xAB, 0x40, 0xAB, 0x41, /* 0x70-0x73 */ + 0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, 0xAB, 0x45, /* 0x74-0x77 */ + 0xAB, 0x46, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x78-0x7B */ + 0xAB, 0x4A, 0xAB, 0x4B, 0xE2, 0xB5, 0xAB, 0x4C, /* 0x7C-0x7F */ + + 0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0x80-0x83 */ + 0xD0, 0xFE, 0xAB, 0x51, 0xAB, 0x52, 0xC2, 0xCA, /* 0x84-0x87 */ + 0xAB, 0x53, 0xD3, 0xF1, 0xAB, 0x54, 0xCD, 0xF5, /* 0x88-0x8B */ + 0xAB, 0x55, 0xAB, 0x56, 0xE7, 0xE0, 0xAB, 0x57, /* 0x8C-0x8F */ + 0xAB, 0x58, 0xE7, 0xE1, 0xAB, 0x59, 0xAB, 0x5A, /* 0x90-0x93 */ + 0xAB, 0x5B, 0xAB, 0x5C, 0xBE, 0xC1, 0xAB, 0x5D, /* 0x94-0x97 */ + 0xAB, 0x5E, 0xAB, 0x5F, 0xAB, 0x60, 0xC2, 0xEA, /* 0x98-0x9B */ + 0xAB, 0x61, 0xAB, 0x62, 0xAB, 0x63, 0xE7, 0xE4, /* 0x9C-0x9F */ + 0xAB, 0x64, 0xAB, 0x65, 0xE7, 0xE3, 0xAB, 0x66, /* 0xA0-0xA3 */ + 0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, 0xAB, 0x6A, /* 0xA4-0xA7 */ + 0xAB, 0x6B, 0xCD, 0xE6, 0xAB, 0x6C, 0xC3, 0xB5, /* 0xA8-0xAB */ + 0xAB, 0x6D, 0xAB, 0x6E, 0xE7, 0xE2, 0xBB, 0xB7, /* 0xAC-0xAF */ + 0xCF, 0xD6, 0xAB, 0x6F, 0xC1, 0xE1, 0xE7, 0xE9, /* 0xB0-0xB3 */ + 0xAB, 0x70, 0xAB, 0x71, 0xAB, 0x72, 0xE7, 0xE8, /* 0xB4-0xB7 */ + 0xAB, 0x73, 0xAB, 0x74, 0xE7, 0xF4, 0xB2, 0xA3, /* 0xB8-0xBB */ + 0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, 0xAB, 0x78, /* 0xBC-0xBF */ + 0xE7, 0xEA, 0xAB, 0x79, 0xE7, 0xE6, 0xAB, 0x7A, /* 0xC0-0xC3 */ + 0xAB, 0x7B, 0xAB, 0x7C, 0xAB, 0x7D, 0xAB, 0x7E, /* 0xC4-0xC7 */ + 0xE7, 0xEC, 0xE7, 0xEB, 0xC9, 0xBA, 0xAB, 0x80, /* 0xC8-0xCB */ + 0xAB, 0x81, 0xD5, 0xE4, 0xAB, 0x82, 0xE7, 0xE5, /* 0xCC-0xCF */ + 0xB7, 0xA9, 0xE7, 0xE7, 0xAB, 0x83, 0xAB, 0x84, /* 0xD0-0xD3 */ + 0xAB, 0x85, 0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, /* 0xD4-0xD7 */ + 0xAB, 0x89, 0xE7, 0xEE, 0xAB, 0x8A, 0xAB, 0x8B, /* 0xD8-0xDB */ + 0xAB, 0x8C, 0xAB, 0x8D, 0xE7, 0xF3, 0xAB, 0x8E, /* 0xDC-0xDF */ + 0xD6, 0xE9, 0xAB, 0x8F, 0xAB, 0x90, 0xAB, 0x91, /* 0xE0-0xE3 */ + 0xAB, 0x92, 0xE7, 0xED, 0xAB, 0x93, 0xE7, 0xF2, /* 0xE4-0xE7 */ + 0xAB, 0x94, 0xE7, 0xF1, 0xAB, 0x95, 0xAB, 0x96, /* 0xE8-0xEB */ + 0xAB, 0x97, 0xB0, 0xE0, 0xAB, 0x98, 0xAB, 0x99, /* 0xEC-0xEF */ + 0xAB, 0x9A, 0xAB, 0x9B, 0xE7, 0xF5, 0xAB, 0x9C, /* 0xF0-0xF3 */ + 0xAB, 0x9D, 0xAB, 0x9E, 0xAB, 0x9F, 0xAB, 0xA0, /* 0xF4-0xF7 */ + 0xAC, 0x40, 0xAC, 0x41, 0xAC, 0x42, 0xAC, 0x43, /* 0xF8-0xFB */ + 0xAC, 0x44, 0xAC, 0x45, 0xAC, 0x46, 0xAC, 0x47, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0xAC, 0x48, 0xAC, 0x49, 0xAC, 0x4A, 0xC7, 0xF2, /* 0x00-0x03 */ + 0xAC, 0x4B, 0xC0, 0xC5, 0xC0, 0xED, 0xAC, 0x4C, /* 0x04-0x07 */ + 0xAC, 0x4D, 0xC1, 0xF0, 0xE7, 0xF0, 0xAC, 0x4E, /* 0x08-0x0B */ + 0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, 0xE7, 0xF6, /* 0x0C-0x0F */ + 0xCB, 0xF6, 0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, /* 0x10-0x13 */ + 0xAC, 0x55, 0xAC, 0x56, 0xAC, 0x57, 0xAC, 0x58, /* 0x14-0x17 */ + 0xAC, 0x59, 0xAC, 0x5A, 0xE8, 0xA2, 0xE8, 0xA1, /* 0x18-0x1B */ + 0xAC, 0x5B, 0xAC, 0x5C, 0xAC, 0x5D, 0xAC, 0x5E, /* 0x1C-0x1F */ + 0xAC, 0x5F, 0xAC, 0x60, 0xD7, 0xC1, 0xAC, 0x61, /* 0x20-0x23 */ + 0xAC, 0x62, 0xE7, 0xFA, 0xE7, 0xF9, 0xAC, 0x63, /* 0x24-0x27 */ + 0xE7, 0xFB, 0xAC, 0x64, 0xE7, 0xF7, 0xAC, 0x65, /* 0x28-0x2B */ + 0xE7, 0xFE, 0xAC, 0x66, 0xE7, 0xFD, 0xAC, 0x67, /* 0x2C-0x2F */ + 0xE7, 0xFC, 0xAC, 0x68, 0xAC, 0x69, 0xC1, 0xD5, /* 0x30-0x33 */ + 0xC7, 0xD9, 0xC5, 0xFD, 0xC5, 0xC3, 0xAC, 0x6A, /* 0x34-0x37 */ + 0xAC, 0x6B, 0xAC, 0x6C, 0xAC, 0x6D, 0xAC, 0x6E, /* 0x38-0x3B */ + 0xC7, 0xED, 0xAC, 0x6F, 0xAC, 0x70, 0xAC, 0x71, /* 0x3C-0x3F */ + 0xAC, 0x72, 0xE8, 0xA3, 0xAC, 0x73, 0xAC, 0x74, /* 0x40-0x43 */ + 0xAC, 0x75, 0xAC, 0x76, 0xAC, 0x77, 0xAC, 0x78, /* 0x44-0x47 */ + 0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x7B, 0xAC, 0x7C, /* 0x48-0x4B */ + 0xAC, 0x7D, 0xAC, 0x7E, 0xAC, 0x80, 0xAC, 0x81, /* 0x4C-0x4F */ + 0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x50-0x53 */ + 0xAC, 0x86, 0xE8, 0xA6, 0xAC, 0x87, 0xE8, 0xA5, /* 0x54-0x57 */ + 0xAC, 0x88, 0xE8, 0xA7, 0xBA, 0xF7, 0xE7, 0xF8, /* 0x58-0x5B */ + 0xE8, 0xA4, 0xAC, 0x89, 0xC8, 0xF0, 0xC9, 0xAA, /* 0x5C-0x5F */ + 0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x60-0x63 */ + 0xAC, 0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x64-0x67 */ + 0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x68-0x6B */ + 0xAC, 0x96, 0xE8, 0xA9, 0xAC, 0x97, 0xAC, 0x98, /* 0x6C-0x6F */ + 0xB9, 0xE5, 0xAC, 0x99, 0xAC, 0x9A, 0xAC, 0x9B, /* 0x70-0x73 */ + 0xAC, 0x9C, 0xAC, 0x9D, 0xD1, 0xFE, 0xE8, 0xA8, /* 0x74-0x77 */ + 0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, 0xAD, 0x40, /* 0x78-0x7B */ + 0xAD, 0x41, 0xAD, 0x42, 0xE8, 0xAA, 0xAD, 0x43, /* 0x7C-0x7F */ + + 0xE8, 0xAD, 0xE8, 0xAE, 0xAD, 0x44, 0xC1, 0xA7, /* 0x80-0x83 */ + 0xAD, 0x45, 0xAD, 0x46, 0xAD, 0x47, 0xE8, 0xAF, /* 0x84-0x87 */ + 0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, 0xE8, 0xB0, /* 0x88-0x8B */ + 0xAD, 0x4B, 0xAD, 0x4C, 0xE8, 0xAC, 0xAD, 0x4D, /* 0x8C-0x8F */ + 0xE8, 0xB4, 0xAD, 0x4E, 0xAD, 0x4F, 0xAD, 0x50, /* 0x90-0x93 */ + 0xAD, 0x51, 0xAD, 0x52, 0xAD, 0x53, 0xAD, 0x54, /* 0x94-0x97 */ + 0xAD, 0x55, 0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, /* 0x98-0x9B */ + 0xE8, 0xAB, 0xAD, 0x59, 0xE8, 0xB1, 0xAD, 0x5A, /* 0x9C-0x9F */ + 0xAD, 0x5B, 0xAD, 0x5C, 0xAD, 0x5D, 0xAD, 0x5E, /* 0xA0-0xA3 */ + 0xAD, 0x5F, 0xAD, 0x60, 0xAD, 0x61, 0xE8, 0xB5, /* 0xA4-0xA7 */ + 0xE8, 0xB2, 0xE8, 0xB3, 0xAD, 0x62, 0xAD, 0x63, /* 0xA8-0xAB */ + 0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0xAC-0xAF */ + 0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, 0xAD, 0x6B, /* 0xB0-0xB3 */ + 0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, 0xAD, 0x6F, /* 0xB4-0xB7 */ + 0xAD, 0x70, 0xAD, 0x71, 0xE8, 0xB7, 0xAD, 0x72, /* 0xB8-0xBB */ + 0xAD, 0x73, 0xAD, 0x74, 0xAD, 0x75, 0xAD, 0x76, /* 0xBC-0xBF */ + 0xAD, 0x77, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0xC0-0xC3 */ + 0xAD, 0x7B, 0xAD, 0x7C, 0xAD, 0x7D, 0xAD, 0x7E, /* 0xC4-0xC7 */ + 0xAD, 0x80, 0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, /* 0xC8-0xCB */ + 0xAD, 0x84, 0xAD, 0x85, 0xAD, 0x86, 0xAD, 0x87, /* 0xCC-0xCF */ + 0xAD, 0x88, 0xAD, 0x89, 0xE8, 0xB6, 0xAD, 0x8A, /* 0xD0-0xD3 */ + 0xAD, 0x8B, 0xAD, 0x8C, 0xAD, 0x8D, 0xAD, 0x8E, /* 0xD4-0xD7 */ + 0xAD, 0x8F, 0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, /* 0xD8-0xDB */ + 0xB9, 0xCF, 0xAD, 0x93, 0xF0, 0xAC, 0xAD, 0x94, /* 0xDC-0xDF */ + 0xF0, 0xAD, 0xAD, 0x95, 0xC6, 0xB0, 0xB0, 0xEA, /* 0xE0-0xE3 */ + 0xC8, 0xBF, 0xAD, 0x96, 0xCD, 0xDF, 0xAD, 0x97, /* 0xE4-0xE7 */ + 0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xE8-0xEB */ + 0xAD, 0x9C, 0xAD, 0x9D, 0xCE, 0xCD, 0xEA, 0xB1, /* 0xEC-0xEF */ + 0xAD, 0x9E, 0xAD, 0x9F, 0xAD, 0xA0, 0xAE, 0x40, /* 0xF0-0xF3 */ + 0xEA, 0xB2, 0xAE, 0x41, 0xC6, 0xBF, 0xB4, 0xC9, /* 0xF4-0xF7 */ + 0xAE, 0x42, 0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, /* 0xF8-0xFB */ + 0xAE, 0x46, 0xAE, 0x47, 0xAE, 0x48, 0xEA, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_75[512] = { + 0xAE, 0x49, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0x00-0x03 */ + 0xD5, 0xE7, 0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, /* 0x04-0x07 */ + 0xAE, 0x50, 0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, /* 0x08-0x0B */ + 0xAE, 0x54, 0xDD, 0xF9, 0xAE, 0x55, 0xEA, 0xB4, /* 0x0C-0x0F */ + 0xAE, 0x56, 0xEA, 0xB5, 0xAE, 0x57, 0xEA, 0xB6, /* 0x10-0x13 */ + 0xAE, 0x58, 0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x5B, /* 0x14-0x17 */ + 0xB8, 0xCA, 0xDF, 0xB0, 0xC9, 0xF5, 0xAE, 0x5C, /* 0x18-0x1B */ + 0xCC, 0xF0, 0xAE, 0x5D, 0xAE, 0x5E, 0xC9, 0xFA, /* 0x1C-0x1F */ + 0xAE, 0x5F, 0xAE, 0x60, 0xAE, 0x61, 0xAE, 0x62, /* 0x20-0x23 */ + 0xAE, 0x63, 0xC9, 0xFB, 0xAE, 0x64, 0xAE, 0x65, /* 0x24-0x27 */ + 0xD3, 0xC3, 0xCB, 0xA6, 0xAE, 0x66, 0xB8, 0xA6, /* 0x28-0x2B */ + 0xF0, 0xAE, 0xB1, 0xC2, 0xAE, 0x67, 0xE5, 0xB8, /* 0x2C-0x2F */ + 0xCC, 0xEF, 0xD3, 0xC9, 0xBC, 0xD7, 0xC9, 0xEA, /* 0x30-0x33 */ + 0xAE, 0x68, 0xB5, 0xE7, 0xAE, 0x69, 0xC4, 0xD0, /* 0x34-0x37 */ + 0xB5, 0xE9, 0xAE, 0x6A, 0xEE, 0xAE, 0xBB, 0xAD, /* 0x38-0x3B */ + 0xAE, 0x6B, 0xAE, 0x6C, 0xE7, 0xDE, 0xAE, 0x6D, /* 0x3C-0x3F */ + 0xEE, 0xAF, 0xAE, 0x6E, 0xAE, 0x6F, 0xAE, 0x70, /* 0x40-0x43 */ + 0xAE, 0x71, 0xB3, 0xA9, 0xAE, 0x72, 0xAE, 0x73, /* 0x44-0x47 */ + 0xEE, 0xB2, 0xAE, 0x74, 0xAE, 0x75, 0xEE, 0xB1, /* 0x48-0x4B */ + 0xBD, 0xE7, 0xAE, 0x76, 0xEE, 0xB0, 0xCE, 0xB7, /* 0x4C-0x4F */ + 0xAE, 0x77, 0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, /* 0x50-0x53 */ + 0xC5, 0xCF, 0xAE, 0x7B, 0xAE, 0x7C, 0xAE, 0x7D, /* 0x54-0x57 */ + 0xAE, 0x7E, 0xC1, 0xF4, 0xDB, 0xCE, 0xEE, 0xB3, /* 0x58-0x5B */ + 0xD0, 0xF3, 0xAE, 0x80, 0xAE, 0x81, 0xAE, 0x82, /* 0x5C-0x5F */ + 0xAE, 0x83, 0xAE, 0x84, 0xAE, 0x85, 0xAE, 0x86, /* 0x60-0x63 */ + 0xAE, 0x87, 0xC2, 0xD4, 0xC6, 0xE8, 0xAE, 0x88, /* 0x64-0x67 */ + 0xAE, 0x89, 0xAE, 0x8A, 0xB7, 0xAC, 0xAE, 0x8B, /* 0x68-0x6B */ + 0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, 0xAE, 0x8F, /* 0x6C-0x6F */ + 0xAE, 0x90, 0xAE, 0x91, 0xEE, 0xB4, 0xAE, 0x92, /* 0x70-0x73 */ + 0xB3, 0xEB, 0xAE, 0x93, 0xAE, 0x94, 0xAE, 0x95, /* 0x74-0x77 */ + 0xBB, 0xFB, 0xEE, 0xB5, 0xAE, 0x96, 0xAE, 0x97, /* 0x78-0x7B */ + 0xAE, 0x98, 0xAE, 0x99, 0xAE, 0x9A, 0xE7, 0xDC, /* 0x7C-0x7F */ + + 0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, 0xEE, 0xB6, /* 0x80-0x83 */ + 0xAE, 0x9E, 0xAE, 0x9F, 0xBD, 0xAE, 0xAE, 0xA0, /* 0x84-0x87 */ + 0xAF, 0x40, 0xAF, 0x41, 0xAF, 0x42, 0xF1, 0xE2, /* 0x88-0x8B */ + 0xAF, 0x43, 0xAF, 0x44, 0xAF, 0x45, 0xCA, 0xE8, /* 0x8C-0x8F */ + 0xAF, 0x46, 0xD2, 0xC9, 0xF0, 0xDA, 0xAF, 0x47, /* 0x90-0x93 */ + 0xF0, 0xDB, 0xAF, 0x48, 0xF0, 0xDC, 0xC1, 0xC6, /* 0x94-0x97 */ + 0xAF, 0x49, 0xB8, 0xED, 0xBE, 0xCE, 0xAF, 0x4A, /* 0x98-0x9B */ + 0xAF, 0x4B, 0xF0, 0xDE, 0xAF, 0x4C, 0xC5, 0xB1, /* 0x9C-0x9F */ + 0xF0, 0xDD, 0xD1, 0xF1, 0xAF, 0x4D, 0xF0, 0xE0, /* 0xA0-0xA3 */ + 0xB0, 0xCC, 0xBD, 0xEA, 0xAF, 0x4E, 0xAF, 0x4F, /* 0xA4-0xA7 */ + 0xAF, 0x50, 0xAF, 0x51, 0xAF, 0x52, 0xD2, 0xDF, /* 0xA8-0xAB */ + 0xF0, 0xDF, 0xAF, 0x53, 0xB4, 0xAF, 0xB7, 0xE8, /* 0xAC-0xAF */ + 0xF0, 0xE6, 0xF0, 0xE5, 0xC6, 0xA3, 0xF0, 0xE1, /* 0xB0-0xB3 */ + 0xF0, 0xE2, 0xB4, 0xC3, 0xAF, 0x54, 0xAF, 0x55, /* 0xB4-0xB7 */ + 0xF0, 0xE3, 0xD5, 0xEE, 0xAF, 0x56, 0xAF, 0x57, /* 0xB8-0xBB */ + 0xCC, 0xDB, 0xBE, 0xD2, 0xBC, 0xB2, 0xAF, 0x58, /* 0xBC-0xBF */ + 0xAF, 0x59, 0xAF, 0x5A, 0xF0, 0xE8, 0xF0, 0xE7, /* 0xC0-0xC3 */ + 0xF0, 0xE4, 0xB2, 0xA1, 0xAF, 0x5B, 0xD6, 0xA2, /* 0xC4-0xC7 */ + 0xD3, 0xB8, 0xBE, 0xB7, 0xC8, 0xAC, 0xAF, 0x5C, /* 0xC8-0xCB */ + 0xAF, 0x5D, 0xF0, 0xEA, 0xAF, 0x5E, 0xAF, 0x5F, /* 0xCC-0xCF */ + 0xAF, 0x60, 0xAF, 0x61, 0xD1, 0xF7, 0xAF, 0x62, /* 0xD0-0xD3 */ + 0xD6, 0xCC, 0xBA, 0xDB, 0xF0, 0xE9, 0xAF, 0x63, /* 0xD4-0xD7 */ + 0xB6, 0xBB, 0xAF, 0x64, 0xAF, 0x65, 0xCD, 0xB4, /* 0xD8-0xDB */ + 0xAF, 0x66, 0xAF, 0x67, 0xC6, 0xA6, 0xAF, 0x68, /* 0xDC-0xDF */ + 0xAF, 0x69, 0xAF, 0x6A, 0xC1, 0xA1, 0xF0, 0xEB, /* 0xE0-0xE3 */ + 0xF0, 0xEE, 0xAF, 0x6B, 0xF0, 0xED, 0xF0, 0xF0, /* 0xE4-0xE7 */ + 0xF0, 0xEC, 0xAF, 0x6C, 0xBB, 0xBE, 0xF0, 0xEF, /* 0xE8-0xEB */ + 0xAF, 0x6D, 0xAF, 0x6E, 0xAF, 0x6F, 0xAF, 0x70, /* 0xEC-0xEF */ + 0xCC, 0xB5, 0xF0, 0xF2, 0xAF, 0x71, 0xAF, 0x72, /* 0xF0-0xF3 */ + 0xB3, 0xD5, 0xAF, 0x73, 0xAF, 0x74, 0xAF, 0x75, /* 0xF4-0xF7 */ + 0xAF, 0x76, 0xB1, 0xD4, 0xAF, 0x77, 0xAF, 0x78, /* 0xF8-0xFB */ + 0xF0, 0xF3, 0xAF, 0x79, 0xAF, 0x7A, 0xF0, 0xF4, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0xF0, 0xF6, 0xB4, 0xE1, 0xAF, 0x7B, 0xF0, 0xF1, /* 0x00-0x03 */ + 0xAF, 0x7C, 0xF0, 0xF7, 0xAF, 0x7D, 0xAF, 0x7E, /* 0x04-0x07 */ + 0xAF, 0x80, 0xAF, 0x81, 0xF0, 0xFA, 0xAF, 0x82, /* 0x08-0x0B */ + 0xF0, 0xF8, 0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, /* 0x0C-0x0F */ + 0xF0, 0xF5, 0xAF, 0x86, 0xAF, 0x87, 0xAF, 0x88, /* 0x10-0x13 */ + 0xAF, 0x89, 0xF0, 0xFD, 0xAF, 0x8A, 0xF0, 0xF9, /* 0x14-0x17 */ + 0xF0, 0xFC, 0xF0, 0xFE, 0xAF, 0x8B, 0xF1, 0xA1, /* 0x18-0x1B */ + 0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, 0xCE, 0xC1, /* 0x1C-0x1F */ + 0xF1, 0xA4, 0xAF, 0x8F, 0xF1, 0xA3, 0xAF, 0x90, /* 0x20-0x23 */ + 0xC1, 0xF6, 0xF0, 0xFB, 0xCA, 0xDD, 0xAF, 0x91, /* 0x24-0x27 */ + 0xAF, 0x92, 0xB4, 0xF1, 0xB1, 0xF1, 0xCC, 0xB1, /* 0x28-0x2B */ + 0xAF, 0x93, 0xF1, 0xA6, 0xAF, 0x94, 0xAF, 0x95, /* 0x2C-0x2F */ + 0xF1, 0xA7, 0xAF, 0x96, 0xAF, 0x97, 0xF1, 0xAC, /* 0x30-0x33 */ + 0xD5, 0xCE, 0xF1, 0xA9, 0xAF, 0x98, 0xAF, 0x99, /* 0x34-0x37 */ + 0xC8, 0xB3, 0xAF, 0x9A, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x38-0x3B */ + 0xF1, 0xA2, 0xAF, 0x9D, 0xF1, 0xAB, 0xF1, 0xA8, /* 0x3C-0x3F */ + 0xF1, 0xA5, 0xAF, 0x9E, 0xAF, 0x9F, 0xF1, 0xAA, /* 0x40-0x43 */ + 0xAF, 0xA0, 0xB0, 0x40, 0xB0, 0x41, 0xB0, 0x42, /* 0x44-0x47 */ + 0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x48-0x4B */ + 0xB0, 0xA9, 0xF1, 0xAD, 0xB0, 0x47, 0xB0, 0x48, /* 0x4C-0x4F */ + 0xB0, 0x49, 0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, /* 0x50-0x53 */ + 0xF1, 0xAF, 0xB0, 0x4D, 0xF1, 0xB1, 0xB0, 0x4E, /* 0x54-0x57 */ + 0xB0, 0x4F, 0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, /* 0x58-0x5B */ + 0xF1, 0xB0, 0xB0, 0x53, 0xF1, 0xAE, 0xB0, 0x54, /* 0x5C-0x5F */ + 0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, 0xD1, 0xA2, /* 0x60-0x63 */ + 0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x5B, /* 0x64-0x67 */ + 0xB0, 0x5C, 0xB0, 0x5D, 0xB0, 0x5E, 0xF1, 0xB2, /* 0x68-0x6B */ + 0xB0, 0x5F, 0xB0, 0x60, 0xB0, 0x61, 0xF1, 0xB3, /* 0x6C-0x6F */ + 0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0x70-0x73 */ + 0xB0, 0x66, 0xB0, 0x67, 0xB0, 0x68, 0xB0, 0x69, /* 0x74-0x77 */ + 0xB9, 0xEF, 0xB0, 0x6A, 0xB0, 0x6B, 0xB5, 0xC7, /* 0x78-0x7B */ + 0xB0, 0x6C, 0xB0, 0xD7, 0xB0, 0xD9, 0xB0, 0x6D, /* 0x7C-0x7F */ + + 0xB0, 0x6E, 0xB0, 0x6F, 0xD4, 0xED, 0xB0, 0x70, /* 0x80-0x83 */ + 0xB5, 0xC4, 0xB0, 0x71, 0xBD, 0xD4, 0xBB, 0xCA, /* 0x84-0x87 */ + 0xF0, 0xA7, 0xB0, 0x72, 0xB0, 0x73, 0xB8, 0xDE, /* 0x88-0x8B */ + 0xB0, 0x74, 0xB0, 0x75, 0xF0, 0xA8, 0xB0, 0x76, /* 0x8C-0x8F */ + 0xB0, 0x77, 0xB0, 0xA8, 0xB0, 0x78, 0xF0, 0xA9, /* 0x90-0x93 */ + 0xB0, 0x79, 0xB0, 0x7A, 0xCD, 0xEE, 0xB0, 0x7B, /* 0x94-0x97 */ + 0xB0, 0x7C, 0xF0, 0xAA, 0xB0, 0x7D, 0xB0, 0x7E, /* 0x98-0x9B */ + 0xB0, 0x80, 0xB0, 0x81, 0xB0, 0x82, 0xB0, 0x83, /* 0x9C-0x9F */ + 0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, 0xB0, 0x87, /* 0xA0-0xA3 */ + 0xF0, 0xAB, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xA4-0xA7 */ + 0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xA8-0xAB */ + 0xB0, 0x8F, 0xB0, 0x90, 0xC6, 0xA4, 0xB0, 0x91, /* 0xAC-0xAF */ + 0xB0, 0x92, 0xD6, 0xE5, 0xF1, 0xE4, 0xB0, 0x93, /* 0xB0-0xB3 */ + 0xF1, 0xE5, 0xB0, 0x94, 0xB0, 0x95, 0xB0, 0x96, /* 0xB4-0xB7 */ + 0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, 0xB0, 0x9A, /* 0xB8-0xBB */ + 0xB0, 0x9B, 0xB0, 0x9C, 0xB0, 0x9D, 0xC3, 0xF3, /* 0xBC-0xBF */ + 0xB0, 0x9E, 0xB0, 0x9F, 0xD3, 0xDB, 0xB0, 0xA0, /* 0xC0-0xC3 */ + 0xB1, 0x40, 0xD6, 0xD1, 0xC5, 0xE8, 0xB1, 0x41, /* 0xC4-0xC7 */ + 0xD3, 0xAF, 0xB1, 0x42, 0xD2, 0xE6, 0xB1, 0x43, /* 0xC8-0xCB */ + 0xB1, 0x44, 0xEE, 0xC1, 0xB0, 0xBB, 0xD5, 0xB5, /* 0xCC-0xCF */ + 0xD1, 0xCE, 0xBC, 0xE0, 0xBA, 0xD0, 0xB1, 0x45, /* 0xD0-0xD3 */ + 0xBF, 0xF8, 0xB1, 0x46, 0xB8, 0xC7, 0xB5, 0xC1, /* 0xD4-0xD7 */ + 0xC5, 0xCC, 0xB1, 0x47, 0xB1, 0x48, 0xCA, 0xA2, /* 0xD8-0xDB */ + 0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xC3, 0xCB, /* 0xDC-0xDF */ + 0xB1, 0x4C, 0xB1, 0x4D, 0xB1, 0x4E, 0xB1, 0x4F, /* 0xE0-0xE3 */ + 0xB1, 0x50, 0xEE, 0xC2, 0xB1, 0x51, 0xB1, 0x52, /* 0xE4-0xE7 */ + 0xB1, 0x53, 0xB1, 0x54, 0xB1, 0x55, 0xB1, 0x56, /* 0xE8-0xEB */ + 0xB1, 0x57, 0xB1, 0x58, 0xC4, 0xBF, 0xB6, 0xA2, /* 0xEC-0xEF */ + 0xB1, 0x59, 0xED, 0xEC, 0xC3, 0xA4, 0xB1, 0x5A, /* 0xF0-0xF3 */ + 0xD6, 0xB1, 0xB1, 0x5B, 0xB1, 0x5C, 0xB1, 0x5D, /* 0xF4-0xF7 */ + 0xCF, 0xE0, 0xED, 0xEF, 0xB1, 0x5E, 0xB1, 0x5F, /* 0xF8-0xFB */ + 0xC5, 0xCE, 0xB1, 0x60, 0xB6, 0xDC, 0xB1, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0xB1, 0x62, 0xCA, 0xA1, 0xB1, 0x63, 0xB1, 0x64, /* 0x00-0x03 */ + 0xED, 0xED, 0xB1, 0x65, 0xB1, 0x66, 0xED, 0xF0, /* 0x04-0x07 */ + 0xED, 0xF1, 0xC3, 0xBC, 0xB1, 0x67, 0xBF, 0xB4, /* 0x08-0x0B */ + 0xB1, 0x68, 0xED, 0xEE, 0xB1, 0x69, 0xB1, 0x6A, /* 0x0C-0x0F */ + 0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x10-0x13 */ + 0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, 0xB1, 0x72, /* 0x14-0x17 */ + 0xB1, 0x73, 0xED, 0xF4, 0xED, 0xF2, 0xB1, 0x74, /* 0x18-0x1B */ + 0xB1, 0x75, 0xB1, 0x76, 0xB1, 0x77, 0xD5, 0xE6, /* 0x1C-0x1F */ + 0xC3, 0xDF, 0xB1, 0x78, 0xED, 0xF3, 0xB1, 0x79, /* 0x20-0x23 */ + 0xB1, 0x7A, 0xB1, 0x7B, 0xED, 0xF6, 0xB1, 0x7C, /* 0x24-0x27 */ + 0xD5, 0xA3, 0xD1, 0xA3, 0xB1, 0x7D, 0xB1, 0x7E, /* 0x28-0x2B */ + 0xB1, 0x80, 0xED, 0xF5, 0xB1, 0x81, 0xC3, 0xD0, /* 0x2C-0x2F */ + 0xB1, 0x82, 0xB1, 0x83, 0xB1, 0x84, 0xB1, 0x85, /* 0x30-0x33 */ + 0xB1, 0x86, 0xED, 0xF7, 0xBF, 0xF4, 0xBE, 0xEC, /* 0x34-0x37 */ + 0xED, 0xF8, 0xB1, 0x87, 0xCC, 0xF7, 0xB1, 0x88, /* 0x38-0x3B */ + 0xD1, 0xDB, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x3C-0x3F */ + 0xD7, 0xC5, 0xD5, 0xF6, 0xB1, 0x8C, 0xED, 0xFC, /* 0x40-0x43 */ + 0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, 0xED, 0xFB, /* 0x44-0x47 */ + 0xB1, 0x90, 0xB1, 0x91, 0xB1, 0x92, 0xB1, 0x93, /* 0x48-0x4B */ + 0xB1, 0x94, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x4C-0x4F */ + 0xED, 0xF9, 0xED, 0xFA, 0xB1, 0x98, 0xB1, 0x99, /* 0x50-0x53 */ + 0xB1, 0x9A, 0xB1, 0x9B, 0xB1, 0x9C, 0xB1, 0x9D, /* 0x54-0x57 */ + 0xB1, 0x9E, 0xB1, 0x9F, 0xED, 0xFD, 0xBE, 0xA6, /* 0x58-0x5B */ + 0xB1, 0xA0, 0xB2, 0x40, 0xB2, 0x41, 0xB2, 0x42, /* 0x5C-0x5F */ + 0xB2, 0x43, 0xCB, 0xAF, 0xEE, 0xA1, 0xB6, 0xBD, /* 0x60-0x63 */ + 0xB2, 0x44, 0xEE, 0xA2, 0xC4, 0xC0, 0xB2, 0x45, /* 0x64-0x67 */ + 0xED, 0xFE, 0xB2, 0x46, 0xB2, 0x47, 0xBD, 0xDE, /* 0x68-0x6B */ + 0xB2, 0xC7, 0xB2, 0x48, 0xB2, 0x49, 0xB2, 0x4A, /* 0x6C-0x6F */ + 0xB2, 0x4B, 0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, /* 0x70-0x73 */ + 0xB2, 0x4F, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x74-0x77 */ + 0xB2, 0x53, 0xB6, 0xC3, 0xB2, 0x54, 0xB2, 0x55, /* 0x78-0x7B */ + 0xB2, 0x56, 0xEE, 0xA5, 0xD8, 0xBA, 0xEE, 0xA3, /* 0x7C-0x7F */ + + 0xEE, 0xA6, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x80-0x83 */ + 0xC3, 0xE9, 0xB3, 0xF2, 0xB2, 0x5A, 0xB2, 0x5B, /* 0x84-0x87 */ + 0xB2, 0x5C, 0xB2, 0x5D, 0xB2, 0x5E, 0xB2, 0x5F, /* 0x88-0x8B */ + 0xEE, 0xA7, 0xEE, 0xA4, 0xCF, 0xB9, 0xB2, 0x60, /* 0x8C-0x8F */ + 0xB2, 0x61, 0xEE, 0xA8, 0xC2, 0xF7, 0xB2, 0x62, /* 0x90-0x93 */ + 0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x94-0x97 */ + 0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x98-0x9B */ + 0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xEE, 0xA9, /* 0x9C-0x9F */ + 0xEE, 0xAA, 0xB2, 0x6E, 0xDE, 0xAB, 0xB2, 0x6F, /* 0xA0-0xA3 */ + 0xB2, 0x70, 0xC6, 0xB3, 0xB2, 0x71, 0xC7, 0xC6, /* 0xA4-0xA7 */ + 0xB2, 0x72, 0xD6, 0xF5, 0xB5, 0xC9, 0xB2, 0x73, /* 0xA8-0xAB */ + 0xCB, 0xB2, 0xB2, 0x74, 0xB2, 0x75, 0xB2, 0x76, /* 0xAC-0xAF */ + 0xEE, 0xAB, 0xB2, 0x77, 0xB2, 0x78, 0xCD, 0xAB, /* 0xB0-0xB3 */ + 0xB2, 0x79, 0xEE, 0xAC, 0xB2, 0x7A, 0xB2, 0x7B, /* 0xB4-0xB7 */ + 0xB2, 0x7C, 0xB2, 0x7D, 0xB2, 0x7E, 0xD5, 0xB0, /* 0xB8-0xBB */ + 0xB2, 0x80, 0xEE, 0xAD, 0xB2, 0x81, 0xF6, 0xC4, /* 0xBC-0xBF */ + 0xB2, 0x82, 0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, /* 0xC0-0xC3 */ + 0xB2, 0x86, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xC4-0xC7 */ + 0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xC8-0xCB */ + 0xB2, 0x8E, 0xDB, 0xC7, 0xB2, 0x8F, 0xB2, 0x90, /* 0xCC-0xCF */ + 0xB2, 0x91, 0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, /* 0xD0-0xD3 */ + 0xB2, 0x95, 0xB2, 0x96, 0xB2, 0x97, 0xB4, 0xA3, /* 0xD4-0xD7 */ + 0xB2, 0x98, 0xB2, 0x99, 0xB2, 0x9A, 0xC3, 0xAC, /* 0xD8-0xDB */ + 0xF1, 0xE6, 0xB2, 0x9B, 0xB2, 0x9C, 0xB2, 0x9D, /* 0xDC-0xDF */ + 0xB2, 0x9E, 0xB2, 0x9F, 0xCA, 0xB8, 0xD2, 0xD3, /* 0xE0-0xE3 */ + 0xB2, 0xA0, 0xD6, 0xAA, 0xB3, 0x40, 0xEF, 0xF2, /* 0xE4-0xE7 */ + 0xB3, 0x41, 0xBE, 0xD8, 0xB3, 0x42, 0xBD, 0xC3, /* 0xE8-0xEB */ + 0xEF, 0xF3, 0xB6, 0xCC, 0xB0, 0xAB, 0xB3, 0x43, /* 0xEC-0xEF */ + 0xB3, 0x44, 0xB3, 0x45, 0xB3, 0x46, 0xCA, 0xAF, /* 0xF0-0xF3 */ + 0xB3, 0x47, 0xB3, 0x48, 0xED, 0xB6, 0xB3, 0x49, /* 0xF4-0xF7 */ + 0xED, 0xB7, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xF8-0xFB */ + 0xB3, 0x4D, 0xCE, 0xF9, 0xB7, 0xAF, 0xBF, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0xED, 0xB8, 0xC2, 0xEB, 0xC9, 0xB0, 0xB3, 0x4E, /* 0x00-0x03 */ + 0xB3, 0x4F, 0xB3, 0x50, 0xB3, 0x51, 0xB3, 0x52, /* 0x04-0x07 */ + 0xB3, 0x53, 0xED, 0xB9, 0xB3, 0x54, 0xB3, 0x55, /* 0x08-0x0B */ + 0xC6, 0xF6, 0xBF, 0xB3, 0xB3, 0x56, 0xB3, 0x57, /* 0x0C-0x0F */ + 0xB3, 0x58, 0xED, 0xBC, 0xC5, 0xF8, 0xB3, 0x59, /* 0x10-0x13 */ + 0xD1, 0xD0, 0xB3, 0x5A, 0xD7, 0xA9, 0xED, 0xBA, /* 0x14-0x17 */ + 0xED, 0xBB, 0xB3, 0x5B, 0xD1, 0xE2, 0xB3, 0x5C, /* 0x18-0x1B */ + 0xED, 0xBF, 0xED, 0xC0, 0xB3, 0x5D, 0xED, 0xC4, /* 0x1C-0x1F */ + 0xB3, 0x5E, 0xB3, 0x5F, 0xB3, 0x60, 0xED, 0xC8, /* 0x20-0x23 */ + 0xB3, 0x61, 0xED, 0xC6, 0xED, 0xCE, 0xD5, 0xE8, /* 0x24-0x27 */ + 0xB3, 0x62, 0xED, 0xC9, 0xB3, 0x63, 0xB3, 0x64, /* 0x28-0x2B */ + 0xED, 0xC7, 0xED, 0xBE, 0xB3, 0x65, 0xB3, 0x66, /* 0x2C-0x2F */ + 0xC5, 0xE9, 0xB3, 0x67, 0xB3, 0x68, 0xB3, 0x69, /* 0x30-0x33 */ + 0xC6, 0xC6, 0xB3, 0x6A, 0xB3, 0x6B, 0xC9, 0xE9, /* 0x34-0x37 */ + 0xD4, 0xD2, 0xED, 0xC1, 0xED, 0xC2, 0xED, 0xC3, /* 0x38-0x3B */ + 0xED, 0xC5, 0xB3, 0x6C, 0xC0, 0xF9, 0xB3, 0x6D, /* 0x3C-0x3F */ + 0xB4, 0xA1, 0xB3, 0x6E, 0xB3, 0x6F, 0xB3, 0x70, /* 0x40-0x43 */ + 0xB3, 0x71, 0xB9, 0xE8, 0xB3, 0x72, 0xED, 0xD0, /* 0x44-0x47 */ + 0xB3, 0x73, 0xB3, 0x74, 0xB3, 0x75, 0xB3, 0x76, /* 0x48-0x4B */ + 0xED, 0xD1, 0xB3, 0x77, 0xED, 0xCA, 0xB3, 0x78, /* 0x4C-0x4F */ + 0xED, 0xCF, 0xB3, 0x79, 0xCE, 0xF8, 0xB3, 0x7A, /* 0x50-0x53 */ + 0xB3, 0x7B, 0xCB, 0xB6, 0xED, 0xCC, 0xED, 0xCD, /* 0x54-0x57 */ + 0xB3, 0x7C, 0xB3, 0x7D, 0xB3, 0x7E, 0xB3, 0x80, /* 0x58-0x5B */ + 0xB3, 0x81, 0xCF, 0xF5, 0xB3, 0x82, 0xB3, 0x83, /* 0x5C-0x5F */ + 0xB3, 0x84, 0xB3, 0x85, 0xB3, 0x86, 0xB3, 0x87, /* 0x60-0x63 */ + 0xB3, 0x88, 0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, /* 0x64-0x67 */ + 0xB3, 0x8C, 0xB3, 0x8D, 0xED, 0xD2, 0xC1, 0xF2, /* 0x68-0x6B */ + 0xD3, 0xB2, 0xED, 0xCB, 0xC8, 0xB7, 0xB3, 0x8E, /* 0x6C-0x6F */ + 0xB3, 0x8F, 0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, /* 0x70-0x73 */ + 0xB3, 0x93, 0xB3, 0x94, 0xB3, 0x95, 0xBC, 0xEF, /* 0x74-0x77 */ + 0xB3, 0x96, 0xB3, 0x97, 0xB3, 0x98, 0xB3, 0x99, /* 0x78-0x7B */ + 0xC5, 0xF0, 0xB3, 0x9A, 0xB3, 0x9B, 0xB3, 0x9C, /* 0x7C-0x7F */ + + 0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, 0xB3, 0xA0, /* 0x80-0x83 */ + 0xB4, 0x40, 0xB4, 0x41, 0xB4, 0x42, 0xED, 0xD6, /* 0x84-0x87 */ + 0xB4, 0x43, 0xB5, 0xEF, 0xB4, 0x44, 0xB4, 0x45, /* 0x88-0x8B */ + 0xC2, 0xB5, 0xB0, 0xAD, 0xCB, 0xE9, 0xB4, 0x46, /* 0x8C-0x8F */ + 0xB4, 0x47, 0xB1, 0xAE, 0xB4, 0x48, 0xED, 0xD4, /* 0x90-0x93 */ + 0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, 0xCD, 0xEB, /* 0x94-0x97 */ + 0xB5, 0xE2, 0xB4, 0x4C, 0xED, 0xD5, 0xED, 0xD3, /* 0x98-0x9B */ + 0xED, 0xD7, 0xB4, 0x4D, 0xB4, 0x4E, 0xB5, 0xFA, /* 0x9C-0x9F */ + 0xB4, 0x4F, 0xED, 0xD8, 0xB4, 0x50, 0xED, 0xD9, /* 0xA0-0xA3 */ + 0xB4, 0x51, 0xED, 0xDC, 0xB4, 0x52, 0xB1, 0xCC, /* 0xA4-0xA7 */ + 0xB4, 0x53, 0xB4, 0x54, 0xB4, 0x55, 0xB4, 0x56, /* 0xA8-0xAB */ + 0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0xAC-0xAF */ + 0xC5, 0xF6, 0xBC, 0xEE, 0xED, 0xDA, 0xCC, 0xBC, /* 0xB0-0xB3 */ + 0xB2, 0xEA, 0xB4, 0x5B, 0xB4, 0x5C, 0xB4, 0x5D, /* 0xB4-0xB7 */ + 0xB4, 0x5E, 0xED, 0xDB, 0xB4, 0x5F, 0xB4, 0x60, /* 0xB8-0xBB */ + 0xB4, 0x61, 0xB4, 0x62, 0xC4, 0xEB, 0xB4, 0x63, /* 0xBC-0xBF */ + 0xB4, 0x64, 0xB4, 0xC5, 0xB4, 0x65, 0xB4, 0x66, /* 0xC0-0xC3 */ + 0xB4, 0x67, 0xB0, 0xF5, 0xB4, 0x68, 0xB4, 0x69, /* 0xC4-0xC7 */ + 0xB4, 0x6A, 0xED, 0xDF, 0xC0, 0xDA, 0xB4, 0xE8, /* 0xC8-0xCB */ + 0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, 0xB4, 0x6E, /* 0xCC-0xCF */ + 0xC5, 0xCD, 0xB4, 0x6F, 0xB4, 0x70, 0xB4, 0x71, /* 0xD0-0xD3 */ + 0xED, 0xDD, 0xBF, 0xC4, 0xB4, 0x72, 0xB4, 0x73, /* 0xD4-0xD7 */ + 0xB4, 0x74, 0xED, 0xDE, 0xB4, 0x75, 0xB4, 0x76, /* 0xD8-0xDB */ + 0xB4, 0x77, 0xB4, 0x78, 0xB4, 0x79, 0xB4, 0x7A, /* 0xDC-0xDF */ + 0xB4, 0x7B, 0xB4, 0x7C, 0xB4, 0x7D, 0xB4, 0x7E, /* 0xE0-0xE3 */ + 0xB4, 0x80, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0xE4-0xE7 */ + 0xC4, 0xA5, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0xE8-0xEB */ + 0xED, 0xE0, 0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, /* 0xEC-0xEF */ + 0xB4, 0x8A, 0xB4, 0x8B, 0xED, 0xE1, 0xB4, 0x8C, /* 0xF0-0xF3 */ + 0xED, 0xE3, 0xB4, 0x8D, 0xB4, 0x8E, 0xC1, 0xD7, /* 0xF4-0xF7 */ + 0xB4, 0x8F, 0xB4, 0x90, 0xBB, 0xC7, 0xB4, 0x91, /* 0xF8-0xFB */ + 0xB4, 0x92, 0xB4, 0x93, 0xB4, 0x94, 0xB4, 0x95, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0xB4, 0x96, 0xBD, 0xB8, 0xB4, 0x97, 0xB4, 0x98, /* 0x00-0x03 */ + 0xB4, 0x99, 0xED, 0xE2, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x04-0x07 */ + 0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x08-0x0B */ + 0xB4, 0xA0, 0xB5, 0x40, 0xB5, 0x41, 0xB5, 0x42, /* 0x0C-0x0F */ + 0xB5, 0x43, 0xB5, 0x44, 0xB5, 0x45, 0xED, 0xE4, /* 0x10-0x13 */ + 0xB5, 0x46, 0xB5, 0x47, 0xB5, 0x48, 0xB5, 0x49, /* 0x14-0x17 */ + 0xB5, 0x4A, 0xB5, 0x4B, 0xB5, 0x4C, 0xB5, 0x4D, /* 0x18-0x1B */ + 0xB5, 0x4E, 0xB5, 0x4F, 0xED, 0xE6, 0xB5, 0x50, /* 0x1C-0x1F */ + 0xB5, 0x51, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0x20-0x23 */ + 0xED, 0xE5, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0x24-0x27 */ + 0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x5B, /* 0x28-0x2B */ + 0xB5, 0x5C, 0xB5, 0x5D, 0xB5, 0x5E, 0xB5, 0x5F, /* 0x2C-0x2F */ + 0xB5, 0x60, 0xB5, 0x61, 0xB5, 0x62, 0xB5, 0x63, /* 0x30-0x33 */ + 0xED, 0xE7, 0xB5, 0x64, 0xB5, 0x65, 0xB5, 0x66, /* 0x34-0x37 */ + 0xB5, 0x67, 0xB5, 0x68, 0xCA, 0xBE, 0xEC, 0xEA, /* 0x38-0x3B */ + 0xC0, 0xF1, 0xB5, 0x69, 0xC9, 0xE7, 0xB5, 0x6A, /* 0x3C-0x3F */ + 0xEC, 0xEB, 0xC6, 0xEE, 0xB5, 0x6B, 0xB5, 0x6C, /* 0x40-0x43 */ + 0xB5, 0x6D, 0xB5, 0x6E, 0xEC, 0xEC, 0xB5, 0x6F, /* 0x44-0x47 */ + 0xC6, 0xED, 0xEC, 0xED, 0xB5, 0x70, 0xB5, 0x71, /* 0x48-0x4B */ + 0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, 0xB5, 0x75, /* 0x4C-0x4F */ + 0xB5, 0x76, 0xB5, 0x77, 0xB5, 0x78, 0xEC, 0xF0, /* 0x50-0x53 */ + 0xB5, 0x79, 0xB5, 0x7A, 0xD7, 0xE6, 0xEC, 0xF3, /* 0x54-0x57 */ + 0xB5, 0x7B, 0xB5, 0x7C, 0xEC, 0xF1, 0xEC, 0xEE, /* 0x58-0x5B */ + 0xEC, 0xEF, 0xD7, 0xA3, 0xC9, 0xF1, 0xCB, 0xEE, /* 0x5C-0x5F */ + 0xEC, 0xF4, 0xB5, 0x7D, 0xEC, 0xF2, 0xB5, 0x7E, /* 0x60-0x63 */ + 0xB5, 0x80, 0xCF, 0xE9, 0xB5, 0x81, 0xEC, 0xF6, /* 0x64-0x67 */ + 0xC6, 0xB1, 0xB5, 0x82, 0xB5, 0x83, 0xB5, 0x84, /* 0x68-0x6B */ + 0xB5, 0x85, 0xBC, 0xC0, 0xB5, 0x86, 0xEC, 0xF5, /* 0x6C-0x6F */ + 0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, 0xB5, 0x8A, /* 0x70-0x73 */ + 0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, 0xB5, 0xBB, /* 0x74-0x77 */ + 0xBB, 0xF6, 0xB5, 0x8E, 0xEC, 0xF7, 0xB5, 0x8F, /* 0x78-0x7B */ + 0xB5, 0x90, 0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, /* 0x7C-0x7F */ + + 0xD9, 0xF7, 0xBD, 0xFB, 0xB5, 0x94, 0xB5, 0x95, /* 0x80-0x83 */ + 0xC2, 0xBB, 0xEC, 0xF8, 0xB5, 0x96, 0xB5, 0x97, /* 0x84-0x87 */ + 0xB5, 0x98, 0xB5, 0x99, 0xEC, 0xF9, 0xB5, 0x9A, /* 0x88-0x8B */ + 0xB5, 0x9B, 0xB5, 0x9C, 0xB5, 0x9D, 0xB8, 0xA3, /* 0x8C-0x8F */ + 0xB5, 0x9E, 0xB5, 0x9F, 0xB5, 0xA0, 0xB6, 0x40, /* 0x90-0x93 */ + 0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, 0xB6, 0x44, /* 0x94-0x97 */ + 0xB6, 0x45, 0xB6, 0x46, 0xEC, 0xFA, 0xB6, 0x47, /* 0x98-0x9B */ + 0xB6, 0x48, 0xB6, 0x49, 0xB6, 0x4A, 0xB6, 0x4B, /* 0x9C-0x9F */ + 0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, 0xB6, 0x4F, /* 0xA0-0xA3 */ + 0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, 0xEC, 0xFB, /* 0xA4-0xA7 */ + 0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0xA8-0xAB */ + 0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0xAC-0xAF */ + 0xB6, 0x5B, 0xB6, 0x5C, 0xB6, 0x5D, 0xEC, 0xFC, /* 0xB0-0xB3 */ + 0xB6, 0x5E, 0xB6, 0x5F, 0xB6, 0x60, 0xB6, 0x61, /* 0xB4-0xB7 */ + 0xB6, 0x62, 0xD3, 0xED, 0xD8, 0xAE, 0xC0, 0xEB, /* 0xB8-0xBB */ + 0xB6, 0x63, 0xC7, 0xDD, 0xBA, 0xCC, 0xB6, 0x64, /* 0xBC-0xBF */ + 0xD0, 0xE3, 0xCB, 0xBD, 0xB6, 0x65, 0xCD, 0xBA, /* 0xC0-0xC3 */ + 0xB6, 0x66, 0xB6, 0x67, 0xB8, 0xD1, 0xB6, 0x68, /* 0xC4-0xC7 */ + 0xB6, 0x69, 0xB1, 0xFC, 0xB6, 0x6A, 0xC7, 0xEF, /* 0xC8-0xCB */ + 0xB6, 0x6B, 0xD6, 0xD6, 0xB6, 0x6C, 0xB6, 0x6D, /* 0xCC-0xCF */ + 0xB6, 0x6E, 0xBF, 0xC6, 0xC3, 0xEB, 0xB6, 0x6F, /* 0xD0-0xD3 */ + 0xB6, 0x70, 0xEF, 0xF5, 0xB6, 0x71, 0xB6, 0x72, /* 0xD4-0xD7 */ + 0xC3, 0xD8, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0xD8-0xDB */ + 0xB6, 0x76, 0xB6, 0x77, 0xB6, 0x78, 0xD7, 0xE2, /* 0xDC-0xDF */ + 0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x7B, 0xEF, 0xF7, /* 0xE0-0xE3 */ + 0xB3, 0xD3, 0xB6, 0x7C, 0xC7, 0xD8, 0xD1, 0xED, /* 0xE4-0xE7 */ + 0xB6, 0x7D, 0xD6, 0xC8, 0xB6, 0x7E, 0xEF, 0xF8, /* 0xE8-0xEB */ + 0xB6, 0x80, 0xEF, 0xF6, 0xB6, 0x81, 0xBB, 0xFD, /* 0xEC-0xEF */ + 0xB3, 0xC6, 0xB6, 0x82, 0xB6, 0x83, 0xB6, 0x84, /* 0xF0-0xF3 */ + 0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0xF4-0xF7 */ + 0xBD, 0xD5, 0xB6, 0x89, 0xB6, 0x8A, 0xD2, 0xC6, /* 0xF8-0xFB */ + 0xB6, 0x8B, 0xBB, 0xE0, 0xB6, 0x8C, 0xB6, 0x8D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7A[512] = { + 0xCF, 0xA1, 0xB6, 0x8E, 0xEF, 0xFC, 0xEF, 0xFB, /* 0x00-0x03 */ + 0xB6, 0x8F, 0xB6, 0x90, 0xEF, 0xF9, 0xB6, 0x91, /* 0x04-0x07 */ + 0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, 0xB3, 0xCC, /* 0x08-0x0B */ + 0xB6, 0x95, 0xC9, 0xD4, 0xCB, 0xB0, 0xB6, 0x96, /* 0x0C-0x0F */ + 0xB6, 0x97, 0xB6, 0x98, 0xB6, 0x99, 0xB6, 0x9A, /* 0x10-0x13 */ + 0xEF, 0xFE, 0xB6, 0x9B, 0xB6, 0x9C, 0xB0, 0xDE, /* 0x14-0x17 */ + 0xB6, 0x9D, 0xB6, 0x9E, 0xD6, 0xC9, 0xB6, 0x9F, /* 0x18-0x1B */ + 0xB6, 0xA0, 0xB7, 0x40, 0xEF, 0xFD, 0xB7, 0x41, /* 0x1C-0x1F */ + 0xB3, 0xED, 0xB7, 0x42, 0xB7, 0x43, 0xF6, 0xD5, /* 0x20-0x23 */ + 0xB7, 0x44, 0xB7, 0x45, 0xB7, 0x46, 0xB7, 0x47, /* 0x24-0x27 */ + 0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, 0xB7, 0x4B, /* 0x28-0x2B */ + 0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, 0xB7, 0x4F, /* 0x2C-0x2F */ + 0xB7, 0x50, 0xB7, 0x51, 0xB7, 0x52, 0xCE, 0xC8, /* 0x30-0x33 */ + 0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, 0xF0, 0xA2, /* 0x34-0x37 */ + 0xB7, 0x56, 0xF0, 0xA1, 0xB7, 0x57, 0xB5, 0xBE, /* 0x38-0x3B */ + 0xBC, 0xDA, 0xBB, 0xFC, 0xB7, 0x58, 0xB8, 0xE5, /* 0x3C-0x3F */ + 0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x5B, 0xB7, 0x5C, /* 0x40-0x43 */ + 0xB7, 0x5D, 0xB7, 0x5E, 0xC4, 0xC2, 0xB7, 0x5F, /* 0x44-0x47 */ + 0xB7, 0x60, 0xB7, 0x61, 0xB7, 0x62, 0xB7, 0x63, /* 0x48-0x4B */ + 0xB7, 0x64, 0xB7, 0x65, 0xB7, 0x66, 0xB7, 0x67, /* 0x4C-0x4F */ + 0xB7, 0x68, 0xF0, 0xA3, 0xB7, 0x69, 0xB7, 0x6A, /* 0x50-0x53 */ + 0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, 0xCB, 0xEB, /* 0x54-0x57 */ + 0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x58-0x5B */ + 0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x5C-0x5F */ + 0xB7, 0x76, 0xB7, 0x77, 0xB7, 0x78, 0xB7, 0x79, /* 0x60-0x63 */ + 0xB7, 0x7A, 0xB7, 0x7B, 0xB7, 0x7C, 0xB7, 0x7D, /* 0x64-0x67 */ + 0xB7, 0x7E, 0xB7, 0x80, 0xB7, 0x81, 0xB7, 0x82, /* 0x68-0x6B */ + 0xB7, 0x83, 0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, /* 0x6C-0x6F */ + 0xF0, 0xA6, 0xB7, 0x87, 0xB7, 0x88, 0xB7, 0x89, /* 0x70-0x73 */ + 0xD1, 0xA8, 0xB7, 0x8A, 0xBE, 0xBF, 0xC7, 0xEE, /* 0x74-0x77 */ + 0xF1, 0xB6, 0xF1, 0xB7, 0xBF, 0xD5, 0xB7, 0x8B, /* 0x78-0x7B */ + 0xB7, 0x8C, 0xB7, 0x8D, 0xB7, 0x8E, 0xB4, 0xA9, /* 0x7C-0x7F */ + + 0xF1, 0xB8, 0xCD, 0xBB, 0xB7, 0x8F, 0xC7, 0xD4, /* 0x80-0x83 */ + 0xD5, 0xAD, 0xB7, 0x90, 0xF1, 0xB9, 0xB7, 0x91, /* 0x84-0x87 */ + 0xF1, 0xBA, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0x88-0x8B */ + 0xB7, 0x95, 0xC7, 0xCF, 0xB7, 0x96, 0xB7, 0x97, /* 0x8C-0x8F */ + 0xB7, 0x98, 0xD2, 0xA4, 0xD6, 0xCF, 0xB7, 0x99, /* 0x90-0x93 */ + 0xB7, 0x9A, 0xF1, 0xBB, 0xBD, 0xD1, 0xB4, 0xB0, /* 0x94-0x97 */ + 0xBE, 0xBD, 0xB7, 0x9B, 0xB7, 0x9C, 0xB7, 0x9D, /* 0x98-0x9B */ + 0xB4, 0xDC, 0xCE, 0xD1, 0xB7, 0x9E, 0xBF, 0xDF, /* 0x9C-0x9F */ + 0xF1, 0xBD, 0xB7, 0x9F, 0xB7, 0xA0, 0xB8, 0x40, /* 0xA0-0xA3 */ + 0xB8, 0x41, 0xBF, 0xFA, 0xF1, 0xBC, 0xB8, 0x42, /* 0xA4-0xA7 */ + 0xF1, 0xBF, 0xB8, 0x43, 0xB8, 0x44, 0xB8, 0x45, /* 0xA8-0xAB */ + 0xF1, 0xBE, 0xF1, 0xC0, 0xB8, 0x46, 0xB8, 0x47, /* 0xAC-0xAF */ + 0xB8, 0x48, 0xB8, 0x49, 0xB8, 0x4A, 0xF1, 0xC1, /* 0xB0-0xB3 */ + 0xB8, 0x4B, 0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, /* 0xB4-0xB7 */ + 0xB8, 0x4F, 0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, /* 0xB8-0xBB */ + 0xB8, 0x53, 0xB8, 0x54, 0xB8, 0x55, 0xC1, 0xFE, /* 0xBC-0xBF */ + 0xB8, 0x56, 0xB8, 0x57, 0xB8, 0x58, 0xB8, 0x59, /* 0xC0-0xC3 */ + 0xB8, 0x5A, 0xB8, 0x5B, 0xB8, 0x5C, 0xB8, 0x5D, /* 0xC4-0xC7 */ + 0xB8, 0x5E, 0xB8, 0x5F, 0xB8, 0x60, 0xC1, 0xA2, /* 0xC8-0xCB */ + 0xB8, 0x61, 0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, /* 0xCC-0xCF */ + 0xB8, 0x65, 0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, /* 0xD0-0xD3 */ + 0xB8, 0x69, 0xB8, 0x6A, 0xCA, 0xFA, 0xB8, 0x6B, /* 0xD4-0xD7 */ + 0xB8, 0x6C, 0xD5, 0xBE, 0xB8, 0x6D, 0xB8, 0x6E, /* 0xD8-0xDB */ + 0xB8, 0x6F, 0xB8, 0x70, 0xBE, 0xBA, 0xBE, 0xB9, /* 0xDC-0xDF */ + 0xD5, 0xC2, 0xB8, 0x71, 0xB8, 0x72, 0xBF, 0xA2, /* 0xE0-0xE3 */ + 0xB8, 0x73, 0xCD, 0xAF, 0xF1, 0xB5, 0xB8, 0x74, /* 0xE4-0xE7 */ + 0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, 0xB8, 0x78, /* 0xE8-0xEB */ + 0xB8, 0x79, 0xBD, 0xDF, 0xB8, 0x7A, 0xB6, 0xCB, /* 0xEC-0xEF */ + 0xB8, 0x7B, 0xB8, 0x7C, 0xB8, 0x7D, 0xB8, 0x7E, /* 0xF0-0xF3 */ + 0xB8, 0x80, 0xB8, 0x81, 0xB8, 0x82, 0xB8, 0x83, /* 0xF4-0xF7 */ + 0xB8, 0x84, 0xD6, 0xF1, 0xF3, 0xC3, 0xB8, 0x85, /* 0xF8-0xFB */ + 0xB8, 0x86, 0xF3, 0xC4, 0xB8, 0x87, 0xB8, 0xCD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, 0xF3, 0xC6, /* 0x00-0x03 */ + 0xF3, 0xC7, 0xB8, 0x8B, 0xB0, 0xCA, 0xB8, 0x8C, /* 0x04-0x07 */ + 0xF3, 0xC5, 0xB8, 0x8D, 0xF3, 0xC9, 0xCB, 0xF1, /* 0x08-0x0B */ + 0xB8, 0x8E, 0xB8, 0x8F, 0xB8, 0x90, 0xF3, 0xCB, /* 0x0C-0x0F */ + 0xB8, 0x91, 0xD0, 0xA6, 0xB8, 0x92, 0xB8, 0x93, /* 0x10-0x13 */ + 0xB1, 0xCA, 0xF3, 0xC8, 0xB8, 0x94, 0xB8, 0x95, /* 0x14-0x17 */ + 0xB8, 0x96, 0xF3, 0xCF, 0xB8, 0x97, 0xB5, 0xD1, /* 0x18-0x1B */ + 0xB8, 0x98, 0xB8, 0x99, 0xF3, 0xD7, 0xB8, 0x9A, /* 0x1C-0x1F */ + 0xF3, 0xD2, 0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, /* 0x20-0x23 */ + 0xF3, 0xD4, 0xF3, 0xD3, 0xB7, 0xFB, 0xB8, 0x9E, /* 0x24-0x27 */ + 0xB1, 0xBF, 0xB8, 0x9F, 0xF3, 0xCE, 0xF3, 0xCA, /* 0x28-0x2B */ + 0xB5, 0xDA, 0xB8, 0xA0, 0xF3, 0xD0, 0xB9, 0x40, /* 0x2C-0x2F */ + 0xB9, 0x41, 0xF3, 0xD1, 0xB9, 0x42, 0xF3, 0xD5, /* 0x30-0x33 */ + 0xB9, 0x43, 0xB9, 0x44, 0xB9, 0x45, 0xB9, 0x46, /* 0x34-0x37 */ + 0xF3, 0xCD, 0xB9, 0x47, 0xBC, 0xE3, 0xB9, 0x48, /* 0x38-0x3B */ + 0xC1, 0xFD, 0xB9, 0x49, 0xF3, 0xD6, 0xB9, 0x4A, /* 0x3C-0x3F */ + 0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x40-0x43 */ + 0xB9, 0x4F, 0xF3, 0xDA, 0xB9, 0x50, 0xF3, 0xCC, /* 0x44-0x47 */ + 0xB9, 0x51, 0xB5, 0xC8, 0xB9, 0x52, 0xBD, 0xEE, /* 0x48-0x4B */ + 0xF3, 0xDC, 0xB9, 0x53, 0xB9, 0x54, 0xB7, 0xA4, /* 0x4C-0x4F */ + 0xBF, 0xF0, 0xD6, 0xFE, 0xCD, 0xB2, 0xB9, 0x55, /* 0x50-0x53 */ + 0xB4, 0xF0, 0xB9, 0x56, 0xB2, 0xDF, 0xB9, 0x57, /* 0x54-0x57 */ + 0xF3, 0xD8, 0xB9, 0x58, 0xF3, 0xD9, 0xC9, 0xB8, /* 0x58-0x5B */ + 0xB9, 0x59, 0xF3, 0xDD, 0xB9, 0x5A, 0xB9, 0x5B, /* 0x5C-0x5F */ + 0xF3, 0xDE, 0xB9, 0x5C, 0xF3, 0xE1, 0xB9, 0x5D, /* 0x60-0x63 */ + 0xB9, 0x5E, 0xB9, 0x5F, 0xB9, 0x60, 0xB9, 0x61, /* 0x64-0x67 */ + 0xB9, 0x62, 0xB9, 0x63, 0xB9, 0x64, 0xB9, 0x65, /* 0x68-0x6B */ + 0xB9, 0x66, 0xB9, 0x67, 0xF3, 0xDF, 0xB9, 0x68, /* 0x6C-0x6F */ + 0xB9, 0x69, 0xF3, 0xE3, 0xF3, 0xE2, 0xB9, 0x6A, /* 0x70-0x73 */ + 0xB9, 0x6B, 0xF3, 0xDB, 0xB9, 0x6C, 0xBF, 0xEA, /* 0x74-0x77 */ + 0xB9, 0x6D, 0xB3, 0xEF, 0xB9, 0x6E, 0xF3, 0xE0, /* 0x78-0x7B */ + 0xB9, 0x6F, 0xB9, 0x70, 0xC7, 0xA9, 0xB9, 0x71, /* 0x7C-0x7F */ + + 0xBC, 0xF2, 0xB9, 0x72, 0xB9, 0x73, 0xB9, 0x74, /* 0x80-0x83 */ + 0xB9, 0x75, 0xF3, 0xEB, 0xB9, 0x76, 0xB9, 0x77, /* 0x84-0x87 */ + 0xB9, 0x78, 0xB9, 0x79, 0xB9, 0x7A, 0xB9, 0x7B, /* 0x88-0x8B */ + 0xB9, 0x7C, 0xB9, 0xBF, 0xB9, 0x7D, 0xB9, 0x7E, /* 0x8C-0x8F */ + 0xF3, 0xE4, 0xB9, 0x80, 0xB9, 0x81, 0xB9, 0x82, /* 0x90-0x93 */ + 0xB2, 0xAD, 0xBB, 0xFE, 0xB9, 0x83, 0xCB, 0xE3, /* 0x94-0x97 */ + 0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x98-0x9B */ + 0xF3, 0xED, 0xF3, 0xE9, 0xB9, 0x88, 0xB9, 0x89, /* 0x9C-0x9F */ + 0xB9, 0x8A, 0xB9, 0xDC, 0xF3, 0xEE, 0xB9, 0x8B, /* 0xA0-0xA3 */ + 0xB9, 0x8C, 0xB9, 0x8D, 0xF3, 0xE5, 0xF3, 0xE6, /* 0xA4-0xA7 */ + 0xF3, 0xEA, 0xC2, 0xE1, 0xF3, 0xEC, 0xF3, 0xEF, /* 0xA8-0xAB */ + 0xF3, 0xE8, 0xBC, 0xFD, 0xB9, 0x8E, 0xB9, 0x8F, /* 0xAC-0xAF */ + 0xB9, 0x90, 0xCF, 0xE4, 0xB9, 0x91, 0xB9, 0x92, /* 0xB0-0xB3 */ + 0xF3, 0xF0, 0xB9, 0x93, 0xB9, 0x94, 0xB9, 0x95, /* 0xB4-0xB7 */ + 0xF3, 0xE7, 0xB9, 0x96, 0xB9, 0x97, 0xB9, 0x98, /* 0xB8-0xBB */ + 0xB9, 0x99, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0xBC-0xBF */ + 0xB9, 0x9D, 0xF3, 0xF2, 0xB9, 0x9E, 0xB9, 0x9F, /* 0xC0-0xC3 */ + 0xB9, 0xA0, 0xBA, 0x40, 0xD7, 0xAD, 0xC6, 0xAA, /* 0xC4-0xC7 */ + 0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, 0xBA, 0x44, /* 0xC8-0xCB */ + 0xF3, 0xF3, 0xBA, 0x45, 0xBA, 0x46, 0xBA, 0x47, /* 0xCC-0xCF */ + 0xBA, 0x48, 0xF3, 0xF1, 0xBA, 0x49, 0xC2, 0xA8, /* 0xD0-0xD3 */ + 0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, 0xBA, 0x4D, /* 0xD4-0xD7 */ + 0xBA, 0x4E, 0xB8, 0xDD, 0xF3, 0xF5, 0xBA, 0x4F, /* 0xD8-0xDB */ + 0xBA, 0x50, 0xF3, 0xF4, 0xBA, 0x51, 0xBA, 0x52, /* 0xDC-0xDF */ + 0xBA, 0x53, 0xB4, 0xDB, 0xBA, 0x54, 0xBA, 0x55, /* 0xE0-0xE3 */ + 0xBA, 0x56, 0xF3, 0xF6, 0xF3, 0xF7, 0xBA, 0x57, /* 0xE4-0xE7 */ + 0xBA, 0x58, 0xBA, 0x59, 0xF3, 0xF8, 0xBA, 0x5A, /* 0xE8-0xEB */ + 0xBA, 0x5B, 0xBA, 0x5C, 0xC0, 0xBA, 0xBA, 0x5D, /* 0xEC-0xEF */ + 0xBA, 0x5E, 0xC0, 0xE9, 0xBA, 0x5F, 0xBA, 0x60, /* 0xF0-0xF3 */ + 0xBA, 0x61, 0xBA, 0x62, 0xBA, 0x63, 0xC5, 0xF1, /* 0xF4-0xF7 */ + 0xBA, 0x64, 0xBA, 0x65, 0xBA, 0x66, 0xBA, 0x67, /* 0xF8-0xFB */ + 0xF3, 0xFB, 0xBA, 0x68, 0xF3, 0xFA, 0xBA, 0x69, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7C[512] = { + 0xBA, 0x6A, 0xBA, 0x6B, 0xBA, 0x6C, 0xBA, 0x6D, /* 0x00-0x03 */ + 0xBA, 0x6E, 0xBA, 0x6F, 0xBA, 0x70, 0xB4, 0xD8, /* 0x04-0x07 */ + 0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, 0xF3, 0xFE, /* 0x08-0x0B */ + 0xF3, 0xF9, 0xBA, 0x74, 0xBA, 0x75, 0xF3, 0xFC, /* 0x0C-0x0F */ + 0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, 0xBA, 0x79, /* 0x10-0x13 */ + 0xBA, 0x7A, 0xBA, 0x7B, 0xF3, 0xFD, 0xBA, 0x7C, /* 0x14-0x17 */ + 0xBA, 0x7D, 0xBA, 0x7E, 0xBA, 0x80, 0xBA, 0x81, /* 0x18-0x1B */ + 0xBA, 0x82, 0xBA, 0x83, 0xBA, 0x84, 0xF4, 0xA1, /* 0x1C-0x1F */ + 0xBA, 0x85, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0x20-0x23 */ + 0xBA, 0x89, 0xBA, 0x8A, 0xF4, 0xA3, 0xBB, 0xC9, /* 0x24-0x27 */ + 0xBA, 0x8B, 0xBA, 0x8C, 0xF4, 0xA2, 0xBA, 0x8D, /* 0x28-0x2B */ + 0xBA, 0x8E, 0xBA, 0x8F, 0xBA, 0x90, 0xBA, 0x91, /* 0x2C-0x2F */ + 0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0x30-0x33 */ + 0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0x34-0x37 */ + 0xF4, 0xA4, 0xBA, 0x9A, 0xBA, 0x9B, 0xBA, 0x9C, /* 0x38-0x3B */ + 0xBA, 0x9D, 0xBA, 0x9E, 0xBA, 0x9F, 0xB2, 0xBE, /* 0x3C-0x3F */ + 0xF4, 0xA6, 0xF4, 0xA5, 0xBA, 0xA0, 0xBB, 0x40, /* 0x40-0x43 */ + 0xBB, 0x41, 0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, /* 0x44-0x47 */ + 0xBB, 0x45, 0xBB, 0x46, 0xBB, 0x47, 0xBB, 0x48, /* 0x48-0x4B */ + 0xBB, 0x49, 0xBC, 0xAE, 0xBB, 0x4A, 0xBB, 0x4B, /* 0x4C-0x4F */ + 0xBB, 0x4C, 0xBB, 0x4D, 0xBB, 0x4E, 0xBB, 0x4F, /* 0x50-0x53 */ + 0xBB, 0x50, 0xBB, 0x51, 0xBB, 0x52, 0xBB, 0x53, /* 0x54-0x57 */ + 0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x58-0x5B */ + 0xBB, 0x58, 0xBB, 0x59, 0xBB, 0x5A, 0xBB, 0x5B, /* 0x5C-0x5F */ + 0xBB, 0x5C, 0xBB, 0x5D, 0xBB, 0x5E, 0xBB, 0x5F, /* 0x60-0x63 */ + 0xBB, 0x60, 0xBB, 0x61, 0xBB, 0x62, 0xBB, 0x63, /* 0x64-0x67 */ + 0xBB, 0x64, 0xBB, 0x65, 0xBB, 0x66, 0xBB, 0x67, /* 0x68-0x6B */ + 0xBB, 0x68, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x6C-0x6F */ + 0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xC3, 0xD7, /* 0x70-0x73 */ + 0xD9, 0xE1, 0xBB, 0x6F, 0xBB, 0x70, 0xBB, 0x71, /* 0x74-0x77 */ + 0xBB, 0x72, 0xBB, 0x73, 0xBB, 0x74, 0xC0, 0xE0, /* 0x78-0x7B */ + 0xF4, 0xCC, 0xD7, 0xD1, 0xBB, 0x75, 0xBB, 0x76, /* 0x7C-0x7F */ + + 0xBB, 0x77, 0xBB, 0x78, 0xBB, 0x79, 0xBB, 0x7A, /* 0x80-0x83 */ + 0xBB, 0x7B, 0xBB, 0x7C, 0xBB, 0x7D, 0xBB, 0x7E, /* 0x84-0x87 */ + 0xBB, 0x80, 0xB7, 0xDB, 0xBB, 0x81, 0xBB, 0x82, /* 0x88-0x8B */ + 0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x8C-0x8F */ + 0xBB, 0x87, 0xF4, 0xCE, 0xC1, 0xA3, 0xBB, 0x88, /* 0x90-0x93 */ + 0xBB, 0x89, 0xC6, 0xC9, 0xBB, 0x8A, 0xB4, 0xD6, /* 0x94-0x97 */ + 0xD5, 0xB3, 0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, /* 0x98-0x9B */ + 0xF4, 0xD0, 0xF4, 0xCF, 0xF4, 0xD1, 0xCB, 0xDA, /* 0x9C-0x9F */ + 0xBB, 0x8E, 0xBB, 0x8F, 0xF4, 0xD2, 0xBB, 0x90, /* 0xA0-0xA3 */ + 0xD4, 0xC1, 0xD6, 0xE0, 0xBB, 0x91, 0xBB, 0x92, /* 0xA4-0xA7 */ + 0xBB, 0x93, 0xBB, 0x94, 0xB7, 0xE0, 0xBB, 0x95, /* 0xA8-0xAB */ + 0xBB, 0x96, 0xBB, 0x97, 0xC1, 0xB8, 0xBB, 0x98, /* 0xAC-0xAF */ + 0xBB, 0x99, 0xC1, 0xBB, 0xF4, 0xD3, 0xBE, 0xAC, /* 0xB0-0xB3 */ + 0xBB, 0x9A, 0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, /* 0xB4-0xB7 */ + 0xBB, 0x9E, 0xB4, 0xE2, 0xBB, 0x9F, 0xBB, 0xA0, /* 0xB8-0xBB */ + 0xF4, 0xD4, 0xF4, 0xD5, 0xBE, 0xAB, 0xBC, 0x40, /* 0xBC-0xBF */ + 0xBC, 0x41, 0xF4, 0xD6, 0xBC, 0x42, 0xBC, 0x43, /* 0xC0-0xC3 */ + 0xBC, 0x44, 0xF4, 0xDB, 0xBC, 0x45, 0xF4, 0xD7, /* 0xC4-0xC7 */ + 0xF4, 0xDA, 0xBC, 0x46, 0xBA, 0xFD, 0xBC, 0x47, /* 0xC8-0xCB */ + 0xF4, 0xD8, 0xF4, 0xD9, 0xBC, 0x48, 0xBC, 0x49, /* 0xCC-0xCF */ + 0xBC, 0x4A, 0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, /* 0xD0-0xD3 */ + 0xBC, 0x4E, 0xB8, 0xE2, 0xCC, 0xC7, 0xF4, 0xDC, /* 0xD4-0xD7 */ + 0xBC, 0x4F, 0xB2, 0xDA, 0xBC, 0x50, 0xBC, 0x51, /* 0xD8-0xDB */ + 0xC3, 0xD3, 0xBC, 0x52, 0xBC, 0x53, 0xD4, 0xE3, /* 0xDC-0xDF */ + 0xBF, 0xB7, 0xBC, 0x54, 0xBC, 0x55, 0xBC, 0x56, /* 0xE0-0xE3 */ + 0xBC, 0x57, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0xE4-0xE7 */ + 0xF4, 0xDD, 0xBC, 0x5B, 0xBC, 0x5C, 0xBC, 0x5D, /* 0xE8-0xEB */ + 0xBC, 0x5E, 0xBC, 0x5F, 0xBC, 0x60, 0xC5, 0xB4, /* 0xEC-0xEF */ + 0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0xF0-0xF3 */ + 0xBC, 0x65, 0xBC, 0x66, 0xBC, 0x67, 0xBC, 0x68, /* 0xF4-0xF7 */ + 0xF4, 0xE9, 0xBC, 0x69, 0xBC, 0x6A, 0xCF, 0xB5, /* 0xF8-0xFB */ + 0xBC, 0x6B, 0xBC, 0x6C, 0xBC, 0x6D, 0xBC, 0x6E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xBC, 0x6F, 0xBC, 0x70, 0xBC, 0x71, 0xBC, 0x72, /* 0x00-0x03 */ + 0xBC, 0x73, 0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, /* 0x04-0x07 */ + 0xBC, 0x77, 0xBC, 0x78, 0xCE, 0xC9, 0xBC, 0x79, /* 0x08-0x0B */ + 0xBC, 0x7A, 0xBC, 0x7B, 0xBC, 0x7C, 0xBC, 0x7D, /* 0x0C-0x0F */ + 0xBC, 0x7E, 0xBC, 0x80, 0xBC, 0x81, 0xBC, 0x82, /* 0x10-0x13 */ + 0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, 0xBC, 0x86, /* 0x14-0x17 */ + 0xBC, 0x87, 0xBC, 0x88, 0xBC, 0x89, 0xBC, 0x8A, /* 0x18-0x1B */ + 0xBC, 0x8B, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0x1C-0x1F */ + 0xCB, 0xD8, 0xBC, 0x8F, 0xCB, 0xF7, 0xBC, 0x90, /* 0x20-0x23 */ + 0xBC, 0x91, 0xBC, 0x92, 0xBC, 0x93, 0xBD, 0xF4, /* 0x24-0x27 */ + 0xBC, 0x94, 0xBC, 0x95, 0xBC, 0x96, 0xD7, 0xCF, /* 0x28-0x2B */ + 0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xC0, 0xDB, /* 0x2C-0x2F */ + 0xBC, 0x9A, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0x30-0x33 */ + 0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x40, /* 0x34-0x37 */ + 0xBD, 0x41, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0x38-0x3B */ + 0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0x3C-0x3F */ + 0xBD, 0x49, 0xBD, 0x4A, 0xBD, 0x4B, 0xBD, 0x4C, /* 0x40-0x43 */ + 0xBD, 0x4D, 0xBD, 0x4E, 0xBD, 0x4F, 0xBD, 0x50, /* 0x44-0x47 */ + 0xBD, 0x51, 0xBD, 0x52, 0xBD, 0x53, 0xBD, 0x54, /* 0x48-0x4B */ + 0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, 0xBD, 0x58, /* 0x4C-0x4F */ + 0xBD, 0x59, 0xBD, 0x5A, 0xBD, 0x5B, 0xBD, 0x5C, /* 0x50-0x53 */ + 0xBD, 0x5D, 0xBD, 0x5E, 0xBD, 0x5F, 0xBD, 0x60, /* 0x54-0x57 */ + 0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0x58-0x5B */ + 0xBD, 0x65, 0xBD, 0x66, 0xBD, 0x67, 0xBD, 0x68, /* 0x5C-0x5F */ + 0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x60-0x63 */ + 0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, 0xBD, 0x70, /* 0x64-0x67 */ + 0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, 0xBD, 0x74, /* 0x68-0x6B */ + 0xBD, 0x75, 0xBD, 0x76, 0xD0, 0xF5, 0xBD, 0x77, /* 0x6C-0x6F */ + 0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x7B, /* 0x70-0x73 */ + 0xBD, 0x7C, 0xBD, 0x7D, 0xBD, 0x7E, 0xF4, 0xEA, /* 0x74-0x77 */ + 0xBD, 0x80, 0xBD, 0x81, 0xBD, 0x82, 0xBD, 0x83, /* 0x78-0x7B */ + 0xBD, 0x84, 0xBD, 0x85, 0xBD, 0x86, 0xBD, 0x87, /* 0x7C-0x7F */ + + 0xBD, 0x88, 0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, /* 0x80-0x83 */ + 0xBD, 0x8C, 0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, /* 0x84-0x87 */ + 0xBD, 0x90, 0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, /* 0x88-0x8B */ + 0xBD, 0x94, 0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, /* 0x8C-0x8F */ + 0xBD, 0x98, 0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, /* 0x90-0x93 */ + 0xBD, 0x9C, 0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, /* 0x94-0x97 */ + 0xBD, 0xA0, 0xBE, 0x40, 0xBE, 0x41, 0xBE, 0x42, /* 0x98-0x9B */ + 0xBE, 0x43, 0xBE, 0x44, 0xBE, 0x45, 0xBE, 0x46, /* 0x9C-0x9F */ + 0xBE, 0x47, 0xBE, 0x48, 0xBE, 0x49, 0xBE, 0x4A, /* 0xA0-0xA3 */ + 0xBE, 0x4B, 0xBE, 0x4C, 0xF4, 0xEB, 0xBE, 0x4D, /* 0xA4-0xA7 */ + 0xBE, 0x4E, 0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, /* 0xA8-0xAB */ + 0xBE, 0x52, 0xBE, 0x53, 0xF4, 0xEC, 0xBE, 0x54, /* 0xAC-0xAF */ + 0xBE, 0x55, 0xBE, 0x56, 0xBE, 0x57, 0xBE, 0x58, /* 0xB0-0xB3 */ + 0xBE, 0x59, 0xBE, 0x5A, 0xBE, 0x5B, 0xBE, 0x5C, /* 0xB4-0xB7 */ + 0xBE, 0x5D, 0xBE, 0x5E, 0xBE, 0x5F, 0xBE, 0x60, /* 0xB8-0xBB */ + 0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0xBC-0xBF */ + 0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0xC0-0xC3 */ + 0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, 0xBE, 0x6C, /* 0xC4-0xC7 */ + 0xBE, 0x6D, 0xBE, 0x6E, 0xBE, 0x6F, 0xBE, 0x70, /* 0xC8-0xCB */ + 0xBE, 0x71, 0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, /* 0xCC-0xCF */ + 0xBE, 0x75, 0xBE, 0x76, 0xBE, 0x77, 0xBE, 0x78, /* 0xD0-0xD3 */ + 0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x7B, 0xBE, 0x7C, /* 0xD4-0xD7 */ + 0xBE, 0x7D, 0xBE, 0x7E, 0xBE, 0x80, 0xBE, 0x81, /* 0xD8-0xDB */ + 0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0xDC-0xDF */ + 0xBE, 0x86, 0xBE, 0x87, 0xBE, 0x88, 0xBE, 0x89, /* 0xE0-0xE3 */ + 0xBE, 0x8A, 0xBE, 0x8B, 0xBE, 0x8C, 0xBE, 0x8D, /* 0xE4-0xE7 */ + 0xBE, 0x8E, 0xBE, 0x8F, 0xBE, 0x90, 0xBE, 0x91, /* 0xE8-0xEB */ + 0xBE, 0x92, 0xBE, 0x93, 0xBE, 0x94, 0xBE, 0x95, /* 0xEC-0xEF */ + 0xBE, 0x96, 0xBE, 0x97, 0xBE, 0x98, 0xBE, 0x99, /* 0xF0-0xF3 */ + 0xBE, 0x9A, 0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, /* 0xF4-0xF7 */ + 0xBE, 0x9E, 0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x40, /* 0xF8-0xFB */ + 0xBF, 0x41, 0xBF, 0x42, 0xBF, 0x43, 0xBF, 0x44, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7E[512] = { + 0xBF, 0x45, 0xBF, 0x46, 0xBF, 0x47, 0xBF, 0x48, /* 0x00-0x03 */ + 0xBF, 0x49, 0xBF, 0x4A, 0xBF, 0x4B, 0xBF, 0x4C, /* 0x04-0x07 */ + 0xBF, 0x4D, 0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, /* 0x08-0x0B */ + 0xBF, 0x51, 0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, /* 0x0C-0x0F */ + 0xBF, 0x55, 0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, /* 0x10-0x13 */ + 0xBF, 0x59, 0xBF, 0x5A, 0xBF, 0x5B, 0xBF, 0x5C, /* 0x14-0x17 */ + 0xBF, 0x5D, 0xBF, 0x5E, 0xBF, 0x5F, 0xBF, 0x60, /* 0x18-0x1B */ + 0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, 0xBF, 0x64, /* 0x1C-0x1F */ + 0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, 0xBF, 0x68, /* 0x20-0x23 */ + 0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, 0xBF, 0x6C, /* 0x24-0x27 */ + 0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, 0xBF, 0x70, /* 0x28-0x2B */ + 0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, 0xBF, 0x74, /* 0x2C-0x2F */ + 0xBF, 0x75, 0xBF, 0x76, 0xBF, 0x77, 0xBF, 0x78, /* 0x30-0x33 */ + 0xBF, 0x79, 0xBF, 0x7A, 0xBF, 0x7B, 0xBF, 0x7C, /* 0x34-0x37 */ + 0xBF, 0x7D, 0xBF, 0x7E, 0xBF, 0x80, 0xF7, 0xE3, /* 0x38-0x3B */ + 0xBF, 0x81, 0xBF, 0x82, 0xBF, 0x83, 0xBF, 0x84, /* 0x3C-0x3F */ + 0xBF, 0x85, 0xB7, 0xB1, 0xBF, 0x86, 0xBF, 0x87, /* 0x40-0x43 */ + 0xBF, 0x88, 0xBF, 0x89, 0xBF, 0x8A, 0xF4, 0xED, /* 0x44-0x47 */ + 0xBF, 0x8B, 0xBF, 0x8C, 0xBF, 0x8D, 0xBF, 0x8E, /* 0x48-0x4B */ + 0xBF, 0x8F, 0xBF, 0x90, 0xBF, 0x91, 0xBF, 0x92, /* 0x4C-0x4F */ + 0xBF, 0x93, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0x50-0x53 */ + 0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, 0xBF, 0x9A, /* 0x54-0x57 */ + 0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, 0xBF, 0x9E, /* 0x58-0x5B */ + 0xBF, 0x9F, 0xBF, 0xA0, 0xC0, 0x40, 0xC0, 0x41, /* 0x5C-0x5F */ + 0xC0, 0x42, 0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, /* 0x60-0x63 */ + 0xC0, 0x46, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x64-0x67 */ + 0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, 0xC0, 0x4D, /* 0x68-0x6B */ + 0xC0, 0x4E, 0xC0, 0x4F, 0xC0, 0x50, 0xC0, 0x51, /* 0x6C-0x6F */ + 0xC0, 0x52, 0xC0, 0x53, 0xC0, 0x54, 0xC0, 0x55, /* 0x70-0x73 */ + 0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, 0xC0, 0x59, /* 0x74-0x77 */ + 0xC0, 0x5A, 0xC0, 0x5B, 0xC0, 0x5C, 0xC0, 0x5D, /* 0x78-0x7B */ + 0xC0, 0x5E, 0xC0, 0x5F, 0xC0, 0x60, 0xC0, 0x61, /* 0x7C-0x7F */ + + 0xC0, 0x62, 0xC0, 0x63, 0xD7, 0xEB, 0xC0, 0x64, /* 0x80-0x83 */ + 0xC0, 0x65, 0xC0, 0x66, 0xC0, 0x67, 0xC0, 0x68, /* 0x84-0x87 */ + 0xC0, 0x69, 0xC0, 0x6A, 0xC0, 0x6B, 0xC0, 0x6C, /* 0x88-0x8B */ + 0xC0, 0x6D, 0xC0, 0x6E, 0xC0, 0x6F, 0xC0, 0x70, /* 0x8C-0x8F */ + 0xC0, 0x71, 0xC0, 0x72, 0xC0, 0x73, 0xC0, 0x74, /* 0x90-0x93 */ + 0xC0, 0x75, 0xC0, 0x76, 0xC0, 0x77, 0xC0, 0x78, /* 0x94-0x97 */ + 0xC0, 0x79, 0xC0, 0x7A, 0xC0, 0x7B, 0xF4, 0xEE, /* 0x98-0x9B */ + 0xC0, 0x7C, 0xC0, 0x7D, 0xC0, 0x7E, 0xE6, 0xF9, /* 0x9C-0x9F */ + 0xBE, 0xC0, 0xE6, 0xFA, 0xBA, 0xEC, 0xE6, 0xFB, /* 0xA0-0xA3 */ + 0xCF, 0xCB, 0xE6, 0xFC, 0xD4, 0xBC, 0xBC, 0xB6, /* 0xA4-0xA7 */ + 0xE6, 0xFD, 0xE6, 0xFE, 0xBC, 0xCD, 0xC8, 0xD2, /* 0xA8-0xAB */ + 0xCE, 0xB3, 0xE7, 0xA1, 0xC0, 0x80, 0xB4, 0xBF, /* 0xAC-0xAF */ + 0xE7, 0xA2, 0xC9, 0xB4, 0xB8, 0xD9, 0xC4, 0xC9, /* 0xB0-0xB3 */ + 0xC0, 0x81, 0xD7, 0xDD, 0xC2, 0xDA, 0xB7, 0xD7, /* 0xB4-0xB7 */ + 0xD6, 0xBD, 0xCE, 0xC6, 0xB7, 0xC4, 0xC0, 0x82, /* 0xB8-0xBB */ + 0xC0, 0x83, 0xC5, 0xA6, 0xE7, 0xA3, 0xCF, 0xDF, /* 0xBC-0xBF */ + 0xE7, 0xA4, 0xE7, 0xA5, 0xE7, 0xA6, 0xC1, 0xB7, /* 0xC0-0xC3 */ + 0xD7, 0xE9, 0xC9, 0xF0, 0xCF, 0xB8, 0xD6, 0xAF, /* 0xC4-0xC7 */ + 0xD6, 0xD5, 0xE7, 0xA7, 0xB0, 0xED, 0xE7, 0xA8, /* 0xC8-0xCB */ + 0xE7, 0xA9, 0xC9, 0xDC, 0xD2, 0xEF, 0xBE, 0xAD, /* 0xCC-0xCF */ + 0xE7, 0xAA, 0xB0, 0xF3, 0xC8, 0xDE, 0xBD, 0xE1, /* 0xD0-0xD3 */ + 0xE7, 0xAB, 0xC8, 0xC6, 0xC0, 0x84, 0xE7, 0xAC, /* 0xD4-0xD7 */ + 0xBB, 0xE6, 0xB8, 0xF8, 0xD1, 0xA4, 0xE7, 0xAD, /* 0xD8-0xDB */ + 0xC2, 0xE7, 0xBE, 0xF8, 0xBD, 0xCA, 0xCD, 0xB3, /* 0xDC-0xDF */ + 0xE7, 0xAE, 0xE7, 0xAF, 0xBE, 0xEE, 0xD0, 0xE5, /* 0xE0-0xE3 */ + 0xC0, 0x85, 0xCB, 0xE7, 0xCC, 0xD0, 0xBC, 0xCC, /* 0xE4-0xE7 */ + 0xE7, 0xB0, 0xBC, 0xA8, 0xD0, 0xF7, 0xE7, 0xB1, /* 0xE8-0xEB */ + 0xC0, 0x86, 0xD0, 0xF8, 0xE7, 0xB2, 0xE7, 0xB3, /* 0xEC-0xEF */ + 0xB4, 0xC2, 0xE7, 0xB4, 0xE7, 0xB5, 0xC9, 0xFE, /* 0xF0-0xF3 */ + 0xCE, 0xAC, 0xC3, 0xE0, 0xE7, 0xB7, 0xB1, 0xC1, /* 0xF4-0xF7 */ + 0xB3, 0xF1, 0xC0, 0x87, 0xE7, 0xB8, 0xE7, 0xB9, /* 0xF8-0xFB */ + 0xD7, 0xDB, 0xD5, 0xC0, 0xE7, 0xBA, 0xC2, 0xCC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7F[512] = { + 0xD7, 0xBA, 0xE7, 0xBB, 0xE7, 0xBC, 0xE7, 0xBD, /* 0x00-0x03 */ + 0xBC, 0xEA, 0xC3, 0xE5, 0xC0, 0xC2, 0xE7, 0xBE, /* 0x04-0x07 */ + 0xE7, 0xBF, 0xBC, 0xA9, 0xC0, 0x88, 0xE7, 0xC0, /* 0x08-0x0B */ + 0xE7, 0xC1, 0xE7, 0xB6, 0xB6, 0xD0, 0xE7, 0xC2, /* 0x0C-0x0F */ + 0xC0, 0x89, 0xE7, 0xC3, 0xE7, 0xC4, 0xBB, 0xBA, /* 0x10-0x13 */ + 0xB5, 0xDE, 0xC2, 0xC6, 0xB1, 0xE0, 0xE7, 0xC5, /* 0x14-0x17 */ + 0xD4, 0xB5, 0xE7, 0xC6, 0xB8, 0xBF, 0xE7, 0xC8, /* 0x18-0x1B */ + 0xE7, 0xC7, 0xB7, 0xEC, 0xC0, 0x8A, 0xE7, 0xC9, /* 0x1C-0x1F */ + 0xB2, 0xF8, 0xE7, 0xCA, 0xE7, 0xCB, 0xE7, 0xCC, /* 0x20-0x23 */ + 0xE7, 0xCD, 0xE7, 0xCE, 0xE7, 0xCF, 0xE7, 0xD0, /* 0x24-0x27 */ + 0xD3, 0xA7, 0xCB, 0xF5, 0xE7, 0xD1, 0xE7, 0xD2, /* 0x28-0x2B */ + 0xE7, 0xD3, 0xE7, 0xD4, 0xC9, 0xC9, 0xE7, 0xD5, /* 0x2C-0x2F */ + 0xE7, 0xD6, 0xE7, 0xD7, 0xE7, 0xD8, 0xE7, 0xD9, /* 0x30-0x33 */ + 0xBD, 0xC9, 0xE7, 0xDA, 0xF3, 0xBE, 0xC0, 0x8B, /* 0x34-0x37 */ + 0xB8, 0xD7, 0xC0, 0x8C, 0xC8, 0xB1, 0xC0, 0x8D, /* 0x38-0x3B */ + 0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, 0xC0, 0x91, /* 0x3C-0x3F */ + 0xC0, 0x92, 0xC0, 0x93, 0xF3, 0xBF, 0xC0, 0x94, /* 0x40-0x43 */ + 0xF3, 0xC0, 0xF3, 0xC1, 0xC0, 0x95, 0xC0, 0x96, /* 0x44-0x47 */ + 0xC0, 0x97, 0xC0, 0x98, 0xC0, 0x99, 0xC0, 0x9A, /* 0x48-0x4B */ + 0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, 0xC0, 0x9E, /* 0x4C-0x4F */ + 0xB9, 0xDE, 0xCD, 0xF8, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x50-0x53 */ + 0xD8, 0xE8, 0xBA, 0xB1, 0xC1, 0x40, 0xC2, 0xDE, /* 0x54-0x57 */ + 0xEE, 0xB7, 0xC1, 0x41, 0xB7, 0xA3, 0xC1, 0x42, /* 0x58-0x5B */ + 0xC1, 0x43, 0xC1, 0x44, 0xC1, 0x45, 0xEE, 0xB9, /* 0x5C-0x5F */ + 0xC1, 0x46, 0xEE, 0xB8, 0xB0, 0xD5, 0xC1, 0x47, /* 0x60-0x63 */ + 0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x64-0x67 */ + 0xEE, 0xBB, 0xD5, 0xD6, 0xD7, 0xEF, 0xC1, 0x4C, /* 0x68-0x6B */ + 0xC1, 0x4D, 0xC1, 0x4E, 0xD6, 0xC3, 0xC1, 0x4F, /* 0x6C-0x6F */ + 0xC1, 0x50, 0xEE, 0xBD, 0xCA, 0xF0, 0xC1, 0x51, /* 0x70-0x73 */ + 0xEE, 0xBC, 0xC1, 0x52, 0xC1, 0x53, 0xC1, 0x54, /* 0x74-0x77 */ + 0xC1, 0x55, 0xEE, 0xBE, 0xC1, 0x56, 0xC1, 0x57, /* 0x78-0x7B */ + 0xC1, 0x58, 0xC1, 0x59, 0xEE, 0xC0, 0xC1, 0x5A, /* 0x7C-0x7F */ + + 0xC1, 0x5B, 0xEE, 0xBF, 0xC1, 0x5C, 0xC1, 0x5D, /* 0x80-0x83 */ + 0xC1, 0x5E, 0xC1, 0x5F, 0xC1, 0x60, 0xC1, 0x61, /* 0x84-0x87 */ + 0xC1, 0x62, 0xC1, 0x63, 0xD1, 0xF2, 0xC1, 0x64, /* 0x88-0x8B */ + 0xC7, 0xBC, 0xC1, 0x65, 0xC3, 0xC0, 0xC1, 0x66, /* 0x8C-0x8F */ + 0xC1, 0x67, 0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, /* 0x90-0x93 */ + 0xB8, 0xE1, 0xC1, 0x6B, 0xC1, 0x6C, 0xC1, 0x6D, /* 0x94-0x97 */ + 0xC1, 0x6E, 0xC1, 0x6F, 0xC1, 0xE7, 0xC1, 0x70, /* 0x98-0x9B */ + 0xC1, 0x71, 0xF4, 0xC6, 0xD0, 0xDF, 0xF4, 0xC7, /* 0x9C-0x9F */ + 0xC1, 0x72, 0xCF, 0xDB, 0xC1, 0x73, 0xC1, 0x74, /* 0xA0-0xA3 */ + 0xC8, 0xBA, 0xC1, 0x75, 0xC1, 0x76, 0xF4, 0xC8, /* 0xA4-0xA7 */ + 0xC1, 0x77, 0xC1, 0x78, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA8-0xAB */ + 0xC1, 0x7B, 0xC1, 0x7C, 0xC1, 0x7D, 0xF4, 0xC9, /* 0xAC-0xAF */ + 0xF4, 0xCA, 0xC1, 0x7E, 0xF4, 0xCB, 0xC1, 0x80, /* 0xB0-0xB3 */ + 0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xB4-0xB7 */ + 0xD9, 0xFA, 0xB8, 0xFE, 0xC1, 0x85, 0xC1, 0x86, /* 0xB8-0xBB */ + 0xE5, 0xF1, 0xD3, 0xF0, 0xC1, 0x87, 0xF4, 0xE0, /* 0xBC-0xBF */ + 0xC1, 0x88, 0xCE, 0xCC, 0xC1, 0x89, 0xC1, 0x8A, /* 0xC0-0xC3 */ + 0xC1, 0x8B, 0xB3, 0xE1, 0xC1, 0x8C, 0xC1, 0x8D, /* 0xC4-0xC7 */ + 0xC1, 0x8E, 0xC1, 0x8F, 0xF1, 0xB4, 0xC1, 0x90, /* 0xC8-0xCB */ + 0xD2, 0xEE, 0xC1, 0x91, 0xF4, 0xE1, 0xC1, 0x92, /* 0xCC-0xCF */ + 0xC1, 0x93, 0xC1, 0x94, 0xC1, 0x95, 0xC1, 0x96, /* 0xD0-0xD3 */ + 0xCF, 0xE8, 0xF4, 0xE2, 0xC1, 0x97, 0xC1, 0x98, /* 0xD4-0xD7 */ + 0xC7, 0xCC, 0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, /* 0xD8-0xDB */ + 0xC1, 0x9C, 0xC1, 0x9D, 0xC1, 0x9E, 0xB5, 0xD4, /* 0xDC-0xDF */ + 0xB4, 0xE4, 0xF4, 0xE4, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xE0-0xE3 */ + 0xC2, 0x40, 0xF4, 0xE3, 0xF4, 0xE5, 0xC2, 0x41, /* 0xE4-0xE7 */ + 0xC2, 0x42, 0xF4, 0xE6, 0xC2, 0x43, 0xC2, 0x44, /* 0xE8-0xEB */ + 0xC2, 0x45, 0xC2, 0x46, 0xF4, 0xE7, 0xC2, 0x47, /* 0xEC-0xEF */ + 0xBA, 0xB2, 0xB0, 0xBF, 0xC2, 0x48, 0xF4, 0xE8, /* 0xF0-0xF3 */ + 0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x4C, /* 0xF4-0xF7 */ + 0xC2, 0x4D, 0xC2, 0x4E, 0xC2, 0x4F, 0xB7, 0xAD, /* 0xF8-0xFB */ + 0xD2, 0xED, 0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xD2, 0xAB, 0xC0, 0xCF, 0xC2, 0x53, 0xBF, 0xBC, /* 0x00-0x03 */ + 0xEB, 0xA3, 0xD5, 0xDF, 0xEA, 0xC8, 0xC2, 0x54, /* 0x04-0x07 */ + 0xC2, 0x55, 0xC2, 0x56, 0xC2, 0x57, 0xF1, 0xF3, /* 0x08-0x0B */ + 0xB6, 0xF8, 0xCB, 0xA3, 0xC2, 0x58, 0xC2, 0x59, /* 0x0C-0x0F */ + 0xC4, 0xCD, 0xC2, 0x5A, 0xF1, 0xE7, 0xC2, 0x5B, /* 0x10-0x13 */ + 0xF1, 0xE8, 0xB8, 0xFB, 0xF1, 0xE9, 0xBA, 0xC4, /* 0x14-0x17 */ + 0xD4, 0xC5, 0xB0, 0xD2, 0xC2, 0x5C, 0xC2, 0x5D, /* 0x18-0x1B */ + 0xF1, 0xEA, 0xC2, 0x5E, 0xC2, 0x5F, 0xC2, 0x60, /* 0x1C-0x1F */ + 0xF1, 0xEB, 0xC2, 0x61, 0xF1, 0xEC, 0xC2, 0x62, /* 0x20-0x23 */ + 0xC2, 0x63, 0xF1, 0xED, 0xF1, 0xEE, 0xF1, 0xEF, /* 0x24-0x27 */ + 0xF1, 0xF1, 0xF1, 0xF0, 0xC5, 0xD5, 0xC2, 0x64, /* 0x28-0x2B */ + 0xC2, 0x65, 0xC2, 0x66, 0xC2, 0x67, 0xC2, 0x68, /* 0x2C-0x2F */ + 0xC2, 0x69, 0xF1, 0xF2, 0xC2, 0x6A, 0xB6, 0xFA, /* 0x30-0x33 */ + 0xC2, 0x6B, 0xF1, 0xF4, 0xD2, 0xAE, 0xDE, 0xC7, /* 0x34-0x37 */ + 0xCB, 0xCA, 0xC2, 0x6C, 0xC2, 0x6D, 0xB3, 0xDC, /* 0x38-0x3B */ + 0xC2, 0x6E, 0xB5, 0xA2, 0xC2, 0x6F, 0xB9, 0xA2, /* 0x3C-0x3F */ + 0xC2, 0x70, 0xC2, 0x71, 0xC4, 0xF4, 0xF1, 0xF5, /* 0x40-0x43 */ + 0xC2, 0x72, 0xC2, 0x73, 0xF1, 0xF6, 0xC2, 0x74, /* 0x44-0x47 */ + 0xC2, 0x75, 0xC2, 0x76, 0xC1, 0xC4, 0xC1, 0xFB, /* 0x48-0x4B */ + 0xD6, 0xB0, 0xF1, 0xF7, 0xC2, 0x77, 0xC2, 0x78, /* 0x4C-0x4F */ + 0xC2, 0x79, 0xC2, 0x7A, 0xF1, 0xF8, 0xC2, 0x7B, /* 0x50-0x53 */ + 0xC1, 0xAA, 0xC2, 0x7C, 0xC2, 0x7D, 0xC2, 0x7E, /* 0x54-0x57 */ + 0xC6, 0xB8, 0xC2, 0x80, 0xBE, 0xDB, 0xC2, 0x81, /* 0x58-0x5B */ + 0xC2, 0x82, 0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, /* 0x5C-0x5F */ + 0xC2, 0x86, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x60-0x63 */ + 0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, 0xC2, 0x8D, /* 0x64-0x67 */ + 0xC2, 0x8E, 0xF1, 0xF9, 0xB4, 0xCF, 0xC2, 0x8F, /* 0x68-0x6B */ + 0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x6C-0x6F */ + 0xC2, 0x94, 0xF1, 0xFA, 0xC2, 0x95, 0xC2, 0x96, /* 0x70-0x73 */ + 0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x74-0x77 */ + 0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x78-0x7B */ + 0xC2, 0x9F, 0xC2, 0xA0, 0xC3, 0x40, 0xED, 0xB2, /* 0x7C-0x7F */ + + 0xED, 0xB1, 0xC3, 0x41, 0xC3, 0x42, 0xCB, 0xE0, /* 0x80-0x83 */ + 0xD2, 0xDE, 0xC3, 0x43, 0xCB, 0xC1, 0xD5, 0xD8, /* 0x84-0x87 */ + 0xC3, 0x44, 0xC8, 0xE2, 0xC3, 0x45, 0xC0, 0xDF, /* 0x88-0x8B */ + 0xBC, 0xA1, 0xC3, 0x46, 0xC3, 0x47, 0xC3, 0x48, /* 0x8C-0x8F */ + 0xC3, 0x49, 0xC3, 0x4A, 0xC3, 0x4B, 0xEB, 0xC1, /* 0x90-0x93 */ + 0xC3, 0x4C, 0xC3, 0x4D, 0xD0, 0xA4, 0xC3, 0x4E, /* 0x94-0x97 */ + 0xD6, 0xE2, 0xC3, 0x4F, 0xB6, 0xC7, 0xB8, 0xD8, /* 0x98-0x9B */ + 0xEB, 0xC0, 0xB8, 0xCE, 0xC3, 0x50, 0xEB, 0xBF, /* 0x9C-0x9F */ + 0xB3, 0xA6, 0xB9, 0xC9, 0xD6, 0xAB, 0xC3, 0x51, /* 0xA0-0xA3 */ + 0xB7, 0xF4, 0xB7, 0xCA, 0xC3, 0x52, 0xC3, 0x53, /* 0xA4-0xA7 */ + 0xC3, 0x54, 0xBC, 0xE7, 0xB7, 0xBE, 0xEB, 0xC6, /* 0xA8-0xAB */ + 0xC3, 0x55, 0xEB, 0xC7, 0xB0, 0xB9, 0xBF, 0xCF, /* 0xAC-0xAF */ + 0xC3, 0x56, 0xEB, 0xC5, 0xD3, 0xFD, 0xC3, 0x57, /* 0xB0-0xB3 */ + 0xEB, 0xC8, 0xC3, 0x58, 0xC3, 0x59, 0xEB, 0xC9, /* 0xB4-0xB7 */ + 0xC3, 0x5A, 0xC3, 0x5B, 0xB7, 0xCE, 0xC3, 0x5C, /* 0xB8-0xBB */ + 0xEB, 0xC2, 0xEB, 0xC4, 0xC9, 0xF6, 0xD6, 0xD7, /* 0xBC-0xBF */ + 0xD5, 0xCD, 0xD0, 0xB2, 0xEB, 0xCF, 0xCE, 0xB8, /* 0xC0-0xC3 */ + 0xEB, 0xD0, 0xC3, 0x5D, 0xB5, 0xA8, 0xC3, 0x5E, /* 0xC4-0xC7 */ + 0xC3, 0x5F, 0xC3, 0x60, 0xC3, 0x61, 0xC3, 0x62, /* 0xC8-0xCB */ + 0xB1, 0xB3, 0xEB, 0xD2, 0xCC, 0xA5, 0xC3, 0x63, /* 0xCC-0xCF */ + 0xC3, 0x64, 0xC3, 0x65, 0xC3, 0x66, 0xC3, 0x67, /* 0xD0-0xD3 */ + 0xC3, 0x68, 0xC3, 0x69, 0xC5, 0xD6, 0xEB, 0xD3, /* 0xD4-0xD7 */ + 0xC3, 0x6A, 0xEB, 0xD1, 0xC5, 0xDF, 0xEB, 0xCE, /* 0xD8-0xDB */ + 0xCA, 0xA4, 0xEB, 0xD5, 0xB0, 0xFB, 0xC3, 0x6B, /* 0xDC-0xDF */ + 0xC3, 0x6C, 0xBA, 0xFA, 0xC3, 0x6D, 0xC3, 0x6E, /* 0xE0-0xE3 */ + 0xD8, 0xB7, 0xF1, 0xE3, 0xC3, 0x6F, 0xEB, 0xCA, /* 0xE4-0xE7 */ + 0xEB, 0xCB, 0xEB, 0xCC, 0xEB, 0xCD, 0xEB, 0xD6, /* 0xE8-0xEB */ + 0xE6, 0xC0, 0xEB, 0xD9, 0xC3, 0x70, 0xBF, 0xE8, /* 0xEC-0xEF */ + 0xD2, 0xC8, 0xEB, 0xD7, 0xEB, 0xDC, 0xB8, 0xEC, /* 0xF0-0xF3 */ + 0xEB, 0xD8, 0xC3, 0x71, 0xBD, 0xBA, 0xC3, 0x72, /* 0xF4-0xF7 */ + 0xD0, 0xD8, 0xC3, 0x73, 0xB0, 0xB7, 0xC3, 0x74, /* 0xF8-0xFB */ + 0xEB, 0xDD, 0xC4, 0xDC, 0xC3, 0x75, 0xC3, 0x76, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0xC3, 0x77, 0xC3, 0x78, 0xD6, 0xAC, 0xC3, 0x79, /* 0x00-0x03 */ + 0xC3, 0x7A, 0xC3, 0x7B, 0xB4, 0xE0, 0xC3, 0x7C, /* 0x04-0x07 */ + 0xC3, 0x7D, 0xC2, 0xF6, 0xBC, 0xB9, 0xC3, 0x7E, /* 0x08-0x0B */ + 0xC3, 0x80, 0xEB, 0xDA, 0xEB, 0xDB, 0xD4, 0xE0, /* 0x0C-0x0F */ + 0xC6, 0xEA, 0xC4, 0xD4, 0xEB, 0xDF, 0xC5, 0xA7, /* 0x10-0x13 */ + 0xD9, 0xF5, 0xC3, 0x81, 0xB2, 0xB1, 0xC3, 0x82, /* 0x14-0x17 */ + 0xEB, 0xE4, 0xC3, 0x83, 0xBD, 0xC5, 0xC3, 0x84, /* 0x18-0x1B */ + 0xC3, 0x85, 0xC3, 0x86, 0xEB, 0xE2, 0xC3, 0x87, /* 0x1C-0x1F */ + 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x20-0x23 */ + 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, 0xC3, 0x8F, /* 0x24-0x27 */ + 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, 0xC3, 0x93, /* 0x28-0x2B */ + 0xEB, 0xE3, 0xC3, 0x94, 0xC3, 0x95, 0xB8, 0xAC, /* 0x2C-0x2F */ + 0xC3, 0x96, 0xCD, 0xD1, 0xEB, 0xE5, 0xC3, 0x97, /* 0x30-0x33 */ + 0xC3, 0x98, 0xC3, 0x99, 0xEB, 0xE1, 0xC3, 0x9A, /* 0x34-0x37 */ + 0xC1, 0xB3, 0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, /* 0x38-0x3B */ + 0xC3, 0x9E, 0xC3, 0x9F, 0xC6, 0xA2, 0xC3, 0xA0, /* 0x3C-0x3F */ + 0xC4, 0x40, 0xC4, 0x41, 0xC4, 0x42, 0xC4, 0x43, /* 0x40-0x43 */ + 0xC4, 0x44, 0xC4, 0x45, 0xCC, 0xF3, 0xC4, 0x46, /* 0x44-0x47 */ + 0xEB, 0xE6, 0xC4, 0x47, 0xC0, 0xB0, 0xD2, 0xB8, /* 0x48-0x4B */ + 0xEB, 0xE7, 0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, /* 0x4C-0x4F */ + 0xB8, 0xAF, 0xB8, 0xAD, 0xC4, 0x4B, 0xEB, 0xE8, /* 0x50-0x53 */ + 0xC7, 0xBB, 0xCD, 0xF3, 0xC4, 0x4C, 0xC4, 0x4D, /* 0x54-0x57 */ + 0xC4, 0x4E, 0xEB, 0xEA, 0xEB, 0xEB, 0xC4, 0x4F, /* 0x58-0x5B */ + 0xC4, 0x50, 0xC4, 0x51, 0xC4, 0x52, 0xC4, 0x53, /* 0x5C-0x5F */ + 0xEB, 0xED, 0xC4, 0x54, 0xC4, 0x55, 0xC4, 0x56, /* 0x60-0x63 */ + 0xC4, 0x57, 0xD0, 0xC8, 0xC4, 0x58, 0xEB, 0xF2, /* 0x64-0x67 */ + 0xC4, 0x59, 0xEB, 0xEE, 0xC4, 0x5A, 0xC4, 0x5B, /* 0x68-0x6B */ + 0xC4, 0x5C, 0xEB, 0xF1, 0xC8, 0xF9, 0xC4, 0x5D, /* 0x6C-0x6F */ + 0xD1, 0xFC, 0xEB, 0xEC, 0xC4, 0x5E, 0xC4, 0x5F, /* 0x70-0x73 */ + 0xEB, 0xE9, 0xC4, 0x60, 0xC4, 0x61, 0xC4, 0x62, /* 0x74-0x77 */ + 0xC4, 0x63, 0xB8, 0xB9, 0xCF, 0xD9, 0xC4, 0xE5, /* 0x78-0x7B */ + 0xEB, 0xEF, 0xEB, 0xF0, 0xCC, 0xDA, 0xCD, 0xC8, /* 0x7C-0x7F */ + + 0xB0, 0xF2, 0xC4, 0x64, 0xEB, 0xF6, 0xC4, 0x65, /* 0x80-0x83 */ + 0xC4, 0x66, 0xC4, 0x67, 0xC4, 0x68, 0xC4, 0x69, /* 0x84-0x87 */ + 0xEB, 0xF5, 0xC4, 0x6A, 0xB2, 0xB2, 0xC4, 0x6B, /* 0x88-0x8B */ + 0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xB8, 0xE0, /* 0x8C-0x8F */ + 0xC4, 0x6F, 0xEB, 0xF7, 0xC4, 0x70, 0xC4, 0x71, /* 0x90-0x93 */ + 0xC4, 0x72, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0x94-0x97 */ + 0xB1, 0xEC, 0xC4, 0x76, 0xC4, 0x77, 0xCC, 0xC5, /* 0x98-0x9B */ + 0xC4, 0xA4, 0xCF, 0xA5, 0xC4, 0x78, 0xC4, 0x79, /* 0x9C-0x9F */ + 0xC4, 0x7A, 0xC4, 0x7B, 0xC4, 0x7C, 0xEB, 0xF9, /* 0xA0-0xA3 */ + 0xC4, 0x7D, 0xC4, 0x7E, 0xEC, 0xA2, 0xC4, 0x80, /* 0xA4-0xA7 */ + 0xC5, 0xF2, 0xC4, 0x81, 0xEB, 0xFA, 0xC4, 0x82, /* 0xA8-0xAB */ + 0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, 0xC4, 0x86, /* 0xAC-0xAF */ + 0xC4, 0x87, 0xC4, 0x88, 0xC4, 0x89, 0xC9, 0xC5, /* 0xB0-0xB3 */ + 0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, 0xC4, 0x8D, /* 0xB4-0xB7 */ + 0xC4, 0x8E, 0xC4, 0x8F, 0xE2, 0xDF, 0xEB, 0xFE, /* 0xB8-0xBB */ + 0xC4, 0x90, 0xC4, 0x91, 0xC4, 0x92, 0xC4, 0x93, /* 0xBC-0xBF */ + 0xCD, 0xCE, 0xEC, 0xA1, 0xB1, 0xDB, 0xD3, 0xB7, /* 0xC0-0xC3 */ + 0xC4, 0x94, 0xC4, 0x95, 0xD2, 0xDC, 0xC4, 0x96, /* 0xC4-0xC7 */ + 0xC4, 0x97, 0xC4, 0x98, 0xEB, 0xFD, 0xC4, 0x99, /* 0xC8-0xCB */ + 0xEB, 0xFB, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0xCC-0xCF */ + 0xC4, 0x9D, 0xC4, 0x9E, 0xC4, 0x9F, 0xC4, 0xA0, /* 0xD0-0xD3 */ + 0xC5, 0x40, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0xD4-0xD7 */ + 0xC5, 0x44, 0xC5, 0x45, 0xC5, 0x46, 0xC5, 0x47, /* 0xD8-0xDB */ + 0xC5, 0x48, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0xDC-0xDF */ + 0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xB3, 0xBC, /* 0xE0-0xE3 */ + 0xC5, 0x4F, 0xC5, 0x50, 0xC5, 0x51, 0xEA, 0xB0, /* 0xE4-0xE7 */ + 0xC5, 0x52, 0xC5, 0x53, 0xD7, 0xD4, 0xC5, 0x54, /* 0xE8-0xEB */ + 0xF4, 0xAB, 0xB3, 0xF4, 0xC5, 0x55, 0xC5, 0x56, /* 0xEC-0xEF */ + 0xC5, 0x57, 0xC5, 0x58, 0xC5, 0x59, 0xD6, 0xC1, /* 0xF0-0xF3 */ + 0xD6, 0xC2, 0xC5, 0x5A, 0xC5, 0x5B, 0xC5, 0x5C, /* 0xF4-0xF7 */ + 0xC5, 0x5D, 0xC5, 0x5E, 0xC5, 0x5F, 0xD5, 0xE9, /* 0xF8-0xFB */ + 0xBE, 0xCA, 0xC5, 0x60, 0xF4, 0xA7, 0xC5, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0xD2, 0xA8, 0xF4, 0xA8, 0xF4, 0xA9, 0xC5, 0x62, /* 0x00-0x03 */ + 0xF4, 0xAA, 0xBE, 0xCB, 0xD3, 0xDF, 0xC5, 0x63, /* 0x04-0x07 */ + 0xC5, 0x64, 0xC5, 0x65, 0xC5, 0x66, 0xC5, 0x67, /* 0x08-0x0B */ + 0xC9, 0xE0, 0xC9, 0xE1, 0xC5, 0x68, 0xC5, 0x69, /* 0x0C-0x0F */ + 0xF3, 0xC2, 0xC5, 0x6A, 0xCA, 0xE6, 0xC5, 0x6B, /* 0x10-0x13 */ + 0xCC, 0xF2, 0xC5, 0x6C, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x14-0x17 */ + 0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xE2, 0xB6, /* 0x18-0x1B */ + 0xCB, 0xB4, 0xC5, 0x72, 0xCE, 0xE8, 0xD6, 0xDB, /* 0x1C-0x1F */ + 0xC5, 0x73, 0xF4, 0xAD, 0xF4, 0xAE, 0xF4, 0xAF, /* 0x20-0x23 */ + 0xC5, 0x74, 0xC5, 0x75, 0xC5, 0x76, 0xC5, 0x77, /* 0x24-0x27 */ + 0xF4, 0xB2, 0xC5, 0x78, 0xBA, 0xBD, 0xF4, 0xB3, /* 0x28-0x2B */ + 0xB0, 0xE3, 0xF4, 0xB0, 0xC5, 0x79, 0xF4, 0xB1, /* 0x2C-0x2F */ + 0xBD, 0xA2, 0xB2, 0xD5, 0xC5, 0x7A, 0xF4, 0xB6, /* 0x30-0x33 */ + 0xF4, 0xB7, 0xB6, 0xE6, 0xB2, 0xB0, 0xCF, 0xCF, /* 0x34-0x37 */ + 0xF4, 0xB4, 0xB4, 0xAC, 0xC5, 0x7B, 0xF4, 0xB5, /* 0x38-0x3B */ + 0xC5, 0x7C, 0xC5, 0x7D, 0xF4, 0xB8, 0xC5, 0x7E, /* 0x3C-0x3F */ + 0xC5, 0x80, 0xC5, 0x81, 0xC5, 0x82, 0xC5, 0x83, /* 0x40-0x43 */ + 0xF4, 0xB9, 0xC5, 0x84, 0xC5, 0x85, 0xCD, 0xA7, /* 0x44-0x47 */ + 0xC5, 0x86, 0xF4, 0xBA, 0xC5, 0x87, 0xF4, 0xBB, /* 0x48-0x4B */ + 0xC5, 0x88, 0xC5, 0x89, 0xC5, 0x8A, 0xF4, 0xBC, /* 0x4C-0x4F */ + 0xC5, 0x8B, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x50-0x53 */ + 0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, 0xC5, 0x92, /* 0x54-0x57 */ + 0xCB, 0xD2, 0xC5, 0x93, 0xF4, 0xBD, 0xC5, 0x94, /* 0x58-0x5B */ + 0xC5, 0x95, 0xC5, 0x96, 0xC5, 0x97, 0xF4, 0xBE, /* 0x5C-0x5F */ + 0xC5, 0x98, 0xC5, 0x99, 0xC5, 0x9A, 0xC5, 0x9B, /* 0x60-0x63 */ + 0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, 0xC5, 0x9F, /* 0x64-0x67 */ + 0xF4, 0xBF, 0xC5, 0xA0, 0xC6, 0x40, 0xC6, 0x41, /* 0x68-0x6B */ + 0xC6, 0x42, 0xC6, 0x43, 0xF4, 0xDE, 0xC1, 0xBC, /* 0x6C-0x6F */ + 0xBC, 0xE8, 0xC6, 0x44, 0xC9, 0xAB, 0xD1, 0xDE, /* 0x70-0x73 */ + 0xE5, 0xF5, 0xC6, 0x45, 0xC6, 0x46, 0xC6, 0x47, /* 0x74-0x77 */ + 0xC6, 0x48, 0xDC, 0xB3, 0xD2, 0xD5, 0xC6, 0x49, /* 0x78-0x7B */ + 0xC6, 0x4A, 0xDC, 0xB4, 0xB0, 0xAC, 0xDC, 0xB5, /* 0x7C-0x7F */ + + 0xC6, 0x4B, 0xC6, 0x4C, 0xBD, 0xDA, 0xC6, 0x4D, /* 0x80-0x83 */ + 0xDC, 0xB9, 0xC6, 0x4E, 0xC6, 0x4F, 0xC6, 0x50, /* 0x84-0x87 */ + 0xD8, 0xC2, 0xC6, 0x51, 0xDC, 0xB7, 0xD3, 0xF3, /* 0x88-0x8B */ + 0xC6, 0x52, 0xC9, 0xD6, 0xDC, 0xBA, 0xDC, 0xB6, /* 0x8C-0x8F */ + 0xC6, 0x53, 0xDC, 0xBB, 0xC3, 0xA2, 0xC6, 0x54, /* 0x90-0x93 */ + 0xC6, 0x55, 0xC6, 0x56, 0xC6, 0x57, 0xDC, 0xBC, /* 0x94-0x97 */ + 0xDC, 0xC5, 0xDC, 0xBD, 0xC6, 0x58, 0xC6, 0x59, /* 0x98-0x9B */ + 0xCE, 0xDF, 0xD6, 0xA5, 0xC6, 0x5A, 0xDC, 0xCF, /* 0x9C-0x9F */ + 0xC6, 0x5B, 0xDC, 0xCD, 0xC6, 0x5C, 0xC6, 0x5D, /* 0xA0-0xA3 */ + 0xDC, 0xD2, 0xBD, 0xE6, 0xC2, 0xAB, 0xC6, 0x5E, /* 0xA4-0xA7 */ + 0xDC, 0xB8, 0xDC, 0xCB, 0xDC, 0xCE, 0xDC, 0xBE, /* 0xA8-0xAB */ + 0xB7, 0xD2, 0xB0, 0xC5, 0xDC, 0xC7, 0xD0, 0xBE, /* 0xAC-0xAF */ + 0xDC, 0xC1, 0xBB, 0xA8, 0xC6, 0x5F, 0xB7, 0xBC, /* 0xB0-0xB3 */ + 0xDC, 0xCC, 0xC6, 0x60, 0xC6, 0x61, 0xDC, 0xC6, /* 0xB4-0xB7 */ + 0xDC, 0xBF, 0xC7, 0xDB, 0xC6, 0x62, 0xC6, 0x63, /* 0xB8-0xBB */ + 0xC6, 0x64, 0xD1, 0xBF, 0xDC, 0xC0, 0xC6, 0x65, /* 0xBC-0xBF */ + 0xC6, 0x66, 0xDC, 0xCA, 0xC6, 0x67, 0xC6, 0x68, /* 0xC0-0xC3 */ + 0xDC, 0xD0, 0xC6, 0x69, 0xC6, 0x6A, 0xCE, 0xAD, /* 0xC4-0xC7 */ + 0xDC, 0xC2, 0xC6, 0x6B, 0xDC, 0xC3, 0xDC, 0xC8, /* 0xC8-0xCB */ + 0xDC, 0xC9, 0xB2, 0xD4, 0xDC, 0xD1, 0xCB, 0xD5, /* 0xCC-0xCF */ + 0xC6, 0x6C, 0xD4, 0xB7, 0xDC, 0xDB, 0xDC, 0xDF, /* 0xD0-0xD3 */ + 0xCC, 0xA6, 0xDC, 0xE6, 0xC6, 0x6D, 0xC3, 0xE7, /* 0xD4-0xD7 */ + 0xDC, 0xDC, 0xC6, 0x6E, 0xC6, 0x6F, 0xBF, 0xC1, /* 0xD8-0xDB */ + 0xDC, 0xD9, 0xC6, 0x70, 0xB0, 0xFA, 0xB9, 0xB6, /* 0xDC-0xDF */ + 0xDC, 0xE5, 0xDC, 0xD3, 0xC6, 0x71, 0xDC, 0xC4, /* 0xE0-0xE3 */ + 0xDC, 0xD6, 0xC8, 0xF4, 0xBF, 0xE0, 0xC6, 0x72, /* 0xE4-0xE7 */ + 0xC6, 0x73, 0xC6, 0x74, 0xC6, 0x75, 0xC9, 0xBB, /* 0xE8-0xEB */ + 0xC6, 0x76, 0xC6, 0x77, 0xC6, 0x78, 0xB1, 0xBD, /* 0xEC-0xEF */ + 0xC6, 0x79, 0xD3, 0xA2, 0xC6, 0x7A, 0xC6, 0x7B, /* 0xF0-0xF3 */ + 0xDC, 0xDA, 0xC6, 0x7C, 0xC6, 0x7D, 0xDC, 0xD5, /* 0xF4-0xF7 */ + 0xC6, 0x7E, 0xC6, 0xBB, 0xC6, 0x80, 0xDC, 0xDE, /* 0xF8-0xFB */ + 0xC6, 0x81, 0xC6, 0x82, 0xC6, 0x83, 0xC6, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0xC6, 0x85, 0xD7, 0xC2, 0xC3, 0xAF, 0xB7, 0xB6, /* 0x00-0x03 */ + 0xC7, 0xD1, 0xC3, 0xA9, 0xDC, 0xE2, 0xDC, 0xD8, /* 0x04-0x07 */ + 0xDC, 0xEB, 0xDC, 0xD4, 0xC6, 0x86, 0xC6, 0x87, /* 0x08-0x0B */ + 0xDC, 0xDD, 0xC6, 0x88, 0xBE, 0xA5, 0xDC, 0xD7, /* 0x0C-0x0F */ + 0xC6, 0x89, 0xDC, 0xE0, 0xC6, 0x8A, 0xC6, 0x8B, /* 0x10-0x13 */ + 0xDC, 0xE3, 0xDC, 0xE4, 0xC6, 0x8C, 0xDC, 0xF8, /* 0x14-0x17 */ + 0xC6, 0x8D, 0xC6, 0x8E, 0xDC, 0xE1, 0xDD, 0xA2, /* 0x18-0x1B */ + 0xDC, 0xE7, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x91, /* 0x1C-0x1F */ + 0xC6, 0x92, 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x95, /* 0x20-0x23 */ + 0xC6, 0x96, 0xC6, 0x97, 0xC6, 0x98, 0xBC, 0xEB, /* 0x24-0x27 */ + 0xB4, 0xC4, 0xC6, 0x99, 0xC6, 0x9A, 0xC3, 0xA3, /* 0x28-0x2B */ + 0xB2, 0xE7, 0xDC, 0xFA, 0xC6, 0x9B, 0xDC, 0xF2, /* 0x2C-0x2F */ + 0xC6, 0x9C, 0xDC, 0xEF, 0xC6, 0x9D, 0xDC, 0xFC, /* 0x30-0x33 */ + 0xDC, 0xEE, 0xD2, 0xF0, 0xB2, 0xE8, 0xC6, 0x9E, /* 0x34-0x37 */ + 0xC8, 0xD7, 0xC8, 0xE3, 0xDC, 0xFB, 0xC6, 0x9F, /* 0x38-0x3B */ + 0xDC, 0xED, 0xC6, 0xA0, 0xC7, 0x40, 0xC7, 0x41, /* 0x3C-0x3F */ + 0xDC, 0xF7, 0xC7, 0x42, 0xC7, 0x43, 0xDC, 0xF5, /* 0x40-0x43 */ + 0xC7, 0x44, 0xC7, 0x45, 0xBE, 0xA3, 0xDC, 0xF4, /* 0x44-0x47 */ + 0xC7, 0x46, 0xB2, 0xDD, 0xC7, 0x47, 0xC7, 0x48, /* 0x48-0x4B */ + 0xC7, 0x49, 0xC7, 0x4A, 0xC7, 0x4B, 0xDC, 0xF3, /* 0x4C-0x4F */ + 0xBC, 0xF6, 0xDC, 0xE8, 0xBB, 0xC4, 0xC7, 0x4C, /* 0x50-0x53 */ + 0xC0, 0xF3, 0xC7, 0x4D, 0xC7, 0x4E, 0xC7, 0x4F, /* 0x54-0x57 */ + 0xC7, 0x50, 0xC7, 0x51, 0xBC, 0xD4, 0xDC, 0xE9, /* 0x58-0x5B */ + 0xDC, 0xEA, 0xC7, 0x52, 0xDC, 0xF1, 0xDC, 0xF6, /* 0x5C-0x5F */ + 0xDC, 0xF9, 0xB5, 0xB4, 0xC7, 0x53, 0xC8, 0xD9, /* 0x60-0x63 */ + 0xBB, 0xE7, 0xDC, 0xFE, 0xDC, 0xFD, 0xD3, 0xAB, /* 0x64-0x67 */ + 0xDD, 0xA1, 0xDD, 0xA3, 0xDD, 0xA5, 0xD2, 0xF1, /* 0x68-0x6B */ + 0xDD, 0xA4, 0xDD, 0xA6, 0xDD, 0xA7, 0xD2, 0xA9, /* 0x6C-0x6F */ + 0xC7, 0x54, 0xC7, 0x55, 0xC7, 0x56, 0xC7, 0x57, /* 0x70-0x73 */ + 0xC7, 0x58, 0xC7, 0x59, 0xC7, 0x5A, 0xBA, 0xC9, /* 0x74-0x77 */ + 0xDD, 0xA9, 0xC7, 0x5B, 0xC7, 0x5C, 0xDD, 0xB6, /* 0x78-0x7B */ + 0xDD, 0xB1, 0xDD, 0xB4, 0xC7, 0x5D, 0xC7, 0x5E, /* 0x7C-0x7F */ + + 0xC7, 0x5F, 0xC7, 0x60, 0xC7, 0x61, 0xC7, 0x62, /* 0x80-0x83 */ + 0xC7, 0x63, 0xDD, 0xB0, 0xC6, 0xCE, 0xC7, 0x64, /* 0x84-0x87 */ + 0xC7, 0x65, 0xC0, 0xF2, 0xC7, 0x66, 0xC7, 0x67, /* 0x88-0x8B */ + 0xC7, 0x68, 0xC7, 0x69, 0xC9, 0xAF, 0xC7, 0x6A, /* 0x8C-0x8F */ + 0xC7, 0x6B, 0xC7, 0x6C, 0xDC, 0xEC, 0xDD, 0xAE, /* 0x90-0x93 */ + 0xC7, 0x6D, 0xC7, 0x6E, 0xC7, 0x6F, 0xC7, 0x70, /* 0x94-0x97 */ + 0xDD, 0xB7, 0xC7, 0x71, 0xC7, 0x72, 0xDC, 0xF0, /* 0x98-0x9B */ + 0xDD, 0xAF, 0xC7, 0x73, 0xDD, 0xB8, 0xC7, 0x74, /* 0x9C-0x9F */ + 0xDD, 0xAC, 0xC7, 0x75, 0xC7, 0x76, 0xC7, 0x77, /* 0xA0-0xA3 */ + 0xC7, 0x78, 0xC7, 0x79, 0xC7, 0x7A, 0xC7, 0x7B, /* 0xA4-0xA7 */ + 0xDD, 0xB9, 0xDD, 0xB3, 0xDD, 0xAD, 0xC4, 0xAA, /* 0xA8-0xAB */ + 0xC7, 0x7C, 0xC7, 0x7D, 0xC7, 0x7E, 0xC7, 0x80, /* 0xAC-0xAF */ + 0xDD, 0xA8, 0xC0, 0xB3, 0xC1, 0xAB, 0xDD, 0xAA, /* 0xB0-0xB3 */ + 0xDD, 0xAB, 0xC7, 0x81, 0xDD, 0xB2, 0xBB, 0xF1, /* 0xB4-0xB7 */ + 0xDD, 0xB5, 0xD3, 0xA8, 0xDD, 0xBA, 0xC7, 0x82, /* 0xB8-0xBB */ + 0xDD, 0xBB, 0xC3, 0xA7, 0xC7, 0x83, 0xC7, 0x84, /* 0xBC-0xBF */ + 0xDD, 0xD2, 0xDD, 0xBC, 0xC7, 0x85, 0xC7, 0x86, /* 0xC0-0xC3 */ + 0xC7, 0x87, 0xDD, 0xD1, 0xC7, 0x88, 0xB9, 0xBD, /* 0xC4-0xC7 */ + 0xC7, 0x89, 0xC7, 0x8A, 0xBE, 0xD5, 0xC7, 0x8B, /* 0xC8-0xCB */ + 0xBE, 0xFA, 0xC7, 0x8C, 0xC7, 0x8D, 0xBA, 0xCA, /* 0xCC-0xCF */ + 0xC7, 0x8E, 0xC7, 0x8F, 0xC7, 0x90, 0xC7, 0x91, /* 0xD0-0xD3 */ + 0xDD, 0xCA, 0xC7, 0x92, 0xDD, 0xC5, 0xC7, 0x93, /* 0xD4-0xD7 */ + 0xDD, 0xBF, 0xC7, 0x94, 0xC7, 0x95, 0xC7, 0x96, /* 0xD8-0xDB */ + 0xB2, 0xCB, 0xDD, 0xC3, 0xC7, 0x97, 0xDD, 0xCB, /* 0xDC-0xDF */ + 0xB2, 0xA4, 0xDD, 0xD5, 0xC7, 0x98, 0xC7, 0x99, /* 0xE0-0xE3 */ + 0xC7, 0x9A, 0xDD, 0xBE, 0xC7, 0x9B, 0xC7, 0x9C, /* 0xE4-0xE7 */ + 0xC7, 0x9D, 0xC6, 0xD0, 0xDD, 0xD0, 0xC7, 0x9E, /* 0xE8-0xEB */ + 0xC7, 0x9F, 0xC7, 0xA0, 0xC8, 0x40, 0xC8, 0x41, /* 0xEC-0xEF */ + 0xDD, 0xD4, 0xC1, 0xE2, 0xB7, 0xC6, 0xC8, 0x42, /* 0xF0-0xF3 */ + 0xC8, 0x43, 0xC8, 0x44, 0xC8, 0x45, 0xC8, 0x46, /* 0xF4-0xF7 */ + 0xDD, 0xCE, 0xDD, 0xCF, 0xC8, 0x47, 0xC8, 0x48, /* 0xF8-0xFB */ + 0xC8, 0x49, 0xDD, 0xC4, 0xC8, 0x4A, 0xC8, 0x4B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0xC8, 0x4C, 0xDD, 0xBD, 0xC8, 0x4D, 0xDD, 0xCD, /* 0x00-0x03 */ + 0xCC, 0xD1, 0xC8, 0x4E, 0xDD, 0xC9, 0xC8, 0x4F, /* 0x04-0x07 */ + 0xC8, 0x50, 0xC8, 0x51, 0xC8, 0x52, 0xDD, 0xC2, /* 0x08-0x0B */ + 0xC3, 0xC8, 0xC6, 0xBC, 0xCE, 0xAE, 0xDD, 0xCC, /* 0x0C-0x0F */ + 0xC8, 0x53, 0xDD, 0xC8, 0xC8, 0x54, 0xC8, 0x55, /* 0x10-0x13 */ + 0xC8, 0x56, 0xC8, 0x57, 0xC8, 0x58, 0xC8, 0x59, /* 0x14-0x17 */ + 0xDD, 0xC1, 0xC8, 0x5A, 0xC8, 0x5B, 0xC8, 0x5C, /* 0x18-0x1B */ + 0xDD, 0xC6, 0xC2, 0xDC, 0xC8, 0x5D, 0xC8, 0x5E, /* 0x1C-0x1F */ + 0xC8, 0x5F, 0xC8, 0x60, 0xC8, 0x61, 0xC8, 0x62, /* 0x20-0x23 */ + 0xD3, 0xA9, 0xD3, 0xAA, 0xDD, 0xD3, 0xCF, 0xF4, /* 0x24-0x27 */ + 0xC8, 0xF8, 0xC8, 0x63, 0xC8, 0x64, 0xC8, 0x65, /* 0x28-0x2B */ + 0xC8, 0x66, 0xC8, 0x67, 0xC8, 0x68, 0xC8, 0x69, /* 0x2C-0x2F */ + 0xC8, 0x6A, 0xDD, 0xE6, 0xC8, 0x6B, 0xC8, 0x6C, /* 0x30-0x33 */ + 0xC8, 0x6D, 0xC8, 0x6E, 0xC8, 0x6F, 0xC8, 0x70, /* 0x34-0x37 */ + 0xDD, 0xC7, 0xC8, 0x71, 0xC8, 0x72, 0xC8, 0x73, /* 0x38-0x3B */ + 0xDD, 0xE0, 0xC2, 0xE4, 0xC8, 0x74, 0xC8, 0x75, /* 0x3C-0x3F */ + 0xC8, 0x76, 0xC8, 0x77, 0xC8, 0x78, 0xC8, 0x79, /* 0x40-0x43 */ + 0xC8, 0x7A, 0xC8, 0x7B, 0xDD, 0xE1, 0xC8, 0x7C, /* 0x44-0x47 */ + 0xC8, 0x7D, 0xC8, 0x7E, 0xC8, 0x80, 0xC8, 0x81, /* 0x48-0x4B */ + 0xC8, 0x82, 0xC8, 0x83, 0xC8, 0x84, 0xC8, 0x85, /* 0x4C-0x4F */ + 0xC8, 0x86, 0xDD, 0xD7, 0xC8, 0x87, 0xC8, 0x88, /* 0x50-0x53 */ + 0xC8, 0x89, 0xC8, 0x8A, 0xC8, 0x8B, 0xD6, 0xF8, /* 0x54-0x57 */ + 0xC8, 0x8C, 0xDD, 0xD9, 0xDD, 0xD8, 0xB8, 0xF0, /* 0x58-0x5B */ + 0xDD, 0xD6, 0xC8, 0x8D, 0xC8, 0x8E, 0xC8, 0x8F, /* 0x5C-0x5F */ + 0xC8, 0x90, 0xC6, 0xCF, 0xC8, 0x91, 0xB6, 0xAD, /* 0x60-0x63 */ + 0xC8, 0x92, 0xC8, 0x93, 0xC8, 0x94, 0xC8, 0x95, /* 0x64-0x67 */ + 0xC8, 0x96, 0xDD, 0xE2, 0xC8, 0x97, 0xBA, 0xF9, /* 0x68-0x6B */ + 0xD4, 0xE1, 0xDD, 0xE7, 0xC8, 0x98, 0xC8, 0x99, /* 0x6C-0x6F */ + 0xC8, 0x9A, 0xB4, 0xD0, 0xC8, 0x9B, 0xDD, 0xDA, /* 0x70-0x73 */ + 0xC8, 0x9C, 0xBF, 0xFB, 0xDD, 0xE3, 0xC8, 0x9D, /* 0x74-0x77 */ + 0xDD, 0xDF, 0xC8, 0x9E, 0xDD, 0xDD, 0xC8, 0x9F, /* 0x78-0x7B */ + 0xC8, 0xA0, 0xC9, 0x40, 0xC9, 0x41, 0xC9, 0x42, /* 0x7C-0x7F */ + + 0xC9, 0x43, 0xC9, 0x44, 0xB5, 0xD9, 0xC9, 0x45, /* 0x80-0x83 */ + 0xC9, 0x46, 0xC9, 0x47, 0xC9, 0x48, 0xDD, 0xDB, /* 0x84-0x87 */ + 0xDD, 0xDC, 0xDD, 0xDE, 0xC9, 0x49, 0xBD, 0xAF, /* 0x88-0x8B */ + 0xDD, 0xE4, 0xC9, 0x4A, 0xDD, 0xE5, 0xC9, 0x4B, /* 0x8C-0x8F */ + 0xC9, 0x4C, 0xC9, 0x4D, 0xC9, 0x4E, 0xC9, 0x4F, /* 0x90-0x93 */ + 0xC9, 0x50, 0xC9, 0x51, 0xC9, 0x52, 0xDD, 0xF5, /* 0x94-0x97 */ + 0xC9, 0x53, 0xC3, 0xC9, 0xC9, 0x54, 0xC9, 0x55, /* 0x98-0x9B */ + 0xCB, 0xE2, 0xC9, 0x56, 0xC9, 0x57, 0xC9, 0x58, /* 0x9C-0x9F */ + 0xC9, 0x59, 0xDD, 0xF2, 0xC9, 0x5A, 0xC9, 0x5B, /* 0xA0-0xA3 */ + 0xC9, 0x5C, 0xC9, 0x5D, 0xC9, 0x5E, 0xC9, 0x5F, /* 0xA4-0xA7 */ + 0xC9, 0x60, 0xC9, 0x61, 0xC9, 0x62, 0xC9, 0x63, /* 0xA8-0xAB */ + 0xC9, 0x64, 0xC9, 0x65, 0xC9, 0x66, 0xD8, 0xE1, /* 0xAC-0xAF */ + 0xC9, 0x67, 0xC9, 0x68, 0xC6, 0xD1, 0xC9, 0x69, /* 0xB0-0xB3 */ + 0xDD, 0xF4, 0xC9, 0x6A, 0xC9, 0x6B, 0xC9, 0x6C, /* 0xB4-0xB7 */ + 0xD5, 0xF4, 0xDD, 0xF3, 0xDD, 0xF0, 0xC9, 0x6D, /* 0xB8-0xBB */ + 0xC9, 0x6E, 0xDD, 0xEC, 0xC9, 0x6F, 0xDD, 0xEF, /* 0xBC-0xBF */ + 0xC9, 0x70, 0xDD, 0xE8, 0xC9, 0x71, 0xC9, 0x72, /* 0xC0-0xC3 */ + 0xD0, 0xEE, 0xC9, 0x73, 0xC9, 0x74, 0xC9, 0x75, /* 0xC4-0xC7 */ + 0xC9, 0x76, 0xC8, 0xD8, 0xDD, 0xEE, 0xC9, 0x77, /* 0xC8-0xCB */ + 0xC9, 0x78, 0xDD, 0xE9, 0xC9, 0x79, 0xC9, 0x7A, /* 0xCC-0xCF */ + 0xDD, 0xEA, 0xCB, 0xF2, 0xC9, 0x7B, 0xDD, 0xED, /* 0xD0-0xD3 */ + 0xC9, 0x7C, 0xC9, 0x7D, 0xB1, 0xCD, 0xC9, 0x7E, /* 0xD4-0xD7 */ + 0xC9, 0x80, 0xC9, 0x81, 0xC9, 0x82, 0xC9, 0x83, /* 0xD8-0xDB */ + 0xC9, 0x84, 0xC0, 0xB6, 0xC9, 0x85, 0xBC, 0xBB, /* 0xDC-0xDF */ + 0xDD, 0xF1, 0xC9, 0x86, 0xC9, 0x87, 0xDD, 0xF7, /* 0xE0-0xE3 */ + 0xC9, 0x88, 0xDD, 0xF6, 0xDD, 0xEB, 0xC9, 0x89, /* 0xE4-0xE7 */ + 0xC9, 0x8A, 0xC9, 0x8B, 0xC9, 0x8C, 0xC9, 0x8D, /* 0xE8-0xEB */ + 0xC5, 0xEE, 0xC9, 0x8E, 0xC9, 0x8F, 0xC9, 0x90, /* 0xEC-0xEF */ + 0xDD, 0xFB, 0xC9, 0x91, 0xC9, 0x92, 0xC9, 0x93, /* 0xF0-0xF3 */ + 0xC9, 0x94, 0xC9, 0x95, 0xC9, 0x96, 0xC9, 0x97, /* 0xF4-0xF7 */ + 0xC9, 0x98, 0xC9, 0x99, 0xC9, 0x9A, 0xC9, 0x9B, /* 0xF8-0xFB */ + 0xDE, 0xA4, 0xC9, 0x9C, 0xC9, 0x9D, 0xDE, 0xA3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0xC9, 0x9E, 0xC9, 0x9F, 0xC9, 0xA0, 0xCA, 0x40, /* 0x00-0x03 */ + 0xCA, 0x41, 0xCA, 0x42, 0xCA, 0x43, 0xCA, 0x44, /* 0x04-0x07 */ + 0xCA, 0x45, 0xCA, 0x46, 0xCA, 0x47, 0xCA, 0x48, /* 0x08-0x0B */ + 0xDD, 0xF8, 0xCA, 0x49, 0xCA, 0x4A, 0xCA, 0x4B, /* 0x0C-0x0F */ + 0xCA, 0x4C, 0xC3, 0xEF, 0xCA, 0x4D, 0xC2, 0xFB, /* 0x10-0x13 */ + 0xCA, 0x4E, 0xCA, 0x4F, 0xCA, 0x50, 0xD5, 0xE1, /* 0x14-0x17 */ + 0xCA, 0x51, 0xCA, 0x52, 0xCE, 0xB5, 0xCA, 0x53, /* 0x18-0x1B */ + 0xCA, 0x54, 0xCA, 0x55, 0xCA, 0x56, 0xDD, 0xFD, /* 0x1C-0x1F */ + 0xCA, 0x57, 0xB2, 0xCC, 0xCA, 0x58, 0xCA, 0x59, /* 0x20-0x23 */ + 0xCA, 0x5A, 0xCA, 0x5B, 0xCA, 0x5C, 0xCA, 0x5D, /* 0x24-0x27 */ + 0xCA, 0x5E, 0xCA, 0x5F, 0xCA, 0x60, 0xC4, 0xE8, /* 0x28-0x2B */ + 0xCA, 0xDF, 0xCA, 0x61, 0xCA, 0x62, 0xCA, 0x63, /* 0x2C-0x2F */ + 0xCA, 0x64, 0xCA, 0x65, 0xCA, 0x66, 0xCA, 0x67, /* 0x30-0x33 */ + 0xCA, 0x68, 0xCA, 0x69, 0xCA, 0x6A, 0xC7, 0xBE, /* 0x34-0x37 */ + 0xDD, 0xFA, 0xDD, 0xFC, 0xDD, 0xFE, 0xDE, 0xA2, /* 0x38-0x3B */ + 0xB0, 0xAA, 0xB1, 0xCE, 0xCA, 0x6B, 0xCA, 0x6C, /* 0x3C-0x3F */ + 0xCA, 0x6D, 0xCA, 0x6E, 0xCA, 0x6F, 0xDE, 0xAC, /* 0x40-0x43 */ + 0xCA, 0x70, 0xCA, 0x71, 0xCA, 0x72, 0xCA, 0x73, /* 0x44-0x47 */ + 0xDE, 0xA6, 0xBD, 0xB6, 0xC8, 0xEF, 0xCA, 0x74, /* 0x48-0x4B */ + 0xCA, 0x75, 0xCA, 0x76, 0xCA, 0x77, 0xCA, 0x78, /* 0x4C-0x4F */ + 0xCA, 0x79, 0xCA, 0x7A, 0xCA, 0x7B, 0xCA, 0x7C, /* 0x50-0x53 */ + 0xCA, 0x7D, 0xCA, 0x7E, 0xDE, 0xA1, 0xCA, 0x80, /* 0x54-0x57 */ + 0xCA, 0x81, 0xDE, 0xA5, 0xCA, 0x82, 0xCA, 0x83, /* 0x58-0x5B */ + 0xCA, 0x84, 0xCA, 0x85, 0xDE, 0xA9, 0xCA, 0x86, /* 0x5C-0x5F */ + 0xCA, 0x87, 0xCA, 0x88, 0xCA, 0x89, 0xCA, 0x8A, /* 0x60-0x63 */ + 0xDE, 0xA8, 0xCA, 0x8B, 0xCA, 0x8C, 0xCA, 0x8D, /* 0x64-0x67 */ + 0xDE, 0xA7, 0xCA, 0x8E, 0xCA, 0x8F, 0xCA, 0x90, /* 0x68-0x6B */ + 0xCA, 0x91, 0xCA, 0x92, 0xCA, 0x93, 0xCA, 0x94, /* 0x6C-0x6F */ + 0xCA, 0x95, 0xCA, 0x96, 0xDE, 0xAD, 0xCA, 0x97, /* 0x70-0x73 */ + 0xD4, 0xCC, 0xCA, 0x98, 0xCA, 0x99, 0xCA, 0x9A, /* 0x74-0x77 */ + 0xCA, 0x9B, 0xDE, 0xB3, 0xDE, 0xAA, 0xDE, 0xAE, /* 0x78-0x7B */ + 0xCA, 0x9C, 0xCA, 0x9D, 0xC0, 0xD9, 0xCA, 0x9E, /* 0x7C-0x7F */ + + 0xCA, 0x9F, 0xCA, 0xA0, 0xCB, 0x40, 0xCB, 0x41, /* 0x80-0x83 */ + 0xB1, 0xA1, 0xDE, 0xB6, 0xCB, 0x42, 0xDE, 0xB1, /* 0x84-0x87 */ + 0xCB, 0x43, 0xCB, 0x44, 0xCB, 0x45, 0xCB, 0x46, /* 0x88-0x8B */ + 0xCB, 0x47, 0xCB, 0x48, 0xCB, 0x49, 0xDE, 0xB2, /* 0x8C-0x8F */ + 0xCB, 0x4A, 0xCB, 0x4B, 0xCB, 0x4C, 0xCB, 0x4D, /* 0x90-0x93 */ + 0xCB, 0x4E, 0xCB, 0x4F, 0xCB, 0x50, 0xCB, 0x51, /* 0x94-0x97 */ + 0xCB, 0x52, 0xCB, 0x53, 0xCB, 0x54, 0xD1, 0xA6, /* 0x98-0x9B */ + 0xDE, 0xB5, 0xCB, 0x55, 0xCB, 0x56, 0xCB, 0x57, /* 0x9C-0x9F */ + 0xCB, 0x58, 0xCB, 0x59, 0xCB, 0x5A, 0xCB, 0x5B, /* 0xA0-0xA3 */ + 0xDE, 0xAF, 0xCB, 0x5C, 0xCB, 0x5D, 0xCB, 0x5E, /* 0xA4-0xA7 */ + 0xDE, 0xB0, 0xCB, 0x5F, 0xD0, 0xBD, 0xCB, 0x60, /* 0xA8-0xAB */ + 0xCB, 0x61, 0xCB, 0x62, 0xDE, 0xB4, 0xCA, 0xED, /* 0xAC-0xAF */ + 0xDE, 0xB9, 0xCB, 0x63, 0xCB, 0x64, 0xCB, 0x65, /* 0xB0-0xB3 */ + 0xCB, 0x66, 0xCB, 0x67, 0xCB, 0x68, 0xDE, 0xB8, /* 0xB4-0xB7 */ + 0xCB, 0x69, 0xDE, 0xB7, 0xCB, 0x6A, 0xCB, 0x6B, /* 0xB8-0xBB */ + 0xCB, 0x6C, 0xCB, 0x6D, 0xCB, 0x6E, 0xCB, 0x6F, /* 0xBC-0xBF */ + 0xCB, 0x70, 0xDE, 0xBB, 0xCB, 0x71, 0xCB, 0x72, /* 0xC0-0xC3 */ + 0xCB, 0x73, 0xCB, 0x74, 0xCB, 0x75, 0xCB, 0x76, /* 0xC4-0xC7 */ + 0xCB, 0x77, 0xBD, 0xE5, 0xCB, 0x78, 0xCB, 0x79, /* 0xC8-0xCB */ + 0xCB, 0x7A, 0xCB, 0x7B, 0xCB, 0x7C, 0xB2, 0xD8, /* 0xCC-0xCF */ + 0xC3, 0xEA, 0xCB, 0x7D, 0xCB, 0x7E, 0xDE, 0xBA, /* 0xD0-0xD3 */ + 0xCB, 0x80, 0xC5, 0xBA, 0xCB, 0x81, 0xCB, 0x82, /* 0xD4-0xD7 */ + 0xCB, 0x83, 0xCB, 0x84, 0xCB, 0x85, 0xCB, 0x86, /* 0xD8-0xDB */ + 0xDE, 0xBC, 0xCB, 0x87, 0xCB, 0x88, 0xCB, 0x89, /* 0xDC-0xDF */ + 0xCB, 0x8A, 0xCB, 0x8B, 0xCB, 0x8C, 0xCB, 0x8D, /* 0xE0-0xE3 */ + 0xCC, 0xD9, 0xCB, 0x8E, 0xCB, 0x8F, 0xCB, 0x90, /* 0xE4-0xE7 */ + 0xCB, 0x91, 0xB7, 0xAA, 0xCB, 0x92, 0xCB, 0x93, /* 0xE8-0xEB */ + 0xCB, 0x94, 0xCB, 0x95, 0xCB, 0x96, 0xCB, 0x97, /* 0xEC-0xEF */ + 0xCB, 0x98, 0xCB, 0x99, 0xCB, 0x9A, 0xCB, 0x9B, /* 0xF0-0xF3 */ + 0xCB, 0x9C, 0xCB, 0x9D, 0xCB, 0x9E, 0xCB, 0x9F, /* 0xF4-0xF7 */ + 0xCB, 0xA0, 0xCC, 0x40, 0xCC, 0x41, 0xD4, 0xE5, /* 0xF8-0xFB */ + 0xCC, 0x42, 0xCC, 0x43, 0xCC, 0x44, 0xDE, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0xCC, 0x45, 0xCC, 0x46, 0xCC, 0x47, 0xCC, 0x48, /* 0x00-0x03 */ + 0xCC, 0x49, 0xDE, 0xBF, 0xCC, 0x4A, 0xCC, 0x4B, /* 0x04-0x07 */ + 0xCC, 0x4C, 0xCC, 0x4D, 0xCC, 0x4E, 0xCC, 0x4F, /* 0x08-0x0B */ + 0xCC, 0x50, 0xCC, 0x51, 0xCC, 0x52, 0xCC, 0x53, /* 0x0C-0x0F */ + 0xCC, 0x54, 0xC4, 0xA2, 0xCC, 0x55, 0xCC, 0x56, /* 0x10-0x13 */ + 0xCC, 0x57, 0xCC, 0x58, 0xDE, 0xC1, 0xCC, 0x59, /* 0x14-0x17 */ + 0xCC, 0x5A, 0xCC, 0x5B, 0xCC, 0x5C, 0xCC, 0x5D, /* 0x18-0x1B */ + 0xCC, 0x5E, 0xCC, 0x5F, 0xCC, 0x60, 0xCC, 0x61, /* 0x1C-0x1F */ + 0xCC, 0x62, 0xCC, 0x63, 0xCC, 0x64, 0xCC, 0x65, /* 0x20-0x23 */ + 0xCC, 0x66, 0xCC, 0x67, 0xCC, 0x68, 0xDE, 0xBE, /* 0x24-0x27 */ + 0xCC, 0x69, 0xDE, 0xC0, 0xCC, 0x6A, 0xCC, 0x6B, /* 0x28-0x2B */ + 0xCC, 0x6C, 0xCC, 0x6D, 0xCC, 0x6E, 0xCC, 0x6F, /* 0x2C-0x2F */ + 0xCC, 0x70, 0xCC, 0x71, 0xCC, 0x72, 0xCC, 0x73, /* 0x30-0x33 */ + 0xCC, 0x74, 0xCC, 0x75, 0xCC, 0x76, 0xCC, 0x77, /* 0x34-0x37 */ + 0xD5, 0xBA, 0xCC, 0x78, 0xCC, 0x79, 0xCC, 0x7A, /* 0x38-0x3B */ + 0xDE, 0xC2, 0xCC, 0x7B, 0xCC, 0x7C, 0xCC, 0x7D, /* 0x3C-0x3F */ + 0xCC, 0x7E, 0xCC, 0x80, 0xCC, 0x81, 0xCC, 0x82, /* 0x40-0x43 */ + 0xCC, 0x83, 0xCC, 0x84, 0xCC, 0x85, 0xCC, 0x86, /* 0x44-0x47 */ + 0xCC, 0x87, 0xCC, 0x88, 0xCC, 0x89, 0xCC, 0x8A, /* 0x48-0x4B */ + 0xCC, 0x8B, 0xF2, 0xAE, 0xBB, 0xA2, 0xC2, 0xB2, /* 0x4C-0x4F */ + 0xC5, 0xB0, 0xC2, 0xC7, 0xCC, 0x8C, 0xCC, 0x8D, /* 0x50-0x53 */ + 0xF2, 0xAF, 0xCC, 0x8E, 0xCC, 0x8F, 0xCC, 0x90, /* 0x54-0x57 */ + 0xCC, 0x91, 0xCC, 0x92, 0xD0, 0xE9, 0xCC, 0x93, /* 0x58-0x5B */ + 0xCC, 0x94, 0xCC, 0x95, 0xD3, 0xDD, 0xCC, 0x96, /* 0x5C-0x5F */ + 0xCC, 0x97, 0xCC, 0x98, 0xEB, 0xBD, 0xCC, 0x99, /* 0x60-0x63 */ + 0xCC, 0x9A, 0xCC, 0x9B, 0xCC, 0x9C, 0xCC, 0x9D, /* 0x64-0x67 */ + 0xCC, 0x9E, 0xCC, 0x9F, 0xCC, 0xA0, 0xB3, 0xE6, /* 0x68-0x6B */ + 0xF2, 0xB0, 0xCD, 0x40, 0xF2, 0xB1, 0xCD, 0x41, /* 0x6C-0x6F */ + 0xCD, 0x42, 0xCA, 0xAD, 0xCD, 0x43, 0xCD, 0x44, /* 0x70-0x73 */ + 0xCD, 0x45, 0xCD, 0x46, 0xCD, 0x47, 0xCD, 0x48, /* 0x74-0x77 */ + 0xCD, 0x49, 0xBA, 0xE7, 0xF2, 0xB3, 0xF2, 0xB5, /* 0x78-0x7B */ + 0xF2, 0xB4, 0xCB, 0xE4, 0xCF, 0xBA, 0xF2, 0xB2, /* 0x7C-0x7F */ + + 0xCA, 0xB4, 0xD2, 0xCF, 0xC2, 0xEC, 0xCD, 0x4A, /* 0x80-0x83 */ + 0xCD, 0x4B, 0xCD, 0x4C, 0xCD, 0x4D, 0xCD, 0x4E, /* 0x84-0x87 */ + 0xCD, 0x4F, 0xCD, 0x50, 0xCE, 0xC3, 0xF2, 0xB8, /* 0x88-0x8B */ + 0xB0, 0xF6, 0xF2, 0xB7, 0xCD, 0x51, 0xCD, 0x52, /* 0x8C-0x8F */ + 0xCD, 0x53, 0xCD, 0x54, 0xCD, 0x55, 0xF2, 0xBE, /* 0x90-0x93 */ + 0xCD, 0x56, 0xB2, 0xCF, 0xCD, 0x57, 0xCD, 0x58, /* 0x94-0x97 */ + 0xCD, 0x59, 0xCD, 0x5A, 0xCD, 0x5B, 0xCD, 0x5C, /* 0x98-0x9B */ + 0xD1, 0xC1, 0xF2, 0xBA, 0xCD, 0x5D, 0xCD, 0x5E, /* 0x9C-0x9F */ + 0xCD, 0x5F, 0xCD, 0x60, 0xCD, 0x61, 0xF2, 0xBC, /* 0xA0-0xA3 */ + 0xD4, 0xE9, 0xCD, 0x62, 0xCD, 0x63, 0xF2, 0xBB, /* 0xA4-0xA7 */ + 0xF2, 0xB6, 0xF2, 0xBF, 0xF2, 0xBD, 0xCD, 0x64, /* 0xA8-0xAB */ + 0xF2, 0xB9, 0xCD, 0x65, 0xCD, 0x66, 0xF2, 0xC7, /* 0xAC-0xAF */ + 0xF2, 0xC4, 0xF2, 0xC6, 0xCD, 0x67, 0xCD, 0x68, /* 0xB0-0xB3 */ + 0xF2, 0xCA, 0xF2, 0xC2, 0xF2, 0xC0, 0xCD, 0x69, /* 0xB4-0xB7 */ + 0xCD, 0x6A, 0xCD, 0x6B, 0xF2, 0xC5, 0xCD, 0x6C, /* 0xB8-0xBB */ + 0xCD, 0x6D, 0xCD, 0x6E, 0xCD, 0x6F, 0xCD, 0x70, /* 0xBC-0xBF */ + 0xD6, 0xFB, 0xCD, 0x71, 0xCD, 0x72, 0xCD, 0x73, /* 0xC0-0xC3 */ + 0xF2, 0xC1, 0xCD, 0x74, 0xC7, 0xF9, 0xC9, 0xDF, /* 0xC4-0xC7 */ + 0xCD, 0x75, 0xF2, 0xC8, 0xB9, 0xC6, 0xB5, 0xB0, /* 0xC8-0xCB */ + 0xCD, 0x76, 0xCD, 0x77, 0xF2, 0xC3, 0xF2, 0xC9, /* 0xCC-0xCF */ + 0xF2, 0xD0, 0xF2, 0xD6, 0xCD, 0x78, 0xCD, 0x79, /* 0xD0-0xD3 */ + 0xBB, 0xD7, 0xCD, 0x7A, 0xCD, 0x7B, 0xCD, 0x7C, /* 0xD4-0xD7 */ + 0xF2, 0xD5, 0xCD, 0xDC, 0xCD, 0x7D, 0xD6, 0xEB, /* 0xD8-0xDB */ + 0xCD, 0x7E, 0xCD, 0x80, 0xF2, 0xD2, 0xF2, 0xD4, /* 0xDC-0xDF */ + 0xCD, 0x81, 0xCD, 0x82, 0xCD, 0x83, 0xCD, 0x84, /* 0xE0-0xE3 */ + 0xB8, 0xF2, 0xCD, 0x85, 0xCD, 0x86, 0xCD, 0x87, /* 0xE4-0xE7 */ + 0xCD, 0x88, 0xF2, 0xCB, 0xCD, 0x89, 0xCD, 0x8A, /* 0xE8-0xEB */ + 0xCD, 0x8B, 0xF2, 0xCE, 0xC2, 0xF9, 0xCD, 0x8C, /* 0xEC-0xEF */ + 0xD5, 0xDD, 0xF2, 0xCC, 0xF2, 0xCD, 0xF2, 0xCF, /* 0xF0-0xF3 */ + 0xF2, 0xD3, 0xCD, 0x8D, 0xCD, 0x8E, 0xCD, 0x8F, /* 0xF4-0xF7 */ + 0xF2, 0xD9, 0xD3, 0xBC, 0xCD, 0x90, 0xCD, 0x91, /* 0xF8-0xFB */ + 0xCD, 0x92, 0xCD, 0x93, 0xB6, 0xEA, 0xCD, 0x94, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xCA, 0xF1, 0xCD, 0x95, 0xB7, 0xE4, 0xF2, 0xD7, /* 0x00-0x03 */ + 0xCD, 0x96, 0xCD, 0x97, 0xCD, 0x98, 0xF2, 0xD8, /* 0x04-0x07 */ + 0xF2, 0xDA, 0xF2, 0xDD, 0xF2, 0xDB, 0xCD, 0x99, /* 0x08-0x0B */ + 0xCD, 0x9A, 0xF2, 0xDC, 0xCD, 0x9B, 0xCD, 0x9C, /* 0x0C-0x0F */ + 0xCD, 0x9D, 0xCD, 0x9E, 0xD1, 0xD1, 0xF2, 0xD1, /* 0x10-0x13 */ + 0xCD, 0x9F, 0xCD, 0xC9, 0xCD, 0xA0, 0xCE, 0xCF, /* 0x14-0x17 */ + 0xD6, 0xA9, 0xCE, 0x40, 0xF2, 0xE3, 0xCE, 0x41, /* 0x18-0x1B */ + 0xC3, 0xDB, 0xCE, 0x42, 0xF2, 0xE0, 0xCE, 0x43, /* 0x1C-0x1F */ + 0xCE, 0x44, 0xC0, 0xAF, 0xF2, 0xEC, 0xF2, 0xDE, /* 0x20-0x23 */ + 0xCE, 0x45, 0xF2, 0xE1, 0xCE, 0x46, 0xCE, 0x47, /* 0x24-0x27 */ + 0xCE, 0x48, 0xF2, 0xE8, 0xCE, 0x49, 0xCE, 0x4A, /* 0x28-0x2B */ + 0xCE, 0x4B, 0xCE, 0x4C, 0xF2, 0xE2, 0xCE, 0x4D, /* 0x2C-0x2F */ + 0xCE, 0x4E, 0xF2, 0xE7, 0xCE, 0x4F, 0xCE, 0x50, /* 0x30-0x33 */ + 0xF2, 0xE6, 0xCE, 0x51, 0xCE, 0x52, 0xF2, 0xE9, /* 0x34-0x37 */ + 0xCE, 0x53, 0xCE, 0x54, 0xCE, 0x55, 0xF2, 0xDF, /* 0x38-0x3B */ + 0xCE, 0x56, 0xCE, 0x57, 0xF2, 0xE4, 0xF2, 0xEA, /* 0x3C-0x3F */ + 0xCE, 0x58, 0xCE, 0x59, 0xCE, 0x5A, 0xCE, 0x5B, /* 0x40-0x43 */ + 0xCE, 0x5C, 0xCE, 0x5D, 0xCE, 0x5E, 0xD3, 0xAC, /* 0x44-0x47 */ + 0xF2, 0xE5, 0xB2, 0xF5, 0xCE, 0x5F, 0xCE, 0x60, /* 0x48-0x4B */ + 0xF2, 0xF2, 0xCE, 0x61, 0xD0, 0xAB, 0xCE, 0x62, /* 0x4C-0x4F */ + 0xCE, 0x63, 0xCE, 0x64, 0xCE, 0x65, 0xF2, 0xF5, /* 0x50-0x53 */ + 0xCE, 0x66, 0xCE, 0x67, 0xCE, 0x68, 0xBB, 0xC8, /* 0x54-0x57 */ + 0xCE, 0x69, 0xF2, 0xF9, 0xCE, 0x6A, 0xCE, 0x6B, /* 0x58-0x5B */ + 0xCE, 0x6C, 0xCE, 0x6D, 0xCE, 0x6E, 0xCE, 0x6F, /* 0x5C-0x5F */ + 0xF2, 0xF0, 0xCE, 0x70, 0xCE, 0x71, 0xF2, 0xF6, /* 0x60-0x63 */ + 0xF2, 0xF8, 0xF2, 0xFA, 0xCE, 0x72, 0xCE, 0x73, /* 0x64-0x67 */ + 0xCE, 0x74, 0xCE, 0x75, 0xCE, 0x76, 0xCE, 0x77, /* 0x68-0x6B */ + 0xCE, 0x78, 0xCE, 0x79, 0xF2, 0xF3, 0xCE, 0x7A, /* 0x6C-0x6F */ + 0xF2, 0xF1, 0xCE, 0x7B, 0xCE, 0x7C, 0xCE, 0x7D, /* 0x70-0x73 */ + 0xBA, 0xFB, 0xCE, 0x7E, 0xB5, 0xFB, 0xCE, 0x80, /* 0x74-0x77 */ + 0xCE, 0x81, 0xCE, 0x82, 0xCE, 0x83, 0xF2, 0xEF, /* 0x78-0x7B */ + 0xF2, 0xF7, 0xF2, 0xED, 0xF2, 0xEE, 0xCE, 0x84, /* 0x7C-0x7F */ + + 0xCE, 0x85, 0xCE, 0x86, 0xF2, 0xEB, 0xF3, 0xA6, /* 0x80-0x83 */ + 0xCE, 0x87, 0xF3, 0xA3, 0xCE, 0x88, 0xCE, 0x89, /* 0x84-0x87 */ + 0xF3, 0xA2, 0xCE, 0x8A, 0xCE, 0x8B, 0xF2, 0xF4, /* 0x88-0x8B */ + 0xCE, 0x8C, 0xC8, 0xDA, 0xCE, 0x8D, 0xCE, 0x8E, /* 0x8C-0x8F */ + 0xCE, 0x8F, 0xCE, 0x90, 0xCE, 0x91, 0xF2, 0xFB, /* 0x90-0x93 */ + 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xF3, 0xA5, /* 0x94-0x97 */ + 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, /* 0x98-0x9B */ + 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xC3, 0xF8, /* 0x9C-0x9F */ + 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, /* 0xA0-0xA3 */ + 0xCE, 0xA0, 0xCF, 0x40, 0xCF, 0x41, 0xCF, 0x42, /* 0xA4-0xA7 */ + 0xF2, 0xFD, 0xCF, 0x43, 0xCF, 0x44, 0xF3, 0xA7, /* 0xA8-0xAB */ + 0xF3, 0xA9, 0xF3, 0xA4, 0xCF, 0x45, 0xF2, 0xFC, /* 0xAC-0xAF */ + 0xCF, 0x46, 0xCF, 0x47, 0xCF, 0x48, 0xF3, 0xAB, /* 0xB0-0xB3 */ + 0xCF, 0x49, 0xF3, 0xAA, 0xCF, 0x4A, 0xCF, 0x4B, /* 0xB4-0xB7 */ + 0xCF, 0x4C, 0xCF, 0x4D, 0xC2, 0xDD, 0xCF, 0x4E, /* 0xB8-0xBB */ + 0xCF, 0x4F, 0xF3, 0xAE, 0xCF, 0x50, 0xCF, 0x51, /* 0xBC-0xBF */ + 0xF3, 0xB0, 0xCF, 0x52, 0xCF, 0x53, 0xCF, 0x54, /* 0xC0-0xC3 */ + 0xCF, 0x55, 0xCF, 0x56, 0xF3, 0xA1, 0xCF, 0x57, /* 0xC4-0xC7 */ + 0xCF, 0x58, 0xCF, 0x59, 0xF3, 0xB1, 0xF3, 0xAC, /* 0xC8-0xCB */ + 0xCF, 0x5A, 0xCF, 0x5B, 0xCF, 0x5C, 0xCF, 0x5D, /* 0xCC-0xCF */ + 0xCF, 0x5E, 0xF3, 0xAF, 0xF2, 0xFE, 0xF3, 0xAD, /* 0xD0-0xD3 */ + 0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x61, 0xCF, 0x62, /* 0xD4-0xD7 */ + 0xCF, 0x63, 0xCF, 0x64, 0xCF, 0x65, 0xF3, 0xB2, /* 0xD8-0xDB */ + 0xCF, 0x66, 0xCF, 0x67, 0xCF, 0x68, 0xCF, 0x69, /* 0xDC-0xDF */ + 0xF3, 0xB4, 0xCF, 0x6A, 0xCF, 0x6B, 0xCF, 0x6C, /* 0xE0-0xE3 */ + 0xCF, 0x6D, 0xF3, 0xA8, 0xCF, 0x6E, 0xCF, 0x6F, /* 0xE4-0xE7 */ + 0xCF, 0x70, 0xCF, 0x71, 0xF3, 0xB3, 0xCF, 0x72, /* 0xE8-0xEB */ + 0xCF, 0x73, 0xCF, 0x74, 0xF3, 0xB5, 0xCF, 0x75, /* 0xEC-0xEF */ + 0xCF, 0x76, 0xCF, 0x77, 0xCF, 0x78, 0xCF, 0x79, /* 0xF0-0xF3 */ + 0xCF, 0x7A, 0xCF, 0x7B, 0xCF, 0x7C, 0xCF, 0x7D, /* 0xF4-0xF7 */ + 0xCF, 0x7E, 0xD0, 0xB7, 0xCF, 0x80, 0xCF, 0x81, /* 0xF8-0xFB */ + 0xCF, 0x82, 0xCF, 0x83, 0xF3, 0xB8, 0xCF, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xD9, 0xF9, /* 0x00-0x03 */ + 0xCF, 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, /* 0x04-0x07 */ + 0xCF, 0x8C, 0xCF, 0x8D, 0xF3, 0xB9, 0xCF, 0x8E, /* 0x08-0x0B */ + 0xCF, 0x8F, 0xCF, 0x90, 0xCF, 0x91, 0xCF, 0x92, /* 0x0C-0x0F */ + 0xCF, 0x93, 0xCF, 0x94, 0xCF, 0x95, 0xF3, 0xB7, /* 0x10-0x13 */ + 0xCF, 0x96, 0xC8, 0xE4, 0xF3, 0xB6, 0xCF, 0x97, /* 0x14-0x17 */ + 0xCF, 0x98, 0xCF, 0x99, 0xCF, 0x9A, 0xF3, 0xBA, /* 0x18-0x1B */ + 0xCF, 0x9B, 0xCF, 0x9C, 0xCF, 0x9D, 0xCF, 0x9E, /* 0x1C-0x1F */ + 0xCF, 0x9F, 0xF3, 0xBB, 0xB4, 0xC0, 0xCF, 0xA0, /* 0x20-0x23 */ + 0xD0, 0x40, 0xD0, 0x41, 0xD0, 0x42, 0xD0, 0x43, /* 0x24-0x27 */ + 0xD0, 0x44, 0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x47, /* 0x28-0x2B */ + 0xD0, 0x48, 0xD0, 0x49, 0xD0, 0x4A, 0xD0, 0x4B, /* 0x2C-0x2F */ + 0xD0, 0x4C, 0xD0, 0x4D, 0xEE, 0xC3, 0xD0, 0x4E, /* 0x30-0x33 */ + 0xD0, 0x4F, 0xD0, 0x50, 0xD0, 0x51, 0xD0, 0x52, /* 0x34-0x37 */ + 0xD0, 0x53, 0xF3, 0xBC, 0xD0, 0x54, 0xD0, 0x55, /* 0x38-0x3B */ + 0xF3, 0xBD, 0xD0, 0x56, 0xD0, 0x57, 0xD0, 0x58, /* 0x3C-0x3F */ + 0xD1, 0xAA, 0xD0, 0x59, 0xD0, 0x5A, 0xD0, 0x5B, /* 0x40-0x43 */ + 0xF4, 0xAC, 0xD0, 0xC6, 0xD0, 0x5C, 0xD0, 0x5D, /* 0x44-0x47 */ + 0xD0, 0x5E, 0xD0, 0x5F, 0xD0, 0x60, 0xD0, 0x61, /* 0x48-0x4B */ + 0xD0, 0xD0, 0xD1, 0xDC, 0xD0, 0x62, 0xD0, 0x63, /* 0x4C-0x4F */ + 0xD0, 0x64, 0xD0, 0x65, 0xD0, 0x66, 0xD0, 0x67, /* 0x50-0x53 */ + 0xCF, 0xCE, 0xD0, 0x68, 0xD0, 0x69, 0xBD, 0xD6, /* 0x54-0x57 */ + 0xD0, 0x6A, 0xD1, 0xC3, 0xD0, 0x6B, 0xD0, 0x6C, /* 0x58-0x5B */ + 0xD0, 0x6D, 0xD0, 0x6E, 0xD0, 0x6F, 0xD0, 0x70, /* 0x5C-0x5F */ + 0xD0, 0x71, 0xBA, 0xE2, 0xE1, 0xE9, 0xD2, 0xC2, /* 0x60-0x63 */ + 0xF1, 0xC2, 0xB2, 0xB9, 0xD0, 0x72, 0xD0, 0x73, /* 0x64-0x67 */ + 0xB1, 0xED, 0xF1, 0xC3, 0xD0, 0x74, 0xC9, 0xC0, /* 0x68-0x6B */ + 0xB3, 0xC4, 0xD0, 0x75, 0xD9, 0xF2, 0xD0, 0x76, /* 0x6C-0x6F */ + 0xCB, 0xA5, 0xD0, 0x77, 0xF1, 0xC4, 0xD0, 0x78, /* 0x70-0x73 */ + 0xD0, 0x79, 0xD0, 0x7A, 0xD0, 0x7B, 0xD6, 0xD4, /* 0x74-0x77 */ + 0xD0, 0x7C, 0xD0, 0x7D, 0xD0, 0x7E, 0xD0, 0x80, /* 0x78-0x7B */ + 0xD0, 0x81, 0xF1, 0xC5, 0xF4, 0xC0, 0xF1, 0xC6, /* 0x7C-0x7F */ + + 0xD0, 0x82, 0xD4, 0xAC, 0xF1, 0xC7, 0xD0, 0x83, /* 0x80-0x83 */ + 0xB0, 0xC0, 0xF4, 0xC1, 0xD0, 0x84, 0xD0, 0x85, /* 0x84-0x87 */ + 0xF4, 0xC2, 0xD0, 0x86, 0xD0, 0x87, 0xB4, 0xFC, /* 0x88-0x8B */ + 0xD0, 0x88, 0xC5, 0xDB, 0xD0, 0x89, 0xD0, 0x8A, /* 0x8C-0x8F */ + 0xD0, 0x8B, 0xD0, 0x8C, 0xCC, 0xBB, 0xD0, 0x8D, /* 0x90-0x93 */ + 0xD0, 0x8E, 0xD0, 0x8F, 0xD0, 0xE4, 0xD0, 0x90, /* 0x94-0x97 */ + 0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, /* 0x98-0x9B */ + 0xCD, 0xE0, 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, /* 0x9C-0x9F */ + 0xD0, 0x98, 0xD0, 0x99, 0xF1, 0xC8, 0xD0, 0x9A, /* 0xA0-0xA3 */ + 0xD9, 0xF3, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D, /* 0xA4-0xA7 */ + 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xB1, 0xBB, /* 0xA8-0xAB */ + 0xD1, 0x40, 0xCF, 0xAE, 0xD1, 0x41, 0xD1, 0x42, /* 0xAC-0xAF */ + 0xD1, 0x43, 0xB8, 0xA4, 0xD1, 0x44, 0xD1, 0x45, /* 0xB0-0xB3 */ + 0xD1, 0x46, 0xD1, 0x47, 0xD1, 0x48, 0xF1, 0xCA, /* 0xB4-0xB7 */ + 0xD1, 0x49, 0xD1, 0x4A, 0xD1, 0x4B, 0xD1, 0x4C, /* 0xB8-0xBB */ + 0xF1, 0xCB, 0xD1, 0x4D, 0xD1, 0x4E, 0xD1, 0x4F, /* 0xBC-0xBF */ + 0xD1, 0x50, 0xB2, 0xC3, 0xC1, 0xD1, 0xD1, 0x51, /* 0xC0-0xC3 */ + 0xD1, 0x52, 0xD7, 0xB0, 0xF1, 0xC9, 0xD1, 0x53, /* 0xC4-0xC7 */ + 0xD1, 0x54, 0xF1, 0xCC, 0xD1, 0x55, 0xD1, 0x56, /* 0xC8-0xCB */ + 0xD1, 0x57, 0xD1, 0x58, 0xF1, 0xCE, 0xD1, 0x59, /* 0xCC-0xCF */ + 0xD1, 0x5A, 0xD1, 0x5B, 0xD9, 0xF6, 0xD1, 0x5C, /* 0xD0-0xD3 */ + 0xD2, 0xE1, 0xD4, 0xA3, 0xD1, 0x5D, 0xD1, 0x5E, /* 0xD4-0xD7 */ + 0xF4, 0xC3, 0xC8, 0xB9, 0xD1, 0x5F, 0xD1, 0x60, /* 0xD8-0xDB */ + 0xD1, 0x61, 0xD1, 0x62, 0xD1, 0x63, 0xF4, 0xC4, /* 0xDC-0xDF */ + 0xD1, 0x64, 0xD1, 0x65, 0xF1, 0xCD, 0xF1, 0xCF, /* 0xE0-0xE3 */ + 0xBF, 0xE3, 0xF1, 0xD0, 0xD1, 0x66, 0xD1, 0x67, /* 0xE4-0xE7 */ + 0xF1, 0xD4, 0xD1, 0x68, 0xD1, 0x69, 0xD1, 0x6A, /* 0xE8-0xEB */ + 0xD1, 0x6B, 0xD1, 0x6C, 0xD1, 0x6D, 0xD1, 0x6E, /* 0xEC-0xEF */ + 0xF1, 0xD6, 0xF1, 0xD1, 0xD1, 0x6F, 0xC9, 0xD1, /* 0xF0-0xF3 */ + 0xC5, 0xE1, 0xD1, 0x70, 0xD1, 0x71, 0xD1, 0x72, /* 0xF4-0xF7 */ + 0xC2, 0xE3, 0xB9, 0xFC, 0xD1, 0x73, 0xD1, 0x74, /* 0xF8-0xFB */ + 0xF1, 0xD3, 0xD1, 0x75, 0xF1, 0xD5, 0xD1, 0x76, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0xD1, 0x77, 0xD1, 0x78, 0xB9, 0xD3, 0xD1, 0x79, /* 0x00-0x03 */ + 0xD1, 0x7A, 0xD1, 0x7B, 0xD1, 0x7C, 0xD1, 0x7D, /* 0x04-0x07 */ + 0xD1, 0x7E, 0xD1, 0x80, 0xF1, 0xDB, 0xD1, 0x81, /* 0x08-0x0B */ + 0xD1, 0x82, 0xD1, 0x83, 0xD1, 0x84, 0xD1, 0x85, /* 0x0C-0x0F */ + 0xBA, 0xD6, 0xD1, 0x86, 0xB0, 0xFD, 0xF1, 0xD9, /* 0x10-0x13 */ + 0xD1, 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, /* 0x14-0x17 */ + 0xD1, 0x8B, 0xF1, 0xD8, 0xF1, 0xD2, 0xF1, 0xDA, /* 0x18-0x1B */ + 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, 0x8F, /* 0x1C-0x1F */ + 0xD1, 0x90, 0xF1, 0xD7, 0xD1, 0x91, 0xD1, 0x92, /* 0x20-0x23 */ + 0xD1, 0x93, 0xC8, 0xEC, 0xD1, 0x94, 0xD1, 0x95, /* 0x24-0x27 */ + 0xD1, 0x96, 0xD1, 0x97, 0xCD, 0xCA, 0xF1, 0xDD, /* 0x28-0x2B */ + 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, 0x9B, /* 0x2C-0x2F */ + 0xE5, 0xBD, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, /* 0x30-0x33 */ + 0xF1, 0xDC, 0xD1, 0x9F, 0xF1, 0xDE, 0xD1, 0xA0, /* 0x34-0x37 */ + 0xD2, 0x40, 0xD2, 0x41, 0xD2, 0x42, 0xD2, 0x43, /* 0x38-0x3B */ + 0xD2, 0x44, 0xD2, 0x45, 0xD2, 0x46, 0xD2, 0x47, /* 0x3C-0x3F */ + 0xD2, 0x48, 0xF1, 0xDF, 0xD2, 0x49, 0xD2, 0x4A, /* 0x40-0x43 */ + 0xCF, 0xE5, 0xD2, 0x4B, 0xD2, 0x4C, 0xD2, 0x4D, /* 0x44-0x47 */ + 0xD2, 0x4E, 0xD2, 0x4F, 0xD2, 0x50, 0xD2, 0x51, /* 0x48-0x4B */ + 0xD2, 0x52, 0xD2, 0x53, 0xD2, 0x54, 0xD2, 0x55, /* 0x4C-0x4F */ + 0xD2, 0x56, 0xD2, 0x57, 0xD2, 0x58, 0xD2, 0x59, /* 0x50-0x53 */ + 0xD2, 0x5A, 0xD2, 0x5B, 0xD2, 0x5C, 0xD2, 0x5D, /* 0x54-0x57 */ + 0xD2, 0x5E, 0xD2, 0x5F, 0xD2, 0x60, 0xD2, 0x61, /* 0x58-0x5B */ + 0xD2, 0x62, 0xD2, 0x63, 0xF4, 0xC5, 0xBD, 0xF3, /* 0x5C-0x5F */ + 0xD2, 0x64, 0xD2, 0x65, 0xD2, 0x66, 0xD2, 0x67, /* 0x60-0x63 */ + 0xD2, 0x68, 0xD2, 0x69, 0xF1, 0xE0, 0xD2, 0x6A, /* 0x64-0x67 */ + 0xD2, 0x6B, 0xD2, 0x6C, 0xD2, 0x6D, 0xD2, 0x6E, /* 0x68-0x6B */ + 0xD2, 0x6F, 0xD2, 0x70, 0xD2, 0x71, 0xD2, 0x72, /* 0x6C-0x6F */ + 0xD2, 0x73, 0xD2, 0x74, 0xD2, 0x75, 0xD2, 0x76, /* 0x70-0x73 */ + 0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, 0xD2, 0x7A, /* 0x74-0x77 */ + 0xD2, 0x7B, 0xD2, 0x7C, 0xD2, 0x7D, 0xF1, 0xE1, /* 0x78-0x7B */ + 0xD2, 0x7E, 0xD2, 0x80, 0xD2, 0x81, 0xCE, 0xF7, /* 0x7C-0x7F */ + + 0xD2, 0x82, 0xD2, 0xAA, 0xD2, 0x83, 0xF1, 0xFB, /* 0x80-0x83 */ + 0xD2, 0x84, 0xD2, 0x85, 0xB8, 0xB2, 0xD2, 0x86, /* 0x84-0x87 */ + 0xD2, 0x87, 0xD2, 0x88, 0xD2, 0x89, 0xD2, 0x8A, /* 0x88-0x8B */ + 0xD2, 0x8B, 0xD2, 0x8C, 0xD2, 0x8D, 0xD2, 0x8E, /* 0x8C-0x8F */ + 0xD2, 0x8F, 0xD2, 0x90, 0xD2, 0x91, 0xD2, 0x92, /* 0x90-0x93 */ + 0xD2, 0x93, 0xD2, 0x94, 0xD2, 0x95, 0xD2, 0x96, /* 0x94-0x97 */ + 0xD2, 0x97, 0xD2, 0x98, 0xD2, 0x99, 0xD2, 0x9A, /* 0x98-0x9B */ + 0xD2, 0x9B, 0xD2, 0x9C, 0xD2, 0x9D, 0xD2, 0x9E, /* 0x9C-0x9F */ + 0xD2, 0x9F, 0xD2, 0xA0, 0xD3, 0x40, 0xD3, 0x41, /* 0xA0-0xA3 */ + 0xD3, 0x42, 0xD3, 0x43, 0xD3, 0x44, 0xD3, 0x45, /* 0xA4-0xA7 */ + 0xD3, 0x46, 0xD3, 0x47, 0xD3, 0x48, 0xD3, 0x49, /* 0xA8-0xAB */ + 0xD3, 0x4A, 0xD3, 0x4B, 0xD3, 0x4C, 0xD3, 0x4D, /* 0xAC-0xAF */ + 0xD3, 0x4E, 0xD3, 0x4F, 0xD3, 0x50, 0xD3, 0x51, /* 0xB0-0xB3 */ + 0xD3, 0x52, 0xD3, 0x53, 0xD3, 0x54, 0xD3, 0x55, /* 0xB4-0xB7 */ + 0xD3, 0x56, 0xD3, 0x57, 0xD3, 0x58, 0xD3, 0x59, /* 0xB8-0xBB */ + 0xD3, 0x5A, 0xD3, 0x5B, 0xD3, 0x5C, 0xD3, 0x5D, /* 0xBC-0xBF */ + 0xD3, 0x5E, 0xBC, 0xFB, 0xB9, 0xDB, 0xD3, 0x5F, /* 0xC0-0xC3 */ + 0xB9, 0xE6, 0xC3, 0xD9, 0xCA, 0xD3, 0xEA, 0xE8, /* 0xC4-0xC7 */ + 0xC0, 0xC0, 0xBE, 0xF5, 0xEA, 0xE9, 0xEA, 0xEA, /* 0xC8-0xCB */ + 0xEA, 0xEB, 0xD3, 0x60, 0xEA, 0xEC, 0xEA, 0xED, /* 0xCC-0xCF */ + 0xEA, 0xEE, 0xEA, 0xEF, 0xBD, 0xC7, 0xD3, 0x61, /* 0xD0-0xD3 */ + 0xD3, 0x62, 0xD3, 0x63, 0xF5, 0xFB, 0xD3, 0x64, /* 0xD4-0xD7 */ + 0xD3, 0x65, 0xD3, 0x66, 0xF5, 0xFD, 0xD3, 0x67, /* 0xD8-0xDB */ + 0xF5, 0xFE, 0xD3, 0x68, 0xF5, 0xFC, 0xD3, 0x69, /* 0xDC-0xDF */ + 0xD3, 0x6A, 0xD3, 0x6B, 0xD3, 0x6C, 0xBD, 0xE2, /* 0xE0-0xE3 */ + 0xD3, 0x6D, 0xF6, 0xA1, 0xB4, 0xA5, 0xD3, 0x6E, /* 0xE4-0xE7 */ + 0xD3, 0x6F, 0xD3, 0x70, 0xD3, 0x71, 0xF6, 0xA2, /* 0xE8-0xEB */ + 0xD3, 0x72, 0xD3, 0x73, 0xD3, 0x74, 0xF6, 0xA3, /* 0xEC-0xEF */ + 0xD3, 0x75, 0xD3, 0x76, 0xD3, 0x77, 0xEC, 0xB2, /* 0xF0-0xF3 */ + 0xD3, 0x78, 0xD3, 0x79, 0xD3, 0x7A, 0xD3, 0x7B, /* 0xF4-0xF7 */ + 0xD3, 0x7C, 0xD3, 0x7D, 0xD3, 0x7E, 0xD3, 0x80, /* 0xF8-0xFB */ + 0xD3, 0x81, 0xD3, 0x82, 0xD3, 0x83, 0xD3, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8A[512] = { + 0xD1, 0xD4, 0xD3, 0x85, 0xD3, 0x86, 0xD3, 0x87, /* 0x00-0x03 */ + 0xD3, 0x88, 0xD3, 0x89, 0xD3, 0x8A, 0xD9, 0xEA, /* 0x04-0x07 */ + 0xD3, 0x8B, 0xD3, 0x8C, 0xD3, 0x8D, 0xD3, 0x8E, /* 0x08-0x0B */ + 0xD3, 0x8F, 0xD3, 0x90, 0xD3, 0x91, 0xD3, 0x92, /* 0x0C-0x0F */ + 0xD3, 0x93, 0xD3, 0x94, 0xD3, 0x95, 0xD3, 0x96, /* 0x10-0x13 */ + 0xD3, 0x97, 0xD3, 0x98, 0xD3, 0x99, 0xD3, 0x9A, /* 0x14-0x17 */ + 0xD3, 0x9B, 0xD3, 0x9C, 0xD3, 0x9D, 0xD3, 0x9E, /* 0x18-0x1B */ + 0xD3, 0x9F, 0xD3, 0xA0, 0xD4, 0x40, 0xD4, 0x41, /* 0x1C-0x1F */ + 0xD4, 0x42, 0xD4, 0x43, 0xD4, 0x44, 0xD4, 0x45, /* 0x20-0x23 */ + 0xD4, 0x46, 0xD4, 0x47, 0xD4, 0x48, 0xD4, 0x49, /* 0x24-0x27 */ + 0xD4, 0x4A, 0xD4, 0x4B, 0xD4, 0x4C, 0xD4, 0x4D, /* 0x28-0x2B */ + 0xD4, 0x4E, 0xD4, 0x4F, 0xD4, 0x50, 0xD4, 0x51, /* 0x2C-0x2F */ + 0xD4, 0x52, 0xD4, 0x53, 0xD4, 0x54, 0xD4, 0x55, /* 0x30-0x33 */ + 0xD4, 0x56, 0xD4, 0x57, 0xD4, 0x58, 0xD4, 0x59, /* 0x34-0x37 */ + 0xD4, 0x5A, 0xD4, 0x5B, 0xD4, 0x5C, 0xD4, 0x5D, /* 0x38-0x3B */ + 0xD4, 0x5E, 0xD4, 0x5F, 0xF6, 0xA4, 0xD4, 0x60, /* 0x3C-0x3F */ + 0xD4, 0x61, 0xD4, 0x62, 0xD4, 0x63, 0xD4, 0x64, /* 0x40-0x43 */ + 0xD4, 0x65, 0xD4, 0x66, 0xD4, 0x67, 0xD4, 0x68, /* 0x44-0x47 */ + 0xEE, 0xBA, 0xD4, 0x69, 0xD4, 0x6A, 0xD4, 0x6B, /* 0x48-0x4B */ + 0xD4, 0x6C, 0xD4, 0x6D, 0xD4, 0x6E, 0xD4, 0x6F, /* 0x4C-0x4F */ + 0xD4, 0x70, 0xD4, 0x71, 0xD4, 0x72, 0xD4, 0x73, /* 0x50-0x53 */ + 0xD4, 0x74, 0xD4, 0x75, 0xD4, 0x76, 0xD4, 0x77, /* 0x54-0x57 */ + 0xD4, 0x78, 0xD4, 0x79, 0xD4, 0x7A, 0xD4, 0x7B, /* 0x58-0x5B */ + 0xD4, 0x7C, 0xD4, 0x7D, 0xD4, 0x7E, 0xD4, 0x80, /* 0x5C-0x5F */ + 0xD4, 0x81, 0xD4, 0x82, 0xD4, 0x83, 0xD4, 0x84, /* 0x60-0x63 */ + 0xD4, 0x85, 0xD4, 0x86, 0xD4, 0x87, 0xD4, 0x88, /* 0x64-0x67 */ + 0xD4, 0x89, 0xD4, 0x8A, 0xD4, 0x8B, 0xD4, 0x8C, /* 0x68-0x6B */ + 0xD4, 0x8D, 0xD4, 0x8E, 0xD4, 0x8F, 0xD4, 0x90, /* 0x6C-0x6F */ + 0xD4, 0x91, 0xD4, 0x92, 0xD4, 0x93, 0xD4, 0x94, /* 0x70-0x73 */ + 0xD4, 0x95, 0xD4, 0x96, 0xD4, 0x97, 0xD4, 0x98, /* 0x74-0x77 */ + 0xD4, 0x99, 0xD5, 0xB2, 0xD4, 0x9A, 0xD4, 0x9B, /* 0x78-0x7B */ + 0xD4, 0x9C, 0xD4, 0x9D, 0xD4, 0x9E, 0xD4, 0x9F, /* 0x7C-0x7F */ + + 0xD4, 0xA0, 0xD5, 0x40, 0xD5, 0x41, 0xD5, 0x42, /* 0x80-0x83 */ + 0xD5, 0x43, 0xD5, 0x44, 0xD5, 0x45, 0xD5, 0x46, /* 0x84-0x87 */ + 0xD5, 0x47, 0xD3, 0xFE, 0xCC, 0xDC, 0xD5, 0x48, /* 0x88-0x8B */ + 0xD5, 0x49, 0xD5, 0x4A, 0xD5, 0x4B, 0xD5, 0x4C, /* 0x8C-0x8F */ + 0xD5, 0x4D, 0xD5, 0x4E, 0xD5, 0x4F, 0xCA, 0xC4, /* 0x90-0x93 */ + 0xD5, 0x50, 0xD5, 0x51, 0xD5, 0x52, 0xD5, 0x53, /* 0x94-0x97 */ + 0xD5, 0x54, 0xD5, 0x55, 0xD5, 0x56, 0xD5, 0x57, /* 0x98-0x9B */ + 0xD5, 0x58, 0xD5, 0x59, 0xD5, 0x5A, 0xD5, 0x5B, /* 0x9C-0x9F */ + 0xD5, 0x5C, 0xD5, 0x5D, 0xD5, 0x5E, 0xD5, 0x5F, /* 0xA0-0xA3 */ + 0xD5, 0x60, 0xD5, 0x61, 0xD5, 0x62, 0xD5, 0x63, /* 0xA4-0xA7 */ + 0xD5, 0x64, 0xD5, 0x65, 0xD5, 0x66, 0xD5, 0x67, /* 0xA8-0xAB */ + 0xD5, 0x68, 0xD5, 0x69, 0xD5, 0x6A, 0xD5, 0x6B, /* 0xAC-0xAF */ + 0xD5, 0x6C, 0xD5, 0x6D, 0xD5, 0x6E, 0xD5, 0x6F, /* 0xB0-0xB3 */ + 0xD5, 0x70, 0xD5, 0x71, 0xD5, 0x72, 0xD5, 0x73, /* 0xB4-0xB7 */ + 0xD5, 0x74, 0xD5, 0x75, 0xD5, 0x76, 0xD5, 0x77, /* 0xB8-0xBB */ + 0xD5, 0x78, 0xD5, 0x79, 0xD5, 0x7A, 0xD5, 0x7B, /* 0xBC-0xBF */ + 0xD5, 0x7C, 0xD5, 0x7D, 0xD5, 0x7E, 0xD5, 0x80, /* 0xC0-0xC3 */ + 0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84, /* 0xC4-0xC7 */ + 0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88, /* 0xC8-0xCB */ + 0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C, /* 0xCC-0xCF */ + 0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90, /* 0xD0-0xD3 */ + 0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94, /* 0xD4-0xD7 */ + 0xD5, 0x95, 0xD5, 0x96, 0xD5, 0x97, 0xD5, 0x98, /* 0xD8-0xDB */ + 0xD5, 0x99, 0xD5, 0x9A, 0xD5, 0x9B, 0xD5, 0x9C, /* 0xDC-0xDF */ + 0xD5, 0x9D, 0xD5, 0x9E, 0xD5, 0x9F, 0xD5, 0xA0, /* 0xE0-0xE3 */ + 0xD6, 0x40, 0xD6, 0x41, 0xD6, 0x42, 0xD6, 0x43, /* 0xE4-0xE7 */ + 0xD6, 0x44, 0xD6, 0x45, 0xD6, 0x46, 0xD6, 0x47, /* 0xE8-0xEB */ + 0xD6, 0x48, 0xD6, 0x49, 0xD6, 0x4A, 0xD6, 0x4B, /* 0xEC-0xEF */ + 0xD6, 0x4C, 0xD6, 0x4D, 0xD6, 0x4E, 0xD6, 0x4F, /* 0xF0-0xF3 */ + 0xD6, 0x50, 0xD6, 0x51, 0xD6, 0x52, 0xD6, 0x53, /* 0xF4-0xF7 */ + 0xD6, 0x54, 0xD6, 0x55, 0xD6, 0x56, 0xD6, 0x57, /* 0xF8-0xFB */ + 0xD6, 0x58, 0xD6, 0x59, 0xD6, 0x5A, 0xD6, 0x5B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xD6, 0x5C, 0xD6, 0x5D, 0xD6, 0x5E, 0xD6, 0x5F, /* 0x00-0x03 */ + 0xD6, 0x60, 0xD6, 0x61, 0xD6, 0x62, 0xE5, 0xC0, /* 0x04-0x07 */ + 0xD6, 0x63, 0xD6, 0x64, 0xD6, 0x65, 0xD6, 0x66, /* 0x08-0x0B */ + 0xD6, 0x67, 0xD6, 0x68, 0xD6, 0x69, 0xD6, 0x6A, /* 0x0C-0x0F */ + 0xD6, 0x6B, 0xD6, 0x6C, 0xD6, 0x6D, 0xD6, 0x6E, /* 0x10-0x13 */ + 0xD6, 0x6F, 0xD6, 0x70, 0xD6, 0x71, 0xD6, 0x72, /* 0x14-0x17 */ + 0xD6, 0x73, 0xD6, 0x74, 0xD6, 0x75, 0xD6, 0x76, /* 0x18-0x1B */ + 0xD6, 0x77, 0xD6, 0x78, 0xD6, 0x79, 0xD6, 0x7A, /* 0x1C-0x1F */ + 0xD6, 0x7B, 0xD6, 0x7C, 0xD6, 0x7D, 0xD6, 0x7E, /* 0x20-0x23 */ + 0xD6, 0x80, 0xD6, 0x81, 0xF6, 0xA5, 0xD6, 0x82, /* 0x24-0x27 */ + 0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6, 0x86, /* 0x28-0x2B */ + 0xD6, 0x87, 0xD6, 0x88, 0xD6, 0x89, 0xD6, 0x8A, /* 0x2C-0x2F */ + 0xD6, 0x8B, 0xD6, 0x8C, 0xD6, 0x8D, 0xD6, 0x8E, /* 0x30-0x33 */ + 0xD6, 0x8F, 0xD6, 0x90, 0xD6, 0x91, 0xD6, 0x92, /* 0x34-0x37 */ + 0xD6, 0x93, 0xD6, 0x94, 0xD6, 0x95, 0xD6, 0x96, /* 0x38-0x3B */ + 0xD6, 0x97, 0xD6, 0x98, 0xD6, 0x99, 0xD6, 0x9A, /* 0x3C-0x3F */ + 0xD6, 0x9B, 0xD6, 0x9C, 0xD6, 0x9D, 0xD6, 0x9E, /* 0x40-0x43 */ + 0xD6, 0x9F, 0xD6, 0xA0, 0xD7, 0x40, 0xD7, 0x41, /* 0x44-0x47 */ + 0xD7, 0x42, 0xD7, 0x43, 0xD7, 0x44, 0xD7, 0x45, /* 0x48-0x4B */ + 0xD7, 0x46, 0xD7, 0x47, 0xD7, 0x48, 0xD7, 0x49, /* 0x4C-0x4F */ + 0xD7, 0x4A, 0xD7, 0x4B, 0xD7, 0x4C, 0xD7, 0x4D, /* 0x50-0x53 */ + 0xD7, 0x4E, 0xD7, 0x4F, 0xD7, 0x50, 0xD7, 0x51, /* 0x54-0x57 */ + 0xD7, 0x52, 0xD7, 0x53, 0xD7, 0x54, 0xD7, 0x55, /* 0x58-0x5B */ + 0xD7, 0x56, 0xD7, 0x57, 0xD7, 0x58, 0xD7, 0x59, /* 0x5C-0x5F */ + 0xD7, 0x5A, 0xD7, 0x5B, 0xD7, 0x5C, 0xD7, 0x5D, /* 0x60-0x63 */ + 0xD7, 0x5E, 0xD7, 0x5F, 0xBE, 0xAF, 0xD7, 0x60, /* 0x64-0x67 */ + 0xD7, 0x61, 0xD7, 0x62, 0xD7, 0x63, 0xD7, 0x64, /* 0x68-0x6B */ + 0xC6, 0xA9, 0xD7, 0x65, 0xD7, 0x66, 0xD7, 0x67, /* 0x6C-0x6F */ + 0xD7, 0x68, 0xD7, 0x69, 0xD7, 0x6A, 0xD7, 0x6B, /* 0x70-0x73 */ + 0xD7, 0x6C, 0xD7, 0x6D, 0xD7, 0x6E, 0xD7, 0x6F, /* 0x74-0x77 */ + 0xD7, 0x70, 0xD7, 0x71, 0xD7, 0x72, 0xD7, 0x73, /* 0x78-0x7B */ + 0xD7, 0x74, 0xD7, 0x75, 0xD7, 0x76, 0xD7, 0x77, /* 0x7C-0x7F */ + + 0xD7, 0x78, 0xD7, 0x79, 0xD7, 0x7A, 0xD7, 0x7B, /* 0x80-0x83 */ + 0xD7, 0x7C, 0xD7, 0x7D, 0xD7, 0x7E, 0xD7, 0x80, /* 0x84-0x87 */ + 0xD7, 0x81, 0xD7, 0x82, 0xD7, 0x83, 0xD7, 0x84, /* 0x88-0x8B */ + 0xD7, 0x85, 0xD7, 0x86, 0xD7, 0x87, 0xD7, 0x88, /* 0x8C-0x8F */ + 0xD7, 0x89, 0xD7, 0x8A, 0xD7, 0x8B, 0xD7, 0x8C, /* 0x90-0x93 */ + 0xD7, 0x8D, 0xD7, 0x8E, 0xD7, 0x8F, 0xD7, 0x90, /* 0x94-0x97 */ + 0xD7, 0x91, 0xD7, 0x92, 0xD7, 0x93, 0xD7, 0x94, /* 0x98-0x9B */ + 0xD7, 0x95, 0xD7, 0x96, 0xD7, 0x97, 0xD7, 0x98, /* 0x9C-0x9F */ + 0xDA, 0xA5, 0xBC, 0xC6, 0xB6, 0xA9, 0xB8, 0xBC, /* 0xA0-0xA3 */ + 0xC8, 0xCF, 0xBC, 0xA5, 0xDA, 0xA6, 0xDA, 0xA7, /* 0xA4-0xA7 */ + 0xCC, 0xD6, 0xC8, 0xC3, 0xDA, 0xA8, 0xC6, 0xFD, /* 0xA8-0xAB */ + 0xD7, 0x99, 0xD1, 0xB5, 0xD2, 0xE9, 0xD1, 0xB6, /* 0xAC-0xAF */ + 0xBC, 0xC7, 0xD7, 0x9A, 0xBD, 0xB2, 0xBB, 0xE4, /* 0xB0-0xB3 */ + 0xDA, 0xA9, 0xDA, 0xAA, 0xD1, 0xC8, 0xDA, 0xAB, /* 0xB4-0xB7 */ + 0xD0, 0xED, 0xB6, 0xEF, 0xC2, 0xDB, 0xD7, 0x9B, /* 0xB8-0xBB */ + 0xCB, 0xCF, 0xB7, 0xED, 0xC9, 0xE8, 0xB7, 0xC3, /* 0xBC-0xBF */ + 0xBE, 0xF7, 0xD6, 0xA4, 0xDA, 0xAC, 0xDA, 0xAD, /* 0xC0-0xC3 */ + 0xC6, 0xC0, 0xD7, 0xE7, 0xCA, 0xB6, 0xD7, 0x9C, /* 0xC4-0xC7 */ + 0xD5, 0xA9, 0xCB, 0xDF, 0xD5, 0xEF, 0xDA, 0xAE, /* 0xC8-0xCB */ + 0xD6, 0xDF, 0xB4, 0xCA, 0xDA, 0xB0, 0xDA, 0xAF, /* 0xCC-0xCF */ + 0xD7, 0x9D, 0xD2, 0xEB, 0xDA, 0xB1, 0xDA, 0xB2, /* 0xD0-0xD3 */ + 0xDA, 0xB3, 0xCA, 0xD4, 0xDA, 0xB4, 0xCA, 0xAB, /* 0xD4-0xD7 */ + 0xDA, 0xB5, 0xDA, 0xB6, 0xB3, 0xCF, 0xD6, 0xEF, /* 0xD8-0xDB */ + 0xDA, 0xB7, 0xBB, 0xB0, 0xB5, 0xAE, 0xDA, 0xB8, /* 0xDC-0xDF */ + 0xDA, 0xB9, 0xB9, 0xEE, 0xD1, 0xAF, 0xD2, 0xE8, /* 0xE0-0xE3 */ + 0xDA, 0xBA, 0xB8, 0xC3, 0xCF, 0xEA, 0xB2, 0xEF, /* 0xE4-0xE7 */ + 0xDA, 0xBB, 0xDA, 0xBC, 0xD7, 0x9E, 0xBD, 0xEB, /* 0xE8-0xEB */ + 0xCE, 0xDC, 0xD3, 0xEF, 0xDA, 0xBD, 0xCE, 0xF3, /* 0xEC-0xEF */ + 0xDA, 0xBE, 0xD3, 0xD5, 0xBB, 0xE5, 0xDA, 0xBF, /* 0xF0-0xF3 */ + 0xCB, 0xB5, 0xCB, 0xD0, 0xDA, 0xC0, 0xC7, 0xEB, /* 0xF4-0xF7 */ + 0xD6, 0xEE, 0xDA, 0xC1, 0xC5, 0xB5, 0xB6, 0xC1, /* 0xF8-0xFB */ + 0xDA, 0xC2, 0xB7, 0xCC, 0xBF, 0xCE, 0xDA, 0xC3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8C[512] = { + 0xDA, 0xC4, 0xCB, 0xAD, 0xDA, 0xC5, 0xB5, 0xF7, /* 0x00-0x03 */ + 0xDA, 0xC6, 0xC1, 0xC2, 0xD7, 0xBB, 0xDA, 0xC7, /* 0x04-0x07 */ + 0xCC, 0xB8, 0xD7, 0x9F, 0xD2, 0xEA, 0xC4, 0xB1, /* 0x08-0x0B */ + 0xDA, 0xC8, 0xB5, 0xFD, 0xBB, 0xD1, 0xDA, 0xC9, /* 0x0C-0x0F */ + 0xD0, 0xB3, 0xDA, 0xCA, 0xDA, 0xCB, 0xCE, 0xBD, /* 0x10-0x13 */ + 0xDA, 0xCC, 0xDA, 0xCD, 0xDA, 0xCE, 0xB2, 0xF7, /* 0x14-0x17 */ + 0xDA, 0xD1, 0xDA, 0xCF, 0xD1, 0xE8, 0xDA, 0xD0, /* 0x18-0x1B */ + 0xC3, 0xD5, 0xDA, 0xD2, 0xD7, 0xA0, 0xDA, 0xD3, /* 0x1C-0x1F */ + 0xDA, 0xD4, 0xDA, 0xD5, 0xD0, 0xBB, 0xD2, 0xA5, /* 0x20-0x23 */ + 0xB0, 0xF9, 0xDA, 0xD6, 0xC7, 0xAB, 0xDA, 0xD7, /* 0x24-0x27 */ + 0xBD, 0xF7, 0xC3, 0xA1, 0xDA, 0xD8, 0xDA, 0xD9, /* 0x28-0x2B */ + 0xC3, 0xFD, 0xCC, 0xB7, 0xDA, 0xDA, 0xDA, 0xDB, /* 0x2C-0x2F */ + 0xC0, 0xBE, 0xC6, 0xD7, 0xDA, 0xDC, 0xDA, 0xDD, /* 0x30-0x33 */ + 0xC7, 0xB4, 0xDA, 0xDE, 0xDA, 0xDF, 0xB9, 0xC8, /* 0x34-0x37 */ + 0xD8, 0x40, 0xD8, 0x41, 0xD8, 0x42, 0xD8, 0x43, /* 0x38-0x3B */ + 0xD8, 0x44, 0xD8, 0x45, 0xD8, 0x46, 0xD8, 0x47, /* 0x3C-0x3F */ + 0xD8, 0x48, 0xBB, 0xED, 0xD8, 0x49, 0xD8, 0x4A, /* 0x40-0x43 */ + 0xD8, 0x4B, 0xD8, 0x4C, 0xB6, 0xB9, 0xF4, 0xF8, /* 0x44-0x47 */ + 0xD8, 0x4D, 0xF4, 0xF9, 0xD8, 0x4E, 0xD8, 0x4F, /* 0x48-0x4B */ + 0xCD, 0xE3, 0xD8, 0x50, 0xD8, 0x51, 0xD8, 0x52, /* 0x4C-0x4F */ + 0xD8, 0x53, 0xD8, 0x54, 0xD8, 0x55, 0xD8, 0x56, /* 0x50-0x53 */ + 0xD8, 0x57, 0xF5, 0xB9, 0xD8, 0x58, 0xD8, 0x59, /* 0x54-0x57 */ + 0xD8, 0x5A, 0xD8, 0x5B, 0xEB, 0xE0, 0xD8, 0x5C, /* 0x58-0x5B */ + 0xD8, 0x5D, 0xD8, 0x5E, 0xD8, 0x5F, 0xD8, 0x60, /* 0x5C-0x5F */ + 0xD8, 0x61, 0xCF, 0xF3, 0xBB, 0xBF, 0xD8, 0x62, /* 0x60-0x63 */ + 0xD8, 0x63, 0xD8, 0x64, 0xD8, 0x65, 0xD8, 0x66, /* 0x64-0x67 */ + 0xD8, 0x67, 0xD8, 0x68, 0xBA, 0xC0, 0xD4, 0xA5, /* 0x68-0x6B */ + 0xD8, 0x69, 0xD8, 0x6A, 0xD8, 0x6B, 0xD8, 0x6C, /* 0x6C-0x6F */ + 0xD8, 0x6D, 0xD8, 0x6E, 0xD8, 0x6F, 0xE1, 0xD9, /* 0x70-0x73 */ + 0xD8, 0x70, 0xD8, 0x71, 0xD8, 0x72, 0xD8, 0x73, /* 0x74-0x77 */ + 0xF5, 0xF4, 0xB1, 0xAA, 0xB2, 0xF2, 0xD8, 0x74, /* 0x78-0x7B */ + 0xD8, 0x75, 0xD8, 0x76, 0xD8, 0x77, 0xD8, 0x78, /* 0x7C-0x7F */ + + 0xD8, 0x79, 0xD8, 0x7A, 0xF5, 0xF5, 0xD8, 0x7B, /* 0x80-0x83 */ + 0xD8, 0x7C, 0xF5, 0xF7, 0xD8, 0x7D, 0xD8, 0x7E, /* 0x84-0x87 */ + 0xD8, 0x80, 0xBA, 0xD1, 0xF5, 0xF6, 0xD8, 0x81, /* 0x88-0x8B */ + 0xC3, 0xB2, 0xD8, 0x82, 0xD8, 0x83, 0xD8, 0x84, /* 0x8C-0x8F */ + 0xD8, 0x85, 0xD8, 0x86, 0xD8, 0x87, 0xD8, 0x88, /* 0x90-0x93 */ + 0xF5, 0xF9, 0xD8, 0x89, 0xD8, 0x8A, 0xD8, 0x8B, /* 0x94-0x97 */ + 0xF5, 0xF8, 0xD8, 0x8C, 0xD8, 0x8D, 0xD8, 0x8E, /* 0x98-0x9B */ + 0xD8, 0x8F, 0xD8, 0x90, 0xD8, 0x91, 0xD8, 0x92, /* 0x9C-0x9F */ + 0xD8, 0x93, 0xD8, 0x94, 0xD8, 0x95, 0xD8, 0x96, /* 0xA0-0xA3 */ + 0xD8, 0x97, 0xD8, 0x98, 0xD8, 0x99, 0xD8, 0x9A, /* 0xA4-0xA7 */ + 0xD8, 0x9B, 0xD8, 0x9C, 0xD8, 0x9D, 0xD8, 0x9E, /* 0xA8-0xAB */ + 0xD8, 0x9F, 0xD8, 0xA0, 0xD9, 0x40, 0xD9, 0x41, /* 0xAC-0xAF */ + 0xD9, 0x42, 0xD9, 0x43, 0xD9, 0x44, 0xD9, 0x45, /* 0xB0-0xB3 */ + 0xD9, 0x46, 0xD9, 0x47, 0xD9, 0x48, 0xD9, 0x49, /* 0xB4-0xB7 */ + 0xD9, 0x4A, 0xD9, 0x4B, 0xD9, 0x4C, 0xD9, 0x4D, /* 0xB8-0xBB */ + 0xD9, 0x4E, 0xD9, 0x4F, 0xD9, 0x50, 0xD9, 0x51, /* 0xBC-0xBF */ + 0xD9, 0x52, 0xD9, 0x53, 0xD9, 0x54, 0xD9, 0x55, /* 0xC0-0xC3 */ + 0xD9, 0x56, 0xD9, 0x57, 0xD9, 0x58, 0xD9, 0x59, /* 0xC4-0xC7 */ + 0xD9, 0x5A, 0xD9, 0x5B, 0xD9, 0x5C, 0xD9, 0x5D, /* 0xC8-0xCB */ + 0xD9, 0x5E, 0xD9, 0x5F, 0xD9, 0x60, 0xD9, 0x61, /* 0xCC-0xCF */ + 0xD9, 0x62, 0xD9, 0x63, 0xD9, 0x64, 0xD9, 0x65, /* 0xD0-0xD3 */ + 0xD9, 0x66, 0xD9, 0x67, 0xD9, 0x68, 0xD9, 0x69, /* 0xD4-0xD7 */ + 0xD9, 0x6A, 0xD9, 0x6B, 0xD9, 0x6C, 0xD9, 0x6D, /* 0xD8-0xDB */ + 0xD9, 0x6E, 0xD9, 0x6F, 0xD9, 0x70, 0xD9, 0x71, /* 0xDC-0xDF */ + 0xD9, 0x72, 0xD9, 0x73, 0xD9, 0x74, 0xD9, 0x75, /* 0xE0-0xE3 */ + 0xD9, 0x76, 0xD9, 0x77, 0xD9, 0x78, 0xD9, 0x79, /* 0xE4-0xE7 */ + 0xD9, 0x7A, 0xD9, 0x7B, 0xD9, 0x7C, 0xD9, 0x7D, /* 0xE8-0xEB */ + 0xD9, 0x7E, 0xD9, 0x80, 0xD9, 0x81, 0xD9, 0x82, /* 0xEC-0xEF */ + 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86, /* 0xF0-0xF3 */ + 0xD9, 0x87, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x8A, /* 0xF4-0xF7 */ + 0xD9, 0x8B, 0xD9, 0x8C, 0xD9, 0x8D, 0xD9, 0x8E, /* 0xF8-0xFB */ + 0xD9, 0x8F, 0xD9, 0x90, 0xD9, 0x91, 0xD9, 0x92, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0xD9, 0x93, 0xD9, 0x94, 0xD9, 0x95, 0xD9, 0x96, /* 0x00-0x03 */ + 0xD9, 0x97, 0xD9, 0x98, 0xD9, 0x99, 0xD9, 0x9A, /* 0x04-0x07 */ + 0xD9, 0x9B, 0xD9, 0x9C, 0xD9, 0x9D, 0xD9, 0x9E, /* 0x08-0x0B */ + 0xD9, 0x9F, 0xD9, 0xA0, 0xDA, 0x40, 0xDA, 0x41, /* 0x0C-0x0F */ + 0xDA, 0x42, 0xDA, 0x43, 0xDA, 0x44, 0xDA, 0x45, /* 0x10-0x13 */ + 0xDA, 0x46, 0xDA, 0x47, 0xDA, 0x48, 0xDA, 0x49, /* 0x14-0x17 */ + 0xDA, 0x4A, 0xDA, 0x4B, 0xDA, 0x4C, 0xDA, 0x4D, /* 0x18-0x1B */ + 0xDA, 0x4E, 0xB1, 0xB4, 0xD5, 0xEA, 0xB8, 0xBA, /* 0x1C-0x1F */ + 0xDA, 0x4F, 0xB9, 0xB1, 0xB2, 0xC6, 0xD4, 0xF0, /* 0x20-0x23 */ + 0xCF, 0xCD, 0xB0, 0xDC, 0xD5, 0xCB, 0xBB, 0xF5, /* 0x24-0x27 */ + 0xD6, 0xCA, 0xB7, 0xB7, 0xCC, 0xB0, 0xC6, 0xB6, /* 0x28-0x2B */ + 0xB1, 0xE1, 0xB9, 0xBA, 0xD6, 0xFC, 0xB9, 0xE1, /* 0x2C-0x2F */ + 0xB7, 0xA1, 0xBC, 0xFA, 0xEA, 0xDA, 0xEA, 0xDB, /* 0x30-0x33 */ + 0xCC, 0xF9, 0xB9, 0xF3, 0xEA, 0xDC, 0xB4, 0xFB, /* 0x34-0x37 */ + 0xC3, 0xB3, 0xB7, 0xD1, 0xBA, 0xD8, 0xEA, 0xDD, /* 0x38-0x3B */ + 0xD4, 0xF4, 0xEA, 0xDE, 0xBC, 0xD6, 0xBB, 0xDF, /* 0x3C-0x3F */ + 0xEA, 0xDF, 0xC1, 0xDE, 0xC2, 0xB8, 0xD4, 0xDF, /* 0x40-0x43 */ + 0xD7, 0xCA, 0xEA, 0xE0, 0xEA, 0xE1, 0xEA, 0xE4, /* 0x44-0x47 */ + 0xEA, 0xE2, 0xEA, 0xE3, 0xC9, 0xDE, 0xB8, 0xB3, /* 0x48-0x4B */ + 0xB6, 0xC4, 0xEA, 0xE5, 0xCA, 0xEA, 0xC9, 0xCD, /* 0x4C-0x4F */ + 0xB4, 0xCD, 0xDA, 0x50, 0xDA, 0x51, 0xE2, 0xD9, /* 0x50-0x53 */ + 0xC5, 0xE2, 0xEA, 0xE6, 0xC0, 0xB5, 0xDA, 0x52, /* 0x54-0x57 */ + 0xD7, 0xB8, 0xEA, 0xE7, 0xD7, 0xAC, 0xC8, 0xFC, /* 0x58-0x5B */ + 0xD8, 0xD3, 0xD8, 0xCD, 0xD4, 0xDE, 0xDA, 0x53, /* 0x5C-0x5F */ + 0xD4, 0xF9, 0xC9, 0xC4, 0xD3, 0xAE, 0xB8, 0xD3, /* 0x60-0x63 */ + 0xB3, 0xE0, 0xDA, 0x54, 0xC9, 0xE2, 0xF4, 0xF6, /* 0x64-0x67 */ + 0xDA, 0x55, 0xDA, 0x56, 0xDA, 0x57, 0xBA, 0xD5, /* 0x68-0x6B */ + 0xDA, 0x58, 0xF4, 0xF7, 0xDA, 0x59, 0xDA, 0x5A, /* 0x6C-0x6F */ + 0xD7, 0xDF, 0xDA, 0x5B, 0xDA, 0x5C, 0xF4, 0xF1, /* 0x70-0x73 */ + 0xB8, 0xB0, 0xD5, 0xD4, 0xB8, 0xCF, 0xC6, 0xF0, /* 0x74-0x77 */ + 0xDA, 0x5D, 0xDA, 0x5E, 0xDA, 0x5F, 0xDA, 0x60, /* 0x78-0x7B */ + 0xDA, 0x61, 0xDA, 0x62, 0xDA, 0x63, 0xDA, 0x64, /* 0x7C-0x7F */ + + 0xDA, 0x65, 0xB3, 0xC3, 0xDA, 0x66, 0xDA, 0x67, /* 0x80-0x83 */ + 0xF4, 0xF2, 0xB3, 0xAC, 0xDA, 0x68, 0xDA, 0x69, /* 0x84-0x87 */ + 0xDA, 0x6A, 0xDA, 0x6B, 0xD4, 0xBD, 0xC7, 0xF7, /* 0x88-0x8B */ + 0xDA, 0x6C, 0xDA, 0x6D, 0xDA, 0x6E, 0xDA, 0x6F, /* 0x8C-0x8F */ + 0xDA, 0x70, 0xF4, 0xF4, 0xDA, 0x71, 0xDA, 0x72, /* 0x90-0x93 */ + 0xF4, 0xF3, 0xDA, 0x73, 0xDA, 0x74, 0xDA, 0x75, /* 0x94-0x97 */ + 0xDA, 0x76, 0xDA, 0x77, 0xDA, 0x78, 0xDA, 0x79, /* 0x98-0x9B */ + 0xDA, 0x7A, 0xDA, 0x7B, 0xDA, 0x7C, 0xCC, 0xCB, /* 0x9C-0x9F */ + 0xDA, 0x7D, 0xDA, 0x7E, 0xDA, 0x80, 0xC8, 0xA4, /* 0xA0-0xA3 */ + 0xDA, 0x81, 0xDA, 0x82, 0xDA, 0x83, 0xDA, 0x84, /* 0xA4-0xA7 */ + 0xDA, 0x85, 0xDA, 0x86, 0xDA, 0x87, 0xDA, 0x88, /* 0xA8-0xAB */ + 0xDA, 0x89, 0xDA, 0x8A, 0xDA, 0x8B, 0xDA, 0x8C, /* 0xAC-0xAF */ + 0xDA, 0x8D, 0xF4, 0xF5, 0xDA, 0x8E, 0xD7, 0xE3, /* 0xB0-0xB3 */ + 0xC5, 0xBF, 0xF5, 0xC0, 0xDA, 0x8F, 0xDA, 0x90, /* 0xB4-0xB7 */ + 0xF5, 0xBB, 0xDA, 0x91, 0xF5, 0xC3, 0xDA, 0x92, /* 0xB8-0xBB */ + 0xF5, 0xC2, 0xDA, 0x93, 0xD6, 0xBA, 0xF5, 0xC1, /* 0xBC-0xBF */ + 0xDA, 0x94, 0xDA, 0x95, 0xDA, 0x96, 0xD4, 0xBE, /* 0xC0-0xC3 */ + 0xF5, 0xC4, 0xDA, 0x97, 0xF5, 0xCC, 0xDA, 0x98, /* 0xC4-0xC7 */ + 0xDA, 0x99, 0xDA, 0x9A, 0xDA, 0x9B, 0xB0, 0xCF, /* 0xC8-0xCB */ + 0xB5, 0xF8, 0xDA, 0x9C, 0xF5, 0xC9, 0xF5, 0xCA, /* 0xCC-0xCF */ + 0xDA, 0x9D, 0xC5, 0xDC, 0xDA, 0x9E, 0xDA, 0x9F, /* 0xD0-0xD3 */ + 0xDA, 0xA0, 0xDB, 0x40, 0xF5, 0xC5, 0xF5, 0xC6, /* 0xD4-0xD7 */ + 0xDB, 0x41, 0xDB, 0x42, 0xF5, 0xC7, 0xF5, 0xCB, /* 0xD8-0xDB */ + 0xDB, 0x43, 0xBE, 0xE0, 0xF5, 0xC8, 0xB8, 0xFA, /* 0xDC-0xDF */ + 0xDB, 0x44, 0xDB, 0x45, 0xDB, 0x46, 0xF5, 0xD0, /* 0xE0-0xE3 */ + 0xF5, 0xD3, 0xDB, 0x47, 0xDB, 0x48, 0xDB, 0x49, /* 0xE4-0xE7 */ + 0xBF, 0xE7, 0xDB, 0x4A, 0xB9, 0xF2, 0xF5, 0xBC, /* 0xE8-0xEB */ + 0xF5, 0xCD, 0xDB, 0x4B, 0xDB, 0x4C, 0xC2, 0xB7, /* 0xEC-0xEF */ + 0xDB, 0x4D, 0xDB, 0x4E, 0xDB, 0x4F, 0xCC, 0xF8, /* 0xF0-0xF3 */ + 0xDB, 0x50, 0xBC, 0xF9, 0xDB, 0x51, 0xF5, 0xCE, /* 0xF4-0xF7 */ + 0xF5, 0xCF, 0xF5, 0xD1, 0xB6, 0xE5, 0xF5, 0xD2, /* 0xF8-0xFB */ + 0xDB, 0x52, 0xF5, 0xD5, 0xDB, 0x53, 0xDB, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0xDB, 0x55, 0xDB, 0x56, 0xDB, 0x57, 0xDB, 0x58, /* 0x00-0x03 */ + 0xDB, 0x59, 0xF5, 0xBD, 0xDB, 0x5A, 0xDB, 0x5B, /* 0x04-0x07 */ + 0xDB, 0x5C, 0xF5, 0xD4, 0xD3, 0xBB, 0xDB, 0x5D, /* 0x08-0x0B */ + 0xB3, 0xEC, 0xDB, 0x5E, 0xDB, 0x5F, 0xCC, 0xA4, /* 0x0C-0x0F */ + 0xDB, 0x60, 0xDB, 0x61, 0xDB, 0x62, 0xDB, 0x63, /* 0x10-0x13 */ + 0xF5, 0xD6, 0xDB, 0x64, 0xDB, 0x65, 0xDB, 0x66, /* 0x14-0x17 */ + 0xDB, 0x67, 0xDB, 0x68, 0xDB, 0x69, 0xDB, 0x6A, /* 0x18-0x1B */ + 0xDB, 0x6B, 0xF5, 0xD7, 0xBE, 0xE1, 0xF5, 0xD8, /* 0x1C-0x1F */ + 0xDB, 0x6C, 0xDB, 0x6D, 0xCC, 0xDF, 0xF5, 0xDB, /* 0x20-0x23 */ + 0xDB, 0x6E, 0xDB, 0x6F, 0xDB, 0x70, 0xDB, 0x71, /* 0x24-0x27 */ + 0xDB, 0x72, 0xB2, 0xC8, 0xD7, 0xD9, 0xDB, 0x73, /* 0x28-0x2B */ + 0xF5, 0xD9, 0xDB, 0x74, 0xF5, 0xDA, 0xF5, 0xDC, /* 0x2C-0x2F */ + 0xDB, 0x75, 0xF5, 0xE2, 0xDB, 0x76, 0xDB, 0x77, /* 0x30-0x33 */ + 0xDB, 0x78, 0xF5, 0xE0, 0xDB, 0x79, 0xDB, 0x7A, /* 0x34-0x37 */ + 0xDB, 0x7B, 0xF5, 0xDF, 0xF5, 0xDD, 0xDB, 0x7C, /* 0x38-0x3B */ + 0xDB, 0x7D, 0xF5, 0xE1, 0xDB, 0x7E, 0xDB, 0x80, /* 0x3C-0x3F */ + 0xF5, 0xDE, 0xF5, 0xE4, 0xF5, 0xE5, 0xDB, 0x81, /* 0x40-0x43 */ + 0xCC, 0xE3, 0xDB, 0x82, 0xDB, 0x83, 0xE5, 0xBF, /* 0x44-0x47 */ + 0xB5, 0xB8, 0xF5, 0xE3, 0xF5, 0xE8, 0xCC, 0xA3, /* 0x48-0x4B */ + 0xDB, 0x84, 0xDB, 0x85, 0xDB, 0x86, 0xDB, 0x87, /* 0x4C-0x4F */ + 0xDB, 0x88, 0xF5, 0xE6, 0xF5, 0xE7, 0xDB, 0x89, /* 0x50-0x53 */ + 0xDB, 0x8A, 0xDB, 0x8B, 0xDB, 0x8C, 0xDB, 0x8D, /* 0x54-0x57 */ + 0xDB, 0x8E, 0xF5, 0xBE, 0xDB, 0x8F, 0xDB, 0x90, /* 0x58-0x5B */ + 0xDB, 0x91, 0xDB, 0x92, 0xDB, 0x93, 0xDB, 0x94, /* 0x5C-0x5F */ + 0xDB, 0x95, 0xDB, 0x96, 0xDB, 0x97, 0xDB, 0x98, /* 0x60-0x63 */ + 0xDB, 0x99, 0xDB, 0x9A, 0xB1, 0xC4, 0xDB, 0x9B, /* 0x64-0x67 */ + 0xDB, 0x9C, 0xF5, 0xBF, 0xDB, 0x9D, 0xDB, 0x9E, /* 0x68-0x6B */ + 0xB5, 0xC5, 0xB2, 0xE4, 0xDB, 0x9F, 0xF5, 0xEC, /* 0x6C-0x6F */ + 0xF5, 0xE9, 0xDB, 0xA0, 0xB6, 0xD7, 0xDC, 0x40, /* 0x70-0x73 */ + 0xF5, 0xED, 0xDC, 0x41, 0xF5, 0xEA, 0xDC, 0x42, /* 0x74-0x77 */ + 0xDC, 0x43, 0xDC, 0x44, 0xDC, 0x45, 0xDC, 0x46, /* 0x78-0x7B */ + 0xF5, 0xEB, 0xDC, 0x47, 0xDC, 0x48, 0xB4, 0xDA, /* 0x7C-0x7F */ + + 0xDC, 0x49, 0xD4, 0xEA, 0xDC, 0x4A, 0xDC, 0x4B, /* 0x80-0x83 */ + 0xDC, 0x4C, 0xF5, 0xEE, 0xDC, 0x4D, 0xB3, 0xF9, /* 0x84-0x87 */ + 0xDC, 0x4E, 0xDC, 0x4F, 0xDC, 0x50, 0xDC, 0x51, /* 0x88-0x8B */ + 0xDC, 0x52, 0xDC, 0x53, 0xDC, 0x54, 0xF5, 0xEF, /* 0x8C-0x8F */ + 0xF5, 0xF1, 0xDC, 0x55, 0xDC, 0x56, 0xDC, 0x57, /* 0x90-0x93 */ + 0xF5, 0xF0, 0xDC, 0x58, 0xDC, 0x59, 0xDC, 0x5A, /* 0x94-0x97 */ + 0xDC, 0x5B, 0xDC, 0x5C, 0xDC, 0x5D, 0xDC, 0x5E, /* 0x98-0x9B */ + 0xF5, 0xF2, 0xDC, 0x5F, 0xF5, 0xF3, 0xDC, 0x60, /* 0x9C-0x9F */ + 0xDC, 0x61, 0xDC, 0x62, 0xDC, 0x63, 0xDC, 0x64, /* 0xA0-0xA3 */ + 0xDC, 0x65, 0xDC, 0x66, 0xDC, 0x67, 0xDC, 0x68, /* 0xA4-0xA7 */ + 0xDC, 0x69, 0xDC, 0x6A, 0xDC, 0x6B, 0xC9, 0xED, /* 0xA8-0xAB */ + 0xB9, 0xAA, 0xDC, 0x6C, 0xDC, 0x6D, 0xC7, 0xFB, /* 0xAC-0xAF */ + 0xDC, 0x6E, 0xDC, 0x6F, 0xB6, 0xE3, 0xDC, 0x70, /* 0xB0-0xB3 */ + 0xDC, 0x71, 0xDC, 0x72, 0xDC, 0x73, 0xDC, 0x74, /* 0xB4-0xB7 */ + 0xDC, 0x75, 0xDC, 0x76, 0xCC, 0xC9, 0xDC, 0x77, /* 0xB8-0xBB */ + 0xDC, 0x78, 0xDC, 0x79, 0xDC, 0x7A, 0xDC, 0x7B, /* 0xBC-0xBF */ + 0xDC, 0x7C, 0xDC, 0x7D, 0xDC, 0x7E, 0xDC, 0x80, /* 0xC0-0xC3 */ + 0xDC, 0x81, 0xDC, 0x82, 0xDC, 0x83, 0xDC, 0x84, /* 0xC4-0xC7 */ + 0xDC, 0x85, 0xDC, 0x86, 0xDC, 0x87, 0xDC, 0x88, /* 0xC8-0xCB */ + 0xDC, 0x89, 0xDC, 0x8A, 0xEA, 0xA6, 0xDC, 0x8B, /* 0xCC-0xCF */ + 0xDC, 0x8C, 0xDC, 0x8D, 0xDC, 0x8E, 0xDC, 0x8F, /* 0xD0-0xD3 */ + 0xDC, 0x90, 0xDC, 0x91, 0xDC, 0x92, 0xDC, 0x93, /* 0xD4-0xD7 */ + 0xDC, 0x94, 0xDC, 0x95, 0xDC, 0x96, 0xDC, 0x97, /* 0xD8-0xDB */ + 0xDC, 0x98, 0xDC, 0x99, 0xDC, 0x9A, 0xDC, 0x9B, /* 0xDC-0xDF */ + 0xDC, 0x9C, 0xDC, 0x9D, 0xDC, 0x9E, 0xDC, 0x9F, /* 0xE0-0xE3 */ + 0xDC, 0xA0, 0xDD, 0x40, 0xDD, 0x41, 0xDD, 0x42, /* 0xE4-0xE7 */ + 0xDD, 0x43, 0xDD, 0x44, 0xDD, 0x45, 0xDD, 0x46, /* 0xE8-0xEB */ + 0xDD, 0x47, 0xDD, 0x48, 0xDD, 0x49, 0xDD, 0x4A, /* 0xEC-0xEF */ + 0xDD, 0x4B, 0xDD, 0x4C, 0xDD, 0x4D, 0xDD, 0x4E, /* 0xF0-0xF3 */ + 0xDD, 0x4F, 0xDD, 0x50, 0xDD, 0x51, 0xDD, 0x52, /* 0xF4-0xF7 */ + 0xDD, 0x53, 0xDD, 0x54, 0xDD, 0x55, 0xDD, 0x56, /* 0xF8-0xFB */ + 0xDD, 0x57, 0xDD, 0x58, 0xDD, 0x59, 0xDD, 0x5A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0xDD, 0x5B, 0xDD, 0x5C, 0xDD, 0x5D, 0xDD, 0x5E, /* 0x00-0x03 */ + 0xDD, 0x5F, 0xDD, 0x60, 0xDD, 0x61, 0xDD, 0x62, /* 0x04-0x07 */ + 0xDD, 0x63, 0xDD, 0x64, 0xDD, 0x65, 0xDD, 0x66, /* 0x08-0x0B */ + 0xDD, 0x67, 0xDD, 0x68, 0xDD, 0x69, 0xDD, 0x6A, /* 0x0C-0x0F */ + 0xDD, 0x6B, 0xDD, 0x6C, 0xDD, 0x6D, 0xDD, 0x6E, /* 0x10-0x13 */ + 0xDD, 0x6F, 0xDD, 0x70, 0xDD, 0x71, 0xDD, 0x72, /* 0x14-0x17 */ + 0xDD, 0x73, 0xDD, 0x74, 0xDD, 0x75, 0xDD, 0x76, /* 0x18-0x1B */ + 0xDD, 0x77, 0xDD, 0x78, 0xDD, 0x79, 0xDD, 0x7A, /* 0x1C-0x1F */ + 0xDD, 0x7B, 0xDD, 0x7C, 0xDD, 0x7D, 0xDD, 0x7E, /* 0x20-0x23 */ + 0xDD, 0x80, 0xDD, 0x81, 0xDD, 0x82, 0xDD, 0x83, /* 0x24-0x27 */ + 0xDD, 0x84, 0xDD, 0x85, 0xDD, 0x86, 0xDD, 0x87, /* 0x28-0x2B */ + 0xDD, 0x88, 0xDD, 0x89, 0xDD, 0x8A, 0xDD, 0x8B, /* 0x2C-0x2F */ + 0xDD, 0x8C, 0xDD, 0x8D, 0xDD, 0x8E, 0xDD, 0x8F, /* 0x30-0x33 */ + 0xDD, 0x90, 0xDD, 0x91, 0xDD, 0x92, 0xDD, 0x93, /* 0x34-0x37 */ + 0xDD, 0x94, 0xDD, 0x95, 0xDD, 0x96, 0xDD, 0x97, /* 0x38-0x3B */ + 0xDD, 0x98, 0xDD, 0x99, 0xDD, 0x9A, 0xDD, 0x9B, /* 0x3C-0x3F */ + 0xDD, 0x9C, 0xDD, 0x9D, 0xDD, 0x9E, 0xDD, 0x9F, /* 0x40-0x43 */ + 0xDD, 0xA0, 0xDE, 0x40, 0xDE, 0x41, 0xDE, 0x42, /* 0x44-0x47 */ + 0xDE, 0x43, 0xDE, 0x44, 0xDE, 0x45, 0xDE, 0x46, /* 0x48-0x4B */ + 0xDE, 0x47, 0xDE, 0x48, 0xDE, 0x49, 0xDE, 0x4A, /* 0x4C-0x4F */ + 0xDE, 0x4B, 0xDE, 0x4C, 0xDE, 0x4D, 0xDE, 0x4E, /* 0x50-0x53 */ + 0xDE, 0x4F, 0xDE, 0x50, 0xDE, 0x51, 0xDE, 0x52, /* 0x54-0x57 */ + 0xDE, 0x53, 0xDE, 0x54, 0xDE, 0x55, 0xDE, 0x56, /* 0x58-0x5B */ + 0xDE, 0x57, 0xDE, 0x58, 0xDE, 0x59, 0xDE, 0x5A, /* 0x5C-0x5F */ + 0xDE, 0x5B, 0xDE, 0x5C, 0xDE, 0x5D, 0xDE, 0x5E, /* 0x60-0x63 */ + 0xDE, 0x5F, 0xDE, 0x60, 0xB3, 0xB5, 0xD4, 0xFE, /* 0x64-0x67 */ + 0xB9, 0xEC, 0xD0, 0xF9, 0xDE, 0x61, 0xE9, 0xED, /* 0x68-0x6B */ + 0xD7, 0xAA, 0xE9, 0xEE, 0xC2, 0xD6, 0xC8, 0xED, /* 0x6C-0x6F */ + 0xBA, 0xE4, 0xE9, 0xEF, 0xE9, 0xF0, 0xE9, 0xF1, /* 0x70-0x73 */ + 0xD6, 0xE1, 0xE9, 0xF2, 0xE9, 0xF3, 0xE9, 0xF5, /* 0x74-0x77 */ + 0xE9, 0xF4, 0xE9, 0xF6, 0xE9, 0xF7, 0xC7, 0xE1, /* 0x78-0x7B */ + 0xE9, 0xF8, 0xD4, 0xD8, 0xE9, 0xF9, 0xBD, 0xCE, /* 0x7C-0x7F */ + + 0xDE, 0x62, 0xE9, 0xFA, 0xE9, 0xFB, 0xBD, 0xCF, /* 0x80-0x83 */ + 0xE9, 0xFC, 0xB8, 0xA8, 0xC1, 0xBE, 0xE9, 0xFD, /* 0x84-0x87 */ + 0xB1, 0xB2, 0xBB, 0xD4, 0xB9, 0xF5, 0xE9, 0xFE, /* 0x88-0x8B */ + 0xDE, 0x63, 0xEA, 0xA1, 0xEA, 0xA2, 0xEA, 0xA3, /* 0x8C-0x8F */ + 0xB7, 0xF8, 0xBC, 0xAD, 0xDE, 0x64, 0xCA, 0xE4, /* 0x90-0x93 */ + 0xE0, 0xCE, 0xD4, 0xAF, 0xCF, 0xBD, 0xD5, 0xB7, /* 0x94-0x97 */ + 0xEA, 0xA4, 0xD5, 0xDE, 0xEA, 0xA5, 0xD0, 0xC1, /* 0x98-0x9B */ + 0xB9, 0xBC, 0xDE, 0x65, 0xB4, 0xC7, 0xB1, 0xD9, /* 0x9C-0x9F */ + 0xDE, 0x66, 0xDE, 0x67, 0xDE, 0x68, 0xC0, 0xB1, /* 0xA0-0xA3 */ + 0xDE, 0x69, 0xDE, 0x6A, 0xDE, 0x6B, 0xDE, 0x6C, /* 0xA4-0xA7 */ + 0xB1, 0xE6, 0xB1, 0xE7, 0xDE, 0x6D, 0xB1, 0xE8, /* 0xA8-0xAB */ + 0xDE, 0x6E, 0xDE, 0x6F, 0xDE, 0x70, 0xDE, 0x71, /* 0xAC-0xAF */ + 0xB3, 0xBD, 0xC8, 0xE8, 0xDE, 0x72, 0xDE, 0x73, /* 0xB0-0xB3 */ + 0xDE, 0x74, 0xDE, 0x75, 0xE5, 0xC1, 0xDE, 0x76, /* 0xB4-0xB7 */ + 0xDE, 0x77, 0xB1, 0xDF, 0xDE, 0x78, 0xDE, 0x79, /* 0xB8-0xBB */ + 0xDE, 0x7A, 0xC1, 0xC9, 0xB4, 0xEF, 0xDE, 0x7B, /* 0xBC-0xBF */ + 0xDE, 0x7C, 0xC7, 0xA8, 0xD3, 0xD8, 0xDE, 0x7D, /* 0xC0-0xC3 */ + 0xC6, 0xF9, 0xD1, 0xB8, 0xDE, 0x7E, 0xB9, 0xFD, /* 0xC4-0xC7 */ + 0xC2, 0xF5, 0xDE, 0x80, 0xDE, 0x81, 0xDE, 0x82, /* 0xC8-0xCB */ + 0xDE, 0x83, 0xDE, 0x84, 0xD3, 0xAD, 0xDE, 0x85, /* 0xCC-0xCF */ + 0xD4, 0xCB, 0xBD, 0xFC, 0xDE, 0x86, 0xE5, 0xC2, /* 0xD0-0xD3 */ + 0xB7, 0xB5, 0xE5, 0xC3, 0xDE, 0x87, 0xDE, 0x88, /* 0xD4-0xD7 */ + 0xBB, 0xB9, 0xD5, 0xE2, 0xDE, 0x89, 0xBD, 0xF8, /* 0xD8-0xDB */ + 0xD4, 0xB6, 0xCE, 0xA5, 0xC1, 0xAC, 0xB3, 0xD9, /* 0xDC-0xDF */ + 0xDE, 0x8A, 0xDE, 0x8B, 0xCC, 0xF6, 0xDE, 0x8C, /* 0xE0-0xE3 */ + 0xE5, 0xC6, 0xE5, 0xC4, 0xE5, 0xC8, 0xDE, 0x8D, /* 0xE4-0xE7 */ + 0xE5, 0xCA, 0xE5, 0xC7, 0xB5, 0xCF, 0xC6, 0xC8, /* 0xE8-0xEB */ + 0xDE, 0x8E, 0xB5, 0xFC, 0xE5, 0xC5, 0xDE, 0x8F, /* 0xEC-0xEF */ + 0xCA, 0xF6, 0xDE, 0x90, 0xDE, 0x91, 0xE5, 0xC9, /* 0xF0-0xF3 */ + 0xDE, 0x92, 0xDE, 0x93, 0xDE, 0x94, 0xC3, 0xD4, /* 0xF4-0xF7 */ + 0xB1, 0xC5, 0xBC, 0xA3, 0xDE, 0x95, 0xDE, 0x96, /* 0xF8-0xFB */ + 0xDE, 0x97, 0xD7, 0xB7, 0xDE, 0x98, 0xDE, 0x99, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xCD, 0xCB, 0xCB, 0xCD, 0xCA, 0xCA, 0xCC, 0xD3, /* 0x00-0x03 */ + 0xE5, 0xCC, 0xE5, 0xCB, 0xC4, 0xE6, 0xDE, 0x9A, /* 0x04-0x07 */ + 0xDE, 0x9B, 0xD1, 0xA1, 0xD1, 0xB7, 0xE5, 0xCD, /* 0x08-0x0B */ + 0xDE, 0x9C, 0xE5, 0xD0, 0xDE, 0x9D, 0xCD, 0xB8, /* 0x0C-0x0F */ + 0xD6, 0xF0, 0xE5, 0xCF, 0xB5, 0xDD, 0xDE, 0x9E, /* 0x10-0x13 */ + 0xCD, 0xBE, 0xDE, 0x9F, 0xE5, 0xD1, 0xB6, 0xBA, /* 0x14-0x17 */ + 0xDE, 0xA0, 0xDF, 0x40, 0xCD, 0xA8, 0xB9, 0xE4, /* 0x18-0x1B */ + 0xDF, 0x41, 0xCA, 0xC5, 0xB3, 0xD1, 0xCB, 0xD9, /* 0x1C-0x1F */ + 0xD4, 0xEC, 0xE5, 0xD2, 0xB7, 0xEA, 0xDF, 0x42, /* 0x20-0x23 */ + 0xDF, 0x43, 0xDF, 0x44, 0xE5, 0xCE, 0xDF, 0x45, /* 0x24-0x27 */ + 0xDF, 0x46, 0xDF, 0x47, 0xDF, 0x48, 0xDF, 0x49, /* 0x28-0x2B */ + 0xDF, 0x4A, 0xE5, 0xD5, 0xB4, 0xFE, 0xE5, 0xD6, /* 0x2C-0x2F */ + 0xDF, 0x4B, 0xDF, 0x4C, 0xDF, 0x4D, 0xDF, 0x4E, /* 0x30-0x33 */ + 0xDF, 0x4F, 0xE5, 0xD3, 0xE5, 0xD4, 0xDF, 0x50, /* 0x34-0x37 */ + 0xD2, 0xDD, 0xDF, 0x51, 0xDF, 0x52, 0xC2, 0xDF, /* 0x38-0x3B */ + 0xB1, 0xC6, 0xDF, 0x53, 0xD3, 0xE2, 0xDF, 0x54, /* 0x3C-0x3F */ + 0xDF, 0x55, 0xB6, 0xDD, 0xCB, 0xEC, 0xDF, 0x56, /* 0x40-0x43 */ + 0xE5, 0xD7, 0xDF, 0x57, 0xDF, 0x58, 0xD3, 0xF6, /* 0x44-0x47 */ + 0xDF, 0x59, 0xDF, 0x5A, 0xDF, 0x5B, 0xDF, 0x5C, /* 0x48-0x4B */ + 0xDF, 0x5D, 0xB1, 0xE9, 0xDF, 0x5E, 0xB6, 0xF4, /* 0x4C-0x4F */ + 0xE5, 0xDA, 0xE5, 0xD8, 0xE5, 0xD9, 0xB5, 0xC0, /* 0x50-0x53 */ + 0xDF, 0x5F, 0xDF, 0x60, 0xDF, 0x61, 0xD2, 0xC5, /* 0x54-0x57 */ + 0xE5, 0xDC, 0xDF, 0x62, 0xDF, 0x63, 0xE5, 0xDE, /* 0x58-0x5B */ + 0xDF, 0x64, 0xDF, 0x65, 0xDF, 0x66, 0xDF, 0x67, /* 0x5C-0x5F */ + 0xDF, 0x68, 0xDF, 0x69, 0xE5, 0xDD, 0xC7, 0xB2, /* 0x60-0x63 */ + 0xDF, 0x6A, 0xD2, 0xA3, 0xDF, 0x6B, 0xDF, 0x6C, /* 0x64-0x67 */ + 0xE5, 0xDB, 0xDF, 0x6D, 0xDF, 0x6E, 0xDF, 0x6F, /* 0x68-0x6B */ + 0xDF, 0x70, 0xD4, 0xE2, 0xD5, 0xDA, 0xDF, 0x71, /* 0x6C-0x6F */ + 0xDF, 0x72, 0xDF, 0x73, 0xDF, 0x74, 0xDF, 0x75, /* 0x70-0x73 */ + 0xE5, 0xE0, 0xD7, 0xF1, 0xDF, 0x76, 0xDF, 0x77, /* 0x74-0x77 */ + 0xDF, 0x78, 0xDF, 0x79, 0xDF, 0x7A, 0xDF, 0x7B, /* 0x78-0x7B */ + 0xDF, 0x7C, 0xE5, 0xE1, 0xDF, 0x7D, 0xB1, 0xDC, /* 0x7C-0x7F */ + + 0xD1, 0xFB, 0xDF, 0x7E, 0xE5, 0xE2, 0xE5, 0xE4, /* 0x80-0x83 */ + 0xDF, 0x80, 0xDF, 0x81, 0xDF, 0x82, 0xDF, 0x83, /* 0x84-0x87 */ + 0xE5, 0xE3, 0xDF, 0x84, 0xDF, 0x85, 0xE5, 0xE5, /* 0x88-0x8B */ + 0xDF, 0x86, 0xDF, 0x87, 0xDF, 0x88, 0xDF, 0x89, /* 0x8C-0x8F */ + 0xDF, 0x8A, 0xD2, 0xD8, 0xDF, 0x8B, 0xB5, 0xCB, /* 0x90-0x93 */ + 0xDF, 0x8C, 0xE7, 0xDF, 0xDF, 0x8D, 0xDA, 0xF5, /* 0x94-0x97 */ + 0xDF, 0x8E, 0xDA, 0xF8, 0xDF, 0x8F, 0xDA, 0xF6, /* 0x98-0x9B */ + 0xDF, 0x90, 0xDA, 0xF7, 0xDF, 0x91, 0xDF, 0x92, /* 0x9C-0x9F */ + 0xDF, 0x93, 0xDA, 0xFA, 0xD0, 0xCF, 0xC4, 0xC7, /* 0xA0-0xA3 */ + 0xDF, 0x94, 0xDF, 0x95, 0xB0, 0xEE, 0xDF, 0x96, /* 0xA4-0xA7 */ + 0xDF, 0x97, 0xDF, 0x98, 0xD0, 0xB0, 0xDF, 0x99, /* 0xA8-0xAB */ + 0xDA, 0xF9, 0xDF, 0x9A, 0xD3, 0xCA, 0xBA, 0xAA, /* 0xAC-0xAF */ + 0xDB, 0xA2, 0xC7, 0xF1, 0xDF, 0x9B, 0xDA, 0xFC, /* 0xB0-0xB3 */ + 0xDA, 0xFB, 0xC9, 0xDB, 0xDA, 0xFD, 0xDF, 0x9C, /* 0xB4-0xB7 */ + 0xDB, 0xA1, 0xD7, 0xDE, 0xDA, 0xFE, 0xC1, 0xDA, /* 0xB8-0xBB */ + 0xDF, 0x9D, 0xDF, 0x9E, 0xDB, 0xA5, 0xDF, 0x9F, /* 0xBC-0xBF */ + 0xDF, 0xA0, 0xD3, 0xF4, 0xE0, 0x40, 0xE0, 0x41, /* 0xC0-0xC3 */ + 0xDB, 0xA7, 0xDB, 0xA4, 0xE0, 0x42, 0xDB, 0xA8, /* 0xC4-0xC7 */ + 0xE0, 0x43, 0xE0, 0x44, 0xBD, 0xBC, 0xE0, 0x45, /* 0xC8-0xCB */ + 0xE0, 0x46, 0xE0, 0x47, 0xC0, 0xC9, 0xDB, 0xA3, /* 0xCC-0xCF */ + 0xDB, 0xA6, 0xD6, 0xA3, 0xE0, 0x48, 0xDB, 0xA9, /* 0xD0-0xD3 */ + 0xE0, 0x49, 0xE0, 0x4A, 0xE0, 0x4B, 0xDB, 0xAD, /* 0xD4-0xD7 */ + 0xE0, 0x4C, 0xE0, 0x4D, 0xE0, 0x4E, 0xDB, 0xAE, /* 0xD8-0xDB */ + 0xDB, 0xAC, 0xBA, 0xC2, 0xE0, 0x4F, 0xE0, 0x50, /* 0xDC-0xDF */ + 0xE0, 0x51, 0xBF, 0xA4, 0xDB, 0xAB, 0xE0, 0x52, /* 0xE0-0xE3 */ + 0xE0, 0x53, 0xE0, 0x54, 0xDB, 0xAA, 0xD4, 0xC7, /* 0xE4-0xE7 */ + 0xB2, 0xBF, 0xE0, 0x55, 0xE0, 0x56, 0xDB, 0xAF, /* 0xE8-0xEB */ + 0xE0, 0x57, 0xB9, 0xF9, 0xE0, 0x58, 0xDB, 0xB0, /* 0xEC-0xEF */ + 0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x5B, 0xE0, 0x5C, /* 0xF0-0xF3 */ + 0xB3, 0xBB, 0xE0, 0x5D, 0xE0, 0x5E, 0xE0, 0x5F, /* 0xF4-0xF7 */ + 0xB5, 0xA6, 0xE0, 0x60, 0xE0, 0x61, 0xE0, 0x62, /* 0xF8-0xFB */ + 0xE0, 0x63, 0xB6, 0xBC, 0xDB, 0xB1, 0xE0, 0x64, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0xE0, 0x65, 0xE0, 0x66, 0xB6, 0xF5, 0xE0, 0x67, /* 0x00-0x03 */ + 0xDB, 0xB2, 0xE0, 0x68, 0xE0, 0x69, 0xE0, 0x6A, /* 0x04-0x07 */ + 0xE0, 0x6B, 0xE0, 0x6C, 0xE0, 0x6D, 0xE0, 0x6E, /* 0x08-0x0B */ + 0xE0, 0x6F, 0xE0, 0x70, 0xE0, 0x71, 0xE0, 0x72, /* 0x0C-0x0F */ + 0xE0, 0x73, 0xE0, 0x74, 0xE0, 0x75, 0xE0, 0x76, /* 0x10-0x13 */ + 0xE0, 0x77, 0xE0, 0x78, 0xE0, 0x79, 0xE0, 0x7A, /* 0x14-0x17 */ + 0xE0, 0x7B, 0xB1, 0xC9, 0xE0, 0x7C, 0xE0, 0x7D, /* 0x18-0x1B */ + 0xE0, 0x7E, 0xE0, 0x80, 0xDB, 0xB4, 0xE0, 0x81, /* 0x1C-0x1F */ + 0xE0, 0x82, 0xE0, 0x83, 0xDB, 0xB3, 0xDB, 0xB5, /* 0x20-0x23 */ + 0xE0, 0x84, 0xE0, 0x85, 0xE0, 0x86, 0xE0, 0x87, /* 0x24-0x27 */ + 0xE0, 0x88, 0xE0, 0x89, 0xE0, 0x8A, 0xE0, 0x8B, /* 0x28-0x2B */ + 0xE0, 0x8C, 0xE0, 0x8D, 0xE0, 0x8E, 0xDB, 0xB7, /* 0x2C-0x2F */ + 0xE0, 0x8F, 0xDB, 0xB6, 0xE0, 0x90, 0xE0, 0x91, /* 0x30-0x33 */ + 0xE0, 0x92, 0xE0, 0x93, 0xE0, 0x94, 0xE0, 0x95, /* 0x34-0x37 */ + 0xE0, 0x96, 0xDB, 0xB8, 0xE0, 0x97, 0xE0, 0x98, /* 0x38-0x3B */ + 0xE0, 0x99, 0xE0, 0x9A, 0xE0, 0x9B, 0xE0, 0x9C, /* 0x3C-0x3F */ + 0xE0, 0x9D, 0xE0, 0x9E, 0xE0, 0x9F, 0xDB, 0xB9, /* 0x40-0x43 */ + 0xE0, 0xA0, 0xE1, 0x40, 0xDB, 0xBA, 0xE1, 0x41, /* 0x44-0x47 */ + 0xE1, 0x42, 0xD3, 0xCF, 0xF4, 0xFA, 0xC7, 0xF5, /* 0x48-0x4B */ + 0xD7, 0xC3, 0xC5, 0xE4, 0xF4, 0xFC, 0xF4, 0xFD, /* 0x4C-0x4F */ + 0xF4, 0xFB, 0xE1, 0x43, 0xBE, 0xC6, 0xE1, 0x44, /* 0x50-0x53 */ + 0xE1, 0x45, 0xE1, 0x46, 0xE1, 0x47, 0xD0, 0xEF, /* 0x54-0x57 */ + 0xE1, 0x48, 0xE1, 0x49, 0xB7, 0xD3, 0xE1, 0x4A, /* 0x58-0x5B */ + 0xE1, 0x4B, 0xD4, 0xCD, 0xCC, 0xAA, 0xE1, 0x4C, /* 0x5C-0x5F */ + 0xE1, 0x4D, 0xF5, 0xA2, 0xF5, 0xA1, 0xBA, 0xA8, /* 0x60-0x63 */ + 0xF4, 0xFE, 0xCB, 0xD6, 0xE1, 0x4E, 0xE1, 0x4F, /* 0x64-0x67 */ + 0xE1, 0x50, 0xF5, 0xA4, 0xC0, 0xD2, 0xE1, 0x51, /* 0x68-0x6B */ + 0xB3, 0xEA, 0xE1, 0x52, 0xCD, 0xAA, 0xF5, 0xA5, /* 0x6C-0x6F */ + 0xF5, 0xA3, 0xBD, 0xB4, 0xF5, 0xA8, 0xE1, 0x53, /* 0x70-0x73 */ + 0xF5, 0xA9, 0xBD, 0xCD, 0xC3, 0xB8, 0xBF, 0xE1, /* 0x74-0x77 */ + 0xCB, 0xE1, 0xF5, 0xAA, 0xE1, 0x54, 0xE1, 0x55, /* 0x78-0x7B */ + 0xE1, 0x56, 0xF5, 0xA6, 0xF5, 0xA7, 0xC4, 0xF0, /* 0x7C-0x7F */ + + 0xE1, 0x57, 0xE1, 0x58, 0xE1, 0x59, 0xE1, 0x5A, /* 0x80-0x83 */ + 0xE1, 0x5B, 0xF5, 0xAC, 0xE1, 0x5C, 0xB4, 0xBC, /* 0x84-0x87 */ + 0xE1, 0x5D, 0xD7, 0xED, 0xE1, 0x5E, 0xB4, 0xD7, /* 0x88-0x8B */ + 0xF5, 0xAB, 0xF5, 0xAE, 0xE1, 0x5F, 0xE1, 0x60, /* 0x8C-0x8F */ + 0xF5, 0xAD, 0xF5, 0xAF, 0xD0, 0xD1, 0xE1, 0x61, /* 0x90-0x93 */ + 0xE1, 0x62, 0xE1, 0x63, 0xE1, 0x64, 0xE1, 0x65, /* 0x94-0x97 */ + 0xE1, 0x66, 0xE1, 0x67, 0xC3, 0xD1, 0xC8, 0xA9, /* 0x98-0x9B */ + 0xE1, 0x68, 0xE1, 0x69, 0xE1, 0x6A, 0xE1, 0x6B, /* 0x9C-0x9F */ + 0xE1, 0x6C, 0xE1, 0x6D, 0xF5, 0xB0, 0xF5, 0xB1, /* 0xA0-0xA3 */ + 0xE1, 0x6E, 0xE1, 0x6F, 0xE1, 0x70, 0xE1, 0x71, /* 0xA4-0xA7 */ + 0xE1, 0x72, 0xE1, 0x73, 0xF5, 0xB2, 0xE1, 0x74, /* 0xA8-0xAB */ + 0xE1, 0x75, 0xF5, 0xB3, 0xF5, 0xB4, 0xF5, 0xB5, /* 0xAC-0xAF */ + 0xE1, 0x76, 0xE1, 0x77, 0xE1, 0x78, 0xE1, 0x79, /* 0xB0-0xB3 */ + 0xF5, 0xB7, 0xF5, 0xB6, 0xE1, 0x7A, 0xE1, 0x7B, /* 0xB4-0xB7 */ + 0xE1, 0x7C, 0xE1, 0x7D, 0xF5, 0xB8, 0xE1, 0x7E, /* 0xB8-0xBB */ + 0xE1, 0x80, 0xE1, 0x81, 0xE1, 0x82, 0xE1, 0x83, /* 0xBC-0xBF */ + 0xE1, 0x84, 0xE1, 0x85, 0xE1, 0x86, 0xE1, 0x87, /* 0xC0-0xC3 */ + 0xE1, 0x88, 0xE1, 0x89, 0xE1, 0x8A, 0xB2, 0xC9, /* 0xC4-0xC7 */ + 0xE1, 0x8B, 0xD3, 0xD4, 0xCA, 0xCD, 0xE1, 0x8C, /* 0xC8-0xCB */ + 0xC0, 0xEF, 0xD6, 0xD8, 0xD2, 0xB0, 0xC1, 0xBF, /* 0xCC-0xCF */ + 0xE1, 0x8D, 0xBD, 0xF0, 0xE1, 0x8E, 0xE1, 0x8F, /* 0xD0-0xD3 */ + 0xE1, 0x90, 0xE1, 0x91, 0xE1, 0x92, 0xE1, 0x93, /* 0xD4-0xD7 */ + 0xE1, 0x94, 0xE1, 0x95, 0xE1, 0x96, 0xE1, 0x97, /* 0xD8-0xDB */ + 0xB8, 0xAA, 0xE1, 0x98, 0xE1, 0x99, 0xE1, 0x9A, /* 0xDC-0xDF */ + 0xE1, 0x9B, 0xE1, 0x9C, 0xE1, 0x9D, 0xE1, 0x9E, /* 0xE0-0xE3 */ + 0xE1, 0x9F, 0xE1, 0xA0, 0xE2, 0x40, 0xE2, 0x41, /* 0xE4-0xE7 */ + 0xE2, 0x42, 0xE2, 0x43, 0xE2, 0x44, 0xE2, 0x45, /* 0xE8-0xEB */ + 0xE2, 0x46, 0xE2, 0x47, 0xE2, 0x48, 0xE2, 0x49, /* 0xEC-0xEF */ + 0xE2, 0x4A, 0xE2, 0x4B, 0xE2, 0x4C, 0xE2, 0x4D, /* 0xF0-0xF3 */ + 0xE2, 0x4E, 0xE2, 0x4F, 0xE2, 0x50, 0xE2, 0x51, /* 0xF4-0xF7 */ + 0xE2, 0x52, 0xE2, 0x53, 0xE2, 0x54, 0xE2, 0x55, /* 0xF8-0xFB */ + 0xE2, 0x56, 0xE2, 0x57, 0xE2, 0x58, 0xE2, 0x59, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0xE2, 0x5A, 0xE2, 0x5B, 0xE2, 0x5C, 0xE2, 0x5D, /* 0x00-0x03 */ + 0xE2, 0x5E, 0xE2, 0x5F, 0xE2, 0x60, 0xE2, 0x61, /* 0x04-0x07 */ + 0xE2, 0x62, 0xE2, 0x63, 0xE2, 0x64, 0xE2, 0x65, /* 0x08-0x0B */ + 0xE2, 0x66, 0xE2, 0x67, 0xE2, 0x68, 0xE2, 0x69, /* 0x0C-0x0F */ + 0xE2, 0x6A, 0xE2, 0x6B, 0xE2, 0x6C, 0xE2, 0x6D, /* 0x10-0x13 */ + 0xE2, 0x6E, 0xE2, 0x6F, 0xE2, 0x70, 0xE2, 0x71, /* 0x14-0x17 */ + 0xE2, 0x72, 0xE2, 0x73, 0xE2, 0x74, 0xE2, 0x75, /* 0x18-0x1B */ + 0xE2, 0x76, 0xE2, 0x77, 0xE2, 0x78, 0xE2, 0x79, /* 0x1C-0x1F */ + 0xE2, 0x7A, 0xE2, 0x7B, 0xE2, 0x7C, 0xE2, 0x7D, /* 0x20-0x23 */ + 0xE2, 0x7E, 0xE2, 0x80, 0xE2, 0x81, 0xE2, 0x82, /* 0x24-0x27 */ + 0xE2, 0x83, 0xE2, 0x84, 0xE2, 0x85, 0xE2, 0x86, /* 0x28-0x2B */ + 0xE2, 0x87, 0xE2, 0x88, 0xE2, 0x89, 0xE2, 0x8A, /* 0x2C-0x2F */ + 0xE2, 0x8B, 0xE2, 0x8C, 0xE2, 0x8D, 0xE2, 0x8E, /* 0x30-0x33 */ + 0xE2, 0x8F, 0xE2, 0x90, 0xE2, 0x91, 0xE2, 0x92, /* 0x34-0x37 */ + 0xE2, 0x93, 0xE2, 0x94, 0xE2, 0x95, 0xE2, 0x96, /* 0x38-0x3B */ + 0xE2, 0x97, 0xE2, 0x98, 0xE2, 0x99, 0xE2, 0x9A, /* 0x3C-0x3F */ + 0xE2, 0x9B, 0xE2, 0x9C, 0xE2, 0x9D, 0xE2, 0x9E, /* 0x40-0x43 */ + 0xE2, 0x9F, 0xE2, 0xA0, 0xE3, 0x40, 0xE3, 0x41, /* 0x44-0x47 */ + 0xE3, 0x42, 0xE3, 0x43, 0xE3, 0x44, 0xE3, 0x45, /* 0x48-0x4B */ + 0xE3, 0x46, 0xE3, 0x47, 0xE3, 0x48, 0xE3, 0x49, /* 0x4C-0x4F */ + 0xE3, 0x4A, 0xE3, 0x4B, 0xE3, 0x4C, 0xE3, 0x4D, /* 0x50-0x53 */ + 0xE3, 0x4E, 0xE3, 0x4F, 0xE3, 0x50, 0xE3, 0x51, /* 0x54-0x57 */ + 0xE3, 0x52, 0xE3, 0x53, 0xE3, 0x54, 0xE3, 0x55, /* 0x58-0x5B */ + 0xE3, 0x56, 0xE3, 0x57, 0xE3, 0x58, 0xE3, 0x59, /* 0x5C-0x5F */ + 0xE3, 0x5A, 0xE3, 0x5B, 0xE3, 0x5C, 0xE3, 0x5D, /* 0x60-0x63 */ + 0xE3, 0x5E, 0xE3, 0x5F, 0xE3, 0x60, 0xE3, 0x61, /* 0x64-0x67 */ + 0xE3, 0x62, 0xE3, 0x63, 0xE3, 0x64, 0xE3, 0x65, /* 0x68-0x6B */ + 0xE3, 0x66, 0xE3, 0x67, 0xE3, 0x68, 0xE3, 0x69, /* 0x6C-0x6F */ + 0xE3, 0x6A, 0xE3, 0x6B, 0xE3, 0x6C, 0xE3, 0x6D, /* 0x70-0x73 */ + 0xBC, 0xF8, 0xE3, 0x6E, 0xE3, 0x6F, 0xE3, 0x70, /* 0x74-0x77 */ + 0xE3, 0x71, 0xE3, 0x72, 0xE3, 0x73, 0xE3, 0x74, /* 0x78-0x7B */ + 0xE3, 0x75, 0xE3, 0x76, 0xE3, 0x77, 0xE3, 0x78, /* 0x7C-0x7F */ + + 0xE3, 0x79, 0xE3, 0x7A, 0xE3, 0x7B, 0xE3, 0x7C, /* 0x80-0x83 */ + 0xE3, 0x7D, 0xE3, 0x7E, 0xE3, 0x80, 0xE3, 0x81, /* 0x84-0x87 */ + 0xE3, 0x82, 0xE3, 0x83, 0xE3, 0x84, 0xE3, 0x85, /* 0x88-0x8B */ + 0xE3, 0x86, 0xE3, 0x87, 0xF6, 0xC6, 0xE3, 0x88, /* 0x8C-0x8F */ + 0xE3, 0x89, 0xE3, 0x8A, 0xE3, 0x8B, 0xE3, 0x8C, /* 0x90-0x93 */ + 0xE3, 0x8D, 0xE3, 0x8E, 0xE3, 0x8F, 0xE3, 0x90, /* 0x94-0x97 */ + 0xE3, 0x91, 0xE3, 0x92, 0xE3, 0x93, 0xE3, 0x94, /* 0x98-0x9B */ + 0xE3, 0x95, 0xE3, 0x96, 0xE3, 0x97, 0xE3, 0x98, /* 0x9C-0x9F */ + 0xE3, 0x99, 0xE3, 0x9A, 0xE3, 0x9B, 0xE3, 0x9C, /* 0xA0-0xA3 */ + 0xE3, 0x9D, 0xE3, 0x9E, 0xE3, 0x9F, 0xE3, 0xA0, /* 0xA4-0xA7 */ + 0xE4, 0x40, 0xE4, 0x41, 0xE4, 0x42, 0xE4, 0x43, /* 0xA8-0xAB */ + 0xE4, 0x44, 0xE4, 0x45, 0xF6, 0xC7, 0xE4, 0x46, /* 0xAC-0xAF */ + 0xE4, 0x47, 0xE4, 0x48, 0xE4, 0x49, 0xE4, 0x4A, /* 0xB0-0xB3 */ + 0xE4, 0x4B, 0xE4, 0x4C, 0xE4, 0x4D, 0xE4, 0x4E, /* 0xB4-0xB7 */ + 0xE4, 0x4F, 0xE4, 0x50, 0xE4, 0x51, 0xE4, 0x52, /* 0xB8-0xBB */ + 0xE4, 0x53, 0xE4, 0x54, 0xE4, 0x55, 0xE4, 0x56, /* 0xBC-0xBF */ + 0xE4, 0x57, 0xE4, 0x58, 0xE4, 0x59, 0xE4, 0x5A, /* 0xC0-0xC3 */ + 0xE4, 0x5B, 0xE4, 0x5C, 0xE4, 0x5D, 0xE4, 0x5E, /* 0xC4-0xC7 */ + 0xF6, 0xC8, 0xE4, 0x5F, 0xE4, 0x60, 0xE4, 0x61, /* 0xC8-0xCB */ + 0xE4, 0x62, 0xE4, 0x63, 0xE4, 0x64, 0xE4, 0x65, /* 0xCC-0xCF */ + 0xE4, 0x66, 0xE4, 0x67, 0xE4, 0x68, 0xE4, 0x69, /* 0xD0-0xD3 */ + 0xE4, 0x6A, 0xE4, 0x6B, 0xE4, 0x6C, 0xE4, 0x6D, /* 0xD4-0xD7 */ + 0xE4, 0x6E, 0xE4, 0x6F, 0xE4, 0x70, 0xE4, 0x71, /* 0xD8-0xDB */ + 0xE4, 0x72, 0xE4, 0x73, 0xE4, 0x74, 0xE4, 0x75, /* 0xDC-0xDF */ + 0xE4, 0x76, 0xE4, 0x77, 0xE4, 0x78, 0xE4, 0x79, /* 0xE0-0xE3 */ + 0xE4, 0x7A, 0xE4, 0x7B, 0xE4, 0x7C, 0xE4, 0x7D, /* 0xE4-0xE7 */ + 0xE4, 0x7E, 0xE4, 0x80, 0xE4, 0x81, 0xE4, 0x82, /* 0xE8-0xEB */ + 0xE4, 0x83, 0xE4, 0x84, 0xE4, 0x85, 0xE4, 0x86, /* 0xEC-0xEF */ + 0xE4, 0x87, 0xE4, 0x88, 0xE4, 0x89, 0xE4, 0x8A, /* 0xF0-0xF3 */ + 0xE4, 0x8B, 0xE4, 0x8C, 0xE4, 0x8D, 0xE4, 0x8E, /* 0xF4-0xF7 */ + 0xE4, 0x8F, 0xE4, 0x90, 0xE4, 0x91, 0xE4, 0x92, /* 0xF8-0xFB */ + 0xE4, 0x93, 0xE4, 0x94, 0xE4, 0x95, 0xE4, 0x96, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0xE4, 0x97, 0xE4, 0x98, 0xE4, 0x99, 0xE4, 0x9A, /* 0x00-0x03 */ + 0xE4, 0x9B, 0xE4, 0x9C, 0xE4, 0x9D, 0xE4, 0x9E, /* 0x04-0x07 */ + 0xE4, 0x9F, 0xE4, 0xA0, 0xE5, 0x40, 0xE5, 0x41, /* 0x08-0x0B */ + 0xE5, 0x42, 0xE5, 0x43, 0xE5, 0x44, 0xE5, 0x45, /* 0x0C-0x0F */ + 0xE5, 0x46, 0xE5, 0x47, 0xE5, 0x48, 0xE5, 0x49, /* 0x10-0x13 */ + 0xE5, 0x4A, 0xE5, 0x4B, 0xE5, 0x4C, 0xE5, 0x4D, /* 0x14-0x17 */ + 0xE5, 0x4E, 0xE5, 0x4F, 0xE5, 0x50, 0xE5, 0x51, /* 0x18-0x1B */ + 0xE5, 0x52, 0xE5, 0x53, 0xE5, 0x54, 0xE5, 0x55, /* 0x1C-0x1F */ + 0xE5, 0x56, 0xE5, 0x57, 0xE5, 0x58, 0xE5, 0x59, /* 0x20-0x23 */ + 0xE5, 0x5A, 0xE5, 0x5B, 0xE5, 0x5C, 0xE5, 0x5D, /* 0x24-0x27 */ + 0xE5, 0x5E, 0xE5, 0x5F, 0xE5, 0x60, 0xE5, 0x61, /* 0x28-0x2B */ + 0xE5, 0x62, 0xE5, 0x63, 0xE5, 0x64, 0xE5, 0x65, /* 0x2C-0x2F */ + 0xE5, 0x66, 0xE5, 0x67, 0xE5, 0x68, 0xE5, 0x69, /* 0x30-0x33 */ + 0xE5, 0x6A, 0xE5, 0x6B, 0xE5, 0x6C, 0xE5, 0x6D, /* 0x34-0x37 */ + 0xE5, 0x6E, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x71, /* 0x38-0x3B */ + 0xE5, 0x72, 0xE5, 0x73, 0xF6, 0xC9, 0xE5, 0x74, /* 0x3C-0x3F */ + 0xE5, 0x75, 0xE5, 0x76, 0xE5, 0x77, 0xE5, 0x78, /* 0x40-0x43 */ + 0xE5, 0x79, 0xE5, 0x7A, 0xE5, 0x7B, 0xE5, 0x7C, /* 0x44-0x47 */ + 0xE5, 0x7D, 0xE5, 0x7E, 0xE5, 0x80, 0xE5, 0x81, /* 0x48-0x4B */ + 0xE5, 0x82, 0xE5, 0x83, 0xE5, 0x84, 0xE5, 0x85, /* 0x4C-0x4F */ + 0xE5, 0x86, 0xE5, 0x87, 0xE5, 0x88, 0xE5, 0x89, /* 0x50-0x53 */ + 0xE5, 0x8A, 0xE5, 0x8B, 0xE5, 0x8C, 0xE5, 0x8D, /* 0x54-0x57 */ + 0xE5, 0x8E, 0xE5, 0x8F, 0xE5, 0x90, 0xE5, 0x91, /* 0x58-0x5B */ + 0xE5, 0x92, 0xE5, 0x93, 0xE5, 0x94, 0xE5, 0x95, /* 0x5C-0x5F */ + 0xE5, 0x96, 0xE5, 0x97, 0xE5, 0x98, 0xE5, 0x99, /* 0x60-0x63 */ + 0xE5, 0x9A, 0xE5, 0x9B, 0xE5, 0x9C, 0xE5, 0x9D, /* 0x64-0x67 */ + 0xE5, 0x9E, 0xE5, 0x9F, 0xF6, 0xCA, 0xE5, 0xA0, /* 0x68-0x6B */ + 0xE6, 0x40, 0xE6, 0x41, 0xE6, 0x42, 0xE6, 0x43, /* 0x6C-0x6F */ + 0xE6, 0x44, 0xE6, 0x45, 0xE6, 0x46, 0xE6, 0x47, /* 0x70-0x73 */ + 0xE6, 0x48, 0xE6, 0x49, 0xE6, 0x4A, 0xE6, 0x4B, /* 0x74-0x77 */ + 0xE6, 0x4C, 0xE6, 0x4D, 0xE6, 0x4E, 0xE6, 0x4F, /* 0x78-0x7B */ + 0xE6, 0x50, 0xE6, 0x51, 0xE6, 0x52, 0xE6, 0x53, /* 0x7C-0x7F */ + + 0xE6, 0x54, 0xE6, 0x55, 0xE6, 0x56, 0xE6, 0x57, /* 0x80-0x83 */ + 0xE6, 0x58, 0xE6, 0x59, 0xE6, 0x5A, 0xE6, 0x5B, /* 0x84-0x87 */ + 0xE6, 0x5C, 0xE6, 0x5D, 0xE6, 0x5E, 0xE6, 0x5F, /* 0x88-0x8B */ + 0xE6, 0x60, 0xE6, 0x61, 0xE6, 0x62, 0xF6, 0xCC, /* 0x8C-0x8F */ + 0xE6, 0x63, 0xE6, 0x64, 0xE6, 0x65, 0xE6, 0x66, /* 0x90-0x93 */ + 0xE6, 0x67, 0xE6, 0x68, 0xE6, 0x69, 0xE6, 0x6A, /* 0x94-0x97 */ + 0xE6, 0x6B, 0xE6, 0x6C, 0xE6, 0x6D, 0xE6, 0x6E, /* 0x98-0x9B */ + 0xE6, 0x6F, 0xE6, 0x70, 0xE6, 0x71, 0xE6, 0x72, /* 0x9C-0x9F */ + 0xE6, 0x73, 0xE6, 0x74, 0xE6, 0x75, 0xE6, 0x76, /* 0xA0-0xA3 */ + 0xE6, 0x77, 0xE6, 0x78, 0xE6, 0x79, 0xE6, 0x7A, /* 0xA4-0xA7 */ + 0xE6, 0x7B, 0xE6, 0x7C, 0xE6, 0x7D, 0xE6, 0x7E, /* 0xA8-0xAB */ + 0xE6, 0x80, 0xE6, 0x81, 0xE6, 0x82, 0xE6, 0x83, /* 0xAC-0xAF */ + 0xE6, 0x84, 0xE6, 0x85, 0xE6, 0x86, 0xE6, 0x87, /* 0xB0-0xB3 */ + 0xE6, 0x88, 0xE6, 0x89, 0xE6, 0x8A, 0xE6, 0x8B, /* 0xB4-0xB7 */ + 0xE6, 0x8C, 0xE6, 0x8D, 0xE6, 0x8E, 0xE6, 0x8F, /* 0xB8-0xBB */ + 0xE6, 0x90, 0xE6, 0x91, 0xE6, 0x92, 0xE6, 0x93, /* 0xBC-0xBF */ + 0xE6, 0x94, 0xE6, 0x95, 0xE6, 0x96, 0xE6, 0x97, /* 0xC0-0xC3 */ + 0xE6, 0x98, 0xE6, 0x99, 0xE6, 0x9A, 0xE6, 0x9B, /* 0xC4-0xC7 */ + 0xE6, 0x9C, 0xE6, 0x9D, 0xF6, 0xCB, 0xE6, 0x9E, /* 0xC8-0xCB */ + 0xE6, 0x9F, 0xE6, 0xA0, 0xE7, 0x40, 0xE7, 0x41, /* 0xCC-0xCF */ + 0xE7, 0x42, 0xE7, 0x43, 0xE7, 0x44, 0xE7, 0x45, /* 0xD0-0xD3 */ + 0xE7, 0x46, 0xE7, 0x47, 0xF7, 0xE9, 0xE7, 0x48, /* 0xD4-0xD7 */ + 0xE7, 0x49, 0xE7, 0x4A, 0xE7, 0x4B, 0xE7, 0x4C, /* 0xD8-0xDB */ + 0xE7, 0x4D, 0xE7, 0x4E, 0xE7, 0x4F, 0xE7, 0x50, /* 0xDC-0xDF */ + 0xE7, 0x51, 0xE7, 0x52, 0xE7, 0x53, 0xE7, 0x54, /* 0xE0-0xE3 */ + 0xE7, 0x55, 0xE7, 0x56, 0xE7, 0x57, 0xE7, 0x58, /* 0xE4-0xE7 */ + 0xE7, 0x59, 0xE7, 0x5A, 0xE7, 0x5B, 0xE7, 0x5C, /* 0xE8-0xEB */ + 0xE7, 0x5D, 0xE7, 0x5E, 0xE7, 0x5F, 0xE7, 0x60, /* 0xEC-0xEF */ + 0xE7, 0x61, 0xE7, 0x62, 0xE7, 0x63, 0xE7, 0x64, /* 0xF0-0xF3 */ + 0xE7, 0x65, 0xE7, 0x66, 0xE7, 0x67, 0xE7, 0x68, /* 0xF4-0xF7 */ + 0xE7, 0x69, 0xE7, 0x6A, 0xE7, 0x6B, 0xE7, 0x6C, /* 0xF8-0xFB */ + 0xE7, 0x6D, 0xE7, 0x6E, 0xE7, 0x6F, 0xE7, 0x70, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_94[512] = { + 0xE7, 0x71, 0xE7, 0x72, 0xE7, 0x73, 0xE7, 0x74, /* 0x00-0x03 */ + 0xE7, 0x75, 0xE7, 0x76, 0xE7, 0x77, 0xE7, 0x78, /* 0x04-0x07 */ + 0xE7, 0x79, 0xE7, 0x7A, 0xE7, 0x7B, 0xE7, 0x7C, /* 0x08-0x0B */ + 0xE7, 0x7D, 0xE7, 0x7E, 0xE7, 0x80, 0xE7, 0x81, /* 0x0C-0x0F */ + 0xE7, 0x82, 0xE7, 0x83, 0xE7, 0x84, 0xE7, 0x85, /* 0x10-0x13 */ + 0xE7, 0x86, 0xE7, 0x87, 0xE7, 0x88, 0xE7, 0x89, /* 0x14-0x17 */ + 0xE7, 0x8A, 0xE7, 0x8B, 0xE7, 0x8C, 0xE7, 0x8D, /* 0x18-0x1B */ + 0xE7, 0x8E, 0xE7, 0x8F, 0xE7, 0x90, 0xE7, 0x91, /* 0x1C-0x1F */ + 0xE7, 0x92, 0xE7, 0x93, 0xE7, 0x94, 0xE7, 0x95, /* 0x20-0x23 */ + 0xE7, 0x96, 0xE7, 0x97, 0xE7, 0x98, 0xE7, 0x99, /* 0x24-0x27 */ + 0xE7, 0x9A, 0xE7, 0x9B, 0xE7, 0x9C, 0xE7, 0x9D, /* 0x28-0x2B */ + 0xE7, 0x9E, 0xE7, 0x9F, 0xE7, 0xA0, 0xE8, 0x40, /* 0x2C-0x2F */ + 0xE8, 0x41, 0xE8, 0x42, 0xE8, 0x43, 0xE8, 0x44, /* 0x30-0x33 */ + 0xE8, 0x45, 0xE8, 0x46, 0xE8, 0x47, 0xE8, 0x48, /* 0x34-0x37 */ + 0xE8, 0x49, 0xE8, 0x4A, 0xE8, 0x4B, 0xE8, 0x4C, /* 0x38-0x3B */ + 0xE8, 0x4D, 0xE8, 0x4E, 0xF6, 0xCD, 0xE8, 0x4F, /* 0x3C-0x3F */ + 0xE8, 0x50, 0xE8, 0x51, 0xE8, 0x52, 0xE8, 0x53, /* 0x40-0x43 */ + 0xE8, 0x54, 0xE8, 0x55, 0xE8, 0x56, 0xE8, 0x57, /* 0x44-0x47 */ + 0xE8, 0x58, 0xE8, 0x59, 0xE8, 0x5A, 0xE8, 0x5B, /* 0x48-0x4B */ + 0xE8, 0x5C, 0xE8, 0x5D, 0xE8, 0x5E, 0xE8, 0x5F, /* 0x4C-0x4F */ + 0xE8, 0x60, 0xE8, 0x61, 0xE8, 0x62, 0xE8, 0x63, /* 0x50-0x53 */ + 0xE8, 0x64, 0xE8, 0x65, 0xE8, 0x66, 0xE8, 0x67, /* 0x54-0x57 */ + 0xE8, 0x68, 0xE8, 0x69, 0xE8, 0x6A, 0xE8, 0x6B, /* 0x58-0x5B */ + 0xE8, 0x6C, 0xE8, 0x6D, 0xE8, 0x6E, 0xE8, 0x6F, /* 0x5C-0x5F */ + 0xE8, 0x70, 0xE8, 0x71, 0xE8, 0x72, 0xE8, 0x73, /* 0x60-0x63 */ + 0xE8, 0x74, 0xE8, 0x75, 0xE8, 0x76, 0xE8, 0x77, /* 0x64-0x67 */ + 0xE8, 0x78, 0xE8, 0x79, 0xE8, 0x7A, 0xF6, 0xCE, /* 0x68-0x6B */ + 0xE8, 0x7B, 0xE8, 0x7C, 0xE8, 0x7D, 0xE8, 0x7E, /* 0x6C-0x6F */ + 0xE8, 0x80, 0xE8, 0x81, 0xE8, 0x82, 0xE8, 0x83, /* 0x70-0x73 */ + 0xE8, 0x84, 0xE8, 0x85, 0xE8, 0x86, 0xE8, 0x87, /* 0x74-0x77 */ + 0xE8, 0x88, 0xE8, 0x89, 0xE8, 0x8A, 0xE8, 0x8B, /* 0x78-0x7B */ + 0xE8, 0x8C, 0xE8, 0x8D, 0xE8, 0x8E, 0xE8, 0x8F, /* 0x7C-0x7F */ + + 0xE8, 0x90, 0xE8, 0x91, 0xE8, 0x92, 0xE8, 0x93, /* 0x80-0x83 */ + 0xE8, 0x94, 0xEE, 0xC4, 0xEE, 0xC5, 0xEE, 0xC6, /* 0x84-0x87 */ + 0xD5, 0xEB, 0xB6, 0xA4, 0xEE, 0xC8, 0xEE, 0xC7, /* 0x88-0x8B */ + 0xEE, 0xC9, 0xEE, 0xCA, 0xC7, 0xA5, 0xEE, 0xCB, /* 0x8C-0x8F */ + 0xEE, 0xCC, 0xE8, 0x95, 0xB7, 0xB0, 0xB5, 0xF6, /* 0x90-0x93 */ + 0xEE, 0xCD, 0xEE, 0xCF, 0xE8, 0x96, 0xEE, 0xCE, /* 0x94-0x97 */ + 0xE8, 0x97, 0xB8, 0xC6, 0xEE, 0xD0, 0xEE, 0xD1, /* 0x98-0x9B */ + 0xEE, 0xD2, 0xB6, 0xDB, 0xB3, 0xAE, 0xD6, 0xD3, /* 0x9C-0x9F */ + 0xC4, 0xC6, 0xB1, 0xB5, 0xB8, 0xD6, 0xEE, 0xD3, /* 0xA0-0xA3 */ + 0xEE, 0xD4, 0xD4, 0xBF, 0xC7, 0xD5, 0xBE, 0xFB, /* 0xA4-0xA7 */ + 0xCE, 0xD9, 0xB9, 0xB3, 0xEE, 0xD6, 0xEE, 0xD5, /* 0xA8-0xAB */ + 0xEE, 0xD8, 0xEE, 0xD7, 0xC5, 0xA5, 0xEE, 0xD9, /* 0xAC-0xAF */ + 0xEE, 0xDA, 0xC7, 0xAE, 0xEE, 0xDB, 0xC7, 0xAF, /* 0xB0-0xB3 */ + 0xEE, 0xDC, 0xB2, 0xA7, 0xEE, 0xDD, 0xEE, 0xDE, /* 0xB4-0xB7 */ + 0xEE, 0xDF, 0xEE, 0xE0, 0xEE, 0xE1, 0xD7, 0xEA, /* 0xB8-0xBB */ + 0xEE, 0xE2, 0xEE, 0xE3, 0xBC, 0xD8, 0xEE, 0xE4, /* 0xBC-0xBF */ + 0xD3, 0xCB, 0xCC, 0xFA, 0xB2, 0xAC, 0xC1, 0xE5, /* 0xC0-0xC3 */ + 0xEE, 0xE5, 0xC7, 0xA6, 0xC3, 0xAD, 0xE8, 0x98, /* 0xC4-0xC7 */ + 0xEE, 0xE6, 0xEE, 0xE7, 0xEE, 0xE8, 0xEE, 0xE9, /* 0xC8-0xCB */ + 0xEE, 0xEA, 0xEE, 0xEB, 0xEE, 0xEC, 0xE8, 0x99, /* 0xCC-0xCF */ + 0xEE, 0xED, 0xEE, 0xEE, 0xEE, 0xEF, 0xE8, 0x9A, /* 0xD0-0xD3 */ + 0xE8, 0x9B, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0xD4-0xD7 */ + 0xEE, 0xF4, 0xEE, 0xF3, 0xE8, 0x9C, 0xEE, 0xF5, /* 0xD8-0xDB */ + 0xCD, 0xAD, 0xC2, 0xC1, 0xEE, 0xF6, 0xEE, 0xF7, /* 0xDC-0xDF */ + 0xEE, 0xF8, 0xD5, 0xA1, 0xEE, 0xF9, 0xCF, 0xB3, /* 0xE0-0xE3 */ + 0xEE, 0xFA, 0xEE, 0xFB, 0xE8, 0x9D, 0xEE, 0xFC, /* 0xE4-0xE7 */ + 0xEE, 0xFD, 0xEF, 0xA1, 0xEE, 0xFE, 0xEF, 0xA2, /* 0xE8-0xEB */ + 0xB8, 0xF5, 0xC3, 0xFA, 0xEF, 0xA3, 0xEF, 0xA4, /* 0xEC-0xEF */ + 0xBD, 0xC2, 0xD2, 0xBF, 0xB2, 0xF9, 0xEF, 0xA5, /* 0xF0-0xF3 */ + 0xEF, 0xA6, 0xEF, 0xA7, 0xD2, 0xF8, 0xEF, 0xA8, /* 0xF4-0xF7 */ + 0xD6, 0xFD, 0xEF, 0xA9, 0xC6, 0xCC, 0xE8, 0x9E, /* 0xF8-0xFB */ + 0xEF, 0xAA, 0xEF, 0xAB, 0xC1, 0xB4, 0xEF, 0xAC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_95[512] = { + 0xCF, 0xFA, 0xCB, 0xF8, 0xEF, 0xAE, 0xEF, 0xAD, /* 0x00-0x03 */ + 0xB3, 0xFA, 0xB9, 0xF8, 0xEF, 0xAF, 0xEF, 0xB0, /* 0x04-0x07 */ + 0xD0, 0xE2, 0xEF, 0xB1, 0xEF, 0xB2, 0xB7, 0xE6, /* 0x08-0x0B */ + 0xD0, 0xBF, 0xEF, 0xB3, 0xEF, 0xB4, 0xEF, 0xB5, /* 0x0C-0x0F */ + 0xC8, 0xF1, 0xCC, 0xE0, 0xEF, 0xB6, 0xEF, 0xB7, /* 0x10-0x13 */ + 0xEF, 0xB8, 0xEF, 0xB9, 0xEF, 0xBA, 0xD5, 0xE0, /* 0x14-0x17 */ + 0xEF, 0xBB, 0xB4, 0xED, 0xC3, 0xAA, 0xEF, 0xBC, /* 0x18-0x1B */ + 0xE8, 0x9F, 0xEF, 0xBD, 0xEF, 0xBE, 0xEF, 0xBF, /* 0x1C-0x1F */ + 0xE8, 0xA0, 0xCE, 0xFD, 0xEF, 0xC0, 0xC2, 0xE0, /* 0x20-0x23 */ + 0xB4, 0xB8, 0xD7, 0xB6, 0xBD, 0xF5, 0xE9, 0x40, /* 0x24-0x27 */ + 0xCF, 0xC7, 0xEF, 0xC3, 0xEF, 0xC1, 0xEF, 0xC2, /* 0x28-0x2B */ + 0xEF, 0xC4, 0xB6, 0xA7, 0xBC, 0xFC, 0xBE, 0xE2, /* 0x2C-0x2F */ + 0xC3, 0xCC, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x41, /* 0x30-0x33 */ + 0xEF, 0xC7, 0xEF, 0xCF, 0xEF, 0xC8, 0xEF, 0xC9, /* 0x34-0x37 */ + 0xEF, 0xCA, 0xC7, 0xC2, 0xEF, 0xF1, 0xB6, 0xCD, /* 0x38-0x3B */ + 0xEF, 0xCB, 0xE9, 0x42, 0xEF, 0xCC, 0xEF, 0xCD, /* 0x3C-0x3F */ + 0xB6, 0xC6, 0xC3, 0xBE, 0xEF, 0xCE, 0xE9, 0x43, /* 0x40-0x43 */ + 0xEF, 0xD0, 0xEF, 0xD1, 0xEF, 0xD2, 0xD5, 0xF2, /* 0x44-0x47 */ + 0xE9, 0x44, 0xEF, 0xD3, 0xC4, 0xF7, 0xE9, 0x45, /* 0x48-0x4B */ + 0xEF, 0xD4, 0xC4, 0xF8, 0xEF, 0xD5, 0xEF, 0xD6, /* 0x4C-0x4F */ + 0xB8, 0xE4, 0xB0, 0xF7, 0xEF, 0xD7, 0xEF, 0xD8, /* 0x50-0x53 */ + 0xEF, 0xD9, 0xE9, 0x46, 0xEF, 0xDA, 0xEF, 0xDB, /* 0x54-0x57 */ + 0xEF, 0xDC, 0xEF, 0xDD, 0xE9, 0x47, 0xEF, 0xDE, /* 0x58-0x5B */ + 0xBE, 0xB5, 0xEF, 0xE1, 0xEF, 0xDF, 0xEF, 0xE0, /* 0x5C-0x5F */ + 0xE9, 0x48, 0xEF, 0xE2, 0xEF, 0xE3, 0xC1, 0xCD, /* 0x60-0x63 */ + 0xEF, 0xE4, 0xEF, 0xE5, 0xEF, 0xE6, 0xEF, 0xE7, /* 0x64-0x67 */ + 0xEF, 0xE8, 0xEF, 0xE9, 0xEF, 0xEA, 0xEF, 0xEB, /* 0x68-0x6B */ + 0xEF, 0xEC, 0xC0, 0xD8, 0xE9, 0x49, 0xEF, 0xED, /* 0x6C-0x6F */ + 0xC1, 0xAD, 0xEF, 0xEE, 0xEF, 0xEF, 0xEF, 0xF0, /* 0x70-0x73 */ + 0xE9, 0x4A, 0xE9, 0x4B, 0xCF, 0xE2, 0xE9, 0x4C, /* 0x74-0x77 */ + 0xE9, 0x4D, 0xE9, 0x4E, 0xE9, 0x4F, 0xE9, 0x50, /* 0x78-0x7B */ + 0xE9, 0x51, 0xE9, 0x52, 0xE9, 0x53, 0xB3, 0xA4, /* 0x7C-0x7F */ + + 0xE9, 0x54, 0xE9, 0x55, 0xE9, 0x56, 0xE9, 0x57, /* 0x80-0x83 */ + 0xE9, 0x58, 0xE9, 0x59, 0xE9, 0x5A, 0xE9, 0x5B, /* 0x84-0x87 */ + 0xE9, 0x5C, 0xE9, 0x5D, 0xE9, 0x5E, 0xE9, 0x5F, /* 0x88-0x8B */ + 0xE9, 0x60, 0xE9, 0x61, 0xE9, 0x62, 0xE9, 0x63, /* 0x8C-0x8F */ + 0xE9, 0x64, 0xE9, 0x65, 0xE9, 0x66, 0xE9, 0x67, /* 0x90-0x93 */ + 0xE9, 0x68, 0xE9, 0x69, 0xE9, 0x6A, 0xE9, 0x6B, /* 0x94-0x97 */ + 0xE9, 0x6C, 0xE9, 0x6D, 0xE9, 0x6E, 0xE9, 0x6F, /* 0x98-0x9B */ + 0xE9, 0x70, 0xE9, 0x71, 0xE9, 0x72, 0xE9, 0x73, /* 0x9C-0x9F */ + 0xE9, 0x74, 0xE9, 0x75, 0xE9, 0x76, 0xE9, 0x77, /* 0xA0-0xA3 */ + 0xE9, 0x78, 0xE9, 0x79, 0xE9, 0x7A, 0xE9, 0x7B, /* 0xA4-0xA7 */ + 0xE9, 0x7C, 0xE9, 0x7D, 0xE9, 0x7E, 0xE9, 0x80, /* 0xA8-0xAB */ + 0xE9, 0x81, 0xE9, 0x82, 0xE9, 0x83, 0xE9, 0x84, /* 0xAC-0xAF */ + 0xE9, 0x85, 0xE9, 0x86, 0xE9, 0x87, 0xE9, 0x88, /* 0xB0-0xB3 */ + 0xE9, 0x89, 0xE9, 0x8A, 0xE9, 0x8B, 0xE9, 0x8C, /* 0xB4-0xB7 */ + 0xE9, 0x8D, 0xE9, 0x8E, 0xE9, 0x8F, 0xE9, 0x90, /* 0xB8-0xBB */ + 0xE9, 0x91, 0xE9, 0x92, 0xE9, 0x93, 0xE9, 0x94, /* 0xBC-0xBF */ + 0xE9, 0x95, 0xE9, 0x96, 0xE9, 0x97, 0xE9, 0x98, /* 0xC0-0xC3 */ + 0xE9, 0x99, 0xE9, 0x9A, 0xE9, 0x9B, 0xE9, 0x9C, /* 0xC4-0xC7 */ + 0xE9, 0x9D, 0xE9, 0x9E, 0xE9, 0x9F, 0xE9, 0xA0, /* 0xC8-0xCB */ + 0xEA, 0x40, 0xEA, 0x41, 0xEA, 0x42, 0xEA, 0x43, /* 0xCC-0xCF */ + 0xEA, 0x44, 0xEA, 0x45, 0xEA, 0x46, 0xEA, 0x47, /* 0xD0-0xD3 */ + 0xEA, 0x48, 0xEA, 0x49, 0xEA, 0x4A, 0xEA, 0x4B, /* 0xD4-0xD7 */ + 0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x4E, 0xEA, 0x4F, /* 0xD8-0xDB */ + 0xEA, 0x50, 0xEA, 0x51, 0xEA, 0x52, 0xEA, 0x53, /* 0xDC-0xDF */ + 0xEA, 0x54, 0xEA, 0x55, 0xEA, 0x56, 0xEA, 0x57, /* 0xE0-0xE3 */ + 0xEA, 0x58, 0xEA, 0x59, 0xEA, 0x5A, 0xEA, 0x5B, /* 0xE4-0xE7 */ + 0xC3, 0xC5, 0xE3, 0xC5, 0xC9, 0xC1, 0xE3, 0xC6, /* 0xE8-0xEB */ + 0xEA, 0x5C, 0xB1, 0xD5, 0xCE, 0xCA, 0xB4, 0xB3, /* 0xEC-0xEF */ + 0xC8, 0xF2, 0xE3, 0xC7, 0xCF, 0xD0, 0xE3, 0xC8, /* 0xF0-0xF3 */ + 0xBC, 0xE4, 0xE3, 0xC9, 0xE3, 0xCA, 0xC3, 0xC6, /* 0xF4-0xF7 */ + 0xD5, 0xA2, 0xC4, 0xD6, 0xB9, 0xEB, 0xCE, 0xC5, /* 0xF8-0xFB */ + 0xE3, 0xCB, 0xC3, 0xF6, 0xE3, 0xCC, 0xEA, 0x5D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_96[512] = { + 0xB7, 0xA7, 0xB8, 0xF3, 0xBA, 0xD2, 0xE3, 0xCD, /* 0x00-0x03 */ + 0xE3, 0xCE, 0xD4, 0xC4, 0xE3, 0xCF, 0xEA, 0x5E, /* 0x04-0x07 */ + 0xE3, 0xD0, 0xD1, 0xCB, 0xE3, 0xD1, 0xE3, 0xD2, /* 0x08-0x0B */ + 0xE3, 0xD3, 0xE3, 0xD4, 0xD1, 0xD6, 0xE3, 0xD5, /* 0x0C-0x0F */ + 0xB2, 0xFB, 0xC0, 0xBB, 0xE3, 0xD6, 0xEA, 0x5F, /* 0x10-0x13 */ + 0xC0, 0xAB, 0xE3, 0xD7, 0xE3, 0xD8, 0xE3, 0xD9, /* 0x14-0x17 */ + 0xEA, 0x60, 0xE3, 0xDA, 0xE3, 0xDB, 0xEA, 0x61, /* 0x18-0x1B */ + 0xB8, 0xB7, 0xDA, 0xE2, 0xEA, 0x62, 0xB6, 0xD3, /* 0x1C-0x1F */ + 0xEA, 0x63, 0xDA, 0xE4, 0xDA, 0xE3, 0xEA, 0x64, /* 0x20-0x23 */ + 0xEA, 0x65, 0xEA, 0x66, 0xEA, 0x67, 0xEA, 0x68, /* 0x24-0x27 */ + 0xEA, 0x69, 0xEA, 0x6A, 0xDA, 0xE6, 0xEA, 0x6B, /* 0x28-0x2B */ + 0xEA, 0x6C, 0xEA, 0x6D, 0xC8, 0xEE, 0xEA, 0x6E, /* 0x2C-0x2F */ + 0xEA, 0x6F, 0xDA, 0xE5, 0xB7, 0xC0, 0xD1, 0xF4, /* 0x30-0x33 */ + 0xD2, 0xF5, 0xD5, 0xF3, 0xBD, 0xD7, 0xEA, 0x70, /* 0x34-0x37 */ + 0xEA, 0x71, 0xEA, 0x72, 0xEA, 0x73, 0xD7, 0xE8, /* 0x38-0x3B */ + 0xDA, 0xE8, 0xDA, 0xE7, 0xEA, 0x74, 0xB0, 0xA2, /* 0x3C-0x3F */ + 0xCD, 0xD3, 0xEA, 0x75, 0xDA, 0xE9, 0xEA, 0x76, /* 0x40-0x43 */ + 0xB8, 0xBD, 0xBC, 0xCA, 0xC2, 0xBD, 0xC2, 0xA4, /* 0x44-0x47 */ + 0xB3, 0xC2, 0xDA, 0xEA, 0xEA, 0x77, 0xC2, 0xAA, /* 0x48-0x4B */ + 0xC4, 0xB0, 0xBD, 0xB5, 0xEA, 0x78, 0xEA, 0x79, /* 0x4C-0x4F */ + 0xCF, 0xDE, 0xEA, 0x7A, 0xEA, 0x7B, 0xEA, 0x7C, /* 0x50-0x53 */ + 0xDA, 0xEB, 0xC9, 0xC2, 0xEA, 0x7D, 0xEA, 0x7E, /* 0x54-0x57 */ + 0xEA, 0x80, 0xEA, 0x81, 0xEA, 0x82, 0xB1, 0xDD, /* 0x58-0x5B */ + 0xEA, 0x83, 0xEA, 0x84, 0xEA, 0x85, 0xDA, 0xEC, /* 0x5C-0x5F */ + 0xEA, 0x86, 0xB6, 0xB8, 0xD4, 0xBA, 0xEA, 0x87, /* 0x60-0x63 */ + 0xB3, 0xFD, 0xEA, 0x88, 0xEA, 0x89, 0xDA, 0xED, /* 0x64-0x67 */ + 0xD4, 0xC9, 0xCF, 0xD5, 0xC5, 0xE3, 0xEA, 0x8A, /* 0x68-0x6B */ + 0xDA, 0xEE, 0xEA, 0x8B, 0xEA, 0x8C, 0xEA, 0x8D, /* 0x6C-0x6F */ + 0xEA, 0x8E, 0xEA, 0x8F, 0xDA, 0xEF, 0xEA, 0x90, /* 0x70-0x73 */ + 0xDA, 0xF0, 0xC1, 0xEA, 0xCC, 0xD5, 0xCF, 0xDD, /* 0x74-0x77 */ + 0xEA, 0x91, 0xEA, 0x92, 0xEA, 0x93, 0xEA, 0x94, /* 0x78-0x7B */ + 0xEA, 0x95, 0xEA, 0x96, 0xEA, 0x97, 0xEA, 0x98, /* 0x7C-0x7F */ + + 0xEA, 0x99, 0xEA, 0x9A, 0xEA, 0x9B, 0xEA, 0x9C, /* 0x80-0x83 */ + 0xEA, 0x9D, 0xD3, 0xE7, 0xC2, 0xA1, 0xEA, 0x9E, /* 0x84-0x87 */ + 0xDA, 0xF1, 0xEA, 0x9F, 0xEA, 0xA0, 0xCB, 0xE5, /* 0x88-0x8B */ + 0xEB, 0x40, 0xDA, 0xF2, 0xEB, 0x41, 0xCB, 0xE6, /* 0x8C-0x8F */ + 0xD2, 0xFE, 0xEB, 0x42, 0xEB, 0x43, 0xEB, 0x44, /* 0x90-0x93 */ + 0xB8, 0xF4, 0xEB, 0x45, 0xEB, 0x46, 0xDA, 0xF3, /* 0x94-0x97 */ + 0xB0, 0xAF, 0xCF, 0xB6, 0xEB, 0x47, 0xEB, 0x48, /* 0x98-0x9B */ + 0xD5, 0xCF, 0xEB, 0x49, 0xEB, 0x4A, 0xEB, 0x4B, /* 0x9C-0x9F */ + 0xEB, 0x4C, 0xEB, 0x4D, 0xEB, 0x4E, 0xEB, 0x4F, /* 0xA0-0xA3 */ + 0xEB, 0x50, 0xEB, 0x51, 0xEB, 0x52, 0xCB, 0xED, /* 0xA4-0xA7 */ + 0xEB, 0x53, 0xEB, 0x54, 0xEB, 0x55, 0xEB, 0x56, /* 0xA8-0xAB */ + 0xEB, 0x57, 0xEB, 0x58, 0xEB, 0x59, 0xEB, 0x5A, /* 0xAC-0xAF */ + 0xDA, 0xF4, 0xEB, 0x5B, 0xEB, 0x5C, 0xE3, 0xC4, /* 0xB0-0xB3 */ + 0xEB, 0x5D, 0xEB, 0x5E, 0xC1, 0xA5, 0xEB, 0x5F, /* 0xB4-0xB7 */ + 0xEB, 0x60, 0xF6, 0xBF, 0xEB, 0x61, 0xEB, 0x62, /* 0xB8-0xBB */ + 0xF6, 0xC0, 0xF6, 0xC1, 0xC4, 0xD1, 0xEB, 0x63, /* 0xBC-0xBF */ + 0xC8, 0xB8, 0xD1, 0xE3, 0xEB, 0x64, 0xEB, 0x65, /* 0xC0-0xC3 */ + 0xD0, 0xDB, 0xD1, 0xC5, 0xBC, 0xAF, 0xB9, 0xCD, /* 0xC4-0xC7 */ + 0xEB, 0x66, 0xEF, 0xF4, 0xEB, 0x67, 0xEB, 0x68, /* 0xC8-0xCB */ + 0xB4, 0xC6, 0xD3, 0xBA, 0xF6, 0xC2, 0xB3, 0xFB, /* 0xCC-0xCF */ + 0xEB, 0x69, 0xEB, 0x6A, 0xF6, 0xC3, 0xEB, 0x6B, /* 0xD0-0xD3 */ + 0xEB, 0x6C, 0xB5, 0xF1, 0xEB, 0x6D, 0xEB, 0x6E, /* 0xD4-0xD7 */ + 0xEB, 0x6F, 0xEB, 0x70, 0xEB, 0x71, 0xEB, 0x72, /* 0xD8-0xDB */ + 0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x75, 0xEB, 0x76, /* 0xDC-0xDF */ + 0xF6, 0xC5, 0xEB, 0x77, 0xEB, 0x78, 0xEB, 0x79, /* 0xE0-0xE3 */ + 0xEB, 0x7A, 0xEB, 0x7B, 0xEB, 0x7C, 0xEB, 0x7D, /* 0xE4-0xE7 */ + 0xD3, 0xEA, 0xF6, 0xA7, 0xD1, 0xA9, 0xEB, 0x7E, /* 0xE8-0xEB */ + 0xEB, 0x80, 0xEB, 0x81, 0xEB, 0x82, 0xF6, 0xA9, /* 0xEC-0xEF */ + 0xEB, 0x83, 0xEB, 0x84, 0xEB, 0x85, 0xF6, 0xA8, /* 0xF0-0xF3 */ + 0xEB, 0x86, 0xEB, 0x87, 0xC1, 0xE3, 0xC0, 0xD7, /* 0xF4-0xF7 */ + 0xEB, 0x88, 0xB1, 0xA2, 0xEB, 0x89, 0xEB, 0x8A, /* 0xF8-0xFB */ + 0xEB, 0x8B, 0xEB, 0x8C, 0xCE, 0xED, 0xEB, 0x8D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_97[512] = { + 0xD0, 0xE8, 0xF6, 0xAB, 0xEB, 0x8E, 0xEB, 0x8F, /* 0x00-0x03 */ + 0xCF, 0xF6, 0xEB, 0x90, 0xF6, 0xAA, 0xD5, 0xF0, /* 0x04-0x07 */ + 0xF6, 0xAC, 0xC3, 0xB9, 0xEB, 0x91, 0xEB, 0x92, /* 0x08-0x0B */ + 0xEB, 0x93, 0xBB, 0xF4, 0xF6, 0xAE, 0xF6, 0xAD, /* 0x0C-0x0F */ + 0xEB, 0x94, 0xEB, 0x95, 0xEB, 0x96, 0xC4, 0xDE, /* 0x10-0x13 */ + 0xEB, 0x97, 0xEB, 0x98, 0xC1, 0xD8, 0xEB, 0x99, /* 0x14-0x17 */ + 0xEB, 0x9A, 0xEB, 0x9B, 0xEB, 0x9C, 0xEB, 0x9D, /* 0x18-0x1B */ + 0xCB, 0xAA, 0xEB, 0x9E, 0xCF, 0xBC, 0xEB, 0x9F, /* 0x1C-0x1F */ + 0xEB, 0xA0, 0xEC, 0x40, 0xEC, 0x41, 0xEC, 0x42, /* 0x20-0x23 */ + 0xEC, 0x43, 0xEC, 0x44, 0xEC, 0x45, 0xEC, 0x46, /* 0x24-0x27 */ + 0xEC, 0x47, 0xEC, 0x48, 0xF6, 0xAF, 0xEC, 0x49, /* 0x28-0x2B */ + 0xEC, 0x4A, 0xF6, 0xB0, 0xEC, 0x4B, 0xEC, 0x4C, /* 0x2C-0x2F */ + 0xF6, 0xB1, 0xEC, 0x4D, 0xC2, 0xB6, 0xEC, 0x4E, /* 0x30-0x33 */ + 0xEC, 0x4F, 0xEC, 0x50, 0xEC, 0x51, 0xEC, 0x52, /* 0x34-0x37 */ + 0xB0, 0xD4, 0xC5, 0xF9, 0xEC, 0x53, 0xEC, 0x54, /* 0x38-0x3B */ + 0xEC, 0x55, 0xEC, 0x56, 0xF6, 0xB2, 0xEC, 0x57, /* 0x3C-0x3F */ + 0xEC, 0x58, 0xEC, 0x59, 0xEC, 0x5A, 0xEC, 0x5B, /* 0x40-0x43 */ + 0xEC, 0x5C, 0xEC, 0x5D, 0xEC, 0x5E, 0xEC, 0x5F, /* 0x44-0x47 */ + 0xEC, 0x60, 0xEC, 0x61, 0xEC, 0x62, 0xEC, 0x63, /* 0x48-0x4B */ + 0xEC, 0x64, 0xEC, 0x65, 0xEC, 0x66, 0xEC, 0x67, /* 0x4C-0x4F */ + 0xEC, 0x68, 0xEC, 0x69, 0xC7, 0xE0, 0xF6, 0xA6, /* 0x50-0x53 */ + 0xEC, 0x6A, 0xEC, 0x6B, 0xBE, 0xB8, 0xEC, 0x6C, /* 0x54-0x57 */ + 0xEC, 0x6D, 0xBE, 0xB2, 0xEC, 0x6E, 0xB5, 0xE5, /* 0x58-0x5B */ + 0xEC, 0x6F, 0xEC, 0x70, 0xB7, 0xC7, 0xEC, 0x71, /* 0x5C-0x5F */ + 0xBF, 0xBF, 0xC3, 0xD2, 0xC3, 0xE6, 0xEC, 0x72, /* 0x60-0x63 */ + 0xEC, 0x73, 0xD8, 0xCC, 0xEC, 0x74, 0xEC, 0x75, /* 0x64-0x67 */ + 0xEC, 0x76, 0xB8, 0xEF, 0xEC, 0x77, 0xEC, 0x78, /* 0x68-0x6B */ + 0xEC, 0x79, 0xEC, 0x7A, 0xEC, 0x7B, 0xEC, 0x7C, /* 0x6C-0x6F */ + 0xEC, 0x7D, 0xEC, 0x7E, 0xEC, 0x80, 0xBD, 0xF9, /* 0x70-0x73 */ + 0xD1, 0xA5, 0xEC, 0x81, 0xB0, 0xD0, 0xEC, 0x82, /* 0x74-0x77 */ + 0xEC, 0x83, 0xEC, 0x84, 0xEC, 0x85, 0xEC, 0x86, /* 0x78-0x7B */ + 0xF7, 0xB0, 0xEC, 0x87, 0xEC, 0x88, 0xEC, 0x89, /* 0x7C-0x7F */ + + 0xEC, 0x8A, 0xEC, 0x8B, 0xEC, 0x8C, 0xEC, 0x8D, /* 0x80-0x83 */ + 0xEC, 0x8E, 0xF7, 0xB1, 0xEC, 0x8F, 0xEC, 0x90, /* 0x84-0x87 */ + 0xEC, 0x91, 0xEC, 0x92, 0xEC, 0x93, 0xD0, 0xAC, /* 0x88-0x8B */ + 0xEC, 0x94, 0xB0, 0xB0, 0xEC, 0x95, 0xEC, 0x96, /* 0x8C-0x8F */ + 0xEC, 0x97, 0xF7, 0xB2, 0xF7, 0xB3, 0xEC, 0x98, /* 0x90-0x93 */ + 0xF7, 0xB4, 0xEC, 0x99, 0xEC, 0x9A, 0xEC, 0x9B, /* 0x94-0x97 */ + 0xC7, 0xCA, 0xEC, 0x9C, 0xEC, 0x9D, 0xEC, 0x9E, /* 0x98-0x9B */ + 0xEC, 0x9F, 0xEC, 0xA0, 0xED, 0x40, 0xED, 0x41, /* 0x9C-0x9F */ + 0xBE, 0xCF, 0xED, 0x42, 0xED, 0x43, 0xF7, 0xB7, /* 0xA0-0xA3 */ + 0xED, 0x44, 0xED, 0x45, 0xED, 0x46, 0xED, 0x47, /* 0xA4-0xA7 */ + 0xED, 0x48, 0xED, 0x49, 0xED, 0x4A, 0xF7, 0xB6, /* 0xA8-0xAB */ + 0xED, 0x4B, 0xB1, 0xDE, 0xED, 0x4C, 0xF7, 0xB5, /* 0xAC-0xAF */ + 0xED, 0x4D, 0xED, 0x4E, 0xF7, 0xB8, 0xED, 0x4F, /* 0xB0-0xB3 */ + 0xF7, 0xB9, 0xED, 0x50, 0xED, 0x51, 0xED, 0x52, /* 0xB4-0xB7 */ + 0xED, 0x53, 0xED, 0x54, 0xED, 0x55, 0xED, 0x56, /* 0xB8-0xBB */ + 0xED, 0x57, 0xED, 0x58, 0xED, 0x59, 0xED, 0x5A, /* 0xBC-0xBF */ + 0xED, 0x5B, 0xED, 0x5C, 0xED, 0x5D, 0xED, 0x5E, /* 0xC0-0xC3 */ + 0xED, 0x5F, 0xED, 0x60, 0xED, 0x61, 0xED, 0x62, /* 0xC4-0xC7 */ + 0xED, 0x63, 0xED, 0x64, 0xED, 0x65, 0xED, 0x66, /* 0xC8-0xCB */ + 0xED, 0x67, 0xED, 0x68, 0xED, 0x69, 0xED, 0x6A, /* 0xCC-0xCF */ + 0xED, 0x6B, 0xED, 0x6C, 0xED, 0x6D, 0xED, 0x6E, /* 0xD0-0xD3 */ + 0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xED, 0x72, /* 0xD4-0xD7 */ + 0xED, 0x73, 0xED, 0x74, 0xED, 0x75, 0xED, 0x76, /* 0xD8-0xDB */ + 0xED, 0x77, 0xED, 0x78, 0xED, 0x79, 0xED, 0x7A, /* 0xDC-0xDF */ + 0xED, 0x7B, 0xED, 0x7C, 0xED, 0x7D, 0xED, 0x7E, /* 0xE0-0xE3 */ + 0xED, 0x80, 0xED, 0x81, 0xCE, 0xA4, 0xC8, 0xCD, /* 0xE4-0xE7 */ + 0xED, 0x82, 0xBA, 0xAB, 0xE8, 0xB8, 0xE8, 0xB9, /* 0xE8-0xEB */ + 0xE8, 0xBA, 0xBE, 0xC2, 0xED, 0x83, 0xED, 0x84, /* 0xEC-0xEF */ + 0xED, 0x85, 0xED, 0x86, 0xED, 0x87, 0xD2, 0xF4, /* 0xF0-0xF3 */ + 0xED, 0x88, 0xD4, 0xCF, 0xC9, 0xD8, 0xED, 0x89, /* 0xF4-0xF7 */ + 0xED, 0x8A, 0xED, 0x8B, 0xED, 0x8C, 0xED, 0x8D, /* 0xF8-0xFB */ + 0xED, 0x8E, 0xED, 0x8F, 0xED, 0x90, 0xED, 0x91, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xED, 0x92, 0xED, 0x93, 0xED, 0x94, 0xED, 0x95, /* 0x00-0x03 */ + 0xED, 0x96, 0xED, 0x97, 0xED, 0x98, 0xED, 0x99, /* 0x04-0x07 */ + 0xED, 0x9A, 0xED, 0x9B, 0xED, 0x9C, 0xED, 0x9D, /* 0x08-0x0B */ + 0xED, 0x9E, 0xED, 0x9F, 0xED, 0xA0, 0xEE, 0x40, /* 0x0C-0x0F */ + 0xEE, 0x41, 0xEE, 0x42, 0xEE, 0x43, 0xEE, 0x44, /* 0x10-0x13 */ + 0xEE, 0x45, 0xEE, 0x46, 0xEE, 0x47, 0xEE, 0x48, /* 0x14-0x17 */ + 0xEE, 0x49, 0xEE, 0x4A, 0xEE, 0x4B, 0xEE, 0x4C, /* 0x18-0x1B */ + 0xEE, 0x4D, 0xEE, 0x4E, 0xEE, 0x4F, 0xEE, 0x50, /* 0x1C-0x1F */ + 0xEE, 0x51, 0xEE, 0x52, 0xEE, 0x53, 0xEE, 0x54, /* 0x20-0x23 */ + 0xEE, 0x55, 0xEE, 0x56, 0xEE, 0x57, 0xEE, 0x58, /* 0x24-0x27 */ + 0xEE, 0x59, 0xEE, 0x5A, 0xEE, 0x5B, 0xEE, 0x5C, /* 0x28-0x2B */ + 0xEE, 0x5D, 0xEE, 0x5E, 0xEE, 0x5F, 0xEE, 0x60, /* 0x2C-0x2F */ + 0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x64, /* 0x30-0x33 */ + 0xEE, 0x65, 0xEE, 0x66, 0xEE, 0x67, 0xEE, 0x68, /* 0x34-0x37 */ + 0xEE, 0x69, 0xEE, 0x6A, 0xEE, 0x6B, 0xEE, 0x6C, /* 0x38-0x3B */ + 0xEE, 0x6D, 0xEE, 0x6E, 0xEE, 0x6F, 0xEE, 0x70, /* 0x3C-0x3F */ + 0xEE, 0x71, 0xEE, 0x72, 0xEE, 0x73, 0xEE, 0x74, /* 0x40-0x43 */ + 0xEE, 0x75, 0xEE, 0x76, 0xEE, 0x77, 0xEE, 0x78, /* 0x44-0x47 */ + 0xEE, 0x79, 0xEE, 0x7A, 0xEE, 0x7B, 0xEE, 0x7C, /* 0x48-0x4B */ + 0xEE, 0x7D, 0xEE, 0x7E, 0xEE, 0x80, 0xEE, 0x81, /* 0x4C-0x4F */ + 0xEE, 0x82, 0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x85, /* 0x50-0x53 */ + 0xEE, 0x86, 0xEE, 0x87, 0xEE, 0x88, 0xEE, 0x89, /* 0x54-0x57 */ + 0xEE, 0x8A, 0xEE, 0x8B, 0xEE, 0x8C, 0xEE, 0x8D, /* 0x58-0x5B */ + 0xEE, 0x8E, 0xEE, 0x8F, 0xEE, 0x90, 0xEE, 0x91, /* 0x5C-0x5F */ + 0xEE, 0x92, 0xEE, 0x93, 0xEE, 0x94, 0xEE, 0x95, /* 0x60-0x63 */ + 0xEE, 0x96, 0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x99, /* 0x64-0x67 */ + 0xEE, 0x9A, 0xEE, 0x9B, 0xEE, 0x9C, 0xEE, 0x9D, /* 0x68-0x6B */ + 0xEE, 0x9E, 0xEE, 0x9F, 0xEE, 0xA0, 0xEF, 0x40, /* 0x6C-0x6F */ + 0xEF, 0x41, 0xEF, 0x42, 0xEF, 0x43, 0xEF, 0x44, /* 0x70-0x73 */ + 0xEF, 0x45, 0xD2, 0xB3, 0xB6, 0xA5, 0xC7, 0xEA, /* 0x74-0x77 */ + 0xF1, 0xFC, 0xCF, 0xEE, 0xCB, 0xB3, 0xD0, 0xEB, /* 0x78-0x7B */ + 0xE7, 0xEF, 0xCD, 0xE7, 0xB9, 0xCB, 0xB6, 0xD9, /* 0x7C-0x7F */ + + 0xF1, 0xFD, 0xB0, 0xE4, 0xCB, 0xCC, 0xF1, 0xFE, /* 0x80-0x83 */ + 0xD4, 0xA4, 0xC2, 0xAD, 0xC1, 0xEC, 0xC6, 0xC4, /* 0x84-0x87 */ + 0xBE, 0xB1, 0xF2, 0xA1, 0xBC, 0xD5, 0xEF, 0x46, /* 0x88-0x8B */ + 0xF2, 0xA2, 0xF2, 0xA3, 0xEF, 0x47, 0xF2, 0xA4, /* 0x8C-0x8F */ + 0xD2, 0xC3, 0xC6, 0xB5, 0xEF, 0x48, 0xCD, 0xC7, /* 0x90-0x93 */ + 0xF2, 0xA5, 0xEF, 0x49, 0xD3, 0xB1, 0xBF, 0xC5, /* 0x94-0x97 */ + 0xCC, 0xE2, 0xEF, 0x4A, 0xF2, 0xA6, 0xF2, 0xA7, /* 0x98-0x9B */ + 0xD1, 0xD5, 0xB6, 0xEE, 0xF2, 0xA8, 0xF2, 0xA9, /* 0x9C-0x9F */ + 0xB5, 0xDF, 0xF2, 0xAA, 0xF2, 0xAB, 0xEF, 0x4B, /* 0xA0-0xA3 */ + 0xB2, 0xFC, 0xF2, 0xAC, 0xF2, 0xAD, 0xC8, 0xA7, /* 0xA4-0xA7 */ + 0xEF, 0x4C, 0xEF, 0x4D, 0xEF, 0x4E, 0xEF, 0x4F, /* 0xA8-0xAB */ + 0xEF, 0x50, 0xEF, 0x51, 0xEF, 0x52, 0xEF, 0x53, /* 0xAC-0xAF */ + 0xEF, 0x54, 0xEF, 0x55, 0xEF, 0x56, 0xEF, 0x57, /* 0xB0-0xB3 */ + 0xEF, 0x58, 0xEF, 0x59, 0xEF, 0x5A, 0xEF, 0x5B, /* 0xB4-0xB7 */ + 0xEF, 0x5C, 0xEF, 0x5D, 0xEF, 0x5E, 0xEF, 0x5F, /* 0xB8-0xBB */ + 0xEF, 0x60, 0xEF, 0x61, 0xEF, 0x62, 0xEF, 0x63, /* 0xBC-0xBF */ + 0xEF, 0x64, 0xEF, 0x65, 0xEF, 0x66, 0xEF, 0x67, /* 0xC0-0xC3 */ + 0xEF, 0x68, 0xEF, 0x69, 0xEF, 0x6A, 0xEF, 0x6B, /* 0xC4-0xC7 */ + 0xEF, 0x6C, 0xEF, 0x6D, 0xEF, 0x6E, 0xEF, 0x6F, /* 0xC8-0xCB */ + 0xEF, 0x70, 0xEF, 0x71, 0xB7, 0xE7, 0xEF, 0x72, /* 0xCC-0xCF */ + 0xEF, 0x73, 0xEC, 0xA9, 0xEC, 0xAA, 0xEC, 0xAB, /* 0xD0-0xD3 */ + 0xEF, 0x74, 0xEC, 0xAC, 0xEF, 0x75, 0xEF, 0x76, /* 0xD4-0xD7 */ + 0xC6, 0xAE, 0xEC, 0xAD, 0xEC, 0xAE, 0xEF, 0x77, /* 0xD8-0xDB */ + 0xEF, 0x78, 0xEF, 0x79, 0xB7, 0xC9, 0xCA, 0xB3, /* 0xDC-0xDF */ + 0xEF, 0x7A, 0xEF, 0x7B, 0xEF, 0x7C, 0xEF, 0x7D, /* 0xE0-0xE3 */ + 0xEF, 0x7E, 0xEF, 0x80, 0xEF, 0x81, 0xE2, 0xB8, /* 0xE4-0xE7 */ + 0xF7, 0xCF, 0xEF, 0x82, 0xEF, 0x83, 0xEF, 0x84, /* 0xE8-0xEB */ + 0xEF, 0x85, 0xEF, 0x86, 0xEF, 0x87, 0xEF, 0x88, /* 0xEC-0xEF */ + 0xEF, 0x89, 0xEF, 0x8A, 0xEF, 0x8B, 0xEF, 0x8C, /* 0xF0-0xF3 */ + 0xEF, 0x8D, 0xEF, 0x8E, 0xEF, 0x8F, 0xEF, 0x90, /* 0xF4-0xF7 */ + 0xEF, 0x91, 0xEF, 0x92, 0xEF, 0x93, 0xEF, 0x94, /* 0xF8-0xFB */ + 0xEF, 0x95, 0xEF, 0x96, 0xEF, 0x97, 0xEF, 0x98, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0xEF, 0x99, 0xEF, 0x9A, 0xEF, 0x9B, 0xEF, 0x9C, /* 0x00-0x03 */ + 0xEF, 0x9D, 0xEF, 0x9E, 0xEF, 0x9F, 0xEF, 0xA0, /* 0x04-0x07 */ + 0xF0, 0x40, 0xF0, 0x41, 0xF0, 0x42, 0xF0, 0x43, /* 0x08-0x0B */ + 0xF0, 0x44, 0xF7, 0xD0, 0xF0, 0x45, 0xF0, 0x46, /* 0x0C-0x0F */ + 0xB2, 0xCD, 0xF0, 0x47, 0xF0, 0x48, 0xF0, 0x49, /* 0x10-0x13 */ + 0xF0, 0x4A, 0xF0, 0x4B, 0xF0, 0x4C, 0xF0, 0x4D, /* 0x14-0x17 */ + 0xF0, 0x4E, 0xF0, 0x4F, 0xF0, 0x50, 0xF0, 0x51, /* 0x18-0x1B */ + 0xF0, 0x52, 0xF0, 0x53, 0xF0, 0x54, 0xF0, 0x55, /* 0x1C-0x1F */ + 0xF0, 0x56, 0xF0, 0x57, 0xF0, 0x58, 0xF0, 0x59, /* 0x20-0x23 */ + 0xF0, 0x5A, 0xF0, 0x5B, 0xF0, 0x5C, 0xF0, 0x5D, /* 0x24-0x27 */ + 0xF0, 0x5E, 0xF0, 0x5F, 0xF0, 0x60, 0xF0, 0x61, /* 0x28-0x2B */ + 0xF0, 0x62, 0xF0, 0x63, 0xF7, 0xD1, 0xF0, 0x64, /* 0x2C-0x2F */ + 0xF0, 0x65, 0xF0, 0x66, 0xF0, 0x67, 0xF0, 0x68, /* 0x30-0x33 */ + 0xF0, 0x69, 0xF0, 0x6A, 0xF0, 0x6B, 0xF0, 0x6C, /* 0x34-0x37 */ + 0xF0, 0x6D, 0xF0, 0x6E, 0xF0, 0x6F, 0xF0, 0x70, /* 0x38-0x3B */ + 0xF0, 0x71, 0xF0, 0x72, 0xF0, 0x73, 0xF0, 0x74, /* 0x3C-0x3F */ + 0xF0, 0x75, 0xF0, 0x76, 0xF0, 0x77, 0xF0, 0x78, /* 0x40-0x43 */ + 0xF0, 0x79, 0xF0, 0x7A, 0xF0, 0x7B, 0xF0, 0x7C, /* 0x44-0x47 */ + 0xF0, 0x7D, 0xF0, 0x7E, 0xF0, 0x80, 0xF0, 0x81, /* 0x48-0x4B */ + 0xF0, 0x82, 0xF0, 0x83, 0xF0, 0x84, 0xF0, 0x85, /* 0x4C-0x4F */ + 0xF0, 0x86, 0xF0, 0x87, 0xF0, 0x88, 0xF0, 0x89, /* 0x50-0x53 */ + 0xF7, 0xD3, 0xF7, 0xD2, 0xF0, 0x8A, 0xF0, 0x8B, /* 0x54-0x57 */ + 0xF0, 0x8C, 0xF0, 0x8D, 0xF0, 0x8E, 0xF0, 0x8F, /* 0x58-0x5B */ + 0xF0, 0x90, 0xF0, 0x91, 0xF0, 0x92, 0xF0, 0x93, /* 0x5C-0x5F */ + 0xF0, 0x94, 0xF0, 0x95, 0xF0, 0x96, 0xE2, 0xBB, /* 0x60-0x63 */ + 0xF0, 0x97, 0xBC, 0xA2, 0xF0, 0x98, 0xE2, 0xBC, /* 0x64-0x67 */ + 0xE2, 0xBD, 0xE2, 0xBE, 0xE2, 0xBF, 0xE2, 0xC0, /* 0x68-0x6B */ + 0xE2, 0xC1, 0xB7, 0xB9, 0xD2, 0xFB, 0xBD, 0xA4, /* 0x6C-0x6F */ + 0xCA, 0xCE, 0xB1, 0xA5, 0xCB, 0xC7, 0xF0, 0x99, /* 0x70-0x73 */ + 0xE2, 0xC2, 0xB6, 0xFC, 0xC8, 0xC4, 0xE2, 0xC3, /* 0x74-0x77 */ + 0xF0, 0x9A, 0xF0, 0x9B, 0xBD, 0xC8, 0xF0, 0x9C, /* 0x78-0x7B */ + 0xB1, 0xFD, 0xE2, 0xC4, 0xF0, 0x9D, 0xB6, 0xF6, /* 0x7C-0x7F */ + + 0xE2, 0xC5, 0xC4, 0xD9, 0xF0, 0x9E, 0xF0, 0x9F, /* 0x80-0x83 */ + 0xE2, 0xC6, 0xCF, 0xDA, 0xB9, 0xDD, 0xE2, 0xC7, /* 0x84-0x87 */ + 0xC0, 0xA1, 0xF0, 0xA0, 0xE2, 0xC8, 0xB2, 0xF6, /* 0x88-0x8B */ + 0xF1, 0x40, 0xE2, 0xC9, 0xF1, 0x41, 0xC1, 0xF3, /* 0x8C-0x8F */ + 0xE2, 0xCA, 0xE2, 0xCB, 0xC2, 0xF8, 0xE2, 0xCC, /* 0x90-0x93 */ + 0xE2, 0xCD, 0xE2, 0xCE, 0xCA, 0xD7, 0xD8, 0xB8, /* 0x94-0x97 */ + 0xD9, 0xE5, 0xCF, 0xE3, 0xF1, 0x42, 0xF1, 0x43, /* 0x98-0x9B */ + 0xF1, 0x44, 0xF1, 0x45, 0xF1, 0x46, 0xF1, 0x47, /* 0x9C-0x9F */ + 0xF1, 0x48, 0xF1, 0x49, 0xF1, 0x4A, 0xF1, 0x4B, /* 0xA0-0xA3 */ + 0xF1, 0x4C, 0xF0, 0xA5, 0xF1, 0x4D, 0xF1, 0x4E, /* 0xA4-0xA7 */ + 0xDC, 0xB0, 0xF1, 0x4F, 0xF1, 0x50, 0xF1, 0x51, /* 0xA8-0xAB */ + 0xF1, 0x52, 0xF1, 0x53, 0xF1, 0x54, 0xF1, 0x55, /* 0xAC-0xAF */ + 0xF1, 0x56, 0xF1, 0x57, 0xF1, 0x58, 0xF1, 0x59, /* 0xB0-0xB3 */ + 0xF1, 0x5A, 0xF1, 0x5B, 0xF1, 0x5C, 0xF1, 0x5D, /* 0xB4-0xB7 */ + 0xF1, 0x5E, 0xF1, 0x5F, 0xF1, 0x60, 0xF1, 0x61, /* 0xB8-0xBB */ + 0xF1, 0x62, 0xF1, 0x63, 0xF1, 0x64, 0xF1, 0x65, /* 0xBC-0xBF */ + 0xF1, 0x66, 0xF1, 0x67, 0xF1, 0x68, 0xF1, 0x69, /* 0xC0-0xC3 */ + 0xF1, 0x6A, 0xF1, 0x6B, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xC4-0xC7 */ + 0xF1, 0x6E, 0xF1, 0x6F, 0xF1, 0x70, 0xF1, 0x71, /* 0xC8-0xCB */ + 0xF1, 0x72, 0xF1, 0x73, 0xF1, 0x74, 0xF1, 0x75, /* 0xCC-0xCF */ + 0xF1, 0x76, 0xF1, 0x77, 0xF1, 0x78, 0xF1, 0x79, /* 0xD0-0xD3 */ + 0xF1, 0x7A, 0xF1, 0x7B, 0xF1, 0x7C, 0xF1, 0x7D, /* 0xD4-0xD7 */ + 0xF1, 0x7E, 0xF1, 0x80, 0xF1, 0x81, 0xF1, 0x82, /* 0xD8-0xDB */ + 0xF1, 0x83, 0xF1, 0x84, 0xF1, 0x85, 0xF1, 0x86, /* 0xDC-0xDF */ + 0xF1, 0x87, 0xF1, 0x88, 0xF1, 0x89, 0xF1, 0x8A, /* 0xE0-0xE3 */ + 0xF1, 0x8B, 0xF1, 0x8C, 0xF1, 0x8D, 0xF1, 0x8E, /* 0xE4-0xE7 */ + 0xF1, 0x8F, 0xF1, 0x90, 0xF1, 0x91, 0xF1, 0x92, /* 0xE8-0xEB */ + 0xF1, 0x93, 0xF1, 0x94, 0xF1, 0x95, 0xF1, 0x96, /* 0xEC-0xEF */ + 0xF1, 0x97, 0xF1, 0x98, 0xF1, 0x99, 0xF1, 0x9A, /* 0xF0-0xF3 */ + 0xF1, 0x9B, 0xF1, 0x9C, 0xF1, 0x9D, 0xF1, 0x9E, /* 0xF4-0xF7 */ + 0xF1, 0x9F, 0xF1, 0xA0, 0xF2, 0x40, 0xF2, 0x41, /* 0xF8-0xFB */ + 0xF2, 0x42, 0xF2, 0x43, 0xF2, 0x44, 0xF2, 0x45, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0xF2, 0x46, 0xF2, 0x47, 0xF2, 0x48, 0xF2, 0x49, /* 0x00-0x03 */ + 0xF2, 0x4A, 0xF2, 0x4B, 0xF2, 0x4C, 0xF2, 0x4D, /* 0x04-0x07 */ + 0xF2, 0x4E, 0xF2, 0x4F, 0xF2, 0x50, 0xF2, 0x51, /* 0x08-0x0B */ + 0xF2, 0x52, 0xF2, 0x53, 0xF2, 0x54, 0xF2, 0x55, /* 0x0C-0x0F */ + 0xF2, 0x56, 0xF2, 0x57, 0xF2, 0x58, 0xF2, 0x59, /* 0x10-0x13 */ + 0xF2, 0x5A, 0xF2, 0x5B, 0xF2, 0x5C, 0xF2, 0x5D, /* 0x14-0x17 */ + 0xF2, 0x5E, 0xF2, 0x5F, 0xF2, 0x60, 0xF2, 0x61, /* 0x18-0x1B */ + 0xF2, 0x62, 0xF2, 0x63, 0xF2, 0x64, 0xF2, 0x65, /* 0x1C-0x1F */ + 0xF2, 0x66, 0xF2, 0x67, 0xF2, 0x68, 0xF2, 0x69, /* 0x20-0x23 */ + 0xF2, 0x6A, 0xF2, 0x6B, 0xF2, 0x6C, 0xF2, 0x6D, /* 0x24-0x27 */ + 0xF2, 0x6E, 0xF2, 0x6F, 0xF2, 0x70, 0xF2, 0x71, /* 0x28-0x2B */ + 0xF2, 0x72, 0xF2, 0x73, 0xF2, 0x74, 0xF2, 0x75, /* 0x2C-0x2F */ + 0xF2, 0x76, 0xF2, 0x77, 0xF2, 0x78, 0xF2, 0x79, /* 0x30-0x33 */ + 0xF2, 0x7A, 0xF2, 0x7B, 0xF2, 0x7C, 0xF2, 0x7D, /* 0x34-0x37 */ + 0xF2, 0x7E, 0xF2, 0x80, 0xF2, 0x81, 0xF2, 0x82, /* 0x38-0x3B */ + 0xF2, 0x83, 0xF2, 0x84, 0xF2, 0x85, 0xF2, 0x86, /* 0x3C-0x3F */ + 0xF2, 0x87, 0xF2, 0x88, 0xF2, 0x89, 0xF2, 0x8A, /* 0x40-0x43 */ + 0xF2, 0x8B, 0xF2, 0x8C, 0xF2, 0x8D, 0xF2, 0x8E, /* 0x44-0x47 */ + 0xF2, 0x8F, 0xF2, 0x90, 0xF2, 0x91, 0xF2, 0x92, /* 0x48-0x4B */ + 0xF2, 0x93, 0xF2, 0x94, 0xF2, 0x95, 0xF2, 0x96, /* 0x4C-0x4F */ + 0xF2, 0x97, 0xF2, 0x98, 0xF2, 0x99, 0xF2, 0x9A, /* 0x50-0x53 */ + 0xF2, 0x9B, 0xF2, 0x9C, 0xF2, 0x9D, 0xF2, 0x9E, /* 0x54-0x57 */ + 0xF2, 0x9F, 0xF2, 0xA0, 0xF3, 0x40, 0xF3, 0x41, /* 0x58-0x5B */ + 0xF3, 0x42, 0xF3, 0x43, 0xF3, 0x44, 0xF3, 0x45, /* 0x5C-0x5F */ + 0xF3, 0x46, 0xF3, 0x47, 0xF3, 0x48, 0xF3, 0x49, /* 0x60-0x63 */ + 0xF3, 0x4A, 0xF3, 0x4B, 0xF3, 0x4C, 0xF3, 0x4D, /* 0x64-0x67 */ + 0xF3, 0x4E, 0xF3, 0x4F, 0xF3, 0x50, 0xF3, 0x51, /* 0x68-0x6B */ + 0xC2, 0xED, 0xD4, 0xA6, 0xCD, 0xD4, 0xD1, 0xB1, /* 0x6C-0x6F */ + 0xB3, 0xDB, 0xC7, 0xFD, 0xF3, 0x52, 0xB2, 0xB5, /* 0x70-0x73 */ + 0xC2, 0xBF, 0xE6, 0xE0, 0xCA, 0xBB, 0xE6, 0xE1, /* 0x74-0x77 */ + 0xE6, 0xE2, 0xBE, 0xD4, 0xE6, 0xE3, 0xD7, 0xA4, /* 0x78-0x7B */ + 0xCD, 0xD5, 0xE6, 0xE5, 0xBC, 0xDD, 0xE6, 0xE4, /* 0x7C-0x7F */ + + 0xE6, 0xE6, 0xE6, 0xE7, 0xC2, 0xEE, 0xF3, 0x53, /* 0x80-0x83 */ + 0xBD, 0xBE, 0xE6, 0xE8, 0xC2, 0xE6, 0xBA, 0xA7, /* 0x84-0x87 */ + 0xE6, 0xE9, 0xF3, 0x54, 0xE6, 0xEA, 0xB3, 0xD2, /* 0x88-0x8B */ + 0xD1, 0xE9, 0xF3, 0x55, 0xF3, 0x56, 0xBF, 0xA5, /* 0x8C-0x8F */ + 0xE6, 0xEB, 0xC6, 0xEF, 0xE6, 0xEC, 0xE6, 0xED, /* 0x90-0x93 */ + 0xF3, 0x57, 0xF3, 0x58, 0xE6, 0xEE, 0xC6, 0xAD, /* 0x94-0x97 */ + 0xE6, 0xEF, 0xF3, 0x59, 0xC9, 0xA7, 0xE6, 0xF0, /* 0x98-0x9B */ + 0xE6, 0xF1, 0xE6, 0xF2, 0xE5, 0xB9, 0xE6, 0xF3, /* 0x9C-0x9F */ + 0xE6, 0xF4, 0xC2, 0xE2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */ + 0xD6, 0xE8, 0xE6, 0xF7, 0xF3, 0x5A, 0xE6, 0xF8, /* 0xA4-0xA7 */ + 0xB9, 0xC7, 0xF3, 0x5B, 0xF3, 0x5C, 0xF3, 0x5D, /* 0xA8-0xAB */ + 0xF3, 0x5E, 0xF3, 0x5F, 0xF3, 0x60, 0xF3, 0x61, /* 0xAC-0xAF */ + 0xF7, 0xBB, 0xF7, 0xBA, 0xF3, 0x62, 0xF3, 0x63, /* 0xB0-0xB3 */ + 0xF3, 0x64, 0xF3, 0x65, 0xF7, 0xBE, 0xF7, 0xBC, /* 0xB4-0xB7 */ + 0xBA, 0xA1, 0xF3, 0x66, 0xF7, 0xBF, 0xF3, 0x67, /* 0xB8-0xBB */ + 0xF7, 0xC0, 0xF3, 0x68, 0xF3, 0x69, 0xF3, 0x6A, /* 0xBC-0xBF */ + 0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xC4, 0xF3, 0x6B, /* 0xC0-0xC3 */ + 0xF3, 0x6C, 0xF7, 0xC3, 0xF3, 0x6D, 0xF3, 0x6E, /* 0xC4-0xC7 */ + 0xF3, 0x6F, 0xF3, 0x70, 0xF3, 0x71, 0xF7, 0xC5, /* 0xC8-0xCB */ + 0xF7, 0xC6, 0xF3, 0x72, 0xF3, 0x73, 0xF3, 0x74, /* 0xCC-0xCF */ + 0xF3, 0x75, 0xF7, 0xC7, 0xF3, 0x76, 0xCB, 0xE8, /* 0xD0-0xD3 */ + 0xF3, 0x77, 0xF3, 0x78, 0xF3, 0x79, 0xF3, 0x7A, /* 0xD4-0xD7 */ + 0xB8, 0xDF, 0xF3, 0x7B, 0xF3, 0x7C, 0xF3, 0x7D, /* 0xD8-0xDB */ + 0xF3, 0x7E, 0xF3, 0x80, 0xF3, 0x81, 0xF7, 0xD4, /* 0xDC-0xDF */ + 0xF3, 0x82, 0xF7, 0xD5, 0xF3, 0x83, 0xF3, 0x84, /* 0xE0-0xE3 */ + 0xF3, 0x85, 0xF3, 0x86, 0xF7, 0xD6, 0xF3, 0x87, /* 0xE4-0xE7 */ + 0xF3, 0x88, 0xF3, 0x89, 0xF3, 0x8A, 0xF7, 0xD8, /* 0xE8-0xEB */ + 0xF3, 0x8B, 0xF7, 0xDA, 0xF3, 0x8C, 0xF7, 0xD7, /* 0xEC-0xEF */ + 0xF3, 0x8D, 0xF3, 0x8E, 0xF3, 0x8F, 0xF3, 0x90, /* 0xF0-0xF3 */ + 0xF3, 0x91, 0xF3, 0x92, 0xF3, 0x93, 0xF3, 0x94, /* 0xF4-0xF7 */ + 0xF3, 0x95, 0xF7, 0xDB, 0xF3, 0x96, 0xF7, 0xD9, /* 0xF8-0xFB */ + 0xF3, 0x97, 0xF3, 0x98, 0xF3, 0x99, 0xF3, 0x9A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9B[512] = { + 0xF3, 0x9B, 0xF3, 0x9C, 0xF3, 0x9D, 0xD7, 0xD7, /* 0x00-0x03 */ + 0xF3, 0x9E, 0xF3, 0x9F, 0xF3, 0xA0, 0xF4, 0x40, /* 0x04-0x07 */ + 0xF7, 0xDC, 0xF4, 0x41, 0xF4, 0x42, 0xF4, 0x43, /* 0x08-0x0B */ + 0xF4, 0x44, 0xF4, 0x45, 0xF4, 0x46, 0xF7, 0xDD, /* 0x0C-0x0F */ + 0xF4, 0x47, 0xF4, 0x48, 0xF4, 0x49, 0xF7, 0xDE, /* 0x10-0x13 */ + 0xF4, 0x4A, 0xF4, 0x4B, 0xF4, 0x4C, 0xF4, 0x4D, /* 0x14-0x17 */ + 0xF4, 0x4E, 0xF4, 0x4F, 0xF4, 0x50, 0xF4, 0x51, /* 0x18-0x1B */ + 0xF4, 0x52, 0xF4, 0x53, 0xF4, 0x54, 0xF7, 0xDF, /* 0x1C-0x1F */ + 0xF4, 0x55, 0xF4, 0x56, 0xF4, 0x57, 0xF7, 0xE0, /* 0x20-0x23 */ + 0xF4, 0x58, 0xF4, 0x59, 0xF4, 0x5A, 0xF4, 0x5B, /* 0x24-0x27 */ + 0xF4, 0x5C, 0xF4, 0x5D, 0xF4, 0x5E, 0xF4, 0x5F, /* 0x28-0x2B */ + 0xF4, 0x60, 0xF4, 0x61, 0xF4, 0x62, 0xDB, 0xCB, /* 0x2C-0x2F */ + 0xF4, 0x63, 0xF4, 0x64, 0xD8, 0xAA, 0xF4, 0x65, /* 0x30-0x33 */ + 0xF4, 0x66, 0xF4, 0x67, 0xF4, 0x68, 0xF4, 0x69, /* 0x34-0x37 */ + 0xF4, 0x6A, 0xF4, 0x6B, 0xF4, 0x6C, 0xE5, 0xF7, /* 0x38-0x3B */ + 0xB9, 0xED, 0xF4, 0x6D, 0xF4, 0x6E, 0xF4, 0x6F, /* 0x3C-0x3F */ + 0xF4, 0x70, 0xBF, 0xFD, 0xBB, 0xEA, 0xF7, 0xC9, /* 0x40-0x43 */ + 0xC6, 0xC7, 0xF7, 0xC8, 0xF4, 0x71, 0xF7, 0xCA, /* 0x44-0x47 */ + 0xF7, 0xCC, 0xF7, 0xCB, 0xF4, 0x72, 0xF4, 0x73, /* 0x48-0x4B */ + 0xF4, 0x74, 0xF7, 0xCD, 0xF4, 0x75, 0xCE, 0xBA, /* 0x4C-0x4F */ + 0xF4, 0x76, 0xF7, 0xCE, 0xF4, 0x77, 0xF4, 0x78, /* 0x50-0x53 */ + 0xC4, 0xA7, 0xF4, 0x79, 0xF4, 0x7A, 0xF4, 0x7B, /* 0x54-0x57 */ + 0xF4, 0x7C, 0xF4, 0x7D, 0xF4, 0x7E, 0xF4, 0x80, /* 0x58-0x5B */ + 0xF4, 0x81, 0xF4, 0x82, 0xF4, 0x83, 0xF4, 0x84, /* 0x5C-0x5F */ + 0xF4, 0x85, 0xF4, 0x86, 0xF4, 0x87, 0xF4, 0x88, /* 0x60-0x63 */ + 0xF4, 0x89, 0xF4, 0x8A, 0xF4, 0x8B, 0xF4, 0x8C, /* 0x64-0x67 */ + 0xF4, 0x8D, 0xF4, 0x8E, 0xF4, 0x8F, 0xF4, 0x90, /* 0x68-0x6B */ + 0xF4, 0x91, 0xF4, 0x92, 0xF4, 0x93, 0xF4, 0x94, /* 0x6C-0x6F */ + 0xF4, 0x95, 0xF4, 0x96, 0xF4, 0x97, 0xF4, 0x98, /* 0x70-0x73 */ + 0xF4, 0x99, 0xF4, 0x9A, 0xF4, 0x9B, 0xF4, 0x9C, /* 0x74-0x77 */ + 0xF4, 0x9D, 0xF4, 0x9E, 0xF4, 0x9F, 0xF4, 0xA0, /* 0x78-0x7B */ + 0xF5, 0x40, 0xF5, 0x41, 0xF5, 0x42, 0xF5, 0x43, /* 0x7C-0x7F */ + + 0xF5, 0x44, 0xF5, 0x45, 0xF5, 0x46, 0xF5, 0x47, /* 0x80-0x83 */ + 0xF5, 0x48, 0xF5, 0x49, 0xF5, 0x4A, 0xF5, 0x4B, /* 0x84-0x87 */ + 0xF5, 0x4C, 0xF5, 0x4D, 0xF5, 0x4E, 0xF5, 0x4F, /* 0x88-0x8B */ + 0xF5, 0x50, 0xF5, 0x51, 0xF5, 0x52, 0xF5, 0x53, /* 0x8C-0x8F */ + 0xF5, 0x54, 0xF5, 0x55, 0xF5, 0x56, 0xF5, 0x57, /* 0x90-0x93 */ + 0xF5, 0x58, 0xF5, 0x59, 0xF5, 0x5A, 0xF5, 0x5B, /* 0x94-0x97 */ + 0xF5, 0x5C, 0xF5, 0x5D, 0xF5, 0x5E, 0xF5, 0x5F, /* 0x98-0x9B */ + 0xF5, 0x60, 0xF5, 0x61, 0xF5, 0x62, 0xF5, 0x63, /* 0x9C-0x9F */ + 0xF5, 0x64, 0xF5, 0x65, 0xF5, 0x66, 0xF5, 0x67, /* 0xA0-0xA3 */ + 0xF5, 0x68, 0xF5, 0x69, 0xF5, 0x6A, 0xF5, 0x6B, /* 0xA4-0xA7 */ + 0xF5, 0x6C, 0xF5, 0x6D, 0xF5, 0x6E, 0xF5, 0x6F, /* 0xA8-0xAB */ + 0xF5, 0x70, 0xF5, 0x71, 0xF5, 0x72, 0xF5, 0x73, /* 0xAC-0xAF */ + 0xF5, 0x74, 0xF5, 0x75, 0xF5, 0x76, 0xF5, 0x77, /* 0xB0-0xB3 */ + 0xF5, 0x78, 0xF5, 0x79, 0xF5, 0x7A, 0xF5, 0x7B, /* 0xB4-0xB7 */ + 0xF5, 0x7C, 0xF5, 0x7D, 0xF5, 0x7E, 0xF5, 0x80, /* 0xB8-0xBB */ + 0xF5, 0x81, 0xF5, 0x82, 0xF5, 0x83, 0xF5, 0x84, /* 0xBC-0xBF */ + 0xF5, 0x85, 0xF5, 0x86, 0xF5, 0x87, 0xF5, 0x88, /* 0xC0-0xC3 */ + 0xF5, 0x89, 0xF5, 0x8A, 0xF5, 0x8B, 0xF5, 0x8C, /* 0xC4-0xC7 */ + 0xF5, 0x8D, 0xF5, 0x8E, 0xF5, 0x8F, 0xF5, 0x90, /* 0xC8-0xCB */ + 0xF5, 0x91, 0xF5, 0x92, 0xF5, 0x93, 0xF5, 0x94, /* 0xCC-0xCF */ + 0xF5, 0x95, 0xF5, 0x96, 0xF5, 0x97, 0xF5, 0x98, /* 0xD0-0xD3 */ + 0xF5, 0x99, 0xF5, 0x9A, 0xF5, 0x9B, 0xF5, 0x9C, /* 0xD4-0xD7 */ + 0xF5, 0x9D, 0xF5, 0x9E, 0xF5, 0x9F, 0xF5, 0xA0, /* 0xD8-0xDB */ + 0xF6, 0x40, 0xF6, 0x41, 0xF6, 0x42, 0xF6, 0x43, /* 0xDC-0xDF */ + 0xF6, 0x44, 0xF6, 0x45, 0xF6, 0x46, 0xF6, 0x47, /* 0xE0-0xE3 */ + 0xF6, 0x48, 0xF6, 0x49, 0xF6, 0x4A, 0xF6, 0x4B, /* 0xE4-0xE7 */ + 0xF6, 0x4C, 0xF6, 0x4D, 0xF6, 0x4E, 0xF6, 0x4F, /* 0xE8-0xEB */ + 0xF6, 0x50, 0xF6, 0x51, 0xF6, 0x52, 0xF6, 0x53, /* 0xEC-0xEF */ + 0xF6, 0x54, 0xF6, 0x55, 0xF6, 0x56, 0xF6, 0x57, /* 0xF0-0xF3 */ + 0xF6, 0x58, 0xF6, 0x59, 0xF6, 0x5A, 0xF6, 0x5B, /* 0xF4-0xF7 */ + 0xF6, 0x5C, 0xF6, 0x5D, 0xF6, 0x5E, 0xF6, 0x5F, /* 0xF8-0xFB */ + 0xF6, 0x60, 0xF6, 0x61, 0xF6, 0x62, 0xF6, 0x63, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9C[512] = { + 0xF6, 0x64, 0xF6, 0x65, 0xF6, 0x66, 0xF6, 0x67, /* 0x00-0x03 */ + 0xF6, 0x68, 0xF6, 0x69, 0xF6, 0x6A, 0xF6, 0x6B, /* 0x04-0x07 */ + 0xF6, 0x6C, 0xF6, 0x6D, 0xF6, 0x6E, 0xF6, 0x6F, /* 0x08-0x0B */ + 0xF6, 0x70, 0xF6, 0x71, 0xF6, 0x72, 0xF6, 0x73, /* 0x0C-0x0F */ + 0xF6, 0x74, 0xF6, 0x75, 0xF6, 0x76, 0xF6, 0x77, /* 0x10-0x13 */ + 0xF6, 0x78, 0xF6, 0x79, 0xF6, 0x7A, 0xF6, 0x7B, /* 0x14-0x17 */ + 0xF6, 0x7C, 0xF6, 0x7D, 0xF6, 0x7E, 0xF6, 0x80, /* 0x18-0x1B */ + 0xF6, 0x81, 0xF6, 0x82, 0xF6, 0x83, 0xF6, 0x84, /* 0x1C-0x1F */ + 0xF6, 0x85, 0xF6, 0x86, 0xF6, 0x87, 0xF6, 0x88, /* 0x20-0x23 */ + 0xF6, 0x89, 0xF6, 0x8A, 0xF6, 0x8B, 0xF6, 0x8C, /* 0x24-0x27 */ + 0xF6, 0x8D, 0xF6, 0x8E, 0xF6, 0x8F, 0xF6, 0x90, /* 0x28-0x2B */ + 0xF6, 0x91, 0xF6, 0x92, 0xF6, 0x93, 0xF6, 0x94, /* 0x2C-0x2F */ + 0xF6, 0x95, 0xF6, 0x96, 0xF6, 0x97, 0xF6, 0x98, /* 0x30-0x33 */ + 0xF6, 0x99, 0xF6, 0x9A, 0xF6, 0x9B, 0xF6, 0x9C, /* 0x34-0x37 */ + 0xF6, 0x9D, 0xF6, 0x9E, 0xF6, 0x9F, 0xF6, 0xA0, /* 0x38-0x3B */ + 0xF7, 0x40, 0xF7, 0x41, 0xF7, 0x42, 0xF7, 0x43, /* 0x3C-0x3F */ + 0xF7, 0x44, 0xF7, 0x45, 0xF7, 0x46, 0xF7, 0x47, /* 0x40-0x43 */ + 0xF7, 0x48, 0xF7, 0x49, 0xF7, 0x4A, 0xF7, 0x4B, /* 0x44-0x47 */ + 0xF7, 0x4C, 0xF7, 0x4D, 0xF7, 0x4E, 0xF7, 0x4F, /* 0x48-0x4B */ + 0xF7, 0x50, 0xF7, 0x51, 0xF7, 0x52, 0xF7, 0x53, /* 0x4C-0x4F */ + 0xF7, 0x54, 0xF7, 0x55, 0xF7, 0x56, 0xF7, 0x57, /* 0x50-0x53 */ + 0xF7, 0x58, 0xF7, 0x59, 0xF7, 0x5A, 0xF7, 0x5B, /* 0x54-0x57 */ + 0xF7, 0x5C, 0xF7, 0x5D, 0xF7, 0x5E, 0xF7, 0x5F, /* 0x58-0x5B */ + 0xF7, 0x60, 0xF7, 0x61, 0xF7, 0x62, 0xF7, 0x63, /* 0x5C-0x5F */ + 0xF7, 0x64, 0xF7, 0x65, 0xF7, 0x66, 0xF7, 0x67, /* 0x60-0x63 */ + 0xF7, 0x68, 0xF7, 0x69, 0xF7, 0x6A, 0xF7, 0x6B, /* 0x64-0x67 */ + 0xF7, 0x6C, 0xF7, 0x6D, 0xF7, 0x6E, 0xF7, 0x6F, /* 0x68-0x6B */ + 0xF7, 0x70, 0xF7, 0x71, 0xF7, 0x72, 0xF7, 0x73, /* 0x6C-0x6F */ + 0xF7, 0x74, 0xF7, 0x75, 0xF7, 0x76, 0xF7, 0x77, /* 0x70-0x73 */ + 0xF7, 0x78, 0xF7, 0x79, 0xF7, 0x7A, 0xF7, 0x7B, /* 0x74-0x77 */ + 0xF7, 0x7C, 0xF7, 0x7D, 0xF7, 0x7E, 0xF7, 0x80, /* 0x78-0x7B */ + 0xD3, 0xE3, 0xF7, 0x81, 0xF7, 0x82, 0xF6, 0xCF, /* 0x7C-0x7F */ + + 0xF7, 0x83, 0xC2, 0xB3, 0xF6, 0xD0, 0xF7, 0x84, /* 0x80-0x83 */ + 0xF7, 0x85, 0xF6, 0xD1, 0xF6, 0xD2, 0xF6, 0xD3, /* 0x84-0x87 */ + 0xF6, 0xD4, 0xF7, 0x86, 0xF7, 0x87, 0xF6, 0xD6, /* 0x88-0x8B */ + 0xF7, 0x88, 0xB1, 0xAB, 0xF6, 0xD7, 0xF7, 0x89, /* 0x8C-0x8F */ + 0xF6, 0xD8, 0xF6, 0xD9, 0xF6, 0xDA, 0xF7, 0x8A, /* 0x90-0x93 */ + 0xF6, 0xDB, 0xF6, 0xDC, 0xF7, 0x8B, 0xF7, 0x8C, /* 0x94-0x97 */ + 0xF7, 0x8D, 0xF7, 0x8E, 0xF6, 0xDD, 0xF6, 0xDE, /* 0x98-0x9B */ + 0xCF, 0xCA, 0xF7, 0x8F, 0xF6, 0xDF, 0xF6, 0xE0, /* 0x9C-0x9F */ + 0xF6, 0xE1, 0xF6, 0xE2, 0xF6, 0xE3, 0xF6, 0xE4, /* 0xA0-0xA3 */ + 0xC0, 0xF0, 0xF6, 0xE5, 0xF6, 0xE6, 0xF6, 0xE7, /* 0xA4-0xA7 */ + 0xF6, 0xE8, 0xF6, 0xE9, 0xF7, 0x90, 0xF6, 0xEA, /* 0xA8-0xAB */ + 0xF7, 0x91, 0xF6, 0xEB, 0xF6, 0xEC, 0xF7, 0x92, /* 0xAC-0xAF */ + 0xF6, 0xED, 0xF6, 0xEE, 0xF6, 0xEF, 0xF6, 0xF0, /* 0xB0-0xB3 */ + 0xF6, 0xF1, 0xF6, 0xF2, 0xF6, 0xF3, 0xF6, 0xF4, /* 0xB4-0xB7 */ + 0xBE, 0xA8, 0xF7, 0x93, 0xF6, 0xF5, 0xF6, 0xF6, /* 0xB8-0xBB */ + 0xF6, 0xF7, 0xF6, 0xF8, 0xF7, 0x94, 0xF7, 0x95, /* 0xBC-0xBF */ + 0xF7, 0x96, 0xF7, 0x97, 0xF7, 0x98, 0xC8, 0xFA, /* 0xC0-0xC3 */ + 0xF6, 0xF9, 0xF6, 0xFA, 0xF6, 0xFB, 0xF6, 0xFC, /* 0xC4-0xC7 */ + 0xF7, 0x99, 0xF7, 0x9A, 0xF6, 0xFD, 0xF6, 0xFE, /* 0xC8-0xCB */ + 0xF7, 0xA1, 0xF7, 0xA2, 0xF7, 0xA3, 0xF7, 0xA4, /* 0xCC-0xCF */ + 0xF7, 0xA5, 0xF7, 0x9B, 0xF7, 0x9C, 0xF7, 0xA6, /* 0xD0-0xD3 */ + 0xF7, 0xA7, 0xF7, 0xA8, 0xB1, 0xEE, 0xF7, 0xA9, /* 0xD4-0xD7 */ + 0xF7, 0xAA, 0xF7, 0xAB, 0xF7, 0x9D, 0xF7, 0x9E, /* 0xD8-0xDB */ + 0xF7, 0xAC, 0xF7, 0xAD, 0xC1, 0xDB, 0xF7, 0xAE, /* 0xDC-0xDF */ + 0xF7, 0x9F, 0xF7, 0xA0, 0xF7, 0xAF, 0xF8, 0x40, /* 0xE0-0xE3 */ + 0xF8, 0x41, 0xF8, 0x42, 0xF8, 0x43, 0xF8, 0x44, /* 0xE4-0xE7 */ + 0xF8, 0x45, 0xF8, 0x46, 0xF8, 0x47, 0xF8, 0x48, /* 0xE8-0xEB */ + 0xF8, 0x49, 0xF8, 0x4A, 0xF8, 0x4B, 0xF8, 0x4C, /* 0xEC-0xEF */ + 0xF8, 0x4D, 0xF8, 0x4E, 0xF8, 0x4F, 0xF8, 0x50, /* 0xF0-0xF3 */ + 0xF8, 0x51, 0xF8, 0x52, 0xF8, 0x53, 0xF8, 0x54, /* 0xF4-0xF7 */ + 0xF8, 0x55, 0xF8, 0x56, 0xF8, 0x57, 0xF8, 0x58, /* 0xF8-0xFB */ + 0xF8, 0x59, 0xF8, 0x5A, 0xF8, 0x5B, 0xF8, 0x5C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9D[512] = { + 0xF8, 0x5D, 0xF8, 0x5E, 0xF8, 0x5F, 0xF8, 0x60, /* 0x00-0x03 */ + 0xF8, 0x61, 0xF8, 0x62, 0xF8, 0x63, 0xF8, 0x64, /* 0x04-0x07 */ + 0xF8, 0x65, 0xF8, 0x66, 0xF8, 0x67, 0xF8, 0x68, /* 0x08-0x0B */ + 0xF8, 0x69, 0xF8, 0x6A, 0xF8, 0x6B, 0xF8, 0x6C, /* 0x0C-0x0F */ + 0xF8, 0x6D, 0xF8, 0x6E, 0xF8, 0x6F, 0xF8, 0x70, /* 0x10-0x13 */ + 0xF8, 0x71, 0xF8, 0x72, 0xF8, 0x73, 0xF8, 0x74, /* 0x14-0x17 */ + 0xF8, 0x75, 0xF8, 0x76, 0xF8, 0x77, 0xF8, 0x78, /* 0x18-0x1B */ + 0xF8, 0x79, 0xF8, 0x7A, 0xF8, 0x7B, 0xF8, 0x7C, /* 0x1C-0x1F */ + 0xF8, 0x7D, 0xF8, 0x7E, 0xF8, 0x80, 0xF8, 0x81, /* 0x20-0x23 */ + 0xF8, 0x82, 0xF8, 0x83, 0xF8, 0x84, 0xF8, 0x85, /* 0x24-0x27 */ + 0xF8, 0x86, 0xF8, 0x87, 0xF8, 0x88, 0xF8, 0x89, /* 0x28-0x2B */ + 0xF8, 0x8A, 0xF8, 0x8B, 0xF8, 0x8C, 0xF8, 0x8D, /* 0x2C-0x2F */ + 0xF8, 0x8E, 0xF8, 0x8F, 0xF8, 0x90, 0xF8, 0x91, /* 0x30-0x33 */ + 0xF8, 0x92, 0xF8, 0x93, 0xF8, 0x94, 0xF8, 0x95, /* 0x34-0x37 */ + 0xF8, 0x96, 0xF8, 0x97, 0xF8, 0x98, 0xF8, 0x99, /* 0x38-0x3B */ + 0xF8, 0x9A, 0xF8, 0x9B, 0xF8, 0x9C, 0xF8, 0x9D, /* 0x3C-0x3F */ + 0xF8, 0x9E, 0xF8, 0x9F, 0xF8, 0xA0, 0xF9, 0x40, /* 0x40-0x43 */ + 0xF9, 0x41, 0xF9, 0x42, 0xF9, 0x43, 0xF9, 0x44, /* 0x44-0x47 */ + 0xF9, 0x45, 0xF9, 0x46, 0xF9, 0x47, 0xF9, 0x48, /* 0x48-0x4B */ + 0xF9, 0x49, 0xF9, 0x4A, 0xF9, 0x4B, 0xF9, 0x4C, /* 0x4C-0x4F */ + 0xF9, 0x4D, 0xF9, 0x4E, 0xF9, 0x4F, 0xF9, 0x50, /* 0x50-0x53 */ + 0xF9, 0x51, 0xF9, 0x52, 0xF9, 0x53, 0xF9, 0x54, /* 0x54-0x57 */ + 0xF9, 0x55, 0xF9, 0x56, 0xF9, 0x57, 0xF9, 0x58, /* 0x58-0x5B */ + 0xF9, 0x59, 0xF9, 0x5A, 0xF9, 0x5B, 0xF9, 0x5C, /* 0x5C-0x5F */ + 0xF9, 0x5D, 0xF9, 0x5E, 0xF9, 0x5F, 0xF9, 0x60, /* 0x60-0x63 */ + 0xF9, 0x61, 0xF9, 0x62, 0xF9, 0x63, 0xF9, 0x64, /* 0x64-0x67 */ + 0xF9, 0x65, 0xF9, 0x66, 0xF9, 0x67, 0xF9, 0x68, /* 0x68-0x6B */ + 0xF9, 0x69, 0xF9, 0x6A, 0xF9, 0x6B, 0xF9, 0x6C, /* 0x6C-0x6F */ + 0xF9, 0x6D, 0xF9, 0x6E, 0xF9, 0x6F, 0xF9, 0x70, /* 0x70-0x73 */ + 0xF9, 0x71, 0xF9, 0x72, 0xF9, 0x73, 0xF9, 0x74, /* 0x74-0x77 */ + 0xF9, 0x75, 0xF9, 0x76, 0xF9, 0x77, 0xF9, 0x78, /* 0x78-0x7B */ + 0xF9, 0x79, 0xF9, 0x7A, 0xF9, 0x7B, 0xF9, 0x7C, /* 0x7C-0x7F */ + + 0xF9, 0x7D, 0xF9, 0x7E, 0xF9, 0x80, 0xF9, 0x81, /* 0x80-0x83 */ + 0xF9, 0x82, 0xF9, 0x83, 0xF9, 0x84, 0xF9, 0x85, /* 0x84-0x87 */ + 0xF9, 0x86, 0xF9, 0x87, 0xF9, 0x88, 0xF9, 0x89, /* 0x88-0x8B */ + 0xF9, 0x8A, 0xF9, 0x8B, 0xF9, 0x8C, 0xF9, 0x8D, /* 0x8C-0x8F */ + 0xF9, 0x8E, 0xF9, 0x8F, 0xF9, 0x90, 0xF9, 0x91, /* 0x90-0x93 */ + 0xF9, 0x92, 0xF9, 0x93, 0xF9, 0x94, 0xF9, 0x95, /* 0x94-0x97 */ + 0xF9, 0x96, 0xF9, 0x97, 0xF9, 0x98, 0xF9, 0x99, /* 0x98-0x9B */ + 0xF9, 0x9A, 0xF9, 0x9B, 0xF9, 0x9C, 0xF9, 0x9D, /* 0x9C-0x9F */ + 0xF9, 0x9E, 0xF9, 0x9F, 0xF9, 0xA0, 0xFA, 0x40, /* 0xA0-0xA3 */ + 0xFA, 0x41, 0xFA, 0x42, 0xFA, 0x43, 0xFA, 0x44, /* 0xA4-0xA7 */ + 0xFA, 0x45, 0xFA, 0x46, 0xFA, 0x47, 0xFA, 0x48, /* 0xA8-0xAB */ + 0xFA, 0x49, 0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, /* 0xAC-0xAF */ + 0xFA, 0x4D, 0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, /* 0xB0-0xB3 */ + 0xFA, 0x51, 0xFA, 0x52, 0xFA, 0x53, 0xFA, 0x54, /* 0xB4-0xB7 */ + 0xFA, 0x55, 0xFA, 0x56, 0xFA, 0x57, 0xFA, 0x58, /* 0xB8-0xBB */ + 0xFA, 0x59, 0xFA, 0x5A, 0xFA, 0x5B, 0xFA, 0x5C, /* 0xBC-0xBF */ + 0xFA, 0x5D, 0xFA, 0x5E, 0xFA, 0x5F, 0xFA, 0x60, /* 0xC0-0xC3 */ + 0xFA, 0x61, 0xFA, 0x62, 0xFA, 0x63, 0xFA, 0x64, /* 0xC4-0xC7 */ + 0xFA, 0x65, 0xFA, 0x66, 0xFA, 0x67, 0xFA, 0x68, /* 0xC8-0xCB */ + 0xFA, 0x69, 0xFA, 0x6A, 0xFA, 0x6B, 0xFA, 0x6C, /* 0xCC-0xCF */ + 0xFA, 0x6D, 0xFA, 0x6E, 0xFA, 0x6F, 0xFA, 0x70, /* 0xD0-0xD3 */ + 0xFA, 0x71, 0xFA, 0x72, 0xFA, 0x73, 0xFA, 0x74, /* 0xD4-0xD7 */ + 0xFA, 0x75, 0xFA, 0x76, 0xFA, 0x77, 0xFA, 0x78, /* 0xD8-0xDB */ + 0xFA, 0x79, 0xFA, 0x7A, 0xFA, 0x7B, 0xFA, 0x7C, /* 0xDC-0xDF */ + 0xFA, 0x7D, 0xFA, 0x7E, 0xFA, 0x80, 0xFA, 0x81, /* 0xE0-0xE3 */ + 0xFA, 0x82, 0xFA, 0x83, 0xFA, 0x84, 0xFA, 0x85, /* 0xE4-0xE7 */ + 0xFA, 0x86, 0xFA, 0x87, 0xFA, 0x88, 0xFA, 0x89, /* 0xE8-0xEB */ + 0xFA, 0x8A, 0xFA, 0x8B, 0xFA, 0x8C, 0xFA, 0x8D, /* 0xEC-0xEF */ + 0xFA, 0x8E, 0xFA, 0x8F, 0xFA, 0x90, 0xFA, 0x91, /* 0xF0-0xF3 */ + 0xFA, 0x92, 0xFA, 0x93, 0xFA, 0x94, 0xFA, 0x95, /* 0xF4-0xF7 */ + 0xFA, 0x96, 0xFA, 0x97, 0xFA, 0x98, 0xFA, 0x99, /* 0xF8-0xFB */ + 0xFA, 0x9A, 0xFA, 0x9B, 0xFA, 0x9C, 0xFA, 0x9D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0xFA, 0x9E, 0xFA, 0x9F, 0xFA, 0xA0, 0xFB, 0x40, /* 0x00-0x03 */ + 0xFB, 0x41, 0xFB, 0x42, 0xFB, 0x43, 0xFB, 0x44, /* 0x04-0x07 */ + 0xFB, 0x45, 0xFB, 0x46, 0xFB, 0x47, 0xFB, 0x48, /* 0x08-0x0B */ + 0xFB, 0x49, 0xFB, 0x4A, 0xFB, 0x4B, 0xFB, 0x4C, /* 0x0C-0x0F */ + 0xFB, 0x4D, 0xFB, 0x4E, 0xFB, 0x4F, 0xFB, 0x50, /* 0x10-0x13 */ + 0xFB, 0x51, 0xFB, 0x52, 0xFB, 0x53, 0xFB, 0x54, /* 0x14-0x17 */ + 0xFB, 0x55, 0xFB, 0x56, 0xFB, 0x57, 0xFB, 0x58, /* 0x18-0x1B */ + 0xFB, 0x59, 0xFB, 0x5A, 0xFB, 0x5B, 0xC4, 0xF1, /* 0x1C-0x1F */ + 0xF0, 0xAF, 0xBC, 0xA6, 0xF0, 0xB0, 0xC3, 0xF9, /* 0x20-0x23 */ + 0xFB, 0x5C, 0xC5, 0xB8, 0xD1, 0xBB, 0xFB, 0x5D, /* 0x24-0x27 */ + 0xF0, 0xB1, 0xF0, 0xB2, 0xF0, 0xB3, 0xF0, 0xB4, /* 0x28-0x2B */ + 0xF0, 0xB5, 0xD1, 0xBC, 0xFB, 0x5E, 0xD1, 0xEC, /* 0x2C-0x2F */ + 0xFB, 0x5F, 0xF0, 0xB7, 0xF0, 0xB6, 0xD4, 0xA7, /* 0x30-0x33 */ + 0xFB, 0x60, 0xCD, 0xD2, 0xF0, 0xB8, 0xF0, 0xBA, /* 0x34-0x37 */ + 0xF0, 0xB9, 0xF0, 0xBB, 0xF0, 0xBC, 0xFB, 0x61, /* 0x38-0x3B */ + 0xFB, 0x62, 0xB8, 0xEB, 0xF0, 0xBD, 0xBA, 0xE8, /* 0x3C-0x3F */ + 0xFB, 0x63, 0xF0, 0xBE, 0xF0, 0xBF, 0xBE, 0xE9, /* 0x40-0x43 */ + 0xF0, 0xC0, 0xB6, 0xEC, 0xF0, 0xC1, 0xF0, 0xC2, /* 0x44-0x47 */ + 0xF0, 0xC3, 0xF0, 0xC4, 0xC8, 0xB5, 0xF0, 0xC5, /* 0x48-0x4B */ + 0xF0, 0xC6, 0xFB, 0x64, 0xF0, 0xC7, 0xC5, 0xF4, /* 0x4C-0x4F */ + 0xFB, 0x65, 0xF0, 0xC8, 0xFB, 0x66, 0xFB, 0x67, /* 0x50-0x53 */ + 0xFB, 0x68, 0xF0, 0xC9, 0xFB, 0x69, 0xF0, 0xCA, /* 0x54-0x57 */ + 0xF7, 0xBD, 0xFB, 0x6A, 0xF0, 0xCB, 0xF0, 0xCC, /* 0x58-0x5B */ + 0xF0, 0xCD, 0xFB, 0x6B, 0xF0, 0xCE, 0xFB, 0x6C, /* 0x5C-0x5F */ + 0xFB, 0x6D, 0xFB, 0x6E, 0xFB, 0x6F, 0xF0, 0xCF, /* 0x60-0x63 */ + 0xBA, 0xD7, 0xFB, 0x70, 0xF0, 0xD0, 0xF0, 0xD1, /* 0x64-0x67 */ + 0xF0, 0xD2, 0xF0, 0xD3, 0xF0, 0xD4, 0xF0, 0xD5, /* 0x68-0x6B */ + 0xF0, 0xD6, 0xF0, 0xD8, 0xFB, 0x71, 0xFB, 0x72, /* 0x6C-0x6F */ + 0xD3, 0xA5, 0xF0, 0xD7, 0xFB, 0x73, 0xF0, 0xD9, /* 0x70-0x73 */ + 0xFB, 0x74, 0xFB, 0x75, 0xFB, 0x76, 0xFB, 0x77, /* 0x74-0x77 */ + 0xFB, 0x78, 0xFB, 0x79, 0xFB, 0x7A, 0xFB, 0x7B, /* 0x78-0x7B */ + 0xFB, 0x7C, 0xFB, 0x7D, 0xF5, 0xBA, 0xC2, 0xB9, /* 0x7C-0x7F */ + + 0xFB, 0x7E, 0xFB, 0x80, 0xF7, 0xE4, 0xFB, 0x81, /* 0x80-0x83 */ + 0xFB, 0x82, 0xFB, 0x83, 0xFB, 0x84, 0xF7, 0xE5, /* 0x84-0x87 */ + 0xF7, 0xE6, 0xFB, 0x85, 0xFB, 0x86, 0xF7, 0xE7, /* 0x88-0x8B */ + 0xFB, 0x87, 0xFB, 0x88, 0xFB, 0x89, 0xFB, 0x8A, /* 0x8C-0x8F */ + 0xFB, 0x8B, 0xFB, 0x8C, 0xF7, 0xE8, 0xC2, 0xB4, /* 0x90-0x93 */ + 0xFB, 0x8D, 0xFB, 0x8E, 0xFB, 0x8F, 0xFB, 0x90, /* 0x94-0x97 */ + 0xFB, 0x91, 0xFB, 0x92, 0xFB, 0x93, 0xFB, 0x94, /* 0x98-0x9B */ + 0xFB, 0x95, 0xF7, 0xEA, 0xFB, 0x96, 0xF7, 0xEB, /* 0x9C-0x9F */ + 0xFB, 0x97, 0xFB, 0x98, 0xFB, 0x99, 0xFB, 0x9A, /* 0xA0-0xA3 */ + 0xFB, 0x9B, 0xFB, 0x9C, 0xC2, 0xF3, 0xFB, 0x9D, /* 0xA4-0xA7 */ + 0xFB, 0x9E, 0xFB, 0x9F, 0xFB, 0xA0, 0xFC, 0x40, /* 0xA8-0xAB */ + 0xFC, 0x41, 0xFC, 0x42, 0xFC, 0x43, 0xFC, 0x44, /* 0xAC-0xAF */ + 0xFC, 0x45, 0xFC, 0x46, 0xFC, 0x47, 0xFC, 0x48, /* 0xB0-0xB3 */ + 0xF4, 0xF0, 0xFC, 0x49, 0xFC, 0x4A, 0xFC, 0x4B, /* 0xB4-0xB7 */ + 0xF4, 0xEF, 0xFC, 0x4C, 0xFC, 0x4D, 0xC2, 0xE9, /* 0xB8-0xBB */ + 0xFC, 0x4E, 0xF7, 0xE1, 0xF7, 0xE2, 0xFC, 0x4F, /* 0xBC-0xBF */ + 0xFC, 0x50, 0xFC, 0x51, 0xFC, 0x52, 0xFC, 0x53, /* 0xC0-0xC3 */ + 0xBB, 0xC6, 0xFC, 0x54, 0xFC, 0x55, 0xFC, 0x56, /* 0xC4-0xC7 */ + 0xFC, 0x57, 0xD9, 0xE4, 0xFC, 0x58, 0xFC, 0x59, /* 0xC8-0xCB */ + 0xFC, 0x5A, 0xCA, 0xF2, 0xC0, 0xE8, 0xF0, 0xA4, /* 0xCC-0xCF */ + 0xFC, 0x5B, 0xBA, 0xDA, 0xFC, 0x5C, 0xFC, 0x5D, /* 0xD0-0xD3 */ + 0xC7, 0xAD, 0xFC, 0x5E, 0xFC, 0x5F, 0xFC, 0x60, /* 0xD4-0xD7 */ + 0xC4, 0xAC, 0xFC, 0x61, 0xFC, 0x62, 0xF7, 0xEC, /* 0xD8-0xDB */ + 0xF7, 0xED, 0xF7, 0xEE, 0xFC, 0x63, 0xF7, 0xF0, /* 0xDC-0xDF */ + 0xF7, 0xEF, 0xFC, 0x64, 0xF7, 0xF1, 0xFC, 0x65, /* 0xE0-0xE3 */ + 0xFC, 0x66, 0xF7, 0xF4, 0xFC, 0x67, 0xF7, 0xF3, /* 0xE4-0xE7 */ + 0xFC, 0x68, 0xF7, 0xF2, 0xF7, 0xF5, 0xFC, 0x69, /* 0xE8-0xEB */ + 0xFC, 0x6A, 0xFC, 0x6B, 0xFC, 0x6C, 0xF7, 0xF6, /* 0xEC-0xEF */ + 0xFC, 0x6D, 0xFC, 0x6E, 0xFC, 0x6F, 0xFC, 0x70, /* 0xF0-0xF3 */ + 0xFC, 0x71, 0xFC, 0x72, 0xFC, 0x73, 0xFC, 0x74, /* 0xF4-0xF7 */ + 0xFC, 0x75, 0xED, 0xE9, 0xFC, 0x76, 0xED, 0xEA, /* 0xF8-0xFB */ + 0xED, 0xEB, 0xFC, 0x77, 0xF6, 0xBC, 0xFC, 0x78, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0xFC, 0x79, 0xFC, 0x7A, 0xFC, 0x7B, 0xFC, 0x7C, /* 0x00-0x03 */ + 0xFC, 0x7D, 0xFC, 0x7E, 0xFC, 0x80, 0xFC, 0x81, /* 0x04-0x07 */ + 0xFC, 0x82, 0xFC, 0x83, 0xFC, 0x84, 0xF6, 0xBD, /* 0x08-0x0B */ + 0xFC, 0x85, 0xF6, 0xBE, 0xB6, 0xA6, 0xFC, 0x86, /* 0x0C-0x0F */ + 0xD8, 0xBE, 0xFC, 0x87, 0xFC, 0x88, 0xB9, 0xC4, /* 0x10-0x13 */ + 0xFC, 0x89, 0xFC, 0x8A, 0xFC, 0x8B, 0xD8, 0xBB, /* 0x14-0x17 */ + 0xFC, 0x8C, 0xDC, 0xB1, 0xFC, 0x8D, 0xFC, 0x8E, /* 0x18-0x1B */ + 0xFC, 0x8F, 0xFC, 0x90, 0xFC, 0x91, 0xFC, 0x92, /* 0x1C-0x1F */ + 0xCA, 0xF3, 0xFC, 0x93, 0xF7, 0xF7, 0xFC, 0x94, /* 0x20-0x23 */ + 0xFC, 0x95, 0xFC, 0x96, 0xFC, 0x97, 0xFC, 0x98, /* 0x24-0x27 */ + 0xFC, 0x99, 0xFC, 0x9A, 0xFC, 0x9B, 0xFC, 0x9C, /* 0x28-0x2B */ + 0xF7, 0xF8, 0xFC, 0x9D, 0xFC, 0x9E, 0xF7, 0xF9, /* 0x2C-0x2F */ + 0xFC, 0x9F, 0xFC, 0xA0, 0xFD, 0x40, 0xFD, 0x41, /* 0x30-0x33 */ + 0xFD, 0x42, 0xFD, 0x43, 0xFD, 0x44, 0xF7, 0xFB, /* 0x34-0x37 */ + 0xFD, 0x45, 0xF7, 0xFA, 0xFD, 0x46, 0xB1, 0xC7, /* 0x38-0x3B */ + 0xFD, 0x47, 0xF7, 0xFC, 0xF7, 0xFD, 0xFD, 0x48, /* 0x3C-0x3F */ + 0xFD, 0x49, 0xFD, 0x4A, 0xFD, 0x4B, 0xFD, 0x4C, /* 0x40-0x43 */ + 0xF7, 0xFE, 0xFD, 0x4D, 0xFD, 0x4E, 0xFD, 0x4F, /* 0x44-0x47 */ + 0xFD, 0x50, 0xFD, 0x51, 0xFD, 0x52, 0xFD, 0x53, /* 0x48-0x4B */ + 0xFD, 0x54, 0xFD, 0x55, 0xFD, 0x56, 0xFD, 0x57, /* 0x4C-0x4F */ + 0xC6, 0xEB, 0xEC, 0xB4, 0xFD, 0x58, 0xFD, 0x59, /* 0x50-0x53 */ + 0xFD, 0x5A, 0xFD, 0x5B, 0xFD, 0x5C, 0xFD, 0x5D, /* 0x54-0x57 */ + 0xFD, 0x5E, 0xFD, 0x5F, 0xFD, 0x60, 0xFD, 0x61, /* 0x58-0x5B */ + 0xFD, 0x62, 0xFD, 0x63, 0xFD, 0x64, 0xFD, 0x65, /* 0x5C-0x5F */ + 0xFD, 0x66, 0xFD, 0x67, 0xFD, 0x68, 0xFD, 0x69, /* 0x60-0x63 */ + 0xFD, 0x6A, 0xFD, 0x6B, 0xFD, 0x6C, 0xFD, 0x6D, /* 0x64-0x67 */ + 0xFD, 0x6E, 0xFD, 0x6F, 0xFD, 0x70, 0xFD, 0x71, /* 0x68-0x6B */ + 0xFD, 0x72, 0xFD, 0x73, 0xFD, 0x74, 0xFD, 0x75, /* 0x6C-0x6F */ + 0xFD, 0x76, 0xFD, 0x77, 0xFD, 0x78, 0xFD, 0x79, /* 0x70-0x73 */ + 0xFD, 0x7A, 0xFD, 0x7B, 0xFD, 0x7C, 0xFD, 0x7D, /* 0x74-0x77 */ + 0xFD, 0x7E, 0xFD, 0x80, 0xFD, 0x81, 0xFD, 0x82, /* 0x78-0x7B */ + 0xFD, 0x83, 0xFD, 0x84, 0xFD, 0x85, 0xB3, 0xDD, /* 0x7C-0x7F */ + + 0xF6, 0xB3, 0xFD, 0x86, 0xFD, 0x87, 0xF6, 0xB4, /* 0x80-0x83 */ + 0xC1, 0xE4, 0xF6, 0xB5, 0xF6, 0xB6, 0xF6, 0xB7, /* 0x84-0x87 */ + 0xF6, 0xB8, 0xF6, 0xB9, 0xF6, 0xBA, 0xC8, 0xA3, /* 0x88-0x8B */ + 0xF6, 0xBB, 0xFD, 0x88, 0xFD, 0x89, 0xFD, 0x8A, /* 0x8C-0x8F */ + 0xFD, 0x8B, 0xFD, 0x8C, 0xFD, 0x8D, 0xFD, 0x8E, /* 0x90-0x93 */ + 0xFD, 0x8F, 0xFD, 0x90, 0xFD, 0x91, 0xFD, 0x92, /* 0x94-0x97 */ + 0xFD, 0x93, 0xC1, 0xFA, 0xB9, 0xA8, 0xED, 0xE8, /* 0x98-0x9B */ + 0xFD, 0x94, 0xFD, 0x95, 0xFD, 0x96, 0xB9, 0xEA, /* 0x9C-0x9F */ + 0xD9, 0xDF, 0xFD, 0x97, 0xFD, 0x98, 0xFD, 0x99, /* 0xA0-0xA3 */ + 0xFD, 0x9A, 0xFD, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xD8, 0x4D, 0xB8, 0xFC, 0xDC, 0x87, 0xD9, 0x5A, /* 0x00-0x03 */ + 0xBB, 0xAC, 0xB4, 0xAE, 0xBE, 0xE4, 0xFD, 0x94, /* 0x04-0x07 */ + 0xFD, 0x94, 0xC6, 0xF5, 0xBD, 0xF0, 0xC0, 0xAE, /* 0x08-0x0B */ + 0xC4, 0xCE, 0x91, 0xD0, 0xB0, 0x5D, 0xC1, 0x5F, /* 0x0C-0x0F */ + 0xCC, 0x7D, 0xC2, 0xDD, 0xC2, 0xE3, 0xDF, 0x89, /* 0x10-0x13 */ + 0x98, 0xB7, 0xC2, 0xE5, 0xC0, 0xD3, 0xE7, 0xF3, /* 0x14-0x17 */ + 0xC2, 0xE4, 0xC0, 0xD2, 0xF1, 0x98, 0x81, 0x79, /* 0x18-0x1B */ + 0xC2, 0xD1, 0x99, 0xDA, 0xA0, 0x80, 0xCC, 0x6D, /* 0x1C-0x1F */ + 0xFB, 0x5B, 0x8D, 0xB9, 0x9E, 0x45, 0xCB, 0x7B, /* 0x20-0x23 */ + 0xD2, 0x68, 0xC0, 0xAD, 0xC5, 0x44, 0xCF, 0x9E, /* 0x24-0x27 */ + 0xC0, 0xC8, 0xC0, 0xCA, 0xC0, 0xCB, 0xC0, 0xC7, /* 0x28-0x2B */ + 0xFD, 0x9C, 0x81, 0xED, 0xC0, 0xE4, 0x84, 0xDA, /* 0x2C-0x2F */ + 0x93, 0xEF, 0x99, 0xA9, 0xA0, 0x74, 0xB1, 0x52, /* 0x30-0x33 */ + 0xC0, 0xCF, 0xCC, 0x4A, 0xCC, 0x94, 0xC2, 0xB7, /* 0x34-0x37 */ + 0xC2, 0xB6, 0xF4, 0x94, 0xFA, 0x98, 0xC2, 0xB5, /* 0x38-0x3B */ + 0xB5, 0x93, 0xBE, 0x47, 0xC7, 0x8A, 0xE4, 0x9B, /* 0x3C-0x3F */ + 0xC2, 0xB9, 0xD5, 0x93, 0x89, 0xC5, 0xC5, 0xAA, /* 0x40-0x43 */ + 0xBB, 0x5C, 0xC3, 0x40, 0xC0, 0xCE, 0xC0, 0xDA, /* 0x44-0x47 */ + 0xD9, 0x54, 0xC0, 0xD7, 0x89, 0xBE, 0x8C, 0xD2, /* 0x48-0x4B */ + 0x98, 0xC7, 0x9C, 0x49, 0xC2, 0xA9, 0xC0, 0xDB, /* 0x4C-0x4F */ + 0xBF, 0x7C, 0xC2, 0xAA, 0xC0, 0xD5, 0xC0, 0xDF, /* 0x50-0x53 */ + 0x84, 0x43, 0xC1, 0xE8, 0xB6, 0xA0, 0xBE, 0x63, /* 0x54-0x57 */ + 0xC1, 0xE2, 0xC1, 0xEA, 0xD7, 0x78, 0x92, 0x82, /* 0x58-0x5B */ + 0x98, 0xB7, 0xD6, 0x5A, 0xB5, 0xA4, 0x8C, 0x8E, /* 0x5C-0x5F */ + 0xC5, 0xAD, 0xC2, 0xCA, 0xAE, 0x90, 0xB1, 0xB1, /* 0x60-0x63 */ + 0xB4, 0x91, 0xB1, 0xE3, 0x8F, 0xCD, 0xB2, 0xBB, /* 0x64-0x67 */ + 0xC3, 0xDA, 0x94, 0xB5, 0xCB, 0xF7, 0x85, 0xA2, /* 0x68-0x6B */ + 0xC8, 0xFB, 0xCA, 0xA1, 0xC8, 0x7E, 0xD5, 0x66, /* 0x6C-0x6F */ + 0x9A, 0xA2, 0xB3, 0xBD, 0xC9, 0xF2, 0xCA, 0xB0, /* 0x70-0x73 */ + 0xC8, 0xF4, 0xC2, 0xD3, 0xC2, 0xD4, 0xC1, 0xC1, /* 0x74-0x77 */ + 0x83, 0xC9, 0xFD, 0x9D, 0xC1, 0xBA, 0xBC, 0x5A, /* 0x78-0x7B */ + 0xC1, 0xBC, 0xD5, 0x8F, 0xC1, 0xBF, 0x84, 0xEE, /* 0x7C-0x7F */ + + 0x85, 0xCE, 0xC5, 0xAE, 0x8F, 0x5D, 0xC2, 0xC3, /* 0x80-0x83 */ + 0x9E, 0x56, 0xB5, 0x5A, 0xE9, 0x82, 0xF3, 0x50, /* 0x84-0x87 */ + 0xFB, 0x90, 0xC0, 0xE8, 0xC1, 0xA6, 0x95, 0xD1, /* 0x88-0x8B */ + 0x9A, 0x76, 0xDE, 0x5D, 0xC4, 0xEA, 0x91, 0x7A, /* 0x8C-0x8F */ + 0x91, 0xD9, 0x93, 0xD3, 0x9D, 0x69, 0x9F, 0x92, /* 0x90-0x93 */ + 0xAD, 0x49, 0xFD, 0x9E, 0xBE, 0x9A, 0xC2, 0x93, /* 0x94-0x97 */ + 0xDD, 0x82, 0xC9, 0x8F, 0xDF, 0x42, 0xE5, 0x80, /* 0x98-0x9B */ + 0xC1, 0xD0, 0xC1, 0xD3, 0xD1, 0xCA, 0xC1, 0xD2, /* 0x9C-0x9F */ + 0xC1, 0xD1, 0xD5, 0x66, 0xC1, 0xAE, 0xC4, 0xEE, /* 0xA0-0xA3 */ + 0xC4, 0xED, 0x9A, 0x9A, 0xBA, 0x9F, 0xAB, 0x43, /* 0xA4-0xA7 */ + 0xC1, 0xEE, 0xE0, 0xF2, 0x8C, 0x8E, 0x8E, 0x58, /* 0xA8-0xAB */ + 0xC1, 0xAF, 0xC1, 0xE1, 0xAC, 0x93, 0xC1, 0xE7, /* 0xAC-0xAF */ + 0xF1, 0xF6, 0xE2, 0x8F, 0xC1, 0xE3, 0xEC, 0x60, /* 0xB0-0xB3 */ + 0xEE, 0x49, 0xC0, 0xFD, 0xB6, 0x59, 0xF5, 0xB7, /* 0xB4-0xB7 */ + 0xEB, 0x60, 0x90, 0xBA, 0xC1, 0xCB, 0xC1, 0xC5, /* 0xB8-0xBB */ + 0xE5, 0xBC, 0xC4, 0xF2, 0xC1, 0xCF, 0x98, 0xB7, /* 0xBC-0xBF */ + 0xC1, 0xC7, 0xAF, 0x9F, 0xDE, 0xA4, 0xDF, 0x7C, /* 0xC0-0xC3 */ + 0xFD, 0x88, 0x95, 0x9E, 0xC8, 0xEE, 0x84, 0xA2, /* 0xC4-0xC7 */ + 0x96, 0x83, 0xC1, 0xF8, 0xC1, 0xF7, 0xC1, 0xEF, /* 0xC8-0xCB */ + 0xC1, 0xF0, 0xC1, 0xF4, 0xC1, 0xF2, 0xBC, 0x7E, /* 0xCC-0xCF */ + 0xEE, 0x90, 0xC1, 0xF9, 0xC2, 0xBE, 0xEA, 0x91, /* 0xD0-0xD3 */ + 0x82, 0x90, 0x8D, 0x91, 0x9C, 0x53, 0xDD, 0x86, /* 0xD4-0xD7 */ + 0xC2, 0xC9, 0x90, 0xFC, 0xC0, 0xF5, 0xC2, 0xCA, /* 0xD8-0xDB */ + 0xC2, 0xA1, 0xC0, 0xFB, 0xC0, 0xF4, 0xC2, 0xC4, /* 0xDC-0xDF */ + 0xD2, 0xD7, 0xC0, 0xEE, 0xC0, 0xE6, 0xC4, 0xE0, /* 0xE0-0xE3 */ + 0xC0, 0xED, 0xC1, 0xA1, 0xEE, 0xBE, 0xFD, 0x9F, /* 0xE4-0xE7 */ + 0xD1, 0x65, 0xC0, 0xEF, 0xEB, 0x78, 0xC4, 0xE4, /* 0xE8-0xEB */ + 0xC4, 0xE7, 0xC1, 0xDF, 0x9F, 0xFB, 0xAD, 0x55, /* 0xEC-0xEF */ + 0xCC, 0x41, 0xFD, 0xA0, 0xF7, 0x5B, 0xF7, 0xEB, /* 0xF0-0xF3 */ + 0xC1, 0xD6, 0xC1, 0xDC, 0xC5, 0x52, 0xC1, 0xA2, /* 0xF4-0xF7 */ + 0xF3, 0xD2, 0xC1, 0xA3, 0xA0, 0xEE, 0xD6, 0xCB, /* 0xF8-0xFB */ + 0xD7, 0x52, 0xCA, 0xB2, 0xB2, 0xE8, 0xB4, 0xCC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xC7, 0xD0, 0xB6, 0xC8, 0xCD, 0xD8, 0xCC, 0xC7, /* 0x00-0x03 */ + 0xD5, 0xAC, 0xB6, 0xB4, 0xB1, 0xA9, 0xDD, 0x97, /* 0x04-0x07 */ + 0xD0, 0xD0, 0xBD, 0xB5, 0xD2, 0x8A, 0xC0, 0xAA, /* 0x08-0x0B */ + 0xFE, 0x40, 0xFE, 0x41, 0xFE, 0x42, 0xFE, 0x43, /* 0x0C-0x0F */ + 0x89, 0x56, 0xFE, 0x44, 0xC7, 0xE7, 0xFE, 0x45, /* 0x10-0x13 */ + 0xFE, 0x46, 0x84, 0x44, 0xD8, 0x69, 0xD2, 0xE6, /* 0x14-0x17 */ + 0xFE, 0x47, 0xC9, 0xF1, 0xCF, 0xE9, 0xB8, 0xA3, /* 0x18-0x1B */ + 0xBE, 0xB8, 0xBE, 0xAB, 0xD3, 0xF0, 0xFE, 0x48, /* 0x1C-0x1F */ + 0xFE, 0x49, 0xFE, 0x4A, 0xD6, 0x54, 0xFE, 0x4B, /* 0x20-0x23 */ + 0xFE, 0x4C, 0xD2, 0xDD, 0xB6, 0xBC, 0xFE, 0x4D, /* 0x24-0x27 */ + 0xFE, 0x4E, 0xFE, 0x4F, 0xEF, 0x88, 0xEF, 0x95, /* 0x28-0x2B */ + 0xF0, 0x5E, 0xFA, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FE[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA9, 0x55, 0xA6, 0xF2, 0x00, 0x00, 0xA6, 0xF4, /* 0x30-0x33 */ + 0xA6, 0xF5, 0xA6, 0xE0, 0xA6, 0xE1, 0xA6, 0xF0, /* 0x34-0x37 */ + 0xA6, 0xF1, 0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xEE, /* 0x38-0x3B */ + 0xA6, 0xEF, 0xA6, 0xE6, 0xA6, 0xE7, 0xA6, 0xE4, /* 0x3C-0x3F */ + 0xA6, 0xE5, 0xA6, 0xE8, 0xA6, 0xE9, 0xA6, 0xEA, /* 0x40-0x43 */ + 0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA9, 0x68, 0xA9, 0x69, 0xA9, 0x6A, /* 0x48-0x4B */ + 0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, 0xA9, 0x6E, /* 0x4C-0x4F */ + 0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, 0x00, 0x00, /* 0x50-0x53 */ + 0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0x54-0x57 */ + 0x00, 0x00, 0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, /* 0x58-0x5B */ + 0xA9, 0x79, 0xA9, 0x7A, 0xA9, 0x7B, 0xA9, 0x7C, /* 0x5C-0x5F */ + 0xA9, 0x7D, 0xA9, 0x7E, 0xA9, 0x80, 0xA9, 0x81, /* 0x60-0x63 */ + 0xA9, 0x82, 0xA9, 0x83, 0xA9, 0x84, 0x00, 0x00, /* 0x64-0x67 */ + 0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, 0xA9, 0x88, /* 0x68-0x6B */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */ + 0xA1, 0xE7, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */ + 0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */ + 0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, /* 0x0C-0x0F */ + 0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */ + 0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */ + 0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */ + 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */ + 0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */ + 0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */ + 0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */ + 0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */ + 0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */ + 0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */ + 0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */ + 0xA3, 0xDC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */ + 0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */ + 0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */ + 0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */ + 0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */ + 0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */ + 0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */ + 0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */ + 0xA3, 0xFC, 0xA3, 0xFD, 0xA1, 0xAB, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA1, 0xE9, 0xA1, 0xEA, 0xA9, 0x56, 0xA3, 0xFE, /* 0xE0-0xE3 */ + 0xA9, 0x57, 0xA3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + u2c_00, u2c_01, u2c_02, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, u2c_FE, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + unsigned char out0,out1; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (uni == 0x20ac) {/* Euro symbol.The only exception with a non-ascii unicode */ + out[0] = 0x80; + return 1; + } + + if (ch == 0) { /* handle the U00 plane*/ + /* if (cl == 0) return -EINVAL;*/ /*U0000 is legal in cp936*/ + out0 = u2c_00[cl*2]; + out1 = u2c_00[cl*2+1]; + if (out0 == 0x00 && out1 == 0x00) { + if (cl<0x80) { + out[0] = cl; + return 1; + } + return -EINVAL; + } else { + if (boundlen <= 1) + return -ENAMETOOLONG; + out[0] = out0; + out[1] = out1; + return 2; + } + } + + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen <= 1) + return -ENAMETOOLONG; + out[0] = uni2charset[cl*2]; + out[1] = uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + return 2; + } + else + return -EINVAL; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (boundlen == 1) { + if (rawstring[0]==0x80) { /* Euro symbol.The only exception with a non-ascii unicode */ + *uni = 0x20ac; + } else { + *uni = rawstring[0]; + } + return 1; + } + + ch = rawstring[0]; + cl = rawstring[1]; + + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + n = 2; + } else{ + if (ch==0x80) {/* Euro symbol.The only exception with a non-ascii unicode */ + *uni = 0x20ac; + } else { + *uni = ch; + } + n = 1; + } + return n; +} + +static struct nls_table table = { + .charset = "cp936", + .alias = "gb2312", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp936(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp936(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp936) +module_exit(exit_nls_cp936) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(gb2312); diff --git a/kernel/fs/nls/nls_cp949.c b/kernel/fs/nls/nls_cp949.c new file mode 100644 index 000000000..199171e97 --- /dev/null +++ b/kernel/fs/nls/nls_cp949.c @@ -0,0 +1,13946 @@ +/* + * linux/fs/nls/nls_cp949.c + * + * Charset cp949 translation tables. + * This translation table was generated automatically, the + * original table can be download from the Microsoft website. + * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include +#include +#include +#include +#include + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAC02,0xAC03,0xAC05,0xAC06,0xAC0B,0xAC0C,0xAC0D,/* 0x40-0x47 */ + 0xAC0E,0xAC0F,0xAC18,0xAC1E,0xAC1F,0xAC21,0xAC22,0xAC23,/* 0x48-0x4F */ + 0xAC25,0xAC26,0xAC27,0xAC28,0xAC29,0xAC2A,0xAC2B,0xAC2E,/* 0x50-0x57 */ + 0xAC32,0xAC33,0xAC34,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAC35,0xAC36,0xAC37,0xAC3A,0xAC3B,0xAC3D,0xAC3E,/* 0x60-0x67 */ + 0xAC3F,0xAC41,0xAC42,0xAC43,0xAC44,0xAC45,0xAC46,0xAC47,/* 0x68-0x6F */ + 0xAC48,0xAC49,0xAC4A,0xAC4C,0xAC4E,0xAC4F,0xAC50,0xAC51,/* 0x70-0x77 */ + 0xAC52,0xAC53,0xAC55,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAC56,0xAC57,0xAC59,0xAC5A,0xAC5B,0xAC5D,0xAC5E,/* 0x80-0x87 */ + 0xAC5F,0xAC60,0xAC61,0xAC62,0xAC63,0xAC64,0xAC65,0xAC66,/* 0x88-0x8F */ + 0xAC67,0xAC68,0xAC69,0xAC6A,0xAC6B,0xAC6C,0xAC6D,0xAC6E,/* 0x90-0x97 */ + 0xAC6F,0xAC72,0xAC73,0xAC75,0xAC76,0xAC79,0xAC7B,0xAC7C,/* 0x98-0x9F */ + 0xAC7D,0xAC7E,0xAC7F,0xAC82,0xAC87,0xAC88,0xAC8D,0xAC8E,/* 0xA0-0xA7 */ + 0xAC8F,0xAC91,0xAC92,0xAC93,0xAC95,0xAC96,0xAC97,0xAC98,/* 0xA8-0xAF */ + 0xAC99,0xAC9A,0xAC9B,0xAC9E,0xACA2,0xACA3,0xACA4,0xACA5,/* 0xB0-0xB7 */ + 0xACA6,0xACA7,0xACAB,0xACAD,0xACAE,0xACB1,0xACB2,0xACB3,/* 0xB8-0xBF */ + 0xACB4,0xACB5,0xACB6,0xACB7,0xACBA,0xACBE,0xACBF,0xACC0,/* 0xC0-0xC7 */ + 0xACC2,0xACC3,0xACC5,0xACC6,0xACC7,0xACC9,0xACCA,0xACCB,/* 0xC8-0xCF */ + 0xACCD,0xACCE,0xACCF,0xACD0,0xACD1,0xACD2,0xACD3,0xACD4,/* 0xD0-0xD7 */ + 0xACD6,0xACD8,0xACD9,0xACDA,0xACDB,0xACDC,0xACDD,0xACDE,/* 0xD8-0xDF */ + 0xACDF,0xACE2,0xACE3,0xACE5,0xACE6,0xACE9,0xACEB,0xACED,/* 0xE0-0xE7 */ + 0xACEE,0xACF2,0xACF4,0xACF7,0xACF8,0xACF9,0xACFA,0xACFB,/* 0xE8-0xEF */ + 0xACFE,0xACFF,0xAD01,0xAD02,0xAD03,0xAD05,0xAD07,0xAD08,/* 0xF0-0xF7 */ + 0xAD09,0xAD0A,0xAD0B,0xAD0E,0xAD10,0xAD12,0xAD13,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAD14,0xAD15,0xAD16,0xAD17,0xAD19,0xAD1A,0xAD1B,/* 0x40-0x47 */ + 0xAD1D,0xAD1E,0xAD1F,0xAD21,0xAD22,0xAD23,0xAD24,0xAD25,/* 0x48-0x4F */ + 0xAD26,0xAD27,0xAD28,0xAD2A,0xAD2B,0xAD2E,0xAD2F,0xAD30,/* 0x50-0x57 */ + 0xAD31,0xAD32,0xAD33,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAD36,0xAD37,0xAD39,0xAD3A,0xAD3B,0xAD3D,0xAD3E,/* 0x60-0x67 */ + 0xAD3F,0xAD40,0xAD41,0xAD42,0xAD43,0xAD46,0xAD48,0xAD4A,/* 0x68-0x6F */ + 0xAD4B,0xAD4C,0xAD4D,0xAD4E,0xAD4F,0xAD51,0xAD52,0xAD53,/* 0x70-0x77 */ + 0xAD55,0xAD56,0xAD57,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAD59,0xAD5A,0xAD5B,0xAD5C,0xAD5D,0xAD5E,0xAD5F,/* 0x80-0x87 */ + 0xAD60,0xAD62,0xAD64,0xAD65,0xAD66,0xAD67,0xAD68,0xAD69,/* 0x88-0x8F */ + 0xAD6A,0xAD6B,0xAD6E,0xAD6F,0xAD71,0xAD72,0xAD77,0xAD78,/* 0x90-0x97 */ + 0xAD79,0xAD7A,0xAD7E,0xAD80,0xAD83,0xAD84,0xAD85,0xAD86,/* 0x98-0x9F */ + 0xAD87,0xAD8A,0xAD8B,0xAD8D,0xAD8E,0xAD8F,0xAD91,0xAD92,/* 0xA0-0xA7 */ + 0xAD93,0xAD94,0xAD95,0xAD96,0xAD97,0xAD98,0xAD99,0xAD9A,/* 0xA8-0xAF */ + 0xAD9B,0xAD9E,0xAD9F,0xADA0,0xADA1,0xADA2,0xADA3,0xADA5,/* 0xB0-0xB7 */ + 0xADA6,0xADA7,0xADA8,0xADA9,0xADAA,0xADAB,0xADAC,0xADAD,/* 0xB8-0xBF */ + 0xADAE,0xADAF,0xADB0,0xADB1,0xADB2,0xADB3,0xADB4,0xADB5,/* 0xC0-0xC7 */ + 0xADB6,0xADB8,0xADB9,0xADBA,0xADBB,0xADBC,0xADBD,0xADBE,/* 0xC8-0xCF */ + 0xADBF,0xADC2,0xADC3,0xADC5,0xADC6,0xADC7,0xADC9,0xADCA,/* 0xD0-0xD7 */ + 0xADCB,0xADCC,0xADCD,0xADCE,0xADCF,0xADD2,0xADD4,0xADD5,/* 0xD8-0xDF */ + 0xADD6,0xADD7,0xADD8,0xADD9,0xADDA,0xADDB,0xADDD,0xADDE,/* 0xE0-0xE7 */ + 0xADDF,0xADE1,0xADE2,0xADE3,0xADE5,0xADE6,0xADE7,0xADE8,/* 0xE8-0xEF */ + 0xADE9,0xADEA,0xADEB,0xADEC,0xADED,0xADEE,0xADEF,0xADF0,/* 0xF0-0xF7 */ + 0xADF1,0xADF2,0xADF3,0xADF4,0xADF5,0xADF6,0xADF7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xADFA,0xADFB,0xADFD,0xADFE,0xAE02,0xAE03,0xAE04,/* 0x40-0x47 */ + 0xAE05,0xAE06,0xAE07,0xAE0A,0xAE0C,0xAE0E,0xAE0F,0xAE10,/* 0x48-0x4F */ + 0xAE11,0xAE12,0xAE13,0xAE15,0xAE16,0xAE17,0xAE18,0xAE19,/* 0x50-0x57 */ + 0xAE1A,0xAE1B,0xAE1C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAE1D,0xAE1E,0xAE1F,0xAE20,0xAE21,0xAE22,0xAE23,/* 0x60-0x67 */ + 0xAE24,0xAE25,0xAE26,0xAE27,0xAE28,0xAE29,0xAE2A,0xAE2B,/* 0x68-0x6F */ + 0xAE2C,0xAE2D,0xAE2E,0xAE2F,0xAE32,0xAE33,0xAE35,0xAE36,/* 0x70-0x77 */ + 0xAE39,0xAE3B,0xAE3C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAE3D,0xAE3E,0xAE3F,0xAE42,0xAE44,0xAE47,0xAE48,/* 0x80-0x87 */ + 0xAE49,0xAE4B,0xAE4F,0xAE51,0xAE52,0xAE53,0xAE55,0xAE57,/* 0x88-0x8F */ + 0xAE58,0xAE59,0xAE5A,0xAE5B,0xAE5E,0xAE62,0xAE63,0xAE64,/* 0x90-0x97 */ + 0xAE66,0xAE67,0xAE6A,0xAE6B,0xAE6D,0xAE6E,0xAE6F,0xAE71,/* 0x98-0x9F */ + 0xAE72,0xAE73,0xAE74,0xAE75,0xAE76,0xAE77,0xAE7A,0xAE7E,/* 0xA0-0xA7 */ + 0xAE7F,0xAE80,0xAE81,0xAE82,0xAE83,0xAE86,0xAE87,0xAE88,/* 0xA8-0xAF */ + 0xAE89,0xAE8A,0xAE8B,0xAE8D,0xAE8E,0xAE8F,0xAE90,0xAE91,/* 0xB0-0xB7 */ + 0xAE92,0xAE93,0xAE94,0xAE95,0xAE96,0xAE97,0xAE98,0xAE99,/* 0xB8-0xBF */ + 0xAE9A,0xAE9B,0xAE9C,0xAE9D,0xAE9E,0xAE9F,0xAEA0,0xAEA1,/* 0xC0-0xC7 */ + 0xAEA2,0xAEA3,0xAEA4,0xAEA5,0xAEA6,0xAEA7,0xAEA8,0xAEA9,/* 0xC8-0xCF */ + 0xAEAA,0xAEAB,0xAEAC,0xAEAD,0xAEAE,0xAEAF,0xAEB0,0xAEB1,/* 0xD0-0xD7 */ + 0xAEB2,0xAEB3,0xAEB4,0xAEB5,0xAEB6,0xAEB7,0xAEB8,0xAEB9,/* 0xD8-0xDF */ + 0xAEBA,0xAEBB,0xAEBF,0xAEC1,0xAEC2,0xAEC3,0xAEC5,0xAEC6,/* 0xE0-0xE7 */ + 0xAEC7,0xAEC8,0xAEC9,0xAECA,0xAECB,0xAECE,0xAED2,0xAED3,/* 0xE8-0xEF */ + 0xAED4,0xAED5,0xAED6,0xAED7,0xAEDA,0xAEDB,0xAEDD,0xAEDE,/* 0xF0-0xF7 */ + 0xAEDF,0xAEE0,0xAEE1,0xAEE2,0xAEE3,0xAEE4,0xAEE5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAEE6,0xAEE7,0xAEE9,0xAEEA,0xAEEC,0xAEEE,0xAEEF,/* 0x40-0x47 */ + 0xAEF0,0xAEF1,0xAEF2,0xAEF3,0xAEF5,0xAEF6,0xAEF7,0xAEF9,/* 0x48-0x4F */ + 0xAEFA,0xAEFB,0xAEFD,0xAEFE,0xAEFF,0xAF00,0xAF01,0xAF02,/* 0x50-0x57 */ + 0xAF03,0xAF04,0xAF05,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAF06,0xAF09,0xAF0A,0xAF0B,0xAF0C,0xAF0E,0xAF0F,/* 0x60-0x67 */ + 0xAF11,0xAF12,0xAF13,0xAF14,0xAF15,0xAF16,0xAF17,0xAF18,/* 0x68-0x6F */ + 0xAF19,0xAF1A,0xAF1B,0xAF1C,0xAF1D,0xAF1E,0xAF1F,0xAF20,/* 0x70-0x77 */ + 0xAF21,0xAF22,0xAF23,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAF24,0xAF25,0xAF26,0xAF27,0xAF28,0xAF29,0xAF2A,/* 0x80-0x87 */ + 0xAF2B,0xAF2E,0xAF2F,0xAF31,0xAF33,0xAF35,0xAF36,0xAF37,/* 0x88-0x8F */ + 0xAF38,0xAF39,0xAF3A,0xAF3B,0xAF3E,0xAF40,0xAF44,0xAF45,/* 0x90-0x97 */ + 0xAF46,0xAF47,0xAF4A,0xAF4B,0xAF4C,0xAF4D,0xAF4E,0xAF4F,/* 0x98-0x9F */ + 0xAF51,0xAF52,0xAF53,0xAF54,0xAF55,0xAF56,0xAF57,0xAF58,/* 0xA0-0xA7 */ + 0xAF59,0xAF5A,0xAF5B,0xAF5E,0xAF5F,0xAF60,0xAF61,0xAF62,/* 0xA8-0xAF */ + 0xAF63,0xAF66,0xAF67,0xAF68,0xAF69,0xAF6A,0xAF6B,0xAF6C,/* 0xB0-0xB7 */ + 0xAF6D,0xAF6E,0xAF6F,0xAF70,0xAF71,0xAF72,0xAF73,0xAF74,/* 0xB8-0xBF */ + 0xAF75,0xAF76,0xAF77,0xAF78,0xAF7A,0xAF7B,0xAF7C,0xAF7D,/* 0xC0-0xC7 */ + 0xAF7E,0xAF7F,0xAF81,0xAF82,0xAF83,0xAF85,0xAF86,0xAF87,/* 0xC8-0xCF */ + 0xAF89,0xAF8A,0xAF8B,0xAF8C,0xAF8D,0xAF8E,0xAF8F,0xAF92,/* 0xD0-0xD7 */ + 0xAF93,0xAF94,0xAF96,0xAF97,0xAF98,0xAF99,0xAF9A,0xAF9B,/* 0xD8-0xDF */ + 0xAF9D,0xAF9E,0xAF9F,0xAFA0,0xAFA1,0xAFA2,0xAFA3,0xAFA4,/* 0xE0-0xE7 */ + 0xAFA5,0xAFA6,0xAFA7,0xAFA8,0xAFA9,0xAFAA,0xAFAB,0xAFAC,/* 0xE8-0xEF */ + 0xAFAD,0xAFAE,0xAFAF,0xAFB0,0xAFB1,0xAFB2,0xAFB3,0xAFB4,/* 0xF0-0xF7 */ + 0xAFB5,0xAFB6,0xAFB7,0xAFBA,0xAFBB,0xAFBD,0xAFBE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_85[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAFBF,0xAFC1,0xAFC2,0xAFC3,0xAFC4,0xAFC5,0xAFC6,/* 0x40-0x47 */ + 0xAFCA,0xAFCC,0xAFCF,0xAFD0,0xAFD1,0xAFD2,0xAFD3,0xAFD5,/* 0x48-0x4F */ + 0xAFD6,0xAFD7,0xAFD8,0xAFD9,0xAFDA,0xAFDB,0xAFDD,0xAFDE,/* 0x50-0x57 */ + 0xAFDF,0xAFE0,0xAFE1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAFE2,0xAFE3,0xAFE4,0xAFE5,0xAFE6,0xAFE7,0xAFEA,/* 0x60-0x67 */ + 0xAFEB,0xAFEC,0xAFED,0xAFEE,0xAFEF,0xAFF2,0xAFF3,0xAFF5,/* 0x68-0x6F */ + 0xAFF6,0xAFF7,0xAFF9,0xAFFA,0xAFFB,0xAFFC,0xAFFD,0xAFFE,/* 0x70-0x77 */ + 0xAFFF,0xB002,0xB003,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB005,0xB006,0xB007,0xB008,0xB009,0xB00A,0xB00B,/* 0x80-0x87 */ + 0xB00D,0xB00E,0xB00F,0xB011,0xB012,0xB013,0xB015,0xB016,/* 0x88-0x8F */ + 0xB017,0xB018,0xB019,0xB01A,0xB01B,0xB01E,0xB01F,0xB020,/* 0x90-0x97 */ + 0xB021,0xB022,0xB023,0xB024,0xB025,0xB026,0xB027,0xB029,/* 0x98-0x9F */ + 0xB02A,0xB02B,0xB02C,0xB02D,0xB02E,0xB02F,0xB030,0xB031,/* 0xA0-0xA7 */ + 0xB032,0xB033,0xB034,0xB035,0xB036,0xB037,0xB038,0xB039,/* 0xA8-0xAF */ + 0xB03A,0xB03B,0xB03C,0xB03D,0xB03E,0xB03F,0xB040,0xB041,/* 0xB0-0xB7 */ + 0xB042,0xB043,0xB046,0xB047,0xB049,0xB04B,0xB04D,0xB04F,/* 0xB8-0xBF */ + 0xB050,0xB051,0xB052,0xB056,0xB058,0xB05A,0xB05B,0xB05C,/* 0xC0-0xC7 */ + 0xB05E,0xB05F,0xB060,0xB061,0xB062,0xB063,0xB064,0xB065,/* 0xC8-0xCF */ + 0xB066,0xB067,0xB068,0xB069,0xB06A,0xB06B,0xB06C,0xB06D,/* 0xD0-0xD7 */ + 0xB06E,0xB06F,0xB070,0xB071,0xB072,0xB073,0xB074,0xB075,/* 0xD8-0xDF */ + 0xB076,0xB077,0xB078,0xB079,0xB07A,0xB07B,0xB07E,0xB07F,/* 0xE0-0xE7 */ + 0xB081,0xB082,0xB083,0xB085,0xB086,0xB087,0xB088,0xB089,/* 0xE8-0xEF */ + 0xB08A,0xB08B,0xB08E,0xB090,0xB092,0xB093,0xB094,0xB095,/* 0xF0-0xF7 */ + 0xB096,0xB097,0xB09B,0xB09D,0xB09E,0xB0A3,0xB0A4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_86[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB0A5,0xB0A6,0xB0A7,0xB0AA,0xB0B0,0xB0B2,0xB0B6,/* 0x40-0x47 */ + 0xB0B7,0xB0B9,0xB0BA,0xB0BB,0xB0BD,0xB0BE,0xB0BF,0xB0C0,/* 0x48-0x4F */ + 0xB0C1,0xB0C2,0xB0C3,0xB0C6,0xB0CA,0xB0CB,0xB0CC,0xB0CD,/* 0x50-0x57 */ + 0xB0CE,0xB0CF,0xB0D2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB0D3,0xB0D5,0xB0D6,0xB0D7,0xB0D9,0xB0DA,0xB0DB,/* 0x60-0x67 */ + 0xB0DC,0xB0DD,0xB0DE,0xB0DF,0xB0E1,0xB0E2,0xB0E3,0xB0E4,/* 0x68-0x6F */ + 0xB0E6,0xB0E7,0xB0E8,0xB0E9,0xB0EA,0xB0EB,0xB0EC,0xB0ED,/* 0x70-0x77 */ + 0xB0EE,0xB0EF,0xB0F0,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB0F1,0xB0F2,0xB0F3,0xB0F4,0xB0F5,0xB0F6,0xB0F7,/* 0x80-0x87 */ + 0xB0F8,0xB0F9,0xB0FA,0xB0FB,0xB0FC,0xB0FD,0xB0FE,0xB0FF,/* 0x88-0x8F */ + 0xB100,0xB101,0xB102,0xB103,0xB104,0xB105,0xB106,0xB107,/* 0x90-0x97 */ + 0xB10A,0xB10D,0xB10E,0xB10F,0xB111,0xB114,0xB115,0xB116,/* 0x98-0x9F */ + 0xB117,0xB11A,0xB11E,0xB11F,0xB120,0xB121,0xB122,0xB126,/* 0xA0-0xA7 */ + 0xB127,0xB129,0xB12A,0xB12B,0xB12D,0xB12E,0xB12F,0xB130,/* 0xA8-0xAF */ + 0xB131,0xB132,0xB133,0xB136,0xB13A,0xB13B,0xB13C,0xB13D,/* 0xB0-0xB7 */ + 0xB13E,0xB13F,0xB142,0xB143,0xB145,0xB146,0xB147,0xB149,/* 0xB8-0xBF */ + 0xB14A,0xB14B,0xB14C,0xB14D,0xB14E,0xB14F,0xB152,0xB153,/* 0xC0-0xC7 */ + 0xB156,0xB157,0xB159,0xB15A,0xB15B,0xB15D,0xB15E,0xB15F,/* 0xC8-0xCF */ + 0xB161,0xB162,0xB163,0xB164,0xB165,0xB166,0xB167,0xB168,/* 0xD0-0xD7 */ + 0xB169,0xB16A,0xB16B,0xB16C,0xB16D,0xB16E,0xB16F,0xB170,/* 0xD8-0xDF */ + 0xB171,0xB172,0xB173,0xB174,0xB175,0xB176,0xB177,0xB17A,/* 0xE0-0xE7 */ + 0xB17B,0xB17D,0xB17E,0xB17F,0xB181,0xB183,0xB184,0xB185,/* 0xE8-0xEF */ + 0xB186,0xB187,0xB18A,0xB18C,0xB18E,0xB18F,0xB190,0xB191,/* 0xF0-0xF7 */ + 0xB195,0xB196,0xB197,0xB199,0xB19A,0xB19B,0xB19D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB19E,0xB19F,0xB1A0,0xB1A1,0xB1A2,0xB1A3,0xB1A4,/* 0x40-0x47 */ + 0xB1A5,0xB1A6,0xB1A7,0xB1A9,0xB1AA,0xB1AB,0xB1AC,0xB1AD,/* 0x48-0x4F */ + 0xB1AE,0xB1AF,0xB1B0,0xB1B1,0xB1B2,0xB1B3,0xB1B4,0xB1B5,/* 0x50-0x57 */ + 0xB1B6,0xB1B7,0xB1B8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB1B9,0xB1BA,0xB1BB,0xB1BC,0xB1BD,0xB1BE,0xB1BF,/* 0x60-0x67 */ + 0xB1C0,0xB1C1,0xB1C2,0xB1C3,0xB1C4,0xB1C5,0xB1C6,0xB1C7,/* 0x68-0x6F */ + 0xB1C8,0xB1C9,0xB1CA,0xB1CB,0xB1CD,0xB1CE,0xB1CF,0xB1D1,/* 0x70-0x77 */ + 0xB1D2,0xB1D3,0xB1D5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB1D6,0xB1D7,0xB1D8,0xB1D9,0xB1DA,0xB1DB,0xB1DE,/* 0x80-0x87 */ + 0xB1E0,0xB1E1,0xB1E2,0xB1E3,0xB1E4,0xB1E5,0xB1E6,0xB1E7,/* 0x88-0x8F */ + 0xB1EA,0xB1EB,0xB1ED,0xB1EE,0xB1EF,0xB1F1,0xB1F2,0xB1F3,/* 0x90-0x97 */ + 0xB1F4,0xB1F5,0xB1F6,0xB1F7,0xB1F8,0xB1FA,0xB1FC,0xB1FE,/* 0x98-0x9F */ + 0xB1FF,0xB200,0xB201,0xB202,0xB203,0xB206,0xB207,0xB209,/* 0xA0-0xA7 */ + 0xB20A,0xB20D,0xB20E,0xB20F,0xB210,0xB211,0xB212,0xB213,/* 0xA8-0xAF */ + 0xB216,0xB218,0xB21A,0xB21B,0xB21C,0xB21D,0xB21E,0xB21F,/* 0xB0-0xB7 */ + 0xB221,0xB222,0xB223,0xB224,0xB225,0xB226,0xB227,0xB228,/* 0xB8-0xBF */ + 0xB229,0xB22A,0xB22B,0xB22C,0xB22D,0xB22E,0xB22F,0xB230,/* 0xC0-0xC7 */ + 0xB231,0xB232,0xB233,0xB235,0xB236,0xB237,0xB238,0xB239,/* 0xC8-0xCF */ + 0xB23A,0xB23B,0xB23D,0xB23E,0xB23F,0xB240,0xB241,0xB242,/* 0xD0-0xD7 */ + 0xB243,0xB244,0xB245,0xB246,0xB247,0xB248,0xB249,0xB24A,/* 0xD8-0xDF */ + 0xB24B,0xB24C,0xB24D,0xB24E,0xB24F,0xB250,0xB251,0xB252,/* 0xE0-0xE7 */ + 0xB253,0xB254,0xB255,0xB256,0xB257,0xB259,0xB25A,0xB25B,/* 0xE8-0xEF */ + 0xB25D,0xB25E,0xB25F,0xB261,0xB262,0xB263,0xB264,0xB265,/* 0xF0-0xF7 */ + 0xB266,0xB267,0xB26A,0xB26B,0xB26C,0xB26D,0xB26E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB26F,0xB270,0xB271,0xB272,0xB273,0xB276,0xB277,/* 0x40-0x47 */ + 0xB278,0xB279,0xB27A,0xB27B,0xB27D,0xB27E,0xB27F,0xB280,/* 0x48-0x4F */ + 0xB281,0xB282,0xB283,0xB286,0xB287,0xB288,0xB28A,0xB28B,/* 0x50-0x57 */ + 0xB28C,0xB28D,0xB28E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB28F,0xB292,0xB293,0xB295,0xB296,0xB297,0xB29B,/* 0x60-0x67 */ + 0xB29C,0xB29D,0xB29E,0xB29F,0xB2A2,0xB2A4,0xB2A7,0xB2A8,/* 0x68-0x6F */ + 0xB2A9,0xB2AB,0xB2AD,0xB2AE,0xB2AF,0xB2B1,0xB2B2,0xB2B3,/* 0x70-0x77 */ + 0xB2B5,0xB2B6,0xB2B7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB2B8,0xB2B9,0xB2BA,0xB2BB,0xB2BC,0xB2BD,0xB2BE,/* 0x80-0x87 */ + 0xB2BF,0xB2C0,0xB2C1,0xB2C2,0xB2C3,0xB2C4,0xB2C5,0xB2C6,/* 0x88-0x8F */ + 0xB2C7,0xB2CA,0xB2CB,0xB2CD,0xB2CE,0xB2CF,0xB2D1,0xB2D3,/* 0x90-0x97 */ + 0xB2D4,0xB2D5,0xB2D6,0xB2D7,0xB2DA,0xB2DC,0xB2DE,0xB2DF,/* 0x98-0x9F */ + 0xB2E0,0xB2E1,0xB2E3,0xB2E7,0xB2E9,0xB2EA,0xB2F0,0xB2F1,/* 0xA0-0xA7 */ + 0xB2F2,0xB2F6,0xB2FC,0xB2FD,0xB2FE,0xB302,0xB303,0xB305,/* 0xA8-0xAF */ + 0xB306,0xB307,0xB309,0xB30A,0xB30B,0xB30C,0xB30D,0xB30E,/* 0xB0-0xB7 */ + 0xB30F,0xB312,0xB316,0xB317,0xB318,0xB319,0xB31A,0xB31B,/* 0xB8-0xBF */ + 0xB31D,0xB31E,0xB31F,0xB320,0xB321,0xB322,0xB323,0xB324,/* 0xC0-0xC7 */ + 0xB325,0xB326,0xB327,0xB328,0xB329,0xB32A,0xB32B,0xB32C,/* 0xC8-0xCF */ + 0xB32D,0xB32E,0xB32F,0xB330,0xB331,0xB332,0xB333,0xB334,/* 0xD0-0xD7 */ + 0xB335,0xB336,0xB337,0xB338,0xB339,0xB33A,0xB33B,0xB33C,/* 0xD8-0xDF */ + 0xB33D,0xB33E,0xB33F,0xB340,0xB341,0xB342,0xB343,0xB344,/* 0xE0-0xE7 */ + 0xB345,0xB346,0xB347,0xB348,0xB349,0xB34A,0xB34B,0xB34C,/* 0xE8-0xEF */ + 0xB34D,0xB34E,0xB34F,0xB350,0xB351,0xB352,0xB353,0xB357,/* 0xF0-0xF7 */ + 0xB359,0xB35A,0xB35D,0xB360,0xB361,0xB362,0xB363,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB366,0xB368,0xB36A,0xB36C,0xB36D,0xB36F,0xB372,/* 0x40-0x47 */ + 0xB373,0xB375,0xB376,0xB377,0xB379,0xB37A,0xB37B,0xB37C,/* 0x48-0x4F */ + 0xB37D,0xB37E,0xB37F,0xB382,0xB386,0xB387,0xB388,0xB389,/* 0x50-0x57 */ + 0xB38A,0xB38B,0xB38D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB38E,0xB38F,0xB391,0xB392,0xB393,0xB395,0xB396,/* 0x60-0x67 */ + 0xB397,0xB398,0xB399,0xB39A,0xB39B,0xB39C,0xB39D,0xB39E,/* 0x68-0x6F */ + 0xB39F,0xB3A2,0xB3A3,0xB3A4,0xB3A5,0xB3A6,0xB3A7,0xB3A9,/* 0x70-0x77 */ + 0xB3AA,0xB3AB,0xB3AD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB3AE,0xB3AF,0xB3B0,0xB3B1,0xB3B2,0xB3B3,0xB3B4,/* 0x80-0x87 */ + 0xB3B5,0xB3B6,0xB3B7,0xB3B8,0xB3B9,0xB3BA,0xB3BB,0xB3BC,/* 0x88-0x8F */ + 0xB3BD,0xB3BE,0xB3BF,0xB3C0,0xB3C1,0xB3C2,0xB3C3,0xB3C6,/* 0x90-0x97 */ + 0xB3C7,0xB3C9,0xB3CA,0xB3CD,0xB3CF,0xB3D1,0xB3D2,0xB3D3,/* 0x98-0x9F */ + 0xB3D6,0xB3D8,0xB3DA,0xB3DC,0xB3DE,0xB3DF,0xB3E1,0xB3E2,/* 0xA0-0xA7 */ + 0xB3E3,0xB3E5,0xB3E6,0xB3E7,0xB3E9,0xB3EA,0xB3EB,0xB3EC,/* 0xA8-0xAF */ + 0xB3ED,0xB3EE,0xB3EF,0xB3F0,0xB3F1,0xB3F2,0xB3F3,0xB3F4,/* 0xB0-0xB7 */ + 0xB3F5,0xB3F6,0xB3F7,0xB3F8,0xB3F9,0xB3FA,0xB3FB,0xB3FD,/* 0xB8-0xBF */ + 0xB3FE,0xB3FF,0xB400,0xB401,0xB402,0xB403,0xB404,0xB405,/* 0xC0-0xC7 */ + 0xB406,0xB407,0xB408,0xB409,0xB40A,0xB40B,0xB40C,0xB40D,/* 0xC8-0xCF */ + 0xB40E,0xB40F,0xB411,0xB412,0xB413,0xB414,0xB415,0xB416,/* 0xD0-0xD7 */ + 0xB417,0xB419,0xB41A,0xB41B,0xB41D,0xB41E,0xB41F,0xB421,/* 0xD8-0xDF */ + 0xB422,0xB423,0xB424,0xB425,0xB426,0xB427,0xB42A,0xB42C,/* 0xE0-0xE7 */ + 0xB42D,0xB42E,0xB42F,0xB430,0xB431,0xB432,0xB433,0xB435,/* 0xE8-0xEF */ + 0xB436,0xB437,0xB438,0xB439,0xB43A,0xB43B,0xB43C,0xB43D,/* 0xF0-0xF7 */ + 0xB43E,0xB43F,0xB440,0xB441,0xB442,0xB443,0xB444,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB445,0xB446,0xB447,0xB448,0xB449,0xB44A,0xB44B,/* 0x40-0x47 */ + 0xB44C,0xB44D,0xB44E,0xB44F,0xB452,0xB453,0xB455,0xB456,/* 0x48-0x4F */ + 0xB457,0xB459,0xB45A,0xB45B,0xB45C,0xB45D,0xB45E,0xB45F,/* 0x50-0x57 */ + 0xB462,0xB464,0xB466,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB467,0xB468,0xB469,0xB46A,0xB46B,0xB46D,0xB46E,/* 0x60-0x67 */ + 0xB46F,0xB470,0xB471,0xB472,0xB473,0xB474,0xB475,0xB476,/* 0x68-0x6F */ + 0xB477,0xB478,0xB479,0xB47A,0xB47B,0xB47C,0xB47D,0xB47E,/* 0x70-0x77 */ + 0xB47F,0xB481,0xB482,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB483,0xB484,0xB485,0xB486,0xB487,0xB489,0xB48A,/* 0x80-0x87 */ + 0xB48B,0xB48C,0xB48D,0xB48E,0xB48F,0xB490,0xB491,0xB492,/* 0x88-0x8F */ + 0xB493,0xB494,0xB495,0xB496,0xB497,0xB498,0xB499,0xB49A,/* 0x90-0x97 */ + 0xB49B,0xB49C,0xB49E,0xB49F,0xB4A0,0xB4A1,0xB4A2,0xB4A3,/* 0x98-0x9F */ + 0xB4A5,0xB4A6,0xB4A7,0xB4A9,0xB4AA,0xB4AB,0xB4AD,0xB4AE,/* 0xA0-0xA7 */ + 0xB4AF,0xB4B0,0xB4B1,0xB4B2,0xB4B3,0xB4B4,0xB4B6,0xB4B8,/* 0xA8-0xAF */ + 0xB4BA,0xB4BB,0xB4BC,0xB4BD,0xB4BE,0xB4BF,0xB4C1,0xB4C2,/* 0xB0-0xB7 */ + 0xB4C3,0xB4C5,0xB4C6,0xB4C7,0xB4C9,0xB4CA,0xB4CB,0xB4CC,/* 0xB8-0xBF */ + 0xB4CD,0xB4CE,0xB4CF,0xB4D1,0xB4D2,0xB4D3,0xB4D4,0xB4D6,/* 0xC0-0xC7 */ + 0xB4D7,0xB4D8,0xB4D9,0xB4DA,0xB4DB,0xB4DE,0xB4DF,0xB4E1,/* 0xC8-0xCF */ + 0xB4E2,0xB4E5,0xB4E7,0xB4E8,0xB4E9,0xB4EA,0xB4EB,0xB4EE,/* 0xD0-0xD7 */ + 0xB4F0,0xB4F2,0xB4F3,0xB4F4,0xB4F5,0xB4F6,0xB4F7,0xB4F9,/* 0xD8-0xDF */ + 0xB4FA,0xB4FB,0xB4FC,0xB4FD,0xB4FE,0xB4FF,0xB500,0xB501,/* 0xE0-0xE7 */ + 0xB502,0xB503,0xB504,0xB505,0xB506,0xB507,0xB508,0xB509,/* 0xE8-0xEF */ + 0xB50A,0xB50B,0xB50C,0xB50D,0xB50E,0xB50F,0xB510,0xB511,/* 0xF0-0xF7 */ + 0xB512,0xB513,0xB516,0xB517,0xB519,0xB51A,0xB51D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB51E,0xB51F,0xB520,0xB521,0xB522,0xB523,0xB526,/* 0x40-0x47 */ + 0xB52B,0xB52C,0xB52D,0xB52E,0xB52F,0xB532,0xB533,0xB535,/* 0x48-0x4F */ + 0xB536,0xB537,0xB539,0xB53A,0xB53B,0xB53C,0xB53D,0xB53E,/* 0x50-0x57 */ + 0xB53F,0xB542,0xB546,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB547,0xB548,0xB549,0xB54A,0xB54E,0xB54F,0xB551,/* 0x60-0x67 */ + 0xB552,0xB553,0xB555,0xB556,0xB557,0xB558,0xB559,0xB55A,/* 0x68-0x6F */ + 0xB55B,0xB55E,0xB562,0xB563,0xB564,0xB565,0xB566,0xB567,/* 0x70-0x77 */ + 0xB568,0xB569,0xB56A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB56B,0xB56C,0xB56D,0xB56E,0xB56F,0xB570,0xB571,/* 0x80-0x87 */ + 0xB572,0xB573,0xB574,0xB575,0xB576,0xB577,0xB578,0xB579,/* 0x88-0x8F */ + 0xB57A,0xB57B,0xB57C,0xB57D,0xB57E,0xB57F,0xB580,0xB581,/* 0x90-0x97 */ + 0xB582,0xB583,0xB584,0xB585,0xB586,0xB587,0xB588,0xB589,/* 0x98-0x9F */ + 0xB58A,0xB58B,0xB58C,0xB58D,0xB58E,0xB58F,0xB590,0xB591,/* 0xA0-0xA7 */ + 0xB592,0xB593,0xB594,0xB595,0xB596,0xB597,0xB598,0xB599,/* 0xA8-0xAF */ + 0xB59A,0xB59B,0xB59C,0xB59D,0xB59E,0xB59F,0xB5A2,0xB5A3,/* 0xB0-0xB7 */ + 0xB5A5,0xB5A6,0xB5A7,0xB5A9,0xB5AC,0xB5AD,0xB5AE,0xB5AF,/* 0xB8-0xBF */ + 0xB5B2,0xB5B6,0xB5B7,0xB5B8,0xB5B9,0xB5BA,0xB5BE,0xB5BF,/* 0xC0-0xC7 */ + 0xB5C1,0xB5C2,0xB5C3,0xB5C5,0xB5C6,0xB5C7,0xB5C8,0xB5C9,/* 0xC8-0xCF */ + 0xB5CA,0xB5CB,0xB5CE,0xB5D2,0xB5D3,0xB5D4,0xB5D5,0xB5D6,/* 0xD0-0xD7 */ + 0xB5D7,0xB5D9,0xB5DA,0xB5DB,0xB5DC,0xB5DD,0xB5DE,0xB5DF,/* 0xD8-0xDF */ + 0xB5E0,0xB5E1,0xB5E2,0xB5E3,0xB5E4,0xB5E5,0xB5E6,0xB5E7,/* 0xE0-0xE7 */ + 0xB5E8,0xB5E9,0xB5EA,0xB5EB,0xB5ED,0xB5EE,0xB5EF,0xB5F0,/* 0xE8-0xEF */ + 0xB5F1,0xB5F2,0xB5F3,0xB5F4,0xB5F5,0xB5F6,0xB5F7,0xB5F8,/* 0xF0-0xF7 */ + 0xB5F9,0xB5FA,0xB5FB,0xB5FC,0xB5FD,0xB5FE,0xB5FF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB600,0xB601,0xB602,0xB603,0xB604,0xB605,0xB606,/* 0x40-0x47 */ + 0xB607,0xB608,0xB609,0xB60A,0xB60B,0xB60C,0xB60D,0xB60E,/* 0x48-0x4F */ + 0xB60F,0xB612,0xB613,0xB615,0xB616,0xB617,0xB619,0xB61A,/* 0x50-0x57 */ + 0xB61B,0xB61C,0xB61D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB61E,0xB61F,0xB620,0xB621,0xB622,0xB623,0xB624,/* 0x60-0x67 */ + 0xB626,0xB627,0xB628,0xB629,0xB62A,0xB62B,0xB62D,0xB62E,/* 0x68-0x6F */ + 0xB62F,0xB630,0xB631,0xB632,0xB633,0xB635,0xB636,0xB637,/* 0x70-0x77 */ + 0xB638,0xB639,0xB63A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB63B,0xB63C,0xB63D,0xB63E,0xB63F,0xB640,0xB641,/* 0x80-0x87 */ + 0xB642,0xB643,0xB644,0xB645,0xB646,0xB647,0xB649,0xB64A,/* 0x88-0x8F */ + 0xB64B,0xB64C,0xB64D,0xB64E,0xB64F,0xB650,0xB651,0xB652,/* 0x90-0x97 */ + 0xB653,0xB654,0xB655,0xB656,0xB657,0xB658,0xB659,0xB65A,/* 0x98-0x9F */ + 0xB65B,0xB65C,0xB65D,0xB65E,0xB65F,0xB660,0xB661,0xB662,/* 0xA0-0xA7 */ + 0xB663,0xB665,0xB666,0xB667,0xB669,0xB66A,0xB66B,0xB66C,/* 0xA8-0xAF */ + 0xB66D,0xB66E,0xB66F,0xB670,0xB671,0xB672,0xB673,0xB674,/* 0xB0-0xB7 */ + 0xB675,0xB676,0xB677,0xB678,0xB679,0xB67A,0xB67B,0xB67C,/* 0xB8-0xBF */ + 0xB67D,0xB67E,0xB67F,0xB680,0xB681,0xB682,0xB683,0xB684,/* 0xC0-0xC7 */ + 0xB685,0xB686,0xB687,0xB688,0xB689,0xB68A,0xB68B,0xB68C,/* 0xC8-0xCF */ + 0xB68D,0xB68E,0xB68F,0xB690,0xB691,0xB692,0xB693,0xB694,/* 0xD0-0xD7 */ + 0xB695,0xB696,0xB697,0xB698,0xB699,0xB69A,0xB69B,0xB69E,/* 0xD8-0xDF */ + 0xB69F,0xB6A1,0xB6A2,0xB6A3,0xB6A5,0xB6A6,0xB6A7,0xB6A8,/* 0xE0-0xE7 */ + 0xB6A9,0xB6AA,0xB6AD,0xB6AE,0xB6AF,0xB6B0,0xB6B2,0xB6B3,/* 0xE8-0xEF */ + 0xB6B4,0xB6B5,0xB6B6,0xB6B7,0xB6B8,0xB6B9,0xB6BA,0xB6BB,/* 0xF0-0xF7 */ + 0xB6BC,0xB6BD,0xB6BE,0xB6BF,0xB6C0,0xB6C1,0xB6C2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB6C3,0xB6C4,0xB6C5,0xB6C6,0xB6C7,0xB6C8,0xB6C9,/* 0x40-0x47 */ + 0xB6CA,0xB6CB,0xB6CC,0xB6CD,0xB6CE,0xB6CF,0xB6D0,0xB6D1,/* 0x48-0x4F */ + 0xB6D2,0xB6D3,0xB6D5,0xB6D6,0xB6D7,0xB6D8,0xB6D9,0xB6DA,/* 0x50-0x57 */ + 0xB6DB,0xB6DC,0xB6DD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB6DE,0xB6DF,0xB6E0,0xB6E1,0xB6E2,0xB6E3,0xB6E4,/* 0x60-0x67 */ + 0xB6E5,0xB6E6,0xB6E7,0xB6E8,0xB6E9,0xB6EA,0xB6EB,0xB6EC,/* 0x68-0x6F */ + 0xB6ED,0xB6EE,0xB6EF,0xB6F1,0xB6F2,0xB6F3,0xB6F5,0xB6F6,/* 0x70-0x77 */ + 0xB6F7,0xB6F9,0xB6FA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB6FB,0xB6FC,0xB6FD,0xB6FE,0xB6FF,0xB702,0xB703,/* 0x80-0x87 */ + 0xB704,0xB706,0xB707,0xB708,0xB709,0xB70A,0xB70B,0xB70C,/* 0x88-0x8F */ + 0xB70D,0xB70E,0xB70F,0xB710,0xB711,0xB712,0xB713,0xB714,/* 0x90-0x97 */ + 0xB715,0xB716,0xB717,0xB718,0xB719,0xB71A,0xB71B,0xB71C,/* 0x98-0x9F */ + 0xB71D,0xB71E,0xB71F,0xB720,0xB721,0xB722,0xB723,0xB724,/* 0xA0-0xA7 */ + 0xB725,0xB726,0xB727,0xB72A,0xB72B,0xB72D,0xB72E,0xB731,/* 0xA8-0xAF */ + 0xB732,0xB733,0xB734,0xB735,0xB736,0xB737,0xB73A,0xB73C,/* 0xB0-0xB7 */ + 0xB73D,0xB73E,0xB73F,0xB740,0xB741,0xB742,0xB743,0xB745,/* 0xB8-0xBF */ + 0xB746,0xB747,0xB749,0xB74A,0xB74B,0xB74D,0xB74E,0xB74F,/* 0xC0-0xC7 */ + 0xB750,0xB751,0xB752,0xB753,0xB756,0xB757,0xB758,0xB759,/* 0xC8-0xCF */ + 0xB75A,0xB75B,0xB75C,0xB75D,0xB75E,0xB75F,0xB761,0xB762,/* 0xD0-0xD7 */ + 0xB763,0xB765,0xB766,0xB767,0xB769,0xB76A,0xB76B,0xB76C,/* 0xD8-0xDF */ + 0xB76D,0xB76E,0xB76F,0xB772,0xB774,0xB776,0xB777,0xB778,/* 0xE0-0xE7 */ + 0xB779,0xB77A,0xB77B,0xB77E,0xB77F,0xB781,0xB782,0xB783,/* 0xE8-0xEF */ + 0xB785,0xB786,0xB787,0xB788,0xB789,0xB78A,0xB78B,0xB78E,/* 0xF0-0xF7 */ + 0xB793,0xB794,0xB795,0xB79A,0xB79B,0xB79D,0xB79E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB79F,0xB7A1,0xB7A2,0xB7A3,0xB7A4,0xB7A5,0xB7A6,/* 0x40-0x47 */ + 0xB7A7,0xB7AA,0xB7AE,0xB7AF,0xB7B0,0xB7B1,0xB7B2,0xB7B3,/* 0x48-0x4F */ + 0xB7B6,0xB7B7,0xB7B9,0xB7BA,0xB7BB,0xB7BC,0xB7BD,0xB7BE,/* 0x50-0x57 */ + 0xB7BF,0xB7C0,0xB7C1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB7C2,0xB7C3,0xB7C4,0xB7C5,0xB7C6,0xB7C8,0xB7CA,/* 0x60-0x67 */ + 0xB7CB,0xB7CC,0xB7CD,0xB7CE,0xB7CF,0xB7D0,0xB7D1,0xB7D2,/* 0x68-0x6F */ + 0xB7D3,0xB7D4,0xB7D5,0xB7D6,0xB7D7,0xB7D8,0xB7D9,0xB7DA,/* 0x70-0x77 */ + 0xB7DB,0xB7DC,0xB7DD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB7DE,0xB7DF,0xB7E0,0xB7E1,0xB7E2,0xB7E3,0xB7E4,/* 0x80-0x87 */ + 0xB7E5,0xB7E6,0xB7E7,0xB7E8,0xB7E9,0xB7EA,0xB7EB,0xB7EE,/* 0x88-0x8F */ + 0xB7EF,0xB7F1,0xB7F2,0xB7F3,0xB7F5,0xB7F6,0xB7F7,0xB7F8,/* 0x90-0x97 */ + 0xB7F9,0xB7FA,0xB7FB,0xB7FE,0xB802,0xB803,0xB804,0xB805,/* 0x98-0x9F */ + 0xB806,0xB80A,0xB80B,0xB80D,0xB80E,0xB80F,0xB811,0xB812,/* 0xA0-0xA7 */ + 0xB813,0xB814,0xB815,0xB816,0xB817,0xB81A,0xB81C,0xB81E,/* 0xA8-0xAF */ + 0xB81F,0xB820,0xB821,0xB822,0xB823,0xB826,0xB827,0xB829,/* 0xB0-0xB7 */ + 0xB82A,0xB82B,0xB82D,0xB82E,0xB82F,0xB830,0xB831,0xB832,/* 0xB8-0xBF */ + 0xB833,0xB836,0xB83A,0xB83B,0xB83C,0xB83D,0xB83E,0xB83F,/* 0xC0-0xC7 */ + 0xB841,0xB842,0xB843,0xB845,0xB846,0xB847,0xB848,0xB849,/* 0xC8-0xCF */ + 0xB84A,0xB84B,0xB84C,0xB84D,0xB84E,0xB84F,0xB850,0xB852,/* 0xD0-0xD7 */ + 0xB854,0xB855,0xB856,0xB857,0xB858,0xB859,0xB85A,0xB85B,/* 0xD8-0xDF */ + 0xB85E,0xB85F,0xB861,0xB862,0xB863,0xB865,0xB866,0xB867,/* 0xE0-0xE7 */ + 0xB868,0xB869,0xB86A,0xB86B,0xB86E,0xB870,0xB872,0xB873,/* 0xE8-0xEF */ + 0xB874,0xB875,0xB876,0xB877,0xB879,0xB87A,0xB87B,0xB87D,/* 0xF0-0xF7 */ + 0xB87E,0xB87F,0xB880,0xB881,0xB882,0xB883,0xB884,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB885,0xB886,0xB887,0xB888,0xB889,0xB88A,0xB88B,/* 0x40-0x47 */ + 0xB88C,0xB88E,0xB88F,0xB890,0xB891,0xB892,0xB893,0xB894,/* 0x48-0x4F */ + 0xB895,0xB896,0xB897,0xB898,0xB899,0xB89A,0xB89B,0xB89C,/* 0x50-0x57 */ + 0xB89D,0xB89E,0xB89F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB8A0,0xB8A1,0xB8A2,0xB8A3,0xB8A4,0xB8A5,0xB8A6,/* 0x60-0x67 */ + 0xB8A7,0xB8A9,0xB8AA,0xB8AB,0xB8AC,0xB8AD,0xB8AE,0xB8AF,/* 0x68-0x6F */ + 0xB8B1,0xB8B2,0xB8B3,0xB8B5,0xB8B6,0xB8B7,0xB8B9,0xB8BA,/* 0x70-0x77 */ + 0xB8BB,0xB8BC,0xB8BD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB8BE,0xB8BF,0xB8C2,0xB8C4,0xB8C6,0xB8C7,0xB8C8,/* 0x80-0x87 */ + 0xB8C9,0xB8CA,0xB8CB,0xB8CD,0xB8CE,0xB8CF,0xB8D1,0xB8D2,/* 0x88-0x8F */ + 0xB8D3,0xB8D5,0xB8D6,0xB8D7,0xB8D8,0xB8D9,0xB8DA,0xB8DB,/* 0x90-0x97 */ + 0xB8DC,0xB8DE,0xB8E0,0xB8E2,0xB8E3,0xB8E4,0xB8E5,0xB8E6,/* 0x98-0x9F */ + 0xB8E7,0xB8EA,0xB8EB,0xB8ED,0xB8EE,0xB8EF,0xB8F1,0xB8F2,/* 0xA0-0xA7 */ + 0xB8F3,0xB8F4,0xB8F5,0xB8F6,0xB8F7,0xB8FA,0xB8FC,0xB8FE,/* 0xA8-0xAF */ + 0xB8FF,0xB900,0xB901,0xB902,0xB903,0xB905,0xB906,0xB907,/* 0xB0-0xB7 */ + 0xB908,0xB909,0xB90A,0xB90B,0xB90C,0xB90D,0xB90E,0xB90F,/* 0xB8-0xBF */ + 0xB910,0xB911,0xB912,0xB913,0xB914,0xB915,0xB916,0xB917,/* 0xC0-0xC7 */ + 0xB919,0xB91A,0xB91B,0xB91C,0xB91D,0xB91E,0xB91F,0xB921,/* 0xC8-0xCF */ + 0xB922,0xB923,0xB924,0xB925,0xB926,0xB927,0xB928,0xB929,/* 0xD0-0xD7 */ + 0xB92A,0xB92B,0xB92C,0xB92D,0xB92E,0xB92F,0xB930,0xB931,/* 0xD8-0xDF */ + 0xB932,0xB933,0xB934,0xB935,0xB936,0xB937,0xB938,0xB939,/* 0xE0-0xE7 */ + 0xB93A,0xB93B,0xB93E,0xB93F,0xB941,0xB942,0xB943,0xB945,/* 0xE8-0xEF */ + 0xB946,0xB947,0xB948,0xB949,0xB94A,0xB94B,0xB94D,0xB94E,/* 0xF0-0xF7 */ + 0xB950,0xB952,0xB953,0xB954,0xB955,0xB956,0xB957,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB95A,0xB95B,0xB95D,0xB95E,0xB95F,0xB961,0xB962,/* 0x40-0x47 */ + 0xB963,0xB964,0xB965,0xB966,0xB967,0xB96A,0xB96C,0xB96E,/* 0x48-0x4F */ + 0xB96F,0xB970,0xB971,0xB972,0xB973,0xB976,0xB977,0xB979,/* 0x50-0x57 */ + 0xB97A,0xB97B,0xB97D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB97E,0xB97F,0xB980,0xB981,0xB982,0xB983,0xB986,/* 0x60-0x67 */ + 0xB988,0xB98B,0xB98C,0xB98F,0xB990,0xB991,0xB992,0xB993,/* 0x68-0x6F */ + 0xB994,0xB995,0xB996,0xB997,0xB998,0xB999,0xB99A,0xB99B,/* 0x70-0x77 */ + 0xB99C,0xB99D,0xB99E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB99F,0xB9A0,0xB9A1,0xB9A2,0xB9A3,0xB9A4,0xB9A5,/* 0x80-0x87 */ + 0xB9A6,0xB9A7,0xB9A8,0xB9A9,0xB9AA,0xB9AB,0xB9AE,0xB9AF,/* 0x88-0x8F */ + 0xB9B1,0xB9B2,0xB9B3,0xB9B5,0xB9B6,0xB9B7,0xB9B8,0xB9B9,/* 0x90-0x97 */ + 0xB9BA,0xB9BB,0xB9BE,0xB9C0,0xB9C2,0xB9C3,0xB9C4,0xB9C5,/* 0x98-0x9F */ + 0xB9C6,0xB9C7,0xB9CA,0xB9CB,0xB9CD,0xB9D3,0xB9D4,0xB9D5,/* 0xA0-0xA7 */ + 0xB9D6,0xB9D7,0xB9DA,0xB9DC,0xB9DF,0xB9E0,0xB9E2,0xB9E6,/* 0xA8-0xAF */ + 0xB9E7,0xB9E9,0xB9EA,0xB9EB,0xB9ED,0xB9EE,0xB9EF,0xB9F0,/* 0xB0-0xB7 */ + 0xB9F1,0xB9F2,0xB9F3,0xB9F6,0xB9FB,0xB9FC,0xB9FD,0xB9FE,/* 0xB8-0xBF */ + 0xB9FF,0xBA02,0xBA03,0xBA04,0xBA05,0xBA06,0xBA07,0xBA09,/* 0xC0-0xC7 */ + 0xBA0A,0xBA0B,0xBA0C,0xBA0D,0xBA0E,0xBA0F,0xBA10,0xBA11,/* 0xC8-0xCF */ + 0xBA12,0xBA13,0xBA14,0xBA16,0xBA17,0xBA18,0xBA19,0xBA1A,/* 0xD0-0xD7 */ + 0xBA1B,0xBA1C,0xBA1D,0xBA1E,0xBA1F,0xBA20,0xBA21,0xBA22,/* 0xD8-0xDF */ + 0xBA23,0xBA24,0xBA25,0xBA26,0xBA27,0xBA28,0xBA29,0xBA2A,/* 0xE0-0xE7 */ + 0xBA2B,0xBA2C,0xBA2D,0xBA2E,0xBA2F,0xBA30,0xBA31,0xBA32,/* 0xE8-0xEF */ + 0xBA33,0xBA34,0xBA35,0xBA36,0xBA37,0xBA3A,0xBA3B,0xBA3D,/* 0xF0-0xF7 */ + 0xBA3E,0xBA3F,0xBA41,0xBA43,0xBA44,0xBA45,0xBA46,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBA47,0xBA4A,0xBA4C,0xBA4F,0xBA50,0xBA51,0xBA52,/* 0x40-0x47 */ + 0xBA56,0xBA57,0xBA59,0xBA5A,0xBA5B,0xBA5D,0xBA5E,0xBA5F,/* 0x48-0x4F */ + 0xBA60,0xBA61,0xBA62,0xBA63,0xBA66,0xBA6A,0xBA6B,0xBA6C,/* 0x50-0x57 */ + 0xBA6D,0xBA6E,0xBA6F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBA72,0xBA73,0xBA75,0xBA76,0xBA77,0xBA79,0xBA7A,/* 0x60-0x67 */ + 0xBA7B,0xBA7C,0xBA7D,0xBA7E,0xBA7F,0xBA80,0xBA81,0xBA82,/* 0x68-0x6F */ + 0xBA86,0xBA88,0xBA89,0xBA8A,0xBA8B,0xBA8D,0xBA8E,0xBA8F,/* 0x70-0x77 */ + 0xBA90,0xBA91,0xBA92,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBA93,0xBA94,0xBA95,0xBA96,0xBA97,0xBA98,0xBA99,/* 0x80-0x87 */ + 0xBA9A,0xBA9B,0xBA9C,0xBA9D,0xBA9E,0xBA9F,0xBAA0,0xBAA1,/* 0x88-0x8F */ + 0xBAA2,0xBAA3,0xBAA4,0xBAA5,0xBAA6,0xBAA7,0xBAAA,0xBAAD,/* 0x90-0x97 */ + 0xBAAE,0xBAAF,0xBAB1,0xBAB3,0xBAB4,0xBAB5,0xBAB6,0xBAB7,/* 0x98-0x9F */ + 0xBABA,0xBABC,0xBABE,0xBABF,0xBAC0,0xBAC1,0xBAC2,0xBAC3,/* 0xA0-0xA7 */ + 0xBAC5,0xBAC6,0xBAC7,0xBAC9,0xBACA,0xBACB,0xBACC,0xBACD,/* 0xA8-0xAF */ + 0xBACE,0xBACF,0xBAD0,0xBAD1,0xBAD2,0xBAD3,0xBAD4,0xBAD5,/* 0xB0-0xB7 */ + 0xBAD6,0xBAD7,0xBADA,0xBADB,0xBADC,0xBADD,0xBADE,0xBADF,/* 0xB8-0xBF */ + 0xBAE0,0xBAE1,0xBAE2,0xBAE3,0xBAE4,0xBAE5,0xBAE6,0xBAE7,/* 0xC0-0xC7 */ + 0xBAE8,0xBAE9,0xBAEA,0xBAEB,0xBAEC,0xBAED,0xBAEE,0xBAEF,/* 0xC8-0xCF */ + 0xBAF0,0xBAF1,0xBAF2,0xBAF3,0xBAF4,0xBAF5,0xBAF6,0xBAF7,/* 0xD0-0xD7 */ + 0xBAF8,0xBAF9,0xBAFA,0xBAFB,0xBAFD,0xBAFE,0xBAFF,0xBB01,/* 0xD8-0xDF */ + 0xBB02,0xBB03,0xBB05,0xBB06,0xBB07,0xBB08,0xBB09,0xBB0A,/* 0xE0-0xE7 */ + 0xBB0B,0xBB0C,0xBB0E,0xBB10,0xBB12,0xBB13,0xBB14,0xBB15,/* 0xE8-0xEF */ + 0xBB16,0xBB17,0xBB19,0xBB1A,0xBB1B,0xBB1D,0xBB1E,0xBB1F,/* 0xF0-0xF7 */ + 0xBB21,0xBB22,0xBB23,0xBB24,0xBB25,0xBB26,0xBB27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBB28,0xBB2A,0xBB2C,0xBB2D,0xBB2E,0xBB2F,0xBB30,/* 0x40-0x47 */ + 0xBB31,0xBB32,0xBB33,0xBB37,0xBB39,0xBB3A,0xBB3F,0xBB40,/* 0x48-0x4F */ + 0xBB41,0xBB42,0xBB43,0xBB46,0xBB48,0xBB4A,0xBB4B,0xBB4C,/* 0x50-0x57 */ + 0xBB4E,0xBB51,0xBB52,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBB53,0xBB55,0xBB56,0xBB57,0xBB59,0xBB5A,0xBB5B,/* 0x60-0x67 */ + 0xBB5C,0xBB5D,0xBB5E,0xBB5F,0xBB60,0xBB62,0xBB64,0xBB65,/* 0x68-0x6F */ + 0xBB66,0xBB67,0xBB68,0xBB69,0xBB6A,0xBB6B,0xBB6D,0xBB6E,/* 0x70-0x77 */ + 0xBB6F,0xBB70,0xBB71,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBB72,0xBB73,0xBB74,0xBB75,0xBB76,0xBB77,0xBB78,/* 0x80-0x87 */ + 0xBB79,0xBB7A,0xBB7B,0xBB7C,0xBB7D,0xBB7E,0xBB7F,0xBB80,/* 0x88-0x8F */ + 0xBB81,0xBB82,0xBB83,0xBB84,0xBB85,0xBB86,0xBB87,0xBB89,/* 0x90-0x97 */ + 0xBB8A,0xBB8B,0xBB8D,0xBB8E,0xBB8F,0xBB91,0xBB92,0xBB93,/* 0x98-0x9F */ + 0xBB94,0xBB95,0xBB96,0xBB97,0xBB98,0xBB99,0xBB9A,0xBB9B,/* 0xA0-0xA7 */ + 0xBB9C,0xBB9D,0xBB9E,0xBB9F,0xBBA0,0xBBA1,0xBBA2,0xBBA3,/* 0xA8-0xAF */ + 0xBBA5,0xBBA6,0xBBA7,0xBBA9,0xBBAA,0xBBAB,0xBBAD,0xBBAE,/* 0xB0-0xB7 */ + 0xBBAF,0xBBB0,0xBBB1,0xBBB2,0xBBB3,0xBBB5,0xBBB6,0xBBB8,/* 0xB8-0xBF */ + 0xBBB9,0xBBBA,0xBBBB,0xBBBC,0xBBBD,0xBBBE,0xBBBF,0xBBC1,/* 0xC0-0xC7 */ + 0xBBC2,0xBBC3,0xBBC5,0xBBC6,0xBBC7,0xBBC9,0xBBCA,0xBBCB,/* 0xC8-0xCF */ + 0xBBCC,0xBBCD,0xBBCE,0xBBCF,0xBBD1,0xBBD2,0xBBD4,0xBBD5,/* 0xD0-0xD7 */ + 0xBBD6,0xBBD7,0xBBD8,0xBBD9,0xBBDA,0xBBDB,0xBBDC,0xBBDD,/* 0xD8-0xDF */ + 0xBBDE,0xBBDF,0xBBE0,0xBBE1,0xBBE2,0xBBE3,0xBBE4,0xBBE5,/* 0xE0-0xE7 */ + 0xBBE6,0xBBE7,0xBBE8,0xBBE9,0xBBEA,0xBBEB,0xBBEC,0xBBED,/* 0xE8-0xEF */ + 0xBBEE,0xBBEF,0xBBF0,0xBBF1,0xBBF2,0xBBF3,0xBBF4,0xBBF5,/* 0xF0-0xF7 */ + 0xBBF6,0xBBF7,0xBBFA,0xBBFB,0xBBFD,0xBBFE,0xBC01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBC03,0xBC04,0xBC05,0xBC06,0xBC07,0xBC0A,0xBC0E,/* 0x40-0x47 */ + 0xBC10,0xBC12,0xBC13,0xBC19,0xBC1A,0xBC20,0xBC21,0xBC22,/* 0x48-0x4F */ + 0xBC23,0xBC26,0xBC28,0xBC2A,0xBC2B,0xBC2C,0xBC2E,0xBC2F,/* 0x50-0x57 */ + 0xBC32,0xBC33,0xBC35,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBC36,0xBC37,0xBC39,0xBC3A,0xBC3B,0xBC3C,0xBC3D,/* 0x60-0x67 */ + 0xBC3E,0xBC3F,0xBC42,0xBC46,0xBC47,0xBC48,0xBC4A,0xBC4B,/* 0x68-0x6F */ + 0xBC4E,0xBC4F,0xBC51,0xBC52,0xBC53,0xBC54,0xBC55,0xBC56,/* 0x70-0x77 */ + 0xBC57,0xBC58,0xBC59,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBC5A,0xBC5B,0xBC5C,0xBC5E,0xBC5F,0xBC60,0xBC61,/* 0x80-0x87 */ + 0xBC62,0xBC63,0xBC64,0xBC65,0xBC66,0xBC67,0xBC68,0xBC69,/* 0x88-0x8F */ + 0xBC6A,0xBC6B,0xBC6C,0xBC6D,0xBC6E,0xBC6F,0xBC70,0xBC71,/* 0x90-0x97 */ + 0xBC72,0xBC73,0xBC74,0xBC75,0xBC76,0xBC77,0xBC78,0xBC79,/* 0x98-0x9F */ + 0xBC7A,0xBC7B,0xBC7C,0xBC7D,0xBC7E,0xBC7F,0xBC80,0xBC81,/* 0xA0-0xA7 */ + 0xBC82,0xBC83,0xBC86,0xBC87,0xBC89,0xBC8A,0xBC8D,0xBC8F,/* 0xA8-0xAF */ + 0xBC90,0xBC91,0xBC92,0xBC93,0xBC96,0xBC98,0xBC9B,0xBC9C,/* 0xB0-0xB7 */ + 0xBC9D,0xBC9E,0xBC9F,0xBCA2,0xBCA3,0xBCA5,0xBCA6,0xBCA9,/* 0xB8-0xBF */ + 0xBCAA,0xBCAB,0xBCAC,0xBCAD,0xBCAE,0xBCAF,0xBCB2,0xBCB6,/* 0xC0-0xC7 */ + 0xBCB7,0xBCB8,0xBCB9,0xBCBA,0xBCBB,0xBCBE,0xBCBF,0xBCC1,/* 0xC8-0xCF */ + 0xBCC2,0xBCC3,0xBCC5,0xBCC6,0xBCC7,0xBCC8,0xBCC9,0xBCCA,/* 0xD0-0xD7 */ + 0xBCCB,0xBCCC,0xBCCE,0xBCD2,0xBCD3,0xBCD4,0xBCD6,0xBCD7,/* 0xD8-0xDF */ + 0xBCD9,0xBCDA,0xBCDB,0xBCDD,0xBCDE,0xBCDF,0xBCE0,0xBCE1,/* 0xE0-0xE7 */ + 0xBCE2,0xBCE3,0xBCE4,0xBCE5,0xBCE6,0xBCE7,0xBCE8,0xBCE9,/* 0xE8-0xEF */ + 0xBCEA,0xBCEB,0xBCEC,0xBCED,0xBCEE,0xBCEF,0xBCF0,0xBCF1,/* 0xF0-0xF7 */ + 0xBCF2,0xBCF3,0xBCF7,0xBCF9,0xBCFA,0xBCFB,0xBCFD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBCFE,0xBCFF,0xBD00,0xBD01,0xBD02,0xBD03,0xBD06,/* 0x40-0x47 */ + 0xBD08,0xBD0A,0xBD0B,0xBD0C,0xBD0D,0xBD0E,0xBD0F,0xBD11,/* 0x48-0x4F */ + 0xBD12,0xBD13,0xBD15,0xBD16,0xBD17,0xBD18,0xBD19,0xBD1A,/* 0x50-0x57 */ + 0xBD1B,0xBD1C,0xBD1D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBD1E,0xBD1F,0xBD20,0xBD21,0xBD22,0xBD23,0xBD25,/* 0x60-0x67 */ + 0xBD26,0xBD27,0xBD28,0xBD29,0xBD2A,0xBD2B,0xBD2D,0xBD2E,/* 0x68-0x6F */ + 0xBD2F,0xBD30,0xBD31,0xBD32,0xBD33,0xBD34,0xBD35,0xBD36,/* 0x70-0x77 */ + 0xBD37,0xBD38,0xBD39,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBD3A,0xBD3B,0xBD3C,0xBD3D,0xBD3E,0xBD3F,0xBD41,/* 0x80-0x87 */ + 0xBD42,0xBD43,0xBD44,0xBD45,0xBD46,0xBD47,0xBD4A,0xBD4B,/* 0x88-0x8F */ + 0xBD4D,0xBD4E,0xBD4F,0xBD51,0xBD52,0xBD53,0xBD54,0xBD55,/* 0x90-0x97 */ + 0xBD56,0xBD57,0xBD5A,0xBD5B,0xBD5C,0xBD5D,0xBD5E,0xBD5F,/* 0x98-0x9F */ + 0xBD60,0xBD61,0xBD62,0xBD63,0xBD65,0xBD66,0xBD67,0xBD69,/* 0xA0-0xA7 */ + 0xBD6A,0xBD6B,0xBD6C,0xBD6D,0xBD6E,0xBD6F,0xBD70,0xBD71,/* 0xA8-0xAF */ + 0xBD72,0xBD73,0xBD74,0xBD75,0xBD76,0xBD77,0xBD78,0xBD79,/* 0xB0-0xB7 */ + 0xBD7A,0xBD7B,0xBD7C,0xBD7D,0xBD7E,0xBD7F,0xBD82,0xBD83,/* 0xB8-0xBF */ + 0xBD85,0xBD86,0xBD8B,0xBD8C,0xBD8D,0xBD8E,0xBD8F,0xBD92,/* 0xC0-0xC7 */ + 0xBD94,0xBD96,0xBD97,0xBD98,0xBD9B,0xBD9D,0xBD9E,0xBD9F,/* 0xC8-0xCF */ + 0xBDA0,0xBDA1,0xBDA2,0xBDA3,0xBDA5,0xBDA6,0xBDA7,0xBDA8,/* 0xD0-0xD7 */ + 0xBDA9,0xBDAA,0xBDAB,0xBDAC,0xBDAD,0xBDAE,0xBDAF,0xBDB1,/* 0xD8-0xDF */ + 0xBDB2,0xBDB3,0xBDB4,0xBDB5,0xBDB6,0xBDB7,0xBDB9,0xBDBA,/* 0xE0-0xE7 */ + 0xBDBB,0xBDBC,0xBDBD,0xBDBE,0xBDBF,0xBDC0,0xBDC1,0xBDC2,/* 0xE8-0xEF */ + 0xBDC3,0xBDC4,0xBDC5,0xBDC6,0xBDC7,0xBDC8,0xBDC9,0xBDCA,/* 0xF0-0xF7 */ + 0xBDCB,0xBDCC,0xBDCD,0xBDCE,0xBDCF,0xBDD0,0xBDD1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBDD2,0xBDD3,0xBDD6,0xBDD7,0xBDD9,0xBDDA,0xBDDB,/* 0x40-0x47 */ + 0xBDDD,0xBDDE,0xBDDF,0xBDE0,0xBDE1,0xBDE2,0xBDE3,0xBDE4,/* 0x48-0x4F */ + 0xBDE5,0xBDE6,0xBDE7,0xBDE8,0xBDEA,0xBDEB,0xBDEC,0xBDED,/* 0x50-0x57 */ + 0xBDEE,0xBDEF,0xBDF1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBDF2,0xBDF3,0xBDF5,0xBDF6,0xBDF7,0xBDF9,0xBDFA,/* 0x60-0x67 */ + 0xBDFB,0xBDFC,0xBDFD,0xBDFE,0xBDFF,0xBE01,0xBE02,0xBE04,/* 0x68-0x6F */ + 0xBE06,0xBE07,0xBE08,0xBE09,0xBE0A,0xBE0B,0xBE0E,0xBE0F,/* 0x70-0x77 */ + 0xBE11,0xBE12,0xBE13,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBE15,0xBE16,0xBE17,0xBE18,0xBE19,0xBE1A,0xBE1B,/* 0x80-0x87 */ + 0xBE1E,0xBE20,0xBE21,0xBE22,0xBE23,0xBE24,0xBE25,0xBE26,/* 0x88-0x8F */ + 0xBE27,0xBE28,0xBE29,0xBE2A,0xBE2B,0xBE2C,0xBE2D,0xBE2E,/* 0x90-0x97 */ + 0xBE2F,0xBE30,0xBE31,0xBE32,0xBE33,0xBE34,0xBE35,0xBE36,/* 0x98-0x9F */ + 0xBE37,0xBE38,0xBE39,0xBE3A,0xBE3B,0xBE3C,0xBE3D,0xBE3E,/* 0xA0-0xA7 */ + 0xBE3F,0xBE40,0xBE41,0xBE42,0xBE43,0xBE46,0xBE47,0xBE49,/* 0xA8-0xAF */ + 0xBE4A,0xBE4B,0xBE4D,0xBE4F,0xBE50,0xBE51,0xBE52,0xBE53,/* 0xB0-0xB7 */ + 0xBE56,0xBE58,0xBE5C,0xBE5D,0xBE5E,0xBE5F,0xBE62,0xBE63,/* 0xB8-0xBF */ + 0xBE65,0xBE66,0xBE67,0xBE69,0xBE6B,0xBE6C,0xBE6D,0xBE6E,/* 0xC0-0xC7 */ + 0xBE6F,0xBE72,0xBE76,0xBE77,0xBE78,0xBE79,0xBE7A,0xBE7E,/* 0xC8-0xCF */ + 0xBE7F,0xBE81,0xBE82,0xBE83,0xBE85,0xBE86,0xBE87,0xBE88,/* 0xD0-0xD7 */ + 0xBE89,0xBE8A,0xBE8B,0xBE8E,0xBE92,0xBE93,0xBE94,0xBE95,/* 0xD8-0xDF */ + 0xBE96,0xBE97,0xBE9A,0xBE9B,0xBE9C,0xBE9D,0xBE9E,0xBE9F,/* 0xE0-0xE7 */ + 0xBEA0,0xBEA1,0xBEA2,0xBEA3,0xBEA4,0xBEA5,0xBEA6,0xBEA7,/* 0xE8-0xEF */ + 0xBEA9,0xBEAA,0xBEAB,0xBEAC,0xBEAD,0xBEAE,0xBEAF,0xBEB0,/* 0xF0-0xF7 */ + 0xBEB1,0xBEB2,0xBEB3,0xBEB4,0xBEB5,0xBEB6,0xBEB7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBEB8,0xBEB9,0xBEBA,0xBEBB,0xBEBC,0xBEBD,0xBEBE,/* 0x40-0x47 */ + 0xBEBF,0xBEC0,0xBEC1,0xBEC2,0xBEC3,0xBEC4,0xBEC5,0xBEC6,/* 0x48-0x4F */ + 0xBEC7,0xBEC8,0xBEC9,0xBECA,0xBECB,0xBECC,0xBECD,0xBECE,/* 0x50-0x57 */ + 0xBECF,0xBED2,0xBED3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBED5,0xBED6,0xBED9,0xBEDA,0xBEDB,0xBEDC,0xBEDD,/* 0x60-0x67 */ + 0xBEDE,0xBEDF,0xBEE1,0xBEE2,0xBEE6,0xBEE7,0xBEE8,0xBEE9,/* 0x68-0x6F */ + 0xBEEA,0xBEEB,0xBEED,0xBEEE,0xBEEF,0xBEF0,0xBEF1,0xBEF2,/* 0x70-0x77 */ + 0xBEF3,0xBEF4,0xBEF5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBEF6,0xBEF7,0xBEF8,0xBEF9,0xBEFA,0xBEFB,0xBEFC,/* 0x80-0x87 */ + 0xBEFD,0xBEFE,0xBEFF,0xBF00,0xBF02,0xBF03,0xBF04,0xBF05,/* 0x88-0x8F */ + 0xBF06,0xBF07,0xBF0A,0xBF0B,0xBF0C,0xBF0D,0xBF0E,0xBF0F,/* 0x90-0x97 */ + 0xBF10,0xBF11,0xBF12,0xBF13,0xBF14,0xBF15,0xBF16,0xBF17,/* 0x98-0x9F */ + 0xBF1A,0xBF1E,0xBF1F,0xBF20,0xBF21,0xBF22,0xBF23,0xBF24,/* 0xA0-0xA7 */ + 0xBF25,0xBF26,0xBF27,0xBF28,0xBF29,0xBF2A,0xBF2B,0xBF2C,/* 0xA8-0xAF */ + 0xBF2D,0xBF2E,0xBF2F,0xBF30,0xBF31,0xBF32,0xBF33,0xBF34,/* 0xB0-0xB7 */ + 0xBF35,0xBF36,0xBF37,0xBF38,0xBF39,0xBF3A,0xBF3B,0xBF3C,/* 0xB8-0xBF */ + 0xBF3D,0xBF3E,0xBF3F,0xBF42,0xBF43,0xBF45,0xBF46,0xBF47,/* 0xC0-0xC7 */ + 0xBF49,0xBF4A,0xBF4B,0xBF4C,0xBF4D,0xBF4E,0xBF4F,0xBF52,/* 0xC8-0xCF */ + 0xBF53,0xBF54,0xBF56,0xBF57,0xBF58,0xBF59,0xBF5A,0xBF5B,/* 0xD0-0xD7 */ + 0xBF5C,0xBF5D,0xBF5E,0xBF5F,0xBF60,0xBF61,0xBF62,0xBF63,/* 0xD8-0xDF */ + 0xBF64,0xBF65,0xBF66,0xBF67,0xBF68,0xBF69,0xBF6A,0xBF6B,/* 0xE0-0xE7 */ + 0xBF6C,0xBF6D,0xBF6E,0xBF6F,0xBF70,0xBF71,0xBF72,0xBF73,/* 0xE8-0xEF */ + 0xBF74,0xBF75,0xBF76,0xBF77,0xBF78,0xBF79,0xBF7A,0xBF7B,/* 0xF0-0xF7 */ + 0xBF7C,0xBF7D,0xBF7E,0xBF7F,0xBF80,0xBF81,0xBF82,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBF83,0xBF84,0xBF85,0xBF86,0xBF87,0xBF88,0xBF89,/* 0x40-0x47 */ + 0xBF8A,0xBF8B,0xBF8C,0xBF8D,0xBF8E,0xBF8F,0xBF90,0xBF91,/* 0x48-0x4F */ + 0xBF92,0xBF93,0xBF95,0xBF96,0xBF97,0xBF98,0xBF99,0xBF9A,/* 0x50-0x57 */ + 0xBF9B,0xBF9C,0xBF9D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBF9E,0xBF9F,0xBFA0,0xBFA1,0xBFA2,0xBFA3,0xBFA4,/* 0x60-0x67 */ + 0xBFA5,0xBFA6,0xBFA7,0xBFA8,0xBFA9,0xBFAA,0xBFAB,0xBFAC,/* 0x68-0x6F */ + 0xBFAD,0xBFAE,0xBFAF,0xBFB1,0xBFB2,0xBFB3,0xBFB4,0xBFB5,/* 0x70-0x77 */ + 0xBFB6,0xBFB7,0xBFB8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBFB9,0xBFBA,0xBFBB,0xBFBC,0xBFBD,0xBFBE,0xBFBF,/* 0x80-0x87 */ + 0xBFC0,0xBFC1,0xBFC2,0xBFC3,0xBFC4,0xBFC6,0xBFC7,0xBFC8,/* 0x88-0x8F */ + 0xBFC9,0xBFCA,0xBFCB,0xBFCE,0xBFCF,0xBFD1,0xBFD2,0xBFD3,/* 0x90-0x97 */ + 0xBFD5,0xBFD6,0xBFD7,0xBFD8,0xBFD9,0xBFDA,0xBFDB,0xBFDD,/* 0x98-0x9F */ + 0xBFDE,0xBFE0,0xBFE2,0xBFE3,0xBFE4,0xBFE5,0xBFE6,0xBFE7,/* 0xA0-0xA7 */ + 0xBFE8,0xBFE9,0xBFEA,0xBFEB,0xBFEC,0xBFED,0xBFEE,0xBFEF,/* 0xA8-0xAF */ + 0xBFF0,0xBFF1,0xBFF2,0xBFF3,0xBFF4,0xBFF5,0xBFF6,0xBFF7,/* 0xB0-0xB7 */ + 0xBFF8,0xBFF9,0xBFFA,0xBFFB,0xBFFC,0xBFFD,0xBFFE,0xBFFF,/* 0xB8-0xBF */ + 0xC000,0xC001,0xC002,0xC003,0xC004,0xC005,0xC006,0xC007,/* 0xC0-0xC7 */ + 0xC008,0xC009,0xC00A,0xC00B,0xC00C,0xC00D,0xC00E,0xC00F,/* 0xC8-0xCF */ + 0xC010,0xC011,0xC012,0xC013,0xC014,0xC015,0xC016,0xC017,/* 0xD0-0xD7 */ + 0xC018,0xC019,0xC01A,0xC01B,0xC01C,0xC01D,0xC01E,0xC01F,/* 0xD8-0xDF */ + 0xC020,0xC021,0xC022,0xC023,0xC024,0xC025,0xC026,0xC027,/* 0xE0-0xE7 */ + 0xC028,0xC029,0xC02A,0xC02B,0xC02C,0xC02D,0xC02E,0xC02F,/* 0xE8-0xEF */ + 0xC030,0xC031,0xC032,0xC033,0xC034,0xC035,0xC036,0xC037,/* 0xF0-0xF7 */ + 0xC038,0xC039,0xC03A,0xC03B,0xC03D,0xC03E,0xC03F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC040,0xC041,0xC042,0xC043,0xC044,0xC045,0xC046,/* 0x40-0x47 */ + 0xC047,0xC048,0xC049,0xC04A,0xC04B,0xC04C,0xC04D,0xC04E,/* 0x48-0x4F */ + 0xC04F,0xC050,0xC052,0xC053,0xC054,0xC055,0xC056,0xC057,/* 0x50-0x57 */ + 0xC059,0xC05A,0xC05B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC05D,0xC05E,0xC05F,0xC061,0xC062,0xC063,0xC064,/* 0x60-0x67 */ + 0xC065,0xC066,0xC067,0xC06A,0xC06B,0xC06C,0xC06D,0xC06E,/* 0x68-0x6F */ + 0xC06F,0xC070,0xC071,0xC072,0xC073,0xC074,0xC075,0xC076,/* 0x70-0x77 */ + 0xC077,0xC078,0xC079,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC07A,0xC07B,0xC07C,0xC07D,0xC07E,0xC07F,0xC080,/* 0x80-0x87 */ + 0xC081,0xC082,0xC083,0xC084,0xC085,0xC086,0xC087,0xC088,/* 0x88-0x8F */ + 0xC089,0xC08A,0xC08B,0xC08C,0xC08D,0xC08E,0xC08F,0xC092,/* 0x90-0x97 */ + 0xC093,0xC095,0xC096,0xC097,0xC099,0xC09A,0xC09B,0xC09C,/* 0x98-0x9F */ + 0xC09D,0xC09E,0xC09F,0xC0A2,0xC0A4,0xC0A6,0xC0A7,0xC0A8,/* 0xA0-0xA7 */ + 0xC0A9,0xC0AA,0xC0AB,0xC0AE,0xC0B1,0xC0B2,0xC0B7,0xC0B8,/* 0xA8-0xAF */ + 0xC0B9,0xC0BA,0xC0BB,0xC0BE,0xC0C2,0xC0C3,0xC0C4,0xC0C6,/* 0xB0-0xB7 */ + 0xC0C7,0xC0CA,0xC0CB,0xC0CD,0xC0CE,0xC0CF,0xC0D1,0xC0D2,/* 0xB8-0xBF */ + 0xC0D3,0xC0D4,0xC0D5,0xC0D6,0xC0D7,0xC0DA,0xC0DE,0xC0DF,/* 0xC0-0xC7 */ + 0xC0E0,0xC0E1,0xC0E2,0xC0E3,0xC0E6,0xC0E7,0xC0E9,0xC0EA,/* 0xC8-0xCF */ + 0xC0EB,0xC0ED,0xC0EE,0xC0EF,0xC0F0,0xC0F1,0xC0F2,0xC0F3,/* 0xD0-0xD7 */ + 0xC0F6,0xC0F8,0xC0FA,0xC0FB,0xC0FC,0xC0FD,0xC0FE,0xC0FF,/* 0xD8-0xDF */ + 0xC101,0xC102,0xC103,0xC105,0xC106,0xC107,0xC109,0xC10A,/* 0xE0-0xE7 */ + 0xC10B,0xC10C,0xC10D,0xC10E,0xC10F,0xC111,0xC112,0xC113,/* 0xE8-0xEF */ + 0xC114,0xC116,0xC117,0xC118,0xC119,0xC11A,0xC11B,0xC121,/* 0xF0-0xF7 */ + 0xC122,0xC125,0xC128,0xC129,0xC12A,0xC12B,0xC12E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC132,0xC133,0xC134,0xC135,0xC137,0xC13A,0xC13B,/* 0x40-0x47 */ + 0xC13D,0xC13E,0xC13F,0xC141,0xC142,0xC143,0xC144,0xC145,/* 0x48-0x4F */ + 0xC146,0xC147,0xC14A,0xC14E,0xC14F,0xC150,0xC151,0xC152,/* 0x50-0x57 */ + 0xC153,0xC156,0xC157,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC159,0xC15A,0xC15B,0xC15D,0xC15E,0xC15F,0xC160,/* 0x60-0x67 */ + 0xC161,0xC162,0xC163,0xC166,0xC16A,0xC16B,0xC16C,0xC16D,/* 0x68-0x6F */ + 0xC16E,0xC16F,0xC171,0xC172,0xC173,0xC175,0xC176,0xC177,/* 0x70-0x77 */ + 0xC179,0xC17A,0xC17B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC17C,0xC17D,0xC17E,0xC17F,0xC180,0xC181,0xC182,/* 0x80-0x87 */ + 0xC183,0xC184,0xC186,0xC187,0xC188,0xC189,0xC18A,0xC18B,/* 0x88-0x8F */ + 0xC18F,0xC191,0xC192,0xC193,0xC195,0xC197,0xC198,0xC199,/* 0x90-0x97 */ + 0xC19A,0xC19B,0xC19E,0xC1A0,0xC1A2,0xC1A3,0xC1A4,0xC1A6,/* 0x98-0x9F */ + 0xC1A7,0xC1AA,0xC1AB,0xC1AD,0xC1AE,0xC1AF,0xC1B1,0xC1B2,/* 0xA0-0xA7 */ + 0xC1B3,0xC1B4,0xC1B5,0xC1B6,0xC1B7,0xC1B8,0xC1B9,0xC1BA,/* 0xA8-0xAF */ + 0xC1BB,0xC1BC,0xC1BE,0xC1BF,0xC1C0,0xC1C1,0xC1C2,0xC1C3,/* 0xB0-0xB7 */ + 0xC1C5,0xC1C6,0xC1C7,0xC1C9,0xC1CA,0xC1CB,0xC1CD,0xC1CE,/* 0xB8-0xBF */ + 0xC1CF,0xC1D0,0xC1D1,0xC1D2,0xC1D3,0xC1D5,0xC1D6,0xC1D9,/* 0xC0-0xC7 */ + 0xC1DA,0xC1DB,0xC1DC,0xC1DD,0xC1DE,0xC1DF,0xC1E1,0xC1E2,/* 0xC8-0xCF */ + 0xC1E3,0xC1E5,0xC1E6,0xC1E7,0xC1E9,0xC1EA,0xC1EB,0xC1EC,/* 0xD0-0xD7 */ + 0xC1ED,0xC1EE,0xC1EF,0xC1F2,0xC1F4,0xC1F5,0xC1F6,0xC1F7,/* 0xD8-0xDF */ + 0xC1F8,0xC1F9,0xC1FA,0xC1FB,0xC1FE,0xC1FF,0xC201,0xC202,/* 0xE0-0xE7 */ + 0xC203,0xC205,0xC206,0xC207,0xC208,0xC209,0xC20A,0xC20B,/* 0xE8-0xEF */ + 0xC20E,0xC210,0xC212,0xC213,0xC214,0xC215,0xC216,0xC217,/* 0xF0-0xF7 */ + 0xC21A,0xC21B,0xC21D,0xC21E,0xC221,0xC222,0xC223,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC224,0xC225,0xC226,0xC227,0xC22A,0xC22C,0xC22E,/* 0x40-0x47 */ + 0xC230,0xC233,0xC235,0xC236,0xC237,0xC238,0xC239,0xC23A,/* 0x48-0x4F */ + 0xC23B,0xC23C,0xC23D,0xC23E,0xC23F,0xC240,0xC241,0xC242,/* 0x50-0x57 */ + 0xC243,0xC244,0xC245,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC246,0xC247,0xC249,0xC24A,0xC24B,0xC24C,0xC24D,/* 0x60-0x67 */ + 0xC24E,0xC24F,0xC252,0xC253,0xC255,0xC256,0xC257,0xC259,/* 0x68-0x6F */ + 0xC25A,0xC25B,0xC25C,0xC25D,0xC25E,0xC25F,0xC261,0xC262,/* 0x70-0x77 */ + 0xC263,0xC264,0xC266,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC267,0xC268,0xC269,0xC26A,0xC26B,0xC26E,0xC26F,/* 0x80-0x87 */ + 0xC271,0xC272,0xC273,0xC275,0xC276,0xC277,0xC278,0xC279,/* 0x88-0x8F */ + 0xC27A,0xC27B,0xC27E,0xC280,0xC282,0xC283,0xC284,0xC285,/* 0x90-0x97 */ + 0xC286,0xC287,0xC28A,0xC28B,0xC28C,0xC28D,0xC28E,0xC28F,/* 0x98-0x9F */ + 0xC291,0xC292,0xC293,0xC294,0xC295,0xC296,0xC297,0xC299,/* 0xA0-0xA7 */ + 0xC29A,0xC29C,0xC29E,0xC29F,0xC2A0,0xC2A1,0xC2A2,0xC2A3,/* 0xA8-0xAF */ + 0xC2A6,0xC2A7,0xC2A9,0xC2AA,0xC2AB,0xC2AE,0xC2AF,0xC2B0,/* 0xB0-0xB7 */ + 0xC2B1,0xC2B2,0xC2B3,0xC2B6,0xC2B8,0xC2BA,0xC2BB,0xC2BC,/* 0xB8-0xBF */ + 0xC2BD,0xC2BE,0xC2BF,0xC2C0,0xC2C1,0xC2C2,0xC2C3,0xC2C4,/* 0xC0-0xC7 */ + 0xC2C5,0xC2C6,0xC2C7,0xC2C8,0xC2C9,0xC2CA,0xC2CB,0xC2CC,/* 0xC8-0xCF */ + 0xC2CD,0xC2CE,0xC2CF,0xC2D0,0xC2D1,0xC2D2,0xC2D3,0xC2D4,/* 0xD0-0xD7 */ + 0xC2D5,0xC2D6,0xC2D7,0xC2D8,0xC2D9,0xC2DA,0xC2DB,0xC2DE,/* 0xD8-0xDF */ + 0xC2DF,0xC2E1,0xC2E2,0xC2E5,0xC2E6,0xC2E7,0xC2E8,0xC2E9,/* 0xE0-0xE7 */ + 0xC2EA,0xC2EE,0xC2F0,0xC2F2,0xC2F3,0xC2F4,0xC2F5,0xC2F7,/* 0xE8-0xEF */ + 0xC2FA,0xC2FD,0xC2FE,0xC2FF,0xC301,0xC302,0xC303,0xC304,/* 0xF0-0xF7 */ + 0xC305,0xC306,0xC307,0xC30A,0xC30B,0xC30E,0xC30F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC310,0xC311,0xC312,0xC316,0xC317,0xC319,0xC31A,/* 0x40-0x47 */ + 0xC31B,0xC31D,0xC31E,0xC31F,0xC320,0xC321,0xC322,0xC323,/* 0x48-0x4F */ + 0xC326,0xC327,0xC32A,0xC32B,0xC32C,0xC32D,0xC32E,0xC32F,/* 0x50-0x57 */ + 0xC330,0xC331,0xC332,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC333,0xC334,0xC335,0xC336,0xC337,0xC338,0xC339,/* 0x60-0x67 */ + 0xC33A,0xC33B,0xC33C,0xC33D,0xC33E,0xC33F,0xC340,0xC341,/* 0x68-0x6F */ + 0xC342,0xC343,0xC344,0xC346,0xC347,0xC348,0xC349,0xC34A,/* 0x70-0x77 */ + 0xC34B,0xC34C,0xC34D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC34E,0xC34F,0xC350,0xC351,0xC352,0xC353,0xC354,/* 0x80-0x87 */ + 0xC355,0xC356,0xC357,0xC358,0xC359,0xC35A,0xC35B,0xC35C,/* 0x88-0x8F */ + 0xC35D,0xC35E,0xC35F,0xC360,0xC361,0xC362,0xC363,0xC364,/* 0x90-0x97 */ + 0xC365,0xC366,0xC367,0xC36A,0xC36B,0xC36D,0xC36E,0xC36F,/* 0x98-0x9F */ + 0xC371,0xC373,0xC374,0xC375,0xC376,0xC377,0xC37A,0xC37B,/* 0xA0-0xA7 */ + 0xC37E,0xC37F,0xC380,0xC381,0xC382,0xC383,0xC385,0xC386,/* 0xA8-0xAF */ + 0xC387,0xC389,0xC38A,0xC38B,0xC38D,0xC38E,0xC38F,0xC390,/* 0xB0-0xB7 */ + 0xC391,0xC392,0xC393,0xC394,0xC395,0xC396,0xC397,0xC398,/* 0xB8-0xBF */ + 0xC399,0xC39A,0xC39B,0xC39C,0xC39D,0xC39E,0xC39F,0xC3A0,/* 0xC0-0xC7 */ + 0xC3A1,0xC3A2,0xC3A3,0xC3A4,0xC3A5,0xC3A6,0xC3A7,0xC3A8,/* 0xC8-0xCF */ + 0xC3A9,0xC3AA,0xC3AB,0xC3AC,0xC3AD,0xC3AE,0xC3AF,0xC3B0,/* 0xD0-0xD7 */ + 0xC3B1,0xC3B2,0xC3B3,0xC3B4,0xC3B5,0xC3B6,0xC3B7,0xC3B8,/* 0xD8-0xDF */ + 0xC3B9,0xC3BA,0xC3BB,0xC3BC,0xC3BD,0xC3BE,0xC3BF,0xC3C1,/* 0xE0-0xE7 */ + 0xC3C2,0xC3C3,0xC3C4,0xC3C5,0xC3C6,0xC3C7,0xC3C8,0xC3C9,/* 0xE8-0xEF */ + 0xC3CA,0xC3CB,0xC3CC,0xC3CD,0xC3CE,0xC3CF,0xC3D0,0xC3D1,/* 0xF0-0xF7 */ + 0xC3D2,0xC3D3,0xC3D4,0xC3D5,0xC3D6,0xC3D7,0xC3DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC3DB,0xC3DD,0xC3DE,0xC3E1,0xC3E3,0xC3E4,0xC3E5,/* 0x40-0x47 */ + 0xC3E6,0xC3E7,0xC3EA,0xC3EB,0xC3EC,0xC3EE,0xC3EF,0xC3F0,/* 0x48-0x4F */ + 0xC3F1,0xC3F2,0xC3F3,0xC3F6,0xC3F7,0xC3F9,0xC3FA,0xC3FB,/* 0x50-0x57 */ + 0xC3FC,0xC3FD,0xC3FE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC3FF,0xC400,0xC401,0xC402,0xC403,0xC404,0xC405,/* 0x60-0x67 */ + 0xC406,0xC407,0xC409,0xC40A,0xC40B,0xC40C,0xC40D,0xC40E,/* 0x68-0x6F */ + 0xC40F,0xC411,0xC412,0xC413,0xC414,0xC415,0xC416,0xC417,/* 0x70-0x77 */ + 0xC418,0xC419,0xC41A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC41B,0xC41C,0xC41D,0xC41E,0xC41F,0xC420,0xC421,/* 0x80-0x87 */ + 0xC422,0xC423,0xC425,0xC426,0xC427,0xC428,0xC429,0xC42A,/* 0x88-0x8F */ + 0xC42B,0xC42D,0xC42E,0xC42F,0xC431,0xC432,0xC433,0xC435,/* 0x90-0x97 */ + 0xC436,0xC437,0xC438,0xC439,0xC43A,0xC43B,0xC43E,0xC43F,/* 0x98-0x9F */ + 0xC440,0xC441,0xC442,0xC443,0xC444,0xC445,0xC446,0xC447,/* 0xA0-0xA7 */ + 0xC449,0xC44A,0xC44B,0xC44C,0xC44D,0xC44E,0xC44F,0xC450,/* 0xA8-0xAF */ + 0xC451,0xC452,0xC453,0xC454,0xC455,0xC456,0xC457,0xC458,/* 0xB0-0xB7 */ + 0xC459,0xC45A,0xC45B,0xC45C,0xC45D,0xC45E,0xC45F,0xC460,/* 0xB8-0xBF */ + 0xC461,0xC462,0xC463,0xC466,0xC467,0xC469,0xC46A,0xC46B,/* 0xC0-0xC7 */ + 0xC46D,0xC46E,0xC46F,0xC470,0xC471,0xC472,0xC473,0xC476,/* 0xC8-0xCF */ + 0xC477,0xC478,0xC47A,0xC47B,0xC47C,0xC47D,0xC47E,0xC47F,/* 0xD0-0xD7 */ + 0xC481,0xC482,0xC483,0xC484,0xC485,0xC486,0xC487,0xC488,/* 0xD8-0xDF */ + 0xC489,0xC48A,0xC48B,0xC48C,0xC48D,0xC48E,0xC48F,0xC490,/* 0xE0-0xE7 */ + 0xC491,0xC492,0xC493,0xC495,0xC496,0xC497,0xC498,0xC499,/* 0xE8-0xEF */ + 0xC49A,0xC49B,0xC49D,0xC49E,0xC49F,0xC4A0,0xC4A1,0xC4A2,/* 0xF0-0xF7 */ + 0xC4A3,0xC4A4,0xC4A5,0xC4A6,0xC4A7,0xC4A8,0xC4A9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC4AA,0xC4AB,0xC4AC,0xC4AD,0xC4AE,0xC4AF,0xC4B0,/* 0x40-0x47 */ + 0xC4B1,0xC4B2,0xC4B3,0xC4B4,0xC4B5,0xC4B6,0xC4B7,0xC4B9,/* 0x48-0x4F */ + 0xC4BA,0xC4BB,0xC4BD,0xC4BE,0xC4BF,0xC4C0,0xC4C1,0xC4C2,/* 0x50-0x57 */ + 0xC4C3,0xC4C4,0xC4C5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC4C6,0xC4C7,0xC4C8,0xC4C9,0xC4CA,0xC4CB,0xC4CC,/* 0x60-0x67 */ + 0xC4CD,0xC4CE,0xC4CF,0xC4D0,0xC4D1,0xC4D2,0xC4D3,0xC4D4,/* 0x68-0x6F */ + 0xC4D5,0xC4D6,0xC4D7,0xC4D8,0xC4D9,0xC4DA,0xC4DB,0xC4DC,/* 0x70-0x77 */ + 0xC4DD,0xC4DE,0xC4DF,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC4E0,0xC4E1,0xC4E2,0xC4E3,0xC4E4,0xC4E5,0xC4E6,/* 0x80-0x87 */ + 0xC4E7,0xC4E8,0xC4EA,0xC4EB,0xC4EC,0xC4ED,0xC4EE,0xC4EF,/* 0x88-0x8F */ + 0xC4F2,0xC4F3,0xC4F5,0xC4F6,0xC4F7,0xC4F9,0xC4FB,0xC4FC,/* 0x90-0x97 */ + 0xC4FD,0xC4FE,0xC502,0xC503,0xC504,0xC505,0xC506,0xC507,/* 0x98-0x9F */ + 0xC508,0xC509,0xC50A,0xC50B,0xC50D,0xC50E,0xC50F,0xC511,/* 0xA0-0xA7 */ + 0xC512,0xC513,0xC515,0xC516,0xC517,0xC518,0xC519,0xC51A,/* 0xA8-0xAF */ + 0xC51B,0xC51D,0xC51E,0xC51F,0xC520,0xC521,0xC522,0xC523,/* 0xB0-0xB7 */ + 0xC524,0xC525,0xC526,0xC527,0xC52A,0xC52B,0xC52D,0xC52E,/* 0xB8-0xBF */ + 0xC52F,0xC531,0xC532,0xC533,0xC534,0xC535,0xC536,0xC537,/* 0xC0-0xC7 */ + 0xC53A,0xC53C,0xC53E,0xC53F,0xC540,0xC541,0xC542,0xC543,/* 0xC8-0xCF */ + 0xC546,0xC547,0xC54B,0xC54F,0xC550,0xC551,0xC552,0xC556,/* 0xD0-0xD7 */ + 0xC55A,0xC55B,0xC55C,0xC55F,0xC562,0xC563,0xC565,0xC566,/* 0xD8-0xDF */ + 0xC567,0xC569,0xC56A,0xC56B,0xC56C,0xC56D,0xC56E,0xC56F,/* 0xE0-0xE7 */ + 0xC572,0xC576,0xC577,0xC578,0xC579,0xC57A,0xC57B,0xC57E,/* 0xE8-0xEF */ + 0xC57F,0xC581,0xC582,0xC583,0xC585,0xC586,0xC588,0xC589,/* 0xF0-0xF7 */ + 0xC58A,0xC58B,0xC58E,0xC590,0xC592,0xC593,0xC594,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC596,0xC599,0xC59A,0xC59B,0xC59D,0xC59E,0xC59F,/* 0x40-0x47 */ + 0xC5A1,0xC5A2,0xC5A3,0xC5A4,0xC5A5,0xC5A6,0xC5A7,0xC5A8,/* 0x48-0x4F */ + 0xC5AA,0xC5AB,0xC5AC,0xC5AD,0xC5AE,0xC5AF,0xC5B0,0xC5B1,/* 0x50-0x57 */ + 0xC5B2,0xC5B3,0xC5B6,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC5B7,0xC5BA,0xC5BF,0xC5C0,0xC5C1,0xC5C2,0xC5C3,/* 0x60-0x67 */ + 0xC5CB,0xC5CD,0xC5CF,0xC5D2,0xC5D3,0xC5D5,0xC5D6,0xC5D7,/* 0x68-0x6F */ + 0xC5D9,0xC5DA,0xC5DB,0xC5DC,0xC5DD,0xC5DE,0xC5DF,0xC5E2,/* 0x70-0x77 */ + 0xC5E4,0xC5E6,0xC5E7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC5E8,0xC5E9,0xC5EA,0xC5EB,0xC5EF,0xC5F1,0xC5F2,/* 0x80-0x87 */ + 0xC5F3,0xC5F5,0xC5F8,0xC5F9,0xC5FA,0xC5FB,0xC602,0xC603,/* 0x88-0x8F */ + 0xC604,0xC609,0xC60A,0xC60B,0xC60D,0xC60E,0xC60F,0xC611,/* 0x90-0x97 */ + 0xC612,0xC613,0xC614,0xC615,0xC616,0xC617,0xC61A,0xC61D,/* 0x98-0x9F */ + 0xC61E,0xC61F,0xC620,0xC621,0xC622,0xC623,0xC626,0xC627,/* 0xA0-0xA7 */ + 0xC629,0xC62A,0xC62B,0xC62F,0xC631,0xC632,0xC636,0xC638,/* 0xA8-0xAF */ + 0xC63A,0xC63C,0xC63D,0xC63E,0xC63F,0xC642,0xC643,0xC645,/* 0xB0-0xB7 */ + 0xC646,0xC647,0xC649,0xC64A,0xC64B,0xC64C,0xC64D,0xC64E,/* 0xB8-0xBF */ + 0xC64F,0xC652,0xC656,0xC657,0xC658,0xC659,0xC65A,0xC65B,/* 0xC0-0xC7 */ + 0xC65E,0xC65F,0xC661,0xC662,0xC663,0xC664,0xC665,0xC666,/* 0xC8-0xCF */ + 0xC667,0xC668,0xC669,0xC66A,0xC66B,0xC66D,0xC66E,0xC670,/* 0xD0-0xD7 */ + 0xC672,0xC673,0xC674,0xC675,0xC676,0xC677,0xC67A,0xC67B,/* 0xD8-0xDF */ + 0xC67D,0xC67E,0xC67F,0xC681,0xC682,0xC683,0xC684,0xC685,/* 0xE0-0xE7 */ + 0xC686,0xC687,0xC68A,0xC68C,0xC68E,0xC68F,0xC690,0xC691,/* 0xE8-0xEF */ + 0xC692,0xC693,0xC696,0xC697,0xC699,0xC69A,0xC69B,0xC69D,/* 0xF0-0xF7 */ + 0xC69E,0xC69F,0xC6A0,0xC6A1,0xC6A2,0xC6A3,0xC6A6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC6A8,0xC6AA,0xC6AB,0xC6AC,0xC6AD,0xC6AE,0xC6AF,/* 0x40-0x47 */ + 0xC6B2,0xC6B3,0xC6B5,0xC6B6,0xC6B7,0xC6BB,0xC6BC,0xC6BD,/* 0x48-0x4F */ + 0xC6BE,0xC6BF,0xC6C2,0xC6C4,0xC6C6,0xC6C7,0xC6C8,0xC6C9,/* 0x50-0x57 */ + 0xC6CA,0xC6CB,0xC6CE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC6CF,0xC6D1,0xC6D2,0xC6D3,0xC6D5,0xC6D6,0xC6D7,/* 0x60-0x67 */ + 0xC6D8,0xC6D9,0xC6DA,0xC6DB,0xC6DE,0xC6DF,0xC6E2,0xC6E3,/* 0x68-0x6F */ + 0xC6E4,0xC6E5,0xC6E6,0xC6E7,0xC6EA,0xC6EB,0xC6ED,0xC6EE,/* 0x70-0x77 */ + 0xC6EF,0xC6F1,0xC6F2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC6F3,0xC6F4,0xC6F5,0xC6F6,0xC6F7,0xC6FA,0xC6FB,/* 0x80-0x87 */ + 0xC6FC,0xC6FE,0xC6FF,0xC700,0xC701,0xC702,0xC703,0xC706,/* 0x88-0x8F */ + 0xC707,0xC709,0xC70A,0xC70B,0xC70D,0xC70E,0xC70F,0xC710,/* 0x90-0x97 */ + 0xC711,0xC712,0xC713,0xC716,0xC718,0xC71A,0xC71B,0xC71C,/* 0x98-0x9F */ + 0xC71D,0xC71E,0xC71F,0xC722,0xC723,0xC725,0xC726,0xC727,/* 0xA0-0xA7 */ + 0xC729,0xC72A,0xC72B,0xC72C,0xC72D,0xC72E,0xC72F,0xC732,/* 0xA8-0xAF */ + 0xC734,0xC736,0xC738,0xC739,0xC73A,0xC73B,0xC73E,0xC73F,/* 0xB0-0xB7 */ + 0xC741,0xC742,0xC743,0xC745,0xC746,0xC747,0xC748,0xC749,/* 0xB8-0xBF */ + 0xC74B,0xC74E,0xC750,0xC759,0xC75A,0xC75B,0xC75D,0xC75E,/* 0xC0-0xC7 */ + 0xC75F,0xC761,0xC762,0xC763,0xC764,0xC765,0xC766,0xC767,/* 0xC8-0xCF */ + 0xC769,0xC76A,0xC76C,0xC76D,0xC76E,0xC76F,0xC770,0xC771,/* 0xD0-0xD7 */ + 0xC772,0xC773,0xC776,0xC777,0xC779,0xC77A,0xC77B,0xC77F,/* 0xD8-0xDF */ + 0xC780,0xC781,0xC782,0xC786,0xC78B,0xC78C,0xC78D,0xC78F,/* 0xE0-0xE7 */ + 0xC792,0xC793,0xC795,0xC799,0xC79B,0xC79C,0xC79D,0xC79E,/* 0xE8-0xEF */ + 0xC79F,0xC7A2,0xC7A7,0xC7A8,0xC7A9,0xC7AA,0xC7AB,0xC7AE,/* 0xF0-0xF7 */ + 0xC7AF,0xC7B1,0xC7B2,0xC7B3,0xC7B5,0xC7B6,0xC7B7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC7B8,0xC7B9,0xC7BA,0xC7BB,0xC7BE,0xC7C2,0xC7C3,/* 0x40-0x47 */ + 0xC7C4,0xC7C5,0xC7C6,0xC7C7,0xC7CA,0xC7CB,0xC7CD,0xC7CF,/* 0x48-0x4F */ + 0xC7D1,0xC7D2,0xC7D3,0xC7D4,0xC7D5,0xC7D6,0xC7D7,0xC7D9,/* 0x50-0x57 */ + 0xC7DA,0xC7DB,0xC7DC,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC7DE,0xC7DF,0xC7E0,0xC7E1,0xC7E2,0xC7E3,0xC7E5,/* 0x60-0x67 */ + 0xC7E6,0xC7E7,0xC7E9,0xC7EA,0xC7EB,0xC7ED,0xC7EE,0xC7EF,/* 0x68-0x6F */ + 0xC7F0,0xC7F1,0xC7F2,0xC7F3,0xC7F4,0xC7F5,0xC7F6,0xC7F7,/* 0x70-0x77 */ + 0xC7F8,0xC7F9,0xC7FA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC7FB,0xC7FC,0xC7FD,0xC7FE,0xC7FF,0xC802,0xC803,/* 0x80-0x87 */ + 0xC805,0xC806,0xC807,0xC809,0xC80B,0xC80C,0xC80D,0xC80E,/* 0x88-0x8F */ + 0xC80F,0xC812,0xC814,0xC817,0xC818,0xC819,0xC81A,0xC81B,/* 0x90-0x97 */ + 0xC81E,0xC81F,0xC821,0xC822,0xC823,0xC825,0xC826,0xC827,/* 0x98-0x9F */ + 0xC828,0xC829,0xC82A,0xC82B,0xC82E,0xC830,0xC832,0xC833,/* 0xA0-0xA7 */ + 0xC834,0xC835,0xC836,0xC837,0xC839,0xC83A,0xC83B,0xC83D,/* 0xA8-0xAF */ + 0xC83E,0xC83F,0xC841,0xC842,0xC843,0xC844,0xC845,0xC846,/* 0xB0-0xB7 */ + 0xC847,0xC84A,0xC84B,0xC84E,0xC84F,0xC850,0xC851,0xC852,/* 0xB8-0xBF */ + 0xC853,0xC855,0xC856,0xC857,0xC858,0xC859,0xC85A,0xC85B,/* 0xC0-0xC7 */ + 0xC85C,0xC85D,0xC85E,0xC85F,0xC860,0xC861,0xC862,0xC863,/* 0xC8-0xCF */ + 0xC864,0xC865,0xC866,0xC867,0xC868,0xC869,0xC86A,0xC86B,/* 0xD0-0xD7 */ + 0xC86C,0xC86D,0xC86E,0xC86F,0xC872,0xC873,0xC875,0xC876,/* 0xD8-0xDF */ + 0xC877,0xC879,0xC87B,0xC87C,0xC87D,0xC87E,0xC87F,0xC882,/* 0xE0-0xE7 */ + 0xC884,0xC888,0xC889,0xC88A,0xC88E,0xC88F,0xC890,0xC891,/* 0xE8-0xEF */ + 0xC892,0xC893,0xC895,0xC896,0xC897,0xC898,0xC899,0xC89A,/* 0xF0-0xF7 */ + 0xC89B,0xC89C,0xC89E,0xC8A0,0xC8A2,0xC8A3,0xC8A4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC8A5,0xC8A6,0xC8A7,0xC8A9,0xC8AA,0xC8AB,0xC8AC,/* 0x40-0x47 */ + 0xC8AD,0xC8AE,0xC8AF,0xC8B0,0xC8B1,0xC8B2,0xC8B3,0xC8B4,/* 0x48-0x4F */ + 0xC8B5,0xC8B6,0xC8B7,0xC8B8,0xC8B9,0xC8BA,0xC8BB,0xC8BE,/* 0x50-0x57 */ + 0xC8BF,0xC8C0,0xC8C1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC8C2,0xC8C3,0xC8C5,0xC8C6,0xC8C7,0xC8C9,0xC8CA,/* 0x60-0x67 */ + 0xC8CB,0xC8CD,0xC8CE,0xC8CF,0xC8D0,0xC8D1,0xC8D2,0xC8D3,/* 0x68-0x6F */ + 0xC8D6,0xC8D8,0xC8DA,0xC8DB,0xC8DC,0xC8DD,0xC8DE,0xC8DF,/* 0x70-0x77 */ + 0xC8E2,0xC8E3,0xC8E5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC8E6,0xC8E7,0xC8E8,0xC8E9,0xC8EA,0xC8EB,0xC8EC,/* 0x80-0x87 */ + 0xC8ED,0xC8EE,0xC8EF,0xC8F0,0xC8F1,0xC8F2,0xC8F3,0xC8F4,/* 0x88-0x8F */ + 0xC8F6,0xC8F7,0xC8F8,0xC8F9,0xC8FA,0xC8FB,0xC8FE,0xC8FF,/* 0x90-0x97 */ + 0xC901,0xC902,0xC903,0xC907,0xC908,0xC909,0xC90A,0xC90B,/* 0x98-0x9F */ + 0xC90E,0x3000,0x3001,0x3002,0x00B7,0x2025,0x2026,0x00A8,/* 0xA0-0xA7 */ + 0x3003,0x00AD,0x2015,0x2225,0xFF3C,0x223C,0x2018,0x2019,/* 0xA8-0xAF */ + 0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */ + 0x300C,0x300D,0x300E,0x300F,0x3010,0x3011,0x00B1,0x00D7,/* 0xB8-0xBF */ + 0x00F7,0x2260,0x2264,0x2265,0x221E,0x2234,0x00B0,0x2032,/* 0xC0-0xC7 */ + 0x2033,0x2103,0x212B,0xFFE0,0xFFE1,0xFFE5,0x2642,0x2640,/* 0xC8-0xCF */ + 0x2220,0x22A5,0x2312,0x2202,0x2207,0x2261,0x2252,0x00A7,/* 0xD0-0xD7 */ + 0x203B,0x2606,0x2605,0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,/* 0xD8-0xDF */ + 0x25A1,0x25A0,0x25B3,0x25B2,0x25BD,0x25BC,0x2192,0x2190,/* 0xE0-0xE7 */ + 0x2191,0x2193,0x2194,0x3013,0x226A,0x226B,0x221A,0x223D,/* 0xE8-0xEF */ + 0x221D,0x2235,0x222B,0x222C,0x2208,0x220B,0x2286,0x2287,/* 0xF0-0xF7 */ + 0x2282,0x2283,0x222A,0x2229,0x2227,0x2228,0xFFE2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC910,0xC912,0xC913,0xC914,0xC915,0xC916,0xC917,/* 0x40-0x47 */ + 0xC919,0xC91A,0xC91B,0xC91C,0xC91D,0xC91E,0xC91F,0xC920,/* 0x48-0x4F */ + 0xC921,0xC922,0xC923,0xC924,0xC925,0xC926,0xC927,0xC928,/* 0x50-0x57 */ + 0xC929,0xC92A,0xC92B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC92D,0xC92E,0xC92F,0xC930,0xC931,0xC932,0xC933,/* 0x60-0x67 */ + 0xC935,0xC936,0xC937,0xC938,0xC939,0xC93A,0xC93B,0xC93C,/* 0x68-0x6F */ + 0xC93D,0xC93E,0xC93F,0xC940,0xC941,0xC942,0xC943,0xC944,/* 0x70-0x77 */ + 0xC945,0xC946,0xC947,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC948,0xC949,0xC94A,0xC94B,0xC94C,0xC94D,0xC94E,/* 0x80-0x87 */ + 0xC94F,0xC952,0xC953,0xC955,0xC956,0xC957,0xC959,0xC95A,/* 0x88-0x8F */ + 0xC95B,0xC95C,0xC95D,0xC95E,0xC95F,0xC962,0xC964,0xC965,/* 0x90-0x97 */ + 0xC966,0xC967,0xC968,0xC969,0xC96A,0xC96B,0xC96D,0xC96E,/* 0x98-0x9F */ + 0xC96F,0x21D2,0x21D4,0x2200,0x2203,0x00B4,0xFF5E,0x02C7,/* 0xA0-0xA7 */ + 0x02D8,0x02DD,0x02DA,0x02D9,0x00B8,0x02DB,0x00A1,0x00BF,/* 0xA8-0xAF */ + 0x02D0,0x222E,0x2211,0x220F,0x00A4,0x2109,0x2030,0x25C1,/* 0xB0-0xB7 */ + 0x25C0,0x25B7,0x25B6,0x2664,0x2660,0x2661,0x2665,0x2667,/* 0xB8-0xBF */ + 0x2663,0x2299,0x25C8,0x25A3,0x25D0,0x25D1,0x2592,0x25A4,/* 0xC0-0xC7 */ + 0x25A5,0x25A8,0x25A7,0x25A6,0x25A9,0x2668,0x260F,0x260E,/* 0xC8-0xCF */ + 0x261C,0x261E,0x00B6,0x2020,0x2021,0x2195,0x2197,0x2199,/* 0xD0-0xD7 */ + 0x2196,0x2198,0x266D,0x2669,0x266A,0x266C,0x327F,0x321C,/* 0xD8-0xDF */ + 0x2116,0x33C7,0x2122,0x33C2,0x33D8,0x2121,0x20AC,0x00AE,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC971,0xC972,0xC973,0xC975,0xC976,0xC977,0xC978,/* 0x40-0x47 */ + 0xC979,0xC97A,0xC97B,0xC97D,0xC97E,0xC97F,0xC980,0xC981,/* 0x48-0x4F */ + 0xC982,0xC983,0xC984,0xC985,0xC986,0xC987,0xC98A,0xC98B,/* 0x50-0x57 */ + 0xC98D,0xC98E,0xC98F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC991,0xC992,0xC993,0xC994,0xC995,0xC996,0xC997,/* 0x60-0x67 */ + 0xC99A,0xC99C,0xC99E,0xC99F,0xC9A0,0xC9A1,0xC9A2,0xC9A3,/* 0x68-0x6F */ + 0xC9A4,0xC9A5,0xC9A6,0xC9A7,0xC9A8,0xC9A9,0xC9AA,0xC9AB,/* 0x70-0x77 */ + 0xC9AC,0xC9AD,0xC9AE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC9AF,0xC9B0,0xC9B1,0xC9B2,0xC9B3,0xC9B4,0xC9B5,/* 0x80-0x87 */ + 0xC9B6,0xC9B7,0xC9B8,0xC9B9,0xC9BA,0xC9BB,0xC9BC,0xC9BD,/* 0x88-0x8F */ + 0xC9BE,0xC9BF,0xC9C2,0xC9C3,0xC9C5,0xC9C6,0xC9C9,0xC9CB,/* 0x90-0x97 */ + 0xC9CC,0xC9CD,0xC9CE,0xC9CF,0xC9D2,0xC9D4,0xC9D7,0xC9D8,/* 0x98-0x9F */ + 0xC9DB,0xFF01,0xFF02,0xFF03,0xFF04,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */ + 0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */ + 0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */ + 0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */ + 0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */ + 0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */ + 0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */ + 0xFF38,0xFF39,0xFF3A,0xFF3B,0xFFE6,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */ + 0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */ + 0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC9DE,0xC9DF,0xC9E1,0xC9E3,0xC9E5,0xC9E6,0xC9E8,/* 0x40-0x47 */ + 0xC9E9,0xC9EA,0xC9EB,0xC9EE,0xC9F2,0xC9F3,0xC9F4,0xC9F5,/* 0x48-0x4F */ + 0xC9F6,0xC9F7,0xC9FA,0xC9FB,0xC9FD,0xC9FE,0xC9FF,0xCA01,/* 0x50-0x57 */ + 0xCA02,0xCA03,0xCA04,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCA05,0xCA06,0xCA07,0xCA0A,0xCA0E,0xCA0F,0xCA10,/* 0x60-0x67 */ + 0xCA11,0xCA12,0xCA13,0xCA15,0xCA16,0xCA17,0xCA19,0xCA1A,/* 0x68-0x6F */ + 0xCA1B,0xCA1C,0xCA1D,0xCA1E,0xCA1F,0xCA20,0xCA21,0xCA22,/* 0x70-0x77 */ + 0xCA23,0xCA24,0xCA25,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCA26,0xCA27,0xCA28,0xCA2A,0xCA2B,0xCA2C,0xCA2D,/* 0x80-0x87 */ + 0xCA2E,0xCA2F,0xCA30,0xCA31,0xCA32,0xCA33,0xCA34,0xCA35,/* 0x88-0x8F */ + 0xCA36,0xCA37,0xCA38,0xCA39,0xCA3A,0xCA3B,0xCA3C,0xCA3D,/* 0x90-0x97 */ + 0xCA3E,0xCA3F,0xCA40,0xCA41,0xCA42,0xCA43,0xCA44,0xCA45,/* 0x98-0x9F */ + 0xCA46,0x3131,0x3132,0x3133,0x3134,0x3135,0x3136,0x3137,/* 0xA0-0xA7 */ + 0x3138,0x3139,0x313A,0x313B,0x313C,0x313D,0x313E,0x313F,/* 0xA8-0xAF */ + 0x3140,0x3141,0x3142,0x3143,0x3144,0x3145,0x3146,0x3147,/* 0xB0-0xB7 */ + 0x3148,0x3149,0x314A,0x314B,0x314C,0x314D,0x314E,0x314F,/* 0xB8-0xBF */ + 0x3150,0x3151,0x3152,0x3153,0x3154,0x3155,0x3156,0x3157,/* 0xC0-0xC7 */ + 0x3158,0x3159,0x315A,0x315B,0x315C,0x315D,0x315E,0x315F,/* 0xC8-0xCF */ + 0x3160,0x3161,0x3162,0x3163,0x3164,0x3165,0x3166,0x3167,/* 0xD0-0xD7 */ + 0x3168,0x3169,0x316A,0x316B,0x316C,0x316D,0x316E,0x316F,/* 0xD8-0xDF */ + 0x3170,0x3171,0x3172,0x3173,0x3174,0x3175,0x3176,0x3177,/* 0xE0-0xE7 */ + 0x3178,0x3179,0x317A,0x317B,0x317C,0x317D,0x317E,0x317F,/* 0xE8-0xEF */ + 0x3180,0x3181,0x3182,0x3183,0x3184,0x3185,0x3186,0x3187,/* 0xF0-0xF7 */ + 0x3188,0x3189,0x318A,0x318B,0x318C,0x318D,0x318E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCA47,0xCA48,0xCA49,0xCA4A,0xCA4B,0xCA4E,0xCA4F,/* 0x40-0x47 */ + 0xCA51,0xCA52,0xCA53,0xCA55,0xCA56,0xCA57,0xCA58,0xCA59,/* 0x48-0x4F */ + 0xCA5A,0xCA5B,0xCA5E,0xCA62,0xCA63,0xCA64,0xCA65,0xCA66,/* 0x50-0x57 */ + 0xCA67,0xCA69,0xCA6A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCA6B,0xCA6C,0xCA6D,0xCA6E,0xCA6F,0xCA70,0xCA71,/* 0x60-0x67 */ + 0xCA72,0xCA73,0xCA74,0xCA75,0xCA76,0xCA77,0xCA78,0xCA79,/* 0x68-0x6F */ + 0xCA7A,0xCA7B,0xCA7C,0xCA7E,0xCA7F,0xCA80,0xCA81,0xCA82,/* 0x70-0x77 */ + 0xCA83,0xCA85,0xCA86,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCA87,0xCA88,0xCA89,0xCA8A,0xCA8B,0xCA8C,0xCA8D,/* 0x80-0x87 */ + 0xCA8E,0xCA8F,0xCA90,0xCA91,0xCA92,0xCA93,0xCA94,0xCA95,/* 0x88-0x8F */ + 0xCA96,0xCA97,0xCA99,0xCA9A,0xCA9B,0xCA9C,0xCA9D,0xCA9E,/* 0x90-0x97 */ + 0xCA9F,0xCAA0,0xCAA1,0xCAA2,0xCAA3,0xCAA4,0xCAA5,0xCAA6,/* 0x98-0x9F */ + 0xCAA7,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */ + 0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,0x2167,/* 0xB0-0xB7 */ + 0x2168,0x2169,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */ + 0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xC0-0xC7 */ + 0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xC8-0xCF */ + 0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xD0-0xD7 */ + 0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xE0-0xE7 */ + 0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xE8-0xEF */ + 0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xF0-0xF7 */ + 0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCAA8,0xCAA9,0xCAAA,0xCAAB,0xCAAC,0xCAAD,0xCAAE,/* 0x40-0x47 */ + 0xCAAF,0xCAB0,0xCAB1,0xCAB2,0xCAB3,0xCAB4,0xCAB5,0xCAB6,/* 0x48-0x4F */ + 0xCAB7,0xCAB8,0xCAB9,0xCABA,0xCABB,0xCABE,0xCABF,0xCAC1,/* 0x50-0x57 */ + 0xCAC2,0xCAC3,0xCAC5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCAC6,0xCAC7,0xCAC8,0xCAC9,0xCACA,0xCACB,0xCACE,/* 0x60-0x67 */ + 0xCAD0,0xCAD2,0xCAD4,0xCAD5,0xCAD6,0xCAD7,0xCADA,0xCADB,/* 0x68-0x6F */ + 0xCADC,0xCADD,0xCADE,0xCADF,0xCAE1,0xCAE2,0xCAE3,0xCAE4,/* 0x70-0x77 */ + 0xCAE5,0xCAE6,0xCAE7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCAE8,0xCAE9,0xCAEA,0xCAEB,0xCAED,0xCAEE,0xCAEF,/* 0x80-0x87 */ + 0xCAF0,0xCAF1,0xCAF2,0xCAF3,0xCAF5,0xCAF6,0xCAF7,0xCAF8,/* 0x88-0x8F */ + 0xCAF9,0xCAFA,0xCAFB,0xCAFC,0xCAFD,0xCAFE,0xCAFF,0xCB00,/* 0x90-0x97 */ + 0xCB01,0xCB02,0xCB03,0xCB04,0xCB05,0xCB06,0xCB07,0xCB09,/* 0x98-0x9F */ + 0xCB0A,0x2500,0x2502,0x250C,0x2510,0x2518,0x2514,0x251C,/* 0xA0-0xA7 */ + 0x252C,0x2524,0x2534,0x253C,0x2501,0x2503,0x250F,0x2513,/* 0xA8-0xAF */ + 0x251B,0x2517,0x2523,0x2533,0x252B,0x253B,0x254B,0x2520,/* 0xB0-0xB7 */ + 0x252F,0x2528,0x2537,0x253F,0x251D,0x2530,0x2525,0x2538,/* 0xB8-0xBF */ + 0x2542,0x2512,0x2511,0x251A,0x2519,0x2516,0x2515,0x250E,/* 0xC0-0xC7 */ + 0x250D,0x251E,0x251F,0x2521,0x2522,0x2526,0x2527,0x2529,/* 0xC8-0xCF */ + 0x252A,0x252D,0x252E,0x2531,0x2532,0x2535,0x2536,0x2539,/* 0xD0-0xD7 */ + 0x253A,0x253D,0x253E,0x2540,0x2541,0x2543,0x2544,0x2545,/* 0xD8-0xDF */ + 0x2546,0x2547,0x2548,0x2549,0x254A,0x0000,0x0000,0x0000,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCB0B,0xCB0C,0xCB0D,0xCB0E,0xCB0F,0xCB11,0xCB12,/* 0x40-0x47 */ + 0xCB13,0xCB15,0xCB16,0xCB17,0xCB19,0xCB1A,0xCB1B,0xCB1C,/* 0x48-0x4F */ + 0xCB1D,0xCB1E,0xCB1F,0xCB22,0xCB23,0xCB24,0xCB25,0xCB26,/* 0x50-0x57 */ + 0xCB27,0xCB28,0xCB29,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCB2A,0xCB2B,0xCB2C,0xCB2D,0xCB2E,0xCB2F,0xCB30,/* 0x60-0x67 */ + 0xCB31,0xCB32,0xCB33,0xCB34,0xCB35,0xCB36,0xCB37,0xCB38,/* 0x68-0x6F */ + 0xCB39,0xCB3A,0xCB3B,0xCB3C,0xCB3D,0xCB3E,0xCB3F,0xCB40,/* 0x70-0x77 */ + 0xCB42,0xCB43,0xCB44,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCB45,0xCB46,0xCB47,0xCB4A,0xCB4B,0xCB4D,0xCB4E,/* 0x80-0x87 */ + 0xCB4F,0xCB51,0xCB52,0xCB53,0xCB54,0xCB55,0xCB56,0xCB57,/* 0x88-0x8F */ + 0xCB5A,0xCB5B,0xCB5C,0xCB5E,0xCB5F,0xCB60,0xCB61,0xCB62,/* 0x90-0x97 */ + 0xCB63,0xCB65,0xCB66,0xCB67,0xCB68,0xCB69,0xCB6A,0xCB6B,/* 0x98-0x9F */ + 0xCB6C,0x3395,0x3396,0x3397,0x2113,0x3398,0x33C4,0x33A3,/* 0xA0-0xA7 */ + 0x33A4,0x33A5,0x33A6,0x3399,0x339A,0x339B,0x339C,0x339D,/* 0xA8-0xAF */ + 0x339E,0x339F,0x33A0,0x33A1,0x33A2,0x33CA,0x338D,0x338E,/* 0xB0-0xB7 */ + 0x338F,0x33CF,0x3388,0x3389,0x33C8,0x33A7,0x33A8,0x33B0,/* 0xB8-0xBF */ + 0x33B1,0x33B2,0x33B3,0x33B4,0x33B5,0x33B6,0x33B7,0x33B8,/* 0xC0-0xC7 */ + 0x33B9,0x3380,0x3381,0x3382,0x3383,0x3384,0x33BA,0x33BB,/* 0xC8-0xCF */ + 0x33BC,0x33BD,0x33BE,0x33BF,0x3390,0x3391,0x3392,0x3393,/* 0xD0-0xD7 */ + 0x3394,0x2126,0x33C0,0x33C1,0x338A,0x338B,0x338C,0x33D6,/* 0xD8-0xDF */ + 0x33C5,0x33AD,0x33AE,0x33AF,0x33DB,0x33A9,0x33AA,0x33AB,/* 0xE0-0xE7 */ + 0x33AC,0x33DD,0x33D0,0x33D3,0x33C3,0x33C9,0x33DC,0x33C6,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCB6D,0xCB6E,0xCB6F,0xCB70,0xCB71,0xCB72,0xCB73,/* 0x40-0x47 */ + 0xCB74,0xCB75,0xCB76,0xCB77,0xCB7A,0xCB7B,0xCB7C,0xCB7D,/* 0x48-0x4F */ + 0xCB7E,0xCB7F,0xCB80,0xCB81,0xCB82,0xCB83,0xCB84,0xCB85,/* 0x50-0x57 */ + 0xCB86,0xCB87,0xCB88,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCB89,0xCB8A,0xCB8B,0xCB8C,0xCB8D,0xCB8E,0xCB8F,/* 0x60-0x67 */ + 0xCB90,0xCB91,0xCB92,0xCB93,0xCB94,0xCB95,0xCB96,0xCB97,/* 0x68-0x6F */ + 0xCB98,0xCB99,0xCB9A,0xCB9B,0xCB9D,0xCB9E,0xCB9F,0xCBA0,/* 0x70-0x77 */ + 0xCBA1,0xCBA2,0xCBA3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCBA4,0xCBA5,0xCBA6,0xCBA7,0xCBA8,0xCBA9,0xCBAA,/* 0x80-0x87 */ + 0xCBAB,0xCBAC,0xCBAD,0xCBAE,0xCBAF,0xCBB0,0xCBB1,0xCBB2,/* 0x88-0x8F */ + 0xCBB3,0xCBB4,0xCBB5,0xCBB6,0xCBB7,0xCBB9,0xCBBA,0xCBBB,/* 0x90-0x97 */ + 0xCBBC,0xCBBD,0xCBBE,0xCBBF,0xCBC0,0xCBC1,0xCBC2,0xCBC3,/* 0x98-0x9F */ + 0xCBC4,0x00C6,0x00D0,0x00AA,0x0126,0x0000,0x0132,0x0000,/* 0xA0-0xA7 */ + 0x013F,0x0141,0x00D8,0x0152,0x00BA,0x00DE,0x0166,0x014A,/* 0xA8-0xAF */ + 0x0000,0x3260,0x3261,0x3262,0x3263,0x3264,0x3265,0x3266,/* 0xB0-0xB7 */ + 0x3267,0x3268,0x3269,0x326A,0x326B,0x326C,0x326D,0x326E,/* 0xB8-0xBF */ + 0x326F,0x3270,0x3271,0x3272,0x3273,0x3274,0x3275,0x3276,/* 0xC0-0xC7 */ + 0x3277,0x3278,0x3279,0x327A,0x327B,0x24D0,0x24D1,0x24D2,/* 0xC8-0xCF */ + 0x24D3,0x24D4,0x24D5,0x24D6,0x24D7,0x24D8,0x24D9,0x24DA,/* 0xD0-0xD7 */ + 0x24DB,0x24DC,0x24DD,0x24DE,0x24DF,0x24E0,0x24E1,0x24E2,/* 0xD8-0xDF */ + 0x24E3,0x24E4,0x24E5,0x24E6,0x24E7,0x24E8,0x24E9,0x2460,/* 0xE0-0xE7 */ + 0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,0x2467,0x2468,/* 0xE8-0xEF */ + 0x2469,0x246A,0x246B,0x246C,0x246D,0x246E,0x00BD,0x2153,/* 0xF0-0xF7 */ + 0x2154,0x00BC,0x00BE,0x215B,0x215C,0x215D,0x215E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCBC5,0xCBC6,0xCBC7,0xCBC8,0xCBC9,0xCBCA,0xCBCB,/* 0x40-0x47 */ + 0xCBCC,0xCBCD,0xCBCE,0xCBCF,0xCBD0,0xCBD1,0xCBD2,0xCBD3,/* 0x48-0x4F */ + 0xCBD5,0xCBD6,0xCBD7,0xCBD8,0xCBD9,0xCBDA,0xCBDB,0xCBDC,/* 0x50-0x57 */ + 0xCBDD,0xCBDE,0xCBDF,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCBE0,0xCBE1,0xCBE2,0xCBE3,0xCBE5,0xCBE6,0xCBE8,/* 0x60-0x67 */ + 0xCBEA,0xCBEB,0xCBEC,0xCBED,0xCBEE,0xCBEF,0xCBF0,0xCBF1,/* 0x68-0x6F */ + 0xCBF2,0xCBF3,0xCBF4,0xCBF5,0xCBF6,0xCBF7,0xCBF8,0xCBF9,/* 0x70-0x77 */ + 0xCBFA,0xCBFB,0xCBFC,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCBFD,0xCBFE,0xCBFF,0xCC00,0xCC01,0xCC02,0xCC03,/* 0x80-0x87 */ + 0xCC04,0xCC05,0xCC06,0xCC07,0xCC08,0xCC09,0xCC0A,0xCC0B,/* 0x88-0x8F */ + 0xCC0E,0xCC0F,0xCC11,0xCC12,0xCC13,0xCC15,0xCC16,0xCC17,/* 0x90-0x97 */ + 0xCC18,0xCC19,0xCC1A,0xCC1B,0xCC1E,0xCC1F,0xCC20,0xCC23,/* 0x98-0x9F */ + 0xCC24,0x00E6,0x0111,0x00F0,0x0127,0x0131,0x0133,0x0138,/* 0xA0-0xA7 */ + 0x0140,0x0142,0x00F8,0x0153,0x00DF,0x00FE,0x0167,0x014B,/* 0xA8-0xAF */ + 0x0149,0x3200,0x3201,0x3202,0x3203,0x3204,0x3205,0x3206,/* 0xB0-0xB7 */ + 0x3207,0x3208,0x3209,0x320A,0x320B,0x320C,0x320D,0x320E,/* 0xB8-0xBF */ + 0x320F,0x3210,0x3211,0x3212,0x3213,0x3214,0x3215,0x3216,/* 0xC0-0xC7 */ + 0x3217,0x3218,0x3219,0x321A,0x321B,0x249C,0x249D,0x249E,/* 0xC8-0xCF */ + 0x249F,0x24A0,0x24A1,0x24A2,0x24A3,0x24A4,0x24A5,0x24A6,/* 0xD0-0xD7 */ + 0x24A7,0x24A8,0x24A9,0x24AA,0x24AB,0x24AC,0x24AD,0x24AE,/* 0xD8-0xDF */ + 0x24AF,0x24B0,0x24B1,0x24B2,0x24B3,0x24B4,0x24B5,0x2474,/* 0xE0-0xE7 */ + 0x2475,0x2476,0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,/* 0xE8-0xEF */ + 0x247D,0x247E,0x247F,0x2480,0x2481,0x2482,0x00B9,0x00B2,/* 0xF0-0xF7 */ + 0x00B3,0x2074,0x207F,0x2081,0x2082,0x2083,0x2084,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCC25,0xCC26,0xCC2A,0xCC2B,0xCC2D,0xCC2F,0xCC31,/* 0x40-0x47 */ + 0xCC32,0xCC33,0xCC34,0xCC35,0xCC36,0xCC37,0xCC3A,0xCC3F,/* 0x48-0x4F */ + 0xCC40,0xCC41,0xCC42,0xCC43,0xCC46,0xCC47,0xCC49,0xCC4A,/* 0x50-0x57 */ + 0xCC4B,0xCC4D,0xCC4E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCC4F,0xCC50,0xCC51,0xCC52,0xCC53,0xCC56,0xCC5A,/* 0x60-0x67 */ + 0xCC5B,0xCC5C,0xCC5D,0xCC5E,0xCC5F,0xCC61,0xCC62,0xCC63,/* 0x68-0x6F */ + 0xCC65,0xCC67,0xCC69,0xCC6A,0xCC6B,0xCC6C,0xCC6D,0xCC6E,/* 0x70-0x77 */ + 0xCC6F,0xCC71,0xCC72,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCC73,0xCC74,0xCC76,0xCC77,0xCC78,0xCC79,0xCC7A,/* 0x80-0x87 */ + 0xCC7B,0xCC7C,0xCC7D,0xCC7E,0xCC7F,0xCC80,0xCC81,0xCC82,/* 0x88-0x8F */ + 0xCC83,0xCC84,0xCC85,0xCC86,0xCC87,0xCC88,0xCC89,0xCC8A,/* 0x90-0x97 */ + 0xCC8B,0xCC8C,0xCC8D,0xCC8E,0xCC8F,0xCC90,0xCC91,0xCC92,/* 0x98-0x9F */ + 0xCC93,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */ + 0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */ + 0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */ + 0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */ + 0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */ + 0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */ + 0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */ + 0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */ + 0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */ + 0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */ + 0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCC94,0xCC95,0xCC96,0xCC97,0xCC9A,0xCC9B,0xCC9D,/* 0x40-0x47 */ + 0xCC9E,0xCC9F,0xCCA1,0xCCA2,0xCCA3,0xCCA4,0xCCA5,0xCCA6,/* 0x48-0x4F */ + 0xCCA7,0xCCAA,0xCCAE,0xCCAF,0xCCB0,0xCCB1,0xCCB2,0xCCB3,/* 0x50-0x57 */ + 0xCCB6,0xCCB7,0xCCB9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCCBA,0xCCBB,0xCCBD,0xCCBE,0xCCBF,0xCCC0,0xCCC1,/* 0x60-0x67 */ + 0xCCC2,0xCCC3,0xCCC6,0xCCC8,0xCCCA,0xCCCB,0xCCCC,0xCCCD,/* 0x68-0x6F */ + 0xCCCE,0xCCCF,0xCCD1,0xCCD2,0xCCD3,0xCCD5,0xCCD6,0xCCD7,/* 0x70-0x77 */ + 0xCCD8,0xCCD9,0xCCDA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCCDB,0xCCDC,0xCCDD,0xCCDE,0xCCDF,0xCCE0,0xCCE1,/* 0x80-0x87 */ + 0xCCE2,0xCCE3,0xCCE5,0xCCE6,0xCCE7,0xCCE8,0xCCE9,0xCCEA,/* 0x88-0x8F */ + 0xCCEB,0xCCED,0xCCEE,0xCCEF,0xCCF1,0xCCF2,0xCCF3,0xCCF4,/* 0x90-0x97 */ + 0xCCF5,0xCCF6,0xCCF7,0xCCF8,0xCCF9,0xCCFA,0xCCFB,0xCCFC,/* 0x98-0x9F */ + 0xCCFD,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */ + 0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */ + 0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */ + 0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */ + 0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */ + 0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */ + 0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */ + 0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */ + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCCFE,0xCCFF,0xCD00,0xCD02,0xCD03,0xCD04,0xCD05,/* 0x40-0x47 */ + 0xCD06,0xCD07,0xCD0A,0xCD0B,0xCD0D,0xCD0E,0xCD0F,0xCD11,/* 0x48-0x4F */ + 0xCD12,0xCD13,0xCD14,0xCD15,0xCD16,0xCD17,0xCD1A,0xCD1C,/* 0x50-0x57 */ + 0xCD1E,0xCD1F,0xCD20,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCD21,0xCD22,0xCD23,0xCD25,0xCD26,0xCD27,0xCD29,/* 0x60-0x67 */ + 0xCD2A,0xCD2B,0xCD2D,0xCD2E,0xCD2F,0xCD30,0xCD31,0xCD32,/* 0x68-0x6F */ + 0xCD33,0xCD34,0xCD35,0xCD36,0xCD37,0xCD38,0xCD3A,0xCD3B,/* 0x70-0x77 */ + 0xCD3C,0xCD3D,0xCD3E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCD3F,0xCD40,0xCD41,0xCD42,0xCD43,0xCD44,0xCD45,/* 0x80-0x87 */ + 0xCD46,0xCD47,0xCD48,0xCD49,0xCD4A,0xCD4B,0xCD4C,0xCD4D,/* 0x88-0x8F */ + 0xCD4E,0xCD4F,0xCD50,0xCD51,0xCD52,0xCD53,0xCD54,0xCD55,/* 0x90-0x97 */ + 0xCD56,0xCD57,0xCD58,0xCD59,0xCD5A,0xCD5B,0xCD5D,0xCD5E,/* 0x98-0x9F */ + 0xCD5F,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */ + 0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */ + 0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */ + 0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */ + 0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */ + 0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */ + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCD61,0xCD62,0xCD63,0xCD65,0xCD66,0xCD67,0xCD68,/* 0x40-0x47 */ + 0xCD69,0xCD6A,0xCD6B,0xCD6E,0xCD70,0xCD72,0xCD73,0xCD74,/* 0x48-0x4F */ + 0xCD75,0xCD76,0xCD77,0xCD79,0xCD7A,0xCD7B,0xCD7C,0xCD7D,/* 0x50-0x57 */ + 0xCD7E,0xCD7F,0xCD80,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCD81,0xCD82,0xCD83,0xCD84,0xCD85,0xCD86,0xCD87,/* 0x60-0x67 */ + 0xCD89,0xCD8A,0xCD8B,0xCD8C,0xCD8D,0xCD8E,0xCD8F,0xCD90,/* 0x68-0x6F */ + 0xCD91,0xCD92,0xCD93,0xCD96,0xCD97,0xCD99,0xCD9A,0xCD9B,/* 0x70-0x77 */ + 0xCD9D,0xCD9E,0xCD9F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCDA0,0xCDA1,0xCDA2,0xCDA3,0xCDA6,0xCDA8,0xCDAA,/* 0x80-0x87 */ + 0xCDAB,0xCDAC,0xCDAD,0xCDAE,0xCDAF,0xCDB1,0xCDB2,0xCDB3,/* 0x88-0x8F */ + 0xCDB4,0xCDB5,0xCDB6,0xCDB7,0xCDB8,0xCDB9,0xCDBA,0xCDBB,/* 0x90-0x97 */ + 0xCDBC,0xCDBD,0xCDBE,0xCDBF,0xCDC0,0xCDC1,0xCDC2,0xCDC3,/* 0x98-0x9F */ + 0xCDC5,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCDC6,0xCDC7,0xCDC8,0xCDC9,0xCDCA,0xCDCB,0xCDCD,/* 0x40-0x47 */ + 0xCDCE,0xCDCF,0xCDD1,0xCDD2,0xCDD3,0xCDD4,0xCDD5,0xCDD6,/* 0x48-0x4F */ + 0xCDD7,0xCDD8,0xCDD9,0xCDDA,0xCDDB,0xCDDC,0xCDDD,0xCDDE,/* 0x50-0x57 */ + 0xCDDF,0xCDE0,0xCDE1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCDE2,0xCDE3,0xCDE4,0xCDE5,0xCDE6,0xCDE7,0xCDE9,/* 0x60-0x67 */ + 0xCDEA,0xCDEB,0xCDED,0xCDEE,0xCDEF,0xCDF1,0xCDF2,0xCDF3,/* 0x68-0x6F */ + 0xCDF4,0xCDF5,0xCDF6,0xCDF7,0xCDFA,0xCDFC,0xCDFE,0xCDFF,/* 0x70-0x77 */ + 0xCE00,0xCE01,0xCE02,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCE03,0xCE05,0xCE06,0xCE07,0xCE09,0xCE0A,0xCE0B,/* 0x80-0x87 */ + 0xCE0D,0xCE0E,0xCE0F,0xCE10,0xCE11,0xCE12,0xCE13,0xCE15,/* 0x88-0x8F */ + 0xCE16,0xCE17,0xCE18,0xCE1A,0xCE1B,0xCE1C,0xCE1D,0xCE1E,/* 0x90-0x97 */ + 0xCE1F,0xCE22,0xCE23,0xCE25,0xCE26,0xCE27,0xCE29,0xCE2A,/* 0x98-0x9F */ + 0xCE2B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCE2C,0xCE2D,0xCE2E,0xCE2F,0xCE32,0xCE34,0xCE36,/* 0x40-0x47 */ + 0xCE37,0xCE38,0xCE39,0xCE3A,0xCE3B,0xCE3C,0xCE3D,0xCE3E,/* 0x48-0x4F */ + 0xCE3F,0xCE40,0xCE41,0xCE42,0xCE43,0xCE44,0xCE45,0xCE46,/* 0x50-0x57 */ + 0xCE47,0xCE48,0xCE49,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCE4A,0xCE4B,0xCE4C,0xCE4D,0xCE4E,0xCE4F,0xCE50,/* 0x60-0x67 */ + 0xCE51,0xCE52,0xCE53,0xCE54,0xCE55,0xCE56,0xCE57,0xCE5A,/* 0x68-0x6F */ + 0xCE5B,0xCE5D,0xCE5E,0xCE62,0xCE63,0xCE64,0xCE65,0xCE66,/* 0x70-0x77 */ + 0xCE67,0xCE6A,0xCE6C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCE6E,0xCE6F,0xCE70,0xCE71,0xCE72,0xCE73,0xCE76,/* 0x80-0x87 */ + 0xCE77,0xCE79,0xCE7A,0xCE7B,0xCE7D,0xCE7E,0xCE7F,0xCE80,/* 0x88-0x8F */ + 0xCE81,0xCE82,0xCE83,0xCE86,0xCE88,0xCE8A,0xCE8B,0xCE8C,/* 0x90-0x97 */ + 0xCE8D,0xCE8E,0xCE8F,0xCE92,0xCE93,0xCE95,0xCE96,0xCE97,/* 0x98-0x9F */ + 0xCE99,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCE9A,0xCE9B,0xCE9C,0xCE9D,0xCE9E,0xCE9F,0xCEA2,/* 0x40-0x47 */ + 0xCEA6,0xCEA7,0xCEA8,0xCEA9,0xCEAA,0xCEAB,0xCEAE,0xCEAF,/* 0x48-0x4F */ + 0xCEB0,0xCEB1,0xCEB2,0xCEB3,0xCEB4,0xCEB5,0xCEB6,0xCEB7,/* 0x50-0x57 */ + 0xCEB8,0xCEB9,0xCEBA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCEBB,0xCEBC,0xCEBD,0xCEBE,0xCEBF,0xCEC0,0xCEC2,/* 0x60-0x67 */ + 0xCEC3,0xCEC4,0xCEC5,0xCEC6,0xCEC7,0xCEC8,0xCEC9,0xCECA,/* 0x68-0x6F */ + 0xCECB,0xCECC,0xCECD,0xCECE,0xCECF,0xCED0,0xCED1,0xCED2,/* 0x70-0x77 */ + 0xCED3,0xCED4,0xCED5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCED6,0xCED7,0xCED8,0xCED9,0xCEDA,0xCEDB,0xCEDC,/* 0x80-0x87 */ + 0xCEDD,0xCEDE,0xCEDF,0xCEE0,0xCEE1,0xCEE2,0xCEE3,0xCEE6,/* 0x88-0x8F */ + 0xCEE7,0xCEE9,0xCEEA,0xCEED,0xCEEE,0xCEEF,0xCEF0,0xCEF1,/* 0x90-0x97 */ + 0xCEF2,0xCEF3,0xCEF6,0xCEFA,0xCEFB,0xCEFC,0xCEFD,0xCEFE,/* 0x98-0x9F */ + 0xCEFF,0xAC00,0xAC01,0xAC04,0xAC07,0xAC08,0xAC09,0xAC0A,/* 0xA0-0xA7 */ + 0xAC10,0xAC11,0xAC12,0xAC13,0xAC14,0xAC15,0xAC16,0xAC17,/* 0xA8-0xAF */ + 0xAC19,0xAC1A,0xAC1B,0xAC1C,0xAC1D,0xAC20,0xAC24,0xAC2C,/* 0xB0-0xB7 */ + 0xAC2D,0xAC2F,0xAC30,0xAC31,0xAC38,0xAC39,0xAC3C,0xAC40,/* 0xB8-0xBF */ + 0xAC4B,0xAC4D,0xAC54,0xAC58,0xAC5C,0xAC70,0xAC71,0xAC74,/* 0xC0-0xC7 */ + 0xAC77,0xAC78,0xAC7A,0xAC80,0xAC81,0xAC83,0xAC84,0xAC85,/* 0xC8-0xCF */ + 0xAC86,0xAC89,0xAC8A,0xAC8B,0xAC8C,0xAC90,0xAC94,0xAC9C,/* 0xD0-0xD7 */ + 0xAC9D,0xAC9F,0xACA0,0xACA1,0xACA8,0xACA9,0xACAA,0xACAC,/* 0xD8-0xDF */ + 0xACAF,0xACB0,0xACB8,0xACB9,0xACBB,0xACBC,0xACBD,0xACC1,/* 0xE0-0xE7 */ + 0xACC4,0xACC8,0xACCC,0xACD5,0xACD7,0xACE0,0xACE1,0xACE4,/* 0xE8-0xEF */ + 0xACE7,0xACE8,0xACEA,0xACEC,0xACEF,0xACF0,0xACF1,0xACF3,/* 0xF0-0xF7 */ + 0xACF5,0xACF6,0xACFC,0xACFD,0xAD00,0xAD04,0xAD06,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCF02,0xCF03,0xCF05,0xCF06,0xCF07,0xCF09,0xCF0A,/* 0x40-0x47 */ + 0xCF0B,0xCF0C,0xCF0D,0xCF0E,0xCF0F,0xCF12,0xCF14,0xCF16,/* 0x48-0x4F */ + 0xCF17,0xCF18,0xCF19,0xCF1A,0xCF1B,0xCF1D,0xCF1E,0xCF1F,/* 0x50-0x57 */ + 0xCF21,0xCF22,0xCF23,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCF25,0xCF26,0xCF27,0xCF28,0xCF29,0xCF2A,0xCF2B,/* 0x60-0x67 */ + 0xCF2E,0xCF32,0xCF33,0xCF34,0xCF35,0xCF36,0xCF37,0xCF39,/* 0x68-0x6F */ + 0xCF3A,0xCF3B,0xCF3C,0xCF3D,0xCF3E,0xCF3F,0xCF40,0xCF41,/* 0x70-0x77 */ + 0xCF42,0xCF43,0xCF44,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCF45,0xCF46,0xCF47,0xCF48,0xCF49,0xCF4A,0xCF4B,/* 0x80-0x87 */ + 0xCF4C,0xCF4D,0xCF4E,0xCF4F,0xCF50,0xCF51,0xCF52,0xCF53,/* 0x88-0x8F */ + 0xCF56,0xCF57,0xCF59,0xCF5A,0xCF5B,0xCF5D,0xCF5E,0xCF5F,/* 0x90-0x97 */ + 0xCF60,0xCF61,0xCF62,0xCF63,0xCF66,0xCF68,0xCF6A,0xCF6B,/* 0x98-0x9F */ + 0xCF6C,0xAD0C,0xAD0D,0xAD0F,0xAD11,0xAD18,0xAD1C,0xAD20,/* 0xA0-0xA7 */ + 0xAD29,0xAD2C,0xAD2D,0xAD34,0xAD35,0xAD38,0xAD3C,0xAD44,/* 0xA8-0xAF */ + 0xAD45,0xAD47,0xAD49,0xAD50,0xAD54,0xAD58,0xAD61,0xAD63,/* 0xB0-0xB7 */ + 0xAD6C,0xAD6D,0xAD70,0xAD73,0xAD74,0xAD75,0xAD76,0xAD7B,/* 0xB8-0xBF */ + 0xAD7C,0xAD7D,0xAD7F,0xAD81,0xAD82,0xAD88,0xAD89,0xAD8C,/* 0xC0-0xC7 */ + 0xAD90,0xAD9C,0xAD9D,0xADA4,0xADB7,0xADC0,0xADC1,0xADC4,/* 0xC8-0xCF */ + 0xADC8,0xADD0,0xADD1,0xADD3,0xADDC,0xADE0,0xADE4,0xADF8,/* 0xD0-0xD7 */ + 0xADF9,0xADFC,0xADFF,0xAE00,0xAE01,0xAE08,0xAE09,0xAE0B,/* 0xD8-0xDF */ + 0xAE0D,0xAE14,0xAE30,0xAE31,0xAE34,0xAE37,0xAE38,0xAE3A,/* 0xE0-0xE7 */ + 0xAE40,0xAE41,0xAE43,0xAE45,0xAE46,0xAE4A,0xAE4C,0xAE4D,/* 0xE8-0xEF */ + 0xAE4E,0xAE50,0xAE54,0xAE56,0xAE5C,0xAE5D,0xAE5F,0xAE60,/* 0xF0-0xF7 */ + 0xAE61,0xAE65,0xAE68,0xAE69,0xAE6C,0xAE70,0xAE78,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCF6D,0xCF6E,0xCF6F,0xCF72,0xCF73,0xCF75,0xCF76,/* 0x40-0x47 */ + 0xCF77,0xCF79,0xCF7A,0xCF7B,0xCF7C,0xCF7D,0xCF7E,0xCF7F,/* 0x48-0x4F */ + 0xCF81,0xCF82,0xCF83,0xCF84,0xCF86,0xCF87,0xCF88,0xCF89,/* 0x50-0x57 */ + 0xCF8A,0xCF8B,0xCF8D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCF8E,0xCF8F,0xCF90,0xCF91,0xCF92,0xCF93,0xCF94,/* 0x60-0x67 */ + 0xCF95,0xCF96,0xCF97,0xCF98,0xCF99,0xCF9A,0xCF9B,0xCF9C,/* 0x68-0x6F */ + 0xCF9D,0xCF9E,0xCF9F,0xCFA0,0xCFA2,0xCFA3,0xCFA4,0xCFA5,/* 0x70-0x77 */ + 0xCFA6,0xCFA7,0xCFA9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCFAA,0xCFAB,0xCFAC,0xCFAD,0xCFAE,0xCFAF,0xCFB1,/* 0x80-0x87 */ + 0xCFB2,0xCFB3,0xCFB4,0xCFB5,0xCFB6,0xCFB7,0xCFB8,0xCFB9,/* 0x88-0x8F */ + 0xCFBA,0xCFBB,0xCFBC,0xCFBD,0xCFBE,0xCFBF,0xCFC0,0xCFC1,/* 0x90-0x97 */ + 0xCFC2,0xCFC3,0xCFC5,0xCFC6,0xCFC7,0xCFC8,0xCFC9,0xCFCA,/* 0x98-0x9F */ + 0xCFCB,0xAE79,0xAE7B,0xAE7C,0xAE7D,0xAE84,0xAE85,0xAE8C,/* 0xA0-0xA7 */ + 0xAEBC,0xAEBD,0xAEBE,0xAEC0,0xAEC4,0xAECC,0xAECD,0xAECF,/* 0xA8-0xAF */ + 0xAED0,0xAED1,0xAED8,0xAED9,0xAEDC,0xAEE8,0xAEEB,0xAEED,/* 0xB0-0xB7 */ + 0xAEF4,0xAEF8,0xAEFC,0xAF07,0xAF08,0xAF0D,0xAF10,0xAF2C,/* 0xB8-0xBF */ + 0xAF2D,0xAF30,0xAF32,0xAF34,0xAF3C,0xAF3D,0xAF3F,0xAF41,/* 0xC0-0xC7 */ + 0xAF42,0xAF43,0xAF48,0xAF49,0xAF50,0xAF5C,0xAF5D,0xAF64,/* 0xC8-0xCF */ + 0xAF65,0xAF79,0xAF80,0xAF84,0xAF88,0xAF90,0xAF91,0xAF95,/* 0xD0-0xD7 */ + 0xAF9C,0xAFB8,0xAFB9,0xAFBC,0xAFC0,0xAFC7,0xAFC8,0xAFC9,/* 0xD8-0xDF */ + 0xAFCB,0xAFCD,0xAFCE,0xAFD4,0xAFDC,0xAFE8,0xAFE9,0xAFF0,/* 0xE0-0xE7 */ + 0xAFF1,0xAFF4,0xAFF8,0xB000,0xB001,0xB004,0xB00C,0xB010,/* 0xE8-0xEF */ + 0xB014,0xB01C,0xB01D,0xB028,0xB044,0xB045,0xB048,0xB04A,/* 0xF0-0xF7 */ + 0xB04C,0xB04E,0xB053,0xB054,0xB055,0xB057,0xB059,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCFCC,0xCFCD,0xCFCE,0xCFCF,0xCFD0,0xCFD1,0xCFD2,/* 0x40-0x47 */ + 0xCFD3,0xCFD4,0xCFD5,0xCFD6,0xCFD7,0xCFD8,0xCFD9,0xCFDA,/* 0x48-0x4F */ + 0xCFDB,0xCFDC,0xCFDD,0xCFDE,0xCFDF,0xCFE2,0xCFE3,0xCFE5,/* 0x50-0x57 */ + 0xCFE6,0xCFE7,0xCFE9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCFEA,0xCFEB,0xCFEC,0xCFED,0xCFEE,0xCFEF,0xCFF2,/* 0x60-0x67 */ + 0xCFF4,0xCFF6,0xCFF7,0xCFF8,0xCFF9,0xCFFA,0xCFFB,0xCFFD,/* 0x68-0x6F */ + 0xCFFE,0xCFFF,0xD001,0xD002,0xD003,0xD005,0xD006,0xD007,/* 0x70-0x77 */ + 0xD008,0xD009,0xD00A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD00B,0xD00C,0xD00D,0xD00E,0xD00F,0xD010,0xD012,/* 0x80-0x87 */ + 0xD013,0xD014,0xD015,0xD016,0xD017,0xD019,0xD01A,0xD01B,/* 0x88-0x8F */ + 0xD01C,0xD01D,0xD01E,0xD01F,0xD020,0xD021,0xD022,0xD023,/* 0x90-0x97 */ + 0xD024,0xD025,0xD026,0xD027,0xD028,0xD029,0xD02A,0xD02B,/* 0x98-0x9F */ + 0xD02C,0xB05D,0xB07C,0xB07D,0xB080,0xB084,0xB08C,0xB08D,/* 0xA0-0xA7 */ + 0xB08F,0xB091,0xB098,0xB099,0xB09A,0xB09C,0xB09F,0xB0A0,/* 0xA8-0xAF */ + 0xB0A1,0xB0A2,0xB0A8,0xB0A9,0xB0AB,0xB0AC,0xB0AD,0xB0AE,/* 0xB0-0xB7 */ + 0xB0AF,0xB0B1,0xB0B3,0xB0B4,0xB0B5,0xB0B8,0xB0BC,0xB0C4,/* 0xB8-0xBF */ + 0xB0C5,0xB0C7,0xB0C8,0xB0C9,0xB0D0,0xB0D1,0xB0D4,0xB0D8,/* 0xC0-0xC7 */ + 0xB0E0,0xB0E5,0xB108,0xB109,0xB10B,0xB10C,0xB110,0xB112,/* 0xC8-0xCF */ + 0xB113,0xB118,0xB119,0xB11B,0xB11C,0xB11D,0xB123,0xB124,/* 0xD0-0xD7 */ + 0xB125,0xB128,0xB12C,0xB134,0xB135,0xB137,0xB138,0xB139,/* 0xD8-0xDF */ + 0xB140,0xB141,0xB144,0xB148,0xB150,0xB151,0xB154,0xB155,/* 0xE0-0xE7 */ + 0xB158,0xB15C,0xB160,0xB178,0xB179,0xB17C,0xB180,0xB182,/* 0xE8-0xEF */ + 0xB188,0xB189,0xB18B,0xB18D,0xB192,0xB193,0xB194,0xB198,/* 0xF0-0xF7 */ + 0xB19C,0xB1A8,0xB1CC,0xB1D0,0xB1D4,0xB1DC,0xB1DD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD02E,0xD02F,0xD030,0xD031,0xD032,0xD033,0xD036,/* 0x40-0x47 */ + 0xD037,0xD039,0xD03A,0xD03B,0xD03D,0xD03E,0xD03F,0xD040,/* 0x48-0x4F */ + 0xD041,0xD042,0xD043,0xD046,0xD048,0xD04A,0xD04B,0xD04C,/* 0x50-0x57 */ + 0xD04D,0xD04E,0xD04F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD051,0xD052,0xD053,0xD055,0xD056,0xD057,0xD059,/* 0x60-0x67 */ + 0xD05A,0xD05B,0xD05C,0xD05D,0xD05E,0xD05F,0xD061,0xD062,/* 0x68-0x6F */ + 0xD063,0xD064,0xD065,0xD066,0xD067,0xD068,0xD069,0xD06A,/* 0x70-0x77 */ + 0xD06B,0xD06E,0xD06F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD071,0xD072,0xD073,0xD075,0xD076,0xD077,0xD078,/* 0x80-0x87 */ + 0xD079,0xD07A,0xD07B,0xD07E,0xD07F,0xD080,0xD082,0xD083,/* 0x88-0x8F */ + 0xD084,0xD085,0xD086,0xD087,0xD088,0xD089,0xD08A,0xD08B,/* 0x90-0x97 */ + 0xD08C,0xD08D,0xD08E,0xD08F,0xD090,0xD091,0xD092,0xD093,/* 0x98-0x9F */ + 0xD094,0xB1DF,0xB1E8,0xB1E9,0xB1EC,0xB1F0,0xB1F9,0xB1FB,/* 0xA0-0xA7 */ + 0xB1FD,0xB204,0xB205,0xB208,0xB20B,0xB20C,0xB214,0xB215,/* 0xA8-0xAF */ + 0xB217,0xB219,0xB220,0xB234,0xB23C,0xB258,0xB25C,0xB260,/* 0xB0-0xB7 */ + 0xB268,0xB269,0xB274,0xB275,0xB27C,0xB284,0xB285,0xB289,/* 0xB8-0xBF */ + 0xB290,0xB291,0xB294,0xB298,0xB299,0xB29A,0xB2A0,0xB2A1,/* 0xC0-0xC7 */ + 0xB2A3,0xB2A5,0xB2A6,0xB2AA,0xB2AC,0xB2B0,0xB2B4,0xB2C8,/* 0xC8-0xCF */ + 0xB2C9,0xB2CC,0xB2D0,0xB2D2,0xB2D8,0xB2D9,0xB2DB,0xB2DD,/* 0xD0-0xD7 */ + 0xB2E2,0xB2E4,0xB2E5,0xB2E6,0xB2E8,0xB2EB,0xB2EC,0xB2ED,/* 0xD8-0xDF */ + 0xB2EE,0xB2EF,0xB2F3,0xB2F4,0xB2F5,0xB2F7,0xB2F8,0xB2F9,/* 0xE0-0xE7 */ + 0xB2FA,0xB2FB,0xB2FF,0xB300,0xB301,0xB304,0xB308,0xB310,/* 0xE8-0xEF */ + 0xB311,0xB313,0xB314,0xB315,0xB31C,0xB354,0xB355,0xB356,/* 0xF0-0xF7 */ + 0xB358,0xB35B,0xB35C,0xB35E,0xB35F,0xB364,0xB365,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD095,0xD096,0xD097,0xD098,0xD099,0xD09A,0xD09B,/* 0x40-0x47 */ + 0xD09C,0xD09D,0xD09E,0xD09F,0xD0A0,0xD0A1,0xD0A2,0xD0A3,/* 0x48-0x4F */ + 0xD0A6,0xD0A7,0xD0A9,0xD0AA,0xD0AB,0xD0AD,0xD0AE,0xD0AF,/* 0x50-0x57 */ + 0xD0B0,0xD0B1,0xD0B2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD0B3,0xD0B6,0xD0B8,0xD0BA,0xD0BB,0xD0BC,0xD0BD,/* 0x60-0x67 */ + 0xD0BE,0xD0BF,0xD0C2,0xD0C3,0xD0C5,0xD0C6,0xD0C7,0xD0CA,/* 0x68-0x6F */ + 0xD0CB,0xD0CC,0xD0CD,0xD0CE,0xD0CF,0xD0D2,0xD0D6,0xD0D7,/* 0x70-0x77 */ + 0xD0D8,0xD0D9,0xD0DA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD0DB,0xD0DE,0xD0DF,0xD0E1,0xD0E2,0xD0E3,0xD0E5,/* 0x80-0x87 */ + 0xD0E6,0xD0E7,0xD0E8,0xD0E9,0xD0EA,0xD0EB,0xD0EE,0xD0F2,/* 0x88-0x8F */ + 0xD0F3,0xD0F4,0xD0F5,0xD0F6,0xD0F7,0xD0F9,0xD0FA,0xD0FB,/* 0x90-0x97 */ + 0xD0FC,0xD0FD,0xD0FE,0xD0FF,0xD100,0xD101,0xD102,0xD103,/* 0x98-0x9F */ + 0xD104,0xB367,0xB369,0xB36B,0xB36E,0xB370,0xB371,0xB374,/* 0xA0-0xA7 */ + 0xB378,0xB380,0xB381,0xB383,0xB384,0xB385,0xB38C,0xB390,/* 0xA8-0xAF */ + 0xB394,0xB3A0,0xB3A1,0xB3A8,0xB3AC,0xB3C4,0xB3C5,0xB3C8,/* 0xB0-0xB7 */ + 0xB3CB,0xB3CC,0xB3CE,0xB3D0,0xB3D4,0xB3D5,0xB3D7,0xB3D9,/* 0xB8-0xBF */ + 0xB3DB,0xB3DD,0xB3E0,0xB3E4,0xB3E8,0xB3FC,0xB410,0xB418,/* 0xC0-0xC7 */ + 0xB41C,0xB420,0xB428,0xB429,0xB42B,0xB434,0xB450,0xB451,/* 0xC8-0xCF */ + 0xB454,0xB458,0xB460,0xB461,0xB463,0xB465,0xB46C,0xB480,/* 0xD0-0xD7 */ + 0xB488,0xB49D,0xB4A4,0xB4A8,0xB4AC,0xB4B5,0xB4B7,0xB4B9,/* 0xD8-0xDF */ + 0xB4C0,0xB4C4,0xB4C8,0xB4D0,0xB4D5,0xB4DC,0xB4DD,0xB4E0,/* 0xE0-0xE7 */ + 0xB4E3,0xB4E4,0xB4E6,0xB4EC,0xB4ED,0xB4EF,0xB4F1,0xB4F8,/* 0xE8-0xEF */ + 0xB514,0xB515,0xB518,0xB51B,0xB51C,0xB524,0xB525,0xB527,/* 0xF0-0xF7 */ + 0xB528,0xB529,0xB52A,0xB530,0xB531,0xB534,0xB538,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD105,0xD106,0xD107,0xD108,0xD109,0xD10A,0xD10B,/* 0x40-0x47 */ + 0xD10C,0xD10E,0xD10F,0xD110,0xD111,0xD112,0xD113,0xD114,/* 0x48-0x4F */ + 0xD115,0xD116,0xD117,0xD118,0xD119,0xD11A,0xD11B,0xD11C,/* 0x50-0x57 */ + 0xD11D,0xD11E,0xD11F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD120,0xD121,0xD122,0xD123,0xD124,0xD125,0xD126,/* 0x60-0x67 */ + 0xD127,0xD128,0xD129,0xD12A,0xD12B,0xD12C,0xD12D,0xD12E,/* 0x68-0x6F */ + 0xD12F,0xD132,0xD133,0xD135,0xD136,0xD137,0xD139,0xD13B,/* 0x70-0x77 */ + 0xD13C,0xD13D,0xD13E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD13F,0xD142,0xD146,0xD147,0xD148,0xD149,0xD14A,/* 0x80-0x87 */ + 0xD14B,0xD14E,0xD14F,0xD151,0xD152,0xD153,0xD155,0xD156,/* 0x88-0x8F */ + 0xD157,0xD158,0xD159,0xD15A,0xD15B,0xD15E,0xD160,0xD162,/* 0x90-0x97 */ + 0xD163,0xD164,0xD165,0xD166,0xD167,0xD169,0xD16A,0xD16B,/* 0x98-0x9F */ + 0xD16D,0xB540,0xB541,0xB543,0xB544,0xB545,0xB54B,0xB54C,/* 0xA0-0xA7 */ + 0xB54D,0xB550,0xB554,0xB55C,0xB55D,0xB55F,0xB560,0xB561,/* 0xA8-0xAF */ + 0xB5A0,0xB5A1,0xB5A4,0xB5A8,0xB5AA,0xB5AB,0xB5B0,0xB5B1,/* 0xB0-0xB7 */ + 0xB5B3,0xB5B4,0xB5B5,0xB5BB,0xB5BC,0xB5BD,0xB5C0,0xB5C4,/* 0xB8-0xBF */ + 0xB5CC,0xB5CD,0xB5CF,0xB5D0,0xB5D1,0xB5D8,0xB5EC,0xB610,/* 0xC0-0xC7 */ + 0xB611,0xB614,0xB618,0xB625,0xB62C,0xB634,0xB648,0xB664,/* 0xC8-0xCF */ + 0xB668,0xB69C,0xB69D,0xB6A0,0xB6A4,0xB6AB,0xB6AC,0xB6B1,/* 0xD0-0xD7 */ + 0xB6D4,0xB6F0,0xB6F4,0xB6F8,0xB700,0xB701,0xB705,0xB728,/* 0xD8-0xDF */ + 0xB729,0xB72C,0xB72F,0xB730,0xB738,0xB739,0xB73B,0xB744,/* 0xE0-0xE7 */ + 0xB748,0xB74C,0xB754,0xB755,0xB760,0xB764,0xB768,0xB770,/* 0xE8-0xEF */ + 0xB771,0xB773,0xB775,0xB77C,0xB77D,0xB780,0xB784,0xB78C,/* 0xF0-0xF7 */ + 0xB78D,0xB78F,0xB790,0xB791,0xB792,0xB796,0xB797,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD16E,0xD16F,0xD170,0xD171,0xD172,0xD173,0xD174,/* 0x40-0x47 */ + 0xD175,0xD176,0xD177,0xD178,0xD179,0xD17A,0xD17B,0xD17D,/* 0x48-0x4F */ + 0xD17E,0xD17F,0xD180,0xD181,0xD182,0xD183,0xD185,0xD186,/* 0x50-0x57 */ + 0xD187,0xD189,0xD18A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD18B,0xD18C,0xD18D,0xD18E,0xD18F,0xD190,0xD191,/* 0x60-0x67 */ + 0xD192,0xD193,0xD194,0xD195,0xD196,0xD197,0xD198,0xD199,/* 0x68-0x6F */ + 0xD19A,0xD19B,0xD19C,0xD19D,0xD19E,0xD19F,0xD1A2,0xD1A3,/* 0x70-0x77 */ + 0xD1A5,0xD1A6,0xD1A7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD1A9,0xD1AA,0xD1AB,0xD1AC,0xD1AD,0xD1AE,0xD1AF,/* 0x80-0x87 */ + 0xD1B2,0xD1B4,0xD1B6,0xD1B7,0xD1B8,0xD1B9,0xD1BB,0xD1BD,/* 0x88-0x8F */ + 0xD1BE,0xD1BF,0xD1C1,0xD1C2,0xD1C3,0xD1C4,0xD1C5,0xD1C6,/* 0x90-0x97 */ + 0xD1C7,0xD1C8,0xD1C9,0xD1CA,0xD1CB,0xD1CC,0xD1CD,0xD1CE,/* 0x98-0x9F */ + 0xD1CF,0xB798,0xB799,0xB79C,0xB7A0,0xB7A8,0xB7A9,0xB7AB,/* 0xA0-0xA7 */ + 0xB7AC,0xB7AD,0xB7B4,0xB7B5,0xB7B8,0xB7C7,0xB7C9,0xB7EC,/* 0xA8-0xAF */ + 0xB7ED,0xB7F0,0xB7F4,0xB7FC,0xB7FD,0xB7FF,0xB800,0xB801,/* 0xB0-0xB7 */ + 0xB807,0xB808,0xB809,0xB80C,0xB810,0xB818,0xB819,0xB81B,/* 0xB8-0xBF */ + 0xB81D,0xB824,0xB825,0xB828,0xB82C,0xB834,0xB835,0xB837,/* 0xC0-0xC7 */ + 0xB838,0xB839,0xB840,0xB844,0xB851,0xB853,0xB85C,0xB85D,/* 0xC8-0xCF */ + 0xB860,0xB864,0xB86C,0xB86D,0xB86F,0xB871,0xB878,0xB87C,/* 0xD0-0xD7 */ + 0xB88D,0xB8A8,0xB8B0,0xB8B4,0xB8B8,0xB8C0,0xB8C1,0xB8C3,/* 0xD8-0xDF */ + 0xB8C5,0xB8CC,0xB8D0,0xB8D4,0xB8DD,0xB8DF,0xB8E1,0xB8E8,/* 0xE0-0xE7 */ + 0xB8E9,0xB8EC,0xB8F0,0xB8F8,0xB8F9,0xB8FB,0xB8FD,0xB904,/* 0xE8-0xEF */ + 0xB918,0xB920,0xB93C,0xB93D,0xB940,0xB944,0xB94C,0xB94F,/* 0xF0-0xF7 */ + 0xB951,0xB958,0xB959,0xB95C,0xB960,0xB968,0xB969,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD1D0,0xD1D1,0xD1D2,0xD1D3,0xD1D4,0xD1D5,0xD1D6,/* 0x40-0x47 */ + 0xD1D7,0xD1D9,0xD1DA,0xD1DB,0xD1DC,0xD1DD,0xD1DE,0xD1DF,/* 0x48-0x4F */ + 0xD1E0,0xD1E1,0xD1E2,0xD1E3,0xD1E4,0xD1E5,0xD1E6,0xD1E7,/* 0x50-0x57 */ + 0xD1E8,0xD1E9,0xD1EA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD1EB,0xD1EC,0xD1ED,0xD1EE,0xD1EF,0xD1F0,0xD1F1,/* 0x60-0x67 */ + 0xD1F2,0xD1F3,0xD1F5,0xD1F6,0xD1F7,0xD1F9,0xD1FA,0xD1FB,/* 0x68-0x6F */ + 0xD1FC,0xD1FD,0xD1FE,0xD1FF,0xD200,0xD201,0xD202,0xD203,/* 0x70-0x77 */ + 0xD204,0xD205,0xD206,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD208,0xD20A,0xD20B,0xD20C,0xD20D,0xD20E,0xD20F,/* 0x80-0x87 */ + 0xD211,0xD212,0xD213,0xD214,0xD215,0xD216,0xD217,0xD218,/* 0x88-0x8F */ + 0xD219,0xD21A,0xD21B,0xD21C,0xD21D,0xD21E,0xD21F,0xD220,/* 0x90-0x97 */ + 0xD221,0xD222,0xD223,0xD224,0xD225,0xD226,0xD227,0xD228,/* 0x98-0x9F */ + 0xD229,0xB96B,0xB96D,0xB974,0xB975,0xB978,0xB97C,0xB984,/* 0xA0-0xA7 */ + 0xB985,0xB987,0xB989,0xB98A,0xB98D,0xB98E,0xB9AC,0xB9AD,/* 0xA8-0xAF */ + 0xB9B0,0xB9B4,0xB9BC,0xB9BD,0xB9BF,0xB9C1,0xB9C8,0xB9C9,/* 0xB0-0xB7 */ + 0xB9CC,0xB9CE,0xB9CF,0xB9D0,0xB9D1,0xB9D2,0xB9D8,0xB9D9,/* 0xB8-0xBF */ + 0xB9DB,0xB9DD,0xB9DE,0xB9E1,0xB9E3,0xB9E4,0xB9E5,0xB9E8,/* 0xC0-0xC7 */ + 0xB9EC,0xB9F4,0xB9F5,0xB9F7,0xB9F8,0xB9F9,0xB9FA,0xBA00,/* 0xC8-0xCF */ + 0xBA01,0xBA08,0xBA15,0xBA38,0xBA39,0xBA3C,0xBA40,0xBA42,/* 0xD0-0xD7 */ + 0xBA48,0xBA49,0xBA4B,0xBA4D,0xBA4E,0xBA53,0xBA54,0xBA55,/* 0xD8-0xDF */ + 0xBA58,0xBA5C,0xBA64,0xBA65,0xBA67,0xBA68,0xBA69,0xBA70,/* 0xE0-0xE7 */ + 0xBA71,0xBA74,0xBA78,0xBA83,0xBA84,0xBA85,0xBA87,0xBA8C,/* 0xE8-0xEF */ + 0xBAA8,0xBAA9,0xBAAB,0xBAAC,0xBAB0,0xBAB2,0xBAB8,0xBAB9,/* 0xF0-0xF7 */ + 0xBABB,0xBABD,0xBAC4,0xBAC8,0xBAD8,0xBAD9,0xBAFC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD22A,0xD22B,0xD22E,0xD22F,0xD231,0xD232,0xD233,/* 0x40-0x47 */ + 0xD235,0xD236,0xD237,0xD238,0xD239,0xD23A,0xD23B,0xD23E,/* 0x48-0x4F */ + 0xD240,0xD242,0xD243,0xD244,0xD245,0xD246,0xD247,0xD249,/* 0x50-0x57 */ + 0xD24A,0xD24B,0xD24C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD24D,0xD24E,0xD24F,0xD250,0xD251,0xD252,0xD253,/* 0x60-0x67 */ + 0xD254,0xD255,0xD256,0xD257,0xD258,0xD259,0xD25A,0xD25B,/* 0x68-0x6F */ + 0xD25D,0xD25E,0xD25F,0xD260,0xD261,0xD262,0xD263,0xD265,/* 0x70-0x77 */ + 0xD266,0xD267,0xD268,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD269,0xD26A,0xD26B,0xD26C,0xD26D,0xD26E,0xD26F,/* 0x80-0x87 */ + 0xD270,0xD271,0xD272,0xD273,0xD274,0xD275,0xD276,0xD277,/* 0x88-0x8F */ + 0xD278,0xD279,0xD27A,0xD27B,0xD27C,0xD27D,0xD27E,0xD27F,/* 0x90-0x97 */ + 0xD282,0xD283,0xD285,0xD286,0xD287,0xD289,0xD28A,0xD28B,/* 0x98-0x9F */ + 0xD28C,0xBB00,0xBB04,0xBB0D,0xBB0F,0xBB11,0xBB18,0xBB1C,/* 0xA0-0xA7 */ + 0xBB20,0xBB29,0xBB2B,0xBB34,0xBB35,0xBB36,0xBB38,0xBB3B,/* 0xA8-0xAF */ + 0xBB3C,0xBB3D,0xBB3E,0xBB44,0xBB45,0xBB47,0xBB49,0xBB4D,/* 0xB0-0xB7 */ + 0xBB4F,0xBB50,0xBB54,0xBB58,0xBB61,0xBB63,0xBB6C,0xBB88,/* 0xB8-0xBF */ + 0xBB8C,0xBB90,0xBBA4,0xBBA8,0xBBAC,0xBBB4,0xBBB7,0xBBC0,/* 0xC0-0xC7 */ + 0xBBC4,0xBBC8,0xBBD0,0xBBD3,0xBBF8,0xBBF9,0xBBFC,0xBBFF,/* 0xC8-0xCF */ + 0xBC00,0xBC02,0xBC08,0xBC09,0xBC0B,0xBC0C,0xBC0D,0xBC0F,/* 0xD0-0xD7 */ + 0xBC11,0xBC14,0xBC15,0xBC16,0xBC17,0xBC18,0xBC1B,0xBC1C,/* 0xD8-0xDF */ + 0xBC1D,0xBC1E,0xBC1F,0xBC24,0xBC25,0xBC27,0xBC29,0xBC2D,/* 0xE0-0xE7 */ + 0xBC30,0xBC31,0xBC34,0xBC38,0xBC40,0xBC41,0xBC43,0xBC44,/* 0xE8-0xEF */ + 0xBC45,0xBC49,0xBC4C,0xBC4D,0xBC50,0xBC5D,0xBC84,0xBC85,/* 0xF0-0xF7 */ + 0xBC88,0xBC8B,0xBC8C,0xBC8E,0xBC94,0xBC95,0xBC97,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD28D,0xD28E,0xD28F,0xD292,0xD293,0xD294,0xD296,/* 0x40-0x47 */ + 0xD297,0xD298,0xD299,0xD29A,0xD29B,0xD29D,0xD29E,0xD29F,/* 0x48-0x4F */ + 0xD2A1,0xD2A2,0xD2A3,0xD2A5,0xD2A6,0xD2A7,0xD2A8,0xD2A9,/* 0x50-0x57 */ + 0xD2AA,0xD2AB,0xD2AD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD2AE,0xD2AF,0xD2B0,0xD2B2,0xD2B3,0xD2B4,0xD2B5,/* 0x60-0x67 */ + 0xD2B6,0xD2B7,0xD2BA,0xD2BB,0xD2BD,0xD2BE,0xD2C1,0xD2C3,/* 0x68-0x6F */ + 0xD2C4,0xD2C5,0xD2C6,0xD2C7,0xD2CA,0xD2CC,0xD2CD,0xD2CE,/* 0x70-0x77 */ + 0xD2CF,0xD2D0,0xD2D1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD2D2,0xD2D3,0xD2D5,0xD2D6,0xD2D7,0xD2D9,0xD2DA,/* 0x80-0x87 */ + 0xD2DB,0xD2DD,0xD2DE,0xD2DF,0xD2E0,0xD2E1,0xD2E2,0xD2E3,/* 0x88-0x8F */ + 0xD2E6,0xD2E7,0xD2E8,0xD2E9,0xD2EA,0xD2EB,0xD2EC,0xD2ED,/* 0x90-0x97 */ + 0xD2EE,0xD2EF,0xD2F2,0xD2F3,0xD2F5,0xD2F6,0xD2F7,0xD2F9,/* 0x98-0x9F */ + 0xD2FA,0xBC99,0xBC9A,0xBCA0,0xBCA1,0xBCA4,0xBCA7,0xBCA8,/* 0xA0-0xA7 */ + 0xBCB0,0xBCB1,0xBCB3,0xBCB4,0xBCB5,0xBCBC,0xBCBD,0xBCC0,/* 0xA8-0xAF */ + 0xBCC4,0xBCCD,0xBCCF,0xBCD0,0xBCD1,0xBCD5,0xBCD8,0xBCDC,/* 0xB0-0xB7 */ + 0xBCF4,0xBCF5,0xBCF6,0xBCF8,0xBCFC,0xBD04,0xBD05,0xBD07,/* 0xB8-0xBF */ + 0xBD09,0xBD10,0xBD14,0xBD24,0xBD2C,0xBD40,0xBD48,0xBD49,/* 0xC0-0xC7 */ + 0xBD4C,0xBD50,0xBD58,0xBD59,0xBD64,0xBD68,0xBD80,0xBD81,/* 0xC8-0xCF */ + 0xBD84,0xBD87,0xBD88,0xBD89,0xBD8A,0xBD90,0xBD91,0xBD93,/* 0xD0-0xD7 */ + 0xBD95,0xBD99,0xBD9A,0xBD9C,0xBDA4,0xBDB0,0xBDB8,0xBDD4,/* 0xD8-0xDF */ + 0xBDD5,0xBDD8,0xBDDC,0xBDE9,0xBDF0,0xBDF4,0xBDF8,0xBE00,/* 0xE0-0xE7 */ + 0xBE03,0xBE05,0xBE0C,0xBE0D,0xBE10,0xBE14,0xBE1C,0xBE1D,/* 0xE8-0xEF */ + 0xBE1F,0xBE44,0xBE45,0xBE48,0xBE4C,0xBE4E,0xBE54,0xBE55,/* 0xF0-0xF7 */ + 0xBE57,0xBE59,0xBE5A,0xBE5B,0xBE60,0xBE61,0xBE64,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD2FB,0xD2FC,0xD2FD,0xD2FE,0xD2FF,0xD302,0xD304,/* 0x40-0x47 */ + 0xD306,0xD307,0xD308,0xD309,0xD30A,0xD30B,0xD30F,0xD311,/* 0x48-0x4F */ + 0xD312,0xD313,0xD315,0xD317,0xD318,0xD319,0xD31A,0xD31B,/* 0x50-0x57 */ + 0xD31E,0xD322,0xD323,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD324,0xD326,0xD327,0xD32A,0xD32B,0xD32D,0xD32E,/* 0x60-0x67 */ + 0xD32F,0xD331,0xD332,0xD333,0xD334,0xD335,0xD336,0xD337,/* 0x68-0x6F */ + 0xD33A,0xD33E,0xD33F,0xD340,0xD341,0xD342,0xD343,0xD346,/* 0x70-0x77 */ + 0xD347,0xD348,0xD349,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD34A,0xD34B,0xD34C,0xD34D,0xD34E,0xD34F,0xD350,/* 0x80-0x87 */ + 0xD351,0xD352,0xD353,0xD354,0xD355,0xD356,0xD357,0xD358,/* 0x88-0x8F */ + 0xD359,0xD35A,0xD35B,0xD35C,0xD35D,0xD35E,0xD35F,0xD360,/* 0x90-0x97 */ + 0xD361,0xD362,0xD363,0xD364,0xD365,0xD366,0xD367,0xD368,/* 0x98-0x9F */ + 0xD369,0xBE68,0xBE6A,0xBE70,0xBE71,0xBE73,0xBE74,0xBE75,/* 0xA0-0xA7 */ + 0xBE7B,0xBE7C,0xBE7D,0xBE80,0xBE84,0xBE8C,0xBE8D,0xBE8F,/* 0xA8-0xAF */ + 0xBE90,0xBE91,0xBE98,0xBE99,0xBEA8,0xBED0,0xBED1,0xBED4,/* 0xB0-0xB7 */ + 0xBED7,0xBED8,0xBEE0,0xBEE3,0xBEE4,0xBEE5,0xBEEC,0xBF01,/* 0xB8-0xBF */ + 0xBF08,0xBF09,0xBF18,0xBF19,0xBF1B,0xBF1C,0xBF1D,0xBF40,/* 0xC0-0xC7 */ + 0xBF41,0xBF44,0xBF48,0xBF50,0xBF51,0xBF55,0xBF94,0xBFB0,/* 0xC8-0xCF */ + 0xBFC5,0xBFCC,0xBFCD,0xBFD0,0xBFD4,0xBFDC,0xBFDF,0xBFE1,/* 0xD0-0xD7 */ + 0xC03C,0xC051,0xC058,0xC05C,0xC060,0xC068,0xC069,0xC090,/* 0xD8-0xDF */ + 0xC091,0xC094,0xC098,0xC0A0,0xC0A1,0xC0A3,0xC0A5,0xC0AC,/* 0xE0-0xE7 */ + 0xC0AD,0xC0AF,0xC0B0,0xC0B3,0xC0B4,0xC0B5,0xC0B6,0xC0BC,/* 0xE8-0xEF */ + 0xC0BD,0xC0BF,0xC0C0,0xC0C1,0xC0C5,0xC0C8,0xC0C9,0xC0CC,/* 0xF0-0xF7 */ + 0xC0D0,0xC0D8,0xC0D9,0xC0DB,0xC0DC,0xC0DD,0xC0E4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD36A,0xD36B,0xD36C,0xD36D,0xD36E,0xD36F,0xD370,/* 0x40-0x47 */ + 0xD371,0xD372,0xD373,0xD374,0xD375,0xD376,0xD377,0xD378,/* 0x48-0x4F */ + 0xD379,0xD37A,0xD37B,0xD37E,0xD37F,0xD381,0xD382,0xD383,/* 0x50-0x57 */ + 0xD385,0xD386,0xD387,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD388,0xD389,0xD38A,0xD38B,0xD38E,0xD392,0xD393,/* 0x60-0x67 */ + 0xD394,0xD395,0xD396,0xD397,0xD39A,0xD39B,0xD39D,0xD39E,/* 0x68-0x6F */ + 0xD39F,0xD3A1,0xD3A2,0xD3A3,0xD3A4,0xD3A5,0xD3A6,0xD3A7,/* 0x70-0x77 */ + 0xD3AA,0xD3AC,0xD3AE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD3AF,0xD3B0,0xD3B1,0xD3B2,0xD3B3,0xD3B5,0xD3B6,/* 0x80-0x87 */ + 0xD3B7,0xD3B9,0xD3BA,0xD3BB,0xD3BD,0xD3BE,0xD3BF,0xD3C0,/* 0x88-0x8F */ + 0xD3C1,0xD3C2,0xD3C3,0xD3C6,0xD3C7,0xD3CA,0xD3CB,0xD3CC,/* 0x90-0x97 */ + 0xD3CD,0xD3CE,0xD3CF,0xD3D1,0xD3D2,0xD3D3,0xD3D4,0xD3D5,/* 0x98-0x9F */ + 0xD3D6,0xC0E5,0xC0E8,0xC0EC,0xC0F4,0xC0F5,0xC0F7,0xC0F9,/* 0xA0-0xA7 */ + 0xC100,0xC104,0xC108,0xC110,0xC115,0xC11C,0xC11D,0xC11E,/* 0xA8-0xAF */ + 0xC11F,0xC120,0xC123,0xC124,0xC126,0xC127,0xC12C,0xC12D,/* 0xB0-0xB7 */ + 0xC12F,0xC130,0xC131,0xC136,0xC138,0xC139,0xC13C,0xC140,/* 0xB8-0xBF */ + 0xC148,0xC149,0xC14B,0xC14C,0xC14D,0xC154,0xC155,0xC158,/* 0xC0-0xC7 */ + 0xC15C,0xC164,0xC165,0xC167,0xC168,0xC169,0xC170,0xC174,/* 0xC8-0xCF */ + 0xC178,0xC185,0xC18C,0xC18D,0xC18E,0xC190,0xC194,0xC196,/* 0xD0-0xD7 */ + 0xC19C,0xC19D,0xC19F,0xC1A1,0xC1A5,0xC1A8,0xC1A9,0xC1AC,/* 0xD8-0xDF */ + 0xC1B0,0xC1BD,0xC1C4,0xC1C8,0xC1CC,0xC1D4,0xC1D7,0xC1D8,/* 0xE0-0xE7 */ + 0xC1E0,0xC1E4,0xC1E8,0xC1F0,0xC1F1,0xC1F3,0xC1FC,0xC1FD,/* 0xE8-0xEF */ + 0xC200,0xC204,0xC20C,0xC20D,0xC20F,0xC211,0xC218,0xC219,/* 0xF0-0xF7 */ + 0xC21C,0xC21F,0xC220,0xC228,0xC229,0xC22B,0xC22D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD3D7,0xD3D9,0xD3DA,0xD3DB,0xD3DC,0xD3DD,0xD3DE,/* 0x40-0x47 */ + 0xD3DF,0xD3E0,0xD3E2,0xD3E4,0xD3E5,0xD3E6,0xD3E7,0xD3E8,/* 0x48-0x4F */ + 0xD3E9,0xD3EA,0xD3EB,0xD3EE,0xD3EF,0xD3F1,0xD3F2,0xD3F3,/* 0x50-0x57 */ + 0xD3F5,0xD3F6,0xD3F7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD3F8,0xD3F9,0xD3FA,0xD3FB,0xD3FE,0xD400,0xD402,/* 0x60-0x67 */ + 0xD403,0xD404,0xD405,0xD406,0xD407,0xD409,0xD40A,0xD40B,/* 0x68-0x6F */ + 0xD40C,0xD40D,0xD40E,0xD40F,0xD410,0xD411,0xD412,0xD413,/* 0x70-0x77 */ + 0xD414,0xD415,0xD416,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD417,0xD418,0xD419,0xD41A,0xD41B,0xD41C,0xD41E,/* 0x80-0x87 */ + 0xD41F,0xD420,0xD421,0xD422,0xD423,0xD424,0xD425,0xD426,/* 0x88-0x8F */ + 0xD427,0xD428,0xD429,0xD42A,0xD42B,0xD42C,0xD42D,0xD42E,/* 0x90-0x97 */ + 0xD42F,0xD430,0xD431,0xD432,0xD433,0xD434,0xD435,0xD436,/* 0x98-0x9F */ + 0xD437,0xC22F,0xC231,0xC232,0xC234,0xC248,0xC250,0xC251,/* 0xA0-0xA7 */ + 0xC254,0xC258,0xC260,0xC265,0xC26C,0xC26D,0xC270,0xC274,/* 0xA8-0xAF */ + 0xC27C,0xC27D,0xC27F,0xC281,0xC288,0xC289,0xC290,0xC298,/* 0xB0-0xB7 */ + 0xC29B,0xC29D,0xC2A4,0xC2A5,0xC2A8,0xC2AC,0xC2AD,0xC2B4,/* 0xB8-0xBF */ + 0xC2B5,0xC2B7,0xC2B9,0xC2DC,0xC2DD,0xC2E0,0xC2E3,0xC2E4,/* 0xC0-0xC7 */ + 0xC2EB,0xC2EC,0xC2ED,0xC2EF,0xC2F1,0xC2F6,0xC2F8,0xC2F9,/* 0xC8-0xCF */ + 0xC2FB,0xC2FC,0xC300,0xC308,0xC309,0xC30C,0xC30D,0xC313,/* 0xD0-0xD7 */ + 0xC314,0xC315,0xC318,0xC31C,0xC324,0xC325,0xC328,0xC329,/* 0xD8-0xDF */ + 0xC345,0xC368,0xC369,0xC36C,0xC370,0xC372,0xC378,0xC379,/* 0xE0-0xE7 */ + 0xC37C,0xC37D,0xC384,0xC388,0xC38C,0xC3C0,0xC3D8,0xC3D9,/* 0xE8-0xEF */ + 0xC3DC,0xC3DF,0xC3E0,0xC3E2,0xC3E8,0xC3E9,0xC3ED,0xC3F4,/* 0xF0-0xF7 */ + 0xC3F5,0xC3F8,0xC408,0xC410,0xC424,0xC42C,0xC430,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD438,0xD439,0xD43A,0xD43B,0xD43C,0xD43D,0xD43E,/* 0x40-0x47 */ + 0xD43F,0xD441,0xD442,0xD443,0xD445,0xD446,0xD447,0xD448,/* 0x48-0x4F */ + 0xD449,0xD44A,0xD44B,0xD44C,0xD44D,0xD44E,0xD44F,0xD450,/* 0x50-0x57 */ + 0xD451,0xD452,0xD453,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD454,0xD455,0xD456,0xD457,0xD458,0xD459,0xD45A,/* 0x60-0x67 */ + 0xD45B,0xD45D,0xD45E,0xD45F,0xD461,0xD462,0xD463,0xD465,/* 0x68-0x6F */ + 0xD466,0xD467,0xD468,0xD469,0xD46A,0xD46B,0xD46C,0xD46E,/* 0x70-0x77 */ + 0xD470,0xD471,0xD472,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD473,0xD474,0xD475,0xD476,0xD477,0xD47A,0xD47B,/* 0x80-0x87 */ + 0xD47D,0xD47E,0xD481,0xD483,0xD484,0xD485,0xD486,0xD487,/* 0x88-0x8F */ + 0xD48A,0xD48C,0xD48E,0xD48F,0xD490,0xD491,0xD492,0xD493,/* 0x90-0x97 */ + 0xD495,0xD496,0xD497,0xD498,0xD499,0xD49A,0xD49B,0xD49C,/* 0x98-0x9F */ + 0xD49D,0xC434,0xC43C,0xC43D,0xC448,0xC464,0xC465,0xC468,/* 0xA0-0xA7 */ + 0xC46C,0xC474,0xC475,0xC479,0xC480,0xC494,0xC49C,0xC4B8,/* 0xA8-0xAF */ + 0xC4BC,0xC4E9,0xC4F0,0xC4F1,0xC4F4,0xC4F8,0xC4FA,0xC4FF,/* 0xB0-0xB7 */ + 0xC500,0xC501,0xC50C,0xC510,0xC514,0xC51C,0xC528,0xC529,/* 0xB8-0xBF */ + 0xC52C,0xC530,0xC538,0xC539,0xC53B,0xC53D,0xC544,0xC545,/* 0xC0-0xC7 */ + 0xC548,0xC549,0xC54A,0xC54C,0xC54D,0xC54E,0xC553,0xC554,/* 0xC8-0xCF */ + 0xC555,0xC557,0xC558,0xC559,0xC55D,0xC55E,0xC560,0xC561,/* 0xD0-0xD7 */ + 0xC564,0xC568,0xC570,0xC571,0xC573,0xC574,0xC575,0xC57C,/* 0xD8-0xDF */ + 0xC57D,0xC580,0xC584,0xC587,0xC58C,0xC58D,0xC58F,0xC591,/* 0xE0-0xE7 */ + 0xC595,0xC597,0xC598,0xC59C,0xC5A0,0xC5A9,0xC5B4,0xC5B5,/* 0xE8-0xEF */ + 0xC5B8,0xC5B9,0xC5BB,0xC5BC,0xC5BD,0xC5BE,0xC5C4,0xC5C5,/* 0xF0-0xF7 */ + 0xC5C6,0xC5C7,0xC5C8,0xC5C9,0xC5CA,0xC5CC,0xC5CE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD49E,0xD49F,0xD4A0,0xD4A1,0xD4A2,0xD4A3,0xD4A4,/* 0x40-0x47 */ + 0xD4A5,0xD4A6,0xD4A7,0xD4A8,0xD4AA,0xD4AB,0xD4AC,0xD4AD,/* 0x48-0x4F */ + 0xD4AE,0xD4AF,0xD4B0,0xD4B1,0xD4B2,0xD4B3,0xD4B4,0xD4B5,/* 0x50-0x57 */ + 0xD4B6,0xD4B7,0xD4B8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD4B9,0xD4BA,0xD4BB,0xD4BC,0xD4BD,0xD4BE,0xD4BF,/* 0x60-0x67 */ + 0xD4C0,0xD4C1,0xD4C2,0xD4C3,0xD4C4,0xD4C5,0xD4C6,0xD4C7,/* 0x68-0x6F */ + 0xD4C8,0xD4C9,0xD4CA,0xD4CB,0xD4CD,0xD4CE,0xD4CF,0xD4D1,/* 0x70-0x77 */ + 0xD4D2,0xD4D3,0xD4D5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD4D6,0xD4D7,0xD4D8,0xD4D9,0xD4DA,0xD4DB,0xD4DD,/* 0x80-0x87 */ + 0xD4DE,0xD4E0,0xD4E1,0xD4E2,0xD4E3,0xD4E4,0xD4E5,0xD4E6,/* 0x88-0x8F */ + 0xD4E7,0xD4E9,0xD4EA,0xD4EB,0xD4ED,0xD4EE,0xD4EF,0xD4F1,/* 0x90-0x97 */ + 0xD4F2,0xD4F3,0xD4F4,0xD4F5,0xD4F6,0xD4F7,0xD4F9,0xD4FA,/* 0x98-0x9F */ + 0xD4FC,0xC5D0,0xC5D1,0xC5D4,0xC5D8,0xC5E0,0xC5E1,0xC5E3,/* 0xA0-0xA7 */ + 0xC5E5,0xC5EC,0xC5ED,0xC5EE,0xC5F0,0xC5F4,0xC5F6,0xC5F7,/* 0xA8-0xAF */ + 0xC5FC,0xC5FD,0xC5FE,0xC5FF,0xC600,0xC601,0xC605,0xC606,/* 0xB0-0xB7 */ + 0xC607,0xC608,0xC60C,0xC610,0xC618,0xC619,0xC61B,0xC61C,/* 0xB8-0xBF */ + 0xC624,0xC625,0xC628,0xC62C,0xC62D,0xC62E,0xC630,0xC633,/* 0xC0-0xC7 */ + 0xC634,0xC635,0xC637,0xC639,0xC63B,0xC640,0xC641,0xC644,/* 0xC8-0xCF */ + 0xC648,0xC650,0xC651,0xC653,0xC654,0xC655,0xC65C,0xC65D,/* 0xD0-0xD7 */ + 0xC660,0xC66C,0xC66F,0xC671,0xC678,0xC679,0xC67C,0xC680,/* 0xD8-0xDF */ + 0xC688,0xC689,0xC68B,0xC68D,0xC694,0xC695,0xC698,0xC69C,/* 0xE0-0xE7 */ + 0xC6A4,0xC6A5,0xC6A7,0xC6A9,0xC6B0,0xC6B1,0xC6B4,0xC6B8,/* 0xE8-0xEF */ + 0xC6B9,0xC6BA,0xC6C0,0xC6C1,0xC6C3,0xC6C5,0xC6CC,0xC6CD,/* 0xF0-0xF7 */ + 0xC6D0,0xC6D4,0xC6DC,0xC6DD,0xC6E0,0xC6E1,0xC6E8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD4FE,0xD4FF,0xD500,0xD501,0xD502,0xD503,0xD505,/* 0x40-0x47 */ + 0xD506,0xD507,0xD509,0xD50A,0xD50B,0xD50D,0xD50E,0xD50F,/* 0x48-0x4F */ + 0xD510,0xD511,0xD512,0xD513,0xD516,0xD518,0xD519,0xD51A,/* 0x50-0x57 */ + 0xD51B,0xD51C,0xD51D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD51E,0xD51F,0xD520,0xD521,0xD522,0xD523,0xD524,/* 0x60-0x67 */ + 0xD525,0xD526,0xD527,0xD528,0xD529,0xD52A,0xD52B,0xD52C,/* 0x68-0x6F */ + 0xD52D,0xD52E,0xD52F,0xD530,0xD531,0xD532,0xD533,0xD534,/* 0x70-0x77 */ + 0xD535,0xD536,0xD537,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD538,0xD539,0xD53A,0xD53B,0xD53E,0xD53F,0xD541,/* 0x80-0x87 */ + 0xD542,0xD543,0xD545,0xD546,0xD547,0xD548,0xD549,0xD54A,/* 0x88-0x8F */ + 0xD54B,0xD54E,0xD550,0xD552,0xD553,0xD554,0xD555,0xD556,/* 0x90-0x97 */ + 0xD557,0xD55A,0xD55B,0xD55D,0xD55E,0xD55F,0xD561,0xD562,/* 0x98-0x9F */ + 0xD563,0xC6E9,0xC6EC,0xC6F0,0xC6F8,0xC6F9,0xC6FD,0xC704,/* 0xA0-0xA7 */ + 0xC705,0xC708,0xC70C,0xC714,0xC715,0xC717,0xC719,0xC720,/* 0xA8-0xAF */ + 0xC721,0xC724,0xC728,0xC730,0xC731,0xC733,0xC735,0xC737,/* 0xB0-0xB7 */ + 0xC73C,0xC73D,0xC740,0xC744,0xC74A,0xC74C,0xC74D,0xC74F,/* 0xB8-0xBF */ + 0xC751,0xC752,0xC753,0xC754,0xC755,0xC756,0xC757,0xC758,/* 0xC0-0xC7 */ + 0xC75C,0xC760,0xC768,0xC76B,0xC774,0xC775,0xC778,0xC77C,/* 0xC8-0xCF */ + 0xC77D,0xC77E,0xC783,0xC784,0xC785,0xC787,0xC788,0xC789,/* 0xD0-0xD7 */ + 0xC78A,0xC78E,0xC790,0xC791,0xC794,0xC796,0xC797,0xC798,/* 0xD8-0xDF */ + 0xC79A,0xC7A0,0xC7A1,0xC7A3,0xC7A4,0xC7A5,0xC7A6,0xC7AC,/* 0xE0-0xE7 */ + 0xC7AD,0xC7B0,0xC7B4,0xC7BC,0xC7BD,0xC7BF,0xC7C0,0xC7C1,/* 0xE8-0xEF */ + 0xC7C8,0xC7C9,0xC7CC,0xC7CE,0xC7D0,0xC7D8,0xC7DD,0xC7E4,/* 0xF0-0xF7 */ + 0xC7E8,0xC7EC,0xC800,0xC801,0xC804,0xC808,0xC80A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD564,0xD566,0xD567,0xD56A,0xD56C,0xD56E,0xD56F,/* 0x40-0x47 */ + 0xD570,0xD571,0xD572,0xD573,0xD576,0xD577,0xD579,0xD57A,/* 0x48-0x4F */ + 0xD57B,0xD57D,0xD57E,0xD57F,0xD580,0xD581,0xD582,0xD583,/* 0x50-0x57 */ + 0xD586,0xD58A,0xD58B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD58C,0xD58D,0xD58E,0xD58F,0xD591,0xD592,0xD593,/* 0x60-0x67 */ + 0xD594,0xD595,0xD596,0xD597,0xD598,0xD599,0xD59A,0xD59B,/* 0x68-0x6F */ + 0xD59C,0xD59D,0xD59E,0xD59F,0xD5A0,0xD5A1,0xD5A2,0xD5A3,/* 0x70-0x77 */ + 0xD5A4,0xD5A6,0xD5A7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD5A8,0xD5A9,0xD5AA,0xD5AB,0xD5AC,0xD5AD,0xD5AE,/* 0x80-0x87 */ + 0xD5AF,0xD5B0,0xD5B1,0xD5B2,0xD5B3,0xD5B4,0xD5B5,0xD5B6,/* 0x88-0x8F */ + 0xD5B7,0xD5B8,0xD5B9,0xD5BA,0xD5BB,0xD5BC,0xD5BD,0xD5BE,/* 0x90-0x97 */ + 0xD5BF,0xD5C0,0xD5C1,0xD5C2,0xD5C3,0xD5C4,0xD5C5,0xD5C6,/* 0x98-0x9F */ + 0xD5C7,0xC810,0xC811,0xC813,0xC815,0xC816,0xC81C,0xC81D,/* 0xA0-0xA7 */ + 0xC820,0xC824,0xC82C,0xC82D,0xC82F,0xC831,0xC838,0xC83C,/* 0xA8-0xAF */ + 0xC840,0xC848,0xC849,0xC84C,0xC84D,0xC854,0xC870,0xC871,/* 0xB0-0xB7 */ + 0xC874,0xC878,0xC87A,0xC880,0xC881,0xC883,0xC885,0xC886,/* 0xB8-0xBF */ + 0xC887,0xC88B,0xC88C,0xC88D,0xC894,0xC89D,0xC89F,0xC8A1,/* 0xC0-0xC7 */ + 0xC8A8,0xC8BC,0xC8BD,0xC8C4,0xC8C8,0xC8CC,0xC8D4,0xC8D5,/* 0xC8-0xCF */ + 0xC8D7,0xC8D9,0xC8E0,0xC8E1,0xC8E4,0xC8F5,0xC8FC,0xC8FD,/* 0xD0-0xD7 */ + 0xC900,0xC904,0xC905,0xC906,0xC90C,0xC90D,0xC90F,0xC911,/* 0xD8-0xDF */ + 0xC918,0xC92C,0xC934,0xC950,0xC951,0xC954,0xC958,0xC960,/* 0xE0-0xE7 */ + 0xC961,0xC963,0xC96C,0xC970,0xC974,0xC97C,0xC988,0xC989,/* 0xE8-0xEF */ + 0xC98C,0xC990,0xC998,0xC999,0xC99B,0xC99D,0xC9C0,0xC9C1,/* 0xF0-0xF7 */ + 0xC9C4,0xC9C7,0xC9C8,0xC9CA,0xC9D0,0xC9D1,0xC9D3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD5CA,0xD5CB,0xD5CD,0xD5CE,0xD5CF,0xD5D1,0xD5D3,/* 0x40-0x47 */ + 0xD5D4,0xD5D5,0xD5D6,0xD5D7,0xD5DA,0xD5DC,0xD5DE,0xD5DF,/* 0x48-0x4F */ + 0xD5E0,0xD5E1,0xD5E2,0xD5E3,0xD5E6,0xD5E7,0xD5E9,0xD5EA,/* 0x50-0x57 */ + 0xD5EB,0xD5ED,0xD5EE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD5EF,0xD5F0,0xD5F1,0xD5F2,0xD5F3,0xD5F6,0xD5F8,/* 0x60-0x67 */ + 0xD5FA,0xD5FB,0xD5FC,0xD5FD,0xD5FE,0xD5FF,0xD602,0xD603,/* 0x68-0x6F */ + 0xD605,0xD606,0xD607,0xD609,0xD60A,0xD60B,0xD60C,0xD60D,/* 0x70-0x77 */ + 0xD60E,0xD60F,0xD612,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD616,0xD617,0xD618,0xD619,0xD61A,0xD61B,0xD61D,/* 0x80-0x87 */ + 0xD61E,0xD61F,0xD621,0xD622,0xD623,0xD625,0xD626,0xD627,/* 0x88-0x8F */ + 0xD628,0xD629,0xD62A,0xD62B,0xD62C,0xD62E,0xD62F,0xD630,/* 0x90-0x97 */ + 0xD631,0xD632,0xD633,0xD634,0xD635,0xD636,0xD637,0xD63A,/* 0x98-0x9F */ + 0xD63B,0xC9D5,0xC9D6,0xC9D9,0xC9DA,0xC9DC,0xC9DD,0xC9E0,/* 0xA0-0xA7 */ + 0xC9E2,0xC9E4,0xC9E7,0xC9EC,0xC9ED,0xC9EF,0xC9F0,0xC9F1,/* 0xA8-0xAF */ + 0xC9F8,0xC9F9,0xC9FC,0xCA00,0xCA08,0xCA09,0xCA0B,0xCA0C,/* 0xB0-0xB7 */ + 0xCA0D,0xCA14,0xCA18,0xCA29,0xCA4C,0xCA4D,0xCA50,0xCA54,/* 0xB8-0xBF */ + 0xCA5C,0xCA5D,0xCA5F,0xCA60,0xCA61,0xCA68,0xCA7D,0xCA84,/* 0xC0-0xC7 */ + 0xCA98,0xCABC,0xCABD,0xCAC0,0xCAC4,0xCACC,0xCACD,0xCACF,/* 0xC8-0xCF */ + 0xCAD1,0xCAD3,0xCAD8,0xCAD9,0xCAE0,0xCAEC,0xCAF4,0xCB08,/* 0xD0-0xD7 */ + 0xCB10,0xCB14,0xCB18,0xCB20,0xCB21,0xCB41,0xCB48,0xCB49,/* 0xD8-0xDF */ + 0xCB4C,0xCB50,0xCB58,0xCB59,0xCB5D,0xCB64,0xCB78,0xCB79,/* 0xE0-0xE7 */ + 0xCB9C,0xCBB8,0xCBD4,0xCBE4,0xCBE7,0xCBE9,0xCC0C,0xCC0D,/* 0xE8-0xEF */ + 0xCC10,0xCC14,0xCC1C,0xCC1D,0xCC21,0xCC22,0xCC27,0xCC28,/* 0xF0-0xF7 */ + 0xCC29,0xCC2C,0xCC2E,0xCC30,0xCC38,0xCC39,0xCC3B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD63D,0xD63E,0xD63F,0xD641,0xD642,0xD643,0xD644,/* 0x40-0x47 */ + 0xD646,0xD647,0xD64A,0xD64C,0xD64E,0xD64F,0xD650,0xD652,/* 0x48-0x4F */ + 0xD653,0xD656,0xD657,0xD659,0xD65A,0xD65B,0xD65D,0xD65E,/* 0x50-0x57 */ + 0xD65F,0xD660,0xD661,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD662,0xD663,0xD664,0xD665,0xD666,0xD668,0xD66A,/* 0x60-0x67 */ + 0xD66B,0xD66C,0xD66D,0xD66E,0xD66F,0xD672,0xD673,0xD675,/* 0x68-0x6F */ + 0xD676,0xD677,0xD678,0xD679,0xD67A,0xD67B,0xD67C,0xD67D,/* 0x70-0x77 */ + 0xD67E,0xD67F,0xD680,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD681,0xD682,0xD684,0xD686,0xD687,0xD688,0xD689,/* 0x80-0x87 */ + 0xD68A,0xD68B,0xD68E,0xD68F,0xD691,0xD692,0xD693,0xD695,/* 0x88-0x8F */ + 0xD696,0xD697,0xD698,0xD699,0xD69A,0xD69B,0xD69C,0xD69E,/* 0x90-0x97 */ + 0xD6A0,0xD6A2,0xD6A3,0xD6A4,0xD6A5,0xD6A6,0xD6A7,0xD6A9,/* 0x98-0x9F */ + 0xD6AA,0xCC3C,0xCC3D,0xCC3E,0xCC44,0xCC45,0xCC48,0xCC4C,/* 0xA0-0xA7 */ + 0xCC54,0xCC55,0xCC57,0xCC58,0xCC59,0xCC60,0xCC64,0xCC66,/* 0xA8-0xAF */ + 0xCC68,0xCC70,0xCC75,0xCC98,0xCC99,0xCC9C,0xCCA0,0xCCA8,/* 0xB0-0xB7 */ + 0xCCA9,0xCCAB,0xCCAC,0xCCAD,0xCCB4,0xCCB5,0xCCB8,0xCCBC,/* 0xB8-0xBF */ + 0xCCC4,0xCCC5,0xCCC7,0xCCC9,0xCCD0,0xCCD4,0xCCE4,0xCCEC,/* 0xC0-0xC7 */ + 0xCCF0,0xCD01,0xCD08,0xCD09,0xCD0C,0xCD10,0xCD18,0xCD19,/* 0xC8-0xCF */ + 0xCD1B,0xCD1D,0xCD24,0xCD28,0xCD2C,0xCD39,0xCD5C,0xCD60,/* 0xD0-0xD7 */ + 0xCD64,0xCD6C,0xCD6D,0xCD6F,0xCD71,0xCD78,0xCD88,0xCD94,/* 0xD8-0xDF */ + 0xCD95,0xCD98,0xCD9C,0xCDA4,0xCDA5,0xCDA7,0xCDA9,0xCDB0,/* 0xE0-0xE7 */ + 0xCDC4,0xCDCC,0xCDD0,0xCDE8,0xCDEC,0xCDF0,0xCDF8,0xCDF9,/* 0xE8-0xEF */ + 0xCDFB,0xCDFD,0xCE04,0xCE08,0xCE0C,0xCE14,0xCE19,0xCE20,/* 0xF0-0xF7 */ + 0xCE21,0xCE24,0xCE28,0xCE30,0xCE31,0xCE33,0xCE35,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD6AB,0xD6AD,0xD6AE,0xD6AF,0xD6B1,0xD6B2,0xD6B3,/* 0x40-0x47 */ + 0xD6B4,0xD6B5,0xD6B6,0xD6B7,0xD6B8,0xD6BA,0xD6BC,0xD6BD,/* 0x48-0x4F */ + 0xD6BE,0xD6BF,0xD6C0,0xD6C1,0xD6C2,0xD6C3,0xD6C6,0xD6C7,/* 0x50-0x57 */ + 0xD6C9,0xD6CA,0xD6CB,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD6CD,0xD6CE,0xD6CF,0xD6D0,0xD6D2,0xD6D3,0xD6D5,/* 0x60-0x67 */ + 0xD6D6,0xD6D8,0xD6DA,0xD6DB,0xD6DC,0xD6DD,0xD6DE,0xD6DF,/* 0x68-0x6F */ + 0xD6E1,0xD6E2,0xD6E3,0xD6E5,0xD6E6,0xD6E7,0xD6E9,0xD6EA,/* 0x70-0x77 */ + 0xD6EB,0xD6EC,0xD6ED,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD6EE,0xD6EF,0xD6F1,0xD6F2,0xD6F3,0xD6F4,0xD6F6,/* 0x80-0x87 */ + 0xD6F7,0xD6F8,0xD6F9,0xD6FA,0xD6FB,0xD6FE,0xD6FF,0xD701,/* 0x88-0x8F */ + 0xD702,0xD703,0xD705,0xD706,0xD707,0xD708,0xD709,0xD70A,/* 0x90-0x97 */ + 0xD70B,0xD70C,0xD70D,0xD70E,0xD70F,0xD710,0xD712,0xD713,/* 0x98-0x9F */ + 0xD714,0xCE58,0xCE59,0xCE5C,0xCE5F,0xCE60,0xCE61,0xCE68,/* 0xA0-0xA7 */ + 0xCE69,0xCE6B,0xCE6D,0xCE74,0xCE75,0xCE78,0xCE7C,0xCE84,/* 0xA8-0xAF */ + 0xCE85,0xCE87,0xCE89,0xCE90,0xCE91,0xCE94,0xCE98,0xCEA0,/* 0xB0-0xB7 */ + 0xCEA1,0xCEA3,0xCEA4,0xCEA5,0xCEAC,0xCEAD,0xCEC1,0xCEE4,/* 0xB8-0xBF */ + 0xCEE5,0xCEE8,0xCEEB,0xCEEC,0xCEF4,0xCEF5,0xCEF7,0xCEF8,/* 0xC0-0xC7 */ + 0xCEF9,0xCF00,0xCF01,0xCF04,0xCF08,0xCF10,0xCF11,0xCF13,/* 0xC8-0xCF */ + 0xCF15,0xCF1C,0xCF20,0xCF24,0xCF2C,0xCF2D,0xCF2F,0xCF30,/* 0xD0-0xD7 */ + 0xCF31,0xCF38,0xCF54,0xCF55,0xCF58,0xCF5C,0xCF64,0xCF65,/* 0xD8-0xDF */ + 0xCF67,0xCF69,0xCF70,0xCF71,0xCF74,0xCF78,0xCF80,0xCF85,/* 0xE0-0xE7 */ + 0xCF8C,0xCFA1,0xCFA8,0xCFB0,0xCFC4,0xCFE0,0xCFE1,0xCFE4,/* 0xE8-0xEF */ + 0xCFE8,0xCFF0,0xCFF1,0xCFF3,0xCFF5,0xCFFC,0xD000,0xD004,/* 0xF0-0xF7 */ + 0xD011,0xD018,0xD02D,0xD034,0xD035,0xD038,0xD03C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD715,0xD716,0xD717,0xD71A,0xD71B,0xD71D,0xD71E,/* 0x40-0x47 */ + 0xD71F,0xD721,0xD722,0xD723,0xD724,0xD725,0xD726,0xD727,/* 0x48-0x4F */ + 0xD72A,0xD72C,0xD72E,0xD72F,0xD730,0xD731,0xD732,0xD733,/* 0x50-0x57 */ + 0xD736,0xD737,0xD739,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD73A,0xD73B,0xD73D,0xD73E,0xD73F,0xD740,0xD741,/* 0x60-0x67 */ + 0xD742,0xD743,0xD745,0xD746,0xD748,0xD74A,0xD74B,0xD74C,/* 0x68-0x6F */ + 0xD74D,0xD74E,0xD74F,0xD752,0xD753,0xD755,0xD75A,0xD75B,/* 0x70-0x77 */ + 0xD75C,0xD75D,0xD75E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD75F,0xD762,0xD764,0xD766,0xD767,0xD768,0xD76A,/* 0x80-0x87 */ + 0xD76B,0xD76D,0xD76E,0xD76F,0xD771,0xD772,0xD773,0xD775,/* 0x88-0x8F */ + 0xD776,0xD777,0xD778,0xD779,0xD77A,0xD77B,0xD77E,0xD77F,/* 0x90-0x97 */ + 0xD780,0xD782,0xD783,0xD784,0xD785,0xD786,0xD787,0xD78A,/* 0x98-0x9F */ + 0xD78B,0xD044,0xD045,0xD047,0xD049,0xD050,0xD054,0xD058,/* 0xA0-0xA7 */ + 0xD060,0xD06C,0xD06D,0xD070,0xD074,0xD07C,0xD07D,0xD081,/* 0xA8-0xAF */ + 0xD0A4,0xD0A5,0xD0A8,0xD0AC,0xD0B4,0xD0B5,0xD0B7,0xD0B9,/* 0xB0-0xB7 */ + 0xD0C0,0xD0C1,0xD0C4,0xD0C8,0xD0C9,0xD0D0,0xD0D1,0xD0D3,/* 0xB8-0xBF */ + 0xD0D4,0xD0D5,0xD0DC,0xD0DD,0xD0E0,0xD0E4,0xD0EC,0xD0ED,/* 0xC0-0xC7 */ + 0xD0EF,0xD0F0,0xD0F1,0xD0F8,0xD10D,0xD130,0xD131,0xD134,/* 0xC8-0xCF */ + 0xD138,0xD13A,0xD140,0xD141,0xD143,0xD144,0xD145,0xD14C,/* 0xD0-0xD7 */ + 0xD14D,0xD150,0xD154,0xD15C,0xD15D,0xD15F,0xD161,0xD168,/* 0xD8-0xDF */ + 0xD16C,0xD17C,0xD184,0xD188,0xD1A0,0xD1A1,0xD1A4,0xD1A8,/* 0xE0-0xE7 */ + 0xD1B0,0xD1B1,0xD1B3,0xD1B5,0xD1BA,0xD1BC,0xD1C0,0xD1D8,/* 0xE8-0xEF */ + 0xD1F4,0xD1F8,0xD207,0xD209,0xD210,0xD22C,0xD22D,0xD230,/* 0xF0-0xF7 */ + 0xD234,0xD23C,0xD23D,0xD23F,0xD241,0xD248,0xD25C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD78D,0xD78E,0xD78F,0xD791,0xD792,0xD793,0xD794,/* 0x40-0x47 */ + 0xD795,0xD796,0xD797,0xD79A,0xD79C,0xD79E,0xD79F,0xD7A0,/* 0x48-0x4F */ + 0xD7A1,0xD7A2,0xD7A3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD264,0xD280,0xD281,0xD284,0xD288,0xD290,0xD291,/* 0xA0-0xA7 */ + 0xD295,0xD29C,0xD2A0,0xD2A4,0xD2AC,0xD2B1,0xD2B8,0xD2B9,/* 0xA8-0xAF */ + 0xD2BC,0xD2BF,0xD2C0,0xD2C2,0xD2C8,0xD2C9,0xD2CB,0xD2D4,/* 0xB0-0xB7 */ + 0xD2D8,0xD2DC,0xD2E4,0xD2E5,0xD2F0,0xD2F1,0xD2F4,0xD2F8,/* 0xB8-0xBF */ + 0xD300,0xD301,0xD303,0xD305,0xD30C,0xD30D,0xD30E,0xD310,/* 0xC0-0xC7 */ + 0xD314,0xD316,0xD31C,0xD31D,0xD31F,0xD320,0xD321,0xD325,/* 0xC8-0xCF */ + 0xD328,0xD329,0xD32C,0xD330,0xD338,0xD339,0xD33B,0xD33C,/* 0xD0-0xD7 */ + 0xD33D,0xD344,0xD345,0xD37C,0xD37D,0xD380,0xD384,0xD38C,/* 0xD8-0xDF */ + 0xD38D,0xD38F,0xD390,0xD391,0xD398,0xD399,0xD39C,0xD3A0,/* 0xE0-0xE7 */ + 0xD3A8,0xD3A9,0xD3AB,0xD3AD,0xD3B4,0xD3B8,0xD3BC,0xD3C4,/* 0xE8-0xEF */ + 0xD3C5,0xD3C8,0xD3C9,0xD3D0,0xD3D8,0xD3E1,0xD3E3,0xD3EC,/* 0xF0-0xF7 */ + 0xD3ED,0xD3F0,0xD3F4,0xD3FC,0xD3FD,0xD3FF,0xD401,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD408,0xD41D,0xD440,0xD444,0xD45C,0xD460,0xD464,/* 0xA0-0xA7 */ + 0xD46D,0xD46F,0xD478,0xD479,0xD47C,0xD47F,0xD480,0xD482,/* 0xA8-0xAF */ + 0xD488,0xD489,0xD48B,0xD48D,0xD494,0xD4A9,0xD4CC,0xD4D0,/* 0xB0-0xB7 */ + 0xD4D4,0xD4DC,0xD4DF,0xD4E8,0xD4EC,0xD4F0,0xD4F8,0xD4FB,/* 0xB8-0xBF */ + 0xD4FD,0xD504,0xD508,0xD50C,0xD514,0xD515,0xD517,0xD53C,/* 0xC0-0xC7 */ + 0xD53D,0xD540,0xD544,0xD54C,0xD54D,0xD54F,0xD551,0xD558,/* 0xC8-0xCF */ + 0xD559,0xD55C,0xD560,0xD565,0xD568,0xD569,0xD56B,0xD56D,/* 0xD0-0xD7 */ + 0xD574,0xD575,0xD578,0xD57C,0xD584,0xD585,0xD587,0xD588,/* 0xD8-0xDF */ + 0xD589,0xD590,0xD5A5,0xD5C8,0xD5C9,0xD5CC,0xD5D0,0xD5D2,/* 0xE0-0xE7 */ + 0xD5D8,0xD5D9,0xD5DB,0xD5DD,0xD5E4,0xD5E5,0xD5E8,0xD5EC,/* 0xE8-0xEF */ + 0xD5F4,0xD5F5,0xD5F7,0xD5F9,0xD600,0xD601,0xD604,0xD608,/* 0xF0-0xF7 */ + 0xD610,0xD611,0xD613,0xD614,0xD615,0xD61C,0xD620,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD624,0xD62D,0xD638,0xD639,0xD63C,0xD640,0xD645,/* 0xA0-0xA7 */ + 0xD648,0xD649,0xD64B,0xD64D,0xD651,0xD654,0xD655,0xD658,/* 0xA8-0xAF */ + 0xD65C,0xD667,0xD669,0xD670,0xD671,0xD674,0xD683,0xD685,/* 0xB0-0xB7 */ + 0xD68C,0xD68D,0xD690,0xD694,0xD69D,0xD69F,0xD6A1,0xD6A8,/* 0xB8-0xBF */ + 0xD6AC,0xD6B0,0xD6B9,0xD6BB,0xD6C4,0xD6C5,0xD6C8,0xD6CC,/* 0xC0-0xC7 */ + 0xD6D1,0xD6D4,0xD6D7,0xD6D9,0xD6E0,0xD6E4,0xD6E8,0xD6F0,/* 0xC8-0xCF */ + 0xD6F5,0xD6FC,0xD6FD,0xD700,0xD704,0xD711,0xD718,0xD719,/* 0xD0-0xD7 */ + 0xD71C,0xD720,0xD728,0xD729,0xD72B,0xD72D,0xD734,0xD735,/* 0xD8-0xDF */ + 0xD738,0xD73C,0xD744,0xD747,0xD749,0xD750,0xD751,0xD754,/* 0xE0-0xE7 */ + 0xD756,0xD757,0xD758,0xD759,0xD760,0xD761,0xD763,0xD765,/* 0xE8-0xEF */ + 0xD769,0xD76C,0xD770,0xD774,0xD77C,0xD77D,0xD781,0xD788,/* 0xF0-0xF7 */ + 0xD789,0xD78C,0xD790,0xD798,0xD799,0xD79B,0xD79D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4F3D,0x4F73,0x5047,0x50F9,0x52A0,0x53EF,0x5475,/* 0xA0-0xA7 */ + 0x54E5,0x5609,0x5AC1,0x5BB6,0x6687,0x67B6,0x67B7,0x67EF,/* 0xA8-0xAF */ + 0x6B4C,0x73C2,0x75C2,0x7A3C,0x82DB,0x8304,0x8857,0x8888,/* 0xB0-0xB7 */ + 0x8A36,0x8CC8,0x8DCF,0x8EFB,0x8FE6,0x99D5,0x523B,0x5374,/* 0xB8-0xBF */ + 0x5404,0x606A,0x6164,0x6BBC,0x73CF,0x811A,0x89BA,0x89D2,/* 0xC0-0xC7 */ + 0x95A3,0x4F83,0x520A,0x58BE,0x5978,0x59E6,0x5E72,0x5E79,/* 0xC8-0xCF */ + 0x61C7,0x63C0,0x6746,0x67EC,0x687F,0x6F97,0x764E,0x770B,/* 0xD0-0xD7 */ + 0x78F5,0x7A08,0x7AFF,0x7C21,0x809D,0x826E,0x8271,0x8AEB,/* 0xD8-0xDF */ + 0x9593,0x4E6B,0x559D,0x66F7,0x6E34,0x78A3,0x7AED,0x845B,/* 0xE0-0xE7 */ + 0x8910,0x874E,0x97A8,0x52D8,0x574E,0x582A,0x5D4C,0x611F,/* 0xE8-0xEF */ + 0x61BE,0x6221,0x6562,0x67D1,0x6A44,0x6E1B,0x7518,0x75B3,/* 0xF0-0xF7 */ + 0x76E3,0x77B0,0x7D3A,0x90AF,0x9451,0x9452,0x9F95,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5323,0x5CAC,0x7532,0x80DB,0x9240,0x9598,0x525B,/* 0xA0-0xA7 */ + 0x5808,0x59DC,0x5CA1,0x5D17,0x5EB7,0x5F3A,0x5F4A,0x6177,/* 0xA8-0xAF */ + 0x6C5F,0x757A,0x7586,0x7CE0,0x7D73,0x7DB1,0x7F8C,0x8154,/* 0xB0-0xB7 */ + 0x8221,0x8591,0x8941,0x8B1B,0x92FC,0x964D,0x9C47,0x4ECB,/* 0xB8-0xBF */ + 0x4EF7,0x500B,0x51F1,0x584F,0x6137,0x613E,0x6168,0x6539,/* 0xC0-0xC7 */ + 0x69EA,0x6F11,0x75A5,0x7686,0x76D6,0x7B87,0x82A5,0x84CB,/* 0xC8-0xCF */ + 0xF900,0x93A7,0x958B,0x5580,0x5BA2,0x5751,0xF901,0x7CB3,/* 0xD0-0xD7 */ + 0x7FB9,0x91B5,0x5028,0x53BB,0x5C45,0x5DE8,0x62D2,0x636E,/* 0xD8-0xDF */ + 0x64DA,0x64E7,0x6E20,0x70AC,0x795B,0x8DDD,0x8E1E,0xF902,/* 0xE0-0xE7 */ + 0x907D,0x9245,0x92F8,0x4E7E,0x4EF6,0x5065,0x5DFE,0x5EFA,/* 0xE8-0xEF */ + 0x6106,0x6957,0x8171,0x8654,0x8E47,0x9375,0x9A2B,0x4E5E,/* 0xF0-0xF7 */ + 0x5091,0x6770,0x6840,0x5109,0x528D,0x5292,0x6AA2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x77BC,0x9210,0x9ED4,0x52AB,0x602F,0x8FF2,0x5048,/* 0xA0-0xA7 */ + 0x61A9,0x63ED,0x64CA,0x683C,0x6A84,0x6FC0,0x8188,0x89A1,/* 0xA8-0xAF */ + 0x9694,0x5805,0x727D,0x72AC,0x7504,0x7D79,0x7E6D,0x80A9,/* 0xB0-0xB7 */ + 0x898B,0x8B74,0x9063,0x9D51,0x6289,0x6C7A,0x6F54,0x7D50,/* 0xB8-0xBF */ + 0x7F3A,0x8A23,0x517C,0x614A,0x7B9D,0x8B19,0x9257,0x938C,/* 0xC0-0xC7 */ + 0x4EAC,0x4FD3,0x501E,0x50BE,0x5106,0x52C1,0x52CD,0x537F,/* 0xC8-0xCF */ + 0x5770,0x5883,0x5E9A,0x5F91,0x6176,0x61AC,0x64CE,0x656C,/* 0xD0-0xD7 */ + 0x666F,0x66BB,0x66F4,0x6897,0x6D87,0x7085,0x70F1,0x749F,/* 0xD8-0xDF */ + 0x74A5,0x74CA,0x75D9,0x786C,0x78EC,0x7ADF,0x7AF6,0x7D45,/* 0xE0-0xE7 */ + 0x7D93,0x8015,0x803F,0x811B,0x8396,0x8B66,0x8F15,0x9015,/* 0xE8-0xEF */ + 0x93E1,0x9803,0x9838,0x9A5A,0x9BE8,0x4FC2,0x5553,0x583A,/* 0xF0-0xF7 */ + 0x5951,0x5B63,0x5C46,0x60B8,0x6212,0x6842,0x68B0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x68E8,0x6EAA,0x754C,0x7678,0x78CE,0x7A3D,0x7CFB,/* 0xA0-0xA7 */ + 0x7E6B,0x7E7C,0x8A08,0x8AA1,0x8C3F,0x968E,0x9DC4,0x53E4,/* 0xA8-0xAF */ + 0x53E9,0x544A,0x5471,0x56FA,0x59D1,0x5B64,0x5C3B,0x5EAB,/* 0xB0-0xB7 */ + 0x62F7,0x6537,0x6545,0x6572,0x66A0,0x67AF,0x69C1,0x6CBD,/* 0xB8-0xBF */ + 0x75FC,0x7690,0x777E,0x7A3F,0x7F94,0x8003,0x80A1,0x818F,/* 0xC0-0xC7 */ + 0x82E6,0x82FD,0x83F0,0x85C1,0x8831,0x88B4,0x8AA5,0xF903,/* 0xC8-0xCF */ + 0x8F9C,0x932E,0x96C7,0x9867,0x9AD8,0x9F13,0x54ED,0x659B,/* 0xD0-0xD7 */ + 0x66F2,0x688F,0x7A40,0x8C37,0x9D60,0x56F0,0x5764,0x5D11,/* 0xD8-0xDF */ + 0x6606,0x68B1,0x68CD,0x6EFE,0x7428,0x889E,0x9BE4,0x6C68,/* 0xE0-0xE7 */ + 0xF904,0x9AA8,0x4F9B,0x516C,0x5171,0x529F,0x5B54,0x5DE5,/* 0xE8-0xEF */ + 0x6050,0x606D,0x62F1,0x63A7,0x653B,0x73D9,0x7A7A,0x86A3,/* 0xF0-0xF7 */ + 0x8CA2,0x978F,0x4E32,0x5BE1,0x6208,0x679C,0x74DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79D1,0x83D3,0x8A87,0x8AB2,0x8DE8,0x904E,0x934B,/* 0xA0-0xA7 */ + 0x9846,0x5ED3,0x69E8,0x85FF,0x90ED,0xF905,0x51A0,0x5B98,/* 0xA8-0xAF */ + 0x5BEC,0x6163,0x68FA,0x6B3E,0x704C,0x742F,0x74D8,0x7BA1,/* 0xB0-0xB7 */ + 0x7F50,0x83C5,0x89C0,0x8CAB,0x95DC,0x9928,0x522E,0x605D,/* 0xB8-0xBF */ + 0x62EC,0x9002,0x4F8A,0x5149,0x5321,0x58D9,0x5EE3,0x66E0,/* 0xC0-0xC7 */ + 0x6D38,0x709A,0x72C2,0x73D6,0x7B50,0x80F1,0x945B,0x5366,/* 0xC8-0xCF */ + 0x639B,0x7F6B,0x4E56,0x5080,0x584A,0x58DE,0x602A,0x6127,/* 0xD0-0xD7 */ + 0x62D0,0x69D0,0x9B41,0x5B8F,0x7D18,0x80B1,0x8F5F,0x4EA4,/* 0xD8-0xDF */ + 0x50D1,0x54AC,0x55AC,0x5B0C,0x5DA0,0x5DE7,0x652A,0x654E,/* 0xE0-0xE7 */ + 0x6821,0x6A4B,0x72E1,0x768E,0x77EF,0x7D5E,0x7FF9,0x81A0,/* 0xE8-0xEF */ + 0x854E,0x86DF,0x8F03,0x8F4E,0x90CA,0x9903,0x9A55,0x9BAB,/* 0xF0-0xF7 */ + 0x4E18,0x4E45,0x4E5D,0x4EC7,0x4FF1,0x5177,0x52FE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5340,0x53E3,0x53E5,0x548E,0x5614,0x5775,0x57A2,/* 0xA0-0xA7 */ + 0x5BC7,0x5D87,0x5ED0,0x61FC,0x62D8,0x6551,0x67B8,0x67E9,/* 0xA8-0xAF */ + 0x69CB,0x6B50,0x6BC6,0x6BEC,0x6C42,0x6E9D,0x7078,0x72D7,/* 0xB0-0xB7 */ + 0x7396,0x7403,0x77BF,0x77E9,0x7A76,0x7D7F,0x8009,0x81FC,/* 0xB8-0xBF */ + 0x8205,0x820A,0x82DF,0x8862,0x8B33,0x8CFC,0x8EC0,0x9011,/* 0xC0-0xC7 */ + 0x90B1,0x9264,0x92B6,0x99D2,0x9A45,0x9CE9,0x9DD7,0x9F9C,/* 0xC8-0xCF */ + 0x570B,0x5C40,0x83CA,0x97A0,0x97AB,0x9EB4,0x541B,0x7A98,/* 0xD0-0xD7 */ + 0x7FA4,0x88D9,0x8ECD,0x90E1,0x5800,0x5C48,0x6398,0x7A9F,/* 0xD8-0xDF */ + 0x5BAE,0x5F13,0x7A79,0x7AAE,0x828E,0x8EAC,0x5026,0x5238,/* 0xE0-0xE7 */ + 0x52F8,0x5377,0x5708,0x62F3,0x6372,0x6B0A,0x6DC3,0x7737,/* 0xE8-0xEF */ + 0x53A5,0x7357,0x8568,0x8E76,0x95D5,0x673A,0x6AC3,0x6F70,/* 0xF0-0xF7 */ + 0x8A6D,0x8ECC,0x994B,0xF906,0x6677,0x6B78,0x8CB4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9B3C,0xF907,0x53EB,0x572D,0x594E,0x63C6,0x69FB,/* 0xA0-0xA7 */ + 0x73EA,0x7845,0x7ABA,0x7AC5,0x7CFE,0x8475,0x898F,0x8D73,/* 0xA8-0xAF */ + 0x9035,0x95A8,0x52FB,0x5747,0x7547,0x7B60,0x83CC,0x921E,/* 0xB0-0xB7 */ + 0xF908,0x6A58,0x514B,0x524B,0x5287,0x621F,0x68D8,0x6975,/* 0xB8-0xBF */ + 0x9699,0x50C5,0x52A4,0x52E4,0x61C3,0x65A4,0x6839,0x69FF,/* 0xC0-0xC7 */ + 0x747E,0x7B4B,0x82B9,0x83EB,0x89B2,0x8B39,0x8FD1,0x9949,/* 0xC8-0xCF */ + 0xF909,0x4ECA,0x5997,0x64D2,0x6611,0x6A8E,0x7434,0x7981,/* 0xD0-0xD7 */ + 0x79BD,0x82A9,0x887E,0x887F,0x895F,0xF90A,0x9326,0x4F0B,/* 0xD8-0xDF */ + 0x53CA,0x6025,0x6271,0x6C72,0x7D1A,0x7D66,0x4E98,0x5162,/* 0xE0-0xE7 */ + 0x77DC,0x80AF,0x4F01,0x4F0E,0x5176,0x5180,0x55DC,0x5668,/* 0xE8-0xEF */ + 0x573B,0x57FA,0x57FC,0x5914,0x5947,0x5993,0x5BC4,0x5C90,/* 0xF0-0xF7 */ + 0x5D0E,0x5DF1,0x5E7E,0x5FCC,0x6280,0x65D7,0x65E3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x671E,0x671F,0x675E,0x68CB,0x68C4,0x6A5F,0x6B3A,/* 0xA0-0xA7 */ + 0x6C23,0x6C7D,0x6C82,0x6DC7,0x7398,0x7426,0x742A,0x7482,/* 0xA8-0xAF */ + 0x74A3,0x7578,0x757F,0x7881,0x78EF,0x7941,0x7947,0x7948,/* 0xB0-0xB7 */ + 0x797A,0x7B95,0x7D00,0x7DBA,0x7F88,0x8006,0x802D,0x808C,/* 0xB8-0xBF */ + 0x8A18,0x8B4F,0x8C48,0x8D77,0x9321,0x9324,0x98E2,0x9951,/* 0xC0-0xC7 */ + 0x9A0E,0x9A0F,0x9A65,0x9E92,0x7DCA,0x4F76,0x5409,0x62EE,/* 0xC8-0xCF */ + 0x6854,0x91D1,0x55AB,0x513A,0xF90B,0xF90C,0x5A1C,0x61E6,/* 0xD0-0xD7 */ + 0xF90D,0x62CF,0x62FF,0xF90E,0xF90F,0xF910,0xF911,0xF912,/* 0xD8-0xDF */ + 0xF913,0x90A3,0xF914,0xF915,0xF916,0xF917,0xF918,0x8AFE,/* 0xE0-0xE7 */ + 0xF919,0xF91A,0xF91B,0xF91C,0x6696,0xF91D,0x7156,0xF91E,/* 0xE8-0xEF */ + 0xF91F,0x96E3,0xF920,0x634F,0x637A,0x5357,0xF921,0x678F,/* 0xF0-0xF7 */ + 0x6960,0x6E73,0xF922,0x7537,0xF923,0xF924,0xF925,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7D0D,0xF926,0xF927,0x8872,0x56CA,0x5A18,0xF928,/* 0xA0-0xA7 */ + 0xF929,0xF92A,0xF92B,0xF92C,0x4E43,0xF92D,0x5167,0x5948,/* 0xA8-0xAF */ + 0x67F0,0x8010,0xF92E,0x5973,0x5E74,0x649A,0x79CA,0x5FF5,/* 0xB0-0xB7 */ + 0x606C,0x62C8,0x637B,0x5BE7,0x5BD7,0x52AA,0xF92F,0x5974,/* 0xB8-0xBF */ + 0x5F29,0x6012,0xF930,0xF931,0xF932,0x7459,0xF933,0xF934,/* 0xC0-0xC7 */ + 0xF935,0xF936,0xF937,0xF938,0x99D1,0xF939,0xF93A,0xF93B,/* 0xC8-0xCF */ + 0xF93C,0xF93D,0xF93E,0xF93F,0xF940,0xF941,0xF942,0xF943,/* 0xD0-0xD7 */ + 0x6FC3,0xF944,0xF945,0x81BF,0x8FB2,0x60F1,0xF946,0xF947,/* 0xD8-0xDF */ + 0x8166,0xF948,0xF949,0x5C3F,0xF94A,0xF94B,0xF94C,0xF94D,/* 0xE0-0xE7 */ + 0xF94E,0xF94F,0xF950,0xF951,0x5AE9,0x8A25,0x677B,0x7D10,/* 0xE8-0xEF */ + 0xF952,0xF953,0xF954,0xF955,0xF956,0xF957,0x80FD,0xF958,/* 0xF0-0xF7 */ + 0xF959,0x5C3C,0x6CE5,0x533F,0x6EBA,0x591A,0x8336,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4E39,0x4EB6,0x4F46,0x55AE,0x5718,0x58C7,0x5F56,/* 0xA0-0xA7 */ + 0x65B7,0x65E6,0x6A80,0x6BB5,0x6E4D,0x77ED,0x7AEF,0x7C1E,/* 0xA8-0xAF */ + 0x7DDE,0x86CB,0x8892,0x9132,0x935B,0x64BB,0x6FBE,0x737A,/* 0xB0-0xB7 */ + 0x75B8,0x9054,0x5556,0x574D,0x61BA,0x64D4,0x66C7,0x6DE1,/* 0xB8-0xBF */ + 0x6E5B,0x6F6D,0x6FB9,0x75F0,0x8043,0x81BD,0x8541,0x8983,/* 0xC0-0xC7 */ + 0x8AC7,0x8B5A,0x931F,0x6C93,0x7553,0x7B54,0x8E0F,0x905D,/* 0xC8-0xCF */ + 0x5510,0x5802,0x5858,0x5E62,0x6207,0x649E,0x68E0,0x7576,/* 0xD0-0xD7 */ + 0x7CD6,0x87B3,0x9EE8,0x4EE3,0x5788,0x576E,0x5927,0x5C0D,/* 0xD8-0xDF */ + 0x5CB1,0x5E36,0x5F85,0x6234,0x64E1,0x73B3,0x81FA,0x888B,/* 0xE0-0xE7 */ + 0x8CB8,0x968A,0x9EDB,0x5B85,0x5FB7,0x60B3,0x5012,0x5200,/* 0xE8-0xEF */ + 0x5230,0x5716,0x5835,0x5857,0x5C0E,0x5C60,0x5CF6,0x5D8B,/* 0xF0-0xF7 */ + 0x5EA6,0x5F92,0x60BC,0x6311,0x6389,0x6417,0x6843,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x68F9,0x6AC2,0x6DD8,0x6E21,0x6ED4,0x6FE4,0x71FE,/* 0xA0-0xA7 */ + 0x76DC,0x7779,0x79B1,0x7A3B,0x8404,0x89A9,0x8CED,0x8DF3,/* 0xA8-0xAF */ + 0x8E48,0x9003,0x9014,0x9053,0x90FD,0x934D,0x9676,0x97DC,/* 0xB0-0xB7 */ + 0x6BD2,0x7006,0x7258,0x72A2,0x7368,0x7763,0x79BF,0x7BE4,/* 0xB8-0xBF */ + 0x7E9B,0x8B80,0x58A9,0x60C7,0x6566,0x65FD,0x66BE,0x6C8C,/* 0xC0-0xC7 */ + 0x711E,0x71C9,0x8C5A,0x9813,0x4E6D,0x7A81,0x4EDD,0x51AC,/* 0xC8-0xCF */ + 0x51CD,0x52D5,0x540C,0x61A7,0x6771,0x6850,0x68DF,0x6D1E,/* 0xD0-0xD7 */ + 0x6F7C,0x75BC,0x77B3,0x7AE5,0x80F4,0x8463,0x9285,0x515C,/* 0xD8-0xDF */ + 0x6597,0x675C,0x6793,0x75D8,0x7AC7,0x8373,0xF95A,0x8C46,/* 0xE0-0xE7 */ + 0x9017,0x982D,0x5C6F,0x81C0,0x829A,0x9041,0x906F,0x920D,/* 0xE8-0xEF */ + 0x5F97,0x5D9D,0x6A59,0x71C8,0x767B,0x7B49,0x85E4,0x8B04,/* 0xF0-0xF7 */ + 0x9127,0x9A30,0x5587,0x61F6,0xF95B,0x7669,0x7F85,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x863F,0x87BA,0x88F8,0x908F,0xF95C,0x6D1B,0x70D9,/* 0xA0-0xA7 */ + 0x73DE,0x7D61,0x843D,0xF95D,0x916A,0x99F1,0xF95E,0x4E82,/* 0xA8-0xAF */ + 0x5375,0x6B04,0x6B12,0x703E,0x721B,0x862D,0x9E1E,0x524C,/* 0xB0-0xB7 */ + 0x8FA3,0x5D50,0x64E5,0x652C,0x6B16,0x6FEB,0x7C43,0x7E9C,/* 0xB8-0xBF */ + 0x85CD,0x8964,0x89BD,0x62C9,0x81D8,0x881F,0x5ECA,0x6717,/* 0xC0-0xC7 */ + 0x6D6A,0x72FC,0x7405,0x746F,0x8782,0x90DE,0x4F86,0x5D0D,/* 0xC8-0xCF */ + 0x5FA0,0x840A,0x51B7,0x63A0,0x7565,0x4EAE,0x5006,0x5169,/* 0xD0-0xD7 */ + 0x51C9,0x6881,0x6A11,0x7CAE,0x7CB1,0x7CE7,0x826F,0x8AD2,/* 0xD8-0xDF */ + 0x8F1B,0x91CF,0x4FB6,0x5137,0x52F5,0x5442,0x5EEC,0x616E,/* 0xE0-0xE7 */ + 0x623E,0x65C5,0x6ADA,0x6FFE,0x792A,0x85DC,0x8823,0x95AD,/* 0xE8-0xEF */ + 0x9A62,0x9A6A,0x9E97,0x9ECE,0x529B,0x66C6,0x6B77,0x701D,/* 0xF0-0xF7 */ + 0x792B,0x8F62,0x9742,0x6190,0x6200,0x6523,0x6F23,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7149,0x7489,0x7DF4,0x806F,0x84EE,0x8F26,0x9023,/* 0xA0-0xA7 */ + 0x934A,0x51BD,0x5217,0x52A3,0x6D0C,0x70C8,0x88C2,0x5EC9,/* 0xA8-0xAF */ + 0x6582,0x6BAE,0x6FC2,0x7C3E,0x7375,0x4EE4,0x4F36,0x56F9,/* 0xB0-0xB7 */ + 0xF95F,0x5CBA,0x5DBA,0x601C,0x73B2,0x7B2D,0x7F9A,0x7FCE,/* 0xB8-0xBF */ + 0x8046,0x901E,0x9234,0x96F6,0x9748,0x9818,0x9F61,0x4F8B,/* 0xC0-0xC7 */ + 0x6FA7,0x79AE,0x91B4,0x96B7,0x52DE,0xF960,0x6488,0x64C4,/* 0xC8-0xCF */ + 0x6AD3,0x6F5E,0x7018,0x7210,0x76E7,0x8001,0x8606,0x865C,/* 0xD0-0xD7 */ + 0x8DEF,0x8F05,0x9732,0x9B6F,0x9DFA,0x9E75,0x788C,0x797F,/* 0xD8-0xDF */ + 0x7DA0,0x83C9,0x9304,0x9E7F,0x9E93,0x8AD6,0x58DF,0x5F04,/* 0xE0-0xE7 */ + 0x6727,0x7027,0x74CF,0x7C60,0x807E,0x5121,0x7028,0x7262,/* 0xE8-0xEF */ + 0x78CA,0x8CC2,0x8CDA,0x8CF4,0x96F7,0x4E86,0x50DA,0x5BEE,/* 0xF0-0xF7 */ + 0x5ED6,0x6599,0x71CE,0x7642,0x77AD,0x804A,0x84FC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x907C,0x9B27,0x9F8D,0x58D8,0x5A41,0x5C62,0x6A13,/* 0xA0-0xA7 */ + 0x6DDA,0x6F0F,0x763B,0x7D2F,0x7E37,0x851E,0x8938,0x93E4,/* 0xA8-0xAF */ + 0x964B,0x5289,0x65D2,0x67F3,0x69B4,0x6D41,0x6E9C,0x700F,/* 0xB0-0xB7 */ + 0x7409,0x7460,0x7559,0x7624,0x786B,0x8B2C,0x985E,0x516D,/* 0xB8-0xBF */ + 0x622E,0x9678,0x4F96,0x502B,0x5D19,0x6DEA,0x7DB8,0x8F2A,/* 0xC0-0xC7 */ + 0x5F8B,0x6144,0x6817,0xF961,0x9686,0x52D2,0x808B,0x51DC,/* 0xC8-0xCF */ + 0x51CC,0x695E,0x7A1C,0x7DBE,0x83F1,0x9675,0x4FDA,0x5229,/* 0xD0-0xD7 */ + 0x5398,0x540F,0x550E,0x5C65,0x60A7,0x674E,0x68A8,0x6D6C,/* 0xD8-0xDF */ + 0x7281,0x72F8,0x7406,0x7483,0xF962,0x75E2,0x7C6C,0x7F79,/* 0xE0-0xE7 */ + 0x7FB8,0x8389,0x88CF,0x88E1,0x91CC,0x91D0,0x96E2,0x9BC9,/* 0xE8-0xEF */ + 0x541D,0x6F7E,0x71D0,0x7498,0x85FA,0x8EAA,0x96A3,0x9C57,/* 0xF0-0xF7 */ + 0x9E9F,0x6797,0x6DCB,0x7433,0x81E8,0x9716,0x782C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7ACB,0x7B20,0x7C92,0x6469,0x746A,0x75F2,0x78BC,/* 0xA0-0xA7 */ + 0x78E8,0x99AC,0x9B54,0x9EBB,0x5BDE,0x5E55,0x6F20,0x819C,/* 0xA8-0xAF */ + 0x83AB,0x9088,0x4E07,0x534D,0x5A29,0x5DD2,0x5F4E,0x6162,/* 0xB0-0xB7 */ + 0x633D,0x6669,0x66FC,0x6EFF,0x6F2B,0x7063,0x779E,0x842C,/* 0xB8-0xBF */ + 0x8513,0x883B,0x8F13,0x9945,0x9C3B,0x551C,0x62B9,0x672B,/* 0xC0-0xC7 */ + 0x6CAB,0x8309,0x896A,0x977A,0x4EA1,0x5984,0x5FD8,0x5FD9,/* 0xC8-0xCF */ + 0x671B,0x7DB2,0x7F54,0x8292,0x832B,0x83BD,0x8F1E,0x9099,/* 0xD0-0xD7 */ + 0x57CB,0x59B9,0x5A92,0x5BD0,0x6627,0x679A,0x6885,0x6BCF,/* 0xD8-0xDF */ + 0x7164,0x7F75,0x8CB7,0x8CE3,0x9081,0x9B45,0x8108,0x8C8A,/* 0xE0-0xE7 */ + 0x964C,0x9A40,0x9EA5,0x5B5F,0x6C13,0x731B,0x76F2,0x76DF,/* 0xE8-0xEF */ + 0x840C,0x51AA,0x8993,0x514D,0x5195,0x52C9,0x68C9,0x6C94,/* 0xF0-0xF7 */ + 0x7704,0x7720,0x7DBF,0x7DEC,0x9762,0x9EB5,0x6EC5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8511,0x51A5,0x540D,0x547D,0x660E,0x669D,0x6927,/* 0xA0-0xA7 */ + 0x6E9F,0x76BF,0x7791,0x8317,0x84C2,0x879F,0x9169,0x9298,/* 0xA8-0xAF */ + 0x9CF4,0x8882,0x4FAE,0x5192,0x52DF,0x59C6,0x5E3D,0x6155,/* 0xB0-0xB7 */ + 0x6478,0x6479,0x66AE,0x67D0,0x6A21,0x6BCD,0x6BDB,0x725F,/* 0xB8-0xBF */ + 0x7261,0x7441,0x7738,0x77DB,0x8017,0x82BC,0x8305,0x8B00,/* 0xC0-0xC7 */ + 0x8B28,0x8C8C,0x6728,0x6C90,0x7267,0x76EE,0x7766,0x7A46,/* 0xC8-0xCF */ + 0x9DA9,0x6B7F,0x6C92,0x5922,0x6726,0x8499,0x536F,0x5893,/* 0xD0-0xD7 */ + 0x5999,0x5EDF,0x63CF,0x6634,0x6773,0x6E3A,0x732B,0x7AD7,/* 0xD8-0xDF */ + 0x82D7,0x9328,0x52D9,0x5DEB,0x61AE,0x61CB,0x620A,0x62C7,/* 0xE0-0xE7 */ + 0x64AB,0x65E0,0x6959,0x6B66,0x6BCB,0x7121,0x73F7,0x755D,/* 0xE8-0xEF */ + 0x7E46,0x821E,0x8302,0x856A,0x8AA3,0x8CBF,0x9727,0x9D61,/* 0xF0-0xF7 */ + 0x58A8,0x9ED8,0x5011,0x520E,0x543B,0x554F,0x6587,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6C76,0x7D0A,0x7D0B,0x805E,0x868A,0x9580,0x96EF,/* 0xA0-0xA7 */ + 0x52FF,0x6C95,0x7269,0x5473,0x5A9A,0x5C3E,0x5D4B,0x5F4C,/* 0xA8-0xAF */ + 0x5FAE,0x672A,0x68B6,0x6963,0x6E3C,0x6E44,0x7709,0x7C73,/* 0xB0-0xB7 */ + 0x7F8E,0x8587,0x8B0E,0x8FF7,0x9761,0x9EF4,0x5CB7,0x60B6,/* 0xB8-0xBF */ + 0x610D,0x61AB,0x654F,0x65FB,0x65FC,0x6C11,0x6CEF,0x739F,/* 0xC0-0xC7 */ + 0x73C9,0x7DE1,0x9594,0x5BC6,0x871C,0x8B10,0x525D,0x535A,/* 0xC8-0xCF */ + 0x62CD,0x640F,0x64B2,0x6734,0x6A38,0x6CCA,0x73C0,0x749E,/* 0xD0-0xD7 */ + 0x7B94,0x7C95,0x7E1B,0x818A,0x8236,0x8584,0x8FEB,0x96F9,/* 0xD8-0xDF */ + 0x99C1,0x4F34,0x534A,0x53CD,0x53DB,0x62CC,0x642C,0x6500,/* 0xE0-0xE7 */ + 0x6591,0x69C3,0x6CEE,0x6F58,0x73ED,0x7554,0x7622,0x76E4,/* 0xE8-0xEF */ + 0x76FC,0x78D0,0x78FB,0x792C,0x7D46,0x822C,0x87E0,0x8FD4,/* 0xF0-0xF7 */ + 0x9812,0x98EF,0x52C3,0x62D4,0x64A5,0x6E24,0x6F51,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x767C,0x8DCB,0x91B1,0x9262,0x9AEE,0x9B43,0x5023,/* 0xA0-0xA7 */ + 0x508D,0x574A,0x59A8,0x5C28,0x5E47,0x5F77,0x623F,0x653E,/* 0xA8-0xAF */ + 0x65B9,0x65C1,0x6609,0x678B,0x699C,0x6EC2,0x78C5,0x7D21,/* 0xB0-0xB7 */ + 0x80AA,0x8180,0x822B,0x82B3,0x84A1,0x868C,0x8A2A,0x8B17,/* 0xB8-0xBF */ + 0x90A6,0x9632,0x9F90,0x500D,0x4FF3,0xF963,0x57F9,0x5F98,/* 0xC0-0xC7 */ + 0x62DC,0x6392,0x676F,0x6E43,0x7119,0x76C3,0x80CC,0x80DA,/* 0xC8-0xCF */ + 0x88F4,0x88F5,0x8919,0x8CE0,0x8F29,0x914D,0x966A,0x4F2F,/* 0xD0-0xD7 */ + 0x4F70,0x5E1B,0x67CF,0x6822,0x767D,0x767E,0x9B44,0x5E61,/* 0xD8-0xDF */ + 0x6A0A,0x7169,0x71D4,0x756A,0xF964,0x7E41,0x8543,0x85E9,/* 0xE0-0xE7 */ + 0x98DC,0x4F10,0x7B4F,0x7F70,0x95A5,0x51E1,0x5E06,0x68B5,/* 0xE8-0xEF */ + 0x6C3E,0x6C4E,0x6CDB,0x72AF,0x7BC4,0x8303,0x6CD5,0x743A,/* 0xF0-0xF7 */ + 0x50FB,0x5288,0x58C1,0x64D8,0x6A97,0x74A7,0x7656,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x78A7,0x8617,0x95E2,0x9739,0xF965,0x535E,0x5F01,/* 0xA0-0xA7 */ + 0x8B8A,0x8FA8,0x8FAF,0x908A,0x5225,0x77A5,0x9C49,0x9F08,/* 0xA8-0xAF */ + 0x4E19,0x5002,0x5175,0x5C5B,0x5E77,0x661E,0x663A,0x67C4,/* 0xB0-0xB7 */ + 0x68C5,0x70B3,0x7501,0x75C5,0x79C9,0x7ADD,0x8F27,0x9920,/* 0xB8-0xBF */ + 0x9A08,0x4FDD,0x5821,0x5831,0x5BF6,0x666E,0x6B65,0x6D11,/* 0xC0-0xC7 */ + 0x6E7A,0x6F7D,0x73E4,0x752B,0x83E9,0x88DC,0x8913,0x8B5C,/* 0xC8-0xCF */ + 0x8F14,0x4F0F,0x50D5,0x5310,0x535C,0x5B93,0x5FA9,0x670D,/* 0xD0-0xD7 */ + 0x798F,0x8179,0x832F,0x8514,0x8907,0x8986,0x8F39,0x8F3B,/* 0xD8-0xDF */ + 0x99A5,0x9C12,0x672C,0x4E76,0x4FF8,0x5949,0x5C01,0x5CEF,/* 0xE0-0xE7 */ + 0x5CF0,0x6367,0x68D2,0x70FD,0x71A2,0x742B,0x7E2B,0x84EC,/* 0xE8-0xEF */ + 0x8702,0x9022,0x92D2,0x9CF3,0x4E0D,0x4ED8,0x4FEF,0x5085,/* 0xF0-0xF7 */ + 0x5256,0x526F,0x5426,0x5490,0x57E0,0x592B,0x5A66,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5B5A,0x5B75,0x5BCC,0x5E9C,0xF966,0x6276,0x6577,/* 0xA0-0xA7 */ + 0x65A7,0x6D6E,0x6EA5,0x7236,0x7B26,0x7C3F,0x7F36,0x8150,/* 0xA8-0xAF */ + 0x8151,0x819A,0x8240,0x8299,0x83A9,0x8A03,0x8CA0,0x8CE6,/* 0xB0-0xB7 */ + 0x8CFB,0x8D74,0x8DBA,0x90E8,0x91DC,0x961C,0x9644,0x99D9,/* 0xB8-0xBF */ + 0x9CE7,0x5317,0x5206,0x5429,0x5674,0x58B3,0x5954,0x596E,/* 0xC0-0xC7 */ + 0x5FFF,0x61A4,0x626E,0x6610,0x6C7E,0x711A,0x76C6,0x7C89,/* 0xC8-0xCF */ + 0x7CDE,0x7D1B,0x82AC,0x8CC1,0x96F0,0xF967,0x4F5B,0x5F17,/* 0xD0-0xD7 */ + 0x5F7F,0x62C2,0x5D29,0x670B,0x68DA,0x787C,0x7E43,0x9D6C,/* 0xD8-0xDF */ + 0x4E15,0x5099,0x5315,0x532A,0x5351,0x5983,0x5A62,0x5E87,/* 0xE0-0xE7 */ + 0x60B2,0x618A,0x6249,0x6279,0x6590,0x6787,0x69A7,0x6BD4,/* 0xE8-0xEF */ + 0x6BD6,0x6BD7,0x6BD8,0x6CB8,0xF968,0x7435,0x75FA,0x7812,/* 0xF0-0xF7 */ + 0x7891,0x79D5,0x79D8,0x7C83,0x7DCB,0x7FE1,0x80A5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x813E,0x81C2,0x83F2,0x871A,0x88E8,0x8AB9,0x8B6C,/* 0xA0-0xA7 */ + 0x8CBB,0x9119,0x975E,0x98DB,0x9F3B,0x56AC,0x5B2A,0x5F6C,/* 0xA8-0xAF */ + 0x658C,0x6AB3,0x6BAF,0x6D5C,0x6FF1,0x7015,0x725D,0x73AD,/* 0xB0-0xB7 */ + 0x8CA7,0x8CD3,0x983B,0x6191,0x6C37,0x8058,0x9A01,0x4E4D,/* 0xB8-0xBF */ + 0x4E8B,0x4E9B,0x4ED5,0x4F3A,0x4F3C,0x4F7F,0x4FDF,0x50FF,/* 0xC0-0xC7 */ + 0x53F2,0x53F8,0x5506,0x55E3,0x56DB,0x58EB,0x5962,0x5A11,/* 0xC8-0xCF */ + 0x5BEB,0x5BFA,0x5C04,0x5DF3,0x5E2B,0x5F99,0x601D,0x6368,/* 0xD0-0xD7 */ + 0x659C,0x65AF,0x67F6,0x67FB,0x68AD,0x6B7B,0x6C99,0x6CD7,/* 0xD8-0xDF */ + 0x6E23,0x7009,0x7345,0x7802,0x793E,0x7940,0x7960,0x79C1,/* 0xE0-0xE7 */ + 0x7BE9,0x7D17,0x7D72,0x8086,0x820D,0x838E,0x84D1,0x86C7,/* 0xE8-0xEF */ + 0x88DF,0x8A50,0x8A5E,0x8B1D,0x8CDC,0x8D66,0x8FAD,0x90AA,/* 0xF0-0xF7 */ + 0x98FC,0x99DF,0x9E9D,0x524A,0xF969,0x6714,0xF96A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5098,0x522A,0x5C71,0x6563,0x6C55,0x73CA,0x7523,/* 0xA0-0xA7 */ + 0x759D,0x7B97,0x849C,0x9178,0x9730,0x4E77,0x6492,0x6BBA,/* 0xA8-0xAF */ + 0x715E,0x85A9,0x4E09,0xF96B,0x6749,0x68EE,0x6E17,0x829F,/* 0xB0-0xB7 */ + 0x8518,0x886B,0x63F7,0x6F81,0x9212,0x98AF,0x4E0A,0x50B7,/* 0xB8-0xBF */ + 0x50CF,0x511F,0x5546,0x55AA,0x5617,0x5B40,0x5C19,0x5CE0,/* 0xC0-0xC7 */ + 0x5E38,0x5E8A,0x5EA0,0x5EC2,0x60F3,0x6851,0x6A61,0x6E58,/* 0xC8-0xCF */ + 0x723D,0x7240,0x72C0,0x76F8,0x7965,0x7BB1,0x7FD4,0x88F3,/* 0xD0-0xD7 */ + 0x89F4,0x8A73,0x8C61,0x8CDE,0x971C,0x585E,0x74BD,0x8CFD,/* 0xD8-0xDF */ + 0x55C7,0xF96C,0x7A61,0x7D22,0x8272,0x7272,0x751F,0x7525,/* 0xE0-0xE7 */ + 0xF96D,0x7B19,0x5885,0x58FB,0x5DBC,0x5E8F,0x5EB6,0x5F90,/* 0xE8-0xEF */ + 0x6055,0x6292,0x637F,0x654D,0x6691,0x66D9,0x66F8,0x6816,/* 0xF0-0xF7 */ + 0x68F2,0x7280,0x745E,0x7B6E,0x7D6E,0x7DD6,0x7F72,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x80E5,0x8212,0x85AF,0x897F,0x8A93,0x901D,0x92E4,/* 0xA0-0xA7 */ + 0x9ECD,0x9F20,0x5915,0x596D,0x5E2D,0x60DC,0x6614,0x6673,/* 0xA8-0xAF */ + 0x6790,0x6C50,0x6DC5,0x6F5F,0x77F3,0x78A9,0x84C6,0x91CB,/* 0xB0-0xB7 */ + 0x932B,0x4ED9,0x50CA,0x5148,0x5584,0x5B0B,0x5BA3,0x6247,/* 0xB8-0xBF */ + 0x657E,0x65CB,0x6E32,0x717D,0x7401,0x7444,0x7487,0x74BF,/* 0xC0-0xC7 */ + 0x766C,0x79AA,0x7DDA,0x7E55,0x7FA8,0x817A,0x81B3,0x8239,/* 0xC8-0xCF */ + 0x861A,0x87EC,0x8A75,0x8DE3,0x9078,0x9291,0x9425,0x994D,/* 0xD0-0xD7 */ + 0x9BAE,0x5368,0x5C51,0x6954,0x6CC4,0x6D29,0x6E2B,0x820C,/* 0xD8-0xDF */ + 0x859B,0x893B,0x8A2D,0x8AAA,0x96EA,0x9F67,0x5261,0x66B9,/* 0xE0-0xE7 */ + 0x6BB2,0x7E96,0x87FE,0x8D0D,0x9583,0x965D,0x651D,0x6D89,/* 0xE8-0xEF */ + 0x71EE,0xF96E,0x57CE,0x59D3,0x5BAC,0x6027,0x60FA,0x6210,/* 0xF0-0xF7 */ + 0x661F,0x665F,0x7329,0x73F9,0x76DB,0x7701,0x7B6C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8056,0x8072,0x8165,0x8AA0,0x9192,0x4E16,0x52E2,/* 0xA0-0xA7 */ + 0x6B72,0x6D17,0x7A05,0x7B39,0x7D30,0xF96F,0x8CB0,0x53EC,/* 0xA8-0xAF */ + 0x562F,0x5851,0x5BB5,0x5C0F,0x5C11,0x5DE2,0x6240,0x6383,/* 0xB0-0xB7 */ + 0x6414,0x662D,0x68B3,0x6CBC,0x6D88,0x6EAF,0x701F,0x70A4,/* 0xB8-0xBF */ + 0x71D2,0x7526,0x758F,0x758E,0x7619,0x7B11,0x7BE0,0x7C2B,/* 0xC0-0xC7 */ + 0x7D20,0x7D39,0x852C,0x856D,0x8607,0x8A34,0x900D,0x9061,/* 0xC8-0xCF */ + 0x90B5,0x92B7,0x97F6,0x9A37,0x4FD7,0x5C6C,0x675F,0x6D91,/* 0xD0-0xD7 */ + 0x7C9F,0x7E8C,0x8B16,0x8D16,0x901F,0x5B6B,0x5DFD,0x640D,/* 0xD8-0xDF */ + 0x84C0,0x905C,0x98E1,0x7387,0x5B8B,0x609A,0x677E,0x6DDE,/* 0xE0-0xE7 */ + 0x8A1F,0x8AA6,0x9001,0x980C,0x5237,0xF970,0x7051,0x788E,/* 0xE8-0xEF */ + 0x9396,0x8870,0x91D7,0x4FEE,0x53D7,0x55FD,0x56DA,0x5782,/* 0xF0-0xF7 */ + 0x58FD,0x5AC2,0x5B88,0x5CAB,0x5CC0,0x5E25,0x6101,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x620D,0x624B,0x6388,0x641C,0x6536,0x6578,0x6A39,/* 0xA0-0xA7 */ + 0x6B8A,0x6C34,0x6D19,0x6F31,0x71E7,0x72E9,0x7378,0x7407,/* 0xA8-0xAF */ + 0x74B2,0x7626,0x7761,0x79C0,0x7A57,0x7AEA,0x7CB9,0x7D8F,/* 0xB0-0xB7 */ + 0x7DAC,0x7E61,0x7F9E,0x8129,0x8331,0x8490,0x84DA,0x85EA,/* 0xB8-0xBF */ + 0x8896,0x8AB0,0x8B90,0x8F38,0x9042,0x9083,0x916C,0x9296,/* 0xC0-0xC7 */ + 0x92B9,0x968B,0x96A7,0x96A8,0x96D6,0x9700,0x9808,0x9996,/* 0xC8-0xCF */ + 0x9AD3,0x9B1A,0x53D4,0x587E,0x5919,0x5B70,0x5BBF,0x6DD1,/* 0xD0-0xD7 */ + 0x6F5A,0x719F,0x7421,0x74B9,0x8085,0x83FD,0x5DE1,0x5F87,/* 0xD8-0xDF */ + 0x5FAA,0x6042,0x65EC,0x6812,0x696F,0x6A53,0x6B89,0x6D35,/* 0xE0-0xE7 */ + 0x6DF3,0x73E3,0x76FE,0x77AC,0x7B4D,0x7D14,0x8123,0x821C,/* 0xE8-0xEF */ + 0x8340,0x84F4,0x8563,0x8A62,0x8AC4,0x9187,0x931E,0x9806,/* 0xF0-0xF7 */ + 0x99B4,0x620C,0x8853,0x8FF0,0x9265,0x5D07,0x5D27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5D69,0x745F,0x819D,0x8768,0x6FD5,0x62FE,0x7FD2,/* 0xA0-0xA7 */ + 0x8936,0x8972,0x4E1E,0x4E58,0x50E7,0x52DD,0x5347,0x627F,/* 0xA8-0xAF */ + 0x6607,0x7E69,0x8805,0x965E,0x4F8D,0x5319,0x5636,0x59CB,/* 0xB0-0xB7 */ + 0x5AA4,0x5C38,0x5C4E,0x5C4D,0x5E02,0x5F11,0x6043,0x65BD,/* 0xB8-0xBF */ + 0x662F,0x6642,0x67BE,0x67F4,0x731C,0x77E2,0x793A,0x7FC5,/* 0xC0-0xC7 */ + 0x8494,0x84CD,0x8996,0x8A66,0x8A69,0x8AE1,0x8C55,0x8C7A,/* 0xC8-0xCF */ + 0x57F4,0x5BD4,0x5F0F,0x606F,0x62ED,0x690D,0x6B96,0x6E5C,/* 0xD0-0xD7 */ + 0x7184,0x7BD2,0x8755,0x8B58,0x8EFE,0x98DF,0x98FE,0x4F38,/* 0xD8-0xDF */ + 0x4F81,0x4FE1,0x547B,0x5A20,0x5BB8,0x613C,0x65B0,0x6668,/* 0xE0-0xE7 */ + 0x71FC,0x7533,0x795E,0x7D33,0x814E,0x81E3,0x8398,0x85AA,/* 0xE8-0xEF */ + 0x85CE,0x8703,0x8A0A,0x8EAB,0x8F9B,0xF971,0x8FC5,0x5931,/* 0xF0-0xF7 */ + 0x5BA4,0x5BE6,0x6089,0x5BE9,0x5C0B,0x5FC3,0x6C81,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF972,0x6DF1,0x700B,0x751A,0x82AF,0x8AF6,0x4EC0,/* 0xA0-0xA7 */ + 0x5341,0xF973,0x96D9,0x6C0F,0x4E9E,0x4FC4,0x5152,0x555E,/* 0xA8-0xAF */ + 0x5A25,0x5CE8,0x6211,0x7259,0x82BD,0x83AA,0x86FE,0x8859,/* 0xB0-0xB7 */ + 0x8A1D,0x963F,0x96C5,0x9913,0x9D09,0x9D5D,0x580A,0x5CB3,/* 0xB8-0xBF */ + 0x5DBD,0x5E44,0x60E1,0x6115,0x63E1,0x6A02,0x6E25,0x9102,/* 0xC0-0xC7 */ + 0x9354,0x984E,0x9C10,0x9F77,0x5B89,0x5CB8,0x6309,0x664F,/* 0xC8-0xCF */ + 0x6848,0x773C,0x96C1,0x978D,0x9854,0x9B9F,0x65A1,0x8B01,/* 0xD0-0xD7 */ + 0x8ECB,0x95BC,0x5535,0x5CA9,0x5DD6,0x5EB5,0x6697,0x764C,/* 0xD8-0xDF */ + 0x83F4,0x95C7,0x58D3,0x62BC,0x72CE,0x9D28,0x4EF0,0x592E,/* 0xE0-0xE7 */ + 0x600F,0x663B,0x6B83,0x79E7,0x9D26,0x5393,0x54C0,0x57C3,/* 0xE8-0xEF */ + 0x5D16,0x611B,0x66D6,0x6DAF,0x788D,0x827E,0x9698,0x9744,/* 0xF0-0xF7 */ + 0x5384,0x627C,0x6396,0x6DB2,0x7E0A,0x814B,0x984D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6AFB,0x7F4C,0x9DAF,0x9E1A,0x4E5F,0x503B,0x51B6,/* 0xA0-0xA7 */ + 0x591C,0x60F9,0x63F6,0x6930,0x723A,0x8036,0xF974,0x91CE,/* 0xA8-0xAF */ + 0x5F31,0xF975,0xF976,0x7D04,0x82E5,0x846F,0x84BB,0x85E5,/* 0xB0-0xB7 */ + 0x8E8D,0xF977,0x4F6F,0xF978,0xF979,0x58E4,0x5B43,0x6059,/* 0xB8-0xBF */ + 0x63DA,0x6518,0x656D,0x6698,0xF97A,0x694A,0x6A23,0x6D0B,/* 0xC0-0xC7 */ + 0x7001,0x716C,0x75D2,0x760D,0x79B3,0x7A70,0xF97B,0x7F8A,/* 0xC8-0xCF */ + 0xF97C,0x8944,0xF97D,0x8B93,0x91C0,0x967D,0xF97E,0x990A,/* 0xD0-0xD7 */ + 0x5704,0x5FA1,0x65BC,0x6F01,0x7600,0x79A6,0x8A9E,0x99AD,/* 0xD8-0xDF */ + 0x9B5A,0x9F6C,0x5104,0x61B6,0x6291,0x6A8D,0x81C6,0x5043,/* 0xE0-0xE7 */ + 0x5830,0x5F66,0x7109,0x8A00,0x8AFA,0x5B7C,0x8616,0x4FFA,/* 0xE8-0xEF */ + 0x513C,0x56B4,0x5944,0x63A9,0x6DF9,0x5DAA,0x696D,0x5186,/* 0xF0-0xF7 */ + 0x4E88,0x4F59,0xF97F,0xF980,0xF981,0x5982,0xF982,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF983,0x6B5F,0x6C5D,0xF984,0x74B5,0x7916,0xF985,/* 0xA0-0xA7 */ + 0x8207,0x8245,0x8339,0x8F3F,0x8F5D,0xF986,0x9918,0xF987,/* 0xA8-0xAF */ + 0xF988,0xF989,0x4EA6,0xF98A,0x57DF,0x5F79,0x6613,0xF98B,/* 0xB0-0xB7 */ + 0xF98C,0x75AB,0x7E79,0x8B6F,0xF98D,0x9006,0x9A5B,0x56A5,/* 0xB8-0xBF */ + 0x5827,0x59F8,0x5A1F,0x5BB4,0xF98E,0x5EF6,0xF98F,0xF990,/* 0xC0-0xC7 */ + 0x6350,0x633B,0xF991,0x693D,0x6C87,0x6CBF,0x6D8E,0x6D93,/* 0xC8-0xCF */ + 0x6DF5,0x6F14,0xF992,0x70DF,0x7136,0x7159,0xF993,0x71C3,/* 0xD0-0xD7 */ + 0x71D5,0xF994,0x784F,0x786F,0xF995,0x7B75,0x7DE3,0xF996,/* 0xD8-0xDF */ + 0x7E2F,0xF997,0x884D,0x8EDF,0xF998,0xF999,0xF99A,0x925B,/* 0xE0-0xE7 */ + 0xF99B,0x9CF6,0xF99C,0xF99D,0xF99E,0x6085,0x6D85,0xF99F,/* 0xE8-0xEF */ + 0x71B1,0xF9A0,0xF9A1,0x95B1,0x53AD,0xF9A2,0xF9A3,0xF9A4,/* 0xF0-0xF7 */ + 0x67D3,0xF9A5,0x708E,0x7130,0x7430,0x8276,0x82D2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF9A6,0x95BB,0x9AE5,0x9E7D,0x66C4,0xF9A7,0x71C1,/* 0xA0-0xA7 */ + 0x8449,0xF9A8,0xF9A9,0x584B,0xF9AA,0xF9AB,0x5DB8,0x5F71,/* 0xA8-0xAF */ + 0xF9AC,0x6620,0x668E,0x6979,0x69AE,0x6C38,0x6CF3,0x6E36,/* 0xB0-0xB7 */ + 0x6F41,0x6FDA,0x701B,0x702F,0x7150,0x71DF,0x7370,0xF9AD,/* 0xB8-0xBF */ + 0x745B,0xF9AE,0x74D4,0x76C8,0x7A4E,0x7E93,0xF9AF,0xF9B0,/* 0xC0-0xC7 */ + 0x82F1,0x8A60,0x8FCE,0xF9B1,0x9348,0xF9B2,0x9719,0xF9B3,/* 0xC8-0xCF */ + 0xF9B4,0x4E42,0x502A,0xF9B5,0x5208,0x53E1,0x66F3,0x6C6D,/* 0xD0-0xD7 */ + 0x6FCA,0x730A,0x777F,0x7A62,0x82AE,0x85DD,0x8602,0xF9B6,/* 0xD8-0xDF */ + 0x88D4,0x8A63,0x8B7D,0x8C6B,0xF9B7,0x92B3,0xF9B8,0x9713,/* 0xE0-0xE7 */ + 0x9810,0x4E94,0x4F0D,0x4FC9,0x50B2,0x5348,0x543E,0x5433,/* 0xE8-0xEF */ + 0x55DA,0x5862,0x58BA,0x5967,0x5A1B,0x5BE4,0x609F,0xF9B9,/* 0xF0-0xF7 */ + 0x61CA,0x6556,0x65FF,0x6664,0x68A7,0x6C5A,0x6FB3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x70CF,0x71AC,0x7352,0x7B7D,0x8708,0x8AA4,0x9C32,/* 0xA0-0xA7 */ + 0x9F07,0x5C4B,0x6C83,0x7344,0x7389,0x923A,0x6EAB,0x7465,/* 0xA8-0xAF */ + 0x761F,0x7A69,0x7E15,0x860A,0x5140,0x58C5,0x64C1,0x74EE,/* 0xB0-0xB7 */ + 0x7515,0x7670,0x7FC1,0x9095,0x96CD,0x9954,0x6E26,0x74E6,/* 0xB8-0xBF */ + 0x7AA9,0x7AAA,0x81E5,0x86D9,0x8778,0x8A1B,0x5A49,0x5B8C,/* 0xC0-0xC7 */ + 0x5B9B,0x68A1,0x6900,0x6D63,0x73A9,0x7413,0x742C,0x7897,/* 0xC8-0xCF */ + 0x7DE9,0x7FEB,0x8118,0x8155,0x839E,0x8C4C,0x962E,0x9811,/* 0xD0-0xD7 */ + 0x66F0,0x5F80,0x65FA,0x6789,0x6C6A,0x738B,0x502D,0x5A03,/* 0xD8-0xDF */ + 0x6B6A,0x77EE,0x5916,0x5D6C,0x5DCD,0x7325,0x754F,0xF9BA,/* 0xE0-0xE7 */ + 0xF9BB,0x50E5,0x51F9,0x582F,0x592D,0x5996,0x59DA,0x5BE5,/* 0xE8-0xEF */ + 0xF9BC,0xF9BD,0x5DA2,0x62D7,0x6416,0x6493,0x64FE,0xF9BE,/* 0xF0-0xF7 */ + 0x66DC,0xF9BF,0x6A48,0xF9C0,0x71FF,0x7464,0xF9C1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7A88,0x7AAF,0x7E47,0x7E5E,0x8000,0x8170,0xF9C2,/* 0xA0-0xA7 */ + 0x87EF,0x8981,0x8B20,0x9059,0xF9C3,0x9080,0x9952,0x617E,/* 0xA8-0xAF */ + 0x6B32,0x6D74,0x7E1F,0x8925,0x8FB1,0x4FD1,0x50AD,0x5197,/* 0xB0-0xB7 */ + 0x52C7,0x57C7,0x5889,0x5BB9,0x5EB8,0x6142,0x6995,0x6D8C,/* 0xB8-0xBF */ + 0x6E67,0x6EB6,0x7194,0x7462,0x7528,0x752C,0x8073,0x8338,/* 0xC0-0xC7 */ + 0x84C9,0x8E0A,0x9394,0x93DE,0xF9C4,0x4E8E,0x4F51,0x5076,/* 0xC8-0xCF */ + 0x512A,0x53C8,0x53CB,0x53F3,0x5B87,0x5BD3,0x5C24,0x611A,/* 0xD0-0xD7 */ + 0x6182,0x65F4,0x725B,0x7397,0x7440,0x76C2,0x7950,0x7991,/* 0xD8-0xDF */ + 0x79B9,0x7D06,0x7FBD,0x828B,0x85D5,0x865E,0x8FC2,0x9047,/* 0xE0-0xE7 */ + 0x90F5,0x91EA,0x9685,0x96E8,0x96E9,0x52D6,0x5F67,0x65ED,/* 0xE8-0xEF */ + 0x6631,0x682F,0x715C,0x7A36,0x90C1,0x980A,0x4E91,0xF9C5,/* 0xF0-0xF7 */ + 0x6A52,0x6B9E,0x6F90,0x7189,0x8018,0x82B8,0x8553,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x904B,0x9695,0x96F2,0x97FB,0x851A,0x9B31,0x4E90,/* 0xA0-0xA7 */ + 0x718A,0x96C4,0x5143,0x539F,0x54E1,0x5713,0x5712,0x57A3,/* 0xA8-0xAF */ + 0x5A9B,0x5AC4,0x5BC3,0x6028,0x613F,0x63F4,0x6C85,0x6D39,/* 0xB0-0xB7 */ + 0x6E72,0x6E90,0x7230,0x733F,0x7457,0x82D1,0x8881,0x8F45,/* 0xB8-0xBF */ + 0x9060,0xF9C6,0x9662,0x9858,0x9D1B,0x6708,0x8D8A,0x925E,/* 0xC0-0xC7 */ + 0x4F4D,0x5049,0x50DE,0x5371,0x570D,0x59D4,0x5A01,0x5C09,/* 0xC8-0xCF */ + 0x6170,0x6690,0x6E2D,0x7232,0x744B,0x7DEF,0x80C3,0x840E,/* 0xD0-0xD7 */ + 0x8466,0x853F,0x875F,0x885B,0x8918,0x8B02,0x9055,0x97CB,/* 0xD8-0xDF */ + 0x9B4F,0x4E73,0x4F91,0x5112,0x516A,0xF9C7,0x552F,0x55A9,/* 0xE0-0xE7 */ + 0x5B7A,0x5BA5,0x5E7C,0x5E7D,0x5EBE,0x60A0,0x60DF,0x6108,/* 0xE8-0xEF */ + 0x6109,0x63C4,0x6538,0x6709,0xF9C8,0x67D4,0x67DA,0xF9C9,/* 0xF0-0xF7 */ + 0x6961,0x6962,0x6CB9,0x6D27,0xF9CA,0x6E38,0xF9CB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6FE1,0x7336,0x7337,0xF9CC,0x745C,0x7531,0xF9CD,/* 0xA0-0xA7 */ + 0x7652,0xF9CE,0xF9CF,0x7DAD,0x81FE,0x8438,0x88D5,0x8A98,/* 0xA8-0xAF */ + 0x8ADB,0x8AED,0x8E30,0x8E42,0x904A,0x903E,0x907A,0x9149,/* 0xB0-0xB7 */ + 0x91C9,0x936E,0xF9D0,0xF9D1,0x5809,0xF9D2,0x6BD3,0x8089,/* 0xB8-0xBF */ + 0x80B2,0xF9D3,0xF9D4,0x5141,0x596B,0x5C39,0xF9D5,0xF9D6,/* 0xC0-0xC7 */ + 0x6F64,0x73A7,0x80E4,0x8D07,0xF9D7,0x9217,0x958F,0xF9D8,/* 0xC8-0xCF */ + 0xF9D9,0xF9DA,0xF9DB,0x807F,0x620E,0x701C,0x7D68,0x878D,/* 0xD0-0xD7 */ + 0xF9DC,0x57A0,0x6069,0x6147,0x6BB7,0x8ABE,0x9280,0x96B1,/* 0xD8-0xDF */ + 0x4E59,0x541F,0x6DEB,0x852D,0x9670,0x97F3,0x98EE,0x63D6,/* 0xE0-0xE7 */ + 0x6CE3,0x9091,0x51DD,0x61C9,0x81BA,0x9DF9,0x4F9D,0x501A,/* 0xE8-0xEF */ + 0x5100,0x5B9C,0x610F,0x61FF,0x64EC,0x6905,0x6BC5,0x7591,/* 0xF0-0xF7 */ + 0x77E3,0x7FA9,0x8264,0x858F,0x87FB,0x8863,0x8ABC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8B70,0x91AB,0x4E8C,0x4EE5,0x4F0A,0xF9DD,0xF9DE,/* 0xA0-0xA7 */ + 0x5937,0x59E8,0xF9DF,0x5DF2,0x5F1B,0x5F5B,0x6021,0xF9E0,/* 0xA8-0xAF */ + 0xF9E1,0xF9E2,0xF9E3,0x723E,0x73E5,0xF9E4,0x7570,0x75CD,/* 0xB0-0xB7 */ + 0xF9E5,0x79FB,0xF9E6,0x800C,0x8033,0x8084,0x82E1,0x8351,/* 0xB8-0xBF */ + 0xF9E7,0xF9E8,0x8CBD,0x8CB3,0x9087,0xF9E9,0xF9EA,0x98F4,/* 0xC0-0xC7 */ + 0x990C,0xF9EB,0xF9EC,0x7037,0x76CA,0x7FCA,0x7FCC,0x7FFC,/* 0xC8-0xCF */ + 0x8B1A,0x4EBA,0x4EC1,0x5203,0x5370,0xF9ED,0x54BD,0x56E0,/* 0xD0-0xD7 */ + 0x59FB,0x5BC5,0x5F15,0x5FCD,0x6E6E,0xF9EE,0xF9EF,0x7D6A,/* 0xD8-0xDF */ + 0x8335,0xF9F0,0x8693,0x8A8D,0xF9F1,0x976D,0x9777,0xF9F2,/* 0xE0-0xE7 */ + 0xF9F3,0x4E00,0x4F5A,0x4F7E,0x58F9,0x65E5,0x6EA2,0x9038,/* 0xE8-0xEF */ + 0x93B0,0x99B9,0x4EFB,0x58EC,0x598A,0x59D9,0x6041,0xF9F4,/* 0xF0-0xF7 */ + 0xF9F5,0x7A14,0xF9F6,0x834F,0x8CC3,0x5165,0x5344,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF9F7,0xF9F8,0xF9F9,0x4ECD,0x5269,0x5B55,0x82BF,/* 0xA0-0xA7 */ + 0x4ED4,0x523A,0x54A8,0x59C9,0x59FF,0x5B50,0x5B57,0x5B5C,/* 0xA8-0xAF */ + 0x6063,0x6148,0x6ECB,0x7099,0x716E,0x7386,0x74F7,0x75B5,/* 0xB0-0xB7 */ + 0x78C1,0x7D2B,0x8005,0x81EA,0x8328,0x8517,0x85C9,0x8AEE,/* 0xB8-0xBF */ + 0x8CC7,0x96CC,0x4F5C,0x52FA,0x56BC,0x65AB,0x6628,0x707C,/* 0xC0-0xC7 */ + 0x70B8,0x7235,0x7DBD,0x828D,0x914C,0x96C0,0x9D72,0x5B71,/* 0xC8-0xCF */ + 0x68E7,0x6B98,0x6F7A,0x76DE,0x5C91,0x66AB,0x6F5B,0x7BB4,/* 0xD0-0xD7 */ + 0x7C2A,0x8836,0x96DC,0x4E08,0x4ED7,0x5320,0x5834,0x58BB,/* 0xD8-0xDF */ + 0x58EF,0x596C,0x5C07,0x5E33,0x5E84,0x5F35,0x638C,0x66B2,/* 0xE0-0xE7 */ + 0x6756,0x6A1F,0x6AA3,0x6B0C,0x6F3F,0x7246,0xF9FA,0x7350,/* 0xE8-0xEF */ + 0x748B,0x7AE0,0x7CA7,0x8178,0x81DF,0x81E7,0x838A,0x846C,/* 0xF0-0xF7 */ + 0x8523,0x8594,0x85CF,0x88DD,0x8D13,0x91AC,0x9577,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x969C,0x518D,0x54C9,0x5728,0x5BB0,0x624D,0x6750,/* 0xA0-0xA7 */ + 0x683D,0x6893,0x6E3D,0x6ED3,0x707D,0x7E21,0x88C1,0x8CA1,/* 0xA8-0xAF */ + 0x8F09,0x9F4B,0x9F4E,0x722D,0x7B8F,0x8ACD,0x931A,0x4F47,/* 0xB0-0xB7 */ + 0x4F4E,0x5132,0x5480,0x59D0,0x5E95,0x62B5,0x6775,0x696E,/* 0xB8-0xBF */ + 0x6A17,0x6CAE,0x6E1A,0x72D9,0x732A,0x75BD,0x7BB8,0x7D35,/* 0xC0-0xC7 */ + 0x82E7,0x83F9,0x8457,0x85F7,0x8A5B,0x8CAF,0x8E87,0x9019,/* 0xC8-0xCF */ + 0x90B8,0x96CE,0x9F5F,0x52E3,0x540A,0x5AE1,0x5BC2,0x6458,/* 0xD0-0xD7 */ + 0x6575,0x6EF4,0x72C4,0xF9FB,0x7684,0x7A4D,0x7B1B,0x7C4D,/* 0xD8-0xDF */ + 0x7E3E,0x7FDF,0x837B,0x8B2B,0x8CCA,0x8D64,0x8DE1,0x8E5F,/* 0xE0-0xE7 */ + 0x8FEA,0x8FF9,0x9069,0x93D1,0x4F43,0x4F7A,0x50B3,0x5168,/* 0xE8-0xEF */ + 0x5178,0x524D,0x526A,0x5861,0x587C,0x5960,0x5C08,0x5C55,/* 0xF0-0xF7 */ + 0x5EDB,0x609B,0x6230,0x6813,0x6BBF,0x6C08,0x6FB1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x714E,0x7420,0x7530,0x7538,0x7551,0x7672,0x7B4C,/* 0xA0-0xA7 */ + 0x7B8B,0x7BAD,0x7BC6,0x7E8F,0x8A6E,0x8F3E,0x8F49,0x923F,/* 0xA8-0xAF */ + 0x9293,0x9322,0x942B,0x96FB,0x985A,0x986B,0x991E,0x5207,/* 0xB0-0xB7 */ + 0x622A,0x6298,0x6D59,0x7664,0x7ACA,0x7BC0,0x7D76,0x5360,/* 0xB8-0xBF */ + 0x5CBE,0x5E97,0x6F38,0x70B9,0x7C98,0x9711,0x9B8E,0x9EDE,/* 0xC0-0xC7 */ + 0x63A5,0x647A,0x8776,0x4E01,0x4E95,0x4EAD,0x505C,0x5075,/* 0xC8-0xCF */ + 0x5448,0x59C3,0x5B9A,0x5E40,0x5EAD,0x5EF7,0x5F81,0x60C5,/* 0xD0-0xD7 */ + 0x633A,0x653F,0x6574,0x65CC,0x6676,0x6678,0x67FE,0x6968,/* 0xD8-0xDF */ + 0x6A89,0x6B63,0x6C40,0x6DC0,0x6DE8,0x6E1F,0x6E5E,0x701E,/* 0xE0-0xE7 */ + 0x70A1,0x738E,0x73FD,0x753A,0x775B,0x7887,0x798E,0x7A0B,/* 0xE8-0xEF */ + 0x7A7D,0x7CBE,0x7D8E,0x8247,0x8A02,0x8AEA,0x8C9E,0x912D,/* 0xF0-0xF7 */ + 0x914A,0x91D8,0x9266,0x92CC,0x9320,0x9706,0x9756,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x975C,0x9802,0x9F0E,0x5236,0x5291,0x557C,0x5824,/* 0xA0-0xA7 */ + 0x5E1D,0x5F1F,0x608C,0x63D0,0x68AF,0x6FDF,0x796D,0x7B2C,/* 0xA8-0xAF */ + 0x81CD,0x85BA,0x88FD,0x8AF8,0x8E44,0x918D,0x9664,0x969B,/* 0xB0-0xB7 */ + 0x973D,0x984C,0x9F4A,0x4FCE,0x5146,0x51CB,0x52A9,0x5632,/* 0xB8-0xBF */ + 0x5F14,0x5F6B,0x63AA,0x64CD,0x65E9,0x6641,0x66FA,0x66F9,/* 0xC0-0xC7 */ + 0x671D,0x689D,0x68D7,0x69FD,0x6F15,0x6F6E,0x7167,0x71E5,/* 0xC8-0xCF */ + 0x722A,0x74AA,0x773A,0x7956,0x795A,0x79DF,0x7A20,0x7A95,/* 0xD0-0xD7 */ + 0x7C97,0x7CDF,0x7D44,0x7E70,0x8087,0x85FB,0x86A4,0x8A54,/* 0xD8-0xDF */ + 0x8ABF,0x8D99,0x8E81,0x9020,0x906D,0x91E3,0x963B,0x96D5,/* 0xE0-0xE7 */ + 0x9CE5,0x65CF,0x7C07,0x8DB3,0x93C3,0x5B58,0x5C0A,0x5352,/* 0xE8-0xEF */ + 0x62D9,0x731D,0x5027,0x5B97,0x5F9E,0x60B0,0x616B,0x68D5,/* 0xF0-0xF7 */ + 0x6DD9,0x742E,0x7A2E,0x7D42,0x7D9C,0x7E31,0x816B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8E2A,0x8E35,0x937E,0x9418,0x4F50,0x5750,0x5DE6,/* 0xA0-0xA7 */ + 0x5EA7,0x632B,0x7F6A,0x4E3B,0x4F4F,0x4F8F,0x505A,0x59DD,/* 0xA8-0xAF */ + 0x80C4,0x546A,0x5468,0x55FE,0x594F,0x5B99,0x5DDE,0x5EDA,/* 0xB0-0xB7 */ + 0x665D,0x6731,0x67F1,0x682A,0x6CE8,0x6D32,0x6E4A,0x6F8D,/* 0xB8-0xBF */ + 0x70B7,0x73E0,0x7587,0x7C4C,0x7D02,0x7D2C,0x7DA2,0x821F,/* 0xC0-0xC7 */ + 0x86DB,0x8A3B,0x8A85,0x8D70,0x8E8A,0x8F33,0x9031,0x914E,/* 0xC8-0xCF */ + 0x9152,0x9444,0x99D0,0x7AF9,0x7CA5,0x4FCA,0x5101,0x51C6,/* 0xD0-0xD7 */ + 0x57C8,0x5BEF,0x5CFB,0x6659,0x6A3D,0x6D5A,0x6E96,0x6FEC,/* 0xD8-0xDF */ + 0x710C,0x756F,0x7AE3,0x8822,0x9021,0x9075,0x96CB,0x99FF,/* 0xE0-0xE7 */ + 0x8301,0x4E2D,0x4EF2,0x8846,0x91CD,0x537D,0x6ADB,0x696B,/* 0xE8-0xEF */ + 0x6C41,0x847A,0x589E,0x618E,0x66FE,0x62EF,0x70DD,0x7511,/* 0xF0-0xF7 */ + 0x75C7,0x7E52,0x84B8,0x8B49,0x8D08,0x4E4B,0x53EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54AB,0x5730,0x5740,0x5FD7,0x6301,0x6307,0x646F,/* 0xA0-0xA7 */ + 0x652F,0x65E8,0x667A,0x679D,0x67B3,0x6B62,0x6C60,0x6C9A,/* 0xA8-0xAF */ + 0x6F2C,0x77E5,0x7825,0x7949,0x7957,0x7D19,0x80A2,0x8102,/* 0xB0-0xB7 */ + 0x81F3,0x829D,0x82B7,0x8718,0x8A8C,0xF9FC,0x8D04,0x8DBE,/* 0xB8-0xBF */ + 0x9072,0x76F4,0x7A19,0x7A37,0x7E54,0x8077,0x5507,0x55D4,/* 0xC0-0xC7 */ + 0x5875,0x632F,0x6422,0x6649,0x664B,0x686D,0x699B,0x6B84,/* 0xC8-0xCF */ + 0x6D25,0x6EB1,0x73CD,0x7468,0x74A1,0x755B,0x75B9,0x76E1,/* 0xD0-0xD7 */ + 0x771E,0x778B,0x79E6,0x7E09,0x7E1D,0x81FB,0x852F,0x8897,/* 0xD8-0xDF */ + 0x8A3A,0x8CD1,0x8EEB,0x8FB0,0x9032,0x93AD,0x9663,0x9673,/* 0xE0-0xE7 */ + 0x9707,0x4F84,0x53F1,0x59EA,0x5AC9,0x5E19,0x684E,0x74C6,/* 0xE8-0xEF */ + 0x75BE,0x79E9,0x7A92,0x81A3,0x86ED,0x8CEA,0x8DCC,0x8FED,/* 0xF0-0xF7 */ + 0x659F,0x6715,0xF9FD,0x57F7,0x6F57,0x7DDD,0x8F2F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x93F6,0x96C6,0x5FB5,0x61F2,0x6F84,0x4E14,0x4F98,/* 0xA0-0xA7 */ + 0x501F,0x53C9,0x55DF,0x5D6F,0x5DEE,0x6B21,0x6B64,0x78CB,/* 0xA8-0xAF */ + 0x7B9A,0xF9FE,0x8E49,0x8ECA,0x906E,0x6349,0x643E,0x7740,/* 0xB0-0xB7 */ + 0x7A84,0x932F,0x947F,0x9F6A,0x64B0,0x6FAF,0x71E6,0x74A8,/* 0xB8-0xBF */ + 0x74DA,0x7AC4,0x7C12,0x7E82,0x7CB2,0x7E98,0x8B9A,0x8D0A,/* 0xC0-0xC7 */ + 0x947D,0x9910,0x994C,0x5239,0x5BDF,0x64E6,0x672D,0x7D2E,/* 0xC8-0xCF */ + 0x50ED,0x53C3,0x5879,0x6158,0x6159,0x61FA,0x65AC,0x7AD9,/* 0xD0-0xD7 */ + 0x8B92,0x8B96,0x5009,0x5021,0x5275,0x5531,0x5A3C,0x5EE0,/* 0xD8-0xDF */ + 0x5F70,0x6134,0x655E,0x660C,0x6636,0x66A2,0x69CD,0x6EC4,/* 0xE0-0xE7 */ + 0x6F32,0x7316,0x7621,0x7A93,0x8139,0x8259,0x83D6,0x84BC,/* 0xE8-0xEF */ + 0x50B5,0x57F0,0x5BC0,0x5BE8,0x5F69,0x63A1,0x7826,0x7DB5,/* 0xF0-0xF7 */ + 0x83DC,0x8521,0x91C7,0x91F5,0x518A,0x67F5,0x7B56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8CAC,0x51C4,0x59BB,0x60BD,0x8655,0x501C,0xF9FF,/* 0xA0-0xA7 */ + 0x5254,0x5C3A,0x617D,0x621A,0x62D3,0x64F2,0x65A5,0x6ECC,/* 0xA8-0xAF */ + 0x7620,0x810A,0x8E60,0x965F,0x96BB,0x4EDF,0x5343,0x5598,/* 0xB0-0xB7 */ + 0x5929,0x5DDD,0x64C5,0x6CC9,0x6DFA,0x7394,0x7A7F,0x821B,/* 0xB8-0xBF */ + 0x85A6,0x8CE4,0x8E10,0x9077,0x91E7,0x95E1,0x9621,0x97C6,/* 0xC0-0xC7 */ + 0x51F8,0x54F2,0x5586,0x5FB9,0x64A4,0x6F88,0x7DB4,0x8F1F,/* 0xC8-0xCF */ + 0x8F4D,0x9435,0x50C9,0x5C16,0x6CBE,0x6DFB,0x751B,0x77BB,/* 0xD0-0xD7 */ + 0x7C3D,0x7C64,0x8A79,0x8AC2,0x581E,0x59BE,0x5E16,0x6377,/* 0xD8-0xDF */ + 0x7252,0x758A,0x776B,0x8ADC,0x8CBC,0x8F12,0x5EF3,0x6674,/* 0xE0-0xE7 */ + 0x6DF8,0x807D,0x83C1,0x8ACB,0x9751,0x9BD6,0xFA00,0x5243,/* 0xE8-0xEF */ + 0x66FF,0x6D95,0x6EEF,0x7DE0,0x8AE6,0x902E,0x905E,0x9AD4,/* 0xF0-0xF7 */ + 0x521D,0x527F,0x54E8,0x6194,0x6284,0x62DB,0x68A2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6912,0x695A,0x6A35,0x7092,0x7126,0x785D,0x7901,/* 0xA0-0xA7 */ + 0x790E,0x79D2,0x7A0D,0x8096,0x8278,0x82D5,0x8349,0x8549,/* 0xA8-0xAF */ + 0x8C82,0x8D85,0x9162,0x918B,0x91AE,0x4FC3,0x56D1,0x71ED,/* 0xB0-0xB7 */ + 0x77D7,0x8700,0x89F8,0x5BF8,0x5FD6,0x6751,0x90A8,0x53E2,/* 0xB8-0xBF */ + 0x585A,0x5BF5,0x60A4,0x6181,0x6460,0x7E3D,0x8070,0x8525,/* 0xC0-0xC7 */ + 0x9283,0x64AE,0x50AC,0x5D14,0x6700,0x589C,0x62BD,0x63A8,/* 0xC8-0xCF */ + 0x690E,0x6978,0x6A1E,0x6E6B,0x76BA,0x79CB,0x82BB,0x8429,/* 0xD0-0xD7 */ + 0x8ACF,0x8DA8,0x8FFD,0x9112,0x914B,0x919C,0x9310,0x9318,/* 0xD8-0xDF */ + 0x939A,0x96DB,0x9A36,0x9C0D,0x4E11,0x755C,0x795D,0x7AFA,/* 0xE0-0xE7 */ + 0x7B51,0x7BC9,0x7E2E,0x84C4,0x8E59,0x8E74,0x8EF8,0x9010,/* 0xE8-0xEF */ + 0x6625,0x693F,0x7443,0x51FA,0x672E,0x9EDC,0x5145,0x5FE0,/* 0xF0-0xF7 */ + 0x6C96,0x87F2,0x885D,0x8877,0x60B4,0x81B5,0x8403,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8D05,0x53D6,0x5439,0x5634,0x5A36,0x5C31,0x708A,/* 0xA0-0xA7 */ + 0x7FE0,0x805A,0x8106,0x81ED,0x8DA3,0x9189,0x9A5F,0x9DF2,/* 0xA8-0xAF */ + 0x5074,0x4EC4,0x53A0,0x60FB,0x6E2C,0x5C64,0x4F88,0x5024,/* 0xB0-0xB7 */ + 0x55E4,0x5CD9,0x5E5F,0x6065,0x6894,0x6CBB,0x6DC4,0x71BE,/* 0xB8-0xBF */ + 0x75D4,0x75F4,0x7661,0x7A1A,0x7A49,0x7DC7,0x7DFB,0x7F6E,/* 0xC0-0xC7 */ + 0x81F4,0x86A9,0x8F1C,0x96C9,0x99B3,0x9F52,0x5247,0x52C5,/* 0xC8-0xCF */ + 0x98ED,0x89AA,0x4E03,0x67D2,0x6F06,0x4FB5,0x5BE2,0x6795,/* 0xD0-0xD7 */ + 0x6C88,0x6D78,0x741B,0x7827,0x91DD,0x937C,0x87C4,0x79E4,/* 0xD8-0xDF */ + 0x7A31,0x5FEB,0x4ED6,0x54A4,0x553E,0x58AE,0x59A5,0x60F0,/* 0xE0-0xE7 */ + 0x6253,0x62D6,0x6736,0x6955,0x8235,0x9640,0x99B1,0x99DD,/* 0xE8-0xEF */ + 0x502C,0x5353,0x5544,0x577C,0xFA01,0x6258,0xFA02,0x64E2,/* 0xF0-0xF7 */ + 0x666B,0x67DD,0x6FC1,0x6FEF,0x7422,0x7438,0x8A17,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9438,0x5451,0x5606,0x5766,0x5F48,0x619A,0x6B4E,/* 0xA0-0xA7 */ + 0x7058,0x70AD,0x7DBB,0x8A95,0x596A,0x812B,0x63A2,0x7708,/* 0xA8-0xAF */ + 0x803D,0x8CAA,0x5854,0x642D,0x69BB,0x5B95,0x5E11,0x6E6F,/* 0xB0-0xB7 */ + 0xFA03,0x8569,0x514C,0x53F0,0x592A,0x6020,0x614B,0x6B86,/* 0xB8-0xBF */ + 0x6C70,0x6CF0,0x7B1E,0x80CE,0x82D4,0x8DC6,0x90B0,0x98B1,/* 0xC0-0xC7 */ + 0xFA04,0x64C7,0x6FA4,0x6491,0x6504,0x514E,0x5410,0x571F,/* 0xC8-0xCF */ + 0x8A0E,0x615F,0x6876,0xFA05,0x75DB,0x7B52,0x7D71,0x901A,/* 0xD0-0xD7 */ + 0x5806,0x69CC,0x817F,0x892A,0x9000,0x9839,0x5078,0x5957,/* 0xD8-0xDF */ + 0x59AC,0x6295,0x900F,0x9B2A,0x615D,0x7279,0x95D6,0x5761,/* 0xE0-0xE7 */ + 0x5A46,0x5DF4,0x628A,0x64AD,0x64FA,0x6777,0x6CE2,0x6D3E,/* 0xE8-0xEF */ + 0x722C,0x7436,0x7834,0x7F77,0x82AD,0x8DDB,0x9817,0x5224,/* 0xF0-0xF7 */ + 0x5742,0x677F,0x7248,0x74E3,0x8CA9,0x8FA6,0x9211,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x962A,0x516B,0x53ED,0x634C,0x4F69,0x5504,0x6096,/* 0xA0-0xA7 */ + 0x6557,0x6C9B,0x6D7F,0x724C,0x72FD,0x7A17,0x8987,0x8C9D,/* 0xA8-0xAF */ + 0x5F6D,0x6F8E,0x70F9,0x81A8,0x610E,0x4FBF,0x504F,0x6241,/* 0xB0-0xB7 */ + 0x7247,0x7BC7,0x7DE8,0x7FE9,0x904D,0x97AD,0x9A19,0x8CB6,/* 0xB8-0xBF */ + 0x576A,0x5E73,0x67B0,0x840D,0x8A55,0x5420,0x5B16,0x5E63,/* 0xC0-0xC7 */ + 0x5EE2,0x5F0A,0x6583,0x80BA,0x853D,0x9589,0x965B,0x4F48,/* 0xC8-0xCF */ + 0x5305,0x530D,0x530F,0x5486,0x54FA,0x5703,0x5E03,0x6016,/* 0xD0-0xD7 */ + 0x629B,0x62B1,0x6355,0xFA06,0x6CE1,0x6D66,0x75B1,0x7832,/* 0xD8-0xDF */ + 0x80DE,0x812F,0x82DE,0x8461,0x84B2,0x888D,0x8912,0x900B,/* 0xE0-0xE7 */ + 0x92EA,0x98FD,0x9B91,0x5E45,0x66B4,0x66DD,0x7011,0x7206,/* 0xE8-0xEF */ + 0xFA07,0x4FF5,0x527D,0x5F6A,0x6153,0x6753,0x6A19,0x6F02,/* 0xF0-0xF7 */ + 0x74E2,0x7968,0x8868,0x8C79,0x98C7,0x98C4,0x9A43,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54C1,0x7A1F,0x6953,0x8AF7,0x8C4A,0x98A8,0x99AE,/* 0xA0-0xA7 */ + 0x5F7C,0x62AB,0x75B2,0x76AE,0x88AB,0x907F,0x9642,0x5339,/* 0xA8-0xAF */ + 0x5F3C,0x5FC5,0x6CCC,0x73CC,0x7562,0x758B,0x7B46,0x82FE,/* 0xB0-0xB7 */ + 0x999D,0x4E4F,0x903C,0x4E0B,0x4F55,0x53A6,0x590F,0x5EC8,/* 0xB8-0xBF */ + 0x6630,0x6CB3,0x7455,0x8377,0x8766,0x8CC0,0x9050,0x971E,/* 0xC0-0xC7 */ + 0x9C15,0x58D1,0x5B78,0x8650,0x8B14,0x9DB4,0x5BD2,0x6068,/* 0xC8-0xCF */ + 0x608D,0x65F1,0x6C57,0x6F22,0x6FA3,0x701A,0x7F55,0x7FF0,/* 0xD0-0xD7 */ + 0x9591,0x9592,0x9650,0x97D3,0x5272,0x8F44,0x51FD,0x542B,/* 0xD8-0xDF */ + 0x54B8,0x5563,0x558A,0x6ABB,0x6DB5,0x7DD8,0x8266,0x929C,/* 0xE0-0xE7 */ + 0x9677,0x9E79,0x5408,0x54C8,0x76D2,0x86E4,0x95A4,0x95D4,/* 0xE8-0xEF */ + 0x965C,0x4EA2,0x4F09,0x59EE,0x5AE6,0x5DF7,0x6052,0x6297,/* 0xF0-0xF7 */ + 0x676D,0x6841,0x6C86,0x6E2F,0x7F38,0x809B,0x822A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFA08,0xFA09,0x9805,0x4EA5,0x5055,0x54B3,0x5793,/* 0xA0-0xA7 */ + 0x595A,0x5B69,0x5BB3,0x61C8,0x6977,0x6D77,0x7023,0x87F9,/* 0xA8-0xAF */ + 0x89E3,0x8A72,0x8AE7,0x9082,0x99ED,0x9AB8,0x52BE,0x6838,/* 0xB0-0xB7 */ + 0x5016,0x5E78,0x674F,0x8347,0x884C,0x4EAB,0x5411,0x56AE,/* 0xB8-0xBF */ + 0x73E6,0x9115,0x97FF,0x9909,0x9957,0x9999,0x5653,0x589F,/* 0xC0-0xC7 */ + 0x865B,0x8A31,0x61B2,0x6AF6,0x737B,0x8ED2,0x6B47,0x96AA,/* 0xC8-0xCF */ + 0x9A57,0x5955,0x7200,0x8D6B,0x9769,0x4FD4,0x5CF4,0x5F26,/* 0xD0-0xD7 */ + 0x61F8,0x665B,0x6CEB,0x70AB,0x7384,0x73B9,0x73FE,0x7729,/* 0xD8-0xDF */ + 0x774D,0x7D43,0x7D62,0x7E23,0x8237,0x8852,0xFA0A,0x8CE2,/* 0xE0-0xE7 */ + 0x9249,0x986F,0x5B51,0x7A74,0x8840,0x9801,0x5ACC,0x4FE0,/* 0xE8-0xEF */ + 0x5354,0x593E,0x5CFD,0x633E,0x6D79,0x72F9,0x8105,0x8107,/* 0xF0-0xF7 */ + 0x83A2,0x92CF,0x9830,0x4EA8,0x5144,0x5211,0x578B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5F62,0x6CC2,0x6ECE,0x7005,0x7050,0x70AF,0x7192,/* 0xA0-0xA7 */ + 0x73E9,0x7469,0x834A,0x87A2,0x8861,0x9008,0x90A2,0x93A3,/* 0xA8-0xAF */ + 0x99A8,0x516E,0x5F57,0x60E0,0x6167,0x66B3,0x8559,0x8E4A,/* 0xB0-0xB7 */ + 0x91AF,0x978B,0x4E4E,0x4E92,0x547C,0x58D5,0x58FA,0x597D,/* 0xB8-0xBF */ + 0x5CB5,0x5F27,0x6236,0x6248,0x660A,0x6667,0x6BEB,0x6D69,/* 0xC0-0xC7 */ + 0x6DCF,0x6E56,0x6EF8,0x6F94,0x6FE0,0x6FE9,0x705D,0x72D0,/* 0xC8-0xCF */ + 0x7425,0x745A,0x74E0,0x7693,0x795C,0x7CCA,0x7E1E,0x80E1,/* 0xD0-0xD7 */ + 0x82A6,0x846B,0x84BF,0x864E,0x865F,0x8774,0x8B77,0x8C6A,/* 0xD8-0xDF */ + 0x93AC,0x9800,0x9865,0x60D1,0x6216,0x9177,0x5A5A,0x660F,/* 0xE0-0xE7 */ + 0x6DF7,0x6E3E,0x743F,0x9B42,0x5FFD,0x60DA,0x7B0F,0x54C4,/* 0xE8-0xEF */ + 0x5F18,0x6C5E,0x6CD3,0x6D2A,0x70D8,0x7D05,0x8679,0x8A0C,/* 0xF0-0xF7 */ + 0x9D3B,0x5316,0x548C,0x5B05,0x6A3A,0x706B,0x7575,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x798D,0x79BE,0x82B1,0x83EF,0x8A71,0x8B41,0x8CA8,/* 0xA0-0xA7 */ + 0x9774,0xFA0B,0x64F4,0x652B,0x78BA,0x78BB,0x7A6B,0x4E38,/* 0xA8-0xAF */ + 0x559A,0x5950,0x5BA6,0x5E7B,0x60A3,0x63DB,0x6B61,0x6665,/* 0xB0-0xB7 */ + 0x6853,0x6E19,0x7165,0x74B0,0x7D08,0x9084,0x9A69,0x9C25,/* 0xB8-0xBF */ + 0x6D3B,0x6ED1,0x733E,0x8C41,0x95CA,0x51F0,0x5E4C,0x5FA8,/* 0xC0-0xC7 */ + 0x604D,0x60F6,0x6130,0x614C,0x6643,0x6644,0x69A5,0x6CC1,/* 0xC8-0xCF */ + 0x6E5F,0x6EC9,0x6F62,0x714C,0x749C,0x7687,0x7BC1,0x7C27,/* 0xD0-0xD7 */ + 0x8352,0x8757,0x9051,0x968D,0x9EC3,0x532F,0x56DE,0x5EFB,/* 0xD8-0xDF */ + 0x5F8A,0x6062,0x6094,0x61F7,0x6666,0x6703,0x6A9C,0x6DEE,/* 0xE0-0xE7 */ + 0x6FAE,0x7070,0x736A,0x7E6A,0x81BE,0x8334,0x86D4,0x8AA8,/* 0xE8-0xEF */ + 0x8CC4,0x5283,0x7372,0x5B96,0x6A6B,0x9404,0x54EE,0x5686,/* 0xF0-0xF7 */ + 0x5B5D,0x6548,0x6585,0x66C9,0x689F,0x6D8D,0x6DC6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x723B,0x80B4,0x9175,0x9A4D,0x4FAF,0x5019,0x539A,/* 0xA0-0xA7 */ + 0x540E,0x543C,0x5589,0x55C5,0x5E3F,0x5F8C,0x673D,0x7166,/* 0xA8-0xAF */ + 0x73DD,0x9005,0x52DB,0x52F3,0x5864,0x58CE,0x7104,0x718F,/* 0xB0-0xB7 */ + 0x71FB,0x85B0,0x8A13,0x6688,0x85A8,0x55A7,0x6684,0x714A,/* 0xB8-0xBF */ + 0x8431,0x5349,0x5599,0x6BC1,0x5F59,0x5FBD,0x63EE,0x6689,/* 0xC0-0xC7 */ + 0x7147,0x8AF1,0x8F1D,0x9EBE,0x4F11,0x643A,0x70CB,0x7566,/* 0xC8-0xCF */ + 0x8667,0x6064,0x8B4E,0x9DF8,0x5147,0x51F6,0x5308,0x6D36,/* 0xD0-0xD7 */ + 0x80F8,0x9ED1,0x6615,0x6B23,0x7098,0x75D5,0x5403,0x5C79,/* 0xD8-0xDF */ + 0x7D07,0x8A16,0x6B20,0x6B3D,0x6B46,0x5438,0x6070,0x6D3D,/* 0xE0-0xE7 */ + 0x7FD5,0x8208,0x50D6,0x51DE,0x559C,0x566B,0x56CD,0x59EC,/* 0xE8-0xEF */ + 0x5B09,0x5E0C,0x6199,0x6198,0x6231,0x665E,0x66E6,0x7199,/* 0xF0-0xF7 */ + 0x71B9,0x71BA,0x72A7,0x79A7,0x7A00,0x7FB2,0x8A70,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, + c2u_C8, NULL, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, c2u_FA, c2u_FB, c2u_FC, c2u_FD, NULL, NULL, +}; + +static const unsigned char u2c_01[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA9, 0xA2, 0xA9, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA4, 0xA9, 0xA4, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA9, 0xA5, 0xA8, 0xA6, 0xA9, 0xA6, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xA9, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA8, /* 0x3C-0x3F */ + 0xA9, 0xA8, 0xA8, 0xA9, 0xA9, 0xA9, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA9, 0xB0, 0xA8, 0xAF, 0xA9, 0xAF, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAB, 0xA9, 0xAB, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAE, 0xA9, 0xAE, /* 0x64-0x67 */ +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA7, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xA2, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xA2, 0xA8, 0xA2, 0xAB, 0xA2, 0xAA, 0xA2, 0xAD, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA2, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0x90-0x93 */ + 0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0x94-0x97 */ + 0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0x98-0x9B */ + 0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0x9C-0x9F */ + 0xA5, 0xD0, 0xA5, 0xD1, 0x00, 0x00, 0xA5, 0xD2, /* 0xA0-0xA3 */ + 0xA5, 0xD3, 0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, /* 0xA4-0xA7 */ + 0xA5, 0xD7, 0xA5, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xB0-0xB3 */ + 0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xB4-0xB7 */ + 0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xB8-0xBB */ + 0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xBC-0xBF */ + 0xA5, 0xF0, 0xA5, 0xF1, 0x00, 0x00, 0xA5, 0xF2, /* 0xC0-0xC3 */ + 0xA5, 0xF3, 0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, /* 0xC4-0xC7 */ + 0xA5, 0xF7, 0xA5, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0xAC, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xAC, 0xA1, 0xAC, 0xA2, 0xAC, 0xA3, 0xAC, 0xA4, /* 0x10-0x13 */ + 0xAC, 0xA5, 0xAC, 0xA6, 0xAC, 0xA8, 0xAC, 0xA9, /* 0x14-0x17 */ + 0xAC, 0xAA, 0xAC, 0xAB, 0xAC, 0xAC, 0xAC, 0xAD, /* 0x18-0x1B */ + 0xAC, 0xAE, 0xAC, 0xAF, 0xAC, 0xB0, 0xAC, 0xB1, /* 0x1C-0x1F */ + 0xAC, 0xB2, 0xAC, 0xB3, 0xAC, 0xB4, 0xAC, 0xB5, /* 0x20-0x23 */ + 0xAC, 0xB6, 0xAC, 0xB7, 0xAC, 0xB8, 0xAC, 0xB9, /* 0x24-0x27 */ + 0xAC, 0xBA, 0xAC, 0xBB, 0xAC, 0xBC, 0xAC, 0xBD, /* 0x28-0x2B */ + 0xAC, 0xBE, 0xAC, 0xBF, 0xAC, 0xC0, 0xAC, 0xC1, /* 0x2C-0x2F */ + 0xAC, 0xD1, 0xAC, 0xD2, 0xAC, 0xD3, 0xAC, 0xD4, /* 0x30-0x33 */ + 0xAC, 0xD5, 0xAC, 0xD6, 0xAC, 0xD8, 0xAC, 0xD9, /* 0x34-0x37 */ + 0xAC, 0xDA, 0xAC, 0xDB, 0xAC, 0xDC, 0xAC, 0xDD, /* 0x38-0x3B */ + 0xAC, 0xDE, 0xAC, 0xDF, 0xAC, 0xE0, 0xAC, 0xE1, /* 0x3C-0x3F */ + 0xAC, 0xE2, 0xAC, 0xE3, 0xAC, 0xE4, 0xAC, 0xE5, /* 0x40-0x43 */ + 0xAC, 0xE6, 0xAC, 0xE7, 0xAC, 0xE8, 0xAC, 0xE9, /* 0x44-0x47 */ + 0xAC, 0xEA, 0xAC, 0xEB, 0xAC, 0xEC, 0xAC, 0xED, /* 0x48-0x4B */ + 0xAC, 0xEE, 0xAC, 0xEF, 0xAC, 0xF0, 0xAC, 0xF1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xAC, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ +}; + +static const unsigned char u2c_11[512] = { + 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA4, 0xA4, 0xA7, /* 0x00-0x03 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xB1, 0xA4, 0xB2, /* 0x04-0x07 */ + 0xA4, 0xB3, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x08-0x0B */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x0C-0x0F */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0x10-0x13 */ + 0xA4, 0xD5, 0xA4, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDD, 0x00, 0x00, /* 0x18-0x1B */ + 0xA4, 0xDE, 0xA4, 0xE1, 0xA4, 0xE2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0xE3, 0xA4, 0xB4, 0xA4, 0xE4, 0xA4, 0xE5, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE6, /* 0x24-0x27 */ + 0x00, 0x00, 0xA4, 0xE7, 0x00, 0x00, 0xA4, 0xE8, /* 0x28-0x2B */ + 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, 0xA4, 0xEC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xED, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xEE, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB5, 0xA4, 0xB6, /* 0x3C-0x3F */ + 0xA4, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xA4, 0xF2, 0xA4, 0xF3, 0xA4, 0xF0, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA4, 0xB7, 0x00, 0x00, 0xA4, 0xB8, 0xA4, 0xB9, /* 0x4C-0x4F */ + 0xA4, 0xB8, 0xA4, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xA4, 0xBA, 0xA4, 0xBA, 0x00, 0x00, 0xA4, 0xF4, /* 0x54-0x57 */ + 0xA4, 0xF5, 0xA4, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA4, 0xBF, 0xA4, 0xC0, 0xA4, 0xC1, /* 0x60-0x63 */ + 0xA4, 0xC2, 0xA4, 0xC3, 0xA4, 0xC4, 0xA4, 0xC5, /* 0x64-0x67 */ + 0xA4, 0xC6, 0xA4, 0xC7, 0xA4, 0xC8, 0xA4, 0xC9, /* 0x68-0x6B */ + 0xA4, 0xCA, 0xA4, 0xCB, 0xA4, 0xCC, 0xA4, 0xCD, /* 0x6C-0x6F */ + 0xA4, 0xCE, 0xA4, 0xCF, 0xA4, 0xD0, 0xA4, 0xD1, /* 0x70-0x73 */ + 0xA4, 0xD2, 0xA4, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xA4, 0xF7, 0xA4, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xA4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA4, 0xFA, 0xA4, 0xFB, 0x00, 0x00, /* 0x90-0x93 */ + 0xA4, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xFD, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, 0xA4, 0xA4, /* 0xA8-0xAB */ + 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, 0xA4, 0xA9, /* 0xAC-0xAF */ + 0xA4, 0xAA, 0xA4, 0xAB, 0xA4, 0xAC, 0xA4, 0xAD, /* 0xB0-0xB3 */ + 0xA4, 0xAE, 0xA4, 0xAF, 0xA4, 0xB0, 0xA4, 0xB1, /* 0xB4-0xB7 */ + 0xA4, 0xB2, 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, /* 0xB8-0xBB */ + 0xA4, 0xB7, 0xA4, 0xB8, 0xA4, 0xBA, 0xA4, 0xBB, /* 0xBC-0xBF */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD6, 0xA4, 0xD7, /* 0xC4-0xC7 */ + 0xA4, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xA4, 0xD9, 0x00, 0x00, 0xA4, 0xDA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA4, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xA4, 0xDE, 0xA4, 0xDF, 0x00, 0x00, 0xA4, 0xE0, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE8, 0xA4, 0xEA, /* 0xE4-0xE7 */ + 0xA4, 0xEC, 0x00, 0x00, 0xA4, 0xED, 0xA4, 0xEF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA4, 0xB7, 0xA4, 0xF2, 0xA4, 0xF3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xA4, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA4, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA1, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA2, 0xD3, 0xA2, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xA5, 0xA1, 0xA6, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA2, 0xB6, 0x00, 0x00, 0xA1, 0xC7, 0xA1, 0xC8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA9, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0xFA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA9, 0xFB, 0xA9, 0xFC, 0xA9, 0xFD, /* 0x80-0x83 */ + 0xA9, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA2, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA4, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xE0, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA2, 0xE5, 0xA2, 0xE2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xD9, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xCA, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF7, /* 0x50-0x53 */ + 0xA8, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xFB, /* 0x58-0x5B */ + 0xA8, 0xFC, 0xA8, 0xFD, 0xA8, 0xFE, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0x60-0x63 */ + 0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0x64-0x67 */ + 0xA5, 0xB8, 0xA5, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, 0xA5, 0xA4, /* 0x70-0x73 */ + 0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, 0xA5, 0xA8, /* 0x74-0x77 */ + 0xA5, 0xA9, 0xA5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xE7, 0xA1, 0xE8, 0xA1, 0xE6, 0xA1, 0xE9, /* 0x90-0x93 */ + 0xA1, 0xEA, 0xA2, 0xD5, 0xA2, 0xD8, 0xA2, 0xD6, /* 0x94-0x97 */ + 0xA2, 0xD9, 0xA2, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA1, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xA2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_22[512] = { + 0xA2, 0xA3, 0x00, 0x00, 0xA1, 0xD3, 0xA2, 0xA4, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD4, /* 0x04-0x07 */ + 0xA1, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF5, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xB3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA2, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEE, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA1, 0xF0, 0xA1, 0xC4, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA1, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xAB, 0x00, 0x00, 0xA1, 0xFC, /* 0x24-0x27 */ + 0xA1, 0xFD, 0xA1, 0xFB, 0xA1, 0xFA, 0xA1, 0xF2, /* 0x28-0x2B */ + 0xA1, 0xF3, 0x00, 0x00, 0xA2, 0xB1, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xC5, 0xA1, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA1, 0xAD, 0xA1, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD6, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xC1, 0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0xC2, 0xA1, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, 0xA1, 0xED, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF9, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF6, 0xA1, 0xF7, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA2, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD2, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA8, 0xE7, 0xA8, 0xE8, 0xA8, 0xE9, 0xA8, 0xEA, /* 0x60-0x63 */ + 0xA8, 0xEB, 0xA8, 0xEC, 0xA8, 0xED, 0xA8, 0xEE, /* 0x64-0x67 */ + 0xA8, 0xEF, 0xA8, 0xF0, 0xA8, 0xF1, 0xA8, 0xF2, /* 0x68-0x6B */ + 0xA8, 0xF3, 0xA8, 0xF4, 0xA8, 0xF5, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA9, 0xE7, 0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, /* 0x74-0x77 */ + 0xA9, 0xEB, 0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, /* 0x78-0x7B */ + 0xA9, 0xEF, 0xA9, 0xF0, 0xA9, 0xF1, 0xA9, 0xF2, /* 0x7C-0x7F */ + + 0xA9, 0xF3, 0xA9, 0xF4, 0xA9, 0xF5, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, 0xA9, 0xD0, /* 0x9C-0x9F */ + 0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, 0xA9, 0xD4, /* 0xA0-0xA3 */ + 0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, 0xA9, 0xD8, /* 0xA4-0xA7 */ + 0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, 0xA9, 0xDC, /* 0xA8-0xAB */ + 0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, 0xA9, 0xE0, /* 0xAC-0xAF */ + 0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, 0xA9, 0xE4, /* 0xB0-0xB3 */ + 0xA9, 0xE5, 0xA9, 0xE6, 0xA8, 0xCD, 0xA8, 0xCE, /* 0xB4-0xB7 */ + 0xA8, 0xCF, 0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, /* 0xB8-0xBB */ + 0xA8, 0xD3, 0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, /* 0xBC-0xBF */ + 0xA8, 0xD7, 0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, /* 0xC0-0xC3 */ + 0xA8, 0xDB, 0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, /* 0xC4-0xC7 */ + 0xA8, 0xDF, 0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, /* 0xC8-0xCB */ + 0xA8, 0xE3, 0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, /* 0xCC-0xCF */ + 0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, 0xA8, 0xD0, /* 0xD0-0xD3 */ + 0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, 0xA8, 0xD4, /* 0xD4-0xD7 */ + 0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, 0xA8, 0xD8, /* 0xD8-0xDB */ + 0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, 0xA8, 0xDC, /* 0xDC-0xDF */ + 0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, 0xA8, 0xE0, /* 0xE0-0xE3 */ + 0xA8, 0xE1, 0xA8, 0xE2, 0xA8, 0xE3, 0xA8, 0xE4, /* 0xE4-0xE7 */ + 0xA8, 0xE5, 0xA8, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_25[512] = { + 0xA6, 0xA1, 0xA6, 0xAC, 0xA6, 0xA2, 0xA6, 0xAD, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xA6, 0xA3, 0xA6, 0xC8, 0xA6, 0xC7, 0xA6, 0xAE, /* 0x0C-0x0F */ + 0xA6, 0xA4, 0xA6, 0xC2, 0xA6, 0xC1, 0xA6, 0xAF, /* 0x10-0x13 */ + 0xA6, 0xA6, 0xA6, 0xC6, 0xA6, 0xC5, 0xA6, 0xB1, /* 0x14-0x17 */ + 0xA6, 0xA5, 0xA6, 0xC4, 0xA6, 0xC3, 0xA6, 0xB0, /* 0x18-0x1B */ + 0xA6, 0xA7, 0xA6, 0xBC, 0xA6, 0xC9, 0xA6, 0xCA, /* 0x1C-0x1F */ + 0xA6, 0xB7, 0xA6, 0xCB, 0xA6, 0xCC, 0xA6, 0xB2, /* 0x20-0x23 */ + 0xA6, 0xA9, 0xA6, 0xBE, 0xA6, 0xCD, 0xA6, 0xCE, /* 0x24-0x27 */ + 0xA6, 0xB9, 0xA6, 0xCF, 0xA6, 0xD0, 0xA6, 0xB4, /* 0x28-0x2B */ + 0xA6, 0xA8, 0xA6, 0xD1, 0xA6, 0xD2, 0xA6, 0xB8, /* 0x2C-0x2F */ + 0xA6, 0xBD, 0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xB3, /* 0x30-0x33 */ + 0xA6, 0xAA, 0xA6, 0xD5, 0xA6, 0xD6, 0xA6, 0xBA, /* 0x34-0x37 */ + 0xA6, 0xBF, 0xA6, 0xD7, 0xA6, 0xD8, 0xA6, 0xB5, /* 0x38-0x3B */ + 0xA6, 0xAB, 0xA6, 0xD9, 0xA6, 0xDA, 0xA6, 0xBB, /* 0x3C-0x3F */ + 0xA6, 0xDB, 0xA6, 0xDC, 0xA6, 0xC0, 0xA6, 0xDD, /* 0x40-0x43 */ + 0xA6, 0xDE, 0xA6, 0xDF, 0xA6, 0xE0, 0xA6, 0xE1, /* 0x44-0x47 */ + 0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xE4, 0xA6, 0xB6, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xC6, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xE1, 0xA1, 0xE0, 0x00, 0x00, 0xA2, 0xC3, /* 0xA0-0xA3 */ + 0xA2, 0xC7, 0xA2, 0xC8, 0xA2, 0xCB, 0xA2, 0xCA, /* 0xA4-0xA7 */ + 0xA2, 0xC9, 0xA2, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE3, 0xA1, 0xE2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xBA, 0xA2, 0xB9, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA1, 0xE5, 0xA1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xA2, 0xB8, 0xA2, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDF, 0xA1, 0xDE, /* 0xC4-0xC7 */ + 0xA2, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDB, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDD, 0xA1, 0xDC, /* 0xCC-0xCF */ + 0xA2, 0xC4, 0xA2, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xD9, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xCF, 0xA2, 0xCE, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA2, 0xD0, 0x00, 0x00, 0xA2, 0xD1, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xCF, 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xBC, 0xA2, 0xBD, 0x00, 0x00, 0xA2, 0xC0, /* 0x60-0x63 */ + 0xA2, 0xBB, 0xA2, 0xBE, 0x00, 0x00, 0xA2, 0xBF, /* 0x64-0x67 */ + 0xA2, 0xCD, 0xA2, 0xDB, 0xA2, 0xDC, 0x00, 0x00, /* 0x68-0x6B */ + 0xA2, 0xDD, 0xA2, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */ + 0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */ + 0xA1, 0xBC, 0xA1, 0xBD, 0x00, 0x00, 0xA1, 0xEB, /* 0x10-0x13 */ + 0xA1, 0xB2, 0xA1, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xAA, 0xA1, 0xAA, 0xA2, 0xAA, 0xA3, /* 0x40-0x43 */ + 0xAA, 0xA4, 0xAA, 0xA5, 0xAA, 0xA6, 0xAA, 0xA7, /* 0x44-0x47 */ + 0xAA, 0xA8, 0xAA, 0xA9, 0xAA, 0xAA, 0xAA, 0xAB, /* 0x48-0x4B */ + 0xAA, 0xAC, 0xAA, 0xAD, 0xAA, 0xAE, 0xAA, 0xAF, /* 0x4C-0x4F */ + 0xAA, 0xB0, 0xAA, 0xB1, 0xAA, 0xB2, 0xAA, 0xB3, /* 0x50-0x53 */ + 0xAA, 0xB4, 0xAA, 0xB5, 0xAA, 0xB6, 0xAA, 0xB7, /* 0x54-0x57 */ + 0xAA, 0xB8, 0xAA, 0xB9, 0xAA, 0xBA, 0xAA, 0xBB, /* 0x58-0x5B */ + 0xAA, 0xBC, 0xAA, 0xBD, 0xAA, 0xBE, 0xAA, 0xBF, /* 0x5C-0x5F */ + 0xAA, 0xC0, 0xAA, 0xC1, 0xAA, 0xC2, 0xAA, 0xC3, /* 0x60-0x63 */ + 0xAA, 0xC4, 0xAA, 0xC5, 0xAA, 0xC6, 0xAA, 0xC7, /* 0x64-0x67 */ + 0xAA, 0xC8, 0xAA, 0xC9, 0xAA, 0xCA, 0xAA, 0xCB, /* 0x68-0x6B */ + 0xAA, 0xCC, 0xAA, 0xCD, 0xAA, 0xCE, 0xAA, 0xCF, /* 0x6C-0x6F */ + 0xAA, 0xD0, 0xAA, 0xD1, 0xAA, 0xD2, 0xAA, 0xD3, /* 0x70-0x73 */ + 0xAA, 0xD4, 0xAA, 0xD5, 0xAA, 0xD6, 0xAA, 0xD7, /* 0x74-0x77 */ + 0xAA, 0xD8, 0xAA, 0xD9, 0xAA, 0xDA, 0xAA, 0xDB, /* 0x78-0x7B */ + 0xAA, 0xDC, 0xAA, 0xDD, 0xAA, 0xDE, 0xAA, 0xDF, /* 0x7C-0x7F */ + + 0xAA, 0xE0, 0xAA, 0xE1, 0xAA, 0xE2, 0xAA, 0xE3, /* 0x80-0x83 */ + 0xAA, 0xE4, 0xAA, 0xE5, 0xAA, 0xE6, 0xAA, 0xE7, /* 0x84-0x87 */ + 0xAA, 0xE8, 0xAA, 0xE9, 0xAA, 0xEA, 0xAA, 0xEB, /* 0x88-0x8B */ + 0xAA, 0xEC, 0xAA, 0xED, 0xAA, 0xEE, 0xAA, 0xEF, /* 0x8C-0x8F */ + 0xAA, 0xF0, 0xAA, 0xF1, 0xAA, 0xF2, 0xAA, 0xF3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xAB, 0xA1, 0xAB, 0xA2, 0xAB, 0xA3, /* 0xA0-0xA3 */ + 0xAB, 0xA4, 0xAB, 0xA5, 0xAB, 0xA6, 0xAB, 0xA7, /* 0xA4-0xA7 */ + 0xAB, 0xA8, 0xAB, 0xA9, 0xAB, 0xAA, 0xAB, 0xAB, /* 0xA8-0xAB */ + 0xAB, 0xAC, 0xAB, 0xAD, 0xAB, 0xAE, 0xAB, 0xAF, /* 0xAC-0xAF */ + 0xAB, 0xB0, 0xAB, 0xB1, 0xAB, 0xB2, 0xAB, 0xB3, /* 0xB0-0xB3 */ + 0xAB, 0xB4, 0xAB, 0xB5, 0xAB, 0xB6, 0xAB, 0xB7, /* 0xB4-0xB7 */ + 0xAB, 0xB8, 0xAB, 0xB9, 0xAB, 0xBA, 0xAB, 0xBB, /* 0xB8-0xBB */ + 0xAB, 0xBC, 0xAB, 0xBD, 0xAB, 0xBE, 0xAB, 0xBF, /* 0xBC-0xBF */ + 0xAB, 0xC0, 0xAB, 0xC1, 0xAB, 0xC2, 0xAB, 0xC3, /* 0xC0-0xC3 */ + 0xAB, 0xC4, 0xAB, 0xC5, 0xAB, 0xC6, 0xAB, 0xC7, /* 0xC4-0xC7 */ + 0xAB, 0xC8, 0xAB, 0xC9, 0xAB, 0xCA, 0xAB, 0xCB, /* 0xC8-0xCB */ + 0xAB, 0xCC, 0xAB, 0xCD, 0xAB, 0xCE, 0xAB, 0xCF, /* 0xCC-0xCF */ + 0xAB, 0xD0, 0xAB, 0xD1, 0xAB, 0xD2, 0xAB, 0xD3, /* 0xD0-0xD3 */ + 0xAB, 0xD4, 0xAB, 0xD5, 0xAB, 0xD6, 0xAB, 0xD7, /* 0xD4-0xD7 */ + 0xAB, 0xD8, 0xAB, 0xD9, 0xAB, 0xDA, 0xAB, 0xDB, /* 0xD8-0xDB */ + 0xAB, 0xDC, 0xAB, 0xDD, 0xAB, 0xDE, 0xAB, 0xDF, /* 0xDC-0xDF */ + 0xAB, 0xE0, 0xAB, 0xE1, 0xAB, 0xE2, 0xAB, 0xE3, /* 0xE0-0xE3 */ + 0xAB, 0xE4, 0xAB, 0xE5, 0xAB, 0xE6, 0xAB, 0xE7, /* 0xE4-0xE7 */ + 0xAB, 0xE8, 0xAB, 0xE9, 0xAB, 0xEA, 0xAB, 0xEB, /* 0xE8-0xEB */ + 0xAB, 0xEC, 0xAB, 0xED, 0xAB, 0xEE, 0xAB, 0xEF, /* 0xEC-0xEF */ + 0xAB, 0xF0, 0xAB, 0xF1, 0xAB, 0xF2, 0xAB, 0xF3, /* 0xF0-0xF3 */ + 0xAB, 0xF4, 0xAB, 0xF5, 0xAB, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x30-0x33 */ + 0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0x34-0x37 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x38-0x3B */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x3C-0x3F */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x40-0x43 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x44-0x47 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x48-0x4B */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x4C-0x4F */ + 0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x50-0x53 */ + 0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x54-0x57 */ + 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x58-0x5B */ + 0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x5C-0x5F */ + 0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x60-0x63 */ + 0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x64-0x67 */ + 0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x68-0x6B */ + 0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x6C-0x6F */ + 0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x70-0x73 */ + 0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x74-0x77 */ + 0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x78-0x7B */ + 0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x7C-0x7F */ + + 0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x80-0x83 */ + 0xA4, 0xF4, 0xA4, 0xF5, 0xA4, 0xF6, 0xA4, 0xF7, /* 0x84-0x87 */ + 0xA4, 0xF8, 0xA4, 0xF9, 0xA4, 0xFA, 0xA4, 0xFB, /* 0x88-0x8B */ + 0xA4, 0xFC, 0xA4, 0xFD, 0xA4, 0xFE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE9, 0xEC, 0xA3, /* 0x90-0x93 */ + 0xDF, 0xB2, 0xDE, 0xCC, 0xDF, 0xBE, 0xF1, 0xE9, /* 0x94-0x97 */ + 0xF9, 0xBB, 0xCB, 0xA3, 0xEB, 0xE0, 0xDC, 0xB0, /* 0x98-0x9B */ + 0xEF, 0xCB, 0xF4, 0xB8, 0xF2, 0xA2, 0xEC, 0xD1, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, 0xA9, 0xB4, /* 0x00-0x03 */ + 0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, 0xA9, 0xB8, /* 0x04-0x07 */ + 0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, 0xA9, 0xBC, /* 0x08-0x0B */ + 0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, 0xA9, 0xC0, /* 0x0C-0x0F */ + 0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, 0xA9, 0xC4, /* 0x10-0x13 */ + 0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, 0xA9, 0xC8, /* 0x14-0x17 */ + 0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, 0xA9, 0xCC, /* 0x18-0x1B */ + 0xA2, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xEC, 0xE9, 0xEC, 0xA3, 0xDF, 0xB2, 0xDE, 0xCC, /* 0x20-0x23 */ + 0xE7, 0xE9, 0xD7, 0xBF, 0xF6, 0xD2, 0xF8, 0xA2, /* 0x24-0x27 */ + 0xCE, 0xFA, 0xE4, 0xA8, 0xEA, 0xC5, 0xFB, 0xFD, /* 0x28-0x2B */ + 0xE2, 0xA9, 0xD9, 0xCA, 0xD1, 0xD1, 0xF7, 0xCF, /* 0x2C-0x2F */ + 0xEC, 0xED, 0xF1, 0xBB, 0xEA, 0xF3, 0xDE, 0xE4, /* 0x30-0x33 */ + 0xD9, 0xA3, 0xF7, 0xE5, 0xEE, 0xAF, 0xF5, 0xE6, /* 0x34-0x37 */ + 0xD6, 0xCC, 0xD3, 0xDB, 0xFB, 0xBC, 0xF9, 0xCA, /* 0x38-0x3B */ + 0xCA, 0xF8, 0xD0, 0xEA, 0xED, 0xC0, 0xFA, 0xF0, /* 0x3C-0x3F */ + 0xF0, 0xAE, 0xFD, 0xCC, 0xED, 0xBB, 0xF2, 0xB8, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA8, 0xB1, 0xA8, 0xB2, 0xA8, 0xB3, 0xA8, 0xB4, /* 0x60-0x63 */ + 0xA8, 0xB5, 0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB8, /* 0x64-0x67 */ + 0xA8, 0xB9, 0xA8, 0xBA, 0xA8, 0xBB, 0xA8, 0xBC, /* 0x68-0x6B */ + 0xA8, 0xBD, 0xA8, 0xBE, 0xA8, 0xBF, 0xA8, 0xC0, /* 0x6C-0x6F */ + 0xA8, 0xC1, 0xA8, 0xC2, 0xA8, 0xC3, 0xA8, 0xC4, /* 0x70-0x73 */ + 0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, 0xA8, 0xC8, /* 0x74-0x77 */ + 0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, 0xA8, 0xCC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xDE, /* 0x7C-0x7F */ + + 0xEC, 0xE9, 0xEC, 0xA3, 0xDF, 0xB2, 0xDE, 0xCC, /* 0x80-0x83 */ + 0xE7, 0xE9, 0xD7, 0xBF, 0xF6, 0xD2, 0xF8, 0xA2, /* 0x84-0x87 */ + 0xCE, 0xFA, 0xE4, 0xA8, 0xEA, 0xC5, 0xFB, 0xFD, /* 0x88-0x8B */ + 0xE2, 0xA9, 0xD9, 0xCA, 0xD1, 0xD1, 0xF7, 0xCF, /* 0x8C-0x8F */ + 0xEC, 0xED, 0xF1, 0xBB, 0xEA, 0xF3, 0xDE, 0xE4, /* 0x90-0x93 */ + 0xD9, 0xA3, 0xF7, 0xE5, 0xEE, 0xAF, 0xF5, 0xE6, /* 0x94-0x97 */ + 0xD6, 0xCC, 0xDD, 0xFA, 0xD1, 0xFB, 0xD2, 0xB3, /* 0x98-0x9B */ + 0xEE, 0xEA, 0xE9, 0xD0, 0xEC, 0xD4, 0xF1, 0xBC, /* 0x9C-0x9F */ + 0xFA, 0xA3, 0xFD, 0xCC, 0xDE, 0xD0, 0xEF, 0xE1, /* 0xA0-0xA3 */ + 0xDF, 0xBE, 0xF1, 0xE9, 0xF9, 0xBB, 0xF1, 0xA7, /* 0xA4-0xA7 */ + 0xE9, 0xD3, 0xEC, 0xA2, 0xF0, 0xF3, 0xF9, 0xCA, /* 0xA8-0xAB */ + 0xCA, 0xF8, 0xD0, 0xEA, 0xED, 0xC0, 0xFA, 0xF0, /* 0xAC-0xAF */ + 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xA7, 0xC9, 0xA7, 0xCA, 0xA7, 0xCB, 0xA7, 0xCC, /* 0x80-0x83 */ + 0xA7, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xA7, 0xBA, 0xA7, 0xBB, 0xA7, 0xDC, 0xA7, 0xDD, /* 0x88-0x8B */ + 0xA7, 0xDE, 0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, /* 0x8C-0x8F */ + 0xA7, 0xD4, 0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD7, /* 0x90-0x93 */ + 0xA7, 0xD8, 0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, /* 0x94-0x97 */ + 0xA7, 0xA5, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x98-0x9B */ + 0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x9C-0x9F */ + 0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xA7, /* 0xA0-0xA3 */ + 0xA7, 0xA8, 0xA7, 0xA9, 0xA7, 0xAA, 0xA7, 0xBD, /* 0xA4-0xA7 */ + 0xA7, 0xBE, 0xA7, 0xE5, 0xA7, 0xE6, 0xA7, 0xE7, /* 0xA8-0xAB */ + 0xA7, 0xE8, 0xA7, 0xE1, 0xA7, 0xE2, 0xA7, 0xE3, /* 0xAC-0xAF */ + 0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, 0xA7, 0xC2, /* 0xB0-0xB3 */ + 0xA7, 0xC3, 0xA7, 0xC4, 0xA7, 0xC5, 0xA7, 0xC6, /* 0xB4-0xB7 */ + 0xA7, 0xC7, 0xA7, 0xC8, 0xA7, 0xCE, 0xA7, 0xCF, /* 0xB8-0xBB */ + 0xA7, 0xD0, 0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, /* 0xBC-0xBF */ + 0xA7, 0xDA, 0xA7, 0xDB, 0xA2, 0xE3, 0xA7, 0xEC, /* 0xC0-0xC3 */ + 0xA7, 0xA6, 0xA7, 0xE0, 0xA7, 0xEF, 0xA2, 0xE1, /* 0xC4-0xC7 */ + 0xA7, 0xBC, 0xA7, 0xED, 0xA7, 0xB5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xB9, /* 0xCC-0xCF */ + 0xA7, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xEB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xDF, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xA2, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xE4, /* 0xD8-0xDB */ + 0xA7, 0xEE, 0xA7, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_4E[512] = { + 0xEC, 0xE9, 0xEF, 0xCB, 0x00, 0x00, 0xF6, 0xD2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB2, /* 0x04-0x07 */ + 0xED, 0xDB, 0xDF, 0xB2, 0xDF, 0xBE, 0xF9, 0xBB, /* 0x08-0x0B */ + 0x00, 0x00, 0xDC, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF3, 0xA6, 0xDD, 0xE0, 0xE1, 0xA6, 0x00, 0x00, /* 0x14-0x17 */ + 0xCE, 0xF8, 0xDC, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xF1, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xFA, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFC, 0xAF, 0xD3, 0xA1, 0x00, 0x00, 0xF1, 0xAB, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD1, 0xD2, 0xAC, /* 0x40-0x43 */ + 0x00, 0x00, 0xCE, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0xDE, 0xBF, 0xFB, 0xBA, 0xF9, 0xB9, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD2, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xAB, 0xEB, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xCE, 0xFA, 0xCB, 0xF7, 0xE5, 0xA5, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE1, /* 0x68-0x6B */ + 0x00, 0x00, 0xD4, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE1, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE3, 0xDF, 0xAD, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xEB, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xAF, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF5, 0x00, 0x00, /* 0x84-0x87 */ + 0xE5, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC0, /* 0x88-0x8B */ + 0xEC, 0xA3, 0x00, 0x00, 0xE9, 0xCD, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xA7, 0xE9, 0xF6, 0xFB, 0xBB, 0x00, 0x00, /* 0x90-0x93 */ + 0xE7, 0xE9, 0xEF, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD0, 0xE6, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC1, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAC, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD8, 0xCC, 0xF9, 0xF1, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCE, 0xDF, 0xFA, 0xA4, 0xE6, 0xB2, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFA, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xBD, /* 0xA8-0xAB */ + 0xCC, 0xC8, 0xEF, 0xCD, 0xD5, 0xD5, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE4, 0xA7, 0xEC, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF6, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xFB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xD1, 0xCB, 0xBF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xED, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xED, 0xA8, 0xDE, 0xC2, 0xF6, 0xE2, 0xED, 0xDC, /* 0xD4-0xD7 */ + 0xDC, 0xF5, 0xE0, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xD4, 0xCE, 0x00, 0x00, 0xF4, 0xB5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDB, /* 0xE0-0xE3 */ + 0xD6, 0xB5, 0xEC, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE4, 0xE6, 0x00, 0x00, 0xF1, 0xEA, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xEC, 0xCB, 0xC0, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF2, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_4F[512] = { + 0x00, 0x00, 0xD0, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF9, 0xF2, 0xEC, 0xA5, 0xD0, 0xDF, /* 0x08-0x0B */ + 0x00, 0x00, 0xE7, 0xEA, 0xD0, 0xEB, 0xDC, 0xD1, /* 0x0C-0x0F */ + 0xDB, 0xE9, 0xFD, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xD7, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xDA, 0xE1, 0x00, 0x00, 0xD6, 0xB6, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 0xDF, 0x00, 0x00, 0xDE, 0xC3, 0x00, 0x00, /* 0x38-0x3B */ + 0xDE, 0xC4, 0xCA, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xEC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA3, 0xEE, 0xB7, /* 0x44-0x47 */ + 0xF8, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEA, 0xC8, 0xEE, 0xB8, 0xF1, 0xAC, /* 0x4C-0x4F */ + 0xF1, 0xA5, 0xE9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xF9, 0xEC, 0xEA, 0xDD, 0xD6, /* 0x58-0x5B */ + 0xED, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xF8, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xBA, /* 0x6C-0x6F */ + 0xDB, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA2, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCD, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xED, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xEB, 0xDE, 0xC5, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE3, 0xE0, 0x00, 0x00, 0xCA, 0xC9, /* 0x80-0x83 */ + 0xF2, 0xE9, 0x00, 0x00, 0xD5, 0xCE, 0x00, 0x00, /* 0x84-0x87 */ + 0xF6, 0xB6, 0x00, 0x00, 0xCE, 0xC2, 0xD6, 0xC7, /* 0x88-0x8B */ + 0x00, 0x00, 0xE3, 0xB4, 0x00, 0x00, 0xF1, 0xAD, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEA, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xC2, 0x00, 0x00, /* 0x94-0x97 */ + 0xF3, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xEA, /* 0x98-0x9B */ + 0x00, 0x00, 0xEB, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB2, 0xFD, 0xA5, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF6, 0xD5, 0xD5, 0xE2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xB5, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF5, 0xF5, 0xB5, /* 0xC0-0xC3 */ + 0xE4, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE7, 0xEB, 0xF1, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBB, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0xB5, 0x00, 0x00, 0xCC, 0xC9, /* 0xD0-0xD3 */ + 0xFA, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xD6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xDC, 0xC1, 0x00, 0x00, 0xDE, 0xC6, /* 0xDC-0xDF */ + 0xFA, 0xEF, 0xE3, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF3, 0xDC, 0xF6, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0xFC, 0x00, 0x00, 0xDB, 0xC4, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF8, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDC, 0xE4, 0x00, 0x00, 0xE5, 0xEF, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_50[512] = { + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB1, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xD6, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF3, 0xDA, 0x00, 0x00, 0xCB, 0xC1, /* 0x08-0x0B */ + 0x00, 0x00, 0xDB, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD9, 0xFA, 0xD3, 0xEE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB8, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xFD, 0xA6, 0xEB, 0xEF, 0x00, 0x00, /* 0x18-0x1B */ + 0xF4, 0xA6, 0x00, 0x00, 0xCC, 0xCA, 0xF3, 0xA8, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF3, 0xDB, 0x00, 0x00, 0xDB, 0xA7, /* 0x20-0x23 */ + 0xF6, 0xB7, 0x00, 0x00, 0xCF, 0xE6, 0xF0, 0xF2, /* 0x24-0x27 */ + 0xCB, 0xDA, 0x00, 0x00, 0xE7, 0xD2, 0xD7, 0xC3, /* 0x28-0x2B */ + 0xF6, 0xF0, 0xE8, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE7, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA3, /* 0x44-0x47 */ + 0xCC, 0xA7, 0xEA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xB6, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xFA, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAE, 0x00, 0x00, /* 0x58-0x5B */ + 0xEF, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xCB, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xF6, 0xB0, 0xEF, 0xCF, 0xE9, 0xCF, 0x00, 0x00, /* 0x74-0x77 */ + 0xF7, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCE, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xDC, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDB, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCB, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xDF, 0xA1, 0xDD, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF5, 0xCA, 0xE9, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xEC, 0xEE, 0xEE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF3, 0xF0, 0x00, 0x00, 0xDF, 0xBF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD0, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF4, 0xD2, 0xE0, 0xBA, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC0, /* 0xCC-0xCF */ + 0x00, 0x00, 0xCE, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDC, 0xD2, 0xFD, 0xEA, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xCA, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0xE9, 0x00, 0x00, 0xE3, 0xAC, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF3, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCA, 0xA4, 0x00, 0x00, 0xDB, 0xF8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0xEB, 0xF0, 0xF1, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xE2, 0x00, 0x00, 0xCC, 0xCC, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCB, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE3, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC1, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD6, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xD0, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB9, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xD3, 0x00, 0x00, /* 0x38-0x3B */ + 0xE5, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE8, 0xB4, 0xEB, 0xC3, 0x00, 0x00, 0xEA, 0xAA, /* 0x40-0x43 */ + 0xFA, 0xFC, 0xF5, 0xF6, 0xF0, 0xBC, 0xFD, 0xD4, /* 0x44-0x47 */ + 0xE0, 0xBB, 0xCE, 0xC3, 0x00, 0x00, 0xD0, 0xBA, /* 0x48-0x4B */ + 0xF7, 0xBA, 0xD8, 0xF3, 0xF7, 0xCD, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAE, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE7, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEC, 0xFD, 0x00, 0x00, 0xD2, 0xAE, /* 0x64-0x67 */ + 0xEE, 0xEF, 0xD5, 0xD7, 0xEA, 0xE4, 0xF8, 0xA2, /* 0x68-0x6B */ + 0xCD, 0xEB, 0xD7, 0xBF, 0xFB, 0xB1, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCD, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xDC, 0xB2, 0xD0, 0xEC, 0xCE, 0xFD, /* 0x74-0x77 */ + 0xEE, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCC, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD0, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF7, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFC, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xEE, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xD8, 0xF4, 0x00, 0x00, 0xE9, 0xB7, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD9, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xF1, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD4, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA7, 0xD5, 0xD2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD6, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF4, 0xA2, 0x00, 0x00, 0xF1, 0xD7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD5, 0xD8, 0x00, 0x00, 0xF0, 0xBD, /* 0xC8-0xCB */ + 0xD7, 0xD0, 0xD4, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD7, 0xCF, 0xEB, 0xEA, 0xFD, 0xEB, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xDB, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xFC, 0xC5, 0xCB, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD5, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF4, 0xC8, 0xE8, 0xEA, 0xF5, 0xF3, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF9, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xD3, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD3, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xC2, 0xEF, 0xB7, /* 0x04-0x07 */ + 0xE7, 0xD4, 0x00, 0x00, 0xCA, 0xCA, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFB, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xFA, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xAA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xF4, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xF7, 0xF7, 0xDC, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xD7, 0xD7, 0xDF, 0xA2, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xBE, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD3, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA4, 0xE1, 0xEC, /* 0x34-0x37 */ + 0xCF, 0xE7, 0xF3, 0xCB, 0xED, 0xA9, 0xCA, 0xBE, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xEF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCE, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xFB, 0xD0, 0xBB, /* 0x48-0x4B */ + 0xD5, 0xB7, 0xEE, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF4, 0xA8, 0x00, 0x00, 0xDC, 0xF8, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA7, /* 0x58-0x5B */ + 0x00, 0x00, 0xDA, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE0, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xED, 0xA5, 0xEE, 0xF2, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF9, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xDC, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF3, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF8, 0xF2, 0x00, 0x00, 0xF4, 0xF9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF1, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xBC, /* 0x84-0x87 */ + 0xDB, 0xF9, 0xD7, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xCB, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF0, 0xA5, 0xCB, 0xFD, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF4, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xED, /* 0x9C-0x9F */ + 0xCA, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xAB, /* 0xA0-0xA3 */ + 0xD0, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xF0, 0xBE, 0xD2, 0xBD, 0xCC, 0xA4, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCC, 0xCD, 0x00, 0x00, 0xDA, 0xFA, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xCF, 0x00, 0x00, 0xE9, 0xB8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD8, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCC, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xCD, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xD4, 0xD1, 0xE9, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCA, 0xEB, 0xD9, 0xE2, 0x00, 0x00, 0xFD, 0xB2, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE3, 0xAD, 0xD6, 0xCC, 0xD9, 0xB4, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA7, 0xEE, 0xD3, /* 0xE0-0xE3 */ + 0xD0, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB3, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xD5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xCF, 0xE8, 0x00, 0x00, 0xED, 0xC3, 0xD0, 0xB2, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xFE, 0xDA, 0xA8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xF8, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xFD, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xF8, 0xD1, 0x00, 0x00, 0xF8, 0xD2, /* 0x0C-0x0F */ + 0xDC, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xDD, 0xE2, 0xFB, 0xF9, 0xDD, 0xC1, /* 0x14-0x17 */ + 0x00, 0x00, 0xE3, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xED, 0xDD, 0xCE, 0xC4, 0x00, 0x00, 0xCB, 0xA1, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDD, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xF9, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xFB, /* 0x3C-0x3F */ + 0xCF, 0xA1, 0xE4, 0xA8, 0x00, 0x00, 0xF4, 0xB6, /* 0x40-0x43 */ + 0xEC, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAE, /* 0x44-0x47 */ + 0xE7, 0xED, 0xFD, 0xC1, 0xDA, 0xE2, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD8, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDD, 0xE4, 0xF0, 0xEF, 0xF6, 0xF1, /* 0x50-0x53 */ + 0xFA, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF5, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xCF, 0x00, 0x00, /* 0x58-0x5B */ + 0xDC, 0xD4, 0x00, 0x00, 0xDC, 0xA6, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEF, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCF, 0x00, 0x00, /* 0x64-0x67 */ + 0xE0, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD6, /* 0x6C-0x6F */ + 0xEC, 0xD4, 0xEA, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCA, 0xBF, 0xD5, 0xB0, 0x00, 0x00, 0xCF, 0xE9, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF1, 0xED, 0x00, 0x00, 0xCC, 0xCF, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE4, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xED, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD7, 0xD8, 0x00, 0x00, 0xFD, 0xA7, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xAB, /* 0x9C-0x9F */ + 0xF6, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCF, 0xF0, 0xF9, 0xBD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE6, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xDB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE9, 0xD1, 0xF3, 0xA9, 0xD0, 0xE0, 0xE9, 0xD2, /* 0xC8-0xCB */ + 0x00, 0x00, 0xDA, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE2, 0xD2, 0x00, 0x00, 0xF6, 0xA2, 0xE1, 0xF4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xE4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE7, 0xD5, 0xF5, 0xBF, 0xCF, 0xA2, /* 0xE0-0xE3 */ + 0xCD, 0xAF, 0xCF, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCD, 0xB0, 0xF1, 0xFE, 0xD0, 0xA3, /* 0xE8-0xEB */ + 0xE1, 0xAF, 0xF8, 0xA3, 0x00, 0x00, 0xCA, 0xA6, /* 0xEC-0xEF */ + 0xF7, 0xBB, 0xF2, 0xEA, 0xDE, 0xC8, 0xE9, 0xD3, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDE, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xDE, /* 0x00-0x03 */ + 0xCA, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF9, 0xEA, 0xD1, 0xCE, 0xEE, 0xD4, 0x00, 0x00, /* 0x08-0x0B */ + 0xD4, 0xD2, 0xD9, 0xA3, 0xFD, 0xA8, 0xD7, 0xD9, /* 0x0C-0x0F */ + 0xF7, 0xCE, 0xFA, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD6, /* 0x18-0x1B */ + 0x00, 0x00, 0xD7, 0xF0, 0x00, 0x00, 0xEB, 0xE1, /* 0x1C-0x1F */ + 0xF8, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xFA, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xDD, 0xC3, 0x00, 0x00, 0xF9, 0xDF, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xEF, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFD, 0xE5, 0xF6, 0xA3, 0x00, 0x00, 0xD9, 0xFC, /* 0x38-0x3B */ + 0xFD, 0xA9, 0x00, 0x00, 0xE7, 0xEE, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE5, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xEF, 0xD0, 0x00, 0x00, 0xCD, 0xB1, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xF7, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF1, 0xB2, 0x00, 0x00, 0xF1, 0xB1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCD, 0xB2, 0x00, 0x00, 0xDA, 0xAB, /* 0x70-0x73 */ + 0x00, 0x00, 0xCA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE2, /* 0x78-0x7B */ + 0xFB, 0xBC, 0xD9, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEE, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD3, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xFB, 0xFA, 0x00, 0x00, 0xCF, 0xA4, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDC, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF6, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xED, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA1, /* 0xA8-0xAB */ + 0xCE, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF9, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEC, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE4, 0xEE, 0xF9, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xFB, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xF9, 0xEB, 0xEE, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEA, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xCA, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF4, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCD, 0xD6, 0xFC, 0xF6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD4, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF8, 0xA6, 0x00, 0x00, 0xDE, 0xCA, 0xF2, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDA, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xD8, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE6, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF3, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE4, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF6, 0xF2, 0x00, 0x00, 0xDF, 0xC2, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFD, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xBA, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xE1, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF0, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCB, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0xBC, 0x00, 0x00, 0xF4, 0xCA, 0xD4, 0xFA, /* 0x84-0x87 */ + 0x00, 0x00, 0xFD, 0xAA, 0xF9, 0xE2, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xF4, 0xB7, 0xFD, 0xC2, 0xFC, 0xB0, 0x00, 0x00, /* 0x98-0x9B */ + 0xFD, 0xEC, 0xCA, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xBD, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0xE7, 0xDF, 0xC3, 0xD1, 0xD2, /* 0xA8-0xAB */ + 0xCE, 0xE2, 0x00, 0x00, 0xD3, 0xA4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xFD, 0xAB, 0x00, 0x00, 0xDF, 0xE0, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF2, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF0, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD0, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAA, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xCB, /* 0xE0-0xE3 */ + 0xF6, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xF5, 0xF1, 0xB3, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xA3, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCA, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xCF, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC4, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB0, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBF, 0x00, 0x00, /* 0x30-0x33 */ + 0xF6, 0xA4, 0x00, 0x00, 0xE3, 0xB6, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD0, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xED, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xDD, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF7, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE6, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xDE, 0xAD, 0x00, 0x00, 0xFA, 0xBF, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE5, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xED, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xA5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xFD, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF5, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF6, 0xDE, 0xCC, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDE, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEC, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xCD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD6, 0xB7, 0xCD, 0xB3, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_57[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD5, /* 0x00-0x03 */ + 0xE5, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCF, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD0, /* 0x08-0x0B */ + 0x00, 0x00, 0xEA, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xAE, 0xEA, 0xAD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF1, 0x00, 0x00, /* 0x14-0x17 */ + 0xD3, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xCF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xEE, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD0, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF0, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF2, 0xA3, 0x00, 0x00, 0xF7, 0xF8, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA9, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD3, 0xBB, 0xCA, 0xEC, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF1, 0xA6, 0xCB, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF7, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xCD, 0xDE, 0x00, 0x00, 0xF7, 0xA4, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC0, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDD, 0x00, 0x00, /* 0x6C-0x6F */ + 0xCC, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xCF, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF6, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF7, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD3, 0xDC, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xFE, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA7, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEB, 0xD9, 0x00, 0x00, 0xCF, 0xA7, 0xEA, 0xAF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB9, /* 0xC4-0xC7 */ + 0xF1, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD8, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF2, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB4, /* 0xDC-0xDF */ + 0xDC, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF3, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDB, 0xC6, 0xD0, 0xF1, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD0, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xCF, 0xDC, 0x00, 0x00, 0xD3, 0xD1, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xCC, 0xB1, 0xF7, 0xD8, 0x00, 0x00, /* 0x04-0x07 */ + 0xCB, 0xA8, 0xEB, 0xBC, 0xE4, 0xBE, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDC, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xDC, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xF0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC0, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xED, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xEB, /* 0x2C-0x2F */ + 0xE5, 0xE8, 0xDC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xED, 0xDE, 0xD3, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD4, 0xE7, 0xAB, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xC3, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF7, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF3, /* 0x54-0x57 */ + 0xD3, 0xD2, 0x00, 0x00, 0xF5, 0xC0, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xDD, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xEE, 0xF3, 0xE7, 0xF1, 0x00, 0x00, /* 0x60-0x63 */ + 0xFD, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF2, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF3, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xEE, 0xF4, 0x00, 0x00, 0xE2, 0xD3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD1, /* 0x80-0x83 */ + 0x00, 0x00, 0xDF, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE9, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD7, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF5, 0xCD, 0x00, 0x00, 0xF1, 0xF2, 0xFA, 0xC7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xD9, 0xF8, 0xD4, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE5, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xC5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF2, 0xED, 0xDF, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDB, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE8, 0xB5, 0x00, 0x00, 0xD3, 0xA6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB5, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF9, 0xC9, 0x00, 0x00, 0xE4, 0xE2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xFB, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD7, 0xA4, 0xCE, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD5, 0xD6, 0xE6, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE5, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xCD, /* 0xE8-0xEB */ + 0xEC, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE0, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEC, 0xEC, 0xFB, 0xBE, 0xDF, 0xEB, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBE, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD0, 0xF3, 0xE0, 0xAA, 0xE8, 0xE2, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0xD4, 0xD2, 0xFD, 0x00, 0x00, /* 0x18-0x1B */ + 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDE, /* 0x24-0x27 */ + 0x00, 0x00, 0xF4, 0xB8, 0xF7, 0xBC, 0xDC, 0xFD, /* 0x28-0x2B */ + 0x00, 0x00, 0xE8, 0xEC, 0xE4, 0xE7, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xA8, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF4, /* 0x44-0x47 */ + 0xD2, 0xAF, 0xDC, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA5, 0xF1, 0xB4, /* 0x4C-0x4F */ + 0xFC, 0xB1, 0xCC, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xDD, 0xC6, 0xFA, 0xD1, 0x00, 0x00, 0xF7, 0xDF, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA8, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEE, 0xF5, 0x00, 0x00, 0xDE, 0xCE, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF3, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xAC, 0xEB, 0xC4, /* 0x68-0x6B */ + 0xED, 0xE1, 0xE0, 0xAB, 0xDD, 0xC7, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB3, /* 0x70-0x73 */ + 0xD2, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCA, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xFB, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xFD, 0xDD, 0xE5, /* 0x80-0x83 */ + 0xD8, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xED, 0xD0, 0xD2, /* 0x94-0x97 */ + 0x00, 0x00, 0xD9, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF6, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xDB, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF7, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD8, 0xD9, 0x00, 0x00, 0xF4, 0xA3, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDD, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0xAB, 0x00, 0x00, 0xE3, 0xB7, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xEE, 0xBB, 0xCD, 0xB4, 0x00, 0x00, 0xE0, 0xF3, /* 0xD0-0xD3 */ + 0xEA, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEC, 0xF5, 0xE8, 0xEE, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCB, 0xA9, 0xF1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCD, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEC, 0xA9, 0x00, 0x00, 0xF2, 0xEB, 0x00, 0x00, /* 0xE8-0xEB */ + 0xFD, 0xEF, 0x00, 0x00, 0xF9, 0xF3, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE6, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xAC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x00, 0x00, 0xEA, 0xCE, 0x00, 0x00, 0xE8, 0xDF, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD2, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF4, /* 0x18-0x1B */ + 0xD1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC2, /* 0x1C-0x1F */ + 0xE3, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE4, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xD8, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA5, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xF3, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD7, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE8, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xE6, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE6, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xFE, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xDA, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAC, 0xEA, 0xB0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCA, 0xAA, 0xE1, 0xF9, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF2, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xFA, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xF4, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xD2, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFB, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xFD, 0xF0, 0x00, 0x00, 0xE0, 0xBD, /* 0x08-0x0B */ + 0xCE, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC6, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDF, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xED, 0xAD, 0xFA, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCD, 0xEE, 0xED, 0xA6, 0x00, 0x00, 0xED, 0xAE, /* 0x54-0x57 */ + 0xF0, 0xED, 0x00, 0x00, 0xDD, 0xA1, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xAF, 0xFC, 0xF8, 0x00, 0x00, 0xD8, 0xEB, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF9, /* 0x60-0x63 */ + 0xCD, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFA, 0xA9, 0x00, 0x00, 0xE1, 0xDD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0xD5, 0xED, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xDD, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xF9, 0xCA, 0x00, 0x00, 0xEA, 0xE8, 0x00, 0x00, /* 0x78-0x7B */ + 0xE5, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xEB, 0x00, 0x00, 0xE9, 0xD4, /* 0x84-0x87 */ + 0xE1, 0xFA, 0xE4, 0xCC, 0x00, 0x00, 0xE1, 0xE4, /* 0x88-0x8B */ + 0xE8, 0xC7, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xDB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xD5, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xB5, 0xFC, 0xF3, 0xF0, 0xF3, /* 0x94-0x97 */ + 0xCE, 0xAF, 0xF1, 0xB5, 0xEF, 0xD2, 0xE8, 0xC8, /* 0x98-0x9B */ + 0xEB, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD4, 0xE0, 0xBE, /* 0xA0-0xA3 */ + 0xE3, 0xF8, 0xEA, 0xE9, 0xFC, 0xB2, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0xF4, 0x00, 0x00, 0xCF, 0xE0, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAA, /* 0xB0-0xB3 */ + 0xE6, 0xC3, 0xE1, 0xB2, 0xCA, 0xAB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE3, 0xE4, 0xE9, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD6, /* 0xBC-0xBF */ + 0xF3, 0xF2, 0x00, 0x00, 0xEE, 0xD6, 0xEA, 0xB2, /* 0xC0-0xC3 */ + 0xD0, 0xF6, 0xEC, 0xD9, 0xDA, 0xCB, 0xCF, 0xA8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDD, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD8, 0xDB, 0x00, 0x00, 0xF9, 0xCE, 0xE9, 0xD5, /* 0xD0-0xD3 */ + 0xE3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xBC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xAC, 0xF3, 0xCC, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCD, 0xFB, 0xF6, 0xD6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE7, 0xF5, 0xE8, 0xEF, 0xE3, 0xF9, 0xD2, 0xBB, /* 0xE4-0xE7 */ + 0xF3, 0xF3, 0xE3, 0xFB, 0x00, 0x00, 0xDE, 0xD0, /* 0xE8-0xEB */ + 0xCE, 0xB0, 0x00, 0x00, 0xD6, 0xF7, 0xF1, 0xD9, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF5, 0xC1, 0xDC, 0xC4, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xBB, 0x00, 0x00, 0xDE, 0xD1, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0xDC, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xDE, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE2, /* 0x04-0x07 */ + 0xEE, 0xF6, 0xEA, 0xCF, 0xF0, 0xEE, 0xE3, 0xFC, /* 0x08-0x0B */ + 0x00, 0x00, 0xD3, 0xDF, 0xD3, 0xF4, 0xE1, 0xB3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE1, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD3, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDF, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xDB, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF6, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 0xB9, 0xEB, 0xC5, 0xF4, 0xA9, 0xCD, 0xB6, /* 0x38-0x3B */ + 0xD2, 0xF9, 0x00, 0x00, 0xDA, 0xAD, 0xD2, 0xE3, /* 0x3C-0x3F */ + 0xCF, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCB, 0xDC, 0xCC, 0xFA, 0x00, 0x00, /* 0x44-0x47 */ + 0xCF, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA9, /* 0x48-0x4B */ + 0x00, 0x00, 0xE3, 0xBB, 0xE3, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE0, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xEE, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB3, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD3, 0xF5, 0x00, 0x00, 0xD7, 0xA6, 0x00, 0x00, /* 0x60-0x63 */ + 0xF6, 0xB5, 0xD7, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xEA, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDF, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xFD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD0, 0xF7, 0xED, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCB, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE4, 0xDB, 0x00, 0x00, 0xE1, 0xFB, /* 0xA8-0xAB */ + 0xCB, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD3, 0xE0, 0x00, 0x00, 0xE4, 0xBF, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xFB, 0xC0, 0x00, 0x00, 0xDA, 0xBE, /* 0xB4-0xB7 */ + 0xE4, 0xCD, 0x00, 0x00, 0xD6, 0xB9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xC0, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE1, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF6, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDF, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE4, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE7, /* 0xEC-0xEF */ + 0xDC, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xFA, 0xD6, 0x00, 0x00, 0xD3, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xDA, /* 0xF8-0xFB */ + 0x00, 0x00, 0xFA, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xFD, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xD5, 0xCF, 0xD0, 0xF8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xCD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF5, 0xCB, 0x00, 0x00, 0xE4, 0xF0, 0xCB, 0xAB, /* 0x14-0x17 */ + 0x00, 0x00, 0xD7, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xFE, /* 0x24-0x27 */ + 0x00, 0x00, 0xDD, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAE, /* 0x48-0x4B */ + 0xCA, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD5, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xA9, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF7, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xD4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xE4, 0x00, 0x00, 0xE8, 0xF2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF5, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE7, 0xAE, 0x00, 0x00, 0xD6, 0xBA, 0x00, 0x00, /* 0xB8-0xBB */ + 0xDF, 0xEC, 0xE4, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE8, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF4, 0xB9, 0xF1, 0xB6, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE2, 0xDE, 0xE1, 0xB5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xCD, 0xEF, 0xF1, 0xA7, 0xCE, 0xE5, /* 0xE4-0xE7 */ + 0xCB, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE3, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD0, 0xF9, 0xEC, 0xAB, 0xDE, 0xD3, /* 0xF0-0xF3 */ + 0xF7, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xF5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xDE, 0xCB, 0xEE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBC, 0xF8, 0xD6, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xEE, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xFD, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF7, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDE, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF2, 0xED, 0x00, 0x00, 0xDB, 0xD9, /* 0x18-0x1B */ + 0x00, 0x00, 0xF0, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE1, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xD4, /* 0x28-0x2B */ + 0x00, 0x00, 0xE0, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE3, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE1, 0x00, 0x00, /* 0x34-0x37 */ + 0xDF, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xD9, 0xB6, 0x00, 0x00, 0xFD, 0xAC, /* 0x3C-0x3F */ + 0xEF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE4, 0xC1, 0xF8, 0xEB, 0x00, 0x00, 0xDB, 0xAC, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xFC, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xD8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xBA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDB, 0xDF, 0xD3, 0xD3, 0xF8, 0xC7, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCE, 0xF8, 0xC1, /* 0x70-0x73 */ + 0xD2, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB4, /* 0x74-0x77 */ + 0xFA, 0xB9, 0xCA, 0xCF, 0x00, 0x00, 0xFC, 0xB3, /* 0x78-0x7B */ + 0xEA, 0xEA, 0xEA, 0xEB, 0xD0, 0xFA, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xED, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE7, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC9, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xED, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEE, 0xBC, 0x00, 0x00, 0xEF, 0xC1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD2, 0x00, 0x00, /* 0x98-0x9B */ + 0xDD, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDF, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF8, 0xF1, 0xA8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xB7, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEF, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0xDD, 0xDF, 0xEE, 0xCB, 0xAC, /* 0xB4-0xB7 */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xEC, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xCB, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xF9, 0xBF, 0xD6, 0xAF, 0xD5, 0xC6, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xCF, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xB7, 0xEE, 0xF8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD9, /* 0xDC-0xDF */ + 0xF3, 0xDF, 0x00, 0x00, 0xF8, 0xC8, 0xCE, 0xC6, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD5, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE6, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC5, 0xEF, 0xD5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xEF, 0xFC, 0xDF, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0xDC, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD2, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE3, 0xBD, 0x00, 0x00, 0xCF, 0xE1, /* 0x10-0x13 */ + 0xF0, 0xC0, 0xEC, 0xDA, 0x00, 0x00, 0xDD, 0xD7, /* 0x14-0x17 */ + 0xFB, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xAC, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA9, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xD7, 0xFB, 0xC1, /* 0x24-0x27 */ + 0x00, 0x00, 0xD2, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE5, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xED, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0xF9, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0xA5, 0x00, 0x00, 0xCB, 0xAE, 0x00, 0x00, /* 0x48-0x4B */ + 0xDA, 0xAF, 0x00, 0x00, 0xD8, 0xB6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA7, 0xFB, 0xB2, /* 0x54-0x57 */ + 0x00, 0x00, 0xFD, 0xC4, 0x00, 0x00, 0xEC, 0xAD, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xA1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE9, 0xE9, 0xEE, /* 0x64-0x67 */ + 0x00, 0x00, 0xF3, 0xF4, 0xF8, 0xF3, 0xF0, 0xC1, /* 0x68-0x6B */ + 0xDE, 0xAF, 0xF8, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF3, 0xE0, 0xE7, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAD, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF9, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD8, /* 0x7C-0x7F */ + + 0xE8, 0xD9, 0xEF, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xE2, 0x00, 0x00, 0xE2, 0xDF, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE0, 0xD7, 0xC8, /* 0x88-0x8B */ + 0xFD, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDF, 0xEF, 0xCC, 0xD3, 0xD3, 0xF9, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF0, /* 0x94-0x97 */ + 0xDB, 0xC7, 0xDE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xF4, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD5, 0xD0, 0xE5, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFC, 0xC7, 0xDC, 0xD6, 0xE2, 0xE0, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB0, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF3, 0xA3, 0x00, 0x00, 0xD3, 0xEC, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF4, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xFD, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF9, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xD0, 0xFB, 0xEC, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xBC, 0xF2, 0xA4, /* 0xD4-0xD7 */ + 0xD8, 0xCE, 0xD8, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF5, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xD2, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xFB, 0xEC, 0x00, 0x00, 0xDD, 0xC8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE8, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xC1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD7, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xD6, 0xBB, 0xDE, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF7, 0xBD, 0xEC, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD0, 0xE1, 0x00, 0x00, 0xE0, 0xF5, /* 0x24-0x27 */ + 0xEA, 0xB3, 0x00, 0x00, 0xCE, 0xD6, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xA5, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEC, 0xF6, 0xE2, 0xE1, 0xE3, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFC, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCD, 0xF0, 0x00, 0x00, 0xF9, 0xF6, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xDF, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xCE, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE1, 0xED, 0xB0, /* 0x60-0x63 */ + 0xFD, 0xD1, 0xF6, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF9, 0xCF, 0xEB, 0xDA, 0xCA, 0xC1, 0x00, 0x00, /* 0x68-0x6B */ + 0xD2, 0xB8, 0xCD, 0xF1, 0x00, 0x00, 0xE3, 0xD3, /* 0x6C-0x6F */ + 0xFD, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE3, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xF0, 0xAA, 0xF9, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xFC, 0xE2, 0x00, 0x00, 0xF8, 0xA7, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE5, 0xEE, 0xF9, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF6, /* 0x9C-0x9F */ + 0xEA, 0xED, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xB4, /* 0xA0-0xA3 */ + 0xF5, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF0, 0xF5, 0x00, 0x00, 0xDD, 0xE8, 0xD3, 0xED, /* 0xB0-0xB3 */ + 0xF5, 0xFC, 0x00, 0x00, 0xDA, 0xBF, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCC, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD3, 0xFA, 0xF4, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEF, 0xD7, 0x00, 0x00, 0xD4, 0xC3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFB, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xED, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE0, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xEE, /* 0xDC-0xDF */ + 0xFB, 0xB3, 0xE4, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF6, 0xE7, 0xD2, 0xDD, 0x00, 0x00, 0xDF, 0xCC, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xC9, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE5, 0xA9, 0xE0, 0xF6, 0xF6, 0xB3, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_61[512] = { + 0x00, 0x00, 0xE1, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF0, 0x00, 0x00, /* 0x04-0x07 */ + 0xEA, 0xEF, 0xEA, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xDA, 0xC0, 0xF8, 0xB4, 0xEB, 0xF2, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE4, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xD7, 0xE4, 0xF1, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xEF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xFC, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xF3, 0xE1, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xC4, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xE3, 0xE5, 0x00, 0x00, 0xCB, 0xC5, 0xEA, 0xB4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xBD, 0x00, 0x00, /* 0x40-0x43 */ + 0xD7, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xDB, /* 0x44-0x47 */ + 0xED, 0xB1, 0x00, 0x00, 0xCC, 0xC3, 0xF7, 0xBE, /* 0x48-0x4B */ + 0xFC, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xF4, /* 0x50-0x53 */ + 0x00, 0x00, 0xD9, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF3, 0xD3, 0xF3, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF7, 0xE4, 0x00, 0x00, 0xF7, 0xD1, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB7, 0xCE, 0xB1, /* 0x60-0x63 */ + 0xCA, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xB4, /* 0x64-0x67 */ + 0xCB, 0xC6, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xF6, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE7, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEA, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD4, 0xCB, 0xAF, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF4, 0xAA, 0xE9, 0xAF, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xF5, 0xC3, 0xE9, 0xD8, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE9, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF3, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD5, 0xFB, 0xDE, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xF4, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xFD, 0xF3, 0xFD, 0xF2, 0xF7, 0xA6, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xDD, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD3, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCC, 0xA8, 0x00, 0x00, 0xDA, 0xC1, /* 0xA8-0xAB */ + 0xCC, 0xD5, 0x00, 0x00, 0xD9, 0xE4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xBC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD0, /* 0xC4-0xC7 */ + 0xFA, 0xAB, 0xEB, 0xEB, 0xE7, 0xF8, 0xD9, 0xE5, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xD7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA4, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xFB, 0xFC, 0xE3, /* 0xF4-0xF7 */ + 0xFA, 0xD8, 0x00, 0x00, 0xF3, 0xD5, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCF, 0xAB, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0xD5, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD4, /* 0x04-0x07 */ + 0xCD, 0xFC, 0x00, 0x00, 0xD9, 0xE6, 0x00, 0x00, /* 0x08-0x0B */ + 0xE2, 0xF9, 0xE2, 0xA1, 0xEB, 0xD4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE0, 0xF7, 0xE4, 0xB2, 0xCC, 0xFC, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xE4, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xAB, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xBD, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB8, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xC0, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEE, 0xFA, 0xFD, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD3, 0xE3, 0x00, 0x00, 0xFB, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE8, 0xDB, 0xAE, /* 0x3C-0x3F */ + 0xE1, 0xB6, 0xF8, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xBF, /* 0x44-0x47 */ + 0xFB, 0xC3, 0xDD, 0xEA, 0x00, 0x00, 0xE2, 0xA2, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE8, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF6, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xCA, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xD0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xA6, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDD, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAF, /* 0x7C-0x7F */ + + 0xD0, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF4, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xCC, 0xBC, 0xF7, 0xEA, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE5, 0xE4, 0xDF, 0xF1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xE1, 0x00, 0x00, 0xF9, 0xF7, /* 0x94-0x97 */ + 0xEF, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD8, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF8, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xEE, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD8, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE4, 0xE3, 0xF5, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD9, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE7, /* 0xC4-0xC7 */ + 0xD2, 0xB9, 0xD5, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDA, 0xE5, 0xDA, 0xD0, 0x00, 0x00, 0xD1, 0xD9, /* 0xCC-0xCF */ + 0xCE, 0xD8, 0x00, 0x00, 0xCB, 0xDE, 0xF4, 0xAC, /* 0xD0-0xD3 */ + 0xDA, 0xFB, 0x00, 0x00, 0xF6, 0xE9, 0xE8, 0xF3, /* 0xD4-0xD7 */ + 0xCF, 0xAC, 0xF0, 0xF0, 0x00, 0x00, 0xF4, 0xFD, /* 0xD8-0xDB */ + 0xDB, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCE, 0xC0, 0xE3, 0xD4, 0xD1, 0xCF, 0xF1, 0xF5, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCD, 0xF2, 0x00, 0x00, 0xCF, 0xEB, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xB8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA6, 0xD1, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x00, 0x00, 0xF2, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA6, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD3, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xD8, 0xE6, 0xC9, /* 0x38-0x3B */ + 0x00, 0x00, 0xD8, 0xB8, 0xFA, 0xF3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF3, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF8, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF3, /* 0x4C-0x4F */ + 0xE6, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF8, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE9, /* 0x64-0x67 */ + 0xDE, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xDF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEC, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDF, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF4, 0xD2, 0xBA, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE2, 0xA3, 0xD3, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xED, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC9, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFA, 0x00, 0x00, /* 0x94-0x97 */ + 0xCF, 0xDE, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD5, 0xD3, 0xF3, 0xF5, 0xF7, 0xAE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEF, 0xC8, 0x00, 0x00, 0xCD, 0xF3, /* 0xA4-0xA7 */ + 0xF5, 0xCF, 0xE5, 0xF3, 0xF0, 0xC2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCA, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xF1, 0x00, 0x00, 0xD0, 0xA6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xDA, /* 0xCC-0xCF */ + 0xF0, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xE7, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC0, 0xFC, 0xB5, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE4, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCC, 0xA9, 0xFD, 0xC6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEA, 0xB5, 0x00, 0x00, 0xE5, 0xAA, 0xDF, 0xBA, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0xDF, 0x00, 0x00, 0xDA, 0xD1, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE1, 0xB8, 0x00, 0x00, 0xE8, 0xF4, 0xD3, 0xFD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xE2, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xCA, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xDA, 0xE6, 0xF7, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCD, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB6, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xEE, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xF5, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD8, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA7, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xD9, 0xB8, 0xD9, 0xB9, 0xEF, 0xC9, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD6, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF7, 0xCB, 0xDF, 0xAE, 0xE8, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB5, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF4, 0xCC, 0xDA, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE8, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF7, 0xEB, 0xF5, 0xC9, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF3, 0xBC, 0x00, 0x00, 0xDA, 0xD2, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB5, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE8, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD6, 0xCF, 0xF4, 0xBA, 0x00, 0x00, 0xF7, 0xC9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xAA, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF0, 0xC3, 0xCC, 0xD6, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xD3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD3, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDB, 0xFB, 0x00, 0x00, 0xCB, 0xE0, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD3, 0xE4, 0xF6, 0xF7, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD5, 0xBA, 0xF3, 0xCD, 0xCB, 0xE1, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xEB, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xAD, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xFC, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xEC, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF6, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xDA, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF7, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE5, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xFD, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xE6, 0xFC, 0xAB, /* 0x28-0x2B */ + 0xD5, 0xBB, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA8, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xA5, 0xCD, 0xB9, /* 0x34-0x37 */ + 0xEA, 0xF2, 0xCB, 0xC7, 0x00, 0x00, 0xCD, 0xF4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAF, 0xEF, 0xD9, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCD, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xFC, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xDF, 0xF3, 0xCE, 0xE7, 0xDA, 0xC2, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCF, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF9, 0xF8, 0xA8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xE2, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xF2, 0xDF, 0xA4, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC4, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xCC, 0xD7, 0xE5, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xBB, 0x00, 0x00, /* 0x70-0x73 */ + 0xEF, 0xDA, 0xEE, 0xD8, 0x00, 0x00, 0xDD, 0xA7, /* 0x74-0x77 */ + 0xE2, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC0, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xB0, 0xF8, 0xCA, /* 0x80-0x83 */ + 0x00, 0x00, 0xFC, 0xFA, 0x00, 0x00, 0xD9, 0xFE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xDE, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDD, 0xEC, 0xDA, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE0, /* 0x94-0x97 */ + 0x00, 0x00, 0xD6, 0xF9, 0x00, 0x00, 0xCD, 0xD7, /* 0x98-0x9B */ + 0xDE, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF8, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD0, 0xC5, 0xF4, 0xAE, 0x00, 0x00, 0xDD, 0xA8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xC5, /* 0xA8-0xAB */ + 0xF3, 0xD6, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xD9, /* 0xAC-0xAF */ + 0xE3, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA8, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDB, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE5, 0xDA, 0xE3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDB, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD5, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC1, /* 0xC8-0xCB */ + 0xEF, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE9, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFD, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD9, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEC, 0xED, 0xD3, 0xA9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF2, 0xA9, 0xF0, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE2, 0xE2, 0xE9, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xF9, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE9, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xDA, 0xDA, 0xC3, /* 0xF8-0xFB */ + 0xDA, 0xC4, 0xD4, 0xC5, 0x00, 0x00, 0xE7, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE0, 0xE3, 0xB0, /* 0x04-0x07 */ + 0x00, 0x00, 0xDB, 0xB2, 0xFB, 0xC4, 0x00, 0x00, /* 0x08-0x0B */ + 0xF3, 0xE3, 0x00, 0x00, 0xD9, 0xA5, 0xFB, 0xE7, /* 0x0C-0x0F */ + 0xDD, 0xCB, 0xD0, 0xD4, 0x00, 0x00, 0xE6, 0xB6, /* 0x10-0x13 */ + 0xE0, 0xAE, 0xFD, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB5, 0xE0, 0xF8, /* 0x1C-0x1F */ + 0xE7, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF5, 0xF0, 0x00, 0x00, 0xD8, 0xDC, /* 0x24-0x27 */ + 0xED, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE1, 0xB9, 0x00, 0x00, 0xE3, 0xC0, /* 0x2C-0x2F */ + 0xF9, 0xC0, 0xE9, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD9, 0xDB, 0x00, 0x00, 0xF3, 0xE4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB6, 0xE4, 0xE9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xF0, 0xC5, 0xE3, 0xC1, 0xFC, 0xCC, /* 0x40-0x43 */ + 0xFC, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF2, 0xCB, 0x00, 0x00, 0xF2, 0xCC, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCF, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF1, 0xDB, 0x00, 0x00, 0xFA, 0xD9, /* 0x58-0x5B */ + 0x00, 0x00, 0xF1, 0xB8, 0xFD, 0xF5, 0xE0, 0xF9, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE7, 0xFB, 0xFC, 0xB7, 0xFC, 0xE4, 0xFB, 0xC5, /* 0x64-0x67 */ + 0xE3, 0xE7, 0xD8, 0xB9, 0x00, 0x00, 0xF6, 0xF8, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC5, 0xCC, 0xD8, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAF, /* 0x70-0x73 */ + 0xF4, 0xE7, 0x00, 0x00, 0xEF, 0xDC, 0xCF, 0xFC, /* 0x74-0x77 */ + 0xEF, 0xDD, 0x00, 0x00, 0xF2, 0xAA, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xFD, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAC, /* 0x84-0x87 */ + 0xFD, 0xBB, 0xFD, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB2, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xD1, 0xDF, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEC, 0xE4, 0xDE, /* 0x94-0x97 */ + 0xE5, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xD9, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCD, 0xBC, 0x00, 0x00, 0xF3, 0xE5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD5, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xBA, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xE7, 0xFB, 0xB5, /* 0xB0-0xB3 */ + 0xF8, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0xE7, 0x00, 0x00, 0xCC, 0xD9, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE7, 0xA5, 0x00, 0x00, 0xD5, 0xF5, 0xD3, 0xBE, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xFC, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF2, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDF, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0xF8, 0xF8, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xCE, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xF6, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE8, 0xD8, 0x00, 0x00, 0xCD, 0xD8, 0xE7, 0xD6, /* 0xF0-0xF3 */ + 0xCC, 0xDA, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE3, /* 0xF4-0xF7 */ + 0xDF, 0xF6, 0xF0, 0xC7, 0xF0, 0xC6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD8, 0xBA, 0x00, 0x00, 0xF1, 0xF4, 0xF4, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xF5, 0xCC, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xEA, 0xC5, 0xEA, 0xF3, 0x00, 0x00, 0xDD, 0xDB, /* 0x08-0x0B */ + 0x00, 0x00, 0xDC, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xDE, 0xFD, 0xF2, 0xF9, 0x00, 0x00, 0xD5, 0xC7, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD0, /* 0x18-0x1B */ + 0x00, 0x00, 0xF0, 0xC8, 0xD1, 0xA1, 0xD1, 0xA2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD4, 0xD6, 0xE8, /* 0x24-0x27 */ + 0xD9, 0xCA, 0x00, 0x00, 0xDA, 0xB1, 0xD8, 0xC7, /* 0x28-0x2B */ + 0xDC, 0xE2, 0xF3, 0xCE, 0xF5, 0xF4, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF1, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xDA, 0xD3, 0x00, 0x00, 0xF6, 0xEA, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xF5, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD2, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xDF, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDD, 0xFA, 0xBA, /* 0x4C-0x4F */ + 0xEE, 0xA7, 0xF5, 0xBD, 0x00, 0x00, 0xF8, 0xF5, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xE8, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xE1, 0x00, 0x00, 0xD1, 0xA3, 0xE1, 0xD6, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0xF8, 0x00, 0x00, 0xDB, 0xCA, /* 0x6C-0x6F */ + 0xCB, 0xF9, 0xD4, 0xD4, 0x00, 0x00, 0xD9, 0xDC, /* 0x70-0x73 */ + 0x00, 0x00, 0xEE, 0xBE, 0x00, 0x00, 0xF7, 0xED, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xEE, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0xF7, 0xF9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xED, /* 0x84-0x87 */ + 0x00, 0x00, 0xE8, 0xDB, 0x00, 0x00, 0xDB, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF7, /* 0x8C-0x8F */ + 0xE0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE2, /* 0x90-0x93 */ + 0x00, 0x00, 0xF6, 0xD7, 0x00, 0x00, 0xD7, 0xF9, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xDD, 0x00, 0x00, /* 0x98-0x9B */ + 0xCD, 0xFD, 0xF2, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xBD, /* 0xAC-0xAF */ + 0xF8, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xAC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAD, 0xCA, 0xAE, /* 0xB4-0xB7 */ + 0xCF, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC2, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xDA, /* 0xCC-0xCF */ + 0xD9, 0xBB, 0xCA, 0xF3, 0xF6, 0xD3, 0xE6, 0xF8, /* 0xD0-0xD3 */ + 0xEA, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xF6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCA, 0xD3, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAF, /* 0xEC-0xEF */ + 0xD2, 0xB0, 0xF1, 0xBA, 0x00, 0x00, 0xD7, 0xB3, /* 0xF0-0xF3 */ + 0xE3, 0xC3, 0xF3, 0xFD, 0xDE, 0xDA, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE3, 0xEE, 0xFB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF7, 0xD7, 0xCA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCE, 0xE8, 0xDB, 0xDB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBB, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF1, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFA, 0xB7, 0xD0, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xCC, 0xAB, 0xEE, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCB, 0xFA, 0xF9, 0xF9, 0xCC, 0xFD, 0xD3, 0xFE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xEE, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD4, 0xD5, 0xDF, 0xCD, 0x00, 0x00, 0xFC, 0xB8, /* 0x50-0x53 */ + 0xD1, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF2, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xD2, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD5, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD8, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD9, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA9, /* 0x90-0x93 */ + 0xF6, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xDB, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF0, 0xC9, 0x00, 0x00, 0xFC, 0xFC, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE8, 0xC9, 0xF4, 0xFE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xFC, /* 0xA4-0xA7 */ + 0xD7, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xDC, 0x00, 0x00, 0xF0, 0xAC, /* 0xAC-0xAF */ + 0xCC, 0xFE, 0xCD, 0xE1, 0x00, 0x00, 0xE1, 0xBA, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xDB, 0xEF, 0xDA, 0xB2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD1, 0xA5, 0xDC, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD8, 0xF6, 0x00, 0x00, 0xD1, 0xA4, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF0, 0xF7, 0x00, 0x00, 0xF0, 0xCA, /* 0xD4-0xD7 */ + 0xD0, 0xBE, 0x00, 0x00, 0xDD, 0xDC, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD6, /* 0xDC-0xDF */ + 0xD3, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD0, /* 0xE4-0xE7 */ + 0xCD, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD4, 0xA1, 0xCE, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_69[512] = { + 0xE8, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xEB, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE3, 0xD5, 0xF5, 0xD0, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xA7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE5, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE6, 0xCB, 0x00, 0x00, 0xF5, 0xF1, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC5, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA3, /* 0x50-0x53 */ + 0xE0, 0xDB, 0xF6, 0xEB, 0x00, 0x00, 0xCB, 0xF1, /* 0x54-0x57 */ + 0x00, 0x00, 0xD9, 0xEA, 0xF5, 0xA2, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xD1, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD1, 0xF8, 0xEA, 0xF8, 0xEA, 0xF9, 0xDA, 0xB3, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xEF, 0xDF, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xEF, /* 0x68-0x6B */ + 0x00, 0x00, 0xE5, 0xF6, 0xEE, 0xBF, 0xE2, 0xE4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD0, 0xBF, 0x00, 0x00, 0xFA, 0xAC, /* 0x74-0x77 */ + 0xF5, 0xD1, 0xE7, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE9, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xCE, /* 0x98-0x9B */ + 0xDB, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xFC, 0xCE, 0x00, 0x00, 0xDD, 0xEE, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xD7, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xB4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCD, 0xBE, 0x00, 0x00, 0xDA, 0xE9, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB0, /* 0xC8-0xCB */ + 0xF7, 0xD9, 0xF3, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xCE, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCE, 0xAA, 0x00, 0x00, 0xCB, 0xC8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF0, 0xCB, 0x00, 0x00, 0xD0, 0xC7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC5, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xE0, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD5, 0xDA, 0x00, 0x00, 0xD7, 0xA7, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC0, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD2, 0xED, 0xE9, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD9, 0xBC, 0x00, 0x00, 0xE5, 0xC6, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF5, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xDA, 0xD4, 0xE2, 0xA7, 0xFB, 0xFC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF1, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xCA, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xE9, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF8, 0xE2, 0xE5, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD0, 0xB9, 0xD4, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA6, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF4, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD3, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xCC, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xEF, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xE5, 0xD0, 0xD5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xFC, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xFC, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xFE, 0xED, 0xEA, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xE3, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xA2, 0xCF, 0xF6, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD0, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEA, 0xF1, 0xEE, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCB, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA1, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD5, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xED, 0x00, 0x00, /* 0x08-0x0B */ + 0xED, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xBC, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xFD, 0xE2, 0xF3, 0xAD, 0x00, 0x00, 0xFD, 0xDB, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xE3, 0xCE, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xE4, 0xFA, 0xCE, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCA, 0xB0, 0x00, 0x00, 0xF7, 0xA7, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCF, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xFC, 0xB6, 0xF2, 0xAD, 0xEF, 0xE1, /* 0x60-0x63 */ + 0xF3, 0xAE, 0xDC, 0xC6, 0xD9, 0xEB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE0, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA8, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF6, /* 0x74-0x77 */ + 0xCF, 0xFD, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDD, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD1, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEA, /* 0x80-0x83 */ + 0xF2, 0xCF, 0x00, 0x00, 0xF7, 0xBF, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE2, 0xE6, 0xE2, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD6, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF9, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xB1, 0xDE, 0xB2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE8, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xD3, 0xAB, 0x00, 0x00, 0xEB, 0xDC, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xAF, 0x00, 0x00, /* 0xB8-0xBB */ + 0xCA, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xFC, /* 0xBC-0xBF */ + 0x00, 0x00, 0xFD, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEB, 0xF6, 0xCF, 0xB2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xEC, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD9, 0xBD, 0x00, 0x00, 0xD8, 0xDF, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xB8, 0xEB, 0xBE, /* 0xD0-0xD3 */ + 0xDD, 0xEF, 0x00, 0x00, 0xDD, 0xF0, 0xDD, 0xF1, /* 0xD4-0xD7 */ + 0xDD, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xBE, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC6, /* 0xE8-0xEB */ + 0xCF, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ +}; + +static const unsigned char u2c_6C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xEE, 0xFD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAB, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDA, 0xC5, 0x00, 0x00, 0xD8, 0xEC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA8, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE2, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xBC, /* 0x34-0x37 */ + 0xE7, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF0, 0x00, 0x00, /* 0x3C-0x3F */ + 0xEF, 0xE2, 0xF1, 0xF0, 0xCF, 0xB4, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE0, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xDF, 0xA5, 0x00, 0x00, 0xF9, 0xD2, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xFD, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0xA3, 0xFB, 0xF1, 0xCB, 0xB0, /* 0x5C-0x5F */ + 0xF2, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xCD, 0xE7, 0x00, 0x00, 0xE8, 0xDC, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF7, 0xC0, 0x00, 0x00, 0xD0, 0xE3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA1, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xBD, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xD1, 0xA9, 0xDD, 0xCC, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE3, 0xFE, 0xD1, 0xAA, 0xE8, 0xAA, /* 0x80-0x83 */ + 0x00, 0x00, 0xEA, 0xB6, 0xF9, 0xFA, 0xE6, 0xCC, /* 0x84-0x87 */ + 0xF6, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD4, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD9, 0xCB, 0x00, 0x00, 0xD9, 0xD2, 0xD3, 0xCB, /* 0x90-0x93 */ + 0xD8, 0xF7, 0xDA, 0xA9, 0xF5, 0xF8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDE, 0xDE, 0xF2, 0xAF, 0xF8, 0xA9, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC8, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xDD, 0xF3, 0xEA, 0xFA, 0x00, 0x00, 0xF6, 0xBD, /* 0xB8-0xBB */ + 0xE1, 0xBB, 0xCD, 0xBF, 0xF4, 0xD4, 0xE6, 0xCD, /* 0xBC-0xBF */ + 0x00, 0x00, 0xFC, 0xCF, 0xFB, 0xA2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE0, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF4, 0xBB, 0xDA, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF9, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xF2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDB, 0xF6, 0x00, 0x00, 0xDE, 0xDF, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF8, 0xDC, 0xF7, 0xEE, 0xEB, 0xE8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD2, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF1, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xDA, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xEA, 0xDA, 0xC6, /* 0xEC-0xEF */ + 0xF7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB6, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC7, /* 0x08-0x0B */ + 0xD6, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDC, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0xAA, 0x00, 0x00, 0xD5, 0xA6, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF2, 0xD0, 0x00, 0x00, 0xEA, 0xFB, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xDD, 0xFB, 0xF3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBD, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE2, 0xE7, 0xFD, 0xD7, 0x00, 0x00, /* 0x34-0x37 */ + 0xCE, 0xC8, 0xEA, 0xB7, 0x00, 0x00, 0xFC, 0xC0, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xE7, 0xF7, 0xEF, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD7, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xEF, 0xBA, 0xF1, 0xDD, 0x00, 0x00, /* 0x58-0x5B */ + 0xDE, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCB, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDD, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFB, 0xC7, 0xD5, 0xC8, 0x00, 0x00, /* 0x68-0x6B */ + 0xD7, 0xDF, 0x00, 0x00, 0xDD, 0xA9, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE9, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAD, /* 0x74-0x77 */ + 0xF6, 0xD9, 0xFA, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xAA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xEE, 0x00, 0x00, 0xCC, 0xDC, /* 0x84-0x87 */ + 0xE1, 0xBC, 0xE0, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE9, 0xBF, 0xFC, 0xFD, 0xE6, 0xCE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE1, 0xD7, 0x00, 0x00, 0xE6, 0xCF, /* 0x90-0x93 */ + 0x00, 0x00, 0xF4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF9, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xEF, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEE, /* 0xC0-0xC3 */ + 0xF6, 0xBE, 0xE0, 0xB2, 0xFC, 0xFE, 0xD1, 0xAB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFA, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC8, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE2, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD4, 0xA3, 0xF0, 0xF8, 0xD7, 0xA8, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEF, 0xE4, 0x00, 0x00, 0xD7, 0xC5, 0xEB, 0xE2, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE4, 0xA2, 0x00, 0x00, 0xE2, 0xE8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE6, 0xD0, 0x00, 0x00, 0xFB, 0xE8, /* 0xF4-0xF7 */ + 0xF4, 0xE8, 0xE5, 0xF4, 0xF4, 0xBC, 0xF4, 0xD5, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB6, /* 0x14-0x17 */ + 0x00, 0x00, 0xFC, 0xB9, 0xEE, 0xC2, 0xCA, 0xF5, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xE5, /* 0x1C-0x1F */ + 0xCB, 0xE2, 0xD4, 0xA4, 0x00, 0x00, 0xDE, 0xE0, /* 0x20-0x23 */ + 0xDA, 0xFD, 0xE4, 0xC6, 0xE8, 0xBE, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xDE, /* 0x28-0x2B */ + 0xF6, 0xB4, 0xEA, 0xD2, 0x00, 0x00, 0xF9, 0xFB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC2, 0x00, 0x00, /* 0x30-0x33 */ + 0xCA, 0xE4, 0x00, 0x00, 0xE7, 0xB7, 0x00, 0x00, /* 0x34-0x37 */ + 0xEA, 0xFD, 0x00, 0x00, 0xD9, 0xDD, 0x00, 0x00, /* 0x38-0x3B */ + 0xDA, 0xB4, 0xEE, 0xAA, 0xFB, 0xE9, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xCB, /* 0x40-0x43 */ + 0xDA, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBE, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD3, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC9, 0x00, 0x00, /* 0x54-0x57 */ + 0xDF, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC0, /* 0x58-0x5B */ + 0xE3, 0xD7, 0x00, 0x00, 0xEF, 0xE6, 0xFC, 0xD0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC0, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xDC, 0xF7, 0xB7, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xB8, 0xD1, 0xF9, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC8, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xDE, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xD7, 0xB6, 0xCF, 0xB5, 0x00, 0x00, 0xD9, 0xA8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xEE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDD, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA2, 0xE8, 0xAE, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF2, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC1, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xFC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xB5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF3, 0xE7, 0xD8, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xFC, 0xD1, 0x00, 0x00, 0xED, 0xB2, /* 0xC8-0xCB */ + 0xF4, 0xAF, 0x00, 0x00, 0xFB, 0xA3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFC, 0xC1, 0x00, 0x00, 0xEE, 0xAB, /* 0xD0-0xD3 */ + 0xD4, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEE, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFB, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE3, 0xD8, 0xBB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0x00, 0x00, 0xE5, 0xDB, 0xF8, 0xF7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xD4, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xA9, /* 0x0C-0x0F */ + 0x00, 0x00, 0xCB, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE6, 0xD1, 0xF0, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xAE, 0x00, 0x00, 0xF9, 0xD3, 0xD5, 0xFE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBC, /* 0x28-0x2B */ + 0xF2, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0xAB, 0xF3, 0xE8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEF, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xEC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDA, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCC, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFC, /* 0x54-0x57 */ + 0xDA, 0xEB, 0x00, 0x00, 0xE2, 0xD8, 0xED, 0xD6, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD1, 0xE0, 0xB3, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xD2, 0x00, 0x00, /* 0x60-0x63 */ + 0xEB, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xD3, 0xC1, 0xF0, 0xCD, 0x00, 0x00, /* 0x6C-0x6F */ + 0xCF, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD2, 0x00, 0x00, /* 0x78-0x7B */ + 0xD4, 0xD8, 0xDC, 0xC9, 0xD7, 0xF1, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xDF, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xF4, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xF1, 0xBF, 0xF8, 0xB1, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE9, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xFB, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD5, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xD4, /* 0xA0-0xA3 */ + 0xF7, 0xCA, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE8, 0xF3, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEE, 0xFE, 0x00, 0x00, 0xE7, 0xFE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD3, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCC, 0xAD, 0xF6, 0xFA, 0xD6, 0xB2, 0xD2, 0xD8, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD8, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE3, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB9, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xAD, /* 0xDC-0xDF */ + 0xFB, 0xCC, 0xEB, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xFB, 0xCD, 0x00, 0x00, 0xD5, 0xBD, /* 0xE8-0xEB */ + 0xF1, 0xDF, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xFB, /* 0xEC-0xEF */ + 0x00, 0x00, 0xDE, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEB, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x00, 0x00, 0xE5, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFB, 0xA4, 0xD4, 0xB9, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xDE, 0xE1, 0x00, 0x00, 0xE4, 0xA3, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB7, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF8, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xDE, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD6, 0xD2, 0x00, 0x00, 0xF9, 0xD5, 0xE7, 0xBA, /* 0x18-0x1B */ + 0xEB, 0xD5, 0xD5, 0xF7, 0xEF, 0xE7, 0xE1, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xE9, /* 0x24-0x27 */ + 0xD6, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xCB, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCE, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xFB, 0xA5, 0xE1, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF7, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xFB, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xFD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xFC, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCF, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xED, 0xC7, 0xEE, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xCC, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA7, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFA, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA4, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xFD, 0xDC, 0xED, 0xB3, 0xCE, 0xC9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEF, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE1, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xDB, /* 0xA8-0xAB */ + 0xCB, 0xE3, 0xF7, 0xA9, 0x00, 0x00, 0xFB, 0xA6, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xC0, /* 0xB4-0xB7 */ + 0xED, 0xC8, 0xEF, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD6, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA1, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xFB, 0xF4, 0xD5, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF1, 0xF6, 0x00, 0x00, 0xE6, 0xD3, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCC, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xF8, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDC, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xFD, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE5, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF1, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDB, 0xCC, 0xDD, 0xCD, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC8, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD9, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA5, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xC8, /* 0x44-0x47 */ + 0x00, 0x00, 0xD6, 0xA1, 0xFD, 0xBF, 0x00, 0x00, /* 0x48-0x4B */ + 0xFC, 0xD3, 0x00, 0x00, 0xEF, 0xA1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE6, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE9, 0xF2, 0x00, 0x00, 0xDF, 0xB0, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xD8, 0xE0, 0xFC, 0xBA, 0xFD, 0xAF, 0xF0, 0xCE, /* 0x64-0x67 */ + 0x00, 0x00, 0xDB, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE5, 0xC9, 0x00, 0x00, 0xED, 0xB4, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE0, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE3, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE9, 0xFB, 0xEA, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xA7, 0x00, 0x00, /* 0x90-0x93 */ + 0xE9, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xFD, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD9, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEC, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE8, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE6, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xFD, 0xF8, 0xFD, 0xF9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xBF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE7, 0xA7, 0x00, 0x00, 0xE6, 0xD7, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD4, 0xF3, 0xD4, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFA, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD7, 0xF2, 0x00, 0x00, 0xE1, 0xC0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDB, 0xE2, 0xE6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBD, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF0, 0xCF, 0xF3, 0xBE, 0xE2, 0xAC, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF5, 0xB7, 0xE0, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB8, /* 0xF8-0xFB */ + 0xE3, 0xE8, 0x00, 0x00, 0xD4, 0xA7, 0xE8, 0xFC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0xFA, 0xD2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xEF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD6, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD0, 0x00, 0x00, /* 0x28-0x2B */ + 0xF7, 0xF0, 0xEE, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEA, 0xBA, 0x00, 0x00, 0xEA, 0xD3, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xED, 0xC9, 0xDD, 0xAB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAC, 0xFD, 0xA1, /* 0x38-0x3B */ + 0x00, 0x00, 0xDF, 0xD0, 0xEC, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDF, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xED, 0xF8, 0xB8, /* 0x44-0x47 */ + 0xF7, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF8, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE0, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD4, 0xBA, 0xE4, 0xB3, 0x00, 0x00, 0xE9, 0xDA, /* 0x58-0x5B */ + 0x00, 0x00, 0xDE, 0xB6, 0x00, 0x00, 0xD9, 0xBF, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD9, 0xC0, 0xD6, 0xEF, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCC, /* 0x64-0x67 */ + 0x00, 0x00, 0xDA, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xE5, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF7, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xCC, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDF, 0xF9, 0xD7, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xBB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xFA, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xCC, 0xB3, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDF, 0xD2, 0x00, 0x00, 0xCE, 0xCA, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEE, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE4, 0x00, 0x00, /* 0xCC-0xCF */ + 0xFB, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB7, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEE, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE2, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD7, 0xE1, 0xFA, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD5, 0xC9, 0xF8, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xE9, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xED, /* 0x18-0x1B */ + 0xE3, 0xC4, 0xF0, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xFA, 0xEE, 0xC4, 0xD9, 0xDE, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xA2, 0xEB, 0xA3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xC2, 0xEA, 0xBB, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE8, 0xAB, 0xDE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xED, 0xEF, 0x00, 0x00, 0xE8, 0xA3, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xF1, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD4, 0xBC, 0x00, 0x00, 0xFC, 0xEA, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE7, 0xBE, 0x00, 0x00, 0xFC, 0xF2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD6, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE2, 0xAE, 0x00, 0x00, 0xD3, 0xB7, 0xFA, 0xCC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xFA, 0xDC, 0x00, 0x00, 0xED, 0xB5, 0xE1, 0xE3, /* 0x84-0x87 */ + 0x00, 0x00, 0xE8, 0xAC, 0x00, 0x00, 0xE8, 0xDD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xE9, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xF4, 0xBD, 0x00, 0x00, 0xCF, 0xB8, 0xE9, 0xDB, /* 0x94-0x97 */ + 0xD1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xC7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xC9, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE8, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xBC, 0xD3, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xFA, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDA, 0xD6, 0x00, 0x00, 0xCA, 0xB1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDA, 0xC8, 0xDF, 0xA6, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF9, 0xB3, 0xF2, 0xD2, 0x00, 0x00, 0xCA, 0xC4, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCD, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xFD, 0xB0, 0xD5, 0xA8, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF1, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE9, /* 0xE0-0xE3 */ + 0xDC, 0xCA, 0xEC, 0xB4, 0xFA, 0xC0, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xFB, 0xA8, 0xD0, 0xA8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xDA, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xEE, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xEF, 0xEA, 0xFA, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0x00, 0x00, 0xE0, 0xC4, 0x00, 0x00, 0xCF, 0xB9, /* 0x00-0x03 */ + 0x00, 0x00, 0xD5, 0xCA, 0xD7, 0xE2, 0xE2, 0xAF, /* 0x04-0x07 */ + 0x00, 0x00, 0xD7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xEF, 0xA2, 0xE2, 0xDA, 0xF6, 0xFC, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xFB, 0xD0, 0xD1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */ + 0xCD, 0xE4, 0x00, 0x00, 0xD1, 0xAE, 0xDC, 0xED, /* 0x28-0x2B */ + 0xE8, 0xCE, 0x00, 0x00, 0xF0, 0xF9, 0xCE, 0xB5, /* 0x2C-0x2F */ + 0xE6, 0xFC, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFB, /* 0x30-0x33 */ + 0xD0, 0xD6, 0xDD, 0xF5, 0xF7, 0xF1, 0x00, 0x00, /* 0x34-0x37 */ + 0xF6, 0xFD, 0x00, 0x00, 0xDB, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xEA, /* 0x3C-0x3F */ + 0xE9, 0xDC, 0xD9, 0xC1, 0x00, 0x00, 0xF5, 0xF2, /* 0x40-0x43 */ + 0xE0, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF9, 0xC2, 0x00, 0x00, 0xEA, 0xBC, /* 0x54-0x57 */ + 0x00, 0x00, 0xD2, 0xC5, 0xFB, 0xD1, 0xE7, 0xC0, /* 0x58-0x5B */ + 0xEB, 0xA5, 0x00, 0x00, 0xDF, 0xFA, 0xE3, 0xA2, /* 0x5C-0x5F */ + 0xD7, 0xB9, 0x00, 0x00, 0xE9, 0xC3, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xFD, 0xE8, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF2, 0xD3, 0xFB, 0xA9, 0xD8, 0xA5, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC8, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xAF, 0xD7, 0xE3, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC6, /* 0x84-0x87 */ + 0x00, 0x00, 0xD6, 0xA2, 0x00, 0x00, 0xED, 0xF0, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD7, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xFC, 0xD4, 0x00, 0x00, 0xDA, 0xD7, 0xCC, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xF2, 0xD4, 0x00, 0x00, 0xD1, 0xB0, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCC, 0xE0, 0x00, 0x00, 0xDB, 0xFD, /* 0xA4-0xA7 */ + 0xF3, 0xBF, 0x00, 0x00, 0xF0, 0xD1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xFC, 0xBB, 0x00, 0x00, 0xE2, 0xB0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE6, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xDF, 0xDE, 0x00, 0x00, 0xE0, 0xC7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xEF, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE1, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xEA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE7, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCE, 0xB6, 0x00, 0x00, 0xF3, 0xC0, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCD, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xFB, 0xD2, 0x00, 0x00, 0xF8, 0xF8, 0xF7, 0xFB, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xBF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB6, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_75[512] = { + 0x00, 0x00, 0xDC, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xCC, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF1, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xCA, 0xF6, 0x00, 0x00, 0xE4, 0xA4, 0xF4, 0xD6, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xE6, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0xDF, 0xE7, 0xE1, 0xC1, 0x00, 0x00, /* 0x24-0x27 */ + 0xE9, 0xC4, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xCB, /* 0x28-0x2B */ + 0xE9, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEF, 0xA3, 0xEB, 0xA6, 0xCB, 0xA3, 0xE3, 0xE9, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xFB, /* 0x34-0x37 */ + 0xEF, 0xA4, 0x00, 0x00, 0xEF, 0xEB, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB4, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCD, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE6, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEF, 0xA5, 0x00, 0x00, 0xD3, 0xCC, /* 0x50-0x53 */ + 0xDA, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD7, 0xBA, 0x00, 0x00, 0xF2, 0xD5, /* 0x58-0x5B */ + 0xF5, 0xE5, 0xD9, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD5, 0xD4, 0xFD, 0xCF, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xE3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE1, /* 0x6C-0x6F */ + 0xEC, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xFB, 0xFE, 0xD3, 0xD7, 0x00, 0x00, /* 0x74-0x77 */ + 0xD1, 0xB1, 0x00, 0x00, 0xCB, 0xB1, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xB2, 0xF1, 0xC2, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE1, 0xF9, 0xB5, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEB, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xDF, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF8, 0xDE, 0xF9, 0xAA, 0xCA, 0xF7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xED, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD3, 0xB8, 0xF2, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD4, 0xD9, 0xEE, 0xC5, 0xF2, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDC, 0xBB, 0x00, 0x00, 0xF1, 0xF8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xEC, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF6, 0xC0, 0xFD, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD4, 0xE3, 0xCC, 0xE2, 0x00, 0x00, 0xF7, 0xD4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xE5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xD3, 0xC3, 0x00, 0x00, 0xD8, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xF6, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCD, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0xE5, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE1, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB0, /* 0x1C-0x1F */ + 0xF4, 0xB0, 0xF3, 0xEA, 0xDA, 0xEE, 0x00, 0x00, /* 0x20-0x23 */ + 0xD7, 0xBB, 0x00, 0x00, 0xE2, 0xB1, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAA, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFB, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE4, 0xDF, 0x00, 0x00, 0xCA, 0xD6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xA8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF6, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEF, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD4, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE8, 0xB9, 0x00, 0x00, 0xEF, 0xA6, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCD, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF4, /* 0x78-0x7B */ + 0xDB, 0xA1, 0xDB, 0xDC, 0xDB, 0xDD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xEE, 0xDC, 0x00, 0x00, 0xCB, 0xCB, 0xFC, 0xD5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xEB, 0x00, 0x00, /* 0x8C-0x8F */ + 0xCD, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xAB, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD4, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xA9, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xDD, 0xDB, 0xCD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xCE, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE7, 0xC3, 0x00, 0x00, 0xEC, 0xCC, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xEC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFC, /* 0xD8-0xDB */ + 0xD4, 0xA8, 0x00, 0x00, 0xED, 0xD3, 0xD8, 0xEF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF2, 0xD7, 0x00, 0x00, 0xCA, 0xF8, /* 0xE0-0xE3 */ + 0xDA, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD4, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCD, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xEE, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xF2, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDA, 0xF0, 0x00, 0x00, 0xE2, 0xEA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0xE0, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD8, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF7, 0xAF, 0xDA, 0xB6, 0x00, 0x00, 0xCA, 0xD7, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xD8, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xFA, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEF, /* 0x34-0x37 */ + 0xD9, 0xC2, 0x00, 0x00, 0xF0, 0xD2, 0x00, 0x00, /* 0x38-0x3B */ + 0xE4, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF3, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFA, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xEC, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0xB2, 0x00, 0x00, 0xD4, 0xBD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCE, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xD4, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xC2, 0xE7, 0xDA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xD9, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD9, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDC, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE2, 0xEB, 0xD6, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCA, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xDA, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD7, /* 0xB8-0xBB */ + 0xCC, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xBA, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB8, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xC3, /* 0xD8-0xDB */ + 0xD0, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC5, 0xEB, 0xF8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF2, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xD3, 0xAD, 0xE8, 0xE1, 0xCE, 0xEC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB4, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE3, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF2, 0xB2, 0xF3, 0xF6, 0xF6, 0xDB, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD7, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDF, 0x00, 0x00, /* 0x30-0x33 */ + 0xF7, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xD0, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xDA, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF5, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBC, /* 0x68-0x6B */ + 0xCC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xDD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD1, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xED, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD6, 0xDE, 0xE4, 0xF4, 0xE1, 0xEF, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xDD, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCF, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE5, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xAC, 0xFC, 0xAD, /* 0xB8-0xBB */ + 0xD8, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xED, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDB, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF0, 0xF3, 0xAF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA5, 0x00, 0x00, /* 0xCC-0xCF */ + 0xDA, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xD8, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCC, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB4, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xCA, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xF2, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0xF5, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA6, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEC, 0xD5, 0xF8, /* 0x28-0x2B */ + 0xDA, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC6, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDE, 0xE5, 0xD1, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB6, /* 0x44-0x47 */ + 0xD1, 0xB7, 0xF2, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE9, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD3, 0xF2, 0xB4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD4, 0xCB, 0xE4, /* 0x58-0x5B */ + 0xFB, 0xD4, 0xF5, 0xE6, 0xE3, 0xEA, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDE, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xDF, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF0, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB8, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xDF, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xFC, 0xA1, 0xEF, 0xEE, 0xDC, 0xD8, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE9, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDD, 0xFD, 0xFB, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC9, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC9, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD4, 0xAA, 0x00, 0x00, 0xE5, 0xCC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE9, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD0, 0xD8, 0xFC, 0xA2, 0xD4, 0xBE, /* 0xBC-0xBF */ + 0xE2, 0xB3, 0xDE, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDC, 0xBC, 0xD2, 0xB6, 0xF5, 0xD5, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xCE, 0xA1, 0xF5, 0xA9, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDD, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDD, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF6, 0xDF, 0x00, 0x00, 0xF2, 0xDA, 0xE4, 0xEB, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xF2, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xB9, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7A[512] = { + 0xFD, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE1, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCA, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xEF, /* 0x08-0x0B */ + 0x00, 0x00, 0xF5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xEC, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xAD, /* 0x14-0x17 */ + 0x00, 0x00, 0xF2, 0xC2, 0xF6, 0xC3, 0x00, 0x00, /* 0x18-0x1B */ + 0xD7, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA2, /* 0x1C-0x1F */ + 0xF0, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFA, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF6, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, 0xF2, 0xC3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAB, /* 0x38-0x3B */ + 0xCA, 0xB3, 0xCD, 0xA6, 0x00, 0x00, 0xCD, 0xC3, /* 0x3C-0x3F */ + 0xCD, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCF, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF6, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xDD, 0xE7, 0xC4, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xE2, 0xE7, 0xDB, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE8, 0xB1, 0x00, 0x00, 0xFC, 0xAE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE5, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFA, 0xEB, 0x00, 0x00, 0xCF, 0xBC, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xCF, 0xE2, 0xCD, 0xF6, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xEF, 0xF0, 0x00, 0x00, 0xF4, 0xBE, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD4, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE9, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF2, 0xF3, 0xEB, /* 0x90-0x93 */ + 0x00, 0x00, 0xF0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCF, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE8, 0xC0, 0xE8, 0xC1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xE3, 0xE9, 0xA2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAA, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF3, 0xC1, 0xD0, 0xAB, 0x00, 0x00, 0xD4, 0xE4, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xBC, 0xD8, 0xA1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xDF, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF3, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xDC, 0xBD, 0x00, 0x00, 0xCC, 0xE5, /* 0xDC-0xDF */ + 0xED, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE2, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD4, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB5, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCA, 0xE6, 0x00, 0x00, 0xD3, 0xAE, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xF1, 0xD3, 0xF5, 0xE7, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xEE, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE1, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDF, 0xE9, 0x00, 0x00, 0xEE, 0xDE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xC2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xAC, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xF0, 0xAF, 0xD6, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE1, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB6, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xD4, 0xF5, 0x00, 0x00, 0xD0, 0xC9, /* 0x48-0x4B */ + 0xEF, 0xA7, 0xE2, 0xEC, 0x00, 0x00, 0xDB, 0xEA, /* 0x4C-0x4F */ + 0xCE, 0xCC, 0xF5, 0xE8, 0xF7, 0xD5, 0x00, 0x00, /* 0x50-0x53 */ + 0xD3, 0xCD, 0x00, 0x00, 0xF3, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD0, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xFE, 0x00, 0x00, 0xDF, 0xFB, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE6, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE8, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xA8, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB4, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xDA, 0xD8, 0xD1, 0xB9, 0x00, 0x00, 0xDF, 0xA9, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB0, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCC, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCE, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEF, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xDF, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEE, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xEF, 0xBD, 0xFC, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDB, 0xF4, 0x00, 0x00, 0xEF, 0xAA, 0xF8, 0xB9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF5, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD9, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE1, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDE, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_7C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEA, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xAF, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD8, 0xE1, 0xC7, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF4, 0xD8, 0xD6, 0xB3, 0xDD, 0xAD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF1, 0xC3, 0xEE, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xF4, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xD7, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB7, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xFB, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xDD, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xA3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xDA, 0xD9, 0x00, 0x00, 0xF0, 0xD8, /* 0x94-0x97 */ + 0xEF, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF1, 0xD4, 0x00, 0x00, 0xED, 0xF2, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDB, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD5, 0xDC, 0xF3, 0xC4, 0xCB, 0xD7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF1, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD0, 0xF0, 0xD9, /* 0xDC-0xDF */ + 0xCB, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDD, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xD1, 0xBA, 0x00, 0x00, 0xF1, 0xC4, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xB3, 0xFB, 0xF5, 0xE9, 0xE1, 0xFD, 0xE0, /* 0x04-0x07 */ + 0xFC, 0xBC, 0x00, 0x00, 0xDA, 0xA2, 0xDA, 0xA3, /* 0x08-0x0B */ + 0x00, 0x00, 0xD2, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE9, /* 0x14-0x17 */ + 0xCE, 0xDC, 0xF2, 0xB5, 0xD0, 0xE4, 0xDD, 0xD1, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE1, 0xC8, 0xDB, 0xB7, 0xDF, 0xE3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB9, /* 0x28-0x2B */ + 0xF1, 0xC5, 0x00, 0x00, 0xF3, 0xCF, 0xD7, 0xAB, /* 0x2C-0x2F */ + 0xE1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xEB, /* 0x30-0x33 */ + 0x00, 0x00, 0xEE, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE1, 0xC9, 0xCA, 0xFA, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFB, 0xFA, 0xE1, /* 0x40-0x43 */ + 0xF0, 0xDA, 0xCC, 0xE7, 0xDA, 0xF4, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCC, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xED, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD5, 0xA9, 0xFA, 0xE2, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE5, 0x00, 0x00, /* 0x64-0x67 */ + 0xEB, 0xD6, 0x00, 0x00, 0xEC, 0xDF, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFC, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF7, 0xD6, 0xDE, 0xEA, 0xCB, 0xB4, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xBE, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xCC, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xBD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF2, 0xE2, 0xB7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE8, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF0, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD6, 0xE0, 0x00, 0x00, 0xF1, 0xC6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE2, 0xB8, 0xEB, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCB, 0xB5, 0xD8, 0xD1, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF4, 0xCE, 0xF3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD7, 0xC6, 0x00, 0x00, 0xD1, 0xBB, 0xF7, 0xAA, /* 0xB8-0xBB */ + 0x00, 0x00, 0xED, 0xCA, 0xD7, 0xD3, 0xD8, 0xFA, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCC, 0xDD, 0xFC, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFD, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xF9, 0xE5, 0x00, 0x00, 0xE0, 0xCA, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF2, 0xFD, 0xD3, 0xB0, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF4, 0xF3, 0xDA, 0xC9, 0x00, 0x00, 0xE6, 0xDE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF8, 0xBA, 0xE8, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD8, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD5, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC6, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF2, 0xDB, 0xE4, 0xFC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0xF2, 0xDC, 0xFB, 0xD6, 0xE9, 0xB2, /* 0x1C-0x1F */ + 0x00, 0x00, 0xEE, 0xAD, 0x00, 0x00, 0xFA, 0xE3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEE, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xEA, 0xE6, 0xE0, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF0, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAC, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF5, 0xC5, 0xEE, 0xE0, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDB, 0xE5, 0x00, 0x00, 0xDD, 0xDE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF0, 0xE9, 0xA3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF9, 0x00, 0x00, /* 0x50-0x53 */ + 0xF2, 0xC4, 0xE0, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xB1, 0xFC, 0xEB, 0xCD, 0xA8, /* 0x68-0x6B */ + 0x00, 0x00, 0xCC, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF0, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCD, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC3, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE1, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xC5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, 0x00, 0x00, /* 0x94-0x97 */ + 0xF3, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC0, /* 0x98-0x9B */ + 0xD5, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xAE, 0x00, 0x00, /* 0x34-0x37 */ + 0xF9, 0xFC, 0x00, 0x00, 0xCC, 0xC0, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE5, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCE, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD8, 0xD2, 0xF9, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAA, 0xCE, 0xD1, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC7, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDB, 0xEB, 0x00, 0x00, 0xDF, 0xFE, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD8, 0xE1, 0x00, 0x00, 0xF7, 0xF3, /* 0x74-0x77 */ + 0x00, 0x00, 0xD7, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD1, 0xBC, 0x00, 0x00, 0xE5, 0xCF, 0x00, 0x00, /* 0x88-0x8B */ + 0xCB, 0xB6, 0x00, 0x00, 0xDA, 0xB8, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCD, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xBE, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCF, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE0, 0xCC, 0xEB, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xFD, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD7, 0xE8, 0xCB, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE9, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE8, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xCD, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEC, 0xCE, 0x00, 0x00, 0xD6, 0xBF, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDF, 0xD6, 0xFD, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE1, /* 0xDC-0xDF */ + 0xF6, 0xA8, 0xDD, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xF8, 0xBB, 0x00, 0x00, 0xE8, 0xD1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF9, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCE, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xEC, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xE9, 0xA5, 0xD6, 0xD5, 0x00, 0x00, 0xCD, 0xC5, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xBA, 0xD1, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCF, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xEC, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xCC, 0xE9, 0x00, 0x00, 0xD9, 0xC4, /* 0x14-0x17 */ + 0xE9, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xBC, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAD, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF7, 0xB0, 0x00, 0x00, 0xCC, 0xEA, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC4, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC0, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFD, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA1, 0x00, 0x00, /* 0x54-0x57 */ + 0xDE, 0xBD, 0x00, 0x00, 0xF6, 0xA9, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xA4, /* 0x6C-0x6F */ + 0xF5, 0xC6, 0x00, 0x00, 0xE1, 0xA2, 0xE9, 0xC6, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC5, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF4, 0xE9, 0xD6, 0xEC, 0xEB, 0xD3, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xEC, 0xBD, 0xE2, 0xDC, 0xDE, 0xEB, 0xF0, 0xDC, /* 0x84-0x87 */ + 0x00, 0x00, 0xEB, 0xBF, 0x00, 0x00, 0xD7, 0xCE, /* 0x88-0x8B */ + 0xD1, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xAB, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFD, /* 0x98-0x9B */ + 0x00, 0x00, 0xCA, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCD, 0xC6, 0xF2, 0xB6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDD, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCC, 0xB7, 0xDB, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE9, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCE, 0xDD, 0xEB, 0xC0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xFD, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xCB, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD6, /* 0xC0-0xC3 */ + 0xF1, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDB, 0xCE, 0x00, 0x00, 0xF7, 0xC3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xCF, 0xCB, 0xA4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE0, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xFB, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xEB, 0xCA, 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD4, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFD, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xD2, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xB7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFA, 0xF6, 0xF6, 0xAA, 0xFA, 0xF7, /* 0x04-0x07 */ + 0xD8, 0xE6, 0x00, 0x00, 0xF4, 0xB1, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE8, 0xD2, 0x00, 0x00, 0xCA, 0xC5, 0xCC, 0xEB, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xEE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE2, 0xBB, 0x00, 0x00, 0xF7, 0xAD, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE1, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xF3, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xEC, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDD, 0xAF, 0xDD, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCB, 0xB7, 0xE8, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xA3, 0xD2, 0xE0, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE9, 0xA6, 0xCB, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xED, 0xF3, 0xDC, 0xD9, 0xE0, 0xCD, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xDA, /* 0x7C-0x7F */ + + 0xDB, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xCC, 0xAE, 0x00, 0x00, 0xDA, 0xDB, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xC7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xB1, 0x00, 0x00, /* 0x98-0x9B */ + 0xD8, 0xAF, 0xE3, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF3, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF8, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xCE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF5, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xEC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD3, 0xC5, 0xFC, 0xEC, 0xD2, 0xDB, /* 0xBC-0xBF */ + 0xD4, 0xEB, 0x00, 0x00, 0xDE, 0xA2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD5, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF4, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xED, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0xC2, 0x00, 0x00, 0xED, 0xF5, /* 0xE4-0xE7 */ + 0xD7, 0xFC, 0x00, 0x00, 0xED, 0xBB, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xB8, /* 0xF0-0xF3 */ + 0xF6, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE6, 0xF2, 0xDD, /* 0xF8-0xFB */ + 0xCF, 0xBF, 0x00, 0x00, 0xEB, 0xAC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xCF, 0xC0, 0x00, 0x00, 0xE6, 0xA8, /* 0x04-0x07 */ + 0xFD, 0xE9, 0x00, 0x00, 0xCF, 0xC1, 0x00, 0x00, /* 0x08-0x0B */ + 0xE0, 0xDF, 0xDE, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xBF, /* 0x18-0x1B */ + 0xE2, 0xEF, 0x00, 0x00, 0xD9, 0xF1, 0xF1, 0xC7, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCB, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFE, 0xDB, 0xBA, /* 0x28-0x2B */ + 0xDA, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0xEC, 0xDA, 0xDC, 0xFA, 0xE4, /* 0x34-0x37 */ + 0x00, 0x00, 0xE0, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDD, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE6, 0xA9, 0x00, 0x00, 0xEF, 0xF3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF3, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEB, 0xFA, 0x00, 0x00, 0xF9, 0xE6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xDD, 0xD5, 0xDE, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCA, 0xDE, 0xDF, 0xE4, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFD, 0x00, 0x00, /* 0x74-0x77 */ + 0xF5, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE3, /* 0x88-0x8B */ + 0x00, 0x00, 0xED, 0xCB, 0xCF, 0xE4, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDD, 0xB3, 0xD4, 0xEC, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF2, 0xB9, 0x00, 0x00, 0xDF, 0xB7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xCE, 0xFB, 0xD8, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD0, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xDD, 0xD2, 0xF7, 0xF4, 0xE7, 0xDC, 0xE4, 0xA5, /* 0xAC-0xAF */ + 0x00, 0x00, 0xFC, 0xA3, 0x00, 0x00, 0xDB, 0xBB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBA, /* 0xB4-0xB7 */ + 0xE9, 0xFD, 0xD0, 0xCA, 0x00, 0x00, 0xF5, 0xD6, /* 0xB8-0xBB */ + 0xD9, 0xC5, 0xE4, 0xB4, 0x00, 0x00, 0xED, 0xA7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEA, 0xBD, 0xE6, 0xFE, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF7, 0xC4, 0xF5, 0xAD, 0x00, 0x00, 0xD9, 0xE0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE2, 0xCF, 0xC2, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEC, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE5, 0xB4, 0xCD, 0xC8, 0xEE, 0xC8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE7, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xCD, 0xC9, 0xF9, 0xB7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0x00, 0x00, 0xF1, 0xE8, 0xD9, 0xF2, 0xDB, 0xF5, /* 0x00-0x03 */ + 0xCA, 0xB5, 0xD9, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xD8, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAB, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xED, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD4, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDA, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xFC, 0xED, 0xEC, 0xE0, 0xD2, 0xFE, 0x00, 0x00, /* 0x34-0x37 */ + 0xE9, 0xC7, 0xE6, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE2, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xBB, /* 0x44-0x47 */ + 0x00, 0x00, 0xF5, 0xAE, 0xFB, 0xAA, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xFB, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEC, 0xBF, 0xFC, 0xD8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE5, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC3, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE2, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xD7, 0xE9, 0xED, 0xF6, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xED, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEC, 0x00, 0x00, /* 0x94-0x97 */ + 0xE3, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD4, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xDD, 0xB4, 0xE4, 0xB5, 0xD8, 0xB0, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD8, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xF4, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCE, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD6, 0xE1, 0xCF, 0xD2, 0x00, 0x00, /* 0xC8-0xCB */ + 0xD0, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xEE, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF3, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDC, 0xCC, 0x00, 0x00, 0xD0, 0xCB, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xA4, /* 0xEC-0xEF */ + 0xCD, 0xCA, 0xD7, 0xD4, 0xDE, 0xA3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEE, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE2, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xFE, /* 0x00-0x03 */ + 0xD4, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xD1, 0x00, 0x00, /* 0x08-0x0B */ + 0xD8, 0xF0, 0xF8, 0xC3, 0xEA, 0xD7, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD8, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFD, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xD5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCA, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF8, 0xE3, 0x00, 0x00, 0xD4, 0xDD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD8, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD9, /* 0x68-0x6B */ + 0xED, 0xF7, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD0, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF1, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xD9, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xDF, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xDB, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB6, /* 0xB8-0xBB */ + 0xF3, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDA, /* 0xBC-0xBF */ + 0xE1, 0xE0, 0x00, 0x00, 0xD9, 0xAC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF5, 0xEB, 0x00, 0x00, 0xE0, 0xB6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE9, 0xC8, 0x00, 0x00, 0xCB, 0xCF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE3, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xDE, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBE, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xDC, 0xEF, 0x00, 0x00, 0xD6, 0xA5, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD6, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD9, 0xA1, 0x00, 0x00, 0xD8, 0xC0, /* 0x10-0x13 */ + 0xDC, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBD, /* 0x14-0x17 */ + 0xDF, 0xB8, 0x00, 0x00, 0xEA, 0xA5, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAD, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF3, 0xF9, 0x00, 0x00, 0xED, 0xF8, /* 0x20-0x23 */ + 0x00, 0x00, 0xF5, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE1, 0xCA, 0xEB, 0xE3, 0x00, 0x00, 0xF2, 0xDE, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF8, 0xCC, 0x00, 0x00, 0xEA, 0xD9, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD3, 0xC6, 0x00, 0x00, 0xDB, 0xE6, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF5, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF0, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xFE, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xFB, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF2, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xCF, 0xF2, 0xF7, 0xB9, 0xD9, 0xF3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE1, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xDA, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB9, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xFB, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCB, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFD, 0xBC, 0xDF, 0xB1, 0xE3, 0xEF, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA3, /* 0xAC-0xAF */ + 0xFD, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xB1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCD, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD5, 0xC0, 0xE3, 0xF0, 0xED, 0xFA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE9, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD5, 0xED, 0xE7, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xF6, 0xE5, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDB, 0xE7, 0xE2, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF4, 0xF0, 0xDD, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xAB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xDE, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD6, 0xE1, 0xCC, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB3, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xEE, 0xDC, 0xA2, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD5, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA1, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCB, 0xF3, 0xF4, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC8, /* 0x58-0x5B */ + 0xD6, 0xD7, 0x00, 0x00, 0xE9, 0xE5, 0xFB, 0xDC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD0, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xFB, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA5, 0x00, 0x00, /* 0x88-0x8B */ + 0xDB, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xF7, /* 0xA0-0xA3 */ + 0xF0, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xF6, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xEF, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xFC, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE8, 0xC3, 0x00, 0x00, 0xF1, 0xC8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF1, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF9, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF2, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB6, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xF5, 0xB9, 0x00, 0x00, 0xDC, 0xF0, 0xE3, 0xF1, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE8, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF2, 0xBB, 0x00, 0x00, 0xDE, 0xA4, 0x00, 0x00, /* 0x18-0x1B */ + 0xDA, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE9, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE3, 0xDA, 0x00, 0x00, 0xFC, 0xD9, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDA, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC4, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFB, 0xDD, 0x00, 0x00, 0xEF, 0xCA, 0x00, 0x00, /* 0x74-0x77 */ + 0xE8, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCC, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xEB, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAD, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA2, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF6, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDA, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA8, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xF9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xFA, 0xAF, 0x00, 0x00, 0xEB, 0xFC, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE3, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC5, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE3, 0xD5, 0xEE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xCD, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD9, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC1, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xFA, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xEB, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xFA, 0xBC, 0xE6, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xE5, 0xE2, 0xFA, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB6, /* 0x54-0x57 */ + 0x00, 0x00, 0xE4, 0xB7, 0x00, 0x00, 0xEA, 0xDB, /* 0x58-0x5B */ + 0x00, 0x00, 0xF5, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xFB, 0xAC, 0xCF, 0xC3, 0xEB, 0xFD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB9, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE1, 0xF1, 0x00, 0x00, 0xD2, 0xA4, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xFB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xDA, 0xD0, 0xDB, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEA, 0xBE, 0xD9, 0xB1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xCA, 0xB7, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE7, /* 0x88-0x8B */ + 0x00, 0x00, 0xF8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB2, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC0, 0xF2, 0xDF, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xAC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCD, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xEE, 0xAE, 0xD6, 0xAE, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xEA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE7, 0xE0, 0xEB, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCF, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDC, 0xCD, 0xED, 0xFB, 0x00, 0x00, 0xDE, 0xF0, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD7, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xD7, /* 0xF0-0xF3 */ + 0xDB, 0xD0, 0xDB, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD5, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF0, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDC, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xCA, 0xE8, 0x00, 0x00, 0xF8, 0xE6, 0xDC, 0xCE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xEA, 0xDC, 0xDB, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE9, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xDB, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA8, 0x00, 0x00, /* 0x34-0x37 */ + 0xD7, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE1, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCB, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xDC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xD5, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xCA, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA9, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE9, 0xA9, 0x00, 0x00, 0xD3, 0xC7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDD, 0xF8, 0xAE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xB8, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAE, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xF2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCA, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCC, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD4, 0xAD, 0xF6, 0xD1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xCC, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC6, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD5, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCE, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB0, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDF, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_8A[512] = { + 0xE5, 0xEB, 0x00, 0x00, 0xEF, 0xF4, 0xDD, 0xB5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCD, 0xAA, 0x00, 0x00, 0xE3, 0xF2, 0x00, 0x00, /* 0x08-0x0B */ + 0xFB, 0xF7, 0x00, 0x00, 0xF7, 0xD0, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xBA, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xE1, 0xF6, 0xFE, /* 0x14-0x17 */ + 0xD1, 0xC0, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC5, /* 0x18-0x1B */ + 0x00, 0x00, 0xE4, 0xB8, 0x00, 0x00, 0xE1, 0xE8, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xC1, /* 0x20-0x23 */ + 0x00, 0x00, 0xD2, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xBE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE1, 0xCD, 0x00, 0x00, 0xCA, 0xB8, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE0, 0xF1, 0xC9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDE, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF0, 0xDF, 0xF8, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCC, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xF2, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE7, 0xC9, 0x00, 0x00, 0xE2, 0xF3, 0xE7, 0xE1, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xCF, 0xF8, 0xEF, 0xAC, 0x00, 0x00, /* 0x6C-0x6F */ + 0xFD, 0xFE, 0xFC, 0xA5, 0xFA, 0xB1, 0xDF, 0xD9, /* 0x70-0x73 */ + 0x00, 0x00, 0xE0, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF4, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xF1, 0xCA, 0x00, 0x00, 0xCE, 0xA3, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xF2, 0xBC, 0xEC, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA5, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xEB, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDE, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE1, 0xA4, 0xCD, 0xAB, 0x00, 0x00, 0xD9, 0xF4, /* 0xA0-0xA3 */ + 0xE8, 0xA6, 0xCD, 0xCE, 0xE1, 0xE9, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFC, 0xEF, 0x00, 0x00, 0xE0, 0xE3, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE2, 0xC1, 0x00, 0x00, 0xCE, 0xA4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDE, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEB, 0xFE, 0x00, 0x00, 0xEB, 0xDD, 0xF0, 0xE0, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDB, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE2, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xEB, /* 0xC8-0xCB */ + 0x00, 0x00, 0xEE, 0xB5, 0x00, 0x00, 0xF5, 0xD8, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xE5, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB0, /* 0xD8-0xDB */ + 0xF4, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE3, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF4, 0xFA, 0xB2, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF5, 0xCA, 0xDF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEB, 0xB1, 0xED, 0xBF, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xFD, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA6, 0xF9, 0xA4, /* 0xF4-0xF7 */ + 0xF0, 0xB3, 0x00, 0x00, 0xE5, 0xEC, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xE7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xD9, 0xC7, 0xE4, 0xD7, 0xEA, 0xDD, 0x00, 0x00, /* 0x00-0x03 */ + 0xD4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBA, 0x00, 0x00, /* 0x0C-0x0F */ + 0xDA, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF9, 0xCC, 0x00, 0x00, 0xE1, 0xDA, 0xDB, 0xBF, /* 0x14-0x17 */ + 0x00, 0x00, 0xCC, 0xC5, 0xEC, 0xD0, 0xCB, 0xBB, /* 0x18-0x1B */ + 0x00, 0x00, 0xDE, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD9, 0xC8, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE3, /* 0x28-0x2B */ + 0xD7, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xC4, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xD0, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xFC, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF1, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD2, 0xD1, 0xC1, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xDB, 0x00, 0x00, 0xD3, 0xC9, 0x00, 0x00, /* 0x58-0x5B */ + 0xDC, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xED, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xDE, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xBB, /* 0x6C-0x6F */ + 0xEC, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCC, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDE, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE7, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD4, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xC2, 0x00, 0x00, 0xF3, 0xD8, 0xE5, 0xD3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xD9, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC6, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xDB, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xAC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xFC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE7, 0x00, 0x00, /* 0x44-0x47 */ + 0xD1, 0xC2, 0x00, 0x00, 0xF9, 0xA5, 0x00, 0x00, /* 0x48-0x4B */ + 0xE8, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE3, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xCA, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDF, 0xE7, 0xE3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF8, 0xFB, 0xE3, 0xCF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB0, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xE7, 0x00, 0x00, /* 0x88-0x8B */ + 0xD9, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF8, 0xAF, 0xEF, 0xF6, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDD, 0xB6, 0xEE, 0xAF, 0xCD, 0xF8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB8, /* 0xA4-0xA7 */ + 0xFC, 0xA7, 0xF7, 0xFC, 0xF7, 0xB1, 0xCE, 0xBB, /* 0xA8-0xAB */ + 0xF4, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCD, /* 0xAC-0xAF */ + 0xE1, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC3, /* 0xB0-0xB3 */ + 0xCF, 0xFE, 0x00, 0x00, 0xF8, 0xBF, 0xD8, 0xE2, /* 0xB4-0xB7 */ + 0xD3, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA8, /* 0xB8-0xBB */ + 0xF4, 0xE4, 0xEC, 0xC2, 0x00, 0x00, 0xD9, 0xF5, /* 0xBC-0xBF */ + 0xF9, 0xC5, 0xDD, 0xD3, 0xD6, 0xF1, 0xEC, 0xFC, /* 0xC0-0xC3 */ + 0xFC, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xED, 0xC0, /* 0xC4-0xC7 */ + 0xCA, 0xB9, 0x00, 0x00, 0xEE, 0xE4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF2, 0xE1, 0x00, 0x00, 0xDE, 0xB9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF2, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDE, 0xF4, 0x00, 0x00, 0xDF, 0xDB, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDB, 0xD3, 0x00, 0x00, 0xFA, 0xE7, 0xD8, 0xE3, /* 0xE0-0xE3 */ + 0xF4, 0xC1, 0x00, 0x00, 0xDD, 0xB7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF5, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xD4, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xB8, /* 0xF8-0xFB */ + 0xCF, 0xC5, 0xDF, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF2, 0xBE, 0xF6, 0xA1, 0x00, 0x00, 0xEB, 0xCB, /* 0x04-0x07 */ + 0xF1, 0xFC, 0x00, 0x00, 0xF3, 0xC7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xFC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDB, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEE, 0xE5, 0x00, 0x00, 0xDE, 0xF5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xD3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF1, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAF, /* 0x70-0x73 */ + 0xDD, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xC3, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xF5, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC6, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xF0, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF5, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xBA, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xC5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA2, /* 0xC8-0xCB */ + 0xF2, 0xF6, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xF5, /* 0xD8-0xDB */ + 0x00, 0x00, 0xCB, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEE, 0xE6, 0x00, 0x00, 0xE0, 0xD3, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD8, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAF, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_8E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xCE, /* 0x0C-0x0F */ + 0xF4, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE6, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xA1, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEB, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF1, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB3, 0x00, 0x00, /* 0x40-0x43 */ + 0xF0, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF4, /* 0x44-0x47 */ + 0xD4, 0xB0, 0xF3, 0xB2, 0xFB, 0xB7, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF5, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE7, /* 0x5C-0x5F */ + 0xF4, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xF5, 0xED, 0x00, 0x00, 0xCF, 0xF3, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xF0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCC, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF5, 0xE3, 0xF3, /* 0xA8-0xAB */ + 0xCF, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCF, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB3, 0xE4, 0xD8, /* 0xC8-0xCB */ + 0xCF, 0xF9, 0xCF, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCD, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE3, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE2, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xDC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF2, /* 0x00-0x03 */ + 0x00, 0x00, 0xD6, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xEE, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE5, 0xD8, 0xC2, /* 0x10-0x13 */ + 0xDC, 0xD0, 0xCC, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE0, /* 0x18-0x1B */ + 0xF6, 0xCA, 0xFD, 0xCA, 0xD8, 0xD6, 0xF4, 0xCF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xA6, 0xDC, 0xBE, /* 0x24-0x27 */ + 0x00, 0x00, 0xDB, 0xD4, 0xD7, 0xC7, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFE, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCD, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE2, 0xC3, 0xDC, 0xDE, 0x00, 0x00, 0xDC, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAD, 0xE6, 0xAB, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF9, 0xDD, 0xEA, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xEF, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xF4, 0xD0, 0xCE, 0xF3, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0xAC, 0x00, 0x00, 0xCE, 0xDE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF9, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF4, /* 0x98-0x9B */ + 0xCD, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB8, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xFD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xDC, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xF6, 0x00, 0x00, 0xDC, 0xAA, /* 0xAC-0xAF */ + 0xF2, 0xE3, 0xE9, 0xB4, 0xD2, 0xDC, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE6, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD0, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDA, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBC, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE8, 0xDA, 0xDE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF2, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE2, 0xFB, 0x00, 0x00, 0xCC, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEE, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xF7, 0xDC, 0xE1, 0xEA, 0xCE, 0xC1, 0xD4, 0xB1, /* 0x00-0x03 */ + 0x00, 0x00, 0xFD, 0xB1, 0xE6, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0xFB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE7, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0xCE, 0x00, 0x00, 0xF7, 0xE2, /* 0x0C-0x0F */ + 0xF5, 0xEF, 0xCF, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD4, 0xB2, 0xCC, 0xEF, 0x00, 0x00, 0xD4, 0xE8, /* 0x14-0x17 */ + 0x00, 0x00, 0xEE, 0xCF, 0xF7, 0xD7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0xA6, 0xD6, 0xC1, 0xE1, 0xDC, /* 0x1C-0x1F */ + 0xF0, 0xE3, 0xF1, 0xE4, 0xDC, 0xF1, 0xD6, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF5, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF1, 0xCE, 0xF2, 0xE4, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xD0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEC, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xF9, 0xBA, 0x00, 0x00, 0xEB, 0xB5, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD4, 0xED, 0xE2, 0xC4, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE7, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB4, 0xEA, 0xA1, /* 0x48-0x4B */ + 0x00, 0x00, 0xF8, 0xBC, 0xCE, 0xA6, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xC6, 0xFC, 0xDA, 0x00, 0x00, 0xD4, 0xB3, /* 0x50-0x53 */ + 0xD3, 0xB9, 0xEA, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE9, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE1, 0xE1, 0xD3, 0xCF, 0xF4, 0xF6, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEA, 0xC0, 0xE1, 0xCF, 0x00, 0x00, 0xCC, 0xBA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xEE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF0, 0xE4, 0xF3, 0xB4, 0xD4, 0xEE, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC0, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF1, 0xE5, 0x00, 0x00, 0xF4, 0xC3, /* 0x74-0x77 */ + 0xE0, 0xD4, 0x00, 0x00, 0xEB, 0xB6, 0x00, 0x00, /* 0x78-0x7B */ + 0xD7, 0xA1, 0xCB, 0xE8, 0x00, 0x00, 0xF9, 0xAD, /* 0x7C-0x7F */ + + 0xE9, 0xAD, 0xD8, 0xE4, 0xFA, 0xB3, 0xE2, 0xC5, /* 0x80-0x83 */ + 0xFC, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC4, /* 0x84-0x87 */ + 0xD8, 0xB1, 0x00, 0x00, 0xDC, 0xAB, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA4, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xD8, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAE, 0xD1, 0xE1, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF5, 0xBE, 0x00, 0x00, 0xDE, 0xF7, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xFB, /* 0xAC-0xAF */ + 0xF7, 0xC6, 0xCF, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEE, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE9, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCD, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCF, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDD, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCE, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE9, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xD4, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xDB, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xFA, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDE, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF8, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB3, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xEB, 0xB7, 0xEF, 0xF8, 0xF5, 0xDC, /* 0x48-0x4B */ + 0xED, 0xCC, 0xDB, 0xD5, 0xF1, 0xCF, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD0, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB2, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD9, 0xAE, 0xD5, 0xAC, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xFD, 0xA3, 0x00, 0x00, 0xFB, 0xE5, /* 0x74-0x77 */ + 0xDF, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF5, /* 0x84-0x87 */ + 0x00, 0x00, 0xF6, 0xAD, 0x00, 0x00, 0xF5, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0xF0, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA5, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF5, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xA2, /* 0xA8-0xAB */ + 0xED, 0xFD, 0x00, 0x00, 0xF5, 0xB4, 0xFB, 0xB8, /* 0xAC-0xAF */ + 0x00, 0x00, 0xDB, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xD6, 0xCA, 0xCB, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE5, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xEB, 0xB8, 0x00, 0x00, 0xE0, 0xB7, /* 0xC8-0xCB */ + 0xD7, 0xEC, 0xF1, 0xEC, 0xE5, 0xAF, 0xD5, 0xE1, /* 0xCC-0xCF */ + 0xD7, 0xED, 0xD1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF2, /* 0xD4-0xD7 */ + 0xEF, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDD, 0xBC, 0xF6, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE5, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC4, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF3, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_92[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xD4, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xCC, 0xA2, 0xF7, 0xFE, 0xDF, 0xBC, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD6, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAF, /* 0x3C-0x3F */ + 0xCB, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xFA, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xC6, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC7, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA4, 0x00, 0x00, /* 0x60-0x63 */ + 0xCF, 0xC9, 0xE2, 0xFC, 0xEF, 0xFA, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEB, 0xDE, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xC8, /* 0x80-0x83 */ + 0x00, 0x00, 0xD4, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0xD5, 0x00, 0x00, 0xEF, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC7, 0x00, 0x00, /* 0x94-0x97 */ + 0xD9, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF9, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xCA, 0xE1, 0xD1, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEF, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF9, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xCB, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCB, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF5, 0xDF, 0x00, 0x00, 0xEE, 0xB6, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF6, 0xD3, 0xCA, /* 0x1C-0x1F */ + 0xEF, 0xFC, 0xD1, 0xC4, 0xEF, 0xB1, 0x00, 0x00, /* 0x20-0x23 */ + 0xD1, 0xC5, 0x00, 0x00, 0xD0, 0xDE, 0x00, 0x00, /* 0x24-0x27 */ + 0xD9, 0xE1, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB8, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD1, 0xF3, 0xB9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE7, 0xCC, 0x00, 0x00, 0xD6, 0xA8, 0xCE, 0xA7, /* 0x48-0x4B */ + 0x00, 0x00, 0xD4, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE4, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB4, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB9, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xCB, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF6, 0xDD, 0x00, 0x00, 0xF1, 0xA3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xCC, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE9, 0xCA, 0x00, 0x00, 0xE1, 0xF0, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE0, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xFB, 0xE0, 0xF2, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEC, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEC, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEE, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xCB, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCC, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD7, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA1, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_94[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xFC, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF1, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE0, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF4, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xF7, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF1, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCA, 0xFC, 0xCA, 0xFD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCE, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF3, 0xC8, 0x00, 0x00, 0xF3, 0xBA, /* 0x7C-0x7F */ +}; + +static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xFE, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDA, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEC, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xF8, 0xCD, 0x00, 0x00, 0xCB, 0xD2, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCE, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF9, 0xD8, 0xF9, 0xD9, 0xCA, 0xE0, /* 0x90-0x93 */ + 0xDA, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCB, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC8, /* 0xA0-0xA3 */ + 0xF9, 0xEE, 0xDB, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xD0, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xD5, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE6, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xA2, /* 0xB8-0xBB */ + 0xE4, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xC4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF9, 0xEF, 0xCF, 0xF4, 0xF7, 0xE6, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCE, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF4, 0xC5, 0xDC, 0xA3, 0x00, 0x00, /* 0xE0-0xE3 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xDD, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF4, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xA1, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD6, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC1, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB9, /* 0x3C-0x3F */ + 0xF6, 0xED, 0x00, 0x00, 0xF9, 0xAE, 0x00, 0x00, /* 0x40-0x43 */ + 0xDD, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB0, /* 0x48-0x4B */ + 0xD8, 0xE8, 0xCB, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xCE, /* 0x58-0x5B */ + 0xF9, 0xF0, 0xE0, 0xED, 0xE3, 0xB3, 0xF4, 0xB3, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC2, 0xF2, 0xE6, /* 0x60-0x63 */ + 0xF0, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xD6, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEB, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE7, /* 0x70-0x73 */ + 0x00, 0x00, 0xD7, 0xD5, 0xD4, 0xB6, 0xF9, 0xE8, /* 0x74-0x77 */ + 0xD7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE5, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE9, 0xEA, 0xD7, 0xCC, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE9, 0xE2, 0xC9, /* 0x88-0x8B */ + 0x00, 0x00, 0xFC, 0xDB, 0xCD, 0xAD, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCC, 0xB0, 0xEA, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE4, 0xF6, 0xD0, 0xC0, 0x00, 0x00, 0xF0, 0xB7, /* 0x98-0x9B */ + 0xEE, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF6, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCA, /* 0xA4-0xA7 */ + 0xE2, 0xCB, 0x00, 0x00, 0xFA, 0xCF, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEB, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xCB, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xB4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0xCD, 0xE4, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xA9, 0xE4, 0xBA, 0xF3, 0xA2, 0xCD, 0xD2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF6, 0xCB, 0x00, 0x00, 0xF1, 0xE6, /* 0xC8-0xCB */ + 0xED, 0xC1, 0xE8, 0xBC, 0xEE, 0xD1, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF0, 0xE7, 0xE2, 0xCC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0xAA, 0x00, 0x00, 0xF5, 0xE1, /* 0xD8-0xDB */ + 0xED, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xEE, 0xD1, 0xF1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE9, 0xEB, 0xE9, 0xEC, 0xE0, 0xE4, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA7, /* 0xEC-0xEF */ + 0xDD, 0xD4, 0x00, 0x00, 0xEA, 0xA3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC3, 0xD6, 0xF4, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDA, 0xDF, 0x00, 0x00, 0xEF, 0xB3, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_97[512] = { + 0xE2, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xFD, 0xF2, 0xE8, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xEF, 0xC5, 0x00, 0x00, 0xE7, 0xE7, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xDF, 0xDC, 0x00, 0x00, 0xF9, 0xC7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF6, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xDF, 0xAC, 0x00, 0x00, 0xD6, 0xDA, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xDC, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF0, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xFA, 0x00, 0x00, /* 0x40-0x43 */ + 0xE4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD6, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xF4, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0xA1, 0x00, 0x00, 0xDE, 0xAA, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDA, 0xBC, 0xD8, 0xFC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFA, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xEC, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFC, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE6, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xCB, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xB9, /* 0x88-0x8B */ + 0x00, 0x00, 0xE4, 0xD3, 0x00, 0x00, 0xCD, 0xF9, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xCA, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD4, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF8, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xDB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD4, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xE5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD2, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xA4, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xFB, 0xE1, 0xFA, 0xED, 0xF0, 0xA2, 0xCC, 0xF1, /* 0x00-0x03 */ + 0x00, 0x00, 0xFA, 0xA3, 0xE2, 0xF7, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0xCE, 0x00, 0x00, 0xE9, 0xF5, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE7, 0xE8, 0xE8, 0xD7, 0xDA, 0xF8, 0xD4, 0xCB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xF6, /* 0x14-0x17 */ + 0xD6, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD4, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xFA, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xCC, 0xF2, 0xF7, 0xDD, 0x00, 0x00, 0xDE, 0xBA, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF0, 0xB9, 0xE4, 0xFE, 0xE4, 0xC9, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE4, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xEA, 0xC3, 0x00, 0x00, 0xEF, 0xB4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xFB, 0xE2, 0x00, 0x00, 0xCD, 0xD3, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xE9, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF9, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF7, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF8, 0xFD, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xFC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAB, /* 0xD8-0xDB */ + 0xDB, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xDD, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE1, 0xE2, 0xD1, 0xC6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xD0, 0xEB, 0xE6, 0xDA, 0xF9, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEC, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDE, 0xF8, 0xF8, 0xE9, 0xE3, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xFA, 0xC3, 0xE5, 0xD7, 0x00, 0x00, /* 0x08-0x0B */ + 0xEC, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF3, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xBB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE6, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xDC, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCE, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xD8, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xD0, 0xCF, 0x00, 0x00, 0xCF, 0xFA, /* 0x48-0x4B */ + 0xF3, 0xCA, 0xE0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD1, 0xC7, 0xE9, 0xAE, 0x00, 0x00, /* 0x50-0x53 */ + 0xE8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCF, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xFA, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF9, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDC, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFB, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD8, 0xA9, 0xE5, 0xDF, 0xF9, 0xA7, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF6, 0xEE, 0x00, 0x00, 0xF6, 0xCC, /* 0xB0-0xB3 */ + 0xE2, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEC, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDA, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xF1, 0xD2, 0xD2, 0xCC, 0xCF, 0xCB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xCA, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDD, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF6, 0xEF, 0x00, 0x00, 0xDE, 0xF9, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xFA, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD5, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xDE, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xDC, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xC8, 0xD1, 0xC9, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF6, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE2, 0xE1, 0xD3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD8, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xFE, /* 0x40-0x43 */ + 0x00, 0x00, 0xCF, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFD, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xCE, 0xF6, 0x00, 0x00, 0xFA, 0xD0, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF3, 0xE6, 0xBE, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF0, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD1, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFC, 0xBE, 0xD5, 0xF1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xCD, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xFA, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD0, /* 0xD0-0xD3 */ + 0xF4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCD, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE7, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA5, 0x00, 0x00, /* 0xEC-0xEF */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD1, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xA2, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xEA, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xD0, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCE, 0xDA, 0xFB, 0xEB, 0xDB, 0xA6, /* 0x40-0x43 */ + 0xDB, 0xDE, 0xD8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE0, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD8, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE0, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xC6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF8, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD5, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF7, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD8, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD7, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xCD, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCC, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_9C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xF5, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0xCA, 0x00, 0x00, 0xDC, 0xE1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xF9, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xFC, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA7, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xBE, /* 0x44-0x47 */ + 0x00, 0x00, 0xDC, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF7, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF0, 0xE8, 0x00, 0x00, 0xDD, 0xC0, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF3, /* 0xF0-0xF3 */ + 0xD9, 0xB0, 0x00, 0x00, 0xE6, 0xE9, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEC, 0x00, 0x00, /* 0x24-0x27 */ + 0xE4, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xF8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCC, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE4, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xCD, 0xDC, 0xD9, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xDD, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xCE, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD9, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF9, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xCD, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xCE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFD, 0xD3, 0xEB, 0xED, 0xD6, 0xDC, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_9E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD6, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF9, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE7, 0xA4, 0x00, 0x00, 0xD6, 0xE3, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCB, 0xD6, 0xE4, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF2, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xDE, 0xFA, 0x00, 0x00, 0xD7, 0xF8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCF, 0xD5, 0xD8, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xAB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDC, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE0, 0xA8, 0xD5, 0xF3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFD, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xCC, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD9, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xEA, /* 0xD8-0xDB */ + 0xF5, 0xF5, 0x00, 0x00, 0xEF, 0xC7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xD3, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDA, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA8, /* 0x04-0x07 */ + 0xDC, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA3, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD5, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE0, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAC, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBA, 0xEE, 0xB1, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB2, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD6, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE5, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xBB, 0x00, 0x00, /* 0x68-0x6B */ + 0xE5, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xD7, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDB, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xCA, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xCF, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_AC[512] = { + 0xB0, 0xA1, 0xB0, 0xA2, 0x81, 0x41, 0x81, 0x42, /* 0x00-0x03 */ + 0xB0, 0xA3, 0x81, 0x43, 0x81, 0x44, 0xB0, 0xA4, /* 0x04-0x07 */ + 0xB0, 0xA5, 0xB0, 0xA6, 0xB0, 0xA7, 0x81, 0x45, /* 0x08-0x0B */ + 0x81, 0x46, 0x81, 0x47, 0x81, 0x48, 0x81, 0x49, /* 0x0C-0x0F */ + 0xB0, 0xA8, 0xB0, 0xA9, 0xB0, 0xAA, 0xB0, 0xAB, /* 0x10-0x13 */ + 0xB0, 0xAC, 0xB0, 0xAD, 0xB0, 0xAE, 0xB0, 0xAF, /* 0x14-0x17 */ + 0x81, 0x4A, 0xB0, 0xB0, 0xB0, 0xB1, 0xB0, 0xB2, /* 0x18-0x1B */ + 0xB0, 0xB3, 0xB0, 0xB4, 0x81, 0x4B, 0x81, 0x4C, /* 0x1C-0x1F */ + 0xB0, 0xB5, 0x81, 0x4D, 0x81, 0x4E, 0x81, 0x4F, /* 0x20-0x23 */ + 0xB0, 0xB6, 0x81, 0x50, 0x81, 0x51, 0x81, 0x52, /* 0x24-0x27 */ + 0x81, 0x53, 0x81, 0x54, 0x81, 0x55, 0x81, 0x56, /* 0x28-0x2B */ + 0xB0, 0xB7, 0xB0, 0xB8, 0x81, 0x57, 0xB0, 0xB9, /* 0x2C-0x2F */ + 0xB0, 0xBA, 0xB0, 0xBB, 0x81, 0x58, 0x81, 0x59, /* 0x30-0x33 */ + 0x81, 0x5A, 0x81, 0x61, 0x81, 0x62, 0x81, 0x63, /* 0x34-0x37 */ + 0xB0, 0xBC, 0xB0, 0xBD, 0x81, 0x64, 0x81, 0x65, /* 0x38-0x3B */ + 0xB0, 0xBE, 0x81, 0x66, 0x81, 0x67, 0x81, 0x68, /* 0x3C-0x3F */ + 0xB0, 0xBF, 0x81, 0x69, 0x81, 0x6A, 0x81, 0x6B, /* 0x40-0x43 */ + 0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x44-0x47 */ + 0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0xB0, 0xC0, /* 0x48-0x4B */ + 0x81, 0x73, 0xB0, 0xC1, 0x81, 0x74, 0x81, 0x75, /* 0x4C-0x4F */ + 0x81, 0x76, 0x81, 0x77, 0x81, 0x78, 0x81, 0x79, /* 0x50-0x53 */ + 0xB0, 0xC2, 0x81, 0x7A, 0x81, 0x81, 0x81, 0x82, /* 0x54-0x57 */ + 0xB0, 0xC3, 0x81, 0x83, 0x81, 0x84, 0x81, 0x85, /* 0x58-0x5B */ + 0xB0, 0xC4, 0x81, 0x86, 0x81, 0x87, 0x81, 0x88, /* 0x5C-0x5F */ + 0x81, 0x89, 0x81, 0x8A, 0x81, 0x8B, 0x81, 0x8C, /* 0x60-0x63 */ + 0x81, 0x8D, 0x81, 0x8E, 0x81, 0x8F, 0x81, 0x90, /* 0x64-0x67 */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0x81, 0x94, /* 0x68-0x6B */ + 0x81, 0x95, 0x81, 0x96, 0x81, 0x97, 0x81, 0x98, /* 0x6C-0x6F */ + 0xB0, 0xC5, 0xB0, 0xC6, 0x81, 0x99, 0x81, 0x9A, /* 0x70-0x73 */ + 0xB0, 0xC7, 0x81, 0x9B, 0x81, 0x9C, 0xB0, 0xC8, /* 0x74-0x77 */ + 0xB0, 0xC9, 0x81, 0x9D, 0xB0, 0xCA, 0x81, 0x9E, /* 0x78-0x7B */ + 0x81, 0x9F, 0x81, 0xA0, 0x81, 0xA1, 0x81, 0xA2, /* 0x7C-0x7F */ + + 0xB0, 0xCB, 0xB0, 0xCC, 0x81, 0xA3, 0xB0, 0xCD, /* 0x80-0x83 */ + 0xB0, 0xCE, 0xB0, 0xCF, 0xB0, 0xD0, 0x81, 0xA4, /* 0x84-0x87 */ + 0x81, 0xA5, 0xB0, 0xD1, 0xB0, 0xD2, 0xB0, 0xD3, /* 0x88-0x8B */ + 0xB0, 0xD4, 0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, /* 0x8C-0x8F */ + 0xB0, 0xD5, 0x81, 0xA9, 0x81, 0xAA, 0x81, 0xAB, /* 0x90-0x93 */ + 0xB0, 0xD6, 0x81, 0xAC, 0x81, 0xAD, 0x81, 0xAE, /* 0x94-0x97 */ + 0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, 0x81, 0xB2, /* 0x98-0x9B */ + 0xB0, 0xD7, 0xB0, 0xD8, 0x81, 0xB3, 0xB0, 0xD9, /* 0x9C-0x9F */ + 0xB0, 0xDA, 0xB0, 0xDB, 0x81, 0xB4, 0x81, 0xB5, /* 0xA0-0xA3 */ + 0x81, 0xB6, 0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, /* 0xA4-0xA7 */ + 0xB0, 0xDC, 0xB0, 0xDD, 0xB0, 0xDE, 0x81, 0xBA, /* 0xA8-0xAB */ + 0xB0, 0xDF, 0x81, 0xBB, 0x81, 0xBC, 0xB0, 0xE0, /* 0xAC-0xAF */ + 0xB0, 0xE1, 0x81, 0xBD, 0x81, 0xBE, 0x81, 0xBF, /* 0xB0-0xB3 */ + 0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0x81, 0xC3, /* 0xB4-0xB7 */ + 0xB0, 0xE2, 0xB0, 0xE3, 0x81, 0xC4, 0xB0, 0xE4, /* 0xB8-0xBB */ + 0xB0, 0xE5, 0xB0, 0xE6, 0x81, 0xC5, 0x81, 0xC6, /* 0xBC-0xBF */ + 0x81, 0xC7, 0xB0, 0xE7, 0x81, 0xC8, 0x81, 0xC9, /* 0xC0-0xC3 */ + 0xB0, 0xE8, 0x81, 0xCA, 0x81, 0xCB, 0x81, 0xCC, /* 0xC4-0xC7 */ + 0xB0, 0xE9, 0x81, 0xCD, 0x81, 0xCE, 0x81, 0xCF, /* 0xC8-0xCB */ + 0xB0, 0xEA, 0x81, 0xD0, 0x81, 0xD1, 0x81, 0xD2, /* 0xCC-0xCF */ + 0x81, 0xD3, 0x81, 0xD4, 0x81, 0xD5, 0x81, 0xD6, /* 0xD0-0xD3 */ + 0x81, 0xD7, 0xB0, 0xEB, 0x81, 0xD8, 0xB0, 0xEC, /* 0xD4-0xD7 */ + 0x81, 0xD9, 0x81, 0xDA, 0x81, 0xDB, 0x81, 0xDC, /* 0xD8-0xDB */ + 0x81, 0xDD, 0x81, 0xDE, 0x81, 0xDF, 0x81, 0xE0, /* 0xDC-0xDF */ + 0xB0, 0xED, 0xB0, 0xEE, 0x81, 0xE1, 0x81, 0xE2, /* 0xE0-0xE3 */ + 0xB0, 0xEF, 0x81, 0xE3, 0x81, 0xE4, 0xB0, 0xF0, /* 0xE4-0xE7 */ + 0xB0, 0xF1, 0x81, 0xE5, 0xB0, 0xF2, 0x81, 0xE6, /* 0xE8-0xEB */ + 0xB0, 0xF3, 0x81, 0xE7, 0x81, 0xE8, 0xB0, 0xF4, /* 0xEC-0xEF */ + 0xB0, 0xF5, 0xB0, 0xF6, 0x81, 0xE9, 0xB0, 0xF7, /* 0xF0-0xF3 */ + 0x81, 0xEA, 0xB0, 0xF8, 0xB0, 0xF9, 0x81, 0xEB, /* 0xF4-0xF7 */ + 0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, 0x81, 0xEF, /* 0xF8-0xFB */ + 0xB0, 0xFA, 0xB0, 0xFB, 0x81, 0xF0, 0x81, 0xF1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AD[512] = { + 0xB0, 0xFC, 0x81, 0xF2, 0x81, 0xF3, 0x81, 0xF4, /* 0x00-0x03 */ + 0xB0, 0xFD, 0x81, 0xF5, 0xB0, 0xFE, 0x81, 0xF6, /* 0x04-0x07 */ + 0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0x81, 0xFA, /* 0x08-0x0B */ + 0xB1, 0xA1, 0xB1, 0xA2, 0x81, 0xFB, 0xB1, 0xA3, /* 0x0C-0x0F */ + 0x81, 0xFC, 0xB1, 0xA4, 0x81, 0xFD, 0x81, 0xFE, /* 0x10-0x13 */ + 0x82, 0x41, 0x82, 0x42, 0x82, 0x43, 0x82, 0x44, /* 0x14-0x17 */ + 0xB1, 0xA5, 0x82, 0x45, 0x82, 0x46, 0x82, 0x47, /* 0x18-0x1B */ + 0xB1, 0xA6, 0x82, 0x48, 0x82, 0x49, 0x82, 0x4A, /* 0x1C-0x1F */ + 0xB1, 0xA7, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0x20-0x23 */ + 0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0x82, 0x51, /* 0x24-0x27 */ + 0x82, 0x52, 0xB1, 0xA8, 0x82, 0x53, 0x82, 0x54, /* 0x28-0x2B */ + 0xB1, 0xA9, 0xB1, 0xAA, 0x82, 0x55, 0x82, 0x56, /* 0x2C-0x2F */ + 0x82, 0x57, 0x82, 0x58, 0x82, 0x59, 0x82, 0x5A, /* 0x30-0x33 */ + 0xB1, 0xAB, 0xB1, 0xAC, 0x82, 0x61, 0x82, 0x62, /* 0x34-0x37 */ + 0xB1, 0xAD, 0x82, 0x63, 0x82, 0x64, 0x82, 0x65, /* 0x38-0x3B */ + 0xB1, 0xAE, 0x82, 0x66, 0x82, 0x67, 0x82, 0x68, /* 0x3C-0x3F */ + 0x82, 0x69, 0x82, 0x6A, 0x82, 0x6B, 0x82, 0x6C, /* 0x40-0x43 */ + 0xB1, 0xAF, 0xB1, 0xB0, 0x82, 0x6D, 0xB1, 0xB1, /* 0x44-0x47 */ + 0x82, 0x6E, 0xB1, 0xB2, 0x82, 0x6F, 0x82, 0x70, /* 0x48-0x4B */ + 0x82, 0x71, 0x82, 0x72, 0x82, 0x73, 0x82, 0x74, /* 0x4C-0x4F */ + 0xB1, 0xB3, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x50-0x53 */ + 0xB1, 0xB4, 0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, /* 0x54-0x57 */ + 0xB1, 0xB5, 0x82, 0x81, 0x82, 0x82, 0x82, 0x83, /* 0x58-0x5B */ + 0x82, 0x84, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x5C-0x5F */ + 0x82, 0x88, 0xB1, 0xB6, 0x82, 0x89, 0xB1, 0xB7, /* 0x60-0x63 */ + 0x82, 0x8A, 0x82, 0x8B, 0x82, 0x8C, 0x82, 0x8D, /* 0x64-0x67 */ + 0x82, 0x8E, 0x82, 0x8F, 0x82, 0x90, 0x82, 0x91, /* 0x68-0x6B */ + 0xB1, 0xB8, 0xB1, 0xB9, 0x82, 0x92, 0x82, 0x93, /* 0x6C-0x6F */ + 0xB1, 0xBA, 0x82, 0x94, 0x82, 0x95, 0xB1, 0xBB, /* 0x70-0x73 */ + 0xB1, 0xBC, 0xB1, 0xBD, 0xB1, 0xBE, 0x82, 0x96, /* 0x74-0x77 */ + 0x82, 0x97, 0x82, 0x98, 0x82, 0x99, 0xB1, 0xBF, /* 0x78-0x7B */ + 0xB1, 0xC0, 0xB1, 0xC1, 0x82, 0x9A, 0xB1, 0xC2, /* 0x7C-0x7F */ + + 0x82, 0x9B, 0xB1, 0xC3, 0xB1, 0xC4, 0x82, 0x9C, /* 0x80-0x83 */ + 0x82, 0x9D, 0x82, 0x9E, 0x82, 0x9F, 0x82, 0xA0, /* 0x84-0x87 */ + 0xB1, 0xC5, 0xB1, 0xC6, 0x82, 0xA1, 0x82, 0xA2, /* 0x88-0x8B */ + 0xB1, 0xC7, 0x82, 0xA3, 0x82, 0xA4, 0x82, 0xA5, /* 0x8C-0x8F */ + 0xB1, 0xC8, 0x82, 0xA6, 0x82, 0xA7, 0x82, 0xA8, /* 0x90-0x93 */ + 0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x94-0x97 */ + 0x82, 0xAD, 0x82, 0xAE, 0x82, 0xAF, 0x82, 0xB0, /* 0x98-0x9B */ + 0xB1, 0xC9, 0xB1, 0xCA, 0x82, 0xB1, 0x82, 0xB2, /* 0x9C-0x9F */ + 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, 0x82, 0xB6, /* 0xA0-0xA3 */ + 0xB1, 0xCB, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0xA4-0xA7 */ + 0x82, 0xBA, 0x82, 0xBB, 0x82, 0xBC, 0x82, 0xBD, /* 0xA8-0xAB */ + 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, 0x82, 0xC1, /* 0xAC-0xAF */ + 0x82, 0xC2, 0x82, 0xC3, 0x82, 0xC4, 0x82, 0xC5, /* 0xB0-0xB3 */ + 0x82, 0xC6, 0x82, 0xC7, 0x82, 0xC8, 0xB1, 0xCC, /* 0xB4-0xB7 */ + 0x82, 0xC9, 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, /* 0xB8-0xBB */ + 0x82, 0xCD, 0x82, 0xCE, 0x82, 0xCF, 0x82, 0xD0, /* 0xBC-0xBF */ + 0xB1, 0xCD, 0xB1, 0xCE, 0x82, 0xD1, 0x82, 0xD2, /* 0xC0-0xC3 */ + 0xB1, 0xCF, 0x82, 0xD3, 0x82, 0xD4, 0x82, 0xD5, /* 0xC4-0xC7 */ + 0xB1, 0xD0, 0x82, 0xD6, 0x82, 0xD7, 0x82, 0xD8, /* 0xC8-0xCB */ + 0x82, 0xD9, 0x82, 0xDA, 0x82, 0xDB, 0x82, 0xDC, /* 0xCC-0xCF */ + 0xB1, 0xD1, 0xB1, 0xD2, 0x82, 0xDD, 0xB1, 0xD3, /* 0xD0-0xD3 */ + 0x82, 0xDE, 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, /* 0xD4-0xD7 */ + 0x82, 0xE2, 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, /* 0xD8-0xDB */ + 0xB1, 0xD4, 0x82, 0xE6, 0x82, 0xE7, 0x82, 0xE8, /* 0xDC-0xDF */ + 0xB1, 0xD5, 0x82, 0xE9, 0x82, 0xEA, 0x82, 0xEB, /* 0xE0-0xE3 */ + 0xB1, 0xD6, 0x82, 0xEC, 0x82, 0xED, 0x82, 0xEE, /* 0xE4-0xE7 */ + 0x82, 0xEF, 0x82, 0xF0, 0x82, 0xF1, 0x82, 0xF2, /* 0xE8-0xEB */ + 0x82, 0xF3, 0x82, 0xF4, 0x82, 0xF5, 0x82, 0xF6, /* 0xEC-0xEF */ + 0x82, 0xF7, 0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, /* 0xF0-0xF3 */ + 0x82, 0xFB, 0x82, 0xFC, 0x82, 0xFD, 0x82, 0xFE, /* 0xF4-0xF7 */ + 0xB1, 0xD7, 0xB1, 0xD8, 0x83, 0x41, 0x83, 0x42, /* 0xF8-0xFB */ + 0xB1, 0xD9, 0x83, 0x43, 0x83, 0x44, 0xB1, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AE[512] = { + 0xB1, 0xDB, 0xB1, 0xDC, 0x83, 0x45, 0x83, 0x46, /* 0x00-0x03 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0x04-0x07 */ + 0xB1, 0xDD, 0xB1, 0xDE, 0x83, 0x4B, 0xB1, 0xDF, /* 0x08-0x0B */ + 0x83, 0x4C, 0xB1, 0xE0, 0x83, 0x4D, 0x83, 0x4E, /* 0x0C-0x0F */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0x83, 0x52, /* 0x10-0x13 */ + 0xB1, 0xE1, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0x14-0x17 */ + 0x83, 0x56, 0x83, 0x57, 0x83, 0x58, 0x83, 0x59, /* 0x18-0x1B */ + 0x83, 0x5A, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0x1C-0x1F */ + 0x83, 0x64, 0x83, 0x65, 0x83, 0x66, 0x83, 0x67, /* 0x20-0x23 */ + 0x83, 0x68, 0x83, 0x69, 0x83, 0x6A, 0x83, 0x6B, /* 0x24-0x27 */ + 0x83, 0x6C, 0x83, 0x6D, 0x83, 0x6E, 0x83, 0x6F, /* 0x28-0x2B */ + 0x83, 0x70, 0x83, 0x71, 0x83, 0x72, 0x83, 0x73, /* 0x2C-0x2F */ + 0xB1, 0xE2, 0xB1, 0xE3, 0x83, 0x74, 0x83, 0x75, /* 0x30-0x33 */ + 0xB1, 0xE4, 0x83, 0x76, 0x83, 0x77, 0xB1, 0xE5, /* 0x34-0x37 */ + 0xB1, 0xE6, 0x83, 0x78, 0xB1, 0xE7, 0x83, 0x79, /* 0x38-0x3B */ + 0x83, 0x7A, 0x83, 0x81, 0x83, 0x82, 0x83, 0x83, /* 0x3C-0x3F */ + 0xB1, 0xE8, 0xB1, 0xE9, 0x83, 0x84, 0xB1, 0xEA, /* 0x40-0x43 */ + 0x83, 0x85, 0xB1, 0xEB, 0xB1, 0xEC, 0x83, 0x86, /* 0x44-0x47 */ + 0x83, 0x87, 0x83, 0x88, 0xB1, 0xED, 0x83, 0x89, /* 0x48-0x4B */ + 0xB1, 0xEE, 0xB1, 0xEF, 0xB1, 0xF0, 0x83, 0x8A, /* 0x4C-0x4F */ + 0xB1, 0xF1, 0x83, 0x8B, 0x83, 0x8C, 0x83, 0x8D, /* 0x50-0x53 */ + 0xB1, 0xF2, 0x83, 0x8E, 0xB1, 0xF3, 0x83, 0x8F, /* 0x54-0x57 */ + 0x83, 0x90, 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, /* 0x58-0x5B */ + 0xB1, 0xF4, 0xB1, 0xF5, 0x83, 0x94, 0xB1, 0xF6, /* 0x5C-0x5F */ + 0xB1, 0xF7, 0xB1, 0xF8, 0x83, 0x95, 0x83, 0x96, /* 0x60-0x63 */ + 0x83, 0x97, 0xB1, 0xF9, 0x83, 0x98, 0x83, 0x99, /* 0x64-0x67 */ + 0xB1, 0xFA, 0xB1, 0xFB, 0x83, 0x9A, 0x83, 0x9B, /* 0x68-0x6B */ + 0xB1, 0xFC, 0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, /* 0x6C-0x6F */ + 0xB1, 0xFD, 0x83, 0x9F, 0x83, 0xA0, 0x83, 0xA1, /* 0x70-0x73 */ + 0x83, 0xA2, 0x83, 0xA3, 0x83, 0xA4, 0x83, 0xA5, /* 0x74-0x77 */ + 0xB1, 0xFE, 0xB2, 0xA1, 0x83, 0xA6, 0xB2, 0xA2, /* 0x78-0x7B */ + 0xB2, 0xA3, 0xB2, 0xA4, 0x83, 0xA7, 0x83, 0xA8, /* 0x7C-0x7F */ + + 0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, 0x83, 0xAC, /* 0x80-0x83 */ + 0xB2, 0xA5, 0xB2, 0xA6, 0x83, 0xAD, 0x83, 0xAE, /* 0x84-0x87 */ + 0x83, 0xAF, 0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, /* 0x88-0x8B */ + 0xB2, 0xA7, 0x83, 0xB3, 0x83, 0xB4, 0x83, 0xB5, /* 0x8C-0x8F */ + 0x83, 0xB6, 0x83, 0xB7, 0x83, 0xB8, 0x83, 0xB9, /* 0x90-0x93 */ + 0x83, 0xBA, 0x83, 0xBB, 0x83, 0xBC, 0x83, 0xBD, /* 0x94-0x97 */ + 0x83, 0xBE, 0x83, 0xBF, 0x83, 0xC0, 0x83, 0xC1, /* 0x98-0x9B */ + 0x83, 0xC2, 0x83, 0xC3, 0x83, 0xC4, 0x83, 0xC5, /* 0x9C-0x9F */ + 0x83, 0xC6, 0x83, 0xC7, 0x83, 0xC8, 0x83, 0xC9, /* 0xA0-0xA3 */ + 0x83, 0xCA, 0x83, 0xCB, 0x83, 0xCC, 0x83, 0xCD, /* 0xA4-0xA7 */ + 0x83, 0xCE, 0x83, 0xCF, 0x83, 0xD0, 0x83, 0xD1, /* 0xA8-0xAB */ + 0x83, 0xD2, 0x83, 0xD3, 0x83, 0xD4, 0x83, 0xD5, /* 0xAC-0xAF */ + 0x83, 0xD6, 0x83, 0xD7, 0x83, 0xD8, 0x83, 0xD9, /* 0xB0-0xB3 */ + 0x83, 0xDA, 0x83, 0xDB, 0x83, 0xDC, 0x83, 0xDD, /* 0xB4-0xB7 */ + 0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, 0x83, 0xE1, /* 0xB8-0xBB */ + 0xB2, 0xA8, 0xB2, 0xA9, 0xB2, 0xAA, 0x83, 0xE2, /* 0xBC-0xBF */ + 0xB2, 0xAB, 0x83, 0xE3, 0x83, 0xE4, 0x83, 0xE5, /* 0xC0-0xC3 */ + 0xB2, 0xAC, 0x83, 0xE6, 0x83, 0xE7, 0x83, 0xE8, /* 0xC4-0xC7 */ + 0x83, 0xE9, 0x83, 0xEA, 0x83, 0xEB, 0x83, 0xEC, /* 0xC8-0xCB */ + 0xB2, 0xAD, 0xB2, 0xAE, 0x83, 0xED, 0xB2, 0xAF, /* 0xCC-0xCF */ + 0xB2, 0xB0, 0xB2, 0xB1, 0x83, 0xEE, 0x83, 0xEF, /* 0xD0-0xD3 */ + 0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, 0x83, 0xF3, /* 0xD4-0xD7 */ + 0xB2, 0xB2, 0xB2, 0xB3, 0x83, 0xF4, 0x83, 0xF5, /* 0xD8-0xDB */ + 0xB2, 0xB4, 0x83, 0xF6, 0x83, 0xF7, 0x83, 0xF8, /* 0xDC-0xDF */ + 0x83, 0xF9, 0x83, 0xFA, 0x83, 0xFB, 0x83, 0xFC, /* 0xE0-0xE3 */ + 0x83, 0xFD, 0x83, 0xFE, 0x84, 0x41, 0x84, 0x42, /* 0xE4-0xE7 */ + 0xB2, 0xB5, 0x84, 0x43, 0x84, 0x44, 0xB2, 0xB6, /* 0xE8-0xEB */ + 0x84, 0x45, 0xB2, 0xB7, 0x84, 0x46, 0x84, 0x47, /* 0xEC-0xEF */ + 0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, 0x84, 0x4B, /* 0xF0-0xF3 */ + 0xB2, 0xB8, 0x84, 0x4C, 0x84, 0x4D, 0x84, 0x4E, /* 0xF4-0xF7 */ + 0xB2, 0xB9, 0x84, 0x4F, 0x84, 0x50, 0x84, 0x51, /* 0xF8-0xFB */ + 0xB2, 0xBA, 0x84, 0x52, 0x84, 0x53, 0x84, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AF[512] = { + 0x84, 0x55, 0x84, 0x56, 0x84, 0x57, 0x84, 0x58, /* 0x00-0x03 */ + 0x84, 0x59, 0x84, 0x5A, 0x84, 0x61, 0xB2, 0xBB, /* 0x04-0x07 */ + 0xB2, 0xBC, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x08-0x0B */ + 0x84, 0x65, 0xB2, 0xBD, 0x84, 0x66, 0x84, 0x67, /* 0x0C-0x0F */ + 0xB2, 0xBE, 0x84, 0x68, 0x84, 0x69, 0x84, 0x6A, /* 0x10-0x13 */ + 0x84, 0x6B, 0x84, 0x6C, 0x84, 0x6D, 0x84, 0x6E, /* 0x14-0x17 */ + 0x84, 0x6F, 0x84, 0x70, 0x84, 0x71, 0x84, 0x72, /* 0x18-0x1B */ + 0x84, 0x73, 0x84, 0x74, 0x84, 0x75, 0x84, 0x76, /* 0x1C-0x1F */ + 0x84, 0x77, 0x84, 0x78, 0x84, 0x79, 0x84, 0x7A, /* 0x20-0x23 */ + 0x84, 0x81, 0x84, 0x82, 0x84, 0x83, 0x84, 0x84, /* 0x24-0x27 */ + 0x84, 0x85, 0x84, 0x86, 0x84, 0x87, 0x84, 0x88, /* 0x28-0x2B */ + 0xB2, 0xBF, 0xB2, 0xC0, 0x84, 0x89, 0x84, 0x8A, /* 0x2C-0x2F */ + 0xB2, 0xC1, 0x84, 0x8B, 0xB2, 0xC2, 0x84, 0x8C, /* 0x30-0x33 */ + 0xB2, 0xC3, 0x84, 0x8D, 0x84, 0x8E, 0x84, 0x8F, /* 0x34-0x37 */ + 0x84, 0x90, 0x84, 0x91, 0x84, 0x92, 0x84, 0x93, /* 0x38-0x3B */ + 0xB2, 0xC4, 0xB2, 0xC5, 0x84, 0x94, 0xB2, 0xC6, /* 0x3C-0x3F */ + 0x84, 0x95, 0xB2, 0xC7, 0xB2, 0xC8, 0xB2, 0xC9, /* 0x40-0x43 */ + 0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x44-0x47 */ + 0xB2, 0xCA, 0xB2, 0xCB, 0x84, 0x9A, 0x84, 0x9B, /* 0x48-0x4B */ + 0x84, 0x9C, 0x84, 0x9D, 0x84, 0x9E, 0x84, 0x9F, /* 0x4C-0x4F */ + 0xB2, 0xCC, 0x84, 0xA0, 0x84, 0xA1, 0x84, 0xA2, /* 0x50-0x53 */ + 0x84, 0xA3, 0x84, 0xA4, 0x84, 0xA5, 0x84, 0xA6, /* 0x54-0x57 */ + 0x84, 0xA7, 0x84, 0xA8, 0x84, 0xA9, 0x84, 0xAA, /* 0x58-0x5B */ + 0xB2, 0xCD, 0xB2, 0xCE, 0x84, 0xAB, 0x84, 0xAC, /* 0x5C-0x5F */ + 0x84, 0xAD, 0x84, 0xAE, 0x84, 0xAF, 0x84, 0xB0, /* 0x60-0x63 */ + 0xB2, 0xCF, 0xB2, 0xD0, 0x84, 0xB1, 0x84, 0xB2, /* 0x64-0x67 */ + 0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0x68-0x6B */ + 0x84, 0xB7, 0x84, 0xB8, 0x84, 0xB9, 0x84, 0xBA, /* 0x6C-0x6F */ + 0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, 0x84, 0xBE, /* 0x70-0x73 */ + 0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, 0x84, 0xC2, /* 0x74-0x77 */ + 0x84, 0xC3, 0xB2, 0xD1, 0x84, 0xC4, 0x84, 0xC5, /* 0x78-0x7B */ + 0x84, 0xC6, 0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, /* 0x7C-0x7F */ + + 0xB2, 0xD2, 0x84, 0xCA, 0x84, 0xCB, 0x84, 0xCC, /* 0x80-0x83 */ + 0xB2, 0xD3, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0x84-0x87 */ + 0xB2, 0xD4, 0x84, 0xD0, 0x84, 0xD1, 0x84, 0xD2, /* 0x88-0x8B */ + 0x84, 0xD3, 0x84, 0xD4, 0x84, 0xD5, 0x84, 0xD6, /* 0x8C-0x8F */ + 0xB2, 0xD5, 0xB2, 0xD6, 0x84, 0xD7, 0x84, 0xD8, /* 0x90-0x93 */ + 0x84, 0xD9, 0xB2, 0xD7, 0x84, 0xDA, 0x84, 0xDB, /* 0x94-0x97 */ + 0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, 0x84, 0xDF, /* 0x98-0x9B */ + 0xB2, 0xD8, 0x84, 0xE0, 0x84, 0xE1, 0x84, 0xE2, /* 0x9C-0x9F */ + 0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, 0x84, 0xE6, /* 0xA0-0xA3 */ + 0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, 0x84, 0xEA, /* 0xA4-0xA7 */ + 0x84, 0xEB, 0x84, 0xEC, 0x84, 0xED, 0x84, 0xEE, /* 0xA8-0xAB */ + 0x84, 0xEF, 0x84, 0xF0, 0x84, 0xF1, 0x84, 0xF2, /* 0xAC-0xAF */ + 0x84, 0xF3, 0x84, 0xF4, 0x84, 0xF5, 0x84, 0xF6, /* 0xB0-0xB3 */ + 0x84, 0xF7, 0x84, 0xF8, 0x84, 0xF9, 0x84, 0xFA, /* 0xB4-0xB7 */ + 0xB2, 0xD9, 0xB2, 0xDA, 0x84, 0xFB, 0x84, 0xFC, /* 0xB8-0xBB */ + 0xB2, 0xDB, 0x84, 0xFD, 0x84, 0xFE, 0x85, 0x41, /* 0xBC-0xBF */ + 0xB2, 0xDC, 0x85, 0x42, 0x85, 0x43, 0x85, 0x44, /* 0xC0-0xC3 */ + 0x85, 0x45, 0x85, 0x46, 0x85, 0x47, 0xB2, 0xDD, /* 0xC4-0xC7 */ + 0xB2, 0xDE, 0xB2, 0xDF, 0x85, 0x48, 0xB2, 0xE0, /* 0xC8-0xCB */ + 0x85, 0x49, 0xB2, 0xE1, 0xB2, 0xE2, 0x85, 0x4A, /* 0xCC-0xCF */ + 0x85, 0x4B, 0x85, 0x4C, 0x85, 0x4D, 0x85, 0x4E, /* 0xD0-0xD3 */ + 0xB2, 0xE3, 0x85, 0x4F, 0x85, 0x50, 0x85, 0x51, /* 0xD4-0xD7 */ + 0x85, 0x52, 0x85, 0x53, 0x85, 0x54, 0x85, 0x55, /* 0xD8-0xDB */ + 0xB2, 0xE4, 0x85, 0x56, 0x85, 0x57, 0x85, 0x58, /* 0xDC-0xDF */ + 0x85, 0x59, 0x85, 0x5A, 0x85, 0x61, 0x85, 0x62, /* 0xE0-0xE3 */ + 0x85, 0x63, 0x85, 0x64, 0x85, 0x65, 0x85, 0x66, /* 0xE4-0xE7 */ + 0xB2, 0xE5, 0xB2, 0xE6, 0x85, 0x67, 0x85, 0x68, /* 0xE8-0xEB */ + 0x85, 0x69, 0x85, 0x6A, 0x85, 0x6B, 0x85, 0x6C, /* 0xEC-0xEF */ + 0xB2, 0xE7, 0xB2, 0xE8, 0x85, 0x6D, 0x85, 0x6E, /* 0xF0-0xF3 */ + 0xB2, 0xE9, 0x85, 0x6F, 0x85, 0x70, 0x85, 0x71, /* 0xF4-0xF7 */ + 0xB2, 0xEA, 0x85, 0x72, 0x85, 0x73, 0x85, 0x74, /* 0xF8-0xFB */ + 0x85, 0x75, 0x85, 0x76, 0x85, 0x77, 0x85, 0x78, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B0[512] = { + 0xB2, 0xEB, 0xB2, 0xEC, 0x85, 0x79, 0x85, 0x7A, /* 0x00-0x03 */ + 0xB2, 0xED, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x04-0x07 */ + 0x85, 0x84, 0x85, 0x85, 0x85, 0x86, 0x85, 0x87, /* 0x08-0x0B */ + 0xB2, 0xEE, 0x85, 0x88, 0x85, 0x89, 0x85, 0x8A, /* 0x0C-0x0F */ + 0xB2, 0xEF, 0x85, 0x8B, 0x85, 0x8C, 0x85, 0x8D, /* 0x10-0x13 */ + 0xB2, 0xF0, 0x85, 0x8E, 0x85, 0x8F, 0x85, 0x90, /* 0x14-0x17 */ + 0x85, 0x91, 0x85, 0x92, 0x85, 0x93, 0x85, 0x94, /* 0x18-0x1B */ + 0xB2, 0xF1, 0xB2, 0xF2, 0x85, 0x95, 0x85, 0x96, /* 0x1C-0x1F */ + 0x85, 0x97, 0x85, 0x98, 0x85, 0x99, 0x85, 0x9A, /* 0x20-0x23 */ + 0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0x85, 0x9E, /* 0x24-0x27 */ + 0xB2, 0xF3, 0x85, 0x9F, 0x85, 0xA0, 0x85, 0xA1, /* 0x28-0x2B */ + 0x85, 0xA2, 0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, /* 0x2C-0x2F */ + 0x85, 0xA6, 0x85, 0xA7, 0x85, 0xA8, 0x85, 0xA9, /* 0x30-0x33 */ + 0x85, 0xAA, 0x85, 0xAB, 0x85, 0xAC, 0x85, 0xAD, /* 0x34-0x37 */ + 0x85, 0xAE, 0x85, 0xAF, 0x85, 0xB0, 0x85, 0xB1, /* 0x38-0x3B */ + 0x85, 0xB2, 0x85, 0xB3, 0x85, 0xB4, 0x85, 0xB5, /* 0x3C-0x3F */ + 0x85, 0xB6, 0x85, 0xB7, 0x85, 0xB8, 0x85, 0xB9, /* 0x40-0x43 */ + 0xB2, 0xF4, 0xB2, 0xF5, 0x85, 0xBA, 0x85, 0xBB, /* 0x44-0x47 */ + 0xB2, 0xF6, 0x85, 0xBC, 0xB2, 0xF7, 0x85, 0xBD, /* 0x48-0x4B */ + 0xB2, 0xF8, 0x85, 0xBE, 0xB2, 0xF9, 0x85, 0xBF, /* 0x4C-0x4F */ + 0x85, 0xC0, 0x85, 0xC1, 0x85, 0xC2, 0xB2, 0xFA, /* 0x50-0x53 */ + 0xB2, 0xFB, 0xB2, 0xFC, 0x85, 0xC3, 0xB2, 0xFD, /* 0x54-0x57 */ + 0x85, 0xC4, 0xB2, 0xFE, 0x85, 0xC5, 0x85, 0xC6, /* 0x58-0x5B */ + 0x85, 0xC7, 0xB3, 0xA1, 0x85, 0xC8, 0x85, 0xC9, /* 0x5C-0x5F */ + 0x85, 0xCA, 0x85, 0xCB, 0x85, 0xCC, 0x85, 0xCD, /* 0x60-0x63 */ + 0x85, 0xCE, 0x85, 0xCF, 0x85, 0xD0, 0x85, 0xD1, /* 0x64-0x67 */ + 0x85, 0xD2, 0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, /* 0x68-0x6B */ + 0x85, 0xD6, 0x85, 0xD7, 0x85, 0xD8, 0x85, 0xD9, /* 0x6C-0x6F */ + 0x85, 0xDA, 0x85, 0xDB, 0x85, 0xDC, 0x85, 0xDD, /* 0x70-0x73 */ + 0x85, 0xDE, 0x85, 0xDF, 0x85, 0xE0, 0x85, 0xE1, /* 0x74-0x77 */ + 0x85, 0xE2, 0x85, 0xE3, 0x85, 0xE4, 0x85, 0xE5, /* 0x78-0x7B */ + 0xB3, 0xA2, 0xB3, 0xA3, 0x85, 0xE6, 0x85, 0xE7, /* 0x7C-0x7F */ + + 0xB3, 0xA4, 0x85, 0xE8, 0x85, 0xE9, 0x85, 0xEA, /* 0x80-0x83 */ + 0xB3, 0xA5, 0x85, 0xEB, 0x85, 0xEC, 0x85, 0xED, /* 0x84-0x87 */ + 0x85, 0xEE, 0x85, 0xEF, 0x85, 0xF0, 0x85, 0xF1, /* 0x88-0x8B */ + 0xB3, 0xA6, 0xB3, 0xA7, 0x85, 0xF2, 0xB3, 0xA8, /* 0x8C-0x8F */ + 0x85, 0xF3, 0xB3, 0xA9, 0x85, 0xF4, 0x85, 0xF5, /* 0x90-0x93 */ + 0x85, 0xF6, 0x85, 0xF7, 0x85, 0xF8, 0x85, 0xF9, /* 0x94-0x97 */ + 0xB3, 0xAA, 0xB3, 0xAB, 0xB3, 0xAC, 0x85, 0xFA, /* 0x98-0x9B */ + 0xB3, 0xAD, 0x85, 0xFB, 0x85, 0xFC, 0xB3, 0xAE, /* 0x9C-0x9F */ + 0xB3, 0xAF, 0xB3, 0xB0, 0xB3, 0xB1, 0x85, 0xFD, /* 0xA0-0xA3 */ + 0x85, 0xFE, 0x86, 0x41, 0x86, 0x42, 0x86, 0x43, /* 0xA4-0xA7 */ + 0xB3, 0xB2, 0xB3, 0xB3, 0x86, 0x44, 0xB3, 0xB4, /* 0xA8-0xAB */ + 0xB3, 0xB5, 0xB3, 0xB6, 0xB3, 0xB7, 0xB3, 0xB8, /* 0xAC-0xAF */ + 0x86, 0x45, 0xB3, 0xB9, 0x86, 0x46, 0xB3, 0xBA, /* 0xB0-0xB3 */ + 0xB3, 0xBB, 0xB3, 0xBC, 0x86, 0x47, 0x86, 0x48, /* 0xB4-0xB7 */ + 0xB3, 0xBD, 0x86, 0x49, 0x86, 0x4A, 0x86, 0x4B, /* 0xB8-0xBB */ + 0xB3, 0xBE, 0x86, 0x4C, 0x86, 0x4D, 0x86, 0x4E, /* 0xBC-0xBF */ + 0x86, 0x4F, 0x86, 0x50, 0x86, 0x51, 0x86, 0x52, /* 0xC0-0xC3 */ + 0xB3, 0xBF, 0xB3, 0xC0, 0x86, 0x53, 0xB3, 0xC1, /* 0xC4-0xC7 */ + 0xB3, 0xC2, 0xB3, 0xC3, 0x86, 0x54, 0x86, 0x55, /* 0xC8-0xCB */ + 0x86, 0x56, 0x86, 0x57, 0x86, 0x58, 0x86, 0x59, /* 0xCC-0xCF */ + 0xB3, 0xC4, 0xB3, 0xC5, 0x86, 0x5A, 0x86, 0x61, /* 0xD0-0xD3 */ + 0xB3, 0xC6, 0x86, 0x62, 0x86, 0x63, 0x86, 0x64, /* 0xD4-0xD7 */ + 0xB3, 0xC7, 0x86, 0x65, 0x86, 0x66, 0x86, 0x67, /* 0xD8-0xDB */ + 0x86, 0x68, 0x86, 0x69, 0x86, 0x6A, 0x86, 0x6B, /* 0xDC-0xDF */ + 0xB3, 0xC8, 0x86, 0x6C, 0x86, 0x6D, 0x86, 0x6E, /* 0xE0-0xE3 */ + 0x86, 0x6F, 0xB3, 0xC9, 0x86, 0x70, 0x86, 0x71, /* 0xE4-0xE7 */ + 0x86, 0x72, 0x86, 0x73, 0x86, 0x74, 0x86, 0x75, /* 0xE8-0xEB */ + 0x86, 0x76, 0x86, 0x77, 0x86, 0x78, 0x86, 0x79, /* 0xEC-0xEF */ + 0x86, 0x7A, 0x86, 0x81, 0x86, 0x82, 0x86, 0x83, /* 0xF0-0xF3 */ + 0x86, 0x84, 0x86, 0x85, 0x86, 0x86, 0x86, 0x87, /* 0xF4-0xF7 */ + 0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0xF8-0xFB */ + 0x86, 0x8C, 0x86, 0x8D, 0x86, 0x8E, 0x86, 0x8F, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B1[512] = { + 0x86, 0x90, 0x86, 0x91, 0x86, 0x92, 0x86, 0x93, /* 0x00-0x03 */ + 0x86, 0x94, 0x86, 0x95, 0x86, 0x96, 0x86, 0x97, /* 0x04-0x07 */ + 0xB3, 0xCA, 0xB3, 0xCB, 0x86, 0x98, 0xB3, 0xCC, /* 0x08-0x0B */ + 0xB3, 0xCD, 0x86, 0x99, 0x86, 0x9A, 0x86, 0x9B, /* 0x0C-0x0F */ + 0xB3, 0xCE, 0x86, 0x9C, 0xB3, 0xCF, 0xB3, 0xD0, /* 0x10-0x13 */ + 0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, 0x86, 0xA0, /* 0x14-0x17 */ + 0xB3, 0xD1, 0xB3, 0xD2, 0x86, 0xA1, 0xB3, 0xD3, /* 0x18-0x1B */ + 0xB3, 0xD4, 0xB3, 0xD5, 0x86, 0xA2, 0x86, 0xA3, /* 0x1C-0x1F */ + 0x86, 0xA4, 0x86, 0xA5, 0x86, 0xA6, 0xB3, 0xD6, /* 0x20-0x23 */ + 0xB3, 0xD7, 0xB3, 0xD8, 0x86, 0xA7, 0x86, 0xA8, /* 0x24-0x27 */ + 0xB3, 0xD9, 0x86, 0xA9, 0x86, 0xAA, 0x86, 0xAB, /* 0x28-0x2B */ + 0xB3, 0xDA, 0x86, 0xAC, 0x86, 0xAD, 0x86, 0xAE, /* 0x2C-0x2F */ + 0x86, 0xAF, 0x86, 0xB0, 0x86, 0xB1, 0x86, 0xB2, /* 0x30-0x33 */ + 0xB3, 0xDB, 0xB3, 0xDC, 0x86, 0xB3, 0xB3, 0xDD, /* 0x34-0x37 */ + 0xB3, 0xDE, 0xB3, 0xDF, 0x86, 0xB4, 0x86, 0xB5, /* 0x38-0x3B */ + 0x86, 0xB6, 0x86, 0xB7, 0x86, 0xB8, 0x86, 0xB9, /* 0x3C-0x3F */ + 0xB3, 0xE0, 0xB3, 0xE1, 0x86, 0xBA, 0x86, 0xBB, /* 0x40-0x43 */ + 0xB3, 0xE2, 0x86, 0xBC, 0x86, 0xBD, 0x86, 0xBE, /* 0x44-0x47 */ + 0xB3, 0xE3, 0x86, 0xBF, 0x86, 0xC0, 0x86, 0xC1, /* 0x48-0x4B */ + 0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, 0x86, 0xC5, /* 0x4C-0x4F */ + 0xB3, 0xE4, 0xB3, 0xE5, 0x86, 0xC6, 0x86, 0xC7, /* 0x50-0x53 */ + 0xB3, 0xE6, 0xB3, 0xE7, 0x86, 0xC8, 0x86, 0xC9, /* 0x54-0x57 */ + 0xB3, 0xE8, 0x86, 0xCA, 0x86, 0xCB, 0x86, 0xCC, /* 0x58-0x5B */ + 0xB3, 0xE9, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0x5C-0x5F */ + 0xB3, 0xEA, 0x86, 0xD0, 0x86, 0xD1, 0x86, 0xD2, /* 0x60-0x63 */ + 0x86, 0xD3, 0x86, 0xD4, 0x86, 0xD5, 0x86, 0xD6, /* 0x64-0x67 */ + 0x86, 0xD7, 0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, /* 0x68-0x6B */ + 0x86, 0xDB, 0x86, 0xDC, 0x86, 0xDD, 0x86, 0xDE, /* 0x6C-0x6F */ + 0x86, 0xDF, 0x86, 0xE0, 0x86, 0xE1, 0x86, 0xE2, /* 0x70-0x73 */ + 0x86, 0xE3, 0x86, 0xE4, 0x86, 0xE5, 0x86, 0xE6, /* 0x74-0x77 */ + 0xB3, 0xEB, 0xB3, 0xEC, 0x86, 0xE7, 0x86, 0xE8, /* 0x78-0x7B */ + 0xB3, 0xED, 0x86, 0xE9, 0x86, 0xEA, 0x86, 0xEB, /* 0x7C-0x7F */ + + 0xB3, 0xEE, 0x86, 0xEC, 0xB3, 0xEF, 0x86, 0xED, /* 0x80-0x83 */ + 0x86, 0xEE, 0x86, 0xEF, 0x86, 0xF0, 0x86, 0xF1, /* 0x84-0x87 */ + 0xB3, 0xF0, 0xB3, 0xF1, 0x86, 0xF2, 0xB3, 0xF2, /* 0x88-0x8B */ + 0x86, 0xF3, 0xB3, 0xF3, 0x86, 0xF4, 0x86, 0xF5, /* 0x8C-0x8F */ + 0x86, 0xF6, 0x86, 0xF7, 0xB3, 0xF4, 0xB3, 0xF5, /* 0x90-0x93 */ + 0xB3, 0xF6, 0x86, 0xF8, 0x86, 0xF9, 0x86, 0xFA, /* 0x94-0x97 */ + 0xB3, 0xF7, 0x86, 0xFB, 0x86, 0xFC, 0x86, 0xFD, /* 0x98-0x9B */ + 0xB3, 0xF8, 0x86, 0xFE, 0x87, 0x41, 0x87, 0x42, /* 0x9C-0x9F */ + 0x87, 0x43, 0x87, 0x44, 0x87, 0x45, 0x87, 0x46, /* 0xA0-0xA3 */ + 0x87, 0x47, 0x87, 0x48, 0x87, 0x49, 0x87, 0x4A, /* 0xA4-0xA7 */ + 0xB3, 0xF9, 0x87, 0x4B, 0x87, 0x4C, 0x87, 0x4D, /* 0xA8-0xAB */ + 0x87, 0x4E, 0x87, 0x4F, 0x87, 0x50, 0x87, 0x51, /* 0xAC-0xAF */ + 0x87, 0x52, 0x87, 0x53, 0x87, 0x54, 0x87, 0x55, /* 0xB0-0xB3 */ + 0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0xB4-0xB7 */ + 0x87, 0x5A, 0x87, 0x61, 0x87, 0x62, 0x87, 0x63, /* 0xB8-0xBB */ + 0x87, 0x64, 0x87, 0x65, 0x87, 0x66, 0x87, 0x67, /* 0xBC-0xBF */ + 0x87, 0x68, 0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, /* 0xC0-0xC3 */ + 0x87, 0x6C, 0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, /* 0xC4-0xC7 */ + 0x87, 0x70, 0x87, 0x71, 0x87, 0x72, 0x87, 0x73, /* 0xC8-0xCB */ + 0xB3, 0xFA, 0x87, 0x74, 0x87, 0x75, 0x87, 0x76, /* 0xCC-0xCF */ + 0xB3, 0xFB, 0x87, 0x77, 0x87, 0x78, 0x87, 0x79, /* 0xD0-0xD3 */ + 0xB3, 0xFC, 0x87, 0x7A, 0x87, 0x81, 0x87, 0x82, /* 0xD4-0xD7 */ + 0x87, 0x83, 0x87, 0x84, 0x87, 0x85, 0x87, 0x86, /* 0xD8-0xDB */ + 0xB3, 0xFD, 0xB3, 0xFE, 0x87, 0x87, 0xB4, 0xA1, /* 0xDC-0xDF */ + 0x87, 0x88, 0x87, 0x89, 0x87, 0x8A, 0x87, 0x8B, /* 0xE0-0xE3 */ + 0x87, 0x8C, 0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, /* 0xE4-0xE7 */ + 0xB4, 0xA2, 0xB4, 0xA3, 0x87, 0x90, 0x87, 0x91, /* 0xE8-0xEB */ + 0xB4, 0xA4, 0x87, 0x92, 0x87, 0x93, 0x87, 0x94, /* 0xEC-0xEF */ + 0xB4, 0xA5, 0x87, 0x95, 0x87, 0x96, 0x87, 0x97, /* 0xF0-0xF3 */ + 0x87, 0x98, 0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, /* 0xF4-0xF7 */ + 0x87, 0x9C, 0xB4, 0xA6, 0x87, 0x9D, 0xB4, 0xA7, /* 0xF8-0xFB */ + 0x87, 0x9E, 0xB4, 0xA8, 0x87, 0x9F, 0x87, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B2[512] = { + 0x87, 0xA1, 0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, /* 0x00-0x03 */ + 0xB4, 0xA9, 0xB4, 0xAA, 0x87, 0xA5, 0x87, 0xA6, /* 0x04-0x07 */ + 0xB4, 0xAB, 0x87, 0xA7, 0x87, 0xA8, 0xB4, 0xAC, /* 0x08-0x0B */ + 0xB4, 0xAD, 0x87, 0xA9, 0x87, 0xAA, 0x87, 0xAB, /* 0x0C-0x0F */ + 0x87, 0xAC, 0x87, 0xAD, 0x87, 0xAE, 0x87, 0xAF, /* 0x10-0x13 */ + 0xB4, 0xAE, 0xB4, 0xAF, 0x87, 0xB0, 0xB4, 0xB0, /* 0x14-0x17 */ + 0x87, 0xB1, 0xB4, 0xB1, 0x87, 0xB2, 0x87, 0xB3, /* 0x18-0x1B */ + 0x87, 0xB4, 0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, /* 0x1C-0x1F */ + 0xB4, 0xB2, 0x87, 0xB8, 0x87, 0xB9, 0x87, 0xBA, /* 0x20-0x23 */ + 0x87, 0xBB, 0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, /* 0x24-0x27 */ + 0x87, 0xBF, 0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, /* 0x28-0x2B */ + 0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0x2C-0x2F */ + 0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, 0x87, 0xCA, /* 0x30-0x33 */ + 0xB4, 0xB3, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0x34-0x37 */ + 0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0x38-0x3B */ + 0xB4, 0xB4, 0x87, 0xD2, 0x87, 0xD3, 0x87, 0xD4, /* 0x3C-0x3F */ + 0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0x40-0x43 */ + 0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0x44-0x47 */ + 0x87, 0xDD, 0x87, 0xDE, 0x87, 0xDF, 0x87, 0xE0, /* 0x48-0x4B */ + 0x87, 0xE1, 0x87, 0xE2, 0x87, 0xE3, 0x87, 0xE4, /* 0x4C-0x4F */ + 0x87, 0xE5, 0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, /* 0x50-0x53 */ + 0x87, 0xE9, 0x87, 0xEA, 0x87, 0xEB, 0x87, 0xEC, /* 0x54-0x57 */ + 0xB4, 0xB5, 0x87, 0xED, 0x87, 0xEE, 0x87, 0xEF, /* 0x58-0x5B */ + 0xB4, 0xB6, 0x87, 0xF0, 0x87, 0xF1, 0x87, 0xF2, /* 0x5C-0x5F */ + 0xB4, 0xB7, 0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, /* 0x60-0x63 */ + 0x87, 0xF6, 0x87, 0xF7, 0x87, 0xF8, 0x87, 0xF9, /* 0x64-0x67 */ + 0xB4, 0xB8, 0xB4, 0xB9, 0x87, 0xFA, 0x87, 0xFB, /* 0x68-0x6B */ + 0x87, 0xFC, 0x87, 0xFD, 0x87, 0xFE, 0x88, 0x41, /* 0x6C-0x6F */ + 0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x70-0x73 */ + 0xB4, 0xBA, 0xB4, 0xBB, 0x88, 0x46, 0x88, 0x47, /* 0x74-0x77 */ + 0x88, 0x48, 0x88, 0x49, 0x88, 0x4A, 0x88, 0x4B, /* 0x78-0x7B */ + 0xB4, 0xBC, 0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, /* 0x7C-0x7F */ + + 0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x80-0x83 */ + 0xB4, 0xBD, 0xB4, 0xBE, 0x88, 0x53, 0x88, 0x54, /* 0x84-0x87 */ + 0x88, 0x55, 0xB4, 0xBF, 0x88, 0x56, 0x88, 0x57, /* 0x88-0x8B */ + 0x88, 0x58, 0x88, 0x59, 0x88, 0x5A, 0x88, 0x61, /* 0x8C-0x8F */ + 0xB4, 0xC0, 0xB4, 0xC1, 0x88, 0x62, 0x88, 0x63, /* 0x90-0x93 */ + 0xB4, 0xC2, 0x88, 0x64, 0x88, 0x65, 0x88, 0x66, /* 0x94-0x97 */ + 0xB4, 0xC3, 0xB4, 0xC4, 0xB4, 0xC5, 0x88, 0x67, /* 0x98-0x9B */ + 0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0x88, 0x6B, /* 0x9C-0x9F */ + 0xB4, 0xC6, 0xB4, 0xC7, 0x88, 0x6C, 0xB4, 0xC8, /* 0xA0-0xA3 */ + 0x88, 0x6D, 0xB4, 0xC9, 0xB4, 0xCA, 0x88, 0x6E, /* 0xA4-0xA7 */ + 0x88, 0x6F, 0x88, 0x70, 0xB4, 0xCB, 0x88, 0x71, /* 0xA8-0xAB */ + 0xB4, 0xCC, 0x88, 0x72, 0x88, 0x73, 0x88, 0x74, /* 0xAC-0xAF */ + 0xB4, 0xCD, 0x88, 0x75, 0x88, 0x76, 0x88, 0x77, /* 0xB0-0xB3 */ + 0xB4, 0xCE, 0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, /* 0xB4-0xB7 */ + 0x88, 0x81, 0x88, 0x82, 0x88, 0x83, 0x88, 0x84, /* 0xB8-0xBB */ + 0x88, 0x85, 0x88, 0x86, 0x88, 0x87, 0x88, 0x88, /* 0xBC-0xBF */ + 0x88, 0x89, 0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, /* 0xC0-0xC3 */ + 0x88, 0x8D, 0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, /* 0xC4-0xC7 */ + 0xB4, 0xCF, 0xB4, 0xD0, 0x88, 0x91, 0x88, 0x92, /* 0xC8-0xCB */ + 0xB4, 0xD1, 0x88, 0x93, 0x88, 0x94, 0x88, 0x95, /* 0xCC-0xCF */ + 0xB4, 0xD2, 0x88, 0x96, 0xB4, 0xD3, 0x88, 0x97, /* 0xD0-0xD3 */ + 0x88, 0x98, 0x88, 0x99, 0x88, 0x9A, 0x88, 0x9B, /* 0xD4-0xD7 */ + 0xB4, 0xD4, 0xB4, 0xD5, 0x88, 0x9C, 0xB4, 0xD6, /* 0xD8-0xDB */ + 0x88, 0x9D, 0xB4, 0xD7, 0x88, 0x9E, 0x88, 0x9F, /* 0xDC-0xDF */ + 0x88, 0xA0, 0x88, 0xA1, 0xB4, 0xD8, 0x88, 0xA2, /* 0xE0-0xE3 */ + 0xB4, 0xD9, 0xB4, 0xDA, 0xB4, 0xDB, 0x88, 0xA3, /* 0xE4-0xE7 */ + 0xB4, 0xDC, 0x88, 0xA4, 0x88, 0xA5, 0xB4, 0xDD, /* 0xE8-0xEB */ + 0xB4, 0xDE, 0xB4, 0xDF, 0xB4, 0xE0, 0xB4, 0xE1, /* 0xEC-0xEF */ + 0x88, 0xA6, 0x88, 0xA7, 0x88, 0xA8, 0xB4, 0xE2, /* 0xF0-0xF3 */ + 0xB4, 0xE3, 0xB4, 0xE4, 0x88, 0xA9, 0xB4, 0xE5, /* 0xF4-0xF7 */ + 0xB4, 0xE6, 0xB4, 0xE7, 0xB4, 0xE8, 0xB4, 0xE9, /* 0xF8-0xFB */ + 0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, 0xB4, 0xEA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B3[512] = { + 0xB4, 0xEB, 0xB4, 0xEC, 0x88, 0xAD, 0x88, 0xAE, /* 0x00-0x03 */ + 0xB4, 0xED, 0x88, 0xAF, 0x88, 0xB0, 0x88, 0xB1, /* 0x04-0x07 */ + 0xB4, 0xEE, 0x88, 0xB2, 0x88, 0xB3, 0x88, 0xB4, /* 0x08-0x0B */ + 0x88, 0xB5, 0x88, 0xB6, 0x88, 0xB7, 0x88, 0xB8, /* 0x0C-0x0F */ + 0xB4, 0xEF, 0xB4, 0xF0, 0x88, 0xB9, 0xB4, 0xF1, /* 0x10-0x13 */ + 0xB4, 0xF2, 0xB4, 0xF3, 0x88, 0xBA, 0x88, 0xBB, /* 0x14-0x17 */ + 0x88, 0xBC, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0x18-0x1B */ + 0xB4, 0xF4, 0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, /* 0x1C-0x1F */ + 0x88, 0xC3, 0x88, 0xC4, 0x88, 0xC5, 0x88, 0xC6, /* 0x20-0x23 */ + 0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, 0x88, 0xCA, /* 0x24-0x27 */ + 0x88, 0xCB, 0x88, 0xCC, 0x88, 0xCD, 0x88, 0xCE, /* 0x28-0x2B */ + 0x88, 0xCF, 0x88, 0xD0, 0x88, 0xD1, 0x88, 0xD2, /* 0x2C-0x2F */ + 0x88, 0xD3, 0x88, 0xD4, 0x88, 0xD5, 0x88, 0xD6, /* 0x30-0x33 */ + 0x88, 0xD7, 0x88, 0xD8, 0x88, 0xD9, 0x88, 0xDA, /* 0x34-0x37 */ + 0x88, 0xDB, 0x88, 0xDC, 0x88, 0xDD, 0x88, 0xDE, /* 0x38-0x3B */ + 0x88, 0xDF, 0x88, 0xE0, 0x88, 0xE1, 0x88, 0xE2, /* 0x3C-0x3F */ + 0x88, 0xE3, 0x88, 0xE4, 0x88, 0xE5, 0x88, 0xE6, /* 0x40-0x43 */ + 0x88, 0xE7, 0x88, 0xE8, 0x88, 0xE9, 0x88, 0xEA, /* 0x44-0x47 */ + 0x88, 0xEB, 0x88, 0xEC, 0x88, 0xED, 0x88, 0xEE, /* 0x48-0x4B */ + 0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x4C-0x4F */ + 0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, 0x88, 0xF6, /* 0x50-0x53 */ + 0xB4, 0xF5, 0xB4, 0xF6, 0xB4, 0xF7, 0x88, 0xF7, /* 0x54-0x57 */ + 0xB4, 0xF8, 0x88, 0xF8, 0x88, 0xF9, 0xB4, 0xF9, /* 0x58-0x5B */ + 0xB4, 0xFA, 0x88, 0xFA, 0xB4, 0xFB, 0xB4, 0xFC, /* 0x5C-0x5F */ + 0x88, 0xFB, 0x88, 0xFC, 0x88, 0xFD, 0x88, 0xFE, /* 0x60-0x63 */ + 0xB4, 0xFD, 0xB4, 0xFE, 0x89, 0x41, 0xB5, 0xA1, /* 0x64-0x67 */ + 0x89, 0x42, 0xB5, 0xA2, 0x89, 0x43, 0xB5, 0xA3, /* 0x68-0x6B */ + 0x89, 0x44, 0x89, 0x45, 0xB5, 0xA4, 0x89, 0x46, /* 0x6C-0x6F */ + 0xB5, 0xA5, 0xB5, 0xA6, 0x89, 0x47, 0x89, 0x48, /* 0x70-0x73 */ + 0xB5, 0xA7, 0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, /* 0x74-0x77 */ + 0xB5, 0xA8, 0x89, 0x4C, 0x89, 0x4D, 0x89, 0x4E, /* 0x78-0x7B */ + 0x89, 0x4F, 0x89, 0x50, 0x89, 0x51, 0x89, 0x52, /* 0x7C-0x7F */ + + 0xB5, 0xA9, 0xB5, 0xAA, 0x89, 0x53, 0xB5, 0xAB, /* 0x80-0x83 */ + 0xB5, 0xAC, 0xB5, 0xAD, 0x89, 0x54, 0x89, 0x55, /* 0x84-0x87 */ + 0x89, 0x56, 0x89, 0x57, 0x89, 0x58, 0x89, 0x59, /* 0x88-0x8B */ + 0xB5, 0xAE, 0x89, 0x5A, 0x89, 0x61, 0x89, 0x62, /* 0x8C-0x8F */ + 0xB5, 0xAF, 0x89, 0x63, 0x89, 0x64, 0x89, 0x65, /* 0x90-0x93 */ + 0xB5, 0xB0, 0x89, 0x66, 0x89, 0x67, 0x89, 0x68, /* 0x94-0x97 */ + 0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, 0x89, 0x6C, /* 0x98-0x9B */ + 0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, 0x89, 0x70, /* 0x9C-0x9F */ + 0xB5, 0xB1, 0xB5, 0xB2, 0x89, 0x71, 0x89, 0x72, /* 0xA0-0xA3 */ + 0x89, 0x73, 0x89, 0x74, 0x89, 0x75, 0x89, 0x76, /* 0xA4-0xA7 */ + 0xB5, 0xB3, 0x89, 0x77, 0x89, 0x78, 0x89, 0x79, /* 0xA8-0xAB */ + 0xB5, 0xB4, 0x89, 0x7A, 0x89, 0x81, 0x89, 0x82, /* 0xAC-0xAF */ + 0x89, 0x83, 0x89, 0x84, 0x89, 0x85, 0x89, 0x86, /* 0xB0-0xB3 */ + 0x89, 0x87, 0x89, 0x88, 0x89, 0x89, 0x89, 0x8A, /* 0xB4-0xB7 */ + 0x89, 0x8B, 0x89, 0x8C, 0x89, 0x8D, 0x89, 0x8E, /* 0xB8-0xBB */ + 0x89, 0x8F, 0x89, 0x90, 0x89, 0x91, 0x89, 0x92, /* 0xBC-0xBF */ + 0x89, 0x93, 0x89, 0x94, 0x89, 0x95, 0x89, 0x96, /* 0xC0-0xC3 */ + 0xB5, 0xB5, 0xB5, 0xB6, 0x89, 0x97, 0x89, 0x98, /* 0xC4-0xC7 */ + 0xB5, 0xB7, 0x89, 0x99, 0x89, 0x9A, 0xB5, 0xB8, /* 0xC8-0xCB */ + 0xB5, 0xB9, 0x89, 0x9B, 0xB5, 0xBA, 0x89, 0x9C, /* 0xCC-0xCF */ + 0xB5, 0xBB, 0x89, 0x9D, 0x89, 0x9E, 0x89, 0x9F, /* 0xD0-0xD3 */ + 0xB5, 0xBC, 0xB5, 0xBD, 0x89, 0xA0, 0xB5, 0xBE, /* 0xD4-0xD7 */ + 0x89, 0xA1, 0xB5, 0xBF, 0x89, 0xA2, 0xB5, 0xC0, /* 0xD8-0xDB */ + 0x89, 0xA3, 0xB5, 0xC1, 0x89, 0xA4, 0x89, 0xA5, /* 0xDC-0xDF */ + 0xB5, 0xC2, 0x89, 0xA6, 0x89, 0xA7, 0x89, 0xA8, /* 0xE0-0xE3 */ + 0xB5, 0xC3, 0x89, 0xA9, 0x89, 0xAA, 0x89, 0xAB, /* 0xE4-0xE7 */ + 0xB5, 0xC4, 0x89, 0xAC, 0x89, 0xAD, 0x89, 0xAE, /* 0xE8-0xEB */ + 0x89, 0xAF, 0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, /* 0xEC-0xEF */ + 0x89, 0xB3, 0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, /* 0xF0-0xF3 */ + 0x89, 0xB7, 0x89, 0xB8, 0x89, 0xB9, 0x89, 0xBA, /* 0xF4-0xF7 */ + 0x89, 0xBB, 0x89, 0xBC, 0x89, 0xBD, 0x89, 0xBE, /* 0xF8-0xFB */ + 0xB5, 0xC5, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B4[512] = { + 0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0x00-0x03 */ + 0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0x04-0x07 */ + 0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, 0x89, 0xCD, /* 0x08-0x0B */ + 0x89, 0xCE, 0x89, 0xCF, 0x89, 0xD0, 0x89, 0xD1, /* 0x0C-0x0F */ + 0xB5, 0xC6, 0x89, 0xD2, 0x89, 0xD3, 0x89, 0xD4, /* 0x10-0x13 */ + 0x89, 0xD5, 0x89, 0xD6, 0x89, 0xD7, 0x89, 0xD8, /* 0x14-0x17 */ + 0xB5, 0xC7, 0x89, 0xD9, 0x89, 0xDA, 0x89, 0xDB, /* 0x18-0x1B */ + 0xB5, 0xC8, 0x89, 0xDC, 0x89, 0xDD, 0x89, 0xDE, /* 0x1C-0x1F */ + 0xB5, 0xC9, 0x89, 0xDF, 0x89, 0xE0, 0x89, 0xE1, /* 0x20-0x23 */ + 0x89, 0xE2, 0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, /* 0x24-0x27 */ + 0xB5, 0xCA, 0xB5, 0xCB, 0x89, 0xE6, 0xB5, 0xCC, /* 0x28-0x2B */ + 0x89, 0xE7, 0x89, 0xE8, 0x89, 0xE9, 0x89, 0xEA, /* 0x2C-0x2F */ + 0x89, 0xEB, 0x89, 0xEC, 0x89, 0xED, 0x89, 0xEE, /* 0x30-0x33 */ + 0xB5, 0xCD, 0x89, 0xEF, 0x89, 0xF0, 0x89, 0xF1, /* 0x34-0x37 */ + 0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x38-0x3B */ + 0x89, 0xF6, 0x89, 0xF7, 0x89, 0xF8, 0x89, 0xF9, /* 0x3C-0x3F */ + 0x89, 0xFA, 0x89, 0xFB, 0x89, 0xFC, 0x89, 0xFD, /* 0x40-0x43 */ + 0x89, 0xFE, 0x8A, 0x41, 0x8A, 0x42, 0x8A, 0x43, /* 0x44-0x47 */ + 0x8A, 0x44, 0x8A, 0x45, 0x8A, 0x46, 0x8A, 0x47, /* 0x48-0x4B */ + 0x8A, 0x48, 0x8A, 0x49, 0x8A, 0x4A, 0x8A, 0x4B, /* 0x4C-0x4F */ + 0xB5, 0xCE, 0xB5, 0xCF, 0x8A, 0x4C, 0x8A, 0x4D, /* 0x50-0x53 */ + 0xB5, 0xD0, 0x8A, 0x4E, 0x8A, 0x4F, 0x8A, 0x50, /* 0x54-0x57 */ + 0xB5, 0xD1, 0x8A, 0x51, 0x8A, 0x52, 0x8A, 0x53, /* 0x58-0x5B */ + 0x8A, 0x54, 0x8A, 0x55, 0x8A, 0x56, 0x8A, 0x57, /* 0x5C-0x5F */ + 0xB5, 0xD2, 0xB5, 0xD3, 0x8A, 0x58, 0xB5, 0xD4, /* 0x60-0x63 */ + 0x8A, 0x59, 0xB5, 0xD5, 0x8A, 0x5A, 0x8A, 0x61, /* 0x64-0x67 */ + 0x8A, 0x62, 0x8A, 0x63, 0x8A, 0x64, 0x8A, 0x65, /* 0x68-0x6B */ + 0xB5, 0xD6, 0x8A, 0x66, 0x8A, 0x67, 0x8A, 0x68, /* 0x6C-0x6F */ + 0x8A, 0x69, 0x8A, 0x6A, 0x8A, 0x6B, 0x8A, 0x6C, /* 0x70-0x73 */ + 0x8A, 0x6D, 0x8A, 0x6E, 0x8A, 0x6F, 0x8A, 0x70, /* 0x74-0x77 */ + 0x8A, 0x71, 0x8A, 0x72, 0x8A, 0x73, 0x8A, 0x74, /* 0x78-0x7B */ + 0x8A, 0x75, 0x8A, 0x76, 0x8A, 0x77, 0x8A, 0x78, /* 0x7C-0x7F */ + + 0xB5, 0xD7, 0x8A, 0x79, 0x8A, 0x7A, 0x8A, 0x81, /* 0x80-0x83 */ + 0x8A, 0x82, 0x8A, 0x83, 0x8A, 0x84, 0x8A, 0x85, /* 0x84-0x87 */ + 0xB5, 0xD8, 0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, /* 0x88-0x8B */ + 0x8A, 0x89, 0x8A, 0x8A, 0x8A, 0x8B, 0x8A, 0x8C, /* 0x8C-0x8F */ + 0x8A, 0x8D, 0x8A, 0x8E, 0x8A, 0x8F, 0x8A, 0x90, /* 0x90-0x93 */ + 0x8A, 0x91, 0x8A, 0x92, 0x8A, 0x93, 0x8A, 0x94, /* 0x94-0x97 */ + 0x8A, 0x95, 0x8A, 0x96, 0x8A, 0x97, 0x8A, 0x98, /* 0x98-0x9B */ + 0x8A, 0x99, 0xB5, 0xD9, 0x8A, 0x9A, 0x8A, 0x9B, /* 0x9C-0x9F */ + 0x8A, 0x9C, 0x8A, 0x9D, 0x8A, 0x9E, 0x8A, 0x9F, /* 0xA0-0xA3 */ + 0xB5, 0xDA, 0x8A, 0xA0, 0x8A, 0xA1, 0x8A, 0xA2, /* 0xA4-0xA7 */ + 0xB5, 0xDB, 0x8A, 0xA3, 0x8A, 0xA4, 0x8A, 0xA5, /* 0xA8-0xAB */ + 0xB5, 0xDC, 0x8A, 0xA6, 0x8A, 0xA7, 0x8A, 0xA8, /* 0xAC-0xAF */ + 0x8A, 0xA9, 0x8A, 0xAA, 0x8A, 0xAB, 0x8A, 0xAC, /* 0xB0-0xB3 */ + 0x8A, 0xAD, 0xB5, 0xDD, 0x8A, 0xAE, 0xB5, 0xDE, /* 0xB4-0xB7 */ + 0x8A, 0xAF, 0xB5, 0xDF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xB8-0xBB */ + 0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xBC-0xBF */ + 0xB5, 0xE0, 0x8A, 0xB6, 0x8A, 0xB7, 0x8A, 0xB8, /* 0xC0-0xC3 */ + 0xB5, 0xE1, 0x8A, 0xB9, 0x8A, 0xBA, 0x8A, 0xBB, /* 0xC4-0xC7 */ + 0xB5, 0xE2, 0x8A, 0xBC, 0x8A, 0xBD, 0x8A, 0xBE, /* 0xC8-0xCB */ + 0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, 0x8A, 0xC2, /* 0xCC-0xCF */ + 0xB5, 0xE3, 0x8A, 0xC3, 0x8A, 0xC4, 0x8A, 0xC5, /* 0xD0-0xD3 */ + 0x8A, 0xC6, 0xB5, 0xE4, 0x8A, 0xC7, 0x8A, 0xC8, /* 0xD4-0xD7 */ + 0x8A, 0xC9, 0x8A, 0xCA, 0x8A, 0xCB, 0x8A, 0xCC, /* 0xD8-0xDB */ + 0xB5, 0xE5, 0xB5, 0xE6, 0x8A, 0xCD, 0x8A, 0xCE, /* 0xDC-0xDF */ + 0xB5, 0xE7, 0x8A, 0xCF, 0x8A, 0xD0, 0xB5, 0xE8, /* 0xE0-0xE3 */ + 0xB5, 0xE9, 0x8A, 0xD1, 0xB5, 0xEA, 0x8A, 0xD2, /* 0xE4-0xE7 */ + 0x8A, 0xD3, 0x8A, 0xD4, 0x8A, 0xD5, 0x8A, 0xD6, /* 0xE8-0xEB */ + 0xB5, 0xEB, 0xB5, 0xEC, 0x8A, 0xD7, 0xB5, 0xED, /* 0xEC-0xEF */ + 0x8A, 0xD8, 0xB5, 0xEE, 0x8A, 0xD9, 0x8A, 0xDA, /* 0xF0-0xF3 */ + 0x8A, 0xDB, 0x8A, 0xDC, 0x8A, 0xDD, 0x8A, 0xDE, /* 0xF4-0xF7 */ + 0xB5, 0xEF, 0x8A, 0xDF, 0x8A, 0xE0, 0x8A, 0xE1, /* 0xF8-0xFB */ + 0x8A, 0xE2, 0x8A, 0xE3, 0x8A, 0xE4, 0x8A, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B5[512] = { + 0x8A, 0xE6, 0x8A, 0xE7, 0x8A, 0xE8, 0x8A, 0xE9, /* 0x00-0x03 */ + 0x8A, 0xEA, 0x8A, 0xEB, 0x8A, 0xEC, 0x8A, 0xED, /* 0x04-0x07 */ + 0x8A, 0xEE, 0x8A, 0xEF, 0x8A, 0xF0, 0x8A, 0xF1, /* 0x08-0x0B */ + 0x8A, 0xF2, 0x8A, 0xF3, 0x8A, 0xF4, 0x8A, 0xF5, /* 0x0C-0x0F */ + 0x8A, 0xF6, 0x8A, 0xF7, 0x8A, 0xF8, 0x8A, 0xF9, /* 0x10-0x13 */ + 0xB5, 0xF0, 0xB5, 0xF1, 0x8A, 0xFA, 0x8A, 0xFB, /* 0x14-0x17 */ + 0xB5, 0xF2, 0x8A, 0xFC, 0x8A, 0xFD, 0xB5, 0xF3, /* 0x18-0x1B */ + 0xB5, 0xF4, 0x8A, 0xFE, 0x8B, 0x41, 0x8B, 0x42, /* 0x1C-0x1F */ + 0x8B, 0x43, 0x8B, 0x44, 0x8B, 0x45, 0x8B, 0x46, /* 0x20-0x23 */ + 0xB5, 0xF5, 0xB5, 0xF6, 0x8B, 0x47, 0xB5, 0xF7, /* 0x24-0x27 */ + 0xB5, 0xF8, 0xB5, 0xF9, 0xB5, 0xFA, 0x8B, 0x48, /* 0x28-0x2B */ + 0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, 0x8B, 0x4C, /* 0x2C-0x2F */ + 0xB5, 0xFB, 0xB5, 0xFC, 0x8B, 0x4D, 0x8B, 0x4E, /* 0x30-0x33 */ + 0xB5, 0xFD, 0x8B, 0x4F, 0x8B, 0x50, 0x8B, 0x51, /* 0x34-0x37 */ + 0xB5, 0xFE, 0x8B, 0x52, 0x8B, 0x53, 0x8B, 0x54, /* 0x38-0x3B */ + 0x8B, 0x55, 0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, /* 0x3C-0x3F */ + 0xB6, 0xA1, 0xB6, 0xA2, 0x8B, 0x59, 0xB6, 0xA3, /* 0x40-0x43 */ + 0xB6, 0xA4, 0xB6, 0xA5, 0x8B, 0x5A, 0x8B, 0x61, /* 0x44-0x47 */ + 0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0xB6, 0xA6, /* 0x48-0x4B */ + 0xB6, 0xA7, 0xB6, 0xA8, 0x8B, 0x65, 0x8B, 0x66, /* 0x4C-0x4F */ + 0xB6, 0xA9, 0x8B, 0x67, 0x8B, 0x68, 0x8B, 0x69, /* 0x50-0x53 */ + 0xB6, 0xAA, 0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x54-0x57 */ + 0x8B, 0x6D, 0x8B, 0x6E, 0x8B, 0x6F, 0x8B, 0x70, /* 0x58-0x5B */ + 0xB6, 0xAB, 0xB6, 0xAC, 0x8B, 0x71, 0xB6, 0xAD, /* 0x5C-0x5F */ + 0xB6, 0xAE, 0xB6, 0xAF, 0x8B, 0x72, 0x8B, 0x73, /* 0x60-0x63 */ + 0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, 0x8B, 0x77, /* 0x64-0x67 */ + 0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, 0x8B, 0x81, /* 0x68-0x6B */ + 0x8B, 0x82, 0x8B, 0x83, 0x8B, 0x84, 0x8B, 0x85, /* 0x6C-0x6F */ + 0x8B, 0x86, 0x8B, 0x87, 0x8B, 0x88, 0x8B, 0x89, /* 0x70-0x73 */ + 0x8B, 0x8A, 0x8B, 0x8B, 0x8B, 0x8C, 0x8B, 0x8D, /* 0x74-0x77 */ + 0x8B, 0x8E, 0x8B, 0x8F, 0x8B, 0x90, 0x8B, 0x91, /* 0x78-0x7B */ + 0x8B, 0x92, 0x8B, 0x93, 0x8B, 0x94, 0x8B, 0x95, /* 0x7C-0x7F */ + + 0x8B, 0x96, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0x80-0x83 */ + 0x8B, 0x9A, 0x8B, 0x9B, 0x8B, 0x9C, 0x8B, 0x9D, /* 0x84-0x87 */ + 0x8B, 0x9E, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0x88-0x8B */ + 0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, 0x8B, 0xA5, /* 0x8C-0x8F */ + 0x8B, 0xA6, 0x8B, 0xA7, 0x8B, 0xA8, 0x8B, 0xA9, /* 0x90-0x93 */ + 0x8B, 0xAA, 0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, /* 0x94-0x97 */ + 0x8B, 0xAE, 0x8B, 0xAF, 0x8B, 0xB0, 0x8B, 0xB1, /* 0x98-0x9B */ + 0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0x9C-0x9F */ + 0xB6, 0xB0, 0xB6, 0xB1, 0x8B, 0xB6, 0x8B, 0xB7, /* 0xA0-0xA3 */ + 0xB6, 0xB2, 0x8B, 0xB8, 0x8B, 0xB9, 0x8B, 0xBA, /* 0xA4-0xA7 */ + 0xB6, 0xB3, 0x8B, 0xBB, 0xB6, 0xB4, 0xB6, 0xB5, /* 0xA8-0xAB */ + 0x8B, 0xBC, 0x8B, 0xBD, 0x8B, 0xBE, 0x8B, 0xBF, /* 0xAC-0xAF */ + 0xB6, 0xB6, 0xB6, 0xB7, 0x8B, 0xC0, 0xB6, 0xB8, /* 0xB0-0xB3 */ + 0xB6, 0xB9, 0xB6, 0xBA, 0x8B, 0xC1, 0x8B, 0xC2, /* 0xB4-0xB7 */ + 0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, 0xB6, 0xBB, /* 0xB8-0xBB */ + 0xB6, 0xBC, 0xB6, 0xBD, 0x8B, 0xC6, 0x8B, 0xC7, /* 0xBC-0xBF */ + 0xB6, 0xBE, 0x8B, 0xC8, 0x8B, 0xC9, 0x8B, 0xCA, /* 0xC0-0xC3 */ + 0xB6, 0xBF, 0x8B, 0xCB, 0x8B, 0xCC, 0x8B, 0xCD, /* 0xC4-0xC7 */ + 0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, 0x8B, 0xD1, /* 0xC8-0xCB */ + 0xB6, 0xC0, 0xB6, 0xC1, 0x8B, 0xD2, 0xB6, 0xC2, /* 0xCC-0xCF */ + 0xB6, 0xC3, 0xB6, 0xC4, 0x8B, 0xD3, 0x8B, 0xD4, /* 0xD0-0xD3 */ + 0x8B, 0xD5, 0x8B, 0xD6, 0x8B, 0xD7, 0x8B, 0xD8, /* 0xD4-0xD7 */ + 0xB6, 0xC5, 0x8B, 0xD9, 0x8B, 0xDA, 0x8B, 0xDB, /* 0xD8-0xDB */ + 0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, 0x8B, 0xDF, /* 0xDC-0xDF */ + 0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, 0x8B, 0xE3, /* 0xE0-0xE3 */ + 0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, 0x8B, 0xE7, /* 0xE4-0xE7 */ + 0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, 0x8B, 0xEB, /* 0xE8-0xEB */ + 0xB6, 0xC6, 0x8B, 0xEC, 0x8B, 0xED, 0x8B, 0xEE, /* 0xEC-0xEF */ + 0x8B, 0xEF, 0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, /* 0xF0-0xF3 */ + 0x8B, 0xF3, 0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, /* 0xF4-0xF7 */ + 0x8B, 0xF7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0xF8-0xFB */ + 0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B6[512] = { + 0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, 0x8C, 0x44, /* 0x00-0x03 */ + 0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, 0x8C, 0x48, /* 0x04-0x07 */ + 0x8C, 0x49, 0x8C, 0x4A, 0x8C, 0x4B, 0x8C, 0x4C, /* 0x08-0x0B */ + 0x8C, 0x4D, 0x8C, 0x4E, 0x8C, 0x4F, 0x8C, 0x50, /* 0x0C-0x0F */ + 0xB6, 0xC7, 0xB6, 0xC8, 0x8C, 0x51, 0x8C, 0x52, /* 0x10-0x13 */ + 0xB6, 0xC9, 0x8C, 0x53, 0x8C, 0x54, 0x8C, 0x55, /* 0x14-0x17 */ + 0xB6, 0xCA, 0x8C, 0x56, 0x8C, 0x57, 0x8C, 0x58, /* 0x18-0x1B */ + 0x8C, 0x59, 0x8C, 0x5A, 0x8C, 0x61, 0x8C, 0x62, /* 0x1C-0x1F */ + 0x8C, 0x63, 0x8C, 0x64, 0x8C, 0x65, 0x8C, 0x66, /* 0x20-0x23 */ + 0x8C, 0x67, 0xB6, 0xCB, 0x8C, 0x68, 0x8C, 0x69, /* 0x24-0x27 */ + 0x8C, 0x6A, 0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, /* 0x28-0x2B */ + 0xB6, 0xCC, 0x8C, 0x6E, 0x8C, 0x6F, 0x8C, 0x70, /* 0x2C-0x2F */ + 0x8C, 0x71, 0x8C, 0x72, 0x8C, 0x73, 0x8C, 0x74, /* 0x30-0x33 */ + 0xB6, 0xCD, 0x8C, 0x75, 0x8C, 0x76, 0x8C, 0x77, /* 0x34-0x37 */ + 0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x81, /* 0x38-0x3B */ + 0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, 0x8C, 0x85, /* 0x3C-0x3F */ + 0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, 0x8C, 0x89, /* 0x40-0x43 */ + 0x8C, 0x8A, 0x8C, 0x8B, 0x8C, 0x8C, 0x8C, 0x8D, /* 0x44-0x47 */ + 0xB6, 0xCE, 0x8C, 0x8E, 0x8C, 0x8F, 0x8C, 0x90, /* 0x48-0x4B */ + 0x8C, 0x91, 0x8C, 0x92, 0x8C, 0x93, 0x8C, 0x94, /* 0x4C-0x4F */ + 0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, 0x8C, 0x98, /* 0x50-0x53 */ + 0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, 0x8C, 0x9C, /* 0x54-0x57 */ + 0x8C, 0x9D, 0x8C, 0x9E, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x58-0x5B */ + 0x8C, 0xA1, 0x8C, 0xA2, 0x8C, 0xA3, 0x8C, 0xA4, /* 0x5C-0x5F */ + 0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0x8C, 0xA8, /* 0x60-0x63 */ + 0xB6, 0xCF, 0x8C, 0xA9, 0x8C, 0xAA, 0x8C, 0xAB, /* 0x64-0x67 */ + 0xB6, 0xD0, 0x8C, 0xAC, 0x8C, 0xAD, 0x8C, 0xAE, /* 0x68-0x6B */ + 0x8C, 0xAF, 0x8C, 0xB0, 0x8C, 0xB1, 0x8C, 0xB2, /* 0x6C-0x6F */ + 0x8C, 0xB3, 0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, /* 0x70-0x73 */ + 0x8C, 0xB7, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x74-0x77 */ + 0x8C, 0xBB, 0x8C, 0xBC, 0x8C, 0xBD, 0x8C, 0xBE, /* 0x78-0x7B */ + 0x8C, 0xBF, 0x8C, 0xC0, 0x8C, 0xC1, 0x8C, 0xC2, /* 0x7C-0x7F */ + + 0x8C, 0xC3, 0x8C, 0xC4, 0x8C, 0xC5, 0x8C, 0xC6, /* 0x80-0x83 */ + 0x8C, 0xC7, 0x8C, 0xC8, 0x8C, 0xC9, 0x8C, 0xCA, /* 0x84-0x87 */ + 0x8C, 0xCB, 0x8C, 0xCC, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x88-0x8B */ + 0x8C, 0xCF, 0x8C, 0xD0, 0x8C, 0xD1, 0x8C, 0xD2, /* 0x8C-0x8F */ + 0x8C, 0xD3, 0x8C, 0xD4, 0x8C, 0xD5, 0x8C, 0xD6, /* 0x90-0x93 */ + 0x8C, 0xD7, 0x8C, 0xD8, 0x8C, 0xD9, 0x8C, 0xDA, /* 0x94-0x97 */ + 0x8C, 0xDB, 0x8C, 0xDC, 0x8C, 0xDD, 0x8C, 0xDE, /* 0x98-0x9B */ + 0xB6, 0xD1, 0xB6, 0xD2, 0x8C, 0xDF, 0x8C, 0xE0, /* 0x9C-0x9F */ + 0xB6, 0xD3, 0x8C, 0xE1, 0x8C, 0xE2, 0x8C, 0xE3, /* 0xA0-0xA3 */ + 0xB6, 0xD4, 0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, /* 0xA4-0xA7 */ + 0x8C, 0xE7, 0x8C, 0xE8, 0x8C, 0xE9, 0xB6, 0xD5, /* 0xA8-0xAB */ + 0xB6, 0xD6, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0xAC-0xAF */ + 0x8C, 0xED, 0xB6, 0xD7, 0x8C, 0xEE, 0x8C, 0xEF, /* 0xB0-0xB3 */ + 0x8C, 0xF0, 0x8C, 0xF1, 0x8C, 0xF2, 0x8C, 0xF3, /* 0xB4-0xB7 */ + 0x8C, 0xF4, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0xB8-0xBB */ + 0x8C, 0xF8, 0x8C, 0xF9, 0x8C, 0xFA, 0x8C, 0xFB, /* 0xBC-0xBF */ + 0x8C, 0xFC, 0x8C, 0xFD, 0x8C, 0xFE, 0x8D, 0x41, /* 0xC0-0xC3 */ + 0x8D, 0x42, 0x8D, 0x43, 0x8D, 0x44, 0x8D, 0x45, /* 0xC4-0xC7 */ + 0x8D, 0x46, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xC8-0xCB */ + 0x8D, 0x4A, 0x8D, 0x4B, 0x8D, 0x4C, 0x8D, 0x4D, /* 0xCC-0xCF */ + 0x8D, 0x4E, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xD0-0xD3 */ + 0xB6, 0xD8, 0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, /* 0xD4-0xD7 */ + 0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xD8-0xDB */ + 0x8D, 0x59, 0x8D, 0x5A, 0x8D, 0x61, 0x8D, 0x62, /* 0xDC-0xDF */ + 0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xE0-0xE3 */ + 0x8D, 0x67, 0x8D, 0x68, 0x8D, 0x69, 0x8D, 0x6A, /* 0xE4-0xE7 */ + 0x8D, 0x6B, 0x8D, 0x6C, 0x8D, 0x6D, 0x8D, 0x6E, /* 0xE8-0xEB */ + 0x8D, 0x6F, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xEC-0xEF */ + 0xB6, 0xD9, 0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, /* 0xF0-0xF3 */ + 0xB6, 0xDA, 0x8D, 0x76, 0x8D, 0x77, 0x8D, 0x78, /* 0xF4-0xF7 */ + 0xB6, 0xDB, 0x8D, 0x79, 0x8D, 0x7A, 0x8D, 0x81, /* 0xF8-0xFB */ + 0x8D, 0x82, 0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B7[512] = { + 0xB6, 0xDC, 0xB6, 0xDD, 0x8D, 0x86, 0x8D, 0x87, /* 0x00-0x03 */ + 0x8D, 0x88, 0xB6, 0xDE, 0x8D, 0x89, 0x8D, 0x8A, /* 0x04-0x07 */ + 0x8D, 0x8B, 0x8D, 0x8C, 0x8D, 0x8D, 0x8D, 0x8E, /* 0x08-0x0B */ + 0x8D, 0x8F, 0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, /* 0x0C-0x0F */ + 0x8D, 0x93, 0x8D, 0x94, 0x8D, 0x95, 0x8D, 0x96, /* 0x10-0x13 */ + 0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, 0x8D, 0x9A, /* 0x14-0x17 */ + 0x8D, 0x9B, 0x8D, 0x9C, 0x8D, 0x9D, 0x8D, 0x9E, /* 0x18-0x1B */ + 0x8D, 0x9F, 0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, /* 0x1C-0x1F */ + 0x8D, 0xA3, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x20-0x23 */ + 0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x24-0x27 */ + 0xB6, 0xDF, 0xB6, 0xE0, 0x8D, 0xAB, 0x8D, 0xAC, /* 0x28-0x2B */ + 0xB6, 0xE1, 0x8D, 0xAD, 0x8D, 0xAE, 0xB6, 0xE2, /* 0x2C-0x2F */ + 0xB6, 0xE3, 0x8D, 0xAF, 0x8D, 0xB0, 0x8D, 0xB1, /* 0x30-0x33 */ + 0x8D, 0xB2, 0x8D, 0xB3, 0x8D, 0xB4, 0x8D, 0xB5, /* 0x34-0x37 */ + 0xB6, 0xE4, 0xB6, 0xE5, 0x8D, 0xB6, 0xB6, 0xE6, /* 0x38-0x3B */ + 0x8D, 0xB7, 0x8D, 0xB8, 0x8D, 0xB9, 0x8D, 0xBA, /* 0x3C-0x3F */ + 0x8D, 0xBB, 0x8D, 0xBC, 0x8D, 0xBD, 0x8D, 0xBE, /* 0x40-0x43 */ + 0xB6, 0xE7, 0x8D, 0xBF, 0x8D, 0xC0, 0x8D, 0xC1, /* 0x44-0x47 */ + 0xB6, 0xE8, 0x8D, 0xC2, 0x8D, 0xC3, 0x8D, 0xC4, /* 0x48-0x4B */ + 0xB6, 0xE9, 0x8D, 0xC5, 0x8D, 0xC6, 0x8D, 0xC7, /* 0x4C-0x4F */ + 0x8D, 0xC8, 0x8D, 0xC9, 0x8D, 0xCA, 0x8D, 0xCB, /* 0x50-0x53 */ + 0xB6, 0xEA, 0xB6, 0xEB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x54-0x57 */ + 0x8D, 0xCE, 0x8D, 0xCF, 0x8D, 0xD0, 0x8D, 0xD1, /* 0x58-0x5B */ + 0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x5C-0x5F */ + 0xB6, 0xEC, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x60-0x63 */ + 0xB6, 0xED, 0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, /* 0x64-0x67 */ + 0xB6, 0xEE, 0x8D, 0xDC, 0x8D, 0xDD, 0x8D, 0xDE, /* 0x68-0x6B */ + 0x8D, 0xDF, 0x8D, 0xE0, 0x8D, 0xE1, 0x8D, 0xE2, /* 0x6C-0x6F */ + 0xB6, 0xEF, 0xB6, 0xF0, 0x8D, 0xE3, 0xB6, 0xF1, /* 0x70-0x73 */ + 0x8D, 0xE4, 0xB6, 0xF2, 0x8D, 0xE5, 0x8D, 0xE6, /* 0x74-0x77 */ + 0x8D, 0xE7, 0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, /* 0x78-0x7B */ + 0xB6, 0xF3, 0xB6, 0xF4, 0x8D, 0xEB, 0x8D, 0xEC, /* 0x7C-0x7F */ + + 0xB6, 0xF5, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x80-0x83 */ + 0xB6, 0xF6, 0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, /* 0x84-0x87 */ + 0x8D, 0xF3, 0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, /* 0x88-0x8B */ + 0xB6, 0xF7, 0xB6, 0xF8, 0x8D, 0xF7, 0xB6, 0xF9, /* 0x8C-0x8F */ + 0xB6, 0xFA, 0xB6, 0xFB, 0xB6, 0xFC, 0x8D, 0xF8, /* 0x90-0x93 */ + 0x8D, 0xF9, 0x8D, 0xFA, 0xB6, 0xFD, 0xB6, 0xFE, /* 0x94-0x97 */ + 0xB7, 0xA1, 0xB7, 0xA2, 0x8D, 0xFB, 0x8D, 0xFC, /* 0x98-0x9B */ + 0xB7, 0xA3, 0x8D, 0xFD, 0x8D, 0xFE, 0x8E, 0x41, /* 0x9C-0x9F */ + 0xB7, 0xA4, 0x8E, 0x42, 0x8E, 0x43, 0x8E, 0x44, /* 0xA0-0xA3 */ + 0x8E, 0x45, 0x8E, 0x46, 0x8E, 0x47, 0x8E, 0x48, /* 0xA4-0xA7 */ + 0xB7, 0xA5, 0xB7, 0xA6, 0x8E, 0x49, 0xB7, 0xA7, /* 0xA8-0xAB */ + 0xB7, 0xA8, 0xB7, 0xA9, 0x8E, 0x4A, 0x8E, 0x4B, /* 0xAC-0xAF */ + 0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, 0x8E, 0x4F, /* 0xB0-0xB3 */ + 0xB7, 0xAA, 0xB7, 0xAB, 0x8E, 0x50, 0x8E, 0x51, /* 0xB4-0xB7 */ + 0xB7, 0xAC, 0x8E, 0x52, 0x8E, 0x53, 0x8E, 0x54, /* 0xB8-0xBB */ + 0x8E, 0x55, 0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, /* 0xBC-0xBF */ + 0x8E, 0x59, 0x8E, 0x5A, 0x8E, 0x61, 0x8E, 0x62, /* 0xC0-0xC3 */ + 0x8E, 0x63, 0x8E, 0x64, 0x8E, 0x65, 0xB7, 0xAD, /* 0xC4-0xC7 */ + 0x8E, 0x66, 0xB7, 0xAE, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */ + 0x8E, 0x69, 0x8E, 0x6A, 0x8E, 0x6B, 0x8E, 0x6C, /* 0xCC-0xCF */ + 0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, 0x8E, 0x70, /* 0xD0-0xD3 */ + 0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, 0x8E, 0x74, /* 0xD4-0xD7 */ + 0x8E, 0x75, 0x8E, 0x76, 0x8E, 0x77, 0x8E, 0x78, /* 0xD8-0xDB */ + 0x8E, 0x79, 0x8E, 0x7A, 0x8E, 0x81, 0x8E, 0x82, /* 0xDC-0xDF */ + 0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xE0-0xE3 */ + 0x8E, 0x87, 0x8E, 0x88, 0x8E, 0x89, 0x8E, 0x8A, /* 0xE4-0xE7 */ + 0x8E, 0x8B, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0xE8-0xEB */ + 0xB7, 0xAF, 0xB7, 0xB0, 0x8E, 0x8F, 0x8E, 0x90, /* 0xEC-0xEF */ + 0xB7, 0xB1, 0x8E, 0x91, 0x8E, 0x92, 0x8E, 0x93, /* 0xF0-0xF3 */ + 0xB7, 0xB2, 0x8E, 0x94, 0x8E, 0x95, 0x8E, 0x96, /* 0xF4-0xF7 */ + 0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, 0x8E, 0x9A, /* 0xF8-0xFB */ + 0xB7, 0xB3, 0xB7, 0xB4, 0x8E, 0x9B, 0xB7, 0xB5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B8[512] = { + 0xB7, 0xB6, 0xB7, 0xB7, 0x8E, 0x9C, 0x8E, 0x9D, /* 0x00-0x03 */ + 0x8E, 0x9E, 0x8E, 0x9F, 0x8E, 0xA0, 0xB7, 0xB8, /* 0x04-0x07 */ + 0xB7, 0xB9, 0xB7, 0xBA, 0x8E, 0xA1, 0x8E, 0xA2, /* 0x08-0x0B */ + 0xB7, 0xBB, 0x8E, 0xA3, 0x8E, 0xA4, 0x8E, 0xA5, /* 0x0C-0x0F */ + 0xB7, 0xBC, 0x8E, 0xA6, 0x8E, 0xA7, 0x8E, 0xA8, /* 0x10-0x13 */ + 0x8E, 0xA9, 0x8E, 0xAA, 0x8E, 0xAB, 0x8E, 0xAC, /* 0x14-0x17 */ + 0xB7, 0xBD, 0xB7, 0xBE, 0x8E, 0xAD, 0xB7, 0xBF, /* 0x18-0x1B */ + 0x8E, 0xAE, 0xB7, 0xC0, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x1C-0x1F */ + 0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x20-0x23 */ + 0xB7, 0xC1, 0xB7, 0xC2, 0x8E, 0xB5, 0x8E, 0xB6, /* 0x24-0x27 */ + 0xB7, 0xC3, 0x8E, 0xB7, 0x8E, 0xB8, 0x8E, 0xB9, /* 0x28-0x2B */ + 0xB7, 0xC4, 0x8E, 0xBA, 0x8E, 0xBB, 0x8E, 0xBC, /* 0x2C-0x2F */ + 0x8E, 0xBD, 0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, /* 0x30-0x33 */ + 0xB7, 0xC5, 0xB7, 0xC6, 0x8E, 0xC1, 0xB7, 0xC7, /* 0x34-0x37 */ + 0xB7, 0xC8, 0xB7, 0xC9, 0x8E, 0xC2, 0x8E, 0xC3, /* 0x38-0x3B */ + 0x8E, 0xC4, 0x8E, 0xC5, 0x8E, 0xC6, 0x8E, 0xC7, /* 0x3C-0x3F */ + 0xB7, 0xCA, 0x8E, 0xC8, 0x8E, 0xC9, 0x8E, 0xCA, /* 0x40-0x43 */ + 0xB7, 0xCB, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x44-0x47 */ + 0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x48-0x4B */ + 0x8E, 0xD2, 0x8E, 0xD3, 0x8E, 0xD4, 0x8E, 0xD5, /* 0x4C-0x4F */ + 0x8E, 0xD6, 0xB7, 0xCC, 0x8E, 0xD7, 0xB7, 0xCD, /* 0x50-0x53 */ + 0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, 0x8E, 0xDB, /* 0x54-0x57 */ + 0x8E, 0xDC, 0x8E, 0xDD, 0x8E, 0xDE, 0x8E, 0xDF, /* 0x58-0x5B */ + 0xB7, 0xCE, 0xB7, 0xCF, 0x8E, 0xE0, 0x8E, 0xE1, /* 0x5C-0x5F */ + 0xB7, 0xD0, 0x8E, 0xE2, 0x8E, 0xE3, 0x8E, 0xE4, /* 0x60-0x63 */ + 0xB7, 0xD1, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0x64-0x67 */ + 0x8E, 0xE8, 0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, /* 0x68-0x6B */ + 0xB7, 0xD2, 0xB7, 0xD3, 0x8E, 0xEC, 0xB7, 0xD4, /* 0x6C-0x6F */ + 0x8E, 0xED, 0xB7, 0xD5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0x70-0x73 */ + 0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0x8E, 0xF3, /* 0x74-0x77 */ + 0xB7, 0xD6, 0x8E, 0xF4, 0x8E, 0xF5, 0x8E, 0xF6, /* 0x78-0x7B */ + 0xB7, 0xD7, 0x8E, 0xF7, 0x8E, 0xF8, 0x8E, 0xF9, /* 0x7C-0x7F */ + + 0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, 0x8E, 0xFD, /* 0x80-0x83 */ + 0x8E, 0xFE, 0x8F, 0x41, 0x8F, 0x42, 0x8F, 0x43, /* 0x84-0x87 */ + 0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0x88-0x8B */ + 0x8F, 0x48, 0xB7, 0xD8, 0x8F, 0x49, 0x8F, 0x4A, /* 0x8C-0x8F */ + 0x8F, 0x4B, 0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, /* 0x90-0x93 */ + 0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0x94-0x97 */ + 0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0x98-0x9B */ + 0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0x9C-0x9F */ + 0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xA0-0xA3 */ + 0x8F, 0x65, 0x8F, 0x66, 0x8F, 0x67, 0x8F, 0x68, /* 0xA4-0xA7 */ + 0xB7, 0xD9, 0x8F, 0x69, 0x8F, 0x6A, 0x8F, 0x6B, /* 0xA8-0xAB */ + 0x8F, 0x6C, 0x8F, 0x6D, 0x8F, 0x6E, 0x8F, 0x6F, /* 0xAC-0xAF */ + 0xB7, 0xDA, 0x8F, 0x70, 0x8F, 0x71, 0x8F, 0x72, /* 0xB0-0xB3 */ + 0xB7, 0xDB, 0x8F, 0x73, 0x8F, 0x74, 0x8F, 0x75, /* 0xB4-0xB7 */ + 0xB7, 0xDC, 0x8F, 0x76, 0x8F, 0x77, 0x8F, 0x78, /* 0xB8-0xBB */ + 0x8F, 0x79, 0x8F, 0x7A, 0x8F, 0x81, 0x8F, 0x82, /* 0xBC-0xBF */ + 0xB7, 0xDD, 0xB7, 0xDE, 0x8F, 0x83, 0xB7, 0xDF, /* 0xC0-0xC3 */ + 0x8F, 0x84, 0xB7, 0xE0, 0x8F, 0x85, 0x8F, 0x86, /* 0xC4-0xC7 */ + 0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0xC8-0xCB */ + 0xB7, 0xE1, 0x8F, 0x8B, 0x8F, 0x8C, 0x8F, 0x8D, /* 0xCC-0xCF */ + 0xB7, 0xE2, 0x8F, 0x8E, 0x8F, 0x8F, 0x8F, 0x90, /* 0xD0-0xD3 */ + 0xB7, 0xE3, 0x8F, 0x91, 0x8F, 0x92, 0x8F, 0x93, /* 0xD4-0xD7 */ + 0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, 0x8F, 0x97, /* 0xD8-0xDB */ + 0x8F, 0x98, 0xB7, 0xE4, 0x8F, 0x99, 0xB7, 0xE5, /* 0xDC-0xDF */ + 0x8F, 0x9A, 0xB7, 0xE6, 0x8F, 0x9B, 0x8F, 0x9C, /* 0xE0-0xE3 */ + 0x8F, 0x9D, 0x8F, 0x9E, 0x8F, 0x9F, 0x8F, 0xA0, /* 0xE4-0xE7 */ + 0xB7, 0xE7, 0xB7, 0xE8, 0x8F, 0xA1, 0x8F, 0xA2, /* 0xE8-0xEB */ + 0xB7, 0xE9, 0x8F, 0xA3, 0x8F, 0xA4, 0x8F, 0xA5, /* 0xEC-0xEF */ + 0xB7, 0xEA, 0x8F, 0xA6, 0x8F, 0xA7, 0x8F, 0xA8, /* 0xF0-0xF3 */ + 0x8F, 0xA9, 0x8F, 0xAA, 0x8F, 0xAB, 0x8F, 0xAC, /* 0xF4-0xF7 */ + 0xB7, 0xEB, 0xB7, 0xEC, 0x8F, 0xAD, 0xB7, 0xED, /* 0xF8-0xFB */ + 0x8F, 0xAE, 0xB7, 0xEE, 0x8F, 0xAF, 0x8F, 0xB0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B9[512] = { + 0x8F, 0xB1, 0x8F, 0xB2, 0x8F, 0xB3, 0x8F, 0xB4, /* 0x00-0x03 */ + 0xB7, 0xEF, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x04-0x07 */ + 0x8F, 0xB8, 0x8F, 0xB9, 0x8F, 0xBA, 0x8F, 0xBB, /* 0x08-0x0B */ + 0x8F, 0xBC, 0x8F, 0xBD, 0x8F, 0xBE, 0x8F, 0xBF, /* 0x0C-0x0F */ + 0x8F, 0xC0, 0x8F, 0xC1, 0x8F, 0xC2, 0x8F, 0xC3, /* 0x10-0x13 */ + 0x8F, 0xC4, 0x8F, 0xC5, 0x8F, 0xC6, 0x8F, 0xC7, /* 0x14-0x17 */ + 0xB7, 0xF0, 0x8F, 0xC8, 0x8F, 0xC9, 0x8F, 0xCA, /* 0x18-0x1B */ + 0x8F, 0xCB, 0x8F, 0xCC, 0x8F, 0xCD, 0x8F, 0xCE, /* 0x1C-0x1F */ + 0xB7, 0xF1, 0x8F, 0xCF, 0x8F, 0xD0, 0x8F, 0xD1, /* 0x20-0x23 */ + 0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, 0x8F, 0xD5, /* 0x24-0x27 */ + 0x8F, 0xD6, 0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, /* 0x28-0x2B */ + 0x8F, 0xDA, 0x8F, 0xDB, 0x8F, 0xDC, 0x8F, 0xDD, /* 0x2C-0x2F */ + 0x8F, 0xDE, 0x8F, 0xDF, 0x8F, 0xE0, 0x8F, 0xE1, /* 0x30-0x33 */ + 0x8F, 0xE2, 0x8F, 0xE3, 0x8F, 0xE4, 0x8F, 0xE5, /* 0x34-0x37 */ + 0x8F, 0xE6, 0x8F, 0xE7, 0x8F, 0xE8, 0x8F, 0xE9, /* 0x38-0x3B */ + 0xB7, 0xF2, 0xB7, 0xF3, 0x8F, 0xEA, 0x8F, 0xEB, /* 0x3C-0x3F */ + 0xB7, 0xF4, 0x8F, 0xEC, 0x8F, 0xED, 0x8F, 0xEE, /* 0x40-0x43 */ + 0xB7, 0xF5, 0x8F, 0xEF, 0x8F, 0xF0, 0x8F, 0xF1, /* 0x44-0x47 */ + 0x8F, 0xF2, 0x8F, 0xF3, 0x8F, 0xF4, 0x8F, 0xF5, /* 0x48-0x4B */ + 0xB7, 0xF6, 0x8F, 0xF6, 0x8F, 0xF7, 0xB7, 0xF7, /* 0x4C-0x4F */ + 0x8F, 0xF8, 0xB7, 0xF8, 0x8F, 0xF9, 0x8F, 0xFA, /* 0x50-0x53 */ + 0x8F, 0xFB, 0x8F, 0xFC, 0x8F, 0xFD, 0x8F, 0xFE, /* 0x54-0x57 */ + 0xB7, 0xF9, 0xB7, 0xFA, 0x90, 0x41, 0x90, 0x42, /* 0x58-0x5B */ + 0xB7, 0xFB, 0x90, 0x43, 0x90, 0x44, 0x90, 0x45, /* 0x5C-0x5F */ + 0xB7, 0xFC, 0x90, 0x46, 0x90, 0x47, 0x90, 0x48, /* 0x60-0x63 */ + 0x90, 0x49, 0x90, 0x4A, 0x90, 0x4B, 0x90, 0x4C, /* 0x64-0x67 */ + 0xB7, 0xFD, 0xB7, 0xFE, 0x90, 0x4D, 0xB8, 0xA1, /* 0x68-0x6B */ + 0x90, 0x4E, 0xB8, 0xA2, 0x90, 0x4F, 0x90, 0x50, /* 0x6C-0x6F */ + 0x90, 0x51, 0x90, 0x52, 0x90, 0x53, 0x90, 0x54, /* 0x70-0x73 */ + 0xB8, 0xA3, 0xB8, 0xA4, 0x90, 0x55, 0x90, 0x56, /* 0x74-0x77 */ + 0xB8, 0xA5, 0x90, 0x57, 0x90, 0x58, 0x90, 0x59, /* 0x78-0x7B */ + 0xB8, 0xA6, 0x90, 0x5A, 0x90, 0x61, 0x90, 0x62, /* 0x7C-0x7F */ + + 0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0x90, 0x66, /* 0x80-0x83 */ + 0xB8, 0xA7, 0xB8, 0xA8, 0x90, 0x67, 0xB8, 0xA9, /* 0x84-0x87 */ + 0x90, 0x68, 0xB8, 0xAA, 0xB8, 0xAB, 0x90, 0x69, /* 0x88-0x8B */ + 0x90, 0x6A, 0xB8, 0xAC, 0xB8, 0xAD, 0x90, 0x6B, /* 0x8C-0x8F */ + 0x90, 0x6C, 0x90, 0x6D, 0x90, 0x6E, 0x90, 0x6F, /* 0x90-0x93 */ + 0x90, 0x70, 0x90, 0x71, 0x90, 0x72, 0x90, 0x73, /* 0x94-0x97 */ + 0x90, 0x74, 0x90, 0x75, 0x90, 0x76, 0x90, 0x77, /* 0x98-0x9B */ + 0x90, 0x78, 0x90, 0x79, 0x90, 0x7A, 0x90, 0x81, /* 0x9C-0x9F */ + 0x90, 0x82, 0x90, 0x83, 0x90, 0x84, 0x90, 0x85, /* 0xA0-0xA3 */ + 0x90, 0x86, 0x90, 0x87, 0x90, 0x88, 0x90, 0x89, /* 0xA4-0xA7 */ + 0x90, 0x8A, 0x90, 0x8B, 0x90, 0x8C, 0x90, 0x8D, /* 0xA8-0xAB */ + 0xB8, 0xAE, 0xB8, 0xAF, 0x90, 0x8E, 0x90, 0x8F, /* 0xAC-0xAF */ + 0xB8, 0xB0, 0x90, 0x90, 0x90, 0x91, 0x90, 0x92, /* 0xB0-0xB3 */ + 0xB8, 0xB1, 0x90, 0x93, 0x90, 0x94, 0x90, 0x95, /* 0xB4-0xB7 */ + 0x90, 0x96, 0x90, 0x97, 0x90, 0x98, 0x90, 0x99, /* 0xB8-0xBB */ + 0xB8, 0xB2, 0xB8, 0xB3, 0x90, 0x9A, 0xB8, 0xB4, /* 0xBC-0xBF */ + 0x90, 0x9B, 0xB8, 0xB5, 0x90, 0x9C, 0x90, 0x9D, /* 0xC0-0xC3 */ + 0x90, 0x9E, 0x90, 0x9F, 0x90, 0xA0, 0x90, 0xA1, /* 0xC4-0xC7 */ + 0xB8, 0xB6, 0xB8, 0xB7, 0x90, 0xA2, 0x90, 0xA3, /* 0xC8-0xCB */ + 0xB8, 0xB8, 0x90, 0xA4, 0xB8, 0xB9, 0xB8, 0xBA, /* 0xCC-0xCF */ + 0xB8, 0xBB, 0xB8, 0xBC, 0xB8, 0xBD, 0x90, 0xA5, /* 0xD0-0xD3 */ + 0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, 0x90, 0xA9, /* 0xD4-0xD7 */ + 0xB8, 0xBE, 0xB8, 0xBF, 0x90, 0xAA, 0xB8, 0xC0, /* 0xD8-0xDB */ + 0x90, 0xAB, 0xB8, 0xC1, 0xB8, 0xC2, 0x90, 0xAC, /* 0xDC-0xDF */ + 0x90, 0xAD, 0xB8, 0xC3, 0x90, 0xAE, 0xB8, 0xC4, /* 0xE0-0xE3 */ + 0xB8, 0xC5, 0xB8, 0xC6, 0x90, 0xAF, 0x90, 0xB0, /* 0xE4-0xE7 */ + 0xB8, 0xC7, 0x90, 0xB1, 0x90, 0xB2, 0x90, 0xB3, /* 0xE8-0xEB */ + 0xB8, 0xC8, 0x90, 0xB4, 0x90, 0xB5, 0x90, 0xB6, /* 0xEC-0xEF */ + 0x90, 0xB7, 0x90, 0xB8, 0x90, 0xB9, 0x90, 0xBA, /* 0xF0-0xF3 */ + 0xB8, 0xC9, 0xB8, 0xCA, 0x90, 0xBB, 0xB8, 0xCB, /* 0xF4-0xF7 */ + 0xB8, 0xCC, 0xB8, 0xCD, 0xB8, 0xCE, 0x90, 0xBC, /* 0xF8-0xFB */ + 0x90, 0xBD, 0x90, 0xBE, 0x90, 0xBF, 0x90, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BA[512] = { + 0xB8, 0xCF, 0xB8, 0xD0, 0x90, 0xC1, 0x90, 0xC2, /* 0x00-0x03 */ + 0x90, 0xC3, 0x90, 0xC4, 0x90, 0xC5, 0x90, 0xC6, /* 0x04-0x07 */ + 0xB8, 0xD1, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0x08-0x0B */ + 0x90, 0xCA, 0x90, 0xCB, 0x90, 0xCC, 0x90, 0xCD, /* 0x0C-0x0F */ + 0x90, 0xCE, 0x90, 0xCF, 0x90, 0xD0, 0x90, 0xD1, /* 0x10-0x13 */ + 0x90, 0xD2, 0xB8, 0xD2, 0x90, 0xD3, 0x90, 0xD4, /* 0x14-0x17 */ + 0x90, 0xD5, 0x90, 0xD6, 0x90, 0xD7, 0x90, 0xD8, /* 0x18-0x1B */ + 0x90, 0xD9, 0x90, 0xDA, 0x90, 0xDB, 0x90, 0xDC, /* 0x1C-0x1F */ + 0x90, 0xDD, 0x90, 0xDE, 0x90, 0xDF, 0x90, 0xE0, /* 0x20-0x23 */ + 0x90, 0xE1, 0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, /* 0x24-0x27 */ + 0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x28-0x2B */ + 0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x2C-0x2F */ + 0x90, 0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x30-0x33 */ + 0x90, 0xF1, 0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x34-0x37 */ + 0xB8, 0xD3, 0xB8, 0xD4, 0x90, 0xF5, 0x90, 0xF6, /* 0x38-0x3B */ + 0xB8, 0xD5, 0x90, 0xF7, 0x90, 0xF8, 0x90, 0xF9, /* 0x3C-0x3F */ + 0xB8, 0xD6, 0x90, 0xFA, 0xB8, 0xD7, 0x90, 0xFB, /* 0x40-0x43 */ + 0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x41, /* 0x44-0x47 */ + 0xB8, 0xD8, 0xB8, 0xD9, 0x91, 0x42, 0xB8, 0xDA, /* 0x48-0x4B */ + 0x91, 0x43, 0xB8, 0xDB, 0xB8, 0xDC, 0x91, 0x44, /* 0x4C-0x4F */ + 0x91, 0x45, 0x91, 0x46, 0x91, 0x47, 0xB8, 0xDD, /* 0x50-0x53 */ + 0xB8, 0xDE, 0xB8, 0xDF, 0x91, 0x48, 0x91, 0x49, /* 0x54-0x57 */ + 0xB8, 0xE0, 0x91, 0x4A, 0x91, 0x4B, 0x91, 0x4C, /* 0x58-0x5B */ + 0xB8, 0xE1, 0x91, 0x4D, 0x91, 0x4E, 0x91, 0x4F, /* 0x5C-0x5F */ + 0x91, 0x50, 0x91, 0x51, 0x91, 0x52, 0x91, 0x53, /* 0x60-0x63 */ + 0xB8, 0xE2, 0xB8, 0xE3, 0x91, 0x54, 0xB8, 0xE4, /* 0x64-0x67 */ + 0xB8, 0xE5, 0xB8, 0xE6, 0x91, 0x55, 0x91, 0x56, /* 0x68-0x6B */ + 0x91, 0x57, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x6C-0x6F */ + 0xB8, 0xE7, 0xB8, 0xE8, 0x91, 0x61, 0x91, 0x62, /* 0x70-0x73 */ + 0xB8, 0xE9, 0x91, 0x63, 0x91, 0x64, 0x91, 0x65, /* 0x74-0x77 */ + 0xB8, 0xEA, 0x91, 0x66, 0x91, 0x67, 0x91, 0x68, /* 0x78-0x7B */ + 0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, 0x91, 0x6C, /* 0x7C-0x7F */ + + 0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, 0xB8, 0xEB, /* 0x80-0x83 */ + 0xB8, 0xEC, 0xB8, 0xED, 0x91, 0x70, 0xB8, 0xEE, /* 0x84-0x87 */ + 0x91, 0x71, 0x91, 0x72, 0x91, 0x73, 0x91, 0x74, /* 0x88-0x8B */ + 0xB8, 0xEF, 0x91, 0x75, 0x91, 0x76, 0x91, 0x77, /* 0x8C-0x8F */ + 0x91, 0x78, 0x91, 0x79, 0x91, 0x7A, 0x91, 0x81, /* 0x90-0x93 */ + 0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x94-0x97 */ + 0x91, 0x86, 0x91, 0x87, 0x91, 0x88, 0x91, 0x89, /* 0x98-0x9B */ + 0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, 0x91, 0x8D, /* 0x9C-0x9F */ + 0x91, 0x8E, 0x91, 0x8F, 0x91, 0x90, 0x91, 0x91, /* 0xA0-0xA3 */ + 0x91, 0x92, 0x91, 0x93, 0x91, 0x94, 0x91, 0x95, /* 0xA4-0xA7 */ + 0xB8, 0xF0, 0xB8, 0xF1, 0x91, 0x96, 0xB8, 0xF2, /* 0xA8-0xAB */ + 0xB8, 0xF3, 0x91, 0x97, 0x91, 0x98, 0x91, 0x99, /* 0xAC-0xAF */ + 0xB8, 0xF4, 0x91, 0x9A, 0xB8, 0xF5, 0x91, 0x9B, /* 0xB0-0xB3 */ + 0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB4-0xB7 */ + 0xB8, 0xF6, 0xB8, 0xF7, 0x91, 0xA0, 0xB8, 0xF8, /* 0xB8-0xBB */ + 0x91, 0xA1, 0xB8, 0xF9, 0x91, 0xA2, 0x91, 0xA3, /* 0xBC-0xBF */ + 0x91, 0xA4, 0x91, 0xA5, 0x91, 0xA6, 0x91, 0xA7, /* 0xC0-0xC3 */ + 0xB8, 0xFA, 0x91, 0xA8, 0x91, 0xA9, 0x91, 0xAA, /* 0xC4-0xC7 */ + 0xB8, 0xFB, 0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, /* 0xC8-0xCB */ + 0x91, 0xAE, 0x91, 0xAF, 0x91, 0xB0, 0x91, 0xB1, /* 0xCC-0xCF */ + 0x91, 0xB2, 0x91, 0xB3, 0x91, 0xB4, 0x91, 0xB5, /* 0xD0-0xD3 */ + 0x91, 0xB6, 0x91, 0xB7, 0x91, 0xB8, 0x91, 0xB9, /* 0xD4-0xD7 */ + 0xB8, 0xFC, 0xB8, 0xFD, 0x91, 0xBA, 0x91, 0xBB, /* 0xD8-0xDB */ + 0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xDC-0xDF */ + 0x91, 0xC0, 0x91, 0xC1, 0x91, 0xC2, 0x91, 0xC3, /* 0xE0-0xE3 */ + 0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, 0x91, 0xC7, /* 0xE4-0xE7 */ + 0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, 0x91, 0xCB, /* 0xE8-0xEB */ + 0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, 0x91, 0xCF, /* 0xEC-0xEF */ + 0x91, 0xD0, 0x91, 0xD1, 0x91, 0xD2, 0x91, 0xD3, /* 0xF0-0xF3 */ + 0x91, 0xD4, 0x91, 0xD5, 0x91, 0xD6, 0x91, 0xD7, /* 0xF4-0xF7 */ + 0x91, 0xD8, 0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, /* 0xF8-0xFB */ + 0xB8, 0xFE, 0x91, 0xDC, 0x91, 0xDD, 0x91, 0xDE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BB[512] = { + 0xB9, 0xA1, 0x91, 0xDF, 0x91, 0xE0, 0x91, 0xE1, /* 0x00-0x03 */ + 0xB9, 0xA2, 0x91, 0xE2, 0x91, 0xE3, 0x91, 0xE4, /* 0x04-0x07 */ + 0x91, 0xE5, 0x91, 0xE6, 0x91, 0xE7, 0x91, 0xE8, /* 0x08-0x0B */ + 0x91, 0xE9, 0xB9, 0xA3, 0x91, 0xEA, 0xB9, 0xA4, /* 0x0C-0x0F */ + 0x91, 0xEB, 0xB9, 0xA5, 0x91, 0xEC, 0x91, 0xED, /* 0x10-0x13 */ + 0x91, 0xEE, 0x91, 0xEF, 0x91, 0xF0, 0x91, 0xF1, /* 0x14-0x17 */ + 0xB9, 0xA6, 0x91, 0xF2, 0x91, 0xF3, 0x91, 0xF4, /* 0x18-0x1B */ + 0xB9, 0xA7, 0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, /* 0x1C-0x1F */ + 0xB9, 0xA8, 0x91, 0xF8, 0x91, 0xF9, 0x91, 0xFA, /* 0x20-0x23 */ + 0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0x91, 0xFE, /* 0x24-0x27 */ + 0x92, 0x41, 0xB9, 0xA9, 0x92, 0x42, 0xB9, 0xAA, /* 0x28-0x2B */ + 0x92, 0x43, 0x92, 0x44, 0x92, 0x45, 0x92, 0x46, /* 0x2C-0x2F */ + 0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x30-0x33 */ + 0xB9, 0xAB, 0xB9, 0xAC, 0xB9, 0xAD, 0x92, 0x4B, /* 0x34-0x37 */ + 0xB9, 0xAE, 0x92, 0x4C, 0x92, 0x4D, 0xB9, 0xAF, /* 0x38-0x3B */ + 0xB9, 0xB0, 0xB9, 0xB1, 0xB9, 0xB2, 0x92, 0x4E, /* 0x3C-0x3F */ + 0x92, 0x4F, 0x92, 0x50, 0x92, 0x51, 0x92, 0x52, /* 0x40-0x43 */ + 0xB9, 0xB3, 0xB9, 0xB4, 0x92, 0x53, 0xB9, 0xB5, /* 0x44-0x47 */ + 0x92, 0x54, 0xB9, 0xB6, 0x92, 0x55, 0x92, 0x56, /* 0x48-0x4B */ + 0x92, 0x57, 0xB9, 0xB7, 0x92, 0x58, 0xB9, 0xB8, /* 0x4C-0x4F */ + 0xB9, 0xB9, 0x92, 0x59, 0x92, 0x5A, 0x92, 0x61, /* 0x50-0x53 */ + 0xB9, 0xBA, 0x92, 0x62, 0x92, 0x63, 0x92, 0x64, /* 0x54-0x57 */ + 0xB9, 0xBB, 0x92, 0x65, 0x92, 0x66, 0x92, 0x67, /* 0x58-0x5B */ + 0x92, 0x68, 0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, /* 0x5C-0x5F */ + 0x92, 0x6C, 0xB9, 0xBC, 0x92, 0x6D, 0xB9, 0xBD, /* 0x60-0x63 */ + 0x92, 0x6E, 0x92, 0x6F, 0x92, 0x70, 0x92, 0x71, /* 0x64-0x67 */ + 0x92, 0x72, 0x92, 0x73, 0x92, 0x74, 0x92, 0x75, /* 0x68-0x6B */ + 0xB9, 0xBE, 0x92, 0x76, 0x92, 0x77, 0x92, 0x78, /* 0x6C-0x6F */ + 0x92, 0x79, 0x92, 0x7A, 0x92, 0x81, 0x92, 0x82, /* 0x70-0x73 */ + 0x92, 0x83, 0x92, 0x84, 0x92, 0x85, 0x92, 0x86, /* 0x74-0x77 */ + 0x92, 0x87, 0x92, 0x88, 0x92, 0x89, 0x92, 0x8A, /* 0x78-0x7B */ + 0x92, 0x8B, 0x92, 0x8C, 0x92, 0x8D, 0x92, 0x8E, /* 0x7C-0x7F */ + + 0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0x80-0x83 */ + 0x92, 0x93, 0x92, 0x94, 0x92, 0x95, 0x92, 0x96, /* 0x84-0x87 */ + 0xB9, 0xBF, 0x92, 0x97, 0x92, 0x98, 0x92, 0x99, /* 0x88-0x8B */ + 0xB9, 0xC0, 0x92, 0x9A, 0x92, 0x9B, 0x92, 0x9C, /* 0x8C-0x8F */ + 0xB9, 0xC1, 0x92, 0x9D, 0x92, 0x9E, 0x92, 0x9F, /* 0x90-0x93 */ + 0x92, 0xA0, 0x92, 0xA1, 0x92, 0xA2, 0x92, 0xA3, /* 0x94-0x97 */ + 0x92, 0xA4, 0x92, 0xA5, 0x92, 0xA6, 0x92, 0xA7, /* 0x98-0x9B */ + 0x92, 0xA8, 0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, /* 0x9C-0x9F */ + 0x92, 0xAC, 0x92, 0xAD, 0x92, 0xAE, 0x92, 0xAF, /* 0xA0-0xA3 */ + 0xB9, 0xC2, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0xA4-0xA7 */ + 0xB9, 0xC3, 0x92, 0xB3, 0x92, 0xB4, 0x92, 0xB5, /* 0xA8-0xAB */ + 0xB9, 0xC4, 0x92, 0xB6, 0x92, 0xB7, 0x92, 0xB8, /* 0xAC-0xAF */ + 0x92, 0xB9, 0x92, 0xBA, 0x92, 0xBB, 0x92, 0xBC, /* 0xB0-0xB3 */ + 0xB9, 0xC5, 0x92, 0xBD, 0x92, 0xBE, 0xB9, 0xC6, /* 0xB4-0xB7 */ + 0x92, 0xBF, 0x92, 0xC0, 0x92, 0xC1, 0x92, 0xC2, /* 0xB8-0xBB */ + 0x92, 0xC3, 0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, /* 0xBC-0xBF */ + 0xB9, 0xC7, 0x92, 0xC7, 0x92, 0xC8, 0x92, 0xC9, /* 0xC0-0xC3 */ + 0xB9, 0xC8, 0x92, 0xCA, 0x92, 0xCB, 0x92, 0xCC, /* 0xC4-0xC7 */ + 0xB9, 0xC9, 0x92, 0xCD, 0x92, 0xCE, 0x92, 0xCF, /* 0xC8-0xCB */ + 0x92, 0xD0, 0x92, 0xD1, 0x92, 0xD2, 0x92, 0xD3, /* 0xCC-0xCF */ + 0xB9, 0xCA, 0x92, 0xD4, 0x92, 0xD5, 0xB9, 0xCB, /* 0xD0-0xD3 */ + 0x92, 0xD6, 0x92, 0xD7, 0x92, 0xD8, 0x92, 0xD9, /* 0xD4-0xD7 */ + 0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0xD8-0xDB */ + 0x92, 0xDE, 0x92, 0xDF, 0x92, 0xE0, 0x92, 0xE1, /* 0xDC-0xDF */ + 0x92, 0xE2, 0x92, 0xE3, 0x92, 0xE4, 0x92, 0xE5, /* 0xE0-0xE3 */ + 0x92, 0xE6, 0x92, 0xE7, 0x92, 0xE8, 0x92, 0xE9, /* 0xE4-0xE7 */ + 0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, 0x92, 0xED, /* 0xE8-0xEB */ + 0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, 0x92, 0xF1, /* 0xEC-0xEF */ + 0x92, 0xF2, 0x92, 0xF3, 0x92, 0xF4, 0x92, 0xF5, /* 0xF0-0xF3 */ + 0x92, 0xF6, 0x92, 0xF7, 0x92, 0xF8, 0x92, 0xF9, /* 0xF4-0xF7 */ + 0xB9, 0xCC, 0xB9, 0xCD, 0x92, 0xFA, 0x92, 0xFB, /* 0xF8-0xFB */ + 0xB9, 0xCE, 0x92, 0xFC, 0x92, 0xFD, 0xB9, 0xCF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BC[512] = { + 0xB9, 0xD0, 0x92, 0xFE, 0xB9, 0xD1, 0x93, 0x41, /* 0x00-0x03 */ + 0x93, 0x42, 0x93, 0x43, 0x93, 0x44, 0x93, 0x45, /* 0x04-0x07 */ + 0xB9, 0xD2, 0xB9, 0xD3, 0x93, 0x46, 0xB9, 0xD4, /* 0x08-0x0B */ + 0xB9, 0xD5, 0xB9, 0xD6, 0x93, 0x47, 0xB9, 0xD7, /* 0x0C-0x0F */ + 0x93, 0x48, 0xB9, 0xD8, 0x93, 0x49, 0x93, 0x4A, /* 0x10-0x13 */ + 0xB9, 0xD9, 0xB9, 0xDA, 0xB9, 0xDB, 0xB9, 0xDC, /* 0x14-0x17 */ + 0xB9, 0xDD, 0x93, 0x4B, 0x93, 0x4C, 0xB9, 0xDE, /* 0x18-0x1B */ + 0xB9, 0xDF, 0xB9, 0xE0, 0xB9, 0xE1, 0xB9, 0xE2, /* 0x1C-0x1F */ + 0x93, 0x4D, 0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, /* 0x20-0x23 */ + 0xB9, 0xE3, 0xB9, 0xE4, 0x93, 0x51, 0xB9, 0xE5, /* 0x24-0x27 */ + 0x93, 0x52, 0xB9, 0xE6, 0x93, 0x53, 0x93, 0x54, /* 0x28-0x2B */ + 0x93, 0x55, 0xB9, 0xE7, 0x93, 0x56, 0x93, 0x57, /* 0x2C-0x2F */ + 0xB9, 0xE8, 0xB9, 0xE9, 0x93, 0x58, 0x93, 0x59, /* 0x30-0x33 */ + 0xB9, 0xEA, 0x93, 0x5A, 0x93, 0x61, 0x93, 0x62, /* 0x34-0x37 */ + 0xB9, 0xEB, 0x93, 0x63, 0x93, 0x64, 0x93, 0x65, /* 0x38-0x3B */ + 0x93, 0x66, 0x93, 0x67, 0x93, 0x68, 0x93, 0x69, /* 0x3C-0x3F */ + 0xB9, 0xEC, 0xB9, 0xED, 0x93, 0x6A, 0xB9, 0xEE, /* 0x40-0x43 */ + 0xB9, 0xEF, 0xB9, 0xF0, 0x93, 0x6B, 0x93, 0x6C, /* 0x44-0x47 */ + 0x93, 0x6D, 0xB9, 0xF1, 0x93, 0x6E, 0x93, 0x6F, /* 0x48-0x4B */ + 0xB9, 0xF2, 0xB9, 0xF3, 0x93, 0x70, 0x93, 0x71, /* 0x4C-0x4F */ + 0xB9, 0xF4, 0x93, 0x72, 0x93, 0x73, 0x93, 0x74, /* 0x50-0x53 */ + 0x93, 0x75, 0x93, 0x76, 0x93, 0x77, 0x93, 0x78, /* 0x54-0x57 */ + 0x93, 0x79, 0x93, 0x7A, 0x93, 0x81, 0x93, 0x82, /* 0x58-0x5B */ + 0x93, 0x83, 0xB9, 0xF5, 0x93, 0x84, 0x93, 0x85, /* 0x5C-0x5F */ + 0x93, 0x86, 0x93, 0x87, 0x93, 0x88, 0x93, 0x89, /* 0x60-0x63 */ + 0x93, 0x8A, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x64-0x67 */ + 0x93, 0x8E, 0x93, 0x8F, 0x93, 0x90, 0x93, 0x91, /* 0x68-0x6B */ + 0x93, 0x92, 0x93, 0x93, 0x93, 0x94, 0x93, 0x95, /* 0x6C-0x6F */ + 0x93, 0x96, 0x93, 0x97, 0x93, 0x98, 0x93, 0x99, /* 0x70-0x73 */ + 0x93, 0x9A, 0x93, 0x9B, 0x93, 0x9C, 0x93, 0x9D, /* 0x74-0x77 */ + 0x93, 0x9E, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x78-0x7B */ + 0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, 0x93, 0xA5, /* 0x7C-0x7F */ + + 0x93, 0xA6, 0x93, 0xA7, 0x93, 0xA8, 0x93, 0xA9, /* 0x80-0x83 */ + 0xB9, 0xF6, 0xB9, 0xF7, 0x93, 0xAA, 0x93, 0xAB, /* 0x84-0x87 */ + 0xB9, 0xF8, 0x93, 0xAC, 0x93, 0xAD, 0xB9, 0xF9, /* 0x88-0x8B */ + 0xB9, 0xFA, 0x93, 0xAE, 0xB9, 0xFB, 0x93, 0xAF, /* 0x8C-0x8F */ + 0x93, 0xB0, 0x93, 0xB1, 0x93, 0xB2, 0x93, 0xB3, /* 0x90-0x93 */ + 0xB9, 0xFC, 0xB9, 0xFD, 0x93, 0xB4, 0xB9, 0xFE, /* 0x94-0x97 */ + 0x93, 0xB5, 0xBA, 0xA1, 0xBA, 0xA2, 0x93, 0xB6, /* 0x98-0x9B */ + 0x93, 0xB7, 0x93, 0xB8, 0x93, 0xB9, 0x93, 0xBA, /* 0x9C-0x9F */ + 0xBA, 0xA3, 0xBA, 0xA4, 0x93, 0xBB, 0x93, 0xBC, /* 0xA0-0xA3 */ + 0xBA, 0xA5, 0x93, 0xBD, 0x93, 0xBE, 0xBA, 0xA6, /* 0xA4-0xA7 */ + 0xBA, 0xA7, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0xA8-0xAB */ + 0x93, 0xC2, 0x93, 0xC3, 0x93, 0xC4, 0x93, 0xC5, /* 0xAC-0xAF */ + 0xBA, 0xA8, 0xBA, 0xA9, 0x93, 0xC6, 0xBA, 0xAA, /* 0xB0-0xB3 */ + 0xBA, 0xAB, 0xBA, 0xAC, 0x93, 0xC7, 0x93, 0xC8, /* 0xB4-0xB7 */ + 0x93, 0xC9, 0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, /* 0xB8-0xBB */ + 0xBA, 0xAD, 0xBA, 0xAE, 0x93, 0xCD, 0x93, 0xCE, /* 0xBC-0xBF */ + 0xBA, 0xAF, 0x93, 0xCF, 0x93, 0xD0, 0x93, 0xD1, /* 0xC0-0xC3 */ + 0xBA, 0xB0, 0x93, 0xD2, 0x93, 0xD3, 0x93, 0xD4, /* 0xC4-0xC7 */ + 0x93, 0xD5, 0x93, 0xD6, 0x93, 0xD7, 0x93, 0xD8, /* 0xC8-0xCB */ + 0x93, 0xD9, 0xBA, 0xB1, 0x93, 0xDA, 0xBA, 0xB2, /* 0xCC-0xCF */ + 0xBA, 0xB3, 0xBA, 0xB4, 0x93, 0xDB, 0x93, 0xDC, /* 0xD0-0xD3 */ + 0x93, 0xDD, 0xBA, 0xB5, 0x93, 0xDE, 0x93, 0xDF, /* 0xD4-0xD7 */ + 0xBA, 0xB6, 0x93, 0xE0, 0x93, 0xE1, 0x93, 0xE2, /* 0xD8-0xDB */ + 0xBA, 0xB7, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xDC-0xDF */ + 0x93, 0xE6, 0x93, 0xE7, 0x93, 0xE8, 0x93, 0xE9, /* 0xE0-0xE3 */ + 0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, 0x93, 0xED, /* 0xE4-0xE7 */ + 0x93, 0xEE, 0x93, 0xEF, 0x93, 0xF0, 0x93, 0xF1, /* 0xE8-0xEB */ + 0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xEC-0xEF */ + 0x93, 0xF6, 0x93, 0xF7, 0x93, 0xF8, 0x93, 0xF9, /* 0xF0-0xF3 */ + 0xBA, 0xB8, 0xBA, 0xB9, 0xBA, 0xBA, 0x93, 0xFA, /* 0xF4-0xF7 */ + 0xBA, 0xBB, 0x93, 0xFB, 0x93, 0xFC, 0x93, 0xFD, /* 0xF8-0xFB */ + 0xBA, 0xBC, 0x93, 0xFE, 0x94, 0x41, 0x94, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BD[512] = { + 0x94, 0x43, 0x94, 0x44, 0x94, 0x45, 0x94, 0x46, /* 0x00-0x03 */ + 0xBA, 0xBD, 0xBA, 0xBE, 0x94, 0x47, 0xBA, 0xBF, /* 0x04-0x07 */ + 0x94, 0x48, 0xBA, 0xC0, 0x94, 0x49, 0x94, 0x4A, /* 0x08-0x0B */ + 0x94, 0x4B, 0x94, 0x4C, 0x94, 0x4D, 0x94, 0x4E, /* 0x0C-0x0F */ + 0xBA, 0xC1, 0x94, 0x4F, 0x94, 0x50, 0x94, 0x51, /* 0x10-0x13 */ + 0xBA, 0xC2, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0x14-0x17 */ + 0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0x18-0x1B */ + 0x94, 0x59, 0x94, 0x5A, 0x94, 0x61, 0x94, 0x62, /* 0x1C-0x1F */ + 0x94, 0x63, 0x94, 0x64, 0x94, 0x65, 0x94, 0x66, /* 0x20-0x23 */ + 0xBA, 0xC3, 0x94, 0x67, 0x94, 0x68, 0x94, 0x69, /* 0x24-0x27 */ + 0x94, 0x6A, 0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, /* 0x28-0x2B */ + 0xBA, 0xC4, 0x94, 0x6E, 0x94, 0x6F, 0x94, 0x70, /* 0x2C-0x2F */ + 0x94, 0x71, 0x94, 0x72, 0x94, 0x73, 0x94, 0x74, /* 0x30-0x33 */ + 0x94, 0x75, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x34-0x37 */ + 0x94, 0x79, 0x94, 0x7A, 0x94, 0x81, 0x94, 0x82, /* 0x38-0x3B */ + 0x94, 0x83, 0x94, 0x84, 0x94, 0x85, 0x94, 0x86, /* 0x3C-0x3F */ + 0xBA, 0xC5, 0x94, 0x87, 0x94, 0x88, 0x94, 0x89, /* 0x40-0x43 */ + 0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x44-0x47 */ + 0xBA, 0xC6, 0xBA, 0xC7, 0x94, 0x8E, 0x94, 0x8F, /* 0x48-0x4B */ + 0xBA, 0xC8, 0x94, 0x90, 0x94, 0x91, 0x94, 0x92, /* 0x4C-0x4F */ + 0xBA, 0xC9, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x50-0x53 */ + 0x94, 0x96, 0x94, 0x97, 0x94, 0x98, 0x94, 0x99, /* 0x54-0x57 */ + 0xBA, 0xCA, 0xBA, 0xCB, 0x94, 0x9A, 0x94, 0x9B, /* 0x58-0x5B */ + 0x94, 0x9C, 0x94, 0x9D, 0x94, 0x9E, 0x94, 0x9F, /* 0x5C-0x5F */ + 0x94, 0xA0, 0x94, 0xA1, 0x94, 0xA2, 0x94, 0xA3, /* 0x60-0x63 */ + 0xBA, 0xCC, 0x94, 0xA4, 0x94, 0xA5, 0x94, 0xA6, /* 0x64-0x67 */ + 0xBA, 0xCD, 0x94, 0xA7, 0x94, 0xA8, 0x94, 0xA9, /* 0x68-0x6B */ + 0x94, 0xAA, 0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, /* 0x6C-0x6F */ + 0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, 0x94, 0xB1, /* 0x70-0x73 */ + 0x94, 0xB2, 0x94, 0xB3, 0x94, 0xB4, 0x94, 0xB5, /* 0x74-0x77 */ + 0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, 0x94, 0xB9, /* 0x78-0x7B */ + 0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, 0x94, 0xBD, /* 0x7C-0x7F */ + + 0xBA, 0xCE, 0xBA, 0xCF, 0x94, 0xBE, 0x94, 0xBF, /* 0x80-0x83 */ + 0xBA, 0xD0, 0x94, 0xC0, 0x94, 0xC1, 0xBA, 0xD1, /* 0x84-0x87 */ + 0xBA, 0xD2, 0xBA, 0xD3, 0xBA, 0xD4, 0x94, 0xC2, /* 0x88-0x8B */ + 0x94, 0xC3, 0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, /* 0x8C-0x8F */ + 0xBA, 0xD5, 0xBA, 0xD6, 0x94, 0xC7, 0xBA, 0xD7, /* 0x90-0x93 */ + 0x94, 0xC8, 0xBA, 0xD8, 0x94, 0xC9, 0x94, 0xCA, /* 0x94-0x97 */ + 0x94, 0xCB, 0xBA, 0xD9, 0xBA, 0xDA, 0x94, 0xCC, /* 0x98-0x9B */ + 0xBA, 0xDB, 0x94, 0xCD, 0x94, 0xCE, 0x94, 0xCF, /* 0x9C-0x9F */ + 0x94, 0xD0, 0x94, 0xD1, 0x94, 0xD2, 0x94, 0xD3, /* 0xA0-0xA3 */ + 0xBA, 0xDC, 0x94, 0xD4, 0x94, 0xD5, 0x94, 0xD6, /* 0xA4-0xA7 */ + 0x94, 0xD7, 0x94, 0xD8, 0x94, 0xD9, 0x94, 0xDA, /* 0xA8-0xAB */ + 0x94, 0xDB, 0x94, 0xDC, 0x94, 0xDD, 0x94, 0xDE, /* 0xAC-0xAF */ + 0xBA, 0xDD, 0x94, 0xDF, 0x94, 0xE0, 0x94, 0xE1, /* 0xB0-0xB3 */ + 0x94, 0xE2, 0x94, 0xE3, 0x94, 0xE4, 0x94, 0xE5, /* 0xB4-0xB7 */ + 0xBA, 0xDE, 0x94, 0xE6, 0x94, 0xE7, 0x94, 0xE8, /* 0xB8-0xBB */ + 0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0x94, 0xEC, /* 0xBC-0xBF */ + 0x94, 0xED, 0x94, 0xEE, 0x94, 0xEF, 0x94, 0xF0, /* 0xC0-0xC3 */ + 0x94, 0xF1, 0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, /* 0xC4-0xC7 */ + 0x94, 0xF5, 0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, /* 0xC8-0xCB */ + 0x94, 0xF9, 0x94, 0xFA, 0x94, 0xFB, 0x94, 0xFC, /* 0xCC-0xCF */ + 0x94, 0xFD, 0x94, 0xFE, 0x95, 0x41, 0x95, 0x42, /* 0xD0-0xD3 */ + 0xBA, 0xDF, 0xBA, 0xE0, 0x95, 0x43, 0x95, 0x44, /* 0xD4-0xD7 */ + 0xBA, 0xE1, 0x95, 0x45, 0x95, 0x46, 0x95, 0x47, /* 0xD8-0xDB */ + 0xBA, 0xE2, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xDC-0xDF */ + 0x95, 0x4B, 0x95, 0x4C, 0x95, 0x4D, 0x95, 0x4E, /* 0xE0-0xE3 */ + 0x95, 0x4F, 0x95, 0x50, 0x95, 0x51, 0x95, 0x52, /* 0xE4-0xE7 */ + 0x95, 0x53, 0xBA, 0xE3, 0x95, 0x54, 0x95, 0x55, /* 0xE8-0xEB */ + 0x95, 0x56, 0x95, 0x57, 0x95, 0x58, 0x95, 0x59, /* 0xEC-0xEF */ + 0xBA, 0xE4, 0x95, 0x5A, 0x95, 0x61, 0x95, 0x62, /* 0xF0-0xF3 */ + 0xBA, 0xE5, 0x95, 0x63, 0x95, 0x64, 0x95, 0x65, /* 0xF4-0xF7 */ + 0xBA, 0xE6, 0x95, 0x66, 0x95, 0x67, 0x95, 0x68, /* 0xF8-0xFB */ + 0x95, 0x69, 0x95, 0x6A, 0x95, 0x6B, 0x95, 0x6C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BE[512] = { + 0xBA, 0xE7, 0x95, 0x6D, 0x95, 0x6E, 0xBA, 0xE8, /* 0x00-0x03 */ + 0x95, 0x6F, 0xBA, 0xE9, 0x95, 0x70, 0x95, 0x71, /* 0x04-0x07 */ + 0x95, 0x72, 0x95, 0x73, 0x95, 0x74, 0x95, 0x75, /* 0x08-0x0B */ + 0xBA, 0xEA, 0xBA, 0xEB, 0x95, 0x76, 0x95, 0x77, /* 0x0C-0x0F */ + 0xBA, 0xEC, 0x95, 0x78, 0x95, 0x79, 0x95, 0x7A, /* 0x10-0x13 */ + 0xBA, 0xED, 0x95, 0x81, 0x95, 0x82, 0x95, 0x83, /* 0x14-0x17 */ + 0x95, 0x84, 0x95, 0x85, 0x95, 0x86, 0x95, 0x87, /* 0x18-0x1B */ + 0xBA, 0xEE, 0xBA, 0xEF, 0x95, 0x88, 0xBA, 0xF0, /* 0x1C-0x1F */ + 0x95, 0x89, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x20-0x23 */ + 0x95, 0x8D, 0x95, 0x8E, 0x95, 0x8F, 0x95, 0x90, /* 0x24-0x27 */ + 0x95, 0x91, 0x95, 0x92, 0x95, 0x93, 0x95, 0x94, /* 0x28-0x2B */ + 0x95, 0x95, 0x95, 0x96, 0x95, 0x97, 0x95, 0x98, /* 0x2C-0x2F */ + 0x95, 0x99, 0x95, 0x9A, 0x95, 0x9B, 0x95, 0x9C, /* 0x30-0x33 */ + 0x95, 0x9D, 0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, /* 0x34-0x37 */ + 0x95, 0xA1, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x38-0x3B */ + 0x95, 0xA5, 0x95, 0xA6, 0x95, 0xA7, 0x95, 0xA8, /* 0x3C-0x3F */ + 0x95, 0xA9, 0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, /* 0x40-0x43 */ + 0xBA, 0xF1, 0xBA, 0xF2, 0x95, 0xAD, 0x95, 0xAE, /* 0x44-0x47 */ + 0xBA, 0xF3, 0x95, 0xAF, 0x95, 0xB0, 0x95, 0xB1, /* 0x48-0x4B */ + 0xBA, 0xF4, 0x95, 0xB2, 0xBA, 0xF5, 0x95, 0xB3, /* 0x4C-0x4F */ + 0x95, 0xB4, 0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, /* 0x50-0x53 */ + 0xBA, 0xF6, 0xBA, 0xF7, 0x95, 0xB8, 0xBA, 0xF8, /* 0x54-0x57 */ + 0x95, 0xB9, 0xBA, 0xF9, 0xBA, 0xFA, 0xBA, 0xFB, /* 0x58-0x5B */ + 0x95, 0xBA, 0x95, 0xBB, 0x95, 0xBC, 0x95, 0xBD, /* 0x5C-0x5F */ + 0xBA, 0xFC, 0xBA, 0xFD, 0x95, 0xBE, 0x95, 0xBF, /* 0x60-0x63 */ + 0xBA, 0xFE, 0x95, 0xC0, 0x95, 0xC1, 0x95, 0xC2, /* 0x64-0x67 */ + 0xBB, 0xA1, 0x95, 0xC3, 0xBB, 0xA2, 0x95, 0xC4, /* 0x68-0x6B */ + 0x95, 0xC5, 0x95, 0xC6, 0x95, 0xC7, 0x95, 0xC8, /* 0x6C-0x6F */ + 0xBB, 0xA3, 0xBB, 0xA4, 0x95, 0xC9, 0xBB, 0xA5, /* 0x70-0x73 */ + 0xBB, 0xA6, 0xBB, 0xA7, 0x95, 0xCA, 0x95, 0xCB, /* 0x74-0x77 */ + 0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, 0xBB, 0xA8, /* 0x78-0x7B */ + 0xBB, 0xA9, 0xBB, 0xAA, 0x95, 0xCF, 0x95, 0xD0, /* 0x7C-0x7F */ + + 0xBB, 0xAB, 0x95, 0xD1, 0x95, 0xD2, 0x95, 0xD3, /* 0x80-0x83 */ + 0xBB, 0xAC, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0x84-0x87 */ + 0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0x88-0x8B */ + 0xBB, 0xAD, 0xBB, 0xAE, 0x95, 0xDB, 0xBB, 0xAF, /* 0x8C-0x8F */ + 0xBB, 0xB0, 0xBB, 0xB1, 0x95, 0xDC, 0x95, 0xDD, /* 0x90-0x93 */ + 0x95, 0xDE, 0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, /* 0x94-0x97 */ + 0xBB, 0xB2, 0xBB, 0xB3, 0x95, 0xE2, 0x95, 0xE3, /* 0x98-0x9B */ + 0x95, 0xE4, 0x95, 0xE5, 0x95, 0xE6, 0x95, 0xE7, /* 0x9C-0x9F */ + 0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, 0x95, 0xEB, /* 0xA0-0xA3 */ + 0x95, 0xEC, 0x95, 0xED, 0x95, 0xEE, 0x95, 0xEF, /* 0xA4-0xA7 */ + 0xBB, 0xB4, 0x95, 0xF0, 0x95, 0xF1, 0x95, 0xF2, /* 0xA8-0xAB */ + 0x95, 0xF3, 0x95, 0xF4, 0x95, 0xF5, 0x95, 0xF6, /* 0xAC-0xAF */ + 0x95, 0xF7, 0x95, 0xF8, 0x95, 0xF9, 0x95, 0xFA, /* 0xB0-0xB3 */ + 0x95, 0xFB, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0xB4-0xB7 */ + 0x96, 0x41, 0x96, 0x42, 0x96, 0x43, 0x96, 0x44, /* 0xB8-0xBB */ + 0x96, 0x45, 0x96, 0x46, 0x96, 0x47, 0x96, 0x48, /* 0xBC-0xBF */ + 0x96, 0x49, 0x96, 0x4A, 0x96, 0x4B, 0x96, 0x4C, /* 0xC0-0xC3 */ + 0x96, 0x4D, 0x96, 0x4E, 0x96, 0x4F, 0x96, 0x50, /* 0xC4-0xC7 */ + 0x96, 0x51, 0x96, 0x52, 0x96, 0x53, 0x96, 0x54, /* 0xC8-0xCB */ + 0x96, 0x55, 0x96, 0x56, 0x96, 0x57, 0x96, 0x58, /* 0xCC-0xCF */ + 0xBB, 0xB5, 0xBB, 0xB6, 0x96, 0x59, 0x96, 0x5A, /* 0xD0-0xD3 */ + 0xBB, 0xB7, 0x96, 0x61, 0x96, 0x62, 0xBB, 0xB8, /* 0xD4-0xD7 */ + 0xBB, 0xB9, 0x96, 0x63, 0x96, 0x64, 0x96, 0x65, /* 0xD8-0xDB */ + 0x96, 0x66, 0x96, 0x67, 0x96, 0x68, 0x96, 0x69, /* 0xDC-0xDF */ + 0xBB, 0xBA, 0x96, 0x6A, 0x96, 0x6B, 0xBB, 0xBB, /* 0xE0-0xE3 */ + 0xBB, 0xBC, 0xBB, 0xBD, 0x96, 0x6C, 0x96, 0x6D, /* 0xE4-0xE7 */ + 0x96, 0x6E, 0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, /* 0xE8-0xEB */ + 0xBB, 0xBE, 0x96, 0x72, 0x96, 0x73, 0x96, 0x74, /* 0xEC-0xEF */ + 0x96, 0x75, 0x96, 0x76, 0x96, 0x77, 0x96, 0x78, /* 0xF0-0xF3 */ + 0x96, 0x79, 0x96, 0x7A, 0x96, 0x81, 0x96, 0x82, /* 0xF4-0xF7 */ + 0x96, 0x83, 0x96, 0x84, 0x96, 0x85, 0x96, 0x86, /* 0xF8-0xFB */ + 0x96, 0x87, 0x96, 0x88, 0x96, 0x89, 0x96, 0x8A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BF[512] = { + 0x96, 0x8B, 0xBB, 0xBF, 0x96, 0x8C, 0x96, 0x8D, /* 0x00-0x03 */ + 0x96, 0x8E, 0x96, 0x8F, 0x96, 0x90, 0x96, 0x91, /* 0x04-0x07 */ + 0xBB, 0xC0, 0xBB, 0xC1, 0x96, 0x92, 0x96, 0x93, /* 0x08-0x0B */ + 0x96, 0x94, 0x96, 0x95, 0x96, 0x96, 0x96, 0x97, /* 0x0C-0x0F */ + 0x96, 0x98, 0x96, 0x99, 0x96, 0x9A, 0x96, 0x9B, /* 0x10-0x13 */ + 0x96, 0x9C, 0x96, 0x9D, 0x96, 0x9E, 0x96, 0x9F, /* 0x14-0x17 */ + 0xBB, 0xC2, 0xBB, 0xC3, 0x96, 0xA0, 0xBB, 0xC4, /* 0x18-0x1B */ + 0xBB, 0xC5, 0xBB, 0xC6, 0x96, 0xA1, 0x96, 0xA2, /* 0x1C-0x1F */ + 0x96, 0xA3, 0x96, 0xA4, 0x96, 0xA5, 0x96, 0xA6, /* 0x20-0x23 */ + 0x96, 0xA7, 0x96, 0xA8, 0x96, 0xA9, 0x96, 0xAA, /* 0x24-0x27 */ + 0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, 0x96, 0xAE, /* 0x28-0x2B */ + 0x96, 0xAF, 0x96, 0xB0, 0x96, 0xB1, 0x96, 0xB2, /* 0x2C-0x2F */ + 0x96, 0xB3, 0x96, 0xB4, 0x96, 0xB5, 0x96, 0xB6, /* 0x30-0x33 */ + 0x96, 0xB7, 0x96, 0xB8, 0x96, 0xB9, 0x96, 0xBA, /* 0x34-0x37 */ + 0x96, 0xBB, 0x96, 0xBC, 0x96, 0xBD, 0x96, 0xBE, /* 0x38-0x3B */ + 0x96, 0xBF, 0x96, 0xC0, 0x96, 0xC1, 0x96, 0xC2, /* 0x3C-0x3F */ + 0xBB, 0xC7, 0xBB, 0xC8, 0x96, 0xC3, 0x96, 0xC4, /* 0x40-0x43 */ + 0xBB, 0xC9, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0x44-0x47 */ + 0xBB, 0xCA, 0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, /* 0x48-0x4B */ + 0x96, 0xCB, 0x96, 0xCC, 0x96, 0xCD, 0x96, 0xCE, /* 0x4C-0x4F */ + 0xBB, 0xCB, 0xBB, 0xCC, 0x96, 0xCF, 0x96, 0xD0, /* 0x50-0x53 */ + 0x96, 0xD1, 0xBB, 0xCD, 0x96, 0xD2, 0x96, 0xD3, /* 0x54-0x57 */ + 0x96, 0xD4, 0x96, 0xD5, 0x96, 0xD6, 0x96, 0xD7, /* 0x58-0x5B */ + 0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x5C-0x5F */ + 0x96, 0xDC, 0x96, 0xDD, 0x96, 0xDE, 0x96, 0xDF, /* 0x60-0x63 */ + 0x96, 0xE0, 0x96, 0xE1, 0x96, 0xE2, 0x96, 0xE3, /* 0x64-0x67 */ + 0x96, 0xE4, 0x96, 0xE5, 0x96, 0xE6, 0x96, 0xE7, /* 0x68-0x6B */ + 0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x6C-0x6F */ + 0x96, 0xEC, 0x96, 0xED, 0x96, 0xEE, 0x96, 0xEF, /* 0x70-0x73 */ + 0x96, 0xF0, 0x96, 0xF1, 0x96, 0xF2, 0x96, 0xF3, /* 0x74-0x77 */ + 0x96, 0xF4, 0x96, 0xF5, 0x96, 0xF6, 0x96, 0xF7, /* 0x78-0x7B */ + 0x96, 0xF8, 0x96, 0xF9, 0x96, 0xFA, 0x96, 0xFB, /* 0x7C-0x7F */ + + 0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, 0x97, 0x41, /* 0x80-0x83 */ + 0x97, 0x42, 0x97, 0x43, 0x97, 0x44, 0x97, 0x45, /* 0x84-0x87 */ + 0x97, 0x46, 0x97, 0x47, 0x97, 0x48, 0x97, 0x49, /* 0x88-0x8B */ + 0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, 0x97, 0x4D, /* 0x8C-0x8F */ + 0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x90-0x93 */ + 0xBB, 0xCE, 0x97, 0x52, 0x97, 0x53, 0x97, 0x54, /* 0x94-0x97 */ + 0x97, 0x55, 0x97, 0x56, 0x97, 0x57, 0x97, 0x58, /* 0x98-0x9B */ + 0x97, 0x59, 0x97, 0x5A, 0x97, 0x61, 0x97, 0x62, /* 0x9C-0x9F */ + 0x97, 0x63, 0x97, 0x64, 0x97, 0x65, 0x97, 0x66, /* 0xA0-0xA3 */ + 0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0xA4-0xA7 */ + 0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0xA8-0xAB */ + 0x97, 0x6F, 0x97, 0x70, 0x97, 0x71, 0x97, 0x72, /* 0xAC-0xAF */ + 0xBB, 0xCF, 0x97, 0x73, 0x97, 0x74, 0x97, 0x75, /* 0xB0-0xB3 */ + 0x97, 0x76, 0x97, 0x77, 0x97, 0x78, 0x97, 0x79, /* 0xB4-0xB7 */ + 0x97, 0x7A, 0x97, 0x81, 0x97, 0x82, 0x97, 0x83, /* 0xB8-0xBB */ + 0x97, 0x84, 0x97, 0x85, 0x97, 0x86, 0x97, 0x87, /* 0xBC-0xBF */ + 0x97, 0x88, 0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, /* 0xC0-0xC3 */ + 0x97, 0x8C, 0xBB, 0xD0, 0x97, 0x8D, 0x97, 0x8E, /* 0xC4-0xC7 */ + 0x97, 0x8F, 0x97, 0x90, 0x97, 0x91, 0x97, 0x92, /* 0xC8-0xCB */ + 0xBB, 0xD1, 0xBB, 0xD2, 0x97, 0x93, 0x97, 0x94, /* 0xCC-0xCF */ + 0xBB, 0xD3, 0x97, 0x95, 0x97, 0x96, 0x97, 0x97, /* 0xD0-0xD3 */ + 0xBB, 0xD4, 0x97, 0x98, 0x97, 0x99, 0x97, 0x9A, /* 0xD4-0xD7 */ + 0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, 0x97, 0x9E, /* 0xD8-0xDB */ + 0xBB, 0xD5, 0x97, 0x9F, 0x97, 0xA0, 0xBB, 0xD6, /* 0xDC-0xDF */ + 0x97, 0xA1, 0xBB, 0xD7, 0x97, 0xA2, 0x97, 0xA3, /* 0xE0-0xE3 */ + 0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE4-0xE7 */ + 0x97, 0xA8, 0x97, 0xA9, 0x97, 0xAA, 0x97, 0xAB, /* 0xE8-0xEB */ + 0x97, 0xAC, 0x97, 0xAD, 0x97, 0xAE, 0x97, 0xAF, /* 0xEC-0xEF */ + 0x97, 0xB0, 0x97, 0xB1, 0x97, 0xB2, 0x97, 0xB3, /* 0xF0-0xF3 */ + 0x97, 0xB4, 0x97, 0xB5, 0x97, 0xB6, 0x97, 0xB7, /* 0xF4-0xF7 */ + 0x97, 0xB8, 0x97, 0xB9, 0x97, 0xBA, 0x97, 0xBB, /* 0xF8-0xFB */ + 0x97, 0xBC, 0x97, 0xBD, 0x97, 0xBE, 0x97, 0xBF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C0[512] = { + 0x97, 0xC0, 0x97, 0xC1, 0x97, 0xC2, 0x97, 0xC3, /* 0x00-0x03 */ + 0x97, 0xC4, 0x97, 0xC5, 0x97, 0xC6, 0x97, 0xC7, /* 0x04-0x07 */ + 0x97, 0xC8, 0x97, 0xC9, 0x97, 0xCA, 0x97, 0xCB, /* 0x08-0x0B */ + 0x97, 0xCC, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x0C-0x0F */ + 0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, 0x97, 0xD3, /* 0x10-0x13 */ + 0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, 0x97, 0xD7, /* 0x14-0x17 */ + 0x97, 0xD8, 0x97, 0xD9, 0x97, 0xDA, 0x97, 0xDB, /* 0x18-0x1B */ + 0x97, 0xDC, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x1C-0x1F */ + 0x97, 0xE0, 0x97, 0xE1, 0x97, 0xE2, 0x97, 0xE3, /* 0x20-0x23 */ + 0x97, 0xE4, 0x97, 0xE5, 0x97, 0xE6, 0x97, 0xE7, /* 0x24-0x27 */ + 0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x28-0x2B */ + 0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x2C-0x2F */ + 0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x30-0x33 */ + 0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x34-0x37 */ + 0x97, 0xF8, 0x97, 0xF9, 0x97, 0xFA, 0x97, 0xFB, /* 0x38-0x3B */ + 0xBB, 0xD8, 0x97, 0xFC, 0x97, 0xFD, 0x97, 0xFE, /* 0x3C-0x3F */ + 0x98, 0x41, 0x98, 0x42, 0x98, 0x43, 0x98, 0x44, /* 0x40-0x43 */ + 0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0x98, 0x48, /* 0x44-0x47 */ + 0x98, 0x49, 0x98, 0x4A, 0x98, 0x4B, 0x98, 0x4C, /* 0x48-0x4B */ + 0x98, 0x4D, 0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, /* 0x4C-0x4F */ + 0x98, 0x51, 0xBB, 0xD9, 0x98, 0x52, 0x98, 0x53, /* 0x50-0x53 */ + 0x98, 0x54, 0x98, 0x55, 0x98, 0x56, 0x98, 0x57, /* 0x54-0x57 */ + 0xBB, 0xDA, 0x98, 0x58, 0x98, 0x59, 0x98, 0x5A, /* 0x58-0x5B */ + 0xBB, 0xDB, 0x98, 0x61, 0x98, 0x62, 0x98, 0x63, /* 0x5C-0x5F */ + 0xBB, 0xDC, 0x98, 0x64, 0x98, 0x65, 0x98, 0x66, /* 0x60-0x63 */ + 0x98, 0x67, 0x98, 0x68, 0x98, 0x69, 0x98, 0x6A, /* 0x64-0x67 */ + 0xBB, 0xDD, 0xBB, 0xDE, 0x98, 0x6B, 0x98, 0x6C, /* 0x68-0x6B */ + 0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0x98, 0x70, /* 0x6C-0x6F */ + 0x98, 0x71, 0x98, 0x72, 0x98, 0x73, 0x98, 0x74, /* 0x70-0x73 */ + 0x98, 0x75, 0x98, 0x76, 0x98, 0x77, 0x98, 0x78, /* 0x74-0x77 */ + 0x98, 0x79, 0x98, 0x7A, 0x98, 0x81, 0x98, 0x82, /* 0x78-0x7B */ + 0x98, 0x83, 0x98, 0x84, 0x98, 0x85, 0x98, 0x86, /* 0x7C-0x7F */ + + 0x98, 0x87, 0x98, 0x88, 0x98, 0x89, 0x98, 0x8A, /* 0x80-0x83 */ + 0x98, 0x8B, 0x98, 0x8C, 0x98, 0x8D, 0x98, 0x8E, /* 0x84-0x87 */ + 0x98, 0x8F, 0x98, 0x90, 0x98, 0x91, 0x98, 0x92, /* 0x88-0x8B */ + 0x98, 0x93, 0x98, 0x94, 0x98, 0x95, 0x98, 0x96, /* 0x8C-0x8F */ + 0xBB, 0xDF, 0xBB, 0xE0, 0x98, 0x97, 0x98, 0x98, /* 0x90-0x93 */ + 0xBB, 0xE1, 0x98, 0x99, 0x98, 0x9A, 0x98, 0x9B, /* 0x94-0x97 */ + 0xBB, 0xE2, 0x98, 0x9C, 0x98, 0x9D, 0x98, 0x9E, /* 0x98-0x9B */ + 0x98, 0x9F, 0x98, 0xA0, 0x98, 0xA1, 0x98, 0xA2, /* 0x9C-0x9F */ + 0xBB, 0xE3, 0xBB, 0xE4, 0x98, 0xA3, 0xBB, 0xE5, /* 0xA0-0xA3 */ + 0x98, 0xA4, 0xBB, 0xE6, 0x98, 0xA5, 0x98, 0xA6, /* 0xA4-0xA7 */ + 0x98, 0xA7, 0x98, 0xA8, 0x98, 0xA9, 0x98, 0xAA, /* 0xA8-0xAB */ + 0xBB, 0xE7, 0xBB, 0xE8, 0x98, 0xAB, 0xBB, 0xE9, /* 0xAC-0xAF */ + 0xBB, 0xEA, 0x98, 0xAC, 0x98, 0xAD, 0xBB, 0xEB, /* 0xB0-0xB3 */ + 0xBB, 0xEC, 0xBB, 0xED, 0xBB, 0xEE, 0x98, 0xAE, /* 0xB4-0xB7 */ + 0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xB8-0xBB */ + 0xBB, 0xEF, 0xBB, 0xF0, 0x98, 0xB3, 0xBB, 0xF1, /* 0xBC-0xBF */ + 0xBB, 0xF2, 0xBB, 0xF3, 0x98, 0xB4, 0x98, 0xB5, /* 0xC0-0xC3 */ + 0x98, 0xB6, 0xBB, 0xF4, 0x98, 0xB7, 0x98, 0xB8, /* 0xC4-0xC7 */ + 0xBB, 0xF5, 0xBB, 0xF6, 0x98, 0xB9, 0x98, 0xBA, /* 0xC8-0xCB */ + 0xBB, 0xF7, 0x98, 0xBB, 0x98, 0xBC, 0x98, 0xBD, /* 0xCC-0xCF */ + 0xBB, 0xF8, 0x98, 0xBE, 0x98, 0xBF, 0x98, 0xC0, /* 0xD0-0xD3 */ + 0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, 0x98, 0xC4, /* 0xD4-0xD7 */ + 0xBB, 0xF9, 0xBB, 0xFA, 0x98, 0xC5, 0xBB, 0xFB, /* 0xD8-0xDB */ + 0xBB, 0xFC, 0xBB, 0xFD, 0x98, 0xC6, 0x98, 0xC7, /* 0xDC-0xDF */ + 0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0x98, 0xCB, /* 0xE0-0xE3 */ + 0xBB, 0xFE, 0xBC, 0xA1, 0x98, 0xCC, 0x98, 0xCD, /* 0xE4-0xE7 */ + 0xBC, 0xA2, 0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, /* 0xE8-0xEB */ + 0xBC, 0xA3, 0x98, 0xD1, 0x98, 0xD2, 0x98, 0xD3, /* 0xEC-0xEF */ + 0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0xF0-0xF3 */ + 0xBC, 0xA4, 0xBC, 0xA5, 0x98, 0xD8, 0xBC, 0xA6, /* 0xF4-0xF7 */ + 0x98, 0xD9, 0xBC, 0xA7, 0x98, 0xDA, 0x98, 0xDB, /* 0xF8-0xFB */ + 0x98, 0xDC, 0x98, 0xDD, 0x98, 0xDE, 0x98, 0xDF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C1[512] = { + 0xBC, 0xA8, 0x98, 0xE0, 0x98, 0xE1, 0x98, 0xE2, /* 0x00-0x03 */ + 0xBC, 0xA9, 0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, /* 0x04-0x07 */ + 0xBC, 0xAA, 0x98, 0xE6, 0x98, 0xE7, 0x98, 0xE8, /* 0x08-0x0B */ + 0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x0C-0x0F */ + 0xBC, 0xAB, 0x98, 0xED, 0x98, 0xEE, 0x98, 0xEF, /* 0x10-0x13 */ + 0x98, 0xF0, 0xBC, 0xAC, 0x98, 0xF1, 0x98, 0xF2, /* 0x14-0x17 */ + 0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x18-0x1B */ + 0xBC, 0xAD, 0xBC, 0xAE, 0xBC, 0xAF, 0xBC, 0xB0, /* 0x1C-0x1F */ + 0xBC, 0xB1, 0x98, 0xF7, 0x98, 0xF8, 0xBC, 0xB2, /* 0x20-0x23 */ + 0xBC, 0xB3, 0x98, 0xF9, 0xBC, 0xB4, 0xBC, 0xB5, /* 0x24-0x27 */ + 0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x28-0x2B */ + 0xBC, 0xB6, 0xBC, 0xB7, 0x98, 0xFE, 0xBC, 0xB8, /* 0x2C-0x2F */ + 0xBC, 0xB9, 0xBC, 0xBA, 0x99, 0x41, 0x99, 0x42, /* 0x30-0x33 */ + 0x99, 0x43, 0x99, 0x44, 0xBC, 0xBB, 0x99, 0x45, /* 0x34-0x37 */ + 0xBC, 0xBC, 0xBC, 0xBD, 0x99, 0x46, 0x99, 0x47, /* 0x38-0x3B */ + 0xBC, 0xBE, 0x99, 0x48, 0x99, 0x49, 0x99, 0x4A, /* 0x3C-0x3F */ + 0xBC, 0xBF, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x40-0x43 */ + 0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x44-0x47 */ + 0xBC, 0xC0, 0xBC, 0xC1, 0x99, 0x52, 0xBC, 0xC2, /* 0x48-0x4B */ + 0xBC, 0xC3, 0xBC, 0xC4, 0x99, 0x53, 0x99, 0x54, /* 0x4C-0x4F */ + 0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x50-0x53 */ + 0xBC, 0xC5, 0xBC, 0xC6, 0x99, 0x59, 0x99, 0x5A, /* 0x54-0x57 */ + 0xBC, 0xC7, 0x99, 0x61, 0x99, 0x62, 0x99, 0x63, /* 0x58-0x5B */ + 0xBC, 0xC8, 0x99, 0x64, 0x99, 0x65, 0x99, 0x66, /* 0x5C-0x5F */ + 0x99, 0x67, 0x99, 0x68, 0x99, 0x69, 0x99, 0x6A, /* 0x60-0x63 */ + 0xBC, 0xC9, 0xBC, 0xCA, 0x99, 0x6B, 0xBC, 0xCB, /* 0x64-0x67 */ + 0xBC, 0xCC, 0xBC, 0xCD, 0x99, 0x6C, 0x99, 0x6D, /* 0x68-0x6B */ + 0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0x99, 0x71, /* 0x6C-0x6F */ + 0xBC, 0xCE, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x70-0x73 */ + 0xBC, 0xCF, 0x99, 0x75, 0x99, 0x76, 0x99, 0x77, /* 0x74-0x77 */ + 0xBC, 0xD0, 0x99, 0x78, 0x99, 0x79, 0x99, 0x7A, /* 0x78-0x7B */ + 0x99, 0x81, 0x99, 0x82, 0x99, 0x83, 0x99, 0x84, /* 0x7C-0x7F */ + + 0x99, 0x85, 0x99, 0x86, 0x99, 0x87, 0x99, 0x88, /* 0x80-0x83 */ + 0x99, 0x89, 0xBC, 0xD1, 0x99, 0x8A, 0x99, 0x8B, /* 0x84-0x87 */ + 0x99, 0x8C, 0x99, 0x8D, 0x99, 0x8E, 0x99, 0x8F, /* 0x88-0x8B */ + 0xBC, 0xD2, 0xBC, 0xD3, 0xBC, 0xD4, 0x99, 0x90, /* 0x8C-0x8F */ + 0xBC, 0xD5, 0x99, 0x91, 0x99, 0x92, 0x99, 0x93, /* 0x90-0x93 */ + 0xBC, 0xD6, 0x99, 0x94, 0xBC, 0xD7, 0x99, 0x95, /* 0x94-0x97 */ + 0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0x98-0x9B */ + 0xBC, 0xD8, 0xBC, 0xD9, 0x99, 0x9A, 0xBC, 0xDA, /* 0x9C-0x9F */ + 0x99, 0x9B, 0xBC, 0xDB, 0x99, 0x9C, 0x99, 0x9D, /* 0xA0-0xA3 */ + 0x99, 0x9E, 0xBC, 0xDC, 0x99, 0x9F, 0x99, 0xA0, /* 0xA4-0xA7 */ + 0xBC, 0xDD, 0xBC, 0xDE, 0x99, 0xA1, 0x99, 0xA2, /* 0xA8-0xAB */ + 0xBC, 0xDF, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xAC-0xAF */ + 0xBC, 0xE0, 0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, /* 0xB0-0xB3 */ + 0x99, 0xA9, 0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, /* 0xB4-0xB7 */ + 0x99, 0xAD, 0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, /* 0xB8-0xBB */ + 0x99, 0xB1, 0xBC, 0xE1, 0x99, 0xB2, 0x99, 0xB3, /* 0xBC-0xBF */ + 0x99, 0xB4, 0x99, 0xB5, 0x99, 0xB6, 0x99, 0xB7, /* 0xC0-0xC3 */ + 0xBC, 0xE2, 0x99, 0xB8, 0x99, 0xB9, 0x99, 0xBA, /* 0xC4-0xC7 */ + 0xBC, 0xE3, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xC8-0xCB */ + 0xBC, 0xE4, 0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, /* 0xCC-0xCF */ + 0x99, 0xC1, 0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, /* 0xD0-0xD3 */ + 0xBC, 0xE5, 0x99, 0xC5, 0x99, 0xC6, 0xBC, 0xE6, /* 0xD4-0xD7 */ + 0xBC, 0xE7, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xD8-0xDB */ + 0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xDC-0xDF */ + 0xBC, 0xE8, 0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, /* 0xE0-0xE3 */ + 0xBC, 0xE9, 0x99, 0xD1, 0x99, 0xD2, 0x99, 0xD3, /* 0xE4-0xE7 */ + 0xBC, 0xEA, 0x99, 0xD4, 0x99, 0xD5, 0x99, 0xD6, /* 0xE8-0xEB */ + 0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, 0x99, 0xDA, /* 0xEC-0xEF */ + 0xBC, 0xEB, 0xBC, 0xEC, 0x99, 0xDB, 0xBC, 0xED, /* 0xF0-0xF3 */ + 0x99, 0xDC, 0x99, 0xDD, 0x99, 0xDE, 0x99, 0xDF, /* 0xF4-0xF7 */ + 0x99, 0xE0, 0x99, 0xE1, 0x99, 0xE2, 0x99, 0xE3, /* 0xF8-0xFB */ + 0xBC, 0xEE, 0xBC, 0xEF, 0x99, 0xE4, 0x99, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C2[512] = { + 0xBC, 0xF0, 0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, /* 0x00-0x03 */ + 0xBC, 0xF1, 0x99, 0xE9, 0x99, 0xEA, 0x99, 0xEB, /* 0x04-0x07 */ + 0x99, 0xEC, 0x99, 0xED, 0x99, 0xEE, 0x99, 0xEF, /* 0x08-0x0B */ + 0xBC, 0xF2, 0xBC, 0xF3, 0x99, 0xF0, 0xBC, 0xF4, /* 0x0C-0x0F */ + 0x99, 0xF1, 0xBC, 0xF5, 0x99, 0xF2, 0x99, 0xF3, /* 0x10-0x13 */ + 0x99, 0xF4, 0x99, 0xF5, 0x99, 0xF6, 0x99, 0xF7, /* 0x14-0x17 */ + 0xBC, 0xF6, 0xBC, 0xF7, 0x99, 0xF8, 0x99, 0xF9, /* 0x18-0x1B */ + 0xBC, 0xF8, 0x99, 0xFA, 0x99, 0xFB, 0xBC, 0xF9, /* 0x1C-0x1F */ + 0xBC, 0xFA, 0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, /* 0x20-0x23 */ + 0x9A, 0x41, 0x9A, 0x42, 0x9A, 0x43, 0x9A, 0x44, /* 0x24-0x27 */ + 0xBC, 0xFB, 0xBC, 0xFC, 0x9A, 0x45, 0xBC, 0xFD, /* 0x28-0x2B */ + 0x9A, 0x46, 0xBC, 0xFE, 0x9A, 0x47, 0xBD, 0xA1, /* 0x2C-0x2F */ + 0x9A, 0x48, 0xBD, 0xA2, 0xBD, 0xA3, 0x9A, 0x49, /* 0x30-0x33 */ + 0xBD, 0xA4, 0x9A, 0x4A, 0x9A, 0x4B, 0x9A, 0x4C, /* 0x34-0x37 */ + 0x9A, 0x4D, 0x9A, 0x4E, 0x9A, 0x4F, 0x9A, 0x50, /* 0x38-0x3B */ + 0x9A, 0x51, 0x9A, 0x52, 0x9A, 0x53, 0x9A, 0x54, /* 0x3C-0x3F */ + 0x9A, 0x55, 0x9A, 0x56, 0x9A, 0x57, 0x9A, 0x58, /* 0x40-0x43 */ + 0x9A, 0x59, 0x9A, 0x5A, 0x9A, 0x61, 0x9A, 0x62, /* 0x44-0x47 */ + 0xBD, 0xA5, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x48-0x4B */ + 0x9A, 0x66, 0x9A, 0x67, 0x9A, 0x68, 0x9A, 0x69, /* 0x4C-0x4F */ + 0xBD, 0xA6, 0xBD, 0xA7, 0x9A, 0x6A, 0x9A, 0x6B, /* 0x50-0x53 */ + 0xBD, 0xA8, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x54-0x57 */ + 0xBD, 0xA9, 0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, /* 0x58-0x5B */ + 0x9A, 0x72, 0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, /* 0x5C-0x5F */ + 0xBD, 0xAA, 0x9A, 0x76, 0x9A, 0x77, 0x9A, 0x78, /* 0x60-0x63 */ + 0x9A, 0x79, 0xBD, 0xAB, 0x9A, 0x7A, 0x9A, 0x81, /* 0x64-0x67 */ + 0x9A, 0x82, 0x9A, 0x83, 0x9A, 0x84, 0x9A, 0x85, /* 0x68-0x6B */ + 0xBD, 0xAC, 0xBD, 0xAD, 0x9A, 0x86, 0x9A, 0x87, /* 0x6C-0x6F */ + 0xBD, 0xAE, 0x9A, 0x88, 0x9A, 0x89, 0x9A, 0x8A, /* 0x70-0x73 */ + 0xBD, 0xAF, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x74-0x77 */ + 0x9A, 0x8E, 0x9A, 0x8F, 0x9A, 0x90, 0x9A, 0x91, /* 0x78-0x7B */ + 0xBD, 0xB0, 0xBD, 0xB1, 0x9A, 0x92, 0xBD, 0xB2, /* 0x7C-0x7F */ + + 0x9A, 0x93, 0xBD, 0xB3, 0x9A, 0x94, 0x9A, 0x95, /* 0x80-0x83 */ + 0x9A, 0x96, 0x9A, 0x97, 0x9A, 0x98, 0x9A, 0x99, /* 0x84-0x87 */ + 0xBD, 0xB4, 0xBD, 0xB5, 0x9A, 0x9A, 0x9A, 0x9B, /* 0x88-0x8B */ + 0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0x9A, 0x9F, /* 0x8C-0x8F */ + 0xBD, 0xB6, 0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, /* 0x90-0x93 */ + 0x9A, 0xA3, 0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, /* 0x94-0x97 */ + 0xBD, 0xB7, 0x9A, 0xA7, 0x9A, 0xA8, 0xBD, 0xB8, /* 0x98-0x9B */ + 0x9A, 0xA9, 0xBD, 0xB9, 0x9A, 0xAA, 0x9A, 0xAB, /* 0x9C-0x9F */ + 0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0x9A, 0xAF, /* 0xA0-0xA3 */ + 0xBD, 0xBA, 0xBD, 0xBB, 0x9A, 0xB0, 0x9A, 0xB1, /* 0xA4-0xA7 */ + 0xBD, 0xBC, 0x9A, 0xB2, 0x9A, 0xB3, 0x9A, 0xB4, /* 0xA8-0xAB */ + 0xBD, 0xBD, 0xBD, 0xBE, 0x9A, 0xB5, 0x9A, 0xB6, /* 0xAC-0xAF */ + 0x9A, 0xB7, 0x9A, 0xB8, 0x9A, 0xB9, 0x9A, 0xBA, /* 0xB0-0xB3 */ + 0xBD, 0xBF, 0xBD, 0xC0, 0x9A, 0xBB, 0xBD, 0xC1, /* 0xB4-0xB7 */ + 0x9A, 0xBC, 0xBD, 0xC2, 0x9A, 0xBD, 0x9A, 0xBE, /* 0xB8-0xBB */ + 0x9A, 0xBF, 0x9A, 0xC0, 0x9A, 0xC1, 0x9A, 0xC2, /* 0xBC-0xBF */ + 0x9A, 0xC3, 0x9A, 0xC4, 0x9A, 0xC5, 0x9A, 0xC6, /* 0xC0-0xC3 */ + 0x9A, 0xC7, 0x9A, 0xC8, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xC4-0xC7 */ + 0x9A, 0xCB, 0x9A, 0xCC, 0x9A, 0xCD, 0x9A, 0xCE, /* 0xC8-0xCB */ + 0x9A, 0xCF, 0x9A, 0xD0, 0x9A, 0xD1, 0x9A, 0xD2, /* 0xCC-0xCF */ + 0x9A, 0xD3, 0x9A, 0xD4, 0x9A, 0xD5, 0x9A, 0xD6, /* 0xD0-0xD3 */ + 0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, 0x9A, 0xDA, /* 0xD4-0xD7 */ + 0x9A, 0xDB, 0x9A, 0xDC, 0x9A, 0xDD, 0x9A, 0xDE, /* 0xD8-0xDB */ + 0xBD, 0xC3, 0xBD, 0xC4, 0x9A, 0xDF, 0x9A, 0xE0, /* 0xDC-0xDF */ + 0xBD, 0xC5, 0x9A, 0xE1, 0x9A, 0xE2, 0xBD, 0xC6, /* 0xE0-0xE3 */ + 0xBD, 0xC7, 0x9A, 0xE3, 0x9A, 0xE4, 0x9A, 0xE5, /* 0xE4-0xE7 */ + 0x9A, 0xE6, 0x9A, 0xE7, 0x9A, 0xE8, 0xBD, 0xC8, /* 0xE8-0xEB */ + 0xBD, 0xC9, 0xBD, 0xCA, 0x9A, 0xE9, 0xBD, 0xCB, /* 0xEC-0xEF */ + 0x9A, 0xEA, 0xBD, 0xCC, 0x9A, 0xEB, 0x9A, 0xEC, /* 0xF0-0xF3 */ + 0x9A, 0xED, 0x9A, 0xEE, 0xBD, 0xCD, 0x9A, 0xEF, /* 0xF4-0xF7 */ + 0xBD, 0xCE, 0xBD, 0xCF, 0x9A, 0xF0, 0xBD, 0xD0, /* 0xF8-0xFB */ + 0xBD, 0xD1, 0x9A, 0xF1, 0x9A, 0xF2, 0x9A, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C3[512] = { + 0xBD, 0xD2, 0x9A, 0xF4, 0x9A, 0xF5, 0x9A, 0xF6, /* 0x00-0x03 */ + 0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, 0x9A, 0xFA, /* 0x04-0x07 */ + 0xBD, 0xD3, 0xBD, 0xD4, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x08-0x0B */ + 0xBD, 0xD5, 0xBD, 0xD6, 0x9A, 0xFD, 0x9A, 0xFE, /* 0x0C-0x0F */ + 0x9B, 0x41, 0x9B, 0x42, 0x9B, 0x43, 0xBD, 0xD7, /* 0x10-0x13 */ + 0xBD, 0xD8, 0xBD, 0xD9, 0x9B, 0x44, 0x9B, 0x45, /* 0x14-0x17 */ + 0xBD, 0xDA, 0x9B, 0x46, 0x9B, 0x47, 0x9B, 0x48, /* 0x18-0x1B */ + 0xBD, 0xDB, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x1C-0x1F */ + 0x9B, 0x4C, 0x9B, 0x4D, 0x9B, 0x4E, 0x9B, 0x4F, /* 0x20-0x23 */ + 0xBD, 0xDC, 0xBD, 0xDD, 0x9B, 0x50, 0x9B, 0x51, /* 0x24-0x27 */ + 0xBD, 0xDE, 0xBD, 0xDF, 0x9B, 0x52, 0x9B, 0x53, /* 0x28-0x2B */ + 0x9B, 0x54, 0x9B, 0x55, 0x9B, 0x56, 0x9B, 0x57, /* 0x2C-0x2F */ + 0x9B, 0x58, 0x9B, 0x59, 0x9B, 0x5A, 0x9B, 0x61, /* 0x30-0x33 */ + 0x9B, 0x62, 0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, /* 0x34-0x37 */ + 0x9B, 0x66, 0x9B, 0x67, 0x9B, 0x68, 0x9B, 0x69, /* 0x38-0x3B */ + 0x9B, 0x6A, 0x9B, 0x6B, 0x9B, 0x6C, 0x9B, 0x6D, /* 0x3C-0x3F */ + 0x9B, 0x6E, 0x9B, 0x6F, 0x9B, 0x70, 0x9B, 0x71, /* 0x40-0x43 */ + 0x9B, 0x72, 0xBD, 0xE0, 0x9B, 0x73, 0x9B, 0x74, /* 0x44-0x47 */ + 0x9B, 0x75, 0x9B, 0x76, 0x9B, 0x77, 0x9B, 0x78, /* 0x48-0x4B */ + 0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x81, 0x9B, 0x82, /* 0x4C-0x4F */ + 0x9B, 0x83, 0x9B, 0x84, 0x9B, 0x85, 0x9B, 0x86, /* 0x50-0x53 */ + 0x9B, 0x87, 0x9B, 0x88, 0x9B, 0x89, 0x9B, 0x8A, /* 0x54-0x57 */ + 0x9B, 0x8B, 0x9B, 0x8C, 0x9B, 0x8D, 0x9B, 0x8E, /* 0x58-0x5B */ + 0x9B, 0x8F, 0x9B, 0x90, 0x9B, 0x91, 0x9B, 0x92, /* 0x5C-0x5F */ + 0x9B, 0x93, 0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, /* 0x60-0x63 */ + 0x9B, 0x97, 0x9B, 0x98, 0x9B, 0x99, 0x9B, 0x9A, /* 0x64-0x67 */ + 0xBD, 0xE1, 0xBD, 0xE2, 0x9B, 0x9B, 0x9B, 0x9C, /* 0x68-0x6B */ + 0xBD, 0xE3, 0x9B, 0x9D, 0x9B, 0x9E, 0x9B, 0x9F, /* 0x6C-0x6F */ + 0xBD, 0xE4, 0x9B, 0xA0, 0xBD, 0xE5, 0x9B, 0xA1, /* 0x70-0x73 */ + 0x9B, 0xA2, 0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, /* 0x74-0x77 */ + 0xBD, 0xE6, 0xBD, 0xE7, 0x9B, 0xA6, 0x9B, 0xA7, /* 0x78-0x7B */ + 0xBD, 0xE8, 0xBD, 0xE9, 0x9B, 0xA8, 0x9B, 0xA9, /* 0x7C-0x7F */ + + 0x9B, 0xAA, 0x9B, 0xAB, 0x9B, 0xAC, 0x9B, 0xAD, /* 0x80-0x83 */ + 0xBD, 0xEA, 0x9B, 0xAE, 0x9B, 0xAF, 0x9B, 0xB0, /* 0x84-0x87 */ + 0xBD, 0xEB, 0x9B, 0xB1, 0x9B, 0xB2, 0x9B, 0xB3, /* 0x88-0x8B */ + 0xBD, 0xEC, 0x9B, 0xB4, 0x9B, 0xB5, 0x9B, 0xB6, /* 0x8C-0x8F */ + 0x9B, 0xB7, 0x9B, 0xB8, 0x9B, 0xB9, 0x9B, 0xBA, /* 0x90-0x93 */ + 0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, 0x9B, 0xBE, /* 0x94-0x97 */ + 0x9B, 0xBF, 0x9B, 0xC0, 0x9B, 0xC1, 0x9B, 0xC2, /* 0x98-0x9B */ + 0x9B, 0xC3, 0x9B, 0xC4, 0x9B, 0xC5, 0x9B, 0xC6, /* 0x9C-0x9F */ + 0x9B, 0xC7, 0x9B, 0xC8, 0x9B, 0xC9, 0x9B, 0xCA, /* 0xA0-0xA3 */ + 0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0xA4-0xA7 */ + 0x9B, 0xCF, 0x9B, 0xD0, 0x9B, 0xD1, 0x9B, 0xD2, /* 0xA8-0xAB */ + 0x9B, 0xD3, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0xAC-0xAF */ + 0x9B, 0xD7, 0x9B, 0xD8, 0x9B, 0xD9, 0x9B, 0xDA, /* 0xB0-0xB3 */ + 0x9B, 0xDB, 0x9B, 0xDC, 0x9B, 0xDD, 0x9B, 0xDE, /* 0xB4-0xB7 */ + 0x9B, 0xDF, 0x9B, 0xE0, 0x9B, 0xE1, 0x9B, 0xE2, /* 0xB8-0xBB */ + 0x9B, 0xE3, 0x9B, 0xE4, 0x9B, 0xE5, 0x9B, 0xE6, /* 0xBC-0xBF */ + 0xBD, 0xED, 0x9B, 0xE7, 0x9B, 0xE8, 0x9B, 0xE9, /* 0xC0-0xC3 */ + 0x9B, 0xEA, 0x9B, 0xEB, 0x9B, 0xEC, 0x9B, 0xED, /* 0xC4-0xC7 */ + 0x9B, 0xEE, 0x9B, 0xEF, 0x9B, 0xF0, 0x9B, 0xF1, /* 0xC8-0xCB */ + 0x9B, 0xF2, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xCC-0xCF */ + 0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0x9B, 0xF9, /* 0xD0-0xD3 */ + 0x9B, 0xFA, 0x9B, 0xFB, 0x9B, 0xFC, 0x9B, 0xFD, /* 0xD4-0xD7 */ + 0xBD, 0xEE, 0xBD, 0xEF, 0x9B, 0xFE, 0x9C, 0x41, /* 0xD8-0xDB */ + 0xBD, 0xF0, 0x9C, 0x42, 0x9C, 0x43, 0xBD, 0xF1, /* 0xDC-0xDF */ + 0xBD, 0xF2, 0x9C, 0x44, 0xBD, 0xF3, 0x9C, 0x45, /* 0xE0-0xE3 */ + 0x9C, 0x46, 0x9C, 0x47, 0x9C, 0x48, 0x9C, 0x49, /* 0xE4-0xE7 */ + 0xBD, 0xF4, 0xBD, 0xF5, 0x9C, 0x4A, 0x9C, 0x4B, /* 0xE8-0xEB */ + 0x9C, 0x4C, 0xBD, 0xF6, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xEC-0xEF */ + 0x9C, 0x4F, 0x9C, 0x50, 0x9C, 0x51, 0x9C, 0x52, /* 0xF0-0xF3 */ + 0xBD, 0xF7, 0xBD, 0xF8, 0x9C, 0x53, 0x9C, 0x54, /* 0xF4-0xF7 */ + 0xBD, 0xF9, 0x9C, 0x55, 0x9C, 0x56, 0x9C, 0x57, /* 0xF8-0xFB */ + 0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0x9C, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C4[512] = { + 0x9C, 0x62, 0x9C, 0x63, 0x9C, 0x64, 0x9C, 0x65, /* 0x00-0x03 */ + 0x9C, 0x66, 0x9C, 0x67, 0x9C, 0x68, 0x9C, 0x69, /* 0x04-0x07 */ + 0xBD, 0xFA, 0x9C, 0x6A, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x08-0x0B */ + 0x9C, 0x6D, 0x9C, 0x6E, 0x9C, 0x6F, 0x9C, 0x70, /* 0x0C-0x0F */ + 0xBD, 0xFB, 0x9C, 0x71, 0x9C, 0x72, 0x9C, 0x73, /* 0x10-0x13 */ + 0x9C, 0x74, 0x9C, 0x75, 0x9C, 0x76, 0x9C, 0x77, /* 0x14-0x17 */ + 0x9C, 0x78, 0x9C, 0x79, 0x9C, 0x7A, 0x9C, 0x81, /* 0x18-0x1B */ + 0x9C, 0x82, 0x9C, 0x83, 0x9C, 0x84, 0x9C, 0x85, /* 0x1C-0x1F */ + 0x9C, 0x86, 0x9C, 0x87, 0x9C, 0x88, 0x9C, 0x89, /* 0x20-0x23 */ + 0xBD, 0xFC, 0x9C, 0x8A, 0x9C, 0x8B, 0x9C, 0x8C, /* 0x24-0x27 */ + 0x9C, 0x8D, 0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, /* 0x28-0x2B */ + 0xBD, 0xFD, 0x9C, 0x91, 0x9C, 0x92, 0x9C, 0x93, /* 0x2C-0x2F */ + 0xBD, 0xFE, 0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, /* 0x30-0x33 */ + 0xBE, 0xA1, 0x9C, 0x97, 0x9C, 0x98, 0x9C, 0x99, /* 0x34-0x37 */ + 0x9C, 0x9A, 0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, /* 0x38-0x3B */ + 0xBE, 0xA2, 0xBE, 0xA3, 0x9C, 0x9E, 0x9C, 0x9F, /* 0x3C-0x3F */ + 0x9C, 0xA0, 0x9C, 0xA1, 0x9C, 0xA2, 0x9C, 0xA3, /* 0x40-0x43 */ + 0x9C, 0xA4, 0x9C, 0xA5, 0x9C, 0xA6, 0x9C, 0xA7, /* 0x44-0x47 */ + 0xBE, 0xA4, 0x9C, 0xA8, 0x9C, 0xA9, 0x9C, 0xAA, /* 0x48-0x4B */ + 0x9C, 0xAB, 0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, /* 0x4C-0x4F */ + 0x9C, 0xAF, 0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, /* 0x50-0x53 */ + 0x9C, 0xB3, 0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, /* 0x54-0x57 */ + 0x9C, 0xB7, 0x9C, 0xB8, 0x9C, 0xB9, 0x9C, 0xBA, /* 0x58-0x5B */ + 0x9C, 0xBB, 0x9C, 0xBC, 0x9C, 0xBD, 0x9C, 0xBE, /* 0x5C-0x5F */ + 0x9C, 0xBF, 0x9C, 0xC0, 0x9C, 0xC1, 0x9C, 0xC2, /* 0x60-0x63 */ + 0xBE, 0xA5, 0xBE, 0xA6, 0x9C, 0xC3, 0x9C, 0xC4, /* 0x64-0x67 */ + 0xBE, 0xA7, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x68-0x6B */ + 0xBE, 0xA8, 0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, /* 0x6C-0x6F */ + 0x9C, 0xCB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x70-0x73 */ + 0xBE, 0xA9, 0xBE, 0xAA, 0x9C, 0xCF, 0x9C, 0xD0, /* 0x74-0x77 */ + 0x9C, 0xD1, 0xBE, 0xAB, 0x9C, 0xD2, 0x9C, 0xD3, /* 0x78-0x7B */ + 0x9C, 0xD4, 0x9C, 0xD5, 0x9C, 0xD6, 0x9C, 0xD7, /* 0x7C-0x7F */ + + 0xBE, 0xAC, 0x9C, 0xD8, 0x9C, 0xD9, 0x9C, 0xDA, /* 0x80-0x83 */ + 0x9C, 0xDB, 0x9C, 0xDC, 0x9C, 0xDD, 0x9C, 0xDE, /* 0x84-0x87 */ + 0x9C, 0xDF, 0x9C, 0xE0, 0x9C, 0xE1, 0x9C, 0xE2, /* 0x88-0x8B */ + 0x9C, 0xE3, 0x9C, 0xE4, 0x9C, 0xE5, 0x9C, 0xE6, /* 0x8C-0x8F */ + 0x9C, 0xE7, 0x9C, 0xE8, 0x9C, 0xE9, 0x9C, 0xEA, /* 0x90-0x93 */ + 0xBE, 0xAD, 0x9C, 0xEB, 0x9C, 0xEC, 0x9C, 0xED, /* 0x94-0x97 */ + 0x9C, 0xEE, 0x9C, 0xEF, 0x9C, 0xF0, 0x9C, 0xF1, /* 0x98-0x9B */ + 0xBE, 0xAE, 0x9C, 0xF2, 0x9C, 0xF3, 0x9C, 0xF4, /* 0x9C-0x9F */ + 0x9C, 0xF5, 0x9C, 0xF6, 0x9C, 0xF7, 0x9C, 0xF8, /* 0xA0-0xA3 */ + 0x9C, 0xF9, 0x9C, 0xFA, 0x9C, 0xFB, 0x9C, 0xFC, /* 0xA4-0xA7 */ + 0x9C, 0xFD, 0x9C, 0xFE, 0x9D, 0x41, 0x9D, 0x42, /* 0xA8-0xAB */ + 0x9D, 0x43, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xAC-0xAF */ + 0x9D, 0x47, 0x9D, 0x48, 0x9D, 0x49, 0x9D, 0x4A, /* 0xB0-0xB3 */ + 0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, 0x9D, 0x4E, /* 0xB4-0xB7 */ + 0xBE, 0xAF, 0x9D, 0x4F, 0x9D, 0x50, 0x9D, 0x51, /* 0xB8-0xBB */ + 0xBE, 0xB0, 0x9D, 0x52, 0x9D, 0x53, 0x9D, 0x54, /* 0xBC-0xBF */ + 0x9D, 0x55, 0x9D, 0x56, 0x9D, 0x57, 0x9D, 0x58, /* 0xC0-0xC3 */ + 0x9D, 0x59, 0x9D, 0x5A, 0x9D, 0x61, 0x9D, 0x62, /* 0xC4-0xC7 */ + 0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0xC8-0xCB */ + 0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, 0x9D, 0x6A, /* 0xCC-0xCF */ + 0x9D, 0x6B, 0x9D, 0x6C, 0x9D, 0x6D, 0x9D, 0x6E, /* 0xD0-0xD3 */ + 0x9D, 0x6F, 0x9D, 0x70, 0x9D, 0x71, 0x9D, 0x72, /* 0xD4-0xD7 */ + 0x9D, 0x73, 0x9D, 0x74, 0x9D, 0x75, 0x9D, 0x76, /* 0xD8-0xDB */ + 0x9D, 0x77, 0x9D, 0x78, 0x9D, 0x79, 0x9D, 0x7A, /* 0xDC-0xDF */ + 0x9D, 0x81, 0x9D, 0x82, 0x9D, 0x83, 0x9D, 0x84, /* 0xE0-0xE3 */ + 0x9D, 0x85, 0x9D, 0x86, 0x9D, 0x87, 0x9D, 0x88, /* 0xE4-0xE7 */ + 0x9D, 0x89, 0xBE, 0xB1, 0x9D, 0x8A, 0x9D, 0x8B, /* 0xE8-0xEB */ + 0x9D, 0x8C, 0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, /* 0xEC-0xEF */ + 0xBE, 0xB2, 0xBE, 0xB3, 0x9D, 0x90, 0x9D, 0x91, /* 0xF0-0xF3 */ + 0xBE, 0xB4, 0x9D, 0x92, 0x9D, 0x93, 0x9D, 0x94, /* 0xF4-0xF7 */ + 0xBE, 0xB5, 0x9D, 0x95, 0xBE, 0xB6, 0x9D, 0x96, /* 0xF8-0xFB */ + 0x9D, 0x97, 0x9D, 0x98, 0x9D, 0x99, 0xBE, 0xB7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C5[512] = { + 0xBE, 0xB8, 0xBE, 0xB9, 0x9D, 0x9A, 0x9D, 0x9B, /* 0x00-0x03 */ + 0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x04-0x07 */ + 0x9D, 0xA0, 0x9D, 0xA1, 0x9D, 0xA2, 0x9D, 0xA3, /* 0x08-0x0B */ + 0xBE, 0xBA, 0x9D, 0xA4, 0x9D, 0xA5, 0x9D, 0xA6, /* 0x0C-0x0F */ + 0xBE, 0xBB, 0x9D, 0xA7, 0x9D, 0xA8, 0x9D, 0xA9, /* 0x10-0x13 */ + 0xBE, 0xBC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x14-0x17 */ + 0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x18-0x1B */ + 0xBE, 0xBD, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x1C-0x1F */ + 0x9D, 0xB4, 0x9D, 0xB5, 0x9D, 0xB6, 0x9D, 0xB7, /* 0x20-0x23 */ + 0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, 0x9D, 0xBB, /* 0x24-0x27 */ + 0xBE, 0xBE, 0xBE, 0xBF, 0x9D, 0xBC, 0x9D, 0xBD, /* 0x28-0x2B */ + 0xBE, 0xC0, 0x9D, 0xBE, 0x9D, 0xBF, 0x9D, 0xC0, /* 0x2C-0x2F */ + 0xBE, 0xC1, 0x9D, 0xC1, 0x9D, 0xC2, 0x9D, 0xC3, /* 0x30-0x33 */ + 0x9D, 0xC4, 0x9D, 0xC5, 0x9D, 0xC6, 0x9D, 0xC7, /* 0x34-0x37 */ + 0xBE, 0xC2, 0xBE, 0xC3, 0x9D, 0xC8, 0xBE, 0xC4, /* 0x38-0x3B */ + 0x9D, 0xC9, 0xBE, 0xC5, 0x9D, 0xCA, 0x9D, 0xCB, /* 0x3C-0x3F */ + 0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0x40-0x43 */ + 0xBE, 0xC6, 0xBE, 0xC7, 0x9D, 0xD0, 0x9D, 0xD1, /* 0x44-0x47 */ + 0xBE, 0xC8, 0xBE, 0xC9, 0xBE, 0xCA, 0x9D, 0xD2, /* 0x48-0x4B */ + 0xBE, 0xCB, 0xBE, 0xCC, 0xBE, 0xCD, 0x9D, 0xD3, /* 0x4C-0x4F */ + 0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xBE, 0xCE, /* 0x50-0x53 */ + 0xBE, 0xCF, 0xBE, 0xD0, 0x9D, 0xD7, 0xBE, 0xD1, /* 0x54-0x57 */ + 0xBE, 0xD2, 0xBE, 0xD3, 0x9D, 0xD8, 0x9D, 0xD9, /* 0x58-0x5B */ + 0x9D, 0xDA, 0xBE, 0xD4, 0xBE, 0xD5, 0x9D, 0xDB, /* 0x5C-0x5F */ + 0xBE, 0xD6, 0xBE, 0xD7, 0x9D, 0xDC, 0x9D, 0xDD, /* 0x60-0x63 */ + 0xBE, 0xD8, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0x64-0x67 */ + 0xBE, 0xD9, 0x9D, 0xE1, 0x9D, 0xE2, 0x9D, 0xE3, /* 0x68-0x6B */ + 0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, 0x9D, 0xE7, /* 0x6C-0x6F */ + 0xBE, 0xDA, 0xBE, 0xDB, 0x9D, 0xE8, 0xBE, 0xDC, /* 0x70-0x73 */ + 0xBE, 0xDD, 0xBE, 0xDE, 0x9D, 0xE9, 0x9D, 0xEA, /* 0x74-0x77 */ + 0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, 0x9D, 0xEE, /* 0x78-0x7B */ + 0xBE, 0xDF, 0xBE, 0xE0, 0x9D, 0xEF, 0x9D, 0xF0, /* 0x7C-0x7F */ + + 0xBE, 0xE1, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0x80-0x83 */ + 0xBE, 0xE2, 0x9D, 0xF4, 0x9D, 0xF5, 0xBE, 0xE3, /* 0x84-0x87 */ + 0x9D, 0xF6, 0x9D, 0xF7, 0x9D, 0xF8, 0x9D, 0xF9, /* 0x88-0x8B */ + 0xBE, 0xE4, 0xBE, 0xE5, 0x9D, 0xFA, 0xBE, 0xE6, /* 0x8C-0x8F */ + 0x9D, 0xFB, 0xBE, 0xE7, 0x9D, 0xFC, 0x9D, 0xFD, /* 0x90-0x93 */ + 0x9D, 0xFE, 0xBE, 0xE8, 0x9E, 0x41, 0xBE, 0xE9, /* 0x94-0x97 */ + 0xBE, 0xEA, 0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, /* 0x98-0x9B */ + 0xBE, 0xEB, 0x9E, 0x45, 0x9E, 0x46, 0x9E, 0x47, /* 0x9C-0x9F */ + 0xBE, 0xEC, 0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, /* 0xA0-0xA3 */ + 0x9E, 0x4B, 0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, /* 0xA4-0xA7 */ + 0x9E, 0x4F, 0xBE, 0xED, 0x9E, 0x50, 0x9E, 0x51, /* 0xA8-0xAB */ + 0x9E, 0x52, 0x9E, 0x53, 0x9E, 0x54, 0x9E, 0x55, /* 0xAC-0xAF */ + 0x9E, 0x56, 0x9E, 0x57, 0x9E, 0x58, 0x9E, 0x59, /* 0xB0-0xB3 */ + 0xBE, 0xEE, 0xBE, 0xEF, 0x9E, 0x5A, 0x9E, 0x61, /* 0xB4-0xB7 */ + 0xBE, 0xF0, 0xBE, 0xF1, 0x9E, 0x62, 0xBE, 0xF2, /* 0xB8-0xBB */ + 0xBE, 0xF3, 0xBE, 0xF4, 0xBE, 0xF5, 0x9E, 0x63, /* 0xBC-0xBF */ + 0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0xC0-0xC3 */ + 0xBE, 0xF6, 0xBE, 0xF7, 0xBE, 0xF8, 0xBE, 0xF9, /* 0xC4-0xC7 */ + 0xBE, 0xFA, 0xBE, 0xFB, 0xBE, 0xFC, 0x9E, 0x68, /* 0xC8-0xCB */ + 0xBE, 0xFD, 0x9E, 0x69, 0xBE, 0xFE, 0x9E, 0x6A, /* 0xCC-0xCF */ + 0xBF, 0xA1, 0xBF, 0xA2, 0x9E, 0x6B, 0x9E, 0x6C, /* 0xD0-0xD3 */ + 0xBF, 0xA3, 0x9E, 0x6D, 0x9E, 0x6E, 0x9E, 0x6F, /* 0xD4-0xD7 */ + 0xBF, 0xA4, 0x9E, 0x70, 0x9E, 0x71, 0x9E, 0x72, /* 0xD8-0xDB */ + 0x9E, 0x73, 0x9E, 0x74, 0x9E, 0x75, 0x9E, 0x76, /* 0xDC-0xDF */ + 0xBF, 0xA5, 0xBF, 0xA6, 0x9E, 0x77, 0xBF, 0xA7, /* 0xE0-0xE3 */ + 0x9E, 0x78, 0xBF, 0xA8, 0x9E, 0x79, 0x9E, 0x7A, /* 0xE4-0xE7 */ + 0x9E, 0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0xE8-0xEB */ + 0xBF, 0xA9, 0xBF, 0xAA, 0xBF, 0xAB, 0x9E, 0x85, /* 0xEC-0xEF */ + 0xBF, 0xAC, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0xF0-0xF3 */ + 0xBF, 0xAD, 0x9E, 0x89, 0xBF, 0xAE, 0xBF, 0xAF, /* 0xF4-0xF7 */ + 0x9E, 0x8A, 0x9E, 0x8B, 0x9E, 0x8C, 0x9E, 0x8D, /* 0xF8-0xFB */ + 0xBF, 0xB0, 0xBF, 0xB1, 0xBF, 0xB2, 0xBF, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C6[512] = { + 0xBF, 0xB4, 0xBF, 0xB5, 0x9E, 0x8E, 0x9E, 0x8F, /* 0x00-0x03 */ + 0x9E, 0x90, 0xBF, 0xB6, 0xBF, 0xB7, 0xBF, 0xB8, /* 0x04-0x07 */ + 0xBF, 0xB9, 0x9E, 0x91, 0x9E, 0x92, 0x9E, 0x93, /* 0x08-0x0B */ + 0xBF, 0xBA, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x0C-0x0F */ + 0xBF, 0xBB, 0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, /* 0x10-0x13 */ + 0x9E, 0x9A, 0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, /* 0x14-0x17 */ + 0xBF, 0xBC, 0xBF, 0xBD, 0x9E, 0x9E, 0xBF, 0xBE, /* 0x18-0x1B */ + 0xBF, 0xBF, 0x9E, 0x9F, 0x9E, 0xA0, 0x9E, 0xA1, /* 0x1C-0x1F */ + 0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, 0x9E, 0xA5, /* 0x20-0x23 */ + 0xBF, 0xC0, 0xBF, 0xC1, 0x9E, 0xA6, 0x9E, 0xA7, /* 0x24-0x27 */ + 0xBF, 0xC2, 0x9E, 0xA8, 0x9E, 0xA9, 0x9E, 0xAA, /* 0x28-0x2B */ + 0xBF, 0xC3, 0xBF, 0xC4, 0xBF, 0xC5, 0x9E, 0xAB, /* 0x2C-0x2F */ + 0xBF, 0xC6, 0x9E, 0xAC, 0x9E, 0xAD, 0xBF, 0xC7, /* 0x30-0x33 */ + 0xBF, 0xC8, 0xBF, 0xC9, 0x9E, 0xAE, 0xBF, 0xCA, /* 0x34-0x37 */ + 0x9E, 0xAF, 0xBF, 0xCB, 0x9E, 0xB0, 0xBF, 0xCC, /* 0x38-0x3B */ + 0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, 0x9E, 0xB4, /* 0x3C-0x3F */ + 0xBF, 0xCD, 0xBF, 0xCE, 0x9E, 0xB5, 0x9E, 0xB6, /* 0x40-0x43 */ + 0xBF, 0xCF, 0x9E, 0xB7, 0x9E, 0xB8, 0x9E, 0xB9, /* 0x44-0x47 */ + 0xBF, 0xD0, 0x9E, 0xBA, 0x9E, 0xBB, 0x9E, 0xBC, /* 0x48-0x4B */ + 0x9E, 0xBD, 0x9E, 0xBE, 0x9E, 0xBF, 0x9E, 0xC0, /* 0x4C-0x4F */ + 0xBF, 0xD1, 0xBF, 0xD2, 0x9E, 0xC1, 0xBF, 0xD3, /* 0x50-0x53 */ + 0xBF, 0xD4, 0xBF, 0xD5, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x54-0x57 */ + 0x9E, 0xC4, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x58-0x5B */ + 0xBF, 0xD6, 0xBF, 0xD7, 0x9E, 0xC8, 0x9E, 0xC9, /* 0x5C-0x5F */ + 0xBF, 0xD8, 0x9E, 0xCA, 0x9E, 0xCB, 0x9E, 0xCC, /* 0x60-0x63 */ + 0x9E, 0xCD, 0x9E, 0xCE, 0x9E, 0xCF, 0x9E, 0xD0, /* 0x64-0x67 */ + 0x9E, 0xD1, 0x9E, 0xD2, 0x9E, 0xD3, 0x9E, 0xD4, /* 0x68-0x6B */ + 0xBF, 0xD9, 0x9E, 0xD5, 0x9E, 0xD6, 0xBF, 0xDA, /* 0x6C-0x6F */ + 0x9E, 0xD7, 0xBF, 0xDB, 0x9E, 0xD8, 0x9E, 0xD9, /* 0x70-0x73 */ + 0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, 0x9E, 0xDD, /* 0x74-0x77 */ + 0xBF, 0xDC, 0xBF, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, /* 0x78-0x7B */ + 0xBF, 0xDE, 0x9E, 0xE0, 0x9E, 0xE1, 0x9E, 0xE2, /* 0x7C-0x7F */ + + 0xBF, 0xDF, 0x9E, 0xE3, 0x9E, 0xE4, 0x9E, 0xE5, /* 0x80-0x83 */ + 0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0x9E, 0xE9, /* 0x84-0x87 */ + 0xBF, 0xE0, 0xBF, 0xE1, 0x9E, 0xEA, 0xBF, 0xE2, /* 0x88-0x8B */ + 0x9E, 0xEB, 0xBF, 0xE3, 0x9E, 0xEC, 0x9E, 0xED, /* 0x8C-0x8F */ + 0x9E, 0xEE, 0x9E, 0xEF, 0x9E, 0xF0, 0x9E, 0xF1, /* 0x90-0x93 */ + 0xBF, 0xE4, 0xBF, 0xE5, 0x9E, 0xF2, 0x9E, 0xF3, /* 0x94-0x97 */ + 0xBF, 0xE6, 0x9E, 0xF4, 0x9E, 0xF5, 0x9E, 0xF6, /* 0x98-0x9B */ + 0xBF, 0xE7, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0x9C-0x9F */ + 0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xA0-0xA3 */ + 0xBF, 0xE8, 0xBF, 0xE9, 0x9E, 0xFE, 0xBF, 0xEA, /* 0xA4-0xA7 */ + 0x9F, 0x41, 0xBF, 0xEB, 0x9F, 0x42, 0x9F, 0x43, /* 0xA8-0xAB */ + 0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, 0x9F, 0x47, /* 0xAC-0xAF */ + 0xBF, 0xEC, 0xBF, 0xED, 0x9F, 0x48, 0x9F, 0x49, /* 0xB0-0xB3 */ + 0xBF, 0xEE, 0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, /* 0xB4-0xB7 */ + 0xBF, 0xEF, 0xBF, 0xF0, 0xBF, 0xF1, 0x9F, 0x4D, /* 0xB8-0xBB */ + 0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0x9F, 0x51, /* 0xBC-0xBF */ + 0xBF, 0xF2, 0xBF, 0xF3, 0x9F, 0x52, 0xBF, 0xF4, /* 0xC0-0xC3 */ + 0x9F, 0x53, 0xBF, 0xF5, 0x9F, 0x54, 0x9F, 0x55, /* 0xC4-0xC7 */ + 0x9F, 0x56, 0x9F, 0x57, 0x9F, 0x58, 0x9F, 0x59, /* 0xC8-0xCB */ + 0xBF, 0xF6, 0xBF, 0xF7, 0x9F, 0x5A, 0x9F, 0x61, /* 0xCC-0xCF */ + 0xBF, 0xF8, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0xD0-0xD3 */ + 0xBF, 0xF9, 0x9F, 0x65, 0x9F, 0x66, 0x9F, 0x67, /* 0xD4-0xD7 */ + 0x9F, 0x68, 0x9F, 0x69, 0x9F, 0x6A, 0x9F, 0x6B, /* 0xD8-0xDB */ + 0xBF, 0xFA, 0xBF, 0xFB, 0x9F, 0x6C, 0x9F, 0x6D, /* 0xDC-0xDF */ + 0xBF, 0xFC, 0xBF, 0xFD, 0x9F, 0x6E, 0x9F, 0x6F, /* 0xE0-0xE3 */ + 0x9F, 0x70, 0x9F, 0x71, 0x9F, 0x72, 0x9F, 0x73, /* 0xE4-0xE7 */ + 0xBF, 0xFE, 0xC0, 0xA1, 0x9F, 0x74, 0x9F, 0x75, /* 0xE8-0xEB */ + 0xC0, 0xA2, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0xEC-0xEF */ + 0xC0, 0xA3, 0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x81, /* 0xF0-0xF3 */ + 0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0xF4-0xF7 */ + 0xC0, 0xA4, 0xC0, 0xA5, 0x9F, 0x86, 0x9F, 0x87, /* 0xF8-0xFB */ + 0x9F, 0x88, 0xC0, 0xA6, 0x9F, 0x89, 0x9F, 0x8A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C7[512] = { + 0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, 0x9F, 0x8E, /* 0x00-0x03 */ + 0xC0, 0xA7, 0xC0, 0xA8, 0x9F, 0x8F, 0x9F, 0x90, /* 0x04-0x07 */ + 0xC0, 0xA9, 0x9F, 0x91, 0x9F, 0x92, 0x9F, 0x93, /* 0x08-0x0B */ + 0xC0, 0xAA, 0x9F, 0x94, 0x9F, 0x95, 0x9F, 0x96, /* 0x0C-0x0F */ + 0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, 0x9F, 0x9A, /* 0x10-0x13 */ + 0xC0, 0xAB, 0xC0, 0xAC, 0x9F, 0x9B, 0xC0, 0xAD, /* 0x14-0x17 */ + 0x9F, 0x9C, 0xC0, 0xAE, 0x9F, 0x9D, 0x9F, 0x9E, /* 0x18-0x1B */ + 0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, 0x9F, 0xA2, /* 0x1C-0x1F */ + 0xC0, 0xAF, 0xC0, 0xB0, 0x9F, 0xA3, 0x9F, 0xA4, /* 0x20-0x23 */ + 0xC0, 0xB1, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x24-0x27 */ + 0xC0, 0xB2, 0x9F, 0xA8, 0x9F, 0xA9, 0x9F, 0xAA, /* 0x28-0x2B */ + 0x9F, 0xAB, 0x9F, 0xAC, 0x9F, 0xAD, 0x9F, 0xAE, /* 0x2C-0x2F */ + 0xC0, 0xB3, 0xC0, 0xB4, 0x9F, 0xAF, 0xC0, 0xB5, /* 0x30-0x33 */ + 0x9F, 0xB0, 0xC0, 0xB6, 0x9F, 0xB1, 0xC0, 0xB7, /* 0x34-0x37 */ + 0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, 0x9F, 0xB5, /* 0x38-0x3B */ + 0xC0, 0xB8, 0xC0, 0xB9, 0x9F, 0xB6, 0x9F, 0xB7, /* 0x3C-0x3F */ + 0xC0, 0xBA, 0x9F, 0xB8, 0x9F, 0xB9, 0x9F, 0xBA, /* 0x40-0x43 */ + 0xC0, 0xBB, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x44-0x47 */ + 0x9F, 0xBE, 0x9F, 0xBF, 0xC0, 0xBC, 0x9F, 0xC0, /* 0x48-0x4B */ + 0xC0, 0xBD, 0xC0, 0xBE, 0x9F, 0xC1, 0xC0, 0xBF, /* 0x4C-0x4F */ + 0x9F, 0xC2, 0xC0, 0xC0, 0xC0, 0xC1, 0xC0, 0xC2, /* 0x50-0x53 */ + 0xC0, 0xC3, 0xC0, 0xC4, 0xC0, 0xC5, 0xC0, 0xC6, /* 0x54-0x57 */ + 0xC0, 0xC7, 0x9F, 0xC3, 0x9F, 0xC4, 0x9F, 0xC5, /* 0x58-0x5B */ + 0xC0, 0xC8, 0x9F, 0xC6, 0x9F, 0xC7, 0x9F, 0xC8, /* 0x5C-0x5F */ + 0xC0, 0xC9, 0x9F, 0xC9, 0x9F, 0xCA, 0x9F, 0xCB, /* 0x60-0x63 */ + 0x9F, 0xCC, 0x9F, 0xCD, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x64-0x67 */ + 0xC0, 0xCA, 0x9F, 0xD0, 0x9F, 0xD1, 0xC0, 0xCB, /* 0x68-0x6B */ + 0x9F, 0xD2, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0x6C-0x6F */ + 0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0x70-0x73 */ + 0xC0, 0xCC, 0xC0, 0xCD, 0x9F, 0xDA, 0x9F, 0xDB, /* 0x74-0x77 */ + 0xC0, 0xCE, 0x9F, 0xDC, 0x9F, 0xDD, 0x9F, 0xDE, /* 0x78-0x7B */ + 0xC0, 0xCF, 0xC0, 0xD0, 0xC0, 0xD1, 0x9F, 0xDF, /* 0x7C-0x7F */ + + 0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xC0, 0xD2, /* 0x80-0x83 */ + 0xC0, 0xD3, 0xC0, 0xD4, 0x9F, 0xE3, 0xC0, 0xD5, /* 0x84-0x87 */ + 0xC0, 0xD6, 0xC0, 0xD7, 0xC0, 0xD8, 0x9F, 0xE4, /* 0x88-0x8B */ + 0x9F, 0xE5, 0x9F, 0xE6, 0xC0, 0xD9, 0x9F, 0xE7, /* 0x8C-0x8F */ + 0xC0, 0xDA, 0xC0, 0xDB, 0x9F, 0xE8, 0x9F, 0xE9, /* 0x90-0x93 */ + 0xC0, 0xDC, 0x9F, 0xEA, 0xC0, 0xDD, 0xC0, 0xDE, /* 0x94-0x97 */ + 0xC0, 0xDF, 0x9F, 0xEB, 0xC0, 0xE0, 0x9F, 0xEC, /* 0x98-0x9B */ + 0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0x9F, 0xF0, /* 0x9C-0x9F */ + 0xC0, 0xE1, 0xC0, 0xE2, 0x9F, 0xF1, 0xC0, 0xE3, /* 0xA0-0xA3 */ + 0xC0, 0xE4, 0xC0, 0xE5, 0xC0, 0xE6, 0x9F, 0xF2, /* 0xA4-0xA7 */ + 0x9F, 0xF3, 0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, /* 0xA8-0xAB */ + 0xC0, 0xE7, 0xC0, 0xE8, 0x9F, 0xF7, 0x9F, 0xF8, /* 0xAC-0xAF */ + 0xC0, 0xE9, 0x9F, 0xF9, 0x9F, 0xFA, 0x9F, 0xFB, /* 0xB0-0xB3 */ + 0xC0, 0xEA, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xB4-0xB7 */ + 0xA0, 0x41, 0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, /* 0xB8-0xBB */ + 0xC0, 0xEB, 0xC0, 0xEC, 0xA0, 0x45, 0xC0, 0xED, /* 0xBC-0xBF */ + 0xC0, 0xEE, 0xC0, 0xEF, 0xA0, 0x46, 0xA0, 0x47, /* 0xC0-0xC3 */ + 0xA0, 0x48, 0xA0, 0x49, 0xA0, 0x4A, 0xA0, 0x4B, /* 0xC4-0xC7 */ + 0xC0, 0xF0, 0xC0, 0xF1, 0xA0, 0x4C, 0xA0, 0x4D, /* 0xC8-0xCB */ + 0xC0, 0xF2, 0xA0, 0x4E, 0xC0, 0xF3, 0xA0, 0x4F, /* 0xCC-0xCF */ + 0xC0, 0xF4, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xD0-0xD3 */ + 0xA0, 0x53, 0xA0, 0x54, 0xA0, 0x55, 0xA0, 0x56, /* 0xD4-0xD7 */ + 0xC0, 0xF5, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xD8-0xDB */ + 0xA0, 0x5A, 0xC0, 0xF6, 0xA0, 0x61, 0xA0, 0x62, /* 0xDC-0xDF */ + 0xA0, 0x63, 0xA0, 0x64, 0xA0, 0x65, 0xA0, 0x66, /* 0xE0-0xE3 */ + 0xC0, 0xF7, 0xA0, 0x67, 0xA0, 0x68, 0xA0, 0x69, /* 0xE4-0xE7 */ + 0xC0, 0xF8, 0xA0, 0x6A, 0xA0, 0x6B, 0xA0, 0x6C, /* 0xE8-0xEB */ + 0xC0, 0xF9, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0xEC-0xEF */ + 0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0xF0-0xF3 */ + 0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0xF4-0xF7 */ + 0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x81, /* 0xF8-0xFB */ + 0xA0, 0x82, 0xA0, 0x83, 0xA0, 0x84, 0xA0, 0x85, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C8[512] = { + 0xC0, 0xFA, 0xC0, 0xFB, 0xA0, 0x86, 0xA0, 0x87, /* 0x00-0x03 */ + 0xC0, 0xFC, 0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, /* 0x04-0x07 */ + 0xC0, 0xFD, 0xA0, 0x8B, 0xC0, 0xFE, 0xA0, 0x8C, /* 0x08-0x0B */ + 0xA0, 0x8D, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x0C-0x0F */ + 0xC1, 0xA1, 0xC1, 0xA2, 0xA0, 0x91, 0xC1, 0xA3, /* 0x10-0x13 */ + 0xA0, 0x92, 0xC1, 0xA4, 0xC1, 0xA5, 0xA0, 0x93, /* 0x14-0x17 */ + 0xA0, 0x94, 0xA0, 0x95, 0xA0, 0x96, 0xA0, 0x97, /* 0x18-0x1B */ + 0xC1, 0xA6, 0xC1, 0xA7, 0xA0, 0x98, 0xA0, 0x99, /* 0x1C-0x1F */ + 0xC1, 0xA8, 0xA0, 0x9A, 0xA0, 0x9B, 0xA0, 0x9C, /* 0x20-0x23 */ + 0xC1, 0xA9, 0xA0, 0x9D, 0xA0, 0x9E, 0xA0, 0x9F, /* 0x24-0x27 */ + 0xA0, 0xA0, 0xA0, 0xA1, 0xA0, 0xA2, 0xA0, 0xA3, /* 0x28-0x2B */ + 0xC1, 0xAA, 0xC1, 0xAB, 0xA0, 0xA4, 0xC1, 0xAC, /* 0x2C-0x2F */ + 0xA0, 0xA5, 0xC1, 0xAD, 0xA0, 0xA6, 0xA0, 0xA7, /* 0x30-0x33 */ + 0xA0, 0xA8, 0xA0, 0xA9, 0xA0, 0xAA, 0xA0, 0xAB, /* 0x34-0x37 */ + 0xC1, 0xAE, 0xA0, 0xAC, 0xA0, 0xAD, 0xA0, 0xAE, /* 0x38-0x3B */ + 0xC1, 0xAF, 0xA0, 0xAF, 0xA0, 0xB0, 0xA0, 0xB1, /* 0x3C-0x3F */ + 0xC1, 0xB0, 0xA0, 0xB2, 0xA0, 0xB3, 0xA0, 0xB4, /* 0x40-0x43 */ + 0xA0, 0xB5, 0xA0, 0xB6, 0xA0, 0xB7, 0xA0, 0xB8, /* 0x44-0x47 */ + 0xC1, 0xB1, 0xC1, 0xB2, 0xA0, 0xB9, 0xA0, 0xBA, /* 0x48-0x4B */ + 0xC1, 0xB3, 0xC1, 0xB4, 0xA0, 0xBB, 0xA0, 0xBC, /* 0x4C-0x4F */ + 0xA0, 0xBD, 0xA0, 0xBE, 0xA0, 0xBF, 0xA0, 0xC0, /* 0x50-0x53 */ + 0xC1, 0xB5, 0xA0, 0xC1, 0xA0, 0xC2, 0xA0, 0xC3, /* 0x54-0x57 */ + 0xA0, 0xC4, 0xA0, 0xC5, 0xA0, 0xC6, 0xA0, 0xC7, /* 0x58-0x5B */ + 0xA0, 0xC8, 0xA0, 0xC9, 0xA0, 0xCA, 0xA0, 0xCB, /* 0x5C-0x5F */ + 0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x60-0x63 */ + 0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x64-0x67 */ + 0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xA0, 0xD7, /* 0x68-0x6B */ + 0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, 0xA0, 0xDB, /* 0x6C-0x6F */ + 0xC1, 0xB6, 0xC1, 0xB7, 0xA0, 0xDC, 0xA0, 0xDD, /* 0x70-0x73 */ + 0xC1, 0xB8, 0xA0, 0xDE, 0xA0, 0xDF, 0xA0, 0xE0, /* 0x74-0x77 */ + 0xC1, 0xB9, 0xA0, 0xE1, 0xC1, 0xBA, 0xA0, 0xE2, /* 0x78-0x7B */ + 0xA0, 0xE3, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0x7C-0x7F */ + + 0xC1, 0xBB, 0xC1, 0xBC, 0xA0, 0xE7, 0xC1, 0xBD, /* 0x80-0x83 */ + 0xA0, 0xE8, 0xC1, 0xBE, 0xC1, 0xBF, 0xC1, 0xC0, /* 0x84-0x87 */ + 0xA0, 0xE9, 0xA0, 0xEA, 0xA0, 0xEB, 0xC1, 0xC1, /* 0x88-0x8B */ + 0xC1, 0xC2, 0xC1, 0xC3, 0xA0, 0xEC, 0xA0, 0xED, /* 0x8C-0x8F */ + 0xA0, 0xEE, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0x90-0x93 */ + 0xC1, 0xC4, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0x94-0x97 */ + 0xA0, 0xF5, 0xA0, 0xF6, 0xA0, 0xF7, 0xA0, 0xF8, /* 0x98-0x9B */ + 0xA0, 0xF9, 0xC1, 0xC5, 0xA0, 0xFA, 0xC1, 0xC6, /* 0x9C-0x9F */ + 0xA0, 0xFB, 0xC1, 0xC7, 0xA0, 0xFC, 0xA0, 0xFD, /* 0xA0-0xA3 */ + 0xA0, 0xFE, 0xA1, 0x41, 0xA1, 0x42, 0xA1, 0x43, /* 0xA4-0xA7 */ + 0xC1, 0xC8, 0xA1, 0x44, 0xA1, 0x45, 0xA1, 0x46, /* 0xA8-0xAB */ + 0xA1, 0x47, 0xA1, 0x48, 0xA1, 0x49, 0xA1, 0x4A, /* 0xAC-0xAF */ + 0xA1, 0x4B, 0xA1, 0x4C, 0xA1, 0x4D, 0xA1, 0x4E, /* 0xB0-0xB3 */ + 0xA1, 0x4F, 0xA1, 0x50, 0xA1, 0x51, 0xA1, 0x52, /* 0xB4-0xB7 */ + 0xA1, 0x53, 0xA1, 0x54, 0xA1, 0x55, 0xA1, 0x56, /* 0xB8-0xBB */ + 0xC1, 0xC9, 0xC1, 0xCA, 0xA1, 0x57, 0xA1, 0x58, /* 0xBC-0xBF */ + 0xA1, 0x59, 0xA1, 0x5A, 0xA1, 0x61, 0xA1, 0x62, /* 0xC0-0xC3 */ + 0xC1, 0xCB, 0xA1, 0x63, 0xA1, 0x64, 0xA1, 0x65, /* 0xC4-0xC7 */ + 0xC1, 0xCC, 0xA1, 0x66, 0xA1, 0x67, 0xA1, 0x68, /* 0xC8-0xCB */ + 0xC1, 0xCD, 0xA1, 0x69, 0xA1, 0x6A, 0xA1, 0x6B, /* 0xCC-0xCF */ + 0xA1, 0x6C, 0xA1, 0x6D, 0xA1, 0x6E, 0xA1, 0x6F, /* 0xD0-0xD3 */ + 0xC1, 0xCE, 0xC1, 0xCF, 0xA1, 0x70, 0xC1, 0xD0, /* 0xD4-0xD7 */ + 0xA1, 0x71, 0xC1, 0xD1, 0xA1, 0x72, 0xA1, 0x73, /* 0xD8-0xDB */ + 0xA1, 0x74, 0xA1, 0x75, 0xA1, 0x76, 0xA1, 0x77, /* 0xDC-0xDF */ + 0xC1, 0xD2, 0xC1, 0xD3, 0xA1, 0x78, 0xA1, 0x79, /* 0xE0-0xE3 */ + 0xC1, 0xD4, 0xA1, 0x7A, 0xA1, 0x81, 0xA1, 0x82, /* 0xE4-0xE7 */ + 0xA1, 0x83, 0xA1, 0x84, 0xA1, 0x85, 0xA1, 0x86, /* 0xE8-0xEB */ + 0xA1, 0x87, 0xA1, 0x88, 0xA1, 0x89, 0xA1, 0x8A, /* 0xEC-0xEF */ + 0xA1, 0x8B, 0xA1, 0x8C, 0xA1, 0x8D, 0xA1, 0x8E, /* 0xF0-0xF3 */ + 0xA1, 0x8F, 0xC1, 0xD5, 0xA1, 0x90, 0xA1, 0x91, /* 0xF4-0xF7 */ + 0xA1, 0x92, 0xA1, 0x93, 0xA1, 0x94, 0xA1, 0x95, /* 0xF8-0xFB */ + 0xC1, 0xD6, 0xC1, 0xD7, 0xA1, 0x96, 0xA1, 0x97, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C9[512] = { + 0xC1, 0xD8, 0xA1, 0x98, 0xA1, 0x99, 0xA1, 0x9A, /* 0x00-0x03 */ + 0xC1, 0xD9, 0xC1, 0xDA, 0xC1, 0xDB, 0xA1, 0x9B, /* 0x04-0x07 */ + 0xA1, 0x9C, 0xA1, 0x9D, 0xA1, 0x9E, 0xA1, 0x9F, /* 0x08-0x0B */ + 0xC1, 0xDC, 0xC1, 0xDD, 0xA1, 0xA0, 0xC1, 0xDE, /* 0x0C-0x0F */ + 0xA2, 0x41, 0xC1, 0xDF, 0xA2, 0x42, 0xA2, 0x43, /* 0x10-0x13 */ + 0xA2, 0x44, 0xA2, 0x45, 0xA2, 0x46, 0xA2, 0x47, /* 0x14-0x17 */ + 0xC1, 0xE0, 0xA2, 0x48, 0xA2, 0x49, 0xA2, 0x4A, /* 0x18-0x1B */ + 0xA2, 0x4B, 0xA2, 0x4C, 0xA2, 0x4D, 0xA2, 0x4E, /* 0x1C-0x1F */ + 0xA2, 0x4F, 0xA2, 0x50, 0xA2, 0x51, 0xA2, 0x52, /* 0x20-0x23 */ + 0xA2, 0x53, 0xA2, 0x54, 0xA2, 0x55, 0xA2, 0x56, /* 0x24-0x27 */ + 0xA2, 0x57, 0xA2, 0x58, 0xA2, 0x59, 0xA2, 0x5A, /* 0x28-0x2B */ + 0xC1, 0xE1, 0xA2, 0x61, 0xA2, 0x62, 0xA2, 0x63, /* 0x2C-0x2F */ + 0xA2, 0x64, 0xA2, 0x65, 0xA2, 0x66, 0xA2, 0x67, /* 0x30-0x33 */ + 0xC1, 0xE2, 0xA2, 0x68, 0xA2, 0x69, 0xA2, 0x6A, /* 0x34-0x37 */ + 0xA2, 0x6B, 0xA2, 0x6C, 0xA2, 0x6D, 0xA2, 0x6E, /* 0x38-0x3B */ + 0xA2, 0x6F, 0xA2, 0x70, 0xA2, 0x71, 0xA2, 0x72, /* 0x3C-0x3F */ + 0xA2, 0x73, 0xA2, 0x74, 0xA2, 0x75, 0xA2, 0x76, /* 0x40-0x43 */ + 0xA2, 0x77, 0xA2, 0x78, 0xA2, 0x79, 0xA2, 0x7A, /* 0x44-0x47 */ + 0xA2, 0x81, 0xA2, 0x82, 0xA2, 0x83, 0xA2, 0x84, /* 0x48-0x4B */ + 0xA2, 0x85, 0xA2, 0x86, 0xA2, 0x87, 0xA2, 0x88, /* 0x4C-0x4F */ + 0xC1, 0xE3, 0xC1, 0xE4, 0xA2, 0x89, 0xA2, 0x8A, /* 0x50-0x53 */ + 0xC1, 0xE5, 0xA2, 0x8B, 0xA2, 0x8C, 0xA2, 0x8D, /* 0x54-0x57 */ + 0xC1, 0xE6, 0xA2, 0x8E, 0xA2, 0x8F, 0xA2, 0x90, /* 0x58-0x5B */ + 0xA2, 0x91, 0xA2, 0x92, 0xA2, 0x93, 0xA2, 0x94, /* 0x5C-0x5F */ + 0xC1, 0xE7, 0xC1, 0xE8, 0xA2, 0x95, 0xC1, 0xE9, /* 0x60-0x63 */ + 0xA2, 0x96, 0xA2, 0x97, 0xA2, 0x98, 0xA2, 0x99, /* 0x64-0x67 */ + 0xA2, 0x9A, 0xA2, 0x9B, 0xA2, 0x9C, 0xA2, 0x9D, /* 0x68-0x6B */ + 0xC1, 0xEA, 0xA2, 0x9E, 0xA2, 0x9F, 0xA2, 0xA0, /* 0x6C-0x6F */ + 0xC1, 0xEB, 0xA3, 0x41, 0xA3, 0x42, 0xA3, 0x43, /* 0x70-0x73 */ + 0xC1, 0xEC, 0xA3, 0x44, 0xA3, 0x45, 0xA3, 0x46, /* 0x74-0x77 */ + 0xA3, 0x47, 0xA3, 0x48, 0xA3, 0x49, 0xA3, 0x4A, /* 0x78-0x7B */ + 0xC1, 0xED, 0xA3, 0x4B, 0xA3, 0x4C, 0xA3, 0x4D, /* 0x7C-0x7F */ + + 0xA3, 0x4E, 0xA3, 0x4F, 0xA3, 0x50, 0xA3, 0x51, /* 0x80-0x83 */ + 0xA3, 0x52, 0xA3, 0x53, 0xA3, 0x54, 0xA3, 0x55, /* 0x84-0x87 */ + 0xC1, 0xEE, 0xC1, 0xEF, 0xA3, 0x56, 0xA3, 0x57, /* 0x88-0x8B */ + 0xC1, 0xF0, 0xA3, 0x58, 0xA3, 0x59, 0xA3, 0x5A, /* 0x8C-0x8F */ + 0xC1, 0xF1, 0xA3, 0x61, 0xA3, 0x62, 0xA3, 0x63, /* 0x90-0x93 */ + 0xA3, 0x64, 0xA3, 0x65, 0xA3, 0x66, 0xA3, 0x67, /* 0x94-0x97 */ + 0xC1, 0xF2, 0xC1, 0xF3, 0xA3, 0x68, 0xC1, 0xF4, /* 0x98-0x9B */ + 0xA3, 0x69, 0xC1, 0xF5, 0xA3, 0x6A, 0xA3, 0x6B, /* 0x9C-0x9F */ + 0xA3, 0x6C, 0xA3, 0x6D, 0xA3, 0x6E, 0xA3, 0x6F, /* 0xA0-0xA3 */ + 0xA3, 0x70, 0xA3, 0x71, 0xA3, 0x72, 0xA3, 0x73, /* 0xA4-0xA7 */ + 0xA3, 0x74, 0xA3, 0x75, 0xA3, 0x76, 0xA3, 0x77, /* 0xA8-0xAB */ + 0xA3, 0x78, 0xA3, 0x79, 0xA3, 0x7A, 0xA3, 0x81, /* 0xAC-0xAF */ + 0xA3, 0x82, 0xA3, 0x83, 0xA3, 0x84, 0xA3, 0x85, /* 0xB0-0xB3 */ + 0xA3, 0x86, 0xA3, 0x87, 0xA3, 0x88, 0xA3, 0x89, /* 0xB4-0xB7 */ + 0xA3, 0x8A, 0xA3, 0x8B, 0xA3, 0x8C, 0xA3, 0x8D, /* 0xB8-0xBB */ + 0xA3, 0x8E, 0xA3, 0x8F, 0xA3, 0x90, 0xA3, 0x91, /* 0xBC-0xBF */ + 0xC1, 0xF6, 0xC1, 0xF7, 0xA3, 0x92, 0xA3, 0x93, /* 0xC0-0xC3 */ + 0xC1, 0xF8, 0xA3, 0x94, 0xA3, 0x95, 0xC1, 0xF9, /* 0xC4-0xC7 */ + 0xC1, 0xFA, 0xA3, 0x96, 0xC1, 0xFB, 0xA3, 0x97, /* 0xC8-0xCB */ + 0xA3, 0x98, 0xA3, 0x99, 0xA3, 0x9A, 0xA3, 0x9B, /* 0xCC-0xCF */ + 0xC1, 0xFC, 0xC1, 0xFD, 0xA3, 0x9C, 0xC1, 0xFE, /* 0xD0-0xD3 */ + 0xA3, 0x9D, 0xC2, 0xA1, 0xC2, 0xA2, 0xA3, 0x9E, /* 0xD4-0xD7 */ + 0xA3, 0x9F, 0xC2, 0xA3, 0xC2, 0xA4, 0xA3, 0xA0, /* 0xD8-0xDB */ + 0xC2, 0xA5, 0xC2, 0xA6, 0xA4, 0x41, 0xA4, 0x42, /* 0xDC-0xDF */ + 0xC2, 0xA7, 0xA4, 0x43, 0xC2, 0xA8, 0xA4, 0x44, /* 0xE0-0xE3 */ + 0xC2, 0xA9, 0xA4, 0x45, 0xA4, 0x46, 0xC2, 0xAA, /* 0xE4-0xE7 */ + 0xA4, 0x47, 0xA4, 0x48, 0xA4, 0x49, 0xA4, 0x4A, /* 0xE8-0xEB */ + 0xC2, 0xAB, 0xC2, 0xAC, 0xA4, 0x4B, 0xC2, 0xAD, /* 0xEC-0xEF */ + 0xC2, 0xAE, 0xC2, 0xAF, 0xA4, 0x4C, 0xA4, 0x4D, /* 0xF0-0xF3 */ + 0xA4, 0x4E, 0xA4, 0x4F, 0xA4, 0x50, 0xA4, 0x51, /* 0xF4-0xF7 */ + 0xC2, 0xB0, 0xC2, 0xB1, 0xA4, 0x52, 0xA4, 0x53, /* 0xF8-0xFB */ + 0xC2, 0xB2, 0xA4, 0x54, 0xA4, 0x55, 0xA4, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CA[512] = { + 0xC2, 0xB3, 0xA4, 0x57, 0xA4, 0x58, 0xA4, 0x59, /* 0x00-0x03 */ + 0xA4, 0x5A, 0xA4, 0x61, 0xA4, 0x62, 0xA4, 0x63, /* 0x04-0x07 */ + 0xC2, 0xB4, 0xC2, 0xB5, 0xA4, 0x64, 0xC2, 0xB6, /* 0x08-0x0B */ + 0xC2, 0xB7, 0xC2, 0xB8, 0xA4, 0x65, 0xA4, 0x66, /* 0x0C-0x0F */ + 0xA4, 0x67, 0xA4, 0x68, 0xA4, 0x69, 0xA4, 0x6A, /* 0x10-0x13 */ + 0xC2, 0xB9, 0xA4, 0x6B, 0xA4, 0x6C, 0xA4, 0x6D, /* 0x14-0x17 */ + 0xC2, 0xBA, 0xA4, 0x6E, 0xA4, 0x6F, 0xA4, 0x70, /* 0x18-0x1B */ + 0xA4, 0x71, 0xA4, 0x72, 0xA4, 0x73, 0xA4, 0x74, /* 0x1C-0x1F */ + 0xA4, 0x75, 0xA4, 0x76, 0xA4, 0x77, 0xA4, 0x78, /* 0x20-0x23 */ + 0xA4, 0x79, 0xA4, 0x7A, 0xA4, 0x81, 0xA4, 0x82, /* 0x24-0x27 */ + 0xA4, 0x83, 0xC2, 0xBB, 0xA4, 0x84, 0xA4, 0x85, /* 0x28-0x2B */ + 0xA4, 0x86, 0xA4, 0x87, 0xA4, 0x88, 0xA4, 0x89, /* 0x2C-0x2F */ + 0xA4, 0x8A, 0xA4, 0x8B, 0xA4, 0x8C, 0xA4, 0x8D, /* 0x30-0x33 */ + 0xA4, 0x8E, 0xA4, 0x8F, 0xA4, 0x90, 0xA4, 0x91, /* 0x34-0x37 */ + 0xA4, 0x92, 0xA4, 0x93, 0xA4, 0x94, 0xA4, 0x95, /* 0x38-0x3B */ + 0xA4, 0x96, 0xA4, 0x97, 0xA4, 0x98, 0xA4, 0x99, /* 0x3C-0x3F */ + 0xA4, 0x9A, 0xA4, 0x9B, 0xA4, 0x9C, 0xA4, 0x9D, /* 0x40-0x43 */ + 0xA4, 0x9E, 0xA4, 0x9F, 0xA4, 0xA0, 0xA5, 0x41, /* 0x44-0x47 */ + 0xA5, 0x42, 0xA5, 0x43, 0xA5, 0x44, 0xA5, 0x45, /* 0x48-0x4B */ + 0xC2, 0xBC, 0xC2, 0xBD, 0xA5, 0x46, 0xA5, 0x47, /* 0x4C-0x4F */ + 0xC2, 0xBE, 0xA5, 0x48, 0xA5, 0x49, 0xA5, 0x4A, /* 0x50-0x53 */ + 0xC2, 0xBF, 0xA5, 0x4B, 0xA5, 0x4C, 0xA5, 0x4D, /* 0x54-0x57 */ + 0xA5, 0x4E, 0xA5, 0x4F, 0xA5, 0x50, 0xA5, 0x51, /* 0x58-0x5B */ + 0xC2, 0xC0, 0xC2, 0xC1, 0xA5, 0x52, 0xC2, 0xC2, /* 0x5C-0x5F */ + 0xC2, 0xC3, 0xC2, 0xC4, 0xA5, 0x53, 0xA5, 0x54, /* 0x60-0x63 */ + 0xA5, 0x55, 0xA5, 0x56, 0xA5, 0x57, 0xA5, 0x58, /* 0x64-0x67 */ + 0xC2, 0xC5, 0xA5, 0x59, 0xA5, 0x5A, 0xA5, 0x61, /* 0x68-0x6B */ + 0xA5, 0x62, 0xA5, 0x63, 0xA5, 0x64, 0xA5, 0x65, /* 0x6C-0x6F */ + 0xA5, 0x66, 0xA5, 0x67, 0xA5, 0x68, 0xA5, 0x69, /* 0x70-0x73 */ + 0xA5, 0x6A, 0xA5, 0x6B, 0xA5, 0x6C, 0xA5, 0x6D, /* 0x74-0x77 */ + 0xA5, 0x6E, 0xA5, 0x6F, 0xA5, 0x70, 0xA5, 0x71, /* 0x78-0x7B */ + 0xA5, 0x72, 0xC2, 0xC6, 0xA5, 0x73, 0xA5, 0x74, /* 0x7C-0x7F */ + + 0xA5, 0x75, 0xA5, 0x76, 0xA5, 0x77, 0xA5, 0x78, /* 0x80-0x83 */ + 0xC2, 0xC7, 0xA5, 0x79, 0xA5, 0x7A, 0xA5, 0x81, /* 0x84-0x87 */ + 0xA5, 0x82, 0xA5, 0x83, 0xA5, 0x84, 0xA5, 0x85, /* 0x88-0x8B */ + 0xA5, 0x86, 0xA5, 0x87, 0xA5, 0x88, 0xA5, 0x89, /* 0x8C-0x8F */ + 0xA5, 0x8A, 0xA5, 0x8B, 0xA5, 0x8C, 0xA5, 0x8D, /* 0x90-0x93 */ + 0xA5, 0x8E, 0xA5, 0x8F, 0xA5, 0x90, 0xA5, 0x91, /* 0x94-0x97 */ + 0xC2, 0xC8, 0xA5, 0x92, 0xA5, 0x93, 0xA5, 0x94, /* 0x98-0x9B */ + 0xA5, 0x95, 0xA5, 0x96, 0xA5, 0x97, 0xA5, 0x98, /* 0x9C-0x9F */ + 0xA5, 0x99, 0xA5, 0x9A, 0xA5, 0x9B, 0xA5, 0x9C, /* 0xA0-0xA3 */ + 0xA5, 0x9D, 0xA5, 0x9E, 0xA5, 0x9F, 0xA5, 0xA0, /* 0xA4-0xA7 */ + 0xA6, 0x41, 0xA6, 0x42, 0xA6, 0x43, 0xA6, 0x44, /* 0xA8-0xAB */ + 0xA6, 0x45, 0xA6, 0x46, 0xA6, 0x47, 0xA6, 0x48, /* 0xAC-0xAF */ + 0xA6, 0x49, 0xA6, 0x4A, 0xA6, 0x4B, 0xA6, 0x4C, /* 0xB0-0xB3 */ + 0xA6, 0x4D, 0xA6, 0x4E, 0xA6, 0x4F, 0xA6, 0x50, /* 0xB4-0xB7 */ + 0xA6, 0x51, 0xA6, 0x52, 0xA6, 0x53, 0xA6, 0x54, /* 0xB8-0xBB */ + 0xC2, 0xC9, 0xC2, 0xCA, 0xA6, 0x55, 0xA6, 0x56, /* 0xBC-0xBF */ + 0xC2, 0xCB, 0xA6, 0x57, 0xA6, 0x58, 0xA6, 0x59, /* 0xC0-0xC3 */ + 0xC2, 0xCC, 0xA6, 0x5A, 0xA6, 0x61, 0xA6, 0x62, /* 0xC4-0xC7 */ + 0xA6, 0x63, 0xA6, 0x64, 0xA6, 0x65, 0xA6, 0x66, /* 0xC8-0xCB */ + 0xC2, 0xCD, 0xC2, 0xCE, 0xA6, 0x67, 0xC2, 0xCF, /* 0xCC-0xCF */ + 0xA6, 0x68, 0xC2, 0xD0, 0xA6, 0x69, 0xC2, 0xD1, /* 0xD0-0xD3 */ + 0xA6, 0x6A, 0xA6, 0x6B, 0xA6, 0x6C, 0xA6, 0x6D, /* 0xD4-0xD7 */ + 0xC2, 0xD2, 0xC2, 0xD3, 0xA6, 0x6E, 0xA6, 0x6F, /* 0xD8-0xDB */ + 0xA6, 0x70, 0xA6, 0x71, 0xA6, 0x72, 0xA6, 0x73, /* 0xDC-0xDF */ + 0xC2, 0xD4, 0xA6, 0x74, 0xA6, 0x75, 0xA6, 0x76, /* 0xE0-0xE3 */ + 0xA6, 0x77, 0xA6, 0x78, 0xA6, 0x79, 0xA6, 0x7A, /* 0xE4-0xE7 */ + 0xA6, 0x81, 0xA6, 0x82, 0xA6, 0x83, 0xA6, 0x84, /* 0xE8-0xEB */ + 0xC2, 0xD5, 0xA6, 0x85, 0xA6, 0x86, 0xA6, 0x87, /* 0xEC-0xEF */ + 0xA6, 0x88, 0xA6, 0x89, 0xA6, 0x8A, 0xA6, 0x8B, /* 0xF0-0xF3 */ + 0xC2, 0xD6, 0xA6, 0x8C, 0xA6, 0x8D, 0xA6, 0x8E, /* 0xF4-0xF7 */ + 0xA6, 0x8F, 0xA6, 0x90, 0xA6, 0x91, 0xA6, 0x92, /* 0xF8-0xFB */ + 0xA6, 0x93, 0xA6, 0x94, 0xA6, 0x95, 0xA6, 0x96, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CB[512] = { + 0xA6, 0x97, 0xA6, 0x98, 0xA6, 0x99, 0xA6, 0x9A, /* 0x00-0x03 */ + 0xA6, 0x9B, 0xA6, 0x9C, 0xA6, 0x9D, 0xA6, 0x9E, /* 0x04-0x07 */ + 0xC2, 0xD7, 0xA6, 0x9F, 0xA6, 0xA0, 0xA7, 0x41, /* 0x08-0x0B */ + 0xA7, 0x42, 0xA7, 0x43, 0xA7, 0x44, 0xA7, 0x45, /* 0x0C-0x0F */ + 0xC2, 0xD8, 0xA7, 0x46, 0xA7, 0x47, 0xA7, 0x48, /* 0x10-0x13 */ + 0xC2, 0xD9, 0xA7, 0x49, 0xA7, 0x4A, 0xA7, 0x4B, /* 0x14-0x17 */ + 0xC2, 0xDA, 0xA7, 0x4C, 0xA7, 0x4D, 0xA7, 0x4E, /* 0x18-0x1B */ + 0xA7, 0x4F, 0xA7, 0x50, 0xA7, 0x51, 0xA7, 0x52, /* 0x1C-0x1F */ + 0xC2, 0xDB, 0xC2, 0xDC, 0xA7, 0x53, 0xA7, 0x54, /* 0x20-0x23 */ + 0xA7, 0x55, 0xA7, 0x56, 0xA7, 0x57, 0xA7, 0x58, /* 0x24-0x27 */ + 0xA7, 0x59, 0xA7, 0x5A, 0xA7, 0x61, 0xA7, 0x62, /* 0x28-0x2B */ + 0xA7, 0x63, 0xA7, 0x64, 0xA7, 0x65, 0xA7, 0x66, /* 0x2C-0x2F */ + 0xA7, 0x67, 0xA7, 0x68, 0xA7, 0x69, 0xA7, 0x6A, /* 0x30-0x33 */ + 0xA7, 0x6B, 0xA7, 0x6C, 0xA7, 0x6D, 0xA7, 0x6E, /* 0x34-0x37 */ + 0xA7, 0x6F, 0xA7, 0x70, 0xA7, 0x71, 0xA7, 0x72, /* 0x38-0x3B */ + 0xA7, 0x73, 0xA7, 0x74, 0xA7, 0x75, 0xA7, 0x76, /* 0x3C-0x3F */ + 0xA7, 0x77, 0xC2, 0xDD, 0xA7, 0x78, 0xA7, 0x79, /* 0x40-0x43 */ + 0xA7, 0x7A, 0xA7, 0x81, 0xA7, 0x82, 0xA7, 0x83, /* 0x44-0x47 */ + 0xC2, 0xDE, 0xC2, 0xDF, 0xA7, 0x84, 0xA7, 0x85, /* 0x48-0x4B */ + 0xC2, 0xE0, 0xA7, 0x86, 0xA7, 0x87, 0xA7, 0x88, /* 0x4C-0x4F */ + 0xC2, 0xE1, 0xA7, 0x89, 0xA7, 0x8A, 0xA7, 0x8B, /* 0x50-0x53 */ + 0xA7, 0x8C, 0xA7, 0x8D, 0xA7, 0x8E, 0xA7, 0x8F, /* 0x54-0x57 */ + 0xC2, 0xE2, 0xC2, 0xE3, 0xA7, 0x90, 0xA7, 0x91, /* 0x58-0x5B */ + 0xA7, 0x92, 0xC2, 0xE4, 0xA7, 0x93, 0xA7, 0x94, /* 0x5C-0x5F */ + 0xA7, 0x95, 0xA7, 0x96, 0xA7, 0x97, 0xA7, 0x98, /* 0x60-0x63 */ + 0xC2, 0xE5, 0xA7, 0x99, 0xA7, 0x9A, 0xA7, 0x9B, /* 0x64-0x67 */ + 0xA7, 0x9C, 0xA7, 0x9D, 0xA7, 0x9E, 0xA7, 0x9F, /* 0x68-0x6B */ + 0xA7, 0xA0, 0xA8, 0x41, 0xA8, 0x42, 0xA8, 0x43, /* 0x6C-0x6F */ + 0xA8, 0x44, 0xA8, 0x45, 0xA8, 0x46, 0xA8, 0x47, /* 0x70-0x73 */ + 0xA8, 0x48, 0xA8, 0x49, 0xA8, 0x4A, 0xA8, 0x4B, /* 0x74-0x77 */ + 0xC2, 0xE6, 0xC2, 0xE7, 0xA8, 0x4C, 0xA8, 0x4D, /* 0x78-0x7B */ + 0xA8, 0x4E, 0xA8, 0x4F, 0xA8, 0x50, 0xA8, 0x51, /* 0x7C-0x7F */ + + 0xA8, 0x52, 0xA8, 0x53, 0xA8, 0x54, 0xA8, 0x55, /* 0x80-0x83 */ + 0xA8, 0x56, 0xA8, 0x57, 0xA8, 0x58, 0xA8, 0x59, /* 0x84-0x87 */ + 0xA8, 0x5A, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x88-0x8B */ + 0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x8C-0x8F */ + 0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x90-0x93 */ + 0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x94-0x97 */ + 0xA8, 0x70, 0xA8, 0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x98-0x9B */ + 0xC2, 0xE8, 0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, /* 0x9C-0x9F */ + 0xA8, 0x77, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0xA0-0xA3 */ + 0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, 0xA8, 0x84, /* 0xA4-0xA7 */ + 0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, 0xA8, 0x88, /* 0xA8-0xAB */ + 0xA8, 0x89, 0xA8, 0x8A, 0xA8, 0x8B, 0xA8, 0x8C, /* 0xAC-0xAF */ + 0xA8, 0x8D, 0xA8, 0x8E, 0xA8, 0x8F, 0xA8, 0x90, /* 0xB0-0xB3 */ + 0xA8, 0x91, 0xA8, 0x92, 0xA8, 0x93, 0xA8, 0x94, /* 0xB4-0xB7 */ + 0xC2, 0xE9, 0xA8, 0x95, 0xA8, 0x96, 0xA8, 0x97, /* 0xB8-0xBB */ + 0xA8, 0x98, 0xA8, 0x99, 0xA8, 0x9A, 0xA8, 0x9B, /* 0xBC-0xBF */ + 0xA8, 0x9C, 0xA8, 0x9D, 0xA8, 0x9E, 0xA8, 0x9F, /* 0xC0-0xC3 */ + 0xA8, 0xA0, 0xA9, 0x41, 0xA9, 0x42, 0xA9, 0x43, /* 0xC4-0xC7 */ + 0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, 0xA9, 0x47, /* 0xC8-0xCB */ + 0xA9, 0x48, 0xA9, 0x49, 0xA9, 0x4A, 0xA9, 0x4B, /* 0xCC-0xCF */ + 0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0xA9, 0x4F, /* 0xD0-0xD3 */ + 0xC2, 0xEA, 0xA9, 0x50, 0xA9, 0x51, 0xA9, 0x52, /* 0xD4-0xD7 */ + 0xA9, 0x53, 0xA9, 0x54, 0xA9, 0x55, 0xA9, 0x56, /* 0xD8-0xDB */ + 0xA9, 0x57, 0xA9, 0x58, 0xA9, 0x59, 0xA9, 0x5A, /* 0xDC-0xDF */ + 0xA9, 0x61, 0xA9, 0x62, 0xA9, 0x63, 0xA9, 0x64, /* 0xE0-0xE3 */ + 0xC2, 0xEB, 0xA9, 0x65, 0xA9, 0x66, 0xC2, 0xEC, /* 0xE4-0xE7 */ + 0xA9, 0x67, 0xC2, 0xED, 0xA9, 0x68, 0xA9, 0x69, /* 0xE8-0xEB */ + 0xA9, 0x6A, 0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, /* 0xEC-0xEF */ + 0xA9, 0x6E, 0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, /* 0xF0-0xF3 */ + 0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0xF4-0xF7 */ + 0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, 0xA9, 0x79, /* 0xF8-0xFB */ + 0xA9, 0x7A, 0xA9, 0x81, 0xA9, 0x82, 0xA9, 0x83, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CC[512] = { + 0xA9, 0x84, 0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, /* 0x00-0x03 */ + 0xA9, 0x88, 0xA9, 0x89, 0xA9, 0x8A, 0xA9, 0x8B, /* 0x04-0x07 */ + 0xA9, 0x8C, 0xA9, 0x8D, 0xA9, 0x8E, 0xA9, 0x8F, /* 0x08-0x0B */ + 0xC2, 0xEE, 0xC2, 0xEF, 0xA9, 0x90, 0xA9, 0x91, /* 0x0C-0x0F */ + 0xC2, 0xF0, 0xA9, 0x92, 0xA9, 0x93, 0xA9, 0x94, /* 0x10-0x13 */ + 0xC2, 0xF1, 0xA9, 0x95, 0xA9, 0x96, 0xA9, 0x97, /* 0x14-0x17 */ + 0xA9, 0x98, 0xA9, 0x99, 0xA9, 0x9A, 0xA9, 0x9B, /* 0x18-0x1B */ + 0xC2, 0xF2, 0xC2, 0xF3, 0xA9, 0x9C, 0xA9, 0x9D, /* 0x1C-0x1F */ + 0xA9, 0x9E, 0xC2, 0xF4, 0xC2, 0xF5, 0xA9, 0x9F, /* 0x20-0x23 */ + 0xA9, 0xA0, 0xAA, 0x41, 0xAA, 0x42, 0xC2, 0xF6, /* 0x24-0x27 */ + 0xC2, 0xF7, 0xC2, 0xF8, 0xAA, 0x43, 0xAA, 0x44, /* 0x28-0x2B */ + 0xC2, 0xF9, 0xAA, 0x45, 0xC2, 0xFA, 0xAA, 0x46, /* 0x2C-0x2F */ + 0xC2, 0xFB, 0xAA, 0x47, 0xAA, 0x48, 0xAA, 0x49, /* 0x30-0x33 */ + 0xAA, 0x4A, 0xAA, 0x4B, 0xAA, 0x4C, 0xAA, 0x4D, /* 0x34-0x37 */ + 0xC2, 0xFC, 0xC2, 0xFD, 0xAA, 0x4E, 0xC2, 0xFE, /* 0x38-0x3B */ + 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, 0xAA, 0x4F, /* 0x3C-0x3F */ + 0xAA, 0x50, 0xAA, 0x51, 0xAA, 0x52, 0xAA, 0x53, /* 0x40-0x43 */ + 0xC3, 0xA4, 0xC3, 0xA5, 0xAA, 0x54, 0xAA, 0x55, /* 0x44-0x47 */ + 0xC3, 0xA6, 0xAA, 0x56, 0xAA, 0x57, 0xAA, 0x58, /* 0x48-0x4B */ + 0xC3, 0xA7, 0xAA, 0x59, 0xAA, 0x5A, 0xAA, 0x61, /* 0x4C-0x4F */ + 0xAA, 0x62, 0xAA, 0x63, 0xAA, 0x64, 0xAA, 0x65, /* 0x50-0x53 */ + 0xC3, 0xA8, 0xC3, 0xA9, 0xAA, 0x66, 0xC3, 0xAA, /* 0x54-0x57 */ + 0xC3, 0xAB, 0xC3, 0xAC, 0xAA, 0x67, 0xAA, 0x68, /* 0x58-0x5B */ + 0xAA, 0x69, 0xAA, 0x6A, 0xAA, 0x6B, 0xAA, 0x6C, /* 0x5C-0x5F */ + 0xC3, 0xAD, 0xAA, 0x6D, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x60-0x63 */ + 0xC3, 0xAE, 0xAA, 0x70, 0xC3, 0xAF, 0xAA, 0x71, /* 0x64-0x67 */ + 0xC3, 0xB0, 0xAA, 0x72, 0xAA, 0x73, 0xAA, 0x74, /* 0x68-0x6B */ + 0xAA, 0x75, 0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, /* 0x6C-0x6F */ + 0xC3, 0xB1, 0xAA, 0x79, 0xAA, 0x7A, 0xAA, 0x81, /* 0x70-0x73 */ + 0xAA, 0x82, 0xC3, 0xB2, 0xAA, 0x83, 0xAA, 0x84, /* 0x74-0x77 */ + 0xAA, 0x85, 0xAA, 0x86, 0xAA, 0x87, 0xAA, 0x88, /* 0x78-0x7B */ + 0xAA, 0x89, 0xAA, 0x8A, 0xAA, 0x8B, 0xAA, 0x8C, /* 0x7C-0x7F */ + + 0xAA, 0x8D, 0xAA, 0x8E, 0xAA, 0x8F, 0xAA, 0x90, /* 0x80-0x83 */ + 0xAA, 0x91, 0xAA, 0x92, 0xAA, 0x93, 0xAA, 0x94, /* 0x84-0x87 */ + 0xAA, 0x95, 0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, /* 0x88-0x8B */ + 0xAA, 0x99, 0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, /* 0x8C-0x8F */ + 0xAA, 0x9D, 0xAA, 0x9E, 0xAA, 0x9F, 0xAA, 0xA0, /* 0x90-0x93 */ + 0xAB, 0x41, 0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, /* 0x94-0x97 */ + 0xC3, 0xB3, 0xC3, 0xB4, 0xAB, 0x45, 0xAB, 0x46, /* 0x98-0x9B */ + 0xC3, 0xB5, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x9C-0x9F */ + 0xC3, 0xB6, 0xAB, 0x4A, 0xAB, 0x4B, 0xAB, 0x4C, /* 0xA0-0xA3 */ + 0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0xA4-0xA7 */ + 0xC3, 0xB7, 0xC3, 0xB8, 0xAB, 0x51, 0xC3, 0xB9, /* 0xA8-0xAB */ + 0xC3, 0xBA, 0xC3, 0xBB, 0xAB, 0x52, 0xAB, 0x53, /* 0xAC-0xAF */ + 0xAB, 0x54, 0xAB, 0x55, 0xAB, 0x56, 0xAB, 0x57, /* 0xB0-0xB3 */ + 0xC3, 0xBC, 0xC3, 0xBD, 0xAB, 0x58, 0xAB, 0x59, /* 0xB4-0xB7 */ + 0xC3, 0xBE, 0xAB, 0x5A, 0xAB, 0x61, 0xAB, 0x62, /* 0xB8-0xBB */ + 0xC3, 0xBF, 0xAB, 0x63, 0xAB, 0x64, 0xAB, 0x65, /* 0xBC-0xBF */ + 0xAB, 0x66, 0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, /* 0xC0-0xC3 */ + 0xC3, 0xC0, 0xC3, 0xC1, 0xAB, 0x6A, 0xC3, 0xC2, /* 0xC4-0xC7 */ + 0xAB, 0x6B, 0xC3, 0xC3, 0xAB, 0x6C, 0xAB, 0x6D, /* 0xC8-0xCB */ + 0xAB, 0x6E, 0xAB, 0x6F, 0xAB, 0x70, 0xAB, 0x71, /* 0xCC-0xCF */ + 0xC3, 0xC4, 0xAB, 0x72, 0xAB, 0x73, 0xAB, 0x74, /* 0xD0-0xD3 */ + 0xC3, 0xC5, 0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, /* 0xD4-0xD7 */ + 0xAB, 0x78, 0xAB, 0x79, 0xAB, 0x7A, 0xAB, 0x81, /* 0xD8-0xDB */ + 0xAB, 0x82, 0xAB, 0x83, 0xAB, 0x84, 0xAB, 0x85, /* 0xDC-0xDF */ + 0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, 0xAB, 0x89, /* 0xE0-0xE3 */ + 0xC3, 0xC6, 0xAB, 0x8A, 0xAB, 0x8B, 0xAB, 0x8C, /* 0xE4-0xE7 */ + 0xAB, 0x8D, 0xAB, 0x8E, 0xAB, 0x8F, 0xAB, 0x90, /* 0xE8-0xEB */ + 0xC3, 0xC7, 0xAB, 0x91, 0xAB, 0x92, 0xAB, 0x93, /* 0xEC-0xEF */ + 0xC3, 0xC8, 0xAB, 0x94, 0xAB, 0x95, 0xAB, 0x96, /* 0xF0-0xF3 */ + 0xAB, 0x97, 0xAB, 0x98, 0xAB, 0x99, 0xAB, 0x9A, /* 0xF4-0xF7 */ + 0xAB, 0x9B, 0xAB, 0x9C, 0xAB, 0x9D, 0xAB, 0x9E, /* 0xF8-0xFB */ + 0xAB, 0x9F, 0xAB, 0xA0, 0xAC, 0x41, 0xAC, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CD[512] = { + 0xAC, 0x43, 0xC3, 0xC9, 0xAC, 0x44, 0xAC, 0x45, /* 0x00-0x03 */ + 0xAC, 0x46, 0xAC, 0x47, 0xAC, 0x48, 0xAC, 0x49, /* 0x04-0x07 */ + 0xC3, 0xCA, 0xC3, 0xCB, 0xAC, 0x4A, 0xAC, 0x4B, /* 0x08-0x0B */ + 0xC3, 0xCC, 0xAC, 0x4C, 0xAC, 0x4D, 0xAC, 0x4E, /* 0x0C-0x0F */ + 0xC3, 0xCD, 0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, /* 0x10-0x13 */ + 0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, 0xAC, 0x55, /* 0x14-0x17 */ + 0xC3, 0xCE, 0xC3, 0xCF, 0xAC, 0x56, 0xC3, 0xD0, /* 0x18-0x1B */ + 0xAC, 0x57, 0xC3, 0xD1, 0xAC, 0x58, 0xAC, 0x59, /* 0x1C-0x1F */ + 0xAC, 0x5A, 0xAC, 0x61, 0xAC, 0x62, 0xAC, 0x63, /* 0x20-0x23 */ + 0xC3, 0xD2, 0xAC, 0x64, 0xAC, 0x65, 0xAC, 0x66, /* 0x24-0x27 */ + 0xC3, 0xD3, 0xAC, 0x67, 0xAC, 0x68, 0xAC, 0x69, /* 0x28-0x2B */ + 0xC3, 0xD4, 0xAC, 0x6A, 0xAC, 0x6B, 0xAC, 0x6C, /* 0x2C-0x2F */ + 0xAC, 0x6D, 0xAC, 0x6E, 0xAC, 0x6F, 0xAC, 0x70, /* 0x30-0x33 */ + 0xAC, 0x71, 0xAC, 0x72, 0xAC, 0x73, 0xAC, 0x74, /* 0x34-0x37 */ + 0xAC, 0x75, 0xC3, 0xD5, 0xAC, 0x76, 0xAC, 0x77, /* 0x38-0x3B */ + 0xAC, 0x78, 0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x81, /* 0x3C-0x3F */ + 0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x40-0x43 */ + 0xAC, 0x86, 0xAC, 0x87, 0xAC, 0x88, 0xAC, 0x89, /* 0x44-0x47 */ + 0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x48-0x4B */ + 0xAC, 0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x4C-0x4F */ + 0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x50-0x53 */ + 0xAC, 0x96, 0xAC, 0x97, 0xAC, 0x98, 0xAC, 0x99, /* 0x54-0x57 */ + 0xAC, 0x9A, 0xAC, 0x9B, 0xAC, 0x9C, 0xAC, 0x9D, /* 0x58-0x5B */ + 0xC3, 0xD6, 0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, /* 0x5C-0x5F */ + 0xC3, 0xD7, 0xAD, 0x41, 0xAD, 0x42, 0xAD, 0x43, /* 0x60-0x63 */ + 0xC3, 0xD8, 0xAD, 0x44, 0xAD, 0x45, 0xAD, 0x46, /* 0x64-0x67 */ + 0xAD, 0x47, 0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, /* 0x68-0x6B */ + 0xC3, 0xD9, 0xC3, 0xDA, 0xAD, 0x4B, 0xC3, 0xDB, /* 0x6C-0x6F */ + 0xAD, 0x4C, 0xC3, 0xDC, 0xAD, 0x4D, 0xAD, 0x4E, /* 0x70-0x73 */ + 0xAD, 0x4F, 0xAD, 0x50, 0xAD, 0x51, 0xAD, 0x52, /* 0x74-0x77 */ + 0xC3, 0xDD, 0xAD, 0x53, 0xAD, 0x54, 0xAD, 0x55, /* 0x78-0x7B */ + 0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, 0xAD, 0x59, /* 0x7C-0x7F */ + + 0xAD, 0x5A, 0xAD, 0x61, 0xAD, 0x62, 0xAD, 0x63, /* 0x80-0x83 */ + 0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0x84-0x87 */ + 0xC3, 0xDE, 0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, /* 0x88-0x8B */ + 0xAD, 0x6B, 0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, /* 0x8C-0x8F */ + 0xAD, 0x6F, 0xAD, 0x70, 0xAD, 0x71, 0xAD, 0x72, /* 0x90-0x93 */ + 0xC3, 0xDF, 0xC3, 0xE0, 0xAD, 0x73, 0xAD, 0x74, /* 0x94-0x97 */ + 0xC3, 0xE1, 0xAD, 0x75, 0xAD, 0x76, 0xAD, 0x77, /* 0x98-0x9B */ + 0xC3, 0xE2, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0x9C-0x9F */ + 0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, 0xAD, 0x84, /* 0xA0-0xA3 */ + 0xC3, 0xE3, 0xC3, 0xE4, 0xAD, 0x85, 0xC3, 0xE5, /* 0xA4-0xA7 */ + 0xAD, 0x86, 0xC3, 0xE6, 0xAD, 0x87, 0xAD, 0x88, /* 0xA8-0xAB */ + 0xAD, 0x89, 0xAD, 0x8A, 0xAD, 0x8B, 0xAD, 0x8C, /* 0xAC-0xAF */ + 0xC3, 0xE7, 0xAD, 0x8D, 0xAD, 0x8E, 0xAD, 0x8F, /* 0xB0-0xB3 */ + 0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, 0xAD, 0x93, /* 0xB4-0xB7 */ + 0xAD, 0x94, 0xAD, 0x95, 0xAD, 0x96, 0xAD, 0x97, /* 0xB8-0xBB */ + 0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xBC-0xBF */ + 0xAD, 0x9C, 0xAD, 0x9D, 0xAD, 0x9E, 0xAD, 0x9F, /* 0xC0-0xC3 */ + 0xC3, 0xE8, 0xAD, 0xA0, 0xAE, 0x41, 0xAE, 0x42, /* 0xC4-0xC7 */ + 0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, 0xAE, 0x46, /* 0xC8-0xCB */ + 0xC3, 0xE9, 0xAE, 0x47, 0xAE, 0x48, 0xAE, 0x49, /* 0xCC-0xCF */ + 0xC3, 0xEA, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0xD0-0xD3 */ + 0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, 0xAE, 0x50, /* 0xD4-0xD7 */ + 0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, 0xAE, 0x54, /* 0xD8-0xDB */ + 0xAE, 0x55, 0xAE, 0x56, 0xAE, 0x57, 0xAE, 0x58, /* 0xDC-0xDF */ + 0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x61, 0xAE, 0x62, /* 0xE0-0xE3 */ + 0xAE, 0x63, 0xAE, 0x64, 0xAE, 0x65, 0xAE, 0x66, /* 0xE4-0xE7 */ + 0xC3, 0xEB, 0xAE, 0x67, 0xAE, 0x68, 0xAE, 0x69, /* 0xE8-0xEB */ + 0xC3, 0xEC, 0xAE, 0x6A, 0xAE, 0x6B, 0xAE, 0x6C, /* 0xEC-0xEF */ + 0xC3, 0xED, 0xAE, 0x6D, 0xAE, 0x6E, 0xAE, 0x6F, /* 0xF0-0xF3 */ + 0xAE, 0x70, 0xAE, 0x71, 0xAE, 0x72, 0xAE, 0x73, /* 0xF4-0xF7 */ + 0xC3, 0xEE, 0xC3, 0xEF, 0xAE, 0x74, 0xC3, 0xF0, /* 0xF8-0xFB */ + 0xAE, 0x75, 0xC3, 0xF1, 0xAE, 0x76, 0xAE, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CE[512] = { + 0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, 0xAE, 0x81, /* 0x00-0x03 */ + 0xC3, 0xF2, 0xAE, 0x82, 0xAE, 0x83, 0xAE, 0x84, /* 0x04-0x07 */ + 0xC3, 0xF3, 0xAE, 0x85, 0xAE, 0x86, 0xAE, 0x87, /* 0x08-0x0B */ + 0xC3, 0xF4, 0xAE, 0x88, 0xAE, 0x89, 0xAE, 0x8A, /* 0x0C-0x0F */ + 0xAE, 0x8B, 0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, /* 0x10-0x13 */ + 0xC3, 0xF5, 0xAE, 0x8F, 0xAE, 0x90, 0xAE, 0x91, /* 0x14-0x17 */ + 0xAE, 0x92, 0xC3, 0xF6, 0xAE, 0x93, 0xAE, 0x94, /* 0x18-0x1B */ + 0xAE, 0x95, 0xAE, 0x96, 0xAE, 0x97, 0xAE, 0x98, /* 0x1C-0x1F */ + 0xC3, 0xF7, 0xC3, 0xF8, 0xAE, 0x99, 0xAE, 0x9A, /* 0x20-0x23 */ + 0xC3, 0xF9, 0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, /* 0x24-0x27 */ + 0xC3, 0xFA, 0xAE, 0x9E, 0xAE, 0x9F, 0xAE, 0xA0, /* 0x28-0x2B */ + 0xAF, 0x41, 0xAF, 0x42, 0xAF, 0x43, 0xAF, 0x44, /* 0x2C-0x2F */ + 0xC3, 0xFB, 0xC3, 0xFC, 0xAF, 0x45, 0xC3, 0xFD, /* 0x30-0x33 */ + 0xAF, 0x46, 0xC3, 0xFE, 0xAF, 0x47, 0xAF, 0x48, /* 0x34-0x37 */ + 0xAF, 0x49, 0xAF, 0x4A, 0xAF, 0x4B, 0xAF, 0x4C, /* 0x38-0x3B */ + 0xAF, 0x4D, 0xAF, 0x4E, 0xAF, 0x4F, 0xAF, 0x50, /* 0x3C-0x3F */ + 0xAF, 0x51, 0xAF, 0x52, 0xAF, 0x53, 0xAF, 0x54, /* 0x40-0x43 */ + 0xAF, 0x55, 0xAF, 0x56, 0xAF, 0x57, 0xAF, 0x58, /* 0x44-0x47 */ + 0xAF, 0x59, 0xAF, 0x5A, 0xAF, 0x61, 0xAF, 0x62, /* 0x48-0x4B */ + 0xAF, 0x63, 0xAF, 0x64, 0xAF, 0x65, 0xAF, 0x66, /* 0x4C-0x4F */ + 0xAF, 0x67, 0xAF, 0x68, 0xAF, 0x69, 0xAF, 0x6A, /* 0x50-0x53 */ + 0xAF, 0x6B, 0xAF, 0x6C, 0xAF, 0x6D, 0xAF, 0x6E, /* 0x54-0x57 */ + 0xC4, 0xA1, 0xC4, 0xA2, 0xAF, 0x6F, 0xAF, 0x70, /* 0x58-0x5B */ + 0xC4, 0xA3, 0xAF, 0x71, 0xAF, 0x72, 0xC4, 0xA4, /* 0x5C-0x5F */ + 0xC4, 0xA5, 0xC4, 0xA6, 0xAF, 0x73, 0xAF, 0x74, /* 0x60-0x63 */ + 0xAF, 0x75, 0xAF, 0x76, 0xAF, 0x77, 0xAF, 0x78, /* 0x64-0x67 */ + 0xC4, 0xA7, 0xC4, 0xA8, 0xAF, 0x79, 0xC4, 0xA9, /* 0x68-0x6B */ + 0xAF, 0x7A, 0xC4, 0xAA, 0xAF, 0x81, 0xAF, 0x82, /* 0x6C-0x6F */ + 0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, 0xAF, 0x86, /* 0x70-0x73 */ + 0xC4, 0xAB, 0xC4, 0xAC, 0xAF, 0x87, 0xAF, 0x88, /* 0x74-0x77 */ + 0xC4, 0xAD, 0xAF, 0x89, 0xAF, 0x8A, 0xAF, 0x8B, /* 0x78-0x7B */ + 0xC4, 0xAE, 0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, /* 0x7C-0x7F */ + + 0xAF, 0x8F, 0xAF, 0x90, 0xAF, 0x91, 0xAF, 0x92, /* 0x80-0x83 */ + 0xC4, 0xAF, 0xC4, 0xB0, 0xAF, 0x93, 0xC4, 0xB1, /* 0x84-0x87 */ + 0xAF, 0x94, 0xC4, 0xB2, 0xAF, 0x95, 0xAF, 0x96, /* 0x88-0x8B */ + 0xAF, 0x97, 0xAF, 0x98, 0xAF, 0x99, 0xAF, 0x9A, /* 0x8C-0x8F */ + 0xC4, 0xB3, 0xC4, 0xB4, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x90-0x93 */ + 0xC4, 0xB5, 0xAF, 0x9D, 0xAF, 0x9E, 0xAF, 0x9F, /* 0x94-0x97 */ + 0xC4, 0xB6, 0xAF, 0xA0, 0xB0, 0x41, 0xB0, 0x42, /* 0x98-0x9B */ + 0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x9C-0x9F */ + 0xC4, 0xB7, 0xC4, 0xB8, 0xB0, 0x47, 0xC4, 0xB9, /* 0xA0-0xA3 */ + 0xC4, 0xBA, 0xC4, 0xBB, 0xB0, 0x48, 0xB0, 0x49, /* 0xA4-0xA7 */ + 0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, 0xB0, 0x4D, /* 0xA8-0xAB */ + 0xC4, 0xBC, 0xC4, 0xBD, 0xB0, 0x4E, 0xB0, 0x4F, /* 0xAC-0xAF */ + 0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, 0xB0, 0x53, /* 0xB0-0xB3 */ + 0xB0, 0x54, 0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, /* 0xB4-0xB7 */ + 0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x61, /* 0xB8-0xBB */ + 0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0xBC-0xBF */ + 0xB0, 0x66, 0xC4, 0xBE, 0xB0, 0x67, 0xB0, 0x68, /* 0xC0-0xC3 */ + 0xB0, 0x69, 0xB0, 0x6A, 0xB0, 0x6B, 0xB0, 0x6C, /* 0xC4-0xC7 */ + 0xB0, 0x6D, 0xB0, 0x6E, 0xB0, 0x6F, 0xB0, 0x70, /* 0xC8-0xCB */ + 0xB0, 0x71, 0xB0, 0x72, 0xB0, 0x73, 0xB0, 0x74, /* 0xCC-0xCF */ + 0xB0, 0x75, 0xB0, 0x76, 0xB0, 0x77, 0xB0, 0x78, /* 0xD0-0xD3 */ + 0xB0, 0x79, 0xB0, 0x7A, 0xB0, 0x81, 0xB0, 0x82, /* 0xD4-0xD7 */ + 0xB0, 0x83, 0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, /* 0xD8-0xDB */ + 0xB0, 0x87, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xDC-0xDF */ + 0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xE0-0xE3 */ + 0xC4, 0xBF, 0xC4, 0xC0, 0xB0, 0x8F, 0xB0, 0x90, /* 0xE4-0xE7 */ + 0xC4, 0xC1, 0xB0, 0x91, 0xB0, 0x92, 0xC4, 0xC2, /* 0xE8-0xEB */ + 0xC4, 0xC3, 0xB0, 0x93, 0xB0, 0x94, 0xB0, 0x95, /* 0xEC-0xEF */ + 0xB0, 0x96, 0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, /* 0xF0-0xF3 */ + 0xC4, 0xC4, 0xC4, 0xC5, 0xB0, 0x9A, 0xC4, 0xC6, /* 0xF4-0xF7 */ + 0xC4, 0xC7, 0xC4, 0xC8, 0xB0, 0x9B, 0xB0, 0x9C, /* 0xF8-0xFB */ + 0xB0, 0x9D, 0xB0, 0x9E, 0xB0, 0x9F, 0xB0, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CF[512] = { + 0xC4, 0xC9, 0xC4, 0xCA, 0xB1, 0x41, 0xB1, 0x42, /* 0x00-0x03 */ + 0xC4, 0xCB, 0xB1, 0x43, 0xB1, 0x44, 0xB1, 0x45, /* 0x04-0x07 */ + 0xC4, 0xCC, 0xB1, 0x46, 0xB1, 0x47, 0xB1, 0x48, /* 0x08-0x0B */ + 0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xB1, 0x4C, /* 0x0C-0x0F */ + 0xC4, 0xCD, 0xC4, 0xCE, 0xB1, 0x4D, 0xC4, 0xCF, /* 0x10-0x13 */ + 0xB1, 0x4E, 0xC4, 0xD0, 0xB1, 0x4F, 0xB1, 0x50, /* 0x14-0x17 */ + 0xB1, 0x51, 0xB1, 0x52, 0xB1, 0x53, 0xB1, 0x54, /* 0x18-0x1B */ + 0xC4, 0xD1, 0xB1, 0x55, 0xB1, 0x56, 0xB1, 0x57, /* 0x1C-0x1F */ + 0xC4, 0xD2, 0xB1, 0x58, 0xB1, 0x59, 0xB1, 0x5A, /* 0x20-0x23 */ + 0xC4, 0xD3, 0xB1, 0x61, 0xB1, 0x62, 0xB1, 0x63, /* 0x24-0x27 */ + 0xB1, 0x64, 0xB1, 0x65, 0xB1, 0x66, 0xB1, 0x67, /* 0x28-0x2B */ + 0xC4, 0xD4, 0xC4, 0xD5, 0xB1, 0x68, 0xC4, 0xD6, /* 0x2C-0x2F */ + 0xC4, 0xD7, 0xC4, 0xD8, 0xB1, 0x69, 0xB1, 0x6A, /* 0x30-0x33 */ + 0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x34-0x37 */ + 0xC4, 0xD9, 0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, /* 0x38-0x3B */ + 0xB1, 0x72, 0xB1, 0x73, 0xB1, 0x74, 0xB1, 0x75, /* 0x3C-0x3F */ + 0xB1, 0x76, 0xB1, 0x77, 0xB1, 0x78, 0xB1, 0x79, /* 0x40-0x43 */ + 0xB1, 0x7A, 0xB1, 0x81, 0xB1, 0x82, 0xB1, 0x83, /* 0x44-0x47 */ + 0xB1, 0x84, 0xB1, 0x85, 0xB1, 0x86, 0xB1, 0x87, /* 0x48-0x4B */ + 0xB1, 0x88, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x4C-0x4F */ + 0xB1, 0x8C, 0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, /* 0x50-0x53 */ + 0xC4, 0xDA, 0xC4, 0xDB, 0xB1, 0x90, 0xB1, 0x91, /* 0x54-0x57 */ + 0xC4, 0xDC, 0xB1, 0x92, 0xB1, 0x93, 0xB1, 0x94, /* 0x58-0x5B */ + 0xC4, 0xDD, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x5C-0x5F */ + 0xB1, 0x98, 0xB1, 0x99, 0xB1, 0x9A, 0xB1, 0x9B, /* 0x60-0x63 */ + 0xC4, 0xDE, 0xC4, 0xDF, 0xB1, 0x9C, 0xC4, 0xE0, /* 0x64-0x67 */ + 0xB1, 0x9D, 0xC4, 0xE1, 0xB1, 0x9E, 0xB1, 0x9F, /* 0x68-0x6B */ + 0xB1, 0xA0, 0xB2, 0x41, 0xB2, 0x42, 0xB2, 0x43, /* 0x6C-0x6F */ + 0xC4, 0xE2, 0xC4, 0xE3, 0xB2, 0x44, 0xB2, 0x45, /* 0x70-0x73 */ + 0xC4, 0xE4, 0xB2, 0x46, 0xB2, 0x47, 0xB2, 0x48, /* 0x74-0x77 */ + 0xC4, 0xE5, 0xB2, 0x49, 0xB2, 0x4A, 0xB2, 0x4B, /* 0x78-0x7B */ + 0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, 0xB2, 0x4F, /* 0x7C-0x7F */ + + 0xC4, 0xE6, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x80-0x83 */ + 0xB2, 0x53, 0xC4, 0xE7, 0xB2, 0x54, 0xB2, 0x55, /* 0x84-0x87 */ + 0xB2, 0x56, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x88-0x8B */ + 0xC4, 0xE8, 0xB2, 0x5A, 0xB2, 0x61, 0xB2, 0x62, /* 0x8C-0x8F */ + 0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x90-0x93 */ + 0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x94-0x97 */ + 0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xB2, 0x6E, /* 0x98-0x9B */ + 0xB2, 0x6F, 0xB2, 0x70, 0xB2, 0x71, 0xB2, 0x72, /* 0x9C-0x9F */ + 0xB2, 0x73, 0xC4, 0xE9, 0xB2, 0x74, 0xB2, 0x75, /* 0xA0-0xA3 */ + 0xB2, 0x76, 0xB2, 0x77, 0xB2, 0x78, 0xB2, 0x79, /* 0xA4-0xA7 */ + 0xC4, 0xEA, 0xB2, 0x7A, 0xB2, 0x81, 0xB2, 0x82, /* 0xA8-0xAB */ + 0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, 0xB2, 0x86, /* 0xAC-0xAF */ + 0xC4, 0xEB, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xB0-0xB3 */ + 0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xB4-0xB7 */ + 0xB2, 0x8E, 0xB2, 0x8F, 0xB2, 0x90, 0xB2, 0x91, /* 0xB8-0xBB */ + 0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, 0xB2, 0x95, /* 0xBC-0xBF */ + 0xB2, 0x96, 0xB2, 0x97, 0xB2, 0x98, 0xB2, 0x99, /* 0xC0-0xC3 */ + 0xC4, 0xEC, 0xB2, 0x9A, 0xB2, 0x9B, 0xB2, 0x9C, /* 0xC4-0xC7 */ + 0xB2, 0x9D, 0xB2, 0x9E, 0xB2, 0x9F, 0xB2, 0xA0, /* 0xC8-0xCB */ + 0xB3, 0x41, 0xB3, 0x42, 0xB3, 0x43, 0xB3, 0x44, /* 0xCC-0xCF */ + 0xB3, 0x45, 0xB3, 0x46, 0xB3, 0x47, 0xB3, 0x48, /* 0xD0-0xD3 */ + 0xB3, 0x49, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xD4-0xD7 */ + 0xB3, 0x4D, 0xB3, 0x4E, 0xB3, 0x4F, 0xB3, 0x50, /* 0xD8-0xDB */ + 0xB3, 0x51, 0xB3, 0x52, 0xB3, 0x53, 0xB3, 0x54, /* 0xDC-0xDF */ + 0xC4, 0xED, 0xC4, 0xEE, 0xB3, 0x55, 0xB3, 0x56, /* 0xE0-0xE3 */ + 0xC4, 0xEF, 0xB3, 0x57, 0xB3, 0x58, 0xB3, 0x59, /* 0xE4-0xE7 */ + 0xC4, 0xF0, 0xB3, 0x5A, 0xB3, 0x61, 0xB3, 0x62, /* 0xE8-0xEB */ + 0xB3, 0x63, 0xB3, 0x64, 0xB3, 0x65, 0xB3, 0x66, /* 0xEC-0xEF */ + 0xC4, 0xF1, 0xC4, 0xF2, 0xB3, 0x67, 0xC4, 0xF3, /* 0xF0-0xF3 */ + 0xB3, 0x68, 0xC4, 0xF4, 0xB3, 0x69, 0xB3, 0x6A, /* 0xF4-0xF7 */ + 0xB3, 0x6B, 0xB3, 0x6C, 0xB3, 0x6D, 0xB3, 0x6E, /* 0xF8-0xFB */ + 0xC4, 0xF5, 0xB3, 0x6F, 0xB3, 0x70, 0xB3, 0x71, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D0[512] = { + 0xC4, 0xF6, 0xB3, 0x72, 0xB3, 0x73, 0xB3, 0x74, /* 0x00-0x03 */ + 0xC4, 0xF7, 0xB3, 0x75, 0xB3, 0x76, 0xB3, 0x77, /* 0x04-0x07 */ + 0xB3, 0x78, 0xB3, 0x79, 0xB3, 0x7A, 0xB3, 0x81, /* 0x08-0x0B */ + 0xB3, 0x82, 0xB3, 0x83, 0xB3, 0x84, 0xB3, 0x85, /* 0x0C-0x0F */ + 0xB3, 0x86, 0xC4, 0xF8, 0xB3, 0x87, 0xB3, 0x88, /* 0x10-0x13 */ + 0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, 0xB3, 0x8C, /* 0x14-0x17 */ + 0xC4, 0xF9, 0xB3, 0x8D, 0xB3, 0x8E, 0xB3, 0x8F, /* 0x18-0x1B */ + 0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, 0xB3, 0x93, /* 0x1C-0x1F */ + 0xB3, 0x94, 0xB3, 0x95, 0xB3, 0x96, 0xB3, 0x97, /* 0x20-0x23 */ + 0xB3, 0x98, 0xB3, 0x99, 0xB3, 0x9A, 0xB3, 0x9B, /* 0x24-0x27 */ + 0xB3, 0x9C, 0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, /* 0x28-0x2B */ + 0xB3, 0xA0, 0xC4, 0xFA, 0xB4, 0x41, 0xB4, 0x42, /* 0x2C-0x2F */ + 0xB4, 0x43, 0xB4, 0x44, 0xB4, 0x45, 0xB4, 0x46, /* 0x30-0x33 */ + 0xC4, 0xFB, 0xC4, 0xFC, 0xB4, 0x47, 0xB4, 0x48, /* 0x34-0x37 */ + 0xC4, 0xFD, 0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, /* 0x38-0x3B */ + 0xC4, 0xFE, 0xB4, 0x4C, 0xB4, 0x4D, 0xB4, 0x4E, /* 0x3C-0x3F */ + 0xB4, 0x4F, 0xB4, 0x50, 0xB4, 0x51, 0xB4, 0x52, /* 0x40-0x43 */ + 0xC5, 0xA1, 0xC5, 0xA2, 0xB4, 0x53, 0xC5, 0xA3, /* 0x44-0x47 */ + 0xB4, 0x54, 0xC5, 0xA4, 0xB4, 0x55, 0xB4, 0x56, /* 0x48-0x4B */ + 0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0x4C-0x4F */ + 0xC5, 0xA5, 0xB4, 0x61, 0xB4, 0x62, 0xB4, 0x63, /* 0x50-0x53 */ + 0xC5, 0xA6, 0xB4, 0x64, 0xB4, 0x65, 0xB4, 0x66, /* 0x54-0x57 */ + 0xC5, 0xA7, 0xB4, 0x67, 0xB4, 0x68, 0xB4, 0x69, /* 0x58-0x5B */ + 0xB4, 0x6A, 0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, /* 0x5C-0x5F */ + 0xC5, 0xA8, 0xB4, 0x6E, 0xB4, 0x6F, 0xB4, 0x70, /* 0x60-0x63 */ + 0xB4, 0x71, 0xB4, 0x72, 0xB4, 0x73, 0xB4, 0x74, /* 0x64-0x67 */ + 0xB4, 0x75, 0xB4, 0x76, 0xB4, 0x77, 0xB4, 0x78, /* 0x68-0x6B */ + 0xC5, 0xA9, 0xC5, 0xAA, 0xB4, 0x79, 0xB4, 0x7A, /* 0x6C-0x6F */ + 0xC5, 0xAB, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0x70-0x73 */ + 0xC5, 0xAC, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0x74-0x77 */ + 0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, 0xB4, 0x8A, /* 0x78-0x7B */ + 0xC5, 0xAD, 0xC5, 0xAE, 0xB4, 0x8B, 0xB4, 0x8C, /* 0x7C-0x7F */ + + 0xB4, 0x8D, 0xC5, 0xAF, 0xB4, 0x8E, 0xB4, 0x8F, /* 0x80-0x83 */ + 0xB4, 0x90, 0xB4, 0x91, 0xB4, 0x92, 0xB4, 0x93, /* 0x84-0x87 */ + 0xB4, 0x94, 0xB4, 0x95, 0xB4, 0x96, 0xB4, 0x97, /* 0x88-0x8B */ + 0xB4, 0x98, 0xB4, 0x99, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x8C-0x8F */ + 0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x90-0x93 */ + 0xB4, 0xA0, 0xB5, 0x41, 0xB5, 0x42, 0xB5, 0x43, /* 0x94-0x97 */ + 0xB5, 0x44, 0xB5, 0x45, 0xB5, 0x46, 0xB5, 0x47, /* 0x98-0x9B */ + 0xB5, 0x48, 0xB5, 0x49, 0xB5, 0x4A, 0xB5, 0x4B, /* 0x9C-0x9F */ + 0xB5, 0x4C, 0xB5, 0x4D, 0xB5, 0x4E, 0xB5, 0x4F, /* 0xA0-0xA3 */ + 0xC5, 0xB0, 0xC5, 0xB1, 0xB5, 0x50, 0xB5, 0x51, /* 0xA4-0xA7 */ + 0xC5, 0xB2, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0xA8-0xAB */ + 0xC5, 0xB3, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0xAC-0xAF */ + 0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x61, /* 0xB0-0xB3 */ + 0xC5, 0xB4, 0xC5, 0xB5, 0xB5, 0x62, 0xC5, 0xB6, /* 0xB4-0xB7 */ + 0xB5, 0x63, 0xC5, 0xB7, 0xB5, 0x64, 0xB5, 0x65, /* 0xB8-0xBB */ + 0xB5, 0x66, 0xB5, 0x67, 0xB5, 0x68, 0xB5, 0x69, /* 0xBC-0xBF */ + 0xC5, 0xB8, 0xC5, 0xB9, 0xB5, 0x6A, 0xB5, 0x6B, /* 0xC0-0xC3 */ + 0xC5, 0xBA, 0xB5, 0x6C, 0xB5, 0x6D, 0xB5, 0x6E, /* 0xC4-0xC7 */ + 0xC5, 0xBB, 0xC5, 0xBC, 0xB5, 0x6F, 0xB5, 0x70, /* 0xC8-0xCB */ + 0xB5, 0x71, 0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, /* 0xCC-0xCF */ + 0xC5, 0xBD, 0xC5, 0xBE, 0xB5, 0x75, 0xC5, 0xBF, /* 0xD0-0xD3 */ + 0xC5, 0xC0, 0xC5, 0xC1, 0xB5, 0x76, 0xB5, 0x77, /* 0xD4-0xD7 */ + 0xB5, 0x78, 0xB5, 0x79, 0xB5, 0x7A, 0xB5, 0x81, /* 0xD8-0xDB */ + 0xC5, 0xC2, 0xC5, 0xC3, 0xB5, 0x82, 0xB5, 0x83, /* 0xDC-0xDF */ + 0xC5, 0xC4, 0xB5, 0x84, 0xB5, 0x85, 0xB5, 0x86, /* 0xE0-0xE3 */ + 0xC5, 0xC5, 0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, /* 0xE4-0xE7 */ + 0xB5, 0x8A, 0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, /* 0xE8-0xEB */ + 0xC5, 0xC6, 0xC5, 0xC7, 0xB5, 0x8E, 0xC5, 0xC8, /* 0xEC-0xEF */ + 0xC5, 0xC9, 0xC5, 0xCA, 0xB5, 0x8F, 0xB5, 0x90, /* 0xF0-0xF3 */ + 0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, 0xB5, 0x94, /* 0xF4-0xF7 */ + 0xC5, 0xCB, 0xB5, 0x95, 0xB5, 0x96, 0xB5, 0x97, /* 0xF8-0xFB */ + 0xB5, 0x98, 0xB5, 0x99, 0xB5, 0x9A, 0xB5, 0x9B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D1[512] = { + 0xB5, 0x9C, 0xB5, 0x9D, 0xB5, 0x9E, 0xB5, 0x9F, /* 0x00-0x03 */ + 0xB5, 0xA0, 0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, /* 0x04-0x07 */ + 0xB6, 0x44, 0xB6, 0x45, 0xB6, 0x46, 0xB6, 0x47, /* 0x08-0x0B */ + 0xB6, 0x48, 0xC5, 0xCC, 0xB6, 0x49, 0xB6, 0x4A, /* 0x0C-0x0F */ + 0xB6, 0x4B, 0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, /* 0x10-0x13 */ + 0xB6, 0x4F, 0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, /* 0x14-0x17 */ + 0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0x18-0x1B */ + 0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0x1C-0x1F */ + 0xB6, 0x61, 0xB6, 0x62, 0xB6, 0x63, 0xB6, 0x64, /* 0x20-0x23 */ + 0xB6, 0x65, 0xB6, 0x66, 0xB6, 0x67, 0xB6, 0x68, /* 0x24-0x27 */ + 0xB6, 0x69, 0xB6, 0x6A, 0xB6, 0x6B, 0xB6, 0x6C, /* 0x28-0x2B */ + 0xB6, 0x6D, 0xB6, 0x6E, 0xB6, 0x6F, 0xB6, 0x70, /* 0x2C-0x2F */ + 0xC5, 0xCD, 0xC5, 0xCE, 0xB6, 0x71, 0xB6, 0x72, /* 0x30-0x33 */ + 0xC5, 0xCF, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0x34-0x37 */ + 0xC5, 0xD0, 0xB6, 0x76, 0xC5, 0xD1, 0xB6, 0x77, /* 0x38-0x3B */ + 0xB6, 0x78, 0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x81, /* 0x3C-0x3F */ + 0xC5, 0xD2, 0xC5, 0xD3, 0xB6, 0x82, 0xC5, 0xD4, /* 0x40-0x43 */ + 0xC5, 0xD5, 0xC5, 0xD6, 0xB6, 0x83, 0xB6, 0x84, /* 0x44-0x47 */ + 0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0x48-0x4B */ + 0xC5, 0xD7, 0xC5, 0xD8, 0xB6, 0x89, 0xB6, 0x8A, /* 0x4C-0x4F */ + 0xC5, 0xD9, 0xB6, 0x8B, 0xB6, 0x8C, 0xB6, 0x8D, /* 0x50-0x53 */ + 0xC5, 0xDA, 0xB6, 0x8E, 0xB6, 0x8F, 0xB6, 0x90, /* 0x54-0x57 */ + 0xB6, 0x91, 0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, /* 0x58-0x5B */ + 0xC5, 0xDB, 0xC5, 0xDC, 0xB6, 0x95, 0xC5, 0xDD, /* 0x5C-0x5F */ + 0xB6, 0x96, 0xC5, 0xDE, 0xB6, 0x97, 0xB6, 0x98, /* 0x60-0x63 */ + 0xB6, 0x99, 0xB6, 0x9A, 0xB6, 0x9B, 0xB6, 0x9C, /* 0x64-0x67 */ + 0xC5, 0xDF, 0xB6, 0x9D, 0xB6, 0x9E, 0xB6, 0x9F, /* 0x68-0x6B */ + 0xC5, 0xE0, 0xB6, 0xA0, 0xB7, 0x41, 0xB7, 0x42, /* 0x6C-0x6F */ + 0xB7, 0x43, 0xB7, 0x44, 0xB7, 0x45, 0xB7, 0x46, /* 0x70-0x73 */ + 0xB7, 0x47, 0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, /* 0x74-0x77 */ + 0xB7, 0x4B, 0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, /* 0x78-0x7B */ + 0xC5, 0xE1, 0xB7, 0x4F, 0xB7, 0x50, 0xB7, 0x51, /* 0x7C-0x7F */ + + 0xB7, 0x52, 0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, /* 0x80-0x83 */ + 0xC5, 0xE2, 0xB7, 0x56, 0xB7, 0x57, 0xB7, 0x58, /* 0x84-0x87 */ + 0xC5, 0xE3, 0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x61, /* 0x88-0x8B */ + 0xB7, 0x62, 0xB7, 0x63, 0xB7, 0x64, 0xB7, 0x65, /* 0x8C-0x8F */ + 0xB7, 0x66, 0xB7, 0x67, 0xB7, 0x68, 0xB7, 0x69, /* 0x90-0x93 */ + 0xB7, 0x6A, 0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, /* 0x94-0x97 */ + 0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x98-0x9B */ + 0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x9C-0x9F */ + 0xC5, 0xE4, 0xC5, 0xE5, 0xB7, 0x76, 0xB7, 0x77, /* 0xA0-0xA3 */ + 0xC5, 0xE6, 0xB7, 0x78, 0xB7, 0x79, 0xB7, 0x7A, /* 0xA4-0xA7 */ + 0xC5, 0xE7, 0xB7, 0x81, 0xB7, 0x82, 0xB7, 0x83, /* 0xA8-0xAB */ + 0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, 0xB7, 0x87, /* 0xAC-0xAF */ + 0xC5, 0xE8, 0xC5, 0xE9, 0xB7, 0x88, 0xC5, 0xEA, /* 0xB0-0xB3 */ + 0xB7, 0x89, 0xC5, 0xEB, 0xB7, 0x8A, 0xB7, 0x8B, /* 0xB4-0xB7 */ + 0xB7, 0x8C, 0xB7, 0x8D, 0xC5, 0xEC, 0xB7, 0x8E, /* 0xB8-0xBB */ + 0xC5, 0xED, 0xB7, 0x8F, 0xB7, 0x90, 0xB7, 0x91, /* 0xBC-0xBF */ + 0xC5, 0xEE, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0xC0-0xC3 */ + 0xB7, 0x95, 0xB7, 0x96, 0xB7, 0x97, 0xB7, 0x98, /* 0xC4-0xC7 */ + 0xB7, 0x99, 0xB7, 0x9A, 0xB7, 0x9B, 0xB7, 0x9C, /* 0xC8-0xCB */ + 0xB7, 0x9D, 0xB7, 0x9E, 0xB7, 0x9F, 0xB7, 0xA0, /* 0xCC-0xCF */ + 0xB8, 0x41, 0xB8, 0x42, 0xB8, 0x43, 0xB8, 0x44, /* 0xD0-0xD3 */ + 0xB8, 0x45, 0xB8, 0x46, 0xB8, 0x47, 0xB8, 0x48, /* 0xD4-0xD7 */ + 0xC5, 0xEF, 0xB8, 0x49, 0xB8, 0x4A, 0xB8, 0x4B, /* 0xD8-0xDB */ + 0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, 0xB8, 0x4F, /* 0xDC-0xDF */ + 0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, 0xB8, 0x53, /* 0xE0-0xE3 */ + 0xB8, 0x54, 0xB8, 0x55, 0xB8, 0x56, 0xB8, 0x57, /* 0xE4-0xE7 */ + 0xB8, 0x58, 0xB8, 0x59, 0xB8, 0x5A, 0xB8, 0x61, /* 0xE8-0xEB */ + 0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, 0xB8, 0x65, /* 0xEC-0xEF */ + 0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, 0xB8, 0x69, /* 0xF0-0xF3 */ + 0xC5, 0xF0, 0xB8, 0x6A, 0xB8, 0x6B, 0xB8, 0x6C, /* 0xF4-0xF7 */ + 0xC5, 0xF1, 0xB8, 0x6D, 0xB8, 0x6E, 0xB8, 0x6F, /* 0xF8-0xFB */ + 0xB8, 0x70, 0xB8, 0x71, 0xB8, 0x72, 0xB8, 0x73, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D2[512] = { + 0xB8, 0x74, 0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, /* 0x00-0x03 */ + 0xB8, 0x78, 0xB8, 0x79, 0xB8, 0x7A, 0xC5, 0xF2, /* 0x04-0x07 */ + 0xB8, 0x81, 0xC5, 0xF3, 0xB8, 0x82, 0xB8, 0x83, /* 0x08-0x0B */ + 0xB8, 0x84, 0xB8, 0x85, 0xB8, 0x86, 0xB8, 0x87, /* 0x0C-0x0F */ + 0xC5, 0xF4, 0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, /* 0x10-0x13 */ + 0xB8, 0x8B, 0xB8, 0x8C, 0xB8, 0x8D, 0xB8, 0x8E, /* 0x14-0x17 */ + 0xB8, 0x8F, 0xB8, 0x90, 0xB8, 0x91, 0xB8, 0x92, /* 0x18-0x1B */ + 0xB8, 0x93, 0xB8, 0x94, 0xB8, 0x95, 0xB8, 0x96, /* 0x1C-0x1F */ + 0xB8, 0x97, 0xB8, 0x98, 0xB8, 0x99, 0xB8, 0x9A, /* 0x20-0x23 */ + 0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, 0xB8, 0x9E, /* 0x24-0x27 */ + 0xB8, 0x9F, 0xB8, 0xA0, 0xB9, 0x41, 0xB9, 0x42, /* 0x28-0x2B */ + 0xC5, 0xF5, 0xC5, 0xF6, 0xB9, 0x43, 0xB9, 0x44, /* 0x2C-0x2F */ + 0xC5, 0xF7, 0xB9, 0x45, 0xB9, 0x46, 0xB9, 0x47, /* 0x30-0x33 */ + 0xC5, 0xF8, 0xB9, 0x48, 0xB9, 0x49, 0xB9, 0x4A, /* 0x34-0x37 */ + 0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x38-0x3B */ + 0xC5, 0xF9, 0xC5, 0xFA, 0xB9, 0x4F, 0xC5, 0xFB, /* 0x3C-0x3F */ + 0xB9, 0x50, 0xC5, 0xFC, 0xB9, 0x51, 0xB9, 0x52, /* 0x40-0x43 */ + 0xB9, 0x53, 0xB9, 0x54, 0xB9, 0x55, 0xB9, 0x56, /* 0x44-0x47 */ + 0xC5, 0xFD, 0xB9, 0x57, 0xB9, 0x58, 0xB9, 0x59, /* 0x48-0x4B */ + 0xB9, 0x5A, 0xB9, 0x61, 0xB9, 0x62, 0xB9, 0x63, /* 0x4C-0x4F */ + 0xB9, 0x64, 0xB9, 0x65, 0xB9, 0x66, 0xB9, 0x67, /* 0x50-0x53 */ + 0xB9, 0x68, 0xB9, 0x69, 0xB9, 0x6A, 0xB9, 0x6B, /* 0x54-0x57 */ + 0xB9, 0x6C, 0xB9, 0x6D, 0xB9, 0x6E, 0xB9, 0x6F, /* 0x58-0x5B */ + 0xC5, 0xFE, 0xB9, 0x70, 0xB9, 0x71, 0xB9, 0x72, /* 0x5C-0x5F */ + 0xB9, 0x73, 0xB9, 0x74, 0xB9, 0x75, 0xB9, 0x76, /* 0x60-0x63 */ + 0xC6, 0xA1, 0xB9, 0x77, 0xB9, 0x78, 0xB9, 0x79, /* 0x64-0x67 */ + 0xB9, 0x7A, 0xB9, 0x81, 0xB9, 0x82, 0xB9, 0x83, /* 0x68-0x6B */ + 0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x6C-0x6F */ + 0xB9, 0x88, 0xB9, 0x89, 0xB9, 0x8A, 0xB9, 0x8B, /* 0x70-0x73 */ + 0xB9, 0x8C, 0xB9, 0x8D, 0xB9, 0x8E, 0xB9, 0x8F, /* 0x74-0x77 */ + 0xB9, 0x90, 0xB9, 0x91, 0xB9, 0x92, 0xB9, 0x93, /* 0x78-0x7B */ + 0xB9, 0x94, 0xB9, 0x95, 0xB9, 0x96, 0xB9, 0x97, /* 0x7C-0x7F */ + + 0xC6, 0xA2, 0xC6, 0xA3, 0xB9, 0x98, 0xB9, 0x99, /* 0x80-0x83 */ + 0xC6, 0xA4, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0x84-0x87 */ + 0xC6, 0xA5, 0xB9, 0x9D, 0xB9, 0x9E, 0xB9, 0x9F, /* 0x88-0x8B */ + 0xB9, 0xA0, 0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, /* 0x8C-0x8F */ + 0xC6, 0xA6, 0xC6, 0xA7, 0xBA, 0x44, 0xBA, 0x45, /* 0x90-0x93 */ + 0xBA, 0x46, 0xC6, 0xA8, 0xBA, 0x47, 0xBA, 0x48, /* 0x94-0x97 */ + 0xBA, 0x49, 0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, /* 0x98-0x9B */ + 0xC6, 0xA9, 0xBA, 0x4D, 0xBA, 0x4E, 0xBA, 0x4F, /* 0x9C-0x9F */ + 0xC6, 0xAA, 0xBA, 0x50, 0xBA, 0x51, 0xBA, 0x52, /* 0xA0-0xA3 */ + 0xC6, 0xAB, 0xBA, 0x53, 0xBA, 0x54, 0xBA, 0x55, /* 0xA4-0xA7 */ + 0xBA, 0x56, 0xBA, 0x57, 0xBA, 0x58, 0xBA, 0x59, /* 0xA8-0xAB */ + 0xC6, 0xAC, 0xBA, 0x5A, 0xBA, 0x61, 0xBA, 0x62, /* 0xAC-0xAF */ + 0xBA, 0x63, 0xC6, 0xAD, 0xBA, 0x64, 0xBA, 0x65, /* 0xB0-0xB3 */ + 0xBA, 0x66, 0xBA, 0x67, 0xBA, 0x68, 0xBA, 0x69, /* 0xB4-0xB7 */ + 0xC6, 0xAE, 0xC6, 0xAF, 0xBA, 0x6A, 0xBA, 0x6B, /* 0xB8-0xBB */ + 0xC6, 0xB0, 0xBA, 0x6C, 0xBA, 0x6D, 0xC6, 0xB1, /* 0xBC-0xBF */ + 0xC6, 0xB2, 0xBA, 0x6E, 0xC6, 0xB3, 0xBA, 0x6F, /* 0xC0-0xC3 */ + 0xBA, 0x70, 0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, /* 0xC4-0xC7 */ + 0xC6, 0xB4, 0xC6, 0xB5, 0xBA, 0x74, 0xC6, 0xB6, /* 0xC8-0xCB */ + 0xBA, 0x75, 0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, /* 0xCC-0xCF */ + 0xBA, 0x79, 0xBA, 0x7A, 0xBA, 0x81, 0xBA, 0x82, /* 0xD0-0xD3 */ + 0xC6, 0xB7, 0xBA, 0x83, 0xBA, 0x84, 0xBA, 0x85, /* 0xD4-0xD7 */ + 0xC6, 0xB8, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0xD8-0xDB */ + 0xC6, 0xB9, 0xBA, 0x89, 0xBA, 0x8A, 0xBA, 0x8B, /* 0xDC-0xDF */ + 0xBA, 0x8C, 0xBA, 0x8D, 0xBA, 0x8E, 0xBA, 0x8F, /* 0xE0-0xE3 */ + 0xC6, 0xBA, 0xC6, 0xBB, 0xBA, 0x90, 0xBA, 0x91, /* 0xE4-0xE7 */ + 0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0xE8-0xEB */ + 0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0xEC-0xEF */ + 0xC6, 0xBC, 0xC6, 0xBD, 0xBA, 0x9A, 0xBA, 0x9B, /* 0xF0-0xF3 */ + 0xC6, 0xBE, 0xBA, 0x9C, 0xBA, 0x9D, 0xBA, 0x9E, /* 0xF4-0xF7 */ + 0xC6, 0xBF, 0xBA, 0x9F, 0xBA, 0xA0, 0xBB, 0x41, /* 0xF8-0xFB */ + 0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, 0xBB, 0x45, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D3[512] = { + 0xC6, 0xC0, 0xC6, 0xC1, 0xBB, 0x46, 0xC6, 0xC2, /* 0x00-0x03 */ + 0xBB, 0x47, 0xC6, 0xC3, 0xBB, 0x48, 0xBB, 0x49, /* 0x04-0x07 */ + 0xBB, 0x4A, 0xBB, 0x4B, 0xBB, 0x4C, 0xBB, 0x4D, /* 0x08-0x0B */ + 0xC6, 0xC4, 0xC6, 0xC5, 0xC6, 0xC6, 0xBB, 0x4E, /* 0x0C-0x0F */ + 0xC6, 0xC7, 0xBB, 0x4F, 0xBB, 0x50, 0xBB, 0x51, /* 0x10-0x13 */ + 0xC6, 0xC8, 0xBB, 0x52, 0xC6, 0xC9, 0xBB, 0x53, /* 0x14-0x17 */ + 0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x18-0x1B */ + 0xC6, 0xCA, 0xC6, 0xCB, 0xBB, 0x58, 0xC6, 0xCC, /* 0x1C-0x1F */ + 0xC6, 0xCD, 0xC6, 0xCE, 0xBB, 0x59, 0xBB, 0x5A, /* 0x20-0x23 */ + 0xBB, 0x61, 0xC6, 0xCF, 0xBB, 0x62, 0xBB, 0x63, /* 0x24-0x27 */ + 0xC6, 0xD0, 0xC6, 0xD1, 0xBB, 0x64, 0xBB, 0x65, /* 0x28-0x2B */ + 0xC6, 0xD2, 0xBB, 0x66, 0xBB, 0x67, 0xBB, 0x68, /* 0x2C-0x2F */ + 0xC6, 0xD3, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x30-0x33 */ + 0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xBB, 0x6F, /* 0x34-0x37 */ + 0xC6, 0xD4, 0xC6, 0xD5, 0xBB, 0x70, 0xC6, 0xD6, /* 0x38-0x3B */ + 0xC6, 0xD7, 0xC6, 0xD8, 0xBB, 0x71, 0xBB, 0x72, /* 0x3C-0x3F */ + 0xBB, 0x73, 0xBB, 0x74, 0xBB, 0x75, 0xBB, 0x76, /* 0x40-0x43 */ + 0xC6, 0xD9, 0xC6, 0xDA, 0xBB, 0x77, 0xBB, 0x78, /* 0x44-0x47 */ + 0xBB, 0x79, 0xBB, 0x7A, 0xBB, 0x81, 0xBB, 0x82, /* 0x48-0x4B */ + 0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x4C-0x4F */ + 0xBB, 0x87, 0xBB, 0x88, 0xBB, 0x89, 0xBB, 0x8A, /* 0x50-0x53 */ + 0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, 0xBB, 0x8E, /* 0x54-0x57 */ + 0xBB, 0x8F, 0xBB, 0x90, 0xBB, 0x91, 0xBB, 0x92, /* 0x58-0x5B */ + 0xBB, 0x93, 0xBB, 0x94, 0xBB, 0x95, 0xBB, 0x96, /* 0x5C-0x5F */ + 0xBB, 0x97, 0xBB, 0x98, 0xBB, 0x99, 0xBB, 0x9A, /* 0x60-0x63 */ + 0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, 0xBB, 0x9E, /* 0x64-0x67 */ + 0xBB, 0x9F, 0xBB, 0xA0, 0xBC, 0x41, 0xBC, 0x42, /* 0x68-0x6B */ + 0xBC, 0x43, 0xBC, 0x44, 0xBC, 0x45, 0xBC, 0x46, /* 0x6C-0x6F */ + 0xBC, 0x47, 0xBC, 0x48, 0xBC, 0x49, 0xBC, 0x4A, /* 0x70-0x73 */ + 0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, 0xBC, 0x4E, /* 0x74-0x77 */ + 0xBC, 0x4F, 0xBC, 0x50, 0xBC, 0x51, 0xBC, 0x52, /* 0x78-0x7B */ + 0xC6, 0xDB, 0xC6, 0xDC, 0xBC, 0x53, 0xBC, 0x54, /* 0x7C-0x7F */ + + 0xC6, 0xDD, 0xBC, 0x55, 0xBC, 0x56, 0xBC, 0x57, /* 0x80-0x83 */ + 0xC6, 0xDE, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0x84-0x87 */ + 0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0x88-0x8B */ + 0xC6, 0xDF, 0xC6, 0xE0, 0xBC, 0x65, 0xC6, 0xE1, /* 0x8C-0x8F */ + 0xC6, 0xE2, 0xC6, 0xE3, 0xBC, 0x66, 0xBC, 0x67, /* 0x90-0x93 */ + 0xBC, 0x68, 0xBC, 0x69, 0xBC, 0x6A, 0xBC, 0x6B, /* 0x94-0x97 */ + 0xC6, 0xE4, 0xC6, 0xE5, 0xBC, 0x6C, 0xBC, 0x6D, /* 0x98-0x9B */ + 0xC6, 0xE6, 0xBC, 0x6E, 0xBC, 0x6F, 0xBC, 0x70, /* 0x9C-0x9F */ + 0xC6, 0xE7, 0xBC, 0x71, 0xBC, 0x72, 0xBC, 0x73, /* 0xA0-0xA3 */ + 0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, 0xBC, 0x77, /* 0xA4-0xA7 */ + 0xC6, 0xE8, 0xC6, 0xE9, 0xBC, 0x78, 0xC6, 0xEA, /* 0xA8-0xAB */ + 0xBC, 0x79, 0xC6, 0xEB, 0xBC, 0x7A, 0xBC, 0x81, /* 0xAC-0xAF */ + 0xBC, 0x82, 0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, /* 0xB0-0xB3 */ + 0xC6, 0xEC, 0xBC, 0x86, 0xBC, 0x87, 0xBC, 0x88, /* 0xB4-0xB7 */ + 0xC6, 0xED, 0xBC, 0x89, 0xBC, 0x8A, 0xBC, 0x8B, /* 0xB8-0xBB */ + 0xC6, 0xEE, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0xBC-0xBF */ + 0xBC, 0x8F, 0xBC, 0x90, 0xBC, 0x91, 0xBC, 0x92, /* 0xC0-0xC3 */ + 0xC6, 0xEF, 0xC6, 0xF0, 0xBC, 0x93, 0xBC, 0x94, /* 0xC4-0xC7 */ + 0xC6, 0xF1, 0xC6, 0xF2, 0xBC, 0x95, 0xBC, 0x96, /* 0xC8-0xCB */ + 0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xBC, 0x9A, /* 0xCC-0xCF */ + 0xC6, 0xF3, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0xD0-0xD3 */ + 0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x41, /* 0xD4-0xD7 */ + 0xC6, 0xF4, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0xD8-0xDB */ + 0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0xDC-0xDF */ + 0xBD, 0x49, 0xC6, 0xF5, 0xBD, 0x4A, 0xC6, 0xF6, /* 0xE0-0xE3 */ + 0xBD, 0x4B, 0xBD, 0x4C, 0xBD, 0x4D, 0xBD, 0x4E, /* 0xE4-0xE7 */ + 0xBD, 0x4F, 0xBD, 0x50, 0xBD, 0x51, 0xBD, 0x52, /* 0xE8-0xEB */ + 0xC6, 0xF7, 0xC6, 0xF8, 0xBD, 0x53, 0xBD, 0x54, /* 0xEC-0xEF */ + 0xC6, 0xF9, 0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, /* 0xF0-0xF3 */ + 0xC6, 0xFA, 0xBD, 0x58, 0xBD, 0x59, 0xBD, 0x5A, /* 0xF4-0xF7 */ + 0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0xF8-0xFB */ + 0xC6, 0xFB, 0xC6, 0xFC, 0xBD, 0x65, 0xC6, 0xFD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D4[512] = { + 0xBD, 0x66, 0xC6, 0xFE, 0xBD, 0x67, 0xBD, 0x68, /* 0x00-0x03 */ + 0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x04-0x07 */ + 0xC7, 0xA1, 0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, /* 0x08-0x0B */ + 0xBD, 0x70, 0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, /* 0x0C-0x0F */ + 0xBD, 0x74, 0xBD, 0x75, 0xBD, 0x76, 0xBD, 0x77, /* 0x10-0x13 */ + 0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x81, /* 0x14-0x17 */ + 0xBD, 0x82, 0xBD, 0x83, 0xBD, 0x84, 0xBD, 0x85, /* 0x18-0x1B */ + 0xBD, 0x86, 0xC7, 0xA2, 0xBD, 0x87, 0xBD, 0x88, /* 0x1C-0x1F */ + 0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, 0xBD, 0x8C, /* 0x20-0x23 */ + 0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, 0xBD, 0x90, /* 0x24-0x27 */ + 0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, 0xBD, 0x94, /* 0x28-0x2B */ + 0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, 0xBD, 0x98, /* 0x2C-0x2F */ + 0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, 0xBD, 0x9C, /* 0x30-0x33 */ + 0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, 0xBD, 0xA0, /* 0x34-0x37 */ + 0xBE, 0x41, 0xBE, 0x42, 0xBE, 0x43, 0xBE, 0x44, /* 0x38-0x3B */ + 0xBE, 0x45, 0xBE, 0x46, 0xBE, 0x47, 0xBE, 0x48, /* 0x3C-0x3F */ + 0xC7, 0xA3, 0xBE, 0x49, 0xBE, 0x4A, 0xBE, 0x4B, /* 0x40-0x43 */ + 0xC7, 0xA4, 0xBE, 0x4C, 0xBE, 0x4D, 0xBE, 0x4E, /* 0x44-0x47 */ + 0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, 0xBE, 0x52, /* 0x48-0x4B */ + 0xBE, 0x53, 0xBE, 0x54, 0xBE, 0x55, 0xBE, 0x56, /* 0x4C-0x4F */ + 0xBE, 0x57, 0xBE, 0x58, 0xBE, 0x59, 0xBE, 0x5A, /* 0x50-0x53 */ + 0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0x54-0x57 */ + 0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0x58-0x5B */ + 0xC7, 0xA5, 0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, /* 0x5C-0x5F */ + 0xC7, 0xA6, 0xBE, 0x6C, 0xBE, 0x6D, 0xBE, 0x6E, /* 0x60-0x63 */ + 0xC7, 0xA7, 0xBE, 0x6F, 0xBE, 0x70, 0xBE, 0x71, /* 0x64-0x67 */ + 0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, 0xBE, 0x75, /* 0x68-0x6B */ + 0xBE, 0x76, 0xC7, 0xA8, 0xBE, 0x77, 0xC7, 0xA9, /* 0x6C-0x6F */ + 0xBE, 0x78, 0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x81, /* 0x70-0x73 */ + 0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0x74-0x77 */ + 0xC7, 0xAA, 0xC7, 0xAB, 0xBE, 0x86, 0xBE, 0x87, /* 0x78-0x7B */ + 0xC7, 0xAC, 0xBE, 0x88, 0xBE, 0x89, 0xC7, 0xAD, /* 0x7C-0x7F */ + + 0xC7, 0xAE, 0xBE, 0x8A, 0xC7, 0xAF, 0xBE, 0x8B, /* 0x80-0x83 */ + 0xBE, 0x8C, 0xBE, 0x8D, 0xBE, 0x8E, 0xBE, 0x8F, /* 0x84-0x87 */ + 0xC7, 0xB0, 0xC7, 0xB1, 0xBE, 0x90, 0xC7, 0xB2, /* 0x88-0x8B */ + 0xBE, 0x91, 0xC7, 0xB3, 0xBE, 0x92, 0xBE, 0x93, /* 0x8C-0x8F */ + 0xBE, 0x94, 0xBE, 0x95, 0xBE, 0x96, 0xBE, 0x97, /* 0x90-0x93 */ + 0xC7, 0xB4, 0xBE, 0x98, 0xBE, 0x99, 0xBE, 0x9A, /* 0x94-0x97 */ + 0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, 0xBE, 0x9E, /* 0x98-0x9B */ + 0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x41, 0xBF, 0x42, /* 0x9C-0x9F */ + 0xBF, 0x43, 0xBF, 0x44, 0xBF, 0x45, 0xBF, 0x46, /* 0xA0-0xA3 */ + 0xBF, 0x47, 0xBF, 0x48, 0xBF, 0x49, 0xBF, 0x4A, /* 0xA4-0xA7 */ + 0xBF, 0x4B, 0xC7, 0xB5, 0xBF, 0x4C, 0xBF, 0x4D, /* 0xA8-0xAB */ + 0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, 0xBF, 0x51, /* 0xAC-0xAF */ + 0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, 0xBF, 0x55, /* 0xB0-0xB3 */ + 0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, 0xBF, 0x59, /* 0xB4-0xB7 */ + 0xBF, 0x5A, 0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, /* 0xB8-0xBB */ + 0xBF, 0x64, 0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, /* 0xBC-0xBF */ + 0xBF, 0x68, 0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, /* 0xC0-0xC3 */ + 0xBF, 0x6C, 0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, /* 0xC4-0xC7 */ + 0xBF, 0x70, 0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, /* 0xC8-0xCB */ + 0xC7, 0xB6, 0xBF, 0x74, 0xBF, 0x75, 0xBF, 0x76, /* 0xCC-0xCF */ + 0xC7, 0xB7, 0xBF, 0x77, 0xBF, 0x78, 0xBF, 0x79, /* 0xD0-0xD3 */ + 0xC7, 0xB8, 0xBF, 0x7A, 0xBF, 0x81, 0xBF, 0x82, /* 0xD4-0xD7 */ + 0xBF, 0x83, 0xBF, 0x84, 0xBF, 0x85, 0xBF, 0x86, /* 0xD8-0xDB */ + 0xC7, 0xB9, 0xBF, 0x87, 0xBF, 0x88, 0xC7, 0xBA, /* 0xDC-0xDF */ + 0xBF, 0x89, 0xBF, 0x8A, 0xBF, 0x8B, 0xBF, 0x8C, /* 0xE0-0xE3 */ + 0xBF, 0x8D, 0xBF, 0x8E, 0xBF, 0x8F, 0xBF, 0x90, /* 0xE4-0xE7 */ + 0xC7, 0xBB, 0xBF, 0x91, 0xBF, 0x92, 0xBF, 0x93, /* 0xE8-0xEB */ + 0xC7, 0xBC, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0xEC-0xEF */ + 0xC7, 0xBD, 0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, /* 0xF0-0xF3 */ + 0xBF, 0x9A, 0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, /* 0xF4-0xF7 */ + 0xC7, 0xBE, 0xBF, 0x9E, 0xBF, 0x9F, 0xC7, 0xBF, /* 0xF8-0xFB */ + 0xBF, 0xA0, 0xC7, 0xC0, 0xC0, 0x41, 0xC0, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D5[512] = { + 0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, 0xC0, 0x46, /* 0x00-0x03 */ + 0xC7, 0xC1, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x04-0x07 */ + 0xC7, 0xC2, 0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, /* 0x08-0x0B */ + 0xC7, 0xC3, 0xC0, 0x4D, 0xC0, 0x4E, 0xC0, 0x4F, /* 0x0C-0x0F */ + 0xC0, 0x50, 0xC0, 0x51, 0xC0, 0x52, 0xC0, 0x53, /* 0x10-0x13 */ + 0xC7, 0xC4, 0xC7, 0xC5, 0xC0, 0x54, 0xC7, 0xC6, /* 0x14-0x17 */ + 0xC0, 0x55, 0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, /* 0x18-0x1B */ + 0xC0, 0x59, 0xC0, 0x5A, 0xC0, 0x61, 0xC0, 0x62, /* 0x1C-0x1F */ + 0xC0, 0x63, 0xC0, 0x64, 0xC0, 0x65, 0xC0, 0x66, /* 0x20-0x23 */ + 0xC0, 0x67, 0xC0, 0x68, 0xC0, 0x69, 0xC0, 0x6A, /* 0x24-0x27 */ + 0xC0, 0x6B, 0xC0, 0x6C, 0xC0, 0x6D, 0xC0, 0x6E, /* 0x28-0x2B */ + 0xC0, 0x6F, 0xC0, 0x70, 0xC0, 0x71, 0xC0, 0x72, /* 0x2C-0x2F */ + 0xC0, 0x73, 0xC0, 0x74, 0xC0, 0x75, 0xC0, 0x76, /* 0x30-0x33 */ + 0xC0, 0x77, 0xC0, 0x78, 0xC0, 0x79, 0xC0, 0x7A, /* 0x34-0x37 */ + 0xC0, 0x81, 0xC0, 0x82, 0xC0, 0x83, 0xC0, 0x84, /* 0x38-0x3B */ + 0xC7, 0xC7, 0xC7, 0xC8, 0xC0, 0x85, 0xC0, 0x86, /* 0x3C-0x3F */ + 0xC7, 0xC9, 0xC0, 0x87, 0xC0, 0x88, 0xC0, 0x89, /* 0x40-0x43 */ + 0xC7, 0xCA, 0xC0, 0x8A, 0xC0, 0x8B, 0xC0, 0x8C, /* 0x44-0x47 */ + 0xC0, 0x8D, 0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, /* 0x48-0x4B */ + 0xC7, 0xCB, 0xC7, 0xCC, 0xC0, 0x91, 0xC7, 0xCD, /* 0x4C-0x4F */ + 0xC0, 0x92, 0xC7, 0xCE, 0xC0, 0x93, 0xC0, 0x94, /* 0x50-0x53 */ + 0xC0, 0x95, 0xC0, 0x96, 0xC0, 0x97, 0xC0, 0x98, /* 0x54-0x57 */ + 0xC7, 0xCF, 0xC7, 0xD0, 0xC0, 0x99, 0xC0, 0x9A, /* 0x58-0x5B */ + 0xC7, 0xD1, 0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, /* 0x5C-0x5F */ + 0xC7, 0xD2, 0xC0, 0x9E, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x60-0x63 */ + 0xC1, 0x41, 0xC7, 0xD3, 0xC1, 0x42, 0xC1, 0x43, /* 0x64-0x67 */ + 0xC7, 0xD4, 0xC7, 0xD5, 0xC1, 0x44, 0xC7, 0xD6, /* 0x68-0x6B */ + 0xC1, 0x45, 0xC7, 0xD7, 0xC1, 0x46, 0xC1, 0x47, /* 0x6C-0x6F */ + 0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x70-0x73 */ + 0xC7, 0xD8, 0xC7, 0xD9, 0xC1, 0x4C, 0xC1, 0x4D, /* 0x74-0x77 */ + 0xC7, 0xDA, 0xC1, 0x4E, 0xC1, 0x4F, 0xC1, 0x50, /* 0x78-0x7B */ + 0xC7, 0xDB, 0xC1, 0x51, 0xC1, 0x52, 0xC1, 0x53, /* 0x7C-0x7F */ + + 0xC1, 0x54, 0xC1, 0x55, 0xC1, 0x56, 0xC1, 0x57, /* 0x80-0x83 */ + 0xC7, 0xDC, 0xC7, 0xDD, 0xC1, 0x58, 0xC7, 0xDE, /* 0x84-0x87 */ + 0xC7, 0xDF, 0xC7, 0xE0, 0xC1, 0x59, 0xC1, 0x5A, /* 0x88-0x8B */ + 0xC1, 0x61, 0xC1, 0x62, 0xC1, 0x63, 0xC1, 0x64, /* 0x8C-0x8F */ + 0xC7, 0xE1, 0xC1, 0x65, 0xC1, 0x66, 0xC1, 0x67, /* 0x90-0x93 */ + 0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, 0xC1, 0x6B, /* 0x94-0x97 */ + 0xC1, 0x6C, 0xC1, 0x6D, 0xC1, 0x6E, 0xC1, 0x6F, /* 0x98-0x9B */ + 0xC1, 0x70, 0xC1, 0x71, 0xC1, 0x72, 0xC1, 0x73, /* 0x9C-0x9F */ + 0xC1, 0x74, 0xC1, 0x75, 0xC1, 0x76, 0xC1, 0x77, /* 0xA0-0xA3 */ + 0xC1, 0x78, 0xC7, 0xE2, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA4-0xA7 */ + 0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xA8-0xAB */ + 0xC1, 0x85, 0xC1, 0x86, 0xC1, 0x87, 0xC1, 0x88, /* 0xAC-0xAF */ + 0xC1, 0x89, 0xC1, 0x8A, 0xC1, 0x8B, 0xC1, 0x8C, /* 0xB0-0xB3 */ + 0xC1, 0x8D, 0xC1, 0x8E, 0xC1, 0x8F, 0xC1, 0x90, /* 0xB4-0xB7 */ + 0xC1, 0x91, 0xC1, 0x92, 0xC1, 0x93, 0xC1, 0x94, /* 0xB8-0xBB */ + 0xC1, 0x95, 0xC1, 0x96, 0xC1, 0x97, 0xC1, 0x98, /* 0xBC-0xBF */ + 0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, 0xC1, 0x9C, /* 0xC0-0xC3 */ + 0xC1, 0x9D, 0xC1, 0x9E, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xC4-0xC7 */ + 0xC7, 0xE3, 0xC7, 0xE4, 0xC2, 0x41, 0xC2, 0x42, /* 0xC8-0xCB */ + 0xC7, 0xE5, 0xC2, 0x43, 0xC2, 0x44, 0xC2, 0x45, /* 0xCC-0xCF */ + 0xC7, 0xE6, 0xC2, 0x46, 0xC7, 0xE7, 0xC2, 0x47, /* 0xD0-0xD3 */ + 0xC2, 0x48, 0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, /* 0xD4-0xD7 */ + 0xC7, 0xE8, 0xC7, 0xE9, 0xC2, 0x4C, 0xC7, 0xEA, /* 0xD8-0xDB */ + 0xC2, 0x4D, 0xC7, 0xEB, 0xC2, 0x4E, 0xC2, 0x4F, /* 0xDC-0xDF */ + 0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, 0xC2, 0x53, /* 0xE0-0xE3 */ + 0xC7, 0xEC, 0xC7, 0xED, 0xC2, 0x54, 0xC2, 0x55, /* 0xE4-0xE7 */ + 0xC7, 0xEE, 0xC2, 0x56, 0xC2, 0x57, 0xC2, 0x58, /* 0xE8-0xEB */ + 0xC7, 0xEF, 0xC2, 0x59, 0xC2, 0x5A, 0xC2, 0x61, /* 0xEC-0xEF */ + 0xC2, 0x62, 0xC2, 0x63, 0xC2, 0x64, 0xC2, 0x65, /* 0xF0-0xF3 */ + 0xC7, 0xF0, 0xC7, 0xF1, 0xC2, 0x66, 0xC7, 0xF2, /* 0xF4-0xF7 */ + 0xC2, 0x67, 0xC7, 0xF3, 0xC2, 0x68, 0xC2, 0x69, /* 0xF8-0xFB */ + 0xC2, 0x6A, 0xC2, 0x6B, 0xC2, 0x6C, 0xC2, 0x6D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D6[512] = { + 0xC7, 0xF4, 0xC7, 0xF5, 0xC2, 0x6E, 0xC2, 0x6F, /* 0x00-0x03 */ + 0xC7, 0xF6, 0xC2, 0x70, 0xC2, 0x71, 0xC2, 0x72, /* 0x04-0x07 */ + 0xC7, 0xF7, 0xC2, 0x73, 0xC2, 0x74, 0xC2, 0x75, /* 0x08-0x0B */ + 0xC2, 0x76, 0xC2, 0x77, 0xC2, 0x78, 0xC2, 0x79, /* 0x0C-0x0F */ + 0xC7, 0xF8, 0xC7, 0xF9, 0xC2, 0x7A, 0xC7, 0xFA, /* 0x10-0x13 */ + 0xC7, 0xFB, 0xC7, 0xFC, 0xC2, 0x81, 0xC2, 0x82, /* 0x14-0x17 */ + 0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, 0xC2, 0x86, /* 0x18-0x1B */ + 0xC7, 0xFD, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x1C-0x1F */ + 0xC7, 0xFE, 0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, /* 0x20-0x23 */ + 0xC8, 0xA1, 0xC2, 0x8D, 0xC2, 0x8E, 0xC2, 0x8F, /* 0x24-0x27 */ + 0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x28-0x2B */ + 0xC2, 0x94, 0xC8, 0xA2, 0xC2, 0x95, 0xC2, 0x96, /* 0x2C-0x2F */ + 0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x30-0x33 */ + 0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x34-0x37 */ + 0xC8, 0xA3, 0xC8, 0xA4, 0xC2, 0x9F, 0xC2, 0xA0, /* 0x38-0x3B */ + 0xC8, 0xA5, 0xC3, 0x41, 0xC3, 0x42, 0xC3, 0x43, /* 0x3C-0x3F */ + 0xC8, 0xA6, 0xC3, 0x44, 0xC3, 0x45, 0xC3, 0x46, /* 0x40-0x43 */ + 0xC3, 0x47, 0xC8, 0xA7, 0xC3, 0x48, 0xC3, 0x49, /* 0x44-0x47 */ + 0xC8, 0xA8, 0xC8, 0xA9, 0xC3, 0x4A, 0xC8, 0xAA, /* 0x48-0x4B */ + 0xC3, 0x4B, 0xC8, 0xAB, 0xC3, 0x4C, 0xC3, 0x4D, /* 0x4C-0x4F */ + 0xC3, 0x4E, 0xC8, 0xAC, 0xC3, 0x4F, 0xC3, 0x50, /* 0x50-0x53 */ + 0xC8, 0xAD, 0xC8, 0xAE, 0xC3, 0x51, 0xC3, 0x52, /* 0x54-0x57 */ + 0xC8, 0xAF, 0xC3, 0x53, 0xC3, 0x54, 0xC3, 0x55, /* 0x58-0x5B */ + 0xC8, 0xB0, 0xC3, 0x56, 0xC3, 0x57, 0xC3, 0x58, /* 0x5C-0x5F */ + 0xC3, 0x59, 0xC3, 0x5A, 0xC3, 0x61, 0xC3, 0x62, /* 0x60-0x63 */ + 0xC3, 0x63, 0xC3, 0x64, 0xC3, 0x65, 0xC8, 0xB1, /* 0x64-0x67 */ + 0xC3, 0x66, 0xC8, 0xB2, 0xC3, 0x67, 0xC3, 0x68, /* 0x68-0x6B */ + 0xC3, 0x69, 0xC3, 0x6A, 0xC3, 0x6B, 0xC3, 0x6C, /* 0x6C-0x6F */ + 0xC8, 0xB3, 0xC8, 0xB4, 0xC3, 0x6D, 0xC3, 0x6E, /* 0x70-0x73 */ + 0xC8, 0xB5, 0xC3, 0x6F, 0xC3, 0x70, 0xC3, 0x71, /* 0x74-0x77 */ + 0xC3, 0x72, 0xC3, 0x73, 0xC3, 0x74, 0xC3, 0x75, /* 0x78-0x7B */ + 0xC3, 0x76, 0xC3, 0x77, 0xC3, 0x78, 0xC3, 0x79, /* 0x7C-0x7F */ + + 0xC3, 0x7A, 0xC3, 0x81, 0xC3, 0x82, 0xC8, 0xB6, /* 0x80-0x83 */ + 0xC3, 0x83, 0xC8, 0xB7, 0xC3, 0x84, 0xC3, 0x85, /* 0x84-0x87 */ + 0xC3, 0x86, 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, /* 0x88-0x8B */ + 0xC8, 0xB8, 0xC8, 0xB9, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x8C-0x8F */ + 0xC8, 0xBA, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, /* 0x90-0x93 */ + 0xC8, 0xBB, 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, /* 0x94-0x97 */ + 0xC3, 0x92, 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, /* 0x98-0x9B */ + 0xC3, 0x96, 0xC8, 0xBC, 0xC3, 0x97, 0xC8, 0xBD, /* 0x9C-0x9F */ + 0xC3, 0x98, 0xC8, 0xBE, 0xC3, 0x99, 0xC3, 0x9A, /* 0xA0-0xA3 */ + 0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, /* 0xA4-0xA7 */ + 0xC8, 0xBF, 0xC3, 0x9F, 0xC3, 0xA0, 0xC4, 0x41, /* 0xA8-0xAB */ + 0xC8, 0xC0, 0xC4, 0x42, 0xC4, 0x43, 0xC4, 0x44, /* 0xAC-0xAF */ + 0xC8, 0xC1, 0xC4, 0x45, 0xC4, 0x46, 0xC4, 0x47, /* 0xB0-0xB3 */ + 0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, 0xC4, 0x4B, /* 0xB4-0xB7 */ + 0xC4, 0x4C, 0xC8, 0xC2, 0xC4, 0x4D, 0xC8, 0xC3, /* 0xB8-0xBB */ + 0xC4, 0x4E, 0xC4, 0x4F, 0xC4, 0x50, 0xC4, 0x51, /* 0xBC-0xBF */ + 0xC4, 0x52, 0xC4, 0x53, 0xC4, 0x54, 0xC4, 0x55, /* 0xC0-0xC3 */ + 0xC8, 0xC4, 0xC8, 0xC5, 0xC4, 0x56, 0xC4, 0x57, /* 0xC4-0xC7 */ + 0xC8, 0xC6, 0xC4, 0x58, 0xC4, 0x59, 0xC4, 0x5A, /* 0xC8-0xCB */ + 0xC8, 0xC7, 0xC4, 0x61, 0xC4, 0x62, 0xC4, 0x63, /* 0xCC-0xCF */ + 0xC4, 0x64, 0xC8, 0xC8, 0xC4, 0x65, 0xC4, 0x66, /* 0xD0-0xD3 */ + 0xC8, 0xC9, 0xC4, 0x67, 0xC4, 0x68, 0xC8, 0xCA, /* 0xD4-0xD7 */ + 0xC4, 0x69, 0xC8, 0xCB, 0xC4, 0x6A, 0xC4, 0x6B, /* 0xD8-0xDB */ + 0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xC4, 0x6F, /* 0xDC-0xDF */ + 0xC8, 0xCC, 0xC4, 0x70, 0xC4, 0x71, 0xC4, 0x72, /* 0xE0-0xE3 */ + 0xC8, 0xCD, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0xE4-0xE7 */ + 0xC8, 0xCE, 0xC4, 0x76, 0xC4, 0x77, 0xC4, 0x78, /* 0xE8-0xEB */ + 0xC4, 0x79, 0xC4, 0x7A, 0xC4, 0x81, 0xC4, 0x82, /* 0xEC-0xEF */ + 0xC8, 0xCF, 0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, /* 0xF0-0xF3 */ + 0xC4, 0x86, 0xC8, 0xD0, 0xC4, 0x87, 0xC4, 0x88, /* 0xF4-0xF7 */ + 0xC4, 0x89, 0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, /* 0xF8-0xFB */ + 0xC8, 0xD1, 0xC8, 0xD2, 0xC4, 0x8D, 0xC4, 0x8E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D7[512] = { + 0xC8, 0xD3, 0xC4, 0x8F, 0xC4, 0x90, 0xC4, 0x91, /* 0x00-0x03 */ + 0xC8, 0xD4, 0xC4, 0x92, 0xC4, 0x93, 0xC4, 0x94, /* 0x04-0x07 */ + 0xC4, 0x95, 0xC4, 0x96, 0xC4, 0x97, 0xC4, 0x98, /* 0x08-0x0B */ + 0xC4, 0x99, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0x0C-0x0F */ + 0xC4, 0x9D, 0xC8, 0xD5, 0xC4, 0x9E, 0xC4, 0x9F, /* 0x10-0x13 */ + 0xC4, 0xA0, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0x14-0x17 */ + 0xC8, 0xD6, 0xC8, 0xD7, 0xC5, 0x44, 0xC5, 0x45, /* 0x18-0x1B */ + 0xC8, 0xD8, 0xC5, 0x46, 0xC5, 0x47, 0xC5, 0x48, /* 0x1C-0x1F */ + 0xC8, 0xD9, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0x20-0x23 */ + 0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xC5, 0x4F, /* 0x24-0x27 */ + 0xC8, 0xDA, 0xC8, 0xDB, 0xC5, 0x50, 0xC8, 0xDC, /* 0x28-0x2B */ + 0xC5, 0x51, 0xC8, 0xDD, 0xC5, 0x52, 0xC5, 0x53, /* 0x2C-0x2F */ + 0xC5, 0x54, 0xC5, 0x55, 0xC5, 0x56, 0xC5, 0x57, /* 0x30-0x33 */ + 0xC8, 0xDE, 0xC8, 0xDF, 0xC5, 0x58, 0xC5, 0x59, /* 0x34-0x37 */ + 0xC8, 0xE0, 0xC5, 0x5A, 0xC5, 0x61, 0xC5, 0x62, /* 0x38-0x3B */ + 0xC8, 0xE1, 0xC5, 0x63, 0xC5, 0x64, 0xC5, 0x65, /* 0x3C-0x3F */ + 0xC5, 0x66, 0xC5, 0x67, 0xC5, 0x68, 0xC5, 0x69, /* 0x40-0x43 */ + 0xC8, 0xE2, 0xC5, 0x6A, 0xC5, 0x6B, 0xC8, 0xE3, /* 0x44-0x47 */ + 0xC5, 0x6C, 0xC8, 0xE4, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x48-0x4B */ + 0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xC5, 0x72, /* 0x4C-0x4F */ + 0xC8, 0xE5, 0xC8, 0xE6, 0xC5, 0x73, 0xC5, 0x74, /* 0x50-0x53 */ + 0xC8, 0xE7, 0xC5, 0x75, 0xC8, 0xE8, 0xC8, 0xE9, /* 0x54-0x57 */ + 0xC8, 0xEA, 0xC8, 0xEB, 0xC5, 0x76, 0xC5, 0x77, /* 0x58-0x5B */ + 0xC5, 0x78, 0xC5, 0x79, 0xC5, 0x7A, 0xC5, 0x81, /* 0x5C-0x5F */ + 0xC8, 0xEC, 0xC8, 0xED, 0xC5, 0x82, 0xC8, 0xEE, /* 0x60-0x63 */ + 0xC5, 0x83, 0xC8, 0xEF, 0xC5, 0x84, 0xC5, 0x85, /* 0x64-0x67 */ + 0xC5, 0x86, 0xC8, 0xF0, 0xC5, 0x87, 0xC5, 0x88, /* 0x68-0x6B */ + 0xC8, 0xF1, 0xC5, 0x89, 0xC5, 0x8A, 0xC5, 0x8B, /* 0x6C-0x6F */ + 0xC8, 0xF2, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x70-0x73 */ + 0xC8, 0xF3, 0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, /* 0x74-0x77 */ + 0xC5, 0x92, 0xC5, 0x93, 0xC5, 0x94, 0xC5, 0x95, /* 0x78-0x7B */ + 0xC8, 0xF4, 0xC8, 0xF5, 0xC5, 0x96, 0xC5, 0x97, /* 0x7C-0x7F */ + + 0xC5, 0x98, 0xC8, 0xF6, 0xC5, 0x99, 0xC5, 0x9A, /* 0x80-0x83 */ + 0xC5, 0x9B, 0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, /* 0x84-0x87 */ + 0xC8, 0xF7, 0xC8, 0xF8, 0xC5, 0x9F, 0xC5, 0xA0, /* 0x88-0x8B */ + 0xC8, 0xF9, 0xC6, 0x41, 0xC6, 0x42, 0xC6, 0x43, /* 0x8C-0x8F */ + 0xC8, 0xFA, 0xC6, 0x44, 0xC6, 0x45, 0xC6, 0x46, /* 0x90-0x93 */ + 0xC6, 0x47, 0xC6, 0x48, 0xC6, 0x49, 0xC6, 0x4A, /* 0x94-0x97 */ + 0xC8, 0xFB, 0xC8, 0xFC, 0xC6, 0x4B, 0xC8, 0xFD, /* 0x98-0x9B */ + 0xC6, 0x4C, 0xC8, 0xFE, 0xC6, 0x4D, 0xC6, 0x4E, /* 0x9C-0x9F */ + 0xC6, 0x4F, 0xC6, 0x50, 0xC6, 0x51, 0xC6, 0x52, /* 0xA0-0xA3 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xCB, 0xD0, 0xCB, 0xD6, 0xCB, 0xE7, 0xCD, 0xCF, /* 0x00-0x03 */ + 0xCD, 0xE8, 0xCE, 0xAD, 0xCF, 0xFB, 0xD0, 0xA2, /* 0x04-0x07 */ + 0xD0, 0xB8, 0xD0, 0xD0, 0xD0, 0xDD, 0xD1, 0xD4, /* 0x08-0x0B */ + 0xD1, 0xD5, 0xD1, 0xD8, 0xD1, 0xDB, 0xD1, 0xDC, /* 0x0C-0x0F */ + 0xD1, 0xDD, 0xD1, 0xDE, 0xD1, 0xDF, 0xD1, 0xE0, /* 0x10-0x13 */ + 0xD1, 0xE2, 0xD1, 0xE3, 0xD1, 0xE4, 0xD1, 0xE5, /* 0x14-0x17 */ + 0xD1, 0xE6, 0xD1, 0xE8, 0xD1, 0xE9, 0xD1, 0xEA, /* 0x18-0x1B */ + 0xD1, 0xEB, 0xD1, 0xED, 0xD1, 0xEF, 0xD1, 0xF0, /* 0x1C-0x1F */ + 0xD1, 0xF2, 0xD1, 0xF6, 0xD1, 0xFA, 0xD1, 0xFC, /* 0x20-0x23 */ + 0xD1, 0xFD, 0xD1, 0xFE, 0xD2, 0xA2, 0xD2, 0xA3, /* 0x24-0x27 */ + 0xD2, 0xA7, 0xD2, 0xA8, 0xD2, 0xA9, 0xD2, 0xAA, /* 0x28-0x2B */ + 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, 0xB2, 0xD2, 0xBE, /* 0x2C-0x2F */ + 0xD2, 0xC2, 0xD2, 0xC3, 0xD2, 0xC4, 0xD2, 0xC6, /* 0x30-0x33 */ + 0xD2, 0xC7, 0xD2, 0xC8, 0xD2, 0xC9, 0xD2, 0xCA, /* 0x34-0x37 */ + 0xD2, 0xCB, 0xD2, 0xCD, 0xD2, 0xCE, 0xD2, 0xCF, /* 0x38-0x3B */ + 0xD2, 0xD0, 0xD2, 0xD1, 0xD2, 0xD2, 0xD2, 0xD3, /* 0x3C-0x3F */ + 0xD2, 0xD4, 0xD2, 0xD5, 0xD2, 0xD6, 0xD2, 0xD7, /* 0x40-0x43 */ + 0xD2, 0xD9, 0xD2, 0xDA, 0xD2, 0xDE, 0xD2, 0xDF, /* 0x44-0x47 */ + 0xD2, 0xE1, 0xD2, 0xE2, 0xD2, 0xE4, 0xD2, 0xE5, /* 0x48-0x4B */ + 0xD2, 0xE6, 0xD2, 0xE7, 0xD2, 0xE8, 0xD2, 0xE9, /* 0x4C-0x4F */ + 0xD2, 0xEA, 0xD2, 0xEB, 0xD2, 0xF0, 0xD2, 0xF1, /* 0x50-0x53 */ + 0xD2, 0xF2, 0xD2, 0xF3, 0xD2, 0xF4, 0xD2, 0xF5, /* 0x54-0x57 */ + 0xD2, 0xF7, 0xD2, 0xF8, 0xD4, 0xE6, 0xD4, 0xFC, /* 0x58-0x5B */ + 0xD5, 0xA5, 0xD5, 0xAB, 0xD5, 0xAE, 0xD6, 0xB8, /* 0x5C-0x5F */ + 0xD6, 0xCD, 0xD7, 0xCB, 0xD7, 0xE4, 0xDB, 0xC5, /* 0x60-0x63 */ + 0xDB, 0xE4, 0xDC, 0xA5, 0xDD, 0xA5, 0xDD, 0xD5, /* 0x64-0x67 */ + 0xDD, 0xF4, 0xDE, 0xFC, 0xDE, 0xFE, 0xDF, 0xB3, /* 0x68-0x6B */ + 0xDF, 0xE1, 0xDF, 0xE8, 0xE0, 0xF1, 0xE1, 0xAD, /* 0x6C-0x6F */ + 0xE1, 0xED, 0xE3, 0xF5, 0xE4, 0xA1, 0xE4, 0xA9, /* 0x70-0x73 */ + 0xE5, 0xAE, 0xE5, 0xB1, 0xE5, 0xB2, 0xE5, 0xB9, /* 0x74-0x77 */ + 0xE5, 0xBB, 0xE5, 0xBC, 0xE5, 0xC4, 0xE5, 0xCE, /* 0x78-0x7B */ + 0xE5, 0xD0, 0xE5, 0xD2, 0xE5, 0xD6, 0xE5, 0xFA, /* 0x7C-0x7F */ + + 0xE5, 0xFB, 0xE5, 0xFC, 0xE5, 0xFE, 0xE6, 0xA1, /* 0x80-0x83 */ + 0xE6, 0xA4, 0xE6, 0xA7, 0xE6, 0xAD, 0xE6, 0xAF, /* 0x84-0x87 */ + 0xE6, 0xB0, 0xE6, 0xB1, 0xE6, 0xB3, 0xE6, 0xB7, /* 0x88-0x8B */ + 0xE6, 0xB8, 0xE6, 0xBC, 0xE6, 0xC4, 0xE6, 0xC6, /* 0x8C-0x8F */ + 0xE6, 0xC7, 0xE6, 0xCA, 0xE6, 0xD2, 0xE6, 0xD6, /* 0x90-0x93 */ + 0xE6, 0xD9, 0xE6, 0xDC, 0xE6, 0xDF, 0xE6, 0xE1, /* 0x94-0x97 */ + 0xE6, 0xE4, 0xE6, 0xE5, 0xE6, 0xE6, 0xE6, 0xE8, /* 0x98-0x9B */ + 0xE6, 0xEA, 0xE6, 0xEB, 0xE6, 0xEC, 0xE6, 0xEF, /* 0x9C-0x9F */ + 0xE6, 0xF1, 0xE6, 0xF2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */ + 0xE6, 0xF7, 0xE6, 0xF9, 0xE7, 0xA1, 0xE7, 0xA6, /* 0xA4-0xA7 */ + 0xE7, 0xA9, 0xE7, 0xAA, 0xE7, 0xAC, 0xE7, 0xAD, /* 0xA8-0xAB */ + 0xE7, 0xB0, 0xE7, 0xBF, 0xE7, 0xC1, 0xE7, 0xC6, /* 0xAC-0xAF */ + 0xE7, 0xC7, 0xE7, 0xCB, 0xE7, 0xCD, 0xE7, 0xCF, /* 0xB0-0xB3 */ + 0xE7, 0xD0, 0xE7, 0xD3, 0xE7, 0xDF, 0xE7, 0xE4, /* 0xB4-0xB7 */ + 0xE7, 0xE6, 0xE7, 0xF7, 0xE8, 0xE7, 0xE8, 0xE8, /* 0xB8-0xBB */ + 0xE8, 0xF0, 0xE8, 0xF1, 0xE8, 0xF7, 0xE8, 0xF9, /* 0xBC-0xBF */ + 0xE8, 0xFB, 0xE8, 0xFE, 0xE9, 0xA7, 0xE9, 0xAC, /* 0xC0-0xC3 */ + 0xE9, 0xCC, 0xE9, 0xF7, 0xEA, 0xC1, 0xEA, 0xE5, /* 0xC4-0xC7 */ + 0xEA, 0xF4, 0xEA, 0xF7, 0xEA, 0xFC, 0xEA, 0xFE, /* 0xC8-0xCB */ + 0xEB, 0xA4, 0xEB, 0xA7, 0xEB, 0xA9, 0xEB, 0xAA, /* 0xCC-0xCF */ + 0xEB, 0xBA, 0xEB, 0xBB, 0xEB, 0xBD, 0xEB, 0xC1, /* 0xD0-0xD3 */ + 0xEB, 0xC2, 0xEB, 0xC6, 0xEB, 0xC7, 0xEB, 0xCC, /* 0xD4-0xD7 */ + 0xEB, 0xCF, 0xEB, 0xD0, 0xEB, 0xD1, 0xEB, 0xD2, /* 0xD8-0xDB */ + 0xEB, 0xD8, 0xEC, 0xA6, 0xEC, 0xA7, 0xEC, 0xAA, /* 0xDC-0xDF */ + 0xEC, 0xAF, 0xEC, 0xB0, 0xEC, 0xB1, 0xEC, 0xB2, /* 0xE0-0xE3 */ + 0xEC, 0xB5, 0xEC, 0xB8, 0xEC, 0xBA, 0xEC, 0xC0, /* 0xE4-0xE7 */ + 0xEC, 0xC1, 0xEC, 0xC5, 0xEC, 0xC6, 0xEC, 0xC9, /* 0xE8-0xEB */ + 0xEC, 0xCA, 0xEC, 0xD5, 0xEC, 0xDD, 0xEC, 0xDE, /* 0xEC-0xEF */ + 0xEC, 0xE1, 0xEC, 0xE4, 0xEC, 0xE7, 0xEC, 0xE8, /* 0xF0-0xF3 */ + 0xEC, 0xF7, 0xEC, 0xF8, 0xEC, 0xFA, 0xED, 0xA1, /* 0xF4-0xF7 */ + 0xED, 0xA2, 0xED, 0xA3, 0xED, 0xEE, 0xEE, 0xDB, /* 0xF8-0xFB */ + 0xF2, 0xBD, 0xF2, 0xFA, 0xF3, 0xB1, 0xF4, 0xA7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xF4, 0xEE, 0xF6, 0xF4, 0xF6, 0xF6, 0xF7, 0xB8, /* 0x00-0x03 */ + 0xF7, 0xC8, 0xF7, 0xD3, 0xF8, 0xDB, 0xF8, 0xF0, /* 0x04-0x07 */ + 0xFA, 0xA1, 0xFA, 0xA2, 0xFA, 0xE6, 0xFC, 0xA9, /* 0x08-0x0B */ + 0xE8, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0xC0, 0x00, 0x00, 0xF4, 0xE7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xFD, 0xEB, 0x00, 0x00, 0xEC, 0xCC, /* 0x14-0x17 */ + 0x00, 0x00, 0xE3, 0xEA, 0xDF, 0xD4, 0xDC, 0xD8, /* 0x18-0x1B */ + 0xEF, 0xFE, 0xEF, 0xF1, 0xE9, 0xE2, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xB3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xEC, 0xEF, 0xD4, 0xB4, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xF9, 0xDE, 0xF8, /* 0x28-0x2B */ + 0xCE, 0xBD, 0xF9, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */ + 0xA3, 0xA4, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */ + 0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */ + 0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, /* 0x0C-0x0F */ + 0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */ + 0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */ + 0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */ + 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */ + 0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */ + 0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */ + 0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */ + 0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */ + 0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */ + 0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */ + 0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */ + 0xA1, 0xAC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */ + 0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */ + 0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */ + 0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */ + 0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */ + 0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */ + 0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */ + 0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */ + 0xA3, 0xFC, 0xA3, 0xFD, 0xA2, 0xA6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA4, 0xD4, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0xA0-0xA3 */ + 0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0xA4-0xA7 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0xA8-0xAB */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0xAC-0xAF */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0xB0-0xB3 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0xB4-0xB7 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0xB8-0xBB */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xBF, 0xA4, 0xC0, /* 0xC0-0xC3 */ + 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, 0xA4, 0xC4, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xC5, 0xA4, 0xC6, /* 0xC8-0xCB */ + 0xA4, 0xC7, 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xCB, 0xA4, 0xCC, /* 0xD0-0xD3 */ + 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, 0xA4, 0xD0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD1, 0xA4, 0xD2, /* 0xD8-0xDB */ + 0xA4, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA1, 0xCB, 0xA1, 0xCC, 0xA1, 0xFE, 0xA3, 0xFE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA1, 0xCD, 0xA3, 0xDC, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, u2c_01, u2c_02, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_11, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_AC, u2c_AD, u2c_AE, u2c_AF, + u2c_B0, u2c_B1, u2c_B2, u2c_B3, u2c_B4, u2c_B5, u2c_B6, u2c_B7, + u2c_B8, u2c_B9, u2c_BA, u2c_BB, u2c_BC, u2c_BD, u2c_BE, u2c_BF, + u2c_C0, u2c_C1, u2c_C2, u2c_C3, u2c_C4, u2c_C5, u2c_C6, u2c_C7, + u2c_C8, u2c_C9, u2c_CA, u2c_CB, u2c_CC, u2c_CD, u2c_CE, u2c_CF, + u2c_D0, u2c_D1, u2c_D2, u2c_D3, u2c_D4, u2c_D5, u2c_D6, u2c_D7, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, NULL, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen <= 1) + return -ENAMETOOLONG; + out[0] = uni2charset[cl*2]; + out[1] = uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + n = 2; + } else if (ch==0 && cl) { + out[0] = cl; + n = 1; + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (boundlen == 1) { + *uni = rawstring[0]; + return 1; + } + + ch = rawstring[0]; + cl = rawstring[1]; + + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + n = 2; + } else{ + *uni = ch; + n = 1; + } + return n; +} + +static struct nls_table table = { + .charset = "cp949", + .alias = "euc-kr", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp949(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp949(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp949) +module_exit(exit_nls_cp949) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(euc-kr); diff --git a/kernel/fs/nls/nls_cp950.c b/kernel/fs/nls/nls_cp950.c new file mode 100644 index 000000000..8e1418708 --- /dev/null +++ b/kernel/fs/nls/nls_cp950.c @@ -0,0 +1,9482 @@ +/* + * linux/fs/nls/nls_cp950.c + * + * Charset cp950 translation tables. + * This translation table was generated automatically, the + * original table can be download from the Microsoft website. + * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include +#include +#include +#include +#include + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3000,0xFF0C,0x3001,0x3002,0xFF0E,0x2027,0xFF1B,0xFF1A,/* 0x40-0x47 */ + 0xFF1F,0xFF01,0xFE30,0x2026,0x2025,0xFE50,0xFE51,0xFE52,/* 0x48-0x4F */ + 0x00B7,0xFE54,0xFE55,0xFE56,0xFE57,0xFF5C,0x2013,0xFE31,/* 0x50-0x57 */ + 0x2014,0xFE33,0x2574,0xFE34,0xFE4F,0xFF08,0xFF09,0xFE35,/* 0x58-0x5F */ + 0xFE36,0xFF5B,0xFF5D,0xFE37,0xFE38,0x3014,0x3015,0xFE39,/* 0x60-0x67 */ + 0xFE3A,0x3010,0x3011,0xFE3B,0xFE3C,0x300A,0x300B,0xFE3D,/* 0x68-0x6F */ + 0xFE3E,0x3008,0x3009,0xFE3F,0xFE40,0x300C,0x300D,0xFE41,/* 0x70-0x77 */ + 0xFE42,0x300E,0x300F,0xFE43,0xFE44,0xFE59,0xFE5A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFE5B,0xFE5C,0xFE5D,0xFE5E,0x2018,0x2019,0x201C,/* 0xA0-0xA7 */ + 0x201D,0x301D,0x301E,0x2035,0x2032,0xFF03,0xFF06,0xFF0A,/* 0xA8-0xAF */ + 0x203B,0x00A7,0x3003,0x25CB,0x25CF,0x25B3,0x25B2,0x25CE,/* 0xB0-0xB7 */ + 0x2606,0x2605,0x25C7,0x25C6,0x25A1,0x25A0,0x25BD,0x25BC,/* 0xB8-0xBF */ + 0x32A3,0x2105,0x00AF,0xFFE3,0xFF3F,0x02CD,0xFE49,0xFE4A,/* 0xC0-0xC7 */ + 0xFE4D,0xFE4E,0xFE4B,0xFE4C,0xFE5F,0xFE60,0xFE61,0xFF0B,/* 0xC8-0xCF */ + 0xFF0D,0x00D7,0x00F7,0x00B1,0x221A,0xFF1C,0xFF1E,0xFF1D,/* 0xD0-0xD7 */ + 0x2266,0x2267,0x2260,0x221E,0x2252,0x2261,0xFE62,0xFE63,/* 0xD8-0xDF */ + 0xFE64,0xFE65,0xFE66,0xFF5E,0x2229,0x222A,0x22A5,0x2220,/* 0xE0-0xE7 */ + 0x221F,0x22BF,0x33D2,0x33D1,0x222B,0x222E,0x2235,0x2234,/* 0xE8-0xEF */ + 0x2640,0x2642,0x2295,0x2299,0x2191,0x2193,0x2190,0x2192,/* 0xF0-0xF7 */ + 0x2196,0x2197,0x2199,0x2198,0x2225,0x2223,0xFF0F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFF3C,0x2215,0xFE68,0xFF04,0xFFE5,0x3012,0xFFE0,0xFFE1,/* 0x40-0x47 */ + 0xFF05,0xFF20,0x2103,0x2109,0xFE69,0xFE6A,0xFE6B,0x33D5,/* 0x48-0x4F */ + 0x339C,0x339D,0x339E,0x33CE,0x33A1,0x338E,0x338F,0x33C4,/* 0x50-0x57 */ + 0x00B0,0x5159,0x515B,0x515E,0x515D,0x5161,0x5163,0x55E7,/* 0x58-0x5F */ + 0x74E9,0x7CCE,0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,/* 0x60-0x67 */ + 0x2587,0x2588,0x258F,0x258E,0x258D,0x258C,0x258B,0x258A,/* 0x68-0x6F */ + 0x2589,0x253C,0x2534,0x252C,0x2524,0x251C,0x2594,0x2500,/* 0x70-0x77 */ + 0x2502,0x2595,0x250C,0x2510,0x2514,0x2518,0x256D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x256E,0x2570,0x256F,0x2550,0x255E,0x256A,0x2561,/* 0xA0-0xA7 */ + 0x25E2,0x25E3,0x25E5,0x25E4,0x2571,0x2572,0x2573,0xFF10,/* 0xA8-0xAF */ + 0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,0xFF18,/* 0xB0-0xB7 */ + 0xFF19,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xB8-0xBF */ + 0x2167,0x2168,0x2169,0x3021,0x3022,0x3023,0x3024,0x3025,/* 0xC0-0xC7 */ + 0x3026,0x3027,0x3028,0x3029,0x5341,0x5344,0x5345,0xFF21,/* 0xC8-0xCF */ + 0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,0xFF28,0xFF29,/* 0xD0-0xD7 */ + 0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,0xFF30,0xFF31,/* 0xD8-0xDF */ + 0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,0xFF38,0xFF39,/* 0xE0-0xE7 */ + 0xFF3A,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE8-0xEF */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xF0-0xF7 */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFF57,0xFF58,0xFF59,0xFF5A,0x0391,0x0392,0x0393,0x0394,/* 0x40-0x47 */ + 0x0395,0x0396,0x0397,0x0398,0x0399,0x039A,0x039B,0x039C,/* 0x48-0x4F */ + 0x039D,0x039E,0x039F,0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,/* 0x50-0x57 */ + 0x03A6,0x03A7,0x03A8,0x03A9,0x03B1,0x03B2,0x03B3,0x03B4,/* 0x58-0x5F */ + 0x03B5,0x03B6,0x03B7,0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,/* 0x60-0x67 */ + 0x03BD,0x03BE,0x03BF,0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,/* 0x68-0x6F */ + 0x03C6,0x03C7,0x03C8,0x03C9,0x3105,0x3106,0x3107,0x3108,/* 0x70-0x77 */ + 0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,/* 0xA0-0xA7 */ + 0x3117,0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,/* 0xA8-0xAF */ + 0x311F,0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,/* 0xB0-0xB7 */ + 0x3127,0x3128,0x3129,0x02D9,0x02C9,0x02CA,0x02C7,0x02CB,/* 0xB8-0xBF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD0-0xD7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0x0000,0x20AC,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E00,0x4E59,0x4E01,0x4E03,0x4E43,0x4E5D,0x4E86,0x4E8C,/* 0x40-0x47 */ + 0x4EBA,0x513F,0x5165,0x516B,0x51E0,0x5200,0x5201,0x529B,/* 0x48-0x4F */ + 0x5315,0x5341,0x535C,0x53C8,0x4E09,0x4E0B,0x4E08,0x4E0A,/* 0x50-0x57 */ + 0x4E2B,0x4E38,0x51E1,0x4E45,0x4E48,0x4E5F,0x4E5E,0x4E8E,/* 0x58-0x5F */ + 0x4EA1,0x5140,0x5203,0x52FA,0x5343,0x53C9,0x53E3,0x571F,/* 0x60-0x67 */ + 0x58EB,0x5915,0x5927,0x5973,0x5B50,0x5B51,0x5B53,0x5BF8,/* 0x68-0x6F */ + 0x5C0F,0x5C22,0x5C38,0x5C71,0x5DDD,0x5DE5,0x5DF1,0x5DF2,/* 0x70-0x77 */ + 0x5DF3,0x5DFE,0x5E72,0x5EFE,0x5F0B,0x5F13,0x624D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4E11,0x4E10,0x4E0D,0x4E2D,0x4E30,0x4E39,0x4E4B,/* 0xA0-0xA7 */ + 0x5C39,0x4E88,0x4E91,0x4E95,0x4E92,0x4E94,0x4EA2,0x4EC1,/* 0xA8-0xAF */ + 0x4EC0,0x4EC3,0x4EC6,0x4EC7,0x4ECD,0x4ECA,0x4ECB,0x4EC4,/* 0xB0-0xB7 */ + 0x5143,0x5141,0x5167,0x516D,0x516E,0x516C,0x5197,0x51F6,/* 0xB8-0xBF */ + 0x5206,0x5207,0x5208,0x52FB,0x52FE,0x52FF,0x5316,0x5339,/* 0xC0-0xC7 */ + 0x5348,0x5347,0x5345,0x535E,0x5384,0x53CB,0x53CA,0x53CD,/* 0xC8-0xCF */ + 0x58EC,0x5929,0x592B,0x592A,0x592D,0x5B54,0x5C11,0x5C24,/* 0xD0-0xD7 */ + 0x5C3A,0x5C6F,0x5DF4,0x5E7B,0x5EFF,0x5F14,0x5F15,0x5FC3,/* 0xD8-0xDF */ + 0x6208,0x6236,0x624B,0x624E,0x652F,0x6587,0x6597,0x65A4,/* 0xE0-0xE7 */ + 0x65B9,0x65E5,0x66F0,0x6708,0x6728,0x6B20,0x6B62,0x6B79,/* 0xE8-0xEF */ + 0x6BCB,0x6BD4,0x6BDB,0x6C0F,0x6C34,0x706B,0x722A,0x7236,/* 0xF0-0xF7 */ + 0x723B,0x7247,0x7259,0x725B,0x72AC,0x738B,0x4E19,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E16,0x4E15,0x4E14,0x4E18,0x4E3B,0x4E4D,0x4E4F,0x4E4E,/* 0x40-0x47 */ + 0x4EE5,0x4ED8,0x4ED4,0x4ED5,0x4ED6,0x4ED7,0x4EE3,0x4EE4,/* 0x48-0x4F */ + 0x4ED9,0x4EDE,0x5145,0x5144,0x5189,0x518A,0x51AC,0x51F9,/* 0x50-0x57 */ + 0x51FA,0x51F8,0x520A,0x52A0,0x529F,0x5305,0x5306,0x5317,/* 0x58-0x5F */ + 0x531D,0x4EDF,0x534A,0x5349,0x5361,0x5360,0x536F,0x536E,/* 0x60-0x67 */ + 0x53BB,0x53EF,0x53E4,0x53F3,0x53EC,0x53EE,0x53E9,0x53E8,/* 0x68-0x6F */ + 0x53FC,0x53F8,0x53F5,0x53EB,0x53E6,0x53EA,0x53F2,0x53F1,/* 0x70-0x77 */ + 0x53F0,0x53E5,0x53ED,0x53FB,0x56DB,0x56DA,0x5916,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x592E,0x5931,0x5974,0x5976,0x5B55,0x5B83,0x5C3C,/* 0xA0-0xA7 */ + 0x5DE8,0x5DE7,0x5DE6,0x5E02,0x5E03,0x5E73,0x5E7C,0x5F01,/* 0xA8-0xAF */ + 0x5F18,0x5F17,0x5FC5,0x620A,0x6253,0x6254,0x6252,0x6251,/* 0xB0-0xB7 */ + 0x65A5,0x65E6,0x672E,0x672C,0x672A,0x672B,0x672D,0x6B63,/* 0xB8-0xBF */ + 0x6BCD,0x6C11,0x6C10,0x6C38,0x6C41,0x6C40,0x6C3E,0x72AF,/* 0xC0-0xC7 */ + 0x7384,0x7389,0x74DC,0x74E6,0x7518,0x751F,0x7528,0x7529,/* 0xC8-0xCF */ + 0x7530,0x7531,0x7532,0x7533,0x758B,0x767D,0x76AE,0x76BF,/* 0xD0-0xD7 */ + 0x76EE,0x77DB,0x77E2,0x77F3,0x793A,0x79BE,0x7A74,0x7ACB,/* 0xD8-0xDF */ + 0x4E1E,0x4E1F,0x4E52,0x4E53,0x4E69,0x4E99,0x4EA4,0x4EA6,/* 0xE0-0xE7 */ + 0x4EA5,0x4EFF,0x4F09,0x4F19,0x4F0A,0x4F15,0x4F0D,0x4F10,/* 0xE8-0xEF */ + 0x4F11,0x4F0F,0x4EF2,0x4EF6,0x4EFB,0x4EF0,0x4EF3,0x4EFD,/* 0xF0-0xF7 */ + 0x4F01,0x4F0B,0x5149,0x5147,0x5146,0x5148,0x5168,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5171,0x518D,0x51B0,0x5217,0x5211,0x5212,0x520E,0x5216,/* 0x40-0x47 */ + 0x52A3,0x5308,0x5321,0x5320,0x5370,0x5371,0x5409,0x540F,/* 0x48-0x4F */ + 0x540C,0x540A,0x5410,0x5401,0x540B,0x5404,0x5411,0x540D,/* 0x50-0x57 */ + 0x5408,0x5403,0x540E,0x5406,0x5412,0x56E0,0x56DE,0x56DD,/* 0x58-0x5F */ + 0x5733,0x5730,0x5728,0x572D,0x572C,0x572F,0x5729,0x5919,/* 0x60-0x67 */ + 0x591A,0x5937,0x5938,0x5984,0x5978,0x5983,0x597D,0x5979,/* 0x68-0x6F */ + 0x5982,0x5981,0x5B57,0x5B58,0x5B87,0x5B88,0x5B85,0x5B89,/* 0x70-0x77 */ + 0x5BFA,0x5C16,0x5C79,0x5DDE,0x5E06,0x5E76,0x5E74,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5F0F,0x5F1B,0x5FD9,0x5FD6,0x620E,0x620C,0x620D,/* 0xA0-0xA7 */ + 0x6210,0x6263,0x625B,0x6258,0x6536,0x65E9,0x65E8,0x65EC,/* 0xA8-0xAF */ + 0x65ED,0x66F2,0x66F3,0x6709,0x673D,0x6734,0x6731,0x6735,/* 0xB0-0xB7 */ + 0x6B21,0x6B64,0x6B7B,0x6C16,0x6C5D,0x6C57,0x6C59,0x6C5F,/* 0xB8-0xBF */ + 0x6C60,0x6C50,0x6C55,0x6C61,0x6C5B,0x6C4D,0x6C4E,0x7070,/* 0xC0-0xC7 */ + 0x725F,0x725D,0x767E,0x7AF9,0x7C73,0x7CF8,0x7F36,0x7F8A,/* 0xC8-0xCF */ + 0x7FBD,0x8001,0x8003,0x800C,0x8012,0x8033,0x807F,0x8089,/* 0xD0-0xD7 */ + 0x808B,0x808C,0x81E3,0x81EA,0x81F3,0x81FC,0x820C,0x821B,/* 0xD8-0xDF */ + 0x821F,0x826E,0x8272,0x827E,0x866B,0x8840,0x884C,0x8863,/* 0xE0-0xE7 */ + 0x897F,0x9621,0x4E32,0x4EA8,0x4F4D,0x4F4F,0x4F47,0x4F57,/* 0xE8-0xEF */ + 0x4F5E,0x4F34,0x4F5B,0x4F55,0x4F30,0x4F50,0x4F51,0x4F3D,/* 0xF0-0xF7 */ + 0x4F3A,0x4F38,0x4F43,0x4F54,0x4F3C,0x4F46,0x4F63,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4F5C,0x4F60,0x4F2F,0x4F4E,0x4F36,0x4F59,0x4F5D,0x4F48,/* 0x40-0x47 */ + 0x4F5A,0x514C,0x514B,0x514D,0x5175,0x51B6,0x51B7,0x5225,/* 0x48-0x4F */ + 0x5224,0x5229,0x522A,0x5228,0x52AB,0x52A9,0x52AA,0x52AC,/* 0x50-0x57 */ + 0x5323,0x5373,0x5375,0x541D,0x542D,0x541E,0x543E,0x5426,/* 0x58-0x5F */ + 0x544E,0x5427,0x5446,0x5443,0x5433,0x5448,0x5442,0x541B,/* 0x60-0x67 */ + 0x5429,0x544A,0x5439,0x543B,0x5438,0x542E,0x5435,0x5436,/* 0x68-0x6F */ + 0x5420,0x543C,0x5440,0x5431,0x542B,0x541F,0x542C,0x56EA,/* 0x70-0x77 */ + 0x56F0,0x56E4,0x56EB,0x574A,0x5751,0x5740,0x574D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5747,0x574E,0x573E,0x5750,0x574F,0x573B,0x58EF,/* 0xA0-0xA7 */ + 0x593E,0x599D,0x5992,0x59A8,0x599E,0x59A3,0x5999,0x5996,/* 0xA8-0xAF */ + 0x598D,0x59A4,0x5993,0x598A,0x59A5,0x5B5D,0x5B5C,0x5B5A,/* 0xB0-0xB7 */ + 0x5B5B,0x5B8C,0x5B8B,0x5B8F,0x5C2C,0x5C40,0x5C41,0x5C3F,/* 0xB8-0xBF */ + 0x5C3E,0x5C90,0x5C91,0x5C94,0x5C8C,0x5DEB,0x5E0C,0x5E8F,/* 0xC0-0xC7 */ + 0x5E87,0x5E8A,0x5EF7,0x5F04,0x5F1F,0x5F64,0x5F62,0x5F77,/* 0xC8-0xCF */ + 0x5F79,0x5FD8,0x5FCC,0x5FD7,0x5FCD,0x5FF1,0x5FEB,0x5FF8,/* 0xD0-0xD7 */ + 0x5FEA,0x6212,0x6211,0x6284,0x6297,0x6296,0x6280,0x6276,/* 0xD8-0xDF */ + 0x6289,0x626D,0x628A,0x627C,0x627E,0x6279,0x6273,0x6292,/* 0xE0-0xE7 */ + 0x626F,0x6298,0x626E,0x6295,0x6293,0x6291,0x6286,0x6539,/* 0xE8-0xEF */ + 0x653B,0x6538,0x65F1,0x66F4,0x675F,0x674E,0x674F,0x6750,/* 0xF0-0xF7 */ + 0x6751,0x675C,0x6756,0x675E,0x6749,0x6746,0x6760,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6753,0x6757,0x6B65,0x6BCF,0x6C42,0x6C5E,0x6C99,0x6C81,/* 0x40-0x47 */ + 0x6C88,0x6C89,0x6C85,0x6C9B,0x6C6A,0x6C7A,0x6C90,0x6C70,/* 0x48-0x4F */ + 0x6C8C,0x6C68,0x6C96,0x6C92,0x6C7D,0x6C83,0x6C72,0x6C7E,/* 0x50-0x57 */ + 0x6C74,0x6C86,0x6C76,0x6C8D,0x6C94,0x6C98,0x6C82,0x7076,/* 0x58-0x5F */ + 0x707C,0x707D,0x7078,0x7262,0x7261,0x7260,0x72C4,0x72C2,/* 0x60-0x67 */ + 0x7396,0x752C,0x752B,0x7537,0x7538,0x7682,0x76EF,0x77E3,/* 0x68-0x6F */ + 0x79C1,0x79C0,0x79BF,0x7A76,0x7CFB,0x7F55,0x8096,0x8093,/* 0x70-0x77 */ + 0x809D,0x8098,0x809B,0x809A,0x80B2,0x826F,0x8292,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x828B,0x828D,0x898B,0x89D2,0x8A00,0x8C37,0x8C46,/* 0xA0-0xA7 */ + 0x8C55,0x8C9D,0x8D64,0x8D70,0x8DB3,0x8EAB,0x8ECA,0x8F9B,/* 0xA8-0xAF */ + 0x8FB0,0x8FC2,0x8FC6,0x8FC5,0x8FC4,0x5DE1,0x9091,0x90A2,/* 0xB0-0xB7 */ + 0x90AA,0x90A6,0x90A3,0x9149,0x91C6,0x91CC,0x9632,0x962E,/* 0xB8-0xBF */ + 0x9631,0x962A,0x962C,0x4E26,0x4E56,0x4E73,0x4E8B,0x4E9B,/* 0xC0-0xC7 */ + 0x4E9E,0x4EAB,0x4EAC,0x4F6F,0x4F9D,0x4F8D,0x4F73,0x4F7F,/* 0xC8-0xCF */ + 0x4F6C,0x4F9B,0x4F8B,0x4F86,0x4F83,0x4F70,0x4F75,0x4F88,/* 0xD0-0xD7 */ + 0x4F69,0x4F7B,0x4F96,0x4F7E,0x4F8F,0x4F91,0x4F7A,0x5154,/* 0xD8-0xDF */ + 0x5152,0x5155,0x5169,0x5177,0x5176,0x5178,0x51BD,0x51FD,/* 0xE0-0xE7 */ + 0x523B,0x5238,0x5237,0x523A,0x5230,0x522E,0x5236,0x5241,/* 0xE8-0xEF */ + 0x52BE,0x52BB,0x5352,0x5354,0x5353,0x5351,0x5366,0x5377,/* 0xF0-0xF7 */ + 0x5378,0x5379,0x53D6,0x53D4,0x53D7,0x5473,0x5475,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5496,0x5478,0x5495,0x5480,0x547B,0x5477,0x5484,0x5492,/* 0x40-0x47 */ + 0x5486,0x547C,0x5490,0x5471,0x5476,0x548C,0x549A,0x5462,/* 0x48-0x4F */ + 0x5468,0x548B,0x547D,0x548E,0x56FA,0x5783,0x5777,0x576A,/* 0x50-0x57 */ + 0x5769,0x5761,0x5766,0x5764,0x577C,0x591C,0x5949,0x5947,/* 0x58-0x5F */ + 0x5948,0x5944,0x5954,0x59BE,0x59BB,0x59D4,0x59B9,0x59AE,/* 0x60-0x67 */ + 0x59D1,0x59C6,0x59D0,0x59CD,0x59CB,0x59D3,0x59CA,0x59AF,/* 0x68-0x6F */ + 0x59B3,0x59D2,0x59C5,0x5B5F,0x5B64,0x5B63,0x5B97,0x5B9A,/* 0x70-0x77 */ + 0x5B98,0x5B9C,0x5B99,0x5B9B,0x5C1A,0x5C48,0x5C45,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5C46,0x5CB7,0x5CA1,0x5CB8,0x5CA9,0x5CAB,0x5CB1,/* 0xA0-0xA7 */ + 0x5CB3,0x5E18,0x5E1A,0x5E16,0x5E15,0x5E1B,0x5E11,0x5E78,/* 0xA8-0xAF */ + 0x5E9A,0x5E97,0x5E9C,0x5E95,0x5E96,0x5EF6,0x5F26,0x5F27,/* 0xB0-0xB7 */ + 0x5F29,0x5F80,0x5F81,0x5F7F,0x5F7C,0x5FDD,0x5FE0,0x5FFD,/* 0xB8-0xBF */ + 0x5FF5,0x5FFF,0x600F,0x6014,0x602F,0x6035,0x6016,0x602A,/* 0xC0-0xC7 */ + 0x6015,0x6021,0x6027,0x6029,0x602B,0x601B,0x6216,0x6215,/* 0xC8-0xCF */ + 0x623F,0x623E,0x6240,0x627F,0x62C9,0x62CC,0x62C4,0x62BF,/* 0xD0-0xD7 */ + 0x62C2,0x62B9,0x62D2,0x62DB,0x62AB,0x62D3,0x62D4,0x62CB,/* 0xD8-0xDF */ + 0x62C8,0x62A8,0x62BD,0x62BC,0x62D0,0x62D9,0x62C7,0x62CD,/* 0xE0-0xE7 */ + 0x62B5,0x62DA,0x62B1,0x62D8,0x62D6,0x62D7,0x62C6,0x62AC,/* 0xE8-0xEF */ + 0x62CE,0x653E,0x65A7,0x65BC,0x65FA,0x6614,0x6613,0x660C,/* 0xF0-0xF7 */ + 0x6606,0x6602,0x660E,0x6600,0x660F,0x6615,0x660A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6607,0x670D,0x670B,0x676D,0x678B,0x6795,0x6771,0x679C,/* 0x40-0x47 */ + 0x6773,0x6777,0x6787,0x679D,0x6797,0x676F,0x6770,0x677F,/* 0x48-0x4F */ + 0x6789,0x677E,0x6790,0x6775,0x679A,0x6793,0x677C,0x676A,/* 0x50-0x57 */ + 0x6772,0x6B23,0x6B66,0x6B67,0x6B7F,0x6C13,0x6C1B,0x6CE3,/* 0x58-0x5F */ + 0x6CE8,0x6CF3,0x6CB1,0x6CCC,0x6CE5,0x6CB3,0x6CBD,0x6CBE,/* 0x60-0x67 */ + 0x6CBC,0x6CE2,0x6CAB,0x6CD5,0x6CD3,0x6CB8,0x6CC4,0x6CB9,/* 0x68-0x6F */ + 0x6CC1,0x6CAE,0x6CD7,0x6CC5,0x6CF1,0x6CBF,0x6CBB,0x6CE1,/* 0x70-0x77 */ + 0x6CDB,0x6CCA,0x6CAC,0x6CEF,0x6CDC,0x6CD6,0x6CE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7095,0x708E,0x7092,0x708A,0x7099,0x722C,0x722D,/* 0xA0-0xA7 */ + 0x7238,0x7248,0x7267,0x7269,0x72C0,0x72CE,0x72D9,0x72D7,/* 0xA8-0xAF */ + 0x72D0,0x73A9,0x73A8,0x739F,0x73AB,0x73A5,0x753D,0x759D,/* 0xB0-0xB7 */ + 0x7599,0x759A,0x7684,0x76C2,0x76F2,0x76F4,0x77E5,0x77FD,/* 0xB8-0xBF */ + 0x793E,0x7940,0x7941,0x79C9,0x79C8,0x7A7A,0x7A79,0x7AFA,/* 0xC0-0xC7 */ + 0x7CFE,0x7F54,0x7F8C,0x7F8B,0x8005,0x80BA,0x80A5,0x80A2,/* 0xC8-0xCF */ + 0x80B1,0x80A1,0x80AB,0x80A9,0x80B4,0x80AA,0x80AF,0x81E5,/* 0xD0-0xD7 */ + 0x81FE,0x820D,0x82B3,0x829D,0x8299,0x82AD,0x82BD,0x829F,/* 0xD8-0xDF */ + 0x82B9,0x82B1,0x82AC,0x82A5,0x82AF,0x82B8,0x82A3,0x82B0,/* 0xE0-0xE7 */ + 0x82BE,0x82B7,0x864E,0x8671,0x521D,0x8868,0x8ECB,0x8FCE,/* 0xE8-0xEF */ + 0x8FD4,0x8FD1,0x90B5,0x90B8,0x90B1,0x90B6,0x91C7,0x91D1,/* 0xF0-0xF7 */ + 0x9577,0x9580,0x961C,0x9640,0x963F,0x963B,0x9644,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9642,0x96B9,0x96E8,0x9752,0x975E,0x4E9F,0x4EAD,0x4EAE,/* 0x40-0x47 */ + 0x4FE1,0x4FB5,0x4FAF,0x4FBF,0x4FE0,0x4FD1,0x4FCF,0x4FDD,/* 0x48-0x4F */ + 0x4FC3,0x4FB6,0x4FD8,0x4FDF,0x4FCA,0x4FD7,0x4FAE,0x4FD0,/* 0x50-0x57 */ + 0x4FC4,0x4FC2,0x4FDA,0x4FCE,0x4FDE,0x4FB7,0x5157,0x5192,/* 0x58-0x5F */ + 0x5191,0x51A0,0x524E,0x5243,0x524A,0x524D,0x524C,0x524B,/* 0x60-0x67 */ + 0x5247,0x52C7,0x52C9,0x52C3,0x52C1,0x530D,0x5357,0x537B,/* 0x68-0x6F */ + 0x539A,0x53DB,0x54AC,0x54C0,0x54A8,0x54CE,0x54C9,0x54B8,/* 0x70-0x77 */ + 0x54A6,0x54B3,0x54C7,0x54C2,0x54BD,0x54AA,0x54C1,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54C4,0x54C8,0x54AF,0x54AB,0x54B1,0x54BB,0x54A9,/* 0xA0-0xA7 */ + 0x54A7,0x54BF,0x56FF,0x5782,0x578B,0x57A0,0x57A3,0x57A2,/* 0xA8-0xAF */ + 0x57CE,0x57AE,0x5793,0x5955,0x5951,0x594F,0x594E,0x5950,/* 0xB0-0xB7 */ + 0x59DC,0x59D8,0x59FF,0x59E3,0x59E8,0x5A03,0x59E5,0x59EA,/* 0xB8-0xBF */ + 0x59DA,0x59E6,0x5A01,0x59FB,0x5B69,0x5BA3,0x5BA6,0x5BA4,/* 0xC0-0xC7 */ + 0x5BA2,0x5BA5,0x5C01,0x5C4E,0x5C4F,0x5C4D,0x5C4B,0x5CD9,/* 0xC8-0xCF */ + 0x5CD2,0x5DF7,0x5E1D,0x5E25,0x5E1F,0x5E7D,0x5EA0,0x5EA6,/* 0xD0-0xD7 */ + 0x5EFA,0x5F08,0x5F2D,0x5F65,0x5F88,0x5F85,0x5F8A,0x5F8B,/* 0xD8-0xDF */ + 0x5F87,0x5F8C,0x5F89,0x6012,0x601D,0x6020,0x6025,0x600E,/* 0xE0-0xE7 */ + 0x6028,0x604D,0x6070,0x6068,0x6062,0x6046,0x6043,0x606C,/* 0xE8-0xEF */ + 0x606B,0x606A,0x6064,0x6241,0x62DC,0x6316,0x6309,0x62FC,/* 0xF0-0xF7 */ + 0x62ED,0x6301,0x62EE,0x62FD,0x6307,0x62F1,0x62F7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x62EF,0x62EC,0x62FE,0x62F4,0x6311,0x6302,0x653F,0x6545,/* 0x40-0x47 */ + 0x65AB,0x65BD,0x65E2,0x6625,0x662D,0x6620,0x6627,0x662F,/* 0x48-0x4F */ + 0x661F,0x6628,0x6631,0x6624,0x66F7,0x67FF,0x67D3,0x67F1,/* 0x50-0x57 */ + 0x67D4,0x67D0,0x67EC,0x67B6,0x67AF,0x67F5,0x67E9,0x67EF,/* 0x58-0x5F */ + 0x67C4,0x67D1,0x67B4,0x67DA,0x67E5,0x67B8,0x67CF,0x67DE,/* 0x60-0x67 */ + 0x67F3,0x67B0,0x67D9,0x67E2,0x67DD,0x67D2,0x6B6A,0x6B83,/* 0x68-0x6F */ + 0x6B86,0x6BB5,0x6BD2,0x6BD7,0x6C1F,0x6CC9,0x6D0B,0x6D32,/* 0x70-0x77 */ + 0x6D2A,0x6D41,0x6D25,0x6D0C,0x6D31,0x6D1E,0x6D17,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6D3B,0x6D3D,0x6D3E,0x6D36,0x6D1B,0x6CF5,0x6D39,/* 0xA0-0xA7 */ + 0x6D27,0x6D38,0x6D29,0x6D2E,0x6D35,0x6D0E,0x6D2B,0x70AB,/* 0xA8-0xAF */ + 0x70BA,0x70B3,0x70AC,0x70AF,0x70AD,0x70B8,0x70AE,0x70A4,/* 0xB0-0xB7 */ + 0x7230,0x7272,0x726F,0x7274,0x72E9,0x72E0,0x72E1,0x73B7,/* 0xB8-0xBF */ + 0x73CA,0x73BB,0x73B2,0x73CD,0x73C0,0x73B3,0x751A,0x752D,/* 0xC0-0xC7 */ + 0x754F,0x754C,0x754E,0x754B,0x75AB,0x75A4,0x75A5,0x75A2,/* 0xC8-0xCF */ + 0x75A3,0x7678,0x7686,0x7687,0x7688,0x76C8,0x76C6,0x76C3,/* 0xD0-0xD7 */ + 0x76C5,0x7701,0x76F9,0x76F8,0x7709,0x770B,0x76FE,0x76FC,/* 0xD8-0xDF */ + 0x7707,0x77DC,0x7802,0x7814,0x780C,0x780D,0x7946,0x7949,/* 0xE0-0xE7 */ + 0x7948,0x7947,0x79B9,0x79BA,0x79D1,0x79D2,0x79CB,0x7A7F,/* 0xE8-0xEF */ + 0x7A81,0x7AFF,0x7AFD,0x7C7D,0x7D02,0x7D05,0x7D00,0x7D09,/* 0xF0-0xF7 */ + 0x7D07,0x7D04,0x7D06,0x7F38,0x7F8E,0x7FBF,0x8004,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8010,0x800D,0x8011,0x8036,0x80D6,0x80E5,0x80DA,0x80C3,/* 0x40-0x47 */ + 0x80C4,0x80CC,0x80E1,0x80DB,0x80CE,0x80DE,0x80E4,0x80DD,/* 0x48-0x4F */ + 0x81F4,0x8222,0x82E7,0x8303,0x8305,0x82E3,0x82DB,0x82E6,/* 0x50-0x57 */ + 0x8304,0x82E5,0x8302,0x8309,0x82D2,0x82D7,0x82F1,0x8301,/* 0x58-0x5F */ + 0x82DC,0x82D4,0x82D1,0x82DE,0x82D3,0x82DF,0x82EF,0x8306,/* 0x60-0x67 */ + 0x8650,0x8679,0x867B,0x867A,0x884D,0x886B,0x8981,0x89D4,/* 0x68-0x6F */ + 0x8A08,0x8A02,0x8A03,0x8C9E,0x8CA0,0x8D74,0x8D73,0x8DB4,/* 0x70-0x77 */ + 0x8ECD,0x8ECC,0x8FF0,0x8FE6,0x8FE2,0x8FEA,0x8FE5,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8FED,0x8FEB,0x8FE4,0x8FE8,0x90CA,0x90CE,0x90C1,/* 0xA0-0xA7 */ + 0x90C3,0x914B,0x914A,0x91CD,0x9582,0x9650,0x964B,0x964C,/* 0xA8-0xAF */ + 0x964D,0x9762,0x9769,0x97CB,0x97ED,0x97F3,0x9801,0x98A8,/* 0xB0-0xB7 */ + 0x98DB,0x98DF,0x9996,0x9999,0x4E58,0x4EB3,0x500C,0x500D,/* 0xB8-0xBF */ + 0x5023,0x4FEF,0x5026,0x5025,0x4FF8,0x5029,0x5016,0x5006,/* 0xC0-0xC7 */ + 0x503C,0x501F,0x501A,0x5012,0x5011,0x4FFA,0x5000,0x5014,/* 0xC8-0xCF */ + 0x5028,0x4FF1,0x5021,0x500B,0x5019,0x5018,0x4FF3,0x4FEE,/* 0xD0-0xD7 */ + 0x502D,0x502A,0x4FFE,0x502B,0x5009,0x517C,0x51A4,0x51A5,/* 0xD8-0xDF */ + 0x51A2,0x51CD,0x51CC,0x51C6,0x51CB,0x5256,0x525C,0x5254,/* 0xE0-0xE7 */ + 0x525B,0x525D,0x532A,0x537F,0x539F,0x539D,0x53DF,0x54E8,/* 0xE8-0xEF */ + 0x5510,0x5501,0x5537,0x54FC,0x54E5,0x54F2,0x5506,0x54FA,/* 0xF0-0xF7 */ + 0x5514,0x54E9,0x54ED,0x54E1,0x5509,0x54EE,0x54EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54E6,0x5527,0x5507,0x54FD,0x550F,0x5703,0x5704,0x57C2,/* 0x40-0x47 */ + 0x57D4,0x57CB,0x57C3,0x5809,0x590F,0x5957,0x5958,0x595A,/* 0x48-0x4F */ + 0x5A11,0x5A18,0x5A1C,0x5A1F,0x5A1B,0x5A13,0x59EC,0x5A20,/* 0x50-0x57 */ + 0x5A23,0x5A29,0x5A25,0x5A0C,0x5A09,0x5B6B,0x5C58,0x5BB0,/* 0x58-0x5F */ + 0x5BB3,0x5BB6,0x5BB4,0x5BAE,0x5BB5,0x5BB9,0x5BB8,0x5C04,/* 0x60-0x67 */ + 0x5C51,0x5C55,0x5C50,0x5CED,0x5CFD,0x5CFB,0x5CEA,0x5CE8,/* 0x68-0x6F */ + 0x5CF0,0x5CF6,0x5D01,0x5CF4,0x5DEE,0x5E2D,0x5E2B,0x5EAB,/* 0x70-0x77 */ + 0x5EAD,0x5EA7,0x5F31,0x5F92,0x5F91,0x5F90,0x6059,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6063,0x6065,0x6050,0x6055,0x606D,0x6069,0x606F,/* 0xA0-0xA7 */ + 0x6084,0x609F,0x609A,0x608D,0x6094,0x608C,0x6085,0x6096,/* 0xA8-0xAF */ + 0x6247,0x62F3,0x6308,0x62FF,0x634E,0x633E,0x632F,0x6355,/* 0xB0-0xB7 */ + 0x6342,0x6346,0x634F,0x6349,0x633A,0x6350,0x633D,0x632A,/* 0xB8-0xBF */ + 0x632B,0x6328,0x634D,0x634C,0x6548,0x6549,0x6599,0x65C1,/* 0xC0-0xC7 */ + 0x65C5,0x6642,0x6649,0x664F,0x6643,0x6652,0x664C,0x6645,/* 0xC8-0xCF */ + 0x6641,0x66F8,0x6714,0x6715,0x6717,0x6821,0x6838,0x6848,/* 0xD0-0xD7 */ + 0x6846,0x6853,0x6839,0x6842,0x6854,0x6829,0x68B3,0x6817,/* 0xD8-0xDF */ + 0x684C,0x6851,0x683D,0x67F4,0x6850,0x6840,0x683C,0x6843,/* 0xE0-0xE7 */ + 0x682A,0x6845,0x6813,0x6818,0x6841,0x6B8A,0x6B89,0x6BB7,/* 0xE8-0xEF */ + 0x6C23,0x6C27,0x6C28,0x6C26,0x6C24,0x6CF0,0x6D6A,0x6D95,/* 0xF0-0xF7 */ + 0x6D88,0x6D87,0x6D66,0x6D78,0x6D77,0x6D59,0x6D93,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6D6C,0x6D89,0x6D6E,0x6D5A,0x6D74,0x6D69,0x6D8C,0x6D8A,/* 0x40-0x47 */ + 0x6D79,0x6D85,0x6D65,0x6D94,0x70CA,0x70D8,0x70E4,0x70D9,/* 0x48-0x4F */ + 0x70C8,0x70CF,0x7239,0x7279,0x72FC,0x72F9,0x72FD,0x72F8,/* 0x50-0x57 */ + 0x72F7,0x7386,0x73ED,0x7409,0x73EE,0x73E0,0x73EA,0x73DE,/* 0x58-0x5F */ + 0x7554,0x755D,0x755C,0x755A,0x7559,0x75BE,0x75C5,0x75C7,/* 0x60-0x67 */ + 0x75B2,0x75B3,0x75BD,0x75BC,0x75B9,0x75C2,0x75B8,0x768B,/* 0x68-0x6F */ + 0x76B0,0x76CA,0x76CD,0x76CE,0x7729,0x771F,0x7720,0x7728,/* 0x70-0x77 */ + 0x77E9,0x7830,0x7827,0x7838,0x781D,0x7834,0x7837,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7825,0x782D,0x7820,0x781F,0x7832,0x7955,0x7950,/* 0xA0-0xA7 */ + 0x7960,0x795F,0x7956,0x795E,0x795D,0x7957,0x795A,0x79E4,/* 0xA8-0xAF */ + 0x79E3,0x79E7,0x79DF,0x79E6,0x79E9,0x79D8,0x7A84,0x7A88,/* 0xB0-0xB7 */ + 0x7AD9,0x7B06,0x7B11,0x7C89,0x7D21,0x7D17,0x7D0B,0x7D0A,/* 0xB8-0xBF */ + 0x7D20,0x7D22,0x7D14,0x7D10,0x7D15,0x7D1A,0x7D1C,0x7D0D,/* 0xC0-0xC7 */ + 0x7D19,0x7D1B,0x7F3A,0x7F5F,0x7F94,0x7FC5,0x7FC1,0x8006,/* 0xC8-0xCF */ + 0x8018,0x8015,0x8019,0x8017,0x803D,0x803F,0x80F1,0x8102,/* 0xD0-0xD7 */ + 0x80F0,0x8105,0x80ED,0x80F4,0x8106,0x80F8,0x80F3,0x8108,/* 0xD8-0xDF */ + 0x80FD,0x810A,0x80FC,0x80EF,0x81ED,0x81EC,0x8200,0x8210,/* 0xE0-0xE7 */ + 0x822A,0x822B,0x8228,0x822C,0x82BB,0x832B,0x8352,0x8354,/* 0xE8-0xEF */ + 0x834A,0x8338,0x8350,0x8349,0x8335,0x8334,0x834F,0x8332,/* 0xF0-0xF7 */ + 0x8339,0x8336,0x8317,0x8340,0x8331,0x8328,0x8343,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8654,0x868A,0x86AA,0x8693,0x86A4,0x86A9,0x868C,0x86A3,/* 0x40-0x47 */ + 0x869C,0x8870,0x8877,0x8881,0x8882,0x887D,0x8879,0x8A18,/* 0x48-0x4F */ + 0x8A10,0x8A0E,0x8A0C,0x8A15,0x8A0A,0x8A17,0x8A13,0x8A16,/* 0x50-0x57 */ + 0x8A0F,0x8A11,0x8C48,0x8C7A,0x8C79,0x8CA1,0x8CA2,0x8D77,/* 0x58-0x5F */ + 0x8EAC,0x8ED2,0x8ED4,0x8ECF,0x8FB1,0x9001,0x9006,0x8FF7,/* 0x60-0x67 */ + 0x9000,0x8FFA,0x8FF4,0x9003,0x8FFD,0x9005,0x8FF8,0x9095,/* 0x68-0x6F */ + 0x90E1,0x90DD,0x90E2,0x9152,0x914D,0x914C,0x91D8,0x91DD,/* 0x70-0x77 */ + 0x91D7,0x91DC,0x91D9,0x9583,0x9662,0x9663,0x9661,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x965B,0x965D,0x9664,0x9658,0x965E,0x96BB,0x98E2,/* 0xA0-0xA7 */ + 0x99AC,0x9AA8,0x9AD8,0x9B25,0x9B32,0x9B3C,0x4E7E,0x507A,/* 0xA8-0xAF */ + 0x507D,0x505C,0x5047,0x5043,0x504C,0x505A,0x5049,0x5065,/* 0xB0-0xB7 */ + 0x5076,0x504E,0x5055,0x5075,0x5074,0x5077,0x504F,0x500F,/* 0xB8-0xBF */ + 0x506F,0x506D,0x515C,0x5195,0x51F0,0x526A,0x526F,0x52D2,/* 0xC0-0xC7 */ + 0x52D9,0x52D8,0x52D5,0x5310,0x530F,0x5319,0x533F,0x5340,/* 0xC8-0xCF */ + 0x533E,0x53C3,0x66FC,0x5546,0x556A,0x5566,0x5544,0x555E,/* 0xD0-0xD7 */ + 0x5561,0x5543,0x554A,0x5531,0x5556,0x554F,0x5555,0x552F,/* 0xD8-0xDF */ + 0x5564,0x5538,0x552E,0x555C,0x552C,0x5563,0x5533,0x5541,/* 0xE0-0xE7 */ + 0x5557,0x5708,0x570B,0x5709,0x57DF,0x5805,0x580A,0x5806,/* 0xE8-0xEF */ + 0x57E0,0x57E4,0x57FA,0x5802,0x5835,0x57F7,0x57F9,0x5920,/* 0xF0-0xF7 */ + 0x5962,0x5A36,0x5A41,0x5A49,0x5A66,0x5A6A,0x5A40,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A3C,0x5A62,0x5A5A,0x5A46,0x5A4A,0x5B70,0x5BC7,0x5BC5,/* 0x40-0x47 */ + 0x5BC4,0x5BC2,0x5BBF,0x5BC6,0x5C09,0x5C08,0x5C07,0x5C60,/* 0x48-0x4F */ + 0x5C5C,0x5C5D,0x5D07,0x5D06,0x5D0E,0x5D1B,0x5D16,0x5D22,/* 0x50-0x57 */ + 0x5D11,0x5D29,0x5D14,0x5D19,0x5D24,0x5D27,0x5D17,0x5DE2,/* 0x58-0x5F */ + 0x5E38,0x5E36,0x5E33,0x5E37,0x5EB7,0x5EB8,0x5EB6,0x5EB5,/* 0x60-0x67 */ + 0x5EBE,0x5F35,0x5F37,0x5F57,0x5F6C,0x5F69,0x5F6B,0x5F97,/* 0x68-0x6F */ + 0x5F99,0x5F9E,0x5F98,0x5FA1,0x5FA0,0x5F9C,0x607F,0x60A3,/* 0x70-0x77 */ + 0x6089,0x60A0,0x60A8,0x60CB,0x60B4,0x60E6,0x60BD,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x60C5,0x60BB,0x60B5,0x60DC,0x60BC,0x60D8,0x60D5,/* 0xA0-0xA7 */ + 0x60C6,0x60DF,0x60B8,0x60DA,0x60C7,0x621A,0x621B,0x6248,/* 0xA8-0xAF */ + 0x63A0,0x63A7,0x6372,0x6396,0x63A2,0x63A5,0x6377,0x6367,/* 0xB0-0xB7 */ + 0x6398,0x63AA,0x6371,0x63A9,0x6389,0x6383,0x639B,0x636B,/* 0xB8-0xBF */ + 0x63A8,0x6384,0x6388,0x6399,0x63A1,0x63AC,0x6392,0x638F,/* 0xC0-0xC7 */ + 0x6380,0x637B,0x6369,0x6368,0x637A,0x655D,0x6556,0x6551,/* 0xC8-0xCF */ + 0x6559,0x6557,0x555F,0x654F,0x6558,0x6555,0x6554,0x659C,/* 0xD0-0xD7 */ + 0x659B,0x65AC,0x65CF,0x65CB,0x65CC,0x65CE,0x665D,0x665A,/* 0xD8-0xDF */ + 0x6664,0x6668,0x6666,0x665E,0x66F9,0x52D7,0x671B,0x6881,/* 0xE0-0xE7 */ + 0x68AF,0x68A2,0x6893,0x68B5,0x687F,0x6876,0x68B1,0x68A7,/* 0xE8-0xEF */ + 0x6897,0x68B0,0x6883,0x68C4,0x68AD,0x6886,0x6885,0x6894,/* 0xF0-0xF7 */ + 0x689D,0x68A8,0x689F,0x68A1,0x6882,0x6B32,0x6BBA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BEB,0x6BEC,0x6C2B,0x6D8E,0x6DBC,0x6DF3,0x6DD9,0x6DB2,/* 0x40-0x47 */ + 0x6DE1,0x6DCC,0x6DE4,0x6DFB,0x6DFA,0x6E05,0x6DC7,0x6DCB,/* 0x48-0x4F */ + 0x6DAF,0x6DD1,0x6DAE,0x6DDE,0x6DF9,0x6DB8,0x6DF7,0x6DF5,/* 0x50-0x57 */ + 0x6DC5,0x6DD2,0x6E1A,0x6DB5,0x6DDA,0x6DEB,0x6DD8,0x6DEA,/* 0x58-0x5F */ + 0x6DF1,0x6DEE,0x6DE8,0x6DC6,0x6DC4,0x6DAA,0x6DEC,0x6DBF,/* 0x60-0x67 */ + 0x6DE6,0x70F9,0x7109,0x710A,0x70FD,0x70EF,0x723D,0x727D,/* 0x68-0x6F */ + 0x7281,0x731C,0x731B,0x7316,0x7313,0x7319,0x7387,0x7405,/* 0x70-0x77 */ + 0x740A,0x7403,0x7406,0x73FE,0x740D,0x74E0,0x74F6,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x74F7,0x751C,0x7522,0x7565,0x7566,0x7562,0x7570,/* 0xA0-0xA7 */ + 0x758F,0x75D4,0x75D5,0x75B5,0x75CA,0x75CD,0x768E,0x76D4,/* 0xA8-0xAF */ + 0x76D2,0x76DB,0x7737,0x773E,0x773C,0x7736,0x7738,0x773A,/* 0xB0-0xB7 */ + 0x786B,0x7843,0x784E,0x7965,0x7968,0x796D,0x79FB,0x7A92,/* 0xB8-0xBF */ + 0x7A95,0x7B20,0x7B28,0x7B1B,0x7B2C,0x7B26,0x7B19,0x7B1E,/* 0xC0-0xC7 */ + 0x7B2E,0x7C92,0x7C97,0x7C95,0x7D46,0x7D43,0x7D71,0x7D2E,/* 0xC8-0xCF */ + 0x7D39,0x7D3C,0x7D40,0x7D30,0x7D33,0x7D44,0x7D2F,0x7D42,/* 0xD0-0xD7 */ + 0x7D32,0x7D31,0x7F3D,0x7F9E,0x7F9A,0x7FCC,0x7FCE,0x7FD2,/* 0xD8-0xDF */ + 0x801C,0x804A,0x8046,0x812F,0x8116,0x8123,0x812B,0x8129,/* 0xE0-0xE7 */ + 0x8130,0x8124,0x8202,0x8235,0x8237,0x8236,0x8239,0x838E,/* 0xE8-0xEF */ + 0x839E,0x8398,0x8378,0x83A2,0x8396,0x83BD,0x83AB,0x8392,/* 0xF0-0xF7 */ + 0x838A,0x8393,0x8389,0x83A0,0x8377,0x837B,0x837C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8386,0x83A7,0x8655,0x5F6A,0x86C7,0x86C0,0x86B6,0x86C4,/* 0x40-0x47 */ + 0x86B5,0x86C6,0x86CB,0x86B1,0x86AF,0x86C9,0x8853,0x889E,/* 0x48-0x4F */ + 0x8888,0x88AB,0x8892,0x8896,0x888D,0x888B,0x8993,0x898F,/* 0x50-0x57 */ + 0x8A2A,0x8A1D,0x8A23,0x8A25,0x8A31,0x8A2D,0x8A1F,0x8A1B,/* 0x58-0x5F */ + 0x8A22,0x8C49,0x8C5A,0x8CA9,0x8CAC,0x8CAB,0x8CA8,0x8CAA,/* 0x60-0x67 */ + 0x8CA7,0x8D67,0x8D66,0x8DBE,0x8DBA,0x8EDB,0x8EDF,0x9019,/* 0x68-0x6F */ + 0x900D,0x901A,0x9017,0x9023,0x901F,0x901D,0x9010,0x9015,/* 0x70-0x77 */ + 0x901E,0x9020,0x900F,0x9022,0x9016,0x901B,0x9014,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x90E8,0x90ED,0x90FD,0x9157,0x91CE,0x91F5,0x91E6,/* 0xA0-0xA7 */ + 0x91E3,0x91E7,0x91ED,0x91E9,0x9589,0x966A,0x9675,0x9673,/* 0xA8-0xAF */ + 0x9678,0x9670,0x9674,0x9676,0x9677,0x966C,0x96C0,0x96EA,/* 0xB0-0xB7 */ + 0x96E9,0x7AE0,0x7ADF,0x9802,0x9803,0x9B5A,0x9CE5,0x9E75,/* 0xB8-0xBF */ + 0x9E7F,0x9EA5,0x9EBB,0x50A2,0x508D,0x5085,0x5099,0x5091,/* 0xC0-0xC7 */ + 0x5080,0x5096,0x5098,0x509A,0x6700,0x51F1,0x5272,0x5274,/* 0xC8-0xCF */ + 0x5275,0x5269,0x52DE,0x52DD,0x52DB,0x535A,0x53A5,0x557B,/* 0xD0-0xD7 */ + 0x5580,0x55A7,0x557C,0x558A,0x559D,0x5598,0x5582,0x559C,/* 0xD8-0xDF */ + 0x55AA,0x5594,0x5587,0x558B,0x5583,0x55B3,0x55AE,0x559F,/* 0xE0-0xE7 */ + 0x553E,0x55B2,0x559A,0x55BB,0x55AC,0x55B1,0x557E,0x5589,/* 0xE8-0xEF */ + 0x55AB,0x5599,0x570D,0x582F,0x582A,0x5834,0x5824,0x5830,/* 0xF0-0xF7 */ + 0x5831,0x5821,0x581D,0x5820,0x58F9,0x58FA,0x5960,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A77,0x5A9A,0x5A7F,0x5A92,0x5A9B,0x5AA7,0x5B73,0x5B71,/* 0x40-0x47 */ + 0x5BD2,0x5BCC,0x5BD3,0x5BD0,0x5C0A,0x5C0B,0x5C31,0x5D4C,/* 0x48-0x4F */ + 0x5D50,0x5D34,0x5D47,0x5DFD,0x5E45,0x5E3D,0x5E40,0x5E43,/* 0x50-0x57 */ + 0x5E7E,0x5ECA,0x5EC1,0x5EC2,0x5EC4,0x5F3C,0x5F6D,0x5FA9,/* 0x58-0x5F */ + 0x5FAA,0x5FA8,0x60D1,0x60E1,0x60B2,0x60B6,0x60E0,0x611C,/* 0x60-0x67 */ + 0x6123,0x60FA,0x6115,0x60F0,0x60FB,0x60F4,0x6168,0x60F1,/* 0x68-0x6F */ + 0x610E,0x60F6,0x6109,0x6100,0x6112,0x621F,0x6249,0x63A3,/* 0x70-0x77 */ + 0x638C,0x63CF,0x63C0,0x63E9,0x63C9,0x63C6,0x63CD,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x63D2,0x63E3,0x63D0,0x63E1,0x63D6,0x63ED,0x63EE,/* 0xA0-0xA7 */ + 0x6376,0x63F4,0x63EA,0x63DB,0x6452,0x63DA,0x63F9,0x655E,/* 0xA8-0xAF */ + 0x6566,0x6562,0x6563,0x6591,0x6590,0x65AF,0x666E,0x6670,/* 0xB0-0xB7 */ + 0x6674,0x6676,0x666F,0x6691,0x667A,0x667E,0x6677,0x66FE,/* 0xB8-0xBF */ + 0x66FF,0x671F,0x671D,0x68FA,0x68D5,0x68E0,0x68D8,0x68D7,/* 0xC0-0xC7 */ + 0x6905,0x68DF,0x68F5,0x68EE,0x68E7,0x68F9,0x68D2,0x68F2,/* 0xC8-0xCF */ + 0x68E3,0x68CB,0x68CD,0x690D,0x6912,0x690E,0x68C9,0x68DA,/* 0xD0-0xD7 */ + 0x696E,0x68FB,0x6B3E,0x6B3A,0x6B3D,0x6B98,0x6B96,0x6BBC,/* 0xD8-0xDF */ + 0x6BEF,0x6C2E,0x6C2F,0x6C2C,0x6E2F,0x6E38,0x6E54,0x6E21,/* 0xE0-0xE7 */ + 0x6E32,0x6E67,0x6E4A,0x6E20,0x6E25,0x6E23,0x6E1B,0x6E5B,/* 0xE8-0xEF */ + 0x6E58,0x6E24,0x6E56,0x6E6E,0x6E2D,0x6E26,0x6E6F,0x6E34,/* 0xF0-0xF7 */ + 0x6E4D,0x6E3A,0x6E2C,0x6E43,0x6E1D,0x6E3E,0x6ECB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6E89,0x6E19,0x6E4E,0x6E63,0x6E44,0x6E72,0x6E69,0x6E5F,/* 0x40-0x47 */ + 0x7119,0x711A,0x7126,0x7130,0x7121,0x7136,0x716E,0x711C,/* 0x48-0x4F */ + 0x724C,0x7284,0x7280,0x7336,0x7325,0x7334,0x7329,0x743A,/* 0x50-0x57 */ + 0x742A,0x7433,0x7422,0x7425,0x7435,0x7436,0x7434,0x742F,/* 0x58-0x5F */ + 0x741B,0x7426,0x7428,0x7525,0x7526,0x756B,0x756A,0x75E2,/* 0x60-0x67 */ + 0x75DB,0x75E3,0x75D9,0x75D8,0x75DE,0x75E0,0x767B,0x767C,/* 0x68-0x6F */ + 0x7696,0x7693,0x76B4,0x76DC,0x774F,0x77ED,0x785D,0x786C,/* 0x70-0x77 */ + 0x786F,0x7A0D,0x7A08,0x7A0B,0x7A05,0x7A00,0x7A98,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7A97,0x7A96,0x7AE5,0x7AE3,0x7B49,0x7B56,0x7B46,/* 0xA0-0xA7 */ + 0x7B50,0x7B52,0x7B54,0x7B4D,0x7B4B,0x7B4F,0x7B51,0x7C9F,/* 0xA8-0xAF */ + 0x7CA5,0x7D5E,0x7D50,0x7D68,0x7D55,0x7D2B,0x7D6E,0x7D72,/* 0xB0-0xB7 */ + 0x7D61,0x7D66,0x7D62,0x7D70,0x7D73,0x5584,0x7FD4,0x7FD5,/* 0xB8-0xBF */ + 0x800B,0x8052,0x8085,0x8155,0x8154,0x814B,0x8151,0x814E,/* 0xC0-0xC7 */ + 0x8139,0x8146,0x813E,0x814C,0x8153,0x8174,0x8212,0x821C,/* 0xC8-0xCF */ + 0x83E9,0x8403,0x83F8,0x840D,0x83E0,0x83C5,0x840B,0x83C1,/* 0xD0-0xD7 */ + 0x83EF,0x83F1,0x83F4,0x8457,0x840A,0x83F0,0x840C,0x83CC,/* 0xD8-0xDF */ + 0x83FD,0x83F2,0x83CA,0x8438,0x840E,0x8404,0x83DC,0x8407,/* 0xE0-0xE7 */ + 0x83D4,0x83DF,0x865B,0x86DF,0x86D9,0x86ED,0x86D4,0x86DB,/* 0xE8-0xEF */ + 0x86E4,0x86D0,0x86DE,0x8857,0x88C1,0x88C2,0x88B1,0x8983,/* 0xF0-0xF7 */ + 0x8996,0x8A3B,0x8A60,0x8A55,0x8A5E,0x8A3C,0x8A41,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A54,0x8A5B,0x8A50,0x8A46,0x8A34,0x8A3A,0x8A36,0x8A56,/* 0x40-0x47 */ + 0x8C61,0x8C82,0x8CAF,0x8CBC,0x8CB3,0x8CBD,0x8CC1,0x8CBB,/* 0x48-0x4F */ + 0x8CC0,0x8CB4,0x8CB7,0x8CB6,0x8CBF,0x8CB8,0x8D8A,0x8D85,/* 0x50-0x57 */ + 0x8D81,0x8DCE,0x8DDD,0x8DCB,0x8DDA,0x8DD1,0x8DCC,0x8DDB,/* 0x58-0x5F */ + 0x8DC6,0x8EFB,0x8EF8,0x8EFC,0x8F9C,0x902E,0x9035,0x9031,/* 0x60-0x67 */ + 0x9038,0x9032,0x9036,0x9102,0x90F5,0x9109,0x90FE,0x9163,/* 0x68-0x6F */ + 0x9165,0x91CF,0x9214,0x9215,0x9223,0x9209,0x921E,0x920D,/* 0x70-0x77 */ + 0x9210,0x9207,0x9211,0x9594,0x958F,0x958B,0x9591,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9593,0x9592,0x958E,0x968A,0x968E,0x968B,0x967D,/* 0xA0-0xA7 */ + 0x9685,0x9686,0x968D,0x9672,0x9684,0x96C1,0x96C5,0x96C4,/* 0xA8-0xAF */ + 0x96C6,0x96C7,0x96EF,0x96F2,0x97CC,0x9805,0x9806,0x9808,/* 0xB0-0xB7 */ + 0x98E7,0x98EA,0x98EF,0x98E9,0x98F2,0x98ED,0x99AE,0x99AD,/* 0xB8-0xBF */ + 0x9EC3,0x9ECD,0x9ED1,0x4E82,0x50AD,0x50B5,0x50B2,0x50B3,/* 0xC0-0xC7 */ + 0x50C5,0x50BE,0x50AC,0x50B7,0x50BB,0x50AF,0x50C7,0x527F,/* 0xC8-0xCF */ + 0x5277,0x527D,0x52DF,0x52E6,0x52E4,0x52E2,0x52E3,0x532F,/* 0xD0-0xD7 */ + 0x55DF,0x55E8,0x55D3,0x55E6,0x55CE,0x55DC,0x55C7,0x55D1,/* 0xD8-0xDF */ + 0x55E3,0x55E4,0x55EF,0x55DA,0x55E1,0x55C5,0x55C6,0x55E5,/* 0xE0-0xE7 */ + 0x55C9,0x5712,0x5713,0x585E,0x5851,0x5858,0x5857,0x585A,/* 0xE8-0xEF */ + 0x5854,0x586B,0x584C,0x586D,0x584A,0x5862,0x5852,0x584B,/* 0xF0-0xF7 */ + 0x5967,0x5AC1,0x5AC9,0x5ACC,0x5ABE,0x5ABD,0x5ABC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5AB3,0x5AC2,0x5AB2,0x5D69,0x5D6F,0x5E4C,0x5E79,0x5EC9,/* 0x40-0x47 */ + 0x5EC8,0x5F12,0x5F59,0x5FAC,0x5FAE,0x611A,0x610F,0x6148,/* 0x48-0x4F */ + 0x611F,0x60F3,0x611B,0x60F9,0x6101,0x6108,0x614E,0x614C,/* 0x50-0x57 */ + 0x6144,0x614D,0x613E,0x6134,0x6127,0x610D,0x6106,0x6137,/* 0x58-0x5F */ + 0x6221,0x6222,0x6413,0x643E,0x641E,0x642A,0x642D,0x643D,/* 0x60-0x67 */ + 0x642C,0x640F,0x641C,0x6414,0x640D,0x6436,0x6416,0x6417,/* 0x68-0x6F */ + 0x6406,0x656C,0x659F,0x65B0,0x6697,0x6689,0x6687,0x6688,/* 0x70-0x77 */ + 0x6696,0x6684,0x6698,0x668D,0x6703,0x6994,0x696D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x695A,0x6977,0x6960,0x6954,0x6975,0x6930,0x6982,/* 0xA0-0xA7 */ + 0x694A,0x6968,0x696B,0x695E,0x6953,0x6979,0x6986,0x695D,/* 0xA8-0xAF */ + 0x6963,0x695B,0x6B47,0x6B72,0x6BC0,0x6BBF,0x6BD3,0x6BFD,/* 0xB0-0xB7 */ + 0x6EA2,0x6EAF,0x6ED3,0x6EB6,0x6EC2,0x6E90,0x6E9D,0x6EC7,/* 0xB8-0xBF */ + 0x6EC5,0x6EA5,0x6E98,0x6EBC,0x6EBA,0x6EAB,0x6ED1,0x6E96,/* 0xC0-0xC7 */ + 0x6E9C,0x6EC4,0x6ED4,0x6EAA,0x6EA7,0x6EB4,0x714E,0x7159,/* 0xC8-0xCF */ + 0x7169,0x7164,0x7149,0x7167,0x715C,0x716C,0x7166,0x714C,/* 0xD0-0xD7 */ + 0x7165,0x715E,0x7146,0x7168,0x7156,0x723A,0x7252,0x7337,/* 0xD8-0xDF */ + 0x7345,0x733F,0x733E,0x746F,0x745A,0x7455,0x745F,0x745E,/* 0xE0-0xE7 */ + 0x7441,0x743F,0x7459,0x745B,0x745C,0x7576,0x7578,0x7600,/* 0xE8-0xEF */ + 0x75F0,0x7601,0x75F2,0x75F1,0x75FA,0x75FF,0x75F4,0x75F3,/* 0xF0-0xF7 */ + 0x76DE,0x76DF,0x775B,0x776B,0x7766,0x775E,0x7763,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7779,0x776A,0x776C,0x775C,0x7765,0x7768,0x7762,0x77EE,/* 0x40-0x47 */ + 0x788E,0x78B0,0x7897,0x7898,0x788C,0x7889,0x787C,0x7891,/* 0x48-0x4F */ + 0x7893,0x787F,0x797A,0x797F,0x7981,0x842C,0x79BD,0x7A1C,/* 0x50-0x57 */ + 0x7A1A,0x7A20,0x7A14,0x7A1F,0x7A1E,0x7A9F,0x7AA0,0x7B77,/* 0x58-0x5F */ + 0x7BC0,0x7B60,0x7B6E,0x7B67,0x7CB1,0x7CB3,0x7CB5,0x7D93,/* 0x60-0x67 */ + 0x7D79,0x7D91,0x7D81,0x7D8F,0x7D5B,0x7F6E,0x7F69,0x7F6A,/* 0x68-0x6F */ + 0x7F72,0x7FA9,0x7FA8,0x7FA4,0x8056,0x8058,0x8086,0x8084,/* 0x70-0x77 */ + 0x8171,0x8170,0x8178,0x8165,0x816E,0x8173,0x816B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8179,0x817A,0x8166,0x8205,0x8247,0x8482,0x8477,/* 0xA0-0xA7 */ + 0x843D,0x8431,0x8475,0x8466,0x846B,0x8449,0x846C,0x845B,/* 0xA8-0xAF */ + 0x843C,0x8435,0x8461,0x8463,0x8469,0x846D,0x8446,0x865E,/* 0xB0-0xB7 */ + 0x865C,0x865F,0x86F9,0x8713,0x8708,0x8707,0x8700,0x86FE,/* 0xB8-0xBF */ + 0x86FB,0x8702,0x8703,0x8706,0x870A,0x8859,0x88DF,0x88D4,/* 0xC0-0xC7 */ + 0x88D9,0x88DC,0x88D8,0x88DD,0x88E1,0x88CA,0x88D5,0x88D2,/* 0xC8-0xCF */ + 0x899C,0x89E3,0x8A6B,0x8A72,0x8A73,0x8A66,0x8A69,0x8A70,/* 0xD0-0xD7 */ + 0x8A87,0x8A7C,0x8A63,0x8AA0,0x8A71,0x8A85,0x8A6D,0x8A62,/* 0xD8-0xDF */ + 0x8A6E,0x8A6C,0x8A79,0x8A7B,0x8A3E,0x8A68,0x8C62,0x8C8A,/* 0xE0-0xE7 */ + 0x8C89,0x8CCA,0x8CC7,0x8CC8,0x8CC4,0x8CB2,0x8CC3,0x8CC2,/* 0xE8-0xEF */ + 0x8CC5,0x8DE1,0x8DDF,0x8DE8,0x8DEF,0x8DF3,0x8DFA,0x8DEA,/* 0xF0-0xF7 */ + 0x8DE4,0x8DE6,0x8EB2,0x8F03,0x8F09,0x8EFE,0x8F0A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F9F,0x8FB2,0x904B,0x904A,0x9053,0x9042,0x9054,0x903C,/* 0x40-0x47 */ + 0x9055,0x9050,0x9047,0x904F,0x904E,0x904D,0x9051,0x903E,/* 0x48-0x4F */ + 0x9041,0x9112,0x9117,0x916C,0x916A,0x9169,0x91C9,0x9237,/* 0x50-0x57 */ + 0x9257,0x9238,0x923D,0x9240,0x923E,0x925B,0x924B,0x9264,/* 0x58-0x5F */ + 0x9251,0x9234,0x9249,0x924D,0x9245,0x9239,0x923F,0x925A,/* 0x60-0x67 */ + 0x9598,0x9698,0x9694,0x9695,0x96CD,0x96CB,0x96C9,0x96CA,/* 0x68-0x6F */ + 0x96F7,0x96FB,0x96F9,0x96F6,0x9756,0x9774,0x9776,0x9810,/* 0x70-0x77 */ + 0x9811,0x9813,0x980A,0x9812,0x980C,0x98FC,0x98F4,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x98FD,0x98FE,0x99B3,0x99B1,0x99B4,0x9AE1,0x9CE9,/* 0xA0-0xA7 */ + 0x9E82,0x9F0E,0x9F13,0x9F20,0x50E7,0x50EE,0x50E5,0x50D6,/* 0xA8-0xAF */ + 0x50ED,0x50DA,0x50D5,0x50CF,0x50D1,0x50F1,0x50CE,0x50E9,/* 0xB0-0xB7 */ + 0x5162,0x51F3,0x5283,0x5282,0x5331,0x53AD,0x55FE,0x5600,/* 0xB8-0xBF */ + 0x561B,0x5617,0x55FD,0x5614,0x5606,0x5609,0x560D,0x560E,/* 0xC0-0xC7 */ + 0x55F7,0x5616,0x561F,0x5608,0x5610,0x55F6,0x5718,0x5716,/* 0xC8-0xCF */ + 0x5875,0x587E,0x5883,0x5893,0x588A,0x5879,0x5885,0x587D,/* 0xD0-0xD7 */ + 0x58FD,0x5925,0x5922,0x5924,0x596A,0x5969,0x5AE1,0x5AE6,/* 0xD8-0xDF */ + 0x5AE9,0x5AD7,0x5AD6,0x5AD8,0x5AE3,0x5B75,0x5BDE,0x5BE7,/* 0xE0-0xE7 */ + 0x5BE1,0x5BE5,0x5BE6,0x5BE8,0x5BE2,0x5BE4,0x5BDF,0x5C0D,/* 0xE8-0xEF */ + 0x5C62,0x5D84,0x5D87,0x5E5B,0x5E63,0x5E55,0x5E57,0x5E54,/* 0xF0-0xF7 */ + 0x5ED3,0x5ED6,0x5F0A,0x5F46,0x5F70,0x5FB9,0x6147,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x613F,0x614B,0x6177,0x6162,0x6163,0x615F,0x615A,0x6158,/* 0x40-0x47 */ + 0x6175,0x622A,0x6487,0x6458,0x6454,0x64A4,0x6478,0x645F,/* 0x48-0x4F */ + 0x647A,0x6451,0x6467,0x6434,0x646D,0x647B,0x6572,0x65A1,/* 0x50-0x57 */ + 0x65D7,0x65D6,0x66A2,0x66A8,0x669D,0x699C,0x69A8,0x6995,/* 0x58-0x5F */ + 0x69C1,0x69AE,0x69D3,0x69CB,0x699B,0x69B7,0x69BB,0x69AB,/* 0x60-0x67 */ + 0x69B4,0x69D0,0x69CD,0x69AD,0x69CC,0x69A6,0x69C3,0x69A3,/* 0x68-0x6F */ + 0x6B49,0x6B4C,0x6C33,0x6F33,0x6F14,0x6EFE,0x6F13,0x6EF4,/* 0x70-0x77 */ + 0x6F29,0x6F3E,0x6F20,0x6F2C,0x6F0F,0x6F02,0x6F22,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6EFF,0x6EEF,0x6F06,0x6F31,0x6F38,0x6F32,0x6F23,/* 0xA0-0xA7 */ + 0x6F15,0x6F2B,0x6F2F,0x6F88,0x6F2A,0x6EEC,0x6F01,0x6EF2,/* 0xA8-0xAF */ + 0x6ECC,0x6EF7,0x7194,0x7199,0x717D,0x718A,0x7184,0x7192,/* 0xB0-0xB7 */ + 0x723E,0x7292,0x7296,0x7344,0x7350,0x7464,0x7463,0x746A,/* 0xB8-0xBF */ + 0x7470,0x746D,0x7504,0x7591,0x7627,0x760D,0x760B,0x7609,/* 0xC0-0xC7 */ + 0x7613,0x76E1,0x76E3,0x7784,0x777D,0x777F,0x7761,0x78C1,/* 0xC8-0xCF */ + 0x789F,0x78A7,0x78B3,0x78A9,0x78A3,0x798E,0x798F,0x798D,/* 0xD0-0xD7 */ + 0x7A2E,0x7A31,0x7AAA,0x7AA9,0x7AED,0x7AEF,0x7BA1,0x7B95,/* 0xD8-0xDF */ + 0x7B8B,0x7B75,0x7B97,0x7B9D,0x7B94,0x7B8F,0x7BB8,0x7B87,/* 0xE0-0xE7 */ + 0x7B84,0x7CB9,0x7CBD,0x7CBE,0x7DBB,0x7DB0,0x7D9C,0x7DBD,/* 0xE8-0xEF */ + 0x7DBE,0x7DA0,0x7DCA,0x7DB4,0x7DB2,0x7DB1,0x7DBA,0x7DA2,/* 0xF0-0xF7 */ + 0x7DBF,0x7DB5,0x7DB8,0x7DAD,0x7DD2,0x7DC7,0x7DAC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F70,0x7FE0,0x7FE1,0x7FDF,0x805E,0x805A,0x8087,0x8150,/* 0x40-0x47 */ + 0x8180,0x818F,0x8188,0x818A,0x817F,0x8182,0x81E7,0x81FA,/* 0x48-0x4F */ + 0x8207,0x8214,0x821E,0x824B,0x84C9,0x84BF,0x84C6,0x84C4,/* 0x50-0x57 */ + 0x8499,0x849E,0x84B2,0x849C,0x84CB,0x84B8,0x84C0,0x84D3,/* 0x58-0x5F */ + 0x8490,0x84BC,0x84D1,0x84CA,0x873F,0x871C,0x873B,0x8722,/* 0x60-0x67 */ + 0x8725,0x8734,0x8718,0x8755,0x8737,0x8729,0x88F3,0x8902,/* 0x68-0x6F */ + 0x88F4,0x88F9,0x88F8,0x88FD,0x88E8,0x891A,0x88EF,0x8AA6,/* 0x70-0x77 */ + 0x8A8C,0x8A9E,0x8AA3,0x8A8D,0x8AA1,0x8A93,0x8AA4,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8AAA,0x8AA5,0x8AA8,0x8A98,0x8A91,0x8A9A,0x8AA7,/* 0xA0-0xA7 */ + 0x8C6A,0x8C8D,0x8C8C,0x8CD3,0x8CD1,0x8CD2,0x8D6B,0x8D99,/* 0xA8-0xAF */ + 0x8D95,0x8DFC,0x8F14,0x8F12,0x8F15,0x8F13,0x8FA3,0x9060,/* 0xB0-0xB7 */ + 0x9058,0x905C,0x9063,0x9059,0x905E,0x9062,0x905D,0x905B,/* 0xB8-0xBF */ + 0x9119,0x9118,0x911E,0x9175,0x9178,0x9177,0x9174,0x9278,/* 0xC0-0xC7 */ + 0x9280,0x9285,0x9298,0x9296,0x927B,0x9293,0x929C,0x92A8,/* 0xC8-0xCF */ + 0x927C,0x9291,0x95A1,0x95A8,0x95A9,0x95A3,0x95A5,0x95A4,/* 0xD0-0xD7 */ + 0x9699,0x969C,0x969B,0x96CC,0x96D2,0x9700,0x977C,0x9785,/* 0xD8-0xDF */ + 0x97F6,0x9817,0x9818,0x98AF,0x98B1,0x9903,0x9905,0x990C,/* 0xE0-0xE7 */ + 0x9909,0x99C1,0x9AAF,0x9AB0,0x9AE6,0x9B41,0x9B42,0x9CF4,/* 0xE8-0xEF */ + 0x9CF6,0x9CF3,0x9EBC,0x9F3B,0x9F4A,0x5104,0x5100,0x50FB,/* 0xF0-0xF7 */ + 0x50F5,0x50F9,0x5102,0x5108,0x5109,0x5105,0x51DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5287,0x5288,0x5289,0x528D,0x528A,0x52F0,0x53B2,0x562E,/* 0x40-0x47 */ + 0x563B,0x5639,0x5632,0x563F,0x5634,0x5629,0x5653,0x564E,/* 0x48-0x4F */ + 0x5657,0x5674,0x5636,0x562F,0x5630,0x5880,0x589F,0x589E,/* 0x50-0x57 */ + 0x58B3,0x589C,0x58AE,0x58A9,0x58A6,0x596D,0x5B09,0x5AFB,/* 0x58-0x5F */ + 0x5B0B,0x5AF5,0x5B0C,0x5B08,0x5BEE,0x5BEC,0x5BE9,0x5BEB,/* 0x60-0x67 */ + 0x5C64,0x5C65,0x5D9D,0x5D94,0x5E62,0x5E5F,0x5E61,0x5EE2,/* 0x68-0x6F */ + 0x5EDA,0x5EDF,0x5EDD,0x5EE3,0x5EE0,0x5F48,0x5F71,0x5FB7,/* 0x70-0x77 */ + 0x5FB5,0x6176,0x6167,0x616E,0x615D,0x6155,0x6182,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x617C,0x6170,0x616B,0x617E,0x61A7,0x6190,0x61AB,/* 0xA0-0xA7 */ + 0x618E,0x61AC,0x619A,0x61A4,0x6194,0x61AE,0x622E,0x6469,/* 0xA8-0xAF */ + 0x646F,0x6479,0x649E,0x64B2,0x6488,0x6490,0x64B0,0x64A5,/* 0xB0-0xB7 */ + 0x6493,0x6495,0x64A9,0x6492,0x64AE,0x64AD,0x64AB,0x649A,/* 0xB8-0xBF */ + 0x64AC,0x6499,0x64A2,0x64B3,0x6575,0x6577,0x6578,0x66AE,/* 0xC0-0xC7 */ + 0x66AB,0x66B4,0x66B1,0x6A23,0x6A1F,0x69E8,0x6A01,0x6A1E,/* 0xC8-0xCF */ + 0x6A19,0x69FD,0x6A21,0x6A13,0x6A0A,0x69F3,0x6A02,0x6A05,/* 0xD0-0xD7 */ + 0x69ED,0x6A11,0x6B50,0x6B4E,0x6BA4,0x6BC5,0x6BC6,0x6F3F,/* 0xD8-0xDF */ + 0x6F7C,0x6F84,0x6F51,0x6F66,0x6F54,0x6F86,0x6F6D,0x6F5B,/* 0xE0-0xE7 */ + 0x6F78,0x6F6E,0x6F8E,0x6F7A,0x6F70,0x6F64,0x6F97,0x6F58,/* 0xE8-0xEF */ + 0x6ED5,0x6F6F,0x6F60,0x6F5F,0x719F,0x71AC,0x71B1,0x71A8,/* 0xF0-0xF7 */ + 0x7256,0x729B,0x734E,0x7357,0x7469,0x748B,0x7483,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x747E,0x7480,0x757F,0x7620,0x7629,0x761F,0x7624,0x7626,/* 0x40-0x47 */ + 0x7621,0x7622,0x769A,0x76BA,0x76E4,0x778E,0x7787,0x778C,/* 0x48-0x4F */ + 0x7791,0x778B,0x78CB,0x78C5,0x78BA,0x78CA,0x78BE,0x78D5,/* 0x50-0x57 */ + 0x78BC,0x78D0,0x7A3F,0x7A3C,0x7A40,0x7A3D,0x7A37,0x7A3B,/* 0x58-0x5F */ + 0x7AAF,0x7AAE,0x7BAD,0x7BB1,0x7BC4,0x7BB4,0x7BC6,0x7BC7,/* 0x60-0x67 */ + 0x7BC1,0x7BA0,0x7BCC,0x7CCA,0x7DE0,0x7DF4,0x7DEF,0x7DFB,/* 0x68-0x6F */ + 0x7DD8,0x7DEC,0x7DDD,0x7DE8,0x7DE3,0x7DDA,0x7DDE,0x7DE9,/* 0x70-0x77 */ + 0x7D9E,0x7DD9,0x7DF2,0x7DF9,0x7F75,0x7F77,0x7FAF,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7FE9,0x8026,0x819B,0x819C,0x819D,0x81A0,0x819A,/* 0xA0-0xA7 */ + 0x8198,0x8517,0x853D,0x851A,0x84EE,0x852C,0x852D,0x8513,/* 0xA8-0xAF */ + 0x8511,0x8523,0x8521,0x8514,0x84EC,0x8525,0x84FF,0x8506,/* 0xB0-0xB7 */ + 0x8782,0x8774,0x8776,0x8760,0x8766,0x8778,0x8768,0x8759,/* 0xB8-0xBF */ + 0x8757,0x874C,0x8753,0x885B,0x885D,0x8910,0x8907,0x8912,/* 0xC0-0xC7 */ + 0x8913,0x8915,0x890A,0x8ABC,0x8AD2,0x8AC7,0x8AC4,0x8A95,/* 0xC8-0xCF */ + 0x8ACB,0x8AF8,0x8AB2,0x8AC9,0x8AC2,0x8ABF,0x8AB0,0x8AD6,/* 0xD0-0xD7 */ + 0x8ACD,0x8AB6,0x8AB9,0x8ADB,0x8C4C,0x8C4E,0x8C6C,0x8CE0,/* 0xD8-0xDF */ + 0x8CDE,0x8CE6,0x8CE4,0x8CEC,0x8CED,0x8CE2,0x8CE3,0x8CDC,/* 0xE0-0xE7 */ + 0x8CEA,0x8CE1,0x8D6D,0x8D9F,0x8DA3,0x8E2B,0x8E10,0x8E1D,/* 0xE8-0xEF */ + 0x8E22,0x8E0F,0x8E29,0x8E1F,0x8E21,0x8E1E,0x8EBA,0x8F1D,/* 0xF0-0xF7 */ + 0x8F1B,0x8F1F,0x8F29,0x8F26,0x8F2A,0x8F1C,0x8F1E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F25,0x9069,0x906E,0x9068,0x906D,0x9077,0x9130,0x912D,/* 0x40-0x47 */ + 0x9127,0x9131,0x9187,0x9189,0x918B,0x9183,0x92C5,0x92BB,/* 0x48-0x4F */ + 0x92B7,0x92EA,0x92AC,0x92E4,0x92C1,0x92B3,0x92BC,0x92D2,/* 0x50-0x57 */ + 0x92C7,0x92F0,0x92B2,0x95AD,0x95B1,0x9704,0x9706,0x9707,/* 0x58-0x5F */ + 0x9709,0x9760,0x978D,0x978B,0x978F,0x9821,0x982B,0x981C,/* 0x60-0x67 */ + 0x98B3,0x990A,0x9913,0x9912,0x9918,0x99DD,0x99D0,0x99DF,/* 0x68-0x6F */ + 0x99DB,0x99D1,0x99D5,0x99D2,0x99D9,0x9AB7,0x9AEE,0x9AEF,/* 0x70-0x77 */ + 0x9B27,0x9B45,0x9B44,0x9B77,0x9B6F,0x9D06,0x9D09,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9D03,0x9EA9,0x9EBE,0x9ECE,0x58A8,0x9F52,0x5112,/* 0xA0-0xA7 */ + 0x5118,0x5114,0x5110,0x5115,0x5180,0x51AA,0x51DD,0x5291,/* 0xA8-0xAF */ + 0x5293,0x52F3,0x5659,0x566B,0x5679,0x5669,0x5664,0x5678,/* 0xB0-0xB7 */ + 0x566A,0x5668,0x5665,0x5671,0x566F,0x566C,0x5662,0x5676,/* 0xB8-0xBF */ + 0x58C1,0x58BE,0x58C7,0x58C5,0x596E,0x5B1D,0x5B34,0x5B78,/* 0xC0-0xC7 */ + 0x5BF0,0x5C0E,0x5F4A,0x61B2,0x6191,0x61A9,0x618A,0x61CD,/* 0xC8-0xCF */ + 0x61B6,0x61BE,0x61CA,0x61C8,0x6230,0x64C5,0x64C1,0x64CB,/* 0xD0-0xD7 */ + 0x64BB,0x64BC,0x64DA,0x64C4,0x64C7,0x64C2,0x64CD,0x64BF,/* 0xD8-0xDF */ + 0x64D2,0x64D4,0x64BE,0x6574,0x66C6,0x66C9,0x66B9,0x66C4,/* 0xE0-0xE7 */ + 0x66C7,0x66B8,0x6A3D,0x6A38,0x6A3A,0x6A59,0x6A6B,0x6A58,/* 0xE8-0xEF */ + 0x6A39,0x6A44,0x6A62,0x6A61,0x6A4B,0x6A47,0x6A35,0x6A5F,/* 0xF0-0xF7 */ + 0x6A48,0x6B59,0x6B77,0x6C05,0x6FC2,0x6FB1,0x6FA1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FC3,0x6FA4,0x6FC1,0x6FA7,0x6FB3,0x6FC0,0x6FB9,0x6FB6,/* 0x40-0x47 */ + 0x6FA6,0x6FA0,0x6FB4,0x71BE,0x71C9,0x71D0,0x71D2,0x71C8,/* 0x48-0x4F */ + 0x71D5,0x71B9,0x71CE,0x71D9,0x71DC,0x71C3,0x71C4,0x7368,/* 0x50-0x57 */ + 0x749C,0x74A3,0x7498,0x749F,0x749E,0x74E2,0x750C,0x750D,/* 0x58-0x5F */ + 0x7634,0x7638,0x763A,0x76E7,0x76E5,0x77A0,0x779E,0x779F,/* 0x60-0x67 */ + 0x77A5,0x78E8,0x78DA,0x78EC,0x78E7,0x79A6,0x7A4D,0x7A4E,/* 0x68-0x6F */ + 0x7A46,0x7A4C,0x7A4B,0x7ABA,0x7BD9,0x7C11,0x7BC9,0x7BE4,/* 0x70-0x77 */ + 0x7BDB,0x7BE1,0x7BE9,0x7BE6,0x7CD5,0x7CD6,0x7E0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7E11,0x7E08,0x7E1B,0x7E23,0x7E1E,0x7E1D,0x7E09,/* 0xA0-0xA7 */ + 0x7E10,0x7F79,0x7FB2,0x7FF0,0x7FF1,0x7FEE,0x8028,0x81B3,/* 0xA8-0xAF */ + 0x81A9,0x81A8,0x81FB,0x8208,0x8258,0x8259,0x854A,0x8559,/* 0xB0-0xB7 */ + 0x8548,0x8568,0x8569,0x8543,0x8549,0x856D,0x856A,0x855E,/* 0xB8-0xBF */ + 0x8783,0x879F,0x879E,0x87A2,0x878D,0x8861,0x892A,0x8932,/* 0xC0-0xC7 */ + 0x8925,0x892B,0x8921,0x89AA,0x89A6,0x8AE6,0x8AFA,0x8AEB,/* 0xC8-0xCF */ + 0x8AF1,0x8B00,0x8ADC,0x8AE7,0x8AEE,0x8AFE,0x8B01,0x8B02,/* 0xD0-0xD7 */ + 0x8AF7,0x8AED,0x8AF3,0x8AF6,0x8AFC,0x8C6B,0x8C6D,0x8C93,/* 0xD8-0xDF */ + 0x8CF4,0x8E44,0x8E31,0x8E34,0x8E42,0x8E39,0x8E35,0x8F3B,/* 0xE0-0xE7 */ + 0x8F2F,0x8F38,0x8F33,0x8FA8,0x8FA6,0x9075,0x9074,0x9078,/* 0xE8-0xEF */ + 0x9072,0x907C,0x907A,0x9134,0x9192,0x9320,0x9336,0x92F8,/* 0xF0-0xF7 */ + 0x9333,0x932F,0x9322,0x92FC,0x932B,0x9304,0x931A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9310,0x9326,0x9321,0x9315,0x932E,0x9319,0x95BB,0x96A7,/* 0x40-0x47 */ + 0x96A8,0x96AA,0x96D5,0x970E,0x9711,0x9716,0x970D,0x9713,/* 0x48-0x4F */ + 0x970F,0x975B,0x975C,0x9766,0x9798,0x9830,0x9838,0x983B,/* 0x50-0x57 */ + 0x9837,0x982D,0x9839,0x9824,0x9910,0x9928,0x991E,0x991B,/* 0x58-0x5F */ + 0x9921,0x991A,0x99ED,0x99E2,0x99F1,0x9AB8,0x9ABC,0x9AFB,/* 0x60-0x67 */ + 0x9AED,0x9B28,0x9B91,0x9D15,0x9D23,0x9D26,0x9D28,0x9D12,/* 0x68-0x6F */ + 0x9D1B,0x9ED8,0x9ED4,0x9F8D,0x9F9C,0x512A,0x511F,0x5121,/* 0x70-0x77 */ + 0x5132,0x52F5,0x568E,0x5680,0x5690,0x5685,0x5687,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x568F,0x58D5,0x58D3,0x58D1,0x58CE,0x5B30,0x5B2A,/* 0xA0-0xA7 */ + 0x5B24,0x5B7A,0x5C37,0x5C68,0x5DBC,0x5DBA,0x5DBD,0x5DB8,/* 0xA8-0xAF */ + 0x5E6B,0x5F4C,0x5FBD,0x61C9,0x61C2,0x61C7,0x61E6,0x61CB,/* 0xB0-0xB7 */ + 0x6232,0x6234,0x64CE,0x64CA,0x64D8,0x64E0,0x64F0,0x64E6,/* 0xB8-0xBF */ + 0x64EC,0x64F1,0x64E2,0x64ED,0x6582,0x6583,0x66D9,0x66D6,/* 0xC0-0xC7 */ + 0x6A80,0x6A94,0x6A84,0x6AA2,0x6A9C,0x6ADB,0x6AA3,0x6A7E,/* 0xC8-0xCF */ + 0x6A97,0x6A90,0x6AA0,0x6B5C,0x6BAE,0x6BDA,0x6C08,0x6FD8,/* 0xD0-0xD7 */ + 0x6FF1,0x6FDF,0x6FE0,0x6FDB,0x6FE4,0x6FEB,0x6FEF,0x6F80,/* 0xD8-0xDF */ + 0x6FEC,0x6FE1,0x6FE9,0x6FD5,0x6FEE,0x6FF0,0x71E7,0x71DF,/* 0xE0-0xE7 */ + 0x71EE,0x71E6,0x71E5,0x71ED,0x71EC,0x71F4,0x71E0,0x7235,/* 0xE8-0xEF */ + 0x7246,0x7370,0x7372,0x74A9,0x74B0,0x74A6,0x74A8,0x7646,/* 0xF0-0xF7 */ + 0x7642,0x764C,0x76EA,0x77B3,0x77AA,0x77B0,0x77AC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x77A7,0x77AD,0x77EF,0x78F7,0x78FA,0x78F4,0x78EF,0x7901,/* 0x40-0x47 */ + 0x79A7,0x79AA,0x7A57,0x7ABF,0x7C07,0x7C0D,0x7BFE,0x7BF7,/* 0x48-0x4F */ + 0x7C0C,0x7BE0,0x7CE0,0x7CDC,0x7CDE,0x7CE2,0x7CDF,0x7CD9,/* 0x50-0x57 */ + 0x7CDD,0x7E2E,0x7E3E,0x7E46,0x7E37,0x7E32,0x7E43,0x7E2B,/* 0x58-0x5F */ + 0x7E3D,0x7E31,0x7E45,0x7E41,0x7E34,0x7E39,0x7E48,0x7E35,/* 0x60-0x67 */ + 0x7E3F,0x7E2F,0x7F44,0x7FF3,0x7FFC,0x8071,0x8072,0x8070,/* 0x68-0x6F */ + 0x806F,0x8073,0x81C6,0x81C3,0x81BA,0x81C2,0x81C0,0x81BF,/* 0x70-0x77 */ + 0x81BD,0x81C9,0x81BE,0x81E8,0x8209,0x8271,0x85AA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8584,0x857E,0x859C,0x8591,0x8594,0x85AF,0x859B,/* 0xA0-0xA7 */ + 0x8587,0x85A8,0x858A,0x8667,0x87C0,0x87D1,0x87B3,0x87D2,/* 0xA8-0xAF */ + 0x87C6,0x87AB,0x87BB,0x87BA,0x87C8,0x87CB,0x893B,0x8936,/* 0xB0-0xB7 */ + 0x8944,0x8938,0x893D,0x89AC,0x8B0E,0x8B17,0x8B19,0x8B1B,/* 0xB8-0xBF */ + 0x8B0A,0x8B20,0x8B1D,0x8B04,0x8B10,0x8C41,0x8C3F,0x8C73,/* 0xC0-0xC7 */ + 0x8CFA,0x8CFD,0x8CFC,0x8CF8,0x8CFB,0x8DA8,0x8E49,0x8E4B,/* 0xC8-0xCF */ + 0x8E48,0x8E4A,0x8F44,0x8F3E,0x8F42,0x8F45,0x8F3F,0x907F,/* 0xD0-0xD7 */ + 0x907D,0x9084,0x9081,0x9082,0x9080,0x9139,0x91A3,0x919E,/* 0xD8-0xDF */ + 0x919C,0x934D,0x9382,0x9328,0x9375,0x934A,0x9365,0x934B,/* 0xE0-0xE7 */ + 0x9318,0x937E,0x936C,0x935B,0x9370,0x935A,0x9354,0x95CA,/* 0xE8-0xEF */ + 0x95CB,0x95CC,0x95C8,0x95C6,0x96B1,0x96B8,0x96D6,0x971C,/* 0xF0-0xF7 */ + 0x971E,0x97A0,0x97D3,0x9846,0x98B6,0x9935,0x9A01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x99FF,0x9BAE,0x9BAB,0x9BAA,0x9BAD,0x9D3B,0x9D3F,0x9E8B,/* 0x40-0x47 */ + 0x9ECF,0x9EDE,0x9EDC,0x9EDD,0x9EDB,0x9F3E,0x9F4B,0x53E2,/* 0x48-0x4F */ + 0x5695,0x56AE,0x58D9,0x58D8,0x5B38,0x5F5D,0x61E3,0x6233,/* 0x50-0x57 */ + 0x64F4,0x64F2,0x64FE,0x6506,0x64FA,0x64FB,0x64F7,0x65B7,/* 0x58-0x5F */ + 0x66DC,0x6726,0x6AB3,0x6AAC,0x6AC3,0x6ABB,0x6AB8,0x6AC2,/* 0x60-0x67 */ + 0x6AAE,0x6AAF,0x6B5F,0x6B78,0x6BAF,0x7009,0x700B,0x6FFE,/* 0x68-0x6F */ + 0x7006,0x6FFA,0x7011,0x700F,0x71FB,0x71FC,0x71FE,0x71F8,/* 0x70-0x77 */ + 0x7377,0x7375,0x74A7,0x74BF,0x7515,0x7656,0x7658,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7652,0x77BD,0x77BF,0x77BB,0x77BC,0x790E,0x79AE,/* 0xA0-0xA7 */ + 0x7A61,0x7A62,0x7A60,0x7AC4,0x7AC5,0x7C2B,0x7C27,0x7C2A,/* 0xA8-0xAF */ + 0x7C1E,0x7C23,0x7C21,0x7CE7,0x7E54,0x7E55,0x7E5E,0x7E5A,/* 0xB0-0xB7 */ + 0x7E61,0x7E52,0x7E59,0x7F48,0x7FF9,0x7FFB,0x8077,0x8076,/* 0xB8-0xBF */ + 0x81CD,0x81CF,0x820A,0x85CF,0x85A9,0x85CD,0x85D0,0x85C9,/* 0xC0-0xC7 */ + 0x85B0,0x85BA,0x85B9,0x85A6,0x87EF,0x87EC,0x87F2,0x87E0,/* 0xC8-0xCF */ + 0x8986,0x89B2,0x89F4,0x8B28,0x8B39,0x8B2C,0x8B2B,0x8C50,/* 0xD0-0xD7 */ + 0x8D05,0x8E59,0x8E63,0x8E66,0x8E64,0x8E5F,0x8E55,0x8EC0,/* 0xD8-0xDF */ + 0x8F49,0x8F4D,0x9087,0x9083,0x9088,0x91AB,0x91AC,0x91D0,/* 0xE0-0xE7 */ + 0x9394,0x938A,0x9396,0x93A2,0x93B3,0x93AE,0x93AC,0x93B0,/* 0xE8-0xEF */ + 0x9398,0x939A,0x9397,0x95D4,0x95D6,0x95D0,0x95D5,0x96E2,/* 0xF0-0xF7 */ + 0x96DC,0x96D9,0x96DB,0x96DE,0x9724,0x97A3,0x97A6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x97AD,0x97F9,0x984D,0x984F,0x984C,0x984E,0x9853,0x98BA,/* 0x40-0x47 */ + 0x993E,0x993F,0x993D,0x992E,0x99A5,0x9A0E,0x9AC1,0x9B03,/* 0x48-0x4F */ + 0x9B06,0x9B4F,0x9B4E,0x9B4D,0x9BCA,0x9BC9,0x9BFD,0x9BC8,/* 0x50-0x57 */ + 0x9BC0,0x9D51,0x9D5D,0x9D60,0x9EE0,0x9F15,0x9F2C,0x5133,/* 0x58-0x5F */ + 0x56A5,0x58DE,0x58DF,0x58E2,0x5BF5,0x9F90,0x5EEC,0x61F2,/* 0x60-0x67 */ + 0x61F7,0x61F6,0x61F5,0x6500,0x650F,0x66E0,0x66DD,0x6AE5,/* 0x68-0x6F */ + 0x6ADD,0x6ADA,0x6AD3,0x701B,0x701F,0x7028,0x701A,0x701D,/* 0x70-0x77 */ + 0x7015,0x7018,0x7206,0x720D,0x7258,0x72A2,0x7378,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x737A,0x74BD,0x74CA,0x74E3,0x7587,0x7586,0x765F,/* 0xA0-0xA7 */ + 0x7661,0x77C7,0x7919,0x79B1,0x7A6B,0x7A69,0x7C3E,0x7C3F,/* 0xA8-0xAF */ + 0x7C38,0x7C3D,0x7C37,0x7C40,0x7E6B,0x7E6D,0x7E79,0x7E69,/* 0xB0-0xB7 */ + 0x7E6A,0x7F85,0x7E73,0x7FB6,0x7FB9,0x7FB8,0x81D8,0x85E9,/* 0xB8-0xBF */ + 0x85DD,0x85EA,0x85D5,0x85E4,0x85E5,0x85F7,0x87FB,0x8805,/* 0xC0-0xC7 */ + 0x880D,0x87F9,0x87FE,0x8960,0x895F,0x8956,0x895E,0x8B41,/* 0xC8-0xCF */ + 0x8B5C,0x8B58,0x8B49,0x8B5A,0x8B4E,0x8B4F,0x8B46,0x8B59,/* 0xD0-0xD7 */ + 0x8D08,0x8D0A,0x8E7C,0x8E72,0x8E87,0x8E76,0x8E6C,0x8E7A,/* 0xD8-0xDF */ + 0x8E74,0x8F54,0x8F4E,0x8FAD,0x908A,0x908B,0x91B1,0x91AE,/* 0xE0-0xE7 */ + 0x93E1,0x93D1,0x93DF,0x93C3,0x93C8,0x93DC,0x93DD,0x93D6,/* 0xE8-0xEF */ + 0x93E2,0x93CD,0x93D8,0x93E4,0x93D7,0x93E8,0x95DC,0x96B4,/* 0xF0-0xF7 */ + 0x96E3,0x972A,0x9727,0x9761,0x97DC,0x97FB,0x985E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9858,0x985B,0x98BC,0x9945,0x9949,0x9A16,0x9A19,0x9B0D,/* 0x40-0x47 */ + 0x9BE8,0x9BE7,0x9BD6,0x9BDB,0x9D89,0x9D61,0x9D72,0x9D6A,/* 0x48-0x4F */ + 0x9D6C,0x9E92,0x9E97,0x9E93,0x9EB4,0x52F8,0x56A8,0x56B7,/* 0x50-0x57 */ + 0x56B6,0x56B4,0x56BC,0x58E4,0x5B40,0x5B43,0x5B7D,0x5BF6,/* 0x58-0x5F */ + 0x5DC9,0x61F8,0x61FA,0x6518,0x6514,0x6519,0x66E6,0x6727,/* 0x60-0x67 */ + 0x6AEC,0x703E,0x7030,0x7032,0x7210,0x737B,0x74CF,0x7662,/* 0x68-0x6F */ + 0x7665,0x7926,0x792A,0x792C,0x792B,0x7AC7,0x7AF6,0x7C4C,/* 0x70-0x77 */ + 0x7C43,0x7C4D,0x7CEF,0x7CF0,0x8FAE,0x7E7D,0x7E7C,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7E82,0x7F4C,0x8000,0x81DA,0x8266,0x85FB,0x85F9,/* 0xA0-0xA7 */ + 0x8611,0x85FA,0x8606,0x860B,0x8607,0x860A,0x8814,0x8815,/* 0xA8-0xAF */ + 0x8964,0x89BA,0x89F8,0x8B70,0x8B6C,0x8B66,0x8B6F,0x8B5F,/* 0xB0-0xB7 */ + 0x8B6B,0x8D0F,0x8D0D,0x8E89,0x8E81,0x8E85,0x8E82,0x91B4,/* 0xB8-0xBF */ + 0x91CB,0x9418,0x9403,0x93FD,0x95E1,0x9730,0x98C4,0x9952,/* 0xC0-0xC7 */ + 0x9951,0x99A8,0x9A2B,0x9A30,0x9A37,0x9A35,0x9C13,0x9C0D,/* 0xC8-0xCF */ + 0x9E79,0x9EB5,0x9EE8,0x9F2F,0x9F5F,0x9F63,0x9F61,0x5137,/* 0xD0-0xD7 */ + 0x5138,0x56C1,0x56C0,0x56C2,0x5914,0x5C6C,0x5DCD,0x61FC,/* 0xD8-0xDF */ + 0x61FE,0x651D,0x651C,0x6595,0x66E9,0x6AFB,0x6B04,0x6AFA,/* 0xE0-0xE7 */ + 0x6BB2,0x704C,0x721B,0x72A7,0x74D6,0x74D4,0x7669,0x77D3,/* 0xE8-0xEF */ + 0x7C50,0x7E8F,0x7E8C,0x7FBC,0x8617,0x862D,0x861A,0x8823,/* 0xF0-0xF7 */ + 0x8822,0x8821,0x881F,0x896A,0x896C,0x89BD,0x8B74,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B77,0x8B7D,0x8D13,0x8E8A,0x8E8D,0x8E8B,0x8F5F,0x8FAF,/* 0x40-0x47 */ + 0x91BA,0x942E,0x9433,0x9435,0x943A,0x9438,0x9432,0x942B,/* 0x48-0x4F */ + 0x95E2,0x9738,0x9739,0x9732,0x97FF,0x9867,0x9865,0x9957,/* 0x50-0x57 */ + 0x9A45,0x9A43,0x9A40,0x9A3E,0x9ACF,0x9B54,0x9B51,0x9C2D,/* 0x58-0x5F */ + 0x9C25,0x9DAF,0x9DB4,0x9DC2,0x9DB8,0x9E9D,0x9EEF,0x9F19,/* 0x60-0x67 */ + 0x9F5C,0x9F66,0x9F67,0x513C,0x513B,0x56C8,0x56CA,0x56C9,/* 0x68-0x6F */ + 0x5B7F,0x5DD4,0x5DD2,0x5F4E,0x61FF,0x6524,0x6B0A,0x6B61,/* 0x70-0x77 */ + 0x7051,0x7058,0x7380,0x74E4,0x758A,0x766E,0x766C,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79B3,0x7C60,0x7C5F,0x807E,0x807D,0x81DF,0x8972,/* 0xA0-0xA7 */ + 0x896F,0x89FC,0x8B80,0x8D16,0x8D17,0x8E91,0x8E93,0x8F61,/* 0xA8-0xAF */ + 0x9148,0x9444,0x9451,0x9452,0x973D,0x973E,0x97C3,0x97C1,/* 0xB0-0xB7 */ + 0x986B,0x9955,0x9A55,0x9A4D,0x9AD2,0x9B1A,0x9C49,0x9C31,/* 0xB8-0xBF */ + 0x9C3E,0x9C3B,0x9DD3,0x9DD7,0x9F34,0x9F6C,0x9F6A,0x9F94,/* 0xC0-0xC7 */ + 0x56CC,0x5DD6,0x6200,0x6523,0x652B,0x652A,0x66EC,0x6B10,/* 0xC8-0xCF */ + 0x74DA,0x7ACA,0x7C64,0x7C63,0x7C65,0x7E93,0x7E96,0x7E94,/* 0xD0-0xD7 */ + 0x81E2,0x8638,0x863F,0x8831,0x8B8A,0x9090,0x908F,0x9463,/* 0xD8-0xDF */ + 0x9460,0x9464,0x9768,0x986F,0x995C,0x9A5A,0x9A5B,0x9A57,/* 0xE0-0xE7 */ + 0x9AD3,0x9AD4,0x9AD1,0x9C54,0x9C57,0x9C56,0x9DE5,0x9E9F,/* 0xE8-0xEF */ + 0x9EF4,0x56D1,0x58E9,0x652C,0x705E,0x7671,0x7672,0x77D7,/* 0xF0-0xF7 */ + 0x7F50,0x7F88,0x8836,0x8839,0x8862,0x8B93,0x8B92,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B96,0x8277,0x8D1B,0x91C0,0x946A,0x9742,0x9748,0x9744,/* 0x40-0x47 */ + 0x97C6,0x9870,0x9A5F,0x9B22,0x9B58,0x9C5F,0x9DF9,0x9DFA,/* 0x48-0x4F */ + 0x9E7C,0x9E7D,0x9F07,0x9F77,0x9F72,0x5EF3,0x6B16,0x7063,/* 0x50-0x57 */ + 0x7C6C,0x7C6E,0x883B,0x89C0,0x8EA1,0x91C1,0x9472,0x9470,/* 0x58-0x5F */ + 0x9871,0x995E,0x9AD6,0x9B23,0x9ECC,0x7064,0x77DA,0x8B9A,/* 0x60-0x67 */ + 0x9477,0x97C9,0x9A62,0x9A65,0x7E9C,0x8B9C,0x8EAA,0x91C5,/* 0x68-0x6F */ + 0x947D,0x947E,0x947C,0x9C77,0x9C78,0x9EF7,0x8C54,0x947F,/* 0x70-0x77 */ + 0x9E1A,0x7228,0x9A6A,0x9B31,0x9E1B,0x9E1E,0x7C72,0x0000,/* 0x78-0x7F */ +}; + +static const wchar_t c2u_C9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E42,0x4E5C,0x51F5,0x531A,0x5382,0x4E07,0x4E0C,0x4E47,/* 0x40-0x47 */ + 0x4E8D,0x56D7,0xFA0C,0x5C6E,0x5F73,0x4E0F,0x5187,0x4E0E,/* 0x48-0x4F */ + 0x4E2E,0x4E93,0x4EC2,0x4EC9,0x4EC8,0x5198,0x52FC,0x536C,/* 0x50-0x57 */ + 0x53B9,0x5720,0x5903,0x592C,0x5C10,0x5DFF,0x65E1,0x6BB3,/* 0x58-0x5F */ + 0x6BCC,0x6C14,0x723F,0x4E31,0x4E3C,0x4EE8,0x4EDC,0x4EE9,/* 0x60-0x67 */ + 0x4EE1,0x4EDD,0x4EDA,0x520C,0x531C,0x534C,0x5722,0x5723,/* 0x68-0x6F */ + 0x5917,0x592F,0x5B81,0x5B84,0x5C12,0x5C3B,0x5C74,0x5C73,/* 0x70-0x77 */ + 0x5E04,0x5E80,0x5E82,0x5FC9,0x6209,0x6250,0x6C15,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6C36,0x6C43,0x6C3F,0x6C3B,0x72AE,0x72B0,0x738A,/* 0xA0-0xA7 */ + 0x79B8,0x808A,0x961E,0x4F0E,0x4F18,0x4F2C,0x4EF5,0x4F14,/* 0xA8-0xAF */ + 0x4EF1,0x4F00,0x4EF7,0x4F08,0x4F1D,0x4F02,0x4F05,0x4F22,/* 0xB0-0xB7 */ + 0x4F13,0x4F04,0x4EF4,0x4F12,0x51B1,0x5213,0x5209,0x5210,/* 0xB8-0xBF */ + 0x52A6,0x5322,0x531F,0x534D,0x538A,0x5407,0x56E1,0x56DF,/* 0xC0-0xC7 */ + 0x572E,0x572A,0x5734,0x593C,0x5980,0x597C,0x5985,0x597B,/* 0xC8-0xCF */ + 0x597E,0x5977,0x597F,0x5B56,0x5C15,0x5C25,0x5C7C,0x5C7A,/* 0xD0-0xD7 */ + 0x5C7B,0x5C7E,0x5DDF,0x5E75,0x5E84,0x5F02,0x5F1A,0x5F74,/* 0xD8-0xDF */ + 0x5FD5,0x5FD4,0x5FCF,0x625C,0x625E,0x6264,0x6261,0x6266,/* 0xE0-0xE7 */ + 0x6262,0x6259,0x6260,0x625A,0x6265,0x65EF,0x65EE,0x673E,/* 0xE8-0xEF */ + 0x6739,0x6738,0x673B,0x673A,0x673F,0x673C,0x6733,0x6C18,/* 0xF0-0xF7 */ + 0x6C46,0x6C52,0x6C5C,0x6C4F,0x6C4A,0x6C54,0x6C4B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6C4C,0x7071,0x725E,0x72B4,0x72B5,0x738E,0x752A,0x767F,/* 0x40-0x47 */ + 0x7A75,0x7F51,0x8278,0x827C,0x8280,0x827D,0x827F,0x864D,/* 0x48-0x4F */ + 0x897E,0x9099,0x9097,0x9098,0x909B,0x9094,0x9622,0x9624,/* 0x50-0x57 */ + 0x9620,0x9623,0x4F56,0x4F3B,0x4F62,0x4F49,0x4F53,0x4F64,/* 0x58-0x5F */ + 0x4F3E,0x4F67,0x4F52,0x4F5F,0x4F41,0x4F58,0x4F2D,0x4F33,/* 0x60-0x67 */ + 0x4F3F,0x4F61,0x518F,0x51B9,0x521C,0x521E,0x5221,0x52AD,/* 0x68-0x6F */ + 0x52AE,0x5309,0x5363,0x5372,0x538E,0x538F,0x5430,0x5437,/* 0x70-0x77 */ + 0x542A,0x5454,0x5445,0x5419,0x541C,0x5425,0x5418,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x543D,0x544F,0x5441,0x5428,0x5424,0x5447,0x56EE,/* 0xA0-0xA7 */ + 0x56E7,0x56E5,0x5741,0x5745,0x574C,0x5749,0x574B,0x5752,/* 0xA8-0xAF */ + 0x5906,0x5940,0x59A6,0x5998,0x59A0,0x5997,0x598E,0x59A2,/* 0xB0-0xB7 */ + 0x5990,0x598F,0x59A7,0x59A1,0x5B8E,0x5B92,0x5C28,0x5C2A,/* 0xB8-0xBF */ + 0x5C8D,0x5C8F,0x5C88,0x5C8B,0x5C89,0x5C92,0x5C8A,0x5C86,/* 0xC0-0xC7 */ + 0x5C93,0x5C95,0x5DE0,0x5E0A,0x5E0E,0x5E8B,0x5E89,0x5E8C,/* 0xC8-0xCF */ + 0x5E88,0x5E8D,0x5F05,0x5F1D,0x5F78,0x5F76,0x5FD2,0x5FD1,/* 0xD0-0xD7 */ + 0x5FD0,0x5FED,0x5FE8,0x5FEE,0x5FF3,0x5FE1,0x5FE4,0x5FE3,/* 0xD8-0xDF */ + 0x5FFA,0x5FEF,0x5FF7,0x5FFB,0x6000,0x5FF4,0x623A,0x6283,/* 0xE0-0xE7 */ + 0x628C,0x628E,0x628F,0x6294,0x6287,0x6271,0x627B,0x627A,/* 0xE8-0xEF */ + 0x6270,0x6281,0x6288,0x6277,0x627D,0x6272,0x6274,0x6537,/* 0xF0-0xF7 */ + 0x65F0,0x65F4,0x65F3,0x65F2,0x65F5,0x6745,0x6747,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6759,0x6755,0x674C,0x6748,0x675D,0x674D,0x675A,0x674B,/* 0x40-0x47 */ + 0x6BD0,0x6C19,0x6C1A,0x6C78,0x6C67,0x6C6B,0x6C84,0x6C8B,/* 0x48-0x4F */ + 0x6C8F,0x6C71,0x6C6F,0x6C69,0x6C9A,0x6C6D,0x6C87,0x6C95,/* 0x50-0x57 */ + 0x6C9C,0x6C66,0x6C73,0x6C65,0x6C7B,0x6C8E,0x7074,0x707A,/* 0x58-0x5F */ + 0x7263,0x72BF,0x72BD,0x72C3,0x72C6,0x72C1,0x72BA,0x72C5,/* 0x60-0x67 */ + 0x7395,0x7397,0x7393,0x7394,0x7392,0x753A,0x7539,0x7594,/* 0x68-0x6F */ + 0x7595,0x7681,0x793D,0x8034,0x8095,0x8099,0x8090,0x8092,/* 0x70-0x77 */ + 0x809C,0x8290,0x828F,0x8285,0x828E,0x8291,0x8293,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x828A,0x8283,0x8284,0x8C78,0x8FC9,0x8FBF,0x909F,/* 0xA0-0xA7 */ + 0x90A1,0x90A5,0x909E,0x90A7,0x90A0,0x9630,0x9628,0x962F,/* 0xA8-0xAF */ + 0x962D,0x4E33,0x4F98,0x4F7C,0x4F85,0x4F7D,0x4F80,0x4F87,/* 0xB0-0xB7 */ + 0x4F76,0x4F74,0x4F89,0x4F84,0x4F77,0x4F4C,0x4F97,0x4F6A,/* 0xB8-0xBF */ + 0x4F9A,0x4F79,0x4F81,0x4F78,0x4F90,0x4F9C,0x4F94,0x4F9E,/* 0xC0-0xC7 */ + 0x4F92,0x4F82,0x4F95,0x4F6B,0x4F6E,0x519E,0x51BC,0x51BE,/* 0xC8-0xCF */ + 0x5235,0x5232,0x5233,0x5246,0x5231,0x52BC,0x530A,0x530B,/* 0xD0-0xD7 */ + 0x533C,0x5392,0x5394,0x5487,0x547F,0x5481,0x5491,0x5482,/* 0xD8-0xDF */ + 0x5488,0x546B,0x547A,0x547E,0x5465,0x546C,0x5474,0x5466,/* 0xE0-0xE7 */ + 0x548D,0x546F,0x5461,0x5460,0x5498,0x5463,0x5467,0x5464,/* 0xE8-0xEF */ + 0x56F7,0x56F9,0x576F,0x5772,0x576D,0x576B,0x5771,0x5770,/* 0xF0-0xF7 */ + 0x5776,0x5780,0x5775,0x577B,0x5773,0x5774,0x5762,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5768,0x577D,0x590C,0x5945,0x59B5,0x59BA,0x59CF,0x59CE,/* 0x40-0x47 */ + 0x59B2,0x59CC,0x59C1,0x59B6,0x59BC,0x59C3,0x59D6,0x59B1,/* 0x48-0x4F */ + 0x59BD,0x59C0,0x59C8,0x59B4,0x59C7,0x5B62,0x5B65,0x5B93,/* 0x50-0x57 */ + 0x5B95,0x5C44,0x5C47,0x5CAE,0x5CA4,0x5CA0,0x5CB5,0x5CAF,/* 0x58-0x5F */ + 0x5CA8,0x5CAC,0x5C9F,0x5CA3,0x5CAD,0x5CA2,0x5CAA,0x5CA7,/* 0x60-0x67 */ + 0x5C9D,0x5CA5,0x5CB6,0x5CB0,0x5CA6,0x5E17,0x5E14,0x5E19,/* 0x68-0x6F */ + 0x5F28,0x5F22,0x5F23,0x5F24,0x5F54,0x5F82,0x5F7E,0x5F7D,/* 0x70-0x77 */ + 0x5FDE,0x5FE5,0x602D,0x6026,0x6019,0x6032,0x600B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6034,0x600A,0x6017,0x6033,0x601A,0x601E,0x602C,/* 0xA0-0xA7 */ + 0x6022,0x600D,0x6010,0x602E,0x6013,0x6011,0x600C,0x6009,/* 0xA8-0xAF */ + 0x601C,0x6214,0x623D,0x62AD,0x62B4,0x62D1,0x62BE,0x62AA,/* 0xB0-0xB7 */ + 0x62B6,0x62CA,0x62AE,0x62B3,0x62AF,0x62BB,0x62A9,0x62B0,/* 0xB8-0xBF */ + 0x62B8,0x653D,0x65A8,0x65BB,0x6609,0x65FC,0x6604,0x6612,/* 0xC0-0xC7 */ + 0x6608,0x65FB,0x6603,0x660B,0x660D,0x6605,0x65FD,0x6611,/* 0xC8-0xCF */ + 0x6610,0x66F6,0x670A,0x6785,0x676C,0x678E,0x6792,0x6776,/* 0xD0-0xD7 */ + 0x677B,0x6798,0x6786,0x6784,0x6774,0x678D,0x678C,0x677A,/* 0xD8-0xDF */ + 0x679F,0x6791,0x6799,0x6783,0x677D,0x6781,0x6778,0x6779,/* 0xE0-0xE7 */ + 0x6794,0x6B25,0x6B80,0x6B7E,0x6BDE,0x6C1D,0x6C93,0x6CEC,/* 0xE8-0xEF */ + 0x6CEB,0x6CEE,0x6CD9,0x6CB6,0x6CD4,0x6CAD,0x6CE7,0x6CB7,/* 0xF0-0xF7 */ + 0x6CD0,0x6CC2,0x6CBA,0x6CC3,0x6CC6,0x6CED,0x6CF2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6CD2,0x6CDD,0x6CB4,0x6C8A,0x6C9D,0x6C80,0x6CDE,0x6CC0,/* 0x40-0x47 */ + 0x6D30,0x6CCD,0x6CC7,0x6CB0,0x6CF9,0x6CCF,0x6CE9,0x6CD1,/* 0x48-0x4F */ + 0x7094,0x7098,0x7085,0x7093,0x7086,0x7084,0x7091,0x7096,/* 0x50-0x57 */ + 0x7082,0x709A,0x7083,0x726A,0x72D6,0x72CB,0x72D8,0x72C9,/* 0x58-0x5F */ + 0x72DC,0x72D2,0x72D4,0x72DA,0x72CC,0x72D1,0x73A4,0x73A1,/* 0x60-0x67 */ + 0x73AD,0x73A6,0x73A2,0x73A0,0x73AC,0x739D,0x74DD,0x74E8,/* 0x68-0x6F */ + 0x753F,0x7540,0x753E,0x758C,0x7598,0x76AF,0x76F3,0x76F1,/* 0x70-0x77 */ + 0x76F0,0x76F5,0x77F8,0x77FC,0x77F9,0x77FB,0x77FA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x77F7,0x7942,0x793F,0x79C5,0x7A78,0x7A7B,0x7AFB,/* 0xA0-0xA7 */ + 0x7C75,0x7CFD,0x8035,0x808F,0x80AE,0x80A3,0x80B8,0x80B5,/* 0xA8-0xAF */ + 0x80AD,0x8220,0x82A0,0x82C0,0x82AB,0x829A,0x8298,0x829B,/* 0xB0-0xB7 */ + 0x82B5,0x82A7,0x82AE,0x82BC,0x829E,0x82BA,0x82B4,0x82A8,/* 0xB8-0xBF */ + 0x82A1,0x82A9,0x82C2,0x82A4,0x82C3,0x82B6,0x82A2,0x8670,/* 0xC0-0xC7 */ + 0x866F,0x866D,0x866E,0x8C56,0x8FD2,0x8FCB,0x8FD3,0x8FCD,/* 0xC8-0xCF */ + 0x8FD6,0x8FD5,0x8FD7,0x90B2,0x90B4,0x90AF,0x90B3,0x90B0,/* 0xD0-0xD7 */ + 0x9639,0x963D,0x963C,0x963A,0x9643,0x4FCD,0x4FC5,0x4FD3,/* 0xD8-0xDF */ + 0x4FB2,0x4FC9,0x4FCB,0x4FC1,0x4FD4,0x4FDC,0x4FD9,0x4FBB,/* 0xE0-0xE7 */ + 0x4FB3,0x4FDB,0x4FC7,0x4FD6,0x4FBA,0x4FC0,0x4FB9,0x4FEC,/* 0xE8-0xEF */ + 0x5244,0x5249,0x52C0,0x52C2,0x533D,0x537C,0x5397,0x5396,/* 0xF0-0xF7 */ + 0x5399,0x5398,0x54BA,0x54A1,0x54AD,0x54A5,0x54CF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54C3,0x830D,0x54B7,0x54AE,0x54D6,0x54B6,0x54C5,0x54C6,/* 0x40-0x47 */ + 0x54A0,0x5470,0x54BC,0x54A2,0x54BE,0x5472,0x54DE,0x54B0,/* 0x48-0x4F */ + 0x57B5,0x579E,0x579F,0x57A4,0x578C,0x5797,0x579D,0x579B,/* 0x50-0x57 */ + 0x5794,0x5798,0x578F,0x5799,0x57A5,0x579A,0x5795,0x58F4,/* 0x58-0x5F */ + 0x590D,0x5953,0x59E1,0x59DE,0x59EE,0x5A00,0x59F1,0x59DD,/* 0x60-0x67 */ + 0x59FA,0x59FD,0x59FC,0x59F6,0x59E4,0x59F2,0x59F7,0x59DB,/* 0x68-0x6F */ + 0x59E9,0x59F3,0x59F5,0x59E0,0x59FE,0x59F4,0x59ED,0x5BA8,/* 0x70-0x77 */ + 0x5C4C,0x5CD0,0x5CD8,0x5CCC,0x5CD7,0x5CCB,0x5CDB,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5CDE,0x5CDA,0x5CC9,0x5CC7,0x5CCA,0x5CD6,0x5CD3,/* 0xA0-0xA7 */ + 0x5CD4,0x5CCF,0x5CC8,0x5CC6,0x5CCE,0x5CDF,0x5CF8,0x5DF9,/* 0xA8-0xAF */ + 0x5E21,0x5E22,0x5E23,0x5E20,0x5E24,0x5EB0,0x5EA4,0x5EA2,/* 0xB0-0xB7 */ + 0x5E9B,0x5EA3,0x5EA5,0x5F07,0x5F2E,0x5F56,0x5F86,0x6037,/* 0xB8-0xBF */ + 0x6039,0x6054,0x6072,0x605E,0x6045,0x6053,0x6047,0x6049,/* 0xC0-0xC7 */ + 0x605B,0x604C,0x6040,0x6042,0x605F,0x6024,0x6044,0x6058,/* 0xC8-0xCF */ + 0x6066,0x606E,0x6242,0x6243,0x62CF,0x630D,0x630B,0x62F5,/* 0xD0-0xD7 */ + 0x630E,0x6303,0x62EB,0x62F9,0x630F,0x630C,0x62F8,0x62F6,/* 0xD8-0xDF */ + 0x6300,0x6313,0x6314,0x62FA,0x6315,0x62FB,0x62F0,0x6541,/* 0xE0-0xE7 */ + 0x6543,0x65AA,0x65BF,0x6636,0x6621,0x6632,0x6635,0x661C,/* 0xE8-0xEF */ + 0x6626,0x6622,0x6633,0x662B,0x663A,0x661D,0x6634,0x6639,/* 0xF0-0xF7 */ + 0x662E,0x670F,0x6710,0x67C1,0x67F2,0x67C8,0x67BA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x67DC,0x67BB,0x67F8,0x67D8,0x67C0,0x67B7,0x67C5,0x67EB,/* 0x40-0x47 */ + 0x67E4,0x67DF,0x67B5,0x67CD,0x67B3,0x67F7,0x67F6,0x67EE,/* 0x48-0x4F */ + 0x67E3,0x67C2,0x67B9,0x67CE,0x67E7,0x67F0,0x67B2,0x67FC,/* 0x50-0x57 */ + 0x67C6,0x67ED,0x67CC,0x67AE,0x67E6,0x67DB,0x67FA,0x67C9,/* 0x58-0x5F */ + 0x67CA,0x67C3,0x67EA,0x67CB,0x6B28,0x6B82,0x6B84,0x6BB6,/* 0x60-0x67 */ + 0x6BD6,0x6BD8,0x6BE0,0x6C20,0x6C21,0x6D28,0x6D34,0x6D2D,/* 0x68-0x6F */ + 0x6D1F,0x6D3C,0x6D3F,0x6D12,0x6D0A,0x6CDA,0x6D33,0x6D04,/* 0x70-0x77 */ + 0x6D19,0x6D3A,0x6D1A,0x6D11,0x6D00,0x6D1D,0x6D42,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6D01,0x6D18,0x6D37,0x6D03,0x6D0F,0x6D40,0x6D07,/* 0xA0-0xA7 */ + 0x6D20,0x6D2C,0x6D08,0x6D22,0x6D09,0x6D10,0x70B7,0x709F,/* 0xA8-0xAF */ + 0x70BE,0x70B1,0x70B0,0x70A1,0x70B4,0x70B5,0x70A9,0x7241,/* 0xB0-0xB7 */ + 0x7249,0x724A,0x726C,0x7270,0x7273,0x726E,0x72CA,0x72E4,/* 0xB8-0xBF */ + 0x72E8,0x72EB,0x72DF,0x72EA,0x72E6,0x72E3,0x7385,0x73CC,/* 0xC0-0xC7 */ + 0x73C2,0x73C8,0x73C5,0x73B9,0x73B6,0x73B5,0x73B4,0x73EB,/* 0xC8-0xCF */ + 0x73BF,0x73C7,0x73BE,0x73C3,0x73C6,0x73B8,0x73CB,0x74EC,/* 0xD0-0xD7 */ + 0x74EE,0x752E,0x7547,0x7548,0x75A7,0x75AA,0x7679,0x76C4,/* 0xD8-0xDF */ + 0x7708,0x7703,0x7704,0x7705,0x770A,0x76F7,0x76FB,0x76FA,/* 0xE0-0xE7 */ + 0x77E7,0x77E8,0x7806,0x7811,0x7812,0x7805,0x7810,0x780F,/* 0xE8-0xEF */ + 0x780E,0x7809,0x7803,0x7813,0x794A,0x794C,0x794B,0x7945,/* 0xF0-0xF7 */ + 0x7944,0x79D5,0x79CD,0x79CF,0x79D6,0x79CE,0x7A80,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7A7E,0x7AD1,0x7B00,0x7B01,0x7C7A,0x7C78,0x7C79,0x7C7F,/* 0x40-0x47 */ + 0x7C80,0x7C81,0x7D03,0x7D08,0x7D01,0x7F58,0x7F91,0x7F8D,/* 0x48-0x4F */ + 0x7FBE,0x8007,0x800E,0x800F,0x8014,0x8037,0x80D8,0x80C7,/* 0x50-0x57 */ + 0x80E0,0x80D1,0x80C8,0x80C2,0x80D0,0x80C5,0x80E3,0x80D9,/* 0x58-0x5F */ + 0x80DC,0x80CA,0x80D5,0x80C9,0x80CF,0x80D7,0x80E6,0x80CD,/* 0x60-0x67 */ + 0x81FF,0x8221,0x8294,0x82D9,0x82FE,0x82F9,0x8307,0x82E8,/* 0x68-0x6F */ + 0x8300,0x82D5,0x833A,0x82EB,0x82D6,0x82F4,0x82EC,0x82E1,/* 0x70-0x77 */ + 0x82F2,0x82F5,0x830C,0x82FB,0x82F6,0x82F0,0x82EA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x82E4,0x82E0,0x82FA,0x82F3,0x82ED,0x8677,0x8674,/* 0xA0-0xA7 */ + 0x867C,0x8673,0x8841,0x884E,0x8867,0x886A,0x8869,0x89D3,/* 0xA8-0xAF */ + 0x8A04,0x8A07,0x8D72,0x8FE3,0x8FE1,0x8FEE,0x8FE0,0x90F1,/* 0xB0-0xB7 */ + 0x90BD,0x90BF,0x90D5,0x90C5,0x90BE,0x90C7,0x90CB,0x90C8,/* 0xB8-0xBF */ + 0x91D4,0x91D3,0x9654,0x964F,0x9651,0x9653,0x964A,0x964E,/* 0xC0-0xC7 */ + 0x501E,0x5005,0x5007,0x5013,0x5022,0x5030,0x501B,0x4FF5,/* 0xC8-0xCF */ + 0x4FF4,0x5033,0x5037,0x502C,0x4FF6,0x4FF7,0x5017,0x501C,/* 0xD0-0xD7 */ + 0x5020,0x5027,0x5035,0x502F,0x5031,0x500E,0x515A,0x5194,/* 0xD8-0xDF */ + 0x5193,0x51CA,0x51C4,0x51C5,0x51C8,0x51CE,0x5261,0x525A,/* 0xE0-0xE7 */ + 0x5252,0x525E,0x525F,0x5255,0x5262,0x52CD,0x530E,0x539E,/* 0xE8-0xEF */ + 0x5526,0x54E2,0x5517,0x5512,0x54E7,0x54F3,0x54E4,0x551A,/* 0xF0-0xF7 */ + 0x54FF,0x5504,0x5508,0x54EB,0x5511,0x5505,0x54F1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x550A,0x54FB,0x54F7,0x54F8,0x54E0,0x550E,0x5503,0x550B,/* 0x40-0x47 */ + 0x5701,0x5702,0x57CC,0x5832,0x57D5,0x57D2,0x57BA,0x57C6,/* 0x48-0x4F */ + 0x57BD,0x57BC,0x57B8,0x57B6,0x57BF,0x57C7,0x57D0,0x57B9,/* 0x50-0x57 */ + 0x57C1,0x590E,0x594A,0x5A19,0x5A16,0x5A2D,0x5A2E,0x5A15,/* 0x58-0x5F */ + 0x5A0F,0x5A17,0x5A0A,0x5A1E,0x5A33,0x5B6C,0x5BA7,0x5BAD,/* 0x60-0x67 */ + 0x5BAC,0x5C03,0x5C56,0x5C54,0x5CEC,0x5CFF,0x5CEE,0x5CF1,/* 0x68-0x6F */ + 0x5CF7,0x5D00,0x5CF9,0x5E29,0x5E28,0x5EA8,0x5EAE,0x5EAA,/* 0x70-0x77 */ + 0x5EAC,0x5F33,0x5F30,0x5F67,0x605D,0x605A,0x6067,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6041,0x60A2,0x6088,0x6080,0x6092,0x6081,0x609D,/* 0xA0-0xA7 */ + 0x6083,0x6095,0x609B,0x6097,0x6087,0x609C,0x608E,0x6219,/* 0xA8-0xAF */ + 0x6246,0x62F2,0x6310,0x6356,0x632C,0x6344,0x6345,0x6336,/* 0xB0-0xB7 */ + 0x6343,0x63E4,0x6339,0x634B,0x634A,0x633C,0x6329,0x6341,/* 0xB8-0xBF */ + 0x6334,0x6358,0x6354,0x6359,0x632D,0x6347,0x6333,0x635A,/* 0xC0-0xC7 */ + 0x6351,0x6338,0x6357,0x6340,0x6348,0x654A,0x6546,0x65C6,/* 0xC8-0xCF */ + 0x65C3,0x65C4,0x65C2,0x664A,0x665F,0x6647,0x6651,0x6712,/* 0xD0-0xD7 */ + 0x6713,0x681F,0x681A,0x6849,0x6832,0x6833,0x683B,0x684B,/* 0xD8-0xDF */ + 0x684F,0x6816,0x6831,0x681C,0x6835,0x682B,0x682D,0x682F,/* 0xE0-0xE7 */ + 0x684E,0x6844,0x6834,0x681D,0x6812,0x6814,0x6826,0x6828,/* 0xE8-0xEF */ + 0x682E,0x684D,0x683A,0x6825,0x6820,0x6B2C,0x6B2F,0x6B2D,/* 0xF0-0xF7 */ + 0x6B31,0x6B34,0x6B6D,0x8082,0x6B88,0x6BE6,0x6BE4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BE8,0x6BE3,0x6BE2,0x6BE7,0x6C25,0x6D7A,0x6D63,0x6D64,/* 0x40-0x47 */ + 0x6D76,0x6D0D,0x6D61,0x6D92,0x6D58,0x6D62,0x6D6D,0x6D6F,/* 0x48-0x4F */ + 0x6D91,0x6D8D,0x6DEF,0x6D7F,0x6D86,0x6D5E,0x6D67,0x6D60,/* 0x50-0x57 */ + 0x6D97,0x6D70,0x6D7C,0x6D5F,0x6D82,0x6D98,0x6D2F,0x6D68,/* 0x58-0x5F */ + 0x6D8B,0x6D7E,0x6D80,0x6D84,0x6D16,0x6D83,0x6D7B,0x6D7D,/* 0x60-0x67 */ + 0x6D75,0x6D90,0x70DC,0x70D3,0x70D1,0x70DD,0x70CB,0x7F39,/* 0x68-0x6F */ + 0x70E2,0x70D7,0x70D2,0x70DE,0x70E0,0x70D4,0x70CD,0x70C5,/* 0x70-0x77 */ + 0x70C6,0x70C7,0x70DA,0x70CE,0x70E1,0x7242,0x7278,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7277,0x7276,0x7300,0x72FA,0x72F4,0x72FE,0x72F6,/* 0xA0-0xA7 */ + 0x72F3,0x72FB,0x7301,0x73D3,0x73D9,0x73E5,0x73D6,0x73BC,/* 0xA8-0xAF */ + 0x73E7,0x73E3,0x73E9,0x73DC,0x73D2,0x73DB,0x73D4,0x73DD,/* 0xB0-0xB7 */ + 0x73DA,0x73D7,0x73D8,0x73E8,0x74DE,0x74DF,0x74F4,0x74F5,/* 0xB8-0xBF */ + 0x7521,0x755B,0x755F,0x75B0,0x75C1,0x75BB,0x75C4,0x75C0,/* 0xC0-0xC7 */ + 0x75BF,0x75B6,0x75BA,0x768A,0x76C9,0x771D,0x771B,0x7710,/* 0xC8-0xCF */ + 0x7713,0x7712,0x7723,0x7711,0x7715,0x7719,0x771A,0x7722,/* 0xD0-0xD7 */ + 0x7727,0x7823,0x782C,0x7822,0x7835,0x782F,0x7828,0x782E,/* 0xD8-0xDF */ + 0x782B,0x7821,0x7829,0x7833,0x782A,0x7831,0x7954,0x795B,/* 0xE0-0xE7 */ + 0x794F,0x795C,0x7953,0x7952,0x7951,0x79EB,0x79EC,0x79E0,/* 0xE8-0xEF */ + 0x79EE,0x79ED,0x79EA,0x79DC,0x79DE,0x79DD,0x7A86,0x7A89,/* 0xF0-0xF7 */ + 0x7A85,0x7A8B,0x7A8C,0x7A8A,0x7A87,0x7AD8,0x7B10,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7B04,0x7B13,0x7B05,0x7B0F,0x7B08,0x7B0A,0x7B0E,0x7B09,/* 0x40-0x47 */ + 0x7B12,0x7C84,0x7C91,0x7C8A,0x7C8C,0x7C88,0x7C8D,0x7C85,/* 0x48-0x4F */ + 0x7D1E,0x7D1D,0x7D11,0x7D0E,0x7D18,0x7D16,0x7D13,0x7D1F,/* 0x50-0x57 */ + 0x7D12,0x7D0F,0x7D0C,0x7F5C,0x7F61,0x7F5E,0x7F60,0x7F5D,/* 0x58-0x5F */ + 0x7F5B,0x7F96,0x7F92,0x7FC3,0x7FC2,0x7FC0,0x8016,0x803E,/* 0x60-0x67 */ + 0x8039,0x80FA,0x80F2,0x80F9,0x80F5,0x8101,0x80FB,0x8100,/* 0x68-0x6F */ + 0x8201,0x822F,0x8225,0x8333,0x832D,0x8344,0x8319,0x8351,/* 0x70-0x77 */ + 0x8325,0x8356,0x833F,0x8341,0x8326,0x831C,0x8322,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8342,0x834E,0x831B,0x832A,0x8308,0x833C,0x834D,/* 0xA0-0xA7 */ + 0x8316,0x8324,0x8320,0x8337,0x832F,0x8329,0x8347,0x8345,/* 0xA8-0xAF */ + 0x834C,0x8353,0x831E,0x832C,0x834B,0x8327,0x8348,0x8653,/* 0xB0-0xB7 */ + 0x8652,0x86A2,0x86A8,0x8696,0x868D,0x8691,0x869E,0x8687,/* 0xB8-0xBF */ + 0x8697,0x8686,0x868B,0x869A,0x8685,0x86A5,0x8699,0x86A1,/* 0xC0-0xC7 */ + 0x86A7,0x8695,0x8698,0x868E,0x869D,0x8690,0x8694,0x8843,/* 0xC8-0xCF */ + 0x8844,0x886D,0x8875,0x8876,0x8872,0x8880,0x8871,0x887F,/* 0xD0-0xD7 */ + 0x886F,0x8883,0x887E,0x8874,0x887C,0x8A12,0x8C47,0x8C57,/* 0xD8-0xDF */ + 0x8C7B,0x8CA4,0x8CA3,0x8D76,0x8D78,0x8DB5,0x8DB7,0x8DB6,/* 0xE0-0xE7 */ + 0x8ED1,0x8ED3,0x8FFE,0x8FF5,0x9002,0x8FFF,0x8FFB,0x9004,/* 0xE8-0xEF */ + 0x8FFC,0x8FF6,0x90D6,0x90E0,0x90D9,0x90DA,0x90E3,0x90DF,/* 0xF0-0xF7 */ + 0x90E5,0x90D8,0x90DB,0x90D7,0x90DC,0x90E4,0x9150,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x914E,0x914F,0x91D5,0x91E2,0x91DA,0x965C,0x965F,0x96BC,/* 0x40-0x47 */ + 0x98E3,0x9ADF,0x9B2F,0x4E7F,0x5070,0x506A,0x5061,0x505E,/* 0x48-0x4F */ + 0x5060,0x5053,0x504B,0x505D,0x5072,0x5048,0x504D,0x5041,/* 0x50-0x57 */ + 0x505B,0x504A,0x5062,0x5015,0x5045,0x505F,0x5069,0x506B,/* 0x58-0x5F */ + 0x5063,0x5064,0x5046,0x5040,0x506E,0x5073,0x5057,0x5051,/* 0x60-0x67 */ + 0x51D0,0x526B,0x526D,0x526C,0x526E,0x52D6,0x52D3,0x532D,/* 0x68-0x6F */ + 0x539C,0x5575,0x5576,0x553C,0x554D,0x5550,0x5534,0x552A,/* 0x70-0x77 */ + 0x5551,0x5562,0x5536,0x5535,0x5530,0x5552,0x5545,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x550C,0x5532,0x5565,0x554E,0x5539,0x5548,0x552D,/* 0xA0-0xA7 */ + 0x553B,0x5540,0x554B,0x570A,0x5707,0x57FB,0x5814,0x57E2,/* 0xA8-0xAF */ + 0x57F6,0x57DC,0x57F4,0x5800,0x57ED,0x57FD,0x5808,0x57F8,/* 0xB0-0xB7 */ + 0x580B,0x57F3,0x57CF,0x5807,0x57EE,0x57E3,0x57F2,0x57E5,/* 0xB8-0xBF */ + 0x57EC,0x57E1,0x580E,0x57FC,0x5810,0x57E7,0x5801,0x580C,/* 0xC0-0xC7 */ + 0x57F1,0x57E9,0x57F0,0x580D,0x5804,0x595C,0x5A60,0x5A58,/* 0xC8-0xCF */ + 0x5A55,0x5A67,0x5A5E,0x5A38,0x5A35,0x5A6D,0x5A50,0x5A5F,/* 0xD0-0xD7 */ + 0x5A65,0x5A6C,0x5A53,0x5A64,0x5A57,0x5A43,0x5A5D,0x5A52,/* 0xD8-0xDF */ + 0x5A44,0x5A5B,0x5A48,0x5A8E,0x5A3E,0x5A4D,0x5A39,0x5A4C,/* 0xE0-0xE7 */ + 0x5A70,0x5A69,0x5A47,0x5A51,0x5A56,0x5A42,0x5A5C,0x5B72,/* 0xE8-0xEF */ + 0x5B6E,0x5BC1,0x5BC0,0x5C59,0x5D1E,0x5D0B,0x5D1D,0x5D1A,/* 0xF0-0xF7 */ + 0x5D20,0x5D0C,0x5D28,0x5D0D,0x5D26,0x5D25,0x5D0F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5D30,0x5D12,0x5D23,0x5D1F,0x5D2E,0x5E3E,0x5E34,0x5EB1,/* 0x40-0x47 */ + 0x5EB4,0x5EB9,0x5EB2,0x5EB3,0x5F36,0x5F38,0x5F9B,0x5F96,/* 0x48-0x4F */ + 0x5F9F,0x608A,0x6090,0x6086,0x60BE,0x60B0,0x60BA,0x60D3,/* 0x50-0x57 */ + 0x60D4,0x60CF,0x60E4,0x60D9,0x60DD,0x60C8,0x60B1,0x60DB,/* 0x58-0x5F */ + 0x60B7,0x60CA,0x60BF,0x60C3,0x60CD,0x60C0,0x6332,0x6365,/* 0x60-0x67 */ + 0x638A,0x6382,0x637D,0x63BD,0x639E,0x63AD,0x639D,0x6397,/* 0x68-0x6F */ + 0x63AB,0x638E,0x636F,0x6387,0x6390,0x636E,0x63AF,0x6375,/* 0x70-0x77 */ + 0x639C,0x636D,0x63AE,0x637C,0x63A4,0x633B,0x639F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6378,0x6385,0x6381,0x6391,0x638D,0x6370,0x6553,/* 0xA0-0xA7 */ + 0x65CD,0x6665,0x6661,0x665B,0x6659,0x665C,0x6662,0x6718,/* 0xA8-0xAF */ + 0x6879,0x6887,0x6890,0x689C,0x686D,0x686E,0x68AE,0x68AB,/* 0xB0-0xB7 */ + 0x6956,0x686F,0x68A3,0x68AC,0x68A9,0x6875,0x6874,0x68B2,/* 0xB8-0xBF */ + 0x688F,0x6877,0x6892,0x687C,0x686B,0x6872,0x68AA,0x6880,/* 0xC0-0xC7 */ + 0x6871,0x687E,0x689B,0x6896,0x688B,0x68A0,0x6889,0x68A4,/* 0xC8-0xCF */ + 0x6878,0x687B,0x6891,0x688C,0x688A,0x687D,0x6B36,0x6B33,/* 0xD0-0xD7 */ + 0x6B37,0x6B38,0x6B91,0x6B8F,0x6B8D,0x6B8E,0x6B8C,0x6C2A,/* 0xD8-0xDF */ + 0x6DC0,0x6DAB,0x6DB4,0x6DB3,0x6E74,0x6DAC,0x6DE9,0x6DE2,/* 0xE0-0xE7 */ + 0x6DB7,0x6DF6,0x6DD4,0x6E00,0x6DC8,0x6DE0,0x6DDF,0x6DD6,/* 0xE8-0xEF */ + 0x6DBE,0x6DE5,0x6DDC,0x6DDD,0x6DDB,0x6DF4,0x6DCA,0x6DBD,/* 0xF0-0xF7 */ + 0x6DED,0x6DF0,0x6DBA,0x6DD5,0x6DC2,0x6DCF,0x6DC9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6DD0,0x6DF2,0x6DD3,0x6DFD,0x6DD7,0x6DCD,0x6DE3,0x6DBB,/* 0x40-0x47 */ + 0x70FA,0x710D,0x70F7,0x7117,0x70F4,0x710C,0x70F0,0x7104,/* 0x48-0x4F */ + 0x70F3,0x7110,0x70FC,0x70FF,0x7106,0x7113,0x7100,0x70F8,/* 0x50-0x57 */ + 0x70F6,0x710B,0x7102,0x710E,0x727E,0x727B,0x727C,0x727F,/* 0x58-0x5F */ + 0x731D,0x7317,0x7307,0x7311,0x7318,0x730A,0x7308,0x72FF,/* 0x60-0x67 */ + 0x730F,0x731E,0x7388,0x73F6,0x73F8,0x73F5,0x7404,0x7401,/* 0x68-0x6F */ + 0x73FD,0x7407,0x7400,0x73FA,0x73FC,0x73FF,0x740C,0x740B,/* 0x70-0x77 */ + 0x73F4,0x7408,0x7564,0x7563,0x75CE,0x75D2,0x75CF,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x75CB,0x75CC,0x75D1,0x75D0,0x768F,0x7689,0x76D3,/* 0xA0-0xA7 */ + 0x7739,0x772F,0x772D,0x7731,0x7732,0x7734,0x7733,0x773D,/* 0xA8-0xAF */ + 0x7725,0x773B,0x7735,0x7848,0x7852,0x7849,0x784D,0x784A,/* 0xB0-0xB7 */ + 0x784C,0x7826,0x7845,0x7850,0x7964,0x7967,0x7969,0x796A,/* 0xB8-0xBF */ + 0x7963,0x796B,0x7961,0x79BB,0x79FA,0x79F8,0x79F6,0x79F7,/* 0xC0-0xC7 */ + 0x7A8F,0x7A94,0x7A90,0x7B35,0x7B47,0x7B34,0x7B25,0x7B30,/* 0xC8-0xCF */ + 0x7B22,0x7B24,0x7B33,0x7B18,0x7B2A,0x7B1D,0x7B31,0x7B2B,/* 0xD0-0xD7 */ + 0x7B2D,0x7B2F,0x7B32,0x7B38,0x7B1A,0x7B23,0x7C94,0x7C98,/* 0xD8-0xDF */ + 0x7C96,0x7CA3,0x7D35,0x7D3D,0x7D38,0x7D36,0x7D3A,0x7D45,/* 0xE0-0xE7 */ + 0x7D2C,0x7D29,0x7D41,0x7D47,0x7D3E,0x7D3F,0x7D4A,0x7D3B,/* 0xE8-0xEF */ + 0x7D28,0x7F63,0x7F95,0x7F9C,0x7F9D,0x7F9B,0x7FCA,0x7FCB,/* 0xF0-0xF7 */ + 0x7FCD,0x7FD0,0x7FD1,0x7FC7,0x7FCF,0x7FC9,0x801F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x801E,0x801B,0x8047,0x8043,0x8048,0x8118,0x8125,0x8119,/* 0x40-0x47 */ + 0x811B,0x812D,0x811F,0x812C,0x811E,0x8121,0x8115,0x8127,/* 0x48-0x4F */ + 0x811D,0x8122,0x8211,0x8238,0x8233,0x823A,0x8234,0x8232,/* 0x50-0x57 */ + 0x8274,0x8390,0x83A3,0x83A8,0x838D,0x837A,0x8373,0x83A4,/* 0x58-0x5F */ + 0x8374,0x838F,0x8381,0x8395,0x8399,0x8375,0x8394,0x83A9,/* 0x60-0x67 */ + 0x837D,0x8383,0x838C,0x839D,0x839B,0x83AA,0x838B,0x837E,/* 0x68-0x6F */ + 0x83A5,0x83AF,0x8388,0x8397,0x83B0,0x837F,0x83A6,0x8387,/* 0x70-0x77 */ + 0x83AE,0x8376,0x839A,0x8659,0x8656,0x86BF,0x86B7,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x86C2,0x86C1,0x86C5,0x86BA,0x86B0,0x86C8,0x86B9,/* 0xA0-0xA7 */ + 0x86B3,0x86B8,0x86CC,0x86B4,0x86BB,0x86BC,0x86C3,0x86BD,/* 0xA8-0xAF */ + 0x86BE,0x8852,0x8889,0x8895,0x88A8,0x88A2,0x88AA,0x889A,/* 0xB0-0xB7 */ + 0x8891,0x88A1,0x889F,0x8898,0x88A7,0x8899,0x889B,0x8897,/* 0xB8-0xBF */ + 0x88A4,0x88AC,0x888C,0x8893,0x888E,0x8982,0x89D6,0x89D9,/* 0xC0-0xC7 */ + 0x89D5,0x8A30,0x8A27,0x8A2C,0x8A1E,0x8C39,0x8C3B,0x8C5C,/* 0xC8-0xCF */ + 0x8C5D,0x8C7D,0x8CA5,0x8D7D,0x8D7B,0x8D79,0x8DBC,0x8DC2,/* 0xD0-0xD7 */ + 0x8DB9,0x8DBF,0x8DC1,0x8ED8,0x8EDE,0x8EDD,0x8EDC,0x8ED7,/* 0xD8-0xDF */ + 0x8EE0,0x8EE1,0x9024,0x900B,0x9011,0x901C,0x900C,0x9021,/* 0xE0-0xE7 */ + 0x90EF,0x90EA,0x90F0,0x90F4,0x90F2,0x90F3,0x90D4,0x90EB,/* 0xE8-0xEF */ + 0x90EC,0x90E9,0x9156,0x9158,0x915A,0x9153,0x9155,0x91EC,/* 0xF0-0xF7 */ + 0x91F4,0x91F1,0x91F3,0x91F8,0x91E4,0x91F9,0x91EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x91EB,0x91F7,0x91E8,0x91EE,0x957A,0x9586,0x9588,0x967C,/* 0x40-0x47 */ + 0x966D,0x966B,0x9671,0x966F,0x96BF,0x976A,0x9804,0x98E5,/* 0x48-0x4F */ + 0x9997,0x509B,0x5095,0x5094,0x509E,0x508B,0x50A3,0x5083,/* 0x50-0x57 */ + 0x508C,0x508E,0x509D,0x5068,0x509C,0x5092,0x5082,0x5087,/* 0x58-0x5F */ + 0x515F,0x51D4,0x5312,0x5311,0x53A4,0x53A7,0x5591,0x55A8,/* 0x60-0x67 */ + 0x55A5,0x55AD,0x5577,0x5645,0x55A2,0x5593,0x5588,0x558F,/* 0x68-0x6F */ + 0x55B5,0x5581,0x55A3,0x5592,0x55A4,0x557D,0x558C,0x55A6,/* 0x70-0x77 */ + 0x557F,0x5595,0x55A1,0x558E,0x570C,0x5829,0x5837,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5819,0x581E,0x5827,0x5823,0x5828,0x57F5,0x5848,/* 0xA0-0xA7 */ + 0x5825,0x581C,0x581B,0x5833,0x583F,0x5836,0x582E,0x5839,/* 0xA8-0xAF */ + 0x5838,0x582D,0x582C,0x583B,0x5961,0x5AAF,0x5A94,0x5A9F,/* 0xB0-0xB7 */ + 0x5A7A,0x5AA2,0x5A9E,0x5A78,0x5AA6,0x5A7C,0x5AA5,0x5AAC,/* 0xB8-0xBF */ + 0x5A95,0x5AAE,0x5A37,0x5A84,0x5A8A,0x5A97,0x5A83,0x5A8B,/* 0xC0-0xC7 */ + 0x5AA9,0x5A7B,0x5A7D,0x5A8C,0x5A9C,0x5A8F,0x5A93,0x5A9D,/* 0xC8-0xCF */ + 0x5BEA,0x5BCD,0x5BCB,0x5BD4,0x5BD1,0x5BCA,0x5BCE,0x5C0C,/* 0xD0-0xD7 */ + 0x5C30,0x5D37,0x5D43,0x5D6B,0x5D41,0x5D4B,0x5D3F,0x5D35,/* 0xD8-0xDF */ + 0x5D51,0x5D4E,0x5D55,0x5D33,0x5D3A,0x5D52,0x5D3D,0x5D31,/* 0xE0-0xE7 */ + 0x5D59,0x5D42,0x5D39,0x5D49,0x5D38,0x5D3C,0x5D32,0x5D36,/* 0xE8-0xEF */ + 0x5D40,0x5D45,0x5E44,0x5E41,0x5F58,0x5FA6,0x5FA5,0x5FAB,/* 0xF0-0xF7 */ + 0x60C9,0x60B9,0x60CC,0x60E2,0x60CE,0x60C4,0x6114,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x60F2,0x610A,0x6116,0x6105,0x60F5,0x6113,0x60F8,0x60FC,/* 0x40-0x47 */ + 0x60FE,0x60C1,0x6103,0x6118,0x611D,0x6110,0x60FF,0x6104,/* 0x48-0x4F */ + 0x610B,0x624A,0x6394,0x63B1,0x63B0,0x63CE,0x63E5,0x63E8,/* 0x50-0x57 */ + 0x63EF,0x63C3,0x649D,0x63F3,0x63CA,0x63E0,0x63F6,0x63D5,/* 0x58-0x5F */ + 0x63F2,0x63F5,0x6461,0x63DF,0x63BE,0x63DD,0x63DC,0x63C4,/* 0x60-0x67 */ + 0x63D8,0x63D3,0x63C2,0x63C7,0x63CC,0x63CB,0x63C8,0x63F0,/* 0x68-0x6F */ + 0x63D7,0x63D9,0x6532,0x6567,0x656A,0x6564,0x655C,0x6568,/* 0x70-0x77 */ + 0x6565,0x658C,0x659D,0x659E,0x65AE,0x65D0,0x65D2,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x667C,0x666C,0x667B,0x6680,0x6671,0x6679,0x666A,/* 0xA0-0xA7 */ + 0x6672,0x6701,0x690C,0x68D3,0x6904,0x68DC,0x692A,0x68EC,/* 0xA8-0xAF */ + 0x68EA,0x68F1,0x690F,0x68D6,0x68F7,0x68EB,0x68E4,0x68F6,/* 0xB0-0xB7 */ + 0x6913,0x6910,0x68F3,0x68E1,0x6907,0x68CC,0x6908,0x6970,/* 0xB8-0xBF */ + 0x68B4,0x6911,0x68EF,0x68C6,0x6914,0x68F8,0x68D0,0x68FD,/* 0xC0-0xC7 */ + 0x68FC,0x68E8,0x690B,0x690A,0x6917,0x68CE,0x68C8,0x68DD,/* 0xC8-0xCF */ + 0x68DE,0x68E6,0x68F4,0x68D1,0x6906,0x68D4,0x68E9,0x6915,/* 0xD0-0xD7 */ + 0x6925,0x68C7,0x6B39,0x6B3B,0x6B3F,0x6B3C,0x6B94,0x6B97,/* 0xD8-0xDF */ + 0x6B99,0x6B95,0x6BBD,0x6BF0,0x6BF2,0x6BF3,0x6C30,0x6DFC,/* 0xE0-0xE7 */ + 0x6E46,0x6E47,0x6E1F,0x6E49,0x6E88,0x6E3C,0x6E3D,0x6E45,/* 0xE8-0xEF */ + 0x6E62,0x6E2B,0x6E3F,0x6E41,0x6E5D,0x6E73,0x6E1C,0x6E33,/* 0xF0-0xF7 */ + 0x6E4B,0x6E40,0x6E51,0x6E3B,0x6E03,0x6E2E,0x6E5E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6E68,0x6E5C,0x6E61,0x6E31,0x6E28,0x6E60,0x6E71,0x6E6B,/* 0x40-0x47 */ + 0x6E39,0x6E22,0x6E30,0x6E53,0x6E65,0x6E27,0x6E78,0x6E64,/* 0x48-0x4F */ + 0x6E77,0x6E55,0x6E79,0x6E52,0x6E66,0x6E35,0x6E36,0x6E5A,/* 0x50-0x57 */ + 0x7120,0x711E,0x712F,0x70FB,0x712E,0x7131,0x7123,0x7125,/* 0x58-0x5F */ + 0x7122,0x7132,0x711F,0x7128,0x713A,0x711B,0x724B,0x725A,/* 0x60-0x67 */ + 0x7288,0x7289,0x7286,0x7285,0x728B,0x7312,0x730B,0x7330,/* 0x68-0x6F */ + 0x7322,0x7331,0x7333,0x7327,0x7332,0x732D,0x7326,0x7323,/* 0x70-0x77 */ + 0x7335,0x730C,0x742E,0x742C,0x7430,0x742B,0x7416,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x741A,0x7421,0x742D,0x7431,0x7424,0x7423,0x741D,/* 0xA0-0xA7 */ + 0x7429,0x7420,0x7432,0x74FB,0x752F,0x756F,0x756C,0x75E7,/* 0xA8-0xAF */ + 0x75DA,0x75E1,0x75E6,0x75DD,0x75DF,0x75E4,0x75D7,0x7695,/* 0xB0-0xB7 */ + 0x7692,0x76DA,0x7746,0x7747,0x7744,0x774D,0x7745,0x774A,/* 0xB8-0xBF */ + 0x774E,0x774B,0x774C,0x77DE,0x77EC,0x7860,0x7864,0x7865,/* 0xC0-0xC7 */ + 0x785C,0x786D,0x7871,0x786A,0x786E,0x7870,0x7869,0x7868,/* 0xC8-0xCF */ + 0x785E,0x7862,0x7974,0x7973,0x7972,0x7970,0x7A02,0x7A0A,/* 0xD0-0xD7 */ + 0x7A03,0x7A0C,0x7A04,0x7A99,0x7AE6,0x7AE4,0x7B4A,0x7B3B,/* 0xD8-0xDF */ + 0x7B44,0x7B48,0x7B4C,0x7B4E,0x7B40,0x7B58,0x7B45,0x7CA2,/* 0xE0-0xE7 */ + 0x7C9E,0x7CA8,0x7CA1,0x7D58,0x7D6F,0x7D63,0x7D53,0x7D56,/* 0xE8-0xEF */ + 0x7D67,0x7D6A,0x7D4F,0x7D6D,0x7D5C,0x7D6B,0x7D52,0x7D54,/* 0xF0-0xF7 */ + 0x7D69,0x7D51,0x7D5F,0x7D4E,0x7F3E,0x7F3F,0x7F65,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F66,0x7FA2,0x7FA0,0x7FA1,0x7FD7,0x8051,0x804F,0x8050,/* 0x40-0x47 */ + 0x80FE,0x80D4,0x8143,0x814A,0x8152,0x814F,0x8147,0x813D,/* 0x48-0x4F */ + 0x814D,0x813A,0x81E6,0x81EE,0x81F7,0x81F8,0x81F9,0x8204,/* 0x50-0x57 */ + 0x823C,0x823D,0x823F,0x8275,0x833B,0x83CF,0x83F9,0x8423,/* 0x58-0x5F */ + 0x83C0,0x83E8,0x8412,0x83E7,0x83E4,0x83FC,0x83F6,0x8410,/* 0x60-0x67 */ + 0x83C6,0x83C8,0x83EB,0x83E3,0x83BF,0x8401,0x83DD,0x83E5,/* 0x68-0x6F */ + 0x83D8,0x83FF,0x83E1,0x83CB,0x83CE,0x83D6,0x83F5,0x83C9,/* 0x70-0x77 */ + 0x8409,0x840F,0x83DE,0x8411,0x8406,0x83C2,0x83F3,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x83D5,0x83FA,0x83C7,0x83D1,0x83EA,0x8413,0x83C3,/* 0xA0-0xA7 */ + 0x83EC,0x83EE,0x83C4,0x83FB,0x83D7,0x83E2,0x841B,0x83DB,/* 0xA8-0xAF */ + 0x83FE,0x86D8,0x86E2,0x86E6,0x86D3,0x86E3,0x86DA,0x86EA,/* 0xB0-0xB7 */ + 0x86DD,0x86EB,0x86DC,0x86EC,0x86E9,0x86D7,0x86E8,0x86D1,/* 0xB8-0xBF */ + 0x8848,0x8856,0x8855,0x88BA,0x88D7,0x88B9,0x88B8,0x88C0,/* 0xC0-0xC7 */ + 0x88BE,0x88B6,0x88BC,0x88B7,0x88BD,0x88B2,0x8901,0x88C9,/* 0xC8-0xCF */ + 0x8995,0x8998,0x8997,0x89DD,0x89DA,0x89DB,0x8A4E,0x8A4D,/* 0xD0-0xD7 */ + 0x8A39,0x8A59,0x8A40,0x8A57,0x8A58,0x8A44,0x8A45,0x8A52,/* 0xD8-0xDF */ + 0x8A48,0x8A51,0x8A4A,0x8A4C,0x8A4F,0x8C5F,0x8C81,0x8C80,/* 0xE0-0xE7 */ + 0x8CBA,0x8CBE,0x8CB0,0x8CB9,0x8CB5,0x8D84,0x8D80,0x8D89,/* 0xE8-0xEF */ + 0x8DD8,0x8DD3,0x8DCD,0x8DC7,0x8DD6,0x8DDC,0x8DCF,0x8DD5,/* 0xF0-0xF7 */ + 0x8DD9,0x8DC8,0x8DD7,0x8DC5,0x8EEF,0x8EF7,0x8EFA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8EF9,0x8EE6,0x8EEE,0x8EE5,0x8EF5,0x8EE7,0x8EE8,0x8EF6,/* 0x40-0x47 */ + 0x8EEB,0x8EF1,0x8EEC,0x8EF4,0x8EE9,0x902D,0x9034,0x902F,/* 0x48-0x4F */ + 0x9106,0x912C,0x9104,0x90FF,0x90FC,0x9108,0x90F9,0x90FB,/* 0x50-0x57 */ + 0x9101,0x9100,0x9107,0x9105,0x9103,0x9161,0x9164,0x915F,/* 0x58-0x5F */ + 0x9162,0x9160,0x9201,0x920A,0x9225,0x9203,0x921A,0x9226,/* 0x60-0x67 */ + 0x920F,0x920C,0x9200,0x9212,0x91FF,0x91FD,0x9206,0x9204,/* 0x68-0x6F */ + 0x9227,0x9202,0x921C,0x9224,0x9219,0x9217,0x9205,0x9216,/* 0x70-0x77 */ + 0x957B,0x958D,0x958C,0x9590,0x9687,0x967E,0x9688,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9689,0x9683,0x9680,0x96C2,0x96C8,0x96C3,0x96F1,/* 0xA0-0xA7 */ + 0x96F0,0x976C,0x9770,0x976E,0x9807,0x98A9,0x98EB,0x9CE6,/* 0xA8-0xAF */ + 0x9EF9,0x4E83,0x4E84,0x4EB6,0x50BD,0x50BF,0x50C6,0x50AE,/* 0xB0-0xB7 */ + 0x50C4,0x50CA,0x50B4,0x50C8,0x50C2,0x50B0,0x50C1,0x50BA,/* 0xB8-0xBF */ + 0x50B1,0x50CB,0x50C9,0x50B6,0x50B8,0x51D7,0x527A,0x5278,/* 0xC0-0xC7 */ + 0x527B,0x527C,0x55C3,0x55DB,0x55CC,0x55D0,0x55CB,0x55CA,/* 0xC8-0xCF */ + 0x55DD,0x55C0,0x55D4,0x55C4,0x55E9,0x55BF,0x55D2,0x558D,/* 0xD0-0xD7 */ + 0x55CF,0x55D5,0x55E2,0x55D6,0x55C8,0x55F2,0x55CD,0x55D9,/* 0xD8-0xDF */ + 0x55C2,0x5714,0x5853,0x5868,0x5864,0x584F,0x584D,0x5849,/* 0xE0-0xE7 */ + 0x586F,0x5855,0x584E,0x585D,0x5859,0x5865,0x585B,0x583D,/* 0xE8-0xEF */ + 0x5863,0x5871,0x58FC,0x5AC7,0x5AC4,0x5ACB,0x5ABA,0x5AB8,/* 0xF0-0xF7 */ + 0x5AB1,0x5AB5,0x5AB0,0x5ABF,0x5AC8,0x5ABB,0x5AC6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5AB7,0x5AC0,0x5ACA,0x5AB4,0x5AB6,0x5ACD,0x5AB9,0x5A90,/* 0x40-0x47 */ + 0x5BD6,0x5BD8,0x5BD9,0x5C1F,0x5C33,0x5D71,0x5D63,0x5D4A,/* 0x48-0x4F */ + 0x5D65,0x5D72,0x5D6C,0x5D5E,0x5D68,0x5D67,0x5D62,0x5DF0,/* 0x50-0x57 */ + 0x5E4F,0x5E4E,0x5E4A,0x5E4D,0x5E4B,0x5EC5,0x5ECC,0x5EC6,/* 0x58-0x5F */ + 0x5ECB,0x5EC7,0x5F40,0x5FAF,0x5FAD,0x60F7,0x6149,0x614A,/* 0x60-0x67 */ + 0x612B,0x6145,0x6136,0x6132,0x612E,0x6146,0x612F,0x614F,/* 0x68-0x6F */ + 0x6129,0x6140,0x6220,0x9168,0x6223,0x6225,0x6224,0x63C5,/* 0x70-0x77 */ + 0x63F1,0x63EB,0x6410,0x6412,0x6409,0x6420,0x6424,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6433,0x6443,0x641F,0x6415,0x6418,0x6439,0x6437,/* 0xA0-0xA7 */ + 0x6422,0x6423,0x640C,0x6426,0x6430,0x6428,0x6441,0x6435,/* 0xA8-0xAF */ + 0x642F,0x640A,0x641A,0x6440,0x6425,0x6427,0x640B,0x63E7,/* 0xB0-0xB7 */ + 0x641B,0x642E,0x6421,0x640E,0x656F,0x6592,0x65D3,0x6686,/* 0xB8-0xBF */ + 0x668C,0x6695,0x6690,0x668B,0x668A,0x6699,0x6694,0x6678,/* 0xC0-0xC7 */ + 0x6720,0x6966,0x695F,0x6938,0x694E,0x6962,0x6971,0x693F,/* 0xC8-0xCF */ + 0x6945,0x696A,0x6939,0x6942,0x6957,0x6959,0x697A,0x6948,/* 0xD0-0xD7 */ + 0x6949,0x6935,0x696C,0x6933,0x693D,0x6965,0x68F0,0x6978,/* 0xD8-0xDF */ + 0x6934,0x6969,0x6940,0x696F,0x6944,0x6976,0x6958,0x6941,/* 0xE0-0xE7 */ + 0x6974,0x694C,0x693B,0x694B,0x6937,0x695C,0x694F,0x6951,/* 0xE8-0xEF */ + 0x6932,0x6952,0x692F,0x697B,0x693C,0x6B46,0x6B45,0x6B43,/* 0xF0-0xF7 */ + 0x6B42,0x6B48,0x6B41,0x6B9B,0xFA0D,0x6BFB,0x6BFC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BF9,0x6BF7,0x6BF8,0x6E9B,0x6ED6,0x6EC8,0x6E8F,0x6EC0,/* 0x40-0x47 */ + 0x6E9F,0x6E93,0x6E94,0x6EA0,0x6EB1,0x6EB9,0x6EC6,0x6ED2,/* 0x48-0x4F */ + 0x6EBD,0x6EC1,0x6E9E,0x6EC9,0x6EB7,0x6EB0,0x6ECD,0x6EA6,/* 0x50-0x57 */ + 0x6ECF,0x6EB2,0x6EBE,0x6EC3,0x6EDC,0x6ED8,0x6E99,0x6E92,/* 0x58-0x5F */ + 0x6E8E,0x6E8D,0x6EA4,0x6EA1,0x6EBF,0x6EB3,0x6ED0,0x6ECA,/* 0x60-0x67 */ + 0x6E97,0x6EAE,0x6EA3,0x7147,0x7154,0x7152,0x7163,0x7160,/* 0x68-0x6F */ + 0x7141,0x715D,0x7162,0x7172,0x7178,0x716A,0x7161,0x7142,/* 0x70-0x77 */ + 0x7158,0x7143,0x714B,0x7170,0x715F,0x7150,0x7153,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7144,0x714D,0x715A,0x724F,0x728D,0x728C,0x7291,/* 0xA0-0xA7 */ + 0x7290,0x728E,0x733C,0x7342,0x733B,0x733A,0x7340,0x734A,/* 0xA8-0xAF */ + 0x7349,0x7444,0x744A,0x744B,0x7452,0x7451,0x7457,0x7440,/* 0xB0-0xB7 */ + 0x744F,0x7450,0x744E,0x7442,0x7446,0x744D,0x7454,0x74E1,/* 0xB8-0xBF */ + 0x74FF,0x74FE,0x74FD,0x751D,0x7579,0x7577,0x6983,0x75EF,/* 0xC0-0xC7 */ + 0x760F,0x7603,0x75F7,0x75FE,0x75FC,0x75F9,0x75F8,0x7610,/* 0xC8-0xCF */ + 0x75FB,0x75F6,0x75ED,0x75F5,0x75FD,0x7699,0x76B5,0x76DD,/* 0xD0-0xD7 */ + 0x7755,0x775F,0x7760,0x7752,0x7756,0x775A,0x7769,0x7767,/* 0xD8-0xDF */ + 0x7754,0x7759,0x776D,0x77E0,0x7887,0x789A,0x7894,0x788F,/* 0xE0-0xE7 */ + 0x7884,0x7895,0x7885,0x7886,0x78A1,0x7883,0x7879,0x7899,/* 0xE8-0xEF */ + 0x7880,0x7896,0x787B,0x797C,0x7982,0x797D,0x7979,0x7A11,/* 0xF0-0xF7 */ + 0x7A18,0x7A19,0x7A12,0x7A17,0x7A15,0x7A22,0x7A13,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7A1B,0x7A10,0x7AA3,0x7AA2,0x7A9E,0x7AEB,0x7B66,0x7B64,/* 0x40-0x47 */ + 0x7B6D,0x7B74,0x7B69,0x7B72,0x7B65,0x7B73,0x7B71,0x7B70,/* 0x48-0x4F */ + 0x7B61,0x7B78,0x7B76,0x7B63,0x7CB2,0x7CB4,0x7CAF,0x7D88,/* 0x50-0x57 */ + 0x7D86,0x7D80,0x7D8D,0x7D7F,0x7D85,0x7D7A,0x7D8E,0x7D7B,/* 0x58-0x5F */ + 0x7D83,0x7D7C,0x7D8C,0x7D94,0x7D84,0x7D7D,0x7D92,0x7F6D,/* 0x60-0x67 */ + 0x7F6B,0x7F67,0x7F68,0x7F6C,0x7FA6,0x7FA5,0x7FA7,0x7FDB,/* 0x68-0x6F */ + 0x7FDC,0x8021,0x8164,0x8160,0x8177,0x815C,0x8169,0x815B,/* 0x70-0x77 */ + 0x8162,0x8172,0x6721,0x815E,0x8176,0x8167,0x816F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8144,0x8161,0x821D,0x8249,0x8244,0x8240,0x8242,/* 0xA0-0xA7 */ + 0x8245,0x84F1,0x843F,0x8456,0x8476,0x8479,0x848F,0x848D,/* 0xA8-0xAF */ + 0x8465,0x8451,0x8440,0x8486,0x8467,0x8430,0x844D,0x847D,/* 0xB0-0xB7 */ + 0x845A,0x8459,0x8474,0x8473,0x845D,0x8507,0x845E,0x8437,/* 0xB8-0xBF */ + 0x843A,0x8434,0x847A,0x8443,0x8478,0x8432,0x8445,0x8429,/* 0xC0-0xC7 */ + 0x83D9,0x844B,0x842F,0x8442,0x842D,0x845F,0x8470,0x8439,/* 0xC8-0xCF */ + 0x844E,0x844C,0x8452,0x846F,0x84C5,0x848E,0x843B,0x8447,/* 0xD0-0xD7 */ + 0x8436,0x8433,0x8468,0x847E,0x8444,0x842B,0x8460,0x8454,/* 0xD8-0xDF */ + 0x846E,0x8450,0x870B,0x8704,0x86F7,0x870C,0x86FA,0x86D6,/* 0xE0-0xE7 */ + 0x86F5,0x874D,0x86F8,0x870E,0x8709,0x8701,0x86F6,0x870D,/* 0xE8-0xEF */ + 0x8705,0x88D6,0x88CB,0x88CD,0x88CE,0x88DE,0x88DB,0x88DA,/* 0xF0-0xF7 */ + 0x88CC,0x88D0,0x8985,0x899B,0x89DF,0x89E5,0x89E4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x89E1,0x89E0,0x89E2,0x89DC,0x89E6,0x8A76,0x8A86,0x8A7F,/* 0x40-0x47 */ + 0x8A61,0x8A3F,0x8A77,0x8A82,0x8A84,0x8A75,0x8A83,0x8A81,/* 0x48-0x4F */ + 0x8A74,0x8A7A,0x8C3C,0x8C4B,0x8C4A,0x8C65,0x8C64,0x8C66,/* 0x50-0x57 */ + 0x8C86,0x8C84,0x8C85,0x8CCC,0x8D68,0x8D69,0x8D91,0x8D8C,/* 0x58-0x5F */ + 0x8D8E,0x8D8F,0x8D8D,0x8D93,0x8D94,0x8D90,0x8D92,0x8DF0,/* 0x60-0x67 */ + 0x8DE0,0x8DEC,0x8DF1,0x8DEE,0x8DD0,0x8DE9,0x8DE3,0x8DE2,/* 0x68-0x6F */ + 0x8DE7,0x8DF2,0x8DEB,0x8DF4,0x8F06,0x8EFF,0x8F01,0x8F00,/* 0x70-0x77 */ + 0x8F05,0x8F07,0x8F08,0x8F02,0x8F0B,0x9052,0x903F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9044,0x9049,0x903D,0x9110,0x910D,0x910F,0x9111,/* 0xA0-0xA7 */ + 0x9116,0x9114,0x910B,0x910E,0x916E,0x916F,0x9248,0x9252,/* 0xA8-0xAF */ + 0x9230,0x923A,0x9266,0x9233,0x9265,0x925E,0x9283,0x922E,/* 0xB0-0xB7 */ + 0x924A,0x9246,0x926D,0x926C,0x924F,0x9260,0x9267,0x926F,/* 0xB8-0xBF */ + 0x9236,0x9261,0x9270,0x9231,0x9254,0x9263,0x9250,0x9272,/* 0xC0-0xC7 */ + 0x924E,0x9253,0x924C,0x9256,0x9232,0x959F,0x959C,0x959E,/* 0xC8-0xCF */ + 0x959B,0x9692,0x9693,0x9691,0x9697,0x96CE,0x96FA,0x96FD,/* 0xD0-0xD7 */ + 0x96F8,0x96F5,0x9773,0x9777,0x9778,0x9772,0x980F,0x980D,/* 0xD8-0xDF */ + 0x980E,0x98AC,0x98F6,0x98F9,0x99AF,0x99B2,0x99B0,0x99B5,/* 0xE0-0xE7 */ + 0x9AAD,0x9AAB,0x9B5B,0x9CEA,0x9CED,0x9CE7,0x9E80,0x9EFD,/* 0xE8-0xEF */ + 0x50E6,0x50D4,0x50D7,0x50E8,0x50F3,0x50DB,0x50EA,0x50DD,/* 0xF0-0xF7 */ + 0x50E4,0x50D3,0x50EC,0x50F0,0x50EF,0x50E3,0x50E0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x51D8,0x5280,0x5281,0x52E9,0x52EB,0x5330,0x53AC,0x5627,/* 0x40-0x47 */ + 0x5615,0x560C,0x5612,0x55FC,0x560F,0x561C,0x5601,0x5613,/* 0x48-0x4F */ + 0x5602,0x55FA,0x561D,0x5604,0x55FF,0x55F9,0x5889,0x587C,/* 0x50-0x57 */ + 0x5890,0x5898,0x5886,0x5881,0x587F,0x5874,0x588B,0x587A,/* 0x58-0x5F */ + 0x5887,0x5891,0x588E,0x5876,0x5882,0x5888,0x587B,0x5894,/* 0x60-0x67 */ + 0x588F,0x58FE,0x596B,0x5ADC,0x5AEE,0x5AE5,0x5AD5,0x5AEA,/* 0x68-0x6F */ + 0x5ADA,0x5AED,0x5AEB,0x5AF3,0x5AE2,0x5AE0,0x5ADB,0x5AEC,/* 0x70-0x77 */ + 0x5ADE,0x5ADD,0x5AD9,0x5AE8,0x5ADF,0x5B77,0x5BE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5BE3,0x5C63,0x5D82,0x5D80,0x5D7D,0x5D86,0x5D7A,/* 0xA0-0xA7 */ + 0x5D81,0x5D77,0x5D8A,0x5D89,0x5D88,0x5D7E,0x5D7C,0x5D8D,/* 0xA8-0xAF */ + 0x5D79,0x5D7F,0x5E58,0x5E59,0x5E53,0x5ED8,0x5ED1,0x5ED7,/* 0xB0-0xB7 */ + 0x5ECE,0x5EDC,0x5ED5,0x5ED9,0x5ED2,0x5ED4,0x5F44,0x5F43,/* 0xB8-0xBF */ + 0x5F6F,0x5FB6,0x612C,0x6128,0x6141,0x615E,0x6171,0x6173,/* 0xC0-0xC7 */ + 0x6152,0x6153,0x6172,0x616C,0x6180,0x6174,0x6154,0x617A,/* 0xC8-0xCF */ + 0x615B,0x6165,0x613B,0x616A,0x6161,0x6156,0x6229,0x6227,/* 0xD0-0xD7 */ + 0x622B,0x642B,0x644D,0x645B,0x645D,0x6474,0x6476,0x6472,/* 0xD8-0xDF */ + 0x6473,0x647D,0x6475,0x6466,0x64A6,0x644E,0x6482,0x645E,/* 0xE0-0xE7 */ + 0x645C,0x644B,0x6453,0x6460,0x6450,0x647F,0x643F,0x646C,/* 0xE8-0xEF */ + 0x646B,0x6459,0x6465,0x6477,0x6573,0x65A0,0x66A1,0x66A0,/* 0xF0-0xF7 */ + 0x669F,0x6705,0x6704,0x6722,0x69B1,0x69B6,0x69C9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x69A0,0x69CE,0x6996,0x69B0,0x69AC,0x69BC,0x6991,0x6999,/* 0x40-0x47 */ + 0x698E,0x69A7,0x698D,0x69A9,0x69BE,0x69AF,0x69BF,0x69C4,/* 0x48-0x4F */ + 0x69BD,0x69A4,0x69D4,0x69B9,0x69CA,0x699A,0x69CF,0x69B3,/* 0x50-0x57 */ + 0x6993,0x69AA,0x69A1,0x699E,0x69D9,0x6997,0x6990,0x69C2,/* 0x58-0x5F */ + 0x69B5,0x69A5,0x69C6,0x6B4A,0x6B4D,0x6B4B,0x6B9E,0x6B9F,/* 0x60-0x67 */ + 0x6BA0,0x6BC3,0x6BC4,0x6BFE,0x6ECE,0x6EF5,0x6EF1,0x6F03,/* 0x68-0x6F */ + 0x6F25,0x6EF8,0x6F37,0x6EFB,0x6F2E,0x6F09,0x6F4E,0x6F19,/* 0x70-0x77 */ + 0x6F1A,0x6F27,0x6F18,0x6F3B,0x6F12,0x6EED,0x6F0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6F36,0x6F73,0x6EF9,0x6EEE,0x6F2D,0x6F40,0x6F30,/* 0xA0-0xA7 */ + 0x6F3C,0x6F35,0x6EEB,0x6F07,0x6F0E,0x6F43,0x6F05,0x6EFD,/* 0xA8-0xAF */ + 0x6EF6,0x6F39,0x6F1C,0x6EFC,0x6F3A,0x6F1F,0x6F0D,0x6F1E,/* 0xB0-0xB7 */ + 0x6F08,0x6F21,0x7187,0x7190,0x7189,0x7180,0x7185,0x7182,/* 0xB8-0xBF */ + 0x718F,0x717B,0x7186,0x7181,0x7197,0x7244,0x7253,0x7297,/* 0xC0-0xC7 */ + 0x7295,0x7293,0x7343,0x734D,0x7351,0x734C,0x7462,0x7473,/* 0xC8-0xCF */ + 0x7471,0x7475,0x7472,0x7467,0x746E,0x7500,0x7502,0x7503,/* 0xD0-0xD7 */ + 0x757D,0x7590,0x7616,0x7608,0x760C,0x7615,0x7611,0x760A,/* 0xD8-0xDF */ + 0x7614,0x76B8,0x7781,0x777C,0x7785,0x7782,0x776E,0x7780,/* 0xE0-0xE7 */ + 0x776F,0x777E,0x7783,0x78B2,0x78AA,0x78B4,0x78AD,0x78A8,/* 0xE8-0xEF */ + 0x787E,0x78AB,0x789E,0x78A5,0x78A0,0x78AC,0x78A2,0x78A4,/* 0xF0-0xF7 */ + 0x7998,0x798A,0x798B,0x7996,0x7995,0x7994,0x7993,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7997,0x7988,0x7992,0x7990,0x7A2B,0x7A4A,0x7A30,0x7A2F,/* 0x40-0x47 */ + 0x7A28,0x7A26,0x7AA8,0x7AAB,0x7AAC,0x7AEE,0x7B88,0x7B9C,/* 0x48-0x4F */ + 0x7B8A,0x7B91,0x7B90,0x7B96,0x7B8D,0x7B8C,0x7B9B,0x7B8E,/* 0x50-0x57 */ + 0x7B85,0x7B98,0x5284,0x7B99,0x7BA4,0x7B82,0x7CBB,0x7CBF,/* 0x58-0x5F */ + 0x7CBC,0x7CBA,0x7DA7,0x7DB7,0x7DC2,0x7DA3,0x7DAA,0x7DC1,/* 0x60-0x67 */ + 0x7DC0,0x7DC5,0x7D9D,0x7DCE,0x7DC4,0x7DC6,0x7DCB,0x7DCC,/* 0x68-0x6F */ + 0x7DAF,0x7DB9,0x7D96,0x7DBC,0x7D9F,0x7DA6,0x7DAE,0x7DA9,/* 0x70-0x77 */ + 0x7DA1,0x7DC9,0x7F73,0x7FE2,0x7FE3,0x7FE5,0x7FDE,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8024,0x805D,0x805C,0x8189,0x8186,0x8183,0x8187,/* 0xA0-0xA7 */ + 0x818D,0x818C,0x818B,0x8215,0x8497,0x84A4,0x84A1,0x849F,/* 0xA8-0xAF */ + 0x84BA,0x84CE,0x84C2,0x84AC,0x84AE,0x84AB,0x84B9,0x84B4,/* 0xB0-0xB7 */ + 0x84C1,0x84CD,0x84AA,0x849A,0x84B1,0x84D0,0x849D,0x84A7,/* 0xB8-0xBF */ + 0x84BB,0x84A2,0x8494,0x84C7,0x84CC,0x849B,0x84A9,0x84AF,/* 0xC0-0xC7 */ + 0x84A8,0x84D6,0x8498,0x84B6,0x84CF,0x84A0,0x84D7,0x84D4,/* 0xC8-0xCF */ + 0x84D2,0x84DB,0x84B0,0x8491,0x8661,0x8733,0x8723,0x8728,/* 0xD0-0xD7 */ + 0x876B,0x8740,0x872E,0x871E,0x8721,0x8719,0x871B,0x8743,/* 0xD8-0xDF */ + 0x872C,0x8741,0x873E,0x8746,0x8720,0x8732,0x872A,0x872D,/* 0xE0-0xE7 */ + 0x873C,0x8712,0x873A,0x8731,0x8735,0x8742,0x8726,0x8727,/* 0xE8-0xEF */ + 0x8738,0x8724,0x871A,0x8730,0x8711,0x88F7,0x88E7,0x88F1,/* 0xF0-0xF7 */ + 0x88F2,0x88FA,0x88FE,0x88EE,0x88FC,0x88F6,0x88FB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x88F0,0x88EC,0x88EB,0x899D,0x89A1,0x899F,0x899E,0x89E9,/* 0x40-0x47 */ + 0x89EB,0x89E8,0x8AAB,0x8A99,0x8A8B,0x8A92,0x8A8F,0x8A96,/* 0x48-0x4F */ + 0x8C3D,0x8C68,0x8C69,0x8CD5,0x8CCF,0x8CD7,0x8D96,0x8E09,/* 0x50-0x57 */ + 0x8E02,0x8DFF,0x8E0D,0x8DFD,0x8E0A,0x8E03,0x8E07,0x8E06,/* 0x58-0x5F */ + 0x8E05,0x8DFE,0x8E00,0x8E04,0x8F10,0x8F11,0x8F0E,0x8F0D,/* 0x60-0x67 */ + 0x9123,0x911C,0x9120,0x9122,0x911F,0x911D,0x911A,0x9124,/* 0x68-0x6F */ + 0x9121,0x911B,0x917A,0x9172,0x9179,0x9173,0x92A5,0x92A4,/* 0x70-0x77 */ + 0x9276,0x929B,0x927A,0x92A0,0x9294,0x92AA,0x928D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x92A6,0x929A,0x92AB,0x9279,0x9297,0x927F,0x92A3,/* 0xA0-0xA7 */ + 0x92EE,0x928E,0x9282,0x9295,0x92A2,0x927D,0x9288,0x92A1,/* 0xA8-0xAF */ + 0x928A,0x9286,0x928C,0x9299,0x92A7,0x927E,0x9287,0x92A9,/* 0xB0-0xB7 */ + 0x929D,0x928B,0x922D,0x969E,0x96A1,0x96FF,0x9758,0x977D,/* 0xB8-0xBF */ + 0x977A,0x977E,0x9783,0x9780,0x9782,0x977B,0x9784,0x9781,/* 0xC0-0xC7 */ + 0x977F,0x97CE,0x97CD,0x9816,0x98AD,0x98AE,0x9902,0x9900,/* 0xC8-0xCF */ + 0x9907,0x999D,0x999C,0x99C3,0x99B9,0x99BB,0x99BA,0x99C2,/* 0xD0-0xD7 */ + 0x99BD,0x99C7,0x9AB1,0x9AE3,0x9AE7,0x9B3E,0x9B3F,0x9B60,/* 0xD8-0xDF */ + 0x9B61,0x9B5F,0x9CF1,0x9CF2,0x9CF5,0x9EA7,0x50FF,0x5103,/* 0xE0-0xE7 */ + 0x5130,0x50F8,0x5106,0x5107,0x50F6,0x50FE,0x510B,0x510C,/* 0xE8-0xEF */ + 0x50FD,0x510A,0x528B,0x528C,0x52F1,0x52EF,0x5648,0x5642,/* 0xF0-0xF7 */ + 0x564C,0x5635,0x5641,0x564A,0x5649,0x5646,0x5658,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x565A,0x5640,0x5633,0x563D,0x562C,0x563E,0x5638,0x562A,/* 0x40-0x47 */ + 0x563A,0x571A,0x58AB,0x589D,0x58B1,0x58A0,0x58A3,0x58AF,/* 0x48-0x4F */ + 0x58AC,0x58A5,0x58A1,0x58FF,0x5AFF,0x5AF4,0x5AFD,0x5AF7,/* 0x50-0x57 */ + 0x5AF6,0x5B03,0x5AF8,0x5B02,0x5AF9,0x5B01,0x5B07,0x5B05,/* 0x58-0x5F */ + 0x5B0F,0x5C67,0x5D99,0x5D97,0x5D9F,0x5D92,0x5DA2,0x5D93,/* 0x60-0x67 */ + 0x5D95,0x5DA0,0x5D9C,0x5DA1,0x5D9A,0x5D9E,0x5E69,0x5E5D,/* 0x68-0x6F */ + 0x5E60,0x5E5C,0x7DF3,0x5EDB,0x5EDE,0x5EE1,0x5F49,0x5FB2,/* 0x70-0x77 */ + 0x618B,0x6183,0x6179,0x61B1,0x61B0,0x61A2,0x6189,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x619B,0x6193,0x61AF,0x61AD,0x619F,0x6192,0x61AA,/* 0xA0-0xA7 */ + 0x61A1,0x618D,0x6166,0x61B3,0x622D,0x646E,0x6470,0x6496,/* 0xA8-0xAF */ + 0x64A0,0x6485,0x6497,0x649C,0x648F,0x648B,0x648A,0x648C,/* 0xB0-0xB7 */ + 0x64A3,0x649F,0x6468,0x64B1,0x6498,0x6576,0x657A,0x6579,/* 0xB8-0xBF */ + 0x657B,0x65B2,0x65B3,0x66B5,0x66B0,0x66A9,0x66B2,0x66B7,/* 0xC0-0xC7 */ + 0x66AA,0x66AF,0x6A00,0x6A06,0x6A17,0x69E5,0x69F8,0x6A15,/* 0xC8-0xCF */ + 0x69F1,0x69E4,0x6A20,0x69FF,0x69EC,0x69E2,0x6A1B,0x6A1D,/* 0xD0-0xD7 */ + 0x69FE,0x6A27,0x69F2,0x69EE,0x6A14,0x69F7,0x69E7,0x6A40,/* 0xD8-0xDF */ + 0x6A08,0x69E6,0x69FB,0x6A0D,0x69FC,0x69EB,0x6A09,0x6A04,/* 0xE0-0xE7 */ + 0x6A18,0x6A25,0x6A0F,0x69F6,0x6A26,0x6A07,0x69F4,0x6A16,/* 0xE8-0xEF */ + 0x6B51,0x6BA5,0x6BA3,0x6BA2,0x6BA6,0x6C01,0x6C00,0x6BFF,/* 0xF0-0xF7 */ + 0x6C02,0x6F41,0x6F26,0x6F7E,0x6F87,0x6FC6,0x6F92,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6F8D,0x6F89,0x6F8C,0x6F62,0x6F4F,0x6F85,0x6F5A,0x6F96,/* 0x40-0x47 */ + 0x6F76,0x6F6C,0x6F82,0x6F55,0x6F72,0x6F52,0x6F50,0x6F57,/* 0x48-0x4F */ + 0x6F94,0x6F93,0x6F5D,0x6F00,0x6F61,0x6F6B,0x6F7D,0x6F67,/* 0x50-0x57 */ + 0x6F90,0x6F53,0x6F8B,0x6F69,0x6F7F,0x6F95,0x6F63,0x6F77,/* 0x58-0x5F */ + 0x6F6A,0x6F7B,0x71B2,0x71AF,0x719B,0x71B0,0x71A0,0x719A,/* 0x60-0x67 */ + 0x71A9,0x71B5,0x719D,0x71A5,0x719E,0x71A4,0x71A1,0x71AA,/* 0x68-0x6F */ + 0x719C,0x71A7,0x71B3,0x7298,0x729A,0x7358,0x7352,0x735E,/* 0x70-0x77 */ + 0x735F,0x7360,0x735D,0x735B,0x7361,0x735A,0x7359,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7362,0x7487,0x7489,0x748A,0x7486,0x7481,0x747D,/* 0xA0-0xA7 */ + 0x7485,0x7488,0x747C,0x7479,0x7508,0x7507,0x757E,0x7625,/* 0xA8-0xAF */ + 0x761E,0x7619,0x761D,0x761C,0x7623,0x761A,0x7628,0x761B,/* 0xB0-0xB7 */ + 0x769C,0x769D,0x769E,0x769B,0x778D,0x778F,0x7789,0x7788,/* 0xB8-0xBF */ + 0x78CD,0x78BB,0x78CF,0x78CC,0x78D1,0x78CE,0x78D4,0x78C8,/* 0xC0-0xC7 */ + 0x78C3,0x78C4,0x78C9,0x799A,0x79A1,0x79A0,0x799C,0x79A2,/* 0xC8-0xCF */ + 0x799B,0x6B76,0x7A39,0x7AB2,0x7AB4,0x7AB3,0x7BB7,0x7BCB,/* 0xD0-0xD7 */ + 0x7BBE,0x7BAC,0x7BCE,0x7BAF,0x7BB9,0x7BCA,0x7BB5,0x7CC5,/* 0xD8-0xDF */ + 0x7CC8,0x7CCC,0x7CCB,0x7DF7,0x7DDB,0x7DEA,0x7DE7,0x7DD7,/* 0xE0-0xE7 */ + 0x7DE1,0x7E03,0x7DFA,0x7DE6,0x7DF6,0x7DF1,0x7DF0,0x7DEE,/* 0xE8-0xEF */ + 0x7DDF,0x7F76,0x7FAC,0x7FB0,0x7FAD,0x7FED,0x7FEB,0x7FEA,/* 0xF0-0xF7 */ + 0x7FEC,0x7FE6,0x7FE8,0x8064,0x8067,0x81A3,0x819F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x819E,0x8195,0x81A2,0x8199,0x8197,0x8216,0x824F,0x8253,/* 0x40-0x47 */ + 0x8252,0x8250,0x824E,0x8251,0x8524,0x853B,0x850F,0x8500,/* 0x48-0x4F */ + 0x8529,0x850E,0x8509,0x850D,0x851F,0x850A,0x8527,0x851C,/* 0x50-0x57 */ + 0x84FB,0x852B,0x84FA,0x8508,0x850C,0x84F4,0x852A,0x84F2,/* 0x58-0x5F */ + 0x8515,0x84F7,0x84EB,0x84F3,0x84FC,0x8512,0x84EA,0x84E9,/* 0x60-0x67 */ + 0x8516,0x84FE,0x8528,0x851D,0x852E,0x8502,0x84FD,0x851E,/* 0x68-0x6F */ + 0x84F6,0x8531,0x8526,0x84E7,0x84E8,0x84F0,0x84EF,0x84F9,/* 0x70-0x77 */ + 0x8518,0x8520,0x8530,0x850B,0x8519,0x852F,0x8662,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8756,0x8763,0x8764,0x8777,0x87E1,0x8773,0x8758,/* 0xA0-0xA7 */ + 0x8754,0x875B,0x8752,0x8761,0x875A,0x8751,0x875E,0x876D,/* 0xA8-0xAF */ + 0x876A,0x8750,0x874E,0x875F,0x875D,0x876F,0x876C,0x877A,/* 0xB0-0xB7 */ + 0x876E,0x875C,0x8765,0x874F,0x877B,0x8775,0x8762,0x8767,/* 0xB8-0xBF */ + 0x8769,0x885A,0x8905,0x890C,0x8914,0x890B,0x8917,0x8918,/* 0xC0-0xC7 */ + 0x8919,0x8906,0x8916,0x8911,0x890E,0x8909,0x89A2,0x89A4,/* 0xC8-0xCF */ + 0x89A3,0x89ED,0x89F0,0x89EC,0x8ACF,0x8AC6,0x8AB8,0x8AD3,/* 0xD0-0xD7 */ + 0x8AD1,0x8AD4,0x8AD5,0x8ABB,0x8AD7,0x8ABE,0x8AC0,0x8AC5,/* 0xD8-0xDF */ + 0x8AD8,0x8AC3,0x8ABA,0x8ABD,0x8AD9,0x8C3E,0x8C4D,0x8C8F,/* 0xE0-0xE7 */ + 0x8CE5,0x8CDF,0x8CD9,0x8CE8,0x8CDA,0x8CDD,0x8CE7,0x8DA0,/* 0xE8-0xEF */ + 0x8D9C,0x8DA1,0x8D9B,0x8E20,0x8E23,0x8E25,0x8E24,0x8E2E,/* 0xF0-0xF7 */ + 0x8E15,0x8E1B,0x8E16,0x8E11,0x8E19,0x8E26,0x8E27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E14,0x8E12,0x8E18,0x8E13,0x8E1C,0x8E17,0x8E1A,0x8F2C,/* 0x40-0x47 */ + 0x8F24,0x8F18,0x8F1A,0x8F20,0x8F23,0x8F16,0x8F17,0x9073,/* 0x48-0x4F */ + 0x9070,0x906F,0x9067,0x906B,0x912F,0x912B,0x9129,0x912A,/* 0x50-0x57 */ + 0x9132,0x9126,0x912E,0x9185,0x9186,0x918A,0x9181,0x9182,/* 0x58-0x5F */ + 0x9184,0x9180,0x92D0,0x92C3,0x92C4,0x92C0,0x92D9,0x92B6,/* 0x60-0x67 */ + 0x92CF,0x92F1,0x92DF,0x92D8,0x92E9,0x92D7,0x92DD,0x92CC,/* 0x68-0x6F */ + 0x92EF,0x92C2,0x92E8,0x92CA,0x92C8,0x92CE,0x92E6,0x92CD,/* 0x70-0x77 */ + 0x92D5,0x92C9,0x92E0,0x92DE,0x92E7,0x92D1,0x92D3,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x92B5,0x92E1,0x92C6,0x92B4,0x957C,0x95AC,0x95AB,/* 0xA0-0xA7 */ + 0x95AE,0x95B0,0x96A4,0x96A2,0x96D3,0x9705,0x9708,0x9702,/* 0xA8-0xAF */ + 0x975A,0x978A,0x978E,0x9788,0x97D0,0x97CF,0x981E,0x981D,/* 0xB0-0xB7 */ + 0x9826,0x9829,0x9828,0x9820,0x981B,0x9827,0x98B2,0x9908,/* 0xB8-0xBF */ + 0x98FA,0x9911,0x9914,0x9916,0x9917,0x9915,0x99DC,0x99CD,/* 0xC0-0xC7 */ + 0x99CF,0x99D3,0x99D4,0x99CE,0x99C9,0x99D6,0x99D8,0x99CB,/* 0xC8-0xCF */ + 0x99D7,0x99CC,0x9AB3,0x9AEC,0x9AEB,0x9AF3,0x9AF2,0x9AF1,/* 0xD0-0xD7 */ + 0x9B46,0x9B43,0x9B67,0x9B74,0x9B71,0x9B66,0x9B76,0x9B75,/* 0xD8-0xDF */ + 0x9B70,0x9B68,0x9B64,0x9B6C,0x9CFC,0x9CFA,0x9CFD,0x9CFF,/* 0xE0-0xE7 */ + 0x9CF7,0x9D07,0x9D00,0x9CF9,0x9CFB,0x9D08,0x9D05,0x9D04,/* 0xE8-0xEF */ + 0x9E83,0x9ED3,0x9F0F,0x9F10,0x511C,0x5113,0x5117,0x511A,/* 0xF0-0xF7 */ + 0x5111,0x51DE,0x5334,0x53E1,0x5670,0x5660,0x566E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5673,0x5666,0x5663,0x566D,0x5672,0x565E,0x5677,0x571C,/* 0x40-0x47 */ + 0x571B,0x58C8,0x58BD,0x58C9,0x58BF,0x58BA,0x58C2,0x58BC,/* 0x48-0x4F */ + 0x58C6,0x5B17,0x5B19,0x5B1B,0x5B21,0x5B14,0x5B13,0x5B10,/* 0x50-0x57 */ + 0x5B16,0x5B28,0x5B1A,0x5B20,0x5B1E,0x5BEF,0x5DAC,0x5DB1,/* 0x58-0x5F */ + 0x5DA9,0x5DA7,0x5DB5,0x5DB0,0x5DAE,0x5DAA,0x5DA8,0x5DB2,/* 0x60-0x67 */ + 0x5DAD,0x5DAF,0x5DB4,0x5E67,0x5E68,0x5E66,0x5E6F,0x5EE9,/* 0x68-0x6F */ + 0x5EE7,0x5EE6,0x5EE8,0x5EE5,0x5F4B,0x5FBC,0x619D,0x61A8,/* 0x70-0x77 */ + 0x6196,0x61C5,0x61B4,0x61C6,0x61C1,0x61CC,0x61BA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x61BF,0x61B8,0x618C,0x64D7,0x64D6,0x64D0,0x64CF,/* 0xA0-0xA7 */ + 0x64C9,0x64BD,0x6489,0x64C3,0x64DB,0x64F3,0x64D9,0x6533,/* 0xA8-0xAF */ + 0x657F,0x657C,0x65A2,0x66C8,0x66BE,0x66C0,0x66CA,0x66CB,/* 0xB0-0xB7 */ + 0x66CF,0x66BD,0x66BB,0x66BA,0x66CC,0x6723,0x6A34,0x6A66,/* 0xB8-0xBF */ + 0x6A49,0x6A67,0x6A32,0x6A68,0x6A3E,0x6A5D,0x6A6D,0x6A76,/* 0xC0-0xC7 */ + 0x6A5B,0x6A51,0x6A28,0x6A5A,0x6A3B,0x6A3F,0x6A41,0x6A6A,/* 0xC8-0xCF */ + 0x6A64,0x6A50,0x6A4F,0x6A54,0x6A6F,0x6A69,0x6A60,0x6A3C,/* 0xD0-0xD7 */ + 0x6A5E,0x6A56,0x6A55,0x6A4D,0x6A4E,0x6A46,0x6B55,0x6B54,/* 0xD8-0xDF */ + 0x6B56,0x6BA7,0x6BAA,0x6BAB,0x6BC8,0x6BC7,0x6C04,0x6C03,/* 0xE0-0xE7 */ + 0x6C06,0x6FAD,0x6FCB,0x6FA3,0x6FC7,0x6FBC,0x6FCE,0x6FC8,/* 0xE8-0xEF */ + 0x6F5E,0x6FC4,0x6FBD,0x6F9E,0x6FCA,0x6FA8,0x7004,0x6FA5,/* 0xF0-0xF7 */ + 0x6FAE,0x6FBA,0x6FAC,0x6FAA,0x6FCF,0x6FBF,0x6FB8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FA2,0x6FC9,0x6FAB,0x6FCD,0x6FAF,0x6FB2,0x6FB0,0x71C5,/* 0x40-0x47 */ + 0x71C2,0x71BF,0x71B8,0x71D6,0x71C0,0x71C1,0x71CB,0x71D4,/* 0x48-0x4F */ + 0x71CA,0x71C7,0x71CF,0x71BD,0x71D8,0x71BC,0x71C6,0x71DA,/* 0x50-0x57 */ + 0x71DB,0x729D,0x729E,0x7369,0x7366,0x7367,0x736C,0x7365,/* 0x58-0x5F */ + 0x736B,0x736A,0x747F,0x749A,0x74A0,0x7494,0x7492,0x7495,/* 0x60-0x67 */ + 0x74A1,0x750B,0x7580,0x762F,0x762D,0x7631,0x763D,0x7633,/* 0x68-0x6F */ + 0x763C,0x7635,0x7632,0x7630,0x76BB,0x76E6,0x779A,0x779D,/* 0x70-0x77 */ + 0x77A1,0x779C,0x779B,0x77A2,0x77A3,0x7795,0x7799,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7797,0x78DD,0x78E9,0x78E5,0x78EA,0x78DE,0x78E3,/* 0xA0-0xA7 */ + 0x78DB,0x78E1,0x78E2,0x78ED,0x78DF,0x78E0,0x79A4,0x7A44,/* 0xA8-0xAF */ + 0x7A48,0x7A47,0x7AB6,0x7AB8,0x7AB5,0x7AB1,0x7AB7,0x7BDE,/* 0xB0-0xB7 */ + 0x7BE3,0x7BE7,0x7BDD,0x7BD5,0x7BE5,0x7BDA,0x7BE8,0x7BF9,/* 0xB8-0xBF */ + 0x7BD4,0x7BEA,0x7BE2,0x7BDC,0x7BEB,0x7BD8,0x7BDF,0x7CD2,/* 0xC0-0xC7 */ + 0x7CD4,0x7CD7,0x7CD0,0x7CD1,0x7E12,0x7E21,0x7E17,0x7E0C,/* 0xC8-0xCF */ + 0x7E1F,0x7E20,0x7E13,0x7E0E,0x7E1C,0x7E15,0x7E1A,0x7E22,/* 0xD0-0xD7 */ + 0x7E0B,0x7E0F,0x7E16,0x7E0D,0x7E14,0x7E25,0x7E24,0x7F43,/* 0xD8-0xDF */ + 0x7F7B,0x7F7C,0x7F7A,0x7FB1,0x7FEF,0x802A,0x8029,0x806C,/* 0xE0-0xE7 */ + 0x81B1,0x81A6,0x81AE,0x81B9,0x81B5,0x81AB,0x81B0,0x81AC,/* 0xE8-0xEF */ + 0x81B4,0x81B2,0x81B7,0x81A7,0x81F2,0x8255,0x8256,0x8257,/* 0xF0-0xF7 */ + 0x8556,0x8545,0x856B,0x854D,0x8553,0x8561,0x8558,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8540,0x8546,0x8564,0x8541,0x8562,0x8544,0x8551,0x8547,/* 0x40-0x47 */ + 0x8563,0x853E,0x855B,0x8571,0x854E,0x856E,0x8575,0x8555,/* 0x48-0x4F */ + 0x8567,0x8560,0x858C,0x8566,0x855D,0x8554,0x8565,0x856C,/* 0x50-0x57 */ + 0x8663,0x8665,0x8664,0x879B,0x878F,0x8797,0x8793,0x8792,/* 0x58-0x5F */ + 0x8788,0x8781,0x8796,0x8798,0x8779,0x8787,0x87A3,0x8785,/* 0x60-0x67 */ + 0x8790,0x8791,0x879D,0x8784,0x8794,0x879C,0x879A,0x8789,/* 0x68-0x6F */ + 0x891E,0x8926,0x8930,0x892D,0x892E,0x8927,0x8931,0x8922,/* 0x70-0x77 */ + 0x8929,0x8923,0x892F,0x892C,0x891F,0x89F1,0x8AE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8AE2,0x8AF2,0x8AF4,0x8AF5,0x8ADD,0x8B14,0x8AE4,/* 0xA0-0xA7 */ + 0x8ADF,0x8AF0,0x8AC8,0x8ADE,0x8AE1,0x8AE8,0x8AFF,0x8AEF,/* 0xA8-0xAF */ + 0x8AFB,0x8C91,0x8C92,0x8C90,0x8CF5,0x8CEE,0x8CF1,0x8CF0,/* 0xB0-0xB7 */ + 0x8CF3,0x8D6C,0x8D6E,0x8DA5,0x8DA7,0x8E33,0x8E3E,0x8E38,/* 0xB8-0xBF */ + 0x8E40,0x8E45,0x8E36,0x8E3C,0x8E3D,0x8E41,0x8E30,0x8E3F,/* 0xC0-0xC7 */ + 0x8EBD,0x8F36,0x8F2E,0x8F35,0x8F32,0x8F39,0x8F37,0x8F34,/* 0xC8-0xCF */ + 0x9076,0x9079,0x907B,0x9086,0x90FA,0x9133,0x9135,0x9136,/* 0xD0-0xD7 */ + 0x9193,0x9190,0x9191,0x918D,0x918F,0x9327,0x931E,0x9308,/* 0xD8-0xDF */ + 0x931F,0x9306,0x930F,0x937A,0x9338,0x933C,0x931B,0x9323,/* 0xE0-0xE7 */ + 0x9312,0x9301,0x9346,0x932D,0x930E,0x930D,0x92CB,0x931D,/* 0xE8-0xEF */ + 0x92FA,0x9325,0x9313,0x92F9,0x92F7,0x9334,0x9302,0x9324,/* 0xF0-0xF7 */ + 0x92FF,0x9329,0x9339,0x9335,0x932A,0x9314,0x930C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x930B,0x92FE,0x9309,0x9300,0x92FB,0x9316,0x95BC,0x95CD,/* 0x40-0x47 */ + 0x95BE,0x95B9,0x95BA,0x95B6,0x95BF,0x95B5,0x95BD,0x96A9,/* 0x48-0x4F */ + 0x96D4,0x970B,0x9712,0x9710,0x9799,0x9797,0x9794,0x97F0,/* 0x50-0x57 */ + 0x97F8,0x9835,0x982F,0x9832,0x9924,0x991F,0x9927,0x9929,/* 0x58-0x5F */ + 0x999E,0x99EE,0x99EC,0x99E5,0x99E4,0x99F0,0x99E3,0x99EA,/* 0x60-0x67 */ + 0x99E9,0x99E7,0x9AB9,0x9ABF,0x9AB4,0x9ABB,0x9AF6,0x9AFA,/* 0x68-0x6F */ + 0x9AF9,0x9AF7,0x9B33,0x9B80,0x9B85,0x9B87,0x9B7C,0x9B7E,/* 0x70-0x77 */ + 0x9B7B,0x9B82,0x9B93,0x9B92,0x9B90,0x9B7A,0x9B95,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9B7D,0x9B88,0x9D25,0x9D17,0x9D20,0x9D1E,0x9D14,/* 0xA0-0xA7 */ + 0x9D29,0x9D1D,0x9D18,0x9D22,0x9D10,0x9D19,0x9D1F,0x9E88,/* 0xA8-0xAF */ + 0x9E86,0x9E87,0x9EAE,0x9EAD,0x9ED5,0x9ED6,0x9EFA,0x9F12,/* 0xB0-0xB7 */ + 0x9F3D,0x5126,0x5125,0x5122,0x5124,0x5120,0x5129,0x52F4,/* 0xB8-0xBF */ + 0x5693,0x568C,0x568D,0x5686,0x5684,0x5683,0x567E,0x5682,/* 0xC0-0xC7 */ + 0x567F,0x5681,0x58D6,0x58D4,0x58CF,0x58D2,0x5B2D,0x5B25,/* 0xC8-0xCF */ + 0x5B32,0x5B23,0x5B2C,0x5B27,0x5B26,0x5B2F,0x5B2E,0x5B7B,/* 0xD0-0xD7 */ + 0x5BF1,0x5BF2,0x5DB7,0x5E6C,0x5E6A,0x5FBE,0x5FBB,0x61C3,/* 0xD8-0xDF */ + 0x61B5,0x61BC,0x61E7,0x61E0,0x61E5,0x61E4,0x61E8,0x61DE,/* 0xE0-0xE7 */ + 0x64EF,0x64E9,0x64E3,0x64EB,0x64E4,0x64E8,0x6581,0x6580,/* 0xE8-0xEF */ + 0x65B6,0x65DA,0x66D2,0x6A8D,0x6A96,0x6A81,0x6AA5,0x6A89,/* 0xF0-0xF7 */ + 0x6A9F,0x6A9B,0x6AA1,0x6A9E,0x6A87,0x6A93,0x6A8E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A95,0x6A83,0x6AA8,0x6AA4,0x6A91,0x6A7F,0x6AA6,0x6A9A,/* 0x40-0x47 */ + 0x6A85,0x6A8C,0x6A92,0x6B5B,0x6BAD,0x6C09,0x6FCC,0x6FA9,/* 0x48-0x4F */ + 0x6FF4,0x6FD4,0x6FE3,0x6FDC,0x6FED,0x6FE7,0x6FE6,0x6FDE,/* 0x50-0x57 */ + 0x6FF2,0x6FDD,0x6FE2,0x6FE8,0x71E1,0x71F1,0x71E8,0x71F2,/* 0x58-0x5F */ + 0x71E4,0x71F0,0x71E2,0x7373,0x736E,0x736F,0x7497,0x74B2,/* 0x60-0x67 */ + 0x74AB,0x7490,0x74AA,0x74AD,0x74B1,0x74A5,0x74AF,0x7510,/* 0x68-0x6F */ + 0x7511,0x7512,0x750F,0x7584,0x7643,0x7648,0x7649,0x7647,/* 0x70-0x77 */ + 0x76A4,0x76E9,0x77B5,0x77AB,0x77B2,0x77B7,0x77B6,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x77B4,0x77B1,0x77A8,0x77F0,0x78F3,0x78FD,0x7902,/* 0xA0-0xA7 */ + 0x78FB,0x78FC,0x78F2,0x7905,0x78F9,0x78FE,0x7904,0x79AB,/* 0xA8-0xAF */ + 0x79A8,0x7A5C,0x7A5B,0x7A56,0x7A58,0x7A54,0x7A5A,0x7ABE,/* 0xB0-0xB7 */ + 0x7AC0,0x7AC1,0x7C05,0x7C0F,0x7BF2,0x7C00,0x7BFF,0x7BFB,/* 0xB8-0xBF */ + 0x7C0E,0x7BF4,0x7C0B,0x7BF3,0x7C02,0x7C09,0x7C03,0x7C01,/* 0xC0-0xC7 */ + 0x7BF8,0x7BFD,0x7C06,0x7BF0,0x7BF1,0x7C10,0x7C0A,0x7CE8,/* 0xC8-0xCF */ + 0x7E2D,0x7E3C,0x7E42,0x7E33,0x9848,0x7E38,0x7E2A,0x7E49,/* 0xD0-0xD7 */ + 0x7E40,0x7E47,0x7E29,0x7E4C,0x7E30,0x7E3B,0x7E36,0x7E44,/* 0xD8-0xDF */ + 0x7E3A,0x7F45,0x7F7F,0x7F7E,0x7F7D,0x7FF4,0x7FF2,0x802C,/* 0xE0-0xE7 */ + 0x81BB,0x81C4,0x81CC,0x81CA,0x81C5,0x81C7,0x81BC,0x81E9,/* 0xE8-0xEF */ + 0x825B,0x825A,0x825C,0x8583,0x8580,0x858F,0x85A7,0x8595,/* 0xF0-0xF7 */ + 0x85A0,0x858B,0x85A3,0x857B,0x85A4,0x859A,0x859E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8577,0x857C,0x8589,0x85A1,0x857A,0x8578,0x8557,0x858E,/* 0x40-0x47 */ + 0x8596,0x8586,0x858D,0x8599,0x859D,0x8581,0x85A2,0x8582,/* 0x48-0x4F */ + 0x8588,0x8585,0x8579,0x8576,0x8598,0x8590,0x859F,0x8668,/* 0x50-0x57 */ + 0x87BE,0x87AA,0x87AD,0x87C5,0x87B0,0x87AC,0x87B9,0x87B5,/* 0x58-0x5F */ + 0x87BC,0x87AE,0x87C9,0x87C3,0x87C2,0x87CC,0x87B7,0x87AF,/* 0x60-0x67 */ + 0x87C4,0x87CA,0x87B4,0x87B6,0x87BF,0x87B8,0x87BD,0x87DE,/* 0x68-0x6F */ + 0x87B2,0x8935,0x8933,0x893C,0x893E,0x8941,0x8952,0x8937,/* 0x70-0x77 */ + 0x8942,0x89AD,0x89AF,0x89AE,0x89F2,0x89F3,0x8B1E,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8B18,0x8B16,0x8B11,0x8B05,0x8B0B,0x8B22,0x8B0F,/* 0xA0-0xA7 */ + 0x8B12,0x8B15,0x8B07,0x8B0D,0x8B08,0x8B06,0x8B1C,0x8B13,/* 0xA8-0xAF */ + 0x8B1A,0x8C4F,0x8C70,0x8C72,0x8C71,0x8C6F,0x8C95,0x8C94,/* 0xB0-0xB7 */ + 0x8CF9,0x8D6F,0x8E4E,0x8E4D,0x8E53,0x8E50,0x8E4C,0x8E47,/* 0xB8-0xBF */ + 0x8F43,0x8F40,0x9085,0x907E,0x9138,0x919A,0x91A2,0x919B,/* 0xC0-0xC7 */ + 0x9199,0x919F,0x91A1,0x919D,0x91A0,0x93A1,0x9383,0x93AF,/* 0xC8-0xCF */ + 0x9364,0x9356,0x9347,0x937C,0x9358,0x935C,0x9376,0x9349,/* 0xD0-0xD7 */ + 0x9350,0x9351,0x9360,0x936D,0x938F,0x934C,0x936A,0x9379,/* 0xD8-0xDF */ + 0x9357,0x9355,0x9352,0x934F,0x9371,0x9377,0x937B,0x9361,/* 0xE0-0xE7 */ + 0x935E,0x9363,0x9367,0x9380,0x934E,0x9359,0x95C7,0x95C0,/* 0xE8-0xEF */ + 0x95C9,0x95C3,0x95C5,0x95B7,0x96AE,0x96B0,0x96AC,0x9720,/* 0xF0-0xF7 */ + 0x971F,0x9718,0x971D,0x9719,0x979A,0x97A1,0x979C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x979E,0x979D,0x97D5,0x97D4,0x97F1,0x9841,0x9844,0x984A,/* 0x40-0x47 */ + 0x9849,0x9845,0x9843,0x9925,0x992B,0x992C,0x992A,0x9933,/* 0x48-0x4F */ + 0x9932,0x992F,0x992D,0x9931,0x9930,0x9998,0x99A3,0x99A1,/* 0x50-0x57 */ + 0x9A02,0x99FA,0x99F4,0x99F7,0x99F9,0x99F8,0x99F6,0x99FB,/* 0x58-0x5F */ + 0x99FD,0x99FE,0x99FC,0x9A03,0x9ABE,0x9AFE,0x9AFD,0x9B01,/* 0x60-0x67 */ + 0x9AFC,0x9B48,0x9B9A,0x9BA8,0x9B9E,0x9B9B,0x9BA6,0x9BA1,/* 0x68-0x6F */ + 0x9BA5,0x9BA4,0x9B86,0x9BA2,0x9BA0,0x9BAF,0x9D33,0x9D41,/* 0x70-0x77 */ + 0x9D67,0x9D36,0x9D2E,0x9D2F,0x9D31,0x9D38,0x9D30,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9D45,0x9D42,0x9D43,0x9D3E,0x9D37,0x9D40,0x9D3D,/* 0xA0-0xA7 */ + 0x7FF5,0x9D2D,0x9E8A,0x9E89,0x9E8D,0x9EB0,0x9EC8,0x9EDA,/* 0xA8-0xAF */ + 0x9EFB,0x9EFF,0x9F24,0x9F23,0x9F22,0x9F54,0x9FA0,0x5131,/* 0xB0-0xB7 */ + 0x512D,0x512E,0x5698,0x569C,0x5697,0x569A,0x569D,0x5699,/* 0xB8-0xBF */ + 0x5970,0x5B3C,0x5C69,0x5C6A,0x5DC0,0x5E6D,0x5E6E,0x61D8,/* 0xC0-0xC7 */ + 0x61DF,0x61ED,0x61EE,0x61F1,0x61EA,0x61F0,0x61EB,0x61D6,/* 0xC8-0xCF */ + 0x61E9,0x64FF,0x6504,0x64FD,0x64F8,0x6501,0x6503,0x64FC,/* 0xD0-0xD7 */ + 0x6594,0x65DB,0x66DA,0x66DB,0x66D8,0x6AC5,0x6AB9,0x6ABD,/* 0xD8-0xDF */ + 0x6AE1,0x6AC6,0x6ABA,0x6AB6,0x6AB7,0x6AC7,0x6AB4,0x6AAD,/* 0xE0-0xE7 */ + 0x6B5E,0x6BC9,0x6C0B,0x7007,0x700C,0x700D,0x7001,0x7005,/* 0xE8-0xEF */ + 0x7014,0x700E,0x6FFF,0x7000,0x6FFB,0x7026,0x6FFC,0x6FF7,/* 0xF0-0xF7 */ + 0x700A,0x7201,0x71FF,0x71F9,0x7203,0x71FD,0x7376,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74B8,0x74C0,0x74B5,0x74C1,0x74BE,0x74B6,0x74BB,0x74C2,/* 0x40-0x47 */ + 0x7514,0x7513,0x765C,0x7664,0x7659,0x7650,0x7653,0x7657,/* 0x48-0x4F */ + 0x765A,0x76A6,0x76BD,0x76EC,0x77C2,0x77BA,0x78FF,0x790C,/* 0x50-0x57 */ + 0x7913,0x7914,0x7909,0x7910,0x7912,0x7911,0x79AD,0x79AC,/* 0x58-0x5F */ + 0x7A5F,0x7C1C,0x7C29,0x7C19,0x7C20,0x7C1F,0x7C2D,0x7C1D,/* 0x60-0x67 */ + 0x7C26,0x7C28,0x7C22,0x7C25,0x7C30,0x7E5C,0x7E50,0x7E56,/* 0x68-0x6F */ + 0x7E63,0x7E58,0x7E62,0x7E5F,0x7E51,0x7E60,0x7E57,0x7E53,/* 0x70-0x77 */ + 0x7FB5,0x7FB3,0x7FF7,0x7FF8,0x8075,0x81D1,0x81D2,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x81D0,0x825F,0x825E,0x85B4,0x85C6,0x85C0,0x85C3,/* 0xA0-0xA7 */ + 0x85C2,0x85B3,0x85B5,0x85BD,0x85C7,0x85C4,0x85BF,0x85CB,/* 0xA8-0xAF */ + 0x85CE,0x85C8,0x85C5,0x85B1,0x85B6,0x85D2,0x8624,0x85B8,/* 0xB0-0xB7 */ + 0x85B7,0x85BE,0x8669,0x87E7,0x87E6,0x87E2,0x87DB,0x87EB,/* 0xB8-0xBF */ + 0x87EA,0x87E5,0x87DF,0x87F3,0x87E4,0x87D4,0x87DC,0x87D3,/* 0xC0-0xC7 */ + 0x87ED,0x87D8,0x87E3,0x87A4,0x87D7,0x87D9,0x8801,0x87F4,/* 0xC8-0xCF */ + 0x87E8,0x87DD,0x8953,0x894B,0x894F,0x894C,0x8946,0x8950,/* 0xD0-0xD7 */ + 0x8951,0x8949,0x8B2A,0x8B27,0x8B23,0x8B33,0x8B30,0x8B35,/* 0xD8-0xDF */ + 0x8B47,0x8B2F,0x8B3C,0x8B3E,0x8B31,0x8B25,0x8B37,0x8B26,/* 0xE0-0xE7 */ + 0x8B36,0x8B2E,0x8B24,0x8B3B,0x8B3D,0x8B3A,0x8C42,0x8C75,/* 0xE8-0xEF */ + 0x8C99,0x8C98,0x8C97,0x8CFE,0x8D04,0x8D02,0x8D00,0x8E5C,/* 0xF0-0xF7 */ + 0x8E62,0x8E60,0x8E57,0x8E56,0x8E5E,0x8E65,0x8E67,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E5B,0x8E5A,0x8E61,0x8E5D,0x8E69,0x8E54,0x8F46,0x8F47,/* 0x40-0x47 */ + 0x8F48,0x8F4B,0x9128,0x913A,0x913B,0x913E,0x91A8,0x91A5,/* 0x48-0x4F */ + 0x91A7,0x91AF,0x91AA,0x93B5,0x938C,0x9392,0x93B7,0x939B,/* 0x50-0x57 */ + 0x939D,0x9389,0x93A7,0x938E,0x93AA,0x939E,0x93A6,0x9395,/* 0x58-0x5F */ + 0x9388,0x9399,0x939F,0x938D,0x93B1,0x9391,0x93B2,0x93A4,/* 0x60-0x67 */ + 0x93A8,0x93B4,0x93A3,0x93A5,0x95D2,0x95D3,0x95D1,0x96B3,/* 0x68-0x6F */ + 0x96D7,0x96DA,0x5DC2,0x96DF,0x96D8,0x96DD,0x9723,0x9722,/* 0x70-0x77 */ + 0x9725,0x97AC,0x97AE,0x97A8,0x97AB,0x97A4,0x97AA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x97A2,0x97A5,0x97D7,0x97D9,0x97D6,0x97D8,0x97FA,/* 0xA0-0xA7 */ + 0x9850,0x9851,0x9852,0x98B8,0x9941,0x993C,0x993A,0x9A0F,/* 0xA8-0xAF */ + 0x9A0B,0x9A09,0x9A0D,0x9A04,0x9A11,0x9A0A,0x9A05,0x9A07,/* 0xB0-0xB7 */ + 0x9A06,0x9AC0,0x9ADC,0x9B08,0x9B04,0x9B05,0x9B29,0x9B35,/* 0xB8-0xBF */ + 0x9B4A,0x9B4C,0x9B4B,0x9BC7,0x9BC6,0x9BC3,0x9BBF,0x9BC1,/* 0xC0-0xC7 */ + 0x9BB5,0x9BB8,0x9BD3,0x9BB6,0x9BC4,0x9BB9,0x9BBD,0x9D5C,/* 0xC8-0xCF */ + 0x9D53,0x9D4F,0x9D4A,0x9D5B,0x9D4B,0x9D59,0x9D56,0x9D4C,/* 0xD0-0xD7 */ + 0x9D57,0x9D52,0x9D54,0x9D5F,0x9D58,0x9D5A,0x9E8E,0x9E8C,/* 0xD8-0xDF */ + 0x9EDF,0x9F01,0x9F00,0x9F16,0x9F25,0x9F2B,0x9F2A,0x9F29,/* 0xE0-0xE7 */ + 0x9F28,0x9F4C,0x9F55,0x5134,0x5135,0x5296,0x52F7,0x53B4,/* 0xE8-0xEF */ + 0x56AB,0x56AD,0x56A6,0x56A7,0x56AA,0x56AC,0x58DA,0x58DD,/* 0xF0-0xF7 */ + 0x58DB,0x5912,0x5B3D,0x5B3E,0x5B3F,0x5DC3,0x5E70,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5FBF,0x61FB,0x6507,0x6510,0x650D,0x6509,0x650C,0x650E,/* 0x40-0x47 */ + 0x6584,0x65DE,0x65DD,0x66DE,0x6AE7,0x6AE0,0x6ACC,0x6AD1,/* 0x48-0x4F */ + 0x6AD9,0x6ACB,0x6ADF,0x6ADC,0x6AD0,0x6AEB,0x6ACF,0x6ACD,/* 0x50-0x57 */ + 0x6ADE,0x6B60,0x6BB0,0x6C0C,0x7019,0x7027,0x7020,0x7016,/* 0x58-0x5F */ + 0x702B,0x7021,0x7022,0x7023,0x7029,0x7017,0x7024,0x701C,/* 0x60-0x67 */ + 0x702A,0x720C,0x720A,0x7207,0x7202,0x7205,0x72A5,0x72A6,/* 0x68-0x6F */ + 0x72A4,0x72A3,0x72A1,0x74CB,0x74C5,0x74B7,0x74C3,0x7516,/* 0x70-0x77 */ + 0x7660,0x77C9,0x77CA,0x77C4,0x77F1,0x791D,0x791B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7921,0x791C,0x7917,0x791E,0x79B0,0x7A67,0x7A68,/* 0xA0-0xA7 */ + 0x7C33,0x7C3C,0x7C39,0x7C2C,0x7C3B,0x7CEC,0x7CEA,0x7E76,/* 0xA8-0xAF */ + 0x7E75,0x7E78,0x7E70,0x7E77,0x7E6F,0x7E7A,0x7E72,0x7E74,/* 0xB0-0xB7 */ + 0x7E68,0x7F4B,0x7F4A,0x7F83,0x7F86,0x7FB7,0x7FFD,0x7FFE,/* 0xB8-0xBF */ + 0x8078,0x81D7,0x81D5,0x8264,0x8261,0x8263,0x85EB,0x85F1,/* 0xC0-0xC7 */ + 0x85ED,0x85D9,0x85E1,0x85E8,0x85DA,0x85D7,0x85EC,0x85F2,/* 0xC8-0xCF */ + 0x85F8,0x85D8,0x85DF,0x85E3,0x85DC,0x85D1,0x85F0,0x85E6,/* 0xD0-0xD7 */ + 0x85EF,0x85DE,0x85E2,0x8800,0x87FA,0x8803,0x87F6,0x87F7,/* 0xD8-0xDF */ + 0x8809,0x880C,0x880B,0x8806,0x87FC,0x8808,0x87FF,0x880A,/* 0xE0-0xE7 */ + 0x8802,0x8962,0x895A,0x895B,0x8957,0x8961,0x895C,0x8958,/* 0xE8-0xEF */ + 0x895D,0x8959,0x8988,0x89B7,0x89B6,0x89F6,0x8B50,0x8B48,/* 0xF0-0xF7 */ + 0x8B4A,0x8B40,0x8B53,0x8B56,0x8B54,0x8B4B,0x8B55,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B51,0x8B42,0x8B52,0x8B57,0x8C43,0x8C77,0x8C76,0x8C9A,/* 0x40-0x47 */ + 0x8D06,0x8D07,0x8D09,0x8DAC,0x8DAA,0x8DAD,0x8DAB,0x8E6D,/* 0x48-0x4F */ + 0x8E78,0x8E73,0x8E6A,0x8E6F,0x8E7B,0x8EC2,0x8F52,0x8F51,/* 0x50-0x57 */ + 0x8F4F,0x8F50,0x8F53,0x8FB4,0x9140,0x913F,0x91B0,0x91AD,/* 0x58-0x5F */ + 0x93DE,0x93C7,0x93CF,0x93C2,0x93DA,0x93D0,0x93F9,0x93EC,/* 0x60-0x67 */ + 0x93CC,0x93D9,0x93A9,0x93E6,0x93CA,0x93D4,0x93EE,0x93E3,/* 0x68-0x6F */ + 0x93D5,0x93C4,0x93CE,0x93C0,0x93D2,0x93E7,0x957D,0x95DA,/* 0x70-0x77 */ + 0x95DB,0x96E1,0x9729,0x972B,0x972C,0x9728,0x9726,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x97B3,0x97B7,0x97B6,0x97DD,0x97DE,0x97DF,0x985C,/* 0xA0-0xA7 */ + 0x9859,0x985D,0x9857,0x98BF,0x98BD,0x98BB,0x98BE,0x9948,/* 0xA8-0xAF */ + 0x9947,0x9943,0x99A6,0x99A7,0x9A1A,0x9A15,0x9A25,0x9A1D,/* 0xB0-0xB7 */ + 0x9A24,0x9A1B,0x9A22,0x9A20,0x9A27,0x9A23,0x9A1E,0x9A1C,/* 0xB8-0xBF */ + 0x9A14,0x9AC2,0x9B0B,0x9B0A,0x9B0E,0x9B0C,0x9B37,0x9BEA,/* 0xC0-0xC7 */ + 0x9BEB,0x9BE0,0x9BDE,0x9BE4,0x9BE6,0x9BE2,0x9BF0,0x9BD4,/* 0xC8-0xCF */ + 0x9BD7,0x9BEC,0x9BDC,0x9BD9,0x9BE5,0x9BD5,0x9BE1,0x9BDA,/* 0xD0-0xD7 */ + 0x9D77,0x9D81,0x9D8A,0x9D84,0x9D88,0x9D71,0x9D80,0x9D78,/* 0xD8-0xDF */ + 0x9D86,0x9D8B,0x9D8C,0x9D7D,0x9D6B,0x9D74,0x9D75,0x9D70,/* 0xE0-0xE7 */ + 0x9D69,0x9D85,0x9D73,0x9D7B,0x9D82,0x9D6F,0x9D79,0x9D7F,/* 0xE8-0xEF */ + 0x9D87,0x9D68,0x9E94,0x9E91,0x9EC0,0x9EFC,0x9F2D,0x9F40,/* 0xF0-0xF7 */ + 0x9F41,0x9F4D,0x9F56,0x9F57,0x9F58,0x5337,0x56B2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x56B5,0x56B3,0x58E3,0x5B45,0x5DC6,0x5DC7,0x5EEE,0x5EEF,/* 0x40-0x47 */ + 0x5FC0,0x5FC1,0x61F9,0x6517,0x6516,0x6515,0x6513,0x65DF,/* 0x48-0x4F */ + 0x66E8,0x66E3,0x66E4,0x6AF3,0x6AF0,0x6AEA,0x6AE8,0x6AF9,/* 0x50-0x57 */ + 0x6AF1,0x6AEE,0x6AEF,0x703C,0x7035,0x702F,0x7037,0x7034,/* 0x58-0x5F */ + 0x7031,0x7042,0x7038,0x703F,0x703A,0x7039,0x7040,0x703B,/* 0x60-0x67 */ + 0x7033,0x7041,0x7213,0x7214,0x72A8,0x737D,0x737C,0x74BA,/* 0x68-0x6F */ + 0x76AB,0x76AA,0x76BE,0x76ED,0x77CC,0x77CE,0x77CF,0x77CD,/* 0x70-0x77 */ + 0x77F2,0x7925,0x7923,0x7927,0x7928,0x7924,0x7929,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79B2,0x7A6E,0x7A6C,0x7A6D,0x7AF7,0x7C49,0x7C48,/* 0xA0-0xA7 */ + 0x7C4A,0x7C47,0x7C45,0x7CEE,0x7E7B,0x7E7E,0x7E81,0x7E80,/* 0xA8-0xAF */ + 0x7FBA,0x7FFF,0x8079,0x81DB,0x81D9,0x820B,0x8268,0x8269,/* 0xB0-0xB7 */ + 0x8622,0x85FF,0x8601,0x85FE,0x861B,0x8600,0x85F6,0x8604,/* 0xB8-0xBF */ + 0x8609,0x8605,0x860C,0x85FD,0x8819,0x8810,0x8811,0x8817,/* 0xC0-0xC7 */ + 0x8813,0x8816,0x8963,0x8966,0x89B9,0x89F7,0x8B60,0x8B6A,/* 0xC8-0xCF */ + 0x8B5D,0x8B68,0x8B63,0x8B65,0x8B67,0x8B6D,0x8DAE,0x8E86,/* 0xD0-0xD7 */ + 0x8E88,0x8E84,0x8F59,0x8F56,0x8F57,0x8F55,0x8F58,0x8F5A,/* 0xD8-0xDF */ + 0x908D,0x9143,0x9141,0x91B7,0x91B5,0x91B2,0x91B3,0x940B,/* 0xE0-0xE7 */ + 0x9413,0x93FB,0x9420,0x940F,0x9414,0x93FE,0x9415,0x9410,/* 0xE8-0xEF */ + 0x9428,0x9419,0x940D,0x93F5,0x9400,0x93F7,0x9407,0x940E,/* 0xF0-0xF7 */ + 0x9416,0x9412,0x93FA,0x9409,0x93F8,0x940A,0x93FF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x93FC,0x940C,0x93F6,0x9411,0x9406,0x95DE,0x95E0,0x95DF,/* 0x40-0x47 */ + 0x972E,0x972F,0x97B9,0x97BB,0x97FD,0x97FE,0x9860,0x9862,/* 0x48-0x4F */ + 0x9863,0x985F,0x98C1,0x98C2,0x9950,0x994E,0x9959,0x994C,/* 0x50-0x57 */ + 0x994B,0x9953,0x9A32,0x9A34,0x9A31,0x9A2C,0x9A2A,0x9A36,/* 0x58-0x5F */ + 0x9A29,0x9A2E,0x9A38,0x9A2D,0x9AC7,0x9ACA,0x9AC6,0x9B10,/* 0x60-0x67 */ + 0x9B12,0x9B11,0x9C0B,0x9C08,0x9BF7,0x9C05,0x9C12,0x9BF8,/* 0x68-0x6F */ + 0x9C40,0x9C07,0x9C0E,0x9C06,0x9C17,0x9C14,0x9C09,0x9D9F,/* 0x70-0x77 */ + 0x9D99,0x9DA4,0x9D9D,0x9D92,0x9D98,0x9D90,0x9D9B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9DA0,0x9D94,0x9D9C,0x9DAA,0x9D97,0x9DA1,0x9D9A,/* 0xA0-0xA7 */ + 0x9DA2,0x9DA8,0x9D9E,0x9DA3,0x9DBF,0x9DA9,0x9D96,0x9DA6,/* 0xA8-0xAF */ + 0x9DA7,0x9E99,0x9E9B,0x9E9A,0x9EE5,0x9EE4,0x9EE7,0x9EE6,/* 0xB0-0xB7 */ + 0x9F30,0x9F2E,0x9F5B,0x9F60,0x9F5E,0x9F5D,0x9F59,0x9F91,/* 0xB8-0xBF */ + 0x513A,0x5139,0x5298,0x5297,0x56C3,0x56BD,0x56BE,0x5B48,/* 0xC0-0xC7 */ + 0x5B47,0x5DCB,0x5DCF,0x5EF1,0x61FD,0x651B,0x6B02,0x6AFC,/* 0xC8-0xCF */ + 0x6B03,0x6AF8,0x6B00,0x7043,0x7044,0x704A,0x7048,0x7049,/* 0xD0-0xD7 */ + 0x7045,0x7046,0x721D,0x721A,0x7219,0x737E,0x7517,0x766A,/* 0xD8-0xDF */ + 0x77D0,0x792D,0x7931,0x792F,0x7C54,0x7C53,0x7CF2,0x7E8A,/* 0xE0-0xE7 */ + 0x7E87,0x7E88,0x7E8B,0x7E86,0x7E8D,0x7F4D,0x7FBB,0x8030,/* 0xE8-0xEF */ + 0x81DD,0x8618,0x862A,0x8626,0x861F,0x8623,0x861C,0x8619,/* 0xF0-0xF7 */ + 0x8627,0x862E,0x8621,0x8620,0x8629,0x861E,0x8625,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8829,0x881D,0x881B,0x8820,0x8824,0x881C,0x882B,0x884A,/* 0x40-0x47 */ + 0x896D,0x8969,0x896E,0x896B,0x89FA,0x8B79,0x8B78,0x8B45,/* 0x48-0x4F */ + 0x8B7A,0x8B7B,0x8D10,0x8D14,0x8DAF,0x8E8E,0x8E8C,0x8F5E,/* 0x50-0x57 */ + 0x8F5B,0x8F5D,0x9146,0x9144,0x9145,0x91B9,0x943F,0x943B,/* 0x58-0x5F */ + 0x9436,0x9429,0x943D,0x943C,0x9430,0x9439,0x942A,0x9437,/* 0x60-0x67 */ + 0x942C,0x9440,0x9431,0x95E5,0x95E4,0x95E3,0x9735,0x973A,/* 0x68-0x6F */ + 0x97BF,0x97E1,0x9864,0x98C9,0x98C6,0x98C0,0x9958,0x9956,/* 0x70-0x77 */ + 0x9A39,0x9A3D,0x9A46,0x9A44,0x9A42,0x9A41,0x9A3A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9A3F,0x9ACD,0x9B15,0x9B17,0x9B18,0x9B16,0x9B3A,/* 0xA0-0xA7 */ + 0x9B52,0x9C2B,0x9C1D,0x9C1C,0x9C2C,0x9C23,0x9C28,0x9C29,/* 0xA8-0xAF */ + 0x9C24,0x9C21,0x9DB7,0x9DB6,0x9DBC,0x9DC1,0x9DC7,0x9DCA,/* 0xB0-0xB7 */ + 0x9DCF,0x9DBE,0x9DC5,0x9DC3,0x9DBB,0x9DB5,0x9DCE,0x9DB9,/* 0xB8-0xBF */ + 0x9DBA,0x9DAC,0x9DC8,0x9DB1,0x9DAD,0x9DCC,0x9DB3,0x9DCD,/* 0xC0-0xC7 */ + 0x9DB2,0x9E7A,0x9E9C,0x9EEB,0x9EEE,0x9EED,0x9F1B,0x9F18,/* 0xC8-0xCF */ + 0x9F1A,0x9F31,0x9F4E,0x9F65,0x9F64,0x9F92,0x4EB9,0x56C6,/* 0xD0-0xD7 */ + 0x56C5,0x56CB,0x5971,0x5B4B,0x5B4C,0x5DD5,0x5DD1,0x5EF2,/* 0xD8-0xDF */ + 0x6521,0x6520,0x6526,0x6522,0x6B0B,0x6B08,0x6B09,0x6C0D,/* 0xE0-0xE7 */ + 0x7055,0x7056,0x7057,0x7052,0x721E,0x721F,0x72A9,0x737F,/* 0xE8-0xEF */ + 0x74D8,0x74D5,0x74D9,0x74D7,0x766D,0x76AD,0x7935,0x79B4,/* 0xF0-0xF7 */ + 0x7A70,0x7A71,0x7C57,0x7C5C,0x7C59,0x7C5B,0x7C5A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7CF4,0x7CF1,0x7E91,0x7F4F,0x7F87,0x81DE,0x826B,0x8634,/* 0x40-0x47 */ + 0x8635,0x8633,0x862C,0x8632,0x8636,0x882C,0x8828,0x8826,/* 0x48-0x4F */ + 0x882A,0x8825,0x8971,0x89BF,0x89BE,0x89FB,0x8B7E,0x8B84,/* 0x50-0x57 */ + 0x8B82,0x8B86,0x8B85,0x8B7F,0x8D15,0x8E95,0x8E94,0x8E9A,/* 0x58-0x5F */ + 0x8E92,0x8E90,0x8E96,0x8E97,0x8F60,0x8F62,0x9147,0x944C,/* 0x60-0x67 */ + 0x9450,0x944A,0x944B,0x944F,0x9447,0x9445,0x9448,0x9449,/* 0x68-0x6F */ + 0x9446,0x973F,0x97E3,0x986A,0x9869,0x98CB,0x9954,0x995B,/* 0x70-0x77 */ + 0x9A4E,0x9A53,0x9A54,0x9A4C,0x9A4F,0x9A48,0x9A4A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9A49,0x9A52,0x9A50,0x9AD0,0x9B19,0x9B2B,0x9B3B,/* 0xA0-0xA7 */ + 0x9B56,0x9B55,0x9C46,0x9C48,0x9C3F,0x9C44,0x9C39,0x9C33,/* 0xA8-0xAF */ + 0x9C41,0x9C3C,0x9C37,0x9C34,0x9C32,0x9C3D,0x9C36,0x9DDB,/* 0xB0-0xB7 */ + 0x9DD2,0x9DDE,0x9DDA,0x9DCB,0x9DD0,0x9DDC,0x9DD1,0x9DDF,/* 0xB8-0xBF */ + 0x9DE9,0x9DD9,0x9DD8,0x9DD6,0x9DF5,0x9DD5,0x9DDD,0x9EB6,/* 0xC0-0xC7 */ + 0x9EF0,0x9F35,0x9F33,0x9F32,0x9F42,0x9F6B,0x9F95,0x9FA2,/* 0xC8-0xCF */ + 0x513D,0x5299,0x58E8,0x58E7,0x5972,0x5B4D,0x5DD8,0x882F,/* 0xD0-0xD7 */ + 0x5F4F,0x6201,0x6203,0x6204,0x6529,0x6525,0x6596,0x66EB,/* 0xD8-0xDF */ + 0x6B11,0x6B12,0x6B0F,0x6BCA,0x705B,0x705A,0x7222,0x7382,/* 0xE0-0xE7 */ + 0x7381,0x7383,0x7670,0x77D4,0x7C67,0x7C66,0x7E95,0x826C,/* 0xE8-0xEF */ + 0x863A,0x8640,0x8639,0x863C,0x8631,0x863B,0x863E,0x8830,/* 0xF0-0xF7 */ + 0x8832,0x882E,0x8833,0x8976,0x8974,0x8973,0x89FE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B8C,0x8B8E,0x8B8B,0x8B88,0x8C45,0x8D19,0x8E98,0x8F64,/* 0x40-0x47 */ + 0x8F63,0x91BC,0x9462,0x9455,0x945D,0x9457,0x945E,0x97C4,/* 0x48-0x4F */ + 0x97C5,0x9800,0x9A56,0x9A59,0x9B1E,0x9B1F,0x9B20,0x9C52,/* 0x50-0x57 */ + 0x9C58,0x9C50,0x9C4A,0x9C4D,0x9C4B,0x9C55,0x9C59,0x9C4C,/* 0x58-0x5F */ + 0x9C4E,0x9DFB,0x9DF7,0x9DEF,0x9DE3,0x9DEB,0x9DF8,0x9DE4,/* 0x60-0x67 */ + 0x9DF6,0x9DE1,0x9DEE,0x9DE6,0x9DF2,0x9DF0,0x9DE2,0x9DEC,/* 0x68-0x6F */ + 0x9DF4,0x9DF3,0x9DE8,0x9DED,0x9EC2,0x9ED0,0x9EF2,0x9EF3,/* 0x70-0x77 */ + 0x9F06,0x9F1C,0x9F38,0x9F37,0x9F36,0x9F43,0x9F4F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9F71,0x9F70,0x9F6E,0x9F6F,0x56D3,0x56CD,0x5B4E,/* 0xA0-0xA7 */ + 0x5C6D,0x652D,0x66ED,0x66EE,0x6B13,0x705F,0x7061,0x705D,/* 0xA8-0xAF */ + 0x7060,0x7223,0x74DB,0x74E5,0x77D5,0x7938,0x79B7,0x79B6,/* 0xB0-0xB7 */ + 0x7C6A,0x7E97,0x7F89,0x826D,0x8643,0x8838,0x8837,0x8835,/* 0xB8-0xBF */ + 0x884B,0x8B94,0x8B95,0x8E9E,0x8E9F,0x8EA0,0x8E9D,0x91BE,/* 0xC0-0xC7 */ + 0x91BD,0x91C2,0x946B,0x9468,0x9469,0x96E5,0x9746,0x9743,/* 0xC8-0xCF */ + 0x9747,0x97C7,0x97E5,0x9A5E,0x9AD5,0x9B59,0x9C63,0x9C67,/* 0xD0-0xD7 */ + 0x9C66,0x9C62,0x9C5E,0x9C60,0x9E02,0x9DFE,0x9E07,0x9E03,/* 0xD8-0xDF */ + 0x9E06,0x9E05,0x9E00,0x9E01,0x9E09,0x9DFF,0x9DFD,0x9E04,/* 0xE0-0xE7 */ + 0x9EA0,0x9F1E,0x9F46,0x9F74,0x9F75,0x9F76,0x56D4,0x652E,/* 0xE8-0xEF */ + 0x65B8,0x6B18,0x6B19,0x6B17,0x6B1A,0x7062,0x7226,0x72AA,/* 0xF0-0xF7 */ + 0x77D8,0x77D9,0x7939,0x7C69,0x7C6B,0x7CF6,0x7E9A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E98,0x7E9B,0x7E99,0x81E0,0x81E1,0x8646,0x8647,0x8648,/* 0x40-0x47 */ + 0x8979,0x897A,0x897C,0x897B,0x89FF,0x8B98,0x8B99,0x8EA5,/* 0x48-0x4F */ + 0x8EA4,0x8EA3,0x946E,0x946D,0x946F,0x9471,0x9473,0x9749,/* 0x50-0x57 */ + 0x9872,0x995F,0x9C68,0x9C6E,0x9C6D,0x9E0B,0x9E0D,0x9E10,/* 0x58-0x5F */ + 0x9E0F,0x9E12,0x9E11,0x9EA1,0x9EF5,0x9F09,0x9F47,0x9F78,/* 0x60-0x67 */ + 0x9F7B,0x9F7A,0x9F79,0x571E,0x7066,0x7C6F,0x883C,0x8DB2,/* 0x68-0x6F */ + 0x8EA6,0x91C3,0x9474,0x9478,0x9476,0x9475,0x9A60,0x9C74,/* 0x70-0x77 */ + 0x9C73,0x9C71,0x9C75,0x9E14,0x9E13,0x9EF6,0x9F0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9FA4,0x7068,0x7065,0x7CF7,0x866A,0x883E,0x883D,/* 0xA0-0xA7 */ + 0x883F,0x8B9E,0x8C9C,0x8EA9,0x8EC9,0x974B,0x9873,0x9874,/* 0xA8-0xAF */ + 0x98CC,0x9961,0x99AB,0x9A64,0x9A66,0x9A67,0x9B24,0x9E15,/* 0xB0-0xB7 */ + 0x9E17,0x9F48,0x6207,0x6B1E,0x7227,0x864C,0x8EA8,0x9482,/* 0xB8-0xBF */ + 0x9480,0x9481,0x9A69,0x9A68,0x9B2E,0x9E19,0x7229,0x864B,/* 0xC0-0xC7 */ + 0x8B9F,0x9483,0x9C79,0x9EB7,0x7675,0x9A6B,0x9C7A,0x9E1D,/* 0xC8-0xCF */ + 0x7069,0x706A,0x9EA4,0x9F7E,0x9F49,0x9F98,0x7881,0x92B9,/* 0xD0-0xD7 */ + 0x88CF,0x58BB,0x6052,0x7CA7,0x5AFA,0x2554,0x2566,0x2557,/* 0xD8-0xDF */ + 0x2560,0x256C,0x2563,0x255A,0x2569,0x255D,0x2552,0x2564,/* 0xE0-0xE7 */ + 0x2555,0x255E,0x256A,0x2561,0x2558,0x2567,0x255B,0x2553,/* 0xE8-0xEF */ + 0x2565,0x2556,0x255F,0x256B,0x2562,0x2559,0x2568,0x255C,/* 0xF0-0xF7 */ + 0x2551,0x2550,0x256D,0x256E,0x2570,0x256F,0x2593,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, NULL, + NULL, c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA3, 0xBE, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA1, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA3, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA3, 0x44, 0xA3, 0x45, 0xA3, 0x46, /* 0x90-0x93 */ + 0xA3, 0x47, 0xA3, 0x48, 0xA3, 0x49, 0xA3, 0x4A, /* 0x94-0x97 */ + 0xA3, 0x4B, 0xA3, 0x4C, 0xA3, 0x4D, 0xA3, 0x4E, /* 0x98-0x9B */ + 0xA3, 0x4F, 0xA3, 0x50, 0xA3, 0x51, 0xA3, 0x52, /* 0x9C-0x9F */ + 0xA3, 0x53, 0xA3, 0x54, 0x00, 0x00, 0xA3, 0x55, /* 0xA0-0xA3 */ + 0xA3, 0x56, 0xA3, 0x57, 0xA3, 0x58, 0xA3, 0x59, /* 0xA4-0xA7 */ + 0xA3, 0x5A, 0xA3, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA3, 0x5C, 0xA3, 0x5D, 0xA3, 0x5E, /* 0xB0-0xB3 */ + 0xA3, 0x5F, 0xA3, 0x60, 0xA3, 0x61, 0xA3, 0x62, /* 0xB4-0xB7 */ + 0xA3, 0x63, 0xA3, 0x64, 0xA3, 0x65, 0xA3, 0x66, /* 0xB8-0xBB */ + 0xA3, 0x67, 0xA3, 0x68, 0xA3, 0x69, 0xA3, 0x6A, /* 0xBC-0xBF */ + 0xA3, 0x6B, 0xA3, 0x6C, 0x00, 0x00, 0xA3, 0x6D, /* 0xC0-0xC3 */ + 0xA3, 0x6E, 0xA3, 0x6F, 0xA3, 0x70, 0xA3, 0x71, /* 0xC4-0xC7 */ + 0xA3, 0x72, 0xA3, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x56, /* 0x10-0x13 */ + 0xA1, 0x58, 0xA2, 0x77, 0xA1, 0xFC, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xA5, 0xA1, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xA7, 0xA1, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0x45, 0x00, 0x00, /* 0x20-0x23 */ + 0xA3, 0xBB, 0xA1, 0x4C, 0xA1, 0x4B, 0xA1, 0x45, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xAC, 0xA1, 0xB2, /* 0x30-0x33 */ + 0x00, 0x00, 0xA1, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB0, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC3, 0x00, 0x00, /* 0x3C-0x3F */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0x4A, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA2, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x60-0x63 */ + 0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x64-0x67 */ + 0xA2, 0xC1, 0xA2, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xF6, 0xA1, 0xF4, 0xA1, 0xF7, 0xA1, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF9, /* 0x94-0x97 */ + 0xA1, 0xFB, 0xA1, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_22[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA2, 0x41, 0xA2, 0x42, 0x00, 0x00, /* 0x14-0x17 */ + 0xA2, 0x58, 0x00, 0x00, 0xA1, 0xD4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDB, 0xA1, 0xE8, /* 0x1C-0x1F */ + 0xA1, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xFD, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, 0xA1, 0xEC, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDC, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xDA, 0xA1, 0xDD, 0x00, 0x00, 0xA1, 0xDD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xD9, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA1, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA1, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE9, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x5B, /* 0x04-0x07 */ +}; + +static const unsigned char u2c_25[512] = { + 0xA2, 0x77, 0x00, 0x00, 0xA2, 0x78, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xA2, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA2, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xA2, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA2, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA2, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xA2, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xA2, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA2, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA2, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xF9, 0xF9, 0xF8, 0xF9, 0xE6, 0xF9, 0xEF, /* 0x50-0x53 */ + 0xF9, 0xDD, 0xF9, 0xE8, 0xF9, 0xF1, 0xF9, 0xDF, /* 0x54-0x57 */ + 0xF9, 0xEC, 0xF9, 0xF5, 0xF9, 0xE3, 0xF9, 0xEE, /* 0x58-0x5B */ + 0xF9, 0xF7, 0xF9, 0xE5, 0xF9, 0xE9, 0xF9, 0xF2, /* 0x5C-0x5F */ + 0xF9, 0xE0, 0xF9, 0xEB, 0xF9, 0xF4, 0xF9, 0xE2, /* 0x60-0x63 */ + 0xF9, 0xE7, 0xF9, 0xF0, 0xF9, 0xDE, 0xF9, 0xED, /* 0x64-0x67 */ + 0xF9, 0xF6, 0xF9, 0xE4, 0xF9, 0xEA, 0xF9, 0xF3, /* 0x68-0x6B */ + 0xF9, 0xE1, 0xA2, 0x7E, 0xA2, 0xA1, 0xA2, 0xA3, /* 0x6C-0x6F */ + 0xA2, 0xA2, 0xA2, 0xAC, 0xA2, 0xAD, 0xA2, 0xAE, /* 0x70-0x73 */ + 0xA1, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA2, 0x62, 0xA2, 0x63, 0xA2, 0x64, /* 0x80-0x83 */ + 0xA2, 0x65, 0xA2, 0x66, 0xA2, 0x67, 0xA2, 0x68, /* 0x84-0x87 */ + 0xA2, 0x69, 0xA2, 0x70, 0xA2, 0x6F, 0xA2, 0x6E, /* 0x88-0x8B */ + 0xA2, 0x6D, 0xA2, 0x6C, 0xA2, 0x6B, 0xA2, 0x6A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFE, /* 0x90-0x93 */ + 0xA2, 0x76, 0xA2, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xBD, 0xA1, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB6, 0xA1, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA1, 0xBF, 0xA1, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xBB, 0xA1, 0xBA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB3, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB7, 0xA1, 0xB4, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA8, 0xA2, 0xA9, /* 0xE0-0xE3 */ + 0xA2, 0xAB, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xB9, 0xA1, 0xB8, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA1, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xF0, 0xA1, 0xF2, 0xA1, 0xF1, 0x00, 0x00, /* 0x40-0x43 */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0x40, 0xA1, 0x42, 0xA1, 0x43, 0xA1, 0xB2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0x71, 0xA1, 0x72, 0xA1, 0x6D, 0xA1, 0x6E, /* 0x08-0x0B */ + 0xA1, 0x75, 0xA1, 0x76, 0xA1, 0x79, 0xA1, 0x7A, /* 0x0C-0x0F */ + 0xA1, 0x69, 0xA1, 0x6A, 0xA2, 0x45, 0x00, 0x00, /* 0x10-0x13 */ + 0xA1, 0x65, 0xA1, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xE3, 0xA1, 0xA9, 0xA1, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA2, 0xC3, 0xA2, 0xC4, 0xA2, 0xC5, /* 0x20-0x23 */ + 0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, 0xA2, 0xC9, /* 0x24-0x27 */ + 0xA2, 0xCA, 0xA2, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA3, 0x74, 0xA3, 0x75, 0xA3, 0x76, /* 0x04-0x07 */ + 0xA3, 0x77, 0xA3, 0x78, 0xA3, 0x79, 0xA3, 0x7A, /* 0x08-0x0B */ + 0xA3, 0x7B, 0xA3, 0x7C, 0xA3, 0x7D, 0xA3, 0x7E, /* 0x0C-0x0F */ + 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, 0xA3, 0xA4, /* 0x10-0x13 */ + 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, 0xA3, 0xA8, /* 0x14-0x17 */ + 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, 0xA3, 0xAC, /* 0x18-0x1B */ + 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, 0xA3, 0xB0, /* 0x1C-0x1F */ + 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, 0xA3, 0xB4, /* 0x20-0x23 */ + 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, 0xA3, 0xB8, /* 0x24-0x27 */ + 0xA3, 0xB9, 0xA3, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x40, 0xA4, 0x47, /* 0x90-0x93 */ + 0xA4, 0x54, 0xA5, 0x7C, 0xA4, 0x57, 0xA4, 0xA4, /* 0x94-0x97 */ + 0xA4, 0x55, 0xA5, 0xD2, 0xA4, 0x41, 0xA4, 0xFE, /* 0x98-0x9B */ + 0xA4, 0x42, 0xA4, 0xD1, 0xA6, 0x61, 0xA4, 0x48, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0x40, 0xA4, 0x47, 0xA4, 0x54, 0xA5, 0x7C, /* 0x20-0x23 */ + 0xA4, 0xAD, 0xA4, 0xBB, 0xA4, 0x43, 0xA4, 0x4B, /* 0x24-0x27 */ + 0xA4, 0x45, 0xA4, 0x51, 0xA4, 0xEB, 0xA4, 0xF5, /* 0x28-0x2B */ + 0xA4, 0xF4, 0xA4, 0xEC, 0xAA, 0xF7, 0xA4, 0x67, /* 0x2C-0x2F */ + 0xA4, 0xE9, 0xAE, 0xE8, 0xA6, 0xB3, 0xAA, 0xC0, /* 0x30-0x33 */ + 0xA6, 0x57, 0xAF, 0x53, 0xB0, 0x5D, 0xAF, 0xAC, /* 0x34-0x37 */ + 0xB3, 0xD2, 0xA5, 0x4E, 0xA9, 0x49, 0xBE, 0xC7, /* 0x38-0x3B */ + 0xBA, 0xCA, 0xA5, 0xF8, 0xB8, 0xEA, 0xA8, 0xF3, /* 0x3C-0x3F */ + 0xB2, 0xBD, 0xA5, 0xF0, 0xA6, 0xDB, 0xA6, 0xDC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xA4, 0x40, 0xA4, 0x47, 0xA4, 0x54, 0xA5, 0x7C, /* 0x80-0x83 */ + 0xA4, 0xAD, 0xA4, 0xBB, 0xA4, 0x43, 0xA4, 0x4B, /* 0x84-0x87 */ + 0xA4, 0x45, 0xA4, 0x51, 0xA4, 0xEB, 0xA4, 0xF5, /* 0x88-0x8B */ + 0xA4, 0xF4, 0xA4, 0xEC, 0xAA, 0xF7, 0xA4, 0x67, /* 0x8C-0x8F */ + 0xA4, 0xE9, 0xAE, 0xE8, 0xA6, 0xB3, 0xAA, 0xC0, /* 0x90-0x93 */ + 0xA6, 0x57, 0xAF, 0x53, 0xB0, 0x5D, 0xAF, 0xAC, /* 0x94-0x97 */ + 0xB3, 0xD2, 0xAF, 0xB5, 0xA8, 0x6B, 0xA4, 0x6B, /* 0x98-0x9B */ + 0xBE, 0x41, 0xC0, 0x75, 0xA6, 0x4C, 0xAA, 0x60, /* 0x9C-0x9F */ + 0xB6, 0xB5, 0xA5, 0xF0, 0xBC, 0x67, 0xA1, 0xC0, /* 0xA0-0xA3 */ + 0xA4, 0x57, 0xA4, 0xA4, 0xA4, 0x55, 0xA5, 0xAA, /* 0xA4-0xA7 */ + 0xA5, 0x6B, 0xC2, 0xE5, 0xA9, 0x76, 0xBE, 0xC7, /* 0xA8-0xAB */ + 0xBA, 0xCA, 0xA5, 0xF8, 0xB8, 0xEA, 0xA8, 0xF3, /* 0xAC-0xAF */ + 0xA9, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0x55, 0xA2, 0x56, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA2, 0x50, 0xA2, 0x51, 0xA2, 0x52, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA2, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA2, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0x53, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xA1, 0xEB, 0xA1, 0xEA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xA2, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_4E[512] = { + 0xA4, 0x40, 0xA4, 0x42, 0x00, 0x00, 0xA4, 0x43, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x45, /* 0x04-0x07 */ + 0xA4, 0x56, 0xA4, 0x54, 0xA4, 0x57, 0xA4, 0x55, /* 0x08-0x0B */ + 0xC9, 0x46, 0xA4, 0xA3, 0xC9, 0x4F, 0xC9, 0x4D, /* 0x0C-0x0F */ + 0xA4, 0xA2, 0xA4, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xA5, 0x42, 0xA5, 0x41, 0xA5, 0x40, 0x00, 0x00, /* 0x14-0x17 */ + 0xA5, 0x43, 0xA4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xE0, 0xA5, 0xE1, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0x58, /* 0x28-0x2B */ + 0x00, 0x00, 0xA4, 0xA4, 0xC9, 0x50, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA4, 0xA5, 0xC9, 0x63, 0xA6, 0xEA, 0xCB, 0xB1, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xA4, 0x59, 0xA4, 0xA6, 0x00, 0x00, 0xA5, 0x44, /* 0x38-0x3B */ + 0xC9, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0x40, 0xA4, 0x44, /* 0x40-0x43 */ + 0x00, 0x00, 0xA4, 0x5B, 0x00, 0x00, 0xC9, 0x47, /* 0x44-0x47 */ + 0xA4, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xA7, /* 0x48-0x4B */ + 0x00, 0x00, 0xA5, 0x45, 0xA5, 0x47, 0xA5, 0x46, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xE2, 0xA5, 0xE3, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC4, 0x00, 0x00, /* 0x54-0x57 */ + 0xAD, 0xBC, 0xA4, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xC9, 0x41, 0xA4, 0x45, 0xA4, 0x5E, 0xA4, 0x5D, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xA5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC5, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xAE, 0xD4, 0x4B, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xC3, 0xDC, 0xB1, /* 0x80-0x83 */ + 0xDC, 0xB2, 0x00, 0x00, 0xA4, 0x46, 0x00, 0x00, /* 0x84-0x87 */ + 0xA4, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC6, /* 0x88-0x8B */ + 0xA4, 0x47, 0xC9, 0x48, 0xA4, 0x5F, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA4, 0xAA, 0xA4, 0xAC, 0xC9, 0x51, /* 0x90-0x93 */ + 0xA4, 0xAD, 0xA4, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA5, 0xE5, 0x00, 0x00, 0xA8, 0xC7, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC8, 0xAB, 0x45, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA4, 0x60, 0xA4, 0xAE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA5, 0xE6, 0xA5, 0xE8, 0xA5, 0xE7, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC9, /* 0xA8-0xAB */ + 0xA8, 0xCA, 0xAB, 0x46, 0xAB, 0x47, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xBD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF6, 0xD6, 0xA4, 0x48, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xA4, 0xB0, 0xA4, 0xAF, 0xC9, 0x52, 0xA4, 0xB1, /* 0xC0-0xC3 */ + 0xA4, 0xB7, 0x00, 0x00, 0xA4, 0xB2, 0xA4, 0xB3, /* 0xC4-0xC7 */ + 0xC9, 0x54, 0xC9, 0x53, 0xA4, 0xB5, 0xA4, 0xB6, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA4, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xA5, 0x4A, 0xA5, 0x4B, 0xA5, 0x4C, 0xA5, 0x4D, /* 0xD4-0xD7 */ + 0xA5, 0x49, 0xA5, 0x50, 0xC9, 0x6A, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC9, 0x66, 0xC9, 0x69, 0xA5, 0x51, 0xA5, 0x61, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC9, 0x68, 0x00, 0x00, 0xA5, 0x4E, /* 0xE0-0xE3 */ + 0xA5, 0x4F, 0xA5, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xC9, 0x65, 0xC9, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA5, 0xF5, 0xC9, 0xB0, 0xA5, 0xF2, 0xA5, 0xF6, /* 0xF0-0xF3 */ + 0xC9, 0xBA, 0xC9, 0xAE, 0xA5, 0xF3, 0xC9, 0xB2, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA5, 0xF4, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA5, 0xF7, 0x00, 0x00, 0xA5, 0xE9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0xC9, 0xB1, 0xA5, 0xF8, 0xC9, 0xB5, 0x00, 0x00, /* 0x00-0x03 */ + 0xC9, 0xB9, 0xC9, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xC9, 0xB3, 0xA5, 0xEA, 0xA5, 0xEC, 0xA5, 0xF9, /* 0x08-0x0B */ + 0x00, 0x00, 0xA5, 0xEE, 0xC9, 0xAB, 0xA5, 0xF1, /* 0x0C-0x0F */ + 0xA5, 0xEF, 0xA5, 0xF0, 0xC9, 0xBB, 0xC9, 0xB8, /* 0x10-0x13 */ + 0xC9, 0xAF, 0xA5, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xC9, 0xAC, 0xA5, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xC9, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xB7, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xC9, 0xAD, 0xCA, 0x66, 0x00, 0x00, 0xA7, 0x42, /* 0x2C-0x2F */ + 0xA6, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xCA, 0x67, /* 0x30-0x33 */ + 0xA6, 0xF1, 0x00, 0x00, 0xA7, 0x44, 0x00, 0x00, /* 0x34-0x37 */ + 0xA6, 0xF9, 0x00, 0x00, 0xA6, 0xF8, 0xCA, 0x5B, /* 0x38-0x3B */ + 0xA6, 0xFC, 0xA6, 0xF7, 0xCA, 0x60, 0xCA, 0x68, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCA, 0x64, 0x00, 0x00, 0xA6, 0xFA, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xFD, 0xA6, 0xEE, /* 0x44-0x47 */ + 0xA7, 0x47, 0xCA, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCB, 0xBD, 0xA6, 0xEC, 0xA7, 0x43, 0xA6, 0xED, /* 0x4C-0x4F */ + 0xA6, 0xF5, 0xA6, 0xF6, 0xCA, 0x62, 0xCA, 0x5E, /* 0x50-0x53 */ + 0xA6, 0xFB, 0xA6, 0xF3, 0xCA, 0x5A, 0xA6, 0xEF, /* 0x54-0x57 */ + 0xCA, 0x65, 0xA7, 0x45, 0xA7, 0x48, 0xA6, 0xF2, /* 0x58-0x5B */ + 0xA7, 0x40, 0xA7, 0x46, 0xA6, 0xF0, 0xCA, 0x63, /* 0x5C-0x5F */ + 0xA7, 0x41, 0xCA, 0x69, 0xCA, 0x5C, 0xA6, 0xFE, /* 0x60-0x63 */ + 0xCA, 0x5F, 0x00, 0x00, 0x00, 0x00, 0xCA, 0x61, /* 0x64-0x67 */ + 0x00, 0x00, 0xA8, 0xD8, 0xCB, 0xBF, 0xCB, 0xCB, /* 0x68-0x6B */ + 0xA8, 0xD0, 0x00, 0x00, 0xCB, 0xCC, 0xA8, 0xCB, /* 0x6C-0x6F */ + 0xA8, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xCE, /* 0x70-0x73 */ + 0xCB, 0xB9, 0xA8, 0xD6, 0xCB, 0xB8, 0xCB, 0xBC, /* 0x74-0x77 */ + 0xCB, 0xC3, 0xCB, 0xC1, 0xA8, 0xDE, 0xA8, 0xD9, /* 0x78-0x7B */ + 0xCB, 0xB3, 0xCB, 0xB5, 0xA8, 0xDB, 0xA8, 0xCF, /* 0x7C-0x7F */ + + 0xCB, 0xB6, 0xCB, 0xC2, 0xCB, 0xC9, 0xA8, 0xD4, /* 0x80-0x83 */ + 0xCB, 0xBB, 0xCB, 0xB4, 0xA8, 0xD3, 0xCB, 0xB7, /* 0x84-0x87 */ + 0xA8, 0xD7, 0xCB, 0xBA, 0x00, 0x00, 0xA8, 0xD2, /* 0x88-0x8B */ + 0x00, 0x00, 0xA8, 0xCD, 0x00, 0x00, 0xA8, 0xDC, /* 0x8C-0x8F */ + 0xCB, 0xC4, 0xA8, 0xDD, 0xCB, 0xC8, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0xC6, 0xCB, 0xCA, 0xA8, 0xDA, 0xCB, 0xBE, /* 0x94-0x97 */ + 0xCB, 0xB2, 0x00, 0x00, 0xCB, 0xC0, 0xA8, 0xD1, /* 0x98-0x9B */ + 0xCB, 0xC5, 0xA8, 0xCC, 0xCB, 0xC7, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x56, 0xAB, 0x4A, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE0, 0xCD, 0xE8, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xAB, 0x49, 0xAB, 0x51, 0xAB, 0x5D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xCD, 0xEE, 0xCD, 0xEC, 0xCD, 0xE7, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x4B, /* 0xBC-0xBF */ + 0xCD, 0xED, 0xCD, 0xE3, 0xAB, 0x59, 0xAB, 0x50, /* 0xC0-0xC3 */ + 0xAB, 0x58, 0xCD, 0xDE, 0x00, 0x00, 0xCD, 0xEA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCD, 0xE1, 0xAB, 0x54, 0xCD, 0xE2, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xDD, 0xAB, 0x5B, 0xAB, 0x4E, /* 0xCC-0xCF */ + 0xAB, 0x57, 0xAB, 0x4D, 0x00, 0x00, 0xCD, 0xDF, /* 0xD0-0xD3 */ + 0xCD, 0xE4, 0x00, 0x00, 0xCD, 0xEB, 0xAB, 0x55, /* 0xD4-0xD7 */ + 0xAB, 0x52, 0xCD, 0xE6, 0xAB, 0x5A, 0xCD, 0xE9, /* 0xD8-0xDB */ + 0xCD, 0xE5, 0xAB, 0x4F, 0xAB, 0x5C, 0xAB, 0x53, /* 0xDC-0xDF */ + 0xAB, 0x4C, 0xAB, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCD, 0xEF, 0x00, 0x00, 0xAD, 0xD7, 0xAD, 0xC1, /* 0xEC-0xEF */ + 0x00, 0x00, 0xAD, 0xD1, 0x00, 0x00, 0xAD, 0xD6, /* 0xF0-0xF3 */ + 0xD0, 0xD0, 0xD0, 0xCF, 0xD0, 0xD4, 0xD0, 0xD5, /* 0xF4-0xF7 */ + 0xAD, 0xC4, 0x00, 0x00, 0xAD, 0xCD, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xDA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0xAD, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xD0, 0xC9, 0xAD, 0xC7, 0xD0, 0xCA, /* 0x04-0x07 */ + 0x00, 0x00, 0xAD, 0xDC, 0x00, 0x00, 0xAD, 0xD3, /* 0x08-0x0B */ + 0xAD, 0xBE, 0xAD, 0xBF, 0xD0, 0xDD, 0xB0, 0xBF, /* 0x0C-0x0F */ + 0x00, 0x00, 0xAD, 0xCC, 0xAD, 0xCB, 0xD0, 0xCB, /* 0x10-0x13 */ + 0xAD, 0xCF, 0xD4, 0x5B, 0xAD, 0xC6, 0xD0, 0xD6, /* 0x14-0x17 */ + 0xAD, 0xD5, 0xAD, 0xD4, 0xAD, 0xCA, 0xD0, 0xCE, /* 0x18-0x1B */ + 0xD0, 0xD7, 0x00, 0x00, 0xD0, 0xC8, 0xAD, 0xC9, /* 0x1C-0x1F */ + 0xD0, 0xD8, 0xAD, 0xD2, 0xD0, 0xCC, 0xAD, 0xC0, /* 0x20-0x23 */ + 0x00, 0x00, 0xAD, 0xC3, 0xAD, 0xC2, 0xD0, 0xD9, /* 0x24-0x27 */ + 0xAD, 0xD0, 0xAD, 0xC5, 0xAD, 0xD9, 0xAD, 0xDB, /* 0x28-0x2B */ + 0xD0, 0xD3, 0xAD, 0xD8, 0x00, 0x00, 0xD0, 0xDB, /* 0x2C-0x2F */ + 0xD0, 0xCD, 0xD0, 0xDC, 0x00, 0x00, 0xD0, 0xD1, /* 0x30-0x33 */ + 0x00, 0x00, 0xD0, 0xDA, 0x00, 0x00, 0xD0, 0xD2, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xAD, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD4, 0x63, 0xD4, 0x57, 0x00, 0x00, 0xB0, 0xB3, /* 0x40-0x43 */ + 0x00, 0x00, 0xD4, 0x5C, 0xD4, 0x62, 0xB0, 0xB2, /* 0x44-0x47 */ + 0xD4, 0x55, 0xB0, 0xB6, 0xD4, 0x59, 0xD4, 0x52, /* 0x48-0x4B */ + 0xB0, 0xB4, 0xD4, 0x56, 0xB0, 0xB9, 0xB0, 0xBE, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD4, 0x67, 0x00, 0x00, 0xD4, 0x51, /* 0x50-0x53 */ + 0x00, 0x00, 0xB0, 0xBA, 0x00, 0x00, 0xD4, 0x66, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xB5, 0xD4, 0x58, /* 0x58-0x5B */ + 0xB0, 0xB1, 0xD4, 0x53, 0xD4, 0x4F, 0xD4, 0x5D, /* 0x5C-0x5F */ + 0xD4, 0x50, 0xD4, 0x4E, 0xD4, 0x5A, 0xD4, 0x60, /* 0x60-0x63 */ + 0xD4, 0x61, 0xB0, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD8, 0x5B, 0xD4, 0x5E, 0xD4, 0x4D, 0xD4, 0x5F, /* 0x68-0x6B */ + 0x00, 0x00, 0xB0, 0xC1, 0xD4, 0x64, 0xB0, 0xC0, /* 0x6C-0x6F */ + 0xD4, 0x4C, 0x00, 0x00, 0xD4, 0x54, 0xD4, 0x65, /* 0x70-0x73 */ + 0xB0, 0xBC, 0xB0, 0xBB, 0xB0, 0xB8, 0xB0, 0xBD, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xAF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xB0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xB3, 0xC8, 0x00, 0x00, 0xD8, 0x5E, 0xD8, 0x57, /* 0x80-0x83 */ + 0x00, 0x00, 0xB3, 0xC5, 0x00, 0x00, 0xD8, 0x5F, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x55, /* 0x88-0x8B */ + 0xD8, 0x58, 0xB3, 0xC4, 0xD8, 0x59, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB3, 0xC7, 0xD8, 0x5D, 0x00, 0x00, /* 0x90-0x93 */ + 0xD8, 0x53, 0xD8, 0x52, 0xB3, 0xC9, 0x00, 0x00, /* 0x94-0x97 */ + 0xB3, 0xCA, 0xB3, 0xC6, 0xB3, 0xCB, 0xD8, 0x51, /* 0x98-0x9B */ + 0xD8, 0x5C, 0xD8, 0x5A, 0xD8, 0x54, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xC3, 0xD8, 0x56, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xB6, 0xCA, 0xB6, 0xC4, 0xDC, 0xB7, 0xB6, 0xCD, /* 0xAC-0xAF */ + 0xDC, 0xBD, 0xDC, 0xC0, 0xB6, 0xC6, 0xB6, 0xC7, /* 0xB0-0xB3 */ + 0xDC, 0xBA, 0xB6, 0xC5, 0xDC, 0xC3, 0xB6, 0xCB, /* 0xB4-0xB7 */ + 0xDC, 0xC4, 0x00, 0x00, 0xDC, 0xBF, 0xB6, 0xCC, /* 0xB8-0xBB */ + 0x00, 0x00, 0xDC, 0xB4, 0xB6, 0xC9, 0xDC, 0xB5, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDC, 0xBE, 0xDC, 0xBC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xB8, 0xB6, 0xC8, 0xDC, 0xB6, 0xB6, 0xCE, /* 0xC4-0xC7 */ + 0xDC, 0xBB, 0xDC, 0xC2, 0xDC, 0xB9, 0xDC, 0xC1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xB6, 0xB9, 0xB3, /* 0xCC-0xCF */ + 0x00, 0x00, 0xB9, 0xB4, 0x00, 0x00, 0xE0, 0xF9, /* 0xD0-0xD3 */ + 0xE0, 0xF1, 0xB9, 0xB2, 0xB9, 0xAF, 0xE0, 0xF2, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xB1, 0xE0, 0xF5, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE0, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE0, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFD, /* 0xE0-0xE3 */ + 0xE0, 0xF8, 0xB9, 0xAE, 0xE0, 0xF0, 0xB9, 0xAC, /* 0xE4-0xE7 */ + 0xE0, 0xF3, 0xB9, 0xB7, 0xE0, 0xF6, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0xFA, 0xB9, 0xB0, 0xB9, 0xAD, 0xE0, 0xFC, /* 0xEC-0xEF */ + 0xE0, 0xFB, 0xB9, 0xB5, 0x00, 0x00, 0xE0, 0xF4, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xBB, 0xF8, 0xE4, 0xEC, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE4, 0xE9, 0xBB, 0xF9, 0x00, 0x00, 0xBB, 0xF7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE4, 0xF0, 0xE4, 0xED, 0xE4, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0xBB, 0xF6, 0x00, 0x00, 0xBB, 0xFA, 0xE4, 0xE7, /* 0x00-0x03 */ + 0xBB, 0xF5, 0xBB, 0xFD, 0xE4, 0xEA, 0xE4, 0xEB, /* 0x04-0x07 */ + 0xBB, 0xFB, 0xBB, 0xFC, 0xE4, 0xF1, 0xE4, 0xEE, /* 0x08-0x0B */ + 0xE4, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xBE, 0xAA, 0xE8, 0xF8, 0xBE, 0xA7, 0xE8, 0xF5, /* 0x10-0x13 */ + 0xBE, 0xA9, 0xBE, 0xAB, 0x00, 0x00, 0xE8, 0xF6, /* 0x14-0x17 */ + 0xBE, 0xA8, 0x00, 0x00, 0xE8, 0xF7, 0x00, 0x00, /* 0x18-0x1B */ + 0xE8, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x76, /* 0x1C-0x1F */ + 0xEC, 0xBD, 0xC0, 0x77, 0xEC, 0xBB, 0x00, 0x00, /* 0x20-0x23 */ + 0xEC, 0xBC, 0xEC, 0xBA, 0xEC, 0xB9, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xEC, 0xBE, 0xC0, 0x75, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xB8, 0xEF, 0xB9, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE4, 0xE8, 0xEF, 0xB7, 0xC0, 0x78, 0xC3, 0x5F, /* 0x30-0x33 */ + 0xF1, 0xEB, 0xF1, 0xEC, 0x00, 0x00, 0xC4, 0xD7, /* 0x34-0x37 */ + 0xC4, 0xD8, 0xF5, 0xC1, 0xF5, 0xC0, 0xC5, 0x6C, /* 0x38-0x3B */ + 0xC5, 0x6B, 0xF7, 0xD0, 0x00, 0x00, 0xA4, 0x49, /* 0x3C-0x3F */ + 0xA4, 0x61, 0xA4, 0xB9, 0x00, 0x00, 0xA4, 0xB8, /* 0x40-0x43 */ + 0xA5, 0x53, 0xA5, 0x52, 0xA5, 0xFC, 0xA5, 0xFB, /* 0x44-0x47 */ + 0xA5, 0xFD, 0xA5, 0xFA, 0x00, 0x00, 0xA7, 0x4A, /* 0x48-0x4B */ + 0xA7, 0x49, 0xA7, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xE0, 0x00, 0x00, /* 0x50-0x53 */ + 0xA8, 0xDF, 0xA8, 0xE1, 0x00, 0x00, 0xAB, 0x5E, /* 0x54-0x57 */ + 0x00, 0x00, 0xA2, 0x59, 0xD0, 0xDE, 0xA2, 0x5A, /* 0x58-0x5B */ + 0xB0, 0xC2, 0xA2, 0x5C, 0xA2, 0x5B, 0xD8, 0x60, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA2, 0x5D, 0xB9, 0xB8, 0xA2, 0x5E, /* 0x60-0x63 */ + 0x00, 0x00, 0xA4, 0x4A, 0x00, 0x00, 0xA4, 0xBA, /* 0x64-0x67 */ + 0xA5, 0xFE, 0xA8, 0xE2, 0x00, 0x00, 0xA4, 0x4B, /* 0x68-0x6B */ + 0xA4, 0xBD, 0xA4, 0xBB, 0xA4, 0xBC, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xA6, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xA7, 0x4C, 0xA8, 0xE4, 0xA8, 0xE3, /* 0x74-0x77 */ + 0xA8, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xAD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xBE, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x4E, /* 0x84-0x87 */ + 0x00, 0x00, 0xA5, 0x54, 0xA5, 0x55, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xA6, 0x41, 0x00, 0x00, 0xCA, 0x6A, /* 0x8C-0x8F */ + 0x00, 0x00, 0xAB, 0x60, 0xAB, 0x5F, 0xD0, 0xE0, /* 0x90-0x93 */ + 0xD0, 0xDF, 0xB0, 0xC3, 0x00, 0x00, 0xA4, 0xBE, /* 0x94-0x97 */ + 0xC9, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCD, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAB, 0x61, 0x00, 0x00, 0xAD, 0xE0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xAD, 0xDE, 0xAD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0xAD, 0x00, 0x00, /* 0xA8-0xAB */ + 0xA5, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xA6, 0x42, 0xC9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x4D, 0xA7, 0x4E, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xCA, 0x6B, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xCB, 0xCE, 0xA8, 0xE6, 0xCB, 0xCF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD0, 0xE2, 0xD0, 0xE3, 0xAD, 0xE3, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD0, 0xE4, 0x00, 0x00, 0xD0, 0xE1, 0xAD, 0xE4, /* 0xC8-0xCB */ + 0xAD, 0xE2, 0xAD, 0xE1, 0xD0, 0xE5, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD4, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD8, 0x61, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC5, /* 0xD4-0xD7 */ + 0xE1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xBB, 0xFE, 0xBE, 0xAE, 0xE8, 0xF9, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA4, 0x4C, 0xA4, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xB0, 0xC4, 0xB3, 0xCD, 0x00, 0x00, 0xB9, 0xB9, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC9, 0x42, 0xA4, 0xBF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA5, 0x59, 0xA5, 0x57, 0xA5, 0x58, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA8, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xA4, 0x4D, 0xA4, 0x4E, 0x00, 0x00, 0xA4, 0x62, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xC0, 0xA4, 0xC1, /* 0x04-0x07 */ + 0xA4, 0xC2, 0xC9, 0xBE, 0xA5, 0x5A, 0x00, 0x00, /* 0x08-0x0B */ + 0xC9, 0x6B, 0x00, 0x00, 0xA6, 0x46, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC9, 0xBF, 0xA6, 0x44, 0xA6, 0x45, 0xC9, 0xBD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0x47, 0xA6, 0x43, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xCA, 0x6C, 0xAA, 0xEC, 0xCA, 0x6D, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xA7, 0x50, 0xA7, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xA7, 0x53, 0xA7, 0x51, 0xA7, 0x52, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA8, 0xEC, 0xCB, 0xD4, 0xCB, 0xD1, 0xCB, 0xD2, /* 0x30-0x33 */ + 0x00, 0x00, 0xCB, 0xD0, 0xA8, 0xEE, 0xA8, 0xEA, /* 0x34-0x37 */ + 0xA8, 0xE9, 0x00, 0x00, 0xA8, 0xEB, 0xA8, 0xE8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xA8, 0xEF, 0x00, 0x00, 0xAB, 0x63, /* 0x40-0x43 */ + 0xCD, 0xF0, 0x00, 0x00, 0xCB, 0xD3, 0xAB, 0x68, /* 0x44-0x47 */ + 0x00, 0x00, 0xCD, 0xF1, 0xAB, 0x64, 0xAB, 0x67, /* 0x48-0x4B */ + 0xAB, 0x66, 0xAB, 0x65, 0xAB, 0x62, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE8, 0x00, 0x00, /* 0x50-0x53 */ + 0xAD, 0xE7, 0xD0, 0xEB, 0xAD, 0xE5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE7, 0xAD, 0xE8, /* 0x58-0x5B */ + 0xAD, 0xE6, 0xAD, 0xE9, 0xD0, 0xE9, 0xD0, 0xEA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD0, 0xE6, 0xD0, 0xEC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xB3, 0xD1, 0xB0, 0xC5, 0xD4, 0x69, /* 0x68-0x6B */ + 0xD4, 0x6B, 0xD4, 0x6A, 0xD4, 0x6C, 0xB0, 0xC6, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xCE, 0x00, 0x00, /* 0x70-0x73 */ + 0xB3, 0xCF, 0xB3, 0xD0, 0x00, 0x00, 0xB6, 0xD0, /* 0x74-0x77 */ + 0xDC, 0xC7, 0x00, 0x00, 0xDC, 0xC6, 0xDC, 0xC8, /* 0x78-0x7B */ + 0xDC, 0xC9, 0xB6, 0xD1, 0x00, 0x00, 0xB6, 0xCF, /* 0x7C-0x7F */ + + 0xE1, 0x41, 0xE1, 0x42, 0xB9, 0xBB, 0xB9, 0xBA, /* 0x80-0x83 */ + 0xE3, 0x5A, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x40, /* 0x84-0x87 */ + 0xBC, 0x41, 0xBC, 0x42, 0xBC, 0x44, 0xE4, 0xF2, /* 0x88-0x8B */ + 0xE4, 0xF3, 0xBC, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBE, 0xAF, 0x00, 0x00, 0xBE, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xED, 0xF5, 0xC3, /* 0x94-0x97 */ + 0xF5, 0xC2, 0xF7, 0xD1, 0x00, 0x00, 0xA4, 0x4F, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA5, 0x5C, /* 0x9C-0x9F */ + 0xA5, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x48, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xA7, 0x55, 0xA7, 0x56, 0xA7, 0x54, /* 0xA8-0xAB */ + 0xA7, 0x57, 0xCA, 0x6F, 0xCA, 0x70, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF1, /* 0xB8-0xBB */ + 0xCB, 0xD5, 0x00, 0x00, 0xA8, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCD, 0xF2, 0xAB, 0x6C, 0xCD, 0xF3, 0xAB, 0x6B, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x69, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xAB, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD0, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xC7, 0xD4, 0x6E, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xB0, 0xCA, 0xD4, 0x6D, 0xB1, 0xE5, /* 0xD4-0xD7 */ + 0xB0, 0xC9, 0xB0, 0xC8, 0x00, 0x00, 0xB3, 0xD4, /* 0xD8-0xDB */ + 0x00, 0x00, 0xB3, 0xD3, 0xB3, 0xD2, 0xB6, 0xD2, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xD5, 0xB6, 0xD6, /* 0xE0-0xE3 */ + 0xB6, 0xD4, 0x00, 0x00, 0xB6, 0xD3, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE1, 0x43, 0x00, 0x00, 0xE1, 0x44, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, /* 0xEC-0xEF */ + 0xBC, 0x45, 0xE4, 0xF4, 0x00, 0x00, 0xBE, 0xB1, /* 0xF0-0xF3 */ + 0xEC, 0xBF, 0xC0, 0x79, 0x00, 0x00, 0xF1, 0xEE, /* 0xF4-0xF7 */ + 0xC4, 0x55, 0x00, 0x00, 0xA4, 0x63, 0xA4, 0xC3, /* 0xF8-0xFB */ + 0xC9, 0x56, 0x00, 0x00, 0xA4, 0xC4, 0xA4, 0xC5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA5, 0x5D, 0xA5, 0x5E, 0x00, 0x00, /* 0x04-0x07 */ + 0xA6, 0x49, 0xCA, 0x71, 0xCB, 0xD6, 0xCB, 0xD7, /* 0x08-0x0B */ + 0x00, 0x00, 0xAB, 0x6D, 0xD0, 0xEE, 0xB0, 0xCC, /* 0x0C-0x0F */ + 0xB0, 0xCB, 0xD8, 0x63, 0xD8, 0x62, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA4, 0x50, 0xA4, 0xC6, 0xA5, 0x5F, /* 0x14-0x17 */ + 0x00, 0x00, 0xB0, 0xCD, 0xC9, 0x43, 0x00, 0x00, /* 0x18-0x1B */ + 0xC9, 0x6C, 0xA5, 0x60, 0x00, 0x00, 0xC9, 0xC2, /* 0x1C-0x1F */ + 0xA6, 0x4B, 0xA6, 0x4A, 0xC9, 0xC1, 0xA7, 0x58, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEA, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD4, 0x6F, 0x00, 0x00, 0xB6, 0xD7, /* 0x2C-0x2F */ + 0xE1, 0x45, 0xB9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFD, /* 0x34-0x37 */ + 0x00, 0x00, 0xA4, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xCB, 0xD8, 0xCD, 0xF4, 0xB0, 0xD0, 0xB0, 0xCE, /* 0x3C-0x3F */ + 0xB0, 0xCF, 0xA4, 0x51, 0x00, 0x00, 0xA4, 0x64, /* 0x40-0x43 */ + 0xA2, 0xCD, 0xA4, 0xCA, 0x00, 0x00, 0xA4, 0xC9, /* 0x44-0x47 */ + 0xA4, 0xC8, 0xA5, 0x63, 0xA5, 0x62, 0x00, 0x00, /* 0x48-0x4B */ + 0xC9, 0x6D, 0xC9, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA8, 0xF5, 0xA8, 0xF2, 0xA8, 0xF4, /* 0x50-0x53 */ + 0xA8, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x6E, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xD5, 0x00, 0x00, /* 0x58-0x5B */ + 0xA4, 0x52, 0x00, 0x00, 0xA4, 0xCB, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA5, 0x65, 0xA5, 0x64, 0x00, 0x00, 0xCA, 0x72, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xC9, 0x57, 0x00, 0x00, 0xA5, 0x67, 0xA5, 0x66, /* 0x6C-0x6F */ + 0xA6, 0x4C, 0xA6, 0x4D, 0xCA, 0x73, 0xA7, 0x59, /* 0x70-0x73 */ + 0x00, 0x00, 0xA7, 0x5A, 0x00, 0x00, 0xA8, 0xF7, /* 0x74-0x77 */ + 0xA8, 0xF8, 0xA8, 0xF9, 0x00, 0x00, 0xAB, 0x6F, /* 0x78-0x7B */ + 0xCD, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEB, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xC9, 0x44, 0x00, 0x00, /* 0x80-0x83 */ + 0xA4, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xC4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0x74, 0xCA, 0x75, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD9, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0xDA, 0x00, 0x00, 0xCD, 0xF7, 0xCD, 0xF6, /* 0x94-0x97 */ + 0xCD, 0xF9, 0xCD, 0xF8, 0xAB, 0x70, 0x00, 0x00, /* 0x98-0x9B */ + 0xD4, 0x70, 0xAD, 0xED, 0xD0, 0xEF, 0xAD, 0xEC, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD8, 0x64, 0xB3, 0xD6, 0x00, 0x00, 0xD8, 0x65, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE1, 0x46, 0xB9, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0x46, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF1, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xC9, 0x58, 0x00, 0x00, 0xA5, 0x68, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xA4, 0x53, 0xA4, 0x65, 0xA4, 0xCE, 0xA4, 0xCD, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA4, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xA8, 0xFB, 0x00, 0x00, 0xA8, 0xFA, 0xA8, 0xFC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x71, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE8, 0xFB, 0xC2, 0x4F, 0xA4, 0x66, /* 0xE0-0xE3 */ + 0xA5, 0x6A, 0xA5, 0x79, 0xA5, 0x74, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA5, 0x6F, 0xA5, 0x6E, 0xA5, 0x75, 0xA5, 0x73, /* 0xE8-0xEB */ + 0xA5, 0x6C, 0xA5, 0x7A, 0xA5, 0x6D, 0xA5, 0x69, /* 0xEC-0xEF */ + 0xA5, 0x78, 0xA5, 0x77, 0xA5, 0x76, 0xA5, 0x6B, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xA5, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA5, 0x71, 0x00, 0x00, 0x00, 0x00, 0xA5, 0x7B, /* 0xF8-0xFB */ + 0xA5, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0xA6, 0x53, 0x00, 0x00, 0xA6, 0x59, /* 0x00-0x03 */ + 0xA6, 0x55, 0x00, 0x00, 0xA6, 0x5B, 0xC9, 0xC5, /* 0x04-0x07 */ + 0xA6, 0x58, 0xA6, 0x4E, 0xA6, 0x51, 0xA6, 0x54, /* 0x08-0x0B */ + 0xA6, 0x50, 0xA6, 0x57, 0xA6, 0x5A, 0xA6, 0x4F, /* 0x0C-0x0F */ + 0xA6, 0x52, 0xA6, 0x56, 0xA6, 0x5C, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xCA, 0x7E, 0xCA, 0x7B, 0x00, 0x00, 0xA7, 0x67, /* 0x18-0x1B */ + 0xCA, 0x7C, 0xA7, 0x5B, 0xA7, 0x5D, 0xA7, 0x75, /* 0x1C-0x1F */ + 0xA7, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xCA, 0xA5, 0xCA, 0x7D, 0xA7, 0x5F, 0xA7, 0x61, /* 0x24-0x27 */ + 0xCA, 0xA4, 0xA7, 0x68, 0xCA, 0x78, 0xA7, 0x74, /* 0x28-0x2B */ + 0xA7, 0x76, 0xA7, 0x5C, 0xA7, 0x6D, 0x00, 0x00, /* 0x2C-0x2F */ + 0xCA, 0x76, 0xA7, 0x73, 0x00, 0x00, 0xA7, 0x64, /* 0x30-0x33 */ + 0x00, 0x00, 0xA7, 0x6E, 0xA7, 0x6F, 0xCA, 0x77, /* 0x34-0x37 */ + 0xA7, 0x6C, 0xA7, 0x6A, 0x00, 0x00, 0xA7, 0x6B, /* 0x38-0x3B */ + 0xA7, 0x71, 0xCA, 0xA1, 0xA7, 0x5E, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA7, 0x72, 0xCA, 0xA3, 0xA7, 0x66, 0xA7, 0x63, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0x7A, 0xA7, 0x62, 0xCA, 0xA6, /* 0x44-0x47 */ + 0xA7, 0x65, 0x00, 0x00, 0xA7, 0x69, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x60, 0xCA, 0xA2, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCA, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xCB, 0xEB, 0xCB, 0xEA, 0xA9, 0x4F, 0xCB, 0xED, /* 0x60-0x63 */ + 0xCB, 0xEF, 0xCB, 0xE4, 0xCB, 0xE7, 0xCB, 0xEE, /* 0x64-0x67 */ + 0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE1, /* 0x68-0x6B */ + 0xCB, 0xE5, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE9, /* 0x6C-0x6F */ + 0xCE, 0x49, 0xA9, 0x4B, 0xCE, 0x4D, 0xA8, 0xFD, /* 0x70-0x73 */ + 0xCB, 0xE6, 0xA8, 0xFE, 0xA9, 0x4C, 0xA9, 0x45, /* 0x74-0x77 */ + 0xA9, 0x41, 0x00, 0x00, 0xCB, 0xE2, 0xA9, 0x44, /* 0x78-0x7B */ + 0xA9, 0x49, 0xA9, 0x52, 0xCB, 0xE3, 0xCB, 0xDC, /* 0x7C-0x7F */ + + 0xA9, 0x43, 0xCB, 0xDD, 0xCB, 0xDF, 0x00, 0x00, /* 0x80-0x83 */ + 0xA9, 0x46, 0x00, 0x00, 0xA9, 0x48, 0xCB, 0xDB, /* 0x84-0x87 */ + 0xCB, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, /* 0x88-0x8B */ + 0xA9, 0x4D, 0xCB, 0xE8, 0xA9, 0x53, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA9, 0x4A, 0xCB, 0xDE, 0xA9, 0x47, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA9, 0x42, 0xA9, 0x40, 0x00, 0x00, /* 0x94-0x97 */ + 0xCB, 0xEC, 0x00, 0x00, 0xA9, 0x4E, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0x48, 0xCD, 0xFB, 0xCE, 0x4B, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCD, 0xFD, 0xAB, 0x78, 0xAB, 0xA8, /* 0xA4-0xA7 */ + 0xAB, 0x74, 0xAB, 0xA7, 0xAB, 0x7D, 0xAB, 0xA4, /* 0xA8-0xAB */ + 0xAB, 0x72, 0xCD, 0xFC, 0xCE, 0x43, 0xAB, 0xA3, /* 0xAC-0xAF */ + 0xCE, 0x4F, 0xAB, 0xA5, 0x00, 0x00, 0xAB, 0x79, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x45, 0xCE, 0x42, /* 0xB4-0xB7 */ + 0xAB, 0x77, 0x00, 0x00, 0xCD, 0xFA, 0xAB, 0xA6, /* 0xB8-0xBB */ + 0xCE, 0x4A, 0xAB, 0x7C, 0xCE, 0x4C, 0xAB, 0xA9, /* 0xBC-0xBF */ + 0xAB, 0x73, 0xAB, 0x7E, 0xAB, 0x7B, 0xCE, 0x40, /* 0xC0-0xC3 */ + 0xAB, 0xA1, 0xCE, 0x46, 0xCE, 0x47, 0xAB, 0x7A, /* 0xC4-0xC7 */ + 0xAB, 0xA2, 0xAB, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x75, 0xCD, 0xFE, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x4E, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD1, 0x44, 0xAD, 0xFB, 0xD0, 0xF1, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD0, 0xF6, 0xAD, 0xF4, 0xAE, 0x40, 0xD0, 0xF4, /* 0xE4-0xE7 */ + 0xAD, 0xEF, 0xAD, 0xF9, 0xAD, 0xFE, 0xD0, 0xFB, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xFA, 0xAD, 0xFD, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD0, 0xFE, 0xAD, 0xF5, 0xD0, 0xF5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x42, /* 0xF4-0xF7 */ + 0xD1, 0x43, 0x00, 0x00, 0xAD, 0xF7, 0xD1, 0x41, /* 0xF8-0xFB */ + 0xAD, 0xF3, 0xAE, 0x43, 0x00, 0x00, 0xD0, 0xF8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0xAD, 0xF1, 0x00, 0x00, 0xD1, 0x46, /* 0x00-0x03 */ + 0xD0, 0xF9, 0xD0, 0xFD, 0xAD, 0xF6, 0xAE, 0x42, /* 0x04-0x07 */ + 0xD0, 0xFA, 0xAD, 0xFC, 0xD1, 0x40, 0xD1, 0x47, /* 0x08-0x0B */ + 0xD4, 0xA1, 0x00, 0x00, 0xD1, 0x45, 0xAE, 0x44, /* 0x0C-0x0F */ + 0xAD, 0xF0, 0xD0, 0xFC, 0xD0, 0xF3, 0x00, 0x00, /* 0x10-0x13 */ + 0xAD, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF2, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF0, 0xAE, 0x41, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0x77, 0x00, 0x00, /* 0x28-0x2B */ + 0xB0, 0xE4, 0xD4, 0xA7, 0xB0, 0xE2, 0xB0, 0xDF, /* 0x2C-0x2F */ + 0xD4, 0x7C, 0xB0, 0xDB, 0xD4, 0xA2, 0xB0, 0xE6, /* 0x30-0x33 */ + 0xD4, 0x76, 0xD4, 0x7B, 0xD4, 0x7A, 0xAD, 0xF2, /* 0x34-0x37 */ + 0xB0, 0xE1, 0xD4, 0xA5, 0x00, 0x00, 0xD4, 0xA8, /* 0x38-0x3B */ + 0xD4, 0x73, 0x00, 0x00, 0xB3, 0xE8, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD4, 0xA9, 0xB0, 0xE7, 0x00, 0x00, 0xB0, 0xD9, /* 0x40-0x43 */ + 0xB0, 0xD6, 0xD4, 0x7E, 0xB0, 0xD3, 0x00, 0x00, /* 0x44-0x47 */ + 0xD4, 0xA6, 0x00, 0x00, 0xB0, 0xDA, 0xD4, 0xAA, /* 0x48-0x4B */ + 0x00, 0x00, 0xD4, 0x74, 0xD4, 0xA4, 0xB0, 0xDD, /* 0x4C-0x4F */ + 0xD4, 0x75, 0xD4, 0x78, 0xD4, 0x7D, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xB0, 0xDE, 0xB0, 0xDC, 0xB0, 0xE8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xB0, 0xE3, 0x00, 0x00, 0xB0, 0xD7, 0xB1, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB0, 0xD8, 0xD4, 0x79, 0xB0, 0xE5, /* 0x60-0x63 */ + 0xB0, 0xE0, 0xD4, 0xA3, 0xB0, 0xD5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xD4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD4, 0x71, 0xD4, 0x72, 0xD8, 0x6A, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xD7, /* 0x78-0x7B */ + 0xB3, 0xDA, 0xD8, 0x75, 0xB3, 0xEE, 0xD8, 0x78, /* 0x7C-0x7F */ + + 0xB3, 0xD8, 0xD8, 0x71, 0xB3, 0xDE, 0xB3, 0xE4, /* 0x80-0x83 */ + 0xB5, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xE2, /* 0x84-0x87 */ + 0xD8, 0x6E, 0xB3, 0xEF, 0xB3, 0xDB, 0xB3, 0xE3, /* 0x88-0x8B */ + 0xD8, 0x76, 0xDC, 0xD7, 0xD8, 0x7B, 0xD8, 0x6F, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD8, 0x66, 0xD8, 0x73, 0xD8, 0x6D, /* 0x90-0x93 */ + 0xB3, 0xE1, 0xD8, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xB3, 0xDD, 0xB3, 0xF1, 0xB3, 0xEA, 0x00, 0x00, /* 0x98-0x9B */ + 0xB3, 0xDF, 0xB3, 0xDC, 0x00, 0x00, 0xB3, 0xE7, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD8, 0x7A, 0xD8, 0x6C, 0xD8, 0x72, /* 0xA0-0xA3 */ + 0xD8, 0x74, 0xD8, 0x68, 0xD8, 0x77, 0xB3, 0xD9, /* 0xA4-0xA7 */ + 0xD8, 0x67, 0x00, 0x00, 0xB3, 0xE0, 0xB3, 0xF0, /* 0xA8-0xAB */ + 0xB3, 0xEC, 0xD8, 0x69, 0xB3, 0xE6, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB3, 0xED, 0xB3, 0xE9, 0xB3, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xD8, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xEB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xD5, /* 0xBC-0xBF */ + 0xDC, 0xD1, 0x00, 0x00, 0xDC, 0xE0, 0xDC, 0xCA, /* 0xC0-0xC3 */ + 0xDC, 0xD3, 0xB6, 0xE5, 0xB6, 0xE6, 0xB6, 0xDE, /* 0xC4-0xC7 */ + 0xDC, 0xDC, 0xB6, 0xE8, 0xDC, 0xCF, 0xDC, 0xCE, /* 0xC8-0xCB */ + 0xDC, 0xCC, 0xDC, 0xDE, 0xB6, 0xDC, 0xDC, 0xD8, /* 0xCC-0xCF */ + 0xDC, 0xCD, 0xB6, 0xDF, 0xDC, 0xD6, 0xB6, 0xDA, /* 0xD0-0xD3 */ + 0xDC, 0xD2, 0xDC, 0xD9, 0xDC, 0xDB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDC, 0xDF, 0xB6, 0xE3, 0xDC, 0xCB, /* 0xD8-0xDB */ + 0xB6, 0xDD, 0xDC, 0xD0, 0x00, 0x00, 0xB6, 0xD8, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB6, 0xE4, 0xDC, 0xDA, 0xB6, 0xE0, /* 0xE0-0xE3 */ + 0xB6, 0xE1, 0xB6, 0xE7, 0xB6, 0xDB, 0xA2, 0x5F, /* 0xE4-0xE7 */ + 0xB6, 0xD9, 0xDC, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xE2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDD, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xCD, 0xB9, 0xC8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE1, 0x55, 0xE1, 0x51, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0x4B, 0xB9, 0xC2, 0xB9, 0xBE, 0xE1, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0xB9, 0xBF, 0xE1, 0x4E, 0xE1, 0x50, 0x00, 0x00, /* 0x00-0x03 */ + 0xE1, 0x53, 0x00, 0x00, 0xB9, 0xC4, 0x00, 0x00, /* 0x04-0x07 */ + 0xB9, 0xCB, 0xB9, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0x49, 0xB9, 0xC6, 0xB9, 0xC7, 0xE1, 0x4C, /* 0x0C-0x0F */ + 0xB9, 0xCC, 0x00, 0x00, 0xE1, 0x4A, 0xE1, 0x4F, /* 0x10-0x13 */ + 0xB9, 0xC3, 0xE1, 0x48, 0xB9, 0xC9, 0xB9, 0xC1, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB9, 0xC0, /* 0x18-0x1B */ + 0xE1, 0x4D, 0xE1, 0x52, 0x00, 0x00, 0xB9, 0xCA, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x47, /* 0x24-0x27 */ + 0x00, 0x00, 0xBC, 0x4D, 0xE5, 0x47, 0x00, 0x00, /* 0x28-0x2B */ + 0xE5, 0x44, 0x00, 0x00, 0xBC, 0x47, 0xBC, 0x53, /* 0x2C-0x2F */ + 0xBC, 0x54, 0x00, 0x00, 0xBC, 0x4A, 0xE5, 0x42, /* 0x30-0x33 */ + 0xBC, 0x4C, 0xE4, 0xF9, 0xBC, 0x52, 0x00, 0x00, /* 0x34-0x37 */ + 0xE5, 0x46, 0xBC, 0x49, 0xE5, 0x48, 0xBC, 0x48, /* 0x38-0x3B */ + 0x00, 0x00, 0xE5, 0x43, 0xE5, 0x45, 0xBC, 0x4B, /* 0x3C-0x3F */ + 0xE5, 0x41, 0xE4, 0xFA, 0xE4, 0xF7, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xD8, 0x6B, 0xE4, 0xFD, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xF6, 0xE4, 0xFC, 0xE4, 0xFB, 0x00, 0x00, /* 0x48-0x4B */ + 0xE4, 0xF8, 0x00, 0x00, 0xBC, 0x4F, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x50, /* 0x54-0x57 */ + 0xE4, 0xFE, 0xBE, 0xB2, 0xE5, 0x40, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x45, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0xFD, 0x00, 0x00, 0xBE, 0xBE, 0xE9, 0x42, /* 0x60-0x63 */ + 0xBE, 0xB6, 0xBE, 0xBA, 0xE9, 0x41, 0x00, 0x00, /* 0x64-0x67 */ + 0xBE, 0xB9, 0xBE, 0xB5, 0xBE, 0xB8, 0xBE, 0xB3, /* 0x68-0x6B */ + 0xBE, 0xBD, 0xE9, 0x43, 0xE8, 0xFE, 0xBE, 0xBC, /* 0x6C-0x6F */ + 0xE8, 0xFC, 0xBE, 0xBB, 0xE9, 0x44, 0xE9, 0x40, /* 0x70-0x73 */ + 0xBC, 0x51, 0x00, 0x00, 0xBE, 0xBF, 0xE9, 0x46, /* 0x74-0x77 */ + 0xBE, 0xB7, 0xBE, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC6, 0xEC, 0xC8, /* 0x7C-0x7F */ + + 0xC0, 0x7B, 0xEC, 0xC9, 0xEC, 0xC7, 0xEC, 0xC5, /* 0x80-0x83 */ + 0xEC, 0xC4, 0xC0, 0x7D, 0xEC, 0xC3, 0xC0, 0x7E, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xEC, 0xC1, 0xEC, 0xC2, 0xC0, 0x7A, 0xC0, 0xA1, /* 0x8C-0x8F */ + 0xC0, 0x7C, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC0, /* 0x90-0x93 */ + 0x00, 0x00, 0xC2, 0x50, 0x00, 0x00, 0xEF, 0xBC, /* 0x94-0x97 */ + 0xEF, 0xBA, 0xEF, 0xBF, 0xEF, 0xBD, 0x00, 0x00, /* 0x98-0x9B */ + 0xEF, 0xBB, 0xEF, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xC3, 0x60, 0xF1, 0xF2, 0xF1, 0xF3, /* 0xA4-0xA7 */ + 0xC4, 0x56, 0x00, 0x00, 0xF1, 0xF4, 0xF1, 0xF0, /* 0xA8-0xAB */ + 0xF1, 0xF5, 0xF1, 0xF1, 0xC2, 0x51, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFE, 0xF4, 0x41, /* 0xB0-0xB3 */ + 0xC4, 0x59, 0xF4, 0x40, 0xC4, 0x58, 0xC4, 0x57, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xC4, 0x5A, 0xF5, 0xC5, 0xF5, 0xC6, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC4, 0xDA, 0xC4, 0xD9, 0xC4, 0xDB, 0xF5, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xD8, 0xF6, 0xD7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xC5, 0x6D, 0xC5, 0x6F, 0xC5, 0x6E, 0xF6, 0xD9, /* 0xC8-0xCB */ + 0xC5, 0xC8, 0xF8, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC5, 0xF1, 0x00, 0x00, 0xF8, 0xA5, /* 0xD0-0xD3 */ + 0xF8, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x49, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0x7D, 0xA5, 0x7C, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA6, 0x5F, 0xA6, 0x5E, 0xC9, 0xC7, /* 0xDC-0xDF */ + 0xA6, 0x5D, 0xC9, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xA7, 0x79, 0xCA, 0xA9, 0x00, 0x00, 0xCA, 0xA8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x77, 0xA7, 0x7A, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA7, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA7, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF0, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCB, 0xF1, 0xA9, 0x54, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xAA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x00, 0x00, 0xD1, 0x48, 0xD1, 0x49, 0xAE, 0x45, /* 0x00-0x03 */ + 0xAE, 0x46, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAC, /* 0x04-0x07 */ + 0xB0, 0xE9, 0xB0, 0xEB, 0xD4, 0xAB, 0xB0, 0xEA, /* 0x08-0x0B */ + 0xD8, 0x7C, 0xB3, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xE9, 0xB6, 0xEA, /* 0x10-0x13 */ + 0xDC, 0xE1, 0x00, 0x00, 0xB9, 0xCF, 0x00, 0x00, /* 0x14-0x17 */ + 0xB9, 0xCE, 0x00, 0x00, 0xE5, 0x49, 0xE9, 0x48, /* 0x18-0x1B */ + 0xE9, 0x47, 0x00, 0x00, 0xF9, 0x6B, 0xA4, 0x67, /* 0x1C-0x1F */ + 0xC9, 0x59, 0x00, 0x00, 0xC9, 0x6E, 0xC9, 0x6F, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xA6, 0x62, 0xA6, 0x66, 0xC9, 0xC9, 0x00, 0x00, /* 0x28-0x2B */ + 0xA6, 0x64, 0xA6, 0x63, 0xC9, 0xC8, 0xA6, 0x65, /* 0x2C-0x2F */ + 0xA6, 0x61, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x60, /* 0x30-0x33 */ + 0xC9, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA7, 0x7D, 0xCA, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0xAB, 0x00, 0x00, 0xA7, 0xA1, /* 0x44-0x47 */ + 0x00, 0x00, 0xCA, 0xAD, 0xA7, 0x7B, 0xCA, 0xAE, /* 0x48-0x4B */ + 0xCA, 0xAC, 0xA7, 0x7E, 0xA7, 0xA2, 0xA7, 0xA5, /* 0x4C-0x4F */ + 0xA7, 0xA4, 0xA7, 0x7C, 0xCA, 0xAF, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA9, 0x59, 0xCB, 0xFE, 0x00, 0x00, /* 0x60-0x63 */ + 0xA9, 0x5B, 0x00, 0x00, 0xA9, 0x5A, 0x00, 0x00, /* 0x64-0x67 */ + 0xCC, 0x40, 0xA9, 0x58, 0xA9, 0x57, 0xCB, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0xCB, 0xF4, 0x00, 0x00, 0xCB, 0xF2, /* 0x6C-0x6F */ + 0xCB, 0xF7, 0xCB, 0xF6, 0xCB, 0xF3, 0xCB, 0xFC, /* 0x70-0x73 */ + 0xCB, 0xFD, 0xCB, 0xFA, 0xCB, 0xF8, 0xA9, 0x56, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xFB, /* 0x78-0x7B */ + 0xA9, 0x5C, 0xCC, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCB, 0xF9, 0x00, 0x00, 0xAB, 0xAB, 0xA9, 0x55, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xAC, /* 0x88-0x8B */ + 0xCE, 0x54, 0x00, 0x00, 0x00, 0x00, 0xCE, 0x5A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB2, /* 0x90-0x93 */ + 0xCE, 0x58, 0xCE, 0x5E, 0x00, 0x00, 0xCE, 0x55, /* 0x94-0x97 */ + 0xCE, 0x59, 0xCE, 0x5B, 0xCE, 0x5D, 0xCE, 0x57, /* 0x98-0x9B */ + 0x00, 0x00, 0xCE, 0x56, 0xCE, 0x51, 0xCE, 0x52, /* 0x9C-0x9F */ + 0xAB, 0xAD, 0x00, 0x00, 0xAB, 0xAF, 0xAB, 0xAE, /* 0xA0-0xA3 */ + 0xCE, 0x53, 0xCE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xCE, 0x50, 0xD1, 0x53, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD1, 0x52, 0xD1, 0x57, 0xD1, 0x4E, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD1, 0x51, 0xD1, 0x50, 0x00, 0x00, 0xD1, 0x54, /* 0xBC-0xBF */ + 0x00, 0x00, 0xD1, 0x58, 0xAE, 0x47, 0xAE, 0x4A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0x4F, 0xD1, 0x55, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x49, /* 0xC8-0xCB */ + 0xD1, 0x4A, 0x00, 0x00, 0xAB, 0xB0, 0xD4, 0xBA, /* 0xCC-0xCF */ + 0xD1, 0x56, 0x00, 0x00, 0xD1, 0x4D, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xAE, 0x48, 0xD1, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD4, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xEC, /* 0xDC-0xDF */ + 0xB0, 0xF0, 0xD4, 0xC1, 0xD4, 0xAF, 0xD4, 0xBD, /* 0xE0-0xE3 */ + 0xB0, 0xF1, 0xD4, 0xBF, 0x00, 0x00, 0xD4, 0xC5, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xD4, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD4, 0xC0, 0xD4, 0xB4, 0xD4, 0xBC, 0x00, 0x00, /* 0xEC-0xEF */ + 0xD4, 0xCA, 0xD4, 0xC8, 0xD4, 0xBE, 0xD4, 0xB9, /* 0xF0-0xF3 */ + 0xD4, 0xB2, 0xD8, 0xA6, 0xD4, 0xB0, 0xB0, 0xF5, /* 0xF4-0xF7 */ + 0xD4, 0xB7, 0xB0, 0xF6, 0xB0, 0xF2, 0xD4, 0xAD, /* 0xF8-0xFB */ + 0xD4, 0xC3, 0xD4, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xD4, 0xB3, 0xD4, 0xC6, 0xB0, 0xF3, 0x00, 0x00, /* 0x00-0x03 */ + 0xD4, 0xCC, 0xB0, 0xED, 0xB0, 0xEF, 0xD4, 0xBB, /* 0x04-0x07 */ + 0xD4, 0xB6, 0xAE, 0x4B, 0xB0, 0xEE, 0xD4, 0xB8, /* 0x08-0x0B */ + 0xD4, 0xC7, 0xD4, 0xCB, 0xD4, 0xC2, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD4, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD4, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD8, 0xA1, 0x00, 0x00, 0xD8, 0xAA, /* 0x18-0x1B */ + 0xD8, 0xA9, 0xB3, 0xFA, 0xD8, 0xA2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB3, 0xFB, 0xB3, 0xF9, 0x00, 0x00, 0xD8, 0xA4, /* 0x20-0x23 */ + 0xB3, 0xF6, 0xD8, 0xA8, 0x00, 0x00, 0xD8, 0xA3, /* 0x24-0x27 */ + 0xD8, 0xA5, 0xD8, 0x7D, 0xB3, 0xF4, 0x00, 0x00, /* 0x28-0x2B */ + 0xD8, 0xB2, 0xD8, 0xB1, 0xD8, 0xAE, 0xB3, 0xF3, /* 0x2C-0x2F */ + 0xB3, 0xF7, 0xB3, 0xF8, 0xD1, 0x4B, 0xD8, 0xAB, /* 0x30-0x33 */ + 0xB3, 0xF5, 0xB0, 0xF4, 0xD8, 0xAD, 0xD8, 0x7E, /* 0x34-0x37 */ + 0xD8, 0xB0, 0xD8, 0xAF, 0x00, 0x00, 0xD8, 0xB3, /* 0x38-0x3B */ + 0x00, 0x00, 0xDC, 0xEF, 0x00, 0x00, 0xD8, 0xAC, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD8, 0xA7, 0xDC, 0xE7, 0xB6, 0xF4, 0xB6, 0xF7, /* 0x48-0x4B */ + 0xB6, 0xF2, 0xDC, 0xE6, 0xDC, 0xEA, 0xDC, 0xE5, /* 0x4C-0x4F */ + 0x00, 0x00, 0xB6, 0xEC, 0xB6, 0xF6, 0xDC, 0xE2, /* 0x50-0x53 */ + 0xB6, 0xF0, 0xDC, 0xE9, 0x00, 0x00, 0xB6, 0xEE, /* 0x54-0x57 */ + 0xB6, 0xED, 0xDC, 0xEC, 0xB6, 0xEF, 0xDC, 0xEE, /* 0x58-0x5B */ + 0x00, 0x00, 0xDC, 0xEB, 0xB6, 0xEB, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF5, 0xDC, 0xF0, /* 0x60-0x63 */ + 0xDC, 0xE4, 0xDC, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF1, /* 0x68-0x6B */ + 0x00, 0x00, 0xB6, 0xF3, 0x00, 0x00, 0xDC, 0xE8, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDC, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE1, 0x5D, 0xB9, 0xD0, 0xE1, 0x63, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB9, 0xD5, 0xE1, 0x5F, 0xE1, 0x66, /* 0x78-0x7B */ + 0xE1, 0x57, 0xB9, 0xD7, 0xB9, 0xD1, 0xE1, 0x5C, /* 0x7C-0x7F */ + + 0xBC, 0x55, 0xE1, 0x5B, 0xE1, 0x64, 0xB9, 0xD2, /* 0x80-0x83 */ + 0x00, 0x00, 0xB9, 0xD6, 0xE1, 0x5A, 0xE1, 0x60, /* 0x84-0x87 */ + 0xE1, 0x65, 0xE1, 0x56, 0xB9, 0xD4, 0xE1, 0x5E, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x62, 0xE1, 0x68, /* 0x8C-0x8F */ + 0xE1, 0x58, 0xE1, 0x61, 0x00, 0x00, 0xB9, 0xD3, /* 0x90-0x93 */ + 0xE1, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE1, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xBC, 0x59, 0xE5, 0x4B, 0xBC, 0x57, 0xBC, 0x56, /* 0x9C-0x9F */ + 0xE5, 0x4D, 0xE5, 0x52, 0x00, 0x00, 0xE5, 0x4E, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE5, 0x51, 0xBC, 0x5C, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBE, 0xA5, 0xBC, 0x5B, 0x00, 0x00, 0xE5, 0x4A, /* 0xA8-0xAB */ + 0xE5, 0x50, 0x00, 0x00, 0xBC, 0x5A, 0xE5, 0x4F, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE5, 0x4C, 0x00, 0x00, 0xBC, 0x58, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x4D, 0xF9, 0xD9, /* 0xB8-0xBB */ + 0xE9, 0x4F, 0xE9, 0x4A, 0xBE, 0xC1, 0xE9, 0x4C, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBE, 0xC0, 0xE9, 0x4E, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xBE, 0xC3, 0xE9, 0x50, 0xBE, 0xC2, /* 0xC4-0xC7 */ + 0xE9, 0x49, 0xE9, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xA5, 0xEC, 0xCC, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC0, 0xA4, 0xEC, 0xCD, 0xC0, 0xA3, /* 0xD0-0xD3 */ + 0xEC, 0xCB, 0xC0, 0xA2, 0xEC, 0xCA, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC2, 0x53, 0xC2, 0x52, 0xF1, 0xF6, 0xF1, 0xF8, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF1, 0xF7, 0xC3, 0x61, 0xC3, 0x62, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0x63, 0xF4, 0x42, /* 0xE0-0xE3 */ + 0xC4, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xD3, /* 0xE4-0xE7 */ + 0xF7, 0xD2, 0xC5, 0xF2, 0x00, 0x00, 0xA4, 0x68, /* 0xE8-0xEB */ + 0xA4, 0xD0, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA7, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xCE, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB3, 0xFC, 0xB3, 0xFD, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDC, 0xF2, 0xB9, 0xD8, 0xE1, 0x69, 0xE5, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x5A, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB0, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xCC, 0x42, 0xCE, 0x60, 0xD1, 0x59, 0xAE, 0x4C, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF9, 0x00, 0x00, /* 0x10-0x13 */ + 0xC4, 0xDC, 0xA4, 0x69, 0xA5, 0x7E, 0xC9, 0x70, /* 0x14-0x17 */ + 0x00, 0x00, 0xA6, 0x67, 0xA6, 0x68, 0x00, 0x00, /* 0x18-0x1B */ + 0xA9, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB0, 0xF7, 0x00, 0x00, 0xB9, 0xDA, 0x00, 0x00, /* 0x20-0x23 */ + 0xB9, 0xDB, 0xB9, 0xD9, 0x00, 0x00, 0xA4, 0x6A, /* 0x24-0x27 */ + 0x00, 0x00, 0xA4, 0xD1, 0xA4, 0xD3, 0xA4, 0xD2, /* 0x28-0x2B */ + 0xC9, 0x5B, 0xA4, 0xD4, 0xA5, 0xA1, 0xC9, 0x71, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA5, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x69, /* 0x34-0x37 */ + 0xA6, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xC9, 0xCB, 0x00, 0x00, 0xA7, 0xA8, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCA, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xA9, 0x61, 0xCC, 0x43, 0x00, 0x00, 0xA9, 0x5F, /* 0x44-0x47 */ + 0xA9, 0x60, 0xA9, 0x5E, 0xD1, 0x5A, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB6, 0xAB, 0xB5, /* 0x4C-0x4F */ + 0xAB, 0xB7, 0xAB, 0xB4, 0x00, 0x00, 0xCE, 0x61, /* 0x50-0x53 */ + 0xA9, 0x62, 0xAB, 0xB3, 0x00, 0x00, 0xAE, 0x4D, /* 0x54-0x57 */ + 0xAE, 0x4E, 0x00, 0x00, 0xAE, 0x4F, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB3, 0xFE, 0xD8, 0xB4, 0xB0, 0xF8, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF8, /* 0x64-0x67 */ + 0x00, 0x00, 0xB9, 0xDD, 0xB9, 0xDC, 0xE1, 0x6A, /* 0x68-0x6B */ + 0x00, 0x00, 0xBC, 0x5D, 0xBE, 0xC4, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEF, 0xC0, 0xF6, 0xDA, 0xF7, 0xD4, 0xA4, 0x6B, /* 0x70-0x73 */ + 0xA5, 0xA3, 0x00, 0x00, 0xA5, 0xA4, 0xC9, 0xD1, /* 0x74-0x77 */ + 0xA6, 0x6C, 0xA6, 0x6F, 0x00, 0x00, 0xC9, 0xCF, /* 0x78-0x7B */ + 0xC9, 0xCD, 0xA6, 0x6E, 0xC9, 0xD0, 0xC9, 0xD2, /* 0x7C-0x7F */ + + 0xC9, 0xCC, 0xA6, 0x71, 0xA6, 0x70, 0xA6, 0x6D, /* 0x80-0x83 */ + 0xA6, 0x6B, 0xC9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xB3, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xA7, 0xB0, 0xCA, 0xB6, 0xCA, 0xB9, /* 0x8C-0x8F */ + 0xCA, 0xB8, 0x00, 0x00, 0xA7, 0xAA, 0xA7, 0xB2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xAF, 0xCA, 0xB5, /* 0x94-0x97 */ + 0xCA, 0xB3, 0xA7, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xA7, 0xA9, 0xA7, 0xAC, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCA, 0xB4, 0xCA, 0xBB, 0xCA, 0xB7, 0xA7, 0xAD, /* 0xA0-0xA3 */ + 0xA7, 0xB1, 0xA7, 0xB4, 0xCA, 0xB2, 0xCA, 0xBA, /* 0xA4-0xA7 */ + 0xA7, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x67, 0xA9, 0x6F, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCC, 0x4F, 0xCC, 0x48, 0xA9, 0x70, /* 0xB0-0xB3 */ + 0xCC, 0x53, 0xCC, 0x44, 0xCC, 0x4B, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xA9, 0x66, 0xCC, 0x45, 0xA9, 0x64, /* 0xB8-0xBB */ + 0xCC, 0x4C, 0xCC, 0x50, 0xA9, 0x63, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCC, 0x51, 0xCC, 0x4A, 0x00, 0x00, 0xCC, 0x4D, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xA9, 0x72, 0xA9, 0x69, 0xCC, 0x54, /* 0xC4-0xC7 */ + 0xCC, 0x52, 0x00, 0x00, 0xA9, 0x6E, 0xA9, 0x6C, /* 0xC8-0xCB */ + 0xCC, 0x49, 0xA9, 0x6B, 0xCC, 0x47, 0xCC, 0x46, /* 0xCC-0xCF */ + 0xA9, 0x6A, 0xA9, 0x68, 0xA9, 0x71, 0xA9, 0x6D, /* 0xD0-0xD3 */ + 0xA9, 0x65, 0x00, 0x00, 0xCC, 0x4E, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xAB, 0xB9, 0x00, 0x00, 0xAB, 0xC0, 0xCE, 0x6F, /* 0xD8-0xDB */ + 0xAB, 0xB8, 0xCE, 0x67, 0xCE, 0x63, 0x00, 0x00, /* 0xDC-0xDF */ + 0xCE, 0x73, 0xCE, 0x62, 0x00, 0x00, 0xAB, 0xBB, /* 0xE0-0xE3 */ + 0xCE, 0x6C, 0xAB, 0xBE, 0xAB, 0xC1, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAB, 0xBC, 0xCE, 0x70, 0xAB, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0xAE, 0x56, 0xCE, 0x76, 0xCE, 0x64, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0x66, 0xCE, 0x6D, 0xCE, 0x71, /* 0xF0-0xF3 */ + 0xCE, 0x75, 0xCE, 0x72, 0xCE, 0x6B, 0xCE, 0x6E, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x68, 0xAB, 0xC3, /* 0xF8-0xFB */ + 0xCE, 0x6A, 0xCE, 0x69, 0xCE, 0x74, 0xAB, 0xBA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0xCE, 0x65, 0xAB, 0xC2, 0x00, 0x00, 0xAB, 0xBD, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xAE, 0x5C, 0xD1, 0x62, 0x00, 0x00, /* 0x08-0x0B */ + 0xAE, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x60, /* 0x0C-0x0F */ + 0x00, 0x00, 0xAE, 0x50, 0x00, 0x00, 0xAE, 0x55, /* 0x10-0x13 */ + 0x00, 0x00, 0xD1, 0x5F, 0xD1, 0x5C, 0xD1, 0x61, /* 0x14-0x17 */ + 0xAE, 0x51, 0xD1, 0x5B, 0x00, 0x00, 0xAE, 0x54, /* 0x18-0x1B */ + 0xAE, 0x52, 0x00, 0x00, 0xD1, 0x63, 0xAE, 0x53, /* 0x1C-0x1F */ + 0xAE, 0x57, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x58, /* 0x20-0x23 */ + 0x00, 0x00, 0xAE, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xAE, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0x5D, 0xD1, 0x5E, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x64, /* 0x30-0x33 */ + 0x00, 0x00, 0xD4, 0xD4, 0xB0, 0xF9, 0xD8, 0xC2, /* 0x34-0x37 */ + 0xD4, 0xD3, 0xD4, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB1, 0x40, 0x00, 0x00, 0xD4, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0xB0, 0xFE, 0xB0, 0xFA, 0xD4, 0xED, 0xD4, 0xDD, /* 0x40-0x43 */ + 0xD4, 0xE0, 0x00, 0x00, 0xB1, 0x43, 0xD4, 0xEA, /* 0x44-0x47 */ + 0xD4, 0xE2, 0xB0, 0xFB, 0xB1, 0x44, 0x00, 0x00, /* 0x48-0x4B */ + 0xD4, 0xE7, 0xD4, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD4, 0xD6, 0xD4, 0xEB, 0xD4, 0xDF, 0xD4, 0xDA, /* 0x50-0x53 */ + 0x00, 0x00, 0xD4, 0xD0, 0xD4, 0xEC, 0xD4, 0xDC, /* 0x54-0x57 */ + 0xD4, 0xCF, 0x00, 0x00, 0xB1, 0x42, 0xD4, 0xE1, /* 0x58-0x5B */ + 0xD4, 0xEE, 0xD4, 0xDE, 0xD4, 0xD2, 0xD4, 0xD7, /* 0x5C-0x5F */ + 0xD4, 0xCE, 0x00, 0x00, 0xB1, 0x41, 0x00, 0x00, /* 0x60-0x63 */ + 0xD4, 0xDB, 0xD4, 0xD8, 0xB0, 0xFC, 0xD4, 0xD1, /* 0x64-0x67 */ + 0x00, 0x00, 0xD4, 0xE9, 0xB0, 0xFD, 0x00, 0x00, /* 0x68-0x6B */ + 0xD4, 0xD9, 0xD4, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xD4, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x40, /* 0x74-0x77 */ + 0xD8, 0xBB, 0x00, 0x00, 0xD8, 0xB8, 0xD8, 0xC9, /* 0x78-0x7B */ + 0xD8, 0xBD, 0xD8, 0xCA, 0x00, 0x00, 0xB4, 0x42, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC6, /* 0x80-0x83 */ + 0xD8, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC4, 0xD8, 0xC7, /* 0x88-0x8B */ + 0xD8, 0xCB, 0x00, 0x00, 0xD4, 0xE3, 0xD8, 0xCD, /* 0x8C-0x8F */ + 0xDD, 0x47, 0x00, 0x00, 0xB4, 0x43, 0xD8, 0xCE, /* 0x90-0x93 */ + 0xD8, 0xB6, 0xD8, 0xC0, 0x00, 0x00, 0xD8, 0xC5, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0x41, 0xB4, 0x44, /* 0x98-0x9B */ + 0xD8, 0xCC, 0xD8, 0xCF, 0xD8, 0xBA, 0xD8, 0xB7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB9, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xBE, 0xD8, 0xBC, 0xB4, 0x45, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD8, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD8, 0xBF, 0x00, 0x00, 0xD8, 0xC1, 0xD8, 0xB5, /* 0xAC-0xAF */ + 0xDC, 0xFA, 0xDC, 0xF8, 0xB7, 0x42, 0xB7, 0x40, /* 0xB0-0xB3 */ + 0xDD, 0x43, 0xDC, 0xF9, 0xDD, 0x44, 0xDD, 0x40, /* 0xB4-0xB7 */ + 0xDC, 0xF7, 0xDD, 0x46, 0xDC, 0xF6, 0xDC, 0xFD, /* 0xB8-0xBB */ + 0xB6, 0xFE, 0xB6, 0xFD, 0xB6, 0xFC, 0xDC, 0xFB, /* 0xBC-0xBF */ + 0xDD, 0x41, 0xB6, 0xF9, 0xB7, 0x41, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xF4, 0x00, 0x00, 0xDC, 0xFE, 0xDC, 0xF3, /* 0xC4-0xC7 */ + 0xDC, 0xFC, 0xB6, 0xFA, 0xDD, 0x42, 0xDC, 0xF5, /* 0xC8-0xCB */ + 0xB6, 0xFB, 0xDD, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE1, 0x6E, 0xB9, 0xE2, 0xB9, 0xE1, /* 0xD4-0xD7 */ + 0xB9, 0xE3, 0xE1, 0x7A, 0xE1, 0x70, 0xE1, 0x76, /* 0xD8-0xDB */ + 0xE1, 0x6B, 0xE1, 0x79, 0xE1, 0x78, 0xE1, 0x7C, /* 0xDC-0xDF */ + 0xE1, 0x75, 0xB9, 0xDE, 0xE1, 0x74, 0xB9, 0xE4, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE1, 0x6D, 0xB9, 0xDF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE1, 0x7B, 0xB9, 0xE0, 0xE1, 0x6F, 0xE1, 0x72, /* 0xE8-0xEB */ + 0xE1, 0x77, 0xE1, 0x71, 0xE1, 0x6C, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x73, /* 0xF0-0xF3 */ + 0xE5, 0x55, 0xBC, 0x61, 0xE5, 0x58, 0xE5, 0x57, /* 0xF4-0xF7 */ + 0xE5, 0x5A, 0xE5, 0x5C, 0xF9, 0xDC, 0xBC, 0x5F, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE5, 0x56, 0x00, 0x00, 0xE5, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0xE5, 0x5D, 0xE5, 0x5B, 0xE5, 0x59, /* 0x00-0x03 */ + 0x00, 0x00, 0xE5, 0x5F, 0x00, 0x00, 0xE5, 0x5E, /* 0x04-0x07 */ + 0xBC, 0x63, 0xBC, 0x5E, 0x00, 0x00, 0xBC, 0x60, /* 0x08-0x0B */ + 0xBC, 0x62, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x60, /* 0x0C-0x0F */ + 0xE9, 0x57, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x56, /* 0x10-0x13 */ + 0xE9, 0x55, 0x00, 0x00, 0xE9, 0x58, 0xE9, 0x51, /* 0x14-0x17 */ + 0x00, 0x00, 0xE9, 0x52, 0xE9, 0x5A, 0xE9, 0x53, /* 0x18-0x1B */ + 0x00, 0x00, 0xBE, 0xC5, 0xE9, 0x5C, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0x5B, 0xE9, 0x54, 0x00, 0x00, 0xEC, 0xD1, /* 0x20-0x23 */ + 0xC0, 0xA8, 0xEC, 0xCF, 0xEC, 0xD4, 0xEC, 0xD3, /* 0x24-0x27 */ + 0xE9, 0x59, 0x00, 0x00, 0xC0, 0xA7, 0x00, 0x00, /* 0x28-0x2B */ + 0xEC, 0xD2, 0xEC, 0xCE, 0xEC, 0xD6, 0xEC, 0xD5, /* 0x2C-0x2F */ + 0xC0, 0xA6, 0x00, 0x00, 0xEC, 0xD0, 0x00, 0x00, /* 0x30-0x33 */ + 0xBE, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC2, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEF, 0xC1, 0xF1, 0xFA, 0xF1, 0xFB, 0xF1, 0xFC, /* 0x3C-0x3F */ + 0xC4, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x5D, /* 0x40-0x43 */ + 0x00, 0x00, 0xF4, 0x43, 0x00, 0x00, 0xF5, 0xC8, /* 0x44-0x47 */ + 0xF5, 0xC7, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xDB, /* 0x48-0x4B */ + 0xF6, 0xDC, 0xF7, 0xD5, 0xF8, 0xA7, 0x00, 0x00, /* 0x4C-0x4F */ + 0xA4, 0x6C, 0xA4, 0x6D, 0x00, 0x00, 0xA4, 0x6E, /* 0x50-0x53 */ + 0xA4, 0xD5, 0xA5, 0xA5, 0xC9, 0xD3, 0xA6, 0x72, /* 0x54-0x57 */ + 0xA6, 0x73, 0x00, 0x00, 0xA7, 0xB7, 0xA7, 0xB8, /* 0x58-0x5B */ + 0xA7, 0xB6, 0xA7, 0xB5, 0x00, 0x00, 0xA9, 0x73, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x55, 0xA9, 0x75, /* 0x60-0x63 */ + 0xA9, 0x74, 0xCC, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xAB, 0xC4, 0x00, 0x00, 0xAE, 0x5D, /* 0x68-0x6B */ + 0xD1, 0x65, 0x00, 0x00, 0xD4, 0xF0, 0x00, 0x00, /* 0x6C-0x6F */ + 0xB1, 0x45, 0xB4, 0x47, 0xD4, 0xEF, 0xB4, 0x46, /* 0x70-0x73 */ + 0x00, 0x00, 0xB9, 0xE5, 0x00, 0x00, 0xE1, 0x7D, /* 0x74-0x77 */ + 0xBE, 0xC7, 0x00, 0x00, 0xC0, 0xA9, 0xEC, 0xD7, /* 0x78-0x7B */ + 0x00, 0x00, 0xC4, 0x5E, 0x00, 0x00, 0xC5, 0x70, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xC9, 0x72, 0x00, 0x00, 0xA5, 0xA6, /* 0x80-0x83 */ + 0xC9, 0x73, 0xA6, 0x76, 0x00, 0x00, 0xA6, 0x74, /* 0x84-0x87 */ + 0xA6, 0x75, 0xA6, 0x77, 0x00, 0x00, 0xA7, 0xBA, /* 0x88-0x8B */ + 0xA7, 0xB9, 0x00, 0x00, 0xCA, 0xBC, 0xA7, 0xBB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBD, 0xCC, 0x57, /* 0x90-0x93 */ + 0x00, 0x00, 0xCC, 0x58, 0x00, 0x00, 0xA9, 0x76, /* 0x94-0x97 */ + 0xA9, 0x78, 0xA9, 0x7A, 0xA9, 0x77, 0xA9, 0x7B, /* 0x98-0x9B */ + 0xA9, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xC8, 0xAB, 0xC5, /* 0xA0-0xA3 */ + 0xAB, 0xC7, 0xAB, 0xC9, 0xAB, 0xC6, 0xD1, 0x66, /* 0xA4-0xA7 */ + 0xCE, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD1, 0x68, 0xD1, 0x67, 0xAE, 0x63, 0x00, 0x00, /* 0xAC-0xAF */ + 0xAE, 0x5F, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x60, /* 0xB0-0xB3 */ + 0xAE, 0x62, 0xAE, 0x64, 0xAE, 0x61, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAE, 0x66, 0xAE, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x4A, /* 0xBC-0xBF */ + 0xD4, 0xF2, 0xD4, 0xF1, 0xB1, 0x49, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB1, 0x48, 0xB1, 0x47, 0xB1, 0x4B, 0xB1, 0x46, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD5, 0xD8, 0xD2, /* 0xC8-0xCB */ + 0xB4, 0x49, 0xD8, 0xD1, 0xD8, 0xD6, 0x00, 0x00, /* 0xCC-0xCF */ + 0xB4, 0x4B, 0xD8, 0xD4, 0xB4, 0x48, 0xB4, 0x4A, /* 0xD0-0xD3 */ + 0xD8, 0xD3, 0x00, 0x00, 0xDD, 0x48, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDD, 0x49, 0xDD, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xE6, 0xB9, 0xEE, /* 0xDC-0xDF */ + 0xE1, 0x7E, 0xB9, 0xE8, 0xB9, 0xEC, 0xE1, 0xA1, /* 0xE0-0xE3 */ + 0xB9, 0xED, 0xB9, 0xE9, 0xB9, 0xEA, 0xB9, 0xE7, /* 0xE4-0xE7 */ + 0xB9, 0xEB, 0xBC, 0x66, 0xD8, 0xD0, 0xBC, 0x67, /* 0xE8-0xEB */ + 0xBC, 0x65, 0x00, 0x00, 0xBC, 0x64, 0xE9, 0x5D, /* 0xEC-0xEF */ + 0xBE, 0xC8, 0xEC, 0xD8, 0xEC, 0xD9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC3, 0x64, 0xC4, 0x5F, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA4, 0x6F, 0x00, 0x00, 0xA6, 0x78, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0xAB, 0xCA, 0x00, 0x00, 0xD1, 0x69, /* 0x00-0x03 */ + 0xAE, 0x67, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x4E, /* 0x04-0x07 */ + 0xB1, 0x4D, 0xB1, 0x4C, 0xB4, 0x4C, 0xB4, 0x4D, /* 0x08-0x0B */ + 0xD8, 0xD7, 0xB9, 0xEF, 0xBE, 0xC9, 0xA4, 0x70, /* 0x0C-0x0F */ + 0xC9, 0x5C, 0xA4, 0xD6, 0xC9, 0x74, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xC9, 0xD4, 0xA6, 0x79, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x7C, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0x4B, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x71, 0x00, 0x00, /* 0x20-0x23 */ + 0xA4, 0xD7, 0xC9, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCA, 0xBE, 0x00, 0x00, 0xCA, 0xBF, 0x00, 0x00, /* 0x28-0x2B */ + 0xA7, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD8, 0xD8, 0xB4, 0x4E, 0x00, 0x00, 0xDD, 0x4C, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xAA, /* 0x34-0x37 */ + 0xA4, 0x72, 0xA4, 0xA8, 0xA4, 0xD8, 0xC9, 0x75, /* 0x38-0x3B */ + 0xA5, 0xA7, 0x00, 0x00, 0xA7, 0xC0, 0xA7, 0xBF, /* 0x3C-0x3F */ + 0xA7, 0xBD, 0xA7, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xCC, 0x59, 0xA9, 0x7E, 0xA9, 0xA1, 0xCC, 0x5A, /* 0x44-0x47 */ + 0xA9, 0x7D, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xCE, /* 0x48-0x4B */ + 0xCE, 0x78, 0xAB, 0xCD, 0xAB, 0xCB, 0xAB, 0xCC, /* 0x4C-0x4F */ + 0xAE, 0x6A, 0xAE, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD1, 0x6B, 0xAE, 0x69, 0xD1, 0x6A, 0x00, 0x00, /* 0x54-0x57 */ + 0xAE, 0x5E, 0xD4, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xB1, 0x50, 0xB1, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB1, 0x4F, 0x00, 0x00, 0xB9, 0xF0, 0xE1, 0xA2, /* 0x60-0x63 */ + 0xBC, 0x68, 0xBC, 0x69, 0x00, 0x00, 0xE5, 0x61, /* 0x64-0x67 */ + 0xC0, 0xAB, 0xEF, 0xC2, 0xEF, 0xC3, 0x00, 0x00, /* 0x68-0x6B */ + 0xC4, 0xDD, 0xF8, 0xA8, 0xC9, 0x4B, 0xA4, 0xD9, /* 0x6C-0x6F */ + 0x00, 0x00, 0xA4, 0x73, 0x00, 0x00, 0xC9, 0x77, /* 0x70-0x73 */ + 0xC9, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xA6, 0x7A, 0xC9, 0xD7, 0xC9, 0xD8, /* 0x78-0x7B */ + 0xC9, 0xD6, 0x00, 0x00, 0xC9, 0xD9, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC7, 0x00, 0x00, /* 0x84-0x87 */ + 0xCA, 0xC2, 0xCA, 0xC4, 0xCA, 0xC6, 0xCA, 0xC3, /* 0x88-0x8B */ + 0xA7, 0xC4, 0xCA, 0xC0, 0x00, 0x00, 0xCA, 0xC1, /* 0x8C-0x8F */ + 0xA7, 0xC1, 0xA7, 0xC2, 0xCA, 0xC5, 0xCA, 0xC8, /* 0x90-0x93 */ + 0xA7, 0xC3, 0xCA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCC, 0x68, 0x00, 0x00, 0xCC, 0x62, /* 0x9C-0x9F */ + 0xCC, 0x5D, 0xA9, 0xA3, 0xCC, 0x65, 0xCC, 0x63, /* 0xA0-0xA3 */ + 0xCC, 0x5C, 0xCC, 0x69, 0xCC, 0x6C, 0xCC, 0x67, /* 0xA4-0xA7 */ + 0xCC, 0x60, 0xA9, 0xA5, 0xCC, 0x66, 0xA9, 0xA6, /* 0xA8-0xAB */ + 0xCC, 0x61, 0xCC, 0x64, 0xCC, 0x5B, 0xCC, 0x5F, /* 0xAC-0xAF */ + 0xCC, 0x6B, 0xA9, 0xA7, 0x00, 0x00, 0xA9, 0xA8, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xCC, 0x5E, 0xCC, 0x6A, 0xA9, 0xA2, /* 0xB4-0xB7 */ + 0xA9, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xAB, 0xCE, 0xA4, /* 0xC4-0xC7 */ + 0xCE, 0xAA, 0xCE, 0xA3, 0xCE, 0xA5, 0xCE, 0x7D, /* 0xC8-0xCB */ + 0xCE, 0x7B, 0x00, 0x00, 0xCE, 0xAC, 0xCE, 0xA9, /* 0xCC-0xCF */ + 0xCE, 0x79, 0x00, 0x00, 0xAB, 0xD0, 0xCE, 0xA7, /* 0xD0-0xD3 */ + 0xCE, 0xA8, 0x00, 0x00, 0xCE, 0xA6, 0xCE, 0x7C, /* 0xD4-0xD7 */ + 0xCE, 0x7A, 0xAB, 0xCF, 0xCE, 0xA2, 0xCE, 0x7E, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA1, 0xCE, 0xAD, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAE, 0x6F, 0x00, 0x00, 0xAE, 0x6E, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD1, 0x6C, 0xAE, 0x6B, 0xD1, 0x6E, 0x00, 0x00, /* 0xEC-0xEF */ + 0xAE, 0x70, 0xD1, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xAE, 0x73, 0x00, 0x00, 0xAE, 0x71, 0xD1, 0x70, /* 0xF4-0xF7 */ + 0xCE, 0xAE, 0xD1, 0x72, 0x00, 0x00, 0xAE, 0x6D, /* 0xF8-0xFB */ + 0x00, 0x00, 0xAE, 0x6C, 0x00, 0x00, 0xD1, 0x6D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0xD1, 0x71, 0xAE, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0x53, 0xB1, 0x52, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF5, /* 0x08-0x0B */ + 0xD4, 0xF9, 0xD4, 0xFB, 0xB1, 0x54, 0xD4, 0xFE, /* 0x0C-0x0F */ + 0x00, 0x00, 0xB1, 0x58, 0xD5, 0x41, 0x00, 0x00, /* 0x10-0x13 */ + 0xB1, 0x5A, 0x00, 0x00, 0xB1, 0x56, 0xB1, 0x5E, /* 0x14-0x17 */ + 0x00, 0x00, 0xB1, 0x5B, 0xD4, 0xF7, 0xB1, 0x55, /* 0x18-0x1B */ + 0x00, 0x00, 0xD4, 0xF6, 0xD4, 0xF4, 0xD5, 0x43, /* 0x1C-0x1F */ + 0xD4, 0xF8, 0x00, 0x00, 0xB1, 0x57, 0xD5, 0x42, /* 0x20-0x23 */ + 0xB1, 0x5C, 0xD4, 0xFD, 0xD4, 0xFC, 0xB1, 0x5D, /* 0x24-0x27 */ + 0xD4, 0xFA, 0xB1, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x44, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD5, 0x40, 0xD8, 0xE7, 0xD8, 0xEE, 0xD8, 0xE3, /* 0x30-0x33 */ + 0xB4, 0x51, 0xD8, 0xDF, 0xD8, 0xEF, 0xD8, 0xD9, /* 0x34-0x37 */ + 0xD8, 0xEC, 0xD8, 0xEA, 0xD8, 0xE4, 0x00, 0x00, /* 0x38-0x3B */ + 0xD8, 0xED, 0xD8, 0xE6, 0x00, 0x00, 0xD8, 0xDE, /* 0x3C-0x3F */ + 0xD8, 0xF0, 0xD8, 0xDC, 0xD8, 0xE9, 0xD8, 0xDA, /* 0x40-0x43 */ + 0x00, 0x00, 0xD8, 0xF1, 0x00, 0x00, 0xB4, 0x52, /* 0x44-0x47 */ + 0x00, 0x00, 0xD8, 0xEB, 0xDD, 0x4F, 0xD8, 0xDD, /* 0x48-0x4B */ + 0xB4, 0x4F, 0x00, 0x00, 0xD8, 0xE1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xB4, 0x50, 0xD8, 0xE0, 0xD8, 0xE5, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xD8, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD8, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x53, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x56, 0xDD, 0x4E, /* 0x60-0x63 */ + 0x00, 0x00, 0xDD, 0x50, 0x00, 0x00, 0xDD, 0x55, /* 0x64-0x67 */ + 0xDD, 0x54, 0xB7, 0x43, 0x00, 0x00, 0xD8, 0xDB, /* 0x68-0x6B */ + 0xDD, 0x52, 0x00, 0x00, 0x00, 0x00, 0xB7, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDD, 0x4D, 0xDD, 0x51, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x74-0x77 */ + 0x00, 0x00, 0xE1, 0xB0, 0xE1, 0xA7, 0x00, 0x00, /* 0x78-0x7B */ + 0xE1, 0xAE, 0xE1, 0xA5, 0xE1, 0xAD, 0xE1, 0xB1, /* 0x7C-0x7F */ + + 0xE1, 0xA4, 0xE1, 0xA8, 0xE1, 0xA3, 0x00, 0x00, /* 0x80-0x83 */ + 0xB9, 0xF1, 0x00, 0x00, 0xE1, 0xA6, 0xB9, 0xF2, /* 0x84-0x87 */ + 0xE1, 0xAC, 0xE1, 0xAB, 0xE1, 0xAA, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x65, 0xE5, 0x67, /* 0x90-0x93 */ + 0xBC, 0x6B, 0xE5, 0x68, 0x00, 0x00, 0xE5, 0x63, /* 0x94-0x97 */ + 0x00, 0x00, 0xE5, 0x62, 0xE5, 0x6C, 0x00, 0x00, /* 0x98-0x9B */ + 0xE5, 0x6A, 0xBC, 0x6A, 0xE5, 0x6D, 0xE5, 0x64, /* 0x9C-0x9F */ + 0xE5, 0x69, 0xE5, 0x6B, 0xE5, 0x66, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x61, /* 0xA4-0xA7 */ + 0xE9, 0x66, 0xE9, 0x60, 0xE9, 0x65, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE9, 0x5E, 0xE9, 0x68, 0xE9, 0x64, 0xE9, 0x69, /* 0xAC-0xAF */ + 0xE9, 0x63, 0xE9, 0x5F, 0xE9, 0x67, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE9, 0x6A, 0xE9, 0x62, 0x00, 0x00, 0xEC, 0xDA, /* 0xB4-0xB7 */ + 0xC0, 0xAF, 0x00, 0x00, 0xC0, 0xAD, 0x00, 0x00, /* 0xB8-0xBB */ + 0xC0, 0xAC, 0xC0, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xEF, 0xC4, 0x00, 0x00, 0xF1, 0x72, 0xF1, 0xFD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0x44, 0xF4, 0x45, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC4, 0x60, 0x00, 0x00, 0xF5, 0xC9, /* 0xC8-0xCB */ + 0x00, 0x00, 0xC4, 0xDE, 0x00, 0x00, 0xF5, 0xCA, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF6, 0xDE, 0xC5, 0x72, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC5, 0x71, 0xF6, 0xDD, 0xC5, 0xC9, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xF7, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA4, 0x74, 0xA6, 0x7B, 0xC9, 0xDA, /* 0xDC-0xDF */ + 0xCA, 0xCA, 0xA8, 0xB5, 0xB1, 0x5F, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA4, 0x75, 0xA5, 0xAA, 0xA5, 0xA9, /* 0xE4-0xE7 */ + 0xA5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xC5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xAE, 0x74, 0x00, 0x00, /* 0xEC-0xEF */ + 0xDD, 0x57, 0xA4, 0x76, 0xA4, 0x77, 0xA4, 0x78, /* 0xF0-0xF3 */ + 0xA4, 0xDA, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xD1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCE, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xB4, 0x53, 0xA4, 0x79, 0xC9, 0x5D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xAB, 0xA5, 0xAC, /* 0x00-0x03 */ + 0xC9, 0x78, 0x00, 0x00, 0xA6, 0x7C, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCB, 0x00, 0x00, /* 0x08-0x0B */ + 0xA7, 0xC6, 0x00, 0x00, 0xCA, 0xCC, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA9, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xCC, 0x6E, 0xA9, 0xAC, 0xA9, 0xAB, 0xCC, 0x6D, /* 0x14-0x17 */ + 0xA9, 0xA9, 0xCC, 0x6F, 0xA9, 0xAA, 0xA9, 0xAD, /* 0x18-0x1B */ + 0x00, 0x00, 0xAB, 0xD2, 0x00, 0x00, 0xAB, 0xD4, /* 0x1C-0x1F */ + 0xCE, 0xB3, 0xCE, 0xB0, 0xCE, 0xB1, 0xCE, 0xB2, /* 0x20-0x23 */ + 0xCE, 0xB4, 0xAB, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD1, 0x74, 0xD1, 0x73, 0x00, 0x00, 0xAE, 0x76, /* 0x28-0x2B */ + 0x00, 0x00, 0xAE, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x62, /* 0x30-0x33 */ + 0xD5, 0x46, 0x00, 0x00, 0xB1, 0x61, 0xB1, 0x63, /* 0x34-0x37 */ + 0xB1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB4, 0x55, 0xD5, 0x45, 0x00, 0x00, /* 0x3C-0x3F */ + 0xB4, 0x56, 0xD8, 0xF3, 0x00, 0x00, 0xB4, 0x57, /* 0x40-0x43 */ + 0xD8, 0xF2, 0xB4, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x5A, 0xDD, 0x5C, /* 0x48-0x4B */ + 0xB7, 0x45, 0xDD, 0x5B, 0xDD, 0x59, 0xDD, 0x58, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB4, /* 0x50-0x53 */ + 0xB9, 0xF7, 0xB9, 0xF5, 0x00, 0x00, 0xB9, 0xF6, /* 0x54-0x57 */ + 0xE1, 0xB2, 0xE1, 0xB3, 0x00, 0x00, 0xB9, 0xF3, /* 0x58-0x5B */ + 0xE5, 0x71, 0xE5, 0x6F, 0x00, 0x00, 0xBC, 0x6D, /* 0x5C-0x5F */ + 0xE5, 0x70, 0xBC, 0x6E, 0xBC, 0x6C, 0xB9, 0xF4, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x6D, 0xE9, 0x6B, /* 0x64-0x67 */ + 0xE9, 0x6C, 0xE5, 0x6E, 0xEC, 0xDC, 0xC0, 0xB0, /* 0x68-0x6B */ + 0xEC, 0xDB, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x6E, /* 0x6C-0x6F */ + 0xF1, 0xFE, 0x00, 0x00, 0xA4, 0x7A, 0xA5, 0xAD, /* 0x70-0x73 */ + 0xA6, 0x7E, 0xC9, 0xDB, 0xA6, 0x7D, 0x00, 0x00, /* 0x74-0x77 */ + 0xA9, 0xAF, 0xB7, 0x46, 0x00, 0x00, 0xA4, 0xDB, /* 0x78-0x7B */ + 0xA5, 0xAE, 0xAB, 0xD5, 0xB4, 0x58, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xC9, 0x79, 0x00, 0x00, 0xC9, 0x7A, 0x00, 0x00, /* 0x80-0x83 */ + 0xC9, 0xDC, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xC8, /* 0x84-0x87 */ + 0xCA, 0xD0, 0xCA, 0xCE, 0xA7, 0xC9, 0xCA, 0xCD, /* 0x88-0x8B */ + 0xCA, 0xCF, 0xCA, 0xD1, 0x00, 0x00, 0xA7, 0xC7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA9, 0xB3, 0xA9, 0xB4, 0xA9, 0xB1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xB0, 0xCE, 0xB8, /* 0x98-0x9B */ + 0xA9, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAB, 0xD6, 0x00, 0x00, 0xCE, 0xB7, 0xCE, 0xB9, /* 0xA0-0xA3 */ + 0xCE, 0xB6, 0xCE, 0xBA, 0xAB, 0xD7, 0xAE, 0x79, /* 0xA4-0xA7 */ + 0xD1, 0x75, 0x00, 0x00, 0xD1, 0x77, 0xAE, 0x77, /* 0xA8-0xAB */ + 0xD1, 0x78, 0xAE, 0x78, 0xD1, 0x76, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCE, 0xB5, 0xD5, 0x47, 0xD5, 0x4A, 0xD5, 0x4B, /* 0xB0-0xB3 */ + 0xD5, 0x48, 0xB1, 0x67, 0xB1, 0x66, 0xB1, 0x64, /* 0xB4-0xB7 */ + 0xB1, 0x65, 0xD5, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0x68, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xB4, 0x5A, 0xB4, 0x5B, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB4, 0x5C, 0xDD, 0x5D, 0xDD, 0x5F, 0xDD, 0x61, /* 0xC4-0xC7 */ + 0xB7, 0x48, 0xB7, 0x47, 0xB4, 0x59, 0xDD, 0x60, /* 0xC8-0xCB */ + 0xDD, 0x5E, 0x00, 0x00, 0xE1, 0xB8, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE1, 0xB6, 0xE1, 0xBC, 0xB9, 0xF8, /* 0xD0-0xD3 */ + 0xE1, 0xBD, 0xE1, 0xBA, 0xB9, 0xF9, 0xE1, 0xB7, /* 0xD4-0xD7 */ + 0xE1, 0xB5, 0xE1, 0xBB, 0xBC, 0x70, 0xE5, 0x73, /* 0xD8-0xDB */ + 0xE1, 0xB9, 0xBC, 0x72, 0xE5, 0x74, 0xBC, 0x71, /* 0xDC-0xDF */ + 0xBC, 0x74, 0xE5, 0x75, 0xBC, 0x6F, 0xBC, 0x73, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE9, 0x73, 0xE9, 0x71, 0xE9, 0x70, /* 0xE4-0xE7 */ + 0xE9, 0x72, 0xE9, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xC3, 0x66, 0x00, 0x00, 0xF4, 0x46, 0xF4, 0x47, /* 0xEC-0xEF */ + 0x00, 0x00, 0xF5, 0xCB, 0xF6, 0xDF, 0xC6, 0x55, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xB5, 0xA7, 0xCA, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xD8, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x7B, 0xA4, 0xDC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0xA5, 0xAF, 0xC9, 0xDD, 0x00, 0x00, /* 0x00-0x03 */ + 0xA7, 0xCB, 0xCA, 0xD2, 0x00, 0x00, 0xCE, 0xBB, /* 0x04-0x07 */ + 0xAB, 0xD9, 0x00, 0x00, 0xB9, 0xFA, 0xA4, 0x7C, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xA1, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x49, 0xA4, 0x7D, /* 0x10-0x13 */ + 0xA4, 0xDD, 0xA4, 0xDE, 0x00, 0x00, 0xA5, 0xB1, /* 0x14-0x17 */ + 0xA5, 0xB0, 0x00, 0x00, 0xC9, 0xDE, 0xA6, 0xA2, /* 0x18-0x1B */ + 0x00, 0x00, 0xCA, 0xD3, 0x00, 0x00, 0xA7, 0xCC, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x71, 0xCC, 0x72, /* 0x20-0x23 */ + 0xCC, 0x73, 0x00, 0x00, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x24-0x27 */ + 0xCC, 0x70, 0xA9, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xAB, 0xDA, 0xCE, 0xBC, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD1, 0x7A, 0xAE, 0x7A, 0x00, 0x00, 0xD1, 0x79, /* 0x30-0x33 */ + 0x00, 0x00, 0xB1, 0x69, 0xD5, 0x4C, 0xB1, 0x6A, /* 0x34-0x37 */ + 0xD5, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB4, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDD, 0x62, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBF, /* 0x40-0x43 */ + 0xE1, 0xBE, 0x00, 0x00, 0xB9, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xBC, 0x75, 0xE5, 0x76, 0xBE, 0xCA, 0xE9, 0x74, /* 0x48-0x4B */ + 0xC0, 0xB1, 0x00, 0x00, 0xC5, 0x73, 0xF7, 0xD8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCC, 0x74, 0x00, 0x00, 0xCE, 0xBD, 0xB1, 0x6B, /* 0x54-0x57 */ + 0xD8, 0xF4, 0xB7, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xC2, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xCE, 0x00, 0x00, /* 0x60-0x63 */ + 0xA7, 0xCD, 0xAB, 0xDB, 0x00, 0x00, 0xD1, 0x7B, /* 0x64-0x67 */ + 0x00, 0x00, 0xB1, 0x6D, 0xB3, 0x43, 0xB1, 0x6E, /* 0x68-0x6B */ + 0xB1, 0x6C, 0xB4, 0x5E, 0x00, 0x00, 0xE1, 0xC0, /* 0x6C-0x6F */ + 0xB9, 0xFC, 0xBC, 0x76, 0x00, 0x00, 0xC9, 0x4C, /* 0x70-0x73 */ + 0xC9, 0xDF, 0x00, 0x00, 0xCA, 0xD5, 0xA7, 0xCF, /* 0x74-0x77 */ + 0xCA, 0xD4, 0xA7, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xA9, 0xBC, 0xCC, 0x77, 0xCC, 0x76, 0xA9, 0xBB, /* 0x7C-0x7F */ + + 0xA9, 0xB9, 0xA9, 0xBA, 0xCC, 0x75, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xAB, 0xDD, 0xCE, 0xBE, 0xAB, 0xE0, /* 0x84-0x87 */ + 0xAB, 0xDC, 0xAB, 0xE2, 0xAB, 0xDE, 0xAB, 0xDF, /* 0x88-0x8B */ + 0xAB, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xAE, 0x7D, 0xAE, 0x7C, 0xAE, 0x7B, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x4F, 0xB1, 0x6F, /* 0x94-0x97 */ + 0xB1, 0x72, 0xB1, 0x70, 0x00, 0x00, 0xD5, 0x4E, /* 0x98-0x9B */ + 0xB1, 0x75, 0x00, 0x00, 0xB1, 0x71, 0xD5, 0x50, /* 0x9C-0x9F */ + 0xB1, 0x74, 0xB1, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xF6, 0xD8, 0xF5, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB4, 0x61, 0xB4, 0x5F, 0xB4, 0x60, 0xD8, 0xF7, /* 0xA8-0xAB */ + 0xB7, 0x4B, 0xDD, 0x64, 0xB7, 0x4C, 0xDD, 0x63, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x77, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xBC, 0x78, 0xE1, 0xC1, 0xBC, 0x77, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xB9, 0xFD, 0x00, 0x00, 0xEC, 0xDE, /* 0xB8-0xBB */ + 0xE9, 0x75, 0xC0, 0xB2, 0xEC, 0xDD, 0xF2, 0x40, /* 0xBC-0xBF */ + 0xF4, 0x48, 0xF4, 0x49, 0x00, 0x00, 0xA4, 0xDF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xA5, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC9, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xA7, 0xD2, 0xA7, 0xD4, 0x00, 0x00, 0xC9, 0xE2, /* 0xCC-0xCF */ + 0xCA, 0xD8, 0xCA, 0xD7, 0xCA, 0xD6, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC9, 0xE1, 0xC9, 0xE0, 0xA6, 0xA4, 0xA7, 0xD3, /* 0xD4-0xD7 */ + 0xA7, 0xD1, 0xA6, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA9, 0xBD, 0xCC, 0x78, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA9, 0xBE, 0xCA, 0xDD, 0x00, 0x00, 0xCA, 0xDF, /* 0xE0-0xE3 */ + 0xCA, 0xDE, 0xCC, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCA, 0xDA, 0x00, 0x00, 0xA7, 0xD8, 0xA7, 0xD6, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCA, 0xD9, 0xCA, 0xDB, 0xCA, 0xE1, /* 0xEC-0xEF */ + 0x00, 0x00, 0xA7, 0xD5, 0x00, 0x00, 0xCA, 0xDC, /* 0xF0-0xF3 */ + 0xCA, 0xE5, 0xA9, 0xC0, 0x00, 0x00, 0xCA, 0xE2, /* 0xF4-0xF7 */ + 0xA7, 0xD7, 0x00, 0x00, 0xCA, 0xE0, 0xCA, 0xE3, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA9, 0xBF, 0x00, 0x00, 0xA9, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0xCA, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCC, 0xAF, 0xCC, 0xA2, 0xCC, 0x7E, /* 0x08-0x0B */ + 0xCC, 0xAE, 0xCC, 0xA9, 0xAB, 0xE7, 0xA9, 0xC2, /* 0x0C-0x0F */ + 0xCC, 0xAA, 0xCC, 0xAD, 0xAB, 0xE3, 0xCC, 0xAC, /* 0x10-0x13 */ + 0xA9, 0xC3, 0xA9, 0xC8, 0xA9, 0xC6, 0xCC, 0xA3, /* 0x14-0x17 */ + 0x00, 0x00, 0xCC, 0x7C, 0xCC, 0xA5, 0xA9, 0xCD, /* 0x18-0x1B */ + 0xCC, 0xB0, 0xAB, 0xE4, 0xCC, 0xA6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xAB, 0xE5, 0xA9, 0xC9, 0xCC, 0xA8, 0x00, 0x00, /* 0x20-0x23 */ + 0xCE, 0xCD, 0xAB, 0xE6, 0xCC, 0x7B, 0xA9, 0xCA, /* 0x24-0x27 */ + 0xAB, 0xE8, 0xA9, 0xCB, 0xA9, 0xC7, 0xA9, 0xCC, /* 0x28-0x2B */ + 0xCC, 0xA7, 0xCC, 0x7A, 0xCC, 0xAB, 0xA9, 0xC4, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x7D, 0xCC, 0xA4, /* 0x30-0x33 */ + 0xCC, 0xA1, 0xA9, 0xC5, 0x00, 0x00, 0xCE, 0xBF, /* 0x34-0x37 */ + 0x00, 0x00, 0xCE, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCE, 0xCA, 0xD1, 0xA1, 0xCE, 0xCB, 0xAB, 0xEE, /* 0x40-0x43 */ + 0xCE, 0xCE, 0xCE, 0xC4, 0xAB, 0xED, 0xCE, 0xC6, /* 0x44-0x47 */ + 0x00, 0x00, 0xCE, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCE, 0xC9, 0xAB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xAE, 0xA3, 0x00, 0x00, 0xF9, 0xDA, 0xCE, 0xC5, /* 0x50-0x53 */ + 0xCE, 0xC1, 0xAE, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xCE, 0xCF, 0xAE, 0x7E, 0xD1, 0x7D, 0xCE, 0xC8, /* 0x58-0x5B */ + 0x00, 0x00, 0xD1, 0x7C, 0xCE, 0xC3, 0xCE, 0xCC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xEC, 0xAE, 0xA1, /* 0x60-0x63 */ + 0xAB, 0xF2, 0xAE, 0xA2, 0xCE, 0xD0, 0xD1, 0x7E, /* 0x64-0x67 */ + 0xAB, 0xEB, 0xAE, 0xA6, 0xAB, 0xF1, 0xAB, 0xF0, /* 0x68-0x6B */ + 0xAB, 0xEF, 0xAE, 0xA5, 0xCE, 0xD1, 0xAE, 0xA7, /* 0x6C-0x6F */ + 0xAB, 0xEA, 0x00, 0x00, 0xCE, 0xC2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x76, /* 0x7C-0x7F */ + + 0xD1, 0xA4, 0xD1, 0xA6, 0x00, 0x00, 0xD1, 0xA8, /* 0x80-0x83 */ + 0xAE, 0xA8, 0xAE, 0xAE, 0xD5, 0x53, 0xD1, 0xAC, /* 0x84-0x87 */ + 0xD1, 0xA3, 0xB1, 0x78, 0xD5, 0x51, 0x00, 0x00, /* 0x88-0x8B */ + 0xAE, 0xAD, 0xAE, 0xAB, 0xD1, 0xAE, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD5, 0x52, 0x00, 0x00, 0xD1, 0xA5, 0x00, 0x00, /* 0x90-0x93 */ + 0xAE, 0xAC, 0xD1, 0xA9, 0xAE, 0xAF, 0xD1, 0xAB, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xAE, 0xAA, 0xD1, 0xAA, /* 0x98-0x9B */ + 0xD1, 0xAD, 0xD1, 0xA7, 0x00, 0x00, 0xAE, 0xA9, /* 0x9C-0x9F */ + 0xB1, 0x79, 0x00, 0x00, 0xD1, 0xA2, 0xB1, 0x77, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB1, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xD5, 0x55, 0xD5, 0x5E, 0xB4, 0x64, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xB1, 0x7C, 0xB1, 0xA3, 0xB4, 0x65, 0xD5, 0x60, /* 0xB4-0xB7 */ + 0xB1, 0xAA, 0xD8, 0xF9, 0xD5, 0x56, 0xB1, 0xA2, /* 0xB8-0xBB */ + 0xB1, 0xA5, 0xB1, 0x7E, 0xD5, 0x54, 0xD5, 0x62, /* 0xBC-0xBF */ + 0xD5, 0x65, 0xD9, 0x49, 0x00, 0x00, 0xD5, 0x63, /* 0xC0-0xC3 */ + 0xD8, 0xFD, 0xB1, 0xA1, 0xB1, 0xA8, 0xB1, 0xAC, /* 0xC4-0xC7 */ + 0xD5, 0x5D, 0xD8, 0xF8, 0xD5, 0x61, 0xB1, 0x7B, /* 0xC8-0xCB */ + 0xD8, 0xFA, 0xD5, 0x64, 0xD8, 0xFC, 0xD5, 0x59, /* 0xCC-0xCF */ + 0x00, 0x00, 0xB4, 0x62, 0x00, 0x00, 0xD5, 0x57, /* 0xD0-0xD3 */ + 0xD5, 0x58, 0xB1, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xB1, 0xA6, 0xD5, 0x5B, 0xB1, 0xAB, 0xD5, 0x5F, /* 0xD8-0xDB */ + 0xB1, 0xA4, 0xD5, 0x5C, 0x00, 0x00, 0xB1, 0xA9, /* 0xDC-0xDF */ + 0xB4, 0x66, 0xB4, 0x63, 0xD8, 0xFB, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD5, 0x5A, 0x00, 0x00, 0xB1, 0x7D, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xB4, 0x6B, 0xB4, 0x6F, 0xD9, 0x40, 0xB7, 0x51, /* 0xF0-0xF3 */ + 0xB4, 0x6D, 0xD9, 0x44, 0xB4, 0x71, 0xDD, 0x65, /* 0xF4-0xF7 */ + 0xD9, 0x46, 0xB7, 0x53, 0xB4, 0x69, 0xB4, 0x6C, /* 0xF8-0xFB */ + 0xD9, 0x47, 0x00, 0x00, 0xD9, 0x48, 0xD9, 0x4E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_61[512] = { + 0xB4, 0x73, 0xB7, 0x54, 0x00, 0x00, 0xD9, 0x4A, /* 0x00-0x03 */ + 0xD9, 0x4F, 0xD9, 0x43, 0xB7, 0x5E, 0x00, 0x00, /* 0x04-0x07 */ + 0xB7, 0x55, 0xB4, 0x72, 0xD9, 0x41, 0xD9, 0x50, /* 0x08-0x0B */ + 0x00, 0x00, 0xB7, 0x5D, 0xB4, 0x70, 0xB7, 0x4E, /* 0x0C-0x0F */ + 0xD9, 0x4D, 0x00, 0x00, 0xB4, 0x74, 0xD9, 0x45, /* 0x10-0x13 */ + 0xD8, 0xFE, 0xB4, 0x6A, 0xD9, 0x42, 0x00, 0x00, /* 0x14-0x17 */ + 0xD9, 0x4B, 0x00, 0x00, 0xB7, 0x4D, 0xB7, 0x52, /* 0x18-0x1B */ + 0xB4, 0x67, 0xD9, 0x4C, 0x00, 0x00, 0xB7, 0x50, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x68, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB7, 0x5C, /* 0x24-0x27 */ + 0xE1, 0xC3, 0xDD, 0x70, 0x00, 0x00, 0xDD, 0x68, /* 0x28-0x2B */ + 0xE1, 0xC2, 0x00, 0x00, 0xDD, 0x6C, 0xDD, 0x6E, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x6B, 0x00, 0x00, /* 0x30-0x33 */ + 0xB7, 0x5B, 0x00, 0x00, 0xDD, 0x6A, 0xB7, 0x5F, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD2, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x5A, 0xBA, 0x40, /* 0x3C-0x3F */ + 0xDD, 0x71, 0xE1, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xB7, 0x58, 0xDD, 0x69, 0xDD, 0x6D, 0xB9, 0xFE, /* 0x44-0x47 */ + 0xB7, 0x4F, 0xDD, 0x66, 0xDD, 0x67, 0xBA, 0x41, /* 0x48-0x4B */ + 0xB7, 0x57, 0xB7, 0x59, 0xB7, 0x56, 0xDD, 0x6F, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC8, 0xE1, 0xC9, /* 0x50-0x53 */ + 0xE1, 0xCE, 0xBC, 0x7D, 0xE1, 0xD5, 0x00, 0x00, /* 0x54-0x57 */ + 0xBA, 0x47, 0x00, 0x00, 0xBA, 0x46, 0xE1, 0xD0, /* 0x58-0x5B */ + 0x00, 0x00, 0xBC, 0x7C, 0xE1, 0xC5, 0xBA, 0x45, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE1, 0xD4, 0xBA, 0x43, 0xBA, 0x44, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xD1, 0xE5, 0xAA, 0xBC, 0x7A, /* 0x64-0x67 */ + 0xB4, 0x6E, 0x00, 0x00, 0xE1, 0xD3, 0xBC, 0xA3, /* 0x68-0x6B */ + 0xE1, 0xCB, 0x00, 0x00, 0xBC, 0x7B, 0x00, 0x00, /* 0x6C-0x6F */ + 0xBC, 0xA2, 0xE1, 0xC6, 0xE1, 0xCA, 0xE1, 0xC7, /* 0x70-0x73 */ + 0xE1, 0xCD, 0xBA, 0x48, 0xBC, 0x79, 0xBA, 0x42, /* 0x74-0x77 */ + 0x00, 0x00, 0xE5, 0x7A, 0xE1, 0xCF, 0x00, 0x00, /* 0x78-0x7B */ + 0xBC, 0xA1, 0x00, 0x00, 0xBC, 0xA4, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE1, 0xCC, 0x00, 0x00, 0xBC, 0x7E, 0xE5, 0x79, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE5, 0x7E, 0xBE, 0xCE, 0xE5, 0x78, /* 0x88-0x8B */ + 0xE9, 0xA3, 0xE5, 0xA9, 0xBC, 0xA8, 0x00, 0x00, /* 0x8C-0x8F */ + 0xBC, 0xA6, 0xBE, 0xCC, 0xE5, 0xA6, 0xE5, 0xA2, /* 0x90-0x93 */ + 0xBC, 0xAC, 0x00, 0x00, 0xE9, 0x78, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xAA, 0xE5, 0xA1, /* 0x98-0x9B */ + 0x00, 0x00, 0xE9, 0x76, 0x00, 0x00, 0xE5, 0xA5, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE5, 0xA8, 0xE5, 0x7D, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xBC, 0xAB, 0x00, 0x00, 0x00, 0x00, 0xBC, 0xA5, /* 0xA4-0xA7 */ + 0xE9, 0x77, 0xBE, 0xCD, 0xE5, 0xA7, 0xBC, 0xA7, /* 0xA8-0xAB */ + 0xBC, 0xA9, 0xE5, 0xA4, 0xBC, 0xAD, 0xE5, 0xA3, /* 0xAC-0xAF */ + 0xE5, 0x7C, 0xE5, 0x7B, 0xBE, 0xCB, 0xE5, 0xAB, /* 0xB0-0xB3 */ + 0xE9, 0x7A, 0xEC, 0xE0, 0xBE, 0xD0, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE9, 0xA2, 0x00, 0x00, 0xE9, 0x7E, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEC, 0xE1, 0x00, 0x00, 0xBE, 0xD1, 0xE9, 0xA1, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE9, 0x7C, 0xC0, 0xB4, 0xEC, 0xDF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE9, 0x79, 0xE9, 0x7B, 0xC0, 0xB5, /* 0xC4-0xC7 */ + 0xBE, 0xD3, 0xC0, 0xB3, 0xBE, 0xD2, 0xC0, 0xB7, /* 0xC8-0xCB */ + 0xE9, 0x7D, 0xBE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xCF, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEF, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE7, 0xEF, 0xC8, /* 0xDC-0xDF */ + 0xEC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x56, /* 0xE0-0xE3 */ + 0xEC, 0xE5, 0xEC, 0xE4, 0xC0, 0xB6, 0xEC, 0xE2, /* 0xE4-0xE7 */ + 0xEC, 0xE6, 0xEF, 0xD0, 0xEF, 0xCC, 0xEF, 0xCE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEF, 0xC9, 0xEF, 0xCA, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEF, 0xCD, 0xEF, 0xCB, 0xC3, 0x67, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC3, 0x6A, 0xC3, 0x69, 0xC3, 0x68, /* 0xF4-0xF7 */ + 0xC4, 0x61, 0xF4, 0x4A, 0xC4, 0x62, 0xF2, 0x41, /* 0xF8-0xFB */ + 0xC4, 0xDF, 0xF5, 0xCC, 0xC4, 0xE0, 0xC5, 0x74, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0xC5, 0xCA, 0xF7, 0xD9, 0x00, 0x00, 0xF7, 0xDA, /* 0x00-0x03 */ + 0xF7, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBA, /* 0x04-0x07 */ + 0xA4, 0xE0, 0xC9, 0x7C, 0xA5, 0xB3, 0x00, 0x00, /* 0x08-0x0B */ + 0xA6, 0xA6, 0xA6, 0xA7, 0xA6, 0xA5, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA6, 0xA8, 0xA7, 0xDA, 0xA7, 0xD9, 0x00, 0x00, /* 0x10-0x13 */ + 0xCC, 0xB1, 0xA9, 0xCF, 0xA9, 0xCE, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD1, 0xAF, 0xB1, 0xAD, 0xB1, 0xAE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x75, /* 0x1C-0x1F */ + 0xDD, 0x72, 0xB7, 0x60, 0xB7, 0x61, 0xDD, 0x74, /* 0x20-0x23 */ + 0xDD, 0x76, 0xDD, 0x75, 0x00, 0x00, 0xE1, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0xE1, 0xD6, 0xBA, 0x49, 0xE1, 0xD8, /* 0x28-0x2B */ + 0x00, 0x00, 0xE5, 0xAC, 0xBC, 0xAE, 0x00, 0x00, /* 0x2C-0x2F */ + 0xBE, 0xD4, 0x00, 0x00, 0xC0, 0xB8, 0xC2, 0x57, /* 0x30-0x33 */ + 0xC0, 0xB9, 0x00, 0x00, 0xA4, 0xE1, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE6, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xCC, 0xB2, 0xA9, 0xD1, 0xA9, 0xD0, /* 0x3C-0x3F */ + 0xA9, 0xD2, 0xAB, 0xF3, 0xCE, 0xD2, 0xCE, 0xD3, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB0, 0xAE, 0xB0, /* 0x44-0x47 */ + 0xB1, 0xAF, 0xB4, 0x76, 0xD9, 0x51, 0xA4, 0xE2, /* 0x48-0x4B */ + 0x00, 0x00, 0xA4, 0x7E, 0xA4, 0xE3, 0x00, 0x00, /* 0x4C-0x4F */ + 0xC9, 0x7D, 0xA5, 0xB7, 0xA5, 0xB6, 0xA5, 0xB4, /* 0x50-0x53 */ + 0xA5, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xA6, 0xAB, 0xC9, 0xE9, 0xC9, 0xEB, 0xA6, 0xAA, /* 0x58-0x5B */ + 0xC9, 0xE3, 0x00, 0x00, 0xC9, 0xE4, 0x00, 0x00, /* 0x5C-0x5F */ + 0xC9, 0xEA, 0xC9, 0xE6, 0xC9, 0xE8, 0xA6, 0xA9, /* 0x60-0x63 */ + 0xC9, 0xE5, 0xC9, 0xEC, 0xC9, 0xE7, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xA7, 0xE1, 0xA7, 0xEA, 0xA7, 0xE8, /* 0x6C-0x6F */ + 0xCA, 0xF0, 0xCA, 0xED, 0xCA, 0xF5, 0xA7, 0xE6, /* 0x70-0x73 */ + 0xCA, 0xF6, 0x00, 0x00, 0xA7, 0xDF, 0xCA, 0xF3, /* 0x74-0x77 */ + 0x00, 0x00, 0xA7, 0xE5, 0xCA, 0xEF, 0xCA, 0xEE, /* 0x78-0x7B */ + 0xA7, 0xE3, 0xCA, 0xF4, 0xA7, 0xE4, 0xA9, 0xD3, /* 0x7C-0x7F */ + + 0xA7, 0xDE, 0xCA, 0xF1, 0x00, 0x00, 0xCA, 0xE7, /* 0x80-0x83 */ + 0xA7, 0xDB, 0x00, 0x00, 0xA7, 0xEE, 0xCA, 0xEC, /* 0x84-0x87 */ + 0xCA, 0xF2, 0xA7, 0xE0, 0xA7, 0xE2, 0x00, 0x00, /* 0x88-0x8B */ + 0xCA, 0xE8, 0x00, 0x00, 0xCA, 0xE9, 0xCA, 0xEA, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA7, 0xED, 0xA7, 0xE7, 0xA7, 0xEC, /* 0x90-0x93 */ + 0xCA, 0xEB, 0xA7, 0xEB, 0xA7, 0xDD, 0xA7, 0xDC, /* 0x94-0x97 */ + 0xA7, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA9, 0xE1, 0xCC, 0xBE, 0xCC, 0xB7, 0xA9, 0xDC, /* 0xA8-0xAB */ + 0xA9, 0xEF, 0xCC, 0xB3, 0xCC, 0xBA, 0xCC, 0xBC, /* 0xAC-0xAF */ + 0xCC, 0xBF, 0xA9, 0xEA, 0x00, 0x00, 0xCC, 0xBB, /* 0xB0-0xB3 */ + 0xCC, 0xB4, 0xA9, 0xE8, 0xCC, 0xB8, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCC, 0xC0, 0xA9, 0xD9, 0x00, 0x00, 0xCC, 0xBD, /* 0xB8-0xBB */ + 0xA9, 0xE3, 0xA9, 0xE2, 0xCC, 0xB6, 0xA9, 0xD7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xD8, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA9, 0xD6, 0x00, 0x00, 0xA9, 0xEE, 0xA9, 0xE6, /* 0xC4-0xC7 */ + 0xA9, 0xE0, 0xA9, 0xD4, 0xCC, 0xB9, 0xA9, 0xDF, /* 0xC8-0xCB */ + 0xA9, 0xD5, 0xA9, 0xE7, 0xA9, 0xF0, 0xCE, 0xD4, /* 0xCC-0xCF */ + 0xA9, 0xE4, 0xCC, 0xB5, 0xA9, 0xDA, 0xA9, 0xDD, /* 0xD0-0xD3 */ + 0xA9, 0xDE, 0x00, 0x00, 0xA9, 0xEC, 0xA9, 0xED, /* 0xD4-0xD7 */ + 0xA9, 0xEB, 0xA9, 0xE5, 0xA9, 0xE9, 0xA9, 0xDB, /* 0xD8-0xDB */ + 0xAB, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xDA, /* 0xE8-0xEB */ + 0xAC, 0x41, 0xAB, 0xF8, 0xAB, 0xFA, 0xAC, 0x40, /* 0xEC-0xEF */ + 0xCE, 0xE6, 0xAB, 0xFD, 0xD1, 0xB1, 0xAE, 0xB1, /* 0xF0-0xF3 */ + 0xAC, 0x43, 0xCE, 0xD7, 0xCE, 0xDF, 0xAB, 0xFE, /* 0xF4-0xF7 */ + 0xCE, 0xDE, 0xCE, 0xDB, 0xCE, 0xE3, 0xCE, 0xE5, /* 0xF8-0xFB */ + 0xAB, 0xF7, 0xAB, 0xFB, 0xAC, 0x42, 0xAE, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0xCE, 0xE0, 0xAB, 0xF9, 0xAC, 0x45, 0xCE, 0xD9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xFC, /* 0x04-0x07 */ + 0xAE, 0xB2, 0xAB, 0xF6, 0x00, 0x00, 0xCE, 0xD6, /* 0x08-0x0B */ + 0xCE, 0xDD, 0xCE, 0xD5, 0xCE, 0xD8, 0xCE, 0xDC, /* 0x0C-0x0F */ + 0xD1, 0xB2, 0xAC, 0x44, 0x00, 0x00, 0xCE, 0xE1, /* 0x10-0x13 */ + 0xCE, 0xE2, 0xCE, 0xE4, 0xAB, 0xF5, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xAE, 0xC1, 0xD1, 0xBE, 0xAE, 0xBF, 0xAE, 0xC0, /* 0x28-0x2B */ + 0xD1, 0xB4, 0xD1, 0xC4, 0x00, 0x00, 0xAE, 0xB6, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x66, 0xD1, 0xC6, /* 0x30-0x33 */ + 0xD1, 0xC0, 0x00, 0x00, 0xD1, 0xB7, 0x00, 0x00, /* 0x34-0x37 */ + 0xD1, 0xC9, 0xD1, 0xBA, 0xAE, 0xBC, 0xD5, 0x7D, /* 0x38-0x3B */ + 0xD1, 0xBD, 0xAE, 0xBE, 0xAE, 0xB5, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD1, 0xCB, 0xD1, 0xBF, 0xAE, 0xB8, 0xD1, 0xB8, /* 0x40-0x43 */ + 0xD1, 0xB5, 0xD1, 0xB6, 0xAE, 0xB9, 0xD1, 0xC5, /* 0x44-0x47 */ + 0xD1, 0xCC, 0xAE, 0xBB, 0xD1, 0xBC, 0xD1, 0xBB, /* 0x48-0x4B */ + 0xAE, 0xC3, 0xAE, 0xC2, 0xAE, 0xB4, 0xAE, 0xBA, /* 0x4C-0x4F */ + 0xAE, 0xBD, 0xD1, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD1, 0xC2, 0xAE, 0xB7, 0xD1, 0xB3, 0xD1, 0xCA, /* 0x54-0x57 */ + 0xD1, 0xC1, 0xD1, 0xC3, 0xD1, 0xC7, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD5, 0x67, 0x00, 0x00, 0xB1, 0xB7, /* 0x64-0x67 */ + 0xB1, 0xCB, 0xB1, 0xCA, 0x00, 0x00, 0xB1, 0xBF, /* 0x68-0x6B */ + 0x00, 0x00, 0xD5, 0x79, 0xD5, 0x75, 0xD5, 0x72, /* 0x6C-0x6F */ + 0xD5, 0xA6, 0xB1, 0xBA, 0xB1, 0xB2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD5, 0x77, 0xB4, 0xA8, 0xB1, 0xB6, /* 0x74-0x77 */ + 0xD5, 0xA1, 0x00, 0x00, 0xB1, 0xCC, 0xB1, 0xC9, /* 0x78-0x7B */ + 0xD5, 0x7B, 0xD5, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xB1, 0xC8, 0xD5, 0xA3, 0xD5, 0x69, 0xB1, 0xBD, /* 0x80-0x83 */ + 0xB1, 0xC1, 0xD5, 0xA2, 0x00, 0x00, 0xD5, 0x73, /* 0x84-0x87 */ + 0xB1, 0xC2, 0xB1, 0xBC, 0xD5, 0x68, 0x00, 0x00, /* 0x88-0x8B */ + 0xB4, 0x78, 0xD5, 0xA5, 0xD5, 0x71, 0xB1, 0xC7, /* 0x8C-0x8F */ + 0xD5, 0x74, 0xD5, 0xA4, 0xB1, 0xC6, 0x00, 0x00, /* 0x90-0x93 */ + 0xD9, 0x52, 0x00, 0x00, 0xB1, 0xB3, 0xD5, 0x6F, /* 0x94-0x97 */ + 0xB1, 0xB8, 0xB1, 0xC3, 0x00, 0x00, 0xB1, 0xBE, /* 0x98-0x9B */ + 0xD5, 0x78, 0xD5, 0x6E, 0xD5, 0x6C, 0xD5, 0x7E, /* 0x9C-0x9F */ + 0xB1, 0xB0, 0xB1, 0xC4, 0xB1, 0xB4, 0xB4, 0x77, /* 0xA0-0xA3 */ + 0xD5, 0x7C, 0xB1, 0xB5, 0x00, 0x00, 0xB1, 0xB1, /* 0xA4-0xA7 */ + 0xB1, 0xC0, 0xB1, 0xBB, 0xB1, 0xB9, 0xD5, 0x70, /* 0xA8-0xAB */ + 0xB1, 0xC5, 0xD5, 0x6D, 0xD5, 0x7A, 0xD5, 0x76, /* 0xAC-0xAF */ + 0xD9, 0x54, 0xD9, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD5, 0x6B, 0xD9, 0x64, 0x00, 0x00, /* 0xBC-0xBF */ + 0xB4, 0x7A, 0x00, 0x00, 0xD9, 0x6A, 0xD9, 0x59, /* 0xC0-0xC3 */ + 0xD9, 0x67, 0xDD, 0x77, 0xB4, 0x7D, 0xD9, 0x6B, /* 0xC4-0xC7 */ + 0xD9, 0x6E, 0xB4, 0x7C, 0xD9, 0x5C, 0xD9, 0x6D, /* 0xC8-0xCB */ + 0xD9, 0x6C, 0xB4, 0x7E, 0xD9, 0x55, 0xB4, 0x79, /* 0xCC-0xCF */ + 0xB4, 0xA3, 0x00, 0x00, 0xB4, 0xA1, 0xD9, 0x69, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xD9, 0x5F, 0xB4, 0xA5, 0xD9, 0x70, /* 0xD4-0xD7 */ + 0xD9, 0x68, 0xD9, 0x71, 0xB4, 0xAD, 0xB4, 0xAB, /* 0xD8-0xDB */ + 0xD9, 0x66, 0xD9, 0x65, 0x00, 0x00, 0xD9, 0x63, /* 0xDC-0xDF */ + 0xD9, 0x5D, 0xB4, 0xA4, 0x00, 0x00, 0xB4, 0xA2, /* 0xE0-0xE3 */ + 0xD1, 0xB9, 0xD9, 0x56, 0x00, 0x00, 0xDD, 0xB7, /* 0xE4-0xE7 */ + 0xD9, 0x57, 0xB4, 0x7B, 0xB4, 0xAA, 0xDD, 0x79, /* 0xE8-0xEB */ + 0x00, 0x00, 0xB4, 0xA6, 0xB4, 0xA7, 0xD9, 0x58, /* 0xEC-0xEF */ + 0xD9, 0x6F, 0xDD, 0x78, 0xD9, 0x60, 0xD9, 0x5B, /* 0xF0-0xF3 */ + 0xB4, 0xA9, 0xD9, 0x61, 0xD9, 0x5E, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB4, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x70, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xDD, 0x7C, 0xDD, 0xB1, 0xDD, 0xB6, /* 0x08-0x0B */ + 0xDD, 0xAA, 0xB7, 0x6C, 0xDD, 0xBB, 0xB7, 0x69, /* 0x0C-0x0F */ + 0xDD, 0x7A, 0x00, 0x00, 0xDD, 0x7B, 0xB7, 0x62, /* 0x10-0x13 */ + 0xB7, 0x6B, 0xDD, 0xA4, 0xB7, 0x6E, 0xB7, 0x6F, /* 0x14-0x17 */ + 0xDD, 0xA5, 0x00, 0x00, 0xDD, 0xB2, 0xDD, 0xB8, /* 0x18-0x1B */ + 0xB7, 0x6A, 0x00, 0x00, 0xB7, 0x64, 0xDD, 0xA3, /* 0x1C-0x1F */ + 0xDD, 0x7D, 0xDD, 0xBA, 0xDD, 0xA8, 0xDD, 0xA9, /* 0x20-0x23 */ + 0xDD, 0x7E, 0xDD, 0xB4, 0xDD, 0xAB, 0xDD, 0xB5, /* 0x24-0x27 */ + 0xDD, 0xAD, 0x00, 0x00, 0xB7, 0x65, 0xE1, 0xD9, /* 0x28-0x2B */ + 0xB7, 0x68, 0xB7, 0x66, 0xDD, 0xB9, 0xDD, 0xB0, /* 0x2C-0x2F */ + 0xDD, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xA1, /* 0x30-0x33 */ + 0xBA, 0x53, 0xDD, 0xAF, 0xB7, 0x6D, 0xDD, 0xA7, /* 0x34-0x37 */ + 0x00, 0x00, 0xDD, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB7, 0x67, 0xB7, 0x63, 0xE1, 0xEE, /* 0x3C-0x3F */ + 0xDD, 0xB3, 0xDD, 0xAE, 0x00, 0x00, 0xDD, 0xA2, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE9, /* 0x48-0x4B */ + 0x00, 0x00, 0xE1, 0xDA, 0xE1, 0xE5, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE1, 0xEC, 0xBA, 0x51, 0xB4, 0xAC, 0xE1, 0xEA, /* 0x50-0x53 */ + 0xBA, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xBA, 0x4B, 0xE1, 0xF1, 0x00, 0x00, 0xE1, 0xDB, /* 0x58-0x5B */ + 0xE1, 0xE8, 0xE1, 0xDC, 0xE1, 0xE7, 0xBA, 0x4F, /* 0x5C-0x5F */ + 0xE1, 0xEB, 0xD9, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xF2, 0xE1, 0xE3, 0xBA, 0x52, /* 0x64-0x67 */ + 0xE5, 0xBA, 0xBC, 0xAF, 0x00, 0x00, 0xE1, 0xF0, /* 0x68-0x6B */ + 0xE1, 0xEF, 0xBA, 0x54, 0xE5, 0xAD, 0xBC, 0xB0, /* 0x6C-0x6F */ + 0xE5, 0xAE, 0x00, 0x00, 0xE1, 0xDF, 0xE1, 0xE0, /* 0x70-0x73 */ + 0xE1, 0xDD, 0xE1, 0xE2, 0xE1, 0xDE, 0xE1, 0xF3, /* 0x74-0x77 */ + 0xBA, 0x4E, 0xBC, 0xB1, 0xBA, 0x50, 0xBA, 0x55, /* 0x78-0x7B */ + 0x00, 0x00, 0xE1, 0xE1, 0x00, 0x00, 0xE1, 0xED, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE5, 0xB1, 0x00, 0x00, 0xBA, 0x4A, /* 0x84-0x87 */ + 0xBC, 0xB4, 0xE9, 0xAA, 0xE5, 0xB6, 0xE5, 0xB5, /* 0x88-0x8B */ + 0xE5, 0xB7, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB4, /* 0x8C-0x8F */ + 0xBC, 0xB5, 0x00, 0x00, 0xBC, 0xBB, 0xBC, 0xB8, /* 0x90-0x93 */ + 0x00, 0x00, 0xBC, 0xB9, 0xE5, 0xAF, 0xE5, 0xB2, /* 0x94-0x97 */ + 0xE5, 0xBC, 0xBC, 0xC1, 0xBC, 0xBF, 0x00, 0x00, /* 0x98-0x9B */ + 0xE5, 0xB3, 0xD9, 0x5A, 0xBC, 0xB2, 0xE5, 0xB9, /* 0x9C-0x9F */ + 0xE5, 0xB0, 0x00, 0x00, 0xBC, 0xC2, 0xE5, 0xB8, /* 0xA0-0xA3 */ + 0xBA, 0x4D, 0xBC, 0xB7, 0xE1, 0xE4, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xBC, 0xBA, 0x00, 0x00, 0xBC, 0xBE, /* 0xA8-0xAB */ + 0xBC, 0xC0, 0xBC, 0xBD, 0xBC, 0xBC, 0x00, 0x00, /* 0xAC-0xAF */ + 0xBC, 0xB6, 0xE5, 0xBB, 0xBC, 0xB3, 0xBC, 0xC3, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBE, 0xD8, /* 0xB8-0xBB */ + 0xBE, 0xD9, 0xE9, 0xA9, 0xBE, 0xE2, 0xBE, 0xDF, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBE, 0xD6, 0xBE, 0xDD, 0xE9, 0xAB, /* 0xC0-0xC3 */ + 0xBE, 0xDB, 0xBE, 0xD5, 0x00, 0x00, 0xBE, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE9, 0xA8, 0xC0, 0xBB, 0xBE, 0xD7, /* 0xC8-0xCB */ + 0x00, 0x00, 0xBE, 0xDE, 0xC0, 0xBA, 0xE9, 0xA7, /* 0xCC-0xCF */ + 0xE9, 0xA6, 0x00, 0x00, 0xBE, 0xE0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xBE, 0xE1, 0x00, 0x00, 0xE9, 0xA5, 0xE9, 0xA4, /* 0xD4-0xD7 */ + 0xC0, 0xBC, 0xE9, 0xAE, 0xBE, 0xDA, 0xE9, 0xAC, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xC0, 0xBD, 0x00, 0x00, 0xC0, 0xC2, 0xEC, 0xEA, /* 0xE0-0xE3 */ + 0xEC, 0xEC, 0x00, 0x00, 0xC0, 0xBF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEC, 0xED, 0xEC, 0xE9, 0x00, 0x00, 0xEC, 0xEB, /* 0xE8-0xEB */ + 0xC0, 0xC0, 0xC0, 0xC3, 0x00, 0x00, 0xEC, 0xE8, /* 0xEC-0xEF */ + 0xC0, 0xBE, 0xC0, 0xC1, 0xC2, 0x59, 0xE9, 0xAD, /* 0xF0-0xF3 */ + 0xC2, 0x58, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x5E, /* 0xF4-0xF7 */ + 0xEF, 0xD4, 0x00, 0x00, 0xC2, 0x5C, 0xC2, 0x5D, /* 0xF8-0xFB */ + 0xEF, 0xD7, 0xEF, 0xD3, 0xC2, 0x5A, 0xEF, 0xD1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xC3, 0x6B, 0xEF, 0xD5, 0x00, 0x00, 0xEF, 0xD6, /* 0x00-0x03 */ + 0xEF, 0xD2, 0x00, 0x00, 0xC2, 0x5B, 0xF2, 0x42, /* 0x04-0x07 */ + 0x00, 0x00, 0xF2, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF2, 0x46, 0xF2, 0x44, 0xF2, 0x47, 0xC3, 0x6C, /* 0x0C-0x0F */ + 0xF2, 0x43, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x4E, /* 0x10-0x13 */ + 0xC4, 0x64, 0xF4, 0x4D, 0xF4, 0x4C, 0xF4, 0x4B, /* 0x14-0x17 */ + 0xC4, 0x63, 0xC4, 0x65, 0x00, 0x00, 0xF5, 0xCD, /* 0x18-0x1B */ + 0xC4, 0xE2, 0xC4, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF6, 0xE1, 0xF6, 0xE0, 0xF6, 0xE3, 0xC5, 0xCB, /* 0x20-0x23 */ + 0xC5, 0x75, 0xF7, 0xDD, 0xF6, 0xE2, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xF7, 0xDC, 0xC5, 0xCD, 0xC5, 0xCC, /* 0x28-0x2B */ + 0xC5, 0xF3, 0xF8, 0xA9, 0xF8, 0xEF, 0xA4, 0xE4, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0x72, 0xE9, 0xAF, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xAC, 0xCA, 0xF7, /* 0x34-0x37 */ + 0xA7, 0xF1, 0xA7, 0xEF, 0x00, 0x00, 0xA7, 0xF0, /* 0x38-0x3B */ + 0x00, 0x00, 0xCC, 0xC1, 0xA9, 0xF1, 0xAC, 0x46, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCE, 0xE7, 0x00, 0x00, 0xCE, 0xE8, /* 0x40-0x43 */ + 0x00, 0x00, 0xAC, 0x47, 0xD1, 0xCE, 0x00, 0x00, /* 0x44-0x47 */ + 0xAE, 0xC4, 0xAE, 0xC5, 0xD1, 0xCD, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xD3, /* 0x4C-0x4F */ + 0x00, 0x00, 0xB1, 0xCF, 0x00, 0x00, 0xD5, 0xA7, /* 0x50-0x53 */ + 0xB1, 0xD6, 0xB1, 0xD5, 0xB1, 0xCE, 0xB1, 0xD1, /* 0x54-0x57 */ + 0xB1, 0xD4, 0xB1, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD9, 0x76, 0xB1, 0xCD, 0xB4, 0xAF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0xB1, 0xB4, 0xB2, /* 0x60-0x63 */ + 0xD9, 0x75, 0xD9, 0x78, 0xB4, 0xB0, 0xD9, 0x73, /* 0x64-0x67 */ + 0xD9, 0x77, 0x00, 0x00, 0xD9, 0x74, 0x00, 0x00, /* 0x68-0x6B */ + 0xB7, 0x71, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xBC, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0x56, 0xE1, 0xF4, /* 0x70-0x73 */ + 0xBE, 0xE3, 0xBC, 0xC4, 0xE5, 0xBD, 0xBC, 0xC5, /* 0x74-0x77 */ + 0xBC, 0xC6, 0xE5, 0xBF, 0xE5, 0xBE, 0xE5, 0xC0, /* 0x78-0x7B */ + 0xE9, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB0, /* 0x7C-0x7F */ + + 0xEC, 0xEF, 0xEC, 0xEE, 0xC0, 0xC4, 0xC0, 0xC5, /* 0x80-0x83 */ + 0xF2, 0x48, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD9, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xB4, 0xB4, 0xB4, 0xB3, 0xDD, 0xBD, 0x00, 0x00, /* 0x90-0x93 */ + 0xEF, 0xD8, 0xC4, 0xE3, 0xF7, 0xDE, 0xA4, 0xE6, /* 0x94-0x97 */ + 0x00, 0x00, 0xAE, 0xC6, 0x00, 0x00, 0xB1, 0xD8, /* 0x98-0x9B */ + 0xB1, 0xD7, 0xD9, 0x7A, 0xD9, 0x7B, 0xB7, 0x72, /* 0x9C-0x9F */ + 0xE1, 0xF5, 0xBA, 0x57, 0xE9, 0xB2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA4, 0xE7, 0xA5, 0xB8, 0x00, 0x00, 0xA9, 0xF2, /* 0xA4-0xA7 */ + 0xCC, 0xC2, 0x00, 0x00, 0xCE, 0xE9, 0xAC, 0x48, /* 0xA8-0xAB */ + 0xB1, 0xD9, 0x00, 0x00, 0xD9, 0x7C, 0xB4, 0xB5, /* 0xAC-0xAF */ + 0xB7, 0x73, 0x00, 0x00, 0xE5, 0xC1, 0xE5, 0xC2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF0, 0xC2, 0x5F, /* 0xB4-0xB7 */ + 0xF8, 0xF0, 0xA4, 0xE8, 0x00, 0x00, 0xCC, 0xC3, /* 0xB8-0xBB */ + 0xA9, 0xF3, 0xAC, 0x49, 0x00, 0x00, 0xCE, 0xEA, /* 0xBC-0xBF */ + 0x00, 0x00, 0xAE, 0xC7, 0xD1, 0xD2, 0xD1, 0xD0, /* 0xC0-0xC3 */ + 0xD1, 0xD1, 0xAE, 0xC8, 0xD1, 0xCF, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xDB, /* 0xC8-0xCB */ + 0xB1, 0xDC, 0xD5, 0xA8, 0xB1, 0xDD, 0xB1, 0xDA, /* 0xCC-0xCF */ + 0xD9, 0x7D, 0x00, 0x00, 0xD9, 0x7E, 0xDD, 0xBE, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0x59, 0xBA, 0x58, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF1, 0xEF, 0xD9, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF2, 0x4A, 0xF2, 0x49, 0xF4, 0x4F, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC9, 0x5E, 0xAC, 0x4A, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA4, 0xE9, 0xA5, 0xB9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA6, 0xAE, 0xA6, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xA6, 0xAF, 0xA6, 0xB0, 0xC9, 0xEE, 0xC9, 0xED, /* 0xEC-0xEF */ + 0xCA, 0xF8, 0xA7, 0xF2, 0xCA, 0xFB, 0xCA, 0xFA, /* 0xF0-0xF3 */ + 0xCA, 0xF9, 0xCA, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xF4, 0xCC, 0xC9, /* 0xF8-0xFB */ + 0xCC, 0xC5, 0xCC, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0xA9, 0xFB, 0x00, 0x00, 0xA9, 0xF9, 0xCC, 0xCA, /* 0x00-0x03 */ + 0xCC, 0xC6, 0xCC, 0xCD, 0xA9, 0xF8, 0xAA, 0x40, /* 0x04-0x07 */ + 0xCC, 0xC8, 0xCC, 0xC4, 0xA9, 0xFE, 0xCC, 0xCB, /* 0x08-0x0B */ + 0xA9, 0xF7, 0xCC, 0xCC, 0xA9, 0xFA, 0xA9, 0xFC, /* 0x0C-0x0F */ + 0xCC, 0xD0, 0xCC, 0xCF, 0xCC, 0xC7, 0xA9, 0xF6, /* 0x10-0x13 */ + 0xA9, 0xF5, 0xA9, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xCE, 0xEF, 0xCE, 0xF5, 0x00, 0x00, 0xAC, 0x50, /* 0x1C-0x1F */ + 0xAC, 0x4D, 0xCE, 0xEC, 0xCE, 0xF1, 0x00, 0x00, /* 0x20-0x23 */ + 0xAC, 0x53, 0xAC, 0x4B, 0xCE, 0xF0, 0xAC, 0x4E, /* 0x24-0x27 */ + 0xAC, 0x51, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF3, /* 0x28-0x2B */ + 0x00, 0x00, 0xAC, 0x4C, 0xCE, 0xF8, 0xAC, 0x4F, /* 0x2C-0x2F */ + 0x00, 0x00, 0xAC, 0x52, 0xCE, 0xED, 0xCE, 0xF2, /* 0x30-0x33 */ + 0xCE, 0xF6, 0xCE, 0xEE, 0xCE, 0xEB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xCE, 0xF7, 0xCE, 0xF4, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xAE, 0xD0, 0xAE, 0xC9, 0xAE, 0xCC, /* 0x40-0x43 */ + 0x00, 0x00, 0xAE, 0xCF, 0x00, 0x00, 0xD1, 0xD5, /* 0x44-0x47 */ + 0x00, 0x00, 0xAE, 0xCA, 0xD1, 0xD3, 0x00, 0x00, /* 0x48-0x4B */ + 0xAE, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xAE, 0xCB, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD1, 0xD6, 0xAE, 0xCD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD5, 0xAC, 0xB1, 0xDF, 0xD5, 0xAB, /* 0x58-0x5B */ + 0xD5, 0xAD, 0xB1, 0xDE, 0xB1, 0xE3, 0xD1, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD5, 0xAA, 0xD5, 0xAE, 0x00, 0x00, /* 0x60-0x63 */ + 0xB1, 0xE0, 0xD5, 0xA9, 0xB1, 0xE2, 0x00, 0x00, /* 0x64-0x67 */ + 0xB1, 0xE1, 0x00, 0x00, 0xD9, 0xA7, 0x00, 0x00, /* 0x68-0x6B */ + 0xD9, 0xA2, 0x00, 0x00, 0xB4, 0xB6, 0xB4, 0xBA, /* 0x6C-0x6F */ + 0xB4, 0xB7, 0xD9, 0xA5, 0xD9, 0xA8, 0x00, 0x00, /* 0x70-0x73 */ + 0xB4, 0xB8, 0x00, 0x00, 0xB4, 0xB9, 0xB4, 0xBE, /* 0x74-0x77 */ + 0xDD, 0xC7, 0xD9, 0xA6, 0xB4, 0xBC, 0xD9, 0xA3, /* 0x78-0x7B */ + 0xD9, 0xA1, 0x00, 0x00, 0xB4, 0xBD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD9, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xB7, 0x79, 0x00, 0x00, 0xDD, 0xBF, 0xB7, 0x76, /* 0x84-0x87 */ + 0xB7, 0x77, 0xB7, 0x75, 0xDD, 0xC4, 0xDD, 0xC3, /* 0x88-0x8B */ + 0xDD, 0xC0, 0xB7, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDD, 0xC2, 0xB4, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xDD, 0xC6, 0xDD, 0xC1, 0xB7, 0x78, 0xB7, 0x74, /* 0x94-0x97 */ + 0xB7, 0x7A, 0xDD, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xBA, 0x5C, 0x00, 0x00, 0xE1, 0xF8, /* 0x9C-0x9F */ + 0xE1, 0xF7, 0xE1, 0xF6, 0xBA, 0x5A, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBA, 0x5B, 0xE5, 0xC5, 0xE5, 0xC8, 0xBC, 0xC8, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xC7, 0xE5, 0xC9, /* 0xAC-0xAF */ + 0xE5, 0xC4, 0xBC, 0xCA, 0xE5, 0xC6, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBC, 0xC9, 0xE5, 0xC3, 0x00, 0x00, 0xE5, 0xC7, /* 0xB4-0xB7 */ + 0xBE, 0xE9, 0xBE, 0xE6, 0xE9, 0xBB, 0xE9, 0xBA, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE9, 0xB9, 0xE9, 0xB4, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBE, 0xE7, 0x00, 0x00, 0xBE, 0xE4, 0xBE, 0xE8, /* 0xC4-0xC7 */ + 0xE9, 0xB3, 0xBE, 0xE5, 0xE9, 0xB6, 0xE9, 0xB7, /* 0xC8-0xCB */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB8, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xC7, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEF, 0xDC, 0xC0, 0xC6, 0xEF, 0xDA, 0xEF, 0xDB, /* 0xD8-0xDB */ + 0xC2, 0x60, 0xC3, 0x6E, 0xF2, 0x4B, 0x00, 0x00, /* 0xDC-0xDF */ + 0xC3, 0x6D, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x51, /* 0xE0-0xE3 */ + 0xF4, 0x52, 0x00, 0x00, 0xC4, 0x66, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF4, 0x50, 0xC4, 0xE4, 0x00, 0x00, 0xF7, 0xDF, /* 0xE8-0xEB */ + 0xC5, 0xCE, 0xF8, 0xAA, 0xF8, 0xAB, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA4, 0xEA, 0x00, 0x00, 0xA6, 0xB1, 0xA6, 0xB2, /* 0xF0-0xF3 */ + 0xA7, 0xF3, 0x00, 0x00, 0xCC, 0xD1, 0xAC, 0x54, /* 0xF4-0xF7 */ + 0xAE, 0xD1, 0xB1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xB0, 0xD2, 0x00, 0x00, 0xB4, 0xBF, 0xB4, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xB3, 0xCC, 0xD9, 0xA9, 0x00, 0x00, 0xB7, 0x7C, /* 0x00-0x03 */ + 0xE1, 0xFA, 0xE1, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA4, 0xEB, 0xA6, 0xB3, 0xCC, 0xD2, 0xAA, 0x42, /* 0x08-0x0B */ + 0x00, 0x00, 0xAA, 0x41, 0x00, 0x00, 0xCE, 0xF9, /* 0x0C-0x0F */ + 0xCE, 0xFA, 0x00, 0x00, 0xD1, 0xD7, 0xD1, 0xD8, /* 0x10-0x13 */ + 0xAE, 0xD2, 0xAE, 0xD3, 0x00, 0x00, 0xAE, 0xD4, /* 0x14-0x17 */ + 0xD5, 0xAF, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xE6, /* 0x18-0x1B */ + 0x00, 0x00, 0xB4, 0xC2, 0x00, 0x00, 0xB4, 0xC1, /* 0x1C-0x1F */ + 0xDD, 0xC8, 0xDF, 0x7A, 0xE1, 0xFB, 0xE9, 0xBD, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x61, 0xC4, 0x67, /* 0x24-0x27 */ + 0xA4, 0xEC, 0x00, 0x00, 0xA5, 0xBC, 0xA5, 0xBD, /* 0x28-0x2B */ + 0xA5, 0xBB, 0xA5, 0xBE, 0xA5, 0xBA, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA6, 0xB6, 0x00, 0x00, 0xC9, 0xF6, /* 0x30-0x33 */ + 0xA6, 0xB5, 0xA6, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC9, 0xF1, 0xC9, 0xF0, 0xC9, 0xF3, 0xC9, 0xF2, /* 0x38-0x3B */ + 0xC9, 0xF5, 0xA6, 0xB4, 0xC9, 0xEF, 0xC9, 0xF4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0xFD, 0xA7, 0xFD, 0xCA, 0xFE, /* 0x44-0x47 */ + 0xCB, 0x43, 0xA7, 0xFC, 0x00, 0x00, 0xCB, 0x47, /* 0x48-0x4B */ + 0xCB, 0x42, 0xCB, 0x45, 0xA7, 0xF5, 0xA7, 0xF6, /* 0x4C-0x4F */ + 0xA7, 0xF7, 0xA7, 0xF8, 0x00, 0x00, 0xA8, 0x40, /* 0x50-0x53 */ + 0x00, 0x00, 0xCB, 0x41, 0xA7, 0xFA, 0xA8, 0x41, /* 0x54-0x57 */ + 0x00, 0x00, 0xCB, 0x40, 0xCB, 0x46, 0x00, 0x00, /* 0x58-0x5B */ + 0xA7, 0xF9, 0xCB, 0x44, 0xA7, 0xFB, 0xA7, 0xF4, /* 0x5C-0x5F */ + 0xA7, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0x57, 0x00, 0x00, /* 0x68-0x6B */ + 0xCC, 0xD4, 0xAA, 0x43, 0x00, 0x00, 0xAA, 0x4D, /* 0x6C-0x6F */ + 0xAA, 0x4E, 0xAA, 0x46, 0xAA, 0x58, 0xAA, 0x48, /* 0x70-0x73 */ + 0xCC, 0xDC, 0xAA, 0x53, 0xCC, 0xD7, 0xAA, 0x49, /* 0x74-0x77 */ + 0xCC, 0xE6, 0xCC, 0xE7, 0xCC, 0xDF, 0xCC, 0xD8, /* 0x78-0x7B */ + 0xAA, 0x56, 0xCC, 0xE4, 0xAA, 0x51, 0xAA, 0x4F, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xCC, 0xE5, 0x00, 0x00, 0xCC, 0xE3, /* 0x80-0x83 */ + 0xCC, 0xDB, 0xCC, 0xD3, 0xCC, 0xDA, 0xAA, 0x4A, /* 0x84-0x87 */ + 0x00, 0x00, 0xAA, 0x50, 0x00, 0x00, 0xAA, 0x44, /* 0x88-0x8B */ + 0xCC, 0xDE, 0xCC, 0xDD, 0xCC, 0xD5, 0x00, 0x00, /* 0x8C-0x8F */ + 0xAA, 0x52, 0xCC, 0xE1, 0xCC, 0xD6, 0xAA, 0x55, /* 0x90-0x93 */ + 0xCC, 0xE8, 0xAA, 0x45, 0x00, 0x00, 0xAA, 0x4C, /* 0x94-0x97 */ + 0xCC, 0xD9, 0xCC, 0xE2, 0xAA, 0x54, 0x00, 0x00, /* 0x98-0x9B */ + 0xAA, 0x47, 0xAA, 0x4B, 0x00, 0x00, 0xCC, 0xE0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0x5B, 0xAC, 0x5C, /* 0xAC-0xAF */ + 0xAC, 0x69, 0x00, 0x00, 0xCF, 0x56, 0xCF, 0x4C, /* 0xB0-0xB3 */ + 0xAC, 0x62, 0xCF, 0x4A, 0xAC, 0x5B, 0xCF, 0x45, /* 0xB4-0xB7 */ + 0xAC, 0x65, 0xCF, 0x52, 0xCE, 0xFE, 0xCF, 0x41, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCF, 0x44, 0xCE, 0xFB, 0xCF, 0x51, 0xCF, 0x61, /* 0xC0-0xC3 */ + 0xAC, 0x60, 0xCF, 0x46, 0xCF, 0x58, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xCE, 0xFD, 0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x63, /* 0xC8-0xCB */ + 0xCF, 0x5A, 0xCF, 0x4B, 0xCF, 0x53, 0xAC, 0x66, /* 0xCC-0xCF */ + 0xAC, 0x59, 0xAC, 0x61, 0xAC, 0x6D, 0xAC, 0x56, /* 0xD0-0xD3 */ + 0xAC, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCF, 0x43, 0xAC, 0x6A, 0xAC, 0x63, 0xCF, 0x5D, /* 0xD8-0xDB */ + 0xCF, 0x40, 0xAC, 0x6C, 0xAC, 0x67, 0xCF, 0x49, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0x6B, 0xCF, 0x50, /* 0xE0-0xE3 */ + 0xCF, 0x48, 0xAC, 0x64, 0xCF, 0x5C, 0xCF, 0x54, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xAC, 0x5E, 0xCF, 0x62, 0xCF, 0x47, /* 0xE8-0xEB */ + 0xAC, 0x5A, 0xCF, 0x59, 0xCF, 0x4F, 0xAC, 0x5F, /* 0xEC-0xEF */ + 0xCF, 0x55, 0xAC, 0x57, 0xCE, 0xFC, 0xAC, 0x68, /* 0xF0-0xF3 */ + 0xAE, 0xE3, 0xAC, 0x5D, 0xCF, 0x4E, 0xCF, 0x4D, /* 0xF4-0xF7 */ + 0xCF, 0x42, 0x00, 0x00, 0xCF, 0x5E, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCF, 0x57, 0x00, 0x00, 0x00, 0x00, 0xAC, 0x55, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEC, 0xAE, 0xEA, /* 0x10-0x13 */ + 0xD1, 0xED, 0x00, 0x00, 0xD1, 0xE1, 0xAE, 0xDF, /* 0x14-0x17 */ + 0xAE, 0xEB, 0x00, 0x00, 0xD1, 0xDA, 0x00, 0x00, /* 0x18-0x1B */ + 0xD1, 0xE3, 0xD1, 0xEB, 0x00, 0x00, 0xD1, 0xD9, /* 0x1C-0x1F */ + 0xD1, 0xF4, 0xAE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD1, 0xF3, 0xD1, 0xEE, 0x00, 0x00, /* 0x24-0x27 */ + 0xD1, 0xEF, 0xAE, 0xDD, 0xAE, 0xE8, 0xD1, 0xE5, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0xE6, 0xD1, 0xF0, 0xD1, 0xE7, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD1, 0xE2, 0xD1, 0xDC, 0xD1, 0xDD, /* 0x30-0x33 */ + 0xD1, 0xEA, 0xD1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xAE, 0xD6, 0xAE, 0xDA, 0xD1, 0xF2, 0xD1, 0xDE, /* 0x38-0x3B */ + 0xAE, 0xE6, 0xAE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xAE, 0xE5, 0xAE, 0xEC, 0xAE, 0xDB, 0xAE, 0xE7, /* 0x40-0x43 */ + 0xD1, 0xE9, 0xAE, 0xE9, 0xAE, 0xD8, 0x00, 0x00, /* 0x44-0x47 */ + 0xAE, 0xD7, 0xD1, 0xDB, 0x00, 0x00, 0xD1, 0xDF, /* 0x48-0x4B */ + 0xAE, 0xE0, 0xD1, 0xF1, 0xD1, 0xE8, 0xD1, 0xE0, /* 0x4C-0x4F */ + 0xAE, 0xE4, 0xAE, 0xE1, 0x00, 0x00, 0xAE, 0xD9, /* 0x50-0x53 */ + 0xAE, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC4, /* 0x68-0x6B */ + 0x00, 0x00, 0xD5, 0xB4, 0xD5, 0xB5, 0xD5, 0xB9, /* 0x6C-0x6F */ + 0x00, 0x00, 0xD5, 0xC8, 0xD5, 0xC5, 0x00, 0x00, /* 0x70-0x73 */ + 0xD5, 0xBE, 0xD5, 0xBD, 0xB1, 0xED, 0xD5, 0xC1, /* 0x74-0x77 */ + 0xD5, 0xD0, 0xD5, 0xB0, 0x00, 0x00, 0xD5, 0xD1, /* 0x78-0x7B */ + 0xD5, 0xC3, 0xD5, 0xD5, 0xD5, 0xC9, 0xB1, 0xEC, /* 0x7C-0x7F */ + + 0xD5, 0xC7, 0xB1, 0xE7, 0xB1, 0xFC, 0xB1, 0xF2, /* 0x80-0x83 */ + 0x00, 0x00, 0xB1, 0xF6, 0xB1, 0xF5, 0xD5, 0xB1, /* 0x84-0x87 */ + 0x00, 0x00, 0xD5, 0xCE, 0xD5, 0xD4, 0xD5, 0xCC, /* 0x88-0x8B */ + 0xD5, 0xD3, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC0, /* 0x8C-0x8F */ + 0xD5, 0xB2, 0xD5, 0xD2, 0xD5, 0xC2, 0xB1, 0xEA, /* 0x90-0x93 */ + 0xB1, 0xF7, 0x00, 0x00, 0xD5, 0xCB, 0xB1, 0xF0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCA, /* 0x98-0x9B */ + 0xD5, 0xB3, 0xB1, 0xF8, 0x00, 0x00, 0xB1, 0xFA, /* 0x9C-0x9F */ + 0xD5, 0xCD, 0xB1, 0xFB, 0xB1, 0xE9, 0xD5, 0xBA, /* 0xA0-0xA3 */ + 0xD5, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xEF, /* 0xA4-0xA7 */ + 0xB1, 0xF9, 0xD5, 0xBC, 0xD5, 0xC6, 0xD5, 0xB7, /* 0xA8-0xAB */ + 0xD5, 0xBB, 0xB1, 0xF4, 0xD5, 0xB6, 0xB1, 0xE8, /* 0xAC-0xAF */ + 0xB1, 0xF1, 0xB1, 0xEE, 0xD5, 0xBF, 0xAE, 0xDE, /* 0xB0-0xB3 */ + 0xD9, 0xC0, 0xB1, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB1, 0xF3, 0x00, 0x00, 0xD9, 0xC3, 0xD9, 0xD9, /* 0xC4-0xC7 */ + 0xD9, 0xCE, 0xB4, 0xD6, 0x00, 0x00, 0xB4, 0xD1, /* 0xC8-0xCB */ + 0xD9, 0xBD, 0xB4, 0xD2, 0xD9, 0xCD, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD9, 0xC6, 0xD9, 0xD3, 0xB4, 0xCE, 0xD9, 0xAB, /* 0xD0-0xD3 */ + 0xD9, 0xD5, 0xB4, 0xC4, 0xD9, 0xB3, 0xB4, 0xC7, /* 0xD4-0xD7 */ + 0xB4, 0xC6, 0x00, 0x00, 0xB4, 0xD7, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD9, 0xAD, 0xD9, 0xCF, 0xD9, 0xD0, 0xB4, 0xC9, /* 0xDC-0xDF */ + 0xB4, 0xC5, 0xD9, 0xBB, 0x00, 0x00, 0xB4, 0xD0, /* 0xE0-0xE3 */ + 0xD9, 0xB6, 0x00, 0x00, 0xD9, 0xD1, 0xB4, 0xCC, /* 0xE4-0xE7 */ + 0xD9, 0xC9, 0xD9, 0xD6, 0xD9, 0xB0, 0xD9, 0xB5, /* 0xE8-0xEB */ + 0xD9, 0xAF, 0x00, 0x00, 0xB4, 0xCB, 0xD9, 0xC2, /* 0xEC-0xEF */ + 0xDD, 0xDE, 0xD9, 0xB1, 0xB4, 0xCF, 0xD9, 0xBA, /* 0xF0-0xF3 */ + 0xD9, 0xD2, 0xB4, 0xCA, 0xD9, 0xB7, 0xD9, 0xB4, /* 0xF4-0xF7 */ + 0xD9, 0xC5, 0xB4, 0xCD, 0xB4, 0xC3, 0xB4, 0xD9, /* 0xF8-0xFB */ + 0xD9, 0xC8, 0xD9, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_69[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD9, 0xAC, 0xB4, 0xC8, 0xD9, 0xD4, 0xD9, 0xBC, /* 0x04-0x07 */ + 0xD9, 0xBE, 0x00, 0x00, 0xD9, 0xCB, 0xD9, 0xCA, /* 0x08-0x0B */ + 0xD9, 0xAA, 0xB4, 0xD3, 0xB4, 0xD5, 0xD9, 0xB2, /* 0x0C-0x0F */ + 0xD9, 0xB9, 0xD9, 0xC1, 0xB4, 0xD4, 0xD9, 0xB8, /* 0x10-0x13 */ + 0xD9, 0xC4, 0xD9, 0xD7, 0x00, 0x00, 0xD9, 0xCC, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD9, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF2, /* 0x2C-0x2F */ + 0xB7, 0xA6, 0x00, 0x00, 0xDD, 0xF0, 0xDD, 0xDB, /* 0x30-0x33 */ + 0xDD, 0xE0, 0xDD, 0xD9, 0x00, 0x00, 0xDD, 0xEC, /* 0x34-0x37 */ + 0xDD, 0xCB, 0xDD, 0xD2, 0x00, 0x00, 0xDD, 0xEA, /* 0x38-0x3B */ + 0xDD, 0xF4, 0xDD, 0xDC, 0x00, 0x00, 0xDD, 0xCF, /* 0x3C-0x3F */ + 0xDD, 0xE2, 0xDD, 0xE7, 0xDD, 0xD3, 0x00, 0x00, /* 0x40-0x43 */ + 0xDD, 0xE4, 0xDD, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xDD, 0xD7, 0xDD, 0xD8, 0xB7, 0xA8, 0xDD, 0xEB, /* 0x48-0x4B */ + 0xDD, 0xE9, 0x00, 0x00, 0xDD, 0xCC, 0xDD, 0xEE, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDD, 0xEF, 0xDD, 0xF1, 0xB7, 0xAC, /* 0x50-0x53 */ + 0xB7, 0xA4, 0x00, 0x00, 0xD5, 0xB8, 0xDD, 0xD4, /* 0x54-0x57 */ + 0xDD, 0xE6, 0xDD, 0xD5, 0xB7, 0xA1, 0xB7, 0xB1, /* 0x58-0x5B */ + 0xDD, 0xED, 0xB7, 0xAF, 0xB7, 0xAB, 0xDD, 0xCA, /* 0x5C-0x5F */ + 0xB7, 0xA3, 0x00, 0x00, 0xDD, 0xCD, 0xB7, 0xB0, /* 0x60-0x63 */ + 0x00, 0x00, 0xDD, 0xDD, 0xDD, 0xC9, 0x00, 0x00, /* 0x64-0x67 */ + 0xB7, 0xA9, 0xDD, 0xE1, 0xDD, 0xD1, 0xB7, 0xAA, /* 0x68-0x6B */ + 0xDD, 0xDA, 0xB7, 0x7E, 0xB4, 0xD8, 0xDD, 0xE3, /* 0x6C-0x6F */ + 0xD9, 0xBF, 0xDD, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xDD, 0xE8, 0xB7, 0xA5, 0xDD, 0xE5, 0xB7, 0xA2, /* 0x74-0x77 */ + 0xDD, 0xDF, 0xB7, 0xAD, 0xDD, 0xD6, 0xDD, 0xF3, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xA7, 0xDE, 0xC6, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xAE, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE2, 0x4A, 0xE2, 0x48, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0x5E, 0xE2, 0x46, 0x00, 0x00, 0xE2, 0x58, /* 0x90-0x93 */ + 0xB7, 0x7D, 0xBA, 0x5F, 0xE2, 0x42, 0xE2, 0x5D, /* 0x94-0x97 */ + 0x00, 0x00, 0xE2, 0x47, 0xE2, 0x55, 0xBA, 0x64, /* 0x98-0x9B */ + 0xBA, 0x5D, 0x00, 0x00, 0xE2, 0x5B, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE2, 0x40, 0xE2, 0x5A, 0x00, 0x00, 0xBA, 0x6F, /* 0xA0-0xA3 */ + 0xE2, 0x51, 0xE2, 0x61, 0xBA, 0x6D, 0xE2, 0x49, /* 0xA4-0xA7 */ + 0xBA, 0x5E, 0xE2, 0x4B, 0xE2, 0x59, 0xBA, 0x67, /* 0xA8-0xAB */ + 0xE2, 0x44, 0xBA, 0x6B, 0xBA, 0x61, 0xE2, 0x4D, /* 0xAC-0xAF */ + 0xE2, 0x43, 0xE1, 0xFC, 0x00, 0x00, 0xE2, 0x57, /* 0xB0-0xB3 */ + 0xBA, 0x68, 0xE2, 0x60, 0xE1, 0xFD, 0xBA, 0x65, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0x53, 0x00, 0x00, 0xBA, 0x66, /* 0xB8-0xBB */ + 0xE2, 0x45, 0xE2, 0x50, 0xE2, 0x4C, 0xE2, 0x4E, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBA, 0x60, 0xE2, 0x5F, 0xBA, 0x6E, /* 0xC0-0xC3 */ + 0xE2, 0x4F, 0x00, 0x00, 0xE2, 0x62, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE1, 0xFE, 0xE2, 0x54, 0xBA, 0x63, /* 0xC8-0xCB */ + 0xBA, 0x6C, 0xBA, 0x6A, 0xE2, 0x41, 0xE2, 0x56, /* 0xCC-0xCF */ + 0xBA, 0x69, 0x00, 0x00, 0x00, 0x00, 0xBA, 0x62, /* 0xD0-0xD3 */ + 0xE2, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE2, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE5, 0xD1, 0xE5, 0xCD, 0xE5, 0xE1, 0xE5, 0xDE, /* 0xE4-0xE7 */ + 0xBC, 0xCD, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE5, /* 0xE8-0xEB */ + 0xE5, 0xD4, 0xBC, 0xD8, 0xE5, 0xDB, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE5, 0xD0, 0xE5, 0xDA, 0xBC, 0xD5, /* 0xF0-0xF3 */ + 0xE5, 0xEE, 0x00, 0x00, 0xE5, 0xEB, 0xE5, 0xDD, /* 0xF4-0xF7 */ + 0xE5, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE2, /* 0xF8-0xFB */ + 0xE5, 0xE4, 0xBC, 0xD1, 0xE5, 0xD8, 0xE5, 0xD3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0xE5, 0xCA, 0xBC, 0xCE, 0xBC, 0xD6, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xE7, 0xBC, 0xD7, 0xE5, 0xCB, 0xE5, 0xED, /* 0x04-0x07 */ + 0xE5, 0xE0, 0xE5, 0xE6, 0xBC, 0xD4, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xE3, 0x00, 0x00, 0xE5, 0xEA, /* 0x0C-0x0F */ + 0x00, 0x00, 0xBC, 0xD9, 0x00, 0x00, 0xBC, 0xD3, /* 0x10-0x13 */ + 0xE5, 0xDC, 0xE5, 0xCF, 0xE5, 0xEF, 0xE5, 0xCC, /* 0x14-0x17 */ + 0xE5, 0xE8, 0xBC, 0xD0, 0x00, 0x00, 0xE5, 0xD6, /* 0x18-0x1B */ + 0x00, 0x00, 0xE5, 0xD7, 0xBC, 0xCF, 0xBC, 0xCC, /* 0x1C-0x1F */ + 0xE5, 0xD2, 0xBC, 0xD2, 0x00, 0x00, 0xBC, 0xCB, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0xE9, 0xE5, 0xEC, 0xE5, 0xD9, /* 0x24-0x27 */ + 0xE9, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC2, 0x00, 0x00, /* 0x30-0x33 */ + 0xE9, 0xBE, 0xBE, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xBE, 0xEB, 0xBE, 0xF0, 0xBE, 0xEC, 0xE9, 0xCC, /* 0x38-0x3B */ + 0xE9, 0xD7, 0xBE, 0xEA, 0xE9, 0xC4, 0xE9, 0xCD, /* 0x3C-0x3F */ + 0xE5, 0xDF, 0xE9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xBE, 0xF1, 0x00, 0x00, 0xE9, 0xDD, 0xBE, 0xF5, /* 0x44-0x47 */ + 0xBE, 0xF8, 0xE9, 0xC0, 0x00, 0x00, 0xBE, 0xF4, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0xDB, 0xE9, 0xDC, 0xE9, 0xD2, /* 0x4C-0x4F */ + 0xE9, 0xD1, 0xE9, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE9, 0xD3, 0xE9, 0xDA, 0xE9, 0xD9, 0x00, 0x00, /* 0x54-0x57 */ + 0xBE, 0xEF, 0xBE, 0xED, 0xE9, 0xCB, 0xE9, 0xC8, /* 0x58-0x5B */ + 0x00, 0x00, 0xE9, 0xC5, 0xE9, 0xD8, 0xBE, 0xF7, /* 0x5C-0x5F */ + 0xE9, 0xD6, 0xBE, 0xF3, 0xBE, 0xF2, 0x00, 0x00, /* 0x60-0x63 */ + 0xE9, 0xD0, 0x00, 0x00, 0xE9, 0xBF, 0xE9, 0xC1, /* 0x64-0x67 */ + 0xE9, 0xC3, 0xE9, 0xD5, 0xE9, 0xCF, 0xBE, 0xEE, /* 0x68-0x6B */ + 0x00, 0x00, 0xE9, 0xC6, 0x00, 0x00, 0xE9, 0xD4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC7, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xCF, 0xED, 0x45, /* 0x7C-0x7F */ + + 0xC0, 0xC8, 0xEC, 0xF5, 0x00, 0x00, 0xED, 0x41, /* 0x80-0x83 */ + 0xC0, 0xCA, 0xED, 0x48, 0x00, 0x00, 0xEC, 0xFC, /* 0x84-0x87 */ + 0x00, 0x00, 0xEC, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xED, 0x49, 0xEC, 0xF3, 0xEC, 0xFE, 0x00, 0x00, /* 0x8C-0x8F */ + 0xC0, 0xD1, 0xED, 0x44, 0xED, 0x4A, 0xEC, 0xFD, /* 0x90-0x93 */ + 0xC0, 0xC9, 0xED, 0x40, 0xEC, 0xF4, 0xC0, 0xD0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x47, 0xEC, 0xF9, /* 0x98-0x9B */ + 0xC0, 0xCC, 0x00, 0x00, 0xEC, 0xFB, 0xEC, 0xF8, /* 0x9C-0x9F */ + 0xC0, 0xD2, 0xEC, 0xFA, 0xC0, 0xCB, 0xC0, 0xCE, /* 0xA0-0xA3 */ + 0xED, 0x43, 0xEC, 0xF6, 0xED, 0x46, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xED, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC2, 0x63, 0xEF, 0xE7, 0xC2, 0x68, 0xC2, 0x69, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x62, /* 0xB0-0xB3 */ + 0xEF, 0xE6, 0x00, 0x00, 0xEF, 0xE3, 0xEF, 0xE4, /* 0xB4-0xB7 */ + 0xC2, 0x66, 0xEF, 0xDE, 0xEF, 0xE2, 0xC2, 0x65, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEF, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x67, 0xC2, 0x64, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEF, 0xDD, 0xEF, 0xE1, 0xEF, 0xE5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0x51, /* 0xC8-0xCB */ + 0xF2, 0x4E, 0xF2, 0x57, 0x00, 0x00, 0xF2, 0x56, /* 0xCC-0xCF */ + 0xF2, 0x54, 0xF2, 0x4F, 0x00, 0x00, 0xC3, 0x72, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF2, 0x50, 0xC3, 0x71, 0xC0, 0xCD, /* 0xD8-0xDB */ + 0xF2, 0x53, 0xC3, 0x70, 0xF2, 0x58, 0xF2, 0x52, /* 0xDC-0xDF */ + 0xF2, 0x4D, 0xEF, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xC3, 0x6F, 0x00, 0x00, 0xF2, 0x4C, /* 0xE4-0xE7 */ + 0xF4, 0x56, 0x00, 0x00, 0xF4, 0x55, 0xF2, 0x55, /* 0xE8-0xEB */ + 0xC4, 0x68, 0x00, 0x00, 0xF4, 0x59, 0xF4, 0x5A, /* 0xEC-0xEF */ + 0xF4, 0x54, 0xF4, 0x58, 0x00, 0x00, 0xF4, 0x53, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xD1, 0xF4, 0x57, 0xC4, 0xE7, 0xC4, 0xE5, /* 0xF8-0xFB */ + 0xF5, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6B[512] = { + 0xF5, 0xD2, 0x00, 0x00, 0xF5, 0xCE, 0xF5, 0xD0, /* 0x00-0x03 */ + 0xC4, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF6, 0xE5, 0xF6, 0xE6, 0xC5, 0x76, 0xF6, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE2, /* 0x0C-0x0F */ + 0xC5, 0xCF, 0xF7, 0xE0, 0xF7, 0xE1, 0xF8, 0xAC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x56, 0xF8, 0xF3, /* 0x14-0x17 */ + 0xF8, 0xF1, 0xF8, 0xF2, 0xF8, 0xF4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBB, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0xED, 0xA6, 0xB8, 0x00, 0x00, 0xAA, 0x59, /* 0x20-0x23 */ + 0x00, 0x00, 0xCC, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCF, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD1, 0xF5, 0xD1, 0xF7, 0x00, 0x00, 0xD1, 0xF6, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD1, 0xF8, 0xB1, 0xFD, 0xD5, 0xD7, /* 0x30-0x33 */ + 0xD1, 0xF9, 0x00, 0x00, 0xD5, 0xD6, 0xD5, 0xD8, /* 0x34-0x37 */ + 0xD5, 0xD9, 0xD9, 0xDA, 0xB4, 0xDB, 0xD9, 0xDB, /* 0x38-0x3B */ + 0xD9, 0xDD, 0xB4, 0xDC, 0xB4, 0xDA, 0xD9, 0xDC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDD, 0xFA, 0xDD, 0xF8, 0xDD, 0xF7, /* 0x40-0x43 */ + 0x00, 0x00, 0xDD, 0xF6, 0xDD, 0xF5, 0xB7, 0xB2, /* 0x44-0x47 */ + 0xDD, 0xF9, 0xBA, 0x70, 0xE2, 0x63, 0xE2, 0x65, /* 0x48-0x4B */ + 0xBA, 0x71, 0xE2, 0x64, 0xBC, 0xDB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xBC, 0xDA, 0xE5, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE9, 0xDF, 0xE9, 0xDE, 0xE9, 0xE0, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xBE, 0xF9, 0x00, 0x00, 0xED, 0x4B, /* 0x58-0x5B */ + 0xC0, 0xD3, 0x00, 0x00, 0xEF, 0xE8, 0xC2, 0x6A, /* 0x5C-0x5F */ + 0xF2, 0x59, 0xC5, 0x77, 0xA4, 0xEE, 0xA5, 0xBF, /* 0x60-0x63 */ + 0xA6, 0xB9, 0xA8, 0x42, 0xAA, 0x5A, 0xAA, 0x5B, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0x6E, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xD1, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xB3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD1, 0xBE, 0xFA, /* 0x74-0x77 */ + 0xC2, 0x6B, 0xA4, 0xEF, 0x00, 0x00, 0xA6, 0xBA, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEB, 0xAA, 0x5C, /* 0x7C-0x7F */ + + 0xCC, 0xEA, 0x00, 0x00, 0xCF, 0x65, 0xAC, 0x6F, /* 0x80-0x83 */ + 0xCF, 0x66, 0x00, 0x00, 0xAC, 0x70, 0x00, 0x00, /* 0x84-0x87 */ + 0xD1, 0xFC, 0xAE, 0xEE, 0xAE, 0xED, 0x00, 0x00, /* 0x88-0x8B */ + 0xD5, 0xDE, 0xD5, 0xDC, 0xD5, 0xDD, 0xD5, 0xDB, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xD9, 0xDE, 0xD9, 0xE1, 0xB4, 0xDE, 0xD9, 0xDF, /* 0x94-0x97 */ + 0xB4, 0xDD, 0xD9, 0xE0, 0x00, 0x00, 0xDD, 0xFB, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x66, 0xE2, 0x67, /* 0x9C-0x9F */ + 0xE2, 0x68, 0x00, 0x00, 0xE5, 0xF3, 0xE5, 0xF2, /* 0xA0-0xA3 */ + 0xBC, 0xDC, 0xE5, 0xF1, 0xE5, 0xF4, 0xE9, 0xE1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE2, 0xE9, 0xE3, /* 0xA8-0xAB */ + 0x00, 0x00, 0xED, 0x4C, 0xC0, 0xD4, 0xC2, 0x6C, /* 0xAC-0xAF */ + 0xF2, 0x5A, 0x00, 0x00, 0xC4, 0xE8, 0xC9, 0x5F, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xAC, 0x71, 0xCF, 0x67, 0xAE, 0xEF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0xFE, 0x00, 0x00, /* 0xB8-0xBB */ + 0xB4, 0xDF, 0xD9, 0xE2, 0x00, 0x00, 0xB7, 0xB5, /* 0xBC-0xBF */ + 0xB7, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x69, /* 0xC0-0xC3 */ + 0xE2, 0x6A, 0xBC, 0xDD, 0xBC, 0xDE, 0xE9, 0xE5, /* 0xC4-0xC7 */ + 0xE9, 0xE4, 0xEF, 0xE9, 0xF7, 0xE3, 0xA4, 0xF0, /* 0xC8-0xCB */ + 0xC9, 0x60, 0xA5, 0xC0, 0x00, 0x00, 0xA8, 0x43, /* 0xCC-0xCF */ + 0xCB, 0x48, 0x00, 0x00, 0xAC, 0x72, 0xB7, 0xB6, /* 0xD0-0xD3 */ + 0xA4, 0xF1, 0x00, 0x00, 0xCF, 0x68, 0xAC, 0x73, /* 0xD4-0xD7 */ + 0xCF, 0x69, 0x00, 0x00, 0xC0, 0xD5, 0xA4, 0xF2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEC, 0x00, 0x00, /* 0xDC-0xDF */ + 0xCF, 0x6A, 0x00, 0x00, 0xD2, 0x42, 0xD2, 0x41, /* 0xE0-0xE3 */ + 0xD1, 0xFE, 0x00, 0x00, 0xD1, 0xFD, 0xD2, 0x43, /* 0xE4-0xE7 */ + 0xD2, 0x40, 0x00, 0x00, 0x00, 0x00, 0xB2, 0x40, /* 0xE8-0xEB */ + 0xB2, 0x41, 0x00, 0x00, 0x00, 0x00, 0xB4, 0xE0, /* 0xEC-0xEF */ + 0xD9, 0xE3, 0x00, 0x00, 0xD9, 0xE4, 0xD9, 0xE5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0x41, /* 0xF4-0xF7 */ + 0xDE, 0x42, 0xDE, 0x40, 0x00, 0x00, 0xDD, 0xFD, /* 0xF8-0xFB */ + 0xDD, 0xFE, 0xB7, 0xB7, 0xE2, 0x6B, 0xE5, 0xF7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6C[512] = { + 0xE5, 0xF6, 0xE5, 0xF5, 0xE5, 0xF8, 0xE9, 0xE7, /* 0x00-0x03 */ + 0xE9, 0xE6, 0xBE, 0xFB, 0xE9, 0xE8, 0x00, 0x00, /* 0x04-0x07 */ + 0xC0, 0xD6, 0xED, 0x4D, 0x00, 0x00, 0xEF, 0xEA, /* 0x08-0x0B */ + 0xF2, 0x5B, 0xF6, 0xE7, 0x00, 0x00, 0xA4, 0xF3, /* 0x0C-0x0F */ + 0xA5, 0xC2, 0xA5, 0xC1, 0x00, 0x00, 0xAA, 0x5D, /* 0x10-0x13 */ + 0xC9, 0x61, 0xC9, 0x7E, 0xA6, 0xBB, 0x00, 0x00, /* 0x14-0x17 */ + 0xC9, 0xF7, 0xCB, 0x49, 0xCB, 0x4A, 0xAA, 0x5E, /* 0x18-0x1B */ + 0x00, 0x00, 0xCC, 0xED, 0x00, 0x00, 0xAC, 0x74, /* 0x1C-0x1F */ + 0xCF, 0x6B, 0xCF, 0x6C, 0x00, 0x00, 0xAE, 0xF0, /* 0x20-0x23 */ + 0xAE, 0xF4, 0xD2, 0x44, 0xAE, 0xF3, 0xAE, 0xF1, /* 0x24-0x27 */ + 0xAE, 0xF2, 0x00, 0x00, 0xD5, 0xDF, 0xB2, 0x42, /* 0x28-0x2B */ + 0xB4, 0xE3, 0x00, 0x00, 0xB4, 0xE1, 0xB4, 0xE2, /* 0x2C-0x2F */ + 0xD9, 0xE6, 0x00, 0x00, 0x00, 0x00, 0xBA, 0x72, /* 0x30-0x33 */ + 0xA4, 0xF4, 0x00, 0x00, 0xC9, 0xA1, 0x00, 0x00, /* 0x34-0x37 */ + 0xA5, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xC9, 0xA4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xC6, 0xC9, 0xA3, /* 0x3C-0x3F */ + 0xA5, 0xC5, 0xA5, 0xC4, 0xA8, 0x44, 0xC9, 0xA2, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xF8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xFC, 0xC9, 0xFE, /* 0x48-0x4B */ + 0xCA, 0x40, 0xA6, 0xC5, 0xA6, 0xC6, 0xC9, 0xFB, /* 0x4C-0x4F */ + 0xA6, 0xC1, 0x00, 0x00, 0xC9, 0xF9, 0x00, 0x00, /* 0x50-0x53 */ + 0xC9, 0xFD, 0xA6, 0xC2, 0x00, 0x00, 0xA6, 0xBD, /* 0x54-0x57 */ + 0x00, 0x00, 0xA6, 0xBE, 0x00, 0x00, 0xA6, 0xC4, /* 0x58-0x5B */ + 0xC9, 0xFA, 0xA6, 0xBC, 0xA8, 0x45, 0xA6, 0xBF, /* 0x5C-0x5F */ + 0xA6, 0xC0, 0xA6, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xCB, 0x5B, 0xCB, 0x59, 0xCB, 0x4C, /* 0x64-0x67 */ + 0xA8, 0x51, 0xCB, 0x53, 0xA8, 0x4C, 0xCB, 0x4D, /* 0x68-0x6B */ + 0x00, 0x00, 0xCB, 0x55, 0x00, 0x00, 0xCB, 0x52, /* 0x6C-0x6F */ + 0xA8, 0x4F, 0xCB, 0x51, 0xA8, 0x56, 0xCB, 0x5A, /* 0x70-0x73 */ + 0xA8, 0x58, 0x00, 0x00, 0xA8, 0x5A, 0x00, 0x00, /* 0x74-0x77 */ + 0xCB, 0x4B, 0x00, 0x00, 0xA8, 0x4D, 0xCB, 0x5C, /* 0x78-0x7B */ + 0x00, 0x00, 0xA8, 0x54, 0xA8, 0x57, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCD, 0x45, 0xA8, 0x47, 0xA8, 0x5E, 0xA8, 0x55, /* 0x80-0x83 */ + 0xCB, 0x4E, 0xA8, 0x4A, 0xA8, 0x59, 0xCB, 0x56, /* 0x84-0x87 */ + 0xA8, 0x48, 0xA8, 0x49, 0xCD, 0x43, 0xCB, 0x4F, /* 0x88-0x8B */ + 0xA8, 0x50, 0xA8, 0x5B, 0xCB, 0x5D, 0xCB, 0x50, /* 0x8C-0x8F */ + 0xA8, 0x4E, 0x00, 0x00, 0xA8, 0x53, 0xCC, 0xEE, /* 0x90-0x93 */ + 0xA8, 0x5C, 0xCB, 0x57, 0xA8, 0x52, 0x00, 0x00, /* 0x94-0x97 */ + 0xA8, 0x5D, 0xA8, 0x46, 0xCB, 0x54, 0xA8, 0x4B, /* 0x98-0x9B */ + 0xCB, 0x58, 0xCD, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0x6A, /* 0xA8-0xAB */ + 0xAA, 0x7A, 0xCC, 0xF5, 0xAA, 0x71, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCD, 0x4B, 0xAA, 0x62, 0x00, 0x00, 0xAA, 0x65, /* 0xB0-0xB3 */ + 0xCD, 0x42, 0x00, 0x00, 0xCC, 0xF3, 0xCC, 0xF7, /* 0xB4-0xB7 */ + 0xAA, 0x6D, 0xAA, 0x6F, 0xCC, 0xFA, 0xAA, 0x76, /* 0xB8-0xBB */ + 0xAA, 0x68, 0xAA, 0x66, 0xAA, 0x67, 0xAA, 0x75, /* 0xBC-0xBF */ + 0xCD, 0x47, 0xAA, 0x70, 0xCC, 0xF9, 0xCC, 0xFB, /* 0xC0-0xC3 */ + 0xAA, 0x6E, 0xAA, 0x73, 0xCC, 0xFC, 0xCD, 0x4A, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xAC, 0x75, 0xAA, 0x79, 0x00, 0x00, /* 0xC8-0xCB */ + 0xAA, 0x63, 0xCD, 0x49, 0x00, 0x00, 0xCD, 0x4D, /* 0xCC-0xCF */ + 0xCC, 0xF8, 0xCD, 0x4F, 0xCD, 0x40, 0xAA, 0x6C, /* 0xD0-0xD3 */ + 0xCC, 0xF4, 0xAA, 0x6B, 0xAA, 0x7D, 0xAA, 0x72, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCC, 0xF2, 0xCF, 0x75, 0xAA, 0x78, /* 0xD8-0xDB */ + 0xAA, 0x7C, 0xCD, 0x41, 0xCD, 0x46, 0x00, 0x00, /* 0xDC-0xDF */ + 0xAA, 0x7E, 0xAA, 0x77, 0xAA, 0x69, 0xAA, 0x5F, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0x64, 0x00, 0x00, 0xCC, 0xF6, /* 0xE4-0xE7 */ + 0xAA, 0x60, 0xCD, 0x4E, 0x00, 0x00, 0xCC, 0xF0, /* 0xE8-0xEB */ + 0xCC, 0xEF, 0xCC, 0xFD, 0xCC, 0xF1, 0xAA, 0x7B, /* 0xEC-0xEF */ + 0xAE, 0xF5, 0xAA, 0x74, 0xCC, 0xFE, 0xAA, 0x61, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xAC, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCD, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6D[512] = { + 0xCF, 0x7C, 0xCF, 0xA1, 0x00, 0x00, 0xCF, 0xA4, /* 0x00-0x03 */ + 0xCF, 0x77, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xA7, /* 0x04-0x07 */ + 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0x74, 0xAC, 0x76, /* 0x08-0x0B */ + 0xAC, 0x7B, 0xD2, 0x49, 0xAC, 0xAD, 0xCF, 0xA5, /* 0x0C-0x0F */ + 0xCF, 0xAD, 0xCF, 0x7B, 0xCF, 0x73, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0x64, 0xAC, 0x7E, /* 0x14-0x17 */ + 0xCF, 0xA2, 0xCF, 0x78, 0xCF, 0x7A, 0xAC, 0xA5, /* 0x18-0x1B */ + 0x00, 0x00, 0xCF, 0x7D, 0xAC, 0x7D, 0xCF, 0x70, /* 0x1C-0x1F */ + 0xCF, 0xA8, 0x00, 0x00, 0xCF, 0xAB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xAC, 0x7A, 0x00, 0x00, 0xAC, 0xA8, /* 0x24-0x27 */ + 0xCF, 0x6D, 0xAC, 0xAA, 0xAC, 0x78, 0xAC, 0xAE, /* 0x28-0x2B */ + 0xCF, 0xA9, 0xCF, 0x6F, 0xAC, 0xAB, 0xD2, 0x5E, /* 0x2C-0x2F */ + 0xCD, 0x48, 0xAC, 0x7C, 0xAC, 0x77, 0xCF, 0x76, /* 0x30-0x33 */ + 0xCF, 0x6E, 0xAC, 0xAC, 0xAC, 0xA4, 0xCF, 0xA3, /* 0x34-0x37 */ + 0xAC, 0xA9, 0xAC, 0xA7, 0xCF, 0x79, 0xAC, 0xA1, /* 0x38-0x3B */ + 0xCF, 0x71, 0xAC, 0xA2, 0xAC, 0xA3, 0xCF, 0x72, /* 0x3C-0x3F */ + 0xCF, 0xA6, 0xAC, 0x79, 0xCF, 0x7E, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD2, 0x4C, 0xAE, 0xFD, 0xAF, 0x43, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0x55, 0xD2, 0x5B, /* 0x5C-0x5F */ + 0xD2, 0x57, 0xD2, 0x4A, 0xD2, 0x4D, 0xD2, 0x46, /* 0x60-0x63 */ + 0xD2, 0x47, 0xAF, 0x4A, 0xAE, 0xFA, 0xD2, 0x56, /* 0x64-0x67 */ + 0xD2, 0x5F, 0xAF, 0x45, 0xAE, 0xF6, 0x00, 0x00, /* 0x68-0x6B */ + 0xAF, 0x40, 0xD2, 0x4E, 0xAF, 0x42, 0xD2, 0x4F, /* 0x6C-0x6F */ + 0xD2, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xAF, 0x44, 0xD2, 0x68, 0xD2, 0x48, 0xAE, 0xFC, /* 0x74-0x77 */ + 0xAE, 0xFB, 0xAF, 0x48, 0xD2, 0x45, 0xD2, 0x66, /* 0x78-0x7B */ + 0xD2, 0x5A, 0xD2, 0x67, 0xD2, 0x61, 0xD2, 0x53, /* 0x7C-0x7F */ + + 0xD2, 0x62, 0x00, 0x00, 0xD2, 0x5C, 0xD2, 0x65, /* 0x80-0x83 */ + 0xD2, 0x63, 0xAF, 0x49, 0xD2, 0x54, 0xAE, 0xF9, /* 0x84-0x87 */ + 0xAE, 0xF8, 0xAF, 0x41, 0xAF, 0x47, 0xD2, 0x60, /* 0x88-0x8B */ + 0xAF, 0x46, 0xD2, 0x51, 0xB2, 0x43, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD2, 0x69, 0xD2, 0x50, 0xD2, 0x4B, 0xAE, 0xFE, /* 0x90-0x93 */ + 0xAF, 0x4B, 0xAE, 0xF7, 0x00, 0x00, 0xD2, 0x58, /* 0x94-0x97 */ + 0xD2, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x65, 0xD5, 0xE1, /* 0xA8-0xAB */ + 0xD5, 0xE5, 0x00, 0x00, 0xB2, 0x52, 0xB2, 0x50, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x47, 0xD5, 0xE3, /* 0xB0-0xB3 */ + 0xD5, 0xE2, 0xB2, 0x5B, 0x00, 0x00, 0xD5, 0xE8, /* 0xB4-0xB7 */ + 0xB2, 0x55, 0x00, 0x00, 0xD5, 0xFA, 0xD6, 0x47, /* 0xB8-0xBB */ + 0xB2, 0x44, 0xD5, 0xF7, 0xD5, 0xF0, 0xB2, 0x67, /* 0xBC-0xBF */ + 0xD5, 0xE0, 0x00, 0x00, 0xD5, 0xFC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB2, 0x64, 0xB2, 0x58, 0xB2, 0x63, 0xB2, 0x4E, /* 0xC4-0xC7 */ + 0xD5, 0xEC, 0xD5, 0xFE, 0xD5, 0xF6, 0xB2, 0x4F, /* 0xC8-0xCB */ + 0xB2, 0x49, 0xD6, 0x45, 0x00, 0x00, 0xD5, 0xFD, /* 0xCC-0xCF */ + 0xD6, 0x40, 0xB2, 0x51, 0xB2, 0x59, 0xD6, 0x42, /* 0xD0-0xD3 */ + 0xD5, 0xEA, 0xD5, 0xFB, 0xD5, 0xEF, 0xD6, 0x44, /* 0xD4-0xD7 */ + 0xB2, 0x5E, 0xB2, 0x46, 0xB2, 0x5C, 0xD5, 0xF4, /* 0xD8-0xDB */ + 0xD5, 0xF2, 0xD5, 0xF3, 0xB2, 0x53, 0xD5, 0xEE, /* 0xDC-0xDF */ + 0xD5, 0xED, 0xB2, 0x48, 0xD5, 0xE7, 0xD6, 0x46, /* 0xE0-0xE3 */ + 0xB2, 0x4A, 0xD5, 0xF1, 0xB2, 0x68, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB2, 0x62, 0xD5, 0xE6, 0xB2, 0x5F, 0xB2, 0x5D, /* 0xE8-0xEB */ + 0xB2, 0x66, 0xD5, 0xF8, 0xB2, 0x61, 0xD2, 0x52, /* 0xEC-0xEF */ + 0xD5, 0xF9, 0xB2, 0x60, 0xD6, 0x41, 0xB2, 0x45, /* 0xF0-0xF3 */ + 0xD5, 0xF5, 0xB2, 0x57, 0xD5, 0xE9, 0xB2, 0x56, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB2, 0x54, 0xB2, 0x4C, 0xB2, 0x4B, /* 0xF8-0xFB */ + 0xD9, 0xE7, 0xD6, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0xD5, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFC, /* 0x00-0x03 */ + 0x00, 0x00, 0xB2, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xB5, 0x41, 0xB2, 0x5A, 0xB4, 0xEE, /* 0x18-0x1B */ + 0xD9, 0xF6, 0xB4, 0xFC, 0x00, 0x00, 0xD9, 0xEA, /* 0x1C-0x1F */ + 0xB4, 0xEB, 0xB4, 0xE7, 0xDA, 0x49, 0xB4, 0xED, /* 0x20-0x23 */ + 0xB4, 0xF1, 0xB4, 0xEC, 0xB4, 0xF5, 0xDA, 0x4D, /* 0x24-0x27 */ + 0xDA, 0x44, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF1, /* 0x28-0x2B */ + 0xB4, 0xFA, 0xB4, 0xF4, 0xD9, 0xFD, 0xB4, 0xE4, /* 0x2C-0x2F */ + 0xDA, 0x4A, 0xDA, 0x43, 0xB4, 0xE8, 0xD9, 0xF7, /* 0x30-0x33 */ + 0xB4, 0xF7, 0xDA, 0x55, 0xDA, 0x56, 0x00, 0x00, /* 0x34-0x37 */ + 0xB4, 0xE5, 0xDA, 0x48, 0xB4, 0xF9, 0xD9, 0xFB, /* 0x38-0x3B */ + 0xD9, 0xED, 0xD9, 0xEE, 0xB4, 0xFD, 0xD9, 0xF2, /* 0x3C-0x3F */ + 0xD9, 0xF9, 0xD9, 0xF3, 0x00, 0x00, 0xB4, 0xFB, /* 0x40-0x43 */ + 0xB5, 0x44, 0xD9, 0xEF, 0xD9, 0xE8, 0xD9, 0xE9, /* 0x44-0x47 */ + 0x00, 0x00, 0xD9, 0xEB, 0xB4, 0xEA, 0xD9, 0xF8, /* 0x48-0x4B */ + 0x00, 0x00, 0xB4, 0xF8, 0xB5, 0x42, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD9, 0xFA, 0xDA, 0x53, 0xDA, 0x4B, /* 0x50-0x53 */ + 0xB4, 0xE6, 0xDA, 0x51, 0xB4, 0xF2, 0x00, 0x00, /* 0x54-0x57 */ + 0xB4, 0xF0, 0x00, 0x00, 0xDA, 0x57, 0xB4, 0xEF, /* 0x58-0x5B */ + 0xDA, 0x41, 0xD9, 0xF4, 0xD9, 0xFE, 0xB5, 0x47, /* 0x5C-0x5F */ + 0xDA, 0x45, 0xDA, 0x42, 0xD9, 0xF0, 0xB5, 0x43, /* 0x60-0x63 */ + 0xDA, 0x4F, 0xDA, 0x4C, 0xDA, 0x54, 0xB4, 0xE9, /* 0x64-0x67 */ + 0xDA, 0x40, 0xB5, 0x46, 0x00, 0x00, 0xDA, 0x47, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0xF3, 0xB4, 0xF6, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDA, 0x46, 0xB5, 0x45, 0xD9, 0xF5, /* 0x70-0x73 */ + 0xD5, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xDA, 0x50, /* 0x74-0x77 */ + 0xDA, 0x4E, 0xDA, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD9, 0xEC, 0xB5, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDE, 0x61, 0xDE, 0x60, 0xDE, 0x46, /* 0x8C-0x8F */ + 0xB7, 0xBD, 0x00, 0x00, 0xDE, 0x5F, 0xDE, 0x49, /* 0x90-0x93 */ + 0xDE, 0x4A, 0x00, 0x00, 0xB7, 0xC7, 0xDE, 0x68, /* 0x94-0x97 */ + 0xB7, 0xC2, 0xDE, 0x5E, 0x00, 0x00, 0xDE, 0x43, /* 0x98-0x9B */ + 0xB7, 0xC8, 0xB7, 0xBE, 0xDE, 0x52, 0xDE, 0x48, /* 0x9C-0x9F */ + 0xDE, 0x4B, 0xDE, 0x63, 0xB7, 0xB8, 0xDE, 0x6A, /* 0xA0-0xA3 */ + 0xDE, 0x62, 0xB7, 0xC1, 0xDE, 0x57, 0xB7, 0xCC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xCB, 0xB7, 0xC5, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0x69, 0xB7, 0xB9, /* 0xAC-0xAF */ + 0xDE, 0x55, 0xDE, 0x4C, 0xDE, 0x59, 0xDE, 0x65, /* 0xB0-0xB3 */ + 0xB7, 0xCD, 0x00, 0x00, 0xB7, 0xBB, 0xDE, 0x54, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDE, 0x4D, 0xB7, 0xC4, 0x00, 0x00, /* 0xB8-0xBB */ + 0xB7, 0xC3, 0xDE, 0x50, 0xDE, 0x5A, 0xDE, 0x64, /* 0xBC-0xBF */ + 0xDE, 0x47, 0xDE, 0x51, 0xB7, 0xBC, 0xDE, 0x5B, /* 0xC0-0xC3 */ + 0xB7, 0xC9, 0xB7, 0xC0, 0xDE, 0x4E, 0xB7, 0xBF, /* 0xC4-0xC7 */ + 0xDE, 0x45, 0xDE, 0x53, 0xDE, 0x67, 0xB4, 0xFE, /* 0xC8-0xCB */ + 0xBA, 0xB0, 0xDE, 0x56, 0xE2, 0x6C, 0xDE, 0x58, /* 0xCC-0xCF */ + 0xDE, 0x66, 0xB7, 0xC6, 0xDE, 0x4F, 0xB7, 0xBA, /* 0xD0-0xD3 */ + 0xB7, 0xCA, 0xBC, 0xF0, 0xDE, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDE, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDE, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xAA, /* 0xE8-0xEB */ + 0xBA, 0xAD, 0xE2, 0x7D, 0xE2, 0xA4, 0xBA, 0xA2, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE2, 0x6E, 0xBA, 0xAF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xBA, 0x77, 0xE2, 0x6D, 0xE2, 0xB0, 0xBA, 0xB1, /* 0xF4-0xF7 */ + 0xE2, 0x71, 0xE2, 0xA3, 0x00, 0x00, 0xE2, 0x73, /* 0xF8-0xFB */ + 0xE2, 0xB3, 0xE2, 0xAF, 0xBA, 0x75, 0xBA, 0xA1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0xE6, 0x53, 0xBA, 0xAE, 0xBA, 0x7D, 0xE2, 0x6F, /* 0x00-0x03 */ + 0x00, 0x00, 0xE2, 0xAE, 0xBA, 0xA3, 0xE2, 0xAB, /* 0x04-0x07 */ + 0xE2, 0xB8, 0xE2, 0x75, 0xE2, 0x7E, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0xB6, 0xE2, 0xAC, 0xBA, 0x7C, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x7C, 0xBA, 0x76, /* 0x10-0x13 */ + 0xBA, 0x74, 0xBA, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE2, 0x7A, 0xE2, 0x77, 0xE2, 0x78, 0x00, 0x00, /* 0x18-0x1B */ + 0xE2, 0xB2, 0x00, 0x00, 0xE2, 0xB7, 0xE2, 0xB5, /* 0x1C-0x1F */ + 0xBA, 0x7A, 0xE2, 0xB9, 0xBA, 0x7E, 0xBA, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0xE2, 0x70, 0xE5, 0xFA, 0xE2, 0x79, /* 0x24-0x27 */ + 0x00, 0x00, 0xBA, 0x78, 0xBA, 0xAC, 0xBA, 0xA9, /* 0x28-0x2B */ + 0xBA, 0x7B, 0xE2, 0xA5, 0xE2, 0x74, 0xBA, 0xAA, /* 0x2C-0x2F */ + 0xE2, 0xA7, 0xBA, 0xA4, 0xBA, 0xA6, 0xBA, 0x73, /* 0x30-0x33 */ + 0x00, 0x00, 0xE2, 0xA9, 0xE2, 0xA1, 0xE2, 0x72, /* 0x34-0x37 */ + 0xBA, 0xA5, 0xE2, 0xB1, 0xE2, 0xB4, 0xE2, 0x7B, /* 0x38-0x3B */ + 0xE2, 0xA8, 0x00, 0x00, 0xBA, 0x79, 0xBC, 0xDF, /* 0x3C-0x3F */ + 0xE2, 0xA6, 0xE5, 0xF9, 0x00, 0x00, 0xE2, 0xAD, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x76, 0xE6, 0x44, /* 0x4C-0x4F */ + 0xE6, 0x4E, 0xBC, 0xE2, 0xE6, 0x4D, 0xE6, 0x59, /* 0x50-0x53 */ + 0xBC, 0xE4, 0xE6, 0x4B, 0x00, 0x00, 0xE6, 0x4F, /* 0x54-0x57 */ + 0xBC, 0xEF, 0x00, 0x00, 0xE6, 0x46, 0xBC, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0x52, 0xE9, 0xF0, 0xBC, 0xF3, /* 0x5C-0x5F */ + 0xBC, 0xF2, 0xE6, 0x54, 0xE6, 0x43, 0xE6, 0x5E, /* 0x60-0x63 */ + 0xBC, 0xED, 0x00, 0x00, 0xBC, 0xE3, 0xE6, 0x57, /* 0x64-0x67 */ + 0x00, 0x00, 0xE6, 0x5B, 0xE6, 0x60, 0xE6, 0x55, /* 0x68-0x6B */ + 0xE6, 0x49, 0xBC, 0xE6, 0xBC, 0xE9, 0xBC, 0xF1, /* 0x6C-0x6F */ + 0xBC, 0xEC, 0x00, 0x00, 0xE6, 0x4C, 0xE2, 0xA2, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x48, 0xE6, 0x5F, /* 0x74-0x77 */ + 0xBC, 0xE8, 0x00, 0x00, 0xBC, 0xEB, 0xE6, 0x61, /* 0x78-0x7B */ + 0xBC, 0xE0, 0xE6, 0x56, 0xE5, 0xFB, 0xE6, 0x5C, /* 0x7C-0x7F */ + + 0xC0, 0xDF, 0x00, 0x00, 0xE6, 0x4A, 0x00, 0x00, /* 0x80-0x83 */ + 0xBC, 0xE1, 0xE6, 0x45, 0xBC, 0xE5, 0xE5, 0xFC, /* 0x84-0x87 */ + 0xBA, 0xAB, 0xE6, 0x41, 0x00, 0x00, 0xE6, 0x5A, /* 0x88-0x8B */ + 0xE6, 0x42, 0xE6, 0x40, 0xBC, 0xEA, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE6, 0x58, 0x00, 0x00, 0xE5, 0xFE, 0xE6, 0x51, /* 0x90-0x93 */ + 0xE6, 0x50, 0xE6, 0x5D, 0xE6, 0x47, 0xBC, 0xEE, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, 0x00, 0x00, /* 0x9C-0x9F */ + 0xBF, 0x49, 0xBE, 0xFE, 0xEA, 0x40, 0xE9, 0xEB, /* 0xA0-0xA3 */ + 0xBF, 0x41, 0xE9, 0xF7, 0xBF, 0x48, 0xBF, 0x43, /* 0xA4-0xA7 */ + 0xE9, 0xF5, 0xED, 0x4F, 0xE9, 0xFB, 0xEA, 0x42, /* 0xA8-0xAB */ + 0xE9, 0xFA, 0xE9, 0xE9, 0xE9, 0xF8, 0xEA, 0x44, /* 0xAC-0xAF */ + 0xEA, 0x46, 0xBE, 0xFD, 0xEA, 0x45, 0xBF, 0x44, /* 0xB0-0xB3 */ + 0xBF, 0x4A, 0x00, 0x00, 0xBF, 0x47, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE9, 0xFE, 0xBF, 0x46, 0xE9, 0xF9, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0xED, 0xE9, 0xF2, 0x00, 0x00, 0xE9, 0xFD, /* 0xBC-0xBF */ + 0xBF, 0x45, 0xBF, 0x42, 0xBE, 0xFC, 0xBF, 0x40, /* 0xC0-0xC3 */ + 0xE9, 0xF1, 0x00, 0x00, 0xE5, 0xFD, 0xE9, 0xEC, /* 0xC4-0xC7 */ + 0xE9, 0xEF, 0xEA, 0x41, 0xE9, 0xF4, 0xE9, 0xEA, /* 0xC8-0xCB */ + 0xED, 0x4E, 0xEA, 0x43, 0xE9, 0xEE, 0xE9, 0xFC, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xED, 0x51, 0xC0, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC0, 0xD7, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xDB, /* 0xD8-0xDB */ + 0xED, 0x53, 0xED, 0x59, 0xED, 0x57, 0xC0, 0xD9, /* 0xDC-0xDF */ + 0xC0, 0xDA, 0xC0, 0xE1, 0xED, 0x5A, 0xED, 0x52, /* 0xE0-0xE3 */ + 0xC0, 0xDC, 0x00, 0x00, 0xED, 0x56, 0xED, 0x55, /* 0xE4-0xE7 */ + 0xED, 0x5B, 0xC0, 0xE2, 0x00, 0x00, 0xC0, 0xDD, /* 0xE8-0xEB */ + 0xC0, 0xE0, 0xED, 0x54, 0xC0, 0xE4, 0xC0, 0xDE, /* 0xEC-0xEF */ + 0xC0, 0xE5, 0xC0, 0xD8, 0xED, 0x58, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xED, 0x50, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF7, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x71, 0xEF, 0xF4, /* 0xF8-0xFB */ + 0xEF, 0xF6, 0x00, 0x00, 0xC2, 0x6F, 0xEF, 0xF2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0xEF, 0xF3, 0xEF, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE9, 0xF6, 0xEF, 0xEF, 0xC2, 0x70, 0xEF, 0xEB, /* 0x04-0x07 */ + 0x00, 0x00, 0xC2, 0x6D, 0xEF, 0xF8, 0xC2, 0x6E, /* 0x08-0x0B */ + 0xEF, 0xEC, 0xEF, 0xED, 0xEF, 0xF1, 0xC2, 0x73, /* 0x0C-0x0F */ + 0x00, 0x00, 0xC2, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xEF, 0xF0, 0xC3, 0x78, 0xF2, 0x5F, 0xF2, 0x65, /* 0x14-0x17 */ + 0xC3, 0x79, 0xF2, 0x5C, 0xC3, 0x76, 0xC3, 0x73, /* 0x18-0x1B */ + 0xF2, 0x67, 0xC3, 0x77, 0x00, 0x00, 0xC3, 0x74, /* 0x1C-0x1F */ + 0xF2, 0x5E, 0xF2, 0x61, 0xF2, 0x62, 0xF2, 0x63, /* 0x20-0x23 */ + 0xF2, 0x66, 0x00, 0x00, 0xEF, 0xF5, 0xF2, 0x5D, /* 0x24-0x27 */ + 0xC3, 0x75, 0xF2, 0x64, 0xF2, 0x68, 0xF2, 0x60, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x5D, /* 0x2C-0x2F */ + 0xC4, 0x6A, 0xF4, 0x60, 0xC4, 0x6B, 0xF4, 0x68, /* 0x30-0x33 */ + 0xF4, 0x5F, 0xF4, 0x5C, 0x00, 0x00, 0xF4, 0x5E, /* 0x34-0x37 */ + 0xF4, 0x62, 0xF4, 0x65, 0xF4, 0x64, 0xF4, 0x67, /* 0x38-0x3B */ + 0xF4, 0x5B, 0x00, 0x00, 0xC4, 0x69, 0xF4, 0x63, /* 0x3C-0x3F */ + 0xF4, 0x66, 0xF4, 0x69, 0xF4, 0x61, 0xF5, 0xD3, /* 0x40-0x43 */ + 0xF5, 0xD4, 0xF5, 0xD8, 0xF5, 0xD9, 0x00, 0x00, /* 0x44-0x47 */ + 0xF5, 0xD6, 0xF5, 0xD7, 0xF5, 0xD5, 0x00, 0x00, /* 0x48-0x4B */ + 0xC4, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC5, 0x78, 0xF6, 0xEB, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF6, 0xE8, 0xF6, 0xE9, 0xF6, 0xEA, /* 0x54-0x57 */ + 0xC5, 0x79, 0x00, 0x00, 0xF7, 0xE5, 0xF7, 0xE4, /* 0x58-0x5B */ + 0x00, 0x00, 0xF8, 0xAF, 0xC5, 0xF4, 0xF8, 0xAD, /* 0x5C-0x5F */ + 0xF8, 0xB0, 0xF8, 0xAE, 0xF8, 0xF5, 0xC6, 0x57, /* 0x60-0x63 */ + 0xC6, 0x65, 0xF9, 0xA3, 0xF9, 0x6C, 0x00, 0x00, /* 0x64-0x67 */ + 0xF9, 0xA2, 0xF9, 0xD0, 0xF9, 0xD1, 0xA4, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA6, 0xC7, 0xCA, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCB, 0x5E, 0x00, 0x00, 0xA8, 0x5F, 0x00, 0x00, /* 0x74-0x77 */ + 0xA8, 0x62, 0x00, 0x00, 0xCB, 0x5F, 0x00, 0x00, /* 0x78-0x7B */ + 0xA8, 0x60, 0xA8, 0x61, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xCD, 0x58, 0xCD, 0x5A, /* 0x80-0x83 */ + 0xCD, 0x55, 0xCD, 0x52, 0xCD, 0x54, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xA4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xA2, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCD, 0x56, 0xAA, 0xA3, 0xCD, 0x53, /* 0x90-0x93 */ + 0xCD, 0x50, 0xAA, 0xA1, 0xCD, 0x57, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0x51, 0xAA, 0xA5, 0xCD, 0x59, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCF, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xAC, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCF, 0xB6, 0x00, 0x00, 0xAC, 0xAF, /* 0xA8-0xAB */ + 0xAC, 0xB2, 0xAC, 0xB4, 0xAC, 0xB6, 0xAC, 0xB3, /* 0xAC-0xAF */ + 0xCF, 0xB2, 0xCF, 0xB1, 0x00, 0x00, 0xAC, 0xB1, /* 0xB0-0xB3 */ + 0xCF, 0xB4, 0xCF, 0xB5, 0x00, 0x00, 0xCF, 0xAE, /* 0xB4-0xB7 */ + 0xAC, 0xB5, 0x00, 0x00, 0xAC, 0xB0, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, /* 0xC4-0xC7 */ + 0xAF, 0x50, 0x00, 0x00, 0xAF, 0x4C, 0xD2, 0x6E, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD2, 0x76, 0xD2, 0x7B, 0xAF, 0x51, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD2, 0x6C, 0xD2, 0x72, 0xD2, 0x6B, /* 0xD0-0xD3 */ + 0xD2, 0x75, 0x00, 0x00, 0x00, 0x00, 0xD2, 0x71, /* 0xD4-0xD7 */ + 0xAF, 0x4D, 0xAF, 0x4F, 0xD2, 0x7A, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD2, 0x6A, 0xD2, 0x6D, 0xD2, 0x73, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD2, 0x74, 0xD2, 0x7C, 0xD2, 0x70, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xAF, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB2, 0x6D, /* 0xEC-0xEF */ + 0xD6, 0x4E, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x50, /* 0xF0-0xF3 */ + 0xD6, 0x4C, 0x00, 0x00, 0xD6, 0x58, 0xD6, 0x4A, /* 0xF4-0xF7 */ + 0xD6, 0x57, 0xB2, 0x69, 0xD6, 0x48, 0xDA, 0x5B, /* 0xF8-0xFB */ + 0xD6, 0x52, 0xB2, 0x6C, 0x00, 0x00, 0xD6, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0xD6, 0x56, 0x00, 0x00, 0xD6, 0x5A, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0x4F, 0x00, 0x00, 0xD6, 0x54, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xB2, 0x6A, 0xB2, 0x6B, 0xD6, 0x59, /* 0x08-0x0B */ + 0xD6, 0x4D, 0xD6, 0x49, 0xD6, 0x5B, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD6, 0x51, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x55, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x4B, /* 0x14-0x17 */ + 0x00, 0x00, 0xB5, 0x48, 0xB5, 0x49, 0xDA, 0x65, /* 0x18-0x1B */ + 0xB5, 0x4F, 0x00, 0x00, 0xDA, 0x59, 0xDA, 0x62, /* 0x1C-0x1F */ + 0xDA, 0x58, 0xB5, 0x4C, 0xDA, 0x60, 0xDA, 0x5E, /* 0x20-0x23 */ + 0x00, 0x00, 0xDA, 0x5F, 0xB5, 0x4A, 0x00, 0x00, /* 0x24-0x27 */ + 0xDA, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x5C, 0xDA, 0x5A, /* 0x2C-0x2F */ + 0xB5, 0x4B, 0xDA, 0x5D, 0xDA, 0x61, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x4D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x64, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDE, 0x70, 0xDE, 0x77, 0xDE, 0x79, /* 0x40-0x43 */ + 0xDE, 0xA1, 0x00, 0x00, 0xB7, 0xDA, 0xDE, 0x6B, /* 0x44-0x47 */ + 0x00, 0x00, 0xB7, 0xD2, 0x00, 0x00, 0xDE, 0x7A, /* 0x48-0x4B */ + 0xB7, 0xD7, 0xDE, 0xA2, 0xB7, 0xCE, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDE, 0x7D, 0x00, 0x00, 0xDE, 0x6D, 0xDE, 0x7E, /* 0x50-0x53 */ + 0xDE, 0x6C, 0x00, 0x00, 0xB7, 0xDC, 0x00, 0x00, /* 0x54-0x57 */ + 0xDE, 0x78, 0xB7, 0xCF, 0xDE, 0xA3, 0x00, 0x00, /* 0x58-0x5B */ + 0xB7, 0xD4, 0xDE, 0x71, 0xB7, 0xD9, 0xDE, 0x7C, /* 0x5C-0x5F */ + 0xDE, 0x6F, 0xDE, 0x76, 0xDE, 0x72, 0xDE, 0x6E, /* 0x60-0x63 */ + 0xB7, 0xD1, 0xB7, 0xD8, 0xB7, 0xD6, 0xB7, 0xD3, /* 0x64-0x67 */ + 0xB7, 0xDB, 0xB7, 0xD0, 0xDE, 0x75, 0x00, 0x00, /* 0x68-0x6B */ + 0xB7, 0xD5, 0x00, 0x00, 0xB5, 0x4E, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDE, 0x7B, 0x00, 0x00, 0xDE, 0x73, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xDE, 0x74, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC1, /* 0x78-0x7B */ + 0x00, 0x00, 0xBA, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE2, 0xBD, 0xE2, 0xC3, 0xE2, 0xBF, 0x00, 0x00, /* 0x80-0x83 */ + 0xBA, 0xB6, 0xE2, 0xBE, 0xE2, 0xC2, 0xE2, 0xBA, /* 0x84-0x87 */ + 0x00, 0x00, 0xE2, 0xBC, 0xBA, 0xB5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC0, /* 0x8C-0x8F */ + 0xE2, 0xBB, 0x00, 0x00, 0xBA, 0xB7, 0x00, 0x00, /* 0x90-0x93 */ + 0xBA, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC4, /* 0x94-0x97 */ + 0x00, 0x00, 0xBA, 0xB3, 0xE6, 0x67, 0xE6, 0x64, /* 0x98-0x9B */ + 0xE6, 0x70, 0xE6, 0x6A, 0xE6, 0x6C, 0xBC, 0xF4, /* 0x9C-0x9F */ + 0xE6, 0x66, 0xE6, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE6, 0x6D, 0xE6, 0x6B, 0x00, 0x00, 0xE6, 0x71, /* 0xA4-0xA7 */ + 0xBC, 0xF7, 0xE6, 0x68, 0xE6, 0x6F, 0x00, 0x00, /* 0xA8-0xAB */ + 0xBC, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x63, /* 0xAC-0xAF */ + 0xE6, 0x65, 0xBC, 0xF6, 0xE6, 0x62, 0xE6, 0x72, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE6, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x4A, 0xBF, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEA, 0x55, 0xEA, 0x53, 0xBF, 0x4B, 0xEA, 0x49, /* 0xBC-0xBF */ + 0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x48, 0xBF, 0x55, /* 0xC0-0xC3 */ + 0xBF, 0x56, 0xEA, 0x47, 0xEA, 0x56, 0xEA, 0x51, /* 0xC4-0xC7 */ + 0xBF, 0x4F, 0xBF, 0x4C, 0xEA, 0x50, 0xEA, 0x4E, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0x52, 0xEA, 0x52, /* 0xCC-0xCF */ + 0xBF, 0x4D, 0x00, 0x00, 0xBF, 0x4E, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0x4F, 0xBF, 0x50, 0xEA, 0x4B, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEA, 0x54, 0xBF, 0x53, 0xEA, 0x57, 0xEA, 0x58, /* 0xD8-0xDB */ + 0xBF, 0x54, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xE7, /* 0xDC-0xDF */ + 0xC0, 0xEE, 0xED, 0x5C, 0xED, 0x62, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xED, 0x60, 0xC0, 0xEA, 0xC0, 0xE9, 0xC0, 0xE6, /* 0xE4-0xE7 */ + 0xED, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xC0, 0xEC, 0xC0, 0xEB, 0xC0, 0xE8, 0x00, 0x00, /* 0xEC-0xEF */ + 0xED, 0x61, 0xED, 0x5D, 0xED, 0x5F, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xC0, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xC2, 0x77, 0xEF, 0xFB, 0x00, 0x00, 0xC2, 0x74, /* 0xF8-0xFB */ + 0xC2, 0x75, 0xEF, 0xFD, 0xC2, 0x76, 0xEF, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0x00, 0x00, 0xEF, 0xF9, 0xF2, 0x6C, 0xEF, 0xFC, /* 0x00-0x03 */ + 0x00, 0x00, 0xF2, 0x6D, 0xC3, 0x7A, 0xF2, 0x6B, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0x6A, 0x00, 0x00, /* 0x08-0x0B */ + 0xF2, 0x69, 0xC3, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC4, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x6A, /* 0x10-0x13 */ + 0xF4, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF5, 0xDC, 0xF5, 0xDB, 0xC4, 0xEA, /* 0x18-0x1B */ + 0x00, 0x00, 0xF5, 0xDA, 0xF6, 0xEC, 0xF6, 0xED, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE6, 0xF8, 0xB1, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xF6, 0xF9, 0xBC, /* 0x24-0x27 */ + 0xC6, 0x79, 0xF9, 0xC6, 0xA4, 0xF6, 0x00, 0x00, /* 0x28-0x2B */ + 0xAA, 0xA6, 0xAA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xAC, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xC0, 0xEF, 0xA4, 0xF7, 0x00, 0x00, /* 0x34-0x37 */ + 0xAA, 0xA8, 0xAF, 0x52, 0xB7, 0xDD, 0xA4, 0xF8, /* 0x38-0x3B */ + 0x00, 0x00, 0xB2, 0x6E, 0xBA, 0xB8, 0xC9, 0x62, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCF, 0xB7, 0xD2, 0x7D, 0x00, 0x00, /* 0x40-0x43 */ + 0xE2, 0xC5, 0x00, 0x00, 0xC0, 0xF0, 0xA4, 0xF9, /* 0x44-0x47 */ + 0xAA, 0xA9, 0xCF, 0xB8, 0xCF, 0xB9, 0xDA, 0x66, /* 0x48-0x4B */ + 0xB5, 0x50, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA4, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xDE, 0xE2, 0xC6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xF8, 0x00, 0x00, /* 0x54-0x57 */ + 0xC3, 0x7C, 0xA4, 0xFA, 0xDA, 0x67, 0xA4, 0xFB, /* 0x58-0x5B */ + 0x00, 0x00, 0xA6, 0xC9, 0xCA, 0x42, 0xA6, 0xC8, /* 0x5C-0x5F */ + 0xA8, 0x65, 0xA8, 0x64, 0xA8, 0x63, 0xCB, 0x60, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xAA, /* 0x64-0x67 */ + 0x00, 0x00, 0xAA, 0xAB, 0xCD, 0x5B, 0x00, 0x00, /* 0x68-0x6B */ + 0xCF, 0xBA, 0x00, 0x00, 0xCF, 0xBD, 0xAC, 0xBA, /* 0x6C-0x6F */ + 0xCF, 0xBB, 0x00, 0x00, 0xAC, 0xB9, 0xCF, 0xBC, /* 0x70-0x73 */ + 0xAC, 0xBB, 0x00, 0x00, 0xD2, 0xA2, 0xD2, 0xA1, /* 0x74-0x77 */ + 0xD2, 0x7E, 0xAF, 0x53, 0x00, 0x00, 0xD6, 0x5D, /* 0x78-0x7B */ + 0xD6, 0x5E, 0xB2, 0x6F, 0xD6, 0x5C, 0xD6, 0x5F, /* 0x7C-0x7F */ + + 0xB5, 0x52, 0xB2, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xB5, 0x51, 0xDA, 0x6B, 0xDA, 0x6A, 0x00, 0x00, /* 0x84-0x87 */ + 0xDA, 0x68, 0xDA, 0x69, 0x00, 0x00, 0xDA, 0x6C, /* 0x88-0x8B */ + 0xDE, 0xA6, 0xDE, 0xA5, 0xDE, 0xA9, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDE, 0xA8, 0xDE, 0xA7, 0xBA, 0xB9, 0xE2, 0xC9, /* 0x90-0x93 */ + 0x00, 0x00, 0xE2, 0xC8, 0xBA, 0xBA, 0xE2, 0xC7, /* 0x94-0x97 */ + 0xE6, 0x73, 0x00, 0x00, 0xE6, 0x74, 0xBC, 0xF9, /* 0x98-0x9B */ + 0x00, 0x00, 0xEA, 0x59, 0xEA, 0x5A, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xF2, 0x72, 0xC3, 0x7D, 0xF2, 0x71, /* 0xA0-0xA3 */ + 0xF2, 0x70, 0xF2, 0x6E, 0xF2, 0x6F, 0xC4, 0xEB, /* 0xA4-0xA7 */ + 0xF4, 0x6C, 0xF6, 0xEE, 0xF8, 0xF7, 0x00, 0x00, /* 0xA8-0xAB */ + 0xA4, 0xFC, 0x00, 0x00, 0xC9, 0xA5, 0xA5, 0xC7, /* 0xAC-0xAF */ + 0xC9, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCA, 0x43, 0xCA, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0x66, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xCB, 0x62, 0x00, 0x00, 0xCB, 0x61, /* 0xBC-0xBF */ + 0xAA, 0xAC, 0xCB, 0x65, 0xA8, 0x67, 0xCB, 0x63, /* 0xC0-0xC3 */ + 0xA8, 0x66, 0xCB, 0x67, 0xCB, 0x64, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCD, 0x5F, 0xCF, 0xBE, 0xCD, 0x5D, /* 0xC8-0xCB */ + 0xCD, 0x64, 0x00, 0x00, 0xAA, 0xAD, 0x00, 0x00, /* 0xCC-0xCF */ + 0xAA, 0xB0, 0xCD, 0x65, 0xCD, 0x61, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xCD, 0x62, 0x00, 0x00, 0xCD, 0x5C, 0xAA, 0xAF, /* 0xD4-0xD7 */ + 0xCD, 0x5E, 0xAA, 0xAE, 0xCD, 0x63, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCD, 0x60, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xC2, /* 0xDC-0xDF */ + 0xAC, 0xBD, 0xAC, 0xBE, 0x00, 0x00, 0xCF, 0xC5, /* 0xE0-0xE3 */ + 0xCF, 0xBF, 0x00, 0x00, 0xCF, 0xC4, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCF, 0xC0, 0xAC, 0xBC, 0xCF, 0xC3, 0xCF, 0xC1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xA8, /* 0xF0-0xF3 */ + 0xD2, 0xA5, 0x00, 0x00, 0xD2, 0xA7, 0xAF, 0x58, /* 0xF4-0xF7 */ + 0xAF, 0x57, 0xAF, 0x55, 0xD2, 0xA4, 0xD2, 0xA9, /* 0xF8-0xFB */ + 0xAF, 0x54, 0xAF, 0x56, 0xD2, 0xA6, 0xD6, 0x67, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0xD2, 0xA3, 0xD2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x62, /* 0x04-0x07 */ + 0xD6, 0x66, 0x00, 0x00, 0xD6, 0x65, 0xDA, 0x6E, /* 0x08-0x0B */ + 0xDA, 0x79, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x68, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD6, 0x63, 0xDA, 0x6D, 0xB2, 0x74, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x73, 0xD6, 0x61, /* 0x14-0x17 */ + 0xD6, 0x64, 0xB2, 0x75, 0x00, 0x00, 0xB2, 0x72, /* 0x18-0x1B */ + 0xB2, 0x71, 0xD6, 0x60, 0xD6, 0x69, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x70, 0xDA, 0x77, /* 0x20-0x23 */ + 0x00, 0x00, 0xB5, 0x54, 0xDA, 0x76, 0xDA, 0x73, /* 0x24-0x27 */ + 0x00, 0x00, 0xB5, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xDA, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xDA, 0x6F, 0xDA, 0x71, 0xDA, 0x74, 0xDA, 0x72, /* 0x30-0x33 */ + 0xB5, 0x55, 0xDA, 0x78, 0xB5, 0x53, 0xB7, 0xDF, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xDE, 0xAC, /* 0x38-0x3B */ + 0xDE, 0xAA, 0x00, 0x00, 0xB7, 0xE2, 0xB7, 0xE1, /* 0x3C-0x3F */ + 0xDE, 0xAE, 0x00, 0x00, 0xDE, 0xAB, 0xE2, 0xCA, /* 0x40-0x43 */ + 0xBA, 0xBB, 0xB7, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xDE, 0xB0, 0xDE, 0xAF, 0x00, 0x00, /* 0x48-0x4B */ + 0xE2, 0xCD, 0xE2, 0xCB, 0xBC, 0xFA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xBA, 0xBC, 0xE2, 0xCC, 0xE6, 0x76, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0xFB, /* 0x54-0x57 */ + 0xE6, 0x75, 0xE6, 0x7E, 0xE6, 0x7D, 0xE6, 0x7B, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0x7A, 0xE6, 0x77, 0xE6, 0x78, /* 0x5C-0x5F */ + 0xE6, 0x79, 0xE6, 0x7C, 0xE6, 0xA1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEA, 0x5F, 0xEA, 0x5C, 0xEA, 0x5D, /* 0x64-0x67 */ + 0xBF, 0x57, 0xEA, 0x5B, 0xEA, 0x61, 0xEA, 0x60, /* 0x68-0x6B */ + 0xEA, 0x5E, 0x00, 0x00, 0xED, 0x64, 0xED, 0x65, /* 0x6C-0x6F */ + 0xC0, 0xF1, 0x00, 0x00, 0xC0, 0xF2, 0xED, 0x63, /* 0x70-0x73 */ + 0x00, 0x00, 0xC2, 0x79, 0xEF, 0xFE, 0xC2, 0x78, /* 0x74-0x77 */ + 0xC3, 0x7E, 0x00, 0x00, 0xC3, 0xA1, 0xC4, 0x6D, /* 0x78-0x7B */ + 0xF4, 0x6E, 0xF4, 0x6D, 0xF5, 0xDD, 0xF6, 0xEF, /* 0x7C-0x7F */ + + 0xC5, 0x7A, 0xF7, 0xE8, 0xF7, 0xE7, 0xF7, 0xE9, /* 0x80-0x83 */ + 0xA5, 0xC8, 0xCF, 0xC6, 0xAF, 0x59, 0xB2, 0x76, /* 0x84-0x87 */ + 0xD6, 0x6A, 0xA5, 0xC9, 0xC9, 0xA7, 0xA4, 0xFD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0x45, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0x6C, 0xCB, 0x6A, /* 0x90-0x93 */ + 0xCB, 0x6B, 0xCB, 0x68, 0xA8, 0x68, 0xCB, 0x69, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCD, 0x6D, 0x00, 0x00, 0xAA, 0xB3, /* 0x9C-0x9F */ + 0xCD, 0x6B, 0xCD, 0x67, 0xCD, 0x6A, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCD, 0x66, 0xAA, 0xB5, 0xCD, 0x69, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xAA, 0xB2, 0xAA, 0xB1, 0x00, 0x00, 0xAA, 0xB4, /* 0xA8-0xAB */ + 0xCD, 0x6C, 0xCD, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xC2, 0xAC, 0xC5, /* 0xB0-0xB3 */ + 0xCF, 0xCE, 0xCF, 0xCD, 0xCF, 0xCC, 0xAC, 0xBF, /* 0xB4-0xB7 */ + 0xCF, 0xD5, 0xCF, 0xCB, 0x00, 0x00, 0xAC, 0xC1, /* 0xB8-0xBB */ + 0xD2, 0xAF, 0x00, 0x00, 0xCF, 0xD2, 0xCF, 0xD0, /* 0xBC-0xBF */ + 0xAC, 0xC4, 0x00, 0x00, 0xCF, 0xC8, 0xCF, 0xD3, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCF, 0xCA, 0xCF, 0xD4, 0xCF, 0xD1, /* 0xC4-0xC7 */ + 0xCF, 0xC9, 0x00, 0x00, 0xAC, 0xC0, 0xCF, 0xD6, /* 0xC8-0xCB */ + 0xCF, 0xC7, 0xAC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB4, 0xD2, 0xAB, /* 0xD0-0xD3 */ + 0xD2, 0xB6, 0x00, 0x00, 0xD2, 0xAE, 0xD2, 0xB9, /* 0xD4-0xD7 */ + 0xD2, 0xBA, 0xD2, 0xAC, 0xD2, 0xB8, 0xD2, 0xB5, /* 0xD8-0xDB */ + 0xD2, 0xB3, 0xD2, 0xB7, 0xAF, 0x5F, 0x00, 0x00, /* 0xDC-0xDF */ + 0xAF, 0x5D, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD2, 0xAD, 0x00, 0x00, 0xD2, 0xB0, /* 0xE4-0xE7 */ + 0xD2, 0xBB, 0xD2, 0xB2, 0xAF, 0x5E, 0xCF, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAF, 0x5A, 0xAF, 0x5C, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0x78, 0xD6, 0x6D, 0xD6, 0x6B, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD6, 0x6C, 0x00, 0x00, 0xD6, 0x73, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD6, 0x74, 0xD6, 0x70, 0xB2, 0x7B, 0xD6, 0x75, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0xD6, 0x72, 0xD6, 0x6F, 0x00, 0x00, 0xB2, 0x79, /* 0x00-0x03 */ + 0xD6, 0x6E, 0xB2, 0x77, 0xB2, 0x7A, 0xD6, 0x71, /* 0x04-0x07 */ + 0xD6, 0x79, 0xAF, 0x5B, 0xB2, 0x78, 0xD6, 0x77, /* 0x08-0x0B */ + 0xD6, 0x76, 0xB2, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x7E, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA1, 0xB5, 0x60, /* 0x18-0x1B */ + 0x00, 0x00, 0xDA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xDA, 0xA9, 0xDA, 0xA2, 0xB5, 0x5A, 0xDA, 0xA6, /* 0x20-0x23 */ + 0xDA, 0xA5, 0xB5, 0x5B, 0xB5, 0x61, 0x00, 0x00, /* 0x24-0x27 */ + 0xB5, 0x62, 0xDA, 0xA8, 0xB5, 0x58, 0xDA, 0x7D, /* 0x28-0x2B */ + 0xDA, 0x7B, 0xDA, 0xA3, 0xDA, 0x7A, 0xB5, 0x5F, /* 0x2C-0x2F */ + 0xDA, 0x7C, 0xDA, 0xA4, 0xDA, 0xAA, 0xB5, 0x59, /* 0x30-0x33 */ + 0xB5, 0x5E, 0xB5, 0x5C, 0xB5, 0x5D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x57, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB7, 0xE9, /* 0x3C-0x3F */ + 0xDE, 0xB7, 0xB7, 0xE8, 0xDE, 0xBB, 0x00, 0x00, /* 0x40-0x43 */ + 0xDE, 0xB1, 0x00, 0x00, 0xDE, 0xBC, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB2, 0xDE, 0xB3, /* 0x48-0x4B */ + 0x00, 0x00, 0xDE, 0xBD, 0xDE, 0xBA, 0xDE, 0xB8, /* 0x4C-0x4F */ + 0xDE, 0xB9, 0xDE, 0xB5, 0xDE, 0xB4, 0x00, 0x00, /* 0x50-0x53 */ + 0xDE, 0xBE, 0xB7, 0xE5, 0x00, 0x00, 0xDE, 0xB6, /* 0x54-0x57 */ + 0x00, 0x00, 0xB7, 0xEA, 0xB7, 0xE4, 0xB7, 0xEB, /* 0x58-0x5B */ + 0xB7, 0xEC, 0x00, 0x00, 0xB7, 0xE7, 0xB7, 0xE6, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCE, 0xBA, 0xBE, /* 0x60-0x63 */ + 0xBA, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD3, /* 0x64-0x67 */ + 0x00, 0x00, 0xBC, 0xFC, 0xBA, 0xBF, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xBA, 0xC1, 0xE2, 0xD4, 0xB7, 0xE3, /* 0x6C-0x6F */ + 0xBA, 0xC0, 0xE2, 0xD0, 0xE2, 0xD2, 0xE2, 0xCF, /* 0x70-0x73 */ + 0x00, 0x00, 0xE2, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0xAA, 0xE6, 0xA7, 0xBD, 0x40, 0xEA, 0x62, /* 0x7C-0x7F */ + + 0xBD, 0x41, 0xE6, 0xA6, 0x00, 0x00, 0xBC, 0xFE, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xA8, 0xE6, 0xA5, 0xE6, 0xA2, /* 0x84-0x87 */ + 0xE6, 0xA9, 0xE6, 0xA3, 0xE6, 0xA4, 0xBC, 0xFD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xED, 0x69, 0x00, 0x00, 0xEA, 0x66, 0x00, 0x00, /* 0x90-0x93 */ + 0xEA, 0x65, 0xEA, 0x67, 0x00, 0x00, 0xED, 0x66, /* 0x94-0x97 */ + 0xBF, 0x5A, 0x00, 0x00, 0xEA, 0x63, 0x00, 0x00, /* 0x98-0x9B */ + 0xBF, 0x58, 0x00, 0x00, 0xBF, 0x5C, 0xBF, 0x5B, /* 0x9C-0x9F */ + 0xEA, 0x64, 0xEA, 0x68, 0x00, 0x00, 0xBF, 0x59, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xED, 0x6D, 0xC0, 0xF5, 0xC2, 0x7A, /* 0xA4-0xA7 */ + 0xC0, 0xF6, 0xC0, 0xF3, 0xED, 0x6A, 0xED, 0x68, /* 0xA8-0xAB */ + 0x00, 0x00, 0xED, 0x6B, 0x00, 0x00, 0xED, 0x6E, /* 0xAC-0xAF */ + 0xC0, 0xF4, 0xED, 0x6C, 0xED, 0x67, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF0, 0x42, 0xF0, 0x45, 0xF2, 0x75, /* 0xB4-0xB7 */ + 0xF0, 0x40, 0x00, 0x00, 0xF4, 0x6F, 0xF0, 0x46, /* 0xB8-0xBB */ + 0x00, 0x00, 0xC3, 0xA2, 0xF0, 0x44, 0xC2, 0x7B, /* 0xBC-0xBF */ + 0xF0, 0x41, 0xF0, 0x43, 0xF0, 0x47, 0xF2, 0x76, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF2, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA3, 0xF2, 0x73, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x6E, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC4, 0xED, 0xF6, 0xF1, 0xC4, 0xEC, 0xF6, 0xF3, /* 0xD4-0xD7 */ + 0xF6, 0xF0, 0xF6, 0xF2, 0xC5, 0xD0, 0xF8, 0xB2, /* 0xD8-0xDB */ + 0xA5, 0xCA, 0xCD, 0x6E, 0xD2, 0xBC, 0xD2, 0xBD, /* 0xDC-0xDF */ + 0xB2, 0x7D, 0xDE, 0xBF, 0xBF, 0x5D, 0xC3, 0xA4, /* 0xE0-0xE3 */ + 0xC5, 0x7B, 0xF8, 0xB3, 0xA5, 0xCB, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCD, 0x6F, 0xA2, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCF, 0xD7, 0x00, 0x00, 0xCF, 0xD8, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD2, 0xBE, 0xD2, 0xBF, 0xB2, 0x7E, 0xB2, 0xA1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAB, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDE, 0xC2, 0xDE, 0xC1, 0xDE, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_75[512] = { + 0xE2, 0xD5, 0x00, 0x00, 0xE2, 0xD6, 0xE2, 0xD7, /* 0x00-0x03 */ + 0xBA, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAD, /* 0x04-0x07 */ + 0xE6, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x69, /* 0x08-0x0B */ + 0xBF, 0x5E, 0xBF, 0x5F, 0x00, 0x00, 0xED, 0x72, /* 0x0C-0x0F */ + 0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xF0, 0x49, /* 0x10-0x13 */ + 0xF0, 0x48, 0xC2, 0x7C, 0xF2, 0x77, 0xF5, 0xDE, /* 0x14-0x17 */ + 0xA5, 0xCC, 0x00, 0x00, 0xAC, 0xC6, 0x00, 0x00, /* 0x18-0x1B */ + 0xB2, 0xA2, 0xDE, 0xC3, 0x00, 0x00, 0xA5, 0xCD, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD2, 0xC0, 0xB2, 0xA3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xB5, 0x63, 0xB5, 0x64, 0x00, 0x00, /* 0x24-0x27 */ + 0xA5, 0xCE, 0xA5, 0xCF, 0xCA, 0x46, 0xA8, 0x6A, /* 0x28-0x2B */ + 0xA8, 0x69, 0xAC, 0xC7, 0xCF, 0xD9, 0xDA, 0xAC, /* 0x2C-0x2F */ + 0xA5, 0xD0, 0xA5, 0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x6B, /* 0x34-0x37 */ + 0xA8, 0x6C, 0xCB, 0x6E, 0xCB, 0x6D, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xAA, 0xB6, 0xCD, 0x72, 0xCD, 0x70, /* 0x3C-0x3F */ + 0xCD, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDA, /* 0x44-0x47 */ + 0xCF, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xAC, 0xCB, /* 0x48-0x4B */ + 0xAC, 0xC9, 0x00, 0x00, 0xAC, 0xCA, 0xAC, 0xC8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xAF, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xAF, 0x64, 0xAF, 0x63, 0xD2, 0xC1, /* 0x58-0x5B */ + 0xAF, 0x62, 0xAF, 0x61, 0x00, 0x00, 0xD2, 0xC2, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xA6, 0xD6, 0x7B, /* 0x60-0x63 */ + 0xD6, 0x7A, 0xB2, 0xA4, 0xB2, 0xA5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x66, 0xB5, 0x65, /* 0x68-0x6B */ + 0xDA, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAD, /* 0x6C-0x6F */ + 0xB2, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xED, 0xDE, 0xC5, /* 0x74-0x77 */ + 0xB7, 0xEE, 0xDE, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE2, 0xD8, 0xE6, 0xAE, 0xBD, 0x42, /* 0x7C-0x7F */ + + 0xEA, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xED, 0x73, 0x00, 0x00, 0xC3, 0xA6, 0xC3, 0xA5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0x7C, 0xA5, 0xD4, /* 0x88-0x8B */ + 0xCD, 0x73, 0x00, 0x00, 0x00, 0x00, 0xB2, 0xA8, /* 0x8C-0x8F */ + 0xE2, 0xD9, 0xBA, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0x6F, 0xCB, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0x74, 0xAA, 0xB8, 0xAA, 0xB9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xAA, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xCF, 0xAC, 0xD0, /* 0xA0-0xA3 */ + 0xAC, 0xCD, 0xAC, 0xCE, 0x00, 0x00, 0xCF, 0xDC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDD, 0xAC, 0xCC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xD2, 0xC3, 0x00, 0x00, 0xAF, 0x68, 0xAF, 0x69, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xB2, 0xAB, 0xD2, 0xC9, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAF, 0x6E, 0xAF, 0x6C, 0xD2, 0xCA, 0xD2, 0xC5, /* 0xB8-0xBB */ + 0xAF, 0x6B, 0xAF, 0x6A, 0xAF, 0x65, 0xD2, 0xC8, /* 0xBC-0xBF */ + 0xD2, 0xC7, 0xD2, 0xC4, 0xAF, 0x6D, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD2, 0xC6, 0xAF, 0x66, 0x00, 0x00, 0xAF, 0x67, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xAC, 0xD6, 0xA1, /* 0xC8-0xCB */ + 0xD6, 0xA2, 0xB2, 0xAD, 0xD6, 0x7C, 0xD6, 0x7E, /* 0xCC-0xCF */ + 0xD6, 0xA4, 0xD6, 0xA3, 0xD6, 0x7D, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB2, 0xA9, 0xB2, 0xAA, 0x00, 0x00, 0xDA, 0xB6, /* 0xD4-0xD7 */ + 0xB5, 0x6B, 0xB5, 0x6A, 0xDA, 0xB0, 0xB5, 0x68, /* 0xD8-0xDB */ + 0x00, 0x00, 0xDA, 0xB3, 0xB5, 0x6C, 0xDA, 0xB4, /* 0xDC-0xDF */ + 0xB5, 0x6D, 0xDA, 0xB1, 0xB5, 0x67, 0xB5, 0x69, /* 0xE0-0xE3 */ + 0xDA, 0xB5, 0x00, 0x00, 0xDA, 0xB2, 0xDA, 0xAF, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xDE, 0xD2, 0x00, 0x00, 0xDE, 0xC7, /* 0xEC-0xEF */ + 0xB7, 0xF0, 0xB7, 0xF3, 0xB7, 0xF2, 0xB7, 0xF7, /* 0xF0-0xF3 */ + 0xB7, 0xF6, 0xDE, 0xD3, 0xDE, 0xD1, 0xDE, 0xCA, /* 0xF4-0xF7 */ + 0xDE, 0xCE, 0xDE, 0xCD, 0xB7, 0xF4, 0xDE, 0xD0, /* 0xF8-0xFB */ + 0xDE, 0xCC, 0xDE, 0xD4, 0xDE, 0xCB, 0xB7, 0xF5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0xB7, 0xEF, 0xB7, 0xF1, 0x00, 0x00, 0xDE, 0xC9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0xDB, 0xBA, 0xC7, 0xE2, 0xDF, 0xBA, 0xC6, /* 0x08-0x0B */ + 0xE2, 0xDC, 0xBA, 0xC5, 0x00, 0x00, 0xDE, 0xC8, /* 0x0C-0x0F */ + 0xDE, 0xCF, 0xE2, 0xDE, 0x00, 0x00, 0xBA, 0xC8, /* 0x10-0x13 */ + 0xE2, 0xE0, 0xE2, 0xDD, 0xE2, 0xDA, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE6, 0xB1, 0xE6, 0xB5, 0xE6, 0xB7, /* 0x18-0x1B */ + 0xE6, 0xB3, 0xE6, 0xB2, 0xE6, 0xB0, 0xBD, 0x45, /* 0x1C-0x1F */ + 0xBD, 0x43, 0xBD, 0x48, 0xBD, 0x49, 0xE6, 0xB4, /* 0x20-0x23 */ + 0xBD, 0x46, 0xE6, 0xAF, 0xBD, 0x47, 0xBA, 0xC4, /* 0x24-0x27 */ + 0xE6, 0xB6, 0xBD, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEA, 0x6C, 0x00, 0x00, 0xEA, 0x6B, /* 0x2C-0x2F */ + 0xEA, 0x73, 0xEA, 0x6D, 0xEA, 0x72, 0xEA, 0x6F, /* 0x30-0x33 */ + 0xBF, 0x60, 0xEA, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xBF, 0x61, 0x00, 0x00, 0xBF, 0x62, 0x00, 0x00, /* 0x38-0x3B */ + 0xEA, 0x70, 0xEA, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xF8, 0xED, 0x74, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xF7, 0xED, 0x77, /* 0x44-0x47 */ + 0xED, 0x75, 0xED, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xC0, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF0, 0x4D, 0x00, 0x00, 0xC2, 0xA1, 0xF0, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x7D, 0xF0, 0x4F, /* 0x54-0x57 */ + 0xC2, 0x7E, 0xF0, 0x4C, 0xF0, 0x50, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0x4A, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA7, /* 0x5C-0x5F */ + 0xF2, 0x78, 0xC3, 0xA8, 0xC4, 0x6F, 0x00, 0x00, /* 0x60-0x63 */ + 0xF0, 0x4B, 0xC4, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xC4, 0xEE, 0xF5, 0xDF, 0x00, 0x00, /* 0x68-0x6B */ + 0xC5, 0x7E, 0xF6, 0xF4, 0xC5, 0x7D, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF7, 0xEA, 0xC5, 0xF5, 0xC5, 0xF6, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF9, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xAC, 0xD1, 0xCF, 0xDE, 0x00, 0x00, 0xB5, 0x6E, /* 0x78-0x7B */ + 0xB5, 0x6F, 0xA5, 0xD5, 0xA6, 0xCA, 0xCA, 0x47, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xCB, 0x71, 0xA8, 0x6D, 0x00, 0x00, /* 0x80-0x83 */ + 0xAA, 0xBA, 0x00, 0x00, 0xAC, 0xD2, 0xAC, 0xD3, /* 0x84-0x87 */ + 0xAC, 0xD4, 0xD6, 0xA6, 0xD2, 0xCB, 0xAF, 0x6F, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xAE, 0xD6, 0xA5, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB8, 0xB5, 0x71, /* 0x90-0x93 */ + 0x00, 0x00, 0xDA, 0xB7, 0xB5, 0x70, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDE, 0xD5, 0xBD, 0x4A, 0xE6, 0xBB, /* 0x98-0x9B */ + 0xE6, 0xB8, 0xE6, 0xB9, 0xE6, 0xBA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xED, 0x78, 0x00, 0x00, 0xF0, 0x51, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0x71, 0xF4, 0x70, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF6, 0xF5, 0xA5, 0xD6, 0xCD, 0x75, /* 0xAC-0xAF */ + 0xAF, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xB5, 0x72, 0xDE, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE2, 0xE1, 0x00, 0x00, 0xBD, 0x4B, 0xEA, 0x74, /* 0xB8-0xBB */ + 0x00, 0x00, 0xF0, 0x52, 0xF4, 0x72, 0xA5, 0xD7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xBB, 0xAC, 0xD7, /* 0xC0-0xC3 */ + 0xCF, 0xDF, 0xAC, 0xD8, 0xAC, 0xD6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xAC, 0xD5, 0xD2, 0xCC, 0xAF, 0x71, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xAF, 0x72, 0xAF, 0x73, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xB0, 0xD6, 0xA7, /* 0xD0-0xD3 */ + 0xB2, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB9, 0xB2, 0xB1, /* 0xD8-0xDB */ + 0xB5, 0x73, 0xDE, 0xD7, 0xB7, 0xF8, 0xB7, 0xF9, /* 0xDC-0xDF */ + 0x00, 0x00, 0xBA, 0xC9, 0x00, 0x00, 0xBA, 0xCA, /* 0xE0-0xE3 */ + 0xBD, 0x4C, 0xBF, 0x64, 0xEA, 0x75, 0xBF, 0x63, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xED, 0x79, 0xC0, 0xFA, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF0, 0x53, 0xF4, 0x73, 0xA5, 0xD8, 0xA8, 0x6E, /* 0xEC-0xEF */ + 0xCD, 0x78, 0xCD, 0x77, 0xAA, 0xBC, 0xCD, 0x76, /* 0xF0-0xF3 */ + 0xAA, 0xBD, 0xCD, 0x79, 0x00, 0x00, 0xCF, 0xE5, /* 0xF4-0xF7 */ + 0xAC, 0xDB, 0xAC, 0xDA, 0xCF, 0xE7, 0xCF, 0xE6, /* 0xF8-0xFB */ + 0xAC, 0xDF, 0x00, 0x00, 0xAC, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0xAC, 0xD9, 0x00, 0x00, 0xCF, 0xE1, /* 0x00-0x03 */ + 0xCF, 0xE2, 0xCF, 0xE3, 0x00, 0x00, 0xAC, 0xE0, /* 0x04-0x07 */ + 0xCF, 0xE0, 0xAC, 0xDC, 0xCF, 0xE4, 0xAC, 0xDD, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xCF, 0xD2, 0xD3, 0xD2, 0xD1, 0xD2, 0xD0, /* 0x10-0x13 */ + 0x00, 0x00, 0xD2, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD2, 0xD5, 0xD2, 0xD6, 0xD2, 0xCE, /* 0x18-0x1B */ + 0x00, 0x00, 0xD2, 0xCD, 0x00, 0x00, 0xAF, 0x75, /* 0x1C-0x1F */ + 0xAF, 0x76, 0x00, 0x00, 0xD2, 0xD7, 0xD2, 0xD2, /* 0x20-0x23 */ + 0x00, 0x00, 0xD6, 0xB0, 0x00, 0x00, 0xD2, 0xD8, /* 0x24-0x27 */ + 0xAF, 0x77, 0xAF, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD6, 0xAA, 0x00, 0x00, 0xD6, 0xA9, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD6, 0xAB, 0xD6, 0xAC, 0xD6, 0xAE, /* 0x30-0x33 */ + 0xD6, 0xAD, 0xD6, 0xB2, 0xB2, 0xB5, 0xB2, 0xB2, /* 0x34-0x37 */ + 0xB2, 0xB6, 0xD6, 0xA8, 0xB2, 0xB7, 0xD6, 0xB1, /* 0x38-0x3B */ + 0xB2, 0xB4, 0xD6, 0xAF, 0xB2, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDA, 0xBC, 0xDA, 0xBE, 0xDA, 0xBA, 0xDA, 0xBB, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBF, 0xDA, 0xC1, /* 0x48-0x4B */ + 0xDA, 0xC2, 0xDA, 0xBD, 0xDA, 0xC0, 0xB5, 0x74, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDB, 0x00, 0x00, /* 0x50-0x53 */ + 0xDE, 0xE0, 0xDE, 0xD8, 0xDE, 0xDC, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xDE, 0xE1, 0xDE, 0xDD, 0xB7, 0xFA, /* 0x58-0x5B */ + 0xB8, 0x43, 0x00, 0x00, 0xB7, 0xFD, 0xDE, 0xD9, /* 0x5C-0x5F */ + 0xDE, 0xDA, 0xBA, 0xCE, 0xB8, 0x46, 0xB7, 0xFE, /* 0x60-0x63 */ + 0x00, 0x00, 0xB8, 0x44, 0xB7, 0xFC, 0xDE, 0xDF, /* 0x64-0x67 */ + 0xB8, 0x45, 0xDE, 0xDE, 0xB8, 0x41, 0xB7, 0xFB, /* 0x68-0x6B */ + 0xB8, 0x42, 0xDE, 0xE2, 0xE2, 0xE6, 0xE2, 0xE8, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE2, 0xE3, 0xBA, 0xCC, 0xE2, 0xE9, 0xBA, 0xCD, /* 0x7C-0x7F */ + + 0xE2, 0xE7, 0xE2, 0xE2, 0xE2, 0xE5, 0xE2, 0xEA, /* 0x80-0x83 */ + 0xBA, 0xCB, 0xE2, 0xE4, 0x00, 0x00, 0xBD, 0x4E, /* 0x84-0x87 */ + 0xE6, 0xBF, 0xE6, 0xBE, 0x00, 0x00, 0xBD, 0x51, /* 0x88-0x8B */ + 0xBD, 0x4F, 0xE6, 0xBC, 0xBD, 0x4D, 0xE6, 0xBD, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBD, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x7D, 0x00, 0x00, 0xEA, 0xA1, /* 0x94-0x97 */ + 0x00, 0x00, 0xEA, 0x7E, 0xEA, 0x76, 0xEA, 0x7A, /* 0x98-0x9B */ + 0xEA, 0x79, 0xEA, 0x77, 0xBF, 0x66, 0xBF, 0x67, /* 0x9C-0x9F */ + 0xBF, 0x65, 0xEA, 0x78, 0xEA, 0x7B, 0xEA, 0x7C, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xBF, 0x68, 0x00, 0x00, 0xC1, 0x40, /* 0xA4-0xA7 */ + 0xED, 0xA3, 0x00, 0x00, 0xC0, 0xFC, 0xED, 0x7B, /* 0xA8-0xAB */ + 0xC0, 0xFE, 0xC1, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xC0, 0xFD, 0xED, 0xA2, 0xED, 0x7C, 0xC0, 0xFB, /* 0xB0-0xB3 */ + 0xED, 0xA1, 0xED, 0x7A, 0xED, 0x7E, 0xED, 0x7D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0x55, 0xC2, 0xA4, /* 0xB8-0xBB */ + 0xC2, 0xA5, 0xC2, 0xA2, 0x00, 0x00, 0xC2, 0xA3, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0x54, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF2, 0x7B, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF2, 0x79, 0xF2, 0x7A, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF4, 0x74, 0xF4, 0x77, 0xF4, 0x75, 0xF4, 0x76, /* 0xCC-0xCF */ + 0xF5, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xEF, /* 0xD0-0xD3 */ + 0xF7, 0xEB, 0xF8, 0xB4, 0x00, 0x00, 0xC5, 0xF7, /* 0xD4-0xD7 */ + 0xF8, 0xF8, 0xF8, 0xF9, 0xC6, 0x66, 0xA5, 0xD9, /* 0xD8-0xDB */ + 0xAC, 0xE1, 0x00, 0x00, 0xDA, 0xC3, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDE, 0xE3, 0x00, 0x00, 0xA5, 0xDA, 0xA8, 0x6F, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0xBE, 0x00, 0x00, 0xCF, 0xE8, /* 0xE4-0xE7 */ + 0xCF, 0xE9, 0xAF, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xDA, 0xC4, 0xB5, 0x75, 0xB8, 0x47, 0xC1, 0x42, /* 0xEC-0xEF */ + 0xED, 0xA4, 0xF2, 0x7C, 0xF4, 0x78, 0xA5, 0xDB, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA1, /* 0xF4-0xF7 */ + 0xCD, 0x7A, 0xCD, 0x7C, 0xCD, 0x7E, 0xCD, 0x7D, /* 0xF8-0xFB */ + 0xCD, 0x7B, 0xAA, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xE2, 0xCF, 0xF2, /* 0x00-0x03 */ + 0x00, 0x00, 0xCF, 0xED, 0xCF, 0xEA, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCF, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xAC, 0xE4, 0xAC, 0xE5, 0xCF, 0xF0, 0xCF, 0xEF, /* 0x0C-0x0F */ + 0xCF, 0xEE, 0xCF, 0xEB, 0xCF, 0xEC, 0xCF, 0xF3, /* 0x10-0x13 */ + 0xAC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xAF, 0x7C, 0x00, 0x00, 0xAF, 0xA4, /* 0x1C-0x1F */ + 0xAF, 0xA3, 0xD2, 0xE1, 0xD2, 0xDB, 0xD2, 0xD9, /* 0x20-0x23 */ + 0x00, 0x00, 0xAF, 0xA1, 0xD6, 0xB9, 0xAF, 0x7A, /* 0x24-0x27 */ + 0xD2, 0xDE, 0xD2, 0xE2, 0xD2, 0xE4, 0xD2, 0xE0, /* 0x28-0x2B */ + 0xD2, 0xDA, 0xAF, 0xA2, 0xD2, 0xDF, 0xD2, 0xDD, /* 0x2C-0x2F */ + 0xAF, 0x79, 0xD2, 0xE5, 0xAF, 0xA5, 0xD2, 0xE3, /* 0x30-0x33 */ + 0xAF, 0x7D, 0xD2, 0xDC, 0x00, 0x00, 0xAF, 0x7E, /* 0x34-0x37 */ + 0xAF, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB2, 0xB9, /* 0x40-0x43 */ + 0x00, 0x00, 0xD6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD6, 0xB3, 0xD6, 0xB5, 0xD6, 0xB7, 0x00, 0x00, /* 0x48-0x4B */ + 0xD6, 0xB8, 0xD6, 0xB6, 0xB2, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD6, 0xBB, 0x00, 0x00, 0xD6, 0xB4, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xDA, 0xC8, 0xB5, 0x76, 0xDA, 0xD0, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDA, 0xC5, 0x00, 0x00, 0xDA, 0xD1, 0x00, 0x00, /* 0x60-0x63 */ + 0xDA, 0xC6, 0xDA, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDA, 0xCF, 0xDA, 0xCE, 0xDA, 0xCB, 0xB2, 0xB8, /* 0x68-0x6B */ + 0xB5, 0x77, 0xDA, 0xC9, 0xDA, 0xCC, 0xB5, 0x78, /* 0x6C-0x6F */ + 0xDA, 0xCD, 0xDA, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDE, 0xEE, 0x00, 0x00, 0xDE, 0xF2, /* 0x78-0x7B */ + 0xB8, 0x4E, 0x00, 0x00, 0xE2, 0xF0, 0xB8, 0x51, /* 0x7C-0x7F */ + + 0xDE, 0xF0, 0xF9, 0xD6, 0x00, 0x00, 0xDE, 0xED, /* 0x80-0x83 */ + 0xDE, 0xE8, 0xDE, 0xEA, 0xDE, 0xEB, 0xDE, 0xE4, /* 0x84-0x87 */ + 0x00, 0x00, 0xB8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xB8, 0x4C, 0x00, 0x00, 0xB8, 0x48, 0xDE, 0xE7, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB8, 0x4F, 0x00, 0x00, 0xB8, 0x50, /* 0x90-0x93 */ + 0xDE, 0xE6, 0xDE, 0xE9, 0xDE, 0xF1, 0xB8, 0x4A, /* 0x94-0x97 */ + 0xB8, 0x4B, 0xDE, 0xEF, 0xDE, 0xE5, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF2, 0xBA, 0xD0, /* 0x9C-0x9F */ + 0xE2, 0xF4, 0xDE, 0xEC, 0xE2, 0xF6, 0xBA, 0xD4, /* 0xA0-0xA3 */ + 0xE2, 0xF7, 0xE2, 0xF3, 0x00, 0x00, 0xBA, 0xD1, /* 0xA4-0xA7 */ + 0xE2, 0xEF, 0xBA, 0xD3, 0xE2, 0xEC, 0xE2, 0xF1, /* 0xA8-0xAB */ + 0xE2, 0xF5, 0xE2, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xB8, 0x49, 0x00, 0x00, 0xE2, 0xEB, 0xBA, 0xD2, /* 0xB0-0xB3 */ + 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0x54, 0xE6, 0xC1, /* 0xB8-0xBB */ + 0xBD, 0x58, 0x00, 0x00, 0xBD, 0x56, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBA, 0xCF, 0x00, 0x00, 0xE6, 0xC8, /* 0xC0-0xC3 */ + 0xE6, 0xC9, 0xBD, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE6, 0xC7, 0xE6, 0xCA, 0xBD, 0x55, 0xBD, 0x52, /* 0xC8-0xCB */ + 0xE6, 0xC3, 0xE6, 0xC0, 0xE6, 0xC5, 0xE6, 0xC2, /* 0xCC-0xCF */ + 0xBD, 0x59, 0xE6, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE6, 0xC6, 0xBD, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0x6A, 0xEA, 0xA8, /* 0xD8-0xDB */ + 0x00, 0x00, 0xEA, 0xA2, 0xEA, 0xA6, 0xEA, 0xAC, /* 0xDC-0xDF */ + 0xEA, 0xAD, 0xEA, 0xA9, 0xEA, 0xAA, 0xEA, 0xA7, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEA, 0xA4, 0x00, 0x00, 0xBF, 0x6C, /* 0xE4-0xE7 */ + 0xBF, 0x69, 0xEA, 0xA3, 0xEA, 0xA5, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBF, 0x6B, 0xEA, 0xAB, 0x00, 0x00, 0xC1, 0x46, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xAA, 0xED, 0xA5, /* 0xF0-0xF3 */ + 0xC1, 0x45, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x43, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xED, 0xAC, 0xC1, 0x44, 0xED, 0xA8, /* 0xF8-0xFB */ + 0xED, 0xA9, 0xED, 0xA6, 0xED, 0xAD, 0xF0, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0xC1, 0x47, 0xED, 0xA7, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xAE, 0xED, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF0, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF0, 0x57, 0x00, 0x00, 0xC2, 0xA6, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF0, 0x5B, 0xF0, 0x5D, 0xF0, 0x5C, 0xF0, 0x58, /* 0x10-0x13 */ + 0xF0, 0x59, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA3, /* 0x14-0x17 */ + 0x00, 0x00, 0xC3, 0xAA, 0x00, 0x00, 0xF2, 0x7E, /* 0x18-0x1B */ + 0xF2, 0xA2, 0xF2, 0x7D, 0xF2, 0xA4, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF2, 0xA1, 0x00, 0x00, 0xF4, 0x7A, /* 0x20-0x23 */ + 0xF4, 0x7D, 0xF4, 0x79, 0xC4, 0x71, 0xF4, 0x7B, /* 0x24-0x27 */ + 0xF4, 0x7C, 0xF4, 0x7E, 0xC4, 0x72, 0xC4, 0x74, /* 0x28-0x2B */ + 0xC4, 0x73, 0xF5, 0xE1, 0x00, 0x00, 0xF5, 0xE3, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF5, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xF8, 0xB5, 0xF8, 0xFA, 0xA5, 0xDC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xCB, 0x72, 0xAA, 0xC0, 0xCD, 0xA3, /* 0x3C-0x3F */ + 0xAA, 0xC1, 0xAA, 0xC2, 0xCD, 0xA2, 0x00, 0x00, /* 0x40-0x43 */ + 0xCF, 0xF8, 0xCF, 0xF7, 0xAC, 0xE6, 0xAC, 0xE9, /* 0x44-0x47 */ + 0xAC, 0xE8, 0xAC, 0xE7, 0xCF, 0xF4, 0xCF, 0xF6, /* 0x48-0x4B */ + 0xCF, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xE8, /* 0x4C-0x4F */ + 0xAF, 0xA7, 0xD2, 0xEC, 0xD2, 0xEB, 0xD2, 0xEA, /* 0x50-0x53 */ + 0xD2, 0xE6, 0xAF, 0xA6, 0xAF, 0xAA, 0xAF, 0xAD, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xAF, 0xAE, 0xD2, 0xE7, /* 0x58-0x5B */ + 0xD2, 0xE9, 0xAF, 0xAC, 0xAF, 0xAB, 0xAF, 0xA9, /* 0x5C-0x5F */ + 0xAF, 0xA8, 0xD6, 0xC2, 0x00, 0x00, 0xD6, 0xC0, /* 0x60-0x63 */ + 0xD6, 0xBC, 0xB2, 0xBB, 0x00, 0x00, 0xD6, 0xBD, /* 0x64-0x67 */ + 0xB2, 0xBC, 0xD6, 0xBE, 0xD6, 0xBF, 0xD6, 0xC1, /* 0x68-0x6B */ + 0x00, 0x00, 0xB2, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDA, 0xD5, 0x00, 0x00, 0xDA, 0xD4, 0xDA, 0xD3, /* 0x70-0x73 */ + 0xDA, 0xD2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDE, 0xF6, 0xB8, 0x52, 0x00, 0x00, /* 0x78-0x7B */ + 0xDE, 0xF3, 0xDE, 0xF5, 0x00, 0x00, 0xB8, 0x53, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xB8, 0x54, 0xDE, 0xF4, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE3, 0x41, 0x00, 0x00, 0xE2, 0xF9, 0xE2, 0xFA, /* 0x88-0x8B */ + 0x00, 0x00, 0xBA, 0xD7, 0xBA, 0xD5, 0xBA, 0xD6, /* 0x8C-0x8F */ + 0xE3, 0x43, 0x00, 0x00, 0xE3, 0x42, 0xE2, 0xFE, /* 0x90-0x93 */ + 0xE2, 0xFD, 0xE2, 0xFC, 0xE2, 0xFB, 0xE3, 0x40, /* 0x94-0x97 */ + 0xE2, 0xF8, 0x00, 0x00, 0xE6, 0xCB, 0xE6, 0xD0, /* 0x98-0x9B */ + 0xE6, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE6, 0xCD, 0xE6, 0xCC, 0xE6, 0xCF, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEA, 0xAE, 0x00, 0x00, 0xBF, 0x6D, 0xC1, 0x48, /* 0xA4-0xA7 */ + 0xED, 0xB0, 0x00, 0x00, 0xC1, 0x49, 0xED, 0xAF, /* 0xA8-0xAB */ + 0xF0, 0x5F, 0xF0, 0x5E, 0xC2, 0xA7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF2, 0xA5, 0xC3, 0xAB, 0xF4, 0xA1, 0xC5, 0xA1, /* 0xB0-0xB3 */ + 0xF6, 0xF7, 0x00, 0x00, 0xF8, 0xB7, 0xF8, 0xB6, /* 0xB4-0xB7 */ + 0xC9, 0xA8, 0xAC, 0xEA, 0xAC, 0xEB, 0xD6, 0xC3, /* 0xB8-0xBB */ + 0x00, 0x00, 0xB8, 0x56, 0xA5, 0xDD, 0xA8, 0x72, /* 0xBC-0xBF */ + 0xA8, 0x71, 0xA8, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCD, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xAA, 0xC4, 0xAA, 0xC3, 0x00, 0x00, 0xAC, 0xEE, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCF, 0xFA, 0xCF, 0xFD, 0xCF, 0xFB, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAC, 0xEC, 0xAC, 0xED, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xCF, 0xF9, 0xCF, 0xFC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xAF, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD2, 0xF3, 0xD2, 0xF5, 0xD2, 0xF4, 0xAF, 0xB2, /* 0xDC-0xDF */ + 0xD2, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xAF, 0xB0, /* 0xE0-0xE3 */ + 0xAF, 0xAF, 0x00, 0x00, 0xAF, 0xB3, 0xAF, 0xB1, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xAF, 0xB4, 0xD2, 0xF2, 0xD2, 0xED, /* 0xE8-0xEB */ + 0xD2, 0xEE, 0xD2, 0xF1, 0xD2, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC6, 0xD6, 0xC7, /* 0xF4-0xF7 */ + 0xD6, 0xC5, 0x00, 0x00, 0xD6, 0xC4, 0xB2, 0xBE, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7A[512] = { + 0xB5, 0x7D, 0x00, 0x00, 0xDA, 0xD6, 0xDA, 0xD8, /* 0x00-0x03 */ + 0xDA, 0xDA, 0xB5, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xB5, 0x7A, 0x00, 0x00, 0xDA, 0xD7, 0xB5, 0x7B, /* 0x08-0x0B */ + 0xDA, 0xD9, 0xB5, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xDF, 0x41, 0xDE, 0xF7, 0xDE, 0xFA, 0xDE, 0xFE, /* 0x10-0x13 */ + 0xB8, 0x5A, 0xDE, 0xFC, 0x00, 0x00, 0xDE, 0xFB, /* 0x14-0x17 */ + 0xDE, 0xF8, 0xDE, 0xF9, 0xB8, 0x58, 0xDF, 0x40, /* 0x18-0x1B */ + 0xB8, 0x57, 0x00, 0x00, 0xB8, 0x5C, 0xB8, 0x5B, /* 0x1C-0x1F */ + 0xB8, 0x59, 0x00, 0x00, 0xDE, 0xFD, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x49, 0x00, 0x00, /* 0x24-0x27 */ + 0xE3, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x44, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0xD8, 0xE3, 0x47, /* 0x2C-0x2F */ + 0xE3, 0x46, 0xBA, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBD, 0x5E, /* 0x34-0x37 */ + 0x00, 0x00, 0xE6, 0xD2, 0x00, 0x00, 0xBD, 0x5F, /* 0x38-0x3B */ + 0xBD, 0x5B, 0xBD, 0x5D, 0x00, 0x00, 0xBD, 0x5A, /* 0x3C-0x3F */ + 0xBD, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xEA, 0xAF, 0x00, 0x00, 0xBF, 0x70, 0xEA, 0xB1, /* 0x44-0x47 */ + 0xEA, 0xB0, 0x00, 0x00, 0xE3, 0x45, 0xBF, 0x72, /* 0x48-0x4B */ + 0xBF, 0x71, 0xBF, 0x6E, 0xBF, 0x6F, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xED, 0xB5, 0x00, 0x00, 0xED, 0xB3, 0xC1, 0x4A, /* 0x54-0x57 */ + 0xED, 0xB4, 0x00, 0x00, 0xED, 0xB6, 0xED, 0xB2, /* 0x58-0x5B */ + 0xED, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x60, /* 0x5C-0x5F */ + 0xC2, 0xAA, 0xC2, 0xA8, 0xC2, 0xA9, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA6, /* 0x64-0x67 */ + 0xF2, 0xA7, 0xC3, 0xAD, 0x00, 0x00, 0xC3, 0xAC, /* 0x68-0x6B */ + 0xF4, 0xA3, 0xF4, 0xA4, 0xF4, 0xA2, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF6, 0xF8, 0xF6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA5, 0xDE, 0xCA, 0x48, 0xA8, 0x73, 0x00, 0x00, /* 0x74-0x77 */ + 0xCD, 0xA5, 0xAA, 0xC6, 0xAA, 0xC5, 0xCD, 0xA6, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x40, 0xAC, 0xEF, /* 0x7C-0x7F */ + + 0xCF, 0xFE, 0xAC, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xAF, 0xB6, 0xD2, 0xF8, 0xD2, 0xF6, 0xD2, 0xFC, /* 0x84-0x87 */ + 0xAF, 0xB7, 0xD2, 0xF7, 0xD2, 0xFB, 0xD2, 0xF9, /* 0x88-0x8B */ + 0xD2, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC8, /* 0x8C-0x8F */ + 0xD6, 0xCA, 0x00, 0x00, 0xB2, 0xBF, 0x00, 0x00, /* 0x90-0x93 */ + 0xD6, 0xC9, 0xB2, 0xC0, 0xB5, 0xA2, 0xB5, 0xA1, /* 0x94-0x97 */ + 0xB5, 0x7E, 0xDA, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0x44, 0xB8, 0x5D, /* 0x9C-0x9F */ + 0xB8, 0x5E, 0x00, 0x00, 0xDF, 0x43, 0xDF, 0x42, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE3, 0x4A, 0xBA, 0xDB, 0xBA, 0xDA, 0xE3, 0x4B, /* 0xA8-0xAB */ + 0xE3, 0x4C, 0x00, 0x00, 0xBD, 0x61, 0xBD, 0x60, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEA, 0xB5, 0xE6, 0xD3, 0xE6, 0xD5, /* 0xB0-0xB3 */ + 0xE6, 0xD4, 0xEA, 0xB4, 0xEA, 0xB2, 0xEA, 0xB6, /* 0xB4-0xB7 */ + 0xEA, 0xB3, 0x00, 0x00, 0xBF, 0x73, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xB7, 0xC1, 0x4B, /* 0xBC-0xBF */ + 0xED, 0xB8, 0xED, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xC2, 0xAB, 0xC2, 0xAC, 0x00, 0x00, 0xC4, 0x75, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xD1, 0xA5, 0xDF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD0, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD2, 0xFD, 0xAF, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xBA, /* 0xDC-0xDF */ + 0xB3, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xB5, 0xA4, /* 0xE0-0xE3 */ + 0xDA, 0xDD, 0xB5, 0xA3, 0xDA, 0xDC, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x45, /* 0xE8-0xEB */ + 0x00, 0x00, 0xBA, 0xDC, 0xE3, 0x4D, 0xBA, 0xDD, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xC4, 0x76, 0xF4, 0xA5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA6, 0xCB, 0xAA, 0xC7, 0xCD, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xAC, 0xF2, 0x00, 0x00, 0xAC, 0xF1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0xD0, 0x42, 0xD0, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD3, 0x40, 0xD3, 0x42, 0xAF, 0xB9, 0x00, 0x00, /* 0x04-0x07 */ + 0xD3, 0x44, 0xD3, 0x47, 0xD3, 0x45, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0x46, 0xD3, 0x43, /* 0x0C-0x0F */ + 0xD2, 0xFE, 0xAF, 0xBA, 0xD3, 0x48, 0xD3, 0x41, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD6, 0xD3, 0xB2, 0xC6, 0xD6, 0xDC, 0xB2, 0xC3, /* 0x18-0x1B */ + 0x00, 0x00, 0xD6, 0xD5, 0xB2, 0xC7, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB2, 0xC1, 0x00, 0x00, 0xD6, 0xD0, 0xD6, 0xDD, /* 0x20-0x23 */ + 0xD6, 0xD1, 0xD6, 0xCE, 0xB2, 0xC5, 0x00, 0x00, /* 0x24-0x27 */ + 0xB2, 0xC2, 0x00, 0x00, 0xD6, 0xD4, 0xD6, 0xD7, /* 0x28-0x2B */ + 0xB2, 0xC4, 0xD6, 0xD8, 0xB2, 0xC8, 0xD6, 0xD9, /* 0x2C-0x2F */ + 0xD6, 0xCF, 0xD6, 0xD6, 0xD6, 0xDA, 0xD6, 0xD2, /* 0x30-0x33 */ + 0xD6, 0xCD, 0xD6, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xD6, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDA, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDA, 0xE0, 0xDA, 0xE6, 0xB5, 0xA7, 0xD6, 0xCC, /* 0x44-0x47 */ + 0xDA, 0xE1, 0xB5, 0xA5, 0xDA, 0xDE, 0xB5, 0xAC, /* 0x48-0x4B */ + 0xDA, 0xE2, 0xB5, 0xAB, 0xDA, 0xE3, 0xB5, 0xAD, /* 0x4C-0x4F */ + 0xB5, 0xA8, 0xB5, 0xAE, 0xB5, 0xA9, 0x00, 0x00, /* 0x50-0x53 */ + 0xB5, 0xAA, 0x00, 0x00, 0xB5, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0xDA, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB8, 0x61, 0xDF, 0x50, 0x00, 0x00, 0xDF, 0x53, /* 0x60-0x63 */ + 0xDF, 0x47, 0xDF, 0x4C, 0xDF, 0x46, 0xB8, 0x63, /* 0x64-0x67 */ + 0x00, 0x00, 0xDF, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xDF, 0x48, 0xB8, 0x62, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDF, 0x4F, 0xDF, 0x4E, 0xDF, 0x4B, 0xDF, 0x4D, /* 0x70-0x73 */ + 0xDF, 0x49, 0xBA, 0xE1, 0xDF, 0x52, 0xB8, 0x5F, /* 0x74-0x77 */ + 0xDF, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5D, 0x00, 0x00, /* 0x80-0x83 */ + 0xBA, 0xE8, 0xE3, 0x58, 0x00, 0x00, 0xBA, 0xE7, /* 0x84-0x87 */ + 0xE3, 0x4E, 0x00, 0x00, 0xE3, 0x50, 0xBA, 0xE0, /* 0x88-0x8B */ + 0xE3, 0x55, 0xE3, 0x54, 0xE3, 0x57, 0xBA, 0xE5, /* 0x8C-0x8F */ + 0xE3, 0x52, 0xE3, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xBA, 0xE4, 0xBA, 0xDF, 0xE3, 0x53, 0xBA, 0xE2, /* 0x94-0x97 */ + 0xE3, 0x59, 0xE3, 0x5B, 0x00, 0x00, 0xE3, 0x56, /* 0x98-0x9B */ + 0xE3, 0x4F, 0xBA, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xBD, 0x69, 0xBA, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE6, 0xD9, 0xBD, 0x62, 0x00, 0x00, 0xE6, 0xDB, /* 0xAC-0xAF */ + 0x00, 0x00, 0xBD, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBD, 0x65, 0xE6, 0xDE, 0x00, 0x00, 0xE6, 0xD6, /* 0xB4-0xB7 */ + 0xBA, 0xE6, 0xE6, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xB8, 0x60, 0xBD, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBD, 0x64, 0x00, 0x00, 0xBD, 0x66, 0xBD, 0x67, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xBF, 0x76, 0xE6, 0xDD, 0xE6, 0xD7, /* 0xC8-0xCB */ + 0xBD, 0x6A, 0x00, 0x00, 0xE6, 0xDA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0xC0, 0xEA, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEA, 0xC5, 0xBF, 0x74, 0xEA, 0xBD, 0xBF, 0x78, /* 0xD8-0xDB */ + 0xEA, 0xC3, 0xEA, 0xBA, 0xEA, 0xB7, 0xEA, 0xC6, /* 0xDC-0xDF */ + 0xC1, 0x51, 0xBF, 0x79, 0xEA, 0xC2, 0xEA, 0xB8, /* 0xE0-0xE3 */ + 0xBF, 0x77, 0xEA, 0xBC, 0xBF, 0x7B, 0xEA, 0xB9, /* 0xE4-0xE7 */ + 0xEA, 0xBE, 0xBF, 0x7A, 0xEA, 0xC1, 0xEA, 0xC4, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xED, 0xCB, 0xED, 0xCC, 0xED, 0xBC, 0xED, 0xC3, /* 0xF0-0xF3 */ + 0xED, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x4F, /* 0xF4-0xF7 */ + 0xED, 0xC8, 0xEA, 0xBF, 0x00, 0x00, 0xED, 0xBF, /* 0xF8-0xFB */ + 0x00, 0x00, 0xED, 0xC9, 0xC1, 0x4E, 0xED, 0xBE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7C[512] = { + 0xED, 0xBD, 0xED, 0xC7, 0xED, 0xC4, 0xED, 0xC6, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xBA, 0xED, 0xCA, 0xC1, 0x4C, /* 0x04-0x07 */ + 0x00, 0x00, 0xED, 0xC5, 0xED, 0xCE, 0xED, 0xC2, /* 0x08-0x0B */ + 0xC1, 0x50, 0xC1, 0x4D, 0xED, 0xC0, 0xED, 0xBB, /* 0x0C-0x0F */ + 0xED, 0xCD, 0xBF, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF0, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xF0, 0x61, 0xF0, 0x67, 0xC2, 0xB0, 0xF0, 0x65, /* 0x1C-0x1F */ + 0xF0, 0x64, 0xC2, 0xB2, 0xF0, 0x6A, 0xC2, 0xB1, /* 0x20-0x23 */ + 0x00, 0x00, 0xF0, 0x6B, 0xF0, 0x68, 0xC2, 0xAE, /* 0x24-0x27 */ + 0xF0, 0x69, 0xF0, 0x62, 0xC2, 0xAF, 0xC2, 0xAD, /* 0x28-0x2B */ + 0xF2, 0xAB, 0xF0, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF0, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xB2, /* 0x34-0x37 */ + 0xC3, 0xB0, 0xF2, 0xAA, 0x00, 0x00, 0xF2, 0xAC, /* 0x38-0x3B */ + 0xF2, 0xA9, 0xC3, 0xB1, 0xC3, 0xAE, 0xC3, 0xAF, /* 0x3C-0x3F */ + 0xC3, 0xB3, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x78, /* 0x40-0x43 */ + 0x00, 0x00, 0xF4, 0xAA, 0x00, 0x00, 0xF4, 0xA9, /* 0x44-0x47 */ + 0xF4, 0xA7, 0xF4, 0xA6, 0xF4, 0xA8, 0x00, 0x00, /* 0x48-0x4B */ + 0xC4, 0x77, 0xC4, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xC4, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE5, /* 0x50-0x53 */ + 0xF5, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xFA, /* 0x54-0x57 */ + 0x00, 0x00, 0xF6, 0xFC, 0xF6, 0xFE, 0xF6, 0xFD, /* 0x58-0x5B */ + 0xF6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xA3, /* 0x5C-0x5F */ + 0xC5, 0xA2, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xD3, /* 0x60-0x63 */ + 0xC5, 0xD2, 0xC5, 0xD4, 0xF7, 0xED, 0xF7, 0xEC, /* 0x64-0x67 */ + 0x00, 0x00, 0xF8, 0xFB, 0xF8, 0xB8, 0xF8, 0xFC, /* 0x68-0x6B */ + 0xC6, 0x58, 0x00, 0x00, 0xC6, 0x59, 0xF9, 0x6D, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x7E, 0xA6, 0xCC, /* 0x70-0x73 */ + 0x00, 0x00, 0xCD, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x44, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xAC, 0xF3, 0x00, 0x00, 0xD0, 0x47, /* 0x7C-0x7F */ + + 0xD0, 0x48, 0xD0, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xD3, 0x49, 0xD3, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD3, 0x4D, 0xAF, 0xBB, 0xD3, 0x4B, 0x00, 0x00, /* 0x88-0x8B */ + 0xD3, 0x4C, 0xD3, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD3, 0x4A, 0xB2, 0xC9, 0x00, 0x00, /* 0x90-0x93 */ + 0xD6, 0xDE, 0xB2, 0xCB, 0xD6, 0xE0, 0xB2, 0xCA, /* 0x94-0x97 */ + 0xD6, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xE8, 0xB5, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xDA, 0xEA, 0xDA, 0xE7, 0xD6, 0xE1, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xB5, 0xB0, 0x00, 0x00, 0xF9, 0xDB, /* 0xA4-0xA7 */ + 0xDA, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x56, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB8, 0x64, 0xDF, 0x54, 0xB8, 0x65, /* 0xB0-0xB3 */ + 0xDF, 0x55, 0xB8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xBA, 0xE9, 0xE3, 0x61, 0xE3, 0x5E, /* 0xB8-0xBB */ + 0xE3, 0x60, 0xBA, 0xEA, 0xBA, 0xEB, 0xE3, 0x5F, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE6, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE6, 0xE0, 0x00, 0x00, 0xBD, 0x6B, 0xE6, 0xE2, /* 0xC8-0xCB */ + 0xE6, 0xE1, 0x00, 0x00, 0xA2, 0x61, 0x00, 0x00, /* 0xCC-0xCF */ + 0xEA, 0xCA, 0xEA, 0xCB, 0xEA, 0xC7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0xC8, 0xBF, 0x7C, 0xBF, 0x7D, 0xEA, 0xC9, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xC1, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC1, 0x53, 0xC1, 0x58, 0xC1, 0x54, 0xC1, 0x56, /* 0xDC-0xDF */ + 0xC1, 0x52, 0x00, 0x00, 0xC1, 0x55, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC2, 0xB3, /* 0xE4-0xE7 */ + 0xED, 0xCF, 0x00, 0x00, 0xF2, 0xAE, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF2, 0xAD, 0x00, 0x00, 0xF4, 0xAB, 0xC4, 0x7A, /* 0xEC-0xEF */ + 0xC4, 0x7B, 0xF7, 0x41, 0xF5, 0xE6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xF7, 0x40, 0x00, 0x00, 0xF8, 0xFD, 0xF9, 0xA4, /* 0xF4-0xF7 */ + 0xA6, 0xCD, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x74, /* 0xF8-0xFB */ + 0x00, 0x00, 0xCD, 0xA9, 0xAA, 0xC8, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xAC, 0xF6, 0xD0, 0x4C, 0xAC, 0xF4, 0xD0, 0x4A, /* 0x00-0x03 */ + 0xAC, 0xF9, 0xAC, 0xF5, 0xAC, 0xFA, 0xAC, 0xF8, /* 0x04-0x07 */ + 0xD0, 0x4B, 0xAC, 0xF7, 0xAF, 0xBF, 0xAF, 0xBE, /* 0x08-0x0B */ + 0xD3, 0x5A, 0xAF, 0xC7, 0xD3, 0x53, 0xD3, 0x59, /* 0x0C-0x0F */ + 0xAF, 0xC3, 0xD3, 0x52, 0xD3, 0x58, 0xD3, 0x56, /* 0x10-0x13 */ + 0xAF, 0xC2, 0xAF, 0xC4, 0xD3, 0x55, 0xAF, 0xBD, /* 0x14-0x17 */ + 0xD3, 0x54, 0xAF, 0xC8, 0xAF, 0xC5, 0xAF, 0xC9, /* 0x18-0x1B */ + 0xAF, 0xC6, 0xD3, 0x51, 0xD3, 0x50, 0xD3, 0x57, /* 0x1C-0x1F */ + 0xAF, 0xC0, 0xAF, 0xBC, 0xAF, 0xC1, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD6, 0xF0, 0xD6, 0xE9, 0x00, 0x00, 0xB5, 0xB5, /* 0x28-0x2B */ + 0xD6, 0xE8, 0x00, 0x00, 0xB2, 0xCF, 0xB2, 0xD6, /* 0x2C-0x2F */ + 0xB2, 0xD3, 0xB2, 0xD9, 0xB2, 0xD8, 0xB2, 0xD4, /* 0x30-0x33 */ + 0x00, 0x00, 0xD6, 0xE2, 0xD6, 0xE5, 0x00, 0x00, /* 0x34-0x37 */ + 0xD6, 0xE4, 0xB2, 0xD0, 0xD6, 0xE6, 0xD6, 0xEF, /* 0x38-0x3B */ + 0xB2, 0xD1, 0xD6, 0xE3, 0xD6, 0xEC, 0xD6, 0xED, /* 0x3C-0x3F */ + 0xB2, 0xD2, 0xD6, 0xEA, 0xB2, 0xD7, 0xB2, 0xCD, /* 0x40-0x43 */ + 0xB2, 0xD5, 0xD6, 0xE7, 0xB2, 0xCC, 0xD6, 0xEB, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xEE, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xFB, 0xDA, 0xF2, /* 0x4C-0x4F */ + 0xB5, 0xB2, 0xDA, 0xF9, 0xDA, 0xF6, 0xDA, 0xEE, /* 0x50-0x53 */ + 0xDA, 0xF7, 0xB5, 0xB4, 0xDA, 0xEF, 0x00, 0x00, /* 0x54-0x57 */ + 0xDA, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xB8, 0x6C, /* 0x58-0x5B */ + 0xDA, 0xF4, 0x00, 0x00, 0xB5, 0xB1, 0xDA, 0xFA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB5, 0xB8, 0xB5, 0xBA, 0xDA, 0xED, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0xB9, 0xDA, 0xF0, /* 0x64-0x67 */ + 0xB5, 0xB3, 0xDA, 0xF8, 0xDA, 0xF1, 0xDA, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0xDA, 0xF3, 0xB5, 0xB6, 0xDA, 0xEC, /* 0x6C-0x6F */ + 0xB5, 0xBB, 0xB2, 0xCE, 0xB5, 0xB7, 0xB5, 0xBC, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0x68, 0xDF, 0x5D, 0xDF, 0x5F, /* 0x78-0x7B */ + 0xDF, 0x61, 0xDF, 0x65, 0x00, 0x00, 0xDF, 0x5B, /* 0x7C-0x7F */ + + 0xDF, 0x59, 0xB8, 0x6A, 0x00, 0x00, 0xDF, 0x60, /* 0x80-0x83 */ + 0xDF, 0x64, 0xDF, 0x5C, 0xDF, 0x58, 0x00, 0x00, /* 0x84-0x87 */ + 0xDF, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xDF, 0x62, 0xDF, 0x5A, 0xDF, 0x5E, 0xB8, 0x6B, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB8, 0x69, 0xDF, 0x66, 0xB8, 0x67, /* 0x90-0x93 */ + 0xDF, 0x63, 0x00, 0x00, 0xE3, 0x72, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xBA, 0xEE, 0xE3, 0x6A, 0xBD, 0x78, 0xE3, 0x74, /* 0x9C-0x9F */ + 0xBA, 0xF1, 0xE3, 0x78, 0xBA, 0xF7, 0xE3, 0x65, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x75, 0xE3, 0x62, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE3, 0x77, 0xE3, 0x66, 0x00, 0x00, /* 0xA8-0xAB */ + 0xBA, 0xFE, 0xBA, 0xFB, 0xE3, 0x76, 0xE3, 0x70, /* 0xAC-0xAF */ + 0xBA, 0xED, 0xBA, 0xF5, 0xBA, 0xF4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBA, 0xF3, 0xBA, 0xF9, 0x00, 0x00, 0xE3, 0x63, /* 0xB4-0xB7 */ + 0xBA, 0xFA, 0xE3, 0x71, 0xBA, 0xF6, 0xBA, 0xEC, /* 0xB8-0xBB */ + 0xE3, 0x73, 0xBA, 0xEF, 0xBA, 0xF0, 0xBA, 0xF8, /* 0xBC-0xBF */ + 0xE3, 0x68, 0xE3, 0x67, 0xE3, 0x64, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE3, 0x6C, 0xE3, 0x69, 0xE3, 0x6D, 0xBA, 0xFD, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE3, 0x79, 0xBA, 0xF2, 0xE3, 0x6E, /* 0xC8-0xCB */ + 0xE3, 0x6F, 0x00, 0x00, 0xE3, 0x6B, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0xFC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, /* 0xD4-0xD7 */ + 0xBD, 0x70, 0xBD, 0x79, 0xBD, 0x75, 0xE6, 0xE4, /* 0xD8-0xDB */ + 0x00, 0x00, 0xBD, 0x72, 0xBD, 0x76, 0xE6, 0xF0, /* 0xDC-0xDF */ + 0xBD, 0x6C, 0xE6, 0xE8, 0x00, 0x00, 0xBD, 0x74, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xEB, 0xE6, 0xE6, /* 0xE4-0xE7 */ + 0xBD, 0x73, 0xBD, 0x77, 0xE6, 0xE5, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBD, 0x71, 0x00, 0x00, 0xE6, 0xEF, 0xBD, 0x6E, /* 0xEC-0xEF */ + 0xE6, 0xEE, 0xE6, 0xED, 0xBD, 0x7A, 0xE5, 0x72, /* 0xF0-0xF3 */ + 0xBD, 0x6D, 0x00, 0x00, 0xE6, 0xEC, 0xE6, 0xE3, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xBD, 0x7B, 0xE6, 0xEA, 0xBD, 0x6F, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xBF, 0xA2, 0xBF, 0xA7, 0xBF, 0x7E, 0xEA, 0xD8, /* 0x08-0x0B */ + 0xEA, 0xCF, 0xEA, 0xDB, 0xEA, 0xD3, 0xEA, 0xD9, /* 0x0C-0x0F */ + 0xBF, 0xA8, 0xBF, 0xA1, 0xEA, 0xCC, 0xEA, 0xD2, /* 0x10-0x13 */ + 0xEA, 0xDC, 0xEA, 0xD5, 0xEA, 0xDA, 0xEA, 0xCE, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD6, 0xBF, 0xA3, /* 0x18-0x1B */ + 0xEA, 0xD4, 0xBF, 0xA6, 0xBF, 0xA5, 0xEA, 0xD0, /* 0x1C-0x1F */ + 0xEA, 0xD1, 0xEA, 0xCD, 0xEA, 0xD7, 0xBF, 0xA4, /* 0x20-0x23 */ + 0xEA, 0xDE, 0xEA, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xED, 0xDA, 0xED, 0xD6, 0xC1, 0x5F, /* 0x28-0x2B */ + 0x00, 0x00, 0xED, 0xD0, 0xC1, 0x59, 0xC1, 0x69, /* 0x2C-0x2F */ + 0xED, 0xDC, 0xC1, 0x61, 0xC1, 0x5D, 0xED, 0xD3, /* 0x30-0x33 */ + 0xC1, 0x64, 0xC1, 0x67, 0xED, 0xDE, 0xC1, 0x5C, /* 0x34-0x37 */ + 0xED, 0xD5, 0xC1, 0x65, 0xED, 0xE0, 0xED, 0xDD, /* 0x38-0x3B */ + 0xED, 0xD1, 0xC1, 0x60, 0xC1, 0x5A, 0xC1, 0x68, /* 0x3C-0x3F */ + 0xED, 0xD8, 0xC1, 0x63, 0xED, 0xD2, 0xC1, 0x5E, /* 0x40-0x43 */ + 0xED, 0xDF, 0xC1, 0x62, 0xC1, 0x5B, 0xED, 0xD9, /* 0x44-0x47 */ + 0xC1, 0x66, 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xED, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF0, 0x6E, 0xF0, 0x74, 0xC2, 0xB9, 0xF0, 0x77, /* 0x50-0x53 */ + 0xC2, 0xB4, 0xC2, 0xB5, 0xF0, 0x6F, 0xF0, 0x76, /* 0x54-0x57 */ + 0xF0, 0x71, 0xC2, 0xBA, 0xC2, 0xB7, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0x6D, 0x00, 0x00, 0xC2, 0xB6, 0xF0, 0x73, /* 0x5C-0x5F */ + 0xF0, 0x75, 0xC2, 0xB8, 0xF0, 0x72, 0xF0, 0x70, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF2, 0xB8, 0xC3, 0xB7, 0xC3, 0xB8, 0xC3, 0xB4, /* 0x68-0x6B */ + 0x00, 0x00, 0xC3, 0xB5, 0x00, 0x00, 0xF2, 0xB4, /* 0x6C-0x6F */ + 0xF2, 0xB2, 0x00, 0x00, 0xF2, 0xB6, 0xC3, 0xBA, /* 0x70-0x73 */ + 0xF2, 0xB7, 0xF2, 0xB0, 0xF2, 0xAF, 0xF2, 0xB3, /* 0x74-0x77 */ + 0xF2, 0xB1, 0xC3, 0xB6, 0xF2, 0xB5, 0xF4, 0xAC, /* 0x78-0x7B */ + 0xC4, 0x7E, 0xC4, 0x7D, 0xF4, 0xAD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xF4, 0xAF, 0xF4, 0xAE, 0xC4, 0xA1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xEB, 0xF5, 0xE8, /* 0x84-0x87 */ + 0xF5, 0xE9, 0x00, 0x00, 0xF5, 0xE7, 0xF5, 0xEA, /* 0x88-0x8B */ + 0xC4, 0xF2, 0xF5, 0xEC, 0x00, 0x00, 0xC4, 0xF1, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF7, 0x42, 0x00, 0x00, 0xC5, 0xD5, /* 0x90-0x93 */ + 0xC5, 0xD7, 0xF7, 0xEE, 0xC5, 0xD6, 0xF8, 0xB9, /* 0x94-0x97 */ + 0xF9, 0x40, 0xF9, 0x42, 0xF8, 0xFE, 0xF9, 0x41, /* 0x98-0x9B */ + 0xC6, 0x6C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xCE, 0x00, 0x00, /* 0x34-0x37 */ + 0xAC, 0xFB, 0xD2, 0x6F, 0xAF, 0xCA, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB2, 0xDA, 0xDA, 0xFC, 0xDA, 0xFD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDF, /* 0x40-0x43 */ + 0xC1, 0x6A, 0xED, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xC2, 0xBB, 0x00, 0x00, 0xF2, 0xBA, 0xF2, 0xB9, /* 0x48-0x4B */ + 0xC4, 0xA2, 0xF5, 0xED, 0x00, 0x00, 0xF7, 0x43, /* 0x4C-0x4F */ + 0xC5, 0xF8, 0xCA, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xAA, 0xC9, 0xA8, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD0, 0x4D, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x60, /* 0x58-0x5B */ + 0xD3, 0x5B, 0xD3, 0x5F, 0xD3, 0x5D, 0xAF, 0xCB, /* 0x5C-0x5F */ + 0xD3, 0x5E, 0xD3, 0x5C, 0x00, 0x00, 0xD6, 0xF1, /* 0x60-0x63 */ + 0x00, 0x00, 0xDA, 0xFE, 0xDB, 0x40, 0xDF, 0x69, /* 0x64-0x67 */ + 0xDF, 0x6A, 0xB8, 0x6E, 0xB8, 0x6F, 0xDF, 0x68, /* 0x68-0x6B */ + 0xDF, 0x6B, 0xDF, 0x67, 0xB8, 0x6D, 0x00, 0x00, /* 0x6C-0x6F */ + 0xBB, 0x40, 0x00, 0x00, 0xB8, 0x70, 0xE3, 0x7A, /* 0x70-0x73 */ + 0x00, 0x00, 0xBD, 0x7C, 0xE6, 0xF1, 0xBD, 0x7D, /* 0x74-0x77 */ + 0x00, 0x00, 0xBF, 0xA9, 0xEA, 0xE2, 0xEA, 0xE0, /* 0x78-0x7B */ + 0xEA, 0xE1, 0xED, 0xE4, 0xED, 0xE3, 0xED, 0xE2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBB, /* 0x80-0x83 */ + 0x00, 0x00, 0xC3, 0xB9, 0xF2, 0xBC, 0xF7, 0x44, /* 0x84-0x87 */ + 0xC5, 0xF9, 0xF8, 0xBA, 0xA6, 0xCF, 0xAA, 0xCB, /* 0x88-0x8B */ + 0xAA, 0xCA, 0xD0, 0x4F, 0xAC, 0xFC, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD0, 0x4E, 0xD3, 0x62, 0x00, 0x00, /* 0x90-0x93 */ + 0xAF, 0xCC, 0xD6, 0xF2, 0xD3, 0x61, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xDC, 0xD6, 0xF5, /* 0x98-0x9B */ + 0xD6, 0xF3, 0xD6, 0xF4, 0xB2, 0xDB, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDB, 0x42, 0xDB, 0x43, 0xDB, 0x41, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xB8, 0x73, 0xDF, 0x6D, 0xDF, 0x6C, 0xDF, 0x6E, /* 0xA4-0xA7 */ + 0xB8, 0x72, 0xB8, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE6, 0xF2, 0xE6, 0xF4, 0x00, 0x00, 0xBD, 0x7E, /* 0xAC-0xAF */ + 0xE6, 0xF3, 0xEA, 0xE3, 0xBF, 0xAA, 0xF0, 0x79, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF0, 0x78, 0xC3, 0xBB, 0xF2, 0xBD, /* 0xB4-0xB7 */ + 0xC3, 0xBD, 0xC3, 0xBC, 0xF4, 0xB0, 0xF5, 0xEE, /* 0xB8-0xBB */ + 0xC4, 0xF3, 0xA6, 0xD0, 0xD0, 0x50, 0xAC, 0xFD, /* 0xBC-0xBF */ + 0xD3, 0x65, 0xAF, 0xCE, 0xD3, 0x64, 0xD3, 0x63, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xAF, 0xCD, 0x00, 0x00, 0xD6, 0xFB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD6, 0xFD, 0xD6, 0xF6, 0xD6, 0xF7, /* 0xC8-0xCB */ + 0xB2, 0xDD, 0xD6, 0xF8, 0xB2, 0xDE, 0xD6, 0xFC, /* 0xCC-0xCF */ + 0xD6, 0xF9, 0xD6, 0xFA, 0xB2, 0xDF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB5, 0xBE, 0xB5, 0xBF, 0x00, 0x00, 0xDB, 0x44, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x6F, /* 0xD8-0xDB */ + 0xDF, 0x70, 0x00, 0x00, 0xE3, 0x7E, 0xBB, 0x43, /* 0xDC-0xDF */ + 0xBB, 0x41, 0xBB, 0x42, 0xE3, 0x7B, 0xE3, 0x7C, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE3, 0x7D, 0xE6, 0xF9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE6, 0xFA, 0xBD, 0xA1, 0xE6, 0xF7, 0xE6, 0xF6, /* 0xE8-0xEB */ + 0xE6, 0xF8, 0xE6, 0xF5, 0xBF, 0xAD, 0xEA, 0xE4, /* 0xEC-0xEF */ + 0xBF, 0xAB, 0xBF, 0xAC, 0xED, 0xE6, 0xC1, 0x6B, /* 0xF0-0xF3 */ + 0xED, 0xE5, 0xEF, 0xA8, 0x00, 0x00, 0xF0, 0x7A, /* 0xF4-0xF7 */ + 0xF0, 0x7B, 0xC2, 0xBC, 0x00, 0x00, 0xC2, 0xBD, /* 0xF8-0xFB */ + 0xC1, 0x6C, 0xF2, 0xBE, 0xF2, 0xBF, 0xF4, 0xB1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xC4, 0xA3, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0x00-0x03 */ + 0xAC, 0xFE, 0xAA, 0xCC, 0xAF, 0xCF, 0xD0, 0x51, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB5, 0xC0, /* 0x08-0x0B */ + 0xA6, 0xD3, 0xAD, 0x41, 0xD0, 0x52, 0xD0, 0x53, /* 0x0C-0x0F */ + 0xAD, 0x40, 0xAD, 0x42, 0xA6, 0xD4, 0x00, 0x00, /* 0x10-0x13 */ + 0xD0, 0x54, 0xAF, 0xD1, 0xD3, 0x66, 0xAF, 0xD3, /* 0x14-0x17 */ + 0xAF, 0xD0, 0xAF, 0xD2, 0x00, 0x00, 0xD7, 0x41, /* 0x18-0x1B */ + 0xB2, 0xE0, 0x00, 0x00, 0xD7, 0x40, 0xD6, 0xFE, /* 0x1C-0x1F */ + 0x00, 0x00, 0xDF, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE3, 0xA1, 0x00, 0x00, 0xBD, 0xA2, 0x00, 0x00, /* 0x24-0x27 */ + 0xBF, 0xAE, 0xEA, 0xE6, 0xEA, 0xE5, 0x00, 0x00, /* 0x28-0x2B */ + 0xED, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF5, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xD5, /* 0x30-0x33 */ + 0xCB, 0x73, 0xCD, 0xAA, 0xAD, 0x43, 0xD0, 0x55, /* 0x34-0x37 */ + 0x00, 0x00, 0xD3, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xAF, 0xD4, 0xD3, 0x67, 0xAF, 0xD5, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0x43, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xE2, 0xD7, 0x42, /* 0x44-0x47 */ + 0xD7, 0x44, 0x00, 0x00, 0xB2, 0xE1, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x46, /* 0x4C-0x4F */ + 0xDB, 0x47, 0xDB, 0x45, 0xB5, 0xC1, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0x74, 0x00, 0x00, /* 0x54-0x57 */ + 0xB8, 0x75, 0x00, 0x00, 0xBB, 0x45, 0x00, 0x00, /* 0x58-0x5B */ + 0xE3, 0xA3, 0xE3, 0xA2, 0xBB, 0x44, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFC, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xEA, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x70, /* 0x6C-0x6F */ + 0xC1, 0x6F, 0xC1, 0x6D, 0xC1, 0x6E, 0xC1, 0x71, /* 0x70-0x73 */ + 0x00, 0x00, 0xF0, 0x7C, 0xC2, 0xBF, 0xC2, 0xBE, /* 0x74-0x77 */ + 0xF2, 0xC0, 0xF4, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xC5, 0xA5, 0xC5, 0xA4, 0xA6, 0xD6, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xFB, 0x00, 0x00, /* 0x80-0x83 */ + 0xB8, 0x77, 0xB5, 0xC2, 0xB8, 0x76, 0xBB, 0x46, /* 0x84-0x87 */ + 0x00, 0x00, 0xA6, 0xD7, 0xC9, 0xA9, 0xA6, 0xD8, /* 0x88-0x8B */ + 0xA6, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xAB, /* 0x8C-0x8F */ + 0xCB, 0x76, 0x00, 0x00, 0xCB, 0x77, 0xA8, 0x77, /* 0x90-0x93 */ + 0x00, 0x00, 0xCB, 0x74, 0xA8, 0x76, 0x00, 0x00, /* 0x94-0x97 */ + 0xA8, 0x79, 0xCB, 0x75, 0xA8, 0x7B, 0xA8, 0x7A, /* 0x98-0x9B */ + 0xCB, 0x78, 0xA8, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xAA, 0xD1, 0xAA, 0xCF, 0xCD, 0xAD, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xAA, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xAA, 0xD3, 0xAA, 0xD5, 0xAA, 0xD2, /* 0xA8-0xAB */ + 0x00, 0x00, 0xCD, 0xB0, 0xCD, 0xAC, 0xAA, 0xD6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xAA, 0xD0, 0xA8, 0x7C, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xAA, 0xD4, 0xCD, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCD, 0xAE, 0x00, 0x00, 0xAA, 0xCD, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x5B, 0xAD, 0x47, /* 0xC0-0xC3 */ + 0xAD, 0x48, 0xD0, 0x5D, 0x00, 0x00, 0xD0, 0x57, /* 0xC4-0xC7 */ + 0xD0, 0x5A, 0xD0, 0x63, 0xD0, 0x61, 0x00, 0x00, /* 0xC8-0xCB */ + 0xAD, 0x49, 0xD0, 0x67, 0xAD, 0x4C, 0xD0, 0x64, /* 0xCC-0xCF */ + 0xD0, 0x5C, 0xD0, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDB, 0x49, 0xD0, 0x62, 0xAD, 0x44, 0xD0, 0x65, /* 0xD4-0xD7 */ + 0xD0, 0x56, 0xD0, 0x5F, 0xAD, 0x46, 0xAD, 0x4B, /* 0xD8-0xDB */ + 0xD0, 0x60, 0xAD, 0x4F, 0xAD, 0x4D, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD0, 0x58, 0xAD, 0x4A, 0x00, 0x00, 0xD0, 0x5E, /* 0xE0-0xE3 */ + 0xAD, 0x4E, 0xAD, 0x45, 0xD0, 0x66, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAF, 0xDA, 0x00, 0x00, 0xAF, 0xE3, /* 0xEC-0xEF */ + 0xAF, 0xD8, 0xAF, 0xD6, 0xD3, 0x6A, 0xAF, 0xDE, /* 0xF0-0xF3 */ + 0xAF, 0xDB, 0xD3, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xAF, 0xDD, 0xD3, 0x6B, 0xD3, 0x69, 0xD3, 0x6E, /* 0xF8-0xFB */ + 0xAF, 0xE2, 0xAF, 0xE0, 0xDB, 0x48, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0xD3, 0x6F, 0xD3, 0x6D, 0xAF, 0xD7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xAF, 0xD9, 0xAF, 0xDC, 0x00, 0x00, /* 0x04-0x07 */ + 0xAF, 0xDF, 0x00, 0x00, 0xAF, 0xE1, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xD7, 0x4E, 0xB2, 0xE4, 0x00, 0x00, /* 0x14-0x17 */ + 0xD7, 0x45, 0xD7, 0x47, 0x00, 0x00, 0xD7, 0x48, /* 0x18-0x1B */ + 0x00, 0x00, 0xD7, 0x50, 0xD7, 0x4C, 0xD7, 0x4A, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD7, 0x4D, 0xD7, 0x51, 0xB2, 0xE5, /* 0x20-0x23 */ + 0xB2, 0xE9, 0xD7, 0x46, 0x00, 0x00, 0xD7, 0x4F, /* 0x24-0x27 */ + 0x00, 0x00, 0xB2, 0xE7, 0x00, 0x00, 0xB2, 0xE6, /* 0x28-0x2B */ + 0xD7, 0x4B, 0xD7, 0x49, 0x00, 0x00, 0xB2, 0xE3, /* 0x2C-0x2F */ + 0xB2, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xB5, 0xC8, 0xDB, 0x51, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xDB, 0x4F, 0xB5, 0xCA, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x4A, /* 0x40-0x43 */ + 0xDF, 0xA1, 0x00, 0x00, 0xB5, 0xC9, 0xDB, 0x4E, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0x4B, 0xB5, 0xC5, /* 0x48-0x4B */ + 0xB5, 0xCB, 0xDB, 0x50, 0xB5, 0xC7, 0xDB, 0x4D, /* 0x4C-0x4F */ + 0xBB, 0x47, 0xB5, 0xC6, 0xDB, 0x4C, 0xB5, 0xCC, /* 0x50-0x53 */ + 0xB5, 0xC4, 0xB5, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x77, /* 0x58-0x5B */ + 0xDF, 0x75, 0x00, 0x00, 0xDF, 0x7B, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDF, 0x73, 0xDF, 0xA2, 0xDF, 0x78, 0x00, 0x00, /* 0x60-0x63 */ + 0xDF, 0x72, 0xB8, 0x7B, 0xB8, 0xA3, 0xDF, 0x7D, /* 0x64-0x67 */ + 0x00, 0x00, 0xDF, 0x76, 0x00, 0x00, 0xB8, 0x7E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0x7C, 0xDF, 0x7E, /* 0x6C-0x6F */ + 0xB8, 0x79, 0xB8, 0x78, 0xDF, 0x79, 0xB8, 0x7D, /* 0x70-0x73 */ + 0xB5, 0xCD, 0x00, 0x00, 0xDF, 0x7C, 0xDF, 0x74, /* 0x74-0x77 */ + 0xB8, 0x7A, 0xB8, 0xA1, 0xB8, 0xA2, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBB, 0x4C, /* 0x7C-0x7F */ + + 0xBB, 0x48, 0x00, 0x00, 0xBB, 0x4D, 0xE3, 0xA6, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA5, 0xE3, 0xA7, /* 0x84-0x87 */ + 0xBB, 0x4A, 0xE3, 0xA4, 0xBB, 0x4B, 0xE3, 0xAA, /* 0x88-0x8B */ + 0xE3, 0xA9, 0xE3, 0xA8, 0x00, 0x00, 0xBB, 0x49, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE7, 0x41, 0x00, 0x00, 0xE7, 0x44, /* 0x94-0x97 */ + 0xBD, 0xA8, 0xE7, 0x43, 0xBD, 0xA7, 0xBD, 0xA3, /* 0x98-0x9B */ + 0xBD, 0xA4, 0xBD, 0xA5, 0xE7, 0x40, 0xE6, 0xFE, /* 0x9C-0x9F */ + 0xBD, 0xA6, 0x00, 0x00, 0xE7, 0x42, 0xE6, 0xFD, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE9, 0xEA, 0xF3, /* 0xA4-0xA7 */ + 0xBF, 0xB1, 0xBF, 0xB0, 0x00, 0x00, 0xEA, 0xED, /* 0xA8-0xAB */ + 0xEA, 0xEF, 0x00, 0x00, 0xEA, 0xEA, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEA, 0xEE, 0xEA, 0xE8, 0xEA, 0xF1, 0xBF, 0xAF, /* 0xB0-0xB3 */ + 0xEA, 0xF0, 0xEA, 0xEC, 0x00, 0x00, 0xEA, 0xF2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEA, 0xEB, 0xC1, 0x74, 0xED, 0xE8, /* 0xB8-0xBB */ + 0xED, 0xEE, 0xC1, 0x78, 0xC1, 0x7A, 0xC1, 0x77, /* 0xBC-0xBF */ + 0xC1, 0x76, 0x00, 0x00, 0xC1, 0x75, 0xC1, 0x73, /* 0xC0-0xC3 */ + 0xED, 0xE9, 0xED, 0xEC, 0xC1, 0x72, 0xED, 0xED, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC1, 0x79, 0xED, 0xEB, 0x00, 0x00, /* 0xC8-0xCB */ + 0xED, 0xEA, 0xC2, 0xC0, 0x00, 0x00, 0xC2, 0xC1, /* 0xCC-0xCF */ + 0xF0, 0xA1, 0xF0, 0x7D, 0xF0, 0x7E, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF2, 0xC2, 0x00, 0x00, 0xF2, 0xC1, /* 0xD4-0xD7 */ + 0xC3, 0xBE, 0xF4, 0xB4, 0xC4, 0xA4, 0xF4, 0xB3, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF5, 0xF0, 0xF7, 0x45, 0xC5, 0xA6, /* 0xDC-0xDF */ + 0xF9, 0x43, 0xF9, 0x44, 0xC5, 0xD8, 0xA6, 0xDA, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0xD7, 0xDB, 0x52, 0xBB, 0x4E, /* 0xE4-0xE7 */ + 0xC1, 0x7B, 0xED, 0xEF, 0xA6, 0xDB, 0x00, 0x00, /* 0xE8-0xEB */ + 0xAF, 0xE5, 0xAF, 0xE4, 0xDB, 0x53, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xF4, 0xA6, 0xDC, /* 0xF0-0xF3 */ + 0xAD, 0x50, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x54, /* 0xF4-0xF7 */ + 0xDB, 0x55, 0xDB, 0x56, 0xBB, 0x4F, 0xBF, 0xB2, /* 0xF8-0xFB */ + 0xA6, 0xDD, 0x00, 0x00, 0xAA, 0xD8, 0xD0, 0x68, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0xAF, 0xE6, 0xD3, 0x70, 0xB2, 0xEA, 0x00, 0x00, /* 0x00-0x03 */ + 0xDB, 0x57, 0xB8, 0xA4, 0x00, 0x00, 0xBB, 0x50, /* 0x04-0x07 */ + 0xBF, 0xB3, 0xC1, 0x7C, 0xC2, 0xC2, 0xF4, 0xB5, /* 0x08-0x0B */ + 0xA6, 0xDE, 0xAA, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xAF, 0xE7, 0xD7, 0x52, 0xB5, 0xCE, 0x00, 0x00, /* 0x10-0x13 */ + 0xBB, 0x51, 0xE3, 0xAB, 0xE7, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xDF, /* 0x18-0x1B */ + 0xB5, 0xCF, 0xDF, 0xA3, 0xBB, 0x52, 0xA6, 0xE0, /* 0x1C-0x1F */ + 0xCD, 0xB1, 0xD0, 0x69, 0xAD, 0x51, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD3, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xAF, 0xEA, 0x00, 0x00, 0xAF, 0xE8, 0xAF, 0xE9, /* 0x28-0x2B */ + 0xAF, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x71, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0x57, 0xD7, 0x54, /* 0x30-0x33 */ + 0xD7, 0x56, 0xB2, 0xEB, 0xB2, 0xED, 0xB2, 0xEC, /* 0x34-0x37 */ + 0xD7, 0x53, 0xB2, 0xEE, 0xD7, 0x55, 0x00, 0x00, /* 0x38-0x3B */ + 0xDB, 0x58, 0xDB, 0x59, 0x00, 0x00, 0xDB, 0x5A, /* 0x3C-0x3F */ + 0xDF, 0xA6, 0x00, 0x00, 0xDF, 0xA7, 0x00, 0x00, /* 0x40-0x43 */ + 0xDF, 0xA5, 0xDF, 0xA8, 0x00, 0x00, 0xB8, 0xA5, /* 0x44-0x47 */ + 0x00, 0x00, 0xDF, 0xA4, 0x00, 0x00, 0xBB, 0x53, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4A, 0xE7, 0x46, /* 0x4C-0x4F */ + 0xE7, 0x49, 0xE7, 0x4B, 0xE7, 0x48, 0xE7, 0x47, /* 0x50-0x53 */ + 0x00, 0x00, 0xEA, 0xF5, 0xEA, 0xF6, 0xEA, 0xF7, /* 0x54-0x57 */ + 0xBF, 0xB4, 0xBF, 0xB5, 0xED, 0xF1, 0xED, 0xF0, /* 0x58-0x5B */ + 0xED, 0xF2, 0x00, 0x00, 0xF0, 0xA3, 0xF0, 0xA2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF2, 0xC4, 0x00, 0x00, 0xF2, 0xC5, /* 0x60-0x63 */ + 0xF2, 0xC3, 0x00, 0x00, 0xC4, 0xA5, 0x00, 0x00, /* 0x64-0x67 */ + 0xF4, 0xB6, 0xF4, 0xB7, 0x00, 0x00, 0xF7, 0x46, /* 0x68-0x6B */ + 0xF7, 0xEF, 0xF8, 0xBB, 0xA6, 0xE1, 0xA8, 0x7D, /* 0x6C-0x6F */ + 0x00, 0x00, 0xC1, 0x7D, 0xA6, 0xE2, 0x00, 0x00, /* 0x70-0x73 */ + 0xD7, 0x58, 0xDB, 0x5B, 0x00, 0x00, 0xC6, 0x41, /* 0x74-0x77 */ + 0xCA, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCA, 0x4B, 0xCA, 0x4D, 0xA6, 0xE3, 0xCA, 0x4E, /* 0x7C-0x7F */ + + 0xCA, 0x4C, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA2, /* 0x80-0x83 */ + 0xCB, 0xA3, 0xCB, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA1, 0xA8, 0xA1, /* 0x88-0x8B */ + 0x00, 0x00, 0xA8, 0xA2, 0xCB, 0x7C, 0xCB, 0x7A, /* 0x8C-0x8F */ + 0xCB, 0x79, 0xCB, 0x7D, 0xA8, 0x7E, 0xCB, 0x7E, /* 0x90-0x93 */ + 0xD0, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0xB6, 0xAA, 0xDC, 0xCD, 0xB5, 0xCD, 0xB7, /* 0x98-0x9B */ + 0x00, 0x00, 0xAA, 0xDB, 0xCD, 0xBC, 0xAA, 0xDF, /* 0x9C-0x9F */ + 0xCD, 0xB2, 0xCD, 0xC0, 0xCD, 0xC6, 0xAA, 0xE6, /* 0xA0-0xA3 */ + 0xCD, 0xC3, 0xAA, 0xE3, 0x00, 0x00, 0xCD, 0xB9, /* 0xA4-0xA7 */ + 0xCD, 0xBF, 0xCD, 0xC1, 0x00, 0x00, 0xCD, 0xB4, /* 0xA8-0xAB */ + 0xAA, 0xE2, 0xAA, 0xDD, 0xCD, 0xBA, 0xAA, 0xE4, /* 0xAC-0xAF */ + 0xAA, 0xE7, 0xAA, 0xE1, 0x00, 0x00, 0xAA, 0xDA, /* 0xB0-0xB3 */ + 0xCD, 0xBE, 0xCD, 0xB8, 0xCD, 0xC5, 0xAA, 0xE9, /* 0xB4-0xB7 */ + 0xAA, 0xE5, 0xAA, 0xE0, 0xCD, 0xBD, 0xAF, 0xEC, /* 0xB8-0xBB */ + 0xCD, 0xBB, 0xAA, 0xDE, 0xAA, 0xE8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCD, 0xB3, 0x00, 0x00, 0xCD, 0xC2, 0xCD, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAD, 0x62, 0xAD, 0x5C, 0xAD, 0x64, /* 0xD0-0xD3 */ + 0xAD, 0x61, 0xD0, 0x71, 0xD0, 0x74, 0xAD, 0x5D, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xD0, 0x6B, 0x00, 0x00, 0xAD, 0x56, /* 0xD8-0xDB */ + 0xAD, 0x60, 0x00, 0x00, 0xAD, 0x63, 0xAD, 0x65, /* 0xDC-0xDF */ + 0xD0, 0xA2, 0xD0, 0x77, 0x00, 0x00, 0xAD, 0x55, /* 0xE0-0xE3 */ + 0xD0, 0xA1, 0xAD, 0x59, 0xAD, 0x57, 0xAD, 0x52, /* 0xE4-0xE7 */ + 0xD0, 0x6F, 0x00, 0x00, 0xD0, 0x7E, 0xD0, 0x73, /* 0xE8-0xEB */ + 0xD0, 0x76, 0xD0, 0xA5, 0x00, 0x00, 0xAD, 0x66, /* 0xEC-0xEF */ + 0xD0, 0x7D, 0xAD, 0x5E, 0xD0, 0x78, 0xD0, 0xA4, /* 0xF0-0xF3 */ + 0xD0, 0x75, 0xD0, 0x79, 0xD0, 0x7C, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD0, 0x6D, 0xD0, 0xA3, 0xD0, 0x7B, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x6C, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0xD0, 0x70, 0xAD, 0x5F, 0xAD, 0x5A, 0xAD, 0x53, /* 0x00-0x03 */ + 0xAD, 0x58, 0xAD, 0x54, 0xAD, 0x67, 0xD0, 0x6E, /* 0x04-0x07 */ + 0xD3, 0xA5, 0xAD, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xD0, 0x7A, 0xCE, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA8, 0xAF, 0xFA, /* 0x14-0x17 */ + 0x00, 0x00, 0xD3, 0x76, 0x00, 0x00, 0xD3, 0xA3, /* 0x18-0x1B */ + 0xD3, 0x7D, 0x00, 0x00, 0xD3, 0xB2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD3, 0xAA, 0x00, 0x00, 0xD3, 0x7E, 0x00, 0x00, /* 0x20-0x23 */ + 0xD3, 0xA9, 0xD3, 0x78, 0xD3, 0x7C, 0xD3, 0xB5, /* 0x24-0x27 */ + 0xAF, 0xFD, 0xD3, 0xAD, 0xD3, 0xA4, 0xAF, 0xED, /* 0x28-0x2B */ + 0xD3, 0xB3, 0xD3, 0x74, 0x00, 0x00, 0xD3, 0xAC, /* 0x2C-0x2F */ + 0x00, 0x00, 0xAF, 0xFC, 0xAF, 0xF7, 0xD3, 0x73, /* 0x30-0x33 */ + 0xAF, 0xF5, 0xAF, 0xF4, 0xAF, 0xF9, 0xD3, 0xAB, /* 0x34-0x37 */ + 0xAF, 0xF1, 0xAF, 0xF8, 0xD0, 0x72, 0xDB, 0x5C, /* 0x38-0x3B */ + 0xD3, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x7A, /* 0x3C-0x3F */ + 0xAF, 0xFB, 0xD3, 0x7B, 0xD3, 0xA1, 0xAF, 0xFE, /* 0x40-0x43 */ + 0xD3, 0x75, 0xD3, 0xAF, 0x00, 0x00, 0xD3, 0xAE, /* 0x44-0x47 */ + 0xD3, 0xB6, 0xAF, 0xF3, 0xAF, 0xF0, 0xD3, 0xB4, /* 0x48-0x4B */ + 0xD3, 0xB0, 0xD3, 0xA7, 0xD3, 0xA2, 0xAF, 0xF6, /* 0x4C-0x4F */ + 0xAF, 0xF2, 0xD3, 0x77, 0xAF, 0xEE, 0xD3, 0xB1, /* 0x50-0x53 */ + 0xAF, 0xEF, 0x00, 0x00, 0xD3, 0x79, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0x5E, /* 0x70-0x73 */ + 0xD7, 0x60, 0xD7, 0x65, 0xD7, 0x79, 0xB2, 0xFC, /* 0x74-0x77 */ + 0xB2, 0xF2, 0x00, 0x00, 0xD7, 0x5D, 0xB2, 0xFD, /* 0x78-0x7B */ + 0xB2, 0xFE, 0xD7, 0x68, 0xD7, 0x6F, 0xD7, 0x75, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD7, 0x62, 0x00, 0x00, 0xD7, 0x69, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x40, 0xD7, 0x77, /* 0x84-0x87 */ + 0xD7, 0x72, 0xB2, 0xFA, 0xB2, 0xF8, 0xD7, 0x6E, /* 0x88-0x8B */ + 0xD7, 0x6A, 0xD7, 0x5C, 0xB2, 0xEF, 0xD7, 0x61, /* 0x8C-0x8F */ + 0xD7, 0x59, 0x00, 0x00, 0xB2, 0xF7, 0xB2, 0xF9, /* 0x90-0x93 */ + 0xD7, 0x66, 0xD7, 0x63, 0xB2, 0xF4, 0xD7, 0x73, /* 0x94-0x97 */ + 0xB2, 0xF1, 0xD7, 0x64, 0xD7, 0x7A, 0xD7, 0x6C, /* 0x98-0x9B */ + 0x00, 0x00, 0xD7, 0x6B, 0xB2, 0xF0, 0x00, 0x00, /* 0x9C-0x9F */ + 0xB2, 0xFB, 0x00, 0x00, 0xB2, 0xF3, 0xD7, 0x5A, /* 0xA0-0xA3 */ + 0xD7, 0x5F, 0xD7, 0x70, 0xD7, 0x76, 0xB3, 0x41, /* 0xA4-0xA7 */ + 0xD7, 0x5B, 0xD7, 0x67, 0xD7, 0x6D, 0xB2, 0xF6, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0x78, 0xD7, 0x71, /* 0xAC-0xAF */ + 0xD7, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xB2, 0xF5, 0x00, 0x00, 0xDB, 0x6C, /* 0xBC-0xBF */ + 0xDB, 0x60, 0xB5, 0xD7, 0xDB, 0x7D, 0xDB, 0xA7, /* 0xC0-0xC3 */ + 0xDB, 0xAA, 0xB5, 0xD5, 0xDB, 0x68, 0xDB, 0xA3, /* 0xC4-0xC7 */ + 0xDB, 0x69, 0xDB, 0x77, 0xB5, 0xE2, 0xDB, 0x73, /* 0xC8-0xCB */ + 0xB5, 0xDF, 0x00, 0x00, 0xDB, 0x74, 0xDB, 0x5D, /* 0xCC-0xCF */ + 0x00, 0x00, 0xDB, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB5, 0xE8, 0xDB, 0xA1, 0xDB, 0x75, 0xDB, 0xAC, /* 0xD4-0xD7 */ + 0xDB, 0x70, 0xDF, 0xC8, 0x00, 0x00, 0xDB, 0xAF, /* 0xD8-0xDB */ + 0xB5, 0xE6, 0xDB, 0x6E, 0xDB, 0x7A, 0xB5, 0xE9, /* 0xDC-0xDF */ + 0xB5, 0xD4, 0xDB, 0x72, 0xDB, 0xAD, 0xDB, 0x6B, /* 0xE0-0xE3 */ + 0xDB, 0x64, 0xDB, 0x6F, 0x00, 0x00, 0xDB, 0x63, /* 0xE4-0xE7 */ + 0xDB, 0x61, 0xB5, 0xD0, 0xDB, 0xA5, 0xDB, 0x6A, /* 0xE8-0xEB */ + 0xDB, 0xA8, 0x00, 0x00, 0xDB, 0xA9, 0xB5, 0xD8, /* 0xEC-0xEF */ + 0xB5, 0xDD, 0xB5, 0xD9, 0xB5, 0xE1, 0xDB, 0x7E, /* 0xF0-0xF3 */ + 0xB5, 0xDA, 0xDB, 0x76, 0xDB, 0x66, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xB5, 0xD2, 0xDB, 0x5E, 0xDB, 0xA2, 0xDB, 0xAB, /* 0xF8-0xFB */ + 0xDB, 0x65, 0xB5, 0xE0, 0xDB, 0xB0, 0xDB, 0x71, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0xDB, 0x6D, 0x00, 0x00, 0xB5, 0xD1, /* 0x00-0x03 */ + 0xB5, 0xE5, 0x00, 0x00, 0xDB, 0x7C, 0xB5, 0xE7, /* 0x04-0x07 */ + 0x00, 0x00, 0xDB, 0x78, 0xB5, 0xDC, 0xB5, 0xD6, /* 0x08-0x0B */ + 0xB5, 0xDE, 0xB5, 0xD3, 0xB5, 0xE4, 0xDB, 0x79, /* 0x0C-0x0F */ + 0xDB, 0x67, 0xDB, 0x7B, 0xDB, 0x62, 0xDB, 0xA6, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x5F, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xDF, 0xC7, 0x00, 0x00, 0xDF, 0xDD, /* 0x28-0x2B */ + 0xB8, 0x55, 0xDF, 0xCC, 0x00, 0x00, 0xDF, 0xCA, /* 0x2C-0x2F */ + 0xDF, 0xB5, 0xB8, 0xA9, 0xDF, 0xC5, 0xDF, 0xD9, /* 0x30-0x33 */ + 0xDF, 0xC1, 0xB8, 0xB1, 0xDF, 0xD8, 0xDF, 0xBF, /* 0x34-0x37 */ + 0xB5, 0xE3, 0xDF, 0xCF, 0xDF, 0xC0, 0xDF, 0xD6, /* 0x38-0x3B */ + 0xB8, 0xB0, 0xB8, 0xA8, 0x00, 0x00, 0xDF, 0xAA, /* 0x3C-0x3F */ + 0xDF, 0xB2, 0x00, 0x00, 0xDF, 0xCB, 0xDF, 0xC3, /* 0x40-0x43 */ + 0xDF, 0xDC, 0xDF, 0xC6, 0xB8, 0xB6, 0xDF, 0xD7, /* 0x44-0x47 */ + 0x00, 0x00, 0xB8, 0xAD, 0x00, 0x00, 0xDF, 0xC9, /* 0x48-0x4B */ + 0xDF, 0xD1, 0xDF, 0xB6, 0xDF, 0xD0, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDF, 0xE1, 0xDF, 0xB1, 0xDF, 0xD2, 0x00, 0x00, /* 0x50-0x53 */ + 0xDF, 0xDF, 0x00, 0x00, 0xDF, 0xAB, 0xB5, 0xDB, /* 0x54-0x57 */ + 0x00, 0x00, 0xDF, 0xB9, 0xDF, 0xB8, 0xB8, 0xAF, /* 0x58-0x5B */ + 0x00, 0x00, 0xDF, 0xBC, 0xDF, 0xBE, 0xDF, 0xCD, /* 0x5C-0x5F */ + 0xDF, 0xDE, 0xB8, 0xB2, 0x00, 0x00, 0xB8, 0xB3, /* 0x60-0x63 */ + 0x00, 0x00, 0xDF, 0xB0, 0xB8, 0xAB, 0xDF, 0xB4, /* 0x64-0x67 */ + 0xDF, 0xDA, 0xB8, 0xB4, 0x00, 0x00, 0xB8, 0xAC, /* 0x68-0x6B */ + 0xB8, 0xAE, 0xB8, 0xB5, 0xDF, 0xE0, 0xDF, 0xD3, /* 0x6C-0x6F */ + 0xDF, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xBB, /* 0x70-0x73 */ + 0xDF, 0xBA, 0xB8, 0xAA, 0xDF, 0xAC, 0xB8, 0xA7, /* 0x74-0x77 */ + 0xDF, 0xC4, 0xDF, 0xAD, 0xDF, 0xC2, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xDF, 0xB7, 0xDF, 0xDB, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xA6, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB3, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDF, 0xAF, 0xDF, 0xD5, 0xDF, 0xAE, /* 0x8C-0x8F */ + 0xBB, 0x60, 0xE3, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAC, /* 0x94-0x97 */ + 0xE3, 0xCA, 0xBB, 0x58, 0xE3, 0xBB, 0xE3, 0xC5, /* 0x98-0x9B */ + 0xBB, 0x5B, 0xE3, 0xBE, 0xBB, 0x59, 0xE3, 0xAF, /* 0x9C-0x9F */ + 0xE3, 0xCD, 0xE3, 0xAE, 0xE3, 0xC1, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBF, /* 0xA4-0xA7 */ + 0xE3, 0xC8, 0xE3, 0xC6, 0xE3, 0xBA, 0xE3, 0xB5, /* 0xA8-0xAB */ + 0xE3, 0xB3, 0x00, 0x00, 0xE3, 0xB4, 0xE3, 0xC7, /* 0xAC-0xAF */ + 0xE3, 0xD2, 0xE3, 0xBC, 0xBB, 0x5A, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE3, 0xB7, 0x00, 0x00, 0xE3, 0xCB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xBB, 0x5D, 0xE3, 0xB6, 0xE3, 0xB0, 0xE3, 0xC0, /* 0xB8-0xBB */ + 0xBB, 0x61, 0x00, 0x00, 0x00, 0x00, 0xBB, 0x55, /* 0xBC-0xBF */ + 0xBB, 0x5E, 0xE3, 0xB8, 0xE3, 0xB2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBB, 0x57, 0xDF, 0xD4, 0xBB, 0x56, 0xE3, 0xC3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xBB, 0x54, 0xBB, 0x63, 0xBB, 0x5C, /* 0xC8-0xCB */ + 0xE3, 0xC4, 0xE3, 0xB9, 0xE3, 0xB1, 0xE3, 0xCC, /* 0xCC-0xCF */ + 0xE3, 0xBD, 0xBB, 0x62, 0xE3, 0xD0, 0xBB, 0x5F, /* 0xD0-0xD3 */ + 0xE3, 0xCF, 0x00, 0x00, 0xE3, 0xC9, 0xE3, 0xCE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD1, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x73, /* 0xE4-0xE7 */ + 0xE7, 0x74, 0xE7, 0x67, 0xE7, 0x66, 0xE7, 0x62, /* 0xE8-0xEB */ + 0xBD, 0xB4, 0x00, 0x00, 0xBD, 0xAC, 0xE7, 0x76, /* 0xEC-0xEF */ + 0xE7, 0x75, 0xDF, 0xA9, 0xE7, 0x5F, 0xE7, 0x63, /* 0xF0-0xF3 */ + 0xE7, 0x5D, 0x00, 0x00, 0xE7, 0x70, 0xE7, 0x61, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE7, 0x77, 0xE7, 0x5A, 0xE7, 0x58, /* 0xF8-0xFB */ + 0xE7, 0x64, 0xE7, 0x6E, 0xE7, 0x69, 0xBD, 0xB6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0xE7, 0x4F, 0x00, 0x00, 0xE7, 0x6D, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xB7, 0xDF, 0xBD, /* 0x04-0x07 */ + 0xE7, 0x5B, 0xE7, 0x52, 0xE7, 0x55, 0xE7, 0x7B, /* 0x08-0x0B */ + 0xE7, 0x5C, 0xE7, 0x53, 0xE7, 0x51, 0xE7, 0x4E, /* 0x0C-0x0F */ + 0x00, 0x00, 0xBD, 0xB0, 0xE7, 0x65, 0xBD, 0xAF, /* 0x10-0x13 */ + 0xBD, 0xB3, 0xE7, 0x60, 0xE7, 0x68, 0xBD, 0xA9, /* 0x14-0x17 */ + 0xE7, 0x78, 0xE7, 0x7C, 0xBD, 0xAB, 0x00, 0x00, /* 0x18-0x1B */ + 0xE7, 0x57, 0xE7, 0x6B, 0xE7, 0x6F, 0xE7, 0x54, /* 0x1C-0x1F */ + 0xE7, 0x79, 0xBD, 0xB2, 0x00, 0x00, 0xBD, 0xB1, /* 0x20-0x23 */ + 0xE7, 0x4C, 0xBD, 0xB5, 0xE7, 0x72, 0xE7, 0x56, /* 0x24-0x27 */ + 0xE7, 0x6A, 0xE7, 0x50, 0xE7, 0x5E, 0xE7, 0x59, /* 0x28-0x2B */ + 0xBD, 0xAD, 0xBD, 0xAE, 0xE7, 0x6C, 0xE7, 0x7D, /* 0x2C-0x2F */ + 0xE7, 0x7A, 0xE7, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4D, /* 0x38-0x3B */ + 0x00, 0x00, 0xBD, 0xAA, 0xEB, 0x49, 0x00, 0x00, /* 0x3C-0x3F */ + 0xEB, 0x40, 0xEB, 0x43, 0x00, 0x00, 0xBF, 0xBB, /* 0x40-0x43 */ + 0xEB, 0x45, 0xEA, 0xF9, 0xEB, 0x41, 0xEB, 0x47, /* 0x44-0x47 */ + 0xBF, 0xB8, 0xBF, 0xBC, 0xBF, 0xB6, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEA, 0xFB, 0xEB, 0x4C, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEB, 0x46, 0x00, 0x00, 0xEA, 0xFC, /* 0x50-0x53 */ + 0xEB, 0x55, 0xEB, 0x4F, 0xEA, 0xF8, 0xEE, 0x46, /* 0x54-0x57 */ + 0xEA, 0xFE, 0xBF, 0xB7, 0x00, 0x00, 0xEB, 0x4A, /* 0x58-0x5B */ + 0x00, 0x00, 0xEB, 0x54, 0xBF, 0xBF, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEB, 0x51, 0xEA, 0xFD, 0xEB, 0x44, 0xEB, 0x48, /* 0x60-0x63 */ + 0xEB, 0x42, 0xEB, 0x56, 0xEB, 0x53, 0xEB, 0x50, /* 0x64-0x67 */ + 0xBF, 0xB9, 0xBF, 0xBA, 0xBF, 0xBE, 0xEA, 0xFA, /* 0x68-0x6B */ + 0xEB, 0x57, 0xBF, 0xBD, 0xEB, 0x4D, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xEB, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xEB, 0x4E, 0xEE, 0x53, 0xEE, 0x40, /* 0x74-0x77 */ + 0xEE, 0x45, 0xEE, 0x52, 0xEE, 0x44, 0xED, 0xFB, /* 0x78-0x7B */ + 0xEE, 0x41, 0x00, 0x00, 0xC1, 0xA2, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xED, 0xF4, 0xEE, 0x4D, 0xEE, 0x4F, 0xED, 0xF3, /* 0x80-0x83 */ + 0xC1, 0xA1, 0xEE, 0x51, 0xEE, 0x49, 0xC1, 0xA8, /* 0x84-0x87 */ + 0xEE, 0x50, 0xEE, 0x42, 0xC1, 0xAA, 0xED, 0xF9, /* 0x88-0x8B */ + 0xEB, 0x52, 0xEE, 0x4A, 0xEE, 0x47, 0xED, 0xF5, /* 0x8C-0x8F */ + 0xEE, 0x55, 0xC1, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xC1, 0xA5, 0xED, 0xF7, 0xEE, 0x48, 0x00, 0x00, /* 0x94-0x97 */ + 0xEE, 0x54, 0xEE, 0x4B, 0xED, 0xFD, 0xC1, 0xA7, /* 0x98-0x9B */ + 0xC1, 0xA3, 0xEE, 0x4C, 0xED, 0xFE, 0xEE, 0x56, /* 0x9C-0x9F */ + 0xED, 0xF8, 0xEE, 0x43, 0xEE, 0x4E, 0xED, 0xFA, /* 0xA0-0xA3 */ + 0xED, 0xFC, 0x00, 0x00, 0xC2, 0xCB, 0xED, 0xF6, /* 0xA4-0xA7 */ + 0xC1, 0xA9, 0xC2, 0xC4, 0xC1, 0x7E, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xA6, /* 0xAC-0xAF */ + 0xC2, 0xC8, 0xF0, 0xB3, 0x00, 0x00, 0xF0, 0xA9, /* 0xB0-0xB3 */ + 0xF0, 0xA4, 0xF0, 0xAA, 0xF0, 0xB4, 0xF0, 0xB8, /* 0xB4-0xB7 */ + 0xF0, 0xB7, 0xC2, 0xCA, 0xC2, 0xC9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xF0, 0xAB, 0xF0, 0xB9, 0xF0, 0xAE, /* 0xBC-0xBF */ + 0xF0, 0xA6, 0x00, 0x00, 0xF0, 0xA8, 0xF0, 0xA7, /* 0xC0-0xC3 */ + 0xF0, 0xAD, 0xF0, 0xB2, 0xF0, 0xA5, 0xF0, 0xAC, /* 0xC4-0xC7 */ + 0xF0, 0xB1, 0xC2, 0xC7, 0x00, 0x00, 0xF0, 0xAF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xC2, 0xC5, 0xF0, 0xB0, 0xC2, 0xC3, /* 0xCC-0xCF */ + 0xC2, 0xC6, 0xF2, 0xD5, 0xF0, 0xB5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xC3, 0xC2, 0x00, 0x00, 0xF2, 0xCD, /* 0xD4-0xD7 */ + 0xF2, 0xD1, 0xF2, 0xC9, 0xF2, 0xCC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF2, 0xD4, 0xC3, 0xC0, 0xF2, 0xD9, 0xF2, 0xD2, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF2, 0xCA, 0xF2, 0xDA, 0xF2, 0xD3, /* 0xE0-0xE3 */ + 0xC3, 0xC3, 0xC3, 0xC4, 0xF2, 0xD7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF2, 0xCB, 0xC3, 0xBF, 0xC3, 0xC1, 0xF2, 0xC6, /* 0xE8-0xEB */ + 0xF2, 0xCE, 0xF2, 0xC8, 0x00, 0x00, 0xF2, 0xD8, /* 0xEC-0xEF */ + 0xF2, 0xD6, 0xF2, 0xC7, 0xF2, 0xCF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xBE, 0xC3, 0xC5, /* 0xF4-0xF7 */ + 0xF2, 0xD0, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xA6, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF4, 0xC3, 0xF4, 0xBB, 0xF4, 0xB9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0xF4, 0xBD, 0xF4, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF4, 0xBF, 0xF4, 0xC1, 0xC4, 0xAA, 0xC4, 0xAC, /* 0x04-0x07 */ + 0x00, 0x00, 0xF4, 0xC0, 0xC4, 0xAD, 0xC4, 0xAB, /* 0x08-0x0B */ + 0xF4, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xC4, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xF4, /* 0x14-0x17 */ + 0xF5, 0xF1, 0xF5, 0xF7, 0xC4, 0xF6, 0xF4, 0xBC, /* 0x18-0x1B */ + 0xF5, 0xF6, 0x00, 0x00, 0xF5, 0xFD, 0xF5, 0xF4, /* 0x1C-0x1F */ + 0xF5, 0xFB, 0xF5, 0xFA, 0xF4, 0xB8, 0xF5, 0xF5, /* 0x20-0x23 */ + 0xF0, 0xB6, 0xF5, 0xFE, 0xF5, 0xF3, 0xF5, 0xF8, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0xFC, 0xF5, 0xF2, 0x00, 0x00, /* 0x28-0x2B */ + 0xF7, 0x4A, 0xC4, 0xF5, 0xF5, 0xF9, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF7, 0xF4, 0xF7, 0x4B, 0xF7, 0x49, /* 0x30-0x33 */ + 0xF7, 0x47, 0xF7, 0x48, 0xF7, 0x4C, 0x00, 0x00, /* 0x34-0x37 */ + 0xC5, 0xD9, 0xF7, 0xF2, 0xF7, 0xF0, 0xF7, 0xF5, /* 0x38-0x3B */ + 0xF7, 0xF3, 0x00, 0x00, 0xF7, 0xF6, 0xC5, 0xDA, /* 0x3C-0x3F */ + 0xF7, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xBC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0x45, 0xF9, 0x46, /* 0x44-0x47 */ + 0xF9, 0x47, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC7, /* 0x48-0x4B */ + 0xF9, 0xBD, 0xCA, 0x4F, 0xAA, 0xEA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xAD, 0x68, 0x00, 0x00, 0xD3, 0xB8, 0xD3, 0xB7, /* 0x50-0x53 */ + 0xB0, 0x40, 0xB3, 0x42, 0xD7, 0x7C, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD7, 0x7B, 0x00, 0x00, 0xB5, 0xEA, /* 0x58-0x5B */ + 0xB8, 0xB8, 0x00, 0x00, 0xB8, 0xB7, 0xB8, 0xB9, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE3, 0xD4, 0xE7, 0x7E, 0xEB, 0x58, /* 0x60-0x63 */ + 0xEB, 0x5A, 0xEB, 0x59, 0x00, 0x00, 0xC1, 0xAB, /* 0x64-0x67 */ + 0xEE, 0x57, 0xF0, 0xBA, 0xF9, 0xA5, 0xA6, 0xE4, /* 0x68-0x6B */ + 0x00, 0x00, 0xCD, 0xC9, 0xCD, 0xCA, 0xCD, 0xC8, /* 0x6C-0x6F */ + 0xCD, 0xC7, 0xAA, 0xEB, 0x00, 0x00, 0xD0, 0xA9, /* 0x70-0x73 */ + 0xD0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA6, /* 0x74-0x77 */ + 0x00, 0x00, 0xAD, 0x69, 0xAD, 0x6B, 0xAD, 0x6A, /* 0x78-0x7B */ + 0xD0, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xC4, 0xD3, 0xC1, 0xD3, 0xBF, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0x41, 0xD3, 0xC2, /* 0x88-0x8B */ + 0xB0, 0x46, 0xD3, 0xBC, 0xD3, 0xCB, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD3, 0xCD, 0xD3, 0xBD, 0x00, 0x00, 0xB0, 0x43, /* 0x90-0x93 */ + 0xD3, 0xCE, 0xD3, 0xC9, 0xD3, 0xBB, 0xD3, 0xC0, /* 0x94-0x97 */ + 0xD3, 0xCA, 0xD3, 0xC6, 0xD3, 0xC3, 0x00, 0x00, /* 0x98-0x9B */ + 0xB0, 0x48, 0xD3, 0xCC, 0xD3, 0xBE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD3, 0xC7, 0xD3, 0xB9, 0xB0, 0x47, /* 0xA0-0xA3 */ + 0xB0, 0x44, 0xD3, 0xC5, 0x00, 0x00, 0xD3, 0xC8, /* 0xA4-0xA7 */ + 0xD3, 0xBA, 0xB0, 0x45, 0xB0, 0x42, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x4C, /* 0xAC-0xAF */ + 0xD7, 0xA5, 0xB3, 0x4B, 0x00, 0x00, 0xD7, 0xA8, /* 0xB0-0xB3 */ + 0xD7, 0xAB, 0xB3, 0x48, 0xB3, 0x46, 0xD7, 0x7E, /* 0xB4-0xB7 */ + 0xD7, 0xA9, 0xD7, 0xA7, 0xD7, 0xA4, 0xD7, 0xAC, /* 0xB8-0xBB */ + 0xD7, 0xAD, 0xD7, 0xAF, 0xD7, 0xB0, 0xD7, 0x7D, /* 0xBC-0xBF */ + 0xB3, 0x45, 0xD7, 0xA2, 0xD7, 0xA1, 0xD7, 0xAE, /* 0xC0-0xC3 */ + 0xB3, 0x47, 0xD7, 0xA3, 0xB3, 0x49, 0xB3, 0x44, /* 0xC4-0xC7 */ + 0xD7, 0xA6, 0xB3, 0x4D, 0x00, 0x00, 0xB3, 0x4A, /* 0xC8-0xCB */ + 0xD7, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xB5, 0xF1, 0xDB, 0xBF, 0x00, 0x00, 0xDB, 0xB4, /* 0xD0-0xD3 */ + 0xB5, 0xEE, 0x00, 0x00, 0xDF, 0xE7, 0xDB, 0xBD, /* 0xD4-0xD7 */ + 0xDB, 0xB1, 0xB5, 0xEC, 0xDB, 0xB6, 0xB5, 0xEF, /* 0xD8-0xDB */ + 0xDB, 0xBA, 0xDB, 0xB8, 0xB5, 0xF2, 0xB5, 0xEB, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xB2, 0xDB, 0xB5, /* 0xE0-0xE3 */ + 0xB5, 0xF0, 0x00, 0x00, 0xDB, 0xB3, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDB, 0xBE, 0xDB, 0xBC, 0xDB, 0xB7, 0xDB, 0xB9, /* 0xE8-0xEB */ + 0xDB, 0xBB, 0xB5, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xDF, 0xE8, 0xDF, 0xEE, 0xDF, 0xE4, /* 0xF4-0xF7 */ + 0xDF, 0xEA, 0xB8, 0xBA, 0xDF, 0xE6, 0xB8, 0xC0, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xBF, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xB8, 0xBE, 0xDF, 0xED, 0xB8, 0xC1, 0xB8, 0xC2, /* 0x00-0x03 */ + 0xDF, 0xE3, 0xDF, 0xF0, 0xB8, 0xC3, 0xB8, 0xBD, /* 0x04-0x07 */ + 0xB8, 0xBC, 0xDF, 0xEC, 0xB8, 0xC4, 0xDF, 0xE2, /* 0x08-0x0B */ + 0xDF, 0xE5, 0xDF, 0xEF, 0xDF, 0xEB, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE3, 0xF4, 0xE3, 0xE9, 0xB8, 0xBB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xBB, 0x6A, 0xE3, 0xDD, 0xE3, 0xF2, 0xE3, 0xDE, /* 0x18-0x1B */ + 0xBB, 0x65, 0x00, 0x00, 0xE3, 0xDB, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE3, 0xE4, 0xE3, 0xDC, 0xBB, 0x67, 0xE3, 0xD6, /* 0x20-0x23 */ + 0xE3, 0xF1, 0xBB, 0x68, 0xE3, 0xEE, 0xE3, 0xEF, /* 0x24-0x27 */ + 0xE3, 0xD7, 0xBB, 0x6D, 0xE3, 0xE6, 0x00, 0x00, /* 0x28-0x2B */ + 0xE3, 0xE0, 0xE3, 0xE7, 0xE3, 0xDA, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE3, 0xF3, 0xE3, 0xEB, 0xE3, 0xE5, 0xE3, 0xD5, /* 0x30-0x33 */ + 0xBB, 0x69, 0xE3, 0xEC, 0x00, 0x00, 0xBB, 0x6C, /* 0x34-0x37 */ + 0xE3, 0xF0, 0x00, 0x00, 0xE3, 0xEA, 0xBB, 0x66, /* 0x38-0x3B */ + 0xE3, 0xE8, 0x00, 0x00, 0xE3, 0xE2, 0xBB, 0x64, /* 0x3C-0x3F */ + 0xE3, 0xD9, 0xE3, 0xE1, 0xE3, 0xED, 0xE3, 0xDF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE3, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xBD, 0xC1, 0xDF, 0xE9, 0xE7, 0xB2, 0xE7, 0xBB, /* 0x4C-0x4F */ + 0xE7, 0xB1, 0xE7, 0xAD, 0xE7, 0xAA, 0xBD, 0xC2, /* 0x50-0x53 */ + 0xE7, 0xA8, 0xBB, 0x6B, 0xE7, 0xA1, 0xBD, 0xC0, /* 0x54-0x57 */ + 0xE7, 0xA7, 0xBD, 0xBF, 0xE7, 0xAC, 0xE7, 0xA9, /* 0x58-0x5B */ + 0xE7, 0xB9, 0xE7, 0xB4, 0xE7, 0xAE, 0xE7, 0xB3, /* 0x5C-0x5F */ + 0xBD, 0xBB, 0xE7, 0xAB, 0xE7, 0xBE, 0xE7, 0xA2, /* 0x60-0x63 */ + 0xE7, 0xA3, 0xE7, 0xBA, 0xBD, 0xBC, 0xE7, 0xBF, /* 0x64-0x67 */ + 0xBD, 0xBE, 0xE7, 0xC0, 0xE7, 0xB0, 0xE3, 0xD8, /* 0x68-0x6B */ + 0xE7, 0xB6, 0xE7, 0xAF, 0xE7, 0xB8, 0xE7, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xA6, /* 0x70-0x73 */ + 0xBD, 0xB9, 0xE7, 0xBD, 0xBD, 0xBA, 0xE7, 0xA4, /* 0x74-0x77 */ + 0xBD, 0xBD, 0xEB, 0x64, 0xE7, 0xB7, 0xE7, 0xBC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEB, 0x61, 0xBD, 0xB8, 0xBF, 0xC0, /* 0x80-0x83 */ + 0xEB, 0x6B, 0xEB, 0x67, 0x00, 0x00, 0xEB, 0x65, /* 0x84-0x87 */ + 0xEB, 0x60, 0xEB, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xBF, 0xC4, 0x00, 0x00, 0xEB, 0x5C, /* 0x8C-0x8F */ + 0xEB, 0x68, 0xEB, 0x69, 0xEB, 0x5F, 0xEB, 0x5E, /* 0x90-0x93 */ + 0xEB, 0x6C, 0x00, 0x00, 0xEB, 0x62, 0xEB, 0x5D, /* 0x94-0x97 */ + 0xEB, 0x63, 0x00, 0x00, 0xEB, 0x6E, 0xEB, 0x5B, /* 0x98-0x9B */ + 0xEB, 0x6D, 0xEB, 0x6A, 0xBF, 0xC2, 0xBF, 0xC1, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xC3, 0xEB, 0x66, /* 0xA0-0xA3 */ + 0xF0, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x59, 0xC1, 0xB1, /* 0xA8-0xAB */ + 0xEE, 0x5D, 0xEE, 0x5A, 0xEE, 0x61, 0xEE, 0x67, /* 0xAC-0xAF */ + 0xEE, 0x5C, 0x00, 0x00, 0xEE, 0x70, 0xC1, 0xAE, /* 0xB0-0xB3 */ + 0xEE, 0x6A, 0xEE, 0x5F, 0xEE, 0x6B, 0xEE, 0x66, /* 0xB4-0xB7 */ + 0xEE, 0x6D, 0xEE, 0x5E, 0xC1, 0xB3, 0xC1, 0xB2, /* 0xB8-0xBB */ + 0xEE, 0x60, 0xEE, 0x6E, 0xEE, 0x58, 0xEE, 0x6C, /* 0xBC-0xBF */ + 0xC1, 0xAC, 0x00, 0x00, 0xEE, 0x64, 0xEE, 0x63, /* 0xC0-0xC3 */ + 0xEE, 0x68, 0xEE, 0x5B, 0xC1, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xC1, 0xB4, 0xEE, 0x62, 0xEE, 0x69, 0xC1, 0xB5, /* 0xC8-0xCB */ + 0xEE, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC1, 0xAD, 0xC1, 0xAF, 0xF0, 0xC7, /* 0xD0-0xD3 */ + 0xF0, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xCC, /* 0xD4-0xD7 */ + 0xF0, 0xC9, 0xF0, 0xCD, 0x00, 0x00, 0xF0, 0xBE, /* 0xD8-0xDB */ + 0xF0, 0xC6, 0xF0, 0xD1, 0xEE, 0x6F, 0xF0, 0xC2, /* 0xDC-0xDF */ + 0xC2, 0xCF, 0xE7, 0xA5, 0xF0, 0xBD, 0xF0, 0xCA, /* 0xE0-0xE3 */ + 0xF0, 0xC4, 0xF0, 0xC1, 0xF0, 0xBC, 0xF0, 0xBB, /* 0xE4-0xE7 */ + 0xF0, 0xD0, 0x00, 0x00, 0xF0, 0xC0, 0xF0, 0xBF, /* 0xE8-0xEB */ + 0xC2, 0xCD, 0xF0, 0xC8, 0x00, 0x00, 0xC2, 0xCC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0xCE, 0xF0, 0xC3, /* 0xF0-0xF3 */ + 0xF0, 0xCF, 0x00, 0x00, 0xF2, 0xDE, 0xF2, 0xDF, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xC3, 0xC9, 0xF2, 0xDC, 0xC3, 0xC6, /* 0xF8-0xFB */ + 0xF2, 0xE4, 0x00, 0x00, 0xC3, 0xCA, 0xF2, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0xF2, 0xDB, 0xF0, 0xCE, 0xF2, 0xE8, 0xF2, 0xDD, /* 0x00-0x03 */ + 0x00, 0x00, 0xC3, 0xC7, 0xF2, 0xE3, 0x00, 0x00, /* 0x04-0x07 */ + 0xF2, 0xE5, 0xF2, 0xE0, 0xF2, 0xE7, 0xF2, 0xE2, /* 0x08-0x0B */ + 0xF2, 0xE1, 0xC3, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF4, 0xC5, 0xF4, 0xC6, 0x00, 0x00, 0xF4, 0xC8, /* 0x10-0x13 */ + 0xC4, 0xAE, 0xC4, 0xAF, 0xF4, 0xC9, 0xF4, 0xC7, /* 0x14-0x17 */ + 0x00, 0x00, 0xF4, 0xC4, 0x00, 0x00, 0xF6, 0x42, /* 0x18-0x1B */ + 0xF6, 0x45, 0xF6, 0x41, 0x00, 0x00, 0xC4, 0xFA, /* 0x1C-0x1F */ + 0xF6, 0x43, 0xC4, 0xF9, 0xC4, 0xF8, 0xC4, 0xF7, /* 0x20-0x23 */ + 0xF6, 0x44, 0xF7, 0x51, 0xF7, 0x4F, 0x00, 0x00, /* 0x24-0x27 */ + 0xF7, 0x4E, 0xF6, 0x40, 0xF7, 0x50, 0xF6, 0x46, /* 0x28-0x2B */ + 0xF7, 0x4D, 0x00, 0x00, 0xF7, 0xF9, 0xF7, 0xD7, /* 0x2C-0x2F */ + 0xF7, 0xF7, 0xC5, 0xDB, 0xF7, 0xF8, 0xF7, 0xFA, /* 0x30-0x33 */ + 0x00, 0x00, 0xF8, 0xBF, 0xC5, 0xFA, 0xF8, 0xBE, /* 0x34-0x37 */ + 0xF8, 0xBD, 0xC5, 0xFB, 0x00, 0x00, 0xC6, 0x5A, /* 0x38-0x3B */ + 0xF9, 0x6E, 0xF9, 0xA7, 0xF9, 0xA6, 0xF9, 0xA8, /* 0x3C-0x3F */ + 0xA6, 0xE5, 0xD0, 0xAA, 0x00, 0x00, 0xD3, 0xCF, /* 0x40-0x43 */ + 0xD3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xDB, 0xC0, 0x00, 0x00, 0xF6, 0x47, 0xF8, 0xC0, /* 0x48-0x4B */ + 0xA6, 0xE6, 0xAD, 0x6C, 0xD0, 0xAB, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB1, 0xB3, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0xDB, 0xC2, 0xDB, 0xC1, 0xB5, 0xF3, /* 0x54-0x57 */ + 0x00, 0x00, 0xB8, 0xC5, 0xE7, 0xC1, 0xBD, 0xC3, /* 0x58-0x5B */ + 0x00, 0x00, 0xBD, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xBF, 0xC5, 0xC5, 0xFC, 0xA6, 0xE7, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAC, /* 0x64-0x67 */ + 0xAA, 0xED, 0xD0, 0xAE, 0xD0, 0xAD, 0xAD, 0x6D, /* 0x68-0x6B */ + 0x00, 0x00, 0xD3, 0xD1, 0x00, 0x00, 0xD3, 0xD8, /* 0x6C-0x6F */ + 0xB0, 0x49, 0xD3, 0xD6, 0xD3, 0xD4, 0x00, 0x00, /* 0x70-0x73 */ + 0xD3, 0xDB, 0xD3, 0xD2, 0xD3, 0xD3, 0xB0, 0x4A, /* 0x74-0x77 */ + 0x00, 0x00, 0xB0, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xD3, 0xDC, 0xB0, 0x4D, 0xD3, 0xDA, 0xD3, 0xD7, /* 0x7C-0x7F */ + + 0xD3, 0xD5, 0xB0, 0x4B, 0xB0, 0x4C, 0xD3, 0xD9, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xB3, 0x50, 0xD7, 0xB2, 0x00, 0x00, 0xB3, 0x55, /* 0x88-0x8B */ + 0xD7, 0xC2, 0xB3, 0x54, 0xD7, 0xC4, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD7, 0xB8, 0xB3, 0x52, 0xD7, 0xC3, /* 0x90-0x93 */ + 0x00, 0x00, 0xD7, 0xB3, 0xB3, 0x53, 0xD7, 0xBF, /* 0x94-0x97 */ + 0xD7, 0xBB, 0xD7, 0xBD, 0xD7, 0xB7, 0xD7, 0xBE, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x4F, 0xD7, 0xBA, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD7, 0xB9, 0xD7, 0xB5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD7, 0xC0, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBC, /* 0xA4-0xA7 */ + 0xD7, 0xB4, 0x00, 0x00, 0xD7, 0xB6, 0xB3, 0x51, /* 0xA8-0xAB */ + 0xD7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB5, 0xF6, 0xDB, 0xCD, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC9, 0xDB, 0xCB, /* 0xB4-0xB7 */ + 0xDB, 0xC6, 0xDB, 0xC5, 0xDB, 0xC3, 0x00, 0x00, /* 0xB8-0xBB */ + 0xDB, 0xCA, 0xDB, 0xCC, 0xDB, 0xC8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDB, 0xC7, 0xB5, 0xF4, 0xB5, 0xF5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDB, 0xCF, 0xB8, 0xCD, 0xDF, 0xF2, /* 0xC8-0xCB */ + 0xDF, 0xF8, 0xDF, 0xF3, 0xDF, 0xF4, 0xF9, 0xD8, /* 0xCC-0xCF */ + 0xDF, 0xF9, 0x00, 0x00, 0xB8, 0xCF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB8, 0xC7, 0xB8, 0xCE, 0xDF, 0xF1, 0xDB, 0xC4, /* 0xD4-0xD7 */ + 0xB8, 0xCA, 0xB8, 0xC8, 0xDF, 0xF7, 0xDF, 0xF6, /* 0xD8-0xDB */ + 0xB8, 0xC9, 0xB8, 0xCB, 0xDF, 0xF5, 0xB8, 0xC6, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB8, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF6, /* 0xE4-0xE7 */ + 0xBB, 0x74, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x42, /* 0xE8-0xEB */ + 0xE4, 0x41, 0x00, 0x00, 0xE3, 0xFB, 0xBB, 0x76, /* 0xEC-0xEF */ + 0xE4, 0x40, 0xE3, 0xF7, 0xE3, 0xF8, 0xBB, 0x6E, /* 0xF0-0xF3 */ + 0xBB, 0x70, 0x00, 0x00, 0xE3, 0xFD, 0xE3, 0xF5, /* 0xF4-0xF7 */ + 0xBB, 0x72, 0xBB, 0x71, 0xE3, 0xF9, 0xE3, 0xFE, /* 0xF8-0xFB */ + 0xE3, 0xFC, 0xBB, 0x73, 0xE3, 0xFA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0xDB, 0xCE, 0xBB, 0x6F, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0xC2, 0xE7, 0xC9, 0xBD, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0xE7, 0xCD, 0xBD, 0xCA, 0xE7, 0xC5, /* 0x08-0x0B */ + 0xE7, 0xC3, 0x00, 0x00, 0xE7, 0xCC, 0x00, 0x00, /* 0x0C-0x0F */ + 0xBD, 0xC5, 0xE7, 0xCB, 0xBD, 0xC7, 0xBD, 0xC8, /* 0x10-0x13 */ + 0xE7, 0xC4, 0xBD, 0xC9, 0xE7, 0xCA, 0xE7, 0xC6, /* 0x14-0x17 */ + 0xE7, 0xC7, 0xE7, 0xC8, 0xBB, 0x75, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0x70, 0xEB, 0x7C, /* 0x1C-0x1F */ + 0x00, 0x00, 0xBF, 0xCA, 0xEB, 0x77, 0xEB, 0x79, /* 0x20-0x23 */ + 0x00, 0x00, 0xBF, 0xC8, 0xEB, 0x71, 0xEB, 0x75, /* 0x24-0x27 */ + 0x00, 0x00, 0xEB, 0x78, 0xBF, 0xC6, 0xBF, 0xC9, /* 0x28-0x2B */ + 0xEB, 0x7B, 0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x7A, /* 0x2C-0x2F */ + 0xEB, 0x72, 0xEB, 0x76, 0xBF, 0xC7, 0xEE, 0x72, /* 0x30-0x33 */ + 0x00, 0x00, 0xEE, 0x71, 0xC1, 0xB7, 0xEE, 0x77, /* 0x34-0x37 */ + 0xC1, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xB6, /* 0x38-0x3B */ + 0xEE, 0x73, 0xC1, 0xBA, 0xEE, 0x74, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEE, 0x75, 0xEE, 0x78, 0x00, 0x00, /* 0x40-0x43 */ + 0xC1, 0xB8, 0x00, 0x00, 0xF0, 0xD6, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF0, 0xD9, 0x00, 0x00, 0xF0, 0xD3, /* 0x48-0x4B */ + 0xF0, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD4, /* 0x4C-0x4F */ + 0xF0, 0xD7, 0xF0, 0xD8, 0xEE, 0x76, 0xF0, 0xD2, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xCD, 0xF2, 0xEC, /* 0x54-0x57 */ + 0xF2, 0xEF, 0xF2, 0xF1, 0xF2, 0xEA, 0xF2, 0xEB, /* 0x58-0x5B */ + 0xF2, 0xEE, 0xF2, 0xF0, 0xC3, 0xCE, 0xC3, 0xCC, /* 0x5C-0x5F */ + 0xC3, 0xCB, 0xF2, 0xED, 0xF2, 0xE9, 0xF4, 0xCA, /* 0x60-0x63 */ + 0xC4, 0xB0, 0x00, 0x00, 0xF4, 0xCB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xF6, 0x49, 0xC4, 0xFB, 0xF6, 0x4B, /* 0x68-0x6B */ + 0xC4, 0xFC, 0xF6, 0x48, 0xF6, 0x4A, 0xC5, 0xA8, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF7, 0x52, 0xC5, 0xA7, 0xF7, 0xFD, /* 0x70-0x73 */ + 0xF7, 0xFC, 0x00, 0x00, 0xF7, 0xFB, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF9, 0x48, 0xF9, 0x49, 0xF9, 0x4B, /* 0x78-0x7B */ + 0xF9, 0x4A, 0x00, 0x00, 0xCA, 0x50, 0xA6, 0xE8, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xAD, 0x6E, 0xD7, 0xC5, 0xB5, 0xF7, /* 0x80-0x83 */ + 0x00, 0x00, 0xDF, 0xFA, 0xC2, 0xD0, 0x00, 0x00, /* 0x84-0x87 */ + 0xF2, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x57, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x56, /* 0x90-0x93 */ + 0x00, 0x00, 0xDB, 0xD0, 0xB5, 0xF8, 0xDB, 0xD2, /* 0x94-0x97 */ + 0xDB, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFB, /* 0x98-0x9B */ + 0xB8, 0xD0, 0xE4, 0x43, 0xE4, 0x46, 0xE4, 0x45, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0x44, 0xE7, 0xCE, 0xE7, 0xD0, /* 0xA0-0xA3 */ + 0xE7, 0xCF, 0x00, 0x00, 0xBF, 0xCC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xCB, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC1, 0xBB, 0xEE, 0x79, 0xEE, 0x7B, 0xEE, 0x7A, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0xD1, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF4, 0xF2, 0xF3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF4, 0xCC, 0xC4, 0xB1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xC4, 0xFD, 0xF7, 0x54, 0xF7, 0x53, /* 0xBC-0xBF */ + 0xC6, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA4, 0xD0, 0xAF, /* 0xD0-0xD3 */ + 0xAD, 0x6F, 0xD7, 0xC8, 0xD7, 0xC6, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xD7, 0xC7, 0xDB, 0xD4, 0xDB, 0xD5, /* 0xD8-0xDB */ + 0xE0, 0x43, 0xDB, 0xD3, 0x00, 0x00, 0xDF, 0xFC, /* 0xDC-0xDF */ + 0xE0, 0x41, 0xE0, 0x40, 0xE0, 0x42, 0xB8, 0xD1, /* 0xE0-0xE3 */ + 0xDF, 0xFE, 0xDF, 0xFD, 0xE0, 0x44, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE4, 0x49, 0xE4, 0x47, 0x00, 0x00, 0xE4, 0x48, /* 0xE8-0xEB */ + 0xE7, 0xD3, 0xE7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE7, 0xD2, 0xEB, 0x7D, 0xEE, 0x7C, 0xEE, 0x7D, /* 0xF0-0xF3 */ + 0xC2, 0xD2, 0x00, 0x00, 0xF2, 0xF5, 0xF4, 0xCD, /* 0xF4-0xF7 */ + 0xC4, 0xB2, 0x00, 0x00, 0xF6, 0x4C, 0xF7, 0x55, /* 0xF8-0xFB */ + 0xC5, 0xA9, 0x00, 0x00, 0xF7, 0xFE, 0xF9, 0x4C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8A[512] = { + 0xA8, 0xA5, 0x00, 0x00, 0xAD, 0x71, 0xAD, 0x72, /* 0x00-0x03 */ + 0xD0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB1, /* 0x04-0x07 */ + 0xAD, 0x70, 0x00, 0x00, 0xB0, 0x54, 0x00, 0x00, /* 0x08-0x0B */ + 0xB0, 0x52, 0x00, 0x00, 0xB0, 0x51, 0xB0, 0x58, /* 0x0C-0x0F */ + 0xB0, 0x50, 0xB0, 0x59, 0xD3, 0xDD, 0xB0, 0x56, /* 0x10-0x13 */ + 0x00, 0x00, 0xB0, 0x53, 0xB0, 0x57, 0xB0, 0x55, /* 0x14-0x17 */ + 0xB0, 0x4F, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x5F, /* 0x18-0x1B */ + 0x00, 0x00, 0xB3, 0x59, 0xD7, 0xCC, 0xB3, 0x5E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x60, 0xB3, 0x5A, /* 0x20-0x23 */ + 0x00, 0x00, 0xB3, 0x5B, 0x00, 0x00, 0xD7, 0xCA, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x58, 0x00, 0x00, /* 0x28-0x2B */ + 0xD7, 0xCB, 0xB3, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD7, 0xC9, 0xB3, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xB6, 0x44, 0x00, 0x00, 0xB6, 0x46, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xDB, 0xD8, 0xB6, 0x45, 0xB5, 0xF9, /* 0x38-0x3B */ + 0xB5, 0xFD, 0x00, 0x00, 0xB8, 0xE4, 0xE0, 0x49, /* 0x3C-0x3F */ + 0xDB, 0xDA, 0xB5, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDB, 0xDD, 0xDB, 0xDE, 0xB6, 0x43, 0x00, 0x00, /* 0x44-0x47 */ + 0xDB, 0xE0, 0x00, 0x00, 0xDB, 0xE2, 0x00, 0x00, /* 0x48-0x4B */ + 0xDB, 0xE3, 0xDB, 0xD7, 0xDB, 0xD6, 0xDB, 0xE4, /* 0x4C-0x4F */ + 0xB6, 0x42, 0xDB, 0xE1, 0xDB, 0xDF, 0x00, 0x00, /* 0x50-0x53 */ + 0xB6, 0x40, 0xB5, 0xFB, 0xB6, 0x47, 0xDB, 0xDB, /* 0x54-0x57 */ + 0xDB, 0xDC, 0xDB, 0xD9, 0x00, 0x00, 0xB6, 0x41, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0xFC, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB5, 0xFA, 0xE0, 0x48, 0xB8, 0xDF, 0xB8, 0xDA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xD5, 0x00, 0x00, /* 0x64-0x67 */ + 0xB8, 0xE5, 0xB8, 0xD6, 0x00, 0x00, 0xB8, 0xD2, /* 0x68-0x6B */ + 0xB8, 0xE1, 0xB8, 0xDE, 0xB8, 0xE0, 0x00, 0x00, /* 0x6C-0x6F */ + 0xB8, 0xD7, 0xB8, 0xDC, 0xB8, 0xD3, 0xB8, 0xD4, /* 0x70-0x73 */ + 0xE0, 0x50, 0xE0, 0x4D, 0xE0, 0x45, 0xE0, 0x4A, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0xE2, 0xE0, 0x51, 0xB8, 0xE3, /* 0x78-0x7B */ + 0xB8, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x47, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE0, 0x4F, 0xE0, 0x4B, 0xE0, 0x4E, /* 0x80-0x83 */ + 0xE0, 0x4C, 0xB8, 0xDD, 0xE0, 0x46, 0xB8, 0xD8, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x4C, /* 0x88-0x8B */ + 0xBB, 0x78, 0xBB, 0x7B, 0x00, 0x00, 0xE4, 0x4E, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBB, 0xA5, 0xE4, 0x4D, 0xBB, 0x7D, /* 0x90-0x93 */ + 0x00, 0x00, 0xBD, 0xCF, 0xE4, 0x4F, 0x00, 0x00, /* 0x94-0x97 */ + 0xBB, 0xA4, 0xE4, 0x4B, 0xBB, 0xA6, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0x79, 0x00, 0x00, /* 0x9C-0x9F */ + 0xB8, 0xDB, 0xBB, 0x7C, 0x00, 0x00, 0xBB, 0x7A, /* 0xA0-0xA3 */ + 0xBB, 0x7E, 0xBB, 0xA2, 0xBB, 0x77, 0xBB, 0xA7, /* 0xA4-0xA7 */ + 0xBB, 0xA3, 0x00, 0x00, 0xBB, 0xA1, 0xE4, 0x4A, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xBD, 0xD6, 0x00, 0x00, 0xBD, 0xD2, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xD9, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE7, 0xD6, 0xBD, 0xDA, 0xE7, 0xE2, 0xE7, 0xDB, /* 0xB8-0xBB */ + 0xBD, 0xCB, 0xE7, 0xE3, 0xE7, 0xDD, 0xBD, 0xD5, /* 0xBC-0xBF */ + 0xE7, 0xDE, 0x00, 0x00, 0xBD, 0xD4, 0xE7, 0xE1, /* 0xC0-0xC3 */ + 0xBD, 0xCE, 0xE7, 0xDF, 0xE7, 0xD5, 0xBD, 0xCD, /* 0xC4-0xC7 */ + 0xEB, 0xAA, 0xBD, 0xD3, 0x00, 0x00, 0xBD, 0xD0, /* 0xC8-0xCB */ + 0x00, 0x00, 0xBD, 0xD8, 0x00, 0x00, 0xE7, 0xD4, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE7, 0xD8, 0xBD, 0xCC, 0xE7, 0xD7, /* 0xD0-0xD3 */ + 0xE7, 0xD9, 0xE7, 0xDA, 0xBD, 0xD7, 0xE7, 0xDC, /* 0xD4-0xD7 */ + 0xE7, 0xE0, 0xE7, 0xE4, 0x00, 0x00, 0xBD, 0xDB, /* 0xD8-0xDB */ + 0xBF, 0xD2, 0xEB, 0xA5, 0xEB, 0xAB, 0xEB, 0xA8, /* 0xDC-0xDF */ + 0xEB, 0x7E, 0xEB, 0xAC, 0xEB, 0xA1, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xEB, 0xA7, 0x00, 0x00, 0xBF, 0xCD, 0xBF, 0xD3, /* 0xE4-0xE7 */ + 0xEB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xBF, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xBF, 0xD9, 0xBF, 0xD4, 0xEB, 0xAF, /* 0xEC-0xEF */ + 0xEB, 0xA9, 0xBF, 0xD0, 0xEB, 0xA2, 0xBF, 0xDA, /* 0xF0-0xF3 */ + 0xEB, 0xA3, 0xEB, 0xA4, 0xBF, 0xDB, 0xBF, 0xD8, /* 0xF4-0xF7 */ + 0xBD, 0xD1, 0x00, 0x00, 0xBF, 0xCE, 0xEB, 0xB0, /* 0xF8-0xFB */ + 0xBF, 0xDC, 0x00, 0x00, 0xBF, 0xD5, 0xEB, 0xAE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xBF, 0xD1, 0xBF, 0xD6, 0xBF, 0xD7, 0x00, 0x00, /* 0x00-0x03 */ + 0xC1, 0xC3, 0xEE, 0xA4, 0xEE, 0xAD, 0xEE, 0xAA, /* 0x04-0x07 */ + 0xEE, 0xAC, 0x00, 0x00, 0xC1, 0xC0, 0xEE, 0xA5, /* 0x08-0x0B */ + 0x00, 0x00, 0xEE, 0xAB, 0xC1, 0xBC, 0xEE, 0xA7, /* 0x0C-0x0F */ + 0xC1, 0xC4, 0xEE, 0xA3, 0xEE, 0xA8, 0xEE, 0xAF, /* 0x10-0x13 */ + 0xEB, 0xA6, 0xEE, 0xA9, 0xEE, 0xA2, 0xC1, 0xBD, /* 0x14-0x17 */ + 0xEE, 0xA1, 0xC1, 0xBE, 0xEE, 0xB0, 0xC1, 0xBF, /* 0x18-0x1B */ + 0xEE, 0xAE, 0xC1, 0xC2, 0xEE, 0x7E, 0x00, 0x00, /* 0x1C-0x1F */ + 0xC1, 0xC1, 0x00, 0x00, 0xEE, 0xA6, 0xF0, 0xDC, /* 0x20-0x23 */ + 0xF0, 0xEA, 0xF0, 0xE5, 0xF0, 0xE7, 0xF0, 0xDB, /* 0x24-0x27 */ + 0xC2, 0xD3, 0x00, 0x00, 0xF0, 0xDA, 0xC2, 0xD6, /* 0x28-0x2B */ + 0xC2, 0xD5, 0x00, 0x00, 0xF0, 0xE9, 0xF0, 0xE1, /* 0x2C-0x2F */ + 0xF0, 0xDE, 0xF0, 0xE4, 0x00, 0x00, 0xF0, 0xDD, /* 0x30-0x33 */ + 0x00, 0x00, 0xF0, 0xDF, 0xF0, 0xE8, 0xF0, 0xE6, /* 0x34-0x37 */ + 0x00, 0x00, 0xC2, 0xD4, 0xF0, 0xED, 0xF0, 0xEB, /* 0x38-0x3B */ + 0xF0, 0xE2, 0xF0, 0xEC, 0xF0, 0xE3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF2, 0xF9, 0xC3, 0xCF, 0xF3, 0x41, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xF6, 0x4F, 0xC3, 0xD6, 0xF0, 0xE0, /* 0x44-0x47 */ + 0xF2, 0xF7, 0xC3, 0xD2, 0xF2, 0xF8, 0xF2, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xD4, 0xC3, 0xD5, /* 0x4C-0x4F */ + 0xF2, 0xF6, 0xF3, 0x40, 0xF3, 0x42, 0xF2, 0xFA, /* 0x50-0x53 */ + 0xF2, 0xFC, 0xF2, 0xFE, 0xF2, 0xFB, 0xF3, 0x43, /* 0x54-0x57 */ + 0xC3, 0xD1, 0xC3, 0xD7, 0xC3, 0xD3, 0x00, 0x00, /* 0x58-0x5B */ + 0xC3, 0xD0, 0xF4, 0xD0, 0x00, 0x00, 0xC4, 0xB7, /* 0x5C-0x5F */ + 0xF4, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD2, /* 0x60-0x63 */ + 0x00, 0x00, 0xF4, 0xD3, 0xC4, 0xB5, 0xF4, 0xD4, /* 0x64-0x67 */ + 0xF4, 0xD1, 0x00, 0x00, 0xF4, 0xCF, 0xC4, 0xB8, /* 0x68-0x6B */ + 0xC4, 0xB4, 0xF4, 0xD5, 0x00, 0x00, 0xC4, 0xB6, /* 0x6C-0x6F */ + 0xC4, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xC4, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xC5, 0x40, /* 0x74-0x77 */ + 0xF6, 0x4E, 0xF6, 0x4D, 0xF6, 0x50, 0xF6, 0x51, /* 0x78-0x7B */ + 0x00, 0x00, 0xC5, 0x41, 0xF7, 0x56, 0xF7, 0x5B, /* 0x7C-0x7F */ + + 0xC5, 0xAA, 0x00, 0x00, 0xF7, 0x58, 0x00, 0x00, /* 0x80-0x83 */ + 0xF7, 0x57, 0xF7, 0x5A, 0xF7, 0x59, 0x00, 0x00, /* 0x84-0x87 */ + 0xF8, 0x43, 0x00, 0x00, 0xC5, 0xDC, 0xF8, 0x42, /* 0x88-0x8B */ + 0xF8, 0x40, 0x00, 0x00, 0xF8, 0x41, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xFE, 0xC5, 0xFD, /* 0x90-0x93 */ + 0xF8, 0xC1, 0xF8, 0xC2, 0xC6, 0x40, 0x00, 0x00, /* 0x94-0x97 */ + 0xF9, 0x4D, 0xF9, 0x4E, 0xC6, 0x67, 0x00, 0x00, /* 0x98-0x9B */ + 0xC6, 0x6D, 0x00, 0x00, 0xF9, 0xA9, 0xF9, 0xC8, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA6, /* 0x34-0x37 */ + 0x00, 0x00, 0xD7, 0xCD, 0x00, 0x00, 0xD7, 0xCE, /* 0x38-0x3B */ + 0xE0, 0x52, 0xE4, 0x50, 0xE7, 0xE5, 0xC1, 0xC6, /* 0x3C-0x3F */ + 0x00, 0x00, 0xC1, 0xC5, 0xF0, 0xEE, 0xF3, 0x44, /* 0x40-0x43 */ + 0x00, 0x00, 0xF8, 0x44, 0xA8, 0xA7, 0xD3, 0xDE, /* 0x44-0x47 */ + 0xB0, 0x5A, 0xB3, 0x61, 0xE0, 0x54, 0xE0, 0x53, /* 0x48-0x4B */ + 0xBD, 0xDC, 0xE7, 0xE6, 0xBD, 0xDD, 0xEE, 0xB1, /* 0x4C-0x4F */ + 0xC2, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xC6, 0x76, 0xA8, 0xA8, 0xCD, 0xCB, 0xD3, 0xDF, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x62, 0x00, 0x00, /* 0x58-0x5B */ + 0xD7, 0xCF, 0xD7, 0xD0, 0x00, 0x00, 0xDB, 0xE5, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB6, 0x48, 0xB8, 0xE6, 0x00, 0x00, /* 0x60-0x63 */ + 0xE0, 0x56, 0xE0, 0x55, 0xE0, 0x57, 0x00, 0x00, /* 0x64-0x67 */ + 0xE4, 0x51, 0xE4, 0x52, 0xBB, 0xA8, 0xBF, 0xDD, /* 0x68-0x6B */ + 0xBD, 0xDE, 0xBF, 0xDE, 0x00, 0x00, 0xEE, 0xB5, /* 0x6C-0x6F */ + 0xEE, 0xB2, 0xEE, 0xB4, 0xEE, 0xB3, 0xC1, 0xC7, /* 0x70-0x73 */ + 0x00, 0x00, 0xF0, 0xEF, 0xF3, 0x46, 0xF3, 0x45, /* 0x74-0x77 */ + 0xCB, 0xA4, 0xB0, 0x5C, 0xB0, 0x5B, 0xD3, 0xE0, /* 0x78-0x7B */ + 0x00, 0x00, 0xD7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDB, 0xE7, 0xDB, 0xE6, 0xB6, 0x49, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x58, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xB8, 0xE8, 0xB8, 0xE7, 0x00, 0x00, /* 0x88-0x8B */ + 0xBB, 0xAA, 0xBB, 0xA9, 0x00, 0x00, 0xE7, 0xE7, /* 0x8C-0x8F */ + 0xEB, 0xB3, 0xEB, 0xB1, 0xEB, 0xB2, 0xBF, 0xDF, /* 0x90-0x93 */ + 0xEE, 0xB7, 0xEE, 0xB6, 0x00, 0x00, 0xF0, 0xF2, /* 0x94-0x97 */ + 0xF0, 0xF1, 0xF0, 0xF0, 0xF3, 0x47, 0x00, 0x00, /* 0x98-0x9B */ + 0xF9, 0xAA, 0xA8, 0xA9, 0xAD, 0x73, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAD, 0x74, 0xB0, 0x5D, 0xB0, 0x5E, 0xD3, 0xE2, /* 0xA0-0xA3 */ + 0xD3, 0xE1, 0xD7, 0xD2, 0x00, 0x00, 0xB3, 0x68, /* 0xA4-0xA7 */ + 0xB3, 0x66, 0xB3, 0x63, 0xB3, 0x67, 0xB3, 0x65, /* 0xA8-0xAB */ + 0xB3, 0x64, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x4A, /* 0xAC-0xAF */ + 0xDB, 0xEA, 0x00, 0x00, 0xB8, 0xED, 0xB6, 0x4C, /* 0xB0-0xB3 */ + 0xB6, 0x51, 0xDB, 0xEC, 0xB6, 0x53, 0xB6, 0x52, /* 0xB4-0xB7 */ + 0xB6, 0x55, 0xDB, 0xEB, 0xDB, 0xE8, 0xB6, 0x4F, /* 0xB8-0xBB */ + 0xB6, 0x4B, 0xB6, 0x4D, 0xDB, 0xE9, 0xB6, 0x54, /* 0xBC-0xBF */ + 0xB6, 0x50, 0xB6, 0x4E, 0xB8, 0xEF, 0xB8, 0xEE, /* 0xC0-0xC3 */ + 0xB8, 0xEC, 0xB8, 0xF0, 0x00, 0x00, 0xB8, 0xEA, /* 0xC4-0xC7 */ + 0xB8, 0xEB, 0x00, 0x00, 0xB8, 0xE9, 0x00, 0x00, /* 0xC8-0xCB */ + 0xE0, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x54, /* 0xCC-0xCF */ + 0x00, 0x00, 0xBB, 0xAC, 0xBB, 0xAD, 0xBB, 0xAB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE4, 0x53, 0x00, 0x00, 0xE4, 0x55, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE7, 0xEA, 0xE7, 0xEC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xBD, 0xE7, 0xE7, 0xED, 0xBD, 0xE0, 0xE7, 0xE9, /* 0xDC-0xDF */ + 0xBD, 0xDF, 0xBD, 0xE9, 0xBD, 0xE5, 0xBD, 0xE6, /* 0xE0-0xE3 */ + 0xBD, 0xE2, 0xE7, 0xE8, 0xBD, 0xE1, 0xE7, 0xEE, /* 0xE4-0xE7 */ + 0xE7, 0xEB, 0x00, 0x00, 0xBD, 0xE8, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBD, 0xE3, 0xBD, 0xE4, 0xEB, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEB, 0xB7, 0xEB, 0xB6, 0x00, 0x00, 0xEB, 0xB8, /* 0xF0-0xF3 */ + 0xBF, 0xE0, 0xEB, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xC1, 0xCB, 0xEE, 0xB8, 0xC1, 0xC8, 0xC1, 0xCC, /* 0xF8-0xFB */ + 0xC1, 0xCA, 0xC1, 0xC9, 0xF0, 0xF3, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0xF0, 0xF6, 0x00, 0x00, 0xF0, 0xF5, 0x00, 0x00, /* 0x00-0x03 */ + 0xF0, 0xF4, 0xC2, 0xD8, 0xF3, 0x48, 0xF3, 0x49, /* 0x04-0x07 */ + 0xC3, 0xD8, 0xF3, 0x4A, 0xC3, 0xD9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xC4, 0xBA, 0x00, 0x00, 0xC4, 0xB9, /* 0x0C-0x0F */ + 0xF6, 0x52, 0x00, 0x00, 0x00, 0x00, 0xC5, 0x42, /* 0x10-0x13 */ + 0xF6, 0x53, 0xF7, 0x5C, 0xC5, 0xAB, 0xC5, 0xAC, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0x45, 0x00, 0x00, 0xC6, 0x42, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA8, 0xAA, 0x00, 0x00, 0xB3, 0x6A, 0xB3, 0x69, /* 0x64-0x67 */ + 0xE0, 0x5C, 0xE0, 0x5D, 0x00, 0x00, 0xBB, 0xAE, /* 0x68-0x6B */ + 0xEB, 0xB9, 0xBD, 0xEA, 0xEB, 0xBA, 0xEE, 0xB9, /* 0x6C-0x6F */ + 0xA8, 0xAB, 0x00, 0x00, 0xD0, 0xB2, 0xAD, 0x76, /* 0x70-0x73 */ + 0xAD, 0x75, 0x00, 0x00, 0xD3, 0xE3, 0xB0, 0x5F, /* 0x74-0x77 */ + 0xD3, 0xE4, 0xD7, 0xD5, 0x00, 0x00, 0xD7, 0xD4, /* 0x78-0x7B */ + 0x00, 0x00, 0xD7, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDB, 0xEE, 0xB6, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xDB, 0xED, 0xB6, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xDB, 0xEF, 0xB6, 0x56, 0x00, 0x00, /* 0x88-0x8B */ + 0xE0, 0x5F, 0xE0, 0x62, 0xE0, 0x60, 0xE0, 0x61, /* 0x8C-0x8F */ + 0xE0, 0x65, 0xE0, 0x5E, 0xE0, 0x66, 0xE0, 0x63, /* 0x90-0x93 */ + 0xE0, 0x64, 0xBB, 0xB0, 0xE4, 0x56, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xBB, 0xAF, 0x00, 0x00, 0xE7, 0xF2, /* 0x98-0x9B */ + 0xE7, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xBD, 0xEB, /* 0x9C-0x9F */ + 0xE7, 0xEF, 0xE7, 0xF1, 0x00, 0x00, 0xBD, 0xEC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEB, 0xBB, 0x00, 0x00, 0xEB, 0xBC, /* 0xA4-0xA7 */ + 0xC1, 0xCD, 0x00, 0x00, 0xF3, 0x4C, 0xF3, 0x4E, /* 0xA8-0xAB */ + 0xF3, 0x4B, 0xF3, 0x4D, 0xF4, 0xD6, 0xF6, 0x54, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0x6F, 0xA8, 0xAC, /* 0xB0-0xB3 */ + 0xAD, 0x77, 0xD3, 0xE5, 0xD3, 0xE7, 0xD3, 0xE6, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD7, 0xD8, 0xB3, 0x6C, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD7, 0xD6, 0x00, 0x00, 0xB3, 0x6B, 0xD7, 0xD9, /* 0xBC-0xBF */ + 0x00, 0x00, 0xD7, 0xDA, 0xD7, 0xD7, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDB, 0xFB, 0xB6, 0x60, 0xDB, 0xF3, /* 0xC4-0xC7 */ + 0xDB, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x5B, /* 0xC8-0xCB */ + 0xB6, 0x5E, 0xDB, 0xF2, 0xB6, 0x59, 0xDB, 0xF6, /* 0xCC-0xCF */ + 0xE0, 0x6C, 0xB6, 0x5D, 0x00, 0x00, 0xDB, 0xF1, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDB, 0xF7, 0xDB, 0xF4, 0xDB, 0xFA, /* 0xD4-0xD7 */ + 0xDB, 0xF0, 0xDB, 0xF8, 0xB6, 0x5C, 0xB6, 0x5F, /* 0xD8-0xDB */ + 0xDB, 0xF5, 0xB6, 0x5A, 0x00, 0x00, 0xB8, 0xF2, /* 0xDC-0xDF */ + 0xE0, 0x68, 0xB8, 0xF1, 0xE0, 0x6F, 0xE0, 0x6E, /* 0xE0-0xE3 */ + 0xB8, 0xF8, 0x00, 0x00, 0xB8, 0xF9, 0xE0, 0x70, /* 0xE4-0xE7 */ + 0xB8, 0xF3, 0xE0, 0x6D, 0xB8, 0xF7, 0xE0, 0x72, /* 0xE8-0xEB */ + 0xE0, 0x69, 0x00, 0x00, 0xE0, 0x6B, 0xB8, 0xF4, /* 0xEC-0xEF */ + 0xE0, 0x67, 0xE0, 0x6A, 0xE0, 0x71, 0xB8, 0xF5, /* 0xF0-0xF3 */ + 0xE0, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xF6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xBB, 0xB1, 0xE4, 0x5B, 0xE4, 0x61, 0xE4, 0x59, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0xE4, 0x62, 0x00, 0x00, 0xE4, 0x58, 0xE4, 0x5D, /* 0x00-0x03 */ + 0xE4, 0x63, 0xE4, 0x60, 0xE4, 0x5F, 0xE4, 0x5E, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0x57, 0xE4, 0x5C, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE4, 0x5A, 0x00, 0x00, 0xBD, 0xF1, /* 0x0C-0x0F */ + 0xBD, 0xEE, 0xE7, 0xFB, 0xE8, 0x41, 0xE8, 0x43, /* 0x10-0x13 */ + 0xE8, 0x40, 0xE7, 0xF8, 0xE7, 0xFA, 0xE8, 0x45, /* 0x14-0x17 */ + 0xE8, 0x42, 0xE7, 0xFC, 0xE8, 0x46, 0xE7, 0xF9, /* 0x18-0x1B */ + 0xE8, 0x44, 0xBD, 0xEF, 0xBD, 0xF5, 0xBD, 0xF3, /* 0x1C-0x1F */ + 0xE7, 0xF3, 0xBD, 0xF4, 0xBD, 0xF0, 0xE7, 0xF4, /* 0x20-0x23 */ + 0xE7, 0xF6, 0xE7, 0xF5, 0xE7, 0xFD, 0xE7, 0xFE, /* 0x24-0x27 */ + 0x00, 0x00, 0xBD, 0xF2, 0x00, 0x00, 0xBD, 0xED, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF7, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEB, 0xC6, 0xBF, 0xE2, 0x00, 0x00, 0xEB, 0xBD, /* 0x30-0x33 */ + 0xBF, 0xE3, 0xBF, 0xE6, 0xEB, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xBF, 0xBF, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEB, 0xC3, 0xEB, 0xC4, 0xEB, 0xBE, 0xEB, 0xC7, /* 0x3C-0x3F */ + 0xEB, 0xC0, 0xEB, 0xC5, 0xBF, 0xE4, 0x00, 0x00, /* 0x40-0x43 */ + 0xBF, 0xE1, 0xEB, 0xC1, 0x00, 0x00, 0xEE, 0xBF, /* 0x44-0x47 */ + 0xC1, 0xD0, 0xC1, 0xCE, 0xC1, 0xD1, 0xC1, 0xCF, /* 0x48-0x4B */ + 0xEE, 0xBE, 0xEE, 0xBB, 0xEE, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xEE, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xBC, /* 0x50-0x53 */ + 0xF1, 0x45, 0xC2, 0xDE, 0xF0, 0xFB, 0xF0, 0xFA, /* 0x54-0x57 */ + 0x00, 0x00, 0xC2, 0xD9, 0xF1, 0x41, 0xF1, 0x40, /* 0x58-0x5B */ + 0xF0, 0xF7, 0xF1, 0x43, 0xF0, 0xFC, 0xC2, 0xDD, /* 0x5C-0x5F */ + 0xF0, 0xF9, 0xF1, 0x42, 0xF0, 0xF8, 0xC2, 0xDA, /* 0x60-0x63 */ + 0xC2, 0xDC, 0xF0, 0xFD, 0xC2, 0xDB, 0xF0, 0xFE, /* 0x64-0x67 */ + 0x00, 0x00, 0xF1, 0x44, 0xF3, 0x52, 0x00, 0x00, /* 0x68-0x6B */ + 0xC3, 0xDE, 0xF3, 0x4F, 0x00, 0x00, 0xF3, 0x53, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xDB, 0xF3, 0x51, /* 0x70-0x73 */ + 0xC3, 0xE0, 0x00, 0x00, 0xC3, 0xDD, 0x00, 0x00, /* 0x74-0x77 */ + 0xF3, 0x50, 0x00, 0x00, 0xC3, 0xDF, 0xF3, 0x54, /* 0x78-0x7B */ + 0xC3, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xC4, 0xBC, 0xC4, 0xBE, 0x00, 0x00, /* 0x80-0x83 */ + 0xF4, 0xD9, 0xC4, 0xBD, 0xF4, 0xD7, 0xC3, 0xDC, /* 0x84-0x87 */ + 0xF4, 0xD8, 0xC4, 0xBB, 0xC5, 0x43, 0xC5, 0x45, /* 0x88-0x8B */ + 0xF6, 0x56, 0xC5, 0x44, 0xF6, 0x55, 0x00, 0x00, /* 0x8C-0x8F */ + 0xF7, 0x61, 0xC5, 0xAD, 0xF7, 0x60, 0xC5, 0xAE, /* 0x90-0x93 */ + 0xF7, 0x5E, 0xF7, 0x5D, 0xF7, 0x62, 0xF7, 0x63, /* 0x94-0x97 */ + 0xF8, 0x46, 0x00, 0x00, 0xF7, 0x5F, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF8, 0xC6, 0xF8, 0xC3, 0xF8, 0xC4, /* 0x9C-0x9F */ + 0xF8, 0xC5, 0xC6, 0x5C, 0x00, 0x00, 0xF9, 0x51, /* 0xA0-0xA3 */ + 0xF9, 0x50, 0xF9, 0x4F, 0xF9, 0x70, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF9, 0xBE, 0xF9, 0xAB, 0xC6, 0x6E, 0xA8, 0xAD, /* 0xA8-0xAB */ + 0xB0, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xFA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xF6, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEB, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC2, 0xDF, 0x00, 0x00, 0xF3, 0x55, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF9, 0xAC, 0xA8, 0xAE, 0xAA, 0xEE, /* 0xC8-0xCB */ + 0xAD, 0x79, 0xAD, 0x78, 0x00, 0x00, 0xB0, 0x63, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD3, 0xE8, 0xB0, 0x61, 0xD3, 0xE9, /* 0xD0-0xD3 */ + 0xB0, 0x62, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDF, /* 0xD4-0xD7 */ + 0xD7, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x6D, /* 0xD8-0xDB */ + 0xD7, 0xDE, 0xD7, 0xDD, 0xD7, 0xDC, 0xB3, 0x6E, /* 0xDC-0xDF */ + 0xD7, 0xE0, 0xD7, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xDC, 0x43, 0xDC, 0x41, 0xDC, 0x45, /* 0xE4-0xE7 */ + 0xDC, 0x46, 0xDC, 0x4C, 0x00, 0x00, 0xDC, 0x48, /* 0xE8-0xEB */ + 0xDC, 0x4A, 0x00, 0x00, 0xDC, 0x42, 0xDB, 0xFC, /* 0xEC-0xEF */ + 0x00, 0x00, 0xDC, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDC, 0x4B, 0xDC, 0x44, 0xDC, 0x47, 0xDB, 0xFD, /* 0xF4-0xF7 */ + 0xB6, 0x62, 0xDC, 0x40, 0xDB, 0xFE, 0xB6, 0x61, /* 0xF8-0xFB */ + 0xB6, 0x63, 0x00, 0x00, 0xB8, 0xFD, 0xE0, 0x75, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0xE0, 0x77, 0xE0, 0x76, 0xE0, 0x7B, 0xB8, 0xFB, /* 0x00-0x03 */ + 0x00, 0x00, 0xE0, 0x78, 0xE0, 0x74, 0xE0, 0x79, /* 0x04-0x07 */ + 0xE0, 0x7A, 0xB8, 0xFC, 0xB8, 0xFE, 0xE0, 0x7C, /* 0x08-0x0B */ + 0x00, 0x00, 0xE4, 0x67, 0xE4, 0x66, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0x64, 0xE4, 0x65, 0xBB, 0xB3, 0xBB, 0xB5, /* 0x10-0x13 */ + 0xBB, 0xB2, 0xBB, 0xB4, 0xE8, 0x4D, 0xE8, 0x4E, /* 0x14-0x17 */ + 0xE8, 0x49, 0x00, 0x00, 0xE8, 0x4A, 0xBD, 0xF8, /* 0x18-0x1B */ + 0xBD, 0xFD, 0xBD, 0xF7, 0xBD, 0xFE, 0xBD, 0xF9, /* 0x1C-0x1F */ + 0xE8, 0x4B, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x4C, /* 0x20-0x23 */ + 0xE8, 0x48, 0xBE, 0x40, 0xBD, 0xFB, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xBD, 0xFA, 0xBD, 0xFC, 0x00, 0x00, /* 0x28-0x2B */ + 0xE8, 0x47, 0x00, 0x00, 0xEB, 0xCA, 0xBF, 0xE8, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCC, 0xBF, 0xEA, /* 0x30-0x33 */ + 0xEB, 0xCF, 0xEB, 0xCB, 0xEB, 0xC9, 0xEB, 0xCE, /* 0x34-0x37 */ + 0xBF, 0xE9, 0xEB, 0xCD, 0x00, 0x00, 0xBF, 0xE7, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xC1, 0xD3, 0xC1, 0xD6, /* 0x3C-0x3F */ + 0xEE, 0xC1, 0x00, 0x00, 0xC1, 0xD4, 0xEE, 0xC0, /* 0x40-0x43 */ + 0xC1, 0xD2, 0xC1, 0xD5, 0xF1, 0x46, 0xF1, 0x47, /* 0x44-0x47 */ + 0xF1, 0x48, 0xC2, 0xE0, 0x00, 0x00, 0xF1, 0x49, /* 0x48-0x4B */ + 0x00, 0x00, 0xC2, 0xE1, 0xC3, 0xE2, 0xF3, 0x58, /* 0x4C-0x4F */ + 0xF3, 0x59, 0xF3, 0x57, 0xF3, 0x56, 0xF3, 0x5A, /* 0x50-0x53 */ + 0xC3, 0xE1, 0xF4, 0xDD, 0xF4, 0xDB, 0xF4, 0xDC, /* 0x54-0x57 */ + 0xF4, 0xDE, 0xF4, 0xDA, 0xF4, 0xDF, 0xF6, 0x58, /* 0x58-0x5B */ + 0x00, 0x00, 0xF6, 0x59, 0xF6, 0x57, 0xC5, 0x46, /* 0x5C-0x5F */ + 0xF7, 0x64, 0xC5, 0xAF, 0xF7, 0x65, 0xF8, 0x48, /* 0x60-0x63 */ + 0xF8, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAF, /* 0x98-0x9B */ + 0xB6, 0x64, 0x00, 0x00, 0x00, 0x00, 0xB9, 0x40, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBB, 0xB6, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xEC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBF, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xC3, 0xE3, 0xC4, 0x7C, 0xC5, 0x47, /* 0xAC-0xAF */ + 0xA8, 0xB0, 0xB0, 0x64, 0xB9, 0x41, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF3, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA8, 0xB4, 0xA8, 0xB3, 0xA8, 0xB2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCB, 0xA5, 0x00, 0x00, 0xCD, 0xCD, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xCF, 0xAA, 0xEF, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAA, 0xF1, 0xCD, 0xCC, 0xCD, 0xCE, /* 0xD0-0xD3 */ + 0xAA, 0xF0, 0xCD, 0xD1, 0xCD, 0xD0, 0xCD, 0xD2, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD0, 0xB6, 0xD0, 0xB4, 0xAD, 0x7C, 0xD0, 0xB3, /* 0xE0-0xE3 */ + 0xAD, 0xA3, 0xAD, 0x7E, 0xAD, 0x7B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAD, 0xA4, 0x00, 0x00, 0xAD, 0x7D, 0xAD, 0xA2, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xA1, 0xD0, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0xAD, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xB0, 0x6A, 0xD3, 0xEB, 0xD3, 0xF1, 0xB0, 0x67, /* 0xF4-0xF7 */ + 0xB0, 0x6E, 0x00, 0x00, 0xB0, 0x69, 0xD3, 0xEE, /* 0xF8-0xFB */ + 0xD3, 0xF0, 0xB0, 0x6C, 0xD3, 0xEA, 0xD3, 0xED, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xB0, 0x68, 0xB0, 0x65, 0xD3, 0xEC, 0xB0, 0x6B, /* 0x00-0x03 */ + 0xD3, 0xEF, 0xB0, 0x6D, 0xB0, 0x66, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xE3, /* 0x08-0x0B */ + 0xD7, 0xE6, 0xB3, 0x70, 0x00, 0x00, 0xB3, 0x7A, /* 0x0C-0x0F */ + 0xB3, 0x76, 0xD7, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xB3, 0x7E, 0xB3, 0x77, 0xB3, 0x7C, 0xB3, 0x72, /* 0x14-0x17 */ + 0x00, 0x00, 0xB3, 0x6F, 0xB3, 0x71, 0xB3, 0x7D, /* 0x18-0x1B */ + 0xD7, 0xE5, 0xB3, 0x75, 0xB3, 0x78, 0xB3, 0x74, /* 0x1C-0x1F */ + 0xB3, 0x79, 0xD7, 0xE7, 0xB3, 0x7B, 0xB3, 0x73, /* 0x20-0x23 */ + 0xD7, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xDC, 0x4D, 0xB6, 0x65, 0xDC, 0x4F, /* 0x2C-0x2F */ + 0x00, 0x00, 0xB6, 0x67, 0xB6, 0x69, 0x00, 0x00, /* 0x30-0x33 */ + 0xDC, 0x4E, 0xB6, 0x66, 0xB6, 0x6A, 0x00, 0x00, /* 0x34-0x37 */ + 0xB6, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB9, 0x47, 0xE0, 0xA3, 0xB9, 0x4F, 0xE0, 0x7E, /* 0x3C-0x3F */ + 0x00, 0x00, 0xB9, 0x50, 0xB9, 0x45, 0x00, 0x00, /* 0x40-0x43 */ + 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xB9, 0x4A, /* 0x44-0x47 */ + 0x00, 0x00, 0xE0, 0xA2, 0xB9, 0x43, 0xB9, 0x42, /* 0x48-0x4B */ + 0x00, 0x00, 0xB9, 0x4D, 0xB9, 0x4C, 0xB9, 0x4B, /* 0x4C-0x4F */ + 0xB9, 0x49, 0xB9, 0x4E, 0xE0, 0x7D, 0xB9, 0x44, /* 0x50-0x53 */ + 0xB9, 0x46, 0xB9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xBB, 0xB8, 0xBB, 0xBB, 0x00, 0x00, 0xBB, 0xBF, /* 0x58-0x5B */ + 0xBB, 0xB9, 0xBB, 0xBE, 0xBB, 0xBC, 0x00, 0x00, /* 0x5C-0x5F */ + 0xBB, 0xB7, 0x00, 0x00, 0xBB, 0xBD, 0xBB, 0xBA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x52, /* 0x64-0x67 */ + 0xBE, 0x43, 0xBE, 0x41, 0x00, 0x00, 0xE8, 0x53, /* 0x68-0x6B */ + 0x00, 0x00, 0xBE, 0x44, 0xBE, 0x42, 0xE8, 0x51, /* 0x6C-0x6F */ + 0xE8, 0x50, 0x00, 0x00, 0xBF, 0xF0, 0xE8, 0x4F, /* 0x70-0x73 */ + 0xBF, 0xEE, 0xBF, 0xED, 0xEB, 0xD0, 0xBE, 0x45, /* 0x74-0x77 */ + 0xBF, 0xEF, 0xEB, 0xD1, 0xBF, 0xF2, 0xEB, 0xD2, /* 0x78-0x7B */ + 0xBF, 0xF1, 0xC1, 0xD8, 0xEE, 0xC3, 0xC1, 0xD7, /* 0x7C-0x7F */ + + 0xC1, 0xDC, 0xC1, 0xDA, 0xC1, 0xDB, 0xC2, 0xE3, /* 0x80-0x83 */ + 0xC1, 0xD9, 0xEE, 0xC2, 0xEB, 0xD3, 0xC2, 0xE2, /* 0x84-0x87 */ + 0xC2, 0xE4, 0x00, 0x00, 0xC3, 0xE4, 0xC3, 0xE5, /* 0x88-0x8B */ + 0x00, 0x00, 0xF4, 0xE0, 0x00, 0x00, 0xC5, 0xDE, /* 0x8C-0x8F */ + 0xC5, 0xDD, 0xA8, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCA, 0x55, 0xB0, 0x6F, 0x00, 0x00, 0xCA, 0x52, /* 0x94-0x97 */ + 0xCA, 0x53, 0xCA, 0x51, 0x00, 0x00, 0xCA, 0x54, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xAA, 0xCB, 0xA7, /* 0x9C-0x9F */ + 0xCB, 0xAC, 0xCB, 0xA8, 0xA8, 0xB7, 0xA8, 0xBA, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xA9, 0xA8, 0xB9, 0xCB, 0xAB, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD5, /* 0xAC-0xAF */ + 0xCD, 0xD7, 0xAA, 0xF4, 0xCD, 0xD3, 0xCD, 0xD6, /* 0xB0-0xB3 */ + 0xCD, 0xD4, 0xAA, 0xF2, 0xAA, 0xF5, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAA, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD0, 0xB8, 0xD0, 0xBC, 0xD0, 0xB9, /* 0xBC-0xBF */ + 0x00, 0x00, 0xAD, 0xA7, 0x00, 0x00, 0xAD, 0xA8, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD0, 0xBB, 0x00, 0x00, 0xD0, 0xBD, /* 0xC4-0xC7 */ + 0xD0, 0xBF, 0x00, 0x00, 0xAD, 0xA5, 0xD0, 0xBE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xA6, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD7, 0xEE, 0xD0, 0xBA, 0xD3, 0xF2, 0xD3, 0xFB, /* 0xD4-0xD7 */ + 0xD3, 0xF9, 0xD3, 0xF4, 0xD3, 0xF5, 0xD3, 0xFA, /* 0xD8-0xDB */ + 0xD3, 0xFC, 0xB0, 0x71, 0x00, 0x00, 0xD3, 0xF7, /* 0xDC-0xDF */ + 0xD3, 0xF3, 0xB0, 0x70, 0xB0, 0x72, 0xD3, 0xF6, /* 0xE0-0xE3 */ + 0xD3, 0xFD, 0xD3, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB3, 0xA1, 0xD7, 0xF1, 0xD7, 0xE9, 0xD7, 0xEF, /* 0xE8-0xEB */ + 0xD7, 0xF0, 0xB3, 0xA2, 0x00, 0x00, 0xD7, 0xE8, /* 0xEC-0xEF */ + 0xD7, 0xEA, 0xD0, 0xB7, 0xD7, 0xEC, 0xD7, 0xED, /* 0xF0-0xF3 */ + 0xD7, 0xEB, 0xB6, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDC, 0x56, 0xEB, 0xD4, 0xDC, 0x57, /* 0xF8-0xFB */ + 0xDC, 0x54, 0xB3, 0xA3, 0xB6, 0x6E, 0xDC, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0xDC, 0x59, 0xDC, 0x58, 0xB6, 0x6B, 0xDC, 0x5C, /* 0x00-0x03 */ + 0xDC, 0x52, 0xDC, 0x5B, 0xDC, 0x50, 0xDC, 0x5A, /* 0x04-0x07 */ + 0xDC, 0x55, 0xB6, 0x6D, 0x00, 0x00, 0xE0, 0xAA, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xA5, 0xE0, 0xAB, 0xE0, 0xA6, /* 0x0C-0x0F */ + 0xE0, 0xA4, 0xE0, 0xA7, 0xB9, 0x51, 0x00, 0x00, /* 0x10-0x13 */ + 0xE0, 0xA9, 0x00, 0x00, 0xE0, 0xA8, 0xB9, 0x52, /* 0x14-0x17 */ + 0xBB, 0xC1, 0xBB, 0xC0, 0xE4, 0x6E, 0xE4, 0x71, /* 0x18-0x1B */ + 0xE4, 0x69, 0xE4, 0x6D, 0xBB, 0xC2, 0xE4, 0x6C, /* 0x1C-0x1F */ + 0xE4, 0x6A, 0xE4, 0x70, 0xE4, 0x6B, 0xE4, 0x68, /* 0x20-0x23 */ + 0xE4, 0x6F, 0x00, 0x00, 0xE8, 0x59, 0xBE, 0x48, /* 0x24-0x27 */ + 0xF1, 0x4A, 0xE8, 0x56, 0xE8, 0x57, 0xE8, 0x55, /* 0x28-0x2B */ + 0xDC, 0x51, 0xBE, 0x47, 0xE8, 0x5A, 0xE8, 0x54, /* 0x2C-0x2F */ + 0xBE, 0x46, 0xBE, 0x49, 0xE8, 0x58, 0xEB, 0xD5, /* 0x30-0x33 */ + 0xBF, 0xF3, 0xEB, 0xD6, 0xEB, 0xD7, 0x00, 0x00, /* 0x34-0x37 */ + 0xEE, 0xC4, 0xC1, 0xDD, 0xF1, 0x4B, 0xF1, 0x4C, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0x4D, 0xF3, 0x5D, /* 0x3C-0x3F */ + 0xF3, 0x5C, 0xF4, 0xE2, 0x00, 0x00, 0xF4, 0xE1, /* 0x40-0x43 */ + 0xF6, 0x5B, 0xF6, 0x5C, 0xF6, 0x5A, 0xF7, 0x66, /* 0x44-0x47 */ + 0xC5, 0xB0, 0xA8, 0xBB, 0xAD, 0xAA, 0xAD, 0xA9, /* 0x48-0x4B */ + 0xB0, 0x75, 0xB0, 0x74, 0xD4, 0x40, 0xD4, 0x41, /* 0x4C-0x4F */ + 0xD3, 0xFE, 0x00, 0x00, 0xB0, 0x73, 0xD7, 0xF5, /* 0x50-0x53 */ + 0x00, 0x00, 0xD7, 0xF6, 0xD7, 0xF2, 0xB3, 0xA4, /* 0x54-0x57 */ + 0xD7, 0xF3, 0x00, 0x00, 0xD7, 0xF4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0x5F, /* 0x5C-0x5F */ + 0xDC, 0x61, 0xDC, 0x5D, 0xDC, 0x60, 0xB6, 0x6F, /* 0x60-0x63 */ + 0xDC, 0x5E, 0xB6, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDD, 0x73, 0xB9, 0x55, 0xB9, 0x54, 0x00, 0x00, /* 0x68-0x6B */ + 0xB9, 0x53, 0x00, 0x00, 0xE0, 0xAC, 0xE0, 0xAD, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x73, 0xE4, 0x75, /* 0x70-0x73 */ + 0xBB, 0xC6, 0xBB, 0xC3, 0x00, 0x00, 0xBB, 0xC5, /* 0x74-0x77 */ + 0xBB, 0xC4, 0xE4, 0x74, 0xE4, 0x72, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE8, 0x61, 0xE8, 0x5E, 0xE8, 0x5F, 0xBE, 0x4D, /* 0x80-0x83 */ + 0xE8, 0x60, 0xE8, 0x5B, 0xE8, 0x5C, 0xBE, 0x4A, /* 0x84-0x87 */ + 0x00, 0x00, 0xBE, 0x4B, 0xE8, 0x5D, 0xBE, 0x4C, /* 0x88-0x8B */ + 0x00, 0x00, 0xEB, 0xDB, 0x00, 0x00, 0xEB, 0xDC, /* 0x8C-0x8F */ + 0xEB, 0xD9, 0xEB, 0xDA, 0xBF, 0xF4, 0xEB, 0xD8, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xEE, 0xC8, 0xEE, 0xC5, 0xEE, 0xC7, /* 0x98-0x9B */ + 0xC1, 0xE0, 0xEE, 0xCB, 0xC1, 0xDF, 0xEE, 0xC9, /* 0x9C-0x9F */ + 0xEE, 0xCC, 0xEE, 0xCA, 0xEE, 0xC6, 0xC1, 0xDE, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF1, 0x4F, 0x00, 0x00, 0xF1, 0x50, /* 0xA4-0xA7 */ + 0xF1, 0x4E, 0x00, 0x00, 0xF1, 0x52, 0xC2, 0xE5, /* 0xA8-0xAB */ + 0xC2, 0xE6, 0xF3, 0x5F, 0xC3, 0xE7, 0xF1, 0x51, /* 0xAC-0xAF */ + 0xF3, 0x5E, 0xC3, 0xE6, 0xF4, 0xE5, 0xF4, 0xE6, /* 0xB0-0xB3 */ + 0xC4, 0xBF, 0xF4, 0xE4, 0x00, 0x00, 0xF4, 0xE3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF6, 0x5D, 0xC5, 0x48, 0x00, 0x00, /* 0xB8-0xBB */ + 0xF8, 0x49, 0xF8, 0xC8, 0xF8, 0xC7, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC6, 0x43, 0xC6, 0x5D, 0xF8, 0xC9, 0xF9, 0x71, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xC6, 0x6F, 0xA8, 0xBC, 0xAA, 0xF6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xB9, 0x56, 0x00, 0x00, 0xC4, 0xC0, /* 0xC8-0xCB */ + 0xA8, 0xBD, 0xAD, 0xAB, 0xB3, 0xA5, 0xB6, 0x71, /* 0xCC-0xCF */ + 0xC2, 0xE7, 0xAA, 0xF7, 0x00, 0x00, 0xD0, 0xC1, /* 0xD0-0xD3 */ + 0xD0, 0xC0, 0xD4, 0x42, 0x00, 0x00, 0xB0, 0x78, /* 0xD4-0xD7 */ + 0xB0, 0x76, 0xB0, 0x7A, 0xD4, 0x44, 0x00, 0x00, /* 0xD8-0xDB */ + 0xB0, 0x79, 0xB0, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0x43, 0xB3, 0xA8, /* 0xE0-0xE3 */ + 0xD7, 0xFC, 0x00, 0x00, 0xB3, 0xA7, 0xB3, 0xA9, /* 0xE4-0xE7 */ + 0xD8, 0x42, 0xB3, 0xAB, 0xD7, 0xFE, 0xD8, 0x40, /* 0xE8-0xEB */ + 0xD7, 0xF7, 0xB3, 0xAA, 0xD8, 0x43, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD7, 0xF9, 0x00, 0x00, 0xD7, 0xFA, /* 0xF0-0xF3 */ + 0xD7, 0xF8, 0xB3, 0xA6, 0x00, 0x00, 0xD8, 0x41, /* 0xF4-0xF7 */ + 0xD7, 0xFB, 0xD7, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDC, 0x6D, 0x00, 0x00, 0xDC, 0x6C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0xDC, 0x6A, 0xDC, 0x62, 0xDC, 0x71, 0xDC, 0x65, /* 0x00-0x03 */ + 0xDC, 0x6F, 0xDC, 0x76, 0xDC, 0x6E, 0xB6, 0x79, /* 0x04-0x07 */ + 0x00, 0x00, 0xB6, 0x75, 0xDC, 0x63, 0x00, 0x00, /* 0x08-0x0B */ + 0xDC, 0x69, 0xB6, 0x77, 0x00, 0x00, 0xDC, 0x68, /* 0x0C-0x0F */ + 0xB6, 0x78, 0xB6, 0x7A, 0xDC, 0x6B, 0x00, 0x00, /* 0x10-0x13 */ + 0xB6, 0x72, 0xB6, 0x73, 0xDC, 0x77, 0xDC, 0x75, /* 0x14-0x17 */ + 0x00, 0x00, 0xDC, 0x74, 0xDC, 0x66, 0x00, 0x00, /* 0x18-0x1B */ + 0xDC, 0x72, 0x00, 0x00, 0xB6, 0x76, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x74, /* 0x20-0x23 */ + 0xDC, 0x73, 0xDC, 0x64, 0xDC, 0x67, 0xDC, 0x70, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE4, 0xBA, 0xE0, 0xB7, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0xB0, 0xE0, 0xC3, 0xE0, 0xCC, 0xE0, 0xB3, /* 0x30-0x33 */ + 0xB9, 0x61, 0x00, 0x00, 0xE0, 0xC0, 0xB9, 0x57, /* 0x34-0x37 */ + 0xB9, 0x59, 0xB9, 0x65, 0xE0, 0xB1, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB9, 0x5A, 0xB9, 0x5C, 0xB9, 0x66, /* 0x3C-0x3F */ + 0xB9, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xB9, 0x64, 0xE0, 0xB9, 0x00, 0x00, /* 0x44-0x47 */ + 0xE0, 0xAE, 0xB9, 0x62, 0xE0, 0xB8, 0xB9, 0x5E, /* 0x48-0x4B */ + 0xE0, 0xCA, 0xB9, 0x63, 0xE0, 0xC8, 0xE0, 0xBC, /* 0x4C-0x4F */ + 0xE0, 0xC6, 0xB9, 0x60, 0xE0, 0xAF, 0xE0, 0xC9, /* 0x50-0x53 */ + 0xE0, 0xC4, 0x00, 0x00, 0xE0, 0xCB, 0xB9, 0x58, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0x67, 0xB9, 0x5D, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE0, 0xBD, 0xE0, 0xC1, 0x00, 0x00, 0xE0, 0xC5, /* 0x60-0x63 */ + 0xB9, 0x5F, 0xE0, 0xB4, 0xE0, 0xB2, 0xE0, 0xBE, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xBB, 0xE0, 0xBA, 0x00, 0x00, 0xE0, 0xBF, /* 0x6C-0x6F */ + 0xE0, 0xC2, 0x00, 0x00, 0xE0, 0xC7, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x78, 0x00, 0x00, /* 0x74-0x77 */ + 0xBB, 0xC7, 0xE4, 0xA4, 0xE4, 0x7A, 0xBB, 0xCC, /* 0x78-0x7B */ + 0xBB, 0xD0, 0xE4, 0xAD, 0xE4, 0xB5, 0xE4, 0xA6, /* 0x7C-0x7F */ + + 0xBB, 0xC8, 0x00, 0x00, 0xE4, 0xAA, 0xE0, 0xB6, /* 0x80-0x83 */ + 0x00, 0x00, 0xBB, 0xC9, 0xE4, 0xB1, 0xE4, 0xB6, /* 0x84-0x87 */ + 0xE4, 0xAE, 0x00, 0x00, 0xE4, 0xB0, 0xE4, 0xB9, /* 0x88-0x8B */ + 0xE4, 0xB2, 0xE4, 0x7E, 0xE4, 0xA9, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBB, 0xD1, 0x00, 0x00, 0xBB, 0xCD, /* 0x90-0x93 */ + 0xE4, 0x7C, 0xE4, 0xAB, 0xBB, 0xCB, 0xE4, 0xA5, /* 0x94-0x97 */ + 0xBB, 0xCA, 0xE4, 0xB3, 0xE4, 0xA2, 0xE4, 0x79, /* 0x98-0x9B */ + 0xBB, 0xCE, 0xE4, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE4, 0x7B, 0xE4, 0xAF, 0xE4, 0xAC, 0xE4, 0xA7, /* 0xA0-0xA3 */ + 0xE4, 0x77, 0xE4, 0x76, 0xE4, 0xA1, 0xE4, 0xB4, /* 0xA4-0xA7 */ + 0xBB, 0xCF, 0xE4, 0xB7, 0xE4, 0x7D, 0xE4, 0xA3, /* 0xA8-0xAB */ + 0xBE, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0x5A, 0xBE, 0x55, /* 0xB0-0xB3 */ + 0xE8, 0xA4, 0xE8, 0xA1, 0xE8, 0x67, 0xBE, 0x50, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF9, 0xD7, 0x00, 0x00, 0xBE, 0x4F, /* 0xB8-0xBB */ + 0xBE, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE8, 0x65, 0xBE, 0x54, 0xE8, 0x71, 0xE8, 0x63, /* 0xC0-0xC3 */ + 0xE8, 0x64, 0xBE, 0x4E, 0xE8, 0xA3, 0xBE, 0x58, /* 0xC4-0xC7 */ + 0xE8, 0x74, 0xE8, 0x79, 0xE8, 0x73, 0xEB, 0xEE, /* 0xC8-0xCB */ + 0xE8, 0x6F, 0xE8, 0x77, 0xE8, 0x75, 0xE8, 0x68, /* 0xCC-0xCF */ + 0xE8, 0x62, 0xE8, 0x7D, 0xBE, 0x57, 0xE8, 0x7E, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE8, 0x78, 0x00, 0x00, 0xE8, 0x6D, /* 0xD4-0xD7 */ + 0xE8, 0x6B, 0xE8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE8, 0x6E, 0xE8, 0x7B, 0xE8, 0x6A, /* 0xDC-0xDF */ + 0xE8, 0x7A, 0xE8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xBE, 0x53, 0x00, 0x00, 0xE8, 0x76, 0xE8, 0x7C, /* 0xE4-0xE7 */ + 0xE8, 0x72, 0xE8, 0x6C, 0xBE, 0x51, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA8, 0xE8, 0x70, /* 0xEC-0xEF */ + 0xBE, 0x59, 0xE8, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xF4, /* 0xF4-0xF7 */ + 0xBF, 0xF7, 0xEB, 0xF3, 0xEB, 0xF0, 0xEC, 0x44, /* 0xF8-0xFB */ + 0xBF, 0xFB, 0x00, 0x00, 0xEC, 0x41, 0xEB, 0xF8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0xEC, 0x43, 0xEB, 0xE9, 0xEB, 0xF6, 0x00, 0x00, /* 0x00-0x03 */ + 0xBF, 0xFD, 0x00, 0x00, 0xEB, 0xE1, 0x00, 0x00, /* 0x04-0x07 */ + 0xEB, 0xDF, 0xEC, 0x42, 0x00, 0x00, 0xEC, 0x40, /* 0x08-0x0B */ + 0xEB, 0xFE, 0xEB, 0xED, 0xEB, 0xEC, 0xEB, 0xE2, /* 0x0C-0x0F */ + 0xC0, 0x40, 0x00, 0x00, 0xEB, 0xE8, 0xEB, 0xF2, /* 0x10-0x13 */ + 0xEB, 0xFD, 0xC0, 0x43, 0xEC, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0xC1, 0xE8, 0xC0, 0x45, 0xBF, 0xFE, 0xEB, 0xE6, /* 0x18-0x1B */ + 0x00, 0x00, 0xEB, 0xEF, 0xEB, 0xDE, 0xEB, 0xE0, /* 0x1C-0x1F */ + 0xBF, 0xF5, 0xC0, 0x42, 0xBF, 0xFA, 0xEB, 0xE7, /* 0x20-0x23 */ + 0xEB, 0xF7, 0xEB, 0xF1, 0xC0, 0x41, 0xEB, 0xDD, /* 0x24-0x27 */ + 0xC1, 0xE3, 0xEB, 0xF9, 0xEB, 0xFC, 0xBF, 0xFC, /* 0x28-0x2B */ + 0x00, 0x00, 0xEB, 0xEB, 0xC0, 0x44, 0xBF, 0xF9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBF, 0xF8, /* 0x30-0x33 */ + 0xEB, 0xF5, 0xEB, 0xFB, 0xBF, 0xF6, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xE4, 0xEB, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEB, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xEA, 0xEE, 0xD2, /* 0x44-0x47 */ + 0x00, 0x00, 0xEE, 0xD7, 0xC1, 0xE5, 0xC1, 0xE7, /* 0x48-0x4B */ + 0xEE, 0xDD, 0xC1, 0xE1, 0xEE, 0xEC, 0xEE, 0xE3, /* 0x4C-0x4F */ + 0xEE, 0xD8, 0xEE, 0xD9, 0xEE, 0xE2, 0x00, 0x00, /* 0x50-0x53 */ + 0xC1, 0xEE, 0xEE, 0xE1, 0xEE, 0xD1, 0xEE, 0xE0, /* 0x54-0x57 */ + 0xEE, 0xD4, 0xEE, 0xED, 0xC1, 0xED, 0xC1, 0xEB, /* 0x58-0x5B */ + 0xEE, 0xD5, 0x00, 0x00, 0xEE, 0xE8, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEE, 0xDA, 0xEE, 0xE7, 0x00, 0x00, 0xEE, 0xE9, /* 0x60-0x63 */ + 0xEE, 0xD0, 0xC1, 0xE6, 0x00, 0x00, 0xEE, 0xEA, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDE, 0x00, 0x00, /* 0x68-0x6B */ + 0xC1, 0xEA, 0xEE, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xC1, 0xEC, 0xEE, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xC1, 0xE4, 0xEE, 0xD6, 0xEE, 0xE5, /* 0x74-0x77 */ + 0x00, 0x00, 0xEE, 0xDF, 0xEB, 0xE3, 0xEE, 0xE6, /* 0x78-0x7B */ + 0xEE, 0xD3, 0x00, 0x00, 0xC1, 0xE9, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEE, 0xEB, 0x00, 0x00, 0xC1, 0xE2, 0xEE, 0xCE, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xF1, 0x60, 0xF1, 0x59, 0xC2, 0xE9, 0x00, 0x00, /* 0x88-0x8B */ + 0xF1, 0x54, 0xF1, 0x63, 0xF1, 0x5B, 0xEE, 0xDC, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF1, 0x65, 0xF1, 0x55, 0x00, 0x00, /* 0x90-0x93 */ + 0xC2, 0xE8, 0xF1, 0x5F, 0xC2, 0xEA, 0xC2, 0xF2, /* 0x94-0x97 */ + 0xC2, 0xF0, 0xF1, 0x61, 0xC2, 0xF1, 0xF1, 0x57, /* 0x98-0x9B */ + 0x00, 0x00, 0xF1, 0x58, 0xF1, 0x5D, 0xF1, 0x62, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEE, 0xCD, 0xC2, 0xEB, 0xF1, 0x6A, /* 0xA0-0xA3 */ + 0xF1, 0x67, 0xF1, 0x6B, 0xF1, 0x5E, 0xF1, 0x5A, /* 0xA4-0xA7 */ + 0xF1, 0x68, 0xF3, 0x6A, 0xF1, 0x5C, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC2, 0xEE, 0x00, 0x00, 0xC2, 0xED, 0xEE, 0xCF, /* 0xAC-0xAF */ + 0xC2, 0xEF, 0xF1, 0x64, 0xF1, 0x66, 0xC2, 0xEC, /* 0xB0-0xB3 */ + 0xF1, 0x69, 0xF1, 0x53, 0x00, 0x00, 0xF1, 0x56, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xF3, 0x73, 0x00, 0x00, 0xF3, 0x63, 0xC3, 0xEB, /* 0xC0-0xC3 */ + 0xF3, 0x71, 0x00, 0x00, 0x00, 0x00, 0xF3, 0x61, /* 0xC4-0xC7 */ + 0xC3, 0xEC, 0x00, 0x00, 0xF3, 0x6C, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF3, 0x68, 0xC3, 0xF1, 0xF3, 0x72, 0xF3, 0x62, /* 0xCC-0xCF */ + 0xF3, 0x65, 0xC3, 0xE9, 0xF3, 0x74, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF3, 0x6D, 0xF3, 0x70, 0xC3, 0xEF, 0xC3, 0xF4, /* 0xD4-0xD7 */ + 0xC3, 0xF2, 0xF3, 0x69, 0xF3, 0x64, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC3, 0xED, 0xC3, 0xEE, 0xF3, 0x60, 0xC3, 0xEA, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC3, 0xE8, 0xC3, 0xF0, 0xF3, 0x6F, /* 0xE0-0xE3 */ + 0xC3, 0xF3, 0x00, 0x00, 0xF3, 0x6B, 0xF3, 0x75, /* 0xE4-0xE7 */ + 0xC3, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF3, 0x67, 0x00, 0x00, 0xF3, 0x6E, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF4, 0xF3, 0xF5, 0x42, 0xF4, 0xF5, /* 0xF4-0xF7 */ + 0xF4, 0xFC, 0xF3, 0x66, 0xF4, 0xFA, 0xF4, 0xE9, /* 0xF8-0xFB */ + 0xF5, 0x40, 0xC4, 0xC3, 0xF4, 0xED, 0xF4, 0xFE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_94[512] = { + 0xF4, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xC2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x44, 0xF4, 0xF6, /* 0x04-0x07 */ + 0x00, 0x00, 0xF4, 0xFB, 0xF4, 0xFD, 0xF4, 0xE7, /* 0x08-0x0B */ + 0xF5, 0x41, 0xF4, 0xF2, 0xF4, 0xF7, 0xF4, 0xEB, /* 0x0C-0x0F */ + 0xF4, 0xEF, 0xF5, 0x43, 0xF4, 0xF9, 0xF4, 0xE8, /* 0x10-0x13 */ + 0xF4, 0xEC, 0xF4, 0xEE, 0xF4, 0xF8, 0x00, 0x00, /* 0x14-0x17 */ + 0xC4, 0xC1, 0xF4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF4, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF4, 0xF0, 0xF6, 0x61, 0xF6, 0x66, 0xC5, 0x4F, /* 0x28-0x2B */ + 0xF6, 0x68, 0x00, 0x00, 0xC5, 0x49, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF6, 0x64, 0xF6, 0x6A, 0xC5, 0x4E, 0xC5, 0x4A, /* 0x30-0x33 */ + 0x00, 0x00, 0xC5, 0x4B, 0xF6, 0x60, 0xF6, 0x67, /* 0x34-0x37 */ + 0xC5, 0x4D, 0xF6, 0x65, 0xC5, 0x4C, 0xF6, 0x5F, /* 0x38-0x3B */ + 0xF6, 0x63, 0xF6, 0x62, 0x00, 0x00, 0xF6, 0x5E, /* 0x3C-0x3F */ + 0xF6, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xC5, 0xB1, 0xF7, 0x6D, 0xF7, 0x70, 0xF7, 0x6C, /* 0x44-0x47 */ + 0xF7, 0x6E, 0xF7, 0x6F, 0xF7, 0x69, 0xF7, 0x6A, /* 0x48-0x4B */ + 0xF7, 0x67, 0x00, 0x00, 0x00, 0x00, 0xF7, 0x6B, /* 0x4C-0x4F */ + 0xF7, 0x68, 0xC5, 0xB2, 0xC5, 0xB3, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF8, 0x4B, 0x00, 0x00, 0xF8, 0x4D, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF8, 0x4C, 0xF8, 0x4E, 0x00, 0x00, /* 0x5C-0x5F */ + 0xC5, 0xE0, 0x00, 0x00, 0xF8, 0x4A, 0xC5, 0xDF, /* 0x60-0x63 */ + 0xC5, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xCB, 0xF8, 0xCC, 0xC6, 0x44, 0xF8, 0xCA, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0x53, 0xF9, 0x52, 0xF9, 0x54, /* 0x6C-0x6F */ + 0xC6, 0x5F, 0xF9, 0x55, 0xC6, 0x5E, 0xF9, 0x56, /* 0x70-0x73 */ + 0xF9, 0x72, 0xF9, 0x75, 0xF9, 0x74, 0xC6, 0x68, /* 0x74-0x77 */ + 0xF9, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xC6, 0x72, 0xC6, 0x70, 0xC6, 0x71, 0xC6, 0x77, /* 0x7C-0x7F */ + + 0xF9, 0xC0, 0xF9, 0xC1, 0xF9, 0xBF, 0xF9, 0xC9, /* 0x80-0x83 */ +}; + +static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xF8, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0x44, 0xDC, 0x78, /* 0x78-0x7B */ + 0xE8, 0xA5, 0xF3, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xAA, 0xF9, 0x00, 0x00, 0xAD, 0xAC, 0xB0, 0x7B, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0x45, 0x00, 0x00, /* 0x84-0x87 */ + 0xD8, 0x46, 0xB3, 0xAC, 0x00, 0x00, 0xB6, 0x7D, /* 0x88-0x8B */ + 0xDC, 0x7A, 0xDC, 0x79, 0xB6, 0xA3, 0xB6, 0x7C, /* 0x8C-0x8F */ + 0xDC, 0x7B, 0xB6, 0x7E, 0xB6, 0xA2, 0xB6, 0xA1, /* 0x90-0x93 */ + 0xB6, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xB9, 0x68, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, /* 0x98-0x9B */ + 0xE0, 0xCE, 0x00, 0x00, 0xE0, 0xCF, 0xE0, 0xCD, /* 0x9C-0x9F */ + 0x00, 0x00, 0xBB, 0xD2, 0x00, 0x00, 0xBB, 0xD5, /* 0xA0-0xA3 */ + 0xBB, 0xD7, 0xBB, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBB, 0xD3, 0xBB, 0xD4, 0x00, 0x00, 0xE8, 0xA7, /* 0xA8-0xAB */ + 0xE8, 0xA6, 0xBE, 0x5B, 0xE8, 0xA8, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE8, 0xA9, 0xBE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xEC, 0x4D, 0xEC, 0x4B, 0xEE, 0xF3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEC, 0x49, 0xEC, 0x4A, 0xC0, 0x46, /* 0xB8-0xBB */ + 0xEC, 0x46, 0xEC, 0x4E, 0xEC, 0x48, 0xEC, 0x4C, /* 0xBC-0xBF */ + 0xEE, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xF1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEE, 0xF2, 0xC1, 0xF3, 0xEE, 0xEE, /* 0xC4-0xC7 */ + 0xC1, 0xF2, 0xEE, 0xF0, 0xC1, 0xEF, 0xC1, 0xF0, /* 0xC8-0xCB */ + 0xC1, 0xF1, 0xEC, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xC2, 0xF5, 0xF1, 0x6E, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xD0-0xD3 */ + 0xC2, 0xF3, 0xC2, 0xF6, 0xC2, 0xF4, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0x77, 0xF3, 0x78, /* 0xD8-0xDB */ + 0xC3, 0xF6, 0x00, 0x00, 0xF5, 0x45, 0xF5, 0x47, /* 0xDC-0xDF */ + 0xF5, 0x46, 0xC4, 0xC4, 0xC5, 0x50, 0xF6, 0x6D, /* 0xE0-0xE3 */ + 0xF6, 0x6C, 0xF6, 0x6B, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xAA, 0xFA, 0x00, 0x00, 0xC9, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0xCA, 0x58, 0xA6, 0xE9, 0xCA, 0x56, 0xCA, 0x59, /* 0x20-0x23 */ + 0xCA, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCB, 0xAE, 0x00, 0x00, 0xA8, 0xC1, 0x00, 0x00, /* 0x28-0x2B */ + 0xA8, 0xC2, 0xCB, 0xB0, 0xA8, 0xBF, 0xCB, 0xAF, /* 0x2C-0x2F */ + 0xCB, 0xAD, 0xA8, 0xC0, 0xA8, 0xBE, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xCD, 0xD8, 0xCD, 0xDB, 0xAA, 0xFD, /* 0x38-0x3B */ + 0xCD, 0xDA, 0xCD, 0xD9, 0x00, 0x00, 0xAA, 0xFC, /* 0x3C-0x3F */ + 0xAA, 0xFB, 0x00, 0x00, 0xAB, 0x40, 0xCD, 0xDC, /* 0x40-0x43 */ + 0xAA, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC6, 0xAD, 0xAE, /* 0x48-0x4B */ + 0xAD, 0xAF, 0xAD, 0xB0, 0xD0, 0xC7, 0xD0, 0xC3, /* 0x4C-0x4F */ + 0xAD, 0xAD, 0xD0, 0xC4, 0x00, 0x00, 0xD0, 0xC5, /* 0x50-0x53 */ + 0xD0, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xB0, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xA1, /* 0x58-0x5B */ + 0xD4, 0x45, 0xB0, 0xA2, 0xB0, 0xA5, 0xD4, 0x46, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB0, 0x7E, 0xB0, 0x7C, 0xB0, 0x7D, /* 0x60-0x63 */ + 0xB0, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xAD, 0xD8, 0x49, /* 0x68-0x6B */ + 0xB3, 0xB5, 0xD8, 0x48, 0x00, 0x00, 0xD8, 0x4B, /* 0x6C-0x6F */ + 0xB3, 0xB1, 0xD8, 0x4A, 0xB6, 0xAB, 0xB3, 0xAF, /* 0x70-0x73 */ + 0xB3, 0xB2, 0xB3, 0xAE, 0xB3, 0xB3, 0xB3, 0xB4, /* 0x74-0x77 */ + 0xB3, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xD8, 0x47, 0xB6, 0xA7, 0xDC, 0x7D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDC, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA2, /* 0x80-0x83 */ + 0xB6, 0xAC, 0xB6, 0xA8, 0xB6, 0xA9, 0xDC, 0x7C, /* 0x84-0x87 */ + 0xDC, 0x7E, 0xDC, 0xA1, 0xB6, 0xA4, 0xB6, 0xA6, /* 0x88-0x8B */ + 0x00, 0x00, 0xB6, 0xAA, 0xB6, 0xA5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0xD3, 0xE0, 0xD1, 0xE0, 0xD2, /* 0x90-0x93 */ + 0xB9, 0x6A, 0xB9, 0x6B, 0x00, 0x00, 0xE0, 0xD4, /* 0x94-0x97 */ + 0xB9, 0x69, 0xBB, 0xD8, 0x00, 0x00, 0xBB, 0xDA, /* 0x98-0x9B */ + 0xBB, 0xD9, 0x00, 0x00, 0xE4, 0xBB, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xBC, 0xE8, 0xAB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE8, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x47, /* 0xA4-0xA7 */ + 0xC0, 0x48, 0xEC, 0x4F, 0xC0, 0x49, 0x00, 0x00, /* 0xA8-0xAB */ + 0xEE, 0xF6, 0x00, 0x00, 0xEE, 0xF4, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEE, 0xF5, 0xC1, 0xF4, 0x00, 0x00, 0xF1, 0x6F, /* 0xB0-0xB3 */ + 0xC3, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xC1, 0xF5, 0xAB, 0x41, 0x00, 0x00, 0xB0, 0xA6, /* 0xB8-0xBB */ + 0xD4, 0x47, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x4C, /* 0xBC-0xBF */ + 0xB3, 0xB6, 0xB6, 0xAD, 0xDC, 0xA4, 0xDC, 0xA6, /* 0xC0-0xC3 */ + 0xB6, 0xAF, 0xB6, 0xAE, 0xB6, 0xB0, 0xB6, 0xB1, /* 0xC4-0xC7 */ + 0xDC, 0xA5, 0xB9, 0x6E, 0xB9, 0x6F, 0xB9, 0x6D, /* 0xC8-0xCB */ + 0xBB, 0xDB, 0xB9, 0x6C, 0xE0, 0xD5, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xDC, 0xE8, 0xAC, /* 0xD0-0xD3 */ + 0xEC, 0x50, 0xC0, 0x4A, 0xC1, 0xF6, 0xF1, 0x70, /* 0xD4-0xD7 */ + 0xF1, 0x74, 0xC2, 0xF9, 0xF1, 0x71, 0xC2, 0xFA, /* 0xD8-0xDB */ + 0xC2, 0xF8, 0xF1, 0x75, 0xC2, 0xFB, 0xF1, 0x73, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF3, 0x79, 0xC2, 0xF7, 0xC3, 0xF8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF8, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAB, 0x42, 0xB3, 0xB8, 0xB3, 0xB7, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xB2, /* 0xEC-0xEF */ + 0xDC, 0xA8, 0xDC, 0xA7, 0xB6, 0xB3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE0, 0xD9, 0xB9, 0x73, 0xB9, 0x70, /* 0xF4-0xF7 */ + 0xE0, 0xD8, 0xB9, 0x72, 0xE0, 0xD6, 0xB9, 0x71, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE0, 0xD7, 0x00, 0x00, 0xE4, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_97[512] = { + 0xBB, 0xDD, 0x00, 0x00, 0xE8, 0xAF, 0x00, 0x00, /* 0x00-0x03 */ + 0xBE, 0x5D, 0xE8, 0xAD, 0xBE, 0x5E, 0xBE, 0x5F, /* 0x04-0x07 */ + 0xE8, 0xAE, 0xBE, 0x60, 0x00, 0x00, 0xEC, 0x51, /* 0x08-0x0B */ + 0x00, 0x00, 0xC0, 0x4E, 0xC0, 0x4B, 0xC0, 0x50, /* 0x0C-0x0F */ + 0xEC, 0x53, 0xC0, 0x4C, 0xEC, 0x52, 0xC0, 0x4F, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x4D, 0x00, 0x00, /* 0x14-0x17 */ + 0xEE, 0xF9, 0xEE, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xC1, 0xF7, 0xEE, 0xFA, 0xC1, 0xF8, 0xEE, 0xF8, /* 0x1C-0x1F */ + 0xEE, 0xF7, 0x00, 0x00, 0xF1, 0x77, 0xF1, 0x76, /* 0x20-0x23 */ + 0xC2, 0xFC, 0xF1, 0x78, 0xF3, 0x7E, 0xC3, 0xFA, /* 0x24-0x27 */ + 0xF3, 0x7D, 0xF3, 0x7A, 0xC3, 0xF9, 0xF3, 0x7B, /* 0x28-0x2B */ + 0xF3, 0x7C, 0x00, 0x00, 0xF5, 0x48, 0xF5, 0x49, /* 0x2C-0x2F */ + 0xC4, 0xC5, 0x00, 0x00, 0xC5, 0x53, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC5, 0x51, 0xC5, 0x52, 0xF6, 0x6F, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xC5, 0xB4, 0xC5, 0xB5, 0xF7, 0x71, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x45, 0xF8, 0xCF, /* 0x40-0x43 */ + 0xC6, 0x47, 0x00, 0x00, 0xF8, 0xCE, 0xF8, 0xD0, /* 0x44-0x47 */ + 0xC6, 0x46, 0xF9, 0x57, 0x00, 0x00, 0xF9, 0xAD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x43, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0x74, 0x00, 0x00, /* 0x54-0x57 */ + 0xE4, 0xBE, 0x00, 0x00, 0xE8, 0xB0, 0xC0, 0x51, /* 0x58-0x5B */ + 0xC0, 0x52, 0x00, 0x00, 0xAB, 0x44, 0x00, 0x00, /* 0x5C-0x5F */ + 0xBE, 0x61, 0xC3, 0xFB, 0xAD, 0xB1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x53, 0x00, 0x00, /* 0x64-0x67 */ + 0xC5, 0xE2, 0xAD, 0xB2, 0xD8, 0x4D, 0x00, 0x00, /* 0x68-0x6B */ + 0xDC, 0xA9, 0x00, 0x00, 0xDC, 0xAB, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDC, 0xAA, 0x00, 0x00, 0xE0, 0xDD, 0xE0, 0xDA, /* 0x70-0x73 */ + 0xB9, 0x75, 0x00, 0x00, 0xB9, 0x76, 0xE0, 0xDB, /* 0x74-0x77 */ + 0xE0, 0xDC, 0x00, 0x00, 0xE4, 0xC0, 0xE4, 0xC5, /* 0x78-0x7B */ + 0xBB, 0xDE, 0xE4, 0xBF, 0xE4, 0xC1, 0xE4, 0xC8, /* 0x7C-0x7F */ + + 0xE4, 0xC3, 0xE4, 0xC7, 0xE4, 0xC4, 0xE4, 0xC2, /* 0x80-0x83 */ + 0xE4, 0xC6, 0xBB, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE8, 0xB3, 0x00, 0x00, 0xE8, 0xB1, 0xBE, 0x63, /* 0x88-0x8B */ + 0x00, 0x00, 0xBE, 0x62, 0xE8, 0xB2, 0xBE, 0x64, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xEC, 0x56, 0x00, 0x00, 0x00, 0x00, 0xEC, 0x55, /* 0x94-0x97 */ + 0xC0, 0x54, 0xEC, 0x54, 0xEE, 0xFC, 0x00, 0x00, /* 0x98-0x9B */ + 0xEE, 0xFE, 0xEF, 0x41, 0xEF, 0x40, 0x00, 0x00, /* 0x9C-0x9F */ + 0xC1, 0xF9, 0xEE, 0xFD, 0xF1, 0xA1, 0xC2, 0xFD, /* 0xA0-0xA3 */ + 0xF1, 0x7D, 0xF1, 0xA2, 0xC2, 0xFE, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF1, 0x7B, 0x00, 0x00, 0xF1, 0x7E, 0xF1, 0x7C, /* 0xA8-0xAB */ + 0xF1, 0x79, 0xC3, 0x40, 0xF1, 0x7A, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA3, 0xF3, 0xA2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF5, 0x4A, 0x00, 0x00, 0xF5, 0x4B, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0x70, /* 0xBC-0xBF */ + 0x00, 0x00, 0xC5, 0xB7, 0x00, 0x00, 0xC5, 0xB6, /* 0xC0-0xC3 */ + 0xF8, 0x4F, 0xF8, 0x50, 0xC6, 0x48, 0xF8, 0xD1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC6, 0x69, 0x00, 0x00, 0xAD, 0xB3, /* 0xC8-0xCB */ + 0xB6, 0xB4, 0xE4, 0xCA, 0xE4, 0xC9, 0xE8, 0xB5, /* 0xCC-0xCF */ + 0xE8, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xFA, /* 0xD0-0xD3 */ + 0xEF, 0x43, 0xEF, 0x42, 0xF1, 0xA5, 0xF1, 0xA3, /* 0xD4-0xD7 */ + 0xF1, 0xA6, 0xF1, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC3, 0xFC, 0xF3, 0xA4, 0xF3, 0xA5, 0xF3, 0xA6, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF6, 0x71, 0x00, 0x00, 0xF7, 0x72, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF8, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEC, 0x57, 0xEF, 0x44, 0x00, 0x00, 0xAD, 0xB5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xE0, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEC, 0x58, 0xC3, 0x41, 0xF1, 0xA7, 0xC3, 0xFD, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF5, 0x4C, 0xF5, 0x4D, 0xC5, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xF8, 0x51, 0xAD, 0xB6, 0xB3, 0xBB, 0xB3, 0xBC, /* 0x00-0x03 */ + 0xD8, 0x4E, 0xB6, 0xB5, 0xB6, 0xB6, 0xDC, 0xAC, /* 0x04-0x07 */ + 0xB6, 0xB7, 0x00, 0x00, 0xB9, 0x7A, 0x00, 0x00, /* 0x08-0x0B */ + 0xB9, 0x7C, 0xE0, 0xDF, 0xE0, 0xE0, 0xE0, 0xDE, /* 0x0C-0x0F */ + 0xB9, 0x77, 0xB9, 0x78, 0xB9, 0x7B, 0xB9, 0x79, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCB, 0xBB, 0xE1, /* 0x14-0x17 */ + 0xBB, 0xE2, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xBC, /* 0x18-0x1B */ + 0xBE, 0x67, 0xE8, 0xB7, 0xE8, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE8, 0xBB, 0xBE, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xC0, 0x5B, 0x00, 0x00, 0xE8, 0xB8, 0xE8, 0xBD, /* 0x24-0x27 */ + 0xE8, 0xBA, 0xE8, 0xB9, 0x00, 0x00, 0xBE, 0x66, /* 0x28-0x2B */ + 0x00, 0x00, 0xC0, 0x59, 0x00, 0x00, 0xEC, 0x5A, /* 0x2C-0x2F */ + 0xC0, 0x55, 0x00, 0x00, 0xEC, 0x5B, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xEC, 0x59, 0x00, 0x00, 0xC0, 0x58, /* 0x34-0x37 */ + 0xC0, 0x56, 0xC0, 0x5A, 0x00, 0x00, 0xC0, 0x57, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEF, 0x45, 0x00, 0x00, 0xEF, 0x4A, /* 0x40-0x43 */ + 0xEF, 0x46, 0xEF, 0x49, 0xC1, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xED, 0xD4, 0xEF, 0x48, 0xEF, 0x47, 0x00, 0x00, /* 0x48-0x4B */ + 0xC3, 0x44, 0xC3, 0x42, 0xC3, 0x45, 0xC3, 0x43, /* 0x4C-0x4F */ + 0xF1, 0xA8, 0xF1, 0xA9, 0xF1, 0xAA, 0xC3, 0x46, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAA, /* 0x54-0x57 */ + 0xC4, 0x40, 0xF3, 0xA8, 0x00, 0x00, 0xC4, 0x41, /* 0x58-0x5B */ + 0xF3, 0xA7, 0xF3, 0xA9, 0xC3, 0xFE, 0xF5, 0x51, /* 0x5C-0x5F */ + 0xF5, 0x4E, 0x00, 0x00, 0xF5, 0x4F, 0xF5, 0x50, /* 0x60-0x63 */ + 0xF6, 0x72, 0xC5, 0x56, 0x00, 0x00, 0xC5, 0x55, /* 0x64-0x67 */ + 0x00, 0x00, 0xF7, 0x74, 0xF7, 0x73, 0xC5, 0xB8, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xE3, /* 0x6C-0x6F */ + 0xC6, 0x49, 0xC6, 0x60, 0xF9, 0x58, 0xF9, 0xAE, /* 0x70-0x73 */ + 0xF9, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xAD, 0xB7, 0xDC, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0xE1, 0xE4, 0xCC, 0xE4, 0xCD, 0xBB, 0xE3, /* 0xAC-0xAF */ + 0x00, 0x00, 0xBB, 0xE4, 0xE8, 0xBE, 0xBE, 0x68, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xC1, 0xFC, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xAB, 0x00, 0x00, 0xC3, 0x47, 0xF3, 0xAD, /* 0xB8-0xBB */ + 0xC4, 0x42, 0xF3, 0xAC, 0xF3, 0xAE, 0xF3, 0xAB, /* 0xBC-0xBF */ + 0xF6, 0x75, 0xF5, 0x52, 0xF5, 0x53, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xC4, 0xC6, 0x00, 0x00, 0xF6, 0x74, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF6, 0x73, 0x00, 0x00, 0xF7, 0x75, /* 0xC8-0xCB */ + 0xF9, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xB8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xB9, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xA7, 0xD4, 0x48, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD8, 0x4F, 0x00, 0x00, 0xB6, 0xB8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xB6, 0xBB, 0xB6, 0xB9, 0xDC, 0xAE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xB6, 0xBD, 0x00, 0x00, 0xB6, 0xBA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xBC, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xB9, 0x7E, 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0xE3, 0xE8, 0xC0, 0x00, 0x00, /* 0xF8-0xFB */ + 0xB9, 0x7D, 0xB9, 0xA1, 0xB9, 0xA2, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0xE4, 0xCF, 0x00, 0x00, 0xE4, 0xCE, 0xBB, 0xE5, /* 0x00-0x03 */ + 0x00, 0x00, 0xBB, 0xE6, 0x00, 0x00, 0xE4, 0xD0, /* 0x04-0x07 */ + 0xE8, 0xBF, 0xBB, 0xE8, 0xBE, 0x69, 0x00, 0x00, /* 0x08-0x0B */ + 0xBB, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC0, 0x5C, 0xE8, 0xC1, 0xBE, 0x6B, 0xBE, 0x6A, /* 0x10-0x13 */ + 0xE8, 0xC2, 0xE8, 0xC5, 0xE8, 0xC3, 0xE8, 0xC4, /* 0x14-0x17 */ + 0xBE, 0x6C, 0x00, 0x00, 0xC0, 0x61, 0xC0, 0x5F, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x5E, 0xEC, 0x5D, /* 0x1C-0x1F */ + 0x00, 0x00, 0xC0, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xEC, 0x5C, 0xEF, 0x4B, 0x00, 0x00, 0xEC, 0x5E, /* 0x24-0x27 */ + 0xC0, 0x5D, 0xEC, 0x5F, 0xEF, 0x4E, 0xEF, 0x4C, /* 0x28-0x2B */ + 0xEF, 0x4D, 0xEF, 0x52, 0xC3, 0x4B, 0xEF, 0x51, /* 0x2C-0x2F */ + 0xEF, 0x54, 0xEF, 0x53, 0xEF, 0x50, 0xEF, 0x4F, /* 0x30-0x33 */ + 0x00, 0x00, 0xC1, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAE, 0x00, 0x00, /* 0x38-0x3B */ + 0xF1, 0xAD, 0xC3, 0x4A, 0xC3, 0x48, 0xC3, 0x49, /* 0x3C-0x3F */ + 0x00, 0x00, 0xF1, 0xAC, 0x00, 0x00, 0xF3, 0xB1, /* 0x40-0x43 */ + 0x00, 0x00, 0xC4, 0x43, 0x00, 0x00, 0xF3, 0xB0, /* 0x44-0x47 */ + 0xF3, 0xAF, 0xC4, 0x44, 0x00, 0x00, 0xF5, 0x58, /* 0x48-0x4B */ + 0xF5, 0x57, 0x00, 0x00, 0xF5, 0x55, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF5, 0x54, 0xC4, 0xC8, 0xC4, 0xC7, 0xF5, 0x59, /* 0x50-0x53 */ + 0xF7, 0x76, 0xC5, 0xB9, 0xF6, 0x77, 0xC5, 0x57, /* 0x54-0x57 */ + 0xF6, 0x76, 0xF5, 0x56, 0x00, 0x00, 0xF7, 0x77, /* 0x58-0x5B */ + 0xC5, 0xE4, 0x00, 0x00, 0xC6, 0x61, 0xF9, 0x59, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF9, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xBA, 0xD8, 0x50, /* 0x94-0x97 */ + 0xEF, 0x55, 0xAD, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xE4, 0xD2, 0xE4, 0xD1, 0xEC, 0x60, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEF, 0x57, 0x00, 0x00, 0xEF, 0x56, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xC3, 0x4C, 0xF3, 0xB2, 0xF3, 0xB3, /* 0xA4-0xA7 */ + 0xC4, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB2, /* 0xA8-0xAB */ + 0xB0, 0xA8, 0xB6, 0xBF, 0xB6, 0xBE, 0xE0, 0xE4, /* 0xAC-0xAF */ + 0xE0, 0xE6, 0xB9, 0xA4, 0xE0, 0xE5, 0xB9, 0xA3, /* 0xB0-0xB3 */ + 0xB9, 0xA5, 0xE0, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE4, 0xD4, 0xE4, 0xD6, 0xE4, 0xD5, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBB, 0xE9, 0xE4, 0xD7, 0xE4, 0xD3, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE8, 0xCC, 0x00, 0x00, 0xE8, 0xCF, /* 0xC8-0xCB */ + 0xE8, 0xD1, 0xE8, 0xC7, 0xE8, 0xCB, 0xE8, 0xC8, /* 0xCC-0xCF */ + 0xBE, 0x6E, 0xBE, 0x71, 0xBE, 0x73, 0xE8, 0xC9, /* 0xD0-0xD3 */ + 0xE8, 0xCA, 0xBE, 0x72, 0xE8, 0xCD, 0xE8, 0xD0, /* 0xD4-0xD7 */ + 0xE8, 0xCE, 0xBE, 0x74, 0x00, 0x00, 0xBE, 0x70, /* 0xD8-0xDB */ + 0xE8, 0xC6, 0xBE, 0x6D, 0x00, 0x00, 0xBE, 0x6F, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x63, 0xEC, 0x66, /* 0xE0-0xE3 */ + 0xEC, 0x64, 0xEC, 0x63, 0x00, 0x00, 0xEC, 0x69, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xEC, 0x68, 0xEC, 0x67, 0x00, 0x00, /* 0xE8-0xEB */ + 0xEC, 0x62, 0xC0, 0x62, 0xEC, 0x61, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEC, 0x65, 0xC0, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEF, 0x5A, 0x00, 0x00, 0xEF, 0x5E, 0xEF, 0x5B, /* 0xF4-0xF7 */ + 0xEF, 0x5D, 0xEF, 0x5C, 0xEF, 0x59, 0xEF, 0x5F, /* 0xF8-0xFB */ + 0xEF, 0x62, 0xEF, 0x60, 0xEF, 0x61, 0xC2, 0x40, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xC1, 0xFE, 0xEF, 0x58, 0xEF, 0x63, /* 0x00-0x03 */ + 0xF1, 0xB3, 0xF1, 0xB6, 0xF1, 0xB8, 0xF1, 0xB7, /* 0x04-0x07 */ + 0x00, 0x00, 0xF1, 0xB1, 0xF1, 0xB5, 0xF1, 0xB0, /* 0x08-0x0B */ + 0x00, 0x00, 0xF1, 0xB2, 0xC3, 0x4D, 0xF1, 0xAF, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF1, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF3, 0xC0, 0xF3, 0xB5, 0xC4, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xC4, 0x46, 0xF3, 0xB4, 0xF3, 0xB9, /* 0x18-0x1B */ + 0xF3, 0xBF, 0xF3, 0xB7, 0xF3, 0xBE, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF3, 0xBB, 0x00, 0x00, 0xF3, 0xBA, 0xF3, 0xBD, /* 0x20-0x23 */ + 0xF3, 0xB8, 0xF3, 0xB6, 0x00, 0x00, 0xF3, 0xBC, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0x60, 0xF5, 0x5E, 0xC4, 0xCA, /* 0x28-0x2B */ + 0xF5, 0x5D, 0xF5, 0x63, 0xF5, 0x61, 0x00, 0x00, /* 0x2C-0x2F */ + 0xC4, 0xCB, 0xF5, 0x5C, 0xF5, 0x5A, 0x00, 0x00, /* 0x30-0x33 */ + 0xF5, 0x5B, 0xC4, 0xCD, 0xF5, 0x5F, 0xC4, 0xCC, /* 0x34-0x37 */ + 0xF5, 0x62, 0xF6, 0x78, 0xF6, 0x7E, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF6, 0x79, 0xC5, 0x5B, 0xF6, 0xA1, /* 0x3C-0x3F */ + 0xC5, 0x5A, 0xF6, 0x7D, 0xF6, 0x7C, 0xC5, 0x59, /* 0x40-0x43 */ + 0xF6, 0x7B, 0xC5, 0x58, 0xF6, 0x7A, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0x7D, 0xF7, 0xA1, 0xF7, 0x7E, 0x00, 0x00, /* 0x48-0x4B */ + 0xF7, 0x7B, 0xC5, 0xBB, 0xF7, 0x78, 0xF7, 0x7C, /* 0x4C-0x4F */ + 0xF7, 0xA3, 0x00, 0x00, 0xF7, 0xA2, 0xF7, 0x79, /* 0x50-0x53 */ + 0xF7, 0x7A, 0xC5, 0xBA, 0xF8, 0x52, 0xC5, 0xE7, /* 0x54-0x57 */ + 0x00, 0x00, 0xF8, 0x53, 0xC5, 0xE5, 0xC5, 0xE6, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD3, 0xC6, 0x4A, /* 0x5C-0x5F */ + 0xF9, 0x76, 0x00, 0x00, 0xC6, 0x6A, 0x00, 0x00, /* 0x60-0x63 */ + 0xF9, 0xB3, 0xC6, 0x6B, 0xF9, 0xB4, 0xF9, 0xB5, /* 0x64-0x67 */ + 0xF9, 0xC3, 0xF9, 0xC2, 0xC6, 0x7A, 0xF9, 0xCD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB0, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE0, 0xE8, 0x00, 0x00, 0xBB, 0xEA, /* 0xAC-0xAF */ + 0xBB, 0xEB, 0xE4, 0xDA, 0x00, 0x00, 0xE8, 0xD2, /* 0xB0-0xB3 */ + 0xEC, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xBE, 0x75, /* 0xB4-0xB7 */ + 0xC0, 0x65, 0xEC, 0x6A, 0x00, 0x00, 0xEC, 0x6D, /* 0xB8-0xBB */ + 0xC0, 0x66, 0x00, 0x00, 0xEF, 0x64, 0xEC, 0x6B, /* 0xBC-0xBF */ + 0xF1, 0xB9, 0xC3, 0x4E, 0xF3, 0xC1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x66, 0xF5, 0x64, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x65, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF6, 0xA2, 0x00, 0x00, 0xC5, 0x5C, /* 0xCC-0xCF */ + 0xF7, 0xA4, 0xC5, 0xEA, 0xC5, 0xBC, 0xC5, 0xE8, /* 0xD0-0xD3 */ + 0xC5, 0xE9, 0xF8, 0xD4, 0xC6, 0x62, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xB0, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF1, 0xBA, 0x00, 0x00, 0x00, 0x00, 0xD4, 0x49, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB9, 0xA6, 0x00, 0x00, 0xE4, 0xDB, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xEC, 0xE4, 0xDC, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD4, /* 0xE8-0xEB */ + 0xE8, 0xD3, 0xC0, 0x68, 0xBE, 0x76, 0xBE, 0x77, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE8, 0xD7, 0xE8, 0xD6, 0xE8, 0xD5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0x6E, 0xEC, 0x71, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEC, 0x70, 0xEC, 0x6F, 0xC0, 0x67, /* 0xF8-0xFB */ + 0xEF, 0x68, 0xEF, 0x66, 0xEF, 0x65, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0xEF, 0x67, 0x00, 0x00, 0xC3, 0x4F, /* 0x00-0x03 */ + 0xF1, 0xBC, 0xF1, 0xBD, 0xC3, 0x50, 0x00, 0x00, /* 0x04-0x07 */ + 0xF1, 0xBB, 0x00, 0x00, 0xF3, 0xC3, 0xF3, 0xC2, /* 0x08-0x0B */ + 0xF3, 0xC5, 0xC4, 0x47, 0xF3, 0xC4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0x67, 0xF5, 0x69, 0xF5, 0x68, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xF6, 0xA3, 0xF6, 0xA6, 0xF6, 0xA4, /* 0x14-0x17 */ + 0xF6, 0xA5, 0xF7, 0xA5, 0xC5, 0xBD, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0x54, 0xF8, 0x55, /* 0x1C-0x1F */ + 0xF8, 0x56, 0x00, 0x00, 0xC6, 0x4B, 0xC6, 0x63, /* 0x20-0x23 */ + 0xF9, 0xB6, 0xB0, 0xAB, 0x00, 0x00, 0xBE, 0x78, /* 0x24-0x27 */ + 0xC0, 0x69, 0xF1, 0xBE, 0x00, 0x00, 0xF7, 0xA6, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC4, 0xD4, 0x4A, /* 0x2C-0x2F */ + 0x00, 0x00, 0xC6, 0x7B, 0xB0, 0xAC, 0xEC, 0x72, /* 0x30-0x33 */ + 0x00, 0x00, 0xF1, 0xBF, 0x00, 0x00, 0xF3, 0xC6, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA7, 0xF7, 0xA7, /* 0x38-0x3B */ + 0xB0, 0xAD, 0x00, 0x00, 0xE4, 0xDD, 0xE4, 0xDE, /* 0x3C-0x3F */ + 0x00, 0x00, 0xBB, 0xED, 0xBB, 0xEE, 0xE8, 0xD9, /* 0x40-0x43 */ + 0xBE, 0x7A, 0xBE, 0x79, 0xE8, 0xD8, 0x00, 0x00, /* 0x44-0x47 */ + 0xEF, 0x69, 0x00, 0x00, 0xF1, 0xC0, 0xF1, 0xC2, /* 0x48-0x4B */ + 0xF1, 0xC1, 0xC3, 0x53, 0xC3, 0x52, 0xC3, 0x51, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC5, 0x5E, 0xF6, 0xA8, 0x00, 0x00, /* 0x50-0x53 */ + 0xC5, 0x5D, 0xF7, 0xA9, 0xF7, 0xA8, 0x00, 0x00, /* 0x54-0x57 */ + 0xC6, 0x4C, 0xF8, 0xD5, 0xB3, 0xBD, 0xE0, 0xEA, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE1, /* 0x5C-0x5F */ + 0xE4, 0xDF, 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xE2, 0x00, 0x00, 0xE8, 0xDD, 0xE8, 0xDA, /* 0x64-0x67 */ + 0xE8, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xBE, 0x7C, /* 0x6C-0x6F */ + 0xE8, 0xE0, 0xE8, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE8, 0xDB, 0xE8, 0xDF, 0xE8, 0xDE, 0xBE, 0x7B, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0x7D, 0xEC, 0x78, /* 0x78-0x7B */ + 0xEC, 0x76, 0xEC, 0xA1, 0xEC, 0x77, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEC, 0x73, 0x00, 0x00, 0xEC, 0x79, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xEC, 0x74, 0xEF, 0x72, 0xEC, 0x75, /* 0x84-0x87 */ + 0xEC, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEC, 0x7C, 0xC0, 0x6A, 0xEC, 0x7B, 0xEC, 0x7A, /* 0x90-0x93 */ + 0x00, 0x00, 0xEC, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x6A, 0xEF, 0x6D, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x6C, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEF, 0x74, 0xEF, 0x6F, 0xEF, 0x73, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEF, 0x71, 0xEF, 0x70, 0xEF, 0x6E, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xEF, 0x6B, 0x00, 0x00, 0xC2, 0x43, 0xC2, 0x42, /* 0xA8-0xAB */ + 0x00, 0x00, 0xC2, 0x44, 0xC2, 0x41, 0xEF, 0x75, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF1, 0xC8, 0xF1, 0xCB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xC9, 0xF1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xF1, 0xCE, 0x00, 0x00, 0xF1, 0xC6, /* 0xBC-0xBF */ + 0xC3, 0x58, 0xF1, 0xC7, 0x00, 0x00, 0xF1, 0xC5, /* 0xC0-0xC3 */ + 0xF1, 0xCC, 0x00, 0x00, 0xF1, 0xC4, 0xF1, 0xC3, /* 0xC4-0xC7 */ + 0xC3, 0x57, 0xC3, 0x55, 0xC3, 0x54, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCA, /* 0xD0-0xD3 */ + 0xF3, 0xCF, 0xF3, 0xD5, 0xC4, 0x4A, 0xF3, 0xD0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF3, 0xD3, 0xF3, 0xD7, 0xC4, 0x4B, /* 0xD8-0xDB */ + 0xF3, 0xD2, 0x00, 0x00, 0xF3, 0xCA, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF3, 0xC9, 0xF3, 0xD6, 0xF3, 0xCD, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF3, 0xCB, 0xF3, 0xD4, 0xF3, 0xCC, 0xC4, 0x49, /* 0xE4-0xE7 */ + 0xC4, 0x48, 0x00, 0x00, 0xF3, 0xC7, 0xF3, 0xC8, /* 0xE8-0xEB */ + 0xF3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF3, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0x6C, /* 0xF4-0xF7 */ + 0xF5, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xC3, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xF5, 0x6D, 0xF5, 0x73, 0xF5, 0x71, /* 0x04-0x07 */ + 0xF5, 0x6B, 0xF5, 0x76, 0x00, 0x00, 0xF5, 0x6A, /* 0x08-0x0B */ + 0x00, 0x00, 0xC4, 0xCF, 0xF5, 0x72, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x6E, 0xC4, 0xCE, /* 0x10-0x13 */ + 0xF5, 0x75, 0x00, 0x00, 0x00, 0x00, 0xF5, 0x74, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xF6, 0xAB, 0xF6, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF6, 0xB1, 0x00, 0x00, 0xF6, 0xAD, /* 0x20-0x23 */ + 0xF6, 0xB0, 0xC5, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF6, 0xAE, 0xF6, 0xAF, 0x00, 0x00, 0xF6, 0xA9, /* 0x28-0x2B */ + 0xF6, 0xAC, 0xC5, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xC5, 0xBF, 0xF7, 0xB4, 0xF7, 0xAF, /* 0x30-0x33 */ + 0xF7, 0xB3, 0x00, 0x00, 0xF7, 0xB6, 0xF7, 0xB2, /* 0x34-0x37 */ + 0x00, 0x00, 0xF7, 0xAE, 0x00, 0x00, 0xC5, 0xC1, /* 0x38-0x3B */ + 0xF7, 0xB1, 0xF7, 0xB5, 0xC5, 0xC0, 0xF7, 0xAC, /* 0x3C-0x3F */ + 0xF5, 0x70, 0xF7, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF7, 0xAD, 0x00, 0x00, 0xF7, 0xAA, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0xAB, 0xC5, 0xBE, 0xF8, 0x5A, 0xF8, 0x5C, /* 0x48-0x4B */ + 0xF8, 0x5F, 0xF8, 0x5B, 0xF8, 0x60, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF8, 0x59, 0x00, 0x00, 0xF8, 0x57, 0x00, 0x00, /* 0x50-0x53 */ + 0xC5, 0xEB, 0xF8, 0x5D, 0xC5, 0xED, 0xC5, 0xEC, /* 0x54-0x57 */ + 0xF8, 0x58, 0xF8, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDA, 0xC6, 0x4D, /* 0x5C-0x5F */ + 0xF8, 0xDB, 0x00, 0x00, 0xF8, 0xD9, 0xF8, 0xD6, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD8, 0xF8, 0xD7, /* 0x64-0x67 */ + 0xF9, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0x5C, 0xF9, 0x5B, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF9, 0x79, 0x00, 0x00, 0xF9, 0x78, /* 0x70-0x73 */ + 0xF9, 0x77, 0xF9, 0x7A, 0x00, 0x00, 0xC6, 0x73, /* 0x74-0x77 */ + 0xC6, 0x74, 0xF9, 0xCA, 0xF9, 0xCE, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xB3, 0xBE, 0xDC, 0xAF, 0xE0, 0xED, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xB9, 0xA7, 0xE0, 0xEB, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE0, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE4, 0xE2, 0xE4, 0xE3, 0xBB, 0xF1, /* 0xF0-0xF3 */ + 0xBB, 0xEF, 0xE4, 0xE4, 0xBB, 0xF0, 0xE8, 0xE8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE8, 0xEB, 0xE8, 0xE5, 0xE8, 0xEC, /* 0xF8-0xFB */ + 0xE8, 0xE4, 0xE8, 0xE6, 0x00, 0x00, 0xE8, 0xE7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9D[512] = { + 0xE8, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xBE, 0xA1, /* 0x00-0x03 */ + 0xE8, 0xEF, 0xE8, 0xEE, 0xBE, 0x7D, 0xE8, 0xE9, /* 0x04-0x07 */ + 0xE8, 0xED, 0xBE, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xEC, 0xAC, 0x00, 0x00, 0xC0, 0x6F, 0x00, 0x00, /* 0x10-0x13 */ + 0xEC, 0xA7, 0xC0, 0x6B, 0x00, 0x00, 0xEC, 0xA4, /* 0x14-0x17 */ + 0xEC, 0xAA, 0xEC, 0xAD, 0x00, 0x00, 0xC0, 0x70, /* 0x18-0x1B */ + 0x00, 0x00, 0xEC, 0xA9, 0xEC, 0xA6, 0xEC, 0xAE, /* 0x1C-0x1F */ + 0xEC, 0xA5, 0x00, 0x00, 0xEC, 0xAB, 0xC0, 0x6C, /* 0x20-0x23 */ + 0x00, 0x00, 0xEC, 0xA3, 0xC0, 0x6D, 0x00, 0x00, /* 0x24-0x27 */ + 0xC0, 0x6E, 0xEC, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xA9, 0xEF, 0x7A, 0xEF, 0x7B, /* 0x2C-0x2F */ + 0xEF, 0x7E, 0xEF, 0x7C, 0x00, 0x00, 0xEF, 0x76, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x79, 0xEF, 0xA5, /* 0x34-0x37 */ + 0xEF, 0x7D, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x45, /* 0x38-0x3B */ + 0x00, 0x00, 0xEF, 0xA7, 0xEF, 0xA4, 0xC2, 0x46, /* 0x3C-0x3F */ + 0xEF, 0xA6, 0xEF, 0x77, 0xEF, 0xA2, 0xEF, 0xA3, /* 0x40-0x43 */ + 0x00, 0x00, 0xEF, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD2, 0xF1, 0xD4, /* 0x48-0x4B */ + 0xF1, 0xD7, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC3, 0x59, 0xF1, 0xD9, 0xF1, 0xD0, /* 0x50-0x53 */ + 0xF1, 0xDA, 0x00, 0x00, 0xF1, 0xD6, 0xF1, 0xD8, /* 0x54-0x57 */ + 0xF1, 0xDC, 0xF1, 0xD5, 0xF1, 0xDD, 0xF1, 0xD3, /* 0x58-0x5B */ + 0xF1, 0xCF, 0xC3, 0x5A, 0x00, 0x00, 0xF1, 0xDB, /* 0x5C-0x5F */ + 0xC3, 0x5B, 0xC4, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0x78, /* 0x64-0x67 */ + 0xF3, 0xF1, 0xF3, 0xE8, 0xC4, 0x4F, 0xF3, 0xE4, /* 0x68-0x6B */ + 0xC4, 0x50, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xED, /* 0x6C-0x6F */ + 0xF3, 0xE7, 0xF3, 0xDD, 0xC4, 0x4E, 0xF3, 0xEA, /* 0x70-0x73 */ + 0xF3, 0xE5, 0xF3, 0xE6, 0x00, 0x00, 0xF3, 0xD8, /* 0x74-0x77 */ + 0xF3, 0xDF, 0xF3, 0xEE, 0x00, 0x00, 0xF3, 0xEB, /* 0x78-0x7B */ + 0x00, 0x00, 0xF3, 0xE3, 0x00, 0x00, 0xF3, 0xEF, /* 0x7C-0x7F */ + + 0xF3, 0xDE, 0xF3, 0xD9, 0xF3, 0xEC, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xDB, 0xF3, 0xE9, 0xF3, 0xE0, 0xF3, 0xF0, /* 0x84-0x87 */ + 0xF3, 0xDC, 0xC4, 0x4C, 0xF3, 0xDA, 0xF3, 0xE1, /* 0x88-0x8B */ + 0xF3, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xF5, 0x7D, 0x00, 0x00, 0xF5, 0x7B, 0x00, 0x00, /* 0x90-0x93 */ + 0xF5, 0xA2, 0x00, 0x00, 0xF5, 0xAE, 0xF5, 0xA5, /* 0x94-0x97 */ + 0xF5, 0x7C, 0xF5, 0x78, 0xF5, 0xA7, 0xF5, 0x7E, /* 0x98-0x9B */ + 0xF5, 0xA3, 0xF5, 0x7A, 0xF5, 0xAA, 0xF5, 0x77, /* 0x9C-0x9F */ + 0xF5, 0xA1, 0xF5, 0xA6, 0xF5, 0xA8, 0xF5, 0xAB, /* 0xA0-0xA3 */ + 0xF5, 0x79, 0x00, 0x00, 0xF5, 0xAF, 0xF5, 0xB0, /* 0xA4-0xA7 */ + 0xF5, 0xA9, 0xF5, 0xAD, 0xF5, 0xA4, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF6, 0xC1, 0xF6, 0xC4, 0x00, 0x00, 0xC5, 0x61, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF6, 0xC3, 0xF6, 0xC8, 0xF6, 0xC6, /* 0xB0-0xB3 */ + 0xC5, 0x62, 0xF6, 0xBD, 0xF6, 0xB3, 0xF6, 0xB2, /* 0xB4-0xB7 */ + 0xC5, 0x64, 0xF6, 0xBF, 0xF6, 0xC0, 0xF6, 0xBC, /* 0xB8-0xBB */ + 0xF6, 0xB4, 0x00, 0x00, 0xF6, 0xB9, 0xF5, 0xAC, /* 0xBC-0xBF */ + 0x00, 0x00, 0xF6, 0xB5, 0xC5, 0x63, 0xF6, 0xBB, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xBA, 0x00, 0x00, 0xF6, 0xB6, /* 0xC4-0xC7 */ + 0xF6, 0xC2, 0x00, 0x00, 0xF6, 0xB7, 0xF7, 0xBB, /* 0xC8-0xCB */ + 0xF6, 0xC5, 0xF6, 0xC7, 0xF6, 0xBE, 0xF6, 0xB8, /* 0xCC-0xCF */ + 0xF7, 0xBC, 0xF7, 0xBE, 0xF7, 0xB8, 0xC5, 0xC2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF7, 0xC5, 0xF7, 0xC3, 0xC5, 0xC3, /* 0xD4-0xD7 */ + 0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xBA, 0xF7, 0xB7, /* 0xD8-0xDB */ + 0xF7, 0xBD, 0xF7, 0xC6, 0xF7, 0xB9, 0xF7, 0xBF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF8, 0x69, 0xF8, 0x6E, 0xF8, 0x64, /* 0xE0-0xE3 */ + 0xF8, 0x67, 0xC5, 0xEE, 0xF8, 0x6B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF8, 0x72, 0xF7, 0xC0, 0x00, 0x00, 0xF8, 0x65, /* 0xE8-0xEB */ + 0xF8, 0x6F, 0xF8, 0x73, 0xF8, 0x6A, 0xF8, 0x63, /* 0xEC-0xEF */ + 0xF8, 0x6D, 0x00, 0x00, 0xF8, 0x6C, 0xF8, 0x71, /* 0xF0-0xF3 */ + 0xF8, 0x70, 0xF7, 0xC4, 0xF8, 0x68, 0xF8, 0x62, /* 0xF4-0xF7 */ + 0xF8, 0x66, 0xC6, 0x4E, 0xC6, 0x4F, 0xF8, 0x61, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF8, 0xE6, 0xF8, 0xDD, 0xF8, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0xF8, 0xE2, 0xF8, 0xE3, 0xF8, 0xDC, 0xF8, 0xDF, /* 0x00-0x03 */ + 0xF8, 0xE7, 0xF8, 0xE1, 0xF8, 0xE0, 0xF8, 0xDE, /* 0x04-0x07 */ + 0x00, 0x00, 0xF8, 0xE4, 0x00, 0x00, 0xF9, 0x5D, /* 0x08-0x0B */ + 0x00, 0x00, 0xF9, 0x5E, 0x00, 0x00, 0xF9, 0x60, /* 0x0C-0x0F */ + 0xF9, 0x5F, 0xF9, 0x62, 0xF9, 0x61, 0xF9, 0x7C, /* 0x10-0x13 */ + 0xF9, 0x7B, 0xF9, 0xB7, 0x00, 0x00, 0xF9, 0xB8, /* 0x14-0x17 */ + 0x00, 0x00, 0xF9, 0xC5, 0xC6, 0x78, 0xC6, 0x7C, /* 0x18-0x1B */ + 0x00, 0x00, 0xF9, 0xCF, 0xC6, 0x7D, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xB3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xC4, 0xD0, 0xF6, 0xC9, 0x00, 0x00, /* 0x78-0x7B */ + 0xC6, 0x50, 0xC6, 0x51, 0x00, 0x00, 0xB3, 0xC0, /* 0x7C-0x7F */ + + 0xE0, 0xEE, 0x00, 0x00, 0xB9, 0xA8, 0xE8, 0xF0, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xB0, 0xEC, 0xB1, /* 0x84-0x87 */ + 0xEC, 0xAF, 0xEF, 0xAB, 0xEF, 0xAA, 0xC2, 0x47, /* 0x88-0x8B */ + 0xF1, 0xDF, 0xEF, 0xAC, 0xF1, 0xDE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF3, 0xF3, 0xC4, 0x51, 0xC4, 0x53, /* 0x90-0x93 */ + 0xF3, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x52, /* 0x94-0x97 */ + 0x00, 0x00, 0xF5, 0xB1, 0xF5, 0xB3, 0xF5, 0xB2, /* 0x98-0x9B */ + 0xF6, 0xCA, 0xC5, 0x65, 0x00, 0x00, 0xC5, 0xEF, /* 0x9C-0x9F */ + 0xF8, 0xE8, 0xF9, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF9, 0xD2, 0xB3, 0xC1, 0x00, 0x00, 0xE4, 0xE5, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xBE, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEC, 0xB3, 0xEC, 0xB2, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEF, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xC4, 0x54, 0xC4, 0xD1, 0xF7, 0xC7, 0xF9, 0xCB, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xC2, /* 0xB8-0xBB */ + 0xBB, 0xF2, 0x00, 0x00, 0xBE, 0xA3, 0x00, 0x00, /* 0xBC-0xBF */ + 0xF3, 0xF4, 0x00, 0x00, 0xF8, 0x74, 0xB6, 0xC0, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xEF, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xC6, 0x64, 0xB6, 0xC1, 0xBE, 0xA4, 0xC2, 0x48, /* 0xCC-0xCF */ + 0xF8, 0x75, 0xB6, 0xC2, 0x00, 0x00, 0xE8, 0xF1, /* 0xD0-0xD3 */ + 0xC0, 0x72, 0xEC, 0xB4, 0xEC, 0xB5, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC0, 0x71, 0x00, 0x00, 0xEF, 0xAF, 0xC2, 0x4C, /* 0xD8-0xDB */ + 0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x49, 0xF1, 0xE0, /* 0xDC-0xDF */ + 0xC3, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF5, 0xB5, 0xF5, 0xB4, 0xF5, 0xB7, 0xF5, 0xB6, /* 0xE4-0xE7 */ + 0xC4, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCB, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xCD, 0xF6, 0xCC, 0xC5, 0x66, /* 0xEC-0xEF */ + 0xF7, 0xC8, 0x00, 0x00, 0xF8, 0x76, 0xF8, 0x77, /* 0xF0-0xF3 */ + 0xC5, 0xF0, 0xF9, 0x64, 0xF9, 0x7D, 0xC6, 0x75, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDC, 0xB0, 0xEC, 0xB6, 0xEF, 0xB0, /* 0xF8-0xFB */ + 0xF3, 0xF5, 0xE0, 0xEF, 0x00, 0x00, 0xEF, 0xB1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0xF1, 0xE2, 0xF1, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0x78, 0xC6, 0x52, /* 0x04-0x07 */ + 0x00, 0x00, 0xF9, 0x65, 0xF9, 0x7E, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xA9, 0xE8, 0xF2, /* 0x0C-0x0F */ + 0xE8, 0xF3, 0x00, 0x00, 0xEC, 0xB7, 0xB9, 0xAA, /* 0x10-0x13 */ + 0x00, 0x00, 0xC3, 0x5D, 0xF1, 0xE3, 0x00, 0x00, /* 0x14-0x17 */ + 0xF6, 0xCF, 0xC5, 0x67, 0xF6, 0xD0, 0xF6, 0xCE, /* 0x18-0x1B */ + 0xF8, 0x79, 0x00, 0x00, 0xF8, 0xE9, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB9, 0xAB, 0x00, 0x00, 0xEF, 0xB4, 0xEF, 0xB3, /* 0x20-0x23 */ + 0xEF, 0xB2, 0xF1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF1, 0xE8, 0xF1, 0xE7, 0xF1, 0xE6, 0xF1, 0xE5, /* 0x28-0x2B */ + 0xC3, 0x5E, 0xF3, 0xF6, 0xF5, 0xB9, 0xC4, 0xD3, /* 0x2C-0x2F */ + 0xF5, 0xB8, 0xF6, 0xD1, 0xF7, 0xCB, 0xF7, 0xCA, /* 0x30-0x33 */ + 0xC5, 0xC4, 0xF7, 0xC9, 0xF8, 0x7C, 0xF8, 0x7B, /* 0x34-0x37 */ + 0xF8, 0x7A, 0x00, 0x00, 0x00, 0x00, 0xBB, 0xF3, /* 0x38-0x3B */ + 0x00, 0x00, 0xEC, 0xB8, 0xC2, 0x4D, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF3, 0xF7, 0xF3, 0xF8, 0xF7, 0xCC, 0xF8, 0x7D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xEA, 0xF9, 0x66, /* 0x44-0x47 */ + 0xF9, 0xB9, 0xF9, 0xD4, 0xBB, 0xF4, 0xC2, 0x4E, /* 0x48-0x4B */ + 0xF1, 0xE9, 0xF3, 0xF9, 0xF6, 0xD2, 0xF8, 0x7E, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0xA6, 0x00, 0x00, /* 0x50-0x53 */ + 0xEF, 0xB5, 0xF1, 0xEA, 0xF3, 0xFA, 0xF3, 0xFB, /* 0x54-0x57 */ + 0xF3, 0xFC, 0xF5, 0xBE, 0x00, 0x00, 0xF5, 0xBA, /* 0x58-0x5B */ + 0xC5, 0x68, 0xF5, 0xBD, 0xF5, 0xBC, 0xC4, 0xD4, /* 0x5C-0x5F */ + 0xF5, 0xBB, 0xC4, 0xD6, 0x00, 0x00, 0xC4, 0xD5, /* 0x60-0x63 */ + 0xF6, 0xD4, 0xF6, 0xD3, 0xC5, 0x69, 0xC5, 0x6A, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xC6, 0xF7, 0xCD, /* 0x68-0x6B */ + 0xC5, 0xC5, 0x00, 0x00, 0xF8, 0xA3, 0xF8, 0xA4, /* 0x6C-0x6F */ + 0xF8, 0xA2, 0xF8, 0xA1, 0xC6, 0x54, 0x00, 0x00, /* 0x70-0x73 */ + 0xF8, 0xEB, 0xF8, 0xEC, 0xF8, 0xED, 0xC6, 0x53, /* 0x74-0x77 */ + 0xF9, 0x67, 0xF9, 0x6A, 0xF9, 0x69, 0xF9, 0x68, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xD3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xC0, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xC3, 0x65, 0xF5, 0xBF, 0xF6, 0xD5, 0x00, 0x00, /* 0x90-0x93 */ + 0xC5, 0xC7, 0xF7, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xF9, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xC0, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEF, 0xB6, 0x00, 0x00, 0xF7, 0xCF, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF9, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xB0, 0x5A, 0xA7, 0xF3, 0xA8, 0xAE, 0xB8, 0xEB, /* 0x00-0x03 */ + 0xB7, 0xC6, 0xA6, 0xEA, 0xA5, 0x79, 0xC0, 0x74, /* 0x04-0x07 */ + 0xC0, 0x74, 0xAB, 0xB4, 0xAA, 0xF7, 0xB3, 0xE2, /* 0x08-0x0B */ + 0xA9, 0x60, 0xC3, 0x69, 0xC4, 0xEE, 0xC3, 0xB9, /* 0x0C-0x0F */ + 0xC5, 0xDA, 0xC1, 0xB3, 0xBB, 0x72, 0xC5, 0xDE, /* 0x10-0x13 */ + 0xBC, 0xD6, 0xAC, 0xA5, 0xAF, 0x4F, 0xAF, 0x5F, /* 0x14-0x17 */ + 0xB8, 0xA8, 0xB9, 0x54, 0xC0, 0x64, 0xB6, 0xC3, /* 0x18-0x1B */ + 0xA7, 0x5A, 0xC4, 0xE6, 0xC4, 0xEA, 0xC4, 0xF5, /* 0x1C-0x1F */ + 0xC6, 0x7D, 0xB4, 0x50, 0xC0, 0xDD, 0xC2, 0xC5, /* 0x20-0x23 */ + 0xC4, 0xB0, 0xA9, 0xD4, 0xC3, 0xBE, 0xC4, 0xFA, /* 0x24-0x27 */ + 0xB4, 0x59, 0xAE, 0xD4, 0xAE, 0xF6, 0xAF, 0x54, /* 0x28-0x2B */ + 0x00, 0x00, 0xA8, 0xD3, 0xA7, 0x4E, 0xB3, 0xD2, /* 0x2C-0x2F */ + 0xBE, 0xDB, 0xC3, 0x72, 0xC4, 0x6C, 0xBF, 0x63, /* 0x30-0x33 */ + 0xA6, 0xD1, 0xC4, 0xAA, 0xB8, 0xB8, 0xB8, 0xF4, /* 0x34-0x37 */ + 0xC5, 0x53, 0xBE, 0x7C, 0xC6, 0x4F, 0xB8, 0x4C, /* 0x38-0x3B */ + 0xB8, 0x53, 0xBA, 0xF1, 0xDB, 0x77, 0xBF, 0xFD, /* 0x3C-0x3F */ + 0xB3, 0xC0, 0xBD, 0xD7, 0xC3, 0x62, 0xA7, 0xCB, /* 0x40-0x43 */ + 0xC5, 0xA2, 0xC5, 0xA4, 0xA8, 0x63, 0xBD, 0x55, /* 0x44-0x47 */ + 0xB8, 0xEF, 0xB9, 0x70, 0xC2, 0x53, 0xB9, 0xF0, /* 0x48-0x4B */ + 0xBC, 0xD3, 0xB2, 0x5C, 0xBA, 0x7C, 0xB2, 0xD6, /* 0x4C-0x4F */ + 0xC1, 0x5C, 0xAD, 0xAE, 0xB0, 0xC7, 0xA6, 0xD8, /* 0x50-0x53 */ + 0xBB, 0xFE, 0xAD, 0xE2, 0xB8, 0x57, 0xBA, 0xF0, /* 0x54-0x57 */ + 0xB5, 0xD9, 0xB3, 0xAE, 0xC5, 0xAA, 0xCE, 0xD4, /* 0x58-0x5B */ + 0xBC, 0xD6, 0xBF, 0xD5, 0xA4, 0xA6, 0xB9, 0xE7, /* 0x5C-0x5F */ + 0xAB, 0xE3, 0xB2, 0x76, 0xB2, 0xA7, 0xA5, 0x5F, /* 0x60-0x63 */ + 0xED, 0xA8, 0xAB, 0x4B, 0xB4, 0x5F, 0xA4, 0xA3, /* 0x64-0x67 */ + 0xAA, 0x63, 0xBC, 0xC6, 0xAF, 0xC1, 0xB0, 0xD1, /* 0x68-0x6B */ + 0xB6, 0xEB, 0xAC, 0xD9, 0xB8, 0xAD, 0xBB, 0xA1, /* 0x6C-0x6F */ + 0xB1, 0xFE, 0xA8, 0xB0, 0xA8, 0x48, 0xAC, 0x42, /* 0x70-0x73 */ + 0xAD, 0x59, 0xB1, 0xB0, 0xB2, 0xA4, 0xAB, 0x47, /* 0x74-0x77 */ + 0xA8, 0xE2, 0x00, 0x00, 0xB1, 0xE7, 0xC2, 0xB3, /* 0x78-0x7B */ + 0xA8, 0x7D, 0xBD, 0xCC, 0xB6, 0x71, 0xC0, 0x79, /* 0x7C-0x7F */ + + 0xA7, 0x66, 0xA4, 0x6B, 0xC3, 0x66, 0xAE, 0xC8, /* 0x80-0x83 */ + 0xC2, 0x6F, 0xC4, 0x72, 0xBE, 0x5B, 0xC6, 0x7A, /* 0x84-0x87 */ + 0xC4, 0x52, 0xBE, 0xA4, 0xA4, 0x4F, 0xBE, 0xE4, /* 0x88-0x8B */ + 0xBE, 0xFA, 0xF7, 0x65, 0xA6, 0x7E, 0xBC, 0xA6, /* 0x8C-0x8F */ + 0xC5, 0xCA, 0xBC, 0xBF, 0xBA, 0xA7, 0xB7, 0xD2, /* 0x90-0x93 */ + 0xE6, 0xA3, 0x00, 0x00, 0xBD, 0x6D, 0xC1, 0x70, /* 0x94-0x97 */ + 0xBD, 0xFB, 0xBD, 0xAC, 0xB3, 0x73, 0xC1, 0xE5, /* 0x98-0x9B */ + 0xA6, 0x43, 0xA6, 0x48, 0xAB, 0x7C, 0xAF, 0x50, /* 0x9C-0x9F */ + 0xB5, 0xF5, 0xBB, 0xA1, 0xB7, 0x47, 0xA9, 0xC0, /* 0xA0-0xA3 */ + 0xB1, 0xC9, 0xC0, 0xD4, 0xC3, 0xAE, 0xC2, 0x79, /* 0xA4-0xA7 */ + 0xA5, 0x4F, 0xCB, 0xF1, 0xB9, 0xE7, 0xC0, 0xAD, /* 0xA8-0xAB */ + 0xCC, 0xB0, 0xAC, 0xC2, 0xBC, 0xFC, 0xB2, 0xDC, /* 0xAC-0xAF */ + 0xB2, 0xE2, 0xB9, 0x61, 0xB9, 0x73, 0xC6, 0x46, /* 0xB0-0xB3 */ + 0xBB, 0xE2, 0xA8, 0xD2, 0xC2, 0xA7, 0xC4, 0xBF, /* 0xB4-0xB7 */ + 0xC1, 0xF5, 0xB4, 0x63, 0xA4, 0x46, 0xB9, 0xB1, /* 0xB8-0xBB */ + 0xBC, 0x64, 0xA7, 0xBF, 0xAE, 0xC6, 0xBC, 0xD6, /* 0xBC-0xBF */ + 0xBF, 0x52, 0xC0, 0xF8, 0xE7, 0x64, 0xBF, 0xF1, /* 0xC0-0xC3 */ + 0xC0, 0x73, 0xB7, 0x77, 0xA8, 0xBF, 0xBC, 0x42, /* 0xC4-0xC7 */ + 0xCC, 0xD8, 0xAC, 0x68, 0xAC, 0x79, 0xB7, 0xC8, /* 0xC8-0xCB */ + 0xAF, 0x5B, 0xAF, 0x64, 0xB2, 0xB8, 0xAF, 0xC3, /* 0xCC-0xCF */ + 0xC3, 0xFE, 0xA4, 0xBB, 0xBC, 0xAE, 0xB3, 0xB0, /* 0xD0-0xD3 */ + 0xAD, 0xDB, 0xB1, 0x5B, 0xB2, 0x5F, 0xBD, 0xFC, /* 0xD4-0xD7 */ + 0xAB, 0xDF, 0xB7, 0x58, 0xAE, 0xDF, 0xB2, 0x76, /* 0xD8-0xDB */ + 0xB6, 0xA9, 0xA7, 0x51, 0xA6, 0x4F, 0xBC, 0x69, /* 0xDC-0xDF */ + 0xA9, 0xF6, 0xA7, 0xF5, 0xB1, 0xF9, 0xAA, 0x64, /* 0xE0-0xE3 */ + 0xB2, 0x7A, 0xB5, 0x67, 0xBF, 0xA9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB8, 0xCC, 0xA8, 0xBD, 0xC2, 0xF7, 0xB0, 0xCE, /* 0xE8-0xEB */ + 0xB7, 0xC4, 0xA7, 0x5B, 0xBF, 0x4D, 0xBF, 0x5A, /* 0xEC-0xEF */ + 0xC4, 0xA9, 0x00, 0x00, 0xC5, 0xEC, 0xC5, 0xEF, /* 0xF0-0xF3 */ + 0xAA, 0x4C, 0xB2, 0x4F, 0xC1, 0x7B, 0xA5, 0xDF, /* 0xF4-0xF7 */ + 0xB2, 0xC1, 0xB2, 0xC9, 0xAA, 0xAC, 0xAA, 0xA5, /* 0xF8-0xFB */ + 0xC3, 0xD1, 0xA4, 0xB0, 0xAF, 0xF9, 0xA8, 0xEB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xA4, 0xC1, 0xAB, 0xD7, 0xA9, 0xDD, 0xBF, 0x7D, /* 0x00-0x03 */ + 0xA6, 0x76, 0xAC, 0x7D, 0xBC, 0xC9, 0xBF, 0xE7, /* 0x04-0x07 */ + 0xA6, 0xE6, 0xAD, 0xB0, 0xA8, 0xA3, 0xB9, 0xF8, /* 0x08-0x0B */ + 0xC9, 0x4A, 0xDD, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xB6, 0xEF, 0x00, 0x00, 0xB4, 0xB8, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xF9, 0xBD, 0xDE, 0xAF, 0x71, /* 0x14-0x17 */ + 0x00, 0x00, 0xAF, 0xAB, 0xB2, 0xBB, 0xBA, 0xD6, /* 0x18-0x1B */ + 0xB9, 0x74, 0xBA, 0xEB, 0xA6, 0xD0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xD1, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xB6, 0x68, 0xB3, 0xA3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xBA, 0xB9, 0x7D, /* 0x28-0x2B */ + 0xC0, 0x5D, 0xC5, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FE[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0x4A, 0xA1, 0x57, 0x00, 0x00, 0xA1, 0x59, /* 0x30-0x33 */ + 0xA1, 0x5B, 0xA1, 0x5F, 0xA1, 0x60, 0xA1, 0x63, /* 0x34-0x37 */ + 0xA1, 0x64, 0xA1, 0x67, 0xA1, 0x68, 0xA1, 0x6B, /* 0x38-0x3B */ + 0xA1, 0x6C, 0xA1, 0x6F, 0xA1, 0x70, 0xA1, 0x73, /* 0x3C-0x3F */ + 0xA1, 0x74, 0xA1, 0x77, 0xA1, 0x78, 0xA1, 0x7B, /* 0x40-0x43 */ + 0xA1, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA1, 0xC6, 0xA1, 0xC7, 0xA1, 0xCA, /* 0x48-0x4B */ + 0xA1, 0xCB, 0xA1, 0xC8, 0xA1, 0xC9, 0xA1, 0x5C, /* 0x4C-0x4F */ + 0xA1, 0x4D, 0xA1, 0x4E, 0xA1, 0x4F, 0x00, 0x00, /* 0x50-0x53 */ + 0xA1, 0x51, 0xA1, 0x52, 0xA1, 0x53, 0xA1, 0x54, /* 0x54-0x57 */ + 0x00, 0x00, 0xA1, 0x7D, 0xA1, 0x7E, 0xA1, 0xA1, /* 0x58-0x5B */ + 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA4, 0xA1, 0xCC, /* 0x5C-0x5F */ + 0xA1, 0xCD, 0xA1, 0xCE, 0xA1, 0xDE, 0xA1, 0xDF, /* 0x60-0x63 */ + 0xA1, 0xE0, 0xA1, 0xE1, 0xA1, 0xE2, 0x00, 0x00, /* 0x64-0x67 */ + 0xA2, 0x42, 0xA2, 0x4C, 0xA2, 0x4D, 0xA2, 0x4E, /* 0x68-0x6B */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA1, 0x49, 0xA1, 0xA8, 0xA1, 0xAD, /* 0x00-0x03 */ + 0xA2, 0x43, 0xA2, 0x48, 0xA1, 0xAE, 0xA1, 0xA6, /* 0x04-0x07 */ + 0xA1, 0x5D, 0xA1, 0x5E, 0xA1, 0xAF, 0xA1, 0xCF, /* 0x08-0x0B */ + 0xA1, 0x41, 0xA1, 0xD0, 0xA1, 0x44, 0xA1, 0xFE, /* 0x0C-0x0F */ + 0xA2, 0xAF, 0xA2, 0xB0, 0xA2, 0xB1, 0xA2, 0xB2, /* 0x10-0x13 */ + 0xA2, 0xB3, 0xA2, 0xB4, 0xA2, 0xB5, 0xA2, 0xB6, /* 0x14-0x17 */ + 0xA2, 0xB7, 0xA2, 0xB8, 0xA1, 0x47, 0xA1, 0x46, /* 0x18-0x1B */ + 0xA1, 0xD5, 0xA1, 0xD7, 0xA1, 0xD6, 0xA1, 0x48, /* 0x1C-0x1F */ + 0xA2, 0x49, 0xA2, 0xCF, 0xA2, 0xD0, 0xA2, 0xD1, /* 0x20-0x23 */ + 0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, 0xA2, 0xD5, /* 0x24-0x27 */ + 0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, 0xA2, 0xD9, /* 0x28-0x2B */ + 0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, 0xA2, 0xDD, /* 0x2C-0x2F */ + 0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, 0xA2, 0xE1, /* 0x30-0x33 */ + 0xA2, 0xE2, 0xA2, 0xE3, 0xA2, 0xE4, 0xA2, 0xE5, /* 0x34-0x37 */ + 0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, 0xA1, 0x65, /* 0x38-0x3B */ + 0xA2, 0x40, 0xA1, 0x66, 0xA1, 0x73, 0xA1, 0xC4, /* 0x3C-0x3F */ + 0xA1, 0xA5, 0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, /* 0x40-0x43 */ + 0xA2, 0xEC, 0xA2, 0xED, 0xA2, 0xEE, 0xA2, 0xEF, /* 0x44-0x47 */ + 0xA2, 0xF0, 0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, /* 0x48-0x4B */ + 0xA2, 0xF4, 0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, /* 0x4C-0x4F */ + 0xA2, 0xF8, 0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, /* 0x50-0x53 */ + 0xA2, 0xFC, 0xA2, 0xFD, 0xA2, 0xFE, 0xA3, 0x40, /* 0x54-0x57 */ + 0xA3, 0x41, 0xA3, 0x42, 0xA3, 0x43, 0xA1, 0x61, /* 0x58-0x5B */ + 0xA1, 0x55, 0xA1, 0x62, 0xA1, 0xE3, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA2, 0x46, 0xA2, 0x47, 0x00, 0x00, 0xA1, 0xC3, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA2, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, NULL, u2c_02, u2c_03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, NULL, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, u2c_FE, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen <= 1) + return -ENAMETOOLONG; + out[0] = uni2charset[cl*2]; + out[1] = uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + n = 2; + } else if (ch==0 && cl) { + out[0] = cl; + n = 1; + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (boundlen == 1) { + *uni = rawstring[0]; + return 1; + } + + ch = rawstring[0]; + cl = rawstring[1]; + + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + n = 2; + } else{ + *uni = ch; + n = 1; + } + return n; +} + +static struct nls_table table = { + .charset = "cp950", + .alias = "big5", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_cp950(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp950(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp950) +module_exit(exit_nls_cp950) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(big5); diff --git a/kernel/fs/nls/nls_euc-jp.c b/kernel/fs/nls/nls_euc-jp.c new file mode 100644 index 000000000..162b3f160 --- /dev/null +++ b/kernel/fs/nls/nls_euc-jp.c @@ -0,0 +1,580 @@ +/* + * linux/fs/nls/nls_euc-jp.c + * + * Added `OSF/JVC Recommended Code Set Conversion Specification + * between Japanese EUC and Shift-JIS' support: + * (http://www.opengroup.or.jp/jvc/cde/sjis-euc-e.html) + */ + +#include +#include +#include +#include +#include + +static struct nls_table *p_nls; + +#define IS_SJIS_LOW_BYTE(l) ((0x40 <= (l)) && ((l) <= 0xFC) && ((l) != 0x7F)) +/* JIS X 0208 (include NEC spesial characters) */ +#define IS_SJIS_JISX0208(h, l) ((((0x81 <= (h)) && ((h) <= 0x9F)) \ + || ((0xE0 <= (h)) && ((h) <= 0xEA))) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_JISX0201KANA(c) ((0xA1 <= (c)) && ((c) <= 0xDF)) +#define IS_SJIS_UDC_LOW(h, l) (((0xF0 <= (h)) && ((h) <= 0xF4)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_UDC_HI(h, l) (((0xF5 <= (h)) && ((h) <= 0xF9)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_IBM(h, l) (((0xFA <= (h)) && ((h) <= 0xFC)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_NECIBM(h, l) (((0xED <= (h)) && ((h) <= 0xEE)) \ + && IS_SJIS_LOW_BYTE(l)) +#define MAP_SJIS2EUC(sjis_hi, sjis_lo, sjis_p, euc_hi, euc_lo, euc_p) { \ + if ((sjis_lo) >= 0x9F) { \ + (euc_hi) = (sjis_hi) * 2 - (((sjis_p) * 2 - (euc_p)) - 1); \ + (euc_lo) = (sjis_lo) + 2; \ + } else { \ + (euc_hi) = (sjis_hi) * 2 - ((sjis_p) * 2 - (euc_p)); \ + (euc_lo) = (sjis_lo) + ((sjis_lo) >= 0x7F ? 0x60 : 0x61); \ + } \ +} while(0) + +#define SS2 (0x8E) /* Single Shift 2 */ +#define SS3 (0x8F) /* Single Shift 3 */ +#define IS_EUC_BYTE(c) ((0xA1 <= (c)) && ((c) <= 0xFE)) +#define IS_EUC_JISX0208(h, l) (IS_EUC_BYTE(h) && IS_EUC_BYTE(l)) +#define IS_EUC_JISX0201KANA(h, l) (((h) == SS2) && (0xA1 <= (l) && (l) <= 0xDF)) +#define IS_EUC_UDC_LOW(h, l) (((0xF5 <= (h)) && ((h) <= 0xFE)) \ + && IS_EUC_BYTE(l)) +#define IS_EUC_UDC_HI(h, l) IS_EUC_UDC_LOW(h, l) /* G3 block */ +#define MAP_EUC2SJIS(euc_hi, euc_lo, euc_p, sjis_hi, sjis_lo, sjis_p) { \ + if ((euc_hi) & 1) { \ + (sjis_hi) = (euc_hi) / 2 + ((sjis_p) - (euc_p) / 2); \ + (sjis_lo) = (euc_lo) - ((euc_lo) >= 0xE0 ? 0x60 : 0x61); \ + } else { \ + (sjis_hi) = (euc_hi) / 2 + (((sjis_p) - (euc_p) / 2) - 1); \ + (sjis_lo) = (euc_lo) - 2; \ + } \ +} while(0) + +/* SJIS IBM extended characters to EUC map */ +static const unsigned char sjisibm2euc_map[][2] = { + {0xF3, 0xF3}, {0xF3, 0xF4}, {0xF3, 0xF5}, {0xF3, 0xF6}, {0xF3, 0xF7}, + {0xF3, 0xF8}, {0xF3, 0xF9}, {0xF3, 0xFA}, {0xF3, 0xFB}, {0xF3, 0xFC}, + {0xF3, 0xFD}, {0xF3, 0xFE}, {0xF4, 0xA1}, {0xF4, 0xA2}, {0xF4, 0xA3}, + {0xF4, 0xA4}, {0xF4, 0xA5}, {0xF4, 0xA6}, {0xF4, 0xA7}, {0xF4, 0xA8}, + {0xA2, 0xCC}, {0xA2, 0xC3}, {0xF4, 0xA9}, {0xF4, 0xAA}, {0xF4, 0xAB}, + {0xF4, 0xAC}, {0xF4, 0xAD}, {0xA2, 0xE8}, {0xD4, 0xE3}, {0xDC, 0xDF}, + {0xE4, 0xE9}, {0xE3, 0xF8}, {0xD9, 0xA1}, {0xB1, 0xBB}, {0xF4, 0xAE}, + {0xC2, 0xAD}, {0xC3, 0xFC}, {0xE4, 0xD0}, {0xC2, 0xBF}, {0xBC, 0xF4}, + {0xB0, 0xA9}, {0xB0, 0xC8}, {0xF4, 0xAF}, {0xB0, 0xD2}, {0xB0, 0xD4}, + {0xB0, 0xE3}, {0xB0, 0xEE}, {0xB1, 0xA7}, {0xB1, 0xA3}, {0xB1, 0xAC}, + {0xB1, 0xA9}, {0xB1, 0xBE}, {0xB1, 0xDF}, {0xB1, 0xD8}, {0xB1, 0xC8}, + {0xB1, 0xD7}, {0xB1, 0xE3}, {0xB1, 0xF4}, {0xB1, 0xE1}, {0xB2, 0xA3}, + {0xF4, 0xB0}, {0xB2, 0xBB}, {0xB2, 0xE6}, {0x00, 0x00}, {0xB2, 0xED}, + {0xB2, 0xF5}, {0xB2, 0xFC}, {0xF4, 0xB1}, {0xB3, 0xB5}, {0xB3, 0xD8}, + {0xB3, 0xDB}, {0xB3, 0xE5}, {0xB3, 0xEE}, {0xB3, 0xFB}, {0xF4, 0xB2}, + {0xF4, 0xB3}, {0xB4, 0xC0}, {0xB4, 0xC7}, {0xB4, 0xD0}, {0xB4, 0xDE}, + {0xF4, 0xB4}, {0xB5, 0xAA}, {0xF4, 0xB5}, {0xB5, 0xAF}, {0xB5, 0xC4}, + {0xB5, 0xE8}, {0xF4, 0xB6}, {0xB7, 0xC2}, {0xB7, 0xE4}, {0xB7, 0xE8}, + {0xB7, 0xE7}, {0xF4, 0xB7}, {0xF4, 0xB8}, {0xF4, 0xB9}, {0xB8, 0xCE}, + {0xB8, 0xE1}, {0xB8, 0xF5}, {0xB8, 0xF7}, {0xB8, 0xF8}, {0xB8, 0xFC}, + {0xB9, 0xAF}, {0xB9, 0xB7}, {0xBA, 0xBE}, {0xBA, 0xDB}, {0xCD, 0xAA}, + {0xBA, 0xE1}, {0xF4, 0xBA}, {0xBA, 0xEB}, {0xBB, 0xB3}, {0xBB, 0xB8}, + {0xF4, 0xBB}, {0xBB, 0xCA}, {0xF4, 0xBC}, {0xF4, 0xBD}, {0xBB, 0xD0}, + {0xBB, 0xDE}, {0xBB, 0xF4}, {0xBB, 0xF5}, {0xBB, 0xF9}, {0xBC, 0xE4}, + {0xBC, 0xED}, {0xBC, 0xFE}, {0xF4, 0xBE}, {0xBD, 0xC2}, {0xBD, 0xE7}, + {0xF4, 0xBF}, {0xBD, 0xF0}, {0xBE, 0xB0}, {0xBE, 0xAC}, {0xF4, 0xC0}, + {0xBE, 0xB3}, {0xBE, 0xBD}, {0xBE, 0xCD}, {0xBE, 0xC9}, {0xBE, 0xE4}, + {0xBF, 0xA8}, {0xBF, 0xC9}, {0xC0, 0xC4}, {0xC0, 0xE4}, {0xC0, 0xF4}, + {0xC1, 0xA6}, {0xF4, 0xC1}, {0xC1, 0xF5}, {0xC1, 0xFC}, {0xF4, 0xC2}, + {0xC1, 0xF8}, {0xC2, 0xAB}, {0xC2, 0xA1}, {0xC2, 0xA5}, {0xF4, 0xC3}, + {0xC2, 0xB8}, {0xC2, 0xBA}, {0xF4, 0xC4}, {0xC2, 0xC4}, {0xC2, 0xD2}, + {0xC2, 0xD7}, {0xC2, 0xDB}, {0xC2, 0xDE}, {0xC2, 0xED}, {0xC2, 0xF0}, + {0xF4, 0xC5}, {0xC3, 0xA1}, {0xC3, 0xB5}, {0xC3, 0xC9}, {0xC3, 0xB9}, + {0xF4, 0xC6}, {0xC3, 0xD8}, {0xC3, 0xFE}, {0xF4, 0xC7}, {0xC4, 0xCC}, + {0xF4, 0xC8}, {0xC4, 0xD9}, {0xC4, 0xEA}, {0xC4, 0xFD}, {0xF4, 0xC9}, + {0xC5, 0xA7}, {0xC5, 0xB5}, {0xC5, 0xB6}, {0xF4, 0xCA}, {0xC5, 0xD5}, + {0xC6, 0xB8}, {0xC6, 0xD7}, {0xC6, 0xE0}, {0xC6, 0xEA}, {0xC6, 0xE3}, + {0xC7, 0xA1}, {0xC7, 0xAB}, {0xC7, 0xC7}, {0xC7, 0xC3}, {0xC7, 0xCB}, + {0xC7, 0xCF}, {0xC7, 0xD9}, {0xF4, 0xCB}, {0xF4, 0xCC}, {0xC7, 0xE6}, + {0xC7, 0xEE}, {0xC7, 0xFC}, {0xC7, 0xEB}, {0xC7, 0xF0}, {0xC8, 0xB1}, + {0xC8, 0xE5}, {0xC8, 0xF8}, {0xC9, 0xA6}, {0xC9, 0xAB}, {0xC9, 0xAD}, + {0xF4, 0xCD}, {0xC9, 0xCA}, {0xC9, 0xD3}, {0xC9, 0xE9}, {0xC9, 0xE3}, + {0xC9, 0xFC}, {0xC9, 0xF4}, {0xC9, 0xF5}, {0xF4, 0xCE}, {0xCA, 0xB3}, + {0xCA, 0xBD}, {0xCA, 0xEF}, {0xCA, 0xF1}, {0xCB, 0xAE}, {0xF4, 0xCF}, + {0xCB, 0xCA}, {0xCB, 0xE6}, {0xCB, 0xEA}, {0xCB, 0xF0}, {0xCB, 0xF4}, + {0xCB, 0xEE}, {0xCC, 0xA5}, {0xCB, 0xF9}, {0xCC, 0xAB}, {0xCC, 0xAE}, + {0xCC, 0xAD}, {0xCC, 0xB2}, {0xCC, 0xC2}, {0xCC, 0xD0}, {0xCC, 0xD9}, + {0xF4, 0xD0}, {0xCD, 0xBB}, {0xF4, 0xD1}, {0xCE, 0xBB}, {0xF4, 0xD2}, + {0xCE, 0xBA}, {0xCE, 0xC3}, {0xF4, 0xD3}, {0xCE, 0xF2}, {0xB3, 0xDD}, + {0xCF, 0xD5}, {0xCF, 0xE2}, {0xCF, 0xE9}, {0xCF, 0xED}, {0xF4, 0xD4}, + {0xF4, 0xD5}, {0xF4, 0xD6}, {0x00, 0x00}, {0xF4, 0xD7}, {0xD0, 0xE5}, + {0xF4, 0xD8}, {0xD0, 0xE9}, {0xD1, 0xE8}, {0xF4, 0xD9}, {0xF4, 0xDA}, + {0xD1, 0xEC}, {0xD2, 0xBB}, {0xF4, 0xDB}, {0xD3, 0xE1}, {0xD3, 0xE8}, + {0xD4, 0xA7}, {0xF4, 0xDC}, {0xF4, 0xDD}, {0xD4, 0xD4}, {0xD4, 0xF2}, + {0xD5, 0xAE}, {0xF4, 0xDE}, {0xD7, 0xDE}, {0xF4, 0xDF}, {0xD8, 0xA2}, + {0xD8, 0xB7}, {0xD8, 0xC1}, {0xD8, 0xD1}, {0xD8, 0xF4}, {0xD9, 0xC6}, + {0xD9, 0xC8}, {0xD9, 0xD1}, {0xF4, 0xE0}, {0xF4, 0xE1}, {0xF4, 0xE2}, + {0xF4, 0xE3}, {0xF4, 0xE4}, {0xDC, 0xD3}, {0xDD, 0xC8}, {0xDD, 0xD4}, + {0xDD, 0xEA}, {0xDD, 0xFA}, {0xDE, 0xA4}, {0xDE, 0xB0}, {0xF4, 0xE5}, + {0xDE, 0xB5}, {0xDE, 0xCB}, {0xF4, 0xE6}, {0xDF, 0xB9}, {0xF4, 0xE7}, + {0xDF, 0xC3}, {0xF4, 0xE8}, {0xF4, 0xE9}, {0xE0, 0xD9}, {0xF4, 0xEA}, + {0xF4, 0xEB}, {0xE1, 0xE2}, {0xF4, 0xEC}, {0xF4, 0xED}, {0xF4, 0xEE}, + {0xE2, 0xC7}, {0xE3, 0xA8}, {0xE3, 0xA6}, {0xE3, 0xA9}, {0xE3, 0xAF}, + {0xE3, 0xB0}, {0xE3, 0xAA}, {0xE3, 0xAB}, {0xE3, 0xBC}, {0xE3, 0xC1}, + {0xE3, 0xBF}, {0xE3, 0xD5}, {0xE3, 0xD8}, {0xE3, 0xD6}, {0xE3, 0xDF}, + {0xE3, 0xE3}, {0xE3, 0xE1}, {0xE3, 0xD4}, {0xE3, 0xE9}, {0xE4, 0xA6}, + {0xE3, 0xF1}, {0xE3, 0xF2}, {0xE4, 0xCB}, {0xE4, 0xC1}, {0xE4, 0xC3}, + {0xE4, 0xBE}, {0xF4, 0xEF}, {0xE4, 0xC0}, {0xE4, 0xC7}, {0xE4, 0xBF}, + {0xE4, 0xE0}, {0xE4, 0xDE}, {0xE4, 0xD1}, {0xF4, 0xF0}, {0xE4, 0xDC}, + {0xE4, 0xD2}, {0xE4, 0xDB}, {0xE4, 0xD4}, {0xE4, 0xFA}, {0xE4, 0xEF}, + {0xE5, 0xB3}, {0xE5, 0xBF}, {0xE5, 0xC9}, {0xE5, 0xD0}, {0xE5, 0xE2}, + {0xE5, 0xEA}, {0xE5, 0xEB}, {0xF4, 0xF1}, {0xF4, 0xF2}, {0xF4, 0xF3}, + {0xE6, 0xE8}, {0xE6, 0xEF}, {0xE7, 0xAC}, {0xF4, 0xF4}, {0xE7, 0xAE}, + {0xF4, 0xF5}, {0xE7, 0xB1}, {0xF4, 0xF6}, {0xE7, 0xB2}, {0xE8, 0xB1}, + {0xE8, 0xB6}, {0xF4, 0xF7}, {0xF4, 0xF8}, {0xE8, 0xDD}, {0xF4, 0xF9}, + {0xF4, 0xFA}, {0xE9, 0xD1}, {0xF4, 0xFB}, {0xE9, 0xED}, {0xEA, 0xCD}, + {0xF4, 0xFC}, {0xEA, 0xDB}, {0xEA, 0xE6}, {0xEA, 0xEA}, {0xEB, 0xA5}, + {0xEB, 0xFB}, {0xEB, 0xFA}, {0xF4, 0xFD}, {0xEC, 0xD6}, {0xF4, 0xFE}, +}; + +#define IS_EUC_IBM2JISX0208(h, l) \ + (((h) == 0xA2 && (l) == 0xCC) || ((h) == 0xA2 && (l) == 0xE8)) + +/* EUC to SJIS IBM extended characters map (G3 JIS X 0212 block) */ +static struct { + unsigned short euc; + unsigned char sjis[2]; +} euc2sjisibm_jisx0212_map[] = { + {0xA2C3, {0xFA, 0x55}}, {0xB0A9, {0xFA, 0x68}}, {0xB0C8, {0xFA, 0x69}}, + {0xB0D2, {0xFA, 0x6B}}, {0xB0D4, {0xFA, 0x6C}}, {0xB0E3, {0xFA, 0x6D}}, + {0xB0EE, {0xFA, 0x6E}}, {0xB1A3, {0xFA, 0x70}}, {0xB1A7, {0xFA, 0x6F}}, + {0xB1A9, {0xFA, 0x72}}, {0xB1AC, {0xFA, 0x71}}, {0xB1BB, {0xFA, 0x61}}, + {0xB1BE, {0xFA, 0x73}}, {0xB1C8, {0xFA, 0x76}}, {0xB1D7, {0xFA, 0x77}}, + {0xB1D8, {0xFA, 0x75}}, {0xB1DF, {0xFA, 0x74}}, {0xB1E1, {0xFA, 0x7A}}, + {0xB1E3, {0xFA, 0x78}}, {0xB1F4, {0xFA, 0x79}}, {0xB2A3, {0xFA, 0x7B}}, + {0xB2BB, {0xFA, 0x7D}}, {0xB2E6, {0xFA, 0x7E}}, {0xB2ED, {0xFA, 0x80}}, + {0xB2F5, {0xFA, 0x81}}, {0xB2FC, {0xFA, 0x82}}, {0xB3B5, {0xFA, 0x84}}, + {0xB3D8, {0xFA, 0x85}}, {0xB3DB, {0xFA, 0x86}}, {0xB3DD, {0xFB, 0x77}}, + {0xB3E5, {0xFA, 0x87}}, {0xB3EE, {0xFA, 0x88}}, {0xB3FB, {0xFA, 0x89}}, + {0xB4C0, {0xFA, 0x8C}}, {0xB4C7, {0xFA, 0x8D}}, {0xB4D0, {0xFA, 0x8E}}, + {0xB4DE, {0xFA, 0x8F}}, {0xB5AA, {0xFA, 0x91}}, {0xB5AF, {0xFA, 0x93}}, + {0xB5C4, {0xFA, 0x94}}, {0xB5E8, {0xFA, 0x95}}, {0xB7C2, {0xFA, 0x97}}, + {0xB7E4, {0xFA, 0x98}}, {0xB7E7, {0xFA, 0x9A}}, {0xB7E8, {0xFA, 0x99}}, + {0xB8CE, {0xFA, 0x9E}}, {0xB8E1, {0xFA, 0x9F}}, {0xB8F5, {0xFA, 0xA0}}, + {0xB8F7, {0xFA, 0xA1}}, {0xB8F8, {0xFA, 0xA2}}, {0xB8FC, {0xFA, 0xA3}}, + {0xB9AF, {0xFA, 0xA4}}, {0xB9B7, {0xFA, 0xA5}}, {0xBABE, {0xFA, 0xA6}}, + {0xBADB, {0xFA, 0xA7}}, {0xBAE1, {0xFA, 0xA9}}, {0xBAEB, {0xFA, 0xAB}}, + {0xBBB3, {0xFA, 0xAC}}, {0xBBB8, {0xFA, 0xAD}}, {0xBBCA, {0xFA, 0xAF}}, + {0xBBD0, {0xFA, 0xB2}}, {0xBBDE, {0xFA, 0xB3}}, {0xBBF4, {0xFA, 0xB4}}, + {0xBBF5, {0xFA, 0xB5}}, {0xBBF9, {0xFA, 0xB6}}, {0xBCE4, {0xFA, 0xB7}}, + {0xBCED, {0xFA, 0xB8}}, {0xBCF4, {0xFA, 0x67}}, {0xBCFE, {0xFA, 0xB9}}, + {0xBDC2, {0xFA, 0xBB}}, {0xBDE7, {0xFA, 0xBC}}, {0xBDF0, {0xFA, 0xBE}}, + {0xBEAC, {0xFA, 0xC0}}, {0xBEB0, {0xFA, 0xBF}}, {0xBEB3, {0xFA, 0xC2}}, + {0xBEBD, {0xFA, 0xC3}}, {0xBEC9, {0xFA, 0xC5}}, {0xBECD, {0xFA, 0xC4}}, + {0xBEE4, {0xFA, 0xC6}}, {0xBFA8, {0xFA, 0xC7}}, {0xBFC9, {0xFA, 0xC8}}, + {0xC0C4, {0xFA, 0xC9}}, {0xC0E4, {0xFA, 0xCA}}, {0xC0F4, {0xFA, 0xCB}}, + {0xC1A6, {0xFA, 0xCC}}, {0xC1F5, {0xFA, 0xCE}}, {0xC1F8, {0xFA, 0xD1}}, + {0xC1FC, {0xFA, 0xCF}}, {0xC2A1, {0xFA, 0xD3}}, {0xC2A5, {0xFA, 0xD4}}, + {0xC2AB, {0xFA, 0xD2}}, {0xC2AD, {0xFA, 0x63}}, {0xC2B8, {0xFA, 0xD6}}, + {0xC2BA, {0xFA, 0xD7}}, {0xC2BF, {0xFA, 0x66}}, {0xC2C4, {0xFA, 0xD9}}, + {0xC2D2, {0xFA, 0xDA}}, {0xC2D7, {0xFA, 0xDB}}, {0xC2DB, {0xFA, 0xDC}}, + {0xC2DE, {0xFA, 0xDD}}, {0xC2ED, {0xFA, 0xDE}}, {0xC2F0, {0xFA, 0xDF}}, + {0xC3A1, {0xFA, 0xE1}}, {0xC3B5, {0xFA, 0xE2}}, {0xC3B9, {0xFA, 0xE4}}, + {0xC3C9, {0xFA, 0xE3}}, {0xC3D8, {0xFA, 0xE6}}, {0xC3FC, {0xFA, 0x64}}, + {0xC3FE, {0xFA, 0xE7}}, {0xC4CC, {0xFA, 0xE9}}, {0xC4D9, {0xFA, 0xEB}}, + {0xC4EA, {0xFA, 0xEC}}, {0xC4FD, {0xFA, 0xED}}, {0xC5A7, {0xFA, 0xEF}}, + {0xC5B5, {0xFA, 0xF0}}, {0xC5B6, {0xFA, 0xF1}}, {0xC5D5, {0xFA, 0xF3}}, + {0xC6B8, {0xFA, 0xF4}}, {0xC6D7, {0xFA, 0xF5}}, {0xC6E0, {0xFA, 0xF6}}, + {0xC6E3, {0xFA, 0xF8}}, {0xC6EA, {0xFA, 0xF7}}, {0xC7A1, {0xFA, 0xF9}}, + {0xC7AB, {0xFA, 0xFA}}, {0xC7C3, {0xFA, 0xFC}}, {0xC7C7, {0xFA, 0xFB}}, + {0xC7CB, {0xFB, 0x40}}, {0xC7CF, {0xFB, 0x41}}, {0xC7D9, {0xFB, 0x42}}, + {0xC7E6, {0xFB, 0x45}}, {0xC7EB, {0xFB, 0x48}}, {0xC7EE, {0xFB, 0x46}}, + {0xC7F0, {0xFB, 0x49}}, {0xC7FC, {0xFB, 0x47}}, {0xC8B1, {0xFB, 0x4A}}, + {0xC8E5, {0xFB, 0x4B}}, {0xC8F8, {0xFB, 0x4C}}, {0xC9A6, {0xFB, 0x4D}}, + {0xC9AB, {0xFB, 0x4E}}, {0xC9AD, {0xFB, 0x4F}}, {0xC9CA, {0xFB, 0x51}}, + {0xC9D3, {0xFB, 0x52}}, {0xC9E3, {0xFB, 0x54}}, {0xC9E9, {0xFB, 0x53}}, + {0xC9F4, {0xFB, 0x56}}, {0xC9F5, {0xFB, 0x57}}, {0xC9FC, {0xFB, 0x55}}, + {0xCAB3, {0xFB, 0x59}}, {0xCABD, {0xFB, 0x5A}}, {0xCAEF, {0xFB, 0x5B}}, + {0xCAF1, {0xFB, 0x5C}}, {0xCBAE, {0xFB, 0x5D}}, {0xCBCA, {0xFB, 0x5F}}, + {0xCBE6, {0xFB, 0x60}}, {0xCBEA, {0xFB, 0x61}}, {0xCBEE, {0xFB, 0x64}}, + {0xCBF0, {0xFB, 0x62}}, {0xCBF4, {0xFB, 0x63}}, {0xCBF9, {0xFB, 0x66}}, + {0xCCA5, {0xFB, 0x65}}, {0xCCAB, {0xFB, 0x67}}, {0xCCAD, {0xFB, 0x69}}, + {0xCCAE, {0xFB, 0x68}}, {0xCCB2, {0xFB, 0x6A}}, {0xCCC2, {0xFB, 0x6B}}, + {0xCCD0, {0xFB, 0x6C}}, {0xCCD9, {0xFB, 0x6D}}, {0xCDAA, {0xFA, 0xA8}}, + {0xCDBB, {0xFB, 0x6F}}, {0xCEBA, {0xFB, 0x73}}, {0xCEBB, {0xFB, 0x71}}, + {0xCEC3, {0xFB, 0x74}}, {0xCEF2, {0xFB, 0x76}}, {0xCFD5, {0xFB, 0x78}}, + {0xCFE2, {0xFB, 0x79}}, {0xCFE9, {0xFB, 0x7A}}, {0xCFED, {0xFB, 0x7B}}, + {0xD0E5, {0xFB, 0x81}}, {0xD0E9, {0xFB, 0x83}}, {0xD1E8, {0xFB, 0x84}}, + {0xD1EC, {0xFB, 0x87}}, {0xD2BB, {0xFB, 0x88}}, {0xD3E1, {0xFB, 0x8A}}, + {0xD3E8, {0xFB, 0x8B}}, {0xD4A7, {0xFB, 0x8C}}, {0xD4D4, {0xFB, 0x8F}}, + {0xD4E3, {0xFA, 0x5C}}, {0xD4F2, {0xFB, 0x90}}, {0xD5AE, {0xFB, 0x91}}, + {0xD7DE, {0xFB, 0x93}}, {0xD8A2, {0xFB, 0x95}}, {0xD8B7, {0xFB, 0x96}}, + {0xD8C1, {0xFB, 0x97}}, {0xD8D1, {0xFB, 0x98}}, {0xD8F4, {0xFB, 0x99}}, + {0xD9A1, {0xFA, 0x60}}, {0xD9C6, {0xFB, 0x9A}}, {0xD9C8, {0xFB, 0x9B}}, + {0xD9D1, {0xFB, 0x9C}}, {0xDCD3, {0xFB, 0xA2}}, {0xDCDF, {0xFA, 0x5D}}, + {0xDDC8, {0xFB, 0xA3}}, {0xDDD4, {0xFB, 0xA4}}, {0xDDEA, {0xFB, 0xA5}}, + {0xDDFA, {0xFB, 0xA6}}, {0xDEA4, {0xFB, 0xA7}}, {0xDEB0, {0xFB, 0xA8}}, + {0xDEB5, {0xFB, 0xAA}}, {0xDECB, {0xFB, 0xAB}}, {0xDFB9, {0xFB, 0xAD}}, + {0xDFC3, {0xFB, 0xAF}}, {0xE0D9, {0xFB, 0xB2}}, {0xE1E2, {0xFB, 0xB5}}, + {0xE2C7, {0xFB, 0xB9}}, {0xE3A6, {0xFB, 0xBB}}, {0xE3A8, {0xFB, 0xBA}}, + {0xE3A9, {0xFB, 0xBC}}, {0xE3AA, {0xFB, 0xBF}}, {0xE3AB, {0xFB, 0xC0}}, + {0xE3AF, {0xFB, 0xBD}}, {0xE3B0, {0xFB, 0xBE}}, {0xE3BC, {0xFB, 0xC1}}, + {0xE3BF, {0xFB, 0xC3}}, {0xE3C1, {0xFB, 0xC2}}, {0xE3D4, {0xFB, 0xCA}}, + {0xE3D5, {0xFB, 0xC4}}, {0xE3D6, {0xFB, 0xC6}}, {0xE3D8, {0xFB, 0xC5}}, + {0xE3DF, {0xFB, 0xC7}}, {0xE3E1, {0xFB, 0xC9}}, {0xE3E3, {0xFB, 0xC8}}, + {0xE3E9, {0xFB, 0xCB}}, {0xE3F1, {0xFB, 0xCD}}, {0xE3F2, {0xFB, 0xCE}}, + {0xE3F8, {0xFA, 0x5F}}, {0xE4A6, {0xFB, 0xCC}}, {0xE4BE, {0xFB, 0xD2}}, + {0xE4BF, {0xFB, 0xD6}}, {0xE4C0, {0xFB, 0xD4}}, {0xE4C1, {0xFB, 0xD0}}, + {0xE4C3, {0xFB, 0xD1}}, {0xE4C7, {0xFB, 0xD5}}, {0xE4CB, {0xFB, 0xCF}}, + {0xE4D0, {0xFA, 0x65}}, {0xE4D1, {0xFB, 0xD9}}, {0xE4D2, {0xFB, 0xDC}}, + {0xE4D4, {0xFB, 0xDE}}, {0xE4DB, {0xFB, 0xDD}}, {0xE4DC, {0xFB, 0xDB}}, + {0xE4DE, {0xFB, 0xD8}}, {0xE4E0, {0xFB, 0xD7}}, {0xE4E9, {0xFA, 0x5E}}, + {0xE4EF, {0xFB, 0xE0}}, {0xE4FA, {0xFB, 0xDF}}, {0xE5B3, {0xFB, 0xE1}}, + {0xE5BF, {0xFB, 0xE2}}, {0xE5C9, {0xFB, 0xE3}}, {0xE5D0, {0xFB, 0xE4}}, + {0xE5E2, {0xFB, 0xE5}}, {0xE5EA, {0xFB, 0xE6}}, {0xE5EB, {0xFB, 0xE7}}, + {0xE6E8, {0xFB, 0xEB}}, {0xE6EF, {0xFB, 0xEC}}, {0xE7AC, {0xFB, 0xED}}, + {0xE7AE, {0xFB, 0xEF}}, {0xE7B1, {0xFB, 0xF1}}, {0xE7B2, {0xFB, 0xF3}}, + {0xE8B1, {0xFB, 0xF4}}, {0xE8B6, {0xFB, 0xF5}}, {0xE8DD, {0xFB, 0xF8}}, + {0xE9D1, {0xFB, 0xFB}}, {0xE9ED, {0xFC, 0x40}}, {0xEACD, {0xFC, 0x41}}, + {0xEADB, {0xFC, 0x43}}, {0xEAE6, {0xFC, 0x44}}, {0xEAEA, {0xFC, 0x45}}, + {0xEBA5, {0xFC, 0x46}}, {0xEBFA, {0xFC, 0x48}}, {0xEBFB, {0xFC, 0x47}}, + {0xECD6, {0xFC, 0x4A}}, +}; + +/* EUC to SJIS IBM extended characters map (G3 Upper block) */ +static const unsigned char euc2sjisibm_g3upper_map[][2] = { + {0xFA, 0x40}, {0xFA, 0x41}, {0xFA, 0x42}, {0xFA, 0x43}, {0xFA, 0x44}, + {0xFA, 0x45}, {0xFA, 0x46}, {0xFA, 0x47}, {0xFA, 0x48}, {0xFA, 0x49}, + {0xFA, 0x4A}, {0xFA, 0x4B}, {0xFA, 0x4C}, {0xFA, 0x4D}, {0xFA, 0x4E}, + {0xFA, 0x4F}, {0xFA, 0x50}, {0xFA, 0x51}, {0xFA, 0x52}, {0xFA, 0x53}, + {0xFA, 0x56}, {0xFA, 0x57}, {0xFA, 0x58}, {0xFA, 0x59}, {0xFA, 0x5A}, + {0xFA, 0x62}, {0xFA, 0x6A}, {0xFA, 0x7C}, {0xFA, 0x83}, {0xFA, 0x8A}, + {0xFA, 0x8B}, {0xFA, 0x90}, {0xFA, 0x92}, {0xFA, 0x96}, {0xFA, 0x9B}, + {0xFA, 0x9C}, {0xFA, 0x9D}, {0xFA, 0xAA}, {0xFA, 0xAE}, {0xFA, 0xB0}, + {0xFA, 0xB1}, {0xFA, 0xBA}, {0xFA, 0xBD}, {0xFA, 0xC1}, {0xFA, 0xCD}, + {0xFA, 0xD0}, {0xFA, 0xD5}, {0xFA, 0xD8}, {0xFA, 0xE0}, {0xFA, 0xE5}, + {0xFA, 0xE8}, {0xFA, 0xEA}, {0xFA, 0xEE}, {0xFA, 0xF2}, {0xFB, 0x43}, + {0xFB, 0x44}, {0xFB, 0x50}, {0xFB, 0x58}, {0xFB, 0x5E}, {0xFB, 0x6E}, + {0xFB, 0x70}, {0xFB, 0x72}, {0xFB, 0x75}, {0xFB, 0x7C}, {0xFB, 0x7D}, + {0xFB, 0x7E}, {0xFB, 0x80}, {0xFB, 0x82}, {0xFB, 0x85}, {0xFB, 0x86}, + {0xFB, 0x89}, {0xFB, 0x8D}, {0xFB, 0x8E}, {0xFB, 0x92}, {0xFB, 0x94}, + {0xFB, 0x9D}, {0xFB, 0x9E}, {0xFB, 0x9F}, {0xFB, 0xA0}, {0xFB, 0xA1}, + {0xFB, 0xA9}, {0xFB, 0xAC}, {0xFB, 0xAE}, {0xFB, 0xB0}, {0xFB, 0xB1}, + {0xFB, 0xB3}, {0xFB, 0xB4}, {0xFB, 0xB6}, {0xFB, 0xB7}, {0xFB, 0xB8}, + {0xFB, 0xD3}, {0xFB, 0xDA}, {0xFB, 0xE8}, {0xFB, 0xE9}, {0xFB, 0xEA}, + {0xFB, 0xEE}, {0xFB, 0xF0}, {0xFB, 0xF2}, {0xFB, 0xF6}, {0xFB, 0xF7}, + {0xFB, 0xF9}, {0xFB, 0xFA}, {0xFB, 0xFC}, {0xFC, 0x42}, {0xFC, 0x49}, + {0xFC, 0x4B}, +}; + +static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, + const unsigned char sjis_lo); +static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int sjisnec2sjisibm(unsigned char *sjisibm, + const unsigned char sjisnec_hi, + const unsigned char sjisnec_lo); + +/* SJIS IBM extended characters to EUC */ +static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, + const unsigned char sjis_lo) +{ + int index; + + index = ((sjis_hi - 0xFA) * (0xFD - 0x40)) + (sjis_lo - 0x40); + if (IS_EUC_IBM2JISX0208(sjisibm2euc_map[index][0], + sjisibm2euc_map[index][1])) { + euc[0] = sjisibm2euc_map[index][0]; + euc[1] = sjisibm2euc_map[index][1]; + return 2; + } else { + euc[0] = SS3; + euc[1] = sjisibm2euc_map[index][0]; + euc[2] = sjisibm2euc_map[index][1]; + return 3; + } +} + +/* EUC to SJIS IBM extended characters (G3 JIS X 0212 block) */ +static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int index, min_index, max_index; + unsigned short euc; + + min_index = 0; + max_index = ARRAY_SIZE(euc2sjisibm_jisx0212_map) - 1; + euc = (euc_hi << 8) | euc_lo; + + while (min_index <= max_index) { + index = (min_index + max_index) / 2; + if (euc < euc2sjisibm_jisx0212_map[index].euc) + max_index = index - 1; + else + min_index = index + 1; + if (euc == euc2sjisibm_jisx0212_map[index].euc) { + sjis[0] = euc2sjisibm_jisx0212_map[index].sjis[0]; + sjis[1] = euc2sjisibm_jisx0212_map[index].sjis[1]; + return 3; + } + } + return 0; +} + +/* EUC to SJIS IBM extended characters (G3 Upper block) */ +static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int index; + + if (euc_hi == 0xF3) + index = ((euc_hi << 8) | euc_lo) - 0xF3F3; + else + index = ((euc_hi << 8) | euc_lo) - 0xF4A1 + 12; + + if ((index < 0) || (index >= ARRAY_SIZE(euc2sjisibm_g3upper_map))) + return 0; + + sjis[0] = euc2sjisibm_g3upper_map[index][0]; + sjis[1] = euc2sjisibm_g3upper_map[index][1]; + + return 3; +} + +/* EUC to SJIS IBM extended characters (G3 block) */ +static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int n; + +#if 0 + if ((euc_hi == 0xA2) && (euc_lo == 0xCC)) { + sjis[0] = 0xFA; + sjis[1] = 0x54; + return 2; + } else if ((euc_hi == 0xA2) && (euc_lo == 0xE8)) { + sjis[0] = 0xFA; + sjis[1] = 0x5B; + return 2; + } +#endif + if ((n = euc2sjisibm_g3upper(sjis, euc_hi, euc_lo))) { + return n; + } else if ((n = euc2sjisibm_jisx0212(sjis, euc_hi, euc_lo))) { + return n; + } + + return 0; +} + +/* NEC/IBM extended characters to IBM extended characters */ +static inline int sjisnec2sjisibm(unsigned char *sjisibm, + const unsigned char sjisnec_hi, + const unsigned char sjisnec_lo) +{ + int count; + + if (! IS_SJIS_NECIBM(sjisnec_hi, sjisnec_lo)) + return 0; + + if ((sjisnec_hi == 0xEE) && (sjisnec_lo == 0xF9)) { + sjisibm[0] = 0x81; + sjisibm[1] = 0xCA; + return 2; + } + + if ((sjisnec_hi == 0xEE) && (sjisnec_lo >= 0xEF)) { + count = (sjisnec_hi << 8 | sjisnec_lo) + - (sjisnec_lo <= 0xF9 ? 0xEEEF : (0xEEEF - 10)); + } else { + count = (sjisnec_hi - 0xED) * (0xFC - 0x40) + + (sjisnec_lo - 0x40) + (0x5C - 0x40); + if (sjisnec_lo >= 0x7F) + count--; + } + + sjisibm[0] = 0xFA + (count / (0xFC - 0x40)); + sjisibm[1] = 0x40 + (count % (0xFC - 0x40)); + if (sjisibm[1] >= 0x7F) + sjisibm[1]++; + + return 2; +} + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + int n; + + if (!p_nls) + return -EINVAL; + if ((n = p_nls->uni2char(uni, out, boundlen)) < 0) + return n; + + /* translate SJIS into EUC-JP */ + if (n == 1) { + if (IS_SJIS_JISX0201KANA(out[0])) { + /* JIS X 0201 KANA */ + if (boundlen < 2) + return -ENAMETOOLONG; + + out[1] = out[0]; + out[0] = SS2; + return 2; + } + } else if (n == 2) { + /* NEC/IBM extended characters to IBM extended characters */ + sjisnec2sjisibm(out, out[0], out[1]); + + if (IS_SJIS_UDC_LOW(out[0], out[1])) { + /* User defined characters half low */ + MAP_SJIS2EUC(out[0], out[1], 0xF0, out[0], out[1], 0xF5); + } else if (IS_SJIS_UDC_HI(out[0], out[1])) { + /* User defined characters half high */ + unsigned char ch, cl; + + if (boundlen < 3) + return -ENAMETOOLONG; + + n = 3; ch = out[0]; cl = out[1]; + out[0] = SS3; + MAP_SJIS2EUC(ch, cl, 0xF5, out[1], out[2], 0xF5); + } else if (IS_SJIS_IBM(out[0], out[1])) { + /* IBM extended characters */ + unsigned char euc[3], i; + + n = sjisibm2euc(euc, out[0], out[1]); + if (boundlen < n) + return -ENAMETOOLONG; + for (i = 0; i < n; i++) + out[i] = euc[i]; + } else if (IS_SJIS_JISX0208(out[0], out[1])) { + /* JIS X 0208 (include NEC special characters) */ + out[0] = (out[0]^0xA0)*2 + 0x5F; + if (out[1] > 0x9E) + out[0]++; + + if (out[1] < 0x7F) + out[1] = out[1] + 0x61; + else if (out[1] < 0x9F) + out[1] = out[1] + 0x60; + else + out[1] = out[1] + 0x02; + } else { + /* Invalid characters */ + return -EINVAL; + } + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char sjis_temp[2]; + int euc_offset, n; + + if ( !p_nls ) + return -EINVAL; + if (boundlen <= 0) + return -ENAMETOOLONG; + + /* translate EUC-JP into SJIS */ + if (rawstring[0] > 0x7F) { + if (rawstring[0] == SS3) { + if (boundlen < 3) + return -EINVAL; + euc_offset = 3; + + if (IS_EUC_UDC_HI(rawstring[1], rawstring[2])) { + /* User defined characters half high */ + MAP_EUC2SJIS(rawstring[1], rawstring[2], 0xF5, + sjis_temp[0], sjis_temp[1], 0xF5); + } else if (euc2sjisibm(sjis_temp,rawstring[1],rawstring[2])) { + /* IBM extended characters */ + } else { + /* JIS X 0212 and Invalid characters*/ + return -EINVAL; + + /* 'GETA' with SJIS coding */ + /* sjis_temp[0] = 0x81; */ + /* sjis_temp[1] = 0xAC; */ + } + } else { + if (boundlen < 2) + return -EINVAL; + euc_offset = 2; + + if (IS_EUC_JISX0201KANA(rawstring[0], rawstring[1])) { + /* JIS X 0201 KANA */ + sjis_temp[0] = rawstring[1]; + sjis_temp[1] = 0x00; + } else if (IS_EUC_UDC_LOW(rawstring[0], rawstring[1])) { + /* User defined characters half low */ + MAP_EUC2SJIS(rawstring[0], rawstring[1], 0xF5, + sjis_temp[0], sjis_temp[1], 0xF0); + } else if (IS_EUC_JISX0208(rawstring[0], rawstring[1])) { + /* JIS X 0208 (include NEC spesial characters) */ + sjis_temp[0] = ((rawstring[0]-0x5f)/2) ^ 0xA0; + if (!(rawstring[0] & 1)) + sjis_temp[1] = rawstring[1] - 0x02; + else if (rawstring[1] < 0xE0) + sjis_temp[1] = rawstring[1] - 0x61; + else + sjis_temp[1] = rawstring[1] - 0x60; + } else { + /* Invalid characters */ + return -EINVAL; + } + } + } else { + euc_offset = 1; + + /* JIS X 0201 ROMAJI */ + sjis_temp[0] = rawstring[0]; + sjis_temp[1] = 0x00; + } + + if ( (n = p_nls->char2uni(sjis_temp, sizeof(sjis_temp), uni)) < 0) + return n; + + return euc_offset; +} + +static struct nls_table table = { + .charset = "euc-jp", + .uni2char = uni2char, + .char2uni = char2uni, +}; + +static int __init init_nls_euc_jp(void) +{ + p_nls = load_nls("cp932"); + + if (p_nls) { + table.charset2upper = p_nls->charset2upper; + table.charset2lower = p_nls->charset2lower; + return register_nls(&table); + } + + return -EINVAL; +} + +static void __exit exit_nls_euc_jp(void) +{ + unregister_nls(&table); + unload_nls(p_nls); +} + +module_init(init_nls_euc_jp) +module_exit(exit_nls_euc_jp) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-1.c b/kernel/fs/nls/nls_iso8859-1.c new file mode 100644 index 000000000..69ac020d4 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-1.c @@ -0,0 +1,257 @@ +/* + * linux/fs/nls/nls_iso8859-1.c + * + * Charset iso8859-1 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-1", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_1(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_1(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_1) +module_exit(exit_nls_iso8859_1) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-13.c b/kernel/fs/nls/nls_iso8859-13.c new file mode 100644 index 000000000..afb3f8f27 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-13.c @@ -0,0 +1,285 @@ +/* + * linux/fs/nls/nls_iso8859-13.c + * + * Charset iso8859-13 translation tables. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x201d, 0x00a2, 0x00a3, + 0x00a4, 0x201e, 0x00a6, 0x00a7, + 0x00d8, 0x00a9, 0x0156, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00c6, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x201c, 0x00b5, 0x00b6, 0x00b7, + 0x00f8, 0x00b9, 0x0157, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00e6, + /* 0xc0*/ + 0x0104, 0x012e, 0x0100, 0x0106, + 0x00c4, 0x00c5, 0x0118, 0x0112, + 0x010c, 0x00c9, 0x0179, 0x0116, + 0x0122, 0x0136, 0x012a, 0x013b, + /* 0xd0*/ + 0x0160, 0x0143, 0x0145, 0x00d3, + 0x014c, 0x00d5, 0x00d6, 0x00d7, + 0x0172, 0x0141, 0x015a, 0x016a, + 0x00dc, 0x017b, 0x017d, 0x00df, + /* 0xe0*/ + 0x0105, 0x012f, 0x0101, 0x0107, + 0x00e4, 0x00e5, 0x0119, 0x0113, + 0x010d, 0x00e9, 0x017a, 0x0117, + 0x0123, 0x0137, 0x012b, 0x013c, + /* 0xf0*/ + 0x0161, 0x0144, 0x0146, 0x00f3, + 0x014d, 0x00f5, 0x00f6, 0x00f7, + 0x0173, 0x0142, 0x015b, 0x016b, + 0x00fc, 0x017c, 0x017e, 0x2019, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0xc4, 0xc5, 0xaf, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0x00, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xa8, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0xe4, 0xe5, 0xbf, 0x00, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0x00, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xb8, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xc2, 0xe2, 0x00, 0x00, 0xc0, 0xe0, 0xc3, 0xe3, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0xc7, 0xe7, 0x00, 0x00, 0xcb, 0xeb, /* 0x10-0x17 */ + 0xc6, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0xce, 0xee, 0x00, 0x00, 0xc1, 0xe1, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xcd, 0xed, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0xcf, 0xef, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0xd9, 0xf9, 0xd1, 0xf1, 0xd2, 0xf2, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xd4, 0xf4, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0xba, /* 0x50-0x57 */ + 0x00, 0x00, 0xda, 0xfa, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xd8, 0xf8, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0xca, 0xea, 0xdd, 0xfd, 0xde, 0xfe, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xff, 0x00, 0x00, 0xb4, 0xa1, 0xa5, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-13", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_13(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_13(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_13) +module_exit(exit_nls_iso8859_13) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-14.c b/kernel/fs/nls/nls_iso8859-14.c new file mode 100644 index 000000000..046370f0b --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-14.c @@ -0,0 +1,341 @@ +/* + * linux/fs/nls/nls_iso8859-14.c + * + * Charset iso8859-14 translation tables. + * + * Generated automatically from the Unicode and charset table + * provided by the Unicode Organisation at + * http://www.unicode.org/ + * The Unicode to charset table has only exact mappings. + * + * Rhys Jones, Swansea University Computer Society + * rhys@sucs.swan.ac.uk + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x1e02, 0x1e03, 0x00a3, + 0x010a, 0x010b, 0x1e0a, 0x00a7, + 0x1e80, 0x00a9, 0x1e82, 0x1e0b, + 0x1ef2, 0x00ad, 0x00ae, 0x0178, + /* 0xb0*/ + 0x1e1e, 0x1e1f, 0x0120, 0x0121, + 0x1e40, 0x1e41, 0x00b6, 0x1e56, + 0x1e81, 0x1e57, 0x1e83, 0x1e60, + 0x1ef3, 0x1e84, 0x1e85, 0x1e61, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x0174, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x1e6a, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x0176, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x0175, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x1e6b, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x0177, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0x00, 0x00, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb6, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0x00, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x00, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0x00, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xa1, 0xa2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0xa6, 0xab, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0xb1, /* 0x18-0x1f */ + 0xb2, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xb4, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, 0xb9, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbb, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xd7, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, 0xde, 0xfe, /* 0x70-0x77 */ + 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xa8, 0xb8, 0xaa, 0xba, 0xbd, 0xbe, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page1e[256] = { + 0x00, 0x00, 0xa1, 0xa2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0xa6, 0xab, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0xb1, /* 0x18-0x1f */ + 0xb2, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xb4, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, 0xb9, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbb, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xd7, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, 0xde, 0xfe, /* 0x70-0x77 */ + 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xa8, 0xb8, 0xaa, 0xba, 0xbd, 0xbe, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, page1e, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa2, 0xa2, 0xa3, 0xab, 0xab, 0xab, 0xa7, /* 0xa0-0xa7 */ + 0xb8, 0xa9, 0xba, 0xab, 0xbc, 0xad, 0xae, 0xff, /* 0xa8-0xaf */ + 0xb1, 0xb1, 0xb3, 0xb3, 0xb5, 0xb5, 0xb6, 0xb9, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbf, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa1, 0xa3, 0xa6, 0xa6, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xa6, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb0, 0xb2, 0xb2, 0xb4, 0xb4, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xa8, 0xb7, 0xaa, 0xbb, 0xac, 0xbd, 0xbd, 0xbb, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xaf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-14", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_14(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_14(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_14) +module_exit(exit_nls_iso8859_14) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-15.c b/kernel/fs/nls/nls_iso8859-15.c new file mode 100644 index 000000000..7e34a841a --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-15.c @@ -0,0 +1,307 @@ +/* + * linux/fs/nls/nls_iso8859-15.c + * + * Charset iso8859-15 translation tables. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x20ac, 0x00a5, 0x0160, 0x00a7, + 0x0161, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x017d, 0x00b5, 0x00b6, 0x00b7, + 0x017e, 0x00b9, 0x00ba, 0x00bb, + 0x0152, 0x0153, 0x0178, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0xa5, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0xb9, 0xba, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xbc, 0xbd, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xa6, 0xa8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0xbe, 0x00, 0x00, 0x00, 0x00, 0xb4, 0xb8, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb8, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbd, 0xbd, 0xff, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa6, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb4, 0xb9, 0xba, 0xbb, 0xbc, 0xbc, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xbe, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-15", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_15(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_15(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_15) +module_exit(exit_nls_iso8859_15) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-2.c b/kernel/fs/nls/nls_iso8859-2.c new file mode 100644 index 000000000..7dd571181 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-2.c @@ -0,0 +1,308 @@ +/* + * linux/fs/nls/nls_iso8859-2.c + * + * Charset iso8859-2 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0104, 0x02d8, 0x0141, + 0x00a4, 0x013d, 0x015a, 0x00a7, + 0x00a8, 0x0160, 0x015e, 0x0164, + 0x0179, 0x00ad, 0x017d, 0x017b, + /* 0xb0*/ + 0x00b0, 0x0105, 0x02db, 0x0142, + 0x00b4, 0x013e, 0x015b, 0x02c7, + 0x00b8, 0x0161, 0x015f, 0x0165, + 0x017a, 0x02dd, 0x017e, 0x017c, + /* 0xc0*/ + 0x0154, 0x00c1, 0x00c2, 0x0102, + 0x00c4, 0x0139, 0x0106, 0x00c7, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x011a, 0x00cd, 0x00ce, 0x010e, + /* 0xd0*/ + 0x0110, 0x0143, 0x0147, 0x00d3, + 0x00d4, 0x0150, 0x00d6, 0x00d7, + 0x0158, 0x016e, 0x00da, 0x0170, + 0x00dc, 0x00dd, 0x0162, 0x00df, + /* 0xe0*/ + 0x0155, 0x00e1, 0x00e2, 0x0103, + 0x00e4, 0x013a, 0x0107, 0x00e7, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x011b, 0x00ed, 0x00ee, 0x010f, + /* 0xf0*/ + 0x0111, 0x0144, 0x0148, 0x00f3, + 0x00f4, 0x0151, 0x00f6, 0x00f7, + 0x0159, 0x016f, 0x00fa, 0x0171, + 0x00fc, 0x00fd, 0x0163, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc3, 0xe3, 0xa1, 0xb1, 0xc6, 0xe6, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xc5, 0xe5, 0x00, 0x00, 0xa5, 0xb5, 0x00, /* 0x38-0x3f */ + 0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */ + 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */ + 0xd8, 0xf8, 0xa6, 0xb6, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */ + 0xa9, 0xb9, 0xde, 0xfe, 0xab, 0xbb, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */ + 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0xac, 0xbc, 0xaf, 0xbf, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-2", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_2(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_2(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_2) +module_exit(exit_nls_iso8859_2) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-3.c b/kernel/fs/nls/nls_iso8859-3.c new file mode 100644 index 000000000..740b75ec4 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-3.c @@ -0,0 +1,308 @@ +/* + * linux/fs/nls/nls_iso8859-3.c + * + * Charset iso8859-3 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0126, 0x02d8, 0x00a3, + 0x00a4, 0x0000, 0x0124, 0x00a7, + 0x00a8, 0x0130, 0x015e, 0x011e, + 0x0134, 0x00ad, 0x0000, 0x017b, + /* 0xb0*/ + 0x00b0, 0x0127, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x0125, 0x00b7, + 0x00b8, 0x0131, 0x015f, 0x011f, + 0x0135, 0x00bd, 0x0000, 0x017c, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x0000, + 0x00c4, 0x010a, 0x0108, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x0000, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x0120, 0x00d6, 0x00d7, + 0x011c, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x016c, 0x015c, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x0000, + 0x00e4, 0x010b, 0x0109, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x0000, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x0121, 0x00f6, 0x00f7, + 0x011d, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x016d, 0x015d, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0x00, 0xb2, 0xb3, 0xb4, 0xb5, 0x00, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0xc6, 0xe6, 0xc5, 0xe5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xd8, 0xf8, 0xab, 0xbb, /* 0x18-0x1f */ + 0xd5, 0xf5, 0x00, 0x00, 0xa6, 0xb6, 0xa1, 0xb1, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xa9, 0xb9, 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xaa, 0xba, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xdd, 0xfd, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0xaf, 0xbf, 0x00, 0x00, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xa3, 0xa4, 0x00, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x69, 0xba, 0xbb, 0xbc, 0xad, 0x00, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0x00, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xb3, 0xb4, 0x00, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x49, 0xaa, 0xab, 0xac, 0xbd, 0x00, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-3", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_3(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_3(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_3) +module_exit(exit_nls_iso8859_3) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-4.c b/kernel/fs/nls/nls_iso8859-4.c new file mode 100644 index 000000000..8826021e3 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-4.c @@ -0,0 +1,308 @@ +/* + * linux/fs/nls/nls_iso8859-4.c + * + * Charset iso8859-4 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0104, 0x0138, 0x0156, + 0x00a4, 0x0128, 0x013b, 0x00a7, + 0x00a8, 0x0160, 0x0112, 0x0122, + 0x0166, 0x00ad, 0x017d, 0x00af, + /* 0xb0*/ + 0x00b0, 0x0105, 0x02db, 0x0157, + 0x00b4, 0x0129, 0x013c, 0x02c7, + 0x00b8, 0x0161, 0x0113, 0x0123, + 0x0167, 0x014a, 0x017e, 0x014b, + /* 0xc0*/ + 0x0100, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x012e, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x0116, 0x00cd, 0x00ce, 0x012a, + /* 0xd0*/ + 0x0110, 0x0145, 0x014c, 0x0136, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x0172, 0x00da, 0x00db, + 0x00dc, 0x0168, 0x016a, 0x00df, + /* 0xe0*/ + 0x0101, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x012f, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x0117, 0x00ed, 0x00ee, 0x012b, + /* 0xf0*/ + 0x0111, 0x0146, 0x014d, 0x0137, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x0173, 0x00fa, 0x00fb, + 0x00fc, 0x0169, 0x016b, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0x00, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0x00, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0x00, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xc0, 0xe0, 0x00, 0x00, 0xa1, 0xb1, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ + 0xd0, 0xf0, 0xaa, 0xba, 0x00, 0x00, 0xcc, 0xec, /* 0x10-0x17 */ + 0xca, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xab, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0xa5, 0xb5, 0xcf, 0xef, 0x00, 0x00, 0xc7, 0xe7, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd3, 0xf3, /* 0x30-0x37 */ + 0xa2, 0x00, 0x00, 0xa6, 0xb6, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xd1, 0xf1, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0xbd, 0xbf, 0xd2, 0xf2, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa3, 0xb3, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xa9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0xac, 0xbc, /* 0x60-0x67 */ + 0xdd, 0xfd, 0xde, 0xfe, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xd9, 0xf9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0xff, 0x00, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-4", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_4(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_4(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_4) +module_exit(exit_nls_iso8859_4) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-5.c b/kernel/fs/nls/nls_iso8859-5.c new file mode 100644 index 000000000..7c04057a1 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-5.c @@ -0,0 +1,272 @@ +/* + * linux/fs/nls/nls_iso8859-5.c + * + * Charset iso8859-5 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0401, 0x0402, 0x0403, + 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, + 0x040c, 0x00ad, 0x040e, 0x040f, + /* 0xb0*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0xc0*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xd0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xe0*/ + 0x0440, 0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, + /* 0xf0*/ + 0x2116, 0x0451, 0x0452, 0x0453, + 0x0454, 0x0455, 0x0456, 0x0457, + 0x0458, 0x0459, 0x045a, 0x045b, + 0x045c, 0x00a7, 0x045e, 0x045f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, /* 0x08-0x0f */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0xfe, 0xff, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xa0-0xa7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xad, 0xfe, 0xff, /* 0xa8-0xaf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xb0-0xb7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xd0-0xd7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xf0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xf0-0xf7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xfd, 0xae, 0xaf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-5", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_5(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_5(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_5) +module_exit(exit_nls_iso8859_5) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-6.c b/kernel/fs/nls/nls_iso8859-6.c new file mode 100644 index 000000000..d4a881400 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-6.c @@ -0,0 +1,263 @@ +/* + * linux/fs/nls/nls_iso8859-6.c + * + * Charset iso8859-6 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0660, 0x0661, 0x0662, 0x0663, + 0x0664, 0x0665, 0x0666, 0x0667, + 0x0668, 0x0669, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0000, 0x0000, 0x0000, + 0x00a4, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x060c, 0x00ad, 0x0000, 0x0000, + /* 0xb0*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x061b, + 0x0000, 0x0000, 0x0000, 0x061f, + /* 0xc0*/ + 0x0000, 0x0621, 0x0622, 0x0623, + 0x0624, 0x0625, 0x0626, 0x0627, + 0x0628, 0x0629, 0x062a, 0x062b, + 0x062c, 0x062d, 0x062e, 0x062f, + /* 0xd0*/ + 0x0630, 0x0631, 0x0632, 0x0633, + 0x0634, 0x0635, 0x0636, 0x0637, + 0x0638, 0x0639, 0x063a, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xe0*/ + 0x0640, 0x0641, 0x0642, 0x0643, + 0x0644, 0x0645, 0x0646, 0x0647, + 0x0648, 0x0649, 0x064a, 0x064b, + 0x064c, 0x064d, 0x064e, 0x064f, + /* 0xf0*/ + 0x0650, 0x0651, 0x0652, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page06[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x60-0x67 */ + 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, page06, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-6", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_6(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_6(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_6) +module_exit(exit_nls_iso8859_6) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-7.c b/kernel/fs/nls/nls_iso8859-7.c new file mode 100644 index 000000000..37b75d825 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-7.c @@ -0,0 +1,317 @@ +/* + * linux/fs/nls/nls_iso8859-7.c + * + * Charset iso8859-7 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x02bd, 0x02bc, 0x00a3, + 0x0000, 0x0000, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x0000, 0x00ab, + 0x00ac, 0x00ad, 0x0000, 0x2015, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x0384, 0x0385, 0x0386, 0x00b7, + 0x0388, 0x0389, 0x038a, 0x00bb, + 0x038c, 0x00bd, 0x038e, 0x038f, + /* 0xc0*/ + 0x0390, 0x0391, 0x0392, 0x0393, + 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, + 0x039c, 0x039d, 0x039e, 0x039f, + /* 0xd0*/ + 0x03a0, 0x03a1, 0x0000, 0x03a3, + 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, + 0x03ac, 0x03ad, 0x03ae, 0x03af, + /* 0xe0*/ + 0x03b0, 0x03b1, 0x03b2, 0x03b3, + 0x03b4, 0x03b5, 0x03b6, 0x03b7, + 0x03b8, 0x03b9, 0x03ba, 0x03bb, + 0x03bc, 0x03bd, 0x03be, 0x03bf, + /* 0xf0*/ + 0x03c0, 0x03c1, 0x03c2, 0x03c3, + 0x03c4, 0x03c5, 0x03c6, 0x03c7, + 0x03c8, 0x03c9, 0x03ca, 0x03cb, + 0x03cc, 0x03cd, 0x03ce, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0x00, 0x00, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0xa2, 0xa1, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0x00, /* 0x80-0x87 */ + 0xb8, 0xb9, 0xba, 0x00, 0xbc, 0x00, 0xbe, 0xbf, /* 0x88-0x8f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x90-0x97 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x98-0x9f */ + 0xd0, 0xd1, 0x00, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xa0-0xa7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xa8-0xaf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xb0-0xb7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xb8-0xbf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xc0-0xc7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, page02, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xdc, 0xb7, /* 0xb0-0xb7 */ + 0xdd, 0xde, 0xdf, 0xbb, 0xfc, 0xbd, 0xfd, 0xfe, /* 0xb8-0xbf */ + 0xc0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0x00, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xb6, 0xb8, 0xb9, 0xba, /* 0xd8-0xdf */ + 0xe0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd3, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xbc, 0xbe, 0xbf, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-7", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_7(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_7(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_7) +module_exit(exit_nls_iso8859_7) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_iso8859-9.c b/kernel/fs/nls/nls_iso8859-9.c new file mode 100644 index 000000000..557b98250 --- /dev/null +++ b/kernel/fs/nls/nls_iso8859-9.c @@ -0,0 +1,272 @@ +/* + * linux/fs/nls/nls_iso8859-9.c + * + * Charset iso8859-9 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x011e, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x0130, 0x015e, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x011f, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x0131, 0x015f, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xdd, 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, /* 0x58-0x5f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x69, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x49, 0xde, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-9", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_iso8859_9(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_9(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_9) +module_exit(exit_nls_iso8859_9) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_koi8-r.c b/kernel/fs/nls/nls_koi8-r.c new file mode 100644 index 000000000..811f232fc --- /dev/null +++ b/kernel/fs/nls/nls_koi8-r.c @@ -0,0 +1,323 @@ +/* + * linux/fs/nls/nls_koi8-r.c + * + * Charset koi8-r translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x2500, 0x2502, 0x250c, 0x2510, + 0x2514, 0x2518, 0x251c, 0x2524, + 0x252c, 0x2534, 0x253c, 0x2580, + 0x2584, 0x2588, 0x258c, 0x2590, + /* 0x90*/ + 0x2591, 0x2592, 0x2593, 0x2320, + 0x25a0, 0x2219, 0x221a, 0x2248, + 0x2264, 0x2265, 0x00a0, 0x2321, + 0x00b0, 0x00b2, 0x00b7, 0x00f7, + /* 0xa0*/ + 0x2550, 0x2551, 0x2552, 0x0451, + 0x2553, 0x2554, 0x2555, 0x2556, + 0x2557, 0x2558, 0x2559, 0x255a, + 0x255b, 0x255c, 0x255d, 0x255e, + /* 0xb0*/ + 0x255f, 0x2560, 0x2561, 0x0401, + 0x2562, 0x2563, 0x2564, 0x2565, + 0x2566, 0x2567, 0x2568, 0x2569, + 0x256a, 0x256b, 0x256c, 0x00a9, + /* 0xc0*/ + 0x044e, 0x0430, 0x0431, 0x0446, + 0x0434, 0x0435, 0x0444, 0x0433, + 0x0445, 0x0438, 0x0439, 0x043a, + 0x043b, 0x043c, 0x043d, 0x043e, + /* 0xd0*/ + 0x043f, 0x044f, 0x0440, 0x0441, + 0x0442, 0x0443, 0x0436, 0x0432, + 0x044c, 0x044b, 0x0437, 0x0448, + 0x044d, 0x0449, 0x0447, 0x044a, + /* 0xe0*/ + 0x042e, 0x0410, 0x0411, 0x0426, + 0x0414, 0x0415, 0x0424, 0x0413, + 0x0425, 0x0418, 0x0419, 0x041a, + 0x041b, 0x041c, 0x041d, 0x041e, + /* 0xf0*/ + 0x041f, 0x042f, 0x0420, 0x0421, + 0x0422, 0x0423, 0x0416, 0x0412, + 0x042c, 0x042b, 0x0417, 0x0428, + 0x042d, 0x0429, 0x0427, 0x042a, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ + 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ + 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ + 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ + 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ + 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ + 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ + 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ + 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xa0, 0xa1, 0xa2, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, /* 0x50-0x57 */ + 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ + 0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, /* 0x60-0x67 */ + 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "koi8-r", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_koi8_r(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_koi8_r(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_koi8_r) +module_exit(exit_nls_koi8_r) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_koi8-ru.c b/kernel/fs/nls/nls_koi8-ru.c new file mode 100644 index 000000000..a80a741a8 --- /dev/null +++ b/kernel/fs/nls/nls_koi8-ru.c @@ -0,0 +1,82 @@ +/* + * linux/fs/nls/nls_koi8-ru.c + * + * Charset koi8-ru translation based on charset koi8-u. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static struct nls_table *p_nls; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + if (boundlen <= 0) + return -ENAMETOOLONG; + + if ((uni & 0xffaf) == 0x040e || (uni & 0xffce) == 0x254c) { + /* koi8-ru and koi8-u differ only on two characters */ + if (uni == 0x040e) + out[0] = 0xbe; + else if (uni == 0x045e) + out[0] = 0xae; + else if (uni == 0x255d || uni == 0x256c) + return 0; + else + return p_nls->uni2char(uni, out, boundlen); + return 1; + } + else + /* fast path */ + return p_nls->uni2char(uni, out, boundlen); +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + int n; + + if ((*rawstring & 0xef) != 0xae) { + /* koi8-ru and koi8-u differ only on two characters */ + *uni = (*rawstring & 0x10) ? 0x040e : 0x045e; + return 1; + } + + n = p_nls->char2uni(rawstring, boundlen, uni); + return n; +} + +static struct nls_table table = { + .charset = "koi8-ru", + .uni2char = uni2char, + .char2uni = char2uni, +}; + +static int __init init_nls_koi8_ru(void) +{ + p_nls = load_nls("koi8-u"); + + if (p_nls) { + table.charset2upper = p_nls->charset2upper; + table.charset2lower = p_nls->charset2lower; + return register_nls(&table); + } + + return -EINVAL; +} + +static void __exit exit_nls_koi8_ru(void) +{ + unregister_nls(&table); + unload_nls(p_nls); +} + +module_init(init_nls_koi8_ru) +module_exit(exit_nls_koi8_ru) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_koi8-u.c b/kernel/fs/nls/nls_koi8-u.c new file mode 100644 index 000000000..7e029e4c1 --- /dev/null +++ b/kernel/fs/nls/nls_koi8-u.c @@ -0,0 +1,330 @@ +/* + * linux/fs/nls/nls_koi8-u.c + * + * Charset koi8-u translation tables. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x2500, 0x2502, 0x250c, 0x2510, + 0x2514, 0x2518, 0x251c, 0x2524, + 0x252c, 0x2534, 0x253c, 0x2580, + 0x2584, 0x2588, 0x258c, 0x2590, + /* 0x90*/ + 0x2591, 0x2592, 0x2593, 0x2320, + 0x25a0, 0x2219, 0x221a, 0x2248, + 0x2264, 0x2265, 0x00a0, 0x2321, + 0x00b0, 0x00b2, 0x00b7, 0x00f7, + /* 0xa0*/ + 0x2550, 0x2551, 0x2552, 0x0451, + 0x0454, 0x2554, 0x0456, 0x0457, + 0x2557, 0x2558, 0x2559, 0x255a, + 0x255b, 0x0491, 0x255d, 0x255e, + /* 0xb0*/ + 0x255f, 0x2560, 0x2561, 0x0401, + 0x0404, 0x2563, 0x0406, 0x0407, + 0x2566, 0x2567, 0x2568, 0x2569, + 0x256a, 0x0490, 0x256c, 0x00a9, + /* 0xc0*/ + 0x044e, 0x0430, 0x0431, 0x0446, + 0x0434, 0x0435, 0x0444, 0x0433, + 0x0445, 0x0438, 0x0439, 0x043a, + 0x043b, 0x043c, 0x043d, 0x043e, + /* 0xd0*/ + 0x043f, 0x044f, 0x0440, 0x0441, + 0x0442, 0x0443, 0x0436, 0x0432, + 0x044c, 0x044b, 0x0437, 0x0448, + 0x044d, 0x0449, 0x0447, 0x044a, + /* 0xe0*/ + 0x042e, 0x0410, 0x0411, 0x0426, + 0x0414, 0x0415, 0x0424, 0x0413, + 0x0425, 0x0418, 0x0419, 0x041a, + 0x041b, 0x041c, 0x041d, 0x041e, + /* 0xf0*/ + 0x041f, 0x042f, 0x0420, 0x0421, + 0x0422, 0x0423, 0x0416, 0x0412, + 0x042c, 0x042b, 0x0417, 0x0428, + 0x042d, 0x0429, 0x0427, 0x042a, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xb3, 0x00, 0x00, 0xb4, 0x00, 0xb6, 0xb7, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ + 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ + 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ + 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ + 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ + 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ + 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ + 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ + 0x00, 0xa3, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xbd, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xa0, 0xa1, 0xa2, 0x00, 0xa5, 0x00, 0x00, 0xa8, /* 0x50-0x57 */ + 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ + 0xb1, 0xb2, 0x00, 0xb5, 0x00, 0x00, 0xb8, 0xb9, /* 0x60-0x67 */ + 0xba, 0xbb, 0xbc, 0x00, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xa4, 0xb5, 0xa6, 0xa7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xb4, 0xa5, 0xb6, 0xb7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "koi8-u", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +static int __init init_nls_koi8_u(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_koi8_u(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_koi8_u) +module_exit(exit_nls_koi8_u) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/nls/nls_utf8.c b/kernel/fs/nls/nls_utf8.c new file mode 100644 index 000000000..afcfbc4a1 --- /dev/null +++ b/kernel/fs/nls/nls_utf8.c @@ -0,0 +1,67 @@ +/* + * Module for handling utf8 just like any other charset. + * By Urban Widmark 2000 + */ + +#include +#include +#include +#include +#include + +static unsigned char identity[256]; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + n = utf32_to_utf8(uni, out, boundlen); + if (n < 0) { + *out = '?'; + return -EINVAL; + } + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + int n; + unicode_t u; + + n = utf8_to_utf32(rawstring, boundlen, &u); + if (n < 0 || u > MAX_WCHAR_T) { + *uni = 0x003f; /* ? */ + return -EINVAL; + } + *uni = (wchar_t) u; + return n; +} + +static struct nls_table table = { + .charset = "utf8", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = identity, /* no conversion */ + .charset2upper = identity, +}; + +static int __init init_nls_utf8(void) +{ + int i; + for (i=0; i<256; i++) + identity[i] = i; + + return register_nls(&table); +} + +static void __exit exit_nls_utf8(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_utf8) +module_exit(exit_nls_utf8) +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kernel/fs/no-block.c b/kernel/fs/no-block.c new file mode 100644 index 000000000..6e40e42a4 --- /dev/null +++ b/kernel/fs/no-block.c @@ -0,0 +1,23 @@ +/* no-block.c: implementation of routines required for non-BLOCK configuration + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +static int no_blkdev_open(struct inode * inode, struct file * filp) +{ + return -ENODEV; +} + +const struct file_operations def_blk_fops = { + .open = no_blkdev_open, + .llseek = noop_llseek, +}; diff --git a/kernel/fs/notify/Kconfig b/kernel/fs/notify/Kconfig new file mode 100644 index 000000000..2a24249b3 --- /dev/null +++ b/kernel/fs/notify/Kconfig @@ -0,0 +1,7 @@ +config FSNOTIFY + def_bool n + select SRCU + +source "fs/notify/dnotify/Kconfig" +source "fs/notify/inotify/Kconfig" +source "fs/notify/fanotify/Kconfig" diff --git a/kernel/fs/notify/Makefile b/kernel/fs/notify/Makefile new file mode 100644 index 000000000..96d3420d0 --- /dev/null +++ b/kernel/fs/notify/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ + mark.o vfsmount_mark.o fdinfo.o + +obj-y += dnotify/ +obj-y += inotify/ +obj-y += fanotify/ diff --git a/kernel/fs/notify/dnotify/Kconfig b/kernel/fs/notify/dnotify/Kconfig new file mode 100644 index 000000000..f9c1ca139 --- /dev/null +++ b/kernel/fs/notify/dnotify/Kconfig @@ -0,0 +1,11 @@ +config DNOTIFY + bool "Dnotify support" + select FSNOTIFY + default y + help + Dnotify is a directory-based per-fd file change notification system + that uses signals to communicate events to user-space. There exist + superior alternatives, but some applications may still rely on + dnotify. + + If unsure, say Y. diff --git a/kernel/fs/notify/dnotify/Makefile b/kernel/fs/notify/dnotify/Makefile new file mode 100644 index 000000000..f145251dc --- /dev/null +++ b/kernel/fs/notify/dnotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_DNOTIFY) += dnotify.o diff --git a/kernel/fs/notify/dnotify/dnotify.c b/kernel/fs/notify/dnotify/dnotify.c new file mode 100644 index 000000000..44523f4a6 --- /dev/null +++ b/kernel/fs/notify/dnotify/dnotify.c @@ -0,0 +1,388 @@ +/* + * Directory notifications for Linux. + * + * Copyright (C) 2000,2001,2002 Stephen Rothwell + * + * Copyright (C) 2009 Eric Paris + * dnotify was largly rewritten to use the new fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int dir_notify_enable __read_mostly = 1; + +static struct kmem_cache *dnotify_struct_cache __read_mostly; +static struct kmem_cache *dnotify_mark_cache __read_mostly; +static struct fsnotify_group *dnotify_group __read_mostly; + +/* + * dnotify will attach one of these to each inode (i_fsnotify_marks) which + * is being watched by dnotify. If multiple userspace applications are watching + * the same directory with dnotify their information is chained in dn + */ +struct dnotify_mark { + struct fsnotify_mark fsn_mark; + struct dnotify_struct *dn; +}; + +/* + * When a process starts or stops watching an inode the set of events which + * dnotify cares about for that inode may change. This function runs the + * list of everything receiving dnotify events about this directory and calculates + * the set of all those events. After it updates what dnotify is interested in + * it calls the fsnotify function so it can update the set of all events relevant + * to this inode. + */ +static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) +{ + __u32 new_mask, old_mask; + struct dnotify_struct *dn; + struct dnotify_mark *dn_mark = container_of(fsn_mark, + struct dnotify_mark, + fsn_mark); + + assert_spin_locked(&fsn_mark->lock); + + old_mask = fsn_mark->mask; + new_mask = 0; + for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) + new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); + fsnotify_set_mark_mask_locked(fsn_mark, new_mask); + + if (old_mask == new_mask) + return; + + if (fsn_mark->inode) + fsnotify_recalc_inode_mask(fsn_mark->inode); +} + +/* + * Mains fsnotify call where events are delivered to dnotify. + * Find the dnotify mark on the relevant inode, run the list of dnotify structs + * on that mark and determine which of them has expressed interest in receiving + * events of this type. When found send the correct process and signal and + * destroy the dnotify struct if it was not registered to receive multiple + * events. + */ +static int dnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie) +{ + struct dnotify_mark *dn_mark; + struct dnotify_struct *dn; + struct dnotify_struct **prev; + struct fown_struct *fown; + __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; + + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return 0; + + BUG_ON(vfsmount_mark); + + dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); + + spin_lock(&inode_mark->lock); + prev = &dn_mark->dn; + while ((dn = *prev) != NULL) { + if ((dn->dn_mask & test_mask) == 0) { + prev = &dn->dn_next; + continue; + } + fown = &dn->dn_filp->f_owner; + send_sigio(fown, dn->dn_fd, POLL_MSG); + if (dn->dn_mask & FS_DN_MULTISHOT) + prev = &dn->dn_next; + else { + *prev = dn->dn_next; + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(inode_mark); + } + } + + spin_unlock(&inode_mark->lock); + + return 0; +} + +static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct dnotify_mark *dn_mark = container_of(fsn_mark, + struct dnotify_mark, + fsn_mark); + + BUG_ON(dn_mark->dn); + + kmem_cache_free(dnotify_mark_cache, dn_mark); +} + +static struct fsnotify_ops dnotify_fsnotify_ops = { + .handle_event = dnotify_handle_event, +}; + +/* + * Called every time a file is closed. Looks first for a dnotify mark on the + * inode. If one is found run all of the ->dn structures attached to that + * mark for one relevant to this process closing the file and remove that + * dnotify_struct. If that was the last dnotify_struct also remove the + * fsnotify_mark. + */ +void dnotify_flush(struct file *filp, fl_owner_t id) +{ + struct fsnotify_mark *fsn_mark; + struct dnotify_mark *dn_mark; + struct dnotify_struct *dn; + struct dnotify_struct **prev; + struct inode *inode; + + inode = file_inode(filp); + if (!S_ISDIR(inode->i_mode)) + return; + + fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + if (!fsn_mark) + return; + dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); + + mutex_lock(&dnotify_group->mark_mutex); + + spin_lock(&fsn_mark->lock); + prev = &dn_mark->dn; + while ((dn = *prev) != NULL) { + if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { + *prev = dn->dn_next; + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(fsn_mark); + break; + } + prev = &dn->dn_next; + } + + spin_unlock(&fsn_mark->lock); + + /* nothing else could have found us thanks to the dnotify_groups + mark_mutex */ + if (dn_mark->dn == NULL) + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); + + mutex_unlock(&dnotify_group->mark_mutex); + + fsnotify_put_mark(fsn_mark); +} + +/* this conversion is done only at watch creation */ +static __u32 convert_arg(unsigned long arg) +{ + __u32 new_mask = FS_EVENT_ON_CHILD; + + if (arg & DN_MULTISHOT) + new_mask |= FS_DN_MULTISHOT; + if (arg & DN_DELETE) + new_mask |= (FS_DELETE | FS_MOVED_FROM); + if (arg & DN_MODIFY) + new_mask |= FS_MODIFY; + if (arg & DN_ACCESS) + new_mask |= FS_ACCESS; + if (arg & DN_ATTRIB) + new_mask |= FS_ATTRIB; + if (arg & DN_RENAME) + new_mask |= FS_DN_RENAME; + if (arg & DN_CREATE) + new_mask |= (FS_CREATE | FS_MOVED_TO); + + return new_mask; +} + +/* + * If multiple processes watch the same inode with dnotify there is only one + * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct + * onto that mark. This function either attaches the new dnotify_struct onto + * that list, or it |= the mask onto an existing dnofiy_struct. + */ +static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, + fl_owner_t id, int fd, struct file *filp, __u32 mask) +{ + struct dnotify_struct *odn; + + odn = dn_mark->dn; + while (odn != NULL) { + /* adding more events to existing dnofiy_struct? */ + if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { + odn->dn_fd = fd; + odn->dn_mask |= mask; + return -EEXIST; + } + odn = odn->dn_next; + } + + dn->dn_mask = mask; + dn->dn_fd = fd; + dn->dn_filp = filp; + dn->dn_owner = id; + dn->dn_next = dn_mark->dn; + dn_mark->dn = dn; + + return 0; +} + +/* + * When a process calls fcntl to attach a dnotify watch to a directory it ends + * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be + * attached to the fsnotify_mark. + */ +int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) +{ + struct dnotify_mark *new_dn_mark, *dn_mark; + struct fsnotify_mark *new_fsn_mark, *fsn_mark; + struct dnotify_struct *dn; + struct inode *inode; + fl_owner_t id = current->files; + struct file *f; + int destroy = 0, error = 0; + __u32 mask; + + /* we use these to tell if we need to kfree */ + new_fsn_mark = NULL; + dn = NULL; + + if (!dir_notify_enable) { + error = -EINVAL; + goto out_err; + } + + /* a 0 mask means we are explicitly removing the watch */ + if ((arg & ~DN_MULTISHOT) == 0) { + dnotify_flush(filp, id); + error = 0; + goto out_err; + } + + /* dnotify only works on directories */ + inode = file_inode(filp); + if (!S_ISDIR(inode->i_mode)) { + error = -ENOTDIR; + goto out_err; + } + + /* expect most fcntl to add new rather than augment old */ + dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); + if (!dn) { + error = -ENOMEM; + goto out_err; + } + + /* new fsnotify mark, we expect most fcntl calls to add a new mark */ + new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL); + if (!new_dn_mark) { + error = -ENOMEM; + goto out_err; + } + + /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ + mask = convert_arg(arg); + + /* set up the new_fsn_mark and new_dn_mark */ + new_fsn_mark = &new_dn_mark->fsn_mark; + fsnotify_init_mark(new_fsn_mark, dnotify_free_mark); + new_fsn_mark->mask = mask; + new_dn_mark->dn = NULL; + + /* this is needed to prevent the fcntl/close race described below */ + mutex_lock(&dnotify_group->mark_mutex); + + /* add the new_fsn_mark or find an old one. */ + fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + if (fsn_mark) { + dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); + spin_lock(&fsn_mark->lock); + } else { + fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, + NULL, 0); + spin_lock(&new_fsn_mark->lock); + fsn_mark = new_fsn_mark; + dn_mark = new_dn_mark; + /* we used new_fsn_mark, so don't free it */ + new_fsn_mark = NULL; + } + + rcu_read_lock(); + f = fcheck(fd); + rcu_read_unlock(); + + /* if (f != filp) means that we lost a race and another task/thread + * actually closed the fd we are still playing with before we grabbed + * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the + * fd is the only time we clean up the marks we need to get our mark + * off the list. */ + if (f != filp) { + /* if we added ourselves, shoot ourselves, it's possible that + * the flush actually did shoot this fsn_mark. That's fine too + * since multiple calls to destroy_mark is perfectly safe, if + * we found a dn_mark already attached to the inode, just sod + * off silently as the flush at close time dealt with it. + */ + if (dn_mark == new_dn_mark) + destroy = 1; + goto out; + } + + __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + + error = attach_dn(dn, dn_mark, id, fd, filp, mask); + /* !error means that we attached the dn to the dn_mark, so don't free it */ + if (!error) + dn = NULL; + /* -EEXIST means that we didn't add this new dn and used an old one. + * that isn't an error (and the unused dn should be freed) */ + else if (error == -EEXIST) + error = 0; + + dnotify_recalc_inode_mask(fsn_mark); +out: + spin_unlock(&fsn_mark->lock); + + if (destroy) + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); + + mutex_unlock(&dnotify_group->mark_mutex); + fsnotify_put_mark(fsn_mark); +out_err: + if (new_fsn_mark) + fsnotify_put_mark(new_fsn_mark); + if (dn) + kmem_cache_free(dnotify_struct_cache, dn); + return error; +} + +static int __init dnotify_init(void) +{ + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); + + dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); + if (IS_ERR(dnotify_group)) + panic("unable to allocate fsnotify group for dnotify\n"); + return 0; +} + +module_init(dnotify_init) diff --git a/kernel/fs/notify/fanotify/Kconfig b/kernel/fs/notify/fanotify/Kconfig new file mode 100644 index 000000000..e5f911bd8 --- /dev/null +++ b/kernel/fs/notify/fanotify/Kconfig @@ -0,0 +1,26 @@ +config FANOTIFY + bool "Filesystem wide access notification" + select FSNOTIFY + select ANON_INODES + default n + ---help--- + Say Y here to enable fanotify support. fanotify is a file access + notification system which differs from inotify in that it sends + an open file descriptor to the userspace listener along with + the event. + + If unsure, say Y. + +config FANOTIFY_ACCESS_PERMISSIONS + bool "fanotify permissions checking" + depends on FANOTIFY + depends on SECURITY + default n + ---help--- + Say Y here is you want fanotify listeners to be able to make permissions + decisions concerning filesystem events. This is used by some fanotify + listeners which need to scan files before allowing the system access to + use those files. This is used by some anti-malware vendors and by some + hierarchical storage managent systems. + + If unsure, say N. diff --git a/kernel/fs/notify/fanotify/Makefile b/kernel/fs/notify/fanotify/Makefile new file mode 100644 index 000000000..0999213e7 --- /dev/null +++ b/kernel/fs/notify/fanotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_FANOTIFY) += fanotify.o fanotify_user.o diff --git a/kernel/fs/notify/fanotify/fanotify.c b/kernel/fs/notify/fanotify/fanotify.c new file mode 100644 index 000000000..d2f97ecca --- /dev/null +++ b/kernel/fs/notify/fanotify/fanotify.c @@ -0,0 +1,270 @@ +#include +#include +#include +#include +#include +#include /* UINT_MAX */ +#include +#include +#include +#include + +#include "fanotify.h" + +static bool should_merge(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) +{ + struct fanotify_event_info *old, *new; + + pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); + old = FANOTIFY_E(old_fsn); + new = FANOTIFY_E(new_fsn); + + if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry) + return true; + return false; +} + +/* and the list better be locked by something too! */ +static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) +{ + struct fsnotify_event *test_event; + bool do_merge = false; + + pr_debug("%s: list=%p event=%p\n", __func__, list, event); + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + /* + * Don't merge a permission event with any other event so that we know + * the event structure we have created in fanotify_handle_event() is the + * one we should check for permission response. + */ + if (event->mask & FAN_ALL_PERM_EVENTS) + return 0; +#endif + + list_for_each_entry_reverse(test_event, list, list) { + if (should_merge(test_event, event)) { + do_merge = true; + break; + } + } + + if (!do_merge) + return 0; + + test_event->mask |= event->mask; + return 1; +} + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +static int fanotify_get_response(struct fsnotify_group *group, + struct fanotify_perm_event_info *event) +{ + int ret; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + wait_event(group->fanotify_data.access_waitq, event->response || + atomic_read(&group->fanotify_data.bypass_perm)); + + if (!event->response) { /* bypass_perm set */ + /* + * Event was canceled because group is being destroyed. Remove + * it from group's event list because we are responsible for + * freeing the permission event. + */ + fsnotify_remove_event(group, &event->fae.fse); + return 0; + } + + /* userspace responded, convert to something usable */ + switch (event->response) { + case FAN_ALLOW: + ret = 0; + break; + case FAN_DENY: + default: + ret = -EPERM; + } + event->response = 0; + + pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, + group, event, ret); + + return ret; +} +#endif + +static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmnt_mark, + u32 event_mask, + void *data, int data_type) +{ + __u32 marks_mask, marks_ignored_mask; + struct path *path = data; + + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, + event_mask, data, data_type); + + /* if we don't have enough info to send an event to userspace say no */ + if (data_type != FSNOTIFY_EVENT_PATH) + return false; + + /* sorry, fanotify only gives a damn about files and dirs */ + if (!d_is_reg(path->dentry) && + !d_can_lookup(path->dentry)) + return false; + + if (inode_mark && vfsmnt_mark) { + marks_mask = (vfsmnt_mark->mask | inode_mark->mask); + marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); + } else if (inode_mark) { + /* + * if the event is for a child and this inode doesn't care about + * events on the child, don't send it! + */ + if ((event_mask & FS_EVENT_ON_CHILD) && + !(inode_mark->mask & FS_EVENT_ON_CHILD)) + return false; + marks_mask = inode_mark->mask; + marks_ignored_mask = inode_mark->ignored_mask; + } else if (vfsmnt_mark) { + marks_mask = vfsmnt_mark->mask; + marks_ignored_mask = vfsmnt_mark->ignored_mask; + } else { + BUG(); + } + + if (d_is_dir(path->dentry) && + !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) + return false; + + if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask & + ~marks_ignored_mask) + return true; + + return false; +} + +struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, + struct path *path) +{ + struct fanotify_event_info *event; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & FAN_ALL_PERM_EVENTS) { + struct fanotify_perm_event_info *pevent; + + pevent = kmem_cache_alloc(fanotify_perm_event_cachep, + GFP_KERNEL); + if (!pevent) + return NULL; + event = &pevent->fae; + pevent->response = 0; + goto init; + } +#endif + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; +init: __maybe_unused + fsnotify_init_event(&event->fse, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (path) { + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } + return event; +} + +static int fanotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie) +{ + int ret = 0; + struct fanotify_event_info *event; + struct fsnotify_event *fsn_event; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, + data_type)) + return 0; + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); + + event = fanotify_alloc_event(inode, mask, data); + if (unlikely(!event)) + return -ENOMEM; + + fsn_event = &event->fse; + ret = fsnotify_add_event(group, fsn_event, fanotify_merge); + if (ret) { + /* Permission events shouldn't be merged */ + BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + + return 0; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & FAN_ALL_PERM_EVENTS) { + ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event)); + fsnotify_destroy_event(group, fsn_event); + } +#endif + return ret; +} + +static void fanotify_free_group_priv(struct fsnotify_group *group) +{ + struct user_struct *user; + + user = group->fanotify_data.user; + atomic_dec(&user->fanotify_listeners); + free_uid(user); +} + +static void fanotify_free_event(struct fsnotify_event *fsn_event) +{ + struct fanotify_event_info *event; + + event = FANOTIFY_E(fsn_event); + path_put(&event->path); + put_pid(event->tgid); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (fsn_event->mask & FAN_ALL_PERM_EVENTS) { + kmem_cache_free(fanotify_perm_event_cachep, + FANOTIFY_PE(fsn_event)); + return; + } +#endif + kmem_cache_free(fanotify_event_cachep, event); +} + +const struct fsnotify_ops fanotify_fsnotify_ops = { + .handle_event = fanotify_handle_event, + .free_group_priv = fanotify_free_group_priv, + .free_event = fanotify_free_event, +}; diff --git a/kernel/fs/notify/fanotify/fanotify.h b/kernel/fs/notify/fanotify/fanotify.h new file mode 100644 index 000000000..2a5fb1411 --- /dev/null +++ b/kernel/fs/notify/fanotify/fanotify.h @@ -0,0 +1,50 @@ +#include +#include +#include + +extern struct kmem_cache *fanotify_event_cachep; +extern struct kmem_cache *fanotify_perm_event_cachep; + +/* + * Structure for normal fanotify events. It gets allocated in + * fanotify_handle_event() and freed when the information is retrieved by + * userspace + */ +struct fanotify_event_info { + struct fsnotify_event fse; + /* + * We hold ref to this path so it may be dereferenced at any point + * during this object's lifetime + */ + struct path path; + struct pid *tgid; +}; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +/* + * Structure for permission fanotify events. It gets allocated and freed in + * fanotify_handle_event() since we wait there for user response. When the + * information is retrieved by userspace the structure is moved from + * group->notification_list to group->fanotify_data.access_list to wait for + * user response. + */ +struct fanotify_perm_event_info { + struct fanotify_event_info fae; + int response; /* userspace answer to question */ + int fd; /* fd we passed to userspace for this event */ +}; + +static inline struct fanotify_perm_event_info * +FANOTIFY_PE(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_perm_event_info, fae.fse); +} +#endif + +static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_event_info, fse); +} + +struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, + struct path *path); diff --git a/kernel/fs/notify/fanotify/fanotify_user.c b/kernel/fs/notify/fanotify/fanotify_user.c new file mode 100644 index 000000000..cf275500a --- /dev/null +++ b/kernel/fs/notify/fanotify/fanotify_user.c @@ -0,0 +1,940 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../../mount.h" +#include "../fdinfo.h" +#include "fanotify.h" + +#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 +#define FANOTIFY_DEFAULT_MAX_MARKS 8192 +#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 + +/* + * All flags that may be specified in parameter event_f_flags of fanotify_init. + * + * Internal and external open flags are stored together in field f_flags of + * struct file. Only external open flags shall be allowed in event_f_flags. + * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be + * excluded. + */ +#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ + O_ACCMODE | O_APPEND | O_NONBLOCK | \ + __O_SYNC | O_DSYNC | O_CLOEXEC | \ + O_LARGEFILE | O_NOATIME ) + +extern const struct fsnotify_ops fanotify_fsnotify_ops; + +static struct kmem_cache *fanotify_mark_cache __read_mostly; +struct kmem_cache *fanotify_event_cachep __read_mostly; +struct kmem_cache *fanotify_perm_event_cachep __read_mostly; + +/* + * Get an fsnotify notification event if one exists and is small + * enough to fit in "count". Return an error pointer if the count + * is not large enough. + * + * Called with the group->notification_mutex held. + */ +static struct fsnotify_event *get_one_event(struct fsnotify_group *group, + size_t count) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + pr_debug("%s: group=%p count=%zd\n", __func__, group, count); + + if (fsnotify_notify_queue_is_empty(group)) + return NULL; + + if (FAN_EVENT_METADATA_LEN > count) + return ERR_PTR(-EINVAL); + + /* held the notification_mutex the whole time, so this is the + * same event we peeked above */ + return fsnotify_remove_first_event(group); +} + +static int create_fd(struct fsnotify_group *group, + struct fanotify_event_info *event, + struct file **file) +{ + int client_fd; + struct file *new_file; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); + if (client_fd < 0) + return client_fd; + + /* + * we need a new file handle for the userspace program so it can read even if it was + * originally opened O_WRONLY. + */ + /* it's possible this event was an overflow event. in that case dentry and mnt + * are NULL; That's fine, just don't call dentry open */ + if (event->path.dentry && event->path.mnt) + new_file = dentry_open(&event->path, + group->fanotify_data.f_flags | FMODE_NONOTIFY, + current_cred()); + else + new_file = ERR_PTR(-EOVERFLOW); + if (IS_ERR(new_file)) { + /* + * we still send an event even if we can't open the file. this + * can happen when say tasks are gone and we try to open their + * /proc files or we try to open a WRONLY file like in sysfs + * we just send the errno to userspace since there isn't much + * else we can do. + */ + put_unused_fd(client_fd); + client_fd = PTR_ERR(new_file); + } else { + *file = new_file; + } + + return client_fd; +} + +static int fill_event_metadata(struct fsnotify_group *group, + struct fanotify_event_metadata *metadata, + struct fsnotify_event *fsn_event, + struct file **file) +{ + int ret = 0; + struct fanotify_event_info *event; + + pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, + group, metadata, fsn_event); + + *file = NULL; + event = container_of(fsn_event, struct fanotify_event_info, fse); + metadata->event_len = FAN_EVENT_METADATA_LEN; + metadata->metadata_len = FAN_EVENT_METADATA_LEN; + metadata->vers = FANOTIFY_METADATA_VERSION; + metadata->reserved = 0; + metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->pid = pid_vnr(event->tgid); + if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) + metadata->fd = FAN_NOFD; + else { + metadata->fd = create_fd(group, event, file); + if (metadata->fd < 0) + ret = metadata->fd; + } + + return ret; +} + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +static struct fanotify_perm_event_info *dequeue_event( + struct fsnotify_group *group, int fd) +{ + struct fanotify_perm_event_info *event, *return_e = NULL; + + spin_lock(&group->fanotify_data.access_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (event->fd != fd) + continue; + + list_del_init(&event->fae.fse.list); + return_e = event; + break; + } + spin_unlock(&group->fanotify_data.access_lock); + + pr_debug("%s: found return_re=%p\n", __func__, return_e); + + return return_e; +} + +static int process_access_response(struct fsnotify_group *group, + struct fanotify_response *response_struct) +{ + struct fanotify_perm_event_info *event; + int fd = response_struct->fd; + int response = response_struct->response; + + pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, + fd, response); + /* + * make sure the response is valid, if invalid we do nothing and either + * userspace can send a valid response or we will clean it up after the + * timeout + */ + switch (response) { + case FAN_ALLOW: + case FAN_DENY: + break; + default: + return -EINVAL; + } + + if (fd < 0) + return -EINVAL; + + event = dequeue_event(group, fd); + if (!event) + return -ENOENT; + + event->response = response; + wake_up(&group->fanotify_data.access_waitq); + + return 0; +} +#endif + +static ssize_t copy_event_to_user(struct fsnotify_group *group, + struct fsnotify_event *event, + char __user *buf) +{ + struct fanotify_event_metadata fanotify_event_metadata; + struct file *f; + int fd, ret; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); + if (ret < 0) + return ret; + + fd = fanotify_event_metadata.fd; + ret = -EFAULT; + if (copy_to_user(buf, &fanotify_event_metadata, + fanotify_event_metadata.event_len)) + goto out_close_fd; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) + FANOTIFY_PE(event)->fd = fd; +#endif + + if (fd != FAN_NOFD) + fd_install(fd, f); + return fanotify_event_metadata.event_len; + +out_close_fd: + if (fd != FAN_NOFD) { + put_unused_fd(fd); + fput(f); + } + return ret; +} + +/* intofiy userspace file descriptor functions */ +static unsigned int fanotify_poll(struct file *file, poll_table *wait) +{ + struct fsnotify_group *group = file->private_data; + int ret = 0; + + poll_wait(file, &group->notification_waitq, wait); + mutex_lock(&group->notification_mutex); + if (!fsnotify_notify_queue_is_empty(group)) + ret = POLLIN | POLLRDNORM; + mutex_unlock(&group->notification_mutex); + + return ret; +} + +static ssize_t fanotify_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct fsnotify_group *group; + struct fsnotify_event *kevent; + char __user *start; + int ret; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + start = buf; + group = file->private_data; + + pr_debug("%s: group=%p\n", __func__, group); + + add_wait_queue(&group->notification_waitq, &wait); + while (1) { + mutex_lock(&group->notification_mutex); + kevent = get_one_event(group, count); + mutex_unlock(&group->notification_mutex); + + if (IS_ERR(kevent)) { + ret = PTR_ERR(kevent); + break; + } + + if (!kevent) { + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + break; + + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + if (start != buf) + break; + + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + continue; + } + + ret = copy_event_to_user(group, kevent, buf); + /* + * Permission events get queued to wait for response. Other + * events can be destroyed now. + */ + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { + fsnotify_destroy_event(group, kevent); + if (ret < 0) + break; + } else { +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (ret < 0) { + FANOTIFY_PE(kevent)->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + break; + } + spin_lock(&group->fanotify_data.access_lock); + list_add_tail(&kevent->list, + &group->fanotify_data.access_list); + spin_unlock(&group->fanotify_data.access_lock); +#endif + } + buf += ret; + count -= ret; + } + remove_wait_queue(&group->notification_waitq, &wait); + + if (start != buf && ret != -EFAULT) + ret = buf - start; + return ret; +} + +static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) +{ +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + struct fanotify_response response = { .fd = -1, .response = -1 }; + struct fsnotify_group *group; + int ret; + + group = file->private_data; + + if (count > sizeof(response)) + count = sizeof(response); + + pr_debug("%s: group=%p count=%zu\n", __func__, group, count); + + if (copy_from_user(&response, buf, count)) + return -EFAULT; + + ret = process_access_response(group, &response); + if (ret < 0) + count = ret; + + return count; +#else + return -EINVAL; +#endif +} + +static int fanotify_release(struct inode *ignored, struct file *file) +{ + struct fsnotify_group *group = file->private_data; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + struct fanotify_perm_event_info *event, *next; + + /* + * There may be still new events arriving in the notification queue + * but since userspace cannot use fanotify fd anymore, no event can + * enter or leave access_list by now. + */ + spin_lock(&group->fanotify_data.access_lock); + + atomic_inc(&group->fanotify_data.bypass_perm); + + list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, + fae.fse.list) { + pr_debug("%s: found group=%p event=%p\n", __func__, group, + event); + + list_del_init(&event->fae.fse.list); + event->response = FAN_ALLOW; + } + spin_unlock(&group->fanotify_data.access_lock); + + /* + * Since bypass_perm is set, newly queued events will not wait for + * access response. Wake up the already sleeping ones now. + * synchronize_srcu() in fsnotify_destroy_group() will wait for all + * processes sleeping in fanotify_handle_event() waiting for access + * response and thus also for all permission events to be freed. + */ + wake_up(&group->fanotify_data.access_waitq); +#endif + + /* matches the fanotify_init->fsnotify_alloc_group */ + fsnotify_destroy_group(group); + + return 0; +} + +static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct fsnotify_group *group; + struct fsnotify_event *fsn_event; + void __user *p; + int ret = -ENOTTY; + size_t send_len = 0; + + group = file->private_data; + + p = (void __user *) arg; + + switch (cmd) { + case FIONREAD: + mutex_lock(&group->notification_mutex); + list_for_each_entry(fsn_event, &group->notification_list, list) + send_len += FAN_EVENT_METADATA_LEN; + mutex_unlock(&group->notification_mutex); + ret = put_user(send_len, (int __user *) p); + break; + } + + return ret; +} + +static const struct file_operations fanotify_fops = { + .show_fdinfo = fanotify_show_fdinfo, + .poll = fanotify_poll, + .read = fanotify_read, + .write = fanotify_write, + .fasync = NULL, + .release = fanotify_release, + .unlocked_ioctl = fanotify_ioctl, + .compat_ioctl = fanotify_ioctl, + .llseek = noop_llseek, +}; + +static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + kmem_cache_free(fanotify_mark_cache, fsn_mark); +} + +static int fanotify_find_path(int dfd, const char __user *filename, + struct path *path, unsigned int flags) +{ + int ret; + + pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__, + dfd, filename, flags); + + if (filename == NULL) { + struct fd f = fdget(dfd); + + ret = -EBADF; + if (!f.file) + goto out; + + ret = -ENOTDIR; + if ((flags & FAN_MARK_ONLYDIR) && + !(S_ISDIR(file_inode(f.file)->i_mode))) { + fdput(f); + goto out; + } + + *path = f.file->f_path; + path_get(path); + fdput(f); + } else { + unsigned int lookup_flags = 0; + + if (!(flags & FAN_MARK_DONT_FOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (flags & FAN_MARK_ONLYDIR) + lookup_flags |= LOOKUP_DIRECTORY; + + ret = user_path_at(dfd, filename, lookup_flags, path); + if (ret) + goto out; + } + + /* you can only watch an inode if you have read permissions on it */ + ret = inode_permission(path->dentry->d_inode, MAY_READ); + if (ret) + path_put(path); +out: + return ret; +} + +static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, + unsigned int flags, + int *destroy) +{ + __u32 oldmask = 0; + + spin_lock(&fsn_mark->lock); + if (!(flags & FAN_MARK_IGNORED_MASK)) { + __u32 tmask = fsn_mark->mask & ~mask; + + if (flags & FAN_MARK_ONDIR) + tmask &= ~FAN_ONDIR; + + oldmask = fsn_mark->mask; + fsnotify_set_mark_mask_locked(fsn_mark, tmask); + } else { + __u32 tmask = fsn_mark->ignored_mask & ~mask; + if (flags & FAN_MARK_ONDIR) + tmask &= ~FAN_ONDIR; + + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + } + *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); + spin_unlock(&fsn_mark->lock); + + return mask & oldmask; +} + +static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark = NULL; + __u32 removed; + int destroy_mark; + + mutex_lock(&group->mark_mutex); + fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); + return -ENOENT; + } + + removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, + &destroy_mark); + if (destroy_mark) + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); + + fsnotify_put_mark(fsn_mark); + if (removed & real_mount(mnt)->mnt_fsnotify_mask) + fsnotify_recalc_vfsmount_mask(mnt); + + return 0; +} + +static int fanotify_remove_inode_mark(struct fsnotify_group *group, + struct inode *inode, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark = NULL; + __u32 removed; + int destroy_mark; + + mutex_lock(&group->mark_mutex); + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); + return -ENOENT; + } + + removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, + &destroy_mark); + if (destroy_mark) + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); + + /* matches the fsnotify_find_inode_mark() */ + fsnotify_put_mark(fsn_mark); + if (removed & inode->i_fsnotify_mask) + fsnotify_recalc_inode_mask(inode); + + return 0; +} + +static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, + unsigned int flags) +{ + __u32 oldmask = -1; + + spin_lock(&fsn_mark->lock); + if (!(flags & FAN_MARK_IGNORED_MASK)) { + __u32 tmask = fsn_mark->mask | mask; + + if (flags & FAN_MARK_ONDIR) + tmask |= FAN_ONDIR; + + oldmask = fsn_mark->mask; + fsnotify_set_mark_mask_locked(fsn_mark, tmask); + } else { + __u32 tmask = fsn_mark->ignored_mask | mask; + if (flags & FAN_MARK_ONDIR) + tmask |= FAN_ONDIR; + + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + if (flags & FAN_MARK_IGNORED_SURV_MODIFY) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; + } + spin_unlock(&fsn_mark->lock); + + return mask & ~oldmask; +} + +static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, + struct inode *inode, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + int ret; + + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return ERR_PTR(-ENOSPC); + + mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!mark) + return ERR_PTR(-ENOMEM); + + fsnotify_init_mark(mark, fanotify_free_mark); + ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + if (ret) { + fsnotify_put_mark(mark); + return ERR_PTR(ret); + } + + return mark; +} + + +static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark; + __u32 added; + + mutex_lock(&group->mark_mutex); + fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + if (!fsn_mark) { + fsn_mark = fanotify_add_new_mark(group, NULL, mnt); + if (IS_ERR(fsn_mark)) { + mutex_unlock(&group->mark_mutex); + return PTR_ERR(fsn_mark); + } + } + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); + + if (added & ~real_mount(mnt)->mnt_fsnotify_mask) + fsnotify_recalc_vfsmount_mask(mnt); + + fsnotify_put_mark(fsn_mark); + return 0; +} + +static int fanotify_add_inode_mark(struct fsnotify_group *group, + struct inode *inode, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark; + __u32 added; + + pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); + + /* + * If some other task has this inode open for write we should not add + * an ignored mark, unless that ignored mark is supposed to survive + * modification changes anyway. + */ + if ((flags & FAN_MARK_IGNORED_MASK) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && + (atomic_read(&inode->i_writecount) > 0)) + return 0; + + mutex_lock(&group->mark_mutex); + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) { + fsn_mark = fanotify_add_new_mark(group, inode, NULL); + if (IS_ERR(fsn_mark)) { + mutex_unlock(&group->mark_mutex); + return PTR_ERR(fsn_mark); + } + } + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); + + if (added & ~inode->i_fsnotify_mask) + fsnotify_recalc_inode_mask(inode); + + fsnotify_put_mark(fsn_mark); + return 0; +} + +/* fanotify syscalls */ +SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) +{ + struct fsnotify_group *group; + int f_flags, fd; + struct user_struct *user; + struct fanotify_event_info *oevent; + + pr_debug("%s: flags=%d event_f_flags=%d\n", + __func__, flags, event_f_flags); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (flags & ~FAN_ALL_INIT_FLAGS) + return -EINVAL; + + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) + return -EINVAL; + + switch (event_f_flags & O_ACCMODE) { + case O_RDONLY: + case O_RDWR: + case O_WRONLY: + break; + default: + return -EINVAL; + } + + user = get_current_user(); + if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { + free_uid(user); + return -EMFILE; + } + + f_flags = O_RDWR | FMODE_NONOTIFY; + if (flags & FAN_CLOEXEC) + f_flags |= O_CLOEXEC; + if (flags & FAN_NONBLOCK) + f_flags |= O_NONBLOCK; + + /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ + group = fsnotify_alloc_group(&fanotify_fsnotify_ops); + if (IS_ERR(group)) { + free_uid(user); + return PTR_ERR(group); + } + + group->fanotify_data.user = user; + atomic_inc(&user->fanotify_listeners); + + oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL); + if (unlikely(!oevent)) { + fd = -ENOMEM; + goto out_destroy_group; + } + group->overflow_event = &oevent->fse; + + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; + group->fanotify_data.f_flags = event_f_flags; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + spin_lock_init(&group->fanotify_data.access_lock); + init_waitqueue_head(&group->fanotify_data.access_waitq); + INIT_LIST_HEAD(&group->fanotify_data.access_list); + atomic_set(&group->fanotify_data.bypass_perm, 0); +#endif + switch (flags & FAN_ALL_CLASS_BITS) { + case FAN_CLASS_NOTIF: + group->priority = FS_PRIO_0; + break; + case FAN_CLASS_CONTENT: + group->priority = FS_PRIO_1; + break; + case FAN_CLASS_PRE_CONTENT: + group->priority = FS_PRIO_2; + break; + default: + fd = -EINVAL; + goto out_destroy_group; + } + + if (flags & FAN_UNLIMITED_QUEUE) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_destroy_group; + group->max_events = UINT_MAX; + } else { + group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; + } + + if (flags & FAN_UNLIMITED_MARKS) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_destroy_group; + group->fanotify_data.max_marks = UINT_MAX; + } else { + group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; + } + + fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); + if (fd < 0) + goto out_destroy_group; + + return fd; + +out_destroy_group: + fsnotify_destroy_group(group); + return fd; +} + +SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, + __u64, mask, int, dfd, + const char __user *, pathname) +{ + struct inode *inode = NULL; + struct vfsmount *mnt = NULL; + struct fsnotify_group *group; + struct fd f; + struct path path; + int ret; + + pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", + __func__, fanotify_fd, flags, dfd, pathname, mask); + + /* we only use the lower 32 bits as of right now. */ + if (mask & ((__u64)0xffffffff << 32)) + return -EINVAL; + + if (flags & ~FAN_ALL_MARK_FLAGS) + return -EINVAL; + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + case FAN_MARK_ADD: /* fallthrough */ + case FAN_MARK_REMOVE: + if (!mask) + return -EINVAL; + break; + case FAN_MARK_FLUSH: + if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (mask & FAN_ONDIR) { + flags |= FAN_MARK_ONDIR; + mask &= ~FAN_ONDIR; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) +#else + if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD)) +#endif + return -EINVAL; + + f = fdget(fanotify_fd); + if (unlikely(!f.file)) + return -EBADF; + + /* verify that this is indeed an fanotify instance */ + ret = -EINVAL; + if (unlikely(f.file->f_op != &fanotify_fops)) + goto fput_and_out; + group = f.file->private_data; + + /* + * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not + * allowed to set permissions events. + */ + ret = -EINVAL; + if (mask & FAN_ALL_PERM_EVENTS && + group->priority == FS_PRIO_0) + goto fput_and_out; + + if (flags & FAN_MARK_FLUSH) { + ret = 0; + if (flags & FAN_MARK_MOUNT) + fsnotify_clear_vfsmount_marks_by_group(group); + else + fsnotify_clear_inode_marks_by_group(group); + goto fput_and_out; + } + + ret = fanotify_find_path(dfd, pathname, &path, flags); + if (ret) + goto fput_and_out; + + /* inode held in place by reference to path; group by fget on fd */ + if (!(flags & FAN_MARK_MOUNT)) + inode = path.dentry->d_inode; + else + mnt = path.mnt; + + /* create/update an inode mark */ + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { + case FAN_MARK_ADD: + if (flags & FAN_MARK_MOUNT) + ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); + else + ret = fanotify_add_inode_mark(group, inode, mask, flags); + break; + case FAN_MARK_REMOVE: + if (flags & FAN_MARK_MOUNT) + ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); + else + ret = fanotify_remove_inode_mark(group, inode, mask, flags); + break; + default: + ret = -EINVAL; + } + + path_put(&path); +fput_and_out: + fdput(f); + return ret; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(fanotify_mark, + int, fanotify_fd, unsigned int, flags, + __u32, mask0, __u32, mask1, int, dfd, + const char __user *, pathname) +{ + return sys_fanotify_mark(fanotify_fd, flags, +#ifdef __BIG_ENDIAN + ((__u64)mask0 << 32) | mask1, +#else + ((__u64)mask1 << 32) | mask0, +#endif + dfd, pathname); +} +#endif + +/* + * fanotify_user_setup - Our initialization function. Note that we cannot return + * error because we have compiled-in VFS hooks. So an (unlikely) failure here + * must result in panic(). + */ +static int __init fanotify_user_setup(void) +{ + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info, + SLAB_PANIC); +#endif + + return 0; +} +device_initcall(fanotify_user_setup); diff --git a/kernel/fs/notify/fdinfo.c b/kernel/fs/notify/fdinfo.c new file mode 100644 index 000000000..58b7cdb63 --- /dev/null +++ b/kernel/fs/notify/fdinfo.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "inotify/inotify.h" +#include "../fs/mount.h" + +#if defined(CONFIG_PROC_FS) + +#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY) + +static void show_fdinfo(struct seq_file *m, struct file *f, + void (*show)(struct seq_file *m, + struct fsnotify_mark *mark)) +{ + struct fsnotify_group *group = f->private_data; + struct fsnotify_mark *mark; + + mutex_lock(&group->mark_mutex); + list_for_each_entry(mark, &group->marks_list, g_list) { + show(m, mark); + if (seq_has_overflowed(m)) + break; + } + mutex_unlock(&group->mark_mutex); +} + +#if defined(CONFIG_EXPORTFS) +static void show_mark_fhandle(struct seq_file *m, struct inode *inode) +{ + struct { + struct file_handle handle; + u8 pad[MAX_HANDLE_SZ]; + } f; + int size, ret, i; + + f.handle.handle_bytes = sizeof(f.pad); + size = f.handle.handle_bytes >> 2; + + ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); + if ((ret == FILEID_INVALID) || (ret < 0)) { + WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + return; + } + + f.handle.handle_type = ret; + f.handle.handle_bytes = size * sizeof(u32); + + seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:", + f.handle.handle_bytes, f.handle.handle_type); + + for (i = 0; i < f.handle.handle_bytes; i++) + seq_printf(m, "%02x", (int)f.handle.f_handle[i]); +} +#else +static void show_mark_fhandle(struct seq_file *m, struct inode *inode) +{ +} +#endif + +#ifdef CONFIG_INOTIFY_USER + +static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +{ + struct inotify_inode_mark *inode_mark; + struct inode *inode; + + if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) + return; + + inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); + inode = igrab(mark->inode); + if (inode) { + seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", + inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, + mark->mask, mark->ignored_mask); + show_mark_fhandle(m, inode); + seq_putc(m, '\n'); + iput(inode); + } +} + +void inotify_show_fdinfo(struct seq_file *m, struct file *f) +{ + show_fdinfo(m, f, inotify_fdinfo); +} + +#endif /* CONFIG_INOTIFY_USER */ + +#ifdef CONFIG_FANOTIFY + +static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +{ + unsigned int mflags = 0; + struct inode *inode; + + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) + return; + + if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + mflags |= FAN_MARK_IGNORED_SURV_MODIFY; + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { + inode = igrab(mark->inode); + if (!inode) + return; + seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", + inode->i_ino, inode->i_sb->s_dev, + mflags, mark->mask, mark->ignored_mask); + show_mark_fhandle(m, inode); + seq_putc(m, '\n'); + iput(inode); + } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { + struct mount *mnt = real_mount(mark->mnt); + + seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", + mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + } +} + +void fanotify_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct fsnotify_group *group = f->private_data; + unsigned int flags = 0; + + switch (group->priority) { + case FS_PRIO_0: + flags |= FAN_CLASS_NOTIF; + break; + case FS_PRIO_1: + flags |= FAN_CLASS_CONTENT; + break; + case FS_PRIO_2: + flags |= FAN_CLASS_PRE_CONTENT; + break; + } + + if (group->max_events == UINT_MAX) + flags |= FAN_UNLIMITED_QUEUE; + + if (group->fanotify_data.max_marks == UINT_MAX) + flags |= FAN_UNLIMITED_MARKS; + + seq_printf(m, "fanotify flags:%x event-flags:%x\n", + flags, group->fanotify_data.f_flags); + + show_fdinfo(m, f, fanotify_fdinfo); +} + +#endif /* CONFIG_FANOTIFY */ + +#endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */ + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/fs/notify/fdinfo.h b/kernel/fs/notify/fdinfo.h new file mode 100644 index 000000000..9664c4904 --- /dev/null +++ b/kernel/fs/notify/fdinfo.h @@ -0,0 +1,27 @@ +#ifndef __FSNOTIFY_FDINFO_H__ +#define __FSNOTIFY_FDINFO_H__ + +#include +#include + +struct seq_file; +struct file; + +#ifdef CONFIG_PROC_FS + +#ifdef CONFIG_INOTIFY_USER +void inotify_show_fdinfo(struct seq_file *m, struct file *f); +#endif + +#ifdef CONFIG_FANOTIFY +void fanotify_show_fdinfo(struct seq_file *m, struct file *f); +#endif + +#else /* CONFIG_PROC_FS */ + +#define inotify_show_fdinfo NULL +#define fanotify_show_fdinfo NULL + +#endif /* CONFIG_PROC_FS */ + +#endif /* __FSNOTIFY_FDINFO_H__ */ diff --git a/kernel/fs/notify/fsnotify.c b/kernel/fs/notify/fsnotify.c new file mode 100644 index 000000000..dd3fb0b17 --- /dev/null +++ b/kernel/fs/notify/fsnotify.c @@ -0,0 +1,299 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" +#include "../mount.h" + +/* + * Clear all of the marks on an inode when it is being evicted from core + */ +void __fsnotify_inode_delete(struct inode *inode) +{ + fsnotify_clear_marks_by_inode(inode); +} +EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); + +void __fsnotify_vfsmount_delete(struct vfsmount *mnt) +{ + fsnotify_clear_marks_by_mount(mnt); +} + +/* + * Given an inode, first check if we care what happens to our children. Inotify + * and dnotify both tell their parents about events. If we care about any event + * on a child we run all of our children and set a dentry flag saying that the + * parent cares. Thus when an event happens on a child it can quickly tell if + * if there is a need to find a parent and send the event to the parent. + */ +void __fsnotify_update_child_dentry_flags(struct inode *inode) +{ + struct dentry *alias; + int watched; + + if (!S_ISDIR(inode->i_mode)) + return; + + /* determine if the children should tell inode about their events */ + watched = fsnotify_inode_watches_children(inode); + + spin_lock(&inode->i_lock); + /* run all of the dentries associated with this inode. Since this is a + * directory, there damn well better only be one item on this list */ + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + struct dentry *child; + + /* run all of the children of the original inode and fix their + * d_flags to indicate parental interest (their parent is the + * original inode) */ + spin_lock(&alias->d_lock); + list_for_each_entry(child, &alias->d_subdirs, d_child) { + if (!child->d_inode) + continue; + + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (watched) + child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; + else + child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; + spin_unlock(&child->d_lock); + } + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); +} + +/* Notify this dentry's parent about a child's events. */ +int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +{ + struct dentry *parent; + struct inode *p_inode; + int ret = 0; + + if (!dentry) + dentry = path->dentry; + + if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) + return 0; + + parent = dget_parent(dentry); + p_inode = parent->d_inode; + + if (unlikely(!fsnotify_inode_watches_children(p_inode))) + __fsnotify_update_child_dentry_flags(p_inode); + else if (p_inode->i_fsnotify_mask & mask) { + /* we are notifying a parent so come up with the new mask which + * specifies these are events which came from a child. */ + mask |= FS_EVENT_ON_CHILD; + + if (path) + ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, + dentry->d_name.name, 0); + else + ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, + dentry->d_name.name, 0); + } + + dput(parent); + + return ret; +} +EXPORT_SYMBOL_GPL(__fsnotify_parent); + +static int send_to_group(struct inode *to_tell, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, + int data_is, u32 cookie, + const unsigned char *file_name) +{ + struct fsnotify_group *group = NULL; + __u32 inode_test_mask = 0; + __u32 vfsmount_test_mask = 0; + + if (unlikely(!inode_mark && !vfsmount_mark)) { + BUG(); + return 0; + } + + /* clear ignored on inode modification */ + if (mask & FS_MODIFY) { + if (inode_mark && + !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) + inode_mark->ignored_mask = 0; + if (vfsmount_mark && + !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) + vfsmount_mark->ignored_mask = 0; + } + + /* does the inode mark tell us to do something? */ + if (inode_mark) { + group = inode_mark->group; + inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); + inode_test_mask &= inode_mark->mask; + inode_test_mask &= ~inode_mark->ignored_mask; + } + + /* does the vfsmount_mark tell us to do something? */ + if (vfsmount_mark) { + vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); + group = vfsmount_mark->group; + vfsmount_test_mask &= vfsmount_mark->mask; + vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; + if (inode_mark) + vfsmount_test_mask &= ~inode_mark->ignored_mask; + } + + pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" + " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" + " data=%p data_is=%d cookie=%d\n", + __func__, group, to_tell, mask, inode_mark, + inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, + data_is, cookie); + + if (!inode_test_mask && !vfsmount_test_mask) + return 0; + + return group->ops->handle_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, data_is, + file_name, cookie); +} + +/* + * This is the main call to fsnotify. The VFS calls into hook specific functions + * in linux/fsnotify.h. Those functions then in turn call here. Here will call + * out to all of the registered fsnotify_group. Those groups can then use the + * notification event in whatever means they feel necessary. + */ +int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, + const unsigned char *file_name, u32 cookie) +{ + struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; + struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; + struct fsnotify_group *inode_group, *vfsmount_group; + struct mount *mnt; + int idx, ret = 0; + /* global tests shouldn't care about events on child only the specific event */ + __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + + if (data_is == FSNOTIFY_EVENT_PATH) + mnt = real_mount(((struct path *)data)->mnt); + else + mnt = NULL; + + /* + * if this is a modify event we may need to clear the ignored masks + * otherwise return if neither the inode nor the vfsmount care about + * this type of event. + */ + if (!(mask & FS_MODIFY) && + !(test_mask & to_tell->i_fsnotify_mask) && + !(mnt && test_mask & mnt->mnt_fsnotify_mask)) + return 0; + + idx = srcu_read_lock(&fsnotify_mark_srcu); + + if ((mask & FS_MODIFY) || + (test_mask & to_tell->i_fsnotify_mask)) + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); + + if (mnt && ((mask & FS_MODIFY) || + (test_mask & mnt->mnt_fsnotify_mask))) { + vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, + &fsnotify_mark_srcu); + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); + } + + /* + * We need to merge inode & vfsmount mark lists so that inode mark + * ignore masks are properly reflected for mount mark notifications. + * That's why this traversal is so complicated... + */ + while (inode_node || vfsmount_node) { + inode_group = NULL; + inode_mark = NULL; + vfsmount_group = NULL; + vfsmount_mark = NULL; + + if (inode_node) { + inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), + struct fsnotify_mark, obj_list); + inode_group = inode_mark->group; + } + + if (vfsmount_node) { + vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), + struct fsnotify_mark, obj_list); + vfsmount_group = vfsmount_mark->group; + } + + if (inode_group && vfsmount_group) { + int cmp = fsnotify_compare_groups(inode_group, + vfsmount_group); + if (cmp > 0) { + inode_group = NULL; + inode_mark = NULL; + } else if (cmp < 0) { + vfsmount_group = NULL; + vfsmount_mark = NULL; + } + } + ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, + data, data_is, cookie, file_name); + + if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) + goto out; + + if (inode_group) + inode_node = srcu_dereference(inode_node->next, + &fsnotify_mark_srcu); + if (vfsmount_group) + vfsmount_node = srcu_dereference(vfsmount_node->next, + &fsnotify_mark_srcu); + } + ret = 0; +out: + srcu_read_unlock(&fsnotify_mark_srcu, idx); + + return ret; +} +EXPORT_SYMBOL_GPL(fsnotify); + +static __init int fsnotify_init(void) +{ + int ret; + + BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23); + + ret = init_srcu_struct(&fsnotify_mark_srcu); + if (ret) + panic("initializing fsnotify_mark_srcu"); + + return 0; +} +core_initcall(fsnotify_init); diff --git a/kernel/fs/notify/fsnotify.h b/kernel/fs/notify/fsnotify.h new file mode 100644 index 000000000..13a00be51 --- /dev/null +++ b/kernel/fs/notify/fsnotify.h @@ -0,0 +1,60 @@ +#ifndef __FS_NOTIFY_FSNOTIFY_H_ +#define __FS_NOTIFY_FSNOTIFY_H_ + +#include +#include +#include +#include + +/* destroy all events sitting in this groups notification queue */ +extern void fsnotify_flush_notify(struct fsnotify_group *group); + +/* protects reads of inode and vfsmount marks list */ +extern struct srcu_struct fsnotify_mark_srcu; + +/* Calculate mask of events for a list of marks */ +extern u32 fsnotify_recalc_mask(struct hlist_head *head); + +/* compare two groups for sorting of marks lists */ +extern int fsnotify_compare_groups(struct fsnotify_group *a, + struct fsnotify_group *b); + +extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, + __u32 mask); +/* Add mark to a proper place in mark list */ +extern int fsnotify_add_mark_list(struct hlist_head *head, + struct fsnotify_mark *mark, + int allow_dups); +/* add a mark to an inode */ +extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + int allow_dups); +/* add a mark to a vfsmount */ +extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct vfsmount *mnt, + int allow_dups); + +/* vfsmount specific destruction of a mark */ +extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); +/* inode specific destruction of a mark */ +extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); +/* Destroy all marks in the given list */ +extern void fsnotify_destroy_marks(struct list_head *to_free); +/* Find mark belonging to given group in the list of marks */ +extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, + struct fsnotify_group *group); +/* run the list of all marks associated with inode and flag them to be freed */ +extern void fsnotify_clear_marks_by_inode(struct inode *inode); +/* run the list of all marks associated with vfsmount and flag them to be freed */ +extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); +/* + * update the dentry->d_flags of all of inode's children to indicate if inode cares + * about events that happen to its children. + */ +extern void __fsnotify_update_child_dentry_flags(struct inode *inode); + +/* allocate and destroy and event holder to attach events to notification/access queues */ +extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); +extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); + +#endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/kernel/fs/notify/group.c b/kernel/fs/notify/group.c new file mode 100644 index 000000000..d16b62cb2 --- /dev/null +++ b/kernel/fs/notify/group.c @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +#include + +/* + * Final freeing of a group + */ +static void fsnotify_final_destroy_group(struct fsnotify_group *group) +{ + if (group->ops->free_group_priv) + group->ops->free_group_priv(group); + + kfree(group); +} + +/* + * Trying to get rid of a group. Remove all marks, flush all events and release + * the group reference. + * Note that another thread calling fsnotify_clear_marks_by_group() may still + * hold a ref to the group. + */ +void fsnotify_destroy_group(struct fsnotify_group *group) +{ + /* clear all inode marks for this group */ + fsnotify_clear_marks_by_group(group); + + synchronize_srcu(&fsnotify_mark_srcu); + + /* clear the notification queue of all events */ + fsnotify_flush_notify(group); + + /* + * Destroy overflow event (we cannot use fsnotify_destroy_event() as + * that deliberately ignores overflow events. + */ + if (group->overflow_event) + group->ops->free_event(group->overflow_event); + + fsnotify_put_group(group); +} + +/* + * Get reference to a group. + */ +void fsnotify_get_group(struct fsnotify_group *group) +{ + atomic_inc(&group->refcnt); +} + +/* + * Drop a reference to a group. Free it if it's through. + */ +void fsnotify_put_group(struct fsnotify_group *group) +{ + if (atomic_dec_and_test(&group->refcnt)) + fsnotify_final_destroy_group(group); +} + +/* + * Create a new fsnotify_group and hold a reference for the group returned. + */ +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +{ + struct fsnotify_group *group; + + group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + /* set to 0 when there a no external references to this group */ + atomic_set(&group->refcnt, 1); + atomic_set(&group->num_marks, 0); + + mutex_init(&group->notification_mutex); + INIT_LIST_HEAD(&group->notification_list); + init_waitqueue_head(&group->notification_waitq); + group->max_events = UINT_MAX; + + mutex_init(&group->mark_mutex); + INIT_LIST_HEAD(&group->marks_list); + + group->ops = ops; + + return group; +} + +int fsnotify_fasync(int fd, struct file *file, int on) +{ + struct fsnotify_group *group = file->private_data; + + return fasync_helper(fd, file, on, &group->fsn_fa) >= 0 ? 0 : -EIO; +} diff --git a/kernel/fs/notify/inode_mark.c b/kernel/fs/notify/inode_mark.c new file mode 100644 index 000000000..3daf513ee --- /dev/null +++ b/kernel/fs/notify/inode_mark.c @@ -0,0 +1,247 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +#include "../internal.h" + +/* + * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types + * any notifier is interested in hearing for this inode. + */ +void fsnotify_recalc_inode_mask(struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); + spin_unlock(&inode->i_lock); + + __fsnotify_update_child_dentry_flags(inode); +} + +void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) +{ + struct inode *inode = mark->inode; + + BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); + assert_spin_locked(&mark->lock); + + spin_lock(&inode->i_lock); + + hlist_del_init_rcu(&mark->obj_list); + mark->inode = NULL; + + /* + * this mark is now off the inode->i_fsnotify_marks list and we + * hold the inode->i_lock, so this is the perfect time to update the + * inode->i_fsnotify_mask + */ + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); + spin_unlock(&inode->i_lock); +} + +/* + * Given an inode, destroy all of the marks associated with that inode. + */ +void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + struct fsnotify_mark *mark; + struct hlist_node *n; + LIST_HEAD(free_list); + + spin_lock(&inode->i_lock); + hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { + list_add(&mark->free_list, &free_list); + hlist_del_init_rcu(&mark->obj_list); + fsnotify_get_mark(mark); + } + spin_unlock(&inode->i_lock); + + fsnotify_destroy_marks(&free_list); +} + +/* + * Given a group clear all of the inode marks associated with that group. + */ +void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); +} + +/* + * given a group and inode, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, + struct inode *inode) +{ + struct fsnotify_mark *mark; + + spin_lock(&inode->i_lock); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); + spin_unlock(&inode->i_lock); + + return mark; +} + +/* + * If we are setting a mark mask on an inode mark we should pin the inode + * in memory. + */ +void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, + __u32 mask) +{ + struct inode *inode; + + assert_spin_locked(&mark->lock); + + if (mask && + mark->inode && + !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { + mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; + inode = igrab(mark->inode); + /* + * we shouldn't be able to get here if the inode wasn't + * already safely held in memory. But bug in case it + * ever is wrong. + */ + BUG_ON(!inode); + } +} + +/* + * Attach an initialized mark to a given inode. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which group and for which inodes. These + * marks are ordered according to priority, highest number first, and then by + * the group's location in memory. + */ +int fsnotify_add_inode_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + int allow_dups) +{ + int ret; + + mark->flags |= FSNOTIFY_MARK_FLAG_INODE; + + BUG_ON(!mutex_is_locked(&group->mark_mutex)); + assert_spin_locked(&mark->lock); + + spin_lock(&inode->i_lock); + mark->inode = inode; + ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, + allow_dups); + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); + spin_unlock(&inode->i_lock); + + return ret; +} + +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @list: list of inodes being unmounted (sb->s_inodes) + * + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. + */ +void fsnotify_unmount_inodes(struct list_head *list) +{ + struct inode *inode, *next_i, *need_iput = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + struct inode *need_iput_tmp; + + /* + * We cannot __iget() an inode in state I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } + + need_iput_tmp = need_iput; + need_iput = NULL; + + /* In case fsnotify_inode_delete() drops a reference. */ + if (inode != need_iput_tmp) + __iget(inode); + else + need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); + + /* In case the dropping of a reference would nuke next_i. */ + while (&next_i->i_sb_list != list) { + spin_lock(&next_i->i_lock); + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && + atomic_read(&next_i->i_count)) { + __iget(next_i); + need_iput = next_i; + spin_unlock(&next_i->i_lock); + break; + } + spin_unlock(&next_i->i_lock); + next_i = list_entry(next_i->i_sb_list.next, + struct inode, i_sb_list); + } + + /* + * We can safely drop inode_sb_list_lock here because either + * we actually hold references on both inode and next_i or + * end of list. Also no new inodes will be added since the + * umount has begun. + */ + spin_unlock(&inode_sb_list_lock); + + if (need_iput_tmp) + iput(need_iput_tmp); + + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + + fsnotify_inode_delete(inode); + + iput(inode); + + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); +} diff --git a/kernel/fs/notify/inotify/Kconfig b/kernel/fs/notify/inotify/Kconfig new file mode 100644 index 000000000..b981fc0c8 --- /dev/null +++ b/kernel/fs/notify/inotify/Kconfig @@ -0,0 +1,17 @@ +config INOTIFY_USER + bool "Inotify support for userspace" + select ANON_INODES + select FSNOTIFY + default y + ---help--- + Say Y here to enable inotify support for userspace, including the + associated system calls. Inotify allows monitoring of both files and + directories via a single open fd. Events are read from the file + descriptor, which is also select()- and poll()-able. + Inotify fixes numerous shortcomings in dnotify and introduces several + new features including multiple file events, one-shot support, and + unmount notification. + + For more information, see + + If unsure, say Y. diff --git a/kernel/fs/notify/inotify/Makefile b/kernel/fs/notify/inotify/Makefile new file mode 100644 index 000000000..a380dabe0 --- /dev/null +++ b/kernel/fs/notify/inotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o diff --git a/kernel/fs/notify/inotify/inotify.h b/kernel/fs/notify/inotify/inotify.h new file mode 100644 index 000000000..ed855ef6f --- /dev/null +++ b/kernel/fs/notify/inotify/inotify.h @@ -0,0 +1,32 @@ +#include +#include +#include /* struct kmem_cache */ + +struct inotify_event_info { + struct fsnotify_event fse; + int wd; + u32 sync_cookie; + int name_len; + char name[]; +}; + +struct inotify_inode_mark { + struct fsnotify_mark fsn_mark; + int wd; +}; + +static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct inotify_event_info, fse); +} + +extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, + struct fsnotify_group *group); +extern int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie); + +extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/kernel/fs/notify/inotify/inotify_fsnotify.c b/kernel/fs/notify/inotify/inotify_fsnotify.c new file mode 100644 index 000000000..2cd900c2c --- /dev/null +++ b/kernel/fs/notify/inotify/inotify_fsnotify.c @@ -0,0 +1,184 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include /* d_unlinked */ +#include /* struct inode */ +#include +#include +#include /* struct path */ +#include /* kmem_* */ +#include +#include + +#include "inotify.h" + +/* + * Check if 2 events contain the same information. + */ +static bool event_compare(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) +{ + struct inotify_event_info *old, *new; + + if (old_fsn->mask & FS_IN_IGNORED) + return false; + old = INOTIFY_E(old_fsn); + new = INOTIFY_E(new_fsn); + if ((old_fsn->mask == new_fsn->mask) && + (old_fsn->inode == new_fsn->inode) && + (old->name_len == new->name_len) && + (!old->name_len || !strcmp(old->name, new->name))) + return true; + return false; +} + +static int inotify_merge(struct list_head *list, + struct fsnotify_event *event) +{ + struct fsnotify_event *last_event; + + last_event = list_entry(list->prev, struct fsnotify_event, list); + return event_compare(last_event, event); +} + +int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie) +{ + struct inotify_inode_mark *i_mark; + struct inotify_event_info *event; + struct fsnotify_event *fsn_event; + int ret; + int len = 0; + int alloc_len = sizeof(struct inotify_event_info); + + BUG_ON(vfsmount_mark); + + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return 0; + } + if (file_name) { + len = strlen(file_name); + alloc_len += len + 1; + } + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); + + i_mark = container_of(inode_mark, struct inotify_inode_mark, + fsn_mark); + + event = kmalloc(alloc_len, GFP_KERNEL); + if (unlikely(!event)) + return -ENOMEM; + + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->wd = i_mark->wd; + event->sync_cookie = cookie; + event->name_len = len; + if (len) + strcpy(event->name, file_name); + + ret = fsnotify_add_event(group, fsn_event, inotify_merge); + if (ret) { + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + } + + if (inode_mark->mask & IN_ONESHOT) + fsnotify_destroy_mark(inode_mark, group); + + return 0; +} + +static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) +{ + inotify_ignored_and_remove_idr(fsn_mark, group); +} + +/* + * This is NEVER supposed to be called. Inotify marks should either have been + * removed from the idr when the watch was removed or in the + * fsnotify_destroy_mark_by_group() call when the inotify instance was being + * torn down. This is only called if the idr is about to be freed but there + * are still marks in it. + */ +static int idr_callback(int id, void *p, void *data) +{ + struct fsnotify_mark *fsn_mark; + struct inotify_inode_mark *i_mark; + static bool warned = false; + + if (warned) + return 0; + + warned = true; + fsn_mark = p; + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in " + "idr. Probably leaking memory\n", id, p, data); + + /* + * I'm taking the liberty of assuming that the mark in question is a + * valid address and I'm dereferencing it. This might help to figure + * out why we got here and the panic is no worse than the original + * BUG() that was here. + */ + if (fsn_mark) + printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", + fsn_mark->group, fsn_mark->inode, i_mark->wd); + return 0; +} + +static void inotify_free_group_priv(struct fsnotify_group *group) +{ + /* ideally the idr is empty and we won't hit the BUG in the callback */ + idr_for_each(&group->inotify_data.idr, idr_callback, group); + idr_destroy(&group->inotify_data.idr); + if (group->inotify_data.user) { + atomic_dec(&group->inotify_data.user->inotify_devs); + free_uid(group->inotify_data.user); + } +} + +static void inotify_free_event(struct fsnotify_event *fsn_event) +{ + kfree(INOTIFY_E(fsn_event)); +} + +const struct fsnotify_ops inotify_fsnotify_ops = { + .handle_event = inotify_handle_event, + .free_group_priv = inotify_free_group_priv, + .free_event = inotify_free_event, + .freeing_mark = inotify_freeing_mark, +}; diff --git a/kernel/fs/notify/inotify/inotify_user.c b/kernel/fs/notify/inotify/inotify_user.c new file mode 100644 index 000000000..450648697 --- /dev/null +++ b/kernel/fs/notify/inotify/inotify_user.c @@ -0,0 +1,815 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include /* struct inode */ +#include +#include +#include /* module_init */ +#include +#include /* roundup() */ +#include /* LOOKUP_FOLLOW */ +#include /* struct user */ +#include /* struct kmem_cache */ +#include +#include +#include +#include +#include +#include + +#include "inotify.h" +#include "../fdinfo.h" + +#include + +/* these are configurable via /proc/sys/fs/inotify/ */ +static int inotify_max_user_instances __read_mostly; +static int inotify_max_queued_events __read_mostly; +static int inotify_max_user_watches __read_mostly; + +static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; + +#ifdef CONFIG_SYSCTL + +#include + +static int zero; + +struct ctl_table inotify_table[] = { + { + .procname = "max_user_instances", + .data = &inotify_max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_user_watches", + .data = &inotify_max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_queued_events", + .data = &inotify_max_queued_events, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static inline __u32 inotify_arg_to_mask(u32 arg) +{ + __u32 mask; + + /* + * everything should accept their own ignored, cares about children, + * and should receive events when the inode is unmounted + */ + mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT); + + /* mask off the flags used to open the fd */ + mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + + return mask; +} + +static inline u32 inotify_mask_to_arg(__u32 mask) +{ + return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | + IN_Q_OVERFLOW); +} + +/* intofiy userspace file descriptor functions */ +static unsigned int inotify_poll(struct file *file, poll_table *wait) +{ + struct fsnotify_group *group = file->private_data; + int ret = 0; + + poll_wait(file, &group->notification_waitq, wait); + mutex_lock(&group->notification_mutex); + if (!fsnotify_notify_queue_is_empty(group)) + ret = POLLIN | POLLRDNORM; + mutex_unlock(&group->notification_mutex); + + return ret; +} + +static int round_event_name_len(struct fsnotify_event *fsn_event) +{ + struct inotify_event_info *event; + + event = INOTIFY_E(fsn_event); + if (!event->name_len) + return 0; + return roundup(event->name_len + 1, sizeof(struct inotify_event)); +} + +/* + * Get an inotify_kernel_event if one exists and is small + * enough to fit in "count". Return an error pointer if + * not large enough. + * + * Called with the group->notification_mutex held. + */ +static struct fsnotify_event *get_one_event(struct fsnotify_group *group, + size_t count) +{ + size_t event_size = sizeof(struct inotify_event); + struct fsnotify_event *event; + + if (fsnotify_notify_queue_is_empty(group)) + return NULL; + + event = fsnotify_peek_first_event(group); + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + event_size += round_event_name_len(event); + if (event_size > count) + return ERR_PTR(-EINVAL); + + /* held the notification_mutex the whole time, so this is the + * same event we peeked above */ + fsnotify_remove_first_event(group); + + return event; +} + +/* + * Copy an event to user space, returning how much we copied. + * + * We already checked that the event size is smaller than the + * buffer we had in "get_one_event()" above. + */ +static ssize_t copy_event_to_user(struct fsnotify_group *group, + struct fsnotify_event *fsn_event, + char __user *buf) +{ + struct inotify_event inotify_event; + struct inotify_event_info *event; + size_t event_size = sizeof(struct inotify_event); + size_t name_len; + size_t pad_name_len; + + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + + event = INOTIFY_E(fsn_event); + name_len = event->name_len; + /* + * round up name length so it is a multiple of event_size + * plus an extra byte for the terminating '\0'. + */ + pad_name_len = round_event_name_len(fsn_event); + inotify_event.len = pad_name_len; + inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.wd = event->wd; + inotify_event.cookie = event->sync_cookie; + + /* send the main event */ + if (copy_to_user(buf, &inotify_event, event_size)) + return -EFAULT; + + buf += event_size; + + /* + * fsnotify only stores the pathname, so here we have to send the pathname + * and then pad that pathname out to a multiple of sizeof(inotify_event) + * with zeros. + */ + if (pad_name_len) { + /* copy the path name */ + if (copy_to_user(buf, event->name, name_len)) + return -EFAULT; + buf += name_len; + + /* fill userspace with 0's */ + if (clear_user(buf, pad_name_len - name_len)) + return -EFAULT; + event_size += pad_name_len; + } + + return event_size; +} + +static ssize_t inotify_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct fsnotify_group *group; + struct fsnotify_event *kevent; + char __user *start; + int ret; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + start = buf; + group = file->private_data; + + add_wait_queue(&group->notification_waitq, &wait); + while (1) { + mutex_lock(&group->notification_mutex); + kevent = get_one_event(group, count); + mutex_unlock(&group->notification_mutex); + + pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); + + if (kevent) { + ret = PTR_ERR(kevent); + if (IS_ERR(kevent)) + break; + ret = copy_event_to_user(group, kevent, buf); + fsnotify_destroy_event(group, kevent); + if (ret < 0) + break; + buf += ret; + count -= ret; + continue; + } + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + break; + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + if (start != buf) + break; + + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + remove_wait_queue(&group->notification_waitq, &wait); + + if (start != buf && ret != -EFAULT) + ret = buf - start; + return ret; +} + +static int inotify_release(struct inode *ignored, struct file *file) +{ + struct fsnotify_group *group = file->private_data; + + pr_debug("%s: group=%p\n", __func__, group); + + /* free this group, matching get was inotify_init->fsnotify_obtain_group */ + fsnotify_destroy_group(group); + + return 0; +} + +static long inotify_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fsnotify_group *group; + struct fsnotify_event *fsn_event; + void __user *p; + int ret = -ENOTTY; + size_t send_len = 0; + + group = file->private_data; + p = (void __user *) arg; + + pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd); + + switch (cmd) { + case FIONREAD: + mutex_lock(&group->notification_mutex); + list_for_each_entry(fsn_event, &group->notification_list, + list) { + send_len += sizeof(struct inotify_event); + send_len += round_event_name_len(fsn_event); + } + mutex_unlock(&group->notification_mutex); + ret = put_user(send_len, (int __user *) p); + break; + } + + return ret; +} + +static const struct file_operations inotify_fops = { + .show_fdinfo = inotify_show_fdinfo, + .poll = inotify_poll, + .read = inotify_read, + .fasync = fsnotify_fasync, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, + .compat_ioctl = inotify_ioctl, + .llseek = noop_llseek, +}; + + +/* + * find_inode - resolve a user-given path to a specific inode + */ +static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) +{ + int error; + + error = user_path_at(AT_FDCWD, dirname, flags, path); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = inode_permission(path->dentry->d_inode, MAY_READ); + if (error) + path_put(path); + return error; +} + +static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, + struct inotify_inode_mark *i_mark) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock(idr_lock); + + ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT); + if (ret >= 0) { + /* we added the mark to the idr, take a reference */ + i_mark->wd = ret; + fsnotify_get_mark(&i_mark->fsn_mark); + } + + spin_unlock(idr_lock); + idr_preload_end(); + return ret < 0 ? ret : 0; +} + +static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, + int wd) +{ + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + struct inotify_inode_mark *i_mark; + + assert_spin_locked(idr_lock); + + i_mark = idr_find(idr, wd); + if (i_mark) { + struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark; + + fsnotify_get_mark(fsn_mark); + /* One ref for being in the idr, one ref we just took */ + BUG_ON(atomic_read(&fsn_mark->refcnt) < 2); + } + + return i_mark; +} + +static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, + int wd) +{ + struct inotify_inode_mark *i_mark; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + + spin_lock(idr_lock); + i_mark = inotify_idr_find_locked(group, wd); + spin_unlock(idr_lock); + + return i_mark; +} + +static void do_inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark *i_mark) +{ + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + int wd = i_mark->wd; + + assert_spin_locked(idr_lock); + + idr_remove(idr, wd); + + /* removed from the idr, drop that ref */ + fsnotify_put_mark(&i_mark->fsn_mark); +} + +/* + * Remove the mark from the idr (if present) and drop the reference + * on the mark because it was in the idr. + */ +static void inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark *i_mark) +{ + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + struct inotify_inode_mark *found_i_mark = NULL; + int wd; + + spin_lock(idr_lock); + wd = i_mark->wd; + + /* + * does this i_mark think it is in the idr? we shouldn't get called + * if it wasn't.... + */ + if (wd == -1) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + goto out; + } + + /* Lets look in the idr to see if we find it */ + found_i_mark = inotify_idr_find_locked(group, wd); + if (unlikely(!found_i_mark)) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + goto out; + } + + /* + * We found an mark in the idr at the right wd, but it's + * not the mark we were told to remove. eparis seriously + * fucked up somewhere. + */ + if (unlikely(found_i_mark != i_mark)) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " + "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " + "found_i_mark->group=%p found_i_mark->inode=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, + i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd, + found_i_mark->fsn_mark.group, + found_i_mark->fsn_mark.inode); + goto out; + } + + /* + * One ref for being in the idr + * one ref held by the caller trying to kill us + * one ref grabbed by inotify_idr_find + */ + if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { + printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + /* we can't really recover with bad ref cnting.. */ + BUG(); + } + + do_inotify_remove_from_idr(group, i_mark); +out: + /* match the ref taken by inotify_idr_find_locked() */ + if (found_i_mark) + fsnotify_put_mark(&found_i_mark->fsn_mark); + i_mark->wd = -1; + spin_unlock(idr_lock); +} + +/* + * Send IN_IGNORED for this wd, remove this wd from the idr. + */ +void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, + struct fsnotify_group *group) +{ + struct inotify_inode_mark *i_mark; + + /* Queue ignore event for the watch */ + inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, + NULL, FSNOTIFY_EVENT_NONE, NULL, 0); + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + /* remove this mark from the idr */ + inotify_remove_from_idr(group, i_mark); + + atomic_dec(&group->inotify_data.user->inotify_watches); +} + +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct inotify_inode_mark *i_mark; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + kmem_cache_free(inotify_inode_mark_cachep, i_mark); +} + +static int inotify_update_existing_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct fsnotify_mark *fsn_mark; + struct inotify_inode_mark *i_mark; + __u32 old_mask, new_mask; + __u32 mask; + int add = (arg & IN_MASK_ADD); + int ret; + + mask = inotify_arg_to_mask(arg); + + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) + return -ENOENT; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + spin_lock(&fsn_mark->lock); + + old_mask = fsn_mark->mask; + if (add) + fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask)); + else + fsnotify_set_mark_mask_locked(fsn_mark, mask); + new_mask = fsn_mark->mask; + + spin_unlock(&fsn_mark->lock); + + if (old_mask != new_mask) { + /* more bits in old than in new? */ + int dropped = (old_mask & ~new_mask); + /* more bits in this fsn_mark than the inode's mask? */ + int do_inode = (new_mask & ~inode->i_fsnotify_mask); + + /* update the inode with this new fsn_mark */ + if (dropped || do_inode) + fsnotify_recalc_inode_mask(inode); + + } + + /* return the wd */ + ret = i_mark->wd; + + /* match the get from fsnotify_find_mark() */ + fsnotify_put_mark(fsn_mark); + + return ret; +} + +static int inotify_new_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct inotify_inode_mark *tmp_i_mark; + __u32 mask; + int ret; + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + + mask = inotify_arg_to_mask(arg); + + tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_i_mark)) + return -ENOMEM; + + fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark); + tmp_i_mark->fsn_mark.mask = mask; + tmp_i_mark->wd = -1; + + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) + goto out_err; + + ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); + if (ret) + goto out_err; + + /* we are on the idr, now get on the inode */ + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, + NULL, 0); + if (ret) { + /* we failed to get on the inode, get off the idr */ + inotify_remove_from_idr(group, tmp_i_mark); + goto out_err; + } + + /* increment the number of watches the user has */ + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* return the watch descriptor for this new mark */ + ret = tmp_i_mark->wd; + +out_err: + /* match the ref from fsnotify_init_mark() */ + fsnotify_put_mark(&tmp_i_mark->fsn_mark); + + return ret; +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + int ret = 0; + + mutex_lock(&group->mark_mutex); + /* try to update and existing watch with the new arg */ + ret = inotify_update_existing_watch(group, inode, arg); + /* no mark present, try to add a new one */ + if (ret == -ENOENT) + ret = inotify_new_watch(group, inode, arg); + mutex_unlock(&group->mark_mutex); + + return ret; +} + +static struct fsnotify_group *inotify_new_group(unsigned int max_events) +{ + struct fsnotify_group *group; + struct inotify_event_info *oevent; + + group = fsnotify_alloc_group(&inotify_fsnotify_ops); + if (IS_ERR(group)) + return group; + + oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL); + if (unlikely(!oevent)) { + fsnotify_destroy_group(group); + return ERR_PTR(-ENOMEM); + } + group->overflow_event = &oevent->fse; + fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); + oevent->wd = -1; + oevent->sync_cookie = 0; + oevent->name_len = 0; + + group->max_events = max_events; + + spin_lock_init(&group->inotify_data.idr_lock); + idr_init(&group->inotify_data.idr); + group->inotify_data.user = get_current_user(); + + if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > + inotify_max_user_instances) { + fsnotify_destroy_group(group); + return ERR_PTR(-EMFILE); + } + + return group; +} + + +/* inotify syscalls */ +SYSCALL_DEFINE1(inotify_init1, int, flags) +{ + struct fsnotify_group *group; + int ret; + + /* Check the IN_* constants for consistency. */ + BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); + + if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) + return -EINVAL; + + /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ + group = inotify_new_group(inotify_max_queued_events); + if (IS_ERR(group)) + return PTR_ERR(group); + + ret = anon_inode_getfd("inotify", &inotify_fops, group, + O_RDONLY | flags); + if (ret < 0) + fsnotify_destroy_group(group); + + return ret; +} + +SYSCALL_DEFINE0(inotify_init) +{ + return sys_inotify_init1(0); +} + +SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, + u32, mask) +{ + struct fsnotify_group *group; + struct inode *inode; + struct path path; + struct fd f; + int ret; + unsigned flags = 0; + + /* don't allow invalid bits: we don't want flags set */ + if (unlikely(!(mask & ALL_INOTIFY_BITS))) + return -EINVAL; + + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + if (unlikely(f.file->f_op != &inotify_fops)) { + ret = -EINVAL; + goto fput_and_out; + } + + if (!(mask & IN_DONT_FOLLOW)) + flags |= LOOKUP_FOLLOW; + if (mask & IN_ONLYDIR) + flags |= LOOKUP_DIRECTORY; + + ret = inotify_find_inode(pathname, &path, flags); + if (ret) + goto fput_and_out; + + /* inode held in place by reference to path; group by fget on fd */ + inode = path.dentry->d_inode; + group = f.file->private_data; + + /* create/update an inode mark */ + ret = inotify_update_watch(group, inode, mask); + path_put(&path); +fput_and_out: + fdput(f); + return ret; +} + +SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) +{ + struct fsnotify_group *group; + struct inotify_inode_mark *i_mark; + struct fd f; + int ret = 0; + + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + ret = -EINVAL; + if (unlikely(f.file->f_op != &inotify_fops)) + goto out; + + group = f.file->private_data; + + ret = -EINVAL; + i_mark = inotify_idr_find(group, wd); + if (unlikely(!i_mark)) + goto out; + + ret = 0; + + fsnotify_destroy_mark(&i_mark->fsn_mark, group); + + /* match ref taken by inotify_idr_find */ + fsnotify_put_mark(&i_mark->fsn_mark); + +out: + fdput(f); + return ret; +} + +/* + * inotify_user_setup - Our initialization function. Note that we cannot return + * error because we have compiled-in VFS hooks. So an (unlikely) failure here + * must result in panic(). + */ +static int __init inotify_user_setup(void) +{ + BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); + BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(IN_OPEN != FS_OPEN); + BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(IN_CREATE != FS_CREATE); + BUILD_BUG_ON(IN_DELETE != FS_DELETE); + BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); + BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); + BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); + BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); + BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); + BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); + + BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); + + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); + + inotify_max_queued_events = 16384; + inotify_max_user_instances = 128; + inotify_max_user_watches = 8192; + + return 0; +} +module_init(inotify_user_setup); diff --git a/kernel/fs/notify/mark.c b/kernel/fs/notify/mark.c new file mode 100644 index 000000000..92e48c70f --- /dev/null +++ b/kernel/fs/notify/mark.c @@ -0,0 +1,494 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * fsnotify inode mark locking/lifetime/and refcnting + * + * REFCNT: + * The group->recnt and mark->refcnt tell how many "things" in the kernel + * currently are referencing the objects. Both kind of objects typically will + * live inside the kernel with a refcnt of 2, one for its creation and one for + * the reference a group and a mark hold to each other. + * If you are holding the appropriate locks, you can take a reference and the + * object itself is guaranteed to survive until the reference is dropped. + * + * LOCKING: + * There are 3 locks involved with fsnotify inode marks and they MUST be taken + * in order as follows: + * + * group->mark_mutex + * mark->lock + * inode->i_lock + * + * group->mark_mutex protects the marks_list anchored inside a given group and + * each mark is hooked via the g_list. It also protects the groups private + * data (i.e group limits). + + * mark->lock protects the marks attributes like its masks and flags. + * Furthermore it protects the access to a reference of the group that the mark + * is assigned to as well as the access to a reference of the inode/vfsmount + * that is being watched by the mark. + * + * inode->i_lock protects the i_fsnotify_marks list anchored inside a + * given inode and each mark is hooked via the i_list. (and sorta the + * free_i_list) + * + * + * LIFETIME: + * Inode marks survive between when they are added to an inode and when their + * refcnt==0. + * + * The inode mark can be cleared for a number of different reasons including: + * - The inode is unlinked for the last time. (fsnotify_inode_remove) + * - The inode is being evicted from cache. (fsnotify_inode_delete) + * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) + * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) + * - The fsnotify_group associated with the mark is going away and all such marks + * need to be cleaned up. (fsnotify_clear_marks_by_group) + * + * Worst case we are given an inode and need to clean up all the marks on that + * inode. We take i_lock and walk the i_fsnotify_marks safely. For each + * mark on the list we take a reference (so the mark can't disappear under us). + * We remove that mark form the inode's list of marks and we add this mark to a + * private list anchored on the stack using i_free_list; we walk i_free_list + * and before we destroy the mark we make sure that we dont race with a + * concurrent destroy_group by getting a ref to the marks group and taking the + * groups mutex. + + * Very similarly for freeing by group, except we use free_g_list. + * + * This has the very interesting property of being able to run concurrently with + * any (or all) other directions. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +struct srcu_struct fsnotify_mark_srcu; +static DEFINE_SPINLOCK(destroy_lock); +static LIST_HEAD(destroy_list); +static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq); + +void fsnotify_get_mark(struct fsnotify_mark *mark) +{ + atomic_inc(&mark->refcnt); +} + +void fsnotify_put_mark(struct fsnotify_mark *mark) +{ + if (atomic_dec_and_test(&mark->refcnt)) { + if (mark->group) + fsnotify_put_group(mark->group); + mark->free_mark(mark); + } +} + +/* Calculate mask of events for a list of marks */ +u32 fsnotify_recalc_mask(struct hlist_head *head) +{ + u32 new_mask = 0; + struct fsnotify_mark *mark; + + hlist_for_each_entry(mark, head, obj_list) + new_mask |= mark->mask; + return new_mask; +} + +/* + * Any time a mark is getting freed we end up here. + * The caller had better be holding a reference to this mark so we don't actually + * do the final put under the mark->lock + */ +void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, + struct fsnotify_group *group) +{ + struct inode *inode = NULL; + + BUG_ON(!mutex_is_locked(&group->mark_mutex)); + + spin_lock(&mark->lock); + + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + spin_unlock(&mark->lock); + return; + } + + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { + inode = mark->inode; + fsnotify_destroy_inode_mark(mark); + } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) + fsnotify_destroy_vfsmount_mark(mark); + else + BUG(); + + list_del_init(&mark->g_list); + + spin_unlock(&mark->lock); + + if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) + iput(inode); + /* release lock temporarily */ + mutex_unlock(&group->mark_mutex); + + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + wake_up(&destroy_waitq); + /* + * We don't necessarily have a ref on mark from caller so the above destroy + * may have actually freed it, unless this group provides a 'freeing_mark' + * function which must be holding a reference. + */ + + /* + * Some groups like to know that marks are being freed. This is a + * callback to the group function to let it know that this mark + * is being freed. + */ + if (group->ops->freeing_mark) + group->ops->freeing_mark(mark, group); + + /* + * __fsnotify_update_child_dentry_flags(inode); + * + * I really want to call that, but we can't, we have no idea if the inode + * still exists the second we drop the mark->lock. + * + * The next time an event arrive to this inode from one of it's children + * __fsnotify_parent will see that the inode doesn't care about it's + * children and will update all of these flags then. So really this + * is just a lazy update (and could be a perf win...) + */ + + atomic_dec(&group->num_marks); + + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); +} + +void fsnotify_destroy_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group) +{ + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + fsnotify_destroy_mark_locked(mark, group); + mutex_unlock(&group->mark_mutex); +} + +/* + * Destroy all marks in the given list. The marks must be already detached from + * the original inode / vfsmount. + */ +void fsnotify_destroy_marks(struct list_head *to_free) +{ + struct fsnotify_mark *mark, *lmark; + struct fsnotify_group *group; + + list_for_each_entry_safe(mark, lmark, to_free, free_list) { + spin_lock(&mark->lock); + fsnotify_get_group(mark->group); + group = mark->group; + spin_unlock(&mark->lock); + + fsnotify_destroy_mark(mark, group); + fsnotify_put_mark(mark); + fsnotify_put_group(group); + } +} + +void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) +{ + assert_spin_locked(&mark->lock); + + mark->mask = mask; + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) + fsnotify_set_inode_mark_mask_locked(mark, mask); +} + +void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) +{ + assert_spin_locked(&mark->lock); + + mark->ignored_mask = mask; +} + +/* + * Sorting function for lists of fsnotify marks. + * + * Fanotify supports different notification classes (reflected as priority of + * notification group). Events shall be passed to notification groups in + * decreasing priority order. To achieve this marks in notification lists for + * inodes and vfsmounts are sorted so that priorities of corresponding groups + * are descending. + * + * Furthermore correct handling of the ignore mask requires processing inode + * and vfsmount marks of each group together. Using the group address as + * further sort criterion provides a unique sorting order and thus we can + * merge inode and vfsmount lists of marks in linear time and find groups + * present in both lists. + * + * A return value of 1 signifies that b has priority over a. + * A return value of 0 signifies that the two marks have to be handled together. + * A return value of -1 signifies that a has priority over b. + */ +int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) +{ + if (a == b) + return 0; + if (!a) + return 1; + if (!b) + return -1; + if (a->priority < b->priority) + return 1; + if (a->priority > b->priority) + return -1; + if (a < b) + return 1; + return -1; +} + +/* Add mark into proper place in given list of marks */ +int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, + int allow_dups) +{ + struct fsnotify_mark *lmark, *last = NULL; + int cmp; + + /* is mark the first mark? */ + if (hlist_empty(head)) { + hlist_add_head_rcu(&mark->obj_list, head); + return 0; + } + + /* should mark be in the middle of the current list? */ + hlist_for_each_entry(lmark, head, obj_list) { + last = lmark; + + if ((lmark->group == mark->group) && !allow_dups) + return -EEXIST; + + cmp = fsnotify_compare_groups(lmark->group, mark->group); + if (cmp >= 0) { + hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); + return 0; + } + } + + BUG_ON(last == NULL); + /* mark should be the last entry. last is the current last entry */ + hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); + return 0; +} + +/* + * Attach an initialized mark to a given group and fs object. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which group. + */ +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + struct vfsmount *mnt, int allow_dups) +{ + int ret = 0; + + BUG_ON(inode && mnt); + BUG_ON(!inode && !mnt); + BUG_ON(!mutex_is_locked(&group->mark_mutex)); + + /* + * LOCKING ORDER!!!! + * group->mark_mutex + * mark->lock + * inode->i_lock + */ + spin_lock(&mark->lock); + mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; + + fsnotify_get_group(group); + mark->group = group; + list_add(&mark->g_list, &group->marks_list); + atomic_inc(&group->num_marks); + fsnotify_get_mark(mark); /* for i_list and g_list */ + + if (inode) { + ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); + if (ret) + goto err; + } else if (mnt) { + ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); + if (ret) + goto err; + } else { + BUG(); + } + + /* this will pin the object if appropriate */ + fsnotify_set_mark_mask_locked(mark, mark->mask); + spin_unlock(&mark->lock); + + if (inode) + __fsnotify_update_child_dentry_flags(inode); + + return ret; +err: + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + list_del_init(&mark->g_list); + fsnotify_put_group(group); + mark->group = NULL; + atomic_dec(&group->num_marks); + + spin_unlock(&mark->lock); + + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + wake_up(&destroy_waitq); + + return ret; +} + +int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, + struct inode *inode, struct vfsmount *mnt, int allow_dups) +{ + int ret; + mutex_lock(&group->mark_mutex); + ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups); + mutex_unlock(&group->mark_mutex); + return ret; +} + +/* + * Given a list of marks, find the mark associated with given group. If found + * take a reference to that mark and return it, else return NULL. + */ +struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, + struct fsnotify_group *group) +{ + struct fsnotify_mark *mark; + + hlist_for_each_entry(mark, head, obj_list) { + if (mark->group == group) { + fsnotify_get_mark(mark); + return mark; + } + } + return NULL; +} + +/* + * clear any marks in a group in which mark->flags & flags is true + */ +void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, + unsigned int flags) +{ + struct fsnotify_mark *lmark, *mark; + + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { + if (mark->flags & flags) { + fsnotify_get_mark(mark); + fsnotify_destroy_mark_locked(mark, group); + fsnotify_put_mark(mark); + } + } + mutex_unlock(&group->mark_mutex); +} + +/* + * Given a group, destroy all of the marks associated with that group. + */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); +} + +void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) +{ + assert_spin_locked(&old->lock); + new->inode = old->inode; + new->mnt = old->mnt; + if (old->group) + fsnotify_get_group(old->group); + new->group = old->group; + new->mask = old->mask; + new->free_mark = old->free_mark; +} + +/* + * Nothing fancy, just initialize lists and locks and counters. + */ +void fsnotify_init_mark(struct fsnotify_mark *mark, + void (*free_mark)(struct fsnotify_mark *mark)) +{ + memset(mark, 0, sizeof(*mark)); + spin_lock_init(&mark->lock); + atomic_set(&mark->refcnt, 1); + mark->free_mark = free_mark; +} + +static int fsnotify_mark_destroy(void *ignored) +{ + struct fsnotify_mark *mark, *next; + struct list_head private_destroy_list; + + for (;;) { + spin_lock(&destroy_lock); + /* exchange the list head */ + list_replace_init(&destroy_list, &private_destroy_list); + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + + list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { + list_del_init(&mark->g_list); + fsnotify_put_mark(mark); + } + + wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list)); + } + + return 0; +} + +static int __init fsnotify_mark_init(void) +{ + struct task_struct *thread; + + thread = kthread_run(fsnotify_mark_destroy, NULL, + "fsnotify_mark"); + if (IS_ERR(thread)) + panic("unable to start fsnotify mark destruction thread."); + + return 0; +} +device_initcall(fsnotify_mark_init); diff --git a/kernel/fs/notify/notification.c b/kernel/fs/notify/notification.c new file mode 100644 index 000000000..a95d8e037 --- /dev/null +++ b/kernel/fs/notify/notification.c @@ -0,0 +1,213 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * Basic idea behind the notification queue: An fsnotify group (like inotify) + * sends the userspace notification about events asynchronously some time after + * the event happened. When inotify gets an event it will need to add that + * event to the group notify queue. Since a single event might need to be on + * multiple group's notification queues we can't add the event directly to each + * queue and instead add a small "event_holder" to each queue. This event_holder + * has a pointer back to the original event. Since the majority of events are + * going to end up on one, and only one, notification queue we embed one + * event_holder into each event. This means we have a single allocation instead + * of always needing two. If the embedded event_holder is already in use by + * another group a new event_holder (from fsnotify_event_holder_cachep) will be + * allocated and used. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); + +/** + * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. + * Called from fsnotify_move, which is inlined into filesystem modules. + */ +u32 fsnotify_get_cookie(void) +{ + return atomic_inc_return(&fsnotify_sync_cookie); +} +EXPORT_SYMBOL_GPL(fsnotify_get_cookie); + +/* return true if the notify queue is empty, false otherwise */ +bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + return list_empty(&group->notification_list) ? true : false; +} + +void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + /* Overflow events are per-group and we don't want to free them */ + if (!event || event->mask == FS_Q_OVERFLOW) + return; + /* If the event is still queued, we have a problem... */ + WARN_ON(!list_empty(&event->list)); + group->ops->free_event(event); +} + +/* + * Add an event to the group notification queue. The group can later pull this + * event off the queue to deal with. The function returns 0 if the event was + * added to the queue, 1 if the event was merged with some other queued event, + * 2 if the queue of events has overflown. + */ +int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) +{ + int ret = 0; + struct list_head *list = &group->notification_list; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + mutex_lock(&group->notification_mutex); + + if (group->q_len >= group->max_events) { + ret = 2; + /* Queue overflow event only if it isn't already queued */ + if (!list_empty(&group->overflow_event->list)) { + mutex_unlock(&group->notification_mutex); + return ret; + } + event = group->overflow_event; + goto queue; + } + + if (!list_empty(list) && merge) { + ret = merge(list, event); + if (ret) { + mutex_unlock(&group->notification_mutex); + return ret; + } + } + +queue: + group->q_len++; + list_add_tail(&event->list, list); + mutex_unlock(&group->notification_mutex); + + wake_up(&group->notification_waitq); + kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); + return ret; +} + +/* + * Remove @event from group's notification queue. It is the responsibility of + * the caller to destroy the event. + */ +void fsnotify_remove_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + mutex_lock(&group->notification_mutex); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + group->q_len--; + } + mutex_unlock(&group->notification_mutex); +} + +/* + * Remove and return the first event from the notification list. It is the + * responsibility of the caller to destroy the obtained event + */ +struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + pr_debug("%s: group=%p\n", __func__, group); + + event = list_first_entry(&group->notification_list, + struct fsnotify_event, list); + /* + * We need to init list head for the case of overflow event so that + * check in fsnotify_add_event() works + */ + list_del_init(&event->list); + group->q_len--; + + return event; +} + +/* + * This will not remove the event, that must be done with + * fsnotify_remove_first_event() + */ +struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); +} + +/* + * Called when a group is being torn down to clean up any outstanding + * event notifications. + */ +void fsnotify_flush_notify(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + event = fsnotify_remove_first_event(group); + fsnotify_destroy_event(group, event); + } + mutex_unlock(&group->notification_mutex); +} + +/* + * fsnotify_create_event - Allocate a new event which will be sent to each + * group's handle_event function if the group was interested in this + * particular event. + * + * @inode the inode which is supposed to receive the event (sometimes a + * parent of the inode to which the event happened. + * @mask what actually happened. + * @data pointer to the object which was actually affected + * @data_type flag indication if the data is a file, path, inode, nothing... + * @name the filename, if available + */ +void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, + u32 mask) +{ + INIT_LIST_HEAD(&event->list); + event->inode = inode; + event->mask = mask; +} diff --git a/kernel/fs/notify/vfsmount_mark.c b/kernel/fs/notify/vfsmount_mark.c new file mode 100644 index 000000000..326b148e6 --- /dev/null +++ b/kernel/fs/notify/vfsmount_mark.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" +#include "../mount.h" + +void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + struct hlist_node *n; + struct mount *m = real_mount(mnt); + LIST_HEAD(free_list); + + spin_lock(&mnt->mnt_root->d_lock); + hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { + list_add(&mark->free_list, &free_list); + hlist_del_init_rcu(&mark->obj_list); + fsnotify_get_mark(mark); + } + spin_unlock(&mnt->mnt_root->d_lock); + + fsnotify_destroy_marks(&free_list); +} + +void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); +} + +/* + * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types + * any notifier is interested in hearing for this mount point + */ +void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) +{ + struct mount *m = real_mount(mnt); + + spin_lock(&mnt->mnt_root->d_lock); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); + spin_unlock(&mnt->mnt_root->d_lock); +} + +void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) +{ + struct vfsmount *mnt = mark->mnt; + struct mount *m = real_mount(mnt); + + BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); + assert_spin_locked(&mark->lock); + + spin_lock(&mnt->mnt_root->d_lock); + + hlist_del_init_rcu(&mark->obj_list); + mark->mnt = NULL; + + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); + spin_unlock(&mnt->mnt_root->d_lock); +} + +/* + * given a group and vfsmount, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt) +{ + struct mount *m = real_mount(mnt); + struct fsnotify_mark *mark; + + spin_lock(&mnt->mnt_root->d_lock); + mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); + spin_unlock(&mnt->mnt_root->d_lock); + + return mark; +} + +/* + * Attach an initialized mark to a given group and vfsmount. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which groups. + */ +int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct vfsmount *mnt, + int allow_dups) +{ + struct mount *m = real_mount(mnt); + int ret; + + mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; + + BUG_ON(!mutex_is_locked(&group->mark_mutex)); + assert_spin_locked(&mark->lock); + + spin_lock(&mnt->mnt_root->d_lock); + mark->mnt = mnt; + ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); + spin_unlock(&mnt->mnt_root->d_lock); + + return ret; +} diff --git a/kernel/fs/nsfs.c b/kernel/fs/nsfs.c new file mode 100644 index 000000000..99521e7c4 --- /dev/null +++ b/kernel/fs/nsfs.c @@ -0,0 +1,161 @@ +#include +#include +#include +#include +#include +#include + +static struct vfsmount *nsfs_mnt; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = d_inode(dentry); + const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +static void ns_prune_dentry(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + if (inode) { + struct ns_common *ns = inode->i_private; + atomic_long_set(&ns->stashed, 0); + } +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_prune = ns_prune_dentry, + .d_delete = always_delete_dentry, + .d_dname = ns_dname, +}; + +static void nsfs_evict(struct inode *inode) +{ + struct ns_common *ns = inode->i_private; + clear_inode(inode); + ns->ops->put(ns); +} + +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct vfsmount *mnt = mntget(nsfs_mnt); + struct qstr qname = { .name = "", }; + struct dentry *dentry; + struct inode *inode; + struct ns_common *ns; + unsigned long d; + +again: + ns = ns_ops->get(task); + if (!ns) { + mntput(mnt); + return ERR_PTR(-ENOENT); + } + rcu_read_lock(); + d = atomic_long_read(&ns->stashed); + if (!d) + goto slow; + dentry = (struct dentry *)d; + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto slow; + rcu_read_unlock(); + ns_ops->put(ns); +got_it: + path->mnt = mnt; + path->dentry = dentry; + return NULL; +slow: + rcu_read_unlock(); + inode = new_inode_pseudo(mnt->mnt_sb); + if (!inode) { + ns_ops->put(ns); + mntput(mnt); + return ERR_PTR(-ENOMEM); + } + inode->i_ino = ns->inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_flags |= S_IMMUTABLE; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + inode->i_private = ns; + + dentry = d_alloc_pseudo(mnt->mnt_sb, &qname); + if (!dentry) { + iput(inode); + mntput(mnt); + return ERR_PTR(-ENOMEM); + } + d_instantiate(dentry, inode); + dentry->d_fsdata = (void *)ns_ops; + d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); + if (d) { + d_delete(dentry); /* make sure ->d_prune() does nothing */ + dput(dentry); + cpu_relax(); + goto again; + } + goto got_it; +} + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_common *ns; + int res = -ENOENT; + ns = ns_ops->get(task); + if (ns) { + res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + ns_ops->put(ns); + } + return res; +} + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + +static const struct super_operations nsfs_ops = { + .statfs = simple_statfs, + .evict_inode = nsfs_evict, +}; +static struct dentry *nsfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "nsfs:", &nsfs_ops, + &ns_dentry_operations, NSFS_MAGIC); +} +static struct file_system_type nsfs = { + .name = "nsfs", + .mount = nsfs_mount, + .kill_sb = kill_anon_super, +}; + +void __init nsfs_init(void) +{ + nsfs_mnt = kern_mount(&nsfs); + if (IS_ERR(nsfs_mnt)) + panic("can't set nsfs up\n"); + nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; +} diff --git a/kernel/fs/ntfs/Kconfig b/kernel/fs/ntfs/Kconfig new file mode 100644 index 000000000..f5a868cc9 --- /dev/null +++ b/kernel/fs/ntfs/Kconfig @@ -0,0 +1,78 @@ +config NTFS_FS + tristate "NTFS file system support" + select NLS + help + NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. + + Saying Y or M here enables read support. There is partial, but + safe, write support available. For write support you must also + say Y to "NTFS write support" below. + + There are also a number of user-space tools available, called + ntfsprogs. These include ntfsundelete and ntfsresize, that work + without NTFS support enabled in the kernel. + + This is a rewrite from scratch of Linux NTFS support and replaced + the old NTFS code starting with Linux 2.5.11. A backport to + the Linux 2.4 kernel series is separately available as a patch + from the project web site. + + For more information see + and . + + To compile this file system support as a module, choose M here: the + module will be called ntfs. + + If you are not using Windows NT, 2000, XP or 2003 in addition to + Linux on your computer it is safe to say N. + +config NTFS_DEBUG + bool "NTFS debugging support" + depends on NTFS_FS + help + If you are experiencing any problems with the NTFS file system, say + Y here. This will result in additional consistency checks to be + performed by the driver as well as additional debugging messages to + be written to the system log. Note that debugging messages are + disabled by default. To enable them, supply the option debug_msgs=1 + at the kernel command line when booting the kernel or as an option + to insmod when loading the ntfs module. Once the driver is active, + you can enable debugging messages by doing (as root): + echo 1 > /proc/sys/fs/ntfs-debug + Replacing the "1" with "0" would disable debug messages. + + If you leave debugging messages disabled, this results in little + overhead, but enabling debug messages results in very significant + slowdown of the system. + + When reporting bugs, please try to have available a full dump of + debugging messages while the misbehaviour was occurring. + +config NTFS_RW + bool "NTFS write support" + depends on NTFS_FS + help + This enables the partial, but safe, write support in the NTFS driver. + + The only supported operation is overwriting existing files, without + changing the file length. No file or directory creation, deletion or + renaming is possible. Note only non-resident files can be written to + so you may find that some very small files (<500 bytes or so) cannot + be written to. + + While we cannot guarantee that it will not damage any data, we have + so far not received a single report where the driver would have + damaged someones data so we assume it is perfectly safe to use. + + Note: While write support is safe in this version (a rewrite from + scratch of the NTFS support), it should be noted that the old NTFS + write support, included in Linux 2.5.10 and before (since 1997), + is not safe. + + This is currently useful with TopologiLinux. TopologiLinux is run + on top of any DOS/Microsoft Windows system without partitioning your + hard disk. Unlike other Linux distributions TopologiLinux does not + need its own partition. For more information see + + + It is perfectly safe to say N here. diff --git a/kernel/fs/ntfs/Makefile b/kernel/fs/ntfs/Makefile new file mode 100644 index 000000000..2ff263e6d --- /dev/null +++ b/kernel/fs/ntfs/Makefile @@ -0,0 +1,14 @@ +# Rules for making the NTFS driver. + +obj-$(CONFIG_NTFS_FS) += ntfs.o + +ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ + index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ + unistr.o upcase.o + +ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o + +ccflags-y := -DNTFS_VERSION=\"2.1.32\" +ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG +ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW + diff --git a/kernel/fs/ntfs/aops.c b/kernel/fs/ntfs/aops.c new file mode 100644 index 000000000..f0de4b6b8 --- /dev/null +++ b/kernel/fs/ntfs/aops.c @@ -0,0 +1,1773 @@ +/** + * aops.c - NTFS kernel address space operations and page cache handling. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aops.h" +#include "attrib.h" +#include "debug.h" +#include "inode.h" +#include "mft.h" +#include "runlist.h" +#include "types.h" +#include "ntfs.h" + +/** + * ntfs_end_buffer_async_read - async io completion for reading attributes + * @bh: buffer head on which io is completed + * @uptodate: whether @bh is now uptodate or not + * + * Asynchronous I/O completion handler for reading pages belonging to the + * attribute address space of an inode. The inodes can either be files or + * directories or they can be fake inodes describing some attribute. + * + * If NInoMstProtected(), perform the post read mst fixups when all IO on the + * page has been completed and mark the page uptodate or set the error bit on + * the page. To determine the size of the records that need fixing up, we + * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs + * record size, and index_block_size_bits, to the log(base 2) of the ntfs + * record size. + */ +static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first, *tmp; + struct page *page; + struct inode *vi; + ntfs_inode *ni; + int page_uptodate = 1; + + page = bh->b_page; + vi = page->mapping->host; + ni = NTFS_I(vi); + + if (likely(uptodate)) { + loff_t i_size; + s64 file_ofs, init_size; + + set_buffer_uptodate(bh); + + file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); + read_lock_irqsave(&ni->size_lock, flags); + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + /* Check for the current buffer head overflowing. */ + if (unlikely(file_ofs + bh->b_size > init_size)) { + int ofs; + void *kaddr; + + ofs = 0; + if (file_ofs < init_size) + ofs = init_size - file_ofs; + local_irq_save(flags); + kaddr = kmap_atomic(page); + memset(kaddr + bh_offset(bh) + ofs, 0, + bh->b_size - ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + local_irq_restore(flags); + } + } else { + clear_buffer_uptodate(bh); + SetPageError(page); + ntfs_error(ni->vol->sb, "Buffer I/O error, logical block " + "0x%llx.", (unsigned long long)bh->b_blocknr); + } + first = page_buffers(page); + flags = bh_uptodate_lock_irqsave(first); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + if (likely(buffer_locked(tmp))) + goto still_busy; + /* Async buffers must be locked. */ + BUG(); + } + tmp = tmp->b_this_page; + } while (tmp != bh); + bh_uptodate_unlock_irqrestore(first, flags); + /* + * If none of the buffers had errors then we can set the page uptodate, + * but we first have to perform the post read mst fixups, if the + * attribute is mst protected, i.e. if NInoMstProteced(ni) is true. + * Note we ignore fixup errors as those are detected when + * map_mft_record() is called which gives us per record granularity + * rather than per page granularity. + */ + if (!NInoMstProtected(ni)) { + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } else { + u8 *kaddr; + unsigned int i, recs; + u32 rec_size; + + rec_size = ni->itype.index.block_size; + recs = PAGE_CACHE_SIZE / rec_size; + /* Should have been verified before we got here... */ + BUG_ON(!recs); + local_irq_save_nort(flags); + kaddr = kmap_atomic(page); + for (i = 0; i < recs; i++) + post_read_mst_fixup((NTFS_RECORD*)(kaddr + + i * rec_size), rec_size); + kunmap_atomic(kaddr); + local_irq_restore_nort(flags); + flush_dcache_page(page); + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } + unlock_page(page); + return; +still_busy: + bh_uptodate_unlock_irqrestore(first, flags); +} + +/** + * ntfs_read_block - fill a @page of an address space with data + * @page: page cache page to fill with data + * + * Fill the page @page of the address space belonging to the @page->host inode. + * We read each buffer asynchronously and when all buffers are read in, our io + * completion handler ntfs_end_buffer_read_async(), if required, automatically + * applies the mst fixups to the page before finally marking it uptodate and + * unlocking it. + * + * We only enforce allocated_size limit because i_size is checked for in + * generic_file_read(). + * + * Return 0 on success and -errno on error. + * + * Contains an adapted version of fs/buffer.c::block_read_full_page(). + */ +static int ntfs_read_block(struct page *page) +{ + loff_t i_size; + VCN vcn; + LCN lcn; + s64 init_size; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + sector_t iblock, lblock, zblock; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int i, nr; + unsigned char blocksize_bits; + + vi = page->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + /* $MFT/$DATA must have its complete runlist in memory at all times. */ + BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); + + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, 0); + if (unlikely(!page_has_buffers(page))) { + unlock_page(page); + return -ENOMEM; + } + } + bh = head = page_buffers(page); + BUG_ON(!bh); + + /* + * We may be racing with truncate. To avoid some of the problems we + * now take a snapshot of the various sizes and use those for the whole + * of the function. In case of an extending truncate it just means we + * may leave some buffers unmapped which are now allocated. This is + * not a problem since these buffers will just get mapped when a write + * occurs. In case of a shrinking truncate, we will detect this later + * on due to the runlist being incomplete and if the page is being + * fully truncated, truncate will throw it away as soon as we unlock + * it so no need to worry what we do with it. + */ + iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + read_lock_irqsave(&ni->size_lock, flags); + lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + zblock = (init_size + blocksize - 1) >> blocksize_bits; + + /* Loop through all the buffers in the page. */ + rl = NULL; + nr = i = 0; + do { + int err = 0; + + if (unlikely(buffer_uptodate(bh))) + continue; + if (unlikely(buffer_mapped(bh))) { + arr[nr++] = bh; + continue; + } + bh->b_bdev = vol->sb->s_bdev; + /* Is the block within the allowed limits? */ + if (iblock < lblock) { + bool is_retry = false; + + /* Convert iblock into corresponding vcn and offset. */ + vcn = (VCN)iblock << blocksize_bits >> + vol->cluster_size_bits; + vcn_ofs = ((VCN)iblock << blocksize_bits) & + vol->cluster_size_mask; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + /* Only read initialized data blocks. */ + if (iblock < zblock) { + arr[nr++] = bh; + continue; + } + /* Fully non-initialized data block, zero it. */ + goto handle_zblock; + } + /* It is a hole, need to zero it. */ + if (lcn == LCN_HOLE) + goto handle_hole; + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, treat it as a + * hole. This can happen due to concurrent truncate + * for example. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + err = 0; + goto handle_hole; + } + /* Hard error, zero out region. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + SetPageError(page); + ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "offset 0x%x because its location on " + "disk could not be determined%s " + "(error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + } + /* + * Either iblock was outside lblock limits or + * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion + * of the page and set the buffer uptodate. + */ +handle_hole: + bh->b_blocknr = -1UL; + clear_buffer_mapped(bh); +handle_zblock: + zero_user(page, i * blocksize, blocksize); + if (likely(!err)) + set_buffer_uptodate(bh); + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Check we have at least one buffer ready for i/o. */ + if (nr) { + struct buffer_head *tbh; + + /* Lock the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + lock_buffer(tbh); + tbh->b_end_io = ntfs_end_buffer_async_read; + set_buffer_async_read(tbh); + } + /* Finally, start i/o on the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + if (likely(!buffer_uptodate(tbh))) + submit_bh(READ, tbh); + else + ntfs_end_buffer_async_read(tbh, 1); + } + return 0; + } + /* No i/o was scheduled on any of the buffers. */ + if (likely(!PageError(page))) + SetPageUptodate(page); + else /* Signal synchronous i/o error. */ + nr = -EIO; + unlock_page(page); + return nr; +} + +/** + * ntfs_readpage - fill a @page of a @file with data from the device + * @file: open file to which the page @page belongs or NULL + * @page: page cache page to fill with data + * + * For non-resident attributes, ntfs_readpage() fills the @page of the open + * file @file by calling the ntfs version of the generic block_read_full_page() + * function, ntfs_read_block(), which in turn creates and reads in the buffers + * associated with the page asynchronously. + * + * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the + * data from the mft record (which at this stage is most likely in memory) and + * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as + * even if the mft record is not cached at this point in time, we need to wait + * for it to be read in before we can do the copy. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_readpage(struct file *file, struct page *page) +{ + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + u8 *addr; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + unsigned long flags; + u32 attr_len; + int err = 0; + +retry_readpage: + BUG_ON(!PageLocked(page)); + vi = page->mapping->host; + i_size = i_size_read(vi); + /* Is the page fully outside i_size? (truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + ntfs_debug("Read outside i_size - truncated?"); + goto done; + } + /* + * This can potentially happen because we clear PageUptodate() during + * ntfs_writepage() of MstProtected() attributes. + */ + if (PageUptodate(page)) { + unlock_page(page); + return 0; + } + ni = NTFS_I(vi); + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If attribute is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + BUG_ON(ni->type != AT_DATA); + err = -EACCES; + goto err_out; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + return ntfs_read_compressed_block(page); + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* Normal, non-resident data stream. */ + return ntfs_read_block(page); + } + /* + * Attribute is resident, implying it is not compressed or encrypted. + * This also means the attribute is smaller than an mft record and + * hence smaller than a page, so can simply zero out any pages with + * index above 0. Note the attribute can actually be marked compressed + * but if it is resident the actual data is not compressed so we are + * ok to ignore the compressed flag here. + */ + if (unlikely(page->index > 0)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + goto done; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + mrec = map_mft_record(base_ni); + if (IS_ERR(mrec)) { + err = PTR_ERR(mrec); + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the readpage. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_readpage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, mrec); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto put_unm_err_out; + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + read_lock_irqsave(&ni->size_lock, flags); + if (unlikely(attr_len > ni->initialized_size)) + attr_len = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate. */ + attr_len = i_size; + } + addr = kmap_atomic(page); + /* Copy the data to the page. */ + memcpy(addr, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + attr_len); + /* Zero the remainder of the page. */ + memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + flush_dcache_page(page); + kunmap_atomic(addr); +put_unm_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(base_ni); +done: + SetPageUptodate(page); +err_out: + unlock_page(page); + return err; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, non-mst + * protected attributes to their backing store. + * + * For a page with buffers, map and write the dirty buffers asynchronously + * under page writeback. For a page without buffers, create buffers for the + * page, then proceed as above. + * + * If a page doesn't have buffers the page dirty state is definitive. If a page + * does have buffers, the page dirty state is just a hint, and the buffer dirty + * state is definitive. (A hint which has rules: dirty buffers against a clean + * page is illegal. Other combinations are legal and need to be handled. In + * particular a dirty page containing clean buffers for example.) + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_read_block() and __block_write_full_page(). + */ +static int ntfs_write_block(struct page *page, struct writeback_control *wbc) +{ + VCN vcn; + LCN lcn; + s64 initialized_size; + loff_t i_size; + sector_t block, dblock, iblock; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int err; + bool need_end_writeback; + unsigned char blocksize_bits; + + vi = page->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", ni->mft_no, ni->type, page->index); + + BUG_ON(!NInoNonResident(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + if (!page_has_buffers(page)) { + BUG_ON(!PageUptodate(page)); + create_empty_buffers(page, blocksize, + (1 << BH_Uptodate) | (1 << BH_Dirty)); + if (unlikely(!page_has_buffers(page))) { + ntfs_warning(vol->sb, "Error allocating page " + "buffers. Redirtying page so we try " + "again later."); + /* + * Put the page back on mapping->dirty_pages, but leave + * its buffers' dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + bh = head = page_buffers(page); + BUG_ON(!bh); + + /* NOTE: Different naming scheme to ntfs_read_block()! */ + + /* The first block in the page. */ + block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(vi); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + + /* The first out of bounds block for the data size. */ + dblock = (i_size + blocksize - 1) >> blocksize_bits; + + /* The last (fully or partially) initialized block. */ + iblock = initialized_size >> blocksize_bits; + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. + */ + + /* + * Loop through all the buffers in the page, mapping all the dirty + * buffers to disk addresses and handling any aliases from the + * underlying block device's mapping. + */ + rl = NULL; + err = 0; + do { + bool is_retry = false; + + if (unlikely(block >= dblock)) { + /* + * Mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. The contents of such buffers + * were zeroed by ntfs_writepage(). + * + * FIXME: What about the small race window where + * ntfs_writepage() has not done any clearing because + * the page was within i_size but before we get here, + * vmtruncate() modifies i_size? + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + + /* Clean buffers are not written out, so no need to map them. */ + if (!buffer_dirty(bh)) + continue; + + /* Make sure we have enough initialized size. */ + if (unlikely((block >= iblock) && + (initialized_size < i_size))) { + /* + * If this page is fully outside initialized size, zero + * out all pages between the current initialized size + * and the current page. Just use ntfs_readpage() to do + * the zeroing transparently. + */ + if (block > iblock) { + // TODO: + // For each page do: + // - read_cache_page() + // Again for each page do: + // - wait_on_page_locked() + // - Check (PageUptodate(page) && + // !PageError(page)) + // Update initialized size in the attribute and + // in the inode. + // Again, for each page do: + // __set_page_dirty_buffers(); + // page_cache_release() + // We don't need to wait on the writes. + // Update iblock. + } + /* + * The current page straddles initialized size. Zero + * all non-uptodate buffers and set them uptodate (and + * dirty?). Note, there aren't any non-uptodate buffers + * if the page is uptodate. + * FIXME: For an uptodate page, the buffers may need to + * be written out because they were not initialized on + * disk before. + */ + if (!PageUptodate(page)) { + // TODO: + // Zero any non-uptodate buffers up to i_size. + // Set them uptodate and dirty. + } + // TODO: + // Update initialized size in the attribute and in the + // inode (up to i_size). + // Update iblock. + // FIXME: This is inefficient. Try to batch the two + // size changes to happen in one go. + ntfs_error(vol->sb, "Writing beyond initialized size " + "is not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + // Do NOT set_buffer_new() BUT DO clear buffer range + // outside write request range. + // set_buffer_uptodate() on complete buffers as well as + // set_buffer_dirty(). + } + + /* No need to map buffers that are already mapped. */ + if (buffer_mapped(bh)) + continue; + + /* Unmapped, dirty buffer. Need to map it. */ + bh->b_bdev = vol->sb->s_bdev; + + /* Convert block into corresponding vcn and offset. */ + vcn = (VCN)block << blocksize_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to point to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + continue; + } + /* It is a hole, need to instantiate it. */ + if (lcn == LCN_HOLE) { + u8 *kaddr; + unsigned long *bpos, *bend; + + /* Check if the buffer is zero. */ + kaddr = kmap_atomic(page); + bpos = (unsigned long *)(kaddr + bh_offset(bh)); + bend = (unsigned long *)((u8*)bpos + blocksize); + do { + if (unlikely(*bpos)) + break; + } while (likely(++bpos < bend)); + kunmap_atomic(kaddr); + if (bpos == bend) { + /* + * Buffer is zero and sparse, no need to write + * it. + */ + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + continue; + } + // TODO: Instantiate the hole. + // clear_buffer_new(bh); + // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + ntfs_error(vol->sb, "Writing into sparse regions is " + "not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + } + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, truncate has cut it out + * of the runlist. Just clean and clear the buffer and set it + * uptodate so it can get discarded by the VM. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + zero_user(page, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + err = 0; + continue; + } + /* Failed to map the buffer, even after retrying. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, offset 0x%x " + "because its location on disk could not be " + "determined%s (error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + break; + } while (block++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* For the error case, need to reset bh to the beginning. */ + bh = head; + + /* Just an optimization, so ->readpage() is not called later. */ + if (unlikely(!PageUptodate(page))) { + int uptodate = 1; + do { + if (!buffer_uptodate(bh)) { + uptodate = 0; + bh = head; + break; + } + } while ((bh = bh->b_this_page) != head); + if (uptodate) + SetPageUptodate(page); + } + + /* Setup all mapped, dirty buffers for async write i/o. */ + do { + if (buffer_mapped(bh) && buffer_dirty(bh)) { + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + BUG_ON(!buffer_uptodate(bh)); + mark_buffer_async_write(bh); + } else + unlock_buffer(bh); + } else if (unlikely(err)) { + /* + * For the error case. The buffer may have been set + * dirty during attachment to a dirty page. + */ + if (err != -ENOMEM) + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (unlikely(err)) { + // TODO: Remove the -EOPNOTSUPP check later on... + if (unlikely(err == -EOPNOTSUPP)) + err = 0; + else if (err == -ENOMEM) { + ntfs_warning(vol->sb, "Error allocating memory. " + "Redirtying page so we try again " + "later."); + /* + * Put the page back on mapping->dirty_pages, but + * leave its buffer's dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + err = 0; + } else + SetPageError(page); + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ + + /* Submit the prepared buffers for i/o. */ + need_end_writeback = true; + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + need_end_writeback = false; + } + bh = next; + } while (bh != head); + unlock_page(page); + + /* If no i/o was started, need to end_page_writeback(). */ + if (unlikely(need_end_writeback)) + end_page_writeback(page); + + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_write_mst_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, mst protected + * attributes to their backing store. The only supported attributes are index + * allocation and $MFT/$DATA. Both directory inodes and index inodes are + * supported for the index allocation case. + * + * The page must remain locked for the duration of the write because we apply + * the mst fixups, write, and then undo the fixups, so if we were to unlock the + * page before undoing the fixups, any other user of the page will see the + * page contents as corrupt. + * + * We clear the page uptodate flag for the duration of the function to ensure + * exclusion for the $MFT/$DATA case against someone mapping an mft record we + * are about to apply the mst fixups to. + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_write_block(), ntfs_mft_writepage(), and + * write_mft_record_nolock(). + */ +static int ntfs_write_mst_block(struct page *page, + struct writeback_control *wbc) +{ + sector_t block, dblock, rec_block; + struct inode *vi = page->mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + u8 *kaddr; + unsigned int rec_size = ni->itype.index.block_size; + ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size]; + struct buffer_head *bh, *head, *tbh, *rec_start_bh; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + runlist_element *rl; + int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; + unsigned bh_size, rec_size_bits; + bool sync, is_mft, page_is_dirty, rec_is_dirty; + unsigned char bh_size_bits; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", vi->i_ino, ni->type, page->index); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(!NInoMstProtected(ni)); + is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino); + /* + * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page + * in its page cache were to be marked dirty. However this should + * never happen with the current driver and considering we do not + * handle this case here we do want to BUG(), at least for now. + */ + BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || + (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); + bh_size = vol->sb->s_blocksize; + bh_size_bits = vol->sb->s_blocksize_bits; + max_bhs = PAGE_CACHE_SIZE / bh_size; + BUG_ON(!max_bhs); + BUG_ON(max_bhs > MAX_BUF_PER_PAGE); + + /* Were we called for sync purposes? */ + sync = (wbc->sync_mode == WB_SYNC_ALL); + + /* Make sure we have mapped buffers. */ + bh = head = page_buffers(page); + BUG_ON(!bh); + + rec_size_bits = ni->itype.index.block_size_bits; + BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits)); + bhs_per_rec = rec_size >> bh_size_bits; + BUG_ON(!bhs_per_rec); + + /* The first block in the page. */ + rec_block = block = (sector_t)page->index << + (PAGE_CACHE_SHIFT - bh_size_bits); + + /* The first out of bounds block for the data size. */ + dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; + + rl = NULL; + err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; + page_is_dirty = rec_is_dirty = false; + rec_start_bh = NULL; + do { + bool is_retry = false; + + if (likely(block < rec_block)) { + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + /* + * This block is not the first one in the record. We + * ignore the buffer's dirty state because we could + * have raced with a parallel mark_ntfs_record_dirty(). + */ + if (!rec_is_dirty) + continue; + if (unlikely(err2)) { + if (err2 != -ENOMEM) + clear_buffer_dirty(bh); + continue; + } + } else /* if (block == rec_block) */ { + BUG_ON(block > rec_block); + /* This block is the first one in the record. */ + rec_block += bhs_per_rec; + err2 = 0; + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + continue; + } + if (!buffer_dirty(bh)) { + /* Clean records are not written out. */ + rec_is_dirty = false; + continue; + } + rec_is_dirty = true; + rec_start_bh = bh; + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = (VCN)block << bh_size_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> bh_size_bits; + set_buffer_mapped(bh); + } else { + /* + * Remap failed. Retry to map the runlist once + * unless we are working on $MFT which always + * has the whole of its runlist in memory. + */ + if (!is_mft && !is_retry && + lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping + * lock for the duration. + */ + up_read(&ni->runlist.lock); + err2 = ntfs_map_runlist(ni, vcn); + if (likely(!err2)) + goto lock_retry_remap; + if (err2 == -ENOMEM) + page_is_dirty = true; + lcn = err2; + } else { + err2 = -EIO; + if (!rl) + up_read(&ni->runlist.lock); + } + /* Hard error. Abort writing this record. */ + if (!err || err == -ENOMEM) + err = err2; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write ntfs record " + "0x%llx (inode 0x%lx, " + "attribute type 0x%x) because " + "its location on disk could " + "not be determined (error " + "code %lli).", + (long long)block << + bh_size_bits >> + vol->mft_record_size_bits, + ni->mft_no, ni->type, + (long long)lcn); + /* + * If this is not the first buffer, remove the + * buffers in this record from the list of + * buffers to write and clear their dirty bit + * if not error -ENOMEM. + */ + if (rec_start_bh != bh) { + while (bhs[--nr_bhs] != rec_start_bh) + ; + if (err2 != -ENOMEM) { + do { + clear_buffer_dirty( + rec_start_bh); + } while ((rec_start_bh = + rec_start_bh-> + b_this_page) != + bh); + } + } + continue; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + } while (block++, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&ni->runlist.lock); + /* If there were no dirty buffers, we are done. */ + if (!nr_bhs) + goto done; + /* Map the page so we can access its contents. */ + kaddr = kmap(page); + /* Clear the page uptodate flag whilst the mst fixups are applied. */ + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + for (i = 0; i < nr_bhs; i++) { + unsigned int ofs; + + /* Skip buffers which are not at the beginning of records. */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + ofs = bh_offset(tbh); + if (is_mft) { + ntfs_inode *tni; + unsigned long mft_no; + + /* Get the mft record number. */ + mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs) + >> rec_size_bits; + /* Check whether to write this mft record. */ + tni = NULL; + if (!ntfs_may_write_mft_record(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), &tni)) { + /* + * The record should not be written. This + * means we need to redirty the page before + * returning. + */ + page_is_dirty = true; + /* + * Remove the buffers in this mft record from + * the list of buffers to write. + */ + do { + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + /* + * The record should be written. If a locked ntfs + * inode was returned, add it to the array of locked + * ntfs inodes. + */ + if (tni) + locked_nis[nr_locked_nis++] = tni; + } + /* Apply the mst protection fixups. */ + err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs), + rec_size); + if (unlikely(err2)) { + if (!err || err == -ENOMEM) + err = -EIO; + ntfs_error(vol->sb, "Failed to apply mst fixups " + "(inode 0x%lx, attribute type 0x%x, " + "page index 0x%lx, page offset 0x%x)!" + " Unmount and run chkdsk.", vi->i_ino, + ni->type, page->index, ofs); + /* + * Mark all the buffers in this record clean as we do + * not want to write corrupt data to disk. + */ + do { + clear_buffer_dirty(bhs[i]); + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + nr_recs++; + } + /* If no records are to be written out, we are done. */ + if (!nr_recs) + goto unm_done; + flush_dcache_page(page); + /* Lock buffers and start synchronous write i/o on them. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + if (!trylock_buffer(tbh)) + BUG(); + /* The buffer dirty state is now irrelevant, just clean it. */ + clear_buffer_dirty(tbh); + BUG_ON(!buffer_uptodate(tbh)); + BUG_ON(!buffer_mapped(tbh)); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (is_mft && !sync) + goto do_mirror; +do_wait: + /* Wait on i/o completion of buffers. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_error(vol->sb, "I/O error while writing ntfs " + "record buffer (inode 0x%lx, " + "attribute type 0x%x, page index " + "0x%lx, page offset 0x%lx)! Unmount " + "and run chkdsk.", vi->i_ino, ni->type, + page->index, bh_offset(tbh)); + if (!err || err == -ENOMEM) + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (is_mft && sync) { +do_mirror: + for (i = 0; i < nr_bhs; i++) { + unsigned long mft_no; + unsigned int ofs; + + /* + * Skip buffers which are not at the beginning of + * records. + */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + /* Skip removed buffers (and hence records). */ + if (!tbh) + continue; + ofs = bh_offset(tbh); + /* Get the mft record number. */ + mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs) + >> rec_size_bits; + if (mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), + sync); + } + if (!sync) + goto do_wait; + } + /* Remove the mst protection fixups again. */ + for (i = 0; i < nr_bhs; i++) { + if (!(i % bhs_per_rec)) { + tbh = bhs[i]; + if (!tbh) + continue; + post_write_mst_fixup((NTFS_RECORD*)(kaddr + + bh_offset(tbh))); + } + } + flush_dcache_page(page); +unm_done: + /* Unlock any locked inodes. */ + while (nr_locked_nis-- > 0) { + ntfs_inode *tni, *base_tni; + + tni = locked_nis[nr_locked_nis]; + /* Get the base inode. */ + mutex_lock(&tni->extent_lock); + if (tni->nr_extents >= 0) + base_tni = tni; + else { + base_tni = tni->ext.base_ntfs_ino; + BUG_ON(!base_tni); + } + mutex_unlock(&tni->extent_lock); + ntfs_debug("Unlocking %s inode 0x%lx.", + tni == base_tni ? "base" : "extent", + tni->mft_no); + mutex_unlock(&tni->mrec_lock); + atomic_dec(&tni->count); + iput(VFS_I(base_tni)); + } + SetPageUptodate(page); + kunmap(page); +done: + if (unlikely(err && err != -ENOMEM)) { + /* + * Set page error if there is only one ntfs record in the page. + * Otherwise we would loose per-record granularity. + */ + if (ni->itype.index.block_size == PAGE_CACHE_SIZE) + SetPageError(page); + NVolSetErrors(vol); + } + if (page_is_dirty) { + ntfs_debug("Page still contains one or more dirty ntfs " + "records. Redirtying the page starting at " + "record 0x%lx.", page->index << + (PAGE_CACHE_SHIFT - rec_size_bits)); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + } else { + /* + * Keep the VM happy. This must be done otherwise the + * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though + * the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + } + if (likely(!err)) + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_writepage - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This is called from the VM when it wants to have a dirty ntfs page cache + * page cleaned. The VM has already locked the page and marked it clean. + * + * For non-resident attributes, ntfs_writepage() writes the @page by calling + * the ntfs version of the generic block_write_full_page() function, + * ntfs_write_block(), which in turn if necessary creates and writes the + * buffers associated with the page asynchronously. + * + * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying + * the data to the mft record (which at this stage is most likely in memory). + * The mft record is then marked dirty and written out asynchronously via the + * vfs inode dirty code path for the inode the mft record belongs to or via the + * vm page dirty code path for the page the mft record is in. + * + * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_writepage(struct page *page, struct writeback_control *wbc) +{ + loff_t i_size; + struct inode *vi = page->mapping->host; + ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); + char *addr; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + u32 attr_len; + int err; + +retry_writepage: + BUG_ON(!PageLocked(page)); + i_size = i_size_read(vi); + /* Is the page fully outside i_size? (truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { + /* + * The page may have dirty, unmapped buffers. Make them + * freeable here, so the page does not leak. + */ + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + ntfs_debug("Write outside i_size - truncated?"); + return 0; + } + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + unlock_page(page); + BUG_ON(ni->type != AT_DATA); + ntfs_debug("Denying write access to encrypted file."); + return -EACCES; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + // TODO: Implement and replace this with + // return ntfs_write_compressed_block(page); + unlock_page(page); + ntfs_error(vi->i_sb, "Writing to compressed files is " + "not supported yet. Sorry."); + return -EOPNOTSUPP; + } + // TODO: Implement and remove this check. + if (NInoNonResident(ni) && NInoSparse(ni)) { + unlock_page(page); + ntfs_error(vi->i_sb, "Writing to sparse files is not " + "supported yet. Sorry."); + return -EOPNOTSUPP; + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* We have to zero every time due to mmap-at-end-of-file. */ + if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) { + /* The page straddles i_size. */ + unsigned int ofs = i_size & ~PAGE_CACHE_MASK; + zero_user_segment(page, ofs, PAGE_CACHE_SIZE); + } + /* Handle mst protected attributes. */ + if (NInoMstProtected(ni)) + return ntfs_write_mst_block(page, wbc); + /* Normal, non-resident data stream. */ + return ntfs_write_block(page, wbc); + } + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * mst protected. This also means the attribute is smaller than an mft + * record and hence smaller than a page, so can simply return error on + * any pages with index above 0. Note the attribute can actually be + * marked compressed but if it is resident the actual data is not + * compressed so we are ok to ignore the compressed flag here. + */ + BUG_ON(page_has_buffers(page)); + BUG_ON(!PageUptodate(page)); + if (unlikely(page->index > 0)) { + ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. " + "Aborting write.", page->index); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + return -EIO; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the writepage. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_writepage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto err_out; + /* + * Keep the VM happy. This must be done otherwise the radix-tree tag + * PAGECACHE_TAG_DIRTY remains set even though the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + i_size = i_size_read(vi); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate or a failed truncate. */ + attr_len = i_size; + /* + * If the truncate failed, fix it up now. If a concurrent + * truncate, we do its job, so it does not have to do anything. + */ + err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, + attr_len); + /* Shrinking cannot fail. */ + BUG_ON(err); + } + addr = kmap_atomic(page); + /* Copy the data from the page to the mft record. */ + memcpy((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + addr, attr_len); + /* Zero out of bounds area in the page cache page. */ + memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + kunmap_atomic(addr); + flush_dcache_page(page); + flush_dcache_mft_record_page(ctx->ntfs_ino); + /* We are done with the page. */ + end_page_writeback(page); + /* Finally, mark the mft record dirty, so it gets written back. */ + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " + "page so we try again later."); + /* + * Put the page back on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + err = 0; + } else { + ntfs_error(vi->i_sb, "Resident attribute write failed with " + "error %i.", err); + SetPageError(page); + NVolSetErrors(ni->vol); + } + unlock_page(page); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +#endif /* NTFS_RW */ + +/** + * ntfs_bmap - map logical file block to physical device block + * @mapping: address space mapping to which the block to be mapped belongs + * @block: logical block to map to its physical device block + * + * For regular, non-resident files (i.e. not compressed and not encrypted), map + * the logical @block belonging to the file described by the address space + * mapping @mapping to its physical device block. + * + * The size of the block is equal to the @s_blocksize field of the super block + * of the mounted file system which is guaranteed to be smaller than or equal + * to the cluster size thus the block is guaranteed to fit entirely inside the + * cluster which means we do not need to care how many contiguous bytes are + * available after the beginning of the block. + * + * Return the physical device block if the mapping succeeded or 0 if the block + * is sparse or there was an error. + * + * Note: This is a problem if someone tries to run bmap() on $Boot system file + * as that really is in block zero but there is nothing we can do. bmap() is + * just broken in that respect (just like it cannot distinguish sparse from + * not available or error). + */ +static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) +{ + s64 ofs, size; + loff_t i_size; + LCN lcn; + unsigned long blocksize, flags; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + unsigned delta; + unsigned char blocksize_bits, cluster_size_shift; + + ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.", + ni->mft_no, (unsigned long long)block); + if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) { + ntfs_error(vol->sb, "BMAP does not make sense for %s " + "attributes, returning 0.", + (ni->type != AT_DATA) ? "non-data" : + (!NInoNonResident(ni) ? "resident" : + "encrypted")); + return 0; + } + /* None of these can happen. */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + ofs = (s64)block << blocksize_bits; + read_lock_irqsave(&ni->size_lock, flags); + size = ni->initialized_size; + i_size = i_size_read(VFS_I(ni)); + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If the offset is outside the initialized size or the block straddles + * the initialized size then pretend it is a hole unless the + * initialized size equals the file size. + */ + if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size))) + goto hole; + cluster_size_shift = vol->cluster_size_bits; + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + /* + * Step down to an integer to avoid gcc doing a long long + * comparision in the switch when we know @lcn is between + * LCN_HOLE and LCN_EIO (i.e. -1 to -5). + * + * Otherwise older gcc (at least on some architectures) will + * try to use __cmpdi2() which is of course not available in + * the kernel. + */ + switch ((int)lcn) { + case LCN_ENOENT: + /* + * If the offset is out of bounds then pretend it is a + * hole. + */ + goto hole; + case LCN_ENOMEM: + ntfs_error(vol->sb, "Not enough memory to complete " + "mapping for inode 0x%lx. " + "Returning 0.", ni->mft_no); + break; + default: + ntfs_error(vol->sb, "Failed to complete mapping for " + "inode 0x%lx. Run chkdsk. " + "Returning 0.", ni->mft_no); + break; + } + return 0; + } + if (lcn < 0) { + /* It is a hole. */ +hole: + ntfs_debug("Done (returning hole)."); + return 0; + } + /* + * The block is really allocated and fullfils all our criteria. + * Convert the cluster to units of block size and return the result. + */ + delta = ofs & vol->cluster_size_mask; + if (unlikely(sizeof(block) < sizeof(lcn))) { + block = lcn = ((lcn << cluster_size_shift) + delta) >> + blocksize_bits; + /* If the block number was truncated return 0. */ + if (unlikely(block != lcn)) { + ntfs_error(vol->sb, "Physical block 0x%llx is too " + "large to be returned, returning 0.", + (long long)lcn); + return 0; + } + } else + block = ((lcn << cluster_size_shift) + delta) >> + blocksize_bits; + ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn); + return block; +} + +/** + * ntfs_normal_aops - address space operations for normal inodes and attributes + * + * Note these are not used for compressed or mst protected inodes and + * attributes. + */ +const struct address_space_operations ntfs_normal_aops = { + .readpage = ntfs_readpage, +#ifdef NTFS_RW + .writepage = ntfs_writepage, + .set_page_dirty = __set_page_dirty_buffers, +#endif /* NTFS_RW */ + .bmap = ntfs_bmap, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +/** + * ntfs_compressed_aops - address space operations for compressed inodes + */ +const struct address_space_operations ntfs_compressed_aops = { + .readpage = ntfs_readpage, +#ifdef NTFS_RW + .writepage = ntfs_writepage, + .set_page_dirty = __set_page_dirty_buffers, +#endif /* NTFS_RW */ + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +/** + * ntfs_mst_aops - general address space operations for mst protecteed inodes + * and attributes + */ +const struct address_space_operations ntfs_mst_aops = { + .readpage = ntfs_readpage, /* Fill page with data. */ +#ifdef NTFS_RW + .writepage = ntfs_writepage, /* Write dirty page to disk. */ + .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty + without touching the buffers + belonging to the page. */ +#endif /* NTFS_RW */ + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +#ifdef NTFS_RW + +/** + * mark_ntfs_record_dirty - mark an ntfs record dirty + * @page: page containing the ntfs record to mark dirty + * @ofs: byte offset within @page at which the ntfs record begins + * + * Set the buffers and the page in which the ntfs record is located dirty. + * + * The latter also marks the vfs inode the ntfs record belongs to dirty + * (I_DIRTY_PAGES only). + * + * If the page does not have buffers, we create them and set them uptodate. + * The page may not be locked which is why we need to handle the buffers under + * the mapping->private_lock. Once the buffers are marked dirty we no longer + * need the lock since try_to_free_buffers() does not free dirty buffers. + */ +void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + struct buffer_head *bh, *head, *buffers_to_free = NULL; + unsigned int end, bh_size, bh_ofs; + + BUG_ON(!PageUptodate(page)); + end = ofs + ni->itype.index.block_size; + bh_size = VFS_I(ni)->i_sb->s_blocksize; + spin_lock(&mapping->private_lock); + if (unlikely(!page_has_buffers(page))) { + spin_unlock(&mapping->private_lock); + bh = head = alloc_page_buffers(page, bh_size, 1); + spin_lock(&mapping->private_lock); + if (likely(!page_has_buffers(page))) { + struct buffer_head *tail; + + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); + } else + buffers_to_free = bh; + } + bh = head = page_buffers(page); + BUG_ON(!bh); + do { + bh_ofs = bh_offset(bh); + if (bh_ofs + bh_size <= ofs) + continue; + if (unlikely(bh_ofs >= end)) + break; + set_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&mapping->private_lock); + __set_page_dirty_nobuffers(page); + if (unlikely(buffers_to_free)) { + do { + bh = buffers_to_free->b_this_page; + free_buffer_head(buffers_to_free); + buffers_to_free = bh; + } while (buffers_to_free); + } +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/aops.h b/kernel/fs/ntfs/aops.h new file mode 100644 index 000000000..caecc58f5 --- /dev/null +++ b/kernel/fs/ntfs/aops.h @@ -0,0 +1,107 @@ +/** + * aops.h - Defines for NTFS kernel address space operations and page cache + * handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_AOPS_H +#define _LINUX_NTFS_AOPS_H + +#include +#include +#include +#include + +#include "inode.h" + +/** + * ntfs_unmap_page - release a page that was mapped using ntfs_map_page() + * @page: the page to release + * + * Unpin, unmap and release a page that was obtained from ntfs_map_page(). + */ +static inline void ntfs_unmap_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/** + * ntfs_map_page - map a page into accessible memory, reading it if necessary + * @mapping: address space for which to obtain the page + * @index: index into the page cache for @mapping of the page to map + * + * Read a page from the page cache of the address space @mapping at position + * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes. + * + * If the page is not in memory it is loaded from disk first using the readpage + * method defined in the address space operations of @mapping and the page is + * added to the page cache of @mapping in the process. + * + * If the page belongs to an mst protected attribute and it is marked as such + * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no + * error checking is performed. This means the caller has to verify whether + * the ntfs record(s) contained in the page are valid or not using one of the + * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are + * expecting to see. (For details of the macros, see fs/ntfs/layout.h.) + * + * If the page is in high memory it is mapped into memory directly addressible + * by the kernel. + * + * Finally the page count is incremented, thus pinning the page into place. + * + * The above means that page_address(page) can be used on all pages obtained + * with ntfs_map_page() to get the kernel virtual address of the page. + * + * When finished with the page, the caller has to call ntfs_unmap_page() to + * unpin, unmap and release the page. + * + * Note this does not grant exclusive access. If such is desired, the caller + * must provide it independently of the ntfs_{un}map_page() calls by using + * a {rw_}semaphore or other means of serialization. A spin lock cannot be + * used as ntfs_map_page() can block. + * + * The unlocked and uptodate page is returned on success or an encoded error + * on failure. Caller has to test for error using the IS_ERR() macro on the + * return value. If that evaluates to 'true', the negative error code can be + * obtained using PTR_ERR() on the return value of ntfs_map_page(). + */ +static inline struct page *ntfs_map_page(struct address_space *mapping, + unsigned long index) +{ + struct page *page = read_mapping_page(mapping, index, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageError(page)) + return page; + ntfs_unmap_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +#ifdef NTFS_RW + +extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_AOPS_H */ diff --git a/kernel/fs/ntfs/attrib.c b/kernel/fs/ntfs/attrib.c new file mode 100644 index 000000000..250ed5b20 --- /dev/null +++ b/kernel/fs/ntfs/attrib.c @@ -0,0 +1,2614 @@ +/** + * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "debug.h" +#include "layout.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" +#include "types.h" + +/** + * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * @ctx: active attribute search context if present or NULL if not + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_map_runlist_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Note the runlist can be NULL after this function returns if @vcn is zero and + * the attribute has zero allocated size, i.e. there simply is no runlist. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist will be modified. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) +{ + VCN end_vcn; + unsigned long flags; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + runlist_element *rl; + struct page *put_this_page = NULL; + int err = 0; + bool ctx_is_temporary, ctx_needs_reset; + ntfs_attr_search_ctx old_ctx = { NULL, }; + + ntfs_debug("Mapping runlist part containing vcn 0x%llx.", + (unsigned long long)vcn); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + if (!ctx) { + ctx_is_temporary = ctx_needs_reset = true; + m = map_mft_record(base_ni); + if (IS_ERR(m)) + return PTR_ERR(m); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + } else { + VCN allocated_size_vcn; + + BUG_ON(IS_ERR(ctx->mrec)); + a = ctx->attr; + BUG_ON(!a->non_resident); + ctx_is_temporary = false; + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + read_lock_irqsave(&ni->size_lock, flags); + allocated_size_vcn = ni->allocated_size >> + ni->vol->cluster_size_bits; + read_unlock_irqrestore(&ni->size_lock, flags); + if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) + end_vcn = allocated_size_vcn - 1; + /* + * If we already have the attribute extent containing @vcn in + * @ctx, no need to look it up again. We slightly cheat in + * that if vcn exceeds the allocated size, we will refuse to + * map the runlist below, so there is definitely no need to get + * the right attribute extent. + */ + if (vcn >= allocated_size_vcn || (a->type == ni->type && + a->name_length == ni->name_len && + !memcmp((u8*)a + le16_to_cpu(a->name_offset), + ni->name, ni->name_len) && + sle64_to_cpu(a->data.non_resident.lowest_vcn) + <= vcn && end_vcn >= vcn)) + ctx_needs_reset = false; + else { + /* Save the old search context. */ + old_ctx = *ctx; + /* + * If the currently mapped (extent) inode is not the + * base inode we will unmap it when we reinitialize the + * search context which means we need to get a + * reference to the page containing the mapped mft + * record so we do not accidentally drop changes to the + * mft record when it has not been marked dirty yet. + */ + if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { + put_this_page = old_ctx.ntfs_ino->page; + page_cache_get(put_this_page); + } + /* + * Reinitialize the search context so we can lookup the + * needed attribute extent. + */ + ntfs_attr_reinit_search_ctx(ctx); + ctx_needs_reset = true; + } + } + if (ctx_needs_reset) { + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + BUG_ON(!ctx->attr->non_resident); + } + a = ctx->attr; + /* + * Only decompress the mapping pairs if @vcn is inside it. Otherwise + * we get into problems when we try to map an out of bounds vcn because + * we then try to map the already mapped runlist fragment and + * ntfs_mapping_pairs_decompress() fails. + */ + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; + if (unlikely(vcn && vcn >= end_vcn)) { + err = -ENOENT; + goto err_out; + } + rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); + if (IS_ERR(rl)) + err = PTR_ERR(rl); + else + ni->runlist.rl = rl; +err_out: + if (ctx_is_temporary) { + if (likely(ctx)) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } else if (ctx_needs_reset) { + /* + * If there is no attribute list, restoring the search context + * is accomplished simply by copying the saved context back over + * the caller supplied context. If there is an attribute list, + * things are more complicated as we need to deal with mapping + * of mft records and resulting potential changes in pointers. + */ + if (NInoAttrList(base_ni)) { + /* + * If the currently mapped (extent) inode is not the + * one we had before, we need to unmap it and map the + * old one. + */ + if (ctx->ntfs_ino != old_ctx.ntfs_ino) { + /* + * If the currently mapped inode is not the + * base inode, unmap it. + */ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != + ctx->base_ntfs_ino) { + unmap_extent_mft_record(ctx->ntfs_ino); + ctx->mrec = ctx->base_mrec; + BUG_ON(!ctx->mrec); + } + /* + * If the old mapped inode is not the base + * inode, map it. + */ + if (old_ctx.base_ntfs_ino && + old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { +retry_map: + ctx->mrec = map_mft_record( + old_ctx.ntfs_ino); + /* + * Something bad has happened. If out + * of memory retry till it succeeds. + * Any other errors are fatal and we + * return the error code in ctx->mrec. + * Let the caller deal with it... We + * just need to fudge things so the + * caller can reinit and/or put the + * search context safely. + */ + if (IS_ERR(ctx->mrec)) { + if (PTR_ERR(ctx->mrec) == + -ENOMEM) { + schedule(); + goto retry_map; + } else + old_ctx.ntfs_ino = + old_ctx. + base_ntfs_ino; + } + } + } + /* Update the changed pointers in the saved context. */ + if (ctx->mrec != old_ctx.mrec) { + if (!IS_ERR(ctx->mrec)) + old_ctx.attr = (ATTR_RECORD*)( + (u8*)ctx->mrec + + ((u8*)old_ctx.attr - + (u8*)old_ctx.mrec)); + old_ctx.mrec = ctx->mrec; + } + } + /* Restore the search context to the saved one. */ + *ctx = old_ctx; + /* + * We drop the reference on the page we took earlier. In the + * case that IS_ERR(ctx->mrec) is true this means we might lose + * some changes to the mft record that had been made between + * the last time it was marked dirty/written out and now. This + * at this stage is not a problem as the mapping error is fatal + * enough that the mft record cannot be written out anyway and + * the caller is very likely to shutdown the whole inode + * immediately and mark the volume dirty for chkdsk to pick up + * the pieces anyway. + */ + if (put_this_page) + page_cache_release(put_this_page); + } + return err; +} + +/** + * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Locking: - The runlist must be unlocked on entry and is unlocked on return. + * - This function takes the runlist lock for writing and may modify + * the runlist. + */ +int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) +{ + int err = 0; + + down_write(&ni->runlist.lock); + /* Make sure someone else didn't do the work while we were sleeping. */ + if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= + LCN_RL_NOT_MAPPED)) + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + up_write(&ni->runlist.lock); + return err; +} + +/** + * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode + * @ni: ntfs inode of the attribute whose runlist to search + * @vcn: vcn to convert + * @write_locked: true if the runlist is locked for writing + * + * Find the virtual cluster number @vcn in the runlist of the ntfs attribute + * described by the ntfs inode @ni and return the corresponding logical cluster + * number (lcn). + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @write_locked is true the caller has locked the runlist for writing and + * if false for reading. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ========================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. + * LCN_ENOMEM Not enough memory to map runlist. + * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). + * + * Locking: - The runlist must be locked on entry and is left locked on return. + * - If @write_locked is 'false', i.e. the runlist is locked for reading, + * the lock may be dropped inside the function so you cannot rely on + * the runlist still being the same when this function returns. + */ +LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked) +{ + LCN lcn; + unsigned long flags; + bool is_retry = false; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", + ni->mft_no, (unsigned long long)vcn, + write_locked ? "write" : "read"); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return LCN_ENOENT; + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + /* Convert vcn to lcn. If that fails map the runlist and retry once. */ + lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); + if (likely(lcn >= LCN_HOLE)) { + ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); + return lcn; + } + if (lcn != LCN_RL_NOT_MAPPED) { + if (lcn != LCN_ENOENT) + lcn = LCN_EIO; + } else if (!is_retry) { + int err; + + if (!write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != + LCN_RL_NOT_MAPPED)) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + goto retry_remap; + } + } + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + if (!write_locked) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + } + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + if (err == -ENOENT) + lcn = LCN_ENOENT; + else if (err == -ENOMEM) + lcn = LCN_ENOMEM; + else + lcn = LCN_EIO; + } + if (lcn != LCN_ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %lli.", + (long long)lcn); + return lcn; +} + +/** + * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode + * @ni: ntfs inode describing the runlist to search + * @vcn: vcn to find + * @ctx: active attribute search context if present or NULL if not + * + * Find the virtual cluster number @vcn in the runlist described by the ntfs + * inode @ni and return the address of the runlist element containing the @vcn. + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_attr_find_vcn_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * Note you need to distinguish between the lcn of the returned runlist element + * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on + * read and allocate clusters on write. + * + * Return the runlist element containing the @vcn on success and + * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() + * to decide if the return is success or failure and PTR_ERR() to get to the + * error code if IS_ERR() is true. + * + * The possible error return codes are: + * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. + * -ENOMEM - Not enough memory to map runlist. + * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, + ntfs_attr_search_ctx *ctx) +{ + unsigned long flags; + runlist_element *rl; + int err = 0; + bool is_retry = false; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", + ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return ERR_PTR(-ENOENT); + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + rl = ni->runlist.rl; + if (likely(rl && vcn >= rl[0].vcn)) { + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) { + ntfs_debug("Done."); + return rl; + } + break; + } + rl++; + } + if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { + if (likely(rl->lcn == LCN_ENOENT)) + err = -ENOENT; + else + err = -EIO; + } + } + if (!err && !is_retry) { + /* + * If the search context is invalid we cannot map the unmapped + * region. + */ + if (IS_ERR(ctx->mrec)) + err = PTR_ERR(ctx->mrec); + else { + /* + * The @vcn is in an unmapped region, map the runlist + * and retry. + */ + err = ntfs_map_runlist_nolock(ni, vcn, ctx); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + } + if (err == -EINVAL) + err = -EIO; + } else if (!err) + err = -EIO; + if (err != -ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %i.", err); + return ERR_PTR(err); +} + +/** + * ntfs_attr_find - find (next) attribute in mft record + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * ntfs_attr_find() takes a search context @ctx as parameter and searches the + * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an + * attribute of @type, optionally @name and @val. + * + * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will + * point to the found attribute. + * + * If the attribute is not found, ntfs_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute before which the attribute being + * searched for would need to be inserted if such an action were to be desired. + * + * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is + * undefined and in particular do not rely on it not changing. + * + * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it + * is 'false', the search begins after @ctx->attr. + * + * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and + * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record + * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at + * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case + * sensitive. When @name is present, @name_len is the @name length in Unicode + * characters. + * + * If @name is not present (NULL), we assume that the unnamed attribute is + * being searched for. + * + * Finally, the resident attribute value @val is looked for, if present. If + * @val is not present (NULL), @val_len is ignored. + * + * ntfs_attr_find() only searches the specified mft record and it ignores the + * presence of an attribute list attribute (unless it is the one being searched + * for, obviously). If you need to take attribute lists into consideration, + * use ntfs_attr_lookup() instead (see below). This also means that you cannot + * use ntfs_attr_find() to search for extent records of non-resident + * attributes, as extents with lowest_vcn != 0 are usually described by the + * attribute list attribute only. - Note that it is possible that the first + * extent is only in the attribute list while the last extent is in the base + * mft record, so do not rely on being able to find the first extent in the + * base mft record. + * + * Warning: Never use @val when looking for attribute types which can be + * non-resident as this most likely will result in a crash! + */ +static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ATTR_RECORD *a; + ntfs_volume *vol = ctx->ntfs_ino->vol; + ntfschar *upcase = vol->upcase; + u32 upcase_len = vol->upcase_len; + + /* + * Iterate over attributes in mft record starting at @ctx->attr, or the + * attribute following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + a = ctx->attr; + ctx->is_first = false; + } else + a = (ATTR_RECORD*)((u8*)ctx->attr + + le32_to_cpu(ctx->attr->length)); + for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) + break; + ctx->attr = a; + if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || + a->type == AT_END)) + return -ENOENT; + if (unlikely(!a->length)) + break; + if (a->type != type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + if (!name) { + /* The search failed if the found attribute is named. */ + if (a->name_length) + return -ENOENT; + } else if (!ntfs_are_names_equal(name, name_len, + (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), + a->name_length, ic, upcase, upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, IGNORE_CASE, + upcase, upcase_len); + /* + * If @name collates before a->name, there is no + * matching attribute. + */ + if (rc == -1) + return -ENOENT; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, CASE_SENSITIVE, + upcase, upcase_len); + if (rc == -1) + return -ENOENT; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. If no @val specified, we have found the attribute + * and are done. + */ + if (!val) + return 0; + /* @val is present; compare values. */ + else { + register int rc; + + rc = memcmp(val, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + min_t(u32, val_len, le32_to_cpu( + a->data.resident.value_length))); + /* + * If @val collates before the current attribute's + * value, there is no matching attribute. + */ + if (!rc) { + register u32 avl; + + avl = le32_to_cpu( + a->data.resident.value_length); + if (val_len == avl) + return 0; + if (val_len < avl) + return -ENOENT; + } else if (rc < 0) + return -ENOENT; + } + } + ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk."); + NVolSetErrors(vol); + return -EIO; +} + +/** + * load_attribute_list - load an attribute list into memory + * @vol: ntfs volume from which to read + * @runlist: runlist of the attribute list + * @al_start: destination buffer + * @size: size of the destination buffer in bytes + * @initialized_size: initialized size of the attribute list + * + * Walk the runlist @runlist and load all clusters from it copying them into + * the linear buffer @al. The maximum number of bytes copied to @al is @size + * bytes. Note, @size does not need to be a multiple of the cluster size. If + * @initialized_size is less than @size, the region in @al between + * @initialized_size and @size will be zeroed and not read from disk. + * + * Return 0 on success or -errno on error. + */ +int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, + const s64 size, const s64 initialized_size) +{ + LCN lcn; + u8 *al = al_start; + u8 *al_end = al + initialized_size; + runlist_element *rl; + struct buffer_head *bh; + struct super_block *sb; + unsigned long block_size; + unsigned long block, max_block; + int err = 0; + unsigned char block_size_bits; + + ntfs_debug("Entering."); + if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 || + initialized_size > size) + return -EINVAL; + if (!initialized_size) { + memset(al, 0, size); + return 0; + } + sb = vol->sb; + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + down_read(&runlist->lock); + rl = runlist->rl; + if (!rl) { + ntfs_error(sb, "Cannot read attribute list since runlist is " + "missing."); + goto err_out; + } + /* Read all clusters specified by the runlist one run at a time. */ + while (rl->length) { + lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)rl->vcn, + (unsigned long long)lcn); + /* The attribute list cannot be sparse. */ + if (lcn < 0) { + ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot " + "read attribute list."); + goto err_out; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the run from device in chunks of block_size bytes. */ + max_block = block + (rl->length << vol->cluster_size_bits >> + block_size_bits); + ntfs_debug("max_block = 0x%lx.", max_block); + do { + ntfs_debug("Reading block = 0x%lx.", block); + bh = sb_bread(sb, block); + if (!bh) { + ntfs_error(sb, "sb_bread() failed. Cannot " + "read attribute list."); + goto err_out; + } + if (al + block_size >= al_end) + goto do_final; + memcpy(al, bh->b_data, block_size); + brelse(bh); + al += block_size; + } while (++block < max_block); + rl++; + } + if (initialized_size < size) { +initialize: + memset(al_start + initialized_size, 0, size - initialized_size); + } +done: + up_read(&runlist->lock); + return err; +do_final: + if (al < al_end) { + /* + * Partial block. + * + * Note: The attribute list can be smaller than its allocation + * by multiple clusters. This has been encountered by at least + * two people running Windows XP, thus we cannot do any + * truncation sanity checking here. (AIA) + */ + memcpy(al, bh->b_data, al_end - al); + brelse(bh); + if (initialized_size < size) + goto initialize; + goto done; + } + brelse(bh); + /* Real overflow! */ + ntfs_error(sb, "Attribute list buffer overflow. Read attribute list " + "is truncated."); +err_out: + err = -EIO; + goto done; +} + +/** + * ntfs_external_attr_find - find an attribute in the attribute list of an inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * Find an attribute by searching the attribute list for the corresponding + * attribute list entry. Having found the entry, map the mft record if the + * attribute is in a different mft record/inode, ntfs_attr_find() the attribute + * in there and return it. + * + * On first search @ctx->ntfs_ino must be the base mft record and @ctx must + * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent + * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is + * then the base inode). + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * If the attribute is found, ntfs_external_attr_find() returns 0 and + * @ctx->attr will point to the found attribute. @ctx->mrec will point to the + * mft record in which @ctx->attr is located and @ctx->al_entry will point to + * the attribute list entry for the attribute. + * + * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute in the base mft record before which + * the attribute being searched for would need to be inserted if such an action + * were to be desired. @ctx->mrec will point to the mft record in which + * @ctx->attr is located and @ctx->al_entry will point to the attribute list + * entry of the attribute before which the attribute being searched for would + * need to be inserted if such an action were to be desired. + * + * Thus to insert the not found attribute, one wants to add the attribute to + * @ctx->mrec (the base mft record) and if there is not enough space, the + * attribute should be placed in a newly allocated extent mft record. The + * attribute list entry for the inserted attribute should be inserted in the + * attribute list attribute at @ctx->al_entry. + * + * On actual error, ntfs_external_attr_find() returns -EIO. In this case + * @ctx->attr is undefined and in particular do not rely on it not changing. + */ +static int ntfs_external_attr_find(const ATTR_TYPE type, + const ntfschar *name, const u32 name_len, + const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni, *ni; + ntfs_volume *vol; + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_start, *al_end; + ATTR_RECORD *a; + ntfschar *al_name; + u32 al_name_len; + int err = 0; + static const char *es = " Unmount and run chkdsk."; + + ni = ctx->ntfs_ino; + base_ni = ctx->base_ntfs_ino; + ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type); + if (!base_ni) { + /* First call happens with the base mft record. */ + base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; + ctx->base_mrec = ctx->mrec; + } + if (ni == base_ni) + ctx->base_attr = ctx->attr; + if (type == AT_END) + goto not_found; + vol = base_ni->vol; + al_start = base_ni->attr_list; + al_end = al_start + base_ni->attr_list_size; + if (!ctx->al_entry) + ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; + /* + * Iterate over entries in attribute list starting at @ctx->al_entry, + * or the entry following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + al_entry = ctx->al_entry; + ctx->is_first = false; + } else + al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + + le16_to_cpu(ctx->al_entry->length)); + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < base_ni->attr_list || + (u8*)al_entry > al_end) + break; /* Inode is corrupt. */ + ctx->al_entry = al_entry; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto not_found; + if (!al_entry->length) + break; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + break; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) + goto not_found; + if (type != al_entry->type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + al_name_len = al_entry->name_length; + al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); + if (!name) { + if (al_name_len) + goto not_found; + } else if (!ntfs_are_names_equal(al_name, al_name_len, name, + name_len, ic, vol->upcase, vol->upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, IGNORE_CASE, + vol->upcase, vol->upcase_len); + /* + * If @name collates before al_name, there is no + * matching attribute. + */ + if (rc == -1) + goto not_found; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + /* + * FIXME: Reverse engineering showed 0, IGNORE_CASE but + * that is inconsistent with ntfs_attr_find(). The + * subsequent rc checks were also different. Perhaps I + * made a mistake in one of the two. Need to recheck + * which is correct or at least see what is going on... + * (AIA) + */ + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, CASE_SENSITIVE, + vol->upcase, vol->upcase_len); + if (rc == -1) + goto not_found; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. Now check @lowest_vcn. Continue search if the + * next attribute list entry still fits @lowest_vcn. Otherwise + * we have reached the right one or the search has failed. + */ + if (lowest_vcn && (u8*)next_al_entry >= al_start && + (u8*)next_al_entry + 6 < al_end && + (u8*)next_al_entry + le16_to_cpu( + next_al_entry->length) <= al_end && + sle64_to_cpu(next_al_entry->lowest_vcn) <= + lowest_vcn && + next_al_entry->type == al_entry->type && + next_al_entry->name_length == al_name_len && + ntfs_are_names_equal((ntfschar*)((u8*) + next_al_entry + + next_al_entry->name_offset), + next_al_entry->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + continue; + if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { + if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { + ntfs_error(vol->sb, "Found stale mft " + "reference in attribute list " + "of base inode 0x%lx.%s", + base_ni->mft_no, es); + err = -EIO; + break; + } + } else { /* Mft references do not match. */ + /* If there is a mapped record unmap it first. */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + /* Do we want the base record back? */ + if (MREF_LE(al_entry->mft_reference) == + base_ni->mft_no) { + ni = ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + } else { + /* We want an extent record. */ + ctx->mrec = map_extent_mft_record(base_ni, + le64_to_cpu( + al_entry->mft_reference), &ni); + if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to map " + "extent mft record " + "0x%lx of base inode " + "0x%lx.%s", + MREF_LE(al_entry-> + mft_reference), + base_ni->mft_no, es); + err = PTR_ERR(ctx->mrec); + if (err == -ENOENT) + err = -EIO; + /* Cause @ctx to be sanitized below. */ + ni = NULL; + break; + } + ctx->ntfs_ino = ni; + } + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + } + /* + * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the + * mft record containing the attribute represented by the + * current al_entry. + */ + /* + * We could call into ntfs_attr_find() to find the right + * attribute in this mft record but this would be less + * efficient and not quite accurate as ntfs_attr_find() ignores + * the attribute instance numbers for example which become + * important when one plays with attribute lists. Also, + * because a proper match has been found in the attribute list + * entry above, the comparison can now be optimized. So it is + * worth re-implementing a simplified ntfs_attr_find() here. + */ + a = ctx->attr; + /* + * Use a manual loop so we can still use break and continue + * with the same meanings as above. + */ +do_next_attr_loop: + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) + break; + if (a->type == AT_END) + break; + if (!a->length) + break; + if (al_entry->instance != a->instance) + goto do_next_attr; + /* + * If the type and/or the name are mismatched between the + * attribute list entry and the attribute record, there is + * corruption so we break and return error EIO. + */ + if (al_entry->type != a->type) + break; + if (!ntfs_are_names_equal((ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), a->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + break; + ctx->attr = a; + /* + * If no @val specified or @val specified and it matches, we + * have found it! + */ + if (!val || (!a->non_resident && le32_to_cpu( + a->data.resident.value_length) == val_len && + !memcmp((u8*)a + + le16_to_cpu(a->data.resident.value_offset), + val, val_len))) { + ntfs_debug("Done, found."); + return 0; + } +do_next_attr: + /* Proceed to the next attribute in the current mft record. */ + a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); + goto do_next_attr_loop; + } + if (!err) { + ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " + "attribute list attribute.%s", base_ni->mft_no, + es); + err = -EIO; + } + if (ni != base_ni) { + if (ni) + unmap_extent_mft_record(ni); + ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + ctx->attr = ctx->base_attr; + } + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +not_found: + /* + * If we were looking for AT_END, we reset the search context @ctx and + * use ntfs_attr_find() to seek to the end of the base mft record. + */ + if (type == AT_END) { + ntfs_attr_reinit_search_ctx(ctx); + return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, + ctx); + } + /* + * The attribute was not found. Before we return, we want to ensure + * @ctx->mrec and @ctx->attr indicate the position at which the + * attribute should be inserted in the base mft record. Since we also + * want to preserve @ctx->al_entry we cannot reinitialize the search + * context using ntfs_attr_reinit_search_ctx() as this would set + * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see + * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve + * @ctx->al_entry as the remaining fields (base_*) are identical to + * their non base_ counterparts and we cannot set @ctx->base_attr + * correctly yet as we do not know what @ctx->attr will be set to by + * the call to ntfs_attr_find() below. + */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + ctx->mrec = ctx->base_mrec; + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + ctx->is_first = true; + ctx->ntfs_ino = base_ni; + ctx->base_ntfs_ino = NULL; + ctx->base_mrec = NULL; + ctx->base_attr = NULL; + /* + * In case there are multiple matches in the base mft record, need to + * keep enumerating until we get an attribute not found response (or + * another error), otherwise we would keep returning the same attribute + * over and over again and all programs using us for enumeration would + * lock up in a tight loop. + */ + do { + err = ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + } while (!err); + ntfs_debug("Done, not found."); + return err; +} + +/** + * ntfs_attr_lookup - find an attribute in an ntfs inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must + * be the base mft record and @ctx must have been obtained from a call to + * ntfs_attr_get_search_ctx(). + * + * This function transparently handles attribute lists and @ctx is used to + * continue searches where they were left off at. + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * Return 0 if the search was successful and -errno if not. + * + * When 0, @ctx->attr is the found attribute and it is in mft record + * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is + * the attribute list entry of the found attribute. + * + * When -ENOENT, @ctx->attr is the attribute which collates just after the + * attribute being searched for, i.e. if one wants to add the attribute to the + * mft record this is the correct place to insert it into. If an attribute + * list attribute is present, @ctx->al_entry is the attribute list entry which + * collates just after the attribute list entry of the attribute being searched + * for, i.e. if one wants to add the attribute to the mft record this is the + * correct place to insert its attribute list entry into. + * + * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is + * then undefined and in particular you should not rely on it not changing. + */ +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering."); + BUG_ON(IS_ERR(ctx->mrec)); + if (ctx->base_ntfs_ino) + base_ni = ctx->base_ntfs_ino; + else + base_ni = ctx->ntfs_ino; + /* Sanity check, just for debugging really. */ + BUG_ON(!base_ni); + if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) + return ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, + val, val_len, ctx); +} + +/** + * ntfs_attr_init_search_ctx - initialize an attribute search context + * @ctx: attribute search context to initialize + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Initialize the attribute search context @ctx with @ni and @mrec. + */ +static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, + ntfs_inode *ni, MFT_RECORD *mrec) +{ + *ctx = (ntfs_attr_search_ctx) { + .mrec = mrec, + /* Sanity checks are performed elsewhere. */ + .attr = (ATTR_RECORD*)((u8*)mrec + + le16_to_cpu(mrec->attrs_offset)), + .is_first = true, + .ntfs_ino = ni, + }; +} + +/** + * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context + * @ctx: attribute search context to reinitialize + * + * Reinitialize the attribute search context @ctx, unmapping an associated + * extent mft record if present, and initialize the search context again. + * + * This is used when a search for a new attribute is being started to reset + * the search context to the beginning. + */ +void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (likely(!ctx->base_ntfs_ino)) { + /* No attribute list. */ + ctx->is_first = true; + /* Sanity checks are performed elsewhere. */ + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + /* + * This needs resetting due to ntfs_external_attr_find() which + * can leave it set despite having zeroed ctx->base_ntfs_ino. + */ + ctx->al_entry = NULL; + return; + } /* Attribute list. */ + if (ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); + return; +} + +/** + * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Allocate a new attribute search context, initialize it with @ni and @mrec, + * and return it. Return NULL if allocation failed. + */ +ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) +{ + ntfs_attr_search_ctx *ctx; + + ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); + if (ctx) + ntfs_attr_init_search_ctx(ctx, ni, mrec); + return ctx; +} + +/** + * ntfs_attr_put_search_ctx - release an attribute search context + * @ctx: attribute search context to free + * + * Release the attribute search context @ctx, unmapping an associated extent + * mft record if present. + */ +void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + kmem_cache_free(ntfs_attr_ctx_cache, ctx); + return; +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to find + * + * Search for the attribute definition record corresponding to the attribute + * @type in the $AttrDef system file. + * + * Return the attribute type definition record if found and NULL if not found. + */ +static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, + const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + BUG_ON(!vol->attrdef); + BUG_ON(!type); + for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < + vol->attrdef_size && ad->type; ++ad) { + /* We have not found it yet, carry on searching. */ + if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) + continue; + /* We found the attribute; return it. */ + if (likely(ad->type == type)) + return ad; + /* We have gone too far already. No point in continuing. */ + break; + } + /* Attribute not found. */ + ntfs_debug("Attribute type 0x%x not found in $AttrDef.", + le32_to_cpu(type)); + return NULL; +} + +/** + * ntfs_attr_size_bounds_check - check a size of an attribute type for validity + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * @size: size which to check + * + * Check whether the @size in bytes is valid for an attribute of @type on the + * ntfs volume @vol. This information is obtained from $AttrDef system file. + * + * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not + * listed in $AttrDef. + */ +int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, + const s64 size) +{ + ATTR_DEF *ad; + + BUG_ON(size < 0); + /* + * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not + * listed in $AttrDef. + */ + if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024)) + return -ERANGE; + /* Get the $AttrDef entry for the attribute @type. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Do the bounds check. */ + if (((sle64_to_cpu(ad->min_size) > 0) && + size < sle64_to_cpu(ad->min_size)) || + ((sle64_to_cpu(ad->max_size) > 0) && size > + sle64_to_cpu(ad->max_size))) + return -ERANGE; + return 0; +} + +/** + * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be non-resident. This information is obtained from $AttrDef system file. + * + * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and + * -ENOENT if the attribute is not listed in $AttrDef. + */ +int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + /* Find the attribute definition record in $AttrDef. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Check the flags and return the result. */ + if (ad->flags & ATTR_DEF_RESIDENT) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_can_be_resident - check if an attribute can be resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be resident. This information is derived from our ntfs knowledge and may + * not be completely accurate, especially when user defined attributes are + * present. Basically we allow everything to be resident except for index + * allocation and $EA attributes. + * + * Return 0 if the attribute is allowed to be non-resident and -EPERM if not. + * + * Warning: In the system file $MFT the attribute $Bitmap must be non-resident + * otherwise windows will not boot (blue screen of death)! We cannot + * check for this here as we do not know which inode's $Bitmap is + * being asked about so the caller needs to special case this. + */ +int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + if (type == AT_INDEX_ALLOCATION) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_record_resize - resize an attribute record + * @m: mft record containing attribute record + * @a: attribute record to resize + * @new_size: new size in bytes to which to resize the attribute record @a + * + * Resize the attribute record @a, i.e. the resident part of the attribute, in + * the mft record @m to @new_size bytes. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) +{ + ntfs_debug("Entering for new_size %u.", new_size); + /* Align to 8 bytes if it is not already done. */ + if (new_size & 7) + new_size = (new_size + 7) & ~7; + /* If the actual attribute length has changed, move things around. */ + if (new_size != le32_to_cpu(a->length)) { + u32 new_muse = le32_to_cpu(m->bytes_in_use) - + le32_to_cpu(a->length) + new_size; + /* Not enough space in this mft record. */ + if (new_muse > le32_to_cpu(m->bytes_allocated)) + return -ENOSPC; + /* Move attributes following @a to their new location. */ + memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length), + le32_to_cpu(m->bytes_in_use) - ((u8*)a - + (u8*)m) - le32_to_cpu(a->length)); + /* Adjust @m to reflect the change in used space. */ + m->bytes_in_use = cpu_to_le32(new_muse); + /* Adjust @a to reflect the new size. */ + if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length)) + a->length = cpu_to_le32(new_size); + } + return 0; +} + +/** + * ntfs_resident_attr_value_resize - resize the value of a resident attribute + * @m: mft record containing attribute record + * @a: attribute record whose value to resize + * @new_size: new size in bytes to which to resize the attribute value of @a + * + * Resize the value of the attribute @a in the mft record @m to @new_size bytes. + * If the value is made bigger, the newly allocated space is cleared. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size) +{ + u32 old_size; + + /* Resize the resident part of the attribute record. */ + if (ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + new_size)) + return -ENOSPC; + /* + * The resize succeeded! If we made the attribute value bigger, clear + * the area between the old size and @new_size. + */ + old_size = le32_to_cpu(a->data.resident.value_length); + if (new_size > old_size) + memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + old_size, 0, new_size - old_size); + /* Finally update the length of the attribute value. */ + a->data.resident.value_length = cpu_to_le32(new_size); + return 0; +} + +/** + * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute + * @ni: ntfs inode describing the attribute to convert + * @data_size: size of the resident data to copy to the non-resident attribute + * + * Convert the resident ntfs attribute described by the ntfs inode @ni to a + * non-resident one. + * + * @data_size must be equal to the attribute value size. This is needed since + * we need to know the size before we can map the mft record and our callers + * always know it. The reason we cannot simply read the size from the vfs + * inode i_size is that this is not necessarily uptodate. This happens when + * ntfs_attr_make_non_resident() is called in the ->truncate call path(s). + * + * Return 0 on success and -errno on error. The following error return codes + * are defined: + * -EPERM - The attribute is not allowed to be non-resident. + * -ENOMEM - Not enough memory. + * -ENOSPC - Not enough disk space. + * -EINVAL - Attribute not defined on the volume. + * -EIO - I/o error or other error. + * Note that -ENOSPC is also returned in the case that there is not enough + * space in the mft record to do the conversion. This can happen when the mft + * record is already very full. The caller is responsible for trying to make + * space in the mft record and trying again. FIXME: Do we need a separate + * error return code for this kind of -ENOSPC or is it always worth trying + * again in case the attribute may then fit in a resident state so no need to + * make it non-resident at all? Ho-hum... (AIA) + * + * NOTE to self: No changes in the attribute list are required to move from + * a resident to a non-resident attribute. + * + * Locking: - The caller must hold i_mutex on the inode. + */ +int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) +{ + s64 new_size; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + struct page *page; + runlist_element *rl; + u8 *kaddr; + unsigned long flags; + int mp_size, mp_ofs, name_ofs, arec_size, err, err2; + u32 attr_size; + u8 old_res_attr_flags; + + /* Check that the attribute is allowed to be non-resident. */ + err = ntfs_attr_can_be_non_resident(vol, ni->type); + if (unlikely(err)) { + if (err == -EPERM) + ntfs_debug("Attribute is not allowed to be " + "non-resident."); + else + ntfs_debug("Attribute not defined on the NTFS " + "volume!"); + return err; + } + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + /* + * The size needs to be aligned to a cluster boundary for allocation + * purposes. + */ + new_size = (data_size + vol->cluster_size - 1) & + ~(vol->cluster_size - 1); + if (new_size > 0) { + /* + * Will need the page later and since the page lock nests + * outside all ntfs locks, we need to get the page now. + */ + page = find_or_create_page(vi->i_mapping, 0, + mapping_gfp_mask(vi->i_mapping)); + if (unlikely(!page)) + return -ENOMEM; + /* Start by allocating clusters to hold the attribute value. */ + rl = ntfs_cluster_alloc(vol, 0, new_size >> + vol->cluster_size_bits, -1, DATA_ZONE, true); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + ntfs_debug("Failed to allocate cluster%s, error code " + "%i.", (new_size >> + vol->cluster_size_bits) > 1 ? "s" : "", + err); + goto page_err_out; + } + } else { + rl = NULL; + page = NULL; + } + /* Determine the size of the mapping pairs array. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); + if (unlikely(mp_size < 0)) { + err = mp_size; + ntfs_debug("Failed to get size for mapping pairs array, error " + "code %i.", err); + goto rl_err_out; + } + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(NInoNonResident(ni)); + BUG_ON(a->non_resident); + /* + * Calculate new offsets for the name and the mapping pairs array. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + + sizeof(a->data.non_resident.compressed_size) + + 7) & ~7; + else + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + 7) & ~7; + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + /* + * Determine the size of the resident part of the now non-resident + * attribute record. + */ + arec_size = (mp_ofs + mp_size + 7) & ~7; + /* + * If the page is not uptodate bring it uptodate by copying from the + * attribute value. + */ + attr_size = le32_to_cpu(a->data.resident.value_length); + BUG_ON(attr_size != data_size); + if (page && !PageUptodate(page)) { + kaddr = kmap_atomic(page); + memcpy(kaddr, (u8*)a + + le16_to_cpu(a->data.resident.value_offset), + attr_size); + memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size); + kunmap_atomic(kaddr); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* Backup the attribute flag. */ + old_res_attr_flags = a->data.resident.flags; + /* Resize the resident part of the attribute record. */ + err = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err)) + goto err_out; + /* + * Convert the resident part of the attribute record to describe a + * non-resident attribute. + */ + a->non_resident = 1; + /* Move the attribute name if it exists and update the offset. */ + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + a->name_offset = cpu_to_le16(name_ofs); + /* Setup the fields specific to non-resident attributes. */ + a->data.non_resident.lowest_vcn = 0; + a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> + vol->cluster_size_bits); + a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); + memset(&a->data.non_resident.reserved, 0, + sizeof(a->data.non_resident.reserved)); + a->data.non_resident.allocated_size = cpu_to_sle64(new_size); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size = + cpu_to_sle64(attr_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + a->data.non_resident.compression_unit = 0; + if (NInoCompressed(ni) || vol->major_ver < 3) + a->data.non_resident.compression_unit = 4; + a->data.non_resident.compressed_size = + a->data.non_resident.allocated_size; + } else + a->data.non_resident.compression_unit = 0; + /* Generate the mapping pairs array into the attribute record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, + arec_size - mp_ofs, rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_debug("Failed to build mapping pairs, error code %i.", + err); + goto undo_err_out; + } + /* Setup the in-memory attribute structure to be non-resident. */ + ni->runlist.rl = rl; + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_size; + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size = ni->allocated_size; + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << (a->data. + non_resident.compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed.block_size) - + 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident.compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = ni->allocated_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * This needs to be last since the address space operations ->readpage + * and ->writepage can run concurrently with us as they are not + * serialized on i_mutex. Note, we are not allowed to fail once we flip + * this switch, which is another reason to do this last. + */ + NInoSetNonResident(ni); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + if (page) { + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } + ntfs_debug("Done."); + return 0; +undo_err_out: + /* Convert the attribute back into a resident attribute. */ + a->non_resident = 0; + /* Move the attribute name if it exists and update the offset. */ + name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + + sizeof(a->data.resident.reserved) + 7) & ~7; + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + a->name_offset = cpu_to_le16(name_ofs); + arec_size = (mp_ofs + attr_size + 7) & ~7; + /* Resize the resident part of the attribute record. */ + err2 = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err2)) { + /* + * This cannot happen (well if memory corruption is at work it + * could happen in theory), but deal with it as well as we can. + * If the old size is too small, truncate the attribute, + * otherwise simply give it a larger allocated size. + * FIXME: Should check whether chkdsk complains when the + * allocated size is much bigger than the resident value size. + */ + arec_size = le32_to_cpu(a->length); + if ((mp_ofs + attr_size) > arec_size) { + err2 = attr_size; + attr_size = arec_size - mp_ofs; + ntfs_error(vol->sb, "Failed to undo partial resident " + "to non-resident attribute " + "conversion. Truncating inode 0x%lx, " + "attribute type 0x%x from %i bytes to " + "%i bytes to maintain metadata " + "consistency. THIS MEANS YOU ARE " + "LOSING %i BYTES DATA FROM THIS %s.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err2, attr_size, err2 - attr_size, + ((ni->type == AT_DATA) && + !ni->name_len) ? "FILE": "ATTRIBUTE"); + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = attr_size; + i_size_write(vi, attr_size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + } + /* Setup the fields specific to resident attributes. */ + a->data.resident.value_length = cpu_to_le32(attr_size); + a->data.resident.value_offset = cpu_to_le16(mp_ofs); + a->data.resident.flags = old_res_attr_flags; + memset(&a->data.resident.reserved, 0, + sizeof(a->data.resident.reserved)); + /* Copy the data from the page back to the attribute value. */ + if (page) { + kaddr = kmap_atomic(page); + memcpy((u8*)a + mp_ofs, kaddr, attr_size); + kunmap_atomic(kaddr); + } + /* Setup the allocated size in the ntfs inode in case it changed. */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = arec_size - mp_ofs; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ni->runlist.rl = NULL; + up_write(&ni->runlist.lock); +rl_err_out: + if (rl) { + if (ntfs_cluster_free_from_rl(vol, rl) < 0) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl); +page_err_out: + unlock_page(page); + page_cache_release(page); + } + if (err == -EINVAL) + err = -EIO; + return err; +} + +/** + * ntfs_attr_extend_allocation - extend the allocated space of an attribute + * @ni: ntfs inode of the attribute whose allocation to extend + * @new_alloc_size: new size in bytes to which to extend the allocation to + * @new_data_size: new size in bytes to which to extend the data to + * @data_start: beginning of region which is required to be non-sparse + * + * Extend the allocated space of an attribute described by the ntfs inode @ni + * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be + * implemented as a hole in the file (as long as both the volume and the ntfs + * inode @ni have sparse support enabled). If @data_start is >= 0, then the + * region between the old allocated size and @data_start - 1 may be made sparse + * but the regions between @data_start and @new_alloc_size must be backed by + * actual clusters. + * + * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size + * of the attribute is extended to @new_data_size. Note that the i_size of the + * vfs inode is not updated. Only the data size in the base attribute record + * is updated. The caller has to update i_size separately if this is required. + * WARNING: It is a BUG() for @new_data_size to be smaller than the old data + * size as well as for @new_data_size to be greater than @new_alloc_size. + * + * For resident attributes this involves resizing the attribute record and if + * necessary moving it and/or other attributes into extent mft records and/or + * converting the attribute to a non-resident attribute which in turn involves + * extending the allocation of a non-resident attribute as described below. + * + * For non-resident attributes this involves allocating clusters in the data + * zone on the volume (except for regions that are being made sparse) and + * extending the run list to describe the allocated clusters as well as + * updating the mapping pairs array of the attribute. This in turn involves + * resizing the attribute record and if necessary moving it and/or other + * attributes into extent mft records and/or splitting the attribute record + * into multiple extent attribute records. + * + * Also, the attribute list attribute is updated if present and in some of the + * above cases (the ones where extent mft records/attributes come into play), + * an attribute list attribute is created if not already present. + * + * Return the new allocated size on success and -errno on error. In the case + * that an error is encountered but a partial extension at least up to + * @data_start (if present) is possible, the allocation is partially extended + * and this is returned. This means the caller must check the returned size to + * determine if the extension was partial. If @data_start is -1 then partial + * allocations are not performed. + * + * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. + * + * Locking: This function takes the runlist lock of @ni for writing as well as + * locking the mft record of the base ntfs inode. These locks are maintained + * throughout execution of the function. These locks are required so that the + * attribute can be resized safely and so that it can for example be converted + * from resident to non-resident safely. + * + * TODO: At present attribute list attribute handling is not implemented. + * + * TODO: At present it is not safe to call this function for anything other + * than the $DATA attribute(s) of an uncompressed and unencrypted file. + */ +s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start) +{ + VCN vcn; + s64 ll, allocated_size, start = data_start; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + runlist_element *rl, *rl2; + unsigned long flags; + int err, mp_size; + u32 attr_len = 0; /* Silence stupid gcc warning. */ + bool mp_rebuilt; + +#ifdef DEBUG + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_allocated_size 0x%llx, " + "new_allocated_size 0x%llx, new_data_size 0x%llx, " + "data_start 0x%llx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)allocated_size, + (unsigned long long)new_alloc_size, + (unsigned long long)new_data_size, + (unsigned long long)start); +#endif +retry_extend: + /* + * For non-resident attributes, @start and @new_size need to be aligned + * to cluster boundaries for allocation purposes. + */ + if (NInoNonResident(ni)) { + if (start > 0) + start &= ~(s64)vol->cluster_size_mask; + new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + } + BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); + /* Check if new size is allowed in $AttrDef. */ + err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); + if (unlikely(err)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the new " + "allocation would exceed the " + "maximum allowed size for " + "this attribute type.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } else { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because this " + "attribute type is not " + "defined on the NTFS volume. " + "Possible corruption! You " + "should run chkdsk!", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } + } + /* Translate error code to be POSIX conformant for write(2). */ + if (err == -ERANGE) + err = -EFBIG; + else + err = -EIO; + return err; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* + * We will be modifying both the runlist (if non-resident) and the mft + * record so lock them both down. + */ + down_write(&ni->runlist.lock); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If non-resident, seek to the last extent. If resident, there is + * only one extent, so seek to that. + */ + vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits : + 0; + /* + * Abort if someone did the work whilst we waited for the locks. If we + * just converted the attribute from resident to non-resident it is + * likely that exactly this has happened already. We cannot quite + * abort if we need to update the data size. + */ + if (unlikely(new_alloc_size <= allocated_size)) { + ntfs_debug("Allocated size already exceeds requested size."); + new_alloc_size = allocated_size; + if (new_data_size < 0) + goto done; + /* + * We want the first attribute extent so that we can update the + * data size. + */ + vcn = 0; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + /* Use goto to reduce indentation. */ + if (a->non_resident) + goto do_non_resident_extend; + BUG_ON(NInoNonResident(ni)); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + /* + * Extend the attribute record to be able to store the new attribute + * size. ntfs_attr_record_resize() will not do anything if the size is + * not changing. + */ + if (new_alloc_size < vol->mft_record_size && + !ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + + new_alloc_size)) { + /* The resize succeeded! */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + write_unlock_irqrestore(&ni->size_lock, flags); + if (new_data_size >= 0) { + BUG_ON(new_data_size < attr_len); + a->data.resident.value_length = + cpu_to_le32((u32)new_data_size); + } + goto flush_done; + } + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the extension process. + */ + err = ntfs_attr_make_non_resident(ni, attr_len); + if (likely(!err)) + goto retry_extend; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the conversion from resident " + "to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft " + "record/on disk for the non-resident " + "attribute value. This case is not " + "implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not " + "implemented yet."); + } + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_extend: + BUG_ON(!NInoNonResident(ni)); + if (new_alloc_size == allocated_size) { + BUG_ON(vcn); + goto alloc_done; + } + /* + * If the data starts after the end of the old allocation, this is a + * $DATA attribute and sparse attributes are enabled on the volume and + * for this inode, then create a sparse region between the old + * allocated size and the start of the data. Otherwise simply proceed + * with filling the whole space between the old allocated size and the + * new allocated size with clusters. + */ + if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA || + !NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) + goto skip_sparse; + // TODO: This is not implemented yet. We just fill in with real + // clusters for now... + ntfs_debug("Inserting holes is not-implemented yet. Falling back to " + "allocating real clusters instead."); +skip_sparse: + rl = ni->runlist.rl; + if (likely(rl)) { + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* If this attribute extent is not mapped, map it now. */ + if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED || + (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl && + (rl-1)->lcn == LCN_RL_NOT_MAPPED))) { + if (!rl && !allocated_size) + goto first_alloc; + rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the " + "mapping of a runlist " + "fragment failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err); + if (err != -ENOMEM) + err = -EIO; + goto err_out; + } + ni->runlist.rl = rl; + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* + * We now know the runlist of the last extent is mapped and @rl is at + * the end of the runlist. We want to begin allocating clusters + * starting at the last allocated cluster to reduce fragmentation. If + * there are no valid LCNs in the attribute we let the cluster + * allocator choose the starting cluster. + */ + /* If the last LCN is a hole or simillar seek back to last real LCN. */ + while (rl->lcn < 0 && rl > ni->runlist.rl) + rl--; +first_alloc: + // FIXME: Need to implement partial allocations so at least part of the + // write can be performed when start >= 0. (Needed for POSIX write(2) + // conformance.) + rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits, + (new_alloc_size - allocated_size) >> + vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ? + rl->lcn + rl->length : -1, DATA_ZONE, true); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the allocation of clusters " + "failed with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM && err != -ENOSPC) + err = -EIO; + goto err_out; + } + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the runlist merge failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + goto err_out; + } + ni->runlist.rl = rl; + ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size - + allocated_size) >> vol->cluster_size_bits); + /* Find the runlist element with which the attribute extent starts. */ + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, ll); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + mp_rebuilt = false; + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + err = mp_size; + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because determining the size for the " + "mapping pairs failed with error code " + "%i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + goto undo_alloc; + } + /* Extend the attribute record to fit the bigger mapping pairs array. */ + attr_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record, + // possibly by extending this extent partially and filling it + // and creating a new extent for the remainder, or by making + // other attributes non-resident and/or by moving other + // attributes out of this mft record. + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(err)) { + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because building the mapping pairs " + "failed with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + /* + * We now have extended the allocated size of the attribute. Reflect + * this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto restore_undo_alloc; + /* @m is not used any more so no need to set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + /* + * FIXME: This would fail if @ni is a directory, $MFT, or an index, + * since those can have sparse/compressed set. For example can be + * set compressed even though it is not compressed itself and in that + * case the bit means that files are to be created compressed in the + * directory... At present this is ok as this code is only called for + * regular files, and only for their $DATA attribute(s). + * FIXME: The calculation is wrong if we created a hole above. For now + * it does not matter as we never create holes. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - allocated_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); +alloc_done: + if (new_data_size >= 0) { + BUG_ON(new_data_size < + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_data_size); + } +flush_done: + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + ntfs_debug("Done, new_allocated_size 0x%llx.", + (unsigned long long)new_alloc_size); + return new_alloc_size; +restore_undo_alloc: + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot complete extension of allocation " + "of inode 0x%lx, attribute type 0x%x, because " + "lookup of first attribute extent failed with " + "error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err == -ENOENT) + err = -EIO; + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, + allocated_size >> vol->cluster_size_bits, NULL, 0, + ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "attribute in error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + /* + * FIXME: This would fail if @ni is a directory... See above. + * FIXME: The calculation is wrong if we created a hole above. + * For now it does not matter as we never create holes. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - + allocated_size; + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * The only thing that is now wrong is the allocated size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return err; + } + ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( + (allocated_size >> vol->cluster_size_bits) - 1); +undo_alloc: + ll = allocated_size >> vol->cluster_size_bits; + if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to release allocated cluster(s) " + "in error code path. Run chkdsk to recover " + "the lost cluster(s)."); + NVolSetErrors(vol); + } + m = ctx->mrec; + a = ctx->attr; + /* + * If the runlist truncation fails and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to %s in error code path. Run " + "chkdsk to recover.", IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist"); + NVolSetErrors(vol); + } else if (mp_rebuilt) { + if (ntfs_attr_record_resize(m, a, attr_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident. + mapping_pairs_offset), attr_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), rl2, ll, -1, + NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +conv_err_out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +/** + * ntfs_attr_set - fill (a part of) an attribute with a byte + * @ni: ntfs inode describing the attribute to fill + * @ofs: offset inside the attribute at which to start to fill + * @cnt: number of bytes to fill + * @val: the unsigned 8-bit value with which to fill the attribute + * + * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at + * byte offset @ofs inside the attribute with the constant byte @val. + * + * This function is effectively like memset() applied to an ntfs attribute. + * Note thie function actually only operates on the page cache pages belonging + * to the ntfs attribute and it marks them dirty after doing the memset(). + * Thus it relies on the vm dirty page write code paths to cause the modified + * pages to be written to the mft record/disk. + * + * Return 0 on success and -errno on error. An error code of -ESPIPE means + * that @ofs + @cnt were outside the end of the attribute and no write was + * performed. + */ +int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) +{ + ntfs_volume *vol = ni->vol; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + pgoff_t idx, end; + unsigned start_ofs, end_ofs, size; + + ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", + (long long)ofs, (long long)cnt, val); + BUG_ON(ofs < 0); + BUG_ON(cnt < 0); + if (!cnt) + goto done; + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + mapping = VFS_I(ni)->i_mapping; + /* Work out the starting index and page offset. */ + idx = ofs >> PAGE_CACHE_SHIFT; + start_ofs = ofs & ~PAGE_CACHE_MASK; + /* Work out the ending index and page offset. */ + end = ofs + cnt; + end_ofs = end & ~PAGE_CACHE_MASK; + /* If the end is outside the inode size return -ESPIPE. */ + if (unlikely(end > i_size_read(VFS_I(ni)))) { + ntfs_error(vol->sb, "Request exceeds end of attribute."); + return -ESPIPE; + } + end >>= PAGE_CACHE_SHIFT; + /* If there is a first partial page, need to do it the slow way. */ + if (start_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read first partial " + "page (error, index 0x%lx).", idx); + return PTR_ERR(page); + } + /* + * If the last page is the same as the first page, need to + * limit the write to the end offset. + */ + size = PAGE_CACHE_SIZE; + if (idx == end) + size = end_ofs; + kaddr = kmap_atomic(page); + memset(kaddr + start_ofs, val, size - start_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + set_page_dirty(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + if (idx == end) + goto done; + idx++; + } + /* Do the whole pages the fast way. */ + for (; idx < end; idx++) { + /* Find or create the current page. (The page is locked.) */ + page = grab_cache_page(mapping, idx); + if (unlikely(!page)) { + ntfs_error(vol->sb, "Insufficient memory to grab " + "page (index 0x%lx).", idx); + return -ENOMEM; + } + kaddr = kmap_atomic(page); + memset(kaddr, val, PAGE_CACHE_SIZE); + flush_dcache_page(page); + kunmap_atomic(kaddr); + /* + * If the page has buffers, mark them uptodate since buffer + * state and not page state is definitive in 2.6 kernels. + */ + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + set_buffer_uptodate(bh); + } while ((bh = bh->b_this_page) != head); + } + /* Now that buffers are uptodate, set the page uptodate, too. */ + SetPageUptodate(page); + /* + * Set the page and all its buffers dirty and mark the inode + * dirty, too. The VM will write the page later on. + */ + set_page_dirty(page); + /* Finally unlock and release the page. */ + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + /* If there is a last partial page, need to do it the slow way. */ + if (end_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read last partial page " + "(error, index 0x%lx).", idx); + return PTR_ERR(page); + } + kaddr = kmap_atomic(page); + memset(kaddr, val, end_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + set_page_dirty(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } +done: + ntfs_debug("Done."); + return 0; +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/attrib.h b/kernel/fs/ntfs/attrib.h new file mode 100644 index 000000000..3c8b74c99 --- /dev/null +++ b/kernel/fs/ntfs/attrib.h @@ -0,0 +1,116 @@ +/* + * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_ATTRIB_H +#define _LINUX_NTFS_ATTRIB_H + +#include "endian.h" +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +/** + * ntfs_attr_search_ctx - used in attribute search functions + * @mrec: buffer containing mft record to search + * @attr: attribute record in @mrec where to begin/continue search + * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after + * + * Structure must be initialized to zero before the first call to one of the + * attribute search functions. Initialize @mrec to point to the mft record to + * search, and @attr to point to the first attribute within @mrec (not necessary + * if calling the _first() functions), and set @is_first to 'true' (not necessary + * if calling the _first() functions). + * + * If @is_first is 'true', the search begins with @attr. If @is_first is 'false', + * the search begins after @attr. This is so that, after the first call to one + * of the search attribute functions, we can call the function again, without + * any modification of the search context, to automagically get the next + * matching attribute. + */ +typedef struct { + MFT_RECORD *mrec; + ATTR_RECORD *attr; + bool is_first; + ntfs_inode *ntfs_ino; + ATTR_LIST_ENTRY *al_entry; + ntfs_inode *base_ntfs_ino; + MFT_RECORD *base_mrec; + ATTR_RECORD *base_attr; +} ntfs_attr_search_ctx; + +extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, + ntfs_attr_search_ctx *ctx); +extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); + +extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked); + +extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, + const VCN vcn, ntfs_attr_search_ctx *ctx); + +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx); + +extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start, + const s64 size, const s64 initialized_size); + +static inline s64 ntfs_attr_size(const ATTR_RECORD *a) +{ + if (!a->non_resident) + return (s64)le32_to_cpu(a->data.resident.value_length); + return sle64_to_cpu(a->data.non_resident.data_size); +} + +extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx); +extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, + MFT_RECORD *mrec); +extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); + +#ifdef NTFS_RW + +extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, + const ATTR_TYPE type, const s64 size); +extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, + const ATTR_TYPE type); +extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, + const ATTR_TYPE type); + +extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); +extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size); + +extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size); + +extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start); + +extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, + const u8 val); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_ATTRIB_H */ diff --git a/kernel/fs/ntfs/bitmap.c b/kernel/fs/ntfs/bitmap.c new file mode 100644 index 000000000..0809cf876 --- /dev/null +++ b/kernel/fs/ntfs/bitmap.c @@ -0,0 +1,193 @@ +/* + * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include + +#include "bitmap.h" +#include "debug.h" +#include "aops.h" +#include "ntfs.h" + +/** + * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * @value: value to set the bits to (i.e. 0 or 1) + * @is_rollback: if 'true' this is a rollback operation + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi to @value, where @value is either 0 or 1. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback) +{ + s64 cnt = count; + pgoff_t index, end_index; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + int pos, len; + u8 bit; + + BUG_ON(!vi); + ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " + "value %u.%s", vi->i_ino, (unsigned long long)start_bit, + (unsigned long long)cnt, (unsigned int)value, + is_rollback ? " (rollback)" : ""); + BUG_ON(start_bit < 0); + BUG_ON(cnt < 0); + BUG_ON(value > 1); + /* + * Calculate the indices for the pages containing the first and last + * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. + */ + index = start_bit >> (3 + PAGE_CACHE_SHIFT); + end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT); + + /* Get the page containing the first bit (@start_bit). */ + mapping = vi->i_mapping; + page = ntfs_map_page(mapping, index); + if (IS_ERR(page)) { + if (!is_rollback) + ntfs_error(vi->i_sb, "Failed to map first page (error " + "%li), aborting.", PTR_ERR(page)); + return PTR_ERR(page); + } + kaddr = page_address(page); + + /* Set @pos to the position of the byte containing @start_bit. */ + pos = (start_bit >> 3) & ~PAGE_CACHE_MASK; + + /* Calculate the position of @start_bit in the first byte. */ + bit = start_bit & 7; + + /* If the first byte is partial, modify the appropriate bits in it. */ + if (bit) { + u8 *byte = kaddr + pos; + while ((bit & 7) && cnt) { + cnt--; + if (value) + *byte |= 1 << bit++; + else + *byte &= ~(1 << bit++); + } + /* If we are done, unmap the page and return success. */ + if (!cnt) + goto done; + + /* Update @pos to the new position. */ + pos++; + } + /* + * Depending on @value, modify all remaining whole bytes in the page up + * to @cnt. + */ + len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos); + memset(kaddr + pos, value ? 0xff : 0, len); + cnt -= len << 3; + + /* Update @len to point to the first not-done byte in the page. */ + if (cnt < 8) + len += pos; + + /* If we are not in the last page, deal with all subsequent pages. */ + while (index < end_index) { + BUG_ON(cnt <= 0); + + /* Update @index and get the next page. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, ++index); + if (IS_ERR(page)) + goto rollback; + kaddr = page_address(page); + /* + * Depending on @value, modify all remaining whole bytes in the + * page up to @cnt. + */ + len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE); + memset(kaddr, value ? 0xff : 0, len); + cnt -= len << 3; + } + /* + * The currently mapped page is the last one. If the last byte is + * partial, modify the appropriate bits in it. Note, @len is the + * position of the last byte inside the page. + */ + if (cnt) { + u8 *byte; + + BUG_ON(cnt > 7); + + bit = cnt; + byte = kaddr + len; + while (bit--) { + if (value) + *byte |= 1 << bit; + else + *byte &= ~(1 << bit); + } + } +done: + /* We are done. Unmap the page and return success. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +rollback: + /* + * Current state: + * - no pages are mapped + * - @count - @cnt is the number of bits that have been modified + */ + if (is_rollback) + return PTR_ERR(page); + if (count != cnt) + pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, + value ? 0 : 1, true); + else + pos = 0; + if (!pos) { + /* Rollback was successful. */ + ntfs_error(vi->i_sb, "Failed to map subsequent page (error " + "%li), aborting.", PTR_ERR(page)); + } else { + /* Rollback failed. */ + ntfs_error(vi->i_sb, "Failed to map subsequent page (error " + "%li) and rollback failed (error %i). " + "Aborting and leaving inconsistent metadata. " + "Unmount and run chkdsk.", PTR_ERR(page), pos); + NVolSetErrors(NTFS_SB(vi->i_sb)); + } + return PTR_ERR(page); +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/bitmap.h b/kernel/fs/ntfs/bitmap.h new file mode 100644 index 000000000..72c9ad8be --- /dev/null +++ b/kernel/fs/ntfs/bitmap.h @@ -0,0 +1,118 @@ +/* + * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_BITMAP_H +#define _LINUX_NTFS_BITMAP_H + +#ifdef NTFS_RW + +#include + +#include "types.h" + +extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback); + +/** + * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * @value: value to set the bits to (i.e. 0 or 1) + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi to @value, where @value is either 0 or 1. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi, + const s64 start_bit, const s64 count, const u8 value) +{ + return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value, + false); +} + +/** + * ntfs_bitmap_set_run - set a run of bits in a bitmap + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit, + const s64 count) +{ + return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1); +} + +/** + * ntfs_bitmap_clear_run - clear a run of bits in a bitmap + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to clear + * @count: number of bits to clear + * + * Clear @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit, + const s64 count) +{ + return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0); +} + +/** + * ntfs_bitmap_set_bit - set a bit in a bitmap + * @vi: vfs inode describing the bitmap + * @bit: bit to set + * + * Set bit @bit in the bitmap described by the vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit) +{ + return ntfs_bitmap_set_run(vi, bit, 1); +} + +/** + * ntfs_bitmap_clear_bit - clear a bit in a bitmap + * @vi: vfs inode describing the bitmap + * @bit: bit to clear + * + * Clear bit @bit in the bitmap described by the vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit) +{ + return ntfs_bitmap_clear_run(vi, bit, 1); +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_BITMAP_H */ diff --git a/kernel/fs/ntfs/collate.c b/kernel/fs/ntfs/collate.c new file mode 100644 index 000000000..4a28ab389 --- /dev/null +++ b/kernel/fs/ntfs/collate.c @@ -0,0 +1,124 @@ +/* + * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "collate.h" +#include "debug.h" +#include "ntfs.h" + +static int ntfs_collate_binary(ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + + ntfs_debug("Entering."); + rc = memcmp(data1, data2, min(data1_len, data2_len)); + if (!rc && (data1_len != data2_len)) { + if (data1_len < data2_len) + rc = -1; + else + rc = 1; + } + ntfs_debug("Done, returning %i", rc); + return rc; +} + +static int ntfs_collate_ntofs_ulong(ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + u32 d1, d2; + + ntfs_debug("Entering."); + // FIXME: We don't really want to bug here. + BUG_ON(data1_len != data2_len); + BUG_ON(data1_len != 4); + d1 = le32_to_cpup(data1); + d2 = le32_to_cpup(data2); + if (d1 < d2) + rc = -1; + else { + if (d1 == d2) + rc = 0; + else + rc = 1; + } + ntfs_debug("Done, returning %i", rc); + return rc; +} + +typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int, + const void *, const int); + +static ntfs_collate_func_t ntfs_do_collate0x0[3] = { + ntfs_collate_binary, + NULL/*ntfs_collate_file_name*/, + NULL/*ntfs_collate_unicode_string*/, +}; + +static ntfs_collate_func_t ntfs_do_collate0x1[4] = { + ntfs_collate_ntofs_ulong, + NULL/*ntfs_collate_ntofs_sid*/, + NULL/*ntfs_collate_ntofs_security_hash*/, + NULL/*ntfs_collate_ntofs_ulongs*/, +}; + +/** + * ntfs_collate - collate two data items using a specified collation rule + * @vol: ntfs volume to which the data items belong + * @cr: collation rule to use when comparing the items + * @data1: first data item to collate + * @data1_len: length in bytes of @data1 + * @data2: second data item to collate + * @data2_len: length in bytes of @data2 + * + * Collate the two data items @data1 and @data2 using the collation rule @cr + * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before, + * to match, or to collate after @data2. + * + * For speed we use the collation rule @cr as an index into two tables of + * function pointers to call the appropriate collation function. + */ +int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, + const void *data1, const int data1_len, + const void *data2, const int data2_len) { + int i; + + ntfs_debug("Entering."); + /* + * FIXME: At the moment we only support COLLATION_BINARY and + * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now. + */ + BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG); + i = le32_to_cpu(cr); + BUG_ON(i < 0); + if (i <= 0x02) + return ntfs_do_collate0x0[i](vol, data1, data1_len, + data2, data2_len); + BUG_ON(i < 0x10); + i -= 0x10; + if (likely(i <= 3)) + return ntfs_do_collate0x1[i](vol, data1, data1_len, + data2, data2_len); + BUG(); + return 0; +} diff --git a/kernel/fs/ntfs/collate.h b/kernel/fs/ntfs/collate.h new file mode 100644 index 000000000..aba83347e --- /dev/null +++ b/kernel/fs/ntfs/collate.h @@ -0,0 +1,50 @@ +/* + * collate.h - Defines for NTFS kernel collation handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_COLLATE_H +#define _LINUX_NTFS_COLLATE_H + +#include "types.h" +#include "volume.h" + +static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) { + int i; + + /* + * FIXME: At the moment we only support COLLATION_BINARY and + * COLLATION_NTOFS_ULONG, so we return false for everything else for + * now. + */ + if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG)) + return false; + i = le32_to_cpu(cr); + if (likely(((i >= 0) && (i <= 0x02)) || + ((i >= 0x10) && (i <= 0x13)))) + return true; + return false; +} + +extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, + const void *data1, const int data1_len, + const void *data2, const int data2_len); + +#endif /* _LINUX_NTFS_COLLATE_H */ diff --git a/kernel/fs/ntfs/compress.c b/kernel/fs/ntfs/compress.c new file mode 100644 index 000000000..f82498c35 --- /dev/null +++ b/kernel/fs/ntfs/compress.c @@ -0,0 +1,969 @@ +/** + * compress.c - NTFS kernel compressed attributes handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "inode.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_compression_constants - enum of constants used in the compression code + */ +typedef enum { + /* Token types and access mask. */ + NTFS_SYMBOL_TOKEN = 0, + NTFS_PHRASE_TOKEN = 1, + NTFS_TOKEN_MASK = 1, + + /* Compression sub-block constants. */ + NTFS_SB_SIZE_MASK = 0x0fff, + NTFS_SB_SIZE = 0x1000, + NTFS_SB_IS_COMPRESSED = 0x8000, + + /* + * The maximum compression block size is by definition 16 * the cluster + * size, with the maximum supported cluster size being 4kiB. Thus the + * maximum compression buffer size is 64kiB, so we use this when + * initializing the compression buffer. + */ + NTFS_MAX_CB_SIZE = 64 * 1024, +} ntfs_compression_constants; + +/** + * ntfs_compression_buffer - one buffer for the decompression engine + */ +static u8 *ntfs_compression_buffer; + +/** + * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer + */ +static DEFINE_SPINLOCK(ntfs_cb_lock); + +/** + * allocate_compression_buffers - allocate the decompression buffers + * + * Caller has to hold the ntfs_lock mutex. + * + * Return 0 on success or -ENOMEM if the allocations failed. + */ +int allocate_compression_buffers(void) +{ + BUG_ON(ntfs_compression_buffer); + + ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); + if (!ntfs_compression_buffer) + return -ENOMEM; + return 0; +} + +/** + * free_compression_buffers - free the decompression buffers + * + * Caller has to hold the ntfs_lock mutex. + */ +void free_compression_buffers(void) +{ + BUG_ON(!ntfs_compression_buffer); + vfree(ntfs_compression_buffer); + ntfs_compression_buffer = NULL; +} + +/** + * zero_partial_compressed_page - zero out of bounds compressed page region + */ +static void zero_partial_compressed_page(struct page *page, + const s64 initialized_size) +{ + u8 *kp = page_address(page); + unsigned int kp_ofs; + + ntfs_debug("Zeroing page region outside initialized size."); + if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) { + /* + * FIXME: Using clear_page() will become wrong when we get + * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem. + */ + clear_page(kp); + return; + } + kp_ofs = initialized_size & ~PAGE_CACHE_MASK; + memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs); + return; +} + +/** + * handle_bounds_compressed_page - test for&handle out of bounds compressed page + */ +static inline void handle_bounds_compressed_page(struct page *page, + const loff_t i_size, const s64 initialized_size) +{ + if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) && + (initialized_size < i_size)) + zero_partial_compressed_page(page, initialized_size); + return; +} + +/** + * ntfs_decompress - decompress a compression block into an array of pages + * @dest_pages: destination array of pages + * @dest_index: current index into @dest_pages (IN/OUT) + * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT) + * @dest_max_index: maximum index into @dest_pages (IN) + * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN) + * @xpage: the target page (-1 if none) (IN) + * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT) + * @cb_start: compression block to decompress (IN) + * @cb_size: size of compression block @cb_start in bytes (IN) + * @i_size: file size when we started the read (IN) + * @initialized_size: initialized file size when we started the read (IN) + * + * The caller must have disabled preemption. ntfs_decompress() reenables it when + * the critical section is finished. + * + * This decompresses the compression block @cb_start into the array of + * destination pages @dest_pages starting at index @dest_index into @dest_pages + * and at offset @dest_pos into the page @dest_pages[@dest_index]. + * + * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1. + * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified. + * + * @cb_start is a pointer to the compression block which needs decompressing + * and @cb_size is the size of @cb_start in bytes (8-64kiB). + * + * Return 0 if success or -EOVERFLOW on error in the compressed stream. + * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was + * completed during the decompression of the compression block (@cb_start). + * + * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up + * unpredicatbly! You have been warned! + * + * Note to hackers: This function may not sleep until it has finished accessing + * the compression block @cb_start as it is a per-CPU buffer. + */ +static int ntfs_decompress(struct page *dest_pages[], int *dest_index, + int *dest_ofs, const int dest_max_index, const int dest_max_ofs, + const int xpage, char *xpage_done, u8 *const cb_start, + const u32 cb_size, const loff_t i_size, + const s64 initialized_size) +{ + /* + * Pointers into the compressed data, i.e. the compression block (cb), + * and the therein contained sub-blocks (sb). + */ + u8 *cb_end = cb_start + cb_size; /* End of cb. */ + u8 *cb = cb_start; /* Current position in cb. */ + u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ + + /* Variables for uncompressed data / destination. */ + struct page *dp; /* Current destination page being worked on. */ + u8 *dp_addr; /* Current pointer into dp. */ + u8 *dp_sb_start; /* Start of current sub-block in dp. */ + u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + + NTFS_SB_SIZE). */ + u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ + u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + + NTFS_SB_SIZE). */ + + /* Variables for tag and token parsing. */ + u8 tag; /* Current tag. */ + int token; /* Loop counter for the eight tokens in tag. */ + + /* Need this because we can't sleep, so need two stages. */ + int completed_pages[dest_max_index - *dest_index + 1]; + int nr_completed_pages = 0; + + /* Default error code. */ + int err = -EOVERFLOW; + + ntfs_debug("Entering, cb_size = 0x%x.", cb_size); +do_next_sb: + ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", + cb - cb_start); + /* + * Have we reached the end of the compression block or the end of the + * decompressed data? The latter can happen for example if the current + * position in the compression block is one byte before its end so the + * first two checks do not detect it. + */ + if (cb == cb_end || !le16_to_cpup((le16*)cb) || + (*dest_index == dest_max_index && + *dest_ofs == dest_max_ofs)) { + int i; + + ntfs_debug("Completed. Returning success (0)."); + err = 0; +return_error: + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize completed pages. */ + if (nr_completed_pages > 0) { + for (i = 0; i < nr_completed_pages; i++) { + int di = completed_pages[i]; + + dp = dest_pages[di]; + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(dp, i_size, + initialized_size); + flush_dcache_page(dp); + kunmap(dp); + SetPageUptodate(dp); + unlock_page(dp); + if (di == xpage) + *xpage_done = 1; + else + page_cache_release(dp); + dest_pages[di] = NULL; + } + } + return err; + } + + /* Setup offsets for the current sub-block destination. */ + do_sb_start = *dest_ofs; + do_sb_end = do_sb_start + NTFS_SB_SIZE; + + /* Check that we are still within allowed boundaries. */ + if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) + goto return_overflow; + + /* Does the minimum size of a compressed sb overflow valid range? */ + if (cb + 6 > cb_end) + goto return_overflow; + + /* Setup the current sub-block source pointers and validate range. */ + cb_sb_start = cb; + cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + + 3; + if (cb_sb_end > cb_end) + goto return_overflow; + + /* Get the current destination page. */ + dp = dest_pages[*dest_index]; + if (!dp) { + /* No page present. Skip decompression of this sub-block. */ + cb = cb_sb_end; + + /* Advance destination position to next sub-block. */ + *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK; + if (!*dest_ofs && (++*dest_index > dest_max_index)) + goto return_overflow; + goto do_next_sb; + } + + /* We have a valid destination page. Setup the destination pointers. */ + dp_addr = (u8*)page_address(dp) + do_sb_start; + + /* Now, we are ready to process the current sub-block (sb). */ + if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { + ntfs_debug("Found uncompressed sub-block."); + /* This sb is not compressed, just copy it into destination. */ + + /* Advance source position to first data byte. */ + cb += 2; + + /* An uncompressed sb must be full size. */ + if (cb_sb_end - cb != NTFS_SB_SIZE) + goto return_overflow; + + /* Copy the block and advance the source position. */ + memcpy(dp_addr, cb, NTFS_SB_SIZE); + cb += NTFS_SB_SIZE; + + /* Advance destination position to next sub-block. */ + *dest_ofs += NTFS_SB_SIZE; + if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) { +finalize_page: + /* + * First stage: add current page index to array of + * completed pages. + */ + completed_pages[nr_completed_pages++] = *dest_index; + if (++*dest_index > dest_max_index) + goto return_overflow; + } + goto do_next_sb; + } + ntfs_debug("Found compressed sub-block."); + /* This sb is compressed, decompress it into destination. */ + + /* Setup destination pointers. */ + dp_sb_start = dp_addr; + dp_sb_end = dp_sb_start + NTFS_SB_SIZE; + + /* Forward to the first tag in the sub-block. */ + cb += 2; +do_next_tag: + if (cb == cb_sb_end) { + /* Check if the decompressed sub-block was not full-length. */ + if (dp_addr < dp_sb_end) { + int nr_bytes = do_sb_end - *dest_ofs; + + ntfs_debug("Filling incomplete sub-block with " + "zeroes."); + /* Zero remainder and update destination position. */ + memset(dp_addr, 0, nr_bytes); + *dest_ofs += nr_bytes; + } + /* We have finished the current sub-block. */ + if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) + goto finalize_page; + goto do_next_sb; + } + + /* Check we are still in range. */ + if (cb > cb_sb_end || dp_addr > dp_sb_end) + goto return_overflow; + + /* Get the next tag and advance to first token. */ + tag = *cb++; + + /* Parse the eight tokens described by the tag. */ + for (token = 0; token < 8; token++, tag >>= 1) { + u16 lg, pt, length, max_non_overlap; + register u16 i; + u8 *dp_back_addr; + + /* Check if we are done / still in range. */ + if (cb >= cb_sb_end || dp_addr > dp_sb_end) + break; + + /* Determine token type and parse appropriately.*/ + if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { + /* + * We have a symbol token, copy the symbol across, and + * advance the source and destination positions. + */ + *dp_addr++ = *cb++; + ++*dest_ofs; + + /* Continue with the next token. */ + continue; + } + + /* + * We have a phrase token. Make sure it is not the first tag in + * the sb as this is illegal and would confuse the code below. + */ + if (dp_addr == dp_sb_start) + goto return_overflow; + + /* + * Determine the number of bytes to go back (p) and the number + * of bytes to copy (l). We use an optimized algorithm in which + * we first calculate log2(current destination position in sb), + * which allows determination of l and p in O(1) rather than + * O(n). We just need an arch-optimized log2() function now. + */ + lg = 0; + for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) + lg++; + + /* Get the phrase token into i. */ + pt = le16_to_cpup((le16*)cb); + + /* + * Calculate starting position of the byte sequence in + * the destination using the fact that p = (pt >> (12 - lg)) + 1 + * and make sure we don't go too far back. + */ + dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; + if (dp_back_addr < dp_sb_start) + goto return_overflow; + + /* Now calculate the length of the byte sequence. */ + length = (pt & (0xfff >> lg)) + 3; + + /* Advance destination position and verify it is in range. */ + *dest_ofs += length; + if (*dest_ofs > do_sb_end) + goto return_overflow; + + /* The number of non-overlapping bytes. */ + max_non_overlap = dp_addr - dp_back_addr; + + if (length <= max_non_overlap) { + /* The byte sequence doesn't overlap, just copy it. */ + memcpy(dp_addr, dp_back_addr, length); + + /* Advance destination pointer. */ + dp_addr += length; + } else { + /* + * The byte sequence does overlap, copy non-overlapping + * part and then do a slow byte by byte copy for the + * overlapping part. Also, advance the destination + * pointer. + */ + memcpy(dp_addr, dp_back_addr, max_non_overlap); + dp_addr += max_non_overlap; + dp_back_addr += max_non_overlap; + length -= max_non_overlap; + while (length--) + *dp_addr++ = *dp_back_addr++; + } + + /* Advance source position and continue with the next token. */ + cb += 2; + } + + /* No tokens left in the current tag. Continue with the next tag. */ + goto do_next_tag; + +return_overflow: + ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); + goto return_error; +} + +/** + * ntfs_read_compressed_block - read a compressed block into the page cache + * @page: locked page in the compression block(s) we need to read + * + * When we are called the page has already been verified to be locked and the + * attribute is known to be non-resident, not encrypted, but compressed. + * + * 1. Determine which compression block(s) @page is in. + * 2. Get hold of all pages corresponding to this/these compression block(s). + * 3. Read the (first) compression block. + * 4. Decompress it into the corresponding pages. + * 5. Throw the compressed data away and proceed to 3. for the next compression + * block or return success if no more compression blocks left. + * + * Warning: We have to be careful what we do about existing pages. They might + * have been written to so that we would lose data if we were to just overwrite + * them with the out-of-date uncompressed data. + * + * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at + * the end of the file I think. We need to detect this case and zero the out + * of bounds remainder of the page in question and mark it as handled. At the + * moment we would just return -EIO on such a page. This bug will only become + * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte + * clusters so is probably not going to be seen by anyone. Still this should + * be fixed. (AIA) + * + * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in + * handling sparse and compressed cbs. (AIA) + * + * FIXME: At the moment we don't do any zeroing out in the case that + * initialized_size is less than data_size. This should be safe because of the + * nature of the compression algorithm used. Just in case we check and output + * an error message in read inode if the two sizes are not equal for a + * compressed file. (AIA) + */ +int ntfs_read_compressed_block(struct page *page) +{ + loff_t i_size; + s64 initialized_size; + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags, block_size = sb->s_blocksize; + unsigned char block_size_bits = sb->s_blocksize_bits; + u8 *cb, *cb_pos, *cb_end; + struct buffer_head **bhs; + unsigned long offset, index = page->index; + u32 cb_size = ni->itype.compressed.block_size; + u64 cb_size_mask = cb_size - 1UL; + VCN vcn; + LCN lcn; + /* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */ + VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >> + vol->cluster_size_bits; + /* + * The first vcn after the last wanted vcn (minimum alignment is again + * PAGE_CACHE_SIZE. + */ + VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1) + & ~cb_size_mask) >> vol->cluster_size_bits; + /* Number of compression blocks (cbs) in the wanted vcn range. */ + unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits + >> ni->itype.compressed.block_size_bits; + /* + * Number of pages required to store the uncompressed data from all + * compression blocks (cbs) overlapping @page. Due to alignment + * guarantees of start_vcn and end_vcn, no need to round up here. + */ + unsigned int nr_pages = (end_vcn - start_vcn) << + vol->cluster_size_bits >> PAGE_CACHE_SHIFT; + unsigned int xpage, max_page, cur_page, cur_ofs, i; + unsigned int cb_clusters, cb_max_ofs; + int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; + struct page **pages; + unsigned char xpage_done = 0; + + ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " + "%i.", index, cb_size, nr_pages); + /* + * Bad things happen if we get here for anything that is not an + * unnamed $DATA attribute. + */ + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_NOFS); + + /* Allocate memory to store the buffer heads we need. */ + bhs_size = cb_size / block_size * sizeof(struct buffer_head *); + bhs = kmalloc(bhs_size, GFP_NOFS); + + if (unlikely(!pages || !bhs)) { + kfree(bhs); + kfree(pages); + unlock_page(page); + ntfs_error(vol->sb, "Failed to allocate internal buffers."); + return -ENOMEM; + } + + /* + * We have already been given one page, this is the one we must do. + * Once again, the alignment guarantees keep it simple. + */ + offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT; + xpage = index - offset; + pages[xpage] = page; + /* + * The remaining pages need to be allocated and inserted into the page + * cache, alignment guarantees keep all the below much simpler. (-8 + */ + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(VFS_I(ni)); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + offset; + /* Is the page fully outside i_size? (truncate in progress) */ + if (xpage >= max_page) { + kfree(bhs); + kfree(pages); + zero_user(page, 0, PAGE_CACHE_SIZE); + ntfs_debug("Compressed read outside i_size - truncated?"); + SetPageUptodate(page); + unlock_page(page); + return 0; + } + if (nr_pages < max_page) + max_page = nr_pages; + for (i = 0; i < max_page; i++, offset++) { + if (i != xpage) + pages[i] = grab_cache_page_nowait(mapping, offset); + page = pages[i]; + if (page) { + /* + * We only (re)read the page if it isn't already read + * in and/or dirty or we would be losing data or at + * least wasting our time. + */ + if (!PageDirty(page) && (!PageUptodate(page) || + PageError(page))) { + ClearPageError(page); + kmap(page); + continue; + } + unlock_page(page); + page_cache_release(page); + pages[i] = NULL; + } + } + + /* + * We have the runlist, and all the destination pages we need to fill. + * Now read the first compression block. + */ + cur_page = 0; + cur_ofs = 0; + cb_clusters = ni->itype.compressed.block_clusters; +do_next_cb: + nr_cbs--; + nr_bhs = 0; + + /* Read all cb buffer heads one cluster at a time. */ + rl = NULL; + for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; + vcn++) { + bool is_retry = false; + + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)vcn, + (unsigned long long)lcn); + if (lcn < 0) { + /* + * When we reach the first sparse cluster we have + * finished with the cb. + */ + if (lcn == LCN_HOLE) + break; + if (is_retry || lcn != LCN_RL_NOT_MAPPED) + goto rl_err; + is_retry = true; + /* + * Attempt to map runlist, dropping lock for the + * duration. + */ + up_read(&ni->runlist.lock); + if (!ntfs_map_runlist(ni, vcn)) + goto lock_retry_remap; + goto map_rl_err; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the lcn from device in chunks of block_size bytes. */ + max_block = block + (vol->cluster_size >> block_size_bits); + do { + ntfs_debug("block = 0x%x.", block); + if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) + goto getblk_err; + nr_bhs++; + } while (++block < max_block); + } + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Setup and initiate io on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (!trylock_buffer(tbh)) + continue; + if (unlikely(buffer_uptodate(tbh))) { + unlock_buffer(tbh); + continue; + } + get_bh(tbh); + tbh->b_end_io = end_buffer_read_sync; + submit_bh(READ, tbh); + } + + /* Wait for io completion on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (buffer_uptodate(tbh)) + continue; + wait_on_buffer(tbh); + /* + * We need an optimization barrier here, otherwise we start + * hitting the below fixup code when accessing a loopback + * mounted ntfs partition. This indicates either there is a + * race condition in the loop driver or, more likely, gcc + * overoptimises the code without the barrier and it doesn't + * do the Right Thing(TM). + */ + barrier(); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_warning(vol->sb, "Buffer is unlocked but not " + "uptodate! Unplugging the disk queue " + "and rescheduling."); + get_bh(tbh); + io_schedule(); + put_bh(tbh); + if (unlikely(!buffer_uptodate(tbh))) + goto read_err; + ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); + } + } + + /* + * Get the compression buffer. We must not sleep any more + * until we are finished with it. + */ + spin_lock(&ntfs_cb_lock); + cb = ntfs_compression_buffer; + + BUG_ON(!cb); + + cb_pos = cb; + cb_end = cb + cb_size; + + /* Copy the buffer heads into the contiguous buffer. */ + for (i = 0; i < nr_bhs; i++) { + memcpy(cb_pos, bhs[i]->b_data, block_size); + cb_pos += block_size; + } + + /* Just a precaution. */ + if (cb_pos + 2 <= cb + cb_size) + *(u16*)cb_pos = 0; + + /* Reset cb_pos back to the beginning. */ + cb_pos = cb; + + /* We now have both source (if present) and destination. */ + ntfs_debug("Successfully read the compression block."); + + /* The last page and maximum offset within it for the current cb. */ + cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size; + cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK; + cb_max_page >>= PAGE_CACHE_SHIFT; + + /* Catch end of file inside a compression block. */ + if (cb_max_page > max_page) + cb_max_page = max_page; + + if (vcn == start_vcn - cb_clusters) { + /* Sparse cb, zero out page range overlapping the cb. */ + ntfs_debug("Found sparse compression block."); + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + if (cb_max_ofs) + cb_max_page--; + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + /* + * FIXME: Using clear_page() will become wrong + * when we get PAGE_CACHE_SIZE != PAGE_SIZE but + * for now there is no problem. + */ + if (likely(!cur_ofs)) + clear_page(page_address(page)); + else + memset(page_address(page) + cur_ofs, 0, + PAGE_CACHE_SIZE - + cur_ofs); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur_page == xpage) + xpage_done = 1; + else + page_cache_release(page); + pages[cur_page] = NULL; + } + cb_pos += PAGE_CACHE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memset(page_address(page) + cur_ofs, 0, + cb_max_ofs - cur_ofs); + /* + * No need to update cb_pos at this stage: + * cb_pos += cb_max_ofs - cur_ofs; + */ + cur_ofs = cb_max_ofs; + } + } else if (vcn == start_vcn) { + /* We can't sleep so we need two stages. */ + unsigned int cur2_page = cur_page; + unsigned int cur_ofs2 = cur_ofs; + u8 *cb_pos2 = cb_pos; + + ntfs_debug("Found uncompressed compression block."); + /* Uncompressed cb, copy it to the destination pages. */ + /* + * TODO: As a big optimization, we could detect this case + * before we read all the pages and use block_read_full_page() + * on all full pages instead (we still have to treat partial + * pages especially but at least we are getting rid of the + * synchronous io for the majority of pages. + * Or if we choose not to do the read-ahead/-behind stuff, we + * could just return block_read_full_page(pages[xpage]) as long + * as PAGE_CACHE_SIZE <= cb_size. + */ + if (cb_max_ofs) + cb_max_page--; + /* First stage: copy data into destination pages. */ + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + PAGE_CACHE_SIZE - cur_ofs); + cb_pos += PAGE_CACHE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + cb_max_ofs - cur_ofs); + cb_pos += cb_max_ofs - cur_ofs; + cur_ofs = cb_max_ofs; + } + /* We can sleep from now on, so drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize pages. */ + for (; cur2_page < cb_max_page; cur2_page++) { + page = pages[cur2_page]; + if (page) { + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(page, i_size, + initialized_size); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur2_page == xpage) + xpage_done = 1; + else + page_cache_release(page); + pages[cur2_page] = NULL; + } + cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2; + cur_ofs2 = 0; + if (cb_pos2 >= cb_end) + break; + } + } else { + /* Compressed cb, decompress it into the destination page(s). */ + unsigned int prev_cur_page = cur_page; + + ntfs_debug("Found compressed compression block."); + err = ntfs_decompress(pages, &cur_page, &cur_ofs, + cb_max_page, cb_max_ofs, xpage, &xpage_done, + cb_pos, cb_size - (cb_pos - cb), i_size, + initialized_size); + /* + * We can sleep from now on, lock already dropped by + * ntfs_decompress(). + */ + if (err) { + ntfs_error(vol->sb, "ntfs_decompress() failed in inode " + "0x%lx with error code %i. Skipping " + "this compression block.", + ni->mft_no, -err); + /* Release the unfinished pages. */ + for (; prev_cur_page < cur_page; prev_cur_page++) { + page = pages[prev_cur_page]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (prev_cur_page != xpage) + page_cache_release(page); + pages[prev_cur_page] = NULL; + } + } + } + } + + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + + /* Do we have more work to do? */ + if (nr_cbs) + goto do_next_cb; + + /* We no longer need the list of buffer heads. */ + kfree(bhs); + + /* Clean up if we have any pages left. Should never happen. */ + for (cur_page = 0; cur_page < max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + ntfs_error(vol->sb, "Still have pages left! " + "Terminating them with extreme " + "prejudice. Inode 0x%lx, page index " + "0x%lx.", ni->mft_no, page->index); + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (cur_page != xpage) + page_cache_release(page); + pages[cur_page] = NULL; + } + } + + /* We no longer need the list of pages. */ + kfree(pages); + + /* If we have completed the requested page, we return success. */ + if (likely(xpage_done)) + return 0; + + ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? + "EOVERFLOW" : (!err ? "EIO" : "unknown error")); + return err < 0 ? err : -EIO; + +read_err: + ntfs_error(vol->sb, "IO error while reading compressed data."); + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + goto err_out; + +map_rl_err: + ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " + "compression block."); + goto err_out; + +rl_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " + "compression block."); + goto err_out; + +getblk_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); + +err_out: + kfree(bhs); + for (i = cur_page; i < max_page; i++) { + page = pages[i]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (i != xpage) + page_cache_release(page); + } + } + kfree(pages); + return -EIO; +} diff --git a/kernel/fs/ntfs/debug.c b/kernel/fs/ntfs/debug.c new file mode 100644 index 000000000..825a54e8f --- /dev/null +++ b/kernel/fs/ntfs/debug.c @@ -0,0 +1,173 @@ +/* + * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include "debug.h" + +/** + * __ntfs_warning - output a warning to the syslog + * @function: name of function outputting the warning + * @sb: super block of mounted ntfs filesystem + * @fmt: warning string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs a warning to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... is printf style format string containing + * the warning string and the corresponding format arguments, respectively. + * + * @function is the name of the function from which __ntfs_warning is being + * called. + * + * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead + * as this provides the @function parameter automatically. + */ +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int flen = 0; + +#ifndef DEBUG + if (!printk_ratelimit()) + return; +#endif + if (function) + flen = strlen(function); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + pr_warn("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); + else + pr_warn("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); +} + +/** + * __ntfs_error - output an error to the syslog + * @function: name of function outputting the error + * @sb: super block of mounted ntfs filesystem + * @fmt: error string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs an error to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... is printf style format string containing + * the error string and the corresponding format arguments, respectively. + * + * @function is the name of the function from which __ntfs_error is being + * called. + * + * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead + * as this provides the @function parameter automatically. + */ +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int flen = 0; + +#ifndef DEBUG + if (!printk_ratelimit()) + return; +#endif + if (function) + flen = strlen(function); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + pr_err("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); + else + pr_err("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); +} + +#ifdef DEBUG + +/* If 1, output debug messages, and if 0, don't. */ +int debug_msgs = 0; + +void __ntfs_debug(const char *file, int line, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int flen = 0; + + if (!debug_msgs) + return; + if (function) + flen = strlen(function); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); + va_end(args); +} + +/* Dump a runlist. Caller has to provide synchronisation for @rl. */ +void ntfs_debug_dump_runlist(const runlist_element *rl) +{ + int i; + const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED", + "LCN_ENOENT ", "LCN_unknown " }; + + if (!debug_msgs) + return; + pr_debug("Dumping runlist (values in hex):\n"); + if (!rl) { + pr_debug("Run list not present.\n"); + return; + } + pr_debug("VCN LCN Run length\n"); + for (i = 0; ; i++) { + LCN lcn = (rl + i)->lcn; + + if (lcn < (LCN)0) { + int index = -lcn - 1; + + if (index > -LCN_ENOENT - 1) + index = 3; + pr_debug("%-16Lx %s %-16Lx%s\n", + (long long)(rl + i)->vcn, lcn_str[index], + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); + } else + pr_debug("%-16Lx %-16Lx %-16Lx%s\n", + (long long)(rl + i)->vcn, + (long long)(rl + i)->lcn, + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); + if (!(rl + i)->length) + break; + } +} + +#endif diff --git a/kernel/fs/ntfs/debug.h b/kernel/fs/ntfs/debug.h new file mode 100644 index 000000000..61bf091e3 --- /dev/null +++ b/kernel/fs/ntfs/debug.h @@ -0,0 +1,71 @@ +/* + * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_DEBUG_H +#define _LINUX_NTFS_DEBUG_H + +#include + +#include "runlist.h" + +#ifdef DEBUG + +extern int debug_msgs; + +extern __printf(4, 5) +void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...); +/** + * ntfs_debug - write a debug level message to syslog + * @f: a printf format string containing the message + * @...: the variables to substitute into @f + * + * ntfs_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. + */ +#define ntfs_debug(f, a...) \ + __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) + +extern void ntfs_debug_dump_runlist(const runlist_element *rl); + +#else /* !DEBUG */ + +#define ntfs_debug(fmt, ...) \ +do { \ + if (0) \ + no_printk(fmt, ##__VA_ARGS__); \ +} while (0) + +#define ntfs_debug_dump_runlist(rl) do {} while (0) + +#endif /* !DEBUG */ + +extern __printf(3, 4) +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...); +#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) + +extern __printf(3, 4) +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...); +#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) + +#endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/kernel/fs/ntfs/dir.c b/kernel/fs/ntfs/dir.c new file mode 100644 index 000000000..9e38dafa3 --- /dev/null +++ b/kernel/fs/ntfs/dir.c @@ -0,0 +1,1553 @@ +/** + * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +#include "dir.h" +#include "aops.h" +#include "attrib.h" +#include "mft.h" +#include "debug.h" +#include "ntfs.h" + +/** + * The little endian Unicode string $I30 as a global constant. + */ +ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), + cpu_to_le16('3'), cpu_to_le16('0'), 0 }; + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * @res: return the found file name if necessary (see below) + * + * Look for an inode with name @uname in the directory with inode @dir_ni. + * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + * + * Note, we look for a case sensitive match first but we also look for a case + * insensitive match at the same time. If we find a case insensitive match, we + * save that for the case that we don't find an exact match, where we return + * the case insensitive match and setup @res (which we allocate!) with the mft + * reference, the file name type, length and with a copy of the little endian + * Unicode file name itself. If we match a file name which is in the DOS name + * space, we only return the mft reference and file name type in @res. + * ntfs_lookup() then uses this to find the long file name in the inode itself. + * This is to avoid polluting the dcache with short file names. We want them to + * work but we don't care for how quickly one can access them. This also fixes + * the dcache aliasing issues. + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len, ntfs_name **res) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + ntfs_name *name = NULL; + + BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); + BUG_ON(NInoAttr(dir_ni)); + /* Get hold of the mft record for the directory. */ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. + * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 1. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." + "sourceforge.net."); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present return -ENOENT, unless + * we have got a matching name cached in name in which case return the + * mft reference associated with it. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + if (name) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it2: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. + * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 2. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." + "sourceforge.net."); + unlock_page(page); + ntfs_unmap_page(page); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node. + */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* + * No child node present, return -ENOENT, unless we have got a matching + * name cached in name in which case return the mft reference + * associated with it. + */ + if (name) { + unlock_page(page); + ntfs_unmap_page(page); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + if (name) { + kfree(name); + *res = NULL; + } + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#if 0 + +// TODO: (AIA) +// The algorithm embedded in this code will be required for the time when we +// want to support adding of entries to directories, where we require correct +// collation of file names in order not to cause corruption of the filesystem. + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * + * Look for an inode with name @uname in the directory with inode @dir_ni. + * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + */ +u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + IGNORE_CASE_BOOL ic; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + + /* Get hold of the mft record for the directory. */ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it: + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + /* No child node, return -ENOENT. */ + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it2: + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node. + */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* No child node, return -ENOENT. */ + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#endif + +/** + * ntfs_filldir - ntfs specific filldir method + * @vol: current ntfs volume + * @ndir: ntfs inode of current directory + * @ia_page: page in which the index allocation buffer @ie is in resides + * @ie: current index entry + * @name: buffer to use for the converted name + * @actor: what to feed the entries to + * + * Convert the Unicode @name to the loaded NLS and pass it to the @filldir + * callback. + * + * If @ia_page is not NULL it is the locked page containing the index + * allocation block containing the index entry @ie. + * + * Note, we drop (and then reacquire) the page lock on @ia_page across the + * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup + * since ntfs_lookup() will lock the same page. As an optimization, we do not + * retake the lock if we are returning a non-zero value as ntfs_readdir() + * would need to drop the lock immediately anyway. + */ +static inline int ntfs_filldir(ntfs_volume *vol, + ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, + u8 *name, struct dir_context *actor) +{ + unsigned long mref; + int name_len; + unsigned dt_type; + FILE_NAME_TYPE_FLAGS name_type; + + name_type = ie->key.file_name.file_name_type; + if (name_type == FILE_NAME_DOS) { + ntfs_debug("Skipping DOS name space entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { + ntfs_debug("Skipping root directory self reference entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && + !NVolShowSystemFiles(vol)) { + ntfs_debug("Skipping system file."); + return 0; + } + name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, &name, + NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); + if (name_len <= 0) { + ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", + (long long)MREF_LE(ie->data.dir.indexed_file)); + return 0; + } + if (ie->key.file_name.file_attributes & + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) + dt_type = DT_DIR; + else + dt_type = DT_REG; + mref = MREF_LE(ie->data.dir.indexed_file); + /* + * Drop the page lock otherwise we deadlock with NFS when it calls + * ->lookup since ntfs_lookup() will lock the same page. + */ + if (ia_page) + unlock_page(ia_page); + ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " + "0x%lx, DT_%s.", name, name_len, actor->pos, mref, + dt_type == DT_DIR ? "DIR" : "REG"); + if (!dir_emit(actor, name, name_len, mref, dt_type)) + return 1; + /* Relock the page but not if we are aborting ->readdir. */ + if (ia_page) + lock_page(ia_page); + return 0; +} + +/* + * We use the same basic approach as the old NTFS driver, i.e. we parse the + * index root entries and then the index allocation entries that are marked + * as in use in the index bitmap. + * + * While this will return the names in random order this doesn't matter for + * ->readdir but OTOH results in a faster ->readdir. + * + * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS + * parts (e.g. ->f_pos and ->i_size, and it also protects against directory + * modifications). + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +static int ntfs_readdir(struct file *file, struct dir_context *actor) +{ + s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; + loff_t i_size; + struct inode *bmp_vi, *vdir = file_inode(file); + struct super_block *sb = vdir->i_sb; + ntfs_inode *ndir = NTFS_I(vdir); + ntfs_volume *vol = NTFS_SB(sb); + MFT_RECORD *m; + INDEX_ROOT *ir = NULL; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *name = NULL; + int rc, err, ir_pos, cur_bmp_pos; + struct address_space *ia_mapping, *bmp_mapping; + struct page *bmp_page = NULL, *ia_page = NULL; + u8 *kaddr, *bmp, *index_end; + ntfs_attr_search_ctx *ctx; + + ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", + vdir->i_ino, actor->pos); + rc = err = 0; + /* Are we at end of dir yet? */ + i_size = i_size_read(vdir); + if (actor->pos >= i_size + vol->mft_record_size) + return 0; + /* Emulate . and .. for all directories. */ + if (!dir_emit_dots(file, actor)) + return 0; + m = NULL; + ctx = NULL; + /* + * Allocate a buffer to store the current name being processed + * converted to format determined by current NLS. + */ + name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); + if (unlikely(!name)) { + err = -ENOMEM; + goto err_out; + } + /* Are we jumping straight into the index allocation attribute? */ + if (actor->pos >= vol->mft_record_size) + goto skip_index_root; + /* Get hold of the mft record for the directory. */ + m = map_mft_record(ndir); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ndir, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Get the offset into the index root attribute. */ + ir_pos = (s64)actor->pos; + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_error(sb, "Index root attribute missing in directory " + "inode 0x%lx.", vdir->i_ino); + goto err_out; + } + /* + * Copy the index root attribute value to a buffer so that we can put + * the search context and unmap the mft record before calling the + * filldir() callback. We need to do this because of NFSd which calls + * ->lookup() from its filldir callback() and this causes NTFS to + * deadlock as ntfs_lookup() maps the mft record of the directory and + * we have got it mapped here already. The only solution is for us to + * unmap the mft record here so that a call to ntfs_lookup() is able to + * map the mft record without deadlocking. + */ + rc = le32_to_cpu(ctx->attr->data.resident.value_length); + ir = kmalloc(rc, GFP_NOFS); + if (unlikely(!ir)) { + err = -ENOMEM; + goto err_out; + } + /* Copy the index root value (it has been verified in read_inode). */ + memcpy(ir, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), rc); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ndir); + ctx = NULL; + m = NULL; + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index root entry if continuing previous readdir. */ + if (ir_pos > (u8*)ie - (u8*)ir) + continue; + /* Advance the position even if going to skip the entry. */ + actor->pos = (u8*)ie - (u8*)ir; + /* Submit the name to the filldir callback. */ + rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); + if (rc) { + kfree(ir); + goto abort; + } + } + /* We are done with the index root and can free the buffer. */ + kfree(ir); + ir = NULL; + /* If there is no index allocation attribute we are finished. */ + if (!NInoIndexAllocPresent(ndir)) + goto EOD; + /* Advance fpos to the beginning of the index allocation. */ + actor->pos = vol->mft_record_size; +skip_index_root: + kaddr = NULL; + prev_ia_pos = -1LL; + /* Get the offset into the index allocation attribute. */ + ia_pos = (s64)actor->pos - vol->mft_record_size; + ia_mapping = vdir->i_mapping; + ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); + bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); + if (IS_ERR(bmp_vi)) { + ntfs_error(sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bmp_vi); + goto err_out; + } + bmp_mapping = bmp_vi->i_mapping; + /* Get the starting bitmap bit position and sanity check it. */ + bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; + if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { + ntfs_error(sb, "Current index allocation position exceeds " + "index bitmap size."); + goto iput_err_out; + } + /* Get the starting bit position in the current bitmap page. */ + cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1); + bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1); +get_next_bmp_page: + ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", + (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT), + (unsigned long long)bmp_pos & + (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1)); + bmp_page = ntfs_map_page(bmp_mapping, + bmp_pos >> (3 + PAGE_CACHE_SHIFT)); + if (IS_ERR(bmp_page)) { + ntfs_error(sb, "Reading index bitmap failed."); + err = PTR_ERR(bmp_page); + bmp_page = NULL; + goto iput_err_out; + } + bmp = (u8*)page_address(bmp_page); + /* Find next index block in use. */ + while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { +find_next_index_buffer: + cur_bmp_pos++; + /* + * If we have reached the end of the bitmap page, get the next + * page, and put away the old one. + */ + if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) { + ntfs_unmap_page(bmp_page); + bmp_pos += PAGE_CACHE_SIZE * 8; + cur_bmp_pos = 0; + goto get_next_bmp_page; + } + /* If we have reached the end of the bitmap, we are done. */ + if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) + goto unm_EOD; + ia_pos = (bmp_pos + cur_bmp_pos) << + ndir->itype.index.block_size_bits; + } + ntfs_debug("Handling index buffer 0x%llx.", + (unsigned long long)bmp_pos + cur_bmp_pos); + /* If the current index buffer is in the same page we reuse the page. */ + if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) != + (ia_pos & (s64)PAGE_CACHE_MASK)) { + prev_ia_pos = ia_pos; + if (likely(ia_page != NULL)) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + /* + * Map the page cache page containing the current ia_pos, + * reading it from disk if necessary. + */ + ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR(ia_page)) { + ntfs_error(sb, "Reading index allocation data failed."); + err = PTR_ERR(ia_page); + ia_page = NULL; + goto err_out; + } + lock_page(ia_page); + kaddr = (u8*)page_address(ia_page); + } + /* Get the current index buffer. */ + ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK & + ~(s64)(ndir->itype.index.block_size - 1))); + /* Bounds checks. */ + if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", vdir->i_ino); + goto err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & + ~(s64)(ndir->itype.index.block_size - 1)) >> + ndir->itype.index.vcn_size_bits)) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug. ", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != + ndir->itype.index.block_size)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino, + le32_to_cpu(ia->index.allocated_size) + 0x18, + ndir->itype.index.block_size); + goto err_out; + } + index_end = (u8*)ia + ndir->itype.index.block_size; + if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + /* The first index entry in this index buffer. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index allocation, offset 0x%llx.", + (unsigned long long)ia_start + + (unsigned long long)((u8*)ie - (u8*)ia)); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index block entry if continuing previous readdir. */ + if (ia_pos - ia_start > (u8*)ie - (u8*)ia) + continue; + /* Advance the position even if going to skip the entry. */ + actor->pos = (u8*)ie - (u8*)ia + + (sle64_to_cpu(ia->index_block_vcn) << + ndir->itype.index.vcn_size_bits) + + vol->mft_record_size; + /* + * Submit the name to the @filldir callback. Note, + * ntfs_filldir() drops the lock on @ia_page but it retakes it + * before returning, unless a non-zero value is returned in + * which case the page is left unlocked. + */ + rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); + if (rc) { + /* @ia_page is already unlocked in this case. */ + ntfs_unmap_page(ia_page); + ntfs_unmap_page(bmp_page); + iput(bmp_vi); + goto abort; + } + } + goto find_next_index_buffer; +unm_EOD: + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + ntfs_unmap_page(bmp_page); + iput(bmp_vi); +EOD: + /* We are finished, set fpos to EOD. */ + actor->pos = i_size + vol->mft_record_size; +abort: + kfree(name); + return 0; +err_out: + if (bmp_page) { + ntfs_unmap_page(bmp_page); +iput_err_out: + iput(bmp_vi); + } + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + kfree(ir); + kfree(name); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ndir); + if (!err) + err = -EIO; + ntfs_debug("Failed. Returning error code %i.", -err); + return err; +} + +/** + * ntfs_dir_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit directory size to the page cache limit on architectures where unsigned + * long is 32-bits. This is the most we can do for now without overflowing the + * page cache page index. Doing it this way means we don't run into problems + * because of existing too large directories. It would be better to allow the + * user to read the accessible part of the directory but I doubt very much + * anyone is going to hit this check on a 32-bit architecture, so there is no + * point in adding the extra complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + */ +static int ntfs_dir_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EFBIG; + } + return 0; +} + +#ifdef NTFS_RW + +/** + * ntfs_dir_fsync - sync a directory to disk + * @filp: directory to be synced + * @dentry: dentry describing the directory to sync + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and + * msync system calls. This function is based on file.c::ntfs_file_fsync(). + * + * Write the mft record and all associated extent mft records as well as the + * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device. + * + * If @datasync is true, we do not wait on the inode(s) to be written out + * but we always wait on the page cache pages to be written out. + * + * Note: In the past @filp could be NULL so we ignore it as we don't need it + * anyway. + * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. We do write the $BITMAP attribute if it is present + * which is the important one for a directory so things are not too bad. + */ +static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *bmp_vi, *vi = filp->f_mapping->host; + int err, ret; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + + err = filemap_write_and_wait_range(vi->i_mapping, start, end); + if (err) + return err; + mutex_lock(&vi->i_mutex); + + BUG_ON(!S_ISDIR(vi->i_mode)); + /* If the bitmap attribute inode is in memory sync it, too. */ + na.mft_no = vi->i_ino; + na.type = AT_BITMAP; + na.name = I30; + na.name_len = 4; + bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na); + if (bmp_vi) { + write_inode_now(bmp_vi, !datasync); + iput(bmp_vi); + } + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? "data" : "", vi->i_ino, -ret); + mutex_unlock(&vi->i_mutex); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_dir_ops = { + .llseek = generic_file_llseek, /* Seek inside directory. */ + .read = generic_read_dir, /* Return -EISDIR. */ + .iterate = ntfs_readdir, /* Read directory contents. */ +#ifdef NTFS_RW + .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ + /*.aio_fsync = ,*/ /* Sync all outstanding async + i/o operations on a kiocb. */ +#endif /* NTFS_RW */ + /*.ioctl = ,*/ /* Perform function on the + mounted filesystem. */ + .open = ntfs_dir_open, /* Open directory. */ +}; diff --git a/kernel/fs/ntfs/dir.h b/kernel/fs/ntfs/dir.h new file mode 100644 index 000000000..aea7582d5 --- /dev/null +++ b/kernel/fs/ntfs/dir.h @@ -0,0 +1,48 @@ +/* + * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2002-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_DIR_H +#define _LINUX_NTFS_DIR_H + +#include "layout.h" +#include "inode.h" +#include "types.h" + +/* + * ntfs_name is used to return the file name to the caller of + * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) + * to be able to deal with dcache aliasing issues. + */ +typedef struct { + MFT_REF mref; + FILE_NAME_TYPE_FLAGS type; + u8 len; + ntfschar name[0]; +} __attribute__ ((__packed__)) ntfs_name; + +/* The little endian Unicode string $I30 as a global constant. */ +extern ntfschar I30[5]; + +extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, + const ntfschar *uname, const int uname_len, ntfs_name **res); + +#endif /* _LINUX_NTFS_FS_DIR_H */ diff --git a/kernel/fs/ntfs/endian.h b/kernel/fs/ntfs/endian.h new file mode 100644 index 000000000..927b5bf04 --- /dev/null +++ b/kernel/fs/ntfs/endian.h @@ -0,0 +1,93 @@ +/* + * endian.h - Defines for endianness handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_ENDIAN_H +#define _LINUX_NTFS_ENDIAN_H + +#include +#include "types.h" + +/* + * Signed endianness conversion functions. + */ + +static inline s16 sle16_to_cpu(sle16 x) +{ + return le16_to_cpu((__force le16)x); +} + +static inline s32 sle32_to_cpu(sle32 x) +{ + return le32_to_cpu((__force le32)x); +} + +static inline s64 sle64_to_cpu(sle64 x) +{ + return le64_to_cpu((__force le64)x); +} + +static inline s16 sle16_to_cpup(sle16 *x) +{ + return le16_to_cpu(*(__force le16*)x); +} + +static inline s32 sle32_to_cpup(sle32 *x) +{ + return le32_to_cpu(*(__force le32*)x); +} + +static inline s64 sle64_to_cpup(sle64 *x) +{ + return le64_to_cpu(*(__force le64*)x); +} + +static inline sle16 cpu_to_sle16(s16 x) +{ + return (__force sle16)cpu_to_le16(x); +} + +static inline sle32 cpu_to_sle32(s32 x) +{ + return (__force sle32)cpu_to_le32(x); +} + +static inline sle64 cpu_to_sle64(s64 x) +{ + return (__force sle64)cpu_to_le64(x); +} + +static inline sle16 cpu_to_sle16p(s16 *x) +{ + return (__force sle16)cpu_to_le16(*x); +} + +static inline sle32 cpu_to_sle32p(s32 *x) +{ + return (__force sle32)cpu_to_le32(*x); +} + +static inline sle64 cpu_to_sle64p(s64 *x) +{ + return (__force sle64)cpu_to_le64(*x); +} + +#endif /* _LINUX_NTFS_ENDIAN_H */ diff --git a/kernel/fs/ntfs/file.c b/kernel/fs/ntfs/file.c new file mode 100644 index 000000000..7bb487e66 --- /dev/null +++ b/kernel/fs/ntfs/file.c @@ -0,0 +1,2043 @@ +/* + * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "attrib.h" +#include "bitmap.h" +#include "inode.h" +#include "debug.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_file_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit file size to the page cache limit on architectures where unsigned long + * is 32-bits. This is the most we can do for now without overflowing the page + * cache page index. Doing it this way means we don't run into problems because + * of existing too large files. It would be better to allow the user to read + * the beginning of the file but I doubt very much anyone is going to hit this + * check on a 32-bit architecture, so there is no point in adding the extra + * complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + * + * After the check passes, just call generic_file_open() to do its work. + */ +static int ntfs_file_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EOVERFLOW; + } + return generic_file_open(vi, filp); +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_extend_initialized - extend the initialized size of an attribute + * @ni: ntfs inode of the attribute to extend + * @new_init_size: requested new initialized size in bytes + * + * Extend the initialized size of an attribute described by the ntfs inode @ni + * to @new_init_size bytes. This involves zeroing any non-sparse space between + * the old initialized size and @new_init_size both in the page cache and on + * disk (if relevant complete pages are already uptodate in the page cache then + * these are simply marked dirty). + * + * As a side-effect, the file size (vfs inode->i_size) may be incremented as, + * in the resident attribute case, it is tied to the initialized size and, in + * the non-resident attribute case, it may not fall below the initialized size. + * + * Note that if the attribute is resident, we do not need to touch the page + * cache at all. This is because if the page cache page is not uptodate we + * bring it uptodate later, when doing the write to the mft record since we + * then already have the page mapped. And if the page is uptodate, the + * non-initialized region will already have been zeroed when the page was + * brought uptodate and the region may in fact already have been overwritten + * with new data via mmap() based writes, so we cannot just zero it. And since + * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped + * is unspecified, we choose not to do zeroing and thus we do not need to touch + * the page at all. For a more detailed explanation see ntfs_truncate() in + * fs/ntfs/inode.c. + * + * Return 0 on success and -errno on error. In the case that an error is + * encountered it is possible that the initialized size will already have been + * incremented some way towards @new_init_size but it is guaranteed that if + * this is the case, the necessary zeroing will also have happened and that all + * metadata is self-consistent. + * + * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be + * held by the caller. + */ +static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) +{ + s64 old_init_size; + loff_t old_i_size; + pgoff_t index, end_index; + unsigned long flags; + struct inode *vi = VFS_I(ni); + ntfs_inode *base_ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *kattr; + int err; + u32 attr_len; + + read_lock_irqsave(&ni->size_lock, flags); + old_init_size = ni->initialized_size; + old_i_size = i_size_read(vi); + BUG_ON(new_init_size > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_initialized_size 0x%llx, " + "new_initialized_size 0x%llx, i_size 0x%llx.", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + (unsigned long long)old_init_size, + (unsigned long long)new_init_size, old_i_size); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Use goto to reduce indentation and we need the label below anyway. */ + if (NInoNonResident(ni)) + goto do_non_resident_extend; + BUG_ON(old_init_size != old_i_size); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + BUG_ON(old_i_size != (loff_t)attr_len); + /* + * Do the zeroing in the mft record and update the attribute size in + * the mft record. + */ + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + memset(kattr + attr_len, 0, new_init_size - attr_len); + a->data.resident.value_length = cpu_to_le32((u32)new_init_size); + /* Finally, update the sizes in the vfs and ntfs inodes. */ + write_lock_irqsave(&ni->size_lock, flags); + i_size_write(vi, new_init_size); + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto done; +do_non_resident_extend: + /* + * If the new initialized size @new_init_size exceeds the current file + * size (vfs inode->i_size), we need to extend the file size to the + * new initialized size. + */ + if (new_init_size > old_i_size) { + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + BUG_ON(old_i_size != (loff_t) + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_init_size); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* Update the file size in the vfs inode. */ + i_size_write(vi, new_init_size); + ntfs_attr_put_search_ctx(ctx); + ctx = NULL; + unmap_mft_record(base_ni); + m = NULL; + } + mapping = vi->i_mapping; + index = old_init_size >> PAGE_CACHE_SHIFT; + end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + do { + /* + * Read the page. If the page is not present, this will zero + * the uninitialized regions for us. + */ + page = read_mapping_page(mapping, index, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto init_err_out; + } + if (unlikely(PageError(page))) { + page_cache_release(page); + err = -EIO; + goto init_err_out; + } + /* + * Update the initialized size in the ntfs inode. This is + * enough to make ntfs_writepage() work. + */ + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT; + if (ni->initialized_size > new_init_size) + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Set the page dirty so it gets written out. */ + set_page_dirty(page); + page_cache_release(page); + /* + * Play nice with the vm and the rest of the system. This is + * very much needed as we can potentially be modifying the + * initialised size from a very small value to a really huge + * value, e.g. + * f = open(somefile, O_TRUNC); + * truncate(f, 10GiB); + * seek(f, 10GiB); + * write(f, 1); + * And this would mean we would be marking dirty hundreds of + * thousands of pages or as in the above example more than + * two and a half million pages! + * + * TODO: For sparse pages could optimize this workload by using + * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This + * would be set in readpage for sparse pages and here we would + * not need to mark dirty any pages which have this bit set. + * The only caveat is that we have to clear the bit everywhere + * where we allocate any clusters that lie in the page or that + * contain the page. + * + * TODO: An even greater optimization would be for us to only + * call readpage() on pages which are not in sparse regions as + * determined from the runlist. This would greatly reduce the + * number of pages we read and make dirty in the case of sparse + * files. + */ + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (++index < end_index); + read_lock_irqsave(&ni->size_lock, flags); + BUG_ON(ni->initialized_size != new_init_size); + read_unlock_irqrestore(&ni->size_lock, flags); + /* Now bring in sync the initialized_size in the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto init_err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto init_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto init_err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); +done: + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", + (unsigned long long)new_init_size, i_size_read(vi)); + return 0; +init_err_out: + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = old_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, + struct iov_iter *from) +{ + loff_t pos; + s64 end, ll; + ssize_t err; + unsigned long flags; + struct file *file = iocb->ki_filp; + struct inode *vi = file_inode(file); + ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " + "0x%llx, count 0x%zx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)iocb->ki_pos, + iov_iter_count(from)); + err = generic_write_checks(iocb, from); + if (unlikely(err <= 0)) + goto out; + /* + * All checks have passed. Before we start doing any writing we want + * to abort any totally illegal writes. + */ + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->type != AT_DATA); + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + /* Only $DATA attributes can be encrypted. */ + /* + * Reminder for later: Encrypted files are _always_ + * non-resident so that the content can always be encrypted. + */ + ntfs_debug("Denying write access to encrypted file."); + err = -EACCES; + goto out; + } + if (NInoCompressed(ni)) { + /* Only unnamed $DATA attribute can be compressed. */ + BUG_ON(ni->name_len); + /* + * Reminder for later: If resident, the data is not actually + * compressed. Only on the switch to non-resident does + * compression kick in. This is in contrast to encrypted files + * (see above). + */ + ntfs_error(vi->i_sb, "Writing to compressed files is not " + "implemented yet. Sorry."); + err = -EOPNOTSUPP; + goto out; + } + base_ni = ni; + if (NInoAttr(ni)) + base_ni = ni->ext.base_ntfs_ino; + err = file_remove_suid(file); + if (unlikely(err)) + goto out; + /* + * Our ->update_time method always succeeds thus file_update_time() + * cannot fail either so there is no need to check the return code. + */ + file_update_time(file); + pos = iocb->ki_pos; + /* The first byte after the last cluster being written to. */ + end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & + ~(u64)vol->cluster_size_mask; + /* + * If the write goes beyond the allocated size, extend the allocation + * to cover the whole of the write, rounded up to the nearest cluster. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end > ll) { + /* + * Extend the allocation without changing the data size. + * + * Note we ensure the allocation is big enough to at least + * write some data but we do not require the allocation to be + * complete, i.e. it may be partial. + */ + ll = ntfs_attr_extend_allocation(ni, end, -1, pos); + if (likely(ll >= 0)) { + BUG_ON(pos >= ll); + /* If the extension was partial truncate the write. */ + if (end > ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "the allocation was only " + "partially extended.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + iov_iter_truncate(from, ll - pos); + } + } else { + err = ll; + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* Perform a partial write if possible or fail. */ + if (pos < ll) { + ntfs_debug("Truncating write to inode 0x%lx " + "attribute type 0x%x, because " + "extending the allocation " + "failed (error %d).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), + (int)-err); + iov_iter_truncate(from, ll - pos); + } else { + if (err != -ENOSPC) + ntfs_error(vi->i_sb, "Cannot perform " + "write to inode " + "0x%lx, attribute " + "type 0x%x, because " + "extending the " + "allocation failed " + "(error %ld).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), + (long)-err); + else + ntfs_debug("Cannot perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because there is not " + "space left.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + goto out; + } + } + } + /* + * If the write starts beyond the initialized size, extend it up to the + * beginning of the write and initialize all non-sparse space between + * the old initialized size and the new one. This automatically also + * increments the vfs inode->i_size to keep it above or equal to the + * initialized_size. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (pos > ll) { + /* + * Wait for ongoing direct i/o to complete before proceeding. + * New direct i/o cannot start as we hold i_mutex. + */ + inode_dio_wait(vi); + err = ntfs_attr_extend_initialized(ni, pos); + if (unlikely(err < 0)) + ntfs_error(vi->i_sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "extending the initialized size " + "failed (error %d).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (int)-err); + } +out: + return err; +} + +/** + * __ntfs_grab_cache_pages - obtain a number of locked pages + * @mapping: address space mapping from which to obtain page cache pages + * @index: starting index in @mapping at which to begin obtaining pages + * @nr_pages: number of page cache pages to obtain + * @pages: array of pages in which to return the obtained page cache pages + * @cached_page: allocated but as yet unused page + * + * Obtain @nr_pages locked page cache pages from the mapping @mapping and + * starting at index @index. + * + * If a page is newly created, add it to lru list + * + * Note, the page locks are obtained in ascending page index order. + */ +static inline int __ntfs_grab_cache_pages(struct address_space *mapping, + pgoff_t index, const unsigned nr_pages, struct page **pages, + struct page **cached_page) +{ + int err, nr; + + BUG_ON(!nr_pages); + err = nr = 0; + do { + pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | + FGP_ACCESSED); + if (!pages[nr]) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (unlikely(!*cached_page)) { + err = -ENOMEM; + goto err_out; + } + } + err = add_to_page_cache_lru(*cached_page, mapping, + index, GFP_KERNEL); + if (unlikely(err)) { + if (err == -EEXIST) + continue; + goto err_out; + } + pages[nr] = *cached_page; + *cached_page = NULL; + } + index++; + nr++; + } while (nr < nr_pages); +out: + return err; +err_out: + while (nr > 0) { + unlock_page(pages[--nr]); + page_cache_release(pages[nr]); + } + goto out; +} + +static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) +{ + lock_buffer(bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + return submit_bh(READ, bh); +} + +/** + * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called for non-resident attributes from ntfs_file_buffered_write() + * with i_mutex held on the inode (@pages[0]->mapping->host). There are + * @nr_pages pages in @pages which are locked but not kmap()ped. The source + * data has not yet been copied into the @pages. + * + * Need to fill any holes with actual clusters, allocate buffers if necessary, + * ensure all the buffers are mapped, and bring uptodate any buffers that are + * only partially being written to. + * + * If @nr_pages is greater than one, we are guaranteed that the cluster size is + * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside + * the same cluster and that they are the entirety of that cluster, and that + * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. + * + * i_size is not to be modified yet. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, + unsigned nr_pages, s64 pos, size_t bytes) +{ + VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; + LCN lcn; + s64 bh_pos, vcn_len, end, initialized_size; + sector_t lcn_block; + struct page *page; + struct inode *vi; + ntfs_inode *ni, *base_ni = NULL; + ntfs_volume *vol; + runlist_element *rl, *rl2; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + ATTR_RECORD *a = NULL; + unsigned long flags; + u32 attr_rec_len = 0; + unsigned blocksize, u; + int err, mp_size; + bool rl_write_locked, was_hole, is_retry; + unsigned char blocksize_bits; + struct { + u8 runlist_merged:1; + u8 mft_attr_mapped:1; + u8 mp_rebuilt:1; + u8 attr_switched:1; + } status = { 0, 0, 0, 0 }; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + BUG_ON(!*pages); + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, pages[0]->index, nr_pages, + (long long)pos, bytes); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + u = 0; + do { + page = pages[u]; + BUG_ON(!page); + /* + * create_empty_buffers() will create uptodate/dirty buffers if + * the page is uptodate/dirty. + */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, 0); + if (unlikely(!page_has_buffers(page))) + return -ENOMEM; + } + } while (++u < nr_pages); + rl_write_locked = false; + rl = NULL; + err = 0; + vcn = lcn = -1; + vcn_len = 0; + lcn_block = -1; + was_hole = false; + cpos = pos >> vol->cluster_size_bits; + end = pos + bytes; + cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; + /* + * Loop over each page and for each page over each buffer. Use goto to + * reduce indentation. + */ + u = 0; +do_next_page: + page = pages[u]; + bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; + bh = head = page_buffers(page); + do { + VCN cdelta; + s64 bh_end; + unsigned bh_cofs; + + /* Clear buffer_new on all buffers to reinitialise state. */ + if (buffer_new(bh)) + clear_buffer_new(bh); + bh_end = bh_pos + blocksize; + bh_cpos = bh_pos >> vol->cluster_size_bits; + bh_cofs = bh_pos & vol->cluster_size_mask; + if (buffer_mapped(bh)) { + /* + * The buffer is already mapped. If it is uptodate, + * ignore it. + */ + if (buffer_uptodate(bh)) + continue; + /* + * The buffer is not uptodate. If the page is uptodate + * set the buffer uptodate and otherwise ignore it. + */ + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; + } + /* + * Neither the page nor the buffer are uptodate. If + * the buffer is only partially being written to, we + * need to read it in before the write, i.e. now. + */ + if ((bh_pos < pos && bh_end > pos) || + (bh_pos < end && bh_end > end)) { + /* + * If the buffer is fully or partially within + * the initialized size, do an actual read. + * Otherwise, simply zero the buffer. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* Unmapped buffer. Need to map it. */ + bh->b_bdev = vol->sb->s_bdev; + /* + * If the current buffer is in the same clusters as the map + * cache, there is no need to check the runlist again. The + * map cache is made up of @vcn, which is the first cached file + * cluster, @vcn_len which is the number of cached file + * clusters, @lcn is the device cluster corresponding to @vcn, + * and @lcn_block is the block number corresponding to @lcn. + */ + cdelta = bh_cpos - vcn; + if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { +map_buffer_cached: + BUG_ON(lcn < 0); + bh->b_blocknr = lcn_block + + (cdelta << (vol->cluster_size_bits - + blocksize_bits)) + + (bh_cofs >> blocksize_bits); + set_buffer_mapped(bh); + /* + * If the page is uptodate so is the buffer. If the + * buffer is fully outside the write, we ignore it if + * it was already allocated and we mark it dirty so it + * gets written out if we allocated it. On the other + * hand, if we allocated the buffer but we are not + * marking it dirty we set buffer_new so we can do + * error recovery. + */ + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (unlikely(was_hole)) { + /* We allocated the buffer. */ + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (bh_end <= pos || bh_pos >= end) + mark_buffer_dirty(bh); + else + set_buffer_new(bh); + } + continue; + } + /* Page is _not_ uptodate. */ + if (likely(!was_hole)) { + /* + * Buffer was already allocated. If it is not + * uptodate and is only partially being written + * to, we need to read it in before the write, + * i.e. now. + */ + if (!buffer_uptodate(bh) && bh_pos < end && + bh_end > pos && + (bh_pos < pos || + bh_end > end)) { + /* + * If the buffer is fully or partially + * within the initialized size, do an + * actual read. Otherwise, simply zero + * the buffer. + */ + read_lock_irqsave(&ni->size_lock, + flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, + flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* We allocated the buffer. */ + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + /* + * If the buffer is fully outside the write, zero it, + * set it uptodate, and mark it dirty so it gets + * written out. If it is partially being written to, + * zero region surrounding the write but leave it to + * commit write to do anything else. Finally, if the + * buffer is fully being overwritten, do nothing. + */ + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + mark_buffer_dirty(bh); + continue; + } + set_buffer_new(bh); + if (!buffer_uptodate(bh) && + (bh_pos < pos || bh_end > end)) { + u8 *kaddr; + unsigned pofs; + + kaddr = kmap_atomic(page); + if (bh_pos < pos) { + pofs = bh_pos & ~PAGE_CACHE_MASK; + memset(kaddr + pofs, 0, pos - bh_pos); + } + if (bh_end > end) { + pofs = end & ~PAGE_CACHE_MASK; + memset(kaddr + pofs, 0, bh_end - end); + } + kunmap_atomic(kaddr); + flush_dcache_page(page); + } + continue; + } + /* + * Slow path: this is the first buffer in the cluster. If it + * is outside allocated size and is not uptodate, zero it and + * set it uptodate. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos > initialized_size) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + } + continue; + } + is_retry = false; + if (!rl) { + down_read(&ni->runlist.lock); +retry_remap: + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target cluster. */ + while (rl->length && rl[1].vcn <= bh_cpos) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); + if (likely(lcn >= 0)) { + /* + * Successful remap, setup the map cache and + * use that to deal with the buffer. + */ + was_hole = false; + vcn = bh_cpos; + vcn_len = rl[1].vcn - vcn; + lcn_block = lcn << (vol->cluster_size_bits - + blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters touched + * by the write is smaller or equal to the + * number of cached clusters, unlock the + * runlist as the map cache will be used from + * now on. + */ + if (likely(vcn + vcn_len >= cend)) { + if (rl_write_locked) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else + up_read(&ni->runlist.lock); + rl = NULL; + } + goto map_buffer_cached; + } + } else + lcn = LCN_RL_NOT_MAPPED; + /* + * If it is not a hole and not out of bounds, the runlist is + * probably unmapped so try to map it now. + */ + if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { + if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { + /* Attempt to map runlist. */ + if (!rl_write_locked) { + /* + * We need the runlist locked for + * writing, so if it is locked for + * reading relock it now and retry in + * case it changed whilst we dropped + * the lock. + */ + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + err = ntfs_map_runlist_nolock(ni, bh_cpos, + NULL); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + /* + * If @vcn is out of bounds, pretend @lcn is + * LCN_ENOENT. As long as the buffer is out + * of bounds this will work fine. + */ + if (err == -ENOENT) { + lcn = LCN_ENOENT; + err = 0; + goto rl_not_mapped_enoent; + } + } else + err = -EIO; + /* Failed to map the buffer, even after retrying. */ + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "vcn offset 0x%x, because its " + "location on disk could not be " + "determined%s (error code %i).", + ni->mft_no, ni->type, + (unsigned long long)bh_cpos, + (unsigned)bh_pos & + vol->cluster_size_mask, + is_retry ? " even after retrying" : "", + err); + break; + } +rl_not_mapped_enoent: + /* + * The buffer is in a hole or out of bounds. We need to fill + * the hole, unless the buffer is in a cluster which is not + * touched by the write, in which case we just leave the buffer + * unmapped. This can only happen when the cluster size is + * less than the page cache size. + */ + if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) { + bh_cend = (bh_end + vol->cluster_size - 1) >> + vol->cluster_size_bits; + if ((bh_cend <= cpos || bh_cpos >= cend)) { + bh->b_blocknr = -1; + /* + * If the buffer is uptodate we skip it. If it + * is not but the page is uptodate, we can set + * the buffer uptodate. If the page is not + * uptodate, we can clear the buffer and set it + * uptodate. Whether this is worthwhile is + * debatable and this could be removed. + */ + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + continue; + } + } + /* + * Out of bounds buffer is invalid if it was not really out of + * bounds. + */ + BUG_ON(lcn != LCN_HOLE); + /* + * We need the runlist locked for writing, so if it is locked + * for reading relock it now and retry in case it changed + * whilst we dropped the lock. + */ + BUG_ON(!rl); + if (!rl_write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + /* Find the previous last allocated cluster. */ + BUG_ON(rl->lcn != LCN_HOLE); + lcn = -1; + rl2 = rl; + while (--rl2 >= ni->runlist.rl) { + if (rl2->lcn >= 0) { + lcn = rl2->lcn + rl2->length; + break; + } + } + rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, + false); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + ntfs_debug("Failed to allocate cluster, error code %i.", + err); + break; + } + lcn = rl2->lcn; + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + break; + } + ni->runlist.rl = rl; + status.runlist_merged = 1; + ntfs_debug("Allocated cluster, lcn 0x%llx.", + (unsigned long long)lcn); + /* Map and lock the mft record and get the attribute record. */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + break; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + unmap_mft_record(base_ni); + break; + } + status.mft_attr_mapped = 1; + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + break; + } + m = ctx->mrec; + a = ctx->attr; + /* + * Find the runlist element with which the attribute extent + * starts. Note, we cannot use the _attr_ version because we + * have mapped the mft record. That is ok because we know the + * runlist fragment must be mapped already to have ever gotten + * here, so we can just use the _rl_ version. + */ + vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + /* + * If @highest_vcn is zero, calculate the real highest_vcn + * (which can really be zero). + */ + if (!highest_vcn) + highest_vcn = (sle64_to_cpu( + a->data.non_resident.allocated_size) >> + vol->cluster_size_bits) - 1; + /* + * Determine the size of the mapping pairs array for the new + * extent, i.e. the old extent with the hole filled. + */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, + highest_vcn); + if (unlikely(mp_size <= 0)) { + if (!(err = mp_size)) + err = -EIO; + ntfs_debug("Failed to get size for mapping pairs " + "array, error code %i.", err); + break; + } + /* + * Resize the attribute record to fit the new mapping pairs + * array. + */ + attr_rec_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by using the current attribute + // and fill it with as much of the mapping pairs + // array as possible. Then loop over each attribute + // extent rewriting the mapping pairs arrays as we go + // along and if when we reach the end we have not + // enough space, try to resize the last attribute + // extent and if even that fails, add a new attribute + // extent. + // We could also try to resize at each step in the hope + // that we will not need to rewrite every single extent. + // Note, we may need to decompress some extents to fill + // the runlist as we are walking the extents... + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + break ; + } + status.mp_rebuilt = 1; + /* + * Generate the mapping pairs array directly into the attribute + * record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, vcn, highest_vcn, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " + "attribute type 0x%x, because building " + "the mapping pairs failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + break; + } + /* Update the highest_vcn but only if it was not set. */ + if (unlikely(!a->data.non_resident.highest_vcn)) + a->data.non_resident.highest_vcn = + cpu_to_sle64(highest_vcn); + /* + * If the attribute is sparse/compressed, update the compressed + * size in the ntfs_inode structure and the attribute record. + */ + if (likely(NInoSparse(ni) || NInoCompressed(ni))) { + /* + * If we are not in the first attribute extent, switch + * to it, but first ensure the changes will make it to + * disk later. + */ + if (a->data.non_resident.lowest_vcn) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, + ni->name_len, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + status.attr_switched = 1; + break; + } + /* @m is not used any more so do not set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + /* Successfully filled the hole. */ + status.runlist_merged = 0; + status.mft_attr_mapped = 0; + status.mp_rebuilt = 0; + /* Setup the map cache and use that to deal with the buffer. */ + was_hole = true; + vcn = bh_cpos; + vcn_len = 1; + lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters in the @pages is smaller + * or equal to the number of cached clusters, unlock the + * runlist as the map cache will be used from now on. + */ + if (likely(vcn + vcn_len >= cend)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + rl = NULL; + } + goto map_buffer_cached; + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* If there are no errors, do the next page. */ + if (likely(!err && ++u < nr_pages)) + goto do_next_page; + /* If there are no errors, release the runlist lock if we took it. */ + if (likely(!err)) { + if (unlikely(rl_write_locked)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else if (unlikely(rl)) + up_read(&ni->runlist.lock); + rl = NULL; + } + /* If we issued read requests, let them complete. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + while (wait_bh > wait) { + bh = *--wait_bh; + wait_on_buffer(bh); + if (likely(buffer_uptodate(bh))) { + page = bh->b_page; + bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); + /* + * If the buffer overflows the initialized size, need + * to zero the overflowing region. + */ + if (unlikely(bh_pos + blocksize > initialized_size)) { + int ofs = 0; + + if (likely(bh_pos < initialized_size)) + ofs = initialized_size - bh_pos; + zero_user_segment(page, bh_offset(bh) + ofs, + blocksize); + } + } else /* if (unlikely(!buffer_uptodate(bh))) */ + err = -EIO; + } + if (likely(!err)) { + /* Clear buffer_new on all buffers. */ + u = 0; + do { + bh = head = page_buffers(pages[u]); + do { + if (buffer_new(bh)) + clear_buffer_new(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u < nr_pages); + ntfs_debug("Done."); + return err; + } + if (status.attr_switched) { + /* Get back to the attribute extent we modified. */ + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find required " + "attribute extent of attribute in " + "error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* + * The only thing that is now wrong is the compressed + * size of the base attribute extent which chkdsk + * should be able to fix. + */ + NVolSetErrors(vol); + } else { + m = ctx->mrec; + a = ctx->attr; + status.attr_switched = 0; + } + } + /* + * If the runlist has been modified, need to restore it by punching a + * hole into it and we then need to deallocate the on-disk cluster as + * well. Note, we only modify the runlist if we are able to generate a + * new mapping pairs array, i.e. only when the mapped attribute extent + * is not switched. + */ + if (status.runlist_merged && !status.attr_switched) { + BUG_ON(!rl_write_locked); + /* Make the file cluster we allocated sparse in the runlist. */ + if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { + ntfs_error(vol->sb, "Failed to punch hole into " + "attribute runlist in error code " + "path. Run chkdsk to recover the " + "lost cluster."); + NVolSetErrors(vol); + } else /* if (success) */ { + status.runlist_merged = 0; + /* + * Deallocate the on-disk cluster we allocated but only + * if we succeeded in punching its vcn out of the + * runlist. + */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + } + } + /* + * Resize the attribute record to its old size and rebuild the mapping + * pairs array. Note, we only can do this if the runlist has been + * restored to its old state which also implies that the mapped + * attribute extent is not switched. + */ + if (status.mp_rebuilt && !status.runlist_merged) { + if (ntfs_attr_record_resize(m, a, attr_rec_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), attr_rec_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), ni->runlist.rl, + vcn, highest_vcn, NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } + /* Release the mft record and the attribute. */ + if (status.mft_attr_mapped) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } + /* Release the runlist lock. */ + if (rl_write_locked) + up_write(&ni->runlist.lock); + else if (rl) + up_read(&ni->runlist.lock); + /* + * Zero out any newly allocated blocks to avoid exposing stale data. + * If BH_New is set, we know that the block was newly allocated above + * and that it has not been fully zeroed and marked dirty yet. + */ + nr_pages = u; + u = 0; + end = bh_cpos << vol->cluster_size_bits; + do { + page = pages[u]; + bh = head = page_buffers(page); + do { + if (u == nr_pages && + ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh) >= end) + break; + if (!buffer_new(bh)) + continue; + clear_buffer_new(bh); + if (!buffer_uptodate(bh)) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + mark_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u <= nr_pages); + ntfs_error(vol->sb, "Failed. Returning error code %i.", err); + return err; +} + +static inline void ntfs_flush_dcache_pages(struct page **pages, + unsigned nr_pages) +{ + BUG_ON(!nr_pages); + /* + * Warning: Do not do the decrement at the same time as the call to + * flush_dcache_page() because it is a NULL macro on i386 and hence the + * decrement never happens so the loop never terminates. + */ + do { + --nr_pages; + flush_dcache_page(pages[nr_pages]); + } while (nr_pages > 0); +} + +/** + * ntfs_commit_pages_after_non_resident_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * See description of ntfs_commit_pages_after_write(), below. + */ +static inline int ntfs_commit_pages_after_non_resident_write( + struct page **pages, const unsigned nr_pages, + s64 pos, size_t bytes) +{ + s64 end, initialized_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct buffer_head *bh, *head; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + unsigned long flags; + unsigned blocksize, u; + int err; + + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + blocksize = vi->i_sb->s_blocksize; + end = pos + bytes; + u = 0; + do { + s64 bh_pos; + struct page *page; + bool partial; + + page = pages[u]; + bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; + bh = head = page_buffers(page); + partial = false; + do { + s64 bh_end; + + bh_end = bh_pos + blocksize; + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) + partial = true; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* + * If all buffers are now uptodate but the page is not, set the + * page uptodate. + */ + if (!partial && !PageUptodate(page)) + SetPageUptodate(page); + } while (++u < nr_pages); + /* + * Finally, if we do not need to update initialized_size or i_size we + * are finished. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end <= initialized_size) { + ntfs_debug("Done."); + return 0; + } + /* + * Update initialized_size/i_size as appropriate, both in the inode and + * the mft record. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + BUG_ON(!NInoNonResident(ni)); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(!a->non_resident); + write_lock_irqsave(&ni->size_lock, flags); + BUG_ON(end > ni->allocated_size); + ni->initialized_size = end; + a->data.non_resident.initialized_size = cpu_to_sle64(end); + if (end > i_size_read(vi)) { + i_size_write(vi, end); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size; + } + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " + "code %i).", err); + if (err != -ENOMEM) + NVolSetErrors(ni->vol); + return err; +} + +/** + * ntfs_commit_pages_after_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called from ntfs_file_buffered_write() with i_mutex held on the inode + * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are + * locked but not kmap()ped. The source data has already been copied into the + * @page. ntfs_prepare_pages_for_non_resident_write() has been called before + * the data was copied (for non-resident attributes only) and it returned + * success. + * + * Need to set uptodate and mark dirty all buffers within the boundary of the + * write. If all buffers in a page are uptodate we set the page uptodate, too. + * + * Setting the buffers dirty ensures that they get written out later when + * ntfs_writepage() is invoked by the VM. + * + * Finally, we need to update i_size and initialized_size as appropriate both + * in the inode and the mft record. + * + * This is modelled after fs/buffer.c::generic_commit_write(), which marks + * buffers uptodate and dirty, sets the page uptodate if all buffers in the + * page are uptodate, and updates i_size if the end of io is beyond i_size. In + * that case, it also marks the inode dirty. + * + * If things have gone as outlined in + * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page + * content modifications here for non-resident attributes. For resident + * attributes we need to do the uptodate bringing here which we combine with + * the copying into the mft record which means we save one atomic kmap. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_commit_pages_after_write(struct page **pages, + const unsigned nr_pages, s64 pos, size_t bytes) +{ + s64 end, initialized_size; + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct page *page; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + char *kattr, *kaddr; + unsigned long flags; + u32 attr_len; + int err; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + page = pages[0]; + BUG_ON(!page); + vi = page->mapping->host; + ni = NTFS_I(vi); + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, page->index, nr_pages, + (long long)pos, bytes); + if (NInoNonResident(ni)) + return ntfs_commit_pages_after_non_resident_write(pages, + nr_pages, pos, bytes); + BUG_ON(nr_pages > 1); + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * sparse. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + BUG_ON(NInoNonResident(ni)); + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + i_size = i_size_read(vi); + BUG_ON(attr_len != i_size); + BUG_ON(pos > attr_len); + end = pos + bytes; + BUG_ON(end > le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset)); + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + kaddr = kmap_atomic(page); + /* Copy the received data from the page to the mft record. */ + memcpy(kattr + pos, kaddr + pos, bytes); + /* Update the attribute length if necessary. */ + if (end > attr_len) { + attr_len = end; + a->data.resident.value_length = cpu_to_le32(attr_len); + } + /* + * If the page is not uptodate, bring the out of bounds area(s) + * uptodate by copying data from the mft record to the page. + */ + if (!PageUptodate(page)) { + if (pos > 0) + memcpy(kaddr, kattr, pos); + if (end < attr_len) + memcpy(kaddr + end, kattr + end, attr_len - end); + /* Zero the region outside the end of the attribute value. */ + memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + flush_dcache_page(page); + SetPageUptodate(page); + } + kunmap_atomic(kaddr); + /* Update initialized_size/i_size if necessary. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + BUG_ON(end > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + BUG_ON(initialized_size != i_size); + if (end > initialized_size) { + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = end; + i_size_write(vi, end); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory required to " + "commit the write."); + if (PageUptodate(page)) { + ntfs_warning(vi->i_sb, "Page is uptodate, setting " + "dirty so the write will be retried " + "later on by the VM."); + /* + * Put the page on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + __set_page_dirty_nobuffers(page); + err = 0; + } else + ntfs_error(vi->i_sb, "Page is not uptodate. Written " + "data has been lost."); + } else { + ntfs_error(vi->i_sb, "Resident attribute commit write failed " + "with error %i.", err); + NVolSetErrors(ni->vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +/* + * Copy as much as we can into the pages and return the number of bytes which + * were successfully copied. If a fault is encountered then clear the pages + * out to (ofs + bytes) and return the number of bytes which were copied. + */ +static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, + unsigned ofs, struct iov_iter *i, size_t bytes) +{ + struct page **last_page = pages + nr_pages; + size_t total = 0; + struct iov_iter data = *i; + unsigned len, copied; + + do { + len = PAGE_CACHE_SIZE - ofs; + if (len > bytes) + len = bytes; + copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs, + len); + total += copied; + bytes -= copied; + if (!bytes) + break; + iov_iter_advance(&data, copied); + if (copied < len) + goto err; + ofs = 0; + } while (++pages < last_page); +out: + return total; +err: + /* Zero the rest of the target like __copy_from_user(). */ + len = PAGE_CACHE_SIZE - copied; + do { + if (len > bytes) + len = bytes; + zero_user(*pages, copied, len); + bytes -= len; + copied = 0; + len = PAGE_CACHE_SIZE; + } while (++pages < last_page); + goto out; +} + +/** + * ntfs_perform_write - perform buffered write to a file + * @file: file to write to + * @i: iov_iter with data to write + * @pos: byte offset in file at which to begin writing to + */ +static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, + loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + struct inode *vi = mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; + struct page *cached_page = NULL; + VCN last_vcn; + LCN lcn; + size_t bytes; + ssize_t status, written = 0; + unsigned nr_pages; + + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " + "0x%llx, count 0x%lx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)pos, + (unsigned long)iov_iter_count(i)); + /* + * If a previous ntfs_truncate() failed, repeat it and abort if it + * fails again. + */ + if (unlikely(NInoTruncateFailed(ni))) { + int err; + + inode_dio_wait(vi); + err = ntfs_truncate(vi); + if (err || NInoTruncateFailed(ni)) { + if (!err) + err = -EIO; + ntfs_error(vol->sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "ntfs_truncate() failed (error code " + "%i).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + return err; + } + } + /* + * Determine the number of pages per cluster for non-resident + * attributes. + */ + nr_pages = 1; + if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni)) + nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT; + last_vcn = -1; + do { + VCN vcn; + pgoff_t idx, start_idx; + unsigned ofs, do_pages, u; + size_t copied; + + start_idx = idx = pos >> PAGE_CACHE_SHIFT; + ofs = pos & ~PAGE_CACHE_MASK; + bytes = PAGE_CACHE_SIZE - ofs; + do_pages = 1; + if (nr_pages > 1) { + vcn = pos >> vol->cluster_size_bits; + if (vcn != last_vcn) { + last_vcn = vcn; + /* + * Get the lcn of the vcn the write is in. If + * it is a hole, need to lock down all pages in + * the cluster. + */ + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> + vol->cluster_size_bits, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + if (lcn == LCN_ENOMEM) + status = -ENOMEM; + else { + status = -EIO; + ntfs_error(vol->sb, "Cannot " + "perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because the attribute " + "is corrupt.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } + break; + } + if (lcn == LCN_HOLE) { + start_idx = (pos & ~(s64) + vol->cluster_size_mask) + >> PAGE_CACHE_SHIFT; + bytes = vol->cluster_size - (pos & + vol->cluster_size_mask); + do_pages = nr_pages; + } + } + } + if (bytes > iov_iter_count(i)) + bytes = iov_iter_count(i); +again: + /* + * Bring in the user page(s) that we will copy from _first_. + * Otherwise there is a nasty deadlock on copying from the same + * page(s) as we are writing to, without it/them being marked + * up-to-date. Note, at present there is nothing to stop the + * pages being swapped out between us bringing them into memory + * and doing the actual copying. + */ + if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) { + status = -EFAULT; + break; + } + /* Get and lock @do_pages starting at index @start_idx. */ + status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, + pages, &cached_page); + if (unlikely(status)) + break; + /* + * For non-resident attributes, we need to fill any holes with + * actual clusters and ensure all bufferes are mapped. We also + * need to bring uptodate any buffers that are only partially + * being written to. + */ + if (NInoNonResident(ni)) { + status = ntfs_prepare_pages_for_non_resident_write( + pages, do_pages, pos, bytes); + if (unlikely(status)) { + do { + unlock_page(pages[--do_pages]); + page_cache_release(pages[do_pages]); + } while (do_pages); + break; + } + } + u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index; + copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, + i, bytes); + ntfs_flush_dcache_pages(pages + u, do_pages - u); + status = 0; + if (likely(copied == bytes)) { + status = ntfs_commit_pages_after_write(pages, do_pages, + pos, bytes); + if (!status) + status = bytes; + } + do { + unlock_page(pages[--do_pages]); + page_cache_release(pages[do_pages]); + } while (do_pages); + if (unlikely(status < 0)) + break; + copied = status; + cond_resched(); + if (unlikely(!copied)) { + size_t sc; + + /* + * We failed to copy anything. Fall back to single + * segment length write. + * + * This is needed to avoid possible livelock in the + * case that all segments in the iov cannot be copied + * at once without a pagefault. + */ + sc = iov_iter_single_seg_count(i); + if (bytes > sc) + bytes = sc; + goto again; + } + iov_iter_advance(i, copied); + pos += copied; + written += copied; + balance_dirty_pages_ratelimited(mapping); + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + } while (iov_iter_count(i)); + if (cached_page) + page_cache_release(cached_page); + ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", + written ? "written" : "status", (unsigned long)written, + (long)status); + return written ? written : status; +} + +/** + * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock() + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Basically the same as generic_file_write_iter() except that it ends up + * up calling ntfs_perform_write() instead of generic_perform_write() and that + * O_DIRECT is not implemented. + */ +static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *vi = file_inode(file); + ssize_t written = 0; + ssize_t err; + + mutex_lock(&vi->i_mutex); + /* We can write back this queue in page reclaim. */ + current->backing_dev_info = inode_to_bdi(vi); + err = ntfs_prepare_file_for_write(iocb, from); + if (iov_iter_count(from) && !err) + written = ntfs_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + mutex_unlock(&vi->i_mutex); + if (likely(written > 0)) { + err = generic_write_sync(file, iocb->ki_pos, written); + if (err < 0) + written = 0; + } + iocb->ki_pos += written; + return written ? written : err; +} + +/** + * ntfs_file_fsync - sync a file to disk + * @filp: file to be synced + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync + * system calls. This function is inspired by fs/buffer.c::file_fsync(). + * + * If @datasync is false, write the mft record and all associated extent mft + * records as well as the $DATA attribute and then sync the block device. + * + * If @datasync is true and the attribute is non-resident, we skip the writing + * of the mft record and all associated extent mft records (this might still + * happen due to the write_inode_now() call). + * + * Also, if @datasync is true, we do not wait on the inode to be written out + * but we always wait on the page cache pages to be written out. + * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. + */ +static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *vi = filp->f_mapping->host; + int err, ret = 0; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + + err = filemap_write_and_wait_range(vi->i_mapping, start, end); + if (err) + return err; + mutex_lock(&vi->i_mutex); + + BUG_ON(S_ISDIR(vi->i_mode)); + if (!datasync || !NInoNonResident(NTFS_I(vi))) + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + /* + * NOTE: If we were to use mapping->private_list (see ext2 and + * fs/buffer.c) for dirty blocks then we could optimize the below to be + * sync_mapping_buffers(vi->i_mapping). + */ + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? "data" : "", vi->i_ino, -ret); + mutex_unlock(&vi->i_mutex); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_file_ops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, +#ifdef NTFS_RW + .write_iter = ntfs_file_write_iter, + .fsync = ntfs_file_fsync, +#endif /* NTFS_RW */ + .mmap = generic_file_mmap, + .open = ntfs_file_open, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations ntfs_file_inode_ops = { +#ifdef NTFS_RW + .setattr = ntfs_setattr, +#endif /* NTFS_RW */ +}; + +const struct file_operations ntfs_empty_file_ops = {}; + +const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/kernel/fs/ntfs/index.c b/kernel/fs/ntfs/index.c new file mode 100644 index 000000000..096c13569 --- /dev/null +++ b/kernel/fs/ntfs/index.c @@ -0,0 +1,454 @@ +/* + * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include "aops.h" +#include "collate.h" +#include "debug.h" +#include "index.h" +#include "ntfs.h" + +/** + * ntfs_index_ctx_get - allocate and initialize a new index context + * @idx_ni: ntfs index inode with which to initialize the context + * + * Allocate a new index context, initialize it with @idx_ni and return it. + * Return NULL if allocation failed. + * + * Locking: Caller must hold i_mutex on the index inode. + */ +ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) +{ + ntfs_index_context *ictx; + + ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); + if (ictx) + *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; + return ictx; +} + +/** + * ntfs_index_ctx_put - release an index context + * @ictx: index context to free + * + * Release the index context @ictx, releasing all associated resources. + * + * Locking: Caller must hold i_mutex on the index inode. + */ +void ntfs_index_ctx_put(ntfs_index_context *ictx) +{ + if (ictx->entry) { + if (ictx->is_in_root) { + if (ictx->actx) + ntfs_attr_put_search_ctx(ictx->actx); + if (ictx->base_ni) + unmap_mft_record(ictx->base_ni); + } else { + struct page *page = ictx->page; + if (page) { + BUG_ON(!PageLocked(page)); + unlock_page(page); + ntfs_unmap_page(page); + } + } + } + kmem_cache_free(ntfs_index_ctx_cache, ictx); + return; +} + +/** + * ntfs_index_lookup - find a key in an index and return its index entry + * @key: [IN] key for which to search in the index + * @key_len: [IN] length of @key in bytes + * @ictx: [IN/OUT] context describing the index and the returned entry + * + * Before calling ntfs_index_lookup(), @ictx must have been obtained from a + * call to ntfs_index_ctx_get(). + * + * Look for the @key in the index specified by the index lookup context @ictx. + * ntfs_index_lookup() walks the contents of the index looking for the @key. + * + * If the @key is found in the index, 0 is returned and @ictx is setup to + * describe the index entry containing the matching @key. @ictx->entry is the + * index entry and @ictx->data and @ictx->data_len are the index entry data and + * its length in bytes, respectively. + * + * If the @key is not found in the index, -ENOENT is returned and @ictx is + * setup to describe the index entry whose key collates immediately after the + * search @key, i.e. this is the position in the index at which an index entry + * with a key of @key would need to be inserted. + * + * If an error occurs return the negative error code and @ictx is left + * untouched. + * + * When finished with the entry and its data, call ntfs_index_ctx_put() to free + * the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. + * + * Locking: - Caller must hold i_mutex on the index inode. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx) +{ + VCN vcn, old_vcn; + ntfs_inode *idx_ni = ictx->idx_ni; + ntfs_volume *vol = idx_ni->vol; + struct super_block *sb = vol->sb; + ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end, *kaddr; + ntfs_attr_search_ctx *actx; + struct address_space *ia_mapping; + struct page *page; + int rc, err = 0; + + ntfs_debug("Entering."); + BUG_ON(!NInoAttr(idx_ni)); + BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); + BUG_ON(idx_ni->nr_extents != -1); + BUG_ON(!base_ni); + BUG_ON(!key); + BUG_ON(key_len <= 0); + if (!ntfs_is_collation_rule_supported( + idx_ni->itype.index.collation_rule)) { + ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " + "Aborting lookup.", le32_to_cpu( + idx_ni->itype.index.collation_rule)); + return -EOPNOTSUPP; + } + /* Get hold of the mft record for the index inode. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return PTR_ERR(m); + } + actx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!actx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, actx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in inode " + "0x%lx.", idx_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it has been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)actx->attr + + le16_to_cpu(actx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->length) > index_end) + goto idx_err_out; + /* + * The last entry cannot contain a key. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Further bounds checks. */ + if ((u32)sizeof(INDEX_ENTRY_HEADER) + + le16_to_cpu(ie->key_length) > + le16_to_cpu(ie->data.vi.data_offset) || + (u32)le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length) > + le16_to_cpu(ie->length)) + goto idx_err_out; + /* If the keys match perfectly, we setup @ictx and return 0. */ + if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, + &ie->key, key_len)) { +ir_done: + ictx->is_in_root = true; + ictx->ir = ir; + ictx->actx = actx; + ictx->base_ni = base_ni; + ictx->ia = NULL; + ictx->page = NULL; +done: + ictx->entry = ie; + ictx->data = (u8*)ie + + le16_to_cpu(ie->data.vi.data_offset); + ictx->data_len = le16_to_cpu(ie->data.vi.data_length); + ntfs_debug("Done."); + return err; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, + key_len, &ie->key, le16_to_cpu(ie->key_length)); + /* + * If @key collates before the key of the current entry, there + * is definitely no such key in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* + * A match should never happen as the memcmp() call should have + * cought it, but we still treat it correctly. + */ + if (!rc) + goto ir_done; + /* The keys are not equal, continue the search. */ + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present setup @ictx and return + * -ENOENT. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ntfs_debug("Entry not found."); + err = -ENOENT; + goto ir_done; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(idx_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Inode 0x%lx is corrupt or " + "driver bug.", idx_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(idx_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(actx); + unmap_mft_record(base_ni); + m = NULL; + actx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt inode " + "0x%lx or driver bug.", idx_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " + "Corrupt inode 0x%lx. Run chkdsk.", + (long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). Inode " + "0x%lx is corrupt or driver bug.", + (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " + "a size (%u) differing from the index " + "specified size (%u). Inode is corrupt or " + "driver bug.", (unsigned long long)vcn, + idx_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + idx_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + idx_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " + "crosses page boundary. Impossible! Cannot " + "access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + idx_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " + "0x%lx exceeds maximum size.", + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->length) > index_end) { + ntfs_error(sb, "Index entry out of bounds in inode " + "0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a key. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Further bounds checks. */ + if ((u32)sizeof(INDEX_ENTRY_HEADER) + + le16_to_cpu(ie->key_length) > + le16_to_cpu(ie->data.vi.data_offset) || + (u32)le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length) > + le16_to_cpu(ie->length)) { + ntfs_error(sb, "Index entry out of bounds in inode " + "0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* If the keys match perfectly, we setup @ictx and return 0. */ + if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, + &ie->key, key_len)) { +ia_done: + ictx->is_in_root = false; + ictx->actx = NULL; + ictx->base_ni = NULL; + ictx->ia = ia; + ictx->page = page; + goto done; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, + key_len, &ie->key, le16_to_cpu(ie->key_length)); + /* + * If @key collates before the key of the current entry, there + * is definitely no such key in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* + * A match should never happen as the memcmp() call should have + * cought it, but we still treat it correctly. + */ + if (!rc) + goto ia_done; + /* The keys are not equal, continue the search. */ + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node and if not present return -ENOENT. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ntfs_debug("Entry not found."); + err = -ENOENT; + goto ia_done; + } + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in a leaf " + "node in inode 0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* + * If vcn is in the same page cache page as old_vcn we recycle + * the mapped page. + */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", + idx_ni->mft_no); +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (actx) + ntfs_attr_put_search_ctx(actx); + if (m) + unmap_mft_record(base_ni); + return err; +idx_err_out: + ntfs_error(sb, "Corrupt index. Aborting lookup."); + goto err_out; +} diff --git a/kernel/fs/ntfs/index.h b/kernel/fs/ntfs/index.h new file mode 100644 index 000000000..8745469c3 --- /dev/null +++ b/kernel/fs/ntfs/index.h @@ -0,0 +1,148 @@ +/* + * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_INDEX_H +#define _LINUX_NTFS_INDEX_H + +#include + +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "attrib.h" +#include "mft.h" +#include "aops.h" + +/** + * @idx_ni: index inode containing the @entry described by this context + * @entry: index entry (points into @ir or @ia) + * @data: index entry data (points into @entry) + * @data_len: length in bytes of @data + * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia + * @ir: index root if @is_in_root and NULL otherwise + * @actx: attribute search context if @is_in_root and NULL otherwise + * @base_ni: base inode if @is_in_root and NULL otherwise + * @ia: index block if @is_in_root is 'false' and NULL otherwise + * @page: page if @is_in_root is 'false' and NULL otherwise + * + * @idx_ni is the index inode this context belongs to. + * + * @entry is the index entry described by this context. @data and @data_len + * are the index entry data and its length in bytes, respectively. @data + * simply points into @entry. This is probably what the user is interested in. + * + * If @is_in_root is 'true', @entry is in the index root attribute @ir described + * by the attribute search context @actx and the base inode @base_ni. @ia and + * @page are NULL in this case. + * + * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia + * and @page point to the index allocation block and the mapped, locked page it + * is in, respectively. @ir, @actx and @base_ni are NULL in this case. + * + * To obtain a context call ntfs_index_ctx_get(). + * + * We use this context to allow ntfs_index_lookup() to return the found index + * @entry and its @data without having to allocate a buffer and copy the @entry + * and/or its @data into it. + * + * When finished with the @entry and its @data, call ntfs_index_ctx_put() to + * free the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. + */ +typedef struct { + ntfs_inode *idx_ni; + INDEX_ENTRY *entry; + void *data; + u16 data_len; + bool is_in_root; + INDEX_ROOT *ir; + ntfs_attr_search_ctx *actx; + ntfs_inode *base_ni; + INDEX_ALLOCATION *ia; + struct page *page; +} ntfs_index_context; + +extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); +extern void ntfs_index_ctx_put(ntfs_index_context *ictx); + +extern int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx); + +#ifdef NTFS_RW + +/** + * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries + * @ictx: ntfs index context describing the index entry + * + * Call flush_dcache_page() for the page in which an index entry resides. + * + * This must be called every time an index entry is modified, just after the + * modification. + * + * If the index entry is in the index root attribute, simply flush the page + * containing the mft record containing the index root attribute. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, simply flush the page cache page containing the index block. + */ +static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + flush_dcache_mft_record_page(ictx->actx->ntfs_ino); + else + flush_dcache_page(ictx->page); +} + +/** + * ntfs_index_entry_mark_dirty - mark an index entry dirty + * @ictx: ntfs index context describing the index entry + * + * Mark the index entry described by the index entry context @ictx dirty. + * + * If the index entry is in the index root attribute, simply mark the mft + * record containing the index root attribute dirty. This ensures the mft + * record, and hence the index root attribute, will be written out to disk + * later. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, mark the buffers belonging to the index record as well as the + * page cache page the index block is in dirty. This automatically marks the + * VFS inode of the ntfs index inode to which the index entry belongs dirty, + * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the + * dirty index block, will be written out to disk later. + */ +static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + mark_mft_record_dirty(ictx->actx->ntfs_ino); + else + mark_ntfs_record_dirty(ictx->page, + (u8*)ictx->ia - (u8*)page_address(ictx->page)); +} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INDEX_H */ diff --git a/kernel/fs/ntfs/inode.c b/kernel/fs/ntfs/inode.c new file mode 100644 index 000000000..d284f07ed --- /dev/null +++ b/kernel/fs/ntfs/inode.c @@ -0,0 +1,3111 @@ +/** + * inode.c - NTFS kernel inode handling. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aops.h" +#include "attrib.h" +#include "bitmap.h" +#include "dir.h" +#include "debug.h" +#include "inode.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "time.h" +#include "ntfs.h" + +/** + * ntfs_test_inode - compare two (possibly fake) inodes for equality + * @vi: vfs inode which to test + * @na: ntfs attribute which is being tested with + * + * Compare the ntfs attribute embedded in the ntfs specific part of the vfs + * inode @vi for equality with the ntfs attribute @na. + * + * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. + * @na->name and @na->name_len are then ignored. + * + * Return 1 if the attributes match and 0 if not. + * + * NOTE: This function runs with the inode_hash_lock spin lock held so it is not + * allowed to sleep. + */ +int ntfs_test_inode(struct inode *vi, ntfs_attr *na) +{ + ntfs_inode *ni; + + if (vi->i_ino != na->mft_no) + return 0; + ni = NTFS_I(vi); + /* If !NInoAttr(ni), @vi is a normal file or directory inode. */ + if (likely(!NInoAttr(ni))) { + /* If not looking for a normal inode this is a mismatch. */ + if (unlikely(na->type != AT_UNUSED)) + return 0; + } else { + /* A fake inode describing an attribute. */ + if (ni->type != na->type) + return 0; + if (ni->name_len != na->name_len) + return 0; + if (na->name_len && memcmp(ni->name, na->name, + na->name_len * sizeof(ntfschar))) + return 0; + } + /* Match! */ + return 1; +} + +/** + * ntfs_init_locked_inode - initialize an inode + * @vi: vfs inode to initialize + * @na: ntfs attribute which to initialize @vi to + * + * Initialize the vfs inode @vi with the values from the ntfs attribute @na in + * order to enable ntfs_test_inode() to do its work. + * + * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. + * In that case, @na->name and @na->name_len should be set to NULL and 0, + * respectively. Although that is not strictly necessary as + * ntfs_read_locked_inode() will fill them in later. + * + * Return 0 on success and -errno on error. + * + * NOTE: This function runs with the inode->i_lock spin lock held so it is not + * allowed to sleep. (Hence the GFP_ATOMIC allocation.) + */ +static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) +{ + ntfs_inode *ni = NTFS_I(vi); + + vi->i_ino = na->mft_no; + + ni->type = na->type; + if (na->type == AT_INDEX_ALLOCATION) + NInoSetMstProtected(ni); + + ni->name = na->name; + ni->name_len = na->name_len; + + /* If initializing a normal inode, we are done. */ + if (likely(na->type == AT_UNUSED)) { + BUG_ON(na->name); + BUG_ON(na->name_len); + return 0; + } + + /* It is a fake inode. */ + NInoSetAttr(ni); + + /* + * We have I30 global constant as an optimization as it is the name + * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC + * allocation but that is ok. And most attributes are unnamed anyway, + * thus the fraction of named attributes with name != I30 is actually + * absolutely tiny. + */ + if (na->name_len && na->name != I30) { + unsigned int i; + + BUG_ON(!na->name); + i = na->name_len * sizeof(ntfschar); + ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); + if (!ni->name) + return -ENOMEM; + memcpy(ni->name, na->name, i); + ni->name[na->name_len] = 0; + } + return 0; +} + +typedef int (*set_t)(struct inode *, void *); +static int ntfs_read_locked_inode(struct inode *vi); +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); +static int ntfs_read_locked_index_inode(struct inode *base_vi, + struct inode *vi); + +/** + * ntfs_iget - obtain a struct inode corresponding to a specific normal inode + * @sb: super block of mounted volume + * @mft_no: mft record number / inode number to obtain + * + * Obtain the struct inode corresponding to a specific normal inode (i.e. a + * file or directory). + * + * If the inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and initialized, + * and finally ntfs_read_locked_inode() is called to read in the inode and + * fill in the remainder of the inode structure. + * + * Return the struct inode on success. Check the return value with IS_ERR() and + * if true, the function failed and the error code is obtained from PTR_ERR(). + */ +struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = mft_no; + na.type = AT_UNUSED; + na.name = NULL; + na.name_len = 0; + + vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_inode(vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad inodes around if the failure was + * due to ENOMEM. We want to be able to retry again later. + */ + if (unlikely(err == -ENOMEM)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_attr_iget - obtain a struct inode corresponding to an attribute + * @base_vi: vfs base inode containing the attribute + * @type: attribute type + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * + * Obtain the (fake) struct inode corresponding to the attribute specified by + * @type, @name, and @name_len, which is present in the base mft record + * specified by the vfs inode @base_vi. + * + * If the attribute inode is in the cache, it is just returned with an + * increased reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_attr_inode() is called to read the + * attribute and fill in the inode structure. + * + * Note, for index allocation attributes, you need to use ntfs_index_iget() + * instead of ntfs_attr_iget() as working with indices is a lot more complex. + * + * Return the struct inode of the attribute inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + /* Make sure no one calls ntfs_attr_iget() for indices. */ + BUG_ON(type == AT_INDEX_ALLOCATION); + + na.mft_no = base_vi->i_ino; + na.type = type; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_attr_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad attribute inodes around. This also + * simplifies things in that we never need to check for bad attribute + * inodes elsewhere. + */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_index_iget - obtain a struct inode corresponding to an index + * @base_vi: vfs base inode containing the index related attributes + * @name: Unicode name of the index + * @name_len: length of @name in Unicode characters + * + * Obtain the (fake) struct inode corresponding to the index specified by @name + * and @name_len, which is present in the base mft record specified by the vfs + * inode @base_vi. + * + * If the index inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_index_inode() is called to read + * the index related attributes and fill in the inode structure. + * + * Return the struct inode of the index inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = base_vi->i_ino; + na.type = AT_INDEX_ALLOCATION; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_index_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad index inodes around. This also + * simplifies things in that we never need to check for bad index + * inodes elsewhere. + */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +struct inode *ntfs_alloc_big_inode(struct super_block *sb) +{ + ntfs_inode *ni; + + ntfs_debug("Entering."); + ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS); + if (likely(ni != NULL)) { + ni->state = 0; + return VFS_I(ni); + } + ntfs_error(sb, "Allocation of NTFS big inode structure failed."); + return NULL; +} + +static void ntfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); +} + +void ntfs_destroy_big_inode(struct inode *inode) +{ + ntfs_inode *ni = NTFS_I(inode); + + ntfs_debug("Entering."); + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); + call_rcu(&inode->i_rcu, ntfs_i_callback); +} + +static inline ntfs_inode *ntfs_alloc_extent_inode(void) +{ + ntfs_inode *ni; + + ntfs_debug("Entering."); + ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS); + if (likely(ni != NULL)) { + ni->state = 0; + return ni; + } + ntfs_error(NULL, "Allocation of NTFS inode structure failed."); + return NULL; +} + +static void ntfs_destroy_extent_inode(ntfs_inode *ni) +{ + ntfs_debug("Entering."); + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); + kmem_cache_free(ntfs_inode_cache, ni); +} + +/* + * The attribute runlist lock has separate locking rules from the + * normal runlist lock, so split the two lock-classes: + */ +static struct lock_class_key attr_list_rl_lock_class; + +/** + * __ntfs_init_inode - initialize ntfs specific part of an inode + * @sb: super block of mounted volume + * @ni: freshly allocated ntfs inode which to initialize + * + * Initialize an ntfs inode to defaults. + * + * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left + * untouched. Make sure to initialize them elsewhere. + * + * Return zero on success and -ENOMEM on error. + */ +void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) +{ + ntfs_debug("Entering."); + rwlock_init(&ni->size_lock); + ni->initialized_size = ni->allocated_size = 0; + ni->seq_no = 0; + atomic_set(&ni->count, 1); + ni->vol = NTFS_SB(sb); + ntfs_init_runlist(&ni->runlist); + mutex_init(&ni->mrec_lock); + ni->page = NULL; + ni->page_ofs = 0; + ni->attr_list_size = 0; + ni->attr_list = NULL; + ntfs_init_runlist(&ni->attr_list_rl); + lockdep_set_class(&ni->attr_list_rl.lock, + &attr_list_rl_lock_class); + ni->itype.index.block_size = 0; + ni->itype.index.vcn_size = 0; + ni->itype.index.collation_rule = 0; + ni->itype.index.block_size_bits = 0; + ni->itype.index.vcn_size_bits = 0; + mutex_init(&ni->extent_lock); + ni->nr_extents = 0; + ni->ext.base_ntfs_ino = NULL; +} + +/* + * Extent inodes get MFT-mapped in a nested way, while the base inode + * is still mapped. Teach this nesting to the lock validator by creating + * a separate class for nested inode's mrec_lock's: + */ +static struct lock_class_key extent_inode_mrec_lock_key; + +inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no) +{ + ntfs_inode *ni = ntfs_alloc_extent_inode(); + + ntfs_debug("Entering."); + if (likely(ni != NULL)) { + __ntfs_init_inode(sb, ni); + lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key); + ni->mft_no = mft_no; + ni->type = AT_UNUSED; + ni->name = NULL; + ni->name_len = 0; + } + return ni; +} + +/** + * ntfs_is_extended_system_file - check if a file is in the $Extend directory + * @ctx: initialized attribute search context + * + * Search all file name attributes in the inode described by the attribute + * search context @ctx and check if any of the names are in the $Extend system + * directory. + * + * Return values: + * 1: file is in $Extend directory + * 0: file is not in $Extend directory + * -errno: failed to determine if the file is in the $Extend directory + */ +static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx) +{ + int nr_links, err; + + /* Restart search. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Get number of hard links. */ + nr_links = le16_to_cpu(ctx->mrec->link_count); + + /* Loop through all hard links. */ + while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, + ctx))) { + FILE_NAME_ATTR *file_name_attr; + ATTR_RECORD *attr = ctx->attr; + u8 *p, *p2; + + nr_links--; + /* + * Maximum sanity checking as we are called on an inode that + * we suspect might be corrupt. + */ + p = (u8*)attr + le32_to_cpu(attr->length); + if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_in_use)) { +err_corrupt_attr: + ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name " + "attribute. You should run chkdsk."); + return -EIO; + } + if (attr->non_resident) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file " + "name. You should run chkdsk."); + return -EIO; + } + if (attr->flags) { + ntfs_error(ctx->ntfs_ino->vol->sb, "File name with " + "invalid flags. You should run " + "chkdsk."); + return -EIO; + } + if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file " + "name. You should run chkdsk."); + return -EIO; + } + file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + + le16_to_cpu(attr->data.resident.value_offset)); + p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length); + if (p2 < (u8*)attr || p2 > p) + goto err_corrupt_attr; + /* This attribute is ok, but is it in the $Extend directory? */ + if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend) + return 1; /* YES, it's an extended system file. */ + } + if (unlikely(err != -ENOENT)) + return err; + if (unlikely(nr_links)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count " + "doesn't match number of name attributes. You " + "should run chkdsk."); + return -EIO; + } + return 0; /* NO, it is not an extended system file. */ +} + +/** + * ntfs_read_locked_inode - read an inode from its device + * @vi: inode to read + * + * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode + * described by @vi into memory from the device. + * + * The only fields in @vi that we need to/can look at when the function is + * called are i_sb, pointing to the mounted device's super block, and i_ino, + * the number of the inode to load. + * + * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino + * for reading and sets up the necessary @vi fields as well as initializing + * the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * i_flags is set to 0 and we have no business touching it. Only an ioctl() + * is allowed to write to them. We should of course be honouring them but + * we need to do that using the IS_* macros defined in include/linux/fs.h. + * In any case ntfs_read_locked_inode() has nothing to do with i_flags. + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_inode(struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + STANDARD_INFORMATION *si; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + /* Setup the generic vfs inode parts now. */ + + /* + * This is for checking whether an inode has changed w.r.t. a file so + * that the file can be updated if necessary (compare with f_version). + */ + vi->i_version = 1; + + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + vi->i_mode = 0; + + /* + * Initialize the ntfs specific part of @vi special casing + * FILE_MFT which we need to do at mount time. + */ + if (vi->i_ino != FILE_MFT) + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + + if (!(m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vi->i_sb, "Inode is not in use!"); + goto unm_err_out; + } + if (m->base_mft_record) { + ntfs_error(vi->i_sb, "Inode is an extent inode!"); + goto unm_err_out; + } + + /* Transfer information from mft record into vfs and ntfs inodes. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* + * FIXME: Keep in mind that link_count is two for files which have both + * a long file name and a short file name as separate entries, so if + * we are hiding short file names this will be too high. Either we need + * to account for the short file names by subtracting them or we need + * to make sure we delete files even though i_nlink is not zero which + * might be tricky due to vfs interactions. Need to think about this + * some more when implementing the unlink command. + */ + set_nlink(vi, le16_to_cpu(m->link_count)); + /* + * FIXME: Reparse points can have the directory bit set even though + * they would be S_IFLNK. Need to deal with this further below when we + * implement reparse points / symbolic links but it will do for now. + * Also if not a directory, it could be something else, rather than + * a regular file. But again, will do for now. + */ + /* Everyone gets all permissions. */ + vi->i_mode |= S_IRWXUGO; + /* If read-only, no one gets write permissions. */ + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + if (m->flags & MFT_RECORD_IS_DIRECTORY) { + vi->i_mode |= S_IFDIR; + /* + * Apply the directory permissions mask set in the mount + * options. + */ + vi->i_mode &= ~vol->dmask; + /* Things break without this kludge! */ + if (vi->i_nlink > 1) + set_nlink(vi, 1); + } else { + vi->i_mode |= S_IFREG; + /* Apply the file permissions mask set in the mount options. */ + vi->i_mode &= ~vol->fmask; + } + /* + * Find the standard information attribute in the mft record. At this + * stage we haven't setup the attribute list stuff yet, so this could + * in fact fail if the standard information is in an extent record, but + * I don't think this actually ever happens. + */ + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + /* + * TODO: We should be performing a hot fix here (if the + * recover mount option is set) by creating a new + * attribute. + */ + ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Get the standard information attribute value. */ + si = (STANDARD_INFORMATION*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + + /* Transfer information from the standard information into vi. */ + /* + * Note: The i_?times do not quite map perfectly onto the NTFS times, + * but they are close enough, and in the end it doesn't really matter + * that much... + */ + /* + * mtime is the last change of the data within the file. Not changed + * when only metadata is changed, e.g. a rename doesn't affect mtime. + */ + vi->i_mtime = ntfs2utc(si->last_data_change_time); + /* + * ctime is the last change of the metadata of the file. This obviously + * always changes, when mtime is changed. ctime can be changed on its + * own, mtime is then not changed, e.g. when a file is renamed. + */ + vi->i_ctime = ntfs2utc(si->last_mft_change_time); + /* + * Last access to the data within the file. Not changed during a rename + * for example but changed whenever the file is written to. + */ + vi->i_atime = ntfs2utc(si->last_access_time); + + /* Find the attribute list attribute if present. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(vi->i_sb, "Failed to lookup attribute list " + "attribute."); + goto unm_err_out; + } + } else /* if (!err) */ { + if (vi->i_ino == FILE_MFT) + goto skip_attr_list_load; + ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Attribute list attribute is " + "compressed."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(vi->i_sb, "Non-resident attribute " + "list attribute is encrypted/" + "sparse."); + goto unm_err_out; + } + ntfs_warning(vi->i_sb, "Resident attribute list " + "attribute in inode 0x%lx is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set.", + vi->i_ino); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(vi->i_sb, "Not enough memory to allocate " + "buffer for attribute list."); + err = -ENOMEM; + goto unm_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "Attribute list has non " + "zero lowest_vcn."); + goto unm_err_out; + } + /* + * Setup the runlist. No need for locking as we have + * exclusive access to the inode at this time. + */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(vi->i_sb, "Mapping pairs " + "decompression failed."); + goto unm_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data.non_resident. + initialized_size)))) { + ntfs_error(vi->i_sb, "Failed to load " + "attribute list attribute."); + goto unm_err_out; + } + } else /* if (!a->non_resident) */ { + if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt attribute list " + "in inode."); + goto unm_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + } +skip_attr_list_load: + /* + * If an attribute list is present we now have the attribute list value + * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. + */ + if (S_ISDIR(vi->i_mode)) { + loff_t bvi_size; + ntfs_inode *bni; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + + /* It is a directory, find index root attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + // FIXME: File is corrupt! Hot-fix with empty + // index root attribute if recovery option is + // set. + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " + "resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " + "placed after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted index root just means that the newly + * created files in that directory should be created compressed/ + * encrypted. However index root cannot be both compressed and + * encrypted. + */ + if (a->flags & ATTR_COMPRESSION_MASK) + NInoSetCompressed(ni); + if (a->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + ir = (INDEX_ROOT*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Directory index is corrupt."); + goto unm_err_out; + } + if (ir->type != AT_FILE_NAME) { + ntfs_error(vi->i_sb, "Indexed attribute is not " + "$FILE_NAME."); + goto unm_err_out; + } + if (ir->collation_rule != COLLATION_FILE_NAME) { + ntfs_error(vi->i_sb, "Index collation rule is not " + "COLLATION_FILE_NAME."); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (ni->itype.index.block_size & + (ni->itype.index.block_size - 1)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a " + "power of two.", + ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_CACHE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > " + "PAGE_CACHE_SIZE (%ld) is not " + "supported. Sorry.", + ni->itype.index.block_size, + PAGE_CACHE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < " + "NTFS_BLOCK_SIZE (%i) is not " + "supported. Sorry.", + ni->itype.index.block_size, + NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = + ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the directory index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + + /* Setup the index allocation attribute, even if not present. */ + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + goto skip_large_dir_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION " + "attribute is not present but " + "$INDEX_ROOT indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION " + "attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " + "is placed after the mapping pairs " + "array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of " + "$INDEX_ALLOCATION attribute has non " + "zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " + "and/or encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> + ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " + "for index allocation (0x%llx).", + bvi_size << 3, vi->i_size); + goto iput_unm_err_out; + } + /* No longer need the bitmap attribute inode. */ + iput(bvi); +skip_large_dir_stuff: + /* Setup the operations for this inode. */ + vi->i_op = &ntfs_dir_inode_ops; + vi->i_fop = &ntfs_dir_ops; + vi->i_mapping->a_ops = &ntfs_mst_aops; + } else { + /* It is a file. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Setup the data attribute, even if not present. */ + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + + /* Find first extent of the unnamed data attribute. */ + err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx); + if (unlikely(err)) { + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + if (err != -ENOENT) { + ntfs_error(vi->i_sb, "Failed to lookup $DATA " + "attribute."); + goto unm_err_out; + } + /* + * FILE_Secure does not have an unnamed $DATA + * attribute, so we special case it here. + */ + if (vi->i_ino == FILE_Secure) + goto no_data_attr_special_case; + /* + * Most if not all the system files in the $Extend + * system directory do not have unnamed data + * attributes so we need to check if the parent + * directory of the file is FILE_Extend and if it is + * ignore this error. To do this we need to get the + * name of this inode from the mft record as the name + * contains the back reference to the parent directory. + */ + if (ntfs_is_extended_system_file(ctx) > 0) + goto no_data_attr_special_case; + // FIXME: File is corrupt! Hot-fix with empty data + // attribute if recovery option is set. + ntfs_error(vi->i_sb, "$DATA attribute is missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Setup the state. */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found " + "compressed data but " + "compression is " + "disabled due to " + "cluster size (%i) > " + "4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) + != ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method " + "or corrupt file."); + goto unm_err_out; + } + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed data."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->non_resident) { + NInoSetNonResident(ni); + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found " + "non-standard " + "compression unit (%u " + "instead of 4). " + "Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype. + compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = + 1U << a->data. + non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = + 0; + ni->itype.compressed.block_clusters = + 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident. + compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } else { /* Resident attribute. */ + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu( + a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident data attribute " + "is corrupt (size exceeds " + "allocation)."); + goto unm_err_out; + } + } +no_data_attr_special_case: + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Setup the operations for this inode. */ + vi->i_op = &ntfs_file_inode_ops; + vi->i_fop = &ntfs_file_ops; + vi->i_mapping->a_ops = &ntfs_normal_aops; + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else if (NInoCompressed(ni)) + vi->i_mapping->a_ops = &ntfs_compressed_aops; + } + /* + * The number of 512-byte blocks used on disk (for stat). This is in so + * far inaccurate as it doesn't account for any named streams or other + * special non-resident attributes, but that is how Windows works, too, + * so we are at least consistent with Windows, if not entirely + * consistent with the Linux Way. Doing it the Linux Way would cause a + * significant slowdown as it would involve iterating over all + * attributes in the mft record and adding the allocated/compressed + * sizes of all non-resident attributes present to give us the Linux + * correct size that should go into i_blocks (after division by 512). + */ + if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni))) + vi->i_blocks = ni->itype.compressed.size >> 9; + else + vi->i_blocks = ni->allocated_size >> 9; + ntfs_debug("Done."); + return 0; +iput_unm_err_out: + iput(bvi); +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt " + "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino); + make_bad_inode(vi); + if (err != -EOPNOTSUPP && err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_read_locked_attr_inode - read an attribute inode from its base inode + * @base_vi: base inode + * @vi: attribute inode to read + * + * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the + * attribute inode described by @vi into memory from the base mft record + * described by @base_ni. + * + * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for + * reading and looks up the attribute described by @vi before setting up the + * necessary fields in @vi as well as initializing the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + * + * Note this cannot be called for AT_INDEX_ALLOCATION. + */ +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + + /* Just mirror the values from the base inode. */ + vi->i_version = base_vi->i_version; + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + set_nlink(vi, base_vi->i_nlink); + vi->i_mtime = base_vi->i_mtime; + vi->i_ctime = base_vi->i_ctime; + vi->i_atime = base_vi->i_atime; + vi->i_generation = ni->seq_no = base_ni->seq_no; + + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the attribute. */ + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto unm_err_out; + a = ctx->attr; + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if ((ni->type != AT_DATA) || (ni->type == AT_DATA && + ni->name_len)) { + ntfs_error(vi->i_sb, "Found compressed " + "non-data or named data " + "attribute. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto unm_err_out; + } + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found compressed " + "attribute but compression is " + "disabled due to cluster size " + "(%i) > 4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) != + ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method."); + goto unm_err_out; + } + } + /* + * The compressed/sparse flag set in an index root just means + * to compress all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is %s. Please " + "report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net", + NInoCompressed(ni) ? "compressed" : + "sparse"); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and compressed " + "data."); + goto unm_err_out; + } + /* + * The encryption flag set in an index root just means to + * encrypt all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is encrypted. " + "Please report you saw this message " + "to linux-ntfs-dev@lists.sourceforge." + "net"); + goto unm_err_out; + } + if (ni->type != AT_DATA) { + ntfs_error(vi->i_sb, "Found encrypted non-data " + "attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (!a->non_resident) { + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the attribute value."); + goto unm_err_out; + } + if (NInoMstProtected(ni)) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is resident. " + "Please report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net"); + goto unm_err_out; + } + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident attribute is corrupt " + "(size exceeds allocation)."); + goto unm_err_out; + } + } else { + NInoSetNonResident(ni); + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the mapping pairs array."); + goto unm_err_out; + } + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found non-standard " + "compression unit (%u instead " + "of 4). Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident.compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of attribute has " + "non-zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } + vi->i_mapping->a_ops = &ntfs_normal_aops; + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else if (NInoCompressed(ni)) + vi->i_mapping->a_ops = &ntfs_compressed_aops; + if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) + vi->i_blocks = ni->itype.compressed.size >> 9; + else + vi->i_blocks = ni->allocated_size >> 9; + /* + * Make sure the base inode does not go away and attach it to the + * attribute inode. + */ + igrab(base_vi); + ni->ext.base_ntfs_ino = base_ni; + ni->nr_extents = -1; + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + + ntfs_debug("Done."); + return 0; + +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i while reading attribute " + "inode (mft_no 0x%lx, type 0x%x, name_len %i). " + "Marking corrupt inode and base inode 0x%lx as bad. " + "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len, + base_vi->i_ino); + make_bad_inode(vi); + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_read_locked_index_inode - read an index inode from its base inode + * @base_vi: base inode + * @vi: index inode to read + * + * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the + * index inode described by @vi into memory from the base mft record described + * by @base_ni. + * + * ntfs_read_locked_index_inode() maps, pins and locks the base inode for + * reading and looks up the attributes relating to the index described by @vi + * before setting up the necessary fields in @vi as well as initializing the + * ntfs inode. + * + * Note, index inodes are essentially attribute inodes (NInoAttr() is true) + * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they + * are setup like directory inodes since directories are a special case of + * indices ao they need to be treated in much the same way. Most importantly, + * for small indices the index allocation attribute might not actually exist. + * However, the index root attribute always exists but this does not need to + * have an inode associated with it and this is why we define a new inode type + * index. Also, like for directories, we need to have an attribute inode for + * the bitmap attribute corresponding to the index allocation attribute and we + * can store this in the appropriate field of the inode, just like we do for + * normal directory inodes. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) +{ + loff_t bvi_size; + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni, *bni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + /* Just mirror the values from the base inode. */ + vi->i_version = base_vi->i_version; + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + set_nlink(vi, base_vi->i_nlink); + vi->i_mtime = base_vi->i_mtime; + vi->i_ctime = base_vi->i_ctime; + vi->i_atime = base_vi->i_atime; + vi->i_generation = ni->seq_no = base_ni->seq_no; + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + /* Map the mft record for the base inode. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the index root attribute. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " + "after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted/sparse index root is not allowed, except for + * directories of course but those are not dealt with here. + */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | + ATTR_IS_SPARSE)) { + ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " + "root attribute."); + goto unm_err_out; + } + ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Index is corrupt."); + goto unm_err_out; + } + if (ir->type) { + ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).", + le32_to_cpu(ir->type)); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ntfs_debug("Index collation rule is 0x%x.", + le32_to_cpu(ir->collation_rule)); + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (!is_power_of_2(ni->itype.index.block_size)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " + "two.", ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_CACHE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE " + "(%ld) is not supported. Sorry.", + ni->itype.index.block_size, PAGE_CACHE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE " + "(%i) is not supported. Sorry.", + ni->itype.index.block_size, NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + /* Check for presence of index allocation attribute. */ + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + goto skip_large_index_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "not present but $INDEX_ROOT " + "indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " + "placed after the mapping pairs array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " + "attribute has non zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or " + "encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for " + "index allocation (0x%llx).", bvi_size << 3, + vi->i_size); + goto iput_unm_err_out; + } + iput(bvi); +skip_large_index_stuff: + /* Setup the operations for this index inode. */ + vi->i_mapping->a_ops = &ntfs_mst_aops; + vi->i_blocks = ni->allocated_size >> 9; + /* + * Make sure the base inode doesn't go away and attach it to the + * index inode. + */ + igrab(base_vi); + ni->ext.base_ntfs_ino = base_ni; + ni->nr_extents = -1; + + ntfs_debug("Done."); + return 0; +iput_unm_err_out: + iput(bvi); +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); +err_out: + ntfs_error(vi->i_sb, "Failed with error code %i while reading index " + "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino, + ni->name_len); + make_bad_inode(vi); + if (err != -EOPNOTSUPP && err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/* + * The MFT inode has special locking, so teach the lock validator + * about this by splitting off the locking rules of the MFT from + * the locking rules of other inodes. The MFT inode can never be + * accessed from the VFS side (or even internally), only by the + * map_mft functions. + */ +static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key; + +/** + * ntfs_read_inode_mount - special read_inode for mount time use only + * @vi: inode to read + * + * Read inode FILE_MFT at mount time, only called with super_block lock + * held from within the read_super() code path. + * + * This function exists because when it is called the page cache for $MFT/$DATA + * is not initialized and hence we cannot get at the contents of mft records + * by calling map_mft_record*(). + * + * Further it needs to cope with the circular references problem, i.e. cannot + * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because + * we do not know where the other extent mft records are yet and again, because + * we cannot call map_mft_record*() yet. Obviously this applies only when an + * attribute list is actually present in $MFT inode. + * + * We solve these problems by starting with the $DATA attribute before anything + * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each + * extent is found, we ntfs_mapping_pairs_decompress() including the implied + * ntfs_runlists_merge(). Each step of the iteration necessarily provides + * sufficient information for the next step to complete. + * + * This should work but there are two possible pit falls (see inline comments + * below), but only time will tell if they are real pits or just smoke... + */ +int ntfs_read_inode_mount(struct inode *vi) +{ + VCN next_vcn, last_vcn, highest_vcn; + s64 block; + struct super_block *sb = vi->i_sb; + ntfs_volume *vol = NTFS_SB(sb); + struct buffer_head *bh; + ntfs_inode *ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + unsigned int i, nr_blocks; + int err; + + ntfs_debug("Entering."); + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + + /* Setup the data attribute. It is special as it is mst protected. */ + NInoSetNonResident(ni); + NInoSetMstProtected(ni); + NInoSetSparseDisabled(ni); + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + /* + * This sets up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + ni->itype.index.block_size = vol->mft_record_size; + ni->itype.index.block_size_bits = vol->mft_record_size_bits; + + /* Very important! Needed to be able to call map_mft_record*(). */ + vol->mft_ino = vi; + + /* Allocate enough memory to read the first mft record. */ + if (vol->mft_record_size > 64 * 1024) { + ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).", + vol->mft_record_size); + goto err_out; + } + i = vol->mft_record_size; + if (i < sb->s_blocksize) + i = sb->s_blocksize; + m = (MFT_RECORD*)ntfs_malloc_nofs(i); + if (!m) { + ntfs_error(sb, "Failed to allocate buffer for $MFT record 0."); + goto err_out; + } + + /* Determine the first block of the $MFT/$DATA attribute. */ + block = vol->mft_lcn << vol->cluster_size_bits >> + sb->s_blocksize_bits; + nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits; + if (!nr_blocks) + nr_blocks = 1; + + /* Load $MFT/$DATA's first mft record. */ + for (i = 0; i < nr_blocks; i++) { + bh = sb_bread(sb, block++); + if (!bh) { + ntfs_error(sb, "Device read failed."); + goto err_out; + } + memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data, + sb->s_blocksize); + brelse(bh); + } + + /* Apply the mst fixups. */ + if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { + /* FIXME: Try to use the $MFTMirr now. */ + ntfs_error(sb, "MST fixup failed. $MFT is corrupt."); + goto err_out; + } + + /* Need this to sanity check attribute list references to $MFT. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* Provides readpage() and sync_page() for map_mft_record(). */ + vi->i_mapping->a_ops = &ntfs_mst_aops; + + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + + /* Find the attribute list attribute if present. */ + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(sb, "Failed to lookup attribute list " + "attribute. You should run chkdsk."); + goto put_err_out; + } + } else /* if (!err) */ { + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_end; + static const char *es = " Not allowed. $MFT is corrupt. " + "You should run chkdsk."; + + ntfs_debug("Attribute list attribute found in $MFT."); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(sb, "Attribute list attribute is " + "compressed.%s", es); + goto put_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(sb, "Non-resident attribute list " + "attribute is encrypted/" + "sparse.%s", es); + goto put_err_out; + } + ntfs_warning(sb, "Resident attribute list attribute " + "in $MFT system file is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set."); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(sb, "Not enough memory to allocate buffer " + "for attribute list."); + goto put_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "Attribute list has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Setup the runlist. */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(sb, "Mapping pairs decompression " + "failed with error code %i.", + -err); + goto put_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data. + non_resident.initialized_size)))) { + ntfs_error(sb, "Failed to load attribute list " + "attribute with error code %i.", + -err); + goto put_err_out; + } + } else /* if (!ctx.attr->non_resident) */ { + if ((u8*)a + le16_to_cpu( + a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(sb, "Corrupt attribute list " + "attribute."); + goto put_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + /* The attribute list is now setup in memory. */ + /* + * FIXME: I don't know if this case is actually possible. + * According to logic it is not possible but I have seen too + * many weird things in MS software to rely on logic... Thus we + * perform a manual search and make sure the first $MFT/$DATA + * extent is in the base inode. If it is not we abort with an + * error and if we ever see a report of this error we will need + * to do some magic in order to have the necessary mft record + * loaded and in the right place in the page cache. But + * hopefully logic will prevail and this never happens... + */ + al_entry = (ATTR_LIST_ENTRY*)ni->attr_list; + al_end = (u8*)al_entry + ni->attr_list_size; + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < ni->attr_list || + (u8*)al_entry > al_end) + goto em_put_err_out; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto em_put_err_out; + if (!al_entry->length) + goto em_put_err_out; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + goto em_put_err_out; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) + goto em_put_err_out; + if (AT_DATA != al_entry->type) + continue; + /* We want an unnamed attribute. */ + if (al_entry->name_length) + goto em_put_err_out; + /* Want the first entry, i.e. lowest_vcn == 0. */ + if (al_entry->lowest_vcn) + goto em_put_err_out; + /* First entry has to be in the base mft record. */ + if (MREF_LE(al_entry->mft_reference) != vi->i_ino) { + /* MFT references do not match, logic fails. */ + ntfs_error(sb, "BUG: The first $DATA extent " + "of $MFT is not in the base " + "mft record. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto put_err_out; + } else { + /* Sequence numbers must match. */ + if (MSEQNO_LE(al_entry->mft_reference) != + ni->seq_no) + goto em_put_err_out; + /* Got it. All is ok. We can stop now. */ + break; + } + } + } + + ntfs_attr_reinit_search_ctx(ctx); + + /* Now load all attribute extents. */ + a = NULL; + next_vcn = last_vcn = highest_vcn = 0; + while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, + ctx))) { + runlist_element *nrl; + + /* Cache the current attribute. */ + a = ctx->attr; + /* $MFT must be non-resident. */ + if (!a->non_resident) { + ntfs_error(sb, "$MFT must be non-resident but a " + "resident extent was found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + /* $MFT must be uncompressed and unencrypted. */ + if (a->flags & ATTR_COMPRESSION_MASK || + a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + ntfs_error(sb, "$MFT must be uncompressed, " + "non-sparse, and unencrypted but a " + "compressed/sparse/encrypted extent " + "was found. $MFT is corrupt. Run " + "chkdsk."); + goto put_err_out; + } + /* + * Decompress the mapping pairs array of this extent and merge + * the result into the existing runlist. No need for locking + * as we have exclusive access to the inode at this time and we + * are a mount in progress task, too. + */ + nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(nrl)) { + ntfs_error(sb, "ntfs_mapping_pairs_decompress() " + "failed with error code %ld. $MFT is " + "corrupt.", PTR_ERR(nrl)); + goto put_err_out; + } + ni->runlist.rl = nrl; + + /* Are we in the first extent? */ + if (!next_vcn) { + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Get the last vcn in the $DATA attribute. */ + last_vcn = sle64_to_cpu( + a->data.non_resident.allocated_size) + >> vol->cluster_size_bits; + /* Fill in the inode size. */ + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * Verify the number of mft records does not exceed + * 2^32 - 1. + */ + if ((vi->i_size >> vol->mft_record_size_bits) >= + (1ULL << 32)) { + ntfs_error(sb, "$MFT is too big! Aborting."); + goto put_err_out; + } + /* + * We have got the first extent of the runlist for + * $MFT which means it is now relatively safe to call + * the normal ntfs_read_inode() function. + * Complete reading the inode, this will actually + * re-read the mft record for $MFT, this time entering + * it into the page cache with which we complete the + * kick start of the volume. It should be safe to do + * this now as the first extent of $MFT/$DATA is + * already known and we would hope that we don't need + * further extents in order to find the other + * attributes belonging to $MFT. Only time will tell if + * this is really the case. If not we will have to play + * magic at this point, possibly duplicating a lot of + * ntfs_read_inode() at this point. We will need to + * ensure we do enough of its work to be able to call + * ntfs_read_inode() on extents of $MFT/$DATA. But lets + * hope this never happens... + */ + ntfs_read_locked_inode(vi); + if (is_bad_inode(vi)) { + ntfs_error(sb, "ntfs_read_inode() of $MFT " + "failed. BUG or corrupt $MFT. " + "Run chkdsk and if no errors " + "are found, please report you " + "saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + ntfs_attr_put_search_ctx(ctx); + /* Revert to the safe super operations. */ + ntfs_free(m); + return -1; + } + /* + * Re-initialize some specifics about $MFT's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + vi->i_uid = GLOBAL_ROOT_UID; + vi->i_gid = GLOBAL_ROOT_GID; + /* Regular file. No access for anyone. */ + vi->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFT. */ + vi->i_op = &ntfs_empty_inode_ops; + vi->i_fop = &ntfs_empty_file_ops; + } + + /* Get the lowest vcn for the next extent. */ + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + next_vcn = highest_vcn + 1; + + /* Only one extent or error, which we catch below. */ + if (next_vcn <= 0) + break; + + /* Avoid endless loops due to corruption. */ + if (next_vcn < sle64_to_cpu( + a->data.non_resident.lowest_vcn)) { + ntfs_error(sb, "$MFT has corrupt attribute list " + "attribute. Run chkdsk."); + goto put_err_out; + } + } + if (err != -ENOENT) { + ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. " + "$MFT is corrupt. Run chkdsk."); + goto put_err_out; + } + if (!a) { + ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + if (highest_vcn && highest_vcn != last_vcn - 1) { + ntfs_error(sb, "Failed to load the complete runlist for " + "$MFT/$DATA. Driver bug or corrupt $MFT. " + "Run chkdsk."); + ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx", + (unsigned long long)highest_vcn, + (unsigned long long)last_vcn - 1); + goto put_err_out; + } + ntfs_attr_put_search_ctx(ctx); + ntfs_debug("Done."); + ntfs_free(m); + + /* + * Split the locking rules of the MFT inode from the + * locking rules of other inodes: + */ + lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key); + lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key); + + return 0; + +em_put_err_out: + ntfs_error(sb, "Couldn't find first extent of $DATA attribute in " + "attribute list. $MFT is corrupt. Run chkdsk."); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +err_out: + ntfs_error(sb, "Failed. Marking inode as bad."); + make_bad_inode(vi); + ntfs_free(m); + return -1; +} + +static void __ntfs_clear_inode(ntfs_inode *ni) +{ + /* Free all alocated memory. */ + down_write(&ni->runlist.lock); + if (ni->runlist.rl) { + ntfs_free(ni->runlist.rl); + ni->runlist.rl = NULL; + } + up_write(&ni->runlist.lock); + + if (ni->attr_list) { + ntfs_free(ni->attr_list); + ni->attr_list = NULL; + } + + down_write(&ni->attr_list_rl.lock); + if (ni->attr_list_rl.rl) { + ntfs_free(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + } + up_write(&ni->attr_list_rl.lock); + + if (ni->name_len && ni->name != I30) { + /* Catch bugs... */ + BUG_ON(!ni->name); + kfree(ni->name); + } +} + +void ntfs_clear_extent_inode(ntfs_inode *ni) +{ + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + +#ifdef NTFS_RW + if (NInoDirty(ni)) { + if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino))) + ntfs_error(ni->vol->sb, "Clearing dirty extent inode! " + "Losing data! This is a BUG!!!"); + // FIXME: Do something!!! + } +#endif /* NTFS_RW */ + + __ntfs_clear_inode(ni); + + /* Bye, bye... */ + ntfs_destroy_extent_inode(ni); +} + +/** + * ntfs_evict_big_inode - clean up the ntfs specific part of an inode + * @vi: vfs inode pending annihilation + * + * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode() + * is called, which deallocates all memory belonging to the NTFS specific part + * of the inode and returns. + * + * If the MFT record is dirty, we commit it before doing anything else. + */ +void ntfs_evict_big_inode(struct inode *vi) +{ + ntfs_inode *ni = NTFS_I(vi); + + truncate_inode_pages_final(&vi->i_data); + clear_inode(vi); + +#ifdef NTFS_RW + if (NInoDirty(ni)) { + bool was_bad = (is_bad_inode(vi)); + + /* Committing the inode also commits all extent inodes. */ + ntfs_commit_inode(vi); + + if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) { + ntfs_error(vi->i_sb, "Failed to commit dirty inode " + "0x%lx. Losing data!", vi->i_ino); + // FIXME: Do something!!! + } + } +#endif /* NTFS_RW */ + + /* No need to lock at this stage as no one else has a reference. */ + if (ni->nr_extents > 0) { + int i; + + for (i = 0; i < ni->nr_extents; i++) + ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]); + kfree(ni->ext.extent_ntfs_inos); + } + + __ntfs_clear_inode(ni); + + if (NInoAttr(ni)) { + /* Release the base inode if we are holding it. */ + if (ni->nr_extents == -1) { + iput(VFS_I(ni->ext.base_ntfs_ino)); + ni->nr_extents = 0; + ni->ext.base_ntfs_ino = NULL; + } + } + return; +} + +/** + * ntfs_show_options - show mount options in /proc/mounts + * @sf: seq_file in which to write our mount options + * @root: root of the mounted tree whose mount options to display + * + * Called by the VFS once for each mounted ntfs volume when someone reads + * /proc/mounts in order to display the NTFS specific mount options of each + * mount. The mount options of fs specified by @root are written to the seq file + * @sf and success is returned. + */ +int ntfs_show_options(struct seq_file *sf, struct dentry *root) +{ + ntfs_volume *vol = NTFS_SB(root->d_sb); + int i; + + seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid)); + seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid)); + if (vol->fmask == vol->dmask) + seq_printf(sf, ",umask=0%o", vol->fmask); + else { + seq_printf(sf, ",fmask=0%o", vol->fmask); + seq_printf(sf, ",dmask=0%o", vol->dmask); + } + seq_printf(sf, ",nls=%s", vol->nls_map->charset); + if (NVolCaseSensitive(vol)) + seq_printf(sf, ",case_sensitive"); + if (NVolShowSystemFiles(vol)) + seq_printf(sf, ",show_sys_files"); + if (!NVolSparseEnabled(vol)) + seq_printf(sf, ",disable_sparse"); + for (i = 0; on_errors_arr[i].val; i++) { + if (on_errors_arr[i].val & vol->on_errors) + seq_printf(sf, ",errors=%s", on_errors_arr[i].str); + } + seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier); + return 0; +} + +#ifdef NTFS_RW + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_truncate - called when the i_size of an ntfs inode is changed + * @vi: inode for which the i_size was changed + * + * We only support i_size changes for normal files at present, i.e. not + * compressed and not encrypted. This is enforced in ntfs_setattr(), see + * below. + * + * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and + * that the change is allowed. + * + * This implies for us that @vi is a file inode rather than a directory, index, + * or attribute inode as well as that @vi is a base inode. + * + * Returns 0 on success or -errno on error. + * + * Called with ->i_mutex held. + */ +int ntfs_truncate(struct inode *vi) +{ + s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size; + VCN highest_vcn; + unsigned long flags; + ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + const char *te = " Leaving file length out of sync with i_size."; + int err, mp_size, size_change, alloc_change; + u32 attr_len; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(NInoAttr(ni)); + BUG_ON(S_ISDIR(vi->i_mode)); + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->nr_extents < 0); +retry_truncate: + /* + * Lock the runlist for writing and map the mft record to ensure it is + * safe to mess with the attribute runlist and sizes. + */ + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " + "(error code %d).%s", vi->i_ino, err, te); + ctx = NULL; + m = NULL; + goto old_bad_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vi->i_sb, "Failed to allocate a search context for " + "inode 0x%lx (not enough memory).%s", + vi->i_ino, te); + err = -ENOMEM; + goto old_bad_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(vi->i_sb, "Open attribute is missing from " + "mft record. Inode 0x%lx is corrupt. " + "Run chkdsk.%s", vi->i_ino, te); + err = -EIO; + } else + ntfs_error(vi->i_sb, "Failed to lookup attribute in " + "inode 0x%lx (error code %d).%s", + vi->i_ino, err, te); + goto old_bad_out; + } + m = ctx->mrec; + a = ctx->attr; + /* + * The i_size of the vfs inode is the new size for the attribute value. + */ + new_size = i_size_read(vi); + /* The current size of the attribute value is the old size. */ + old_size = ntfs_attr_size(a); + /* Calculate the new allocated size. */ + if (NInoNonResident(ni)) + new_alloc_size = (new_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + else + new_alloc_size = (new_size + 7) & ~7; + /* The current allocated size is the old allocated size. */ + read_lock_irqsave(&ni->size_lock, flags); + old_alloc_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * The change in the file size. This will be 0 if no change, >0 if the + * size is growing, and <0 if the size is shrinking. + */ + size_change = -1; + if (new_size - old_size >= 0) { + size_change = 1; + if (new_size == old_size) + size_change = 0; + } + /* As above for the allocated size. */ + alloc_change = -1; + if (new_alloc_size - old_alloc_size >= 0) { + alloc_change = 1; + if (new_alloc_size == old_alloc_size) + alloc_change = 0; + } + /* + * If neither the size nor the allocation are being changed there is + * nothing to do. + */ + if (!size_change && !alloc_change) + goto unm_done; + /* If the size is changing, check if new size is allowed in $AttrDef. */ + if (size_change) { + err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); + if (unlikely(err)) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Truncate would cause the " + "inode 0x%lx to %simum size " + "for its attribute type " + "(0x%x). Aborting truncate.", + vi->i_ino, + new_size > old_size ? "exceed " + "the max" : "go under the min", + le32_to_cpu(ni->type)); + err = -EFBIG; + } else { + ntfs_error(vol->sb, "Inode 0x%lx has unknown " + "attribute type 0x%x. " + "Aborting truncate.", + vi->i_ino, + le32_to_cpu(ni->type)); + err = -EIO; + } + /* Reset the vfs inode size to the old size. */ + i_size_write(vi, old_size); + goto err_out; + } + } + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size are not " + "supported yet for %s files, ignoring.", + NInoCompressed(ni) ? "compressed" : + "encrypted"); + err = -EOPNOTSUPP; + goto bad_out; + } + if (a->non_resident) + goto do_non_resident_truncate; + BUG_ON(NInoNonResident(ni)); + /* Resize the attribute record to best fit the new attribute size. */ + if (new_size < vol->mft_record_size && + !ntfs_resident_attr_value_resize(m, a, new_size)) { + /* The resize succeeded! */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + write_lock_irqsave(&ni->size_lock, flags); + /* Update the sizes in the ntfs inode and all is done. */ + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + /* + * Note ntfs_resident_attr_value_resize() has already done any + * necessary data clearing in the attribute record. When the + * file is being shrunk vmtruncate() will already have cleared + * the top part of the last partial page, i.e. since this is + * the resident case this is the page with index 0. However, + * when the file is being expanded, the page cache page data + * between the old data_size, i.e. old_size, and the new_size + * has not been zeroed. Fortunately, we do not need to zero it + * either since on one hand it will either already be zero due + * to both readpage and writepage clearing partial page data + * beyond i_size in which case there is nothing to do or in the + * case of the file being mmap()ped at the same time, POSIX + * specifies that the behaviour is unspecified thus we do not + * have to do anything. This means that in our implementation + * in the rare case that the file is mmap()ped and a write + * occurred into the mmap()ped region just beyond the file size + * and writepage has not yet been called to write out the page + * (which would clear the area beyond the file size) and we now + * extend the file size to incorporate this dirty region + * outside the file size, a write of the page would result in + * this data being written to disk instead of being cleared. + * Given both POSIX and the Linux mmap(2) man page specify that + * this corner case is undefined, we choose to leave it like + * that as this is much simpler for us as we cannot lock the + * relevant page now since we are holding too many ntfs locks + * which would result in a lock reversal deadlock. + */ + ni->initialized_size = new_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto unm_done; + } + /* If the above resize failed, this must be an attribute extension. */ + BUG_ON(size_change < 0); + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path as it only ever can happen + * once for any given file. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the truncation process. + */ + err = ntfs_attr_make_non_resident(ni, old_size); + if (likely(!err)) + goto retry_truncate; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute " + "type 0x%x, because the conversion from " + "resident to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. */ + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft record/on " + "disk for the non-resident attribute value. " + "This case is not implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not implemented " + "yet."); + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_truncate: + BUG_ON(!NInoNonResident(ni)); + if (alloc_change < 0) { + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + if (highest_vcn > 0 && + old_alloc_size >> vol->cluster_size_bits > + highest_vcn + 1) { + /* + * This attribute has multiple extents. Not yet + * supported. + */ + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, " + "attribute type 0x%x, because the " + "attribute is highly fragmented (it " + "consists of multiple extents) and " + "this case is not implemented yet.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type)); + err = -EOPNOTSUPP; + goto bad_out; + } + } + /* + * If the size is shrinking, need to reduce the initialized_size and + * the data_size before reducing the allocation. + */ + if (size_change < 0) { + /* + * Make the valid size smaller (i_size is already up-to-date). + */ + write_lock_irqsave(&ni->size_lock, flags); + if (new_size < ni->initialized_size) { + ni->initialized_size = new_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(new_size); + } + a->data.non_resident.data_size = cpu_to_sle64(new_size); + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* If the allocated size is not changing, we are done. */ + if (!alloc_change) + goto unm_done; + /* + * If the size is shrinking it makes no sense for the + * allocation to be growing. + */ + BUG_ON(alloc_change > 0); + } else /* if (size_change >= 0) */ { + /* + * The file size is growing or staying the same but the + * allocation can be shrinking, growing or staying the same. + */ + if (alloc_change > 0) { + /* + * We need to extend the allocation and possibly update + * the data size. If we are updating the data size, + * since we are not touching the initialized_size we do + * not need to worry about the actual data on disk. + * And as far as the page cache is concerned, there + * will be no pages beyond the old data size and any + * partial region in the last page between the old and + * new data size (or the end of the page if the new + * data size is outside the page) does not need to be + * modified as explained above for the resident + * attribute truncate case. To do this, we simply drop + * the locks we hold and leave all the work to our + * friendly helper ntfs_attr_extend_allocation(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + err = ntfs_attr_extend_allocation(ni, new_size, + size_change > 0 ? new_size : -1, -1); + /* + * ntfs_attr_extend_allocation() will have done error + * output already. + */ + goto done; + } + if (!alloc_change) + goto alloc_done; + } + /* alloc_change < 0 */ + /* Free the clusters. */ + nr_freed = ntfs_cluster_free(ni, new_alloc_size >> + vol->cluster_size_bits, -1, ctx); + m = ctx->mrec; + a = ctx->attr; + if (unlikely(nr_freed < 0)) { + ntfs_error(vol->sb, "Failed to release cluster(s) (error code " + "%lli). Unmount and run chkdsk to recover " + "the lost cluster(s).", (long long)nr_freed); + NVolSetErrors(vol); + nr_freed = 0; + } + /* Truncate the runlist. */ + err = ntfs_rl_truncate_nolock(vol, &ni->runlist, + new_alloc_size >> vol->cluster_size_bits); + /* + * If the runlist truncation failed and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (unlikely(err || IS_ERR(m))) { + ntfs_error(vol->sb, "Failed to %s (error code %li).%s", + IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist", + IS_ERR(m) ? PTR_ERR(m) : err, es); + err = -EIO; + goto bad_out; + } + /* Get the size for the shrunk mapping pairs array for the runlist. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because determining the " + "size for the mapping pairs failed with error " + "code %i.%s", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), mp_size, es); + err = -EIO; + goto bad_out; + } + /* + * Shrink the attribute record for the new mapping pairs array. Note, + * this cannot fail since we are making the attribute smaller thus by + * definition there is enough space to do so. + */ + attr_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + BUG_ON(err); + /* + * Generate the mapping pairs array directly into the attribute record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, ni->runlist.rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because building the " + "mapping pairs failed with error code %i.%s", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + err, es); + err = -EIO; + goto bad_out; + } + /* Update the allocated/compressed size as well as the highest vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + if (nr_freed) { + ni->itype.compressed.size -= nr_freed << + vol->cluster_size_bits; + BUG_ON(ni->itype.compressed.size < 0); + a->data.non_resident.compressed_size = cpu_to_sle64( + ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * We have shrunk the allocation. If this is a shrinking truncate we + * have already dealt with the initialized_size and the data_size above + * and we are done. If the truncate is only changing the allocation + * and not the data_size, we are also done. If this is an extending + * truncate, need to extend the data_size now which is ensured by the + * fact that @size_change is positive. + */ +alloc_done: + /* + * If the size is growing, need to update it now. If it is shrinking, + * we have already updated it above (before the allocation change). + */ + if (size_change > 0) + a->data.non_resident.data_size = cpu_to_sle64(new_size); + /* Ensure the modified mft record is written out. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +unm_done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +done: + /* Update the mtime and ctime on the base inode. */ + /* normally ->truncate shouldn't update ctime or mtime, + * but ntfs did before so it got a copy & paste version + * of file_update_time. one day someone should fix this + * for real. + */ + if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { + struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb); + int sync_it = 0; + + if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) || + !timespec_equal(&VFS_I(base_ni)->i_ctime, &now)) + sync_it = 1; + VFS_I(base_ni)->i_mtime = now; + VFS_I(base_ni)->i_ctime = now; + + if (sync_it) + mark_inode_dirty_sync(VFS_I(base_ni)); + } + + if (likely(!err)) { + NInoClearTruncateFailed(ni); + ntfs_debug("Done."); + } + return err; +old_bad_out: + old_size = -1; +bad_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else if (old_size >= 0) + i_size_write(vi, old_size); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +conv_err_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else + i_size_write(vi, old_size); + goto out; +} + +/** + * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value + * @vi: inode for which the i_size was changed + * + * Wrapper for ntfs_truncate() that has no return value. + * + * See ntfs_truncate() description above for details. + */ +#ifdef NTFS_RW +void ntfs_truncate_vfs(struct inode *vi) { + ntfs_truncate(vi); +} +#endif + +/** + * ntfs_setattr - called from notify_change() when an attribute is being changed + * @dentry: dentry whose attributes to change + * @attr: structure describing the attributes and the changes + * + * We have to trap VFS attempts to truncate the file described by @dentry as + * soon as possible, because we do not implement changes in i_size yet. So we + * abort all i_size changes here. + * + * We also abort all changes of user, group, and mode as we do not implement + * the NTFS ACLs yet. + * + * Called with ->i_mutex held. + */ +int ntfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *vi = d_inode(dentry); + int err; + unsigned int ia_valid = attr->ia_valid; + + err = inode_change_ok(vi, attr); + if (err) + goto out; + /* We do not support NTFS ACLs yet. */ + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { + ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " + "supported yet, ignoring."); + err = -EOPNOTSUPP; + goto out; + } + if (ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(vi)) { + ntfs_inode *ni = NTFS_I(vi); + /* + * FIXME: For now we do not support resizing of + * compressed or encrypted files yet. + */ + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size " + "are not supported yet for " + "%s files, ignoring.", + NInoCompressed(ni) ? + "compressed" : "encrypted"); + err = -EOPNOTSUPP; + } else { + truncate_setsize(vi, attr->ia_size); + ntfs_truncate_vfs(vi); + } + if (err || ia_valid == ATTR_SIZE) + goto out; + } else { + /* + * We skipped the truncate but must still update + * timestamps. + */ + ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + } + if (ia_valid & ATTR_ATIME) + vi->i_atime = timespec_trunc(attr->ia_atime, + vi->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + vi->i_mtime = timespec_trunc(attr->ia_mtime, + vi->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + vi->i_ctime = timespec_trunc(attr->ia_ctime, + vi->i_sb->s_time_gran); + mark_inode_dirty(vi); +out: + return err; +} + +/** + * ntfs_write_inode - write out a dirty inode + * @vi: inode to write out + * @sync: if true, write out synchronously + * + * Write out a dirty inode to disk including any extent inodes if present. + * + * If @sync is true, commit the inode to disk and wait for io completion. This + * is done using write_mft_record(). + * + * If @sync is false, just schedule the write to happen but do not wait for i/o + * completion. In 2.6 kernels, scheduling usually happens just by virtue of + * marking the page (and in this case mft record) dirty but we do not implement + * this yet as write_mft_record() largely ignores the @sync parameter and + * always performs synchronous writes. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_write_inode(struct inode *vi, int sync) +{ + sle64 nt; + ntfs_inode *ni = NTFS_I(vi); + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + STANDARD_INFORMATION *si; + int err = 0; + bool modified = false; + + ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "", + vi->i_ino); + /* + * Dirty attribute inodes are written via their real inodes so just + * clean them here. Access time updates are taken care off when the + * real inode is written. + */ + if (NInoAttr(ni)) { + NInoClearDirty(ni); + ntfs_debug("Done."); + return 0; + } + /* Map, pin, and lock the mft record belonging to the inode. */ + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + /* Update the access times in the standard information attribute. */ + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + goto unm_err_out; + } + si = (STANDARD_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Update the access times if they have changed. */ + nt = utc2ntfs(vi->i_mtime); + if (si->last_data_change_time != nt) { + ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_data_change_time), + (long long)sle64_to_cpu(nt)); + si->last_data_change_time = nt; + modified = true; + } + nt = utc2ntfs(vi->i_ctime); + if (si->last_mft_change_time != nt) { + ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_mft_change_time), + (long long)sle64_to_cpu(nt)); + si->last_mft_change_time = nt; + modified = true; + } + nt = utc2ntfs(vi->i_atime); + if (si->last_access_time != nt) { + ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, + (long long)sle64_to_cpu(si->last_access_time), + (long long)sle64_to_cpu(nt)); + si->last_access_time = nt; + modified = true; + } + /* + * If we just modified the standard information attribute we need to + * mark the mft record it is in dirty. We do this manually so that + * mark_inode_dirty() is not called which would redirty the inode and + * hence result in an infinite loop of trying to write the inode. + * There is no need to mark the base inode nor the base mft record + * dirty, since we are going to write this mft record below in any case + * and the base mft record may actually not have been modified so it + * might not need to be written out. + * NOTE: It is not a problem when the inode for $MFT itself is being + * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES + * on the $MFT inode and hence ntfs_write_inode() will not be + * re-invoked because of it which in turn is ok since the dirtied mft + * record will be cleaned and written out to disk below, i.e. before + * this function returns. + */ + if (modified) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + if (!NInoTestSetDirty(ctx->ntfs_ino)) + mark_ntfs_record_dirty(ctx->ntfs_ino->page, + ctx->ntfs_ino->page_ofs); + } + ntfs_attr_put_search_ctx(ctx); + /* Now the access times are updated, write the base mft record. */ + if (NInoDirty(ni)) + err = write_mft_record(ni, m, sync); + /* Write all attached extent mft records. */ + mutex_lock(&ni->extent_lock); + if (ni->nr_extents > 0) { + ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; + int i; + + ntfs_debug("Writing %i extent inodes.", ni->nr_extents); + for (i = 0; i < ni->nr_extents; i++) { + ntfs_inode *tni = extent_nis[i]; + + if (NInoDirty(tni)) { + MFT_RECORD *tm = map_mft_record(tni); + int ret; + + if (IS_ERR(tm)) { + if (!err || err == -ENOMEM) + err = PTR_ERR(tm); + continue; + } + ret = write_mft_record(tni, tm, sync); + unmap_mft_record(tni); + if (unlikely(ret)) { + if (!err || err == -ENOMEM) + err = ret; + } + } + } + } + mutex_unlock(&ni->extent_lock); + unmap_mft_record(ni); + if (unlikely(err)) + goto err_out; + ntfs_debug("Done."); + return 0; +unm_err_out: + unmap_mft_record(ni); +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Not enough memory to write inode. " + "Marking the inode dirty again, so the VFS " + "retries later."); + mark_inode_dirty(vi); + } else { + ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err); + NVolSetErrors(ni->vol); + } + return err; +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/inode.h b/kernel/fs/ntfs/inode.h new file mode 100644 index 000000000..76b6cfb57 --- /dev/null +++ b/kernel/fs/ntfs/inode.h @@ -0,0 +1,325 @@ +/* + * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_INODE_H +#define _LINUX_NTFS_INODE_H + +#include + +#include +#include +#include +#include +#include + +#include "layout.h" +#include "volume.h" +#include "types.h" +#include "runlist.h" +#include "debug.h" + +typedef struct _ntfs_inode ntfs_inode; + +/* + * The NTFS in-memory inode structure. It is just used as an extension to the + * fields already provided in the VFS inode. + */ +struct _ntfs_inode { + rwlock_t size_lock; /* Lock serializing access to inode sizes. */ + s64 initialized_size; /* Copy from the attribute record. */ + s64 allocated_size; /* Copy from the attribute record. */ + unsigned long state; /* NTFS specific flags describing this inode. + See ntfs_inode_state_bits below. */ + unsigned long mft_no; /* Number of the mft record / inode. */ + u16 seq_no; /* Sequence number of the mft record. */ + atomic_t count; /* Inode reference count for book keeping. */ + ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ + /* + * If NInoAttr() is true, the below fields describe the attribute which + * this fake inode belongs to. The actual inode of this attribute is + * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see + * below). For real inodes, we also set the type (AT_DATA for files and + * AT_INDEX_ALLOCATION for directories), with the name = NULL and + * name_len = 0 for files and name = I30 (global constant) and + * name_len = 4 for directories. + */ + ATTR_TYPE type; /* Attribute type of this fake inode. */ + ntfschar *name; /* Attribute name of this fake inode. */ + u32 name_len; /* Attribute name length of this fake inode. */ + runlist runlist; /* If state has the NI_NonResident bit set, + the runlist of the unnamed data attribute + (if a file) or of the index allocation + attribute (directory) or of the attribute + described by the fake inode (if NInoAttr()). + If runlist.rl is NULL, the runlist has not + been read in yet or has been unmapped. If + NI_NonResident is clear, the attribute is + resident (file and fake inode) or there is + no $I30 index allocation attribute + (small directory). In the latter case + runlist.rl is always NULL.*/ + /* + * The following fields are only valid for real inodes and extent + * inodes. + */ + struct mutex mrec_lock; /* Lock for serializing access to the + mft record belonging to this inode. */ + struct page *page; /* The page containing the mft record of the + inode. This should only be touched by the + (un)map_mft_record*() functions. */ + int page_ofs; /* Offset into the page at which the mft record + begins. This should only be touched by the + (un)map_mft_record*() functions. */ + /* + * Attribute list support (only for use by the attribute lookup + * functions). Setup during read_inode for all inodes with attribute + * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is + * further only valid if NI_AttrListNonResident is set. + */ + u32 attr_list_size; /* Length of attribute list value in bytes. */ + u8 *attr_list; /* Attribute list value itself. */ + runlist attr_list_rl; /* Run list for the attribute list value. */ + union { + struct { /* It is a directory, $MFT, or an index inode. */ + u32 block_size; /* Size of an index block. */ + u32 vcn_size; /* Size of a vcn in this + index. */ + COLLATION_RULE collation_rule; /* The collation rule + for the index. */ + u8 block_size_bits; /* Log2 of the above. */ + u8 vcn_size_bits; /* Log2 of the above. */ + } index; + struct { /* It is a compressed/sparse file/attribute inode. */ + s64 size; /* Copy of compressed_size from + $DATA. */ + u32 block_size; /* Size of a compression block + (cb). */ + u8 block_size_bits; /* Log2 of the size of a cb. */ + u8 block_clusters; /* Number of clusters per cb. */ + } compressed; + } itype; + struct mutex extent_lock; /* Lock for accessing/modifying the + below . */ + s32 nr_extents; /* For a base mft record, the number of attached extent + inodes (0 if none), for extent records and for fake + inodes describing an attribute this is -1. */ + union { /* This union is only used if nr_extents != 0. */ + ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of + the ntfs inodes of the extent + mft records belonging to + this base inode which have + been loaded. */ + ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the + ntfs inode of the base mft + record. For fake inodes, the + real (base) inode to which + the attribute belongs. */ + } ext; +}; + +/* + * Defined bits for the state field in the ntfs_inode structure. + * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only + */ +typedef enum { + NI_Dirty, /* 1: Mft record needs to be written to disk. */ + NI_AttrList, /* 1: Mft record contains an attribute list. */ + NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies + NI_AttrList is set. */ + + NI_Attr, /* 1: Fake inode for attribute i/o. + 0: Real inode or extent inode. */ + + NI_MstProtected, /* 1: Attribute is protected by MST fixups. + 0: Attribute is not protected by fixups. */ + NI_NonResident, /* 1: Unnamed data attr is non-resident (f). + 1: Attribute is non-resident (a). */ + NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is + present (d). */ + NI_Compressed, /* 1: Unnamed data attr is compressed (f). + 1: Create compressed files by default (d). + 1: Attribute is compressed (a). */ + NI_Encrypted, /* 1: Unnamed data attr is encrypted (f). + 1: Create encrypted files by default (d). + 1: Attribute is encrypted (a). */ + NI_Sparse, /* 1: Unnamed data attr is sparse (f). + 1: Create sparse files by default (d). + 1: Attribute is sparse (a). */ + NI_SparseDisabled, /* 1: May not create sparse regions. */ + NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ +} ntfs_inode_state_bits; + +/* + * NOTE: We should be adding dirty mft records to a list somewhere and they + * should be independent of the (ntfs/vfs) inode structure so that an inode can + * be removed but the record can be left dirty for syncing later. + */ + +/* + * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo() + * functions. + */ +#define NINO_FNS(flag) \ +static inline int NIno##flag(ntfs_inode *ni) \ +{ \ + return test_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoSet##flag(ntfs_inode *ni) \ +{ \ + set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoClear##flag(ntfs_inode *ni) \ +{ \ + clear_bit(NI_##flag, &(ni)->state); \ +} + +/* + * As above for NInoTestSetFoo() and NInoTestClearFoo(). + */ +#define TAS_NINO_FNS(flag) \ +static inline int NInoTestSet##flag(ntfs_inode *ni) \ +{ \ + return test_and_set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline int NInoTestClear##flag(ntfs_inode *ni) \ +{ \ + return test_and_clear_bit(NI_##flag, &(ni)->state); \ +} + +/* Emit the ntfs inode bitops functions. */ +NINO_FNS(Dirty) +TAS_NINO_FNS(Dirty) +NINO_FNS(AttrList) +NINO_FNS(AttrListNonResident) +NINO_FNS(Attr) +NINO_FNS(MstProtected) +NINO_FNS(NonResident) +NINO_FNS(IndexAllocPresent) +NINO_FNS(Compressed) +NINO_FNS(Encrypted) +NINO_FNS(Sparse) +NINO_FNS(SparseDisabled) +NINO_FNS(TruncateFailed) + +/* + * The full structure containing a ntfs_inode and a vfs struct inode. Used for + * all real and fake inodes but not for extent inodes which lack the vfs struct + * inode. + */ +typedef struct { + ntfs_inode ntfs_inode; + struct inode vfs_inode; /* The vfs inode structure. */ +} big_ntfs_inode; + +/** + * NTFS_I - return the ntfs inode given a vfs inode + * @inode: VFS inode + * + * NTFS_I() returns the ntfs inode associated with the VFS @inode. + */ +static inline ntfs_inode *NTFS_I(struct inode *inode) +{ + return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode); +} + +static inline struct inode *VFS_I(ntfs_inode *ni) +{ + return &((big_ntfs_inode *)ni)->vfs_inode; +} + +/** + * ntfs_attr - ntfs in memory attribute structure + * @mft_no: mft record number of the base mft record of this attribute + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * @type: attribute type (see layout.h) + * + * This structure exists only to provide a small structure for the + * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. + * + * NOTE: Elements are ordered by size to make the structure as compact as + * possible on all architectures. + */ +typedef struct { + unsigned long mft_no; + ntfschar *name; + u32 name_len; + ATTR_TYPE type; +} ntfs_attr; + +typedef int (*test_t)(struct inode *, void *); + +extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na); + +extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); +extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len); +extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len); + +extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); +extern void ntfs_destroy_big_inode(struct inode *inode); +extern void ntfs_evict_big_inode(struct inode *vi); + +extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); + +static inline void ntfs_init_big_inode(struct inode *vi) +{ + ntfs_inode *ni = NTFS_I(vi); + + ntfs_debug("Entering."); + __ntfs_init_inode(vi->i_sb, ni); + ni->mft_no = vi->i_ino; +} + +extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no); +extern void ntfs_clear_extent_inode(ntfs_inode *ni); + +extern int ntfs_read_inode_mount(struct inode *vi); + +extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); + +#ifdef NTFS_RW + +extern int ntfs_truncate(struct inode *vi); +extern void ntfs_truncate_vfs(struct inode *vi); + +extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); + +extern int __ntfs_write_inode(struct inode *vi, int sync); + +static inline void ntfs_commit_inode(struct inode *vi) +{ + if (!is_bad_inode(vi)) + __ntfs_write_inode(vi, 1); + return; +} + +#else + +static inline void ntfs_truncate_vfs(struct inode *vi) {} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INODE_H */ diff --git a/kernel/fs/ntfs/layout.h b/kernel/fs/ntfs/layout.h new file mode 100644 index 000000000..809c0e6d8 --- /dev/null +++ b/kernel/fs/ntfs/layout.h @@ -0,0 +1,2435 @@ +/* + * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LAYOUT_H +#define _LINUX_NTFS_LAYOUT_H + +#include +#include +#include +#include + +#include "types.h" + +/* The NTFS oem_id "NTFS " */ +#define magicNTFS cpu_to_le64(0x202020205346544eULL) + +/* + * Location of bootsector on partition: + * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition. + * On NT4 and above there is one backup copy of the boot sector to + * be found on the last sector of the partition (not normally accessible + * from within Windows as the bootsector contained number of sectors + * value is one less than the actual value!). + * On versions of NT 3.51 and earlier, the backup copy was located at + * number of sectors/2 (integer divide), i.e. in the middle of the volume. + */ + +/* + * BIOS parameter block (bpb) structure. + */ +typedef struct { + le16 bytes_per_sector; /* Size of a sector in bytes. */ + u8 sectors_per_cluster; /* Size of a cluster in sectors. */ + le16 reserved_sectors; /* zero */ + u8 fats; /* zero */ + le16 root_entries; /* zero */ + le16 sectors; /* zero */ + u8 media_type; /* 0xf8 = hard disk */ + le16 sectors_per_fat; /* zero */ + le16 sectors_per_track; /* irrelevant */ + le16 heads; /* irrelevant */ + le32 hidden_sectors; /* zero */ + le32 large_sectors; /* zero */ +} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; + +/* + * NTFS boot sector structure. + */ +typedef struct { + u8 jump[3]; /* Irrelevant (jump to boot up code).*/ + le64 oem_id; /* Magic "NTFS ". */ + BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ + u8 unused[4]; /* zero, NTFS diskedit.exe states that + this is actually: + __u8 physical_drive; // 0x80 + __u8 current_head; // zero + __u8 extended_boot_signature; + // 0x80 + __u8 unused; // zero + */ +/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives + maximum volume size of 2^63 sectors. + Assuming standard sector size of 512 + bytes, the maximum byte size is + approx. 4.7x10^21 bytes. (-; */ + sle64 mft_lcn; /* Cluster location of mft data. */ + sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ + s8 clusters_per_mft_record; /* Mft record size in clusters. */ + u8 reserved0[3]; /* zero */ + s8 clusters_per_index_record; /* Index block size in clusters. */ + u8 reserved1[3]; /* zero */ + le64 volume_serial_number; /* Irrelevant (serial number). */ + le32 checksum; /* Boot sector checksum. */ +/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ + le16 end_of_sector_marker; /* End of bootsector magic. Always is + 0xaa55 in little endian. */ +/* sizeof() = 512 (0x200) bytes */ +} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; + +/* + * Magic identifiers present at the beginning of all ntfs record containing + * records (like mft records for example). + */ +enum { + /* Found in $MFT/$DATA. */ + magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ + magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ + magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ + + /* Found in $LogFile/$DATA. */ + magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ + magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ + + /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ + magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ + + /* Found in all ntfs record containing records. */ + magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector + transfer was detected. */ + /* + * Found in $LogFile/$DATA when a page is full of 0xff bytes and is + * thus not initialized. Page must be initialized before using it. + */ + magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ +}; + +typedef le32 NTFS_RECORD_TYPE; + +/* + * Generic magic comparison macros. Finally found a use for the ## preprocessor + * operator! (-8 + */ + +static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) +{ + return (x == r); +} +#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) + +static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) +{ + return (*p == r); +} +#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) + +/* + * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. + */ +#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) +#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) +#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) +#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) +#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) +#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) +#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) +#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) + +#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) +#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) +#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) +#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) + +#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) +#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) + +#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) +#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) + +#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) +#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) + +/* + * The Update Sequence Array (usa) is an array of the le16 values which belong + * to the end of each sector protected by the update sequence record in which + * this array is contained. Note that the first entry is the Update Sequence + * Number (usn), a cyclic counter of how many times the protected record has + * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All + * last le16's of each sector have to be equal to the usn (during reading) or + * are set to it (during writing). If they are not, an incomplete multi sector + * transfer has occurred when the data was written. + * The maximum size for the update sequence array is fixed to: + * maximum size = usa_ofs + (usa_count * 2) = 510 bytes + * The 510 bytes comes from the fact that the last le16 in the array has to + * (obviously) finish before the last le16 of the first 512-byte sector. + * This formula can be used as a consistency check in that usa_ofs + + * (usa_count * 2) has to be less than or equal to 510. + */ +typedef struct { + NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record + type and/or status. */ + le16 usa_ofs; /* Offset to the Update Sequence Array (usa) + from the start of the ntfs record. */ + le16 usa_count; /* Number of le16 sized entries in the usa + including the Update Sequence Number (usn), + thus the number of fixups is the usa_count + minus 1. */ +} __attribute__ ((__packed__)) NTFS_RECORD; + +/* + * System files mft record numbers. All these files are always marked as used + * in the bitmap attribute of the mft; presumably in order to avoid accidental + * allocation for random other mft records. Also, the sequence number for each + * of the system files is always equal to their mft record number and it is + * never modified. + */ +typedef enum { + FILE_MFT = 0, /* Master file table (mft). Data attribute + contains the entries and bitmap attribute + records which ones are in use (bit==1). */ + FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records + in data attribute. If cluster size > 4kiB, + copy of first N mft records, with + N = cluster_size / mft_record_size. */ + FILE_LogFile = 2, /* Journalling log in data attribute. */ + FILE_Volume = 3, /* Volume name attribute and volume information + attribute (flags and ntfs version). Windows + refers to this file as volume DASD (Direct + Access Storage Device). */ + FILE_AttrDef = 4, /* Array of attribute definitions in data + attribute. */ + FILE_root = 5, /* Root directory. */ + FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in + data attribute. */ + FILE_Boot = 7, /* Boot sector (always at cluster 0) in data + attribute. */ + FILE_BadClus = 8, /* Contains all bad clusters in the non-resident + data attribute. */ + FILE_Secure = 9, /* Shared security descriptors in data attribute + and two indexes into the descriptors. + Appeared in Windows 2000. Before that, this + file was named $Quota but was unused. */ + FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode + characters in data attribute. */ + FILE_Extend = 11, /* Directory containing other system files (eg. + $ObjId, $Quota, $Reparse and $UsnJrnl). This + is new to NTFS3.0. */ + FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ + FILE_reserved13 = 13, + FILE_reserved14 = 14, + FILE_reserved15 = 15, + FILE_first_user = 16, /* First user file, used as test limit for + whether to allow opening a file or not. */ +} NTFS_SYSTEM_FILES; + +/* + * These are the so far known MFT_RECORD_* flags (16-bit) which contain + * information about the mft record in which they are present. + */ +enum { + MFT_RECORD_IN_USE = cpu_to_le16(0x0001), + MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), +} __attribute__ ((__packed__)); + +typedef le16 MFT_RECORD_FLAGS; + +/* + * mft references (aka file references or file record segment references) are + * used whenever a structure needs to refer to a record in the mft. + * + * A reference consists of a 48-bit index into the mft and a 16-bit sequence + * number used to detect stale references. + * + * For error reporting purposes we treat the 48-bit index as a signed quantity. + * + * The sequence number is a circular counter (skipping 0) describing how many + * times the referenced mft record has been (re)used. This has to match the + * sequence number of the mft record being referenced, otherwise the reference + * is considered stale and removed (FIXME: only ntfsck or the driver itself?). + * + * If the sequence number is zero it is assumed that no sequence number + * consistency checking should be performed. + * + * FIXME: Since inodes are 32-bit as of now, the driver needs to always check + * for high_part being 0 and if not either BUG(), cause a panic() or handle + * the situation in some other way. This shouldn't be a problem as a volume has + * to become HUGE in order to need more than 32-bits worth of mft records. + * Assuming the standard mft record size of 1kb only the records (never mind + * the non-resident attributes, etc.) would require 4Tb of space on their own + * for the first 32 bits worth of records. This is only if some strange person + * doesn't decide to foul play and make the mft sparse which would be a really + * horrible thing to do as it would trash our current driver implementation. )-: + * Do I hear screams "we want 64-bit inodes!" ?!? (-; + * + * FIXME: The mft zone is defined as the first 12% of the volume. This space is + * reserved so that the mft can grow contiguously and hence doesn't become + * fragmented. Volume free space includes the empty part of the mft zone and + * when the volume's free 88% are used up, the mft zone is shrunk by a factor + * of 2, thus making more space available for more files/data. This process is + * repeated every time there is no more free space except for the mft zone until + * there really is no more free space. + */ + +/* + * Typedef the MFT_REF as a 64-bit value for easier handling. + * Also define two unpacking macros to get to the reference (MREF) and + * sequence number (MSEQNO) respectively. + * The _LE versions are to be applied on little endian MFT_REFs. + * Note: The _LE versions will return a CPU endian formatted value! + */ +#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL +#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) + +typedef u64 MFT_REF; +typedef le64 leMFT_REF; + +#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ + ((MFT_REF)(m) & MFT_REF_MASK_CPU))) +#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) + +#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) +#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) +#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) +#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) + +#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) +#define ERR_MREF(x) ((u64)((s64)(x))) +#define MREF_ERR(x) ((int)((s64)(x))) + +/* + * The mft record header present at the beginning of every record in the mft. + * This is followed by a sequence of variable length attribute records which + * is terminated by an attribute of type AT_END which is a truncated attribute + * in that it only consists of the attribute type code AT_END and none of the + * other members of the attribute structure are present. + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. + When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ +/* 42*/ le16 reserved; /* Reserved/alignment. */ +/* 44*/ le32 mft_record_number; /* Number of this mft record. */ +/* sizeof() = 48 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD; + +/* This is the version without the NTFS 3.1+ specific fields. */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. + When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* sizeof() = 42 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD_OLD; + +/* + * System defined attributes (32-bit). Each attribute type has a corresponding + * attribute name (Unicode string of maximum 64 character length) as described + * by the attribute definitions present in the data attribute of the $AttrDef + * system file. On NTFS 3.0 volumes the names are just as the types are named + * in the below defines exchanging AT_ for the dollar sign ($). If that is not + * a revealing choice of symbol I do not know what is... (-; + */ +enum { + AT_UNUSED = cpu_to_le32( 0), + AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), + AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), + AT_FILE_NAME = cpu_to_le32( 0x30), + AT_OBJECT_ID = cpu_to_le32( 0x40), + AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), + AT_VOLUME_NAME = cpu_to_le32( 0x60), + AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), + AT_DATA = cpu_to_le32( 0x80), + AT_INDEX_ROOT = cpu_to_le32( 0x90), + AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), + AT_BITMAP = cpu_to_le32( 0xb0), + AT_REPARSE_POINT = cpu_to_le32( 0xc0), + AT_EA_INFORMATION = cpu_to_le32( 0xd0), + AT_EA = cpu_to_le32( 0xe0), + AT_PROPERTY_SET = cpu_to_le32( 0xf0), + AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), + AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), + AT_END = cpu_to_le32(0xffffffff) +}; + +typedef le32 ATTR_TYPE; + +/* + * The collation rules for sorting views/indexes/etc (32-bit). + * + * COLLATION_BINARY - Collate by binary compare where the first byte is most + * significant. + * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary + * Unicode values, except that when a character can be uppercased, the + * upper case value collates before the lower case one. + * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation + * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea + * what the difference is. Perhaps the difference is that file names + * would treat some special characters in an odd way (see + * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] + * for what I mean but COLLATION_UNICODE_STRING would not give any special + * treatment to any characters at all, but this is speculation. + * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key + * values. E.g. used for $SII index in FILE_Secure, which sorts by + * security_id (le32). + * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. + * E.g. used for $O index in FILE_Extend/$Quota. + * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash + * values and second by ascending security_id values. E.g. used for $SDH + * index in FILE_Secure. + * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending + * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which + * sorts by object_id (16-byte), by splitting up the object_id in four + * le32 values and using them as individual keys. E.g. take the following + * two security_ids, stored as follows on disk: + * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 + * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 + * To compare them, they are split into four le32 values each, like so: + * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 + * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 + * Now, it is apparent why the 2nd object_id collates after the 1st: the + * first le32 value of the 1st object_id is less than the first le32 of + * the 2nd object_id. If the first le32 values of both object_ids were + * equal then the second le32 values would be compared, etc. + */ +enum { + COLLATION_BINARY = cpu_to_le32(0x00), + COLLATION_FILE_NAME = cpu_to_le32(0x01), + COLLATION_UNICODE_STRING = cpu_to_le32(0x02), + COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), + COLLATION_NTOFS_SID = cpu_to_le32(0x11), + COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), + COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), +}; + +typedef le32 COLLATION_RULE; + +/* + * The flags (32-bit) describing attribute properties in the attribute + * definition structure. FIXME: This information is based on Regis's + * information and, according to him, it is not certain and probably + * incomplete. The INDEXABLE flag is fairly certainly correct as only the file + * name attribute has this flag set and this is the only attribute indexed in + * NT4. + */ +enum { + ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be + indexed. */ + ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type + can be present multiple times in the + mft records of an inode. */ + ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value + must contain at least one non-zero + byte. */ + ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be + indexed and the attribute value must be + unique for the attribute type in all of + the mft records of an inode. */ + ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be + named and the name must be unique for + the attribute type in all of the mft + records of an inode. */ + ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be + resident. */ + ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log + modifications to this attribute, + regardless of whether it is resident or + non-resident. Without this, only log + modifications if the attribute is + resident. */ +}; + +typedef le32 ATTR_DEF_FLAGS; + +/* + * The data attribute of FILE_AttrDef contains a sequence of attribute + * definitions for the NTFS volume. With this, it is supposed to be safe for an + * older NTFS driver to mount a volume containing a newer NTFS version without + * damaging it (that's the theory. In practice it's: not damaging it too much). + * Entries are sorted by attribute type. The flags describe whether the + * attribute can be resident/non-resident and possibly other things, but the + * actual bits are unknown. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero + terminated. */ +/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ +/* 84*/ le32 display_rule; /* Default display rule. + FIXME: What does it mean? (AIA) */ +/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ +/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ +/* 90*/ sle64 min_size; /* Optional minimum attribute size. */ +/* 98*/ sle64 max_size; /* Maximum size of attribute. */ +/* sizeof() = 0xa0 or 160 bytes */ +} __attribute__ ((__packed__)) ATTR_DEF; + +/* + * Attribute flags (16-bit). + */ +enum { + ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), + ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method + mask. Also, first + illegal value. */ + ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), + ATTR_IS_SPARSE = cpu_to_le16(0x8000), +} __attribute__ ((__packed__)); + +typedef le16 ATTR_FLAGS; + +/* + * Attribute compression. + * + * Only the data attribute is ever compressed in the current ntfs driver in + * Windows. Further, compression is only applied when the data attribute is + * non-resident. Finally, to use compression, the maximum allowed cluster size + * on a volume is 4kib. + * + * The compression method is based on independently compressing blocks of X + * clusters, where X is determined from the compression_unit value found in the + * non-resident attribute record header (more precisely: X = 2^compression_unit + * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). + * + * There are three different cases of how a compression block of X clusters + * can be stored: + * + * 1) The data in the block is all zero (a sparse block): + * This is stored as a sparse block in the runlist, i.e. the runlist + * entry has length = X and lcn = -1. The mapping pairs array actually + * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at + * all, which is then interpreted by the driver as lcn = -1. + * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then + * the same principles apply as above, except that the length is not + * restricted to being any particular value. + * + * 2) The data in the block is not compressed: + * This happens when compression doesn't reduce the size of the block + * in clusters. I.e. if compression has a small effect so that the + * compressed data still occupies X clusters, then the uncompressed data + * is stored in the block. + * This case is recognised by the fact that the runlist entry has + * length = X and lcn >= 0. The mapping pairs array stores this as + * normal with a run length of X and some specific delta_lcn, i.e. + * delta_lcn has to be present. + * + * 3) The data in the block is compressed: + * The common case. This case is recognised by the fact that the run + * list entry has length L < X and lcn >= 0. The mapping pairs array + * stores this as normal with a run length of X and some specific + * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is + * immediately followed by a sparse entry with length = X - L and + * lcn = -1. The latter entry is to make up the vcn counting to the + * full compression block size X. + * + * In fact, life is more complicated because adjacent entries of the same type + * can be coalesced. This means that one has to keep track of the number of + * clusters handled and work on a basis of X clusters at a time being one + * block. An example: if length L > X this means that this particular runlist + * entry contains a block of length X and part of one or more blocks of length + * L - X. Another example: if length L < X, this does not necessarily mean that + * the block is compressed as it might be that the lcn changes inside the block + * and hence the following runlist entry describes the continuation of the + * potentially compressed block. The block would be compressed if the + * following runlist entry describes at least X - L sparse clusters, thus + * making up the compression block length as described in point 3 above. (Of + * course, there can be several runlist entries with small lengths so that the + * sparse entry does not follow the first data containing entry with + * length < X.) + * + * NOTE: At the end of the compressed attribute value, there most likely is not + * just the right amount of data to make up a compression block, thus this data + * is not even attempted to be compressed. It is just stored as is, unless + * the number of clusters it occupies is reduced when compressed in which case + * it is stored as a compressed compression block, complete with sparse + * clusters at the end. + */ + +/* + * Flags of resident attributes (8-bit). + */ +enum { + RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index + (has implications for deleting and + modifying the attribute). */ +} __attribute__ ((__packed__)); + +typedef u8 RESIDENT_ATTR_FLAGS; + +/* + * Attribute record header. Always aligned to 8-byte boundary. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ +/* 4*/ le32 length; /* Byte size of the resident part of the + attribute (aligned to 8-byte boundary). + Used to get to the next attribute. */ +/* 8*/ u8 non_resident; /* If 0, attribute is resident. + If 1, attribute is non-resident. */ +/* 9*/ u8 name_length; /* Unicode character size of name of attribute. + 0 if unnamed. */ +/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the + beginning of the name from the attribute + record. Note that the name is stored as a + Unicode string. When creating, place offset + just at the end of the record header. Then, + follow with attribute value or mapping pairs + array, resident and non-resident attributes + respectively, aligning to an 8-byte + boundary. */ +/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ +/* 14*/ le16 instance; /* The instance of this attribute record. This + number is unique within this mft record (see + MFT_RECORD/next_attribute_instance notes in + in mft.h for more details). */ +/* 16*/ union { + /* Resident attributes. */ + struct { +/* 16 */ le32 value_length;/* Byte size of attribute value. */ +/* 20 */ le16 value_offset;/* Byte offset of the attribute + value from the start of the + attribute record. When creating, + align to 8-byte boundary if we + have a name present as this might + not have a length of a multiple + of 8-bytes. */ +/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ +/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte + boundary. */ + } __attribute__ ((__packed__)) resident; + /* Non-resident attributes. */ + struct { +/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number + for this portion of the attribute value or + 0 if this is the only extent (usually the + case). - Only when an attribute list is used + does lowest_vcn != 0 ever occur. */ +/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of + the attribute value. - Usually there is only one + portion, so this usually equals the attribute + value size in clusters minus 1. Can be -1 for + zero length files. Can be 0 for "single extent" + attributes. */ +/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the + beginning of the structure to the mapping pairs + array which contains the mappings between the + vcns and the logical cluster numbers (lcns). + When creating, place this at the end of this + record header aligned to 8-byte boundary. */ +/* 34*/ u8 compression_unit; /* The compression unit expressed + as the log to the base 2 of the number of + clusters in a compression unit. 0 means not + compressed. (This effectively limits the + compression unit size to be a power of two + clusters.) WinNT4 only uses a value of 4. + Sparse files have this set to 0 on XPSP2. */ +/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ +/* The sizes below are only used when lowest_vcn is zero, as otherwise it would + be difficult to keep them up-to-date.*/ +/* 40*/ sle64 allocated_size; /* Byte size of disk space + allocated to hold the attribute value. Always + is a multiple of the cluster size. When a file + is compressed, this field is a multiple of the + compression block size (2^compression_unit) and + it represents the logically allocated space + rather than the actual on disk usage. For this + use the compressed_size (see below). */ +/* 48*/ sle64 data_size; /* Byte size of the attribute + value. Can be larger than allocated_size if + attribute value is compressed or sparse. */ +/* 56*/ sle64 initialized_size; /* Byte size of initialized + portion of the attribute value. Usually equals + data_size. */ +/* sizeof(uncompressed attr) = 64*/ +/* 64*/ sle64 compressed_size; /* Byte size of the attribute + value after compression. Only present when + compressed or sparse. Always is a multiple of + the cluster size. Represents the actual amount + of disk space being used on the disk. */ +/* sizeof(compressed attr) = 72*/ + } __attribute__ ((__packed__)) non_resident; + } __attribute__ ((__packed__)) data; +} __attribute__ ((__packed__)) ATTR_RECORD; + +typedef ATTR_RECORD ATTR_REC; + +/* + * File attribute flags (32-bit) appearing in the file_attributes fields of the + * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR + * attributes of MFT_RECORDs and directory index entries. + * + * All of the below flags appear in the directory index entries but only some + * appear in the STANDARD_INFORMATION attribute whilst only some others appear + * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the + * flags appear in all of the above. + */ +enum { + FILE_ATTR_READONLY = cpu_to_le32(0x00000001), + FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), + FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), + /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ + + FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), + /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is + reserved for the DOS SUBDIRECTORY flag. */ + FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), + FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), + FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), + + FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), + FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), + + FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), + + FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), + /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the + FILE_ATTR_DEVICE and preserves everything else. This mask is used + to obtain all flags that are valid for reading. */ + FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), + /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the + F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, + F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask + is used to obtain all flags that are valid for setting. */ + /* + * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all + * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION + * attribute of an mft record. + */ + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this is a directory or not, i.e. whether it has + an index root attribute or not. */ + FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this file has a view index present (eg. object id + index, quota index, one of the security indexes or the encrypting + filesystem related indexes). */ +}; + +typedef le32 FILE_ATTR_FLAGS; + +/* + * NOTE on times in NTFS: All times are in MS standard time format, i.e. they + * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 + * universal coordinated time (UTC). (In Linux time starts 1st January 1970, + * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) + */ + +/* + * Attribute: Standard information (0x10). + * + * NOTE: Always resident. + * NOTE: Present in all base file records on a volume. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*Ofs*/ +/* 0*/ sle64 creation_time; /* Time file was created. Updated when + a filename is changed(?). */ +/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 24*/ sle64 last_access_time; /* Approximate time when the file was + last accessed (obviously this is not + updated on read-only volumes). In + Windows this is only updated when + accessed if some time delta has + passed since the last update. Also, + last access time updates can be + disabled altogether for speed. */ +/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 36*/ union { + /* NTFS 1.2 */ + struct { + /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte + boundary. */ + } __attribute__ ((__packed__)) v1; + /* sizeof() = 48 bytes */ + /* NTFS 3.x */ + struct { +/* + * If a volume has been upgraded from a previous NTFS version, then these + * fields are present only if the file has been accessed since the upgrade. + * Recognize the difference by comparing the length of the resident attribute + * value. If it is 48, then the following fields are missing. If it is 72 then + * the fields are present. Maybe just check like this: + * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { + * Assume NTFS 1.2- format. + * If (volume version is 3.x) + * Upgrade attribute to NTFS 3.x format. + * else + * Use NTFS 1.2- format for access. + * } else + * Use NTFS 3.x format for access. + * Only problem is that it might be legal to set the length of the value to + * arbitrarily large values thus spoiling this check. - But chkdsk probably + * views that as a corruption, assuming that it behaves like this for all + * attributes. + */ + /* 36*/ le32 maximum_versions; /* Maximum allowed versions for + file. Zero if version numbering is disabled. */ + /* 40*/ le32 version_number; /* This file's version (if any). + Set to zero if maximum_versions is zero. */ + /* 44*/ le32 class_id; /* Class id from bidirectional + class id index (?). */ + /* 48*/ le32 owner_id; /* Owner_id of the user owning + the file. Translate via $Q index in FILE_Extend + /$Quota to the quota control entry for the user + owning the file. Zero if quotas are disabled. */ + /* 52*/ le32 security_id; /* Security_id for the file. + Translate via $SII index and $SDS data stream + in FILE_Secure to the security descriptor. */ + /* 56*/ le64 quota_charged; /* Byte size of the charge to + the quota for all streams of the file. Note: Is + zero if quotas are disabled. */ + /* 64*/ leUSN usn; /* Last update sequence number + of the file. This is a direct index into the + transaction log file ($UsnJrnl). It is zero if + the usn journal is disabled or this file has + not been subject to logging yet. See usnjrnl.h + for details. */ + } __attribute__ ((__packed__)) v3; + /* sizeof() = 72 bytes (NTFS 3.x) */ + } __attribute__ ((__packed__)) ver; +} __attribute__ ((__packed__)) STANDARD_INFORMATION; + +/* + * Attribute: Attribute list (0x20). + * + * - Can be either resident or non-resident. + * - Value consists of a sequence of variable length, 8-byte aligned, + * ATTR_LIST_ENTRY records. + * - The list is not terminated by anything at all! The only way to know when + * the end is reached is to keep track of the current offset and compare it to + * the attribute value size. + * - The attribute list attribute contains one entry for each attribute of + * the file in which the list is located, except for the list attribute + * itself. The list is sorted: first by attribute type, second by attribute + * name (if present), third by instance number. The extents of one + * non-resident attribute (if present) immediately follow after the initial + * extent. They are ordered by lowest_vcn and have their instace set to zero. + * It is not allowed to have two attributes with all sorting keys equal. + * - Further restrictions: + * - If not resident, the vcn to lcn mapping array has to fit inside the + * base mft record. + * - The attribute list attribute value has a maximum size of 256kb. This + * is imposed by the Windows cache manager. + * - Attribute lists are only used when the attributes of mft record do not + * fit inside the mft record despite all attributes (that can be made + * non-resident) having been made non-resident. This can happen e.g. when: + * - File has a large number of hard links (lots of file name + * attributes present). + * - The mapping pairs array of some non-resident attribute becomes so + * large due to fragmentation that it overflows the mft record. + * - The security descriptor is very complex (not applicable to + * NTFS 3.0 volumes). + * - There are many named streams. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ +/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ +/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the + attribute or 0 if unnamed. */ +/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name + (always set this to where the name would + start even if unnamed). */ +/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion + of the attribute value. This is usually 0. It + is non-zero for the case where one attribute + does not fit into one mft record and thus + several mft records are allocated to hold + this attribute. In the latter case, each mft + record holds one extent of the attribute and + there is one attribute list entry for each + extent. NOTE: This is DEFINITELY a signed + value! The windows driver uses cmp, followed + by jg when comparing this, thus it treats it + as signed. */ +/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding + the ATTR_RECORD for this portion of the + attribute value. */ +/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the + attribute being referenced; otherwise 0. */ +/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use + name_offset to determine the location of the + name. */ +/* sizeof() = 26 + (attribute_name_length * 2) bytes */ +} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; + +/* + * The maximum allowed length for a file name. + */ +#define MAXIMUM_FILE_NAME_LENGTH 255 + +/* + * Possible namespaces for filenames in ntfs (8-bit). + */ +enum { + FILE_NAME_POSIX = 0x00, + /* This is the largest namespace. It is case sensitive and allows all + Unicode characters except for: '\0' and '/'. Beware that in + WinNT/2k/2003 by default files which eg have the same name except + for their case will not be distinguished by the standard utilities + and thus a "del filename" will delete both "filename" and "fileName" + without warning. However if for example Services For Unix (SFU) are + installed and the case sensitive option was enabled at installation + time, then you can create/access/delete such files. + Note that even SFU places restrictions on the filenames beyond the + '\0' and '/' and in particular the following set of characters is + not allowed: '"', '/', '<', '>', '\'. All other characters, + including the ones no allowed in WIN32 namespace are allowed. + Tested with SFU 3.5 (this is now free) running on Windows XP. */ + FILE_NAME_WIN32 = 0x01, + /* The standard WinNT/2k NTFS long filenames. Case insensitive. All + Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', + and '|'. Further, names cannot end with a '.' or a space. */ + FILE_NAME_DOS = 0x02, + /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit + characters greater space, except: '"', '*', '+', ',', '/', ':', ';', + '<', '=', '>', '?', and '\'. */ + FILE_NAME_WIN32_AND_DOS = 0x03, + /* 3 means that both the Win32 and the DOS filenames are identical and + hence have been saved in this single filename record. */ +} __attribute__ ((__packed__)); + +typedef u8 FILE_NAME_TYPE_FLAGS; + +/* + * Attribute: Filename (0x30). + * + * NOTE: Always resident. + * NOTE: All fields, except the parent_directory, are only updated when the + * filename is changed. Until then, they just become out of sync with + * reality and the more up to date values are present in the standard + * information attribute. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ leMFT_REF parent_directory; /* Directory this filename is + referenced from. */ +/* 8*/ sle64 creation_time; /* Time file was created. */ +/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 20*/ sle64 last_access_time; /* Time this mft record was last + accessed. */ +/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space + for the unnamed data attribute. So + for normal $DATA, this is the + allocated_size from the unnamed + $DATA attribute and for compressed + and/or sparse $DATA, this is the + compressed_size from the unnamed + $DATA attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. NOTE: + This is a multiple of the cluster + size. */ +/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed + data attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. */ +/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 3c*/ union { + /* 3c*/ struct { + /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to + pack the extended attributes + (EAs), if such are present.*/ + /* 3e*/ le16 reserved; /* Reserved for alignment. */ + } __attribute__ ((__packed__)) ea; + /* 3c*/ struct { + /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, + present only in reparse + points and only if there are + no EAs. */ + } __attribute__ ((__packed__)) rp; + } __attribute__ ((__packed__)) type; +/* 40*/ u8 file_name_length; /* Length of file name in + (Unicode) characters. */ +/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ +/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ +} __attribute__ ((__packed__)) FILE_NAME_ATTR; + +/* + * GUID structures store globally unique identifiers (GUID). A GUID is a + * 128-bit value consisting of one group of eight hexadecimal digits, followed + * by three groups of four hexadecimal digits each, followed by one group of + * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the + * distributed computing environment (DCE) universally unique identifier (UUID). + * Example of a GUID: + * 1F010768-5A73-BC91-0010A52216A7 + */ +typedef struct { + le32 data1; /* The first eight hexadecimal digits of the GUID. */ + le16 data2; /* The first group of four hexadecimal digits. */ + le16 data3; /* The second group of four hexadecimal digits. */ + u8 data4[8]; /* The first two bytes are the third group of four + hexadecimal digits. The remaining six bytes are the + final 12 hexadecimal digits. */ +} __attribute__ ((__packed__)) GUID; + +/* + * FILE_Extend/$ObjId contains an index named $O. This index contains all + * object_ids present on the volume as the index keys and the corresponding + * mft_record numbers as the index entry data parts. The data part (defined + * below) also contains three other object_ids: + * birth_volume_id - object_id of FILE_Volume on which the file was first + * created. Optional (i.e. can be zero). + * birth_object_id - object_id of file when it was first created. Usually + * equals the object_id. Optional (i.e. can be zero). + * domain_id - Reserved (always zero). + */ +typedef struct { + leMFT_REF mft_reference;/* Mft record containing the object_id in + the index entry key. */ + union { + struct { + GUID birth_volume_id; + GUID birth_object_id; + GUID domain_id; + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; + +/* + * Attribute: Object id (NTFS 3.0+) (0x40). + * + * NOTE: Always resident. + */ +typedef struct { + GUID object_id; /* Unique id assigned to the + file.*/ + /* The following fields are optional. The attribute value size is 16 + bytes, i.e. sizeof(GUID), if these are not present at all. Note, + the entries can be present but one or more (or all) can be zero + meaning that that particular value(s) is(are) not defined. */ + union { + struct { + GUID birth_volume_id; /* Unique id of volume on which + the file was first created.*/ + GUID birth_object_id; /* Unique id of file when it was + first created. */ + GUID domain_id; /* Reserved, zero. */ + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJECT_ID_ATTR; + +/* + * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in + * the SID structure (see below). + */ +//typedef enum { /* SID string prefix. */ +// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ +// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ +// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ +// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ +// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ +// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ +//} IDENTIFIER_AUTHORITIES; + +/* + * These relative identifiers (RIDs) are used with the above identifier + * authorities to make up universal well-known SIDs. + * + * Note: The relative identifier (RID) refers to the portion of a SID, which + * identifies a user or group in relation to the authority that issued the SID. + * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is + * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and + * the relative identifier SECURITY_CREATOR_OWNER_RID (0). + */ +typedef enum { /* Identifier authority. */ + SECURITY_NULL_RID = 0, /* S-1-0 */ + SECURITY_WORLD_RID = 0, /* S-1-1 */ + SECURITY_LOCAL_RID = 0, /* S-1-2 */ + + SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ + SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ + + SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ + SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ + + SECURITY_DIALUP_RID = 1, + SECURITY_NETWORK_RID = 2, + SECURITY_BATCH_RID = 3, + SECURITY_INTERACTIVE_RID = 4, + SECURITY_SERVICE_RID = 6, + SECURITY_ANONYMOUS_LOGON_RID = 7, + SECURITY_PROXY_RID = 8, + SECURITY_ENTERPRISE_CONTROLLERS_RID=9, + SECURITY_SERVER_LOGON_RID = 9, + SECURITY_PRINCIPAL_SELF_RID = 0xa, + SECURITY_AUTHENTICATED_USER_RID = 0xb, + SECURITY_RESTRICTED_CODE_RID = 0xc, + SECURITY_TERMINAL_SERVER_RID = 0xd, + + SECURITY_LOGON_IDS_RID = 5, + SECURITY_LOGON_IDS_RID_COUNT = 3, + + SECURITY_LOCAL_SYSTEM_RID = 0x12, + + SECURITY_NT_NON_UNIQUE = 0x15, + + SECURITY_BUILTIN_DOMAIN_RID = 0x20, + + /* + * Well-known domain relative sub-authority values (RIDs). + */ + + /* Users. */ + DOMAIN_USER_RID_ADMIN = 0x1f4, + DOMAIN_USER_RID_GUEST = 0x1f5, + DOMAIN_USER_RID_KRBTGT = 0x1f6, + + /* Groups. */ + DOMAIN_GROUP_RID_ADMINS = 0x200, + DOMAIN_GROUP_RID_USERS = 0x201, + DOMAIN_GROUP_RID_GUESTS = 0x202, + DOMAIN_GROUP_RID_COMPUTERS = 0x203, + DOMAIN_GROUP_RID_CONTROLLERS = 0x204, + DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, + DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, + DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, + DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, + + /* Aliases. */ + DOMAIN_ALIAS_RID_ADMINS = 0x220, + DOMAIN_ALIAS_RID_USERS = 0x221, + DOMAIN_ALIAS_RID_GUESTS = 0x222, + DOMAIN_ALIAS_RID_POWER_USERS = 0x223, + + DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, + DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, + DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, + DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, + + DOMAIN_ALIAS_RID_REPLICATOR = 0x228, + DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, + DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, +} RELATIVE_IDENTIFIERS; + +/* + * The universal well-known SIDs: + * + * NULL_SID S-1-0-0 + * WORLD_SID S-1-1-0 + * LOCAL_SID S-1-2-0 + * CREATOR_OWNER_SID S-1-3-0 + * CREATOR_GROUP_SID S-1-3-1 + * CREATOR_OWNER_SERVER_SID S-1-3-2 + * CREATOR_GROUP_SERVER_SID S-1-3-3 + * + * (Non-unique IDs) S-1-4 + * + * NT well-known SIDs: + * + * NT_AUTHORITY_SID S-1-5 + * DIALUP_SID S-1-5-1 + * + * NETWORD_SID S-1-5-2 + * BATCH_SID S-1-5-3 + * INTERACTIVE_SID S-1-5-4 + * SERVICE_SID S-1-5-6 + * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) + * PROXY_SID S-1-5-8 + * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) + * SELF_SID S-1-5-10 (self RID) + * AUTHENTICATED_USER_SID S-1-5-11 + * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) + * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) + * + * (Logon IDs) S-1-5-5-X-Y + * + * (NT non-unique IDs) S-1-5-0x15-... + * + * (Built-in domain) S-1-5-0x20 + */ + +/* + * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. + * + * NOTE: This is stored as a big endian number, hence the high_part comes + * before the low_part. + */ +typedef union { + struct { + u16 high_part; /* High 16-bits. */ + u32 low_part; /* Low 32-bits. */ + } __attribute__ ((__packed__)) parts; + u8 value[6]; /* Value as individual bytes. */ +} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; + +/* + * The SID structure is a variable-length structure used to uniquely identify + * users or groups. SID stands for security identifier. + * + * The standard textual representation of the SID is of the form: + * S-R-I-S-S... + * Where: + * - The first "S" is the literal character 'S' identifying the following + * digits as a SID. + * - R is the revision level of the SID expressed as a sequence of digits + * either in decimal or hexadecimal (if the later, prefixed by "0x"). + * - I is the 48-bit identifier_authority, expressed as digits as R above. + * - S... is one or more sub_authority values, expressed as digits as above. + * + * Example SID; the domain-relative SID of the local Administrators group on + * Windows NT/2k: + * S-1-5-32-544 + * This translates to a SID with: + * revision = 1, + * sub_authority_count = 2, + * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY + * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID + * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS + */ +typedef struct { + u8 revision; + u8 sub_authority_count; + SID_IDENTIFIER_AUTHORITY identifier_authority; + le32 sub_authority[1]; /* At least one sub_authority. */ +} __attribute__ ((__packed__)) SID; + +/* + * Current constants for SIDs. + */ +typedef enum { + SID_REVISION = 1, /* Current revision level. */ + SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ + SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in + a future revision. */ +} SID_CONSTANTS; + +/* + * The predefined ACE types (8-bit, see below). + */ +enum { + ACCESS_MIN_MS_ACE_TYPE = 0, + ACCESS_ALLOWED_ACE_TYPE = 0, + ACCESS_DENIED_ACE_TYPE = 1, + SYSTEM_AUDIT_ACE_TYPE = 2, + SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ + ACCESS_MAX_MS_V2_ACE_TYPE = 3, + + ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, + ACCESS_MAX_MS_V3_ACE_TYPE = 4, + + /* The following are Win2k only. */ + ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, + ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, + ACCESS_DENIED_OBJECT_ACE_TYPE = 6, + SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, + SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, + ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, + + ACCESS_MAX_MS_V4_ACE_TYPE = 8, + + /* This one is for WinNT/2k. */ + ACCESS_MAX_MS_ACE_TYPE = 8, +} __attribute__ ((__packed__)); + +typedef u8 ACE_TYPES; + +/* + * The ACE flags (8-bit) for audit and inheritance (see below). + * + * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE + * types to indicate that a message is generated (in Windows!) for successful + * accesses. + * + * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types + * to indicate that a message is generated (in Windows!) for failed accesses. + */ +enum { + /* The inheritance flags. */ + OBJECT_INHERIT_ACE = 0x01, + CONTAINER_INHERIT_ACE = 0x02, + NO_PROPAGATE_INHERIT_ACE = 0x04, + INHERIT_ONLY_ACE = 0x08, + INHERITED_ACE = 0x10, /* Win2k only. */ + VALID_INHERIT_FLAGS = 0x1f, + + /* The audit flags. */ + SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, + FAILED_ACCESS_ACE_FLAG = 0x80, +} __attribute__ ((__packed__)); + +typedef u8 ACE_FLAGS; + +/* + * An ACE is an access-control entry in an access-control list (ACL). + * An ACE defines access to an object for a specific user or group or defines + * the types of access that generate system-administration messages or alarms + * for a specific user or group. The user or group is identified by a security + * identifier (SID). + * + * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), + * which specifies the type and size of the ACE. The format of the subsequent + * data depends on the ACE type. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ACE_TYPES type; /* Type of the ACE. */ +/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */ +/* 2*/ le16 size; /* Size in bytes of the ACE. */ +} __attribute__ ((__packed__)) ACE_HEADER; + +/* + * The access mask (32-bit). Defines the access rights. + * + * The specific rights (bits 0 to 15). These depend on the type of the object + * being secured by the ACE. + */ +enum { + /* Specific rights for files and directories are as follows: */ + + /* Right to read data from the file. (FILE) */ + FILE_READ_DATA = cpu_to_le32(0x00000001), + /* Right to list contents of a directory. (DIRECTORY) */ + FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), + + /* Right to write data to the file. (FILE) */ + FILE_WRITE_DATA = cpu_to_le32(0x00000002), + /* Right to create a file in the directory. (DIRECTORY) */ + FILE_ADD_FILE = cpu_to_le32(0x00000002), + + /* Right to append data to the file. (FILE) */ + FILE_APPEND_DATA = cpu_to_le32(0x00000004), + /* Right to create a subdirectory. (DIRECTORY) */ + FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), + + /* Right to read extended attributes. (FILE/DIRECTORY) */ + FILE_READ_EA = cpu_to_le32(0x00000008), + + /* Right to write extended attributes. (FILE/DIRECTORY) */ + FILE_WRITE_EA = cpu_to_le32(0x00000010), + + /* Right to execute a file. (FILE) */ + FILE_EXECUTE = cpu_to_le32(0x00000020), + /* Right to traverse the directory. (DIRECTORY) */ + FILE_TRAVERSE = cpu_to_le32(0x00000020), + + /* + * Right to delete a directory and all the files it contains (its + * children), even if the files are read-only. (DIRECTORY) + */ + FILE_DELETE_CHILD = cpu_to_le32(0x00000040), + + /* Right to read file attributes. (FILE/DIRECTORY) */ + FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), + + /* Right to change file attributes. (FILE/DIRECTORY) */ + FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), + + /* + * The standard rights (bits 16 to 23). These are independent of the + * type of object being secured. + */ + + /* Right to delete the object. */ + DELETE = cpu_to_le32(0x00010000), + + /* + * Right to read the information in the object's security descriptor, + * not including the information in the SACL, i.e. right to read the + * security descriptor and owner. + */ + READ_CONTROL = cpu_to_le32(0x00020000), + + /* Right to modify the DACL in the object's security descriptor. */ + WRITE_DAC = cpu_to_le32(0x00040000), + + /* Right to change the owner in the object's security descriptor. */ + WRITE_OWNER = cpu_to_le32(0x00080000), + + /* + * Right to use the object for synchronization. Enables a process to + * wait until the object is in the signalled state. Some object types + * do not support this access right. + */ + SYNCHRONIZE = cpu_to_le32(0x00100000), + + /* + * The following STANDARD_RIGHTS_* are combinations of the above for + * convenience and are defined by the Win32 API. + */ + + /* These are currently defined to READ_CONTROL. */ + STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), + + /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ + STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), + + /* + * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and + * SYNCHRONIZE access. + */ + STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), + + /* + * The access system ACL and maximum allowed access types (bits 24 to + * 25, bits 26 to 27 are reserved). + */ + ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), + MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), + + /* + * The generic rights (bits 28 to 31). These map onto the standard and + * specific rights. + */ + + /* Read, write, and execute access. */ + GENERIC_ALL = cpu_to_le32(0x10000000), + + /* Execute access. */ + GENERIC_EXECUTE = cpu_to_le32(0x20000000), + + /* + * Write access. For files, this maps onto: + * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA | + * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE + * For directories, the mapping has the same numerical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_WRITE = cpu_to_le32(0x40000000), + + /* + * Read access. For files, this maps onto: + * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA | + * STANDARD_RIGHTS_READ | SYNCHRONIZE + * For directories, the mapping has the same numberical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_READ = cpu_to_le32(0x80000000), +}; + +typedef le32 ACCESS_MASK; + +/* + * The generic mapping array. Used to denote the mapping of each generic + * access right to a specific access mask. + * + * FIXME: What exactly is this and what is it for? (AIA) + */ +typedef struct { + ACCESS_MASK generic_read; + ACCESS_MASK generic_write; + ACCESS_MASK generic_execute; + ACCESS_MASK generic_all; +} __attribute__ ((__packed__)) GENERIC_MAPPING; + +/* + * The predefined ACE type structures are as defined below. + */ + +/* + * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE + */ +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, + SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; + +/* + * The object ACE flags (32-bit). + */ +enum { + ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), + ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), +}; + +typedef le32 OBJECT_ACE_FLAGS; + +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */ +/* 12*/ GUID object_type; +/* 28*/ GUID inherited_object_type; + +/* 44*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, + ACCESS_DENIED_OBJECT_ACE, + SYSTEM_AUDIT_OBJECT_ACE, + SYSTEM_ALARM_OBJECT_ACE; + +/* + * An ACL is an access-control list (ACL). + * An ACL starts with an ACL header structure, which specifies the size of + * the ACL and the number of ACEs it contains. The ACL header is followed by + * zero or more access control entries (ACEs). The ACL as well as each ACE + * are aligned on 4-byte boundaries. + */ +typedef struct { + u8 revision; /* Revision of this ACL. */ + u8 alignment1; + le16 size; /* Allocated space in bytes for ACL. Includes this + header, the ACEs and the remaining free space. */ + le16 ace_count; /* Number of ACEs in the ACL. */ + le16 alignment2; +/* sizeof() = 8 bytes */ +} __attribute__ ((__packed__)) ACL; + +/* + * Current constants for ACLs. + */ +typedef enum { + /* Current revision. */ + ACL_REVISION = 2, + ACL_REVISION_DS = 4, + + /* History of revisions. */ + ACL_REVISION1 = 1, + MIN_ACL_REVISION = 2, + ACL_REVISION2 = 2, + ACL_REVISION3 = 3, + ACL_REVISION4 = 4, + MAX_ACL_REVISION = 4, +} ACL_CONSTANTS; + +/* + * The security descriptor control flags (16-bit). + * + * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID + * pointed to by the Owner field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the SID with + * respect to inheritance of an owner. + * + * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in + * the Group field was provided by a defaulting mechanism rather than + * explicitly provided by the original provider of the security + * descriptor. This may affect the treatment of the SID with respect to + * inheritance of a primary group. + * + * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a discretionary ACL. If this flag is set and the + * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is + * explicitly being specified. + * + * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Dacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * DaclPresent flag is not set. + * + * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a system ACL pointed to by the Sacl field. If this + * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then + * an empty (but present) ACL is being specified. + * + * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Sacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * SaclPresent flag is not set. + * + * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security + * descriptor is in self-relative form. In this form, all fields of the + * security descriptor are contiguous in memory and all pointer fields are + * expressed as offsets from the beginning of the security descriptor. + */ +enum { + SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), + SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), + SE_DACL_PRESENT = cpu_to_le16(0x0004), + SE_DACL_DEFAULTED = cpu_to_le16(0x0008), + + SE_SACL_PRESENT = cpu_to_le16(0x0010), + SE_SACL_DEFAULTED = cpu_to_le16(0x0020), + + SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), + SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), + SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), + SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), + + SE_DACL_PROTECTED = cpu_to_le16(0x1000), + SE_SACL_PROTECTED = cpu_to_le16(0x2000), + SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), + SE_SELF_RELATIVE = cpu_to_le16(0x8000) +} __attribute__ ((__packed__)); + +typedef le16 SECURITY_DESCRIPTOR_CONTROL; + +/* + * Self-relative security descriptor. Contains the owner and group SIDs as well + * as the sacl and dacl ACLs inside the security descriptor itself. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. */ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + le32 owner; /* Byte offset to a SID representing an object's + owner. If this is NULL, no owner SID is present in + the descriptor. */ + le32 group; /* Byte offset to a SID representing an object's + primary group. If this is NULL, no primary group + SID is present in the descriptor. */ + le32 sacl; /* Byte offset to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +/* sizeof() = 0x14 bytes */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; + +/* + * Absolute security descriptor. Does not contain the owner and group SIDs, nor + * the sacl and dacl ACLs inside the security descriptor. Instead, it contains + * pointers to these structures in memory. Obviously, absolute security + * descriptors are only useful for in memory representations of security + * descriptors. On disk, a self-relative security descriptor is used. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. */ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + SID *owner; /* Points to a SID representing an object's owner. If + this is NULL, no owner SID is present in the + descriptor. */ + SID *group; /* Points to a SID representing an object's primary + group. If this is NULL, no primary group SID is + present in the descriptor. */ + ACL *sacl; /* Points to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + ACL *dacl; /* Points to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; + +/* + * Current constants for security descriptors. + */ +typedef enum { + /* Current revision. */ + SECURITY_DESCRIPTOR_REVISION = 1, + SECURITY_DESCRIPTOR_REVISION1 = 1, + + /* The sizes of both the absolute and relative security descriptors is + the same as pointers, at least on ia32 architecture are 32-bit. */ + SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), +} SECURITY_DESCRIPTOR_CONSTANTS; + +/* + * Attribute: Security descriptor (0x50). A standard self-relative security + * descriptor. + * + * NOTE: Can be resident or non-resident. + * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally + * in FILE_Secure and the correct descriptor is found using the security_id + * from the standard information attribute. + */ +typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; + +/* + * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one + * referenced instance of each unique security descriptor is stored. + * + * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It + * does, however, contain two indexes ($SDH and $SII) as well as a named data + * stream ($SDS). + * + * Every unique security descriptor is assigned a unique security identifier + * (security_id, not to be confused with a SID). The security_id is unique for + * the NTFS volume and is used as an index into the $SII index, which maps + * security_ids to the security descriptor's storage location within the $SDS + * data attribute. The $SII index is sorted by ascending security_id. + * + * A simple hash is computed from each security descriptor. This hash is used + * as an index into the $SDH index, which maps security descriptor hashes to + * the security descriptor's storage location within the $SDS data attribute. + * The $SDH index is sorted by security descriptor hash and is stored in a B+ + * tree. When searching $SDH (with the intent of determining whether or not a + * new security descriptor is already present in the $SDS data stream), if a + * matching hash is found, but the security descriptors do not match, the + * search in the $SDH index is continued, searching for a next matching hash. + * + * When a precise match is found, the security_id coresponding to the security + * descriptor in the $SDS attribute is read from the found $SDH index entry and + * is stored in the $STANDARD_INFORMATION attribute of the file/directory to + * which the security descriptor is being applied. The $STANDARD_INFORMATION + * attribute is present in all base mft records (i.e. in all files and + * directories). + * + * If a match is not found, the security descriptor is assigned a new unique + * security_id and is added to the $SDS data attribute. Then, entries + * referencing the this security descriptor in the $SDS data attribute are + * added to the $SDH and $SII indexes. + * + * Note: Entries are never deleted from FILE_Secure, even if nothing + * references an entry any more. + */ + +/* + * This header precedes each security descriptor in the $SDS data stream. + * This is also the index entry data part of both the $SII and $SDH indexes. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; + +/* + * The $SDS data stream contains the security descriptors, aligned on 16-byte + * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot + * cross 256kib boundaries (this restriction is imposed by the Windows cache + * manager). Each security descriptor is contained in a SDS_ENTRY structure. + * Also, each security descriptor is stored twice in the $SDS stream with a + * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) + * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the + * the first copy of the security descriptor will be at offset 0x51d0 in the + * $SDS data stream and the second copy will be at offset 0x451d0. + */ +typedef struct { +/*Ofs*/ +/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like + unnamed structs. */ + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security + descriptor. */ +} __attribute__ ((__packed__)) SDS_ENTRY; + +/* + * The index entry key used in the $SII index. The collation type is + * COLLATION_NTOFS_ULONG. + */ +typedef struct { + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SII_INDEX_KEY; + +/* + * The index entry key used in the $SDH index. The keys are sorted first by + * hash and then by security_id. The collation rule is + * COLLATION_NTOFS_SECURITY_HASH. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SDH_INDEX_KEY; + +/* + * Attribute: Volume name (0x60). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + */ +typedef struct { + ntfschar name[0]; /* The name of the volume in Unicode. */ +} __attribute__ ((__packed__)) VOLUME_NAME; + +/* + * Possible flags for the volume (16-bit). + */ +enum { + VOLUME_IS_DIRTY = cpu_to_le16(0x0001), + VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), + VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), + VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), + + VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), + VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), + + VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), + VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), + + VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), + + /* To make our life easier when checking if we must mount read-only. */ + VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), +} __attribute__ ((__packed__)); + +typedef le16 VOLUME_FLAGS; + +/* + * Attribute: Volume information (0x70). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses + * NTFS 1.2. I haven't personally seen other values yet. + */ +typedef struct { + le64 reserved; /* Not used (yet?). */ + u8 major_ver; /* Major version of the ntfs format. */ + u8 minor_ver; /* Minor version of the ntfs format. */ + VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ +} __attribute__ ((__packed__)) VOLUME_INFORMATION; + +/* + * Attribute: Data attribute (0x80). + * + * NOTE: Can be resident or non-resident. + * + * Data contents of a file (i.e. the unnamed stream) or of a named stream. + */ +typedef struct { + u8 data[0]; /* The file's data contents. */ +} __attribute__ ((__packed__)) DATA_ATTR; + +/* + * Index header flags (8-bit). + */ +enum { + /* + * When index header is in an index root attribute: + */ + SMALL_INDEX = 0, /* The index is small enough to fit inside the index + root attribute and there is no index allocation + attribute present. */ + LARGE_INDEX = 1, /* The index is too large to fit in the index root + attribute and/or an index allocation attribute is + present. */ + /* + * When index header is in an index block, i.e. is part of index + * allocation attribute: + */ + LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes + branching off it. */ + INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf + node. */ + NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ +} __attribute__ ((__packed__)); + +typedef u8 INDEX_HEADER_FLAGS; + +/* + * This is the header for indexes, describing the INDEX_ENTRY records, which + * follow the INDEX_HEADER. Together the index header and the index entries + * make up a complete index. + * + * IMPORTANT NOTE: The offset, length and size structure members are counted + * relative to the start of the index header structure and not relative to the + * start of the index root or index allocation structures themselves. + */ +typedef struct { + le32 entries_offset; /* Byte offset to first INDEX_ENTRY + aligned to 8-byte boundary. */ + le32 index_length; /* Data size of the index in bytes, + i.e. bytes used from allocated + size, aligned to 8-byte boundary. */ + le32 allocated_size; /* Byte size of this index (block), + multiple of 8 bytes. */ + /* NOTE: For the index root attribute, the above two numbers are always + equal, as the attribute is resident and it is resized as needed. In + the case of the index allocation attribute the attribute is not + resident and hence the allocated_size is a fixed value and must + equal the index_block_size specified by the INDEX_ROOT attribute + corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK + belongs to. */ + INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ +} __attribute__ ((__packed__)) INDEX_HEADER; + +/* + * Attribute: Index root (0x90). + * + * NOTE: Always resident. + * + * This is followed by a sequence of index entries (INDEX_ENTRY structures) + * as described by the index header. + * + * When a directory is small enough to fit inside the index root then this + * is the only attribute describing the directory. When the directory is too + * large to fit in the index root, on the other hand, two additional attributes + * are present: an index allocation attribute, containing sub-nodes of the B+ + * directory tree (see below), and a bitmap attribute, describing which virtual + * cluster numbers (vcns) in the index allocation attribute are in use by an + * index block. + * + * NOTE: The root directory (FILE_root) contains an entry for itself. Other + * directories do not contain entries for themselves, though. + */ +typedef struct { + ATTR_TYPE type; /* Type of the indexed attribute. Is + $FILE_NAME for directories, zero + for view indexes. No other values + allowed. */ + COLLATION_RULE collation_rule; /* Collation rule used to sort the + index entries. If type is $FILE_NAME, + this must be COLLATION_FILE_NAME. */ + le32 index_block_size; /* Size of each index block in bytes (in + the index allocation attribute). */ + u8 clusters_per_index_block; /* Cluster size of each index block (in + the index allocation attribute), when + an index block is >= than a cluster, + otherwise this will be the log of + the size (like how the encoding of + the mft record size and the index + record size found in the boot sector + work). Has to be a power of 2. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ + INDEX_HEADER index; /* Index header describing the + following index entries. */ +} __attribute__ ((__packed__)) INDEX_ROOT; + +/* + * Attribute: Index allocation (0xa0). + * + * NOTE: Always non-resident (doesn't make sense to be resident anyway!). + * + * This is an array of index blocks. Each index block starts with an + * INDEX_BLOCK structure containing an index header, followed by a sequence of + * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. + */ +typedef struct { +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ + le16 usa_ofs; /* See NTFS_RECORD definition. */ + le16 usa_count; /* See NTFS_RECORD definition. */ + +/* 8*/ sle64 lsn; /* $LogFile sequence number of the last + modification of this index block. */ +/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. + If the cluster_size on the volume is <= the + index_block_size of the directory, + index_block_vcn counts in units of clusters, + and in units of sectors otherwise. */ +/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ +/* sizeof()= 40 (0x28) bytes */ +/* + * When creating the index block, we place the update sequence array at this + * offset, i.e. before we start with the index entries. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) INDEX_BLOCK; + +typedef INDEX_BLOCK INDEX_ALLOCATION; + +/* + * The system file FILE_Extend/$Reparse contains an index named $R listing + * all reparse points on the volume. The index entry keys are as defined + * below. Note, that there is no index data associated with the index entries. + * + * The index entries are sorted by the index key file_id. The collation rule is + * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the + * primary key / is not a key at all. (AIA) + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + leMFT_REF file_id; /* Mft record of the file containing the + reparse point attribute. */ +} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; + +/* + * Quota flags (32-bit). + * + * The user quota flags. Names explain meaning. + */ +enum { + QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), + QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), + QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), + + QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), + /* This is a bit mask for the user quota flags. */ + + /* + * These flags are only present in the quota defaults index entry, i.e. + * in the entry where owner_id = QUOTA_DEFAULTS_ID. + */ + QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), + QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), + QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), + QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), + + QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), + QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), + QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), + QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), +}; + +typedef le32 QUOTA_FLAGS; + +/* + * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas + * are on a per volume and per user basis. + * + * The $Q index contains one entry for each existing user_id on the volume. The + * index key is the user_id of the user/group owning this quota control entry, + * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the + * owner_id, is found in the standard information attribute. The collation rule + * for $Q is COLLATION_NTOFS_ULONG. + * + * The $O index contains one entry for each user/group who has been assigned + * a quota on that volume. The index key holds the SID of the user_id the + * entry belongs to, i.e. the owner_id. The collation rule for $O is + * COLLATION_NTOFS_SID. + * + * The $O index entry data is the user_id of the user corresponding to the SID. + * This user_id is used as an index into $Q to find the quota control entry + * associated with the SID. + * + * The $Q index entry data is the quota control entry and is defined below. + */ +typedef struct { + le32 version; /* Currently equals 2. */ + QUOTA_FLAGS flags; /* Flags describing this quota entry. */ + le64 bytes_used; /* How many bytes of the quota are in use. */ + sle64 change_time; /* Last time this quota entry was changed. */ + sle64 threshold; /* Soft quota (-1 if not limited). */ + sle64 limit; /* Hard quota (-1 if not limited). */ + sle64 exceeded_time; /* How long the soft quota has been exceeded. */ + SID sid; /* The SID of the user/object associated with + this quota entry. Equals zero for the quota + defaults entry (and in fact on a WinXP + volume, it is not present at all). */ +} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; + +/* + * Predefined owner_id values (32-bit). + */ +enum { + QUOTA_INVALID_ID = cpu_to_le32(0x00000000), + QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), + QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), +}; + +/* + * Current constants for quota control entries. + */ +typedef enum { + /* Current version. */ + QUOTA_VERSION = 2, +} QUOTA_CONTROL_ENTRY_CONSTANTS; + +/* + * Index entry flags (16-bit). + */ +enum { + INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a + sub-node, i.e. a reference to an index block in form of + a virtual cluster number (see below). */ + INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last + entry in an index block. The index entry does not + represent a file but it can point to a sub-node. */ + + INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force + enum bit width to 16-bit. */ +} __attribute__ ((__packed__)); + +typedef le16 INDEX_ENTRY_FLAGS; + +/* + * This the index entry header (see below). + */ +typedef struct { +/* 0*/ union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. */ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; +/* 8*/ le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ +/* 10*/ le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ +/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ +/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ +/* sizeof() = 16 bytes */ +} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; + +/* + * This is an index entry. A sequence of such entries follows each INDEX_HEADER + * structure. Together they make up a complete index. The index follows either + * an index root attribute or an index allocation attribute. + * + * NOTE: Before NTFS 3.0 only filename attributes were indexed. + */ +typedef struct { +/*Ofs*/ +/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ + union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. */ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; + le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ + le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ + INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ + le16 reserved; /* Reserved/align to 8-byte boundary. */ + +/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present + if INDEX_ENTRY_END bit in flags is not set. NOTE: On + NTFS versions before 3.0 the only valid key is the + FILE_NAME_ATTR. On NTFS 3.0+ the following + additional index keys are defined: */ + FILE_NAME_ATTR file_name;/* $I30 index in directories. */ + SII_INDEX_KEY sii; /* $SII index in $Secure. */ + SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ + GUID object_id; /* $O index in FILE_Extend/$ObjId: The + object_id of the mft record found in + the data part of the index. */ + REPARSE_INDEX_KEY reparse; /* $R index in + FILE_Extend/$Reparse. */ + SID sid; /* $O index in FILE_Extend/$Quota: + SID of the owner of the user_id. */ + le32 owner_id; /* $Q index in FILE_Extend/$Quota: + user_id of the owner of the quota + control entry in the data part of + the index. */ + } __attribute__ ((__packed__)) key; + /* The (optional) index data is inserted here when creating. */ + // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last + // eight bytes of this index entry contain the virtual + // cluster number of the index block that holds the + // entries immediately preceding the current entry (the + // vcn references the corresponding cluster in the data + // of the non-resident index allocation attribute). If + // the key_length is zero, then the vcn immediately + // follows the INDEX_ENTRY_HEADER. Regardless of + // key_length, the address of the 8-byte boundary + // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by + // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), + // where sizeof(VCN) can be hardcoded as 8 if wanted. */ +} __attribute__ ((__packed__)) INDEX_ENTRY; + +/* + * Attribute: Bitmap (0xb0). + * + * Contains an array of bits (aka a bitfield). + * + * When used in conjunction with the index allocation attribute, each bit + * corresponds to one index block within the index allocation attribute. Thus + * the number of bits in the bitmap * index block size / cluster size is the + * number of clusters in the index allocation attribute. + */ +typedef struct { + u8 bitmap[0]; /* Array of bits. */ +} __attribute__ ((__packed__)) BITMAP_ATTR; + +/* + * The reparse point tag defines the type of the reparse point. It also + * includes several flags, which further describe the reparse point. + * + * The reparse point tag is an unsigned 32-bit value divided in three parts: + * + * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of + * the reparse point. + * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. + * 3. The most significant three bits are flags describing the reparse point. + * They are defined as follows: + * bit 29: Name surrogate bit. If set, the filename is an alias for + * another object in the system. + * bit 30: High-latency bit. If set, accessing the first byte of data will + * be slow. (E.g. the data is stored on a tape drive.) + * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User + * defined tags have to use zero here. + * + * These are the predefined reparse point tags: + */ +enum { + IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), + IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), + + IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), + IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), + + IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), + IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), + IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), + + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), + + IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), + + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), + + IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), +}; + +/* + * Attribute: Reparse point (0xc0). + * + * NOTE: Can be resident or non-resident. + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + le16 reparse_data_length; /* Byte size of reparse data. */ + le16 reserved; /* Align to 8-byte boundary. */ + u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ +} __attribute__ ((__packed__)) REPARSE_POINT; + +/* + * Attribute: Extended attribute (EA) information (0xd0). + * + * NOTE: Always resident. (Is this true???) + */ +typedef struct { + le16 ea_length; /* Byte size of the packed extended + attributes. */ + le16 need_ea_count; /* The number of extended attributes which have + the NEED_EA bit set. */ + le32 ea_query_length; /* Byte size of the buffer required to query + the extended attributes when calling + ZwQueryEaFile() in Windows NT/2k. I.e. the + byte size of the unpacked extended + attributes. */ +} __attribute__ ((__packed__)) EA_INFORMATION; + +/* + * Extended attribute flags (8-bit). + */ +enum { + NEED_EA = 0x80 /* If set the file to which the EA belongs + cannot be interpreted without understanding + the associates extended attributes. */ +} __attribute__ ((__packed__)); + +typedef u8 EA_FLAGS; + +/* + * Attribute: Extended attribute (EA) (0xe0). + * + * NOTE: Can be resident or non-resident. + * + * Like the attribute list and the index buffer list, the EA attribute value is + * a sequence of EA_ATTR variable length records. + */ +typedef struct { + le32 next_entry_offset; /* Offset to the next EA_ATTR. */ + EA_FLAGS flags; /* Flags describing the EA. */ + u8 ea_name_length; /* Length of the name of the EA in bytes + excluding the '\0' byte terminator. */ + le16 ea_value_length; /* Byte size of the EA's value. */ + u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not + Unicode and it is zero terminated. */ + u8 ea_value[0]; /* The value of the EA. Immediately follows + the name. */ +} __attribute__ ((__packed__)) EA_ATTR; + +/* + * Attribute: Property set (0xf0). + * + * Intended to support Native Structure Storage (NSS) - a feature removed from + * NTFS 3.0 during beta testing. + */ +typedef struct { + /* Irrelevant as feature unused. */ +} __attribute__ ((__packed__)) PROPERTY_SET; + +/* + * Attribute: Logged utility stream (0x100). + * + * NOTE: Can be resident or non-resident. + * + * Operations on this attribute are logged to the journal ($LogFile) like + * normal metadata changes. + * + * Used by the Encrypting File System (EFS). All encrypted files have this + * attribute with the name $EFS. + */ +typedef struct { + /* Can be anything the creator chooses. */ + /* EFS uses it as follows: */ + // FIXME: Type this info, verifying it along the way. (AIA) +} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; + +#endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/kernel/fs/ntfs/lcnalloc.c b/kernel/fs/ntfs/lcnalloc.c new file mode 100644 index 000000000..1711b710b --- /dev/null +++ b/kernel/fs/ntfs/lcnalloc.c @@ -0,0 +1,1014 @@ +/* + * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include + +#include "lcnalloc.h" +#include "debug.h" +#include "bitmap.h" +#include "inode.h" +#include "volume.h" +#include "attrib.h" +#include "malloc.h" +#include "aops.h" +#include "ntfs.h" + +/** + * ntfs_cluster_free_from_rl_nolock - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - The volume lcn bitmap must be locked for writing on entry and is + * left locked on return. + */ +int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl) +{ + struct inode *lcnbmp_vi = vol->lcnbmp_ino; + int ret = 0; + + ntfs_debug("Entering."); + if (!rl) + return 0; + for (; rl->length; rl++) { + int err; + + if (rl->lcn < 0) + continue; + err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); + if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) + ret = err; + } + ntfs_debug("Done."); + return ret; +} + +/** + * ntfs_cluster_alloc - allocate clusters on an ntfs volume + * @vol: mounted ntfs volume on which to allocate the clusters + * @start_vcn: vcn to use for the first allocated cluster + * @count: number of clusters to allocate + * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) + * @zone: zone from which to allocate the clusters + * @is_extension: if 'true', this is an attribute extension + * + * Allocate @count clusters preferably starting at cluster @start_lcn or at the + * current allocator position if @start_lcn is -1, on the mounted ntfs volume + * @vol. @zone is either DATA_ZONE for allocation of normal clusters or + * MFT_ZONE for allocation of clusters for the master file table, i.e. the + * $MFT/$DATA attribute. + * + * @start_vcn specifies the vcn of the first allocated cluster. This makes + * merging the resulting runlist with the old runlist easier. + * + * If @is_extension is 'true', the caller is allocating clusters to extend an + * attribute and if it is 'false', the caller is allocating clusters to fill a + * hole in an attribute. Practically the difference is that if @is_extension + * is 'true' the returned runlist will be terminated with LCN_ENOENT and if + * @is_extension is 'false' the runlist will be terminated with + * LCN_RL_NOT_MAPPED. + * + * You need to check the return value with IS_ERR(). If this is false, the + * function was successful and the return value is a runlist describing the + * allocated cluster(s). If IS_ERR() is true, the function failed and + * PTR_ERR() gives you the error code. + * + * Notes on the allocation algorithm + * ================================= + * + * There are two data zones. First is the area between the end of the mft zone + * and the end of the volume, and second is the area between the start of the + * volume and the start of the mft zone. On unmodified/standard NTFS 1.x + * volumes, the second data zone does not exist due to the mft zone being + * expanded to cover the start of the volume in order to reserve space for the + * mft bitmap attribute. + * + * This is not the prettiest function but the complexity stems from the need of + * implementing the mft vs data zoned approach and from the fact that we have + * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we + * need to cope with crossing over boundaries of two buffers. Further, the + * fact that the allocator allows for caller supplied hints as to the location + * of where allocation should begin and the fact that the allocator keeps track + * of where in the data zones the next natural allocation should occur, + * contribute to the complexity of the function. But it should all be + * worthwhile, because this allocator should: 1) be a full implementation of + * the MFT zone approach used by Windows NT, 2) cause reduction in + * fragmentation, and 3) be speedy in allocations (the code is not optimized + * for speed, but the algorithm is, so further speed improvements are probably + * possible). + * + * FIXME: We should be monitoring cluster allocation and increment the MFT zone + * size dynamically but this is something for the future. We will just cause + * heavier fragmentation by not doing it and I am not even sure Windows would + * grow the MFT zone dynamically, so it might even be correct not to do this. + * The overhead in doing dynamic MFT zone expansion would be very large and + * unlikely worth the effort. (AIA) + * + * TODO: I have added in double the required zone position pointer wrap around + * logic which can be optimized to having only one of the two logic sets. + * However, having the double logic will work fine, but if we have only one of + * the sets and we get it wrong somewhere, then we get into trouble, so + * removing the duplicate logic requires _very_ careful consideration of _all_ + * possible code paths. So at least for now, I am leaving the double logic - + * better safe than sorry... (AIA) + * + * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + */ +runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, + const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension) +{ + LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; + LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; + s64 clusters; + loff_t i_size; + struct inode *lcnbmp_vi; + runlist_element *rl = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *buf, *byte; + int err = 0, rlpos, rlsize, buf_size; + u8 pass, done_zones, search_zone, need_writeback = 0, bit; + + ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " + "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, + (unsigned long long)count, + (unsigned long long)start_lcn, + zone == MFT_ZONE ? "MFT" : "DATA"); + BUG_ON(!vol); + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < 0); + BUG_ON(start_lcn < -1); + BUG_ON(zone < FIRST_ZONE); + BUG_ON(zone > LAST_ZONE); + + /* Return NULL if @count is zero. */ + if (!count) + return NULL; + /* Take the lcnbmp lock for writing. */ + down_write(&vol->lcnbmp_lock); + /* + * If no specific @start_lcn was requested, use the current data zone + * position, otherwise use the requested @start_lcn but make sure it + * lies outside the mft zone. Also set done_zones to 0 (no zones done) + * and pass depending on whether we are starting inside a zone (1) or + * at the beginning of a zone (2). If requesting from the MFT_ZONE, + * we either start at the current position within the mft zone or at + * the specified position. If the latter is out of bounds then we start + * at the beginning of the MFT_ZONE. + */ + done_zones = 0; + pass = 1; + /* + * zone_start and zone_end are the current search range. search_zone + * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of + * volume) and 4 for data zone 2 (start of volume till start of mft + * zone). + */ + zone_start = start_lcn; + if (zone_start < 0) { + if (zone == DATA_ZONE) + zone_start = vol->data1_zone_pos; + else + zone_start = vol->mft_zone_pos; + if (!zone_start) { + /* + * Zone starts at beginning of volume which means a + * single pass is sufficient. + */ + pass = 2; + } + } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && + zone_start < vol->mft_zone_end) { + zone_start = vol->mft_zone_end; + /* + * Starting at beginning of data1_zone which means a single + * pass in this zone is sufficient. + */ + pass = 2; + } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || + zone_start >= vol->mft_zone_end)) { + zone_start = vol->mft_lcn; + if (!vol->mft_zone_end) + zone_start = 0; + /* + * Starting at beginning of volume which means a single pass + * is sufficient. + */ + pass = 2; + } + if (zone == MFT_ZONE) { + zone_end = vol->mft_zone_end; + search_zone = 1; + } else /* if (zone == DATA_ZONE) */ { + /* Skip searching the mft zone. */ + done_zones |= 1; + if (zone_start >= vol->mft_zone_end) { + zone_end = vol->nr_clusters; + search_zone = 2; + } else { + zone_end = vol->mft_zone_start; + search_zone = 4; + } + } + /* + * bmp_pos is the current bit position inside the bitmap. We use + * bmp_initial_pos to determine whether or not to do a zone switch. + */ + bmp_pos = bmp_initial_pos = zone_start; + + /* Loop until all clusters are allocated, i.e. clusters == 0. */ + clusters = count; + rlpos = rlsize = 0; + mapping = lcnbmp_vi->i_mapping; + i_size = i_size_read(lcnbmp_vi); + while (1) { + ntfs_debug("Start of outer while loop: done_zones 0x%x, " + "search_zone %i, pass %i, zone_start 0x%llx, " + "zone_end 0x%llx, bmp_initial_pos 0x%llx, " + "bmp_pos 0x%llx, rlpos %i, rlsize %i.", + done_zones, search_zone, pass, + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_initial_pos, + (unsigned long long)bmp_pos, rlpos, rlsize); + /* Loop until we run out of free clusters. */ + last_read_pos = bmp_pos >> 3; + ntfs_debug("last_read_pos 0x%llx.", + (unsigned long long)last_read_pos); + if (last_read_pos > i_size) { + ntfs_debug("End of attribute reached. " + "Skipping to zone_pass_done."); + goto zone_pass_done; + } + if (likely(page)) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + page = ntfs_map_page(mapping, last_read_pos >> + PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + err = PTR_ERR(page); + ntfs_error(vol->sb, "Failed to map page."); + goto out; + } + buf_size = last_read_pos & ~PAGE_CACHE_MASK; + buf = page_address(page) + buf_size; + buf_size = PAGE_CACHE_SIZE - buf_size; + if (unlikely(last_read_pos + buf_size > i_size)) + buf_size = i_size - last_read_pos; + buf_size <<= 3; + lcn = bmp_pos & 7; + bmp_pos &= ~(LCN)7; + ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " + "bmp_pos 0x%llx, need_writeback %i.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + while (lcn < buf_size && lcn + bmp_pos < zone_end) { + byte = buf + (lcn >> 3); + ntfs_debug("In inner while loop: buf_size %i, " + "lcn 0x%llx, bmp_pos 0x%llx, " + "need_writeback %i, byte ofs 0x%x, " + "*byte 0x%x.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + need_writeback, + (unsigned int)(lcn >> 3), + (unsigned int)*byte); + /* Skip full bytes. */ + if (*byte == 0xff) { + lcn = (lcn + 8) & ~(LCN)7; + ntfs_debug("Continuing while loop 1."); + continue; + } + bit = 1 << (lcn & 7); + ntfs_debug("bit 0x%x.", bit); + /* If the bit is already set, go onto the next one. */ + if (*byte & bit) { + lcn++; + ntfs_debug("Continuing while loop 2."); + continue; + } + /* + * Allocate more memory if needed, including space for + * the terminator element. + * ntfs_malloc_nofs() operates on whole pages only. + */ + if ((rlpos + 2) * sizeof(*rl) > rlsize) { + runlist_element *rl2; + + ntfs_debug("Reallocating memory."); + if (!rl) + ntfs_debug("First free bit is at LCN " + "0x%llx.", + (unsigned long long) + (lcn + bmp_pos)); + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + err = -ENOMEM; + ntfs_error(vol->sb, "Failed to " + "allocate memory."); + goto out; + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + ntfs_debug("Reallocated memory, rlsize 0x%x.", + rlsize); + } + /* Allocate the bitmap bit. */ + *byte |= bit; + /* We need to write this bitmap page to disk. */ + need_writeback = 1; + ntfs_debug("*byte 0x%x, need_writeback is set.", + (unsigned int)*byte); + /* + * Coalesce with previous run if adjacent LCNs. + * Otherwise, append a new run. + */ + ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " + "prev_lcn 0x%llx, lcn 0x%llx, " + "bmp_pos 0x%llx, prev_run_len 0x%llx, " + "rlpos %i.", + (unsigned long long)(lcn + bmp_pos), + 1ULL, (unsigned long long)prev_lcn, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + (unsigned long long)prev_run_len, + rlpos); + if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { + ntfs_debug("Coalescing to run (lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos - 1].length = ++prev_run_len; + ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " + "prev_run_len 0x%llx.", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length, + (unsigned long long) + prev_run_len); + } else { + if (likely(rlpos)) { + ntfs_debug("Adding new run, (previous " + "run lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos].vcn = rl[rlpos - 1].vcn + + prev_run_len; + } else { + ntfs_debug("Adding new run, is first " + "run."); + rl[rlpos].vcn = start_vcn; + } + rl[rlpos].lcn = prev_lcn = lcn + bmp_pos; + rl[rlpos].length = prev_run_len = 1; + rlpos++; + } + /* Done? */ + if (!--clusters) { + LCN tc; + /* + * Update the current zone position. Positions + * of already scanned zones have been updated + * during the respective zone switches. + */ + tc = lcn + bmp_pos + 1; + ntfs_debug("Done. Updating current zone " + "position, tc 0x%llx, " + "search_zone %i.", + (unsigned long long)tc, + search_zone); + switch (search_zone) { + case 1: + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + break; + case 2: + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + break; + case 4: + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + break; + default: + BUG(); + } + ntfs_debug("Finished. Going to out."); + goto out; + } + lcn++; + } + bmp_pos += buf_size; + ntfs_debug("After inner while loop: buf_size 0x%x, lcn " + "0x%llx, bmp_pos 0x%llx, need_writeback %i.", + buf_size, (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + if (bmp_pos < zone_end) { + ntfs_debug("Continuing outer while loop, " + "bmp_pos 0x%llx, zone_end 0x%llx.", + (unsigned long long)bmp_pos, + (unsigned long long)zone_end); + continue; + } +zone_pass_done: /* Finished with the current zone pass. */ + ntfs_debug("At zone_pass_done, pass %i.", pass); + if (pass == 1) { + /* + * Now do pass 2, scanning the first part of the zone + * we omitted in pass 1. + */ + pass = 2; + zone_end = zone_start; + switch (search_zone) { + case 1: /* mft_zone */ + zone_start = vol->mft_zone_start; + break; + case 2: /* data1_zone */ + zone_start = vol->mft_zone_end; + break; + case 4: /* data2_zone */ + zone_start = 0; + break; + default: + BUG(); + } + /* Sanity check. */ + if (zone_end < zone_start) + zone_end = zone_start; + bmp_pos = zone_start; + ntfs_debug("Continuing outer while loop, pass 2, " + "zone_start 0x%llx, zone_end 0x%llx, " + "bmp_pos 0x%llx.", + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_pos); + continue; + } /* pass == 2 */ +done_zones_check: + ntfs_debug("At done_zones_check, search_zone %i, done_zones " + "before 0x%x, done_zones after 0x%x.", + search_zone, done_zones, + done_zones | search_zone); + done_zones |= search_zone; + if (done_zones < 7) { + ntfs_debug("Switching zone."); + /* Now switch to the next zone we haven't done yet. */ + pass = 1; + switch (search_zone) { + case 1: + ntfs_debug("Switching from mft zone to data1 " + "zone."); + /* Update mft zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + } + /* Switch from mft zone to data1 zone. */ +switch_to_data1_zone: search_zone = 2; + zone_start = bmp_initial_pos = + vol->data1_zone_pos; + zone_end = vol->nr_clusters; + if (zone_start == vol->mft_zone_end) + pass = 2; + if (zone_start >= zone_end) { + vol->data1_zone_pos = zone_start = + vol->mft_zone_end; + pass = 2; + } + break; + case 2: + ntfs_debug("Switching from data1 zone to " + "data2 zone."); + /* Update data1 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + } + /* Switch from data1 zone to data2 zone. */ + search_zone = 4; + zone_start = bmp_initial_pos = + vol->data2_zone_pos; + zone_end = vol->mft_zone_start; + if (!zone_start) + pass = 2; + if (zone_start >= zone_end) { + vol->data2_zone_pos = zone_start = + bmp_initial_pos = 0; + pass = 2; + } + break; + case 4: + ntfs_debug("Switching from data2 zone to " + "data1 zone."); + /* Update data2 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + } + /* Switch from data2 zone to data1 zone. */ + goto switch_to_data1_zone; + default: + BUG(); + } + ntfs_debug("After zone switch, search_zone %i, " + "pass %i, bmp_initial_pos 0x%llx, " + "zone_start 0x%llx, zone_end 0x%llx.", + search_zone, pass, + (unsigned long long)bmp_initial_pos, + (unsigned long long)zone_start, + (unsigned long long)zone_end); + bmp_pos = zone_start; + if (zone_start == zone_end) { + ntfs_debug("Empty zone, going to " + "done_zones_check."); + /* Empty zone. Don't bother searching it. */ + goto done_zones_check; + } + ntfs_debug("Continuing outer while loop."); + continue; + } /* done_zones == 7 */ + ntfs_debug("All zones are finished."); + /* + * All zones are finished! If DATA_ZONE, shrink mft zone. If + * MFT_ZONE, we have really run out of space. + */ + mft_zone_size = vol->mft_zone_end - vol->mft_zone_start; + ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end " + "0x%llx, mft_zone_size 0x%llx.", + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)mft_zone_size); + if (zone == MFT_ZONE || mft_zone_size <= 0) { + ntfs_debug("No free clusters left, going to out."); + /* Really no more space left on device. */ + err = -ENOSPC; + goto out; + } /* zone == DATA_ZONE && mft_zone_size > 0 */ + ntfs_debug("Shrinking mft zone."); + zone_end = vol->mft_zone_end; + mft_zone_size >>= 1; + if (mft_zone_size > 0) + vol->mft_zone_end = vol->mft_zone_start + mft_zone_size; + else /* mft zone and data2 zone no longer exist. */ + vol->data2_zone_pos = vol->mft_zone_start = + vol->mft_zone_end = 0; + if (vol->mft_zone_pos >= vol->mft_zone_end) { + vol->mft_zone_pos = vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } + bmp_pos = zone_start = bmp_initial_pos = + vol->data1_zone_pos = vol->mft_zone_end; + search_zone = 2; + pass = 2; + done_zones &= ~2; + ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, " + "vol->mft_zone_start 0x%llx, " + "vol->mft_zone_end 0x%llx, " + "vol->mft_zone_pos 0x%llx, search_zone 2, " + "pass 2, dones_zones 0x%x, zone_start 0x%llx, " + "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, " + "continuing outer while loop.", + (unsigned long long)mft_zone_size, + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)vol->mft_zone_pos, + done_zones, (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)vol->data1_zone_pos); + } + ntfs_debug("After outer while loop."); +out: + ntfs_debug("At out."); + /* Add runlist terminator element. */ + if (likely(rl)) { + rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; + rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED; + rl[rlpos].length = 0; + } + if (likely(page && !IS_ERR(page))) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + if (likely(!err)) { + up_write(&vol->lcnbmp_lock); + ntfs_debug("Done."); + return rl; + } + ntfs_error(vol->sb, "Failed to allocate clusters, aborting " + "(error %i).", err); + if (rl) { + int err2; + + if (err == -ENOSPC) + ntfs_debug("Not enough space to complete allocation, " + "err -ENOSPC, first free lcn 0x%llx, " + "could allocate up to 0x%llx " + "clusters.", + (unsigned long long)rl[0].lcn, + (unsigned long long)(count - clusters)); + /* Deallocate all allocated clusters. */ + ntfs_debug("Attempting rollback..."); + err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); + if (err2) { + ntfs_error(vol->sb, "Failed to rollback (error %i). " + "Leaving inconsistent metadata! " + "Unmount and run chkdsk.", err2); + NVolSetErrors(vol); + } + /* Free the runlist. */ + ntfs_free(rl); + } else if (err == -ENOSPC) + ntfs_debug("No space left at all, err = -ENOSPC, first free " + "lcn = 0x%llx.", + (long long)vol->data1_zone_pos); + up_write(&vol->lcnbmp_lock); + return ERR_PTR(err); +} + +/** + * __ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * @is_rollback: true if this is a rollback operation + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the vfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when __ntfs_cluster_free() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will + * perform the necessary mapping and unmapping. + * + * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_cluster_free() instead. + * + * Note, __ntfs_cluster_free() does not modify the runlist, so you have to + * remove from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, + ntfs_attr_search_ctx *ctx, const bool is_rollback) +{ + s64 delta, to_free, total_freed, real_freed; + ntfs_volume *vol; + struct inode *lcnbmp_vi; + runlist_element *rl; + int err; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count " + "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn, + (unsigned long long)count, + is_rollback ? " (rollback)" : ""); + vol = ni->vol; + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < -1); + /* + * Lock the lcn bitmap for writing but only if not rolling back. We + * must hold the lock all the way including through rollback otherwise + * rollback is not possible because once we have cleared a bit and + * dropped the lock, anyone could have set the bit again, thus + * allocating the cluster for another use. + */ + if (likely(!is_rollback)) + down_write(&vol->lcnbmp_lock); + + total_freed = real_freed = 0; + + rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); + if (IS_ERR(rl)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to find first runlist " + "element (error %li), aborting.", + PTR_ERR(rl)); + err = PTR_ERR(rl); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "First runlist element has " + "invalid lcn, aborting."); + err = -EIO; + goto err_out; + } + /* Find the starting cluster inside the run that needs freeing. */ + delta = start_vcn - rl->vcn; + + /* The number of clusters in this run that need freeing. */ + to_free = rl->length - delta; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in this run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear first run " + "(error %i), aborting.", err); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed = to_free; + }; + /* Go to the next run and adjust the number of clusters left to free. */ + ++rl; + if (count >= 0) + count -= to_free; + + /* Keep track of the total "freed" clusters, including sparse ones. */ + total_freed = to_free; + /* + * Loop over the remaining runs, using @count as a capping value, and + * free them. + */ + for (; rl->length && count != 0; ++rl) { + if (unlikely(rl->lcn < LCN_HOLE)) { + VCN vcn; + + /* Attempt to map runlist. */ + vcn = rl->vcn; + rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (!is_rollback) + ntfs_error(vol->sb, "Failed to map " + "runlist fragment or " + "failed to find " + "subsequent runlist " + "element."); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "Runlist element " + "has invalid lcn " + "(0x%llx).", + (unsigned long long) + rl->lcn); + err = -EIO; + goto err_out; + } + } + /* The number of clusters in this run that need freeing. */ + to_free = rl->length; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in the run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear " + "subsequent run."); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed += to_free; + } + /* Adjust the number of clusters left to free. */ + if (count >= 0) + count -= to_free; + + /* Update the total done clusters. */ + total_freed += to_free; + } + if (likely(!is_rollback)) + up_write(&vol->lcnbmp_lock); + + BUG_ON(count > 0); + + /* We are done. Return the number of actually freed clusters. */ + ntfs_debug("Done."); + return real_freed; +err_out: + if (is_rollback) + return err; + /* If no real clusters were freed, no need to rollback. */ + if (!real_freed) { + up_write(&vol->lcnbmp_lock); + return err; + } + /* + * Attempt to rollback and if that succeeds just return the error code. + * If rollback fails, set the volume errors flag, emit an error + * message, and return the error code. + */ + delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); + if (delta < 0) { + ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " + "inconsistent metadata! Unmount and run " + "chkdsk.", (int)delta); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + ntfs_error(vol->sb, "Aborting (error %i).", err); + return err; +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/lcnalloc.h b/kernel/fs/ntfs/lcnalloc.h new file mode 100644 index 000000000..2adb04316 --- /dev/null +++ b/kernel/fs/ntfs/lcnalloc.h @@ -0,0 +1,145 @@ +/* + * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LCNALLOC_H +#define _LINUX_NTFS_LCNALLOC_H + +#ifdef NTFS_RW + +#include + +#include "attrib.h" +#include "types.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +typedef enum { + FIRST_ZONE = 0, /* For sanity checking. */ + MFT_ZONE = 0, /* Allocate from $MFT zone. */ + DATA_ZONE = 1, /* Allocate from $DATA zone. */ + LAST_ZONE = 1, /* For sanity checking. */ +} NTFS_CLUSTER_ALLOCATION_ZONES; + +extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, + const VCN start_vcn, const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension); + +extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); + +/** + * ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the ntfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_cluster_free() encounters unmapped runlist + * fragments and allows their mapping. If you do not have the mft record + * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform + * the necessary mapping and unmapping. + * + * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove + * from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx) +{ + return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); +} + +extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl); + +/** + * ntfs_cluster_free_from_rl - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - The caller must have locked the runlist @rl for reading or + * writing. + */ +static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, + const runlist_element *rl) +{ + int ret; + + down_write(&vol->lcnbmp_lock); + ret = ntfs_cluster_free_from_rl_nolock(vol, rl); + up_write(&vol->lcnbmp_lock); + return ret; +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/kernel/fs/ntfs/logfile.c b/kernel/fs/ntfs/logfile.c new file mode 100644 index 000000000..c71de292c --- /dev/null +++ b/kernel/fs/ntfs/logfile.c @@ -0,0 +1,863 @@ +/* + * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2002-2007 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "aops.h" +#include "debug.h" +#include "logfile.h" +#include "malloc.h" +#include "volume.h" +#include "ntfs.h" + +/** + * ntfs_check_restart_page_header - check the page header for consistency + * @vi: $LogFile inode to which the restart page header belongs + * @rp: restart page header to check + * @pos: position in @vi at which the restart page header resides + * + * Check the restart page header @rp for consistency and return 'true' if it is + * consistent and 'false' otherwise. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + */ +static bool ntfs_check_restart_page_header(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos) +{ + u32 logfile_system_page_size, logfile_log_page_size; + u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; + bool have_usa = true; + + ntfs_debug("Entering."); + /* + * If the system or log page sizes are smaller than the ntfs block size + * or either is not a power of 2 we cannot handle this log file. + */ + logfile_system_page_size = le32_to_cpu(rp->system_page_size); + logfile_log_page_size = le32_to_cpu(rp->log_page_size); + if (logfile_system_page_size < NTFS_BLOCK_SIZE || + logfile_log_page_size < NTFS_BLOCK_SIZE || + logfile_system_page_size & + (logfile_system_page_size - 1) || + !is_power_of_2(logfile_log_page_size)) { + ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); + return false; + } + /* + * We must be either at !pos (1st restart page) or at pos = system page + * size (2nd restart page). + */ + if (pos && pos != logfile_system_page_size) { + ntfs_error(vi->i_sb, "Found restart area in incorrect " + "position in $LogFile."); + return false; + } + /* We only know how to handle version 1.1. */ + if (sle16_to_cpu(rp->major_ver) != 1 || + sle16_to_cpu(rp->minor_ver) != 1) { + ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " + "supported. (This driver supports version " + "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), + (int)sle16_to_cpu(rp->minor_ver)); + return false; + } + /* + * If chkdsk has been run the restart page may not be protected by an + * update sequence array. + */ + if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) { + have_usa = false; + goto skip_usa_checks; + } + /* Verify the size of the update sequence array. */ + usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); + if (usa_count != le16_to_cpu(rp->usa_count)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array count."); + return false; + } + /* Verify the position of the update sequence array. */ + usa_ofs = le16_to_cpu(rp->usa_ofs); + usa_end = usa_ofs + usa_count * sizeof(u16); + if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || + usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array offset."); + return false; + } +skip_usa_checks: + /* + * Verify the position of the restart area. It must be: + * - aligned to 8-byte boundary, + * - after the update sequence array, and + * - within the system page size. + */ + ra_ofs = le16_to_cpu(rp->restart_area_offset); + if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : + ra_ofs < sizeof(RESTART_PAGE_HEADER)) || + ra_ofs > logfile_system_page_size) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent restart area offset."); + return false; + } + /* + * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn + * set. + */ + if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { + ntfs_error(vi->i_sb, "$LogFile restart page is not modified " + "by chkdsk but a chkdsk LSN is specified."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_restart_area - check the restart area for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose restart area to check + * + * Check the restart area of the restart page @rp for consistency and return + * 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header has already been + * consistency checked. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + */ +static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) +{ + u64 file_size; + RESTART_AREA *ra; + u16 ra_ofs, ra_len, ca_ofs; + u8 fs_bits; + + ntfs_debug("Entering."); + ra_ofs = le16_to_cpu(rp->restart_area_offset); + ra = (RESTART_AREA*)((u8*)rp + ra_ofs); + /* + * Everything before ra->file_size must be before the first word + * protected by an update sequence number. This ensures that it is + * safe to access ra->client_array_offset. + */ + if (ra_ofs + offsetof(RESTART_AREA, file_size) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent file offset."); + return false; + } + /* + * Now that we can access ra->client_array_offset, make sure everything + * up to the log client array is before the first word protected by an + * update sequence number. This ensures we can access all of the + * restart area elements safely. Also, the client array offset must be + * aligned to an 8-byte boundary. + */ + ca_ofs = le16_to_cpu(ra->client_array_offset); + if (((ca_ofs + 7) & ~7) != ca_ofs || + ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent client array offset."); + return false; + } + /* + * The restart area must end within the system page size both when + * calculated manually and as specified by ra->restart_area_length. + * Also, the calculated length must not exceed the specified length. + */ + ra_len = ca_ofs + le16_to_cpu(ra->log_clients) * + sizeof(LOG_CLIENT_RECORD); + if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) || + ra_ofs + le16_to_cpu(ra->restart_area_length) > + le32_to_cpu(rp->system_page_size) || + ra_len > le16_to_cpu(ra->restart_area_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds " + "of the system page size specified by the " + "restart page header and/or the specified " + "restart area length is inconsistent."); + return false; + } + /* + * The ra->client_free_list and ra->client_in_use_list must be either + * LOGFILE_NO_CLIENT or less than ra->log_clients or they are + * overflowing the client array. + */ + if ((ra->client_free_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_free_list) >= + le16_to_cpu(ra->log_clients)) || + (ra->client_in_use_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_in_use_list) >= + le16_to_cpu(ra->log_clients))) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "overflowing client free and/or in use lists."); + return false; + } + /* + * Check ra->seq_number_bits against ra->file_size for consistency. + * We cannot just use ffs() because the file size is not a power of 2. + */ + file_size = (u64)sle64_to_cpu(ra->file_size); + fs_bits = 0; + while (file_size) { + file_size >>= 1; + fs_bits++; + } + if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent sequence number bits."); + return false; + } + /* The log record header length must be a multiple of 8. */ + if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) != + le16_to_cpu(ra->log_record_header_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log record header length."); + return false; + } + /* Dito for the log page data offset. */ + if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) != + le16_to_cpu(ra->log_page_data_offset)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log page data offset."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_log_client_array - check the log client array for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose log client array to check + * + * Check the log client array of the restart page @rp for consistency and + * return 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header and the restart area have + * already been consistency checked. + * + * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this + * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full + * restart page and the page must be multi sector transfer deprotected. + */ +static bool ntfs_check_log_client_array(struct inode *vi, + RESTART_PAGE_HEADER *rp) +{ + RESTART_AREA *ra; + LOG_CLIENT_RECORD *ca, *cr; + u16 nr_clients, idx; + bool in_free_list, idx_is_first; + + ntfs_debug("Entering."); + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + ca = (LOG_CLIENT_RECORD*)((u8*)ra + + le16_to_cpu(ra->client_array_offset)); + /* + * Check the ra->client_free_list first and then check the + * ra->client_in_use_list. Check each of the log client records in + * each of the lists and check that the array does not overflow the + * ra->log_clients value. Also keep track of the number of records + * visited as there cannot be more than ra->log_clients records and + * that way we detect eventual loops in within a list. + */ + nr_clients = le16_to_cpu(ra->log_clients); + idx = le16_to_cpu(ra->client_free_list); + in_free_list = true; +check_list: + for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--, + idx = le16_to_cpu(cr->next_client)) { + if (!nr_clients || idx >= le16_to_cpu(ra->log_clients)) + goto err_out; + /* Set @cr to the current log client record. */ + cr = ca + idx; + /* The first log client record must not have a prev_client. */ + if (idx_is_first) { + if (cr->prev_client != LOGFILE_NO_CLIENT) + goto err_out; + idx_is_first = false; + } + } + /* Switch to and check the in use list if we just did the free list. */ + if (in_free_list) { + in_free_list = false; + idx = le16_to_cpu(ra->client_in_use_list); + goto check_list; + } + ntfs_debug("Done."); + return true; +err_out: + ntfs_error(vi->i_sb, "$LogFile log client array is corrupt."); + return false; +} + +/** + * ntfs_check_and_load_restart_page - check the restart page for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page to check + * @pos: position in @vi at which the restart page resides + * @wrp: [OUT] copy of the multi sector transfer deprotected restart page + * @lsn: [OUT] set to the current logfile lsn on success + * + * Check the restart page @rp for consistency and return 0 if it is consistent + * and -errno otherwise. The restart page may have been modified by chkdsk in + * which case its magic is CHKD instead of RSTR. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + * + * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a + * copy of the complete multi sector transfer deprotected page. On failure, + * *@wrp is undefined. + * + * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current + * logfile lsn according to this restart page. On failure, *@lsn is undefined. + * + * The following error codes are defined: + * -EINVAL - The restart page is inconsistent. + * -ENOMEM - Not enough memory to load the restart page. + * -EIO - Failed to reading from $LogFile. + */ +static int ntfs_check_and_load_restart_page(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp, + LSN *lsn) +{ + RESTART_AREA *ra; + RESTART_PAGE_HEADER *trp; + int size, err; + + ntfs_debug("Entering."); + /* Check the restart page header for consistency. */ + if (!ntfs_check_restart_page_header(vi, rp, pos)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + /* Check the restart area for consistency. */ + if (!ntfs_check_restart_area(vi, rp)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * Allocate a buffer to store the whole restart page so we can multi + * sector transfer deprotect it. + */ + trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size)); + if (!trp) { + ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile " + "restart page buffer."); + return -ENOMEM; + } + /* + * Read the whole of the restart page into the buffer. If it fits + * completely inside @rp, just copy it from there. Otherwise map all + * the required pages and copy the data from them. + */ + size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK); + if (size >= le32_to_cpu(rp->system_page_size)) { + memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); + } else { + pgoff_t idx; + struct page *page; + int have_read, to_read; + + /* First copy what we already have in @rp. */ + memcpy(trp, rp, size); + /* Copy the remaining data one page at a time. */ + have_read = size; + to_read = le32_to_cpu(rp->system_page_size) - size; + idx = (pos + size) >> PAGE_CACHE_SHIFT; + BUG_ON((pos + size) & ~PAGE_CACHE_MASK); + do { + page = ntfs_map_page(vi->i_mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vi->i_sb, "Error mapping $LogFile " + "page (index %lu).", idx); + err = PTR_ERR(page); + if (err != -EIO && err != -ENOMEM) + err = -EIO; + goto err_out; + } + size = min_t(int, to_read, PAGE_CACHE_SIZE); + memcpy((u8*)trp + have_read, page_address(page), size); + ntfs_unmap_page(page); + have_read += size; + to_read -= size; + idx++; + } while (to_read > 0); + } + /* + * Perform the multi sector transfer deprotection on the buffer if the + * restart page is protected. + */ + if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) + && post_read_mst_fixup((NTFS_RECORD*)trp, + le32_to_cpu(rp->system_page_size))) { + /* + * A multi sector tranfer error was detected. We only need to + * abort if the restart page contents exceed the multi sector + * transfer fixup of the first sector. + */ + if (le16_to_cpu(rp->restart_area_offset) + + le16_to_cpu(ra->restart_area_length) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "Multi sector transfer error " + "detected in $LogFile restart page."); + err = -EINVAL; + goto err_out; + } + } + /* + * If the restart page is modified by chkdsk or there are no active + * logfile clients, the logfile is consistent. Otherwise, need to + * check the log client records for consistency, too. + */ + err = 0; + if (ntfs_is_rstr_record(rp->magic) && + ra->client_in_use_list != LOGFILE_NO_CLIENT) { + if (!ntfs_check_log_client_array(vi, trp)) { + err = -EINVAL; + goto err_out; + } + } + if (lsn) { + if (ntfs_is_rstr_record(rp->magic)) + *lsn = sle64_to_cpu(ra->current_lsn); + else /* if (ntfs_is_chkd_record(rp->magic)) */ + *lsn = sle64_to_cpu(rp->chkdsk_lsn); + } + ntfs_debug("Done."); + if (wrp) + *wrp = trp; + else { +err_out: + ntfs_free(trp); + } + return err; +} + +/** + * ntfs_check_logfile - check the journal for consistency + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: [OUT] on success this is a copy of the current restart page + * + * Check the $LogFile journal for consistency and return 'true' if it is + * consistent and 'false' if not. On success, the current restart page is + * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it. + * + * At present we only check the two restart pages and ignore the log record + * pages. + * + * Note that the MstProtected flag is not set on the $LogFile inode and hence + * when reading pages they are not deprotected. This is because we do not know + * if the $LogFile was created on a system with a different page size to ours + * yet and mst deprotection would fail if our page size is smaller. + */ +bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) +{ + s64 size, pos; + LSN rstr1_lsn, rstr2_lsn; + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + struct address_space *mapping = log_vi->i_mapping; + struct page *page = NULL; + u8 *kaddr = NULL; + RESTART_PAGE_HEADER *rstr1_ph = NULL; + RESTART_PAGE_HEADER *rstr2_ph = NULL; + int log_page_size, log_page_mask, err; + bool logfile_is_empty = true; + u8 log_page_bits; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) + goto is_empty; + size = i_size_read(log_vi); + /* Make sure the file doesn't exceed the maximum allowed size. */ + if (size > MaxLogFileSize) + size = MaxLogFileSize; + /* + * Truncate size to a multiple of the page cache size or the default + * log page size if the page cache size is between the default log page + * log page size if the page cache size is between the default log page + * size and twice that. + */ + if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <= + DefaultLogPageSize * 2) + log_page_size = DefaultLogPageSize; + else + log_page_size = PAGE_CACHE_SIZE; + log_page_mask = log_page_size - 1; + /* + * Use ntfs_ffs() instead of ffs() to enable the compiler to + * optimize log_page_size and log_page_bits into constants. + */ + log_page_bits = ntfs_ffs(log_page_size) - 1; + size &= ~(s64)(log_page_size - 1); + /* + * Ensure the log file is big enough to store at least the two restart + * pages and the minimum number of log record pages. + */ + if (size < log_page_size * 2 || (size - log_page_size * 2) >> + log_page_bits < MinLogRecordPages) { + ntfs_error(vol->sb, "$LogFile is too small."); + return false; + } + /* + * Read through the file looking for a restart page. Since the restart + * page header is at the beginning of a page we only need to search at + * what could be the beginning of a page (for each page size) rather + * than scanning the whole file byte by byte. If all potential places + * contain empty and uninitialzed records, the log file can be assumed + * to be empty. + */ + for (pos = 0; pos < size; pos <<= 1) { + pgoff_t idx = pos >> PAGE_CACHE_SHIFT; + if (!page || page->index != idx) { + if (page) + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Error mapping $LogFile " + "page (index %lu).", idx); + goto err_out; + } + } + kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK); + /* + * A non-empty block means the logfile is not empty while an + * empty block after a non-empty block has been encountered + * means we are done. + */ + if (!ntfs_is_empty_recordp((le32*)kaddr)) + logfile_is_empty = false; + else if (!logfile_is_empty) + break; + /* + * A log record page means there cannot be a restart page after + * this so no need to continue searching. + */ + if (ntfs_is_rcrd_recordp((le32*)kaddr)) + break; + /* If not a (modified by chkdsk) restart page, continue. */ + if (!ntfs_is_rstr_recordp((le32*)kaddr) && + !ntfs_is_chkd_recordp((le32*)kaddr)) { + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * Check the (modified by chkdsk) restart page for consistency + * and get a copy of the complete multi sector transfer + * deprotected restart page. + */ + err = ntfs_check_and_load_restart_page(log_vi, + (RESTART_PAGE_HEADER*)kaddr, pos, + !rstr1_ph ? &rstr1_ph : &rstr2_ph, + !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); + if (!err) { + /* + * If we have now found the first (modified by chkdsk) + * restart page, continue looking for the second one. + */ + if (!pos) { + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * We have now found the second (modified by chkdsk) + * restart page, so we can stop looking. + */ + break; + } + /* + * Error output already done inside the function. Note, we do + * not abort if the restart page was invalid as we might still + * find a valid one further in the file. + */ + if (err != -EINVAL) { + ntfs_unmap_page(page); + goto err_out; + } + /* Continue looking. */ + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + } + if (page) + ntfs_unmap_page(page); + if (logfile_is_empty) { + NVolSetLogFileEmpty(vol); +is_empty: + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + if (!rstr1_ph) { + BUG_ON(rstr2_ph); + ntfs_error(vol->sb, "Did not find any restart pages in " + "$LogFile and it was not empty."); + return false; + } + /* If both restart pages were found, use the more recent one. */ + if (rstr2_ph) { + /* + * If the second restart area is more recent, switch to it. + * Otherwise just throw it away. + */ + if (rstr2_lsn > rstr1_lsn) { + ntfs_debug("Using second restart page as it is more " + "recent."); + ntfs_free(rstr1_ph); + rstr1_ph = rstr2_ph; + /* rstr1_lsn = rstr2_lsn; */ + } else { + ntfs_debug("Using first restart page as it is more " + "recent."); + ntfs_free(rstr2_ph); + } + rstr2_ph = NULL; + } + /* All consistency checks passed. */ + if (rp) + *rp = rstr1_ph; + else + ntfs_free(rstr1_ph); + ntfs_debug("Done."); + return true; +err_out: + if (rstr1_ph) + ntfs_free(rstr1_ph); + return false; +} + +/** + * ntfs_is_logfile_clean - check in the journal if the volume is clean + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: copy of the current restart page + * + * Analyze the $LogFile journal and return 'true' if it indicates the volume was + * shutdown cleanly and 'false' if not. + * + * At present we only look at the two restart pages and ignore the log record + * pages. This is a little bit crude in that there will be a very small number + * of cases where we think that a volume is dirty when in fact it is clean. + * This should only affect volumes that have not been shutdown cleanly but did + * not have any pending, non-check-pointed i/o, i.e. they were completely idle + * at least for the five seconds preceding the unclean shutdown. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and in particular if the $LogFile + * is empty this function requires that NVolLogFileEmpty() is true otherwise an + * empty volume will be reported as dirty. + */ +bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) +{ + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + RESTART_AREA *ra; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + BUG_ON(!rp); + if (!ntfs_is_rstr_record(rp->magic) && + !ntfs_is_chkd_record(rp->magic)) { + ntfs_error(vol->sb, "Restart page buffer is invalid. This is " + "probably a bug in that the $LogFile should " + "have been consistency checked before calling " + "this function."); + return false; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * If the $LogFile has active clients, i.e. it is open, and we do not + * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, + * we assume there was an unclean shutdown. + */ + if (ra->client_in_use_list != LOGFILE_NO_CLIENT && + !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { + ntfs_debug("Done. $LogFile indicates a dirty shutdown."); + return false; + } + /* $LogFile indicates a clean shutdown. */ + ntfs_debug("Done. $LogFile indicates a clean shutdown."); + return true; +} + +/** + * ntfs_empty_logfile - empty the contents of the $LogFile journal + * @log_vi: struct inode of loaded journal $LogFile to empty + * + * Empty the contents of the $LogFile journal @log_vi and return 'true' on + * success and 'false' on error. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() + * has been used to ensure that the $LogFile is clean. + */ +bool ntfs_empty_logfile(struct inode *log_vi) +{ + VCN vcn, end_vcn; + ntfs_inode *log_ni = NTFS_I(log_vi); + ntfs_volume *vol = log_ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags; + unsigned block_size, block_size_bits; + int err; + bool should_wait = true; + + ntfs_debug("Entering."); + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done."); + return true; + } + /* + * We cannot use ntfs_attr_set() because we may be still in the middle + * of a mount operation. Thus we do the emptying by hand by first + * zapping the page cache pages for the $LogFile/$DATA attribute and + * then emptying each of the buffers in each of the clusters specified + * by the runlist by hand. + */ + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + vcn = 0; + read_lock_irqsave(&log_ni->size_lock, flags); + end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> + vol->cluster_size_bits; + read_unlock_irqrestore(&log_ni->size_lock, flags); + truncate_inode_pages(log_vi->i_mapping, 0); + down_write(&log_ni->runlist.lock); + rl = log_ni->runlist.rl; + if (unlikely(!rl || vcn < rl->vcn || !rl->length)) { +map_vcn: + err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); + if (err) { + ntfs_error(sb, "Failed to map runlist fragment (error " + "%d).", -err); + goto err; + } + rl = log_ni->runlist.rl; + BUG_ON(!rl || vcn < rl->vcn || !rl->length); + } + /* Seek to the runlist element containing @vcn. */ + while (rl->length && vcn >= rl[1].vcn) + rl++; + do { + LCN lcn; + sector_t block, end_block; + s64 len; + + /* + * If this run is not mapped map it now and start again as the + * runlist will have been updated. + */ + lcn = rl->lcn; + if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { + vcn = rl->vcn; + goto map_vcn; + } + /* If this run is not valid abort with an error. */ + if (unlikely(!rl->length || lcn < LCN_HOLE)) + goto rl_err; + /* Skip holes. */ + if (lcn == LCN_HOLE) + continue; + block = lcn << vol->cluster_size_bits >> block_size_bits; + len = rl->length; + if (rl[1].vcn > end_vcn) + len = end_vcn - rl->vcn; + end_block = (lcn + len) << vol->cluster_size_bits >> + block_size_bits; + /* Iterate over the blocks in the run and empty them. */ + do { + struct buffer_head *bh; + + /* Obtain the buffer, possibly not uptodate. */ + bh = sb_getblk(sb, block); + BUG_ON(!bh); + /* Setup buffer i/o submission. */ + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + /* Set the entire contents of the buffer to 0xff. */ + memset(bh->b_data, -1, block_size); + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_dirty(bh)) + clear_buffer_dirty(bh); + /* + * Submit the buffer and wait for i/o to complete but + * only for the first buffer so we do not miss really + * serious i/o errors. Once the first buffer has + * completed ignore errors afterwards as we can assume + * that if one buffer worked all of them will work. + */ + submit_bh(WRITE, bh); + if (should_wait) { + should_wait = false; + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + goto io_err; + } + brelse(bh); + } while (++block < end_block); + } while ((++rl)->vcn < end_vcn); + up_write(&log_ni->runlist.lock); + /* + * Zap the pages again just in case any got instantiated whilst we were + * emptying the blocks by hand. FIXME: We may not have completed + * writing to all the buffer heads yet so this may happen too early. + * We really should use a kernel thread to do the emptying + * asynchronously and then we can also set the volume dirty and output + * an error message if emptying should fail. + */ + truncate_inode_pages(log_vi->i_mapping, 0); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetLogFileEmpty(vol); + ntfs_debug("Done."); + return true; +io_err: + ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk."); + goto dirty_err; +rl_err: + ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk."); +dirty_err: + NVolSetErrors(vol); + err = -EIO; +err: + up_write(&log_ni->runlist.lock); + ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", + -err); + return false; +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/logfile.h b/kernel/fs/ntfs/logfile.h new file mode 100644 index 000000000..aa2b6ac3f --- /dev/null +++ b/kernel/fs/ntfs/logfile.h @@ -0,0 +1,309 @@ +/* + * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2000-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LOGFILE_H +#define _LINUX_NTFS_LOGFILE_H + +#ifdef NTFS_RW + +#include + +#include "types.h" +#include "endian.h" +#include "layout.h" + +/* + * Journal ($LogFile) organization: + * + * Two restart areas present in the first two pages (restart pages, one restart + * area in each page). When the volume is dismounted they should be identical, + * except for the update sequence array which usually has a different update + * sequence number. + * + * These are followed by log records organized in pages headed by a log record + * header going up to log file size. Not all pages contain log records when a + * volume is first formatted, but as the volume ages, all records will be used. + * When the log file fills up, the records at the beginning are purged (by + * modifying the oldest_lsn to a higher value presumably) and writing begins + * at the beginning of the file. Effectively, the log file is viewed as a + * circular entity. + * + * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept + * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We + * probably only want to support 1.1 as this seems to be the current version + * and we don't know how that differs from the older versions. The only + * exception is if the journal is clean as marked by the two restart pages + * then it doesn't matter whether we are on an earlier version. We can just + * reinitialize the logfile and start again with version 1.1. + */ + +/* Some $LogFile related constants. */ +#define MaxLogFileSize 0x100000000ULL +#define DefaultLogPageSize 4096 +#define MinLogRecordPages 48 + +/* + * Log file restart page header (begins the restart area). + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ +/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */ +/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h. + When creating, set this to be immediately + after this header structure (without any + alignment). */ +/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */ + +/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by + chkdsk. Only used when the magic is changed + to "CHKD". Otherwise this is zero. */ +/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file + was created, has to be >= 512 and a power of + 2. Use this to calculate the required size + of the usa (usa_count) and add it to usa_ofs. + Then verify that the result is less than the + value of the restart_area_offset. */ +/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >= + 512 and a power of 2. The default is 4096 + and is used when the system page size is + between 4096 and 8192. Otherwise this is + set to the system page size instead. */ +/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to + the RESTART_AREA. Value has to be aligned + to 8-byte boundary. When creating, set this + to be after the usa. */ +/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major + version is 1. */ +/* 28*/ sle16 major_ver; /* Log file major version. We only support + version 1.1. */ +/* sizeof() = 30 (0x1e) bytes */ +} __attribute__ ((__packed__)) RESTART_PAGE_HEADER; + +/* + * Constant for the log client indices meaning that there are no client records + * in this particular client array. Also inside the client records themselves, + * this means that there are no client records preceding or following this one. + */ +#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) +#define LOGFILE_NO_CLIENT_CPU 0xffff + +/* + * These are the so far known RESTART_AREA_* flags (16-bit) which contain + * information about the log file in which they are present. + */ +enum { + RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), + RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ +} __attribute__ ((__packed__)); + +typedef le16 RESTART_AREA_FLAGS; + +/* + * Log file restart area record. The offset of this record is found by adding + * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found + * in it. See notes at restart_area_offset above. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log + when the restart area was last written. + This happens often but what is the interval? + Is it just fixed time or is it every time a + check point is written or somethine else? + On create set to 0. */ +/* 8*/ le16 log_clients; /* Number of log client records in the array of + log client records which follows this + restart area. Must be 1. */ +/* 10*/ le16 client_free_list; /* The index of the first free log client record + in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + free log client records in the array. + If != LOGFILE_NO_CLIENT, check that + log_clients > client_free_list. On Win2k + and presumably earlier, on a clean volume + this is != LOGFILE_NO_CLIENT, and it should + be 0, i.e. the first (and only) client + record is free and thus the logfile is + closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be LOGFILE_NO_CLIENT. On WinXP + and presumably later, the logfile is always + open, even on clean shutdown so this should + always be LOGFILE_NO_CLIENT. */ +/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client + record in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + in-use log client records in the array. If + != LOGFILE_NO_CLIENT check that log_clients + > client_in_use_list. On Win2k and + presumably earlier, on a clean volume this + is LOGFILE_NO_CLIENT, i.e. there are no + client records in use and thus the logfile + is closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be != LOGFILE_NO_CLIENT, and it + should be 0, i.e. the first (and only) + client record is in use. On WinXP and + presumably later, the logfile is always + open, even on clean shutdown so this should + always be 0. */ +/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k + and presumably earlier this is always 0. On + WinXP and presumably later, if the logfile + was shutdown cleanly, the second bit, + RESTART_VOLUME_IS_CLEAN, is set. This bit + is cleared when the volume is mounted by + WinXP and set when the volume is dismounted, + thus if the logfile is dirty, this bit is + clear. Thus we don't need to check the + Windows version to determine if the logfile + is clean. Instead if the logfile is closed, + we know it must be clean. If it is open and + this bit is set, we also know it must be + clean. If on the other hand the logfile is + open and this bit is clear, we can be almost + certain that the logfile is dirty. */ +/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence + number. This is calculated as 67 - the + number of bits required to store the logfile + size in bytes and this can be used in with + the specified file_size as a consistency + check. */ +/* 20*/ le16 restart_area_length;/* Length of the restart area including the + client array. Following checks required if + version matches. Otherwise, skip them. + restart_area_offset + restart_area_length + has to be <= system_page_size. Also, + restart_area_length has to be >= + client_array_offset + (log_clients * + sizeof(log client record)). */ +/* 22*/ le16 client_array_offset;/* Offset from the start of this record to + the first log client record if versions are + matched. When creating, set this to be + after this restart area structure, aligned + to 8-bytes boundary. If the versions do not + match, this is ignored and the offset is + assumed to be (sizeof(RESTART_AREA) + 7) & + ~7, i.e. rounded up to first 8-byte + boundary. Either way, client_array_offset + has to be aligned to an 8-byte boundary. + Also, restart_area_offset + + client_array_offset has to be <= 510. + Finally, client_array_offset + (log_clients + * sizeof(log client record)) has to be <= + system_page_size. On Win2k and presumably + earlier, this is 0x30, i.e. immediately + following this record. On WinXP and + presumably later, this is 0x40, i.e. there + are 16 extra bytes between this record and + the client array. This probably means that + the RESTART_AREA record is actually bigger + in WinXP and later. */ +/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the + restart_area_offset + the offset of the + file_size are > 510 then corruption has + occurred. This is the very first check when + starting with the restart_area as if it + fails it means that some of the above values + will be corrupted by the multi sector + transfer protection. The file_size has to + be rounded down to be a multiple of the + log_page_size in the RESTART_PAGE_HEADER and + then it has to be at least big enough to + store the two restart pages and 48 (0x30) + log record pages. */ +/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including + the log record header. On create set to + 0. */ +/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. + If the version matches then check that the + value of log_record_header_length is a + multiple of 8, i.e. + (log_record_header_length + 7) & ~7 == + log_record_header_length. When creating set + it to sizeof(LOG_RECORD_HEADER), aligned to + 8 bytes. */ +/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record + page. Must be a multiple of 8. On create + set it to immediately after the update + sequence array of the log record page. */ +/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every + time the logfile is restarted which happens + at mount time when the logfile is opened. + When creating set to a random value. Win2k + sets it to the low 32 bits of the current + system time in NTFS format (see time.h). */ +/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ +/* sizeof() = 48 (0x30) bytes */ +} __attribute__ ((__packed__)) RESTART_AREA; + +/* + * Log client record. The offset of this record is found by adding the offset + * of the RESTART_AREA to the client_array_offset value found in it. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create + set to 0. */ +/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart + the volume, i.e. the current position within + the log file. At present, if clean this + should = current_lsn in restart area but it + probably also = current_lsn when dirty most + of the time. At create set to 0. */ +/* 16*/ le16 prev_client; /* The offset to the previous log client record + in the array of log client records. + LOGFILE_NO_CLIENT means there is no previous + client record, i.e. this is the first one. + This is always LOGFILE_NO_CLIENT. */ +/* 18*/ le16 next_client; /* The offset to the next log client record in + the array of log client records. + LOGFILE_NO_CLIENT means there are no next + client records, i.e. this is the last one. + This is always LOGFILE_NO_CLIENT. */ +/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set + to zero every time the logfile is restarted + and it is incremented when the logfile is + closed at dismount time. Thus it is 0 when + dirty and 1 when clean. On WinXP and + presumably later, this is always 0. */ +/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ +/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should + always be 8. */ +/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should + always be "NTFS" with the remaining bytes + set to 0. */ +/* sizeof() = 160 (0xa0) bytes */ +} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; + +extern bool ntfs_check_logfile(struct inode *log_vi, + RESTART_PAGE_HEADER **rp); + +extern bool ntfs_is_logfile_clean(struct inode *log_vi, + const RESTART_PAGE_HEADER *rp); + +extern bool ntfs_empty_logfile(struct inode *log_vi); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/kernel/fs/ntfs/malloc.h b/kernel/fs/ntfs/malloc.h new file mode 100644 index 000000000..a44b14cbc --- /dev/null +++ b/kernel/fs/ntfs/malloc.h @@ -0,0 +1,96 @@ +/* + * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_MALLOC_H +#define _LINUX_NTFS_MALLOC_H + +#include +#include +#include + +/** + * __ntfs_malloc - allocate memory in multiples of pages + * @size: number of bytes to allocate + * @gfp_mask: extra flags for the allocator + * + * Internal function. You probably want ntfs_malloc_nofs()... + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + * Depending on @gfp_mask the allocation may be guaranteed to succeed. + */ +static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) +{ + if (likely(size <= PAGE_SIZE)) { + BUG_ON(!size); + /* kmalloc() has per-CPU caches so is faster for now. */ + return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); + /* return (void *)__get_free_page(gfp_mask); */ + } + if (likely((size >> PAGE_SHIFT) < totalram_pages)) + return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return NULL; +} + +/** + * ntfs_malloc_nofs - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); +} + +/** + * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs_nofail(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); +} + +static inline void ntfs_free(void *addr) +{ + if (!is_vmalloc_addr(addr)) { + kfree(addr); + /* free_page((unsigned long)addr); */ + return; + } + vfree(addr); +} + +#endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/kernel/fs/ntfs/mft.c b/kernel/fs/ntfs/mft.c new file mode 100644 index 000000000..3014a36a2 --- /dev/null +++ b/kernel/fs/ntfs/mft.c @@ -0,0 +1,2917 @@ +/** + * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +#include "attrib.h" +#include "aops.h" +#include "bitmap.h" +#include "debug.h" +#include "dir.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +/** + * map_mft_record_page - map the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to map + * + * This maps the page in which the mft record of the ntfs inode @ni is situated + * and returns a pointer to the mft record within the mapped page. + * + * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() + * contains the negative error code returned. + */ +static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) +{ + loff_t i_size; + ntfs_volume *vol = ni->vol; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + unsigned long index, end_index; + unsigned ofs; + + BUG_ON(ni->page); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. FIXME: We need to check for + * overflowing the unsigned long, but I don't think we would ever get + * here if the volume was that big... + */ + index = (u64)ni->mft_no << vol->mft_record_size_bits >> + PAGE_CACHE_SHIFT; + ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + + i_size = i_size_read(mft_vi); + /* The maximum valid index into the page cache for $MFT's data. */ + end_index = i_size >> PAGE_CACHE_SHIFT; + + /* If the wanted index is out of bounds the mft record doesn't exist. */ + if (unlikely(index >= end_index)) { + if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs + + vol->mft_record_size) { + page = ERR_PTR(-ENOENT); + ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " + "which is beyond the end of the mft. " + "This is probably a bug in the ntfs " + "driver.", ni->mft_no); + goto err_out; + } + } + /* Read, map, and pin the page. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (likely(!IS_ERR(page))) { + /* Catch multi sector transfer fixup errors. */ + if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + + ofs)))) { + ni->page = page; + ni->page_ofs = ofs; + return page_address(page) + ofs; + } + ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. " + "Run chkdsk.", ni->mft_no); + ntfs_unmap_page(page); + page = ERR_PTR(-EIO); + NVolSetErrors(vol); + } +err_out: + ni->page = NULL; + ni->page_ofs = 0; + return (void*)page; +} + +/** + * map_mft_record - map, pin and lock an mft record + * @ni: ntfs inode whose MFT record to map + * + * First, take the mrec_lock mutex. We might now be sleeping, while waiting + * for the mutex if it was already locked by someone else. + * + * The page of the record is mapped using map_mft_record_page() before being + * returned to the caller. + * + * This in turn uses ntfs_map_page() to get the page containing the wanted mft + * record (it in turn calls read_cache_page() which reads it in from disk if + * necessary, increments the use count on the page so that it cannot disappear + * under us and returns a reference to the page cache page). + * + * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it + * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed + * and the post-read mst fixups on each mft record in the page have been + * performed, the page gets PG_uptodate set and PG_locked cleared (this is done + * in our asynchronous I/O completion handler end_buffer_read_mft_async()). + * ntfs_map_page() waits for PG_locked to become clear and checks if + * PG_uptodate is set and returns an error code if not. This provides + * sufficient protection against races when reading/using the page. + * + * However there is the write mapping to think about. Doing the above described + * checking here will be fine, because when initiating the write we will set + * PG_locked and clear PG_uptodate making sure nobody is touching the page + * contents. Doing the locking this way means that the commit to disk code in + * the page cache code paths is automatically sufficiently locked with us as + * we will not touch a page that has been locked or is not uptodate. The only + * locking problem then is them locking the page while we are accessing it. + * + * So that code will end up having to own the mrec_lock of all mft + * records/inodes present in the page before I/O can proceed. In that case we + * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be + * accessing anything without owning the mrec_lock mutex. But we do need to + * use them because of the read_cache_page() invocation and the code becomes so + * much simpler this way that it is well worth it. + * + * The mft record is now ours and we return a pointer to it. You need to check + * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return + * the error code. + * + * NOTE: Caller is responsible for setting the mft record dirty before calling + * unmap_mft_record(). This is obviously only necessary if the caller really + * modified the mft record... + * Q: Do we want to recycle one of the VFS inode state bits instead? + * A: No, the inode ones mean we want to change the mft record, not we want to + * write it out. + */ +MFT_RECORD *map_mft_record(ntfs_inode *ni) +{ + MFT_RECORD *m; + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + + /* Serialize access to this mft record. */ + mutex_lock(&ni->mrec_lock); + + m = map_mft_record_page(ni); + if (likely(!IS_ERR(m))) + return m; + + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); + return m; +} + +/** + * unmap_mft_record_page - unmap the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to unmap + * + * This unmaps the page in which the mft record of the ntfs inode @ni is + * situated and returns. This is a NOOP if highmem is not configured. + * + * The unmap happens via ntfs_unmap_page() which in turn decrements the use + * count on the page thus releasing it from the pinned state. + * + * We do not actually unmap the page from memory of course, as that will be + * done by the page cache code itself when memory pressure increases or + * whatever. + */ +static inline void unmap_mft_record_page(ntfs_inode *ni) +{ + BUG_ON(!ni->page); + + // TODO: If dirty, blah... + ntfs_unmap_page(ni->page); + ni->page = NULL; + ni->page_ofs = 0; + return; +} + +/** + * unmap_mft_record - release a mapped mft record + * @ni: ntfs inode whose MFT record to unmap + * + * We release the page mapping and the mrec_lock mutex which unmaps the mft + * record and releases it for others to get hold of. We also release the ntfs + * inode by decrementing the ntfs inode reference count. + * + * NOTE: If caller has modified the mft record, it is imperative to set the mft + * record dirty BEFORE calling unmap_mft_record(). + */ +void unmap_mft_record(ntfs_inode *ni) +{ + struct page *page = ni->page; + + BUG_ON(!page); + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + unmap_mft_record_page(ni); + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + /* + * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to + * ntfs_clear_extent_inode() in the extent inode case, and to the + * caller in the non-extent, yet pure ntfs inode case, to do the actual + * tear down of all structures and freeing of all allocated memory. + */ + return; +} + +/** + * map_extent_mft_record - load an extent inode and attach it to its base + * @base_ni: base ntfs inode + * @mref: mft reference of the extent inode to load + * @ntfs_ino: on successful return, pointer to the ntfs_inode structure + * + * Load the extent mft record @mref and attach it to its base inode @base_ni. + * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise + * PTR_ERR(result) gives the negative error code. + * + * On successful return, @ntfs_ino contains a pointer to the ntfs_inode + * structure of the mapped extent inode. + */ +MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino) +{ + MFT_RECORD *m; + ntfs_inode *ni = NULL; + ntfs_inode **extent_nis = NULL; + int i; + unsigned long mft_no = MREF(mref); + u16 seq_no = MSEQNO(mref); + bool destroy_ni = false; + + ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).", + mft_no, base_ni->mft_no); + /* Make sure the base ntfs inode doesn't go away. */ + atomic_inc(&base_ni->count); + /* + * Check if this extent inode has already been added to the base inode, + * in which case just return it. If not found, add it to the base + * inode before returning it. + */ + mutex_lock(&base_ni->extent_lock); + if (base_ni->nr_extents > 0) { + extent_nis = base_ni->ext.extent_ntfs_inos; + for (i = 0; i < base_ni->nr_extents; i++) { + if (mft_no != extent_nis[i]->mft_no) + continue; + ni = extent_nis[i]; + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + break; + } + } + if (likely(ni != NULL)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* We found the record; just have to map and return it. */ + m = map_mft_record(ni); + /* map_mft_record() has incremented this on success. */ + atomic_dec(&ni->count); + if (likely(!IS_ERR(m))) { + /* Verify the sequence number. */ + if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { + ntfs_debug("Done 1."); + *ntfs_ino = ni; + return m; + } + unmap_mft_record(ni); + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. " + "Run chkdsk."); + return ERR_PTR(-EIO); + } +map_err_out: + ntfs_error(base_ni->vol->sb, "Failed to map extent " + "mft record, error code %ld.", -PTR_ERR(m)); + return m; + } + /* Record wasn't there. Get a new ntfs inode and initialize it. */ + ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); + if (unlikely(!ni)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + return ERR_PTR(-ENOMEM); + } + ni->vol = base_ni->vol; + ni->seq_no = seq_no; + ni->nr_extents = -1; + ni->ext.base_ntfs_ino = base_ni; + /* Now map the record. */ + m = map_mft_record(ni); + if (IS_ERR(m)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_clear_extent_inode(ni); + goto map_err_out; + } + /* Verify the sequence number if it is present. */ + if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. Run chkdsk."); + destroy_ni = true; + m = ERR_PTR(-EIO); + goto unm_err_out; + } + /* Attach extent inode to base inode, reallocating memory if needed. */ + if (!(base_ni->nr_extents & 3)) { + ntfs_inode **tmp; + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); + + tmp = kmalloc(new_size, GFP_NOFS); + if (unlikely(!tmp)) { + ntfs_error(base_ni->vol->sb, "Failed to allocate " + "internal buffer."); + destroy_ni = true; + m = ERR_PTR(-ENOMEM); + goto unm_err_out; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - + 4 * sizeof(ntfs_inode *)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = tmp; + } + base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_debug("Done 2."); + *ntfs_ino = ni; + return m; +unm_err_out: + unmap_mft_record(ni); + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* + * If the extent inode was not attached to the base inode we need to + * release it or we will leak memory. + */ + if (destroy_ni) + ntfs_clear_extent_inode(ni); + return m; +} + +#ifdef NTFS_RW + +/** + * __mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Internal function. Users should call mark_mft_record_dirty() instead. + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) + * on the base vfs inode, because even though file data may have been modified, + * it is dirty in the inode meta data rather than the data page cache of the + * inode, and thus there are no data pages that need writing out. Therefore, a + * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. + */ +void __mark_mft_record_dirty(ntfs_inode *ni) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + mark_ntfs_record_dirty(ni->page, ni->page_ofs); + /* Determine the base vfs inode and mark it dirty, too. */ + mutex_lock(&ni->extent_lock); + if (likely(ni->nr_extents >= 0)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC); +} + +static const char *ntfs_please_email = "Please email " + "linux-ntfs-dev@lists.sourceforge.net and say that you saw " + "this message. Thank you."; + +/** + * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, + * bypassing the page cache and the $MFTMirr inode itself. + * + * This function is only for use at umount time when the mft mirror inode has + * already been disposed off. We BUG() if we are called while the mft mirror + * inode is still attached to the volume. + * + * On success return 0. On error return -errno. + * + * NOTE: This function is not implemented yet as I am not convinced it can + * actually be triggered considering the sequence of commits we do in super.c:: + * ntfs_put_super(). But just in case we provide this place holder as the + * alternative would be either to BUG() or to get a NULL pointer dereference + * and Oops. + */ +static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, + const unsigned long mft_no, MFT_RECORD *m) +{ + BUG_ON(vol->mftmirr_ino); + ntfs_error(vol->sb, "Umount time mft mirror syncing is not " + "implemented yet. %s", ntfs_please_email); + return -EOPNOTSUPP; +} + +/** + * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * @sync: if true, wait for i/o completion + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. + * + * On success return 0. On error return -errno and set the volume errors flag + * in the ntfs volume @vol. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync) +{ + struct page *page; + unsigned int blocksize = vol->sb->s_blocksize; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[max_bhs]; + struct buffer_head *bh, *head; + u8 *kmirr; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end, page_ofs; + int i_bhs, nr_bhs, err = 0; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + BUG_ON(!max_bhs); + if (unlikely(!vol->mftmirr_ino)) { + /* This could happen during umount... */ + err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); + if (likely(!err)) + return err; + goto err_out; + } + /* Get the page containing the mirror copy of the mft record @m. */ + page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> + (PAGE_CACHE_SHIFT - vol->mft_record_size_bits)); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map mft mirror page."); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + /* Offset of the mft mirror record inside the page. */ + page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* The address in the page of the mirror copy of the mft record @m. */ + kmirr = page_address(page) + page_ofs; + /* Copy the mst protected mft record to the mirror. */ + memcpy(kmirr, m, vol->mft_record_size); + /* Create uptodate buffers if not present. */ + if (unlikely(!page_has_buffers(page))) { + struct buffer_head *tail; + + bh = head = alloc_page_buffers(page, blocksize, 1); + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); + } + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = kmirr - (u8*)page_address(page); + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mftmirr_ino)-> + runlist.lock); + rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; + /* + * $MFTMirr always has the whole of its runlist + * in memory. + */ + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFTMirr, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft mirror " + "record 0x%lx because its " + "location on disk could not " + "be determined (error code " + "%lli).", mft_no, + (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); + if (likely(!err)) { + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and + * buffer states do not become out of sync. + */ + set_buffer_uptodate(tbh); + } + } + } else /* if (unlikely(err)) */ { + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); + } + /* Current state: all buffers are clean, unlocked, and uptodate. */ + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)kmirr); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + if (likely(!err)) { + ntfs_debug("Done."); + } else { + ntfs_error(vol->sb, "I/O error while writing mft mirror " + "record 0x%lx!", mft_no); +err_out: + ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " + "code %i). Volume will be left marked dirty " + "on umount. Run ntfsfix on the partition " + "after umounting to correct this.", -err); + NVolSetErrors(vol); + } + return err; +} + +/** + * write_mft_record_nolock - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * Write the mapped (extent) mft record @m described by the (regular or extent) + * ntfs inode @ni to backing store. If the mft record @m has a counterpart in + * the mft mirror, that is also updated. + * + * We only write the mft record if the ntfs inode @ni is dirty and the first + * buffer belonging to its mft record is dirty, too. We ignore the dirty state + * of subsequent buffers because we could have raced with + * fs/ntfs/aops.c::mark_ntfs_record_dirty(). + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * However, if the mft record has a counterpart in the mft mirror and @sync is + * true, we write the mft record, wait for i/o completion, and only then write + * the mft mirror copy. This ensures that if the system crashes either the mft + * or the mft mirror will contain a self-consistent mft record @m. If @sync is + * false on the other hand, we start i/o on both and then wait for completion + * on them. This provides a speedup but no longer guarantees that you will end + * up with a self-consistent mft record in the case of a crash but if you asked + * for asynchronous writing you probably do not care about that anyway. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + ntfs_volume *vol = ni->vol; + struct page *page = ni->page; + unsigned int blocksize = vol->sb->s_blocksize; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[max_bhs]; + struct buffer_head *bh, *head; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end; + int i_bhs, nr_bhs, err = 0; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + BUG_ON(!max_bhs); + BUG_ON(!PageLocked(page)); + /* + * If the ntfs_inode is clean no need to do anything. If it is dirty, + * mark it as clean now so that it can be redirtied later on if needed. + * There is no danger of races since the caller is holding the locks + * for the mft record @m and the page it is in. + */ + if (!NInoTestClearDirty(ni)) + goto done; + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = ni->page_ofs; + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* + * If this block is not the first one in the record, we ignore + * the buffer's dirty state because we could have raced with a + * parallel mark_ntfs_record_dirty(). + */ + if (block_start == m_start) { + /* This block is the first one in the record. */ + if (!buffer_dirty(bh)) { + BUG_ON(nr_bhs); + /* Clean records are not written out. */ + break; + } + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mft_ino)->runlist.lock); + rl = NTFS_I(vol->mft_ino)->runlist.rl; + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFT, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft record " + "0x%lx because its location " + "on disk could not be " + "determined (error code %lli).", + ni->mft_no, (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mft_ino)->runlist.lock); + if (!nr_bhs) + goto done; + if (unlikely(err)) + goto cleanup_out; + /* Apply the mst protection fixups. */ + err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); + if (err) { + ntfs_error(vol->sb, "Failed to apply mst fixups!"); + goto cleanup_out; + } + flush_dcache_mft_record_page(ni); + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (!sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + if (PageUptodate(page)) + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)m); + flush_dcache_mft_record_page(ni); + if (unlikely(err)) { + /* I/O error during writing. This is really bad! */ + ntfs_error(vol->sb, "I/O error while writing mft record " + "0x%lx! Marking base inode as bad. You " + "should unmount the volume and run chkdsk.", + ni->mft_no); + goto err_out; + } +done: + ntfs_debug("Done."); + return 0; +cleanup_out: + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); +err_out: + /* + * Current state: all buffers are clean, unlocked, and uptodate. + * The caller should mark the base inode as bad so that no more i/o + * happens. ->clear_inode() will still be invoked so all extent inodes + * and other allocated memory will be freed. + */ + if (err == -ENOMEM) { + ntfs_error(vol->sb, "Not enough memory to write mft record. " + "Redirtying so the write is retried later."); + mark_mft_record_dirty(ni); + err = 0; + } else + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_may_write_mft_record - check if an mft record may be written out + * @vol: [IN] ntfs volume on which the mft record to check resides + * @mft_no: [IN] mft record number of the mft record to check + * @m: [IN] mapped mft record to check + * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned + * + * Check if the mapped (base or extent) mft record @m with mft record number + * @mft_no belonging to the ntfs volume @vol may be written out. If necessary + * and possible the ntfs inode of the mft record is locked and the base vfs + * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The + * caller is responsible for unlocking the ntfs inode and unpinning the base + * vfs inode. + * + * Return 'true' if the mft record may be written out and 'false' if not. + * + * The caller has locked the page and cleared the uptodate flag on it which + * means that we can safely write out any dirty mft records that do not have + * their inodes in icache as determined by ilookup5() as anyone + * opening/creating such an inode would block when attempting to map the mft + * record in read_cache_page() until we are finished with the write out. + * + * Here is a description of the tests we perform: + * + * If the inode is found in icache we know the mft record must be a base mft + * record. If it is dirty, we do not write it and return 'false' as the vfs + * inode write paths will result in the access times being updated which would + * cause the base mft record to be redirtied and written out again. (We know + * the access time update will modify the base mft record because Windows + * chkdsk complains if the standard information attribute is not in the base + * mft record.) + * + * If the inode is in icache and not dirty, we attempt to lock the mft record + * and if we find the lock was already taken, it is not safe to write the mft + * record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the mft record, + * which also allows us safe writeout of the mft record. We then set + * @locked_ni to the locked ntfs inode and return 'true'. + * + * Note we cannot just lock the mft record and sleep while waiting for the lock + * because this would deadlock due to lock reversal (normally the mft record is + * locked before the page is locked but we already have the page locked here + * when we try to lock the mft record). + * + * If the inode is not in icache we need to perform further checks. + * + * If the mft record is not a FILE record or it is a base mft record, we can + * safely write it and return 'true'. + * + * We now know the mft record is an extent mft record. We check if the inode + * corresponding to its base mft record is in icache and obtain a reference to + * it if it is. If it is not, we can safely write it and return 'true'. + * + * We now have the base inode for the extent mft record. We check if it has an + * ntfs inode for the extent mft record attached and if not it is safe to write + * the extent mft record and we return 'true'. + * + * The ntfs inode for the extent mft record is attached to the base inode so we + * attempt to lock the extent mft record and if we find the lock was already + * taken, it is not safe to write the extent mft record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the extent mft + * record, which also allows us safe writeout of the extent mft record. We + * set the ntfs inode of the extent mft record clean and then set @locked_ni to + * the now locked ntfs inode and return 'true'. + * + * Note, the reason for actually writing dirty mft records here and not just + * relying on the vfs inode dirty code paths is that we can have mft records + * modified without them ever having actual inodes in memory. Also we can have + * dirty mft records with clean ntfs inodes in memory. None of the described + * cases would result in the dirty mft records being written out if we only + * relied on the vfs inode dirty code paths. And these cases can really occur + * during allocation of new mft records and in particular when the + * initialized_size of the $MFT/$DATA attribute is extended and the new space + * is initialized using ntfs_mft_record_format(). The clean inode can then + * appear if the mft record is reused for a new inode before it got written + * out. + */ +bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, + const MFT_RECORD *m, ntfs_inode **locked_ni) +{ + struct super_block *sb = vol->sb; + struct inode *mft_vi = vol->mft_ino; + struct inode *vi; + ntfs_inode *ni, *eni, **extent_nis; + int i; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + /* + * Normally we do not return a locked inode so set @locked_ni to NULL. + */ + BUG_ON(!locked_ni); + *locked_ni = NULL; + /* + * Check if the inode corresponding to this mft record is in the VFS + * inode cache and obtain a reference to it if it is. + */ + ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); + na.mft_no = mft_no; + na.name = NULL; + na.name_len = 0; + na.type = AT_UNUSED; + /* + * Optimize inode 0, i.e. $MFT itself, since we have it in memory and + * we get here for it rather often. + */ + if (!mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else { + /* + * Have to use ilookup5_nowait() since ilookup5() waits for the + * inode lock which causes ntfs to deadlock when a concurrent + * inode write via the inode dirty code paths and the page + * dirty code path of the inode dirty code path when writing + * $MFT occurs. + */ + vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na); + } + if (vi) { + ntfs_debug("Base inode 0x%lx is in icache.", mft_no); + /* The inode is in icache. */ + ni = NTFS_I(vi); + /* Take a reference to the ntfs inode. */ + atomic_inc(&ni->count); + /* If the inode is dirty, do not write this record. */ + if (NInoDirty(ni)) { + ntfs_debug("Inode 0x%lx is dirty, do not write it.", + mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Inode 0x%lx is not dirty.", mft_no); + /* The inode is not dirty, try to take the mft record lock. */ + if (unlikely(!mutex_trylock(&ni->mrec_lock))) { + ntfs_debug("Mft record 0x%lx is already locked, do " + "not write it.", mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Managed to lock mft record 0x%lx, write it.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so + * return the locked ntfs inode. + */ + *locked_ni = ni; + return true; + } + ntfs_debug("Inode 0x%lx is not in icache.", mft_no); + /* The inode is not in icache. */ + /* Write the record if it is not a mft record (type "FILE"). */ + if (!ntfs_is_mft_record(m->magic)) { + ntfs_debug("Mft record 0x%lx is not a FILE record, write it.", + mft_no); + return true; + } + /* Write the mft record if it is a base inode. */ + if (!m->base_mft_record) { + ntfs_debug("Mft record 0x%lx is a base record, write it.", + mft_no); + return true; + } + /* + * This is an extent mft record. Check if the inode corresponding to + * its base mft record is in icache and obtain a reference to it if it + * is. + */ + na.mft_no = MREF_LE(m->base_mft_record); + ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " + "inode 0x%lx in icache.", mft_no, na.mft_no); + if (!na.mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else + vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode, + &na); + if (!vi) { + /* + * The base inode is not in icache, write this extent mft + * record. + */ + ntfs_debug("Base inode 0x%lx is not in icache, write the " + "extent record.", na.mft_no); + return true; + } + ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); + /* + * The base inode is in icache. Check if it has the extent inode + * corresponding to this extent mft record attached. + */ + ni = NTFS_I(vi); + mutex_lock(&ni->extent_lock); + if (ni->nr_extents <= 0) { + /* + * The base inode has no attached extent inodes, write this + * extent mft record. + */ + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Base inode 0x%lx has no attached extent inodes, " + "write the extent record.", na.mft_no); + return true; + } + /* Iterate over the attached extent inodes. */ + extent_nis = ni->ext.extent_ntfs_inos; + for (eni = NULL, i = 0; i < ni->nr_extents; ++i) { + if (mft_no == extent_nis[i]->mft_no) { + /* + * Found the extent inode corresponding to this extent + * mft record. + */ + eni = extent_nis[i]; + break; + } + } + /* + * If the extent inode was not attached to the base inode, write this + * extent mft record. + */ + if (!eni) { + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Extent inode 0x%lx is not attached to its base " + "inode 0x%lx, write the extent record.", + mft_no, na.mft_no); + return true; + } + ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.", + mft_no, na.mft_no); + /* Take a reference to the extent ntfs inode. */ + atomic_inc(&eni->count); + mutex_unlock(&ni->extent_lock); + /* + * Found the extent inode coresponding to this extent mft record. + * Try to take the mft record lock. + */ + if (unlikely(!mutex_trylock(&eni->mrec_lock))) { + atomic_dec(&eni->count); + iput(vi); + ntfs_debug("Extent mft record 0x%lx is already locked, do " + "not write it.", mft_no); + return false; + } + ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", + mft_no); + if (NInoTestClearDirty(eni)) + ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so return + * the locked extent ntfs inode. + */ + *locked_ni = eni; + return true; +} + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name + * @vol: volume on which to search for a free mft record + * @base_ni: open base inode if allocating an extent mft record or NULL + * + * Search for a free mft record in the mft bitmap attribute on the ntfs volume + * @vol. + * + * If @base_ni is NULL start the search at the default allocator position. + * + * If @base_ni is not NULL start the search at the mft record after the base + * mft record @base_ni. + * + * Return the free mft record on success and -errno on error. An error code of + * -ENOSPC means that there are no free mft records in the currently + * initialized mft bitmap. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, + ntfs_inode *base_ni) +{ + s64 pass_end, ll, data_pos, pass_start, ofs, bit; + unsigned long flags; + struct address_space *mftbmp_mapping; + u8 *buf, *byte; + struct page *page; + unsigned int page_ofs, size; + u8 pass, b; + + ntfs_debug("Searching for free mft record in the currently " + "initialized mft bitmap."); + mftbmp_mapping = vol->mftbmp_ino->i_mapping; + /* + * Set the end of the pass making sure we do not overflow the mft + * bitmap. + */ + read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); + pass_end = NTFS_I(vol->mft_ino)->allocated_size >> + vol->mft_record_size_bits; + read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); + read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; + read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + if (pass_end > ll) + pass_end = ll; + pass = 1; + if (!base_ni) + data_pos = vol->mft_data_pos; + else + data_pos = base_ni->mft_no + 1; + if (data_pos < 24) + data_pos = 24; + if (data_pos >= pass_end) { + data_pos = 24; + pass = 2; + /* This happens on a freshly formatted volume. */ + if (data_pos >= pass_end) + return -ENOSPC; + } + pass_start = data_pos; + ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " + "pass_end 0x%llx, data_pos 0x%llx.", pass, + (long long)pass_start, (long long)pass_end, + (long long)data_pos); + /* Loop until a free mft record is found. */ + for (; pass <= 2;) { + /* Cap size to pass_end. */ + ofs = data_pos >> 3; + page_ofs = ofs & ~PAGE_CACHE_MASK; + size = PAGE_CACHE_SIZE - page_ofs; + ll = ((pass_end + 7) >> 3) - ofs; + if (size > ll) + size = ll; + size <<= 3; + /* + * If we are still within the active pass, search the next page + * for a zero bit. + */ + if (size) { + page = ntfs_map_page(mftbmp_mapping, + ofs >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read mft " + "bitmap, aborting."); + return PTR_ERR(page); + } + buf = (u8*)page_address(page) + page_ofs; + bit = data_pos & 7; + data_pos &= ~7ull; + ntfs_debug("Before inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + for (; bit < size && data_pos + bit < pass_end; + bit &= ~7ull, bit += 8) { + byte = buf + (bit >> 3); + if (*byte == 0xff) + continue; + b = ffz((unsigned long)*byte); + if (b < 8 && b >= (bit & 7)) { + ll = data_pos + (bit & ~7ull) + b; + if (unlikely(ll > (1ll << 32))) { + ntfs_unmap_page(page); + return -ENOSPC; + } + *byte |= 1 << b; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done. (Found and " + "allocated mft record " + "0x%llx.)", + (long long)ll); + return ll; + } + } + ntfs_debug("After inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + data_pos += size; + ntfs_unmap_page(page); + /* + * If the end of the pass has not been reached yet, + * continue searching the mft bitmap for a zero bit. + */ + if (data_pos < pass_end) + continue; + } + /* Do the next pass. */ + if (++pass == 2) { + /* + * Starting the second pass, in which we scan the first + * part of the zone which we omitted earlier. + */ + pass_end = pass_start; + data_pos = pass_start = 24; + ntfs_debug("pass %i, pass_start 0x%llx, pass_end " + "0x%llx.", pass, (long long)pass_start, + (long long)pass_end); + if (data_pos >= pass_end) + break; + } + } + /* No free mft records in currently initialized mft bitmap. */ + ntfs_debug("Done. (No free mft records left in currently initialized " + "mft bitmap.)"); + return -ENOSPC; +} + +/** + * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for + * writing and releases it before returning. + * - This function takes vol->lcnbmp_lock for writing and releases it + * before returning. + */ +static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + s64 ll; + unsigned long flags; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni; + runlist_element *rl, *rl2 = NULL; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + u8 *b, tb; + struct { + u8 added_cluster:1; + u8 added_run:1; + u8 mp_rebuilt:1; + } status = { 0, 0, 0 }; + + ntfs_debug("Extending mft bitmap allocation."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + /* + * Determine the last lcn of the mft bitmap. The allocated size of the + * mft bitmap cannot be zero so we are ok to do this. + */ + down_write(&mftbmp_ni->runlist.lock); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ll = mftbmp_ni->allocated_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft bitmap attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.", + (long long)lcn); + /* + * Attempt to get the cluster following the last allocated cluster by + * hand as it may be in the MFT zone so the allocator would not give it + * to us. + */ + ll = lcn >> 3; + page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, + ll >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to read from lcn bitmap."); + return PTR_ERR(page); + } + b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK); + tb = 1 << (lcn & 7ull); + down_write(&vol->lcnbmp_lock); + if (*b != 0xff && !(*b & tb)) { + /* Next cluster is free, allocate it. */ + *b |= tb; + flush_dcache_page(page); + set_page_dirty(page); + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Update the mft bitmap runlist. */ + rl->length++; + rl[1].vcn++; + status.added_cluster = 1; + ntfs_debug("Appending one cluster to mft bitmap."); + } else { + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Allocate a cluster from the DATA_ZONE. */ + rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, + true); + if (IS_ERR(rl2)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to allocate a cluster for " + "the mft bitmap."); + return PTR_ERR(rl2); + } + rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft " + "bitmap."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to deallocate " + "allocated cluster.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mftbmp_ni->runlist.rl = rl; + status.added_run = 1; + ntfs_debug("Adding one run to mft bitmap."); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + } + /* + * Update the attribute record as well. Note: @rl is the last + * (non-terminator) runlist element of mft bitmap. + */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. */ + for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft bitmap attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft bitmap attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: It will need to be a special mft record and if none of + // those are available it gets rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft bitmap attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + status.mp_rebuilt = 1; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array for " + "mft bitmap attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft bitmap allocated_size by one cluster. + * Reflect this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft bitmap attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + a->data.non_resident.allocated_size = + cpu_to_sle64(mftbmp_ni->allocated_size); + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute.%s", es); + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + a = ctx->attr; + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); +undo_alloc: + if (status.added_cluster) { + /* Truncate the last run in the runlist by one cluster. */ + rl->length--; + rl[1].vcn--; + } else if (status.added_run) { + lcn = rl->lcn; + /* Remove the last run from the runlist. */ + rl->lcn = rl[1].lcn; + rl->length = 0; + } + /* Deallocate the cluster. */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + if (status.mp_rebuilt) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the initialized portion of the mft bitmap attribute on the ntfs + * volume @vol by 8 bytes. + * + * Note: Only changes initialized_size and data_size, i.e. requires that + * allocated_size is big enough to fit the new initialized_size. + * + * Return 0 on success and -error on error. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) +{ + s64 old_data_size, old_initialized_size; + unsigned long flags; + struct inode *mftbmp_vi; + ntfs_inode *mft_ni, *mftbmp_ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + ATTR_RECORD *a; + int ret; + + ntfs_debug("Extending mft bitmap initiailized (and data) size."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_vi = vol->mftbmp_ino; + mftbmp_ni = NTFS_I(mftbmp_vi); + /* Get the attribute record. */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + return PTR_ERR(mrec); + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto unm_err_out; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto put_err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = i_size_read(mftbmp_vi); + old_initialized_size = mftbmp_ni->initialized_size; + /* + * We can simply update the initialized_size before filling the space + * with zeroes because the caller is holding the mft bitmap lock for + * writing which ensures that no one else is trying to access the data. + */ + mftbmp_ni->initialized_size += 8; + a->data.non_resident.initialized_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + if (mftbmp_ni->initialized_size > old_data_size) { + i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + /* Initialize the mft bitmap attribute value with zeroes. */ + ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0); + if (likely(!ret)) { + ntfs_debug("Done. (Wrote eight initialized bytes to mft " + "bitmap."); + return 0; + } + ntfs_error(vol->sb, "Failed to write to mft bitmap."); + /* Try to recover from the error. */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record.%s", es); + NVolSetErrors(vol); + return ret; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context.%s", es); + NVolSetErrors(vol); + goto unm_err_out; + } + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute.%s", es); + NVolSetErrors(vol); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(mft_ni); + goto err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->initialized_size = old_initialized_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(old_initialized_size); + if (i_size_read(mftbmp_vi) != old_data_size) { + i_size_write(mftbmp_vi, old_data_size); + a->data.non_resident.data_size = cpu_to_sle64(old_data_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(mftbmp_vi), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ +err_out: + return ret; +} + +/** + * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute + * @vol: volume on which to extend the mft data attribute + * + * Extend the mft data attribute on the ntfs volume @vol by 16 mft records + * worth of clusters or if not enough space for this by one mft record worth + * of clusters. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for + * writing and releases it before returning. + * - This function calls functions which take vol->lcnbmp_lock for + * writing and release it before returning. + */ +static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + VCN old_last_vcn; + s64 min_nr, nr, ll; + unsigned long flags; + ntfs_inode *mft_ni; + runlist_element *rl, *rl2; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + bool mp_rebuilt = false; + + ntfs_debug("Extending mft data allocation."); + mft_ni = NTFS_I(vol->mft_ino); + /* + * Determine the preferred allocation location, i.e. the last lcn of + * the mft data attribute. The allocated size of the mft data + * attribute cannot be zero so we are ok to do this. + */ + down_write(&mft_ni->runlist.lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mft_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft data attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); + /* Minimum allocation is one mft record worth of clusters. */ + min_nr = vol->mft_record_size >> vol->cluster_size_bits; + if (!min_nr) + min_nr = 1; + /* Want to allocate 16 mft records worth of clusters. */ + nr = vol->mft_record_size << 4 >> vol->cluster_size_bits; + if (!nr) + nr = min_nr; + /* Ensure we do not go above 2^32-1 mft records. */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + nr = min_nr; + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + ntfs_warning(vol->sb, "Cannot allocate mft record " + "because the maximum number of inodes " + "(2^32) has already been reached."); + up_write(&mft_ni->runlist.lock); + return -ENOSPC; + } + } + ntfs_debug("Trying mft data allocation with %s cluster count %lli.", + nr > min_nr ? "default" : "minimal", (long long)nr); + old_last_vcn = rl[1].vcn; + do { + rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, + true); + if (likely(!IS_ERR(rl2))) + break; + if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { + ntfs_error(vol->sb, "Failed to allocate the minimal " + "number of clusters (%lli) for the " + "mft data attribute.", (long long)nr); + up_write(&mft_ni->runlist.lock); + return PTR_ERR(rl2); + } + /* + * There is not enough space to do the allocation, but there + * might be enough space to do a minimal allocation so try that + * before failing. + */ + nr = min_nr; + ntfs_debug("Retrying mft data allocation with minimal cluster " + "count %lli.", (long long)nr); + } while (1); + rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft data " + "attribute."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to deallocate clusters " + "from the mft data attribute.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mft_ni->runlist.rl = rl; + ntfs_debug("Allocated %lli clusters.", (long long)nr); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + /* Update the attribute record as well. */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. */ + for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft data attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft data attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: Use the special reserved mft records and ensure that + // this extent is not required to find the mft record in + // question. If no free special records left we would need to + // move an existing record away, insert ours in its place, and + // then place the moved record into the newly allocated space + // and we would then need to update all references to this mft + // record appropriately. This is rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft data attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array of " + "mft data attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft data allocated_size by nr clusters. + * Reflect this in the ntfs_inode structure and the attribute record. + * @rl is the last (non-terminator) runlist element of mft data + * attribute. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, + mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, + ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft data attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + a->data.non_resident.allocated_size = + cpu_to_sle64(mft_ni->allocated_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute.%s", es); + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + ctx->attr->data.non_resident.highest_vcn = + cpu_to_sle64(old_last_vcn - 1); +undo_alloc: + if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to free clusters from mft data " + "attribute.%s", es); + NVolSetErrors(vol); + } + a = ctx->attr; + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { + ntfs_error(vol->sb, "Failed to truncate mft data attribute " + "runlist.%s", es); + NVolSetErrors(vol); + } + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " + "context.%s", es); + NVolSetErrors(vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_record_layout - layout an mft record into a memory buffer + * @vol: volume to which the mft record will belong + * @mft_no: mft reference specifying the mft record number + * @m: destination buffer of size >= @vol->mft_record_size bytes + * + * Layout an empty, unused mft record with the mft record number @mft_no into + * the buffer @m. The volume @vol is needed because the mft record structure + * was modified in NTFS 3.1 so we need to know which volume version this mft + * record will be used on. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, + MFT_RECORD *m) +{ + ATTR_RECORD *a; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + if (mft_no >= (1ll << 32)) { + ntfs_error(vol->sb, "Mft record number 0x%llx exceeds " + "maximum of 2^32.", (long long)mft_no); + return -ERANGE; + } + /* Start by clearing the whole mft record to gives us a clean slate. */ + memset(m, 0, vol->mft_record_size); + /* Aligned to 2-byte boundary. */ + if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver)) + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1); + else { + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1); + /* + * Set the NTFS 3.1+ specific fields while we know that the + * volume version is 3.1+. + */ + m->reserved = 0; + m->mft_record_number = cpu_to_le32((u32)mft_no); + } + m->magic = magic_FILE; + if (vol->mft_record_size >= NTFS_BLOCK_SIZE) + m->usa_count = cpu_to_le16(vol->mft_record_size / + NTFS_BLOCK_SIZE + 1); + else { + m->usa_count = cpu_to_le16(1); + ntfs_warning(vol->sb, "Sector size is bigger than mft record " + "size. Setting usa_count to 1. If chkdsk " + "reports this as corruption, please email " + "linux-ntfs-dev@lists.sourceforge.net stating " + "that you saw this message and that the " + "modified filesystem created was corrupt. " + "Thank you."); + } + /* Set the update sequence number to 1. */ + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); + m->lsn = 0; + m->sequence_number = cpu_to_le16(1); + m->link_count = 0; + /* + * Place the attributes straight after the update sequence array, + * aligned to 8-byte boundary. + */ + m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) + + (le16_to_cpu(m->usa_count) << 1) + 7) & ~7); + m->flags = 0; + /* + * Using attrs_offset plus eight bytes (for the termination attribute). + * attrs_offset is already aligned to 8-byte boundary, so no need to + * align again. + */ + m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8); + m->bytes_allocated = cpu_to_le32(vol->mft_record_size); + m->base_mft_record = 0; + m->next_attr_instance = 0; + /* Add the termination attribute. */ + a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); + a->type = AT_END; + a->length = 0; + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_format - format an mft record on an ntfs volume + * @vol: volume on which to format the mft record + * @mft_no: mft record number to format + * + * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused + * mft record into the appropriate place of the mft data attribute. This is + * used when extending the mft data attribute. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) +{ + loff_t i_size; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + MFT_RECORD *m; + pgoff_t index, end_index; + unsigned int ofs; + int err; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. + */ + index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; + ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* The maximum valid index into the page cache for $MFT's data. */ + i_size = i_size_read(mft_vi); + end_index = i_size >> PAGE_CACHE_SHIFT; + if (unlikely(index >= end_index)) { + if (unlikely(index > end_index || ofs + vol->mft_record_size >= + (i_size & ~PAGE_CACHE_MASK))) { + ntfs_error(vol->sb, "Tried to format non-existing mft " + "record 0x%llx.", (long long)mft_no); + return -ENOENT; + } + } + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing mft record " + "to format 0x%llx.", (long long)mft_no); + return PTR_ERR(page); + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + err = ntfs_mft_record_layout(vol, mft_no, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.", + (long long)mft_no); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + return err; + } + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + /* + * Make sure the mft record is written out to disk. We could use + * ilookup5() to check if an inode is in icache and so on but this is + * unnecessary as ntfs_writepage() will write the dirty record anyway. + */ + mark_ntfs_record_dirty(page, ofs); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume + * @vol: [IN] volume on which to allocate the mft record + * @mode: [IN] mode if want a file or directory, i.e. base inode or 0 + * @base_ni: [IN] open base inode if allocating an extent mft record or NULL + * @mrec: [OUT] on successful return this is the mapped mft record + * + * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol. + * + * If @base_ni is NULL make the mft record a base mft record, i.e. a file or + * direvctory inode, and allocate it at the default allocator position. In + * this case @mode is the file mode as given to us by the caller. We in + * particular use @mode to distinguish whether a file or a directory is being + * created (S_IFDIR(mode) and S_IFREG(mode), respectively). + * + * If @base_ni is not NULL make the allocated mft record an extent record, + * allocate it starting at the mft record after the base mft record and attach + * the allocated and opened ntfs inode to the base inode @base_ni. In this + * case @mode must be 0 as it is meaningless for extent inodes. + * + * You need to check the return value with IS_ERR(). If false, the function + * was successful and the return value is the now opened ntfs inode of the + * allocated mft record. *@mrec is then set to the allocated, mapped, pinned, + * and locked mft record. If IS_ERR() is true, the function failed and the + * error code is obtained from PTR_ERR(return value). *@mrec is undefined in + * this case. + * + * Allocation strategy: + * + * To find a free mft record, we scan the mft bitmap for a zero bit. To + * optimize this we start scanning at the place specified by @base_ni or if + * @base_ni is NULL we start where we last stopped and we perform wrap around + * when we reach the end. Note, we do not try to allocate mft records below + * number 24 because numbers 0 to 15 are the defined system files anyway and 16 + * to 24 are special in that they are used for storing extension mft records + * for the $DATA attribute of $MFT. This is required to avoid the possibility + * of creating a runlist with a circular dependency which once written to disk + * can never be read in again. Windows will only use records 16 to 24 for + * normal files if the volume is completely out of space. We never use them + * which means that when the volume is really out of space we cannot create any + * more files while Windows can still create up to 8 small files. We can start + * doing this at some later time, it does not matter much for now. + * + * When scanning the mft bitmap, we only search up to the last allocated mft + * record. If there are no free records left in the range 24 to number of + * allocated mft records, then we extend the $MFT/$DATA attribute in order to + * create free mft records. We extend the allocated size of $MFT/$DATA by 16 + * records at a time or one cluster, if cluster size is above 16kiB. If there + * is not sufficient space to do this, we try to extend by a single mft record + * or one cluster, if cluster size is above the mft record size. + * + * No matter how many mft records we allocate, we initialize only the first + * allocated mft record, incrementing mft data size and initialized size + * accordingly, open an ntfs_inode for it and return it to the caller, unless + * there are less than 24 mft records, in which case we allocate and initialize + * mft records until we reach record 24 which we consider as the first free mft + * record for use by normal files. + * + * If during any stage we overflow the initialized data in the mft bitmap, we + * extend the initialized size (and data size) by 8 bytes, allocating another + * cluster if required. The bitmap data size has to be at least equal to the + * number of mft records in the mft, but it can be bigger, in which case the + * superflous bits are padded with zeroes. + * + * Thus, when we return successfully (IS_ERR() is false), we will have: + * - initialized / extended the mft bitmap if necessary, + * - initialized / extended the mft data if necessary, + * - set the bit corresponding to the mft record being allocated in the + * mft bitmap, + * - opened an ntfs_inode for the allocated mft record, and we will have + * - returned the ntfs_inode as well as the allocated mapped, pinned, and + * locked mft record. + * + * On error, the volume will be left in a consistent state and no record will + * be allocated. If rolling back a partial operation fails, we may leave some + * inconsistent metadata in which case we set NVolErrors() so the volume is + * left dirty when unmounted. + * + * Note, this function cannot make use of most of the normal functions, like + * for example for attribute resizing, etc, because when the run list overflows + * the base mft record and an attribute list is used, it is very important that + * the extension mft records used to store the $DATA attribute of $MFT can be + * reached without having to read the information contained inside them, as + * this would make it impossible to find them in the first place after the + * volume is unmounted. $MFT/$BITMAP probably does not need to follow this + * rule because the bitmap is not essential for finding the mft records, but on + * the other hand, handling the bitmap in this special way would make life + * easier because otherwise there might be circular invocations of functions + * when reading the bitmap. + */ +ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec) +{ + s64 ll, bit, old_data_initialized, old_data_size; + unsigned long flags; + struct inode *vi; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni, *ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + pgoff_t index; + unsigned int ofs; + int err; + le16 seq_no, usn; + bool record_formatted = false; + + if (base_ni) { + ntfs_debug("Entering (allocating an extent mft record for " + "base mft record 0x%llx).", + (long long)base_ni->mft_no); + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(mode); + } else + ntfs_debug("Entering (allocating a base mft record)."); + if (mode) { + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(base_ni); + /* We only support creation of normal files and directories. */ + if (!S_ISREG(mode) && !S_ISDIR(mode)) + return ERR_PTR(-EOPNOTSUPP); + } + BUG_ON(!mrec); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + down_write(&vol->mftbmp_lock); + bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); + if (bit >= 0) { + ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", + (long long)bit); + goto have_alloc_rec; + } + if (bit != -ENOSPC) { + up_write(&vol->mftbmp_lock); + return ERR_PTR(bit); + } + /* + * No free mft records left. If the mft bitmap already covers more + * than the currently used mft records, the next records are all free, + * so we can simply allocate the first unused mft record. + * Note: We also have to make sure that the mft bitmap at least covers + * the first 24 mft records as they are special and whilst they may not + * be in use, we do not allocate from them. + */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->initialized_size >> vol->mft_record_size_bits; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_initialized = mftbmp_ni->initialized_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized << 3 > ll && old_data_initialized > 3) { + bit = ll; + if (bit < 24) + bit = 24; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + ntfs_debug("Found free record (#2), bit 0x%llx.", + (long long)bit); + goto found_free_rec; + } + /* + * The mft bitmap needs to be expanded until it covers the first unused + * mft record that we can allocate. + * Note: The smallest mft record we allocate is mft record 24. + */ + bit = old_data_initialized << 3; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = mftbmp_ni->allocated_size; + ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)old_data_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)old_data_initialized); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized + 8 > old_data_size) { + /* Need to extend bitmap by one more cluster. */ + ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); + err = ntfs_mft_bitmap_extend_allocation_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + } + /* + * We now have sufficient allocated space, extend the initialized_size + * as well as the data_size if necessary and fill the new space with + * zeroes. + */ + err = ntfs_mft_bitmap_extend_initialized_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after initialized extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); +found_free_rec: + /* @bit is the found free mft record, allocate it in the mft bitmap. */ + ntfs_debug("At found_free_rec."); + err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); + up_write(&vol->mftbmp_lock); + goto err_out; + } + ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); +have_alloc_rec: + /* + * The mft bitmap is now uptodate. Deal with mft data attribute now. + * Note, we keep hold of the mft bitmap lock for writing until all + * modifications to the mft data attribute are complete, too, as they + * will impact decisions for mft bitmap and mft record allocation done + * by a parallel allocation and if the lock is not maintained a + * parallel allocation could allocate the same mft record as this one. + */ + ll = (bit + 1) << vol->mft_record_size_bits; + read_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (ll <= old_data_initialized) { + ntfs_debug("Allocated mft record already initialized."); + goto mft_rec_already_initialized; + } + ntfs_debug("Initializing allocated mft record."); + /* + * The mft record is outside the initialized data. Extend the mft data + * attribute until it covers the allocated record. The loop is only + * actually traversed more than once when a freshly formatted volume is + * first written to so it optimizes away nicely in the common case. + */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data before extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + while (ll > mft_ni->allocated_size) { + read_unlock_irqrestore(&mft_ni->size_lock, flags); + err = ntfs_mft_data_extend_allocation_nolock(vol); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to extend mft data " + "allocation."); + goto undo_mftbmp_alloc_nolock; + } + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + } + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* + * Extend mft data initialized size (and data size of course) to reach + * the allocated mft record, formatting the mft records allong the way. + * Note: We only modify the ntfs_inode structure as that is all that is + * needed by ntfs_mft_record_format(). We will update the attribute + * record itself in one fell swoop later on. + */ + write_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + old_data_size = vol->mft_ino->i_size; + while (ll > mft_ni->initialized_size) { + s64 new_initialized_size, mft_no; + + new_initialized_size = mft_ni->initialized_size + + vol->mft_record_size; + mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; + if (new_initialized_size > i_size_read(vol->mft_ino)) + i_size_write(vol->mft_ino, new_initialized_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_debug("Initializing mft record 0x%llx.", + (long long)mft_no); + err = ntfs_mft_record_format(vol, mft_no); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to format mft record."); + goto undo_data_init; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = new_initialized_size; + } + write_unlock_irqrestore(&mft_ni->size_lock, flags); + record_formatted = true; + /* Update the mft data attribute record to reflect the new sizes. */ + m = map_mft_record(mft_ni); + if (IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to map mft record."); + err = PTR_ERR(m); + goto undo_data_init; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + err = -ENOMEM; + unmap_mft_record(mft_ni); + goto undo_data_init; + } + err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft data attribute."); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + goto undo_data_init; + } + a = ctx->attr; + read_lock_irqsave(&mft_ni->size_lock, flags); + a->data.non_resident.initialized_size = + cpu_to_sle64(mft_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after mft record initialization: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); + BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); +mft_rec_already_initialized: + /* + * We can finally drop the mft bitmap lock as the mft data attribute + * has been fully updated. The only disparity left is that the + * allocated mft record still needs to be marked as in use to match the + * set bit in the mft bitmap but this is actually not a problem since + * this mft record is not referenced from anywhere yet and the fact + * that it is allocated in the mft bitmap means that no-one will try to + * allocate it either. + */ + up_write(&vol->mftbmp_lock); + /* + * We now have allocated and initialized the mft record. Calculate the + * index of and the offset within the page cache page the record is in. + */ + index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; + ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(vol->mft_ino->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing allocated " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(page); + goto undo_mftbmp_alloc; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + /* If we just formatted the mft record no need to do it again. */ + if (!record_formatted) { + /* Sanity check that the mft record is really not in use. */ + if (ntfs_is_file_record(m->magic) && + (m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vol->sb, "Mft record 0x%llx was marked " + "free in mft bitmap but is marked " + "used itself. Corrupt filesystem. " + "Unmount and run chkdsk.", + (long long)bit); + err = -EIO; + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + NVolSetErrors(vol); + goto undo_mftbmp_alloc; + } + /* + * We need to (re-)format the mft record, preserving the + * sequence number if it is not zero as well as the update + * sequence number if it is not zero or -1 (0xffff). This + * means we do not need to care whether or not something went + * wrong with the previous mft record. + */ + seq_no = m->sequence_number; + usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); + err = ntfs_mft_record_layout(vol, bit, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout allocated mft " + "record 0x%llx.", (long long)bit); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + if (seq_no) + m->sequence_number = seq_no; + if (usn && le16_to_cpu(usn) != 0xffff) + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; + } + /* Set the mft record itself in use. */ + m->flags |= MFT_RECORD_IN_USE; + if (S_ISDIR(mode)) + m->flags |= MFT_RECORD_IS_DIRECTORY; + flush_dcache_page(page); + SetPageUptodate(page); + if (base_ni) { + MFT_RECORD *m_tmp; + + /* + * Setup the base mft record in the extent mft record. This + * completes initialization of the allocated extent mft record + * and we can simply use it with map_extent_mft_record(). + */ + m->base_mft_record = MK_LE_MREF(base_ni->mft_no, + base_ni->seq_no); + /* + * Allocate an extent inode structure for the new mft record, + * attach it to the base inode @base_ni and map, pin, and lock + * its, i.e. the allocated, mft record. + */ + m_tmp = map_extent_mft_record(base_ni, bit, &ni); + if (IS_ERR(m_tmp)) { + ntfs_error(vol->sb, "Failed to map allocated extent " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(m_tmp); + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + BUG_ON(m != m_tmp); + /* + * Make sure the allocated mft record is written out to disk. + * No need to set the inode dirty because the caller is going + * to do that anyway after finishing with the new extent mft + * record (e.g. at a minimum a new attribute will be added to + * the mft record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + /* + * Need to unmap the page since map_extent_mft_record() mapped + * it as well so we have it mapped twice at the moment. + */ + ntfs_unmap_page(page); + } else { + /* + * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink + * is set to 1 but the mft record->link_count is 0. The caller + * needs to bear this in mind. + */ + vi = new_inode(vol->sb); + if (unlikely(!vi)) { + err = -ENOMEM; + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + vi->i_ino = bit; + /* + * This is for checking whether an inode has changed w.r.t. a + * file so that the file can be updated if necessary (compare + * with f_version). + */ + vi->i_version = 1; + + /* The owner and group come from the ntfs volume. */ + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + /* + * Set the appropriate mode, attribute type, and name. For + * directories, also setup the index values to the defaults. + */ + if (S_ISDIR(mode)) { + vi->i_mode = S_IFDIR | S_IRWXUGO; + vi->i_mode &= ~vol->dmask; + + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + ni->itype.index.block_size = 4096; + ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1; + ni->itype.index.collation_rule = COLLATION_FILE_NAME; + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = + vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = + vol->sector_size_bits; + } + } else { + vi->i_mode = S_IFREG | S_IRWXUGO; + vi->i_mode &= ~vol->fmask; + + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + } + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + + /* Set the inode times to the current time. */ + vi->i_atime = vi->i_mtime = vi->i_ctime = + current_fs_time(vi->i_sb); + /* + * Set the file size to 0, the ntfs inode sizes are set to 0 by + * the call to ntfs_init_big_inode() below. + */ + vi->i_size = 0; + vi->i_blocks = 0; + + /* Set the sequence number. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + /* + * Manually map, pin, and lock the mft record as we already + * have its page mapped and it is very easy to do. + */ + atomic_inc(&ni->count); + mutex_lock(&ni->mrec_lock); + ni->page = page; + ni->page_ofs = ofs; + /* + * Make sure the allocated mft record is written out to disk. + * NOTE: We do not set the ntfs inode dirty because this would + * fail in ntfs_write_inode() because the inode does not have a + * standard information attribute yet. Also, there is no need + * to set the inode dirty because the caller is going to do + * that anyway after finishing with the new mft record (e.g. at + * a minimum some new attributes will be added to the mft + * record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + + /* Add the inode to the inode hash for the superblock. */ + insert_inode_hash(vi); + + /* Update the default mft allocation position. */ + vol->mft_data_pos = bit + 1; + } + /* + * Return the opened, allocated inode of the allocated mft record as + * well as the mapped, pinned, and locked mft record. + */ + ntfs_debug("Returning opened, allocated %sinode 0x%llx.", + base_ni ? "extent " : "", (long long)bit); + *mrec = m; + return ni; +undo_data_init: + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = old_data_initialized; + i_size_write(vol->mft_ino, old_data_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + goto undo_mftbmp_alloc_nolock; +undo_mftbmp_alloc: + down_write(&vol->mftbmp_lock); +undo_mftbmp_alloc_nolock: + if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) { + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->mftbmp_lock); +err_out: + return ERR_PTR(err); +max_err_out: + ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum " + "number of inodes (2^32) has already been reached."); + up_write(&vol->mftbmp_lock); + return ERR_PTR(-ENOSPC); +} + +/** + * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume + * @ni: ntfs inode of the mapped extent mft record to free + * @m: mapped extent mft record of the ntfs inode @ni + * + * Free the mapped extent mft record @m of the extent ntfs inode @ni. + * + * Note that this function unmaps the mft record and closes and destroys @ni + * internally and hence you cannot use either @ni nor @m any more after this + * function returns success. + * + * On success return 0 and on error return -errno. @ni and @m are still valid + * in this case and have not been freed. + * + * For some errors an error message is displayed and the success code 0 is + * returned and the volume is then left dirty on umount. This makes sense in + * case we could not rollback the changes that were already done since the + * caller no longer wants to reference this mft record so it does not matter to + * the caller if something is wrong with it as long as it is properly detached + * from the base inode. + */ +int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) +{ + unsigned long mft_no = ni->mft_no; + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + ntfs_inode **extent_nis; + int i, err; + le16 old_seq_no; + u16 seq_no; + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + + mutex_lock(&ni->extent_lock); + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + + BUG_ON(base_ni->nr_extents <= 0); + + ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n", + mft_no, base_ni->mft_no); + + mutex_lock(&base_ni->extent_lock); + + /* Make sure we are holding the only reference to the extent inode. */ + if (atomic_read(&ni->count) > 2) { + ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, " + "not freeing.", base_ni->mft_no); + mutex_unlock(&base_ni->extent_lock); + return -EBUSY; + } + + /* Dissociate the ntfs inode from the base inode. */ + extent_nis = base_ni->ext.extent_ntfs_inos; + err = -ENOENT; + for (i = 0; i < base_ni->nr_extents; i++) { + if (ni != extent_nis[i]) + continue; + extent_nis += i; + base_ni->nr_extents--; + memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) * + sizeof(ntfs_inode*)); + err = 0; + break; + } + + mutex_unlock(&base_ni->extent_lock); + + if (unlikely(err)) { + ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to " + "its base inode 0x%lx.", mft_no, + base_ni->mft_no); + BUG(); + } + + /* + * The extent inode is no longer attached to the base inode so no one + * can get a reference to it any more. + */ + + /* Mark the mft record as not in use. */ + m->flags &= ~MFT_RECORD_IN_USE; + + /* Increment the sequence number, skipping zero, if it is not zero. */ + old_seq_no = m->sequence_number; + seq_no = le16_to_cpu(old_seq_no); + if (seq_no == 0xffff) + seq_no = 1; + else if (seq_no) + seq_no++; + m->sequence_number = cpu_to_le16(seq_no); + + /* + * Set the ntfs inode dirty and write it out. We do not need to worry + * about the base inode here since whatever caused the extent mft + * record to be freed is guaranteed to do it already. + */ + NInoSetDirty(ni); + err = write_mft_record(ni, m, 0); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not " + "freeing.", mft_no); + goto rollback; + } +rollback_error: + /* Unmap and throw away the now freed extent inode. */ + unmap_extent_mft_record(ni); + ntfs_clear_extent_inode(ni); + + /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */ + down_write(&vol->mftbmp_lock); + err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no); + up_write(&vol->mftbmp_lock); + if (unlikely(err)) { + /* + * The extent inode is gone but we failed to deallocate it in + * the mft bitmap. Just emit a warning and leave the volume + * dirty on umount. + */ + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + return 0; +rollback: + /* Rollback what we did... */ + mutex_lock(&base_ni->extent_lock); + extent_nis = base_ni->ext.extent_ntfs_inos; + if (!(base_ni->nr_extents & 3)) { + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); + + extent_nis = kmalloc(new_size, GFP_NOFS); + if (unlikely(!extent_nis)) { + ntfs_error(vol->sb, "Failed to allocate internal " + "buffer during rollback.%s", es); + mutex_unlock(&base_ni->extent_lock); + NVolSetErrors(vol); + goto rollback_error; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, + new_size - 4 * sizeof(ntfs_inode*)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = extent_nis; + } + m->flags |= MFT_RECORD_IN_USE; + m->sequence_number = old_seq_no; + extent_nis[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + mark_mft_record_dirty(ni); + return err; +} +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/mft.h b/kernel/fs/ntfs/mft.h new file mode 100644 index 000000000..b52bf87b9 --- /dev/null +++ b/kernel/fs/ntfs/mft.h @@ -0,0 +1,124 @@ +/* + * mft.h - Defines for mft record handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_MFT_H +#define _LINUX_NTFS_MFT_H + +#include +#include +#include + +#include "inode.h" + +extern MFT_RECORD *map_mft_record(ntfs_inode *ni); +extern void unmap_mft_record(ntfs_inode *ni); + +extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino); + +static inline void unmap_extent_mft_record(ntfs_inode *ni) +{ + unmap_mft_record(ni); + return; +} + +#ifdef NTFS_RW + +/** + * flush_dcache_mft_record_page - flush_dcache_page() for mft records + * @ni: ntfs inode structure of mft record + * + * Call flush_dcache_page() for the page in which an mft record resides. + * + * This must be called every time an mft record is modified, just after the + * modification. + */ +static inline void flush_dcache_mft_record_page(ntfs_inode *ni) +{ + flush_dcache_page(ni->page); +} + +extern void __mark_mft_record_dirty(ntfs_inode *ni); + +/** + * mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: Do not do anything if the mft record is already marked dirty. + */ +static inline void mark_mft_record_dirty(ntfs_inode *ni) +{ + if (!NInoTestSetDirty(ni)) + __mark_mft_record_dirty(ni); +} + +extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync); + +extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); + +/** + * write_mft_record - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * This is just a wrapper for write_mft_record_nolock() (see mft.c), which + * locks the page for the duration of the write. This ensures that there are + * no race conditions between writing the mft record via the dirty inode code + * paths and via the page cache write back code paths or between writing + * neighbouring mft records residing in the same page. + * + * Locking the page also serializes us against ->readpage() if the page is not + * uptodate. + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + */ +static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + struct page *page = ni->page; + int err; + + BUG_ON(!page); + lock_page(page); + err = write_mft_record_nolock(ni, m, sync); + unlock_page(page); + return err; +} + +extern bool ntfs_may_write_mft_record(ntfs_volume *vol, + const unsigned long mft_no, const MFT_RECORD *m, + ntfs_inode **locked_ni); + +extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec); +extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_MFT_H */ diff --git a/kernel/fs/ntfs/mst.c b/kernel/fs/ntfs/mst.c new file mode 100644 index 000000000..5a858d839 --- /dev/null +++ b/kernel/fs/ntfs/mst.c @@ -0,0 +1,203 @@ +/* + * mst.c - NTFS multi sector transfer protection handling code. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "ntfs.h" + +/** + * post_read_mst_fixup - deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * @size: size in bytes of @b + * + * Perform the necessary post read multi sector transfer fixup and detect the + * presence of incomplete multi sector transfers. - In that case, overwrite the + * magic of the ntfs record header being processed with "BAAD" (in memory only!) + * and abort processing. + * + * Return 0 on success and -EINVAL on error ("BAAD" magic will be present). + * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not protected at all and hence doesn't need to + * be fixed up. Thus, we return success and not failure in this case. This is + * in contrast to pre_write_mst_fixup(), see below. + */ +int post_read_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + u16 usa_ofs, usa_count, usn; + u16 *usa_pos, *data_pos; + + /* Setup the variables. */ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return 0; + /* Position of usn in update sequence array. */ + usa_pos = (u16*)b + usa_ofs/sizeof(u16); + /* + * The update sequence number which has to be equal to each of the + * u16 values before they are fixed up. Note no need to care for + * endianness since we are comparing and moving data for on disk + * structures which means the data is consistent. - If it is + * consistenty the wrong endianness it doesn't make any difference. + */ + usn = *usa_pos; + /* + * Position in protected data of first u16 that needs fixing up. + */ + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* + * Check for incomplete multi sector transfer(s). + */ + while (usa_count--) { + if (*data_pos != usn) { + /* + * Incomplete multi sector transfer detected! )-: + * Set the magic to "BAAD" and return failure. + * Note that magic_BAAD is already converted to le32. + */ + b->magic = magic_BAAD; + return -EINVAL; + } + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + /* Re-setup the variables. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + return 0; +} + +/** + * pre_write_mst_fixup - apply multi sector transfer protection + * @b: pointer to the data to protect + * @size: size in bytes of @b + * + * Perform the necessary pre write multi sector transfer fixup on the data + * pointer to by @b of @size. + * + * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed + * (assumed not needed). This is in contrast to post_read_mst_fixup() above. + * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not subject to protection and hence doesn't need + * to be fixed up. This means that you have to create a valid update sequence + * array header in the ntfs record before calling this function, otherwise it + * will fail (the header needs to contain the position of the update sequence + * array together with the number of elements in the array). You also need to + * initialise the update sequence number before calling this function + * otherwise a random word will be used (whatever was in the record at that + * position at that time). + */ +int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + le16 *usa_pos, *data_pos; + u16 usa_ofs, usa_count, usn; + le16 le_usn; + + /* Sanity check + only fixup if it makes sense. */ + if (!b || ntfs_is_baad_record(b->magic) || + ntfs_is_hole_record(b->magic)) + return -EINVAL; + /* Setup the variables. */ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return -EINVAL; + /* Position of usn in update sequence array. */ + usa_pos = (le16*)((u8*)b + usa_ofs); + /* + * Cyclically increment the update sequence number + * (skipping 0 and -1, i.e. 0xffff). + */ + usn = le16_to_cpup(usa_pos) + 1; + if (usn == 0xffff || !usn) + usn = 1; + le_usn = cpu_to_le16(usn); + *usa_pos = le_usn; + /* Position in data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment the position in the usa and save the + * original data from the data buffer into the usa. + */ + *(++usa_pos) = *data_pos; + /* Apply fixup to data. */ + *data_pos = le_usn; + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } + return 0; +} + +/** + * post_write_mst_fixup - fast deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * + * Perform the necessary post write multi sector transfer fixup, not checking + * for any errors, because we assume we have just used pre_write_mst_fixup(), + * thus the data will be fine or we would never have gotten here. + */ +void post_write_mst_fixup(NTFS_RECORD *b) +{ + le16 *usa_pos, *data_pos; + + u16 usa_ofs = le16_to_cpu(b->usa_ofs); + u16 usa_count = le16_to_cpu(b->usa_count) - 1; + + /* Position of usn in update sequence array. */ + usa_pos = (le16*)b + usa_ofs/sizeof(le16); + + /* Position in protected data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } +} diff --git a/kernel/fs/ntfs/namei.c b/kernel/fs/ntfs/namei.c new file mode 100644 index 000000000..0f35b80d1 --- /dev/null +++ b/kernel/fs/ntfs/namei.c @@ -0,0 +1,405 @@ +/* + * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include + +#include "attrib.h" +#include "debug.h" +#include "dir.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_lookup - find the inode represented by a dentry in a directory inode + * @dir_ino: directory inode in which to look for the inode + * @dent: dentry representing the inode to look for + * @nd: lookup nameidata + * + * In short, ntfs_lookup() looks for the inode represented by the dentry @dent + * in the directory inode @dir_ino and if found attaches the inode to the + * dentry @dent. + * + * In more detail, the dentry @dent specifies which inode to look for by + * supplying the name of the inode in @dent->d_name.name. ntfs_lookup() + * converts the name to Unicode and walks the contents of the directory inode + * @dir_ino looking for the converted Unicode name. If the name is found in the + * directory, the corresponding inode is loaded by calling ntfs_iget() on its + * inode number and the inode is associated with the dentry @dent via a call to + * d_splice_alias(). + * + * If the name is not found in the directory, a NULL inode is inserted into the + * dentry @dent via a call to d_add(). The dentry is then termed a negative + * dentry. + * + * Only if an actual error occurs, do we return an error via ERR_PTR(). + * + * In order to handle the case insensitivity issues of NTFS with regards to the + * dcache and the dcache requiring only one dentry per directory, we deal with + * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining + * a case sensitive dcache. This means that we get the full benefit of dcache + * speed when the file/directory is looked up with the same case as returned by + * ->ntfs_readdir() but that a lookup for any other case (or for the short file + * name) will not find anything in dcache and will enter ->ntfs_lookup() + * instead, where we search the directory for a fully matching file name + * (including case) and if that is not found, we search for a file name that + * matches with different case and if that has non-POSIX semantics we return + * that. We actually do only one search (case sensitive) and keep tabs on + * whether we have found a case insensitive match in the process. + * + * To simplify matters for us, we do not treat the short vs long filenames as + * two hard links but instead if the lookup matches a short filename, we + * return the dentry for the corresponding long filename instead. + * + * There are three cases we need to distinguish here: + * + * 1) @dent perfectly matches (i.e. including case) a directory entry with a + * file name in the WIN32 or POSIX namespaces. In this case + * ntfs_lookup_inode_by_name() will return with name set to NULL and we + * just d_splice_alias() @dent. + * 2) @dent matches (not including case) a directory entry with a file name in + * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return + * with name set to point to a kmalloc()ed ntfs_name structure containing + * the properly cased little endian Unicode name. We convert the name to the + * current NLS code page, search if a dentry with this name already exists + * and if so return that instead of @dent. At this point things are + * complicated by the possibility of 'disconnected' dentries due to NFS + * which we deal with appropriately (see the code comments). The VFS will + * then destroy the old @dent and use the one we returned. If a dentry is + * not found, we allocate a new one, d_splice_alias() it, and return it as + * above. + * 3) @dent matches either perfectly or not (i.e. we don't care about case) a + * directory entry with a file name in the DOS namespace. In this case + * ntfs_lookup_inode_by_name() will return with name set to point to a + * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian) + * of the inode. We use the mft reference to read the inode and to find the + * file name in the WIN32 namespace corresponding to the matched short file + * name. We then convert the name to the current NLS code page, and proceed + * searching for a dentry with this name, etc, as in case 2), above. + * + * Locking: Caller must hold i_mutex on the directory. + */ +static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, + unsigned int flags) +{ + ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); + struct inode *dent_inode; + ntfschar *uname; + ntfs_name *name = NULL; + MFT_REF mref; + unsigned long dent_ino; + int uname_len; + + ntfs_debug("Looking up %pd in directory inode 0x%lx.", + dent, dir_ino->i_ino); + /* Convert the name of the dentry to Unicode. */ + uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, + &uname); + if (uname_len < 0) { + if (uname_len != -ENAMETOOLONG) + ntfs_error(vol->sb, "Failed to convert name to " + "Unicode."); + return ERR_PTR(uname_len); + } + mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len, + &name); + kmem_cache_free(ntfs_name_cache, uname); + if (!IS_ERR_MREF(mref)) { + dent_ino = MREF(mref); + ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); + dent_inode = ntfs_iget(vol->sb, dent_ino); + if (likely(!IS_ERR(dent_inode))) { + /* Consistency check. */ + if (is_bad_inode(dent_inode) || MSEQNO(mref) == + NTFS_I(dent_inode)->seq_no || + dent_ino == FILE_MFT) { + /* Perfect WIN32/POSIX match. -- Case 1. */ + if (!name) { + ntfs_debug("Done. (Case 1.)"); + return d_splice_alias(dent_inode, dent); + } + /* + * We are too indented. Handle imperfect + * matches and short file names further below. + */ + goto handle_name; + } + ntfs_error(vol->sb, "Found stale reference to inode " + "0x%lx (reference sequence number = " + "0x%x, inode sequence number = 0x%x), " + "returning -EIO. Run chkdsk.", + dent_ino, MSEQNO(mref), + NTFS_I(dent_inode)->seq_no); + iput(dent_inode); + dent_inode = ERR_PTR(-EIO); + } else + ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " + "error code %li.", dent_ino, + PTR_ERR(dent_inode)); + kfree(name); + /* Return the error code. */ + return (struct dentry *)dent_inode; + } + /* It is guaranteed that @name is no longer allocated at this point. */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("Entry was not found, adding negative dentry."); + /* The dcache will handle negative entries. */ + d_add(dent, NULL); + ntfs_debug("Done."); + return NULL; + } + ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error " + "code %i.", -MREF_ERR(mref)); + return ERR_PTR(MREF_ERR(mref)); + // TODO: Consider moving this lot to a separate function! (AIA) +handle_name: + { + MFT_RECORD *m; + ntfs_attr_search_ctx *ctx; + ntfs_inode *ni = NTFS_I(dent_inode); + int err; + struct qstr nls_name; + + nls_name.name = NULL; + if (name->type != FILE_NAME_DOS) { /* Case 2. */ + ntfs_debug("Case 2."); + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&name->name, name->len, + (unsigned char**)&nls_name.name, 0); + kfree(name); + } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */ + FILE_NAME_ATTR *fn; + + ntfs_debug("Case 3."); + kfree(name); + + /* Find the WIN32 name corresponding to the matched DOS name. */ + ni = NTFS_I(dent_inode); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + do { + ATTR_RECORD *a; + u32 val_len; + + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, + NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Inode corrupt: No WIN32 " + "namespace counterpart to DOS " + "file name. Run chkdsk."); + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + /* Consistency checks. */ + a = ctx->attr; + if (a->non_resident || a->flags) + goto eio_err_out; + val_len = le32_to_cpu(a->data.resident.value_length); + if (le16_to_cpu(a->data.resident.value_offset) + + val_len > le32_to_cpu(a->length)) + goto eio_err_out; + fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu( + ctx->attr->data.resident.value_offset)); + if ((u32)(fn->file_name_length * sizeof(ntfschar) + + sizeof(FILE_NAME_ATTR)) > val_len) + goto eio_err_out; + } while (fn->file_name_type != FILE_NAME_WIN32); + + /* Convert the found WIN32 name to current NLS code page. */ + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&fn->file_name, fn->file_name_length, + (unsigned char**)&nls_name.name, 0); + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + } + m = NULL; + ctx = NULL; + + /* Check if a conversion error occurred. */ + if ((signed)nls_name.len < 0) { + err = (signed)nls_name.len; + goto err_out; + } + nls_name.hash = full_name_hash(nls_name.name, nls_name.len); + + dent = d_add_ci(dent, dent_inode, &nls_name); + kfree(nls_name.name); + return dent; + +eio_err_out: + ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); + err = -EIO; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); + iput(dent_inode); + ntfs_error(vol->sb, "Failed, returning error code %i.", err); + return ERR_PTR(err); + } +} + +/** + * Inode operations for directories. + */ +const struct inode_operations ntfs_dir_inode_ops = { + .lookup = ntfs_lookup, /* VFS: Lookup directory. */ +}; + +/** + * ntfs_get_parent - find the dentry of the parent of a given directory dentry + * @child_dent: dentry of the directory whose parent directory to find + * + * Find the dentry for the parent directory of the directory specified by the + * dentry @child_dent. This function is called from + * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the + * default ->decode_fh() which is export_decode_fh() in the same file. + * + * The code is based on the ext3 ->get_parent() implementation found in + * fs/ext3/namei.c::ext3_get_parent(). + * + * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. + * + * Return the dentry of the parent directory on success or the error code on + * error (IS_ERR() is true). + */ +static struct dentry *ntfs_get_parent(struct dentry *child_dent) +{ + struct inode *vi = d_inode(child_dent); + ntfs_inode *ni = NTFS_I(vi); + MFT_RECORD *mrec; + ntfs_attr_search_ctx *ctx; + ATTR_RECORD *attr; + FILE_NAME_ATTR *fn; + unsigned long parent_ino; + int err; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + /* Get the mft record of the inode belonging to the child dentry. */ + mrec = map_mft_record(ni); + if (IS_ERR(mrec)) + return (struct dentry *)mrec; + /* Find the first file name attribute in the mft record. */ + ctx = ntfs_attr_get_search_ctx(ni, mrec); + if (unlikely(!ctx)) { + unmap_mft_record(ni); + return ERR_PTR(-ENOMEM); + } +try_next: + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + if (err == -ENOENT) + ntfs_error(vi->i_sb, "Inode 0x%lx does not have a " + "file name attribute. Run chkdsk.", + vi->i_ino); + return ERR_PTR(err); + } + attr = ctx->attr; + if (unlikely(attr->non_resident)) + goto try_next; + fn = (FILE_NAME_ATTR *)((u8 *)attr + + le16_to_cpu(attr->data.resident.value_offset)); + if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) > + (u8*)attr + le32_to_cpu(attr->length))) + goto try_next; + /* Get the inode number of the parent directory. */ + parent_ino = MREF_LE(fn->parent_directory); + /* Release the search context and the mft record of the child. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + + return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); +} + +static struct inode *ntfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + inode = ntfs_iget(sb, ino); + if (!IS_ERR(inode)) { + if (is_bad_inode(inode) || inode->i_generation != generation) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } + } + + return inode; +} + +static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +/** + * Export operations allowing NFS exporting of mounted NTFS partitions. + * + * We use the default ->encode_fh() for now. Note that they + * use 32 bits to store the inode number which is an unsigned long so on 64-bit + * architectures is usually 64 bits so it would all fail horribly on huge + * volumes. I guess we need to define our own encode and decode fh functions + * that store 64-bit inode numbers at some point but for now we will ignore the + * problem... + * + * We also use the default ->get_name() helper (used by ->decode_fh() via + * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs + * independent. + * + * The default ->get_parent() just returns -EACCES so we have to provide our + * own and the default ->get_dentry() is incompatible with NTFS due to not + * allowing the inode number 0 which is used in NTFS for the system file $MFT + * and due to using iget() whereas NTFS needs ntfs_iget(). + */ +const struct export_operations ntfs_export_ops = { + .get_parent = ntfs_get_parent, /* Find the parent of a given + directory. */ + .fh_to_dentry = ntfs_fh_to_dentry, + .fh_to_parent = ntfs_fh_to_parent, +}; diff --git a/kernel/fs/ntfs/ntfs.h b/kernel/fs/ntfs/ntfs.h new file mode 100644 index 000000000..c581e26a3 --- /dev/null +++ b/kernel/fs/ntfs/ntfs.h @@ -0,0 +1,164 @@ +/* + * ntfs.h - Defines for NTFS Linux kernel driver. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + * Copyright (C) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_H +#define _LINUX_NTFS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "volume.h" +#include "layout.h" + +typedef enum { + NTFS_BLOCK_SIZE = 512, + NTFS_BLOCK_SIZE_BITS = 9, + NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ + NTFS_MAX_NAME_LEN = 255, + NTFS_MAX_ATTR_NAME_LEN = 255, + NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ + NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE, +} NTFS_CONSTANTS; + +/* Global variables. */ + +/* Slab caches (from super.c). */ +extern struct kmem_cache *ntfs_name_cache; +extern struct kmem_cache *ntfs_inode_cache; +extern struct kmem_cache *ntfs_big_inode_cache; +extern struct kmem_cache *ntfs_attr_ctx_cache; +extern struct kmem_cache *ntfs_index_ctx_cache; + +/* The various operations structs defined throughout the driver files. */ +extern const struct address_space_operations ntfs_normal_aops; +extern const struct address_space_operations ntfs_compressed_aops; +extern const struct address_space_operations ntfs_mst_aops; + +extern const struct file_operations ntfs_file_ops; +extern const struct inode_operations ntfs_file_inode_ops; + +extern const struct file_operations ntfs_dir_ops; +extern const struct inode_operations ntfs_dir_inode_ops; + +extern const struct file_operations ntfs_empty_file_ops; +extern const struct inode_operations ntfs_empty_inode_ops; + +extern const struct export_operations ntfs_export_ops; + +/** + * NTFS_SB - return the ntfs volume given a vfs super block + * @sb: VFS super block + * + * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. + */ +static inline ntfs_volume *NTFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* Declarations of functions and global variables. */ + +/* From fs/ntfs/compress.c */ +extern int ntfs_read_compressed_block(struct page *page); +extern int allocate_compression_buffers(void); +extern void free_compression_buffers(void); + +/* From fs/ntfs/super.c */ +#define default_upcase_len 0x10000 +extern struct mutex ntfs_lock; + +typedef struct { + int val; + char *str; +} option_t; +extern const option_t on_errors_arr[]; + +/* From fs/ntfs/mst.c */ +extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); +extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); +extern void post_write_mst_fixup(NTFS_RECORD *b); + +/* From fs/ntfs/unistr.c */ +extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, + const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size); +extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n); +extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size); +extern void ntfs_upcase_name(ntfschar *name, u32 name_len, + const ntfschar *upcase, const u32 upcase_len); +extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs); +extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len); + +/* From fs/ntfs/upcase.c */ +extern ntfschar *generate_default_upcase(void); + +static inline int ntfs_ffs(int x) +{ + int r = 1; + + if (!x) + return 0; + if (!(x & 0xffff)) { + x >>= 16; + r += 16; + } + if (!(x & 0xff)) { + x >>= 8; + r += 8; + } + if (!(x & 0xf)) { + x >>= 4; + r += 4; + } + if (!(x & 3)) { + x >>= 2; + r += 2; + } + if (!(x & 1)) { + x >>= 1; + r += 1; + } + return r; +} + +#endif /* _LINUX_NTFS_H */ diff --git a/kernel/fs/ntfs/quota.c b/kernel/fs/ntfs/quota.c new file mode 100644 index 000000000..d80e3315c --- /dev/null +++ b/kernel/fs/ntfs/quota.c @@ -0,0 +1,117 @@ +/* + * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include "index.h" +#include "quota.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume + * @vol: ntfs volume on which to mark the quotas out of date + * + * Mark the quotas out of date on the ntfs volume @vol and return 'true' on + * success and 'false' on error. + */ +bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) +{ + ntfs_index_context *ictx; + QUOTA_CONTROL_ENTRY *qce; + const le32 qid = QUOTA_DEFAULTS_ID; + int err; + + ntfs_debug("Entering."); + if (NVolQuotaOutOfDate(vol)) + goto done; + if (!vol->quota_ino || !vol->quota_q_ino) { + ntfs_error(vol->sb, "Quota inodes are not open."); + return false; + } + mutex_lock(&vol->quota_q_ino->i_mutex); + ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); + if (!ictx) { + ntfs_error(vol->sb, "Failed to get index context."); + goto err_out; + } + err = ntfs_index_lookup(&qid, sizeof(qid), ictx); + if (err) { + if (err == -ENOENT) + ntfs_error(vol->sb, "Quota defaults entry is not " + "present."); + else + ntfs_error(vol->sb, "Lookup of quota defaults entry " + "failed."); + goto err_out; + } + if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { + ntfs_error(vol->sb, "Quota defaults entry size is invalid. " + "Run chkdsk."); + goto err_out; + } + qce = (QUOTA_CONTROL_ENTRY*)ictx->data; + if (le32_to_cpu(qce->version) != QUOTA_VERSION) { + ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " + "supported.", le32_to_cpu(qce->version)); + goto err_out; + } + ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); + /* If quotas are already marked out of date, no need to do anything. */ + if (qce->flags & QUOTA_FLAG_OUT_OF_DATE) + goto set_done; + /* + * If quota tracking is neither requested, nor enabled and there are no + * pending deletes, no need to mark the quotas out of date. + */ + if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED | + QUOTA_FLAG_TRACKING_REQUESTED | + QUOTA_FLAG_PENDING_DELETES))) + goto set_done; + /* + * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date. + * This is verified on WinXP to be sufficient to cause windows to + * rescan the volume on boot and update all quota entries. + */ + qce->flags |= QUOTA_FLAG_OUT_OF_DATE; + /* Ensure the modified flags are written to disk. */ + ntfs_index_entry_flush_dcache_page(ictx); + ntfs_index_entry_mark_dirty(ictx); +set_done: + ntfs_index_ctx_put(ictx); + mutex_unlock(&vol->quota_q_ino->i_mutex); + /* + * We set the flag so we do not try to mark the quotas out of date + * again on remount. + */ + NVolSetQuotaOutOfDate(vol); +done: + ntfs_debug("Done."); + return true; +err_out: + if (ictx) + ntfs_index_ctx_put(ictx); + mutex_unlock(&vol->quota_q_ino->i_mutex); + return false; +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/quota.h b/kernel/fs/ntfs/quota.h new file mode 100644 index 000000000..4cbe5594c --- /dev/null +++ b/kernel/fs/ntfs/quota.h @@ -0,0 +1,35 @@ +/* + * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_QUOTA_H +#define _LINUX_NTFS_QUOTA_H + +#ifdef NTFS_RW + +#include "types.h" +#include "volume.h" + +extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/kernel/fs/ntfs/runlist.c b/kernel/fs/ntfs/runlist.c new file mode 100644 index 000000000..eac7d6788 --- /dev/null +++ b/kernel/fs/ntfs/runlist.c @@ -0,0 +1,1907 @@ +/** + * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002-2005 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "debug.h" +#include "dir.h" +#include "endian.h" +#include "malloc.h" +#include "ntfs.h" + +/** + * ntfs_rl_mm - runlist memmove + * + * It is up to the caller to serialize access to the runlist @base. + */ +static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, + int size) +{ + if (likely((dst != src) && (size > 0))) + memmove(base + dst, base + src, size * sizeof(*base)); +} + +/** + * ntfs_rl_mc - runlist memory copy + * + * It is up to the caller to serialize access to the runlists @dstbase and + * @srcbase. + */ +static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, + runlist_element *srcbase, int src, int size) +{ + if (likely(size > 0)) + memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); +} + +/** + * ntfs_rl_realloc - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs(new_size); + if (unlikely(!new_rl)) + return ERR_PTR(-ENOMEM); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_rl_realloc_nofail - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs_nofail(new_size); + BUG_ON(!new_rl); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_are_rl_mergeable - test if two runlists can be joined together + * @dst: original runlist + * @src: new runlist to test for mergeability with @dst + * + * Test if two runlists can be joined together. For this, their VCNs and LCNs + * must be adjacent. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * Return: true Success, the runlists can be merged. + * false Failure, the runlists cannot be merged. + */ +static inline bool ntfs_are_rl_mergeable(runlist_element *dst, + runlist_element *src) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* We can merge unmapped regions even if they are misaligned. */ + if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) + return true; + /* If the runs are misaligned, we cannot merge them. */ + if ((dst->vcn + dst->length) != src->vcn) + return false; + /* If both runs are non-sparse and contiguous, we can merge them. */ + if ((dst->lcn >= 0) && (src->lcn >= 0) && + ((dst->lcn + dst->length) == src->lcn)) + return true; + /* If we are merging two holes, we can merge them. */ + if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) + return true; + /* Cannot merge. */ + return false; +} + +/** + * __ntfs_rl_merge - merge two runlists without testing if they can be merged + * @dst: original, destination runlist + * @src: new runlist to merge with @dst + * + * Merge the two runlists, writing into the destination runlist @dst. The + * caller must make sure the runlists can be merged or this will corrupt the + * destination runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + */ +static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) +{ + dst->length += src->length; +} + +/** + * ntfs_rl_append - append a runlist after a given element + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: runlist to be inserted into @dst + * @ssize: number of elements in @src (excluding end marker) + * @loc: append the new runlist @src after this element in @dst + * + * Append the runlist @src after element @loc in @dst. Merge the right end of + * the new runlist, if necessary. Adjust the size of the hole before the + * appended runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_append(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool right = false; /* Right end of @src needs merging. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, check if the right hand end needs merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + + /* Space required: @dst size + @src size, less one if we merged. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the right hand end, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + + /* First run after the @src runs that have been inserted. */ + marker = loc + ssize + 1; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right)); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the preceding hole. */ + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + + /* We may have changed the length of the file, so fix the end marker */ + if (dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + + return dst; +} + +/** + * ntfs_rl_insert - insert a runlist into another + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: insert the new runlist @src before this element in @dst + * + * Insert the runlist @src before element @loc in the runlist @dst. Merge the + * left end of the new runlist, if necessary. Adjust the size of the hole + * after the inserted runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_insert(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool left = false; /* Left end of @src needs merging. */ + bool disc = false; /* Discontinuity between @dst and @src. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* + * disc => Discontinuity between the end of @dst and the start of @src. + * This means we might need to insert a "not mapped" run. + */ + if (loc == 0) + disc = (src[0].vcn > 0); + else { + s64 merged_length; + + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + + merged_length = dst[loc - 1].length; + if (left) + merged_length += src->length; + + disc = (src[0].vcn > dst[loc - 1].vcn + merged_length); + } + /* + * Space required: @dst size + @src size, less one if we merged, plus + * one if there was a discontinuity. + */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlist. + */ + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * First run after the @src runs that have been inserted. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. And if @disc, then @dst and @src do + * not meet and we need an extra run to fill the gap. + */ + marker = loc + ssize - left + disc; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc, dsize - loc); + ntfs_rl_mc(dst, loc + disc, src, left, ssize - left); + + /* Adjust the VCN of the first run after the insertion... */ + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + /* ... and the length. */ + if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) + dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; + + /* Writing beyond the end of the file and there is a discontinuity. */ + if (disc) { + if (loc > 0) { + dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length; + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + } else { + dst[loc].vcn = 0; + dst[loc].length = dst[loc + 1].vcn; + } + dst[loc].lcn = LCN_RL_NOT_MAPPED; + } + return dst; +} + +/** + * ntfs_rl_replace - overwrite a runlist element with another runlist + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst to overwrite with @src + * + * Replace the runlist element @dst at @loc with @src. Merge the left and + * right ends of the inserted runlist, if necessary. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_replace(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + signed delta; + bool left = false; /* Left end of @src needs merging. */ + bool right = false; /* Right end of @src needs merging. */ + int tail; /* Start of tail of @dst. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, see if the left and right ends need merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + if (loc > 0) + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + /* + * Allocate some space. We will need less if the left, right, or both + * ends get merged. The -1 accounts for the run being replaced. + */ + delta = ssize - 1 - left - right; + if (delta > 0) { + dst = ntfs_rl_realloc(dst, dsize, dsize + delta); + if (IS_ERR(dst)) + return dst; + } + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the left and right ends, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * Offset of the tail of @dst. This needs to be moved out of the way + * to make space for the runs to be copied from @src, i.e. the first + * run of the tail of @dst. + * Nominally, @tail equals @loc + 1, i.e. location, skipping the + * replaced run. However, if @right, then one of @dst's runs is + * already merged into @src. + */ + tail = loc + right + 1; + /* + * First run after the @src runs that have been inserted, i.e. where + * the tail of @dst needs to be moved to. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. + */ + marker = loc + ssize - left; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, tail, dsize - tail); + ntfs_rl_mc(dst, loc, src, left, ssize - left); + + /* We may have changed the length of the file, so fix the end marker. */ + if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + return dst; +} + +/** + * ntfs_rl_split - insert a runlist into the centre of a hole + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst at which to split and insert @src + * + * Split the runlist @dst at @loc into two and insert @new in between the two + * fragments. No merging of runlists is necessary. Adjust the size of the + * holes either side. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, + runlist_element *src, int ssize, int loc) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* Space required: @dst size + @src size + one new hole. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the holes either size of @src. */ + dst[loc].length = dst[loc+1].vcn - dst[loc].vcn; + dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length; + dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn; + + return dst; +} + +/** + * ntfs_runlists_merge - merge two runlists into one + * @drl: original runlist to be worked on + * @srl: new runlist to be merged into @drl + * + * First we sanity check the two runlists @srl and @drl to make sure that they + * are sensible and can be merged. The runlist @srl must be either after the + * runlist @drl or completely within a hole (or unmapped region) in @drl. + * + * It is up to the caller to serialize access to the runlists @drl and @srl. + * + * Merging of runlists is necessary in two cases: + * 1. When attribute lists are used and a further extent is being mapped. + * 2. When new clusters are allocated to fill a hole or extend a file. + * + * There are four possible ways @srl can be merged. It can: + * - be inserted at the beginning of a hole, + * - split the hole in two and be inserted between the two fragments, + * - be appended at the end of a hole, or it can + * - replace the whole hole. + * It can also be appended to the end of the runlist, which is just a variant + * of the insert case. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @drl and @srl are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The runlists overlap and cannot be merged. + */ +runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl) +{ + int di, si; /* Current index into @[ds]rl. */ + int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ + int dins; /* Index into @drl at which to insert @srl. */ + int dend, send; /* Last index into @[ds]rl. */ + int dfinal, sfinal; /* The last index into @[ds]rl with + lcn >= LCN_HOLE. */ + int marker = 0; + VCN marker_vcn = 0; + +#ifdef DEBUG + ntfs_debug("dst:"); + ntfs_debug_dump_runlist(drl); + ntfs_debug("src:"); + ntfs_debug_dump_runlist(srl); +#endif + + /* Check for silly calling... */ + if (unlikely(!srl)) + return drl; + if (IS_ERR(srl) || IS_ERR(drl)) + return ERR_PTR(-EINVAL); + + /* Check for the case where the first mapping is being done now. */ + if (unlikely(!drl)) { + drl = srl; + /* Complete the source runlist if necessary. */ + if (unlikely(drl[0].vcn)) { + /* Scan to the end of the source runlist. */ + for (dend = 0; likely(drl[dend].length); dend++) + ; + dend++; + drl = ntfs_rl_realloc(drl, dend, dend + 1); + if (IS_ERR(drl)) + return drl; + /* Insert start element at the front of the runlist. */ + ntfs_rl_mm(drl, 1, 0, dend); + drl[0].vcn = 0; + drl[0].lcn = LCN_RL_NOT_MAPPED; + drl[0].length = drl[1].vcn; + } + goto finished; + } + + si = di = 0; + + /* Skip any unmapped start element(s) in the source runlist. */ + while (srl[si].length && srl[si].lcn < LCN_HOLE) + si++; + + /* Can't have an entirely unmapped source runlist. */ + BUG_ON(!srl[si].length); + + /* Record the starting points. */ + sstart = si; + + /* + * Skip forward in @drl until we reach the position where @srl needs to + * be inserted. If we reach the end of @drl, @srl just needs to be + * appended to @drl. + */ + for (; drl[di].length; di++) { + if (drl[di].vcn + drl[di].length > srl[sstart].vcn) + break; + } + dins = di; + + /* Sanity check for illegal overlaps. */ + if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) && + (srl[si].lcn >= 0)) { + ntfs_error(NULL, "Run lists overlap. Cannot merge!"); + return ERR_PTR(-ERANGE); + } + + /* Scan to the end of both runlists in order to know their sizes. */ + for (send = si; srl[send].length; send++) + ; + for (dend = di; drl[dend].length; dend++) + ; + + if (srl[send].lcn == LCN_ENOENT) + marker_vcn = srl[marker = send].vcn; + + /* Scan to the last element with lcn >= LCN_HOLE. */ + for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--) + ; + for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--) + ; + + { + bool start; + bool finish; + int ds = dend + 1; /* Number of elements in drl & srl */ + int ss = sfinal - sstart + 1; + + start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */ + (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */ + finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */ + ((drl[dins].vcn + drl[dins].length) <= /* End of hole */ + (srl[send - 1].vcn + srl[send - 1].length))); + + /* Or we will lose an end marker. */ + if (finish && !drl[dins].length) + ss++; + if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) + finish = false; +#if 0 + ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); + ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); + ntfs_debug("start = %i, finish = %i", start, finish); + ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); +#endif + if (start) { + if (finish) + drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); + } else { + if (finish) + drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); + } + if (IS_ERR(drl)) { + ntfs_error(NULL, "Merge failed."); + return drl; + } + ntfs_free(srl); + if (marker) { + ntfs_debug("Triggering marker code."); + for (ds = dend; drl[ds].length; ds++) + ; + /* We only need to care if @srl ended after @drl. */ + if (drl[ds].vcn <= marker_vcn) { + int slots = 0; + + if (drl[ds].vcn == marker_vcn) { + ntfs_debug("Old marker = 0x%llx, replacing " + "with LCN_ENOENT.", + (unsigned long long) + drl[ds].lcn); + drl[ds].lcn = LCN_ENOENT; + goto finished; + } + /* + * We need to create an unmapped runlist element in + * @drl or extend an existing one before adding the + * ENOENT terminator. + */ + if (drl[ds].lcn == LCN_ENOENT) { + ds--; + slots = 1; + } + if (drl[ds].lcn != LCN_RL_NOT_MAPPED) { + /* Add an unmapped runlist element. */ + if (!slots) { + drl = ntfs_rl_realloc_nofail(drl, ds, + ds + 2); + slots = 2; + } + ds++; + /* Need to set vcn if it isn't set already. */ + if (slots != 1) + drl[ds].vcn = drl[ds - 1].vcn + + drl[ds - 1].length; + drl[ds].lcn = LCN_RL_NOT_MAPPED; + /* We now used up a slot. */ + slots--; + } + drl[ds].length = marker_vcn - drl[ds].vcn; + /* Finally add the ENOENT terminator. */ + ds++; + if (!slots) + drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); + drl[ds].vcn = marker_vcn; + drl[ds].lcn = LCN_ENOENT; + drl[ds].length = (s64)0; + } + } + } + +finished: + /* The merge was completed successfully. */ + ntfs_debug("Merged runlist:"); + ntfs_debug_dump_runlist(drl); + return drl; +} + +/** + * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist + * @vol: ntfs volume on which the attribute resides + * @attr: attribute record whose mapping pairs array to decompress + * @old_rl: optional runlist in which to insert @attr's runlist + * + * It is up to the caller to serialize access to the runlist @old_rl. + * + * Decompress the attribute @attr's mapping pairs array into a runlist. On + * success, return the decompressed runlist. + * + * If @old_rl is not NULL, decompressed runlist is inserted into the + * appropriate place in @old_rl and the resultant, combined runlist is + * returned. The original @old_rl is deallocated. + * + * On error, return -errno. @old_rl is left unmodified in that case. + * + * The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EIO - Corrupt runlist. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The two runlists overlap. + * + * FIXME: For now we take the conceptionally simplest approach of creating the + * new runlist disregarding the already existing one and then splicing the + * two into one, if that is possible (we check for overlap and discard the new + * runlist if overlap present before returning ERR_PTR(-ERANGE)). + */ +runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl) +{ + VCN vcn; /* Current vcn. */ + LCN lcn; /* Current lcn. */ + s64 deltaxcn; /* Change in [vl]cn. */ + runlist_element *rl; /* The output runlist. */ + u8 *buf; /* Current position in mapping pairs array. */ + u8 *attr_end; /* End of attribute. */ + int rlsize; /* Size of runlist buffer. */ + u16 rlpos; /* Current runlist position in units of + runlist_elements. */ + u8 b; /* Current byte offset in buf. */ + +#ifdef DEBUG + /* Make sure attr exists and is non-resident. */ + if (!attr || !attr->non_resident || sle64_to_cpu( + attr->data.non_resident.lowest_vcn) < (VCN)0) { + ntfs_error(vol->sb, "Invalid arguments."); + return ERR_PTR(-EINVAL); + } +#endif + /* Start at vcn = lowest_vcn and lcn 0. */ + vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn); + lcn = 0; + /* Get start of the mapping pairs array. */ + buf = (u8*)attr + le16_to_cpu( + attr->data.non_resident.mapping_pairs_offset); + attr_end = (u8*)attr + le32_to_cpu(attr->length); + if (unlikely(buf < (u8*)attr || buf > attr_end)) { + ntfs_error(vol->sb, "Corrupt attribute."); + return ERR_PTR(-EIO); + } + /* If the mapping pairs array is valid but empty, nothing to do. */ + if (!vcn && !*buf) + return old_rl; + /* Current position in runlist array. */ + rlpos = 0; + /* Allocate first page and set current runlist size to one page. */ + rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE); + if (unlikely(!rl)) + return ERR_PTR(-ENOMEM); + /* Insert unmapped starting element if necessary. */ + if (vcn) { + rl->vcn = 0; + rl->lcn = LCN_RL_NOT_MAPPED; + rl->length = vcn; + rlpos++; + } + while (buf < attr_end && *buf) { + /* + * Allocate more memory if needed, including space for the + * not-mapped and terminator elements. ntfs_malloc_nofs() + * operates on whole pages only. + */ + if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) { + runlist_element *rl2; + + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + ntfs_free(rl); + return ERR_PTR(-ENOMEM); + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + } + /* Enter the current vcn into the current runlist element. */ + rl[rlpos].vcn = vcn; + /* + * Get the change in vcn, i.e. the run length in clusters. + * Doing it this way ensures that we signextend negative values. + * A negative run length doesn't make any sense, but hey, I + * didn't make up the NTFS specs and Windows NT4 treats the run + * length as a signed value so that's how it is... + */ + b = *buf & 0xf; + if (b) { + if (unlikely(buf + b > attr_end)) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + } else { /* The length entry is compulsory. */ + ntfs_error(vol->sb, "Missing length entry in mapping " + "pairs array."); + deltaxcn = (s64)-1; + } + /* + * Assume a negative length to indicate data corruption and + * hence clean-up and return NULL. + */ + if (unlikely(deltaxcn < 0)) { + ntfs_error(vol->sb, "Invalid length in mapping pairs " + "array."); + goto err_out; + } + /* + * Enter the current run length into the current runlist + * element. + */ + rl[rlpos].length = deltaxcn; + /* Increment the current vcn by the current run length. */ + vcn += deltaxcn; + /* + * There might be no lcn change at all, as is the case for + * sparse clusters on NTFS 3.0+, in which case we set the lcn + * to LCN_HOLE. + */ + if (!(*buf & 0xf0)) + rl[rlpos].lcn = LCN_HOLE; + else { + /* Get the lcn change which really can be negative. */ + u8 b2 = *buf & 0xf; + b = b2 + ((*buf >> 4) & 0xf); + if (buf + b > attr_end) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b > b2; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + /* Change the current lcn to its new value. */ + lcn += deltaxcn; +#ifdef DEBUG + /* + * On NTFS 1.2-, apparently can have lcn == -1 to + * indicate a hole. But we haven't verified ourselves + * whether it is really the lcn or the deltaxcn that is + * -1. So if either is found give us a message so we + * can investigate it further! + */ + if (vol->major_ver < 3) { + if (unlikely(deltaxcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn delta == -1"); + if (unlikely(lcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn == -1"); + } +#endif + /* Check lcn is not below -1. */ + if (unlikely(lcn < (LCN)-1)) { + ntfs_error(vol->sb, "Invalid LCN < -1 in " + "mapping pairs array."); + goto err_out; + } + /* Enter the current lcn into the runlist element. */ + rl[rlpos].lcn = lcn; + } + /* Get to the next runlist element. */ + rlpos++; + /* Increment the buffer position to the next mapping pair. */ + buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; + } + if (unlikely(buf >= attr_end)) + goto io_error; + /* + * If there is a highest_vcn specified, it must be equal to the final + * vcn in the runlist - 1, or something has gone badly wrong. + */ + deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); + if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { +mpa_err: + ntfs_error(vol->sb, "Corrupt mapping pairs array in " + "non-resident attribute."); + goto err_out; + } + /* Setup not mapped runlist element if this is the base extent. */ + if (!attr->data.non_resident.lowest_vcn) { + VCN max_cluster; + + max_cluster = ((sle64_to_cpu( + attr->data.non_resident.allocated_size) + + vol->cluster_size - 1) >> + vol->cluster_size_bits) - 1; + /* + * A highest_vcn of zero means this is a single extent + * attribute so simply terminate the runlist with LCN_ENOENT). + */ + if (deltaxcn) { + /* + * If there is a difference between the highest_vcn and + * the highest cluster, the runlist is either corrupt + * or, more likely, there are more extents following + * this one. + */ + if (deltaxcn < max_cluster) { + ntfs_debug("More extents to follow; deltaxcn " + "= 0x%llx, max_cluster = " + "0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + rl[rlpos].vcn = vcn; + vcn += rl[rlpos].length = max_cluster - + deltaxcn; + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + rlpos++; + } else if (unlikely(deltaxcn > max_cluster)) { + ntfs_error(vol->sb, "Corrupt attribute. " + "deltaxcn = 0x%llx, " + "max_cluster = 0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + goto mpa_err; + } + } + rl[rlpos].lcn = LCN_ENOENT; + } else /* Not the base extent. There may be more extents to follow. */ + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + + /* Setup terminating runlist element. */ + rl[rlpos].vcn = vcn; + rl[rlpos].length = (s64)0; + /* If no existing runlist was specified, we are done. */ + if (!old_rl) { + ntfs_debug("Mapping pairs array successfully decompressed:"); + ntfs_debug_dump_runlist(rl); + return rl; + } + /* Now combine the new and old runlists checking for overlaps. */ + old_rl = ntfs_runlists_merge(old_rl, rl); + if (likely(!IS_ERR(old_rl))) + return old_rl; + ntfs_free(rl); + ntfs_error(vol->sb, "Failed to merge runlists."); + return old_rl; +io_error: + ntfs_error(vol->sb, "Corrupt attribute."); +err_out: + ntfs_free(rl); + return ERR_PTR(-EIO); +} + +/** + * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist + * @rl: runlist to use for conversion + * @vcn: vcn to convert + * + * Convert the virtual cluster number @vcn of an attribute into a logical + * cluster number (lcn) of a device using the runlist @rl to map vcns to their + * corresponding lcns. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ================================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_RL_NOT_MAPPED This is part of the runlist which has not been + * inserted into the runlist yet. + * LCN_ENOENT There is no such vcn in the attribute. + * + * Locking: - The caller must have locked the runlist (for reading or writing). + * - This function does not touch the lock, nor does it modify the + * runlist. + */ +LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) +{ + int i; + + BUG_ON(vcn < 0); + /* + * If rl is NULL, assume that we have found an unmapped runlist. The + * caller can then attempt to map it and fail appropriately if + * necessary. + */ + if (unlikely(!rl)) + return LCN_RL_NOT_MAPPED; + + /* Catch out of lower bounds vcn. */ + if (unlikely(vcn < rl[0].vcn)) + return LCN_ENOENT; + + for (i = 0; likely(rl[i].length); i++) { + if (unlikely(vcn < rl[i+1].vcn)) { + if (likely(rl[i].lcn >= (LCN)0)) + return rl[i].lcn + (vcn - rl[i].vcn); + return rl[i].lcn; + } + } + /* + * The terminator element is setup to the correct value, i.e. one of + * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. + */ + if (likely(rl[i].lcn < (LCN)0)) + return rl[i].lcn; + /* Just in case... We could replace this with BUG() some day. */ + return LCN_ENOENT; +} + +#ifdef NTFS_RW + +/** + * ntfs_rl_find_vcn_nolock - find a vcn in a runlist + * @rl: runlist to search + * @vcn: vcn to find + * + * Find the virtual cluster number @vcn in the runlist @rl and return the + * address of the runlist element containing the @vcn on success. + * + * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of + * the runlist. + * + * Locking: The runlist must be locked on entry. + */ +runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) +{ + BUG_ON(vcn < 0); + if (unlikely(!rl || vcn < rl[0].vcn)) + return NULL; + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) + return rl; + return NULL; + } + rl++; + } + if (likely(rl->lcn == LCN_ENOENT)) + return rl; + return NULL; +} + +/** + * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number + * @n: number for which to get the number of bytes for + * + * Return the number of bytes required to store @n unambiguously as + * a signed number. + * + * This is used in the context of the mapping pairs array to determine how + * many bytes will be needed in the array to store a given logical cluster + * number (lcn) or a specific run length. + * + * Return the number of bytes written. This function cannot fail. + */ +static inline int ntfs_get_nr_significant_bytes(const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if ((n < 0 && j >= 0) || (n > 0 && j < 0)) + i++; + return i; +} + +/** + * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array + * @vol: ntfs volume (needed for the ntfs version) + * @rl: locked runlist to determine the size of the mapping pairs of + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * + * Walk the locked runlist @rl and calculate the size in bytes of the mapping + * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and + * finishing with vcn @last_vcn. + * + * A @last_vcn of -1 means end of runlist and in that case the size of the + * mapping pairs array corresponding to the runlist starting at vcn @first_vcn + * and finishing at the end of the runlist is determined. + * + * This for example allows us to allocate a buffer of the right size when + * building the mapping pairs array. + * + * If @rl is NULL, just return 1 (for the single terminator byte). + * + * Return the calculated size in bytes on success. On error, return -errno. + * The following error codes are defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn) +{ + LCN prev_lcn; + int rls; + bool the_end = false; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + return 1; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + prev_lcn = 0; + /* Always need the termining zero byte. */ + rls = 1; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length - delta); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(prev_lcn); + } + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. */ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(rl->lcn - + prev_lcn); + prev_lcn = rl->lcn; + } + } + return rls; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + rls = -EINVAL; + else + rls = -EIO; + return rls; +} + +/** + * ntfs_write_significant_bytes - write the significant bytes of a number + * @dst: destination buffer to write to + * @dst_max: pointer to last byte of destination buffer for bounds checking + * @n: number whose significant bytes to write + * + * Store in @dst, the minimum bytes of the number @n which are required to + * identify @n unambiguously as a signed number, taking care not to exceed + * @dest_max, the maximum position within @dst to which we are allowed to + * write. + * + * This is used when building the mapping pairs array of a runlist to compress + * a given logical cluster number (lcn) or a specific run length to the minimum + * size possible. + * + * Return the number of bytes written on success. On error, i.e. the + * destination buffer @dst is too small, return -ENOSPC. + */ +static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, + const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + if (unlikely(dst > dst_max)) + goto err_out; + *dst++ = l & 0xffll; + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if (n < 0 && j >= 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)-1; + } else if (n > 0 && j < 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)0; + } + return i; +err_out: + return -ENOSPC; +} + +/** + * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist + * @vol: ntfs volume (needed for the ntfs version) + * @dst: destination buffer to which to write the mapping pairs array + * @dst_len: size of destination buffer @dst in bytes + * @rl: locked runlist for which to build the mapping pairs array + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC + * + * Create the mapping pairs array from the locked runlist @rl, starting at vcn + * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. + * @dst_len is the size of @dst in bytes and it should be at least equal to the + * value obtained by calling ntfs_get_size_for_mapping_pairs(). + * + * A @last_vcn of -1 means end of runlist and in that case the mapping pairs + * array corresponding to the runlist starting at vcn @first_vcn and finishing + * at the end of the runlist is created. + * + * If @rl is NULL, just write a single terminator byte to @dst. + * + * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to + * the first vcn outside the destination buffer. Note that on error, @dst has + * been filled with all the mapping pairs that will fit, thus it can be treated + * as partial success, in that a new attribute extent needs to be created or + * the next extent has to be used and the mapping pairs build has to be + * continued with @first_vcn set to *@stop_vcn. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * -ENOSPC - The destination buffer is too small. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) +{ + LCN prev_lcn; + s8 *dst_max, *dst_next; + int err = -ENOSPC; + bool the_end = false; + s8 len_len, lcn_len; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + BUG_ON(dst_len < 1); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + if (stop_vcn) + *stop_vcn = 0; + /* Terminator byte. */ + *dst = 0; + return 0; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + /* + * @dst_max is used for bounds checking in + * ntfs_write_significant_bytes(). + */ + dst_max = dst + dst_len - 1; + prev_lcn = 0; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length - delta); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. */ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, rl->lcn - prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + prev_lcn = rl->lcn; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + } + /* Success. */ + err = 0; +size_err: + /* Set stop vcn. */ + if (stop_vcn) + *stop_vcn = rl->vcn; + /* Add terminator byte. */ + *dst = 0; + return err; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + err = -EINVAL; + else + err = -EIO; + return err; +} + +/** + * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to truncate + * @new_length: the new length of the runlist in VCNs + * + * Truncate the runlist described by @runlist as well as the memory buffer + * holding the runlist elements to a length of @new_length VCNs. + * + * If @new_length lies within the runlist, the runlist elements with VCNs of + * @new_length and above are discarded. As a special case if @new_length is + * zero, the runlist is discarded and set to NULL. + * + * If @new_length lies beyond the runlist, a sparse runlist element is added to + * the end of the runlist @runlist or if the last runlist element is a sparse + * one already, this is extended. + * + * Note, no checking is done for unmapped runlist elements. It is assumed that + * the caller has mapped any elements that need to be mapped already. + * + * Return 0 on success and -errno on error. + * + * Locking: The caller must hold @runlist->lock for writing. + */ +int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, + const s64 new_length) +{ + runlist_element *rl; + int old_size; + + ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); + BUG_ON(!runlist); + BUG_ON(new_length < 0); + rl = runlist->rl; + if (!new_length) { + ntfs_debug("Freeing runlist."); + runlist->rl = NULL; + if (rl) + ntfs_free(rl); + return 0; + } + if (unlikely(!rl)) { + /* + * Create a runlist consisting of a sparse runlist element of + * length @new_length followed by a terminator runlist element. + */ + rl = ntfs_malloc_nofs(PAGE_SIZE); + if (unlikely(!rl)) { + ntfs_error(vol->sb, "Not enough memory to allocate " + "runlist element buffer."); + return -ENOMEM; + } + runlist->rl = rl; + rl[1].length = rl->vcn = 0; + rl->lcn = LCN_HOLE; + rl[1].vcn = rl->length = new_length; + rl[1].lcn = LCN_ENOENT; + return 0; + } + BUG_ON(new_length < rl->vcn); + /* Find @new_length in the runlist. */ + while (likely(rl->length && new_length >= rl[1].vcn)) + rl++; + /* + * If not at the end of the runlist we need to shrink it. + * If at the end of the runlist we need to expand it. + */ + if (rl->length) { + runlist_element *trl; + bool is_end; + + ntfs_debug("Shrinking runlist."); + /* Determine the runlist size. */ + trl = rl + 1; + while (likely(trl->length)) + trl++; + old_size = trl - runlist->rl + 1; + /* Truncate the run. */ + rl->length = new_length - rl->vcn; + /* + * If a run was partially truncated, make the following runlist + * element a terminator. + */ + is_end = false; + if (rl->length) { + rl++; + if (!rl->length) + is_end = true; + rl->vcn = new_length; + rl->length = 0; + } + rl->lcn = LCN_ENOENT; + /* Reallocate memory if necessary. */ + if (!is_end) { + int new_size = rl - runlist->rl + 1; + rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + } else if (likely(/* !rl->length && */ new_length > rl->vcn)) { + ntfs_debug("Expanding runlist."); + /* + * If there is a previous runlist element and it is a sparse + * one, extend it. Otherwise need to add a new, sparse runlist + * element. + */ + if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE)) + (rl - 1)->length = new_length - (rl - 1)->vcn; + else { + /* Determine the runlist size. */ + old_size = rl - runlist->rl + 1; + /* Reallocate memory if necessary. */ + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size + 1); + if (IS_ERR(rl)) { + ntfs_error(vol->sb, "Failed to expand runlist " + "buffer, aborting."); + return PTR_ERR(rl); + } + runlist->rl = rl; + /* + * Set @rl to the same runlist element in the new + * runlist as before in the old runlist. + */ + rl += old_size - 1; + /* Add a new, sparse runlist element. */ + rl->lcn = LCN_HOLE; + rl->length = new_length - rl->vcn; + /* Add a new terminator runlist element. */ + rl++; + rl->length = 0; + } + rl->vcn = new_length; + rl->lcn = LCN_ENOENT; + } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ { + /* Runlist already has same size as requested. */ + rl->lcn = LCN_ENOENT; + } + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_rl_punch_nolock - punch a hole into a runlist + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to punch a hole into + * @start: starting VCN of the hole to be created + * @length: size of the hole to be created in units of clusters + * + * Punch a hole into the runlist @runlist starting at VCN @start and of size + * @length clusters. + * + * Return 0 on success and -errno on error, in which case @runlist has not been + * modified. + * + * If @start and/or @start + @length are outside the runlist return error code + * -ENOENT. + * + * If the runlist contains unmapped or error elements between @start and @start + * + @length return error code -EINVAL. + * + * Locking: The caller must hold @runlist->lock for writing. + */ +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length) +{ + const VCN end = start + length; + s64 delta; + runlist_element *rl, *rl_end, *rl_real_end, *trl; + int old_size; + bool lcn_fixup = false; + + ntfs_debug("Entering for start 0x%llx, length 0x%llx.", + (long long)start, (long long)length); + BUG_ON(!runlist); + BUG_ON(start < 0); + BUG_ON(length < 0); + BUG_ON(end < 0); + rl = runlist->rl; + if (unlikely(!rl)) { + if (likely(!start && !length)) + return 0; + return -ENOENT; + } + /* Find @start in the runlist. */ + while (likely(rl->length && start >= rl[1].vcn)) + rl++; + rl_end = rl; + /* Find @end in the runlist. */ + while (likely(rl_end->length && end >= rl_end[1].vcn)) { + /* Verify there are no unmapped or error elements. */ + if (unlikely(rl_end->lcn < LCN_HOLE)) + return -EINVAL; + rl_end++; + } + /* Check the last element. */ + if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) + return -EINVAL; + /* This covers @start being out of bounds, too. */ + if (!rl_end->length && end > rl_end->vcn) + return -ENOENT; + if (!length) + return 0; + if (!rl->length) + return -ENOENT; + rl_real_end = rl_end; + /* Determine the runlist size. */ + while (likely(rl_real_end->length)) + rl_real_end++; + old_size = rl_real_end - runlist->rl + 1; + /* If @start is in a hole simply extend the hole. */ + if (rl->lcn == LCN_HOLE) { + /* + * If both @start and @end are in the same sparse run, we are + * done. + */ + if (end <= rl[1].vcn) { + ntfs_debug("Done (requested hole is already sparse)."); + return 0; + } +extend_hole: + /* Extend the hole. */ + rl->length = end - rl->vcn; + /* If @end is in a hole, merge it with the current one. */ + if (rl_end->lcn == LCN_HOLE) { + rl_end++; + rl->length = rl_end->vcn - rl->vcn; + } + /* We have done the hole. Now deal with the remaining tail. */ + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Adjust the beginning of the tail if necessary. */ + if (end > rl->vcn) { + delta = end - rl->vcn; + rl->vcn = end; + rl->length -= delta; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0) + rl->lcn += delta; + } +shrink_allocation: + /* Reallocate memory if the allocation changed. */ + if (rl < rl_end) { + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size - (rl_end - rl)); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + ntfs_debug("Done (extend hole)."); + return 0; + } + /* + * If @start is at the beginning of a run things are easier as there is + * no need to split the first run. + */ + if (start == rl->vcn) { + /* + * @start is at the beginning of a run. + * + * If the previous run is sparse, extend its hole. + * + * If @end is not in the same run, switch the run to be sparse + * and extend the newly created hole. + * + * Thus both of these cases reduce the problem to the above + * case of "@start is in a hole". + */ + if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { + rl--; + goto extend_hole; + } + if (end >= rl[1].vcn) { + rl->lcn = LCN_HOLE; + goto extend_hole; + } + /* + * The final case is when @end is in the same run as @start. + * For this need to split the run into two. One run for the + * sparse region between the beginning of the old run, i.e. + * @start, and @end and one for the remaining non-sparse + * region, i.e. between @end and the end of the old run. + */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } +split_end: + /* Shift all the runs up by one. */ + memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the two split runs. */ + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + rl->vcn += length; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0 || lcn_fixup) + rl->lcn += length; + rl->length -= length; + ntfs_debug("Done (split one)."); + return 0; + } + /* + * @start is neither in a hole nor at the beginning of a run. + * + * If @end is in a hole, things are easier as simply truncating the run + * @start is in to end at @start - 1, deleting all runs after that up + * to @end, and finally extending the beginning of the run @end is in + * to be @start is all that is needed. + */ + if (rl_end->lcn == LCN_HOLE) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Extend the beginning of the run @end is in to be @start. */ + rl->vcn = start; + rl->length = rl[1].vcn - start; + goto shrink_allocation; + } + /* + * If @end is not in a hole there are still two cases to distinguish. + * Either @end is or is not in the same run as @start. + * + * The second case is easier as it can be reduced to an already solved + * problem by truncating the run @start is in to end at @start - 1. + * Then, if @end is in the next run need to split the run into a sparse + * run followed by a non-sparse run (already covered above) and if @end + * is not in the next run switching it to be sparse, again reduces the + * problem to the already covered case of "@start is in a hole". + */ + if (end >= rl[1].vcn) { + /* + * If @end is not in the next run, reduce the problem to the + * case of "@start is in a hole". + */ + if (rl[1].length && end >= rl[2].vcn) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + goto extend_hole; + } + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* + * @end is in the next run, reduce the problem to the case + * where "@start is at the beginning of a run and @end is in + * the same run as @start". + */ + delta = rl->vcn - start; + rl->vcn = start; + if (rl->lcn >= 0) { + rl->lcn -= delta; + /* Need this in case the lcn just became negative. */ + lcn_fixup = true; + } + rl->length += delta; + goto split_end; + } + /* + * The first case from above, i.e. @end is in the same run as @start. + * We need to split the run into three. One run for the non-sparse + * region between the beginning of the old run and @start, one for the + * sparse region between @start and @end, and one for the remaining + * non-sparse region, i.e. between @end and the end of the old run. + */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); + if (IS_ERR(trl)) + goto enomem_out; + old_size += 2; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Shift all the runs up by two. */ + memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the three split runs. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + delta = end - rl->vcn; + rl->vcn = end; + rl->lcn += delta; + rl->length -= delta; + ntfs_debug("Done (split both)."); + return 0; +enomem_out: + ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); + return -ENOMEM; +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/runlist.h b/kernel/fs/ntfs/runlist.h new file mode 100644 index 000000000..47728fbb6 --- /dev/null +++ b/kernel/fs/ntfs/runlist.h @@ -0,0 +1,102 @@ +/* + * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_RUNLIST_H +#define _LINUX_NTFS_RUNLIST_H + +#include "types.h" +#include "layout.h" +#include "volume.h" + +/** + * runlist_element - in memory vcn to lcn mapping array element + * @vcn: starting vcn of the current array element + * @lcn: starting lcn of the current array element + * @length: length in clusters of the current array element + * + * The last vcn (in fact the last vcn + 1) is reached when length == 0. + * + * When lcn == -1 this means that the count vcns starting at vcn are not + * physically allocated (i.e. this is a hole / data is sparse). + */ +typedef struct { /* In memory vcn to lcn mapping structure element. */ + VCN vcn; /* vcn = Starting virtual cluster number. */ + LCN lcn; /* lcn = Starting logical cluster number. */ + s64 length; /* Run length in clusters. */ +} runlist_element; + +/** + * runlist - in memory vcn to lcn mapping array including a read/write lock + * @rl: pointer to an array of runlist elements + * @lock: read/write spinlock for serializing access to @rl + * + */ +typedef struct { + runlist_element *rl; + struct rw_semaphore lock; +} runlist; + +static inline void ntfs_init_runlist(runlist *rl) +{ + rl->rl = NULL; + init_rwsem(&rl->lock); +} + +typedef enum { + LCN_HOLE = -1, /* Keep this as highest value or die! */ + LCN_RL_NOT_MAPPED = -2, + LCN_ENOENT = -3, + LCN_ENOMEM = -4, + LCN_EIO = -5, +} LCN_SPECIAL_VALUES; + +extern runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl); + +extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl); + +extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); + +#ifdef NTFS_RW + +extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, + const VCN vcn); + +extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn); + +extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); + +extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, + runlist *const runlist, const s64 new_length); + +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/kernel/fs/ntfs/super.c b/kernel/fs/ntfs/super.c new file mode 100644 index 000000000..9e1e11207 --- /dev/null +++ b/kernel/fs/ntfs/super.c @@ -0,0 +1,3220 @@ +/* + * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001,2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include /* For bdev_logical_block_size(). */ +#include +#include +#include +#include +#include + +#include "sysctl.h" +#include "logfile.h" +#include "quota.h" +#include "usnjrnl.h" +#include "dir.h" +#include "debug.h" +#include "index.h" +#include "inode.h" +#include "aops.h" +#include "layout.h" +#include "malloc.h" +#include "ntfs.h" + +/* Number of mounted filesystems which have compression enabled. */ +static unsigned long ntfs_nr_compression_users; + +/* A global default upcase table and a corresponding reference count. */ +static ntfschar *default_upcase; +static unsigned long ntfs_nr_upcase_users; + +/* Error constants/strings used in inode.c::ntfs_show_options(). */ +typedef enum { + /* One of these must be present, default is ON_ERRORS_CONTINUE. */ + ON_ERRORS_PANIC = 0x01, + ON_ERRORS_REMOUNT_RO = 0x02, + ON_ERRORS_CONTINUE = 0x04, + /* Optional, can be combined with any of the above. */ + ON_ERRORS_RECOVER = 0x10, +} ON_ERRORS_ACTIONS; + +const option_t on_errors_arr[] = { + { ON_ERRORS_PANIC, "panic" }, + { ON_ERRORS_REMOUNT_RO, "remount-ro", }, + { ON_ERRORS_CONTINUE, "continue", }, + { ON_ERRORS_RECOVER, "recover" }, + { 0, NULL } +}; + +/** + * simple_getbool - + * + * Copied from old ntfs driver (which copied from vfat driver). + */ +static int simple_getbool(char *s, bool *setval) +{ + if (s) { + if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) + *setval = true; + else if (!strcmp(s, "0") || !strcmp(s, "no") || + !strcmp(s, "false")) + *setval = false; + else + return 0; + } else + *setval = true; + return 1; +} + +/** + * parse_options - parse the (re)mount options + * @vol: ntfs volume + * @opt: string containing the (re)mount options + * + * Parse the recognized options in @opt for the ntfs volume described by @vol. + */ +static bool parse_options(ntfs_volume *vol, char *opt) +{ + char *p, *v, *ov; + static char *utf8 = "utf8"; + int errors = 0, sloppy = 0; + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; + umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; + int mft_zone_multiplier = -1, on_errors = -1; + int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; + struct nls_table *nls_map = NULL, *old_nls; + + /* I am lazy... (-8 */ +#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + variable = default_value; \ + else { \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } \ + } +#define NTFS_GETOPT(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_UID(option, variable) \ + if (!strcmp(p, option)) { \ + uid_t uid_value; \ + if (!v || !*v) \ + goto needs_arg; \ + uid_value = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + variable = make_kuid(current_user_ns(), uid_value); \ + if (!uid_valid(variable)) \ + goto needs_val; \ + } +#define NTFS_GETOPT_GID(option, variable) \ + if (!strcmp(p, option)) { \ + gid_t gid_value; \ + if (!v || !*v) \ + goto needs_arg; \ + gid_value = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + variable = make_kgid(current_user_ns(), gid_value); \ + if (!gid_valid(variable)) \ + goto needs_val; \ + } +#define NTFS_GETOPT_OCTAL(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 8); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_BOOL(option, variable) \ + if (!strcmp(p, option)) { \ + bool val; \ + if (!simple_getbool(v, &val)) \ + goto needs_bool; \ + variable = val; \ + } +#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ + if (!strcmp(p, option)) { \ + int _i; \ + if (!v || !*v) \ + goto needs_arg; \ + ov = v; \ + if (variable == -1) \ + variable = 0; \ + for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ + if (!strcmp(opt_array[_i].str, v)) { \ + variable |= opt_array[_i].val; \ + break; \ + } \ + if (!opt_array[_i].str || !*opt_array[_i].str) \ + goto needs_val; \ + } + if (!opt || !*opt) + goto no_mount_options; + ntfs_debug("Entering with mount options string: %s", opt); + while ((p = strsep(&opt, ","))) { + if ((v = strchr(p, '='))) + *v++ = 0; + NTFS_GETOPT_UID("uid", uid) + else NTFS_GETOPT_GID("gid", gid) + else NTFS_GETOPT_OCTAL("umask", fmask = dmask) + else NTFS_GETOPT_OCTAL("fmask", fmask) + else NTFS_GETOPT_OCTAL("dmask", dmask) + else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) + else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) + else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) + else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) + else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) + else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, + on_errors_arr) + else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) + ntfs_warning(vol->sb, "Ignoring obsolete option %s.", + p); + else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { + if (!strcmp(p, "iocharset")) + ntfs_warning(vol->sb, "Option iocharset is " + "deprecated. Please use " + "option nls= in " + "the future."); + if (!v || !*v) + goto needs_arg; +use_utf8: + old_nls = nls_map; + nls_map = load_nls(v); + if (!nls_map) { + if (!old_nls) { + ntfs_error(vol->sb, "NLS character set " + "%s not found.", v); + return false; + } + ntfs_error(vol->sb, "NLS character set %s not " + "found. Using previous one %s.", + v, old_nls->charset); + nls_map = old_nls; + } else /* nls_map */ { + unload_nls(old_nls); + } + } else if (!strcmp(p, "utf8")) { + bool val = false; + ntfs_warning(vol->sb, "Option utf8 is no longer " + "supported, using option nls=utf8. Please " + "use option nls=utf8 in the future and " + "make sure utf8 is compiled either as a " + "module or into the kernel."); + if (!v || !*v) + val = true; + else if (!simple_getbool(v, &val)) + goto needs_bool; + if (val) { + v = utf8; + goto use_utf8; + } + } else { + ntfs_error(vol->sb, "Unrecognized mount option %s.", p); + if (errors < INT_MAX) + errors++; + } +#undef NTFS_GETOPT_OPTIONS_ARRAY +#undef NTFS_GETOPT_BOOL +#undef NTFS_GETOPT +#undef NTFS_GETOPT_WITH_DEFAULT + } +no_mount_options: + if (errors && !sloppy) + return false; + if (sloppy) + ntfs_warning(vol->sb, "Sloppy option given. Ignoring " + "unrecognized mount option(s) and continuing."); + /* Keep this first! */ + if (on_errors != -1) { + if (!on_errors) { + ntfs_error(vol->sb, "Invalid errors option argument " + "or bug in options parser."); + return false; + } + } + if (nls_map) { + if (vol->nls_map && vol->nls_map != nls_map) { + ntfs_error(vol->sb, "Cannot change NLS character set " + "on remount."); + return false; + } /* else (!vol->nls_map) */ + ntfs_debug("Using NLS character set %s.", nls_map->charset); + vol->nls_map = nls_map; + } else /* (!nls_map) */ { + if (!vol->nls_map) { + vol->nls_map = load_nls_default(); + if (!vol->nls_map) { + ntfs_error(vol->sb, "Failed to load default " + "NLS character set."); + return false; + } + ntfs_debug("Using default NLS character set (%s).", + vol->nls_map->charset); + } + } + if (mft_zone_multiplier != -1) { + if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != + mft_zone_multiplier) { + ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " + "on remount."); + return false; + } + if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { + ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " + "Using default value, i.e. 1."); + mft_zone_multiplier = 1; + } + vol->mft_zone_multiplier = mft_zone_multiplier; + } + if (!vol->mft_zone_multiplier) + vol->mft_zone_multiplier = 1; + if (on_errors != -1) + vol->on_errors = on_errors; + if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) + vol->on_errors |= ON_ERRORS_CONTINUE; + if (uid_valid(uid)) + vol->uid = uid; + if (gid_valid(gid)) + vol->gid = gid; + if (fmask != (umode_t)-1) + vol->fmask = fmask; + if (dmask != (umode_t)-1) + vol->dmask = dmask; + if (show_sys_files != -1) { + if (show_sys_files) + NVolSetShowSystemFiles(vol); + else + NVolClearShowSystemFiles(vol); + } + if (case_sensitive != -1) { + if (case_sensitive) + NVolSetCaseSensitive(vol); + else + NVolClearCaseSensitive(vol); + } + if (disable_sparse != -1) { + if (disable_sparse) + NVolClearSparseEnabled(vol); + else { + if (!NVolSparseEnabled(vol) && + vol->major_ver && vol->major_ver < 3) + ntfs_warning(vol->sb, "Not enabling sparse " + "support due to NTFS volume " + "version %i.%i (need at least " + "version 3.0).", vol->major_ver, + vol->minor_ver); + else + NVolSetSparseEnabled(vol); + } + } + return true; +needs_arg: + ntfs_error(vol->sb, "The %s option requires an argument.", p); + return false; +needs_bool: + ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); + return false; +needs_val: + ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); + return false; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_volume_flags - write new flags to the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: new flags value for the volume information flags + * + * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() + * instead (see below). + * + * Replace the volume information flags on the volume @vol with the value + * supplied in @flags. Note, this overwrites the volume information flags, so + * make sure to combine the flags you want to modify with the old flags and use + * the result when calling ntfs_write_volume_flags(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) +{ + ntfs_inode *ni = NTFS_I(vol->vol_ino); + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; + int err; + + ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", + le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); + if (vol->vol_flags == flags) + goto done; + BUG_ON(!ni); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto put_unm_err_out; + } + err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (err) + goto put_unm_err_out; + vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + vol->vol_flags = vi->flags = flags; + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +done: + ntfs_debug("Done."); + return 0; +put_unm_err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i.", -err); + return err; +} + +/** + * ntfs_set_volume_flags - set bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to set on the volume + * + * Set the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + return ntfs_write_volume_flags(vol, vol->vol_flags | flags); +} + +/** + * ntfs_clear_volume_flags - clear bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to clear on the volume + * + * Clear the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); + return ntfs_write_volume_flags(vol, flags); +} + +#endif /* NTFS_RW */ + +/** + * ntfs_remount - change the mount options of a mounted ntfs filesystem + * @sb: superblock of mounted ntfs filesystem + * @flags: remount flags + * @opt: remount options string + * + * Change the mount options of an already mounted ntfs filesystem. + * + * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after + * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, + * @sb->s_flags are not changed. + */ +static int ntfs_remount(struct super_block *sb, int *flags, char *opt) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering with remount options string: %s", opt); + + sync_filesystem(sb); + +#ifndef NTFS_RW + /* For read-only compiled driver, enforce read-only flag. */ + *flags |= MS_RDONLY; +#else /* NTFS_RW */ + /* + * For the read-write compiled driver, if we are remounting read-write, + * make sure there are no volume errors and that no unsupported volume + * flags are set. Also, empty the logfile journal as it would become + * stale as soon as something is written to the volume and mark the + * volume dirty so that chkdsk is run if the volume is not umounted + * cleanly. Finally, mark the quotas out of date so Windows rescans + * the volume on boot and updates them. + * + * When remounting read-only, mark the volume clean if no volume errors + * have occurred. + */ + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + static const char *es = ". Cannot remount read-write."; + + /* Remounting read-write. */ + if (NVolErrors(vol)) { + ntfs_error(sb, "Volume has errors and is read-only%s", + es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_IS_DIRTY) { + ntfs_error(sb, "Volume is dirty and read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + ntfs_error(sb, "Volume has been modified by chkdsk " + "and is read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + ntfs_error(sb, "Volume has unsupported flags set " + "(0x%x) and is read-only%s", + (unsigned)le16_to_cpu(vol->vol_flags), + es); + return -EROFS; + } + if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + ntfs_error(sb, "Failed to set dirty bit in volume " + "information flags%s", es); + return -EROFS; + } +#if 0 + // TODO: Enable this code once we start modifying anything that + // is different between NTFS 1.2 and 3.x... + /* Set NT4 compatibility flag on newer NTFS version volumes. */ + if ((vol->major_ver > 1)) { + if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + ntfs_error(sb, "Failed to set NT4 " + "compatibility flag%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } +#endif + if (!ntfs_empty_logfile(vol->logfile_ino)) { + ntfs_error(sb, "Failed to empty journal $LogFile%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_mark_quotas_out_of_date(vol)) { + ntfs_error(sb, "Failed to mark quotas out of date%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_stamp_usnjrnl(vol)) { + ntfs_error(sb, "Failed to stamp transation log " + "($UsnJrnl)%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { + /* Remounting read-only. */ + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + // TODO: Deal with *flags. + + if (!parse_options(vol, opt)) + return -EINVAL; + + ntfs_debug("Done."); + return 0; +} + +/** + * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector + * @sb: Super block of the device to which @b belongs. + * @b: Boot sector of device @sb to check. + * @silent: If 'true', all output will be silenced. + * + * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot + * sector. Returns 'true' if it is valid and 'false' if not. + * + * @sb is only needed for warning/error output, i.e. it can be NULL when silent + * is 'true'. + */ +static bool is_boot_sector_ntfs(const struct super_block *sb, + const NTFS_BOOT_SECTOR *b, const bool silent) +{ + /* + * Check that checksum == sum of u32 values from b to the checksum + * field. If checksum is zero, no checking is done. We will work when + * the checksum test fails, since some utilities update the boot sector + * ignoring the checksum which leaves the checksum out-of-date. We + * report a warning if this is the case. + */ + if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { + le32 *u; + u32 i; + + for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) + i += le32_to_cpup(u); + if (le32_to_cpu(b->checksum) != i) + ntfs_warning(sb, "Invalid boot sector checksum."); + } + /* Check OEMidentifier is "NTFS " */ + if (b->oem_id != magicNTFS) + goto not_ntfs; + /* Check bytes per sector value is between 256 and 4096. */ + if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || + le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) + goto not_ntfs; + /* Check sectors per cluster value is valid. */ + switch (b->bpb.sectors_per_cluster) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: + break; + default: + goto not_ntfs; + } + /* Check the cluster size is not above the maximum (64kiB). */ + if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * + b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) + goto not_ntfs; + /* Check reserved/unused fields are really zero. */ + if (le16_to_cpu(b->bpb.reserved_sectors) || + le16_to_cpu(b->bpb.root_entries) || + le16_to_cpu(b->bpb.sectors) || + le16_to_cpu(b->bpb.sectors_per_fat) || + le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats) + goto not_ntfs; + /* Check clusters per file mft record value is valid. */ + if ((u8)b->clusters_per_mft_record < 0xe1 || + (u8)b->clusters_per_mft_record > 0xf7) + switch (b->clusters_per_mft_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* Check clusters per index block value is valid. */ + if ((u8)b->clusters_per_index_record < 0xe1 || + (u8)b->clusters_per_index_record > 0xf7) + switch (b->clusters_per_index_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* + * Check for valid end of sector marker. We will work without it, but + * many BIOSes will refuse to boot from a bootsector if the magic is + * incorrect, so we emit a warning. + */ + if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) + ntfs_warning(sb, "Invalid end of sector marker."); + return true; +not_ntfs: + return false; +} + +/** + * read_ntfs_boot_sector - read the NTFS boot sector of a device + * @sb: super block of device to read the boot sector from + * @silent: if true, suppress all output + * + * Reads the boot sector from the device and validates it. If that fails, tries + * to read the backup boot sector, first from the end of the device a-la NT4 and + * later and then from the middle of the device a-la NT3.51 and before. + * + * If a valid boot sector is found but it is not the primary boot sector, we + * repair the primary boot sector silently (unless the device is read-only or + * the primary boot sector is not accessible). + * + * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super + * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized + * to their respective values. + * + * Return the unlocked buffer head containing the boot sector or NULL on error. + */ +static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, + const int silent) +{ + const char *read_err_str = "Unable to read %s boot sector."; + struct buffer_head *bh_primary, *bh_backup; + sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; + + /* Try to read primary boot sector. */ + if ((bh_primary = sb_bread(sb, 0))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_primary->b_data, silent)) + return bh_primary; + if (!silent) + ntfs_error(sb, "Primary boot sector is invalid."); + } else if (!silent) + ntfs_error(sb, read_err_str, "primary"); + if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { + if (bh_primary) + brelse(bh_primary); + if (!silent) + ntfs_error(sb, "Mount option errors=recover not used. " + "Aborting without trying to recover."); + return NULL; + } + /* Try to read NT4+ backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* Try to read NT3.51- backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + if (!silent) + ntfs_error(sb, "Could not find a valid backup boot " + "sector."); + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* We failed. Cleanup and return. */ + if (bh_primary) + brelse(bh_primary); + return NULL; +hotfix_primary_boot_sector: + if (bh_primary) { + /* + * If we managed to read sector zero and the volume is not + * read-only, copy the found, valid backup boot sector to the + * primary boot sector. Note we only copy the actual boot + * sector structure, not the actual whole device sector as that + * may be bigger and would potentially damage the $Boot system + * file (FIXME: Would be nice to know if the backup boot sector + * on a large sector device contains the whole boot loader or + * just the first 512 bytes). + */ + if (!(sb->s_flags & MS_RDONLY)) { + ntfs_warning(sb, "Hot-fix: Recovering invalid primary " + "boot sector from backup copy."); + memcpy(bh_primary->b_data, bh_backup->b_data, + NTFS_BLOCK_SIZE); + mark_buffer_dirty(bh_primary); + sync_dirty_buffer(bh_primary); + if (buffer_uptodate(bh_primary)) { + brelse(bh_backup); + return bh_primary; + } + ntfs_error(sb, "Hot-fix: Device write error while " + "recovering primary boot sector."); + } else { + ntfs_warning(sb, "Hot-fix: Recovery of primary boot " + "sector failed: Read-only mount."); + } + brelse(bh_primary); + } + ntfs_warning(sb, "Using backup boot sector."); + return bh_backup; +} + +/** + * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol + * @vol: volume structure to initialise with data from boot sector + * @b: boot sector to parse + * + * Parse the ntfs boot sector @b and store all imporant information therein in + * the ntfs super block @vol. Return 'true' on success and 'false' on error. + */ +static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) +{ + unsigned int sectors_per_cluster_bits, nr_hidden_sects; + int clusters_per_mft_record, clusters_per_index_record; + s64 ll; + + vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector); + vol->sector_size_bits = ffs(vol->sector_size) - 1; + ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size, + vol->sector_size); + ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, + vol->sector_size_bits); + if (vol->sector_size < vol->sb->s_blocksize) { + ntfs_error(vol->sb, "Sector size (%i) is smaller than the " + "device block size (%lu). This is not " + "supported. Sorry.", vol->sector_size, + vol->sb->s_blocksize); + return false; + } + ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); + sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; + ntfs_debug("sectors_per_cluster_bits = 0x%x", + sectors_per_cluster_bits); + nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); + ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects); + vol->cluster_size = vol->sector_size << sectors_per_cluster_bits; + vol->cluster_size_mask = vol->cluster_size - 1; + vol->cluster_size_bits = ffs(vol->cluster_size) - 1; + ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, + vol->cluster_size); + ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); + ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); + if (vol->cluster_size < vol->sector_size) { + ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->cluster_size, vol->sector_size); + return false; + } + clusters_per_mft_record = b->clusters_per_mft_record; + ntfs_debug("clusters_per_mft_record = %i (0x%x)", + clusters_per_mft_record, clusters_per_mft_record); + if (clusters_per_mft_record > 0) + vol->mft_record_size = vol->cluster_size << + (ffs(clusters_per_mft_record) - 1); + else + /* + * When mft_record_size < cluster_size, clusters_per_mft_record + * = -log2(mft_record_size) bytes. mft_record_size normaly is + * 1024 bytes, which is encoded as 0xF6 (-10 in decimal). + */ + vol->mft_record_size = 1 << -clusters_per_mft_record; + vol->mft_record_size_mask = vol->mft_record_size - 1; + vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1; + ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size, + vol->mft_record_size); + ntfs_debug("vol->mft_record_size_mask = 0x%x", + vol->mft_record_size_mask); + ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", + vol->mft_record_size_bits, vol->mft_record_size_bits); + /* + * We cannot support mft record sizes above the PAGE_CACHE_SIZE since + * we store $MFT/$DATA, the table of mft records in the page cache. + */ + if (vol->mft_record_size > PAGE_CACHE_SIZE) { + ntfs_error(vol->sb, "Mft record size (%i) exceeds the " + "PAGE_CACHE_SIZE on your system (%lu). " + "This is not supported. Sorry.", + vol->mft_record_size, PAGE_CACHE_SIZE); + return false; + } + /* We cannot support mft record sizes below the sector size. */ + if (vol->mft_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->mft_record_size, + vol->sector_size); + return false; + } + clusters_per_index_record = b->clusters_per_index_record; + ntfs_debug("clusters_per_index_record = %i (0x%x)", + clusters_per_index_record, clusters_per_index_record); + if (clusters_per_index_record > 0) + vol->index_record_size = vol->cluster_size << + (ffs(clusters_per_index_record) - 1); + else + /* + * When index_record_size < cluster_size, + * clusters_per_index_record = -log2(index_record_size) bytes. + * index_record_size normaly equals 4096 bytes, which is + * encoded as 0xF4 (-12 in decimal). + */ + vol->index_record_size = 1 << -clusters_per_index_record; + vol->index_record_size_mask = vol->index_record_size - 1; + vol->index_record_size_bits = ffs(vol->index_record_size) - 1; + ntfs_debug("vol->index_record_size = %i (0x%x)", + vol->index_record_size, vol->index_record_size); + ntfs_debug("vol->index_record_size_mask = 0x%x", + vol->index_record_size_mask); + ntfs_debug("vol->index_record_size_bits = %i (0x%x)", + vol->index_record_size_bits, + vol->index_record_size_bits); + /* We cannot support index record sizes below the sector size. */ + if (vol->index_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Index record size (%i) is smaller than " + "the sector size (%i). This is not " + "supported. Sorry.", vol->index_record_size, + vol->sector_size); + return false; + } + /* + * Get the size of the volume in clusters and check for 64-bit-ness. + * Windows currently only uses 32 bits to save the clusters so we do + * the same as it is much faster on 32-bit CPUs. + */ + ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; + if ((u64)ll >= 1ULL << 32) { + ntfs_error(vol->sb, "Cannot handle 64-bit clusters. Sorry."); + return false; + } + vol->nr_clusters = ll; + ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); + /* + * On an architecture where unsigned long is 32-bits, we restrict the + * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler + * will hopefully optimize the whole check away. + */ + if (sizeof(unsigned long) < 8) { + if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { + ntfs_error(vol->sb, "Volume size (%lluTiB) is too " + "large for this architecture. " + "Maximum supported is 2TiB. Sorry.", + (unsigned long long)ll >> (40 - + vol->cluster_size_bits)); + return false; + } + } + ll = sle64_to_cpu(b->mft_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " + "volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mft_lcn = ll; + ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); + ll = sle64_to_cpu(b->mftmirr_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " + "of volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mftmirr_lcn = ll; + ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); +#ifdef NTFS_RW + /* + * Work out the size of the mft mirror in number of mft records. If the + * cluster size is less than or equal to the size taken by four mft + * records, the mft mirror stores the first four mft records. If the + * cluster size is bigger than the size taken by four mft records, the + * mft mirror contains as many mft records as will fit into one + * cluster. + */ + if (vol->cluster_size <= (4 << vol->mft_record_size_bits)) + vol->mftmirr_size = 4; + else + vol->mftmirr_size = vol->cluster_size >> + vol->mft_record_size_bits; + ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); +#endif /* NTFS_RW */ + vol->serial_no = le64_to_cpu(b->volume_serial_number); + ntfs_debug("vol->serial_no = 0x%llx", + (unsigned long long)vol->serial_no); + return true; +} + +/** + * ntfs_setup_allocators - initialize the cluster and mft allocators + * @vol: volume structure for which to setup the allocators + * + * Setup the cluster (lcn) and mft allocators to the starting values. + */ +static void ntfs_setup_allocators(ntfs_volume *vol) +{ +#ifdef NTFS_RW + LCN mft_zone_size, mft_lcn; +#endif /* NTFS_RW */ + + ntfs_debug("vol->mft_zone_multiplier = 0x%x", + vol->mft_zone_multiplier); +#ifdef NTFS_RW + /* Determine the size of the MFT zone. */ + mft_zone_size = vol->nr_clusters; + switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ + case 4: + mft_zone_size >>= 1; /* 50% */ + break; + case 3: + mft_zone_size = (mft_zone_size + + (mft_zone_size >> 1)) >> 2; /* 37.5% */ + break; + case 2: + mft_zone_size >>= 2; /* 25% */ + break; + /* case 1: */ + default: + mft_zone_size >>= 3; /* 12.5% */ + break; + } + /* Setup the mft zone. */ + vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; + ntfs_debug("vol->mft_zone_pos = 0x%llx", + (unsigned long long)vol->mft_zone_pos); + /* + * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs + * source) and if the actual mft_lcn is in the expected place or even + * further to the front of the volume, extend the mft_zone to cover the + * beginning of the volume as well. This is in order to protect the + * area reserved for the mft bitmap as well within the mft_zone itself. + * On non-standard volumes we do not protect it as the overhead would + * be higher than the speed increase we would get by doing it. + */ + mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; + if (mft_lcn * vol->cluster_size < 16 * 1024) + mft_lcn = (16 * 1024 + vol->cluster_size - 1) / + vol->cluster_size; + if (vol->mft_zone_start <= mft_lcn) + vol->mft_zone_start = 0; + ntfs_debug("vol->mft_zone_start = 0x%llx", + (unsigned long long)vol->mft_zone_start); + /* + * Need to cap the mft zone on non-standard volumes so that it does + * not point outside the boundaries of the volume. We do this by + * halving the zone size until we are inside the volume. + */ + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + while (vol->mft_zone_end >= vol->nr_clusters) { + mft_zone_size >>= 1; + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + } + ntfs_debug("vol->mft_zone_end = 0x%llx", + (unsigned long long)vol->mft_zone_end); + /* + * Set the current position within each data zone to the start of the + * respective zone. + */ + vol->data1_zone_pos = vol->mft_zone_end; + ntfs_debug("vol->data1_zone_pos = 0x%llx", + (unsigned long long)vol->data1_zone_pos); + vol->data2_zone_pos = 0; + ntfs_debug("vol->data2_zone_pos = 0x%llx", + (unsigned long long)vol->data2_zone_pos); + + /* Set the mft data allocation position to mft record 24. */ + vol->mft_data_pos = 24; + ntfs_debug("vol->mft_data_pos = 0x%llx", + (unsigned long long)vol->mft_data_pos); +#endif /* NTFS_RW */ +} + +#ifdef NTFS_RW + +/** + * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume + * @vol: ntfs super block describing device whose mft mirror to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_mft_mirror(ntfs_volume *vol) +{ + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + + ntfs_debug("Entering."); + /* Get mft mirror inode. */ + tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + /* + * Re-initialize some specifics about $MFTMirr's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + tmp_ino->i_uid = GLOBAL_ROOT_UID; + tmp_ino->i_gid = GLOBAL_ROOT_GID; + /* Regular file. No access for anyone. */ + tmp_ino->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFTMirr. */ + tmp_ino->i_op = &ntfs_empty_inode_ops; + tmp_ino->i_fop = &ntfs_empty_file_ops; + /* Put in our special address space operations. */ + tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; + tmp_ni = NTFS_I(tmp_ino); + /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ + NInoSetMstProtected(tmp_ni); + NInoSetSparseDisabled(tmp_ni); + /* + * Set up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + tmp_ni->itype.index.block_size = vol->mft_record_size; + tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits; + vol->mftmirr_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * check_mft_mirror - compare contents of the mft mirror with the mft + * @vol: ntfs super block describing device whose mft mirror to check + * + * Return 'true' on success or 'false' on error. + * + * Note, this function also results in the mft mirror runlist being completely + * mapped into memory. The mft mirror write code requires this and will BUG() + * should it find an unmapped runlist element. + */ +static bool check_mft_mirror(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + ntfs_inode *mirr_ni; + struct page *mft_page, *mirr_page; + u8 *kmft, *kmirr; + runlist_element *rl, rl2[2]; + pgoff_t index; + int mrecs_per_page, i; + + ntfs_debug("Entering."); + /* Compare contents of $MFT and $MFTMirr. */ + mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size; + BUG_ON(!mrecs_per_page); + BUG_ON(!vol->mftmirr_size); + mft_page = mirr_page = NULL; + kmft = kmirr = NULL; + index = i = 0; + do { + u32 bytes; + + /* Switch pages if necessary. */ + if (!(i % mrecs_per_page)) { + if (index) { + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + } + /* Get the $MFT page. */ + mft_page = ntfs_map_page(vol->mft_ino->i_mapping, + index); + if (IS_ERR(mft_page)) { + ntfs_error(sb, "Failed to read $MFT."); + return false; + } + kmft = page_address(mft_page); + /* Get the $MFTMirr page. */ + mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, + index); + if (IS_ERR(mirr_page)) { + ntfs_error(sb, "Failed to read $MFTMirr."); + goto mft_unmap_out; + } + kmirr = page_address(mirr_page); + ++index; + } + /* Do not check the record if it is not in use. */ + if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { + /* Make sure the record is ok. */ + if (ntfs_is_baad_recordp((le32*)kmft)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "record %i.", i); +mm_unmap_out: + ntfs_unmap_page(mirr_page); +mft_unmap_out: + ntfs_unmap_page(mft_page); + return false; + } + } + /* Do not check the mirror record if it is not in use. */ + if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { + if (ntfs_is_baad_recordp((le32*)kmirr)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "mirror record %i.", i); + goto mm_unmap_out; + } + } + /* Get the amount of data in the current record. */ + bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmft)) { + bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmirr)) + bytes = vol->mft_record_size; + } + /* Compare the two records. */ + if (memcmp(kmft, kmirr, bytes)) { + ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " + "match. Run ntfsfix or chkdsk.", i); + goto mm_unmap_out; + } + kmft += vol->mft_record_size; + kmirr += vol->mft_record_size; + } while (++i < vol->mftmirr_size); + /* Release the last pages. */ + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + + /* Construct the mft mirror runlist by hand. */ + rl2[0].vcn = 0; + rl2[0].lcn = vol->mftmirr_lcn; + rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + + vol->cluster_size - 1) / vol->cluster_size; + rl2[1].vcn = rl2[0].length; + rl2[1].lcn = LCN_ENOENT; + rl2[1].length = 0; + /* + * Because we have just read all of the mft mirror, we know we have + * mapped the full runlist for it. + */ + mirr_ni = NTFS_I(vol->mftmirr_ino); + down_read(&mirr_ni->runlist.lock); + rl = mirr_ni->runlist.rl; + /* Compare the two runlists. They must be identical. */ + i = 0; + do { + if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || + rl2[i].length != rl[i].length) { + ntfs_error(sb, "$MFTMirr location mismatch. " + "Run chkdsk."); + up_read(&mirr_ni->runlist.lock); + return false; + } + } while (rl2[i++].length); + up_read(&mirr_ni->runlist.lock); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_check_logfile - load and check the logfile inode for a volume + * @vol: ntfs super block describing device whose logfile to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_check_logfile(ntfs_volume *vol, + RESTART_PAGE_HEADER **rp) +{ + struct inode *tmp_ino; + + ntfs_debug("Entering."); + tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + if (!ntfs_check_logfile(tmp_ino, rp)) { + iput(tmp_ino); + /* ntfs_check_logfile() will have displayed error output. */ + return false; + } + NInoSetSparseDisabled(NTFS_I(tmp_ino)); + vol->logfile_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +#define NTFS_HIBERFIL_HEADER_SIZE 4096 + +/** + * check_windows_hibernation_status - check if Windows is suspended on a volume + * @vol: ntfs super block of device to check + * + * Check if Windows is hibernated on the ntfs volume @vol. This is done by + * looking for the file hiberfil.sys in the root directory of the volume. If + * the file is not present Windows is definitely not suspended. + * + * If hiberfil.sys exists and is less than 4kiB in size it means Windows is + * definitely suspended (this volume is not the system volume). Caveat: on a + * system with many volumes it is possible that the < 4kiB check is bogus but + * for now this should do fine. + * + * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the + * hiberfil header (which is the first 4kiB). If this begins with "hibr", + * Windows is definitely suspended. If it is completely full of zeroes, + * Windows is definitely not hibernated. Any other case is treated as if + * Windows is suspended. This caters for the above mentioned caveat of a + * system with many volumes where no "hibr" magic would be present and there is + * no zero header. + * + * Return 0 if Windows is not hibernated on the volume, >0 if Windows is + * hibernated on the volume, and -errno on error. + */ +static int check_windows_hibernation_status(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *vi; + struct page *page; + u32 *kaddr, *kend; + ntfs_name *name = NULL; + int ret = 1; + static const ntfschar hiberfil[13] = { cpu_to_le16('h'), + cpu_to_le16('i'), cpu_to_le16('b'), + cpu_to_le16('e'), cpu_to_le16('r'), + cpu_to_le16('f'), cpu_to_le16('i'), + cpu_to_le16('l'), cpu_to_le16('.'), + cpu_to_le16('s'), cpu_to_le16('y'), + cpu_to_le16('s'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the hibernation file by looking up the + * filename hiberfil.sys in the root directory. + */ + mutex_lock(&vol->root_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, + &name); + mutex_unlock(&vol->root_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + ret = MREF_ERR(mref); + /* If the file does not exist, Windows is not hibernated. */ + if (ret == -ENOENT) { + ntfs_debug("hiberfil.sys not present. Windows is not " + "hibernated on the volume."); + return 0; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "hiberfil.sys."); + return ret; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + vi = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(vi) || is_bad_inode(vi)) { + if (!IS_ERR(vi)) + iput(vi); + ntfs_error(vol->sb, "Failed to load hiberfil.sys."); + return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; + } + if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { + ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " + "Windows is hibernated on the volume. This " + "is not the system volume.", i_size_read(vi)); + goto iput_out; + } + page = ntfs_map_page(vi->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); + ret = PTR_ERR(page); + goto iput_out; + } + kaddr = (u32*)page_address(page); + if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { + ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " + "hibernated on the volume. This is the " + "system volume."); + goto unm_iput_out; + } + kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); + do { + if (unlikely(*kaddr)) { + ntfs_debug("hiberfil.sys is larger than 4kiB " + "(0x%llx), does not contain the " + "\"hibr\" magic, and does not have a " + "zero header. Windows is hibernated " + "on the volume. This is not the " + "system volume.", i_size_read(vi)); + goto unm_iput_out; + } + } while (++kaddr < kend); + ntfs_debug("hiberfil.sys contains a zero header. Windows is not " + "hibernated on the volume. This is the system " + "volume."); + ret = 0; +unm_iput_out: + ntfs_unmap_page(page); +iput_out: + iput(vi); + return ret; +} + +/** + * load_and_init_quota - load and setup the quota file for a volume if present + * @vol: ntfs super block describing device whose quota file to load + * + * Return 'true' on success or 'false' on error. If $Quota is not present, we + * leave vol->quota_ino as NULL and return success. + */ +static bool load_and_init_quota(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_name *name = NULL; + static const ntfschar Quota[7] = { cpu_to_le16('$'), + cpu_to_le16('Q'), cpu_to_le16('u'), + cpu_to_le16('o'), cpu_to_le16('t'), + cpu_to_le16('a'), 0 }; + static ntfschar Q[3] = { cpu_to_le16('$'), + cpu_to_le16('Q'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the quota file by looking up the filename + * $Quota in the extended system files directory $Extend. + */ + mutex_lock(&vol->extend_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, + &name); + mutex_unlock(&vol->extend_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, quotas are disabled and have + * never been enabled on this volume, just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$Quota not present. Volume does not have " + "quotas enabled."); + /* + * No need to try to set quotas out of date if they are + * not enabled. + */ + NVolSetQuotaOutOfDate(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for $Quota."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $Quota."); + return false; + } + vol->quota_ino = tmp_ino; + /* Get the $Q index allocation attribute. */ + tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $Quota/$Q index."); + return false; + } + vol->quota_q_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_usnjrnl - load and setup the transaction log if present + * @vol: ntfs super block describing device whose usnjrnl file to load + * + * Return 'true' on success or 'false' on error. + * + * If $UsnJrnl is not present or in the process of being disabled, we set + * NVolUsnJrnlStamped() and return success. + * + * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, + * i.e. transaction logging has only just been enabled or the journal has been + * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() + * and return success. + */ +static bool load_and_init_usnjrnl(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + struct page *page; + ntfs_name *name = NULL; + USN_HEADER *uh; + static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), + cpu_to_le16('U'), cpu_to_le16('s'), + cpu_to_le16('n'), cpu_to_le16('J'), + cpu_to_le16('r'), cpu_to_le16('n'), + cpu_to_le16('l'), 0 }; + static ntfschar Max[5] = { cpu_to_le16('$'), + cpu_to_le16('M'), cpu_to_le16('a'), + cpu_to_le16('x'), 0 }; + static ntfschar J[3] = { cpu_to_le16('$'), + cpu_to_le16('J'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the transaction log file by looking up the + * filename $UsnJrnl in the extended system files directory $Extend. + */ + mutex_lock(&vol->extend_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, + &name); + mutex_unlock(&vol->extend_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, transaction logging is disabled, + * just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$UsnJrnl not present. Volume does not " + "have transaction logging enabled."); +not_enabled: + /* + * No need to try to stamp the transaction log if + * transaction logging is not enabled. + */ + NVolSetUsnJrnlStamped(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "$UsnJrnl."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $UsnJrnl."); + return false; + } + vol->usnjrnl_ino = tmp_ino; + /* + * If the transaction log is in the process of being deleted, we can + * ignore it. + */ + if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { + ntfs_debug("$UsnJrnl in the process of being disabled. " + "Volume does not have transaction logging " + "enabled."); + goto not_enabled; + } + /* Get the $DATA/$Max attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + vol->usnjrnl_max_ino = tmp_ino; + if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { + ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " + "attribute (size is 0x%llx but should be at " + "least 0x%zx bytes).", i_size_read(tmp_ino), + sizeof(USN_HEADER)); + return false; + } + /* Get the $DATA/$J attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " + "attribute."); + return false; + } + vol->usnjrnl_j_ino = tmp_ino; + /* Verify $J is non-resident and sparse. */ + tmp_ni = NTFS_I(vol->usnjrnl_j_ino); + if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { + ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " + "and/or not sparse."); + return false; + } + /* Read the USN_HEADER from $DATA/$Max. */ + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + /* Sanity check the $Max. */ + if (unlikely(sle64_to_cpu(uh->allocation_delta) > + sle64_to_cpu(uh->maximum_size))) { + ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " + "maximum size (0x%llx). $UsnJrnl is corrupt.", + (long long)sle64_to_cpu(uh->allocation_delta), + (long long)sle64_to_cpu(uh->maximum_size)); + ntfs_unmap_page(page); + return false; + } + /* + * If the transaction log has been stamped and nothing has been written + * to it since, we do not need to stamp it. + */ + if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= + i_size_read(vol->usnjrnl_j_ino))) { + if (likely(sle64_to_cpu(uh->lowest_valid_usn) == + i_size_read(vol->usnjrnl_j_ino))) { + ntfs_unmap_page(page); + ntfs_debug("$UsnJrnl is enabled but nothing has been " + "logged since it was last stamped. " + "Treating this as if the volume does " + "not have transaction logging " + "enabled."); + goto not_enabled; + } + ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " + "which is out of bounds (0x%llx). $UsnJrnl " + "is corrupt.", + (long long)sle64_to_cpu(uh->lowest_valid_usn), + i_size_read(vol->usnjrnl_j_ino)); + ntfs_unmap_page(page); + return false; + } + ntfs_unmap_page(page); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_attrdef - load the attribute definitions table for a volume + * @vol: ntfs super block describing device whose attrdef to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_attrdef(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + + ntfs_debug("Entering."); + /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ + ino = ntfs_iget(sb, FILE_AttrDef); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto failed; + } + NInoSetSparseDisabled(NTFS_I(ino)); + /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */ + i_size = i_size_read(ino); + if (i_size <= 0 || i_size > 0x7fffffff) + goto iput_failed; + vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); + if (!vol->attrdef) + goto iput_failed; + index = 0; + max_index = i_size >> PAGE_CACHE_SHIFT; + size = PAGE_CACHE_SIZE; + while (index < max_index) { + /* Read the attrdef table and copy it into the linear buffer. */ +read_partial_attrdef_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto free_iput_failed; + memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + }; + if (size == PAGE_CACHE_SIZE) { + size = i_size & ~PAGE_CACHE_MASK; + if (size) + goto read_partial_attrdef_page; + } + vol->attrdef_size = i_size; + ntfs_debug("Read %llu bytes from $AttrDef.", i_size); + iput(ino); + return true; +free_iput_failed: + ntfs_free(vol->attrdef); + vol->attrdef = NULL; +iput_failed: + iput(ino); +failed: + ntfs_error(sb, "Failed to initialize attribute definition table."); + return false; +} + +#endif /* NTFS_RW */ + +/** + * load_and_init_upcase - load the upcase table for an ntfs volume + * @vol: ntfs super block describing device whose upcase to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_upcase(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + int i, max; + + ntfs_debug("Entering."); + /* Read upcase table and setup vol->upcase and vol->upcase_len. */ + ino = ntfs_iget(sb, FILE_UpCase); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto upcase_failed; + } + /* + * The upcase size must not be above 64k Unicode characters, must not + * be zero and must be a multiple of sizeof(ntfschar). + */ + i_size = i_size_read(ino); + if (!i_size || i_size & (sizeof(ntfschar) - 1) || + i_size > 64ULL * 1024 * sizeof(ntfschar)) + goto iput_upcase_failed; + vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); + if (!vol->upcase) + goto iput_upcase_failed; + index = 0; + max_index = i_size >> PAGE_CACHE_SHIFT; + size = PAGE_CACHE_SIZE; + while (index < max_index) { + /* Read the upcase table and copy it into the linear buffer. */ +read_partial_upcase_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto iput_upcase_failed; + memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + }; + if (size == PAGE_CACHE_SIZE) { + size = i_size & ~PAGE_CACHE_MASK; + if (size) + goto read_partial_upcase_page; + } + vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; + ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", + i_size, 64 * 1024 * sizeof(ntfschar)); + iput(ino); + mutex_lock(&ntfs_lock); + if (!default_upcase) { + ntfs_debug("Using volume specified $UpCase since default is " + "not present."); + mutex_unlock(&ntfs_lock); + return true; + } + max = default_upcase_len; + if (max > vol->upcase_len) + max = vol->upcase_len; + for (i = 0; i < max; i++) + if (vol->upcase[i] != default_upcase[i]) + break; + if (i == max) { + ntfs_free(vol->upcase); + vol->upcase = default_upcase; + vol->upcase_len = max; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_debug("Volume specified $UpCase matches default. Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_debug("Using volume specified $UpCase since it does not match " + "the default."); + return true; +iput_upcase_failed: + iput(ino); + ntfs_free(vol->upcase); + vol->upcase = NULL; +upcase_failed: + mutex_lock(&ntfs_lock); + if (default_upcase) { + vol->upcase = default_upcase; + vol->upcase_len = default_upcase_len; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to load $UpCase from the volume. Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to initialize upcase table."); + return false; +} + +/* + * The lcn and mft bitmap inodes are NTFS-internal inodes with + * their own special locking rules: + */ +static struct lock_class_key + lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key, + mftbmp_runlist_lock_key, mftbmp_mrec_lock_key; + +/** + * load_system_files - open the system files using normal functions + * @vol: ntfs super block describing device whose system files to load + * + * Open the system files with normal access functions and complete setting up + * the ntfs super block @vol. + * + * Return 'true' on success or 'false' on error. + */ +static bool load_system_files(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; +#ifdef NTFS_RW + RESTART_PAGE_HEADER *rp; + int err; +#endif /* NTFS_RW */ + + ntfs_debug("Entering."); +#ifdef NTFS_RW + /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ + if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { + static const char *es1 = "Failed to load $MFTMirr"; + static const char *es2 = "$MFTMirr does not match $MFT"; + static const char *es3 = ". Run ntfsfix and/or chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + !vol->mftmirr_ino ? es1 : es2, + es3); + goto iput_mirr_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", + !vol->mftmirr_ino ? es1 : es2, es3); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", + !vol->mftmirr_ino ? es1 : es2, es3); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* Get mft bitmap attribute inode. */ + vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); + if (IS_ERR(vol->mftbmp_ino)) { + ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); + goto iput_mirr_err_out; + } + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock, + &mftbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock, + &mftbmp_mrec_lock_key); + /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ + if (!load_and_init_upcase(vol)) + goto iput_mftbmp_err_out; +#ifdef NTFS_RW + /* + * Read attribute definitions table and setup @vol->attrdef and + * @vol->attrdef_size. + */ + if (!load_and_init_attrdef(vol)) + goto iput_upcase_err_out; +#endif /* NTFS_RW */ + /* + * Get the cluster allocation bitmap inode and verify the size, no + * need for any locking at this stage as we are already running + * exclusively as we are mount in progress task. + */ + vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); + if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { + if (!IS_ERR(vol->lcnbmp_ino)) + iput(vol->lcnbmp_ino); + goto bitmap_failed; + } + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock, + &lcnbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock, + &lcnbmp_mrec_lock_key); + + NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); + if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { + iput(vol->lcnbmp_ino); +bitmap_failed: + ntfs_error(sb, "Failed to load $Bitmap."); + goto iput_attrdef_err_out; + } + /* + * Get the volume inode and setup our cache of the volume flags and + * version. + */ + vol->vol_ino = ntfs_iget(sb, FILE_Volume); + if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { + if (!IS_ERR(vol->vol_ino)) + iput(vol->vol_ino); +volume_failed: + ntfs_error(sb, "Failed to load $Volume."); + goto iput_lcnbmp_err_out; + } + m = map_mft_record(NTFS_I(vol->vol_ino)); + if (IS_ERR(m)) { +iput_volume_failed: + iput(vol->vol_ino); + goto volume_failed; + } + if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { + ntfs_error(sb, "Failed to get attribute search context."); + goto get_ctx_vol_failed; + } + if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx) || ctx->attr->non_resident || ctx->attr->flags) { +err_put_vol: + ntfs_attr_put_search_ctx(ctx); +get_ctx_vol_failed: + unmap_mft_record(NTFS_I(vol->vol_ino)); + goto iput_volume_failed; + } + vi = (VOLUME_INFORMATION*)((char*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Some bounds checks. */ + if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + + le32_to_cpu(ctx->attr->data.resident.value_length) > + (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) + goto err_put_vol; + /* Copy the volume flags and version to the ntfs_volume structure. */ + vol->vol_flags = vi->flags; + vol->major_ver = vi->major_ver; + vol->minor_ver = vi->minor_ver; + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(NTFS_I(vol->vol_ino)); + pr_info("volume version %i.%i.\n", vol->major_ver, + vol->minor_ver); + if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { + ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " + "volume version %i.%i (need at least version " + "3.0).", vol->major_ver, vol->minor_ver); + NVolClearSparseEnabled(vol); + } +#ifdef NTFS_RW + /* Make sure that no unsupported volume flags are set. */ + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + static const char *es1a = "Volume is dirty"; + static const char *es1b = "Volume has been modified by chkdsk"; + static const char *es1c = "Volume has unsupported flags set"; + static const char *es2a = ". Run chkdsk and mount in Windows."; + static const char *es2b = ". Mount in Windows."; + const char *es1, *es2; + + es2 = es2a; + if (vol->vol_flags & VOLUME_IS_DIRTY) + es1 = es1a; + else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + es1 = es1b; + es2 = es2b; + } else { + es1 = es1c; + ntfs_warning(sb, "Unsupported volume flags 0x%x " + "encountered.", + (unsigned)le16_to_cpu(vol->vol_flags)); + } + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_vol_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* + * Do not set NVolErrors() because ntfs_remount() re-checks the + * flags which we need to do in case any flags have changed. + */ + } + /* + * Get the inode for the logfile, check it and determine if the volume + * was shutdown cleanly. + */ + rp = NULL; + if (!load_and_check_logfile(vol, &rp) || + !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { + static const char *es1a = "Failed to load $LogFile"; + static const char *es1b = "$LogFile is not clean"; + static const char *es2 = ". Mount in Windows."; + const char *es1; + + es1 = !vol->logfile_ino ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + if (vol->logfile_ino) { + BUG_ON(!rp); + ntfs_free(rp); + } + goto iput_logfile_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + ntfs_free(rp); +#endif /* NTFS_RW */ + /* Get the root directory inode so we can do path lookups. */ + vol->root_ino = ntfs_iget(sb, FILE_root); + if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { + if (!IS_ERR(vol->root_ino)) + iput(vol->root_ino); + ntfs_error(sb, "Failed to load root directory."); + goto iput_logfile_err_out; + } +#ifdef NTFS_RW + /* + * Check if Windows is suspended to disk on the target volume. If it + * is hibernated, we must not write *anything* to the disk so set + * NVolErrors() without setting the dirty volume flag and mount + * read-only. This will prevent read-write remounting and it will also + * prevent all writes. + */ + err = check_windows_hibernation_status(vol); + if (unlikely(err)) { + static const char *es1a = "Failed to determine if Windows is " + "hibernated"; + static const char *es1b = "Windows is hibernated"; + static const char *es2 = ". Run chkdsk."; + const char *es1; + + es1 = err < 0 ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the volume dirty. */ + if (!(sb->s_flags & MS_RDONLY) && + ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + static const char *es1 = "Failed to set dirty bit in volume " + "information flags"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + /* + * Do not set NVolErrors() because ntfs_remount() might manage + * to set the dirty flag in which case all would be well. + */ + } +#if 0 + // TODO: Enable this code once we start modifying anything that is + // different between NTFS 1.2 and 3.x... + /* + * If (still) a read-write mount, set the NT4 compatibility flag on + * newer NTFS version volumes. + */ + if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) && + ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + static const char *es1 = "Failed to set NT4 compatibility flag"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif + /* If (still) a read-write mount, empty the logfile. */ + if (!(sb->s_flags & MS_RDONLY) && + !ntfs_empty_logfile(vol->logfile_ino)) { + static const char *es1 = "Failed to empty $LogFile"; + static const char *es2 = ". Mount in Windows."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* If on NTFS versions before 3.0, we are done. */ + if (unlikely(vol->major_ver < 3)) + return true; + /* NTFS 3.0+ specific initialization. */ + /* Get the security descriptors inode. */ + vol->secure_ino = ntfs_iget(sb, FILE_Secure); + if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { + if (!IS_ERR(vol->secure_ino)) + iput(vol->secure_ino); + ntfs_error(sb, "Failed to load $Secure."); + goto iput_root_err_out; + } + // TODO: Initialize security. + /* Get the extended system files' directory inode. */ + vol->extend_ino = ntfs_iget(sb, FILE_Extend); + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) { + if (!IS_ERR(vol->extend_ino)) + iput(vol->extend_ino); + ntfs_error(sb, "Failed to load $Extend."); + goto iput_sec_err_out; + } +#ifdef NTFS_RW + /* Find the quota file, load it if present, and set it up. */ + if (!load_and_init_quota(vol)) { + static const char *es1 = "Failed to load $Quota"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the quotas out of date. */ + if (!(sb->s_flags & MS_RDONLY) && + !ntfs_mark_quotas_out_of_date(vol)) { + static const char *es1 = "Failed to mark quotas out of date"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } + /* + * Find the transaction log file ($UsnJrnl), load it if present, check + * it, and set it up. + */ + if (!load_and_init_usnjrnl(vol)) { + static const char *es1 = "Failed to load $UsnJrnl"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, stamp the transaction log. */ + if (!(sb->s_flags & MS_RDONLY) && !ntfs_stamp_usnjrnl(vol)) { + static const char *es1 = "Failed to stamp transaction log " + "($UsnJrnl)"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + return true; +#ifdef NTFS_RW +iput_usnjrnl_err_out: + if (vol->usnjrnl_j_ino) + iput(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + iput(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + iput(vol->usnjrnl_ino); +iput_quota_err_out: + if (vol->quota_q_ino) + iput(vol->quota_q_ino); + if (vol->quota_ino) + iput(vol->quota_ino); + iput(vol->extend_ino); +#endif /* NTFS_RW */ +iput_sec_err_out: + iput(vol->secure_ino); +iput_root_err_out: + iput(vol->root_ino); +iput_logfile_err_out: +#ifdef NTFS_RW + if (vol->logfile_ino) + iput(vol->logfile_ino); +iput_vol_err_out: +#endif /* NTFS_RW */ + iput(vol->vol_ino); +iput_lcnbmp_err_out: + iput(vol->lcnbmp_ino); +iput_attrdef_err_out: + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } +#ifdef NTFS_RW +iput_upcase_err_out: +#endif /* NTFS_RW */ + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } +iput_mftbmp_err_out: + iput(vol->mftbmp_ino); +iput_mirr_err_out: +#ifdef NTFS_RW + if (vol->mftmirr_ino) + iput(vol->mftmirr_ino); +#endif /* NTFS_RW */ + return false; +} + +/** + * ntfs_put_super - called by the vfs to unmount a volume + * @sb: vfs superblock of volume to unmount + * + * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when + * the volume is being unmounted (umount system call has been invoked) and it + * releases all inodes and memory belonging to the NTFS specific part of the + * super block. + */ +static void ntfs_put_super(struct super_block *sb) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering."); + +#ifdef NTFS_RW + /* + * Commit all inodes while they are still open in case some of them + * cause others to be dirtied. + */ + ntfs_commit_inode(vol->vol_ino); + + /* NTFS 3.0+ specific. */ + if (vol->major_ver >= 3) { + if (vol->usnjrnl_j_ino) + ntfs_commit_inode(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + ntfs_commit_inode(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + ntfs_commit_inode(vol->usnjrnl_ino); + if (vol->quota_q_ino) + ntfs_commit_inode(vol->quota_q_ino); + if (vol->quota_ino) + ntfs_commit_inode(vol->quota_ino); + if (vol->extend_ino) + ntfs_commit_inode(vol->extend_ino); + if (vol->secure_ino) + ntfs_commit_inode(vol->secure_ino); + } + + ntfs_commit_inode(vol->root_ino); + + down_write(&vol->lcnbmp_lock); + ntfs_commit_inode(vol->lcnbmp_ino); + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + ntfs_commit_inode(vol->mftbmp_ino); + up_write(&vol->mftbmp_lock); + + if (vol->logfile_ino) + ntfs_commit_inode(vol->logfile_ino); + + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + + /* + * If a read-write mount and no volume errors have occurred, mark the + * volume clean. Also, re-commit all affected inodes. + */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + ntfs_commit_inode(vol->vol_ino); + ntfs_commit_inode(vol->root_ino); + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + } else { + ntfs_warning(sb, "Volume has errors. Leaving volume " + "marked dirty. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + iput(vol->vol_ino); + vol->vol_ino = NULL; + + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + + iput(vol->root_ino); + vol->root_ino = NULL; + + down_write(&vol->lcnbmp_lock); + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; + up_write(&vol->mftbmp_lock); + +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + /* Re-commit the mft mirror and mft just in case. */ + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } + /* + * We should have no dirty inodes left, due to + * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. + */ + ntfs_commit_inode(vol->mft_ino); + write_inode_now(vol->mft_ino, 1); +#endif /* NTFS_RW */ + + iput(vol->mft_ino); + vol->mft_ino = NULL; + + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + /* + * Destroy the global default upcase table if necessary. Also decrease + * the number of upcase users if we are a user. + */ + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + if (!ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + + unload_nls(vol->nls_map); + + sb->s_fs_info = NULL; + kfree(vol); +} + +/** + * get_nr_free_clusters - return the number of free clusters on a volume + * @vol: ntfs volume for which to obtain free cluster count + * + * Calculate the number of free clusters on the mounted NTFS volume @vol. We + * actually calculate the number of clusters in use instead because this + * allows us to not care about partial pages as these will be just zero filled + * and hence not be counted as allocated clusters. + * + * The only particularity is that clusters beyond the end of the logical ntfs + * volume will be marked as allocated to prevent errors which means we have to + * discount those at the end. This is important as the cluster bitmap always + * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside + * the logical volume and marked in use when they are not as they do not exist. + * + * If any pages cannot be read we assume all clusters in the erroring pages are + * in use. This means we return an underestimate on errors which is better than + * an overestimate. + */ +static s64 get_nr_free_clusters(ntfs_volume *vol) +{ + s64 nr_free = vol->nr_clusters; + struct address_space *mapping = vol->lcnbmp_ino->i_mapping; + struct page *page; + pgoff_t index, max_index; + + ntfs_debug("Entering."); + /* Serialize accesses to the cluster bitmap. */ + down_read(&vol->lcnbmp_lock); + /* + * Convert the number of bits into bytes rounded up, then convert into + * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one + * full and one partial page max_index = 2. + */ + max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", + max_index, PAGE_CACHE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_CACHE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). + */ + nr_free -= bitmap_weight(kaddr, + PAGE_CACHE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr); + page_cache_release(page); + } + ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); + /* + * Fixup for eventual bits outside logical ntfs volume (see function + * description above). + */ + if (vol->nr_clusters & 63) + nr_free += 64 - (vol->nr_clusters & 63); + up_read(&vol->lcnbmp_lock); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * __get_nr_free_mft_records - return the number of free inodes on a volume + * @vol: ntfs volume for which to obtain free inode count + * @nr_free: number of mft records in filesystem + * @max_index: maximum number of pages containing set bits + * + * Calculate the number of free mft records (inodes) on the mounted NTFS + * volume @vol. We actually calculate the number of mft records in use instead + * because this allows us to not care about partial pages as these will be just + * zero filled and hence not be counted as allocated mft record. + * + * If any pages cannot be read we assume all mft records in the erroring pages + * are in use. This means we return an underestimate on errors which is better + * than an overestimate. + * + * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. + */ +static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, + s64 nr_free, const pgoff_t max_index) +{ + struct address_space *mapping = vol->mftbmp_ino->i_mapping; + struct page *page; + pgoff_t index; + + ntfs_debug("Entering."); + /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " + "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_CACHE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). + */ + nr_free -= bitmap_weight(kaddr, + PAGE_CACHE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr); + page_cache_release(page); + } + ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", + index - 1); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * ntfs_statfs - return information about mounted NTFS volume + * @dentry: dentry from mounted volume + * @sfs: statfs structure in which to return the information + * + * Return information about the mounted NTFS volume @dentry in the statfs structure + * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is + * called). We interpret the values to be correct of the moment in time at + * which we are called. Most values are variable otherwise and this isn't just + * the free values but the totals as well. For example we can increase the + * total number of file nodes if we run out and we can keep doing this until + * there is no more space on the volume left at all. + * + * Called from vfs_statfs which is used to handle the statfs, fstatfs, and + * ustat system calls. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) +{ + struct super_block *sb = dentry->d_sb; + s64 size; + ntfs_volume *vol = NTFS_SB(sb); + ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); + pgoff_t max_index; + unsigned long flags; + + ntfs_debug("Entering."); + /* Type of filesystem. */ + sfs->f_type = NTFS_SB_MAGIC; + /* Optimal transfer block size. */ + sfs->f_bsize = PAGE_CACHE_SIZE; + /* + * Total data blocks in filesystem in units of f_bsize and since + * inodes are also stored in data blocs ($MFT is a file) this is just + * the total clusters. + */ + sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT; + /* Free data blocks in filesystem in units of f_bsize. */ + size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT; + if (size < 0LL) + size = 0LL; + /* Free blocks avail to non-superuser, same as above on NTFS. */ + sfs->f_bavail = sfs->f_bfree = size; + /* Serialize accesses to the inode bitmap. */ + down_read(&vol->mftbmp_lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; + /* + * Convert the maximum number of set bits into bytes rounded up, then + * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we + * have one full and one partial page max_index = 2. + */ + max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) + + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Number of inodes in filesystem (at this point in time). */ + sfs->f_files = size; + /* Free inodes in fs (based on current total count). */ + sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); + up_read(&vol->mftbmp_lock); + /* + * File system id. This is extremely *nix flavour dependent and even + * within Linux itself all fs do their own thing. I interpret this to + * mean a unique id associated with the mounted fs and not the id + * associated with the filesystem driver, the latter is already given + * by the filesystem type in sfs->f_type. Thus we use the 64-bit + * volume serial number splitting it into two 32-bit parts. We enter + * the least significant 32-bits in f_fsid[0] and the most significant + * 32-bits in f_fsid[1]. + */ + sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff; + sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff; + /* Maximum length of filenames. */ + sfs->f_namelen = NTFS_MAX_NAME_LEN; + return 0; +} + +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + +/** + * The complete super operations. + */ +static const struct super_operations ntfs_sops = { + .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ + .destroy_inode = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */ +#ifdef NTFS_RW + .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to + disk. */ +#endif /* NTFS_RW */ + .put_super = ntfs_put_super, /* Syscall: umount. */ + .statfs = ntfs_statfs, /* Syscall: statfs */ + .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ + .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is + removed from memory. */ + .show_options = ntfs_show_options, /* Show mount options in + proc. */ +}; + +/** + * ntfs_fill_super - mount an ntfs filesystem + * @sb: super block of ntfs filesystem to mount + * @opt: string containing the mount options + * @silent: silence error output + * + * ntfs_fill_super() is called by the VFS to mount the device described by @sb + * with the mount otions in @data with the NTFS filesystem. + * + * If @silent is true, remain silent even if errors are detected. This is used + * during bootup, when the kernel tries to mount the root filesystem with all + * registered filesystems one after the other until one succeeds. This implies + * that all filesystems except the correct one will quite correctly and + * expectedly return an error, but nobody wants to see error messages when in + * fact this is what is supposed to happen. + * + * NOTE: @sb->s_flags contains the mount options flags. + */ +static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) +{ + ntfs_volume *vol; + struct buffer_head *bh; + struct inode *tmp_ino; + int blocksize, result; + + /* + * We do a pretty difficult piece of bootstrap by reading the + * MFT (and other metadata) from disk into memory. We'll only + * release this metadata during umount, so the locking patterns + * observed during bootstrap do not count. So turn off the + * observation of locking patterns (strictly for this context + * only) while mounting NTFS. [The validator is still active + * otherwise, even for this context: it will for example record + * lock class registrations.] + */ + lockdep_off(); + ntfs_debug("Entering."); +#ifndef NTFS_RW + sb->s_flags |= MS_RDONLY; +#endif /* ! NTFS_RW */ + /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ + sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); + vol = NTFS_SB(sb); + if (!vol) { + if (!silent) + ntfs_error(sb, "Allocation of NTFS volume structure " + "failed. Aborting mount..."); + lockdep_on(); + return -ENOMEM; + } + /* Initialize ntfs_volume structure. */ + *vol = (ntfs_volume) { + .sb = sb, + /* + * Default is group and other don't have any access to files or + * directories while owner has full access. Further, files by + * default are not executable but directories are of course + * browseable. + */ + .fmask = 0177, + .dmask = 0077, + }; + init_rwsem(&vol->mftbmp_lock); + init_rwsem(&vol->lcnbmp_lock); + + /* By default, enable sparse support. */ + NVolSetSparseEnabled(vol); + + /* Important to get the mount options dealt with now. */ + if (!parse_options(vol, (char*)opt)) + goto err_out_now; + + /* We support sector sizes up to the PAGE_CACHE_SIZE. */ + if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (!silent) + ntfs_error(sb, "Device has unsupported sector size " + "(%i). The maximum supported sector " + "size on this architecture is %lu " + "bytes.", + bdev_logical_block_size(sb->s_bdev), + PAGE_CACHE_SIZE); + goto err_out_now; + } + /* + * Setup the device access block size to NTFS_BLOCK_SIZE or the hard + * sector size, whichever is bigger. + */ + blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); + if (blocksize < NTFS_BLOCK_SIZE) { + if (!silent) + ntfs_error(sb, "Unable to set device block size."); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + ntfs_debug("Set device block size to %i bytes (block size bits %i).", + blocksize, sb->s_blocksize_bits); + /* Determine the size of the device in units of block_size bytes. */ + if (!i_size_read(sb->s_bdev->bd_inode)) { + if (!silent) + ntfs_error(sb, "Unable to determine device size."); + goto err_out_now; + } + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + /* Read the boot sector and return unlocked buffer head to it. */ + if (!(bh = read_ntfs_boot_sector(sb, silent))) { + if (!silent) + ntfs_error(sb, "Not an NTFS volume."); + goto err_out_now; + } + /* + * Extract the data from the boot sector and setup the ntfs volume + * using it. + */ + result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); + brelse(bh); + if (!result) { + if (!silent) + ntfs_error(sb, "Unsupported NTFS filesystem."); + goto err_out_now; + } + /* + * If the boot sector indicates a sector size bigger than the current + * device block size, switch the device block size to the sector size. + * TODO: It may be possible to support this case even when the set + * below fails, we would just be breaking up the i/o for each sector + * into multiple blocks for i/o purposes but otherwise it should just + * work. However it is safer to leave disabled until someone hits this + * error message and then we can get them to try it without the setting + * so we know for sure that it works. + */ + if (vol->sector_size > blocksize) { + blocksize = sb_set_blocksize(sb, vol->sector_size); + if (blocksize != vol->sector_size) { + if (!silent) + ntfs_error(sb, "Unable to set device block " + "size to sector size (%i).", + vol->sector_size); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + ntfs_debug("Changed device block size to %i bytes (block size " + "bits %i) to match volume sector size.", + blocksize, sb->s_blocksize_bits); + } + /* Initialize the cluster and mft allocators. */ + ntfs_setup_allocators(vol); + /* Setup remaining fields in the super block. */ + sb->s_magic = NTFS_SB_MAGIC; + /* + * Ntfs allows 63 bits for the file size, i.e. correct would be: + * sb->s_maxbytes = ~0ULL >> 1; + * But the kernel uses a long as the page cache page index which on + * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel + * defined to the maximum the page cache page index can cope with + * without overflowing the index or to 2^63 - 1, whichever is smaller. + */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Ntfs measures time in 100ns intervals. */ + sb->s_time_gran = 100; + /* + * Now load the metadata required for the page cache and our address + * space operations to function. We do this by setting up a specialised + * read_inode method and then just calling the normal iget() to obtain + * the inode for $MFT which is sufficient to allow our normal inode + * operations and associated address space operations to function. + */ + sb->s_op = &ntfs_sops; + tmp_ino = new_inode(sb); + if (!tmp_ino) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto err_out_now; + } + tmp_ino->i_ino = FILE_MFT; + insert_inode_hash(tmp_ino); + if (ntfs_read_inode_mount(tmp_ino) < 0) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto iput_tmp_ino_err_out_now; + } + mutex_lock(&ntfs_lock); + /* + * The current mount is a compression user if the cluster size is + * less than or equal 4kiB. + */ + if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { + result = allocate_compression_buffers(); + if (result) { + ntfs_error(NULL, "Failed to allocate buffers " + "for compression engine."); + ntfs_nr_compression_users--; + mutex_unlock(&ntfs_lock); + goto iput_tmp_ino_err_out_now; + } + } + /* + * Generate the global default upcase table if necessary. Also + * temporarily increment the number of upcase users to avoid race + * conditions with concurrent (u)mounts. + */ + if (!default_upcase) + default_upcase = generate_default_upcase(); + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + /* + * From now on, ignore @silent parameter. If we fail below this line, + * it will be due to a corrupt fs or a system error, so we report it. + */ + /* + * Open the system files with normal access functions and complete + * setting up the ntfs super block. + */ + if (!load_system_files(vol)) { + ntfs_error(sb, "Failed to load system files."); + goto unl_upcase_iput_tmp_ino_err_out_now; + } + + /* We grab a reference, simulating an ntfs_iget(). */ + ihold(vol->root_ino); + if ((sb->s_root = d_make_root(vol->root_ino))) { + ntfs_debug("Exiting, status successful."); + /* Release the default upcase if it has no users. */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + mutex_unlock(&ntfs_lock); + sb->s_export_op = &ntfs_export_ops; + lockdep_on(); + return 0; + } + ntfs_error(sb, "Failed to allocate root directory."); + /* Clean up after the successful load_system_files() call from above. */ + // TODO: Use ntfs_put_super() instead of repeating all this code... + // FIXME: Should mark the volume clean as the error is most likely + // -ENOMEM. + iput(vol->vol_ino); + vol->vol_ino = NULL; + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + iput(vol->root_ino); + vol->root_ino = NULL; + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } +#endif /* NTFS_RW */ + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + if (vol->nls_map) { + unload_nls(vol->nls_map); + vol->nls_map = NULL; + } + /* Error exit code path. */ +unl_upcase_iput_tmp_ino_err_out_now: + /* + * Decrease the number of upcase users and destroy the global default + * upcase table if necessary. + */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); +iput_tmp_ino_err_out_now: + iput(tmp_ino); + if (vol->mft_ino && vol->mft_ino != tmp_ino) + iput(vol->mft_ino); + vol->mft_ino = NULL; + /* Errors at this stage are irrelevant. */ +err_out_now: + sb->s_fs_info = NULL; + kfree(vol); + ntfs_debug("Failed, returning -EINVAL."); + lockdep_on(); + return -EINVAL; +} + +/* + * This is a slab cache to optimize allocations and deallocations of Unicode + * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN + * (255) Unicode characters + a terminating NULL Unicode character. + */ +struct kmem_cache *ntfs_name_cache; + +/* Slab caches for efficient allocation/deallocation of inodes. */ +struct kmem_cache *ntfs_inode_cache; +struct kmem_cache *ntfs_big_inode_cache; + +/* Init once constructor for the inode slab cache. */ +static void ntfs_big_inode_init_once(void *foo) +{ + ntfs_inode *ni = (ntfs_inode *)foo; + + inode_init_once(VFS_I(ni)); +} + +/* + * Slab caches to optimize allocations and deallocations of attribute search + * contexts and index contexts, respectively. + */ +struct kmem_cache *ntfs_attr_ctx_cache; +struct kmem_cache *ntfs_index_ctx_cache; + +/* Driver wide mutex. */ +DEFINE_MUTEX(ntfs_lock); + +static struct dentry *ntfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); +} + +static struct file_system_type ntfs_fs_type = { + .owner = THIS_MODULE, + .name = "ntfs", + .mount = ntfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ntfs"); + +/* Stable names for the slab caches. */ +static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; +static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache"; +static const char ntfs_name_cache_name[] = "ntfs_name_cache"; +static const char ntfs_inode_cache_name[] = "ntfs_inode_cache"; +static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache"; + +static int __init init_ntfs_fs(void) +{ + int err = 0; + + /* This may be ugly but it results in pretty output so who cares. (-8 */ + pr_info("driver " NTFS_VERSION " [Flags: R/" +#ifdef NTFS_RW + "W" +#else + "O" +#endif +#ifdef DEBUG + " DEBUG" +#endif +#ifdef MODULE + " MODULE" +#endif + "].\n"); + + ntfs_debug("Debug messages are enabled."); + + ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, + sizeof(ntfs_index_context), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_index_ctx_cache) { + pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); + goto ictx_err_out; + } + ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, + sizeof(ntfs_attr_search_ctx), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_attr_ctx_cache) { + pr_crit("NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); + goto actx_err_out; + } + + ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, + (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ntfs_name_cache) { + pr_crit("Failed to create %s!\n", ntfs_name_cache_name); + goto name_err_out; + } + + ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, + sizeof(ntfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!ntfs_inode_cache) { + pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); + goto inode_err_out; + } + + ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, + sizeof(big_ntfs_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + ntfs_big_inode_init_once); + if (!ntfs_big_inode_cache) { + pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); + goto big_inode_err_out; + } + + /* Register the ntfs sysctls. */ + err = ntfs_sysctl(1); + if (err) { + pr_crit("Failed to register NTFS sysctls!\n"); + goto sysctl_err_out; + } + + err = register_filesystem(&ntfs_fs_type); + if (!err) { + ntfs_debug("NTFS driver registered successfully."); + return 0; /* Success! */ + } + pr_crit("Failed to register NTFS filesystem driver!\n"); + + /* Unregister the ntfs sysctls. */ + ntfs_sysctl(0); +sysctl_err_out: + kmem_cache_destroy(ntfs_big_inode_cache); +big_inode_err_out: + kmem_cache_destroy(ntfs_inode_cache); +inode_err_out: + kmem_cache_destroy(ntfs_name_cache); +name_err_out: + kmem_cache_destroy(ntfs_attr_ctx_cache); +actx_err_out: + kmem_cache_destroy(ntfs_index_ctx_cache); +ictx_err_out: + if (!err) { + pr_crit("Aborting NTFS filesystem driver registration...\n"); + err = -ENOMEM; + } + return err; +} + +static void __exit exit_ntfs_fs(void) +{ + ntfs_debug("Unregistering NTFS driver."); + + unregister_filesystem(&ntfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ntfs_big_inode_cache); + kmem_cache_destroy(ntfs_inode_cache); + kmem_cache_destroy(ntfs_name_cache); + kmem_cache_destroy(ntfs_attr_ctx_cache); + kmem_cache_destroy(ntfs_index_ctx_cache); + /* Unregister the ntfs sysctls. */ + ntfs_sysctl(0); +} + +MODULE_AUTHOR("Anton Altaparmakov "); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); +MODULE_VERSION(NTFS_VERSION); +MODULE_LICENSE("GPL"); +#ifdef DEBUG +module_param(debug_msgs, bint, 0); +MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); +#endif + +module_init(init_ntfs_fs) +module_exit(exit_ntfs_fs) diff --git a/kernel/fs/ntfs/sysctl.c b/kernel/fs/ntfs/sysctl.c new file mode 100644 index 000000000..a503156ec --- /dev/null +++ b/kernel/fs/ntfs/sysctl.c @@ -0,0 +1,83 @@ +/* + * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef DEBUG + +#include + +#ifdef CONFIG_SYSCTL + +#include +#include + +#include "sysctl.h" +#include "debug.h" + +/* Definition of the ntfs sysctl. */ +static struct ctl_table ntfs_sysctls[] = { + { + .procname = "ntfs-debug", + .data = &debug_msgs, /* Data pointer and size. */ + .maxlen = sizeof(debug_msgs), + .mode = 0644, /* Mode, proc handler. */ + .proc_handler = proc_dointvec + }, + {} +}; + +/* Define the parent directory /proc/sys/fs. */ +static struct ctl_table sysctls_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = ntfs_sysctls + }, + {} +}; + +/* Storage for the sysctls header. */ +static struct ctl_table_header *sysctls_root_table; + +/** + * ntfs_sysctl - add or remove the debug sysctl + * @add: add (1) or remove (0) the sysctl + * + * Add or remove the debug sysctl. Return 0 on success or -errno on error. + */ +int ntfs_sysctl(int add) +{ + if (add) { + BUG_ON(sysctls_root_table); + sysctls_root_table = register_sysctl_table(sysctls_root); + if (!sysctls_root_table) + return -ENOMEM; + } else { + BUG_ON(!sysctls_root_table); + unregister_sysctl_table(sysctls_root_table); + sysctls_root_table = NULL; + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ +#endif /* DEBUG */ diff --git a/kernel/fs/ntfs/sysctl.h b/kernel/fs/ntfs/sysctl.h new file mode 100644 index 000000000..d4f8ce920 --- /dev/null +++ b/kernel/fs/ntfs/sysctl.h @@ -0,0 +1,41 @@ +/* + * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_SYSCTL_H +#define _LINUX_NTFS_SYSCTL_H + + +#if defined(DEBUG) && defined(CONFIG_SYSCTL) + +extern int ntfs_sysctl(int add); + +#else + +/* Just return success. */ +static inline int ntfs_sysctl(int add) +{ + return 0; +} + +#endif /* DEBUG && CONFIG_SYSCTL */ +#endif /* _LINUX_NTFS_SYSCTL_H */ diff --git a/kernel/fs/ntfs/time.h b/kernel/fs/ntfs/time.h new file mode 100644 index 000000000..01233989d --- /dev/null +++ b/kernel/fs/ntfs/time.h @@ -0,0 +1,100 @@ +/* + * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_TIME_H +#define _LINUX_NTFS_TIME_H + +#include /* For current_kernel_time(). */ +#include /* For do_div(). */ + +#include "endian.h" + +#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) + +/** + * utc2ntfs - convert Linux UTC time to NTFS time + * @ts: Linux UTC time to convert to NTFS time + * + * Convert the Linux UTC time @ts to its corresponding NTFS time and return + * that in little endian format. + * + * Linux stores time in a struct timespec consisting of a time_t (long at + * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second + * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of + * 1-nano-second intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100-nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline sle64 utc2ntfs(const struct timespec ts) +{ + /* + * Convert the seconds to 100ns intervals, add the nano-seconds + * converted to 100ns intervals, and then add the NTFS time offset. + */ + return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + + NTFS_TIME_OFFSET); +} + +/** + * get_current_ntfs_time - get the current time in little endian NTFS format + * + * Get the current time from the Linux kernel, convert it to its corresponding + * NTFS time and return that in little endian format. + */ +static inline sle64 get_current_ntfs_time(void) +{ + return utc2ntfs(current_kernel_time()); +} + +/** + * ntfs2utc - convert NTFS time to Linux time + * @time: NTFS time (little endian) to convert to Linux UTC + * + * Convert the little endian NTFS time @time to its corresponding Linux UTC + * time and return that in cpu format. + * + * Linux stores time in a struct timespec consisting of a time_t (long at + * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second + * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of + * 1-nano-second intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100 nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline struct timespec ntfs2utc(const sle64 time) +{ + struct timespec ts; + + /* Subtract the NTFS time offset. */ + u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); + /* + * Convert the time to 1-second intervals and the remainder to + * 1-nano-second intervals. + */ + ts.tv_nsec = do_div(t, 10000000) * 100; + ts.tv_sec = t; + return ts; +} + +#endif /* _LINUX_NTFS_TIME_H */ diff --git a/kernel/fs/ntfs/types.h b/kernel/fs/ntfs/types.h new file mode 100644 index 000000000..8c8053b66 --- /dev/null +++ b/kernel/fs/ntfs/types.h @@ -0,0 +1,69 @@ +/* + * types.h - Defines for NTFS Linux kernel driver specific types. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_TYPES_H +#define _LINUX_NTFS_TYPES_H + +#include + +typedef __le16 le16; +typedef __le32 le32; +typedef __le64 le64; +typedef __u16 __bitwise sle16; +typedef __u32 __bitwise sle32; +typedef __u64 __bitwise sle64; + +/* 2-byte Unicode character type. */ +typedef le16 ntfschar; +#define UCHAR_T_SIZE_BITS 1 + +/* + * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN + * and VCN, to allow for type checking and better code readability. + */ +typedef s64 VCN; +typedef sle64 leVCN; +typedef s64 LCN; +typedef sle64 leLCN; + +/* + * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit + * values. We define our own type LSN, to allow for type checking and better + * code readability. + */ +typedef s64 LSN; +typedef sle64 leLSN; + +/* + * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values. + * We define our own type USN, to allow for type checking and better code + * readability. + */ +typedef s64 USN; +typedef sle64 leUSN; + +typedef enum { + CASE_SENSITIVE = 0, + IGNORE_CASE = 1, +} IGNORE_CASE_BOOL; + +#endif /* _LINUX_NTFS_TYPES_H */ diff --git a/kernel/fs/ntfs/unistr.c b/kernel/fs/ntfs/unistr.c new file mode 100644 index 000000000..005ca4b0f --- /dev/null +++ b/kernel/fs/ntfs/unistr.c @@ -0,0 +1,398 @@ +/* + * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include "types.h" +#include "debug.h" +#include "ntfs.h" + +/* + * IMPORTANT + * ========= + * + * All these routines assume that the Unicode characters are in little endian + * encoding inside the strings!!! + */ + +/* + * This is used by the name collation functions to quickly determine what + * characters are (in)valid. + */ +static const u8 legal_ansi_char_array[0x40] = { + 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, + + 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, +}; + +/** + * ntfs_are_names_equal - compare two Unicode names for equality + * @s1: name to compare to @s2 + * @s1_len: length in Unicode characters of @s1 + * @s2: name to compare to @s1 + * @s2_len: length in Unicode characters of @s2 + * @ic: ignore case bool + * @upcase: upcase table (only if @ic == IGNORE_CASE) + * @upcase_size: length in Unicode characters of @upcase (if present) + * + * Compare the names @s1 and @s2 and return 'true' (1) if the names are + * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, + * the @upcase table is used to performa a case insensitive comparison. + */ +bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size) +{ + if (s1_len != s2_len) + return false; + if (ic == CASE_SENSITIVE) + return !ntfs_ucsncmp(s1, s2, s1_len); + return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); +} + +/** + * ntfs_collate_names - collate two Unicode names + * @name1: first Unicode name to compare + * @name2: second Unicode name to compare + * @err_val: if @name1 contains an invalid character return this value + * @ic: either CASE_SENSITIVE or IGNORE_CASE + * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) + * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) + * + * ntfs_collate_names collates two Unicode names and returns: + * + * -1 if the first name collates before the second one, + * 0 if the names match, + * 1 if the second name collates before the first one, or + * @err_val if an invalid character is found in @name1 during the comparison. + * + * The following characters are considered invalid: '"', '*', '<', '>' and '?'. + */ +int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + u32 cnt, min_len; + u16 c1, c2; + + min_len = name1_len; + if (name1_len > name2_len) + min_len = name2_len; + for (cnt = 0; cnt < min_len; ++cnt) { + c1 = le16_to_cpu(*name1++); + c2 = le16_to_cpu(*name2++); + if (ic) { + if (c1 < upcase_len) + c1 = le16_to_cpu(upcase[c1]); + if (c2 < upcase_len) + c2 = le16_to_cpu(upcase[c2]); + } + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + } + if (name1_len < name2_len) + return -1; + if (name1_len == name2_len) + return 0; + /* name1_len > name2_len */ + c1 = le16_to_cpu(*name1); + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + return 1; +} + +/** + * ntfs_ucsncmp - compare two little endian Unicode strings + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * The strings in little endian format and appropriate le16_to_cpu() + * conversion is performed on non-little endian machines. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. + */ +int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) +{ + u16 c1, c2; + size_t i; + + for (i = 0; i < n; ++i) { + c1 = le16_to_cpu(s1[i]); + c2 = le16_to_cpu(s2[i]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +/** + * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * @upcase: upcase table + * @upcase_size: upcase table size in Unicode characters + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * ignoring case. The strings in little endian format and appropriate + * le16_to_cpu() conversion is performed on non-little endian machines. + * + * Each character is uppercased using the @upcase table before the comparison. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. + */ +int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size) +{ + size_t i; + u16 c1, c2; + + for (i = 0; i < n; ++i) { + if ((c1 = le16_to_cpu(s1[i])) < upcase_size) + c1 = le16_to_cpu(upcase[c1]); + if ((c2 = le16_to_cpu(s2[i])) < upcase_size) + c2 = le16_to_cpu(upcase[c2]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, + const u32 upcase_len) +{ + u32 i; + u16 u; + + for (i = 0; i < name_len; i++) + if ((u = le16_to_cpu(name[i])) < upcase_len) + name[i] = upcase[u]; +} + +void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len) +{ + ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, + file_name_attr->file_name_length, upcase, upcase_len); +} + +int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, + file_name_attr1->file_name_length, + (ntfschar*)&file_name_attr2->file_name, + file_name_attr2->file_name_length, + err_val, ic, upcase, upcase_len); +} + +/** + * ntfs_nlstoucs - convert NLS string to little endian Unicode string + * @vol: ntfs volume which we are working with + * @ins: input NLS string buffer + * @ins_len: length of input string in bytes + * @outs: on return contains the allocated output Unicode string buffer + * + * Convert the input string @ins, which is in whatever format the loaded NLS + * map dictates, into a little endian, 2-byte Unicode string. + * + * This function allocates the string and the caller is responsible for + * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. + * + * On success the function returns the number of Unicode characters written to + * the output string *@outs (>= 0), not counting the terminating Unicode NULL + * character. *@outs is set to the allocated output string buffer. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. Both *@outs and *@outs_len + * are then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs) +{ + struct nls_table *nls = vol->nls_map; + ntfschar *ucs; + wchar_t wc; + int i, o, wc_len; + + /* We do not trust outside sources. */ + if (likely(ins)) { + ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); + if (likely(ucs)) { + for (i = o = 0; i < ins_len; i += wc_len) { + wc_len = nls->char2uni(ins + i, ins_len - i, + &wc); + if (likely(wc_len >= 0 && + o < NTFS_MAX_NAME_LEN)) { + if (likely(wc)) { + ucs[o++] = cpu_to_le16(wc); + continue; + } /* else if (!wc) */ + break; + } /* else if (wc_len < 0 || + o >= NTFS_MAX_NAME_LEN) */ + goto name_err; + } + ucs[o] = 0; + *outs = ucs; + return o; + } /* else if (!ucs) */ + ntfs_error(vol->sb, "Failed to allocate buffer for converted " + "name from ntfs_name_cache."); + return -ENOMEM; + } /* else if (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +name_err: + kmem_cache_free(ntfs_name_cache, ucs); + if (wc_len < 0) { + ntfs_error(vol->sb, "Name using character set %s contains " + "characters that cannot be converted to " + "Unicode.", nls->charset); + i = -EILSEQ; + } else /* if (o >= NTFS_MAX_NAME_LEN) */ { + ntfs_error(vol->sb, "Name is too long (maximum length for a " + "name on NTFS is %d Unicode characters.", + NTFS_MAX_NAME_LEN); + i = -ENAMETOOLONG; + } + return i; +} + +/** + * ntfs_ucstonls - convert little endian Unicode string to NLS string + * @vol: ntfs volume which we are working with + * @ins: input Unicode string buffer + * @ins_len: length of input string in Unicode characters + * @outs: on return contains the (allocated) output NLS string buffer + * @outs_len: length of output string buffer in bytes + * + * Convert the input little endian, 2-byte Unicode string @ins, of length + * @ins_len into the string format dictated by the loaded NLS. + * + * If *@outs is NULL, this function allocates the string and the caller is + * responsible for calling kfree(*@outs); when finished with it. In this case + * @outs_len is ignored and can be 0. + * + * On success the function returns the number of bytes written to the output + * string *@outs (>= 0), not counting the terminating NULL byte. If the output + * string buffer was allocated, *@outs is set to it. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. The contents of *@outs are + * then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len) +{ + struct nls_table *nls = vol->nls_map; + unsigned char *ns; + int i, o, ns_len, wc; + + /* We don't trust outside sources. */ + if (ins) { + ns = *outs; + ns_len = outs_len; + if (ns && !ns_len) { + wc = -ENAMETOOLONG; + goto conversion_err; + } + if (!ns) { + ns_len = ins_len * NLS_MAX_CHARSET_SIZE; + ns = kmalloc(ns_len + 1, GFP_NOFS); + if (!ns) + goto mem_err_out; + } + for (i = o = 0; i < ins_len; i++) { +retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, + ns_len - o); + if (wc > 0) { + o += wc; + continue; + } else if (!wc) + break; + else if (wc == -ENAMETOOLONG && ns != *outs) { + unsigned char *tc; + /* Grow in multiples of 64 bytes. */ + tc = kmalloc((ns_len + 64) & + ~63, GFP_NOFS); + if (tc) { + memcpy(tc, ns, ns_len); + ns_len = ((ns_len + 64) & ~63) - 1; + kfree(ns); + ns = tc; + goto retry; + } /* No memory so goto conversion_error; */ + } /* wc < 0, real error. */ + goto conversion_err; + } + ns[o] = 0; + *outs = ns; + return o; + } /* else (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +conversion_err: + ntfs_error(vol->sb, "Unicode name contains characters that cannot be " + "converted to character set %s. You might want to " + "try to use the mount option nls=utf8.", nls->charset); + if (ns != *outs) + kfree(ns); + if (wc != -ENAMETOOLONG) + wc = -EILSEQ; + return wc; +mem_err_out: + ntfs_error(vol->sb, "Failed to allocate name!"); + return -ENOMEM; +} diff --git a/kernel/fs/ntfs/upcase.c b/kernel/fs/ntfs/upcase.c new file mode 100644 index 000000000..e2f72ca98 --- /dev/null +++ b/kernel/fs/ntfs/upcase.c @@ -0,0 +1,87 @@ +/* + * upcase.c - Generate the full NTFS Unicode upcase table in little endian. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001 Richard Russon + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS source + * in the file COPYING); if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "malloc.h" +#include "ntfs.h" + +ntfschar *generate_default_upcase(void) +{ + static const int uc_run_table[][3] = { /* Start, End, Add */ + {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, + {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, + {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, + {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, + {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, + {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, + {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, + {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, + {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, + {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, + {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, + {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, + {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, + {0} + }; + + static const int uc_dup_table[][2] = { /* Start, End */ + {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, + {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, + {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, + {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, + {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, + {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, + {0} + }; + + static const int uc_word_table[][2] = { /* Offset, Value */ + {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, + {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, + {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, + {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, + {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, + {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, + {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, + {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, + {0} + }; + + int i, r; + ntfschar *uc; + + uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar)); + if (!uc) + return uc; + memset(uc, 0, default_upcase_len * sizeof(ntfschar)); + /* Generate the little endian Unicode upcase table used by ntfs. */ + for (i = 0; i < default_upcase_len; i++) + uc[i] = cpu_to_le16(i); + for (r = 0; uc_run_table[r][0]; r++) + for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) + le16_add_cpu(&uc[i], uc_run_table[r][2]); + for (r = 0; uc_dup_table[r][0]; r++) + for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) + le16_add_cpu(&uc[i + 1], -1); + for (r = 0; uc_word_table[r][0]; r++) + uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); + return uc; +} diff --git a/kernel/fs/ntfs/usnjrnl.c b/kernel/fs/ntfs/usnjrnl.c new file mode 100644 index 000000000..b2bc0d55b --- /dev/null +++ b/kernel/fs/ntfs/usnjrnl.c @@ -0,0 +1,84 @@ +/* + * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include +#include +#include + +#include "aops.h" +#include "debug.h" +#include "endian.h" +#include "time.h" +#include "types.h" +#include "usnjrnl.h" +#include "volume.h" + +/** + * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume + * @vol: ntfs volume on which to stamp the transaction log + * + * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return + * 'true' on success and 'false' on error. + * + * This function assumes that the transaction log has already been loaded and + * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl(). + */ +bool ntfs_stamp_usnjrnl(ntfs_volume *vol) +{ + ntfs_debug("Entering."); + if (likely(!NVolUsnJrnlStamped(vol))) { + sle64 stamp; + struct page *page; + USN_HEADER *uh; + + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from " + "$UsnJrnl/$DATA/$Max attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + stamp = get_current_ntfs_time(); + ntfs_debug("Stamping transaction log ($UsnJrnl): old " + "journal_id 0x%llx, old lowest_valid_usn " + "0x%llx, new journal_id 0x%llx, new " + "lowest_valid_usn 0x%llx.", + (long long)sle64_to_cpu(uh->journal_id), + (long long)sle64_to_cpu(uh->lowest_valid_usn), + (long long)sle64_to_cpu(stamp), + i_size_read(vol->usnjrnl_j_ino)); + uh->lowest_valid_usn = + cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino)); + uh->journal_id = stamp; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetUsnJrnlStamped(vol); + } + ntfs_debug("Done."); + return true; +} + +#endif /* NTFS_RW */ diff --git a/kernel/fs/ntfs/usnjrnl.h b/kernel/fs/ntfs/usnjrnl.h new file mode 100644 index 000000000..00d8e6bd7 --- /dev/null +++ b/kernel/fs/ntfs/usnjrnl.h @@ -0,0 +1,205 @@ +/* + * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_USNJRNL_H +#define _LINUX_NTFS_USNJRNL_H + +#ifdef NTFS_RW + +#include "types.h" +#include "endian.h" +#include "layout.h" +#include "volume.h" + +/* + * Transaction log ($UsnJrnl) organization: + * + * The transaction log records whenever a file is modified in any way. So for + * example it will record that file "blah" was written to at a particular time + * but not what was written. If will record that a file was deleted or + * created, that a file was truncated, etc. See below for all the reason + * codes used. + * + * The transaction log is in the $Extend directory which is in the root + * directory of each volume. If it is not present it means transaction + * logging is disabled. If it is present it means transaction logging is + * either enabled or in the process of being disabled in which case we can + * ignore it as it will go away as soon as Windows gets its hands on it. + * + * To determine whether the transaction logging is enabled or in the process + * of being disabled, need to check the volume flags in the + * $VOLUME_INFORMATION attribute in the $Volume system file (which is present + * in the root directory and has a fixed mft record number, see layout.h). + * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log + * is in the process of being disabled and if this flag is clear it means the + * transaction log is enabled. + * + * The transaction log consists of two parts; the $DATA/$Max attribute as well + * as the $DATA/$J attribute. $Max is a header describing the transaction + * log whilst $J is the transaction log data itself as a sequence of variable + * sized USN_RECORDs (see below for all the structures). + * + * We do not care about transaction logging at this point in time but we still + * need to let windows know that the transaction log is out of date. To do + * this we need to stamp the transaction log. This involves setting the + * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used + * for the next added USN_RECORD to the $DATA/$J attribute as well as + * generating a new journal_id in $DATA/$Max. + * + * The journal_id is as of the current version (2.0) of the transaction log + * simply the 64-bit timestamp of when the journal was either created or last + * stamped. + * + * To determine the next usn there are two ways. The first is to parse + * $DATA/$J and to find the last USN_RECORD in it and to add its record_length + * to its usn (which is the byte offset in the $DATA/$J attribute). The + * second is simply to take the data size of the attribute. Since the usns + * are simply byte offsets into $DATA/$J, this is exactly the next usn. For + * obvious reasons we use the second method as it is much simpler and faster. + * + * As an aside, note that to actually disable the transaction log, one would + * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go + * through all the mft records on the volume and set the usn field in their + * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need + * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally, + * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag. + * + * Note that if a volume is unmounted whilst the transaction log is being + * disabled, the process will continue the next time the volume is mounted. + * This is why we can safely mount read-write when we see a transaction log + * in the process of being deleted. + */ + +/* Some $UsnJrnl related constants. */ +#define UsnJrnlMajorVer 2 +#define UsnJrnlMinorVer 0 + +/* + * $DATA/$Max attribute. This is (always?) resident and has a fixed size of + * 32 bytes. It contains the header describing the transaction log. + */ +typedef struct { +/*Ofs*/ +/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J + attribute. */ +/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the + size of the $DATA/$J attribute. */ +/*0x10*/sle64 journal_id; /* Current id of the transaction log. */ +/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the + current journal_id. */ +/* sizeof() = 32 (0x20) bytes */ +} __attribute__ ((__packed__)) USN_HEADER; + +/* + * Reason flags (32-bit). Cumulative flags describing the change(s) to the + * file since it was last opened. I think the names speak for themselves but + * if you disagree check out the descriptions in the Linux NTFS project NTFS + * documentation: http://www.linux-ntfs.org/ + */ +enum { + USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), + USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), + USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), + USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), + USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), + USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), + USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), + USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), + USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), + USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), + USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), + USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), + USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), + USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), + USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), + USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), + USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), + USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), + USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), + USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), + USN_REASON_CLOSE = cpu_to_le32(0x80000000), +}; + +typedef le32 USN_REASON_FLAGS; + +/* + * Source info flags (32-bit). Information about the source of the change(s) + * to the file. For detailed descriptions of what these mean, see the Linux + * NTFS project NTFS documentation: + * http://www.linux-ntfs.org/ + */ +enum { + USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), + USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), + USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), +}; + +typedef le32 USN_SOURCE_INFO_FLAGS; + +/* + * $DATA/$J attribute. This is always non-resident, is marked as sparse, and + * is of variabled size. It consists of a sequence of variable size + * USN_RECORDS. The minimum allocated_size is allocation_delta as + * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is + * exceeded by more than allocation_delta bytes, allocation_delta bytes are + * allocated and appended to the $DATA/$J attribute and an equal number of + * bytes at the beginning of the attribute are freed and made sparse. Note the + * making sparse only happens at volume checkpoints and hence the actual + * $DATA/$J size can exceed maximum_size + allocation_delta temporarily. + */ +typedef struct { +/*Ofs*/ +/* 0*/le32 length; /* Byte size of this record (8-byte + aligned). */ +/* 4*/le16 major_ver; /* Major version of the transaction log used + for this record. */ +/* 6*/le16 minor_ver; /* Minor version of the transaction log used + for this record. */ +/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or + directory) described by this record. */ +/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent + directory of the file described by this + record. */ +/*0x18*/leUSN usn; /* The usn of this record. Equals the offset + within the $DATA/$J attribute. */ +/*0x20*/sle64 time; /* Time when this record was created. */ +/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */ +/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */ +/*0x30*/le32 security_id; /* File security_id copied from + $STANDARD_INFORMATION. */ +/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from + $STANDARD_INFORMATION or $FILE_NAME (not + sure which). */ +/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */ +/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the + start of this record. */ +/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use + file_name_offset to determine the location + of the name. */ +/* sizeof() = 60 (0x3c) bytes */ +} __attribute__ ((__packed__)) USN_RECORD; + +extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_USNJRNL_H */ diff --git a/kernel/fs/ntfs/volume.h b/kernel/fs/ntfs/volume.h new file mode 100644 index 000000000..4f579b02b --- /dev/null +++ b/kernel/fs/ntfs/volume.h @@ -0,0 +1,178 @@ +/* + * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part + * of the Linux-NTFS project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_VOLUME_H +#define _LINUX_NTFS_VOLUME_H + +#include +#include + +#include "types.h" +#include "layout.h" + +/* + * The NTFS in memory super block structure. + */ +typedef struct { + /* + * FIXME: Reorder to have commonly used together element within the + * same cache line, aiming at a cache line size of 32 bytes. Aim for + * 64 bytes for less commonly used together elements. Put most commonly + * used elements to front of structure. Obviously do this only when the + * structure has stabilized... (AIA) + */ + /* Device specifics. */ + struct super_block *sb; /* Pointer back to the super_block. */ + LCN nr_blocks; /* Number of sb->s_blocksize bytes + sized blocks on the device. */ + /* Configuration provided by user at mount time. */ + unsigned long flags; /* Miscellaneous flags, see below. */ + kuid_t uid; /* uid that files will be mounted as. */ + kgid_t gid; /* gid that files will be mounted as. */ + umode_t fmask; /* The mask for file permissions. */ + umode_t dmask; /* The mask for directory + permissions. */ + u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ + u8 on_errors; /* What to do on filesystem errors. */ + /* NTFS bootsector provided information. */ + u16 sector_size; /* in bytes */ + u8 sector_size_bits; /* log2(sector_size) */ + u32 cluster_size; /* in bytes */ + u32 cluster_size_mask; /* cluster_size - 1 */ + u8 cluster_size_bits; /* log2(cluster_size) */ + u32 mft_record_size; /* in bytes */ + u32 mft_record_size_mask; /* mft_record_size - 1 */ + u8 mft_record_size_bits; /* log2(mft_record_size) */ + u32 index_record_size; /* in bytes */ + u32 index_record_size_mask; /* index_record_size - 1 */ + u8 index_record_size_bits; /* log2(index_record_size) */ + LCN nr_clusters; /* Volume size in clusters == number of + bits in lcn bitmap. */ + LCN mft_lcn; /* Cluster location of mft data. */ + LCN mftmirr_lcn; /* Cluster location of copy of mft. */ + u64 serial_no; /* The volume serial number. */ + /* Mount specific NTFS information. */ + u32 upcase_len; /* Number of entries in upcase[]. */ + ntfschar *upcase; /* The upcase table. */ + + s32 attrdef_size; /* Size of the attribute definition + table in bytes. */ + ATTR_DEF *attrdef; /* Table of attribute definitions. + Obtained from FILE_AttrDef. */ + +#ifdef NTFS_RW + /* Variables used by the cluster and mft allocators. */ + s64 mft_data_pos; /* Mft record number at which to + allocate the next mft record. */ + LCN mft_zone_start; /* First cluster of the mft zone. */ + LCN mft_zone_end; /* First cluster beyond the mft zone. */ + LCN mft_zone_pos; /* Current position in the mft zone. */ + LCN data1_zone_pos; /* Current position in the first data + zone. */ + LCN data2_zone_pos; /* Current position in the second data + zone. */ +#endif /* NTFS_RW */ + + struct inode *mft_ino; /* The VFS inode of $MFT. */ + + struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ + struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the + mft record bitmap ($MFT/$BITMAP). */ +#ifdef NTFS_RW + struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ + int mftmirr_size; /* Size of mft mirror in mft records. */ + + struct inode *logfile_ino; /* The VFS inode of $LogFile. */ +#endif /* NTFS_RW */ + + struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ + struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the + cluster bitmap ($Bitmap/$DATA). */ + + struct inode *vol_ino; /* The VFS inode of $Volume. */ + VOLUME_FLAGS vol_flags; /* Volume flags. */ + u8 major_ver; /* Ntfs major version of volume. */ + u8 minor_ver; /* Ntfs minor version of volume. */ + + struct inode *root_ino; /* The VFS inode of the root + directory. */ + struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ + only, otherwise NULL). */ + struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ + only, otherwise NULL). */ +#ifdef NTFS_RW + /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *quota_ino; /* The VFS inode of $Quota. */ + struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ + /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ + struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ + struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ +#endif /* NTFS_RW */ + struct nls_table *nls_map; +} ntfs_volume; + +/* + * Defined bits for the flags field in the ntfs_volume structure. + */ +typedef enum { + NV_Errors, /* 1: Volume has errors, prevent remount rw. */ + NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ + NV_CaseSensitive, /* 1: Treat file names as case sensitive and + create filenames in the POSIX namespace. + Otherwise be case insensitive but still + create file names in POSIX namespace. */ + NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ + NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ + NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ + NV_SparseEnabled, /* 1: May create sparse files. */ +} ntfs_volume_flags; + +/* + * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() + * functions. + */ +#define DEFINE_NVOL_BIT_OPS(flag) \ +static inline int NVol##flag(ntfs_volume *vol) \ +{ \ + return test_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolSet##flag(ntfs_volume *vol) \ +{ \ + set_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolClear##flag(ntfs_volume *vol) \ +{ \ + clear_bit(NV_##flag, &(vol)->flags); \ +} + +/* Emit the ntfs volume bitops functions. */ +DEFINE_NVOL_BIT_OPS(Errors) +DEFINE_NVOL_BIT_OPS(ShowSystemFiles) +DEFINE_NVOL_BIT_OPS(CaseSensitive) +DEFINE_NVOL_BIT_OPS(LogFileEmpty) +DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) +DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) +DEFINE_NVOL_BIT_OPS(SparseEnabled) + +#endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/kernel/fs/ocfs2/Kconfig b/kernel/fs/ocfs2/Kconfig new file mode 100644 index 000000000..77a8de5f7 --- /dev/null +++ b/kernel/fs/ocfs2/Kconfig @@ -0,0 +1,76 @@ +config OCFS2_FS + tristate "OCFS2 file system support" + depends on NET && SYSFS && CONFIGFS_FS + select JBD2 + select CRC32 + select QUOTA + select QUOTA_TREE + select FS_POSIX_ACL + help + OCFS2 is a general purpose extent based shared disk cluster file + system with many similarities to ext3. It supports 64 bit inode + numbers, and has automatically extending metadata groups which may + also make it attractive for non-clustered use. + + You'll want to install the ocfs2-tools package in order to at least + get "mount.ocfs2". + + Project web page: http://oss.oracle.com/projects/ocfs2 + Tools web page: http://oss.oracle.com/projects/ocfs2-tools + OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + + For more information on OCFS2, see the file + . + +config OCFS2_FS_O2CB + tristate "O2CB Kernelspace Clustering" + depends on OCFS2_FS + default y + help + OCFS2 includes a simple kernelspace clustering package, the OCFS2 + Cluster Base. It only requires a very small userspace component + to configure it. This comes with the standard ocfs2-tools package. + O2CB is limited to maintaining a cluster for OCFS2 file systems. + It cannot manage any other cluster applications. + + It is always safe to say Y here, as the clustering method is + run-time selectable. + +config OCFS2_FS_USERSPACE_CLUSTER + tristate "OCFS2 Userspace Clustering" + depends on OCFS2_FS && DLM + default y + help + This option will allow OCFS2 to use userspace clustering services + in conjunction with the DLM in fs/dlm. If you are using a + userspace cluster manager, say Y here. + + It is safe to say Y, as the clustering method is run-time + selectable. + +config OCFS2_FS_STATS + bool "OCFS2 statistics" + depends on OCFS2_FS && DEBUG_FS + default y + help + This option allows some fs statistics to be captured. Enabling + this option may increase the memory consumption. + +config OCFS2_DEBUG_MASKLOG + bool "OCFS2 logging support" + depends on OCFS2_FS + default y + help + The ocfs2 filesystem has an extensive logging system. The system + allows selection of events to log via files in /sys/o2cb/logmask/. + This option will enlarge your kernel, but it allows debugging of + ocfs2 filesystem issues. + +config OCFS2_DEBUG_FS + bool "OCFS2 expensive checks" + depends on OCFS2_FS + default n + help + This option will enable expensive consistency checks. Enable + this option for debugging only as it is likely to decrease + performance of the filesystem. diff --git a/kernel/fs/ocfs2/Makefile b/kernel/fs/ocfs2/Makefile new file mode 100644 index 000000000..ce210d495 --- /dev/null +++ b/kernel/fs/ocfs2/Makefile @@ -0,0 +1,53 @@ +ccflags-y := -Ifs/ocfs2 + +ccflags-y += -DCATCH_BH_JBD_RACES + +obj-$(CONFIG_OCFS2_FS) += \ + ocfs2.o \ + ocfs2_stackglue.o + +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o +obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o + +ocfs2-objs := \ + alloc.o \ + aops.o \ + blockcheck.o \ + buffer_head_io.o \ + dcache.o \ + dir.o \ + dlmglue.o \ + export.o \ + extent_map.o \ + file.o \ + heartbeat.o \ + inode.o \ + ioctl.o \ + journal.o \ + localalloc.o \ + locks.o \ + mmap.o \ + namei.o \ + refcounttree.o \ + reservations.o \ + move_extents.o \ + resize.o \ + slot_map.o \ + suballoc.o \ + super.o \ + symlink.o \ + sysfile.o \ + uptodate.o \ + quota_local.o \ + quota_global.o \ + xattr.o \ + acl.o + +ocfs2_stackglue-objs := stackglue.o +ocfs2_stack_o2cb-objs := stack_o2cb.o +ocfs2_stack_user-objs := stack_user.o + +obj-$(CONFIG_OCFS2_FS) += dlmfs/ +# cluster/ is always needed when OCFS2_FS for masklog support +obj-$(CONFIG_OCFS2_FS) += cluster/ +obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ diff --git a/kernel/fs/ocfs2/acl.c b/kernel/fs/ocfs2/acl.c new file mode 100644 index 000000000..c58a1bcfd --- /dev/null +++ b/kernel/fs/ocfs2/acl.c @@ -0,0 +1,310 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.c + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is copy from linux/fs/ext3/acl.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" + +#include "xattr.h" +#include "acl.h" + +/* + * Convert from xattr value to acl struct. + */ +static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) +{ + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(struct posix_acl_entry)) + return ERR_PTR(-EINVAL); + + count = size / sizeof(struct posix_acl_entry); + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + struct ocfs2_acl_entry *entry = + (struct ocfs2_acl_entry *)value; + + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + default: + break; + } + value += sizeof(struct posix_acl_entry); + + } + return acl; +} + +/* + * Convert acl struct to xattr value. + */ +static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) +{ + struct ocfs2_acl_entry *entry = NULL; + char *ocfs2_acl; + size_t n; + + *size = acl->a_count * sizeof(struct posix_acl_entry); + + ocfs2_acl = kmalloc(*size, GFP_NOFS); + if (!ocfs2_acl) + return ERR_PTR(-ENOMEM); + + entry = (struct ocfs2_acl_entry *)ocfs2_acl; + for (n = 0; n < acl->a_count; n++, entry++) { + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, + acl->a_entries[n].e_uid)); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, + acl->a_entries[n].e_gid)); + break; + default: + entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return ocfs2_acl; +} + +static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, + int type, + struct buffer_head *di_bh) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + "", value, retval); + } + + if (retval > 0) + acl = ocfs2_acl_from_xattr(value, retval); + else if (retval == -ENODATA || retval == 0) + acl = NULL; + else + acl = ERR_PTR(retval); + + kfree(value); + + return acl; +} + +/* + * Helper function to set i_mode in memory and disk. Some call paths + * will not have di_bh or a journal handle to pass, in which case it + * will create it's own. + */ +static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, umode_t new_mode) +{ + int ret, commit_handle = 0; + struct ocfs2_dinode *di; + + if (di_bh == NULL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } else + get_bh(di_bh); + + if (handle == NULL) { + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_brelse; + } + + commit_handle = 1; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_mode = new_mode; + inode->i_ctime = CURRENT_TIME; + di->i_mode = cpu_to_le16(inode->i_mode); + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + if (commit_handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out_brelse: + brelse(di_bh); +out: + return ret; +} + +/* + * Set the access or default ACL of an inode. + */ +int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + umode_t mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + + if (ret == 0) + acl = NULL; + + ret = ocfs2_acl_set_mode(inode, di_bh, + handle, mode); + if (ret) + return ret; + } + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = ocfs2_acl_to_xattr(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + if (handle) + ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index, + "", value, size, 0, + meta_ac, data_ac); + else + ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0); + + kfree(value); + + return ret; +} + +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); +} + +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) +{ + struct ocfs2_super *osb; + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret = -EAGAIN; + + osb = OCFS2_SB(inode->i_sb); + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return NULL; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret < 0) + return ERR_PTR(ret); + + acl = ocfs2_get_acl_nolock(inode, type, di_bh); + + brelse(di_bh); + + return acl; +} diff --git a/kernel/fs/ocfs2/acl.h b/kernel/fs/ocfs2/acl.h new file mode 100644 index 000000000..3fce68d08 --- /dev/null +++ b/kernel/fs/ocfs2/acl.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.h + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_ACL_H +#define OCFS2_ACL_H + +#include + +struct ocfs2_acl_entry { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +}; + +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac); + +#endif /* OCFS2_ACL_H */ diff --git a/kernel/fs/ocfs2/alloc.c b/kernel/fs/ocfs2/alloc.c new file mode 100644 index 000000000..2d7f76e52 --- /dev/null +++ b/kernel/fs/ocfs2/alloc.c @@ -0,0 +1,7405 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.c + * + * Extent allocs and frees + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "sysfile.h" +#include "file.h" +#include "super.h" +#include "uptodate.h" +#include "xattr.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); +/* + * Operations for a specific extent tree type. + * + * To implement an on-disk btree (extent tree) type in ocfs2, add + * an ocfs2_extent_tree_operations structure and the matching + * ocfs2_init__extent_tree() function. That's pretty much it + * for the allocation portion of the extent tree. + */ +struct ocfs2_extent_tree_operations { + /* + * last_eb_blk is the block number of the right most leaf extent + * block. Most on-disk structures containing an extent tree store + * this value for fast access. The ->eo_set_last_eb_blk() and + * ->eo_get_last_eb_blk() operations access this value. They are + * both required. + */ + void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, + u64 blkno); + u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); + + /* + * The on-disk structure usually keeps track of how many total + * clusters are stored in this extent tree. This function updates + * that value. new_clusters is the delta, and must be + * added to the total. Required. + */ + void (*eo_update_clusters)(struct ocfs2_extent_tree *et, + u32 new_clusters); + + /* + * If this extent tree is supported by an extent map, insert + * a record into the map. + */ + void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + + /* + * If this extent tree is supported by an extent map, truncate the + * map to clusters, + */ + void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et, + u32 clusters); + + /* + * If ->eo_insert_check() exists, it is called before rec is + * inserted into the extent tree. It is optional. + */ + int (*eo_insert_check)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + int (*eo_sanity_check)(struct ocfs2_extent_tree *et); + + /* + * -------------------------------------------------------------- + * The remaining are internal to ocfs2_extent_tree and don't have + * accessor functions + */ + + /* + * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. + * It is required. + */ + void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); + + /* + * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if + * it exists. If it does not, et->et_max_leaf_clusters is set + * to 0 (unlimited). Optional. + */ + void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et); + + /* + * ->eo_extent_contig test whether the 2 ocfs2_extent_rec + * are contiguous or not. Optional. Don't need to set it if use + * ocfs2_extent_rec as the tree leaf. + */ + enum ocfs2_contig_type + (*eo_extent_contig)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); +}; + + +/* + * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check + * in the methods. + */ +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno); +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters); +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); +static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, + .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_extent_map_insert = ocfs2_dinode_extent_map_insert, + .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate, + .eo_insert_check = ocfs2_dinode_insert_check, + .eo_sanity_check = ocfs2_dinode_sanity_check, + .eo_fill_root_el = ocfs2_dinode_fill_root_el, +}; + +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + di->i_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + return le64_to_cpu(di->i_last_eb_blk); +} + +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_dinode *di = et->et_object; + + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&oi->ip_lock); + oi->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_insert_rec(inode, rec); +} + +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_trunc(inode, clusters); +} + +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb); + + BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL); + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (oi->ip_clusters != le32_to_cpu(rec->e_cpos)), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)oi->ip_blkno, + rec->e_cpos, oi->ip_clusters); + + return 0; +} + +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + return 0; +} + +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + et->et_root_el = &di->id2.i_list; +} + + +static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + et->et_root_el = &vb->vb_xv->xr_list; +} + +static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); +} + +static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_value_update_clusters, + .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, +}; + +static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + et->et_root_el = &xb->xb_attrs.xb_root.xt_list; +} + +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et) +{ + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + et->et_max_leaf_clusters = + ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE); +} + +static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + xt->xt_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + return le64_to_cpu(xt->xt_last_eb_blk); +} + +static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_tree_update_clusters, + .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, + .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, +}; + +static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + dx_root->dr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + return le64_to_cpu(dx_root->dr_last_eb_blk); +} + +static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + le32_add_cpu(&dx_root->dr_clusters, clusters); +} + +static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root)); + + return 0; +} + +static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + et->et_root_el = &dx_root->dr_list; +} + +static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { + .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, + .eo_update_clusters = ocfs2_dx_root_update_clusters, + .eo_sanity_check = ocfs2_dx_root_sanity_check, + .eo_fill_root_el = ocfs2_dx_root_fill_root_el, +}; + +static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + et->et_root_el = &rb->rf_list; +} + +static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + rb->rf_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + return le64_to_cpu(rb->rf_last_eb_blk); +} + +static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + le32_add_cpu(&rb->rf_clusters, clusters); +} + +static enum ocfs2_contig_type +ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + return CONTIG_NONE; +} + +static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_refcount_tree_update_clusters, + .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el, + .eo_extent_contig = ocfs2_refcount_tree_extent_contig, +}; + +static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, + ocfs2_journal_access_func access, + void *obj, + struct ocfs2_extent_tree_operations *ops) +{ + et->et_ops = ops; + et->et_root_bh = bh; + et->et_ci = ci; + et->et_root_journal_access = access; + if (!obj) + obj = (void *)bh->b_data; + et->et_object = obj; + + et->et_ops->eo_fill_root_el(et); + if (!et->et_ops->eo_fill_max_leaf_clusters) + et->et_max_leaf_clusters = 0; + else + et->et_ops->eo_fill_max_leaf_clusters(et); +} + +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di, + NULL, &ocfs2_dinode_et_ops); +} + +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb, + NULL, &ocfs2_xattr_tree_et_ops); +} + +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct ocfs2_xattr_value_buf *vb) +{ + __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb, + &ocfs2_xattr_value_et_ops); +} + +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr, + NULL, &ocfs2_dx_root_et_ops); +} + +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb, + NULL, &ocfs2_refcount_tree_et_ops); +} + +static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 new_last_eb_blk) +{ + et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); +} + +static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + return et->et_ops->eo_get_last_eb_blk(et); +} + +static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + et->et_ops->eo_update_clusters(et, clusters); +} + +static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + if (et->et_ops->eo_extent_map_insert) + et->et_ops->eo_extent_map_insert(et, rec); +} + +static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + if (et->et_ops->eo_extent_map_truncate) + et->et_ops->eo_extent_map_truncate(et, clusters); +} + +static inline int ocfs2_et_root_journal_access(handle_t *handle, + struct ocfs2_extent_tree *et, + int type) +{ + return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh, + type); +} + +static inline enum ocfs2_contig_type + ocfs2_et_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *insert_rec) +{ + if (et->et_ops->eo_extent_contig) + return et->et_ops->eo_extent_contig(et, rec, insert_rec); + + return ocfs2_extent_rec_contig( + ocfs2_metadata_cache_get_super(et->et_ci), + rec, insert_rec); +} + +static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + int ret = 0; + + if (et->et_ops->eo_insert_check) + ret = et->et_ops->eo_insert_check(et, rec); + return ret; +} + +static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) +{ + int ret = 0; + + if (et->et_ops->eo_sanity_check) + ret = et->et_ops->eo_sanity_check(et); + return ret; +} + +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb); +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec); +/* + * Reset the actual path elements so that we can re-use the structure + * to build another path. Generally, this involves freeing the buffer + * heads. + */ +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +{ + int i, start = 0, depth = 0; + struct ocfs2_path_item *node; + + if (keep_root) + start = 1; + + for(i = start; i < path_num_items(path); i++) { + node = &path->p_node[i]; + + brelse(node->bh); + node->bh = NULL; + node->el = NULL; + } + + /* + * Tree depth may change during truncate, or insert. If we're + * keeping the root extent list, then make sure that our path + * structure reflects the proper depth. + */ + if (keep_root) + depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + else + path_root_access(path) = NULL; + + path->p_tree_depth = depth; +} + +void ocfs2_free_path(struct ocfs2_path *path) +{ + if (path) { + ocfs2_reinit_path(path, 0); + kfree(path); + } +} + +/* + * All the elements of src into dest. After this call, src could be freed + * without affecting dest. + * + * Both paths should have the same root. Any non-root elements of dest + * will be freed. + */ +static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_el(dest) != path_root_el(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); + + ocfs2_reinit_path(dest, 1); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + if (dest->p_node[i].bh) + get_bh(dest->p_node[i].bh); + } +} + +/* + * Make the *dest path the same as src and re-initialize src path to + * have a root only. + */ +static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + brelse(dest->p_node[i].bh); + + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + src->p_node[i].bh = NULL; + src->p_node[i].el = NULL; + } +} + +/* + * Insert an extent block at given index. + * + * This will not take an additional reference on eb_bh. + */ +static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, + struct buffer_head *eb_bh) +{ + struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Right now, no root bh is an extent block, so this helps + * catch code errors with dinode trees. The assertion can be + * safely removed if we ever need to insert extent block + * structures at the root. + */ + BUG_ON(index == 0); + + path->p_node[index].bh = eb_bh; + path->p_node[index].el = &eb->h_list; +} + +static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, + struct ocfs2_extent_list *root_el, + ocfs2_journal_access_func access) +{ + struct ocfs2_path *path; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); + + path = kzalloc(sizeof(*path), GFP_NOFS); + if (path) { + path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); + get_bh(root_bh); + path_root_bh(path) = root_bh; + path_root_el(path) = root_el; + path_root_access(path) = access; + } + + return path; +} + +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +{ + return ocfs2_new_path(path_root_bh(path), path_root_el(path), + path_root_access(path)); +} + +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +{ + return ocfs2_new_path(et->et_root_bh, et->et_root_el, + et->et_root_journal_access); +} + +/* + * Journal the buffer at depth idx. All idx>0 are extent_blocks, + * otherwise it's the root_access function. + * + * I don't like the way this function's name looks next to + * ocfs2_journal_access_path(), but I don't have a better one. + */ +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx) +{ + ocfs2_journal_access_func access = path_root_access(path); + + if (!access) + access = ocfs2_journal_access; + + if (idx) + access = ocfs2_journal_access_eb; + + return access(handle, ci, path->p_node[idx].bh, + OCFS2_JOURNAL_ACCESS_WRITE); +} + +/* + * Convenience function to journal all components in a path. + */ +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path) +{ + int i, ret = 0; + + if (!path) + goto out; + + for(i = 0; i < path_num_items(path); i++) { + ret = ocfs2_path_bh_journal_access(handle, ci, path, i); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + +out: + return ret; +} + +/* + * Return the index of the extent record which contains cluster #v_cluster. + * -1 is returned if it was not found. + * + * Should work fine on interior and exterior nodes. + */ +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) +{ + int ret = -1; + int i; + struct ocfs2_extent_rec *rec; + u32 rec_end, rec_start, clusters; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + rec_start = le32_to_cpu(rec->e_cpos); + clusters = ocfs2_rec_clusters(el, rec); + + rec_end = rec_start + clusters; + + if (v_cluster >= rec_start && v_cluster < rec_end) { + ret = i; + break; + } + } + + return ret; +} + +/* + * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and + * ocfs2_extent_rec_contig only work properly against leaf nodes! + */ +static int ocfs2_block_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + u64 blk_end = le64_to_cpu(ext->e_blkno); + + blk_end += ocfs2_clusters_to_blocks(sb, + le16_to_cpu(ext->e_leaf_clusters)); + + return blkno == blk_end; +} + +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right) +{ + u32 left_range; + + left_range = le32_to_cpu(left->e_cpos) + + le16_to_cpu(left->e_leaf_clusters); + + return (left_range == le32_to_cpu(right->e_cpos)); +} + +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + u64 blkno = le64_to_cpu(insert_rec->e_blkno); + + /* + * Refuse to coalesce extent records with different flag + * fields - we don't want to mix unwritten extents with user + * data. + */ + if (ext->e_flags != insert_rec->e_flags) + return CONTIG_NONE; + + if (ocfs2_extents_adjacent(ext, insert_rec) && + ocfs2_block_extent_contig(sb, ext, blkno)) + return CONTIG_RIGHT; + + blkno = le64_to_cpu(ext->e_blkno); + if (ocfs2_extents_adjacent(insert_rec, ext) && + ocfs2_block_extent_contig(sb, insert_rec, blkno)) + return CONTIG_LEFT; + + return CONTIG_NONE; +} + +/* + * NOTE: We can have pretty much any combination of contiguousness and + * appending. + * + * The usefulness of APPEND_TAIL is more in that it lets us know that + * we'll have to update the path to that leaf. + */ +enum ocfs2_append_type { + APPEND_NONE = 0, + APPEND_TAIL, +}; + +enum ocfs2_split_type { + SPLIT_NONE = 0, + SPLIT_LEFT, + SPLIT_RIGHT, +}; + +struct ocfs2_insert_type { + enum ocfs2_split_type ins_split; + enum ocfs2_append_type ins_appending; + enum ocfs2_contig_type ins_contig; + int ins_contig_index; + int ins_tree_depth; +}; + +struct ocfs2_merge_ctxt { + enum ocfs2_contig_type c_contig_type; + int c_has_empty_extent; + int c_split_covers_rec; +}; + +static int ocfs2_validate_extent_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_extent_block *eb = + (struct ocfs2_extent_block *)bh->b_data; + + trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for extent block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + /* + * Errors after here are fatal. + */ + + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + return -EINVAL; + } + + if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extent block #%llu has an invalid " + "h_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + return -EINVAL; + } + + return 0; +} + +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, eb_blkno, &tmp, + ocfs2_validate_extent_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + + +/* + * How many free extents have we got before we need more meta data? + */ +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et) +{ + int retval; + struct ocfs2_extent_list *el = NULL; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL; + u64 last_eb_blk = 0; + + el = et->et_root_el; + last_eb_blk = ocfs2_et_get_last_eb_blk(et); + + if (last_eb_blk) { + retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk, + &eb_bh); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } + + BUG_ON(el->l_tree_depth != 0); + + retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); +bail: + brelse(eb_bh); + + trace_ocfs2_num_free_extents(retval); + return retval; +} + +/* expects array to already be allocated + * + * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and + * l_count for you + */ +static int ocfs2_create_new_meta_bhs(handle_t *handle, + struct ocfs2_extent_tree *et, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]) +{ + int count, status, i; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + struct ocfs2_extent_block *eb; + + count = 0; + while (count < wanted) { + status = ocfs2_claim_metadata(handle, + meta_ac, + wanted - count, + &suballoc_loc, + &suballoc_bit_start, + &num_got, + &first_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = count; i < (num_got + count); i++) { + bhs[i] = sb_getblk(osb->sb, first_blkno); + if (bhs[i] == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]); + + status = ocfs2_journal_access_eb(handle, et->et_ci, + bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) bhs[i]->b_data; + /* Ok, setup the minimal stuff here. */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(first_blkno); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + eb->h_suballoc_slot = + cpu_to_le16(meta_ac->ac_alloc_slot); + eb->h_suballoc_loc = cpu_to_le64(suballoc_loc); + eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + suballoc_bit_start++; + first_blkno++; + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. */ + ocfs2_journal_dirty(handle, bhs[i]); + } + + count += num_got; + } + + status = 0; +bail: + if (status < 0) { + for(i = 0; i < wanted; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + mlog_errno(status); + } + return status; +} + +/* + * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). + * + * Returns the sum of the rightmost extent rec logical offset and + * cluster count. + * + * ocfs2_add_branch() uses this to determine what logical cluster + * value should be populated into the leftmost new branch records. + * + * ocfs2_shift_tree_depth() uses this to determine the # clusters + * value for the new topmost tree record. + */ +static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) +{ + int i; + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + return le32_to_cpu(el->l_recs[i].e_cpos) + + ocfs2_rec_clusters(el, &el->l_recs[i]); +} + +/* + * Change range of the branches in the right most path according to the leaf + * extent block's rightmost record. + */ +static int ocfs2_adjust_rightmost_branch(handle_t *handle, + struct ocfs2_extent_tree *et) +{ + int status; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + path = ocfs2_new_path_from_et(et); + if (!path) { + status = -ENOMEM; + return status; + } + + status = ocfs2_find_path(et->et_ci, path, UINT_MAX); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_extend_trans(handle, path_num_items(path)); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_path(et->et_ci, handle, path); + if (status < 0) { + mlog_errno(status); + goto out; + } + + el = path_leaf_el(path); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1]; + + ocfs2_adjust_rightmost_records(handle, et, path, rec); + +out: + ocfs2_free_path(path); + return status; +} + +/* + * Add an entire tree branch to our inode. eb_bh is the extent block + * to start at, if we don't want to start the branch at the root + * structure. + * + * last_eb_bh is required as we have to update it's next_leaf pointer + * for the new last extent block. + * + * the new branch will be 'empty' in the sense that every block will + * contain a single record with cluster count == 0. + */ +static int ocfs2_add_branch(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head *eb_bh, + struct buffer_head **last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int status, new_blocks, i; + u64 next_blkno, new_last_eb_blk; + struct buffer_head *bh; + struct buffer_head **new_eb_bhs = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *eb_el; + struct ocfs2_extent_list *el; + u32 new_cpos, root_end; + + BUG_ON(!last_eb_bh || !*last_eb_bh); + + if (eb_bh) { + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = et->et_root_el; + + /* we never add a branch to a leaf. */ + BUG_ON(!el->l_tree_depth); + + new_blocks = le16_to_cpu(el->l_tree_depth); + + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + root_end = ocfs2_sum_rightmost_rec(et->et_root_el); + + /* + * If there is a gap before the root end and the real end + * of the righmost leaf block, we need to remove the gap + * between new_cpos and root_end first so that the tree + * is consistent after we add a new branch(it will start + * from new_cpos). + */ + if (root_end > new_cpos) { + trace_ocfs2_adjust_rightmost_branch( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + root_end, new_cpos); + + status = ocfs2_adjust_rightmost_branch(handle, et); + if (status) { + mlog_errno(status); + goto bail; + } + } + + /* allocate the number of new eb blocks we need */ + new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!new_eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, + meta_ac, new_eb_bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be + * linked with the rest of the tree. + * conversly, new_eb_bhs[0] is the new bottommost leaf. + * + * when we leave the loop, new_last_eb_blk will point to the + * newest leaf, and next_blkno will point to the topmost extent + * block. */ + next_blkno = new_last_eb_blk = 0; + for(i = 0; i < new_blocks; i++) { + bh = new_eb_bhs[i]; + eb = (struct ocfs2_extent_block *) bh->b_data; + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); + eb_el = &eb->h_list; + + status = ocfs2_journal_access_eb(handle, et->et_ci, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb->h_next_leaf_blk = 0; + eb_el->l_tree_depth = cpu_to_le16(i); + eb_el->l_next_free_rec = cpu_to_le16(1); + /* + * This actually counts as an empty extent as + * c_clusters == 0 + */ + eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); + eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); + /* + * eb_el isn't always an interior node, but even leaf + * nodes want a zero'd flags and reserved field so + * this gets the whole 32 bits regardless of use. + */ + eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); + if (!eb_el->l_tree_depth) + new_last_eb_blk = le64_to_cpu(eb->h_blkno); + + ocfs2_journal_dirty(handle, bh); + next_blkno = le64_to_cpu(eb->h_blkno); + } + + /* This is a bit hairy. We want to update up to three blocks + * here without leaving any of them in an inconsistent state + * in case of error. We don't have to worry about + * journal_dirty erroring as it won't unless we've aborted the + * handle (in which case we would never be here) so reserving + * the write with journal_access is all we need to do. */ + status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (eb_bh) { + status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Link the new branch into the rest of the tree (el will + * either be on the root_bh, or the extent block passed in. */ + i = le16_to_cpu(el->l_next_free_rec); + el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + el->l_recs[i].e_int_clusters = 0; + le16_add_cpu(&el->l_next_free_rec, 1); + + /* fe needs a new last extent block pointer, as does the + * next_leaf on the previously last-extent-block. */ + ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); + + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; + eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); + + ocfs2_journal_dirty(handle, *last_eb_bh); + ocfs2_journal_dirty(handle, et->et_root_bh); + if (eb_bh) + ocfs2_journal_dirty(handle, eb_bh); + + /* + * Some callers want to track the rightmost leaf so pass it + * back here. + */ + brelse(*last_eb_bh); + get_bh(new_eb_bhs[0]); + *last_eb_bh = new_eb_bhs[0]; + + status = 0; +bail: + if (new_eb_bhs) { + for (i = 0; i < new_blocks; i++) + brelse(new_eb_bhs[i]); + kfree(new_eb_bhs); + } + + return status; +} + +/* + * adds another level to the allocation tree. + * returns back the new extent block so you can add a branch to it + * after this call. + */ +static int ocfs2_shift_tree_depth(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh) +{ + int status, i; + u32 new_clusters; + struct buffer_head *new_eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *root_el; + struct ocfs2_extent_list *eb_el; + + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, + &new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); + + eb_el = &eb->h_list; + root_el = et->et_root_el; + + status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* copy the root extent list data into the new extent block */ + eb_el->l_tree_depth = root_el->l_tree_depth; + eb_el->l_next_free_rec = root_el->l_next_free_rec; + for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) + eb_el->l_recs[i] = root_el->l_recs[i]; + + ocfs2_journal_dirty(handle, new_eb_bh); + + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + new_clusters = ocfs2_sum_rightmost_rec(eb_el); + + /* update root_bh now */ + le16_add_cpu(&root_el->l_tree_depth, 1); + root_el->l_recs[0].e_cpos = 0; + root_el->l_recs[0].e_blkno = eb->h_blkno; + root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + root_el->l_next_free_rec = cpu_to_le16(1); + + /* If this is our 1st tree depth shift, then last_eb_blk + * becomes the allocated extent block */ + if (root_el->l_tree_depth == cpu_to_le16(1)) + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + + ocfs2_journal_dirty(handle, et->et_root_bh); + + *ret_new_eb_bh = new_eb_bh; + new_eb_bh = NULL; + status = 0; +bail: + brelse(new_eb_bh); + + return status; +} + +/* + * Should only be called when there is no space left in any of the + * leaf nodes. What we want to do is find the lowest tree depth + * non-leaf extent block with room for new records. There are three + * valid results of this search: + * + * 1) a lowest extent block is found, then we pass it back in + * *lowest_eb_bh and return '0' + * + * 2) the search fails to find anything, but the root_el has room. We + * pass NULL back in *lowest_eb_bh, but still return '0' + * + * 3) the search fails to find anything AND the root_el is full, in + * which case we return > 0 + * + * return status < 0 indicates an error. + */ +static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, + struct buffer_head **target_bh) +{ + int status = 0, i; + u64 blkno; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + struct buffer_head *lowest_bh = NULL; + + *target_bh = NULL; + + el = et->et_root_el; + + while(le16_to_cpu(el->l_tree_depth) > 1) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty " + "extent list (next_free_rec == 0)", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + status = -EIO; + goto bail; + } + i = le16_to_cpu(el->l_next_free_rec) - 1; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (!blkno) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent " + "list where extent # %d has no physical " + "block start", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); + status = -EIO; + goto bail; + } + + brelse(bh); + bh = NULL; + + status = ocfs2_read_extent_block(et->et_ci, blkno, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count)) { + brelse(lowest_bh); + lowest_bh = bh; + get_bh(lowest_bh); + } + } + + /* If we didn't find one and the fe doesn't have any room, + * then return '1' */ + el = et->et_root_el; + if (!lowest_bh && (el->l_next_free_rec == el->l_count)) + status = 1; + + *target_bh = lowest_bh; +bail: + brelse(bh); + + return status; +} + +/* + * Grow a b-tree so that it has more records. + * + * We might shift the tree depth in which case existing paths should + * be considered invalid. + * + * Tree depth after the grow is returned via *final_depth. + * + * *last_eb_bh will be updated by ocfs2_add_branch(). + */ +static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, + int *final_depth, struct buffer_head **last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret, shift; + struct ocfs2_extent_list *el = et->et_root_el; + int depth = le16_to_cpu(el->l_tree_depth); + struct buffer_head *bh = NULL; + + BUG_ON(meta_ac == NULL); + + shift = ocfs2_find_branch_target(et, &bh); + if (shift < 0) { + ret = shift; + mlog_errno(ret); + goto out; + } + + /* We traveled all the way to the bottom of the allocation tree + * and didn't find room for any more extents - we need to add + * another tree level */ + if (shift) { + BUG_ON(bh); + trace_ocfs2_grow_tree( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + depth); + + /* ocfs2_shift_tree_depth will return us a buffer with + * the new extent block (so we can pass that to + * ocfs2_add_branch). */ + ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + depth++; + if (depth == 1) { + /* + * Special case: we have room now if we shifted from + * tree_depth 0, so no more work needs to be done. + * + * We won't be calling add_branch, so pass + * back *last_eb_bh as the new leaf. At depth + * zero, it should always be null so there's + * no reason to brelse. + */ + BUG_ON(*last_eb_bh); + get_bh(bh); + *last_eb_bh = bh; + goto out; + } + } + + /* call ocfs2_add_branch to add the final part of the tree with + * the new data. */ + ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, + meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (final_depth) + *final_depth = depth; + brelse(bh); + return ret; +} + +/* + * This function will discard the rightmost extent record. + */ +static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + int count = le16_to_cpu(el->l_count); + unsigned int num_bytes; + + BUG_ON(!next_free); + /* This will cause us to go off the end of our extent list. */ + BUG_ON(next_free >= count); + + num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; + + memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); +} + +static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i, insert_index, next_free, has_empty, num_bytes; + u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); + + BUG_ON(!next_free); + + /* The tree code before us didn't allow enough room in the leaf. */ + BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); + + /* + * The easiest way to approach this is to just remove the + * empty extent and temporarily decrement next_free. + */ + if (has_empty) { + /* + * If next_free was 1 (only an empty extent), this + * loop won't execute, which is fine. We still want + * the decrement above to happen. + */ + for(i = 0; i < (next_free - 1); i++) + el->l_recs[i] = el->l_recs[i+1]; + + next_free--; + } + + /* + * Figure out what the new record index should be. + */ + for(i = 0; i < next_free; i++) { + rec = &el->l_recs[i]; + + if (insert_cpos < le32_to_cpu(rec->e_cpos)) + break; + } + insert_index = i; + + trace_ocfs2_rotate_leaf(insert_cpos, insert_index, + has_empty, next_free, + le16_to_cpu(el->l_count)); + + BUG_ON(insert_index < 0); + BUG_ON(insert_index >= le16_to_cpu(el->l_count)); + BUG_ON(insert_index > next_free); + + /* + * No need to memmove if we're just adding to the tail. + */ + if (insert_index != next_free) { + BUG_ON(next_free >= le16_to_cpu(el->l_count)); + + num_bytes = next_free - insert_index; + num_bytes *= sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[insert_index + 1], + &el->l_recs[insert_index], + num_bytes); + } + + /* + * Either we had an empty extent, and need to re-increment or + * there was no empty extent on a non full rightmost leaf node, + * in which case we still need to increment. + */ + next_free++; + el->l_next_free_rec = cpu_to_le16(next_free); + /* + * Make sure none of the math above just messed up our tree. + */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); + + el->l_recs[insert_index] = *insert_rec; + +} + +static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) +{ + int size, num_recs = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(num_recs == 0); + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + num_recs--; + size = num_recs * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[0], &el->l_recs[1], size); + memset(&el->l_recs[num_recs], 0, + sizeof(struct ocfs2_extent_rec)); + el->l_next_free_rec = cpu_to_le16(num_recs); + } +} + +/* + * Create an empty extent record . + * + * l_next_free_rec may be updated. + * + * If an empty extent already exists do nothing. + */ +static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (next_free == 0) + goto set_and_inc; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) + return; + + mlog_bug_on_msg(el->l_count == el->l_next_free_rec, + "Asked to create an empty extent in a full list:\n" + "count = %u, tree depth = %u", + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_tree_depth)); + + ocfs2_shift_records_right(el); + +set_and_inc: + le16_add_cpu(&el->l_next_free_rec, 1); + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); +} + +/* + * For a rotation which involves two leaf nodes, the "root node" is + * the lowest level tree node which contains a path to both leafs. This + * resulting set of information can be used to form a complete "subtree" + * + * This function is passed two full paths from the dinode down to a + * pair of adjacent leaves. It's task is to figure out which path + * index contains the subtree root - this can be the root index itself + * in a worst-case rotation. + * + * The array index of the subtree root is passed back. + */ +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right) +{ + int i = 0; + + /* + * Check that the caller passed in two paths from the same tree. + */ + BUG_ON(path_root_bh(left) != path_root_bh(right)); + + do { + i++; + + /* + * The caller didn't pass two adjacent paths. + */ + mlog_bug_on_msg(i > left->p_tree_depth, + "Owner %llu, left depth %u, right depth %u\n" + "left leaf blk %llu, right leaf blk %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + left->p_tree_depth, right->p_tree_depth, + (unsigned long long)path_leaf_bh(left)->b_blocknr, + (unsigned long long)path_leaf_bh(right)->b_blocknr); + } while (left->p_node[i].bh->b_blocknr == + right->p_node[i].bh->b_blocknr); + + return i - 1; +} + +typedef void (path_insert_t)(void *, struct buffer_head *); + +/* + * Traverse a btree path in search of cpos, starting at root_el. + * + * This code can be called with a cpos larger than the tree, in which + * case it will return the rightmost path. + */ +static int __ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + path_insert_t *func, void *data) +{ + int i, ret = 0; + u32 range; + u64 blkno; + struct buffer_head *bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + el = root_el; + while (el->l_tree_depth) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has empty extent list at " + "depth %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; + + } + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { + rec = &el->l_recs[i]; + + /* + * In the case that cpos is off the allocation + * tree, this should just wind up returning the + * rightmost record. + */ + range = le32_to_cpu(rec->e_cpos) + + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + break; + } + + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (blkno == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad blkno in extent list " + "at depth %u (index %d)\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth), i); + ret = -EROFS; + goto out; + } + + brelse(bh); + bh = NULL; + ret = ocfs2_read_extent_block(ci, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) > + le16_to_cpu(el->l_count)) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad count in extent list " + "at block %llu (next free=%u, count=%u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + + if (func) + func(data, bh); + } + +out: + /* + * Catch any trailing bh that the loop didn't handle. + */ + brelse(bh); + + return ret; +} + +/* + * Given an initialized path (that is, it has a valid root extent + * list), this function will traverse the btree in search of the path + * which would contain cpos. + * + * The path traveled is recorded in the path structure. + * + * Note that this will not do any comparisons on leaf node extent + * records, so it will work fine in the case that we just added a tree + * branch. + */ +struct find_path_data { + int index; + struct ocfs2_path *path; +}; +static void find_path_ins(void *data, struct buffer_head *bh) +{ + struct find_path_data *fp = data; + + get_bh(bh); + ocfs2_path_insert_eb(fp->path, fp->index, bh); + fp->index++; +} +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, u32 cpos) +{ + struct find_path_data data; + + data.index = 1; + data.path = path; + return __ocfs2_find_path(ci, path_root_el(path), cpos, + find_path_ins, &data); +} + +static void find_leaf_ins(void *data, struct buffer_head *bh) +{ + struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; + struct ocfs2_extent_list *el = &eb->h_list; + struct buffer_head **ret = data; + + /* We want to retain only the leaf block. */ + if (le16_to_cpu(el->l_tree_depth) == 0) { + get_bh(bh); + *ret = bh; + } +} +/* + * Find the leaf block in the tree which would contain cpos. No + * checking of the actual leaf is done. + * + * Some paths want to call this instead of allocating a path structure + * and calling ocfs2_find_path(). + * + * This function doesn't handle non btree extent lists. + */ +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *leaf_bh = bh; +out: + return ret; +} + +/* + * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. + * + * Basically, we've moved stuff around at the bottom of the tree and + * we need to fix up the extent records above the changes to reflect + * the new changes. + * + * left_rec: the record on the left. + * left_child_el: is the child list pointed to by left_rec + * right_rec: the record to the right of left_rec + * right_child_el: is the child list pointed to by right_rec + * + * By definition, this only works on interior nodes. + */ +static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, + struct ocfs2_extent_list *left_child_el, + struct ocfs2_extent_rec *right_rec, + struct ocfs2_extent_list *right_child_el) +{ + u32 left_clusters, right_end; + + /* + * Interior nodes never have holes. Their cpos is the cpos of + * the leftmost record in their child list. Their cluster + * count covers the full theoretical range of their child list + * - the range between their cpos and the cpos of the record + * immediately to their right. + */ + left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); + BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); + left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); + } + left_clusters -= le32_to_cpu(left_rec->e_cpos); + left_rec->e_int_clusters = cpu_to_le32(left_clusters); + + /* + * Calculate the rightmost cluster count boundary before + * moving cpos - we will need to adjust clusters after + * updating e_cpos to keep the same highest cluster count. + */ + right_end = le32_to_cpu(right_rec->e_cpos); + right_end += le32_to_cpu(right_rec->e_int_clusters); + + right_rec->e_cpos = left_rec->e_cpos; + le32_add_cpu(&right_rec->e_cpos, left_clusters); + + right_end -= le32_to_cpu(right_rec->e_cpos); + right_rec->e_int_clusters = cpu_to_le32(right_end); +} + +/* + * Adjust the adjacent root node records involved in a + * rotation. left_el_blkno is passed in as a key so that we can easily + * find it's index in the root list. + */ +static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, + struct ocfs2_extent_list *left_el, + struct ocfs2_extent_list *right_el, + u64 left_el_blkno) +{ + int i; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= + le16_to_cpu(left_el->l_tree_depth)); + + for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { + if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) + break; + } + + /* + * The path walking code should have never returned a root and + * two paths which are not adjacent. + */ + BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); + + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + &root_el->l_recs[i + 1], right_el); +} + +/* + * We've changed a leaf block (in right_path) and need to reflect that + * change back up the subtree. + * + * This happens in multiple places: + * - When we've moved an extent record from the left path leaf to the right + * path leaf to make room for an empty extent in the left path leaf. + * - When our insert into the right path leaf is at the leftmost edge + * and requires an update of the path immediately to it's left. This + * can occur at the end of some types of rotation and appending inserts. + * - When we've adjusted the last extent record in the left path leaf and the + * 1st extent record in the right path leaf during cross extent block merge. + */ +static void ocfs2_complete_edge_insert(handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int i, idx; + struct ocfs2_extent_list *el, *left_el, *right_el; + struct ocfs2_extent_rec *left_rec, *right_rec; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + + /* + * Update the counts and position values within all the + * interior nodes to reflect the leaf rotation we just did. + * + * The root node is handled below the loop. + * + * We begin the loop with right_el and left_el pointing to the + * leaf lists and work our way up. + * + * NOTE: within this loop, left_el and right_el always refer + * to the *child* lists. + */ + left_el = path_leaf_el(left_path); + right_el = path_leaf_el(right_path); + for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { + trace_ocfs2_complete_edge_insert(i); + + /* + * One nice property of knowing that all of these + * nodes are below the root is that we only deal with + * the leftmost right node record and the rightmost + * left node record. + */ + el = left_path->p_node[i].el; + idx = le16_to_cpu(left_el->l_next_free_rec) - 1; + left_rec = &el->l_recs[idx]; + + el = right_path->p_node[i].el; + right_rec = &el->l_recs[0]; + + ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, + right_el); + + ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + ocfs2_journal_dirty(handle, right_path->p_node[i].bh); + + /* + * Setup our list pointers now so that the current + * parents become children in the next iteration. + */ + left_el = left_path->p_node[i].el; + right_el = right_path->p_node[i].el; + } + + /* + * At the root node, adjust the two adjacent records which + * begin our path to the leaves. + */ + + el = left_path->p_node[subtree_index].el; + left_el = left_path->p_node[subtree_index + 1].el; + right_el = right_path->p_node[subtree_index + 1].el; + + ocfs2_adjust_root_records(el, left_el, right_el, + left_path->p_node[subtree_index + 1].bh->b_blocknr); + + root_bh = left_path->p_node[subtree_index].bh; + + ocfs2_journal_dirty(handle, root_bh); +} + +static int ocfs2_rotate_subtree_right(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i; + struct buffer_head *right_leaf_bh; + struct buffer_head *left_leaf_bh = NULL; + struct buffer_head *root_bh; + struct ocfs2_extent_list *right_el, *left_el; + struct ocfs2_extent_rec move_rec; + + left_leaf_bh = path_leaf_bh(left_path); + left_el = path_leaf_el(left_path); + + if (left_el->l_next_free_rec != left_el->l_count) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Inode %llu has non-full interior leaf node %llu" + "(next free = %u)", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)left_leaf_bh->b_blocknr, + le16_to_cpu(left_el->l_next_free_rec)); + return -EROFS; + } + + /* + * This extent block may already have an empty record, so we + * return early if so. + */ + if (ocfs2_is_empty_extent(&left_el->l_recs[0])) + return 0; + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + right_leaf_bh = path_leaf_bh(right_path); + right_el = path_leaf_el(right_path); + + /* This is a code error, not a disk corruption. */ + mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " + "because rightmost leaf block %llu is empty\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)right_leaf_bh->b_blocknr); + + ocfs2_create_empty_extent(right_el); + + ocfs2_journal_dirty(handle, right_leaf_bh); + + /* Do the copy now. */ + i = le16_to_cpu(left_el->l_next_free_rec) - 1; + move_rec = left_el->l_recs[i]; + right_el->l_recs[0] = move_rec; + + /* + * Clear out the record we just copied and shift everything + * over, leaving an empty extent in the left leaf. + * + * We temporarily subtract from next_free_rec so that the + * shift will lose the tail record (which is now defunct). + */ + le16_add_cpu(&left_el->l_next_free_rec, -1); + ocfs2_shift_records_right(left_el); + memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&left_el->l_next_free_rec, 1); + + ocfs2_journal_dirty(handle, left_leaf_bh); + + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the left of the current one. + * + * Will return zero if the path passed in is already the leftmost path. + */ +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + BUG_ON(path->p_tree_depth == 0); + + *cpos = 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + el = path->p_node[i].el; + + /* + * Find the extent record just before the one in our + * path. + */ + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == 0) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the leftmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The leftmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); + *cpos = *cpos + ocfs2_rec_clusters(el, + &el->l_recs[j - 1]); + *cpos = *cpos - 1; + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +/* + * Extend the transaction by enough credits to complete the rotation, + * and still leave at least the original number of credits allocated + * to this transaction. + */ +static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, + int op_credits, + struct ocfs2_path *path) +{ + int ret = 0; + int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; + + if (handle->h_buffer_credits < credits) + ret = ocfs2_extend_trans(handle, + credits - handle->h_buffer_credits); + + return ret; +} + +/* + * Trap the case where we're inserting into the theoretical range past + * the _actual_ left leaf range. Otherwise, we'll rotate a record + * whose cpos is less than ours into the right leaf. + * + * It's only necessary to look at the rightmost record of the left + * leaf because the logic that calls us should ensure that the + * theoretical ranges in the path components above the leaves are + * correct. + */ +static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, + u32 insert_cpos) +{ + struct ocfs2_extent_list *left_el; + struct ocfs2_extent_rec *rec; + int next_free; + + left_el = path_leaf_el(left_path); + next_free = le16_to_cpu(left_el->l_next_free_rec); + rec = &left_el->l_recs[next_free - 1]; + + if (insert_cpos > le32_to_cpu(rec->e_cpos)) + return 1; + return 0; +} + +static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + unsigned int range; + struct ocfs2_extent_rec *rec; + + if (next_free == 0) + return 0; + + rec = &el->l_recs[0]; + if (ocfs2_is_empty_extent(rec)) { + /* Empty list. */ + if (next_free == 1) + return 0; + rec = &el->l_recs[1]; + } + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + return 1; + return 0; +} + +/* + * Rotate all the records in a btree right one record, starting at insert_cpos. + * + * The path to the rightmost leaf should be passed in. + * + * The array is assumed to be large enough to hold an entire path (tree depth). + * + * Upon successful return from this function: + * + * - The 'right_path' array will contain a path to the leaf block + * whose range contains e_cpos. + * - That leaf block will have a single empty extent in list index 0. + * - In the case that the rotation requires a post-insert update, + * *ret_left_path will contain a valid path which can be passed to + * ocfs2_insert_path(). + */ +static int ocfs2_rotate_tree_right(handle_t *handle, + struct ocfs2_extent_tree *et, + enum ocfs2_split_type split, + u32 insert_cpos, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, start, orig_credits = handle->h_buffer_credits; + u32 cpos; + struct ocfs2_path *left_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + *ret_left_path = NULL; + + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_rotate_tree_right( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); + + /* + * What we want to do here is: + * + * 1) Start with the rightmost path. + * + * 2) Determine a path to the leaf block directly to the left + * of that leaf. + * + * 3) Determine the 'subtree root' - the lowest level tree node + * which contains a path to both leaves. + * + * 4) Rotate the subtree. + * + * 5) Find the next subtree by considering the left path to be + * the new right path. + * + * The check at the top of this while loop also accepts + * insert_cpos == cpos because cpos is only a _theoretical_ + * value to get us the left path - insert_cpos might very well + * be filling that hole. + * + * Stop at a cpos of '0' because we either started at the + * leftmost branch (i.e., a tree with one branch and a + * rotation inside of it), or we've gone as far as we can in + * rotating subtrees. + */ + while (cpos && insert_cpos <= cpos) { + trace_ocfs2_rotate_tree_right( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog_bug_on_msg(path_leaf_bh(left_path) == + path_leaf_bh(right_path), + "Owner %llu: error during insert of %u " + "(left path cpos %u) results in two identical " + "paths ending at %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos, + (unsigned long long) + path_leaf_bh(left_path)->b_blocknr); + + if (split == SPLIT_NONE && + ocfs2_rotate_requires_path_adjustment(left_path, + insert_cpos)) { + + /* + * We've rotated the tree as much as we + * should. The rest is up to + * ocfs2_insert_path() to complete, after the + * record insertion. We indicate this + * situation by returning the left path. + * + * The reason we don't adjust the records here + * before the record insert is that an error + * later might break the rule where a parent + * record e_cpos will reflect the actual + * e_cpos of the 1st nonempty record of the + * child list. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + + start = ocfs2_find_subtree_root(et, left_path, right_path); + + trace_ocfs2_rotate_subtree(start, + (unsigned long long) + right_path->p_node[start].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, start, + orig_credits, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_right(handle, et, left_path, + right_path, start); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (split != SPLIT_NONE && + ocfs2_leftmost_rec_contains(path_leaf_el(right_path), + insert_cpos)) { + /* + * A rotate moves the rightmost left leaf + * record over to the leftmost right leaf + * slot. If we're doing an extent split + * instead of a real insert, then we have to + * check that the extent to be split wasn't + * just moved over. If it was, then we can + * exit here, passing left_path back - + * ocfs2_split_extent() is smart enough to + * search both leaves. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + + /* + * There is no need to re-read the next right path + * as we know that it'll be our current left + * path. Optimize by copying values instead. + */ + ocfs2_mv_path(right_path, left_path); + + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(left_path); + +out_ret_path: + return ret; +} + +static int ocfs2_update_edge_lengths(handle_t *handle, + struct ocfs2_extent_tree *et, + int subtree_index, struct ocfs2_path *path) +{ + int i, idx, ret; + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + u32 range; + + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Path should always be rightmost. */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + BUG_ON(eb->h_next_leaf_blk != 0ULL); + + el = &eb->h_list; + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + for (i = 0; i < path->p_tree_depth; i++) { + el = path->p_node[i].el; + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + + rec->e_int_clusters = cpu_to_le32(range); + le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); + + ocfs2_journal_dirty(handle, path->p_node[i].bh); + } +out: + return ret; +} + +static void ocfs2_unlink_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path *path, int unlink_start) +{ + int ret, i; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh; + + for(i = unlink_start; i < path_num_items(path); i++) { + bh = path->p_node[i].bh; + + eb = (struct ocfs2_extent_block *)bh->b_data; + /* + * Not all nodes might have had their final count + * decremented by the caller - handle this here. + */ + el = &eb->h_list; + if (le16_to_cpu(el->l_next_free_rec) > 1) { + mlog(ML_ERROR, + "Inode %llu, attempted to remove extent block " + "%llu with %u records\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(el->l_next_free_rec)); + + ocfs2_journal_dirty(handle, bh); + ocfs2_remove_from_cache(et->et_ci, bh); + continue; + } + + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + ocfs2_journal_dirty(handle, bh); + + ret = ocfs2_cache_extent_block_free(dealloc, eb); + if (ret) + mlog_errno(ret); + + ocfs2_remove_from_cache(et->et_ci, bh); + } +} + +static void ocfs2_unlink_subtree(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int i; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + + el = path_leaf_el(left_path); + + eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; + + for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + if (root_el->l_recs[i].e_blkno == eb->h_blkno) + break; + + BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); + + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&root_el->l_next_free_rec, -1); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + eb->h_next_leaf_blk = 0; + + ocfs2_journal_dirty(handle, root_bh); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + + ocfs2_unlink_path(handle, et, dealloc, right_path, + subtree_index + 1); +} + +static int ocfs2_rotate_subtree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int *deleted) +{ + int ret, i, del_right_subtree = 0, right_has_empty = 0; + struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); + struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; + struct ocfs2_extent_block *eb; + + *deleted = 0; + + right_leaf_el = path_leaf_el(right_path); + left_leaf_el = path_leaf_el(left_path); + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) + return 0; + + eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; + if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { + /* + * It's legal for us to proceed if the right leaf is + * the rightmost one and it has an empty extent. There + * are two cases to handle - whether the leaf will be + * empty after removal or not. If the leaf isn't empty + * then just remove the empty extent up front. The + * next block will handle empty leaves by flagging + * them for unlink. + * + * Non rightmost leaves will throw -EAGAIN and the + * caller can manually move the subtree and retry. + */ + + if (eb->h_next_leaf_blk != 0ULL) + return -EAGAIN; + + if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { + ret = ocfs2_journal_access_eb(handle, et->et_ci, + path_leaf_bh(right_path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(right_leaf_el); + } else + right_has_empty = 1; + } + + if (eb->h_next_leaf_blk == 0ULL && + le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { + /* + * We have to update i_last_eb_blk during the meta + * data delete. + */ + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + del_right_subtree = 1; + } + + /* + * Getting here with an empty extent in the right path implies + * that it's the rightmost path and will be deleted. + */ + BUG_ON(right_has_empty && !del_right_subtree); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!right_has_empty) { + /* + * Only do this if we're moving a real + * record. Otherwise, the action is delayed until + * after removal of the right path in which case we + * can do a simple shift to remove the empty extent. + */ + ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); + memset(&right_leaf_el->l_recs[0], 0, + sizeof(struct ocfs2_extent_rec)); + } + if (eb->h_next_leaf_blk == 0ULL) { + /* + * Move recs over to get rid of empty extent, decrease + * next_free. This is allowed to remove the last + * extent in our leaf (setting l_next_free_rec to + * zero) - the delete code below won't care. + */ + ocfs2_remove_empty_extent(right_leaf_el); + } + + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + + if (del_right_subtree) { + ocfs2_unlink_subtree(handle, et, left_path, right_path, + subtree_index, dealloc); + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + + /* + * Removal of the extent in the left leaf was skipped + * above so we could delete the right path + * 1st. + */ + if (right_has_empty) + ocfs2_remove_empty_extent(left_leaf_el); + + ocfs2_journal_dirty(handle, et_root_bh); + + *deleted = 1; + } else + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the right of the current one. + * + * Will return zero if the path passed in is already the rightmost path. + * + * This looks similar, but is subtly different to + * ocfs2_find_cpos_for_left_leaf(). + */ +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + *cpos = 0; + + if (path->p_tree_depth == 0) + return 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + int next_free; + + el = path->p_node[i].el; + + /* + * Find the extent record just after the one in our + * path. + */ + next_free = le16_to_cpu(el->l_next_free_rec); + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == (next_free - 1)) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the rightmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The rightmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path) +{ + int ret; + struct buffer_head *bh = path_leaf_bh(path); + struct ocfs2_extent_list *el = path_leaf_el(path); + + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(el); + ocfs2_journal_dirty(handle, bh); + +out: + return ret; +} + +static int __ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + int orig_credits, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path **empty_extent_path) +{ + int ret, subtree_root, deleted; + u32 right_cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_path *right_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + + *empty_extent_path = NULL; + + ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_cp_path(left_path, path); + + right_path = ocfs2_new_path_from_path(path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (right_cpos) { + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(et, left_path, + right_path); + + trace_ocfs2_rotate_subtree(subtree_root, + (unsigned long long) + right_path->p_node[subtree_root].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + orig_credits, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Caller might still want to make changes to the + * tree root, so re-add it to the journal here. + */ + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_left(handle, et, left_path, + right_path, subtree_root, + dealloc, &deleted); + if (ret == -EAGAIN) { + /* + * The rotation has to temporarily stop due to + * the right subtree having an empty + * extent. Pass it back to the caller for a + * fixup. + */ + *empty_extent_path = right_path; + right_path = NULL; + goto out; + } + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The subtree rotate might have removed records on + * the rightmost edge. If so, then rotation is + * complete. + */ + if (deleted) + break; + + ocfs2_mv_path(left_path, right_path); + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, + &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(right_path); + ocfs2_free_path(left_path); + + return ret; +} + +static int ocfs2_remove_rightmost_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, subtree_index; + u32 cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + + ret = ocfs2_et_sanity_check(et); + if (ret) + goto out; + /* + * There's two ways we handle this depending on + * whether path is the only existing one. + */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos) { + /* + * We have a path to the left of this one - it needs + * an update too. + */ + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + + ocfs2_unlink_subtree(handle, et, left_path, path, + subtree_index, dealloc); + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + } else { + /* + * 'path' is also the leftmost path which + * means it must be the only one. This gets + * handled differently because we want to + * revert the root back to having extents + * in-line. + */ + ocfs2_unlink_path(handle, et, dealloc, path, 1); + + el = et->et_root_el; + el->l_tree_depth = 0; + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + ocfs2_et_set_last_eb_blk(et, 0); + } + + ocfs2_journal_dirty(handle, path_root_bh(path)); + +out: + ocfs2_free_path(left_path); + return ret; +} + +/* + * Left rotation of btree records. + * + * In many ways, this is (unsurprisingly) the opposite of right + * rotation. We start at some non-rightmost path containing an empty + * extent in the leaf block. The code works its way to the rightmost + * path by rotating records to the left in every subtree. + * + * This is used by any code which reduces the number of extent records + * in a leaf. After removal, an empty record should be placed in the + * leftmost list position. + * + * This won't handle a length update of the rightmost path records if + * the rightmost tree leaf record is removed so the caller is + * responsible for detecting and correcting that. + */ +static int ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, orig_credits = handle->h_buffer_credits; + struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + el = path_leaf_el(path); + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + if (path->p_tree_depth == 0) { +rightmost_no_delete: + /* + * Inline extents. This is trivially handled, so do + * it up front. + */ + ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Handle rightmost branch now. There's several cases: + * 1) simple rotation leaving records in there. That's trivial. + * 2) rotation requiring a branch delete - there's no more + * records left. Two cases of this: + * a) There are branches to the left. + * b) This is also the leftmost (the only) branch. + * + * 1) is handled via ocfs2_rotate_rightmost_leaf_left() + * 2a) we need the left branch so that we can update it with the unlink + * 2b) we need to bring the root back to inline extents. + */ + + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + el = &eb->h_list; + if (eb->h_next_leaf_blk == 0) { + /* + * This gets a bit tricky if we're going to delete the + * rightmost path. Get the other cases out of the way + * 1st. + */ + if (le16_to_cpu(el->l_next_free_rec) > 1) + goto rightmost_no_delete; + + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = -EIO; + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto out; + } + + /* + * XXX: The caller can not trust "path" any more after + * this as it will have been deleted. What do we do? + * + * In theory the rotate-for-merge code will never get + * here because it'll always ask for a rotate in a + * nonempty list. + */ + + ret = ocfs2_remove_rightmost_path(handle, et, path, + dealloc); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Now we can loop, remembering the path we get from -EAGAIN + * and restarting from there. + */ +try_rotate: + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path, + dealloc, &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + while (ret == -EAGAIN) { + tmp_path = restart_path; + restart_path = NULL; + + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, + tmp_path, dealloc, + &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + ocfs2_free_path(tmp_path); + tmp_path = NULL; + + if (ret == 0) + goto try_rotate; + } + +out: + ocfs2_free_path(tmp_path); + ocfs2_free_path(restart_path); + return ret; +} + +static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, + int index) +{ + struct ocfs2_extent_rec *rec = &el->l_recs[index]; + unsigned int size; + + if (rec->e_leaf_clusters == 0) { + /* + * We consumed all of the merged-from record. An empty + * extent cannot exist anywhere but the 1st array + * position, so move things over if the merged-from + * record doesn't occupy that position. + * + * This creates a new empty extent so the caller + * should be smart enough to have removed any existing + * ones. + */ + if (index > 0) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + size = index * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[1], &el->l_recs[0], size); + } + + /* + * Always memset - the caller doesn't check whether it + * created an empty extent, so there could be junk in + * the other fields. + */ + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + } +} + +static int ocfs2_get_right_path(struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path **ret_right_path) +{ + int ret; + u32 right_cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_extent_list *left_el; + + *ret_right_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(left_path->p_tree_depth == 0); + + left_el = path_leaf_el(left_path); + BUG_ON(left_el->l_next_free_rec != left_el->l_count); + + ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + left_path, &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the rightmost leaf. */ + BUG_ON(right_cpos == 0); + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_right_path = right_path; +out: + if (ret) + ocfs2_free_path(right_path); + return ret; +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the beginning of the record "next" to it. + * For index < l_count - 1, the next means the extent rec at index + 1. + * For index == l_count - 1, the "next" means the 1st extent rec of the + * next extent block. + */ +static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, + handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *split_rec, + int index) +{ + int ret, next_free, i; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *right_el; + struct ocfs2_path *right_path = NULL; + int subtree_index = 0; + struct ocfs2_extent_list *el = path_leaf_el(left_path); + struct buffer_head *bh = path_leaf_bh(left_path); + struct buffer_head *root_bh = NULL; + + BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); + left_rec = &el->l_recs[index]; + + if (index == le16_to_cpu(el->l_next_free_rec) - 1 && + le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { + /* we meet with a cross extent block merge. */ + ret = ocfs2_get_right_path(et, left_path, &right_path); + if (ret) { + mlog_errno(ret); + return ret; + } + + right_el = path_leaf_el(right_path); + next_free = le16_to_cpu(right_el->l_next_free_rec); + BUG_ON(next_free <= 0); + right_rec = &right_el->l_recs[0]; + if (ocfs2_is_empty_extent(right_rec)) { + BUG_ON(next_free <= 1); + right_rec = &right_el->l_recs[1]; + } + + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(right_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + } else { + BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); + right_rec = &el->l_recs[index + 1]; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, + path_num_items(left_path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); + + le32_add_cpu(&right_rec->e_cpos, -split_clusters); + le64_add_cpu(&right_rec->e_blkno, + -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); + le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); + + ocfs2_cleanup_merge(el, index); + + ocfs2_journal_dirty(handle, bh); + if (right_path) { + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + } +out: + ocfs2_free_path(right_path); + return ret; +} + +static int ocfs2_get_left_path(struct ocfs2_extent_tree *et, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret; + u32 left_cpos; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(right_path->p_tree_depth == 0); + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the leftmost leaf. */ + BUG_ON(left_cpos == 0); + + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_left_path = left_path; +out: + if (ret) + ocfs2_free_path(left_path); + return ret; +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the tail of the record "before" it. + * For index > 0, the "before" means the extent rec at index - 1. + * + * For index == 0, the "before" means the last record of the previous + * extent block. And there is also a situation that we may need to + * remove the rightmost leaf extent block in the right_path and change + * the right path to indicate the new rightmost path. + */ +static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, + handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int index) +{ + int ret, i, subtree_index = 0, has_empty_extent = 0; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *el = path_leaf_el(right_path); + struct buffer_head *bh = path_leaf_bh(right_path); + struct buffer_head *root_bh = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *left_el; + + BUG_ON(index < 0); + + right_rec = &el->l_recs[index]; + if (index == 0) { + /* we meet with a cross extent block merge. */ + ret = ocfs2_get_left_path(et, right_path, &left_path); + if (ret) { + mlog_errno(ret); + return ret; + } + + left_el = path_leaf_el(left_path); + BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != + le16_to_cpu(left_el->l_count)); + + left_rec = &left_el->l_recs[ + le16_to_cpu(left_el->l_next_free_rec) - 1]; + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(split_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } else { + left_rec = &el->l_recs[index - 1]; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + has_empty_extent = 1; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + path_num_items(right_path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (has_empty_extent && index == 1) { + /* + * The easy case - we can just plop the record right in. + */ + *left_rec = *split_rec; + + has_empty_extent = 0; + } else + le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); + + le32_add_cpu(&right_rec->e_cpos, split_clusters); + le64_add_cpu(&right_rec->e_blkno, + ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); + le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); + + ocfs2_cleanup_merge(el, index); + + ocfs2_journal_dirty(handle, bh); + if (left_path) { + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + + /* + * In the situation that the right_rec is empty and the extent + * block is empty also, ocfs2_complete_edge_insert can't handle + * it and we need to delete the right extent block. + */ + if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && + le16_to_cpu(el->l_next_free_rec) == 1) { + + ret = ocfs2_remove_rightmost_path(handle, et, + right_path, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Now the rightmost extent block has been deleted. + * So we use the new rightmost path. + */ + ocfs2_mv_path(right_path, left_path); + left_path = NULL; + } else + ocfs2_complete_edge_insert(handle, left_path, + right_path, subtree_index); + } +out: + ocfs2_free_path(left_path); + return ret; +} + +static int ocfs2_try_to_merge_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_merge_ctxt *ctxt) +{ + int ret = 0; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; + + BUG_ON(ctxt->c_contig_type == CONTIG_NONE); + + if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { + /* + * The merge code will need to create an empty + * extent to take the place of the newly + * emptied slot. Remove any pre-existing empty + * extents - having more than one in a leaf is + * illegal. + */ + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + split_index--; + rec = &el->l_recs[split_index]; + } + + if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { + /* + * Left-right contig implies this. + */ + BUG_ON(!ctxt->c_split_covers_rec); + + /* + * Since the leftright insert always covers the entire + * extent, this call will delete the insert record + * entirely, resulting in an empty extent record added to + * the extent block. + * + * Since the adding of an empty extent shifts + * everything back to the right, there's no need to + * update split_index here. + * + * When the split_index is zero, we need to merge it to the + * prevoius extent block. It is more efficient and easier + * if we do merge_right first and merge_left later. + */ + ret = ocfs2_merge_rec_right(path, handle, et, split_rec, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We can only get this from logic error above. + */ + BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + + /* The merge left us with an empty extent, remove it. */ + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + rec = &el->l_recs[split_index]; + + /* + * Note that we don't pass split_rec here on purpose - + * we've merged it into the rec already. + */ + ret = ocfs2_merge_rec_left(path, handle, et, rec, + dealloc, split_index); + + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + /* + * Error from this last rotate is not critical, so + * print but don't bubble it up. + */ + if (ret) + mlog_errno(ret); + ret = 0; + } else { + /* + * Merge a record to the left or right. + * + * 'contig_type' is relative to the existing record, + * so for example, if we're "right contig", it's to + * the record on the left (hence the left merge). + */ + if (ctxt->c_contig_type == CONTIG_RIGHT) { + ret = ocfs2_merge_rec_left(path, handle, et, + split_rec, dealloc, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + ret = ocfs2_merge_rec_right(path, handle, + et, split_rec, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (ctxt->c_split_covers_rec) { + /* + * The merge may have left an empty extent in + * our leaf. Try to rotate it away. + */ + ret = ocfs2_rotate_tree_left(handle, et, path, + dealloc); + if (ret) + mlog_errno(ret); + ret = 0; + } + } + +out: + return ret; +} + +static void ocfs2_subtract_from_rec(struct super_block *sb, + enum ocfs2_split_type split, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *split_rec) +{ + u64 len_blocks; + + len_blocks = ocfs2_clusters_to_blocks(sb, + le16_to_cpu(split_rec->e_leaf_clusters)); + + if (split == SPLIT_LEFT) { + /* + * Region is on the left edge of the existing + * record. + */ + le32_add_cpu(&rec->e_cpos, + le16_to_cpu(split_rec->e_leaf_clusters)); + le64_add_cpu(&rec->e_blkno, len_blocks); + le16_add_cpu(&rec->e_leaf_clusters, + -le16_to_cpu(split_rec->e_leaf_clusters)); + } else { + /* + * Region is on the right edge of the existing + * record. + */ + le16_add_cpu(&rec->e_leaf_clusters, + -le16_to_cpu(split_rec->e_leaf_clusters)); + } +} + +/* + * Do the final bits of extent record insertion at the target leaf + * list. If this leaf is part of an allocation tree, it is assumed + * that the tree above has been prepared. + */ +static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_list *el, + struct ocfs2_insert_type *insert) +{ + int i = insert->ins_contig_index; + unsigned int range; + struct ocfs2_extent_rec *rec; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (insert->ins_split != SPLIT_NONE) { + i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); + BUG_ON(i == -1); + rec = &el->l_recs[i]; + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + insert->ins_split, rec, + insert_rec); + goto rotate; + } + + /* + * Contiguous insert - either left or right. + */ + if (insert->ins_contig != CONTIG_NONE) { + rec = &el->l_recs[i]; + if (insert->ins_contig == CONTIG_LEFT) { + rec->e_blkno = insert_rec->e_blkno; + rec->e_cpos = insert_rec->e_cpos; + } + le16_add_cpu(&rec->e_leaf_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + return; + } + + /* + * Handle insert into an empty leaf. + */ + if (le16_to_cpu(el->l_next_free_rec) == 0 || + ((le16_to_cpu(el->l_next_free_rec) == 1) && + ocfs2_is_empty_extent(&el->l_recs[0]))) { + el->l_recs[0] = *insert_rec; + el->l_next_free_rec = cpu_to_le16(1); + return; + } + + /* + * Appending insert. + */ + if (insert->ins_appending == APPEND_TAIL) { + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + range = le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters); + BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); + + mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count), + "owner %llu, depth %u, count %u, next free %u, " + "rec.cpos %u, rec.clusters %u, " + "insert.cpos %u, insert.clusters %u\n", + ocfs2_metadata_cache_owner(et->et_ci), + le16_to_cpu(el->l_tree_depth), + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_next_free_rec), + le32_to_cpu(el->l_recs[i].e_cpos), + le16_to_cpu(el->l_recs[i].e_leaf_clusters), + le32_to_cpu(insert_rec->e_cpos), + le16_to_cpu(insert_rec->e_leaf_clusters)); + i++; + el->l_recs[i] = *insert_rec; + le16_add_cpu(&el->l_next_free_rec, 1); + return; + } + +rotate: + /* + * Ok, we have to rotate. + * + * At this point, it is safe to assume that inserting into an + * empty leaf and appending to a leaf have both been handled + * above. + * + * This leaf needs to have space, either by the empty 1st + * extent record, or by virtue of an l_next_rec < l_count. + */ + ocfs2_rotate_leaf(el, insert_rec); +} + +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec) +{ + int ret, i, next_free; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + /* + * Update everything except the leaf block. + */ + for (i = 0; i < path->p_tree_depth; i++) { + bh = path->p_node[i].bh; + el = path->p_node[i].el; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has a bad extent list", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + ret = -EIO; + return; + } + + rec = &el->l_recs[next_free - 1]; + + rec->e_int_clusters = insert_rec->e_cpos; + le32_add_cpu(&rec->e_int_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + le32_add_cpu(&rec->e_int_clusters, + -le32_to_cpu(rec->e_cpos)); + + ocfs2_journal_dirty(handle, bh); + } +} + +static int ocfs2_append_rec_to_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, next_free; + struct ocfs2_extent_list *el; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* + * This shouldn't happen for non-trees. The extent rec cluster + * count manipulation below only works for interior nodes. + */ + BUG_ON(right_path->p_tree_depth == 0); + + /* + * If our appending insert is at the leftmost edge of a leaf, + * then we might need to update the rightmost records of the + * neighboring path. + */ + el = path_leaf_el(right_path); + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { + u32 left_cpos; + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_append_rec_to_path( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + le32_to_cpu(insert_rec->e_cpos), + left_cpos); + + /* + * No need to worry if the append is already in the + * leftmost leaf. + */ + if (left_cpos) { + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_insert_path() will pass the left_path to the + * journal for us. + */ + } + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec); + + *ret_left_path = left_path; + ret = 0; +out: + if (ret != 0) + ocfs2_free_path(left_path); + + return ret; +} + +static void ocfs2_split_record(struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *split_rec, + enum ocfs2_split_type split) +{ + int index; + u32 cpos = le32_to_cpu(split_rec->e_cpos); + struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; + struct ocfs2_extent_rec *rec, *tmprec; + + right_el = path_leaf_el(right_path); + if (left_path) + left_el = path_leaf_el(left_path); + + el = right_el; + insert_el = right_el; + index = ocfs2_search_extent_list(el, cpos); + if (index != -1) { + if (index == 0 && left_path) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + + /* + * This typically means that the record + * started in the left path but moved to the + * right as a result of rotation. We either + * move the existing record to the left, or we + * do the later insert there. + * + * In this case, the left path should always + * exist as the rotate code will have passed + * it back for a post-insert update. + */ + + if (split == SPLIT_LEFT) { + /* + * It's a left split. Since we know + * that the rotate code gave us an + * empty extent in the left path, we + * can just do the insert there. + */ + insert_el = left_el; + } else { + /* + * Right split - we have to move the + * existing record over to the left + * leaf. The insert will be into the + * newly created empty extent in the + * right leaf. + */ + tmprec = &right_el->l_recs[index]; + ocfs2_rotate_leaf(left_el, tmprec); + el = left_el; + + memset(tmprec, 0, sizeof(*tmprec)); + index = ocfs2_search_extent_list(left_el, cpos); + BUG_ON(index == -1); + } + } + } else { + BUG_ON(!left_path); + BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); + /* + * Left path is easy - we can just allow the insert to + * happen. + */ + el = left_el; + insert_el = left_el; + index = ocfs2_search_extent_list(el, cpos); + BUG_ON(index == -1); + } + + rec = &el->l_recs[index]; + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + split, rec, split_rec); + ocfs2_rotate_leaf(insert_el, split_rec); +} + +/* + * This function only does inserts on an allocation b-tree. For tree + * depth = 0, ocfs2_insert_at_leaf() is called directly. + * + * right_path is the path we want to do the actual insert + * in. left_path should only be passed in if we need to update that + * portion of the tree after an edge insert. + */ +static int ocfs2_insert_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret, subtree_index; + struct buffer_head *leaf_bh = path_leaf_bh(right_path); + + if (left_path) { + /* + * There's a chance that left_path got passed back to + * us without being accounted for in the + * journal. Extend our transaction here to be sure we + * can change those blocks. + */ + ret = ocfs2_extend_trans(handle, left_path->p_tree_depth); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* + * Pass both paths to the journal. The majority of inserts + * will be touching all components anyway. + */ + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (insert->ins_split != SPLIT_NONE) { + /* + * We could call ocfs2_insert_at_leaf() for some types + * of splits, but it's easier to just let one separate + * function sort it all out. + */ + ocfs2_split_record(et, left_path, right_path, + insert_rec, insert->ins_split); + + /* + * Split might have modified either leaf and we don't + * have a guarantee that the later edge insert will + * dirty this for us. + */ + if (left_path) + ocfs2_journal_dirty(handle, + path_leaf_bh(left_path)); + } else + ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), + insert); + + ocfs2_journal_dirty(handle, leaf_bh); + + if (left_path) { + /* + * The rotate code has indicated that we need to fix + * up portions of the tree after the insert. + * + * XXX: Should we extend the transaction here? + */ + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + } + + ret = 0; +out: + return ret; +} + +static int ocfs2_do_insert_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *type) +{ + int ret, rotate = 0; + u32 cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el; + + el = et->et_root_el; + + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le16_to_cpu(el->l_tree_depth) == 0) { + ocfs2_insert_at_leaf(et, insert_rec, el, type); + goto out_update_clusters; + } + + right_path = ocfs2_new_path_from_et(et); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * Determine the path to start with. Rotations need the + * rightmost path, everything else can go directly to the + * target leaf. + */ + cpos = le32_to_cpu(insert_rec->e_cpos); + if (type->ins_appending == APPEND_NONE && + type->ins_contig == CONTIG_NONE) { + rotate = 1; + cpos = UINT_MAX; + } + + ret = ocfs2_find_path(et->et_ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Rotations and appends need special treatment - they modify + * parts of the tree's above them. + * + * Both might pass back a path immediate to the left of the + * one being inserted to. This will be cause + * ocfs2_insert_path() to modify the rightmost records of + * left_path to account for an edge insert. + * + * XXX: When modifying this code, keep in mind that an insert + * can wind up skipping both of these two special cases... + */ + if (rotate) { + ret = ocfs2_rotate_tree_right(handle, et, type->ins_split, + le32_to_cpu(insert_rec->e_cpos), + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_rotate_tree_right() might have extended the + * transaction without re-journaling our tree root. + */ + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (type->ins_appending == APPEND_TAIL + && type->ins_contig != CONTIG_LEFT) { + ret = ocfs2_append_rec_to_path(handle, et, insert_rec, + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_insert_path(handle, et, left_path, right_path, + insert_rec, type); + if (ret) { + mlog_errno(ret); + goto out; + } + +out_update_clusters: + if (type->ins_split == SPLIT_NONE) + ocfs2_et_update_clusters(et, + le16_to_cpu(insert_rec->e_leaf_clusters)); + + ocfs2_journal_dirty(handle, et->et_root_bh); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + + return ret; +} + +static enum ocfs2_contig_type +ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, int index, + struct ocfs2_extent_rec *split_rec) +{ + int status; + enum ocfs2_contig_type ret = CONTIG_NONE; + u32 left_cpos, right_cpos; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_list *new_el; + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct buffer_head *bh; + struct ocfs2_extent_block *eb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + if (index > 0) { + rec = &el->l_recs[index - 1]; + } else if (path->p_tree_depth > 0) { + status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); + if (status) + goto exit; + + if (left_cpos != 0) { + left_path = ocfs2_new_path_from_path(path); + if (!left_path) + goto exit; + + status = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (status) + goto free_left_path; + + new_el = path_leaf_el(left_path); + + if (le16_to_cpu(new_el->l_next_free_rec) != + le16_to_cpu(new_el->l_count)) { + bh = path_leaf_bh(left_path); + eb = (struct ocfs2_extent_block *)bh->b_data; + ocfs2_error(sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of " + "%d. It should have " + "matched the l_count of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); + status = -EINVAL; + goto free_left_path; + } + rec = &new_el->l_recs[ + le16_to_cpu(new_el->l_next_free_rec) - 1]; + } + } + + /* + * We're careful to check for an empty extent record here - + * the merge code will know what to do if it sees one. + */ + if (rec) { + if (index == 1 && ocfs2_is_empty_extent(rec)) { + if (split_rec->e_cpos == el->l_recs[index].e_cpos) + ret = CONTIG_RIGHT; + } else { + ret = ocfs2_et_extent_contig(et, rec, split_rec); + } + } + + rec = NULL; + if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) + rec = &el->l_recs[index + 1]; + else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && + path->p_tree_depth > 0) { + status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); + if (status) + goto free_left_path; + + if (right_cpos == 0) + goto free_left_path; + + right_path = ocfs2_new_path_from_path(path); + if (!right_path) + goto free_left_path; + + status = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (status) + goto free_right_path; + + new_el = path_leaf_el(right_path); + rec = &new_el->l_recs[0]; + if (ocfs2_is_empty_extent(rec)) { + if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { + bh = path_leaf_bh(right_path); + eb = (struct ocfs2_extent_block *)bh->b_data; + ocfs2_error(sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec)); + status = -EINVAL; + goto free_right_path; + } + rec = &new_el->l_recs[1]; + } + } + + if (rec) { + enum ocfs2_contig_type contig_type; + + contig_type = ocfs2_et_extent_contig(et, rec, split_rec); + + if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) + ret = CONTIG_LEFTRIGHT; + else if (ret == CONTIG_NONE) + ret = contig_type; + } + +free_right_path: + ocfs2_free_path(right_path); +free_left_path: + ocfs2_free_path(left_path); +exit: + return ret; +} + +static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, + struct ocfs2_insert_type *insert, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i; + enum ocfs2_contig_type contig_type = CONTIG_NONE; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i], + insert_rec); + if (contig_type != CONTIG_NONE) { + insert->ins_contig_index = i; + break; + } + } + insert->ins_contig = contig_type; + + if (insert->ins_contig != CONTIG_NONE) { + struct ocfs2_extent_rec *rec = + &el->l_recs[insert->ins_contig_index]; + unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + + le16_to_cpu(insert_rec->e_leaf_clusters); + + /* + * Caller might want us to limit the size of extents, don't + * calculate contiguousness if we might exceed that limit. + */ + if (et->et_max_leaf_clusters && + (len > et->et_max_leaf_clusters)) + insert->ins_contig = CONTIG_NONE; + } +} + +/* + * This should only be called against the righmost leaf extent list. + * + * ocfs2_figure_appending_type() will figure out whether we'll have to + * insert at the tail of the rightmost leaf. + * + * This should also work against the root extent list for tree's with 0 + * depth. If we consider the root extent list to be the rightmost leaf node + * then the logic here makes sense. + */ +static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i; + u32 cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + insert->ins_appending = APPEND_NONE; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (!el->l_next_free_rec) + goto set_tail_append; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + /* Were all records empty? */ + if (le16_to_cpu(el->l_next_free_rec) == 1) + goto set_tail_append; + } + + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + + if (cpos >= + (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) + goto set_tail_append; + + return; + +set_tail_append: + insert->ins_appending = APPEND_TAIL; +} + +/* + * Helper function called at the beginning of an insert. + * + * This computes a few things that are commonly used in the process of + * inserting into the btree: + * - Whether the new extent is contiguous with an existing one. + * - The current tree depth. + * - Whether the insert is an appending one. + * - The total # of free records in the tree. + * + * All of the information is stored on the ocfs2_insert_type + * structure. + */ +static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et, + struct buffer_head **last_eb_bh, + struct ocfs2_extent_rec *insert_rec, + int *free_records, + struct ocfs2_insert_type *insert) +{ + int ret; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + struct buffer_head *bh = NULL; + + insert->ins_split = SPLIT_NONE; + + el = et->et_root_el; + insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); + + if (el->l_tree_depth) { + /* + * If we have tree depth, we read in the + * rightmost extent block ahead of time as + * ocfs2_figure_insert_type() and ocfs2_add_branch() + * may want it later. + */ + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + } + + /* + * Unless we have a contiguous insert, we'll need to know if + * there is room left in our allocation tree for another + * extent record. + * + * XXX: This test is simplistic, we can search for empty + * extent records too. + */ + *free_records = le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec); + + if (!insert->ins_tree_depth) { + ocfs2_figure_contig_type(et, insert, el, insert_rec); + ocfs2_figure_appending_type(insert, el, insert_rec); + return 0; + } + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * In the case that we're inserting past what the tree + * currently accounts for, ocfs2_find_path() will return for + * us the rightmost tree path. This is accounted for below in + * the appending code. + */ + ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos)); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + /* + * Now that we have the path, there's two things we want to determine: + * 1) Contiguousness (also set contig_index if this is so) + * + * 2) Are we doing an append? We can trivially break this up + * into two types of appends: simple record append, or a + * rotate inside the tail leaf. + */ + ocfs2_figure_contig_type(et, insert, el, insert_rec); + + /* + * The insert code isn't quite ready to deal with all cases of + * left contiguousness. Specifically, if it's an insert into + * the 1st record in a leaf, it will require the adjustment of + * cluster count on the last record of the path directly to it's + * left. For now, just catch that case and fool the layers + * above us. This works just fine for tree_depth == 0, which + * is why we allow that above. + */ + if (insert->ins_contig == CONTIG_LEFT && + insert->ins_contig_index == 0) + insert->ins_contig = CONTIG_NONE; + + /* + * Ok, so we can simply compare against last_eb to figure out + * whether the path doesn't exist. This will only happen in + * the case that we're doing a tail append, so maybe we can + * take advantage of that information somehow. + */ + if (ocfs2_et_get_last_eb_blk(et) == + path_leaf_bh(path)->b_blocknr) { + /* + * Ok, ocfs2_find_path() returned us the rightmost + * tree path. This might be an appending insert. There are + * two cases: + * 1) We're doing a true append at the tail: + * -This might even be off the end of the leaf + * 2) We're "appending" by rotating in the tail + */ + ocfs2_figure_appending_type(insert, el, insert_rec); + } + +out: + ocfs2_free_path(path); + + if (ret == 0) + *last_eb_bh = bh; + else + brelse(bh); + return ret; +} + +/* + * Insert an extent into a btree. + * + * The caller needs to update the owning btree's cluster count. + */ +int ocfs2_insert_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, + u64 start_blk, + u32 new_clusters, + u8 flags, + struct ocfs2_alloc_context *meta_ac) +{ + int status; + int uninitialized_var(free_records); + struct buffer_head *last_eb_bh = NULL; + struct ocfs2_insert_type insert = {0, }; + struct ocfs2_extent_rec rec; + + trace_ocfs2_insert_extent_start( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, new_clusters); + + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(cpos); + rec.e_blkno = cpu_to_le64(start_blk); + rec.e_leaf_clusters = cpu_to_le16(new_clusters); + rec.e_flags = flags; + status = ocfs2_et_insert_check(et, &rec); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec, + &free_records, &insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig, + insert.ins_contig_index, free_records, + insert.ins_tree_depth); + + if (insert.ins_contig == CONTIG_NONE && free_records == 0) { + status = ocfs2_grow_tree(handle, et, + &insert.ins_tree_depth, &last_eb_bh, + meta_ac); + if (status) { + mlog_errno(status); + goto bail; + } + } + + /* Finally, we can add clusters. This might rotate the tree for us. */ + status = ocfs2_do_insert_extent(handle, et, &rec, &insert); + if (status < 0) + mlog_errno(status); + else + ocfs2_et_extent_map_insert(et, &rec); + +bail: + brelse(last_eb_bh); + + return status; +} + +/* + * Allcate and add clusters into the extent b-tree. + * The new clusters(clusters_to_add) will be inserted at logical_offset. + * The extent b-tree's root is specified by et, and + * it is not limited to the file storage. Any extent tree can use this + * function if it implements the proper ocfs2_extent_tree. + */ +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0, err = 0; + int need_free = 0; + int free_extents; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + u8 flags = 0; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + BUG_ON(!clusters_to_add); + + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + + free_extents = ocfs2_num_free_extents(osb, et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + err = -1; + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) { + err = -2; + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = __ocfs2_claim_clusters(handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the tree root */ + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + need_free = 1; + goto bail; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_add_clusters_in_btree( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + bit_off, num_bits); + status = ocfs2_insert_extent(handle, et, *logical_offset, block, + num_bits, flags, meta_ac); + if (status < 0) { + mlog_errno(status); + need_free = 1; + goto bail; + } + + ocfs2_journal_dirty(handle, et->et_root_bh); + + clusters_to_add -= num_bits; + *logical_offset += num_bits; + + if (clusters_to_add) { + err = clusters_to_add; + status = -EAGAIN; + reason = RESTART_TRANS; + } + +bail: + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num_bits); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num_bits); + } + +leave: + if (reason_ret) + *reason_ret = reason; + trace_ocfs2_add_clusters_in_btree_ret(status, reason, err); + return status; +} + +static void ocfs2_make_right_split_rec(struct super_block *sb, + struct ocfs2_extent_rec *split_rec, + u32 cpos, + struct ocfs2_extent_rec *rec) +{ + u32 rec_cpos = le32_to_cpu(rec->e_cpos); + u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); + + memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); + + split_rec->e_cpos = cpu_to_le32(cpos); + split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); + + split_rec->e_blkno = rec->e_blkno; + le64_add_cpu(&split_rec->e_blkno, + ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); + + split_rec->e_flags = rec->e_flags; +} + +static int ocfs2_split_and_insert(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct buffer_head **last_eb_bh, + int split_index, + struct ocfs2_extent_rec *orig_split_rec, + struct ocfs2_alloc_context *meta_ac) +{ + int ret = 0, depth; + unsigned int insert_range, rec_range, do_leftright = 0; + struct ocfs2_extent_rec tmprec; + struct ocfs2_extent_list *rightmost_el; + struct ocfs2_extent_rec rec; + struct ocfs2_extent_rec split_rec = *orig_split_rec; + struct ocfs2_insert_type insert; + struct ocfs2_extent_block *eb; + +leftright: + /* + * Store a copy of the record on the stack - it might move + * around as the tree is manipulated below. + */ + rec = path_leaf_el(path)->l_recs[split_index]; + + rightmost_el = et->et_root_el; + + depth = le16_to_cpu(rightmost_el->l_tree_depth); + if (depth) { + BUG_ON(!(*last_eb_bh)); + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; + rightmost_el = &eb->h_list; + } + + if (le16_to_cpu(rightmost_el->l_next_free_rec) == + le16_to_cpu(rightmost_el->l_count)) { + ret = ocfs2_grow_tree(handle, et, + &depth, last_eb_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + memset(&insert, 0, sizeof(struct ocfs2_insert_type)); + insert.ins_appending = APPEND_NONE; + insert.ins_contig = CONTIG_NONE; + insert.ins_tree_depth = depth; + + insert_range = le32_to_cpu(split_rec.e_cpos) + + le16_to_cpu(split_rec.e_leaf_clusters); + rec_range = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + + if (split_rec.e_cpos == rec.e_cpos) { + insert.ins_split = SPLIT_LEFT; + } else if (insert_range == rec_range) { + insert.ins_split = SPLIT_RIGHT; + } else { + /* + * Left/right split. We fake this as a right split + * first and then make a second pass as a left split. + */ + insert.ins_split = SPLIT_RIGHT; + + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &tmprec, insert_range, &rec); + + split_rec = tmprec; + + BUG_ON(do_leftright); + do_leftright = 1; + } + + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (do_leftright == 1) { + u32 cpos; + struct ocfs2_extent_list *el; + + do_leftright++; + split_rec = *orig_split_rec; + + ocfs2_reinit_path(path, 1); + + cpos = le32_to_cpu(split_rec.e_cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + split_index = ocfs2_search_extent_list(el, cpos); + if (split_index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u " + "which can no longer be found.\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } + goto leftright; + } +out: + + return ret; +} + +static int ocfs2_replace_extent_rec(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, + int split_index, + struct ocfs2_extent_rec *split_rec) +{ + int ret; + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + el->l_recs[split_index] = *split_rec; + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); +out: + return ret; +} + +/* + * Split part or all of the extent record at split_index in the leaf + * pointed to by path. Merge with the contiguous extent record if needed. + * + * Care is taken to handle contiguousness so as to not grow the tree. + * + * meta_ac is not strictly necessary - we only truly need it if growth + * of the tree is required. All other cases will degrade into a less + * optimal tree layout. + * + * last_eb_bh should be the rightmost leaf block for any extent + * btree. Since a split may grow the tree or a merge might shrink it, + * the caller cannot trust the contents of that buffer after this call. + * + * This code is optimized for readability - several passes might be + * made over certain portions of the tree. All of those blocks will + * have been brought into cache (and pinned via the journal), so the + * extra overhead is not expressed in terms of disk reads. + */ +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct buffer_head *last_eb_bh = NULL; + struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; + struct ocfs2_merge_ctxt ctxt; + struct ocfs2_extent_list *rightmost_el; + + if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || + ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < + (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, + split_index, + split_rec); + + /* + * The core merge / split code wants to know how much room is + * left in this allocation tree, so we pass the + * rightmost extent list. + */ + if (path->p_tree_depth) { + struct ocfs2_extent_block *eb; + + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + rightmost_el = &eb->h_list; + } else + rightmost_el = path_root_el(path); + + if (rec->e_cpos == split_rec->e_cpos && + rec->e_leaf_clusters == split_rec->e_leaf_clusters) + ctxt.c_split_covers_rec = 1; + else + ctxt.c_split_covers_rec = 0; + + ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); + + trace_ocfs2_split_extent(split_index, ctxt.c_contig_type, + ctxt.c_has_empty_extent, + ctxt.c_split_covers_rec); + + if (ctxt.c_contig_type == CONTIG_NONE) { + if (ctxt.c_split_covers_rec) + ret = ocfs2_replace_extent_rec(handle, et, path, el, + split_index, split_rec); + else + ret = ocfs2_split_and_insert(handle, et, path, + &last_eb_bh, split_index, + split_rec, meta_ac); + if (ret) + mlog_errno(ret); + } else { + ret = ocfs2_try_to_merge_extent(handle, et, path, + split_index, split_rec, + dealloc, &ctxt); + if (ret) + mlog_errno(ret); + } + +out: + brelse(last_eb_bh); + return ret; +} + +/* + * Change the flags of the already-existing extent at cpos for len clusters. + * + * new_flags: the flags we want to set. + * clear_flags: the flags we want to clear. + * phys: the new physical offset we want this new extent starts from. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags) +{ + int ret, index; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys); + struct ocfs2_extent_rec split_rec; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + left_path = ocfs2_new_path_from_et(et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + el = path_leaf_el(left_path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1) { + ocfs2_error(sb, + "Owner %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), cpos); + ret = -EROFS; + goto out; + } + + ret = -EIO; + rec = &el->l_recs[index]; + if (new_flags && (rec->e_flags & new_flags)) { + mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " + "extent that already had them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + new_flags); + goto out; + } + + if (clear_flags && !(rec->e_flags & clear_flags)) { + mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " + "extent that didn't have them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + clear_flags); + goto out; + } + + memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); + split_rec.e_cpos = cpu_to_le32(cpos); + split_rec.e_leaf_clusters = cpu_to_le16(len); + split_rec.e_blkno = cpu_to_le64(start_blkno); + split_rec.e_flags = rec->e_flags; + if (new_flags) + split_rec.e_flags |= new_flags; + if (clear_flags) + split_rec.e_flags &= ~clear_flags; + + ret = ocfs2_split_extent(handle, et, left_path, + index, &split_rec, meta_ac, + dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(left_path); + return ret; + +} + +/* + * Mark the already-existing extent at cpos as written for len clusters. + * This removes the unwritten extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + trace_ocfs2_mark_extent_written( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + cpos, len, phys); + + if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " + "that are being written to, but the feature bit " + "is not set in the super block.", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EROFS; + goto out; + } + + /* + * XXX: This should be fixed up so that we just re-insert the + * next extent records. + */ + ocfs2_et_extent_map_truncate(et, 0); + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + 0, OCFS2_EXT_UNWRITTEN); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int index, u32 new_range, + struct ocfs2_alloc_context *meta_ac) +{ + int ret, depth, credits; + struct buffer_head *last_eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *rightmost_el, *el; + struct ocfs2_extent_rec split_rec; + struct ocfs2_extent_rec *rec; + struct ocfs2_insert_type insert; + + /* + * Setup the record to split before we grow the tree. + */ + el = path_leaf_el(path); + rec = &el->l_recs[index]; + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &split_rec, new_range, rec); + + depth = path->p_tree_depth; + if (depth > 0) { + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + rightmost_el = &eb->h_list; + } else + rightmost_el = path_leaf_el(path); + + credits = path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le16_to_cpu(rightmost_el->l_next_free_rec) == + le16_to_cpu(rightmost_el->l_count)) { + ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + memset(&insert, 0, sizeof(struct ocfs2_insert_type)); + insert.ins_appending = APPEND_NONE; + insert.ins_contig = CONTIG_NONE; + insert.ins_split = SPLIT_RIGHT; + insert.ins_tree_depth = depth; + + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); + if (ret) + mlog_errno(ret); + +out: + brelse(last_eb_bh); + return ret; +} + +static int ocfs2_truncate_rec(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, int index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u32 cpos, u32 len) +{ + int ret; + u32 left_cpos, rec_range, trunc_range; + int wants_rotate = 0, is_rightmost_tree_rec = 0; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_block *eb; + + if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + index--; + } + + if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && + path->p_tree_depth) { + /* + * Check whether this is the rightmost tree record. If + * we remove all of this record or part of its right + * edge then an update of the record lengths above it + * will be required. + */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + if (eb->h_next_leaf_blk == 0) + is_rightmost_tree_rec = 1; + } + + rec = &el->l_recs[index]; + if (index == 0 && path->p_tree_depth && + le32_to_cpu(rec->e_cpos) == cpos) { + /* + * Changing the leftmost offset (via partial or whole + * record truncate) of an interior (or rightmost) path + * means we have to update the subtree that is formed + * by this leaf and the one to it's left. + * + * There are two cases we can skip: + * 1) Path is the leftmost one in our btree. + * 2) The leaf is rightmost and will be empty after + * we remove the extent record - the rotate code + * knows how to update the newly formed edge. + */ + + ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } + + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + trunc_range = cpos + len; + + if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { + int next_free; + + memset(rec, 0, sizeof(*rec)); + ocfs2_cleanup_merge(el, index); + wants_rotate = 1; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (is_rightmost_tree_rec && next_free > 1) { + /* + * We skip the edge update if this path will + * be deleted by the rotate code. + */ + rec = &el->l_recs[next_free - 1]; + ocfs2_adjust_rightmost_records(handle, et, path, + rec); + } + } else if (le32_to_cpu(rec->e_cpos) == cpos) { + /* Remove leftmost portion of the record. */ + le32_add_cpu(&rec->e_cpos, len); + le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); + le16_add_cpu(&rec->e_leaf_clusters, -len); + } else if (rec_range == trunc_range) { + /* Remove rightmost portion of the record */ + le16_add_cpu(&rec->e_leaf_clusters, -len); + if (is_rightmost_tree_rec) + ocfs2_adjust_rightmost_records(handle, et, path, rec); + } else { + /* Caller should have trapped this. */ + mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) " + "(%u, %u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + le32_to_cpu(rec->e_cpos), + le16_to_cpu(rec->e_leaf_clusters), cpos, len); + BUG(); + } + + if (left_path) { + int subtree_index; + + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + ocfs2_complete_edge_insert(handle, left_path, path, + subtree_index); + } + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + ocfs2_free_path(left_path); + return ret; +} + +int ocfs2_remove_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + u32 rec_range, trunc_range; + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + + /* + * XXX: Why are we truncating to 0 instead of wherever this + * affects us? + */ + ocfs2_et_extent_map_truncate(et, 0); + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + index = ocfs2_search_extent_list(el, cpos); + if (index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } + + /* + * We have 3 cases of extent removal: + * 1) Range covers the entire extent rec + * 2) Range begins or ends on one edge of the extent rec + * 3) Range is in the middle of the extent rec (no shared edges) + * + * For case 1 we remove the extent rec and left rotate to + * fill the hole. + * + * For case 2 we just shrink the existing extent rec, with a + * tree update if the shrinking edge is also the edge of an + * extent block. + * + * For case 3 we do a right split to turn the extent rec into + * something case 2 can handle. + */ + rec = &el->l_recs[index]; + rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + trunc_range = cpos + len; + + BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); + + trace_ocfs2_remove_extent( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, len, index, le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + + if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { + ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, + cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + ret = ocfs2_split_tree(handle, et, path, index, + trunc_range, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The split could have manipulated the tree enough to + * move the record location, so we have to look for it again. + */ + ocfs2_reinit_path(path, 1); + + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + index = ocfs2_search_extent_list(el, cpos); + if (index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu: split at cpos %u lost record.", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } + + /* + * Double check our values here. If anything is fishy, + * it's easier to catch it at the top level. + */ + rec = &el->l_recs[index]; + rec_range = le32_to_cpu(rec->e_cpos) + + ocfs2_rec_clusters(el, rec); + if (rec_range != trunc_range) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu: error after split at cpos %u" + "trunc len %u, existing record is (%u,%u)", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, len, le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, + cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(path); + return ret; +} + +/* + * ocfs2_reserve_blocks_for_rec_trunc() would look basically the + * same as ocfs2_lock_alloctors(), except for it accepts a blocks + * number to reserve some extra blocks, and it only handles meta + * data allocations. + * + * Currently, only ocfs2_remove_btree_range() uses it for truncating + * and punching holes. + */ +static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 extents_to_split, + struct ocfs2_alloc_context **ac, + int extra_blocks) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *ac = NULL; + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + if (extra_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + +out: + if (ret) { + if (*ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + } + + return ret; +} + +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc, bool refcount_tree_locked) +{ + int ret, credits = 0, extra_blocks = 0; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; + + if ((flags & OCFS2_EXT_REFCOUNTED) && len) { + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + if (!refcount_tree_locked) { + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto bail; + } + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + } + + ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac, + extra_blocks); + if (ret) { + mlog_errno(ret); + goto bail; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, + ocfs2_remove_extent_credits(osb->sb) + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(inode->i_sb, len)); + + ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_et_update_clusters(et, -len); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + + ocfs2_journal_dirty(handle, et->et_root_bh); + + if (phys_blkno) { + if (flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + phys_blkno), + len, meta_ac, + dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + phys_blkno, len); + if (ret) + mlog_errno(ret); + + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); +bail: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) +{ + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + + mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), + "slot %d, invalid truncate log parameters: used = " + "%u, count = %u\n", osb->slot_num, + le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); + return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); +} + +static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, + unsigned int new_start) +{ + unsigned int tail_index; + unsigned int current_tail; + + /* No records, nothing to coalesce */ + if (!le16_to_cpu(tl->tl_used)) + return 0; + + tail_index = le16_to_cpu(tl->tl_used) - 1; + current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); + current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); + + return current_tail == new_start; +} + +int ocfs2_truncate_log_append(struct ocfs2_super *osb, + handle_t *handle, + u64 start_blk, + unsigned int num_clusters) +{ + int status, index; + unsigned int start_cluster, tl_count; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + + start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + tl_count = le16_to_cpu(tl->tl_count); + mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0, + "Truncate record count on #%llu invalid " + "wanted %u, actual %u\n", + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + ocfs2_truncate_recs_per_inode(osb->sb), + le16_to_cpu(tl->tl_count)); + + /* Caller should have known to flush before calling us. */ + index = le16_to_cpu(tl->tl_used); + if (index >= tl_count) { + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index, + start_cluster, num_clusters); + if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { + /* + * Move index back to the record we are coalescing with. + * ocfs2_truncate_log_can_coalesce() guarantees nonzero + */ + index--; + + num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); + } else { + tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); + tl->tl_used = cpu_to_le16(index + 1); + } + tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); + + ocfs2_journal_dirty(handle, tl_bh); + + osb->truncated_clusters += num_clusters; +bail: + return status; +} + +static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, + handle_t *handle, + struct inode *data_alloc_inode, + struct buffer_head *data_alloc_bh) +{ + int status = 0; + int i; + unsigned int num_clusters; + u64 start_blk; + struct ocfs2_truncate_rec rec; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + i = le16_to_cpu(tl->tl_used) - 1; + while (i >= 0) { + /* Caller has given us at least enough credits to + * update the truncate log dinode */ + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tl->tl_used = cpu_to_le16(i); + + ocfs2_journal_dirty(handle, tl_bh); + + /* TODO: Perhaps we can calculate the bulk of the + * credits up front rather than extending like + * this. */ + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + rec = tl->tl_recs[i]; + start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, + le32_to_cpu(rec.t_start)); + num_clusters = le32_to_cpu(rec.t_clusters); + + /* if start_blk is not set, we ignore the record as + * invalid. */ + if (start_blk) { + trace_ocfs2_replay_truncate_records( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + i, le32_to_cpu(rec.t_start), num_clusters); + + status = ocfs2_free_clusters(handle, data_alloc_inode, + data_alloc_bh, start_blk, + num_clusters); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i--; + } + + osb->truncated_clusters = 0; + +bail: + return status; +} + +/* Expects you to already be holding tl_inode->i_mutex */ +int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + unsigned int num_to_flush; + handle_t *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *data_alloc_inode = NULL; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct buffer_head *data_alloc_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + num_to_flush = le16_to_cpu(tl->tl_used); + trace_ocfs2_flush_truncate_log( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + num_to_flush); + if (!num_to_flush) { + status = 0; + goto out; + } + + data_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!data_alloc_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto out; + } + + mutex_lock(&data_alloc_inode->i_mutex); + + status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_unlock; + } + + status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + data_alloc_bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +out_unlock: + brelse(data_alloc_bh); + ocfs2_inode_unlock(data_alloc_inode, 1); + +out_mutex: + mutex_unlock(&data_alloc_inode->i_mutex); + iput(data_alloc_inode); + +out: + return status; +} + +int ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + mutex_lock(&tl_inode->i_mutex); + status = __ocfs2_flush_truncate_log(osb); + mutex_unlock(&tl_inode->i_mutex); + + return status; +} + +static void ocfs2_truncate_log_worker(struct work_struct *work) +{ + int status; + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + osb_truncate_log_wq.work); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + else + ocfs2_init_steal_slots(osb); +} + +#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel) +{ + if (osb->osb_tl_inode && + atomic_read(&osb->osb_tl_disable) == 0) { + /* We want to push off log flushes while truncates are + * still running. */ + if (cancel) + cancel_delayed_work(&osb->osb_truncate_log_wq); + + queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); + } +} + +static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, + int slot_num, + struct inode **tl_inode, + struct buffer_head **tl_bh) +{ + int status; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, + TRUNCATE_LOG_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get load truncate log inode!\n"); + goto bail; + } + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + iput(inode); + mlog_errno(status); + goto bail; + } + + *tl_inode = inode; + *tl_bh = bh; +bail: + return status; +} + +/* called during the 1st stage of node recovery. we stamp a clean + * truncate log and pass back a copy for processing later. if the + * truncate log does not require processing, a *tl_copy is set to + * NULL. */ +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + *tl_copy = NULL; + + trace_ocfs2_begin_truncate_log_recovery(slot_num); + + status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's + * validated by the underlying call to ocfs2_read_inode_block(), + * so any corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + if (le16_to_cpu(tl->tl_used)) { + trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used)); + + *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + if (!(*tl_copy)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* Assuming the write-out below goes well, this copy + * will be passed back to recovery for processing. */ + memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); + + /* All we need to do to clear the truncate log is set + * tl_used. */ + tl->tl_used = 0; + + ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); + status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode)); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (tl_inode) + iput(tl_inode); + brelse(tl_bh); + + if (status < 0 && (*tl_copy)) { + kfree(*tl_copy); + *tl_copy = NULL; + mlog_errno(status); + } + + return status; +} + +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy) +{ + int status = 0; + int i; + unsigned int clusters, num_recs, start_cluster; + u64 start_blk; + handle_t *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_truncate_log *tl; + + if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { + mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); + return -EINVAL; + } + + tl = &tl_copy->id2.i_dealloc; + num_recs = le16_to_cpu(tl->tl_used); + trace_ocfs2_complete_truncate_log_recovery( + (unsigned long long)le64_to_cpu(tl_copy->i_blkno), + num_recs); + + mutex_lock(&tl_inode->i_mutex); + for(i = 0; i < num_recs; i++) { + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_up; + } + + clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); + start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); + start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); + + status = ocfs2_truncate_log_append(osb, handle, + start_blk, clusters); + ocfs2_commit_trans(osb, handle); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + +bail_up: + mutex_unlock(&tl_inode->i_mutex); + + return status; +} + +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + atomic_set(&osb->osb_tl_disable, 1); + + if (tl_inode) { + cancel_delayed_work(&osb->osb_truncate_log_wq); + flush_workqueue(ocfs2_wq); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + brelse(osb->osb_tl_bh); + iput(osb->osb_tl_inode); + } +} + +int ocfs2_truncate_log_init(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + + status = ocfs2_get_truncate_log_info(osb, + osb->slot_num, + &tl_inode, + &tl_bh); + if (status < 0) + mlog_errno(status); + + /* ocfs2_truncate_log_shutdown keys on the existence of + * osb->osb_tl_inode so we don't set any of the osb variables + * until we're sure all is well. */ + INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, + ocfs2_truncate_log_worker); + atomic_set(&osb->osb_tl_disable, 0); + osb->osb_tl_bh = tl_bh; + osb->osb_tl_inode = tl_inode; + + return status; +} + +/* + * Delayed de-allocation of suballocator blocks. + * + * Some sets of block de-allocations might involve multiple suballocator inodes. + * + * The locking for this can get extremely complicated, especially when + * the suballocator inodes to delete from aren't known until deep + * within an unrelated codepath. + * + * ocfs2_extent_block structures are a good example of this - an inode + * btree could have been grown by any number of nodes each allocating + * out of their own suballoc inode. + * + * These structures allow the delay of block de-allocation until a + * later time, when locking of multiple cluster inodes won't cause + * deadlock. + */ + +/* + * Describe a single bit freed from a suballocator. For the block + * suballocators, it represents one block. For the global cluster + * allocator, it represents some clusters and free_bit indicates + * clusters number. + */ +struct ocfs2_cached_block_free { + struct ocfs2_cached_block_free *free_next; + u64 free_bg; + u64 free_blk; + unsigned int free_bit; +}; + +struct ocfs2_per_slot_free_list { + struct ocfs2_per_slot_free_list *f_next_suballocator; + int f_inode_type; + int f_slot; + struct ocfs2_cached_block_free *f_first; +}; + +static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, + int sysfile_type, + int slot, + struct ocfs2_cached_block_free *head) +{ + int ret; + u64 bg_blkno; + handle_t *handle; + struct inode *inode; + struct buffer_head *di_bh = NULL; + struct ocfs2_cached_block_free *tmp; + + inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&inode->i_mutex); + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + while (head) { + if (head->free_bg) + bg_blkno = head->free_bg; + else + bg_blkno = ocfs2_which_suballoc_group(head->free_blk, + head->free_bit); + trace_ocfs2_free_cached_blocks( + (unsigned long long)head->free_blk, head->free_bit); + + ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, + head->free_bit, bg_blkno, 1); + if (ret) { + mlog_errno(ret); + goto out_journal; + } + + ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); + if (ret) { + mlog_errno(ret); + goto out_journal; + } + + tmp = head; + head = head->free_next; + kfree(tmp); + } + +out_journal: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out_mutex: + mutex_unlock(&inode->i_mutex); + iput(inode); +out: + while(head) { + /* Premature exit may have left some dangling items. */ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit) +{ + int ret = 0; + struct ocfs2_cached_block_free *item; + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit); + + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = ctxt->c_global_allocator; + + ctxt->c_global_allocator = item; + return ret; +} + +static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, + struct ocfs2_cached_block_free *head) +{ + struct ocfs2_cached_block_free *tmp; + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + int ret = 0; + + mutex_lock(&tl_inode->i_mutex); + + while (head) { + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, + head->free_bit); + + ocfs2_commit_trans(osb, handle); + tmp = head; + head = head->free_next; + kfree(tmp); + + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + mutex_unlock(&tl_inode->i_mutex); + + while (head) { + /* Premature exit may have left some dangling items. */ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + +int ocfs2_run_deallocs(struct ocfs2_super *osb, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + int ret = 0, ret2; + struct ocfs2_per_slot_free_list *fl; + + if (!ctxt) + return 0; + + while (ctxt->c_first_suballocator) { + fl = ctxt->c_first_suballocator; + + if (fl->f_first) { + trace_ocfs2_run_deallocs(fl->f_inode_type, + fl->f_slot); + ret2 = ocfs2_free_cached_blocks(osb, + fl->f_inode_type, + fl->f_slot, + fl->f_first); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + } + + ctxt->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + + if (ctxt->c_global_allocator) { + ret2 = ocfs2_free_cached_clusters(osb, + ctxt->c_global_allocator); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + + ctxt->c_global_allocator = NULL; + } + + return ret; +} + +static struct ocfs2_per_slot_free_list * +ocfs2_find_per_slot_free_list(int type, + int slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == slot) + return fl; + + fl = fl->f_next_suballocator; + } + + fl = kmalloc(sizeof(*fl), GFP_NOFS); + if (fl) { + fl->f_inode_type = type; + fl->f_slot = slot; + fl->f_first = NULL; + fl->f_next_suballocator = ctxt->c_first_suballocator; + + ctxt->c_first_suballocator = fl; + } + return fl; +} + +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 suballoc, + u64 blkno, unsigned int bit) +{ + int ret; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *item; + + fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); + if (fl == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + trace_ocfs2_cache_block_dealloc(type, slot, + (unsigned long long)suballoc, + (unsigned long long)blkno, bit); + + item->free_bg = suballoc; + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = fl->f_first; + + fl->f_first = item; + + ret = 0; +out: + return ret; +} + +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb) +{ + return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(eb->h_suballoc_slot), + le64_to_cpu(eb->h_suballoc_loc), + le64_to_cpu(eb->h_blkno), + le16_to_cpu(eb->h_suballoc_bit)); +} + +static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return 0; +} + +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys) +{ + int ret, partial = 0; + + ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); + if (ret) + mlog_errno(ret); + + if (zero) + zero_user_segment(page, from, to); + + /* + * Need to set the buffers we zero'd into uptodate + * here if they aren't - ocfs2_map_page_blocks() + * might've skipped some + */ + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_zero_func); + if (ret < 0) + mlog_errno(ret); + else if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + } + + if (!partial) + SetPageUptodate(page); + + flush_dcache_page(page); +} + +static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, + loff_t end, struct page **pages, + int numpages, u64 phys, handle_t *handle) +{ + int i; + struct page *page; + unsigned int from, to = PAGE_CACHE_SIZE; + struct super_block *sb = inode->i_sb; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + if (numpages == 0) + goto out; + + to = PAGE_CACHE_SIZE; + for(i = 0; i < numpages; i++) { + page = pages[i]; + + from = start & (PAGE_CACHE_SIZE - 1); + if ((end >> PAGE_CACHE_SHIFT) == page->index) + to = end & (PAGE_CACHE_SIZE - 1); + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + + ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, + &phys); + + start = (page->index + 1) << PAGE_CACHE_SHIFT; + } +out: + if (pages) + ocfs2_unlock_and_free_pages(pages, numpages); +} + +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + int numpages, ret = 0; + struct address_space *mapping = inode->i_mapping; + unsigned long index; + loff_t last_page_bytes; + + BUG_ON(start > end); + + numpages = 0; + last_page_bytes = PAGE_ALIGN(end); + index = start >> PAGE_CACHE_SHIFT; + do { + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); + if (!pages[numpages]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + numpages++; + index++; + } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); + +out: + if (ret != 0) { + if (pages) + ocfs2_unlock_and_free_pages(pages, numpages); + numpages = 0; + } + + *num = numpages; + + return ret; +} + +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + struct super_block *sb = inode->i_sb; + + BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != + (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); + + return ocfs2_grab_pages(inode, start, end, pages, num); +} + +/* + * Zero the area past i_size but still within an allocated + * cluster. This avoids exposing nonzero data on subsequent file + * extends. + * + * We need to call this before i_size is updated on the inode because + * otherwise block_write_full_page() will skip writeout of pages past + * i_size. The new_i_size parameter is passed for this reason. + */ +int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, + u64 range_start, u64 range_end) +{ + int ret = 0, numpages; + struct page **pages = NULL; + u64 phys; + unsigned int ext_flags; + struct super_block *sb = inode->i_sb; + + /* + * File systems which don't support sparse files zero on every + * extend. + */ + if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) + return 0; + + pages = kcalloc(ocfs2_pages_per_cluster(sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + if (range_start == range_end) + goto out; + + ret = ocfs2_extent_map_get_blocks(inode, + range_start >> sb->s_blocksize_bits, + &phys, NULL, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Tail is a hole, or is marked unwritten. In either case, we + * can count on read and write to return/push zero's. + */ + if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) + goto out; + + ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, + &numpages); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, + numpages, phys, handle); + + /* + * Initiate writeout of the pages we zero'd here. We don't + * wait on them - the truncate_inode_pages() call later will + * do that for us. + */ + ret = filemap_fdatawrite_range(inode->i_mapping, range_start, + range_end - 1); + if (ret) + mlog_errno(ret); + +out: + kfree(pages); + + return ret; +} + +static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, + struct ocfs2_dinode *di) +{ + unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2) - + xattrsize); + else + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2)); +} + +void ocfs2_dinode_new_extent_list(struct inode *inode, + struct ocfs2_dinode *di) +{ + ocfs2_zero_dinode_id2_with_xattr(inode, di); + di->id2.i_list.l_tree_depth = 0; + di->id2.i_list.l_next_free_rec = 0; + di->id2.i_list.l_count = cpu_to_le16( + ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); +} + +void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_inline_data *idata = &di->id2.i_data; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + /* + * We clear the entire i_data structure here so that all + * fields can be properly initialized. + */ + ocfs2_zero_dinode_id2_with_xattr(inode, di); + + idata->id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); +} + +int ocfs2_convert_inline_data_to_extents(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret, i, has_data, num_pages = 0; + int need_free = 0; + u32 bit_off, num; + handle_t *handle; + u64 uninitialized_var(block); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_alloc_context *data_ac = NULL; + struct page **pages = NULL; + loff_t end = osb->s_clustersize; + struct ocfs2_extent_tree et; + int did_quota = 0; + + has_data = i_size_read(inode) ? 1 : 0; + + if (has_data) { + pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto free_pages; + } + } + + handle = ocfs2_start_trans(osb, + ocfs2_inline_to_extents_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (has_data) { + unsigned int page_end; + u64 phys; + + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, + &num); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Save two copies, one for insert, and one that can + * be changed by ocfs2_map_and_dirty_page() below. + */ + block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + /* + * Non sparse file systems zero on extend, so no need + * to do that now. + */ + if (!ocfs2_sparse_alloc(osb) && + PAGE_CACHE_SIZE < osb->s_clustersize) + end = PAGE_CACHE_SIZE; + + ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); + if (ret) { + mlog_errno(ret); + need_free = 1; + goto out_commit; + } + + /* + * This should populate the 1st page for us and mark + * it up to date. + */ + ret = ocfs2_read_inline_data(inode, pages[0], di_bh); + if (ret) { + mlog_errno(ret); + need_free = 1; + goto out_unlock; + } + + page_end = PAGE_CACHE_SIZE; + if (PAGE_CACHE_SIZE > osb->s_clustersize) + page_end = osb->s_clustersize; + + for (i = 0; i < num_pages; i++) + ocfs2_map_and_dirty_page(inode, handle, 0, page_end, + pages[i], i > 0, &phys); + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_update_inode_fsync_trans(handle, inode, 1); + ocfs2_dinode_new_extent_list(inode, di); + + ocfs2_journal_dirty(handle, di_bh); + + if (has_data) { + /* + * An error at this point should be extremely rare. If + * this proves to be false, we could always re-build + * the in-inode data from our pages. + */ + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); + if (ret) { + mlog_errno(ret); + need_free = 1; + goto out_unlock; + } + + inode->i_blocks = ocfs2_inode_sector_count(inode); + } + +out_unlock: + if (pages) + ocfs2_unlock_and_free_pages(pages, num_pages); + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num); + } + + ocfs2_commit_trans(osb, handle); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); +free_pages: + kfree(pages); + return ret; +} + +/* + * It is expected, that by the time you call this function, + * inode->i_size and fe->i_size have been adjusted. + * + * WARNING: This will kfree the truncate context + */ +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *di_bh) +{ + int status = 0, i, flags = 0; + u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff; + u64 blkno = 0; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct ocfs2_path *path = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *root_el = &(di->id2.i_list); + u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); + struct ocfs2_extent_tree et; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree = NULL; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + + new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + + path = ocfs2_new_path(di_bh, &di->id2.i_list, + ocfs2_journal_access_di); + if (!path) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ocfs2_extent_map_trunc(inode, new_highest_cpos); + +start: + /* + * Check that we still have allocation to delete. + */ + if (OCFS2_I(inode)->ip_clusters == 0) { + status = 0; + goto bail; + } + + /* + * Truncate always works against the rightmost tree branch. + */ + status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX); + if (status) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_commit_truncate( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + new_highest_cpos, + OCFS2_I(inode)->ip_clusters, + path->p_tree_depth); + + /* + * By now, el will point to the extent list on the bottom most + * portion of this tree. Only the tail record is considered in + * each pass. + * + * We handle the following cases, in order: + * - empty extent: delete the remaining branch + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (truncate has completed) + */ + el = path_leaf_el(path); + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent block at %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)path_leaf_bh(path)->b_blocknr); + status = -EROFS; + goto bail; + } + + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + flags = rec->e_flags; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (i == 0 && ocfs2_is_empty_extent(rec)) { + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. + */ + if (root_el->l_tree_depth && rec->e_int_clusters == 0) { + ocfs2_error(inode->i_sb, "Inode %lu has an empty " + "extent record, depth %u\n", inode->i_ino, + le16_to_cpu(root_el->l_tree_depth)); + status = -EROFS; + goto bail; + } + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; + } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { + /* + * Truncate entire record. + */ + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = ocfs2_rec_clusters(el, rec); + blkno = le64_to_cpu(rec->e_blkno); + } else if (range > new_highest_cpos) { + /* + * Partial truncate. it also should be + * the last truncate we're doing. + */ + trunc_cpos = new_highest_cpos; + trunc_len = range - new_highest_cpos; + coff = new_highest_cpos - le32_to_cpu(rec->e_cpos); + blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); + } else { + /* + * Truncate completed, leave happily. + */ + status = 0; + goto bail; + } + + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) { + status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, &dealloc, + refcount_loc, true); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + + /* + * The check above will catch the case where we've truncated + * away all allocation. + */ + goto start; + +bail: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + ocfs2_schedule_truncate_log_flush(osb, 1); + + ocfs2_run_deallocs(osb, &dealloc); + + ocfs2_free_path(path); + + return status; +} + +/* + * 'start' is inclusive, 'end' is not. + */ +int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, + unsigned int start, unsigned int end, int trunc) +{ + int ret; + unsigned int numbytes; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inline_data *idata = &di->id2.i_data; + + if (end > i_size_read(inode)) + end = i_size_read(inode); + + BUG_ON(start > end); + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || + !ocfs2_supports_inline_data(osb)) { + ocfs2_error(inode->i_sb, + "Inline data flags for inode %llu don't agree! " + "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + le16_to_cpu(di->i_dyn_features), + OCFS2_I(inode)->ip_dyn_features, + osb->s_feature_incompat); + ret = -EROFS; + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + numbytes = end - start; + memset(idata->id_data + start, 0, numbytes); + + /* + * No need to worry about the data page here - it's been + * truncated already and inline data doesn't need it for + * pushing zero's to disk, so we'll let readpage pick it up + * later. + */ + if (trunc) { + i_size_write(inode, start); + di->i_size = cpu_to_le64(start); + } + + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_update_inode_fsync_trans(handle, inode, 1); + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + return ret; +} + +static int ocfs2_trim_extent(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 count) +{ + u64 discard, bcount; + + bcount = ocfs2_clusters_to_blocks(sb, count); + discard = le64_to_cpu(gd->bg_blkno) + + ocfs2_clusters_to_blocks(sb, start); + + trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); + + return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0); +} + +static int ocfs2_trim_group(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 max, u32 minbits) +{ + int ret = 0, count = 0, next; + void *bitmap = gd->bg_bitmap; + + if (le16_to_cpu(gd->bg_free_bits_count) < minbits) + return 0; + + trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno), + start, max, minbits); + + while (start < max) { + start = ocfs2_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = ocfs2_find_next_bit(bitmap, max, start); + + if ((next - start) >= minbits) { + ret = ocfs2_trim_extent(sb, gd, + start, next - start); + if (ret < 0) { + mlog_errno(ret); + break; + } + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits) + break; + } + + if (ret < 0) + count = ret; + + return count; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 start, len, trimmed, first_group, last_group, group; + int ret, cnt; + u32 first_bit, last_bit, minlen; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_dinode *main_bm; + struct ocfs2_group_desc *gd = NULL; + + start = range->start >> osb->s_clustersize_bits; + len = range->len >> osb->s_clustersize_bits; + minlen = range->minlen >> osb->s_clustersize_bits; + + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) + return -EINVAL; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; + goto out_unlock; + } + + len = range->len >> osb->s_clustersize_bits; + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; + + trace_ocfs2_trim_fs(start, len, minlen); + + /* Determine first and last group to examine based on start and len */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); + last_bit = osb->bitmap_cpg; + + trimmed = 0; + for (group = first_group; group <= last_group;) { + if (first_bit + len >= osb->bitmap_cpg) + last_bit = osb->bitmap_cpg; + else + last_bit = first_bit + len; + + ret = ocfs2_read_group_descriptor(main_bm_inode, + main_bm, group, + &gd_bh); + if (ret < 0) { + mlog_errno(ret); + break; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); + brelse(gd_bh); + gd_bh = NULL; + if (cnt < 0) { + ret = cnt; + mlog_errno(ret); + break; + } + + trimmed += cnt; + len -= osb->bitmap_cpg - first_bit; + first_bit = 0; + if (group == osb->first_cluster_group_blkno) + group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + else + group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + } + range->len = trimmed * sb->s_blocksize; +out_unlock: + ocfs2_inode_unlock(main_bm_inode, 0); + brelse(main_bm_bh); +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); +out: + return ret; +} diff --git a/kernel/fs/ocfs2/alloc.h b/kernel/fs/ocfs2/alloc.h new file mode 100644 index 000000000..fb09b97db --- /dev/null +++ b/kernel/fs/ocfs2/alloc.h @@ -0,0 +1,324 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_ALLOC_H +#define OCFS2_ALLOC_H + + +/* + * For xattr tree leaf, we limit the leaf byte size to be 64K. + */ +#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536 + +/* + * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract + * the b-tree operations in ocfs2. Now all the b-tree operations are not + * limited to ocfs2_dinode only. Any data which need to allocate clusters + * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree + * and operation. + * + * ocfs2_extent_tree becomes the first-class object for extent tree + * manipulation. Callers of the alloc.c code need to fill it via one of + * the ocfs2_init_*_extent_tree() operations below. + * + * ocfs2_extent_tree contains info for the root of the b-tree, it must have a + * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree + * functions. It needs the ocfs2_caching_info structure associated with + * I/O on the tree. With metadata ecc, we now call different journal_access + * functions for each type of metadata, so it must have the + * root_journal_access function. + * ocfs2_extent_tree_operations abstract the normal operations we do for + * the root of extent b-tree. + */ +struct ocfs2_extent_tree_operations; +struct ocfs2_extent_tree { + struct ocfs2_extent_tree_operations *et_ops; + struct buffer_head *et_root_bh; + struct ocfs2_extent_list *et_root_el; + struct ocfs2_caching_info *et_ci; + ocfs2_journal_access_func et_root_journal_access; + void *et_object; + unsigned int et_max_leaf_clusters; +}; + +/* + * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the + * specified object buffer. + */ +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +struct ocfs2_xattr_value_buf; +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct ocfs2_xattr_value_buf *vb); +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); + +/* + * Read an extent block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The extent block will be validated + * with ocfs2_validate_extent_block(). + */ +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, + struct buffer_head **bh); + +struct ocfs2_alloc_context; +int ocfs2_insert_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, + u64 start_blk, + u32 new_clusters, + u8 flags, + struct ocfs2_alloc_context *meta_ac); + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); +struct ocfs2_cached_dealloc_ctxt; +struct ocfs2_path; +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags); +int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc, bool refcount_tree_locked); + +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et); + +/* + * how many new metadata chunks would an allocation need at maximum? + * + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el) +{ + /* + * Rather than do all the work of determining how much we need + * (involves a ton of reads and locks), just ask for the + * maximal limit. That's a tree depth shift. So, one block for + * level of the tree (current l_tree_depth), one block for the + * new tree_depth==0 extent_block, and one block at the new + * top-of-the tree. + */ + return le16_to_cpu(root_el->l_tree_depth) + 2; +} + +void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); +void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di); +int ocfs2_convert_inline_data_to_extents(struct inode *inode, + struct buffer_head *di_bh); + +int ocfs2_truncate_log_init(struct ocfs2_super *osb); +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel); +int ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy); +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy); +int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb); +int ocfs2_truncate_log_append(struct ocfs2_super *osb, + handle_t *handle, + u64 start_blk, + unsigned int num_clusters); +int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); + +/* + * Process local structure which describes the block unlinks done + * during an operation. This is populated via + * ocfs2_cache_block_dealloc(). + * + * ocfs2_run_deallocs() should be called after the potentially + * de-allocating routines. No journal handles should be open, and most + * locks should have been dropped. + */ +struct ocfs2_cached_dealloc_ctxt { + struct ocfs2_per_slot_free_list *c_first_suballocator; + struct ocfs2_cached_block_free *c_global_allocator; +}; +static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) +{ + c->c_first_suballocator = NULL; + c->c_global_allocator = NULL; +} +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit); +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 suballoc, u64 blkno, + unsigned int bit); +static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) +{ + return c->c_global_allocator != NULL; +} +int ocfs2_run_deallocs(struct ocfs2_super *osb, + struct ocfs2_cached_dealloc_ctxt *ctxt); + +struct ocfs2_truncate_context { + struct ocfs2_cached_dealloc_ctxt tc_dealloc; + int tc_ext_alloc_locked; /* is it cluster locked? */ + /* these get destroyed once it's passed to ocfs2_commit_truncate. */ + struct buffer_head *tc_last_eb_bh; +}; + +int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, + u64 range_start, u64 range_end); +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *di_bh); +int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, + unsigned int start, unsigned int end, int trunc); + +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh); +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range); +/* + * Helper function to look at the # of clusters in an extent record. + */ +static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec) +{ + /* + * Cluster count in extent records is slightly different + * between interior nodes and leaf nodes. This is to support + * unwritten extents which need a flags field in leaf node + * records, thus shrinking the available space for a clusters + * field. + */ + if (el->l_tree_depth) + return le32_to_cpu(rec->e_int_clusters); + else + return le16_to_cpu(rec->e_leaf_clusters); +} + +/* + * This is only valid for leaf nodes, which are the only ones that can + * have empty extents anyway. + */ +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) +{ + return !rec->e_leaf_clusters; +} + +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num); +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; + +#define OCFS2_MAX_PATH_DEPTH 5 + +struct ocfs2_path { + int p_tree_depth; + ocfs2_journal_access_func p_root_access; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; + +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_root_access(_path)((_path)->p_root_access) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) + +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root); +void ocfs2_free_path(struct ocfs2_path *path); +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + u32 cpos); +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path); +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et); +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx); +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path); +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right); +#endif /* OCFS2_ALLOC_H */ diff --git a/kernel/fs/ocfs2/aops.c b/kernel/fs/ocfs2/aops.c new file mode 100644 index 000000000..f906a250d --- /dev/null +++ b/kernel/fs/ocfs2/aops.c @@ -0,0 +1,2448 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" +#include "dir.h" +#include "namei.h" +#include "sysfile.h" + +static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int status; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *buffer_cache_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + void *kaddr; + + trace_ocfs2_symlink_get_block( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); + + BUG_ON(ocfs2_inode_is_fast_symlink(inode)); + + if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { + mlog(ML_ERROR, "block offset > PATH_MAX: %llu", + (unsigned long long)iblock); + goto bail; + } + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + fe = (struct ocfs2_dinode *) bh->b_data; + + if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(fe->i_clusters))) { + err = -ENOMEM; + mlog(ML_ERROR, "block offset is outside the allocated size: " + "%llu\n", (unsigned long long)iblock); + goto bail; + } + + /* We don't use the page cache to create symlink data, so if + * need be, copy it over from the buffer cache. */ + if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { + u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + + iblock; + buffer_cache_bh = sb_getblk(osb->sb, blkno); + if (!buffer_cache_bh) { + err = -ENOMEM; + mlog(ML_ERROR, "couldn't getblock for symlink!\n"); + goto bail; + } + + /* we haven't locked out transactions, so a commit + * could've happened. Since we've got a reference on + * the bh, even if it commits while we're doing the + * copy, the data is still good. */ + if (buffer_jbd(buffer_cache_bh) + && ocfs2_inode_is_new(inode)) { + kaddr = kmap_atomic(bh_result->b_page); + if (!kaddr) { + mlog(ML_ERROR, "couldn't kmap!\n"); + goto bail; + } + memcpy(kaddr + (bh_result->b_size * iblock), + buffer_cache_bh->b_data, + bh_result->b_size); + kunmap_atomic(kaddr); + set_buffer_uptodate(bh_result); + } + brelse(buffer_cache_bh); + } + + map_bh(bh_result, inode->i_sb, + le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); + + err = 0; + +bail: + brelse(bh); + + return err; +} + +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = 0; + unsigned int ext_flags; + u64 max_blocks = bh_result->b_size >> inode->i_blkbits; + u64 p_blkno, count, past_eof; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", + inode, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) { + /* this always does I/O for some reason. */ + err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); + goto bail; + } + + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, + &ext_flags); + if (err) { + mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " + "%llu, NULL)\n", err, inode, (unsigned long long)iblock, + (unsigned long long)p_blkno); + goto bail; + } + + if (max_blocks < count) + count = max_blocks; + + /* + * ocfs2 never allocates in this function - the only time we + * need to use BH_New is when we're extending i_size on a file + * system which doesn't support holes, in which case BH_New + * allows __block_write_begin() to zero. + * + * If we see this on a sparse file system, then a truncate has + * raced us and removed the cluster. In this case, we clear + * the buffers dirty and uptodate bits and let the buffer code + * ignore it as a hole. + */ + if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { + clear_buffer_dirty(bh_result); + clear_buffer_uptodate(bh_result); + goto bail; + } + + /* Treat the unwritten extent as a hole for zeroing purposes. */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + + bh_result->b_size = count << inode->i_blkbits; + + if (!ocfs2_sparse_alloc(osb)) { + if (p_blkno == 0) { + err = -EIO; + mlog(ML_ERROR, + "iblock = %llu p_blkno = %llu blkno=(%llu)\n", + (unsigned long long)iblock, + (unsigned long long)p_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); + dump_stack(); + goto bail; + } + } + + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + + trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)past_eof); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + +bail: + if (err < 0) + err = -EIO; + + return err; +} + +int ocfs2_read_inline_data(struct inode *inode, struct page *page, + struct buffer_head *di_bh) +{ + void *kaddr; + loff_t size; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + return -EROFS; + } + + size = i_size_read(inode); + + if (size > PAGE_CACHE_SIZE || + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { + ocfs2_error(inode->i_sb, + "Inode %llu has with inline data has bad size: %Lu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)size); + return -EROFS; + } + + kaddr = kmap_atomic(page); + if (size) + memcpy(kaddr, di->id2.i_data.id_data, size); + /* Clear the remaining part of the page */ + memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); + flush_dcache_page(page); + kunmap_atomic(kaddr); + + SetPageUptodate(page); + + return 0; +} + +static int ocfs2_readpage_inline(struct inode *inode, struct page *page) +{ + int ret; + struct buffer_head *di_bh = NULL; + + BUG_ON(!PageLocked(page)); + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_inline_data(inode, page, di_bh); +out: + unlock_page(page); + + brelse(di_bh); + return ret; +} + +static int ocfs2_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; + int ret, unlock = 1; + + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, + (page ? page->index : 0)); + + ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out; + } + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + /* + * Unlock the page and cycle ip_alloc_sem so that we don't + * busyloop waiting for ip_alloc_sem to unlock + */ + ret = AOP_TRUNCATED_PAGE; + unlock_page(page); + unlock = 0; + down_read(&oi->ip_alloc_sem); + up_read(&oi->ip_alloc_sem); + goto out_inode_unlock; + } + + /* + * i_size might have just been updated as we grabed the meta lock. We + * might now be discovering a truncate that hit on another node. + * block_read_full_page->get_block freaks out if it is asked to read + * beyond the end of a file, so we check here. Callers + * (generic_file_read, vm_ops->fault) are clever enough to check i_size + * and notice that the page they just read isn't needed. + * + * XXX sys_readahead() seems to get that wrong? + */ + if (start >= i_size_read(inode)) { + zero_user(page, 0, PAGE_SIZE); + SetPageUptodate(page); + ret = 0; + goto out_alloc; + } + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + ret = ocfs2_readpage_inline(inode, page); + else + ret = block_read_full_page(page, ocfs2_get_block); + unlock = 0; + +out_alloc: + up_read(&OCFS2_I(inode)->ip_alloc_sem); +out_inode_unlock: + ocfs2_inode_unlock(inode, 0); +out: + if (unlock) + unlock_page(page); + return ret; +} + +/* + * This is used only for read-ahead. Failures or difficult to handle + * situations are safe to ignore. + * + * Right now, we don't bother with BH_Boundary - in-inode extent lists + * are quite large (243 extents on 4k blocks), so most inodes don't + * grow out to a tree. If need be, detecting boundary extents could + * trivially be added in a future version of ocfs2_get_block(). + */ +static int ocfs2_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret, err = -EIO; + struct inode *inode = mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start; + struct page *last; + + /* + * Use the nonblocking flag for the dlm code to avoid page + * lock inversion, but don't bother with retrying. + */ + ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); + if (ret) + return err; + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ocfs2_inode_unlock(inode, 0); + return err; + } + + /* + * Don't bother with inline-data. There isn't anything + * to read-ahead in that case anyway... + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto out_unlock; + + /* + * Check whether a remote node truncated this file - we just + * drop out in that case as it's not worth handling here. + */ + last = list_entry(pages->prev, struct page, lru); + start = (loff_t)last->index << PAGE_CACHE_SHIFT; + if (start >= i_size_read(inode)) + goto out_unlock; + + err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + +out_unlock: + up_read(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 0); + + return err; +} + +/* Note: Because we don't support holes, our allocation has + * already happened (allocation writes zeros to the file data) + * so we don't have to worry about ordered writes in + * ocfs2_writepage. + * + * ->writepage is called during the process of invalidating the page cache + * during blocked lock processing. It can't block on any cluster locks + * to during block mapping. It's relying on the fact that the block + * mapping can't have disappeared under the dirty pages that it is + * being asked to write back. + */ +static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + trace_ocfs2_writepage( + (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, + page->index); + + return block_write_full_page(page, ocfs2_get_block, wbc); +} + +/* Taken from ext3. We don't necessarily need the full blown + * functionality yet, but IMHO it's better to cut and paste the whole + * thing so we can avoid introducing our own bugs (and easily pick up + * their fixes when they happen) --Mark */ +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) +{ + sector_t status; + u64 p_blkno = 0; + int err = 0; + struct inode *inode = mapping->host; + + trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)block); + + /* We don't need to lock journal system files, since they aren't + * accessed concurrently from multiple nodes. + */ + if (!INODE_JOURNAL(inode)) { + err = ocfs2_inode_lock(inode, NULL, 0); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); + + if (!INODE_JOURNAL(inode)) { + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_inode_unlock(inode, 0); + } + + if (err) { + mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", + (unsigned long long)block); + mlog_errno(err); + goto bail; + } + +bail: + status = err ? 0 : p_blkno; + + return status; +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. + * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + */ +static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + u32 cpos = 0; + int alloc_locked = 0; + u64 p_blkno, inode_blocks, contig_blocks; + unsigned int ext_flags; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + unsigned long len = bh_result->b_size; + unsigned int clusters_to_alloc = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); + + /* This function won't even be called if the request isn't all + * nicely aligned and of the right size, so there's no need + * for us to check any of that. */ + + inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + + /* This figures out the size of the next contiguous block, and + * our logical offset */ + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + + /* We should already CoW the refcounted extent in case of create. */ + BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + + /* allocate blocks if no p_blkno is found, and create == 1 */ + if (!p_blkno && create) { + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + alloc_locked = 1; + + /* fill hole, allocate blocks can't be larger than the size + * of the hole */ + clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); + if (clusters_to_alloc > contig_blocks) + clusters_to_alloc = contig_blocks; + + /* allocate extent and insert them into the extent tree */ + ret = ocfs2_extend_allocation(inode, cpos, + clusters_to_alloc, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret < 0) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + } + + /* + * get_more_blocks() expects us to describe a hole by clearing + * the mapped bit on bh_result(). + * + * Consider an unwritten extent as a hole. + */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + else + clear_buffer_mapped(bh_result); + + /* make sure we don't map more than max_blocks blocks here as + that's all the kernel will handle at this point. */ + if (max_blocks < contig_blocks) + contig_blocks = max_blocks; + bh_result->b_size = contig_blocks << blocksize_bits; +bail: + if (alloc_locked) + ocfs2_inode_unlock(inode, 1); + return ret; +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. We use the rw_lock DLM lock + * to protect io on one node from truncation on another. + */ +static void ocfs2_dio_end_io(struct kiocb *iocb, + loff_t offset, + ssize_t bytes, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + int level; + + /* this io's submitter should not have unlocked this before we could */ + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + + if (ocfs2_iocb_is_sem_locked(iocb)) + ocfs2_iocb_clear_sem_locked(iocb); + + if (ocfs2_iocb_is_unaligned_aio(iocb)) { + ocfs2_iocb_clear_unaligned_aio(iocb); + + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); + } + + ocfs2_iocb_clear_rw_locked(iocb); + + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); +} + +static int ocfs2_releasepage(struct page *page, gfp_t wait) +{ + if (!page_has_buffers(page)) + return 0; + return try_to_free_buffers(page); +} + +static int ocfs2_is_overwrite(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + int ret = 0; + u32 v_cpos = 0; + u32 p_cpos = 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + return 1; + + return 0; +} + +static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, + struct inode *inode, loff_t offset, + u64 zero_len, int cluster_align) +{ + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + int ret = 0; + + if (offset <= i_size_read(inode) || cluster_align) + return 0; + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, + &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + u64 s = i_size_read(inode); + sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) + + (do_div(s, osb->s_clustersize) >> 9); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, + zero_len >> 9, GFP_NOFS, false); + if (ret < 0) + mlog_errno(ret); + } + + return ret; +} + +static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + u64 zero_start, zero_len, total_zero_len; + u32 p_cpos = 0, clusters_to_add; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + u32 size_div, offset_div; + int ret = 0; + + { + u64 o = offset; + u64 s = i_size_read(inode); + + offset_div = do_div(o, osb->s_clustersize); + size_div = do_div(s, osb->s_clustersize); + } + + if (offset <= i_size_read(inode)) + return 0; + + clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - + ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); + total_zero_len = offset - i_size_read(inode); + if (clusters_to_add) + total_zero_len -= offset_div; + + /* Allocate clusters to fill out holes, and this is only needed + * when we add more than one clusters. Otherwise the cluster will + * be allocated during direct IO */ + if (clusters_to_add > 1) { + ret = ocfs2_extend_allocation(inode, + OCFS2_I(inode)->ip_clusters, + clusters_to_add - 1, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + while (total_zero_len) { + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, + &ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + + size_div; + zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - + size_div; + zero_len = min(total_zero_len, zero_len); + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + zero_start >> 9, zero_len >> 9, + GFP_NOFS, false); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + total_zero_len -= zero_len; + v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); + + /* Only at first iteration can be cluster not aligned. + * So set size_div to 0 for the rest */ + size_div = 0; + } + +out: + return ret; +} + +static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset) +{ + ssize_t ret = 0; + ssize_t written = 0; + bool orphaned = false; + int is_overwrite = 0; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + size_t count = iter->count; + journal_t *journal = osb->journal->j_journal; + u64 zero_len_head, zero_len_tail; + int cluster_align_head, cluster_align_tail; + loff_t final_size = offset + count; + int append_write = offset >= i_size_read(inode) ? 1 : 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + { + u64 o = offset; + u64 s = i_size_read(inode); + + zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align_head = !zero_len_head; + + zero_len_tail = osb->s_clustersize - + do_div(s, osb->s_clustersize); + if ((offset - i_size_read(inode)) < zero_len_tail) + zero_len_tail = offset - i_size_read(inode); + cluster_align_tail = !zero_len_tail; + } + + /* + * when final_size > inode->i_size, inode->i_size will be + * updated after direct write, so add the inode to orphan + * dir first. + */ + if (final_size > i_size_read(inode)) { + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + orphaned = true; + } + + if (append_write) { + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + /* zeroing out the previously allocated cluster tail + * that but not zeroed */ + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, + zero_len_tail, cluster_align_tail); + else + ret = ocfs2_direct_IO_extend_no_holes(osb, inode, + offset); + if (ret < 0) { + mlog_errno(ret); + ocfs2_inode_unlock(inode, 1); + goto clean_orphan; + } + + is_overwrite = ocfs2_is_overwrite(osb, inode, offset); + if (is_overwrite < 0) { + mlog_errno(is_overwrite); + ocfs2_inode_unlock(inode, 1); + goto clean_orphan; + } + + ocfs2_inode_unlock(inode, 1); + } + + written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + offset, ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); + if (unlikely(written < 0)) { + loff_t i_size = i_size_read(inode); + + if (offset + count > i_size) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (i_size == i_size_read(inode)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + ret = jbd2_journal_force_commit(journal); + if (ret < 0) + mlog_errno(ret); + } + } else if (written > 0 && append_write && !is_overwrite && + !cluster_align_head) { + /* zeroing out the allocated cluster head */ + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + ocfs2_inode_unlock(inode, 0); + goto clean_orphan; + } + + BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + p_cpos << (osb->s_clustersize_bits - 9), + zero_len_head >> 9, GFP_NOFS, false); + if (ret < 0) + mlog_errno(ret); + + ocfs2_inode_unlock(inode, 0); + } + +clean_orphan: + if (orphaned) { + int tmp_ret; + int update_isize = written > 0 ? 1 : 0; + loff_t end = update_isize ? offset + written : 0; + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + update_isize, end); + if (tmp_ret < 0) { + ret = tmp_ret; + goto out; + } + + tmp_ret = jbd2_journal_force_commit(journal); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(tmp_ret); + } + } + +out: + if (ret >= 0) + ret = written; + return ret; +} + +static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); + + /* + * Fallback to buffered I/O if we see an inode without + * extents. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + /* Fallback to buffered I/O if we are appending and + * concurrent O_DIRECT writes are allowed. + */ + if (i_size_read(inode) <= offset && !full_coherency) + return 0; + + if (iov_iter_rw(iter) == READ) + return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, offset, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); + else + return ocfs2_direct_IO_write(iocb, iter, offset); +} + +static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, + u32 cpos, + unsigned int *start, + unsigned int *end) +{ + unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + unsigned int cpp; + + cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + + cluster_start = cpos % cpp; + cluster_start = cluster_start << osb->s_clustersize_bits; + + cluster_end = cluster_start + osb->s_clustersize; + } + + BUG_ON(cluster_start > PAGE_SIZE); + BUG_ON(cluster_end > PAGE_SIZE); + + if (start) + *start = cluster_start; + if (end) + *end = cluster_end; +} + +/* + * 'from' and 'to' are the region in the page to avoid zeroing. + * + * If pagesize > clustersize, this function will avoid zeroing outside + * of the cluster boundary. + * + * from == to == 0 is code for "zero the entire cluster region" + */ +static void ocfs2_clear_page_regions(struct page *page, + struct ocfs2_super *osb, u32 cpos, + unsigned from, unsigned to) +{ + void *kaddr; + unsigned int cluster_start, cluster_end; + + ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); + + kaddr = kmap_atomic(page); + + if (from || to) { + if (from > cluster_start) + memset(kaddr + cluster_start, 0, from - cluster_start); + if (to < cluster_end) + memset(kaddr + to, 0, cluster_end - to); + } else { + memset(kaddr + cluster_start, 0, cluster_end - cluster_start); + } + + kunmap_atomic(kaddr); +} + +/* + * Nonsparse file systems fully allocate before we get to the write + * code. This prevents ocfs2_write() from tagging the write as an + * allocating one, which means ocfs2_map_page_blocks() might try to + * read-in the blocks at the tail of our file. Avoid reading them by + * testing i_size against each block offset. + */ +static int ocfs2_should_read_blk(struct inode *inode, struct page *page, + unsigned int block_start) +{ + u64 offset = page_offset(page) + block_start; + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + return 1; + + if (i_size_read(inode) > offset) + return 1; + + return 0; +} + +/* + * Some of this taken from __block_write_begin(). We already have our + * mapping by now though, and the entire write will be allocating or + * it won't, so not much need to use BH_New. + * + * This will also skip zeroing, which is handled externally. + */ +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new) +{ + int ret = 0; + struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; + unsigned int block_end, block_start; + unsigned int bsize = 1 << inode->i_blkbits; + + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + head = page_buffers(page); + for (bh = head, block_start = 0; bh != head || !block_start; + bh = bh->b_this_page, block_start += bsize) { + block_end = block_start + bsize; + + clear_buffer_new(bh); + + /* + * Ignore blocks outside of our i/o range - + * they may belong to unallocated clusters. + */ + if (block_start >= to || block_end <= from) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + continue; + } + + /* + * For an allocating write with cluster size >= page + * size, we always write the entire page. + */ + if (new) + set_buffer_new(bh); + + if (!buffer_mapped(bh)) { + map_bh(bh, inode->i_sb, *p_blkno); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_new(bh) && + ocfs2_should_read_blk(inode, page, block_start) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + + *p_blkno = *p_blkno + 1; + } + + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + ret = -EIO; + } + + if (ret == 0 || !new) + return ret; + + /* + * If we get -EIO above, zero out any newly allocated blocks + * to avoid exposing stale data. + */ + bh = head; + block_start = 0; + do { + block_end = block_start + bsize; + if (block_end <= from) + goto next_bh; + if (block_start >= to) + break; + + zero_user(page, block_start, bh->b_size); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + +next_bh: + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} + +#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#define OCFS2_MAX_CTXT_PAGES 1 +#else +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#endif + +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) + +/* + * Describe the state of a single cluster to be written to. + */ +struct ocfs2_write_cluster_desc { + u32 c_cpos; + u32 c_phys; + /* + * Give this a unique field because c_phys eventually gets + * filled. + */ + unsigned c_new; + unsigned c_unwritten; + unsigned c_needs_zero; +}; + +struct ocfs2_write_ctxt { + /* Logical cluster position / len of write */ + u32 w_cpos; + u32 w_clen; + + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; + + /* + * This is true if page_size > cluster_size. + * + * It triggers a set of special cases during write which might + * have to deal with allocating writes to partial pages. + */ + unsigned int w_large_pages; + + /* + * Pages involved in this write. + * + * w_target_page is the page being written to by the user. + * + * w_pages is an array of pages which always contains + * w_target_page, and in the case of an allocating write with + * page_size < cluster size, it will contain zero'd and mapped + * pages adjacent to w_target_page which need to be written + * out in so that future reads from that region will get + * zero's. + */ + unsigned int w_num_pages; + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; + struct page *w_target_page; + + /* + * w_target_locked is used for page_mkwrite path indicating no unlocking + * against w_target_page in ocfs2_write_end_nolock. + */ + unsigned int w_target_locked:1; + + /* + * ocfs2_write_end() uses this to know what the real range to + * write in the target should be. + */ + unsigned int w_target_from; + unsigned int w_target_to; + + /* + * We could use journal_current_handle() but this is cleaner, + * IMHO -Mark + */ + handle_t *w_handle; + + struct buffer_head *w_di_bh; + + struct ocfs2_cached_dealloc_ctxt w_dealloc; +}; + +void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) +{ + int i; + + for(i = 0; i < num_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } + } +} + +static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) +{ + int i; + + /* + * w_target_locked is only set to true in the page_mkwrite() case. + * The intent is to allow us to lock the target page from write_begin() + * to write_end(). The caller must hold a ref on w_target_page. + */ + if (wc->w_target_locked) { + BUG_ON(!wc->w_target_page); + for (i = 0; i < wc->w_num_pages; i++) { + if (wc->w_target_page == wc->w_pages[i]) { + wc->w_pages[i] = NULL; + break; + } + } + mark_page_accessed(wc->w_target_page); + page_cache_release(wc->w_target_page); + } + ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); +} + +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +{ + ocfs2_unlock_pages(wc); + brelse(wc->w_di_bh); + kfree(wc); +} + +static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, + struct ocfs2_super *osb, loff_t pos, + unsigned len, struct buffer_head *di_bh) +{ + u32 cend; + struct ocfs2_write_ctxt *wc; + + wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); + if (!wc) + return -ENOMEM; + + wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; + cend = (pos + len - 1) >> osb->s_clustersize_bits; + wc->w_clen = cend - wc->w_cpos + 1; + get_bh(di_bh); + wc->w_di_bh = di_bh; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + wc->w_large_pages = 1; + else + wc->w_large_pages = 0; + + ocfs2_init_dealloc_ctxt(&wc->w_dealloc); + + *wcp = wc; + + return 0; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. + */ +static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, end; + + start = max(from, block_start); + end = min(to, block_end); + + zero_user_segment(page, start, end); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + +/* + * Only called when we have a failure during allocating write to write + * zero's to the newly allocated region. + */ +static void ocfs2_write_failure(struct inode *inode, + struct ocfs2_write_ctxt *wc, + loff_t user_pos, unsigned user_len) +{ + int i; + unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), + to = user_pos + user_len; + struct page *tmppage; + + ocfs2_zero_new_buffers(wc->w_target_page, from, to); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + + block_commit_write(tmppage, from, to); + } + } +} + +static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, + struct page *page, u32 cpos, + loff_t user_pos, unsigned user_len, + int new) +{ + int ret; + unsigned int map_from = 0, map_to = 0; + unsigned int cluster_start, cluster_end; + unsigned int user_data_from = 0, user_data_to = 0; + + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, + &cluster_start, &cluster_end); + + /* treat the write as new if the a hole/lseek spanned across + * the page boundary. + */ + new = new | ((i_size_read(inode) <= page_offset(page)) && + (page_offset(page) <= user_pos)); + + if (page == wc->w_target_page) { + map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_to = map_from + user_len; + + if (new) + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, + new); + else + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + map_from, map_to, new); + if (ret) { + mlog_errno(ret); + goto out; + } + + user_data_from = map_from; + user_data_to = map_to; + if (new) { + map_from = cluster_start; + map_to = cluster_end; + } + } else { + /* + * If we haven't allocated the new page yet, we + * shouldn't be writing it out without copying user + * data. This is likely a math error from the caller. + */ + BUG_ON(!new); + + map_from = cluster_start; + map_to = cluster_end; + + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, new); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Parts of newly allocated pages need to be zero'd. + * + * Above, we have also rewritten 'to' and 'from' - as far as + * the rest of the function is concerned, the entire cluster + * range inside of a page needs to be written. + * + * We can skip this if the page is up to date - it's already + * been zero'd from being read in as a hole. + */ + if (new && !PageUptodate(page)) + ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + cpos, user_data_from, user_data_to); + + flush_dcache_page(page); + +out: + return ret; +} + +/* + * This function will only grab one clusters worth of pages. + */ +static int ocfs2_grab_pages_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, + u32 cpos, loff_t user_pos, + unsigned user_len, int new, + struct page *mmap_page) +{ + int ret = 0, i; + unsigned long start, target_index, end_index, index; + struct inode *inode = mapping->host; + loff_t last_byte; + + target_index = user_pos >> PAGE_CACHE_SHIFT; + + /* + * Figure out how many pages we'll be manipulating here. For + * non allocating write, we just change the one + * page. Otherwise, we'll need a whole clusters worth. If we're + * writing past i_size, we only need enough pages to cover the + * last page of the write. + */ + if (new) { + wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); + /* + * We need the index *past* the last page we could possibly + * touch. This is the page past the end of the write or + * i_size, whichever is greater. + */ + last_byte = max(user_pos + user_len, i_size_read(inode)); + BUG_ON(last_byte < 1); + end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + if ((start + wc->w_num_pages) > end_index) + wc->w_num_pages = end_index - start; + } else { + wc->w_num_pages = 1; + start = target_index; + } + + for(i = 0; i < wc->w_num_pages; i++) { + index = start + i; + + if (index == target_index && mmap_page) { + /* + * ocfs2_pagemkwrite() is a little different + * and wants us to directly use the page + * passed in. + */ + lock_page(mmap_page); + + /* Exit and let the caller retry */ + if (mmap_page->mapping != mapping) { + WARN_ON(mmap_page->mapping); + unlock_page(mmap_page); + ret = -EAGAIN; + goto out; + } + + page_cache_get(mmap_page); + wc->w_pages[i] = mmap_page; + wc->w_target_locked = true; + } else { + wc->w_pages[i] = find_or_create_page(mapping, index, + GFP_NOFS); + if (!wc->w_pages[i]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + wait_for_stable_page(wc->w_pages[i]); + + if (index == target_index) + wc->w_target_page = wc->w_pages[i]; + } +out: + if (ret) + wc->w_target_locked = false; + return ret; +} + +/* + * Prepare a single cluster for write one cluster into the file. + */ +static int ocfs2_write_cluster(struct address_space *mapping, + u32 phys, unsigned int unwritten, + unsigned int should_zero, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, u32 cpos, + loff_t user_pos, unsigned user_len) +{ + int ret, i, new; + u64 v_blkno, p_blkno; + struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; + + new = phys == 0 ? 1 : 0; + if (new) { + u32 tmp_pos; + + /* + * This is safe to call with the page locks - it won't take + * any additional semaphores or cluster locks. + */ + tmp_pos = cpos; + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); + /* + * This shouldn't happen because we must have already + * calculated the correct meta data allocation required. The + * internal tree allocation code should know how to increase + * transaction credits itself. + * + * If need be, we could handle -EAGAIN for a + * RESTART_TRANS here. + */ + mlog_bug_on_msg(ret == -EAGAIN, + "Inode %llu: EAGAIN return during allocation.\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } else if (unwritten) { + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, + wc->w_handle, cpos, 1, phys, + meta_ac, &wc->w_dealloc); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (should_zero) + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); + else + v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; + + /* + * The only reason this should fail is due to an inability to + * find the extent added. + */ + ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, + NULL); + if (ret < 0) { + mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " + "at logical block %llu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_blkno); + goto out; + } + + BUG_ON(p_blkno == 0); + + for(i = 0; i < wc->w_num_pages; i++) { + int tmpret; + + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, + wc->w_pages[i], cpos, + user_pos, user_len, + should_zero); + if (tmpret) { + mlog_errno(tmpret); + if (ret == 0) + ret = tmpret; + } + } + + /* + * We only have cleanup to do in case of allocating write. + */ + if (ret && new) + ocfs2_write_failure(inode, wc, user_pos, user_len); + +out: + + return ret; +} + +static int ocfs2_write_cluster_by_desc(struct address_space *mapping, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len) +{ + int ret, i; + loff_t cluster_off; + unsigned int local_len = len; + struct ocfs2_write_cluster_desc *desc; + struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb); + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + + /* + * We have to make sure that the total write passed in + * doesn't extend past a single cluster. + */ + local_len = len; + cluster_off = pos & (osb->s_clustersize - 1); + if ((cluster_off + local_len) > osb->s_clustersize) + local_len = osb->s_clustersize - cluster_off; + + ret = ocfs2_write_cluster(mapping, desc->c_phys, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, + wc, desc->c_cpos, pos, local_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + len -= local_len; + pos += local_len; + } + + ret = 0; +out: + return ret; +} + +/* + * ocfs2_write_end() wants to know which parts of the target page it + * should complete the write on. It's easiest to compute them ahead of + * time when a more complete view of the write is available. + */ +static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len, int alloc) +{ + struct ocfs2_write_cluster_desc *desc; + + wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_to = wc->w_target_from + len; + + if (alloc == 0) + return; + + /* + * Allocating write - we may have different boundaries based + * on page size and cluster size. + * + * NOTE: We can no longer compute one value from the other as + * the actual write length and user provided length may be + * different. + */ + + if (wc->w_large_pages) { + /* + * We only care about the 1st and last cluster within + * our range and whether they should be zero'd or not. Either + * value may be extended out to the start/end of a + * newly allocated cluster. + */ + desc = &wc->w_desc[0]; + if (desc->c_needs_zero) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + &wc->w_target_from, + NULL); + + desc = &wc->w_desc[wc->w_clen - 1]; + if (desc->c_needs_zero) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + NULL, + &wc->w_target_to); + } else { + wc->w_target_from = 0; + wc->w_target_to = PAGE_CACHE_SIZE; + } +} + +/* + * Populate each single-cluster write descriptor in the write context + * with information about the i/o to be done. + * + * Returns the number of clusters that will have to be allocated, as + * well as a worst case estimate of the number of extent records that + * would have to be created during a write to an unwritten region. + */ +static int ocfs2_populate_write_desc(struct inode *inode, + struct ocfs2_write_ctxt *wc, + unsigned int *clusters_to_alloc, + unsigned int *extents_to_split) +{ + int ret; + struct ocfs2_write_cluster_desc *desc; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + u32 phys = 0; + int i; + + *clusters_to_alloc = 0; + *extents_to_split = 0; + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + desc->c_cpos = wc->w_cpos + i; + + if (num_clusters == 0) { + /* + * Need to look up the next extent record. + */ + ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* We should already CoW the refcountd extent. */ + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + + /* + * Assume worst case - that we're writing in + * the middle of the extent. + * + * We can assume that the write proceeds from + * left to right, in which case the extent + * insert code is smart enough to coalesce the + * next splits into the previous records created. + */ + if (ext_flags & OCFS2_EXT_UNWRITTEN) + *extents_to_split = *extents_to_split + 2; + } else if (phys) { + /* + * Only increment phys if it doesn't describe + * a hole. + */ + phys++; + } + + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + + desc->c_phys = phys; + if (phys == 0) { + desc->c_new = 1; + desc->c_needs_zero = 1; + *clusters_to_alloc = *clusters_to_alloc + 1; + } + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { + desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } + + num_clusters--; + } + + ret = 0; +out: + return ret; +} + +static int ocfs2_write_begin_inline(struct address_space *mapping, + struct inode *inode, + struct ocfs2_write_ctxt *wc) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct page *page; + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + page = find_or_create_page(mapping, 0, GFP_NOFS); + if (!page) { + ocfs2_commit_trans(osb, handle); + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + /* + * If we don't set w_num_pages then this page won't get unlocked + * and freed on cleanup of the write context. + */ + wc->w_pages[0] = wc->w_target_page = page; + wc->w_num_pages = 1; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + ocfs2_commit_trans(osb, handle); + + mlog_errno(ret); + goto out; + } + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + ocfs2_set_inode_data_inline(inode, di); + + if (!PageUptodate(page)) { + ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); + if (ret) { + ocfs2_commit_trans(osb, handle); + + goto out; + } + } + + wc->w_handle = handle; +out: + return ret; +} + +int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + if (new_size <= le16_to_cpu(di->id2.i_data.id_count)) + return 1; + return 0; +} + +static int ocfs2_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, loff_t pos, + unsigned len, struct page *mmap_page, + struct ocfs2_write_ctxt *wc) +{ + int ret, written = 0; + loff_t end = pos + len; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; + + trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno, + len, (unsigned long long)pos, + oi->ip_dyn_features); + + /* + * Handle inodes which already have inline data 1st. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (mmap_page == NULL && + ocfs2_size_fits_inline_data(wc->w_di_bh, end)) + goto do_inline_write; + + /* + * The write won't fit - we have to give this inode an + * inline extent list now. + */ + ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Check whether the inode can accept inline data. + */ + if (oi->ip_clusters != 0 || i_size_read(inode) != 0) + return 0; + + /* + * Check whether the write can fit. + */ + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) + return 0; + +do_inline_write: + ret = ocfs2_write_begin_inline(mapping, inode, wc); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * This signals to the caller that the data can be written + * inline. + */ + written = 1; +out: + return written ? written : ret; +} + +/* + * This function only does anything for file systems which can't + * handle sparse files. + * + * What we want to do here is fill in any hole between the current end + * of allocation and the end of our write. That way the rest of the + * write path can treat it as an non-allocating write, which has no + * special case code for sparse/nonsparse files. + */ +static int ocfs2_expand_nonsparse_inode(struct inode *inode, + struct buffer_head *di_bh, + loff_t pos, unsigned len, + struct ocfs2_write_ctxt *wc) +{ + int ret; + loff_t newsize = pos + len; + + BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + + if (newsize <= i_size_read(inode)) + return 0; + + ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); + if (ret) + mlog_errno(ret); + + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + + return ret; +} + +static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, + loff_t pos) +{ + int ret = 0; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + if (pos > i_size_read(inode)) + ret = ocfs2_zero_extend(inode, di_bh, pos); + + return ret; +} + +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + mutex_lock(&osb->osb_tl_inode->i_mutex); + truncated_clusters = osb->truncated_clusters; + mutex_unlock(&osb->osb_tl_inode->i_mutex); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. + */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page) +{ + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; + struct ocfs2_write_ctxt *wc; + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle; + struct ocfs2_extent_tree et; + int try_free = 1, ret1; + +try_again: + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (ocfs2_supports_inline_data(osb)) { + ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, + mmap_page, wc); + if (ret == 1) { + ret = 0; + goto success; + } + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, + wc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_check_range_for_refcount(inode, pos, len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } else if (ret == 1) { + clusters_need = wc->w_clen; + ret = ocfs2_refcount_cow(inode, di_bh, + wc->w_cpos, wc->w_clen, UINT_MAX); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, + &extents_to_split); + if (ret) { + mlog_errno(ret); + goto out; + } + clusters_need += clusters_to_alloc; + + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + + trace_ocfs2_write_begin_nolock( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), + le32_to_cpu(di->i_clusters), + pos, len, flags, mmap_page, + clusters_to_alloc, extents_to_split); + + /* + * We set w_target_from, w_target_to here so that + * ocfs2_write_end() knows which range in the target page to + * write out. An allocation requires that we write the entire + * cluster range. + */ + if (clusters_to_alloc || extents_to_split) { + /* + * XXX: We are stretching the limits of + * ocfs2_lock_allocators(). It greatly over-estimates + * the work to be done. + */ + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list); + + } + + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero))) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + wc->w_handle = handle; + + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; + } + /* + * We don't want this to fail in ocfs2_write_end(), so do it + * here. + */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + /* + * Fill our page array first. That way we've grabbed enough so + * that we can zero and flush if we error after adding the + * extent. + */ + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, + cluster_of_pages, mmap_page); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out_quota; + } + + /* + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock + * the target page. In this case, we exit with no error and no target + * page. This will trigger the caller, page_mkwrite(), to re-try + * the operation. + */ + if (ret == -EAGAIN) { + BUG_ON(wc->w_target_page); + ret = 0; + goto out_quota; + } + + ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, + len); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + +success: + *pagep = wc->w_target_page; + *fsdata = wc; + return 0; +out_quota: + if (clusters_to_alloc) + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_write_ctxt(wc); + + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + + if (ret == -ENOSPC && try_free) { + /* + * Try to free some truncate log so that we can have enough + * clusters to allocate. + */ + try_free = 0; + + ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need); + if (ret1 == 1) + goto try_again; + + if (ret1 < 0) + mlog_errno(ret1); + } + + return ret; +} + +static int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct inode *inode = mapping->host; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * Take alloc sem here to prevent concurrent lookups. That way + * the mapping, zeroing and tree manipulation within + * ocfs2_write() will be safe against ->readpage(). This + * should also serve to lock out allocation from a shared + * writeable region. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, + fsdata, di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto out_fail; + } + + brelse(di_bh); + + return 0; + +out_fail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); + + return ret; +} + +static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, + unsigned len, unsigned *copied, + struct ocfs2_dinode *di, + struct ocfs2_write_ctxt *wc) +{ + void *kaddr; + + if (unlikely(*copied < len)) { + if (!PageUptodate(wc->w_target_page)) { + *copied = 0; + return; + } + } + + kaddr = kmap_atomic(wc->w_target_page); + memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); + kunmap_atomic(kaddr); + + trace_ocfs2_write_end_inline( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)pos, *copied, + le16_to_cpu(di->id2.i_data.id_count), + le16_to_cpu(di->i_dyn_features)); +} + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i; + unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_write_ctxt *wc = fsdata; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + handle_t *handle = wc->w_handle; + struct page *tmppage; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); + goto out_write_size; + } + + if (unlikely(copied < len)) { + if (!PageUptodate(wc->w_target_page)) + copied = 0; + + ocfs2_zero_new_buffers(wc->w_target_page, start+copied, + start+len); + } + flush_dcache_page(wc->w_target_page); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (tmppage == wc->w_target_page) { + from = wc->w_target_from; + to = wc->w_target_to; + + BUG_ON(from > PAGE_CACHE_SIZE || + to > PAGE_CACHE_SIZE || + to < from); + } else { + /* + * Pages adjacent to the target (if any) imply + * a hole-filling write in which case we want + * to flush their entire range. + */ + from = 0; + to = PAGE_CACHE_SIZE; + } + + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + block_commit_write(tmppage, from, to); + } + } + +out_write_size: + pos += copied; + if (pos > i_size_read(inode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + ocfs2_journal_dirty(handle, wc->w_di_bh); + + /* unlock pages before dealloc since it needs acquiring j_trans_barrier + * lock, or it will cause a deadlock since journal commit threads holds + * this lock and will ask for the page lock when flushing the data. + * put it here to preserve the unlock order. + */ + ocfs2_unlock_pages(wc); + + ocfs2_commit_trans(osb, handle); + + ocfs2_run_deallocs(osb, &wc->w_dealloc); + + brelse(wc->w_di_bh); + kfree(wc); + + return copied; +} + +static int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + struct inode *inode = mapping->host; + + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + + return ret; +} + +const struct address_space_operations ocfs2_aops = { + .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, + .writepage = ocfs2_writepage, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, + .bmap = ocfs2_bmap, + .direct_IO = ocfs2_direct_IO, + .invalidatepage = block_invalidatepage, + .releasepage = ocfs2_releasepage, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; diff --git a/kernel/fs/ocfs2/aops.h b/kernel/fs/ocfs2/aops.h new file mode 100644 index 000000000..dd59599b0 --- /dev/null +++ b/kernel/fs/ocfs2/aops.h @@ -0,0 +1,105 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_AOPS_H +#define OCFS2_AOPS_H + +#include + +handle_t *ocfs2_start_walk_page_trans(struct inode *inode, + struct page *page, + unsigned from, + unsigned to); + +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new); + +void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); + +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)); + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page); + +int ocfs2_read_inline_data(struct inode *inode, struct page *page, + struct buffer_head *di_bh); +int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); + +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +/* all ocfs2_dio_end_io()'s fault */ +#define ocfs2_iocb_is_rw_locked(iocb) \ + test_bit(0, (unsigned long *)&iocb->private) +static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) +{ + set_bit(0, (unsigned long *)&iocb->private); + if (level) + set_bit(1, (unsigned long *)&iocb->private); + else + clear_bit(1, (unsigned long *)&iocb->private); +} + +/* + * Using a named enum representing lock types in terms of #N bit stored in + * iocb->private, which is going to be used for communication between + * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + */ +enum ocfs2_iocb_lock_bits { + OCFS2_IOCB_RW_LOCK = 0, + OCFS2_IOCB_RW_LOCK_LEVEL, + OCFS2_IOCB_SEM, + OCFS2_IOCB_UNALIGNED_IO, + OCFS2_IOCB_NUM_LOCKS +}; + +#define ocfs2_iocb_clear_rw_locked(iocb) \ + clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) +#define ocfs2_iocb_rw_locked_level(iocb) \ + test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) +#define ocfs2_iocb_set_sem_locked(iocb) \ + set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_sem_locked(iocb) \ + clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_sem_locked(iocb) \ + test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) + +#define ocfs2_iocb_set_unaligned_aio(iocb) \ + set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_unaligned_aio(iocb) \ + clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_unaligned_aio(iocb) \ + test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) + +#endif /* OCFS2_FILE_H */ diff --git a/kernel/fs/ocfs2/blockcheck.c b/kernel/fs/ocfs2/blockcheck.c new file mode 100644 index 000000000..0725e6054 --- /dev/null +++ b/kernel/fs/ocfs2/blockcheck.c @@ -0,0 +1,647 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.c + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2006, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "blockcheck.h" + + +/* + * We use the following conventions: + * + * d = # data bits + * p = # parity bits + * c = # total code bits (d + p) + */ + + +/* + * Calculate the bit offset in the hamming code buffer based on the bit's + * offset in the data buffer. Since the hamming code reserves all + * power-of-two bits for parity, the data bit number and the code bit + * number are offset by all the parity bits beforehand. + * + * Recall that bit numbers in hamming code are 1-based. This function + * takes the 0-based data bit from the caller. + * + * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0), + * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit. + * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3 + * in the code buffer. + * + * The caller can pass in *p if it wants to keep track of the most recent + * number of parity bits added. This allows the function to start the + * calculation at the last place. + */ +static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache) +{ + unsigned int b, p = 0; + + /* + * Data bits are 0-based, but we're talking code bits, which + * are 1-based. + */ + b = i + 1; + + /* Use the cache if it is there */ + if (p_cache) + p = *p_cache; + b += p; + + /* + * For every power of two below our bit number, bump our bit. + * + * We compare with (b + 1) because we have to compare with what b + * would be _if_ it were bumped up by the parity bit. Capice? + * + * p is set above. + */ + for (; (1 << p) < (b + 1); p++) + b++; + + if (p_cache) + *p_cache = p; + + return b; +} + +/* + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr) +{ + unsigned int i, b, p = 0; + + BUG_ON(!d); + + /* + * b is the hamming code bit number. Hamming code specifies a + * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is + * for the algorithm. + * + * The i++ in the for loop is so that the start offset passed + * to ocfs2_find_next_bit_set() is one greater than the previously + * found bit. + */ + for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++) + { + /* + * i is the offset in this hunk, nr + i is the total bit + * offset. + */ + b = calc_code_bit(nr + i, &p); + + /* + * Data bits in the resultant code are checked by + * parity bits that are part of the bit number + * representation. Huh? + * + * + * In other words, the parity bit at position 2^k + * checks bits in positions having bit k set in + * their binary representation. Conversely, for + * instance, bit 13, i.e. 1101(2), is checked by + * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1. + * + * + * Note that 'k' is the _code_ bit number. 'b' in + * our loop. + */ + parity ^= b; + } + + /* While the data buffer was treated as little endian, the + * return value is in host endian. */ + return parity; +} + +u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize) +{ + return ocfs2_hamming_encode(0, data, blocksize * 8, 0); +} + +/* + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one hunk, use ocfs2_hamming_fix_block(). + */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix) +{ + unsigned int i, b; + + BUG_ON(!d); + + /* + * If the bit to fix has an hweight of 1, it's a parity bit. One + * busted parity bit is its own error. Nothing to do here. + */ + if (hweight32(fix) == 1) + return; + + /* + * nr + d is the bit right past the data hunk we're looking at. + * If fix after that, nothing to do + */ + if (fix >= calc_code_bit(nr + d, NULL)) + return; + + /* + * nr is the offset in the data hunk we're starting at. Let's + * start b at the offset in the code buffer. See hamming_encode() + * for a more detailed description of 'b'. + */ + b = calc_code_bit(nr, NULL); + /* If the fix is before this hunk, nothing to do */ + if (fix < b) + return; + + for (i = 0; i < d; i++, b++) + { + /* Skip past parity bits */ + while (hweight32(b) == 1) + b++; + + /* + * i is the offset in this data hunk. + * nr + i is the offset in the total data buffer. + * b is the offset in the total code buffer. + * + * Thus, when b == fix, bit i in the current hunk needs + * fixing. + */ + if (b == fix) + { + if (ocfs2_test_bit(i, data)) + ocfs2_clear_bit(i, data); + else + ocfs2_set_bit(i, data); + break; + } + } +} + +void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix) +{ + ocfs2_hamming_fix(data, blocksize * 8, 0, fix); +} + + +/* + * Debugfs handling. + */ + +#ifdef CONFIG_DEBUG_FS + +static int blockcheck_u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); + +static struct dentry *blockcheck_debugfs_create(const char *name, + struct dentry *parent, + u64 *value) +{ + return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, + &blockcheck_fops); +} + +static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ + if (stats) { + debugfs_remove(stats->b_debug_check); + stats->b_debug_check = NULL; + debugfs_remove(stats->b_debug_failure); + stats->b_debug_failure = NULL; + debugfs_remove(stats->b_debug_recover); + stats->b_debug_recover = NULL; + debugfs_remove(stats->b_debug_dir); + stats->b_debug_dir = NULL; + } +} + +static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + int rc = -EINVAL; + + if (!stats) + goto out; + + stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + if (!stats->b_debug_dir) + goto out; + + stats->b_debug_check = + blockcheck_debugfs_create("blocks_checked", + stats->b_debug_dir, + &stats->b_check_count); + + stats->b_debug_failure = + blockcheck_debugfs_create("checksums_failed", + stats->b_debug_dir, + &stats->b_failure_count); + + stats->b_debug_recover = + blockcheck_debugfs_create("ecc_recoveries", + stats->b_debug_dir, + &stats->b_recover_count); + if (stats->b_debug_check && stats->b_debug_failure && + stats->b_debug_recover) + rc = 0; + +out: + if (rc) + ocfs2_blockcheck_debug_remove(stats); + return rc; +} +#else +static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return 0; +} + +static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +/* Always-called wrappers for starting and stopping the debugfs files */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return ocfs2_blockcheck_debug_install(stats, parent); +} + +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) +{ + ocfs2_blockcheck_debug_remove(stats); +} + +static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_check_count++; + new_count = stats->b_check_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Block check count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_failure_count++; + new_count = stats->b_failure_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Checksum failure count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_recover_count++; + new_count = stats->b_recover_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "ECC recovery count has wrapped\n"); +} + + + +/* + * These are the low-level APIs for using the ocfs2_block_check structure. + */ + +/* + * This function generates check information for a block. + * data is the block to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. + * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc) +{ + u32 crc; + u32 ecc; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + crc = crc32_le(~0, data, blocksize); + ecc = ocfs2_hamming_encode_block(data, blocksize); + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHRT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information. Like _compute, + * the function will take care of zeroing bc before calculating check codes. + * If bc is not a pointer inside data, the caller must have zeroed any + * inline ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) +{ + int rc = 0; + u32 bc_crc32e; + u16 bc_ecc; + u32 crc, ecc; + + ocfs2_blockcheck_inc_check(stats); + + bc_crc32e = le32_to_cpu(bc->bc_crc32e); + bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + crc = crc32_le(~0, data, blocksize); + if (crc == bc_crc32e) + goto out; + + ocfs2_blockcheck_inc_failure(stats); + mlog(ML_ERROR, + "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", + (unsigned int)bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + ecc = ocfs2_hamming_encode_block(data, blocksize); + ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc); + + /* And check the crc32 again */ + crc = crc32_le(~0, data, blocksize); + if (crc == bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); + goto out; + } + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", + (unsigned int)bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(bc_crc32e); + bc->bc_ecc = cpu_to_le16(bc_ecc); + + return rc; +} + +/* + * This function generates check information for a list of buffer_heads. + * bhs is the blocks to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. + * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int i; + u32 crc, ecc; + + BUG_ON(nr < 0); + + if (!nr) + return; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + for (i = 0, crc = ~0, ecc = 0; i < nr; i++) { + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + /* + * The number of bits in a buffer is obviously b_size*8. + * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHRT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information on a list of + * buffer_heads. Like _compute_bhs, the function will take care of + * zeroing bc before calculating check codes. If bc is not a pointer + * inside data, the caller must have zeroed any inline + * ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) +{ + int i, rc = 0; + u32 bc_crc32e; + u16 bc_ecc; + u32 crc, ecc, fix; + + BUG_ON(nr < 0); + + if (!nr) + return 0; + + ocfs2_blockcheck_inc_check(stats); + + bc_crc32e = le32_to_cpu(bc->bc_crc32e); + bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == bc_crc32e) + goto out; + + ocfs2_blockcheck_inc_failure(stats); + mlog(ML_ERROR, + "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + (unsigned int)bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + for (i = 0, ecc = 0; i < nr; i++) { + /* + * The number of bits in a buffer is obviously b_size*8. + * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + fix = ecc ^ bc_ecc; + for (i = 0; i < nr; i++) { + /* + * Try the fix against each buffer. It will only affect + * one of them. + */ + ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i, fix); + } + + /* And check the crc32 again */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); + goto out; + } + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + (unsigned int)bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(bc_crc32e); + bc->bc_ecc = cpu_to_le16(bc_ecc); + + return rc; +} + +/* + * These are the main API. They check the superblock flag before + * calling the underlying operations. + * + * They expect the buffer(s) to be in disk format. + */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute(data, sb->s_blocksize, bc); +} + +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc, + &osb->osb_ecc_stats); + + return rc; +} + +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute_bhs(bhs, nr, bc); +} + +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate_bhs(bhs, nr, bc, + &osb->osb_ecc_stats); + + return rc; +} + diff --git a/kernel/fs/ocfs2/blockcheck.h b/kernel/fs/ocfs2/blockcheck.h new file mode 100644 index 000000000..d4b69febf --- /dev/null +++ b/kernel/fs/ocfs2/blockcheck.h @@ -0,0 +1,107 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.h + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_BLOCKCHECK_H +#define OCFS2_BLOCKCHECK_H + + +/* Count errors and error correction from blockcheck.c */ +struct ocfs2_blockcheck_stats { + spinlock_t b_lock; + u64 b_check_count; /* Number of blocks we've checked */ + u64 b_failure_count; /* Number of failed checksums */ + u64 b_recover_count; /* Number of blocks fixed by ecc */ + + /* + * debugfs entries, used if this is passed to + * ocfs2_blockcheck_stats_debugfs_install() + */ + struct dentry *b_debug_dir; /* Parent of the debugfs files */ + struct dentry *b_debug_check; /* Exposes b_check_count */ + struct dentry *b_debug_failure; /* Exposes b_failure_count */ + struct dentry *b_debug_recover; /* Exposes b_recover_count */ +}; + + +/* High level block API */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); + +/* Lower level API */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); + +/* Debug Initialization */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent); +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats); + +/* + * Hamming code functions + */ + +/* + * Encoding hamming code parity bits for a buffer. + * + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, + unsigned int nr); +/* + * Fix a buffer with a bit error. The 'fix' is the original parity + * xor'd with the parity calculated now. + * + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one buffer, use ocfs2_hamming_fix_block(). + */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix); + +/* Convenience wrappers for a single buffer of data */ +extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize); +extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix); +#endif diff --git a/kernel/fs/ocfs2/buffer_head_io.c b/kernel/fs/ocfs2/buffer_head_io.c new file mode 100644 index 000000000..1edcb141f --- /dev/null +++ b/kernel/fs/ocfs2/buffer_head_io.c @@ -0,0 +1,427 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * io.c + * + * Buffer cache handling + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "inode.h" +#include "journal.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "ocfs2_trace.h" + +/* + * Bits on bh->b_state used by ocfs2. + * + * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart. + */ +enum ocfs2_state_bits { + BH_NeedsValidate = BH_JBDPrivateStart, +}; + +/* Expand the magic b_state functions */ +BUFFER_FNS(NeedsValidate, needs_validate); + +int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, + struct ocfs2_caching_info *ci) +{ + int ret = 0; + + trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci); + + BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); + BUG_ON(buffer_jbd(bh)); + + /* No need to check for a soft readonly file system here. non + * journalled writes are only ever done on system files which + * can get modified during recovery even if read-only. */ + if (ocfs2_is_hard_readonly(osb)) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } + + ocfs2_metadata_cache_io_lock(ci); + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. */ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (buffer_uptodate(bh)) { + ocfs2_set_buffer_uptodate(ci, bh); + } else { + /* We don't need to remove the clustered uptodate + * information for this bh as it's not marked locally + * uptodate. */ + ret = -EIO; + mlog_errno(ret); + } + + ocfs2_metadata_cache_io_unlock(ci); +out: + return ret; +} + +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]) +{ + int status = 0; + unsigned int i; + struct buffer_head *bh; + + trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); + + if (!nr) + goto bail; + + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(osb->sb, block++); + if (bhs[i] == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + + if (buffer_jbd(bh)) { + trace_ocfs2_read_blocks_sync_jbd( + (unsigned long long)bh->b_blocknr); + continue; + } + + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_ERROR, + "trying to sync read a dirty " + "buffer! (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + } + + for (i = nr; i > 0; i--) { + bh = bhs[i - 1]; + + /* No need to wait on the buffer if it's managed by JBD. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. */ + status = -EIO; + put_bh(bh); + bhs[i - 1] = NULL; + } + } + +bail: + return status; +} + +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + int i, ignore_cache = 0; + struct buffer_head *bh; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + + trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); + + BUG_ON(!ci); + BUG_ON((flags & OCFS2_BH_READAHEAD) && + (flags & OCFS2_BH_IGNORE_CACHE)); + + if (bhs == NULL) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr < 0) { + mlog(ML_ERROR, "asked to read %d blocks!\n", nr); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr == 0) { + status = 0; + goto bail; + } + + ocfs2_metadata_cache_io_lock(ci); + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(sb, block++); + if (bhs[i] == NULL) { + ocfs2_metadata_cache_io_unlock(ci); + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); + + /* There are three read-ahead cases here which we need to + * be concerned with. All three assume a buffer has + * previously been submitted with OCFS2_BH_READAHEAD + * and it hasn't yet completed I/O. + * + * 1) The current request is sync to disk. This rarely + * happens these days, and never when performance + * matters - the code can just wait on the buffer + * lock and re-submit. + * + * 2) The current request is cached, but not + * readahead. ocfs2_buffer_uptodate() will return + * false anyway, so we'll wind up waiting on the + * buffer lock to do I/O. We re-check the request + * with after getting the lock to avoid a re-submit. + * + * 3) The current request is readahead (and so must + * also be a caching one). We short circuit if the + * buffer is locked (under I/O) and if it's in the + * uptodate cache. The re-check from #2 catches the + * case that the previous read-ahead completes just + * before our is-it-in-flight check. + */ + + if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { + trace_ocfs2_read_blocks_from_disk( + (unsigned long long)bh->b_blocknr, + (unsigned long long)ocfs2_metadata_cache_owner(ci)); + /* We're using ignore_cache here to say + * "go to disk" */ + ignore_cache = 1; + } + + trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr, + ignore_cache, buffer_jbd(bh), buffer_dirty(bh)); + + if (buffer_jbd(bh)) { + continue; + } + + if (ignore_cache) { + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + continue; + } + + /* A read-ahead request was made - if the + * buffer is already under read-ahead from a + * previously submitted request than we are + * done here. */ + if ((flags & OCFS2_BH_READAHEAD) + && ocfs2_buffer_read_ahead(ci, bh)) + continue; + + lock_buffer(bh); + if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES + mlog(ML_ERROR, "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); +#else + unlock_buffer(bh); + continue; +#endif + } + + /* Re-check ocfs2_buffer_uptodate() as a + * previously read-ahead buffer may have + * completed I/O while we were waiting for the + * buffer lock. */ + if (!(flags & OCFS2_BH_IGNORE_CACHE) + && !(flags & OCFS2_BH_READAHEAD) + && ocfs2_buffer_uptodate(ci, bh)) { + unlock_buffer(bh); + continue; + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + if (validate) + set_buffer_needs_validate(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + continue; + } + } + + status = 0; + + for (i = (nr - 1); i >= 0; i--) { + bh = bhs[i]; + + if (!(flags & OCFS2_BH_READAHEAD)) { + /* We know this can't have changed as we hold the + * owner sem. Avoid doing any work on the bh if the + * journal has it. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. Don't need to + * remove the clustered uptodate information + * for this bh as it's not marked locally + * uptodate. */ + status = -EIO; + put_bh(bh); + bhs[i] = NULL; + continue; + } + + if (buffer_needs_validate(bh)) { + /* We never set NeedsValidate if the + * buffer was held by the journal, so + * that better not have changed */ + BUG_ON(buffer_jbd(bh)); + clear_buffer_needs_validate(bh); + status = validate(sb, bh); + if (status) { + put_bh(bh); + bhs[i] = NULL; + continue; + } + } + } + + /* Always set the buffer in the cache, even if it was + * a forced read, or read-ahead which hasn't yet + * completed. */ + ocfs2_set_buffer_uptodate(ci, bh); + } + ocfs2_metadata_cache_io_unlock(ci); + + trace_ocfs2_read_blocks_end((unsigned long long)block, nr, + flags, ignore_cache); + +bail: + + return status; +} + +/* Check whether the blkno is the super block or one of the backups. */ +static void ocfs2_check_super_or_backup(struct super_block *sb, + sector_t blkno) +{ + int i; + u64 backup_blkno; + + if (blkno == OCFS2_SUPER_BLOCK_BLKNO) + return; + + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + backup_blkno = ocfs2_backup_super_blkno(sb, i); + if (backup_blkno == blkno) + return; + } + + BUG(); +} + +/* + * Write super block and backups doesn't need to collaborate with journal, + * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed + * into this function. + */ +int ocfs2_write_super_or_backup(struct ocfs2_super *osb, + struct buffer_head *bh) +{ + int ret = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + BUG_ON(buffer_jbd(bh)); + ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. */ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + ret = -EIO; + mlog_errno(ret); + } + +out: + return ret; +} diff --git a/kernel/fs/ocfs2/buffer_head_io.h b/kernel/fs/ocfs2/buffer_head_io.h new file mode 100644 index 000000000..b97bcc6dd --- /dev/null +++ b/kernel/fs/ocfs2/buffer_head_io.h @@ -0,0 +1,77 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_buffer_head.h + * + * Buffer cache handling functions defined + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_BUFFER_HEAD_IO_H +#define OCFS2_BUFFER_HEAD_IO_H + +#include + +void ocfs2_end_buffer_io_sync(struct buffer_head *bh, + int uptodate); + +int ocfs2_write_block(struct ocfs2_super *osb, + struct buffer_head *bh, + struct ocfs2_caching_info *ci); +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]); + +/* + * If not NULL, validate() will be called on a buffer that is freshly + * read from disk. It will not be called if the buffer was in cache. + * Note that if validate() is being used for this buffer, it needs to + * be set even for a READAHEAD call, as it marks the buffer for later + * validation. + */ +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); + +int ocfs2_write_super_or_backup(struct ocfs2_super *osb, + struct buffer_head *bh); + +#define OCFS2_BH_IGNORE_CACHE 1 +#define OCFS2_BH_READAHEAD 8 + +static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off, + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate); + +bail: + return status; +} + +#endif /* OCFS2_BUFFER_HEAD_IO_H */ diff --git a/kernel/fs/ocfs2/cluster/Makefile b/kernel/fs/ocfs2/cluster/Makefile new file mode 100644 index 000000000..1aefc0350 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o + +ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ + quorum.o tcp.o netdebug.o diff --git a/kernel/fs/ocfs2/cluster/heartbeat.c b/kernel/fs/ocfs2/cluster/heartbeat.c new file mode 100644 index 000000000..16eff4572 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/heartbeat.c @@ -0,0 +1,2678 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#include "quorum.h" + +#include "masklog.h" + + +/* + * The first heartbeat pass had one global thread that would serialize all hb + * callback calls. This global serializing sem should only be removed once + * we've made sure that all callees can deal with being called concurrently + * from multiple hb region threads. + */ +static DECLARE_RWSEM(o2hb_callback_sem); + +/* + * multiple hb threads are watching multiple regions. A node is live + * whenever any of the threads sees activity from the node in its region. + */ +static DEFINE_SPINLOCK(o2hb_live_lock); +static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; +static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +static LIST_HEAD(o2hb_node_events); +static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); + +/* + * In global heartbeat, we maintain a series of region bitmaps. + * - o2hb_region_bitmap allows us to limit the region number to max region. + * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). + * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes + * heartbeat on it. + * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. + */ +static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; + +#define O2HB_DB_TYPE_LIVENODES 0 +#define O2HB_DB_TYPE_LIVEREGIONS 1 +#define O2HB_DB_TYPE_QUORUMREGIONS 2 +#define O2HB_DB_TYPE_FAILEDREGIONS 3 +#define O2HB_DB_TYPE_REGION_LIVENODES 4 +#define O2HB_DB_TYPE_REGION_NUMBER 5 +#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 +#define O2HB_DB_TYPE_REGION_PINNED 7 +struct o2hb_debug_buf { + int db_type; + int db_size; + int db_len; + void *db_data; +}; + +static struct o2hb_debug_buf *o2hb_db_livenodes; +static struct o2hb_debug_buf *o2hb_db_liveregions; +static struct o2hb_debug_buf *o2hb_db_quorumregions; +static struct o2hb_debug_buf *o2hb_db_failedregions; + +#define O2HB_DEBUG_DIR "o2hb" +#define O2HB_DEBUG_LIVENODES "livenodes" +#define O2HB_DEBUG_LIVEREGIONS "live_regions" +#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" +#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" +#define O2HB_DEBUG_REGION_NUMBER "num" +#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" +#define O2HB_DEBUG_REGION_PINNED "pinned" + +static struct dentry *o2hb_debug_dir; +static struct dentry *o2hb_debug_livenodes; +static struct dentry *o2hb_debug_liveregions; +static struct dentry *o2hb_debug_quorumregions; +static struct dentry *o2hb_debug_failedregions; + +static LIST_HEAD(o2hb_all_regions); + +static struct o2hb_callback { + struct list_head list; +} o2hb_callbacks[O2HB_NUM_CB]; + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); + +#define O2HB_DEFAULT_BLOCK_BITS 9 + +enum o2hb_heartbeat_modes { + O2HB_HEARTBEAT_LOCAL = 0, + O2HB_HEARTBEAT_GLOBAL, + O2HB_HEARTBEAT_NUM_MODES, +}; + +char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ +}; + +unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; +unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; + +/* + * o2hb_dependent_users tracks the number of registered callbacks that depend + * on heartbeat. o2net and o2dlm are two entities that register this callback. + * However only o2dlm depends on the heartbeat. It does not want the heartbeat + * to stop while a dlm domain is still active. + */ +unsigned int o2hb_dependent_users; + +/* + * In global heartbeat mode, all regions are pinned if there are one or more + * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All + * regions are unpinned if the region count exceeds the cut off or the number + * of dependent users falls to zero. + */ +#define O2HB_PIN_CUT_OFF 3 + +/* + * In local heartbeat mode, we assume the dlm domain name to be the same as + * region uuid. This is true for domains created for the file system but not + * necessarily true for userdlm domains. This is a known limitation. + * + * In global heartbeat mode, we pin/unpin all o2hb regions. This solution + * works for both file system and userdlm domains. + */ +static int o2hb_region_pin(const char *region_uuid); +static void o2hb_region_unpin(const char *region_uuid); + +/* Only sets a new threshold if there are no active regions. + * + * No locking or otherwise interesting code is required for reading + * o2hb_dead_threshold as it can't change once regions are active and + * it's not interesting to anyone until then anyway. */ +static void o2hb_dead_threshold_set(unsigned int threshold) +{ + if (threshold > O2HB_MIN_DEAD_THRESHOLD) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) + o2hb_dead_threshold = threshold; + spin_unlock(&o2hb_live_lock); + } +} + +static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode) +{ + int ret = -1; + + if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) { + o2hb_heartbeat_mode = hb_mode; + ret = 0; + } + spin_unlock(&o2hb_live_lock); + } + + return ret; +} + +struct o2hb_node_event { + struct list_head hn_item; + enum o2hb_callback_type hn_event_type; + struct o2nm_node *hn_node; + int hn_node_num; +}; + +struct o2hb_disk_slot { + struct o2hb_disk_heartbeat_block *ds_raw_block; + u8 ds_node_num; + u64 ds_last_time; + u64 ds_last_generation; + u16 ds_equal_samples; + u16 ds_changed_samples; + struct list_head ds_live_item; +}; + +/* each thread owns a region.. when we're asked to tear down the region + * we ask the thread to stop, who cleans up the region */ +struct o2hb_region { + struct config_item hr_item; + + struct list_head hr_all_item; + unsigned hr_unclean_stop:1, + hr_aborted_start:1, + hr_item_pinned:1, + hr_item_dropped:1; + + /* protected by the hr_callback_sem */ + struct task_struct *hr_task; + + unsigned int hr_blocks; + unsigned long long hr_start_block; + + unsigned int hr_block_bits; + unsigned int hr_block_bytes; + + unsigned int hr_slots_per_page; + unsigned int hr_num_pages; + + struct page **hr_slot_data; + struct block_device *hr_bdev; + struct o2hb_disk_slot *hr_slots; + + /* live node map of this region */ + unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned int hr_region_num; + + struct dentry *hr_debug_dir; + struct dentry *hr_debug_livenodes; + struct dentry *hr_debug_regnum; + struct dentry *hr_debug_elapsed_time; + struct dentry *hr_debug_pinned; + struct o2hb_debug_buf *hr_db_livenodes; + struct o2hb_debug_buf *hr_db_regnum; + struct o2hb_debug_buf *hr_db_elapsed_time; + struct o2hb_debug_buf *hr_db_pinned; + + /* let the person setting up hb wait for it to return until it + * has reached a 'steady' state. This will be fixed when we have + * a more complete api that doesn't lead to this sort of fragility. */ + atomic_t hr_steady_iterations; + + /* terminate o2hb thread if it does not reach steady state + * (hr_steady_iterations == 0) within hr_unsteady_iterations */ + atomic_t hr_unsteady_iterations; + + char hr_dev_name[BDEVNAME_SIZE]; + + unsigned int hr_timeout_ms; + + /* randomized as the region goes up and down so that a node + * recognizes a node going up and down in one iteration */ + u64 hr_generation; + + struct delayed_work hr_write_timeout_work; + unsigned long hr_last_timeout_start; + + /* Used during o2hb_check_slot to hold a copy of the block + * being checked because we temporarily have to zero out the + * crc field. */ + struct o2hb_disk_heartbeat_block *hr_tmp_block; +}; + +struct o2hb_bio_wait_ctxt { + atomic_t wc_num_reqs; + struct completion wc_io_complete; + int wc_error; +}; + +static void o2hb_write_timeout(struct work_struct *work) +{ + int failed, quorum; + unsigned long flags; + struct o2hb_region *reg = + container_of(work, struct o2hb_region, + hr_write_timeout_work.work); + + mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " + "milliseconds\n", reg->hr_dev_name, + jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + + if (o2hb_global_heartbeat_active()) { + spin_lock_irqsave(&o2hb_live_lock, flags); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + failed = bitmap_weight(o2hb_failed_region_bitmap, + O2NM_MAX_REGIONS); + quorum = bitmap_weight(o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + + mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", + quorum, failed); + + /* + * Fence if the number of failed regions >= half the number + * of quorum regions + */ + if ((failed << 1) < quorum) + return; + } + + o2quo_disk_timeout(); +} + +static void o2hb_arm_write_timeout(struct o2hb_region *reg) +{ + /* Arm writeout only after thread reaches steady state */ + if (atomic_read(®->hr_steady_iterations) != 0) + return; + + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", + O2HB_MAX_WRITE_TIMEOUT_MS); + + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + spin_unlock(&o2hb_live_lock); + } + cancel_delayed_work(®->hr_write_timeout_work); + reg->hr_last_timeout_start = jiffies; + schedule_delayed_work(®->hr_write_timeout_work, + msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); +} + +static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +{ + cancel_delayed_work_sync(®->hr_write_timeout_work); +} + +static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) +{ + atomic_set(&wc->wc_num_reqs, 1); + init_completion(&wc->wc_io_complete); + wc->wc_error = 0; +} + +/* Used in error paths too */ +static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, + unsigned int num) +{ + /* sadly atomic_sub_and_test() isn't available on all platforms. The + * good news is that the fast path only completes one at a time */ + while(num--) { + if (atomic_dec_and_test(&wc->wc_num_reqs)) { + BUG_ON(num > 0); + complete(&wc->wc_io_complete); + } + } +} + +static void o2hb_wait_on_io(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc) +{ + o2hb_bio_wait_dec(wc, 1); + wait_for_completion(&wc->wc_io_complete); +} + +static void o2hb_bio_end_io(struct bio *bio, + int error) +{ + struct o2hb_bio_wait_ctxt *wc = bio->bi_private; + + if (error) { + mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } + + o2hb_bio_wait_dec(wc, 1); + bio_put(bio); +} + +/* Setup a Bio to cover I/O against num_slots slots starting at + * start_slot. */ +static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc, + unsigned int *current_slot, + unsigned int max_slots) +{ + int len, current_page; + unsigned int vec_len, vec_start; + unsigned int bits = reg->hr_block_bits; + unsigned int spp = reg->hr_slots_per_page; + unsigned int cs = *current_slot; + struct bio *bio; + struct page *page; + + /* Testing has shown this allocation to take long enough under + * GFP_KERNEL that the local node can get fenced. It would be + * nicest if we could pre-allocate these bios and avoid this + * all together. */ + bio = bio_alloc(GFP_ATOMIC, 16); + if (!bio) { + mlog(ML_ERROR, "Could not alloc slots BIO!\n"); + bio = ERR_PTR(-ENOMEM); + goto bail; + } + + /* Must put everything in 512 byte sectors for the bio... */ + bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); + bio->bi_bdev = reg->hr_bdev; + bio->bi_private = wc; + bio->bi_end_io = o2hb_bio_end_io; + + vec_start = (cs << bits) % PAGE_CACHE_SIZE; + while(cs < max_slots) { + current_page = cs / spp; + page = reg->hr_slot_data[current_page]; + + vec_len = min(PAGE_CACHE_SIZE - vec_start, + (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); + + mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", + current_page, vec_len, vec_start); + + len = bio_add_page(bio, page, vec_len, vec_start); + if (len != vec_len) break; + + cs += vec_len / (PAGE_CACHE_SIZE/spp); + vec_start = 0; + } + +bail: + *current_slot = cs; + return bio; +} + +static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int max_slots) +{ + unsigned int current_slot=0; + int status; + struct o2hb_bio_wait_ctxt wc; + struct bio *bio; + + o2hb_bio_wait_init(&wc); + + while(current_slot < max_slots) { + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); + if (IS_ERR(bio)) { + status = PTR_ERR(bio); + mlog_errno(status); + goto bail_and_wait; + } + + atomic_inc(&wc.wc_num_reqs); + submit_bio(READ, bio); + } + + status = 0; + +bail_and_wait: + o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; + + return status; +} + +static int o2hb_issue_node_write(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *write_wc) +{ + int status; + unsigned int slot; + struct bio *bio; + + o2hb_bio_wait_init(write_wc); + + slot = o2nm_this_node(); + + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); + if (IS_ERR(bio)) { + status = PTR_ERR(bio); + mlog_errno(status); + goto bail; + } + + atomic_inc(&write_wc->wc_num_reqs); + submit_bio(WRITE_SYNC, bio); + + status = 0; +bail: + return status; +} + +static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + __le32 old_cksum; + u32 ret; + + /* We want to compute the block crc with a 0 value in the + * hb_cksum field. Save it off here and replace after the + * crc. */ + old_cksum = hb_block->hb_cksum; + hb_block->hb_cksum = 0; + + ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes); + + hb_block->hb_cksum = old_cksum; + + return ret; +} + +static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block) +{ + mlog(ML_ERROR, "Dump slot information: seq = 0x%llx, node = %u, " + "cksum = 0x%x, generation 0x%llx\n", + (long long)le64_to_cpu(hb_block->hb_seq), + hb_block->hb_node, le32_to_cpu(hb_block->hb_cksum), + (long long)le64_to_cpu(hb_block->hb_generation)); +} + +static int o2hb_verify_crc(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + u32 read, computed; + + read = le32_to_cpu(hb_block->hb_cksum); + computed = o2hb_compute_block_crc_le(reg, hb_block); + + return read == computed; +} + +/* + * Compare the slot data with what we wrote in the last iteration. + * If the match fails, print an appropriate error message. This is to + * detect errors like... another node hearting on the same slot, + * flaky device that is losing writes, etc. + * Returns 1 if check succeeds, 0 otherwise. + */ +static int o2hb_check_own_slot(struct o2hb_region *reg) +{ + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + char *errstr; + + slot = ®->hr_slots[o2nm_this_node()]; + /* Don't check on our 1st timestamp */ + if (!slot->ds_last_time) + return 0; + + hb_block = slot->ds_raw_block; + if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && + le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && + hb_block->hb_node == slot->ds_node_num) + return 1; + +#define ERRSTR1 "Another node is heartbeating on device" +#define ERRSTR2 "Heartbeat generation mismatch on device" +#define ERRSTR3 "Heartbeat sequence mismatch on device" + + if (hb_block->hb_node != slot->ds_node_num) + errstr = ERRSTR1; + else if (le64_to_cpu(hb_block->hb_generation) != + slot->ds_last_generation) + errstr = ERRSTR2; + else + errstr = ERRSTR3; + + mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), " + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name, + slot->ds_node_num, (unsigned long long)slot->ds_last_generation, + (unsigned long long)slot->ds_last_time, hb_block->hb_node, + (unsigned long long)le64_to_cpu(hb_block->hb_generation), + (unsigned long long)le64_to_cpu(hb_block->hb_seq)); + + return 0; +} + +static inline void o2hb_prepare_block(struct o2hb_region *reg, + u64 generation) +{ + int node_num; + u64 cputime; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + node_num = o2nm_this_node(); + slot = ®->hr_slots[node_num]; + + hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; + memset(hb_block, 0, reg->hr_block_bytes); + /* TODO: time stuff */ + cputime = CURRENT_TIME.tv_sec; + if (!cputime) + cputime = 1; + + hb_block->hb_seq = cpu_to_le64(cputime); + hb_block->hb_node = node_num; + hb_block->hb_generation = cpu_to_le64(generation); + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); + + /* This step must always happen last! */ + hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, + hb_block)); + + mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n", + (long long)generation, + le32_to_cpu(hb_block->hb_cksum)); +} + +static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, + struct o2nm_node *node, + int idx) +{ + struct o2hb_callback_func *f; + + list_for_each_entry(f, &hbcall->list, hc_item) { + mlog(ML_HEARTBEAT, "calling funcs %p\n", f); + (f->hc_func)(node, idx, f->hc_data); + } +} + +/* Will run the list in order until we process the passed event */ +static void o2hb_run_event_list(struct o2hb_node_event *queued_event) +{ + struct o2hb_callback *hbcall; + struct o2hb_node_event *event; + + /* Holding callback sem assures we don't alter the callback + * lists when doing this, and serializes ourselves with other + * processes wanting callbacks. */ + down_write(&o2hb_callback_sem); + + spin_lock(&o2hb_live_lock); + while (!list_empty(&o2hb_node_events) + && !list_empty(&queued_event->hn_item)) { + event = list_entry(o2hb_node_events.next, + struct o2hb_node_event, + hn_item); + list_del_init(&event->hn_item); + spin_unlock(&o2hb_live_lock); + + mlog(ML_HEARTBEAT, "Node %s event for %d\n", + event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN", + event->hn_node_num); + + hbcall = hbcall_from_type(event->hn_event_type); + + /* We should *never* have gotten on to the list with a + * bad type... This isn't something that we should try + * to recover from. */ + BUG_ON(IS_ERR(hbcall)); + + o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num); + + spin_lock(&o2hb_live_lock); + } + spin_unlock(&o2hb_live_lock); + + up_write(&o2hb_callback_sem); +} + +static void o2hb_queue_node_event(struct o2hb_node_event *event, + enum o2hb_callback_type type, + struct o2nm_node *node, + int node_num) +{ + assert_spin_locked(&o2hb_live_lock); + + BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); + + event->hn_event_type = type; + event->hn_node = node; + event->hn_node_num = node_num; + + mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n", + type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num); + + list_add_tail(&event->hn_item, &o2hb_node_events); +} + +static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) +{ + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + int queued = 0; + + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) + return; + + spin_lock(&o2hb_live_lock); + if (!list_empty(&slot->ds_live_item)) { + mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n", + slot->ds_node_num); + + list_del_init(&slot->ds_live_item); + + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, + slot->ds_node_num); + queued = 1; + } + } + spin_unlock(&o2hb_live_lock); + + if (queued) + o2hb_run_event_list(&event); + + o2nm_node_put(node); +} + +static void o2hb_set_quorum_device(struct o2hb_region *reg) +{ + if (!o2hb_global_heartbeat_active()) + return; + + /* Prevent race with o2hb_heartbeat_group_drop_item() */ + if (kthread_should_stop()) + return; + + /* Tag region as quorum only after thread reaches steady state */ + if (atomic_read(®->hr_steady_iterations) != 0) + return; + + spin_lock(&o2hb_live_lock); + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + goto unlock; + + /* + * A region can be added to the quorum only when it sees all + * live nodes heartbeat on it. In other words, the region has been + * added to all nodes. + */ + if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + sizeof(o2hb_live_node_bitmap))) + goto unlock; + + printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", + config_item_name(®->hr_item), reg->hr_dev_name); + + set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + + /* + * If global heartbeat active, unpin all regions if the + * region count > CUT_OFF + */ + if (bitmap_weight(o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) + o2hb_region_unpin(NULL); +unlock: + spin_unlock(&o2hb_live_lock); +} + +static int o2hb_check_slot(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + int changed = 0, gen_changed = 0; + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; + u64 cputime; + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; + unsigned int slot_dead_ms; + int tmp; + int queued = 0; + + memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); + + /* + * If a node is no longer configured but is still in the livemap, we + * may need to clear that bit from the livemap. + */ + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) { + spin_lock(&o2hb_live_lock); + tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); + spin_unlock(&o2hb_live_lock); + if (!tmp) + return 0; + } + + if (!o2hb_verify_crc(reg, hb_block)) { + /* all paths from here will drop o2hb_live_lock for + * us. */ + spin_lock(&o2hb_live_lock); + + /* Don't print an error on the console in this case - + * a freshly formatted heartbeat area will not have a + * crc set on it. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* The node is live but pushed out a bad crc. We + * consider it a transient miss but don't populate any + * other values as they may be junk. */ + mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", + slot->ds_node_num, reg->hr_dev_name); + o2hb_dump_slot(hb_block); + + slot->ds_equal_samples++; + goto fire_callbacks; + } + + /* we don't care if these wrap.. the state transitions below + * clear at the right places */ + cputime = le64_to_cpu(hb_block->hb_seq); + if (slot->ds_last_time != cputime) + slot->ds_changed_samples++; + else + slot->ds_equal_samples++; + slot->ds_last_time = cputime; + + /* The node changed heartbeat generations. We assume this to + * mean it dropped off but came back before we timed out. We + * want to consider it down for the time being but don't want + * to lose any changed_samples state we might build up to + * considering it live again. */ + if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) { + gen_changed = 1; + slot->ds_equal_samples = 0; + mlog(ML_HEARTBEAT, "Node %d changed generation (0x%llx " + "to 0x%llx)\n", slot->ds_node_num, + (long long)slot->ds_last_generation, + (long long)le64_to_cpu(hb_block->hb_generation)); + } + + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + + mlog(ML_HEARTBEAT, "Slot %d gen 0x%llx cksum 0x%x " + "seq %llu last %llu changed %u equal %u\n", + slot->ds_node_num, (long long)slot->ds_last_generation, + le32_to_cpu(hb_block->hb_cksum), + (unsigned long long)le64_to_cpu(hb_block->hb_seq), + (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, + slot->ds_equal_samples); + + spin_lock(&o2hb_live_lock); + +fire_callbacks: + /* dead nodes only come to life after some number of + * changes at any time during their dead time */ + if (list_empty(&slot->ds_live_item) && + slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) { + mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", + slot->ds_node_num, (long long)slot->ds_last_generation); + + set_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + + /* first on the list generates a callback */ + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " + "bitmap\n", slot->ds_node_num); + set_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, + slot->ds_node_num); + + changed = 1; + queued = 1; + } + + list_add_tail(&slot->ds_live_item, + &o2hb_live_slots[slot->ds_node_num]); + + slot->ds_equal_samples = 0; + + /* We want to be sure that all nodes agree on the + * number of milliseconds before a node will be + * considered dead. The self-fencing timeout is + * computed from this value, and a discrepancy might + * result in heartbeat calling a node dead when it + * hasn't self-fenced yet. */ + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); + if (slot_dead_ms && slot_dead_ms != dead_ms) { + /* TODO: Perhaps we can fail the region here. */ + mlog(ML_ERROR, "Node %d on device %s has a dead count " + "of %u ms, but our count is %u ms.\n" + "Please double check your configuration values " + "for 'O2CB_HEARTBEAT_THRESHOLD'\n", + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, + dead_ms); + } + goto out; + } + + /* if the list is dead, we're done.. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* live nodes only go dead after enough consequtive missed + * samples.. reset the missed counter whenever we see + * activity */ + if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { + mlog(ML_HEARTBEAT, "Node %d left my region\n", + slot->ds_node_num); + + clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + + /* last off the live_slot generates a callback */ + list_del_init(&slot->ds_live_item); + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " + "nodes bitmap\n", slot->ds_node_num); + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + /* node can be null */ + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, + node, slot->ds_node_num); + + changed = 1; + queued = 1; + } + + /* We don't clear this because the node is still + * actually writing new blocks. */ + if (!gen_changed) + slot->ds_changed_samples = 0; + goto out; + } + if (slot->ds_changed_samples) { + slot->ds_changed_samples = 0; + slot->ds_equal_samples = 0; + } +out: + spin_unlock(&o2hb_live_lock); + + if (queued) + o2hb_run_event_list(&event); + + if (node) + o2nm_node_put(node); + return changed; +} + +static int o2hb_highest_node(unsigned long *nodes, int numbits) +{ + return find_last_bit(nodes, numbits); +} + +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) +{ + int i, ret, highest_node; + int membership_change = 0, own_slot_ok = 0; + unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct o2hb_bio_wait_ctxt write_wc; + + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + goto bail; + } + + /* + * If a node is not configured but is in the livemap, we still need + * to read the slot so as to be able to remove it from the livemap. + */ + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + set_bit(i, configured_nodes); + } + + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES) { + mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); + ret = -EINVAL; + goto bail; + } + + /* No sense in reading the slots of nodes that don't exist + * yet. Of course, if the node definitions have holes in them + * then we're reading an empty slot anyway... Consider this + * best-effort. */ + ret = o2hb_read_slots(reg, highest_node + 1); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + /* With an up to date view of the slots, we can check that no + * other node has been improperly configured to heartbeat in + * our slot. */ + own_slot_ok = o2hb_check_own_slot(reg); + + /* fill in the proper info for our next heartbeat */ + o2hb_prepare_block(reg, reg->hr_generation); + + ret = o2hb_issue_node_write(reg, &write_wc); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + i = -1; + while((i = find_next_bit(configured_nodes, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + membership_change |= o2hb_check_slot(reg, ®->hr_slots[i]); + } + + /* + * We have to be sure we've advertised ourselves on disk + * before we can go to steady state. This ensures that + * people we find in our steady state have seen us. + */ + o2hb_wait_on_io(reg, &write_wc); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + ret = write_wc.wc_error; + goto bail; + } + + /* Skip disarming the timeout if own slot has stale/bad data */ + if (own_slot_ok) { + o2hb_set_quorum_device(reg); + o2hb_arm_write_timeout(reg); + } + +bail: + /* let the person who launched us know when things are steady */ + if (atomic_read(®->hr_steady_iterations) != 0) { + if (!ret && own_slot_ok && !membership_change) { + if (atomic_dec_and_test(®->hr_steady_iterations)) + wake_up(&o2hb_steady_queue); + } + } + + if (atomic_read(®->hr_steady_iterations) != 0) { + if (atomic_dec_and_test(®->hr_unsteady_iterations)) { + printk(KERN_NOTICE "o2hb: Unable to stabilize " + "heartbeart on region %s (%s)\n", + config_item_name(®->hr_item), + reg->hr_dev_name); + atomic_set(®->hr_steady_iterations, 0); + reg->hr_aborted_start = 1; + wake_up(&o2hb_steady_queue); + ret = -EIO; + } + } + + return ret; +} + +/* Subtract b from a, storing the result in a. a *must* have a larger + * value than b. */ +static void o2hb_tv_subtract(struct timeval *a, + struct timeval *b) +{ + /* just return 0 when a is after b */ + if (a->tv_sec < b->tv_sec || + (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { + a->tv_sec = 0; + a->tv_usec = 0; + return; + } + + a->tv_sec -= b->tv_sec; + a->tv_usec -= b->tv_usec; + while ( a->tv_usec < 0 ) { + a->tv_sec--; + a->tv_usec += 1000000; + } +} + +static unsigned int o2hb_elapsed_msecs(struct timeval *start, + struct timeval *end) +{ + struct timeval res = *end; + + o2hb_tv_subtract(&res, start); + + return res.tv_sec * 1000 + res.tv_usec / 1000; +} + +/* + * we ride the region ref that the region dir holds. before the region + * dir is removed and drops it ref it will wait to tear down this + * thread. + */ +static int o2hb_thread(void *data) +{ + int i, ret; + struct o2hb_region *reg = data; + struct o2hb_bio_wait_ctxt write_wc; + struct timeval before_hb, after_hb; + unsigned int elapsed_msec; + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); + + set_user_nice(current, MIN_NICE); + + /* Pin node */ + o2nm_depend_this_node(); + + while (!kthread_should_stop() && + !reg->hr_unclean_stop && !reg->hr_aborted_start) { + /* We track the time spent inside + * o2hb_do_disk_heartbeat so that we avoid more than + * hr_timeout_ms between disk writes. On busy systems + * this should result in a heartbeat which is less + * likely to time itself out. */ + do_gettimeofday(&before_hb); + + ret = o2hb_do_disk_heartbeat(reg); + + do_gettimeofday(&after_hb); + elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + + mlog(ML_HEARTBEAT, + "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", + before_hb.tv_sec, (unsigned long) before_hb.tv_usec, + after_hb.tv_sec, (unsigned long) after_hb.tv_usec, + elapsed_msec, ret); + + if (!kthread_should_stop() && + elapsed_msec < reg->hr_timeout_ms) { + /* the kthread api has blocked signals for us so no + * need to record the return value. */ + msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); + } + } + + o2hb_disarm_write_timeout(reg); + + /* unclean stop is only used in very bad situation */ + for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) + o2hb_shutdown_slot(®->hr_slots[i]); + + /* Explicit down notification - avoid forcing the other nodes + * to timeout on this region when we could just as easily + * write a clear generation - thus indicating to them that + * this node has left this region. + */ + if (!reg->hr_unclean_stop && !reg->hr_aborted_start) { + o2hb_prepare_block(reg, 0); + ret = o2hb_issue_node_write(reg, &write_wc); + if (ret == 0) + o2hb_wait_on_io(reg, &write_wc); + else + mlog_errno(ret); + } + + /* Unpin node */ + o2nm_undepend_this_node(); + + mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n"); + + return 0; +} + +#ifdef CONFIG_DEBUG_FS +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + struct o2hb_debug_buf *db = inode->i_private; + struct o2hb_region *reg; + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long lts; + char *buf = NULL; + int i = -1; + int out = 0; + + /* max_nodes should be the largest bitmap we pass here */ + BUG_ON(sizeof(map) < db->db_size); + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + switch (db->db_type) { + case O2HB_DB_TYPE_LIVENODES: + case O2HB_DB_TYPE_LIVEREGIONS: + case O2HB_DB_TYPE_QUORUMREGIONS: + case O2HB_DB_TYPE_FAILEDREGIONS: + spin_lock(&o2hb_live_lock); + memcpy(map, db->db_data, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_LIVENODES: + spin_lock(&o2hb_live_lock); + reg = (struct o2hb_region *)db->db_data; + memcpy(map, reg->hr_live_node_bitmap, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_NUMBER: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + reg->hr_region_num); + goto done; + + case O2HB_DB_TYPE_REGION_ELAPSED_TIME: + reg = (struct o2hb_region *)db->db_data; + lts = reg->hr_last_timeout_start; + /* If 0, it has never been set before */ + if (lts) + lts = jiffies_to_msecs(jiffies - lts); + out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); + goto done; + + case O2HB_DB_TYPE_REGION_PINNED: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + !!reg->hr_item_pinned); + goto done; + + default: + goto done; + } + + while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + +done: + i_size_write(inode, out); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static const struct file_operations o2hb_debug_fops = { + .open = o2hb_debug_open, + .release = o2hb_debug_release, + .read = o2hb_debug_read, + .llseek = generic_file_llseek, +}; + +void o2hb_exit(void) +{ + kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); + debugfs_remove(o2hb_debug_failedregions); + debugfs_remove(o2hb_debug_quorumregions); + debugfs_remove(o2hb_debug_liveregions); + debugfs_remove(o2hb_debug_livenodes); + debugfs_remove(o2hb_debug_dir); +} + +static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, + int type, int size, int len, void *data) +{ + *db = kmalloc(db_len, GFP_KERNEL); + if (!*db) + return NULL; + + (*db)->db_type = type; + (*db)->db_size = size; + (*db)->db_len = len; + (*db)->db_data = data; + + return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, + &o2hb_debug_fops); +} + +static int o2hb_debug_init(void) +{ + int ret = -ENOMEM; + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, + o2hb_debug_dir, + &o2hb_db_livenodes, + sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, + sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, + o2hb_live_node_bitmap); + if (!o2hb_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, + o2hb_debug_dir, + &o2hb_db_liveregions, + sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); + if (!o2hb_debug_liveregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_quorumregions = + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, + o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); + if (!o2hb_debug_quorumregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_failedregions = + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, + o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); + if (!o2hb_debug_failedregions) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + if (ret) + o2hb_exit(); + + return ret; +} + +int o2hb_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++) + INIT_LIST_HEAD(&o2hb_callbacks[i].list); + + for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) + INIT_LIST_HEAD(&o2hb_live_slots[i]); + + INIT_LIST_HEAD(&o2hb_node_events); + + memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); + memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); + memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); + memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + + o2hb_dependent_users = 0; + + return o2hb_debug_init(); +} + +/* if we're already in a callback then we're already serialized by the sem */ +static void o2hb_fill_node_map_from_callback(unsigned long *map, + unsigned bytes) +{ + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); + + memcpy(map, &o2hb_live_node_bitmap, bytes); +} + +/* + * get a map of all nodes that are heartbeating in any regions + */ +void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +{ + /* callers want to serialize this map and callbacks so that they + * can trust that they don't miss nodes coming to the party */ + down_read(&o2hb_callback_sem); + spin_lock(&o2hb_live_lock); + o2hb_fill_node_map_from_callback(map, bytes); + spin_unlock(&o2hb_live_lock); + up_read(&o2hb_callback_sem); +} +EXPORT_SYMBOL_GPL(o2hb_fill_node_map); + +/* + * heartbeat configfs bits. The heartbeat set is a default set under + * the cluster set in nodemanager.c. + */ + +static struct o2hb_region *to_o2hb_region(struct config_item *item) +{ + return item ? container_of(item, struct o2hb_region, hr_item) : NULL; +} + +/* drop_item only drops its ref after killing the thread, nothing should + * be using the region anymore. this has to clean up any state that + * attributes might have built up. */ +static void o2hb_region_release(struct config_item *item) +{ + int i; + struct page *page; + struct o2hb_region *reg = to_o2hb_region(item); + + mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); + + kfree(reg->hr_tmp_block); + + if (reg->hr_slot_data) { + for (i = 0; i < reg->hr_num_pages; i++) { + page = reg->hr_slot_data[i]; + if (page) + __free_page(page); + } + kfree(reg->hr_slot_data); + } + + if (reg->hr_bdev) + blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + + kfree(reg->hr_slots); + + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_livenodes); + debugfs_remove(reg->hr_debug_livenodes); + debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_elapsed_time); + debugfs_remove(reg->hr_debug_pinned); + debugfs_remove(reg->hr_debug_dir); + + spin_lock(&o2hb_live_lock); + list_del(®->hr_all_item); + spin_unlock(&o2hb_live_lock); + + kfree(reg); +} + +static int o2hb_read_block_input(struct o2hb_region *reg, + const char *page, + size_t count, + unsigned long *ret_bytes, + unsigned int *ret_bits) +{ + unsigned long bytes; + char *p = (char *)page; + + bytes = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* Heartbeat and fs min / max block sizes are the same. */ + if (bytes > 4096 || bytes < 512) + return -ERANGE; + if (hweight16(bytes) != 1) + return -EINVAL; + + if (ret_bytes) + *ret_bytes = bytes; + if (ret_bits) + *ret_bits = ffs(bytes) - 1; + + return 0; +} + +static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%u\n", reg->hr_block_bytes); +} + +static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + int status; + unsigned long block_bytes; + unsigned int block_bits; + + if (reg->hr_bdev) + return -EINVAL; + + status = o2hb_read_block_input(reg, page, count, + &block_bytes, &block_bits); + if (status) + return status; + + reg->hr_block_bytes = (unsigned int)block_bytes; + reg->hr_block_bits = block_bits; + + return count; +} + +static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%llu\n", reg->hr_start_block); +} + +static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoull(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + reg->hr_start_block = tmp; + + return count; +} + +static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%d\n", reg->hr_blocks); +} + +static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp > O2NM_MAX_NODES || tmp == 0) + return -ERANGE; + + reg->hr_blocks = (unsigned int)tmp; + + return count; +} + +static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, + char *page) +{ + unsigned int ret = 0; + + if (reg->hr_bdev) + ret = sprintf(page, "%s\n", reg->hr_dev_name); + + return ret; +} + +static void o2hb_init_region_params(struct o2hb_region *reg) +{ + reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; + + mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", + reg->hr_start_block, reg->hr_blocks); + mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n", + reg->hr_block_bytes, reg->hr_block_bits); + mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms); + mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold); +} + +static int o2hb_map_slot_data(struct o2hb_region *reg) +{ + int i, j; + unsigned int last_slot; + unsigned int spp = reg->hr_slots_per_page; + struct page *page; + char *raw; + struct o2hb_disk_slot *slot; + + reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); + if (reg->hr_tmp_block == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slots = kcalloc(reg->hr_blocks, + sizeof(struct o2hb_disk_slot), GFP_KERNEL); + if (reg->hr_slots == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_blocks; i++) { + slot = ®->hr_slots[i]; + slot->ds_node_num = i; + INIT_LIST_HEAD(&slot->ds_live_item); + slot->ds_raw_block = NULL; + } + + reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp; + mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks " + "at %u blocks per page\n", + reg->hr_num_pages, reg->hr_blocks, spp); + + reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), + GFP_KERNEL); + if (!reg->hr_slot_data) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_num_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slot_data[i] = page; + + last_slot = i * spp; + raw = page_address(page); + for (j = 0; + (j < spp) && ((j + last_slot) < reg->hr_blocks); + j++) { + BUG_ON((j + last_slot) >= reg->hr_blocks); + + slot = ®->hr_slots[j + last_slot]; + slot->ds_raw_block = + (struct o2hb_disk_heartbeat_block *) raw; + + raw += reg->hr_block_bytes; + } + } + + return 0; +} + +/* Read in all the slots available and populate the tracking + * structures so that we can start with a baseline idea of what's + * there. */ +static int o2hb_populate_slot_data(struct o2hb_region *reg) +{ + int ret, i; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + ret = o2hb_read_slots(reg, reg->hr_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* We only want to get an idea of the values initially in each + * slot, so we do no verification - o2hb_check_slot will + * actually determine if each configured slot is valid and + * whether any values have changed. */ + for(i = 0; i < reg->hr_blocks; i++) { + slot = ®->hr_slots[i]; + hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block; + + /* Only fill the values that o2hb_check_slot uses to + * determine changing slots */ + slot->ds_last_time = le64_to_cpu(hb_block->hb_seq); + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + } + +out: + return ret; +} + +/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ +static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + struct task_struct *hb_task; + long fd; + int sectsize; + char *p = (char *)page; + struct fd f; + struct inode *inode; + ssize_t ret = -EINVAL; + int live_threshold; + + if (reg->hr_bdev) + goto out; + + /* We can't heartbeat without having had our node number + * configured yet. */ + if (o2nm_this_node() == O2NM_MAX_NODES) + goto out; + + fd = simple_strtol(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + goto out; + + if (fd < 0 || fd >= INT_MAX) + goto out; + + f = fdget(fd); + if (f.file == NULL) + goto out; + + if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || + reg->hr_block_bytes == 0) + goto out2; + + inode = igrab(f.file->f_mapping->host); + if (inode == NULL) + goto out2; + + if (!S_ISBLK(inode->i_mode)) + goto out3; + + reg->hr_bdev = I_BDEV(f.file->f_mapping->host); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); + if (ret) { + reg->hr_bdev = NULL; + goto out3; + } + inode = NULL; + + bdevname(reg->hr_bdev, reg->hr_dev_name); + + sectsize = bdev_logical_block_size(reg->hr_bdev); + if (sectsize != reg->hr_block_bytes) { + mlog(ML_ERROR, + "blocksize %u incorrect for device, expected %d", + reg->hr_block_bytes, sectsize); + ret = -EINVAL; + goto out3; + } + + o2hb_init_region_params(reg); + + /* Generation of zero is invalid */ + do { + get_random_bytes(®->hr_generation, + sizeof(reg->hr_generation)); + } while (reg->hr_generation == 0); + + ret = o2hb_map_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out3; + } + + ret = o2hb_populate_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out3; + } + + INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); + + /* + * A node is considered live after it has beat LIVE_THRESHOLD + * times. We're not steady until we've given them a chance + * _after_ our first read. + * The default threshold is bare minimum so as to limit the delay + * during mounts. For global heartbeat, the threshold doubled for the + * first region. + */ + live_threshold = O2HB_LIVE_THRESHOLD; + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) + live_threshold <<= 1; + spin_unlock(&o2hb_live_lock); + } + ++live_threshold; + atomic_set(®->hr_steady_iterations, live_threshold); + /* unsteady_iterations is double the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); + + hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", + reg->hr_item.ci_name); + if (IS_ERR(hb_task)) { + ret = PTR_ERR(hb_task); + mlog_errno(ret); + goto out3; + } + + spin_lock(&o2hb_live_lock); + reg->hr_task = hb_task; + spin_unlock(&o2hb_live_lock); + + ret = wait_event_interruptible(o2hb_steady_queue, + atomic_read(®->hr_steady_iterations) == 0); + if (ret) { + atomic_set(®->hr_steady_iterations, 0); + reg->hr_aborted_start = 1; + } + + if (reg->hr_aborted_start) { + ret = -EIO; + goto out3; + } + + /* Ok, we were woken. Make sure it wasn't by drop_item() */ + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; + if (o2hb_global_heartbeat_active()) + set_bit(reg->hr_region_num, o2hb_live_region_bitmap); + spin_unlock(&o2hb_live_lock); + + if (hb_task) + ret = count; + else + ret = -EIO; + + if (hb_task && o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", + config_item_name(®->hr_item), reg->hr_dev_name); + +out3: + iput(inode); +out2: + fdput(f); +out: + if (ret < 0) { + if (reg->hr_bdev) { + blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + reg->hr_bdev = NULL; + } + } + return ret; +} + +static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, + char *page) +{ + pid_t pid = 0; + + spin_lock(&o2hb_live_lock); + if (reg->hr_task) + pid = task_pid_nr(reg->hr_task); + spin_unlock(&o2hb_live_lock); + + if (!pid) + return 0; + + return sprintf(page, "%u\n", pid); +} + +struct o2hb_region_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_region *, char *); + ssize_t (*store)(struct o2hb_region *, const char *, size_t); +}; + +static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "block_bytes", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_block_bytes_read, + .store = o2hb_region_block_bytes_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_start_block = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "start_block", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_start_block_read, + .store = o2hb_region_start_block_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_blocks = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "blocks", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_blocks_read, + .store = o2hb_region_blocks_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_dev = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dev", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_dev_read, + .store = o2hb_region_dev_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_pid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "pid", + .ca_mode = S_IRUGO | S_IRUSR }, + .show = o2hb_region_pid_read, +}; + +static struct configfs_attribute *o2hb_region_attrs[] = { + &o2hb_region_attr_block_bytes.attr, + &o2hb_region_attr_start_block.attr, + &o2hb_region_attr_blocks.attr, + &o2hb_region_attr_dev.attr, + &o2hb_region_attr_pid.attr, + NULL, +}; + +static ssize_t o2hb_region_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = 0; + + if (o2hb_region_attr->show) + ret = o2hb_region_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_region_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_region_attr->store) + ret = o2hb_region_attr->store(reg, page, count); + return ret; +} + +static struct configfs_item_operations o2hb_region_item_ops = { + .release = o2hb_region_release, + .show_attribute = o2hb_region_show, + .store_attribute = o2hb_region_store, +}; + +static struct config_item_type o2hb_region_type = { + .ct_item_ops = &o2hb_region_item_ops, + .ct_attrs = o2hb_region_attrs, + .ct_owner = THIS_MODULE, +}; + +/* heartbeat set */ + +struct o2hb_heartbeat_group { + struct config_group hs_group; + /* some stuff? */ +}; + +static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group) +{ + return group ? + container_of(group, struct o2hb_heartbeat_group, hs_group) + : NULL; +} + +static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +{ + int ret = -ENOMEM; + + reg->hr_debug_dir = + debugfs_create_dir(config_item_name(®->hr_item), dir); + if (!reg->hr_debug_dir) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_livenodes = + o2hb_debug_create(O2HB_DEBUG_LIVENODES, + reg->hr_debug_dir, + &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), + O2NM_MAX_NODES, reg); + if (!reg->hr_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_regnum = + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, + reg->hr_debug_dir, + &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, + 0, O2NM_MAX_NODES, reg); + if (!reg->hr_debug_regnum) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_elapsed_time = + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, + reg->hr_debug_dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, + 0, 0, reg); + if (!reg->hr_debug_elapsed_time) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_pinned = + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, + reg->hr_debug_dir, + &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, + 0, 0, reg); + if (!reg->hr_debug_pinned) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + return ret; +} + +static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, + const char *name) +{ + struct o2hb_region *reg = NULL; + int ret; + + reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); + if (reg == NULL) + return ERR_PTR(-ENOMEM); + + if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) { + ret = -ENAMETOOLONG; + goto free; + } + + spin_lock(&o2hb_live_lock); + reg->hr_region_num = 0; + if (o2hb_global_heartbeat_active()) { + reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap, + O2NM_MAX_REGIONS); + if (reg->hr_region_num >= O2NM_MAX_REGIONS) { + spin_unlock(&o2hb_live_lock); + ret = -EFBIG; + goto free; + } + set_bit(reg->hr_region_num, o2hb_region_bitmap); + } + list_add_tail(®->hr_all_item, &o2hb_all_regions); + spin_unlock(&o2hb_live_lock); + + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); + if (ret) { + config_item_put(®->hr_item); + goto free; + } + + return ®->hr_item; +free: + kfree(reg); + return ERR_PTR(ret); +} + +static void o2hb_heartbeat_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct task_struct *hb_task; + struct o2hb_region *reg = to_o2hb_region(item); + int quorum_region = 0; + + /* stop the thread when the user removes the region dir */ + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; + reg->hr_task = NULL; + reg->hr_item_dropped = 1; + spin_unlock(&o2hb_live_lock); + + if (hb_task) + kthread_stop(hb_task); + + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + quorum_region = 1; + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + spin_unlock(&o2hb_live_lock); + printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", + ((atomic_read(®->hr_steady_iterations) == 0) ? + "stopped" : "start aborted"), config_item_name(item), + reg->hr_dev_name); + } + + /* + * If we're racing a dev_write(), we need to wake them. They will + * check reg->hr_task + */ + if (atomic_read(®->hr_steady_iterations) != 0) { + reg->hr_aborted_start = 1; + atomic_set(®->hr_steady_iterations, 0); + wake_up(&o2hb_steady_queue); + } + + config_item_put(item); + + if (!o2hb_global_heartbeat_active() || !quorum_region) + return; + + /* + * If global heartbeat active and there are dependent users, + * pin all regions if quorum region count <= CUT_OFF + */ + spin_lock(&o2hb_live_lock); + + if (!o2hb_dependent_users) + goto unlock; + + if (bitmap_weight(o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); +} + +struct o2hb_heartbeat_group_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_heartbeat_group *, char *); + ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); +}; + +static ssize_t o2hb_heartbeat_group_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = 0; + + if (o2hb_heartbeat_group_attr->show) + ret = o2hb_heartbeat_group_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_heartbeat_group_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_heartbeat_group_attr->store) + ret = o2hb_heartbeat_group_attr->store(reg, page, count); + return ret; +} + +static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%u\n", o2hb_dead_threshold); +} + +static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 10); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* this will validate ranges for us. */ + o2hb_dead_threshold_set((unsigned int) tmp); + + return count; +} + +static +ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%s\n", + o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); +} + +static +ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, + const char *page, size_t count) +{ + unsigned int i; + int ret; + size_t len; + + len = (page[count - 1] == '\n') ? count - 1 : count; + if (!len) + return -EINVAL; + + for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { + if (strncasecmp(page, o2hb_heartbeat_mode_desc[i], len)) + continue; + + ret = o2hb_global_heartbeat_mode_set(i); + if (!ret) + printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", + o2hb_heartbeat_mode_desc[i]); + return count; + } + + return -EINVAL; + +} + +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dead_threshold", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_threshold_show, + .store = o2hb_heartbeat_group_threshold_store, +}; + +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "mode", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_mode_show, + .store = o2hb_heartbeat_group_mode_store, +}; + +static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { + &o2hb_heartbeat_group_attr_threshold.attr, + &o2hb_heartbeat_group_attr_mode.attr, + NULL, +}; + +static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { + .show_attribute = o2hb_heartbeat_group_show, + .store_attribute = o2hb_heartbeat_group_store, +}; + +static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { + .make_item = o2hb_heartbeat_group_make_item, + .drop_item = o2hb_heartbeat_group_drop_item, +}; + +static struct config_item_type o2hb_heartbeat_group_type = { + .ct_group_ops = &o2hb_heartbeat_group_group_ops, + .ct_item_ops = &o2hb_heartbeat_group_item_ops, + .ct_attrs = o2hb_heartbeat_group_attrs, + .ct_owner = THIS_MODULE, +}; + +/* this is just here to avoid touching group in heartbeat.h which the + * entire damn world #includes */ +struct config_group *o2hb_alloc_hb_set(void) +{ + struct o2hb_heartbeat_group *hs = NULL; + struct config_group *ret = NULL; + + hs = kzalloc(sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); + if (hs == NULL) + goto out; + + config_group_init_type_name(&hs->hs_group, "heartbeat", + &o2hb_heartbeat_group_type); + + ret = &hs->hs_group; +out: + if (ret == NULL) + kfree(hs); + return ret; +} + +void o2hb_free_hb_set(struct config_group *group) +{ + struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group); + kfree(hs); +} + +/* hb callback registration and issuing */ + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) +{ + if (type == O2HB_NUM_CB) + return ERR_PTR(-EINVAL); + + return &o2hb_callbacks[type]; +} + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority) +{ + INIT_LIST_HEAD(&hc->hc_item); + hc->hc_func = func; + hc->hc_data = data; + hc->hc_priority = priority; + hc->hc_type = type; + hc->hc_magic = O2HB_CB_MAGIC; +} +EXPORT_SYMBOL_GPL(o2hb_setup_callback); + +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only pin the matching region. In global we pin all the active + * regions. + */ +static int o2hb_region_pin(const char *region_uuid) +{ + int ret = 0, found = 0; + struct o2hb_region *reg; + char *uuid; + + assert_spin_locked(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + + uuid = config_item_name(®->hr_item); + + /* local heartbeat */ + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned || reg->hr_item_dropped) + goto skip_pin; + + /* Ignore ENOENT only for local hb (userdlm domain) */ + ret = o2nm_depend_item(®->hr_item); + if (!ret) { + mlog(ML_CLUSTER, "Pin region %s\n", uuid); + reg->hr_item_pinned = 1; + } else { + if (ret == -ENOENT && found) + ret = 0; + else { + mlog(ML_ERROR, "Pin region %s fails with %d\n", + uuid, ret); + break; + } + } +skip_pin: + if (found) + break; + } + + return ret; +} + +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only unpin the matching region. In global we unpin all the + * active regions. + */ +static void o2hb_region_unpin(const char *region_uuid) +{ + struct o2hb_region *reg; + char *uuid; + int found = 0; + + assert_spin_locked(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + + uuid = config_item_name(®->hr_item); + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned) { + mlog(ML_CLUSTER, "Unpin region %s\n", uuid); + o2nm_undepend_item(®->hr_item); + reg->hr_item_pinned = 0; + } + if (found) + break; + } +} + +static int o2hb_region_inc_user(const char *region_uuid) +{ + int ret = 0; + + spin_lock(&o2hb_live_lock); + + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + ret = o2hb_region_pin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and this is the first dependent user, + * pin all regions if quorum region count <= CUT_OFF + */ + o2hb_dependent_users++; + if (o2hb_dependent_users > 1) + goto unlock; + + if (bitmap_weight(o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + ret = o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); + return ret; +} + +void o2hb_region_dec_user(const char *region_uuid) +{ + spin_lock(&o2hb_live_lock); + + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + o2hb_region_unpin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and there are no dependent users, + * unpin all quorum regions + */ + o2hb_dependent_users--; + if (!o2hb_dependent_users) + o2hb_region_unpin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); +} + +int o2hb_register_callback(const char *region_uuid, + struct o2hb_callback_func *hc) +{ + struct o2hb_callback_func *f; + struct o2hb_callback *hbcall; + int ret; + + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + BUG_ON(!list_empty(&hc->hc_item)); + + hbcall = hbcall_from_type(hc->hc_type); + if (IS_ERR(hbcall)) { + ret = PTR_ERR(hbcall); + goto out; + } + + if (region_uuid) { + ret = o2hb_region_inc_user(region_uuid); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + down_write(&o2hb_callback_sem); + + list_for_each_entry(f, &hbcall->list, hc_item) { + if (hc->hc_priority < f->hc_priority) { + list_add_tail(&hc->hc_item, &f->hc_item); + break; + } + } + if (list_empty(&hc->hc_item)) + list_add_tail(&hc->hc_item, &hbcall->list); + + up_write(&o2hb_callback_sem); + ret = 0; +out: + mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n", + ret, __builtin_return_address(0), hc); + return ret; +} +EXPORT_SYMBOL_GPL(o2hb_register_callback); + +void o2hb_unregister_callback(const char *region_uuid, + struct o2hb_callback_func *hc) +{ + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + + mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", + __builtin_return_address(0), hc); + + /* XXX Can this happen _with_ a region reference? */ + if (list_empty(&hc->hc_item)) + return; + + if (region_uuid) + o2hb_region_dec_user(region_uuid); + + down_write(&o2hb_callback_sem); + + list_del_init(&hc->hc_item); + + up_write(&o2hb_callback_sem); +} +EXPORT_SYMBOL_GPL(o2hb_unregister_callback); + +int o2hb_check_node_heartbeating(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); + +int o2hb_check_node_heartbeating_no_sem(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long flags; + + spin_lock_irqsave(&o2hb_live_lock, flags); + o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem); + +int o2hb_check_node_heartbeating_from_callback(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); + +/* Makes sure our local node is configured with a node number, and is + * heartbeating. */ +int o2hb_check_local_node_heartbeating(void) +{ + u8 node_num; + + /* if this node was set then we have networking */ + node_num = o2nm_this_node(); + if (node_num == O2NM_MAX_NODES) { + mlog(ML_HEARTBEAT, "this node has not been configured.\n"); + return 0; + } + + return o2hb_check_node_heartbeating(node_num); +} +EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); + +/* + * this is just a hack until we get the plumbing which flips file systems + * read only and drops the hb ref instead of killing the node dead. + */ +void o2hb_stop_all_regions(void) +{ + struct o2hb_region *reg; + + mlog(ML_ERROR, "stopping heartbeat on all active regions.\n"); + + spin_lock(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) + reg->hr_unclean_stop = 1; + + spin_unlock(&o2hb_live_lock); +} +EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); + +int o2hb_get_all_regions(char *region_uuids, u8 max_regions) +{ + struct o2hb_region *reg; + int numregs = 0; + char *p; + + spin_lock(&o2hb_live_lock); + + p = region_uuids; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); + if (numregs < max_regions) { + memcpy(p, config_item_name(®->hr_item), + O2HB_MAX_REGION_NAME_LEN); + p += O2HB_MAX_REGION_NAME_LEN; + } + numregs++; + } + + spin_unlock(&o2hb_live_lock); + + return numregs; +} +EXPORT_SYMBOL_GPL(o2hb_get_all_regions); + +int o2hb_global_heartbeat_active(void) +{ + return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL); +} +EXPORT_SYMBOL(o2hb_global_heartbeat_active); diff --git a/kernel/fs/ocfs2/cluster/heartbeat.h b/kernel/fs/ocfs2/cluster/heartbeat.h new file mode 100644 index 000000000..3ef5137dc --- /dev/null +++ b/kernel/fs/ocfs2/cluster/heartbeat.h @@ -0,0 +1,90 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_HEARTBEAT_H +#define O2CLUSTER_HEARTBEAT_H + +#include "ocfs2_heartbeat.h" + +#define O2HB_REGION_TIMEOUT_MS 2000 + +#define O2HB_MAX_REGION_NAME_LEN 32 + +/* number of changes to be seen as live */ +#define O2HB_LIVE_THRESHOLD 2 +/* number of equal samples to be seen as dead */ +extern unsigned int o2hb_dead_threshold; +#define O2HB_DEFAULT_DEAD_THRESHOLD 31 +/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ +#define O2HB_MIN_DEAD_THRESHOLD 2 +#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) + +#define O2HB_CB_MAGIC 0x51d1e4ec + +/* callback stuff */ +enum o2hb_callback_type { + O2HB_NODE_DOWN_CB = 0, + O2HB_NODE_UP_CB, + O2HB_NUM_CB +}; + +struct o2nm_node; +typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *); + +struct o2hb_callback_func { + u32 hc_magic; + struct list_head hc_item; + o2hb_cb_func *hc_func; + void *hc_data; + int hc_priority; + enum o2hb_callback_type hc_type; +}; + +struct config_group *o2hb_alloc_hb_set(void); +void o2hb_free_hb_set(struct config_group *group); + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority); +int o2hb_register_callback(const char *region_uuid, + struct o2hb_callback_func *hc); +void o2hb_unregister_callback(const char *region_uuid, + struct o2hb_callback_func *hc); +void o2hb_fill_node_map(unsigned long *map, + unsigned bytes); +void o2hb_exit(void); +int o2hb_init(void); +int o2hb_check_node_heartbeating(u8 node_num); +int o2hb_check_node_heartbeating_no_sem(u8 node_num); +int o2hb_check_node_heartbeating_from_callback(u8 node_num); +int o2hb_check_local_node_heartbeating(void); +void o2hb_stop_all_regions(void); +int o2hb_get_all_regions(char *region_uuids, u8 numregions); +int o2hb_global_heartbeat_active(void); + +#endif /* O2CLUSTER_HEARTBEAT_H */ diff --git a/kernel/fs/ocfs2/cluster/masklog.c b/kernel/fs/ocfs2/cluster/masklog.c new file mode 100644 index 000000000..af7598bff --- /dev/null +++ b/kernel/fs/ocfs2/cluster/masklog.c @@ -0,0 +1,155 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include "masklog.h" + +struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); +EXPORT_SYMBOL_GPL(mlog_and_bits); +struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0); +EXPORT_SYMBOL_GPL(mlog_not_bits); + +static ssize_t mlog_mask_show(u64 mask, char *buf) +{ + char *state; + + if (__mlog_test_u64(mask, mlog_and_bits)) + state = "allow"; + else if (__mlog_test_u64(mask, mlog_not_bits)) + state = "deny"; + else + state = "off"; + + return snprintf(buf, PAGE_SIZE, "%s\n", state); +} + +static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) +{ + if (!strncasecmp(buf, "allow", 5)) { + __mlog_set_u64(mask, mlog_and_bits); + __mlog_clear_u64(mask, mlog_not_bits); + } else if (!strncasecmp(buf, "deny", 4)) { + __mlog_set_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else if (!strncasecmp(buf, "off", 3)) { + __mlog_clear_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else + return -EINVAL; + + return count; +} + +struct mlog_attribute { + struct attribute attr; + u64 mask; +}; + +#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr) + +#define define_mask(_name) { \ + .attr = { \ + .name = #_name, \ + .mode = S_IRUGO | S_IWUSR, \ + }, \ + .mask = ML_##_name, \ +} + +static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { + define_mask(TCP), + define_mask(MSG), + define_mask(SOCKET), + define_mask(HEARTBEAT), + define_mask(HB_BIO), + define_mask(DLMFS), + define_mask(DLM), + define_mask(DLM_DOMAIN), + define_mask(DLM_THREAD), + define_mask(DLM_MASTER), + define_mask(DLM_RECOVERY), + define_mask(DLM_GLUE), + define_mask(VOTE), + define_mask(CONN), + define_mask(QUORUM), + define_mask(BASTS), + define_mask(CLUSTER), + define_mask(ERROR), + define_mask(NOTICE), + define_mask(KTHREAD), +}; + +static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; + +static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, + char *buf) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_show(mlog_attr->mask, buf); +} + +static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, + const char *buf, size_t count) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_store(mlog_attr->mask, buf, count); +} + +static const struct sysfs_ops mlog_attr_ops = { + .show = mlog_show, + .store = mlog_store, +}; + +static struct kobj_type mlog_ktype = { + .default_attrs = mlog_attr_ptrs, + .sysfs_ops = &mlog_attr_ops, +}; + +static struct kset mlog_kset = { + .kobj = {.ktype = &mlog_ktype}, +}; + +int mlog_sys_init(struct kset *o2cb_kset) +{ + int i = 0; + + while (mlog_attrs[i].attr.mode) { + mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + i++; + } + mlog_attr_ptrs[i] = NULL; + + kobject_set_name(&mlog_kset.kobj, "logmask"); + mlog_kset.kobj.kset = o2cb_kset; + return kset_register(&mlog_kset); +} + +void mlog_sys_shutdown(void) +{ + kset_unregister(&mlog_kset); +} diff --git a/kernel/fs/ocfs2/cluster/masklog.h b/kernel/fs/ocfs2/cluster/masklog.h new file mode 100644 index 000000000..7fdc25a4d --- /dev/null +++ b/kernel/fs/ocfs2/cluster/masklog.h @@ -0,0 +1,221 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_MASKLOG_H +#define O2CLUSTER_MASKLOG_H + +/* + * For now this is a trivial wrapper around printk() that gives the critical + * ability to enable sets of debugging output at run-time. In the future this + * will almost certainly be redirected to relayfs so that it can pay a + * substantially lower heisenberg tax. + * + * Callers associate the message with a bitmask and a global bitmask is + * maintained with help from /proc. If any of the bits match the message is + * output. + * + * We must have efficient bit tests on i386 and it seems gcc still emits crazy + * code for the 64bit compare. It emits very good code for the dual unsigned + * long tests, though, completely avoiding tests that can never pass if the + * caller gives a constant bitmask that fills one of the longs with all 0s. So + * the desire is to have almost all of the calls decided on by comparing just + * one of the longs. This leads to having infrequently given bits that are + * frequently matched in the high bits. + * + * _ERROR and _NOTICE are used for messages that always go to the console and + * have appropriate KERN_ prefixes. We wrap these in our function instead of + * just calling printk() so that this can eventually make its way through + * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. + * The inline tests and macro dance give GCC the opportunity to quite cleverly + * only emit the appropriage printk() when the caller passes in a constant + * mask, as is almost always the case. + * + * All this bitmask nonsense is managed from the files under + * /sys/fs/o2cb/logmask/. Reading the files gives a straightforward + * indication of which bits are allowed (allow) or denied (off/deny). + * ENTRY deny + * EXIT deny + * TCP off + * MSG off + * SOCKET off + * ERROR allow + * NOTICE allow + * + * Writing changes the state of a given bit and requires a strictly formatted + * single write() call: + * + * write(fd, "allow", 5); + * + * Echoing allow/deny/off string into the logmask files can flip the bits + * on or off as expected; here is the bash script for example: + * + * log_mask="/sys/fs/o2cb/log_mask" + * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do + * echo allow >"$log_mask"/"$node" + * done + * + * The debugfs.ocfs2 tool can also flip the bits with the -l option: + * + * debugfs.ocfs2 -l TCP allow + */ + +/* for task_struct */ +#include + +/* bits that are frequently given and infrequently matched in the low word */ +/* NOTE: If you add a flag, you need to also update masklog.c! */ +#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */ +#define ML_MSG 0x0000000000000002ULL /* net network messages */ +#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */ +#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */ +#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */ +#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */ +#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */ +#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */ +#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */ +#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */ +#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */ +#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */ +#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */ +#define ML_CONN 0x0000000000002000ULL /* net connection management */ +#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */ +#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */ +#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */ + +/* bits that are infrequently given and frequently matched in the high word */ +#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ +#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */ +#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ + +#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) +#ifndef MLOG_MASK_PREFIX +#define MLOG_MASK_PREFIX 0 +#endif + +/* + * When logging is disabled, force the bit test to 0 for anything other + * than errors and notices, allowing gcc to remove the code completely. + * When enabled, allow all masks. + */ +#if defined(CONFIG_OCFS2_DEBUG_MASKLOG) +#define ML_ALLOWED_BITS ~0 +#else +#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE) +#endif + +#define MLOG_MAX_BITS 64 + +struct mlog_bits { + unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG]; +}; + +extern struct mlog_bits mlog_and_bits, mlog_not_bits; + +#if BITS_PER_LONG == 32 + +#define __mlog_test_u64(mask, bits) \ + ( (u32)(mask & 0xffffffff) & bits.words[0] || \ + ((u64)(mask) >> 32) & bits.words[1] ) +#define __mlog_set_u64(mask, bits) do { \ + bits.words[0] |= (u32)(mask & 0xffffffff); \ + bits.words[1] |= (u64)(mask) >> 32; \ +} while (0) +#define __mlog_clear_u64(mask, bits) do { \ + bits.words[0] &= ~((u32)(mask & 0xffffffff)); \ + bits.words[1] &= ~((u64)(mask) >> 32); \ +} while (0) +#define MLOG_BITS_RHS(mask) { \ + { \ + [0] = (u32)(mask & 0xffffffff), \ + [1] = (u64)(mask) >> 32, \ + } \ +} + +#else /* 32bit long above, 64bit long below */ + +#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0]) +#define __mlog_set_u64(mask, bits) do { \ + bits.words[0] |= (mask); \ +} while (0) +#define __mlog_clear_u64(mask, bits) do { \ + bits.words[0] &= ~(mask); \ +} while (0) +#define MLOG_BITS_RHS(mask) { { (mask) } } + +#endif + +/* + * smp_processor_id() "helpfully" screams when called outside preemptible + * regions in current kernels. sles doesn't have the variants that don't + * scream. just do this instead of trying to guess which we're building + * against.. *sigh*. + */ +#define __mlog_cpu_guess ({ \ + unsigned long _cpu = get_cpu(); \ + put_cpu(); \ + _cpu; \ +}) + +/* In the following two macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define __mlog_printk(level, fmt, args...) \ + printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ + task_pid_nr(current), __mlog_cpu_guess, \ + __PRETTY_FUNCTION__, __LINE__ , ##args) + +#define mlog(mask, fmt, args...) do { \ + u64 __m = MLOG_MASK_PREFIX | (mask); \ + if ((__m & ML_ALLOWED_BITS) && \ + __mlog_test_u64(__m, mlog_and_bits) && \ + !__mlog_test_u64(__m, mlog_not_bits)) { \ + if (__m & ML_ERROR) \ + __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ + else if (__m & ML_NOTICE) \ + __mlog_printk(KERN_NOTICE, fmt , ##args); \ + else __mlog_printk(KERN_INFO, fmt , ##args); \ + } \ +} while (0) + +#define mlog_errno(st) ({ \ + int _st = (st); \ + if (_st != -ERESTARTSYS && _st != -EINTR && \ + _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ + _st != -EDQUOT) \ + mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ + _st; \ +}) + +#define mlog_bug_on_msg(cond, fmt, args...) do { \ + if (cond) { \ + mlog(ML_ERROR, "bug expression: " #cond "\n"); \ + mlog(ML_ERROR, fmt, ##args); \ + BUG(); \ + } \ +} while (0) + +#include +#include +int mlog_sys_init(struct kset *o2cb_subsys); +void mlog_sys_shutdown(void); + +#endif /* O2CLUSTER_MASKLOG_H */ diff --git a/kernel/fs/ocfs2/cluster/netdebug.c b/kernel/fs/ocfs2/cluster/netdebug.c new file mode 100644 index 000000000..27d1242c8 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/netdebug.c @@ -0,0 +1,539 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * netdebug.c + * + * debug functionality for o2net + * + * Copyright (C) 2005, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifdef CONFIG_DEBUG_FS + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" + +#include "tcp_internal.h" + +#define O2NET_DEBUG_DIR "o2net" +#define SC_DEBUG_NAME "sock_containers" +#define NST_DEBUG_NAME "send_tracking" +#define STATS_DEBUG_NAME "stats" +#define NODES_DEBUG_NAME "connected_nodes" + +#define SHOW_SOCK_CONTAINERS 0 +#define SHOW_SOCK_STATS 1 + +static struct dentry *o2net_dentry; +static struct dentry *sc_dentry; +static struct dentry *nst_dentry; +static struct dentry *stats_dentry; +static struct dentry *nodes_dentry; + +static DEFINE_SPINLOCK(o2net_debug_lock); + +static LIST_HEAD(sock_containers); +static LIST_HEAD(send_tracking); + +void o2net_debug_add_nst(struct o2net_send_tracking *nst) +{ + spin_lock(&o2net_debug_lock); + list_add(&nst->st_net_debug_item, &send_tracking); + spin_unlock(&o2net_debug_lock); +} + +void o2net_debug_del_nst(struct o2net_send_tracking *nst) +{ + spin_lock(&o2net_debug_lock); + if (!list_empty(&nst->st_net_debug_item)) + list_del_init(&nst->st_net_debug_item); + spin_unlock(&o2net_debug_lock); +} + +static struct o2net_send_tracking + *next_nst(struct o2net_send_tracking *nst_start) +{ + struct o2net_send_tracking *nst, *ret = NULL; + + assert_spin_locked(&o2net_debug_lock); + + list_for_each_entry(nst, &nst_start->st_net_debug_item, + st_net_debug_item) { + /* discover the head of the list */ + if (&nst->st_net_debug_item == &send_tracking) + break; + + /* use st_task to detect real nsts in the list */ + if (nst->st_task != NULL) { + ret = nst; + break; + } + } + + return ret; +} + +static void *nst_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + spin_unlock(&o2net_debug_lock); + + return nst; +} + +static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + list_del_init(&dummy_nst->st_net_debug_item); + if (nst) + list_add(&dummy_nst->st_net_debug_item, + &nst->st_net_debug_item); + spin_unlock(&o2net_debug_lock); + + return nst; /* unused, just needs to be null when done */ +} + +static int nst_seq_show(struct seq_file *seq, void *v) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + ktime_t now; + s64 sock, send, status; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + if (!nst) + goto out; + + now = ktime_get(); + sock = ktime_to_us(ktime_sub(now, nst->st_sock_time)); + send = ktime_to_us(ktime_sub(now, nst->st_send_time)); + status = ktime_to_us(ktime_sub(now, nst->st_status_time)); + + /* get_task_comm isn't exported. oh well. */ + seq_printf(seq, "%p:\n" + " pid: %lu\n" + " tgid: %lu\n" + " process name: %s\n" + " node: %u\n" + " sc: %p\n" + " message id: %d\n" + " message type: %u\n" + " message key: 0x%08x\n" + " sock acquiry: %lld usecs ago\n" + " send start: %lld usecs ago\n" + " wait start: %lld usecs ago\n", + nst, (unsigned long)task_pid_nr(nst->st_task), + (unsigned long)nst->st_task->tgid, + nst->st_task->comm, nst->st_node, + nst->st_sc, nst->st_id, nst->st_msg_type, + nst->st_msg_key, + (long long)sock, + (long long)send, + (long long)status); + +out: + spin_unlock(&o2net_debug_lock); + + return 0; +} + +static void nst_seq_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations nst_seq_ops = { + .start = nst_seq_start, + .next = nst_seq_next, + .stop = nst_seq_stop, + .show = nst_seq_show, +}; + +static int nst_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_send_tracking *dummy_nst; + + dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst)); + if (!dummy_nst) + return -ENOMEM; + o2net_debug_add_nst(dummy_nst); + + return 0; +} + +static int nst_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct o2net_send_tracking *dummy_nst = seq->private; + + o2net_debug_del_nst(dummy_nst); + return seq_release_private(inode, file); +} + +static const struct file_operations nst_seq_fops = { + .open = nst_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = nst_fop_release, +}; + +void o2net_debug_add_sc(struct o2net_sock_container *sc) +{ + spin_lock(&o2net_debug_lock); + list_add(&sc->sc_net_debug_item, &sock_containers); + spin_unlock(&o2net_debug_lock); +} + +void o2net_debug_del_sc(struct o2net_sock_container *sc) +{ + spin_lock(&o2net_debug_lock); + list_del_init(&sc->sc_net_debug_item); + spin_unlock(&o2net_debug_lock); +} + +struct o2net_sock_debug { + int dbg_ctxt; + struct o2net_sock_container *dbg_sock; +}; + +static struct o2net_sock_container + *next_sc(struct o2net_sock_container *sc_start) +{ + struct o2net_sock_container *sc, *ret = NULL; + + assert_spin_locked(&o2net_debug_lock); + + list_for_each_entry(sc, &sc_start->sc_net_debug_item, + sc_net_debug_item) { + /* discover the head of the list miscast as a sc */ + if (&sc->sc_net_debug_item == &sock_containers) + break; + + /* use sc_page to detect real scs in the list */ + if (sc->sc_page != NULL) { + ret = sc; + break; + } + } + + return ret; +} + +static void *sc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + spin_unlock(&o2net_debug_lock); + + return sc; +} + +static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + list_del_init(&dummy_sc->sc_net_debug_item); + if (sc) + list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); + spin_unlock(&o2net_debug_lock); + + return sc; /* unused, just needs to be null when done */ +} + +#ifdef CONFIG_OCFS2_FS_STATS +# define sc_send_count(_s) ((_s)->sc_send_count) +# define sc_recv_count(_s) ((_s)->sc_recv_count) +# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total)) +# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total)) +# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total)) +# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total)) +#else +# define sc_send_count(_s) (0U) +# define sc_recv_count(_s) (0U) +# define sc_tv_acquiry_total_ns(_s) (0LL) +# define sc_tv_send_total_ns(_s) (0LL) +# define sc_tv_status_total_ns(_s) (0LL) +# define sc_tv_process_total_ns(_s) (0LL) +#endif + +/* So that debugfs.ocfs2 can determine which format is being used */ +#define O2NET_STATS_STR_VERSION 1 +static void sc_show_sock_stats(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + if (!sc) + return; + + seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION, + sc->sc_node->nd_num, (unsigned long)sc_send_count(sc), + (long long)sc_tv_acquiry_total_ns(sc), + (long long)sc_tv_send_total_ns(sc), + (long long)sc_tv_status_total_ns(sc), + (unsigned long)sc_recv_count(sc), + (long long)sc_tv_process_total_ns(sc)); +} + +static void sc_show_sock_container(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + struct inet_sock *inet = NULL; + __be32 saddr = 0, daddr = 0; + __be16 sport = 0, dport = 0; + + if (!sc) + return; + + if (sc->sc_sock) { + inet = inet_sk(sc->sc_sock->sk); + /* the stack's structs aren't sparse endian clean */ + saddr = (__force __be32)inet->inet_saddr; + daddr = (__force __be32)inet->inet_daddr; + sport = (__force __be16)inet->inet_sport; + dport = (__force __be16)inet->inet_dport; + } + + /* XXX sigh, inet-> doesn't have sparse annotation so any + * use of it here generates a warning with -Wbitwise */ + seq_printf(seq, "%p:\n" + " krefs: %d\n" + " sock: %pI4:%u -> " + "%pI4:%u\n" + " remote node: %s\n" + " page off: %zu\n" + " handshake ok: %u\n" + " timer: %lld usecs\n" + " data ready: %lld usecs\n" + " advance start: %lld usecs\n" + " advance stop: %lld usecs\n" + " func start: %lld usecs\n" + " func stop: %lld usecs\n" + " func key: 0x%08x\n" + " func type: %u\n", + sc, + atomic_read(&sc->sc_kref.refcount), + &saddr, inet ? ntohs(sport) : 0, + &daddr, inet ? ntohs(dport) : 0, + sc->sc_node->nd_name, + sc->sc_page_off, + sc->sc_handshake_ok, + (long long)ktime_to_us(sc->sc_tv_timer), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop), + sc->sc_msg_key, + sc->sc_msg_type); +} + +static int sc_seq_show(struct seq_file *seq, void *v) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + + if (sc) { + if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS) + sc_show_sock_container(seq, sc); + else + sc_show_sock_stats(seq, sc); + } + + spin_unlock(&o2net_debug_lock); + + return 0; +} + +static void sc_seq_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations sc_seq_ops = { + .start = sc_seq_start, + .next = sc_seq_next, + .stop = sc_seq_stop, + .show = sc_seq_show, +}; + +static int sc_common_open(struct file *file, int ctxt) +{ + struct o2net_sock_debug *sd; + struct o2net_sock_container *dummy_sc; + + dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL); + if (!dummy_sc) + return -ENOMEM; + + sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd)); + if (!sd) { + kfree(dummy_sc); + return -ENOMEM; + } + + sd->dbg_ctxt = ctxt; + sd->dbg_sock = dummy_sc; + + o2net_debug_add_sc(dummy_sc); + + return 0; +} + +static int sc_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *dummy_sc = sd->dbg_sock; + + o2net_debug_del_sc(dummy_sc); + return seq_release_private(inode, file); +} + +static int stats_fop_open(struct inode *inode, struct file *file) +{ + return sc_common_open(file, SHOW_SOCK_STATS); +} + +static const struct file_operations stats_seq_fops = { + .open = stats_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +static int sc_fop_open(struct inode *inode, struct file *file) +{ + return sc_common_open(file, SHOW_SOCK_CONTAINERS); +} + +static const struct file_operations sc_seq_fops = { + .open = sc_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +static int o2net_fill_bitmap(char *buf, int len) +{ + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int i = -1, out = 0; + + o2net_fill_node_map(map, sizeof(map)); + + while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + + return out; +} + +static int nodes_fop_open(struct inode *inode, struct file *file) +{ + char *buf; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE)); + + file->private_data = buf; + + return 0; +} + +static int o2net_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t o2net_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} + +static const struct file_operations nodes_fops = { + .open = nodes_fop_open, + .release = o2net_debug_release, + .read = o2net_debug_read, + .llseek = generic_file_llseek, +}; + +void o2net_debugfs_exit(void) +{ + debugfs_remove(nodes_dentry); + debugfs_remove(stats_dentry); + debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); +} + +int o2net_debugfs_init(void) +{ + umode_t mode = S_IFREG|S_IRUSR; + + o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); + if (o2net_dentry) + nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, + o2net_dentry, NULL, &nst_seq_fops); + if (nst_dentry) + sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, + o2net_dentry, NULL, &sc_seq_fops); + if (sc_dentry) + stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, + o2net_dentry, NULL, &stats_seq_fops); + if (stats_dentry) + nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, + o2net_dentry, NULL, &nodes_fops); + if (nodes_dentry) + return 0; + + o2net_debugfs_exit(); + mlog_errno(-ENOMEM); + return -ENOMEM; +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/kernel/fs/ocfs2/cluster/nodemanager.c b/kernel/fs/ocfs2/cluster/nodemanager.c new file mode 100644 index 000000000..441c84e16 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/nodemanager.c @@ -0,0 +1,987 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include "tcp.h" +#include "nodemanager.h" +#include "heartbeat.h" +#include "masklog.h" +#include "sys.h" + +/* for now we operate under the assertion that there can be only one + * cluster active at a time. Changing this will require trickling + * cluster references throughout where nodes are looked up */ +struct o2nm_cluster *o2nm_single_cluster = NULL; + +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ +}; + +struct o2nm_node *o2nm_get_node_by_num(u8 node_num) +{ + struct o2nm_node *node = NULL; + + if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL) + goto out; + + read_lock(&o2nm_single_cluster->cl_nodes_lock); + node = o2nm_single_cluster->cl_nodes[node_num]; + if (node) + config_item_get(&node->nd_item); + read_unlock(&o2nm_single_cluster->cl_nodes_lock); +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_num); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes) +{ + struct o2nm_cluster *cluster = o2nm_single_cluster; + + BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap))); + + if (cluster == NULL) + return -EINVAL; + + read_lock(&cluster->cl_nodes_lock); + memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + read_unlock(&cluster->cl_nodes_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(o2nm_configured_node_map); + +static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster, + __be32 ip_needle, + struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &cluster->cl_node_ip_tree.rb_node; + struct rb_node *parent = NULL; + struct o2nm_node *node, *ret = NULL; + + while (*p) { + int cmp; + + parent = *p; + node = rb_entry(parent, struct o2nm_node, nd_ip_node); + + cmp = memcmp(&ip_needle, &node->nd_ipv4_address, + sizeof(ip_needle)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + ret = node; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +struct o2nm_node *o2nm_get_node_by_ip(__be32 addr) +{ + struct o2nm_node *node = NULL; + struct o2nm_cluster *cluster = o2nm_single_cluster; + + if (cluster == NULL) + goto out; + + read_lock(&cluster->cl_nodes_lock); + node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL); + if (node) + config_item_get(&node->nd_item); + read_unlock(&cluster->cl_nodes_lock); + +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip); + +void o2nm_node_put(struct o2nm_node *node) +{ + config_item_put(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_put); + +void o2nm_node_get(struct o2nm_node *node) +{ + config_item_get(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_get); + +u8 o2nm_this_node(void) +{ + u8 node_num = O2NM_MAX_NODES; + + if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local) + node_num = o2nm_single_cluster->cl_local_node; + + return node_num; +} +EXPORT_SYMBOL_GPL(o2nm_this_node); + +/* node configfs bits */ + +static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item) +{ + return item ? + container_of(to_config_group(item), struct o2nm_cluster, + cl_group) + : NULL; +} + +static struct o2nm_node *to_o2nm_node(struct config_item *item) +{ + return item ? container_of(item, struct o2nm_node, nd_item) : NULL; +} + +static void o2nm_node_release(struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + kfree(node); +} + +static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_num); +} + +static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) +{ + /* through the first node_set .parent + * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); +} + +enum { + O2NM_NODE_ATTR_NUM = 0, + O2NM_NODE_ATTR_PORT, + O2NM_NODE_ATTR_ADDRESS, + O2NM_NODE_ATTR_LOCAL, +}; + +static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp >= O2NM_MAX_NODES) + return -ERANGE; + + /* once we're in the cl_nodes tree networking can look us up by + * node number and try to use our address and port attributes + * to connect to this node.. make sure that they've been set + * before writing the node attribute? */ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + write_lock(&cluster->cl_nodes_lock); + if (cluster->cl_nodes[tmp]) + p = NULL; + else { + cluster->cl_nodes[tmp] = node; + node->nd_num = tmp; + set_bit(tmp, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + if (p == NULL) + return -EEXIST; + + return count; +} +static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); +} + +static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, + const char *page, size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u16)-1) + return -ERANGE; + + node->nd_ipv4_port = htons(tmp); + + return count; +} + +static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%pI4\n", &node->nd_ipv4_address); +} + +static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, + const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + int ret, i; + struct rb_node **p, *parent; + unsigned int octets[4]; + __be32 ipv4_addr = 0; + + ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2], + &octets[1], &octets[0]); + if (ret != 4) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(octets); i++) { + if (octets[i] > 255) + return -ERANGE; + be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); + } + + ret = 0; + write_lock(&cluster->cl_nodes_lock); + if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&node->nd_ip_node, parent, p); + rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); + } + write_unlock(&cluster->cl_nodes_lock); + if (ret) + return ret; + + memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr)); + + return count; +} + +static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_local); +} + +static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + ssize_t ret; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + tmp = !!tmp; /* boolean of whether this node wants to be local */ + + /* setting local turns on networking rx for now so we require having + * set everything else first */ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + /* the only failure case is trying to set a new local node + * when a different one is already set */ + if (tmp && tmp == cluster->cl_has_local && + cluster->cl_local_node != node->nd_num) + return -EBUSY; + + /* bring up the rx thread if we're setting the new local node. */ + if (tmp && !cluster->cl_has_local) { + ret = o2net_start_listening(node); + if (ret) + return ret; + } + + if (!tmp && cluster->cl_has_local && + cluster->cl_local_node == node->nd_num) { + o2net_stop_listening(node); + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + } + + node->nd_local = tmp; + if (node->nd_local) { + cluster->cl_has_local = tmp; + cluster->cl_local_node = node->nd_num; + } + + return count; +} + +struct o2nm_node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_node *, char *); + ssize_t (*store)(struct o2nm_node *, const char *, size_t); +}; + +static struct o2nm_node_attribute o2nm_node_attr_num = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "num", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_num_read, + .store = o2nm_node_num_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_port", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_port_read, + .store = o2nm_node_ipv4_port_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_address", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_address_read, + .store = o2nm_node_ipv4_address_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_local_read, + .store = o2nm_node_local_write, +}; + +static struct configfs_attribute *o2nm_node_attrs[] = { + [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, + [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, + [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, + [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + NULL, +}; + +static int o2nm_attr_index(struct configfs_attribute *attr) +{ + int i; + for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { + if (attr == o2nm_node_attrs[i]) + return i; + } + BUG(); + return 0; +} + +static ssize_t o2nm_node_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret = 0; + + if (o2nm_node_attr->show) + ret = o2nm_node_attr->show(node, page); + return ret; +} + +static ssize_t o2nm_node_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret; + int attr_index = o2nm_attr_index(attr); + + if (o2nm_node_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + if (test_bit(attr_index, &node->nd_set_attributes)) + return -EBUSY; + + ret = o2nm_node_attr->store(node, page, count); + if (ret < count) + goto out; + + set_bit(attr_index, &node->nd_set_attributes); +out: + return ret; +} + +static struct configfs_item_operations o2nm_node_item_ops = { + .release = o2nm_node_release, + .show_attribute = o2nm_node_show, + .store_attribute = o2nm_node_store, +}; + +static struct config_item_type o2nm_node_type = { + .ct_item_ops = &o2nm_node_item_ops, + .ct_attrs = o2nm_node_attrs, + .ct_owner = THIS_MODULE, +}; + +/* node set */ + +struct o2nm_node_group { + struct config_group ns_group; + /* some stuff? */ +}; + +#if 0 +static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) +{ + return group ? + container_of(group, struct o2nm_node_group, ns_group) + : NULL; +} +#endif + +struct o2nm_cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_cluster *, char *); + ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); +}; + +static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, + unsigned int *val) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u32)-1) + return -ERANGE; + + *val = tmp; + + return count; +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_idle_timeout_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change idle timeout after " + "the first peer has agreed to it." + " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val <= cluster->cl_keepalive_delay_ms) { + mlog(ML_NOTICE, "o2net: idle timeout must be larger " + "than keepalive delay\n"); + ret = -EINVAL; + } else { + cluster->cl_idle_timeout_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_keepalive_delay_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change keepalive delay after" + " the first peer has agreed to it." + " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val >= cluster->cl_idle_timeout_ms) { + mlog(ML_NOTICE, "o2net: keepalive delay must be " + "smaller than idle timeout\n"); + ret = -EINVAL; + } else { + cluster->cl_keepalive_delay_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + return o2nm_cluster_attr_write(page, count, + &cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_fence_method_read( + struct o2nm_cluster *cluster, char *page) +{ + ssize_t ret = 0; + + if (cluster) + ret = sprintf(page, "%s\n", + o2nm_fence_method_desc[cluster->cl_fence_method]); + return ret; +} + +static ssize_t o2nm_cluster_attr_fence_method_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + unsigned int i; + + if (page[count - 1] != '\n') + goto bail; + + for (i = 0; i < O2NM_FENCE_METHODS; ++i) { + if (count != strlen(o2nm_fence_method_desc[i]) + 1) + continue; + if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) + continue; + if (cluster->cl_fence_method != i) { + printk(KERN_INFO "ocfs2: Changing fence method to %s\n", + o2nm_fence_method_desc[i]); + cluster->cl_fence_method = i; + } + return count; + } + +bail: + return -EINVAL; +} + +static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "idle_timeout_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_idle_timeout_ms_read, + .store = o2nm_cluster_attr_idle_timeout_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "keepalive_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_keepalive_delay_ms_read, + .store = o2nm_cluster_attr_keepalive_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "reconnect_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_reconnect_delay_ms_read, + .store = o2nm_cluster_attr_reconnect_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "fence_method", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_fence_method_read, + .store = o2nm_cluster_attr_fence_method_write, +}; + +static struct configfs_attribute *o2nm_cluster_attrs[] = { + &o2nm_cluster_attr_idle_timeout_ms.attr, + &o2nm_cluster_attr_keepalive_delay_ms.attr, + &o2nm_cluster_attr_reconnect_delay_ms.attr, + &o2nm_cluster_attr_fence_method.attr, + NULL, +}; +static ssize_t o2nm_cluster_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret = 0; + + if (o2nm_cluster_attr->show) + ret = o2nm_cluster_attr->show(cluster, page); + return ret; +} + +static ssize_t o2nm_cluster_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret; + + if (o2nm_cluster_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_cluster_attr->store(cluster, page, count); + if (ret < count) + goto out; +out: + return ret; +} + +static struct config_item *o2nm_node_group_make_item(struct config_group *group, + const char *name) +{ + struct o2nm_node *node = NULL; + + if (strlen(name) > O2NM_MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); + if (node == NULL) + return ERR_PTR(-ENOMEM); + + strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ + config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); + spin_lock_init(&node->nd_lock); + + mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name); + + return &node->nd_item; +} + +static void o2nm_node_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); + + o2net_disconnect_node(node); + + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } + + /* XXX call into net to stop this node from trading messages */ + + write_lock(&cluster->cl_nodes_lock); + + /* XXX sloppy */ + if (node->nd_ipv4_address) + rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree); + + /* nd_num might be 0 if the node number hasn't been set.. */ + if (cluster->cl_nodes[node->nd_num] == node) { + cluster->cl_nodes[node->nd_num] = NULL; + clear_bit(node->nd_num, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + + mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n", + config_item_name(&node->nd_item)); + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_node_group_group_ops = { + .make_item = o2nm_node_group_make_item, + .drop_item = o2nm_node_group_drop_item, +}; + +static struct config_item_type o2nm_node_group_type = { + .ct_group_ops = &o2nm_node_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +/* cluster */ + +static void o2nm_cluster_release(struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + + kfree(cluster->cl_group.default_groups); + kfree(cluster); +} + +static struct configfs_item_operations o2nm_cluster_item_ops = { + .release = o2nm_cluster_release, + .show_attribute = o2nm_cluster_show, + .store_attribute = o2nm_cluster_store, +}; + +static struct config_item_type o2nm_cluster_type = { + .ct_item_ops = &o2nm_cluster_item_ops, + .ct_attrs = o2nm_cluster_attrs, + .ct_owner = THIS_MODULE, +}; + +/* cluster set */ + +struct o2nm_cluster_group { + struct configfs_subsystem cs_subsys; + /* some stuff? */ +}; + +#if 0 +static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group) +{ + return group ? + container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys) + : NULL; +} +#endif + +static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, + const char *name) +{ + struct o2nm_cluster *cluster = NULL; + struct o2nm_node_group *ns = NULL; + struct config_group *o2hb_group = NULL, *ret = NULL; + void *defs = NULL; + + /* this runs under the parent dir's i_mutex; there can be only + * one caller in here at a time */ + if (o2nm_single_cluster) + return ERR_PTR(-ENOSPC); + + cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); + ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); + defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); + o2hb_group = o2hb_alloc_hb_set(); + if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + goto out; + + config_group_init_type_name(&cluster->cl_group, name, + &o2nm_cluster_type); + config_group_init_type_name(&ns->ns_group, "node", + &o2nm_node_group_type); + + cluster->cl_group.default_groups = defs; + cluster->cl_group.default_groups[0] = &ns->ns_group; + cluster->cl_group.default_groups[1] = o2hb_group; + cluster->cl_group.default_groups[2] = NULL; + rwlock_init(&cluster->cl_nodes_lock); + cluster->cl_node_ip_tree = RB_ROOT; + cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; + cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; + cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; + cluster->cl_fence_method = O2NM_FENCE_RESET; + + ret = &cluster->cl_group; + o2nm_single_cluster = cluster; + +out: + if (ret == NULL) { + kfree(cluster); + kfree(ns); + o2hb_free_hb_set(o2hb_group); + kfree(defs); + ret = ERR_PTR(-ENOMEM); + } + + return ret; +} + +static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + int i; + struct config_item *killme; + + BUG_ON(o2nm_single_cluster != cluster); + o2nm_single_cluster = NULL; + + for (i = 0; cluster->cl_group.default_groups[i]; i++) { + killme = &cluster->cl_group.default_groups[i]->cg_item; + cluster->cl_group.default_groups[i] = NULL; + config_item_put(killme); + } + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_cluster_group_group_ops = { + .make_group = o2nm_cluster_group_make_group, + .drop_item = o2nm_cluster_group_drop_item, +}; + +static struct config_item_type o2nm_cluster_group_type = { + .ct_group_ops = &o2nm_cluster_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct o2nm_cluster_group o2nm_cluster_group = { + .cs_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "cluster", + .ci_type = &o2nm_cluster_group_type, + }, + }, + }, +}; + +int o2nm_depend_item(struct config_item *item) +{ + return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); +} + +void o2nm_undepend_item(struct config_item *item) +{ + configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); +} + +int o2nm_depend_this_node(void) +{ + int ret = 0; + struct o2nm_node *local_node; + + local_node = o2nm_get_node_by_num(o2nm_this_node()); + if (!local_node) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_depend_item(&local_node->nd_item); + o2nm_node_put(local_node); + +out: + return ret; +} + +void o2nm_undepend_this_node(void) +{ + struct o2nm_node *local_node; + + local_node = o2nm_get_node_by_num(o2nm_this_node()); + BUG_ON(!local_node); + + o2nm_undepend_item(&local_node->nd_item); + o2nm_node_put(local_node); +} + + +static void __exit exit_o2nm(void) +{ + /* XXX sync with hb callbacks and shut down hb? */ + o2net_unregister_hb_callbacks(); + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); + o2cb_sys_shutdown(); + + o2net_exit(); + o2hb_exit(); +} + +static int __init init_o2nm(void) +{ + int ret = -1; + + ret = o2hb_init(); + if (ret) + goto out; + + ret = o2net_init(); + if (ret) + goto out_o2hb; + + ret = o2net_register_hb_callbacks(); + if (ret) + goto out_o2net; + + config_group_init(&o2nm_cluster_group.cs_subsys.su_group); + mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); + ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); + if (ret) { + printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); + goto out_callbacks; + } + + ret = o2cb_sys_init(); + if (!ret) + goto out; + + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); +out_callbacks: + o2net_unregister_hb_callbacks(); +out_o2net: + o2net_exit(); +out_o2hb: + o2hb_exit(); +out: + return ret; +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management"); + +module_init(init_o2nm) +module_exit(exit_o2nm) diff --git a/kernel/fs/ocfs2/cluster/nodemanager.h b/kernel/fs/ocfs2/cluster/nodemanager.h new file mode 100644 index 000000000..09ea2d388 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/nodemanager.h @@ -0,0 +1,88 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * nodemanager.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_NODEMANAGER_H +#define O2CLUSTER_NODEMANAGER_H + +#include "ocfs2_nodemanager.h" + +/* This totally doesn't belong here. */ +#include +#include + +enum o2nm_fence_method { + O2NM_FENCE_RESET = 0, + O2NM_FENCE_PANIC, + O2NM_FENCE_METHODS, /* Number of fence methods */ +}; + +struct o2nm_node { + spinlock_t nd_lock; + struct config_item nd_item; + char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */ + __u8 nd_num; + /* only one address per node, as attributes, for now. */ + __be32 nd_ipv4_address; + __be16 nd_ipv4_port; + struct rb_node nd_ip_node; + /* there can be only one local node for now */ + int nd_local; + + unsigned long nd_set_attributes; +}; + +struct o2nm_cluster { + struct config_group cl_group; + unsigned cl_has_local:1; + u8 cl_local_node; + rwlock_t cl_nodes_lock; + struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; + struct rb_root cl_node_ip_tree; + unsigned int cl_idle_timeout_ms; + unsigned int cl_keepalive_delay_ms; + unsigned int cl_reconnect_delay_ms; + enum o2nm_fence_method cl_fence_method; + + /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ + unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +extern struct o2nm_cluster *o2nm_single_cluster; + +u8 o2nm_this_node(void); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes); +struct o2nm_node *o2nm_get_node_by_num(u8 node_num); +struct o2nm_node *o2nm_get_node_by_ip(__be32 addr); +void o2nm_node_get(struct o2nm_node *node); +void o2nm_node_put(struct o2nm_node *node); + +int o2nm_depend_item(struct config_item *item); +void o2nm_undepend_item(struct config_item *item); +int o2nm_depend_this_node(void); +void o2nm_undepend_this_node(void); + +#endif /* O2CLUSTER_NODEMANAGER_H */ diff --git a/kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h b/kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h new file mode 100644 index 000000000..3f4151da9 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_heartbeat.h + * + * On-disk structures for ocfs2_heartbeat + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_HEARTBEAT_H +#define _OCFS2_HEARTBEAT_H + +struct o2hb_disk_heartbeat_block { + __le64 hb_seq; + __u8 hb_node; + __u8 hb_pad1[3]; + __le32 hb_cksum; + __le64 hb_generation; + __le32 hb_dead_ms; +}; + +#endif /* _OCFS2_HEARTBEAT_H */ diff --git a/kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h b/kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h new file mode 100644 index 000000000..49b594325 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_nodemanager.h + * + * Header describing the interface between userspace and the kernel + * for the ocfs2_nodemanager module. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef _OCFS2_NODEMANAGER_H +#define _OCFS2_NODEMANAGER_H + +#define O2NM_API_VERSION 5 + +#define O2NM_MAX_NODES 255 +#define O2NM_INVALID_NODE_NUM 255 + +/* host name, group name, cluster name all 64 bytes */ +#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN + +/* + * Maximum number of global heartbeat regions allowed. + * **CAUTION** Changing this number will break dlm compatibility. + */ +#define O2NM_MAX_REGIONS 32 + +#endif /* _OCFS2_NODEMANAGER_H */ diff --git a/kernel/fs/ocfs2/cluster/quorum.c b/kernel/fs/ocfs2/cluster/quorum.c new file mode 100644 index 000000000..62e8ec619 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/quorum.c @@ -0,0 +1,340 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* This quorum hack is only here until we transition to some more rational + * approach that is driven from userspace. Honest. No foolin'. + * + * Imagine two nodes lose network connectivity to each other but they're still + * up and operating in every other way. Presumably a network timeout indicates + * that a node is broken and should be recovered. They can't both recover each + * other and both carry on without serialising their access to the file system. + * They need to decide who is authoritative. Now extend that problem to + * arbitrary groups of nodes losing connectivity between each other. + * + * So we declare that a node which has given up on connecting to a majority + * of nodes who are still heartbeating will fence itself. + * + * There are huge opportunities for races here. After we give up on a node's + * connection we need to wait long enough to give heartbeat an opportunity + * to declare the node as truly dead. We also need to be careful with the + * race between when we see a node start heartbeating and when we connect + * to it. + * + * So nodes that are in this transtion put a hold on the quorum decision + * with a counter. As they fall out of this transition they drop the count + * and if they're the last, they fire off the decision. + */ +#include +#include +#include + +#include "heartbeat.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_QUORUM +#include "masklog.h" +#include "quorum.h" + +static struct o2quo_state { + spinlock_t qs_lock; + struct work_struct qs_work; + int qs_pending; + int qs_heartbeating; + unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_connected; + unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_holds; + unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +} o2quo_state; + +/* this is horribly heavy-handed. It should instead flip the file + * system RO and call some userspace script. */ +static void o2quo_fence_self(void) +{ + /* panic spins with interrupts enabled. with preempt + * threads can still schedule, etc, etc */ + o2hb_stop_all_regions(); + + switch (o2nm_single_cluster->cl_fence_method) { + case O2NM_FENCE_PANIC: + panic("*** ocfs2 is very sorry to be fencing this system by " + "panicing ***\n"); + break; + default: + WARN_ON(o2nm_single_cluster->cl_fence_method >= + O2NM_FENCE_METHODS); + case O2NM_FENCE_RESET: + printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " + "system by restarting ***\n"); + emergency_restart(); + break; + }; +} + +/* Indicate that a timeout occurred on a hearbeat region write. The + * other nodes in the cluster may consider us dead at that time so we + * want to "fence" ourselves so that we don't scribble on the disk + * after they think they've recovered us. This can't solve all + * problems related to writeout after recovery but this hack can at + * least close some of those gaps. When we have real fencing, this can + * go away as our node would be fenced externally before other nodes + * begin recovery. */ +void o2quo_disk_timeout(void) +{ + o2quo_fence_self(); +} + +static void o2quo_make_decision(struct work_struct *work) +{ + int quorum; + int lowest_hb, lowest_reachable = 0, fence = 0; + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); + if (lowest_hb != O2NM_MAX_NODES) + lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm); + + mlog(0, "heartbeating: %d, connected: %d, " + "lowest: %d (%sreachable)\n", qs->qs_heartbeating, + qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); + + if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) || + qs->qs_heartbeating == 1) + goto out; + + if (qs->qs_heartbeating & 1) { + /* the odd numbered cluster case is straight forward -- + * if we can't talk to the majority we're hosed */ + quorum = (qs->qs_heartbeating + 1)/2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + } else { + /* the even numbered cluster adds the possibility of each half + * of the cluster being able to talk amongst themselves.. in + * that case we're hosed if we can't talk to the group that has + * the lowest numbered node */ + quorum = qs->qs_heartbeating / 2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + else if ((qs->qs_connected == quorum) && + !lowest_reachable) { + mlog(ML_ERROR, "fencing this node because it is " + "connected to a half-quorum of %u out of %u " + "nodes which doesn't include the lowest active " + "node %u\n", quorum, qs->qs_heartbeating, + lowest_hb); + fence = 1; + } + } + +out: + if (fence) { + spin_unlock(&qs->qs_lock); + o2quo_fence_self(); + } else { + mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " + "connected: %d, lowest: %d (%sreachable)\n", + qs->qs_heartbeating, qs->qs_connected, lowest_hb, + lowest_reachable ? "" : "un"); + spin_unlock(&qs->qs_lock); + + } + +} + +static void o2quo_set_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (!test_and_set_bit(node, qs->qs_hold_bm)) { + qs->qs_holds++; + mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, + "node %u\n", node); + mlog(0, "node %u, %d total\n", node, qs->qs_holds); + } +} + +static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (test_and_clear_bit(node, qs->qs_hold_bm)) { + mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1); + if (--qs->qs_holds == 0) { + if (qs->qs_pending) { + qs->qs_pending = 0; + schedule_work(&qs->qs_work); + } + } + mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n", + node, qs->qs_holds); + } +} + +/* as a node comes up we delay the quorum decision until we know the fate of + * the connection. the hold will be droped in conn_up or hb_down. it might be + * perpetuated by con_err until hb_down. if we already have a conn, we might + * be dropping a hold that conn_up got. */ +void o2quo_hb_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating++; + mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node); + set_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + if (!test_bit(node, qs->qs_conn_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* hb going down releases any holds we might have had due to this node from + * conn_up, conn_err, or hb_up */ +void o2quo_hb_down(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating--; + mlog_bug_on_msg(qs->qs_heartbeating < 0, + "node %u, %d heartbeating\n", + node, qs->qs_heartbeating); + mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node); + clear_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* this tells us that we've decided that the node is still heartbeating + * even though we've lost it's conn. it must only be called after conn_err + * and indicates that we must now make a quorum decision in the future, + * though we might be doing so after waiting for holds to drain. Here + * we'll be dropping the hold from conn_err. */ +void o2quo_hb_still_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + mlog(0, "node %u\n", node); + + qs->qs_pending = 1; + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* This is analogous to hb_up. as a node's connection comes up we delay the + * quorum decision until we see it heartbeating. the hold will be droped in + * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if + * it's already heartbeating we might be dropping a hold that conn_up got. + * */ +void o2quo_conn_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_connected++; + mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node); + set_bit(node, qs->qs_conn_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (!test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* we've decided that we won't ever be connecting to the node again. if it's + * still heartbeating we grab a hold that will delay decisions until either the + * node stops heartbeating from hb_down or the caller decides that the node is + * still up and calls still_up */ +void o2quo_conn_err(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + if (test_bit(node, qs->qs_conn_bm)) { + qs->qs_connected--; + mlog_bug_on_msg(qs->qs_connected < 0, + "node %u, connected %d\n", + node, qs->qs_connected); + + clear_bit(node, qs->qs_conn_bm); + } + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +void o2quo_init(void) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock_init(&qs->qs_lock); + INIT_WORK(&qs->qs_work, o2quo_make_decision); +} + +void o2quo_exit(void) +{ + struct o2quo_state *qs = &o2quo_state; + + flush_work(&qs->qs_work); +} diff --git a/kernel/fs/ocfs2/cluster/quorum.h b/kernel/fs/ocfs2/cluster/quorum.h new file mode 100644 index 000000000..6649cc6f6 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/quorum.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_QUORUM_H +#define O2CLUSTER_QUORUM_H + +void o2quo_init(void); +void o2quo_exit(void); + +void o2quo_hb_up(u8 node); +void o2quo_hb_down(u8 node); +void o2quo_hb_still_up(u8 node); +void o2quo_conn_up(u8 node); +void o2quo_conn_err(u8 node); +void o2quo_disk_timeout(void); + +#endif /* O2CLUSTER_QUORUM_H */ diff --git a/kernel/fs/ocfs2/cluster/sys.c b/kernel/fs/ocfs2/cluster/sys.c new file mode 100644 index 000000000..b7f57271d --- /dev/null +++ b/kernel/fs/ocfs2/cluster/sys.c @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.c + * + * OCFS2 cluster sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include + +#include "ocfs2_nodemanager.h" +#include "masklog.h" +#include "sys.h" + + +static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); +} +static struct kobj_attribute attr_version = + __ATTR(interface_revision, S_IRUGO, version_show, NULL); + +static struct attribute *o2cb_attrs[] = { + &attr_version.attr, + NULL, +}; + +static struct attribute_group o2cb_attr_group = { + .attrs = o2cb_attrs, +}; + +static struct kset *o2cb_kset; + +void o2cb_sys_shutdown(void) +{ + mlog_sys_shutdown(); + kset_unregister(o2cb_kset); +} + +int o2cb_sys_init(void) +{ + int ret; + + o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj); + if (!o2cb_kset) + return -ENOMEM; + + ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); + if (ret) + goto error; + + ret = mlog_sys_init(o2cb_kset); + if (ret) + goto error; + return 0; +error: + kset_unregister(o2cb_kset); + return ret; +} diff --git a/kernel/fs/ocfs2/cluster/sys.h b/kernel/fs/ocfs2/cluster/sys.h new file mode 100644 index 000000000..d66b8ab00 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/sys.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.h + * + * Function prototypes for o2cb sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_SYS_H +#define O2CLUSTER_SYS_H + +void o2cb_sys_shutdown(void); +int o2cb_sys_init(void); + +#endif /* O2CLUSTER_SYS_H */ diff --git a/kernel/fs/ocfs2/cluster/tcp.c b/kernel/fs/ocfs2/cluster/tcp.c new file mode 100644 index 000000000..56c403a56 --- /dev/null +++ b/kernel/fs/ocfs2/cluster/tcp.c @@ -0,0 +1,2219 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * ---- + * + * Callers for this were originally written against a very simple synchronus + * API. This implementation reflects those simple callers. Some day I'm sure + * we'll need to move to a more robust posting/callback mechanism. + * + * Transmit calls pass in kernel virtual addresses and block copying this into + * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting + * for a failed socket to timeout. TX callers can also pass in a poniter to an + * 'int' which gets filled with an errno off the wire in response to the + * message they send. + * + * Handlers for unsolicited messages are registered. Each socket has a page + * that incoming data is copied into. First the header, then the data. + * Handlers are called from only one thread with a reference to this per-socket + * page. This page is destroyed after the handler call, so it can't be + * referenced beyond the call. Handlers may block but are discouraged from + * doing so. + * + * Any framing errors (bad magic, large payload lengths) close a connection. + * + * Our sock_container holds the state we associate with a socket. It's current + * framing state is held there as well as the refcounting we do around when it + * is safe to tear down the socket. The socket is only finally torn down from + * the container when the container loses all of its references -- so as long + * as you hold a ref on the container you can trust that the socket is valid + * for use with kernel socket APIs. + * + * Connections are initiated between a pair of nodes when the node with the + * higher node number gets a heartbeat callback which indicates that the lower + * numbered node has started heartbeating. The lower numbered node is passive + * and only accepts the connection if the higher numbered node is heartbeating. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" +#include "quorum.h" + +#include "tcp_internal.h" + +#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u" +#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ + &sc->sc_node->nd_ipv4_address, \ + ntohs(sc->sc_node->nd_ipv4_port) + +/* + * In the following two log macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define msglog(hdr, fmt, args...) do { \ + typeof(hdr) __hdr = (hdr); \ + mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ + "key %08x num %u] " fmt, \ + be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ + be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ + be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ + be32_to_cpu(__hdr->msg_num) , ##args); \ +} while (0) + +#define sclog(sc, fmt, args...) do { \ + typeof(sc) __sc = (sc); \ + mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ + "pg_off %zu] " fmt, __sc, \ + atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ + ##args); \ +} while (0) + +static DEFINE_RWLOCK(o2net_handler_lock); +static struct rb_root o2net_handler_tree = RB_ROOT; + +static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; + +/* XXX someday we'll need better accounting */ +static struct socket *o2net_listen_sock; + +/* + * listen work is only queued by the listening socket callbacks on the + * o2net_wq. teardown detaches the callbacks before destroying the workqueue. + * quorum work is queued as sock containers are shutdown.. stop_listening + * tears down all the node's sock containers, preventing future shutdowns + * and queued quroum work, before canceling delayed quorum work and + * destroying the work queue. + */ +static struct workqueue_struct *o2net_wq; +static struct work_struct o2net_listen_work; + +static struct o2hb_callback_func o2net_hb_up, o2net_hb_down; +#define O2NET_HB_PRI 0x1 + +static struct o2net_handshake *o2net_hand; +static struct o2net_msg *o2net_keep_req, *o2net_keep_resp; + +static int o2net_sys_err_translations[O2NET_ERR_MAX] = + {[O2NET_ERR_NONE] = 0, + [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, + [O2NET_ERR_OVERFLOW] = -EOVERFLOW, + [O2NET_ERR_DIED] = -EHOSTDOWN,}; + +/* can't quite avoid *all* internal declarations :/ */ +static void o2net_sc_connect_completed(struct work_struct *work); +static void o2net_rx_until_empty(struct work_struct *work); +static void o2net_shutdown_sc(struct work_struct *work); +static void o2net_listen_data_ready(struct sock *sk); +static void o2net_sc_send_keep_req(struct work_struct *work); +static void o2net_idle_timer(unsigned long data); +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); + +#ifdef CONFIG_DEBUG_FS +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ + INIT_LIST_HEAD(&nst->st_net_debug_item); + nst->st_task = task; + nst->st_msg_type = msgtype; + nst->st_msg_key = msgkey; + nst->st_node = node; +} + +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ + nst->st_sock_time = ktime_get(); +} + +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ + nst->st_send_time = ktime_get(); +} + +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ + nst->st_status_time = ktime_get(); +} + +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ + nst->st_sc = sc; +} + +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ + nst->st_id = msg_id; +} + +static inline void o2net_set_sock_timer(struct o2net_sock_container *sc) +{ + sc->sc_tv_timer = ktime_get(); +} + +static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_data_ready = ktime_get(); +} + +static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_advance_start = ktime_get(); +} + +static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_advance_stop = ktime_get(); +} + +static inline void o2net_set_func_start_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_func_start = ktime_get(); +} + +static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_func_stop = ktime_get(); +} + +#else /* CONFIG_DEBUG_FS */ +# define o2net_init_nst(a, b, c, d, e) +# define o2net_set_nst_sock_time(a) +# define o2net_set_nst_send_time(a) +# define o2net_set_nst_status_time(a) +# define o2net_set_nst_sock_container(a, b) +# define o2net_set_nst_msg_id(a, b) +# define o2net_set_sock_timer(a) +# define o2net_set_data_ready_time(a) +# define o2net_set_advance_start_time(a) +# define o2net_set_advance_stop_time(a) +# define o2net_set_func_start_time(a) +# define o2net_set_func_stop_time(a) +#endif /* CONFIG_DEBUG_FS */ + +#ifdef CONFIG_OCFS2_FS_STATS +static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) +{ + return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); +} + +static void o2net_update_send_stats(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ + sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, + ktime_sub(ktime_get(), + nst->st_status_time)); + sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, + ktime_sub(nst->st_status_time, + nst->st_send_time)); + sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, + ktime_sub(nst->st_send_time, + nst->st_sock_time)); + sc->sc_send_count++; +} + +static void o2net_update_recv_stats(struct o2net_sock_container *sc) +{ + sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, + o2net_get_func_run_time(sc)); + sc->sc_recv_count++; +} + +#else + +# define o2net_update_send_stats(a, b) + +# define o2net_update_recv_stats(sc) + +#endif /* CONFIG_OCFS2_FS_STATS */ + +static inline unsigned int o2net_reconnect_delay(void) +{ + return o2nm_single_cluster->cl_reconnect_delay_ms; +} + +static inline unsigned int o2net_keepalive_delay(void) +{ + return o2nm_single_cluster->cl_keepalive_delay_ms; +} + +static inline unsigned int o2net_idle_timeout(void) +{ + return o2nm_single_cluster->cl_idle_timeout_ms; +} + +static inline int o2net_sys_err_to_errno(enum o2net_system_error err) +{ + int trans; + BUG_ON(err >= O2NET_ERR_MAX); + trans = o2net_sys_err_translations[err]; + + /* Just in case we mess up the translation table above */ + BUG_ON(err != O2NET_ERR_NONE && trans == 0); + return trans; +} + +static struct o2net_node * o2net_nn_from_num(u8 node_num) +{ + BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes)); + return &o2net_nodes[node_num]; +} + +static u8 o2net_num_from_nn(struct o2net_node *nn) +{ + BUG_ON(nn == NULL); + return nn - o2net_nodes; +} + +/* ------------------------------------------------------------ */ + +static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) +{ + int ret; + + spin_lock(&nn->nn_lock); + ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + nsw->ns_id = ret; + list_add_tail(&nsw->ns_node_item, &nn->nn_status_list); + } + spin_unlock(&nn->nn_lock); + if (ret < 0) + return ret; + + init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; + return 0; +} + +static void o2net_complete_nsw_locked(struct o2net_node *nn, + struct o2net_status_wait *nsw, + enum o2net_system_error sys_status, + s32 status) +{ + assert_spin_locked(&nn->nn_lock); + + if (!list_empty(&nsw->ns_node_item)) { + list_del_init(&nsw->ns_node_item); + nsw->ns_sys_status = sys_status; + nsw->ns_status = status; + idr_remove(&nn->nn_status_idr, nsw->ns_id); + wake_up(&nsw->ns_wq); + } +} + +static void o2net_complete_nsw(struct o2net_node *nn, + struct o2net_status_wait *nsw, + u64 id, enum o2net_system_error sys_status, + s32 status) +{ + spin_lock(&nn->nn_lock); + if (nsw == NULL) { + if (id > INT_MAX) + goto out; + + nsw = idr_find(&nn->nn_status_idr, id); + if (nsw == NULL) + goto out; + } + + o2net_complete_nsw_locked(nn, nsw, sys_status, status); + +out: + spin_unlock(&nn->nn_lock); + return; +} + +static void o2net_complete_nodes_nsw(struct o2net_node *nn) +{ + struct o2net_status_wait *nsw, *tmp; + unsigned int num_kills = 0; + + assert_spin_locked(&nn->nn_lock); + + list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { + o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); + num_kills++; + } + + mlog(0, "completed %d messages for node %u\n", num_kills, + o2net_num_from_nn(nn)); +} + +static int o2net_nsw_completed(struct o2net_node *nn, + struct o2net_status_wait *nsw) +{ + int completed; + spin_lock(&nn->nn_lock); + completed = list_empty(&nsw->ns_node_item); + spin_unlock(&nn->nn_lock); + return completed; +} + +/* ------------------------------------------------------------ */ + +static void sc_kref_release(struct kref *kref) +{ + struct o2net_sock_container *sc = container_of(kref, + struct o2net_sock_container, sc_kref); + BUG_ON(timer_pending(&sc->sc_idle_timeout)); + + sclog(sc, "releasing\n"); + + if (sc->sc_sock) { + sock_release(sc->sc_sock); + sc->sc_sock = NULL; + } + + o2nm_undepend_item(&sc->sc_node->nd_item); + o2nm_node_put(sc->sc_node); + sc->sc_node = NULL; + + o2net_debug_del_sc(sc); + + if (sc->sc_page) + __free_page(sc->sc_page); + kfree(sc); +} + +static void sc_put(struct o2net_sock_container *sc) +{ + sclog(sc, "put\n"); + kref_put(&sc->sc_kref, sc_kref_release); +} +static void sc_get(struct o2net_sock_container *sc) +{ + sclog(sc, "get\n"); + kref_get(&sc->sc_kref); +} +static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) +{ + struct o2net_sock_container *sc, *ret = NULL; + struct page *page = NULL; + int status = 0; + + page = alloc_page(GFP_NOFS); + sc = kzalloc(sizeof(*sc), GFP_NOFS); + if (sc == NULL || page == NULL) + goto out; + + kref_init(&sc->sc_kref); + o2nm_node_get(node); + sc->sc_node = node; + + /* pin the node item of the remote node */ + status = o2nm_depend_item(&node->nd_item); + if (status) { + mlog_errno(status); + o2nm_node_put(node); + goto out; + } + INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); + INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); + INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); + INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); + + init_timer(&sc->sc_idle_timeout); + sc->sc_idle_timeout.function = o2net_idle_timer; + sc->sc_idle_timeout.data = (unsigned long)sc; + + sclog(sc, "alloced\n"); + + ret = sc; + sc->sc_page = page; + o2net_debug_add_sc(sc); + sc = NULL; + page = NULL; + +out: + if (page) + __free_page(page); + kfree(sc); + + return ret; +} + +/* ------------------------------------------------------------ */ + +static void o2net_sc_queue_work(struct o2net_sock_container *sc, + struct work_struct *work) +{ + sc_get(sc); + if (!queue_work(o2net_wq, work)) + sc_put(sc); +} +static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc, + struct delayed_work *work, + int delay) +{ + sc_get(sc); + if (!queue_delayed_work(o2net_wq, work, delay)) + sc_put(sc); +} +static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc, + struct delayed_work *work) +{ + if (cancel_delayed_work(work)) + sc_put(sc); +} + +static atomic_t o2net_connected_peers = ATOMIC_INIT(0); + +int o2net_num_connected_peers(void) +{ + return atomic_read(&o2net_connected_peers); +} + +static void o2net_set_nn_state(struct o2net_node *nn, + struct o2net_sock_container *sc, + unsigned valid, int err) +{ + int was_valid = nn->nn_sc_valid; + int was_err = nn->nn_persistent_error; + struct o2net_sock_container *old_sc = nn->nn_sc; + + assert_spin_locked(&nn->nn_lock); + + if (old_sc && !sc) + atomic_dec(&o2net_connected_peers); + else if (!old_sc && sc) + atomic_inc(&o2net_connected_peers); + + /* the node num comparison and single connect/accept path should stop + * an non-null sc from being overwritten with another */ + BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); + mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); + mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); + + if (was_valid && !valid && err == 0) + err = -ENOTCONN; + + mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", + o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, + nn->nn_persistent_error, err); + + nn->nn_sc = sc; + nn->nn_sc_valid = valid ? 1 : 0; + nn->nn_persistent_error = err; + + /* mirrors o2net_tx_can_proceed() */ + if (nn->nn_persistent_error || nn->nn_sc_valid) + wake_up(&nn->nn_sc_wq); + + if (was_valid && !was_err && nn->nn_persistent_error) { + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + } + + if (was_valid && !valid) { + if (old_sc) + printk(KERN_NOTICE "o2net: No longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); + o2net_complete_nodes_nsw(nn); + } + + if (!was_valid && valid) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_connect_expired); + printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", + o2nm_this_node() > sc->sc_node->nd_num ? + "Connected to" : "Accepted connection from", + SC_NODEF_ARGS(sc)); + } + + /* trigger the connecting worker func as long as we're not valid, + * it will back off if it shouldn't connect. This can be called + * from node config teardown and so needs to be careful about + * the work queue actually being up. */ + if (!valid && o2net_wq) { + unsigned long delay; + /* delay if we're within a RECONNECT_DELAY of the + * last attempt */ + delay = (nn->nn_last_connect_attempt + + msecs_to_jiffies(o2net_reconnect_delay())) + - jiffies; + if (delay > msecs_to_jiffies(o2net_reconnect_delay())) + delay = 0; + mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); + queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); + + /* + * Delay the expired work after idle timeout. + * + * We might have lots of failed connection attempts that run + * through here but we only cancel the connect_expired work when + * a connection attempt succeeds. So only the first enqueue of + * the connect_expired work will do anything. The rest will see + * that it's already queued and do nothing. + */ + delay += msecs_to_jiffies(o2net_idle_timeout()); + queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay); + } + + /* keep track of the nn's sc ref for the caller */ + if ((old_sc == NULL) && sc) + sc_get(sc); + if (old_sc && (old_sc != sc)) { + o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); + sc_put(old_sc); + } +} + +/* see o2net_register_callbacks() */ +static void o2net_data_ready(struct sock *sk) +{ + void (*ready)(struct sock *sk); + + read_lock(&sk->sk_callback_lock); + if (sk->sk_user_data) { + struct o2net_sock_container *sc = sk->sk_user_data; + sclog(sc, "data_ready hit\n"); + o2net_set_data_ready_time(sc); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + ready = sc->sc_data_ready; + } else { + ready = sk->sk_data_ready; + } + read_unlock(&sk->sk_callback_lock); + + ready(sk); +} + +/* see o2net_register_callbacks() */ +static void o2net_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct o2net_sock_container *sc; + + read_lock(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc == NULL) { + state_change = sk->sk_state_change; + goto out; + } + + sclog(sc, "state_change to %d\n", sk->sk_state); + + state_change = sc->sc_state_change; + + switch(sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + o2net_sc_queue_work(sc, &sc->sc_connect_work); + break; + default: + printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT + " shutdown, state %d\n", + SC_NODEF_ARGS(sc), sk->sk_state); + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); + break; + } +out: + read_unlock(&sk->sk_callback_lock); + state_change(sk); +} + +/* + * we register callbacks so we can queue work on events before calling + * the original callbacks. our callbacks our careful to test user_data + * to discover when they've reaced with o2net_unregister_callbacks(). + */ +static void o2net_register_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + write_lock_bh(&sk->sk_callback_lock); + + /* accepted sockets inherit the old listen socket data ready */ + if (sk->sk_data_ready == o2net_listen_data_ready) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + + BUG_ON(sk->sk_user_data != NULL); + sk->sk_user_data = sc; + sc_get(sc); + + sc->sc_data_ready = sk->sk_data_ready; + sc->sc_state_change = sk->sk_state_change; + sk->sk_data_ready = o2net_data_ready; + sk->sk_state_change = o2net_state_change; + + mutex_init(&sc->sc_send_lock); + + write_unlock_bh(&sk->sk_callback_lock); +} + +static int o2net_unregister_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + int ret = 0; + + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data == sc) { + ret = 1; + sk->sk_user_data = NULL; + sk->sk_data_ready = sc->sc_data_ready; + sk->sk_state_change = sc->sc_state_change; + } + write_unlock_bh(&sk->sk_callback_lock); + + return ret; +} + +/* + * this is a little helper that is called by callers who have seen a problem + * with an sc and want to detach it from the nn if someone already hasn't beat + * them to it. if an error is given then the shutdown will be persistent + * and pending transmits will be canceled. + */ +static void o2net_ensure_shutdown(struct o2net_node *nn, + struct o2net_sock_container *sc, + int err) +{ + spin_lock(&nn->nn_lock); + if (nn->nn_sc == sc) + o2net_set_nn_state(nn, NULL, 0, err); + spin_unlock(&nn->nn_lock); +} + +/* + * This work queue function performs the blocking parts of socket shutdown. A + * few paths lead here. set_nn_state will trigger this callback if it sees an + * sc detached from the nn. state_change will also trigger this callback + * directly when it sees errors. In that case we need to call set_nn_state + * ourselves as state_change couldn't get the nn_lock and call set_nn_state + * itself. + */ +static void o2net_shutdown_sc(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_shutdown_work); + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + sclog(sc, "shutting down\n"); + + /* drop the callbacks ref and call shutdown only once */ + if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { + /* we shouldn't flush as we're in the thread, the + * races with pending sc work structs are harmless */ + del_timer_sync(&sc->sc_idle_timeout); + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + sc_put(sc); + kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); + } + + /* not fatal so failed connects before the other guy has our + * heartbeat can be retried */ + o2net_ensure_shutdown(nn, sc, 0); + sc_put(sc); +} + +/* ------------------------------------------------------------ */ + +static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type, + u32 key) +{ + int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); + + if (ret == 0) + ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); + + return ret; +} + +static struct o2net_msg_handler * +o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; + struct o2net_msg_handler *nmh, *ret = NULL; + int cmp; + + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + cmp = o2net_handler_cmp(nmh, msg_type, key); + + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + ret = nmh; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +static void o2net_handler_kref_release(struct kref *kref) +{ + struct o2net_msg_handler *nmh; + nmh = container_of(kref, struct o2net_msg_handler, nh_kref); + + kfree(nmh); +} + +static void o2net_handler_put(struct o2net_msg_handler *nmh) +{ + kref_put(&nmh->nh_kref, o2net_handler_kref_release); +} + +/* max_len is protection for the handler func. incoming messages won't + * be given to the handler if their payload is longer than the max. */ +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, + struct list_head *unreg_list) +{ + struct o2net_msg_handler *nmh = NULL; + struct rb_node **p, *parent; + int ret = 0; + + if (max_len > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "max_len for message handler out of range: %u\n", + max_len); + ret = -EINVAL; + goto out; + } + + if (!msg_type) { + mlog(0, "no message type provided: %u, %p\n", msg_type, func); + ret = -EINVAL; + goto out; + + } + if (!func) { + mlog(0, "no message handler provided: %u, %p\n", + msg_type, func); + ret = -EINVAL; + goto out; + } + + nmh = kzalloc(sizeof(struct o2net_msg_handler), GFP_NOFS); + if (nmh == NULL) { + ret = -ENOMEM; + goto out; + } + + nmh->nh_func = func; + nmh->nh_func_data = data; + nmh->nh_post_func = post_func; + nmh->nh_msg_type = msg_type; + nmh->nh_max_len = max_len; + nmh->nh_key = key; + /* the tree and list get this ref.. they're both removed in + * unregister when this ref is dropped */ + kref_init(&nmh->nh_kref); + INIT_LIST_HEAD(&nmh->nh_unregister_item); + + write_lock(&o2net_handler_lock); + if (o2net_handler_tree_lookup(msg_type, key, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&nmh->nh_node, parent, p); + rb_insert_color(&nmh->nh_node, &o2net_handler_tree); + list_add_tail(&nmh->nh_unregister_item, unreg_list); + + mlog(ML_TCP, "registered handler func %p type %u key %08x\n", + func, msg_type, key); + /* we've had some trouble with handlers seemingly vanishing. */ + mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, + &parent) == NULL, + "couldn't find handler we *just* registered " + "for type %u key %08x\n", msg_type, key); + } + write_unlock(&o2net_handler_lock); + if (ret) + goto out; + +out: + if (ret) + kfree(nmh); + + return ret; +} +EXPORT_SYMBOL_GPL(o2net_register_handler); + +void o2net_unregister_handler_list(struct list_head *list) +{ + struct o2net_msg_handler *nmh, *n; + + write_lock(&o2net_handler_lock); + list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { + mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", + nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); + rb_erase(&nmh->nh_node, &o2net_handler_tree); + list_del_init(&nmh->nh_unregister_item); + kref_put(&nmh->nh_kref, o2net_handler_kref_release); + } + write_unlock(&o2net_handler_lock); +} +EXPORT_SYMBOL_GPL(o2net_unregister_handler_list); + +static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) +{ + struct o2net_msg_handler *nmh; + + read_lock(&o2net_handler_lock); + nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL); + if (nmh) + kref_get(&nmh->nh_kref); + read_unlock(&o2net_handler_lock); + + return nmh; +} + +/* ------------------------------------------------------------ */ + +static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) +{ + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); +} + +static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, + size_t veclen, size_t total) +{ + int ret; + struct msghdr msg = {.msg_flags = 0,}; + + if (sock == NULL) { + ret = -EINVAL; + goto out; + } + + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ +out: + mlog(0, "returning error: %d\n", ret); + return ret; +} + +static void o2net_sendpage(struct o2net_sock_container *sc, + void *kmalloced_virt, + size_t size) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + ssize_t ret; + + while (1) { + mutex_lock(&sc->sc_send_lock); + ret = sc->sc_sock->ops->sendpage(sc->sc_sock, + virt_to_page(kmalloced_virt), + (long)kmalloced_virt & ~PAGE_MASK, + size, MSG_DONTWAIT); + mutex_unlock(&sc->sc_send_lock); + if (ret == size) + break; + if (ret == (ssize_t)-EAGAIN) { + mlog(0, "sendpage of size %zu to " SC_NODEF_FMT + " returned EAGAIN\n", size, SC_NODEF_ARGS(sc)); + cond_resched(); + continue; + } + mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT + " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); + o2net_ensure_shutdown(nn, sc, 0); + break; + } +} + +static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key) +{ + memset(msg, 0, sizeof(struct o2net_msg)); + msg->magic = cpu_to_be16(O2NET_MSG_MAGIC); + msg->data_len = cpu_to_be16(data_len); + msg->msg_type = cpu_to_be16(msg_type); + msg->sys_status = cpu_to_be32(O2NET_ERR_NONE); + msg->status = 0; + msg->key = cpu_to_be32(key); +} + +static int o2net_tx_can_proceed(struct o2net_node *nn, + struct o2net_sock_container **sc_ret, + int *error) +{ + int ret = 0; + + spin_lock(&nn->nn_lock); + if (nn->nn_persistent_error) { + ret = 1; + *sc_ret = NULL; + *error = nn->nn_persistent_error; + } else if (nn->nn_sc_valid) { + kref_get(&nn->nn_sc->sc_kref); + + ret = 1; + *sc_ret = nn->nn_sc; + *error = 0; + } + spin_unlock(&nn->nn_lock); + + return ret; +} + +/* Get a map of all nodes to which this node is currently connected to */ +void o2net_fill_node_map(unsigned long *map, unsigned bytes) +{ + struct o2net_sock_container *sc; + int node, ret; + + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); + + memset(map, 0, bytes); + for (node = 0; node < O2NM_MAX_NODES; ++node) { + if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) + continue; + if (!ret) { + set_bit(node, map); + sc_put(sc); + } + } +} +EXPORT_SYMBOL_GPL(o2net_fill_node_map); + +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, + size_t caller_veclen, u8 target_node, int *status) +{ + int ret = 0; + struct o2net_msg *msg = NULL; + size_t veclen, caller_bytes = 0; + struct kvec *vec = NULL; + struct o2net_sock_container *sc = NULL; + struct o2net_node *nn = o2net_nn_from_num(target_node); + struct o2net_status_wait nsw = { + .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), + }; + struct o2net_send_tracking nst; + + o2net_init_nst(&nst, msg_type, key, current, target_node); + + if (o2net_wq == NULL) { + mlog(0, "attempt to tx without o2netd running\n"); + ret = -ESRCH; + goto out; + } + + if (caller_veclen == 0) { + mlog(0, "bad kvec array length\n"); + ret = -EINVAL; + goto out; + } + + caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); + if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "total payload len %zu too large\n", caller_bytes); + ret = -EINVAL; + goto out; + } + + if (target_node == o2nm_this_node()) { + ret = -ELOOP; + goto out; + } + + o2net_debug_add_nst(&nst); + + o2net_set_nst_sock_time(&nst); + + wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret)); + if (ret) + goto out; + + o2net_set_nst_sock_container(&nst, sc); + + veclen = caller_veclen + 1; + vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); + if (vec == NULL) { + mlog(0, "failed to %zu element kvec!\n", veclen); + ret = -ENOMEM; + goto out; + } + + msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC); + if (!msg) { + mlog(0, "failed to allocate a o2net_msg!\n"); + ret = -ENOMEM; + goto out; + } + + o2net_init_msg(msg, caller_bytes, msg_type, key); + + vec[0].iov_len = sizeof(struct o2net_msg); + vec[0].iov_base = msg; + memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); + + ret = o2net_prep_nsw(nn, &nsw); + if (ret) + goto out; + + msg->msg_num = cpu_to_be32(nsw.ns_id); + o2net_set_nst_msg_id(&nst, nsw.ns_id); + + o2net_set_nst_send_time(&nst); + + /* finally, convert the message header to network byte-order + * and send */ + mutex_lock(&sc->sc_send_lock); + ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, + sizeof(struct o2net_msg) + caller_bytes); + mutex_unlock(&sc->sc_send_lock); + msglog(msg, "sending returned %d\n", ret); + if (ret < 0) { + mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); + goto out; + } + + /* wait on other node's handler */ + o2net_set_nst_status_time(&nst); + wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); + + o2net_update_send_stats(&nst, sc); + + /* Note that we avoid overwriting the callers status return + * variable if a system error was reported on the other + * side. Callers beware. */ + ret = o2net_sys_err_to_errno(nsw.ns_sys_status); + if (status && !ret) + *status = nsw.ns_status; + + mlog(0, "woken, returning system status %d, user status %d\n", + ret, nsw.ns_status); +out: + o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ + if (sc) + sc_put(sc); + kfree(vec); + kfree(msg); + o2net_complete_nsw(nn, &nsw, 0, 0, 0); + return ret; +} +EXPORT_SYMBOL_GPL(o2net_send_message_vec); + +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status) +{ + struct kvec vec = { + .iov_base = data, + .iov_len = len, + }; + return o2net_send_message_vec(msg_type, key, &vec, 1, + target_node, status); +} +EXPORT_SYMBOL_GPL(o2net_send_message); + +static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr, + enum o2net_system_error syserr, int err) +{ + struct kvec vec = { + .iov_base = hdr, + .iov_len = sizeof(struct o2net_msg), + }; + + BUG_ON(syserr >= O2NET_ERR_MAX); + + /* leave other fields intact from the incoming message, msg_num + * in particular */ + hdr->sys_status = cpu_to_be32(syserr); + hdr->status = cpu_to_be32(err); + hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); // twiddle the magic + hdr->data_len = 0; + + msglog(hdr, "about to send status magic %d\n", err); + /* hdr has been in host byteorder this whole time */ + return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg)); +} + +/* this returns -errno if the header was unknown or too large, etc. + * after this is called the buffer us reused for the next message */ +static int o2net_process_message(struct o2net_sock_container *sc, + struct o2net_msg *hdr) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + int ret = 0, handler_status; + enum o2net_system_error syserr; + struct o2net_msg_handler *nmh = NULL; + void *ret_data = NULL; + + msglog(hdr, "processing message\n"); + + o2net_sc_postpone_idle(sc); + + switch(be16_to_cpu(hdr->magic)) { + case O2NET_MSG_STATUS_MAGIC: + /* special type for returning message status */ + o2net_complete_nsw(nn, NULL, + be32_to_cpu(hdr->msg_num), + be32_to_cpu(hdr->sys_status), + be32_to_cpu(hdr->status)); + goto out; + case O2NET_MSG_KEEP_REQ_MAGIC: + o2net_sendpage(sc, o2net_keep_resp, + sizeof(*o2net_keep_resp)); + goto out; + case O2NET_MSG_KEEP_RESP_MAGIC: + goto out; + case O2NET_MSG_MAGIC: + break; + default: + msglog(hdr, "bad magic\n"); + ret = -EINVAL; + goto out; + break; + } + + /* find a handler for it */ + handler_status = 0; + nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type), + be32_to_cpu(hdr->key)); + if (!nmh) { + mlog(ML_TCP, "couldn't find handler for type %u key %08x\n", + be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key)); + syserr = O2NET_ERR_NO_HNDLR; + goto out_respond; + } + + syserr = O2NET_ERR_NONE; + + if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len) + syserr = O2NET_ERR_OVERFLOW; + + if (syserr != O2NET_ERR_NONE) + goto out_respond; + + o2net_set_func_start_time(sc); + sc->sc_msg_key = be32_to_cpu(hdr->key); + sc->sc_msg_type = be16_to_cpu(hdr->msg_type); + handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + + be16_to_cpu(hdr->data_len), + nmh->nh_func_data, &ret_data); + o2net_set_func_stop_time(sc); + + o2net_update_recv_stats(sc); + +out_respond: + /* this destroys the hdr, so don't use it after this */ + mutex_lock(&sc->sc_send_lock); + ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, + handler_status); + mutex_unlock(&sc->sc_send_lock); + hdr = NULL; + mlog(0, "sending handler status %d, syserr %d returned %d\n", + handler_status, syserr, ret); + + if (nmh) { + BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); + if (nmh->nh_post_func) + (nmh->nh_post_func)(handler_status, nmh->nh_func_data, + ret_data); + } + +out: + if (nmh) + o2net_handler_put(nmh); + return ret; +} + +static int o2net_check_handshake(struct o2net_sock_container *sc) +{ + struct o2net_handshake *hand = page_address(sc->sc_page); + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net " + "protocol version %llu but %llu is required. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + (unsigned long long)be64_to_cpu(hand->protocol_version), + O2NET_PROTOCOL_VERSION); + + /* don't bother reconnecting if its the wrong version. */ + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + /* + * Ensure timeouts are consistent with other nodes, otherwise + * we can end up with one node thinking that the other must be down, + * but isn't. This can ultimately cause corruption. + */ + if (be32_to_cpu(hand->o2net_idle_timeout_ms) != + o2net_idle_timeout()) { + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network " + "idle timeout of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_idle_timeout_ms), + o2net_idle_timeout()); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != + o2net_keepalive_delay()) { + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive " + "delay of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_keepalive_delay_ms), + o2net_keepalive_delay()); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != + O2HB_MAX_WRITE_TIMEOUT_MS) { + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat " + "timeout of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + sc->sc_handshake_ok = 1; + + spin_lock(&nn->nn_lock); + /* set valid and queue the idle timers only if it hasn't been + * shut down already */ + if (nn->nn_sc == sc) { + o2net_sc_reset_idle_timer(sc); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, sc, 1, 0); + } + spin_unlock(&nn->nn_lock); + + /* shift everything up as though it wasn't there */ + sc->sc_page_off -= sizeof(struct o2net_handshake); + if (sc->sc_page_off) + memmove(hand, hand + 1, sc->sc_page_off); + + return 0; +} + +/* this demuxes the queued rx bytes into header or payload bits and calls + * handlers as each full message is read off the socket. it returns -error, + * == 0 eof, or > 0 for progress made.*/ +static int o2net_advance_rx(struct o2net_sock_container *sc) +{ + struct o2net_msg *hdr; + int ret = 0; + void *data; + size_t datalen; + + sclog(sc, "receiving\n"); + o2net_set_advance_start_time(sc); + + if (unlikely(sc->sc_handshake_ok == 0)) { + if(sc->sc_page_off < sizeof(struct o2net_handshake)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + } + + if (sc->sc_page_off == sizeof(struct o2net_handshake)) { + o2net_check_handshake(sc); + if (unlikely(sc->sc_handshake_ok == 0)) + ret = -EPROTO; + } + goto out; + } + + /* do we need more header? */ + if (sc->sc_page_off < sizeof(struct o2net_msg)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_msg) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) { + sc->sc_page_off += ret; + /* only swab incoming here.. we can + * only get here once as we cross from + * being under to over */ + if (sc->sc_page_off == sizeof(struct o2net_msg)) { + hdr = page_address(sc->sc_page); + if (be16_to_cpu(hdr->data_len) > + O2NET_MAX_PAYLOAD_BYTES) + ret = -EOVERFLOW; + } + } + if (ret <= 0) + goto out; + } + + if (sc->sc_page_off < sizeof(struct o2net_msg)) { + /* oof, still don't have a header */ + goto out; + } + + /* this was swabbed above when we first read it */ + hdr = page_address(sc->sc_page); + + msglog(hdr, "at page_off %zu\n", sc->sc_page_off); + + /* do we need more payload? */ + if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) { + /* need more payload */ + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) - + sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + if (ret <= 0) + goto out; + } + + if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) { + /* we can only get here once, the first time we read + * the payload.. so set ret to progress if the handler + * works out. after calling this the message is toast */ + ret = o2net_process_message(sc, hdr); + if (ret == 0) + ret = 1; + sc->sc_page_off = 0; + } + +out: + sclog(sc, "ret = %d\n", ret); + o2net_set_advance_stop_time(sc); + return ret; +} + +/* this work func is triggerd by data ready. it reads until it can read no + * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing + * our work the work struct will be marked and we'll be called again. */ +static void o2net_rx_until_empty(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, sc_rx_work); + int ret; + + do { + ret = o2net_advance_rx(sc); + } while (ret > 0); + + if (ret <= 0 && ret != -EAGAIN) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + sclog(sc, "saw error %d, closing\n", ret); + /* not permanent so read failed handshake can retry */ + o2net_ensure_shutdown(nn, sc, 0); + } + + sc_put(sc); +} + +static int o2net_set_nodelay(struct socket *sock) +{ + int ret, val = 1; + mm_segment_t oldfs; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Dear unsuspecting programmer, + * + * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level + * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will + * silently turn into SO_DEBUG. + * + * Yours, + * Keeper of hilariously fragile interfaces. + */ + ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char __user *)&val, sizeof(val)); + + set_fs(oldfs); + return ret; +} + +static int o2net_set_usertimeout(struct socket *sock) +{ + int user_timeout = O2NET_TCP_USER_TIMEOUT; + + return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, sizeof(user_timeout)); +} + +static void o2net_initialize_handshake(void) +{ + o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout()); + o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( + o2net_keepalive_delay()); + o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( + o2net_reconnect_delay()); +} + +/* ------------------------------------------------------------ */ + +/* called when a connect completes and after a sock is accepted. the + * rx path will see the response and mark the sc valid */ +static void o2net_sc_connect_completed(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_connect_work); + + mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", + (unsigned long long)O2NET_PROTOCOL_VERSION, + (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); + + o2net_initialize_handshake(); + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + sc_put(sc); +} + +/* this is called as a work_struct func. */ +static void o2net_sc_send_keep_req(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_keepalive_work.work); + + o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req)); + sc_put(sc); +} + +/* socket shutdown does a del_timer_sync against this as it tears down. + * we can't start this timer until we've got to the point in sc buildup + * where shutdown is going to be involved */ +static void o2net_idle_timer(unsigned long data) +{ + struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); +#ifdef CONFIG_DEBUG_FS + unsigned long msecs = ktime_to_ms(ktime_get()) - + ktime_to_ms(sc->sc_tv_timer); +#else + unsigned long msecs = o2net_idle_timeout(); +#endif + + printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " + "idle for %lu.%lu secs.\n", + SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); + + /* idle timerout happen, don't shutdown the connection, but + * make fence decision. Maybe the connection can recover before + * the decision is made. + */ + atomic_set(&nn->nn_timeout, 1); + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + + o2net_sc_reset_idle_timer(sc); + +} + +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) +{ + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, + msecs_to_jiffies(o2net_keepalive_delay())); + o2net_set_sock_timer(sc); + mod_timer(&sc->sc_idle_timeout, + jiffies + msecs_to_jiffies(o2net_idle_timeout())); +} + +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + /* clear fence decision since the connection recover from timeout*/ + if (atomic_read(&nn->nn_timeout)) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_still_up); + atomic_set(&nn->nn_timeout, 0); + } + + /* Only push out an existing timer */ + if (timer_pending(&sc->sc_idle_timeout)) + o2net_sc_reset_idle_timer(sc); +} + +/* this work func is kicked whenever a path sets the nn state which doesn't + * have valid set. This includes seeing hb come up, losing a connection, + * having a connect attempt fail, etc. This centralizes the logic which decides + * if a connect attempt should be made or if we should give up and all future + * transmit attempts should fail */ +static void o2net_start_connect(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_connect_work.work); + struct o2net_sock_container *sc = NULL; + struct o2nm_node *node = NULL, *mynode = NULL; + struct socket *sock = NULL; + struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; + int ret = 0, stop; + unsigned int timeout; + unsigned int noio_flag; + + /* + * sock_create allocates the sock with GFP_KERNEL. We must set + * per-process flag PF_MEMALLOC_NOIO so that all allocations done + * by this process are done as if GFP_NOIO was specified. So we + * are not reentering filesystem while doing memory reclaim. + */ + noio_flag = memalloc_noio_save(); + /* if we're greater we initiate tx, otherwise we accept */ + if (o2nm_this_node() <= o2net_num_from_nn(nn)) + goto out; + + /* watch for racing with tearing a node down */ + node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); + if (node == NULL) { + ret = 0; + goto out; + } + + mynode = o2nm_get_node_by_num(o2nm_this_node()); + if (mynode == NULL) { + ret = 0; + goto out; + } + + spin_lock(&nn->nn_lock); + /* + * see if we already have one pending or have given up. + * For nn_timeout, it is set when we close the connection + * because of the idle time out. So it means that we have + * at least connected to that node successfully once, + * now try to connect to it again. + */ + timeout = atomic_read(&nn->nn_timeout); + stop = (nn->nn_sc || + (nn->nn_persistent_error && + (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); + spin_unlock(&nn->nn_lock); + if (stop) + goto out; + + nn->nn_last_connect_attempt = jiffies; + + sc = sc_alloc(node); + if (sc == NULL) { + mlog(0, "couldn't allocate sc\n"); + ret = -ENOMEM; + goto out; + } + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(0, "can't create socket: %d\n", ret); + goto out; + } + sc->sc_sock = sock; /* freed by sc_kref_release */ + + sock->sk->sk_allocation = GFP_ATOMIC; + + myaddr.sin_family = AF_INET; + myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; + myaddr.sin_port = htons(0); /* any port */ + + ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + sizeof(myaddr)); + if (ret) { + mlog(ML_ERROR, "bind failed with %d at address %pI4\n", + ret, &mynode->nd_ipv4_address); + goto out; + } + + ret = o2net_set_nodelay(sc->sc_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + ret = o2net_set_usertimeout(sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + + o2net_register_callbacks(sc->sc_sock->sk, sc); + + spin_lock(&nn->nn_lock); + /* handshake completion will set nn->nn_sc_valid */ + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + remoteaddr.sin_family = AF_INET; + remoteaddr.sin_addr.s_addr = node->nd_ipv4_address; + remoteaddr.sin_port = node->nd_ipv4_port; + + ret = sc->sc_sock->ops->connect(sc->sc_sock, + (struct sockaddr *)&remoteaddr, + sizeof(remoteaddr), + O_NONBLOCK); + if (ret == -EINPROGRESS) + ret = 0; + +out: + if (ret && sc) { + printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT + " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); + /* 0 err so that another will be queued and attempted + * from set_nn_state */ + o2net_ensure_shutdown(nn, sc, 0); + } + if (sc) + sc_put(sc); + if (node) + o2nm_node_put(node); + if (mynode) + o2nm_node_put(mynode); + + memalloc_noio_restore(noio_flag); + return; +} + +static void o2net_connect_expired(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_connect_expired.work); + + spin_lock(&nn->nn_lock); + if (!nn->nn_sc_valid) { + printk(KERN_NOTICE "o2net: No connection established with " + "node %u after %u.%u seconds, check network and" + " cluster configuration.\n", + o2net_num_from_nn(nn), + o2net_idle_timeout() / 1000, + o2net_idle_timeout() % 1000); + + o2net_set_nn_state(nn, NULL, 0, 0); + } + spin_unlock(&nn->nn_lock); +} + +static void o2net_still_up(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_still_up.work); + + o2quo_hb_still_up(o2net_num_from_nn(nn)); +} + +/* ------------------------------------------------------------ */ + +void o2net_disconnect_node(struct o2nm_node *node) +{ + struct o2net_node *nn = o2net_nn_from_num(node->nd_num); + + /* don't reconnect until it's heartbeating again */ + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + spin_unlock(&nn->nn_lock); + + if (o2net_wq) { + cancel_delayed_work(&nn->nn_connect_expired); + cancel_delayed_work(&nn->nn_connect_work); + cancel_delayed_work(&nn->nn_still_up); + flush_workqueue(o2net_wq); + } +} + +static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, + void *data) +{ + o2quo_hb_down(node_num); + + if (!node) + return; + + if (node_num != o2nm_this_node()) + o2net_disconnect_node(node); + + BUG_ON(atomic_read(&o2net_connected_peers) < 0); +} + +static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, + void *data) +{ + struct o2net_node *nn = o2net_nn_from_num(node_num); + + o2quo_hb_up(node_num); + + BUG_ON(!node); + + /* ensure an immediate connect attempt */ + nn->nn_last_connect_attempt = jiffies - + (msecs_to_jiffies(o2net_reconnect_delay()) + 1); + + if (node_num != o2nm_this_node()) { + /* believe it or not, accept and node hearbeating testing + * can succeed for this node before we got here.. so + * only use set_nn_state to clear the persistent error + * if that hasn't already happened */ + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + if (nn->nn_persistent_error) + o2net_set_nn_state(nn, NULL, 0, 0); + spin_unlock(&nn->nn_lock); + } +} + +void o2net_unregister_hb_callbacks(void) +{ + o2hb_unregister_callback(NULL, &o2net_hb_up); + o2hb_unregister_callback(NULL, &o2net_hb_down); +} + +int o2net_register_hb_callbacks(void) +{ + int ret; + + o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB, + o2net_hb_node_down_cb, NULL, O2NET_HB_PRI); + o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, + o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); + + ret = o2hb_register_callback(NULL, &o2net_hb_up); + if (ret == 0) + ret = o2hb_register_callback(NULL, &o2net_hb_down); + + if (ret) + o2net_unregister_hb_callbacks(); + + return ret; +} + +/* ------------------------------------------------------------ */ + +static int o2net_accept_one(struct socket *sock, int *more) +{ + int ret, slen; + struct sockaddr_in sin; + struct socket *new_sock = NULL; + struct o2nm_node *node = NULL; + struct o2nm_node *local_node = NULL; + struct o2net_sock_container *sc = NULL; + struct o2net_node *nn; + unsigned int noio_flag; + + /* + * sock_create_lite allocates the sock with GFP_KERNEL. We must set + * per-process flag PF_MEMALLOC_NOIO so that all allocations done + * by this process are done as if GFP_NOIO was specified. So we + * are not reentering filesystem while doing memory reclaim. + */ + noio_flag = memalloc_noio_save(); + + BUG_ON(sock == NULL); + *more = 0; + ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, + sock->sk->sk_protocol, &new_sock); + if (ret) + goto out; + + new_sock->type = sock->type; + new_sock->ops = sock->ops; + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + if (ret < 0) + goto out; + + *more = 1; + new_sock->sk->sk_allocation = GFP_ATOMIC; + + ret = o2net_set_nodelay(new_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + ret = o2net_set_usertimeout(new_sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + + slen = sizeof(sin); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, + &slen, 1); + if (ret < 0) + goto out; + + node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); + if (node == NULL) { + printk(KERN_NOTICE "o2net: Attempt to connect from unknown " + "node at %pI4:%d\n", &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + if (o2nm_this_node() >= node->nd_num) { + local_node = o2nm_get_node_by_num(o2nm_this_node()); + if (local_node) + printk(KERN_NOTICE "o2net: Unexpected connect attempt " + "seen at node '%s' (%u, %pI4:%d) from " + "node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, + node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + /* this happens all the time when the other node sees our heartbeat + * and tries to connect before we see their heartbeat */ + if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) { + mlog(ML_CONN, "attempt to connect from node '%s' at " + "%pI4:%d but it isn't heartbeating\n", + node->nd_name, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + nn = o2net_nn_from_num(node->nd_num); + + spin_lock(&nn->nn_lock); + if (nn->nn_sc) + ret = -EBUSY; + else + ret = 0; + spin_unlock(&nn->nn_lock); + if (ret) { + printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' " + "at %pI4:%d but it already has an open connection\n", + node->nd_name, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + goto out; + } + + sc = sc_alloc(node); + if (sc == NULL) { + ret = -ENOMEM; + goto out; + } + + sc->sc_sock = new_sock; + new_sock = NULL; + + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + o2net_register_callbacks(sc->sc_sock->sk, sc); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + + o2net_initialize_handshake(); + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + +out: + if (new_sock) + sock_release(new_sock); + if (node) + o2nm_node_put(node); + if (local_node) + o2nm_node_put(local_node); + if (sc) + sc_put(sc); + + memalloc_noio_restore(noio_flag); + return ret; +} + +/* + * This function is invoked in response to one or more + * pending accepts at softIRQ level. We must drain the + * entire que before returning. + */ + +static void o2net_accept_many(struct work_struct *work) +{ + struct socket *sock = o2net_listen_sock; + int more; + int err; + + /* + * It is critical to note that due to interrupt moderation + * at the network driver level, we can't assume to get a + * softIRQ for every single conn since tcp SYN packets + * can arrive back-to-back, and therefore many pending + * accepts may result in just 1 softIRQ. If we terminate + * the o2net_accept_one() loop upon seeing an err, what happens + * to the rest of the conns in the queue? If no new SYN + * arrives for hours, no softIRQ will be delivered, + * and the connections will just sit in the queue. + */ + + for (;;) { + err = o2net_accept_one(sock, &more); + if (!more) + break; + cond_resched(); + } +} + +static void o2net_listen_data_ready(struct sock *sk) +{ + void (*ready)(struct sock *sk); + + read_lock(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (ready == NULL) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* This callback may called twice when a new connection + * is being established as a child socket inherits everything + * from a parent LISTEN socket, including the data_ready cb of + * the parent. This leads to a hazard. In o2net_accept_one() + * we are still initializing the child socket but have not + * changed the inherited data_ready callback yet when + * data starts arriving. + * We avoid this hazard by checking the state. + * For the listening socket, the state will be TCP_LISTEN; for the new + * socket, will be TCP_ESTABLISHED. Also, in this case, + * sk->sk_user_data is not a valid function pointer. + */ + + if (sk->sk_state == TCP_LISTEN) { + queue_work(o2net_wq, &o2net_listen_work); + } else { + ready = NULL; + } + +out: + read_unlock(&sk->sk_callback_lock); + if (ready != NULL) + ready(sk); +} + +static int o2net_open_listening_sock(__be32 addr, __be16 port) +{ + struct socket *sock = NULL; + int ret; + struct sockaddr_in sin = { + .sin_family = PF_INET, + .sin_addr = { .s_addr = addr }, + .sin_port = port, + }; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + printk(KERN_ERR "o2net: Error %d while creating socket\n", ret); + goto out; + } + + sock->sk->sk_allocation = GFP_ATOMIC; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = o2net_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + o2net_listen_sock = sock; + INIT_WORK(&o2net_listen_work, o2net_accept_many); + + sock->sk->sk_reuse = SK_CAN_REUSE; + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + if (ret < 0) { + printk(KERN_ERR "o2net: Error %d while binding socket at " + "%pI4:%u\n", ret, &addr, ntohs(port)); + goto out; + } + + ret = sock->ops->listen(sock, 64); + if (ret < 0) + printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n", + ret, &addr, ntohs(port)); + +out: + if (ret) { + o2net_listen_sock = NULL; + if (sock) + sock_release(sock); + } + return ret; +} + +/* + * called from node manager when we should bring up our network listening + * socket. node manager handles all the serialization to only call this + * once and to match it with o2net_stop_listening(). note, + * o2nm_this_node() doesn't work yet as we're being called while it + * is being set up. + */ +int o2net_start_listening(struct o2nm_node *node) +{ + int ret = 0; + + BUG_ON(o2net_wq != NULL); + BUG_ON(o2net_listen_sock != NULL); + + mlog(ML_KTHREAD, "starting o2net thread...\n"); + o2net_wq = create_singlethread_workqueue("o2net"); + if (o2net_wq == NULL) { + mlog(ML_ERROR, "unable to launch o2net thread\n"); + return -ENOMEM; /* ? */ + } + + ret = o2net_open_listening_sock(node->nd_ipv4_address, + node->nd_ipv4_port); + if (ret) { + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + } else + o2quo_conn_up(node->nd_num); + + return ret; +} + +/* again, o2nm_this_node() doesn't work here as we're involved in + * tearing it down */ +void o2net_stop_listening(struct o2nm_node *node) +{ + struct socket *sock = o2net_listen_sock; + size_t i; + + BUG_ON(o2net_wq == NULL); + BUG_ON(o2net_listen_sock == NULL); + + /* stop the listening socket from generating work */ + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = sock->sk->sk_user_data; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2nm_node *node = o2nm_get_node_by_num(i); + if (node) { + o2net_disconnect_node(node); + o2nm_node_put(node); + } + } + + /* finish all work and tear down the work queue */ + mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n"); + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + + sock_release(o2net_listen_sock); + o2net_listen_sock = NULL; + + o2quo_conn_err(node->nd_num); +} + +/* ------------------------------------------------------------ */ + +int o2net_init(void) +{ + unsigned long i; + + o2quo_init(); + + if (o2net_debugfs_init()) + goto out; + + o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); + o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); + o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); + if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) + goto out; + + o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); + o2net_hand->connector_id = cpu_to_be64(1); + + o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC); + o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2net_node *nn = o2net_nn_from_num(i); + + atomic_set(&nn->nn_timeout, 0); + spin_lock_init(&nn->nn_lock); + INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); + INIT_DELAYED_WORK(&nn->nn_connect_expired, + o2net_connect_expired); + INIT_DELAYED_WORK(&nn->nn_still_up, o2net_still_up); + /* until we see hb from a node we'll return einval */ + nn->nn_persistent_error = -ENOTCONN; + init_waitqueue_head(&nn->nn_sc_wq); + idr_init(&nn->nn_status_idr); + INIT_LIST_HEAD(&nn->nn_status_list); + } + + return 0; + +out: + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + + o2quo_exit(); + return -ENOMEM; +} + +void o2net_exit(void) +{ + o2quo_exit(); + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + o2net_debugfs_exit(); +} diff --git a/kernel/fs/ocfs2/cluster/tcp.h b/kernel/fs/ocfs2/cluster/tcp.h new file mode 100644 index 000000000..c571e849f --- /dev/null +++ b/kernel/fs/ocfs2/cluster/tcp.h @@ -0,0 +1,155 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * tcp.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_TCP_H +#define O2CLUSTER_TCP_H + +#include +#ifdef __KERNEL__ +#include +#include +#else +#include +#endif +#include +#include + +struct o2net_msg +{ + __be16 magic; + __be16 data_len; + __be16 msg_type; + __be16 pad1; + __be32 sys_status; + __be32 status; + __be32 key; + __be32 msg_num; + __u8 buf[0]; +}; + +typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +typedef void (o2net_post_msg_handler_func)(int status, void *data, + void *ret_data); + +#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) + +/* same as hb delay, we're waiting for another node to recognize our hb */ +#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 + +#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 +#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 + +#define O2NET_TCP_USER_TIMEOUT 0x7fffffff + +/* TODO: figure this out.... */ +static inline int o2net_link_down(int err, struct socket *sock) +{ + if (sock) { + if (sock->sk->sk_state != TCP_ESTABLISHED && + sock->sk->sk_state != TCP_CLOSE_WAIT) + return 1; + } + + if (err >= 0) + return 0; + switch (err) { + /* ????????????????????????? */ + case -ERESTARTSYS: + case -EBADF: + /* When the server has died, an ICMP port unreachable + * message prompts ECONNREFUSED. */ + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + return 1; + } + return 0; +} + +enum { + O2NET_DRIVER_UNINITED, + O2NET_DRIVER_READY, +}; + +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status); +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, + size_t veclen, u8 target_node, int *status); + +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, + struct list_head *unreg_list); +void o2net_unregister_handler_list(struct list_head *list); + +void o2net_fill_node_map(unsigned long *map, unsigned bytes); + +struct o2nm_node; +int o2net_register_hb_callbacks(void); +void o2net_unregister_hb_callbacks(void); +int o2net_start_listening(struct o2nm_node *node); +void o2net_stop_listening(struct o2nm_node *node); +void o2net_disconnect_node(struct o2nm_node *node); +int o2net_num_connected_peers(void); + +int o2net_init(void); +void o2net_exit(void); + +struct o2net_send_tracking; +struct o2net_sock_container; + +#ifdef CONFIG_DEBUG_FS +int o2net_debugfs_init(void); +void o2net_debugfs_exit(void); +void o2net_debug_add_nst(struct o2net_send_tracking *nst); +void o2net_debug_del_nst(struct o2net_send_tracking *nst); +void o2net_debug_add_sc(struct o2net_sock_container *sc); +void o2net_debug_del_sc(struct o2net_sock_container *sc); +#else +static inline int o2net_debugfs_init(void) +{ + return 0; +} +static inline void o2net_debugfs_exit(void) +{ +} +static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_debug_add_sc(struct o2net_sock_container *sc) +{ +} +static inline void o2net_debug_del_sc(struct o2net_sock_container *sc) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +#endif /* O2CLUSTER_TCP_H */ diff --git a/kernel/fs/ocfs2/cluster/tcp_internal.h b/kernel/fs/ocfs2/cluster/tcp_internal.h new file mode 100644 index 000000000..b95e7df5b --- /dev/null +++ b/kernel/fs/ocfs2/cluster/tcp_internal.h @@ -0,0 +1,242 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_TCP_INTERNAL_H +#define O2CLUSTER_TCP_INTERNAL_H + +#define O2NET_MSG_MAGIC ((u16)0xfa55) +#define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56) +#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) +#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) + +/* we're delaying our quorum decision so that heartbeat will have timed + * out truly dead nodes by the time we come around to making decisions + * on their number */ +#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) + +/* + * This version number represents quite a lot, unfortunately. It not + * only represents the raw network message protocol on the wire but also + * locking semantics of the file system using the protocol. It should + * be somewhere else, I'm sure, but right now it isn't. + * + * With version 11, we separate out the filesystem locking portion. The + * filesystem now has a major.minor version it negotiates. Version 11 + * introduces this negotiation to the o2dlm protocol, and as such the + * version here in tcp_internal.h should not need to be bumped for + * filesystem locking changes. + * + * New in version 11 + * - Negotiation of filesystem locking in the dlm join. + * + * New in version 10: + * - Meta/data locks combined + * + * New in version 9: + * - All votes removed + * + * New in version 8: + * - Replace delete inode votes with a cluster lock + * + * New in version 7: + * - DLM join domain includes the live nodemap + * + * New in version 6: + * - DLM lockres remote refcount fixes. + * + * New in version 5: + * - Network timeout checking protocol + * + * New in version 4: + * - Remove i_generation from lock names for better stat performance. + * + * New in version 3: + * - Replace dentry votes with a cluster lock + * + * New in version 2: + * - full 64 bit i_size in the metadata lock lvbs + * - introduction of "rw" lock and pushing meta/data locking down + */ +#define O2NET_PROTOCOL_VERSION 11ULL +struct o2net_handshake { + __be64 protocol_version; + __be64 connector_id; + __be32 o2hb_heartbeat_timeout_ms; + __be32 o2net_idle_timeout_ms; + __be32 o2net_keepalive_delay_ms; + __be32 o2net_reconnect_delay_ms; +}; + +struct o2net_node { + /* this is never called from int/bh */ + spinlock_t nn_lock; + + /* set the moment an sc is allocated and a connect is started */ + struct o2net_sock_container *nn_sc; + /* _valid is only set after the handshake passes and tx can happen */ + unsigned nn_sc_valid:1; + /* if this is set tx just returns it */ + int nn_persistent_error; + /* It is only set to 1 after the idle time out. */ + atomic_t nn_timeout; + + /* threads waiting for an sc to arrive wait on the wq for generation + * to increase. it is increased when a connecting socket succeeds + * or fails or when an accepted socket is attached. */ + wait_queue_head_t nn_sc_wq; + + struct idr nn_status_idr; + struct list_head nn_status_list; + + /* connects are attempted from when heartbeat comes up until either hb + * goes down, the node is unconfigured, or a connect succeeds. + * connect_work is queued from set_nn_state both from hb up and from + * itself if a connect attempt fails and so can be self-arming. + * shutdown is careful to first mark the nn such that no connects will + * be attempted before canceling delayed connect work and flushing the + * queue. */ + struct delayed_work nn_connect_work; + unsigned long nn_last_connect_attempt; + + /* this is queued as nodes come up and is canceled when a connection is + * established. this expiring gives up on the node and errors out + * transmits */ + struct delayed_work nn_connect_expired; + + /* after we give up on a socket we wait a while before deciding + * that it is still heartbeating and that we should do some + * quorum work */ + struct delayed_work nn_still_up; +}; + +struct o2net_sock_container { + struct kref sc_kref; + /* the next two are valid for the life time of the sc */ + struct socket *sc_sock; + struct o2nm_node *sc_node; + + /* all of these sc work structs hold refs on the sc while they are + * queued. they should not be able to ref a freed sc. the teardown + * race is with o2net_wq destruction in o2net_stop_listening() */ + + /* rx and connect work are generated from socket callbacks. sc + * shutdown removes the callbacks and then flushes the work queue */ + struct work_struct sc_rx_work; + struct work_struct sc_connect_work; + /* shutdown work is triggered in two ways. the simple way is + * for a code path calls ensure_shutdown which gets a lock, removes + * the sc from the nn, and queues the work. in this case the + * work is single-shot. the work is also queued from a sock + * callback, though, and in this case the work will find the sc + * still on the nn and will call ensure_shutdown itself.. this + * ends up triggering the shutdown work again, though nothing + * will be done in that second iteration. so work queue teardown + * has to be careful to remove the sc from the nn before waiting + * on the work queue so that the shutdown work doesn't remove the + * sc and rearm itself. + */ + struct work_struct sc_shutdown_work; + + struct timer_list sc_idle_timeout; + struct delayed_work sc_keepalive_work; + + unsigned sc_handshake_ok:1; + + struct page *sc_page; + size_t sc_page_off; + + /* original handlers for the sockets */ + void (*sc_state_change)(struct sock *sk); + void (*sc_data_ready)(struct sock *sk); + + u32 sc_msg_key; + u16 sc_msg_type; + +#ifdef CONFIG_DEBUG_FS + struct list_head sc_net_debug_item; + ktime_t sc_tv_timer; + ktime_t sc_tv_data_ready; + ktime_t sc_tv_advance_start; + ktime_t sc_tv_advance_stop; + ktime_t sc_tv_func_start; + ktime_t sc_tv_func_stop; +#endif +#ifdef CONFIG_OCFS2_FS_STATS + ktime_t sc_tv_acquiry_total; + ktime_t sc_tv_send_total; + ktime_t sc_tv_status_total; + u32 sc_send_count; + u32 sc_recv_count; + ktime_t sc_tv_process_total; +#endif + struct mutex sc_send_lock; +}; + +struct o2net_msg_handler { + struct rb_node nh_node; + u32 nh_max_len; + u32 nh_msg_type; + u32 nh_key; + o2net_msg_handler_func *nh_func; + o2net_msg_handler_func *nh_func_data; + o2net_post_msg_handler_func + *nh_post_func; + struct kref nh_kref; + struct list_head nh_unregister_item; +}; + +enum o2net_system_error { + O2NET_ERR_NONE = 0, + O2NET_ERR_NO_HNDLR, + O2NET_ERR_OVERFLOW, + O2NET_ERR_DIED, + O2NET_ERR_MAX +}; + +struct o2net_status_wait { + enum o2net_system_error ns_sys_status; + s32 ns_status; + int ns_id; + wait_queue_head_t ns_wq; + struct list_head ns_node_item; +}; + +#ifdef CONFIG_DEBUG_FS +/* just for state dumps */ +struct o2net_send_tracking { + struct list_head st_net_debug_item; + struct task_struct *st_task; + struct o2net_sock_container *st_sc; + u32 st_id; + u32 st_msg_type; + u32 st_msg_key; + u8 st_node; + ktime_t st_sock_time; + ktime_t st_send_time; + ktime_t st_status_time; +}; +#else +struct o2net_send_tracking { + u32 dummy; +}; +#endif /* CONFIG_DEBUG_FS */ + +#endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/kernel/fs/ocfs2/dcache.c b/kernel/fs/ocfs2/dcache.c new file mode 100644 index 000000000..290373024 --- /dev/null +++ b/kernel/fs/ocfs2/dcache.c @@ -0,0 +1,474 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.c + * + * dentry cache handling code + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "ocfs2_trace.h" + +void ocfs2_dentry_attach_gen(struct dentry *dentry) +{ + unsigned long gen = + OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; + BUG_ON(d_inode(dentry)); + dentry->d_fsdata = (void *)gen; +} + + +static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *inode; + int ret = 0; /* if all else fails, just return false */ + struct ocfs2_super *osb; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = d_inode(dentry); + osb = OCFS2_SB(dentry->d_sb); + + trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, + dentry->d_name.name); + + /* For a negative dentry - + * check the generation number of the parent and compare with the + * one stored in the inode. + */ + if (inode == NULL) { + unsigned long gen = (unsigned long) dentry->d_fsdata; + unsigned long pgen; + spin_lock(&dentry->d_lock); + pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; + spin_unlock(&dentry->d_lock); + trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, + dentry->d_name.name, + pgen, gen); + if (gen != pgen) + goto bail; + goto valid; + } + + BUG_ON(!osb); + + if (inode == osb->root_inode || is_bad_inode(inode)) + goto bail; + + spin_lock(&OCFS2_I(inode)->ip_lock); + /* did we or someone else delete this inode? */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + trace_ocfs2_dentry_revalidate_delete( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* + * We don't need a cluster lock to test this because once an + * inode nlink hits zero, it never goes back. + */ + if (inode->i_nlink == 0) { + trace_ocfs2_dentry_revalidate_orphaned( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + S_ISDIR(inode->i_mode)); + goto bail; + } + + /* + * If the last lookup failed to create dentry lock, let us + * redo it. + */ + if (!dentry->d_fsdata) { + trace_ocfs2_dentry_revalidate_nofsdata( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + +valid: + ret = 1; + +bail: + trace_ocfs2_dentry_revalidate_ret(ret); + return ret; +} + +static int ocfs2_match_dentry(struct dentry *dentry, + u64 parent_blkno, + int skip_unhashed) +{ + struct inode *parent; + + /* + * ocfs2_lookup() does a d_splice_alias() _before_ attaching + * to the lock data, so we skip those here, otherwise + * ocfs2_dentry_attach_lock() will get its original dentry + * back. + */ + if (!dentry->d_fsdata) + return 0; + + if (!dentry->d_parent) + return 0; + + if (skip_unhashed && d_unhashed(dentry)) + return 0; + + parent = d_inode(dentry->d_parent); + /* Negative parent dentry? */ + if (!parent) + return 0; + + /* Name is in a different directory. */ + if (OCFS2_I(parent)->ip_blkno != parent_blkno) + return 0; + + return 1; +} + +/* + * Walk the inode alias list, and find a dentry which has a given + * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it + * is looking for a dentry_lock reference. The downconvert thread is + * looking to unhash aliases, so we allow it to skip any that already + * have that property. + */ +struct dentry *ocfs2_find_local_alias(struct inode *inode, + u64 parent_blkno, + int skip_unhashed) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { + trace_ocfs2_find_local_alias(dentry->d_name.len, + dentry->d_name.name); + + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + return dentry; + } + spin_unlock(&dentry->d_lock); + } + spin_unlock(&inode->i_lock); + return NULL; +} + +DEFINE_SPINLOCK(dentry_attach_lock); + +/* + * Attach this dentry to a cluster lock. + * + * Dentry locks cover all links in a given directory to a particular + * inode. We do this so that ocfs2 can build a lock name which all + * nodes in the cluster can agree on at all times. Shoving full names + * in the cluster lock won't work due to size restrictions. Covering + * links inside of a directory is a good compromise because it still + * allows us to use the parent directory lock to synchronize + * operations. + * + * Call this function with the parent dir semaphore and the parent dir + * cluster lock held. + * + * The dir semaphore will protect us from having to worry about + * concurrent processes on our node trying to attach a lock at the + * same time. + * + * The dir cluster lock (held at either PR or EX mode) protects us + * from unlink and rename on other nodes. + * + * A dput() can happen asynchronously due to pruning, so we cover + * attaching and detaching the dentry lock with a + * dentry_attach_lock. + * + * A node which has done lookup on a name retains a protected read + * lock until final dput. If the user requests and unlink or rename, + * the protected read is upgraded to an exclusive lock. Other nodes + * who have seen the dentry will then be informed that they need to + * downgrade their lock, which will involve d_delete on the + * dentry. This happens in ocfs2_dentry_convert_worker(). + */ +int ocfs2_dentry_attach_lock(struct dentry *dentry, + struct inode *inode, + u64 parent_blkno) +{ + int ret; + struct dentry *alias; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, dl); + + /* + * Negative dentry. We ignore these for now. + * + * XXX: Could we can improve ocfs2_dentry_revalidate() by + * tracking these? + */ + if (!inode) + return 0; + + if (d_really_is_negative(dentry) && dentry->d_fsdata) { + /* Converting a negative dentry to positive + Clear dentry->d_fsdata */ + dentry->d_fsdata = dl = NULL; + } + + if (dl) { + mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, + " \"%pd\": old parent: %llu, new: %llu\n", + dentry, + (unsigned long long)parent_blkno, + (unsigned long long)dl->dl_parent_blkno); + return 0; + } + + alias = ocfs2_find_local_alias(inode, parent_blkno, 0); + if (alias) { + /* + * Great, an alias exists, which means we must have a + * dentry lock already. We can just grab the lock off + * the alias and add it to the list. + * + * We're depending here on the fact that this dentry + * was found and exists in the dcache and so must have + * a reference to the dentry_lock because we can't + * race creates. Final dput() cannot happen on it + * since we have it pinned, so our reference is safe. + */ + dl = alias->d_fsdata; + mlog_bug_on_msg(!dl, "parent %llu, ino %llu\n", + (unsigned long long)parent_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, + " \"%pd\": old parent: %llu, new: %llu\n", + dentry, + (unsigned long long)parent_blkno, + (unsigned long long)dl->dl_parent_blkno); + + trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name, + (unsigned long long)parent_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + goto out_attach; + } + + /* + * There are no other aliases + */ + dl = kmalloc(sizeof(*dl), GFP_NOFS); + if (!dl) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + dl->dl_count = 0; + /* + * Does this have to happen below, for all attaches, in case + * the struct inode gets blown away by the downconvert thread? + */ + dl->dl_inode = igrab(inode); + dl->dl_parent_blkno = parent_blkno; + ocfs2_dentry_lock_res_init(dl, parent_blkno, inode); + +out_attach: + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = dl; + dl->dl_count++; + spin_unlock(&dentry_attach_lock); + + /* + * This actually gets us our PRMODE level lock. From now on, + * we'll have a notification if one of these names is + * destroyed on another node. + */ + ret = ocfs2_dentry_lock(dentry, 0); + if (!ret) + ocfs2_dentry_unlock(dentry, 0); + else + mlog_errno(ret); + + /* + * In case of error, manually free the allocation and do the iput(). + * We need to do this because error here means no d_instantiate(), + * which means iput() will not be called during dput(dentry). + */ + if (ret < 0 && !alias) { + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); + } + + dput(alias); + + return ret; +} + +/* + * ocfs2_dentry_iput() and friends. + * + * At this point, our particular dentry is detached from the inodes + * alias list, so there's no way that the locking code can find it. + * + * The interesting stuff happens when we determine that our lock needs + * to go away because this is the last subdir alias in the + * system. This function needs to handle a couple things: + * + * 1) Synchronizing lock shutdown with the downconvert threads. This + * is already handled for us via the lockres release drop function + * called in ocfs2_release_dentry_lock() + * + * 2) A race may occur when we're doing our lock shutdown and + * another process wants to create a new dentry lock. Right now we + * let them race, which means that for a very short while, this + * node might have two locks on a lock resource. This should be a + * problem though because one of them is in the process of being + * thrown out. + */ +static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl) +{ + iput(dl->dl_inode); + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + kfree(dl); +} + +void ocfs2_dentry_lock_put(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl) +{ + int unlock = 0; + + BUG_ON(dl->dl_count == 0); + + spin_lock(&dentry_attach_lock); + dl->dl_count--; + unlock = !dl->dl_count; + spin_unlock(&dentry_attach_lock); + + if (unlock) + ocfs2_drop_dentry_lock(osb, dl); +} + +static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + if (!dl) { + /* + * No dentry lock is ok if we're disconnected or + * unhashed. + */ + if (!(dentry->d_flags & DCACHE_DISCONNECTED) && + !d_unhashed(dentry)) { + unsigned long long ino = 0ULL; + if (inode) + ino = (unsigned long long)OCFS2_I(inode)->ip_blkno; + mlog(ML_ERROR, "Dentry is missing cluster lock. " + "inode: %llu, d_flags: 0x%x, d_name: %pd\n", + ino, dentry->d_flags, dentry); + } + + goto out; + } + + mlog_bug_on_msg(dl->dl_count == 0, "dentry: %pd, count: %u\n", + dentry, dl->dl_count); + + ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl); + +out: + iput(inode); +} + +/* + * d_move(), but keep the locks in sync. + * + * When we are done, "dentry" will have the parent dir and name of + * "target", which will be thrown away. + * + * We manually update the lock of "dentry" if need be. + * + * "target" doesn't have it's dentry lock touched - we allow the later + * dput() to handle this for us. + * + * This is called during ocfs2_rename(), while holding parent + * directory locks. The dentries have already been deleted on other + * nodes via ocfs2_remote_dentry_delete(). + * + * Normally, the VFS handles the d_move() for the file system, after + * the ->rename() callback. OCFS2 wants to handle this internally, so + * the new lock can be created atomically with respect to the cluster. + */ +void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, + struct inode *old_dir, struct inode *new_dir) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb); + struct inode *inode = d_inode(dentry); + + /* + * Move within the same directory, so the actual lock info won't + * change. + * + * XXX: Is there any advantage to dropping the lock here? + */ + if (old_dir == new_dir) + goto out_move; + + ocfs2_dentry_lock_put(osb, dentry->d_fsdata); + + dentry->d_fsdata = NULL; + ret = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(new_dir)->ip_blkno); + if (ret) + mlog_errno(ret); + +out_move: + d_move(dentry, target); +} + +const struct dentry_operations ocfs2_dentry_ops = { + .d_revalidate = ocfs2_dentry_revalidate, + .d_iput = ocfs2_dentry_iput, +}; diff --git a/kernel/fs/ocfs2/dcache.h b/kernel/fs/ocfs2/dcache.h new file mode 100644 index 000000000..55f58892b --- /dev/null +++ b/kernel/fs/ocfs2/dcache.h @@ -0,0 +1,59 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_DCACHE_H +#define OCFS2_DCACHE_H + +extern const struct dentry_operations ocfs2_dentry_ops; + +struct ocfs2_dentry_lock { + unsigned int dl_count; + u64 dl_parent_blkno; + + /* + * The ocfs2_dentry_lock keeps an inode reference until + * dl_lockres has been destroyed. This is usually done in + * ->d_iput() anyway, so there should be minimal impact. + */ + struct inode *dl_inode; + struct ocfs2_lock_res dl_lockres; +}; + +int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, + u64 parent_blkno); + +void ocfs2_dentry_lock_put(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl); + +struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, + int skip_unhashed); + +void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, + struct inode *old_dir, struct inode *new_dir); + +extern spinlock_t dentry_attach_lock; +void ocfs2_dentry_attach_gen(struct dentry *dentry); + +#endif /* OCFS2_DCACHE_H */ diff --git a/kernel/fs/ocfs2/dir.c b/kernel/fs/ocfs2/dir.c new file mode 100644 index 000000000..ccd4dcfc3 --- /dev/null +++ b/kernel/fs/ocfs2/dir.c @@ -0,0 +1,4501 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c + * + * Creates, reads, walks and deletes directory-nodes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +static unsigned char ocfs2_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ocfs2_do_extend_dir(struct super_block *sb, + handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh); +static int ocfs2_dir_indexed(struct inode *inode); + +/* + * These are distinct checks because future versions of the file system will + * want to have a trailing dirent structure independent of indexing. + */ +static int ocfs2_supports_dir_trailer(struct inode *dir) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir); +} + +/* + * "new' here refers to the point at which we're creating a new + * directory via "mkdir()", but also when we're expanding an inline + * directory. In either case, we don't yet have the indexing bit set + * on the directory, so the standard checks will fail in when metaecc + * is turned off. Only directory-initialization type functions should + * use this then. Everything else wants ocfs2_supports_dir_trailer() + */ +static int ocfs2_new_dir_wants_trailer(struct inode *dir) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + return ocfs2_meta_ecc(osb) || + ocfs2_supports_indexed_dirs(osb); +} + +static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) +{ + return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer); +} + +#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb)))) + +/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make + * them more consistent? */ +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data) +{ + char *p = data; + + p += blocksize - sizeof(struct ocfs2_dir_block_trailer); + return (struct ocfs2_dir_block_trailer *)p; +} + +/* + * XXX: This is executed once on every dirent. We should consider optimizing + * it. + */ +static int ocfs2_skip_dir_trailer(struct inode *dir, + struct ocfs2_dir_entry *de, + unsigned long offset, + unsigned long blklen) +{ + unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); + + if (!ocfs2_supports_dir_trailer(dir)) + return 0; + + if (offset != toff) + return 0; + + return 1; +} + +static void ocfs2_init_dir_trailer(struct inode *inode, + struct buffer_head *bh, u16 rec_len) +{ + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, inode->i_sb); + strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE); + trailer->db_compat_rec_len = + cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); + trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + trailer->db_blkno = cpu_to_le64(bh->b_blocknr); + trailer->db_free_rec_len = cpu_to_le16(rec_len); +} +/* + * Link an unindexed block with a dir trailer structure into the index free + * list. This function will modify dirdata_bh, but assumes you've already + * passed it to the journal. + */ +static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, + struct buffer_head *dx_root_bh, + struct buffer_head *dirdata_bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer; + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res) +{ + return res->dl_prev_leaf_bh == NULL; +} + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) +{ + brelse(res->dl_dx_root_bh); + brelse(res->dl_leaf_bh); + brelse(res->dl_dx_leaf_bh); + brelse(res->dl_prev_leaf_bh); +} + +static int ocfs2_dir_indexed(struct inode *inode) +{ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL) + return 1; + return 0; +} + +static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root) +{ + return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE; +} + +/* + * Hashing code adapted from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len, + struct ocfs2_dx_hinfo *hinfo) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + const char *p; + __u32 in[8], buf[4]; + + /* + * XXX: Is this really necessary, if the index is never looked + * at by readdir? Is a hash value of '0' a bad idea? + */ + if ((len == 1 && !strncmp(".", name, 1)) || + (len == 2 && !strncmp("..", name, 2))) { + buf[0] = buf[1] = 0; + goto out; + } + +#ifdef OCFS2_DEBUG_DX_DIRS + /* + * This makes it very easy to debug indexing problems. We + * should never allow this to be selected without hand editing + * this file though. + */ + buf[0] = buf[1] = len; + goto out; +#endif + + memcpy(buf, osb->osb_dx_seed, sizeof(buf)); + + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + +out: + hinfo->major_hash = buf[0]; + hinfo->minor_hash = buf[1]; +} + +/* + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. + */ +static int ocfs2_check_dir_entry(struct inode * dir, + struct ocfs2_dir_entry * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char *error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely( + ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)) + error_msg = "directory entry across blocks"; + + if (unlikely(error_msg != NULL)) + mlog(ML_ERROR, "bad entry in directory #%llu: %s - " + "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, + offset, (unsigned long long)le64_to_cpu(de->inode), rlen, + de->name_len); + + return error_msg == NULL ? 1 : 0; +} + +static inline int ocfs2_match(int len, + const char * const name, + struct ocfs2_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int ocfs2_search_dirblock(struct buffer_head *bh, + struct inode *dir, + const char *name, int namelen, + unsigned long offset, + char *first_de, + unsigned int bytes, + struct ocfs2_dir_entry **res_dir) +{ + struct ocfs2_dir_entry *de; + char *dlimit, *de_buf; + int de_len; + int ret = 0; + + de_buf = first_de; + dlimit = de_buf + bytes; + + while (de_buf < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + de = (struct ocfs2_dir_entry *) de_buf; + + if (de_buf + namelen <= dlimit && + ocfs2_match(namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + ret = -1; + goto bail; + } + *res_dir = de; + ret = 1; + goto bail; + } + + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) { + ret = -1; + goto bail; + } + + de_buf += de_len; + offset += de_len; + } + +bail: + trace_ocfs2_search_dirblock(ret); + return ret; +} + +static struct buffer_head *ocfs2_find_entry_id(const char *name, + int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + int ret, found; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0, + data->id_data, i_size_read(dir), res_dir); + if (found == 1) + return di_bh; + + brelse(di_bh); +out: + return NULL; +} + +static int ocfs2_validate_dir_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(bh, sb); + + + /* + * We don't validate dirents here, that's handled + * in-place when the code walks them. + */ + trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + * + * Note that we are safe to call this even if the directory + * doesn't have a trailer. Filesystems without metaecc will do + * nothing, and filesystems with it will have one. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check); + if (rc) + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + + return rc; +} + +/* + * Validate a directory trailer. + * + * We check the trailer here rather than in ocfs2_validate_dir_block() + * because that function doesn't have the inode to test. + */ +static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); + if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: " + "signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); + goto out; + } + if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid " + "db_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + if (le64_to_cpu(trailer->db_parent_dinode) != + OCFS2_I(dir)->ip_blkno) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode " + "#%llu has an invalid parent_dinode " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } +out: + return rc; +} + +/* + * This function forces all errors to -EIO for consistency with its + * predecessor, ocfs2_bread(). We haven't audited what returning the + * real error codes would do to callers. We log the real codes with + * mlog_errno() before we squash them. + */ +static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, int flags) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, + ocfs2_validate_dir_block); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (!(flags & OCFS2_BH_READAHEAD) && + ocfs2_supports_dir_trailer(inode)) { + rc = ocfs2_check_dir_trailer(inode, tmp); + if (rc) { + if (!*bh) + brelse(tmp); + mlog_errno(rc); + goto out; + } + } + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc ? -EIO : 0; +} + +/* + * Read the block at 'phys' which belongs to this directory + * inode. This function does no virtual->physical block translation - + * what's passed in is assumed to be a valid directory block. + */ +static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, + struct buffer_head **bh) +{ + int ret; + struct buffer_head *tmp = *bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp, + ocfs2_validate_dir_block); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ocfs2_supports_dir_trailer(dir)) { + ret = ocfs2_check_dir_trailer(dir, tmp); + if (ret) { + if (!*bh) + brelse(tmp); + mlog_errno(ret); + goto out; + } + } + + if (!ret && !*bh) + *bh = tmp; +out: + return ret; +} + +static int ocfs2_validate_dx_root(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + + BUG_ON(!buffer_uptodate(bh)); + + dx_root = (struct ocfs2_dx_root_block *) bh->b_data; + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index root block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { + ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, + struct buffer_head **dx_root_bh) +{ + int ret; + u64 blkno = le64_to_cpu(di->i_dx_root); + struct buffer_head *tmp = *dx_root_bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_root); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_root_bh) + *dx_root_bh = tmp; + + return ret; +} + +static int ocfs2_validate_dx_leaf(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index leaf block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { + ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", + 7, dx_leaf->dl_signature); + return -EROFS; + } + + return 0; +} + +static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, + struct buffer_head **dx_leaf_bh) +{ + int ret; + struct buffer_head *tmp = *dx_leaf_bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_leaf); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_leaf_bh) + *dx_leaf_bh = tmp; + + return ret; +} + +/* + * Read a series of dx_leaf blocks. This expects all buffer_head + * pointers to be NULL on function entry. + */ +static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, + struct buffer_head **dx_leaf_bhs) +{ + int ret; + + ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0, + ocfs2_validate_dx_leaf); + if (ret) + mlog_errno(ret); + + return ret; +} + +static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + + sb = dir->i_sb; + + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + start = OCFS2_I(dir)->ip_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; + +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + + bh = NULL; + err = ocfs2_read_dir_block(dir, b++, &bh, + OCFS2_BH_READAHEAD); + bh_use[ra_max] = bh; + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + if (ocfs2_read_dir_block(dir, block, &bh, 0)) { + /* read error, skip block & hope for the best. + * ocfs2_read_dir_block() has released the bh. */ + mlog(ML_ERROR, "reading directory %llu, " + "offset %lu\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + block); + goto next; + } + i = ocfs2_search_dirblock(bh, dir, name, namelen, + block << sb->s_blocksize_bits, + bh->b_data, sb->s_blocksize, + res_dir); + if (i == 1) { + OCFS2_I(dir)->ip_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + + trace_ocfs2_find_entry_el(ret); + return ret; +} + +static int ocfs2_dx_dir_lookup_rec(struct inode *inode, + struct ocfs2_extent_list *el, + u32 major_hash, + u32 *ret_cpos, + u64 *ret_phys_blkno, + unsigned int *ret_clen) +{ + int ret = 0, i, found; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "btree tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= major_hash) { + found = 1; + break; + } + } + + if (!found) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in btree", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + if (ret_phys_blkno) + *ret_phys_blkno = le64_to_cpu(rec->e_blkno); + if (ret_cpos) + *ret_cpos = le32_to_cpu(rec->e_cpos); + if (ret_clen) + *ret_clen = le16_to_cpu(rec->e_leaf_clusters); + +out: + brelse(eb_bh); + return ret; +} + +/* + * Returns the block index, from the start of the cluster which this + * hash belongs too. + */ +static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + u32 minor_hash) +{ + return minor_hash & osb->osb_dx_mask; +} + +static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + struct ocfs2_dx_hinfo *hinfo) +{ + return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash); +} + +static int ocfs2_dx_dir_lookup(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_dx_hinfo *hinfo, + u32 *ret_cpos, + u64 *ret_phys_blkno) +{ + int ret = 0; + unsigned int cend, uninitialized_var(clen); + u32 uninitialized_var(cpos); + u64 uninitialized_var(blkno); + u32 name_hash = hinfo->major_hash; + + ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, + &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + cend = cpos + clen; + if (name_hash >= cend) { + /* We want the last cluster */ + blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1); + cpos += clen - 1; + } else { + blkno += ocfs2_clusters_to_blocks(inode->i_sb, + name_hash - cpos); + cpos = name_hash; + } + + /* + * We now have the cluster which should hold our entry. To + * find the exact block from the start of the cluster to + * search, we take the lower bits of the hash. + */ + blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo); + + if (ret_phys_blkno) + *ret_phys_blkno = blkno; + if (ret_cpos) + *ret_cpos = cpos; + +out: + + return ret; +} + +static int ocfs2_dx_dir_search(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dx_root_block *dx_root, + struct ocfs2_dir_lookup_result *res) +{ + int ret, i, found; + u64 uninitialized_var(phys); + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = NULL; + struct buffer_head *dir_ent_bh = NULL; + struct ocfs2_dir_entry *dir_ent = NULL; + struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo; + struct ocfs2_extent_list *dr_el; + struct ocfs2_dx_entry_list *entry_list; + + ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo); + + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + goto search; + } + + dr_el = &dx_root->dr_list; + + ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno, + namelen, name, hinfo->major_hash, + hinfo->minor_hash, (unsigned long long)phys); + + ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; + + trace_ocfs2_dx_dir_search_leaf_info( + le16_to_cpu(dx_leaf->dl_list.de_num_used), + le16_to_cpu(dx_leaf->dl_list.de_count)); + + entry_list = &dx_leaf->dl_list; + +search: + /* + * Empty leaf is legal, so no need to check for that. + */ + found = 0; + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash) + || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash)) + continue; + + /* + * Search unindexed leaf block now. We're not + * guaranteed to find anything. + */ + ret = ocfs2_read_dir_block_direct(dir, + le64_to_cpu(dx_entry->dx_dirent_blk), + &dir_ent_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: We should check the unindexed block here, + * before using it. + */ + + found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen, + 0, dir_ent_bh->b_data, + dir->i_sb->s_blocksize, &dir_ent); + if (found == 1) + break; + + if (found == -1) { + /* This means we found a bad directory entry. */ + ret = -EIO; + mlog_errno(ret); + goto out; + } + + brelse(dir_ent_bh); + dir_ent_bh = NULL; + } + + if (found <= 0) { + ret = -ENOENT; + goto out; + } + + res->dl_leaf_bh = dir_ent_bh; + res->dl_entry = dir_ent; + res->dl_dx_leaf_bh = dx_leaf_bh; + res->dl_dx_entry = dx_entry; + + ret = 0; +out: + if (ret) { + brelse(dx_leaf_bh); + brelse(dir_ent_bh); + } + return ret; +} + +static int ocfs2_find_entry_dx(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup); + if (ret) { + if (ret != -ENOENT) + mlog_errno(ret); + goto out; + } + + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + +/* + * Try to find an entry of the provided name within 'dir'. + * + * If nothing was found, -ENOENT is returned. Otherwise, zero is + * returned and the struct 'res' will contain information useful to + * other directory manipulation functions. + * + * Caller can NOT assume anything about the contents of the + * buffer_heads - they are passed back only so that it can be passed + * into any one of the manipulation functions (add entry, delete + * entry, etc). As an example, bh in the extent directory case is a + * data block, in the inline-data case it actually points to an inode, + * in the indexed directory case, multiple buffers are involved. + */ +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, struct ocfs2_dir_lookup_result *lookup) +{ + struct buffer_head *bh; + struct ocfs2_dir_entry *res_dir = NULL; + + if (ocfs2_dir_indexed(dir)) + return ocfs2_find_entry_dx(name, namelen, dir, lookup); + + /* + * The unindexed dir code only uses part of the lookup + * structure, so there's no reason to push it down further + * than this. + */ + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); + else + bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + + if (bh == NULL) + return -ENOENT; + + lookup->dl_leaf_bh = bh; + lookup->dl_entry = res_dir; + return 0; +} + +/* + * Update inode number and type of a previously found directory entry. + */ +int ocfs2_update_entry(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *res, + struct inode *new_entry_inode) +{ + int ret; + ocfs2_journal_access_func access = ocfs2_journal_access_db; + struct ocfs2_dir_entry *de = res->dl_entry; + struct buffer_head *de_bh = res->dl_leaf_bh; + + /* + * The same code works fine for both inline-data and extent + * based directories, so no need to split this up. The only + * difference is the journal_access function. + */ + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + + ret = access(handle, INODE_CACHE(dir), de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno); + ocfs2_set_de_type(de, new_entry_inode->i_mode); + + ocfs2_journal_dirty(handle, de_bh); + +out: + return ret; +} + +/* + * __ocfs2_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh, char *first_de, + unsigned int bytes) +{ + struct ocfs2_dir_entry *de, *pde; + int i, status = -ENOENT; + ocfs2_journal_access_func access = ocfs2_journal_access_db; + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + + i = 0; + pde = NULL; + de = (struct ocfs2_dir_entry *) first_de; + while (i < bytes) { + if (!ocfs2_check_dir_entry(dir, de, bh, i)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (de == de_del) { + status = access(handle, INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (pde) + le16_add_cpu(&pde->rec_len, + le16_to_cpu(de->rec_len)); + de->inode = 0; + dir->i_version++; + ocfs2_journal_dirty(handle, bh); + goto bail; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); + } +bail: + return status; +} + +static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de) +{ + unsigned int hole; + + if (le64_to_cpu(de->inode) == 0) + hole = le16_to_cpu(de->rec_len); + else + hole = le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len); + + return hole; +} + +static int ocfs2_find_max_rec_len(struct super_block *sb, + struct buffer_head *dirblock_bh) +{ + int size, this_hole, largest_hole = 0; + char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data; + struct ocfs2_dir_entry *de; + + trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb); + size = ocfs2_dir_trailer_blk_off(sb); + limit = start + size; + de_buf = start; + de = (struct ocfs2_dir_entry *)de_buf; + do { + if (de_buf != trailer) { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + } + + de_buf += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)de_buf; + } while (de_buf < limit); + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; +} + +static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list, + int index) +{ + int num_used = le16_to_cpu(entry_list->de_num_used); + + if (num_used == 1 || index == (num_used - 1)) + goto clear; + + memmove(&entry_list->de_entries[index], + &entry_list->de_entries[index + 1], + (num_used - index - 1)*sizeof(struct ocfs2_dx_entry)); +clear: + num_used--; + memset(&entry_list->de_entries[num_used], 0, + sizeof(struct ocfs2_dx_entry)); + entry_list->de_num_used = cpu_to_le16(num_used); +} + +static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, index, max_rec_len, add_to_free_list = 0; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + struct buffer_head *leaf_bh = lookup->dl_leaf_bh; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry; + struct ocfs2_dir_block_trailer *trailer; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * This function gets a bit messy because we might have to + * modify the root block, regardless of whether the indexed + * entries are stored inline. + */ + + /* + * *Only* set 'entry_list' here, based on where we're looking + * for the indexed entries. Later, we might still want to + * journal both blocks, based on free list state. + */ + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + } else { + dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data; + entry_list = &dx_leaf->dl_list; + } + + /* Neither of these are a disk corruption - that should have + * been caught by lookup, before we got here. */ + BUG_ON(le16_to_cpu(entry_list->de_count) <= 0); + BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0); + + index = (char *)dx_entry - (char *)entry_list->de_entries; + index /= sizeof(*dx_entry); + + if (index >= le16_to_cpu(entry_list->de_num_used)) { + mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, index, + entry_list, dx_entry); + return -EIO; + } + + /* + * We know that removal of this dirent will leave enough room + * for a new one, so add this block to the free list if it + * isn't already there. + */ + trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (trailer->db_free_rec_len == 0) + add_to_free_list = 1; + + /* + * Add the block holding our index into the journal before + * removing the unindexed entry. If we get an error return + * from __ocfs2_delete_entry(), then it hasn't removed the + * entry yet. Likewise, successful return means we *must* + * remove the indexed entry. + * + * We're also careful to journal the root tree block here as + * the entry count needs to be updated. Also, we might be + * adding to the start of the free list. + */ + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + lookup->dl_dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno, + index); + + ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, + leaf_bh, leaf_bh->b_data, leaf_bh->b_size); + if (ret) { + mlog_errno(ret); + goto out; + } + + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + if (add_to_free_list) { + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr); + ocfs2_journal_dirty(handle, dx_root_bh); + } + + /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */ + ocfs2_journal_dirty(handle, leaf_bh); + + le32_add_cpu(&dx_root->dr_num_entries, -1); + ocfs2_journal_dirty(handle, dx_root_bh); + + ocfs2_dx_list_remove_entry(entry_list, index); + + if (!ocfs2_dx_root_inline(dx_root)) + ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); + +out: + return ret; +} + +static inline int ocfs2_delete_entry_id(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data, + i_size_read(dir)); + + brelse(di_bh); +out: + return ret; +} + +static inline int ocfs2_delete_entry_el(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data, + bh->b_size); +} + +/* + * Delete a directory entry. Hide the details of directory + * implementation from the caller. + */ +int ocfs2_delete_entry(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_lookup_result *res) +{ + if (ocfs2_dir_indexed(dir)) + return ocfs2_delete_entry_dx(handle, dir, res); + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_delete_entry_id(handle, dir, res->dl_entry, + res->dl_leaf_bh); + + return ocfs2_delete_entry_el(handle, dir, res->dl_entry, + res->dl_leaf_bh); +} + +/* + * Check whether 'de' has enough room to hold an entry of + * 'new_rec_len' bytes. + */ +static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de, + unsigned int new_rec_len) +{ + unsigned int de_really_used; + + /* Check whether this is an empty record with enough space */ + if (le64_to_cpu(de->inode) == 0 && + le16_to_cpu(de->rec_len) >= new_rec_len) + return 1; + + /* + * Record might have free space at the end which we can + * use. + */ + de_really_used = OCFS2_DIR_REC_LEN(de->name_len); + if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len)) + return 1; + + return 0; +} + +static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf, + struct ocfs2_dx_entry *dx_new_entry) +{ + int i; + + i = le16_to_cpu(dx_leaf->dl_list.de_num_used); + dx_leaf->dl_list.de_entries[i] = *dx_new_entry; + + le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); +} + +static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk) +{ + int i; + struct ocfs2_dx_entry *dx_entry; + + i = le16_to_cpu(entry_list->de_num_used); + dx_entry = &entry_list->de_entries[i]; + + memset(dx_entry, 0, sizeof(*dx_entry)); + dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); + dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); + dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); + + le16_add_cpu(&entry_list->de_num_used, 1); +} + +static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct buffer_head *dx_leaf_bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf; + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk); + ocfs2_journal_dirty(handle, dx_leaf_bh); + +out: + return ret; +} + +static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct ocfs2_dx_root_block *dx_root) +{ + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk); +} + +static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + ocfs2_dx_inline_root_insert(dir, handle, + &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + dx_root); + } else { + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + lookup->dl_dx_leaf_bh); + if (ret) + goto out; + } + + le32_add_cpu(&dx_root->dr_num_entries, 1); + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static void ocfs2_remove_block_from_free_list(struct inode *dir, + handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + struct ocfs2_dir_block_trailer *trailer, *prev; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *bh; + + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + dx_root = (struct ocfs2_dx_root_block *)bh->b_data; + dx_root->dr_free_blk = trailer->db_free_next; + } else { + bh = lookup->dl_prev_leaf_bh; + prev = ocfs2_trailer_from_bh(bh, dir->i_sb); + prev->db_free_next = trailer->db_free_next; + } + + trailer->db_free_rec_len = cpu_to_le16(0); + trailer->db_free_next = cpu_to_le64(0); + + ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); +} + +/* + * This expects that a journal write has been reserved on + * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh + */ +static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int max_rec_len; + struct ocfs2_dir_block_trailer *trailer; + + /* Walk dl_leaf_bh to figure out what the new free rec_len is. */ + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh); + if (max_rec_len) { + /* + * There's still room in this block, so no need to remove it + * from the free list. In this case, we just want to update + * the rec len accounting. + */ + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); + } else { + ocfs2_remove_block_from_free_list(dir, handle, lookup); + } +} + +/* we don't always have a dentry for what we want to add, so people + * like orphan dir can call this instead. + * + * The lookup context must have been filled from + * ocfs2_prepare_dir_for_insert. + */ +int __ocfs2_add_entry(handle_t *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup) +{ + unsigned long offset; + unsigned short rec_len; + struct ocfs2_dir_entry *de, *de1; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; + struct super_block *sb = dir->i_sb; + int retval, status; + unsigned int size = sb->s_blocksize; + struct buffer_head *insert_bh = lookup->dl_leaf_bh; + char *data_start = insert_bh->b_data; + + if (!namelen) + return -EINVAL; + + if (ocfs2_dir_indexed(dir)) { + struct buffer_head *bh; + + /* + * An indexed dir may require that we update the free space + * list. Reserve a write to the previous node in the list so + * that we don't fail later. + * + * XXX: This can be either a dx_root_block, or an unindexed + * directory tree leaf block. + */ + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + retval = ocfs2_journal_access_dr(handle, + INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } else { + bh = lookup->dl_prev_leaf_bh; + retval = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } + if (retval) { + mlog_errno(retval); + return retval; + } + } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + data_start = di->id2.i_data.id_data; + size = i_size_read(dir); + + BUG_ON(insert_bh != parent_fe_bh); + } + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) data_start; + while (1) { + BUG_ON((char *)de >= (size + data_start)); + + /* These checks should've already been passed by the + * prepare function, but I guess we can leave them + * here anyway. */ + if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { + retval = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + retval = -EEXIST; + goto bail; + } + + /* We're guaranteed that we should have space, so we + * can't possibly have hit the trailer...right? */ + mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size), + "Hit dir trailer trying to insert %.*s " + "(namelen %d) into directory %llu. " + "offset is %lu, trailer offset is %d\n", + namelen, name, namelen, + (unsigned long long)parent_fe_bh->b_blocknr, + offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); + + if (ocfs2_dirent_would_fit(de, rec_len)) { + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + + if (insert_bh == parent_fe_bh) + status = ocfs2_journal_access_di(handle, + INODE_CACHE(dir), + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + else { + status = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_insert(dir, + handle, + lookup); + if (status) { + mlog_errno(status); + goto bail; + } + } + } + + /* By now the buffer is marked for journaling */ + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + de1 = (struct ocfs2_dir_entry *)((char *) de + + OCFS2_DIR_REC_LEN(de->name_len)); + de1->rec_len = + cpu_to_le16(le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len)); + de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + de = de1; + } + de->file_type = OCFS2_FT_UNKNOWN; + if (blkno) { + de->inode = cpu_to_le64(blkno); + ocfs2_set_de_type(de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + + if (ocfs2_dir_indexed(dir)) + ocfs2_recalc_free_list(dir, handle, lookup); + + dir->i_version++; + ocfs2_journal_dirty(handle, insert_bh); + retval = 0; + goto bail; + } + + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); + } + + /* when you think about it, the assert above should prevent us + * from ever getting here. */ + retval = -ENOSPC; +bail: + if (retval) + mlog_errno(retval); + + return retval; +} + +static int ocfs2_dir_foreach_blk_id(struct inode *inode, + u64 *f_version, + struct dir_context *ctx) +{ + int ret, i; + unsigned long offset = ctx->pos; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + struct ocfs2_dir_entry *de; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + while (ctx->pos < i_size_read(inode)) { + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (*f_version != inode->i_version) { + for (i = 0; i < i_size_read(inode) && i < offset; ) { + de = (struct ocfs2_dir_entry *) + (data->id_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + ctx->pos = offset = i; + *f_version = inode->i_version; + } + + de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); + if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) { + /* On error, skip the f_pos to the end. */ + ctx->pos = i_size_read(inode); + break; + } + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + + if (!dir_emit(ctx, de->name, de->name_len, + le64_to_cpu(de->inode), d_type)) + goto out; + } + ctx->pos += le16_to_cpu(de->rec_len); + } +out: + brelse(di_bh); + return 0; +} + +/* + * NOTE: This function can be called against unindexed directories, + * and indexed ones. + */ +static int ocfs2_dir_foreach_blk_el(struct inode *inode, + u64 *f_version, + struct dir_context *ctx, + bool persist) +{ + unsigned long offset, blk, last_ra_blk = 0; + int i; + struct buffer_head * bh, * tmp; + struct ocfs2_dir_entry * de; + struct super_block * sb = inode->i_sb; + unsigned int ra_sectors = 16; + int stored = 0; + + bh = NULL; + + offset = ctx->pos & (sb->s_blocksize - 1); + + while (ctx->pos < i_size_read(inode)) { + blk = ctx->pos >> sb->s_blocksize_bits; + if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { + /* Skip the corrupt dirblock and keep trying */ + ctx->pos += sb->s_blocksize - offset; + continue; + } + + /* The idea here is to begin with 8k read-ahead and to stay + * 4k ahead of our current position. + * + * TODO: Use the pagecache for this. We just need to + * make sure it's cluster-safe... */ + if (!last_ra_blk + || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { + for (i = ra_sectors >> (sb->s_blocksize_bits - 9); + i > 0; i--) { + tmp = NULL; + if (!ocfs2_read_dir_block(inode, ++blk, &tmp, + OCFS2_BH_READAHEAD)) + brelse(tmp); + } + last_ra_blk = blk; + ra_sectors = 8; + } + + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (*f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ocfs2_dir_entry *) (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) + | offset; + *f_version = inode->i_version; + } + + while (ctx->pos < i_size_read(inode) + && offset < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + offset); + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; + brelse(bh); + continue; + } + if (le64_to_cpu(de->inode)) { + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + if (!dir_emit(ctx, de->name, + de->name_len, + le64_to_cpu(de->inode), + d_type)) { + brelse(bh); + return 0; + } + stored++; + } + offset += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse(bh); + bh = NULL; + if (!persist && stored) + break; + } + return 0; +} + +static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, + struct dir_context *ctx, + bool persist) +{ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_dir_foreach_blk_id(inode, f_version, ctx); + return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist); +} + +/* + * This is intended to be called from inside other kernel functions, + * so we fake some arguments. + */ +int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx) +{ + u64 version = inode->i_version; + ocfs2_dir_foreach_blk(inode, &version, ctx, true); + return 0; +} + +/* + * ocfs2_readdir() + * + */ +int ocfs2_readdir(struct file *file, struct dir_context *ctx) +{ + int error = 0; + struct inode *inode = file_inode(file); + int lock_level = 0; + + trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); + + error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); + if (lock_level && error >= 0) { + /* We release EX lock which used to update atime + * and get PR lock again to reduce contention + * on commonly accessed directories. */ + ocfs2_inode_unlock(inode, 1); + lock_level = 0; + error = ocfs2_inode_lock(inode, NULL, 0); + } + if (error < 0) { + if (error != -ENOENT) + mlog_errno(error); + /* we haven't got any yet, so propagate the error. */ + goto bail_nolock; + } + + error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); + + ocfs2_inode_unlock(inode, lock_level); + if (error) + mlog_errno(error); + +bail_nolock: + + return error; +} + +/* + * NOTE: this should always be called with parent dir i_mutex taken. + */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct ocfs2_dir_lookup_result *lookup) +{ + int status = -ENOENT; + + trace_ocfs2_find_files_on_disk(namelen, name, blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_find_entry(name, namelen, inode, lookup); + if (status) + goto leave; + + *blkno = le64_to_cpu(lookup->dl_entry->inode); + + status = 0; +leave: + + return status; +} + +/* + * Convenience function for callers which just want the block number + * mapped to a name and don't require the full dirent info, etc. + */ +int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, + int namelen, u64 *blkno) +{ + int ret; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup); + ocfs2_free_dir_lookup_result(&lookup); + + return ret; +} + +/* Check for a name within a directory. + * + * Return 0 if the name does not exist + * Return -EEXIST if the directory contains the name + * + * Callers should have i_mutex + a cluster lock on dir + */ +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen) +{ + int ret = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + trace_ocfs2_check_dir_for_entry( + (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); + + if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { + ret = -EEXIST; + mlog_errno(ret); + } + + ocfs2_free_dir_lookup_result(&lookup); + + return ret; +} + +struct ocfs2_empty_dir_priv { + struct dir_context ctx; + unsigned seen_dot; + unsigned seen_dot_dot; + unsigned seen_other; + unsigned dx_dir; +}; +static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, + int name_len, loff_t pos, u64 ino, + unsigned type) +{ + struct ocfs2_empty_dir_priv *p = + container_of(ctx, struct ocfs2_empty_dir_priv, ctx); + + /* + * Check the positions of "." and ".." records to be sure + * they're in the correct place. + * + * Indexed directories don't need to proceed past the first + * two entries, so we end the scan after seeing '..'. Despite + * that, we allow the scan to proceed In the event that we + * have a corrupted indexed directory (no dot or dot dot + * entries). This allows us to double check for existing + * entries which might not have been found in the index. + */ + if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { + p->seen_dot = 1; + return 0; + } + + if (name_len == 2 && !strncmp("..", name, 2) && + pos == OCFS2_DIR_REC_LEN(1)) { + p->seen_dot_dot = 1; + + if (p->dx_dir && p->seen_dot) + return 1; + + return 0; + } + + p->seen_other = 1; + return 1; +} + +static int ocfs2_empty_dir_dx(struct inode *inode, + struct ocfs2_empty_dir_priv *priv) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_dx_root_block *dx_root; + + priv->dx_dir = 1; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(inode, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (le32_to_cpu(dx_root->dr_num_entries) != 2) + priv->seen_other = 1; + +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + * + * Returns 1 if dir is empty, zero otherwise. + * + * XXX: This is a performance problem for unindexed directories. + */ +int ocfs2_empty_dir(struct inode *inode) +{ + int ret; + struct ocfs2_empty_dir_priv priv = { + .ctx.actor = ocfs2_empty_dir_filldir, + }; + + if (ocfs2_dir_indexed(inode)) { + ret = ocfs2_empty_dir_dx(inode, &priv); + if (ret) + mlog_errno(ret); + /* + * We still run ocfs2_dir_foreach to get the checks + * for "." and "..". + */ + } + + ret = ocfs2_dir_foreach(inode, &priv.ctx); + if (ret) + mlog_errno(ret); + + if (!priv.seen_dot || !priv.seen_dot_dot) { + mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + /* + * XXX: Is it really safe to allow an unlink to continue? + */ + return 1; + } + + return !priv.seen_other; +} + +/* + * Fills "." and ".." dirents in a new directory block. Returns dirent for + * "..", which might be used during creation of a directory with a trailing + * header. It is otherwise safe to ignore the return code. + */ +static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode, + struct inode *parent, + char *start, + unsigned int size) +{ + struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; + + de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + de->name_len = 1; + de->rec_len = + cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + strcpy(de->name, "."); + ocfs2_set_de_type(de, S_IFDIR); + + de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); + de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy(de->name, ".."); + ocfs2_set_de_type(de, S_IFDIR); + + return de; +} + +/* + * This works together with code in ocfs2_mknod_locked() which sets + * the inline-data flag and initializes the inline-data section. + */ +static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inline_data *data = &di->id2.i_data; + unsigned int size = le16_to_cpu(data->id_count); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); + ocfs2_journal_dirty(handle, di_bh); + + i_size_write(inode, size); + set_nlink(inode, 2); + inode->i_blocks = ocfs2_inode_sector_count(inode); + + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct buffer_head **ret_new_bh) +{ + int status; + unsigned int size = osb->sb->s_blocksize; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry *de; + + if (ocfs2_new_dir_wants_trailer(inode)) + size = ocfs2_dir_trailer_blk_off(parent->i_sb); + + status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, + data_ac, NULL, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, osb->sb->s_blocksize); + + de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); + if (ocfs2_new_dir_wants_trailer(inode)) { + int size = le16_to_cpu(de->rec_len); + + /* + * Figure out the size of the hole left over after + * insertion of '.' and '..'. The trailer wants this + * information. + */ + size -= OCFS2_DIR_REC_LEN(2); + size -= sizeof(struct ocfs2_dir_block_trailer); + + ocfs2_init_dir_trailer(inode, new_bh, size); + } + + ocfs2_journal_dirty(handle, new_bh); + + i_size_write(inode, inode->i_sb->s_blocksize); + set_nlink(inode, 2); + inode->i_blocks = ocfs2_inode_sector_count(inode); + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; + if (ret_new_bh) { + *ret_new_bh = new_bh; + new_bh = NULL; + } +bail: + brelse(new_bh); + + return status; +} + +static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dirdata_bh, + struct ocfs2_alloc_context *meta_ac, + int dx_inline, u32 num_entries, + struct buffer_head **ret_dx_root_bh) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + u16 dr_suballoc_bit; + u64 suballoc_loc, dr_blkno; + unsigned int num_bits; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &dr_suballoc_bit, &num_bits, &dr_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_dx_dir_attach_index( + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)dr_blkno); + + dx_root_bh = sb_getblk(osb->sb, dr_blkno); + if (dx_root_bh == NULL) { + ret = -ENOMEM; + goto out; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + memset(dx_root, 0, osb->sb->s_blocksize); + strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); + dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc); + dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); + dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); + dx_root->dr_blkno = cpu_to_le64(dr_blkno); + dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); + dx_root->dr_num_entries = cpu_to_le32(num_entries); + if (le16_to_cpu(trailer->db_free_rec_len)) + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + else + dx_root->dr_free_blk = cpu_to_le64(0); + + if (dx_inline) { + dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; + dx_root->dr_entries.de_count = + cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb)); + } else { + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + } + ocfs2_journal_dirty(handle, dx_root_bh); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + di->i_dx_root = cpu_to_le64(dr_blkno); + + spin_lock(&OCFS2_I(dir)->ip_lock); + OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + ocfs2_journal_dirty(handle, di_bh); + + *ret_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + +out: + brelse(dx_root_bh); + return ret; +} + +static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 start_blk) +{ + int ret, i; + struct ocfs2_dx_leaf *dx_leaf; + struct buffer_head *bh; + + for (i = 0; i < num_dx_leaves; i++) { + bh = sb_getblk(osb->sb, start_blk + i); + if (bh == NULL) { + ret = -ENOMEM; + goto out; + } + dx_leaves[i] = bh; + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh); + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data; + + memset(dx_leaf, 0, osb->sb->s_blocksize); + strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE); + dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation); + dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr); + dx_leaf->dl_list.de_count = + cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); + + trace_ocfs2_dx_dir_format_cluster( + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(dx_leaf->dl_list.de_count)); + + ocfs2_journal_dirty(handle, bh); + } + + ret = 0; +out: + return ret; +} + +/* + * Allocates and formats a new cluster for use in an indexed dir + * leaf. This version will not do the extent insert, so that it can be + * used by operations which need careful ordering. + */ +static int __ocfs2_dx_dir_new_cluster(struct inode *dir, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 *ret_phys_blkno) +{ + int ret; + u32 phys, num; + u64 phys_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + /* + * XXX: For create, this should claim cluster for the index + * *before* the unindexed insert so that we have a better + * chance of contiguousness as the directory grows in number + * of entries. + */ + ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Format the new cluster first. That way, we're inserting + * valid data. + */ + phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys); + ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves, + num_dx_leaves, phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_phys_blkno = phys_blkno; +out: + return ret; +} + +static int ocfs2_dx_dir_new_cluster(struct inode *dir, + struct ocfs2_extent_tree *et, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves) +{ + int ret; + u64 phys_blkno; + + ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, + num_dx_leaves, &phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0, + meta_ac); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb, + int *ret_num_leaves) +{ + int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1); + struct buffer_head **dx_leaves; + + dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *), + GFP_NOFS); + if (dx_leaves && ret_num_leaves) + *ret_num_leaves = num_dx_leaves; + + return dx_leaves; +} + +static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *leaf_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_hinfo hinfo; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * Our strategy is to create the directory as though it were + * unindexed, then add the index block. This works with very + * little complication since the state of a new directory is a + * very well known quantity. + * + * Essentially, we have two dirents ("." and ".."), in the 1st + * block which need indexing. These are easily inserted into + * the index block. + */ + + ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh, + data_ac, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh, + meta_ac, 1, 2, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ + ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + + ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + +out: + brelse(dx_root_bh); + brelse(leaf_bh); + return ret; +} + +int ocfs2_fill_new_dir(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) + +{ + BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); + + if (ocfs2_supports_indexed_dirs(osb)) + return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh, + data_ac, meta_ac); + + return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, + data_ac, NULL); +} + +static int ocfs2_dx_dir_index_block(struct inode *dir, + handle_t *handle, + struct buffer_head **dx_leaves, + int num_dx_leaves, + u32 *num_dx_entries, + struct buffer_head *dirent_bh) +{ + int ret = 0, namelen, i; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct buffer_head *dx_leaf_bh; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + namelen = de->name_len; + if (!namelen || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo); + + i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo); + dx_leaf_bh = dx_leaves[i]; + + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo, + dirent_blk, dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *num_dx_entries = *num_dx_entries + 1; + +inc: + de_buf += le16_to_cpu(de->rec_len); + } + +out: + return ret; +} + +/* + * XXX: This expects dx_root_bh to already be part of the transaction. + */ +static void ocfs2_dx_dir_index_root_block(struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dirent_bh) +{ + char *de_buf, *limit; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_entry *de; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (!de->name_len || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); + + trace_ocfs2_dx_dir_index_root_block( + (unsigned long long)dir->i_ino, + hinfo.major_hash, hinfo.minor_hash, + de->name_len, de->name, + le16_to_cpu(dx_root->dr_entries.de_num_used)); + + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, + dirent_blk); + + le32_add_cpu(&dx_root->dr_num_entries, 1); +inc: + de_buf += le16_to_cpu(de->rec_len); + } +} + +/* + * Count the number of inline directory entries in di_bh and compare + * them against the number of entries we can hold in an inline dx root + * block. + */ +static int ocfs2_new_dx_should_be_inline(struct inode *dir, + struct buffer_head *di_bh) +{ + int dirent_count = 0; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + de_buf = di->id2.i_data.id_data; + limit = de_buf + i_size_read(dir); + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (de->name_len && de->inode) + dirent_count++; + + de_buf += le16_to_cpu(de->rec_len); + } + + /* We are careful to leave room for one extra record. */ + return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb); +} + +/* + * Expand rec_len of the rightmost dirent in a directory block so that it + * contains the end of our valid space for dirents. We do this during + * expansion from an inline directory to one with extents. The first dir block + * in that case is taken from the inline data portion of the inode block. + * + * This will also return the largest amount of contiguous space for a dirent + * in the block. That value is *not* necessarily the last dirent, even after + * expansion. The directory indexing code wants this value for free space + * accounting. We do this here since we're already walking the entire dir + * block. + * + * We add the dir trailer if this filesystem wants it. + */ +static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size, + struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + struct ocfs2_dir_entry *de; + struct ocfs2_dir_entry *prev_de; + char *de_buf, *limit; + unsigned int new_size = sb->s_blocksize; + unsigned int bytes, this_hole; + unsigned int largest_hole = 0; + + if (ocfs2_new_dir_wants_trailer(dir)) + new_size = ocfs2_dir_trailer_blk_off(sb); + + bytes = new_size - old_size; + + limit = start + old_size; + de_buf = start; + de = (struct ocfs2_dir_entry *)de_buf; + do { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + + prev_de = de; + de_buf += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)de_buf; + } while (de_buf < limit); + + le16_add_cpu(&prev_de->rec_len, bytes); + + /* We need to double check this after modification of the final + * dirent. */ + this_hole = ocfs2_figure_dirent_hole(prev_de); + if (this_hole > largest_hole) + largest_hole = this_hole; + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; +} + +/* + * We allocate enough clusters to fulfill "blocks_wanted", but set + * i_size to exactly one block. Ocfs2_extend_dir() will handle the + * rest automatically for us. + * + * *first_block_bh is a pointer to the 1st data block allocated to the + * directory. + */ +static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, + unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, + struct buffer_head **first_block_bh) +{ + u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0; + struct super_block *sb = dir->i_sb; + int ret, i, num_dx_leaves = 0, dx_inline = 0, + credits = ocfs2_inline_to_extents_credits(sb); + u64 dx_insert_blkno, blkno, + bytes = blocks_wanted << sb->s_blocksize_bits; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(dir); + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct buffer_head *dirdata_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct buffer_head **dx_leaves = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + handle_t *handle; + struct ocfs2_extent_tree et; + struct ocfs2_extent_tree dx_et; + int did_quota = 0, bytes_allocated = 0; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh); + + alloc = ocfs2_clusters_for_bytes(sb, bytes); + dx_alloc = 0; + + down_write(&oi->ip_alloc_sem); + + if (ocfs2_supports_indexed_dirs(osb)) { + credits += ocfs2_add_dir_index_credits(sb); + + dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh); + if (!dx_inline) { + /* Add one more cluster for an index leaf */ + dx_alloc++; + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, + &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + /* This gets us the dx_root */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * We should never need more than 2 clusters for the unindexed + * tree - maximum dirent size is far less than one block. In + * fact, the only time we'd need more than one cluster is if + * blocksize == clustersize and the dirent won't fit in the + * extra space that the expansion to a single block gives. As + * of today, that only happens on 4k/4k file systems. + */ + BUG_ON(alloc > 2); + + ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Prepare for worst case allocation scenario of two separate + * extents in the unindexed tree. + */ + if (alloc == 2) + credits += OCFS2_SUBALLOC_ALLOC; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); + if (ret) + goto out_commit; + did_quota = 1; + + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Allocate our index cluster first, to maximize the + * possibility that unindexed leaves grow + * contiguously. + */ + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, + dx_leaves, num_dx_leaves, + &dx_insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + } + + /* + * Try to claim as many clusters as the bitmap can give though + * if we only get one now, that's enough to continue. The rest + * will be claimed after the conversion to extents. + */ + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &oi->ip_la_data_resv; + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + + /* + * Operations are carefully ordered so that we set up the new + * data block first. The conversion from inline data to + * extents follows. + */ + blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); + dirdata_bh = sb_getblk(sb, blkno); + if (!dirdata_bh) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh); + + ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); + memset(dirdata_bh->b_data + i_size_read(dir), 0, + sb->s_blocksize - i_size_read(dir)); + i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir); + if (ocfs2_new_dir_wants_trailer(dir)) { + /* + * Prepare the dir trailer up front. It will otherwise look + * like a valid dirent. Even if inserting the index fails + * (unlikely), then all we'll have done is given first dir + * block a small amount of fragmentation. + */ + ocfs2_init_dir_trailer(dir, dirdata_bh, i); + } + + ocfs2_update_inode_fsync_trans(handle, dir, 1); + ocfs2_journal_dirty(handle, dirdata_bh); + + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Dx dirs with an external cluster need to do this up + * front. Inline dx root's get handled later, after + * we've allocated our root block. We get passed back + * a total number of items so that dr_num_entries can + * be correctly set once the dx_root has been + * allocated. + */ + ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves, + num_dx_leaves, &num_dx_entries, + dirdata_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Set extent, i_size, etc on the directory. After this, the + * inode should contain the same exact dirents as before and + * be fully accessible from system calls. + * + * We let the later dirent insert modify c/mtime - to the user + * the data hasn't changed. + */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_dinode_new_extent_list(dir, di); + + i_size_write(dir, sb->s_blocksize); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + + di->i_size = cpu_to_le64(sb->s_blocksize); + di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, dir, 1); + + /* + * This should never fail as our extent list is empty and all + * related blocks have been journaled already. + */ + ret = ocfs2_insert_extent(handle, &et, 0, blkno, len, + 0, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Set i_blocks after the extent insert for the most up to + * date ip_clusters value. + */ + dir->i_blocks = ocfs2_inode_sector_count(dir); + + ocfs2_journal_dirty(handle, di_bh); + + if (ocfs2_supports_indexed_dirs(osb)) { + ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, + dirdata_bh, meta_ac, dx_inline, + num_dx_entries, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (dx_inline) { + ocfs2_dx_dir_index_root_block(dir, dx_root_bh, + dirdata_bh); + } else { + ocfs2_init_dx_root_extent_tree(&dx_et, + INODE_CACHE(dir), + dx_root_bh); + ret = ocfs2_insert_extent(handle, &dx_et, 0, + dx_insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + } + } + + /* + * We asked for two clusters, but only got one in the 1st + * pass. Claim the 2nd cluster as a separate extent. + */ + if (alloc > len) { + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, + &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); + + ret = ocfs2_insert_extent(handle, &et, 1, + blkno, len, 0, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + } + + *first_block_bh = dirdata_bh; + dirdata_bh = NULL; + if (ocfs2_supports_indexed_dirs(osb)) { + unsigned int off; + + if (!dx_inline) { + /* + * We need to return the correct block within the + * cluster which should hold our entry. + */ + off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + &lookup->dl_hinfo); + get_bh(dx_leaves[off]); + lookup->dl_dx_leaf_bh = dx_leaves[off]; + } + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + } + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, bytes_allocated); + + ocfs2_commit_trans(osb, handle); + +out: + up_write(&oi->ip_alloc_sem); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } + + brelse(dirdata_bh); + brelse(dx_root_bh); + + return ret; +} + +/* returns a bh of the 1st new block in the allocation. */ +static int ocfs2_do_extend_dir(struct super_block *sb, + handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh) +{ + int status; + int extend, did_quota = 0; + u64 p_blkno, v_blkno; + + spin_lock(&OCFS2_I(dir)->ip_lock); + extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + if (extend) { + u32 offset = OCFS2_I(dir)->ip_clusters; + + status = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1)); + if (status) + goto bail; + did_quota = 1; + + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, + 1, 0, parent_fe_bh, handle, + data_ac, meta_ac, NULL); + BUG_ON(status == -EAGAIN); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); + status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_bh = sb_getblk(sb, p_blkno); + if (!*new_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + status = 0; +bail: + if (did_quota && status < 0) + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); + return status; +} + +/* + * Assumes you already have a cluster lock on the directory. + * + * 'blocks_wanted' is only used if we have an inline directory which + * is to be turned into an extent based one. The size of the dirent to + * insert might be larger than the space gained by growing to just one + * block, so we may have to grow the inode by two blocks in that case. + * + * If the directory is already indexed, dx_root_bh must be provided. + */ +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, + struct buffer_head **new_de_bh) +{ + int status = 0; + int credits, num_free_extents, drop_alloc_sem = 0; + loff_t dir_i_size; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_extent_list *el = &fe->id2.i_list; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry * de; + struct super_block *sb = osb->sb; + struct ocfs2_extent_tree et; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * This would be a code error as an inline directory should + * never have an index root. + */ + BUG_ON(dx_root_bh); + + status = ocfs2_expand_inline_dir(dir, parent_fe_bh, + blocks_wanted, lookup, + &new_bh); + if (status) { + mlog_errno(status); + goto bail; + } + + /* Expansion from inline to an indexed directory will + * have given us this. */ + dx_root_bh = lookup->dl_dx_root_bh; + + if (blocks_wanted == 1) { + /* + * If the new dirent will fit inside the space + * created by pushing out to one block, then + * we can complete the operation + * here. Otherwise we have to expand i_size + * and format the 2nd block below. + */ + BUG_ON(new_bh == NULL); + goto bail_bh; + } + + /* + * Get rid of 'new_bh' - we want to format the 2nd + * data block and return that instead. + */ + brelse(new_bh); + new_bh = NULL; + + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; + dir_i_size = i_size_read(dir); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + goto do_extend; + } + + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; + dir_i_size = i_size_read(dir); + trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno, + dir_i_size); + + /* dir->i_size is always block aligned. */ + spin_lock(&OCFS2_I(dir)->ip_lock); + if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { + spin_unlock(&OCFS2_I(dir)->ip_lock); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), + parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, &et); + if (num_free_extents < 0) { + status = num_free_extents; + mlog_errno(status); + goto bail; + } + + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; + + credits = ocfs2_calc_extend_credits(sb, el); + } else { + spin_unlock(&OCFS2_I(dir)->ip_lock); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + } + +do_extend: + if (ocfs2_dir_indexed(dir)) + credits++; /* For attaching the new dirent block to the + * dx_root */ + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, + data_ac, meta_ac, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh); + + status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, sb->s_blocksize); + + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = 0; + if (ocfs2_supports_dir_trailer(dir)) { + de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); + + ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len)); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_link_trailer(dir, handle, + dx_root_bh, new_bh); + if (status) { + mlog_errno(status); + goto bail; + } + } + } else { + de->rec_len = cpu_to_le16(sb->s_blocksize); + } + ocfs2_update_inode_fsync_trans(handle, dir, 1); + ocfs2_journal_dirty(handle, new_bh); + + dir_i_size += dir->i_sb->s_blocksize; + i_size_write(dir, dir_i_size); + dir->i_blocks = ocfs2_inode_sector_count(dir); + status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail_bh: + *new_de_bh = new_bh; + get_bh(*new_de_bh); +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + if (drop_alloc_sem) + up_write(&OCFS2_I(dir)->ip_alloc_sem); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + brelse(new_bh); + + return status; +} + +static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, + const char *name, int namelen, + struct buffer_head **ret_de_bh, + unsigned int *blocks_wanted) +{ + int ret; + struct super_block *sb = dir->i_sb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dir_entry *de, *last_de = NULL; + char *de_buf, *limit; + unsigned long offset = 0; + unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + + /* + * This calculates how many free bytes we'd have in block zero, should + * this function force expansion to an extent tree. + */ + if (ocfs2_new_dir_wants_trailer(dir)) + free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); + else + free_space = dir->i_sb->s_blocksize - i_size_read(dir); + + de_buf = di->id2.i_data.id_data; + limit = de_buf + i_size_read(dir); + rec_len = OCFS2_DIR_REC_LEN(namelen); + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) { + ret = -ENOENT; + goto out; + } + if (ocfs2_match(namelen, name, de)) { + ret = -EEXIST; + goto out; + } + /* + * No need to check for a trailing dirent record here as + * they're not used for inline dirs. + */ + + if (ocfs2_dirent_would_fit(de, rec_len)) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. */ + *ret_de_bh = di_bh; + get_bh(*ret_de_bh); + ret = 0; + goto out; + } + + last_de = de; + de_buf += le16_to_cpu(de->rec_len); + offset += le16_to_cpu(de->rec_len); + } + + /* + * We're going to require expansion of the directory - figure + * out how many blocks we'll need so that a place for the + * dirent can be found. + */ + *blocks_wanted = 1; + new_rec_len = le16_to_cpu(last_de->rec_len) + free_space; + if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) + *blocks_wanted = 2; + + ret = -ENOSPC; +out: + return ret; +} + +static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, + int namelen, struct buffer_head **ret_de_bh) +{ + unsigned long offset; + struct buffer_head *bh = NULL; + unsigned short rec_len; + struct ocfs2_dir_entry *de; + struct super_block *sb = dir->i_sb; + int status; + int blocksize = dir->i_sb->s_blocksize; + + status = ocfs2_read_dir_block(dir, 0, &bh, 0); + if (status) + goto bail; + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (1) { + if ((char *)de >= sb->s_blocksize + bh->b_data) { + brelse(bh); + bh = NULL; + + if (i_size_read(dir) <= offset) { + /* + * Caller will have to expand this + * directory. + */ + status = -ENOSPC; + goto bail; + } + status = ocfs2_read_dir_block(dir, + offset >> sb->s_blocksize_bits, + &bh, 0); + if (status) + goto bail; + + /* move to next block */ + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + status = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + status = -EEXIST; + goto bail; + } + + if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize, + blocksize)) + goto next; + + if (ocfs2_dirent_would_fit(de, rec_len)) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. */ + *ret_de_bh = bh; + get_bh(*ret_de_bh); + status = 0; + goto bail; + } +next: + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); + } + +bail: + brelse(bh); + if (status) + mlog_errno(status); + + return status; +} + +static int dx_leaf_sort_cmp(const void *a, const void *b) +{ + const struct ocfs2_dx_entry *entry1 = a; + const struct ocfs2_dx_entry *entry2 = b; + u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash); + u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash); + u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash); + u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash); + + if (major_hash1 > major_hash2) + return 1; + if (major_hash1 < major_hash2) + return -1; + + /* + * It is not strictly necessary to sort by minor + */ + if (minor_hash1 > minor_hash2) + return 1; + if (minor_hash1 < minor_hash2) + return -1; + return 0; +} + +static void dx_leaf_sort_swap(void *a, void *b, int size) +{ + struct ocfs2_dx_entry *entry1 = a; + struct ocfs2_dx_entry *entry2 = b; + struct ocfs2_dx_entry tmp; + + BUG_ON(size != sizeof(*entry1)); + + tmp = *entry1; + *entry1 = *entry2; + *entry2 = tmp; +} + +static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num = le16_to_cpu(dl_list->de_num_used); + + for (i = 0; i < (num - 1); i++) { + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) != + le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash)) + return 0; + } + + return 1; +} + +/* + * Find the optimal value to split this leaf on. This expects the leaf + * entries to be in sorted order. + * + * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is + * the hash we want to insert. + * + * This function is only concerned with the major hash - that which + * determines which cluster an item belongs to. + */ +static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf, + u32 leaf_cpos, u32 insert_hash, + u32 *split_hash) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num_used = le16_to_cpu(dl_list->de_num_used); + int allsame; + + /* + * There's a couple rare, but nasty corner cases we have to + * check for here. All of them involve a leaf where all value + * have the same hash, which is what we look for first. + * + * Most of the time, all of the above is false, and we simply + * pick the median value for a split. + */ + allsame = ocfs2_dx_leaf_same_major(dx_leaf); + if (allsame) { + u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash); + + if (val == insert_hash) { + /* + * No matter where we would choose to split, + * the new entry would want to occupy the same + * block as these. Since there's no space left + * in their existing block, we know there + * won't be space after the split. + */ + return -ENOSPC; + } + + if (val == leaf_cpos) { + /* + * Because val is the same as leaf_cpos (which + * is the smallest value this leaf can have), + * yet is not equal to insert_hash, then we + * know that insert_hash *must* be larger than + * val (and leaf_cpos). At least cpos+1 in value. + * + * We also know then, that there cannot be an + * adjacent extent (otherwise we'd be looking + * at it). Choosing this value gives us a + * chance to get some contiguousness. + */ + *split_hash = leaf_cpos + 1; + return 0; + } + + if (val > insert_hash) { + /* + * val can not be the same as insert hash, and + * also must be larger than leaf_cpos. Also, + * we know that there can't be a leaf between + * cpos and val, otherwise the entries with + * hash 'val' would be there. + */ + *split_hash = val; + return 0; + } + + *split_hash = insert_hash; + return 0; + } + + /* + * Since the records are sorted and the checks above + * guaranteed that not all records in this block are the same, + * we simple travel forward, from the median, and pick the 1st + * record whose value is larger than leaf_cpos. + */ + for (i = (num_used / 2); i < num_used; i++) + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) > + leaf_cpos) + break; + + BUG_ON(i == num_used); /* Should be impossible */ + *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash); + return 0; +} + +/* + * Transfer all entries in orig_dx_leaves whose major hash is equal to or + * larger than split_hash into new_dx_leaves. We use a temporary + * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks. + * + * Since the block offset inside a leaf (cluster) is a constant mask + * of minor_hash, we can optimize - an item at block offset X within + * the original cluster, will be at offset X within the new cluster. + */ +static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, + handle_t *handle, + struct ocfs2_dx_leaf *tmp_dx_leaf, + struct buffer_head **orig_dx_leaves, + struct buffer_head **new_dx_leaves, + int num_dx_leaves) +{ + int i, j, num_used; + u32 major_hash; + struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; + struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry *dx_entry; + + tmp_list = &tmp_dx_leaf->dl_list; + + for (i = 0; i < num_dx_leaves; i++) { + orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; + orig_list = &orig_dx_leaf->dl_list; + new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; + new_list = &new_dx_leaf->dl_list; + + num_used = le16_to_cpu(orig_list->de_num_used); + + memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize); + tmp_list->de_num_used = cpu_to_le16(0); + memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used); + + for (j = 0; j < num_used; j++) { + dx_entry = &orig_list->de_entries[j]; + major_hash = le32_to_cpu(dx_entry->dx_major_hash); + if (major_hash >= split_hash) + ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf, + dx_entry); + else + ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf, + dx_entry); + } + memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize); + + ocfs2_journal_dirty(handle, orig_dx_leaves[i]); + ocfs2_journal_dirty(handle, new_dx_leaves[i]); + } +} + +static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, + struct ocfs2_dx_root_block *dx_root) +{ + int credits = ocfs2_clusters_to_blocks(osb->sb, 2); + + credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list); + credits += ocfs2_quota_trans_credits(osb->sb); + return credits; +} + +/* + * Find the median value in dx_leaf_bh and allocate a new leaf to move + * half our entries into. + */ +static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dx_leaf_bh, + struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos, + u64 leaf_blkno) +{ + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + int credits, ret, i, num_used, did_quota = 0; + u32 cpos, split_hash, insert_hash = hinfo->major_hash; + u64 orig_leaves_start; + int num_dx_leaves; + struct buffer_head **orig_dx_leaves = NULL; + struct buffer_head **new_dx_leaves = NULL; + struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL; + struct ocfs2_extent_tree et; + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; + + trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, + insert_hash); + + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + /* + * XXX: This is a rather large limit. We should use a more + * realistic value. + */ + if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX) + return -ENOSPC; + + num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used); + if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) { + mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: " + "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, num_used); + ret = -EIO; + goto out; + } + + orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!orig_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL); + if (!new_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * This block is changing anyway, so we can sort it in place. + */ + sort(dx_leaf->dl_list.de_entries, num_used, + sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, + dx_leaf_sort_swap); + + ocfs2_journal_dirty(handle, dx_leaf_bh); + + ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, + &split_hash); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash); + + /* + * We have to carefully order operations here. There are items + * which want to be in the new cluster before insert, but in + * order to put those items in the new cluster, we alter the + * old cluster. A failure to insert gets nasty. + * + * So, start by reserving writes to the old + * cluster. ocfs2_dx_dir_new_cluster will reserve writes on + * the new cluster for us, before inserting it. The insert + * won't happen if there's an error before that. Once the + * insert is done then, we can transfer from one leaf into the + * other without fear of hitting any error. + */ + + /* + * The leaf transfer wants some scratch space so that we don't + * wind up doing a bunch of expensive memmove(). + */ + tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS); + if (!tmp_dx_leaf) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno); + ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves, + orig_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos = split_hash; + ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, + data_ac, meta_ac, new_dx_leaves, + num_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + for (i = 0; i < num_dx_leaves; i++) { + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + orig_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + new_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, + orig_dx_leaves, new_dx_leaves, num_dx_leaves); + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_update_inode_fsync_trans(handle, dir, 1); + ocfs2_commit_trans(osb, handle); + +out: + if (orig_dx_leaves || new_dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) { + if (orig_dx_leaves) + brelse(orig_dx_leaves[i]); + if (new_dx_leaves) + brelse(new_dx_leaves[i]); + } + kfree(orig_dx_leaves); + kfree(new_dx_leaves); + } + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + kfree(tmp_dx_leaf); + return ret; +} + +static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh, + const char *name, int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, rebalanced = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + u64 blkno; + u32 leaf_cpos; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + +restart_search: + ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, + &leaf_cpos, &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + + if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= + le16_to_cpu(dx_leaf->dl_list.de_count)) { + if (rebalanced) { + /* + * Rebalancing should have provided us with + * space in an appropriate leaf. + * + * XXX: Is this an abnormal condition then? + * Should we print a message here? + */ + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, + &lookup->dl_hinfo, leaf_cpos, + blkno); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + /* + * Restart the lookup. The rebalance might have + * changed which block our item fits into. Mark our + * progress, so we only execute this once. + */ + brelse(dx_leaf_bh); + dx_leaf_bh = NULL; + rebalanced = 1; + goto restart_search; + } + + lookup->dl_dx_leaf_bh = dx_leaf_bh; + dx_leaf_bh = NULL; + +out: + brelse(dx_leaf_bh); + return ret; +} + +static int ocfs2_search_dx_free_list(struct inode *dir, + struct buffer_head *dx_root_bh, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = -ENOSPC; + struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL; + struct ocfs2_dir_block_trailer *db; + u64 next_block; + int rec_len = OCFS2_DIR_REC_LEN(namelen); + struct ocfs2_dx_root_block *dx_root; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + next_block = le64_to_cpu(dx_root->dr_free_blk); + + while (next_block) { + brelse(prev_leaf_bh); + prev_leaf_bh = leaf_bh; + leaf_bh = NULL; + + ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (rec_len <= le16_to_cpu(db->db_free_rec_len)) { + lookup->dl_leaf_bh = leaf_bh; + lookup->dl_prev_leaf_bh = prev_leaf_bh; + leaf_bh = NULL; + prev_leaf_bh = NULL; + break; + } + + next_block = le64_to_cpu(db->db_free_next); + } + + if (!next_block) + ret = -ENOSPC; + +out: + + brelse(leaf_bh); + brelse(prev_leaf_bh); + return ret; +} + +static int ocfs2_expand_inline_dx_root(struct inode *dir, + struct buffer_head *dx_root_bh) +{ + int ret, num_dx_leaves, i, j, did_quota = 0; + struct buffer_head **dx_leaves = NULL; + struct ocfs2_extent_tree et; + u64 insert_blkno; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + struct ocfs2_dx_entry *dx_entry; + struct ocfs2_dx_leaf *target_leaf; + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + /* + * We do this up front, before the allocation, so that a + * failure to add the dx_root_bh to the journal won't result + * us losing clusters. + */ + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves, + num_dx_leaves, &insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Transfer the entries from our dx_root into the appropriate + * block + */ + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + j = __ocfs2_dx_dir_hash_idx(osb, + le32_to_cpu(dx_entry->dx_minor_hash)); + target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data; + + ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry); + + /* Each leaf has been passed to the journal already + * via __ocfs2_dx_dir_new_cluster() */ + } + + dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; + memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list)); + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + + /* This should never fail considering we start with an empty + * dx_root. */ + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + did_quota = 0; + + ocfs2_update_inode_fsync_trans(handle, dir, 1); + ocfs2_journal_dirty(handle, dx_root_bh); + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } + return ret; +} + +static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh) +{ + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + if (le16_to_cpu(entry_list->de_num_used) >= + le16_to_cpu(entry_list->de_count)) + return -ENOSPC; + + return 0; +} + +static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir, + struct buffer_head *di_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, free_dx_root = 1; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct buffer_head *leaf_bh = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) { + ret = -ENOSPC; + mlog_errno(ret); + goto out; + } + + if (ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_inline_dx_has_space(dx_root_bh); + + if (ret == 0) + goto search_el; + + /* + * We ran out of room in the root block. Expand it to + * an extent, then allow ocfs2_find_dir_space_dx to do + * the rest. + */ + ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Insert preparation for an indexed directory is split into two + * steps. The call to find_dir_space_dx reserves room in the index for + * an additional item. If we run out of space there, it's a real error + * we can't continue on. + */ + ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name, + namelen, lookup); + if (ret) { + mlog_errno(ret); + goto out; + } + +search_el: + /* + * Next, we need to find space in the unindexed tree. This call + * searches using the free space linked list. If the unindexed tree + * lacks sufficient space, we'll expand it below. The expansion code + * is smart enough to add any new blocks to the free space list. + */ + ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup); + if (ret && ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Do this up here - ocfs2_extend_dir might need the dx_root */ + lookup->dl_dx_root_bh = dx_root_bh; + free_dx_root = 0; + + if (ret == -ENOSPC) { + ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh); + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We make the assumption here that new leaf blocks are added + * to the front of our free list. + */ + lookup->dl_prev_leaf_bh = NULL; + lookup->dl_leaf_bh = leaf_bh; + } + +out: + if (free_dx_root) + brelse(dx_root_bh); + return ret; +} + +/* + * Get a directory ready for insert. Any directory allocation required + * happens here. Success returns zero, and enough context in the dir + * lookup result that ocfs2_add_entry() will be able complete the task + * with minimal performance impact. + */ +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + unsigned int blocks_wanted = 1; + struct buffer_head *bh = NULL; + + trace_ocfs2_prepare_dir_for_insert( + (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); + + if (!namelen) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + /* + * Do this up front to reduce confusion. + * + * The directory might start inline, then be turned into an + * indexed one, in which case we'd need to hash deep inside + * ocfs2_find_dir_space_id(). Since + * ocfs2_prepare_dx_dir_for_insert() also needs this hash + * done, there seems no point in spreading out the calls. We + * can optimize away the case where the file system doesn't + * support indexing. + */ + if (ocfs2_supports_indexed_dirs(osb)) + ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo); + + if (ocfs2_dir_indexed(dir)) { + ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh, + name, namelen, lookup); + if (ret) + mlog_errno(ret); + goto out; + } + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, + namelen, &bh, &blocks_wanted); + } else + ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh); + + if (ret && ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + if (ret == -ENOSPC) { + /* + * We have to expand the directory to add this name. + */ + BUG_ON(bh); + + ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, + lookup, &bh); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + BUG_ON(!bh); + } + + lookup->dl_leaf_bh = bh; + bh = NULL; +out: + brelse(bh); + return ret; +} + +static int ocfs2_dx_dir_remove_index(struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + struct inode *dx_alloc_inode = NULL; + struct buffer_head *dx_alloc_bh = NULL; + handle_t *handle; + u64 blk; + u16 bit; + u64 bg_blkno; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + dx_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(dx_root->dr_suballoc_slot)); + if (!dx_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&dx_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(dir)->ip_lock); + OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); + di->i_dx_root = cpu_to_le64(0ULL); + ocfs2_update_inode_fsync_trans(handle, dir, 1); + + ocfs2_journal_dirty(handle, di_bh); + + blk = le64_to_cpu(dx_root->dr_blkno); + bit = le16_to_cpu(dx_root->dr_suballoc_bit); + if (dx_root->dr_suballoc_loc) + bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, + bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(dx_alloc_inode, 1); + +out_mutex: + mutex_unlock(&dx_alloc_inode->i_mutex); + brelse(dx_alloc_bh); +out: + iput(dx_alloc_inode); + return ret; +} + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) +{ + int ret; + unsigned int uninitialized_var(clen); + u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos); + u64 uninitialized_var(blkno); + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!ocfs2_dir_indexed(dir)) + return 0; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (ocfs2_dx_root_inline(dx_root)) + goto remove_index; + + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + + /* XXX: What if dr_clusters is too large? */ + while (le32_to_cpu(dx_root->dr_clusters)) { + ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list, + major_hash, &cpos, &blkno, &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); + + ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, + &dealloc, 0, false); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos == 0) + break; + + major_hash = cpos - 1; + } + +remove_index: + ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + brelse(dx_root_bh); + return ret; +} diff --git a/kernel/fs/ocfs2/dir.h b/kernel/fs/ocfs2/dir.h new file mode 100644 index 000000000..3d8639f38 --- /dev/null +++ b/kernel/fs/ocfs2/dir.h @@ -0,0 +1,116 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_DIR_H +#define OCFS2_DIR_H + +struct ocfs2_dx_hinfo { + u32 major_hash; + u32 minor_hash; +}; + +struct ocfs2_dir_lookup_result { + struct buffer_head *dl_leaf_bh; /* Unindexed leaf + * block */ + struct ocfs2_dir_entry *dl_entry; /* Target dirent in + * unindexed leaf */ + + struct buffer_head *dl_dx_root_bh; /* Root of indexed + * tree */ + + struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */ + struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in + * indexed leaf */ + struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */ + + struct buffer_head *dl_prev_leaf_bh;/* Previous entry in + * dir free space + * list. NULL if + * previous entry is + * dx root block. */ +}; + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res); + +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup); +int ocfs2_delete_entry(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_lookup_result *res); +int __ocfs2_add_entry(handle_t *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup); +static inline int ocfs2_add_entry(handle_t *handle, + struct dentry *dentry, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup) +{ + return __ocfs2_add_entry(handle, d_inode(dentry->d_parent), + dentry->d_name.name, dentry->d_name.len, + inode, blkno, parent_fe_bh, lookup); +} +int ocfs2_update_entry(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *res, + struct inode *new_entry_inode); + +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen); +int ocfs2_empty_dir(struct inode *inode); + +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct ocfs2_dir_lookup_result *res); +int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, + int namelen, u64 *blkno); +int ocfs2_readdir(struct file *file, struct dir_context *ctx); +int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx); +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup); +struct ocfs2_alloc_context; +int ocfs2_fill_new_dir(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac); + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh); + +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data); +#endif /* OCFS2_DIR_H */ diff --git a/kernel/fs/ocfs2/dlm/Makefile b/kernel/fs/ocfs2/dlm/Makefile new file mode 100644 index 000000000..bd1aab1f4 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/Makefile @@ -0,0 +1,7 @@ +ccflags-y := -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o + +ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o + diff --git a/kernel/fs/ocfs2/dlm/dlmapi.h b/kernel/fs/ocfs2/dlm/dlmapi.h new file mode 100644 index 000000000..3cfa114aa --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmapi.h @@ -0,0 +1,220 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmapi.h + * + * externally exported dlm interfaces + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMAPI_H +#define DLMAPI_H + +struct dlm_lock; +struct dlm_ctxt; + +/* NOTE: changes made to this enum should be reflected in dlmdebug.c */ +enum dlm_status { + DLM_NORMAL = 0, /* 0: request in progress */ + DLM_GRANTED, /* 1: request granted */ + DLM_DENIED, /* 2: request denied */ + DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */ + DLM_WORKING, /* 4: async request in progress */ + DLM_BLOCKED, /* 5: lock request blocked */ + DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by a orphan lock*/ + DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */ + DLM_SYSERR, /* 8: system error */ + DLM_NOSUPPORT, /* 9: unsupported */ + DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */ + DLM_IVLOCKID, /* 11: bad lockid */ + DLM_SYNC, /* 12: synchronous request granted */ + DLM_BADTYPE, /* 13: bad resource type */ + DLM_BADRESOURCE, /* 14: bad resource handle */ + DLM_MAXHANDLES, /* 15: no more resource handles */ + DLM_NOCLINFO, /* 16: can't contact cluster manager */ + DLM_NOLOCKMGR, /* 17: can't contact lock manager */ + DLM_NOPURGED, /* 18: can't contact purge daemon */ + DLM_BADARGS, /* 19: bad api args */ + DLM_VOID, /* 20: no status */ + DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */ + DLM_IVBUFLEN, /* 22: invalid resource name length */ + DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */ + DLM_BADPARAM, /* 24: invalid lock mode specified */ + DLM_VALNOTVALID, /* 25: value block has been invalidated */ + DLM_REJECTED, /* 26: request rejected, unrecognized client */ + DLM_ABORT, /* 27: blocked lock request cancelled */ + DLM_CANCEL, /* 28: conversion request cancelled */ + DLM_IVRESHANDLE, /* 29: invalid resource handle */ + DLM_DEADLOCK, /* 30: deadlock recovery refused this request */ + DLM_DENIED_NOASTS, /* 31: failed to allocate AST */ + DLM_FORWARD, /* 32: request must wait for primary's response */ + DLM_TIMEOUT, /* 33: timeout value for lock has expired */ + DLM_IVGROUPID, /* 34: invalid group specification */ + DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */ + DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */ + DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient pers for device */ + DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */ + + DLM_RECOVERING, /* 39: extension, allows caller to fail a lock + request if it is being recovered */ + DLM_MIGRATING, /* 40: extension, allows caller to fail a lock + request if it is being migrated */ + DLM_MAXSTATS, /* 41: upper limit for return code validation */ +}; + +/* for pretty-printing dlm_status error messages */ +const char *dlm_errmsg(enum dlm_status err); +/* for pretty-printing dlm_status error names */ +const char *dlm_errname(enum dlm_status err); + +/* Eventually the DLM will use standard errno values, but in the + * meantime this lets us track dlm errors as they bubble up. When we + * bring its error reporting into line with the rest of the stack, + * these can just be replaced with calls to mlog_errno. */ +#define dlm_error(st) do { \ + if ((st) != DLM_RECOVERING && \ + (st) != DLM_MIGRATING && \ + (st) != DLM_FORWARD) \ + mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ +} while (0) + +#define DLM_LKSB_UNUSED1 0x01 +#define DLM_LKSB_PUT_LVB 0x02 +#define DLM_LKSB_GET_LVB 0x04 +#define DLM_LKSB_UNUSED2 0x08 +#define DLM_LKSB_UNUSED3 0x10 +#define DLM_LKSB_UNUSED4 0x20 +#define DLM_LKSB_UNUSED5 0x40 +#define DLM_LKSB_UNUSED6 0x80 + +#define DLM_LVB_LEN 64 + +/* Callers are only allowed access to the lvb and status members of + * this struct. */ +struct dlm_lockstatus { + enum dlm_status status; + u32 flags; + struct dlm_lock *lockid; + char lvb[DLM_LVB_LEN]; +}; + +/* Valid lock modes. */ +#define LKM_IVMODE (-1) /* invalid mode */ +#define LKM_NLMODE 0 /* null lock */ +#define LKM_CRMODE 1 /* concurrent read unsupported */ +#define LKM_CWMODE 2 /* concurrent write unsupported */ +#define LKM_PRMODE 3 /* protected read */ +#define LKM_PWMODE 4 /* protected write unsupported */ +#define LKM_EXMODE 5 /* exclusive */ +#define LKM_MAXMODE 5 +#define LKM_MODEMASK 0xff + +/* Flags passed to dlmlock and dlmunlock: + * reserved: flags used by the "real" dlm + * only a few are supported by this dlm + * (U) = unsupported by ocfs2 dlm */ +#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */ +#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */ +#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */ +#define LKM_LOCAL 0x00000080 /* local lock request */ +#define LKM_VALBLK 0x00000100 /* lock value block request */ +#define LKM_NOQUEUE 0x00000200 /* non blocking request */ +#define LKM_CONVERT 0x00000400 /* conversion request */ +#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */ +#define LKM_UNLOCK 0x00001000 /* deallocate this lock */ +#define LKM_CANCEL 0x00002000 /* cancel conversion request */ +#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ +#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */ +#define LKM_SYNCSTS 0x00010000 /* return synchronous status if poss (U) */ +#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */ +#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */ +#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */ +#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */ +#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */ +#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */ +#define LKM_FORCE 0x00800000 /* force unlock flag */ +#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate + lock value block (U) */ +/* unused */ +#define LKM_UNUSED1 0x00000001 /* unused */ +#define LKM_UNUSED2 0x00000002 /* unused */ +#define LKM_UNUSED3 0x00000004 /* unused */ +#define LKM_UNUSED4 0x00000008 /* unused */ +#define LKM_UNUSED5 0x02000000 /* unused */ +#define LKM_UNUSED6 0x04000000 /* unused */ +#define LKM_UNUSED7 0x08000000 /* unused */ + +/* ocfs2 extensions: internal only + * should never be used by caller */ +#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated + to another node */ +#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed + should be applied to lockres */ +#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied + from lockres when lock is granted */ +#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock + used to avoid recovery rwsem */ + + +typedef void (dlm_astlockfunc_t)(void *); +typedef void (dlm_bastlockfunc_t)(void *, int); +typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status); + +enum dlm_status dlmlock(struct dlm_ctxt *dlm, + int mode, + struct dlm_lockstatus *lksb, + int flags, + const char *name, + int namelen, + dlm_astlockfunc_t *ast, + void *data, + dlm_bastlockfunc_t *bast); + +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, + struct dlm_lockstatus *lksb, + int flags, + dlm_astunlockfunc_t *unlockast, + void *data); + +struct dlm_protocol_version { + u8 pv_major; + u8 pv_minor; +}; +struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key, + struct dlm_protocol_version *fs_proto); + +void dlm_unregister_domain(struct dlm_ctxt *dlm); + +void dlm_print_one_lock(struct dlm_lock *lockid); + +typedef void (dlm_eviction_func)(int, void *); +struct dlm_eviction_cb { + struct list_head ec_item; + dlm_eviction_func *ec_func; + void *ec_data; +}; +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data); +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb); +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb); + +#endif /* DLMAPI_H */ diff --git a/kernel/fs/ocfs2/dlm/dlmast.c b/kernel/fs/ocfs2/dlm/dlmast.c new file mode 100644 index 000000000..fd6bbbbd7 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmast.c @@ -0,0 +1,504 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmast.c + * + * AST and BAST functionality for local and remote nodes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock); +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); + +/* Should be called as an ast gets queued to see if the new + * lock level will obsolete a pending bast. + * For example, if dlm_thread queued a bast for an EX lock that + * was blocking another EX, but before sending the bast the + * lock owner downconverted to NL, the bast is now obsolete. + * Only the ast should be sent. + * This is needed because the lock and convert paths can queue + * asts out-of-band (not waiting for dlm_thread) in order to + * allow for LKM_NOQUEUE to get immediate responses. */ +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + assert_spin_locked(&dlm->ast_lock); + assert_spin_locked(&lock->spinlock); + + if (lock->ml.highest_blocked == LKM_IVMODE) + return 0; + BUG_ON(lock->ml.highest_blocked == LKM_NLMODE); + + if (lock->bast_pending && + list_empty(&lock->bast_list)) + /* old bast already sent, ok */ + return 0; + + if (lock->ml.type == LKM_EXMODE) + /* EX blocks anything left, any bast still valid */ + return 0; + else if (lock->ml.type == LKM_NLMODE) + /* NL blocks nothing, no reason to send any bast, cancel it */ + return 1; + else if (lock->ml.highest_blocked != LKM_EXMODE) + /* PR only blocks EX */ + return 1; + + return 0; +} + +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + BUG_ON(!dlm); + BUG_ON(!lock); + + res = lock->lockres; + + assert_spin_locked(&dlm->ast_lock); + + if (!list_empty(&lock->ast_list)) { + mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, " + "AST list not empty, pending %d, newlevel %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ast_pending, lock->ml.type); + BUG(); + } + if (lock->ast_pending) + mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + + /* check to see if this ast obsoletes the bast */ + if (dlm_should_cancel_bast(dlm, lock)) { + mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + lock->bast_pending = 0; + list_del_init(&lock->bast_list); + lock->ml.highest_blocked = LKM_IVMODE; + /* removing lock from list, remove a ref. guaranteed + * this won't be the last ref because of the get above, + * so res->spinlock will not be taken here */ + dlm_lock_put(lock); + /* free up the reserved bast that we are cancelling. + * guaranteed that this will not be the last reserved + * ast because *both* an ast and a bast were reserved + * to get to this point. the res->spinlock will not be + * taken here */ + dlm_lockres_release_ast(dlm, res); + } + list_add_tail(&lock->ast_list, &dlm->pending_asts); + lock->ast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_ast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + + +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + BUG_ON(!dlm); + BUG_ON(!lock); + + assert_spin_locked(&dlm->ast_lock); + + res = lock->lockres; + + BUG_ON(!list_empty(&lock->bast_list)); + if (lock->bast_pending) + mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + list_add_tail(&lock->bast_list, &dlm->pending_basts); + lock->bast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_bast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct dlm_lockstatus *lksb = lock->lksb; + BUG_ON(!lksb); + + /* only updates if this node masters the lockres */ + spin_lock(&res->spinlock); + if (res->owner == dlm->node_num) { + /* check the lksb flags for the direction */ + if (lksb->flags & DLM_LKSB_GET_LVB) { + mlog(0, "getting lvb from lockres for %s node\n", + lock->ml.node == dlm->node_num ? "master" : + "remote"); + memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); + } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. + */ + } + spin_unlock(&res->spinlock); + + /* reset any lvb flags on the lksb */ + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); +} + +void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + dlm_astlockfunc_t *fn; + struct dlm_lockstatus *lksb; + + mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + lksb = lock->lksb; + fn = lock->ast; + BUG_ON(lock->ml.node != dlm->node_num); + + dlm_update_lvb(dlm, res, lock); + (*fn)(lock->astdata); +} + + +int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + int ret; + struct dlm_lockstatus *lksb; + int lksbflags; + + mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + lksb = lock->lksb; + BUG_ON(lock->ml.node == dlm->node_num); + + lksbflags = lksb->flags; + dlm_update_lvb(dlm, res, lock); + + /* lock request came from another node + * go do the ast over there */ + ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags); + return ret; +} + +void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int blocked_type) +{ + dlm_bastlockfunc_t *fn = lock->bast; + + BUG_ON(lock->ml.node != dlm->node_num); + + mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + blocked_type); + + (*fn)(lock->astdata, blocked_type); +} + + + +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + int ret; + unsigned int locklen; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; + char *name; + struct list_head *head = NULL; + __be64 cookie; + u32 flags; + u8 node; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + name = past->name; + locklen = past->namelen; + cookie = past->cookie; + flags = be32_to_cpu(past->flags); + node = past->node_idx; + + if (locklen > DLM_LOCKID_NAME_MAX) { + ret = DLM_IVBUFLEN; + mlog(ML_ERROR, "Invalid name length (%d) in proxy ast " + "handler!\n", locklen); + goto leave; + } + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n", + flags); + ret = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); + + if (past->type != DLM_AST && + past->type != DLM_BAST) { + mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" + "name=%.*s, node=%u\n", past->type, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + ret = DLM_IVLOCKID; + goto leave; + } + + res = dlm_lookup_lockres(dlm, name, locklen); + if (!res) { + mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, " + "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + ret = DLM_IVLOCKID; + goto leave; + } + + /* cannot get a proxy ast message if this node owns it */ + BUG_ON(res->owner == dlm->node_num); + + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "Responding with DLM_RECOVERING!\n"); + ret = DLM_RECOVERING; + goto unlock_out; + } + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Responding with DLM_MIGRATING!\n"); + ret = DLM_MIGRATING; + goto unlock_out; + } + /* try convert queue for both ast/bast */ + head = &res->converting; + lock = NULL; + list_for_each_entry(lock, head, list) { + if (lock->ml.cookie == cookie) + goto do_ast; + } + + /* if not on convert, try blocked for ast, granted for bast */ + if (past->type == DLM_AST) + head = &res->blocked; + else + head = &res->granted; + + list_for_each_entry(lock, head, list) { + /* if lock is found but unlock is pending ignore the bast */ + if (lock->ml.cookie == cookie) { + if (lock->unlock_pending) + break; + goto do_ast; + } + } + + mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " + "node=%u\n", past->type == DLM_AST ? "" : "b", + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + + ret = DLM_NORMAL; +unlock_out: + spin_unlock(&res->spinlock); + goto leave; + +do_ast: + ret = DLM_NORMAL; + if (past->type == DLM_AST) { + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->granted); + mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + lock->ml.type, lock->ml.convert_type); + + if (lock->ml.convert_type != LKM_IVMODE) { + lock->ml.type = lock->ml.convert_type; + lock->ml.convert_type = LKM_IVMODE; + } else { + // should already be there.... + } + + lock->lksb->status = DLM_NORMAL; + + /* if we requested the lvb, fetch it into our lksb now */ + if (flags & LKM_GET_LVB) { + BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB)); + memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN); + } + } + spin_unlock(&res->spinlock); + + if (past->type == DLM_AST) + dlm_do_local_ast(dlm, res, lock); + else + dlm_do_local_bast(dlm, res, lock, past->blocked_type); + +leave: + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + return ret; +} + + + +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int msg_type, + int blocked_type, int flags) +{ + int ret = 0; + struct dlm_proxy_ast past; + struct kvec vec[2]; + size_t veclen = 1; + int status; + + mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name, + res->lockname.len, res->lockname.name, lock->ml.node, msg_type, + blocked_type); + + memset(&past, 0, sizeof(struct dlm_proxy_ast)); + past.node_idx = dlm->node_num; + past.type = msg_type; + past.blocked_type = blocked_type; + past.namelen = res->lockname.len; + memcpy(past.name, res->lockname.name, past.namelen); + past.cookie = lock->ml.cookie; + + vec[0].iov_len = sizeof(struct dlm_proxy_ast); + vec[0].iov_base = &past; + if (flags & DLM_LKSB_GET_LVB) { + be32_add_cpu(&past.flags, LKM_GET_LVB); + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, + lock->ml.node, &status); + if (ret < 0) + mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n", + dlm->name, res->lockname.len, res->lockname.name, ret, + lock->ml.node); + else { + if (status == DLM_RECOVERING) { + mlog(ML_ERROR, "sent AST to node %u, it thinks this " + "node is dead!\n", lock->ml.node); + BUG(); + } else if (status == DLM_MIGRATING) { + mlog(ML_ERROR, "sent AST to node %u, it returned " + "DLM_MIGRATING!\n", lock->ml.node); + BUG(); + } else if (status != DLM_NORMAL && status != DLM_IVLOCKID) { + mlog(ML_ERROR, "AST to node %u returned %d!\n", + lock->ml.node, status); + /* ignore it */ + } + ret = 0; + } + return ret; +} diff --git a/kernel/fs/ocfs2/dlm/dlmcommon.h b/kernel/fs/ocfs2/dlm/dlmcommon.h new file mode 100644 index 000000000..fae17c640 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmcommon.h @@ -0,0 +1,1150 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmcommon.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMCOMMON_H +#define DLMCOMMON_H + +#include + +#define DLM_HB_NODE_DOWN_PRI (0xf000000) +#define DLM_HB_NODE_UP_PRI (0x8000000) + +#define DLM_LOCKID_NAME_MAX 32 + +#define DLM_DOMAIN_NAME_MAX_LEN 255 +#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES +#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes +#define DLM_THREAD_MS 200 // flush at least every 200 ms + +#define DLM_HASH_SIZE_DEFAULT (1 << 17) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) + +enum dlm_mle_type { + DLM_MLE_BLOCK = 0, + DLM_MLE_MASTER = 1, + DLM_MLE_MIGRATION = 2, + DLM_MLE_NUM_TYPES = 3, +}; + +struct dlm_master_list_entry { + struct hlist_node master_hash_node; + struct list_head hb_events; + struct dlm_ctxt *dlm; + spinlock_t spinlock; + wait_queue_head_t wq; + atomic_t woken; + struct kref mle_refs; + int inuse; + unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + u8 master; + u8 new_master; + enum dlm_mle_type type; + struct o2hb_callback_func mle_hb_up; + struct o2hb_callback_func mle_hb_down; + struct dlm_lock_resource *mleres; + unsigned char mname[DLM_LOCKID_NAME_MAX]; + unsigned int mnamelen; + unsigned int mnamehash; +}; + +enum dlm_ast_type { + DLM_AST = 0, + DLM_BAST = 1, + DLM_ASTUNLOCK = 2, +}; + + +#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \ + LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \ + LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE) + +#define DLM_RECOVERY_LOCK_NAME "$RECOVERY" +#define DLM_RECOVERY_LOCK_NAME_LEN 9 + +static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) +{ + if (name_len == DLM_RECOVERY_LOCK_NAME_LEN && + memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0) + return 1; + return 0; +} + +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 + +struct dlm_recovery_ctxt +{ + struct list_head resources; + struct list_head node_data; + u8 new_master; + u8 dead_node; + u16 state; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + wait_queue_head_t event; +}; + +enum dlm_ctxt_state { + DLM_CTXT_NEW = 0, + DLM_CTXT_JOINED = 1, + DLM_CTXT_IN_SHUTDOWN = 2, + DLM_CTXT_LEAVING = 3, +}; + +struct dlm_ctxt +{ + struct list_head list; + struct hlist_head **lockres_hash; + struct list_head dirty_list; + struct list_head purge_list; + struct list_head pending_asts; + struct list_head pending_basts; + struct list_head tracking_list; + unsigned int purge_count; + spinlock_t spinlock; + spinlock_t ast_lock; + spinlock_t track_lock; + char *name; + u8 node_num; + u32 key; + u8 joining_node; + wait_queue_head_t dlm_join_events; + unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct dlm_recovery_ctxt reco; + spinlock_t master_lock; + struct hlist_head **master_hash; + struct list_head mle_hb_events; + + /* these give a really vague idea of the system load */ + atomic_t mle_tot_count[DLM_MLE_NUM_TYPES]; + atomic_t mle_cur_count[DLM_MLE_NUM_TYPES]; + atomic_t res_tot_count; + atomic_t res_cur_count; + + struct dlm_debug_ctxt *dlm_debug_ctxt; + struct dentry *dlm_debugfs_subroot; + + /* NOTE: Next three are protected by dlm_domain_lock */ + struct kref dlm_refs; + enum dlm_ctxt_state dlm_state; + unsigned int num_joins; + + struct o2hb_callback_func dlm_hb_up; + struct o2hb_callback_func dlm_hb_down; + struct task_struct *dlm_thread_task; + struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; + wait_queue_head_t dlm_thread_wq; + wait_queue_head_t dlm_reco_thread_wq; + wait_queue_head_t ast_wq; + wait_queue_head_t migration_wq; + + struct work_struct dispatched_work; + struct list_head work_list; + spinlock_t work_lock; + struct list_head dlm_domain_handlers; + struct list_head dlm_eviction_callbacks; + + /* The filesystem specifies this at domain registration. We + * cache it here to know what to tell other nodes. */ + struct dlm_protocol_version fs_locking_proto; + /* This is the inter-dlm communication version */ + struct dlm_protocol_version dlm_locking_proto; +}; + +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + +static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm, + unsigned i) +{ + return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + + (i % DLM_BUCKETS_PER_PAGE); +} + +/* these keventd work queue items are for less-frequently + * called functions that cannot be directly called from the + * net message handlers for some reason, usually because + * they need to send net messages of their own. */ +void dlm_dispatch_work(struct work_struct *work); + +struct dlm_lock_resource; +struct dlm_work_item; + +typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *); + +struct dlm_request_all_locks_priv +{ + u8 reco_master; + u8 dead_node; +}; + +struct dlm_mig_lockres_priv +{ + struct dlm_lock_resource *lockres; + u8 real_master; + u8 extra_ref; +}; + +struct dlm_assert_master_priv +{ + struct dlm_lock_resource *lockres; + u8 request_from; + u32 flags; + unsigned ignore_higher:1; +}; + +struct dlm_deref_lockres_priv +{ + struct dlm_lock_resource *deref_res; + u8 deref_node; +}; + +struct dlm_work_item +{ + struct list_head list; + dlm_workfunc_t *func; + struct dlm_ctxt *dlm; + void *data; + union { + struct dlm_request_all_locks_priv ral; + struct dlm_mig_lockres_priv ml; + struct dlm_assert_master_priv am; + struct dlm_deref_lockres_priv dl; + } u; +}; + +static inline void dlm_init_work_item(struct dlm_ctxt *dlm, + struct dlm_work_item *i, + dlm_workfunc_t *f, void *data) +{ + memset(i, 0, sizeof(*i)); + i->func = f; + INIT_LIST_HEAD(&i->list); + i->data = data; + i->dlm = dlm; /* must have already done a dlm_grab on this! */ +} + + + +static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, + u8 node) +{ + assert_spin_locked(&dlm->spinlock); + + dlm->joining_node = node; + wake_up(&dlm->dlm_join_events); +} + +#define DLM_LOCK_RES_UNINITED 0x00000001 +#define DLM_LOCK_RES_RECOVERING 0x00000002 +#define DLM_LOCK_RES_READY 0x00000004 +#define DLM_LOCK_RES_DIRTY 0x00000008 +#define DLM_LOCK_RES_IN_PROGRESS 0x00000010 +#define DLM_LOCK_RES_MIGRATING 0x00000020 +#define DLM_LOCK_RES_DROPPING_REF 0x00000040 +#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 +#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 + +/* max milliseconds to wait to sync up a network failure with a node death */ +#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) + +#define DLM_PURGE_INTERVAL_MS (8 * 1000) + +struct dlm_lock_resource +{ + /* WARNING: Please see the comment in dlm_init_lockres before + * adding fields here. */ + struct hlist_node hash_node; + struct qstr lockname; + struct kref refs; + + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ + struct list_head granted; + struct list_head converting; + struct list_head blocked; + struct list_head purge; + + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ + struct list_head dirty; + struct list_head recovering; // dlm_recovery_ctxt.resources list + + /* Added during init and removed during release */ + struct list_head tracking; /* dlm->tracking_list */ + + /* unused lock resources have their last_used stamped and are + * put on a list for the dlm thread to run. */ + unsigned long last_used; + + struct dlm_ctxt *dlm; + + unsigned migration_pending:1; + atomic_t asts_reserved; + spinlock_t spinlock; + wait_queue_head_t wq; + u8 owner; //node which owns the lock resource, or unknown + u16 state; + char lvb[DLM_LVB_LEN]; + unsigned int inflight_locks; + unsigned int inflight_assert_workers; + unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +struct dlm_migratable_lock +{ + __be64 cookie; + + /* these 3 are just padding for the in-memory structure, but + * list and flags are actually used when sent over the wire */ + __be16 pad1; + u8 list; // 0=granted, 1=converting, 2=blocked + u8 flags; + + s8 type; + s8 convert_type; + s8 highest_blocked; + u8 node; +}; // 16 bytes + +struct dlm_lock +{ + struct dlm_migratable_lock ml; + + struct list_head list; + struct list_head ast_list; + struct list_head bast_list; + struct dlm_lock_resource *lockres; + spinlock_t spinlock; + struct kref lock_refs; + + // ast and bast must be callable while holding a spinlock! + dlm_astlockfunc_t *ast; + dlm_bastlockfunc_t *bast; + void *astdata; + struct dlm_lockstatus *lksb; + unsigned ast_pending:1, + bast_pending:1, + convert_pending:1, + lock_pending:1, + cancel_pending:1, + unlock_pending:1, + lksb_kernel_allocated:1; +}; + + +#define DLM_LKSB_UNUSED1 0x01 +#define DLM_LKSB_PUT_LVB 0x02 +#define DLM_LKSB_GET_LVB 0x04 +#define DLM_LKSB_UNUSED2 0x08 +#define DLM_LKSB_UNUSED3 0x10 +#define DLM_LKSB_UNUSED4 0x20 +#define DLM_LKSB_UNUSED5 0x40 +#define DLM_LKSB_UNUSED6 0x80 + + +enum dlm_lockres_list { + DLM_GRANTED_LIST = 0, + DLM_CONVERTING_LIST = 1, + DLM_BLOCKED_LIST = 2, +}; + +static inline int dlm_lvb_is_empty(char *lvb) +{ + int i; + for (i=0; igranted; + else if (idx == DLM_CONVERTING_LIST) + ret = &res->converting; + else if (idx == DLM_BLOCKED_LIST) + ret = &res->blocked; + else + BUG(); + return ret; +} + + + + +struct dlm_node_iter +{ + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int curnode; +}; + + +enum { + DLM_MASTER_REQUEST_MSG = 500, + DLM_UNUSED_MSG1 = 501, + DLM_ASSERT_MASTER_MSG = 502, + DLM_CREATE_LOCK_MSG = 503, + DLM_CONVERT_LOCK_MSG = 504, + DLM_PROXY_AST_MSG = 505, + DLM_UNLOCK_LOCK_MSG = 506, + DLM_DEREF_LOCKRES_MSG = 507, + DLM_MIGRATE_REQUEST_MSG = 508, + DLM_MIG_LOCKRES_MSG = 509, + DLM_QUERY_JOIN_MSG = 510, + DLM_ASSERT_JOINED_MSG = 511, + DLM_CANCEL_JOIN_MSG = 512, + DLM_EXIT_DOMAIN_MSG = 513, + DLM_MASTER_REQUERY_MSG = 514, + DLM_LOCK_REQUEST_MSG = 515, + DLM_RECO_DATA_DONE_MSG = 516, + DLM_BEGIN_RECO_MSG = 517, + DLM_FINALIZE_RECO_MSG = 518, + DLM_QUERY_REGION = 519, + DLM_QUERY_NODEINFO = 520, + DLM_BEGIN_EXIT_DOMAIN_MSG = 521, +}; + +struct dlm_reco_node_data +{ + int state; + u8 node_num; + struct list_head list; +}; + +enum { + DLM_RECO_NODE_DATA_DEAD = -1, + DLM_RECO_NODE_DATA_INIT = 0, + DLM_RECO_NODE_DATA_REQUESTING = 1, + DLM_RECO_NODE_DATA_REQUESTED = 2, + DLM_RECO_NODE_DATA_RECEIVING = 3, + DLM_RECO_NODE_DATA_DONE = 4, + DLM_RECO_NODE_DATA_FINALIZE_SENT = 5, +}; + + +enum { + DLM_MASTER_RESP_NO = 0, + DLM_MASTER_RESP_YES = 1, + DLM_MASTER_RESP_MAYBE = 2, + DLM_MASTER_RESP_ERROR = 3, +}; + + +struct dlm_master_request +{ + u8 node_idx; + u8 namelen; + __be16 pad1; + __be32 flags; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001 +#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002 + +#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 +#define DLM_ASSERT_MASTER_REQUERY 0x00000002 +#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 +struct dlm_assert_master +{ + u8 node_idx; + u8 namelen; + __be16 pad1; + __be32 flags; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001 + +struct dlm_migrate_request +{ + u8 master; + u8 new_master; + u8 namelen; + u8 pad1; + __be32 pad2; + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_master_requery +{ + u8 pad1; + u8 pad2; + u8 node_idx; + u8 namelen; + __be32 pad3; + u8 name[O2NM_MAX_NAME_LEN]; +}; + +#define DLM_MRES_RECOVERY 0x01 +#define DLM_MRES_MIGRATION 0x02 +#define DLM_MRES_ALL_DONE 0x04 + +/* + * We would like to get one whole lockres into a single network + * message whenever possible. Generally speaking, there will be + * at most one dlm_lock on a lockres for each node in the cluster, + * plus (infrequently) any additional locks coming in from userdlm. + * + * struct _dlm_lockres_page + * { + * dlm_migratable_lockres mres; + * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS]; + * u8 pad[DLM_MIG_LOCKRES_RESERVED]; + * }; + * + * from ../cluster/tcp.h + * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * (roughly 4080 bytes) + * and sizeof(dlm_migratable_lockres) = 112 bytes + * and sizeof(dlm_migratable_lock) = 16 bytes + * + * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and + * DLM_MIG_LOCKRES_RESERVED=128 means we have this: + * + * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) + + * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED = + * NET_MAX_PAYLOAD_BYTES + * (240 * 16) + 112 + 128 = 4080 + * + * So a lockres would need more than 240 locks before it would + * use more than one network packet to recover. Not too bad. + */ +#define DLM_MAX_MIGRATABLE_LOCKS 240 + +struct dlm_migratable_lockres +{ + u8 master; + u8 lockname_len; + u8 num_locks; // locks sent in this structure + u8 flags; + __be32 total_locks; // locks to be sent for this migration cookie + __be64 mig_cookie; // cookie for this lockres migration + // or zero if not needed + // 16 bytes + u8 lockname[DLM_LOCKID_NAME_MAX]; + // 48 bytes + u8 lvb[DLM_LVB_LEN]; + // 112 bytes + struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 +}; +#define DLM_MIG_LOCKRES_MAX_LEN \ + (sizeof(struct dlm_migratable_lockres) + \ + (sizeof(struct dlm_migratable_lock) * \ + DLM_MAX_MIGRATABLE_LOCKS) ) + +/* from above, 128 bytes + * for some undetermined future use */ +#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ + DLM_MIG_LOCKRES_MAX_LEN) + +struct dlm_create_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_convert_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN) + +struct dlm_unlock_lock +{ + __be64 cookie; + + __be32 flags; + __be16 pad1; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) + +struct dlm_proxy_ast +{ + __be64 cookie; + + __be32 flags; + u8 node_idx; + u8 type; + u8 blocked_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) + +#define DLM_MOD_KEY (0x666c6172) +enum dlm_query_join_response_code { + JOIN_DISALLOW = 0, + JOIN_OK = 1, + JOIN_OK_NO_MAP = 2, + JOIN_PROTOCOL_MISMATCH = 3, +}; + +struct dlm_query_join_packet { + u8 code; /* Response code. dlm_minor and fs_minor + are only valid if this is JOIN_OK */ + u8 dlm_minor; /* The minor version of the protocol the + dlm is speaking. */ + u8 fs_minor; /* The minor version of the protocol the + filesystem is speaking. */ + u8 reserved; +}; + +union dlm_query_join_response { + __be32 intval; + struct dlm_query_join_packet packet; +}; + +struct dlm_lock_request +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + +struct dlm_reco_data_done +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; + + /* unused for now */ + /* eventually we can use this to attempt + * lvb recovery based on each node's info */ + u8 reco_lvb[DLM_LVB_LEN]; +}; + +struct dlm_begin_reco +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + + +#define BITS_PER_BYTE 8 +#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) + +struct dlm_query_join_request +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + struct dlm_protocol_version dlm_proto; + struct dlm_protocol_version fs_proto; + u8 domain[O2NM_MAX_NAME_LEN]; + u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)]; +}; + +struct dlm_assert_joined +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_cancel_join +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_query_region { + u8 qr_node; + u8 qr_numregions; + u8 qr_namelen; + u8 pad1; + u8 qr_domain[O2NM_MAX_NAME_LEN]; + u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS]; +}; + +struct dlm_node_info { + u8 ni_nodenum; + u8 pad1; + __be16 ni_ipv4_port; + __be32 ni_ipv4_address; +}; + +struct dlm_query_nodeinfo { + u8 qn_nodenum; + u8 qn_numnodes; + u8 qn_namelen; + u8 pad1; + u8 qn_domain[O2NM_MAX_NAME_LEN]; + struct dlm_node_info qn_nodes[O2NM_MAX_NODES]; +}; + +struct dlm_exit_domain +{ + u8 node_idx; + u8 pad1[3]; +}; + +struct dlm_finalize_reco +{ + u8 node_idx; + u8 dead_node; + u8 flags; + u8 pad1; + __be32 pad2; +}; + +struct dlm_deref_lockres +{ + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +static inline enum dlm_status +__dlm_lockres_state_to_status(struct dlm_lock_resource *res) +{ + enum dlm_status status = DLM_NORMAL; + + assert_spin_locked(&res->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) + status = DLM_RECOVERING; + else if (res->state & DLM_LOCK_RES_MIGRATING) + status = DLM_MIGRATING; + else if (res->state & DLM_LOCK_RES_IN_PROGRESS) + status = DLM_FORWARD; + + return status; +} + +static inline u8 dlm_get_lock_cookie_node(u64 cookie) +{ + u8 ret; + cookie >>= 56; + ret = (u8)(cookie & 0xffULL); + return ret; +} + +static inline unsigned long long dlm_get_lock_cookie_seq(u64 cookie) +{ + unsigned long long ret; + ret = ((unsigned long long)cookie) & 0x00ffffffffffffffULL; + return ret; +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb); +void dlm_lock_get(struct dlm_lock *lock); +void dlm_lock_put(struct dlm_lock *lock); + +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res); + +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_commit_pending_unlock(struct dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_launch_thread(struct dlm_ctxt *dlm); +void dlm_complete_thread(struct dlm_ctxt *dlm); +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); +void dlm_wait_for_recovery(struct dlm_ctxt *dlm); +void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); +void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); + +void dlm_put(struct dlm_ctxt *dlm); +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); +int dlm_domain_fully_joined(struct dlm_ctxt *dlm); + +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static inline void dlm_lockres_get(struct dlm_lock_resource *res) +{ + /* This is called on every lookup, so it might be worth + * inlining. */ + kref_get(&res->refs); +} +void dlm_lockres_put(struct dlm_lock_resource *res); +void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len); + +int dlm_is_host_down(int errno); + +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int namelen, + int flags); +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen); + +void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit); +void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit); + +void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_do_local_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +int dlm_do_remote_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_do_local_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type); +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int msg_type, + int blocked_type, int flags); +static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST, + blocked_type, 0); +} + +static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int flags) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST, + 0, flags); +} + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res); +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); + +u8 dlm_nm_this_node(struct dlm_ctxt *dlm); +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); + + +int dlm_nm_init(struct dlm_ctxt *dlm); +int dlm_heartbeat_init(struct dlm_ctxt *dlm); +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); + +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +int dlm_finish_migration(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 old_master); +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); + +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +void dlm_assert_master_post_handler(int status, void *data, void *ret_data); +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master); + + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, + u8 request_from, + u32 flags); + + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, + u8 flags); +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + +/* will exit holding res->spinlock, but may drop in function */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); +void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); + +/* will exit holding res->spinlock, but may drop in function */ +static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) +{ + __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)); +} + +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); + +/* create/destroy slab caches */ +int dlm_init_master_caches(void); +void dlm_destroy_master_caches(void); + +int dlm_init_lock_cache(void); +void dlm_destroy_lock_cache(void); + +int dlm_init_mle_cache(void); +void dlm_destroy_mle_cache(void); + +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_clean_master_list(struct dlm_ctxt *dlm, + u8 dead_node); +void dlm_force_free_mles(struct dlm_ctxt *dlm); +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_has_locks(struct dlm_lock_resource *res); +int __dlm_lockres_unused(struct dlm_lock_resource *res); + +static inline const char * dlm_lock_mode_name(int mode) +{ + switch (mode) { + case LKM_EXMODE: + return "EX"; + case LKM_PRMODE: + return "PR"; + case LKM_NLMODE: + return "NL"; + } + return "UNKNOWN"; +} + + +static inline int dlm_lock_compatible(int existing, int request) +{ + /* NO_LOCK compatible with all */ + if (request == LKM_NLMODE || + existing == LKM_NLMODE) + return 1; + + /* EX incompatible with all non-NO_LOCK */ + if (request == LKM_EXMODE) + return 0; + + /* request must be PR, which is compatible with PR */ + if (existing == LKM_PRMODE) + return 1; + + return 0; +} + +static inline int dlm_lock_on_list(struct list_head *head, + struct dlm_lock *lock) +{ + struct dlm_lock *tmplock; + + list_for_each_entry(tmplock, head, list) { + if (tmplock == lock) + return 1; + } + return 0; +} + + +static inline enum dlm_status dlm_err_to_dlm_status(int err) +{ + enum dlm_status ret; + if (err == -ENOMEM) + ret = DLM_SYSERR; + else if (err == -ETIMEDOUT || o2net_link_down(err, NULL)) + ret = DLM_NOLOCKMGR; + else if (err == -EINVAL) + ret = DLM_BADPARAM; + else if (err == -ENAMETOOLONG) + ret = DLM_IVBUFLEN; + else + ret = DLM_BADARGS; + return ret; +} + + +static inline void dlm_node_iter_init(unsigned long *map, + struct dlm_node_iter *iter) +{ + memcpy(iter->node_map, map, sizeof(iter->node_map)); + iter->curnode = -1; +} + +static inline int dlm_node_iter_next(struct dlm_node_iter *iter) +{ + int bit; + bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + iter->curnode = bit; + return bit; +} + +static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + res->owner = owner; +} + +static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + if (owner != res->owner) + dlm_set_lockres_owner(dlm, res, owner); +} + +#endif /* DLMCOMMON_H */ diff --git a/kernel/fs/ocfs2/dlm/dlmconvert.c b/kernel/fs/ocfs2/dlm/dlmconvert.c new file mode 100644 index 000000000..e36d63ff1 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmconvert.c @@ -0,0 +1,544 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmconvert.c + * + * underlying calls for lock conversion + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +/* NOTE: __dlmconvert_master is the only function in here that + * needs a spinlock held on entry (res->spinlock) and it is the + * only one that holds a lock on exit (res->spinlock). + * All other functions in here need no locks and drop all of + * the locks that they acquire. */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread); +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +/* + * this is only called directly by dlmlock(), and only when the + * local node is the owner of the lockres + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: see __dlmconvert_master + */ +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status; + + spin_lock(&res->spinlock); + /* we are not in a network handler, this is fine */ + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + status = __dlmconvert_master(dlm, res, lock, flags, type, + &call_ast, &kick_thread); + + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + if (status != DLM_NORMAL && status != DLM_NOTQUEUED) + dlm_error(status); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +/* performs lock conversion at the lockres master site + * locking: + * caller needs: res->spinlock + * taken: takes and drops lock->spinlock + * held on exit: res->spinlock + * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED + * call_ast: whether ast should be called for this lock + * kick_thread: whether dlm_kick_thread should be called + */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread) +{ + enum dlm_status status = DLM_NORMAL; + struct dlm_lock *tmplock=NULL; + + assert_spin_locked(&res->spinlock); + + mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n", + lock->ml.type, lock->ml.convert_type, type); + + spin_lock(&lock->spinlock); + + /* already converting? */ + if (lock->ml.convert_type != LKM_IVMODE) { + mlog(ML_ERROR, "attempted to convert a lock with a lock " + "conversion pending\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + /* must be on grant queue to convert */ + if (!dlm_lock_on_list(&res->granted, lock)) { + mlog(ML_ERROR, "attempted to convert a lock not on grant " + "queue\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + if (flags & LKM_VALBLK) { + switch (lock->ml.type) { + case LKM_EXMODE: + /* EX + LKM_VALBLK + convert == set lvb */ + mlog(0, "will set lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + break; + case LKM_PRMODE: + case LKM_NLMODE: + /* refetch if new level is not NL */ + if (type > LKM_NLMODE) { + mlog(0, "will fetch new value into " + "lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } else { + mlog(0, "will NOT fetch new value " + "into lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + flags &= ~(LKM_VALBLK); + } + break; + } + } + + + /* in-place downconvert? */ + if (type <= lock->ml.type) + goto grant; + + /* upconvert from here on */ + status = DLM_NORMAL; + list_for_each_entry(tmplock, &res->granted, list) { + if (tmplock == lock) + continue; + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + } + + list_for_each_entry(tmplock, &res->converting, list) { + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + /* existing conversion requests take precedence */ + if (!dlm_lock_compatible(tmplock->ml.convert_type, type)) + goto switch_queues; + } + + /* fall thru to grant */ + +grant: + mlog(0, "res %.*s, granting %s lock\n", res->lockname.len, + res->lockname.name, dlm_lock_mode_name(type)); + /* immediately grant the new lock type */ + lock->lksb->status = DLM_NORMAL; + if (lock->ml.node == dlm->node_num) + mlog(0, "doing in-place convert for nonlocal lock\n"); + lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + + status = DLM_NORMAL; + *call_ast = 1; + goto unlock_exit; + +switch_queues: + if (flags & LKM_NOQUEUE) { + mlog(0, "failed to convert NOQUEUE lock %.*s from " + "%d to %d...\n", res->lockname.len, res->lockname.name, + lock->ml.type, type); + status = DLM_NOTQUEUED; + goto unlock_exit; + } + mlog(0, "res %.*s, queueing...\n", res->lockname.len, + res->lockname.name); + + lock->ml.convert_type = type; + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->converting); + +unlock_exit: + spin_unlock(&lock->spinlock); + if (status == DLM_DENIED) { + __dlm_print_one_lock_resource(res); + } + if (status == DLM_NORMAL) + *kick_thread = 1; + return status; +} + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; + lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); +} + +/* messages the master site to do lock conversion + * locking: + * caller needs: none + * taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS + * held on exit: none + * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node + */ +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + enum dlm_status status; + + mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, + lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "bailing out early since res is RECOVERING " + "on secondary queue\n"); + /* __dlm_print_one_lock_resource(res); */ + status = DLM_RECOVERING; + goto bail; + } + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + + if (lock->ml.convert_type != LKM_IVMODE) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "converting a remote lock that is already " + "converting! (cookie=%u:%llu, conv=%d)\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.convert_type); + status = DLM_DENIED; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; + /* move lock to local convert queue */ + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->converting); + lock->convert_pending = 1; + lock->ml.convert_type = type; + + if (flags & LKM_VALBLK) { + if (lock->ml.type == LKM_EXMODE) { + flags |= LKM_PUT_LVB; + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + } else { + if (lock->ml.convert_type == LKM_NLMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + } + spin_unlock(&res->spinlock); + + /* no locks held here. + * need to wait for a reply as to whether it got queued or not. */ + status = dlm_send_remote_convert_request(dlm, res, lock, flags, type); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->convert_pending = 0; + /* if it failed, move it back to granted queue */ + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + dlm_revert_pending_convert(res, lock); + } +bail: + spin_unlock(&res->spinlock); + + /* TODO: should this be a wake_one? */ + /* wake up any IN_PROGRESS waiters */ + wake_up(&res->wq); + + return status; +} + +/* sends DLM_CONVERT_LOCK_MSG to master site + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, status from remote node + */ +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + struct dlm_convert_lock convert; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog(0, "%.*s\n", res->lockname.len, res->lockname.name); + + memset(&convert, 0, sizeof(struct dlm_convert_lock)); + convert.node_idx = dlm->node_num; + convert.requested_type = type; + convert.cookie = lock->ml.cookie; + convert.namelen = res->lockname.len; + convert.flags = cpu_to_be32(flags); + memcpy(convert.name, res->lockname.name, convert.namelen); + + vec[0].iov_len = sizeof(struct dlm_convert_lock); + vec[0].iov_base = &convert; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key, + vec, veclen, res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + if (ret == DLM_RECOVERING) { + mlog(0, "node %u returned DLM_RECOVERING from convert " + "message!\n", res->owner); + } else if (ret == DLM_MIGRATING) { + mlog(0, "node %u returned DLM_MIGRATING from convert " + "message!\n", res->owner); + } else if (ret == DLM_FORWARD) { + mlog(0, "node %u returned DLM_FORWARD from convert " + "message!\n", res->owner); + } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) + dlm_error(ret); + } else { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key, + res->owner); + if (dlm_is_host_down(tmpret)) { + /* instead of logging the same network error over + * and over, sleep here and wait for the heartbeat + * to notice the node is dead. times out after 5s. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from convert message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +/* handler for DLM_CONVERT_LOCK_MSG on master site + * locking: + * caller needs: none + * taken: takes and drop res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, + * status from __dlmconvert_master + */ +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + struct dlm_lock *tmp_lock; + struct dlm_lockstatus *lksb; + enum dlm_status status = DLM_NORMAL; + u32 flags; + int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + if (cnv->namelen > DLM_LOCKID_NAME_MAX) { + status = DLM_IVBUFLEN; + dlm_error(status); + goto leave; + } + + flags = be32_to_cpu(cnv->flags); + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + status = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } + list_for_each_entry(tmp_lock, &res->granted, list) { + if (tmp_lock->ml.cookie == cnv->cookie && + tmp_lock->ml.node == cnv->node_idx) { + lock = tmp_lock; + dlm_lock_get(lock); + break; + } + } + spin_unlock(&res->spinlock); + if (!lock) { + status = DLM_IVLOCKID; + mlog(ML_ERROR, "did not find lock to convert on grant queue! " + "cookie=%u:%llu\n", + dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie))); + dlm_print_one_lock_resource(res); + goto leave; + } + + /* found the lock */ + lksb = lock->lksb; + + /* see if caller needed to get/put lvb */ + if (flags & LKM_PUT_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN); + } else if (flags & LKM_GET_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_GET_LVB; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status == DLM_NORMAL) { + __dlm_lockres_reserve_ast(res); + ast_reserved = 1; + res->state |= DLM_LOCK_RES_IN_PROGRESS; + status = __dlmconvert_master(dlm, res, lock, flags, + cnv->requested_type, + &call_ast, &kick_thread); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + wake = 1; + } + spin_unlock(&res->spinlock); + if (wake) + wake_up(&res->wq); + + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); + } + +leave: + if (lock) + dlm_lock_put(lock); + + /* either queue the ast or release it, if reserved */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else if (ast_reserved) + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} diff --git a/kernel/fs/ocfs2/dlm/dlmconvert.h b/kernel/fs/ocfs2/dlm/dlmconvert.h new file mode 100644 index 000000000..b2e3677df --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmconvert.h @@ -0,0 +1,35 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmconvert.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMCONVERT_H +#define DLMCONVERT_H + +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +#endif diff --git a/kernel/fs/ocfs2/dlm/dlmdebug.c b/kernel/fs/ocfs2/dlm/dlmdebug.c new file mode 100644 index 000000000..825136070 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmdebug.c @@ -0,0 +1,1000 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.c + * + * debug functionality for the dlm + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static int stringify_lockname(const char *lockname, int locklen, char *buf, + int len); + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); +} + +static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) +{ + int bit; + assert_spin_locked(&res->spinlock); + + printk(" refmap nodes: [ "); + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + printk("%u ", bit); + bit++; + } + printk("], inflight=%u\n", res->inflight_locks); +} + +static void __dlm_print_lock(struct dlm_lock *lock) +{ + spin_lock(&lock->spinlock); + + printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, " + "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), " + "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n", + lock->ml.type, lock->ml.convert_type, lock->ml.node, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + atomic_read(&lock->lock_refs.refcount), + (list_empty(&lock->ast_list) ? 'y' : 'n'), + (lock->ast_pending ? 'y' : 'n'), + (list_empty(&lock->bast_list) ? 'y' : 'n'), + (lock->bast_pending ? 'y' : 'n'), + (lock->convert_pending ? 'y' : 'n'), + (lock->lock_pending ? 'y' : 'n'), + (lock->cancel_pending ? 'y' : 'n'), + (lock->unlock_pending ? 'y' : 'n')); + + spin_unlock(&lock->spinlock); +} + +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + struct dlm_lock *lock; + char buf[DLM_LOCKID_NAME_MAX]; + + assert_spin_locked(&res->spinlock); + + stringify_lockname(res->lockname.name, res->lockname.len, + buf, sizeof(buf)); + printk("lockres: %s, owner=%u, state=%u\n", + buf, res->owner, res->state); + printk(" last used: %lu, refcnt: %u, on purge list: %s\n", + res->last_used, atomic_read(&res->refs.refcount), + list_empty(&res->purge) ? "no" : "yes"); + printk(" on dirty list: %s, on reco list: %s, " + "migrating pending: %s\n", + list_empty(&res->dirty) ? "no" : "yes", + list_empty(&res->recovering) ? "no" : "yes", + res->migration_pending ? "yes" : "no"); + printk(" inflight locks: %d, asts reserved: %d\n", + res->inflight_locks, atomic_read(&res->asts_reserved)); + dlm_print_lockres_refmap(res); + printk(" granted queue:\n"); + list_for_each_entry(lock, &res->granted, list) { + __dlm_print_lock(lock); + } + printk(" converting queue:\n"); + list_for_each_entry(lock, &res->converting, list) { + __dlm_print_lock(lock); + } + printk(" blocked queue:\n"); + list_for_each_entry(lock, &res->blocked, list) { + __dlm_print_lock(lock); + } +} + +void dlm_print_one_lock(struct dlm_lock *lockid) +{ + dlm_print_one_lock_resource(lockid->lockres); +} +EXPORT_SYMBOL_GPL(dlm_print_one_lock); + +static const char *dlm_errnames[] = { + [DLM_NORMAL] = "DLM_NORMAL", + [DLM_GRANTED] = "DLM_GRANTED", + [DLM_DENIED] = "DLM_DENIED", + [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS", + [DLM_WORKING] = "DLM_WORKING", + [DLM_BLOCKED] = "DLM_BLOCKED", + [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN", + [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD", + [DLM_SYSERR] = "DLM_SYSERR", + [DLM_NOSUPPORT] = "DLM_NOSUPPORT", + [DLM_CANCELGRANT] = "DLM_CANCELGRANT", + [DLM_IVLOCKID] = "DLM_IVLOCKID", + [DLM_SYNC] = "DLM_SYNC", + [DLM_BADTYPE] = "DLM_BADTYPE", + [DLM_BADRESOURCE] = "DLM_BADRESOURCE", + [DLM_MAXHANDLES] = "DLM_MAXHANDLES", + [DLM_NOCLINFO] = "DLM_NOCLINFO", + [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR", + [DLM_NOPURGED] = "DLM_NOPURGED", + [DLM_BADARGS] = "DLM_BADARGS", + [DLM_VOID] = "DLM_VOID", + [DLM_NOTQUEUED] = "DLM_NOTQUEUED", + [DLM_IVBUFLEN] = "DLM_IVBUFLEN", + [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT", + [DLM_BADPARAM] = "DLM_BADPARAM", + [DLM_VALNOTVALID] = "DLM_VALNOTVALID", + [DLM_REJECTED] = "DLM_REJECTED", + [DLM_ABORT] = "DLM_ABORT", + [DLM_CANCEL] = "DLM_CANCEL", + [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE", + [DLM_DEADLOCK] = "DLM_DEADLOCK", + [DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS", + [DLM_FORWARD] = "DLM_FORWARD", + [DLM_TIMEOUT] = "DLM_TIMEOUT", + [DLM_IVGROUPID] = "DLM_IVGROUPID", + [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT", + [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH", + [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION", + [DLM_NO_CONTROL_DEVICE ] = "DLM_NO_CONTROL_DEVICE ", + [DLM_RECOVERING] = "DLM_RECOVERING", + [DLM_MIGRATING] = "DLM_MIGRATING", + [DLM_MAXSTATS] = "DLM_MAXSTATS", +}; + +static const char *dlm_errmsgs[] = { + [DLM_NORMAL] = "request in progress", + [DLM_GRANTED] = "request granted", + [DLM_DENIED] = "request denied", + [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", + [DLM_WORKING] = "async request in progress", + [DLM_BLOCKED] = "lock request blocked", + [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", + [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", + [DLM_SYSERR] = "system error", + [DLM_NOSUPPORT] = "unsupported", + [DLM_CANCELGRANT] = "can't cancel convert: already granted", + [DLM_IVLOCKID] = "bad lockid", + [DLM_SYNC] = "synchronous request granted", + [DLM_BADTYPE] = "bad resource type", + [DLM_BADRESOURCE] = "bad resource handle", + [DLM_MAXHANDLES] = "no more resource handles", + [DLM_NOCLINFO] = "can't contact cluster manager", + [DLM_NOLOCKMGR] = "can't contact lock manager", + [DLM_NOPURGED] = "can't contact purge daemon", + [DLM_BADARGS] = "bad api args", + [DLM_VOID] = "no status", + [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", + [DLM_IVBUFLEN] = "invalid resource name length", + [DLM_CVTUNGRANT] = "attempted to convert ungranted lock", + [DLM_BADPARAM] = "invalid lock mode specified", + [DLM_VALNOTVALID] = "value block has been invalidated", + [DLM_REJECTED] = "request rejected, unrecognized client", + [DLM_ABORT] = "blocked lock request cancelled", + [DLM_CANCEL] = "conversion request cancelled", + [DLM_IVRESHANDLE] = "invalid resource handle", + [DLM_DEADLOCK] = "deadlock recovery refused this request", + [DLM_DENIED_NOASTS] = "failed to allocate AST", + [DLM_FORWARD] = "request must wait for primary's response", + [DLM_TIMEOUT] = "timeout value for lock has expired", + [DLM_IVGROUPID] = "invalid group specification", + [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", + [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", + [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", + [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", + [DLM_RECOVERING] = "lock resource being recovered", + [DLM_MIGRATING] = "lock resource being migrated", + [DLM_MAXSTATS] = "invalid error number", +}; + +const char *dlm_errmsg(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errmsgs[DLM_MAXSTATS]; + return dlm_errmsgs[err]; +} +EXPORT_SYMBOL_GPL(dlm_errmsg); + +const char *dlm_errname(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errnames[DLM_MAXSTATS]; + return dlm_errnames[err]; +} +EXPORT_SYMBOL_GPL(dlm_errname); + +/* NOTE: This function converts a lockname into a string. It uses knowledge + * of the format of the lockname that should be outside the purview of the dlm. + * We are adding only to make dlm debugging slightly easier. + * + * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h. + */ +static int stringify_lockname(const char *lockname, int locklen, char *buf, + int len) +{ + int out = 0; + __be64 inode_blkno_be; + +#define OCFS2_DENTRY_LOCK_INO_START 18 + if (*lockname == 'N') { + memcpy((__be64 *)&inode_blkno_be, + (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], + sizeof(__be64)); + out += snprintf(buf + out, len - out, "%.*s%08x", + OCFS2_DENTRY_LOCK_INO_START - 1, lockname, + (unsigned int)be64_to_cpu(inode_blkno_be)); + } else + out += snprintf(buf + out, len - out, "%.*s", + locklen, lockname); + return out; +} + +static int stringify_nodemap(unsigned long *nodemap, int maxnodes, + char *buf, int len) +{ + int out = 0; + int i = -1; + + while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) + out += snprintf(buf + out, len - out, "%d ", i); + + return out; +} + +static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) +{ + int out = 0; + char *mle_type; + + if (mle->type == DLM_MLE_BLOCK) + mle_type = "BLK"; + else if (mle->type == DLM_MLE_MASTER) + mle_type = "MAS"; + else + mle_type = "MIG"; + + out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out); + out += snprintf(buf + out, len - out, + "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", + mle_type, mle->master, mle->new_master, + !list_empty(&mle->hb_events), + !!mle->inuse, + atomic_read(&mle->mle_refs.refcount)); + + out += snprintf(buf + out, len - out, "Maybe="); + out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Vote="); + out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Response="); + out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Node="); + out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "\n"); + + return out; +} + +void dlm_print_one_mle(struct dlm_master_list_entry *mle) +{ + char *buf; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (buf) { + dump_mle(mle, buf, PAGE_SIZE - 1); + free_page((unsigned long)buf); + } +} + +#ifdef CONFIG_DEBUG_FS + +static struct dentry *dlm_debugfs_root; + +#define DLM_DEBUGFS_DIR "o2dlm" +#define DLM_DEBUGFS_DLM_STATE "dlm_state" +#define DLM_DEBUGFS_LOCKING_STATE "locking_state" +#define DLM_DEBUGFS_MLE_STATE "mle_state" +#define DLM_DEBUGFS_PURGE_LIST "purge_list" + +/* begin - utils funcs */ +static void dlm_debug_free(struct kref *kref) +{ + struct dlm_debug_ctxt *dc; + + dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); + + kfree(dc); +} + +static void dlm_debug_put(struct dlm_debug_ctxt *dc) +{ + if (dc) + kref_put(&dc->debug_refcnt, dlm_debug_free); +} + +static void dlm_debug_get(struct dlm_debug_ctxt *dc) +{ + kref_get(&dc->debug_refcnt); +} + +static int debug_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} + +static ssize_t debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +/* end - util funcs */ + +/* begin - purge list funcs */ +static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + struct dlm_lock_resource *res; + int out = 0; + unsigned long total = 0; + + out += snprintf(buf + out, len - out, + "Dumping Purgelist for Domain: %s\n", dlm->name); + + spin_lock(&dlm->spinlock); + list_for_each_entry(res, &dlm->purge_list, purge) { + ++total; + if (len - out < 100) + continue; + spin_lock(&res->spinlock); + out += stringify_lockname(res->lockname.name, + res->lockname.len, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\t%ld\n", + (jiffies - res->last_used)/HZ); + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + out += snprintf(buf + out, len - out, "Total on list: %lu\n", total); + + return out; +} + +static int debug_purgelist_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_purgelist_fops = { + .open = debug_purgelist_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; +/* end - purge list funcs */ + +/* begin - debug mle funcs */ +static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + struct dlm_master_list_entry *mle; + struct hlist_head *bucket; + int i, out = 0; + unsigned long total = 0, longest = 0, bucket_count = 0; + + out += snprintf(buf + out, len - out, + "Dumping MLEs for Domain: %s\n", dlm->name); + + spin_lock(&dlm->master_lock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each_entry(mle, bucket, master_hash_node) { + ++total; + ++bucket_count; + if (len - out < 200) + continue; + out += dump_mle(mle, buf + out, len - out); + } + longest = max(longest, bucket_count); + bucket_count = 0; + } + spin_unlock(&dlm->master_lock); + + out += snprintf(buf + out, len - out, + "Total: %lu, Longest: %lu\n", total, longest); + return out; +} + +static int debug_mle_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_mle_fops = { + .open = debug_mle_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; + +/* end - debug mle funcs */ + +/* begin - debug lockres funcs */ +static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) +{ + int out; + +#define DEBUG_LOCK_VERSION 1 + spin_lock(&lock->spinlock); + out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," + "%d,%d,%d,%d\n", + DEBUG_LOCK_VERSION, + list_type, lock->ml.type, lock->ml.convert_type, + lock->ml.node, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + !list_empty(&lock->ast_list), + !list_empty(&lock->bast_list), + lock->ast_pending, lock->bast_pending, + lock->convert_pending, lock->lock_pending, + lock->cancel_pending, lock->unlock_pending, + atomic_read(&lock->lock_refs.refcount)); + spin_unlock(&lock->spinlock); + + return out; +} + +static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) +{ + struct dlm_lock *lock; + int i; + int out = 0; + + out += snprintf(buf + out, len - out, "NAME:"); + out += stringify_lockname(res->lockname.name, res->lockname.len, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + +#define DEBUG_LRES_VERSION 1 + out += snprintf(buf + out, len - out, + "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", + DEBUG_LRES_VERSION, + res->owner, res->state, res->last_used, + !list_empty(&res->purge), + !list_empty(&res->dirty), + !list_empty(&res->recovering), + res->inflight_locks, res->migration_pending, + atomic_read(&res->asts_reserved), + atomic_read(&res->refs.refcount)); + + /* refmap */ + out += snprintf(buf + out, len - out, "RMAP:"); + out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* lvb */ + out += snprintf(buf + out, len - out, "LVBX:"); + for (i = 0; i < DLM_LVB_LEN; i++) + out += snprintf(buf + out, len - out, + "%02x", (unsigned char)res->lvb[i]); + out += snprintf(buf + out, len - out, "\n"); + + /* granted */ + list_for_each_entry(lock, &res->granted, list) + out += dump_lock(lock, 0, buf + out, len - out); + + /* converting */ + list_for_each_entry(lock, &res->converting, list) + out += dump_lock(lock, 1, buf + out, len - out); + + /* blocked */ + list_for_each_entry(lock, &res->blocked, list) + out += dump_lock(lock, 2, buf + out, len - out); + + out += snprintf(buf + out, len - out, "\n"); + + return out; +} + +static void *lockres_seq_start(struct seq_file *m, loff_t *pos) +{ + struct debug_lockres *dl = m->private; + struct dlm_ctxt *dlm = dl->dl_ctxt; + struct dlm_lock_resource *oldres = dl->dl_res; + struct dlm_lock_resource *res = NULL; + struct list_head *track_list; + + spin_lock(&dlm->track_lock); + if (oldres) + track_list = &oldres->tracking; + else { + track_list = &dlm->tracking_list; + if (list_empty(track_list)) { + dl = NULL; + spin_unlock(&dlm->track_lock); + goto bail; + } + } + + list_for_each_entry(res, track_list, tracking) { + if (&res->tracking == &dlm->tracking_list) + res = NULL; + else + dlm_lockres_get(res); + break; + } + spin_unlock(&dlm->track_lock); + + if (oldres) + dlm_lockres_put(oldres); + + dl->dl_res = res; + + if (res) { + spin_lock(&res->spinlock); + dump_lockres(res, dl->dl_buf, dl->dl_len - 1); + spin_unlock(&res->spinlock); + } else + dl = NULL; + +bail: + /* passed to seq_show */ + return dl; +} + +static void lockres_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return NULL; +} + +static int lockres_seq_show(struct seq_file *s, void *v) +{ + struct debug_lockres *dl = (struct debug_lockres *)v; + + seq_printf(s, "%s", dl->dl_buf); + + return 0; +} + +static const struct seq_operations debug_lockres_ops = { + .start = lockres_seq_start, + .stop = lockres_seq_stop, + .next = lockres_seq_next, + .show = lockres_seq_show, +}; + +static int debug_lockres_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + struct debug_lockres *dl; + void *buf; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl)); + if (!dl) + goto bailfree; + + dl->dl_len = PAGE_SIZE; + dl->dl_buf = buf; + + dlm_grab(dlm); + dl->dl_ctxt = dlm; + + return 0; + +bailfree: + kfree(buf); +bail: + mlog_errno(-ENOMEM); + return -ENOMEM; +} + +static int debug_lockres_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct debug_lockres *dl = (struct debug_lockres *)seq->private; + + if (dl->dl_res) + dlm_lockres_put(dl->dl_res); + dlm_put(dl->dl_ctxt); + kfree(dl->dl_buf); + return seq_release_private(inode, file); +} + +static const struct file_operations debug_lockres_fops = { + .open = debug_lockres_open, + .release = debug_lockres_release, + .read = seq_read, + .llseek = seq_lseek, +}; +/* end - debug lockres funcs */ + +/* begin - debug state funcs */ +static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + int out = 0; + struct dlm_reco_node_data *node; + char *state; + int cur_mles = 0, tot_mles = 0; + int i; + + spin_lock(&dlm->spinlock); + + switch (dlm->dlm_state) { + case DLM_CTXT_NEW: + state = "NEW"; break; + case DLM_CTXT_JOINED: + state = "JOINED"; break; + case DLM_CTXT_IN_SHUTDOWN: + state = "SHUTDOWN"; break; + case DLM_CTXT_LEAVING: + state = "LEAVING"; break; + default: + state = "UNKNOWN"; break; + } + + /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ + out += snprintf(buf + out, len - out, + "Domain: %s Key: 0x%08x Protocol: %d.%d\n", + dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + + /* Thread Pid: xxx Node: xxx State: xxxxx */ + out += snprintf(buf + out, len - out, + "Thread Pid: %d Node: %d State: %s\n", + task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); + + /* Number of Joins: xxx Joining Node: xxx */ + out += snprintf(buf + out, len - out, + "Number of Joins: %d Joining Node: %d\n", + dlm->num_joins, dlm->joining_node); + + /* Domain Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Domain Map: "); + out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Exit Domain Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Exit Domain Map: "); + out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Live Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Live Map: "); + out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Lock Resources: xxx (xxx) */ + out += snprintf(buf + out, len - out, + "Lock Resources: %d (%d)\n", + atomic_read(&dlm->res_cur_count), + atomic_read(&dlm->res_tot_count)); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + tot_mles += atomic_read(&dlm->mle_tot_count[i]); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + cur_mles += atomic_read(&dlm->mle_cur_count[i]); + + /* MLEs: xxx (xxx) */ + out += snprintf(buf + out, len - out, + "MLEs: %d (%d)\n", cur_mles, tot_mles); + + /* Blocking: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Blocking: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); + + /* Mastery: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Mastery: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); + + /* Migration: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Migration: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); + + /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ + out += snprintf(buf + out, len - out, + "Lists: Dirty=%s Purge=%s PendingASTs=%s " + "PendingBASTs=%s\n", + (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), + (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), + (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), + (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); + + /* Purge Count: xxx Refs: xxx */ + out += snprintf(buf + out, len - out, + "Purge Count: %d Refs: %d\n", dlm->purge_count, + atomic_read(&dlm->dlm_refs.refcount)); + + /* Dead Node: xxx */ + out += snprintf(buf + out, len - out, + "Dead Node: %d\n", dlm->reco.dead_node); + + /* What about DLM_RECO_STATE_FINALIZE? */ + if (dlm->reco.state == DLM_RECO_STATE_ACTIVE) + state = "ACTIVE"; + else + state = "INACTIVE"; + + /* Recovery Pid: xxxx Master: xxx State: xxxx */ + out += snprintf(buf + out, len - out, + "Recovery Pid: %d Master: %d State: %s\n", + task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.new_master, state); + + /* Recovery Map: xx xx */ + out += snprintf(buf + out, len - out, "Recovery Map: "); + out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Recovery Node State: */ + out += snprintf(buf + out, len - out, "Recovery Node State:\n"); + list_for_each_entry(node, &dlm->reco.node_data, list) { + switch (node->state) { + case DLM_RECO_NODE_DATA_INIT: + state = "INIT"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + state = "REQUESTING"; + break; + case DLM_RECO_NODE_DATA_DEAD: + state = "DEAD"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + state = "RECEIVING"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + state = "REQUESTED"; + break; + case DLM_RECO_NODE_DATA_DONE: + state = "DONE"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + state = "FINALIZE-SENT"; + break; + default: + state = "BAD"; + break; + } + out += snprintf(buf + out, len - out, "\t%u - %s\n", + node->node_num, state); + } + + spin_unlock(&dlm->spinlock); + + return out; +} + +static int debug_state_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_state_fops = { + .open = debug_state_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; +/* end - debug state funcs */ + +/* files in subroot */ +int dlm_debug_init(struct dlm_ctxt *dlm) +{ + struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; + + /* for dumping dlm_ctxt */ + dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_state_fops); + if (!dc->debug_state_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping lockres */ + dc->debug_lockres_dentry = + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_lockres_fops); + if (!dc->debug_lockres_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping mles */ + dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_mle_fops); + if (!dc->debug_mle_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping lockres on the purge list */ + dc->debug_purgelist_dentry = + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_purgelist_fops); + if (!dc->debug_purgelist_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + dlm_debug_get(dc); + return 0; + +bail: + dlm_debug_shutdown(dlm); + return -ENOMEM; +} + +void dlm_debug_shutdown(struct dlm_ctxt *dlm) +{ + struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; + + if (dc) { + debugfs_remove(dc->debug_purgelist_dentry); + debugfs_remove(dc->debug_mle_dentry); + debugfs_remove(dc->debug_lockres_dentry); + debugfs_remove(dc->debug_state_dentry); + dlm_debug_put(dc); + } +} + +/* subroot - domain dir */ +int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +{ + dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, + dlm_debugfs_root); + if (!dlm->dlm_debugfs_subroot) { + mlog_errno(-ENOMEM); + goto bail; + } + + dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), + GFP_KERNEL); + if (!dlm->dlm_debug_ctxt) { + mlog_errno(-ENOMEM); + goto bail; + } + kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); + + return 0; +bail: + dlm_destroy_debugfs_subroot(dlm); + return -ENOMEM; +} + +void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +{ + debugfs_remove(dlm->dlm_debugfs_subroot); +} + +/* debugfs root */ +int dlm_create_debugfs_root(void) +{ + dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); + if (!dlm_debugfs_root) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + return 0; +} + +void dlm_destroy_debugfs_root(void) +{ + debugfs_remove(dlm_debugfs_root); +} +#endif /* CONFIG_DEBUG_FS */ diff --git a/kernel/fs/ocfs2/dlm/dlmdebug.h b/kernel/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 000000000..1f27c4812 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmdebug.h @@ -0,0 +1,81 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.h + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMDEBUG_H +#define DLMDEBUG_H + +void dlm_print_one_mle(struct dlm_master_list_entry *mle); + +#ifdef CONFIG_DEBUG_FS + +struct dlm_debug_ctxt { + struct kref debug_refcnt; + struct dentry *debug_state_dentry; + struct dentry *debug_lockres_dentry; + struct dentry *debug_mle_dentry; + struct dentry *debug_purgelist_dentry; +}; + +struct debug_lockres { + int dl_len; + char *dl_buf; + struct dlm_ctxt *dl_ctxt; + struct dlm_lock_resource *dl_res; +}; + +int dlm_debug_init(struct dlm_ctxt *dlm); +void dlm_debug_shutdown(struct dlm_ctxt *dlm); + +int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); + +int dlm_create_debugfs_root(void); +void dlm_destroy_debugfs_root(void); + +#else + +static inline int dlm_debug_init(struct dlm_ctxt *dlm) +{ + return 0; +} +static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) +{ +} +static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +{ + return 0; +} +static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +{ +} +static inline int dlm_create_debugfs_root(void) +{ + return 0; +} +static inline void dlm_destroy_debugfs_root(void) +{ +} + +#endif /* CONFIG_DEBUG_FS */ +#endif /* DLMDEBUG_H */ diff --git a/kernel/fs/ocfs2/dlm/dlmdomain.c b/kernel/fs/ocfs2/dlm/dlmdomain.c new file mode 100644 index 000000000..7df88a6dd --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmdomain.c @@ -0,0 +1,2375 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.c + * + * defines domain join / leave apis + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) +#include "cluster/masklog.h" + +/* + * ocfs2 node maps are array of long int, which limits to send them freely + * across the wire due to endianness issues. To workaround this, we convert + * long ints to byte arrays. Following 3 routines are helper functions to + * set/test/copy bits within those array of bytes + */ +static inline void byte_set_bit(u8 nr, u8 map[]) +{ + map[nr >> 3] |= (1UL << (nr & 7)); +} + +static inline int byte_test_bit(u8 nr, u8 map[]) +{ + return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0; +} + +static inline void byte_copymap(u8 dmap[], unsigned long smap[], + unsigned int sz) +{ + unsigned int nn; + + if (!sz) + return; + + memset(dmap, 0, ((sz + 7) >> 3)); + for (nn = 0 ; nn < sz; nn++) + if (test_bit(nn, smap)) + byte_set_bit(nn, dmap); +} + +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, (unsigned long)DLM_HASH_PAGES, + (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + +/* + * + * spinlock lock ordering: if multiple locks are needed, obey this ordering: + * dlm_domain_lock + * struct dlm_ctxt->spinlock + * struct dlm_lock_resource->spinlock + * struct dlm_ctxt->master_lock + * struct dlm_ctxt->ast_lock + * dlm_master_list_entry->spinlock + * dlm_lock->spinlock + * + */ + +DEFINE_SPINLOCK(dlm_domain_lock); +LIST_HEAD(dlm_domains); +static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); + +/* + * The supported protocol version for DLM communication. Running domains + * will have a negotiated version with the same major number and a minor + * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should + * be used to determine what a running domain is actually using. + * + * New in version 1.1: + * - Message DLM_QUERY_REGION added to support global heartbeat + * - Message DLM_QUERY_NODEINFO added to allow online node removes + * New in version 1.2: + * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain + */ +static const struct dlm_protocol_version dlm_protocol = { + .pv_major = 1, + .pv_minor = 2, +}; + +#define DLM_DOMAIN_BACKOFF_MS 200 + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data); +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_protocol_compare(struct dlm_protocol_version *existing, + struct dlm_protocol_version *request); + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); + +void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + if (hlist_unhashed(&res->hash_node)) + return; + + mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + hlist_del_init(&res->hash_node); + dlm_lockres_put(res); +} + +void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + struct hlist_head *bucket; + struct qstr *q; + + assert_spin_locked(&dlm->spinlock); + + q = &res->lockname; + bucket = dlm_lockres_hash(dlm, q->hash); + + /* get a reference for our hashtable */ + dlm_lockres_get(res); + + hlist_add_head(&res->hash_node, bucket); + + mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); +} + +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct hlist_head *bucket; + struct dlm_lock_resource *res; + + mlog(0, "%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + bucket = dlm_lockres_hash(dlm, hash); + + hlist_for_each_entry(res, bucket, hash_node) { + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; + } + return NULL; +} + +/* intended to be called by functions which do not care about lock + * resources which are being purged (most net _handler functions). + * this will return NULL for any lock resource which is found but + * currently in the process of dropping its mastery reference. + * use __dlm_lookup_lockres_full when you need the lock resource + * regardless (e.g. dlm_get_lock_resource) */ +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct dlm_lock_resource *res = NULL; + + mlog(0, "%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + res = __dlm_lookup_lockres_full(dlm, name, len, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + return NULL; + } + spin_unlock(&res->spinlock); + } + + return res; +} + +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len) +{ + struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, len, hash); + spin_unlock(&dlm->spinlock); + return res; +} + +static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) +{ + struct dlm_ctxt *tmp; + + assert_spin_locked(&dlm_domain_lock); + + /* tmp->name here is always NULL terminated, + * but domain may not be! */ + list_for_each_entry(tmp, &dlm_domains, list) { + if (strlen(tmp->name) == len && + memcmp(tmp->name, domain, len)==0) + return tmp; + } + + return NULL; +} + +/* For null terminated domain strings ONLY */ +static struct dlm_ctxt * __dlm_lookup_domain(const char *domain) +{ + assert_spin_locked(&dlm_domain_lock); + + return __dlm_lookup_domain_full(domain, strlen(domain)); +} + + +/* returns true on one of two conditions: + * 1) the domain does not exist + * 2) the domain exists and it's state is "joined" */ +static int dlm_wait_on_domain_helper(const char *domain) +{ + int ret = 0; + struct dlm_ctxt *tmp = NULL; + + spin_lock(&dlm_domain_lock); + + tmp = __dlm_lookup_domain(domain); + if (!tmp) + ret = 1; + else if (tmp->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + return ret; +} + +static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) +{ + dlm_destroy_debugfs_subroot(dlm); + + if (dlm->lockres_hash) + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); + + kfree(dlm->name); + kfree(dlm); +} + +/* A little strange - this function will be called while holding + * dlm_domain_lock and is expected to be holding it on the way out. We + * will however drop and reacquire it multiple times */ +static void dlm_ctxt_release(struct kref *kref) +{ + struct dlm_ctxt *dlm; + + dlm = container_of(kref, struct dlm_ctxt, dlm_refs); + + BUG_ON(dlm->num_joins); + BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED); + + /* we may still be in the list if we hit an error during join. */ + list_del_init(&dlm->list); + + spin_unlock(&dlm_domain_lock); + + mlog(0, "freeing memory from domain %s\n", dlm->name); + + wake_up(&dlm_domain_events); + + dlm_free_ctxt_mem(dlm); + + spin_lock(&dlm_domain_lock); +} + +void dlm_put(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm_domain_lock); + kref_put(&dlm->dlm_refs, dlm_ctxt_release); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_get(struct dlm_ctxt *dlm) +{ + kref_get(&dlm->dlm_refs); +} + +/* given a questionable reference to a dlm object, gets a reference if + * it can find it in the list, otherwise returns NULL in which case + * you shouldn't trust your pointer. */ +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) +{ + struct dlm_ctxt *target; + struct dlm_ctxt *ret = NULL; + + spin_lock(&dlm_domain_lock); + + list_for_each_entry(target, &dlm_domains, list) { + if (target == dlm) { + __dlm_get(target); + ret = target; + break; + } + } + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +int dlm_domain_fully_joined(struct dlm_ctxt *dlm) +{ + int ret; + + spin_lock(&dlm_domain_lock); + ret = (dlm->dlm_state == DLM_CTXT_JOINED) || + (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN); + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + +static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) +{ + dlm_unregister_domain_handlers(dlm); + dlm_debug_shutdown(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); + + /* We've left the domain. Now we can take ourselves out of the + * list and allow the kref stuff to help us free the + * memory. */ + spin_lock(&dlm_domain_lock); + list_del_init(&dlm->list); + spin_unlock(&dlm_domain_lock); + + /* Wake up anyone waiting for us to remove this domain */ + wake_up(&dlm_domain_events); +} + +static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) +{ + int i, num, n, ret = 0; + struct dlm_lock_resource *res; + struct hlist_node *iter; + struct hlist_head *bucket; + int dropped; + + mlog(0, "Migrating locks from domain %s\n", dlm->name); + + num = 0; + spin_lock(&dlm->spinlock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { +redo_bucket: + n = 0; + bucket = dlm_lockres_hash(dlm, i); + iter = bucket->first; + while (iter) { + n++; + res = hlist_entry(iter, struct dlm_lock_resource, + hash_node); + dlm_lockres_get(res); + /* migrate, if necessary. this will drop the dlm + * spinlock and retake it if it does migration. */ + dropped = dlm_empty_lockres(dlm, res); + + spin_lock(&res->spinlock); + if (dropped) + __dlm_lockres_calc_usage(dlm, res); + else + iter = res->hash_node.next; + spin_unlock(&res->spinlock); + + dlm_lockres_put(res); + + if (dropped) { + cond_resched_lock(&dlm->spinlock); + goto redo_bucket; + } + } + cond_resched_lock(&dlm->spinlock); + num += n; + } + spin_unlock(&dlm->spinlock); + wake_up(&dlm->dlm_thread_wq); + + /* let the dlm thread take care of purging, keep scanning until + * nothing remains in the hash */ + if (num) { + mlog(0, "%s: %d lock resources in hash last pass\n", + dlm->name, num); + ret = -EAGAIN; + } + mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); + return ret; +} + +static int dlm_no_joining_node(struct dlm_ctxt *dlm) +{ + int ret; + + spin_lock(&dlm->spinlock); + ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN; + spin_unlock(&dlm->spinlock); + + return ret; +} + +static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node); + + spin_lock(&dlm->spinlock); + set_bit(node, dlm->exit_domain_map); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + +static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) +{ + /* Yikes, a double spinlock! I need domain_lock for the dlm + * state and the dlm spinlock for join state... Sorry! */ +again: + spin_lock(&dlm_domain_lock); + spin_lock(&dlm->spinlock); + + if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "Node %d is joining, we wait on it.\n", + dlm->joining_node); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm)); + goto again; + } + + dlm->dlm_state = DLM_CTXT_LEAVING; + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_print_nodes(struct dlm_ctxt *dlm) +{ + int node = -1, num = 0; + + assert_spin_locked(&dlm->spinlock); + + printk("( "); + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + printk("%d ", node); + ++num; + } + printk(") %u nodes\n", num); +} + +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + mlog(0, "%p %u %p", msg, len, data); + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + + spin_lock(&dlm->spinlock); + clear_bit(node, dlm->domain_map); + clear_bit(node, dlm->exit_domain_map); + printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name); + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, node, 0); + + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + +static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type, + unsigned int node) +{ + int status; + struct dlm_exit_domain leave_msg; + + mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name, + msg_type, node); + + memset(&leave_msg, 0, sizeof(leave_msg)); + leave_msg.node_idx = dlm->node_num; + + status = o2net_send_message(msg_type, dlm->key, &leave_msg, + sizeof(leave_msg), node, NULL); + if (status < 0) + mlog(ML_ERROR, "Error %d sending domain exit message %u " + "to node %u on domain %s\n", status, msg_type, node, + dlm->name); + + return status; +} + +static void dlm_begin_exit_domain(struct dlm_ctxt *dlm) +{ + int node = -1; + + /* Support for begin exit domain was added in 1.2 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor < 2) + return; + + /* + * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely + * informational. Meaning if a node does not receive the message, + * so be it. + */ + spin_lock(&dlm->spinlock); + while (1) { + node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1); + if (node >= O2NM_MAX_NODES) + break; + if (node == dlm->node_num) + continue; + + spin_unlock(&dlm->spinlock); + dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node); + spin_lock(&dlm->spinlock); + } + spin_unlock(&dlm->spinlock); +} + +static void dlm_leave_domain(struct dlm_ctxt *dlm) +{ + int node, clear_node, status; + + /* At this point we've migrated away all our locks and won't + * accept mastership of new ones. The dlm is responsible for + * almost nothing now. We make sure not to confuse any joining + * nodes and then commence shutdown procedure. */ + + spin_lock(&dlm->spinlock); + /* Clear ourselves from the domain map */ + clear_bit(dlm->node_num, dlm->domain_map); + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + 0)) < O2NM_MAX_NODES) { + /* Drop the dlm spinlock. This is safe wrt the domain_map. + * -nodes cannot be added now as the + * query_join_handlers knows to respond with OK_NO_MAP + * -we catch the right network errors if a node is + * removed from the map while we're sending him the + * exit message. */ + spin_unlock(&dlm->spinlock); + + clear_node = 1; + + status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG, + node); + if (status < 0 && + status != -ENOPROTOOPT && + status != -ENOTCONN) { + mlog(ML_NOTICE, "Error %d sending domain exit message " + "to node %d\n", status, node); + + /* Not sure what to do here but lets sleep for + * a bit in case this was a transient + * error... */ + msleep(DLM_DOMAIN_BACKOFF_MS); + clear_node = 0; + } + + spin_lock(&dlm->spinlock); + /* If we're not clearing the node bit then we intend + * to loop back around to try again. */ + if (clear_node) + clear_bit(node, dlm->domain_map); + } + spin_unlock(&dlm->spinlock); +} + +int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +void dlm_unregister_domain(struct dlm_ctxt *dlm) +{ + int leave = 0; + struct dlm_lock_resource *res; + + spin_lock(&dlm_domain_lock); + BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); + BUG_ON(!dlm->num_joins); + + dlm->num_joins--; + if (!dlm->num_joins) { + /* We mark it "in shutdown" now so new register + * requests wait until we've completely left the + * domain. Don't use DLM_CTXT_LEAVING yet as we still + * want new domain joins to communicate with us at + * least until we've completed migration of our + * resources. */ + dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN; + leave = 1; + } + spin_unlock(&dlm_domain_lock); + + if (leave) { + mlog(0, "shutting down domain %s\n", dlm->name); + dlm_begin_exit_domain(dlm); + + /* We changed dlm state, notify the thread */ + dlm_kick_thread(dlm, NULL); + + while (dlm_migrate_all_locks(dlm)) { + /* Give dlm_thread time to purge the lockres' */ + msleep(500); + mlog(0, "%s: more migration to do\n", dlm->name); + } + + /* This list should be empty. If not, print remaining lockres */ + if (!list_empty(&dlm->tracking_list)) { + mlog(ML_ERROR, "Following lockres' are still on the " + "tracking list:\n"); + list_for_each_entry(res, &dlm->tracking_list, tracking) + dlm_print_one_lock_resource(res); + } + + dlm_mark_domain_leaving(dlm); + dlm_leave_domain(dlm); + printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name); + dlm_force_free_mles(dlm); + dlm_complete_dlm_shutdown(dlm); + } + dlm_put(dlm); +} +EXPORT_SYMBOL_GPL(dlm_unregister_domain); + +static int dlm_query_join_proto_check(char *proto_type, int node, + struct dlm_protocol_version *ours, + struct dlm_protocol_version *request) +{ + int rc; + struct dlm_protocol_version proto = *request; + + if (!dlm_protocol_compare(ours, &proto)) { + mlog(0, + "node %u wanted to join with %s locking protocol " + "%u.%u, we respond with %u.%u\n", + node, proto_type, + request->pv_major, + request->pv_minor, + proto.pv_major, proto.pv_minor); + request->pv_minor = proto.pv_minor; + rc = 0; + } else { + mlog(ML_NOTICE, + "Node %u wanted to join with %s locking " + "protocol %u.%u, but we have %u.%u, disallowing\n", + node, proto_type, + request->pv_major, + request->pv_minor, + ours->pv_major, + ours->pv_minor); + rc = 1; + } + + return rc; +} + +/* + * struct dlm_query_join_packet is made up of four one-byte fields. They + * are effectively in big-endian order already. However, little-endian + * machines swap them before putting the packet on the wire (because + * query_join's response is a status, and that status is treated as a u32 + * on the wire). Thus, a big-endian and little-endian machines will treat + * this structure differently. + * + * The solution is to have little-endian machines swap the structure when + * converting from the structure to the u32 representation. This will + * result in the structure having the correct format on the wire no matter + * the host endian format. + */ +static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet, + u32 *wire) +{ + union dlm_query_join_response response; + + response.packet = *packet; + *wire = be32_to_cpu(response.intval); +} + +static void dlm_query_join_wire_to_packet(u32 wire, + struct dlm_query_join_packet *packet) +{ + union dlm_query_join_response response; + + response.intval = cpu_to_be32(wire); + *packet = response.packet; +} + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_query_join_request *query; + struct dlm_query_join_packet packet = { + .code = JOIN_DISALLOW, + }; + struct dlm_ctxt *dlm = NULL; + u32 response; + u8 nodenum; + + query = (struct dlm_query_join_request *) msg->buf; + + mlog(0, "node %u wants to join domain %s\n", query->node_idx, + query->domain); + + /* + * If heartbeat doesn't consider the node live, tell it + * to back off and try again. This gives heartbeat a chance + * to catch up. + */ + if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) { + mlog(0, "node %u is not in our live map yet\n", + query->node_idx); + + packet.code = JOIN_DISALLOW; + goto respond; + } + + packet.code = JOIN_OK_NO_MAP; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(query->domain, query->name_len); + if (!dlm) + goto unlock_respond; + + /* + * There is a small window where the joining node may not see the + * node(s) that just left but still part of the cluster. DISALLOW + * join request if joining node has different node map. + */ + nodenum=0; + while (nodenum < O2NM_MAX_NODES) { + if (test_bit(nodenum, dlm->domain_map)) { + if (!byte_test_bit(nodenum, query->node_map)) { + mlog(0, "disallow join as node %u does not " + "have node %u in its nodemap\n", + query->node_idx, nodenum); + packet.code = JOIN_DISALLOW; + goto unlock_respond; + } + } + nodenum++; + } + + /* Once the dlm ctxt is marked as leaving then we don't want + * to be put in someone's domain map. + * Also, explicitly disallow joining at certain troublesome + * times (ie. during recovery). */ + if (dlm->dlm_state != DLM_CTXT_LEAVING) { + int bit = query->node_idx; + spin_lock(&dlm->spinlock); + + if (dlm->dlm_state == DLM_CTXT_NEW && + dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) { + /*If this is a brand new context and we + * haven't started our join process yet, then + * the other node won the race. */ + packet.code = JOIN_OK_NO_MAP; + } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* Disallow parallel joins. */ + packet.code = JOIN_DISALLOW; + } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "node %u trying to join, but recovery " + "is ongoing.\n", bit); + packet.code = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->recovery_map)) { + mlog(0, "node %u trying to join, but it " + "still needs recovery.\n", bit); + packet.code = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->domain_map)) { + mlog(0, "node %u trying to join, but it " + "is still in the domain! needs recovery?\n", + bit); + packet.code = JOIN_DISALLOW; + } else { + /* Alright we're fully a part of this domain + * so we keep some state as to who's joining + * and indicate to him that needs to be fixed + * up. */ + + /* Make sure we speak compatible locking protocols. */ + if (dlm_query_join_proto_check("DLM", bit, + &dlm->dlm_locking_proto, + &query->dlm_proto)) { + packet.code = JOIN_PROTOCOL_MISMATCH; + } else if (dlm_query_join_proto_check("fs", bit, + &dlm->fs_locking_proto, + &query->fs_proto)) { + packet.code = JOIN_PROTOCOL_MISMATCH; + } else { + packet.dlm_minor = query->dlm_proto.pv_minor; + packet.fs_minor = query->fs_proto.pv_minor; + packet.code = JOIN_OK; + __dlm_set_joining_node(dlm, query->node_idx); + } + } + + spin_unlock(&dlm->spinlock); + } +unlock_respond: + spin_unlock(&dlm_domain_lock); + +respond: + mlog(0, "We respond with %u\n", packet.code); + + dlm_query_join_packet_to_wire(&packet, &response); + return response; +} + +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_assert_joined *assert; + struct dlm_ctxt *dlm = NULL; + + assert = (struct dlm_assert_joined *) msg->buf; + + mlog(0, "node %u asserts join on domain %s\n", assert->node_idx, + assert->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len); + /* XXX should we consider no dlm ctxt an error? */ + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Alright, this node has officially joined our + * domain. Set him in the map and clean up our + * leftover join state. */ + BUG_ON(dlm->joining_node != assert->node_idx); + + if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "dlm recovery is ongoing, disallow join\n"); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + return -EAGAIN; + } + + set_bit(assert->node_idx, dlm->domain_map); + clear_bit(assert->node_idx, dlm->exit_domain_map); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ", + assert->node_idx, dlm->name); + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, assert->node_idx, 1); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_match_regions(struct dlm_ctxt *dlm, + struct dlm_query_region *qr, + char *local, int locallen) +{ + char *remote = qr->qr_regions; + char *l, *r; + int localnr, i, j, foundit; + int status = 0; + + if (!o2hb_global_heartbeat_active()) { + if (qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Joining node %d has global " + "heartbeat enabled but local node %d does not\n", + qr->qr_domain, qr->qr_node, dlm->node_num); + status = -EINVAL; + } + goto bail; + } + + if (o2hb_global_heartbeat_active() && !qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Local node %d has global " + "heartbeat enabled but joining node %d does not\n", + qr->qr_domain, dlm->node_num, qr->qr_node); + status = -EINVAL; + goto bail; + } + + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r); + r += O2HB_MAX_REGION_NAME_LEN; + } + + localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN); + localnr = o2hb_get_all_regions(local, (u8)localnr); + + /* compare local regions with remote */ + l = local; + for (i = 0; i < localnr; ++i) { + foundit = 0; + r = remote; + for (j = 0; j <= qr->qr_numregions; ++j) { + if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in local node %d but not in joining node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l, + dlm->node_num, qr->qr_node); + goto bail; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + + /* compare remote with local regions */ + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + foundit = 0; + l = local; + for (j = 0; j < localnr; ++j) { + if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in joining node %d but not in local node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r, + qr->qr_node, dlm->node_num); + goto bail; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + +bail: + return status; +} + +static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_region *qr = NULL; + int status, ret = 0, i; + char *p; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); + if (!qr) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + qr->qr_node = dlm->node_num; + qr->qr_namelen = strlen(dlm->name); + memcpy(qr->qr_domain, dlm->name, qr->qr_namelen); + /* if local hb, the numregions will be zero */ + if (o2hb_global_heartbeat_active()) + qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions, + O2NM_MAX_REGIONS); + + p = qr->qr_regions; + for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN) + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p); + + i = -1; + while ((i = find_next_bit(node_map, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending regions to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr, + sizeof(struct dlm_query_region), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "Region mismatch %d, node %d\n", + ret, i); + break; + } + } + +bail: + kfree(qr); + return ret; +} + +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_region *qr; + struct dlm_ctxt *dlm = NULL; + char *local = NULL; + int status = 0; + + qr = (struct dlm_query_region *) msg->buf; + + mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, + qr->qr_domain); + + /* buffer used in dlm_mast_regions() */ + local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + if (!local) + return -ENOMEM; + + status = -EINVAL; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "before join domain\n", qr->qr_node, qr->qr_domain); + goto out_domain_lock; + } + + spin_lock(&dlm->spinlock); + if (dlm->joining_node != qr->qr_node) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but joining node is %d\n", qr->qr_node, qr->qr_domain, + dlm->joining_node); + goto out_dlm_lock; + } + + /* Support for global heartbeat was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but active dlm protocol is %d.%d\n", qr->qr_node, + qr->qr_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto out_dlm_lock; + } + + status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); + +out_dlm_lock: + spin_unlock(&dlm->spinlock); + +out_domain_lock: + spin_unlock(&dlm_domain_lock); + + kfree(local); + + return status; +} + +static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn) +{ + struct o2nm_node *local; + struct dlm_node_info *remote; + int i, j; + int status = 0; + + for (j = 0; j < qn->qn_numnodes; ++j) + mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum, + &(qn->qn_nodes[j].ni_ipv4_address), + ntohs(qn->qn_nodes[j].ni_ipv4_port)); + + for (i = 0; i < O2NM_MAX_NODES && !status; ++i) { + local = o2nm_get_node_by_num(i); + remote = NULL; + for (j = 0; j < qn->qn_numnodes; ++j) { + if (qn->qn_nodes[j].ni_nodenum == i) { + remote = &(qn->qn_nodes[j]); + break; + } + } + + if (!local && !remote) + continue; + + if ((local && !remote) || (!local && remote)) + status = -EINVAL; + + if (!status && + ((remote->ni_nodenum != local->nd_num) || + (remote->ni_ipv4_port != local->nd_ipv4_port) || + (remote->ni_ipv4_address != local->nd_ipv4_address))) + status = -EINVAL; + + if (status) { + if (remote && !local) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in joining node %d but not in " + "local node %d\n", qn->qn_domain, + remote->ni_nodenum, + &(remote->ni_ipv4_address), + ntohs(remote->ni_ipv4_port), + qn->qn_nodenum, dlm->node_num); + if (local && !remote) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in local node %d but not in " + "joining node %d\n", qn->qn_domain, + local->nd_num, &(local->nd_ipv4_address), + ntohs(local->nd_ipv4_port), + dlm->node_num, qn->qn_nodenum); + BUG_ON((!local && !remote)); + } + + if (local) + o2nm_node_put(local); + } + + return status; +} + +static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_nodeinfo *qn = NULL; + struct o2nm_node *node; + int ret = 0, status, count, i; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); + if (!qn) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) { + node = o2nm_get_node_by_num(i); + if (!node) + continue; + qn->qn_nodes[count].ni_nodenum = node->nd_num; + qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port; + qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address; + mlog(0, "Node %3d, %pI4:%u\n", node->nd_num, + &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port)); + ++count; + o2nm_node_put(node); + } + + qn->qn_nodenum = dlm->node_num; + qn->qn_numnodes = count; + qn->qn_namelen = strlen(dlm->name); + memcpy(qn->qn_domain, dlm->name, qn->qn_namelen); + + i = -1; + while ((i = find_next_bit(node_map, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending nodeinfo to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + qn, sizeof(struct dlm_query_nodeinfo), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i); + break; + } + } + +bail: + kfree(qn); + return ret; +} + +static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_nodeinfo *qn; + struct dlm_ctxt *dlm = NULL; + int locked = 0, status = -EINVAL; + + qn = (struct dlm_query_nodeinfo *) msg->buf; + + mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum, + qn->qn_domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s before " + "join domain\n", qn->qn_nodenum, qn->qn_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qn->qn_nodenum) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s but " + "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, + dlm->joining_node); + goto bail; + } + + /* Support for node query was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s " + "but active dlm protocol is %d.%d\n", qn->qn_nodenum, + qn->qn_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_nodes(dlm, qn); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + return status; +} + +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_cancel_join *cancel; + struct dlm_ctxt *dlm = NULL; + + cancel = (struct dlm_cancel_join *) msg->buf; + + mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx, + cancel->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len); + + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Yikes, this guy wants to cancel his join. No + * problem, we simply cleanup our join state. */ + BUG_ON(dlm->joining_node != cancel->node_idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_cancel_join cancel_msg; + + memset(&cancel_msg, 0, sizeof(cancel_msg)); + cancel_msg.node_idx = dlm->node_num; + cancel_msg.name_len = strlen(dlm->name); + memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len); + + status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + &cancel_msg, sizeof(cancel_msg), node, + NULL); + if (status < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + node); + goto bail; + } + +bail: + return status; +} + +/* map_size should be in bytes. */ +static int dlm_send_join_cancels(struct dlm_ctxt *dlm, + unsigned long *node_map, + unsigned int map_size) +{ + int status, tmpstat; + unsigned int node; + + if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * + sizeof(unsigned long))) { + mlog(ML_ERROR, + "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n", + map_size, (unsigned)BITS_TO_LONGS(O2NM_MAX_NODES)); + return -EINVAL; + } + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + tmpstat = dlm_send_one_join_cancel(dlm, node); + if (tmpstat) { + mlog(ML_ERROR, "Error return %d cancelling join on " + "node %d\n", tmpstat, node); + if (!status) + status = tmpstat; + } + } + + if (status) + mlog_errno(status); + return status; +} + +static int dlm_request_join(struct dlm_ctxt *dlm, + int node, + enum dlm_query_join_response_code *response) +{ + int status; + struct dlm_query_join_request join_msg; + struct dlm_query_join_packet packet; + u32 join_resp; + + mlog(0, "querying node %d\n", node); + + memset(&join_msg, 0, sizeof(join_msg)); + join_msg.node_idx = dlm->node_num; + join_msg.name_len = strlen(dlm->name); + memcpy(join_msg.domain, dlm->name, join_msg.name_len); + join_msg.dlm_proto = dlm->dlm_locking_proto; + join_msg.fs_proto = dlm->fs_locking_proto; + + /* copy live node map to join message */ + byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); + + status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, + sizeof(join_msg), node, &join_resp); + if (status < 0 && status != -ENOPROTOOPT) { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + node); + goto bail; + } + dlm_query_join_wire_to_packet(join_resp, &packet); + + /* -ENOPROTOOPT from the net code means the other side isn't + listening for our message type -- that's fine, it means + his dlm isn't up, so we can consider him a 'yes' but not + joined into the domain. */ + if (status == -ENOPROTOOPT) { + status = 0; + *response = JOIN_OK_NO_MAP; + } else if (packet.code == JOIN_DISALLOW || + packet.code == JOIN_OK_NO_MAP) { + *response = packet.code; + } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { + mlog(ML_NOTICE, + "This node requested DLM locking protocol %u.%u and " + "filesystem locking protocol %u.%u. At least one of " + "the protocol versions on node %d is not compatible, " + "disconnecting\n", + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor, + node); + status = -EPROTO; + *response = packet.code; + } else if (packet.code == JOIN_OK) { + *response = packet.code; + /* Use the same locking protocol as the remote node */ + dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; + dlm->fs_locking_proto.pv_minor = packet.fs_minor; + mlog(0, + "Node %d responds JOIN_OK with DLM locking protocol " + "%u.%u and fs locking protocol %u.%u\n", + node, + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor); + } else { + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", + packet.code, node); + } + + mlog(0, "status %d, node %d response is %d\n", status, node, + *response); + +bail: + return status; +} + +static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + int ret; + struct dlm_assert_joined assert_msg; + + mlog(0, "Sending join assert to node %u\n", node); + + memset(&assert_msg, 0, sizeof(assert_msg)); + assert_msg.node_idx = dlm->node_num; + assert_msg.name_len = strlen(dlm->name); + memcpy(assert_msg.domain, dlm->name, assert_msg.name_len); + + status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + &assert_msg, sizeof(assert_msg), node, + &ret); + if (status < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + node); + else + status = ret; + + return status; +} + +static void dlm_send_join_asserts(struct dlm_ctxt *dlm, + unsigned long *node_map) +{ + int status, node, live; + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + do { + /* It is very important that this message be + * received so we spin until either the node + * has died or it gets the message. */ + status = dlm_send_one_join_assert(dlm, node); + + spin_lock(&dlm->spinlock); + live = test_bit(node, dlm->live_nodes_map); + spin_unlock(&dlm->spinlock); + + if (status) { + mlog(ML_ERROR, "Error return %d asserting " + "join on node %d\n", status, node); + + /* give us some time between errors... */ + if (live) + msleep(DLM_DOMAIN_BACKOFF_MS); + } + } while (status && live); + } +} + +struct domain_join_ctxt { + unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +static int dlm_should_restart_join(struct dlm_ctxt *dlm, + struct domain_join_ctxt *ctxt, + enum dlm_query_join_response_code response) +{ + int ret; + + if (response == JOIN_DISALLOW) { + mlog(0, "Latest response of disallow -- should restart\n"); + return 1; + } + + spin_lock(&dlm->spinlock); + /* For now, we restart the process if the node maps have + * changed at all */ + ret = memcmp(ctxt->live_map, dlm->live_nodes_map, + sizeof(dlm->live_nodes_map)); + spin_unlock(&dlm->spinlock); + + if (ret) + mlog(0, "Node maps changed -- should restart\n"); + + return ret; +} + +static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) +{ + int status = 0, tmpstat, node; + struct domain_join_ctxt *ctxt; + enum dlm_query_join_response_code response = JOIN_DISALLOW; + + mlog(0, "%p", dlm); + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* group sem locking should work for us here -- we're already + * registered for heartbeat events so filling this should be + * atomic wrt getting those handlers called. */ + o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + + spin_lock(&dlm->spinlock); + memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); + + __dlm_set_joining_node(dlm, dlm->node_num); + + spin_unlock(&dlm->spinlock); + + node = -1; + while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + status = dlm_request_join(dlm, node, &response); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Ok, either we got a response or the node doesn't have a + * dlm up. */ + if (response == JOIN_OK) + set_bit(node, ctxt->yes_resp_map); + + if (dlm_should_restart_join(dlm, ctxt, response)) { + status = -EAGAIN; + goto bail; + } + } + + mlog(0, "Yay, done querying nodes!\n"); + + /* Yay, everyone agree's we can join the domain. My domain is + * comprised of all nodes who were put in the + * yes_resp_map. Copy that into our domain map and send a join + * assert message to clean up everyone elses state. */ + spin_lock(&dlm->spinlock); + memcpy(dlm->domain_map, ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + set_bit(dlm->node_num, dlm->domain_map); + spin_unlock(&dlm->spinlock); + + /* Support for global heartbeat and node info was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major > 1 || + dlm->dlm_locking_proto.pv_minor > 0) { + status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } + status = dlm_send_regions(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } + } + + dlm_send_join_asserts(dlm, ctxt->yes_resp_map); + + /* Joined state *must* be set before the joining node + * information, otherwise the query_join handler may read no + * current joiner but a state of NEW and tell joining nodes + * we're not in the domain. */ + spin_lock(&dlm_domain_lock); + dlm->dlm_state = DLM_CTXT_JOINED; + dlm->num_joins++; + spin_unlock(&dlm_domain_lock); + +bail: + spin_lock(&dlm->spinlock); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + if (!status) { + printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name); + __dlm_print_nodes(dlm); + } + spin_unlock(&dlm->spinlock); + + if (ctxt) { + /* Do we need to send a cancel message to any nodes? */ + if (status < 0) { + tmpstat = dlm_send_join_cancels(dlm, + ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + if (tmpstat < 0) + mlog_errno(tmpstat); + } + kfree(ctxt); + } + + mlog(0, "returning %d\n", status); + return status; +} + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) +{ + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up); + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down); + o2net_unregister_handler_list(&dlm->dlm_domain_handlers); +} + +static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) +{ + int status; + + mlog(0, "registering handlers.\n"); + + o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, + dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); + if (status) + goto bail; + + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, + sizeof(struct dlm_master_request), + dlm_master_request_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, + sizeof(struct dlm_assert_master), + dlm_assert_master_handler, + dlm, dlm_assert_master_post_handler, + &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, + sizeof(struct dlm_create_lock), + dlm_create_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, + DLM_CONVERT_LOCK_MAX_LEN, + dlm_convert_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, + DLM_UNLOCK_LOCK_MAX_LEN, + dlm_unlock_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, + DLM_PROXY_AST_MAX_LEN, + dlm_proxy_ast_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_exit_domain_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key, + sizeof(struct dlm_deref_lockres), + dlm_deref_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, + sizeof(struct dlm_migrate_request), + dlm_migrate_request_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, + DLM_MIG_LOCKRES_MAX_LEN, + dlm_mig_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, + sizeof(struct dlm_master_requery), + dlm_master_requery_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, + sizeof(struct dlm_lock_request), + dlm_request_all_locks_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, + sizeof(struct dlm_reco_data_done), + dlm_reco_data_done_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, + sizeof(struct dlm_begin_reco), + dlm_begin_reco_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, + sizeof(struct dlm_finalize_reco), + dlm_finalize_reco_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_begin_exit_domain_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + +bail: + if (status) + dlm_unregister_domain_handlers(dlm); + + return status; +} + +static int dlm_join_domain(struct dlm_ctxt *dlm) +{ + int status; + unsigned int backoff; + unsigned int total_backoff = 0; + + BUG_ON(!dlm); + + mlog(0, "Join domain %s\n", dlm->name); + + status = dlm_register_domain_handlers(dlm); + if (status) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_recovery_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = dlm_debug_init(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + do { + status = dlm_try_to_join_domain(dlm); + + /* If we're racing another node to the join, then we + * need to back off temporarily and let them + * complete. */ +#define DLM_JOIN_TIMEOUT_MSECS 90000 + if (status == -EAGAIN) { + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) { + status = -ERESTARTSYS; + mlog(ML_NOTICE, "Timed out joining dlm domain " + "%s after %u msecs\n", dlm->name, + total_backoff); + goto bail; + } + + /* + * After you! + * No, after you! + * I insist! + * But you first! + * ... + */ + backoff = (unsigned int)(jiffies & 0x3); + backoff *= DLM_DOMAIN_BACKOFF_MS; + total_backoff += backoff; + mlog(0, "backoff %d\n", backoff); + msleep(backoff); + } + } while (status == -EAGAIN); + + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + wake_up(&dlm_domain_events); + + if (status) { + dlm_unregister_domain_handlers(dlm); + dlm_debug_shutdown(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); + } + + return status; +} + +static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, + u32 key) +{ + int i; + int ret; + struct dlm_ctxt *dlm = NULL; + + dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); + if (!dlm) { + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + dlm->name = kstrdup(domain, GFP_KERNEL); + if (dlm->name == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); + if (!dlm->lockres_hash) { + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); + + dlm->master_hash = (struct hlist_head **) + dlm_alloc_pagevec(DLM_HASH_PAGES); + if (!dlm->master_hash) { + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); + + dlm->key = key; + dlm->node_num = o2nm_this_node(); + + ret = dlm_create_debugfs_subroot(dlm); + if (ret < 0) + goto leave; + + spin_lock_init(&dlm->spinlock); + spin_lock_init(&dlm->master_lock); + spin_lock_init(&dlm->ast_lock); + spin_lock_init(&dlm->track_lock); + INIT_LIST_HEAD(&dlm->list); + INIT_LIST_HEAD(&dlm->dirty_list); + INIT_LIST_HEAD(&dlm->reco.resources); + INIT_LIST_HEAD(&dlm->reco.node_data); + INIT_LIST_HEAD(&dlm->purge_list); + INIT_LIST_HEAD(&dlm->dlm_domain_handlers); + INIT_LIST_HEAD(&dlm->tracking_list); + dlm->reco.state = 0; + + INIT_LIST_HEAD(&dlm->pending_asts); + INIT_LIST_HEAD(&dlm->pending_basts); + + mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", + dlm->recovery_map, &(dlm->recovery_map[0])); + + memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); + memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); + memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + + dlm->dlm_thread_task = NULL; + dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; + init_waitqueue_head(&dlm->dlm_thread_wq); + init_waitqueue_head(&dlm->dlm_reco_thread_wq); + init_waitqueue_head(&dlm->reco.event); + init_waitqueue_head(&dlm->ast_wq); + init_waitqueue_head(&dlm->migration_wq); + INIT_LIST_HEAD(&dlm->mle_hb_events); + + dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; + init_waitqueue_head(&dlm->dlm_join_events); + + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + + atomic_set(&dlm->res_tot_count, 0); + atomic_set(&dlm->res_cur_count, 0); + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) { + atomic_set(&dlm->mle_tot_count[i], 0); + atomic_set(&dlm->mle_cur_count[i], 0); + } + + spin_lock_init(&dlm->work_lock); + INIT_LIST_HEAD(&dlm->work_list); + INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work); + + kref_init(&dlm->dlm_refs); + dlm->dlm_state = DLM_CTXT_NEW; + + INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); + + mlog(0, "context init: refcount %u\n", + atomic_read(&dlm->dlm_refs.refcount)); + +leave: + if (ret < 0 && dlm) { + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, + DLM_HASH_PAGES); + + if (dlm->lockres_hash) + dlm_free_pagevec((void **)dlm->lockres_hash, + DLM_HASH_PAGES); + + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + } + return dlm; +} + +/* + * Compare a requested locking protocol version against the current one. + * + * If the major numbers are different, they are incompatible. + * If the current minor is greater than the request, they are incompatible. + * If the current minor is less than or equal to the request, they are + * compatible, and the requester should run at the current minor version. + */ +static int dlm_protocol_compare(struct dlm_protocol_version *existing, + struct dlm_protocol_version *request) +{ + if (existing->pv_major != request->pv_major) + return 1; + + if (existing->pv_minor > request->pv_minor) + return 1; + + if (existing->pv_minor < request->pv_minor) + request->pv_minor = existing->pv_minor; + + return 0; +} + +/* + * dlm_register_domain: one-time setup per "domain". + * + * The filesystem passes in the requested locking version via proto. + * If registration was successful, proto will contain the negotiated + * locking protocol. + */ +struct dlm_ctxt * dlm_register_domain(const char *domain, + u32 key, + struct dlm_protocol_version *fs_proto) +{ + int ret; + struct dlm_ctxt *dlm = NULL; + struct dlm_ctxt *new_ctxt = NULL; + + if (strlen(domain) >= O2NM_MAX_NAME_LEN) { + ret = -ENAMETOOLONG; + mlog(ML_ERROR, "domain name length too long\n"); + goto leave; + } + + mlog(0, "register called for domain \"%s\"\n", domain); + +retry: + dlm = NULL; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + mlog_errno(ret); + goto leave; + } + + spin_lock(&dlm_domain_lock); + + dlm = __dlm_lookup_domain(domain); + if (dlm) { + if (dlm->dlm_state != DLM_CTXT_JOINED) { + spin_unlock(&dlm_domain_lock); + + mlog(0, "This ctxt is not joined yet!\n"); + wait_event_interruptible(dlm_domain_events, + dlm_wait_on_domain_helper( + domain)); + goto retry; + } + + if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { + spin_unlock(&dlm_domain_lock); + mlog(ML_ERROR, + "Requested locking protocol version is not " + "compatible with already registered domain " + "\"%s\"\n", domain); + ret = -EPROTO; + goto leave; + } + + __dlm_get(dlm); + dlm->num_joins++; + + spin_unlock(&dlm_domain_lock); + + ret = 0; + goto leave; + } + + /* doesn't exist */ + if (!new_ctxt) { + spin_unlock(&dlm_domain_lock); + + new_ctxt = dlm_alloc_ctxt(domain, key); + if (new_ctxt) + goto retry; + + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + /* a little variable switch-a-roo here... */ + dlm = new_ctxt; + new_ctxt = NULL; + + /* add the new domain */ + list_add_tail(&dlm->list, &dlm_domains); + spin_unlock(&dlm_domain_lock); + + /* + * Pass the locking protocol version into the join. If the join + * succeeds, it will have the negotiated protocol set. + */ + dlm->dlm_locking_proto = dlm_protocol; + dlm->fs_locking_proto = *fs_proto; + + ret = dlm_join_domain(dlm); + if (ret) { + mlog_errno(ret); + dlm_put(dlm); + goto leave; + } + + /* Tell the caller what locking protocol we negotiated */ + *fs_proto = dlm->fs_locking_proto; + + ret = 0; +leave: + if (new_ctxt) + dlm_free_ctxt_mem(new_ctxt); + + if (ret < 0) + dlm = ERR_PTR(ret); + + return dlm; +} +EXPORT_SYMBOL_GPL(dlm_register_domain); + +static LIST_HEAD(dlm_join_handlers); + +static void dlm_unregister_net_handlers(void) +{ + o2net_unregister_handler_list(&dlm_join_handlers); +} + +static int dlm_register_net_handlers(void) +{ + int status = 0; + + status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_query_join_request), + dlm_query_join_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + sizeof(struct dlm_assert_joined), + dlm_assert_joined_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_cancel_join), + dlm_cancel_join_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY, + sizeof(struct dlm_query_region), + dlm_query_region_handler, + NULL, NULL, &dlm_join_handlers); + + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + sizeof(struct dlm_query_nodeinfo), + dlm_query_nodeinfo_handler, + NULL, NULL, &dlm_join_handlers); +bail: + if (status < 0) + dlm_unregister_net_handlers(); + + return status; +} + +/* Domain eviction callback handling. + * + * The file system requires notification of node death *before* the + * dlm completes it's recovery work, otherwise it may be able to + * acquire locks on resources requiring recovery. Since the dlm can + * evict a node from it's domain *before* heartbeat fires, a similar + * mechanism is required. */ + +/* Eviction is not expected to happen often, so a per-domain lock is + * not necessary. Eviction callbacks are allowed to sleep for short + * periods of time. */ +static DECLARE_RWSEM(dlm_callback_sem); + +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num) +{ + struct dlm_eviction_cb *cb; + + down_read(&dlm_callback_sem); + list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) { + cb->ec_func(node_num, cb->ec_data); + } + up_read(&dlm_callback_sem); +} + +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data) +{ + INIT_LIST_HEAD(&cb->ec_item); + cb->ec_func = f; + cb->ec_data = data; +} +EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb); + +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_register_eviction_cb); + +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_del_init(&cb->ec_item); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb); + +static int __init dlm_init(void) +{ + int status; + + status = dlm_init_mle_cache(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); + goto error; + } + + status = dlm_init_master_caches(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_lockres and " + "o2dlm_lockname slabcaches\n"); + goto error; + } + + status = dlm_init_lock_cache(); + if (status) { + mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n"); + goto error; + } + + status = dlm_register_net_handlers(); + if (status) { + mlog(ML_ERROR, "Unable to register network handlers\n"); + goto error; + } + + status = dlm_create_debugfs_root(); + if (status) + goto error; + + return 0; +error: + dlm_unregister_net_handlers(); + dlm_destroy_lock_cache(); + dlm_destroy_master_caches(); + dlm_destroy_mle_cache(); + return -1; +} + +static void __exit dlm_exit (void) +{ + dlm_destroy_debugfs_root(); + dlm_unregister_net_handlers(); + dlm_destroy_lock_cache(); + dlm_destroy_master_caches(); + dlm_destroy_mle_cache(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); + +module_init(dlm_init); +module_exit(dlm_exit); diff --git a/kernel/fs/ocfs2/dlm/dlmdomain.h b/kernel/fs/ocfs2/dlm/dlmdomain.h new file mode 100644 index 000000000..fd6122a38 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmdomain.h @@ -0,0 +1,35 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMDOMAIN_H +#define DLMDOMAIN_H + +extern spinlock_t dlm_domain_lock; +extern struct list_head dlm_domains; + +int dlm_shutting_down(struct dlm_ctxt *dlm); +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num); + +#endif diff --git a/kernel/fs/ocfs2/dlm/dlmlock.c b/kernel/fs/ocfs2/dlm/dlmlock.c new file mode 100644 index 000000000..66c2a491f --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmlock.c @@ -0,0 +1,761 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmlock.c + * + * underlying calls for lock creation + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static struct kmem_cache *dlm_lock_cache; + +static DEFINE_SPINLOCK(dlm_cookie_lock); +static u64 dlm_next_cookie = 1; + +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags); +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie); +static void dlm_lock_release(struct kref *kref); +static void dlm_lock_detach_lockres(struct dlm_lock *lock); + +int dlm_init_lock_cache(void) +{ + dlm_lock_cache = kmem_cache_create("o2dlm_lock", + sizeof(struct dlm_lock), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (dlm_lock_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_lock_cache(void) +{ + if (dlm_lock_cache) + kmem_cache_destroy(dlm_lock_cache); +} + +/* Tell us whether we can grant a new lock request. + * locking: + * caller needs: res->spinlock + * taken: none + * held on exit: none + * returns: 1 if the lock can be granted, 0 otherwise. + */ +static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct dlm_lock *tmplock; + + list_for_each_entry(tmplock, &res->granted, list) { + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + } + + list_for_each_entry(tmplock, &res->converting, list) { + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + if (!dlm_lock_compatible(tmplock->ml.convert_type, + lock->ml.type)) + return 0; + } + + return 1; +} + +/* performs lock creation at the lockres master site + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_NOTQUEUED + */ +static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status = DLM_NORMAL; + + mlog(0, "type=%d\n", lock->ml.type); + + spin_lock(&res->spinlock); + /* if called from dlm_create_lock_handler, need to + * ensure it will not sleep in dlm_wait_on_lockres */ + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL && + lock->ml.node != dlm->node_num) { + /* erf. state changed after lock was dropped. */ + spin_unlock(&res->spinlock); + dlm_error(status); + return status; + } + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + + if (dlm_can_grant_new_lock(res, lock)) { + mlog(0, "I can grant this lock right away\n"); + /* got it right away */ + lock->lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + + /* for the recovery lock, we can't allow the ast + * to be queued since the dlmthread is already + * frozen. but the recovery lock is always locked + * with LKM_NOQUEUE so we do not need the ast in + * this special case */ + if (!dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + kick_thread = 1; + call_ast = 1; + } else { + mlog(0, "%s: returning DLM_NORMAL to " + "node %u for reco lock\n", dlm->name, + lock->ml.node); + } + } else { + /* for NOQUEUE request, unless we get the + * lock right away, return DLM_NOTQUEUED */ + if (flags & LKM_NOQUEUE) { + status = DLM_NOTQUEUED; + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + mlog(0, "%s: returning NOTQUEUED to " + "node %u for reco lock\n", dlm->name, + lock->ml.node); + } + } else { + status = DLM_NORMAL; + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + kick_thread = 1; + } + } + + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + dlm_lockres_calc_usage(dlm, res); + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* remove from local queue if it failed */ + list_del_init(&lock->list); + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; +} + + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_DENIED, DLM_RECOVERING, or net status + */ +static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; + + mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n", + lock->ml.type, res->lockname.len, + res->lockname.name, flags); + + /* + * Wait if resource is getting recovered, remastered, etc. + * If the resource was remastered and new owner is self, then exit. + */ + spin_lock(&res->spinlock); + __dlm_wait_on_lockres(res); + if (res->owner == dlm->node_num) { + spin_unlock(&res->spinlock); + return DLM_RECOVERING; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + /* add lock to local (secondary) queue */ + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + lock->lock_pending = 1; + spin_unlock(&res->spinlock); + + /* spec seems to say that you will get DLM_NORMAL when the lock + * has been queued, meaning we need to wait for a reply here. */ + status = dlm_send_remote_lock_request(dlm, res, lock, flags); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->lock_pending = 0; + if (status != DLM_NORMAL) { + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; + dlm_error(status); + } + dlm_revert_pending_lock(res, lock); + dlm_lock_put(lock); + } else if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* special case for the $RECOVERY lock. + * there will never be an AST delivered to put + * this lock on the proper secondary queue + * (granted), so do it manually. */ + mlog(0, "%s: $RECOVERY lock for this node (%u) is " + "mastered by %u; got lock, manually granting (no ast)\n", + dlm->name, dlm->node_num, res->owner); + list_move_tail(&lock->list, &res->granted); + } + spin_unlock(&res->spinlock); + + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); + + wake_up(&res->wq); + return status; +} + + +/* for remote lock creation. + * locking: + * caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, or net status + */ +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + struct dlm_create_lock create; + int tmpret, status = 0; + enum dlm_status ret; + + memset(&create, 0, sizeof(create)); + create.node_idx = dlm->node_num; + create.requested_type = lock->ml.type; + create.cookie = lock->ml.cookie; + create.namelen = res->lockname.len; + create.flags = cpu_to_be32(flags); + memcpy(create.name, res->lockname.name, create.namelen); + + tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, + sizeof(create), res->owner, &status); + if (tmpret >= 0) { + ret = status; + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer " + "owned by node %u. That node is coming back up " + "currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } + } else { + mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to " + "node %u\n", dlm->name, create.namelen, create.name, + tmpret, res->owner); + if (dlm_is_host_down(tmpret)) + ret = DLM_RECOVERING; + else + ret = dlm_err_to_dlm_status(tmpret); + } + + return ret; +} + +void dlm_lock_get(struct dlm_lock *lock) +{ + kref_get(&lock->lock_refs); +} + +void dlm_lock_put(struct dlm_lock *lock) +{ + kref_put(&lock->lock_refs, dlm_lock_release); +} + +static void dlm_lock_release(struct kref *kref) +{ + struct dlm_lock *lock; + + lock = container_of(kref, struct dlm_lock, lock_refs); + + BUG_ON(!list_empty(&lock->list)); + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + + dlm_lock_detach_lockres(lock); + + if (lock->lksb_kernel_allocated) { + mlog(0, "freeing kernel-allocated lksb\n"); + kfree(lock->lksb); + } + kmem_cache_free(dlm_lock_cache, lock); +} + +/* associate a lock with it's lockres, getting a ref on the lockres */ +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res) +{ + dlm_lockres_get(res); + lock->lockres = res; +} + +/* drop ref on lockres, if there is still one associated with lock */ +static void dlm_lock_detach_lockres(struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + res = lock->lockres; + if (res) { + lock->lockres = NULL; + mlog(0, "removing lock's lockres reference\n"); + dlm_lockres_put(res); + } +} + +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie) +{ + INIT_LIST_HEAD(&newlock->list); + INIT_LIST_HEAD(&newlock->ast_list); + INIT_LIST_HEAD(&newlock->bast_list); + spin_lock_init(&newlock->spinlock); + newlock->ml.type = type; + newlock->ml.convert_type = LKM_IVMODE; + newlock->ml.highest_blocked = LKM_IVMODE; + newlock->ml.node = node; + newlock->ml.pad1 = 0; + newlock->ml.list = 0; + newlock->ml.flags = 0; + newlock->ast = NULL; + newlock->bast = NULL; + newlock->astdata = NULL; + newlock->ml.cookie = cpu_to_be64(cookie); + newlock->ast_pending = 0; + newlock->bast_pending = 0; + newlock->convert_pending = 0; + newlock->lock_pending = 0; + newlock->unlock_pending = 0; + newlock->cancel_pending = 0; + newlock->lksb_kernel_allocated = 0; + + kref_init(&newlock->lock_refs); +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb) +{ + struct dlm_lock *lock; + int kernel_allocated = 0; + + lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); + if (!lock) + return NULL; + + if (!lksb) { + /* zero memory only if kernel-allocated */ + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); + if (!lksb) { + kmem_cache_free(dlm_lock_cache, lock); + return NULL; + } + kernel_allocated = 1; + } + + dlm_init_lock(lock, type, node, cookie); + if (kernel_allocated) + lock->lksb_kernel_allocated = 1; + lock->lksb = lksb; + lksb->lockid = lock; + return lock; +} + +/* handler for lock creation net message + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED + */ +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + enum dlm_status status = DLM_NORMAL; + char *name; + unsigned int namelen; + + BUG_ON(!dlm); + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + name = create->name; + namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } + + status = DLM_IVBUFLEN; + if (namelen > DLM_LOCKID_NAME_MAX) { + dlm_error(status); + goto leave; + } + + status = DLM_SYSERR; + newlock = dlm_new_lock(create->requested_type, + create->node_idx, + be64_to_cpu(create->cookie), NULL); + if (!newlock) { + dlm_error(status); + goto leave; + } + + lksb = newlock->lksb; + + if (be32_to_cpu(create->flags) & LKM_GET_LVB) { + lksb->flags |= DLM_LKSB_GET_LVB; + mlog(0, "set DLM_LKSB_GET_LVB flag\n"); + } + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, name, namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + spin_unlock(&res->spinlock); + + if (status != DLM_NORMAL) { + mlog(0, "lockres recovering/migrating/in-progress\n"); + goto leave; + } + + dlm_lock_attach_lockres(newlock, res); + + status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags)); +leave: + if (status != DLM_NORMAL) + if (newlock) + dlm_lock_put(newlock); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */ +static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie) +{ + u64 tmpnode = node_num; + + /* shift single byte of node num into top 8 bits */ + tmpnode <<= 56; + + spin_lock(&dlm_cookie_lock); + *cookie = (dlm_next_cookie | tmpnode); + if (++dlm_next_cookie & 0xff00000000000000ull) { + mlog(0, "This node's cookie will now wrap!\n"); + dlm_next_cookie = 1; + } + spin_unlock(&dlm_cookie_lock); +} + +enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode, + struct dlm_lockstatus *lksb, int flags, + const char *name, int namelen, dlm_astlockfunc_t *ast, + void *data, dlm_bastlockfunc_t *bast) +{ + enum dlm_status status; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + int convert = 0, recovery = 0; + + /* yes this function is a mess. + * TODO: clean this up. lots of common code in the + * lock and convert paths, especially in the retry blocks */ + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + status = DLM_BADPARAM; + if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) { + dlm_error(status); + goto error; + } + + if (flags & ~LKM_VALID_FLAGS) { + dlm_error(status); + goto error; + } + + convert = (flags & LKM_CONVERT); + recovery = (flags & LKM_RECOVERY); + + if (recovery && + (!dlm_is_recovery_lock(name, namelen) || convert) ) { + dlm_error(status); + goto error; + } + if (convert && (flags & LKM_LOCAL)) { + mlog(ML_ERROR, "strange LOCAL convert request!\n"); + goto error; + } + + if (convert) { + /* CONVERT request */ + + /* if converting, must pass in a valid dlm_lock */ + lock = lksb->lockid; + if (!lock) { + mlog(ML_ERROR, "NULL lock pointer in convert " + "request\n"); + goto error; + } + + res = lock->lockres; + if (!res) { + mlog(ML_ERROR, "NULL lockres pointer in convert " + "request\n"); + goto error; + } + dlm_lockres_get(res); + + /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are + * static after the original lock call. convert requests will + * ensure that everything is the same, or return DLM_BADARGS. + * this means that DLM_DENIED_NOASTS will never be returned. + */ + if (lock->lksb != lksb || lock->ast != ast || + lock->bast != bast || lock->astdata != data) { + status = DLM_BADARGS; + mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lksb, ast, bast, data); + mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lock->lksb, lock->ast, + lock->bast, lock->astdata); + goto error; + } +retry_convert: + dlm_wait_for_recovery(dlm); + + if (res->owner == dlm->node_num) + status = dlmconvert_master(dlm, res, lock, flags, mode); + else + status = dlmconvert_remote(dlm, res, lock, flags, mode); + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* for now, see how this works without sleeping + * and just retry right away. I suspect the reco + * or migration will complete fast enough that + * no waiting will be necessary */ + mlog(0, "retrying convert with migration/recovery/" + "in-progress\n"); + msleep(100); + goto retry_convert; + } + } else { + u64 tmpcookie; + + /* LOCK request */ + status = DLM_BADARGS; + if (!name) { + dlm_error(status); + goto error; + } + + status = DLM_IVBUFLEN; + if (namelen > DLM_LOCKID_NAME_MAX || namelen < 1) { + dlm_error(status); + goto error; + } + + dlm_get_next_cookie(dlm->node_num, &tmpcookie); + lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb); + if (!lock) { + dlm_error(status); + goto error; + } + + if (!recovery) + dlm_wait_for_recovery(dlm); + + /* find or create the lock resource */ + res = dlm_get_lock_resource(dlm, name, namelen, flags); + if (!res) { + status = DLM_IVLOCKID; + dlm_error(status); + goto error; + } + + mlog(0, "type=%d, flags = 0x%x\n", mode, flags); + mlog(0, "creating lock: lock=%p res=%p\n", lock, res); + + dlm_lock_attach_lockres(lock, res); + lock->ast = ast; + lock->bast = bast; + lock->astdata = data; + +retry_lock: + if (flags & LKM_VALBLK) { + mlog(0, "LKM_VALBLK passed by caller\n"); + + /* LVB requests for non PR, PW or EX locks are + * ignored. */ + if (mode < LKM_PRMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + + if (res->owner == dlm->node_num) + status = dlmlock_master(dlm, res, lock, flags); + else + status = dlmlock_remote(dlm, res, lock, flags); + + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + msleep(100); + if (recovery) { + if (status != DLM_RECOVERING) + goto retry_lock; + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + } else { + dlm_wait_for_recovery(dlm); + goto retry_lock; + } + } + + /* Inflight taken in dlm_get_lock_resource() is dropped here */ + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + + dlm_lockres_calc_usage(dlm, res); + dlm_kick_thread(dlm, res); + + if (status != DLM_NORMAL) { + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; + if (status != DLM_NOTQUEUED) + dlm_error(status); + goto error; + } + } + +error: + if (status != DLM_NORMAL) { + if (lock && !convert) + dlm_lock_put(lock); + // this is kind of unnecessary + lksb->status = status; + } + + /* put lockres ref from the convert path + * or from dlm_get_lock_resource */ + if (res) + dlm_lockres_put(res); + + return status; +} +EXPORT_SYMBOL_GPL(dlmlock); diff --git a/kernel/fs/ocfs2/dlm/dlmmaster.c b/kernel/fs/ocfs2/dlm/dlmmaster.c new file mode 100644 index 000000000..fdf4b41d0 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmmaster.c @@ -0,0 +1,3487 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmmod.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) +#include "cluster/masklog.h" + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags); +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); + +static inline int dlm_mle_equal(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + const char *name, + unsigned int namelen) +{ + if (dlm != mle->dlm) + return 0; + + if (namelen != mle->mnamelen || + memcmp(name, mle->mname, namelen) != 0) + return 0; + + return 1; +} + +static struct kmem_cache *dlm_lockres_cache; +static struct kmem_cache *dlm_lockname_cache; +static struct kmem_cache *dlm_mle_cache; + +static void dlm_mle_release(struct kref *kref); +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen); +static void dlm_put_mle(struct dlm_master_list_entry *mle); +static void __dlm_put_mle(struct dlm_master_list_entry *mle); +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen); + +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to); + + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked); +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked); +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master); + +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target); +static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + + +int dlm_is_host_down(int errno) +{ + switch (errno) { + case -EBADF: + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ETIMEDOUT: + case -ECONNABORTED: + case -ENETDOWN: + case -ENETUNREACH: + case -ENETRESET: + case -ESHUTDOWN: + case -ENOPROTOOPT: + case -EINVAL: /* if returned from our tcp code, + this means there is no socket */ + return 1; + } + return 0; +} + + +/* + * MASTER LIST FUNCTIONS + */ + + +/* + * regarding master list entries and heartbeat callbacks: + * + * in order to avoid sleeping and allocation that occurs in + * heartbeat, master list entries are simply attached to the + * dlm's established heartbeat callbacks. the mle is attached + * when it is created, and since the dlm->spinlock is held at + * that time, any heartbeat event will be properly discovered + * by the mle. the mle needs to be detached from the + * dlm->mle_hb_events list as soon as heartbeat events are no + * longer useful to the mle, and before the mle is freed. + * + * as a general rule, heartbeat events are no longer needed by + * the mle once an "answer" regarding the lock master has been + * received. + */ +static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + + list_add_tail(&mle->hb_events, &dlm->mle_hb_events); +} + + +static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + if (!list_empty(&mle->hb_events)) + list_del_init(&mle->hb_events); +} + + +static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + spin_lock(&dlm->spinlock); + __dlm_mle_detach_hb_events(dlm, mle); + spin_unlock(&dlm->spinlock); +} + +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + +/* remove from list and free */ +static void __dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. */ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); +} + + +/* must not have any spinlocks coming in */ +static void dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} + +static inline void dlm_get_mle(struct dlm_master_list_entry *mle) +{ + kref_get(&mle->mle_refs); +} + +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen) +{ + assert_spin_locked(&dlm->spinlock); + + mle->dlm = dlm; + mle->type = type; + INIT_HLIST_NODE(&mle->master_hash_node); + INIT_LIST_HEAD(&mle->hb_events); + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + spin_lock_init(&mle->spinlock); + init_waitqueue_head(&mle->wq); + atomic_set(&mle->woken, 0); + kref_init(&mle->mle_refs); + memset(mle->response_map, 0, sizeof(mle->response_map)); + mle->master = O2NM_MAX_NODES; + mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; + + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + if (mle->type == DLM_MLE_MASTER) { + BUG_ON(!res); + mle->mleres = res; + memcpy(mle->mname, res->lockname.name, res->lockname.len); + mle->mnamelen = res->lockname.len; + mle->mnamehash = res->lockname.hash; + } else { + BUG_ON(!name); + mle->mleres = NULL; + memcpy(mle->mname, name, namelen); + mle->mnamelen = namelen; + mle->mnamehash = dlm_lockid_hash(name, namelen); + } + + atomic_inc(&dlm->mle_tot_count[mle->type]); + atomic_inc(&dlm->mle_cur_count[mle->type]); + + /* copy off the node_map and register hb callbacks on our copy */ + memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); + memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + clear_bit(dlm->node_num, mle->vote_map); + clear_bit(dlm->node_num, mle->node_map); + + /* attach the mle to the domain node up/down events */ + __dlm_mle_attach_hb_events(dlm, mle); +} + +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + if (!hlist_unhashed(&mle->master_hash_node)) + hlist_del_init(&mle->master_hash_node); +} + +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + struct hlist_head *bucket; + + assert_spin_locked(&dlm->master_lock); + + bucket = dlm_master_hash(dlm, mle->mnamehash); + hlist_add_head(&mle->master_hash_node, bucket); +} + +/* returns 1 if found, 0 if not */ +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen) +{ + struct dlm_master_list_entry *tmpmle; + struct hlist_head *bucket; + unsigned int hash; + + assert_spin_locked(&dlm->master_lock); + + hash = dlm_lockid_hash(name, namelen); + bucket = dlm_master_hash(dlm, hash); + hlist_for_each_entry(tmpmle, bucket, master_hash_node) { + if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) + continue; + dlm_get_mle(tmpmle); + *mle = tmpmle; + return 1; + } + return 0; +} + +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) +{ + struct dlm_master_list_entry *mle; + + assert_spin_locked(&dlm->spinlock); + + list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { + if (node_up) + dlm_mle_node_up(dlm, mle, NULL, idx); + else + dlm_mle_node_down(dlm, mle, NULL, idx); + } +} + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (!test_bit(idx, mle->node_map)) + mlog(0, "node %u already removed from nodemap!\n", idx); + else + clear_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (test_bit(idx, mle->node_map)) + mlog(0, "node %u already in node map!\n", idx); + else + set_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + + +int dlm_init_mle_cache(void) +{ + dlm_mle_cache = kmem_cache_create("o2dlm_mle", + sizeof(struct dlm_master_list_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (dlm_mle_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_mle_cache(void) +{ + if (dlm_mle_cache) + kmem_cache_destroy(dlm_mle_cache); +} + +static void dlm_mle_release(struct kref *kref) +{ + struct dlm_master_list_entry *mle; + struct dlm_ctxt *dlm; + + mle = container_of(kref, struct dlm_master_list_entry, mle_refs); + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, + mle->type); + + /* remove from list if not already */ + __dlm_unlink_mle(dlm, mle); + + /* detach the mle from the domain node up/down events */ + __dlm_mle_detach_hb_events(dlm, mle); + + atomic_dec(&dlm->mle_cur_count[mle->type]); + + /* NOTE: kfree under spinlock here. + * if this is bad, we can move this to a freelist. */ + kmem_cache_free(dlm_mle_cache, mle); +} + + +/* + * LOCK RESOURCE FUNCTIONS + */ + +int dlm_init_master_caches(void) +{ + dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", + sizeof(struct dlm_lock_resource), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!dlm_lockres_cache) + goto bail; + + dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", + DLM_LOCKID_NAME_MAX, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!dlm_lockname_cache) + goto bail; + + return 0; +bail: + dlm_destroy_master_caches(); + return -ENOMEM; +} + +void dlm_destroy_master_caches(void) +{ + if (dlm_lockname_cache) { + kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; + } + + if (dlm_lockres_cache) { + kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; + } +} + +static void dlm_lockres_release(struct kref *kref) +{ + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; + + res = container_of(kref, struct dlm_lock_resource, refs); + dlm = res->dlm; + + /* This should not happen -- all lockres' have a name + * associated with them at init time. */ + BUG_ON(!res->lockname.name); + + mlog(0, "destroying lockres %.*s\n", res->lockname.len, + res->lockname.name); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + atomic_dec(&dlm->res_cur_count); + + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! [%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + + /* By the time we're ready to blow this guy away, we shouldn't + * be on any lists. */ + BUG_ON(!hlist_unhashed(&res->hash_node)); + BUG_ON(!list_empty(&res->granted)); + BUG_ON(!list_empty(&res->converting)); + BUG_ON(!list_empty(&res->blocked)); + BUG_ON(!list_empty(&res->dirty)); + BUG_ON(!list_empty(&res->recovering)); + BUG_ON(!list_empty(&res->purge)); + + kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); + + kmem_cache_free(dlm_lockres_cache, res); +} + +void dlm_lockres_put(struct dlm_lock_resource *res) +{ + kref_put(&res->refs, dlm_lockres_release); +} + +static void dlm_init_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, unsigned int namelen) +{ + char *qname; + + /* If we memset here, we lose our reference to the kmalloc'd + * res->lockname.name, so be sure to init every field + * correctly! */ + + qname = (char *) res->lockname.name; + memcpy(qname, name, namelen); + + res->lockname.len = namelen; + res->lockname.hash = dlm_lockid_hash(name, namelen); + + init_waitqueue_head(&res->wq); + spin_lock_init(&res->spinlock); + INIT_HLIST_NODE(&res->hash_node); + INIT_LIST_HEAD(&res->granted); + INIT_LIST_HEAD(&res->converting); + INIT_LIST_HEAD(&res->blocked); + INIT_LIST_HEAD(&res->dirty); + INIT_LIST_HEAD(&res->recovering); + INIT_LIST_HEAD(&res->purge); + INIT_LIST_HEAD(&res->tracking); + atomic_set(&res->asts_reserved, 0); + res->migration_pending = 0; + res->inflight_locks = 0; + res->inflight_assert_workers = 0; + + res->dlm = dlm; + + kref_init(&res->refs); + + atomic_inc(&dlm->res_tot_count); + atomic_inc(&dlm->res_cur_count); + + /* just for consistency */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + spin_unlock(&res->spinlock); + + res->state = DLM_LOCK_RES_IN_PROGRESS; + + res->last_used = 0; + + spin_lock(&dlm->spinlock); + list_add_tail(&res->tracking, &dlm->tracking_list); + spin_unlock(&dlm->spinlock); + + memset(res->lvb, 0, DLM_LVB_LEN); + memset(res->refmap, 0, sizeof(res->refmap)); +} + +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen) +{ + struct dlm_lock_resource *res = NULL; + + res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); + if (!res) + goto error; + + res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); + if (!res->lockname.name) + goto error; + + dlm_init_lockres(dlm, res, name, namelen); + return res; + +error: + if (res) + kmem_cache_free(dlm_lockres_cache, res); + return NULL; +} + +void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit) +{ + assert_spin_locked(&res->spinlock); + + mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, + res->lockname.name, bit, __builtin_return_address(0)); + + set_bit(bit, res->refmap); +} + +void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit) +{ + assert_spin_locked(&res->spinlock); + + mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, + res->lockname.name, bit, __builtin_return_address(0)); + + clear_bit(bit, res->refmap); +} + +static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + res->inflight_locks++; + + mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, + res->lockname.len, res->lockname.name, res->inflight_locks, + __builtin_return_address(0)); +} + +void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + __dlm_lockres_grab_inflight_ref(dlm, res); +} + +void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + + BUG_ON(res->inflight_locks == 0); + + res->inflight_locks--; + + mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, + res->lockname.len, res->lockname.name, res->inflight_locks, + __builtin_return_address(0)); + + wake_up(&res->wq); +} + +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + res->inflight_assert_workers++; + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + BUG_ON(res->inflight_assert_workers == 0); + res->inflight_assert_workers--; + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_drop_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + +/* + * lookup a lock resource by name. + * may already exist in the hashtable. + * lockid is null terminated + * + * if not, allocate enough for the lockres and for + * the temporary structure used in doing the mastering. + * + * also, do a lookup in the dlm->master_list to see + * if another node has begun mastering the same lock. + * if so, there should be a block entry in there + * for this name, and we should *not* attempt to master + * the lock here. need to wait around for that node + * to assert_master (or die). + * + */ +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int namelen, + int flags) +{ + struct dlm_lock_resource *tmpres=NULL, *res=NULL; + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *alloc_mle = NULL; + int blocked = 0; + int ret, nodenum; + struct dlm_node_iter iter; + unsigned int hash; + int tries = 0; + int bit, wait_on_recovery = 0; + + BUG_ON(!lockid); + + hash = dlm_lockid_hash(lockid, namelen); + + mlog(0, "get lockres %s (len %d)\n", lockid, namelen); + +lookup: + spin_lock(&dlm->spinlock); + tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); + if (tmpres) { + spin_unlock(&dlm->spinlock); + spin_lock(&tmpres->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&tmpres->hash_node)) { + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + + /* Wait on the thread that is mastering the resource */ + if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + __dlm_wait_on_lockres(tmpres); + BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + + /* Wait on the resource purge to complete before continuing */ + if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { + BUG_ON(tmpres->owner == dlm->node_num); + __dlm_wait_on_lockres_flags(tmpres, + DLM_LOCK_RES_DROPPING_REF); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + + /* Grab inflight ref to pin the resource */ + dlm_lockres_grab_inflight_ref(dlm, tmpres); + + spin_unlock(&tmpres->spinlock); + if (res) + dlm_lockres_put(res); + res = tmpres; + goto leave; + } + + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(0, "allocating a new resource\n"); + /* nothing found and we need to allocate one. */ + alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!alloc_mle) + goto leave; + res = dlm_new_lockres(dlm, lockid, namelen); + if (!res) + goto leave; + goto lookup; + } + + mlog(0, "no lockres found, allocated our own: %p\n", res); + + if (flags & LKM_LOCAL) { + /* caller knows it's safe to assume it's not mastered elsewhere + * DONE! return right away */ + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, dlm->node_num); + __dlm_insert_lockres(dlm, res); + dlm_lockres_grab_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + /* lockres still marked IN_PROGRESS */ + goto wake_waiters; + } + + /* check master list to see if another node has started mastering it */ + spin_lock(&dlm->master_lock); + + /* if we found a block, wait for lock to be mastered by another node */ + blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); + if (blocked) { + int mig; + if (mle->type == DLM_MLE_MASTER) { + mlog(ML_ERROR, "master entry for nonexistent lock!\n"); + BUG(); + } + mig = (mle->type == DLM_MLE_MIGRATION); + /* if there is a migration in progress, let the migration + * finish before continuing. we can wait for the absence + * of the MIGRATION mle: either the migrate finished or + * one of the nodes died and the mle was cleaned up. + * if there is a BLOCK here, but it already has a master + * set, we are too late. the master does not have a ref + * for us in the refmap. detach the mle and drop it. + * either way, go back to the top and start over. */ + if (mig || mle->master != O2NM_MAX_NODES) { + BUG_ON(mig && mle->master == dlm->node_num); + /* we arrived too late. the master does not + * have a ref for us. retry. */ + mlog(0, "%s:%.*s: late on %s\n", + dlm->name, namelen, lockid, + mig ? "MIGRATION" : "BLOCK"); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* master is known, detach */ + if (!mig) + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + mle = NULL; + /* this is lame, but we can't wait on either + * the mle or lockres waitqueue here */ + if (mig) + msleep(100); + goto lookup; + } + } else { + /* go ahead and try to master lock on this node */ + mle = alloc_mle; + /* make sure this does not get freed below */ + alloc_mle = NULL; + dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); + set_bit(dlm->node_num, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + + /* still holding the dlm spinlock, check the recovery map + * to see if there are any nodes that still need to be + * considered. these will not appear in the mle nodemap + * but they might own this lockres. wait on them. */ + bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) { + mlog(0, "%s: res %.*s, At least one node (%d) " + "to recover before lock mastery can begin\n", + dlm->name, namelen, (char *)lockid, bit); + wait_on_recovery = 1; + } + } + + /* at this point there is either a DLM_MLE_BLOCK or a + * DLM_MLE_MASTER on the master list, so it's safe to add the + * lockres to the hashtable. anyone who finds the lock will + * still have to wait on the IN_PROGRESS. */ + + /* finally add the lockres to its hash bucket */ + __dlm_insert_lockres(dlm, res); + + /* since this lockres is new it doesn't not require the spinlock */ + __dlm_lockres_grab_inflight_ref(dlm, res); + + /* get an extra ref on the mle in case this is a BLOCK + * if so, the creator of the BLOCK may try to put the last + * ref at this time in the assert master handler, so we + * need an extra one to keep from a bad ptr deref. */ + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +redo_request: + while (wait_on_recovery) { + /* any cluster changes that occurred after dropping the + * dlm spinlock would be detectable be a change on the mle, + * so we only need to clear out the recovery map once. */ + if (dlm_is_recovery_lock(lockid, namelen)) { + mlog(0, "%s: Recovery map is not empty, but must " + "master $RECOVERY lock now\n", dlm->name); + if (!dlm_pre_master_reco_lockres(dlm, res)) + wait_on_recovery = 0; + else { + mlog(0, "%s: waiting 500ms for heartbeat state " + "change\n", dlm->name); + msleep(500); + } + continue; + } + + dlm_kick_recovery_thread(dlm); + msleep(1000); + dlm_wait_for_recovery(dlm); + + spin_lock(&dlm->spinlock); + bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) { + mlog(0, "%s: res %.*s, At least one node (%d) " + "to recover before lock mastery can begin\n", + dlm->name, namelen, (char *)lockid, bit); + wait_on_recovery = 1; + } else + wait_on_recovery = 0; + spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); + } + + /* must wait for lock to be mastered elsewhere */ + if (blocked) + goto wait; + + ret = -EINVAL; + dlm_node_iter_init(mle->vote_map, &iter); + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = dlm_do_master_request(res, mle, nodenum); + if (ret < 0) + mlog_errno(ret); + if (mle->master != O2NM_MAX_NODES) { + /* found a master ! */ + if (mle->master <= nodenum) + break; + /* if our master request has not reached the master + * yet, keep going until it does. this is how the + * master will know that asserts are needed back to + * the lower nodes. */ + mlog(0, "%s: res %.*s, Requests only up to %u but " + "master is %u, keep going\n", dlm->name, namelen, + lockid, nodenum, mle->master); + } + } + +wait: + /* keep going until the response map includes all nodes */ + ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); + if (ret < 0) { + wait_on_recovery = 1; + mlog(0, "%s: res %.*s, Node map changed, redo the master " + "request now, blocked=%d\n", dlm->name, res->lockname.len, + res->lockname.name, blocked); + if (++tries > 20) { + mlog(ML_ERROR, "%s: res %.*s, Spinning on " + "dlm_wait_for_lock_mastery, blocked = %d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + dlm_print_one_lock_resource(res); + dlm_print_one_mle(mle); + tries = 0; + } + goto redo_request; + } + + mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, + res->lockname.name, res->owner); + /* make sure we never continue without this */ + BUG_ON(res->owner == O2NM_MAX_NODES); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + /* put the extra ref */ + dlm_put_mle_inuse(mle); + +wake_waiters: + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + +leave: + /* need to free the unused mle */ + if (alloc_mle) + kmem_cache_free(dlm_mle_cache, alloc_mle); + + return res; +} + + +#define DLM_MASTERY_TIMEOUT_MS 5000 + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked) +{ + u8 m; + int ret, bit; + int map_changed, voting_done; + int assert, sleep; + +recheck: + ret = 0; + assert = 0; + + /* check if another node has already become the owner */ + spin_lock(&res->spinlock); + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, + res->lockname.len, res->lockname.name, res->owner); + spin_unlock(&res->spinlock); + /* this will cause the master to re-assert across + * the whole cluster, freeing up mles */ + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(res, mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } + } + ret = 0; + goto leave; + } + spin_unlock(&res->spinlock); + + spin_lock(&mle->spinlock); + m = mle->master; + map_changed = (memcmp(mle->vote_map, mle->node_map, + sizeof(mle->vote_map)) != 0); + voting_done = (memcmp(mle->vote_map, mle->response_map, + sizeof(mle->vote_map)) == 0); + + /* restart if we hit any errors */ + if (map_changed) { + int b; + mlog(0, "%s: %.*s: node map changed, restarting\n", + dlm->name, res->lockname.len, res->lockname.name); + ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); + b = (mle->type == DLM_MLE_BLOCK); + if ((*blocked && !b) || (!*blocked && b)) { + mlog(0, "%s:%.*s: status change: old=%d new=%d\n", + dlm->name, res->lockname.len, res->lockname.name, + *blocked, b); + *blocked = b; + } + spin_unlock(&mle->spinlock); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + mlog(0, "%s:%.*s: restart lock mastery succeeded, " + "rechecking now\n", dlm->name, res->lockname.len, + res->lockname.name); + goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } + } + + if (m != O2NM_MAX_NODES) { + /* another node has done an assert! + * all done! */ + sleep = 0; + } else { + sleep = 1; + /* have all nodes responded? */ + if (voting_done && !*blocked) { + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (dlm->node_num <= bit) { + /* my node number is lowest. + * now tell other nodes that I am + * mastering this. */ + mle->master = dlm->node_num; + /* ref was grabbed in get_lock_resource + * will be dropped in dlmlock_master */ + assert = 1; + sleep = 0; + } + /* if voting is done, but we have not received + * an assert master yet, we must sleep */ + } + } + + spin_unlock(&mle->spinlock); + + /* sleep if we haven't finished voting yet */ + if (sleep) { + unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); + + /* + if (atomic_read(&mle->mle_refs.refcount) < 2) + mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, + atomic_read(&mle->mle_refs.refcount), + res->lockname.len, res->lockname.name); + */ + atomic_set(&mle->woken, 0); + (void)wait_event_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + timeo); + if (res->owner == O2NM_MAX_NODES) { + mlog(0, "%s:%.*s: waiting again\n", dlm->name, + res->lockname.len, res->lockname.name); + goto recheck; + } + mlog(0, "done waiting, master is %u\n", res->owner); + ret = 0; + goto leave; + } + + ret = 0; /* done */ + if (assert) { + m = dlm->node_num; + mlog(0, "about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, m); + ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); + if (ret) { + /* This is a failure in the network path, + * not in the response to the assert_master + * (any nonzero response is a BUG on this node). + * Most likely a socket just got disconnected + * due to node death. */ + mlog_errno(ret); + } + /* no longer need to restart lock mastery. + * all living nodes have been contacted. */ + ret = 0; + } + + /* set the lockres owner */ + spin_lock(&res->spinlock); + /* mastery reference obtained either during + * assert_master_handler or in get_lock_resource */ + dlm_change_lockres_owner(dlm, res, m); + spin_unlock(&res->spinlock); + +leave: + return ret; +} + +struct dlm_bitmap_diff_iter +{ + int curnode; + unsigned long *orig_bm; + unsigned long *cur_bm; + unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +enum dlm_node_state_change +{ + NODE_DOWN = -1, + NODE_NO_CHANGE = 0, + NODE_UP +}; + +static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, + unsigned long *orig_bm, + unsigned long *cur_bm) +{ + unsigned long p1, p2; + int i; + + iter->curnode = -1; + iter->orig_bm = orig_bm; + iter->cur_bm = cur_bm; + + for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { + p1 = *(iter->orig_bm + i); + p2 = *(iter->cur_bm + i); + iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); + } +} + +static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, + enum dlm_node_state_change *state) +{ + int bit; + + if (iter->curnode >= O2NM_MAX_NODES) + return -ENOENT; + + bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, + iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + + /* if it was there in the original then this node died */ + if (test_bit(bit, iter->orig_bm)) + *state = NODE_DOWN; + else + *state = NODE_UP; + + iter->curnode = bit; + return bit; +} + + +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked) +{ + struct dlm_bitmap_diff_iter bdi; + enum dlm_node_state_change sc; + int node; + int ret = 0; + + mlog(0, "something happened such that the " + "master process may need to be restarted!\n"); + + assert_spin_locked(&mle->spinlock); + + dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + while (node >= 0) { + if (sc == NODE_UP) { + /* a node came up. clear any old vote from + * the response map and set it in the vote map + * then restart the mastery. */ + mlog(ML_NOTICE, "node %d up while restarting\n", node); + + /* redo the master request, but only for the new node */ + mlog(0, "sending request to new node\n"); + clear_bit(node, mle->response_map); + set_bit(node, mle->vote_map); + } else { + mlog(ML_ERROR, "node down! %d\n", node); + if (blocked) { + int lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, 0); + + /* act like it was never there */ + clear_bit(node, mle->maybe_map); + + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->mleres = res; + } + } + } + + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); + } + ret = -EAGAIN; + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + } + return ret; +} + + +/* + * DLM_MASTER_REQUEST_MSG + * + * returns: 0 on success, + * -errno on a network error + * + * on error, the caller should assume the target node is "dead" + * + */ + +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to) +{ + struct dlm_ctxt *dlm = mle->dlm; + struct dlm_master_request request; + int ret, response=0, resend; + + memset(&request, 0, sizeof(request)); + request.node_idx = dlm->node_num; + + BUG_ON(mle->type == DLM_MLE_MIGRATION); + + request.namelen = (u8)mle->mnamelen; + memcpy(request.name, mle->mname, request.namelen); + +again: + ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, + sizeof(request), to, &response); + if (ret < 0) { + if (ret == -ESRCH) { + /* should never happen */ + mlog(ML_ERROR, "TCP stack not ready!\n"); + BUG(); + } else if (ret == -EINVAL) { + mlog(ML_ERROR, "bad args passed to o2net!\n"); + BUG(); + } else if (ret == -ENOMEM) { + mlog(ML_ERROR, "out of memory while trying to send " + "network message! retrying\n"); + /* this is totally crude */ + msleep(50); + goto again; + } else if (!dlm_is_host_down(ret)) { + /* not a network error. bad. */ + mlog_errno(ret); + mlog(ML_ERROR, "unhandled error!"); + BUG(); + } + /* all other errors should be network errors, + * and likely indicate node death */ + mlog(ML_ERROR, "link to %d went down!\n", to); + goto out; + } + + ret = 0; + resend = 0; + spin_lock(&mle->spinlock); + switch (response) { + case DLM_MASTER_RESP_YES: + set_bit(to, mle->response_map); + mlog(0, "node %u is the master, response=YES\n", to); + mlog(0, "%s:%.*s: master node %u now knows I have a " + "reference\n", dlm->name, res->lockname.len, + res->lockname.name, to); + mle->master = to; + break; + case DLM_MASTER_RESP_NO: + mlog(0, "node %u not master, response=NO\n", to); + set_bit(to, mle->response_map); + break; + case DLM_MASTER_RESP_MAYBE: + mlog(0, "node %u not master, response=MAYBE\n", to); + set_bit(to, mle->response_map); + set_bit(to, mle->maybe_map); + break; + case DLM_MASTER_RESP_ERROR: + mlog(0, "node %u hit an error, resending\n", to); + resend = 1; + response = 0; + break; + default: + mlog(ML_ERROR, "bad response! %u\n", response); + BUG(); + } + spin_unlock(&mle->spinlock); + if (resend) { + /* this is also totally crude */ + msleep(50); + goto again; + } + +out: + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! + */ +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + u8 response = DLM_MASTER_RESP_MAYBE; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; + char *name; + unsigned int namelen, hash; + int found, ret; + int set_maybe; + int dispatch_assert = 0; + + if (!dlm_grab(dlm)) + return DLM_MASTER_RESP_NO; + + if (!dlm_domain_fully_joined(dlm)) { + response = DLM_MASTER_RESP_NO; + goto send_response; + } + + name = request->name; + namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); + + if (namelen > DLM_LOCKID_NAME_MAX) { + response = DLM_IVBUFLEN; + goto send_response; + } + +way_up_top: + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_unlock(&dlm->spinlock); + + /* take care of the easy cases up front */ + spin_lock(&res->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&res->hash_node)) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + goto way_up_top; + } + + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " + "being recovered/migrated\n"); + response = DLM_MASTER_RESP_ERROR; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + if (res->owner == dlm->node_num) { + dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); + spin_unlock(&res->spinlock); + response = DLM_MASTER_RESP_YES; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + + /* this node is the owner. + * there is some extra work that needs to + * happen now. the requesting node has + * caused all nodes up to this one to + * create mles. this node now needs to + * go back and clean those up. */ + dispatch_assert = 1; + goto send_response; + } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + spin_unlock(&res->spinlock); + // mlog(0, "node %u is the master\n", res->owner); + response = DLM_MASTER_RESP_NO; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* ok, there is no owner. either this node is + * being blocked, or it is actively trying to + * master this lock. */ + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "lock with no owner should be " + "in-progress!\n"); + BUG(); + } + + // mlog(0, "lockres is in progress...\n"); + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + mlog(ML_ERROR, "no mle found for this lock!\n"); + BUG(); + } + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->type == DLM_MLE_BLOCK) { + // mlog(0, "this node is waiting for " + // "lockres to be mastered\n"); + response = DLM_MASTER_RESP_NO; + } else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "node %u is master, but trying to migrate to " + "node %u.\n", tmpmle->master, tmpmle->new_master); + if (tmpmle->master == dlm->node_num) { + mlog(ML_ERROR, "no owner on lockres, but this " + "node is trying to migrate it to %u?!\n", + tmpmle->new_master); + BUG(); + } else { + /* the real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } + } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { + set_maybe = 0; + if (tmpmle->master == dlm->node_num) { + response = DLM_MASTER_RESP_YES; + /* this node will be the owner. + * go back and clean the mles on any + * other nodes */ + dispatch_assert = 1; + dlm_lockres_set_refmap_bit(dlm, res, + request->node_idx); + } else + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "this node is attempting to " + // "master lockres\n"); + response = DLM_MASTER_RESP_MAYBE; + } + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + + spin_unlock(&dlm->master_lock); + spin_unlock(&res->spinlock); + + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* + * lockres doesn't exist on this node + * if there is an MLE_BLOCK, return NO + * if there is an MLE_MASTER, return MAYBE + * otherwise, add an MLE_BLOCK, return NO + */ + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + /* this lockid has never been seen on this node yet */ + // mlog(0, "no mle found\n"); + if (!mle) { + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!mle) { + response = DLM_MASTER_RESP_ERROR; + mlog_errno(-ENOMEM); + goto send_response; + } + goto way_up_top; + } + + // mlog(0, "this is second time thru, already allocated, " + // "add the block.\n"); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); + set_bit(request->node_idx, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "mle was found\n"); + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->master == dlm->node_num) { + mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); + BUG(); + } + if (tmpmle->type == DLM_MLE_BLOCK) + response = DLM_MASTER_RESP_NO; + else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "migration mle was found (%u->%u)\n", + tmpmle->master, tmpmle->new_master); + /* real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } else + response = DLM_MASTER_RESP_MAYBE; + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (found) { + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + } +send_response: + /* + * __dlm_lookup_lockres() grabbed a reference to this lockres. + * The reference is released by dlm_assert_master_worker() under + * the call to dlm_dispatch_assert_master(). If + * dlm_assert_master_worker() isn't called, we drop it here. + */ + if (dispatch_assert) { + if (response != DLM_MASTER_RESP_YES) + mlog(ML_ERROR, "invalid response %d\n", response); + if (!res) { + mlog(ML_ERROR, "bad lockres while trying to assert!\n"); + BUG(); + } + mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", + dlm->node_num, res->lockname.len, res->lockname.name); + spin_lock(&res->spinlock); + ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, + DLM_ASSERT_MASTER_MLE_CLEANUP); + if (ret < 0) { + mlog(ML_ERROR, "failed to dispatch assert master work\n"); + response = DLM_MASTER_RESP_ERROR; + dlm_lockres_put(res); + } else + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); + } else { + if (res) + dlm_lockres_put(res); + } + + dlm_put(dlm); + return response; +} + +/* + * DLM_ASSERT_MASTER_MSG + */ + + +/* + * NOTE: this can be used for debugging + * can periodically run all locks owned by this node + * and re-assert across the cluster... + */ +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags) +{ + struct dlm_assert_master assert; + int to, tmpret; + struct dlm_node_iter iter; + int ret = 0; + int reassert; + const char *lockname = res->lockname.name; + unsigned int namelen = res->lockname.len; + + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + +again: + reassert = 0; + + /* note that if this nodemap is empty, it returns 0 */ + dlm_node_iter_init(nodemap, &iter); + while ((to = dlm_node_iter_next(&iter)) >= 0) { + int r = 0; + struct dlm_master_list_entry *mle = NULL; + + mlog(0, "sending assert master to %d (%.*s)\n", to, + namelen, lockname); + memset(&assert, 0, sizeof(assert)); + assert.node_idx = dlm->node_num; + assert.namelen = namelen; + memcpy(assert.name, lockname, namelen); + assert.flags = cpu_to_be32(flags); + + tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, + &assert, sizeof(assert), to, &r); + if (tmpret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", tmpret, + DLM_ASSERT_MASTER_MSG, dlm->key, to); + if (!dlm_is_host_down(tmpret)) { + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); + BUG(); + } + /* a node died. finish out the rest of the nodes. */ + mlog(0, "link to %d went down!\n", to); + /* any nonzero status return will do */ + ret = tmpret; + r = 0; + } else if (r < 0) { + /* ok, something horribly messed. kill thyself. */ + mlog(ML_ERROR,"during assert master of %.*s to %u, " + "got %d.\n", namelen, lockname, to, r); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + BUG(); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT && + !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { + mlog(ML_ERROR, "%.*s: very strange, " + "master MLE but no lockres on %u\n", + namelen, lockname, to); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT) { + mlog(0, "%.*s: node %u create mles on other " + "nodes and requests a re-assert\n", + namelen, lockname, to); + reassert = 1; + } + if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { + mlog(0, "%.*s: node %u has a reference to this " + "lockres, set the bit in the refmap\n", + namelen, lockname, to); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(dlm, res, to); + spin_unlock(&res->spinlock); + } + } + + if (reassert) + goto again; + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! + */ +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_list_entry *mle = NULL; + struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen, hash; + u32 flags; + int master_request = 0, have_lockres_ref = 0; + int ret = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = assert->name; + namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); + flags = be32_to_cpu(assert->flags); + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + + spin_lock(&dlm->spinlock); + + if (flags) + mlog(0, "assert_master with flags: %u\n", flags); + + /* find the MLE */ + spin_lock(&dlm->master_lock); + if (!dlm_find_mle(dlm, &mle, name, namelen)) { + /* not an error, could be master just re-asserting */ + mlog(0, "just got an assert_master from %u, but no " + "MLE for it! (%.*s)\n", assert->node_idx, + namelen, name); + } else { + int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES) { + /* not necessarily an error, though less likely. + * could be master just re-asserting. */ + mlog(0, "no bits set in the maybe_map, but %u " + "is asserting! (%.*s)\n", assert->node_idx, + namelen, name); + } else if (bit != assert->node_idx) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "master %u was found, %u should " + "back off\n", assert->node_idx, bit); + } else { + /* with the fix for bug 569, a higher node + * number winning the mastery will respond + * YES to mastery requests, but this node + * had no way of knowing. let it pass. */ + mlog(0, "%u is the lowest node, " + "%u is asserting. (%.*s) %u must " + "have begun after %u won.\n", bit, + assert->node_idx, namelen, name, bit, + assert->node_idx); + } + } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } + } + spin_unlock(&dlm->master_lock); + + /* ok everything checks out with the MLE + * now check to see if there is a lockres */ + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(ML_ERROR, "%u asserting but %.*s is " + "RECOVERING!\n", assert->node_idx, namelen, name); + goto kill; + } + if (!mle) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { + mlog(ML_ERROR, "DIE! Mastery assert from %u, " + "but current owner is %u! (%.*s)\n", + assert->node_idx, res->owner, namelen, + name); + __dlm_print_one_lock_resource(res); + BUG(); + } + } else if (mle->type != DLM_MLE_MIGRATION) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* owner is just re-asserting */ + if (res->owner == assert->node_idx) { + mlog(0, "owner %u re-asserting on " + "lock %.*s\n", assert->node_idx, + namelen, name); + goto ok; + } + mlog(ML_ERROR, "got assert_master from " + "node %u, but %u is the owner! " + "(%.*s)\n", assert->node_idx, + res->owner, namelen, name); + goto kill; + } + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "got assert from %u, but lock " + "with no owner should be " + "in-progress! (%.*s)\n", + assert->node_idx, + namelen, name); + goto kill; + } + } else /* mle->type == DLM_MLE_MIGRATION */ { + /* should only be getting an assert from new master */ + if (assert->node_idx != mle->new_master) { + mlog(ML_ERROR, "got assert from %u, but " + "new master is %u, and old master " + "was %u (%.*s)\n", + assert->node_idx, mle->new_master, + mle->master, namelen, name); + goto kill; + } + + } +ok: + spin_unlock(&res->spinlock); + } + + // mlog(0, "woo! got an assert_master from node %u!\n", + // assert->node_idx); + if (mle) { + int extra_ref = 0; + int nn = -1; + int rr, err = 0; + + spin_lock(&mle->spinlock); + if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) + extra_ref = 1; + else { + /* MASTER mle: if any bits set in the response map + * then the calling node needs to re-assert to clear + * up nodes that this node contacted */ + while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, + nn+1)) < O2NM_MAX_NODES) { + if (nn != dlm->node_num && nn != assert->node_idx) { + master_request = 1; + break; + } + } + } + mle->master = assert->node_idx; + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + spin_unlock(&mle->spinlock); + + if (res) { + int wake = 0; + spin_lock(&res->spinlock); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } + spin_unlock(&res->spinlock); + have_lockres_ref = 1; + if (wake) + wake_up(&res->wq); + } + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + __dlm_unlink_mle(dlm, mle); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + if (extra_ref) { + /* the assert master message now balances the extra + * ref given by the master / migration request message. + * if this is the last put, it will be removed + * from the list. */ + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); + } + } + spin_unlock(&dlm->spinlock); + +done: + ret = 0; + if (res) { + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + *ret_data = (void *)res; + } + dlm_put(dlm); + if (master_request) { + mlog(0, "need to tell master to reassert\n"); + /* positive. negative would shoot down the node. */ + ret |= DLM_ASSERT_RESPONSE_REASSERT; + if (!have_lockres_ref) { + mlog(ML_ERROR, "strange, got assert from %u, MASTER " + "mle present here for %s:%.*s, but no lockres!\n", + assert->node_idx, dlm->name, namelen, name); + } + } + if (have_lockres_ref) { + /* let the master know we have a reference to the lockres */ + ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: got assert from %u, need a ref\n", + dlm->name, namelen, name, assert->node_idx); + } + return ret; + +kill: + /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); + spin_lock(&dlm->master_lock); + if (mle) + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + *ret_data = (void *)res; + dlm_put(dlm); + return -EINVAL; +} + +void dlm_assert_master_post_handler(int status, void *data, void *ret_data) +{ + struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; + + if (ret_data) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + return; +} + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, u8 request_from, u32 flags) +{ + struct dlm_work_item *item; + item = kzalloc(sizeof(*item), GFP_ATOMIC); + if (!item) + return -ENOMEM; + + + /* queue up work for dlm_assert_master_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); + item->u.am.lockres = res; /* already have a ref */ + /* can optionally ignore node numbers higher than this node */ + item->u.am.ignore_higher = ignore_higher; + item->u.am.request_from = request_from; + item->u.am.flags = flags; + + if (ignore_higher) + mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, + res->lockname.name); + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; +} + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm = data; + int ret = 0; + struct dlm_lock_resource *res; + unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int ignore_higher; + int bit; + u8 request_from; + u32 flags; + + dlm = item->dlm; + res = item->u.am.lockres; + ignore_higher = item->u.am.ignore_higher; + request_from = item->u.am.request_from; + flags = item->u.am.flags; + + spin_lock(&dlm->spinlock); + memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + spin_unlock(&dlm->spinlock); + + clear_bit(dlm->node_num, nodemap); + if (ignore_higher) { + /* if is this just to clear up mles for nodes below + * this node, do not send the message to the original + * caller or any node number higher than this */ + clear_bit(request_from, nodemap); + bit = dlm->node_num; + while (1) { + bit = find_next_bit(nodemap, O2NM_MAX_NODES, + bit+1); + if (bit >= O2NM_MAX_NODES) + break; + clear_bit(bit, nodemap); + } + } + + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert out own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + + /* this call now finishes out the nodemap + * even if one or more nodes die */ + mlog(0, "worker about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, dlm->node_num); + ret = dlm_do_assert_master(dlm, res, nodemap, flags); + if (ret < 0) { + /* no need to restart, we are done */ + if (!dlm_is_host_down(ret)) + mlog_errno(ret); + } + + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: + dlm_lockres_drop_inflight_worker(dlm, res); + + dlm_lockres_put(res); + + mlog(0, "finished with dlm_assert_master_worker\n"); +} + +/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. + * We cannot wait for node recovery to complete to begin mastering this + * lockres because this lockres is used to kick off recovery! ;-) + * So, do a pre-check on all living nodes to see if any of those nodes + * think that $RECOVERY is currently mastered by a dead node. If so, + * we wait a short time to allow that node to get notified by its own + * heartbeat stack, then check again. All $RECOVERY lock resources + * mastered by dead nodes are purged when the hearbeat callback is + * fired, so we can know for sure that it is safe to continue once + * the node returns a live node or no node. */ +static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct dlm_node_iter iter; + int nodenum; + int ret = 0; + u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + /* do not send to self */ + if (nodenum == dlm->node_num) + continue; + ret = dlm_do_master_requery(dlm, res, nodenum, &master); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + /* host is down, so answer for that node would be + * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ + ret = 0; + } + + if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* check to see if this master is in the recovery map */ + spin_lock(&dlm->spinlock); + if (test_bit(master, dlm->recovery_map)) { + mlog(ML_NOTICE, "%s: node %u has not seen " + "node %u go down yet, and thinks the " + "dead node is mastering the recovery " + "lock. must wait.\n", dlm->name, + nodenum, master); + ret = -EAGAIN; + } + spin_unlock(&dlm->spinlock); + mlog(0, "%s: reco lock master is %u\n", dlm->name, + master); + break; + } + } + return ret; +} + +/* + * DLM_DEREF_LOCKRES_MSG + */ + +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + struct dlm_deref_lockres deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, + &deref, sizeof(deref), res->owner, &r); + if (ret < 0) + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", + dlm->name, namelen, lockname, ret, res->owner); + else if (r < 0) { + /* BAD. other node says I did not have a ref. */ + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", + dlm->name, namelen, lockname, res->owner, r); + dlm_print_one_lock_resource(res); + BUG(); + } + return ret; +} + +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + struct dlm_work_item *item; + int cleared = 0; + int dispatch = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_SETREF_INPROG) + dispatch = 1; + else { + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + dlm_lockres_clear_refmap_bit(dlm, res, node); + cleared = 1; + } + } + spin_unlock(&res->spinlock); + + if (!dispatch) { + if (cleared) + dlm_lockres_calc_usage(dlm, res); + else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + dlm_print_one_lock_resource(res); + } + ret = 0; + goto done; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + ret = -ENOMEM; + mlog_errno(ret); + goto done; + } + + dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); + item->u.dl.deref_res = res; + item->u.dl.deref_node = node; + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; + +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + + return ret; +} + +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_lock_resource *res; + u8 node; + u8 cleared = 0; + + dlm = item->dlm; + res = item->u.dl.deref_res; + node = item->u.dl.deref_node; + + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + dlm_lockres_clear_refmap_bit(dlm, res, node); + cleared = 1; + } + spin_unlock(&res->spinlock); + + if (cleared) { + mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", + dlm->name, res->lockname.len, res->lockname.name, node); + dlm_lockres_calc_usage(dlm, res); + } else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + dlm_print_one_lock_resource(res); + } + + dlm_lockres_put(res); +} + +/* + * A migrateable resource is one that is : + * 1. locally mastered, and, + * 2. zero local locks, and, + * 3. one or more non-local locks, or, one or more references + * Returns 1 if yes, 0 if not. + */ +static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + enum dlm_lockres_list idx; + int nonlocal = 0, node_ref; + struct list_head *queue; + struct dlm_lock *lock; + u64 cookie; + + assert_spin_locked(&res->spinlock); + + /* delay migration when the lockres is in MIGRATING state */ + if (res->state & DLM_LOCK_RES_MIGRATING) + return 0; + + /* delay migration when the lockres is in RECOCERING state */ + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + + if (res->owner != dlm->node_num) + return 0; + + for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { + queue = dlm_list_idx_to_ptr(res, idx); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node != dlm->node_num) { + nonlocal++; + continue; + } + cookie = be64_to_cpu(lock->ml.cookie); + mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " + "%s list\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(cookie), + dlm_get_lock_cookie_seq(cookie), + dlm_list_in_text(idx)); + return 0; + } + } + + if (!nonlocal) { + node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (node_ref >= O2NM_MAX_NODES) + return 0; + } + + mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, + res->lockname.name); + + return 1; +} + +/* + * DLM_MIGRATE_LOCKRES + */ + + +static int dlm_migrate_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 target) +{ + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *oldmle = NULL; + struct dlm_migratable_lockres *mres = NULL; + int ret = 0; + const char *name; + unsigned int namelen; + int mle_added = 0; + int wake = 0; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(target == O2NM_MAX_NODES); + + name = res->lockname.name; + namelen = res->lockname.len; + + mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, + target); + + /* preallocate up front. if this fails, abort */ + ret = -ENOMEM; + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); + if (!mres) { + mlog_errno(ret); + goto leave; + } + + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!mle) { + mlog_errno(ret); + goto leave; + } + ret = 0; + + /* + * clear any existing master requests and + * add the migration mle to the list + */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, + namelen, target, dlm->node_num); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (ret == -EEXIST) { + mlog(0, "another process is already migrating it\n"); + goto fail; + } + mle_added = 1; + + /* + * set the MIGRATING flag and flush asts + * if we fail after this we need to re-dirty the lockres + */ + if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { + mlog(ML_ERROR, "tried to migrate %.*s to %u, but " + "the target went down.\n", res->lockname.len, + res->lockname.name, target); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + ret = -EINVAL; + } + +fail: + if (oldmle) { + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (ret < 0) { + if (mle_added) { + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + } else if (mle) { + kmem_cache_free(dlm_mle_cache, mle); + mle = NULL; + } + goto leave; + } + + /* + * at this point, we have a migration target, an mle + * in the master list, and the MIGRATING flag set on + * the lockres + */ + + /* now that remote nodes are spinning on the MIGRATING flag, + * ensure that all assert_master work is flushed. */ + flush_workqueue(dlm->dlm_worker); + + /* get an extra reference on the mle. + * otherwise the assert_master from the new + * master will destroy this. + * also, make sure that all callers of dlm_get_mle + * take both dlm->spinlock and dlm->master_lock */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* notify new node and send all lock state */ + /* call send_one_lockres with migration flag. + * this serves as notice to the target node that a + * migration is starting. */ + ret = dlm_send_one_lockres(dlm, res, mres, target, + DLM_MRES_MIGRATION); + + if (ret < 0) { + mlog(0, "migration to node %u failed with %d\n", + target, ret); + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + if (dlm_is_host_down(ret)) + dlm_wait_for_node_death(dlm, target, + DLM_NODE_DEATH_WAIT_MAX); + goto leave; + } + + /* at this point, the target sends a message to all nodes, + * (using dlm_do_migrate_request). this node is skipped since + * we had to put an mle in the list to begin the process. this + * node now waits for target to do an assert master. this node + * will be the last one notified, ensuring that the migration + * is complete everywhere. if the target dies while this is + * going on, some nodes could potentially see the target as the + * master, so it is important that my recovery finds the migration + * mle and sets the master to UNKNOWN. */ + + + /* wait for new node to assert master */ + while (1) { + ret = wait_event_interruptible_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + msecs_to_jiffies(5000)); + + if (ret >= 0) { + if (atomic_read(&mle->woken) == 1 || + res->owner == target) + break; + + mlog(0, "%s:%.*s: timed out during migration\n", + dlm->name, res->lockname.len, res->lockname.name); + /* avoid hang during shutdown when migrating lockres + * to a node which also goes down */ + if (dlm_is_node_dead(dlm, target)) { + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", + dlm->name, res->lockname.len, + res->lockname.name, target); + ret = -EINVAL; + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + goto leave; + } + } else + mlog(0, "%s:%.*s: caught signal during migration\n", + dlm->name, res->lockname.len, res->lockname.name); + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, target); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_remove_nonlocal_locks(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle_inuse(mle); + ret = 0; + + dlm_lockres_calc_usage(dlm, res); + +leave: + /* re-dirty the lockres if we failed */ + if (ret < 0) + dlm_kick_thread(dlm, res); + + /* wake up waiters if the MIGRATING flag got set + * but migration failed */ + if (wake) + wake_up(&res->wq); + + if (mres) + free_page((unsigned long)mres); + + dlm_put(dlm); + + mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, + name, target, ret); + return ret; +} + +#define DLM_MIGRATION_RETRY_MS 100 + +/* + * Should be called only after beginning the domain leave process. + * There should not be any remaining locks on nonlocal lock resources, + * and there should be no local locks left on locally mastered resources. + * + * Called with the dlm spinlock held, may drop it to do migration, but + * will re-acquire before exit. + * + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped + */ +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + int ret; + int lock_dropped = 0; + u8 target = O2NM_MAX_NODES; + + assert_spin_locked(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (dlm_is_lockres_migrateable(dlm, res)) + target = dlm_pick_migration_target(dlm, res); + spin_unlock(&res->spinlock); + + if (target == O2NM_MAX_NODES) + goto leave; + + /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ + spin_unlock(&dlm->spinlock); + lock_dropped = 1; + ret = dlm_migrate_lockres(dlm, res, target); + if (ret) + mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", + dlm->name, res->lockname.len, res->lockname.name, + target, ret); + spin_lock(&dlm->spinlock); +leave: + return lock_dropped; +} + +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + int ret; + spin_lock(&dlm->ast_lock); + spin_lock(&lock->spinlock); + ret = (list_empty(&lock->bast_list) && !lock->bast_pending); + spin_unlock(&lock->spinlock); + spin_unlock(&dlm->ast_lock); + return ret; +} + +static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 mig_target) +{ + int can_proceed; + spin_lock(&res->spinlock); + can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); + spin_unlock(&res->spinlock); + + /* target has died, so make the caller break out of the + * wait_event, but caller must recheck the domain_map */ + spin_lock(&dlm->spinlock); + if (!test_bit(mig_target, dlm->domain_map)) + can_proceed = 1; + spin_unlock(&dlm->spinlock); + return can_proceed; +} + +static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int ret; + spin_lock(&res->spinlock); + ret = !!(res->state & DLM_LOCK_RES_DIRTY); + spin_unlock(&res->spinlock); + return ret; +} + + +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target) +{ + int ret = 0; + + mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", + res->lockname.len, res->lockname.name, dlm->node_num, + target); + /* need to set MIGRATING flag on lockres. this is done by + * ensuring that all asts have been flushed for this lockres. */ + spin_lock(&res->spinlock); + BUG_ON(res->migration_pending); + res->migration_pending = 1; + /* strategy is to reserve an extra ast then release + * it below, letting the release do all of the work */ + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + + /* now flush all the pending asts */ + dlm_kick_thread(dlm, res); + /* before waiting on DIRTY, block processes which may + * try to dirty the lockres before MIGRATING is set */ + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); + res->state |= DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); + /* now wait on any pending asts and the DIRTY state */ + wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); + dlm_lockres_release_ast(dlm, res); + + mlog(0, "about to wait on migration_wq, dirty=%s\n", + res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + /* if the extra ref we just put was the final one, this + * will pass thru immediately. otherwise, we need to wait + * for the last ast to finish. */ +again: + ret = wait_event_interruptible_timeout(dlm->migration_wq, + dlm_migration_can_proceed(dlm, res, target), + msecs_to_jiffies(1000)); + if (ret < 0) { + mlog(0, "woken again: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } else { + mlog(0, "all is well: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } + if (!dlm_migration_can_proceed(dlm, res, target)) { + mlog(0, "trying again...\n"); + goto again; + } + + ret = 0; + /* did the target go down or die? */ + spin_lock(&dlm->spinlock); + if (!test_bit(target, dlm->domain_map)) { + mlog(ML_ERROR, "aha. migration target %u just went down\n", + target); + ret = -EHOSTDOWN; + } + spin_unlock(&dlm->spinlock); + + /* + * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for + * another try; otherwise, we are sure the MIGRATING state is there, + * drop the unneded state which blocked threads trying to DIRTY + */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + if (!ret) + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + spin_unlock(&res->spinlock); + + /* + * at this point: + * + * o the DLM_LOCK_RES_MIGRATING flag is set if target not down + * o there are no pending asts on this lockres + * o all processes trying to reserve an ast on this + * lockres must wait for the MIGRATING flag to clear + */ + return ret; +} + +/* last step in the migration process. + * original master calls this to free all of the dlm_lock + * structures that used to be for other nodes. */ +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct list_head *queue = &res->granted; + int i, bit; + struct dlm_lock *lock, *next; + + assert_spin_locked(&res->spinlock); + + BUG_ON(res->owner == dlm->node_num); + + for (i=0; i<3; i++) { + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node != dlm->node_num) { + mlog(0, "putting lock for node %u\n", + lock->ml.node); + /* be extra careful */ + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + dlm_lockres_clear_refmap_bit(dlm, res, + lock->ml.node); + list_del_init(&lock->list); + dlm_lock_put(lock); + /* In a normal unlock, we would have added a + * DLM_UNLOCK_FREE_LOCK action. Force it. */ + dlm_lock_put(lock); + } + } + queue++; + } + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + /* do not clear the local node reference, if there is a + * process holding this, let it drop the ref itself */ + if (bit != dlm->node_num) { + mlog(0, "%s:%.*s: node %u had a ref to this " + "migrating lockres, clearing\n", dlm->name, + res->lockname.len, res->lockname.name, bit); + dlm_lockres_clear_refmap_bit(dlm, res, bit); + } + bit++; + } +} + +/* + * Pick a node to migrate the lock resource to. This function selects a + * potential target based first on the locks and then on refmap. It skips + * nodes that are in the process of exiting the domain. + */ +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + enum dlm_lockres_list idx; + struct list_head *queue = &res->granted; + struct dlm_lock *lock; + int noderef; + u8 nodenum = O2NM_MAX_NODES; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* Go through all the locks */ + for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { + queue = dlm_list_idx_to_ptr(res, idx); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node == dlm->node_num) + continue; + if (test_bit(lock->ml.node, dlm->exit_domain_map)) + continue; + nodenum = lock->ml.node; + goto bail; + } + } + + /* Go thru the refmap */ + noderef = -1; + while (1) { + noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, + noderef + 1); + if (noderef >= O2NM_MAX_NODES) + break; + if (noderef == dlm->node_num) + continue; + if (test_bit(noderef, dlm->exit_domain_map)) + continue; + nodenum = noderef; + goto bail; + } + +bail: + return nodenum; +} + +/* this is called by the new master once all lockres + * data has been received */ +static int dlm_do_migrate_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 master, u8 new_master, + struct dlm_node_iter *iter) +{ + struct dlm_migrate_request migrate; + int ret, skip, status = 0; + int nodenum; + + memset(&migrate, 0, sizeof(migrate)); + migrate.namelen = res->lockname.len; + memcpy(migrate.name, res->lockname.name, migrate.namelen); + migrate.new_master = new_master; + migrate.master = master; + + ret = 0; + + /* send message to all nodes, except the master and myself */ + while ((nodenum = dlm_node_iter_next(iter)) >= 0) { + if (nodenum == master || + nodenum == new_master) + continue; + + /* We could race exit domain. If exited, skip. */ + spin_lock(&dlm->spinlock); + skip = (!test_bit(nodenum, dlm->domain_map)); + spin_unlock(&dlm->spinlock); + if (skip) { + clear_bit(nodenum, iter->node_map); + continue; + } + + ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, + &migrate, sizeof(migrate), nodenum, + &status); + if (ret < 0) { + mlog(ML_ERROR, "%s: res %.*s, Error %d send " + "MIGRATE_REQUEST to node %u\n", dlm->name, + migrate.namelen, migrate.name, ret, nodenum); + if (!dlm_is_host_down(ret)) { + mlog(ML_ERROR, "unhandled error=%d!\n", ret); + BUG(); + } + clear_bit(nodenum, iter->node_map); + ret = 0; + } else if (status < 0) { + mlog(0, "migrate request (node %u) returned %d!\n", + nodenum, status); + ret = status; + } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { + /* during the migration request we short-circuited + * the mastery of the lockres. make sure we have + * a mastery ref for nodenum */ + mlog(0, "%s:%.*s: need ref for node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + nodenum); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(dlm, res, nodenum); + spin_unlock(&res->spinlock); + } + } + + if (ret < 0) + mlog_errno(ret); + + mlog(0, "returning ret=%d\n", ret); + return ret; +} + + +/* if there is an existing mle for this lockres, we now know who the master is. + * (the one who sent us *this* message) we can clear it up right away. + * since the process that put the mle on the list still has a reference to it, + * we can unhash it now, set the master and wake the process. as a result, + * we will have no mle in the list to start with. now we can add an mle for + * the migration and this should be the only one found for those scanning the + * list. */ +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; + const char *name; + unsigned int namelen, hash; + int ret = 0; + + if (!dlm_grab(dlm)) + return -EINVAL; + + name = migrate->name; + namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); + + /* preallocate.. if this fails, abort */ + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + + if (!mle) { + ret = -ENOMEM; + goto leave; + } + + /* check for pre-existing lock */ + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + /* if all is working ok, this can only mean that we got + * a migrate request from a node that we now see as + * dead. what can we do here? drop it to the floor? */ + spin_unlock(&res->spinlock); + mlog(ML_ERROR, "Got a migrate request, but the " + "lockres is marked as recovering!"); + kmem_cache_free(dlm_mle_cache, mle); + ret = -EINVAL; /* need a better solution */ + goto unlock; + } + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + } + + spin_lock(&dlm->master_lock); + /* ignore status. only nonzero status would BUG. */ + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, + name, namelen, + migrate->new_master, + migrate->master); + + spin_unlock(&dlm->master_lock); +unlock: + spin_unlock(&dlm->spinlock); + + if (oldmle) { + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (res) + dlm_lockres_put(res); +leave: + dlm_put(dlm); + return ret; +} + +/* must be holding dlm->spinlock and dlm->master_lock + * when adding a migration mle, we can clear any other mles + * in the master list because we know with certainty that + * the master is "master". so we remove any old mle from + * the list after setting it's master field, and then add + * the new migration mle. this way we can hold with the rule + * of having only one mle for a given lock name at all times. */ +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master) +{ + int found; + int ret = 0; + + *oldmle = NULL; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + /* caller is responsible for any ref taken here on oldmle */ + found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); + if (found) { + struct dlm_master_list_entry *tmp = *oldmle; + spin_lock(&tmp->spinlock); + if (tmp->type == DLM_MLE_MIGRATION) { + if (master == dlm->node_num) { + /* ah another process raced me to it */ + mlog(0, "tried to migrate %.*s, but some " + "process beat me to it\n", + namelen, name); + ret = -EEXIST; + } else { + /* bad. 2 NODES are trying to migrate! */ + mlog(ML_ERROR, "migration error mle: " + "master=%u new_master=%u // request: " + "master=%u new_master=%u // " + "lockres=%.*s\n", + tmp->master, tmp->new_master, + master, new_master, + namelen, name); + BUG(); + } + } else { + /* this is essentially what assert_master does */ + tmp->master = master; + atomic_set(&tmp->woken, 1); + wake_up(&tmp->wq); + /* remove it so that only one mle will be found */ + __dlm_unlink_mle(dlm, tmp); + __dlm_mle_detach_hb_events(dlm, tmp); + if (tmp->type == DLM_MLE_MASTER) { + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref " + "for cleared out mle during " + "migration\n", dlm->name, + namelen, name, master, + new_master); + } + } + spin_unlock(&tmp->spinlock); + } + + /* now add a migration mle to the tail of the list */ + dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); + mle->new_master = new_master; + /* the new master will be sending an assert master for this. + * at that point we will get the refmap reference */ + mle->master = master; + /* do this for consistency with other mle types */ + set_bit(new_master, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + + return ret; +} + +/* + * Sets the owner of the lockres, associated to the mle, to UNKNOWN + */ +static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + struct dlm_lock_resource *res; + + /* Find the lockres associated to the mle and set its owner to UNK */ + res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, + mle->mnamehash); + if (res) { + spin_unlock(&dlm->master_lock); + + /* move lockres onto recovery list */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + dlm_move_lockres_to_recovery_list(dlm, res); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + + /* dump the mle */ + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + } + + return res; +} + +static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + __dlm_mle_detach_hb_events(dlm, mle); + + spin_lock(&mle->spinlock); + __dlm_unlink_mle(dlm, mle); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + + wake_up(&mle->wq); +} + +static void dlm_clean_block_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, u8 dead_node) +{ + int bit; + + BUG_ON(mle->type != DLM_MLE_BLOCK); + + spin_lock(&mle->spinlock); + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit != dead_node) { + mlog(0, "mle found, but dead node %u would not have been " + "master\n", dead_node); + spin_unlock(&mle->spinlock); + } else { + /* Must drop the refcount by one since the assert_master will + * never arrive. This may result in the mle being unlinked and + * freed, but there may still be a process waiting in the + * dlmlock path which is fine. */ + mlog(0, "node %u was expected master\n", dead_node); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + wake_up(&mle->wq); + + /* Do not need events any longer, so detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } +} + +void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_master_list_entry *mle; + struct dlm_lock_resource *res; + struct hlist_head *bucket; + struct hlist_node *tmp; + unsigned int i; + + mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); +top: + assert_spin_locked(&dlm->spinlock); + + /* clean the master list */ + spin_lock(&dlm->master_lock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + /* MASTER mles are initiated locally. The waiting + * process will notice the node map change shortly. + * Let that happen as normal. */ + if (mle->type == DLM_MLE_MASTER) + continue; + + /* BLOCK mles are initiated by other nodes. Need to + * clean up if the dead node would have been the + * master. */ + if (mle->type == DLM_MLE_BLOCK) { + dlm_clean_block_mle(dlm, mle, dead_node); + continue; + } + + /* Everything else is a MIGRATION mle */ + + /* The rule for MIGRATION mles is that the master + * becomes UNKNOWN if *either* the original or the new + * master dies. All UNKNOWN lockres' are sent to + * whichever node becomes the recovery master. The new + * master is responsible for determining if there is + * still a master for this lockres, or if he needs to + * take over mastery. Either way, this node should + * expect another message to resolve this. */ + + if (mle->master != dead_node && + mle->new_master != dead_node) + continue; + + /* If we have reached this point, this mle needs to be + * removed from the list and freed. */ + dlm_clean_migration_mle(dlm, mle); + + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, + mle->new_master); + + /* If we find a lockres associated with the mle, we've + * hit this rare case that messes up our lock ordering. + * If so, we need to drop the master lock so that we can + * take the lockres lock, meaning that we will have to + * restart from the head of list. */ + res = dlm_reset_mleres_owner(dlm, mle); + if (res) + /* restart */ + goto top; + + /* This may be the last reference */ + __dlm_put_mle(mle); + } + } + spin_unlock(&dlm->master_lock); +} + +int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 old_master) +{ + struct dlm_node_iter iter; + int ret = 0; + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + clear_bit(old_master, iter.node_map); + clear_bit(dlm->node_num, iter.node_map); + spin_unlock(&dlm->spinlock); + + /* ownership of the lockres is changing. account for the + * mastery reference here since old_master will briefly have + * a reference after the migration completes */ + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(dlm, res, old_master); + spin_unlock(&res->spinlock); + + mlog(0, "now time to do a migrate request to other nodes\n"); + ret = dlm_do_migrate_request(dlm, res, old_master, + dlm->node_num, &iter); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + mlog(0, "doing assert master of %.*s to all except the original node\n", + res->lockname.len, res->lockname.name); + /* this call now finishes out the nodemap + * even if one or more nodes die */ + ret = dlm_do_assert_master(dlm, res, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + /* no longer need to retry. all living nodes contacted. */ + mlog_errno(ret); + ret = 0; + } + + memset(iter.node_map, 0, sizeof(iter.node_map)); + set_bit(old_master, iter.node_map); + mlog(0, "doing assert master of %.*s back to %u\n", + res->lockname.len, res->lockname.name, old_master); + ret = dlm_do_assert_master(dlm, res, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + mlog(0, "assert master to original master failed " + "with %d.\n", ret); + /* the only nonzero status here would be because of + * a dead original node. we're done. */ + ret = 0; + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, dlm->node_num); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + /* re-dirty it on the new master */ + dlm_kick_thread(dlm, res); + wake_up(&res->wq); +leave: + return ret; +} + +/* + * LOCKRES AST REFCOUNT + * this is integral to migration + */ + +/* for future intent to call an ast, reserve one ahead of time. + * this should be called only after waiting on the lockres + * with dlm_wait_on_lockres, and while still holding the + * spinlock after the call. */ +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + __dlm_print_one_lock_resource(res); + } + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + + atomic_inc(&res->asts_reserved); +} + +/* + * used to drop the reserved ast, either because it went unused, + * or because the ast/bast was actually called. + * + * also, if there is a pending migration on this lockres, + * and this was the last pending ast on the lockres, + * atomically set the MIGRATING flag before we drop the lock. + * this is how we ensure that migration can proceed with no + * asts in progress. note that it is ok if the state of the + * queues is such that a lock should be granted in the future + * or that a bast should be fired, because the new master will + * shuffle the lists on this lockres as soon as it is migrated. + */ +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) + return; + + if (!res->migration_pending) { + spin_unlock(&res->spinlock); + return; + } + + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + res->migration_pending = 0; + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + wake_up(&dlm->migration_wq); +} + +void dlm_force_free_mles(struct dlm_ctxt *dlm) +{ + int i; + struct hlist_head *bucket; + struct dlm_master_list_entry *mle; + struct hlist_node *tmp; + + /* + * We notified all other nodes that we are exiting the domain and + * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still + * around we force free them and wake any processes that are waiting + * on the mles + */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); + BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { + if (mle->type != DLM_MLE_BLOCK) { + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + } + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + + __dlm_unlink_mle(dlm, mle); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} diff --git a/kernel/fs/ocfs2/dlm/dlmrecovery.c b/kernel/fs/ocfs2/dlm/dlmrecovery.c new file mode 100644 index 000000000..ce12e0b1a --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmrecovery.c @@ -0,0 +1,2935 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmrecovery.c + * + * recovery stuff + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) +#include "cluster/masklog.h" + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); + +static int dlm_recovery_thread(void *data); +static int dlm_do_recovery(struct dlm_ctxt *dlm); + +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); +static int dlm_request_all_locks(struct dlm_ctxt *dlm, + u8 request_from, u8 dead_node); +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master); +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks); +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres); +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm); +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, + u8 dead_node, u8 send_to); +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node); +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, u8 dead_node); +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master); +static void dlm_reco_ast(void *astdata); +static void dlm_reco_bast(void *astdata, int blocked_type); +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st); +static void dlm_request_all_locks_worker(struct dlm_work_item *item, + void *data); +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master); + +static u64 dlm_get_next_mig_cookie(void); + +static DEFINE_SPINLOCK(dlm_reco_state_lock); +static DEFINE_SPINLOCK(dlm_mig_cookie_lock); +static u64 dlm_mig_cookie = 1; + +static u64 dlm_get_next_mig_cookie(void) +{ + u64 c; + spin_lock(&dlm_mig_cookie_lock); + c = dlm_mig_cookie; + if (dlm_mig_cookie == (~0ULL)) + dlm_mig_cookie = 1; + else + dlm_mig_cookie++; + spin_unlock(&dlm_mig_cookie_lock); + return c; +} + +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + +static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + spin_unlock(&dlm->spinlock); +} + +/* Worker function used during recovery. */ +void dlm_dispatch_work(struct work_struct *work) +{ + struct dlm_ctxt *dlm = + container_of(work, struct dlm_ctxt, dispatched_work); + LIST_HEAD(tmp_list); + struct dlm_work_item *item, *next; + dlm_workfunc_t *workfunc; + int tot=0; + + spin_lock(&dlm->work_lock); + list_splice_init(&dlm->work_list, &tmp_list); + spin_unlock(&dlm->work_lock); + + list_for_each_entry(item, &tmp_list, list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_entry_safe(item, next, &tmp_list, list) { + workfunc = item->func; + list_del_init(&item->list); + + /* already have ref on dlm to avoid having + * it disappear. just double-check. */ + BUG_ON(item->dlm != dlm); + + /* this is allowed to sleep and + * call network stuff */ + workfunc(item, item->data); + + dlm_put(dlm); + kfree(item); + } +} + +/* + * RECOVERY THREAD + */ + +void dlm_kick_recovery_thread(struct dlm_ctxt *dlm) +{ + /* wake the recovery thread + * this will wake the reco thread in one of three places + * 1) sleeping with no recovery happening + * 2) sleeping with recovery mastered elsewhere + * 3) recovery mastered here, waiting on reco data */ + + wake_up(&dlm->dlm_reco_thread_wq); +} + +/* Launch the recovery thread */ +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "starting dlm recovery thread...\n"); + + dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, + "dlm_reco_thread"); + if (IS_ERR(dlm->dlm_reco_thread_task)) { + mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); + dlm->dlm_reco_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_reco_thread_task) { + mlog(0, "waiting for dlm recovery thread to exit\n"); + kthread_stop(dlm->dlm_reco_thread_task); + dlm->dlm_reco_thread_task = NULL; + } +} + + + +/* + * this is lame, but here's how recovery works... + * 1) all recovery threads cluster wide will work on recovering + * ONE node at a time + * 2) negotiate who will take over all the locks for the dead node. + * thats right... ALL the locks. + * 3) once a new master is chosen, everyone scans all locks + * and moves aside those mastered by the dead guy + * 4) each of these locks should be locked until recovery is done + * 5) the new master collects up all of secondary lock queue info + * one lock at a time, forcing each node to communicate back + * before continuing + * 6) each secondary lock queue responds with the full known lock info + * 7) once the new master has run all its locks, it sends a ALLDONE! + * message to everyone + * 8) upon receiving this message, the secondary queue node unlocks + * and responds to the ALLDONE + * 9) once the new master gets responses from everyone, he unlocks + * everything and recovery for this dead node is done + *10) go back to 2) while there are still dead nodes + * + */ + +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} + +#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) + +static int dlm_recovery_thread(void *data) +{ + int status; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + if (dlm_domain_fully_joined(dlm)) { + status = dlm_do_recovery(dlm); + if (status == -EAGAIN) { + /* do not sleep, recheck immediately. */ + continue; + } + if (status < 0) + mlog_errno(status); + } + + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM recovery thread\n"); + return 0; +} + +/* returns true when the recovery master has contacted us */ +static int dlm_reco_master_ready(struct dlm_ctxt *dlm) +{ + int ready; + spin_lock(&dlm->spinlock); + ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM); + spin_unlock(&dlm->spinlock); + return ready; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) +{ + int dead; + spin_lock(&dlm->spinlock); + dead = !test_bit(node, dlm->domain_map); + spin_unlock(&dlm->spinlock); + return dead; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + +void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (dlm_is_node_dead(dlm, node)) + return; + + printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in " + "domain %s\n", node, dlm->name); + + if (timeout) + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + else + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node)); +} + +void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (dlm_is_node_recovered(dlm, node)) + return; + + printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in " + "domain %s\n", node, dlm->name); + + if (timeout) + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + else + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); +} + +/* callers of the top-level api calls (dlmlock/dlmunlock) should + * block on the dlm->reco.event when recovery is in progress. + * the dlm recovery thread will set this state when it begins + * recovering a dead node (as the new master or not) and clear + * the state and wake as soon as all affected lock resources have + * been marked with the RECOVERY flag */ +static int dlm_in_recovery(struct dlm_ctxt *dlm) +{ + int in_recovery; + spin_lock(&dlm->spinlock); + in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + spin_unlock(&dlm->spinlock); + return in_recovery; +} + + +void dlm_wait_for_recovery(struct dlm_ctxt *dlm) +{ + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } + wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); +} + +static void dlm_begin_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", + dlm->name, dlm->reco.dead_node); + dlm->reco.state |= DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); +} + +static void dlm_end_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); + dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); + printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name); + wake_up(&dlm->reco.event); +} + +static void dlm_print_recovery_master(struct dlm_ctxt *dlm) +{ + printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the " + "dead node %u in domain %s\n", dlm->reco.new_master, + (dlm->node_num == dlm->reco.new_master ? "me" : "he"), + dlm->reco.dead_node, dlm->name); +} + +static int dlm_do_recovery(struct dlm_ctxt *dlm) +{ + int status = 0; + int ret; + + spin_lock(&dlm->spinlock); + + /* check to see if the new master has died */ + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && + test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "new master %u died while recovering %u!\n", + dlm->reco.new_master, dlm->reco.dead_node); + /* unset the new_master, leave dead_node */ + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); + } + + /* select a target to recover */ + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + int bit; + + bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES || bit < 0) + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + else + dlm_set_reco_dead_node(dlm, bit); + } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { + /* BUG? */ + mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", + dlm->reco.dead_node); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + } + + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + // mlog(0, "nothing to recover! sleeping now!\n"); + spin_unlock(&dlm->spinlock); + /* return to main thread loop and sleep. */ + return 0; + } + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.dead_node); + spin_unlock(&dlm->spinlock); + + /* take write barrier */ + /* (stops the list reshuffling thread, proxy ast handling) */ + dlm_begin_recovery(dlm); + + if (dlm->reco.new_master == dlm->node_num) + goto master_here; + + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + /* choose a new master, returns 0 if this node + * is the master, -EEXIST if it's another node. + * this does not return until a new master is chosen + * or recovery completes entirely. */ + ret = dlm_pick_recovery_master(dlm); + if (!ret) { + /* already notified everyone. go. */ + goto master_here; + } + mlog(0, "another node will master this recovery session.\n"); + } + + dlm_print_recovery_master(dlm); + + /* it is safe to start everything back up here + * because all of the dead node's lock resources + * have been marked as in-recovery */ + dlm_end_recovery(dlm); + + /* sleep out in main dlm_recovery_thread loop. */ + return 0; + +master_here: + dlm_print_recovery_master(dlm); + + status = dlm_remaster_locks(dlm, dlm->reco.dead_node); + if (status < 0) { + /* we should never hit this anymore */ + mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, " + "retrying.\n", dlm->name, status, dlm->reco.dead_node); + /* yield a bit to allow any final network messages + * to get handled on remaining nodes */ + msleep(100); + } else { + /* success! see if any other nodes need recovery */ + mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", + dlm->name, dlm->reco.dead_node, dlm->node_num); + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + } + dlm_end_recovery(dlm); + + /* continue and look for another dead node */ + return -EAGAIN; +} + +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) +{ + int status = 0; + struct dlm_reco_node_data *ndata; + int all_nodes_done; + int destroy = 0; + int pass = 0; + + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. */ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); + + /* safe to access the node data list without a lock, since this + * process is the only one to change the list */ + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); + ndata->state = DLM_RECO_NODE_DATA_REQUESTING; + + mlog(0, "%s: Requesting lock info from node %u\n", dlm->name, + ndata->node_num); + + if (ndata->node_num == dlm->node_num) { + ndata->state = DLM_RECO_NODE_DATA_DONE; + continue; + } + + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } + } + } while (status != 0); + + spin_lock(&dlm_reco_state_lock); + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + case DLM_RECO_NODE_DATA_REQUESTED: + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after requesting " + "recovery info for node %u\n", + ndata->node_num, dead_node); + /* fine. don't need this node's info. + * continue without it. */ + break; + case DLM_RECO_NODE_DATA_REQUESTING: + ndata->state = DLM_RECO_NODE_DATA_REQUESTED; + mlog(0, "now receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + mlog(0, "already receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_DONE: + mlog(0, "already DONE receiving recovery data " + "from node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + } + spin_unlock(&dlm_reco_state_lock); + } + + mlog(0, "%s: Done requesting all lock info\n", dlm->name); + + /* nodes should be sending reco data now + * just need to wait */ + + while (1) { + /* check all the nodes now to see if we are + * done, or if anyone died */ + all_nodes_done = 1; + spin_lock(&dlm_reco_state_lock); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + mlog(0, "checking recovery state of node %u\n", + ndata->node_num); + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(ML_ERROR, "bad ndata state for " + "node %u: state=%d\n", + ndata->node_num, ndata->state); + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after " + "requesting recovery info for " + "node %u\n", ndata->node_num, + dead_node); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? + "receiving" : "requested"); + all_nodes_done = 0; + break; + case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, + all_nodes_done?"yes":"no"); + if (all_nodes_done) { + int ret; + + /* Set this flag on recovery master to avoid + * a new recovery for another dead node start + * before the recovery is not done. That may + * cause recovery hung.*/ + spin_lock(&dlm->spinlock); + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state + * just send a finalize message to everyone and + * clean up */ + mlog(0, "all nodes are done! send finalize\n"); + ret = dlm_send_finalize_reco_message(dlm); + if (ret < 0) + mlog_errno(ret); + + spin_lock(&dlm->spinlock); + dlm_finish_local_lockres_recovery(dlm, dead_node, + dlm->node_num); + spin_unlock(&dlm->spinlock); + mlog(0, "should be done with recovery!\n"); + + mlog(0, "finishing recovery of %s at %lu, " + "dead=%u, this=%u, new=%u\n", dlm->name, + jiffies, dlm->reco.dead_node, + dlm->node_num, dlm->reco.new_master); + destroy = 1; + status = 0; + /* rescan everything marked dirty along the way */ + dlm_kick_thread(dlm, NULL); + break; + } + /* wait to be signalled, with periodic timeout + * to check for node death */ + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS)); + + } + + if (destroy) + dlm_destroy_recovery_area(dlm, dead_node); + + return status; +} + +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + int num=0; + struct dlm_reco_node_data *ndata; + + spin_lock(&dlm->spinlock); + memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + /* nodes can only be removed (by dying) after dropping + * this lock, and death will be trapped later, so this should do */ + spin_unlock(&dlm->spinlock); + + while (1) { + num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num); + if (num >= O2NM_MAX_NODES) { + break; + } + BUG_ON(num == dead_node); + + ndata = kzalloc(sizeof(*ndata), GFP_NOFS); + if (!ndata) { + dlm_destroy_recovery_area(dlm, dead_node); + return -ENOMEM; + } + ndata->node_num = num; + ndata->state = DLM_RECO_NODE_DATA_INIT; + spin_lock(&dlm_reco_state_lock); + list_add_tail(&ndata->list, &dlm->reco.node_data); + spin_unlock(&dlm_reco_state_lock); + num++; + } + + return 0; +} + +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_reco_node_data *ndata, *next; + LIST_HEAD(tmplist); + + spin_lock(&dlm_reco_state_lock); + list_splice_init(&dlm->reco.node_data, &tmplist); + spin_unlock(&dlm_reco_state_lock); + + list_for_each_entry_safe(ndata, next, &tmplist, list) { + list_del_init(&ndata->list); + kfree(ndata); + } +} + +static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, + u8 dead_node) +{ + struct dlm_lock_request lr; + int ret; + int status; + + mlog(0, "\n"); + + + mlog(0, "dlm_request_all_locks: dead node is %u, sending request " + "to %u\n", dead_node, request_from); + + memset(&lr, 0, sizeof(lr)); + lr.node_idx = dlm->node_num; + lr.dead_node = dead_node; + + // send message + ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, + &lr, sizeof(lr), request_from, &status); + + /* negative status is handled by caller */ + if (ret < 0) + mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " + "to recover dead node %u\n", dlm->name, ret, + request_from, dead_node); + else + ret = status; + // return from here, then + // sleep until all received or error + return ret; + +} + +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; + char *buf = NULL; + struct dlm_work_item *item = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } + BUG_ON(lr->dead_node != dlm->reco.dead_node); + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + dlm_put(dlm); + return -ENOMEM; + } + + /* this will get freed by dlm_request_all_locks_worker */ + buf = (char *) __get_free_page(GFP_NOFS); + if (!buf) { + kfree(item); + dlm_put(dlm); + return -ENOMEM; + } + + /* queue up work for dlm_request_all_locks_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf); + item->u.ral.reco_master = lr->node_idx; + item->u.ral.dead_node = lr->dead_node; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + + dlm_put(dlm); + return 0; +} + +static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_migratable_lockres *mres; + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; + LIST_HEAD(resources); + int ret; + u8 dead_node, reco_master; + int skip_all_done = 0; + + dlm = item->dlm; + dead_node = item->u.ral.dead_node; + reco_master = item->u.ral.reco_master; + mres = (struct dlm_migratable_lockres *)data; + + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + + if (dead_node != dlm->reco.dead_node || + reco_master != dlm->reco.new_master) { + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. */ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; + } + + /* lock resources should have already been moved to the + * dlm->reco.resources list. now move items from that list + * to a temp list if the dead owner matches. note that the + * whole cluster recovers only one node at a time, so we + * can safely move UNKNOWN lock resources for each recovery + * session. */ + dlm_move_reco_locks_to_list(dlm, &resources, dead_node); + + /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ + list_for_each_entry(res, &resources, recovering) { + ret = dlm_send_one_lockres(dlm, res, mres, reco_master, + DLM_MRES_RECOVERY); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } + } + + /* move the resources back to the list */ + spin_lock(&dlm->spinlock); + list_splice_init(&resources, &dlm->reco.resources); + spin_unlock(&dlm->spinlock); + + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: + free_page((unsigned long)data); +} + + +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) +{ + int ret, tmpret; + struct dlm_reco_data_done done_msg; + + memset(&done_msg, 0, sizeof(done_msg)); + done_msg.node_idx = dlm->node_num; + done_msg.dead_node = dead_node; + mlog(0, "sending DATA DONE message to %u, " + "my node=%u, dead node=%u\n", send_to, done_msg.node_idx, + done_msg.dead_node); + + ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, + sizeof(done_msg), send_to, &tmpret); + if (ret < 0) { + mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u " + "to recover dead node %u\n", dlm->name, ret, send_to, + dead_node); + if (!dlm_is_host_down(ret)) { + BUG(); + } + } else + ret = tmpret; + return ret; +} + + +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; + struct dlm_reco_node_data *ndata = NULL; + int ret = -EINVAL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); + + spin_lock(&dlm_reco_state_lock); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + if (ndata->node_num != done->node_idx) + continue; + + switch (ndata->state) { + /* should have moved beyond INIT but not to FINALIZE yet */ + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_DEAD: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(ML_ERROR, "bad ndata state for node %u:" + " state=%d\n", ndata->node_num, + ndata->state); + BUG(); + break; + /* these states are possible at this point, anywhere along + * the line of recovery */ + case DLM_RECO_NODE_DATA_DONE: + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(0, "node %u is DONE sending " + "recovery data!\n", + ndata->node_num); + + ndata->state = DLM_RECO_NODE_DATA_DONE; + ret = 0; + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + /* wake the recovery thread, some node is done */ + if (!ret) + dlm_kick_recovery_thread(dlm); + + if (ret < 0) + mlog(ML_ERROR, "failed to find recovery node data for node " + "%u\n", done->node_idx); + dlm_put(dlm); + + mlog(0, "leaving reco data done handler, ret=%d\n", ret); + return ret; +} + +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, + u8 dead_node) +{ + struct dlm_lock_resource *res, *next; + struct dlm_lock *lock; + + spin_lock(&dlm->spinlock); + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK + * - do manually */ + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); + continue; + } + + if (res->owner == dead_node) { + mlog(0, "found lockres owned by dead node while " + "doing recovery for node %u. sending it.\n", + dead_node); + list_move_tail(&res->recovering, list); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "found UNKNOWN owner while doing recovery " + "for node %u. sending it.\n", dead_node); + list_move_tail(&res->recovering, list); + } + } + spin_unlock(&dlm->spinlock); +} + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res) +{ + int total_locks = 0; + struct list_head *iter, *queue = &res->granted; + int i; + + for (i=0; i<3; i++) { + list_for_each(iter, queue) + total_locks++; + queue++; + } + return total_locks; +} + + +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks) +{ + u64 mig_cookie = be64_to_cpu(mres->mig_cookie); + int mres_total_locks = be32_to_cpu(mres->total_locks); + int sz, ret = 0, status = 0; + u8 orig_flags = mres->flags, + orig_master = mres->master; + + BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS); + if (!mres->num_locks) + return 0; + + sz = sizeof(struct dlm_migratable_lockres) + + (mres->num_locks * sizeof(struct dlm_migratable_lock)); + + /* add an all-done flag if we reached the last lock */ + orig_flags = mres->flags; + BUG_ON(total_locks > mres_total_locks); + if (total_locks == mres_total_locks) + mres->flags |= DLM_MRES_ALL_DONE; + + mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", + send_to); + + /* send it */ + ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, + sz, send_to, &status); + if (ret < 0) { + /* XXX: negative status is not handled. + * this will end up killing this node. */ + mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to " + "node %u (%s)\n", dlm->name, mres->lockname_len, + mres->lockname, ret, send_to, + (orig_flags & DLM_MRES_MIGRATION ? + "migration" : "recovery")); + } else { + /* might get an -ENOMEM back here */ + ret = status; + if (ret < 0) { + mlog_errno(ret); + + if (ret == -EFAULT) { + mlog(ML_ERROR, "node %u told me to kill " + "myself!\n", send_to); + BUG(); + } + } + } + + /* zero and reinit the message buffer */ + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, mres_total_locks, + mig_cookie, orig_flags, orig_master); + return ret; +} + +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master) +{ + /* mres here is one full page */ + clear_page(mres); + mres->lockname_len = namelen; + memcpy(mres->lockname, lockname, namelen); + mres->num_locks = 0; + mres->total_locks = cpu_to_be32(total_locks); + mres->mig_cookie = cpu_to_be64(cookie); + mres->flags = flags; + mres->master = master; +} + +static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, + int queue) +{ + if (!lock->lksb) + return; + + /* Ignore lvb in all locks in the blocked list */ + if (queue == DLM_BLOCKED_LIST) + return; + + /* Only consider lvbs in locks with granted EX or PR lock levels */ + if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE) + return; + + if (dlm_lvb_is_empty(mres->lvb)) { + memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); + return; + } + + /* Ensure the lvb copied for migration matches in other valid locks */ + if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN)) + return; + + mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, " + "node=%u\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->lockres->lockname.len, lock->lockres->lockname.name, + lock->ml.node); + dlm_print_one_lock_resource(lock->lockres); + BUG(); +} + +/* returns 1 if this lock fills the network structure, + * 0 otherwise */ +static int dlm_add_lock_to_array(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, int queue) +{ + struct dlm_migratable_lock *ml; + int lock_num = mres->num_locks; + + ml = &(mres->ml[lock_num]); + ml->cookie = lock->ml.cookie; + ml->type = lock->ml.type; + ml->convert_type = lock->ml.convert_type; + ml->highest_blocked = lock->ml.highest_blocked; + ml->list = queue; + if (lock->lksb) { + ml->flags = lock->lksb->flags; + dlm_prepare_lvb_for_migration(lock, mres, queue); + } + ml->node = lock->ml.node; + mres->num_locks++; + /* we reached the max, send this network message */ + if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS) + return 1; + return 0; +} + +static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres) +{ + struct dlm_lock dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.ml.cookie = 0; + dummy.ml.type = LKM_IVMODE; + dummy.ml.convert_type = LKM_IVMODE; + dummy.ml.highest_blocked = LKM_IVMODE; + dummy.lksb = NULL; + dummy.ml.node = dlm->node_num; + dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); +} + +static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lock *ml, + u8 *nodenum) +{ + if (unlikely(ml->cookie == 0 && + ml->type == LKM_IVMODE && + ml->convert_type == LKM_IVMODE && + ml->highest_blocked == LKM_IVMODE && + ml->list == DLM_BLOCKED_LIST)) { + *nodenum = ml->node; + return 1; + } + return 0; +} + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, u8 flags) +{ + struct list_head *queue; + int total_locks, i; + u64 mig_cookie = 0; + struct dlm_lock *lock; + int ret = 0; + + BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + mlog(0, "sending to %u\n", send_to); + + total_locks = dlm_num_locks_in_lockres(res); + if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) { + /* rare, but possible */ + mlog(0, "argh. lockres has %d locks. this will " + "require more than one network packet to " + "migrate\n", total_locks); + mig_cookie = dlm_get_next_mig_cookie(); + } + + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, total_locks, + mig_cookie, flags, res->owner); + + total_locks = 0; + for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry(lock, queue, list) { + /* add another lock. */ + total_locks++; + if (!dlm_add_lock_to_array(lock, mres, i)) + continue; + + /* this filled the lock message, + * we must send it immediately. */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, + res, total_locks); + if (ret < 0) + goto error; + } + } + if (total_locks == 0) { + /* send a dummy lock to indicate a mastery reference only */ + mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", + dlm->name, res->lockname.len, res->lockname.name, + send_to, flags & DLM_MRES_RECOVERY ? "recovery" : + "migration"); + dlm_add_dummy_lock(dlm, mres); + } + /* flush any remaining locks */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) + BUG(); + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); + return ret; +} + + + +/* + * this message will contain no more than one page worth of + * recovery data, and it will work on only one lockres. + * there may be many locks in this page, and we may need to wait + * for additional packets to complete all the locks (rare, but + * possible). + */ +/* + * NOTE: the allocation error cases here are scary + * we really cannot afford to fail an alloc in recovery + * do we spin? returning an error only delays the problem really + */ + +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_migratable_lockres *mres = + (struct dlm_migratable_lockres *)msg->buf; + int ret = 0; + u8 real_master; + u8 extra_refs = 0; + char *buf = NULL; + struct dlm_work_item *item = NULL; + struct dlm_lock_resource *res = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + real_master = mres->master; + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* cannot migrate a lockres with no master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + } + + mlog(0, "%s message received from node %u\n", + (mres->flags & DLM_MRES_RECOVERY) ? + "recovery" : "migration", mres->master); + if (mres->flags & DLM_MRES_ALL_DONE) + mlog(0, "all done flag. all lockres data received!\n"); + + ret = -ENOMEM; + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!buf || !item) + goto leave; + + /* lookup the lock to see if we have a secondary queue for this + * already... just add the locks in and this will have its owner + * and RECOVERY flag changed when it completes. */ + res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + if (res) { + /* this will get a ref on res */ + /* mark it as recovering/migrating and hash it */ + spin_lock(&res->spinlock); + if (mres->flags & DLM_MRES_RECOVERY) { + res->state |= DLM_LOCK_RES_RECOVERING; + } else { + if (res->state & DLM_LOCK_RES_MIGRATING) { + /* this is at least the second + * lockres message */ + mlog(0, "lock %.*s is already migrating\n", + mres->lockname_len, + mres->lockname); + } else if (res->state & DLM_LOCK_RES_RECOVERING) { + /* caller should BUG */ + mlog(ML_ERROR, "node is attempting to migrate " + "lock %.*s, but marked as recovering!\n", + mres->lockname_len, mres->lockname); + ret = -EFAULT; + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + goto leave; + } + res->state |= DLM_LOCK_RES_MIGRATING; + } + spin_unlock(&res->spinlock); + } else { + /* need to allocate, just like if it was + * mastered here normally */ + res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); + if (!res) + goto leave; + + /* to match the ref that we would have gotten if + * dlm_lookup_lockres had succeeded */ + dlm_lockres_get(res); + + /* mark it as recovering/migrating and hash it */ + if (mres->flags & DLM_MRES_RECOVERY) + res->state |= DLM_LOCK_RES_RECOVERING; + else + res->state |= DLM_LOCK_RES_MIGRATING; + + spin_lock(&dlm->spinlock); + __dlm_insert_lockres(dlm, res); + spin_unlock(&dlm->spinlock); + + /* Add an extra ref for this lock-less lockres lest the + * dlm_thread purges it before we get the chance to add + * locks to it */ + dlm_lockres_get(res); + + /* There are three refs that need to be put. + * 1. Taken above. + * 2. kref_init in dlm_new_lockres()->dlm_init_lockres(). + * 3. dlm_lookup_lockres() + * The first one is handled at the end of this function. The + * other two are handled in the worker thread after locks have + * been attached. Yes, we don't wait for purge time to match + * kref_init. The lockres will still have atleast one ref + * added because it is in the hash __dlm_insert_lockres() */ + extra_refs++; + + /* now that the new lockres is inserted, + * make it usable by other processes */ + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + + /* at this point we have allocated everything we need, + * and we have a hashed lockres with an extra ref and + * the proper res->state flags. */ + ret = 0; + spin_lock(&res->spinlock); + /* drop this either when master requery finds a different master + * or when a lock is added by the recovery worker */ + dlm_lockres_grab_inflight_ref(dlm, res); + if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* migration cannot have an unknown master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + mlog(0, "recovery has passed me a lockres with an " + "unknown owner.. will need to requery: " + "%.*s\n", mres->lockname_len, mres->lockname); + } else { + /* take a reference now to pin the lockres, drop it + * when locks are added in the worker */ + dlm_change_lockres_owner(dlm, res, dlm->node_num); + } + spin_unlock(&res->spinlock); + + /* queue up work for dlm_mig_lockres_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */ + dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf); + item->u.ml.lockres = res; /* already have a ref */ + item->u.ml.real_master = real_master; + item->u.ml.extra_ref = extra_refs; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + +leave: + /* One extra ref taken needs to be put here */ + if (extra_refs) + dlm_lockres_put(res); + + dlm_put(dlm); + if (ret < 0) { + kfree(buf); + kfree(item); + mlog_errno(ret); + } + + return ret; +} + + +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_migratable_lockres *mres; + int ret = 0; + struct dlm_lock_resource *res; + u8 real_master; + u8 extra_ref; + + dlm = item->dlm; + mres = (struct dlm_migratable_lockres *)data; + + res = item->u.ml.lockres; + real_master = item->u.ml.real_master; + extra_ref = item->u.ml.extra_ref; + + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* this case is super-rare. only occurs if + * node death happens during migration. */ +again: + ret = dlm_lockres_master_requery(dlm, res, &real_master); + if (ret < 0) { + mlog(0, "dlm_lockres_master_requery ret=%d\n", + ret); + goto again; + } + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lockres %.*s not claimed. " + "this node will take it.\n", + res->lockname.len, res->lockname.name); + } else { + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + mlog(0, "master needs to respond to sender " + "that node %u still owns %.*s\n", + real_master, res->lockname.len, + res->lockname.name); + /* cannot touch this lockres */ + goto leave; + } + } + + ret = dlm_process_recovery_data(dlm, res, mres); + if (ret < 0) + mlog(0, "dlm_process_recovery_data returned %d\n", ret); + else + mlog(0, "dlm_process_recovery_data succeeded\n"); + + if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) == + (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) { + ret = dlm_finish_migration(dlm, res, mres->master); + if (ret < 0) + mlog_errno(ret); + } + +leave: + /* See comment in dlm_mig_lockres_handler() */ + if (res) { + if (extra_ref) + dlm_lockres_put(res); + dlm_lockres_put(res); + } + kfree(data); +} + + + +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master) +{ + struct dlm_node_iter iter; + int nodenum; + int ret = 0; + + *real_master = DLM_LOCK_RES_OWNER_UNKNOWN; + + /* we only reach here if one of the two nodes in a + * migration died while the migration was in progress. + * at this point we need to requery the master. we + * know that the new_master got as far as creating + * an mle on at least one node, but we do not know + * if any nodes had actually cleared the mle and set + * the master to the new_master. the old master + * is supposed to set the owner to UNKNOWN in the + * event of a new_master death, so the only possible + * responses that we can get from nodes here are + * that the master is new_master, or that the master + * is UNKNOWN. + * if all nodes come back with UNKNOWN then we know + * the lock needs remastering here. + * if any node comes back with a valid master, check + * to see if that master is the one that we are + * recovering. if so, then the new_master died and + * we need to remaster this lock. if not, then the + * new_master survived and that node will respond to + * other nodes about the owner. + * if there is an owner, this node needs to dump this + * lockres and alert the sender that this lockres + * was rejected. */ + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + /* do not send to self */ + if (nodenum == dlm->node_num) + continue; + ret = dlm_do_master_requery(dlm, res, nodenum, real_master); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + /* host is down, so answer for that node would be + * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ + } + if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lock master is %u\n", *real_master); + break; + } + } + return ret; +} + + +int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master) +{ + int ret = -EINVAL; + struct dlm_master_requery req; + int status = DLM_LOCK_RES_OWNER_UNKNOWN; + + memset(&req, 0, sizeof(req)); + req.node_idx = dlm->node_num; + req.namelen = res->lockname.len; + memcpy(req.name, res->lockname.name, res->lockname.len); + +resend: + ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, + &req, sizeof(req), nodenum, &status); + if (ret < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, + dlm->key, nodenum); + else if (status == -ENOMEM) { + mlog_errno(status); + msleep(50); + goto resend; + } else { + BUG_ON(status < 0); + BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); + *real_master = (u8) (status & 0xff); + mlog(0, "node %u responded to master requery with %u\n", + nodenum, *real_master); + ret = 0; + } + return ret; +} + + +/* this function cannot error, so unless the sending + * or receiving of the message failed, the owner can + * be trusted */ +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; + struct dlm_lock_resource *res = NULL; + unsigned int hash; + int master = DLM_LOCK_RES_OWNER_UNKNOWN; + u32 flags = DLM_ASSERT_MASTER_REQUERY; + + if (!dlm_grab(dlm)) { + /* since the domain has gone away on this + * node, the proper response is UNKNOWN */ + return master; + } + + hash = dlm_lockid_hash(req->name, req->namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); + if (res) { + spin_lock(&res->spinlock); + master = res->owner; + if (master == dlm->node_num) { + int ret = dlm_dispatch_assert_master(dlm, res, + 0, 0, flags); + if (ret < 0) { + mlog_errno(ret); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + spin_unlock(&dlm->spinlock); + dlm_put(dlm); + /* sender will take care of this and retry */ + return ret; + } else + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); + } else { + /* put.. incase we are not the master */ + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + } + } + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + return master; +} + +static inline struct list_head * +dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num) +{ + struct list_head *ret; + BUG_ON(list_num < 0); + BUG_ON(list_num > 2); + ret = &(res->granted); + ret += list_num; + return ret; +} +/* TODO: do ast flush business + * TODO: do MIGRATING and RECOVERING spinning + */ + +/* +* NOTE about in-flight requests during migration: +* +* Before attempting the migrate, the master has marked the lockres as +* MIGRATING and then flushed all of its pending ASTS. So any in-flight +* requests either got queued before the MIGRATING flag got set, in which +* case the lock data will reflect the change and a return message is on +* the way, or the request failed to get in before MIGRATING got set. In +* this case, the caller will be told to spin and wait for the MIGRATING +* flag to be dropped, then recheck the master. +* This holds true for the convert, cancel and unlock cases, and since lvb +* updates are tied to these same messages, it applies to lvb updates as +* well. For the lock case, there is no way a lock can be on the master +* queue and not be on the secondary queue since the lock is always added +* locally first. This means that the new target node will never be sent +* a lock that he doesn't already have on the list. +* In total, this means that the local lock is correct and should not be +* updated to match the one sent by the master. Any messages sent back +* from the master before the MIGRATING flag will bring the lock properly +* up-to-date, and the change will be ordered properly for the waiter. +* We will *not* attempt to modify the lock underneath the waiter. +*/ + +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres) +{ + struct dlm_migratable_lock *ml; + struct list_head *queue, *iter; + struct list_head *tmpq = NULL; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + int ret = 0; + int i, j, bad; + struct dlm_lock *lock; + u8 from = O2NM_MAX_NODES; + unsigned int added = 0; + __be64 c; + + mlog(0, "running %d locks for this lockres\n", mres->num_locks); + for (i=0; inum_locks; i++) { + ml = &(mres->ml[i]); + + if (dlm_is_dummy_lock(dlm, ml, &from)) { + /* placeholder, just need to set the refmap bit */ + BUG_ON(mres->num_locks != 1); + mlog(0, "%s:%.*s: dummy lock for %u\n", + dlm->name, mres->lockname_len, mres->lockname, + from); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(dlm, res, from); + spin_unlock(&res->spinlock); + added++; + break; + } + BUG_ON(ml->highest_blocked != LKM_IVMODE); + newlock = NULL; + lksb = NULL; + + queue = dlm_list_num_to_pointer(res, ml->list); + tmpq = NULL; + + /* if the lock is for the local node it needs to + * be moved to the proper location within the queue. + * do not allocate a new lock structure. */ + if (ml->node == dlm->node_num) { + /* MIGRATION ONLY! */ + BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + + lock = NULL; + spin_lock(&res->spinlock); + for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { + tmpq = dlm_list_idx_to_ptr(res, j); + list_for_each(iter, tmpq) { + lock = list_entry(iter, + struct dlm_lock, list); + if (lock->ml.cookie == ml->cookie) + break; + lock = NULL; + } + if (lock) + break; + } + + /* lock is always created locally first, and + * destroyed locally last. it must be on the list */ + if (!lock) { + c = ml->cookie; + mlog(ML_ERROR, "Could not find local lock " + "with cookie %u:%llu, node %u, " + "list %u, flags 0x%x, type %d, " + "conv %d, highest blocked %d\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + ml->node, ml->list, ml->flags, ml->type, + ml->convert_type, ml->highest_blocked); + __dlm_print_one_lock_resource(res); + BUG(); + } + + if (lock->ml.node != ml->node) { + c = lock->ml.cookie; + mlog(ML_ERROR, "Mismatched node# in lock " + "cookie %u:%llu, name %.*s, node %u\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + res->lockname.len, res->lockname.name, + lock->ml.node); + c = ml->cookie; + mlog(ML_ERROR, "Migrate lock cookie %u:%llu, " + "node %u, list %u, flags 0x%x, type %d, " + "conv %d, highest blocked %d\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + ml->node, ml->list, ml->flags, ml->type, + ml->convert_type, ml->highest_blocked); + __dlm_print_one_lock_resource(res); + BUG(); + } + + if (tmpq != queue) { + c = ml->cookie; + mlog(0, "Lock cookie %u:%llu was on list %u " + "instead of list %u for %.*s\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + j, ml->list, res->lockname.len, + res->lockname.name); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); + continue; + } + + /* see NOTE above about why we do not update + * to match the master here */ + + /* move the lock to its proper place */ + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, queue); + spin_unlock(&res->spinlock); + added++; + + mlog(0, "just reordered a local lock!\n"); + continue; + } + + /* lock is for another node. */ + newlock = dlm_new_lock(ml->type, ml->node, + be64_to_cpu(ml->cookie), NULL); + if (!newlock) { + ret = -ENOMEM; + goto leave; + } + lksb = newlock->lksb; + dlm_lock_attach_lockres(newlock, res); + + if (ml->convert_type != LKM_IVMODE) { + BUG_ON(queue != &res->converting); + newlock->ml.convert_type = ml->convert_type; + } + lksb->flags |= (ml->flags & + (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + /* + * If the lock is in the blocked list it can't have a valid lvb, + * so skip it + */ + if (ml->list == DLM_BLOCKED_LIST) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { + if (lksb->flags & DLM_LKSB_PUT_LVB) { + /* other node was trying to update + * lvb when node died. recreate the + * lksb with the updated lvb. */ + memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. */ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); + } else { + /* otherwise, the node is sending its + * most recent valid lvb info */ + BUG_ON(ml->type != LKM_EXMODE && + ml->type != LKM_PRMODE); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; ilvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; ilvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); + } + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); + } + } +skip_lvb: + + /* NOTE: + * wrt lock queue ordering and recovery: + * 1. order of locks on granted queue is + * meaningless. + * 2. order of locks on converting queue is + * LOST with the node death. sorry charlie. + * 3. order of locks on the blocked queue is + * also LOST. + * order of locks does not affect integrity, it + * just means that a lock request may get pushed + * back in line as a result of the node death. + * also note that for a given node the lock order + * for its secondary queue locks is preserved + * relative to each other, but clearly *not* + * preserved relative to locks from other nodes. + */ + bad = 0; + spin_lock(&res->spinlock); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c))); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + if (mres->flags & DLM_MRES_RECOVERY && + ml->list == DLM_CONVERTING_LIST && + newlock->ml.type > + newlock->ml.convert_type) { + /* newlock is doing downconvert, add it to the + * head of converting list */ + list_add(&newlock->list, queue); + } else + list_add_tail(&newlock->list, queue); + mlog(0, "%s:%.*s: added lock for node %u, " + "setting refmap bit\n", dlm->name, + res->lockname.len, res->lockname.name, ml->node); + dlm_lockres_set_refmap_bit(dlm, res, ml->node); + added++; + } + spin_unlock(&res->spinlock); + } + mlog(0, "done running all the locks\n"); + +leave: + /* balance the ref taken when the work was queued */ + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int i; + struct list_head *queue; + struct dlm_lock *lock, *next; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + res->state |= DLM_LOCK_RES_RECOVERING; + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->recovering); + dlm_lockres_put(res); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); + list_add_tail(&res->recovering, &dlm->reco.resources); + + /* find any pending locks and put them back on proper list */ + for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + dlm_lock_get(lock); + if (lock->convert_pending) { + /* move converting lock back to granted */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with convert pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_revert_pending_convert(res, lock); + lock->convert_pending = 0; + } else if (lock->lock_pending) { + /* remove pending lock requests completely */ + BUG_ON(i != DLM_BLOCKED_LIST); + mlog(0, "node died with lock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + /* lock will be floating until ref in + * dlmlock_remote is freed after the network + * call returns. ok for it to not be on any + * list since no ast can be called + * (the master is dead). */ + dlm_revert_pending_lock(res, lock); + lock->lock_pending = 0; + } else if (lock->unlock_pending) { + /* if an unlock was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master. note that the dlm_unlock + * call is still responsible for calling + * the unlockast. that will happen after + * the network call times out. for now, + * just move lists to prepare the new + * recovery master. */ + BUG_ON(i != DLM_GRANTED_LIST); + mlog(0, "node died with unlock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_unlock(res, lock); + lock->unlock_pending = 0; + } else if (lock->cancel_pending) { + /* if a cancel was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with cancel pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_cancel(res, lock); + lock->cancel_pending = 0; + } + dlm_lock_put(lock); + } + } +} + + + +/* removes all recovered locks from the recovery list. + * sets the res->owner to the new master. + * unsets the RECOVERY flag and wakes waiters. */ +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master) +{ + int i; + struct hlist_head *bucket; + struct dlm_lock_resource *res, *next; + + assert_spin_locked(&dlm->spinlock); + + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { + if (res->owner == dead_node) { + mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->owner, new_master); + list_del_init(&res->recovering); + spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + if (__dlm_lockres_has_locks(res)) + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + } + + /* this will become unnecessary eventually, but + * for now we need to run the whole hash, clear + * the RECOVERING state and set the owner + * if necessary */ + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_lockres_hash(dlm, i); + hlist_for_each_entry(res, bucket, hash_node) { + if (!(res->state & DLM_LOCK_RES_RECOVERING)) + continue; + + if (res->owner != dead_node && + res->owner != dlm->node_num) + continue; + + if (!list_empty(&res->recovering)) { + list_del_init(&res->recovering); + dlm_lockres_put(res); + } + + /* new_master has our reference from + * the lock state sent during recovery */ + mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->owner, new_master); + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + if (__dlm_lockres_has_locks(res)) + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + } +} + +static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) +{ + if (local) { + if (lock->ml.type != LKM_EXMODE && + lock->ml.type != LKM_PRMODE) + return 1; + } else if (lock->ml.type == LKM_EXMODE) + return 1; + return 0; +} + +static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct list_head *queue; + struct dlm_lock *lock; + int blank_lvb = 0, local = 0; + int i; + u8 search_node; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (res->owner == dlm->node_num) + /* if this node owned the lockres, and if the dead node + * had an EX when he died, blank out the lvb */ + search_node = dead_node; + else { + /* if this is a secondary lockres, and we had no EX or PR + * locks granted, we can no longer trust the lvb */ + search_node = dlm->node_num; + local = 1; /* check local state for valid lvb */ + } + + for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node == search_node) { + if (dlm_lvb_needs_invalidation(lock, local)) { + /* zero the lksb lvb and lockres lvb */ + blank_lvb = 1; + memset(lock->lksb->lvb, 0, DLM_LVB_LEN); + } + } + } + } + + if (blank_lvb) { + mlog(0, "clearing %.*s lvb, dead node %u had EX\n", + res->lockname.len, res->lockname.name, dead_node); + memset(res->lvb, 0, DLM_LVB_LEN); + } +} + +static void dlm_free_dead_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct dlm_lock *lock, *next; + unsigned int freed = 0; + + /* this node is the lockres master: + * 1) remove any stale locks for the dead node + * 2) if the dead node had an EX when he died, blank out the lvb + */ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* We do two dlm_lock_put(). One for removing from list and the other is + * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ + + /* TODO: check pending_asts, pending_basts here */ + list_for_each_entry_safe(lock, next, &res->granted, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + list_for_each_entry_safe(lock, next, &res->converting, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + list_for_each_entry_safe(lock, next, &res->blocked, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + + if (freed) { + mlog(0, "%s:%.*s: freed %u locks for dead node %u, " + "dropping ref from lockres\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + if(!test_bit(dead_node, res->refmap)) { + mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, " + "but ref was not set\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + __dlm_print_one_lock_resource(res); + } + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); + } else if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", dlm->name, + res->lockname.len, res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); + } + + /* do not kick thread yet */ + __dlm_dirty_lockres(dlm, res); +} + +/* if this node is the recovery master, and there are no + * locks for a given lockres owned by this node that are in + * either PR or EX mode, zero out the lvb before requesting. + * + */ + + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_lock_resource *res; + int i; + struct hlist_head *bucket; + struct dlm_lock *lock; + + + /* purge any stale mles */ + dlm_clean_master_list(dlm, dead_node); + + /* + * now clean up all lock resources. there are two rules: + * + * 1) if the dead node was the master, move the lockres + * to the recovering list. set the RECOVERING flag. + * this lockres needs to be cleaned up before it can + * be used further. + * + * 2) if this node was the master, remove all locks from + * each of the lockres queues that were owned by the + * dead node. once recovery finishes, the dlm thread + * can be kicked again to see if any ASTs or BASTs + * need to be fired as a result. + */ + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_lockres_hash(dlm, i); + hlist_for_each_entry(res, bucket, hash_node) { + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule + * DLM_UNLOCK_FREE_LOCK + * - do manually */ + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); + continue; + } + spin_lock(&res->spinlock); + /* zero the lvb if necessary */ + dlm_revalidate_lvb(dlm, res, dead_node); + if (res->owner == dead_node) { + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(ML_NOTICE, "%s: res %.*s, Skip " + "recovery as it is being freed\n", + dlm->name, res->lockname.len, + res->lockname.name); + } else + dlm_move_lockres_to_recovery_list(dlm, + res); + + } else if (res->owner == dlm->node_num) { + dlm_free_dead_locks(dlm, res, dead_node); + __dlm_lockres_calc_usage(dlm, res); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); + } + } + spin_unlock(&res->spinlock); + } + } + +} + +static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) +{ + assert_spin_locked(&dlm->spinlock); + + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + + /* Clean up join state on node death. */ + if (dlm->joining_node == idx) { + mlog(0, "Clearing join state for node %u\n", idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + } + + /* check to see if the node is already considered dead */ + if (!test_bit(idx, dlm->live_nodes_map)) { + mlog(0, "for domain %s, node %d is already dead. " + "another node likely did recovery already.\n", + dlm->name, idx); + return; + } + + /* check to see if we do not care about this node */ + if (!test_bit(idx, dlm->domain_map)) { + /* This also catches the case that we get a node down + * but haven't joined the domain yet. */ + mlog(0, "node %u already removed from domain!\n", idx); + return; + } + + clear_bit(idx, dlm->live_nodes_map); + + /* make sure local cleanup occurs before the heartbeat events */ + if (!test_bit(idx, dlm->recovery_map)) + dlm_do_local_recovery_cleanup(dlm, idx); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, idx, 0); + + mlog(0, "node %u being removed from domain map!\n", idx); + clear_bit(idx, dlm->domain_map); + clear_bit(idx, dlm->exit_domain_map); + /* wake up migration waiters if a node goes down. + * perhaps later we can genericize this for other waiters. */ + wake_up(&dlm->migration_wq); + + if (test_bit(idx, dlm->recovery_map)) + mlog(0, "domain %s, node %u already added " + "to recovery map!\n", dlm->name, idx); + else + set_bit(idx, dlm->recovery_map); +} + +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + /* + * This will notify any dlm users that a node in our domain + * went away without notifying us first. + */ + if (test_bit(idx, dlm->domain_map)) + dlm_fire_domain_eviction_callbacks(dlm, idx); + + spin_lock(&dlm->spinlock); + __dlm_hb_node_down(dlm, idx); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + spin_lock(&dlm->spinlock); + set_bit(idx, dlm->live_nodes_map); + /* do NOT notify mle attached to the heartbeat events. + * new nodes are not interesting in mastery until joined. */ + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +static void dlm_reco_ast(void *astdata) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_bast(void *astdata, int blocked_type) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) +{ + mlog(0, "unlockast for recovery lock fired!\n"); +} + +/* + * dlm_pick_recovery_master will continually attempt to use + * dlmlock() on the special "$RECOVERY" lockres with the + * LKM_NOQUEUE flag to get an EX. every thread that enters + * this function on each node racing to become the recovery + * master will not stop attempting this until either: + * a) this node gets the EX (and becomes the recovery master), + * or b) dlm->reco.new_master gets set to some nodenum + * != O2NM_INVALID_NODE_NUM (another node will do the reco). + * so each time a recovery master is needed, the entire cluster + * will sync at this point. if the new master dies, that will + * be detected in dlm_do_recovery */ +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) +{ + enum dlm_status ret; + struct dlm_lockstatus lksb; + int status = -EINVAL; + + mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", + dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); +again: + memset(&lksb, 0, sizeof(lksb)); + + ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, + DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN, + dlm_reco_ast, dlm, dlm_reco_bast); + + mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", + dlm->name, ret, lksb.status); + + if (ret == DLM_NORMAL) { + mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", + dlm->name, dlm->node_num); + + /* got the EX lock. check to see if another node + * just became the reco master */ + if (dlm_reco_master_ready(dlm)) { + mlog(0, "%s: got reco EX lock, but %u will " + "do the recovery\n", dlm->name, + dlm->reco.new_master); + status = -EEXIST; + } else { + status = 0; + + /* see if recovery was already finished elsewhere */ + spin_lock(&dlm->spinlock); + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + status = -EINVAL; + mlog(0, "%s: got reco EX lock, but " + "node got recovered already\n", dlm->name); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(ML_ERROR, "%s: new master is %u " + "but no dead node!\n", + dlm->name, dlm->reco.new_master); + BUG(); + } + } + spin_unlock(&dlm->spinlock); + } + + /* if this node has actually become the recovery master, + * set the master and send the messages to begin recovery */ + if (!status) { + mlog(0, "%s: dead=%u, this=%u, sending " + "begin_reco now\n", dlm->name, + dlm->reco.dead_node, dlm->node_num); + status = dlm_send_begin_reco_message(dlm, + dlm->reco.dead_node); + /* this always succeeds */ + BUG_ON(status); + + /* set the new_master to this node */ + spin_lock(&dlm->spinlock); + dlm_set_reco_master(dlm, dlm->node_num); + spin_unlock(&dlm->spinlock); + } + + /* recovery lock is a special case. ast will not get fired, + * so just go ahead and unlock it. */ + ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); + if (ret == DLM_DENIED) { + mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n"); + ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm); + } + if (ret != DLM_NORMAL) { + /* this would really suck. this could only happen + * if there was a network error during the unlock + * because of node death. this means the unlock + * is actually "done" and the lock structure is + * even freed. we can continue, but only + * because this specific lock name is special. */ + mlog(ML_ERROR, "dlmunlock returned %d\n", ret); + } + } else if (ret == DLM_NOTQUEUED) { + mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", + dlm->name, dlm->node_num); + /* another node is master. wait on + * reco.new_master != O2NM_INVALID_NODE_NUM + * for at most one second */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_reco_master_ready(dlm), + msecs_to_jiffies(1000)); + if (!dlm_reco_master_ready(dlm)) { + mlog(0, "%s: reco master taking awhile\n", + dlm->name); + goto again; + } + /* another node has informed this one that it is reco master */ + mlog(0, "%s: reco master %u is ready to recover %u\n", + dlm->name, dlm->reco.new_master, dlm->reco.dead_node); + status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; + } else { + struct dlm_lock_resource *res; + + /* dlmlock returned something other than NOTQUEUED or NORMAL */ + mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " + "lksb.status=%s\n", dlm->name, dlm_errname(ret), + dlm_errname(lksb.status)); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + BUG(); + } + + return status; +} + +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_begin_reco br; + int ret = 0; + struct dlm_node_iter iter; + int nodenum; + int status; + + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + clear_bit(dead_node, iter.node_map); + + memset(&br, 0, sizeof(br)); + br.node_idx = dlm->node_num; + br.dead_node = dead_node; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = 0; + if (nodenum == dead_node) { + mlog(0, "not sending begin reco to dead node " + "%u\n", dead_node); + continue; + } + if (nodenum == dlm->node_num) { + mlog(0, "not sending begin reco to self\n"); + continue; + } +retry: + ret = -EINVAL; + mlog(0, "attempting to send begin reco msg to %d\n", + nodenum); + ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, + &br, sizeof(br), nodenum, &status); + /* negative status is handled ok by caller here */ + if (ret >= 0) + ret = status; + if (dlm_is_host_down(ret)) { + /* node is down. not involved in recovery + * so just keep going */ + mlog(ML_NOTICE, "%s: node %u was down when sending " + "begin reco msg (%d)\n", dlm->name, nodenum, ret); + ret = 0; + } + + /* + * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8, + * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN. + * We are handling both for compatibility reasons. + */ + if (ret == -EAGAIN || ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + msleep(100); + goto retry; + } + if (ret < 0) { + struct dlm_lock_resource *res; + + /* this is now a serious problem, possibly ENOMEM + * in the network stack. must retry */ + mlog_errno(ret); + mlog(ML_ERROR, "begin reco of dlm %s to node %u " + "returned %d\n", dlm->name, nodenum, ret); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + /* sleep for a bit in hopes that we can avoid + * another ENOMEM */ + msleep(100); + goto retry; + } + } + + return ret; +} + +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + dlm_put(dlm); + return -EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + + dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); + + spin_lock(&dlm->spinlock); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + if (test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "%s: new_master %u died, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + } else { + mlog(0, "%s: new_master %u NOT DEAD, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + /* may not have seen the new master as dead yet */ + } + } + if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: dead_node previously set to %u, " + "node %u changing it to %u\n", dlm->name, + dlm->reco.dead_node, br->node_idx, br->dead_node); + } + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); + if (!test_bit(br->dead_node, dlm->recovery_map)) { + mlog(0, "recovery master %u sees %u as dead, but this " + "node has not yet. marking %u as dead\n", + br->node_idx, br->dead_node, br->dead_node); + if (!test_bit(br->dead_node, dlm->domain_map) || + !test_bit(br->dead_node, dlm->live_nodes_map)) + mlog(0, "%u not in domain/live_nodes map " + "so setting it in reco map manually\n", + br->dead_node); + /* force the recovery cleanup in __dlm_hb_node_down + * both of these will be cleared in a moment */ + set_bit(br->dead_node, dlm->domain_map); + set_bit(br->dead_node, dlm->live_nodes_map); + __dlm_hb_node_down(dlm, br->dead_node); + } + spin_unlock(&dlm->spinlock); + + dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + + dlm_put(dlm); + return 0; +} + +#define DLM_FINALIZE_STAGE2 0x01 +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) +{ + int ret = 0; + struct dlm_finalize_reco fr; + struct dlm_node_iter iter; + int nodenum; + int status; + int stage = 1; + + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + +stage2: + memset(&fr, 0, sizeof(fr)); + fr.node_idx = dlm->node_num; + fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + if (nodenum == dlm->node_num) + continue; + ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, + &fr, sizeof(fr), nodenum, &status); + if (ret >= 0) + ret = status; + if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG, + dlm->key, nodenum); + if (dlm_is_host_down(ret)) { + /* this has no effect on this recovery + * session, so set the status to zero to + * finish out the last recovery */ + mlog(ML_ERROR, "node %u went down after this " + "node finished recovery.\n", nodenum); + ret = 0; + continue; + } + break; + } + } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } + + return ret; +} + +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + + spin_lock(&dlm->spinlock); + + if (dlm->reco.new_master != fr->node_idx) { + mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " + "%u is supposed to be the new master, dead=%u\n", + fr->node_idx, dlm->reco.new_master, fr->dead_node); + BUG(); + } + if (dlm->reco.dead_node != fr->dead_node) { + mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " + "node %u, but node %u is supposed to be dead\n", + fr->node_idx, fr->dead_node, dlm->reco.dead_node); + BUG(); + } + + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + spin_unlock(&dlm->spinlock); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } + + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); + + dlm_put(dlm); + return 0; +} diff --git a/kernel/fs/ocfs2/dlm/dlmthread.c b/kernel/fs/ocfs2/dlm/dlmthread.c new file mode 100644 index 000000000..69aac6f08 --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmthread.c @@ -0,0 +1,756 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmthread.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) +#include "cluster/masklog.h" + +static int dlm_thread(void *data); +static void dlm_flush_asts(struct dlm_ctxt *dlm); + +#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) + +/* will exit holding res->spinlock, but may drop in function */ +/* waits until flags are cleared on res->state */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + + assert_spin_locked(&res->spinlock); + + add_wait_queue(&res->wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (res->state & flags) { + spin_unlock(&res->spinlock); + schedule(); + spin_lock(&res->spinlock); + goto repeat; + } + remove_wait_queue(&res->wq, &wait); + __set_current_state(TASK_RUNNING); +} + +int __dlm_lockres_has_locks(struct dlm_lock_resource *res) +{ + if (list_empty(&res->granted) && + list_empty(&res->converting) && + list_empty(&res->blocked)) + return 0; + return 1; +} + +/* "unused": the lockres has no locks, is not on the dirty list, + * has no inflight locks (in the gap between mastery and acquiring + * the first lock), and has no bits in its refmap. + * truly ready to be freed. */ +int __dlm_lockres_unused(struct dlm_lock_resource *res) +{ + int bit; + + assert_spin_locked(&res->spinlock); + + if (__dlm_lockres_has_locks(res)) + return 0; + + /* Locks are in the process of being created */ + if (res->inflight_locks) + return 0; + + if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) + return 0; + + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + + /* Another node has this resource with this node as the master */ + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) + return 0; + + return 1; +} + + +/* Call whenever you may have added or deleted something from one of + * the lockres queue's. This will figure out whether it belongs on the + * unused list or not and does the appropriate thing. */ +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (__dlm_lockres_unused(res)){ + if (list_empty(&res->purge)) { + mlog(0, "%s: Adding res %.*s to purge list\n", + dlm->name, res->lockname.len, res->lockname.name); + + res->last_used = jiffies; + dlm_lockres_get(res); + list_add_tail(&res->purge, &dlm->purge_list); + dlm->purge_count++; + } + } else if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purge list\n", + dlm->name, res->lockname.len, res->lockname.name); + + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } +} + +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + + __dlm_lockres_calc_usage(dlm, res); + + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); +} + +static void dlm_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int master; + int ret = 0; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + master = (res->owner == dlm->node_num); + + mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name, + res->lockname.len, res->lockname.name, master); + + if (!master) { + res->state |= DLM_LOCK_RES_DROPPING_REF; + /* drop spinlock... retake below */ + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + /* This ensures that clear refmap is sent after the set */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + spin_unlock(&res->spinlock); + + /* clear our bit from the master's refmap, ignore errors */ + ret = dlm_drop_lockres_ref(dlm, res); + if (ret < 0) { + if (!dlm_is_host_down(ret)) + BUG(); + } + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + } + + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist, master %d\n", + dlm->name, res->lockname.len, res->lockname.name, master); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(dlm, res); + + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. */ + if (!master) { + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } else + spin_unlock(&res->spinlock); +} + +static void dlm_run_purge_list(struct dlm_ctxt *dlm, + int purge_now) +{ + unsigned int run_max, unused; + unsigned long purge_jiffies; + struct dlm_lock_resource *lockres; + + spin_lock(&dlm->spinlock); + run_max = dlm->purge_count; + + while(run_max && !list_empty(&dlm->purge_list)) { + run_max--; + + lockres = list_entry(dlm->purge_list.next, + struct dlm_lock_resource, purge); + + spin_lock(&lockres->spinlock); + + purge_jiffies = lockres->last_used + + msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); + + /* Make sure that we want to be processing this guy at + * this time. */ + if (!purge_now && time_after(purge_jiffies, jiffies)) { + /* Since resources are added to the purge list + * in tail order, we can stop at the first + * unpurgable resource -- anyone added after + * him will have a greater last_used value */ + spin_unlock(&lockres->spinlock); + break; + } + + /* Status of the lockres *might* change so double + * check. If the lockres is unused, holding the dlm + * spinlock will prevent people from getting and more + * refs on it. */ + unused = __dlm_lockres_unused(lockres); + if (!unused || + (lockres->state & DLM_LOCK_RES_MIGRATING) || + (lockres->inflight_assert_workers != 0)) { + mlog(0, "%s: res %.*s is in use or being remastered, " + "used %d, state %d, assert master workers %u\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, + !unused, lockres->state, + lockres->inflight_assert_workers); + list_move_tail(&lockres->purge, &dlm->purge_list); + spin_unlock(&lockres->spinlock); + continue; + } + + dlm_lockres_get(lockres); + + dlm_purge_lockres(dlm, lockres); + + dlm_lockres_put(lockres); + + /* Avoid adding any scheduling latencies */ + cond_resched_lock(&dlm->spinlock); + } + + spin_unlock(&dlm->spinlock); +} + +static void dlm_shuffle_lists(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct dlm_lock *lock, *target; + int can_grant = 1; + + /* + * Because this function is called with the lockres + * spinlock, and because we know that it is not migrating/ + * recovering/in-progress, it is fine to reserve asts and + * basts right before queueing them all throughout + */ + assert_spin_locked(&dlm->ast_lock); + assert_spin_locked(&res->spinlock); + BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_IN_PROGRESS))); + +converting: + if (list_empty(&res->converting)) + goto blocked; + mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name, + res->lockname.len, res->lockname.name); + + target = list_entry(res->converting.next, struct dlm_lock, list); + if (target->ml.convert_type == LKM_IVMODE) { + mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n", + dlm->name, res->lockname.len, res->lockname.name); + BUG(); + } + list_for_each_entry(lock, &res->granted, list) { + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + /* queue the BAST if not already */ + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + /* update the highest_blocked if needed */ + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + + list_for_each_entry(lock, &res->converting, list) { + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + + /* we can convert the lock */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type " + "%d => %d, node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + target->ml.type, + target->ml.convert_type, target->ml.node); + + target->ml.type = target->ml.convert_type; + target->ml.convert_type = LKM_IVMODE; + list_move_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + __dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +blocked: + if (list_empty(&res->blocked)) + goto leave; + target = list_entry(res->blocked.next, struct dlm_lock, list); + + list_for_each_entry(lock, &res->granted, list) { + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + list_for_each_entry(lock, &res->converting, list) { + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + /* we can grant the blocked lock (only + * possible if converting list empty) */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + target->ml.type, target->ml.node); + + /* target->ml.type is already correct */ + list_move_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + __dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +leave: + return; +} + +/* must have NO locks when calling this with res !=NULL * */ +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + if (res) { + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + } + wake_up(&dlm->dlm_thread_wq); +} + +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* don't shuffle secondary queues */ + if ((res->owner == dlm->node_num)) { + if (res->state & (DLM_LOCK_RES_MIGRATING | + DLM_LOCK_RES_BLOCK_DIRTY)) + return; + + if (list_empty(&res->dirty)) { + /* ref for dirty_list */ + dlm_lockres_get(res); + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + } + } + + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); +} + + +/* Launch the NM thread for the mounted volume */ +int dlm_launch_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "Starting dlm_thread...\n"); + + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + if (IS_ERR(dlm->dlm_thread_task)) { + mlog_errno(PTR_ERR(dlm->dlm_thread_task)); + dlm->dlm_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_thread_task) { + mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n"); + kthread_stop(dlm->dlm_thread_task); + dlm->dlm_thread_task = NULL; + } +} + +static int dlm_dirty_list_empty(struct dlm_ctxt *dlm) +{ + int empty; + + spin_lock(&dlm->spinlock); + empty = list_empty(&dlm->dirty_list); + spin_unlock(&dlm->spinlock); + + return empty; +} + +static void dlm_flush_asts(struct dlm_ctxt *dlm) +{ + int ret; + struct dlm_lock *lock; + struct dlm_lock_resource *res; + u8 hi; + + spin_lock(&dlm->ast_lock); + while (!list_empty(&dlm->pending_asts)) { + lock = list_entry(dlm->pending_asts.next, + struct dlm_lock, ast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.node); + + BUG_ON(!lock->ast_pending); + + /* remove from list (including ref) */ + list_del_init(&lock->ast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_do_remote_ast(dlm, res, lock); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_ast(dlm, res, lock); + + spin_lock(&dlm->ast_lock); + + /* possible that another ast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->ast_list)) { + mlog(0, "%s: res %.*s, AST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); + } else + lock->ast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. */ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + + while (!list_empty(&dlm->pending_basts)) { + lock = list_entry(dlm->pending_basts.next, + struct dlm_lock, bast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + + BUG_ON(!lock->bast_pending); + + /* get the highest blocked lock, and reset */ + spin_lock(&lock->spinlock); + BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE); + hi = lock->ml.highest_blocked; + lock->ml.highest_blocked = LKM_IVMODE; + spin_unlock(&lock->spinlock); + + /* remove from list (including ref) */ + list_del_init(&lock->bast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, " + "blocked %d, node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + hi, lock->ml.node); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_send_proxy_bast(dlm, res, lock, hi); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_bast(dlm, res, lock, hi); + + spin_lock(&dlm->ast_lock); + + /* possible that another bast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->bast_list)) { + mlog(0, "%s: res %.*s, BAST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); + } else + lock->bast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. */ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + wake_up(&dlm->ast_wq); + spin_unlock(&dlm->ast_lock); +} + + +#define DLM_THREAD_TIMEOUT_MS (4 * 1000) +#define DLM_THREAD_MAX_DIRTY 100 +#define DLM_THREAD_MAX_ASTS 10 + +static int dlm_thread(void *data) +{ + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + int n = DLM_THREAD_MAX_DIRTY; + + /* dlm_shutting_down is very point-in-time, but that + * doesn't matter as we'll just loop back around if we + * get false on the leading edge of a state + * transition. */ + dlm_run_purge_list(dlm, dlm_shutting_down(dlm)); + + /* We really don't want to hold dlm->spinlock while + * calling dlm_shuffle_lists on each lockres that + * needs to have its queues adjusted and AST/BASTs + * run. So let's pull each entry off the dirty_list + * and drop dlm->spinlock ASAP. Once off the list, + * res->spinlock needs to be taken again to protect + * the queues while calling dlm_shuffle_lists. */ + spin_lock(&dlm->spinlock); + while (!list_empty(&dlm->dirty_list)) { + int delay = 0; + res = list_entry(dlm->dirty_list.next, + struct dlm_lock_resource, dirty); + + /* peel a lockres off, remove it from the list, + * unset the dirty flag and drop the dlm lock */ + BUG_ON(!res); + dlm_lockres_get(res); + + spin_lock(&res->spinlock); + /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */ + list_del_init(&res->dirty); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); + + /* lockres can be re-dirtied/re-added to the + * dirty_list in this gap, but that is ok */ + + spin_lock(&dlm->ast_lock); + spin_lock(&res->spinlock); + if (res->owner != dlm->node_num) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d," + " dirty %d\n", dlm->name, + !!(res->state & DLM_LOCK_RES_IN_PROGRESS), + !!(res->state & DLM_LOCK_RES_MIGRATING), + !!(res->state & DLM_LOCK_RES_RECOVERING), + !!(res->state & DLM_LOCK_RES_DIRTY)); + } + BUG_ON(res->owner != dlm->node_num); + + /* it is now ok to move lockreses in these states + * to the dirty list, assuming that they will only be + * dirty for a short while. */ + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + if (res->state & (DLM_LOCK_RES_IN_PROGRESS | + DLM_LOCK_RES_RECOVERING)) { + /* move it to the tail and keep going */ + res->state &= ~DLM_LOCK_RES_DIRTY; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); + mlog(0, "%s: res %.*s, inprogress, delay list " + "shuffle, state %d\n", dlm->name, + res->lockname.len, res->lockname.name, + res->state); + delay = 1; + goto in_progress; + } + + /* at this point the lockres is not migrating/ + * recovering/in-progress. we have the lockres + * spinlock and do NOT have the dlm lock. + * safe to reserve/queue asts and run the lists. */ + + /* called while holding lockres lock */ + dlm_shuffle_lists(dlm, res); + res->state &= ~DLM_LOCK_RES_DIRTY; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); + + dlm_lockres_calc_usage(dlm, res); + +in_progress: + + spin_lock(&dlm->spinlock); + /* if the lock was in-progress, stick + * it on the back of the list */ + if (delay) { + spin_lock(&res->spinlock); + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + } + dlm_lockres_put(res); + + /* unlikely, but we may need to give time to + * other tasks */ + if (!--n) { + mlog(0, "%s: Throttling dlm thread\n", + dlm->name); + break; + } + } + + spin_unlock(&dlm->spinlock); + dlm_flush_asts(dlm); + + /* yield and continue right away if there is more work to do */ + if (!n) { + cond_resched(); + continue; + } + + wait_event_interruptible_timeout(dlm->dlm_thread_wq, + !dlm_dirty_list_empty(dlm) || + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM thread\n"); + return 0; +} diff --git a/kernel/fs/ocfs2/dlm/dlmunlock.c b/kernel/fs/ocfs2/dlm/dlmunlock.c new file mode 100644 index 000000000..2e3c9dbab --- /dev/null +++ b/kernel/fs/ocfs2/dlm/dlmunlock.c @@ -0,0 +1,698 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmunlock.c + * + * underlying calls for unlocking locks + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +#define DLM_UNLOCK_FREE_LOCK 0x00000001 +#define DLM_UNLOCK_CALL_AST 0x00000002 +#define DLM_UNLOCK_REMOVE_LOCK 0x00000004 +#define DLM_UNLOCK_REGRANT_LOCK 0x00000008 +#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010 + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); + +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner); + + +/* + * according to the spec: + * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf + * + * flags & LKM_CANCEL != 0: must be converting or blocked + * flags & LKM_CANCEL == 0: must be granted + * + * So to unlock a converting lock, you must first cancel the + * convert (passing LKM_CANCEL in flags), then call the unlock + * again (with no LKM_CANCEL in flags). + */ + + +/* + * locking: + * caller needs: none + * taken: res->spinlock and lock->spinlock taken and dropped + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + * all callers should have taken an extra ref on lock coming in + */ +static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast, + int master_node) +{ + enum dlm_status status; + int actions = 0; + int in_use; + u8 owner; + + mlog(0, "master_node = %d, valblk = %d\n", master_node, + flags & LKM_VALBLK); + + if (master_node) + BUG_ON(res->owner != dlm->node_num); + else + BUG_ON(res->owner == dlm->node_num); + + spin_lock(&dlm->ast_lock); + /* We want to be sure that we're not freeing a lock + * that still has AST's pending... */ + in_use = !list_empty(&lock->ast_list); + spin_unlock(&dlm->ast_lock); + if (in_use && !(flags & LKM_CANCEL)) { + mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " + "while waiting for an ast!", res->lockname.len, + res->lockname.name); + return DLM_BADPARAM; + } + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_IN_PROGRESS) { + if (master_node && !(flags & LKM_CANCEL)) { + mlog(ML_ERROR, "lockres in progress!\n"); + spin_unlock(&res->spinlock); + return DLM_FORWARD; + } + /* ok for this to sleep if not in a network handler */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + } + spin_lock(&lock->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) { + status = DLM_RECOVERING; + goto leave; + } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + status = DLM_MIGRATING; + goto leave; + } + + /* see above for what the spec says about + * LKM_CANCEL and the lock queue state */ + if (flags & LKM_CANCEL) + status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions); + else + status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); + + if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node)) + goto leave; + + /* By now this has been masked out of cancel requests. */ + if (flags & LKM_VALBLK) { + /* make the final update to the lvb */ + if (master_node) + memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); + else + flags |= LKM_PUT_LVB; /* let the send function + * handle it. */ + } + + if (!master_node) { + owner = res->owner; + /* drop locks and send message */ + if (flags & LKM_CANCEL) + lock->cancel_pending = 1; + else + lock->unlock_pending = 1; + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, + flags, owner); + spin_lock(&res->spinlock); + spin_lock(&lock->spinlock); + /* if the master told us the lock was already granted, + * let the ast handle all of these actions */ + if (status == DLM_CANCELGRANT) { + actions &= ~(DLM_UNLOCK_REMOVE_LOCK| + DLM_UNLOCK_REGRANT_LOCK| + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD || + status == DLM_NOLOCKMGR + ) { + /* must clear the actions because this unlock + * is about to be retried. cannot free or do + * any list manipulation. */ + mlog(0, "%s:%.*s: clearing actions, %s\n", + dlm->name, res->lockname.len, + res->lockname.name, + status==DLM_RECOVERING?"recovering": + (status==DLM_MIGRATING?"migrating": + (status == DLM_FORWARD ? "forward" : + "nolockmanager"))); + actions = 0; + } + if (flags & LKM_CANCEL) + lock->cancel_pending = 0; + else + lock->unlock_pending = 0; + + } + + /* get an extra ref on lock. if we are just switching + * lists here, we dont want the lock to go away. */ + dlm_lock_get(lock); + + if (actions & DLM_UNLOCK_REMOVE_LOCK) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_REGRANT_LOCK) { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + } + if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) { + mlog(0, "clearing convert_type at %smaster node\n", + master_node ? "" : "non-"); + lock->ml.convert_type = LKM_IVMODE; + } + + /* remove the extra ref on lock */ + dlm_lock_put(lock); + +leave: + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + if (!dlm_lock_on_list(&res->converting, lock)) + BUG_ON(lock->ml.convert_type != LKM_IVMODE); + else + BUG_ON(lock->ml.convert_type == LKM_IVMODE); + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* let the caller's final dlm_lock_put handle the actual kfree */ + if (actions & DLM_UNLOCK_FREE_LOCK) { + /* this should always be coupled with list removal */ + BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); + mlog(0, "lock %u:%llu should be gone now! refs=%d\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + atomic_read(&lock->lock_refs.refcount)-1); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_CALL_AST) + *call_ast = 1; + + /* if cancel or unlock succeeded, lvb work is done */ + if (status == DLM_NORMAL) + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); + + return status; +} + +void dlm_commit_pending_unlock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* leave DLM_LKSB_PUT_LVB on the lksb so any final + * update of the lvb will be sent to the new master */ + list_del_init(&lock->list); +} + +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + list_move_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; +} + + +static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1); +} + +static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0); +} + +/* + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + */ +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner) +{ + struct dlm_unlock_lock unlock; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog(0, "%.*s\n", res->lockname.len, res->lockname.name); + + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. just retry it, now as local */ + mlog(0, "%s:%.*s: this node became the master due to a " + "migration, re-evaluate now\n", dlm->name, + res->lockname.len, res->lockname.name); + return DLM_FORWARD; + } + + memset(&unlock, 0, sizeof(unlock)); + unlock.node_idx = dlm->node_num; + unlock.flags = cpu_to_be32(flags); + unlock.cookie = lock->ml.cookie; + unlock.namelen = res->lockname.len; + memcpy(unlock.name, res->lockname.name, unlock.namelen); + + vec[0].iov_len = sizeof(struct dlm_unlock_lock); + vec[0].iov_base = &unlock; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key, + vec, veclen, owner, &status); + if (tmpret >= 0) { + // successfully sent and received + if (status == DLM_FORWARD) + mlog(0, "master was in-progress. retry\n"); + ret = status; + } else { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner); + if (dlm_is_host_down(tmpret)) { + /* NOTE: this seems strange, but it is what we want. + * when the master goes down during a cancel or + * unlock, the recovery code completes the operation + * as if the master had not died, then passes the + * updated state to the recovery master. this thread + * just needs to finish out the operation and call + * the unlockast. */ + if (dlm_is_node_dead(dlm, owner)) + ret = DLM_NORMAL; + else + ret = DLM_NOLOCKMGR; + } else { + /* something bad. this will BUG in ocfs2 */ + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, + * return value from dlmunlock_master + */ +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + enum dlm_status status = DLM_NORMAL; + int found = 0, i; + struct dlm_lockstatus *lksb = NULL; + int ignore; + u32 flags; + struct list_head *queue; + + flags = be32_to_cpu(unlock->flags); + + if (flags & LKM_GET_LVB) { + mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n"); + return DLM_BADARGS; + } + + if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) { + mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL " + "request!\n"); + return DLM_BADARGS; + } + + if (unlock->namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length in unlock handler!\n"); + return DLM_IVBUFLEN; + } + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none"); + + res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen); + if (!res) { + /* We assume here that a no lock resource simply means + * it was migrated away and destroyed before the other + * node could detect it. */ + mlog(0, "returning DLM_FORWARD -- res no longer exists\n"); + status = DLM_FORWARD; + goto not_found; + } + + queue=&res->granted; + found = 0; + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_RECOVERING\n"); + status = DLM_RECOVERING; + goto leave; + } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MIGRATING\n"); + status = DLM_MIGRATING; + goto leave; + } + + if (res->owner != dlm->node_num) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_FORWARD -- not master\n"); + status = DLM_FORWARD; + goto leave; + } + + for (i=0; i<3; i++) { + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == unlock->cookie && + lock->ml.node == unlock->node_idx) { + dlm_lock_get(lock); + found = 1; + break; + } + } + if (found) + break; + /* scan granted -> converting -> blocked queues */ + queue++; + } + spin_unlock(&res->spinlock); + if (!found) { + status = DLM_IVLOCKID; + goto not_found; + } + + /* lock was found on queue */ + lksb = lock->lksb; + if (flags & (LKM_VALBLK|LKM_PUT_LVB) && + lock->ml.type != LKM_EXMODE) + flags &= ~(LKM_VALBLK|LKM_PUT_LVB); + + /* unlockast only called on originating node */ + if (flags & LKM_PUT_LVB) { + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN); + } + + /* if this is in-progress, propagate the DLM_FORWARD + * all the way back out */ + status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore); + if (status == DLM_FORWARD) + mlog(0, "lockres is in progress\n"); + + if (flags & LKM_PUT_LVB) + lksb->flags &= ~DLM_LKSB_PUT_LVB; + + dlm_lockres_calc_usage(dlm, res); + dlm_kick_thread(dlm, res); + +not_found: + if (!found) + mlog(ML_ERROR, "failed to find lock to unlock! " + "cookie=%u:%llu\n", + dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie))); + else + dlm_lock_put(lock); + +leave: + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + if (dlm_lock_on_list(&res->blocked, lock)) { + /* cancel this outright */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } else if (dlm_lock_on_list(&res->converting, lock)) { + /* cancel the request, put back on granted */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK | + DLM_UNLOCK_REGRANT_LOCK | + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (dlm_lock_on_list(&res->granted, lock)) { + /* too late, already granted. */ + status = DLM_CANCELGRANT; + *actions = DLM_UNLOCK_CALL_AST; + } else { + mlog(ML_ERROR, "lock to cancel is not on any list!\n"); + status = DLM_IVLOCKID; + *actions = 0; + } + return status; +} + +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + /* unlock request */ + if (!dlm_lock_on_list(&res->granted, lock)) { + status = DLM_DENIED; + dlm_error(status); + *actions = 0; + } else { + /* unlock granted lock */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_FREE_LOCK | + DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } + return status; +} + +/* there seems to be no point in doing this async + * since (even for the remote case) there is really + * no work to queue up... so just do it and fire the + * unlockast by hand when done... */ +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb, + int flags, dlm_astunlockfunc_t *unlockast, void *data) +{ + enum dlm_status status; + struct dlm_lock_resource *res; + struct dlm_lock *lock = NULL; + int call_ast, is_master; + + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) { + mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n"); + flags &= ~LKM_VALBLK; + } + + if (!lksb->lockid || !lksb->lockid->lockres) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + lock = lksb->lockid; + BUG_ON(!lock); + dlm_lock_get(lock); + + res = lock->lockres; + BUG_ON(!res); + dlm_lockres_get(res); +retry: + call_ast = 0; + /* need to retry up here because owner may have changed */ + mlog(0, "lock=%p res=%p\n", lock, res); + + spin_lock(&res->spinlock); + is_master = (res->owner == dlm->node_num); + if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE) + flags &= ~LKM_VALBLK; + spin_unlock(&res->spinlock); + + if (is_master) { + status = dlmunlock_master(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_master: returned %d, " + "call_ast is %d\n", status, call_ast); + } else { + status = dlmunlock_remote(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_remote: returned %d, " + "call_ast is %d\n", status, call_ast); + } + + if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD || + status == DLM_NOLOCKMGR) { + + /* We want to go away for a tiny bit to allow recovery + * / migration to complete on this resource. I don't + * know of any wait queue we could sleep on as this + * may be happening on another node. Perhaps the + * proper solution is to queue up requests on the + * other end? */ + + /* do we want to yield(); ?? */ + msleep(50); + + mlog(0, "retrying unlock due to pending recovery/" + "migration/in-progress/reconnect\n"); + goto retry; + } + + if (call_ast) { + mlog(0, "calling unlockast(%p, %d)\n", data, status); + if (is_master) { + /* it is possible that there is one last bast + * pending. make sure it is flushed, then + * call the unlockast. + * not an issue if this is a mastered remotely, + * since this lock has been removed from the + * lockres queues and cannot be found. */ + dlm_kick_thread(dlm, NULL); + wait_event(dlm->ast_wq, + dlm_lock_basts_flushed(dlm, lock)); + } + (*unlockast)(data, status); + } + + if (status == DLM_CANCELGRANT) + status = DLM_NORMAL; + + if (status == DLM_NORMAL) { + mlog(0, "kicking the thread\n"); + dlm_kick_thread(dlm, res); + } else + dlm_error(status); + + dlm_lockres_calc_usage(dlm, res); + dlm_lockres_put(res); + dlm_lock_put(lock); + + mlog(0, "returning status=%d!\n", status); + return status; +} +EXPORT_SYMBOL_GPL(dlmunlock); + diff --git a/kernel/fs/ocfs2/dlmfs/Makefile b/kernel/fs/ocfs2/dlmfs/Makefile new file mode 100644 index 000000000..eed3db8c5 --- /dev/null +++ b/kernel/fs/ocfs2/dlmfs/Makefile @@ -0,0 +1,5 @@ +ccflags-y := -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o + +ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/kernel/fs/ocfs2/dlmfs/dlmfs.c b/kernel/fs/ocfs2/dlmfs/dlmfs.c new file mode 100644 index 000000000..b5cf27dcb --- /dev/null +++ b/kernel/fs/ocfs2/dlmfs/dlmfs.c @@ -0,0 +1,690 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfs.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. This file handles the virtual file system + * used for communication with userspace. Credit should go to ramfs, + * which was a template for the fs side of this module. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* Simple VFS hooks based on: */ +/* + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "stackglue.h" +#include "userdlm.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + + +static const struct super_operations dlmfs_ops; +static const struct file_operations dlmfs_file_operations; +static const struct inode_operations dlmfs_dir_inode_operations; +static const struct inode_operations dlmfs_root_inode_operations; +static const struct inode_operations dlmfs_file_inode_operations; +static struct kmem_cache *dlmfs_inode_cache; + +struct workqueue_struct *user_dlm_worker; + + + +/* + * These are the ABI capabilities of dlmfs. + * + * Over time, dlmfs has added some features that were not part of the + * initial ABI. Unfortunately, some of these features are not detectable + * via standard usage. For example, Linux's default poll always returns + * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs + * added poll support. Instead, we provide this list of new capabilities. + * + * Capabilities is a read-only attribute. We do it as a module parameter + * so we can discover it whether dlmfs is built in, loaded, or even not + * loaded. + * + * The ABI features are local to this machine's dlmfs mount. This is + * distinct from the locking protocol, which is concerned with inter-node + * interaction. + * + * Capabilities: + * - bast : POLLIN against the file descriptor of a held lock + * signifies a bast fired on the lock. + */ +#define DLMFS_CAPABILITIES "bast stackglue" +static int param_set_dlmfs_capabilities(const char *val, + struct kernel_param *kp) +{ + printk(KERN_ERR "%s: readonly parameter\n", kp->name); + return -EINVAL; +} +static int param_get_dlmfs_capabilities(char *buffer, + struct kernel_param *kp) +{ + return strlcpy(buffer, DLMFS_CAPABILITIES, + strlen(DLMFS_CAPABILITIES) + 1); +} +module_param_call(capabilities, param_set_dlmfs_capabilities, + param_get_dlmfs_capabilities, NULL, 0444); +MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); + + +/* + * decodes a set of open flags into a valid lock level and a set of flags. + * returns < 0 if we have invalid flags + * flags which mean something to us: + * O_RDONLY -> PRMODE level + * O_WRONLY -> EXMODE level + * + * O_NONBLOCK -> NOQUEUE + */ +static int dlmfs_decode_open_flags(int open_flags, + int *level, + int *flags) +{ + if (open_flags & (O_WRONLY|O_RDWR)) + *level = DLM_LOCK_EX; + else + *level = DLM_LOCK_PR; + + *flags = 0; + if (open_flags & O_NONBLOCK) + *flags |= DLM_LKF_NOQUEUE; + + return 0; +} + +static int dlmfs_file_open(struct inode *inode, + struct file *file) +{ + int status, level, flags; + struct dlmfs_filp_private *fp = NULL; + struct dlmfs_inode_private *ip; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, + file->f_flags); + + status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); + if (status < 0) + goto bail; + + /* We don't want to honor O_APPEND at read/write time as it + * doesn't make sense for LVB writes. */ + file->f_flags &= ~O_APPEND; + + fp = kmalloc(sizeof(*fp), GFP_NOFS); + if (!fp) { + status = -ENOMEM; + goto bail; + } + fp->fp_lock_level = level; + + ip = DLMFS_I(inode); + + status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags); + if (status < 0) { + /* this is a strange error to return here but I want + * to be able userspace to be able to distinguish a + * valid lock request from one that simply couldn't be + * granted. */ + if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN) + status = -ETXTBSY; + kfree(fp); + goto bail; + } + + file->private_data = fp; +bail: + return status; +} + +static int dlmfs_file_release(struct inode *inode, + struct file *file) +{ + int level, status; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + struct dlmfs_filp_private *fp = file->private_data; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "close called on inode %lu\n", inode->i_ino); + + status = 0; + if (fp) { + level = fp->fp_lock_level; + if (level != DLM_LOCK_IV) + user_dlm_cluster_unlock(&ip->ip_lockres, level); + + kfree(fp); + file->private_data = NULL; + } + + return 0; +} + +/* + * We do ->setattr() just to override size changes. Our size is the size + * of the LVB and nothing else. + */ +static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = d_inode(dentry); + + attr->ia_valid &= ~ATTR_SIZE; + error = inode_change_ok(inode, attr); + if (error) + return error; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) +{ + int event = 0; + struct inode *inode = file_inode(file); + struct dlmfs_inode_private *ip = DLMFS_I(inode); + + poll_wait(file, &ip->ip_lockres.l_event, wait); + + spin_lock(&ip->ip_lockres.l_lock); + if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED) + event = POLLIN | POLLRDNORM; + spin_unlock(&ip->ip_lockres.l_lock); + + return event; +} + +static ssize_t dlmfs_file_read(struct file *filp, + char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t readlen, got; + char *lvb_buf; + struct inode *inode = file_inode(filp); + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + /* don't read past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + readlen = i_size_read(inode) - *ppos; + else + readlen = count; + + lvb_buf = kmalloc(readlen, GFP_NOFS); + if (!lvb_buf) + return -ENOMEM; + + got = user_dlm_read_lvb(inode, lvb_buf, readlen); + if (got) { + BUG_ON(got != readlen); + bytes_left = __copy_to_user(buf, lvb_buf, readlen); + readlen -= bytes_left; + } else + readlen = 0; + + kfree(lvb_buf); + + *ppos = *ppos + readlen; + + mlog(0, "read %zd bytes\n", readlen); + return readlen; +} + +static ssize_t dlmfs_file_write(struct file *filp, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t writelen; + char *lvb_buf; + struct inode *inode = file_inode(filp); + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return -ENOSPC; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* don't write past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + writelen = i_size_read(inode) - *ppos; + else + writelen = count - *ppos; + + lvb_buf = kmalloc(writelen, GFP_NOFS); + if (!lvb_buf) + return -ENOMEM; + + bytes_left = copy_from_user(lvb_buf, buf, writelen); + writelen -= bytes_left; + if (writelen) + user_dlm_write_lvb(inode, lvb_buf, writelen); + + kfree(lvb_buf); + + *ppos = *ppos + writelen; + mlog(0, "wrote %zd bytes\n", writelen); + return writelen; +} + +static void dlmfs_init_once(void *foo) +{ + struct dlmfs_inode_private *ip = + (struct dlmfs_inode_private *) foo; + + ip->ip_conn = NULL; + ip->ip_parent = NULL; + + inode_init_once(&ip->ip_vfs_inode); +} + +static struct inode *dlmfs_alloc_inode(struct super_block *sb) +{ + struct dlmfs_inode_private *ip; + + ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); + if (!ip) + return NULL; + + return &ip->ip_vfs_inode; +} + +static void dlmfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); +} + +static void dlmfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, dlmfs_i_callback); +} + +static void dlmfs_evict_inode(struct inode *inode) +{ + int status; + struct dlmfs_inode_private *ip; + + clear_inode(inode); + + mlog(0, "inode %lu\n", inode->i_ino); + + ip = DLMFS_I(inode); + + if (S_ISREG(inode->i_mode)) { + status = user_dlm_destroy_lock(&ip->ip_lockres); + if (status < 0) + mlog_errno(status); + iput(ip->ip_parent); + goto clear_fields; + } + + mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn); + /* we must be a directory. If required, lets unregister the + * dlm context now. */ + if (ip->ip_conn) + user_dlm_unregister(ip->ip_conn); +clear_fields: + ip->ip_parent = NULL; + ip->ip_conn = NULL; +} + +static struct inode *dlmfs_get_root_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + umode_t mode = S_IFDIR | 0755; + + if (inode) { + inode->i_ino = get_next_ino(); + inode_init_owner(inode, NULL, mode); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inc_nlink(inode); + + inode->i_fop = &simple_dir_operations; + inode->i_op = &dlmfs_root_inode_operations; + } + + return inode; +} + +static struct inode *dlmfs_get_inode(struct inode *parent, + struct dentry *dentry, + umode_t mode) +{ + struct super_block *sb = parent->i_sb; + struct inode * inode = new_inode(sb); + struct dlmfs_inode_private *ip; + + if (!inode) + return NULL; + + inode->i_ino = get_next_ino(); + inode_init_owner(inode, parent, mode); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ip = DLMFS_I(inode); + ip->ip_conn = DLMFS_I(parent)->ip_conn; + + switch (mode & S_IFMT) { + default: + /* for now we don't support anything other than + * directories and regular files. */ + BUG(); + break; + case S_IFREG: + inode->i_op = &dlmfs_file_inode_operations; + inode->i_fop = &dlmfs_file_operations; + + i_size_write(inode, DLM_LVB_LEN); + + user_dlm_lock_res_init(&ip->ip_lockres, dentry); + + /* released at clear_inode time, this insures that we + * get to drop the dlm reference on each lock *before* + * we call the unregister code for releasing parent + * directories. */ + ip->ip_parent = igrab(parent); + BUG_ON(!ip->ip_parent); + break; + case S_IFDIR: + inode->i_op = &dlmfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == + * 2 (for "." entry) */ + inc_nlink(inode); + break; + } + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. + */ +/* SMP-safe */ +static int dlmfs_mkdir(struct inode * dir, + struct dentry * dentry, + umode_t mode) +{ + int status; + struct inode *inode = NULL; + struct qstr *domain = &dentry->d_name; + struct dlmfs_inode_private *ip; + struct ocfs2_cluster_connection *conn; + + mlog(0, "mkdir %.*s\n", domain->len, domain->name); + + /* verify that we have a proper domain */ + if (domain->len >= GROUP_NAME_MAX) { + status = -EINVAL; + mlog(ML_ERROR, "invalid domain name for directory.\n"); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ip = DLMFS_I(inode); + + conn = user_dlm_register(domain); + if (IS_ERR(conn)) { + status = PTR_ERR(conn); + mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", + status, domain->len, domain->name); + goto bail; + } + ip->ip_conn = conn; + + inc_nlink(dir); + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + + status = 0; +bail: + if (status < 0) + iput(inode); + return status; +} + +static int dlmfs_create(struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool excl) +{ + int status = 0; + struct inode *inode; + struct qstr *name = &dentry->d_name; + + mlog(0, "create %.*s\n", name->len, name->name); + + /* verify name is valid and doesn't contain any dlm reserved + * characters */ + if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || + name->name[0] == '$') { + status = -EINVAL; + mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, + name->name); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ +bail: + return status; +} + +static int dlmfs_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + struct inode *inode = d_inode(dentry); + + mlog(0, "unlink inode %lu\n", inode->i_ino); + + /* if there are no current holders, or none that are waiting + * to acquire a lock, this basically destroys our lockres. */ + status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); + if (status < 0) { + mlog(ML_ERROR, "unlink %pd, error %d from destroy\n", + dentry, status); + goto bail; + } + status = simple_unlink(dir, dentry); +bail: + return status; +} + +static int dlmfs_fill_super(struct super_block * sb, + void * data, + int silent) +{ + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = DLMFS_MAGIC; + sb->s_op = &dlmfs_ops; + sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); + if (!sb->s_root) + return -ENOMEM; + return 0; +} + +static const struct file_operations dlmfs_file_operations = { + .open = dlmfs_file_open, + .release = dlmfs_file_release, + .poll = dlmfs_file_poll, + .read = dlmfs_file_read, + .write = dlmfs_file_write, + .llseek = default_llseek, +}; + +static const struct inode_operations dlmfs_dir_inode_operations = { + .create = dlmfs_create, + .lookup = simple_lookup, + .unlink = dlmfs_unlink, +}; + +/* this way we can restrict mkdir to only the toplevel of the fs. */ +static const struct inode_operations dlmfs_root_inode_operations = { + .lookup = simple_lookup, + .mkdir = dlmfs_mkdir, + .rmdir = simple_rmdir, +}; + +static const struct super_operations dlmfs_ops = { + .statfs = simple_statfs, + .alloc_inode = dlmfs_alloc_inode, + .destroy_inode = dlmfs_destroy_inode, + .evict_inode = dlmfs_evict_inode, + .drop_inode = generic_delete_inode, +}; + +static const struct inode_operations dlmfs_file_inode_operations = { + .getattr = simple_getattr, + .setattr = dlmfs_file_setattr, +}; + +static struct dentry *dlmfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, dlmfs_fill_super); +} + +static struct file_system_type dlmfs_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2_dlmfs", + .mount = dlmfs_mount, + .kill_sb = kill_litter_super, +}; +MODULE_ALIAS_FS("ocfs2_dlmfs"); + +static int __init init_dlmfs_fs(void) +{ + int status; + int cleanup_inode = 0, cleanup_worker = 0; + + dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", + sizeof(struct dlmfs_inode_private), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + dlmfs_init_once); + if (!dlmfs_inode_cache) { + status = -ENOMEM; + goto bail; + } + cleanup_inode = 1; + + user_dlm_worker = create_singlethread_workqueue("user_dlm"); + if (!user_dlm_worker) { + status = -ENOMEM; + goto bail; + } + cleanup_worker = 1; + + user_dlm_set_locking_protocol(); + status = register_filesystem(&dlmfs_fs_type); +bail: + if (status) { + if (cleanup_inode) + kmem_cache_destroy(dlmfs_inode_cache); + if (cleanup_worker) + destroy_workqueue(user_dlm_worker); + } else + printk("OCFS2 User DLM kernel interface loaded\n"); + return status; +} + +static void __exit exit_dlmfs_fs(void) +{ + unregister_filesystem(&dlmfs_fs_type); + + flush_workqueue(user_dlm_worker); + destroy_workqueue(user_dlm_worker); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(dlmfs_inode_cache); + +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); + +module_init(init_dlmfs_fs) +module_exit(exit_dlmfs_fs) diff --git a/kernel/fs/ocfs2/dlmfs/userdlm.c b/kernel/fs/ocfs2/dlmfs/userdlm.c new file mode 100644 index 000000000..0499e3fb7 --- /dev/null +++ b/kernel/fs/ocfs2/dlmfs/userdlm.c @@ -0,0 +1,688 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. + * + * Many of the functions here are pared down versions of dlmglue.c + * functions. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include + +#include +#include +#include +#include + +#include "ocfs2_lockingver.h" +#include "stackglue.h" +#include "userdlm.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + + +static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) +{ + return container_of(lksb, struct user_lock_res, l_lksb); +} + +static inline int user_check_wait_flag(struct user_lock_res *lockres, + int flag) +{ + int ret; + + spin_lock(&lockres->l_lock); + ret = lockres->l_flags & flag; + spin_unlock(&lockres->l_lock); + + return ret; +} + +static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BUSY)); +} + +static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); +} + +/* I heart container_of... */ +static inline struct ocfs2_cluster_connection * +cluster_connection_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return ip->ip_conn; +} + +static struct inode * +user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return &ip->ip_vfs_inode; +} + +static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) +{ + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); +} + +#define user_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error %d while calling %s on " \ + "resource %.*s\n", _stat, _func, \ + _lockres->l_namelen, _lockres->l_name); \ +} while (0) + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. */ +static inline int user_highest_compat_lock_level(int level) +{ + int new_level = DLM_LOCK_EX; + + if (level == DLM_LOCK_EX) + new_level = DLM_LOCK_NL; + else if (level == DLM_LOCK_PR) + new_level = DLM_LOCK_PR; + return new_level; +} + +static void user_ast(struct ocfs2_dlm_lksb *lksb) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + int status; + + mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n", + lockres->l_namelen, lockres->l_name, lockres->l_level, + lockres->l_requested); + + spin_lock(&lockres->l_lock); + + status = ocfs2_dlm_lock_status(&lockres->l_lksb); + if (status) { + mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", + status, lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + return; + } + + mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV, + "Lockres %.*s, requested ivmode. flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + /* we're downconverting. */ + if (lockres->l_requested < lockres->l_level) { + if (lockres->l_requested <= + user_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = DLM_LOCK_NL; + lockres->l_flags &= ~USER_LOCK_BLOCKED; + } + } + + lockres->l_level = lockres->l_requested; + lockres->l_requested = DLM_LOCK_IV; + lockres->l_flags |= USER_LOCK_ATTACHED; + lockres->l_flags &= ~USER_LOCK_BUSY; + + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + if (!igrab(inode)) + BUG(); +} + +static void user_dlm_unblock_lock(struct work_struct *work); + +static void __user_dlm_queue_lockres(struct user_lock_res *lockres) +{ + if (!(lockres->l_flags & USER_LOCK_QUEUED)) { + user_dlm_grab_inode_ref(lockres); + + INIT_WORK(&lockres->l_work, user_dlm_unblock_lock); + + queue_work(user_dlm_worker, &lockres->l_work); + lockres->l_flags |= USER_LOCK_QUEUED; + } +} + +static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) +{ + int queue = 0; + + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) + return; + + switch (lockres->l_blocking) { + case DLM_LOCK_EX: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + queue = 1; + break; + case DLM_LOCK_PR: + if (!lockres->l_ex_holders) + queue = 1; + break; + default: + BUG(); + } + + if (queue) + __user_dlm_queue_lockres(lockres); +} + +static void user_bast(struct ocfs2_dlm_lksb *lksb, int level) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + + mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n", + lockres->l_namelen, lockres->l_name, level, lockres->l_level); + + spin_lock(&lockres->l_lock); + lockres->l_flags |= USER_LOCK_BLOCKED; + if (level > lockres->l_blocking) + lockres->l_blocking = level; + + __user_dlm_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + + mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + if (status) + mlog(ML_ERROR, "dlm returns status %d\n", status); + + spin_lock(&lockres->l_lock); + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { + lockres->l_level = DLM_LOCK_IV; + } else if (status == DLM_CANCELGRANT) { + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + lockres->l_requested = DLM_LOCK_IV; /* cancel an + * upconvert + * request. */ + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + /* we want the unblock thread to look at it again + * now. */ + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); + } + + lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +/* + * This is the userdlmfs locking protocol version. + * + * See fs/ocfs2/dlmglue.c for more details on locking versions. + */ +static struct ocfs2_locking_protocol user_dlm_lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = user_ast, + .lp_blocking_ast = user_bast, + .lp_unlock_ast = user_unlock_ast, +}; + +static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + iput(inode); +} + +static void user_dlm_unblock_lock(struct work_struct *work) +{ + int new_level, status; + struct user_lock_res *lockres = + container_of(work, struct user_lock_res, l_work); + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); + + spin_lock(&lockres->l_lock); + + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ + lockres->l_flags &= ~USER_LOCK_QUEUED; + + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. */ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_BUSY) { + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + lockres->l_flags |= USER_LOCK_IN_CANCEL; + spin_unlock(&lockres->l_lock); + + status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, + DLM_LKF_CANCEL); + if (status) + user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); + goto drop_ref; + } + + /* If there are still incompat holders, we can exit safely + * without worrying about re-queueing this lock as that will + * happen on the last call to user_cluster_unlock. */ + if ((lockres->l_blocking == DLM_LOCK_EX) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + spin_unlock(&lockres->l_lock); + mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n", + lockres->l_namelen, lockres->l_name, + lockres->l_ex_holders, lockres->l_ro_holders); + goto drop_ref; + } + + if ((lockres->l_blocking == DLM_LOCK_PR) + && lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n", + lockres->l_namelen, lockres->l_name, + lockres->l_ex_holders); + goto drop_ref; + } + + /* yay, we can downconvert now. */ + new_level = user_highest_compat_lock_level(lockres->l_blocking); + lockres->l_requested = new_level; + lockres->l_flags |= USER_LOCK_BUSY; + mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n", + lockres->l_namelen, lockres->l_name, lockres->l_level, new_level); + spin_unlock(&lockres->l_lock); + + /* need lock downconvert request now... */ + status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb, + DLM_LKF_CONVERT|DLM_LKF_VALBLK, + lockres->l_name, + lockres->l_namelen); + if (status) { + user_log_dlm_error("ocfs2_dlm_lock", status, lockres); + user_recover_from_dlm_error(lockres); + } + +drop_ref: + user_dlm_drop_inode_ref(lockres); +} + +static inline void user_dlm_inc_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case DLM_LOCK_EX: + lockres->l_ex_holders++; + break; + case DLM_LOCK_PR: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. */ +static inline int +user_may_continue_on_blocked_lock(struct user_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); + + return wanted <= user_highest_compat_lock_level(lockres->l_blocking); +} + +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags) +{ + int status, local_flags; + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + if (level != DLM_LOCK_EX && + level != DLM_LOCK_PR) { + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); + status = -EINVAL; + goto bail; + } + + mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n", + lockres->l_namelen, lockres->l_name, level, lkm_flags); + +again: + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + spin_lock(&lockres->l_lock); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if ((lockres->l_flags & USER_LOCK_BUSY) && + (level > lockres->l_level)) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + goto again; + } + + if ((lockres->l_flags & USER_LOCK_BLOCKED) && + (!user_may_continue_on_blocked_lock(lockres, level))) { + /* is the lock is currently blocked on behalf of + * another node */ + spin_unlock(&lockres->l_lock); + + user_wait_on_blocked_lock(lockres); + goto again; + } + + if (level > lockres->l_level) { + local_flags = lkm_flags | DLM_LKF_VALBLK; + if (lockres->l_level != DLM_LOCK_IV) + local_flags |= DLM_LKF_CONVERT; + + lockres->l_requested = level; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + BUG_ON(level == DLM_LOCK_IV); + BUG_ON(level == DLM_LOCK_NL); + + /* call dlm_lock to upgrade lock now */ + status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb, + local_flags, lockres->l_name, + lockres->l_namelen); + if (status) { + if ((lkm_flags & DLM_LKF_NOQUEUE) && + (status != -EAGAIN)) + user_log_dlm_error("ocfs2_dlm_lock", + status, lockres); + user_recover_from_dlm_error(lockres); + goto bail; + } + + user_wait_on_busy_lock(lockres); + goto again; + } + + user_dlm_inc_holders(lockres, level); + spin_unlock(&lockres->l_lock); + + status = 0; +bail: + return status; +} + +static inline void user_dlm_dec_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case DLM_LOCK_EX: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case DLM_LOCK_PR: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level) +{ + if (level != DLM_LOCK_EX && + level != DLM_LOCK_PR) { + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); + return; + } + + spin_lock(&lockres->l_lock); + user_dlm_dec_holders(lockres, level); + __user_dlm_cond_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); +} + +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < DLM_LOCK_EX); + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + memcpy(lvb, val, len); + + spin_unlock(&lockres->l_lock); +} + +ssize_t user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb; + ssize_t ret = len; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < DLM_LOCK_PR); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + memcpy(val, lvb, len); + } else + ret = 0; + + spin_unlock(&lockres->l_lock); + return ret; +} + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry) +{ + memset(lockres, 0, sizeof(*lockres)); + + spin_lock_init(&lockres->l_lock); + init_waitqueue_head(&lockres->l_event); + lockres->l_level = DLM_LOCK_IV; + lockres->l_requested = DLM_LOCK_IV; + lockres->l_blocking = DLM_LOCK_IV; + + /* should have been checked before getting here. */ + BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); + + memcpy(lockres->l_name, + dentry->d_name.name, + dentry->d_name.len); + lockres->l_namelen = dentry->d_name.len; +} + +int user_dlm_destroy_lock(struct user_lock_res *lockres) +{ + int status = -EBUSY; + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); + + spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + + while (lockres->l_flags & USER_LOCK_BUSY) { + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + + spin_lock(&lockres->l_lock); + } + + if (lockres->l_ro_holders || lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + goto bail; + } + + status = 0; + if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + spin_unlock(&lockres->l_lock); + goto bail; + } + + lockres->l_flags &= ~USER_LOCK_ATTACHED; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); + if (status) { + user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); + goto bail; + } + + user_wait_on_busy_lock(lockres); + + status = 0; +bail: + return status; +} + +static void user_dlm_recovery_handler_noop(int node_num, + void *recovery_data) +{ + /* We ignore recovery events */ + return; +} + +void user_dlm_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); +} + +struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name) +{ + int rc; + struct ocfs2_cluster_connection *conn; + + rc = ocfs2_cluster_connect_agnostic(name->name, name->len, + &user_dlm_lproto, + user_dlm_recovery_handler_noop, + NULL, &conn); + if (rc) + mlog_errno(rc); + + return rc ? ERR_PTR(rc) : conn; +} + +void user_dlm_unregister(struct ocfs2_cluster_connection *conn) +{ + ocfs2_cluster_disconnect(conn, 0); +} diff --git a/kernel/fs/ocfs2/dlmfs/userdlm.h b/kernel/fs/ocfs2/dlmfs/userdlm.h new file mode 100644 index 000000000..3b42d7953 --- /dev/null +++ b/kernel/fs/ocfs2/dlmfs/userdlm.h @@ -0,0 +1,113 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.h + * + * Userspace dlm defines + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef USERDLM_H +#define USERDLM_H + +#include +#include +#include +#include + +/* user_lock_res->l_flags flags. */ +#define USER_LOCK_ATTACHED (0x00000001) /* we have initialized + * the lvb */ +#define USER_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently + * destroying this + * lock. */ +#define USER_LOCK_QUEUED (0x00000010) /* lock is on the + * workqueue */ +#define USER_LOCK_IN_CANCEL (0x00000020) + +struct user_lock_res { + spinlock_t l_lock; + + int l_flags; + +#define USER_DLM_LOCK_ID_MAX_LEN 32 + char l_name[USER_DLM_LOCK_ID_MAX_LEN]; + int l_namelen; + int l_level; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + struct ocfs2_dlm_lksb l_lksb; + + int l_requested; + int l_blocking; + + wait_queue_head_t l_event; + + struct work_struct l_work; +}; + +extern struct workqueue_struct *user_dlm_worker; + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry); +int user_dlm_destroy_lock(struct user_lock_res *lockres); +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags); +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level); +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len); +ssize_t user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len); +struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name); +void user_dlm_unregister(struct ocfs2_cluster_connection *conn); +void user_dlm_set_locking_protocol(void); + +struct dlmfs_inode_private { + struct ocfs2_cluster_connection *ip_conn; + + struct user_lock_res ip_lockres; /* unused for directories. */ + struct inode *ip_parent; + + struct inode ip_vfs_inode; +}; + +static inline struct dlmfs_inode_private * +DLMFS_I(struct inode *inode) +{ + return container_of(inode, + struct dlmfs_inode_private, + ip_vfs_inode); +} + +struct dlmfs_filp_private { + int fp_lock_level; +}; + +#define DLMFS_MAGIC 0x76a9f425 + +#endif /* USERDLM_H */ diff --git a/kernel/fs/ocfs2/dlmglue.c b/kernel/fs/ocfs2/dlmglue.c new file mode 100644 index 000000000..8b23aa2f5 --- /dev/null +++ b/kernel/fs/ocfs2/dlmglue.c @@ -0,0 +1,4106 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.c + * + * Code which implements an OCFS2 specific interface to our DLM. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_DLM_GLUE +#include + +#include "ocfs2.h" +#include "ocfs2_lockingver.h" + +#include "alloc.h" +#include "dcache.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "stackglue.h" +#include "slot_map.h" +#include "super.h" +#include "uptodate.h" +#include "quota.h" +#include "refcounttree.h" + +#include "buffer_head_io.h" + +struct ocfs2_mask_waiter { + struct list_head mw_item; + int mw_status; + struct completion mw_complete; + unsigned long mw_mask; + unsigned long mw_goal; +#ifdef CONFIG_OCFS2_FS_STATS + ktime_t mw_lock_start; +#endif +}; + +static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres); + +/* + * Return value from ->downconvert_worker functions. + * + * These control the precise actions of ocfs2_unblock_lock() + * and ocfs2_process_blocked_lock() + * + */ +enum ocfs2_unblock_action { + UNBLOCK_CONTINUE = 0, /* Continue downconvert */ + UNBLOCK_CONTINUE_POST = 1, /* Continue downconvert, fire + * ->post_unlock callback */ + UNBLOCK_STOP_POST = 2, /* Do not downconvert, fire + * ->post_unlock() callback. */ +}; + +struct ocfs2_unblock_ctl { + int requeue; + enum ocfs2_unblock_action unblock_action; +}; + +/* Lockdep class keys */ +struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; + +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres); + +static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); + +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) + +/* This aids in debugging situations where a bad LVB might be involved. */ +static void ocfs2_dump_meta_lvb_info(u64 level, + const char *function, + unsigned int line, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + mlog(level, "LVB information for %s (called from %s:%u):\n", + lockres->l_name, function, line); + mlog(level, "version: %u, clusters: %u, generation: 0x%x\n", + lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters), + be32_to_cpu(lvb->lvb_igeneration)); + mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n", + (unsigned long long)be64_to_cpu(lvb->lvb_isize), + be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid), + be16_to_cpu(lvb->lvb_imode)); + mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, " + "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink), + (long long)be64_to_cpu(lvb->lvb_iatime_packed), + (long long)be64_to_cpu(lvb->lvb_ictime_packed), + (long long)be64_to_cpu(lvb->lvb_imtime_packed), + be32_to_cpu(lvb->lvb_iattr)); +} + + +/* + * OCFS2 Lock Resource Operations + * + * These fine tune the behavior of the generic dlmglue locking infrastructure. + * + * The most basic of lock types can point ->l_priv to their respective + * struct ocfs2_super and allow the default actions to manage things. + * + * Right now, each lock type also needs to implement an init function, + * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres() + * should be called when the lock is no longer needed (i.e., object + * destruction time). + */ +struct ocfs2_lock_res_ops { + /* + * Translate an ocfs2_lock_res * into an ocfs2_super *. Define + * this callback if ->l_priv is not an ocfs2_super pointer + */ + struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); + + /* + * Optionally called in the downconvert thread after a + * successful downconvert. The lockres will not be referenced + * after this callback is called, so it is safe to free + * memory, etc. + * + * The exact semantics of when this is called are controlled + * by ->downconvert_worker() + */ + void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *); + + /* + * Allow a lock type to add checks to determine whether it is + * safe to downconvert a lock. Return 0 to re-queue the + * downconvert at a later time, nonzero to continue. + * + * For most locks, the default checks that there are no + * incompatible holders are sufficient. + * + * Called with the lockres spinlock held. + */ + int (*check_downconvert)(struct ocfs2_lock_res *, int); + + /* + * Allows a lock type to populate the lock value block. This + * is called on downconvert, and when we drop a lock. + * + * Locks that want to use this should set LOCK_TYPE_USES_LVB + * in the flags field. + * + * Called with the lockres spinlock held. + */ + void (*set_lvb)(struct ocfs2_lock_res *); + + /* + * Called from the downconvert thread when it is determined + * that a lock will be downconverted. This is called without + * any locks held so the function can do work that might + * schedule (syncing out data, etc). + * + * This should return any one of the ocfs2_unblock_action + * values, depending on what it wants the thread to do. + */ + int (*downconvert_worker)(struct ocfs2_lock_res *, int); + + /* + * LOCK_TYPE_* flags which describe the specific requirements + * of a lock type. Descriptions of each individual flag follow. + */ + int flags; +}; + +/* + * Some locks want to "refresh" potentially stale data when a + * meaningful (PRMODE or EXMODE) lock level is first obtained. If this + * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the + * individual lockres l_flags member from the ast function. It is + * expected that the locking wrapper will clear the + * OCFS2_LOCK_NEEDS_REFRESH flag when done. + */ +#define LOCK_TYPE_REQUIRES_REFRESH 0x1 + +/* + * Indicate that a lock type makes use of the lock value block. The + * ->set_lvb lock type callback must be defined. + */ +#define LOCK_TYPE_USES_LVB 0x2 + +static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { + .get_osb = ocfs2_get_inode_osb, + .check_downconvert = ocfs2_check_meta_downconvert, + .set_lvb = ocfs2_set_meta_lvb, + .downconvert_worker = ocfs2_data_convert_worker, + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_super_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH, +}; + +static struct ocfs2_lock_res_ops ocfs2_rename_lops = { + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { + .get_osb = ocfs2_get_dentry_osb, + .post_unlock = ocfs2_dentry_post_unlock, + .downconvert_worker = ocfs2_dentry_convert_worker, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_flock_lops = { + .get_osb = ocfs2_get_file_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { + .set_lvb = ocfs2_set_qinfo_lvb, + .get_osb = ocfs2_get_qinfo_osb, + .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { + .check_downconvert = ocfs2_check_refcount_downconvert, + .downconvert_worker = ocfs2_refcount_convert_worker, + .flags = 0, +}; + +static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_META || + lockres->l_type == OCFS2_LOCK_TYPE_RW || + lockres->l_type == OCFS2_LOCK_TYPE_OPEN; +} + +static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) +{ + return container_of(lksb, struct ocfs2_lock_res, l_lksb); +} + +static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + return (struct inode *) lockres->l_priv; +} + +static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres) +{ + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY); + + return (struct ocfs2_dentry_lock *)lockres->l_priv; +} + +static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres) +{ + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO); + + return (struct ocfs2_mem_dqinfo *)lockres->l_priv; +} + +static inline struct ocfs2_refcount_tree * +ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res) +{ + return container_of(res, struct ocfs2_refcount_tree, rf_lockres); +} + +static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) +{ + if (lockres->l_ops->get_osb) + return lockres->l_ops->get_osb(lockres); + + return (struct ocfs2_super *)lockres->l_priv; +} + +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 dlm_flags); +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted); +static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, unsigned long caller_ip); +static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + __ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert); +#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ + if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ + _err, _func, _lockres->l_name); \ + else \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \ + _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \ + (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \ +} while (0) +static int ocfs2_downconvert_thread(void *arg); +static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_inode_lock_update(struct inode *inode, + struct buffer_head **bh); +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); +static inline int ocfs2_highest_compat_lock_level(int level); +static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb, + unsigned int generation); +static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + + +static void ocfs2_build_lock_name(enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + char *name) +{ + int len; + + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); + + len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", + ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, + (long long)blkno, generation); + + BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); + + mlog(0, "built lock resource with name: %s\n", name); +} + +static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); + +static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, + struct ocfs2_dlm_debug *dlm_debug) +{ + mlog(0, "Add tracking for lockres %s\n", res->l_name); + + spin_lock(&ocfs2_dlm_tracking_lock); + list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) +{ + spin_lock(&ocfs2_dlm_tracking_lock); + if (!list_empty(&res->l_debug_list)) + list_del_init(&res->l_debug_list); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +#ifdef CONFIG_OCFS2_FS_STATS +static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ + res->l_lock_refresh = 0; + memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats)); + memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats)); +} + +static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, + struct ocfs2_mask_waiter *mw, int ret) +{ + u32 usec; + ktime_t kt; + struct ocfs2_lock_stats *stats; + + if (level == LKM_PRMODE) + stats = &res->l_lock_prmode; + else if (level == LKM_EXMODE) + stats = &res->l_lock_exmode; + else + return; + + kt = ktime_sub(ktime_get(), mw->mw_lock_start); + usec = ktime_to_us(kt); + + stats->ls_gets++; + stats->ls_total += ktime_to_ns(kt); + /* overflow */ + if (unlikely(stats->ls_gets == 0)) { + stats->ls_gets++; + stats->ls_total = ktime_to_ns(kt); + } + + if (stats->ls_max < usec) + stats->ls_max = usec; + + if (ret) + stats->ls_fail++; +} + +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ + lockres->l_lock_refresh++; +} + +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ + mw->mw_lock_start = ktime_get(); +} +#else +static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ +} +static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, + int level, struct ocfs2_mask_waiter *mw, int ret) +{ +} +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ +} +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ +} +#endif + +static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, + struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + struct ocfs2_lock_res_ops *ops, + void *priv) +{ + res->l_type = type; + res->l_ops = ops; + res->l_priv = priv; + + res->l_level = DLM_LOCK_IV; + res->l_requested = DLM_LOCK_IV; + res->l_blocking = DLM_LOCK_IV; + res->l_action = OCFS2_AST_INVALID; + res->l_unlock_action = OCFS2_UNLOCK_INVALID; + + res->l_flags = OCFS2_LOCK_INITIALIZED; + + ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); + + ocfs2_init_lock_stats(res); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (type != OCFS2_LOCK_TYPE_OPEN) + lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type], + &lockdep_keys[type], 0); + else + res->l_lockdep_map.key = NULL; +#endif +} + +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) +{ + /* This also clears out the lock status block */ + memset(res, 0, sizeof(struct ocfs2_lock_res)); + spin_lock_init(&res->l_lock); + init_waitqueue_head(&res->l_event); + INIT_LIST_HEAD(&res->l_blocked_list); + INIT_LIST_HEAD(&res->l_mask_waiters); +} + +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + unsigned int generation, + struct inode *inode) +{ + struct ocfs2_lock_res_ops *ops; + + switch(type) { + case OCFS2_LOCK_TYPE_RW: + ops = &ocfs2_inode_rw_lops; + break; + case OCFS2_LOCK_TYPE_META: + ops = &ocfs2_inode_inode_lops; + break; + case OCFS2_LOCK_TYPE_OPEN: + ops = &ocfs2_inode_open_lops; + break; + default: + mlog_bug_on_msg(1, "type: %d\n", type); + ops = NULL; /* thanks, gcc */ + break; + }; + + ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, + generation, res->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode); +} + +static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return OCFS2_SB(inode->i_sb); +} + +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_mem_dqinfo *info = lockres->l_priv; + + return OCFS2_SB(info->dqi_gi.dqi_sb); +} + +static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_file_private *fp = lockres->l_priv; + + return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb); +} + +static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) +{ + __be64 inode_blkno_be; + + memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], + sizeof(__be64)); + + return be64_to_cpu(inode_blkno_be); +} + +static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_dentry_lock *dl = lockres->l_priv; + + return OCFS2_SB(dl->dl_inode->i_sb); +} + +void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, + u64 parent, struct inode *inode) +{ + int len; + u64 inode_blkno = OCFS2_I(inode)->ip_blkno; + __be64 inode_blkno_be = cpu_to_be64(inode_blkno); + struct ocfs2_lock_res *lockres = &dl->dl_lockres; + + ocfs2_lock_res_init_once(lockres); + + /* + * Unfortunately, the standard lock naming scheme won't work + * here because we have two 16 byte values to use. Instead, + * we'll stuff the inode number as a binary value. We still + * want error prints to show something without garbling the + * display, so drop a null byte in there before the inode + * number. A future version of OCFS2 will likely use all + * binary lock names. The stringified names have been a + * tremendous aid in debugging, but now that the debugfs + * interface exists, we can mangle things there if need be. + * + * NOTE: We also drop the standard "pad" value (the total lock + * name size stays the same though - the last part is all + * zeros due to the memset in ocfs2_lock_res_init_once() + */ + len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START, + "%c%016llx", + ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY), + (long long)parent); + + BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1)); + + memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be, + sizeof(__be64)); + + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, + OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops, + dl); +} + +static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Superblock lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO, + 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, + &ocfs2_super_lops, osb); +} + +static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Rename lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, + &ocfs2_rename_lops, osb); +} + +static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* nfs_sync lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC, + &ocfs2_nfs_sync_lops, osb); +} + +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN, + &ocfs2_orphan_scan_lops, osb); +} + +void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_file_private *fp) +{ + struct inode *inode = fp->fp_file->f_mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno, + inode->i_generation, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, + OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops, + fp); + lockres->l_flags |= OCFS2_LOCK_NOCACHE; +} + +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type, + 0, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres, + OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops, + info); +} + +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno, + generation, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT, + &ocfs2_refcount_block_lops, osb); +} + +void ocfs2_lock_res_free(struct ocfs2_lock_res *res) +{ + if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) + return; + + ocfs2_remove_lockres_tracking(res); + + mlog_bug_on_msg(!list_empty(&res->l_blocked_list), + "Lockres %s is on the blocked list\n", + res->l_name); + mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), + "Lockres %s has mask waiters pending\n", + res->l_name); + mlog_bug_on_msg(spin_is_locked(&res->l_lock), + "Lockres %s is locked\n", + res->l_name); + mlog_bug_on_msg(res->l_ro_holders, + "Lockres %s has %u ro holders\n", + res->l_name, res->l_ro_holders); + mlog_bug_on_msg(res->l_ex_holders, + "Lockres %s has %u ex holders\n", + res->l_name, res->l_ex_holders); + + /* Need to clear out the lock status block for the dlm */ + memset(&res->l_lksb, 0, sizeof(res->l_lksb)); + + res->l_flags = 0UL; +} + +static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, + int level) +{ + BUG_ON(!lockres); + + switch(level) { + case DLM_LOCK_EX: + lockres->l_ex_holders++; + break; + case DLM_LOCK_PR: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, + int level) +{ + BUG_ON(!lockres); + + switch(level) { + case DLM_LOCK_EX: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case DLM_LOCK_PR: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. */ +static inline int ocfs2_highest_compat_lock_level(int level) +{ + int new_level = DLM_LOCK_EX; + + if (level == DLM_LOCK_EX) + new_level = DLM_LOCK_NL; + else if (level == DLM_LOCK_PR) + new_level = DLM_LOCK_PR; + return new_level; +} + +static void lockres_set_flags(struct ocfs2_lock_res *lockres, + unsigned long newflags) +{ + struct ocfs2_mask_waiter *mw, *tmp; + + assert_spin_locked(&lockres->l_lock); + + lockres->l_flags = newflags; + + list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + continue; + + list_del_init(&mw->mw_item); + mw->mw_status = 0; + complete(&mw->mw_complete); + } +} +static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) +{ + lockres_set_flags(lockres, lockres->l_flags | or); +} +static void lockres_clear_flags(struct ocfs2_lock_res *lockres, + unsigned long clear) +{ + lockres_set_flags(lockres, lockres->l_flags & ~clear); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); + + lockres->l_level = lockres->l_requested; + if (lockres->l_level <= + ocfs2_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = DLM_LOCK_NL; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + } + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + + /* Convert from RO to EX doesn't really need anything as our + * information is already up to data. Convert from NL to + * *anything* however should mark ourselves as needing an + * update */ + if (lockres->l_level == DLM_LOCK_NL && + lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + + /* + * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing + * the OCFS2_LOCK_BUSY flag to prevent the dc thread from + * downconverting the lock before the upconvert has fully completed. + * Do not prevent the dc thread from downconverting if NONBLOCK lock + * had already returned. + */ + if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED)) + lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + else + lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED); + + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + + if (lockres->l_requested > DLM_LOCK_NL && + !(lockres->l_flags & OCFS2_LOCK_LOCAL) && + lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert = 0; + + assert_spin_locked(&lockres->l_lock); + + if (level > lockres->l_blocking) { + /* only schedule a downconvert if we haven't already scheduled + * one that goes low enough to satisfy the level we're + * blocking. this also catches the case where we get + * duplicate BASTs */ + if (ocfs2_highest_compat_lock_level(level) < + ocfs2_highest_compat_lock_level(lockres->l_blocking)) + needs_downconvert = 1; + + lockres->l_blocking = level; + } + + mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n", + lockres->l_name, level, lockres->l_level, lockres->l_blocking, + needs_downconvert); + + if (needs_downconvert) + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + mlog(0, "needs_downconvert = %d\n", needs_downconvert); + return needs_downconvert; +} + +/* + * OCFS2_LOCK_PENDING and l_pending_gen. + * + * Why does OCFS2_LOCK_PENDING exist? To close a race between setting + * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock() + * for more details on the race. + * + * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces + * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock() + * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear + * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns, + * the caller is going to try to clear PENDING again. If nothing else is + * happening, __lockres_clear_pending() sees PENDING is unset and does + * nothing. + * + * But what if another path (eg downconvert thread) has just started a + * new locking action? The other path has re-set PENDING. Our path + * cannot clear PENDING, because that will re-open the original race + * window. + * + * [Example] + * + * ocfs2_meta_lock() + * ocfs2_cluster_lock() + * set BUSY + * set PENDING + * drop l_lock + * ocfs2_dlm_lock() + * ocfs2_locking_ast() ocfs2_downconvert_thread() + * clear PENDING ocfs2_unblock_lock() + * take_l_lock + * !BUSY + * ocfs2_prepare_downconvert() + * set BUSY + * set PENDING + * drop l_lock + * take l_lock + * clear PENDING + * drop l_lock + * + * ocfs2_dlm_lock() + * + * So as you can see, we now have a window where l_lock is not held, + * PENDING is not set, and ocfs2_dlm_lock() has not been called. + * + * The core problem is that ocfs2_cluster_lock() has cleared the PENDING + * set by ocfs2_prepare_downconvert(). That wasn't nice. + * + * To solve this we introduce l_pending_gen. A call to + * lockres_clear_pending() will only do so when it is passed a generation + * number that matches the lockres. lockres_set_pending() will return the + * current generation number. When ocfs2_cluster_lock() goes to clear + * PENDING, it passes the generation it got from set_pending(). In our + * example above, the generation numbers will *not* match. Thus, + * ocfs2_cluster_lock() will not clear the PENDING set by + * ocfs2_prepare_downconvert(). + */ + +/* Unlocked version for ocfs2_locking_ast() */ +static void __lockres_clear_pending(struct ocfs2_lock_res *lockres, + unsigned int generation, + struct ocfs2_super *osb) +{ + assert_spin_locked(&lockres->l_lock); + + /* + * The ast and locking functions can race us here. The winner + * will clear pending, the loser will not. + */ + if (!(lockres->l_flags & OCFS2_LOCK_PENDING) || + (lockres->l_pending_gen != generation)) + return; + + lockres_clear_flags(lockres, OCFS2_LOCK_PENDING); + lockres->l_pending_gen++; + + /* + * The downconvert thread may have skipped us because we + * were PENDING. Wake it up. + */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(osb); +} + +/* Locked version for callers of ocfs2_dlm_lock() */ +static void lockres_clear_pending(struct ocfs2_lock_res *lockres, + unsigned int generation, + struct ocfs2_super *osb) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + __lockres_clear_pending(lockres, generation, osb); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + + lockres_or_flags(lockres, OCFS2_LOCK_PENDING); + + return lockres->l_pending_gen; +} + +static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + int needs_downconvert; + unsigned long flags; + + BUG_ON(level <= DLM_LOCK_NL); + + mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, " + "type %s\n", lockres->l_name, level, lockres->l_level, + ocfs2_lock_type_string(lockres->l_type)); + + /* + * We can skip the bast for locks which don't enable caching - + * they'll be dropped at the earliest possible time anyway. + */ + if (lockres->l_flags & OCFS2_LOCK_NOCACHE) + return; + + spin_lock_irqsave(&lockres->l_lock, flags); + needs_downconvert = ocfs2_generic_handle_bast(lockres, level); + if (needs_downconvert) + ocfs2_schedule_blocked_lock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + + ocfs2_wake_downconvert_thread(osb); +} + +static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + unsigned long flags; + int status; + + spin_lock_irqsave(&lockres->l_lock, flags); + + status = ocfs2_dlm_lock_status(&lockres->l_lksb); + + if (status == -EAGAIN) { + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + goto out; + } + + if (status) { + mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n", + lockres->l_name, status); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, " + "level %d => %d\n", lockres->l_name, lockres->l_action, + lockres->l_unlock_action, lockres->l_level, lockres->l_requested); + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); + break; + case OCFS2_AST_CONVERT: + ocfs2_generic_handle_convert_action(lockres); + break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, " + "flags 0x%lx, unlock: %u\n", + lockres->l_name, lockres->l_action, lockres->l_flags, + lockres->l_unlock_action); + BUG(); + } +out: + /* set it to something invalid so if we get called again we + * can catch it. */ + lockres->l_action = OCFS2_AST_INVALID; + + /* Did we try to cancel this lock? Clear that state */ + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + + /* + * We may have beaten the locking functions here. We certainly + * know that dlm_lock() has been called :-) + * Because we can't have two lock calls in flight at once, we + * can use lockres->l_pending_gen. + */ + __lockres_clear_pending(lockres, lockres->l_pending_gen, osb); + + wake_up(&lockres->l_event); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + unsigned long flags; + + mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n", + lockres->l_name, lockres->l_unlock_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + if (error) { + mlog(ML_ERROR, "Dlm passes error %d for lock %s, " + "unlock_action %d\n", error, lockres->l_name, + lockres->l_unlock_action); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + switch(lockres->l_unlock_action) { + case OCFS2_UNLOCK_CANCEL_CONVERT: + mlog(0, "Cancel convert success for %s\n", lockres->l_name); + lockres->l_action = OCFS2_AST_INVALID; + /* Downconvert thread may have requeued this lock, we + * need to wake it. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres)); + break; + case OCFS2_UNLOCK_DROP_LOCK: + lockres->l_level = DLM_LOCK_IV; + break; + default: + BUG(); + } + + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + wake_up(&lockres->l_event); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +/* + * This is the filesystem locking protocol. It provides the lock handling + * hooks for the underlying DLM. It has a maximum version number. + * The version number allows interoperability with systems running at + * the same major number and an equal or smaller minor number. + * + * Whenever the filesystem does new things with locks (adds or removes a + * lock, orders them differently, does different things underneath a lock), + * the version must be changed. The protocol is negotiated when joining + * the dlm domain. A node may join the domain if its major version is + * identical to all other nodes and its minor version is greater than + * or equal to all other nodes. When its minor version is greater than + * the other nodes, it will run at the minor version specified by the + * other nodes. + * + * If a locking change is made that will not be compatible with older + * versions, the major number must be increased and the minor version set + * to zero. If a change merely adds a behavior that can be disabled when + * speaking to older versions, the minor version must be increased. If a + * change adds a fully backwards compatible change (eg, LVB changes that + * are just ignored by older versions), the version does not need to be + * updated. + */ +static struct ocfs2_locking_protocol lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = ocfs2_locking_ast, + .lp_blocking_ast = ocfs2_blocking_ast, + .lp_unlock_ast = ocfs2_unlock_ast, +}; + +void ocfs2_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version); +} + +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + if (convert) + lockres->l_action = OCFS2_AST_INVALID; + else + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +/* Note: If we detect another process working on the lock (i.e., + * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller + * to do the right thing in that case. + */ +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 dlm_flags) +{ + int ret = 0; + unsigned long flags; + unsigned int gen; + + mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, + dlm_flags); + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || + (lockres->l_flags & OCFS2_LOCK_BUSY)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + lockres->l_action = OCFS2_AST_ATTACH; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + gen = lockres_set_pending(lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_dlm_lock(osb->cconn, + level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, gen, osb); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 1); + } + + mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); + +bail: + return ret; +} + +static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, + int flag) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = lockres->l_flags & flag; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; +} + +static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); +} + +static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. */ +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + + return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); +} + +static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) +{ + INIT_LIST_HEAD(&mw->mw_item); + init_completion(&mw->mw_complete); + ocfs2_init_start_time(mw); +} + +static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) +{ + wait_for_completion(&mw->mw_complete); + /* Re-arm the completion in case we want to wait on it again */ + reinit_completion(&mw->mw_complete); + return mw->mw_status; +} + +static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw, + unsigned long mask, + unsigned long goal) +{ + BUG_ON(!list_empty(&mw->mw_item)); + + assert_spin_locked(&lockres->l_lock); + + list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); + mw->mw_mask = mask; + mw->mw_goal = goal; +} + +/* returns 0 if the mw that was removed was already satisfied, -EBUSY + * if the mask still hadn't reached its goal */ +static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + int ret = 0; + + assert_spin_locked(&lockres->l_lock); + if (!list_empty(&mw->mw_item)) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + ret = -EBUSY; + + list_del_init(&mw->mw_item); + init_completion(&mw->mw_complete); + } + + return ret; +} + +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = __lockres_remove_mask_waiter(lockres, mw); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; + +} + +static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ret = wait_for_completion_interruptible(&mw->mw_complete); + if (ret) + lockres_remove_mask_waiter(lockres, mw); + else + ret = mw->mw_status; + /* Re-arm the completion in case we want to wait on it again */ + reinit_completion(&mw->mw_complete); + return ret; +} + +static int __ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 lkm_flags, + int arg_flags, + int l_subclass, + unsigned long caller_ip) +{ + struct ocfs2_mask_waiter mw; + int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); + int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ + unsigned long flags; + unsigned int gen; + int noqueue_attempted = 0; + int dlm_locked = 0; + + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { + mlog_errno(-EINVAL); + return -EINVAL; + } + + ocfs2_init_mask_waiter(&mw); + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lkm_flags |= DLM_LKF_VALBLK; + +again: + wait = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + + if (catch_signals && signal_pending(current)) { + ret = -ERESTARTSYS; + goto unlock; + } + + mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, + "Cluster lock called on freeing lockres %s! flags " + "0x%lx\n", lockres->l_name, lockres->l_flags); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if (lockres->l_flags & OCFS2_LOCK_BUSY && + level > lockres->l_level) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + wait = 1; + goto unlock; + } + + if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) { + /* + * We've upconverted. If the lock now has a level we can + * work with, we take it. If, however, the lock is not at the + * required level, we go thru the full cycle. One way this could + * happen is if a process requesting an upconvert to PR is + * closely followed by another requesting upconvert to an EX. + * If the process requesting EX lands here, we want it to + * continue attempting to upconvert and let the process + * requesting PR take the lock. + * If multiple processes request upconvert to PR, the first one + * here will take the lock. The others will have to go thru the + * OCFS2_LOCK_BLOCKED check to ensure that there is no pending + * downconvert request. + */ + if (level <= lockres->l_level) + goto update_holders; + } + + if (lockres->l_flags & OCFS2_LOCK_BLOCKED && + !ocfs2_may_continue_on_blocked_lock(lockres, level)) { + /* is the lock is currently blocked on behalf of + * another node */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); + wait = 1; + goto unlock; + } + + if (level > lockres->l_level) { + if (noqueue_attempted > 0) { + ret = -EAGAIN; + goto unlock; + } + if (lkm_flags & DLM_LKF_NOQUEUE) + noqueue_attempted = 1; + + if (lockres->l_action != OCFS2_AST_INVALID) + mlog(ML_ERROR, "lockres %s has action %u pending\n", + lockres->l_name, lockres->l_action); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + lockres->l_action = OCFS2_AST_ATTACH; + lkm_flags &= ~DLM_LKF_CONVERT; + } else { + lockres->l_action = OCFS2_AST_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; + } + + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + gen = lockres_set_pending(lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + BUG_ON(level == DLM_LOCK_IV); + BUG_ON(level == DLM_LOCK_NL); + + mlog(ML_BASTS, "lockres %s, convert from %d to %d\n", + lockres->l_name, lockres->l_level, level); + + /* call dlm_lock to upgrade lock now */ + ret = ocfs2_dlm_lock(osb->cconn, + level, + &lockres->l_lksb, + lkm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, gen, osb); + if (ret) { + if (!(lkm_flags & DLM_LKF_NOQUEUE) || + (ret != -EAGAIN)) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", + ret, lockres); + } + ocfs2_recover_from_dlm_error(lockres, 1); + goto out; + } + dlm_locked = 1; + + mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", + lockres->l_name); + + /* At this point we've gone inside the dlm and need to + * complete our work regardless. */ + catch_signals = 0; + + /* wait for busy to clear and carry on */ + goto again; + } + +update_holders: + /* Ok, if we get here then we're good to go. */ + ocfs2_inc_holders(lockres, level); + + ret = 0; +unlock: + lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + + spin_unlock_irqrestore(&lockres->l_lock, flags); +out: + /* + * This is helping work around a lock inversion between the page lock + * and dlm locks. One path holds the page lock while calling aops + * which block acquiring dlm locks. The voting thread holds dlm + * locks while acquiring page locks while down converting data locks. + * This block is helping an aop path notice the inversion and back + * off to unlock its page lock before trying the dlm lock again. + */ + if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && + mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { + wait = 0; + spin_lock_irqsave(&lockres->l_lock, flags); + if (__lockres_remove_mask_waiter(lockres, &mw)) { + if (dlm_locked) + lockres_or_flags(lockres, + OCFS2_LOCK_NONBLOCK_FINISHED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = -EAGAIN; + } else { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto again; + } + } + if (wait) { + ret = ocfs2_wait_for_mask(&mw); + if (ret == 0) + goto again; + mlog_errno(ret); + } + ocfs2_update_lock_stats(lockres, level, &mw, ret); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!ret && lockres->l_lockdep_map.key != NULL) { + if (level == DLM_LOCK_PR) + rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass, + !!(arg_flags & OCFS2_META_LOCK_NOQUEUE), + caller_ip); + else + rwsem_acquire(&lockres->l_lockdep_map, l_subclass, + !!(arg_flags & OCFS2_META_LOCK_NOQUEUE), + caller_ip); + } +#endif + return ret; +} + +static inline int ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 lkm_flags, + int arg_flags) +{ + return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags, + 0, _RET_IP_); +} + + +static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + unsigned long caller_ip) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + ocfs2_dec_holders(lockres, level); + ocfs2_downconvert_on_unlock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (lockres->l_lockdep_map.key != NULL) + rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); +#endif +} + +static int ocfs2_create_new_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int ex, + int local) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned long flags; + u32 lkm_flags = local ? DLM_LKF_LOCAL : 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ocfs2_lock_create(osb, lockres, level, lkm_flags); +} + +/* Grants us an EX lock on the data and metadata resources, skipping + * the normal cluster directory lookup. Use this ONLY on newly created + * inodes which other nodes can't possibly see, and which haven't been + * hashed in the inode hash yet. This can give us a good performance + * increase as it'll skip the network broadcast normally associated + * with creating a new lock resource. */ +int ocfs2_create_new_inode_locks(struct inode *inode) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + BUG_ON(!ocfs2_inode_is_new(inode)); + + mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); + + /* NOTE: That we don't increment any of the holder counts, nor + * do we add anything to a journal handle. Since this is + * supposed to be a new inode which the cluster doesn't know + * about yet, there is no need to. As far as the LVB handling + * is concerned, this is basically like acquiring an EX lock + * on a resource which has an invalid one -- we'll set it + * valid when we release the EX. */ + + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1); + if (ret) { + mlog_errno(ret); + goto bail; + } + + /* + * We don't want to use DLM_LKF_LOCAL on a meta data lock as they + * don't use a generation in their lock names. + */ + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + +bail: + return ret; +} + +int ocfs2_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu take %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, + 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_rw_unlock(struct inode *inode, int write) +{ + int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); +} + +/* + * ocfs2_open_lock always get PR mode lock. + */ +int ocfs2_open_lock(struct inode *inode) +{ + int status = 0; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu take PRMODE open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_PR, 0, 0); + if (status < 0) + mlog_errno(status); + +out: + return status; +} + +int ocfs2_try_open_lock(struct inode *inode, int write) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu try to take %s open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_is_hard_readonly(osb)) { + if (write) + status = -EROFS; + goto out; + } + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + /* + * The file system may already holding a PRMODE/EXMODE open lock. + * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on + * other nodes and the -EAGAIN will indicate to the caller that + * this inode is still in use. + */ + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + level, DLM_LKF_NOQUEUE, 0); + +out: + return status; +} + +/* + * ocfs2_open_unlock unlock PR and EX mode open locks. + */ +void ocfs2_open_unlock(struct inode *inode) +{ + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + if(lockres->l_ro_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_PR); + if(lockres->l_ex_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_EX); + +out: + return; +} + +static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres, + int level) +{ + int ret; + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + unsigned long flags; + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + +retry_cancel: + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + ret = ocfs2_prepare_cancel_convert(osb, lockres); + if (ret) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + goto retry_cancel; + } + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_for_mask(&mw); + goto retry_cancel; + } + + ret = -ERESTARTSYS; + /* + * We may still have gotten the lock, in which case there's no + * point to restarting the syscall. + */ + if (lockres->l_level == level) + ret = 0; + + mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret, + lockres->l_flags, lockres->l_level, lockres->l_action); + + spin_unlock_irqrestore(&lockres->l_lock, flags); + +out: + return ret; +} + +/* + * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of + * flock() calls. The locking approach this requires is sufficiently + * different from all other cluster lock types that we implement a + * separate path to the "low-level" dlm calls. In particular: + * + * - No optimization of lock levels is done - we take at exactly + * what's been requested. + * + * - No lock caching is employed. We immediately downconvert to + * no-lock at unlock time. This also means flock locks never go on + * the blocking list). + * + * - Since userspace can trivially deadlock itself with flock, we make + * sure to allow cancellation of a misbehaving applications flock() + * request. + * + * - Access to any flock lockres doesn't require concurrency, so we + * can simplify the code by requiring the caller to guarantee + * serialization of dlmglue flock calls. + */ +int ocfs2_file_lock(struct file *file, int ex, int trylock) +{ + int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0; + unsigned long flags; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb); + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + + if ((lockres->l_flags & OCFS2_LOCK_BUSY) || + (lockres->l_level > DLM_LOCK_NL)) { + mlog(ML_ERROR, + "File lock \"%s\" has busy or locked state: flags: 0x%lx, " + "level: %u\n", lockres->l_name, lockres->l_flags, + lockres->l_level); + return -EINVAL; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + /* + * Get the lock at NLMODE to start - that way we + * can cancel the upconvert request if need be. + */ + ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_wait_for_mask(&mw); + if (ret) { + mlog_errno(ret); + goto out; + } + spin_lock_irqsave(&lockres->l_lock, flags); + } + + lockres->l_action = OCFS2_AST_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, + lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1); + if (ret) { + if (!trylock || (ret != -EAGAIN)) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ret = -EINVAL; + } + + ocfs2_recover_from_dlm_error(lockres, 1); + lockres_remove_mask_waiter(lockres, &mw); + goto out; + } + + ret = ocfs2_wait_for_mask_interruptible(&mw, lockres); + if (ret == -ERESTARTSYS) { + /* + * Userspace can cause deadlock itself with + * flock(). Current behavior locally is to allow the + * deadlock, but abort the system call if a signal is + * received. We follow this example, otherwise a + * poorly written program could sit in kernel until + * reboot. + * + * Handling this is a bit more complicated for Ocfs2 + * though. We can't exit this function with an + * outstanding lock request, so a cancel convert is + * required. We intentionally overwrite 'ret' - if the + * cancel fails and the lock was granted, it's easier + * to just bubble success back up to the user. + */ + ret = ocfs2_flock_handle_signal(lockres, level); + } else if (!ret && (level > lockres->l_level)) { + /* Trylock failed asynchronously */ + BUG_ON(!trylock); + ret = -EAGAIN; + } + +out: + + mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n", + lockres->l_name, ex, trylock, ret); + return ret; +} + +void ocfs2_file_unlock(struct file *file) +{ + int ret; + unsigned int gen; + unsigned long flags; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb); + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) + return; + + if (lockres->l_level == DLM_LOCK_NL) + return; + + mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", + lockres->l_name, lockres->l_flags, lockres->l_level, + lockres->l_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + /* + * Fake a blocking ast for the downconvert code. + */ + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + lockres->l_blocking = DLM_LOCK_EX; + + gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL); + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen); + if (ret) { + mlog_errno(ret); + return; + } + + ret = ocfs2_wait_for_mask(&mw); + if (ret) + mlog_errno(ret); +} + +static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int kick = 0; + + /* If we know that another node is waiting on our lock, kick + * the downconvert thread * pre-emptively when we reach a release + * condition. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { + switch(lockres->l_blocking) { + case DLM_LOCK_EX: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + kick = 1; + break; + case DLM_LOCK_PR: + if (!lockres->l_ex_holders) + kick = 1; + break; + default: + BUG(); + } + } + + if (kick) + ocfs2_wake_downconvert_thread(osb); +} + +#define OCFS2_SEC_BITS 34 +#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) + +/* LVB only has room for 64 bits of time here so we pack it for + * now. */ +static u64 ocfs2_pack_timespec(struct timespec *spec) +{ + u64 res; + u64 sec = spec->tv_sec; + u32 nsec = spec->tv_nsec; + + res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); + + return res; +} + +/* Call this with the lockres locked. I am reasonably sure we don't + * need ip_lock in this function as anyone who would be changing those + * values is supposed to be blocked in ocfs2_inode_lock right now. */ +static void __ocfs2_stuff_meta_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_meta_lvb *lvb; + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + /* + * Invalidate the LVB of a deleted inode - this way other + * nodes are forced to go to disk and discover the new inode + * status. + */ + if (oi->ip_flags & OCFS2_INODE_DELETED) { + lvb->lvb_version = 0; + goto out; + } + + lvb->lvb_version = OCFS2_LVB_VERSION; + lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); + lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); + lvb->lvb_iuid = cpu_to_be32(i_uid_read(inode)); + lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); + lvb->lvb_imode = cpu_to_be16(inode->i_mode); + lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + lvb->lvb_iatime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); + lvb->lvb_ictime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + lvb->lvb_imtime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); + lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); + lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); + +out: + mlog_meta_lvb(0, lockres); +} + +static void ocfs2_unpack_timespec(struct timespec *spec, + u64 packed_time) +{ + spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; + spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; +} + +static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_meta_lvb(0, lockres); + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + /* We're safe here without the lockres lock... */ + spin_lock(&oi->ip_lock); + oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters); + i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); + + oi->ip_attr = be32_to_cpu(lvb->lvb_iattr); + oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures); + ocfs2_set_inode_flags(inode); + + /* fast-symlinks are a special case */ + if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_inode_sector_count(inode); + + i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid)); + i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); + inode->i_mode = be16_to_cpu(lvb->lvb_imode); + set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); + ocfs2_unpack_timespec(&inode->i_atime, + be64_to_cpu(lvb->lvb_iatime_packed)); + ocfs2_unpack_timespec(&inode->i_mtime, + be64_to_cpu(lvb->lvb_imtime_packed)); + ocfs2_unpack_timespec(&inode->i_ctime, + be64_to_cpu(lvb->lvb_ictime_packed)); + spin_unlock(&oi->ip_lock); +} + +static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) + && lvb->lvb_version == OCFS2_LVB_VERSION + && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) + return 1; + return 0; +} + +/* Determine whether a lock resource needs to be refreshed, and + * arbitrate who gets to refresh it. + * + * 0 means no refresh needed. + * + * > 0 means you need to refresh this and you MUST call + * ocfs2_complete_lock_res_refresh afterwards. */ +static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) +{ + unsigned long flags; + int status = 0; + +refresh_check: + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_on_refreshing_lock(lockres); + goto refresh_check; + } + + /* Ok, I'll be the one to refresh this lock. */ + lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = 1; +bail: + mlog(0, "status %d\n", status); + return status; +} + +/* If status is non zero, I'll mark it as not being in refresh + * anymroe, but i won't clear the needs refresh flag. */ +static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres, + int status) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); + if (!status) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +/* may or may not return a bh if it went to disk. */ +static int ocfs2_inode_lock_update(struct inode *inode, + struct buffer_head **bh) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_dinode *fe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_mount_local(osb)) + goto bail; + + spin_lock(&oi->ip_lock); + if (oi->ip_flags & OCFS2_INODE_DELETED) { + mlog(0, "Orphaned inode %llu was deleted while we " + "were waiting on a lock. ip_flags = 0x%x\n", + (unsigned long long)oi->ip_blkno, oi->ip_flags); + spin_unlock(&oi->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&oi->ip_lock); + + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + + /* This will discard any caching information we might have had + * for the inode metadata. */ + ocfs2_metadata_cache_purge(INODE_CACHE(inode)); + + ocfs2_extent_map_trunc(inode, 0); + + if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { + mlog(0, "Trusting LVB on inode %llu\n", + (unsigned long long)oi->ip_blkno); + ocfs2_refresh_inode_from_lvb(inode); + } else { + /* Boo, we have to go to disk. */ + /* read bh, cast, ocfs2_refresh_inode */ + status = ocfs2_read_inode_block(inode, bh); + if (status < 0) { + mlog_errno(status); + goto bail_refresh; + } + fe = (struct ocfs2_dinode *) (*bh)->b_data; + + /* This is a good chance to make sure we're not + * locking an invalid object. ocfs2_read_inode_block() + * already checked that the inode block is sane. + * + * We bug on a stale inode here because we checked + * above whether it was wiped from disk. The wiping + * node provides a guarantee that we receive that + * message and can mark the inode before dropping any + * locks associated with it. */ + mlog_bug_on_msg(inode->i_generation != + le32_to_cpu(fe->i_generation), + "Invalid dinode %llu disk generation: %u " + "inode->i_generation: %u\n", + (unsigned long long)oi->ip_blkno, + le32_to_cpu(fe->i_generation), + inode->i_generation); + mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), + "Stale dinode %llu dtime: %llu flags: 0x%x\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_dtime), + le32_to_cpu(fe->i_flags)); + + ocfs2_refresh_inode(inode, fe); + ocfs2_track_lock_refresh(lockres); + } + + status = 0; +bail_refresh: + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + return status; +} + +static int ocfs2_assign_bh(struct inode *inode, + struct buffer_head **ret_bh, + struct buffer_head *passed_bh) +{ + int status; + + if (passed_bh) { + /* Ok, the update went to disk for us, use the + * returned bh. */ + *ret_bh = passed_bh; + get_bh(*ret_bh); + + return 0; + } + + status = ocfs2_read_inode_block(inode, ret_bh); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * returns < 0 error if the callback will never be called, otherwise + * the result of the lock will be communicated via the callback. + */ +int ocfs2_inode_lock_full_nested(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + int arg_flags, + int subclass) +{ + int status, level, acquired; + u32 dlm_flags; + struct ocfs2_lock_res *lockres = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *local_bh = NULL; + + BUG_ON(!inode); + + mlog(0, "inode %llu, take %s META lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + status = 0; + acquired = 0; + /* We'll allow faking a readonly metadata lock for + * rodevices. */ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto getbh; + } + + if (ocfs2_mount_local(osb)) + goto local; + + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + ocfs2_wait_for_recovery(osb); + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + dlm_flags = 0; + if (arg_flags & OCFS2_META_LOCK_NOQUEUE) + dlm_flags |= DLM_LKF_NOQUEUE; + + status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, + arg_flags, subclass, _RET_IP_); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto bail; + } + + /* Notify the error cleanup path to drop the cluster lock. */ + acquired = 1; + + /* We wait twice because a node may have died while we were in + * the lower dlm layers. The second time though, we've + * committed to owning this lock so we don't allow signals to + * abort the operation. */ + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + ocfs2_wait_for_recovery(osb); + +local: + /* + * We only see this flag if we're being called from + * ocfs2_read_locked_inode(). It means we're locking an inode + * which hasn't been populated yet, so clear the refresh flag + * and let the caller handle it. + */ + if (inode->i_state & I_NEW) { + status = 0; + if (lockres) + ocfs2_complete_lock_res_refresh(lockres, 0); + goto bail; + } + + /* This is fun. The caller may want a bh back, or it may + * not. ocfs2_inode_lock_update definitely wants one in, but + * may or may not read one, depending on what's in the + * LVB. The result of all of this is that we've *only* gone to + * disk if we have to, so the complexity is worthwhile. */ + status = ocfs2_inode_lock_update(inode, &local_bh); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } +getbh: + if (ret_bh) { + status = ocfs2_assign_bh(inode, ret_bh, local_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (status < 0) { + if (ret_bh && (*ret_bh)) { + brelse(*ret_bh); + *ret_bh = NULL; + } + if (acquired) + ocfs2_inode_unlock(inode, ex); + } + + if (local_bh) + brelse(local_bh); + + return status; +} + +/* + * This is working around a lock inversion between tasks acquiring DLM + * locks while holding a page lock and the downconvert thread which + * blocks dlm lock acquiry while acquiring page locks. + * + * ** These _with_page variantes are only intended to be called from aop + * methods that hold page locks and return a very specific *positive* error + * code that aop methods pass up to the VFS -- test for errors with != 0. ** + * + * The DLM is called such that it returns -EAGAIN if it would have + * blocked waiting for the downconvert thread. In that case we unlock + * our page so the downconvert thread can make progress. Once we've + * done this we have to return AOP_TRUNCATED_PAGE so the aop method + * that called us can bubble that back up into the VFS who will then + * immediately retry the aop call. + * + * We do a blocking lock and immediate unlock before returning, though, so that + * the lock has a great chance of being cached on this node by the time the VFS + * calls back to retry the aop. This has a potential to livelock as nodes + * ping locks back and forth, but that's a risk we're willing to take to avoid + * the lock inversion simply. + */ +int ocfs2_inode_lock_with_page(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct page *page) +{ + int ret; + + ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) + ocfs2_inode_unlock(inode, ex); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +int ocfs2_inode_lock_atime(struct inode *inode, + struct vfsmount *vfsmnt, + int *level) +{ + int ret; + + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* + * If we should update atime, we will get EX lock, + * otherwise we just get PR lock. + */ + if (ocfs2_should_update_atime(inode, vfsmnt)) { + struct buffer_head *bh = NULL; + + ocfs2_inode_unlock(inode, 0); + ret = ocfs2_inode_lock(inode, &bh, 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + *level = 1; + if (ocfs2_should_update_atime(inode, vfsmnt)) + ocfs2_update_inode_atime(inode, bh); + if (bh) + brelse(bh); + } else + *level = 0; + + return ret; +} + +void ocfs2_inode_unlock(struct inode *inode, + int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop %s META lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); +} + +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int status = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &osb->osb_orphan_scan.os_lockres; + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); + if (status < 0) + return status; + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION) + *seqno = be32_to_cpu(lvb->lvb_os_seqno); + else + *seqno = osb->osb_orphan_scan.os_seqno + 1; + + return status; +} + +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) { + lockres = &osb->osb_orphan_scan.os_lockres; + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; + lvb->lvb_os_seqno = cpu_to_be32(seqno); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); + } +} + +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex) +{ + int status = 0; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + goto bail; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* The super block lock path is really in the best position to + * know when resources covered by the lock need to be + * refreshed, so we do it here. Of course, making sense of + * everything is up to the caller :) */ + status = ocfs2_should_refresh_lock_res(lockres); + if (status) { + status = ocfs2_refresh_slot_info(osb); + + ocfs2_complete_lock_res_refresh(lockres, status); + + if (status < 0) { + ocfs2_cluster_unlock(osb, lockres, level); + mlog_errno(status); + } + ocfs2_track_lock_refresh(lockres); + } +bail: + return status; +} + +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +int ocfs2_rename_lock(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_rename_unlock(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); +} + +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE, + 0, 0); + if (status < 0) + mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + + return status; +} + +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) +{ + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, + ex ? LKM_EXMODE : LKM_PRMODE); +} + +int ocfs2_dentry_lock(struct dentry *dentry, int ex) +{ + int ret; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + BUG_ON(!dl); + + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + return -EROFS; + return 0; + } + + if (ocfs2_mount_local(osb)) + return 0; + + ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0); + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +void ocfs2_dentry_unlock(struct dentry *dentry, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); +} + +/* Reference counting of the dlm debug structure. We want this because + * open references on the debug inodes can live on after a mount, so + * we can't rely on the ocfs2_super to always exist. */ +static void ocfs2_dlm_debug_free(struct kref *kref) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); + + kfree(dlm_debug); +} + +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) +{ + if (dlm_debug) + kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); +} + +static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) +{ + kref_get(&debug->d_refcnt); +} + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); + if (!dlm_debug) { + mlog_errno(-ENOMEM); + goto out; + } + + kref_init(&dlm_debug->d_refcnt); + INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); + dlm_debug->d_locking_state = NULL; +out: + return dlm_debug; +} + +/* Access to this is arbitrated for us via seq_file->sem. */ +struct ocfs2_dlm_seq_priv { + struct ocfs2_dlm_debug *p_dlm_debug; + struct ocfs2_lock_res p_iter_res; + struct ocfs2_lock_res p_tmp_res; +}; + +static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, + struct ocfs2_dlm_seq_priv *priv) +{ + struct ocfs2_lock_res *iter, *ret = NULL; + struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; + + assert_spin_locked(&ocfs2_dlm_tracking_lock); + + list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { + /* discover the head of the list */ + if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { + mlog(0, "End of list found, %p\n", ret); + break; + } + + /* We track our "dummy" iteration lockres' by a NULL + * l_ops field. */ + if (iter->l_ops != NULL) { + ret = iter; + break; + } + } + + return ret; +} + +static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); + if (iter) { + /* Since lockres' have the lifetime of their container + * (which can be inodes, ocfs2_supers, etc) we want to + * copy this out to a temporary lockres while still + * under the spinlock. Obviously after this we can't + * trust any pointers on the copy returned, but that's + * ok as the information we want isn't typically held + * in them. */ + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter = v; + struct ocfs2_lock_res *dummy = &priv->p_iter_res; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(iter, priv); + list_del_init(&dummy->l_debug_list); + if (iter) { + list_add(&dummy->l_debug_list, &iter->l_debug_list); + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +/* + * Version is used by debugfs.ocfs2 to determine the format being used + * + * New in version 2 + * - Lock stats printed + * New in version 3 + * - Max time in lock stats is in usecs (instead of nsecs) + */ +#define OCFS2_DLM_DEBUG_STR_VERSION 3 +static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) +{ + int i; + char *lvb; + struct ocfs2_lock_res *lockres = v; + + if (!lockres) + return -EINVAL; + + seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION); + + if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY) + seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1, + lockres->l_name, + (unsigned int)ocfs2_get_dentry_lock_ino(lockres)); + else + seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name); + + seq_printf(m, "%d\t" + "0x%lx\t" + "0x%x\t" + "0x%x\t" + "%u\t" + "%u\t" + "%d\t" + "%d\t", + lockres->l_level, + lockres->l_flags, + lockres->l_action, + lockres->l_unlock_action, + lockres->l_ro_holders, + lockres->l_ex_holders, + lockres->l_requested, + lockres->l_blocking); + + /* Dump the raw LVB */ + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + for(i = 0; i < DLM_LVB_LEN; i++) + seq_printf(m, "0x%x\t", lvb[i]); + +#ifdef CONFIG_OCFS2_FS_STATS +# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets) +# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets) +# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail) +# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail) +# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total) +# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total) +# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max) +# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max) +# define lock_refresh(_l) ((_l)->l_lock_refresh) +#else +# define lock_num_prmode(_l) (0) +# define lock_num_exmode(_l) (0) +# define lock_num_prmode_failed(_l) (0) +# define lock_num_exmode_failed(_l) (0) +# define lock_total_prmode(_l) (0ULL) +# define lock_total_exmode(_l) (0ULL) +# define lock_max_prmode(_l) (0) +# define lock_max_exmode(_l) (0) +# define lock_refresh(_l) (0) +#endif + /* The following seq_print was added in version 2 of this output */ + seq_printf(m, "%u\t" + "%u\t" + "%u\t" + "%u\t" + "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%u\t", + lock_num_prmode(lockres), + lock_num_exmode(lockres), + lock_num_prmode_failed(lockres), + lock_num_exmode_failed(lockres), + lock_total_prmode(lockres), + lock_total_exmode(lockres), + lock_max_prmode(lockres), + lock_max_exmode(lockres), + lock_refresh(lockres)); + + /* End the line */ + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations ocfs2_dlm_seq_ops = { + .start = ocfs2_dlm_seq_start, + .stop = ocfs2_dlm_seq_stop, + .next = ocfs2_dlm_seq_next, + .show = ocfs2_dlm_seq_show, +}; + +static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct ocfs2_dlm_seq_priv *priv = seq->private; + struct ocfs2_lock_res *res = &priv->p_iter_res; + + ocfs2_remove_lockres_tracking(res); + ocfs2_put_dlm_debug(priv->p_dlm_debug); + return seq_release_private(inode, file); +} + +static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) +{ + struct ocfs2_dlm_seq_priv *priv; + struct ocfs2_super *osb; + + priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv)); + if (!priv) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + osb = inode->i_private; + ocfs2_get_dlm_debug(osb->osb_dlm_debug); + priv->p_dlm_debug = osb->osb_dlm_debug; + INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); + + ocfs2_add_lockres_tracking(&priv->p_iter_res, + priv->p_dlm_debug); + + return 0; +} + +static const struct file_operations ocfs2_dlm_debug_fops = { + .open = ocfs2_dlm_debug_open, + .release = ocfs2_dlm_debug_release, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +{ + int ret = 0; + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + dlm_debug->d_locking_state = debugfs_create_file("locking_state", + S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_dlm_debug_fops); + if (!dlm_debug->d_locking_state) { + ret = -EINVAL; + mlog(ML_ERROR, + "Unable to create locking state debugfs file.\n"); + goto out; + } + + ocfs2_get_dlm_debug(dlm_debug); +out: + return ret; +} + +static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) +{ + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + if (dlm_debug) { + debugfs_remove(dlm_debug->d_locking_state); + ocfs2_put_dlm_debug(dlm_debug); + } +} + +int ocfs2_dlm_init(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_cluster_connection *conn = NULL; + + if (ocfs2_mount_local(osb)) { + osb->node_num = 0; + goto local; + } + + status = ocfs2_dlm_init_debug(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* launch downconvert thread */ + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + if (IS_ERR(osb->dc_task)) { + status = PTR_ERR(osb->dc_task); + osb->dc_task = NULL; + mlog_errno(status); + goto bail; + } + + /* for now, uuid == domain */ + status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->osb_cluster_name, + strlen(osb->osb_cluster_name), + osb->uuid_str, + strlen(osb->uuid_str), + &lproto, ocfs2_do_node_down, osb, + &conn); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_cluster_this_node(conn, &osb->node_num); + if (status < 0) { + mlog_errno(status); + mlog(ML_ERROR, + "could not find this host's node number\n"); + ocfs2_cluster_disconnect(conn, 0); + goto bail; + } + +local: + ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); + ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); + ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); + + osb->cconn = conn; + + status = 0; +bail: + if (status < 0) { + ocfs2_dlm_shutdown_debug(osb); + if (osb->dc_task) + kthread_stop(osb->dc_task); + } + + return status; +} + +void ocfs2_dlm_shutdown(struct ocfs2_super *osb, + int hangup_pending) +{ + ocfs2_drop_osb_locks(osb); + + /* + * Now that we have dropped all locks and ocfs2_dismount_volume() + * has disabled recovery, the DLM won't be talking to us. It's + * safe to tear things down before disconnecting the cluster. + */ + + if (osb->dc_task) { + kthread_stop(osb->dc_task); + osb->dc_task = NULL; + } + + ocfs2_lock_res_free(&osb->osb_super_lockres); + ocfs2_lock_res_free(&osb->osb_rename_lockres); + ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres); + + ocfs2_cluster_disconnect(osb->cconn, hangup_pending); + osb->cconn = NULL; + + ocfs2_dlm_shutdown_debug(osb); +} + +static int ocfs2_drop_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + unsigned long flags; + u32 lkm_flags = 0; + + /* We didn't get anywhere near actually using this lockres. */ + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) + goto out; + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lkm_flags |= DLM_LKF_VALBLK; + + spin_lock_irqsave(&lockres->l_lock, flags); + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING), + "lockres %s, flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + + while (lockres->l_flags & OCFS2_LOCK_BUSY) { + mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = " + "%u, unlock_action = %u\n", + lockres->l_name, lockres->l_flags, lockres->l_action, + lockres->l_unlock_action); + + spin_unlock_irqrestore(&lockres->l_lock, flags); + + /* XXX: Today we just wait on any busy + * locks... Perhaps we need to cancel converts in the + * future? */ + ocfs2_wait_on_busy_lock(lockres); + + spin_lock_irqsave(&lockres->l_lock, flags); + } + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level == DLM_LOCK_EX && + !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + lockres->l_ops->set_lvb(lockres); + } + + if (lockres->l_flags & OCFS2_LOCK_BUSY) + mlog(ML_ERROR, "destroying busy lock: \"%s\"\n", + lockres->l_name); + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto out; + } + + lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED); + + /* make sure we never get here while waiting for an ast to + * fire. */ + BUG_ON(lockres->l_action != OCFS2_AST_INVALID); + + /* is this necessary? */ + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "lock %s\n", lockres->l_name); + + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); + mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); + ocfs2_dlm_dump_lksb(&lockres->l_lksb); + BUG(); + } + mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n", + lockres->l_name); + + ocfs2_wait_on_busy_lock(lockres); +out: + return 0; +} + +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +/* Mark the lockres as being dropped. It will no longer be + * queued if blocking, but we still may have to wait on it + * being dequeued from the downconvert thread before we can consider + * it safe to drop. + * + * You can *not* attempt to call cluster_lock on this lockres anymore. */ +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int status; + struct ocfs2_mask_waiter mw; + unsigned long flags, flags2; + + ocfs2_init_mask_waiter(&mw); + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres->l_flags |= OCFS2_LOCK_FREEING; + if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) { + /* + * We know the downconvert is queued but not in progress + * because we are the downconvert thread and processing + * different lock. So we can just remove the lock from the + * queue. This is not only an optimization but also a way + * to avoid the following deadlock: + * ocfs2_dentry_post_unlock() + * ocfs2_dentry_lock_put() + * ocfs2_drop_dentry_lock() + * iput() + * ocfs2_evict_inode() + * ocfs2_clear_inode() + * ocfs2_mark_lockres_freeing() + * ... blocks waiting for OCFS2_LOCK_QUEUED + * since we are the downconvert thread which + * should clear the flag. + */ + spin_unlock_irqrestore(&lockres->l_lock, flags); + spin_lock_irqsave(&osb->dc_task_lock, flags2); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock_irqrestore(&osb->dc_task_lock, flags2); + /* + * Warn if we recurse into another post_unlock call. Strictly + * speaking it isn't a problem but we need to be careful if + * that happens (stack overflow, deadlocks, ...) so warn if + * ocfs2 grows a path for which this can happen. + */ + WARN_ON_ONCE(lockres->l_ops->post_unlock); + /* Since the lock is freeing we don't do much in the fn below */ + ocfs2_process_blocked_lock(osb, lockres); + return; + } + while (lockres->l_flags & OCFS2_LOCK_QUEUED) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "Waiting on lockres %s\n", lockres->l_name); + + status = ocfs2_wait_for_mask(&mw); + if (status) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ocfs2_mark_lockres_freeing(osb, lockres); + ret = ocfs2_drop_lock(osb, lockres); + if (ret) + mlog_errno(ret); +} + +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) +{ + ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres); +} + +int ocfs2_drop_inode_locks(struct inode *inode) +{ + int status, err; + + /* No need to call ocfs2_mark_lockres_freeing here - + * ocfs2_clear_inode has done it for us. */ + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_open_lockres); + if (err < 0) + mlog_errno(err); + + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_inode_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_rw_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + return status; +} + +static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + assert_spin_locked(&lockres->l_lock); + + BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); + + if (lockres->l_level <= new_level) { + mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, " + "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, " + "block %d, pgen %d\n", lockres->l_name, lockres->l_level, + new_level, list_empty(&lockres->l_blocked_list), + list_empty(&lockres->l_mask_waiters), lockres->l_type, + lockres->l_flags, lockres->l_ro_holders, + lockres->l_ex_holders, lockres->l_action, + lockres->l_unlock_action, lockres->l_requested, + lockres->l_blocking, lockres->l_pending_gen); + BUG(); + } + + mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n", + lockres->l_name, lockres->l_level, new_level, lockres->l_blocking); + + lockres->l_action = OCFS2_AST_DOWNCONVERT; + lockres->l_requested = new_level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + return lockres_set_pending(lockres); +} + +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb, + unsigned int generation) +{ + int ret; + u32 dlm_flags = DLM_LKF_CONVERT; + + mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, + lockres->l_level, new_level); + + if (lvb) + dlm_flags |= DLM_LKF_VALBLK; + + ret = ocfs2_dlm_lock(osb->cconn, + new_level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, generation, osb); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 1); + goto bail; + } + + ret = 0; +bail: + return ret; +} + +/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */ +static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { + /* If we're already trying to cancel a lock conversion + * then just drop the spinlock and allow the caller to + * requeue this lock. */ + mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name); + return 0; + } + + /* were we in a convert when we got the bast fire? */ + BUG_ON(lockres->l_action != OCFS2_AST_CONVERT && + lockres->l_action != OCFS2_AST_DOWNCONVERT); + /* set things up for the unlockast to know to just + * clear out the ast_action and unset busy, etc. */ + lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT; + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY), + "lock %s, invalid flags: 0x%lx\n", + lockres->l_name, lockres->l_flags); + + mlog(ML_BASTS, "lockres %s\n", lockres->l_name); + + return 1; +} + +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, + DLM_LKF_CANCEL); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 0); + } + + mlog(ML_BASTS, "lockres %s\n", lockres->l_name); + + return ret; +} + +static int ocfs2_unblock_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + struct ocfs2_unblock_ctl *ctl) +{ + unsigned long flags; + int blocking; + int new_level; + int level; + int ret = 0; + int set_lvb = 0; + unsigned int gen; + + spin_lock_irqsave(&lockres->l_lock, flags); + +recheck: + /* + * Is it still blocking? If not, we have no more work to do. + */ + if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) { + BUG_ON(lockres->l_blocking != DLM_LOCK_NL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = 0; + goto leave; + } + + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + /* XXX + * This is a *big* race. The OCFS2_LOCK_PENDING flag + * exists entirely for one reason - another thread has set + * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock(). + * + * If we do ocfs2_cancel_convert() before the other thread + * calls dlm_lock(), our cancel will do nothing. We will + * get no ast, and we will have no way of knowing the + * cancel failed. Meanwhile, the other thread will call + * into dlm_lock() and wait...forever. + * + * Why forever? Because another node has asked for the + * lock first; that's why we're here in unblock_lock(). + * + * The solution is OCFS2_LOCK_PENDING. When PENDING is + * set, we just requeue the unblock. Only when the other + * thread has called dlm_lock() and cleared PENDING will + * we then cancel their request. + * + * All callers of dlm_lock() must set OCFS2_DLM_PENDING + * at the same time they set OCFS2_DLM_BUSY. They must + * clear OCFS2_DLM_PENDING after dlm_lock() returns. + */ + if (lockres->l_flags & OCFS2_LOCK_PENDING) { + mlog(ML_BASTS, "lockres %s, ReQ: Pending\n", + lockres->l_name); + goto leave_requeue; + } + + ctl->requeue = 1; + ret = ocfs2_prepare_cancel_convert(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (ret) { + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) + mlog_errno(ret); + } + goto leave; + } + + /* + * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is + * set when the ast is received for an upconvert just before the + * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast + * on the heels of the ast, we want to delay the downconvert just + * enough to allow the up requestor to do its task. Because this + * lock is in the blocked queue, the lock will be downconverted + * as soon as the requestor is done with the lock. + */ + if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) + goto leave_requeue; + + /* + * How can we block and yet be at NL? We were trying to upconvert + * from NL and got canceled. The code comes back here, and now + * we notice and clear BLOCKING. + */ + if (lockres->l_level == DLM_LOCK_NL) { + BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders); + mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name); + lockres->l_blocking = DLM_LOCK_NL; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto leave; + } + + /* if we're blocking an exclusive and we have *any* holders, + * then requeue. */ + if ((lockres->l_blocking == DLM_LOCK_EX) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n", + lockres->l_name, lockres->l_ex_holders, + lockres->l_ro_holders); + goto leave_requeue; + } + + /* If it's a PR we're blocking, then only + * requeue if we've got any EX holders */ + if (lockres->l_blocking == DLM_LOCK_PR && + lockres->l_ex_holders) { + mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n", + lockres->l_name, lockres->l_ex_holders); + goto leave_requeue; + } + + /* + * Can we get a lock in this state if the holder counts are + * zero? The meta data unblock code used to check this. + */ + if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) { + mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n", + lockres->l_name); + goto leave_requeue; + } + + new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + + if (lockres->l_ops->check_downconvert + && !lockres->l_ops->check_downconvert(lockres, new_level)) { + mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n", + lockres->l_name); + goto leave_requeue; + } + + /* If we get here, then we know that there are no more + * incompatible holders (and anyone asking for an incompatible + * lock is blocked). We can now downconvert the lock */ + if (!lockres->l_ops->downconvert_worker) + goto downconvert; + + /* Some lockres types want to do a bit of work before + * downconverting a lock. Allow that here. The worker function + * may sleep, so we save off a copy of what we're blocking as + * it may change while we're not holding the spin lock. */ + blocking = lockres->l_blocking; + level = lockres->l_level; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); + + if (ctl->unblock_action == UNBLOCK_STOP_POST) { + mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n", + lockres->l_name); + goto leave; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) { + /* If this changed underneath us, then we can't drop + * it just yet. */ + mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, " + "Recheck\n", lockres->l_name, blocking, + lockres->l_blocking, level, lockres->l_level); + goto recheck; + } + +downconvert: + ctl->requeue = 0; + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { + if (lockres->l_level == DLM_LOCK_EX) + set_lvb = 1; + + /* + * We only set the lvb if the lock has been fully + * refreshed - otherwise we risk setting stale + * data. Otherwise, there's no need to actually clear + * out the lvb here as it's value is still valid. + */ + if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + lockres->l_ops->set_lvb(lockres); + } + + gen = ocfs2_prepare_downconvert(lockres, new_level); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, + gen); + +leave: + if (ret) + mlog_errno(ret); + return ret; + +leave_requeue: + spin_unlock_irqrestore(&lockres->l_lock, flags); + ctl->requeue = 1; + + return 0; +} + +static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct inode *inode; + struct address_space *mapping; + struct ocfs2_inode_info *oi; + + inode = ocfs2_lock_res_inode(lockres); + mapping = inode->i_mapping; + + if (S_ISDIR(inode->i_mode)) { + oi = OCFS2_I(inode); + oi->ip_dir_lock_gen++; + mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); + goto out; + } + + if (!S_ISREG(inode->i_mode)) + goto out; + + /* + * We need this before the filemap_fdatawrite() so that it can + * transfer the dirty bit from the PTE to the + * page. Unfortunately this means that even for EX->PR + * downconverts, we'll lose our mappings and have to build + * them up again. + */ + unmap_mapping_range(mapping, 0, 0, 0); + + if (filemap_fdatawrite(mapping)) { + mlog(ML_ERROR, "Could not sync inode %llu for downconvert!", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + sync_mapping_buffers(mapping); + if (blocking == DLM_LOCK_EX) { + truncate_inode_pages(mapping, 0); + } else { + /* We only need to wait on the I/O if we're not also + * truncating pages because truncate_inode_pages waits + * for us above. We don't truncate pages if we're + * blocking anything < EXMODE because we want to keep + * them around in that case. */ + filemap_fdatawait(mapping); + } + +out: + return UNBLOCK_CONTINUE; +} + +static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci, + struct ocfs2_lock_res *lockres, + int new_level) +{ + int checkpointed = ocfs2_ci_fully_checkpointed(ci); + + BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); + BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); + + if (checkpointed) + return 1; + + ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci))); + return 0; +} + +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level); +} + +static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + __ocfs2_stuff_meta_lvb(inode); +} + +/* + * Does the final reference drop on our dentry lock. Right now this + * happens in the downconvert thread, but we could choose to simplify the + * dlmglue API and push these off to the ocfs2_wq in the future. + */ +static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres); + ocfs2_dentry_lock_put(osb, dl); +} + +/* + * d_delete() matching dentries before the lock downconvert. + * + * At this point, any process waiting to destroy the + * dentry_lock due to last ref count is stopped by the + * OCFS2_LOCK_QUEUED flag. + * + * We have two potential problems + * + * 1) If we do the last reference drop on our dentry_lock (via dput) + * we'll wind up in ocfs2_release_dentry_lock(), waiting on + * the downconvert to finish. Instead we take an elevated + * reference and push the drop until after we've completed our + * unblock processing. + * + * 2) There might be another process with a final reference, + * waiting on us to finish processing. If this is the case, we + * detect it and exit out - there's no more dentries anyway. + */ +static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres); + struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode); + struct dentry *dentry; + unsigned long flags; + int extra_ref = 0; + + /* + * This node is blocking another node from getting a read + * lock. This happens when we've renamed within a + * directory. We've forced the other nodes to d_delete(), but + * we never actually dropped our lock because it's still + * valid. The downconvert code will retain a PR for this node, + * so there's no further work to do. + */ + if (blocking == DLM_LOCK_PR) + return UNBLOCK_CONTINUE; + + /* + * Mark this inode as potentially orphaned. The code in + * ocfs2_delete_inode() will figure out whether it actually + * needs to be freed or not. + */ + spin_lock(&oi->ip_lock); + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + + /* + * Yuck. We need to make sure however that the check of + * OCFS2_LOCK_FREEING and the extra reference are atomic with + * respect to a reference decrement or the setting of that + * flag. + */ + spin_lock_irqsave(&lockres->l_lock, flags); + spin_lock(&dentry_attach_lock); + if (!(lockres->l_flags & OCFS2_LOCK_FREEING) + && dl->dl_count) { + dl->dl_count++; + extra_ref = 1; + } + spin_unlock(&dentry_attach_lock); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "extra_ref = %d\n", extra_ref); + + /* + * We have a process waiting on us in ocfs2_dentry_iput(), + * which means we can't have any more outstanding + * aliases. There's no need to do any more work. + */ + if (!extra_ref) + return UNBLOCK_CONTINUE; + + spin_lock(&dentry_attach_lock); + while (1) { + dentry = ocfs2_find_local_alias(dl->dl_inode, + dl->dl_parent_blkno, 1); + if (!dentry) + break; + spin_unlock(&dentry_attach_lock); + + if (S_ISDIR(dl->dl_inode->i_mode)) + shrink_dcache_parent(dentry); + + mlog(0, "d_delete(%pd);\n", dentry); + + /* + * The following dcache calls may do an + * iput(). Normally we don't want that from the + * downconverting thread, but in this case it's ok + * because the requesting node already has an + * exclusive lock on the inode, so it can't be queued + * for a downconvert. + */ + d_delete(dentry); + dput(dentry); + + spin_lock(&dentry_attach_lock); + } + spin_unlock(&dentry_attach_lock); + + /* + * If we are the last holder of this dentry lock, there is no + * reason to downconvert so skip straight to the unlock. + */ + if (dl->dl_count == 1) + return UNBLOCK_STOP_POST; + + return UNBLOCK_CONTINUE_POST; +} + +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level); +} + +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + ocfs2_metadata_cache_purge(&tree->rf_ci); + + return UNBLOCK_CONTINUE; +} + +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_qinfo_lvb *lvb; + struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres); + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; + lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); + lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace); + lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms); + lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); + lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); + lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); +} + +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) +{ + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + struct buffer_head *bh = NULL; + struct ocfs2_global_disk_dqinfo *gdinfo; + int status = 0; + + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { + info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace); + info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace); + oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms); + oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks); + oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk); + oinfo->dqi_gi.dqi_free_entry = + be32_to_cpu(lvb->lvb_free_entry); + } else { + status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode, + oinfo->dqi_giblk, &bh); + if (status) { + mlog_errno(status); + goto bail; + } + gdinfo = (struct ocfs2_global_disk_dqinfo *) + (bh->b_data + OCFS2_GLOBAL_INFO_OFF); + info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace); + info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = + le32_to_cpu(gdinfo->dqi_free_entry); + brelse(bh); + ocfs2_track_lock_refresh(lockres); + } + +bail: + return status; +} + +/* Lock quota info, this function expects at least shared lock on the quota file + * so that we can safely refresh quota info from disk. */ +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + int status = 0; + + /* On RO devices, locking really isn't needed... */ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + if (ocfs2_mount_local(osb)) + goto bail; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + /* OK, we have the lock but we need to refresh the quota info */ + status = ocfs2_refresh_qinfo(oinfo); + if (status) + ocfs2_qinfo_unlock(oinfo, ex); + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + return status; +} + +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int status; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int status; + struct ocfs2_unblock_ctl ctl = {0, 0,}; + unsigned long flags; + + /* Our reference to the lockres in this function can be + * considered valid until we remove the OCFS2_LOCK_QUEUED + * flag. */ + + BUG_ON(!lockres); + BUG_ON(!lockres->l_ops); + + mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name); + + /* Detect whether a lock has been marked as going away while + * the downconvert thread was processing other things. A lock can + * still be marked with OCFS2_LOCK_FREEING after this check, + * but short circuiting here will still save us some + * performance. */ + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_FREEING) + goto unqueue; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = ocfs2_unblock_lock(osb, lockres, &ctl); + if (status < 0) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); +unqueue: + if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) { + lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); + } else + ocfs2_schedule_blocked_lock(osb, lockres); + + mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name, + ctl.requeue ? "yes" : "no"); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + if (ctl.unblock_action != UNBLOCK_CONTINUE + && lockres->l_ops->post_unlock) + lockres->l_ops->post_unlock(osb, lockres); +} + +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + unsigned long flags; + + assert_spin_locked(&lockres->l_lock); + + if (lockres->l_flags & OCFS2_LOCK_FREEING) { + /* Do not schedule a lock for downconvert when it's on + * the way to destruction - any nodes wanting access + * to the resource will get it soon. */ + mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + return; + } + + lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); + + spin_lock_irqsave(&osb->dc_task_lock, flags); + if (list_empty(&lockres->l_blocked_list)) { + list_add_tail(&lockres->l_blocked_list, + &osb->blocked_lock_list); + osb->blocked_lock_count++; + } + spin_unlock_irqrestore(&osb->dc_task_lock, flags); +} + +static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) +{ + unsigned long processed; + unsigned long flags; + struct ocfs2_lock_res *lockres; + + spin_lock_irqsave(&osb->dc_task_lock, flags); + /* grab this early so we know to try again if a state change and + * wake happens part-way through our work */ + osb->dc_work_sequence = osb->dc_wake_sequence; + + processed = osb->blocked_lock_count; + while (processed) { + BUG_ON(list_empty(&osb->blocked_lock_list)); + + lockres = list_entry(osb->blocked_lock_list.next, + struct ocfs2_lock_res, l_blocked_list); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock_irqrestore(&osb->dc_task_lock, flags); + + BUG_ON(!processed); + processed--; + + ocfs2_process_blocked_lock(osb, lockres); + + spin_lock_irqsave(&osb->dc_task_lock, flags); + } + spin_unlock_irqrestore(&osb->dc_task_lock, flags); +} + +static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb) +{ + int empty = 0; + unsigned long flags; + + spin_lock_irqsave(&osb->dc_task_lock, flags); + if (list_empty(&osb->blocked_lock_list)) + empty = 1; + + spin_unlock_irqrestore(&osb->dc_task_lock, flags); + return empty; +} + +static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb) +{ + int should_wake = 0; + unsigned long flags; + + spin_lock_irqsave(&osb->dc_task_lock, flags); + if (osb->dc_work_sequence != osb->dc_wake_sequence) + should_wake = 1; + spin_unlock_irqrestore(&osb->dc_task_lock, flags); + + return should_wake; +} + +static int ocfs2_downconvert_thread(void *arg) +{ + int status = 0; + struct ocfs2_super *osb = arg; + + /* only quit once we've been asked to stop and there is no more + * work available */ + while (!(kthread_should_stop() && + ocfs2_downconvert_thread_lists_empty(osb))) { + + wait_event_interruptible(osb->dc_event, + ocfs2_downconvert_thread_should_wake(osb) || + kthread_should_stop()); + + mlog(0, "downconvert_thread: awoken\n"); + + ocfs2_downconvert_thread_do_work(osb); + } + + osb->dc_task = NULL; + return status; +} + +void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb) +{ + unsigned long flags; + + spin_lock_irqsave(&osb->dc_task_lock, flags); + /* make sure the voting thread gets a swipe at whatever changes + * the caller may have made to the voting state */ + osb->dc_wake_sequence++; + spin_unlock_irqrestore(&osb->dc_task_lock, flags); + wake_up(&osb->dc_event); +} diff --git a/kernel/fs/ocfs2/dlmglue.h b/kernel/fs/ocfs2/dlmglue.h new file mode 100644 index 000000000..d293a22c3 --- /dev/null +++ b/kernel/fs/ocfs2/dlmglue.h @@ -0,0 +1,173 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef DLMGLUE_H +#define DLMGLUE_H + +#include "dcache.h" + +#define OCFS2_LVB_VERSION 5 + +struct ocfs2_meta_lvb { + __u8 lvb_version; + __u8 lvb_reserved0; + __be16 lvb_idynfeatures; + __be32 lvb_iclusters; + __be32 lvb_iuid; + __be32 lvb_igid; + __be64 lvb_iatime_packed; + __be64 lvb_ictime_packed; + __be64 lvb_imtime_packed; + __be64 lvb_isize; + __be16 lvb_imode; + __be16 lvb_inlink; + __be32 lvb_iattr; + __be32 lvb_igeneration; + __be32 lvb_reserved2; +}; + +#define OCFS2_QINFO_LVB_VERSION 1 + +struct ocfs2_qinfo_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_bgrace; + __be32 lvb_igrace; + __be32 lvb_syncms; + __be32 lvb_blocks; + __be32 lvb_free_blk; + __be32 lvb_free_entry; +}; + +#define OCFS2_ORPHAN_LVB_VERSION 1 + +struct ocfs2_orphan_scan_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_os_seqno; +}; + +/* ocfs2_inode_lock_full() 'arg_flags' flags */ +/* don't wait on recovery. */ +#define OCFS2_META_LOCK_RECOVERY (0x01) +/* Instruct the dlm not to queue ourselves on the other node. */ +#define OCFS2_META_LOCK_NOQUEUE (0x02) +/* don't block waiting for the downconvert thread, instead return -EAGAIN */ +#define OCFS2_LOCK_NONBLOCK (0x04) + +/* Locking subclasses of inode cluster lock */ +enum { + OI_LS_NORMAL = 0, + OI_LS_PARENT, + OI_LS_RENAME1, + OI_LS_RENAME2, + OI_LS_REFLINK_TARGET, +}; + +int ocfs2_dlm_init(struct ocfs2_super *osb); +void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending); +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + unsigned int generation, + struct inode *inode); +void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, + u64 parent, struct inode *inode); +struct ocfs2_file_private; +void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_file_private *fp); +struct ocfs2_mem_dqinfo; +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info); +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation); +void ocfs2_lock_res_free(struct ocfs2_lock_res *res); +int ocfs2_create_new_inode_locks(struct inode *inode); +int ocfs2_drop_inode_locks(struct inode *inode); +int ocfs2_rw_lock(struct inode *inode, int write); +void ocfs2_rw_unlock(struct inode *inode, int write); +int ocfs2_open_lock(struct inode *inode); +int ocfs2_try_open_lock(struct inode *inode, int write); +void ocfs2_open_unlock(struct inode *inode); +int ocfs2_inode_lock_atime(struct inode *inode, + struct vfsmount *vfsmnt, + int *level); +int ocfs2_inode_lock_full_nested(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + int arg_flags, + int subclass); +int ocfs2_inode_lock_with_page(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct page *page); +/* Variants without special locking class or flags */ +#define ocfs2_inode_lock_full(i, r, e, f)\ + ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL) +#define ocfs2_inode_lock_nested(i, b, e, s)\ + ocfs2_inode_lock_full_nested(i, b, e, 0, s) +/* 99% of the time we don't want to supply any additional flags -- + * those are for very specific cases only. */ +#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL) +void ocfs2_inode_unlock(struct inode *inode, + int ex); +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex); +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex); +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno); +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno); + +int ocfs2_rename_lock(struct ocfs2_super *osb); +void ocfs2_rename_unlock(struct ocfs2_super *osb); +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); +int ocfs2_dentry_lock(struct dentry *dentry, int ex); +void ocfs2_dentry_unlock(struct dentry *dentry, int ex); +int ocfs2_file_lock(struct file *file, int ex, int trylock); +void ocfs2_file_unlock(struct file *file); +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); +struct ocfs2_refcount_tree; +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); + + +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +/* for the downconvert thread */ +void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb); + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); + +/* To set the locking protocol on module initialization */ +void ocfs2_set_locking_protocol(void); +#endif /* DLMGLUE_H */ diff --git a/kernel/fs/ocfs2/export.c b/kernel/fs/ocfs2/export.c new file mode 100644 index 000000000..827fc9809 --- /dev/null +++ b/kernel/fs/ocfs2/export.c @@ -0,0 +1,271 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.c + * + * Functions to facilitate NFS exporting + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "dlmglue.h" +#include "dcache.h" +#include "export.h" +#include "inode.h" + +#include "buffer_head_io.h" +#include "suballoc.h" +#include "ocfs2_trace.h" + +struct ocfs2_inode_handle +{ + u64 ih_blkno; + u32 ih_generation; +}; + +static struct dentry *ocfs2_get_dentry(struct super_block *sb, + struct ocfs2_inode_handle *handle) +{ + struct inode *inode; + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 blkno = handle->ih_blkno; + int status, set; + struct dentry *result; + + trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno); + + if (blkno == 0) { + result = ERR_PTR(-ESTALE); + goto bail; + } + + inode = ocfs2_ilookup(sb, blkno); + /* + * If the inode exists in memory, we only need to check it's + * generation number + */ + if (inode) + goto check_gen; + + /* + * This will synchronize us against ocfs2_delete_inode() on + * all nodes + */ + status = ocfs2_nfs_sync_lock(osb, 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + goto check_err; + } + + status = ocfs2_test_inode_bit(osb, blkno, &set); + if (status < 0) { + if (status == -EINVAL) { + /* + * The blkno NFS gave us doesn't even show up + * as an inode, we return -ESTALE to be + * nice + */ + status = -ESTALE; + } else + mlog(ML_ERROR, "test inode bit failed %d\n", status); + goto unlock_nfs_sync; + } + + trace_ocfs2_get_dentry_test_bit(status, set); + /* If the inode allocator bit is clear, this inode must be stale */ + if (!set) { + status = -ESTALE; + goto unlock_nfs_sync; + } + + inode = ocfs2_iget(osb, blkno, 0, 0); + +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(osb, 1); + +check_err: + if (status < 0) { + if (status == -ESTALE) { + trace_ocfs2_get_dentry_stale((unsigned long long)blkno, + handle->ih_generation); + } + result = ERR_PTR(status); + goto bail; + } + + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + result = (void *)inode; + goto bail; + } + +check_gen: + if (handle->ih_generation != inode->i_generation) { + iput(inode); + trace_ocfs2_get_dentry_generation((unsigned long long)blkno, + handle->ih_generation, + inode->i_generation); + result = ERR_PTR(-ESTALE); + goto bail; + } + + result = d_obtain_alias(inode); + if (IS_ERR(result)) + mlog_errno(PTR_ERR(result)); + +bail: + trace_ocfs2_get_dentry_end(result); + return result; +} + +static struct dentry *ocfs2_get_parent(struct dentry *child) +{ + int status; + u64 blkno; + struct dentry *parent; + struct inode *dir = d_inode(child); + + trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno); + + status = ocfs2_inode_lock(dir, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + parent = ERR_PTR(status); + goto bail; + } + + status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); + if (status < 0) { + parent = ERR_PTR(-ENOENT); + goto bail_unlock; + } + + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); + +bail_unlock: + ocfs2_inode_unlock(dir, 0); + +bail: + trace_ocfs2_get_parent_end(parent); + + return parent; +} + +static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len, + struct inode *parent) +{ + int len = *max_len; + int type = 1; + u64 blkno; + u32 generation; + __le32 *fh = (__force __le32 *) fh_in; + +#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION +#error "You go ahead and fix that mess, then. Somehow" + trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len, + dentry->d_name.name, + fh, len, connectable); +#endif + + if (parent && (len < 6)) { + *max_len = 6; + type = FILEID_INVALID; + goto bail; + } else if (len < 3) { + *max_len = 3; + type = FILEID_INVALID; + goto bail; + } + + blkno = OCFS2_I(inode)->ip_blkno; + generation = inode->i_generation; + + trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation); + + len = 3; + fh[0] = cpu_to_le32((u32)(blkno >> 32)); + fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff)); + fh[2] = cpu_to_le32(generation); + + if (parent) { + blkno = OCFS2_I(parent)->ip_blkno; + generation = parent->i_generation; + + fh[3] = cpu_to_le32((u32)(blkno >> 32)); + fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff)); + fh[5] = cpu_to_le32(generation); + + len = 6; + type = 2; + + trace_ocfs2_encode_fh_parent((unsigned long long)blkno, + generation); + } + + *max_len = len; + +bail: + trace_ocfs2_encode_fh_type(type); + return type; +} + +static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct ocfs2_inode_handle handle; + + if (fh_len < 3 || fh_type > 2) + return NULL; + + handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]); + handle.ih_generation = le32_to_cpu(fid->raw[2]); + return ocfs2_get_dentry(sb, &handle); +} + +static struct dentry *ocfs2_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct ocfs2_inode_handle parent; + + if (fh_type != 2 || fh_len < 6) + return NULL; + + parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]); + parent.ih_generation = le32_to_cpu(fid->raw[5]); + return ocfs2_get_dentry(sb, &parent); +} + +const struct export_operations ocfs2_export_ops = { + .encode_fh = ocfs2_encode_fh, + .fh_to_dentry = ocfs2_fh_to_dentry, + .fh_to_parent = ocfs2_fh_to_parent, + .get_parent = ocfs2_get_parent, +}; diff --git a/kernel/fs/ocfs2/export.h b/kernel/fs/ocfs2/export.h new file mode 100644 index 000000000..41a738678 --- /dev/null +++ b/kernel/fs/ocfs2/export.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.h + * + * Function prototypes + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_EXPORT_H +#define OCFS2_EXPORT_H + +#include + +extern const struct export_operations ocfs2_export_ops; + +#endif /* OCFS2_EXPORT_H */ diff --git a/kernel/fs/ocfs2/extent_map.c b/kernel/fs/ocfs2/extent_map.c new file mode 100644 index 000000000..767370b65 --- /dev/null +++ b/kernel/fs/ocfs2/extent_map.c @@ -0,0 +1,994 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.c + * + * Block/Cluster mapping functions + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "super.h" +#include "symlink.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +/* + * The extent caching implementation is intentionally trivial. + * + * We only cache a small number of extents stored directly on the + * inode, so linear order operations are acceptable. If we ever want + * to increase the size of the extent map, then these algorithms must + * get smarter. + */ + +void ocfs2_extent_map_init(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + oi->ip_extent_map.em_num_items = 0; + INIT_LIST_HEAD(&oi->ip_extent_map.em_list); +} + +static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + unsigned int cpos, + struct ocfs2_extent_map_item **ret_emi) +{ + unsigned int range; + struct ocfs2_extent_map_item *emi; + + *ret_emi = NULL; + + list_for_each_entry(emi, &em->em_list, ei_list) { + range = emi->ei_cpos + emi->ei_clusters; + + if (cpos >= emi->ei_cpos && cpos < range) { + list_move(&emi->ei_list, &em->em_list); + + *ret_emi = emi; + break; + } + } +} + +static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, + unsigned int *phys, unsigned int *len, + unsigned int *flags) +{ + unsigned int coff; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map_item *emi; + + spin_lock(&oi->ip_lock); + + __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); + if (emi) { + coff = cpos - emi->ei_cpos; + *phys = emi->ei_phys + coff; + if (len) + *len = emi->ei_clusters - coff; + if (flags) + *flags = emi->ei_flags; + } + + spin_unlock(&oi->ip_lock); + + if (emi == NULL) + return -ENOENT; + + return 0; +} + +/* + * Forget about all clusters equal to or greater than cpos. + */ +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) +{ + struct ocfs2_extent_map_item *emi, *n; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + LIST_HEAD(tmp_list); + unsigned int range; + + spin_lock(&oi->ip_lock); + list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { + if (emi->ei_cpos >= cpos) { + /* Full truncate of this record. */ + list_move(&emi->ei_list, &tmp_list); + BUG_ON(em->em_num_items == 0); + em->em_num_items--; + continue; + } + + range = emi->ei_cpos + emi->ei_clusters; + if (range > cpos) { + /* Partial truncate */ + emi->ei_clusters = cpos - emi->ei_cpos; + } + } + spin_unlock(&oi->ip_lock); + + list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { + list_del(&emi->ei_list); + kfree(emi); + } +} + +/* + * Is any part of emi2 contained within emi1 + */ +static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, + struct ocfs2_extent_map_item *emi2) +{ + unsigned int range1, range2; + + /* + * Check if logical start of emi2 is inside emi1 + */ + range1 = emi1->ei_cpos + emi1->ei_clusters; + if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) + return 1; + + /* + * Check if logical end of emi2 is inside emi1 + */ + range2 = emi2->ei_cpos + emi2->ei_clusters; + if (range2 > emi1->ei_cpos && range2 <= range1) + return 1; + + return 0; +} + +static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, + struct ocfs2_extent_map_item *src) +{ + dest->ei_cpos = src->ei_cpos; + dest->ei_phys = src->ei_phys; + dest->ei_clusters = src->ei_clusters; + dest->ei_flags = src->ei_flags; +} + +/* + * Try to merge emi with ins. Returns 1 if merge succeeds, zero + * otherwise. + */ +static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, + struct ocfs2_extent_map_item *ins) +{ + /* + * Handle contiguousness + */ + if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && + ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && + ins->ei_flags == emi->ei_flags) { + emi->ei_clusters += ins->ei_clusters; + return 1; + } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && + (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos && + ins->ei_flags == emi->ei_flags) { + emi->ei_phys = ins->ei_phys; + emi->ei_cpos = ins->ei_cpos; + emi->ei_clusters += ins->ei_clusters; + return 1; + } + + /* + * Overlapping extents - this shouldn't happen unless we've + * split an extent to change it's flags. That is exceedingly + * rare, so there's no sense in trying to optimize it yet. + */ + if (ocfs2_ei_is_contained(emi, ins) || + ocfs2_ei_is_contained(ins, emi)) { + ocfs2_copy_emi_fields(emi, ins); + return 1; + } + + /* No merge was possible. */ + return 0; +} + +/* + * In order to reduce complexity on the caller, this insert function + * is intentionally liberal in what it will accept. + * + * The only rule is that the truncate call *must* be used whenever + * records have been deleted. This avoids inserting overlapping + * records with different physical mappings. + */ +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + struct ocfs2_extent_map_item *emi, *new_emi = NULL; + struct ocfs2_extent_map_item ins; + + ins.ei_cpos = le32_to_cpu(rec->e_cpos); + ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); + ins.ei_flags = rec->e_flags; + +search: + spin_lock(&oi->ip_lock); + + list_for_each_entry(emi, &em->em_list, ei_list) { + if (ocfs2_try_to_merge_extent_map(emi, &ins)) { + list_move(&emi->ei_list, &em->em_list); + spin_unlock(&oi->ip_lock); + goto out; + } + } + + /* + * No item could be merged. + * + * Either allocate and add a new item, or overwrite the last recently + * inserted. + */ + + if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { + if (new_emi == NULL) { + spin_unlock(&oi->ip_lock); + + new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); + if (new_emi == NULL) + goto out; + + goto search; + } + + ocfs2_copy_emi_fields(new_emi, &ins); + list_add(&new_emi->ei_list, &em->em_list); + em->em_num_items++; + new_emi = NULL; + } else { + BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); + emi = list_entry(em->em_list.prev, + struct ocfs2_extent_map_item, ei_list); + list_move(&emi->ei_list, &em->em_list); + ocfs2_copy_emi_fields(emi, &ins); + } + + spin_unlock(&oi->ip_lock); + +out: + kfree(new_emi); +} + +static int ocfs2_last_eb_is_empty(struct inode *inode, + struct ocfs2_dinode *di) +{ + int ret, next_free; + u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk); + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + + next_free = le16_to_cpu(el->l_next_free_rec); + + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) + ret = 1; + +out: + brelse(eb_bh); + return ret; +} + +/* + * Return the 1st index within el which contains an extent start + * larger than v_cluster. + */ +static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, + u32 v_cluster) +{ + int i; + struct ocfs2_extent_rec *rec; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (v_cluster < le32_to_cpu(rec->e_cpos)) + break; + } + + return i; +} + +/* + * Figure out the size of a hole which starts at v_cluster within the given + * extent list. + * + * If there is no more allocation past v_cluster, we return the maximum + * cluster size minus v_cluster. + * + * If we have in-inode extents, then el points to the dinode list and + * eb_bh is NULL. Otherwise, eb_bh should point to the extent block + * containing el. + */ +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) +{ + int ret, i; + struct buffer_head *next_eb_bh = NULL; + struct ocfs2_extent_block *eb, *next_eb; + + i = ocfs2_search_for_hole_index(el, v_cluster); + + if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Check the next leaf for any extents. + */ + + if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) + goto no_more_extents; + + ret = ocfs2_read_extent_block(ci, + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; + el = &next_eb->h_list; + i = ocfs2_search_for_hole_index(el, v_cluster); + } + +no_more_extents: + if (i == le16_to_cpu(el->l_next_free_rec)) { + /* + * We're at the end of our existing allocation. Just + * return the maximum number of clusters we could + * possibly allocate. + */ + *num_clusters = UINT_MAX - v_cluster; + } else { + *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; + } + + ret = 0; +out: + brelse(next_eb_bh); + return ret; +} + +static int ocfs2_get_clusters_nocache(struct inode *inode, + struct buffer_head *di_bh, + u32 v_cluster, unsigned int *hole_len, + struct ocfs2_extent_rec *ret_rec, + unsigned int *is_last) +{ + int i, ret, tree_height, len; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *uninitialized_var(eb); + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct buffer_head *eb_bh = NULL; + + memset(ret_rec, 0, sizeof(*ret_rec)); + if (is_last) + *is_last = 0; + + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; + tree_height = le16_to_cpu(el->l_tree_depth); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + /* + * Holes can be larger than the maximum size of an + * extent, so we return their lengths in a separate + * field. + */ + if (hole_len) { + ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), + el, eb_bh, + v_cluster, &len); + if (ret) { + mlog_errno(ret); + goto out; + } + + *hole_len = len; + } + goto out_hole; + } + + rec = &el->l_recs[i]; + + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *ret_rec = *rec; + + /* + * Checking for last extent is potentially expensive - we + * might have to look at the next leaf over to see if it's + * empty. + * + * The first two checks are to see whether the caller even + * cares for this information, and if the extent is at least + * the last in it's list. + * + * If those hold true, then the extent is last if any of the + * additional conditions hold true: + * - Extent list is in-inode + * - Extent list is right-most + * - Extent list is 2nd to rightmost, with empty right-most + */ + if (is_last) { + if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) { + if (tree_height == 0) + *is_last = 1; + else if (eb->h_blkno == di->i_last_eb_blk) + *is_last = 1; + else if (eb->h_next_leaf_blk == di->i_last_eb_blk) { + ret = ocfs2_last_eb_is_empty(inode, di); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + if (ret == 1) + *is_last = 1; + } + } + } + +out_hole: + ret = 0; +out: + brelse(eb_bh); + return ret; +} + +static void ocfs2_relative_extent_offsets(struct super_block *sb, + u32 v_cluster, + struct ocfs2_extent_rec *rec, + u32 *p_cluster, u32 *num_clusters) + +{ + u32 coff = v_cluster - le32_to_cpu(rec->e_cpos); + + *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + + if (num_clusters) + *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; +} + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el, + unsigned int *extent_flags) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + u32 coff; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } else { + rec = &el->l_recs[i]; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + coff = v_cluster - le32_to_cpu(rec->e_cpos); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + + if (extent_flags) + *extent_flags = rec->e_flags; + } +out: + if (eb_bh) + brelse(eb_bh); + return ret; +} + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) +{ + int ret; + unsigned int uninitialized_var(hole_len), flags = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = -ERANGE; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len, + &rec, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. + */ + *p_cluster = 0; + if (num_clusters) { + *num_clusters = hole_len; + } + } else { + ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec, + p_cluster, num_clusters); + flags = rec.e_flags; + + ocfs2_extent_map_insert_rec(inode, &rec); + } + + if (extent_flags) + *extent_flags = flags; + +out: + brelse(di_bh); + return ret; +} + +/* + * This expects alloc_sem to be held. The allocation cannot change at + * all while the map is in the process of being updated. + */ +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags) +{ + int ret; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 cpos, num_clusters, p_cluster; + u64 boff = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); + + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, + extent_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * p_cluster == 0 indicates a hole. + */ + if (p_cluster) { + boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + boff += (v_blkno & (u64)(bpc - 1)); + } + + *p_blkno = boff; + + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + *ret_count -= v_blkno & (u64)(bpc - 1); + } + +out: + return ret; +} + +/* + * The ocfs2_fiemap_inline() may be a little bit misleading, since + * it not only handles the fiemap for inlined files, but also deals + * with the fast symlink, cause they have no difference for extent + * mapping per se. + */ +static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, + struct fiemap_extent_info *fieinfo, + u64 map_start) +{ + int ret; + unsigned int id_count; + struct ocfs2_dinode *di; + u64 phys; + u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + di = (struct ocfs2_dinode *)di_bh->b_data; + if (ocfs2_inode_is_fast_symlink(inode)) + id_count = ocfs2_fast_symlink_chars(inode->i_sb); + else + id_count = le16_to_cpu(di->id2.i_data.id_count); + + if (map_start < id_count) { + phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; + if (ocfs2_inode_is_fast_symlink(inode)) + phys += offsetof(struct ocfs2_dinode, id2.i_symlink); + else + phys += offsetof(struct ocfs2_dinode, + id2.i_data.id_data); + + ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, + flags); + if (ret < 0) + return ret; + } + + return 0; +} + +#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len) +{ + int ret, is_last; + u32 mapping_end, cpos; + unsigned int hole_size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u64 len_bytes, phys_bytes, virt_bytes; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + if (ret) + return ret; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * Handle inline-data and fast symlink separately. + */ + if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + ocfs2_inode_is_fast_symlink(inode)) { + ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); + goto out_unlock; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + is_last = 0; + while (cpos < mapping_end && !is_last) { + u32 fe_flags; + + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + &hole_size, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + if (rec.e_blkno == 0ULL) { + cpos += hole_size; + continue; + } + + fe_flags = 0; + if (rec.e_flags & OCFS2_EXT_UNWRITTEN) + fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + fe_flags |= FIEMAP_EXTENT_SHARED; + if (is_last) + fe_flags |= FIEMAP_EXTENT_LAST; + len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; + phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; + virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; + + ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, + len_bytes, fe_flags); + if (ret) + break; + + cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters); + } + + if (ret > 0) + ret = 0; + +out_unlock: + brelse(di_bh); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 0); +out: + + return ret; +} + +int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int ret; + unsigned int is_last = 0, is_data = 0; + u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 cpos, cend, clen, hole_size; + u64 extoff, extlen; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE); + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + if (*offset >= i_size_read(inode)) { + ret = -ENXIO; + goto out_unlock; + } + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (whence == SEEK_HOLE) + *offset = i_size_read(inode); + goto out_unlock; + } + + clen = 0; + cpos = *offset >> cs_bits; + cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + + while (cpos < cend && !is_last) { + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, + &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + extoff = cpos; + extoff <<= cs_bits; + + if (rec.e_blkno == 0ULL) { + clen = hole_size; + is_data = 0; + } else { + clen = le16_to_cpu(rec.e_leaf_clusters) - + (cpos - le32_to_cpu(rec.e_cpos)); + is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; + } + + if ((!is_data && whence == SEEK_HOLE) || + (is_data && whence == SEEK_DATA)) { + if (extoff > *offset) + *offset = extoff; + goto out_unlock; + } + + if (!is_last) + cpos += clen; + } + + if (whence == SEEK_HOLE) { + extoff = cpos; + extoff <<= cs_bits; + extlen = clen; + extlen <<= cs_bits; + + if ((extoff + extlen) > i_size_read(inode)) + extlen = i_size_read(inode) - extoff; + extoff += extlen; + if (extoff > *offset) + *offset = extoff; + goto out_unlock; + } + + ret = -ENXIO; + +out_unlock: + + brelse(di_bh); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 0); +out: + return ret; +} + +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int rc = 0; + u64 p_block, p_count; + int i, count, done = 0; + + trace_ocfs2_read_virt_blocks( + inode, (unsigned long long)v_block, nr, bhs, flags, + validate); + + if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!(flags & OCFS2_BH_READAHEAD)); + goto out; + } + + while (done < nr) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); + rc = ocfs2_extent_map_get_blocks(inode, v_block + done, + &p_block, &p_count, NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (rc) { + mlog_errno(rc); + break; + } + + if (!p_block) { + rc = -EIO; + mlog(ML_ERROR, + "Inode #%llu contains a hole at offset %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)(v_block + done) << + inode->i_sb->s_blocksize_bits); + break; + } + + count = nr - done; + if (p_count < count) + count = p_count; + + /* + * If the caller passed us bhs, they should have come + * from a previous readahead call to this function. Thus, + * they should have the right b_blocknr. + */ + for (i = 0; i < count; i++) { + if (!bhs[done + i]) + continue; + BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); + } + + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count, + bhs + done, flags, validate); + if (rc) { + mlog_errno(rc); + break; + } + done += count; + } + +out: + return rc; +} + + diff --git a/kernel/fs/ocfs2/extent_map.h b/kernel/fs/ocfs2/extent_map.h new file mode 100644 index 000000000..67ea57d2f --- /dev/null +++ b/kernel/fs/ocfs2/extent_map.h @@ -0,0 +1,92 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.h + * + * In-memory file extent mappings for OCFS2. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _EXTENT_MAP_H +#define _EXTENT_MAP_H + +struct ocfs2_extent_map_item { + unsigned int ei_cpos; + unsigned int ei_phys; + unsigned int ei_clusters; + unsigned int ei_flags; + + struct list_head ei_list; +}; + +#define OCFS2_MAX_EXTENT_MAP_ITEMS 3 +struct ocfs2_extent_map { + unsigned int em_num_items; + struct list_head em_list; +}; + +void ocfs2_extent_map_init(struct inode *inode); +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster); +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec); + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, unsigned int *extent_flags); +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags); + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len); + +int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el, + unsigned int *extent_flags); + +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters); +static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate); + +bail: + return status; +} + + +#endif /* _EXTENT_MAP_H */ diff --git a/kernel/fs/ocfs2/file.c b/kernel/fs/ocfs2/file.c new file mode 100644 index 000000000..d8b670cbd --- /dev/null +++ b/kernel/fs/ocfs2/file.c @@ -0,0 +1,2702 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c + * + * File open, close, extend, truncate + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "sysfile.h" +#include "inode.h" +#include "ioctl.h" +#include "journal.h" +#include "locks.h" +#include "mmap.h" +#include "suballoc.h" +#include "super.h" +#include "xattr.h" +#include "acl.h" +#include "quota.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static int ocfs2_init_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp; + + fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->fp_file = file; + mutex_init(&fp->fp_mutex); + ocfs2_file_lock_res_init(&fp->fp_flock, fp); + file->private_data = fp; + + return 0; +} + +static void ocfs2_free_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (fp) { + ocfs2_simple_drop_lockres(osb, &fp->fp_flock); + ocfs2_lock_res_free(&fp->fp_flock); + kfree(fp); + file->private_data = NULL; + } +} + +static int ocfs2_file_open(struct inode *inode, struct file *file) +{ + int status; + int mode = file->f_flags; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + trace_ocfs2_file_open(inode, file, file->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, mode); + + if (file->f_mode & FMODE_WRITE) + dquot_initialize(inode); + + spin_lock(&oi->ip_lock); + + /* Check that the inode hasn't been wiped from disk by another + * node. If it hasn't then we're safe as long as we hold the + * spin lock until our increment of open count. */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&oi->ip_lock); + + status = -ENOENT; + goto leave; + } + + if (mode & O_DIRECT) + oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; + + oi->ip_open_count++; + spin_unlock(&oi->ip_lock); + + status = ocfs2_init_file_private(inode, file); + if (status) { + /* + * We want to set open count back if we're failing the + * open. + */ + spin_lock(&oi->ip_lock); + oi->ip_open_count--; + spin_unlock(&oi->ip_lock); + } + +leave: + return status; +} + +static int ocfs2_file_release(struct inode *inode, struct file *file) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + spin_lock(&oi->ip_lock); + if (!--oi->ip_open_count) + oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; + + trace_ocfs2_file_release(inode, file, file->f_path.dentry, + oi->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + oi->ip_open_count); + spin_unlock(&oi->ip_lock); + + ocfs2_free_file_private(inode, file); + + return 0; +} + +static int ocfs2_dir_open(struct inode *inode, struct file *file) +{ + return ocfs2_init_file_private(inode, file); +} + +static int ocfs2_dir_release(struct inode *inode, struct file *file) +{ + ocfs2_free_file_private(inode, file); + return 0; +} + +static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, + int datasync) +{ + int err = 0; + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; + + trace_ocfs2_sync_file(inode, file, file->f_path.dentry, + OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + (unsigned long long)datasync); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + err = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!err) + err = ret; + } + + if (err) + mlog_errno(err); + + return (err < 0) ? -EIO : 0; +} + +int ocfs2_should_update_atime(struct inode *inode, + struct vfsmount *vfsmnt) +{ + struct timespec now; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return 0; + + if ((inode->i_flags & S_NOATIME) || + ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + return 0; + + /* + * We can be called with no vfsmnt structure - NFSD will + * sometimes do this. + * + * Note that our action here is different than touch_atime() - + * if we can't tell whether this is a noatime mount, then we + * don't know whether to trust the value of s_atime_quantum. + */ + if (vfsmnt == NULL) + return 0; + + if ((vfsmnt->mnt_flags & MNT_NOATIME) || + ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + return 0; + + if (vfsmnt->mnt_flags & MNT_RELATIME) { + if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || + (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + return 1; + + return 0; + } + + now = CURRENT_TIME; + if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) + return 0; + else + return 1; +} + +int ocfs2_update_inode_atime(struct inode *inode, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Don't use ocfs2_mark_inode_dirty() here as we don't always + * have i_mutex to guard against concurrent changes to other + * inode fields. + */ + inode->i_atime = CURRENT_TIME; + di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); + ocfs2_journal_dirty(handle, bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + return status; +} + +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_set_inode_size(handle, inode, di_bh, + new_i_size); + if (ret < 0) + mlog_errno(ret); + + ocfs2_update_inode_fsync_trans(handle, inode, 0); + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_cow_file_pos(struct inode *inode, + struct buffer_head *fe_bh, + u64 offset) +{ + int status; + u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + /* + * If the new offset is aligned to the range of the cluster, there is + * no space for ocfs2_zero_range_for_truncate to fill, so no need to + * CoW either. + */ + if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) + return 0; + + status = ocfs2_get_clusters(inode, cpos, &phys, + &num_clusters, &ext_flags); + if (status) { + mlog_errno(status); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); + +out: + return status; +} + +static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + handle_t *handle; + struct ocfs2_dinode *di; + u64 cluster_bytes; + + /* + * We need to CoW the cluster contains the offset if it is reflinked + * since we will call ocfs2_zero_range_for_truncate later which will + * write "0" from offset to the end of the cluster. + */ + status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); + if (status) { + mlog_errno(status); + return status; + } + + /* TODO: This needs to actually orphan the inode in this + * transaction. */ + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + /* + * Do this before setting i_size. + */ + cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); + status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, + cluster_bytes); + if (status) { + mlog_errno(status); + goto out_commit; + } + + i_size_write(inode, new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di = (struct ocfs2_dinode *) fe_bh->b_data; + di->i_size = cpu_to_le64(new_i_size); + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); + + ocfs2_journal_dirty(handle, fe_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return status; +} + +int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We trust di_bh because it comes from ocfs2_inode_lock(), which + * already validated it */ + fe = (struct ocfs2_dinode *) di_bh->b_data; + + trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); + + mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), + "Inode %llu, inode i_size = %lld != di " + "i_size = %llu, i_flags = 0x%x\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + i_size_read(inode), + (unsigned long long)le64_to_cpu(fe->i_size), + le32_to_cpu(fe->i_flags)); + + if (new_i_size > le64_to_cpu(fe->i_size)) { + trace_ocfs2_truncate_file_error( + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_resv_discard(&osb->osb_la_resmap, + &OCFS2_I(inode)->ip_la_data_resv); + + /* + * The inode lock forced other nodes to sync and drop their + * pages, which (correctly) happens even if we have a truncate + * without allocation change - ocfs2 cluster sizes can be much + * greater than page size, so we have to truncate them + * anyway. + */ + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + status = ocfs2_truncate_inline(inode, di_bh, new_i_size, + i_size_read(inode), 1); + if (status) + mlog_errno(status); + + goto bail_unlock_sem; + } + + /* alright, we're going to need to do a full blown alloc size + * change. Orphan the inode so that recovery can complete the + * truncate if necessary. This does the task of marking + * i_size. */ + status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_sem; + } + + status = ocfs2_commit_truncate(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_sem; + } + + /* TODO: orphan dir cleanup here. */ +bail_unlock_sem: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + +bail: + if (!status && OCFS2_I(inode)->ip_clusters == 0) + status = ocfs2_try_remove_refcount_tree(inode, di_bh); + + return status; +} + +/* + * extend file allocation only here. + * we'll update all the disk stuff, and oip->alloc_size + * + * expect stuff to be locked, a transaction started and enough data / + * metadata reservations in the contexts. + * + * Will return -EAGAIN, and a reason if a restart is needed. + * If passed in, *reason will always be set, even in error. + */ +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int ret; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); + ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); + + return ret; +} + +static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + int status = 0; + int restart_func = 0; + int credits; + u32 prev_clusters; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe = NULL; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why = RESTART_NONE; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + int did_quota = 0; + + /* + * Unwritten extent only exists for file systems which + * support holes. + */ + BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + fe = (struct ocfs2_dinode *) bh->b_data; + +restart_all: + BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + trace_ocfs2_extend_allocation( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)i_size_read(inode), + le32_to_cpu(fe->i_clusters), clusters_to_add, + why, restart_func); + + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (status) + goto leave; + did_quota = 1; + + /* reserve a write to the file entry early on - that we if we + * run out of credits in the allocation path, we can still + * update i_size. */ + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = OCFS2_I(inode)->ip_clusters; + + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + ocfs2_update_inode_fsync_trans(handle, inode, 1); + ocfs2_journal_dirty(handle, bh); + + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + /* Release unused quota reservation */ + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + did_quota = 0; + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + restart_func = 1; + status = 0; + } else { + BUG_ON(why != RESTART_TRANS); + + status = ocfs2_allocate_extend_trans(handle, 1); + if (status < 0) { + /* handle still has to be committed at + * this point. */ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + + trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno, + le32_to_cpu(fe->i_clusters), + (unsigned long long)le64_to_cpu(fe->i_size), + OCFS2_I(inode)->ip_clusters, + (unsigned long long)i_size_read(inode)); + +leave: + if (status < 0 && did_quota) + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (handle) { + ocfs2_commit_trans(osb, handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + brelse(bh); + bh = NULL; + + return status; +} + +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + return __ocfs2_extend_allocation(inode, logical_start, + clusters_to_add, mark_unwritten); +} + +/* + * While a write will already be ordering the data, a truncate will not. + * Thus, we need to explicitly order the zeroed pages. + */ +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, + struct buffer_head *di_bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + int ret = 0; + + if (!ocfs2_should_order_data(inode)) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + +out: + if (ret) { + if (!IS_ERR(handle)) + ocfs2_commit_trans(osb, handle); + handle = ERR_PTR(ret); + } + return handle; +} + +/* Some parts of this taken from generic_cont_expand, which turned out + * to be too fragile to do exactly what we need without us having to + * worry about recursive locking in ->write_begin() and ->write_end(). */ +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, + u64 abs_to, struct buffer_head *di_bh) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index = abs_from >> PAGE_CACHE_SHIFT; + handle_t *handle; + int ret = 0; + unsigned zero_from, zero_to, block_start, block_end; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + BUG_ON(abs_from >= abs_to); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_from & (inode->i_blkbits - 1)); + + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit_trans; + } + + /* Get the offsets within the page that we want to zero */ + zero_from = abs_from & (PAGE_CACHE_SIZE - 1); + zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + if (!zero_to) + zero_to = PAGE_CACHE_SIZE; + + trace_ocfs2_write_zero_page( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)abs_from, + (unsigned long long)abs_to, + index, zero_from, zero_to); + + /* We know that zero_from is block aligned */ + for (block_start = zero_from; block_start < zero_to; + block_start = block_end) { + block_end = block_start + (1 << inode->i_blkbits); + + /* + * block_start is block-aligned. Bump it by one to force + * __block_write_begin and block_commit_write to zero the + * whole block. + */ + ret = __block_write_begin(page, block_start + 1, 0, + ocfs2_get_block); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + + /* must not update i_size! */ + ret = block_commit_write(page, block_start + 1, + block_start + 1); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + } + + /* + * fs-writeback will release the dirty pages without page lock + * whose offset are over inode size, the release happens at + * block_write_full_page(). + */ + i_size_write(inode, abs_to); + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + di->i_mtime_nsec = di->i_ctime_nsec; + if (handle) { + ocfs2_journal_dirty(handle, di_bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + } + +out_unlock: + unlock_page(page); + page_cache_release(page); +out_commit_trans: + if (handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +/* + * Find the next range to zero. We do this in terms of bytes because + * that's what ocfs2_zero_extend() wants, and it is dealing with the + * pagecache. We may return multiple extents. + * + * zero_start and zero_end are ocfs2_zero_extend()s current idea of what + * needs to be zeroed. range_start and range_end return the next zeroing + * range. A subsequent call should pass the previous range_end as its + * zero_start. If range_end is 0, there's nothing to do. + * + * Unwritten extents are skipped over. Refcounted extents are CoWd. + */ +static int ocfs2_zero_extend_get_range(struct inode *inode, + struct buffer_head *di_bh, + u64 zero_start, u64 zero_end, + u64 *range_start, u64 *range_end) +{ + int rc = 0, needs_cow = 0; + u32 p_cpos, zero_clusters = 0; + u32 zero_cpos = + zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + while (zero_cpos < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + zero_clusters = num_clusters; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + break; + } + + zero_cpos += num_clusters; + } + if (!zero_clusters) { + *range_end = 0; + goto out; + } + + while ((zero_cpos + zero_clusters) < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, + &p_cpos, &num_clusters, + &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) + break; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + zero_clusters += num_clusters; + } + if ((zero_cpos + zero_clusters) > last_cpos) + zero_clusters = last_cpos - zero_cpos; + + if (needs_cow) { + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, + zero_clusters, UINT_MAX); + if (rc) { + mlog_errno(rc); + goto out; + } + } + + *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); + *range_end = ocfs2_clusters_to_bytes(inode->i_sb, + zero_cpos + zero_clusters); + +out: + return rc; +} + +/* + * Zero one range returned from ocfs2_zero_extend_get_range(). The caller + * has made sure that the entire range needs zeroing. + */ +static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, + u64 range_end, struct buffer_head *di_bh) +{ + int rc = 0; + u64 next_pos; + u64 zero_pos = range_start; + + trace_ocfs2_zero_extend_range( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)range_start, + (unsigned long long)range_end); + BUG_ON(range_start >= range_end); + + while (zero_pos < range_end) { + next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + if (next_pos > range_end) + next_pos = range_end; + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); + if (rc < 0) { + mlog_errno(rc); + break; + } + zero_pos = next_pos; + + /* + * Very large extends have the potential to lock up + * the cpu for extended periods of time. + */ + cond_resched(); + } + + return rc; +} + +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to_size) +{ + int ret = 0; + u64 zero_start, range_start = 0, range_end = 0; + struct super_block *sb = inode->i_sb; + + zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)zero_start, + (unsigned long long)i_size_read(inode)); + while (zero_start < zero_to_size) { + ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, + zero_to_size, + &range_start, + &range_end); + if (ret) { + mlog_errno(ret); + break; + } + if (!range_end) + break; + /* Trim the ends */ + if (range_start < zero_start) + range_start = zero_start; + if (range_end > zero_to_size) + range_end = zero_to_size; + + ret = ocfs2_zero_extend_range(inode, range_start, + range_end, di_bh); + if (ret) { + mlog_errno(ret); + break; + } + zero_start = range_end; + } + + return ret; +} + +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to) +{ + int ret; + u32 clusters_to_add; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + /* + * Only quota files call this without a bh, and they can't be + * refcounted. + */ + BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); + + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); + if (clusters_to_add < oi->ip_clusters) + clusters_to_add = 0; + else + clusters_to_add -= oi->ip_clusters; + + if (clusters_to_add) { + ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Call this even if we don't add any clusters to the tree. We + * still need to zero the area between the old i_size and the + * new i_size. + */ + ret = ocfs2_zero_extend(inode, di_bh, zero_to); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_extend_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + BUG_ON(!di_bh); + + /* setattr sometimes calls us like this. */ + if (new_i_size == 0) + goto out; + + if (i_size_read(inode) == new_i_size) + goto out; + BUG_ON(new_i_size < i_size_read(inode)); + + /* + * The alloc sem blocks people in read/write from reading our + * allocation until we're done changing it. We depend on + * i_mutex to block other extend/truncate calls while we're + * here. We even have to hold it for sparse files because there + * might be some tail zeroing. + */ + down_write(&oi->ip_alloc_sem); + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * We can optimize small extends by keeping the inodes + * inline data. + */ + if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { + up_write(&oi->ip_alloc_sem); + goto out_update_size; + } + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + up_write(&oi->ip_alloc_sem); + mlog_errno(ret); + goto out; + } + } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, new_i_size); + else + ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, + new_i_size); + + up_write(&oi->ip_alloc_sem); + + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out_update_size: + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + int status = 0, size_change; + struct inode *inode = d_inode(dentry); + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *bh = NULL; + handle_t *handle = NULL; + struct dquot *transfer_to[MAXQUOTAS] = { }; + int qtype; + + trace_ocfs2_setattr(inode, dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + dentry->d_name.len, dentry->d_name.name, + attr->ia_valid, attr->ia_mode, + from_kuid(&init_user_ns, attr->ia_uid), + from_kgid(&init_user_ns, attr->ia_gid)); + + /* ensuring we don't even attempt to truncate a symlink */ + if (S_ISLNK(inode->i_mode)) + attr->ia_valid &= ~ATTR_SIZE; + +#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ + | ATTR_GID | ATTR_UID | ATTR_MODE) + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) + return 0; + + status = inode_change_ok(inode, attr); + if (status) + return status; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; + if (size_change) { + status = ocfs2_rw_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail_unlock_rw; + } + + if (size_change) { + status = inode_newsize_ok(inode, attr->ia_size); + if (status) + goto bail_unlock; + + inode_dio_wait(inode); + + if (i_size_read(inode) >= attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } + status = ocfs2_truncate_file(inode, bh, attr->ia_size); + } else + status = ocfs2_extend_file(inode, bh, attr->ia_size); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + status = -ENOSPC; + goto bail_unlock; + } + } + + if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + /* + * Gather pointers to quota structures so that allocation / + * freeing of quota structures happens here and not inside + * dquot_transfer() where we have problems with lock ordering + */ + if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid) + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); + if (!transfer_to[USRQUOTA]) { + status = -ESRCH; + goto bail_unlock; + } + } + if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid) + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); + if (!transfer_to[GRPQUOTA]) { + status = -ESRCH; + goto bail_unlock; + } + } + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + + 2 * ocfs2_quota_trans_credits(sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + status = __dquot_transfer(inode, transfer_to); + if (status < 0) + goto bail_commit; + } else { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(osb, handle); +bail_unlock: + ocfs2_inode_unlock(inode, 1); +bail_unlock_rw: + if (size_change) + ocfs2_rw_unlock(inode, 1); +bail: + brelse(bh); + + /* Release quota pointers in case we acquired them */ + for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) + dqput(transfer_to[qtype]); + + if (!status && attr->ia_valid & ATTR_MODE) { + status = posix_acl_chmod(inode, inode->i_mode); + if (status < 0) + mlog_errno(status); + } + + return status; +} + +int ocfs2_getattr(struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct super_block *sb = d_inode(dentry)->i_sb; + struct ocfs2_super *osb = sb->s_fs_info; + int err; + + err = ocfs2_inode_revalidate(dentry); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + generic_fillattr(inode, stat); + + /* We set the blksize from the cluster size for performance */ + stat->blksize = osb->s_clustersize; + +bail: + return err; +} + +int ocfs2_permission(struct inode *inode, int mask) +{ + int ret; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + if (ret != -ENOENT) + mlog_errno(ret); + goto out; + } + + ret = generic_permission(inode, mask); + + ocfs2_inode_unlock(inode, 0); +out: + return ret; +} + +static int __ocfs2_write_remove_suid(struct inode *inode, + struct buffer_head *bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + + trace_ocfs2_write_remove_suid( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + inode->i_mode); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_trans; + } + + inode->i_mode &= ~S_ISUID; + if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) + inode->i_mode &= ~S_ISGID; + + di = (struct ocfs2_dinode *) bh->b_data; + di->i_mode = cpu_to_le16(inode->i_mode); + ocfs2_update_inode_fsync_trans(handle, inode, 0); + + ocfs2_journal_dirty(handle, bh); + +out_trans: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Will look for holes and unwritten extents in the range starting at + * pos for count bytes (inclusive). + */ +static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_write_remove_suid(struct inode *inode) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = ocfs2_read_inode_block(inode, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_write_remove_suid(inode, bh); +out: + brelse(bh); + return ret; +} + +/* + * Allocate enough extents to cover the region starting at byte offset + * start for len bytes. Existing extents are skipped, any extents + * added are marked as "unwritten". + */ +static int ocfs2_allocate_unwritten_extents(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u32 cpos, phys_cpos, clusters, alloc_size; + u64 end = start + len; + struct buffer_head *di_bh = NULL; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Nothing to do if the requested reservation range + * fits within the inode. + */ + if (ocfs2_size_fits_inline_data(di_bh, end)) + goto out; + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * We consider both start and len to be inclusive. + */ + cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); + clusters -= cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Hole or existing extent len can be arbitrary, so + * cap it to our own allocation request. + */ + if (alloc_size > clusters) + alloc_size = clusters; + + if (phys_cpos) { + /* + * We already have an allocation at this + * region so we can safely skip it. + */ + goto next; + } + + ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +next: + cpos += alloc_size; + clusters -= alloc_size; + } + + ret = 0; +out: + + brelse(di_bh); + return ret; +} + +/* + * Truncate a byte range, avoiding pages within partial clusters. This + * preserves those pages for the zeroing code to write to. + */ +static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, + u64 byte_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + loff_t start, end; + struct address_space *mapping = inode->i_mapping; + + start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); + end = byte_start + byte_len; + end = end & ~(osb->s_clustersize - 1); + + if (start < end) { + unmap_mapping_range(mapping, start, end - start, 0); + truncate_inode_pages_range(mapping, start, end - 1); + } +} + +static int ocfs2_zero_partial_clusters(struct inode *inode, + u64 start, u64 len) +{ + int ret = 0; + u64 tmpend, end = start + len; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int csize = osb->s_clustersize; + handle_t *handle; + + /* + * The "start" and "end" values are NOT necessarily part of + * the range whose allocation is being deleted. Rather, this + * is what the user passed in with the request. We must zero + * partial clusters here. There's no need to worry about + * physical allocation - the zeroing code knows to skip holes. + */ + trace_ocfs2_zero_partial_clusters( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)start, (unsigned long long)end); + + /* + * If both edges are on a cluster boundary then there's no + * zeroing required as the region is part of the allocation to + * be truncated. + */ + if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + /* + * We want to get the byte offset of the end of the 1st cluster. + */ + tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; + + trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, + (unsigned long long)tmpend); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); + if (ret) + mlog_errno(ret); + + if (tmpend < end) { + /* + * This may make start and end equal, but the zeroing + * code will skip any work in that case so there's no + * need to catch it up here. + */ + start = end & ~(osb->s_clustersize - 1); + + trace_ocfs2_zero_partial_clusters_range2( + (unsigned long long)start, (unsigned long long)end); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); + if (ret) + mlog_errno(ret); + } + ocfs2_update_inode_fsync_trans(handle, inode, 1); + + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) +{ + int i; + struct ocfs2_extent_rec *rec = NULL; + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) < pos) + break; + } + + return i; +} + +/* + * Helper to calculate the punching pos and length in one run, we handle the + * following three cases in order: + * + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (hole-punching completed) +*/ +static void ocfs2_calc_trunc_pos(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec, + u32 trunc_start, u32 *trunc_cpos, + u32 *trunc_len, u32 *trunc_end, + u64 *blkno, int *done) +{ + int ret = 0; + u32 coff, range; + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (le32_to_cpu(rec->e_cpos) >= trunc_start) { + /* + * remove an entire extent record. + */ + *trunc_cpos = le32_to_cpu(rec->e_cpos); + /* + * Skip holes if any. + */ + if (range < *trunc_end) + *trunc_end = range; + *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno); + *trunc_end = le32_to_cpu(rec->e_cpos); + } else if (range > trunc_start) { + /* + * remove a partial extent record, which means we're + * removing the last extent record. + */ + *trunc_cpos = trunc_start; + /* + * skip hole if any. + */ + if (range < *trunc_end) + *trunc_end = range; + *trunc_len = *trunc_end - trunc_start; + coff = trunc_start - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); + *trunc_end = trunc_start; + } else { + /* + * It may have two following possibilities: + * + * - last record has been removed + * - trunc_start was within a hole + * + * both two cases mean the completion of hole punching. + */ + ret = 1; + } + + *done = ret; +} + +static int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) +{ + int ret = 0, flags = 0, done = 0, i; + u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; + u32 cluster_in_el; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + struct address_space *mapping = inode->i_mapping; + struct ocfs2_extent_tree et; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el = NULL; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + + trace_ocfs2_remove_inode_range( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)byte_start, + (unsigned long long)byte_len); + + if (byte_len == 0) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, + byte_start + byte_len, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + /* + * There's no need to get fancy with the page cache + * truncate of an inline-data inode. We're talking + * about less than a page here, which will be cached + * in the dinode buffer anyway. + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + goto out; + } + + /* + * For reflinks, we may need to CoW 2 clusters which might be + * partially zero'd later, if hole's start and end offset were + * within one cluster(means is not exactly aligned to clustersize). + */ + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); + trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; + cluster_in_el = trunc_end; + + ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (trunc_end > trunc_start) { + + ret = ocfs2_find_path(INODE_CACHE(inode), path, + cluster_in_el); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + i = ocfs2_find_rec(el, trunc_end); + /* + * Need to go to previous extent block. + */ + if (i < 0) { + if (path->p_tree_depth == 0) + break; + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + path, + &cluster_in_el); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We've reached the leftmost extent block, + * it's safe to leave. + */ + if (cluster_in_el == 0) + break; + + /* + * The 'pos' searched for previous extent block is + * always one cluster less than actual trunc_end. + */ + trunc_end = cluster_in_el + 1; + + ocfs2_reinit_path(path, 1); + + continue; + + } else + rec = &el->l_recs[i]; + + ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, + &trunc_len, &trunc_end, &blkno, &done); + if (done) + break; + + flags = rec->e_flags; + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, + &dealloc, refcount_loc, false); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + cluster_in_el = trunc_end; + + ocfs2_reinit_path(path, 1); + } + + ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); + +out: + ocfs2_free_path(path); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +/* + * Parts of this function taken from xfs_change_file_space() + */ +static int __ocfs2_change_file_space(struct file *file, struct inode *inode, + loff_t f_pos, unsigned int cmd, + struct ocfs2_space_resv *sr, + int change_size) +{ + int ret; + s64 llen; + loff_t size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + handle_t *handle; + unsigned long long max_off = inode->i_sb->s_maxbytes; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes on other nodes + */ + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_rw_unlock; + } + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + ret = -EPERM; + goto out_inode_unlock; + } + + switch (sr->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + sr->l_start += f_pos; + break; + case 2: /*SEEK_END*/ + sr->l_start += i_size_read(inode); + break; + default: + ret = -EINVAL; + goto out_inode_unlock; + } + sr->l_whence = 0; + + llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; + + if (sr->l_start < 0 + || sr->l_start > max_off + || (sr->l_start + llen) < 0 + || (sr->l_start + llen) > max_off) { + ret = -EINVAL; + goto out_inode_unlock; + } + size = sr->l_start + sr->l_len; + + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || + cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { + if (sr->l_len <= 0) { + ret = -EINVAL; + goto out_inode_unlock; + } + } + + if (file && should_remove_suid(file->f_path.dentry)) { + ret = __ocfs2_write_remove_suid(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out_inode_unlock; + } + } + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + switch (cmd) { + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + /* + * This takes unsigned offsets, but the signed ones we + * pass have been checked against overflow above. + */ + ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, + sr->l_len); + break; + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, + sr->l_len); + break; + default: + ret = -EINVAL; + } + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_inode_unlock; + } + + /* + * We update c/mtime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_inode_unlock; + } + + if (change_size && i_size_read(inode) < size) + i_size_write(inode, size); + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) + mlog_errno(ret); + + if (file && (file->f_flags & O_SYNC)) + handle->h_sync = 1; + + ocfs2_commit_trans(osb, handle); + +out_inode_unlock: + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); + +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr) +{ + struct inode *inode = file_inode(file); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret; + + if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && + !ocfs2_writes_unwritten_extents(osb)) + return -ENOTTY; + else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && + !ocfs2_sparse_alloc(osb)) + return -ENOTTY; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); + mnt_drop_write_file(file); + return ret; +} + +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_space_resv sr; + int change_size = 1; + int cmd = OCFS2_IOC_RESVSP64; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (!ocfs2_writes_unwritten_extents(osb)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_KEEP_SIZE) + change_size = 0; + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = OCFS2_IOC_UNRESVSP64; + + sr.l_whence = 0; + sr.l_start = (s64)offset; + sr.l_len = (s64)len; + + return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, + change_size); +} + +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || + !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) +{ + int blockmask = inode->i_sb->s_blocksize - 1; + loff_t final_size = pos + count; + + if ((pos & blockmask) || (final_size & blockmask)) + return 1; + return 0; +} + +static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + struct file *file, + loff_t pos, size_t count, + int *meta_level) +{ + int ret; + struct buffer_head *di_bh = NULL; + u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 clusters = + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + *meta_level = 1; + + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + if (ret) + mlog_errno(ret); +out: + brelse(di_bh); + return ret; +} + +static int ocfs2_prepare_inode_for_write(struct file *file, + loff_t pos, + size_t count, + int appending, + int *direct_io, + int *has_refcount) +{ + int ret = 0, meta_level = 0; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + loff_t end; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); + + /* + * We start with a read level meta lock and only jump to an ex + * if we need to make modifications here. + */ + for(;;) { + ret = ocfs2_inode_lock(inode, NULL, meta_level); + if (ret < 0) { + meta_level = -1; + mlog_errno(ret); + goto out; + } + + /* Clear suid / sgid if necessary. We do this here + * instead of later in the write path because + * remove_suid() calls ->setattr without any hint that + * we may have already done our cluster locking. Since + * ocfs2_setattr() *must* take cluster locks to + * proceed, this will lead us to recursively lock the + * inode. There's also the dinode i_size state which + * can be lost via setattr during extending writes (we + * set inode->i_size at the end of a write. */ + if (should_remove_suid(dentry)) { + if (meta_level == 0) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = 1; + continue; + } + + ret = ocfs2_write_remove_suid(inode); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + } + + end = pos + count; + + ret = ocfs2_check_range_for_refcount(inode, pos, count); + if (ret == 1) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = -1; + + ret = ocfs2_prepare_inode_for_refcount(inode, + file, + pos, + count, + &meta_level); + if (has_refcount) + *has_refcount = 1; + if (direct_io) + *direct_io = 0; + } + + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + /* + * Skip the O_DIRECT checks if we don't need + * them. + */ + if (!direct_io || !(*direct_io)) + break; + + /* + * There's no sane way to do direct writes to an inode + * with inline data. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + *direct_io = 0; + break; + } + + /* + * Allowing concurrent direct writes means + * i_size changes wouldn't be synchronized, so + * one node could wind up truncating another + * nodes writes. + */ + if (end > i_size_read(inode) && !full_coherency) { + *direct_io = 0; + break; + } + + /* + * Fallback to old way if the feature bit is not set. + */ + if (end > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) { + *direct_io = 0; + break; + } + + /* + * We don't fill holes during direct io, so + * check for them here. If any are found, the + * caller will have to retake some cluster + * locks and initiate the io as buffered. + */ + ret = ocfs2_check_range_for_holes(inode, pos, count); + if (ret == 1) { + /* + * Fallback to old way if the feature bit is not set. + * Otherwise try dio first and then complete the rest + * request through buffer io. + */ + if (!ocfs2_supports_append_dio(osb)) + *direct_io = 0; + ret = 0; + } else if (ret < 0) + mlog_errno(ret); + break; + } + +out_unlock: + trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, + pos, appending, count, + direct_io, has_refcount); + + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); + +out: + return ret; +} + +static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + int direct_io, appending, rw_level, have_alloc_sem = 0; + int can_do_direct, has_refcount = 0; + ssize_t written = 0; + ssize_t ret; + size_t count = iov_iter_count(from), orig_count; + loff_t old_size; + u32 old_clusters; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); + int unaligned_dio = 0; + int dropped_dio = 0; + + trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + (unsigned int)from->nr_segs); /* GRRRRR */ + + if (count == 0) + return 0; + + appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; + direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + + mutex_lock(&inode->i_mutex); + + ocfs2_iocb_clear_sem_locked(iocb); + +relock: + /* to match setattr's i_mutex -> rw_lock ordering */ + if (direct_io) { + have_alloc_sem = 1; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_sem_locked(iocb); + } + + /* + * Concurrent O_DIRECT writes are allowed with + * mount_option "coherency=buffered". + */ + rw_level = (!direct_io || full_coherency); + + ret = ocfs2_rw_lock(inode, rw_level); + if (ret < 0) { + mlog_errno(ret); + goto out_sems; + } + + /* + * O_DIRECT writes with "coherency=full" need to take EX cluster + * inode_lock to guarantee coherency. + */ + if (direct_io && full_coherency) { + /* + * We need to take and drop the inode lock to force + * other nodes to drop their caches. Buffered I/O + * already does this in write_begin(). + */ + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ocfs2_inode_unlock(inode, 1); + } + + orig_count = iov_iter_count(from); + ret = generic_write_checks(iocb, from); + if (ret <= 0) { + if (ret) + mlog_errno(ret); + goto out; + } + count = ret; + + can_do_direct = direct_io; + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending, + &can_do_direct, &has_refcount); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (direct_io && !is_sync_kiocb(iocb)) + unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); + + /* + * We can't complete the direct I/O as requested, fall back to + * buffered I/O. + */ + if (direct_io && !can_do_direct) { + ocfs2_rw_unlock(inode, rw_level); + + have_alloc_sem = 0; + rw_level = -1; + + direct_io = 0; + iocb->ki_flags &= ~IOCB_DIRECT; + iov_iter_reexpand(from, orig_count); + dropped_dio = 1; + goto relock; + } + + if (unaligned_dio) { + /* + * Wait on previous unaligned aio to complete before + * proceeding. + */ + mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); + /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ + ocfs2_iocb_set_unaligned_aio(iocb); + } + + /* + * To later detect whether a journal commit for sync writes is + * necessary, we sample i_size, and cluster count here. + */ + old_size = i_size_read(inode); + old_clusters = OCFS2_I(inode)->ip_clusters; + + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb, rw_level); + + written = __generic_file_write_iter(iocb, from); + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + + if (unlikely(written <= 0)) + goto no_sync; + + if (((file->f_flags & O_DSYNC) && !direct_io) || + IS_SYNC(inode) || dropped_dio) { + ret = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (ret < 0) + written = ret; + + if (!ret) { + ret = jbd2_journal_force_commit(osb->journal->j_journal); + if (ret < 0) + written = ret; + } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + } + +no_sync: + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. + */ + if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { + rw_level = -1; + have_alloc_sem = 0; + unaligned_dio = 0; + } + + if (unaligned_dio) { + ocfs2_iocb_clear_unaligned_aio(iocb); + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); + } + +out: + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + +out_sems: + if (have_alloc_sem) + ocfs2_iocb_clear_sem_locked(iocb); + + mutex_unlock(&inode->i_mutex); + + if (written) + ret = written; + return ret; +} + +static ssize_t ocfs2_file_splice_read(struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + int ret = 0, lock_level = 0; + struct inode *inode = file_inode(in); + + trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name, len); + + /* + * See the comment in ocfs2_file_read_iter() + */ + ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = generic_file_splice_read(in, ppos, pipe, len, flags); + +bail: + return ret; +} + +static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, + struct iov_iter *to) +{ + int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; + struct file *filp = iocb->ki_filp; + struct inode *inode = file_inode(filp); + + trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + filp->f_path.dentry->d_name.len, + filp->f_path.dentry->d_name.name, + to->nr_segs); /* GRRRRR */ + + + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto bail; + } + + ocfs2_iocb_clear_sem_locked(iocb); + + /* + * buffered reads protect themselves in ->readpage(). O_DIRECT reads + * need locks to protect pending reads from racing with truncate. + */ + if (iocb->ki_flags & IOCB_DIRECT) { + have_alloc_sem = 1; + ocfs2_iocb_set_sem_locked(iocb); + + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + rw_level = 0; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb, rw_level); + } + + /* + * We're fine letting folks race truncates and extending + * writes with read across the cluster, just like they can + * locally. Hence no rw_lock during read. + * + * Take and drop the meta data lock to update inode fields + * like i_size. This allows the checks down below + * generic_file_aio_read() a chance of actually working. + */ + ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = generic_file_read_iter(iocb, to); + trace_generic_file_aio_read_ret(ret); + + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + + /* see ocfs2_file_write_iter */ + if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { + rw_level = -1; + have_alloc_sem = 0; + } + +bail: + if (have_alloc_sem) + ocfs2_iocb_clear_sem_locked(iocb); + + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + + return ret; +} + +/* Refer generic_file_llseek_unlocked() */ +static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + switch (whence) { + case SEEK_SET: + break; + case SEEK_END: + /* SEEK_END requires the OCFS2 inode lock for the file + * because it references the file's size. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + offset += i_size_read(inode); + ocfs2_inode_unlock(inode, 0); + break; + case SEEK_CUR: + if (offset == 0) { + offset = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + case SEEK_DATA: + case SEEK_HOLE: + ret = ocfs2_seek_data_hole_offset(file, &offset, whence); + if (ret) + goto out; + break; + default: + ret = -EINVAL; + goto out; + } + + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + +out: + mutex_unlock(&inode->i_mutex); + if (ret) + return ret; + return offset; +} + +const struct inode_operations ocfs2_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, + .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, +}; + +const struct inode_operations ocfs2_special_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, + .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, +}; + +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ +const struct file_operations ocfs2_fops = { + .llseek = ocfs2_file_llseek, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .read_iter = ocfs2_file_read_iter, + .write_iter = ocfs2_file_write_iter, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = ocfs2_fallocate, +}; + +const struct file_operations ocfs2_dops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. + */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = ocfs2_file_llseek, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .read_iter = ocfs2_file_read_iter, + .write_iter = ocfs2_file_write_iter, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = ocfs2_fallocate, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, +}; diff --git a/kernel/fs/ocfs2/file.h b/kernel/fs/ocfs2/file.h new file mode 100644 index 000000000..e8c62f222 --- /dev/null +++ b/kernel/fs/ocfs2/file.h @@ -0,0 +1,85 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_FILE_H +#define OCFS2_FILE_H + +extern const struct file_operations ocfs2_fops; +extern const struct file_operations ocfs2_dops; +extern const struct file_operations ocfs2_fops_no_plocks; +extern const struct file_operations ocfs2_dops_no_plocks; +extern const struct inode_operations ocfs2_file_iops; +extern const struct inode_operations ocfs2_special_file_iops; +struct ocfs2_alloc_context; +enum ocfs2_alloc_restarted; + +struct ocfs2_file_private { + struct file *fp_file; + struct mutex fp_mutex; + struct ocfs2_lock_res fp_flock; +}; + +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); +int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to); +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to); +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten); +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); +int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int ocfs2_permission(struct inode *inode, int mask); + +int ocfs2_should_update_atime(struct inode *inode, + struct vfsmount *vfsmnt); +int ocfs2_update_inode_atime(struct inode *inode, + struct buffer_head *bh); + +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr); + +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count); +#endif /* OCFS2_FILE_H */ diff --git a/kernel/fs/ocfs2/heartbeat.c b/kernel/fs/ocfs2/heartbeat.c new file mode 100644 index 000000000..d8208b20d --- /dev/null +++ b/kernel/fs/ocfs2/heartbeat.c @@ -0,0 +1,134 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.c + * + * Register ourselves with the heartbaet service, keep our node maps + * up to date, and fire off recovery when needed. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit); +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit); + +/* special case -1 for now + * TODO: should *really* make sure the calling func never passes -1!! */ +static void ocfs2_node_map_init(struct ocfs2_node_map *map) +{ + map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; + memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * + sizeof(unsigned long)); +} + +void ocfs2_init_node_maps(struct ocfs2_super *osb) +{ + spin_lock_init(&osb->node_map_lock); + ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); +} + +void ocfs2_do_node_down(int node_num, void *data) +{ + struct ocfs2_super *osb = data; + + BUG_ON(osb->node_num == node_num); + + trace_ocfs2_do_node_down(node_num); + + if (!osb->cconn) { + /* + * No cluster connection means we're not even ready to + * participate yet. We check the slots after the cluster + * comes up, so we will notice the node death then. We + * can safely ignore it here. + */ + return; + } + + ocfs2_recovery_thread(osb, node_num); +} + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit) +{ + set_bit(bit, map->map); +} + +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_set_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit) +{ + clear_bit(bit, map->map); +} + +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_clear_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + int ret; + if (bit >= map->num_nodes) { + mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes); + BUG(); + } + spin_lock(&osb->node_map_lock); + ret = test_bit(bit, map->map); + spin_unlock(&osb->node_map_lock); + return ret; +} + diff --git a/kernel/fs/ocfs2/heartbeat.h b/kernel/fs/ocfs2/heartbeat.h new file mode 100644 index 000000000..74b9c5dda --- /dev/null +++ b/kernel/fs/ocfs2/heartbeat.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_HEARTBEAT_H +#define OCFS2_HEARTBEAT_H + +void ocfs2_init_node_maps(struct ocfs2_super *osb); + +void ocfs2_do_node_down(int node_num, void *data); + +/* node map functions - used to keep track of mounted and in-recovery + * nodes. */ +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); + +#endif /* OCFS2_HEARTBEAT_H */ diff --git a/kernel/fs/ocfs2/inode.c b/kernel/fs/ocfs2/inode.c new file mode 100644 index 000000000..b254416dc --- /dev/null +++ b/kernel/fs/ocfs2/inode.c @@ -0,0 +1,1460 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c + * + * vfs' aops, fops, dops and iops + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "xattr.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +struct ocfs2_find_inode_args +{ + u64 fi_blkno; + unsigned long fi_ino; + unsigned int fi_flags; + unsigned int fi_sysfile_type; +}; + +static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args); +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); +static int ocfs2_find_actor(struct inode *inode, void *opaque); +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh); + +void ocfs2_set_inode_flags(struct inode *inode) +{ + unsigned int flags = OCFS2_I(inode)->ip_attr; + + inode->i_flags &= ~(S_IMMUTABLE | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & OCFS2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + + if (flags & OCFS2_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & OCFS2_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & OCFS2_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & OCFS2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ +void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) +{ + unsigned int flags = oi->vfs_inode.i_flags; + + oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| + OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); + if (flags & S_SYNC) + oi->ip_attr |= OCFS2_SYNC_FL; + if (flags & S_APPEND) + oi->ip_attr |= OCFS2_APPEND_FL; + if (flags & S_IMMUTABLE) + oi->ip_attr |= OCFS2_IMMUTABLE_FL; + if (flags & S_NOATIME) + oi->ip_attr |= OCFS2_NOATIME_FL; + if (flags & S_DIRSYNC) + oi->ip_attr |= OCFS2_DIRSYNC_FL; +} + +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) +{ + struct ocfs2_find_inode_args args; + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = 0; + + return ilookup5(sb, blkno, ocfs2_find_actor, &args); +} +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, + int sysfile_type) +{ + struct inode *inode = NULL; + struct super_block *sb = osb->sb; + struct ocfs2_find_inode_args args; + journal_t *journal = OCFS2_SB(sb)->journal->j_journal; + + trace_ocfs2_iget_begin((unsigned long long)blkno, flags, + sysfile_type); + + /* Ok. By now we've either got the offsets passed to us by the + * caller, or we just pulled them off the bh. Lets do some + * sanity checks to make sure they're OK. */ + if (blkno == 0) { + inode = ERR_PTR(-EINVAL); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + + args.fi_blkno = blkno; + args.fi_flags = flags; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = sysfile_type; + + inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, + ocfs2_init_locked_inode, &args); + /* inode was *not* in the inode cache. 2.6.x requires + * us to do our own read_inode call and unlock it + * afterwards. */ + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + trace_ocfs2_iget5_locked(inode->i_state); + if (inode->i_state & I_NEW) { + ocfs2_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + goto bail; + } + + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + oi->i_sync_tid = tid; + oi->i_datasync_tid = tid; + } + +bail: + if (!IS_ERR(inode)) { + trace_ocfs2_iget_end(inode, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + + return inode; +} + + +/* + * here's how inodes get read from disk: + * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR + * found? : return the in-memory inode + * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE + */ + +static int ocfs2_find_actor(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + args = opaque; + + mlog_bug_on_msg(!inode, "No inode in find actor!\n"); + + trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); + + if (oi->ip_blkno != args->fi_blkno) + goto bail; + + ret = 1; +bail: + return ret; +} + +/* + * initialize the new inode, but don't do anything that would cause + * us to sleep. + * return 0 on success, 1 on failure + */ +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = opaque; + static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, + ocfs2_file_ip_alloc_sem_key; + + inode->i_ino = args->fi_ino; + OCFS2_I(inode)->ip_blkno = args->fi_blkno; + if (args->fi_sysfile_type != 0) + lockdep_set_class(&inode->i_mutex, + &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); + if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE) + lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, + &ocfs2_quota_ip_alloc_sem_key); + else + lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, + &ocfs2_file_ip_alloc_sem_key); + + return 0; +} + +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino) +{ + struct super_block *sb; + struct ocfs2_super *osb; + int use_plocks = 1; + + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) + use_plocks = 0; + + /* + * These have all been checked by ocfs2_read_inode_block() or set + * by ocfs2_mknod_locked(), so a failure is a code bug. + */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode + cannot create a superblock + inode today. change if + that is needed. */ + BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); + BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); + + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); + OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); + + inode->i_version = 1; + inode->i_generation = le32_to_cpu(fe->i_generation); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_mode = le16_to_cpu(fe->i_mode); + i_uid_write(inode, le32_to_cpu(fe->i_uid)); + i_gid_write(inode, le32_to_cpu(fe->i_gid)); + + /* Fast symlinks will have i_size but no allocated clusters. */ + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { + inode->i_blocks = 0; + inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops; + } else { + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_mapping->a_ops = &ocfs2_aops; + } + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) + mlog(ML_ERROR, + "ip_blkno %llu != i_blkno %llu!\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_blkno)); + + set_nlink(inode, ocfs2_read_links_count(fe)); + + trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, + le32_to_cpu(fe->i_flags)); + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + inode->i_flags |= S_NOQUOTA; + } + + if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { + inode->i_flags |= S_NOQUOTA; + } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { + /* we can't actually hit this as read_inode can't + * handle superblocks today ;-) */ + BUG(); + } + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + if (use_plocks) + inode->i_fop = &ocfs2_fops; + else + inode->i_fop = &ocfs2_fops_no_plocks; + inode->i_op = &ocfs2_file_iops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFDIR: + inode->i_op = &ocfs2_dir_iops; + if (use_plocks) + inode->i_fop = &ocfs2_dops; + else + inode->i_fop = &ocfs2_dops_no_plocks; + i_size_write(inode, le64_to_cpu(fe->i_size)); + OCFS2_I(inode)->ip_dir_lock_gen = 1; + break; + case S_IFLNK: + inode->i_op = &ocfs2_symlink_inode_operations; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + default: + inode->i_op = &ocfs2_special_file_iops; + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + break; + } + + if (create_ino) { + inode->i_ino = ino_from_blkno(inode->i_sb, + le64_to_cpu(fe->i_blkno)); + + /* + * If we ever want to create system files from kernel, + * the generation argument to + * ocfs2_inode_lock_res_init() will have to change. + */ + BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, + OCFS2_LOCK_TYPE_META, 0, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, 0, inode); + } + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, + OCFS2_LOCK_TYPE_RW, inode->i_generation, + inode); + + ocfs2_set_inode_flags(inode); + + OCFS2_I(inode)->ip_last_used_slot = 0; + OCFS2_I(inode)->ip_last_used_group = 0; + + if (S_ISDIR(inode->i_mode)) + ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, + OCFS2_RESV_FLAG_DIR); +} + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args) +{ + struct super_block *sb; + struct ocfs2_super *osb; + struct ocfs2_dinode *fe; + struct buffer_head *bh = NULL; + int status, can_lock; + u32 generation = 0; + + status = -EINVAL; + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + /* + * To improve performance of cold-cache inode stats, we take + * the cluster lock here if possible. + * + * Generally, OCFS2 never trusts the contents of an inode + * unless it's holding a cluster lock, so taking it here isn't + * a correctness issue as much as it is a performance + * improvement. + * + * There are three times when taking the lock is not a good idea: + * + * 1) During startup, before we have initialized the DLM. + * + * 2) If we are reading certain system files which never get + * cluster locks (local alloc, truncate log). + * + * 3) If the process doing the iget() is responsible for + * orphan dir recovery. We're holding the orphan dir lock and + * can get into a deadlock with another process on another + * node in ->delete_inode(). + * + * #1 and #2 can be simply solved by never taking the lock + * here for system files (which are the only type we read + * during mount). It's a heavier approach, but our main + * concern is user-accessible files anyway. + * + * #3 works itself out because we'll eventually take the + * cluster lock before trusting anything anyway. + */ + can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) + && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) + && !ocfs2_mount_local(osb); + + trace_ocfs2_read_locked_inode( + (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); + + /* + * To maintain backwards compatibility with older versions of + * ocfs2-tools, we still store the generation value for system + * files. The only ones that actually matter to userspace are + * the journals, but it's easier and inexpensive to just flag + * all system files similarly. + */ + if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) + generation = osb->fs_generation; + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, + OCFS2_LOCK_TYPE_META, + generation, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, + 0, inode); + + if (can_lock) { + status = ocfs2_open_lock(inode); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } + status = ocfs2_inode_lock(inode, NULL, 0); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } + } + + if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { + status = ocfs2_try_open_lock(inode, 0); + if (status) { + make_bad_inode(inode); + return status; + } + } + + if (can_lock) { + status = ocfs2_read_inode_block_full(inode, &bh, + OCFS2_BH_IGNORE_CACHE); + } else { + status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) + status = ocfs2_validate_inode_block(osb->sb, bh); + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = -EINVAL; + fe = (struct ocfs2_dinode *) bh->b_data; + + /* + * This is a code bug. Right now the caller needs to + * understand whether it is asking for a system file inode or + * not so the proper lock names can be built. + */ + mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != + !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), + "Inode %llu: system file state is ambigous\n", + (unsigned long long)args->fi_blkno); + + if (S_ISCHR(le16_to_cpu(fe->i_mode)) || + S_ISBLK(le16_to_cpu(fe->i_mode))) + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + + ocfs2_populate_inode(inode, fe, 0); + + BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + + status = 0; + +bail: + if (can_lock) + ocfs2_inode_unlock(inode, 0); + + if (status < 0) + make_bad_inode(inode); + + brelse(bh); + + return status; +} + +void ocfs2_sync_blockdev(struct super_block *sb) +{ + sync_blockdev(sb->s_bdev); +} + +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh) +{ + int status = 0; + struct ocfs2_dinode *fe; + handle_t *handle = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* + * This check will also skip truncate of inodes with inline + * data and fast symlinks. + */ + if (fe->i_clusters) { + if (ocfs2_should_order_data(inode)) + ocfs2_begin_ordered_truncate(inode, 0); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + + i_size_write(inode, 0); + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } + + ocfs2_commit_trans(osb, handle); + handle = NULL; + + status = ocfs2_commit_truncate(osb, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } + } + +out: + if (handle) + ocfs2_commit_trans(osb, handle); + return status; +} + +static int ocfs2_remove_inode(struct inode *inode, + struct buffer_head *di_bh, + struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh) +{ + int status; + struct inode *inode_alloc_inode = NULL; + struct buffer_head *inode_alloc_bh = NULL; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + le16_to_cpu(di->i_suballoc_slot)); + if (!inode_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + + ocfs2_quota_trans_credits(inode->i_sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh, false); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + } + + /* set the inodes dtime */ + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); + ocfs2_journal_dirty(handle, di_bh); + + ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); + dquot_free_inode(inode); + + status = ocfs2_free_dinode(handle, inode_alloc_inode, + inode_alloc_bh, di); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(osb, handle); +bail_unlock: + ocfs2_inode_unlock(inode_alloc_inode, 1); + mutex_unlock(&inode_alloc_inode->i_mutex); + brelse(inode_alloc_bh); +bail: + iput(inode_alloc_inode); + + return status; +} + +/* + * Serialize with orphan dir recovery. If the process doing + * recovery on this orphan dir does an iget() with the dir + * i_mutex held, we'll deadlock here. Instead we detect this + * and exit early - recovery will wipe this inode for us. + */ +static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + + spin_lock(&osb->osb_lock); + if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { + ret = -EDEADLK; + goto out; + } + /* This signals to the orphan recovery process that it should + * wait for us to handle the wipe. */ + osb->osb_orphan_wipes[slot]++; +out: + spin_unlock(&osb->osb_lock); + trace_ocfs2_check_orphan_recovery_state(slot, ret); + return ret; +} + +static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + osb->osb_orphan_wipes[slot]--; + spin_unlock(&osb->osb_lock); + + wake_up(&osb->osb_wipe_event); +} + +static int ocfs2_wipe_inode(struct inode *inode, + struct buffer_head *di_bh) +{ + int status, orphaned_slot = -1; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + orphaned_slot = le16_to_cpu(di->i_orphaned_slot); + + status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); + if (status) + return status; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + orphaned_slot); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + /* Lock the orphan dir. The lock will be held for the entire + * delete_inode operation. We do this now to avoid races with + * recovery completion on other nodes. */ + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + + mlog_errno(status); + goto bail; + } + } + + /* we do this while holding the orphan dir lock because we + * don't want recovery being run from another node to try an + * inode delete underneath us -- this will result in two nodes + * truncating the same file! */ + status = ocfs2_truncate_for_delete(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + /* Remove any dir index tree */ + if (S_ISDIR(inode->i_mode)) { + status = ocfs2_dx_dir_truncate(inode, di_bh); + if (status) { + mlog_errno(status); + goto bail_unlock_dir; + } + } + + /*Free extended attribute resources associated with this inode.*/ + status = ocfs2_xattr_remove(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_refcount_tree(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, + orphan_dir_bh); + if (status < 0) + mlog_errno(status); + +bail_unlock_dir: + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR) + return status; + + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); +bail: + iput(orphan_dir_inode); + ocfs2_signal_wipe_completion(osb, orphaned_slot); + + return status; +} + +/* There is a series of simple checks that should be done before a + * trylock is even considered. Encapsulate those in this function. */ +static int ocfs2_inode_is_valid_to_delete(struct inode *inode) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task, + (unsigned long long)oi->ip_blkno, + oi->ip_flags); + + /* We shouldn't be getting here for the root directory + * inode.. */ + if (inode == osb->root_inode) { + mlog(ML_ERROR, "Skipping delete of root inode.\n"); + goto bail; + } + + /* + * If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!] so we cannot delete the inode. But + * since we dropped last inode ref when downconverting dentry lock, + * we cannot have the file open and thus the node doing unlink will + * take care of deleting the inode. + */ + if (current == osb->dc_task) + goto bail; + + spin_lock(&oi->ip_lock); + /* OCFS2 *never* deletes system files. This should technically + * never get here as system file inodes should always have a + * positive link count. */ + if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + mlog(ML_ERROR, "Skipping delete of system file %llu\n", + (unsigned long long)oi->ip_blkno); + goto bail_unlock; + } + + ret = 1; +bail_unlock: + spin_unlock(&oi->ip_lock); +bail: + return ret; +} + +/* Query the cluster to determine whether we should wipe an inode from + * disk or not. + * + * Requires the inode to have the cluster lock. */ +static int ocfs2_query_inode_wipe(struct inode *inode, + struct buffer_head *di_bh, + int *wipe) +{ + int status = 0, reason = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di; + + *wipe = 0; + + trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno, + inode->i_nlink); + + /* While we were waiting for the cluster lock in + * ocfs2_delete_inode, another node might have asked to delete + * the inode. Recheck our flags to catch this. */ + if (!ocfs2_inode_is_valid_to_delete(inode)) { + reason = 1; + goto bail; + } + + /* Now that we have an up to date inode, we can double check + * the link count. */ + if (inode->i_nlink) + goto bail; + + /* Do some basic inode verification... */ + di = (struct ocfs2_dinode *) di_bh->b_data; + if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) && + !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + /* + * Inodes in the orphan dir must have ORPHANED_FL. The only + * inodes that come back out of the orphan dir are reflink + * targets. A reflink target may be moved out of the orphan + * dir between the time we scan the directory and the time we + * process it. This would lead to HAS_REFCOUNT_FL being set but + * ORPHANED_FL not. + */ + if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { + reason = 2; + goto bail; + } + + /* for lack of a better error? */ + status = -EEXIST; + mlog(ML_ERROR, + "Inode %llu (on-disk %llu) not orphaned! " + "Disk flags 0x%x, inode flags 0x%x\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)le64_to_cpu(di->i_blkno), + le32_to_cpu(di->i_flags), oi->ip_flags); + goto bail; + } + + /* has someone already deleted us?! baaad... */ + if (di->i_dtime) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + /* + * This is how ocfs2 determines whether an inode is still live + * within the cluster. Every node takes a shared read lock on + * the inode open lock in ocfs2_read_locked_inode(). When we + * get to ->delete_inode(), each node tries to convert it's + * lock to an exclusive. Trylocks are serialized by the inode + * meta data lock. If the upconvert succeeds, we know the inode + * is no longer live and can be deleted. + * + * Though we call this with the meta data lock held, the + * trylock keeps us from ABBA deadlock. + */ + status = ocfs2_try_open_lock(inode, 1); + if (status == -EAGAIN) { + status = 0; + reason = 3; + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *wipe = 1; + trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot)); + +bail: + trace_ocfs2_query_inode_wipe_end(status, reason); + return status; +} + +/* Support function for ocfs2_delete_inode. Will help us keep the + * inode data in a consistent state for clear_inode. Always truncates + * pages, optionally sync's them first. */ +static void ocfs2_cleanup_delete_inode(struct inode *inode, + int sync_data) +{ + trace_ocfs2_cleanup_delete_inode( + (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); + if (sync_data) + filemap_write_and_wait(inode->i_mapping); + truncate_inode_pages_final(&inode->i_data); +} + +static void ocfs2_delete_inode(struct inode *inode) +{ + int wipe, status; + sigset_t oldset; + struct buffer_head *di_bh = NULL; + + trace_ocfs2_delete_inode(inode->i_ino, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + is_bad_inode(inode)); + + /* When we fail in read_inode() we mark inode as bad. The second test + * catches the case when inode allocation fails before allocating + * a block for inode. */ + if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) + goto bail; + + if (!ocfs2_inode_is_valid_to_delete(inode)) { + /* It's probably not necessary to truncate_inode_pages + * here but we do it for safety anyway (it will most + * likely be a no-op anyway) */ + ocfs2_cleanup_delete_inode(inode, 0); + goto bail; + } + + dquot_initialize(inode); + + /* We want to block signals in delete_inode as the lock and + * messaging paths may return us -ERESTARTSYS. Which would + * cause us to exit early, resulting in inodes being orphaned + * forever. */ + ocfs2_block_signals(&oldset); + + /* + * Synchronize us against ocfs2_get_dentry. We take this in + * shared mode so that all nodes can still concurrently + * process deletes. + */ + status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unblock; + } + /* Lock down the inode. This gives us an up to date view of + * it's metadata (for verification), and allows us to + * serialize delete_inode on multiple nodes. + * + * Even though we might be doing a truncate, we don't take the + * allocation lock here as it won't be needed - nobody will + * have the file open. + */ + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unlock_nfs_sync; + } + + /* Query the cluster. This will be the final decision made + * before we go ahead and wipe the inode. */ + status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); + if (!wipe || status < 0) { + /* Error and remote inode busy both mean we won't be + * removing the inode, so they take almost the same + * path. */ + if (status < 0) + mlog_errno(status); + + /* Someone in the cluster has disallowed a wipe of + * this inode, or it was never completely + * orphaned. Write out the pages and exit now. */ + ocfs2_cleanup_delete_inode(inode, 1); + goto bail_unlock_inode; + } + + ocfs2_cleanup_delete_inode(inode, 0); + + status = ocfs2_wipe_inode(inode, di_bh); + if (status < 0) { + if (status != -EDEADLK) + mlog_errno(status); + goto bail_unlock_inode; + } + + /* + * Mark the inode as successfully deleted. + * + * This is important for ocfs2_clear_inode() as it will check + * this flag and skip any checkpointing work + * + * ocfs2_stuff_meta_lvb() also uses this flag to invalidate + * the LVB for other nodes. + */ + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail_unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); + +bail_unblock: + ocfs2_unblock_signals(&oldset); +bail: + return; +} + +static void ocfs2_clear_inode(struct inode *inode) +{ + int status; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + clear_inode(inode); + trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, + inode->i_nlink); + + mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + "Inode=%lu\n", inode->i_ino); + + dquot_drop(inode); + + /* To preven remote deletes we hold open lock before, now it + * is time to unlock PR and EX open locks. */ + ocfs2_open_unlock(inode); + + /* Do these before all the other work so that we don't bounce + * the downconvert thread while waiting to destroy the locks. */ + ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); + + ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + &oi->ip_la_data_resv); + ocfs2_resv_init_once(&oi->ip_la_data_resv); + + /* We very well may get a clear_inode before all an inodes + * metadata has hit disk. Of course, we can't drop any cluster + * locks until the journal has finished with it. The only + * exception here are successfully wiped inodes - their + * metadata can now be considered to be part of the system + * inodes from which it came. */ + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + ocfs2_checkpoint_inode(inode); + + mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), + "Clear inode of %llu, inode has io markers\n", + (unsigned long long)oi->ip_blkno); + + ocfs2_extent_map_trunc(inode, 0); + + status = ocfs2_drop_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + ocfs2_lock_res_free(&oi->ip_rw_lockres); + ocfs2_lock_res_free(&oi->ip_inode_lockres); + ocfs2_lock_res_free(&oi->ip_open_lockres); + + ocfs2_metadata_cache_exit(INODE_CACHE(inode)); + + mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, + "Clear inode of %llu, inode has %u cache items\n", + (unsigned long long)oi->ip_blkno, + INODE_CACHE(inode)->ci_num_cached); + + mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), + "Clear inode of %llu, inode has a bad flag\n", + (unsigned long long)oi->ip_blkno); + + mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), + "Clear inode of %llu, inode is locked\n", + (unsigned long long)oi->ip_blkno); + + mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), + "Clear inode of %llu, io_mutex is locked\n", + (unsigned long long)oi->ip_blkno); + mutex_unlock(&oi->ip_io_mutex); + + /* + * down_trylock() returns 0, down_write_trylock() returns 1 + * kernel 1, world 0 + */ + mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), + "Clear inode of %llu, alloc_sem is locked\n", + (unsigned long long)oi->ip_blkno); + up_write(&oi->ip_alloc_sem); + + mlog_bug_on_msg(oi->ip_open_count, + "Clear inode of %llu has open count %d\n", + (unsigned long long)oi->ip_blkno, oi->ip_open_count); + + /* Clear all other flags. */ + oi->ip_flags = 0; + oi->ip_dir_start_lookup = 0; + oi->ip_blkno = 0ULL; + + /* + * ip_jinode is used to track txns against this inode. We ensure that + * the journal is flushed before journal shutdown. Thus it is safe to + * have inodes get cleaned up after journal shutdown. + */ + jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + &oi->ip_jinode); +} + +void ocfs2_evict_inode(struct inode *inode) +{ + if (!inode->i_nlink || + (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { + ocfs2_delete_inode(inode); + } else { + truncate_inode_pages_final(&inode->i_data); + } + ocfs2_clear_inode(inode); +} + +/* Called under inode_lock, with no more references on the + * struct inode, so it's safe here to check the flags field + * and to manipulate i_nlink without any other locks. */ +int ocfs2_drop_inode(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int res; + + trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, + inode->i_nlink, oi->ip_flags); + + if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) + res = 1; + else + res = generic_drop_inode(inode); + + return res; +} + +/* + * This is called from our getattr. + */ +int ocfs2_inode_revalidate(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int status = 0; + + trace_ocfs2_inode_revalidate(inode, + inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL, + inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0); + + if (!inode) { + status = -ENOENT; + goto bail; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* Let ocfs2_inode_lock do the work of updating our struct + * inode for us. */ + status = ocfs2_inode_lock(inode, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + ocfs2_inode_unlock(inode, 0); +bail: + return status; +} + +/* + * Updates a disk inode from a + * struct inode. + * Only takes ip_lock. + */ +int ocfs2_mark_inode_dirty(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + int status; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + + trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); + ocfs2_get_inode_flags(OCFS2_I(inode)); + fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); + fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + fe->i_size = cpu_to_le64(i_size_read(inode)); + ocfs2_set_links_count(fe, inode->i_nlink); + fe->i_uid = cpu_to_le32(i_uid_read(inode)); + fe->i_gid = cpu_to_le32(i_gid_read(inode)); + fe->i_mode = cpu_to_le16(inode->i_mode); + fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + ocfs2_journal_dirty(handle, bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); +leave: + return status; +} + +/* + * + * Updates a struct inode from a disk inode. + * does no i/o, only takes ip_lock. + */ +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe) +{ + spin_lock(&OCFS2_I(inode)->ip_lock); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); + OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); + ocfs2_set_inode_flags(inode); + i_size_write(inode, le64_to_cpu(fe->i_size)); + set_nlink(inode, ocfs2_read_links_count(fe)); + i_uid_write(inode, le32_to_cpu(fe->i_uid)); + i_gid_write(inode, le32_to_cpu(fe->i_gid)); + inode->i_mode = le16_to_cpu(fe->i_mode); + if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + /* + * Errors after here are fatal. + */ + + rc = -EINVAL; + + if (!OCFS2_IS_VALID_DINODE(di)) { + ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); + goto bail; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + goto bail; + } + + rc = 0; + +bail: + return rc; +} + +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, ocfs2_validate_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) +{ + return ocfs2_read_inode_block_full(inode, bh, 0); +} + + +static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->ip_blkno; +} + +static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->vfs_inode.i_sb; +} + +static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_lock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_lock(&oi->ip_io_mutex); +} + +static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_unlock(&oi->ip_io_mutex); +} + +const struct ocfs2_caching_operations ocfs2_inode_caching_ops = { + .co_owner = ocfs2_inode_cache_owner, + .co_get_super = ocfs2_inode_cache_get_super, + .co_cache_lock = ocfs2_inode_cache_lock, + .co_cache_unlock = ocfs2_inode_cache_unlock, + .co_io_lock = ocfs2_inode_cache_io_lock, + .co_io_unlock = ocfs2_inode_cache_io_unlock, +}; + diff --git a/kernel/fs/ocfs2/inode.h b/kernel/fs/ocfs2/inode.h new file mode 100644 index 000000000..5e86b247c --- /dev/null +++ b/kernel/fs/ocfs2/inode.h @@ -0,0 +1,190 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_INODE_H +#define OCFS2_INODE_H + +#include "extent_map.h" + +/* OCFS2 Inode Private Data */ +struct ocfs2_inode_info +{ + u64 ip_blkno; + + struct ocfs2_lock_res ip_rw_lockres; + struct ocfs2_lock_res ip_inode_lockres; + struct ocfs2_lock_res ip_open_lockres; + + /* protects allocation changes on this inode. */ + struct rw_semaphore ip_alloc_sem; + + /* protects extended attribute changes on this inode */ + struct rw_semaphore ip_xattr_sem; + + /* Number of outstanding AIO's which are not page aligned */ + struct mutex ip_unaligned_aio; + + /* These fields are protected by ip_lock */ + spinlock_t ip_lock; + u32 ip_open_count; + struct list_head ip_io_markers; + u32 ip_clusters; + + u16 ip_dyn_features; + struct mutex ip_io_mutex; + u32 ip_flags; /* see below */ + u32 ip_attr; /* inode attributes */ + + /* protected by recovery_lock. */ + struct inode *ip_next_orphan; + + struct ocfs2_caching_info ip_metadata_cache; + struct ocfs2_extent_map ip_extent_map; + struct inode vfs_inode; + struct jbd2_inode ip_jinode; + + u32 ip_dir_start_lookup; + + /* Only valid if the inode is the dir. */ + u32 ip_last_used_slot; + u64 ip_last_used_group; + u32 ip_dir_lock_gen; + + struct ocfs2_alloc_reservation ip_la_data_resv; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + + wait_queue_head_t append_dio_wq; + + struct dquot *i_dquot[MAXQUOTAS]; +}; + +/* + * Flags for the ip_flags field + */ +/* System file inodes */ +#define OCFS2_INODE_SYSTEM_FILE 0x00000001 +#define OCFS2_INODE_JOURNAL 0x00000002 +#define OCFS2_INODE_BITMAP 0x00000004 +/* This inode has been wiped from disk */ +#define OCFS2_INODE_DELETED 0x00000008 +/* Has the inode been orphaned on another node? + * + * This hints to ocfs2_drop_inode that it should clear i_nlink before + * continuing. + * + * We *only* set this on unlink vote from another node. If the inode + * was locally orphaned, then we're sure of the state and don't need + * to twiddle i_nlink later - it's either zero or not depending on + * whether our unlink succeeded. Otherwise we got this from a node + * whose intention was to orphan the inode, however he may have + * crashed, failed etc, so we let ocfs2_drop_inode zero the value and + * rely on ocfs2_delete_inode to sort things out under the proper + * cluster locks. + */ +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010 +/* Does someone have the file open O_DIRECT */ +#define OCFS2_INODE_OPEN_DIRECT 0x00000020 +/* Tell the inode wipe code it's not in orphan dir */ +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 + +static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) +{ + return container_of(inode, struct ocfs2_inode_info, vfs_inode); +} + +#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) +#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) + +extern struct kmem_cache *ocfs2_inode_cache; + +extern const struct address_space_operations ocfs2_aops; +extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; + +static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) +{ + return &OCFS2_I(inode)->ip_metadata_cache; +} + +void ocfs2_evict_inode(struct inode *inode); +int ocfs2_drop_inode(struct inode *inode); + +/* Flags for ocfs2_iget() */ +#define OCFS2_FI_FLAG_SYSFILE 0x1 +#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, + int sysfile_type); +int ocfs2_inode_init_private(struct inode *inode); +int ocfs2_inode_revalidate(struct dentry *dentry); +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino); +void ocfs2_read_inode(struct inode *inode); +void ocfs2_read_inode2(struct inode *inode, void *opaque); +ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, + size_t size, loff_t *offp); +void ocfs2_sync_blockdev(struct super_block *sb); +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe); +int ocfs2_mark_inode_dirty(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); +struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada); + +void ocfs2_set_inode_flags(struct inode *inode); +void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); + +static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) +{ + int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; + + return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits; +} + +/* Validate that a bh contains a valid inode */ +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +/* + * Read an inode block into *bh. If *bh is NULL, a bh will be allocated. + * This is a cached read. The inode will be validated with + * ocfs2_validate_inode_block(). + */ +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh); +/* The same, but can be passed OCFS2_BH_* flags */ +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags); + +static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); +} + +#endif /* OCFS2_INODE_H */ diff --git a/kernel/fs/ocfs2/ioctl.c b/kernel/fs/ocfs2/ioctl.c new file mode 100644 index 000000000..53e6c40ed --- /dev/null +++ b/kernel/fs/ocfs2/ioctl.c @@ -0,0 +1,1005 @@ +/* + * linux/fs/ocfs2/ioctl.c + * + * Copyright (C) 2006 Herbert Poetzl + * adapted from Remy Card's ext2/ioctl.c + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "journal.h" + +#include "ocfs2_fs.h" +#include "ioctl.h" +#include "resize.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "suballoc.h" +#include "move_extents.h" + +#define o2info_from_user(a, b) \ + copy_from_user(&(a), (b), sizeof(a)) +#define o2info_to_user(a, b) \ + copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) + +/* + * This is just a best-effort to tell userspace that this request + * caused the error. + */ +static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, + struct ocfs2_info_request __user *req) +{ + kreq->ir_flags |= OCFS2_INFO_FL_ERROR; + (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); +} + +static inline void o2info_set_request_filled(struct ocfs2_info_request *req) +{ + req->ir_flags |= OCFS2_INFO_FL_FILLED; +} + +static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) +{ + req->ir_flags &= ~OCFS2_INFO_FL_FILLED; +} + +static inline int o2info_coherent(struct ocfs2_info_request *req) +{ + return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); +} + +static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) +{ + int status; + + status = ocfs2_inode_lock(inode, NULL, 0); + if (status < 0) { + mlog_errno(status); + return status; + } + ocfs2_get_inode_flags(OCFS2_I(inode)); + *flags = OCFS2_I(inode)->ip_attr; + ocfs2_inode_unlock(inode, 0); + + return status; +} + +static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, + unsigned mask) +{ + struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + struct buffer_head *bh = NULL; + unsigned oldflags; + int status; + + mutex_lock(&inode->i_mutex); + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = -EACCES; + if (!inode_owner_or_capable(inode)) + goto bail_unlock; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~OCFS2_DIRSYNC_FL; + + oldflags = ocfs2_inode->ip_attr; + flags = flags & mask; + flags |= oldflags & ~mask; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + status = -EPERM; + if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & + (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto bail_unlock; + } + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + ocfs2_inode->ip_attr = flags; + ocfs2_set_inode_flags(inode); + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +bail_unlock: + ocfs2_inode_unlock(inode, 1); +bail: + mutex_unlock(&inode->i_mutex); + + brelse(bh); + + return status; +} + +static int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + struct ocfs2_info_blocksize oib; + + if (o2info_from_user(oib, req)) + return -EFAULT; + + oib.ib_blocksize = inode->i_sb->s_blocksize; + + o2info_set_request_filled(&oib.ib_req); + + if (o2info_to_user(oib, req)) + return -EFAULT; + + return 0; +} + +static int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + struct ocfs2_info_clustersize oic; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oic, req)) + return -EFAULT; + + oic.ic_clustersize = osb->s_clustersize; + + o2info_set_request_filled(&oic.ic_req); + + if (o2info_to_user(oic, req)) + return -EFAULT; + + return 0; +} + +static int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + struct ocfs2_info_maxslots oim; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oim, req)) + return -EFAULT; + + oim.im_max_slots = osb->max_slots; + + o2info_set_request_filled(&oim.im_req); + + if (o2info_to_user(oim, req)) + return -EFAULT; + + return 0; +} + +static int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + struct ocfs2_info_label oil; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oil, req)) + return -EFAULT; + + memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); + + o2info_set_request_filled(&oil.il_req); + + if (o2info_to_user(oil, req)) + return -EFAULT; + + return 0; +} + +static int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + struct ocfs2_info_uuid oiu; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oiu, req)) + return -EFAULT; + + memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); + + o2info_set_request_filled(&oiu.iu_req); + + if (o2info_to_user(oiu, req)) + return -EFAULT; + + return 0; +} + +static int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + struct ocfs2_info_fs_features oif; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oif, req)) + return -EFAULT; + + oif.if_compat_features = osb->s_feature_compat; + oif.if_incompat_features = osb->s_feature_incompat; + oif.if_ro_compat_features = osb->s_feature_ro_compat; + + o2info_set_request_filled(&oif.if_req); + + if (o2info_to_user(oif, req)) + return -EFAULT; + + return 0; +} + +static int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + struct ocfs2_info_journal_size oij; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oij, req)) + return -EFAULT; + + oij.ij_journal_size = i_size_read(osb->journal->j_inode); + + o2info_set_request_filled(&oij.ij_req); + + if (o2info_to_user(oij, req)) + return -EFAULT; + + return 0; +} + +static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, + u32 slot) +{ + int status = 0, unlock = 0; + + struct buffer_head *bh = NULL; + struct ocfs2_dinode *dinode_alloc = NULL; + + if (inode_alloc) + mutex_lock(&inode_alloc->i_mutex); + + if (o2info_coherent(&fi->ifi_req)) { + status = ocfs2_inode_lock(inode_alloc, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + dinode_alloc = (struct ocfs2_dinode *)bh->b_data; + + fi->ifi_stat[slot].lfi_total = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); + fi->ifi_stat[slot].lfi_free = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - + le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); + +bail: + if (unlock) + ocfs2_inode_unlock(inode_alloc, 0); + + if (inode_alloc) + mutex_unlock(&inode_alloc->i_mutex); + + brelse(bh); + + return status; +} + +static int ocfs2_info_handle_freeinode(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u32 i; + u64 blkno = -1; + char namebuf[40]; + int status, type = INODE_ALLOC_SYSTEM_INODE; + struct ocfs2_info_freeinode *oifi = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *inode_alloc = NULL; + + oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); + if (!oifi) { + status = -ENOMEM; + mlog_errno(status); + goto out_err; + } + + if (o2info_from_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } + + oifi->ifi_slotnum = osb->max_slots; + + for (i = 0; i < oifi->ifi_slotnum; i++) { + if (o2info_coherent(&oifi->ifi_req)) { + inode_alloc = ocfs2_get_system_file_inode(osb, type, i); + if (!inode_alloc) { + mlog(ML_ERROR, "unable to get alloc inode in " + "slot %u\n", i); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); + + iput(inode_alloc); + inode_alloc = NULL; + + if (status < 0) + goto bail; + } + + o2info_set_request_filled(&oifi->ifi_req); + + if (o2info_to_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } + + status = 0; +bail: + if (status) + o2info_set_request_error(&oifi->ifi_req, req); +out_free: + kfree(oifi); +out_err: + return status; +} + +static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, + unsigned int chunksize) +{ + int index; + + index = __ilog2_u32(chunksize); + if (index >= OCFS2_INFO_MAX_HIST) + index = OCFS2_INFO_MAX_HIST - 1; + + hist->fc_chunks[index]++; + hist->fc_clusters[index] += chunksize; +} + +static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, + unsigned int chunksize) +{ + if (chunksize > stats->ffs_max) + stats->ffs_max = chunksize; + + if (chunksize < stats->ffs_min) + stats->ffs_min = chunksize; + + stats->ffs_avg += chunksize; + stats->ffs_free_chunks_real++; +} + +static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) +{ + o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); + o2ffg_update_stats(&(ffg->iff_ffs), chunksize); +} + +static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) +{ + int status = 0, used; + u64 blkno; + + struct buffer_head *bh = NULL; + struct ocfs2_group_desc *bg = NULL; + + unsigned int max_bits, num_clusters; + unsigned int offset = 0, cluster, chunk; + unsigned int chunk_free, last_chunksize = 0; + + if (!le32_to_cpu(rec->c_free)) + goto bail; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (bh) { + brelse(bh); + bh = NULL; + } + + if (o2info_coherent(&ffg->iff_req)) + status = ocfs2_read_group_descriptor(gb_inode, + gb_dinode, + blkno, &bh); + else + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + + if (status < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # " + "%llu from device.", (unsigned long long)blkno); + status = -EIO; + goto bail; + } + + bg = (struct ocfs2_group_desc *)bh->b_data; + + if (!le16_to_cpu(bg->bg_free_bits_count)) + continue; + + max_bits = le16_to_cpu(bg->bg_bits); + offset = 0; + + for (chunk = 0; chunk < chunks_in_group; chunk++) { + /* + * last chunk may be not an entire one. + */ + if ((offset + ffg->iff_chunksize) > max_bits) + num_clusters = max_bits - offset; + else + num_clusters = ffg->iff_chunksize; + + chunk_free = 0; + for (cluster = 0; cluster < num_clusters; cluster++) { + used = ocfs2_test_bit(offset, + (unsigned long *)bg->bg_bitmap); + /* + * - chunk_free counts free clusters in #N chunk. + * - last_chunksize records the size(in) clusters + * for the last real free chunk being counted. + */ + if (!used) { + last_chunksize++; + chunk_free++; + } + + if (used && last_chunksize) { + ocfs2_info_update_ffg(ffg, + last_chunksize); + last_chunksize = 0; + } + + offset++; + } + + if (chunk_free == ffg->iff_chunksize) + ffg->iff_ffs.ffs_free_chunks++; + } + + /* + * need to update the info for last free chunk. + */ + if (last_chunksize) + ocfs2_info_update_ffg(ffg, last_chunksize); + + } while (le64_to_cpu(bg->bg_next_group)); + +bail: + brelse(bh); + + return status; +} + +static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) +{ + u32 chunks_in_group; + int status = 0, unlock = 0, i; + + struct buffer_head *bh = NULL; + struct ocfs2_chain_list *cl = NULL; + struct ocfs2_chain_rec *rec = NULL; + struct ocfs2_dinode *gb_dinode = NULL; + + if (gb_inode) + mutex_lock(&gb_inode->i_mutex); + + if (o2info_coherent(&ffg->iff_req)) { + status = ocfs2_inode_lock(gb_inode, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + gb_dinode = (struct ocfs2_dinode *)bh->b_data; + cl = &(gb_dinode->id2.i_chain); + + /* + * Chunksize(in) clusters from userspace should be + * less than clusters in a group. + */ + if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { + status = -EINVAL; + goto bail; + } + + memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); + + ffg->iff_ffs.ffs_min = ~0U; + ffg->iff_ffs.ffs_clusters = + le32_to_cpu(gb_dinode->id1.bitmap1.i_total); + ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - + le32_to_cpu(gb_dinode->id1.bitmap1.i_used); + + chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + rec = &(cl->cl_recs[i]); + status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, + gb_dinode, + rec, ffg, + chunks_in_group); + if (status) + goto bail; + } + + if (ffg->iff_ffs.ffs_free_chunks_real) + ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / + ffg->iff_ffs.ffs_free_chunks_real); +bail: + if (unlock) + ocfs2_inode_unlock(gb_inode, 0); + + if (gb_inode) + mutex_unlock(&gb_inode->i_mutex); + + if (gb_inode) + iput(gb_inode); + + brelse(bh); + + return status; +} + +static int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u64 blkno = -1; + char namebuf[40]; + int status, type = GLOBAL_BITMAP_SYSTEM_INODE; + + struct ocfs2_info_freefrag *oiff; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *gb_inode = NULL; + + oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); + if (!oiff) { + status = -ENOMEM; + mlog_errno(status); + goto out_err; + } + + if (o2info_from_user(*oiff, req)) { + status = -EFAULT; + goto out_free; + } + /* + * chunksize from userspace should be power of 2. + */ + if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || + (!oiff->iff_chunksize)) { + status = -EINVAL; + goto bail; + } + + if (o2info_coherent(&oiff->iff_req)) { + gb_inode = ocfs2_get_system_file_inode(osb, type, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, + OCFS2_INVALID_SLOT); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); + if (status < 0) + goto bail; + + o2info_set_request_filled(&oiff->iff_req); + + if (o2info_to_user(*oiff, req)) { + status = -EFAULT; + goto out_free; + } + + status = 0; +bail: + if (status) + o2info_set_request_error(&oiff->iff_req, req); +out_free: + kfree(oiff); +out_err: + return status; +} + +static int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + return -EFAULT; + + o2info_clear_request_filled(&oir); + + if (o2info_to_user(oir, req)) + return -EFAULT; + + return 0; +} + +/* + * Validate and distinguish OCFS2_IOC_INFO requests. + * + * - validate the magic number. + * - distinguish different requests. + * - validate size of different requests. + */ +static int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + status = -EINVAL; + if (oir.ir_magic != OCFS2_INFO_MAGIC) + goto bail; + + switch (oir.ir_code) { + case OCFS2_INFO_BLOCKSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) + status = ocfs2_info_handle_blocksize(inode, req); + break; + case OCFS2_INFO_CLUSTERSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) + status = ocfs2_info_handle_clustersize(inode, req); + break; + case OCFS2_INFO_MAXSLOTS: + if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) + status = ocfs2_info_handle_maxslots(inode, req); + break; + case OCFS2_INFO_LABEL: + if (oir.ir_size == sizeof(struct ocfs2_info_label)) + status = ocfs2_info_handle_label(inode, req); + break; + case OCFS2_INFO_UUID: + if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) + status = ocfs2_info_handle_uuid(inode, req); + break; + case OCFS2_INFO_FS_FEATURES: + if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) + status = ocfs2_info_handle_fs_features(inode, req); + break; + case OCFS2_INFO_JOURNAL_SIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) + status = ocfs2_info_handle_journal_size(inode, req); + break; + case OCFS2_INFO_FREEINODE: + if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) + status = ocfs2_info_handle_freeinode(inode, req); + break; + case OCFS2_INFO_FREEFRAG: + if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) + status = ocfs2_info_handle_freefrag(inode, req); + break; + default: + status = ocfs2_info_handle_unknown(inode, req); + break; + } + +bail: + return status; +} + +static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) +{ + int status = -EFAULT; + u64 __user *bp = NULL; + + if (compat_flag) { +#ifdef CONFIG_COMPAT + /* + * pointer bp stores the base address of a pointers array, + * which collects all addresses of separate request. + */ + bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); +#else + BUG(); +#endif + } else + bp = (u64 __user *)(unsigned long)(info->oi_requests); + + if (o2info_from_user(*req_addr, bp + idx)) + goto bail; + + status = 0; +bail: + return status; +} + +/* + * OCFS2_IOC_INFO handles an array of requests passed from userspace. + * + * ocfs2_info_handle() recevies a large info aggregation, grab and + * validate the request count from header, then break it into small + * pieces, later specific handlers can handle them one by one. + * + * Idea here is to make each separate request small enough to ensure + * a better backward&forward compatibility, since a small piece of + * request will be less likely to be broken if disk layout get changed. + */ +static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) +{ + int i, status = 0; + u64 req_addr; + struct ocfs2_info_request __user *reqp; + + if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || + (!info->oi_requests)) { + status = -EINVAL; + goto bail; + } + + for (i = 0; i < info->oi_count; i++) { + + status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); + if (status) + break; + + reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr; + if (!reqp) { + status = -EINVAL; + goto bail; + } + + status = ocfs2_info_handle_request(inode, reqp); + if (status) + break; + } + +bail: + return status; +} + +long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + unsigned int flags; + int new_clusters; + int status; + struct ocfs2_space_resv sr; + struct ocfs2_new_group_input input; + struct reflink_arguments args; + const char __user *old_path; + const char __user *new_path; + bool preserve; + struct ocfs2_info info; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case OCFS2_IOC_GETFLAGS: + status = ocfs2_get_inode_attr(inode, &flags); + if (status < 0) + return status; + + flags &= OCFS2_FL_VISIBLE; + return put_user(flags, (int __user *) arg); + case OCFS2_IOC_SETFLAGS: + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_set_inode_attr(inode, flags, + OCFS2_FL_MODIFIABLE); + mnt_drop_write_file(filp); + return status; + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) + return -EFAULT; + + return ocfs2_change_file_space(filp, cmd, &sr); + case OCFS2_IOC_GROUP_EXTEND: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (get_user(new_clusters, (int __user *)arg)) + return -EFAULT; + + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_group_extend(inode, new_clusters); + mnt_drop_write_file(filp); + return status; + case OCFS2_IOC_GROUP_ADD: + case OCFS2_IOC_GROUP_ADD64: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (copy_from_user(&input, (int __user *) arg, sizeof(input))) + return -EFAULT; + + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_group_add(inode, &input); + mnt_drop_write_file(filp); + return status; + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + old_path = (const char __user *)(unsigned long)args.old_path; + new_path = (const char __user *)(unsigned long)args.new_path; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 0); + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, argp, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen); + ret = ocfs2_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &range, sizeof(range))) + return -EFAULT; + + return 0; + } + case OCFS2_IOC_MOVE_EXT: + return ocfs2_ioctl_move_extents(filp, argp); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + bool preserve; + struct reflink_arguments args; + struct inode *inode = file_inode(file); + struct ocfs2_info info; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case OCFS2_IOC32_GETFLAGS: + cmd = OCFS2_IOC_GETFLAGS; + break; + case OCFS2_IOC32_SETFLAGS: + cmd = OCFS2_IOC_SETFLAGS; + break; + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + case OCFS2_IOC_GROUP_EXTEND: + case OCFS2_IOC_GROUP_ADD: + case OCFS2_IOC_GROUP_ADD64: + case FITRIM: + break; + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), + compat_ptr(args.new_path), preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 1); + case OCFS2_IOC_MOVE_EXT: + break; + default: + return -ENOIOCTLCMD; + } + + return ocfs2_ioctl(file, cmd, arg); +} +#endif diff --git a/kernel/fs/ocfs2/ioctl.h b/kernel/fs/ocfs2/ioctl.h new file mode 100644 index 000000000..0cd5323bd --- /dev/null +++ b/kernel/fs/ocfs2/ioctl.h @@ -0,0 +1,16 @@ +/* + * ioctl.h + * + * Function prototypes + * + * Copyright (C) 2006 Herbert Poetzl + * + */ + +#ifndef OCFS2_IOCTL_PROTO_H +#define OCFS2_IOCTL_PROTO_H + +long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); + +#endif /* OCFS2_IOCTL_PROTO_H */ diff --git a/kernel/fs/ocfs2/journal.c b/kernel/fs/ocfs2/journal.c new file mode 100644 index 000000000..ff5319282 --- /dev/null +++ b/kernel/fs/ocfs2/journal.c @@ -0,0 +1,2323 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.c + * + * Defines functions of journalling api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "quota.h" +#include "file.h" +#include "namei.h" + +#include "buffer_head_io.h" +#include "ocfs2_trace.h" + +DEFINE_SPINLOCK(trans_inc_lock); + +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000 + +static int ocfs2_force_read_journal(struct inode *inode); +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num, int slot_num); +static int __ocfs2_recovery_thread(void *arg); +static int ocfs2_commit_cache(struct ocfs2_super *osb); +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota); +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty, int replayed); +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num); +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type); +static int ocfs2_commit_thread(void *arg); +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type); + +static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 0); +} + +static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 1); +} + +/* + * This replay_map is to track online/offline slots, so we could recover + * offline slots during recovery and mount + */ + +enum ocfs2_replay_state { + REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */ + REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */ + REPLAY_DONE /* Replay was already queued */ +}; + +struct ocfs2_replay_map { + unsigned int rm_slots; + enum ocfs2_replay_state rm_state; + unsigned char rm_replay_slots[0]; +}; + +void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) +{ + if (!osb->replay_map) + return; + + /* If we've already queued the replay, we don't have any more to do */ + if (osb->replay_map->rm_state == REPLAY_DONE) + return; + + osb->replay_map->rm_state = state; +} + +int ocfs2_compute_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map; + int i, node_num; + + /* If replay map is already set, we don't do it again */ + if (osb->replay_map) + return 0; + + replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + + (osb->max_slots * sizeof(char)), GFP_KERNEL); + + if (!replay_map) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + spin_lock(&osb->osb_lock); + + replay_map->rm_slots = osb->max_slots; + replay_map->rm_state = REPLAY_UNNEEDED; + + /* set rm_replay_slots for offline slot(s) */ + for (i = 0; i < replay_map->rm_slots; i++) { + if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT) + replay_map->rm_replay_slots[i] = 1; + } + + osb->replay_map = replay_map; + spin_unlock(&osb->osb_lock); + return 0; +} + +void ocfs2_queue_replay_slots(struct ocfs2_super *osb, + enum ocfs2_orphan_reco_type orphan_reco_type) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + int i; + + if (!replay_map) + return; + + if (replay_map->rm_state != REPLAY_NEEDED) + return; + + for (i = 0; i < replay_map->rm_slots; i++) + if (replay_map->rm_replay_slots[i]) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, + NULL, NULL, + orphan_reco_type); + replay_map->rm_state = REPLAY_DONE; +} + +void ocfs2_free_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + + if (!osb->replay_map) + return; + + kfree(replay_map); + osb->replay_map = NULL; +} + +int ocfs2_recovery_init(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + mutex_init(&osb->recovery_lock); + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + init_waitqueue_head(&osb->recovery_event); + + rm = kzalloc(sizeof(struct ocfs2_recovery_map) + + osb->max_slots * sizeof(unsigned int), + GFP_KERNEL); + if (!rm) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + rm->rm_entries = (unsigned int *)((char *)rm + + sizeof(struct ocfs2_recovery_map)); + osb->recovery_map = rm; + + return 0; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + mutex_lock(&osb->recovery_lock); + osb->disable_recovery = 1; + mutex_unlock(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. */ + flush_workqueue(ocfs2_wq); + + /* + * Now that recovery is shut down, and the osb is about to be + * freed, the osb_lock is not taken here. + */ + rm = osb->recovery_map; + /* XXX: Should we bug if there are dirty entries? */ + + kfree(rm); +} + +static int __ocfs2_recovery_map_test(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + assert_spin_locked(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + return 1; + } + + return 0; +} + +/* Behaves like test-and-set. Returns the previous value */ +static int ocfs2_recovery_map_set(struct ocfs2_super *osb, + unsigned int node_num) +{ + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + return 1; + } + + /* XXX: Can this be exploited? Not from o2dlm... */ + BUG_ON(rm->rm_used >= osb->max_slots); + + rm->rm_entries[rm->rm_used] = node_num; + rm->rm_used++; + spin_unlock(&osb->osb_lock); + + return 0; +} + +static void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + break; + } + + if (i < rm->rm_used) { + /* XXX: be careful with the pointer math */ + memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]), + (rm->rm_used - i - 1) * sizeof(unsigned int)); + rm->rm_used--; + } + + spin_unlock(&osb->osb_lock); +} + +static int ocfs2_commit_cache(struct ocfs2_super *osb) +{ + int status = 0; + unsigned int flushed; + struct ocfs2_journal *journal = NULL; + + journal = osb->journal; + + /* Flush all pending commits and checkpoint the journal. */ + down_write(&journal->j_trans_barrier); + + flushed = atomic_read(&journal->j_num_trans); + trace_ocfs2_commit_cache_begin(flushed); + if (flushed == 0) { + up_write(&journal->j_trans_barrier); + goto finally; + } + + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + up_write(&journal->j_trans_barrier); + mlog_errno(status); + goto finally; + } + + ocfs2_inc_trans_id(journal); + + flushed = atomic_read(&journal->j_num_trans); + atomic_set(&journal->j_num_trans, 0); + up_write(&journal->j_trans_barrier); + + trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed); + + ocfs2_wake_downconvert_thread(osb); + wake_up(&journal->j_checkpointed); +finally: + return status; +} + +handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) +{ + journal_t *journal = osb->journal->j_journal; + handle_t *handle; + + BUG_ON(!osb || !osb->journal->j_journal); + + if (ocfs2_is_hard_readonly(osb)) + return ERR_PTR(-EROFS); + + BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); + BUG_ON(max_buffs <= 0); + + /* Nested transaction? Just return the handle... */ + if (journal_current_handle()) + return jbd2_journal_start(journal, max_buffs); + + sb_start_intwrite(osb->sb); + + down_read(&osb->journal->j_trans_barrier); + + handle = jbd2_journal_start(journal, max_buffs); + if (IS_ERR(handle)) { + up_read(&osb->journal->j_trans_barrier); + sb_end_intwrite(osb->sb); + + mlog_errno(PTR_ERR(handle)); + + if (is_journal_aborted(journal)) { + ocfs2_abort(osb->sb, "Detected aborted journal"); + handle = ERR_PTR(-EROFS); + } + } else { + if (!ocfs2_mount_local(osb)) + atomic_inc(&(osb->journal->j_num_trans)); + } + + return handle; +} + +int ocfs2_commit_trans(struct ocfs2_super *osb, + handle_t *handle) +{ + int ret, nested; + struct ocfs2_journal *journal = osb->journal; + + BUG_ON(!handle); + + nested = handle->h_ref > 1; + ret = jbd2_journal_stop(handle); + if (ret < 0) + mlog_errno(ret); + + if (!nested) { + up_read(&journal->j_trans_barrier); + sb_end_intwrite(osb->sb); + } + + return ret; +} + +/* + * 'nblocks' is what you want to add to the current transaction. + * + * This might call jbd2_journal_restart() which will commit dirty buffers + * and then restart the transaction. Before calling + * ocfs2_extend_trans(), any changed blocks should have been + * dirtied. After calling it, all blocks which need to be changed must + * go through another set of journal_access/journal_dirty calls. + * + * WARNING: This will not release any semaphores or disk locks taken + * during the transaction, so make sure they were taken *before* + * start_trans or we'll have ordering deadlocks. + * + * WARNING2: Note that we do *not* drop j_trans_barrier here. This is + * good because transaction ids haven't yet been recorded on the + * cluster locks associated with this handle. + */ +int ocfs2_extend_trans(handle_t *handle, int nblocks) +{ + int status, old_nblocks; + + BUG_ON(!handle); + BUG_ON(nblocks < 0); + + if (!nblocks) + return 0; + + old_nblocks = handle->h_buffer_credits; + + trace_ocfs2_extend_trans(old_nblocks, nblocks); + +#ifdef CONFIG_OCFS2_DEBUG_FS + status = 1; +#else + status = jbd2_journal_extend(handle, nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } +#endif + + if (status > 0) { + trace_ocfs2_extend_trans_restart(old_nblocks + nblocks); + status = jbd2_journal_restart(handle, + old_nblocks + nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + return status; +} + +/* + * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for metadata modifications. + * Taken from Ext4: extend_or_restart_transaction() + */ +int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) +{ + int status, old_nblks; + + BUG_ON(!handle); + + old_nblks = handle->h_buffer_credits; + trace_ocfs2_allocate_extend_trans(old_nblks, thresh); + + if (old_nblks < thresh) + return 0; + + status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (status > 0) { + status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) + mlog_errno(status); + } + +bail: + return status; +} + + +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; +}; + +static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) +{ + return container_of(triggers, struct ocfs2_triggers, ot_triggers); +} + +static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, data + ot->ot_offset); +} + +/* + * Quota blocks have their own trigger because the struct ocfs2_block_check + * offset depends on the blocksize. + */ +static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &dqt->dq_check); +} + +/* + * Directory blocks also have their own trigger because the + * struct ocfs2_block_check offset depends on the blocksize. + */ +static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_dir_block_trailer *trailer = + ocfs2_dir_trailer_from_size(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &trailer->db_check); +} + +static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh) +{ + mlog(ML_ERROR, + "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " + "bh->b_blocknr = %llu\n", + (unsigned long)bh, + (unsigned long long)bh->b_blocknr); + + /* We aren't guaranteed to have the superblock here - but if we + * don't, it'll just crash. */ + ocfs2_error(bh->b_assoc_map->host->i_sb, + "JBD2 has aborted our journal, ocfs2 cannot continue\n"); +} + +static struct ocfs2_triggers di_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dinode, i_check), +}; + +static struct ocfs2_triggers eb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_extent_block, h_check), +}; + +static struct ocfs2_triggers rb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), +}; + +static struct ocfs2_triggers gd_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), +}; + +static struct ocfs2_triggers db_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_db_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static struct ocfs2_triggers xb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), +}; + +static struct ocfs2_triggers dq_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_dq_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static struct ocfs2_triggers dr_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), +}; + +static struct ocfs2_triggers dl_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), +}; + +static int __ocfs2_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, + struct ocfs2_triggers *triggers, + int type) +{ + int status; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + BUG_ON(!ci || !ci->ci_ops); + BUG_ON(!handle); + BUG_ON(!bh); + + trace_ocfs2_journal_access( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr, type, bh->b_size); + + /* we can safely remove this assertion after testing. */ + if (!buffer_uptodate(bh)) { + mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); + mlog(ML_ERROR, "b_blocknr=%llu\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + /* Set the current transaction information on the ci so + * that the locking code knows whether it can drop it's locks + * on this ci or not. We're protected from the commit + * thread updating the current transaction id until + * ocfs2_commit_trans() because ocfs2_start_trans() took + * j_trans_barrier for us. */ + ocfs2_set_ci_lock_trans(osb->journal, ci); + + ocfs2_metadata_cache_io_lock(ci); + switch (type) { + case OCFS2_JOURNAL_ACCESS_CREATE: + case OCFS2_JOURNAL_ACCESS_WRITE: + status = jbd2_journal_get_write_access(handle, bh); + break; + + case OCFS2_JOURNAL_ACCESS_UNDO: + status = jbd2_journal_get_undo_access(handle, bh); + break; + + default: + status = -EINVAL; + mlog(ML_ERROR, "Unknown access type!\n"); + } + if (!status && ocfs2_meta_ecc(osb) && triggers) + jbd2_journal_set_triggers(bh, &triggers->ot_triggers); + ocfs2_metadata_cache_io_unlock(ci); + + if (status < 0) + mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", + status, type); + + return status; +} + +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); +} + +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); +} + +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + type); +} + +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); +} + +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); +} + +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); +} + +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); +} + +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); +} + +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); +} + +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, NULL, type); +} + +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) +{ + int status; + + trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); + + status = jbd2_journal_dirty_metadata(handle, bh); + BUG_ON(status); +} + +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) + +void ocfs2_set_journal_params(struct ocfs2_super *osb) +{ + journal_t *journal = osb->journal->j_journal; + unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + + if (osb->osb_commit_interval) + commit_interval = osb->osb_commit_interval; + + write_lock(&journal->j_state_lock); + journal->j_commit_interval = commit_interval; + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + write_unlock(&journal->j_state_lock); +} + +int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t *j_journal = NULL; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_super *osb; + int inode_lock = 0; + + BUG_ON(!journal); + + osb = journal->j_osb; + + /* already have the inode for our journal */ + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + osb->slot_num); + if (inode == NULL) { + status = -EACCES; + mlog_errno(status); + goto done; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto done; + } + + SET_INODE_JOURNAL(inode); + OCFS2_I(inode)->ip_open_count++; + + /* Skip recovery waits here - journal inode metadata never + * changes in a live cluster so it can be considered an + * exception to the rule. */ + status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not get lock on journal!\n"); + goto done; + } + + inode_lock = 1; + di = (struct ocfs2_dinode *)bh->b_data; + + if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) { + mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", + i_size_read(inode)); + status = -EINVAL; + goto done; + } + + trace_ocfs2_journal_init(i_size_read(inode), + (unsigned long long)inode->i_blocks, + OCFS2_I(inode)->ip_clusters); + + /* call the kernels journal init function now */ + j_journal = jbd2_journal_init_inode(inode); + if (j_journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EINVAL; + goto done; + } + + trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); + + *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL); + + journal->j_journal = j_journal; + journal->j_inode = inode; + journal->j_bh = bh; + + ocfs2_set_journal_params(osb); + + journal->j_state = OCFS2_JOURNAL_LOADED; + + status = 0; +done: + if (status < 0) { + if (inode_lock) + ocfs2_inode_unlock(inode, 1); + brelse(bh); + if (inode) { + OCFS2_I(inode)->ip_open_count--; + iput(inode); + } + } + + return status; +} + +static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di) +{ + le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1); +} + +static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di) +{ + return le32_to_cpu(di->id1.journal1.ij_recovery_generation); +} + +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty, int replayed) +{ + int status; + unsigned int flags; + struct ocfs2_journal *journal = osb->journal; + struct buffer_head *bh = journal->j_bh; + struct ocfs2_dinode *fe; + + fe = (struct ocfs2_dinode *)bh->b_data; + + /* The journal bh on the osb always comes from ocfs2_journal_init() + * and was validated there inside ocfs2_inode_lock_full(). It's a + * code bug if we mess it up. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + if (dirty) + flags |= OCFS2_JOURNAL_DIRTY_FL; + else + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + if (replayed) + ocfs2_bump_recovery_generation(fe); + + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); + status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode)); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * If the journal has been kmalloc'd it needs to be freed after this + * call. + */ +void ocfs2_journal_shutdown(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = NULL; + int status = 0; + struct inode *inode = NULL; + int num_running_trans = 0; + + BUG_ON(!osb); + + journal = osb->journal; + if (!journal) + goto done; + + inode = journal->j_inode; + + if (journal->j_state != OCFS2_JOURNAL_LOADED) + goto done; + + /* need to inc inode use count - jbd2_journal_destroy will iput. */ + if (!igrab(inode)) + BUG(); + + num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + trace_ocfs2_journal_shutdown(num_running_trans); + + /* Do a commit_cache here. It will flush our journal, *and* + * release any locks that are still held. + * set the SHUTDOWN flag and release the trans lock. + * the commit thread will take the trans lock for us below. */ + journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; + + /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not + * drop the trans_lock (which we want to hold until we + * completely destroy the journal. */ + if (osb->commit_task) { + /* Wait for the commit thread */ + trace_ocfs2_journal_shutdown_wait(osb->commit_task); + kthread_stop(osb->commit_task); + osb->commit_task = NULL; + } + + BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + + if (ocfs2_mount_local(osb)) { + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + + if (status == 0) { + /* + * Do not toggle if flush was unsuccessful otherwise + * will leave dirty metadata in a "clean" journal + */ + status = ocfs2_journal_toggle_dirty(osb, 0, 0); + if (status < 0) + mlog_errno(status); + } + + /* Shutdown the kernel journal system */ + jbd2_journal_destroy(journal->j_journal); + journal->j_journal = NULL; + + OCFS2_I(inode)->ip_open_count--; + + /* unlock our journal */ + ocfs2_inode_unlock(inode, 1); + + brelse(journal->j_bh); + journal->j_bh = NULL; + + journal->j_state = OCFS2_JOURNAL_FREE; + +// up_write(&journal->j_trans_barrier); +done: + if (inode) + iput(inode); +} + +static void ocfs2_clear_journal_error(struct super_block *sb, + journal_t *journal, + int slot) +{ + int olderr; + + olderr = jbd2_journal_errno(journal); + if (olderr) { + mlog(ML_ERROR, "File system error %d recorded in " + "journal %u.\n", olderr, slot); + mlog(ML_ERROR, "File system on device %s needs checking.\n", + sb->s_id); + + jbd2_journal_ack_err(journal); + jbd2_journal_clear_err(journal); + } +} + +int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) +{ + int status = 0; + struct ocfs2_super *osb; + + BUG_ON(!journal); + + osb = journal->j_osb; + + status = jbd2_journal_load(journal->j_journal); + if (status < 0) { + mlog(ML_ERROR, "Failed to load journal!\n"); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* Launch the commit thread */ + if (!local) { + osb->commit_task = kthread_run(ocfs2_commit_thread, osb, + "ocfs2cmt"); + if (IS_ERR(osb->commit_task)) { + status = PTR_ERR(osb->commit_task); + osb->commit_task = NULL; + mlog(ML_ERROR, "unable to launch ocfs2commit thread, " + "error=%d", status); + goto done; + } + } else + osb->commit_task = NULL; + +done: + return status; +} + + +/* 'full' flag tells us whether we clear out all blocks or if we just + * mark the journal clean */ +int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) +{ + int status; + + BUG_ON(!journal); + + status = jbd2_journal_wipe(journal->j_journal, full); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +static int ocfs2_recovery_completed(struct ocfs2_super *osb) +{ + int empty; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + empty = (rm->rm_used == 0); + spin_unlock(&osb->osb_lock); + + return empty; +} + +void ocfs2_wait_for_recovery(struct ocfs2_super *osb) +{ + wait_event(osb->recovery_event, ocfs2_recovery_completed(osb)); +} + +/* + * JBD Might read a cached version of another nodes journal file. We + * don't want this as this file changes often and we get no + * notification on those changes. The only way to be sure that we've + * got the most up to date version of those blocks then is to force + * read them off disk. Just searching through the buffer cache won't + * work as there may be pages backing this file which are still marked + * up to date. We know things can't change on this file underneath us + * as we have the lock by now :) + */ +static int ocfs2_force_read_journal(struct inode *inode) +{ + int status = 0; + int i; + u64 v_blkno, p_blkno, p_blocks, num_blocks; +#define CONCURRENT_JOURNAL_FILL 32ULL + struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; + + memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + v_blkno = 0; + while (v_blkno < num_blocks) { + status = ocfs2_extent_map_get_blocks(inode, v_blkno, + &p_blkno, &p_blocks, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (p_blocks > CONCURRENT_JOURNAL_FILL) + p_blocks = CONCURRENT_JOURNAL_FILL; + + /* We are reading journal data which should not + * be put in the uptodate cache */ + status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = 0; i < p_blocks; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + + v_blkno += p_blocks; + } + +bail: + for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) + brelse(bhs[i]); + return status; +} + +struct ocfs2_la_recovery_item { + struct list_head lri_list; + int lri_slot; + struct ocfs2_dinode *lri_la_dinode; + struct ocfs2_dinode *lri_tl_dinode; + struct ocfs2_quota_recovery *lri_qrec; + enum ocfs2_orphan_reco_type lri_orphan_reco_type; +}; + +/* Does the second half of the recovery process. By this point, the + * node is marked clean and can actually be considered recovered, + * hence it's no longer in the recovery map, but there's still some + * cleanup we can do which shouldn't happen within the recovery thread + * as locking in that context becomes very difficult if we are to take + * recovering nodes into account. + * + * NOTE: This function can and will sleep on recovery of other nodes + * during cluster locking, just like any other ocfs2 process. + */ +void ocfs2_complete_recovery(struct work_struct *work) +{ + int ret = 0; + struct ocfs2_journal *journal = + container_of(work, struct ocfs2_journal, j_recovery_work); + struct ocfs2_super *osb = journal->j_osb; + struct ocfs2_dinode *la_dinode, *tl_dinode; + struct ocfs2_la_recovery_item *item, *n; + struct ocfs2_quota_recovery *qrec; + enum ocfs2_orphan_reco_type orphan_reco_type; + LIST_HEAD(tmp_la_list); + + trace_ocfs2_complete_recovery( + (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno); + + spin_lock(&journal->j_lock); + list_splice_init(&journal->j_la_cleanups, &tmp_la_list); + spin_unlock(&journal->j_lock); + + list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { + list_del_init(&item->lri_list); + + ocfs2_wait_on_quotas(osb); + + la_dinode = item->lri_la_dinode; + tl_dinode = item->lri_tl_dinode; + qrec = item->lri_qrec; + orphan_reco_type = item->lri_orphan_reco_type; + + trace_ocfs2_complete_recovery_slot(item->lri_slot, + la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, + tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0, + qrec); + + if (la_dinode) { + ret = ocfs2_complete_local_alloc_recovery(osb, + la_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(la_dinode); + } + + if (tl_dinode) { + ret = ocfs2_complete_truncate_log_recovery(osb, + tl_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(tl_dinode); + } + + ret = ocfs2_recover_orphans(osb, item->lri_slot, + orphan_reco_type); + if (ret < 0) + mlog_errno(ret); + + if (qrec) { + ret = ocfs2_finish_quota_recovery(osb, qrec, + item->lri_slot); + if (ret < 0) + mlog_errno(ret); + /* Recovery info is already freed now */ + } + + kfree(item); + } + + trace_ocfs2_complete_recovery_end(ret); +} + +/* NOTE: This function always eats your references to la_dinode and + * tl_dinode, either manually on error, or by passing them to + * ocfs2_complete_recovery */ +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type) +{ + struct ocfs2_la_recovery_item *item; + + item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS); + if (!item) { + /* Though we wish to avoid it, we are in fact safe in + * skipping local alloc cleanup as fsck.ocfs2 is more + * than capable of reclaiming unused space. */ + kfree(la_dinode); + kfree(tl_dinode); + + if (qrec) + ocfs2_free_quota_recovery(qrec); + + mlog_errno(-ENOMEM); + return; + } + + INIT_LIST_HEAD(&item->lri_list); + item->lri_la_dinode = la_dinode; + item->lri_slot = slot_num; + item->lri_tl_dinode = tl_dinode; + item->lri_qrec = qrec; + item->lri_orphan_reco_type = orphan_reco_type; + + spin_lock(&journal->j_lock); + list_add_tail(&item->lri_list, &journal->j_la_cleanups); + queue_work(ocfs2_wq, &journal->j_recovery_work); + spin_unlock(&journal->j_lock); +} + +/* Called by the mount code to queue recovery the last part of + * recovery for it's own and offline slot(s). */ +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = osb->journal; + + if (ocfs2_is_hard_readonly(osb)) + return; + + /* No need to queue up our truncate_log as regular cleanup will catch + * that */ + ocfs2_queue_recovery_completion(journal, osb->slot_num, + osb->local_alloc_copy, NULL, NULL, + ORPHAN_NEED_TRUNCATE); + ocfs2_schedule_truncate_log_flush(osb, 0); + + osb->local_alloc_copy = NULL; + osb->dirty = 0; + + /* queue to recover orphan slots for all offline slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); + ocfs2_free_replay_slots(osb); +} + +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) +{ + if (osb->quota_rec) { + ocfs2_queue_recovery_completion(osb->journal, + osb->slot_num, + NULL, + NULL, + osb->quota_rec, + ORPHAN_NEED_TRUNCATE); + osb->quota_rec = NULL; + } +} + +static int __ocfs2_recovery_thread(void *arg) +{ + int status, node_num, slot_num; + struct ocfs2_super *osb = arg; + struct ocfs2_recovery_map *rm = osb->recovery_map; + int *rm_quota = NULL; + int rm_quota_used = 0, i; + struct ocfs2_quota_recovery *qrec; + + status = ocfs2_wait_on_mount(osb); + if (status < 0) { + goto bail; + } + + rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } +restart: + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_compute_replay_slots(osb); + if (status < 0) + mlog_errno(status); + + /* queue recovery for our own slot */ + ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, + NULL, NULL, ORPHAN_NO_NEED_TRUNCATE); + + spin_lock(&osb->osb_lock); + while (rm->rm_used) { + /* It's always safe to remove entry zero, as we won't + * clear it until ocfs2_recover_node() has succeeded. */ + node_num = rm->rm_entries[0]; + spin_unlock(&osb->osb_lock); + slot_num = ocfs2_node_num_to_slot(osb, node_num); + trace_ocfs2_recovery_thread_node(node_num, slot_num); + if (slot_num == -ENOENT) { + status = 0; + goto skip_recovery; + } + + /* It is a bit subtle with quota recovery. We cannot do it + * immediately because we have to obtain cluster locks from + * quota files and we also don't want to just skip it because + * then quota usage would be out of sync until some node takes + * the slot. So we remember which nodes need quota recovery + * and when everything else is done, we recover quotas. */ + for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + + status = ocfs2_recover_node(osb, node_num, slot_num); +skip_recovery: + if (!status) { + ocfs2_recovery_map_clear(osb, node_num); + } else { + mlog(ML_ERROR, + "Error %d recovering node %d on device (%u,%u)!\n", + status, node_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + mlog(ML_ERROR, "Volume requires unmount.\n"); + } + + spin_lock(&osb->osb_lock); + } + spin_unlock(&osb->osb_lock); + trace_ocfs2_recovery_thread_end(status); + + /* Refresh all journal recovery generations from disk */ + status = ocfs2_check_journals_nolocks(osb); + status = (status == -EROFS) ? 0 : status; + if (status < 0) + mlog_errno(status); + + /* Now it is right time to recover quotas... We have to do this under + * superblock lock so that no one can start using the slot (and crash) + * before we recover it */ + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); + } + + ocfs2_super_unlock(osb, 1); + + /* queue recovery for offline slots */ + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); + +bail: + mutex_lock(&osb->recovery_lock); + if (!status && !ocfs2_recovery_completed(osb)) { + mutex_unlock(&osb->recovery_lock); + goto restart; + } + + ocfs2_free_replay_slots(osb); + osb->recovery_thread_task = NULL; + mb(); /* sync with ocfs2_recovery_thread_running */ + wake_up(&osb->recovery_event); + + mutex_unlock(&osb->recovery_lock); + + kfree(rm_quota); + + /* no one is callint kthread_stop() for us so the kthread() api + * requires that we call do_exit(). And it isn't exported, but + * complete_and_exit() seems to be a minimal wrapper around it. */ + complete_and_exit(NULL, status); +} + +void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) +{ + mutex_lock(&osb->recovery_lock); + + trace_ocfs2_recovery_thread(node_num, osb->node_num, + osb->disable_recovery, osb->recovery_thread_task, + osb->disable_recovery ? + -1 : ocfs2_recovery_map_set(osb, node_num)); + + if (osb->disable_recovery) + goto out; + + if (osb->recovery_thread_task) + goto out; + + osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, + "ocfs2rec"); + if (IS_ERR(osb->recovery_thread_task)) { + mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); + osb->recovery_thread_task = NULL; + } + +out: + mutex_unlock(&osb->recovery_lock); + wake_up(&osb->recovery_event); +} + +static int ocfs2_read_journal_inode(struct ocfs2_super *osb, + int slot_num, + struct buffer_head **bh, + struct inode **ret_inode) +{ + int status = -EACCES; + struct inode *inode = NULL; + + BUG_ON(slot_num >= osb->max_slots); + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (!inode || is_bad_inode(inode)) { + mlog_errno(status); + goto bail; + } + SET_INODE_JOURNAL(inode); + + status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; + +bail: + if (inode) { + if (status || !ret_inode) + iput(inode); + else + *ret_inode = inode; + } + return status; +} + +/* Does the actual journal replay and marks the journal inode as + * clean. Will only replay if the journal inode is marked dirty. */ +static int ocfs2_replay_journal(struct ocfs2_super *osb, + int node_num, + int slot_num) +{ + int status; + int got_lock = 0; + unsigned int flags; + struct inode *inode = NULL; + struct ocfs2_dinode *fe; + journal_t *journal = NULL; + struct buffer_head *bh = NULL; + u32 slot_reco_gen; + + status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode); + if (status) { + mlog_errno(status); + goto done; + } + + fe = (struct ocfs2_dinode *)bh->b_data; + slot_reco_gen = ocfs2_get_recovery_generation(fe); + brelse(bh); + bh = NULL; + + /* + * As the fs recovery is asynchronous, there is a small chance that + * another node mounted (and recovered) the slot before the recovery + * thread could get the lock. To handle that, we dirty read the journal + * inode for that slot to get the recovery generation. If it is + * different than what we expected, the slot has been recovered. + * If not, it needs recovery. + */ + if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { + trace_ocfs2_replay_journal_recovered(slot_num, + osb->slot_recovery_generations[slot_num], slot_reco_gen); + osb->slot_recovery_generations[slot_num] = slot_reco_gen; + status = -EBUSY; + goto done; + } + + /* Continue with recovery as the journal has not yet been recovered */ + + status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + trace_ocfs2_replay_journal_lock_err(status); + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not lock journal!\n"); + goto done; + } + got_lock = 1; + + fe = (struct ocfs2_dinode *) bh->b_data; + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + slot_reco_gen = ocfs2_get_recovery_generation(fe); + + if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { + trace_ocfs2_replay_journal_skip(node_num); + /* Refresh recovery generation for the slot */ + osb->slot_recovery_generations[slot_num] = slot_reco_gen; + goto done; + } + + /* we need to run complete recovery for offline orphan slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + + printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\ + "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), + MINOR(osb->sb->s_dev)); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + + status = ocfs2_force_read_journal(inode); + if (status < 0) { + mlog_errno(status); + goto done; + } + + journal = jbd2_journal_init_inode(inode); + if (journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EIO; + goto done; + } + + status = jbd2_journal_load(journal); + if (status < 0) { + mlog_errno(status); + if (!igrab(inode)) + BUG(); + jbd2_journal_destroy(journal); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal, slot_num); + + /* wipe the journal */ + jbd2_journal_lock_updates(journal); + status = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + if (status < 0) + mlog_errno(status); + + /* This will mark the node clean */ + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + /* Increment recovery generation to indicate successful recovery */ + ocfs2_bump_recovery_generation(fe); + osb->slot_recovery_generations[slot_num] = + ocfs2_get_recovery_generation(fe); + + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); + if (status < 0) + mlog_errno(status); + + if (!igrab(inode)) + BUG(); + + jbd2_journal_destroy(journal); + + printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\ + "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), + MINOR(osb->sb->s_dev)); +done: + /* drop the lock on this nodes journal */ + if (got_lock) + ocfs2_inode_unlock(inode, 1); + + if (inode) + iput(inode); + + brelse(bh); + + return status; +} + +/* + * Do the most important parts of node recovery: + * - Replay it's journal + * - Stamp a clean local allocator file + * - Stamp a clean truncate log + * - Mark the node clean + * + * If this function completes without error, a node in OCFS2 can be + * said to have been safely recovered. As a result, failure during the + * second part of a nodes recovery process (local alloc recovery) is + * far less concerning. + */ +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num, int slot_num) +{ + int status = 0; + struct ocfs2_dinode *la_copy = NULL; + struct ocfs2_dinode *tl_copy = NULL; + + trace_ocfs2_recover_node(node_num, slot_num, osb->node_num); + + /* Should not ever be called to recover ourselves -- in that + * case we should've called ocfs2_journal_load instead. */ + BUG_ON(osb->node_num == node_num); + + status = ocfs2_replay_journal(osb, node_num, slot_num); + if (status < 0) { + if (status == -EBUSY) { + trace_ocfs2_recover_node_skip(slot_num, node_num); + status = 0; + goto done; + } + mlog_errno(status); + goto done; + } + + /* Stamp a clean local alloc file AFTER recovering the journal... */ + status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* An error from begin_truncate_log_recovery is not + * serious enough to warrant halting the rest of + * recovery. */ + status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); + if (status < 0) + mlog_errno(status); + + /* Likewise, this would be a strange but ultimately not so + * harmful place to get an error... */ + status = ocfs2_clear_slot(osb, slot_num); + if (status < 0) + mlog_errno(status); + + /* This will kfree the memory pointed to by la_copy and tl_copy */ + ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, + tl_copy, NULL, ORPHAN_NEED_TRUNCATE); + + status = 0; +done: + + return status; +} + +/* Test node liveness by trylocking his journal. If we get the lock, + * we drop it here. Return 0 if we got the lock, -EAGAIN if node is + * still alive (we couldn't get the lock) and < 0 on error. */ +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num) +{ + int status, flags; + struct inode *inode = NULL; + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (inode == NULL) { + mlog(ML_ERROR, "access error\n"); + status = -EACCES; + goto bail; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto bail; + } + SET_INODE_JOURNAL(inode); + + flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; + status = ocfs2_inode_lock_full(inode, NULL, 1, flags); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto bail; + } + + ocfs2_inode_unlock(inode, 1); +bail: + if (inode) + iput(inode); + + return status; +} + +/* Call this underneath ocfs2_super_lock. It also assumes that the + * slot info struct has been updated from disk. */ +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) +{ + unsigned int node_num; + int status, i; + u32 gen; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *di; + + /* This is called with the super block cluster lock, so we + * know that the slot map can't change underneath us. */ + + for (i = 0; i < osb->max_slots; i++) { + /* Read journal inode to get the recovery generation */ + status = ocfs2_read_journal_inode(osb, i, &bh, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *)bh->b_data; + gen = ocfs2_get_recovery_generation(di); + brelse(bh); + bh = NULL; + + spin_lock(&osb->osb_lock); + osb->slot_recovery_generations[i] = gen; + + trace_ocfs2_mark_dead_nodes(i, + osb->slot_recovery_generations[i]); + + if (i == osb->slot_num) { + spin_unlock(&osb->osb_lock); + continue; + } + + status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); + if (status == -ENOENT) { + spin_unlock(&osb->osb_lock); + continue; + } + + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + continue; + } + spin_unlock(&osb->osb_lock); + + /* Ok, we have a slot occupied by another node which + * is not in the recovery map. We trylock his journal + * file here to test if he's alive. */ + status = ocfs2_trylock_journal(osb, i); + if (!status) { + /* Since we're called from mount, we know that + * the recovery thread can't race us on + * setting / checking the recovery bits. */ + ocfs2_recovery_thread(osb, node_num); + } else if ((status < 0) && (status != -EAGAIN)) { + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + return status; +} + +/* + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some + * randomness to the timeout to minimize multple nodes firing the timer at the + * same time. + */ +static inline unsigned long ocfs2_orphan_scan_timeout(void) +{ + unsigned long time; + + get_random_bytes(&time, sizeof(time)); + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000); + return msecs_to_jiffies(time); +} + +/* + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for + * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This + * is done to catch any orphans that are left over in orphan directories. + * + * It scans all slots, even ones that are in use. It does so to handle the + * case described below: + * + * Node 1 has an inode it was using. The dentry went away due to memory + * pressure. Node 1 closes the inode, but it's on the free list. The node + * has the open lock. + * Node 2 unlinks the inode. It grabs the dentry lock to notify others, + * but node 1 has no dentry and doesn't get the message. It trylocks the + * open lock, sees that another node has a PR, and does nothing. + * Later node 2 runs its orphan dir. It igets the inode, trylocks the + * open lock, sees the PR still, and does nothing. + * Basically, we have to trigger an orphan iput on node 1. The only way + * for this to happen is if node 1 runs node 2's orphan dir. + * + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT + * seconds. It gets an EX lock on os_lockres and checks sequence number + * stored in LVB. If the sequence number has changed, it means some other + * node has done the scan. This node skips the scan and tracks the + * sequence number. If the sequence number didn't change, it means a scan + * hasn't happened. The node queues a scan and increments the + * sequence number in the LVB. + */ +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + int status, i; + u32 seqno = 0; + + os = &osb->osb_orphan_scan; + + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + goto out; + + trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno, + atomic_read(&os->os_state)); + + status = ocfs2_orphan_scan_lock(osb, &seqno); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto out; + } + + /* Do no queue the tasks if the volume is being umounted */ + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + goto unlock; + + if (os->os_seqno != seqno) { + os->os_seqno = seqno; + goto unlock; + } + + for (i = 0; i < osb->max_slots; i++) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, + NULL, ORPHAN_NO_NEED_TRUNCATE); + /* + * We queued a recovery on orphan slots, increment the sequence + * number and update LVB so other node will skip the scan for a while + */ + seqno++; + os->os_count++; + os->os_scantime = CURRENT_TIME; +unlock: + ocfs2_orphan_scan_unlock(osb, seqno); +out: + trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno, + atomic_read(&os->os_state)); + return; +} + +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ +void ocfs2_orphan_scan_work(struct work_struct *work) +{ + struct ocfs2_orphan_scan *os; + struct ocfs2_super *osb; + + os = container_of(work, struct ocfs2_orphan_scan, + os_orphan_scan_work.work); + osb = os->os_osb; + + mutex_lock(&os->os_lock); + ocfs2_queue_orphan_scan(osb); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + mutex_unlock(&os->os_lock); +} + +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) { + atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); + mutex_lock(&os->os_lock); + cancel_delayed_work(&os->os_orphan_scan_work); + mutex_unlock(&os->os_lock); + } +} + +void ocfs2_orphan_scan_init(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_osb = osb; + os->os_count = 0; + os->os_seqno = 0; + mutex_init(&os->os_lock); + INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); +} + +void ocfs2_orphan_scan_start(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_scantime = CURRENT_TIME; + if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) + atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); + else { + atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + } +} + +struct ocfs2_orphan_filldir_priv { + struct dir_context ctx; + struct inode *head; + struct ocfs2_super *osb; +}; + +static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, + int name_len, loff_t pos, u64 ino, + unsigned type) +{ + struct ocfs2_orphan_filldir_priv *p = + container_of(ctx, struct ocfs2_orphan_filldir_priv, ctx); + struct inode *iter; + + if (name_len == 1 && !strncmp(".", name, 1)) + return 0; + if (name_len == 2 && !strncmp("..", name, 2)) + return 0; + + /* Skip bad inodes so that recovery can continue */ + iter = ocfs2_iget(p->osb, ino, + OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); + if (IS_ERR(iter)) + return 0; + + /* Skip inodes which are already added to recover list, since dio may + * happen concurrently with unlink/rename */ + if (OCFS2_I(iter)->ip_next_orphan) { + iput(iter); + return 0; + } + + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); + /* No locking is required for the next_orphan queue as there + * is only ever a single process doing orphan recovery. */ + OCFS2_I(iter)->ip_next_orphan = p->head; + p->head = iter; + + return 0; +} + +static int ocfs2_queue_orphans(struct ocfs2_super *osb, + int slot, + struct inode **head) +{ + int status; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_orphan_filldir_priv priv = { + .ctx.actor = ocfs2_orphan_filldir, + .osb = osb, + .head = *head + }; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + slot); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + return status; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx); + if (status) { + mlog_errno(status); + goto out_cluster; + } + + *head = priv.head; + +out_cluster: + ocfs2_inode_unlock(orphan_dir_inode, 0); +out: + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + return status; +} + +static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb, + int slot) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = !osb->osb_orphan_wipes[slot]; + spin_unlock(&osb->osb_lock); + return ret; +} + +static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + /* Mark ourselves such that new processes in delete_inode() + * know to quit early. */ + ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot); + while (osb->osb_orphan_wipes[slot]) { + /* If any processes are already in the middle of an + * orphan wipe on this dir, then we need to wait for + * them. */ + spin_unlock(&osb->osb_lock); + wait_event_interruptible(osb->osb_wipe_event, + ocfs2_orphan_recovery_can_continue(osb, slot)); + spin_lock(&osb->osb_lock); + } + spin_unlock(&osb->osb_lock); +} + +static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot); +} + +/* + * Orphan recovery. Each mounted node has it's own orphan dir which we + * must run during recovery. Our strategy here is to build a list of + * the inodes in the orphan dir and iget/iput them. The VFS does + * (most) of the rest of the work. + * + * Orphan recovery can happen at any time, not just mount so we have a + * couple of extra considerations. + * + * - We grab as many inodes as we can under the orphan dir lock - + * doing iget() outside the orphan dir risks getting a reference on + * an invalid inode. + * - We must be sure not to deadlock with other processes on the + * system wanting to run delete_inode(). This can happen when they go + * to lock the orphan dir and the orphan recovery process attempts to + * iget() inside the orphan dir lock. This can be avoided by + * advertising our state to ocfs2_delete_inode(). + */ +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type) +{ + int ret = 0; + struct inode *inode = NULL; + struct inode *iter; + struct ocfs2_inode_info *oi; + + trace_ocfs2_recover_orphans(slot); + + ocfs2_mark_recovering_orphan_dir(osb, slot); + ret = ocfs2_queue_orphans(osb, slot, &inode); + ocfs2_clear_recovering_orphan_dir(osb, slot); + + /* Error here should be noted, but we want to continue with as + * many queued inodes as we've got. */ + if (ret) + mlog_errno(ret); + + while (inode) { + oi = OCFS2_I(inode); + trace_ocfs2_recover_orphans_iput( + (unsigned long long)oi->ip_blkno); + + iter = oi->ip_next_orphan; + oi->ip_next_orphan = NULL; + + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + mlog_errno(ret); + goto next; + } + ocfs2_inode_unlock(inode, 0); + + if (inode->i_nlink == 0) { + spin_lock(&oi->ip_lock); + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { + struct buffer_head *di_bh = NULL; + + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto next; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + ocfs2_rw_unlock(inode, 1); + mlog_errno(ret); + goto next; + } + + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); + brelse(di_bh); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto next; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + if (ret) + mlog_errno(ret); + + wake_up(&OCFS2_I(inode)->append_dio_wq); + } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ + +next: + iput(inode); + + inode = iter; + } + + return ret; +} + +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota) +{ + /* This check is good because ocfs2 will wait on our recovery + * thread before changing it to something other than MOUNTED + * or DISABLED. */ + wait_event(osb->osb_mount_event, + (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) || + atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS || + atomic_read(&osb->vol_state) == VOLUME_DISABLED); + + /* If there's an error on mount, then we may never get to the + * MOUNTED flag, but this is set right before + * dismount_volume() so we can trust it. */ + if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { + trace_ocfs2_wait_on_mount(VOLUME_DISABLED); + mlog(0, "mount error, exiting!\n"); + return -EBUSY; + } + + return 0; +} + +static int ocfs2_commit_thread(void *arg) +{ + int status; + struct ocfs2_super *osb = arg; + struct ocfs2_journal *journal = osb->journal; + + /* we can trust j_num_trans here because _should_stop() is only set in + * shutdown and nobody other than ourselves should be able to start + * transactions. committing on shutdown might take a few iterations + * as final transactions put deleted inodes on the list */ + while (!(kthread_should_stop() && + atomic_read(&journal->j_num_trans) == 0)) { + + wait_event_interruptible(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop()); + + status = ocfs2_commit_cache(osb); + if (status < 0) { + static unsigned long abort_warn_time; + + /* Warn about this once per minute */ + if (printk_timed_ratelimit(&abort_warn_time, 60*HZ)) + mlog(ML_ERROR, "status = %d, journal is " + "already aborted.\n", status); + /* + * After ocfs2_commit_cache() fails, j_num_trans has a + * non-zero value. Sleep here to avoid a busy-wait + * loop. + */ + msleep_interruptible(1000); + } + + if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ + mlog(ML_KTHREAD, + "commit_thread: %u transactions pending on " + "shutdown\n", + atomic_read(&journal->j_num_trans)); + } + } + + return 0; +} + +/* Reads all the journal inodes without taking any cluster locks. Used + * for hard readonly access to determine whether any journal requires + * recovery. Also used to refresh the recovery generation numbers after + * a journal has been recovered by another node. + */ +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) +{ + int ret = 0; + unsigned int slot; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int journal_dirty = 0; + + for(slot = 0; slot < osb->max_slots; slot++) { + ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + + osb->slot_recovery_generations[slot] = + ocfs2_get_recovery_generation(di); + + if (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL) + journal_dirty = 1; + + brelse(di_bh); + di_bh = NULL; + } + +out: + if (journal_dirty) + ret = -EROFS; + return ret; +} diff --git a/kernel/fs/ocfs2/journal.h b/kernel/fs/ocfs2/journal.h new file mode 100644 index 000000000..f4cd3c3e9 --- /dev/null +++ b/kernel/fs/ocfs2/journal.h @@ -0,0 +1,645 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.h + * + * Defines journalling api and structures. + * + * Copyright (C) 2003, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_JOURNAL_H +#define OCFS2_JOURNAL_H + +#include +#include + +enum ocfs2_journal_state { + OCFS2_JOURNAL_FREE = 0, + OCFS2_JOURNAL_LOADED, + OCFS2_JOURNAL_IN_SHUTDOWN, +}; + +struct ocfs2_super; +struct ocfs2_dinode; + +/* + * The recovery_list is a simple linked list of node numbers to recover. + * It is protected by the recovery_lock. + */ + +struct ocfs2_recovery_map { + unsigned int rm_used; + unsigned int *rm_entries; +}; + + +struct ocfs2_journal { + enum ocfs2_journal_state j_state; /* Journals current state */ + + journal_t *j_journal; /* The kernels journal type */ + struct inode *j_inode; /* Kernel inode pointing to + * this journal */ + struct ocfs2_super *j_osb; /* pointer to the super + * block for the node + * we're currently + * running on -- not + * necessarily the super + * block from the node + * which we usually run + * from (recovery, + * etc) */ + struct buffer_head *j_bh; /* Journal disk inode block */ + atomic_t j_num_trans; /* Number of transactions + * currently in the system. */ + spinlock_t j_lock; + unsigned long j_trans_id; + struct rw_semaphore j_trans_barrier; + wait_queue_head_t j_checkpointed; + + /* both fields protected by j_lock*/ + struct list_head j_la_cleanups; + struct work_struct j_recovery_work; +}; + +extern spinlock_t trans_inc_lock; + +/* wrap j_trans_id so we never have it equal to zero. */ +static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) +{ + unsigned long old_id; + spin_lock(&trans_inc_lock); + old_id = j->j_trans_id++; + if (unlikely(!j->j_trans_id)) + j->j_trans_id = 1; + spin_unlock(&trans_inc_lock); + return old_id; +} + +static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal, + struct ocfs2_caching_info *ci) +{ + spin_lock(&trans_inc_lock); + ci->ci_last_trans = journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +/* Used to figure out whether it's safe to drop a metadata lock on an + * cached object. Returns true if all the object's changes have been + * checkpointed to disk. You should be holding the spinlock on the + * metadata lock while calling this to be sure that nobody can take + * the lock and put it on another transaction. */ +static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci) +{ + int ret; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; + + spin_lock(&trans_inc_lock); + ret = time_after(journal->j_trans_id, ci->ci_last_trans); + spin_unlock(&trans_inc_lock); + return ret; +} + +/* convenience function to check if an object backed by struct + * ocfs2_caching_info is still new (has never hit disk) Will do you a + * favor and set created_trans = 0 when you've + * been checkpointed. returns '1' if the ci is still new. */ +static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci) +{ + int ret; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; + + spin_lock(&trans_inc_lock); + ret = !(time_after(journal->j_trans_id, ci->ci_created_trans)); + if (!ret) + ci->ci_created_trans = 0; + spin_unlock(&trans_inc_lock); + return ret; +} + +/* Wrapper for inodes so we can check system files */ +static inline int ocfs2_inode_is_new(struct inode *inode) +{ + /* System files are never "new" as they're written out by + * mkfs. This helps us early during mount, before we have the + * journal open and j_trans_id could be junk. */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + return 0; + + return ocfs2_ci_is_new(INODE_CACHE(inode)); +} + +static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, + struct ocfs2_caching_info *ci) +{ + spin_lock(&trans_inc_lock); + ci->ci_created_trans = osb->journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +/* Exported only for the journal struct init code in super.c. Do not call. */ +void ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_start(struct ocfs2_super *osb); +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); + +void ocfs2_complete_recovery(struct work_struct *work); +void ocfs2_wait_for_recovery(struct ocfs2_super *osb); + +int ocfs2_recovery_init(struct ocfs2_super *osb); +void ocfs2_recovery_exit(struct ocfs2_super *osb); + +int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +/* + * Journal Control: + * Initialize, Load, Shutdown, Wipe a journal. + * + * ocfs2_journal_init - Initialize journal structures in the OSB. + * ocfs2_journal_load - Load the given journal off disk. Replay it if + * there's transactions still in there. + * ocfs2_journal_shutdown - Shutdown a journal, this will flush all + * uncommitted, uncheckpointed transactions. + * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally + * zero out each block. + * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb. + * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat + * event on. + * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. + */ +void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_init(struct ocfs2_journal *journal, + int *dirty); +void ocfs2_journal_shutdown(struct ocfs2_super *osb); +int ocfs2_journal_wipe(struct ocfs2_journal *journal, + int full); +int ocfs2_journal_load(struct ocfs2_journal *journal, int local, + int replayed); +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); +void ocfs2_recovery_thread(struct ocfs2_super *osb, + int node_num); +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); + +static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) +{ + wake_up(&osb->checkpoint_event); +} + +static inline void ocfs2_checkpoint_inode(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_mount_local(osb)) + return; + + if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) { + /* WARNING: This only kicks off a single + * checkpoint. If someone races you and adds more + * metadata to the journal, you won't know, and will + * wind up waiting *a lot* longer than necessary. Right + * now we only use this in clear_inode so that's + * OK. */ + ocfs2_start_checkpoint(osb); + + wait_event(osb->journal->j_checkpointed, + ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))); + } +} + +/* + * Transaction Handling: + * Manage the lifetime of a transaction handle. + * + * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of + * the number of blocks that will be changed during + * this handle. + * ocfs2_commit_trans - Complete a handle. It might return -EIO if + * the journal was aborted. The majority of paths don't + * check the return value as an error there comes too + * late to do anything (and will be picked up in a + * later transaction). + * ocfs2_extend_trans - Extend a handle by nblocks credits. This may + * commit the handle to disk in the process, but will + * not release any locks taken during the transaction. + * ocfs2_journal_access* - Notify the handle that we want to journal this + * buffer. Will have to call ocfs2_journal_dirty once + * we've actually dirtied it. Type is one of . or . + * Always call the specific flavor of + * ocfs2_journal_access_*() unless you intend to + * manage the checksum by hand. + * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. + * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before + * the current handle commits. + */ + +/* You must always start_trans with a number of buffs > 0, but it's + * perfectly legal to go through an entire transaction without having + * dirtied any buffers. */ +handle_t *ocfs2_start_trans(struct ocfs2_super *osb, + int max_buffs); +int ocfs2_commit_trans(struct ocfs2_super *osb, + handle_t *handle); +int ocfs2_extend_trans(handle_t *handle, int nblocks); +int ocfs2_allocate_extend_trans(handle_t *handle, + int thresh); + +/* + * Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * fallocate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. + */ +#define OCFS2_MAX_TRANS_DATA 64U + +/* + * Create access is for when we get a newly created buffer and we're + * not gonna read it off disk, but rather fill it ourselves. Right + * now, we don't do anything special with this (it turns into a write + * request), but this is a good placeholder in case we do... + * + * Write access is for when we read a block off disk and are going to + * modify it. This way the journalling layer knows it may need to make + * a copy of that block (if it's part of another, uncommitted + * transaction) before we do so. + */ +#define OCFS2_JOURNAL_ACCESS_CREATE 0 +#define OCFS2_JOURNAL_ACCESS_WRITE 1 +#define OCFS2_JOURNAL_ACCESS_UNDO 2 + + +/* ocfs2_inode */ +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_extent_block */ +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_refcount_block */ +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_group_desc */ +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_xattr_block */ +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* quota blocks */ +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* dirblock */ +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_dx_root_block */ +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_dx_leaf */ +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* Anything that has no ecc */ +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); + +/* + * A word about the journal_access/journal_dirty "dance". It is + * entirely legal to journal_access a buffer more than once (as long + * as the access type is the same -- I'm not sure what will happen if + * access type is different but this should never happen anyway) It is + * also legal to journal_dirty a buffer more than once. In fact, you + * can even journal_access a buffer after you've done a + * journal_access/journal_dirty pair. The only thing you cannot do + * however, is journal_dirty a buffer which you haven't yet passed to + * journal_access at least once. + * + * That said, 99% of the time this doesn't matter and this is what the + * path looks like: + * + * + * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE); + * + * ocfs2_journal_dirty(handle, bh); + */ +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); + +/* + * Credit Macros: + * Convenience macros to calculate number of credits needed. + * + * For convenience sake, I have a set of macros here which calculate + * the *maximum* number of sectors which will be changed for various + * metadata updates. + */ + +/* simple file updates like chmod, etc. */ +#define OCFS2_INODE_UPDATE_CREDITS 1 + +/* extended attribute block update */ +#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 + +/* Update of a single quota block */ +#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 + +/* global quotafile inode update, data block */ +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +/* + * The two writes below can accidentally see global info dirty due + * to set_info() quotactl so make them prepared for the writes. + */ +/* quota data block, global info */ +/* Write to local quota file */ +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +/* global quota data block, local quota data block, global quota inode, + * global quota info */ +#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +static inline int ocfs2_quota_trans_credits(struct super_block *sb) +{ + int credits = 0; + + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + return credits; +} + +/* group extend. inode update and last group update. */ +#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* group add. inode update and the new group update. */ +#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* get one bit out of a suballocator: dinode + group descriptor + + * prev. group desc. if we relink. */ +#define OCFS2_SUBALLOC_ALLOC (3) + +static inline int ocfs2_inline_to_extents_credits(struct super_block *sb) +{ + return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} + +/* dinode + group descriptor update. We don't relink on free yet. */ +#define OCFS2_SUBALLOC_FREE (2) + +#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS +#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + + OCFS2_TRUNCATE_LOG_UPDATE) + +static inline int ocfs2_remove_extent_credits(struct super_block *sb) +{ + return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} + +/* data block for new dir/symlink, allocation of directory block, dx_root + * update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1) + +static inline int ocfs2_add_dir_index_credits(struct super_block *sb) +{ + /* 1 block for index, 2 allocs (data, metadata), 1 clusters + * worth of blocks for initial extent. */ + return 1 + 2 * OCFS2_SUBALLOC_ALLOC + + ocfs2_clusters_to_blocks(sb, 1); +} + +/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode + * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr + * blocks + quota update */ +static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, + int xattr_credits) +{ + int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS; + + if (is_dir) + dir_credits += ocfs2_add_dir_index_credits(sb); + + return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits + + ocfs2_quota_trans_credits(sb); +} + +/* local alloc metadata change + main bitmap updates */ +#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE) + +/* used when we don't need an allocation change for a dir extend. One + * for the dinode, one for the new block. */ +#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) + +/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota + * update on dir + index leaf + dx root update for free list + + * previous dirblock update in the free list */ +static inline int ocfs2_link_credits(struct super_block *sb) +{ + return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + + ocfs2_quota_trans_credits(sb); +} + +/* inode + dir inode (if we unlink a dir), + dir entry block + orphan + * dir inode link + dir inode index leaf + dir index root */ +static inline int ocfs2_unlink_credits(struct super_block *sb) +{ + /* The quota update from ocfs2_link_credits is unused here... */ + return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb); +} + +/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + + * inode alloc group descriptor + orphan dir index root + + * orphan dir index leaf */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) + +/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry + + * orphan dir index root + orphan dir index leaf */ +#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4) +#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS + +/* dinode update, old dir dinode update, new dir dinode update, old + * dir dir entry, new dir dir entry, dir entry update for renaming + * directory + target unlink + 3 x dir index leaves */ +static inline int ocfs2_rename_credits(struct super_block *sb) +{ + return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb); +} + +/* global bitmap dinode, group desc., relinked group, + * suballocator dinode, group desc., relinked group, + * dinode, xattr block */ +#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \ + + OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) + +/* inode update, removal of dx root block from allocator */ +#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_SUBALLOC_FREE) + +static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) +{ + int credits = 1 + OCFS2_SUBALLOC_ALLOC; + + credits += ocfs2_clusters_to_blocks(sb, 1); + credits += ocfs2_quota_trans_credits(sb); + + return credits; +} + +/* inode update, new refcount block and its allocation credits. */ +#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \ + + OCFS2_SUBALLOC_ALLOC) + +/* inode and the refcount block update. */ +#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* + * inode and the refcount block update. + * It doesn't include the credits for sub alloc change. + * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added. + */ +#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* 2 metadata alloc, 2 new blocks and root refcount block */ +#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3) + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +static inline int ocfs2_calc_extend_credits(struct super_block *sb, + struct ocfs2_extent_list *root_el) +{ + int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; + + /* bitmap dinode, group desc. + relinked group. */ + bitmap_blocks = OCFS2_SUBALLOC_ALLOC; + + /* we might need to shift tree depth so lets assume an + * absolute worst case of complete fragmentation. Even with + * that, we only need one update for the dinode, and then + * however many metadata chunks needed * a remaining suballoc + * alloc. */ + sysfile_bitmap_blocks = 1 + + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el); + + /* this does not include *new* metadata blocks, which are + * accounted for in sysfile_bitmap_blocks. root_el + + * prev. last_eb_blk + blocks along edge of tree. + * calc_symlink_credits passes because we just need 1 + * credit for the dinode there. */ + extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); + + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + + ocfs2_quota_trans_credits(sb); +} + +static inline int ocfs2_calc_symlink_credits(struct super_block *sb) +{ + int blocks = ocfs2_mknod_credits(sb, 0, 0); + + /* links can be longer than one block so we may update many + * within our single allocated extent. */ + blocks += ocfs2_clusters_to_blocks(sb, 1); + + return blocks + ocfs2_quota_trans_credits(sb); +} + +static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, + unsigned int cpg) +{ + int blocks; + int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1; + /* parent inode update + new block group header + bitmap inode update + + bitmap blocks affected */ + blocks = 1 + 1 + 1 + bitmap_blocks; + return blocks; +} + +/* + * Allocating a discontiguous block group requires the credits from + * ocfs2_calc_group_alloc_credits() as well as enough credits to fill + * the group descriptor's extent list. The caller already has started + * the transaction with ocfs2_calc_group_alloc_credits(). They extend + * it with these credits. + */ +static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) +{ + return ocfs2_extent_recs_per_gd(sb); +} + +static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, + unsigned int clusters_to_del, + struct ocfs2_dinode *fe, + struct ocfs2_extent_list *last_el) +{ + /* for dinode + all headers in this pass + update to next leaf */ + u16 next_free = le16_to_cpu(last_el->l_next_free_rec); + u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); + int credits = 1 + tree_depth + 1; + int i; + + i = next_free - 1; + BUG_ON(i < 0); + + /* We may be deleting metadata blocks, so metadata alloc dinode + + one desc. block for each possible delete. */ + if (tree_depth && next_free == 1 && + ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) + credits += 1 + tree_depth; + + /* update to the truncate log. */ + credits += OCFS2_TRUNCATE_LOG_UPDATE; + + credits += ocfs2_quota_trans_credits(sb); + + return credits; +} + +static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); +} + +static inline int ocfs2_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); +} + +static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; +} + +#endif /* OCFS2_JOURNAL_H */ diff --git a/kernel/fs/ocfs2/localalloc.c b/kernel/fs/ocfs2/localalloc.c new file mode 100644 index 000000000..857bbbcd3 --- /dev/null +++ b/kernel/fs/ocfs2/localalloc.c @@ -0,0 +1,1343 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.c + * + * Node local data allocation + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv); + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); + +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh); + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh); + +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac); + +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode); + +/* + * ocfs2_la_default_mb() - determine a default size, in megabytes of + * the local alloc. + * + * Generally, we'd like to pick as large a local alloc as + * possible. Performance on large workloads tends to scale + * proportionally to la size. In addition to that, the reservations + * code functions more efficiently as it can reserve more windows for + * write. + * + * Some things work against us when trying to choose a large local alloc: + * + * - We need to ensure our sizing is picked to leave enough space in + * group descriptors for other allocations (such as block groups, + * etc). Picking default sizes which are a multiple of 4 could help + * - block groups are allocated in 2mb and 4mb chunks. + * + * - Likewise, we don't want to starve other nodes of bits on small + * file systems. This can easily be taken care of by limiting our + * default to a reasonable size (256M) on larger cluster sizes. + * + * - Some file systems can't support very large sizes - 4k and 8k in + * particular are limited to less than 128 and 256 megabytes respectively. + * + * The following reference table shows group descriptor and local + * alloc maximums at various cluster sizes (4k blocksize) + * + * csize: 4K group: 126M la: 121M + * csize: 8K group: 252M la: 243M + * csize: 16K group: 504M la: 486M + * csize: 32K group: 1008M la: 972M + * csize: 64K group: 2016M la: 1944M + * csize: 128K group: 4032M la: 3888M + * csize: 256K group: 8064M la: 7776M + * csize: 512K group: 16128M la: 15552M + * csize: 1024K group: 32256M la: 31104M + */ +#define OCFS2_LA_MAX_DEFAULT_MB 256 +#define OCFS2_LA_OLD_DEFAULT 8 +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) +{ + unsigned int la_mb; + unsigned int gd_mb; + unsigned int la_max_mb; + unsigned int megs_per_slot; + struct super_block *sb = osb->sb; + + gd_mb = ocfs2_clusters_to_megabytes(osb->sb, + 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat)); + + /* + * This takes care of files systems with very small group + * descriptors - 512 byte blocksize at cluster sizes lower + * than 16K and also 1k blocksize with 4k cluster size. + */ + if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192) + || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096)) + return OCFS2_LA_OLD_DEFAULT; + + /* + * Leave enough room for some block groups and make the final + * value we work from a multiple of 4. + */ + gd_mb -= 16; + gd_mb &= 0xFFFFFFFB; + + la_mb = gd_mb; + + /* + * Keep window sizes down to a reasonable default + */ + if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) { + /* + * Some clustersize / blocksize combinations will have + * given us a larger than OCFS2_LA_MAX_DEFAULT_MB + * default size, but get poor distribution when + * limited to exactly 256 megabytes. + * + * As an example, 16K clustersize at 4K blocksize + * gives us a cluster group size of 504M. Paring the + * local alloc size down to 256 however, would give us + * only one window and around 200MB left in the + * cluster group. Instead, find the first size below + * 256 which would give us an even distribution. + * + * Larger cluster group sizes actually work out pretty + * well when pared to 256, so we don't have to do this + * for any group that fits more than two + * OCFS2_LA_MAX_DEFAULT_MB windows. + */ + if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB)) + la_mb = 256; + else { + unsigned int gd_mult = gd_mb; + + while (gd_mult > 256) + gd_mult = gd_mult >> 1; + + la_mb = gd_mult; + } + } + + megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots; + megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot); + /* Too many nodes, too few disk clusters. */ + if (megs_per_slot < la_mb) + la_mb = megs_per_slot; + + /* We can't store more bits than we can in a block. */ + la_max_mb = ocfs2_clusters_to_megabytes(osb->sb, + ocfs2_local_alloc_size(sb) * 8); + if (la_mb > la_max_mb) + la_mb = la_max_mb; + + return la_mb; +} + +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb) +{ + struct super_block *sb = osb->sb; + unsigned int la_default_mb = ocfs2_la_default_mb(osb); + unsigned int la_max_mb; + + la_max_mb = ocfs2_clusters_to_megabytes(sb, + ocfs2_local_alloc_size(sb) * 8); + + trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb); + + if (requested_mb == -1) { + /* No user request - use defaults */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_default_mb); + } else if (requested_mb > la_max_mb) { + /* Request is too big, we give the maximum available */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_max_mb); + } else { + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, requested_mb); + } + + osb->local_alloc_bits = osb->local_alloc_default_bits; +} + +static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) +{ + return (osb->local_alloc_state == OCFS2_LA_THROTTLED || + osb->local_alloc_state == OCFS2_LA_ENABLED); +} + +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters) +{ + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) + if (num_clusters >= osb->local_alloc_default_bits) { + cancel_delayed_work(&osb->la_enable_wq); + osb->local_alloc_state = OCFS2_LA_ENABLED; + } + spin_unlock(&osb->osb_lock); +} + +void ocfs2_la_enable_worker(struct work_struct *work) +{ + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + la_enable_wq.work); + spin_lock(&osb->osb_lock); + osb->local_alloc_state = OCFS2_LA_ENABLED; + spin_unlock(&osb->osb_lock); +} + +/* + * Tell us whether a given allocation should use the local alloc + * file. Otherwise, it has to go to the main bitmap. + * + * This function does semi-dirty reads of local alloc size and state! + * This is ok however, as the values are re-checked once under mutex. + */ +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) +{ + int ret = 0; + int la_bits; + + spin_lock(&osb->osb_lock); + la_bits = osb->local_alloc_bits; + + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + /* la_bits should be at least twice the size (in clusters) of + * a new block group. We want to be sure block group + * allocations go through the local alloc, so allow an + * allocation to take up to half the bitmap. */ + if (bits > (la_bits / 2)) + goto bail; + + ret = 1; +bail: + trace_ocfs2_alloc_should_use_local( + (unsigned long long)bits, osb->local_alloc_state, la_bits, ret); + spin_unlock(&osb->osb_lock); + return ret; +} + +int ocfs2_load_local_alloc(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_dinode *alloc = NULL; + struct buffer_head *alloc_bh = NULL; + u32 num_used; + struct inode *inode = NULL; + struct ocfs2_local_alloc *la; + + if (osb->local_alloc_bits == 0) + goto bail; + + if (osb->local_alloc_bits >= osb->bitmap_cpg) { + mlog(ML_NOTICE, "Requested local alloc window %d is larger " + "than max possible %u. Using defaults.\n", + osb->local_alloc_bits, (osb->bitmap_cpg - 1)); + osb->local_alloc_bits = + ocfs2_megabytes_to_clusters(osb->sb, + ocfs2_la_default_mb(osb)); + } + + /* read the alloc off disk */ + inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (!(le32_to_cpu(alloc->i_flags) & + (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) { + mlog(ML_ERROR, "Invalid local alloc inode, %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + status = -EINVAL; + goto bail; + } + + if ((la->la_size == 0) || + (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) { + mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n", + le16_to_cpu(la->la_size)); + status = -EINVAL; + goto bail; + } + + /* do a little verification. */ + num_used = ocfs2_local_alloc_count_bits(alloc); + + /* hopefully the local alloc has always been recovered before + * we load it. */ + if (num_used + || alloc->id1.bitmap1.i_used + || alloc->id1.bitmap1.i_total + || la->la_bm_off) + mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + "found = %u, set = %u, taken = %u, off = %u\n", + num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), + le32_to_cpu(alloc->id1.bitmap1.i_total), + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + + osb->local_alloc_bh = alloc_bh; + osb->local_alloc_state = OCFS2_LA_ENABLED; + +bail: + if (status < 0) + brelse(alloc_bh); + if (inode) + iput(inode); + + trace_ocfs2_load_local_alloc(osb->local_alloc_bits); + + if (status) + mlog_errno(status); + return status; +} + +/* + * return any unused bits to the bitmap and write out a clean + * local_alloc. + * + * local_alloc_bh is optional. If not passed, we will simply use the + * one off osb. If you do pass it however, be warned that it *will* be + * returned brelse'd and NULL'd out.*/ +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) +{ + int status; + handle_t *handle; + struct inode *local_alloc_inode = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_dinode *alloc = NULL; + + cancel_delayed_work(&osb->la_enable_wq); + flush_workqueue(ocfs2_wq); + + if (osb->local_alloc_state == OCFS2_LA_UNUSED) + goto out; + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto out; + } + + osb->local_alloc_state = OCFS2_LA_DISABLED; + + ocfs2_resmap_uninit(&osb->osb_la_resmap); + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + /* WINDOW_MOVE_CREDITS is a bit heavy... */ + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + handle = NULL; + goto out_unlock; + } + + bh = osb->local_alloc_bh; + alloc = (struct ocfs2_dinode *) bh->b_data; + + alloc_copy = kmalloc(bh->b_size, GFP_NOFS); + if (!alloc_copy) { + status = -ENOMEM; + goto out_commit; + } + memcpy(alloc_copy, alloc, bh->b_size); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + ocfs2_clear_local_alloc(alloc); + ocfs2_journal_dirty(handle, bh); + + brelse(bh); + osb->local_alloc_bh = NULL; + osb->local_alloc_state = OCFS2_LA_UNUSED; + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock: + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + if (local_alloc_inode) + iput(local_alloc_inode); + + kfree(alloc_copy); +} + +/* + * We want to free the bitmap bits outside of any recovery context as + * we'll need a cluster lock to do so, but we must clear the local + * alloc before giving up the recovered nodes journal. To solve this, + * we kmalloc a copy of the local alloc before it's change for the + * caller to process with ocfs2_complete_local_alloc_recovery + */ +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **alloc_copy) +{ + int status = 0; + struct buffer_head *alloc_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_dinode *alloc; + + trace_ocfs2_begin_local_alloc_recovery(slot_num); + + *alloc_copy = NULL; + + inode = ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mutex_lock(&inode->i_mutex); + + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); + if (!(*alloc_copy)) { + status = -ENOMEM; + goto bail; + } + memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size); + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + ocfs2_clear_local_alloc(alloc); + + ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); + status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode)); + if (status < 0) + mlog_errno(status); + +bail: + if (status < 0) { + kfree(*alloc_copy); + *alloc_copy = NULL; + } + + brelse(alloc_bh); + + if (inode) { + mutex_unlock(&inode->i_mutex); + iput(inode); + } + + if (status) + mlog_errno(status); + return status; +} + +/* + * Step 2: By now, we've completed the journal recovery, we've stamped + * a clean local alloc on disk and dropped the node out of the + * recovery map. Dlm locks will no longer stall, so lets clear out the + * main bitmap. + */ +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc) +{ + int status; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto out_unlock; + } + + /* we want the bitmap change to be recorded on disk asap */ + handle->h_sync = 1; + + status = ocfs2_sync_local_to_main(osb, handle, alloc, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + + brelse(main_bm_bh); + + iput(main_bm_inode); + +out: + if (!status) + ocfs2_init_steal_slots(osb); + if (status) + mlog_errno(status); + return status; +} + +/* + * make sure we've got at least bits_wanted contiguous bits in the + * local alloc. You lose them when you drop i_mutex. + * + * We will add ourselves to the transaction passed in, but may start + * our own in order to shift windows. + */ +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context *ac) +{ + int status; + struct ocfs2_dinode *alloc; + struct inode *local_alloc_inode; + unsigned int free_bits; + + BUG_ON(!ac); + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + mutex_lock(&local_alloc_inode->i_mutex); + + /* + * We must double check state and allocator bits because + * another process may have changed them while holding i_mutex. + */ + spin_lock(&osb->osb_lock); + if (!ocfs2_la_state_enabled(osb) || + (bits_wanted > osb->local_alloc_bits)) { + spin_unlock(&osb->osb_lock); + status = -ENOSPC; + goto bail; + } + spin_unlock(&osb->osb_lock); + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + +#ifdef CONFIG_OCFS2_DEBUG_FS + if (le32_to_cpu(alloc->id1.bitmap1.i_used) != + ocfs2_local_alloc_count_bits(alloc)) { + ocfs2_error(osb->sb, "local alloc inode %llu says it has " + "%u used bits, but a count shows %u", + (unsigned long long)le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); + status = -EIO; + goto bail; + } +#endif + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) { + /* uhoh, window change time. */ + status = + ocfs2_local_alloc_slide_window(osb, local_alloc_inode); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* + * Under certain conditions, the window slide code + * might have reduced the number of bits available or + * disabled the the local alloc entirely. Re-check + * here and return -ENOSPC if necessary. + */ + status = -ENOSPC; + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) + goto bail; + } + + ac->ac_inode = local_alloc_inode; + /* We should never use localalloc from another slot */ + ac->ac_alloc_slot = osb->slot_num; + ac->ac_which = OCFS2_AC_USE_LOCAL; + get_bh(osb->local_alloc_bh); + ac->ac_bh = osb->local_alloc_bh; + status = 0; +bail: + if (status < 0 && local_alloc_inode) { + mutex_unlock(&local_alloc_inode->i_mutex); + iput(local_alloc_inode); + } + + trace_ocfs2_reserve_local_alloc_bits( + (unsigned long long)ac->ac_max_block, + bits_wanted, osb->slot_num, status); + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 *bit_off, + u32 *num_bits) +{ + int status, start; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted, + ac->ac_resv); + if (start == -1) { + /* TODO: Shouldn't we just BUG here? */ + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + bitmap = la->la_bitmap; + *bit_off = le32_to_cpu(la->la_bm_off) + start; + *num_bits = bits_wanted; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start, + bits_wanted); + + while(bits_wanted--) + ocfs2_set_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits) +{ + int status, start; + u32 clear_bits; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + bitmap = la->la_bitmap; + start = bit_off - le32_to_cpu(la->la_bm_off); + clear_bits = num_bits; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while (clear_bits--) + ocfs2_clear_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + return status; +} + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) +{ + u32 count; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + count = memweight(la->la_bitmap, le16_to_cpu(la->la_size)); + + trace_ocfs2_local_alloc_count_bits(count); + return count; +} + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv) +{ + int numfound = 0, bitoff, left, startoff, lastzero; + int local_resv = 0; + struct ocfs2_alloc_reservation r; + void *bitmap = NULL; + struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; + + if (!alloc->id1.bitmap1.i_total) { + bitoff = -1; + goto bail; + } + + if (!resv) { + local_resv = 1; + ocfs2_resv_init_once(&r); + ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP); + resv = &r; + } + + numfound = *numbits; + if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) { + if (numfound < *numbits) + *numbits = numfound; + goto bail; + } + + /* + * Code error. While reservations are enabled, local + * allocation should _always_ go through them. + */ + BUG_ON(osb->osb_resv_level != 0); + + /* + * Reservations are disabled. Handle this the old way. + */ + + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; + + numfound = bitoff = startoff = 0; + lastzero = -1; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { + if (bitoff == left) { + /* mlog(0, "bitoff (%d) == left", bitoff); */ + break; + } + /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " + "numfound = %d\n", bitoff, startoff, numfound);*/ + + /* Ok, we found a zero bit... is it contig. or do we + * start over?*/ + if (bitoff == startoff) { + /* we found a zero */ + numfound++; + startoff++; + } else { + /* got a zero after some ones */ + numfound = 1; + startoff = bitoff+1; + } + /* we got everything we needed */ + if (numfound == *numbits) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound); + + if (numfound == *numbits) + bitoff = startoff - numfound; + else + bitoff = -1; + +bail: + if (local_resv) + ocfs2_resv_discard(resmap, resv); + + trace_ocfs2_local_alloc_find_clear_bits(*numbits, + le32_to_cpu(alloc->id1.bitmap1.i_total), + bitoff, numfound); + + return bitoff; +} + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) +{ + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + int i; + + alloc->id1.bitmap1.i_total = 0; + alloc->id1.bitmap1.i_used = 0; + la->la_bm_off = 0; + for(i = 0; i < le16_to_cpu(la->la_size); i++) + la->la_bitmap[i] = 0; +} + +#if 0 +/* turn this on and uncomment below to aid debugging window shifts. */ +static void ocfs2_verify_zero_bits(unsigned long *bitmap, + unsigned int start, + unsigned int count) +{ + unsigned int tmp = count; + while(tmp--) { + if (ocfs2_test_bit(start + tmp, bitmap)) { + printk("ocfs2_verify_zero_bits: start = %u, count = " + "%u\n", start, count); + printk("ocfs2_verify_zero_bits: bit %u is set!", + start + tmp); + BUG(); + } + } +} +#endif + +/* + * sync the local alloc to main bitmap. + * + * assumes you've already locked the main bitmap -- the bitmap inode + * passed is used for caching. + */ +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh) +{ + int status = 0; + int bit_off, left, count, start; + u64 la_start_blk; + u64 blkno; + void *bitmap; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + trace_ocfs2_sync_local_to_main( + le32_to_cpu(alloc->id1.bitmap1.i_total), + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + if (!alloc->id1.bitmap1.i_total) { + goto bail; + } + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) == + le32_to_cpu(alloc->id1.bitmap1.i_total)) { + goto bail; + } + + la_start_blk = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(la->la_bm_off)); + bitmap = la->la_bitmap; + start = count = bit_off = 0; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) + != -1) { + if ((bit_off < left) && (bit_off == start)) { + count++; + start++; + continue; + } + if (count) { + blkno = la_start_blk + + ocfs2_clusters_to_blocks(osb->sb, + start - count); + + trace_ocfs2_sync_local_to_main_free( + count, start - count, + (unsigned long long)la_start_blk, + (unsigned long long)blkno); + + status = ocfs2_release_clusters(handle, + main_bm_inode, + main_bm_bh, blkno, + count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + if (bit_off >= left) + break; + count = 1; + start = bit_off + 1; + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +enum ocfs2_la_event { + OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */ + OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has + * enough bits theoretically + * free, but a contiguous + * allocation could not be + * found. */ + OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have + * enough bits free to satisfy + * our request. */ +}; +#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ) +/* + * Given an event, calculate the size of our next local alloc window. + * + * This should always be called under i_mutex of the local alloc inode + * so that local alloc disabling doesn't race with processes trying to + * use the allocator. + * + * Returns the state which the local alloc was left in. This value can + * be ignored by some paths. + */ +static int ocfs2_recalc_la_window(struct ocfs2_super *osb, + enum ocfs2_la_event event) +{ + unsigned int bits; + int state; + + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED) { + WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED); + goto out_unlock; + } + + /* + * ENOSPC and fragmentation are treated similarly for now. + */ + if (event == OCFS2_LA_EVENT_ENOSPC || + event == OCFS2_LA_EVENT_FRAGMENTED) { + /* + * We ran out of contiguous space in the primary + * bitmap. Drastically reduce the number of bits used + * by local alloc until we have to disable it. + */ + bits = osb->local_alloc_bits >> 1; + if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { + /* + * By setting state to THROTTLED, we'll keep + * the number of local alloc bits used down + * until an event occurs which would give us + * reason to assume the bitmap situation might + * have changed. + */ + osb->local_alloc_state = OCFS2_LA_THROTTLED; + osb->local_alloc_bits = bits; + } else { + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + OCFS2_LA_ENABLE_INTERVAL); + goto out_unlock; + } + + /* + * Don't increase the size of the local alloc window until we + * know we might be able to fulfill the request. Otherwise, we + * risk bouncing around the global bitmap during periods of + * low space. + */ + if (osb->local_alloc_state != OCFS2_LA_THROTTLED) + osb->local_alloc_bits = osb->local_alloc_default_bits; + +out_unlock: + state = osb->local_alloc_state; + spin_unlock(&osb->osb_lock); + + return state; +} + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh) +{ + int status; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + +retry_enospc: + (*ac)->ac_bits_wanted = osb->local_alloc_bits; + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status == -ENOSPC) { + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == + OCFS2_LA_DISABLED) + goto bail; + + ocfs2_free_ac_resource(*ac); + memset(*ac, 0, sizeof(struct ocfs2_alloc_context)); + goto retry_enospc; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *bitmap_inode = (*ac)->ac_inode; + igrab(*bitmap_inode); + *bitmap_bh = (*ac)->ac_bh; + get_bh(*bitmap_bh); + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +/* + * pass it the bitmap lock in lock_bh if you have it. + */ +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac) +{ + int status = 0; + u32 cluster_off, cluster_count; + struct ocfs2_dinode *alloc = NULL; + struct ocfs2_local_alloc *la; + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + trace_ocfs2_local_alloc_new_window( + le32_to_cpu(alloc->id1.bitmap1.i_total), + osb->local_alloc_bits); + + /* Instruct the allocation code to try the most recently used + * cluster group. We'll re-record the group used this pass + * below. */ + ac->ac_last_group = osb->la_last_gd; + + /* we used the generic suballoc reserve function, but we set + * everything up nicely, so there's no reason why we can't use + * the more specific cluster api to claim bits. */ + status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, + &cluster_off, &cluster_count); + if (status == -ENOSPC) { +retry_enospc: + /* + * Note: We could also try syncing the journal here to + * allow use of any free bits which the current + * transaction can't give us access to. --Mark + */ + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) == + OCFS2_LA_DISABLED) + goto bail; + + ac->ac_bits_wanted = osb->local_alloc_bits; + status = ocfs2_claim_clusters(handle, ac, + osb->local_alloc_bits, + &cluster_off, + &cluster_count); + if (status == -ENOSPC) + goto retry_enospc; + /* + * We only shrunk the *minimum* number of in our + * request - it's entirely possible that the allocator + * might give us more than we asked for. + */ + if (status == 0) { + spin_lock(&osb->osb_lock); + osb->local_alloc_bits = cluster_count; + spin_unlock(&osb->osb_lock); + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + osb->la_last_gd = ac->ac_last_group; + + la->la_bm_off = cpu_to_le32(cluster_off); + alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); + /* just in case... In the future when we find space ourselves, + * we don't have to get all contiguous -- but we'll have to + * set all previously used bits in bitmap and update + * la_bits_set before setting the bits in the main bitmap. */ + alloc->id1.bitmap1.i_used = 0; + memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, + le16_to_cpu(la->la_size)); + + ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, + OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); + + trace_ocfs2_local_alloc_new_window_result( + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + le32_to_cpu(alloc->id1.bitmap1.i_total)); + +bail: + if (status) + mlog_errno(status); + return status; +} + +/* Note that we do *NOT* lock the local alloc inode here as + * it's been locked already for us. */ +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode) +{ + int status = 0; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + handle_t *handle = NULL; + struct ocfs2_dinode *alloc; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_alloc_context *ac = NULL; + + ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); + + /* This will lock the main bitmap for us. */ + status = ocfs2_local_alloc_reserve_for_window(osb, + &ac, + &main_bm_inode, + &main_bm_bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + /* We want to clear the local alloc before doing anything + * else, so that if we error later during this operation, + * local alloc shutdown won't try to double free main bitmap + * bits. Make a copy so the sync function knows which bits to + * free. */ + alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS); + if (!alloc_copy) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_local_alloc_new_window(osb, handle, ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + atomic_inc(&osb->alloc_stats.moves); + +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + kfree(alloc_copy); + + if (ac) + ocfs2_free_alloc_context(ac); + + if (status) + mlog_errno(status); + return status; +} + diff --git a/kernel/fs/ocfs2/localalloc.h b/kernel/fs/ocfs2/localalloc.h new file mode 100644 index 000000000..44a7d1fb2 --- /dev/null +++ b/kernel/fs/ocfs2/localalloc.h @@ -0,0 +1,68 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_LOCALALLOC_H +#define OCFS2_LOCALALLOC_H + +int ocfs2_load_local_alloc(struct ocfs2_super *osb); + +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); + +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb); +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb); + +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int node_num, + struct ocfs2_dinode **alloc_copy); + +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc); + +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, + u64 bits); + +struct ocfs2_alloc_context; +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context *ac); + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 *bit_off, + u32 *num_bits); + +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits); + +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters); +void ocfs2_la_enable_worker(struct work_struct *work); + +#endif /* OCFS2_LOCALALLOC_H */ diff --git a/kernel/fs/ocfs2/locks.c b/kernel/fs/ocfs2/locks.c new file mode 100644 index 000000000..6b6d092b0 --- /dev/null +++ b/kernel/fs/ocfs2/locks.c @@ -0,0 +1,141 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * locks.c + * + * Userspace file locking support + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "locks.h" + +static int ocfs2_do_flock(struct file *file, struct inode *inode, + int cmd, struct file_lock *fl) +{ + int ret = 0, level = 0, trylock = 0; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + + if (fl->fl_type == F_WRLCK) + level = 1; + if (!IS_SETLKW(cmd)) + trylock = 1; + + mutex_lock(&fp->fp_mutex); + + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level > LKM_NLMODE) { + int old_level = 0; + + if (lockres->l_level == LKM_EXMODE) + old_level = 1; + + if (level == old_level) + goto out; + + /* + * Converting an existing lock is not guaranteed to be + * atomic, so we can get away with simply unlocking + * here and allowing the lock code to try at the new + * level. + */ + + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + + ocfs2_file_unlock(file); + } + + ret = ocfs2_file_lock(file, level, trylock); + if (ret) { + if (ret == -EAGAIN && trylock) + ret = -EWOULDBLOCK; + else + mlog_errno(ret); + goto out; + } + + ret = flock_lock_file_wait(file, fl); + if (ret) + ocfs2_file_unlock(file); + +out: + mutex_unlock(&fp->fp_mutex); + + return ret; +} + +static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl) +{ + int ret; + struct ocfs2_file_private *fp = file->private_data; + + mutex_lock(&fp->fp_mutex); + ocfs2_file_unlock(file); + ret = flock_lock_file_wait(file, fl); + mutex_unlock(&fp->fp_mutex); + + return ret; +} + +/* + * Overall flow of ocfs2_flock() was influenced by gfs2_flock(). + */ +int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (__mandatory_lock(inode)) + return -ENOLCK; + + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb)) + return flock_lock_file_wait(file, fl); + + if (fl->fl_type == F_UNLCK) + return ocfs2_do_funlock(file, cmd, fl); + else + return ocfs2_do_flock(file, inode, cmd, fl); +} + +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); +} diff --git a/kernel/fs/ocfs2/locks.h b/kernel/fs/ocfs2/locks.h new file mode 100644 index 000000000..496d488b2 --- /dev/null +++ b/kernel/fs/ocfs2/locks.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * locks.h + * + * Function prototypes for Userspace file locking support + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_LOCKS_H +#define OCFS2_LOCKS_H + +int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl); + +#endif /* OCFS2_LOCKS_H */ diff --git a/kernel/fs/ocfs2/mmap.c b/kernel/fs/ocfs2/mmap.c new file mode 100644 index 000000000..9581d190f --- /dev/null +++ b/kernel/fs/ocfs2/mmap.c @@ -0,0 +1,193 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mmap.c + * + * Code to deal with the mess that is clustered mmap. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "aops.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "mmap.h" +#include "super.h" +#include "ocfs2_trace.h" + + +static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) +{ + sigset_t oldset; + int ret; + + ocfs2_block_signals(&oldset); + ret = filemap_fault(area, vmf); + ocfs2_unblock_signals(&oldset); + + trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno, + area, vmf->page, vmf->pgoff); + return ret; +} + +static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, + struct page *page) +{ + int ret = VM_FAULT_NOPAGE; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + loff_t pos = page_offset(page); + unsigned int len = PAGE_CACHE_SIZE; + pgoff_t last_index; + struct page *locked_page = NULL; + void *fsdata; + loff_t size = i_size_read(inode); + + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + + /* + * There are cases that lead to the page no longer bebongs to the + * mapping. + * 1) pagecache truncates locally due to memory pressure. + * 2) pagecache truncates when another is taking EX lock against + * inode lock. see ocfs2_data_convert_worker. + * + * The i_size check doesn't catch the case where nodes truncated and + * then re-extended the file. We'll re-check the page mapping after + * taking the page lock inside of ocfs2_write_begin_nolock(). + * + * Let VM retry with these cases. + */ + if ((page->mapping != inode->i_mapping) || + (!PageUptodate(page)) || + (page_offset(page) >= size)) + goto out; + + /* + * Call ocfs2_write_begin() and ocfs2_write_end() to take + * advantage of the allocation code there. We pass a write + * length of the whole page (chopped to i_size) to make sure + * the whole thing is allocated. + * + * Since we know the page is up to date, we don't have to + * worry about ocfs2_write_begin() skipping some buffer reads + * because the "write" would invalidate their data. + */ + if (page->index == last_index) + len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; + + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, + &fsdata, di_bh, page); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; + goto out; + } + + if (!locked_page) { + ret = VM_FAULT_NOPAGE; + goto out; + } + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, + fsdata); + BUG_ON(ret != len); + ret = VM_FAULT_LOCKED; +out: + return ret; +} + +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct buffer_head *di_bh = NULL; + sigset_t oldset; + int ret; + + sb_start_pagefault(inode->i_sb); + ocfs2_block_signals(&oldset); + + /* + * The cluster locks taken will block a truncate from another + * node. Taking the data lock will also ensure that we don't + * attempt page truncation as part of a downconvert. + */ + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* + * The alloc sem should be enough to serialize with + * ocfs2_truncate_file() changing i_size as well as any thread + * modifying the inode btree. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); + +out: + ocfs2_unblock_signals(&oldset); + sb_end_pagefault(inode->i_sb); + return ret; +} + +static const struct vm_operations_struct ocfs2_file_vm_ops = { + .fault = ocfs2_fault, + .page_mkwrite = ocfs2_page_mkwrite, +}; + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret = 0, lock_level = 0; + + ret = ocfs2_inode_lock_atime(file_inode(file), + file->f_path.mnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + ocfs2_inode_unlock(file_inode(file), lock_level); +out: + vma->vm_ops = &ocfs2_file_vm_ops; + return 0; +} + diff --git a/kernel/fs/ocfs2/mmap.h b/kernel/fs/ocfs2/mmap.h new file mode 100644 index 000000000..1274ee0f1 --- /dev/null +++ b/kernel/fs/ocfs2/mmap.h @@ -0,0 +1,6 @@ +#ifndef OCFS2_MMAP_H +#define OCFS2_MMAP_H + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); + +#endif /* OCFS2_MMAP_H */ diff --git a/kernel/fs/ocfs2/move_extents.c b/kernel/fs/ocfs2/move_extents.c new file mode 100644 index 000000000..56a768d06 --- /dev/null +++ b/kernel/fs/ocfs2/move_extents.c @@ -0,0 +1,1075 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.c + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "ocfs2_ioctl.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "suballoc.h" +#include "uptodate.h" +#include "super.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "sysfile.h" +#include "refcounttree.h" +#include "move_extents.h" + +struct ocfs2_move_extents_context { + struct inode *inode; + struct file *file; + int auto_defrag; + int partial; + int credits; + u32 new_phys_cpos; + u32 clusters_moved; + u64 refcount_loc; + struct ocfs2_move_extents *range; + struct ocfs2_extent_tree et; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; +}; + +static int __ocfs2_move_extent(handle_t *handle, + struct ocfs2_move_extents_context *context, + u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, + int ext_flags) +{ + int ret = 0, index; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec *rec, replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); + u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); + + ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, + p_cpos, new_p_cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, + new_p_cpos)); + + path = ocfs2_new_path_from_et(&context->et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1) { + ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + rec = &el->l_recs[index]; + + BUG_ON(ext_flags != rec->e_flags); + /* + * after moving/defraging to new location, the extent is not going + * to be refcounted anymore. + */ + replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + context->et.et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_split_extent(handle, &context->et, path, index, + &replace_rec, context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, context->et.et_root_bh); + + context->new_phys_cpos = new_p_cpos; + + /* + * need I to append truncate log for old clusters? + */ + if (old_blkno) { + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + old_blkno), + len, context->meta_ac, + &context->dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + old_blkno, len); + } + + ocfs2_update_inode_fsync_trans(handle, inode, 0); +out: + ocfs2_free_path(path); + return ret; +} + +/* + * lock allocators, and reserving appropriate number of bits for + * meta blocks and data clusters. + * + * in some cases, we don't need to reserve clusters, just let data_ac + * be NULL. + */ +static int ocfs2_lock_allocators_move_extents(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_move, + u32 extents_to_split, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int extra_blocks, + int *credits) +{ + int ret, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); + + mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", + extra_blocks, clusters_to_move, *credits); +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +/* + * Using one journal handle to guarantee the data consistency in case + * crash happens anywhere. + * + * XXX: defrag can end up with finishing partial extent as requested, + * due to not enough contiguous clusters can be found in allocator. + */ +static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, partial = context->partial; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 new_phys_cpos, new_len; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + *len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, + &context->meta_ac, + &context->data_ac, + extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * should be using allocation reservation strategy there? + * + * if (context->data_ac) + * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + */ + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock_mutex; + } + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_mutex; + } + + ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, + &new_phys_cpos, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * allowing partial extent moving is kind of 'pros and cons', it makes + * whole defragmentation less likely to fail, on the contrary, the bad + * thing is it may make the fs even more fragmented after moving, let + * userspace make a good decision here. + */ + if (new_len != *len) { + mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); + if (!partial) { + context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; + ret = -ENOSPC; + goto out_commit; + } + } + + mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, + phys_cpos, new_phys_cpos); + + ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, + new_phys_cpos, ext_flags); + if (ret) + mlog_errno(ret); + + if (partial && (new_len != *len)) + *len = new_len; + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock_mutex: + mutex_unlock(&tl_inode->i_mutex); + + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + +out: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * find the victim alloc group, where #blkno fits. + */ +static int ocfs2_find_victim_alloc_group(struct inode *inode, + u64 vict_blkno, + int type, int slot, + int *vict_bit, + struct buffer_head **ret_bh) +{ + int ret, i, bits_per_unit = 0; + u64 blkno; + char namebuf[40]; + + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ac_bh = NULL, *gd_bh = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *rec; + struct ocfs2_dinode *ac_dinode; + struct ocfs2_group_desc *bg; + + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, + strlen(namebuf), &blkno); + if (ret) { + ret = -ENOENT; + goto out; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; + cl = &(ac_dinode->id2.i_chain); + rec = &(cl->cl_recs[0]); + + if (type == GLOBAL_BITMAP_SYSTEM_INODE) + bits_per_unit = osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits; + /* + * 'vict_blkno' was out of the valid range. + */ + if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || + (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << + bits_per_unit))) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + + rec = &(cl->cl_recs[i]); + if (!rec) + continue; + + bg = NULL; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (gd_bh) { + brelse(gd_bh); + gd_bh = NULL; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + + le16_to_cpu(bg->bg_bits))) { + + *ret_bh = gd_bh; + *vict_bit = (vict_blkno - blkno) >> + bits_per_unit; + mlog(0, "find the victim group: #%llu, " + "total_bits: %u, vict_bit: %u\n", + blkno, le16_to_cpu(bg->bg_bits), + *vict_bit); + goto out; + } + + } while (le64_to_cpu(bg->bg_next_group)); + } + + ret = -EINVAL; +out: + brelse(ac_bh); + + /* + * caller has to release the gd_bh properly. + */ + return ret; +} + +/* + * XXX: helper to validate and adjust moving goal. + */ +static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, + struct ocfs2_move_extents *range) +{ + int ret, goal_bit = 0; + + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int c_to_b = 1 << (osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits); + + /* + * make goal become cluster aligned. + */ + range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, + range->me_goal); + /* + * validate goal sits within global_bitmap, and return the victim + * group desc + */ + ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) + goto out; + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + /* + * moving goal is not allowd to start with a group desc blok(#0 blk) + * let's compromise to the latter cluster. + */ + if (range->me_goal == le64_to_cpu(bg->bg_blkno)) + range->me_goal += c_to_b; + + /* + * movement is not gonna cross two groups. + */ + if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < + range->me_len) { + ret = -EINVAL; + goto out; + } + /* + * more exact validations/adjustments will be performed later during + * moving operation for each extent range. + */ + mlog(0, "extents get ready to be moved to #%llu block\n", + range->me_goal); + +out: + brelse(gd_bh); + + return ret; +} + +static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, + int *goal_bit, u32 move_len, u32 max_hop, + u32 *phys_cpos) +{ + int i, used, last_free_bits = 0, base_bit = *goal_bit; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + + for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { + + used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); + if (used) { + /* + * we even tried searching the free chunk by jumping + * a 'max_hop' distance, but still failed. + */ + if ((i - base_bit) > max_hop) { + *phys_cpos = 0; + break; + } + + if (last_free_bits) + last_free_bits = 0; + + continue; + } else + last_free_bits++; + + if (last_free_bits == move_len) { + *goal_bit = i; + *phys_cpos = base_cpos + i; + break; + } + } + + mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); +} + +static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, + u32 len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, goal_bit = 0; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *gb_inode = NULL; + struct buffer_head *gb_bh = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *gd; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, + context->range->me_threshold); + u64 phys_blkno, new_phys_blkno; + + phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, + &context->meta_ac, + NULL, extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * need to count 2 extra credits for global_bitmap inode and + * group descriptor. + */ + credits += OCFS2_INODE_UPDATE_CREDITS + 1; + + /* + * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() + * logic, while we still need to lock the global_bitmap. + */ + gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + ret = -EIO; + goto out; + } + + mutex_lock(&gb_inode->i_mutex); + + ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_unlock_gb_mutex; + } + + mutex_lock(&tl_inode->i_mutex); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_tl_inode; + } + + new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); + ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * probe the victim cluster group to find a proper + * region to fit wanted movement, it even will perfrom + * a best-effort attempt by compromising to a threshold + * around the goal. + */ + ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, + new_phys_cpos); + if (!*new_phys_cpos) { + ret = -ENOSPC; + goto out_commit; + } + + ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, + *new_phys_cpos, ext_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, + le16_to_cpu(gd->bg_chain)); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, + goal_bit, len); + if (ret) { + ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, + le16_to_cpu(gd->bg_chain)); + mlog_errno(ret); + } + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + brelse(gd_bh); + +out_unlock_tl_inode: + mutex_unlock(&tl_inode->i_mutex); + + ocfs2_inode_unlock(gb_inode, 1); +out_unlock_gb_mutex: + mutex_unlock(&gb_inode->i_mutex); + brelse(gb_bh); + iput(gb_inode); + +out: + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * Helper to calculate the defraging length in one run according to threshold. + */ +static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, + u32 threshold, int *skip) +{ + if ((*alloc_size + *len_defraged) < threshold) { + /* + * proceed defragmentation until we meet the thresh + */ + *len_defraged += *alloc_size; + } else if (*len_defraged == 0) { + /* + * XXX: skip a large extent. + */ + *skip = 1; + } else { + /* + * split this extent to coalesce with former pieces as + * to reach the threshold. + * + * we're done here with one cycle of defragmentation + * in a size of 'thresh', resetting 'len_defraged' + * forces a new defragmentation. + */ + *alloc_size = threshold - *len_defraged; + *len_defraged = 0; + } +} + +static int __ocfs2_move_extents_range(struct buffer_head *di_bh, + struct ocfs2_move_extents_context *context) +{ + int ret = 0, flags, do_defrag, skip = 0; + u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; + u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; + + struct inode *inode = context->inode; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_move_extents *range = context->range; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if ((i_size_read(inode) == 0) || (range->me_len == 0)) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + context->refcount_loc = le64_to_cpu(di->i_refcount_loc); + + ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&context->dealloc); + + /* + * TO-DO XXX: + * + * - xattr extents. + */ + + do_defrag = context->auto_defrag; + + /* + * extents moving happens in unit of clusters, for the sake + * of simplicity, we may ignore two clusters where 'byte_start' + * and 'byte_start + len' were within. + */ + move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); + len_to_move = (range->me_start + range->me_len) >> + osb->s_clustersize_bits; + if (len_to_move >= move_start) + len_to_move -= move_start; + else + len_to_move = 0; + + if (do_defrag) { + defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; + if (defrag_thresh <= 1) + goto done; + } else + new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + range->me_goal); + + mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " + "thresh: %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)range->me_start, + (unsigned long long)range->me_len, + move_start, len_to_move, defrag_thresh); + + cpos = move_start; + while (len_to_move) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, + &flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > len_to_move) + alloc_size = len_to_move; + + /* + * XXX: how to deal with a hole: + * + * - skip the hole of course + * - force a new defragmentation + */ + if (!phys_cpos) { + if (do_defrag) + len_defraged = 0; + + goto next; + } + + if (do_defrag) { + ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, + defrag_thresh, &skip); + /* + * skip large extents + */ + if (skip) { + skip = 0; + goto next; + } + + mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " + "alloc_size: %u, len_defraged: %u\n", + cpos, phys_cpos, alloc_size, len_defraged); + + ret = ocfs2_defrag_extent(context, cpos, phys_cpos, + &alloc_size, flags); + } else { + ret = ocfs2_move_extent(context, cpos, phys_cpos, + &new_phys_cpos, alloc_size, + flags); + + new_phys_cpos += alloc_size; + } + + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + context->clusters_moved += alloc_size; +next: + cpos += alloc_size; + len_to_move -= alloc_size; + } + +done: + range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; + +out: + range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, + context->clusters_moved); + range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, + context->new_phys_cpos); + + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + + return ret; +} + +static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) +{ + int status; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_dinode *di; + struct buffer_head *di_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes from other nodes + */ + status = ocfs2_rw_lock(inode, 1); + if (status) { + mlog_errno(status); + goto out; + } + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status) { + mlog_errno(status); + goto out_rw_unlock; + } + + /* + * rememer ip_xattr_sem also needs to be held if necessary + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + status = __ocfs2_move_extents_range(di_bh, context); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (status) { + mlog_errno(status); + goto out_inode_unlock; + } + + /* + * We update ctime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_inode_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status) { + mlog_errno(status); + goto out_commit; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_inode_unlock: + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); +out: + mutex_unlock(&inode->i_mutex); + + return status; +} + +int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) +{ + int status; + + struct inode *inode = file_inode(filp); + struct ocfs2_move_extents range; + struct ocfs2_move_extents_context *context; + + if (!argp) + return -EINVAL; + + status = mnt_want_write_file(filp); + if (status) + return status; + + if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) { + status = -EPERM; + goto out_drop; + } + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + status = -EPERM; + goto out_drop; + } + + context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); + if (!context) { + status = -ENOMEM; + mlog_errno(status); + goto out_drop; + } + + context->inode = inode; + context->file = filp; + + if (copy_from_user(&range, argp, sizeof(range))) { + status = -EFAULT; + goto out_free; + } + + if (range.me_start > i_size_read(inode)) { + status = -EINVAL; + goto out_free; + } + + if (range.me_start + range.me_len > i_size_read(inode)) + range.me_len = i_size_read(inode) - range.me_start; + + context->range = ⦥ + + if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { + context->auto_defrag = 1; + /* + * ok, the default theshold for the defragmentation + * is 1M, since our maximum clustersize was 1M also. + * any thought? + */ + if (!range.me_threshold) + range.me_threshold = 1024 * 1024; + + if (range.me_threshold > i_size_read(inode)) + range.me_threshold = i_size_read(inode); + + if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) + context->partial = 1; + } else { + /* + * first best-effort attempt to validate and adjust the goal + * (physical address in block), while it can't guarantee later + * operation can succeed all the time since global_bitmap may + * change a bit over time. + */ + + status = ocfs2_validate_and_adjust_move_goal(inode, &range); + if (status) + goto out_copy; + } + + status = ocfs2_move_extents(context); + if (status) + mlog_errno(status); +out_copy: + /* + * movement/defragmentation may end up being partially completed, + * that's the reason why we need to return userspace the finished + * length and new_offset even if failure happens somewhere. + */ + if (copy_to_user(argp, &range, sizeof(range))) + status = -EFAULT; + +out_free: + kfree(context); +out_drop: + mnt_drop_write_file(filp); + + return status; +} diff --git a/kernel/fs/ocfs2/move_extents.h b/kernel/fs/ocfs2/move_extents.h new file mode 100644 index 000000000..4e143e811 --- /dev/null +++ b/kernel/fs/ocfs2/move_extents.h @@ -0,0 +1,22 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.h + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef OCFS2_MOVE_EXTENTS_H +#define OCFS2_MOVE_EXTENTS_H + +int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); + +#endif /* OCFS2_MOVE_EXTENTS_H */ diff --git a/kernel/fs/ocfs2/namei.c b/kernel/fs/ocfs2/namei.c new file mode 100644 index 000000000..176fe6afd --- /dev/null +++ b/kernel/fs/ocfs2/namei.c @@ -0,0 +1,2921 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.c + * + * Create and rename file, directory, symlinks + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linux Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "xattr.h" +#include "acl.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac); + +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup, + bool dio); + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + char *name, + struct ocfs2_dir_lookup_result *lookup, + struct inode *orphan_dir_inode, + bool dio); + +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + const char *symname); + +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2, + int rename); + +static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); +/* An orphan dir name is an 8 byte value, printed as a hex string */ +#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 + +static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int status; + u64 blkno; + struct inode *inode = NULL; + struct dentry *ret; + struct ocfs2_inode_info *oi; + + trace_ocfs2_lookup(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, 0); + + if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { + ret = ERR_PTR(-ENAMETOOLONG); + goto bail; + } + + status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ret = ERR_PTR(status); + goto bail; + } + + status = ocfs2_lookup_ino_from_name(dir, dentry->d_name.name, + dentry->d_name.len, &blkno); + if (status < 0) + goto bail_add; + + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0); + if (IS_ERR(inode)) { + ret = ERR_PTR(-EACCES); + goto bail_unlock; + } + + oi = OCFS2_I(inode); + /* Clear any orphaned state... If we were able to look up the + * inode from a directory, it certainly can't be orphaned. We + * might have the bad state from a node which intended to + * orphan this inode but crashed before it could commit the + * unlink. */ + spin_lock(&oi->ip_lock); + oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + +bail_add: + ret = d_splice_alias(inode, dentry); + + if (inode) { + /* + * If d_splice_alias() finds a DCACHE_DISCONNECTED + * dentry, it will d_move() it on top of ourse. The + * return value will indicate this however, so in + * those cases, we switch them around for the locking + * code. + * + * NOTE: This dentry already has ->d_op set from + * ocfs2_get_parent() and ocfs2_get_dentry() + */ + if (!IS_ERR_OR_NULL(ret)) + dentry = ret; + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + ret = ERR_PTR(status); + goto bail_unlock; + } + } else + ocfs2_dentry_attach_gen(dentry); + +bail_unlock: + /* Don't drop the cluster lock until *after* the d_add -- + * unlink on another node will message us to remove that + * dentry under this lock so otherwise we can race this with + * the downconvert thread and have a stale dentry. */ + ocfs2_inode_unlock(dir, 0); + +bail: + + trace_ocfs2_lookup_ret(ret); + + return ret; +} + +static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) { + mlog(ML_ERROR, "new_inode failed!\n"); + return NULL; + } + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + if (S_ISDIR(mode)) + set_nlink(inode, 2); + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + return inode; +} + +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, + struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); +} + +static int ocfs2_mknod(struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t dev) +{ + int status = 0; + struct buffer_head *parent_fe_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb; + struct ocfs2_dinode *dirfe; + struct buffer_head *new_fe_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + int want_clusters = 0; + int want_meta = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; + struct posix_acl *default_acl = NULL, *acl = NULL; + struct ocfs2_dentry_lock *dl = NULL; + + trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long)dev, mode); + + dquot_initialize(dir); + + /* get our super block */ + osb = OCFS2_SB(dir->i_sb); + + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) { + status = -EMLINK; + goto leave; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!ocfs2_read_links_count(dirfe)) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); + goto leave; + } + } + + /* calculate meta data/clusters for setting security and acl xattr */ + status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, + &si, &want_clusters, + &xattr_credits, &want_meta); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* Reserve a cluster if creating an extent based directory. */ + if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { + want_clusters += 1; + + /* Dir indexing requires extra space as well */ + if (ocfs2_supports_indexed_dirs(osb)) + want_meta++; + } + + status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = posix_acl_create(dir, &mode, &default_acl, &acl); + if (status) { + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, + S_ISDIR(mode), + xattr_credits)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + + status = dquot_alloc_inode(inode); + if (status) + goto leave; + did_quota_inode = 1; + + /* do the real work now. */ + status = ocfs2_mknod_locked(osb, dir, inode, dev, + &new_fe_bh, parent_fe_bh, handle, + inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(mode)) { + status = ocfs2_fill_new_dir(osb, handle, dir, inode, + new_fe_bh, data_ac, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(dir), + parent_fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + ocfs2_add_links_count(dirfe, 1); + ocfs2_journal_dirty(handle, parent_fe_bh); + inc_nlink(dir); + } + + if (default_acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_DEFAULT, default_acl, + meta_ac, data_ac); + } + if (!status && acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_ACCESS, acl, + meta_ac, data_ac); + } + + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + meta_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + /* + * Do this before adding the entry to the directory. We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. + */ + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto leave; + } + + dl = dentry->d_fsdata; + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + insert_inode_hash(inode); + d_instantiate(dentry, inode); + status = 0; +leave: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); + + brelse(new_fe_bh); + brelse(parent_fe_bh); + kfree(si.value); + + ocfs2_free_dir_lookup_result(&lookup); + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + /* + * We should call iput after the i_mutex of the bitmap been + * unlocked in ocfs2_free_alloc_context, or the + * ocfs2_delete_inode will mutex_lock again. + */ + if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; + clear_nlink(inode); + iput(inode); + } + + if (status) + mlog_errno(status); + + return status; +} + +static int __ocfs2_mknod_locked(struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac, + u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) +{ + int status = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *fe = NULL; + struct ocfs2_extent_list *fel; + u16 feat; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + *new_fe_bh = NULL; + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); + OCFS2_I(inode)->ip_blkno = fe_blkno; + spin_lock(&osb->osb_lock); + inode->i_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + *new_fe_bh = sb_getblk(osb->sb, fe_blkno); + if (!*new_fe_bh) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + *new_fe_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; + memset(fe, 0, osb->sb->s_blocksize); + + fe->i_generation = cpu_to_le32(inode->i_generation); + fe->i_fs_generation = cpu_to_le32(osb->fs_generation); + fe->i_blkno = cpu_to_le64(fe_blkno); + fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); + fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); + fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); + fe->i_uid = cpu_to_le32(i_uid_read(inode)); + fe->i_gid = cpu_to_le32(i_gid_read(inode)); + fe->i_mode = cpu_to_le16(inode->i_mode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); + + ocfs2_set_links_count(fe, inode->i_nlink); + + fe->i_last_eb_blk = 0; + strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); + fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); + fe->i_atime = fe->i_ctime = fe->i_mtime = + cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = + cpu_to_le32(CURRENT_TIME.tv_nsec); + fe->i_dtime = 0; + + /* + * If supported, directories start with inline data. If inline + * isn't supported, but indexing is, we start them as indexed. + */ + feat = le16_to_cpu(fe->i_dyn_features); + if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { + fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); + + fe->id2.i_data.id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(osb->sb, fe)); + } else { + fel = &fe->id2.i_list; + fel->l_tree_depth = 0; + fel->l_next_free_rec = 0; + fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); + } + + ocfs2_journal_dirty(handle, *new_fe_bh); + + ocfs2_populate_inode(inode, fe, 1); + ocfs2_ci_set_new(osb, INODE_CACHE(inode)); + if (!ocfs2_mount_local(osb)) { + status = ocfs2_create_new_inode_locks(inode); + if (status < 0) + mlog_errno(status); + } + + oi->i_sync_tid = handle->h_transaction->t_tid; + oi->i_datasync_tid = handle->h_transaction->t_tid; + +leave: + if (status < 0) { + if (*new_fe_bh) { + brelse(*new_fe_bh); + *new_fe_bh = NULL; + } + } + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + u64 suballoc_loc, fe_blkno = 0; + u16 suballoc_bit; + + *new_fe_bh = NULL; + + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); + if (status < 0) { + mlog_errno(status); + return status; + } + + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + parent_fe_bh, handle, inode_ac, + fe_blkno, suballoc_loc, suballoc_bit); +} + +static int ocfs2_mkdir(struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + int ret; + + trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, + OCFS2_I(dir)->ip_blkno, mode); + ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_create(struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool excl) +{ + int ret; + + trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); + ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = d_inode(old_dentry); + struct inode *old_dir = d_inode(old_dentry->d_parent); + int err; + struct buffer_head *fe_bh = NULL; + struct buffer_head *old_dir_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + u64 old_de_ino; + + trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno, + old_dentry->d_name.len, old_dentry->d_name.name, + dentry->d_name.len, dentry->d_name.name); + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + dquot_initialize(dir); + + err = ocfs2_double_lock(osb, &old_dir_bh, old_dir, + &parent_fe_bh, dir, 0); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + return err; + } + + /* make sure both dirs have bhs + * get an extra ref on old_dir_bh if old==new */ + if (!parent_fe_bh) { + if (old_dir_bh) { + parent_fe_bh = old_dir_bh; + get_bh(parent_fe_bh); + } else { + mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str); + err = -EIO; + goto out; + } + } + + if (!dir->i_nlink) { + err = -ENOENT; + goto out; + } + + err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_de_ino); + if (err) { + err = -ENOENT; + goto out; + } + + /* + * Check whether another node removed the source inode while we + * were in the vfs. + */ + if (old_de_ino != OCFS2_I(inode)->ip_blkno) { + err = -ENOENT; + goto out; + } + + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (err) + goto out; + + err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (err < 0) { + mlog_errno(err); + goto out; + } + + err = ocfs2_inode_lock(inode, &fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto out; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) { + err = -EMLINK; + goto out_unlock_inode; + } + + handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + mlog_errno(err); + goto out_unlock_inode; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + + err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (err < 0) { + mlog_errno(err); + goto out_commit; + } + + inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; + ocfs2_set_links_count(fe, inode->i_nlink); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(handle, fe_bh); + + err = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, + parent_fe_bh, &lookup); + if (err) { + ocfs2_add_links_count(fe, -1); + drop_nlink(inode); + mlog_errno(err); + goto out_commit; + } + + err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (err) { + mlog_errno(err); + goto out_commit; + } + + ihold(inode); + d_instantiate(dentry, inode); + +out_commit: + ocfs2_commit_trans(osb, handle); + ocfs2_unblock_signals(&oldset); +out_unlock_inode: + ocfs2_inode_unlock(inode, 1); + +out: + ocfs2_double_unlock(old_dir, dir); + + brelse(fe_bh); + brelse(parent_fe_bh); + brelse(old_dir_bh); + + ocfs2_free_dir_lookup_result(&lookup); + + if (err) + mlog_errno(err); + + return err; +} + +/* + * Takes and drops an exclusive lock on the given dentry. This will + * force other nodes to drop it. + */ +static int ocfs2_remote_dentry_delete(struct dentry *dentry) +{ + int ret; + + ret = ocfs2_dentry_lock(dentry, 1); + if (ret) + mlog_errno(ret); + else + ocfs2_dentry_unlock(dentry, 1); + + return ret; +} + +static inline int ocfs2_inode_is_unlinkable(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) { + if (inode->i_nlink == 2) + return 1; + return 0; + } + + if (inode->i_nlink == 1) + return 1; + return 0; +} + +static int ocfs2_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + int child_locked = 0; + bool is_unlinkable = false; + struct inode *inode = d_inode(dentry); + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + u64 blkno; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_node_bh = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + + trace_ocfs2_unlink(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + dquot_initialize(dir); + + BUG_ON(d_inode(dentry->d_parent) != dir); + + if (inode == osb->root_inode) + return -EPERM; + + status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1, + OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + status = ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, dir, + &lookup); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (OCFS2_I(inode)->ip_blkno != blkno) { + status = -ENOENT; + + trace_ocfs2_unlink_noent( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, + OCFS2_I(inode)->ip_flags); + goto leave; + } + + status = ocfs2_inode_lock(inode, &fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + child_locked = 1; + + if (S_ISDIR(inode->i_mode)) { + if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) { + status = -ENOTEMPTY; + goto leave; + } + } + + status = ocfs2_remote_dentry_delete(dentry); + if (status < 0) { + /* This remote delete should succeed under all normal + * circumstances. */ + mlog_errno(status); + goto leave; + } + + if (ocfs2_inode_is_unlinkable(inode)) { + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(inode)->ip_blkno, + orphan_name, &orphan_insert, + false); + if (status < 0) { + mlog_errno(status); + goto leave; + } + is_unlinkable = true; + } + + handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* delete the name from the parent dir */ + status = ocfs2_delete_entry(handle, dir, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) + drop_nlink(inode); + drop_nlink(inode); + ocfs2_set_links_count(fe, inode->i_nlink); + ocfs2_journal_dirty(handle, fe_bh); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) + drop_nlink(dir); + + status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh); + if (status < 0) { + mlog_errno(status); + if (S_ISDIR(inode->i_mode)) + inc_nlink(dir); + goto leave; + } + + if (is_unlinkable) { + status = ocfs2_orphan_add(osb, handle, inode, fe_bh, + orphan_name, &orphan_insert, orphan_dir, false); + if (status < 0) + mlog_errno(status); + } + +leave: + if (handle) + ocfs2_commit_trans(osb, handle); + + if (child_locked) + ocfs2_inode_unlock(inode, 1); + + ocfs2_inode_unlock(dir, 1); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + brelse(fe_bh); + brelse(parent_node_bh); + + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&lookup); + + if (status && (status != -ENOTEMPTY) && (status != -ENOENT)) + mlog_errno(status); + + return status; +} + +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, + u64 src_inode_no, u64 dest_inode_no) +{ + int ret = 0, i = 0; + u64 parent_inode_no = 0; + u64 child_inode_no = src_inode_no; + struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 + while (1) { + child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); + if (IS_ERR(child_inode)) { + ret = PTR_ERR(child_inode); + break; + } + + ret = ocfs2_inode_lock(child_inode, NULL, 0); + if (ret < 0) { + iput(child_inode); + if (ret != -ENOENT) + mlog_errno(ret); + break; + } + + ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, + &parent_inode_no); + ocfs2_inode_unlock(child_inode, 0); + iput(child_inode); + if (ret < 0) { + ret = -ENOENT; + break; + } + + if (parent_inode_no == dest_inode_no) { + ret = 1; + break; + } + + if (parent_inode_no == osb->root_inode->i_ino) { + ret = 0; + break; + } + + child_inode_no = parent_inode_no; + + if (++i >= MAX_LOOKUP_TIMES) { + mlog(ML_NOTICE, "max lookup times reached, filesystem " + "may have nested directories, " + "src inode: %llu, dest inode: %llu.\n", + (unsigned long long)src_inode_no, + (unsigned long long)dest_inode_no); + ret = 0; + break; + } + } + + return ret; +} + +/* + * The only place this should be used is rename and link! + * if they have the same id, then the 1st one is the only one locked. + */ +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2, + int rename) +{ + int status; + int inode1_is_ancestor, inode2_is_ancestor; + struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); + struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); + struct buffer_head **tmpbh; + struct inode *tmpinode; + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* we always want to lock the one with the lower lockid first. + * and if they are nested, we lock ancestor first */ + if (oi1->ip_blkno != oi2->ip_blkno) { + inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, + oi1->ip_blkno); + if (inode1_is_ancestor < 0) { + status = inode1_is_ancestor; + goto bail; + } + + inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, + oi2->ip_blkno); + if (inode2_is_ancestor < 0) { + status = inode2_is_ancestor; + goto bail; + } + + if ((inode1_is_ancestor == 1) || + (oi1->ip_blkno < oi2->ip_blkno && + inode2_is_ancestor == 0)) { + /* switch id1 and id2 around */ + tmpbh = bh2; + bh2 = bh1; + bh1 = tmpbh; + + tmpinode = inode2; + inode2 = inode1; + inode1 = tmpinode; + } + /* lock id2 */ + status = ocfs2_inode_lock_nested(inode2, bh2, 1, + rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + } + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, bh1, 1, + rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT); + if (status < 0) { + /* + * An error return must mean that no cluster locks + * were held on function exit. + */ + if (oi1->ip_blkno != oi2->ip_blkno) { + ocfs2_inode_unlock(inode2, 1); + brelse(*bh2); + *bh2 = NULL; + } + + if (status != -ENOENT) + mlog_errno(status); + } + + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) +{ + ocfs2_inode_unlock(inode1, 1); + + if (inode1 != inode2) + ocfs2_inode_unlock(inode2, 1); +} + +static int ocfs2_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; + int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct inode *orphan_dir = NULL; + struct ocfs2_dinode *newfe = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *newfe_bh = NULL; + struct buffer_head *old_inode_bh = NULL; + struct ocfs2_super *osb = NULL; + u64 newfe_blkno, old_de_ino; + handle_t *handle = NULL; + struct buffer_head *old_dir_bh = NULL; + struct buffer_head *new_dir_bh = NULL; + u32 old_dir_nlink = old_dir->i_nlink; + struct ocfs2_dinode *old_di; + struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, }; + struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; + struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct ocfs2_dir_lookup_result target_insert = { NULL, }; + bool should_add_orphan = false; + + /* At some point it might be nice to break this function up a + * bit. */ + + trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry, + old_dentry->d_name.len, old_dentry->d_name.name, + new_dentry->d_name.len, new_dentry->d_name.name); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + osb = OCFS2_SB(old_dir->i_sb); + + if (new_inode) { + if (!igrab(new_inode)) + BUG(); + } + + /* Assume a directory hierarchy thusly: + * a/b/c + * a/d + * a,b,c, and d are all directories. + * + * from cwd of 'a' on both nodes: + * node1: mv b/c d + * node2: mv d b/c + * + * And that's why, just like the VFS, we need a file system + * rename lock. */ + if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) { + status = ocfs2_rename_lock(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + rename_lock = 1; + + /* here we cannot guarantee the inodes haven't just been + * changed, so check if they are nested again */ + status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, + old_inode->i_ino); + if (status < 0) { + mlog_errno(status); + goto bail; + } else if (status == 1) { + status = -EPERM; + trace_ocfs2_rename_not_permitted( + (unsigned long long)old_inode->i_ino, + (unsigned long long)new_dir->i_ino); + goto bail; + } + } + + /* if old and new are the same, this'll just do one lock. */ + status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, + &new_dir_bh, new_dir, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + parents_locked = 1; + + /* make sure both dirs have bhs + * get an extra ref on old_dir_bh if old==new */ + if (!new_dir_bh) { + if (old_dir_bh) { + new_dir_bh = old_dir_bh; + get_bh(new_dir_bh); + } else { + mlog(ML_ERROR, "no old_dir_bh!\n"); + status = -EIO; + goto bail; + } + } + + /* + * Aside from allowing a meta data update, the locking here + * also ensures that the downconvert thread on other nodes + * won't have to concurrently downconvert the inode and the + * dentry locks. + */ + status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1, + OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + old_child_locked = 1; + + status = ocfs2_remote_dentry_delete(old_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(old_inode->i_mode)) { + u64 old_inode_parent; + + update_dot_dot = 1; + status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, + old_inode, + &old_inode_dot_dot_res); + if (status) { + status = -EIO; + goto bail; + } + + if (old_inode_parent != OCFS2_I(old_dir)->ip_blkno) { + status = -EIO; + goto bail; + } + + if (!new_inode && new_dir != old_dir && + new_dir->i_nlink >= ocfs2_link_max(osb)) { + status = -EMLINK; + goto bail; + } + } + + status = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de_ino); + if (status) { + status = -ENOENT; + goto bail; + } + + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + if (old_de_ino != OCFS2_I(old_inode)->ip_blkno) { + status = -ENOENT; + goto bail; + } + + /* check if the target already exists (in which case we need + * to delete it */ + status = ocfs2_find_files_on_disk(new_dentry->d_name.name, + new_dentry->d_name.len, + &newfe_blkno, new_dir, + &target_lookup_res); + /* The only error we allow here is -ENOENT because the new + * file not existing is perfectly valid. */ + if ((status < 0) && (status != -ENOENT)) { + /* If we cannot find the file specified we should just */ + /* return the error... */ + mlog_errno(status); + goto bail; + } + if (status == 0) + target_exists = 1; + + if (!target_exists && new_inode) { + /* + * Target was unlinked by another node while we were + * waiting to get to ocfs2_rename(). There isn't + * anything we can do here to help the situation, so + * bubble up the appropriate error. + */ + status = -ENOENT; + goto bail; + } + + /* In case we need to overwrite an existing file, we blow it + * away first */ + if (target_exists) { + /* VFS didn't think there existed an inode here, but + * someone else in the cluster must have raced our + * rename to create one. Today we error cleanly, in + * the future we should consider calling iget to build + * a new struct inode for this entry. */ + if (!new_inode) { + status = -EACCES; + + trace_ocfs2_rename_target_exists(new_dentry->d_name.len, + new_dentry->d_name.name); + goto bail; + } + + if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { + status = -EACCES; + + trace_ocfs2_rename_disagree( + (unsigned long long)OCFS2_I(new_inode)->ip_blkno, + (unsigned long long)newfe_blkno, + OCFS2_I(new_inode)->ip_flags); + goto bail; + } + + status = ocfs2_inode_lock(new_inode, &newfe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + new_child_locked = 1; + + status = ocfs2_remote_dentry_delete(new_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + newfe = (struct ocfs2_dinode *) newfe_bh->b_data; + + trace_ocfs2_rename_over_existing( + (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? + (unsigned long long)newfe_bh->b_blocknr : 0ULL); + + if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(new_inode)->ip_blkno, + orphan_name, &orphan_insert, + false); + if (status < 0) { + mlog_errno(status); + goto bail; + } + should_add_orphan = true; + } + } else { + BUG_ON(d_inode(new_dentry->d_parent) != new_dir); + + status = ocfs2_check_dir_for_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, + new_dentry->d_name.name, + new_dentry->d_name.len, + &target_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (target_exists) { + if (S_ISDIR(new_inode->i_mode)) { + if (new_inode->i_nlink != 2 || + !ocfs2_empty_dir(new_inode)) { + status = -ENOTEMPTY; + goto bail; + } + } + status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode), + newfe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* change the dirent to point to the correct inode */ + status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, + old_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + new_dir->i_version++; + + if (S_ISDIR(new_inode->i_mode)) + ocfs2_set_links_count(newfe, 0); + else + ocfs2_add_links_count(newfe, -1); + ocfs2_journal_dirty(handle, newfe_bh); + if (should_add_orphan) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir, false); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + } else { + /* if the name was not found in new_dir, add it now */ + status = ocfs2_add_entry(handle, new_dentry, old_inode, + OCFS2_I(old_inode)->ip_blkno, + new_dir_bh, &target_insert); + } + + old_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_inode); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), + old_inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status >= 0) { + old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; + + old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); + old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(handle, old_inode_bh); + } else + mlog_errno(status); + + /* + * Now that the name has been added to new_dir, remove the old name. + * + * We don't keep any directory entry context around until now + * because the insert might have changed the type of directory + * we're dealing with. + */ + status = ocfs2_find_entry(old_dentry->d_name.name, + old_dentry->d_name.len, old_dir, + &old_entry_lookup); + if (status) + goto bail; + + status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (new_inode) { + drop_nlink(new_inode); + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + + if (update_dot_dot) { + status = ocfs2_update_entry(old_inode, handle, + &old_inode_dot_dot_res, new_dir); + drop_nlink(old_dir); + if (new_inode) { + drop_nlink(new_inode); + } else { + inc_nlink(new_dir); + mark_inode_dirty(new_dir); + } + } + mark_inode_dirty(old_dir); + ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh); + if (new_inode) { + mark_inode_dirty(new_inode); + ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh); + } + + if (old_dir != new_dir) { + /* Keep the same times on both directories.*/ + new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + + /* + * This will also pick up the i_nlink change from the + * block above. + */ + ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh); + } + + if (old_dir_nlink != old_dir->i_nlink) { + if (!old_dir_bh) { + mlog(ML_ERROR, "need to change nlink for old dir " + "%llu from %d to %d but bh is NULL!\n", + (unsigned long long)OCFS2_I(old_dir)->ip_blkno, + (int)old_dir_nlink, old_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access_di(handle, + INODE_CACHE(old_dir), + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) old_dir_bh->b_data; + ocfs2_set_links_count(fe, old_dir->i_nlink); + ocfs2_journal_dirty(handle, old_dir_bh); + } + } + ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); + status = 0; +bail: + if (rename_lock) + ocfs2_rename_unlock(osb); + + if (handle) + ocfs2_commit_trans(osb, handle); + + if (parents_locked) + ocfs2_double_unlock(old_dir, new_dir); + + if (old_child_locked) + ocfs2_inode_unlock(old_inode, 1); + + if (new_child_locked) + ocfs2_inode_unlock(new_inode, 1); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if (new_inode) + sync_mapping_buffers(old_inode->i_mapping); + + if (new_inode) + iput(new_inode); + + ocfs2_free_dir_lookup_result(&target_lookup_res); + ocfs2_free_dir_lookup_result(&old_entry_lookup); + ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res); + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&target_insert); + + brelse(newfe_bh); + brelse(old_inode_bh); + brelse(old_dir_bh); + brelse(new_dir_bh); + + if (status) + mlog_errno(status); + + return status; +} + +/* + * we expect i_size = strlen(symname). Copy symname into the file + * data, including the null terminator. + */ +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + const char *symname) +{ + struct buffer_head **bhs = NULL; + const char *c; + struct super_block *sb = osb->sb; + u64 p_blkno, p_blocks; + int virtual, blocks, status, i, bytes_left; + + bytes_left = i_size_read(inode) + 1; + /* we can't trust i_blocks because we're actually going to + * write i_size + 1 bytes. */ + blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks, + i_size_read(inode), blocks); + + /* Sanity check -- make sure we're going to fit. */ + if (bytes_left > + ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks, + NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* links can never be larger than one cluster so we know this + * is all going to be contiguous, but do a sanity check + * anyway. */ + if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + virtual = 0; + while(bytes_left > 0) { + c = &symname[virtual * sb->s_blocksize]; + + bhs[virtual] = sb_getblk(sb, p_blkno); + if (!bhs[virtual]) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), + bhs[virtual]); + + status = ocfs2_journal_access(handle, INODE_CACHE(inode), + bhs[virtual], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[virtual]->b_data, 0, sb->s_blocksize); + + memcpy(bhs[virtual]->b_data, c, + (bytes_left > sb->s_blocksize) ? sb->s_blocksize : + bytes_left); + + ocfs2_journal_dirty(handle, bhs[virtual]); + + virtual++; + p_blkno++; + bytes_left -= sb->s_blocksize; + } + + status = 0; +bail: + + if (bhs) { + for(i = 0; i < blocks; i++) + brelse(bhs[i]); + kfree(bhs); + } + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + int status, l, credits; + u64 newsize; + struct ocfs2_super *osb = NULL; + struct inode *inode = NULL; + struct super_block *sb; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_dinode *dirfe; + handle_t *handle = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *xattr_ac = NULL; + int want_clusters = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota = 0, did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; + + trace_ocfs2_symlink_begin(dir, dentry, symname, + dentry->d_name.len, dentry->d_name.name); + + dquot_initialize(dir); + + sb = dir->i_sb; + osb = OCFS2_SB(sb); + + l = strlen(symname) + 1; + + credits = ocfs2_calc_symlink_credits(sb); + + /* lock the parent directory */ + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!ocfs2_read_links_count(dirfe)) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto bail; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); + goto bail; + } + } + + /* calculate meta data/clusters for setting security xattr */ + if (si.enable) { + status = ocfs2_calc_security_init(dir, &si, &want_clusters, + &xattr_credits, &xattr_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* don't reserve bitmap space for fast symlinks. */ + if (l > ocfs2_fast_symlink_chars(sb)) + want_clusters += 1; + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, credits + xattr_credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + + status = dquot_alloc_inode(inode); + if (status) + goto bail; + did_quota_inode = 1; + + trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + inode->i_mode); + + status = ocfs2_mknod_locked(osb, dir, inode, + 0, &new_fe_bh, parent_fe_bh, handle, + inode_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; + inode->i_rdev = 0; + newsize = l - 1; + inode->i_op = &ocfs2_symlink_inode_operations; + if (l > ocfs2_fast_symlink_chars(sb)) { + u32 offset = 0; + + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status) + goto bail; + did_quota = 1; + inode->i_mapping->a_ops = &ocfs2_aops; + status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, + new_fe_bh, + handle, data_ac, NULL, + NULL); + if (status < 0) { + if (status != -ENOSPC && status != -EINTR) { + mlog(ML_ERROR, + "Failed to extend file to %llu\n", + (unsigned long long)newsize); + mlog_errno(status); + status = -ENOSPC; + } + goto bail; + } + i_size_write(inode, newsize); + inode->i_blocks = ocfs2_inode_sector_count(inode); + } else { + inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops; + memcpy((char *) fe->id2.i_symlink, symname, l); + i_size_write(inode, newsize); + inode->i_blocks = 0; + } + + status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!ocfs2_inode_is_fast_symlink(inode)) { + status = ocfs2_create_symlink_data(osb, handle, inode, + symname); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* + * Do this before adding the entry to the directory. We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. + */ + status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto bail; + } + + dl = dentry->d_fsdata; + + status = ocfs2_add_entry(handle, dentry, inode, + le64_to_cpu(fe->i_blkno), parent_fe_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + insert_inode_hash(inode); + d_instantiate(dentry, inode); +bail: + if (status < 0 && did_quota) + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); + + brelse(new_fe_bh); + brelse(parent_fe_bh); + kfree(si.value); + ocfs2_free_dir_lookup_result(&lookup); + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (xattr_ac) + ocfs2_free_alloc_context(xattr_ac); + if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; + clear_nlink(inode); + iput(inode); + } + + if (status) + mlog_errno(status); + + return status; +} + +static int ocfs2_blkno_stringify(u64 blkno, char *name) +{ + int status, namelen; + + namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx", + (long long)blkno); + if (namelen <= 0) { + if (namelen) + status = namelen; + else + status = -EINVAL; + mlog_errno(status); + goto bail; + } + if (namelen != OCFS2_ORPHAN_NAMELEN) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + trace_ocfs2_blkno_stringify(blkno, name, namelen); + + status = 0; +bail: + if (status < 0) + mlog_errno(status); + return status; +} + +static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + struct buffer_head **ret_orphan_dir_bh) +{ + struct inode *orphan_dir_inode; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + ret = -ENOENT; + mlog_errno(ret); + return ret; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (ret < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + mlog_errno(ret); + return ret; + } + + *ret_orphan_dir = orphan_dir_inode; + *ret_orphan_dir_bh = orphan_dir_bh; + + return 0; +} + +static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup, + bool dio) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; + + if (dio) { + ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } + + ret = ocfs2_blkno_stringify(blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + ret = ocfs2_blkno_stringify(blkno, name); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + namelen, lookup); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return 0; +} + +/** + * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for + * insertion of an orphan. + * @osb: ocfs2 file system + * @ret_orphan_dir: Orphan dir inode - returned locked! + * @blkno: Actual block number of the inode to be inserted into orphan dir. + * @lookup: dir lookup result, to be passed back into functions like + * ocfs2_orphan_add + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup, + bool dio) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode, + &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, + blkno, name, lookup, dio); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + *ret_orphan_dir = orphan_dir_inode; + +out: + brelse(orphan_dir_bh); + + if (ret) { + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + } + + if (ret) + mlog_errno(ret); + return ret; +} + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + char *name, + struct ocfs2_dir_lookup_result *lookup, + struct inode *orphan_dir_inode, + bool dio) +{ + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + struct ocfs2_dinode *orphan_fe; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; + + trace_ocfs2_orphan_add_begin( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* + * We're going to journal the change of i_flags and i_orphaned_slot. + * It's safe anyway, though some callers may duplicate the journaling. + * Journaling within the func just make the logic look more + * straightforward. + */ + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* we're a cluster, and nlink can change on disk from + * underneath us... */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, 1); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); + ocfs2_journal_dirty(handle, orphan_dir_bh); + + status = __ocfs2_add_entry(handle, orphan_dir_inode, name, + namelen, inode, + OCFS2_I(inode)->ip_blkno, + orphan_dir_bh, lookup); + if (status < 0) { + mlog_errno(status); + goto rollback; + } + + if (dio) { + /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan + * slot. + */ + fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num); + } else { + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + } + + ocfs2_journal_dirty(handle, fe_bh); + + trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + osb->slot_num); + +rollback: + if (status < 0) { + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, -1); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); + } + +leave: + brelse(orphan_dir_bh); + + return status; +} + +/* unlike orphan_add, we expect the orphan dir to already be locked here. */ +int ocfs2_orphan_del(struct ocfs2_super *osb, + handle_t *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh, + bool dio) +{ + const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; + char name[namelen + 1]; + struct ocfs2_dinode *orphan_fe; + int status = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + if (dio) { + status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + status = -EINVAL; + mlog_errno(status); + return status; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + trace_ocfs2_orphan_del( + (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, + name, strlen(name)); + + /* find it's spot in the orphan directory */ + status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, + &lookup); + if (status) { + mlog_errno(status); + goto leave; + } + + /* remove it from the orphan directory */ + status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* do the i_nlink dance! :) */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, -1); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); + ocfs2_journal_dirty(handle, orphan_dir_bh); + +leave: + ocfs2_free_dir_lookup_result(&lookup); + + if (status) + mlog_errno(status); + return status; +} + +/** + * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly + * allocated file. This is different from the typical 'add to orphan dir' + * operation in that the inode does not yet exist. This is a problem because + * the orphan dir stringifies the inode block number to come up with it's + * dirent. Obviously if the inode does not yet exist we have a chicken and egg + * problem. This function works around it by calling deeper into the orphan + * and suballoc code than other callers. Use this only by necessity. + * @dir: The directory which this inode will ultimately wind up under - not the + * orphan dir! + * @dir_bh: buffer_head the @dir inode block + * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled + * with the string to be used for orphan dirent. Pass back to the orphan dir + * code. + * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan + * dir code. + * @ret_di_blkno: block number where the new inode will be allocated. + * @orphan_insert: Dir insert context to be passed back into orphan dir code. + * @ret_inode_ac: Inode alloc context to be passed back to the allocator. + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prep_new_orphaned_file(struct inode *dir, + struct buffer_head *dir_bh, + char *orphan_name, + struct inode **ret_orphan_dir, + u64 *ret_di_blkno, + struct ocfs2_dir_lookup_result *orphan_insert, + struct ocfs2_alloc_context **ret_inode_ac) +{ + int ret; + u64 di_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct inode *orphan_dir = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* reserve an inode spot */ + ret = ocfs2_reserve_new_inode(osb, &inode_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac, + &di_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, + di_blkno, orphan_name, orphan_insert, + false); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (ret == 0) { + *ret_orphan_dir = orphan_dir; + *ret_di_blkno = di_blkno; + *ret_inode_ac = inode_ac; + /* + * orphan_name and orphan_insert are already up to + * date via prepare_orphan_dir + */ + } else { + /* Unroll reserve_new_inode* */ + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + /* Unroll orphan dir locking */ + mutex_unlock(&orphan_dir->i_mutex); + ocfs2_inode_unlock(orphan_dir, 1); + iput(orphan_dir); + } + + brelse(orphan_dir_bh); + + return ret; +} + +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode) +{ + int status, did_quota_inode = 0; + struct inode *inode = NULL; + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *parent_di_bh = NULL; + struct buffer_head *new_di_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + u64 uninitialized_var(di_blkno), suballoc_loc; + u16 suballoc_bit; + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh, + orphan_name, &orphan_dir, + &di_blkno, &orphan_insert, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = dquot_alloc_inode(inode); + if (status) + goto leave; + did_quota_inode = 1; + + status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac, + &suballoc_loc, + &suballoc_bit, di_blkno); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clear_nlink(inode); + /* do the real work now. */ + status = __ocfs2_mknod_locked(dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac, di_blkno, suballoc_loc, + suballoc_bit); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + di = (struct ocfs2_dinode *)new_di_bh->b_data; + status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, + &orphan_insert, orphan_dir, false); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* get open lock so that only nodes can't remove it from orphan dir. */ + status = ocfs2_open_lock(inode); + if (status < 0) + mlog_errno(status); + + insert_inode_hash(inode); +leave: + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if ((status < 0) && inode) { + clear_nlink(inode); + iput(inode); + } + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + brelse(new_di_bh); + + if (!status) + *new_inode = inode; + + ocfs2_free_dir_lookup_result(&orphan_insert); + + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + return status; +} + +static int ocfs2_dio_orphan_recovered(struct inode *inode) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return 0; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + return ret; +} + +#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode) +{ + char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct buffer_head *di_bh = NULL; + int status = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = NULL; + +restart: + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + /* + * Another append dio crashed? + * If so, wait for recovery first. + */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, + ocfs2_dio_orphan_recovered(inode), + msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); + goto restart; + } + + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, + OCFS2_I(inode)->ip_blkno, + orphan_name, + &orphan_insert, + true); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_ADD_TO_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name, + &orphan_insert, orphan_dir_inode, true); + if (status) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + ocfs2_free_dir_lookup_result(&orphan_insert); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) di_bh->b_data; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + le16_to_cpu(di->i_dio_orphaned_slot)); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail_unlock_inode; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))); + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, + inode, orphan_dir_bh, true); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + di->i_dio_orphaned_slot = 0; + + if (update_isize) { + status = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (status) + mlog_errno(status); + } else + ocfs2_journal_dirty(handle, di_bh); + +bail_commit: + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); + iput(orphan_dir_inode); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + int status = 0; + struct buffer_head *parent_di_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *dir_di, *di; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry, + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + if (!dir_di->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + goto leave; + } + + status = ocfs2_read_inode_block(inode, &di_bh); + if (status < 0) { + mlog_errno(status); + goto orphan_unlock; + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto orphan_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + di_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh, false); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL); + di->i_orphaned_slot = 0; + set_nlink(inode, 1); + ocfs2_set_links_count(di, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + ocfs2_journal_dirty(handle, di_bh); + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_di_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto out_commit; + } + + d_instantiate(dentry, inode); + status = 0; +out_commit: + ocfs2_commit_trans(osb, handle); +orphan_unlock: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); +leave: + + ocfs2_inode_unlock(dir, 1); + + brelse(di_bh); + brelse(parent_di_bh); + brelse(orphan_dir_bh); + + ocfs2_free_dir_lookup_result(&lookup); + + if (status) + mlog_errno(status); + + return status; +} + +const struct inode_operations ocfs2_dir_iops = { + .create = ocfs2_create, + .lookup = ocfs2_lookup, + .link = ocfs2_link, + .unlink = ocfs2_unlink, + .rmdir = ocfs2_unlink, + .symlink = ocfs2_symlink, + .mkdir = ocfs2_mkdir, + .mknod = ocfs2_mknod, + .rename = ocfs2_rename, + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, + .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, +}; diff --git a/kernel/fs/ocfs2/namei.h b/kernel/fs/ocfs2/namei.h new file mode 100644 index 000000000..5ddecce17 --- /dev/null +++ b/kernel/fs/ocfs2/namei.h @@ -0,0 +1,51 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_NAMEI_H +#define OCFS2_NAMEI_H + +extern const struct inode_operations ocfs2_dir_iops; + +struct dentry *ocfs2_get_parent(struct dentry *child); + +int ocfs2_orphan_del(struct ocfs2_super *osb, + handle_t *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh, + bool dio); +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode); +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode); +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end); +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *new_inode, + struct dentry *new_dentry); + +#endif /* OCFS2_NAMEI_H */ diff --git a/kernel/fs/ocfs2/ocfs1_fs_compat.h b/kernel/fs/ocfs2/ocfs1_fs_compat.h new file mode 100644 index 000000000..dfb313bda --- /dev/null +++ b/kernel/fs/ocfs2/ocfs1_fs_compat.h @@ -0,0 +1,109 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs1_fs_compat.h + * + * OCFS1 volume header definitions. OCFS2 creates valid but unmountable + * OCFS1 volume headers on the first two sectors of an OCFS2 volume. + * This allows an OCFS1 volume to see the partition and cleanly fail to + * mount it. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS1_FS_COMPAT_H +#define _OCFS1_FS_COMPAT_H + +#define OCFS1_MAX_VOL_SIGNATURE_LEN 128 +#define OCFS1_MAX_MOUNT_POINT_LEN 128 +#define OCFS1_MAX_VOL_ID_LENGTH 16 +#define OCFS1_MAX_VOL_LABEL_LEN 64 +#define OCFS1_MAX_CLUSTER_NAME_LEN 64 + +#define OCFS1_MAJOR_VERSION (2) +#define OCFS1_MINOR_VERSION (0) +#define OCFS1_VOLUME_SIGNATURE "OracleCFS" + +/* + * OCFS1 superblock. Lives at sector 0. + */ +struct ocfs1_vol_disk_hdr +{ +/*00*/ __u32 minor_version; + __u32 major_version; +/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN]; +/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN]; +/*108*/ __u64 serial_num; +/*110*/ __u64 device_size; + __u64 start_off; +/*120*/ __u64 bitmap_off; + __u64 publ_off; +/*130*/ __u64 vote_off; + __u64 root_bitmap_off; +/*140*/ __u64 data_start_off; + __u64 root_bitmap_size; +/*150*/ __u64 root_off; + __u64 root_size; +/*160*/ __u64 cluster_size; + __u64 num_nodes; +/*170*/ __u64 num_clusters; + __u64 dir_node_size; +/*180*/ __u64 file_node_size; + __u64 internal_off; +/*190*/ __u64 node_cfg_off; + __u64 node_cfg_size; +/*1A0*/ __u64 new_cfg_off; + __u32 prot_bits; + __s32 excl_mount; +/*1B0*/ +}; + + +struct ocfs1_disk_lock +{ +/*00*/ __u32 curr_master; + __u8 file_lock; + __u8 compat_pad[3]; /* Not in original definition. Used to + make the already existing alignment + explicit */ + __u64 last_write_time; +/*10*/ __u64 last_read_time; + __u32 writer_node_num; + __u32 reader_node_num; +/*20*/ __u64 oin_node_map; + __u64 dlock_seq_num; +/*30*/ +}; + +/* + * OCFS1 volume label. Lives at sector 1. + */ +struct ocfs1_vol_label +{ +/*00*/ struct ocfs1_disk_lock disk_lock; +/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN]; +/*70*/ __u16 label_len; +/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH]; +/*82*/ __u16 vol_id_len; +/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN]; +/*A4*/ __u16 cluster_name_len; +/*A6*/ +}; + + +#endif /* _OCFS1_FS_COMPAT_H */ + diff --git a/kernel/fs/ocfs2/ocfs2.h b/kernel/fs/ocfs2/ocfs2.h new file mode 100644 index 000000000..460c6c37e --- /dev/null +++ b/kernel/fs/ocfs2/ocfs2.h @@ -0,0 +1,919 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2.h + * + * Defines macros and structures used in OCFS2 + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_H +#define OCFS2_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* For union ocfs2_dlm_lksb */ +#include "stackglue.h" + +#include "ocfs2_fs.h" +#include "ocfs2_lockid.h" +#include "ocfs2_ioctl.h" + +/* For struct ocfs2_blockcheck_stats */ +#include "blockcheck.h" + +#include "reservations.h" + +/* Caching of metadata buffers */ + +/* Most user visible OCFS2 inodes will have very few pieces of + * metadata, but larger files (including bitmaps, etc) must be taken + * into account when designing an access scheme. We allow a small + * amount of inlined blocks to be stored on an array and grow the + * structure into a rb tree when necessary. */ +#define OCFS2_CACHE_INFO_MAX_ARRAY 2 + +/* Flags for ocfs2_caching_info */ + +enum ocfs2_caching_info_flags { + /* Indicates that the metadata cache is using the inline array */ + OCFS2_CACHE_FL_INLINE = 1<<1, +}; + +struct ocfs2_caching_operations; +struct ocfs2_caching_info { + /* + * The parent structure provides the locks, but because the + * parent structure can differ, it provides locking operations + * to struct ocfs2_caching_info. + */ + const struct ocfs2_caching_operations *ci_ops; + + /* next two are protected by trans_inc_lock */ + /* which transaction were we created on? Zero if none. */ + unsigned long ci_created_trans; + /* last transaction we were a part of. */ + unsigned long ci_last_trans; + + /* Cache structures */ + unsigned int ci_flags; + unsigned int ci_num_cached; + union { + sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; + struct rb_root ci_tree; + } ci_cache; +}; +/* + * Need this prototype here instead of in uptodate.h because journal.h + * uses it. + */ +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); + +/* this limits us to 256 nodes + * if we need more, we can do a kmalloc for the map */ +#define OCFS2_NODE_MAP_MAX_NODES 256 +struct ocfs2_node_map { + u16 num_nodes; + unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; +}; + +enum ocfs2_ast_action { + OCFS2_AST_INVALID = 0, + OCFS2_AST_ATTACH, + OCFS2_AST_CONVERT, + OCFS2_AST_DOWNCONVERT, +}; + +/* actions for an unlockast function to take. */ +enum ocfs2_unlock_action { + OCFS2_UNLOCK_INVALID = 0, + OCFS2_UNLOCK_CANCEL_CONVERT, + OCFS2_UNLOCK_DROP_LOCK, +}; + +/* ocfs2_lock_res->l_flags flags. */ +#define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized + * the lvb */ +#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ +#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) +#define OCFS2_LOCK_REFRESHING (0x00000020) +#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization + * for shutdown paths */ +#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track + * when to skip queueing + * a lock because it's + * about to be + * dropped. */ +#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ +#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ +#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a + call to dlm_lock. Only + exists with BUSY set. */ +#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread + * from downconverting + * before the upconvert + * has completed */ + +#define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster + * lock has already + * returned, do not block + * dc thread from + * downconverting */ + +struct ocfs2_lock_res_ops; + +typedef void (*ocfs2_lock_callback)(int status, unsigned long data); + +#ifdef CONFIG_OCFS2_FS_STATS +struct ocfs2_lock_stats { + u64 ls_total; /* Total wait in NSEC */ + u32 ls_gets; /* Num acquires */ + u32 ls_fail; /* Num failed acquires */ + + /* Storing max wait in usecs saves 24 bytes per inode */ + u32 ls_max; /* Max wait in USEC */ +}; +#endif + +struct ocfs2_lock_res { + void *l_priv; + struct ocfs2_lock_res_ops *l_ops; + + + struct list_head l_blocked_list; + struct list_head l_mask_waiters; + + unsigned long l_flags; + char l_name[OCFS2_LOCK_ID_MAX_LEN]; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + signed char l_level; + signed char l_requested; + signed char l_blocking; + + /* Data packed - type enum ocfs2_lock_type */ + unsigned char l_type; + + /* used from AST/BAST funcs. */ + /* Data packed - enum type ocfs2_ast_action */ + unsigned char l_action; + /* Data packed - enum type ocfs2_unlock_action */ + unsigned char l_unlock_action; + unsigned int l_pending_gen; + + spinlock_t l_lock; + + struct ocfs2_dlm_lksb l_lksb; + + wait_queue_head_t l_event; + + struct list_head l_debug_list; + +#ifdef CONFIG_OCFS2_FS_STATS + struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ + u32 l_lock_refresh; /* Disk refreshes */ + struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map l_lockdep_map; +#endif +}; + +enum ocfs2_orphan_reco_type { + ORPHAN_NO_NEED_TRUNCATE = 0, + ORPHAN_NEED_TRUNCATE, +}; + +enum ocfs2_orphan_scan_state { + ORPHAN_SCAN_ACTIVE, + ORPHAN_SCAN_INACTIVE +}; + +struct ocfs2_orphan_scan { + struct mutex os_lock; + struct ocfs2_super *os_osb; + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ + struct delayed_work os_orphan_scan_work; + struct timespec os_scantime; /* time this node ran the scan */ + u32 os_count; /* tracks node specific scans */ + u32 os_seqno; /* tracks cluster wide scans */ + atomic_t os_state; /* ACTIVE or INACTIVE */ +}; + +struct ocfs2_dlm_debug { + struct kref d_refcnt; + struct dentry *d_locking_state; + struct list_head d_lockres_tracking; +}; + +enum ocfs2_vol_state +{ + VOLUME_INIT = 0, + VOLUME_MOUNTED, + VOLUME_MOUNTED_QUOTAS, + VOLUME_DISMOUNTED, + VOLUME_DISABLED +}; + +struct ocfs2_alloc_stats +{ + atomic_t moves; + atomic_t local_data; + atomic_t bitmap_data; + atomic_t bg_allocs; + atomic_t bg_extends; +}; + +enum ocfs2_local_alloc_state +{ + OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for + * this mountpoint. */ + OCFS2_LA_ENABLED, /* Local alloc is in use. */ + OCFS2_LA_THROTTLED, /* Local alloc is in use, but number + * of bits has been reduced. */ + OCFS2_LA_DISABLED /* Local alloc has temporarily been + * disabled. */ +}; + +enum ocfs2_mount_options +{ + OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ + OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ + OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ + OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ + OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ + OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ + OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ + OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ + OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access + control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT + writes */ + OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ + OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ + + OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ +}; + +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 + +struct ocfs2_journal; +struct ocfs2_slot_info; +struct ocfs2_recovery_map; +struct ocfs2_replay_map; +struct ocfs2_quota_recovery; +struct ocfs2_super +{ + struct task_struct *commit_task; + struct super_block *sb; + struct inode *root_inode; + struct inode *sys_root_inode; + struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; + struct inode **local_system_inodes; + + struct ocfs2_slot_info *slot_info; + + u32 *slot_recovery_generations; + + spinlock_t node_map_lock; + + u64 root_blkno; + u64 system_dir_blkno; + u64 bitmap_blkno; + u32 bitmap_cpg; + u8 *uuid; + char *uuid_str; + u32 uuid_hash; + u8 *vol_label; + u64 first_cluster_group_blkno; + u32 fs_generation; + + u32 s_feature_compat; + u32 s_feature_incompat; + u32 s_feature_ro_compat; + + /* Protects s_next_generation, osb_flags and s_inode_steal_slot. + * Could protect more on osb as it's very short lived. + */ + spinlock_t osb_lock; + u32 s_next_generation; + unsigned long osb_flags; + s16 s_inode_steal_slot; + s16 s_meta_steal_slot; + atomic_t s_num_inodes_stolen; + atomic_t s_num_meta_stolen; + + unsigned long s_mount_opt; + unsigned int s_atime_quantum; + + unsigned int max_slots; + unsigned int node_num; + int slot_num; + int preferred_slot; + int s_sectsize_bits; + int s_clustersize; + int s_clustersize_bits; + unsigned int s_xattr_inline_size; + + atomic_t vol_state; + struct mutex recovery_lock; + struct ocfs2_recovery_map *recovery_map; + struct ocfs2_replay_map *replay_map; + struct task_struct *recovery_thread_task; + int disable_recovery; + wait_queue_head_t checkpoint_event; + struct ocfs2_journal *journal; + unsigned long osb_commit_interval; + + struct delayed_work la_enable_wq; + + /* + * Must hold local alloc i_mutex and osb->osb_lock to change + * local_alloc_bits. Reads can be done under either lock. + */ + unsigned int local_alloc_bits; + unsigned int local_alloc_default_bits; + /* osb_clusters_at_boot can become stale! Do not trust it to + * be up to date. */ + unsigned int osb_clusters_at_boot; + + enum ocfs2_local_alloc_state local_alloc_state; /* protected + * by osb_lock */ + + struct buffer_head *local_alloc_bh; + + u64 la_last_gd; + + struct ocfs2_reservation_map osb_la_resmap; + + unsigned int osb_resv_level; + unsigned int osb_dir_resv_level; + + /* Next three fields are for local node slot recovery during + * mount. */ + int dirty; + struct ocfs2_dinode *local_alloc_copy; + struct ocfs2_quota_recovery *quota_rec; + + struct ocfs2_blockcheck_stats osb_ecc_stats; + struct ocfs2_alloc_stats alloc_stats; + char dev_str[20]; /* "major,minor" of the device */ + + u8 osb_stackflags; + + char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; + struct ocfs2_cluster_connection *cconn; + struct ocfs2_lock_res osb_super_lockres; + struct ocfs2_lock_res osb_rename_lockres; + struct ocfs2_lock_res osb_nfs_sync_lockres; + struct ocfs2_dlm_debug *osb_dlm_debug; + + struct dentry *osb_debug_root; + struct dentry *osb_ctxt; + + wait_queue_head_t recovery_event; + + spinlock_t dc_task_lock; + struct task_struct *dc_task; + wait_queue_head_t dc_event; + unsigned long dc_wake_sequence; + unsigned long dc_work_sequence; + + /* + * Any thread can add locks to the list, but the downconvert + * thread is the only one allowed to remove locks. Any change + * to this rule requires updating + * ocfs2_downconvert_thread_do_work(). + */ + struct list_head blocked_lock_list; + unsigned long blocked_lock_count; + + /* List of dquot structures to drop last reference to */ + struct llist_head dquot_drop_list; + struct work_struct dquot_drop_work; + + wait_queue_head_t osb_mount_event; + + /* Truncate log info */ + struct inode *osb_tl_inode; + struct buffer_head *osb_tl_bh; + struct delayed_work osb_truncate_log_wq; + atomic_t osb_tl_disable; + /* + * How many clusters in our truncate log. + * It must be protected by osb_tl_inode->i_mutex. + */ + unsigned int truncated_clusters; + + struct ocfs2_node_map osb_recovering_orphan_dirs; + unsigned int *osb_orphan_wipes; + wait_queue_head_t osb_wipe_event; + + struct ocfs2_orphan_scan osb_orphan_scan; + + /* used to protect metaecc calculation check of xattr. */ + spinlock_t osb_xattr_lock; + + unsigned int osb_dx_mask; + u32 osb_dx_seed[4]; + + /* the group we used to allocate inodes. */ + u64 osb_inode_alloc_group; + + /* rb tree root for refcount lock. */ + struct rb_root osb_rf_lock_tree; + struct ocfs2_refcount_tree *osb_ref_tree_lru; + + struct mutex system_file_mutex; +}; + +#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) + +/* Useful typedef for passing around journal access functions */ +typedef int (*ocfs2_journal_access_func)(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); + +static inline int ocfs2_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) + return 0; + return 1; +} + +static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) + return 1; + return 0; +} + +static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) +{ + /* + * Support for sparse files is a pre-requisite + */ + if (!ocfs2_sparse_alloc(osb)) + return 0; + + if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) + return 1; + return 0; +} + +static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) + return 1; + return 0; +} + + +static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) + return 1; + return 0; +} + +static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) + return 1; + return 0; +} + +static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) + return 1; + return 0; +} + +static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) + return 1; + return 0; +} + +static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) + return 1; + return 0; +} + +static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) +{ + if (ocfs2_supports_indexed_dirs(osb)) + return OCFS2_DX_LINK_MAX; + return OCFS2_LINK_MAX; +} + +static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) +{ + u32 nlink = le16_to_cpu(di->i_links_count); + u32 hi = le16_to_cpu(di->i_links_count_hi); + + if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + + return nlink; +} + +static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) +{ + u16 lo, hi; + + lo = nlink; + hi = nlink >> OCFS2_LINKS_HI_SHIFT; + + di->i_links_count = cpu_to_le16(lo); + di->i_links_count_hi = cpu_to_le16(hi); +} + +static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) +{ + u32 links = ocfs2_read_links_count(di); + + links += n; + + ocfs2_set_links_count(di, links); +} + +static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) + return 1; + return 0; +} + +/* set / clear functions because cluster events can make these happen + * in parallel so we want the transitions to be atomic. this also + * means that any future flags osb_flags must be protected by spinlock + * too! */ +static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags |= flag; + spin_unlock(&osb->osb_lock); +} + +static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, + int hard) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); + if (hard) + osb->osb_flags |= OCFS2_OSB_HARD_RO; + else + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); +} + +static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_HARD_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & + (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | + OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); +} + +static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) +{ + return ocfs2_o2cb_stack(osb) && + (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); +} + +static inline int ocfs2_mount_local(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); +} + +static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & + OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); +} + + +#define OCFS2_IS_VALID_DINODE(ptr) \ + (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) + +#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ + (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) + +#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ + (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) + + +#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ + (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) + +#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ + (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) + +#define OCFS2_IS_VALID_DX_ROOT(ptr) \ + (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) + +#define OCFS2_IS_VALID_DX_LEAF(ptr) \ + (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) + +#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ + (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) + +static inline unsigned long ino_from_blkno(struct super_block *sb, + u64 blkno) +{ + return (unsigned long)(blkno & (u64)ULONG_MAX); +} + +static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, + u32 clusters) +{ + int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u64)clusters << c_to_b_bits; +} + +static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u32)(blocks >> b_to_c_bits); +} + +static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + bytes += OCFS2_SB(sb)->s_clustersize - 1; + /* OCFS2 just cannot have enough clusters to overflow this */ + clusters = (unsigned int)(bytes >> cl_bits); + + return clusters; +} + +static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = (unsigned int)(bytes >> cl_bits); + return clusters; +} + +static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, + u64 bytes) +{ + bytes += sb->s_blocksize - 1; + return bytes >> sb->s_blocksize_bits; +} + +static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, + u32 clusters) +{ + return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; +} + +static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, + u64 blocks) +{ + int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; + unsigned int clusters; + + clusters = ocfs2_blocks_to_clusters(sb, blocks); + return (u64)clusters << bits; +} + +static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = ocfs2_clusters_for_bytes(sb, bytes); + return (u64)clusters << cl_bits; +} + +static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, + u64 bytes) +{ + u64 blocks; + + blocks = ocfs2_blocks_for_bytes(sb, bytes); + return blocks << sb->s_blocksize_bits; +} + +static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) +{ + return (unsigned long)((bytes + 511) >> 9); +} + +static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, + unsigned long pg_index) +{ + u32 clusters = pg_index; + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + + if (unlikely(PAGE_CACHE_SHIFT > cbits)) + clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); + else if (PAGE_CACHE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + + return clusters; +} + +/* + * Find the 1st page index which covers the given clusters. + */ +static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, + u32 clusters) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + pgoff_t index = clusters; + + if (PAGE_CACHE_SHIFT > cbits) { + index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits); + } else if (PAGE_CACHE_SHIFT < cbits) { + index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT); + } + + return index; +} + +static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int pages_per_cluster = 1; + + if (PAGE_CACHE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + + return pages_per_cluster; +} + +static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, + unsigned int megs) +{ + BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); + + return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + +static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, + unsigned int clusters) +{ + return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + +static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) +{ + __set_bit_le(bit, bitmap); +} +#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) + +static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) +{ + __clear_bit_le(bit, bitmap); +} +#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) + +#define ocfs2_test_bit test_bit_le +#define ocfs2_find_next_zero_bit find_next_zero_bit_le +#define ocfs2_find_next_bit find_next_bit_le + +static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" +#endif + return addr; +} + +static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + ocfs2_set_bit(bit, bitmap); +} + +static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + ocfs2_clear_bit(bit, bitmap); +} + +static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + return ocfs2_test_bit(bit, bitmap); +} + +static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, + int start) +{ + int fix = 0, ret, tmpmax; + bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); + tmpmax = max + fix; + start += fix; + + ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +#endif /* OCFS2_H */ + diff --git a/kernel/fs/ocfs2/ocfs2_fs.h b/kernel/fs/ocfs2/ocfs2_fs.h new file mode 100644 index 000000000..db64ce2d4 --- /dev/null +++ b/kernel/fs/ocfs2/ocfs2_fs.h @@ -0,0 +1,1651 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_fs.h + * + * On-disk structures for OCFS2. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_FS_H +#define _OCFS2_FS_H + +/* Version */ +#define OCFS2_MAJOR_REV_LEVEL 0 +#define OCFS2_MINOR_REV_LEVEL 90 + +/* + * An OCFS2 volume starts this way: + * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. + * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. + * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. + * + * All other structures are found from the superblock information. + * + * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a + * blocksize of 2K, it is 4096 bytes into disk. + */ +#define OCFS2_SUPER_BLOCK_BLKNO 2 + +/* + * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could + * grow if needed. + */ +#define OCFS2_MIN_CLUSTERSIZE 4096 +#define OCFS2_MAX_CLUSTERSIZE 1048576 + +/* + * Blocks cannot be bigger than clusters, so the maximum blocksize is the + * minimum cluster size. + */ +#define OCFS2_MIN_BLOCKSIZE 512 +#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE + +/* Filesystem magic number */ +#define OCFS2_SUPER_MAGIC 0x7461636f + +/* Object signatures */ +#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" +#define OCFS2_INODE_SIGNATURE "INODE01" +#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" +#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" +#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" +#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" +#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" +#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" +#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" + +/* Compatibility flags */ +#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_compat & (mask) ) +#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) +#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) +#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat |= (mask) +#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat |= (mask) +#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat |= (mask) +#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat &= ~(mask) +#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) +#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat &= ~(mask) + +#define OCFS2_FEATURE_COMPAT_SUPP (OCFS2_FEATURE_COMPAT_BACKUP_SB \ + | OCFS2_FEATURE_COMPAT_JBD2_SB) +#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ + | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ + | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ + | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ + | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ + | OCFS2_FEATURE_INCOMPAT_XATTR \ + | OCFS2_FEATURE_INCOMPAT_META_ECC \ + | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ + | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ + | OCFS2_FEATURE_INCOMPAT_APPEND_DIO) +#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ + | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + +/* + * Heartbeat-only devices are missing journals and other files. The + * filesystem driver can't load them, but the library can. Never put + * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. + */ +#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 + +/* + * tunefs sets this incompat flag before starting the resize and clears it + * at the end. This flag protects users from inadvertently mounting the fs + * after an aborted run without fsck-ing. + */ +#define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG 0x0004 + +/* Used to denote a non-clustered volume */ +#define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008 + +/* Support for sparse allocation in b-trees */ +#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 + +/* + * Tunefs sets this incompat flag before starting an operation which + * would require cleanup on abort. This is done to protect users from + * inadvertently mounting the fs after an aborted run without + * fsck-ing. + * + * s_tunefs_flags on the super block describes precisely which + * operations were in progress. + */ +#define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020 + +/* Support for data packed into inode blocks */ +#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 + +/* + * Support for alternate, userspace cluster stacks. If set, the superblock + * field s_cluster_info contains a tag for the alternate stack in use as + * well as the name of the cluster being joined. + * mount.ocfs2 must pass in a matching stack name. + * + * If not set, the classic stack will be used. This is compatbile with + * all older versions. + */ +#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 + +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + +/* Support for extended attributes */ +#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 + +/* Support for indexed directores */ +#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 + +/* Metadata checksum and error correction */ +#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 + +/* Refcount tree support */ +#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 + +/* Discontigous block groups */ +#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 + +/* + * Incompat bit to indicate useable clusterinfo with stackflags for all + * cluster stacks (userspace adnd o2cb). If this bit is set, + * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. + */ +#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 + +/* + * Append Direct IO support + */ +#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 + +/* + * backup superblock flag is used to indicate that this volume + * has backup superblocks. + */ +#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 + +/* + * The filesystem will correctly handle journal feature bits. + */ +#define OCFS2_FEATURE_COMPAT_JBD2_SB 0x0002 + +/* + * Unwritten extents support. + */ +#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 + +/* + * Maintain quota information for this filesystem + */ +#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 +#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 + + +/* The byte offset of the first backup block will be 1G. + * The following will be 4G, 16G, 64G, 256G and 1T. + */ +#define OCFS2_BACKUP_SB_START 1 << 30 + +/* the max backup superblock nums */ +#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 + +/* + * Flags on ocfs2_super_block.s_tunefs_flags + */ +#define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */ + +/* + * Flags on ocfs2_dinode.i_flags + */ +#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ +#define OCFS2_UNUSED2_FL (0x00000002) +#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ +#define OCFS2_UNUSED3_FL (0x00000008) +/* System inode flags */ +#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ +#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ +#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ +#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ +#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ +#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ +#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ +#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ +#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ +#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially + * for dio */ + +/* + * Flags on ocfs2_dinode.i_dyn_features + * + * These can change much more often than i_flags. When adding flags, + * keep in mind that i_dyn_features is only 16 bits wide. + */ +#define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */ +#define OCFS2_HAS_XATTR_FL (0x0002) +#define OCFS2_INLINE_XATTR_FL (0x0004) +#define OCFS2_INDEXED_DIR_FL (0x0008) +#define OCFS2_HAS_REFCOUNT_FL (0x0010) + +/* Inode attributes, keep in sync with EXT2 */ +#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */ +#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */ +#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */ +#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ +#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ +#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ +#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ +#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ +/* Reserved for compression usage... */ +#define OCFS2_DIRTY_FL FS_DIRTY_FL +#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ +#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ +#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ +/* End compression flags --- maybe not all used */ +#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */ +#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ +#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ +#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ +#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ +#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ +#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ +#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + +#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ +#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +/* + * Extent record flags (e_node.leaf.flags) + */ +#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but + * unwritten */ +#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference + * counted in an associated + * refcount tree */ + +/* + * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) + */ +#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ + +/* + * superblock s_state flags + */ +#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ + +/* Limit of space in ocfs2_dir_entry */ +#define OCFS2_MAX_FILENAME_LEN 255 + +/* Maximum slots on an ocfs2 file system */ +#define OCFS2_MAX_SLOTS 255 + +/* Slot map indicator for an empty slot */ +#define OCFS2_INVALID_SLOT -1 + +#define OCFS2_VOL_UUID_LEN 16 +#define OCFS2_MAX_VOL_LABEL_LEN 64 + +/* The cluster stack fields */ +#define OCFS2_STACK_LABEL_LEN 4 +#define OCFS2_CLUSTER_NAME_LEN 16 + +/* Classic (historically speaking) cluster stack */ +#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" + +/* Journal limits (in bytes) */ +#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) + +/* + * Inline extended attribute size (in bytes) + * The value chosen should be aligned to 16 byte boundaries. + */ +#define OCFS2_MIN_XATTR_INLINE_SIZE 256 + +/* + * Cluster info flags (ocfs2_cluster_info.ci_stackflags) + */ +#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) + +struct ocfs2_system_inode_info { + char *si_name; + int si_iflags; + int si_mode; +}; + +/* System file index */ +enum { + BAD_BLOCK_SYSTEM_INODE = 0, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, + SLOT_MAP_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE + HEARTBEAT_SYSTEM_INODE, + GLOBAL_BITMAP_SYSTEM_INODE, + USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE +#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE + ORPHAN_DIR_SYSTEM_INODE, + EXTENT_ALLOC_SYSTEM_INODE, + INODE_ALLOC_SYSTEM_INODE, + JOURNAL_SYSTEM_INODE, + LOCAL_ALLOC_SYSTEM_INODE, + TRUNCATE_LOG_SYSTEM_INODE, + LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE + NUM_SYSTEM_INODES +}; +#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE +#define NUM_LOCAL_SYSTEM_INODES \ + (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) + +static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { + /* Global system inodes (single copy) */ + /* The first two are only used from userspace mfks/tunefs */ + [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, + [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + + /* These are used by the running filesystem */ + [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, + [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, + [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, + [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + + /* Slot-specific system inodes (one copy per slot) */ + [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, + [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, + [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, + [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, + [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, +}; + +/* Parameter passed from mount.ocfs2 to module */ +#define OCFS2_HB_NONE "heartbeat=none" +#define OCFS2_HB_LOCAL "heartbeat=local" +#define OCFS2_HB_GLOBAL "heartbeat=global" + +/* + * OCFS2 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define OCFS2_FT_UNKNOWN 0 +#define OCFS2_FT_REG_FILE 1 +#define OCFS2_FT_DIR 2 +#define OCFS2_FT_CHRDEV 3 +#define OCFS2_FT_BLKDEV 4 +#define OCFS2_FT_FIFO 5 +#define OCFS2_FT_SOCK 6 +#define OCFS2_FT_SYMLINK 7 + +#define OCFS2_FT_MAX 8 + +/* + * OCFS2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define OCFS2_DIR_PAD 4 +#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) +#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) +#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ + OCFS2_DIR_ROUND) & \ + ~OCFS2_DIR_ROUND) +#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) + +#define OCFS2_LINK_MAX 32000 +#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) +#define OCFS2_LINKS_HI_SHIFT 16 +#define OCFS2_DX_ENTRIES_MAX (0xffffffffU) + +#define S_SHIFT 12 +static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, + [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, +}; + + +/* + * Convenience casts + */ +#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) + +/* + * Block checking structure. This is used in metadata to validate the + * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all + * zeros. + */ +struct ocfs2_block_check { +/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ + __le16 bc_ecc; /* Single-error-correction parity vector. + This is a simple Hamming code dependent + on the blocksize. OCFS2's maximum + blocksize, 4K, requires 16 parity bits, + so we fit in __le16. */ + __le16 bc_reserved1; +/*08*/ +}; + +/* + * On disk extent record for OCFS2 + * It describes a range of clusters on disk. + * + * Length fields are divided into interior and leaf node versions. + * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. + */ +struct ocfs2_extent_rec { +/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ + union { + __le32 e_int_clusters; /* Clusters covered by all children */ + struct { + __le16 e_leaf_clusters; /* Clusters covered by this + extent */ + __u8 e_reserved1; + __u8 e_flags; /* Extent flags */ + }; + }; + __le64 e_blkno; /* Physical disk offset, in blocks */ +/*10*/ +}; + +struct ocfs2_chain_rec { + __le32 c_free; /* Number of free bits in this chain. */ + __le32 c_total; /* Number of total bits in this chain */ + __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ +}; + +struct ocfs2_truncate_rec { + __le32 t_start; /* 1st cluster in this log */ + __le32 t_clusters; /* Number of total clusters covered */ +}; + +/* + * On disk extent list for OCFS2 (node in the tree). Note that this + * is contained inside ocfs2_dinode or ocfs2_extent_block, so the + * offsets are relative to ocfs2_dinode.id2.i_list or + * ocfs2_extent_block.h_list, respectively. + */ +struct ocfs2_extent_list { +/*00*/ __le16 l_tree_depth; /* Extent tree depth from this + point. 0 means data extents + hang directly off this + header (a leaf) + NOTE: The high 8 bits cannot be + used - tree_depth is never that big. + */ + __le16 l_count; /* Number of extent records */ + __le16 l_next_free_rec; /* Next unused extent slot */ + __le16 l_reserved1; + __le64 l_reserved2; /* Pad to + sizeof(ocfs2_extent_rec) */ +/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ +}; + +/* + * On disk allocation chain list for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_chain. + */ +struct ocfs2_chain_list { +/*00*/ __le16 cl_cpg; /* Clusters per Block Group */ + __le16 cl_bpc; /* Bits per cluster */ + __le16 cl_count; /* Total chains in this list */ + __le16 cl_next_free_rec; /* Next unused chain slot */ + __le64 cl_reserved1; +/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ +}; + +/* + * On disk deallocation log for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_dealloc. + */ +struct ocfs2_truncate_log { +/*00*/ __le16 tl_count; /* Total records in this log */ + __le16 tl_used; /* Number of records in use */ + __le32 tl_reserved1; +/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ +}; + +/* + * On disk extent block (indirect block) for OCFS2 + */ +struct ocfs2_extent_block +{ +/*00*/ __u8 h_signature[8]; /* Signature for verification */ + struct ocfs2_block_check h_check; /* Error checking */ +/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this + extent_header belongs to */ + __le16 h_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 h_fs_generation; /* Must match super block */ + __le64 h_blkno; /* Offset on disk, in blocks */ +/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this + eb belongs to. Only valid + if allocated from a + discontiguous block group */ + __le64 h_next_leaf_blk; /* Offset on disk, in blocks, + of next leaf header pointing + to data */ +/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ +/* Actual on-disk size is one block */ +}; + +/* + * On disk slot map for OCFS2. This defines the contents of the "slot_map" + * system file. A slot is valid if it contains a node number >= 0. The + * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. + */ +struct ocfs2_slot_map { +/*00*/ __le16 sm_slots[0]; +/* + * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, + * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. + */ +}; + +struct ocfs2_extended_slot { +/*00*/ __u8 es_valid; + __u8 es_reserved1[3]; + __le32 es_node_num; +/*10*/ +}; + +/* + * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP + * is set. It separates out the valid marker from the node number, and + * has room to grow. Unlike the old slot map, this format is defined by + * i_size. + */ +struct ocfs2_slot_map_extended { +/*00*/ struct ocfs2_extended_slot se_slots[0]; +/* + * Actual size is i_size of the slot_map system file. It should + * match s_max_slots * sizeof(struct ocfs2_extended_slot) + */ +}; + +/* + * ci_stackflags is only valid if the incompat bit + * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. + */ +struct ocfs2_cluster_info { +/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; + union { + __le32 ci_reserved; + struct { + __u8 ci_stackflags; + __u8 ci_reserved1; + __u8 ci_reserved2; + __u8 ci_reserved3; + }; + }; +/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; +/*18*/ +}; + +/* + * On disk superblock for OCFS2 + * Note that it is contained inside an ocfs2_dinode, so all offsets + * are relative to the start of ocfs2_dinode.id2. + */ +struct ocfs2_super_block { +/*00*/ __le16 s_major_rev_level; + __le16 s_minor_rev_level; + __le16 s_mnt_count; + __le16 s_max_mnt_count; + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le32 s_checkinterval; /* Max time between checks */ +/*10*/ __le64 s_lastcheck; /* Time of last check */ + __le32 s_creator_os; /* OS */ + __le32 s_feature_compat; /* Compatible feature set */ +/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ + __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ + __le64 s_root_blkno; /* Offset, in blocks, of root directory + dinode */ +/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system + directory dinode */ + __le32 s_blocksize_bits; /* Blocksize for this fs */ + __le32 s_clustersize_bits; /* Clustersize for this fs */ +/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts + before tunefs required */ + __le16 s_tunefs_flag; + __le32 s_uuid_hash; /* hash value of uuid */ + __le64 s_first_cluster_group; /* Block offset of 1st cluster + * group header */ +/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ +/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ +/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either + userspace or clusterinfo + INCOMPAT flag set. */ +/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size + for this fs*/ + __le16 s_reserved0; + __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. + * s_uuid_hash serves as seed[3]. */ +/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ +/*140*/ + + /* + * NOTE: As stated above, all offsets are relative to + * ocfs2_dinode.id2, which is at 0xC0 in the inode. + * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within + * our smallest blocksize, which is 512 bytes. To ensure this, + * we reserve the space in s_reserved2. Anything past s_reserved2 + * will not be available on the smallest blocksize. + */ +}; + +/* + * Local allocation bitmap for OCFS2 slots + * Note that it exists inside an ocfs2_dinode, so all offsets are + * relative to the start of ocfs2_dinode.id2. + */ +struct ocfs2_local_alloc +{ +/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ + __le16 la_size; /* Size of included bitmap, in bytes */ + __le16 la_reserved1; + __le64 la_reserved2; +/*10*/ __u8 la_bitmap[0]; +}; + +/* + * Data-in-inode header. This is only used if i_dyn_features has + * OCFS2_INLINE_DATA_FL set. + */ +struct ocfs2_inline_data +{ +/*00*/ __le16 id_count; /* Number of bytes that can be used + * for data, starting at id_data */ + __le16 id_reserved0; + __le32 id_reserved1; + __u8 id_data[0]; /* Start of user data */ +}; + +/* + * On disk inode for OCFS2 + */ +struct ocfs2_dinode { +/*00*/ __u8 i_signature[8]; /* Signature for validation */ + __le32 i_generation; /* Generation number */ + __le16 i_suballoc_slot; /* Slot suballocator this inode + belongs to */ + __le16 i_suballoc_bit; /* Bit offset in suballocator + block group */ +/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */ + __le16 i_xattr_inline_size; + __le32 i_clusters; /* Cluster count */ + __le32 i_uid; /* Owner UID */ + __le32 i_gid; /* Owning GID */ +/*20*/ __le64 i_size; /* Size in bytes */ + __le16 i_mode; /* File mode */ + __le16 i_links_count; /* Links count */ + __le32 i_flags; /* File flags */ +/*30*/ __le64 i_atime; /* Access time */ + __le64 i_ctime; /* Creation time */ +/*40*/ __le64 i_mtime; /* Modification time */ + __le64 i_dtime; /* Deletion time */ +/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ + __le64 i_last_eb_blk; /* Pointer to last extent + block */ +/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ + __le32 i_atime_nsec; + __le32 i_ctime_nsec; + __le32 i_mtime_nsec; +/*70*/ __le32 i_attr; + __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL + was set in i_flags */ + __le16 i_dyn_features; + __le64 i_xattr_loc; +/*80*/ struct ocfs2_block_check i_check; /* Error checking */ +/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ +/*90*/ __le64 i_refcount_loc; + __le64 i_suballoc_loc; /* Suballocator block group this + inode belongs to. Only valid + if allocated from a + discontiguous block group */ +/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ + __le16 i_reserved1[3]; + __le64 i_reserved2[2]; +/*B8*/ union { + __le64 i_pad1; /* Generic way to refer to this + 64bit union */ + struct { + __le64 i_rdev; /* Device number */ + } dev1; + struct { /* Info for bitmap system + inodes */ + __le32 i_used; /* Bits (ie, clusters) used */ + __le32 i_total; /* Total bits (clusters) + available */ + } bitmap1; + struct { /* Info for journal system + inodes */ + __le32 ij_flags; /* Mounted, version, etc. */ + __le32 ij_recovery_generation; /* Incremented when the + journal is recovered + after an unclean + shutdown */ + } journal1; + } id1; /* Inode type dependent 1 */ +/*C0*/ union { + struct ocfs2_super_block i_super; + struct ocfs2_local_alloc i_lab; + struct ocfs2_chain_list i_chain; + struct ocfs2_extent_list i_list; + struct ocfs2_truncate_log i_dealloc; + struct ocfs2_inline_data i_data; + __u8 i_symlink[0]; + } id2; +/* Actual on-disk size is one block */ +}; + +/* + * On-disk directory entry structure for OCFS2 + * + * Packed as this structure could be accessed unaligned on 64-bit platforms + */ +struct ocfs2_dir_entry { +/*00*/ __le64 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; +/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ +/* Actual on-disk length specified by rec_len */ +} __attribute__ ((packed)); + +/* + * Per-block record for the unindexed directory btree. This is carefully + * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are + * mirrored. That way, the directory manipulation code needs a minimal amount + * of update. + * + * NOTE: Keep this structure aligned to a multiple of 4 bytes. + */ +struct ocfs2_dir_block_trailer { +/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */ + + __le16 db_compat_rec_len; /* Backwards compatible with + * ocfs2_dir_entry. */ + __u8 db_compat_name_len; /* Always zero. Was name_len */ + __u8 db_reserved0; + __le16 db_reserved1; + __le16 db_free_rec_len; /* Size of largest empty hole + * in this block. (unused) */ +/*10*/ __u8 db_signature[8]; /* Signature for verification */ + __le64 db_reserved2; + __le64 db_free_next; /* Next block in list (unused) */ +/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ + __le64 db_parent_dinode; /* dinode which owns me, in + blocks */ +/*30*/ struct ocfs2_block_check db_check; /* Error checking */ +/*40*/ +}; + + /* + * A directory entry in the indexed tree. We don't store the full name here, + * but instead provide a pointer to the full dirent in the unindexed tree. + * + * We also store name_len here so as to reduce the number of leaf blocks we + * need to search in case of collisions. + */ +struct ocfs2_dx_entry { + __le32 dx_major_hash; /* Used to find logical + * cluster in index */ + __le32 dx_minor_hash; /* Lower bits used to find + * block in cluster */ + __le64 dx_dirent_blk; /* Physical block in unindexed + * tree holding this dirent. */ +}; + +struct ocfs2_dx_entry_list { + __le32 de_reserved; + __le16 de_count; /* Maximum number of entries + * possible in de_entries */ + __le16 de_num_used; /* Current number of + * de_entries entries */ + struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries + * in a packed array of + * length de_num_used */ +}; + +#define OCFS2_DX_FLAG_INLINE 0x01 + +/* + * A directory indexing block. Each indexed directory has one of these, + * pointed to by ocfs2_dinode. + * + * This block stores an indexed btree root, and a set of free space + * start-of-list pointers. + */ +struct ocfs2_dx_root_block { + __u8 dr_signature[8]; /* Signature for verification */ + struct ocfs2_block_check dr_check; /* Error checking */ + __le16 dr_suballoc_slot; /* Slot suballocator this + * block belongs to. */ + __le16 dr_suballoc_bit; /* Bit offset in suballocator + * block group */ + __le32 dr_fs_generation; /* Must match super block */ + __le64 dr_blkno; /* Offset on disk, in blocks */ + __le64 dr_last_eb_blk; /* Pointer to last + * extent block */ + __le32 dr_clusters; /* Clusters allocated + * to the indexed tree. */ + __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ + __u8 dr_reserved0; + __le16 dr_reserved1; + __le64 dr_dir_blkno; /* Pointer to parent inode */ + __le32 dr_num_entries; /* Total number of + * names stored in + * this directory.*/ + __le32 dr_reserved2; + __le64 dr_free_blk; /* Pointer to head of free + * unindexed block list. */ + __le64 dr_suballoc_loc; /* Suballocator block group + this root belongs to. + Only valid if allocated + from a discontiguous + block group */ + __le64 dr_reserved3[14]; + union { + struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 + * bits for maximum space + * efficiency. */ + struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of + * entries. We grow out + * to extents if this + * gets too big. */ + }; +}; + +/* + * The header of a leaf block in the indexed tree. + */ +struct ocfs2_dx_leaf { + __u8 dl_signature[8];/* Signature for verification */ + struct ocfs2_block_check dl_check; /* Error checking */ + __le64 dl_blkno; /* Offset on disk, in blocks */ + __le32 dl_fs_generation;/* Must match super block */ + __le32 dl_reserved0; + __le64 dl_reserved1; + struct ocfs2_dx_entry_list dl_list; +}; + +/* + * Largest bitmap for a block (suballocator) group in bytes. This limit + * does not affect cluster groups (global allocator). Cluster group + * bitmaps run to the end of the block. + */ +#define OCFS2_MAX_BG_BITMAP_SIZE 256 + +/* + * On disk allocator group structure for OCFS2 + */ +struct ocfs2_group_desc +{ +/*00*/ __u8 bg_signature[8]; /* Signature for validation */ + __le16 bg_size; /* Size of included bitmap in + bytes. */ + __le16 bg_bits; /* Bits represented by this + group. */ + __le16 bg_free_bits_count; /* Free bits count */ + __le16 bg_chain; /* What chain I am in. */ +/*10*/ __le32 bg_generation; + __le32 bg_reserved1; + __le64 bg_next_group; /* Next group in my list, in + blocks */ +/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in + blocks */ + __le64 bg_blkno; /* Offset on disk, in blocks */ +/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ + __le64 bg_reserved2; +/*40*/ union { + __u8 bg_bitmap[0]; + struct { + /* + * Block groups may be discontiguous when + * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. + * The extents of a discontigous block group are + * stored in bg_list. It is a flat list. + * l_tree_depth must always be zero. A + * discontiguous group is signified by a non-zero + * bg_list->l_next_free_rec. Only block groups + * can be discontiguous; Cluster groups cannot. + * We've never made a block group with more than + * 2048 blocks (256 bytes of bg_bitmap). This + * codifies that limit so that we can fit bg_list. + * bg_size of a discontiguous block group will + * be 256 to match bg_bitmap_filler. + */ + __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE]; +/*140*/ struct ocfs2_extent_list bg_list; + }; + }; +/* Actual on-disk size is one block */ +}; + +struct ocfs2_refcount_rec { +/*00*/ __le64 r_cpos; /* Physical offset, in clusters */ + __le32 r_clusters; /* Clusters covered by this extent */ + __le32 r_refcount; /* Reference count of this extent */ +/*10*/ +}; +#define OCFS2_32BIT_POS_MASK (0xffffffffULL) + +#define OCFS2_REFCOUNT_LEAF_FL (0x00000001) +#define OCFS2_REFCOUNT_TREE_FL (0x00000002) + +struct ocfs2_refcount_list { +/*00*/ __le16 rl_count; /* Maximum number of entries possible + in rl_records */ + __le16 rl_used; /* Current number of used records */ + __le32 rl_reserved2; + __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ +/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */ +}; + + +struct ocfs2_refcount_block { +/*00*/ __u8 rf_signature[8]; /* Signature for verification */ + __le16 rf_suballoc_slot; /* Slot suballocator this block + belongs to */ + __le16 rf_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 rf_fs_generation; /* Must match superblock */ +/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ + __le64 rf_parent; /* Parent block, only valid if + OCFS2_REFCOUNT_LEAF_FL is set in + rf_flags */ +/*20*/ struct ocfs2_block_check rf_check; /* Error checking */ + __le64 rf_last_eb_blk; /* Pointer to last extent block */ +/*30*/ __le32 rf_count; /* Number of inodes sharing this + refcount tree */ + __le32 rf_flags; /* See the flags above */ + __le32 rf_clusters; /* clusters covered by refcount tree. */ + __le32 rf_cpos; /* cluster offset in refcount tree.*/ +/*40*/ __le32 rf_generation; /* generation number. all be the same + * for the same refcount tree. */ + __le32 rf_reserved0; + __le64 rf_suballoc_loc; /* Suballocator block group this + refcount block belongs to. Only + valid if allocated from a + discontiguous block group */ +/*50*/ __le64 rf_reserved1[6]; +/*80*/ union { + struct ocfs2_refcount_list rf_records; /* List of refcount + records */ + struct ocfs2_extent_list rf_list; /* Extent record list, + only valid if + OCFS2_REFCOUNT_TREE_FL + is set in rf_flags */ + }; +/* Actual on-disk size is one block */ +}; + +/* + * On disk extended attribute structure for OCFS2. + */ + +/* + * ocfs2_xattr_entry indicates one extend attribute. + * + * Note that it can be stored in inode, one block or one xattr bucket. + */ +struct ocfs2_xattr_entry { + __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ + __le16 xe_name_offset; /* byte offset from the 1st entry in the + local xattr storage(inode, xattr block or + xattr bucket). */ + __u8 xe_name_len; /* xattr name len, doesn't include prefix. */ + __u8 xe_type; /* the low 7 bits indicate the name prefix + * type and the highest bit indicates whether + * the EA is stored in the local storage. */ + __le64 xe_value_size; /* real xattr value length. */ +}; + +/* + * On disk structure for xattr header. + * + * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in + * the local xattr storage. + */ +struct ocfs2_xattr_header { + __le16 xh_count; /* contains the count of how + many records are in the + local xattr storage. */ + __le16 xh_free_start; /* current offset for storing + xattr. */ + __le16 xh_name_value_len; /* total length of name/value + length in this bucket. */ + __le16 xh_num_buckets; /* Number of xattr buckets + in this extent record, + only valid in the first + bucket. */ + struct ocfs2_block_check xh_check; /* Error checking + (Note, this is only + used for xattr + buckets. A block uses + xb_check and sets + this field to zero.) */ + struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ +}; + +/* + * On disk structure for xattr value root. + * + * When an xattr's value is large enough, it is stored in an external + * b-tree like file data. The xattr value root points to this structure. + */ +struct ocfs2_xattr_value_root { +/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ + __le32 xr_reserved0; + __le64 xr_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ +}; + +/* + * On disk structure for xattr tree root. + * + * It is used when there are too many extended attributes for one file. These + * attributes will be organized and stored in an indexed-btree. + */ +struct ocfs2_xattr_tree_root { +/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ + __le32 xt_reserved0; + __le64 xt_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ +}; + +#define OCFS2_XATTR_INDEXED 0x1 +#define OCFS2_HASH_SHIFT 5 +#define OCFS2_XATTR_ROUND 3 +#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ + ~(OCFS2_XATTR_ROUND)) + +#define OCFS2_XATTR_BUCKET_SIZE 4096 +#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ + / OCFS2_MIN_BLOCKSIZE) + +/* + * On disk structure for xattr block. + */ +struct ocfs2_xattr_block { +/*00*/ __u8 xb_signature[8]; /* Signature for verification */ + __le16 xb_suballoc_slot; /* Slot suballocator this + block belongs to. */ + __le16 xb_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 xb_fs_generation; /* Must match super block */ +/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ + struct ocfs2_block_check xb_check; /* Error checking */ +/*20*/ __le16 xb_flags; /* Indicates whether this block contains + real xattr or a xattr tree. */ + __le16 xb_reserved0; + __le32 xb_reserved1; + __le64 xb_suballoc_loc; /* Suballocator block group this + xattr block belongs to. Only + valid if allocated from a + discontiguous block group */ +/*30*/ union { + struct ocfs2_xattr_header xb_header; /* xattr header if this + block contains xattr */ + struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this + block cotains xattr + tree. */ + } xb_attrs; +}; + +#define OCFS2_XATTR_ENTRY_LOCAL 0x80 +#define OCFS2_XATTR_TYPE_MASK 0x7F +static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, + int local) +{ + if (local) + xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; + else + xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) +{ + xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; +} + +static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_TYPE_MASK; +} + +/* + * On disk structures for global quota file + */ + +/* Magic numbers and known versions for global quota files */ +#define OCFS2_GLOBAL_QMAGICS {\ + 0x0cf52470, /* USRQUOTA */ \ + 0x0cf52471 /* GRPQUOTA */ \ +} + +#define OCFS2_GLOBAL_QVERSIONS {\ + 0, \ + 0, \ +} + + +/* Each block of each quota file has a certain fixed number of bytes reserved + * for OCFS2 internal use at its end. OCFS2 can use it for things like + * checksums, etc. */ +#define OCFS2_QBLK_RESERVED_SPACE 8 + +/* Generic header of all quota files */ +struct ocfs2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* Quota format version */ +}; + +#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of global quota file (immediately follows the generic + * header) */ +struct ocfs2_global_disk_dqinfo { +/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ + __le32 dqi_igrace; /* Grace time for inode softlimit excess */ + __le32 dqi_syncms; /* Time after which we sync local changes to + * global quota file */ + __le32 dqi_blocks; /* Number of blocks in quota file */ +/*10*/ __le32 dqi_free_blk; /* First free block in quota file */ + __le32 dqi_free_entry; /* First block with free dquot entry in quota + * file */ +}; + +/* Structure with global user / group information. We reserve some space + * for future use. */ +struct ocfs2_global_disk_dqblk { +/*00*/ __le32 dqb_id; /* ID the structure belongs to */ + __le32 dqb_use_count; /* Number of nodes having reference to this structure */ + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ +/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ +/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space */ +/*30*/ __le64 dqb_curspace; /* current space occupied */ + __le64 dqb_btime; /* time limit for excessive disk use */ +/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ + __le64 dqb_pad1; +/*50*/ __le64 dqb_pad2; +}; + +/* + * On-disk structures for local quota file + */ + +/* Magic numbers and known versions for local quota files */ +#define OCFS2_LOCAL_QMAGICS {\ + 0x0cf524c0, /* USRQUOTA */ \ + 0x0cf524c1 /* GRPQUOTA */ \ +} + +#define OCFS2_LOCAL_QVERSIONS {\ + 0, \ + 0, \ +} + +/* Quota flags in dqinfo header */ +#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\ + * quota has been cleanly turned off) */ + +#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of local quota file (immediately follows the generic + * header) */ +struct ocfs2_local_disk_dqinfo { + __le32 dqi_flags; /* Flags for quota file */ + __le32 dqi_chunks; /* Number of chunks of quota structures + * with a bitmap */ + __le32 dqi_blocks; /* Number of blocks allocated for quota file */ +}; + +/* Header of one chunk of a quota file */ +struct ocfs2_local_disk_chunk { + __le32 dqc_free; /* Number of free entries in the bitmap */ + __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + * chunk of quota file */ +}; + +/* One entry in local quota file */ +struct ocfs2_local_disk_dqblk { +/*00*/ __le64 dqb_id; /* id this quota applies to */ + __le64 dqb_spacemod; /* Change in the amount of used space */ +/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ +}; + + +/* + * The quota trailer lives at the end of each quota block. + */ + +struct ocfs2_disk_dqtrailer { +/*00*/ struct ocfs2_block_check dq_check; /* Error checking */ +/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ +}; + +static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, + void *buf) +{ + char *ptr = buf; + ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; + + return (struct ocfs2_disk_dqtrailer *)ptr; +} + +#ifdef __KERNEL__ +static inline int ocfs2_fast_symlink_chars(struct super_block *sb) +{ + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, + struct ocfs2_dinode *di) +{ + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + xattrsize; + else + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + +static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_inode_with_xattr( + struct super_block *sb, + struct ocfs2_dinode *di) +{ + int size; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - + xattrsize; + else + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + +static inline int ocfs2_dx_entries_per_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + +static inline u16 ocfs2_local_alloc_size(struct super_block *sb) +{ + u16 size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(struct super_block *sb, + int suballocator, + u32 feature_incompat) +{ + int size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older + * code expects bg_size set to the maximum. Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} + +static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) +{ + u64 offset = OCFS2_BACKUP_SB_START; + + if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { + offset <<= (2 * index); + offset >>= sb->s_blocksize_bits; + return offset; + } + + return 0; + +} + +static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records.rl_recs); + + return size / sizeof(struct ocfs2_refcount_rec); +} + +static inline u32 +ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec) +{ + return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; +} +#else +static inline int ocfs2_fast_symlink_chars(int blocksize) +{ + return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_max_inline_data_with_xattr(int blocksize, + struct ocfs2_dinode *di) +{ + if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL)) + return blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + di->i_xattr_inline_size; + else + return blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + +static inline int ocfs2_extent_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline int ocfs2_extent_recs_per_eb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_gd(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_group_desc, bg_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_local_alloc_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(int blocksize, + int suballocator, + uint32_t feature_incompat) +{ + int size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older + * code expects bg_size set to the maximum. Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} + +static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) +{ + uint64_t offset = OCFS2_BACKUP_SB_START; + + if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { + offset <<= (2 * index); + offset /= blocksize; + return offset; + } + + return 0; +} + +static inline int ocfs2_xattr_recs_per_xb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} +#endif /* __KERNEL__ */ + + +static inline int ocfs2_system_inode_is_global(int type) +{ + return ((type >= 0) && + (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); +} + +static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, + int type, int slot) +{ + int chars; + + /* + * Global system inodes can only have one copy. Everything + * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode + * list has a copy per slot. + */ + if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) + chars = snprintf(buf, len, "%s", + ocfs2_system_inodes[type].si_name); + else + chars = snprintf(buf, len, + ocfs2_system_inodes[type].si_name, + slot); + + return chars; +} + +static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, + umode_t mode) +{ + de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) +{ + if ((offsetof(struct ocfs2_group_desc, bg_bitmap) + + le16_to_cpu(gd->bg_size)) != + offsetof(struct ocfs2_group_desc, bg_list)) + return 0; + /* + * Only valid to check l_next_free_rec if + * bg_bitmap + bg_size == bg_list. + */ + if (!gd->bg_list.l_next_free_rec) + return 0; + return 1; +} +#endif /* _OCFS2_FS_H */ + diff --git a/kernel/fs/ocfs2/ocfs2_ioctl.h b/kernel/fs/ocfs2/ocfs2_ioctl.h new file mode 100644 index 000000000..5b27ff1fa --- /dev/null +++ b/kernel/fs/ocfs2/ocfs2_ioctl.h @@ -0,0 +1,242 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_ioctl.h + * + * Defines OCFS2 ioctls. + * + * Copyright (C) 2010 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_IOCTL_H +#define OCFS2_IOCTL_H + +/* + * ioctl commands + */ +#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS +#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS +#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS + +/* + * Space reservation / allocation / free ioctls and argument structure + * are designed to be compatible with XFS. + * + * ALLOCSP* and FREESP* are not and will never be supported, but are + * included here for completeness. + */ +struct ocfs2_space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv) +#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) + +/* Used to pass group descriptor data when online resize is done */ +struct ocfs2_new_group_input { + __u64 group; /* Group descriptor's blkno. */ + __u32 clusters; /* Total number of clusters in this group */ + __u32 frees; /* Total free clusters in this group */ + __u16 chain; /* Chain for this group */ + __u16 reserved1; + __u32 reserved2; +}; + +#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int) +#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) +#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) + +/* Used to pass 2 file names to reflink. */ +struct reflink_arguments { + __u64 old_path; + __u64 new_path; + __u64 preserve; +}; +#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) + +/* Following definitions dedicated for ocfs2_info_request ioctls. */ +#define OCFS2_INFO_MAX_REQUEST (50) +#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2) + +/* Magic number of all requests */ +#define OCFS2_INFO_MAGIC (0x4F32494E) + +/* + * Always try to separate info request into small pieces to + * guarantee the backward&forward compatibility. + */ +struct ocfs2_info { + __u64 oi_requests; /* Array of __u64 pointers to requests */ + __u32 oi_count; /* Number of requests in info_requests */ + __u32 oi_pad; +}; + +struct ocfs2_info_request { +/*00*/ __u32 ir_magic; /* Magic number */ + __u32 ir_code; /* Info request code */ + __u32 ir_size; /* Size of request */ + __u32 ir_flags; /* Request flags */ +/*10*/ /* Request specific fields */ +}; + +struct ocfs2_info_clustersize { + struct ocfs2_info_request ic_req; + __u32 ic_clustersize; + __u32 ic_pad; +}; + +struct ocfs2_info_blocksize { + struct ocfs2_info_request ib_req; + __u32 ib_blocksize; + __u32 ib_pad; +}; + +struct ocfs2_info_maxslots { + struct ocfs2_info_request im_req; + __u32 im_max_slots; + __u32 im_pad; +}; + +struct ocfs2_info_label { + struct ocfs2_info_request il_req; + __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN]; +} __attribute__ ((packed)); + +struct ocfs2_info_uuid { + struct ocfs2_info_request iu_req; + __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1]; +} __attribute__ ((packed)); + +struct ocfs2_info_fs_features { + struct ocfs2_info_request if_req; + __u32 if_compat_features; + __u32 if_incompat_features; + __u32 if_ro_compat_features; + __u32 if_pad; +}; + +struct ocfs2_info_journal_size { + struct ocfs2_info_request ij_req; + __u64 ij_journal_size; +}; + +struct ocfs2_info_freeinode { + struct ocfs2_info_request ifi_req; + struct ocfs2_info_local_freeinode { + __u64 lfi_total; + __u64 lfi_free; + } ifi_stat[OCFS2_MAX_SLOTS]; + __u32 ifi_slotnum; /* out */ + __u32 ifi_pad; +}; + +#define OCFS2_INFO_MAX_HIST (32) + +struct ocfs2_info_freefrag { + struct ocfs2_info_request iff_req; + struct ocfs2_info_freefrag_stats { /* (out) */ + struct ocfs2_info_free_chunk_list { + __u32 fc_chunks[OCFS2_INFO_MAX_HIST]; + __u32 fc_clusters[OCFS2_INFO_MAX_HIST]; + } ffs_fc_hist; + __u32 ffs_clusters; + __u32 ffs_free_clusters; + __u32 ffs_free_chunks; + __u32 ffs_free_chunks_real; + __u32 ffs_min; /* Minimum free chunksize in clusters */ + __u32 ffs_max; + __u32 ffs_avg; + __u32 ffs_pad; + } iff_ffs; + __u32 iff_chunksize; /* chunksize in clusters(in) */ + __u32 iff_pad; +}; + +/* Codes for ocfs2_info_request */ +enum ocfs2_info_type { + OCFS2_INFO_CLUSTERSIZE = 1, + OCFS2_INFO_BLOCKSIZE, + OCFS2_INFO_MAXSLOTS, + OCFS2_INFO_LABEL, + OCFS2_INFO_UUID, + OCFS2_INFO_FS_FEATURES, + OCFS2_INFO_JOURNAL_SIZE, + OCFS2_INFO_FREEINODE, + OCFS2_INFO_FREEFRAG, + OCFS2_INFO_NUM_TYPES +}; + +/* Flags for struct ocfs2_info_request */ +/* Filled by the caller */ +#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not + required. This is a hint. + It is up to ocfs2 whether + the request can be fulfilled + without locking. */ +/* Filled by ocfs2 */ +#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood + this request and + filled in the answer */ + +#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during + request handling. */ + +#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) + +struct ocfs2_move_extents { +/* All values are in bytes */ + /* in */ + __u64 me_start; /* Virtual start in the file to move */ + __u64 me_len; /* Length of the extents to be moved */ + __u64 me_goal; /* Physical offset of the goal, + it's in block unit */ + __u64 me_threshold; /* Maximum distance from goal or threshold + for auto defragmentation */ + __u64 me_flags; /* Flags for the operation: + * - auto defragmentation. + * - refcount,xattr cases. + */ + /* out */ + __u64 me_moved_len; /* Moved/defraged length */ + __u64 me_new_offset; /* Resulting physical location */ + __u32 me_reserved[2]; /* Reserved for futhure */ +}; + +#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to + claim new clusters + as the goal place + for extents moving */ +#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent + moving, is to make + movement less likely + to fail, may make fs + even more fragmented */ +#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation + completely gets done. + */ + +#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents) + +#endif /* OCFS2_IOCTL_H */ diff --git a/kernel/fs/ocfs2/ocfs2_lockid.h b/kernel/fs/ocfs2/ocfs2_lockid.h new file mode 100644 index 000000000..d277aabf5 --- /dev/null +++ b/kernel/fs/ocfs2/ocfs2_lockid.h @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockid.h + * + * Defines OCFS2 lockid bits. + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_LOCKID_H +#define OCFS2_LOCKID_H + +/* lock ids are made up in the following manner: + * name[0] --> type + * name[1-6] --> 6 pad characters, reserved for now + * name[7-22] --> block number, expressed in hex as 16 chars + * name[23-30] --> i_generation, expressed in hex 8 chars + * name[31] --> '\0' */ +#define OCFS2_LOCK_ID_MAX_LEN 32 +#define OCFS2_LOCK_ID_PAD "000000" + +#define OCFS2_DENTRY_LOCK_INO_START 18 + +enum ocfs2_lock_type { + OCFS2_LOCK_TYPE_META = 0, + OCFS2_LOCK_TYPE_DATA, + OCFS2_LOCK_TYPE_SUPER, + OCFS2_LOCK_TYPE_RENAME, + OCFS2_LOCK_TYPE_RW, + OCFS2_LOCK_TYPE_DENTRY, + OCFS2_LOCK_TYPE_OPEN, + OCFS2_LOCK_TYPE_FLOCK, + OCFS2_LOCK_TYPE_QINFO, + OCFS2_LOCK_TYPE_NFS_SYNC, + OCFS2_LOCK_TYPE_ORPHAN_SCAN, + OCFS2_LOCK_TYPE_REFCOUNT, + OCFS2_NUM_LOCK_TYPES +}; + +static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) +{ + char c; + switch (type) { + case OCFS2_LOCK_TYPE_META: + c = 'M'; + break; + case OCFS2_LOCK_TYPE_DATA: + c = 'D'; + break; + case OCFS2_LOCK_TYPE_SUPER: + c = 'S'; + break; + case OCFS2_LOCK_TYPE_RENAME: + c = 'R'; + break; + case OCFS2_LOCK_TYPE_RW: + c = 'W'; + break; + case OCFS2_LOCK_TYPE_DENTRY: + c = 'N'; + break; + case OCFS2_LOCK_TYPE_OPEN: + c = 'O'; + break; + case OCFS2_LOCK_TYPE_FLOCK: + c = 'F'; + break; + case OCFS2_LOCK_TYPE_QINFO: + c = 'Q'; + break; + case OCFS2_LOCK_TYPE_NFS_SYNC: + c = 'Y'; + break; + case OCFS2_LOCK_TYPE_ORPHAN_SCAN: + c = 'P'; + break; + case OCFS2_LOCK_TYPE_REFCOUNT: + c = 'T'; + break; + default: + c = '\0'; + } + + return c; +} + +static char *ocfs2_lock_type_strings[] = { + [OCFS2_LOCK_TYPE_META] = "Meta", + [OCFS2_LOCK_TYPE_DATA] = "Data", + [OCFS2_LOCK_TYPE_SUPER] = "Super", + [OCFS2_LOCK_TYPE_RENAME] = "Rename", + /* Need to differntiate from [R]ename.. serializing writes is the + * important job it does, anyway. */ + [OCFS2_LOCK_TYPE_RW] = "Write/Read", + [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", + [OCFS2_LOCK_TYPE_OPEN] = "Open", + [OCFS2_LOCK_TYPE_FLOCK] = "Flock", + [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", + [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", +}; + +static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) +{ +#ifdef __KERNEL__ + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); +#endif + return ocfs2_lock_type_strings[type]; +} + +#endif /* OCFS2_LOCKID_H */ diff --git a/kernel/fs/ocfs2/ocfs2_lockingver.h b/kernel/fs/ocfs2/ocfs2_lockingver.h new file mode 100644 index 000000000..2e45c8d2e --- /dev/null +++ b/kernel/fs/ocfs2/ocfs2_lockingver.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockingver.h + * + * Defines OCFS2 Locking version values. + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_LOCKINGVER_H +#define OCFS2_LOCKINGVER_H + +/* + * The protocol version for ocfs2 cluster locking. See dlmglue.c for + * more details. + * + * 1.0 - Initial locking version from ocfs2 1.4. + */ +#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 +#define OCFS2_LOCKING_PROTOCOL_MINOR 0 + +#endif /* OCFS2_LOCKINGVER_H */ diff --git a/kernel/fs/ocfs2/ocfs2_trace.h b/kernel/fs/ocfs2/ocfs2_trace.h new file mode 100644 index 000000000..6cb019b7c --- /dev/null +++ b/kernel/fs/ocfs2/ocfs2_trace.h @@ -0,0 +1,2768 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ocfs2 + +#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OCFS2_H + +#include + +DECLARE_EVENT_CLASS(ocfs2__int, + TP_PROTO(int num), + TP_ARGS(num), + TP_STRUCT__entry( + __field(int, num) + ), + TP_fast_assign( + __entry->num = num; + ), + TP_printk("%d", __entry->num) +); + +#define DEFINE_OCFS2_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__int, name, \ + TP_PROTO(int num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__uint, + TP_PROTO(unsigned int num), + TP_ARGS(num), + TP_STRUCT__entry( + __field( unsigned int, num ) + ), + TP_fast_assign( + __entry->num = num; + ), + TP_printk("%u", __entry->num) +); + +#define DEFINE_OCFS2_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint, name, \ + TP_PROTO(unsigned int num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__ull, + TP_PROTO(unsigned long long blkno), + TP_ARGS(blkno), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->blkno = blkno; + ), + TP_printk("%llu", __entry->blkno) +); + +#define DEFINE_OCFS2_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull, name, \ + TP_PROTO(unsigned long long num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__pointer, + TP_PROTO(void *pointer), + TP_ARGS(pointer), + TP_STRUCT__entry( + __field(void *, pointer) + ), + TP_fast_assign( + __entry->pointer = pointer; + ), + TP_printk("%p", __entry->pointer) +); + +#define DEFINE_OCFS2_POINTER_EVENT(name) \ +DEFINE_EVENT(ocfs2__pointer, name, \ + TP_PROTO(void *pointer), \ + TP_ARGS(pointer)) + +DECLARE_EVENT_CLASS(ocfs2__string, + TP_PROTO(const char *name), + TP_ARGS(name), + TP_STRUCT__entry( + __string(name,name) + ), + TP_fast_assign( + __assign_str(name, name); + ), + TP_printk("%s", __get_str(name)) +); + +#define DEFINE_OCFS2_STRING_EVENT(name) \ +DEFINE_EVENT(ocfs2__string, name, \ + TP_PROTO(const char *name), \ + TP_ARGS(name)) + +DECLARE_EVENT_CLASS(ocfs2__int_int, + TP_PROTO(int value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(int, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%d %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_INT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__int_int, name, \ + TP_PROTO(int val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__uint_int, + TP_PROTO(unsigned int value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned int, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%u %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_UINT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_int, name, \ + TP_PROTO(unsigned int val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__uint_uint, + TP_PROTO(unsigned int value1, unsigned int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned int, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%u %u", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_uint, name, \ + TP_PROTO(unsigned int val1, unsigned int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint, + TP_PROTO(unsigned long long value1, unsigned int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %u", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint, name, \ + TP_PROTO(unsigned long long val1, unsigned int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_int, + TP_PROTO(unsigned long long value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_int, name, \ + TP_PROTO(unsigned long long val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull, + TP_PROTO(unsigned long long value1, unsigned long long value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %llu", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull, name, \ + TP_PROTO(unsigned long long val1, unsigned long long val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint, + TP_PROTO(unsigned long long value1, + unsigned long long value2, unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %llu %u", + __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_uint, name, \ + TP_PROTO(unsigned long long val1, \ + unsigned long long val2, unsigned int val3), \ + TP_ARGS(val1, val2, val3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint, + TP_PROTO(unsigned long long value1, + unsigned int value2, unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned int, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %u %u", __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint_uint, name, \ + TP_PROTO(unsigned long long val1, \ + unsigned int val2, unsigned int val3), \ + TP_ARGS(val1, val2, val3)) + +DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint, + TP_PROTO(unsigned int value1, unsigned int value2, + unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field( unsigned int, value1 ) + __field( unsigned int, value2 ) + __field( unsigned int, value3 ) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_uint_uint, name, \ + TP_PROTO(unsigned int value1, unsigned int value2, \ + unsigned int value3), \ + TP_ARGS(value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull, + TP_PROTO(unsigned long long value1, + unsigned long long value2, unsigned long long value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned long long, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %llu %llu", + __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_ull, name, \ + TP_PROTO(unsigned long long value1, unsigned long long value2, \ + unsigned long long value3), \ + TP_ARGS(value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int, + TP_PROTO(unsigned long long ull, int value1, int value2, int value3), + TP_ARGS(ull, value1, value2, value3), + TP_STRUCT__entry( + __field( unsigned long long, ull ) + __field( int, value1 ) + __field( int, value2 ) + __field( int, value3 ) + ), + TP_fast_assign( + __entry->ull = ull; + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %d %d %d", + __entry->ull, __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_int_int_int, name, \ + TP_PROTO(unsigned long long ull, int value1, \ + int value2, int value3), \ + TP_ARGS(ull, value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint, + TP_PROTO(unsigned long long ull, unsigned int value1, + unsigned int value2, unsigned int value3), + TP_ARGS(ull, value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, ull) + __field(unsigned int, value1) + __field(unsigned int, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->ull = ull; + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %u %u %u", + __entry->ull, __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \ + TP_PROTO(unsigned long long ull, unsigned int value1, \ + unsigned int value2, unsigned int value3), \ + TP_ARGS(ull, value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint, + TP_PROTO(unsigned long long value1, unsigned long long value2, + unsigned int value3, unsigned int value4), + TP_ARGS(value1, value2, value3, value4), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned int, value3) + __field(unsigned int, value4) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + __entry->value4 = value4; + ), + TP_printk("%llu %llu %u %u", + __entry->value1, __entry->value2, + __entry->value3, __entry->value4) +); + +#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \ + TP_PROTO(unsigned long long ull, unsigned long long ull1, \ + unsigned int value2, unsigned int value3), \ + TP_ARGS(ull, ull1, value2, value3)) + +/* Trace events for fs/ocfs2/alloc.c. */ +DECLARE_EVENT_CLASS(ocfs2__btree_ops, + TP_PROTO(unsigned long long owner,\ + unsigned int value1, unsigned int value2), + TP_ARGS(owner, value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %u %u", + __entry->owner, __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_BTREE_EVENT(name) \ +DEFINE_EVENT(ocfs2__btree_ops, name, \ + TP_PROTO(unsigned long long owner, \ + unsigned int value1, unsigned int value2), \ + TP_ARGS(owner, value1, value2)) + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree); + +DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents); + +DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert); + +TRACE_EVENT(ocfs2_grow_tree, + TP_PROTO(unsigned long long owner, int depth), + TP_ARGS(owner, depth), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(int, depth) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->depth = depth; + ), + TP_printk("%llu %d", __entry->owner, __entry->depth) +); + +TRACE_EVENT(ocfs2_rotate_subtree, + TP_PROTO(int subtree_root, unsigned long long blkno, + int depth), + TP_ARGS(subtree_root, blkno, depth), + TP_STRUCT__entry( + __field(int, subtree_root) + __field(unsigned long long, blkno) + __field(int, depth) + ), + TP_fast_assign( + __entry->subtree_root = subtree_root; + __entry->blkno = blkno; + __entry->depth = depth; + ), + TP_printk("%d %llu %d", __entry->subtree_root, + __entry->blkno, __entry->depth) +); + +TRACE_EVENT(ocfs2_insert_extent, + TP_PROTO(unsigned int ins_appending, unsigned int ins_contig, + int ins_contig_index, int free_records, int ins_tree_depth), + TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records, + ins_tree_depth), + TP_STRUCT__entry( + __field(unsigned int, ins_appending) + __field(unsigned int, ins_contig) + __field(int, ins_contig_index) + __field(int, free_records) + __field(int, ins_tree_depth) + ), + TP_fast_assign( + __entry->ins_appending = ins_appending; + __entry->ins_contig = ins_contig; + __entry->ins_contig_index = ins_contig_index; + __entry->free_records = free_records; + __entry->ins_tree_depth = ins_tree_depth; + ), + TP_printk("%u %u %d %d %d", + __entry->ins_appending, __entry->ins_contig, + __entry->ins_contig_index, __entry->free_records, + __entry->ins_tree_depth) +); + +TRACE_EVENT(ocfs2_split_extent, + TP_PROTO(int split_index, unsigned int c_contig_type, + unsigned int c_has_empty_extent, + unsigned int c_split_covers_rec), + TP_ARGS(split_index, c_contig_type, + c_has_empty_extent, c_split_covers_rec), + TP_STRUCT__entry( + __field(int, split_index) + __field(unsigned int, c_contig_type) + __field(unsigned int, c_has_empty_extent) + __field(unsigned int, c_split_covers_rec) + ), + TP_fast_assign( + __entry->split_index = split_index; + __entry->c_contig_type = c_contig_type; + __entry->c_has_empty_extent = c_has_empty_extent; + __entry->c_split_covers_rec = c_split_covers_rec; + ), + TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type, + __entry->c_has_empty_extent, __entry->c_split_covers_rec) +); + +TRACE_EVENT(ocfs2_remove_extent, + TP_PROTO(unsigned long long owner, unsigned int cpos, + unsigned int len, int index, + unsigned int e_cpos, unsigned int clusters), + TP_ARGS(owner, cpos, len, index, e_cpos, clusters), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(int, index) + __field(unsigned int, e_cpos) + __field(unsigned int, clusters) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->index = index; + __entry->e_cpos = e_cpos; + __entry->clusters = clusters; + ), + TP_printk("%llu %u %u %d %u %u", + __entry->owner, __entry->cpos, __entry->len, __entry->index, + __entry->e_cpos, __entry->clusters) +); + +TRACE_EVENT(ocfs2_commit_truncate, + TP_PROTO(unsigned long long ino, unsigned int new_cpos, + unsigned int clusters, unsigned int depth), + TP_ARGS(ino, new_cpos, clusters, depth), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, new_cpos) + __field(unsigned int, clusters) + __field(unsigned int, depth) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->new_cpos = new_cpos; + __entry->clusters = clusters; + __entry->depth = depth; + ), + TP_printk("%llu %u %u %u", + __entry->ino, __entry->new_cpos, + __entry->clusters, __entry->depth) +); + +TRACE_EVENT(ocfs2_validate_extent_block, + TP_PROTO(unsigned long long blkno), + TP_ARGS(blkno), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->blkno = blkno; + ), + TP_printk("%llu ", __entry->blkno) +); + +TRACE_EVENT(ocfs2_rotate_leaf, + TP_PROTO(unsigned int insert_cpos, int insert_index, + int has_empty, int next_free, + unsigned int l_count), + TP_ARGS(insert_cpos, insert_index, has_empty, + next_free, l_count), + TP_STRUCT__entry( + __field(unsigned int, insert_cpos) + __field(int, insert_index) + __field(int, has_empty) + __field(int, next_free) + __field(unsigned int, l_count) + ), + TP_fast_assign( + __entry->insert_cpos = insert_cpos; + __entry->insert_index = insert_index; + __entry->has_empty = has_empty; + __entry->next_free = next_free; + __entry->l_count = l_count; + ), + TP_printk("%u %d %d %d %u", __entry->insert_cpos, + __entry->insert_index, __entry->has_empty, + __entry->next_free, __entry->l_count) +); + +TRACE_EVENT(ocfs2_add_clusters_in_btree_ret, + TP_PROTO(int status, int reason, int err), + TP_ARGS(status, reason, err), + TP_STRUCT__entry( + __field(int, status) + __field(int, reason) + __field(int, err) + ), + TP_fast_assign( + __entry->status = status; + __entry->reason = reason; + __entry->err = err; + ), + TP_printk("%d %d %d", __entry->status, + __entry->reason, __entry->err) +); + +TRACE_EVENT(ocfs2_mark_extent_written, + TP_PROTO(unsigned long long owner, unsigned int cpos, + unsigned int len, unsigned int phys), + TP_ARGS(owner, cpos, len, phys), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(unsigned int, phys) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->phys = phys; + ), + TP_printk("%llu %u %u %u", + __entry->owner, __entry->cpos, + __entry->len, __entry->phys) +); + +DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops, + TP_PROTO(unsigned long long blkno, int index, + unsigned int start, unsigned int num), + TP_ARGS(blkno, index, start, num), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __field(int, index) + __field(unsigned int, start) + __field(unsigned int, num) + ), + TP_fast_assign( + __entry->blkno = blkno; + __entry->index = index; + __entry->start = start; + __entry->num = num; + ), + TP_printk("%llu %d %u %u", + __entry->blkno, __entry->index, + __entry->start, __entry->num) +); + +#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \ +DEFINE_EVENT(ocfs2__truncate_log_ops, name, \ + TP_PROTO(unsigned long long blkno, int index, \ + unsigned int start, unsigned int num), \ + TP_ARGS(blkno, index, start, num)) + +DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append); + +DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log); + +DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery); + +DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs); + +TRACE_EVENT(ocfs2_cache_block_dealloc, + TP_PROTO(int type, int slot, unsigned long long suballoc, + unsigned long long blkno, unsigned int bit), + TP_ARGS(type, slot, suballoc, blkno, bit), + TP_STRUCT__entry( + __field(int, type) + __field(int, slot) + __field(unsigned long long, suballoc) + __field(unsigned long long, blkno) + __field(unsigned int, bit) + ), + TP_fast_assign( + __entry->type = type; + __entry->slot = slot; + __entry->suballoc = suballoc; + __entry->blkno = blkno; + __entry->bit = bit; + ), + TP_printk("%d %d %llu %llu %u", + __entry->type, __entry->slot, __entry->suballoc, + __entry->blkno, __entry->bit) +); + +TRACE_EVENT(ocfs2_trim_extent, + TP_PROTO(struct super_block *sb, unsigned long long blk, + unsigned long long count), + TP_ARGS(sb, blk, count), + TP_STRUCT__entry( + __field(int, dev_major) + __field(int, dev_minor) + __field(unsigned long long, blk) + __field(__u64, count) + ), + TP_fast_assign( + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); + __entry->blk = blk; + __entry->count = count; + ), + TP_printk("%d %d %llu %llu", + __entry->dev_major, __entry->dev_minor, + __entry->blk, __entry->count) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); + +/* End of trace events for fs/ocfs2/alloc.c. */ + +/* Trace events for fs/ocfs2/localalloc.c. */ + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local); + +DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc); + +DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main); + +TRACE_EVENT(ocfs2_sync_local_to_main_free, + TP_PROTO(int count, int bit, unsigned long long start_blk, + unsigned long long blkno), + TP_ARGS(count, bit, start_blk, blkno), + TP_STRUCT__entry( + __field(int, count) + __field(int, bit) + __field(unsigned long long, start_blk) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->count = count; + __entry->bit = bit; + __entry->start_blk = start_blk; + __entry->blkno = blkno; + ), + TP_printk("%d %d %llu %llu", + __entry->count, __entry->bit, __entry->start_blk, + __entry->blkno) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result); + +/* End of trace events for fs/ocfs2/localalloc.c. */ + +/* Trace events for fs/ocfs2/resize.c. */ + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add); + +/* End of trace events for fs/ocfs2/resize.c. */ + +/* Trace events for fs/ocfs2/suballoc.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits); + +TRACE_EVENT(ocfs2_relink_block_group, + TP_PROTO(unsigned long long i_blkno, unsigned int chain, + unsigned long long bg_blkno, + unsigned long long prev_blkno), + TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno), + TP_STRUCT__entry( + __field(unsigned long long, i_blkno) + __field(unsigned int, chain) + __field(unsigned long long, bg_blkno) + __field(unsigned long long, prev_blkno) + ), + TP_fast_assign( + __entry->i_blkno = i_blkno; + __entry->chain = chain; + __entry->bg_blkno = bg_blkno; + __entry->prev_blkno = prev_blkno; + ), + TP_printk("%llu %u %llu %llu", + __entry->i_blkno, __entry->chain, __entry->bg_blkno, + __entry->prev_blkno) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits); + +TRACE_EVENT(ocfs2_free_suballoc_bits, + TP_PROTO(unsigned long long inode, unsigned long long group, + unsigned int start_bit, unsigned int count), + TP_ARGS(inode, group, start_bit, count), + TP_STRUCT__entry( + __field(unsigned long long, inode) + __field(unsigned long long, group) + __field(unsigned int, start_bit) + __field(unsigned int, count) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->group = group; + __entry->start_bit = start_bit; + __entry->count = count; + ), + TP_printk("%llu %llu %u %u", __entry->inode, __entry->group, + __entry->start_bit, __entry->count) +); + +TRACE_EVENT(ocfs2_free_clusters, + TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk, + unsigned int start_bit, unsigned int count), + TP_ARGS(bg_blkno, start_blk, start_bit, count), + TP_STRUCT__entry( + __field(unsigned long long, bg_blkno) + __field(unsigned long long, start_blk) + __field(unsigned int, start_bit) + __field(unsigned int, count) + ), + TP_fast_assign( + __entry->bg_blkno = bg_blkno; + __entry->start_blk = start_blk; + __entry->start_bit = start_bit; + __entry->count = count; + ), + TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk, + __entry->start_bit, __entry->count) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit); + +/* End of trace events for fs/ocfs2/suballoc.c. */ + +/* Trace events for fs/ocfs2/refcounttree.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block); + +DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops, + TP_PROTO(unsigned long long blkno, int index, + unsigned long long cpos, + unsigned int clusters, unsigned int refcount), + TP_ARGS(blkno, index, cpos, clusters, refcount), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __field(int, index) + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned int, refcount) + ), + TP_fast_assign( + __entry->blkno = blkno; + __entry->index = index; + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->refcount = refcount; + ), + TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index, + __entry->cpos, __entry->clusters, __entry->refcount) +); + +#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) \ +DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \ + TP_PROTO(unsigned long long blkno, int index, \ + unsigned long long cpos, \ + unsigned int count, unsigned int refcount), \ + TP_ARGS(blkno, index, cpos, count, refcount)) + +DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec); + +TRACE_EVENT(ocfs2_split_refcount_rec, + TP_PROTO(unsigned long long cpos, + unsigned int clusters, unsigned int refcount, + unsigned long long split_cpos, + unsigned int split_clusters, unsigned int split_refcount), + TP_ARGS(cpos, clusters, refcount, + split_cpos, split_clusters, split_refcount), + TP_STRUCT__entry( + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned int, refcount) + __field(unsigned long long, split_cpos) + __field(unsigned int, split_clusters) + __field(unsigned int, split_refcount) + ), + TP_fast_assign( + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->refcount = refcount; + __entry->split_cpos = split_cpos; + __entry->split_clusters = split_clusters; + __entry->split_refcount = split_refcount; + ), + TP_printk("%llu %u %u %llu %u %u", + __entry->cpos, __entry->clusters, __entry->refcount, + __entry->split_cpos, __entry->split_clusters, + __entry->split_refcount) +); + +DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec); + +TRACE_EVENT(ocfs2_decrease_refcount, + TP_PROTO(unsigned long long owner, + unsigned long long cpos, + unsigned int len, int delete), + TP_ARGS(owner, cpos, len, delete), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned long long, cpos) + __field(unsigned int, len) + __field(int, delete) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->delete = delete; + ), + TP_printk("%llu %llu %u %d", + __entry->owner, __entry->cpos, __entry->len, __entry->delete) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits); + +TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate, + TP_PROTO(int recs_add, unsigned long long cpos, + unsigned int clusters, unsigned long long r_cpos, + unsigned int r_clusters, unsigned int refcount, int index), + TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index), + TP_STRUCT__entry( + __field(int, recs_add) + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned long long, r_cpos) + __field(unsigned int, r_clusters) + __field(unsigned int, refcount) + __field(int, index) + ), + TP_fast_assign( + __entry->recs_add = recs_add; + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->r_cpos = r_cpos; + __entry->r_clusters = r_clusters; + __entry->refcount = refcount; + __entry->index = index; + ), + TP_printk("%d %llu %u %llu %u %u %d", + __entry->recs_add, __entry->cpos, __entry->clusters, + __entry->r_cpos, __entry->r_clusters, + __entry->refcount, __entry->index) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd); + +TRACE_EVENT(ocfs2_clear_ext_refcount, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int len, unsigned int p_cluster, + unsigned int ext_flags), + TP_ARGS(ino, cpos, len, p_cluster, ext_flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(unsigned int, p_cluster) + __field(unsigned int, ext_flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->len = len; + __entry->p_cluster = p_cluster; + __entry->ext_flags = ext_flags; + ), + TP_printk("%llu %u %u %u %u", + __entry->ino, __entry->cpos, __entry->len, + __entry->p_cluster, __entry->ext_flags) +); + +TRACE_EVENT(ocfs2_replace_clusters, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int old, unsigned int new, unsigned int len, + unsigned int ext_flags), + TP_ARGS(ino, cpos, old, new, len, ext_flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, old) + __field(unsigned int, new) + __field(unsigned int, len) + __field(unsigned int, ext_flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->old = old; + __entry->new = new; + __entry->len = len; + __entry->ext_flags = ext_flags; + ), + TP_printk("%llu %u %u %u %u %u", + __entry->ino, __entry->cpos, __entry->old, __entry->new, + __entry->len, __entry->ext_flags) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable); + +TRACE_EVENT(ocfs2_refcount_cow_hunk, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int write_len, unsigned int max_cpos, + unsigned int cow_start, unsigned int cow_len), + TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, write_len) + __field(unsigned int, max_cpos) + __field(unsigned int, cow_start) + __field(unsigned int, cow_len) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->write_len = write_len; + __entry->max_cpos = max_cpos; + __entry->cow_start = cow_start; + __entry->cow_len = cow_len; + ), + TP_printk("%llu %u %u %u %u %u", + __entry->ino, __entry->cpos, __entry->write_len, + __entry->max_cpos, __entry->cow_start, __entry->cow_len) +); + +/* End of trace events for fs/ocfs2/refcounttree.c. */ + +/* Trace events for fs/ocfs2/aops.c. */ + +DECLARE_EVENT_CLASS(ocfs2__get_block, + TP_PROTO(unsigned long long ino, unsigned long long iblock, + void *bh_result, int create), + TP_ARGS(ino, iblock, bh_result, create), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, iblock) + __field(void *, bh_result) + __field(int, create) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->iblock = iblock; + __entry->bh_result = bh_result; + __entry->create = create; + ), + TP_printk("%llu %llu %p %d", + __entry->ino, __entry->iblock, + __entry->bh_result, __entry->create) +); + +#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) \ +DEFINE_EVENT(ocfs2__get_block, name, \ + TP_PROTO(unsigned long long ino, unsigned long long iblock, \ + void *bh_result, int create), \ + TP_ARGS(ino, iblock, bh_result, create)) + +DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block); + +DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap); + +TRACE_EVENT(ocfs2_try_to_write_inline_data, + TP_PROTO(unsigned long long ino, unsigned int len, + unsigned long long pos, unsigned int flags), + TP_ARGS(ino, len, pos, flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, len) + __field(unsigned long long, pos) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->len = len; + __entry->pos = pos; + __entry->flags = flags; + ), + TP_printk("%llu %u %llu 0x%x", + __entry->ino, __entry->len, __entry->pos, __entry->flags) +); + +TRACE_EVENT(ocfs2_write_begin_nolock, + TP_PROTO(unsigned long long ino, + long long i_size, unsigned int i_clusters, + unsigned long long pos, unsigned int len, + unsigned int flags, void *page, + unsigned int clusters, unsigned int extents_to_split), + TP_ARGS(ino, i_size, i_clusters, pos, len, flags, + page, clusters, extents_to_split), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(long long, i_size) + __field(unsigned int, i_clusters) + __field(unsigned long long, pos) + __field(unsigned int, len) + __field(unsigned int, flags) + __field(void *, page) + __field(unsigned int, clusters) + __field(unsigned int, extents_to_split) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->i_size = i_size; + __entry->i_clusters = i_clusters; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + __entry->page = page; + __entry->clusters = clusters; + __entry->extents_to_split = extents_to_split; + ), + TP_printk("%llu %lld %u %llu %u %u %p %u %u", + __entry->ino, __entry->i_size, __entry->i_clusters, + __entry->pos, __entry->len, + __entry->flags, __entry->page, __entry->clusters, + __entry->extents_to_split) +); + +TRACE_EVENT(ocfs2_write_end_inline, + TP_PROTO(unsigned long long ino, + unsigned long long pos, unsigned int copied, + unsigned int id_count, unsigned int features), + TP_ARGS(ino, pos, copied, id_count, features), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, pos) + __field(unsigned int, copied) + __field(unsigned int, id_count) + __field(unsigned int, features) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->pos = pos; + __entry->copied = copied; + __entry->id_count = id_count; + __entry->features = features; + ), + TP_printk("%llu %llu %u %u %u", + __entry->ino, __entry->pos, __entry->copied, + __entry->id_count, __entry->features) +); + +/* End of trace events for fs/ocfs2/aops.c. */ + +/* Trace events for fs/ocfs2/mmap.c. */ + +TRACE_EVENT(ocfs2_fault, + TP_PROTO(unsigned long long ino, + void *area, void *page, unsigned long pgoff), + TP_ARGS(ino, area, page, pgoff), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(void *, area) + __field(void *, page) + __field(unsigned long, pgoff) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->area = area; + __entry->page = page; + __entry->pgoff = pgoff; + ), + TP_printk("%llu %p %p %lu", + __entry->ino, __entry->area, __entry->page, __entry->pgoff) +); + +/* End of trace events for fs/ocfs2/mmap.c. */ + +/* Trace events for fs/ocfs2/file.c. */ + +DECLARE_EVENT_CLASS(ocfs2__file_ops, + TP_PROTO(void *inode, void *file, void *dentry, + unsigned long long ino, + unsigned int d_len, const unsigned char *d_name, + unsigned long long para), + TP_ARGS(inode, file, dentry, ino, d_len, d_name, para), + TP_STRUCT__entry( + __field(void *, inode) + __field(void *, file) + __field(void *, dentry) + __field(unsigned long long, ino) + __field(unsigned int, d_len) + __string(d_name, d_name) + __field(unsigned long long, para) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->file = file; + __entry->dentry = dentry; + __entry->ino = ino; + __entry->d_len = d_len; + __assign_str(d_name, d_name); + __entry->para = para; + ), + TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, + __entry->dentry, __entry->ino, __entry->para, + __entry->d_len, __get_str(d_name)) +); + +#define DEFINE_OCFS2_FILE_OPS(name) \ +DEFINE_EVENT(ocfs2__file_ops, name, \ +TP_PROTO(void *inode, void *file, void *dentry, \ + unsigned long long ino, \ + unsigned int d_len, const unsigned char *d_name, \ + unsigned long long mode), \ + TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode)) + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_open); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_release); + +DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error); + +TRACE_EVENT(ocfs2_extend_allocation, + TP_PROTO(unsigned long long ip_blkno, unsigned long long size, + unsigned int clusters, unsigned int clusters_to_add, + int why, int restart_func), + TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func), + TP_STRUCT__entry( + __field(unsigned long long, ip_blkno) + __field(unsigned long long, size) + __field(unsigned int, clusters) + __field(unsigned int, clusters_to_add) + __field(int, why) + __field(int, restart_func) + ), + TP_fast_assign( + __entry->ip_blkno = ip_blkno; + __entry->size = size; + __entry->clusters = clusters; + __entry->clusters_to_add = clusters_to_add; + __entry->why = why; + __entry->restart_func = restart_func; + ), + TP_printk("%llu %llu %u %u %d %d", + __entry->ip_blkno, __entry->size, __entry->clusters, + __entry->clusters_to_add, __entry->why, __entry->restart_func) +); + +TRACE_EVENT(ocfs2_extend_allocation_end, + TP_PROTO(unsigned long long ino, + unsigned int di_clusters, unsigned long long di_size, + unsigned int ip_clusters, unsigned long long i_size), + TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, di_clusters) + __field(unsigned long long, di_size) + __field(unsigned int, ip_clusters) + __field(unsigned long long, i_size) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->di_clusters = di_clusters; + __entry->di_size = di_size; + __entry->ip_clusters = ip_clusters; + __entry->i_size = i_size; + ), + TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters, + __entry->di_size, __entry->ip_clusters, __entry->i_size) +); + +TRACE_EVENT(ocfs2_write_zero_page, + TP_PROTO(unsigned long long ino, + unsigned long long abs_from, unsigned long long abs_to, + unsigned long index, unsigned int zero_from, + unsigned int zero_to), + TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, abs_from) + __field(unsigned long long, abs_to) + __field(unsigned long, index) + __field(unsigned int, zero_from) + __field(unsigned int, zero_to) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->abs_from = abs_from; + __entry->abs_to = abs_to; + __entry->index = index; + __entry->zero_from = zero_from; + __entry->zero_to = zero_to; + ), + TP_printk("%llu %llu %llu %lu %u %u", __entry->ino, + __entry->abs_from, __entry->abs_to, + __entry->index, __entry->zero_from, __entry->zero_to) +); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend); + +TRACE_EVENT(ocfs2_setattr, + TP_PROTO(void *inode, void *dentry, + unsigned long long ino, + unsigned int d_len, const unsigned char *d_name, + unsigned int ia_valid, unsigned int ia_mode, + unsigned int ia_uid, unsigned int ia_gid), + TP_ARGS(inode, dentry, ino, d_len, d_name, + ia_valid, ia_mode, ia_uid, ia_gid), + TP_STRUCT__entry( + __field(void *, inode) + __field(void *, dentry) + __field(unsigned long long, ino) + __field(unsigned int, d_len) + __string(d_name, d_name) + __field(unsigned int, ia_valid) + __field(unsigned int, ia_mode) + __field(unsigned int, ia_uid) + __field(unsigned int, ia_gid) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->dentry = dentry; + __entry->ino = ino; + __entry->d_len = d_len; + __assign_str(d_name, d_name); + __entry->ia_valid = ia_valid; + __entry->ia_mode = ia_mode; + __entry->ia_uid = ia_uid; + __entry->ia_gid = ia_gid; + ), + TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode, + __entry->dentry, __entry->ino, __entry->d_len, + __get_str(d_name), __entry->ia_valid, __entry->ia_mode, + __entry->ia_uid, __entry->ia_gid) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); + +TRACE_EVENT(ocfs2_prepare_inode_for_write, + TP_PROTO(unsigned long long ino, unsigned long long saved_pos, + int appending, unsigned long count, + int *direct_io, int *has_refcount), + TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, saved_pos) + __field(int, appending) + __field(unsigned long, count) + __field(int, direct_io) + __field(int, has_refcount) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->saved_pos = saved_pos; + __entry->appending = appending; + __entry->count = count; + __entry->direct_io = direct_io ? *direct_io : -1; + __entry->has_refcount = has_refcount ? *has_refcount : -1; + ), + TP_printk("%llu %llu %d %lu %d %d", __entry->ino, + __entry->saved_pos, __entry->appending, __entry->count, + __entry->direct_io, __entry->has_refcount) +); + +DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); + +/* End of trace events for fs/ocfs2/file.c. */ + +/* Trace events for fs/ocfs2/inode.c. */ + +TRACE_EVENT(ocfs2_iget_begin, + TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type), + TP_ARGS(ino, flags, sysfile_type), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, flags) + __field(int, sysfile_type) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->flags = flags; + __entry->sysfile_type = sysfile_type; + ), + TP_printk("%llu %u %d", __entry->ino, + __entry->flags, __entry->sysfile_type) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked); + +TRACE_EVENT(ocfs2_iget_end, + TP_PROTO(void *inode, unsigned long long ino), + TP_ARGS(inode, ino), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + ), + TP_printk("%p %llu", __entry->inode, __entry->ino) +); + +TRACE_EVENT(ocfs2_find_actor, + TP_PROTO(void *inode, unsigned long long ino, + void *args, unsigned long long fi_blkno), + TP_ARGS(inode, ino, args, fi_blkno), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + __field(void *, args) + __field(unsigned long long, fi_blkno) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->args = args; + __entry->fi_blkno = fi_blkno; + ), + TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino, + __entry->args, __entry->fi_blkno) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); + +TRACE_EVENT(ocfs2_inode_is_valid_to_delete, + TP_PROTO(void *task, void *dc_task, unsigned long long ino, + unsigned int flags), + TP_ARGS(task, dc_task, ino, flags), + TP_STRUCT__entry( + __field(void *, task) + __field(void *, dc_task) + __field(unsigned long long, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->task = task; + __entry->dc_task = dc_task; + __entry->ino = ino; + __entry->flags = flags; + ), + TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task, + __entry->ino, __entry->flags) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); + +TRACE_EVENT(ocfs2_inode_revalidate, + TP_PROTO(void *inode, unsigned long long ino, + unsigned int flags), + TP_ARGS(inode, ino, flags), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->flags = flags; + ), + TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty); + +/* End of trace events for fs/ocfs2/inode.c. */ + +/* Trace events for fs/ocfs2/extent_map.c. */ + +TRACE_EVENT(ocfs2_read_virt_blocks, + TP_PROTO(void *inode, unsigned long long vblock, int nr, + void *bhs, unsigned int flags, void *validate), + TP_ARGS(inode, vblock, nr, bhs, flags, validate), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, vblock) + __field(int, nr) + __field(void *, bhs) + __field(unsigned int, flags) + __field(void *, validate) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->vblock = vblock; + __entry->nr = nr; + __entry->bhs = bhs; + __entry->flags = flags; + __entry->validate = validate; + ), + TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock, + __entry->nr, __entry->bhs, __entry->flags, __entry->validate) +); + +/* End of trace events for fs/ocfs2/extent_map.c. */ + +/* Trace events for fs/ocfs2/slot_map.c. */ + +DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block); + +DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot); + +/* End of trace events for fs/ocfs2/slot_map.c. */ + +/* Trace events for fs/ocfs2/heartbeat.c. */ + +DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down); + +/* End of trace events for fs/ocfs2/heartbeat.c. */ + +/* Trace events for fs/ocfs2/super.c. */ + +TRACE_EVENT(ocfs2_remount, + TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags), + TP_ARGS(s_flags, osb_flags, flags), + TP_STRUCT__entry( + __field(unsigned long, s_flags) + __field(unsigned long, osb_flags) + __field(int, flags) + ), + TP_fast_assign( + __entry->s_flags = s_flags; + __entry->osb_flags = osb_flags; + __entry->flags = flags; + ), + TP_printk("%lu %lu %d", __entry->s_flags, + __entry->osb_flags, __entry->flags) +); + +TRACE_EVENT(ocfs2_fill_super, + TP_PROTO(void *sb, void *data, int silent), + TP_ARGS(sb, data, silent), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, data) + __field(int, silent) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->data = data; + __entry->silent = silent; + ), + TP_printk("%p %p %d", __entry->sb, + __entry->data, __entry->silent) +); + +TRACE_EVENT(ocfs2_parse_options, + TP_PROTO(int is_remount, char *options), + TP_ARGS(is_remount, options), + TP_STRUCT__entry( + __field(int, is_remount) + __string(options, options) + ), + TP_fast_assign( + __entry->is_remount = is_remount; + __assign_str(options, options); + ), + TP_printk("%d %s", __entry->is_remount, __get_str(options)) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super); + +TRACE_EVENT(ocfs2_statfs, + TP_PROTO(void *sb, void *buf), + TP_ARGS(sb, buf), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, buf) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->buf = buf; + ), + TP_printk("%p %p", __entry->sb, __entry->buf) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume); + +TRACE_EVENT(ocfs2_initialize_super, + TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir, + unsigned long long system_dir, int cluster_bits), + TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits), + TP_STRUCT__entry( + __string(label, label) + __string(uuid_str, uuid_str) + __field(unsigned long long, root_dir) + __field(unsigned long long, system_dir) + __field(int, cluster_bits) + ), + TP_fast_assign( + __assign_str(label, label); + __assign_str(uuid_str, uuid_str); + __entry->root_dir = root_dir; + __entry->system_dir = system_dir; + __entry->cluster_bits = cluster_bits; + ), + TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str), + __entry->root_dir, __entry->system_dir, __entry->cluster_bits) +); + +/* End of trace events for fs/ocfs2/super.c. */ + +/* Trace events for fs/ocfs2/xattr.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation); + +TRACE_EVENT(ocfs2_init_xattr_set_ctxt, + TP_PROTO(const char *name, int meta, int clusters, int credits), + TP_ARGS(name, meta, clusters, credits), + TP_STRUCT__entry( + __string(name, name) + __field(int, meta) + __field(int, clusters) + __field(int, credits) + ), + TP_fast_assign( + __assign_str(name, name); + __entry->meta = meta; + __entry->clusters = clusters; + __entry->credits = credits; + ), + TP_printk("%s %d %d %d", __get_str(name), __entry->meta, + __entry->clusters, __entry->credits) +); + +DECLARE_EVENT_CLASS(ocfs2__xattr_find, + TP_PROTO(unsigned long long ino, const char *name, int name_index, + unsigned int hash, unsigned long long location, + int xe_index), + TP_ARGS(ino, name, name_index, hash, location, xe_index), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __string(name, name) + __field(int, name_index) + __field(unsigned int, hash) + __field(unsigned long long, location) + __field(int, xe_index) + ), + TP_fast_assign( + __entry->ino = ino; + __assign_str(name, name); + __entry->name_index = name_index; + __entry->hash = hash; + __entry->location = location; + __entry->xe_index = xe_index; + ), + TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name), + __entry->name_index, __entry->hash, __entry->location, + __entry->xe_index) +); + +#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \ +DEFINE_EVENT(ocfs2__xattr_find, name, \ +TP_PROTO(unsigned long long ino, const char *name, int name_index, \ + unsigned int hash, unsigned long long bucket, \ + int xe_index), \ + TP_ARGS(ino, name, name_index, hash, bucket, xe_index)) + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find); + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find); + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block); + +DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket); + +DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec); + +/* End of trace events for fs/ocfs2/xattr.c. */ + +/* Trace events for fs/ocfs2/reservations.c. */ + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end); + +TRACE_EVENT(ocfs2_resv_find_window_begin, + TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal, + unsigned int wanted, int empty_root), + TP_ARGS(r_start, r_end, goal, wanted, empty_root), + TP_STRUCT__entry( + __field(unsigned int, r_start) + __field(unsigned int, r_end) + __field(unsigned int, goal) + __field(unsigned int, wanted) + __field(int, empty_root) + ), + TP_fast_assign( + __entry->r_start = r_start; + __entry->r_end = r_end; + __entry->goal = goal; + __entry->wanted = wanted; + __entry->empty_root = empty_root; + ), + TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end, + __entry->goal, __entry->wanted, __entry->empty_root) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin); + +TRACE_EVENT(ocfs2_cannibalize_resv_end, + TP_PROTO(unsigned int start, unsigned int end, unsigned int len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(start, end, len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, start) + __field(unsigned int, end) + __field(unsigned int, len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->len = len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u", __entry->start, __entry->end, + __entry->len, __entry->last_start, __entry->last_len) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits); + +TRACE_EVENT(ocfs2_resmap_claimed_bits_begin, + TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen, + unsigned int r_start, unsigned int r_end, unsigned int r_len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(cstart, cend, clen, r_start, r_end, + r_len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, cstart) + __field(unsigned int, cend) + __field(unsigned int, clen) + __field(unsigned int, r_start) + __field(unsigned int, r_end) + __field(unsigned int, r_len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->cstart = cstart; + __entry->cend = cend; + __entry->clen = clen; + __entry->r_start = r_start; + __entry->r_end = r_end; + __entry->r_len = r_len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u %u %u %u", + __entry->cstart, __entry->cend, __entry->clen, + __entry->r_start, __entry->r_end, __entry->r_len, + __entry->last_start, __entry->last_len) +); + +TRACE_EVENT(ocfs2_resmap_claimed_bits_end, + TP_PROTO(unsigned int start, unsigned int end, unsigned int len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(start, end, len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, start) + __field(unsigned int, end) + __field(unsigned int, len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->len = len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u", __entry->start, __entry->end, + __entry->len, __entry->last_start, __entry->last_len) +); + +/* End of trace events for fs/ocfs2/reservations.c. */ + +/* Trace events for fs/ocfs2/quota_local.c. */ + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file); + +DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot); + +/* End of trace events for fs/ocfs2/quota_local.c. */ + +/* Trace events for fs/ocfs2/quota_global.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block); + +TRACE_EVENT(ocfs2_sync_dquot, + TP_PROTO(unsigned int dq_id, long long dqb_curspace, + long long spacechange, long long curinodes, + long long inodechange), + TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange), + TP_STRUCT__entry( + __field(unsigned int, dq_id) + __field(long long, dqb_curspace) + __field(long long, spacechange) + __field(long long, curinodes) + __field(long long, inodechange) + ), + TP_fast_assign( + __entry->dq_id = dq_id; + __entry->dqb_curspace = dqb_curspace; + __entry->spacechange = spacechange; + __entry->curinodes = curinodes; + __entry->inodechange = inodechange; + ), + TP_printk("%u %lld %lld %lld %lld", __entry->dq_id, + __entry->dqb_curspace, __entry->spacechange, + __entry->curinodes, __entry->inodechange) +); + +TRACE_EVENT(ocfs2_sync_dquot_helper, + TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type, + const char *s_id), + TP_ARGS(dq_id, dq_type, type, s_id), + + TP_STRUCT__entry( + __field(unsigned int, dq_id) + __field(unsigned int, dq_type) + __field(unsigned long, type) + __string(s_id, s_id) + ), + TP_fast_assign( + __entry->dq_id = dq_id; + __entry->dq_type = dq_type; + __entry->type = type; + __assign_str(s_id, s_id); + ), + TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type, + __entry->type, __get_str(s_id)) +); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); + +/* End of trace events for fs/ocfs2/quota_global.c. */ + +/* Trace events for fs/ocfs2/dir.c. */ +DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el); + +TRACE_EVENT(ocfs2_dx_dir_search, + TP_PROTO(unsigned long long ino, int namelen, const char *name, + unsigned int major_hash, unsigned int minor_hash, + unsigned long long blkno), + TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(int, namelen) + __string(name, name) + __field(unsigned int, major_hash) + __field(unsigned int,minor_hash) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->namelen = namelen; + __assign_str(name, name); + __entry->major_hash = major_hash; + __entry->minor_hash = minor_hash; + __entry->blkno = blkno; + ), + TP_printk("%llu %.*s %u %u %llu", __entry->ino, + __entry->namelen, __get_str(name), + __entry->major_hash, __entry->minor_hash, __entry->blkno) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir); + +TRACE_EVENT(ocfs2_find_files_on_disk, + TP_PROTO(int namelen, const char *name, void *blkno, + unsigned long long dir), + TP_ARGS(namelen, name, blkno, dir), + TP_STRUCT__entry( + __field(int, namelen) + __string(name, name) + __field(void *, blkno) + __field(unsigned long long, dir) + ), + TP_fast_assign( + __entry->namelen = namelen; + __assign_str(name, name); + __entry->blkno = blkno; + __entry->dir = dir; + ), + TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name), + __entry->blkno, __entry->dir) +); + +TRACE_EVENT(ocfs2_check_dir_for_entry, + TP_PROTO(unsigned long long dir, int namelen, const char *name), + TP_ARGS(dir, namelen, name), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __field(int, namelen) + __string(name, name) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->namelen = namelen; + __assign_str(name, name); + ), + TP_printk("%llu %.*s", __entry->dir, + __entry->namelen, __get_str(name)) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster); + +TRACE_EVENT(ocfs2_dx_dir_index_root_block, + TP_PROTO(unsigned long long dir, + unsigned int major_hash, unsigned int minor_hash, + int namelen, const char *name, unsigned int num_used), + TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __field(unsigned int, major_hash) + __field(unsigned int, minor_hash) + __field(int, namelen) + __string(name, name) + __field(unsigned int, num_used) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->major_hash = major_hash; + __entry->minor_hash = minor_hash; + __entry->namelen = namelen; + __assign_str(name, name); + __entry->num_used = num_used; + ), + TP_printk("%llu %x %x %.*s %u", __entry->dir, + __entry->major_hash, __entry->minor_hash, + __entry->namelen, __get_str(name), __entry->num_used) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert); + +/* End of trace events for fs/ocfs2/dir.c. */ + +/* Trace events for fs/ocfs2/namei.c. */ + +DECLARE_EVENT_CLASS(ocfs2__dentry_ops, + TP_PROTO(void *dir, void *dentry, int name_len, const char *name, + unsigned long long dir_blkno, unsigned long long extra), + TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(unsigned long long, dir_blkno) + __field(unsigned long long, extra) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->dir_blkno = dir_blkno; + __entry->extra = extra; + ), + TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry, + __entry->name_len, __get_str(name), + __entry->dir_blkno, __entry->extra) +); + +#define DEFINE_OCFS2_DENTRY_OPS(name) \ +DEFINE_EVENT(ocfs2__dentry_ops, name, \ +TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \ + unsigned long long dir_blkno, unsigned long long extra), \ + TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra)) + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_create); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret); + +TRACE_EVENT(ocfs2_mknod, + TP_PROTO(void *dir, void *dentry, int name_len, const char *name, + unsigned long long dir_blkno, unsigned long dev, int mode), + TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(unsigned long long, dir_blkno) + __field(unsigned long, dev) + __field(int, mode) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->dir_blkno = dir_blkno; + __entry->dev = dev; + __entry->mode = mode; + ), + TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry, + __entry->name_len, __get_str(name), + __entry->dir_blkno, __entry->dev, __entry->mode) +); + +TRACE_EVENT(ocfs2_link, + TP_PROTO(unsigned long long ino, int old_len, const char *old_name, + int name_len, const char *name), + TP_ARGS(ino, old_len, old_name, name_len, name), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(int, old_len) + __string(old_name, old_name) + __field(int, name_len) + __string(name, name) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->old_len = old_len; + __assign_str(old_name, old_name); + __entry->name_len = name_len; + __assign_str(name, name); + ), + TP_printk("%llu %.*s %.*s", __entry->ino, + __entry->old_len, __get_str(old_name), + __entry->name_len, __get_str(name)) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end); + +TRACE_EVENT(ocfs2_rename, + TP_PROTO(void *old_dir, void *old_dentry, + void *new_dir, void *new_dentry, + int old_len, const char *old_name, + int new_len, const char *new_name), + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, + old_len, old_name, new_len, new_name), + TP_STRUCT__entry( + __field(void *, old_dir) + __field(void *, old_dentry) + __field(void *, new_dir) + __field(void *, new_dentry) + __field(int, old_len) + __string(old_name, old_name) + __field(int, new_len) + __string(new_name, new_name) + ), + TP_fast_assign( + __entry->old_dir = old_dir; + __entry->old_dentry = old_dentry; + __entry->new_dir = new_dir; + __entry->new_dentry = new_dentry; + __entry->old_len = old_len; + __assign_str(old_name, old_name); + __entry->new_len = new_len; + __assign_str(new_name, new_name); + ), + TP_printk("%p %p %p %p %.*s %.*s", + __entry->old_dir, __entry->old_dentry, + __entry->new_dir, __entry->new_dentry, + __entry->old_len, __get_str(old_name), + __entry->new_len, __get_str(new_name)) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); + +TRACE_EVENT(ocfs2_rename_target_exists, + TP_PROTO(int new_len, const char *new_name), + TP_ARGS(new_len, new_name), + TP_STRUCT__entry( + __field(int, new_len) + __string(new_name, new_name) + ), + TP_fast_assign( + __entry->new_len = new_len; + __assign_str(new_name, new_name); + ), + TP_printk("%.*s", __entry->new_len, __get_str(new_name)) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree); + +TRACE_EVENT(ocfs2_rename_over_existing, + TP_PROTO(unsigned long long new_blkno, void *new_bh, + unsigned long long newdi_blkno), + TP_ARGS(new_blkno, new_bh, newdi_blkno), + TP_STRUCT__entry( + __field(unsigned long long, new_blkno) + __field(void *, new_bh) + __field(unsigned long long, newdi_blkno) + ), + TP_fast_assign( + __entry->new_blkno = new_blkno; + __entry->new_bh = new_bh; + __entry->newdi_blkno = newdi_blkno; + ), + TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh, + __entry->newdi_blkno) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data); + +TRACE_EVENT(ocfs2_symlink_begin, + TP_PROTO(void *dir, void *dentry, const char *symname, + int len, const char *name), + TP_ARGS(dir, dentry, symname, len, name), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(const char *, symname) + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->symname = symname; + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry, + __entry->symname, __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_blkno_stringify, + TP_PROTO(unsigned long long blkno, const char *name, int namelen), + TP_ARGS(blkno, name, namelen), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __string(name, name) + __field(int, namelen) + ), + TP_fast_assign( + __entry->blkno = blkno; + __assign_str(name, name); + __entry->namelen = namelen; + ), + TP_printk("%llu %s %d", __entry->blkno, __get_str(name), + __entry->namelen) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end); + +TRACE_EVENT(ocfs2_orphan_del, + TP_PROTO(unsigned long long dir, const char *name, int namelen), + TP_ARGS(dir, name, namelen), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __string(name, name) + __field(int, namelen) + ), + TP_fast_assign( + __entry->dir = dir; + __assign_str(name, name); + __entry->namelen = namelen; + ), + TP_printk("%llu %s %d", __entry->dir, __get_str(name), + __entry->namelen) +); + +/* End of trace events for fs/ocfs2/namei.c. */ + +/* Trace events for fs/ocfs2/dcache.c. */ + +TRACE_EVENT(ocfs2_dentry_revalidate, + TP_PROTO(void *dentry, int len, const char *name), + TP_ARGS(dentry, len, name), + TP_STRUCT__entry( + __field(void *, dentry) + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->dentry = dentry; + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_dentry_revalidate_negative, + TP_PROTO(int len, const char *name, unsigned long pgen, + unsigned long gen), + TP_ARGS(len, name, pgen, gen), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + __field(unsigned long, pgen) + __field(unsigned long, gen) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + __entry->pgen = pgen; + __entry->gen = gen; + ), + TP_printk("%.*s %lu %lu", __entry->len, __get_str(name), + __entry->pgen, __entry->gen) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata); + +DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret); + +TRACE_EVENT(ocfs2_find_local_alias, + TP_PROTO(int len, const char *name), + TP_ARGS(len, name), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%.*s", __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_dentry_attach_lock, + TP_PROTO(int len, const char *name, + unsigned long long parent, void *fsdata), + TP_ARGS(len, name, parent, fsdata), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + __field(unsigned long long, parent) + __field(void *, fsdata) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + __entry->parent = parent; + __entry->fsdata = fsdata; + ), + TP_printk("%.*s %llu %p", __entry->len, __get_str(name), + __entry->parent, __entry->fsdata) +); + +TRACE_EVENT(ocfs2_dentry_attach_lock_found, + TP_PROTO(const char *name, unsigned long long parent, + unsigned long long ino), + TP_ARGS(name, parent, ino), + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long long, parent) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __assign_str(name, name); + __entry->parent = parent; + __entry->ino = ino; + ), + TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino) +); +/* End of trace events for fs/ocfs2/dcache.c. */ + +/* Trace events for fs/ocfs2/export.c. */ + +TRACE_EVENT(ocfs2_get_dentry_begin, + TP_PROTO(void *sb, void *handle, unsigned long long blkno), + TP_ARGS(sb, handle, blkno), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, handle) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->handle = handle; + __entry->blkno = blkno; + ), + TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end); + +TRACE_EVENT(ocfs2_get_parent, + TP_PROTO(void *child, int len, const char *name, + unsigned long long ino), + TP_ARGS(child, len, name, ino), + TP_STRUCT__entry( + __field(void *, child) + __field(int, len) + __string(name, name) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __entry->child = child; + __entry->len = len; + __assign_str(name, name); + __entry->ino = ino; + ), + TP_printk("%p %.*s %llu", __entry->child, __entry->len, + __get_str(name), __entry->ino) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end); + +TRACE_EVENT(ocfs2_encode_fh_begin, + TP_PROTO(void *dentry, int name_len, const char *name, + void *fh, int len, int connectable), + TP_ARGS(dentry, name_len, name, fh, len, connectable), + TP_STRUCT__entry( + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(void *, fh) + __field(int, len) + __field(int, connectable) + ), + TP_fast_assign( + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->fh = fh; + __entry->len = len; + __entry->connectable = connectable; + ), + TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len, + __get_str(name), __entry->fh, __entry->len, + __entry->connectable) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent); + +DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type); + +/* End of trace events for fs/ocfs2/export.c. */ + +/* Trace events for fs/ocfs2/journal.c. */ + +DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); + +DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen); + +DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery); + +DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end); + +TRACE_EVENT(ocfs2_complete_recovery_slot, + TP_PROTO(int slot, unsigned long long la_ino, + unsigned long long tl_ino, void *qrec), + TP_ARGS(slot, la_ino, tl_ino, qrec), + TP_STRUCT__entry( + __field(int, slot) + __field(unsigned long long, la_ino) + __field(unsigned long long, tl_ino) + __field(void *, qrec) + ), + TP_fast_assign( + __entry->slot = slot; + __entry->la_ino = la_ino; + __entry->tl_ino = tl_ino; + __entry->qrec = qrec; + ), + TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino, + __entry->tl_ino, __entry->qrec) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node); + +DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end); + +TRACE_EVENT(ocfs2_recovery_thread, + TP_PROTO(int node_num, int osb_node_num, int disable, + void *recovery_thread, int map_set), + TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set), + TP_STRUCT__entry( + __field(int, node_num) + __field(int, osb_node_num) + __field(int,disable) + __field(void *, recovery_thread) + __field(int,map_set) + ), + TP_fast_assign( + __entry->node_num = node_num; + __entry->osb_node_num = osb_node_num; + __entry->disable = disable; + __entry->recovery_thread = recovery_thread; + __entry->map_set = map_set; + ), + TP_printk("%d %d %d %p %d", __entry->node_num, + __entry->osb_node_num, __entry->disable, + __entry->recovery_thread, __entry->map_set) +); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered); + +DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err); + +DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir); + +DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput); + +DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount); + +/* End of trace events for fs/ocfs2/journal.c. */ + +/* Trace events for fs/ocfs2/buffer_head_io.c. */ + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end); + +TRACE_EVENT(ocfs2_write_block, + TP_PROTO(unsigned long long block, void *ci), + TP_ARGS(block, ci), + TP_STRUCT__entry( + __field(unsigned long long, block) + __field(void *, ci) + ), + TP_fast_assign( + __entry->block = block; + __entry->ci = ci; + ), + TP_printk("%llu %p", __entry->block, __entry->ci) +); + +TRACE_EVENT(ocfs2_read_blocks_begin, + TP_PROTO(void *ci, unsigned long long block, + unsigned int nr, int flags), + TP_ARGS(ci, block, nr, flags), + TP_STRUCT__entry( + __field(void *, ci) + __field(unsigned long long, block) + __field(unsigned int, nr) + __field(int, flags) + ), + TP_fast_assign( + __entry->ci = ci; + __entry->block = block; + __entry->nr = nr; + __entry->flags = flags; + ), + TP_printk("%p %llu %u %d", __entry->ci, __entry->block, + __entry->nr, __entry->flags) +); + +/* End of trace events for fs/ocfs2/buffer_head_io.c. */ + +/* Trace events for fs/ocfs2/uptodate.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin); + +TRACE_EVENT(ocfs2_buffer_cached_end, + TP_PROTO(int index, void *item), + TP_ARGS(index, item), + TP_STRUCT__entry( + __field(int, index) + __field(void *, item) + ), + TP_fast_assign( + __entry->index = index; + __entry->item = item; + ), + TP_printk("%d %p", __entry->index, __entry->item) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache); + +/* End of trace events for fs/ocfs2/uptodate.c. */ +#endif /* _TRACE_OCFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE ocfs2_trace +#include diff --git a/kernel/fs/ocfs2/quota.h b/kernel/fs/ocfs2/quota.h new file mode 100644 index 000000000..b6d51333a --- /dev/null +++ b/kernel/fs/ocfs2/quota.h @@ -0,0 +1,123 @@ +/* + * quota.h for OCFS2 + * + * On disk quota structures for local and global quota file, in-memory + * structures. + * + */ + +#ifndef _OCFS2_QUOTA_H +#define _OCFS2_QUOTA_H + +#include +#include +#include +#include +#include + +#include "ocfs2.h" + +/* Number of quota types we support */ +#define OCFS2_MAXQUOTAS 2 + +/* + * In-memory structures + */ +struct ocfs2_dquot { + struct dquot dq_dquot; /* Generic VFS dquot */ + loff_t dq_local_off; /* Offset in the local quota file */ + u64 dq_local_phys_blk; /* Physical block carrying quota structure */ + struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ + unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ + s64 dq_origspace; /* Last globally synced space usage */ + s64 dq_originodes; /* Last globally synced inode usage */ + struct llist_node list; /* Member of list of dquots to drop */ +}; + +/* Description of one chunk to recover in memory */ +struct ocfs2_recovery_chunk { + struct list_head rc_list; /* List of chunks */ + int rc_chunk; /* Chunk number */ + unsigned long *rc_bitmap; /* Bitmap of entries to recover */ +}; + +struct ocfs2_quota_recovery { + struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */ +}; + +/* In-memory structure with quota header information */ +struct ocfs2_mem_dqinfo { + unsigned int dqi_type; /* Quota type this structure describes */ + unsigned int dqi_flags; /* Flags OLQF_* */ + unsigned int dqi_chunks; /* Number of chunks in local quota file */ + unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ + unsigned int dqi_syncms; /* How often should we sync with other nodes */ + struct list_head dqi_chunk; /* List of chunks */ + struct inode *dqi_gqinode; /* Global quota file inode */ + struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ + struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ + int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ + u64 dqi_giblk; /* Number of block with global information header */ + struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ + struct buffer_head *dqi_libh; /* Buffer with local information header */ + struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ + struct delayed_work dqi_sync_work; /* Work for syncing dquots */ + struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery + * information, in case we + * enable quotas on file + * needing it */ +}; + +static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot) +{ + return container_of(dquot, struct ocfs2_dquot, dq_dquot); +} + +struct ocfs2_quota_chunk { + struct list_head qc_chunk; /* List of quotafile chunks */ + int qc_num; /* Number of quota chunk */ + struct buffer_head *qc_headerbh; /* Buffer head with chunk header */ +}; + +extern struct kmem_cache *ocfs2_dquot_cachep; +extern struct kmem_cache *ocfs2_qf_chunk_cachep; + +extern struct qtree_fmt_operations ocfs2_global_ops; + +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, int slot_num); +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num); +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec); +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); +int ocfs2_global_read_info(struct super_block *sb, int type); +int ocfs2_global_write_info(struct super_block *sb, int type); +int ocfs2_global_read_dquot(struct dquot *dquot); +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); +static inline int ocfs2_sync_dquot(struct dquot *dquot) +{ + return __ocfs2_sync_dquot(dquot, 0); +} +static inline int ocfs2_global_release_dquot(struct dquot *dquot) +{ + return __ocfs2_sync_dquot(dquot, 1); +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh); +int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, + struct buffer_head **bh); +int ocfs2_create_local_dquot(struct dquot *dquot); +int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); +int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work); + +extern const struct dquot_operations ocfs2_quota_operations; +extern struct quota_format_type ocfs2_quota_format; + +#endif /* _OCFS2_QUOTA_H */ diff --git a/kernel/fs/ocfs2/quota_global.c b/kernel/fs/ocfs2/quota_global.c new file mode 100644 index 000000000..c93d67220 --- /dev/null +++ b/kernel/fs/ocfs2/quota_global.c @@ -0,0 +1,971 @@ +/* + * Implementation of operations over global quota file + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "alloc.h" +#include "blockcheck.h" +#include "inode.h" +#include "journal.h" +#include "file.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "quota.h" +#include "ocfs2_trace.h" + +/* + * Locking of quotas with OCFS2 is rather complex. Here are rules that + * should be obeyed by all the functions: + * - any write of quota structure (either to local or global file) is protected + * by dqio_mutex or dquot->dq_lock. + * - any modification of global quota file holds inode cluster lock, i_mutex, + * and ip_alloc_sem of the global quota file (achieved by + * ocfs2_lock_global_qf). It also has to hold qinfo_lock. + * - an allocation of new blocks for local quota file is protected by + * its ip_alloc_sem + * + * A rough sketch of locking dependencies (lf = local file, gf = global file): + * Normal filesystem operation: + * start_trans -> dqio_mutex -> write to lf + * Syncing of local and global file: + * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * write to gf + * -> write to lf + * Acquire dquot for the first time: + * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf + * -> alloc space for gf + * -> start_trans -> qinfo_lock -> write to gf + * -> ip_alloc_sem of lf -> alloc space for lf + * -> write to lf + * Release last reference to dquot: + * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf + * -> write to lf + * Note that all the above operations also hold the inode cluster lock of lf. + * Recovery: + * inode cluster lock of recovered lf + * -> read bitmaps -> ip_alloc_sem of lf + * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * write to gf + */ + +static void qsync_work_fn(struct work_struct *work); + +static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + /* Update from disk only entries not set by the admin */ + if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) { + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) { + m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit); + m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags)) + m->dqb_btime = le64_to_cpu(d->dqb_btime); + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags)) + m->dqb_itime = le64_to_cpu(d->dqb_itime); + OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count); +} + +static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id)); + d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count); + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); + d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_pad1 = d->dqb_pad2 = 0; +} + +static int ocfs2_global_is_id(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; + + if (qtree_entry_unused(&oinfo->dqi_gi, dp)) + return 0; + + return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type, + le32_to_cpu(d->dqb_id)), + dquot->dq_id); +} + +struct qtree_fmt_operations ocfs2_global_ops = { + .mem2disk_dqblk = ocfs2_global_mem2diskdqb, + .disk2mem_dqblk = ocfs2_global_disk2memdqb, + .is_id = ocfs2_global_is_id, +}; + +int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); + + trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); +} + +int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, + struct buffer_head **bhp) +{ + int rc; + + *bhp = NULL; + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + return rc; +} + +/* Read data from global quotafile - avoid pagecache and such because we cannot + * afford acquiring the locks... We use quota cluster lock to serialize + * operations. Caller is responsible for acquiring it. */ +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + loff_t i_size = i_size_read(gqinode); + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + struct buffer_head *bh; + size_t toread, tocopy; + u64 pblock = 0, pcount = 0; + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); + if (!pcount) { + err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, + &pcount, NULL); + if (err) { + mlog_errno(err); + return err; + } + } else { + pcount--; + pblock++; + } + bh = NULL; + err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh); + if (err) { + mlog_errno(err); + return err; + } + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0, new = 0, ja_type; + struct buffer_head *bh = NULL; + handle_t *handle = journal_current_handle(); + u64 pblock, pcount; + + if (!handle) { + mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " + "because transaction was not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) { + WARN_ON(1); + len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; + } + + if (i_size_read(gqinode) < off + len) { + loff_t rounded_end = + ocfs2_align_bytes_to_blocks(sb, off + len); + + /* Space is already allocated in ocfs2_acquire_dquot() */ + err = ocfs2_simple_size_update(gqinode, + oinfo->dqi_gqi_bh, + rounded_end); + if (err < 0) + goto out; + new = 1; + } + err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL); + if (err) { + mlog_errno(err); + goto out; + } + /* Not rewriting whole block? */ + if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && + !new) { + err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh); + ja_type = OCFS2_JOURNAL_ACCESS_WRITE; + } else { + bh = sb_getblk(sb, pblock); + if (!bh) + err = -ENOMEM; + ja_type = OCFS2_JOURNAL_ACCESS_CREATE; + } + if (err) { + mlog_errno(err); + goto out; + } + lock_buffer(bh); + if (new) + memset(bh->b_data, 0, sb->s_blocksize); + memcpy(bh->b_data + offset, data, len); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh); + err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh, + ja_type); + if (err < 0) { + brelse(bh); + goto out; + } + ocfs2_journal_dirty(handle, bh); + brelse(bh); +out: + if (err) { + mlog_errno(err); + return err; + } + gqinode->i_version++; + ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); + return len; +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + int status; + struct buffer_head *bh = NULL; + + status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex); + if (status < 0) + return status; + spin_lock(&dq_data_lock); + if (!oinfo->dqi_gqi_count++) + oinfo->dqi_gqi_bh = bh; + else + WARN_ON(bh != oinfo->dqi_gqi_bh); + spin_unlock(&dq_data_lock); + if (ex) { + mutex_lock(&oinfo->dqi_gqinode->i_mutex); + down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } else { + down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } + return 0; +} + +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + if (ex) { + up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + } else { + up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } + ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); + brelse(oinfo->dqi_gqi_bh); + spin_lock(&dq_data_lock); + if (!--oinfo->dqi_gqi_count) + oinfo->dqi_gqi_bh = NULL; + spin_unlock(&dq_data_lock); +} + +/* Read information header from global quota file */ +int ocfs2_global_read_info(struct super_block *sb, int type) +{ + struct inode *gqinode = NULL; + unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct ocfs2_global_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + u64 pcount; + int status; + + /* Read global header */ + gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!gqinode) { + mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", + type); + status = -EINVAL; + goto out_err; + } + oinfo->dqi_gi.dqi_sb = sb; + oinfo->dqi_gi.dqi_type = type; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); + oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); + oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; + oinfo->dqi_gqi_bh = NULL; + oinfo->dqi_gqi_count = 0; + oinfo->dqi_gqinode = gqinode; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk, + &pcount, NULL); + if (status < 0) + goto out_unlock; + + status = ocfs2_qinfo_lock(oinfo, 0); + if (status < 0) + goto out_unlock; + status = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + ocfs2_qinfo_unlock(oinfo, 0); + ocfs2_unlock_global_qf(oinfo, 0); + if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot read global quota info (%d).\n", + status); + if (status >= 0) + status = -EIO; + mlog_errno(status); + goto out_err; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits; + oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize - + OCFS2_QBLK_RESERVED_SPACE; + oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); + INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); + +out_err: + return status; +out_unlock: + ocfs2_unlock_global_qf(oinfo, 0); + mlog_errno(status); + goto out_err; +} + +/* Write information to global quota file. Expects exlusive lock on quota + * file inode and quota info */ +static int __ocfs2_global_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_global_disk_dqinfo dinfo; + ssize_t size; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + spin_unlock(&dq_data_lock); + dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms); + dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry); + size = sb->s_op->quota_write(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + if (size != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot write global quota info structure\n"); + if (size >= 0) + size = -EIO; + return size; + } + return 0; +} + +int ocfs2_global_write_info(struct super_block *sb, int type) +{ + int err; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + + err = ocfs2_qinfo_lock(info, 1); + if (err < 0) + return err; + err = __ocfs2_global_write_info(sb, type); + ocfs2_qinfo_unlock(info, 1); + return err; +} + +static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + /* + * We may need to allocate tree blocks and a leaf block but not the + * root block + */ + return oinfo->dqi_gi.dqi_qtree_depth; +} + +static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) +{ + /* We modify all the allocated blocks, tree root, info block and + * the inode */ + return (ocfs2_global_qinit_alloc(sb, type) + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1; +} + +/* Sync local information about quota modifications with global quota file. + * Caller must have started the transaction and obtained exclusive lock for + * global quota file inode */ +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) +{ + int err, err2; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_id.type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_global_disk_dqblk dqblk; + s64 spacechange, inodechange; + time_t olditime, oldbtime; + + err = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct ocfs2_global_disk_dqblk), + dquot->dq_off); + if (err != sizeof(struct ocfs2_global_disk_dqblk)) { + if (err >= 0) { + mlog(ML_ERROR, "Short read from global quota file " + "(%u read)\n", err); + err = -EIO; + } + goto out; + } + + /* Update space and inode usage. Get also other information from + * global quota file so that we don't overwrite any changes there. + * We are */ + spin_lock(&dq_data_lock); + spacechange = dquot->dq_dqb.dqb_curspace - + OCFS2_DQUOT(dquot)->dq_origspace; + inodechange = dquot->dq_dqb.dqb_curinodes - + OCFS2_DQUOT(dquot)->dq_originodes; + olditime = dquot->dq_dqb.dqb_itime; + oldbtime = dquot->dq_dqb.dqb_btime; + ocfs2_global_disk2memdqb(dquot, &dqblk); + trace_ocfs2_sync_dquot(from_kqid(&init_user_ns, dquot->dq_id), + dquot->dq_dqb.dqb_curspace, + (long long)spacechange, + dquot->dq_dqb.dqb_curinodes, + (long long)inodechange); + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curspace += spacechange; + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curinodes += inodechange; + /* Set properly space grace time... */ + if (dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) && + oldbtime > 0) { + if (dquot->dq_dqb.dqb_btime > 0) + dquot->dq_dqb.dqb_btime = + min(dquot->dq_dqb.dqb_btime, oldbtime); + else + dquot->dq_dqb.dqb_btime = oldbtime; + } + } else { + dquot->dq_dqb.dqb_btime = 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } + /* Set properly inode grace time... */ + if (dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) && + olditime > 0) { + if (dquot->dq_dqb.dqb_itime > 0) + dquot->dq_dqb.dqb_itime = + min(dquot->dq_dqb.dqb_itime, olditime); + else + dquot->dq_dqb.dqb_itime = olditime; + } + } else { + dquot->dq_dqb.dqb_itime = 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } + /* All information is properly updated, clear the flags */ + __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + spin_unlock(&dq_data_lock); + err = ocfs2_qinfo_lock(info, freeing); + if (err < 0) { + mlog(ML_ERROR, "Failed to lock quota info, losing quota write" + " (type=%d, id=%u)\n", dquot->dq_id.type, + (unsigned)from_kqid(&init_user_ns, dquot->dq_id)); + goto out; + } + if (freeing) + OCFS2_DQUOT(dquot)->dq_use_count--; + err = qtree_write_dquot(&info->dqi_gi, dquot); + if (err < 0) + goto out_qlock; + if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) { + err = qtree_release_dquot(&info->dqi_gi, dquot); + if (info_dirty(sb_dqinfo(sb, type))) { + err2 = __ocfs2_global_write_info(sb, type); + if (!err) + err = err2; + } + } +out_qlock: + ocfs2_qinfo_unlock(info, freeing); +out: + if (err < 0) + mlog_errno(err); + return err; +} + +/* + * Functions for periodic syncing of dquots with global file + */ +static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) +{ + handle_t *handle; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + int status = 0; + + trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id), + dquot->dq_id.type, + type, sb->s_id); + if (type != dquot->dq_id.type) + goto out; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + status = ocfs2_sync_dquot(dquot); + if (status < 0) + mlog_errno(status); + /* We have to write local structure as well... */ + status = ocfs2_local_write_dquot(dquot); + if (status < 0) + mlog_errno(status); + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + return status; +} + +static void qsync_work_fn(struct work_struct *work) +{ + struct ocfs2_mem_dqinfo *oinfo = container_of(work, + struct ocfs2_mem_dqinfo, + dqi_sync_work.work); + struct super_block *sb = oinfo->dqi_gqinode->i_sb; + + dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); +} + +/* + * Wrappers for generic quota functions + */ + +static int ocfs2_write_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id), + dquot->dq_id.type); + + handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + status = ocfs2_local_write_dquot(dquot); + mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out: + return status; +} + +static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + /* + * We modify tree, leaf block, global info, local chunk header, + * global and local inode; OCFS2_QINFO_WRITE_CREDITS already + * accounts for inode update + */ + return (oinfo->dqi_gi.dqi_qtree_depth + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + OCFS2_QINFO_WRITE_CREDITS + + OCFS2_INODE_UPDATE_CREDITS; +} + +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dquot_drop_work); + struct llist_node *list; + struct ocfs2_dquot *odquot, *next_odquot; + + list = llist_del_all(&osb->dquot_drop_list); + llist_for_each_entry_safe(odquot, next_odquot, list, list) { + /* Drop the reference we acquired in ocfs2_dquot_release() */ + dqput(&odquot->dq_dquot); + } +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). + */ +static int ocfs2_release_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + trace_ocfs2_release_dquot(from_kqid(&init_user_ns, dquot->dq_id), + dquot->dq_id.type); + + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (atomic_read(&dquot->dq_count) > 1) + goto out; + /* Running from downconvert thread? Postpone quota processing to wq */ + if (current == osb->dc_task) { + /* + * Grab our own reference to dquot and queue it for delayed + * dropping. Quota code rechecks after calling + * ->release_dquot() and won't free dquot structure. + */ + dqgrab(dquot); + /* First entry on list -> queue work */ + if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) + queue_work(ocfs2_wq, &osb->dquot_drop_work); + goto out; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, + ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_local_release_dquot(handle, dquot); + /* + * If we fail here, we cannot do much as global structure is + * already released. So just complain... + */ + if (status < 0) + mlog_errno(status); + /* + * Clear dq_off so that we search for the structure in quota file next + * time we acquire it. The structure might be deleted and reallocated + * elsewhere by another node while our dquot structure is on freelist. + */ + dquot->dq_off = 0; + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_trans: + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mutex_unlock(&dquot->dq_lock); + if (status) + mlog_errno(status); + return status; +} + +/* + * Read global dquot structure from disk or create it if it does + * not exist. Also update use count of the global structure and + * create structure in node-local quota file. + */ +static int ocfs2_acquire_dquot(struct dquot *dquot) +{ + int status = 0, err; + int ex = 0; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + int type = dquot->dq_id.type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle; + + trace_ocfs2_acquire_dquot(from_kqid(&init_user_ns, dquot->dq_id), + type); + mutex_lock(&dquot->dq_lock); + /* + * We need an exclusive lock, because we're going to update use count + * and instantiate possibly new dquot structure + */ + status = ocfs2_lock_global_qf(info, 1); + if (status < 0) + goto out; + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_dq; + /* + * We always want to read dquot structure from disk because we don't + * know what happened with it while it was on freelist. + */ + status = qtree_read_dquot(&info->dqi_gi, dquot); + ocfs2_qinfo_unlock(info, 0); + if (status < 0) + goto out_dq; + + OCFS2_DQUOT(dquot)->dq_use_count++; + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + if (!dquot->dq_off) { /* No real quota entry? */ + ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + status = ocfs2_extend_no_holes(gqinode, NULL, + i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits), + i_size_read(gqinode)); + if (status < 0) + goto out_dq; + } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto out_dq; + } + status = ocfs2_qinfo_lock(info, ex); + if (status < 0) + goto out_trans; + status = qtree_write_dquot(&info->dqi_gi, dquot); + if (ex && info_dirty(sb_dqinfo(sb, type))) { + err = __ocfs2_global_write_info(sb, type); + if (!status) + status = err; + } + ocfs2_qinfo_unlock(info, ex); +out_trans: + ocfs2_commit_trans(osb, handle); +out_dq: + ocfs2_unlock_global_qf(info, 1); + if (status < 0) + goto out; + + status = ocfs2_create_local_dquot(dquot); + if (status < 0) + goto out; + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out: + mutex_unlock(&dquot->dq_lock); + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_mark_dquot_dirty(struct dquot *dquot) +{ + unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_INODES_B)) | + (1 << (DQ_LASTSET_B + QIF_SPACE_B)) | + (1 << (DQ_LASTSET_B + QIF_BTIME_B)) | + (1 << (DQ_LASTSET_B + QIF_ITIME_B)); + int sync = 0; + int status; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_id.type; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(sb); + + trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id), + type); + + /* In case user set some limits, sync dquot immediately to global + * quota file so that information propagates quicker */ + spin_lock(&dq_data_lock); + if (dquot->dq_flags & mask) + sync = 1; + spin_unlock(&dq_data_lock); + /* This is a slight hack but we can't afford getting global quota + * lock if we already have a transaction started. */ + if (!sync || journal_current_handle()) { + status = ocfs2_write_dquot(dquot); + goto out; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + status = ocfs2_sync_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_dlock; + } + /* Now write updated local dquot structure */ + status = ocfs2_local_write_dquot(dquot); +out_dlock: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + if (status) + mlog_errno(status); + return status; +} + +/* This should happen only after set_dqinfo(). */ +static int ocfs2_write_info(struct super_block *sb, int type) +{ + handle_t *handle; + int status = 0; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_commit_info(sb, type); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + if (status) + mlog_errno(status); + return status; +} + +static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type) +{ + struct ocfs2_dquot *dquot = + kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS); + + if (!dquot) + return NULL; + return &dquot->dq_dquot; +} + +static void ocfs2_destroy_dquot(struct dquot *dquot) +{ + kmem_cache_free(ocfs2_dquot_cachep, dquot); +} + +const struct dquot_operations ocfs2_quota_operations = { + /* We never make dquot dirty so .write_dquot is never called */ + .acquire_dquot = ocfs2_acquire_dquot, + .release_dquot = ocfs2_release_dquot, + .mark_dirty = ocfs2_mark_dquot_dirty, + .write_info = ocfs2_write_info, + .alloc_dquot = ocfs2_alloc_dquot, + .destroy_dquot = ocfs2_destroy_dquot, +}; diff --git a/kernel/fs/ocfs2/quota_local.c b/kernel/fs/ocfs2/quota_local.c new file mode 100644 index 000000000..3d0b63d34 --- /dev/null +++ b/kernel/fs/ocfs2/quota_local.c @@ -0,0 +1,1315 @@ +/* + * Implementation of operations over local quota file + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "file.h" +#include "buffer_head_io.h" +#include "journal.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "quota.h" +#include "uptodate.h" +#include "super.h" +#include "ocfs2_trace.h" + +/* Number of local quota structures per block */ +static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) +{ + return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / + sizeof(struct ocfs2_local_disk_dqblk)); +} + +/* Number of blocks with entries in one chunk */ +static inline unsigned int ol_chunk_blocks(struct super_block *sb) +{ + return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE) << 3) / + ol_quota_entries_per_block(sb); +} + +/* Number of entries in a chunk bitmap */ +static unsigned int ol_chunk_entries(struct super_block *sb) +{ + return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb); +} + +/* Offset of the chunk in quota file */ +static unsigned int ol_quota_chunk_block(struct super_block *sb, int c) +{ + /* 1 block for local quota file info, 1 block per chunk for chunk info */ + return 1 + (ol_chunk_blocks(sb) + 1) * c; +} + +static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ol_quota_chunk_block(sb, c) + 1 + off / epb; +} + +static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Offset of the dquot structure in the quota file */ +static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) +{ + return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) + + ol_dqblk_block_off(sb, c, off); +} + +static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) +{ + return off & ((1 << sb->s_blocksize_bits) - 1); +} + +/* Compute offset in the chunk of a structure with the given offset */ +static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ((off >> sb->s_blocksize_bits) - + ol_quota_chunk_block(sb, c) - 1) * epb + + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) / + sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Write bufferhead into the fs */ +static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, + void (*modify)(struct buffer_head *, void *), void *private) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + int status; + + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + return status; + } + status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + ocfs2_commit_trans(OCFS2_SB(sb), handle); + return status; + } + lock_buffer(bh); + modify(bh, private); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + return status; + } + return 0; +} + +/* + * Read quota block from a given logical offset. + * + * This function acquires ip_alloc_sem and thus it must not be called with a + * transaction started. + */ +static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, + struct buffer_head **bh) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { + ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested " + "to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + return -EIO; + } + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +/* Check whether we understand format of quota files */ +static int ocfs2_local_check_quota_file(struct super_block *sb, int type) +{ + unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; + unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; + unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; + unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; + unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct buffer_head *bh = NULL; + struct inode *linode = sb_dqopt(sb)->files[type]; + struct inode *ginode = NULL; + struct ocfs2_disk_dqheader *dqhead; + int status, ret = 0; + + /* First check whether we understand local quota file */ + status = ocfs2_read_quota_block(linode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file header (type=%d)\n", + type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) { + mlog(ML_ERROR, "quota file magic does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_magic), + lmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) { + mlog(ML_ERROR, "quota file version does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_version), + lversions[type], type); + goto out_err; + } + brelse(bh); + bh = NULL; + + /* Next check whether we understand global quota file */ + ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!ginode) { + mlog(ML_ERROR, "cannot get global quota file inode " + "(type=%d)\n", type); + goto out_err; + } + /* Since the header is read only, we don't care about locking */ + status = ocfs2_read_quota_block(ginode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read global quota file header " + "(type=%d)\n", type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) { + mlog(ML_ERROR, "global quota file magic does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_magic), gmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) { + mlog(ML_ERROR, "global quota file version does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_version), gversions[type], + type); + goto out_err; + } + + ret = 1; +out_err: + brelse(bh); + iput(ginode); + return ret; +} + +/* Release given list of quota file chunks */ +static void ocfs2_release_local_quota_bitmaps(struct list_head *head) +{ + struct ocfs2_quota_chunk *pos, *next; + + list_for_each_entry_safe(pos, next, head, qc_chunk) { + list_del(&pos->qc_chunk); + brelse(pos->qc_headerbh); + kmem_cache_free(ocfs2_qf_chunk_cachep, pos); + } +} + +/* Load quota bitmaps into memory */ +static int ocfs2_load_local_quota_bitmaps(struct inode *inode, + struct ocfs2_local_disk_dqinfo *ldinfo, + struct list_head *head) +{ + struct ocfs2_quota_chunk *newchunk; + int i, status; + + INIT_LIST_HEAD(head); + for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) { + newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!newchunk) { + ocfs2_release_local_quota_bitmaps(head); + return -ENOMEM; + } + newchunk->qc_num = i; + newchunk->qc_headerbh = NULL; + status = ocfs2_read_quota_block(inode, + ol_quota_chunk_block(inode->i_sb, i), + &newchunk->qc_headerbh); + if (status) { + mlog_errno(status); + kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk); + ocfs2_release_local_quota_bitmaps(head); + return status; + } + list_add_tail(&newchunk->qc_chunk, head); + } + return 0; +} + +static void olq_update_info(struct buffer_head *bh, void *private) +{ + struct mem_dqinfo *info = private; + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_local_disk_dqinfo *ldinfo; + + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + spin_lock(&dq_data_lock); + ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags); + ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); + ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); + spin_unlock(&dq_data_lock); +} + +static int ocfs2_add_recovery_chunk(struct super_block *sb, + struct ocfs2_local_disk_chunk *dchunk, + int chunk, + struct list_head *head) +{ + struct ocfs2_recovery_chunk *rc; + + rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS); + if (!rc) + return -ENOMEM; + rc->rc_chunk = chunk; + rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + if (!rc->rc_bitmap) { + kfree(rc); + return -ENOMEM; + } + memcpy(rc->rc_bitmap, dchunk->dqc_bitmap, + (ol_chunk_entries(sb) + 7) >> 3); + list_add_tail(&rc->rc_list, head); + return 0; +} + +static void free_recovery_list(struct list_head *head) +{ + struct ocfs2_recovery_chunk *next; + struct ocfs2_recovery_chunk *rchunk; + + list_for_each_entry_safe(rchunk, next, head, rc_list) { + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + } +} + +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) +{ + int type; + + for (type = 0; type < OCFS2_MAXQUOTAS; type++) + free_recovery_list(&(rec->r_list[type])); + kfree(rec); +} + +/* Load entries in our quota file we have to recover*/ +static int ocfs2_recovery_load_quota(struct inode *lqinode, + struct ocfs2_local_disk_dqinfo *ldinfo, + int type, + struct list_head *head) +{ + struct super_block *sb = lqinode->i_sb; + struct buffer_head *hbh; + struct ocfs2_local_disk_chunk *dchunk; + int i, chunks = le32_to_cpu(ldinfo->dqi_chunks); + int status = 0; + + for (i = 0; i < chunks; i++) { + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, i), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb)) + status = ocfs2_add_recovery_chunk(sb, dchunk, i, head); + brelse(hbh); + if (status < 0) + break; + } + if (status < 0) + free_recovery_list(head); + return status; +} + +static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) +{ + int type; + struct ocfs2_quota_recovery *rec; + + rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); + if (!rec) + return NULL; + for (type = 0; type < OCFS2_MAXQUOTAS; type++) + INIT_LIST_HEAD(&(rec->r_list[type])); + return rec; +} + +/* Load information we need for quota recovery into memory */ +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, + int slot_num) +{ + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct inode *lqinode; + struct buffer_head *bh; + int type; + int status = 0; + struct ocfs2_quota_recovery *rec; + + printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for " + "slot %u\n", osb->dev_str, slot_num); + + rec = ocfs2_alloc_quota_recovery(); + if (!rec) + return ERR_PTR(-ENOMEM); + /* First init... */ + + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + /* At this point, journal of the slot is already replayed so + * we can trust metadata and data of the quota file */ + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + if (status < 0) { + ocfs2_free_quota_recovery(rec); + rec = ERR_PTR(status); + } + return rec; +} + +/* Sync changes in local quota file into global quota file and + * reinitialize local quota file. + * The function expects local quota file to be already locked and + * dqonoff_mutex locked. */ +static int ocfs2_recover_local_quota_file(struct inode *lqinode, + int type, + struct ocfs2_quota_recovery *rec) +{ + struct super_block *sb = lqinode->i_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_local_disk_chunk *dchunk; + struct ocfs2_local_disk_dqblk *dqblk; + struct dquot *dquot; + handle_t *handle; + struct buffer_head *hbh = NULL, *qbh = NULL; + int status = 0; + int bit, chunk; + struct ocfs2_recovery_chunk *rchunk, *next; + qsize_t spacechange, inodechange; + + trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); + + list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { + chunk = rchunk->rc_chunk; + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, chunk), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { + qbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_dqblk_block(sb, chunk, bit), + &qbh); + if (status) { + mlog_errno(status); + break; + } + dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + + ol_dqblk_block_off(sb, chunk, bit)); + dquot = dqget(sb, + make_kqid(&init_user_ns, type, + le64_to_cpu(dqblk->dqb_id))); + if (!dquot) { + status = -EIO; + mlog(ML_ERROR, "Failed to get quota structure " + "for id %u, type %d. Cannot finish quota " + "file recovery.\n", + (unsigned)le64_to_cpu(dqblk->dqb_id), + type); + goto out_put_bh; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) { + mlog_errno(status); + goto out_put_dquot; + } + + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_drop_lock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + spin_lock(&dq_data_lock); + /* Add usage from quota entry into quota changes + * of our node. Auxiliary variables are important + * due to signedness */ + spacechange = le64_to_cpu(dqblk->dqb_spacemod); + inodechange = le64_to_cpu(dqblk->dqb_inodemod); + dquot->dq_dqb.dqb_curspace += spacechange; + dquot->dq_dqb.dqb_curinodes += inodechange; + spin_unlock(&dq_data_lock); + /* We want to drop reference held by the crashed + * node. Since we have our own reference we know + * global structure actually won't be freed. */ + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + /* Release local quota file entry */ + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(lqinode), + qbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + lock_buffer(qbh); + WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap)); + ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(qbh); + ocfs2_journal_dirty(handle, qbh); +out_commit: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_drop_lock: + ocfs2_unlock_global_qf(oinfo, 1); +out_put_dquot: + dqput(dquot); +out_put_bh: + brelse(qbh); + if (status < 0) + break; + } + brelse(hbh); + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + if (status < 0) + break; + } + if (status < 0) + free_recovery_list(&(rec->r_list[type])); + if (status) + mlog_errno(status); + return status; +} + +/* Recover local quota files for given node different from us */ +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num) +{ + unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct buffer_head *bh; + handle_t *handle; + int type; + int status = 0; + struct inode *lqinode; + unsigned int flags; + + printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " + "slot %u\n", osb->dev_str, slot_num); + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { + if (list_empty(&(rec->r_list[type]))) + continue; + trace_ocfs2_finish_quota_recovery(slot_num); + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_NOQUEUE); + /* Someone else is holding the lock? Then he must be + * doing the recovery. Just skip the file... */ + if (status == -EAGAIN) { + printk(KERN_NOTICE "ocfs2: Skipping quota recovery on " + "device (%s) for slot %d because quota file is " + "locked.\n", osb->dev_str, slot_num); + status = 0; + goto out_put; + } else if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + /* Is recovery still needed? */ + flags = le32_to_cpu(ldinfo->dqi_flags); + if (!(flags & OLQF_CLEAN)) + status = ocfs2_recover_local_quota_file(lqinode, + type, + rec); + /* We don't want to mark file as clean when it is actually + * active */ + if (slot_num == osb->slot_num) + goto out_bh; + /* Mark quota file as clean if we are recovering quota file of + * some other node. */ + handle = ocfs2_start_trans(osb, + OCFS2_LOCAL_QINFO_WRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_bh; + } + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); +out_trans: + ocfs2_commit_trans(osb, handle); +out_bh: + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + kfree(rec); + return status; +} + +/* Read information header from quota file */ +static int ocfs2_local_read_info(struct super_block *sb, int type) +{ + struct ocfs2_local_disk_dqinfo *ldinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + int status; + struct buffer_head *bh = NULL; + struct ocfs2_quota_recovery *rec; + int locked = 0; + + /* We don't need the lock and we have to acquire quota file locks + * which will later depend on this lock */ + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + info->dqi_max_spc_limit = 0x7fffffffffffffffLL; + info->dqi_max_ino_limit = 0x7fffffffffffffffLL; + oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); + if (!oinfo) { + mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" + " info."); + goto out_err; + } + info->dqi_priv = oinfo; + oinfo->dqi_type = type; + INIT_LIST_HEAD(&oinfo->dqi_chunk); + oinfo->dqi_rec = NULL; + oinfo->dqi_lqi_bh = NULL; + oinfo->dqi_libh = NULL; + + status = ocfs2_global_read_info(sb, type); + if (status < 0) + goto out_err; + + status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + locked = 1; + + /* Now read local header */ + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(type=%d)\n", type); + goto out_err; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); + oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); + oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); + oinfo->dqi_libh = bh; + + /* We crashed when using local quota file? */ + if (!(oinfo->dqi_flags & OLQF_CLEAN)) { + rec = OCFS2_SB(sb)->quota_rec; + if (!rec) { + rec = ocfs2_alloc_quota_recovery(); + if (!rec) { + status = -ENOMEM; + mlog_errno(status); + goto out_err; + } + OCFS2_SB(sb)->quota_rec = rec; + } + + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + } + + status = ocfs2_load_local_quota_bitmaps(lqinode, + ldinfo, + &oinfo->dqi_chunk); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + /* Now mark quota file as used */ + oinfo->dqi_flags &= ~OLQF_CLEAN; + status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + return 0; +out_err: + if (oinfo) { + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + brelse(oinfo->dqi_lqi_bh); + if (locked) + ocfs2_inode_unlock(lqinode, 1); + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + kfree(oinfo); + } + brelse(bh); + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + return -1; +} + +/* Write local info to quota file */ +static int ocfs2_local_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) + ->dqi_libh; + int status; + + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + return -1; + } + + return 0; +} + +/* Release info from memory */ +static int ocfs2_local_free_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int mark_clean = 1, len; + int status; + + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + (chunk->qc_headerbh->b_data); + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + /* Not all entries free? Bug! */ + if (le32_to_cpu(dchunk->dqc_free) != len) { + mlog(ML_ERROR, "releasing quota file with used " + "entries (type=%d)\n", type); + mark_clean = 0; + } + } + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + + /* dqonoff_mutex protects us against racing with recovery thread... */ + if (oinfo->dqi_rec) { + ocfs2_free_quota_recovery(oinfo->dqi_rec); + mark_clean = 0; + } + + if (!mark_clean) + goto out; + + /* Mark local file as clean */ + oinfo->dqi_flags |= OLQF_CLEAN; + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], + oinfo->dqi_libh, + olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + goto out; + } + +out: + ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); + brelse(oinfo->dqi_libh); + brelse(oinfo->dqi_lqi_bh); + kfree(oinfo); + return 0; +} + +static void olq_set_dquot(struct buffer_head *bh, void *private) +{ + struct ocfs2_dquot *od = private; + struct ocfs2_local_disk_dqblk *dqblk; + struct super_block *sb = od->dq_dquot.dq_sb; + + dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data + + ol_dqblk_block_offset(sb, od->dq_local_off)); + + dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns, + od->dq_dquot.dq_id)); + spin_lock(&dq_data_lock); + dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - + od->dq_origspace); + dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - + od->dq_originodes); + spin_unlock(&dq_data_lock); + trace_olq_set_dquot( + (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), + (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), + from_kqid(&init_user_ns, od->dq_dquot.dq_id)); +} + +/* Write dquot to local quota file */ +int ocfs2_local_write_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct buffer_head *bh; + struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_id.type]; + int status; + + status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk, + &bh); + if (status) { + mlog_errno(status); + goto out; + } + status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + brelse(bh); + return status; +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int found = 0, len; + + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + chunk->qc_headerbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) > 0) { + found = 1; + break; + } + } + if (!found) + return NULL; + + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + + found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0); + /* We failed? */ + if (found == len) { + mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" + " entries free (type=%d)\n", chunk->qc_num, + le32_to_cpu(dchunk->dqc_free), type); + return ERR_PTR(-EIO); + } + *offset = found; + return chunk; +} + +/* Add new chunk to the local quota file */ +static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk = NULL; + struct ocfs2_local_disk_chunk *dchunk; + int status; + handle_t *handle; + struct buffer_head *bh = NULL, *dbh = NULL; + u64 p_blkno; + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, NULL, + i_size_read(lqinode) + 2 * sb->s_blocksize, + i_size_read(lqinode)); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + i_size_read(lqinode) + 2 * sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + + chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!chunk) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + /* Local quota info and two new blocks we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + /* Initialize chunk header */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb)); + memset(dchunk->dqc_bitmap, 0, + sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + /* Initialize new block with structures */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + dbh = sb_getblk(sb, p_blkno); + if (!dbh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(dbh); + memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(dbh); + ocfs2_journal_dirty(handle, dbh); + + /* Update local quotafile info */ + oinfo->dqi_blocks += 2; + oinfo->dqi_chunks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + + list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk); + chunk->qc_num = list_entry(chunk->qc_chunk.prev, + struct ocfs2_quota_chunk, + qc_chunk)->qc_num + 1; + chunk->qc_headerbh = bh; + *offset = 0; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + brelse(bh); + brelse(dbh); + kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); + return ERR_PTR(status); +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_local_disk_chunk *dchunk; + int epb = ol_quota_entries_per_block(sb); + unsigned int chunk_blocks; + struct buffer_head *bh; + u64 p_blkno; + int status; + handle_t *handle; + + if (list_empty(&oinfo->dqi_chunk)) + return ocfs2_local_quota_add_chunk(sb, type, offset); + /* Is the last chunk full? */ + chunk = list_entry(oinfo->dqi_chunk.prev, + struct ocfs2_quota_chunk, qc_chunk); + chunk_blocks = oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1; + if (ol_chunk_blocks(sb) == chunk_blocks) + return ocfs2_local_quota_add_chunk(sb, type, offset); + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, NULL, + i_size_read(lqinode) + sb->s_blocksize, + i_size_read(lqinode)); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + i_size_read(lqinode) + sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + + /* Get buffer from the just added block */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); + + /* Local quota info, chunk header and the new block we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + /* Zero created block */ + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + /* Update chunk header */ + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + chunk->qc_headerbh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data; + lock_buffer(chunk->qc_headerbh); + le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); + unlock_buffer(chunk->qc_headerbh); + ocfs2_journal_dirty(handle, chunk->qc_headerbh); + + /* Update file header */ + oinfo->dqi_blocks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + *offset = chunk_blocks * epb; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + return ERR_PTR(status); +} + +static void olq_alloc_dquot(struct buffer_head *bh, void *private) +{ + int *offset = private; + struct ocfs2_local_disk_chunk *dchunk; + + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, -1); +} + +/* Create dquot in the local file for given id */ +int ocfs2_create_local_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_id.type; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + int offset; + int status; + u64 pcount; + + down_write(&OCFS2_I(lqinode)->ip_alloc_sem); + chunk = ocfs2_find_free_entry(sb, type, &offset); + if (!chunk) { + chunk = ocfs2_extend_local_quota_file(sb, type, &offset); + if (IS_ERR(chunk)) { + status = PTR_ERR(chunk); + goto out; + } + } else if (IS_ERR(chunk)) { + status = PTR_ERR(chunk); + goto out; + } + od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); + od->dq_chunk = chunk; + status = ocfs2_extent_map_get_blocks(lqinode, + ol_dqblk_block(sb, chunk->qc_num, offset), + &od->dq_local_phys_blk, + &pcount, + NULL); + + /* Initialize dquot structure on disk */ + status = ocfs2_local_write_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out; + } + + /* Mark structure as allocated */ + status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot, + &offset); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + up_write(&OCFS2_I(lqinode)->ip_alloc_sem); + return status; +} + +/* + * Release dquot structure from local quota file. ocfs2_release_dquot() has + * already started a transaction and written all changes to global quota file + */ +int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) +{ + int status; + int type = dquot->dq_id.type; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct super_block *sb = dquot->dq_sb; + struct ocfs2_local_disk_chunk *dchunk; + int offset; + + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(sb_dqopt(sb)->files[type]), + od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num, + od->dq_local_off); + dchunk = (struct ocfs2_local_disk_chunk *) + (od->dq_chunk->qc_headerbh->b_data); + /* Mark structure as freed */ + lock_buffer(od->dq_chunk->qc_headerbh); + ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(od->dq_chunk->qc_headerbh); + ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); + +out: + return status; +} + +static const struct quota_format_ops ocfs2_format_ops = { + .check_quota_file = ocfs2_local_check_quota_file, + .read_file_info = ocfs2_local_read_info, + .write_file_info = ocfs2_global_write_info, + .free_file_info = ocfs2_local_free_info, +}; + +struct quota_format_type ocfs2_quota_format = { + .qf_fmt_id = QFMT_OCFS2, + .qf_ops = &ocfs2_format_ops, + .qf_owner = THIS_MODULE +}; diff --git a/kernel/fs/ocfs2/refcounttree.c b/kernel/fs/ocfs2/refcounttree.c new file mode 100644 index 000000000..d8c6af101 --- /dev/null +++ b/kernel/fs/ocfs2/refcounttree.c @@ -0,0 +1,4474 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.c + * + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "suballoc.h" +#include "journal.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "blockcheck.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "aops.h" +#include "xattr.h" +#include "namei.h" +#include "ocfs2_trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ocfs2_cow_context { + struct inode *inode; + u32 cow_start; + u32 cow_len; + struct ocfs2_extent_tree data_et; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + void *cow_object; + struct ocfs2_post_refcount *post_refcount; + int extra_credits; + int (*get_clusters)(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags); + int (*cow_duplicate_clusters)(handle_t *handle, + struct inode *inode, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +}; + +static inline struct ocfs2_refcount_tree * +cache_info_to_refcount(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_refcount_tree, rf_ci); +} + +static int ocfs2_validate_refcount_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)bh->b_data; + + trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + + if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { + ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + return -EINVAL; + } + + if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid " + "rf_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, + u64 rb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, rb_blkno, &tmp, + ocfs2_validate_refcount_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_blkno; +} + +static struct super_block * +ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_sb; +} + +static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_lock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_unlock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_lock(&rf->rf_io_mutex); +} + +static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_unlock(&rf->rf_io_mutex); +} + +static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = { + .co_owner = ocfs2_refcount_cache_owner, + .co_get_super = ocfs2_refcount_cache_get_super, + .co_cache_lock = ocfs2_refcount_cache_lock, + .co_cache_unlock = ocfs2_refcount_cache_unlock, + .co_io_lock = ocfs2_refcount_cache_io_lock, + .co_io_unlock = ocfs2_refcount_cache_io_unlock, +}; + +static struct ocfs2_refcount_tree * +ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno) +{ + struct rb_node *n = osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tree = NULL; + + while (n) { + tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node); + + if (blkno < tree->rf_blkno) + n = n->rb_left; + else if (blkno > tree->rf_blkno) + n = n->rb_right; + else + return tree; + } + + return NULL; +} + +/* osb_lock is already locked. */ +static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new) +{ + u64 rf_blkno = new->rf_blkno; + struct rb_node *parent = NULL; + struct rb_node **p = &osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tmp; + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_refcount_tree, + rf_node); + + if (rf_blkno < tmp->rf_blkno) + p = &(*p)->rb_left; + else if (rf_blkno > tmp->rf_blkno) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", + (unsigned long long)rf_blkno); + BUG(); + } + } + + rb_link_node(&new->rf_node, parent, p); + rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); +} + +static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree) +{ + ocfs2_metadata_cache_exit(&tree->rf_ci); + ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres); + ocfs2_lock_res_free(&tree->rf_lockres); + kfree(tree); +} + +static inline void +ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); + if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree) + osb->osb_ref_tree_lru = NULL; +} + +static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + spin_lock(&osb->osb_lock); + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + spin_unlock(&osb->osb_lock); +} + +static void ocfs2_kref_remove_refcount_tree(struct kref *kref) +{ + struct ocfs2_refcount_tree *tree = + container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); + + ocfs2_free_refcount_tree(tree); +} + +static inline void +ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree) +{ + kref_get(&tree->rf_getcnt); +} + +static inline void +ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree) +{ + kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree); +} + +static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new, + struct super_block *sb) +{ + ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); + mutex_init(&new->rf_io_mutex); + new->rf_sb = sb; + spin_lock_init(&new->rf_lock); +} + +static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new, + u64 rf_blkno, u32 generation) +{ + init_rwsem(&new->rf_sem); + ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, + rf_blkno, generation); +} + +static struct ocfs2_refcount_tree* +ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno) +{ + struct ocfs2_refcount_tree *new; + + new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); + if (!new) + return NULL; + + new->rf_blkno = rf_blkno; + kref_init(&new->rf_getcnt); + ocfs2_init_refcount_tree_ci(new, osb->sb); + + return new; +} + +static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, + struct ocfs2_refcount_tree **ret_tree) +{ + int ret = 0; + struct ocfs2_refcount_tree *tree, *new = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *ref_rb; + + spin_lock(&osb->osb_lock); + if (osb->osb_ref_tree_lru && + osb->osb_ref_tree_lru->rf_blkno == rf_blkno) + tree = osb->osb_ref_tree_lru; + else + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + spin_unlock(&osb->osb_lock); + + new = ocfs2_allocate_refcount_tree(osb, rf_blkno); + if (!new) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + /* + * We need the generation to create the refcount tree lock and since + * it isn't changed during the tree modification, we are safe here to + * read without protection. + * We also have to purge the cache after we create the lock since the + * refcount block may have the stale data. It can only be trusted when + * we hold the refcount lock. + */ + ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_metadata_cache_exit(&new->rf_ci); + kfree(new); + return ret; + } + + ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + new->rf_generation = le32_to_cpu(ref_rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new, rf_blkno, + new->rf_generation); + ocfs2_metadata_cache_purge(&new->rf_ci); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + ocfs2_insert_refcount_tree(osb, new); + + tree = new; + new = NULL; + +out: + *ret_tree = tree; + + osb->osb_ref_tree_lru = tree; + + spin_unlock(&osb->osb_lock); + + if (new) + ocfs2_free_refcount_tree(new); + + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + di = (struct ocfs2_dinode *)di_bh->b_data; + *ref_blkno = le64_to_cpu(di->i_refcount_loc); + brelse(di_bh); +out: + return ret; +} + +static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + int ret; + + ret = ocfs2_refcount_lock(tree, rw); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rw) + down_write(&tree->rf_sem); + else + down_read(&tree->rf_sem); + +out: + return ret; +} + +/* + * Lock the refcount tree pointed by ref_blkno and return the tree. + * In most case, we lock the tree and read the refcount block. + * So read it here if the caller really needs it. + * + * If the tree has been re-created by other node, it will free the + * old one and re-create it. + */ +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret, delete_tree = 0; + struct ocfs2_refcount_tree *tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + +again: + ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_refcount_tree_get(tree); + + ret = __ocfs2_lock_refcount_tree(osb, tree, rw); + if (ret) { + mlog_errno(ret); + ocfs2_refcount_tree_put(tree); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_unlock_refcount_tree(osb, tree, rw); + ocfs2_refcount_tree_put(tree); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + /* + * If the refcount block has been freed and re-created, we may need + * to recreate the refcount tree also. + * + * Here we just remove the tree from the rb-tree, and the last + * kref holder will unlock and delete this refcount_tree. + * Then we goto "again" and ocfs2_get_refcount_tree will create + * the new refcount tree for us. + */ + if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { + if (!tree->rf_removed) { + ocfs2_erase_refcount_tree_from_list(osb, tree); + tree->rf_removed = 1; + delete_tree = 1; + } + + ocfs2_unlock_refcount_tree(osb, tree, rw); + /* + * We get an extra reference when we create the refcount + * tree, so another put will destroy it. + */ + if (delete_tree) + ocfs2_refcount_tree_put(tree); + brelse(ref_root_bh); + ref_root_bh = NULL; + goto again; + } + + *ret_tree = tree; + if (ref_bh) { + *ref_bh = ref_root_bh; + ref_root_bh = NULL; + } +out: + brelse(ref_root_bh); + return ret; +} + +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + if (rw) + up_write(&tree->rf_sem); + else + up_read(&tree->rf_sem); + + ocfs2_refcount_unlock(tree, rw); + ocfs2_refcount_tree_put(tree); +} + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) +{ + struct rb_node *node; + struct ocfs2_refcount_tree *tree; + struct rb_root *root = &osb->osb_rf_lock_tree; + + while ((node = rb_last(root)) != NULL) { + tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); + + trace_ocfs2_purge_refcount_trees( + (unsigned long long) tree->rf_blkno); + + rb_erase(&tree->rf_node, root); + ocfs2_free_refcount_tree(tree); + } +} + +/* + * Create a refcount tree for an inode. + * We take for granted that the inode is already locked. + */ +static int ocfs2_create_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + trace_ocfs2_create_refcount_tree( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); + if (!new_tree) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + if (!new_bh) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_refcount_block. */ + rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(rb, 0, inode->i_sb->s_blocksize); + strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); + rb->rf_blkno = cpu_to_le64(first_blkno); + rb->rf_count = cpu_to_le32(1); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); + spin_lock(&osb->osb_lock); + rb->rf_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + ocfs2_journal_dirty(handle, new_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(first_blkno); + spin_unlock(&oi->ip_lock); + + trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno); + + ocfs2_journal_dirty(handle, di_bh); + + /* + * We have to init the tree lock here since it will use + * the generation number to create it. + */ + new_tree->rf_generation = le32_to_cpu(rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, + new_tree->rf_generation); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, first_blkno); + + /* + * We've just created a new refcount tree in this block. If + * we found a refcount tree on the ocfs2_super, it must be + * one we just deleted. We free the old tree before + * inserting the new tree. + */ + BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); + if (tree) + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + ocfs2_insert_refcount_tree(osb, new_tree); + spin_unlock(&osb->osb_lock); + new_tree = NULL; + if (tree) + ocfs2_refcount_tree_put(tree); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (new_tree) { + ocfs2_metadata_cache_exit(&new_tree->rf_ci); + kfree(new_tree); + } + + brelse(new_bh); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_set_refcount_tree(struct inode *inode, + struct buffer_head *di_bh, + u64 refcount_loc) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *ref_tree; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + le32_add_cpu(&rb->rf_count, 1); + + ocfs2_journal_dirty(handle, ref_root_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(refcount_loc); + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + return ret; +} + +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) +{ + int ret, delete_tree = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_block *rb; + struct inode *alloc_inode = NULL; + struct buffer_head *alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; + u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); + u16 bit = 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + BUG_ON(!ref_blkno); + ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + rb = (struct ocfs2_refcount_block *)blk_bh->b_data; + + /* + * If we are the last user, we need to free the block. + * So lock the allocator ahead. + */ + if (le32_to_cpu(rb->rf_count) == 1) { + blk = le64_to_cpu(rb->rf_blkno); + bit = le16_to_cpu(rb->rf_suballoc_bit); + if (rb->rf_suballoc_loc) + bg_blkno = le64_to_cpu(rb->rf_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot)); + if (!alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + credits += OCFS2_SUBALLOC_FREE; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = 0; + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + + le32_add_cpu(&rb->rf_count , -1); + ocfs2_journal_dirty(handle, blk_bh); + + if (!rb->rf_count) { + delete_tree = 1; + ocfs2_erase_refcount_tree_from_list(osb, ref_tree); + ret = ocfs2_free_suballoc_bits(handle, alloc_inode, + alloc_bh, bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + if (alloc_inode) { + ocfs2_inode_unlock(alloc_inode, 1); + brelse(alloc_bh); + } +out_mutex: + if (alloc_inode) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + } +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + if (delete_tree) + ocfs2_refcount_tree_put(ref_tree); + brelse(blk_bh); + + return ret; +} + +static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index) +{ + int i = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = NULL; + + for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) { + rec = &rb->rf_records.rl_recs[i]; + + if (le64_to_cpu(rec->r_cpos) + + le32_to_cpu(rec->r_clusters) <= cpos) + continue; + else if (le64_to_cpu(rec->r_cpos) > cpos) + break; + + /* ok, cpos fail in this rec. Just return. */ + if (ret_rec) + *ret_rec = *rec; + goto out; + } + + if (ret_rec) { + /* We meet with a hole here, so fake the rec. */ + ret_rec->r_cpos = cpu_to_le64(cpos); + ret_rec->r_refcount = 0; + if (i < le16_to_cpu(rb->rf_records.rl_used) && + le64_to_cpu(rec->r_cpos) < cpos + len) + ret_rec->r_clusters = + cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); + else + ret_rec->r_clusters = cpu_to_le32(len); + } + +out: + *index = i; +} + +/* + * Try to remove refcount tree. The mechanism is: + * 1) Check whether i_clusters == 0, if no, exit. + * 2) check whether we have i_xattr_loc in dinode. if yes, exit. + * 3) Check whether we have inline xattr stored outside, if yes, exit. + * 4) Remove the tree. + */ +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&oi->ip_xattr_sem); + down_write(&oi->ip_alloc_sem); + + if (oi->ip_clusters) + goto out; + + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) + goto out; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && + ocfs2_has_inline_xattr_value_outside(inode, di)) + goto out; + + ret = ocfs2_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); +out: + up_write(&oi->ip_alloc_sem); + up_write(&oi->ip_xattr_sem); + return 0; +} + +/* + * Find the end range for a leaf refcount block indicated by + * el->l_recs[index].e_blkno. + */ +static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct ocfs2_extent_block *eb, + struct ocfs2_extent_list *el, + int index, u32 *cpos_end) +{ + int ret, i, subtree_root; + u32 cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_list *tmp_el; + + if (index < le16_to_cpu(el->l_next_free_rec) - 1) { + /* + * We have a extent rec after index, so just use the e_cpos + * of the next extent rec. + */ + *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); + return 0; + } + + if (!eb || (eb && !eb->h_next_leaf_blk)) { + /* + * We are the last extent rec, so any high cpos should + * be stored in this leaf refcount block. + */ + *cpos_end = UINT_MAX; + return 0; + } + + /* + * If the extent block isn't the last one, we have to find + * the subtree root between this extent block and the next + * leaf extent block and get the corresponding e_cpos from + * the subroot. Otherwise we may corrupt the b-tree. + */ + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + left_path = ocfs2_new_path_from_et(&et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); + ret = ocfs2_find_path(ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(&et, left_path, + right_path); + + tmp_el = left_path->p_node[subtree_root].el; + blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; + for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) { + if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { + *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); + break; + } + } + + BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec)); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + return ret; +} + +/* + * Given a cpos and len, try to find the refcount record which contains cpos. + * 1. If cpos can be found in one refcount record, return the record. + * 2. If cpos can't be found, return a fake record which start from cpos + * and end at a small value between cpos+len and start of the next record. + * This fake record has r_refcount = 0. + */ +static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index, + struct buffer_head **ret_bh) +{ + int ret = 0, i, found; + u32 low_cpos, uninitialized_var(cpos_end); + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_block *eb = NULL; + struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { + ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_root_bh; + get_bh(ref_root_bh); + return 0; + } + + el = &rb->rf_list; + low_cpos = cpos & OCFS2_32BIT_POS_MASK; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(sb, + "refcount tree %llu has non zero tree " + "depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= low_cpos) { + found = 1; + break; + } + } + + if (found) { + ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, + eb, el, i, &cpos_end); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos_end < low_cpos + len) + len = cpos_end - low_cpos; + } + + ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_leaf_bh; +out: + brelse(eb_bh); + return ret; +} + +enum ocfs2_ref_rec_contig { + REF_CONTIG_NONE = 0, + REF_CONTIG_LEFT, + REF_CONTIG_RIGHT, + REF_CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb, + int index) +{ + if ((rb->rf_records.rl_recs[index].r_refcount == + rb->rf_records.rl_recs[index + 1].r_refcount) && + (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + + le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == + le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) + return REF_CONTIG_RIGHT; + + return REF_CONTIG_NONE; +} + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) + ret = ocfs2_refcount_rec_adjacent(rb, index); + + if (index > 0) { + enum ocfs2_ref_rec_contig tmp; + + tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); + + if (tmp == REF_CONTIG_RIGHT) { + if (ret == REF_CONTIG_RIGHT) + ret = REF_CONTIG_LEFTRIGHT; + else + ret = REF_CONTIG_LEFT; + } + } + + return ret; +} + +static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, + int index) +{ + BUG_ON(rb->rf_records.rl_recs[index].r_refcount != + rb->rf_records.rl_recs[index+1].r_refcount); + + le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, + le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) + memmove(&rb->rf_records.rl_recs[index + 1], + &rb->rf_records.rl_recs[index + 2], + sizeof(struct ocfs2_refcount_rec) * + (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); + + memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + le16_add_cpu(&rb->rf_records.rl_used, -1); +} + +/* + * Merge the refcount rec if we are contiguous with the adjacent recs. + */ +static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig contig = + ocfs2_refcount_rec_contig(rb, index); + + if (contig == REF_CONTIG_NONE) + return; + + if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { + BUG_ON(index == 0); + index--; + } + + ocfs2_rotate_refcount_rec_left(rb, index); + + if (contig == REF_CONTIG_LEFTRIGHT) + ocfs2_rotate_refcount_rec_left(rb, index); +} + +/* + * Change the refcount indexed by "index" in ref_bh. + * If refcount reaches 0, remove it. + */ +static int ocfs2_change_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + int index, int merge, int change) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_change_refcount_rec( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + index, le32_to_cpu(rec->r_refcount), change); + le32_add_cpu(&rec->r_refcount, change); + + if (!rec->r_refcount) { + if (index != le16_to_cpu(rl->rl_used) - 1) { + memmove(rec, rec + 1, + (le16_to_cpu(rl->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + } + + le16_add_cpu(&rl->rl_used, -1); + } else if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ocfs2_journal_dirty(handle, ref_leaf_bh); +out: + return ret; +} + +static int ocfs2_expand_inline_ref_root(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head **ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Initialize ocfs2_refcount_block. + * It should contain the same information as the old root. + * so just memcpy it and change the corresponding field. + */ + memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); + + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_cpos = cpu_to_le32(0); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + ocfs2_journal_dirty(handle, new_bh); + + /* Now change the root. */ + memset(&root_rb->rf_list, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list)); + root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); + root_rb->rf_clusters = cpu_to_le32(1); + root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); + root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); + + ocfs2_journal_dirty(handle, ref_root_bh); + + trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno, + le16_to_cpu(new_rb->rf_records.rl_used)); + + *ref_leaf_bh = new_bh; + new_bh = NULL; +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, + struct ocfs2_refcount_rec *next) +{ + if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= + ocfs2_get_ref_rec_low_cpos(next)) + return 1; + + return 0; +} + +static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); + u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static int cmp_refcount_rec_by_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u64 l_cpos = le64_to_cpu(l->r_cpos); + u64 r_cpos = le64_to_cpu(r->r_cpos); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static void swap_refcount_rec(void *a, void *b, int size) +{ + struct ocfs2_refcount_rec *l = a, *r = b, tmp; + + tmp = *l; + *l = *r; + *r = tmp; +} + +/* + * The refcount cpos are ordered by their 64bit cpos, + * But we will use the low 32 bit to be the e_cpos in the b-tree. + * So we need to make sure that this pos isn't intersected with others. + * + * Note: The refcount block is already sorted by their low 32 bit cpos, + * So just try the middle pos first, and we will exit when we find + * the good position. + */ +static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, + u32 *split_pos, int *split_index) +{ + int num_used = le16_to_cpu(rl->rl_used); + int delta, middle = num_used / 2; + + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle - delta - 1], + &rl->rl_recs[middle - delta])) { + *split_index = middle - delta; + break; + } + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == num_used) + continue; + + /* Now try delta past middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle + delta], + &rl->rl_recs[middle + delta + 1])) { + *split_index = middle + delta + 1; + break; + } + } + + if (delta >= middle) + return -ENOSPC; + + *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); + return 0; +} + +static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, + struct buffer_head *new_bh, + u32 *split_cpos) +{ + int split_index = 0, num_moved, ret; + u32 cpos = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_block *new_rb = + (struct ocfs2_refcount_block *)new_bh->b_data; + struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; + + trace_ocfs2_divide_leaf_refcount_block( + (unsigned long long)ref_leaf_bh->b_blocknr, + le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); + + /* + * XXX: Improvement later. + * If we know all the high 32 bit cpos is the same, no need to sort. + * + * In order to make the whole process safe, we do: + * 1. sort the entries by their low 32 bit cpos first so that we can + * find the split cpos easily. + * 2. call ocfs2_insert_extent to insert the new refcount block. + * 3. move the refcount rec to the new block. + * 4. sort the entries by their 64 bit cpos. + * 5. dirty the new_rb and rb. + */ + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_low_cpos, swap_refcount_rec); + + ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); + if (ret) { + mlog_errno(ret); + return ret; + } + + new_rb->rf_cpos = cpu_to_le32(cpos); + + /* move refcount records starting from split_index to the new block. */ + num_moved = le16_to_cpu(rl->rl_used) - split_index; + memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /*ok, remove the entries we just moved over to the other block. */ + memset(&rl->rl_recs[split_index], 0, + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /* change old and new rl_used accordingly. */ + le16_add_cpu(&rl->rl_used, -num_moved); + new_rl->rl_used = cpu_to_le16(num_moved); + + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + *split_cpos = cpos; + return 0; +} + +static int ocfs2_new_leaf_refcount_block(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got, new_cpos; + u64 suballoc_loc, blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_extent_tree ref_et; + + BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Initialize ocfs2_refcount_block. */ + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(new_rb, 0, sb->s_blocksize); + strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + new_rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + new_rb->rf_generation = root_rb->rf_generation; + + ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, ref_leaf_bh); + ocfs2_journal_dirty(handle, new_bh); + + ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); + + trace_ocfs2_new_leaf_refcount_block( + (unsigned long long)new_bh->b_blocknr, new_cpos); + + /* Insert the new leaf block with the specific offset cpos. */ + ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, + 1, 0, meta_ac); + if (ret) + mlog_errno(ret); + +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_expand_refcount_tree(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *expand_bh = NULL; + + if (ref_root_bh == ref_leaf_bh) { + /* + * the old root bh hasn't been expanded to a b-tree, + * so expand it first. + */ + ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, + &expand_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + expand_bh = ref_leaf_bh; + get_bh(expand_bh); + } + + + /* Now add a new refcount block into the tree.*/ + ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, + expand_bh, meta_ac); + if (ret) + mlog_errno(ret); +out: + brelse(expand_bh); + return ret; +} + +/* + * Adjust the extent rec in b-tree representing ref_leaf_bh. + * + * Only called when we have inserted a new refcount rec at index 0 + * which means ocfs2_extent_rec.e_cpos may need some change. + */ +static int ocfs2_adjust_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec) +{ + int ret = 0, i; + u32 new_cpos, old_cpos; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct ocfs2_extent_list *el; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + old_cpos = le32_to_cpu(rb->rf_cpos); + new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; + if (old_cpos <= new_cpos) + goto out; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, path, old_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * 2 more credits, one for the leaf refcount block, one for + * the extent block contains the extent rec. + */ + ret = ocfs2_extend_trans(handle, 2); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* change the leaf extent block first. */ + el = path_leaf_el(path); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) + if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) + break; + + BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); + + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + + /* change the r_cpos in the leaf block. */ + rb->rf_cpos = cpu_to_le32(new_cpos); + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + ocfs2_journal_dirty(handle, ref_leaf_bh); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_insert_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + if (rf_list->rl_used == rf_list->rl_count) { + u64 cpos = le64_to_cpu(rec->r_cpos); + u32 len = le32_to_cpu(rec->r_clusters); + + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, NULL, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (index < le16_to_cpu(rf_list->rl_used)) + memmove(&rf_list->rl_recs[index + 1], + &rf_list->rl_recs[index], + (le16_to_cpu(rf_list->rl_used) - index) * + sizeof(struct ocfs2_refcount_rec)); + + trace_ocfs2_insert_refcount_rec( + (unsigned long long)ref_leaf_bh->b_blocknr, index, + (unsigned long long)le64_to_cpu(rec->r_cpos), + le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount)); + + rf_list->rl_recs[index] = *rec; + + le16_add_cpu(&rf_list->rl_used, 1); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ocfs2_journal_dirty(handle, ref_leaf_bh); + + if (index == 0) { + ret = ocfs2_adjust_refcount_rec(handle, ci, + ref_root_bh, + ref_leaf_bh, rec); + if (ret) + mlog_errno(ret); + } +out: + brelse(new_bh); + return ret; +} + +/* + * Split the refcount_rec indexed by "index" in ref_leaf_bh. + * This is much simple than our b-tree code. + * split_rec is the new refcount rec we want to insert. + * If split_rec->r_refcount > 0, we are changing the refcount(in case we + * increase refcount or decrease a refcount to non-zero). + * If split_rec->r_refcount == 0, we are punching a hole in current refcount + * rec( in case we decrease a refcount to zero). + */ +static int ocfs2_split_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *split_rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, recs_need; + u32 len; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; + struct ocfs2_refcount_rec *tail_rec = NULL; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos), + le32_to_cpu(orig_rec->r_clusters), + le32_to_cpu(orig_rec->r_refcount), + le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount)); + + /* + * If we just need to split the header or tail clusters, + * no more recs are needed, just split is OK. + * Otherwise we at least need one new recs. + */ + if (!split_rec->r_refcount && + (split_rec->r_cpos == orig_rec->r_cpos || + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) == + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need = 0; + else + recs_need = 1; + + /* + * We need one more rec if we split in the middle and the new rec have + * some refcount in it. + */ + if (split_rec->r_refcount && + (split_rec->r_cpos != orig_rec->r_cpos && + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) != + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need++; + + /* If the leaf block don't have enough record, expand it. */ + if (le16_to_cpu(rf_list->rl_used) + recs_need > + le16_to_cpu(rf_list->rl_count)) { + struct ocfs2_refcount_rec tmp_rec; + u64 cpos = le64_to_cpu(orig_rec->r_cpos); + len = le32_to_cpu(orig_rec->r_clusters); + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have to re-get it since now cpos may be moved to + * another leaf block. + */ + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &tmp_rec, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + orig_rec = &rf_list->rl_recs[index]; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have calculated out how many new records we need and store + * in recs_need, so spare enough space first by moving the records + * after "index" to the end. + */ + if (index != le16_to_cpu(rf_list->rl_used) - 1) + memmove(&rf_list->rl_recs[index + 1 + recs_need], + &rf_list->rl_recs[index + 1], + (le16_to_cpu(rf_list->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + + len = (le64_to_cpu(orig_rec->r_cpos) + + le32_to_cpu(orig_rec->r_clusters)) - + (le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters)); + + /* + * If we have "len", the we will split in the tail and move it + * to the end of the space we have just spared. + */ + if (len) { + tail_rec = &rf_list->rl_recs[index + recs_need]; + + memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); + le64_add_cpu(&tail_rec->r_cpos, + le32_to_cpu(tail_rec->r_clusters) - len); + tail_rec->r_clusters = cpu_to_le32(len); + } + + /* + * If the split pos isn't the same as the original one, we need to + * split in the head. + * + * Note: We have the chance that split_rec.r_refcount = 0, + * recs_need = 0 and len > 0, which means we just cut the head from + * the orig_rec and in that case we have done some modification in + * orig_rec above, so the check for r_cpos is faked. + */ + if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { + len = le64_to_cpu(split_rec->r_cpos) - + le64_to_cpu(orig_rec->r_cpos); + orig_rec->r_clusters = cpu_to_le32(len); + index++; + } + + le16_add_cpu(&rf_list->rl_used, recs_need); + + if (split_rec->r_refcount) { + rf_list->rl_recs[index] = *split_rec; + trace_ocfs2_split_refcount_rec_insert( + (unsigned long long)ref_leaf_bh->b_blocknr, index, + (unsigned long long)le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount)); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + } + + ocfs2_journal_dirty(handle, ref_leaf_bh); + +out: + brelse(new_bh); + return ret; +} + +static int __ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0, index; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_refcount_rec rec; + unsigned int set_len = 0; + + trace_ocfs2_increase_refcount_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + set_len = le32_to_cpu(rec.r_clusters); + + /* + * Here we may meet with 3 situations: + * + * 1. If we find an already existing record, and the length + * is the same, cool, we just need to increase the r_refcount + * and it is OK. + * 2. If we find a hole, just insert it with r_refcount = 1. + * 3. If we are in the middle of one extent record, split + * it. + */ + if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && + set_len <= len) { + trace_ocfs2_increase_refcount_change( + (unsigned long long)cpos, set_len, + le32_to_cpu(rec.r_refcount)); + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, + merge, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (!rec.r_refcount) { + rec.r_refcount = cpu_to_le32(1); + + trace_ocfs2_increase_refcount_insert( + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len); + ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, + &rec, index, + merge, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + set_len = min((u64)(cpos + len), + le64_to_cpu(rec.r_cpos) + set_len) - cpos; + rec.r_cpos = cpu_to_le64(cpos); + rec.r_clusters = cpu_to_le32(set_len); + le32_add_cpu(&rec.r_refcount, 1); + + trace_ocfs2_increase_refcount_split( + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len, le32_to_cpu(rec.r_refcount)); + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &rec, index, merge, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += set_len; + len -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +static int ocfs2_remove_refcount_extent(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_extent_tree et; + + BUG_ON(rb->rf_records.rl_used); + + trace_ocfs2_remove_refcount_extent( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)ref_leaf_bh->b_blocknr, + le32_to_cpu(rb->rf_cpos)); + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), + 1, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(ci, ref_leaf_bh); + + /* + * add the freed block to the dealloc so that it will be freed + * when we run dealloc. + */ + ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot), + le64_to_cpu(rb->rf_suballoc_loc), + le64_to_cpu(rb->rf_blkno), + le16_to_cpu(rb->rf_suballoc_bit)); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + le32_add_cpu(&rb->rf_clusters, -1); + + /* + * check whether we need to restore the root refcount block if + * there is no leaf extent block at atll. + */ + if (!rb->rf_list.l_next_free_rec) { + BUG_ON(rb->rf_clusters); + + trace_ocfs2_restore_refcount_block( + (unsigned long long)ref_root_bh->b_blocknr); + + rb->rf_flags = 0; + rb->rf_parent = 0; + rb->rf_cpos = 0; + memset(&rb->rf_records, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records)); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + } + + ocfs2_journal_dirty(handle, ref_root_bh); + +out: + return ret; +} + +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + return __ocfs2_increase_refcount(handle, ci, ref_root_bh, + cpos, len, 1, + meta_ac, dealloc); +} + +static int ocfs2_decrease_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + int index, u64 cpos, unsigned int len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index]; + + BUG_ON(cpos < le64_to_cpu(rec->r_cpos)); + BUG_ON(cpos + len > + le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); + + trace_ocfs2_decrease_refcount_rec( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + + if (cpos == le64_to_cpu(rec->r_cpos) && + len == le32_to_cpu(rec->r_clusters)) + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, 1, -1); + else { + struct ocfs2_refcount_rec split = *rec; + split.r_cpos = cpu_to_le64(cpos); + split.r_clusters = cpu_to_le32(len); + + le32_add_cpu(&split.r_refcount, -1); + + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &split, index, 1, + meta_ac, dealloc); + } + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Remove the leaf refcount block if it contains no refcount record. */ + if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) { + ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac, + dealloc); + if (ret) + mlog_errno(ret); + } + +out: + return ret; +} + +static int __ocfs2_decrease_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret = 0, index = 0; + struct ocfs2_refcount_rec rec; + unsigned int r_count = 0, r_len; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *ref_leaf_bh = NULL; + + trace_ocfs2_decrease_refcount( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len, delete); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + r_count = le32_to_cpu(rec.r_refcount); + BUG_ON(r_count == 0); + if (!delete) + BUG_ON(r_count > 1); + + r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + + ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, index, + cpos, r_len, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le32_to_cpu(rec.r_refcount) == 1 && delete) { + ret = ocfs2_cache_cluster_dealloc(dealloc, + ocfs2_clusters_to_blocks(sb, cpos), + r_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += r_len; + len -= r_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* Caller must hold refcount tree lock. */ +int ocfs2_decrease_refcount(struct inode *inode, + handle_t *handle, u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret; + u64 ref_blkno; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_block(inode, &ref_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, + cpos, len, meta_ac, dealloc, delete); + if (ret) + mlog_errno(ret); +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Mark the already-existing extent at cpos as refcounted for len clusters. + * This adds the refcount extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +static int ocfs2_mark_extent_refcounted(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, + u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno, + cpos, len, phys); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + OCFS2_EXT_REFCOUNTED, 0); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given some contiguous physical clusters, calculate what we need + * for modifying their refcount. + */ +static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 start_cpos, + u32 clusters, + int *meta_add, + int *credits) +{ + int ret = 0, index, ref_blocks = 0, recs_add = 0; + u64 cpos = start_cpos; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; + u32 len; + + while (clusters) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, clusters, &rec, + &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ref_leaf_bh != prev_bh) { + /* + * Now we encounter a new leaf block, so calculate + * whether we need to extend the old leaf. + */ + if (prev_bh) { + rb = (struct ocfs2_refcount_block *) + prev_bh->b_data; + + if (le16_to_cpu(rb->rf_records.rl_used) + + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + } + + recs_add = 0; + *credits += 1; + brelse(prev_bh); + prev_bh = ref_leaf_bh; + get_bh(prev_bh); + } + + trace_ocfs2_calc_refcount_meta_credits_iterate( + recs_add, (unsigned long long)cpos, clusters, + (unsigned long long)le64_to_cpu(rec.r_cpos), + le32_to_cpu(rec.r_clusters), + le32_to_cpu(rec.r_refcount), index); + + len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + /* + * We record all the records which will be inserted to the + * same refcount block, so that we can tell exactly whether + * we need a new refcount block or not. + * + * If we will insert a new one, this is easy and only happens + * during adding refcounted flag to the extent, so we don't + * have a chance of spliting. We just need one record. + * + * If the refcount rec already exists, that would be a little + * complicated. we may have to: + * 1) split at the beginning if the start pos isn't aligned. + * we need 1 more record in this case. + * 2) split int the end if the end pos isn't aligned. + * we need 1 more record in this case. + * 3) split in the middle because of file system fragmentation. + * we need 2 more records in this case(we can't detect this + * beforehand, so always think of the worst case). + */ + if (rec.r_refcount) { + recs_add += 2; + /* Check whether we need a split at the beginning. */ + if (cpos == start_cpos && + cpos != le64_to_cpu(rec.r_cpos)) + recs_add++; + + /* Check whether we need a split in the end. */ + if (cpos + clusters < le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) + recs_add++; + } else + recs_add++; + + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + clusters -= len; + cpos += len; + } + + if (prev_bh) { + rb = (struct ocfs2_refcount_block *)prev_bh->b_data; + + if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + } + + if (!ref_blocks) + goto out; + + *meta_add += ref_blocks; + *credits += ref_blocks; + + /* + * So we may need ref_blocks to insert into the tree. + * That also means we need to change the b-tree and add that number + * of records since we never merge them. + * We need one more block for expansion since the new created leaf + * block is also full and needs split. + */ + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + *meta_add += ocfs2_extend_meta_needed(et.et_root_el); + *credits += ocfs2_calc_extend_credits(sb, + et.et_root_el); + } else { + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + *meta_add += 1; + } + +out: + + trace_ocfs2_calc_refcount_meta_credits( + (unsigned long long)start_cpos, clusters, + *meta_add, *credits); + brelse(ref_leaf_bh); + brelse(prev_bh); + return ret; +} + +/* + * For refcount tree, we will decrease some contiguous clusters + * refcount count, so just go through it to see how many blocks + * we gonna touch and whether we need to create new blocks. + * + * Normally the refcount blocks store these refcount should be + * contiguous also, so that we can get the number easily. + * We will at most add split 2 refcount records and 2 more + * refcount blocks, so just check it in a rough way. + * + * Caller must hold refcount tree lock. + */ +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + u64 refcount_loc, + u64 phys_blkno, + u32 clusters, + int *credits, + int *ref_blocks) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), + refcount_loc, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + &tree->rf_ci, + ref_root_bh, + start_cpos, clusters, + ref_blocks, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits); + +out: + brelse(ref_root_bh); + return ret; +} + +#define MAX_CONTIG_BYTES 1048576 + +static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) +{ + return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); +} + +static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) +{ + return ~(ocfs2_cow_contig_clusters(sb) - 1); +} + +/* + * Given an extent that starts at 'start' and an I/O that starts at 'cpos', + * find an offset (start + (n * contig_clusters)) that is closest to cpos + * while still being less than or equal to it. + * + * The goal is to break the extent at a multiple of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, + unsigned int start, + unsigned int cpos) +{ + BUG_ON(start > cpos); + + return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); +} + +/* + * Given a cluster count of len, pad it out so that it is a multiple + * of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, + unsigned int len) +{ + unsigned int padded = + (len + (ocfs2_cow_contig_clusters(sb) - 1)) & + ocfs2_cow_contig_mask(sb); + + /* Did we wrap? */ + if (padded < len) + padded = UINT_MAX; + + return padded; +} + +/* + * Calculate out the start and number of virtual clusters we need to to CoW. + * + * cpos is vitual start cluster position we want to do CoW in a + * file and write_len is the cluster length. + * max_cpos is the place where we want to stop CoW intentionally. + * + * Normal we will start CoW from the beginning of extent record cotaining cpos. + * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we + * get good I/O from the resulting extent tree. + */ +static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + u32 cpos, + u32 write_len, + u32 max_cpos, + u32 *cow_start, + u32 *cow_len) +{ + int ret = 0; + int tree_height = le16_to_cpu(el->l_tree_depth), i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb = NULL; + struct ocfs2_extent_rec *rec; + unsigned int want_clusters, rec_end = 0; + int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); + int leaf_clusters; + + BUG_ON(cpos + write_len > max_cpos); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + *cow_len = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (ocfs2_is_empty_extent(rec)) { + mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + "index %d\n", inode->i_ino, i); + continue; + } + + if (le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters) <= cpos) + continue; + + if (*cow_len == 0) { + /* + * We should find a refcounted record in the + * first pass. + */ + BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); + *cow_start = le32_to_cpu(rec->e_cpos); + } + + /* + * If we encounter a hole, a non-refcounted record or + * pass the max_cpos, stop the search. + */ + if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || + (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || + (max_cpos <= le32_to_cpu(rec->e_cpos))) + break; + + leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); + rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; + if (rec_end > max_cpos) { + rec_end = max_cpos; + leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); + } + + /* + * How many clusters do we actually need from + * this extent? First we see how many we actually + * need to complete the write. If that's smaller + * than contig_clusters, we try for contig_clusters. + */ + if (!*cow_len) + want_clusters = write_len; + else + want_clusters = (cpos + write_len) - + (*cow_start + *cow_len); + if (want_clusters < contig_clusters) + want_clusters = contig_clusters; + + /* + * If the write does not cover the whole extent, we + * need to calculate how we're going to split the extent. + * We try to do it on contig_clusters boundaries. + * + * Any extent smaller than contig_clusters will be + * CoWed in its entirety. + */ + if (leaf_clusters <= contig_clusters) + *cow_len += leaf_clusters; + else if (*cow_len || (*cow_start == cpos)) { + /* + * This extent needs to be CoW'd from its + * beginning, so all we have to do is compute + * how many clusters to grab. We align + * want_clusters to the edge of contig_clusters + * to get better I/O. + */ + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + + if (leaf_clusters < want_clusters) + *cow_len += leaf_clusters; + else + *cow_len += want_clusters; + } else if ((*cow_start + contig_clusters) >= + (cpos + write_len)) { + /* + * Breaking off contig_clusters at the front + * of the extent will cover our write. That's + * easy. + */ + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= contig_clusters) { + /* + * Breaking off contig_clusters at the tail of + * this extent will cover cpos. + */ + *cow_start = rec_end - contig_clusters; + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= want_clusters) { + /* + * While we can't fit the entire write in this + * extent, we know that the write goes from cpos + * to the end of the extent. Break that off. + * We try to break it at some multiple of + * contig_clusters from the front of the extent. + * Failing that (ie, cpos is within + * contig_clusters of the front), we'll CoW the + * entire extent. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + *cow_len = rec_end - *cow_start; + } else { + /* + * Ok, the entire write lives in the middle of + * this extent. Let's try to slice the extent up + * nicely. Optimally, our CoW region starts at + * m*contig_clusters from the beginning of the + * extent and goes for n*contig_clusters, + * covering the entire write. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + + want_clusters = (cpos + write_len) - *cow_start; + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + if (*cow_start + want_clusters <= rec_end) + *cow_len = want_clusters; + else + *cow_len = rec_end - *cow_start; + } + + /* Have we covered our entire write yet? */ + if ((*cow_start + *cow_len) >= (cpos + write_len)) + break; + + /* + * If we reach the end of the extent block and don't get enough + * clusters, continue with the next extent block if possible. + */ + if (i + 1 == le16_to_cpu(el->l_next_free_rec) && + eb && eb->h_next_leaf_blk) { + brelse(eb_bh); + eb_bh = NULL; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), + le64_to_cpu(eb->h_next_leaf_blk), + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + i = -1; + } + } + +out: + brelse(eb_bh); + return ret; +} + +/* + * Prepare meta_ac, data_ac and calculate credits when we want to add some + * num_clusters in data_tree "et" and change the refcount for the old + * clusters(starting form p_cluster) in the refcount tree. + * + * Note: + * 1. since we may split the old tree, so we at most will need num_clusters + 2 + * more new leaf records. + * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so + * just give data_ac = NULL. + */ +static int ocfs2_lock_refcount_allocators(struct super_block *sb, + u32 p_cluster, u32 num_clusters, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int *credits) +{ + int ret = 0, meta_add = 0; + int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < num_clusters + 2) + meta_add = + ocfs2_extend_meta_needed(et->et_root_el); + + *credits += ocfs2_calc_extend_credits(sb, et->et_root_el); + + ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, + p_cluster, num_clusters, + &meta_add, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_lock_refcount_allocators(meta_add, *credits); + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, + data_ac); + if (ret) + mlog_errno(ret); + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUG_ON(buffer_dirty(bh)); + + clear_buffer_mapped(bh); + + return 0; +} + +int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct inode *inode, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0, partial; + struct super_block *sb = inode->i_sb; + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct page *page; + pgoff_t page_index; + unsigned int from, to, readahead_pages; + loff_t offset, end, map_end; + struct address_space *mapping = inode->i_mapping; + + trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, + new_cluster, new_len); + + readahead_pages = + (ocfs2_cow_contig_clusters(sb) << + OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + /* + * We only duplicate pages until we reach the page contains i_size - 1. + * So trim 'end' to i_size. + */ + if (end > i_size_read(inode)) + end = i_size_read(inode); + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + /* from, to is the offset within the page. */ + from = offset & (PAGE_CACHE_SIZE - 1); + to = PAGE_CACHE_SIZE; + if (map_end & (PAGE_CACHE_SIZE - 1)) + to = map_end & (PAGE_CACHE_SIZE - 1); + + page = find_or_create_page(mapping, page_index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + break; + } + + /* + * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page + * can't be dirtied before we CoW it out. + */ + if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) + BUG_ON(PageDirty(page)); + + if (!PageUptodate(page)) { + ret = block_read_full_page(page, ocfs2_get_block); + if (ret) { + mlog_errno(ret); + goto unlock; + } + lock_page(page); + } + + if (page_has_buffers(page)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_clear_cow_buffer); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + ocfs2_map_and_dirty_page(inode, + handle, from, to, + page, 0, &new_block); + mark_page_accessed(page); +unlock: + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct inode *inode, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0; + struct super_block *sb = inode->i_sb; + struct ocfs2_caching_info *ci = INODE_CACHE(inode); + int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); + u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *old_bh = NULL; + struct buffer_head *new_bh = NULL; + + trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, + new_cluster, new_len); + + for (i = 0; i < blocks; i++, old_block++, new_block++) { + new_bh = sb_getblk(osb->sb, new_block); + if (new_bh == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + break; + } + + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_journal_access(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); + ocfs2_journal_dirty(handle, new_bh); + + brelse(new_bh); + brelse(old_bh); + new_bh = NULL; + old_bh = NULL; + } + + brelse(new_bh); + brelse(old_bh); + return ret; +} + +static int ocfs2_clear_ext_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 p_cluster, u32 len, + unsigned int ext_flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + struct ocfs2_extent_rec replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 ino = ocfs2_metadata_cache_owner(et->et_ci); + + trace_ocfs2_clear_ext_refcount((unsigned long long)ino, + cpos, len, p_cluster, ext_flags); + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, + p_cluster)); + replace_rec.e_flags = ext_flags; + replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1) { + ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + ret = ocfs2_split_extent(handle, et, path, index, + &replace_rec, meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_replace_clusters(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old, + u32 new, u32 len, + unsigned int ext_flags) +{ + int ret; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + u64 ino = ocfs2_metadata_cache_owner(ci); + + trace_ocfs2_replace_clusters((unsigned long long)ino, + cpos, old, new, len, ext_flags); + + /*If the old clusters is unwritten, no need to duplicate. */ + if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = context->cow_duplicate_clusters(handle, context->inode, + cpos, old, new, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_clear_ext_refcount(handle, &context->data_et, + cpos, new, len, ext_flags, + context->meta_ac, &context->dealloc); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +int ocfs2_cow_sync_writeback(struct super_block *sb, + struct inode *inode, + u32 cpos, u32 num_clusters) +{ + int ret = 0; + loff_t offset, end, map_end; + pgoff_t page_index; + struct page *page; + + if (ocfs2_should_order_data(inode)) + return 0; + + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + + ret = filemap_fdatawrite_range(inode->i_mapping, + offset, end - 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + page = find_or_create_page(inode->i_mapping, + page_index, GFP_NOFS); + BUG_ON(!page); + + wait_on_page_writeback(page); + if (PageError(page)) { + ret = -EIO; + mlog_errno(ret); + } else + mark_page_accessed(page); + + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, + num_clusters, extent_flags); +} + +static int ocfs2_make_clusters_writable(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 p_cluster, + u32 num_clusters, unsigned int e_flags) +{ + int ret, delete, index, credits = 0; + u32 new_bit, new_len, orig_num_clusters; + unsigned int set_len; + struct ocfs2_super *osb = OCFS2_SB(sb); + handle_t *handle; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; + struct ocfs2_refcount_rec rec; + + trace_ocfs2_make_clusters_writable(cpos, p_cluster, + num_clusters, e_flags); + + ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, + &context->data_et, + ref_ci, + context->ref_root_bh, + &context->meta_ac, + &context->data_ac, &credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (context->post_refcount) + credits += context->post_refcount->credits; + + credits += context->extra_credits; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + orig_num_clusters = num_clusters; + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, + p_cluster, num_clusters, + &rec, &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + BUG_ON(!rec.r_refcount); + set_len = min((u64)p_cluster + num_clusters, + le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - p_cluster; + + /* + * There are many different situation here. + * 1. If refcount == 1, remove the flag and don't COW. + * 2. If refcount > 1, allocate clusters. + * Here we may not allocate r_len once at a time, so continue + * until we reach num_clusters. + */ + if (le32_to_cpu(rec.r_refcount) == 1) { + delete = 0; + ret = ocfs2_clear_ext_refcount(handle, + &context->data_et, + cpos, p_cluster, + set_len, e_flags, + context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } else { + delete = 1; + + ret = __ocfs2_claim_clusters(handle, + context->data_ac, + 1, set_len, + &new_bit, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_replace_clusters(handle, context, + cpos, p_cluster, new_bit, + new_len, e_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + set_len = new_len; + } + + ret = __ocfs2_decrease_refcount(handle, ref_ci, + context->ref_root_bh, + p_cluster, set_len, + context->meta_ac, + &context->dealloc, delete); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos += set_len; + p_cluster += set_len; + num_clusters -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + + /* handle any post_cow action. */ + if (context->post_refcount && context->post_refcount->func) { + ret = context->post_refcount->func(context->inode, handle, + context->post_refcount->para); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + if (context->get_clusters == ocfs2_di_get_clusters) { + ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, + orig_num_clusters); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + brelse(ref_leaf_bh); + + return ret; +} + +static int ocfs2_replace_cow(struct ocfs2_cow_context *context) +{ + int ret = 0; + struct inode *inode = context->inode; + u32 cow_start = context->cow_start, cow_len = context->cow_len; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + return -EROFS; + } + + ocfs2_init_dealloc_ctxt(&context->dealloc); + + while (cow_len) { + ret = context->get_clusters(context, cow_start, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); + + if (cow_len < num_clusters) + num_clusters = cow_len; + + ret = ocfs2_make_clusters_writable(inode->i_sb, context, + cow_start, p_cluster, + num_clusters, ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + cow_len -= num_clusters; + cow_start += num_clusters; + } + + if (ocfs2_dealloc_has_cluster(&context->dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + } + + return ret; +} + +/* + * Starting at cpos, try to CoW write_len clusters. Don't CoW + * past max_cpos. This will stop when it runs into a hole or an + * unrefcounted extent. + */ +static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret; + u32 cow_start = 0, cow_len = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_cow_context *context = NULL; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, + cpos, write_len, max_cpos, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno, + cpos, write_len, max_cpos, + cow_start, cow_len); + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh; + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; + context->get_clusters = ocfs2_di_get_clusters; + + ocfs2_init_dinode_extent_tree(&context->data_et, + INODE_CACHE(inode), di_bh); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + + /* + * truncate the extent map here since no matter whether we meet with + * any error during the action, we shouldn't trust cached extent map + * any more. + */ + ocfs2_extent_map_trunc(inode, cow_start); + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + kfree(context); + return ret; +} + +/* + * CoW any and all clusters between cpos and cpos+write_len. + * Don't CoW past max_cpos. If this returns successfully, all + * clusters between cpos and cpos+write_len are safe to modify. + */ +int ocfs2_refcount_cow(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + while (write_len) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + if (write_len < num_clusters) + num_clusters = write_len; + + if (ext_flags & OCFS2_EXT_REFCOUNTED) { + ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, + num_clusters, max_cpos); + if (ret) { + mlog_errno(ret); + break; + } + } + + write_len -= num_clusters; + cpos += num_clusters; + } + + return ret; +} + +static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + struct inode *inode = context->inode; + struct ocfs2_xattr_value_root *xv = context->cow_object; + + return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, + num_clusters, &xv->xr_list, + extent_flags); +} + +/* + * Given a xattr value root, calculate the most meta/credits we need for + * refcount tree change if we truncate it to 0. + */ +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits) +{ + int ret = 0, index, ref_blocks = 0; + u32 p_cluster, num_clusters; + u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters); + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL; + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh, + p_cluster, num_clusters, + &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!rec.r_refcount); + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + /* + * We really don't know whether the other clusters is in + * this refcount block or not, so just take the worst + * case that all the clusters are in this block and each + * one will split a refcount rec, so totally we need + * clusters * 2 new refcount rec. + */ + if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + + if (num_clusters <= le32_to_cpu(rec.r_clusters)) + break; + else + num_clusters -= le32_to_cpu(rec.r_clusters); + p_cluster += num_clusters; + } + } + + *meta_add += ref_blocks; + if (!ref_blocks) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + else { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); + *credits += ocfs2_calc_extend_credits(inode->i_sb, + et.et_root_el); + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* + * Do CoW for xattr. + */ +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post) +{ + int ret; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_cow_context *context = NULL; + u32 cow_start, cow_len; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, + cpos, write_len, UINT_MAX, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh; + context->cow_object = xv; + + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; + /* We need the extra credits for duplicate_clusters by jbd. */ + context->extra_credits = + ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len; + context->get_clusters = ocfs2_xattr_value_get_clusters; + context->post_refcount = post; + + ocfs2_init_xattr_value_extent_tree(&context->data_et, + INODE_CACHE(inode), vb); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + +out: + kfree(context); + return ret; +} + +/* + * Insert a new extent into refcount tree and mark a extent rec + * as refcounted in the dinode tree. + */ +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post) +{ + int ret; + handle_t *handle; + int credits = 1, ref_blocks = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + ref_ci, ref_root_bh, + p_cluster, num_clusters, + &ref_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_add_refcount_flag(ref_blocks, credits); + + if (ref_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ref_blocks, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (post) + credits += post->credits; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, + cpos, num_clusters, p_cluster, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, 0, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (post && post->func) { + ret = post->func(inode, handle, post->para); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_change_ctime(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_attach_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret, data_changed = 0; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_tree *ref_tree; + unsigned int ext_flags; + loff_t size; + u32 cpos, num_clusters, clusters, p_cluster; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree di_et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + ret = ocfs2_create_refcount_tree(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + BUG_ON(!di->i_refcount_loc); + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto attach_xattr; + + ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); + + size = i_size_read(inode); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto unlock; + } + if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(inode, &di_et, + &ref_tree->rf_ci, + ref_root_bh, cpos, + p_cluster, num_clusters, + &dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + data_changed = 1; + } + cpos += num_clusters; + } + +attach_xattr: + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, + &ref_tree->rf_ci, + ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + if (data_changed) { + ret = ocfs2_change_ctime(inode, di_bh); + if (ret) + mlog_errno(ret); + } + +unlock: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } +out: + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode, 0); + + return ret; +} + +static int ocfs2_add_refcounted_extent(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + unsigned int ext_flags, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + handle_t *handle; + int credits = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_lock_refcount_allocators(inode->i_sb, + p_cluster, num_clusters, + et, ref_ci, + ref_root_bh, &meta_ac, + NULL, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(handle, et, cpos, + ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), + num_clusters, ext_flags, meta_ac); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, + meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_duplicate_inline_data(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; + + BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; + memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, + le16_to_cpu(s_di->id2.i_data.id_count)); + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_duplicate_extent_list(struct inode *s_inode, + struct inode *t_inode, + struct buffer_head *t_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + u32 p_cluster, num_clusters, clusters, cpos; + loff_t size; + unsigned int ext_flags; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); + + size = i_size_read(s_inode); + clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + if (p_cluster) { + ret = ocfs2_add_refcounted_extent(t_inode, &et, + ref_ci, ref_root_bh, + cpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += num_clusters; + } + +out: + return ret; +} + +/* + * change the new file's attributes to the src. + * + * reflink creates a snapshot of a file, that means the attributes + * must be identical except for three exceptions - nlink, ino, and ctime. + */ +static int ocfs2_complete_reflink(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; + loff_t size = i_size_read(s_inode); + + handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; + OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; + OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + i_size_write(t_inode, size); + t_inode->i_blocks = s_inode->i_blocks; + + di->i_xattr_inline_size = s_di->i_xattr_inline_size; + di->i_clusters = s_di->i_clusters; + di->i_size = s_di->i_size; + di->i_dyn_features = s_di->i_dyn_features; + di->i_attr = s_di->i_attr; + + if (preserve) { + t_inode->i_uid = s_inode->i_uid; + t_inode->i_gid = s_inode->i_gid; + t_inode->i_mode = s_inode->i_mode; + di->i_uid = s_di->i_uid; + di->i_gid = s_di->i_gid; + di->i_mode = s_di->i_mode; + + /* + * update time. + * we want mtime to appear identical to the source and + * update ctime. + */ + t_inode->i_ctime = CURRENT_TIME; + + di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + + t_inode->i_mtime = s_inode->i_mtime; + di->i_mtime = s_di->i_mtime; + di->i_mtime_nsec = s_di->i_mtime_nsec; + } + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle); + return ret; +} + +static int ocfs2_create_reflink_node(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_refcount_block *rb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_refcount_tree *ref_tree; + + ocfs2_init_dealloc_ctxt(&dealloc); + + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(di->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, + t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, + &ref_tree->rf_ci, ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +static int __ocfs2_reflink(struct dentry *old_dentry, + struct buffer_head *old_bh, + struct inode *new_inode, + bool preserve) +{ + int ret; + struct inode *inode = d_inode(old_dentry); + struct buffer_head *new_bh = NULL; + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + ret = filemap_fdatawrite(inode->i_mapping); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_attach_refcount_tree(inode, old_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, + OI_LS_REFLINK_TARGET); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_create_reflink_node(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_reflink_xattrs(inode, old_bh, + new_inode, new_bh, + preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + } + + ret = ocfs2_complete_reflink(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) + mlog_errno(ret); + +inode_unlock: + ocfs2_inode_unlock(new_inode, 1); + brelse(new_bh); +out_unlock: + mutex_unlock(&new_inode->i_mutex); +out: + if (!ret) { + ret = filemap_fdatawait(inode->i_mapping); + if (ret) + mlog_errno(ret); + } + return ret; +} + +static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + int error; + struct inode *inode = d_inode(old_dentry); + struct buffer_head *old_bh = NULL; + struct inode *new_orphan_inode = NULL; + struct posix_acl *default_acl, *acl; + umode_t mode; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + mode = inode->i_mode; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) { + mlog_errno(error); + return error; + } + + error = ocfs2_create_inode_in_orphan(dir, mode, + &new_orphan_inode); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_rw_lock(inode, 1); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_inode_lock(inode, &old_bh, 1); + if (error) { + mlog_errno(error); + ocfs2_rw_unlock(inode, 1); + goto out; + } + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + down_write(&OCFS2_I(inode)->ip_alloc_sem); + error = __ocfs2_reflink(old_dentry, old_bh, + new_orphan_inode, preserve); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + up_write(&OCFS2_I(inode)->ip_xattr_sem); + + ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); + brelse(old_bh); + + if (error) { + mlog_errno(error); + goto out; + } + + /* If the security isn't preserved, we need to re-initialize them. */ + if (!preserve) { + error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + &new_dentry->d_name, + default_acl, acl); + if (error) + mlog_errno(error); + } +out: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + if (!error) { + error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + new_dentry); + if (error) + mlog_errno(error); + } + + if (new_orphan_inode) { + /* + * We need to open_unlock the inode no matter whether we + * succeed or not, so that other nodes can delete it later. + */ + ocfs2_open_unlock(new_orphan_inode); + if (error) + iput(new_orphan_inode); + } + + return error; +} + +/* + * Below here are the bits used by OCFS2_IOC_REFLINK() to fake + * sys_reflink(). This will go away when vfs_reflink() exists in + * fs/namei.c. + */ + +/* copied from may_create in VFS. */ +static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) +{ + if (d_really_is_positive(child)) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/** + * ocfs2_vfs_reflink - Create a reference-counted link + * + * @old_dentry: source dentry + inode + * @dir: directory to create the target + * @new_dentry: target dentry + * @preserve: if true, preserve all file attributes + */ +static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + struct inode *inode = d_inode(old_dentry); + int error; + + if (!inode) + return -ENOENT; + + error = ocfs2_may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A reflink to an append-only or immutable file cannot be created. + */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + /* Only regular files can be reflinked. */ + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + /* + * If the caller wants to preserve ownership, they require the + * rights to do so. + */ + if (preserve) { + if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN)) + return -EPERM; + if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) + return -EPERM; + } + + /* + * If the caller is modifying any aspect of the attributes, they + * are not creating a snapshot. They need read permission on the + * file. + */ + if (!preserve) { + error = inode_permission(inode, MAY_READ); + if (error) + return error; + } + + mutex_lock(&inode->i_mutex); + dquot_initialize(dir); + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_create(dir, new_dentry); + return error; +} +/* + * Most codes are copied from sys_linkat. + */ +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve) +{ + struct dentry *new_dentry; + struct path old_path, new_path; + int error; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = user_path_at(AT_FDCWD, oldname, 0, &old_path); + if (error) { + mlog_errno(error); + return error; + } + + new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) { + mlog_errno(error); + goto out; + } + + error = -EXDEV; + if (old_path.mnt != new_path.mnt) { + mlog_errno(error); + goto out_dput; + } + + error = ocfs2_vfs_reflink(old_path.dentry, + d_inode(new_path.dentry), + new_dentry, preserve); +out_dput: + done_path_create(&new_path, new_dentry); +out: + path_put(&old_path); + + return error; +} diff --git a/kernel/fs/ocfs2/refcounttree.h b/kernel/fs/ocfs2/refcounttree.h new file mode 100644 index 000000000..6422bbcdb --- /dev/null +++ b/kernel/fs/ocfs2/refcounttree.h @@ -0,0 +1,118 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.h + * + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef OCFS2_REFCOUNTTREE_H +#define OCFS2_REFCOUNTTREE_H + +struct ocfs2_refcount_tree { + struct rb_node rf_node; + u64 rf_blkno; + u32 rf_generation; + struct kref rf_getcnt; + struct rw_semaphore rf_sem; + struct ocfs2_lock_res rf_lockres; + int rf_removed; + + /* the following 4 fields are used by caching_info. */ + spinlock_t rf_lock; + struct ocfs2_caching_info rf_ci; + struct mutex rf_io_mutex; + struct super_block *rf_sb; +}; + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb); +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **tree, + struct buffer_head **ref_bh); +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, + int rw); + +int ocfs2_decrease_refcount(struct inode *inode, + handle_t *handle, u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete); +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + u64 refcount_loc, + u64 phys_blkno, + u32 clusters, + int *credits, + int *ref_blocks); +int ocfs2_refcount_cow(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos); + +typedef int (ocfs2_post_refcount_func)(struct inode *inode, + handle_t *handle, + void *para); +/* + * Some refcount caller need to do more work after we modify the data b-tree + * during refcount operation(including CoW and add refcount flag), and make the + * transaction complete. So it must give us this structure so that we can do it + * within our transaction. + * + */ +struct ocfs2_post_refcount { + int credits; /* credits it need for journal. */ + ocfs2_post_refcount_func *func; /* real function. */ + void *para; +}; + +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits); +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post); +int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct inode *inode, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct inode *inode, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +int ocfs2_cow_sync_writeback(struct super_block *sb, + struct inode *inode, + u32 cpos, u32 num_clusters); +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post); +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh); +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve); +#endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/kernel/fs/ocfs2/reservations.c b/kernel/fs/ocfs2/reservations.c new file mode 100644 index 000000000..6a348b029 --- /dev/null +++ b/kernel/fs/ocfs2/reservations.c @@ -0,0 +1,839 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.c + * + * Allocation reservations implementation + * + * Some code borrowed from fs/ext3/balloc.c and is: + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * The rest is copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "ocfs2_trace.h" + +#ifdef CONFIG_OCFS2_DEBUG_FS +#define OCFS2_CHECK_RESERVATIONS +#endif + +static DEFINE_SPINLOCK(resv_lock); + +#define OCFS2_MIN_RESV_WINDOW_BITS 8 +#define OCFS2_MAX_RESV_WINDOW_BITS 1024 + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) +{ + return (osb->osb_resv_level && osb->osb_dir_resv_level); +} + +static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + struct ocfs2_super *osb = resmap->m_osb; + unsigned int bits; + + if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) { + /* 8, 16, 32, 64, 128, 256, 512, 1024 */ + bits = 4 << osb->osb_resv_level; + } else { + bits = 4 << osb->osb_dir_resv_level; + } + return bits; +} + +static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_len) + return resv->r_start + resv->r_len - 1; + return resv->r_start; +} + +static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv) +{ + return !!(resv->r_len == 0); +} + +static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap) +{ + if (resmap->m_osb->osb_resv_level == 0) + return 1; + return 0; +} + +static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap) +{ + struct ocfs2_super *osb = resmap->m_osb; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + int i = 0; + + mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n", + osb->dev_str, resmap->m_bitmap_len); + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u" + "\tlast_len: %u\n", resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + node = rb_next(node); + i++; + } + + mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i); + + i = 0; + list_for_each_entry(resv, &resmap->m_lru, r_lru) { + mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t" + "last_start: %u\tlast_len: %u\n", i, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + i++; + } +} + +#ifdef OCFS2_CHECK_RESERVATIONS +static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap, + int i, + struct ocfs2_alloc_reservation *resv) +{ + char *disk_bitmap = resmap->m_disk_bitmap; + unsigned int start = resv->r_start; + unsigned int end = ocfs2_resv_end(resv); + + while (start <= end) { + if (ocfs2_test_bit(start, disk_bitmap)) { + mlog(ML_ERROR, + "reservation %d covers an allocated area " + "starting at bit %u!\n", i, start); + return 1; + } + + start++; + } + return 0; +} + +static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + unsigned int off = 0; + int i = 0; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (i > 0 && resv->r_start <= off) { + mlog(ML_ERROR, "reservation %d has bad start off!\n", + i); + goto bad; + } + + if (resv->r_len == 0) { + mlog(ML_ERROR, "reservation %d has no length!\n", + i); + goto bad; + } + + if (resv->r_start > ocfs2_resv_end(resv)) { + mlog(ML_ERROR, "reservation %d has invalid range!\n", + i); + goto bad; + } + + if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) { + mlog(ML_ERROR, "reservation %d extends past bitmap!\n", + i); + goto bad; + } + + if (ocfs2_validate_resmap_bits(resmap, i, resv)) + goto bad; + + off = ocfs2_resv_end(resv); + node = rb_next(node); + + i++; + } + return; + +bad: + ocfs2_dump_resv(resmap); + BUG(); +} +#else +static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + +} +#endif + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv) +{ + memset(resv, 0, sizeof(*resv)); + INIT_LIST_HEAD(&resv->r_lru); +} + +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags) +{ + BUG_ON(flags & ~OCFS2_RESV_TYPES); + + resv->r_flags |= flags; +} + +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap) +{ + memset(resmap, 0, sizeof(*resmap)); + + resmap->m_osb = osb; + resmap->m_reservations = RB_ROOT; + /* m_bitmap_len is initialized to zero by the above memset. */ + INIT_LIST_HEAD(&resmap->m_lru); + + return 0; +} + +static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + if (!list_empty(&resv->r_lru)) + list_del_init(&resv->r_lru); + + list_add_tail(&resv->r_lru, &resmap->m_lru); +} + +static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv) +{ + resv->r_len = 0; + resv->r_start = 0; +} + +static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) { + list_del_init(&resv->r_lru); + rb_erase(&resv->r_node, &resmap->m_reservations); + resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE; + } +} + +static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + __ocfs2_resv_trunc(resv); + /* + * last_len and last_start no longer make sense if + * we're changing the range of our allocations. + */ + resv->r_last_len = resv->r_last_start = 0; + + ocfs2_resv_remove(resmap, resv); +} + +/* does nothing if 'resv' is null */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv) { + spin_lock(&resv_lock); + __ocfs2_resv_discard(resmap, resv); + spin_unlock(&resv_lock); + } +} + +static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) +{ + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + assert_spin_locked(&resv_lock); + + while ((node = rb_last(&resmap->m_reservations)) != NULL) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + __ocfs2_resv_discard(resmap, resv); + } +} + +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap) +{ + if (ocfs2_resmap_disabled(resmap)) + return; + + spin_lock(&resv_lock); + + ocfs2_resmap_clear_all_resv(resmap); + resmap->m_bitmap_len = clen; + resmap->m_disk_bitmap = disk_bitmap; + + spin_unlock(&resv_lock); +} + +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap) +{ + /* Does nothing for now. Keep this around for API symmetry */ +} + +static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *new) +{ + struct rb_root *root = &resmap->m_reservations; + struct rb_node *parent = NULL; + struct rb_node **p = &root->rb_node; + struct ocfs2_alloc_reservation *tmp; + + assert_spin_locked(&resv_lock); + + trace_ocfs2_resv_insert(new->r_start, new->r_len); + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node); + + if (new->r_start < tmp->r_start) { + p = &(*p)->rb_left; + + /* + * This is a good place to check for + * overlapping reservations. + */ + BUG_ON(ocfs2_resv_end(new) >= tmp->r_start); + } else if (new->r_start > ocfs2_resv_end(tmp)) { + p = &(*p)->rb_right; + } else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate reservation window!\n"); + BUG(); + } + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, root); + new->r_flags |= OCFS2_RESV_FLAG_INUSE; + + ocfs2_resv_mark_lru(resmap, new); + + ocfs2_check_resmap(resmap); +} + +/** + * ocfs2_find_resv_lhs() - find the window which contains goal + * @resmap: reservation map to search + * @goal: which bit to search for + * + * If a window containing that goal is not found, we return the window + * which comes before goal. Returns NULL on empty rbtree or no window + * before goal. + */ +static struct ocfs2_alloc_reservation * +ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal) +{ + struct ocfs2_alloc_reservation *resv = NULL; + struct ocfs2_alloc_reservation *prev_resv = NULL; + struct rb_node *node = resmap->m_reservations.rb_node; + + assert_spin_locked(&resv_lock); + + if (!node) + return NULL; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal) + break; + + /* Check if we overshot the reservation just before goal? */ + if (resv->r_start > goal) { + resv = prev_resv; + break; + } + + prev_resv = resv; + node = rb_next(node); + } + + return resv; +} + +/* + * We are given a range within the bitmap, which corresponds to a gap + * inside the reservations tree (search_start, search_len). The range + * can be anything from the whole bitmap, to a gap between + * reservations. + * + * The start value of *rstart is insignificant. + * + * This function searches the bitmap range starting at search_start + * with length search_len for a set of contiguous free bits. We try + * to find up to 'wanted' bits, but can sometimes return less. + * + * Returns the length of allocation, 0 if no free bits are found. + * + * *cstart and *clen will also be populated with the result. + */ +static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, + unsigned int wanted, + unsigned int search_start, + unsigned int search_len, + unsigned int *rstart, + unsigned int *rlen) +{ + void *bitmap = resmap->m_disk_bitmap; + unsigned int best_start, best_len = 0; + int offset, start, found; + + trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len, + wanted, resmap->m_bitmap_len); + + found = best_start = best_len = 0; + + start = search_start; + while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, + start)) != -1) { + /* Search reached end of the region */ + if (offset >= (search_start + search_len)) + break; + + if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_len) { + best_len = found; + best_start = start - found; + } + + if (found >= wanted) + break; + } + + if (best_len == 0) + return 0; + + if (best_len >= wanted) + best_len = wanted; + + *rlen = best_len; + *rstart = best_start; + + trace_ocfs2_resmap_find_free_bits_end(best_start, best_len); + + return *rlen; +} + +static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int goal, unsigned int wanted) +{ + struct rb_root *root = &resmap->m_reservations; + unsigned int gap_start, gap_end, gap_len; + struct ocfs2_alloc_reservation *prev_resv, *next_resv; + struct rb_node *prev, *next; + unsigned int cstart, clen; + unsigned int best_start = 0, best_len = 0; + + /* + * Nasty cases to consider: + * + * - rbtree is empty + * - our window should be first in all reservations + * - our window should be last in all reservations + * - need to make sure we don't go past end of bitmap + */ + trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv), + goal, wanted, RB_EMPTY_ROOT(root)); + + assert_spin_locked(&resv_lock); + + if (RB_EMPTY_ROOT(root)) { + /* + * Easiest case - empty tree. We can just take + * whatever window of free bits we want. + */ + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + resmap->m_bitmap_len - goal, + &cstart, &clen); + + /* + * This should never happen - the local alloc window + * will always have free bits when we're called. + */ + BUG_ON(goal == 0 && clen == 0); + + if (clen == 0) + return; + + resv->r_start = cstart; + resv->r_len = clen; + + ocfs2_resv_insert(resmap, resv); + return; + } + + prev_resv = ocfs2_find_resv_lhs(resmap, goal); + + if (prev_resv == NULL) { + /* + * A NULL here means that the search code couldn't + * find a window that starts before goal. + * + * However, we can take the first window after goal, + * which is also by definition, the leftmost window in + * the entire tree. If we can find free bits in the + * gap between goal and the LHS window, then the + * reservation can safely be placed there. + * + * Otherwise we fall back to a linear search, checking + * the gaps in between windows for a place to + * allocate. + */ + + next = rb_first(root); + next_resv = rb_entry(next, struct ocfs2_alloc_reservation, + r_node); + + /* + * The search should never return such a window. (see + * comment above + */ + if (next_resv->r_start <= goal) { + mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n", + goal, next_resv->r_start, next_resv->r_len); + ocfs2_dump_resv(resmap); + BUG(); + } + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + next_resv->r_start - goal, + &cstart, &clen); + if (clen) { + best_len = clen; + best_start = cstart; + if (best_len == wanted) + goto out_insert; + } + + prev_resv = next_resv; + next_resv = NULL; + } + + trace_ocfs2_resv_find_window_prev(prev_resv->r_start, + ocfs2_resv_end(prev_resv)); + + prev = &prev_resv->r_node; + + /* Now we do a linear search for a window, starting at 'prev_rsv' */ + while (1) { + next = rb_next(prev); + if (next) { + next_resv = rb_entry(next, + struct ocfs2_alloc_reservation, + r_node); + + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_end = next_resv->r_start - 1; + gap_len = gap_end - gap_start + 1; + } else { + /* + * We're at the rightmost edge of the + * tree. See if a reservation between this + * window and the end of the bitmap will work. + */ + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_len = resmap->m_bitmap_len - gap_start; + gap_end = resmap->m_bitmap_len - 1; + } + + trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1, + next ? ocfs2_resv_end(next_resv) : -1); + /* + * No need to check this gap if we have already found + * a larger region of free bits. + */ + if (gap_len <= best_len) + goto next_resv; + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start, + gap_len, &cstart, &clen); + if (clen == wanted) { + best_len = clen; + best_start = cstart; + goto out_insert; + } else if (clen > best_len) { + best_len = clen; + best_start = cstart; + } + +next_resv: + if (!next) + break; + + prev = next; + prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation, + r_node); + } + +out_insert: + if (best_len) { + resv->r_start = best_start; + resv->r_len = best_len; + ocfs2_resv_insert(resmap, resv); + } +} + +static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + struct ocfs2_alloc_reservation *lru_resv; + int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP); + unsigned int min_bits; + + if (!tmpwindow) + min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1; + else + min_bits = wanted; /* We at know the temp window will use all + * of these bits */ + + /* + * Take the first reservation off the LRU as our 'target'. We + * don't try to be smart about it. There might be a case for + * searching based on size but I don't have enough data to be + * sure. --Mark (3/16/2010) + */ + lru_resv = list_first_entry(&resmap->m_lru, + struct ocfs2_alloc_reservation, r_lru); + + trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start, + lru_resv->r_len, + ocfs2_resv_end(lru_resv)); + + /* + * Cannibalize (some or all) of the target reservation and + * feed it to the current window. + */ + if (lru_resv->r_len <= min_bits) { + /* + * Discard completely if size is less than or equal to a + * reasonable threshold - 50% of window bits for non temporary + * windows. + */ + resv->r_start = lru_resv->r_start; + resv->r_len = lru_resv->r_len; + + __ocfs2_resv_discard(resmap, lru_resv); + } else { + unsigned int shrink; + if (tmpwindow) + shrink = min_bits; + else + shrink = lru_resv->r_len / 2; + + lru_resv->r_len -= shrink; + + resv->r_start = ocfs2_resv_end(lru_resv) + 1; + resv->r_len = shrink; + } + + trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, + resv->r_last_len); + + ocfs2_resv_insert(resmap, resv); +} + +static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + unsigned int goal = 0; + + BUG_ON(!ocfs2_resv_empty(resv)); + + /* + * Begin by trying to get a window as close to the previous + * one as possible. Using the most recent allocation as a + * start goal makes sense. + */ + if (resv->r_last_len) { + goal = resv->r_last_start + resv->r_last_len; + if (goal >= resmap->m_bitmap_len) + goal = 0; + } + + __ocfs2_resv_find_window(resmap, resv, goal, wanted); + + /* Search from last alloc didn't work, try once more from beginning. */ + if (ocfs2_resv_empty(resv) && goal != 0) + __ocfs2_resv_find_window(resmap, resv, 0, wanted); + + if (ocfs2_resv_empty(resv)) { + /* + * Still empty? Pull oldest one off the LRU, remove it from + * tree, put this one in it's place. + */ + ocfs2_cannibalize_resv(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); +} + +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen) +{ + if (resv == NULL || ocfs2_resmap_disabled(resmap)) + return -ENOSPC; + + spin_lock(&resv_lock); + + if (ocfs2_resv_empty(resv)) { + /* + * We don't want to over-allocate for temporary + * windows. Otherwise, we run the risk of fragmenting the + * allocation space. + */ + unsigned int wanted = ocfs2_resv_window_bits(resmap, resv); + + if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) + wanted = *clen; + + /* + * Try to get a window here. If it works, we must fall + * through and test the bitmap . This avoids some + * ping-ponging of windows due to non-reserved space + * being allocation before we initialize a window for + * that inode. + */ + ocfs2_resv_find_window(resmap, resv, wanted); + trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len); + } + + BUG_ON(ocfs2_resv_empty(resv)); + + *cstart = resv->r_start; + *clen = resv->r_len; + + spin_unlock(&resv_lock); + return 0; +} + +static void + ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int start, unsigned int end) +{ + unsigned int rhs = 0; + unsigned int old_end = ocfs2_resv_end(resv); + + BUG_ON(start != resv->r_start || old_end < end); + + /* + * Completely used? We can remove it then. + */ + if (old_end == end) { + __ocfs2_resv_discard(resmap, resv); + return; + } + + rhs = old_end - end; + + /* + * This should have been trapped above. + */ + BUG_ON(rhs == 0); + + resv->r_start = end + 1; + resv->r_len = old_end - resv->r_start + 1; +} + +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen) +{ + unsigned int cend = cstart + clen - 1; + + if (resmap == NULL || ocfs2_resmap_disabled(resmap)) + return; + + if (resv == NULL) + return; + + BUG_ON(cstart != resv->r_start); + + spin_lock(&resv_lock); + + trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, + resv->r_last_len); + + BUG_ON(cstart < resv->r_start); + BUG_ON(cstart > ocfs2_resv_end(resv)); + BUG_ON(cend > ocfs2_resv_end(resv)); + + ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend); + resv->r_last_start = cstart; + resv->r_last_len = clen; + + /* + * May have been discarded above from + * ocfs2_adjust_resv_from_alloc(). + */ + if (!ocfs2_resv_empty(resv)) + ocfs2_resv_mark_lru(resmap, resv); + + trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, + resv->r_last_len); + + ocfs2_check_resmap(resmap); + + spin_unlock(&resv_lock); +} diff --git a/kernel/fs/ocfs2/reservations.h b/kernel/fs/ocfs2/reservations.h new file mode 100644 index 000000000..42c2b804f --- /dev/null +++ b/kernel/fs/ocfs2/reservations.h @@ -0,0 +1,159 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.h + * + * Allocation reservations function prototypes and structures. + * + * Copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_RESERVATIONS_H +#define OCFS2_RESERVATIONS_H + +#include + +#define OCFS2_DEFAULT_RESV_LEVEL 2 +#define OCFS2_MAX_RESV_LEVEL 9 +#define OCFS2_MIN_RESV_LEVEL 0 + +struct ocfs2_alloc_reservation { + struct rb_node r_node; + + unsigned int r_start; /* Beginning of current window */ + unsigned int r_len; /* Length of the window */ + + unsigned int r_last_len; /* Length of most recent alloc */ + unsigned int r_last_start; /* Start of most recent alloc */ + struct list_head r_lru; /* LRU list head */ + + unsigned int r_flags; +}; + +#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ +#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be + * destroyed immedately after use */ +#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed + * directory btree */ + +struct ocfs2_reservation_map { + struct rb_root m_reservations; + char *m_disk_bitmap; + + struct ocfs2_super *m_osb; + + /* The following are not initialized to meaningful values until a disk + * bitmap is provided. */ + u32 m_bitmap_len; /* Number of valid + * bits available */ + + struct list_head m_lru; /* LRU of reservations + * structures. */ + +}; + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); + +#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR) +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags); + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb); + +/** + * ocfs2_resv_discard() - truncate a reservation + * @resmap: + * @resv: the reservation to truncate. + * + * After this function is called, the reservation will be empty, and + * unlinked from the rbtree. + */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv); + + +/** + * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @resmap: struct ocfs2_reservation_map to initialize + * @obj: unused for now + * @ops: unused for now + * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) + * + * Only possible return value other than '0' is -ENOMEM for failure to + * allocation mirror bitmap. + */ +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_restart() - "restart" a reservation bitmap + * @resmap: reservations bitmap + * @clen: Number of valid bits in the bitmap + * @disk_bitmap: the disk bitmap this resmap should refer to. + * + * Re-initialize the parameters of a reservation bitmap. This is + * useful for local alloc window slides. + * + * This function will call ocfs2_trunc_resv against all existing + * reservations. A future version will recalculate existing + * reservations based on the new bitmap. + */ +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap); + +/** + * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure + * @resmap: the struct ocfs2_reservation_map to uninitialize + */ +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_resv_bits() - Return still-valid reservation bits + * @resmap: reservations bitmap + * @resv: reservation to base search from + * @cstart: start of proposed allocation + * @clen: length (in clusters) of proposed allocation + * + * Using the reservation data from resv, this function will compare + * resmap and resmap->m_disk_bitmap to determine what part (if any) of + * the reservation window is still clear to use. If resv is empty, + * this function will try to allocate a window for it. + * + * On success, zero is returned and the valid allocation area is set in cstart + * and clen. + * + * Returns -ENOSPC if reservations are disabled. + */ +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen); + +/** + * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. + * @resmap: reservations bitmap + * @resv: optional reservation to recalulate based on new bitmap + * @cstart: start of allocation in clusters + * @clen: end of allocation in clusters. + * + * Tell the reservation code that bits were used to fulfill allocation in + * resmap. The bits don't have to have been part of any existing + * reservation. But we must always call this function when bits are claimed. + * Internally, the reservations code will use this information to mark the + * reservations bitmap. If resv is passed, it's next allocation window will be + * calculated. It also expects that 'cstart' is the same as we passed back + * from ocfs2_resmap_resv_bits(). + */ +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen); + +#endif /* OCFS2_RESERVATIONS_H */ diff --git a/kernel/fs/ocfs2/resize.c b/kernel/fs/ocfs2/resize.c new file mode 100644 index 000000000..d5da6f624 --- /dev/null +++ b/kernel/fs/ocfs2/resize.c @@ -0,0 +1,589 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * resize.c + * + * volume resize. + * Inspired by ext3/resize.c. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" +#include "suballoc.h" +#include "resize.h" + +/* + * Check whether there are new backup superblocks exist + * in the last group. If there are some, mark them or clear + * them in the bitmap. + * + * Return how many backups we find in the last group. + */ +static u16 ocfs2_calc_new_backup_super(struct inode *inode, + struct ocfs2_group_desc *gd, + u16 cl_cpg, + int set) +{ + int i; + u16 backups = 0; + u32 cluster; + u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); + + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + blkno = ocfs2_backup_super_blkno(inode->i_sb, i); + cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + gd_blkno = ocfs2_which_cluster_group(inode, cluster); + if (gd_blkno < lgd_blkno) + continue; + else if (gd_blkno > lgd_blkno) + break; + + if (set) + ocfs2_set_bit(cluster % cl_cpg, + (unsigned long *)gd->bg_bitmap); + else + ocfs2_clear_bit(cluster % cl_cpg, + (unsigned long *)gd->bg_bitmap); + backups++; + } + + return backups; +} + +static int ocfs2_update_last_group_and_inode(handle_t *handle, + struct inode *bm_inode, + struct buffer_head *bm_bh, + struct buffer_head *group_bh, + u32 first_new_cluster, + int new_clusters) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb); + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct ocfs2_chain_rec *cr; + struct ocfs2_group_desc *group; + u16 chain, num_bits, backups = 0; + u16 cl_bpc = le16_to_cpu(cl->cl_bpc); + u16 cl_cpg = le16_to_cpu(cl->cl_cpg); + + trace_ocfs2_update_last_group_and_inode(new_clusters, + first_new_cluster); + + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + + /* update the group first. */ + num_bits = new_clusters * cl_bpc; + le16_add_cpu(&group->bg_bits, num_bits); + le16_add_cpu(&group->bg_free_bits_count, num_bits); + + /* + * check whether there are some new backup superblocks exist in + * this group and update the group bitmap accordingly. + */ + if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_BACKUP_SB)) { + backups = ocfs2_calc_new_backup_super(bm_inode, + group, + cl_cpg, 1); + le16_add_cpu(&group->bg_free_bits_count, -1 * backups); + } + + ocfs2_journal_dirty(handle, group_bh); + + /* update the inode accordingly. */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_rollback; + } + + chain = le16_to_cpu(group->bg_chain); + cr = (&cl->cl_recs[chain]); + le32_add_cpu(&cr->c_total, num_bits); + le32_add_cpu(&cr->c_free, num_bits); + le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits); + le32_add_cpu(&fe->i_clusters, new_clusters); + + if (backups) { + le32_add_cpu(&cr->c_free, -1 * backups); + le32_add_cpu(&fe->id1.bitmap1.i_used, backups); + } + + spin_lock(&OCFS2_I(bm_inode)->ip_lock); + OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); + spin_unlock(&OCFS2_I(bm_inode)->ip_lock); + i_size_write(bm_inode, le64_to_cpu(fe->i_size)); + + ocfs2_journal_dirty(handle, bm_bh); + +out_rollback: + if (ret < 0) { + ocfs2_calc_new_backup_super(bm_inode, + group, + cl_cpg, 0); + le16_add_cpu(&group->bg_free_bits_count, backups); + le16_add_cpu(&group->bg_bits, -1 * num_bits); + le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); + } +out: + if (ret) + mlog_errno(ret); + return ret; +} + +static int update_backups(struct inode * inode, u32 clusters, char *data) +{ + int i, ret = 0; + u32 cluster; + u64 blkno; + struct buffer_head *backup = NULL; + struct ocfs2_dinode *backup_di = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* calculate the real backups we need to update. */ + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + blkno = ocfs2_backup_super_blkno(inode->i_sb, i); + cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + if (cluster > clusters) + break; + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); + if (ret < 0) { + mlog_errno(ret); + break; + } + + memcpy(backup->b_data, data, inode->i_sb->s_blocksize); + + backup_di = (struct ocfs2_dinode *)backup->b_data; + backup_di->i_blkno = cpu_to_le64(blkno); + + ret = ocfs2_write_super_or_backup(osb, backup); + brelse(backup); + backup = NULL; + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static void ocfs2_update_super_and_backups(struct inode *inode, + int new_clusters) +{ + int ret; + u32 clusters = 0; + struct buffer_head *super_bh = NULL; + struct ocfs2_dinode *super_di = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* + * update the superblock last. + * It doesn't matter if the write failed. + */ + ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1, + &super_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + super_di = (struct ocfs2_dinode *)super_bh->b_data; + le32_add_cpu(&super_di->i_clusters, new_clusters); + clusters = le32_to_cpu(super_di->i_clusters); + + ret = ocfs2_write_super_or_backup(osb, super_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB)) + ret = update_backups(inode, clusters, super_bh->b_data); + +out: + brelse(super_bh); + if (ret) + printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s" + " during fs resize. This condition is not fatal," + " but fsck.ocfs2 should be run to fix it\n", + osb->dev_str); + return; +} + +/* + * Extend the filesystem to the new number of clusters specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. + */ +int ocfs2_group_extend(struct inode * inode, int new_clusters) +{ + int ret; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct buffer_head *group_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_group_desc *group = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 cl_bpc; + u32 first_new_cluster; + u64 lgd_blkno; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + if (new_clusters < 0) + return -EINVAL; + else if (new_clusters == 0) + return 0; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + + /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(), + * so any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { + mlog(ML_ERROR, "The disk is too old and small. " + "Force to do offline resize."); + ret = -EINVAL; + goto out_unlock; + } + + first_new_cluster = le32_to_cpu(fe->i_clusters); + lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, + first_new_cluster - 1); + + ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno, + &group_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + group = (struct ocfs2_group_desc *)group_bh->b_data; + + cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); + if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > + le16_to_cpu(fe->id2.i_chain.cl_cpg)) { + ret = -EINVAL; + goto out_unlock; + } + + + trace_ocfs2_group_extend( + (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters); + + handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + ret = -EINVAL; + goto out_unlock; + } + + /* update the last group descriptor and inode. */ + ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode, + main_bm_bh, group_bh, + first_new_cluster, + new_clusters); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_update_super_and_backups(main_bm_inode, new_clusters); + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + brelse(group_bh); + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + return ret; +} + +static int ocfs2_check_new_group(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_new_group_input *input, + struct buffer_head *group_bh) +{ + int ret; + struct ocfs2_group_desc *gd = + (struct ocfs2_group_desc *)group_bh->b_data; + u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); + + ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh); + if (ret) + goto out; + + ret = -EINVAL; + if (le16_to_cpu(gd->bg_chain) != input->chain) + mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " + "while input has %u set.\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_chain), input->chain); + else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc) + mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " + "input has %u clusters set\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), input->clusters); + else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc) + mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u " + "but it should have %u set\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + input->frees * cl_bpc); + else + ret = 0; + +out: + return ret; +} + +static int ocfs2_verify_group_and_input(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_new_group_input *input, + struct buffer_head *group_bh) +{ + u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count); + u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); + u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec); + u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group); + u32 total_clusters = le32_to_cpu(di->i_clusters); + int ret = -EINVAL; + + if (cluster < total_clusters) + mlog(ML_ERROR, "add a group which is in the current volume.\n"); + else if (input->chain >= cl_count) + mlog(ML_ERROR, "input chain exceeds the limit.\n"); + else if (next_free != cl_count && next_free != input->chain) + mlog(ML_ERROR, + "the add group should be in chain %u\n", next_free); + else if (total_clusters + input->clusters < total_clusters) + mlog(ML_ERROR, "add group's clusters overflow.\n"); + else if (input->clusters > cl_cpg) + mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n"); + else if (input->frees > input->clusters) + mlog(ML_ERROR, "the free cluster exceeds the total clusters\n"); + else if (total_clusters % cl_cpg != 0) + mlog(ML_ERROR, + "the last group isn't full. Use group extend first.\n"); + else if (input->group != ocfs2_which_cluster_group(inode, cluster)) + mlog(ML_ERROR, "group blkno is invalid\n"); + else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh))) + mlog(ML_ERROR, "group descriptor check failed.\n"); + else + ret = 0; + + return ret; +} + +/* Add a new group descriptor to global_bitmap. */ +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) +{ + int ret; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *cr; + u16 cl_bpc; + u64 bg_ptr; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { + mlog(ML_ERROR, "The disk is too old and small." + " Force to do offline resize."); + ret = -EINVAL; + goto out_unlock; + } + + ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh); + if (ret < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # %llu " + "from the device.", (unsigned long long)input->group); + goto out_unlock; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh); + + ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); + if (ret) { + mlog_errno(ret); + goto out_free_group_bh; + } + + trace_ocfs2_group_add((unsigned long long)input->group, + input->chain, input->clusters, input->frees); + + handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + ret = -EINVAL; + goto out_free_group_bh; + } + + cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); + cl = &fe->id2.i_chain; + cr = &cl->cl_recs[input->chain]; + + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + bg_ptr = le64_to_cpu(group->bg_next_group); + group->bg_next_group = cr->c_blkno; + ocfs2_journal_dirty(handle, group_bh); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), + main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + group->bg_next_group = cpu_to_le64(bg_ptr); + mlog_errno(ret); + goto out_commit; + } + + if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) { + le16_add_cpu(&cl->cl_next_free_rec, 1); + memset(cr, 0, sizeof(struct ocfs2_chain_rec)); + } + + cr->c_blkno = cpu_to_le64(input->group); + le32_add_cpu(&cr->c_total, input->clusters * cl_bpc); + le32_add_cpu(&cr->c_free, input->frees * cl_bpc); + + le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc); + le32_add_cpu(&fe->id1.bitmap1.i_used, + (input->clusters - input->frees) * cl_bpc); + le32_add_cpu(&fe->i_clusters, input->clusters); + + ocfs2_journal_dirty(handle, main_bm_bh); + + spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); + OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); + spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); + i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); + + ocfs2_update_super_and_backups(main_bm_inode, input->clusters); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_free_group_bh: + brelse(group_bh); + +out_unlock: + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + return ret; +} diff --git a/kernel/fs/ocfs2/resize.h b/kernel/fs/ocfs2/resize.h new file mode 100644 index 000000000..f38841abf --- /dev/null +++ b/kernel/fs/ocfs2/resize.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * resize.h + * + * Function prototypes + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_RESIZE_H +#define OCFS2_RESIZE_H + +int ocfs2_group_extend(struct inode * inode, int new_clusters); +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input); + +#endif /* OCFS2_RESIZE_H */ diff --git a/kernel/fs/ocfs2/slot_map.c b/kernel/fs/ocfs2/slot_map.c new file mode 100644 index 000000000..e78a203d4 --- /dev/null +++ b/kernel/fs/ocfs2/slot_map.c @@ -0,0 +1,538 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slot_map.c + * + * + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + + +struct ocfs2_slot { + int sl_valid; + unsigned int sl_node_num; +}; + +struct ocfs2_slot_info { + int si_extended; + int si_slots_per_block; + struct inode *si_inode; + unsigned int si_blocks; + struct buffer_head **si_bh; + unsigned int si_num_slots; + struct ocfs2_slot *si_slots; +}; + + +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + unsigned int node_num); + +static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si, + int slot_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + si->si_slots[slot_num].sl_valid = 0; +} + +static void ocfs2_set_slot(struct ocfs2_slot_info *si, + int slot_num, unsigned int node_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + + si->si_slots[slot_num].sl_valid = 1; + si->si_slots[slot_num].sl_node_num = node_num; +} + +/* This version is for the extended slot map */ +static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si) +{ + int b, i, slotno; + struct ocfs2_slot_map_extended *se; + + slotno = 0; + for (b = 0; b < si->si_blocks; b++) { + se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data; + for (i = 0; + (i < si->si_slots_per_block) && + (slotno < si->si_num_slots); + i++, slotno++) { + if (se->se_slots[i].es_valid) + ocfs2_set_slot(si, slotno, + le32_to_cpu(se->se_slots[i].es_node_num)); + else + ocfs2_invalidate_slot(si, slotno); + } + } +} + +/* + * Post the slot information on disk into our slot_info struct. + * Must be protected by osb_lock. + */ +static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si) +{ + int i; + struct ocfs2_slot_map *sm; + + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; + + for (i = 0; i < si->si_num_slots; i++) { + if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT) + ocfs2_invalidate_slot(si, i); + else + ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); + } +} + +static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) +{ + /* + * The slot data will have been refreshed when ocfs2_super_lock + * was taken. + */ + if (si->si_extended) + ocfs2_update_slot_info_extended(si); + else + ocfs2_update_slot_info_old(si); +} + +int ocfs2_refresh_slot_info(struct ocfs2_super *osb) +{ + int ret; + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + + BUG_ON(si->si_blocks == 0); + BUG_ON(si->si_bh == NULL); + + trace_ocfs2_refresh_slot_info(si->si_blocks); + + /* + * We pass -1 as blocknr because we expect all of si->si_bh to + * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If + * this is not true, the read of -1 (UINT64_MAX) will fail. + */ + ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks, + si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL); + if (ret == 0) { + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + spin_unlock(&osb->osb_lock); + } + + return ret; +} + +/* post the our slot info stuff into it's destination bh and write it + * out. */ +static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si, + int slot_num, + struct buffer_head **bh) +{ + int blkind = slot_num / si->si_slots_per_block; + int slotno = slot_num % si->si_slots_per_block; + struct ocfs2_slot_map_extended *se; + + BUG_ON(blkind >= si->si_blocks); + + se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data; + se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid; + if (si->si_slots[slot_num].sl_valid) + se->se_slots[slotno].es_node_num = + cpu_to_le32(si->si_slots[slot_num].sl_node_num); + *bh = si->si_bh[blkind]; +} + +static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si, + int slot_num, + struct buffer_head **bh) +{ + int i; + struct ocfs2_slot_map *sm; + + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; + for (i = 0; i < si->si_num_slots; i++) { + if (si->si_slots[i].sl_valid) + sm->sm_slots[i] = + cpu_to_le16(si->si_slots[i].sl_node_num); + else + sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); + } + *bh = si->si_bh[0]; +} + +static int ocfs2_update_disk_slot(struct ocfs2_super *osb, + struct ocfs2_slot_info *si, + int slot_num) +{ + int status; + struct buffer_head *bh; + + spin_lock(&osb->osb_lock); + if (si->si_extended) + ocfs2_update_disk_slot_extended(si, slot_num, &bh); + else + ocfs2_update_disk_slot_old(si, slot_num, &bh); + spin_unlock(&osb->osb_lock); + + status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode)); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * Calculate how many bytes are needed by the slot map. Returns + * an error if the slot map file is too small. + */ +static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, + struct inode *inode, + unsigned long long *bytes) +{ + unsigned long long bytes_needed; + + if (ocfs2_uses_extended_slot_map(osb)) { + bytes_needed = osb->max_slots * + sizeof(struct ocfs2_extended_slot); + } else { + bytes_needed = osb->max_slots * sizeof(__le16); + } + if (bytes_needed > i_size_read(inode)) { + mlog(ML_ERROR, + "Slot map file is too small! (size %llu, needed %llu)\n", + i_size_read(inode), bytes_needed); + return -ENOSPC; + } + + *bytes = bytes_needed; + return 0; +} + +/* try to find global node in the slot info. Returns -ENOENT + * if nothing is found. */ +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + unsigned int node_num) +{ + int i, ret = -ENOENT; + + for(i = 0; i < si->si_num_slots; i++) { + if (si->si_slots[i].sl_valid && + (node_num == si->si_slots[i].sl_node_num)) { + ret = i; + break; + } + } + + return ret; +} + +static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, + int preferred) +{ + int i, ret = -ENOSPC; + + if ((preferred >= 0) && (preferred < si->si_num_slots)) { + if (!si->si_slots[preferred].sl_valid) { + ret = preferred; + goto out; + } + } + + for(i = 0; i < si->si_num_slots; i++) { + if (!si->si_slots[i].sl_valid) { + ret = i; + break; + } + } +out: + return ret; +} + +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) +{ + int slot; + struct ocfs2_slot_info *si = osb->slot_info; + + spin_lock(&osb->osb_lock); + slot = __ocfs2_node_num_to_slot(si, node_num); + spin_unlock(&osb->osb_lock); + + return slot; +} + +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + assert_spin_locked(&osb->osb_lock); + + BUG_ON(slot_num < 0); + BUG_ON(slot_num >= osb->max_slots); + + if (!si->si_slots[slot_num].sl_valid) + return -ENOENT; + + *node_num = si->si_slots[slot_num].sl_node_num; + return 0; +} + +static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) +{ + unsigned int i; + + if (si == NULL) + return; + + if (si->si_inode) + iput(si->si_inode); + if (si->si_bh) { + for (i = 0; i < si->si_blocks; i++) { + if (si->si_bh[i]) { + brelse(si->si_bh[i]); + si->si_bh[i] = NULL; + } + } + kfree(si->si_bh); + } + + kfree(si); +} + +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, slot_num); + spin_unlock(&osb->osb_lock); + + return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num); +} + +static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, + struct ocfs2_slot_info *si) +{ + int status = 0; + u64 blkno; + unsigned long long blocks, bytes = 0; + unsigned int i; + struct buffer_head *bh; + + status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes); + if (status) + goto bail; + + blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes); + BUG_ON(blocks > UINT_MAX); + si->si_blocks = blocks; + if (!si->si_blocks) + goto bail; + + if (si->si_extended) + si->si_slots_per_block = + (osb->sb->s_blocksize / + sizeof(struct ocfs2_extended_slot)); + else + si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16); + + /* The size checks above should ensure this */ + BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); + + trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); + + si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!si->si_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + for (i = 0; i < si->si_blocks; i++) { + status = ocfs2_extent_map_get_blocks(si->si_inode, i, + &blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i); + + bh = NULL; /* Acquire a fresh bh */ + status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, + 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_bh[i] = bh; + } + +bail: + return status; +} + +int ocfs2_init_slot_info(struct ocfs2_super *osb) +{ + int status; + struct inode *inode = NULL; + struct ocfs2_slot_info *si; + + si = kzalloc(sizeof(struct ocfs2_slot_info) + + (sizeof(struct ocfs2_slot) * osb->max_slots), + GFP_KERNEL); + if (!si) { + status = -ENOMEM; + mlog_errno(status); + return status; + } + + si->si_extended = ocfs2_uses_extended_slot_map(osb); + si->si_num_slots = osb->max_slots; + si->si_slots = (struct ocfs2_slot *)((char *)si + + sizeof(struct ocfs2_slot_info)); + + inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + si->si_inode = inode; + status = ocfs2_map_slot_buffers(osb, si); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + osb->slot_info = (struct ocfs2_slot_info *)si; +bail: + if (status < 0) + __ocfs2_free_slot_info(si); + + return status; +} + +void ocfs2_free_slot_info(struct ocfs2_super *osb) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + osb->slot_info = NULL; + __ocfs2_free_slot_info(si); +} + +int ocfs2_find_slot(struct ocfs2_super *osb) +{ + int status; + int slot; + struct ocfs2_slot_info *si; + + si = osb->slot_info; + + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot < 0) { + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (slot < 0) { + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " + "allocated to this node!\n", slot, osb->dev_str); + + ocfs2_set_slot(si, slot, osb->node_num); + osb->slot_num = slot; + spin_unlock(&osb->osb_lock); + + trace_ocfs2_find_slot(osb->slot_num); + + status = ocfs2_update_disk_slot(osb, si, osb->slot_num); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +void ocfs2_put_slot(struct ocfs2_super *osb) +{ + int status, slot_num; + struct ocfs2_slot_info *si = osb->slot_info; + + if (!si) + return; + + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + + slot_num = osb->slot_num; + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + + status = ocfs2_update_disk_slot(osb, si, slot_num); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + ocfs2_free_slot_info(osb); +} + diff --git a/kernel/fs/ocfs2/slot_map.h b/kernel/fs/ocfs2/slot_map.h new file mode 100644 index 000000000..601c95fd7 --- /dev/null +++ b/kernel/fs/ocfs2/slot_map.h @@ -0,0 +1,44 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slotmap.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef SLOTMAP_H +#define SLOTMAP_H + +int ocfs2_init_slot_info(struct ocfs2_super *osb); +void ocfs2_free_slot_info(struct ocfs2_super *osb); + +int ocfs2_find_slot(struct ocfs2_super *osb); +void ocfs2_put_slot(struct ocfs2_super *osb); + +int ocfs2_refresh_slot_info(struct ocfs2_super *osb); + +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num); + +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num); + +#endif diff --git a/kernel/fs/ocfs2/stack_o2cb.c b/kernel/fs/ocfs2/stack_o2cb.c new file mode 100644 index 000000000..220cae7bb --- /dev/null +++ b/kernel/fs/ocfs2/stack_o2cb.c @@ -0,0 +1,449 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stack_o2cb.c + * + * Code which interfaces ocfs2 with the o2cb stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include + +/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ +#include + +#include "cluster/masklog.h" +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" +#include "cluster/tcp.h" + +#include "stackglue.h" + +struct o2dlm_private { + struct dlm_eviction_cb op_eviction_cb; +}; + +static struct ocfs2_stack_plugin o2cb_stack; + +/* These should be identical */ +#if (DLM_LOCK_IV != LKM_IVMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_NL != LKM_NLMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_CR != LKM_CRMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_CW != LKM_CWMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_PR != LKM_PRMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_PW != LKM_PWMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_EX != LKM_EXMODE) +# error Lock modes do not match +#endif +static inline int mode_to_o2dlm(int mode) +{ + BUG_ON(mode > LKM_MAXMODE); + + return mode; +} + +#define map_flag(_generic, _o2dlm) \ + if (flags & (_generic)) { \ + flags &= ~(_generic); \ + o2dlm_flags |= (_o2dlm); \ + } +static int flags_to_o2dlm(u32 flags) +{ + int o2dlm_flags = 0; + + map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); + map_flag(DLM_LKF_CANCEL, LKM_CANCEL); + map_flag(DLM_LKF_CONVERT, LKM_CONVERT); + map_flag(DLM_LKF_VALBLK, LKM_VALBLK); + map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); + map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); + map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); + map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); + map_flag(DLM_LKF_LOCAL, LKM_LOCAL); + + /* map_flag() should have cleared every flag passed in */ + BUG_ON(flags != 0); + + return o2dlm_flags; +} +#undef map_flag + +/* + * Map an o2dlm status to standard errno values. + * + * o2dlm only uses a handful of these, and returns even fewer to the + * caller. Still, we try to assign sane values to each error. + * + * The following value pairs have special meanings to dlmglue, thus + * the right hand side needs to stay unique - never duplicate the + * mapping elsewhere in the table! + * + * DLM_NORMAL: 0 + * DLM_NOTQUEUED: -EAGAIN + * DLM_CANCELGRANT: -EBUSY + * DLM_CANCEL: -DLM_ECANCEL + */ +/* Keep in sync with dlmapi.h */ +static int status_map[] = { + [DLM_NORMAL] = 0, /* Success */ + [DLM_GRANTED] = -EINVAL, + [DLM_DENIED] = -EACCES, + [DLM_DENIED_NOLOCKS] = -EACCES, + [DLM_WORKING] = -EACCES, + [DLM_BLOCKED] = -EINVAL, + [DLM_BLOCKED_ORPHAN] = -EINVAL, + [DLM_DENIED_GRACE_PERIOD] = -EACCES, + [DLM_SYSERR] = -ENOMEM, /* It is what it is */ + [DLM_NOSUPPORT] = -EPROTO, + [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */ + [DLM_IVLOCKID] = -EINVAL, + [DLM_SYNC] = -EINVAL, + [DLM_BADTYPE] = -EINVAL, + [DLM_BADRESOURCE] = -EINVAL, + [DLM_MAXHANDLES] = -ENOMEM, + [DLM_NOCLINFO] = -EINVAL, + [DLM_NOLOCKMGR] = -EINVAL, + [DLM_NOPURGED] = -EINVAL, + [DLM_BADARGS] = -EINVAL, + [DLM_VOID] = -EINVAL, + [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */ + [DLM_IVBUFLEN] = -EINVAL, + [DLM_CVTUNGRANT] = -EPERM, + [DLM_BADPARAM] = -EINVAL, + [DLM_VALNOTVALID] = -EINVAL, + [DLM_REJECTED] = -EPERM, + [DLM_ABORT] = -EINVAL, + [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */ + [DLM_IVRESHANDLE] = -EINVAL, + [DLM_DEADLOCK] = -EDEADLK, + [DLM_DENIED_NOASTS] = -EINVAL, + [DLM_FORWARD] = -EINVAL, + [DLM_TIMEOUT] = -ETIMEDOUT, + [DLM_IVGROUPID] = -EINVAL, + [DLM_VERS_CONFLICT] = -EOPNOTSUPP, + [DLM_BAD_DEVICE_PATH] = -ENOENT, + [DLM_NO_DEVICE_PERMISSION] = -EPERM, + [DLM_NO_CONTROL_DEVICE] = -ENOENT, + [DLM_RECOVERING] = -ENOTCONN, + [DLM_MIGRATING] = -ERESTART, + [DLM_MAXSTATS] = -EINVAL, +}; + +static int dlm_status_to_errno(enum dlm_status status) +{ + BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); + + return status_map[status]; +} + +static void o2dlm_lock_ast_wrapper(void *astarg) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); +} + +static void o2dlm_blocking_ast_wrapper(void *astarg, int level) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); +} + +static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + int error = dlm_status_to_errno(status); + + /* + * In o2dlm, you can get both the lock_ast() for the lock being + * granted and the unlock_ast() for the CANCEL failing. A + * successful cancel sends DLM_NORMAL here. If the + * lock grant happened before the cancel arrived, you get + * DLM_CANCELGRANT. + * + * There's no need for the double-ast. If we see DLM_CANCELGRANT, + * we just ignore it. We expect the lock_ast() to handle the + * granted lock. + */ + if (status == DLM_CANCELGRANT) + return; + + lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error); +} + +static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + enum dlm_status status; + int o2dlm_mode = mode_to_o2dlm(mode); + int o2dlm_flags = flags_to_o2dlm(flags); + int ret; + + status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, + o2dlm_flags, name, namelen, + o2dlm_lock_ast_wrapper, lksb, + o2dlm_blocking_ast_wrapper); + ret = dlm_status_to_errno(status); + return ret; +} + +static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + enum dlm_status status; + int o2dlm_flags = flags_to_o2dlm(flags); + int ret; + + status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, + o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb); + ret = dlm_status_to_errno(status); + return ret; +} + +static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return dlm_status_to_errno(lksb->lksb_o2dlm.status); +} + +/* + * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB + * contents, it will zero out the LVB. Thus the caller can always trust + * the contents. + */ +static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + return 1; +} + +static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + return (void *)(lksb->lksb_o2dlm.lvb); +} + +static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ + dlm_print_one_lock(lksb->lksb_o2dlm.lockid); +} + +/* + * Check if this node is heartbeating and is connected to all other + * heartbeating nodes. + */ +static int o2cb_cluster_check(void) +{ + u8 node_num; + int i; + unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + node_num = o2nm_this_node(); + if (node_num == O2NM_MAX_NODES) { + printk(KERN_ERR "o2cb: This node has not been configured.\n"); + return -EINVAL; + } + + /* + * o2dlm expects o2net sockets to be created. If not, then + * dlm_join_domain() fails with a stack of errors which are both cryptic + * and incomplete. The idea here is to detect upfront whether we have + * managed to connect to all nodes or not. If not, then list the nodes + * to allow the user to check the configuration (incorrect IP, firewall, + * etc.) Yes, this is racy. But its not the end of the world. + */ +#define O2CB_MAP_STABILIZE_COUNT 60 + for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { + o2hb_fill_node_map(hbmap, sizeof(hbmap)); + if (!test_bit(node_num, hbmap)) { + printk(KERN_ERR "o2cb: %s heartbeat has not been " + "started.\n", (o2hb_global_heartbeat_active() ? + "Global" : "Local")); + return -EINVAL; + } + o2net_fill_node_map(netmap, sizeof(netmap)); + /* Force set the current node to allow easy compare */ + set_bit(node_num, netmap); + if (!memcmp(hbmap, netmap, sizeof(hbmap))) + return 0; + if (i < O2CB_MAP_STABILIZE_COUNT - 1) + msleep(1000); + } + + printk(KERN_ERR "o2cb: This node could not connect to nodes:"); + i = -1; + while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (!test_bit(i, netmap)) + printk(" %u", i); + } + printk(".\n"); + + return -ENOTCONN; +} + +/* + * Called from the dlm when it's about to evict a node. This is how the + * classic stack signals node death. + */ +static void o2dlm_eviction_cb(int node_num, void *data) +{ + struct ocfs2_cluster_connection *conn = data; + + printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", + node_num, conn->cc_namelen, conn->cc_name); + + conn->cc_recovery_handler(node_num, conn->cc_recovery_data); +} + +static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) +{ + int rc = 0; + u32 dlm_key; + struct dlm_ctxt *dlm; + struct o2dlm_private *priv; + struct dlm_protocol_version fs_version; + + BUG_ON(conn == NULL); + BUG_ON(conn->cc_proto == NULL); + + /* Ensure cluster stack is up and all nodes are connected */ + rc = o2cb_cluster_check(); + if (rc) { + printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " + "before retrying.\n"); + goto out; + } + + priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL); + if (!priv) { + rc = -ENOMEM; + goto out_free; + } + + /* This just fills the structure in. It is safe to pass conn. */ + dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb, + conn); + + conn->cc_private = priv; + + /* used by the dlm code to make message headers unique, each + * node in this domain must agree on this. */ + dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); + fs_version.pv_major = conn->cc_version.pv_major; + fs_version.pv_minor = conn->cc_version.pv_minor; + + dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version); + if (IS_ERR(dlm)) { + rc = PTR_ERR(dlm); + mlog_errno(rc); + goto out_free; + } + + conn->cc_version.pv_major = fs_version.pv_major; + conn->cc_version.pv_minor = fs_version.pv_minor; + conn->cc_lockspace = dlm; + + dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); + +out_free: + if (rc) + kfree(conn->cc_private); + +out: + return rc; +} + +static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + struct dlm_ctxt *dlm = conn->cc_lockspace; + struct o2dlm_private *priv = conn->cc_private; + + dlm_unregister_eviction_cb(&priv->op_eviction_cb); + conn->cc_private = NULL; + kfree(priv); + + dlm_unregister_domain(dlm); + conn->cc_lockspace = NULL; + + return 0; +} + +static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) +{ + int node_num; + + node_num = o2nm_this_node(); + if (node_num == O2NM_INVALID_NODE_NUM) + return -ENOENT; + + if (node_num >= O2NM_MAX_NODES) + return -EOVERFLOW; + + *node = node_num; + return 0; +} + +static struct ocfs2_stack_operations o2cb_stack_ops = { + .connect = o2cb_cluster_connect, + .disconnect = o2cb_cluster_disconnect, + .this_node = o2cb_cluster_this_node, + .dlm_lock = o2cb_dlm_lock, + .dlm_unlock = o2cb_dlm_unlock, + .lock_status = o2cb_dlm_lock_status, + .lvb_valid = o2cb_dlm_lvb_valid, + .lock_lvb = o2cb_dlm_lvb, + .dump_lksb = o2cb_dump_lksb, +}; + +static struct ocfs2_stack_plugin o2cb_stack = { + .sp_name = "o2cb", + .sp_ops = &o2cb_stack_ops, + .sp_owner = THIS_MODULE, +}; + +static int __init o2cb_stack_init(void) +{ + return ocfs2_stack_glue_register(&o2cb_stack); +} + +static void __exit o2cb_stack_exit(void) +{ + ocfs2_stack_glue_unregister(&o2cb_stack); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack"); +MODULE_LICENSE("GPL"); +module_init(o2cb_stack_init); +module_exit(o2cb_stack_exit); diff --git a/kernel/fs/ocfs2/stack_user.c b/kernel/fs/ocfs2/stack_user.c new file mode 100644 index 000000000..2768eb1da --- /dev/null +++ b/kernel/fs/ocfs2/stack_user.c @@ -0,0 +1,1134 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stack_user.c + * + * Code which interfaces ocfs2 with fs/dlm and a userspace stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "stackglue.h" + +#include + +/* + * The control protocol starts with a handshake. Until the handshake + * is complete, the control device will fail all write(2)s. + * + * The handshake is simple. First, the client reads until EOF. Each line + * of output is a supported protocol tag. All protocol tags are a single + * character followed by a two hex digit version number. Currently the + * only things supported is T01, for "Text-base version 0x01". Next, the + * client writes the version they would like to use, including the newline. + * Thus, the protocol tag is 'T01\n'. If the version tag written is + * unknown, -EINVAL is returned. Once the negotiation is complete, the + * client can start sending messages. + * + * The T01 protocol has three messages. First is the "SETN" message. + * It has the following syntax: + * + * SETN<8-char-hex-nodenum> + * + * This is 14 characters. + * + * The "SETN" message must be the first message following the protocol. + * It tells ocfs2_control the local node number. + * + * Next comes the "SETV" message. It has the following syntax: + * + * SETV<2-char-hex-major><2-char-hex-minor> + * + * This is 11 characters. + * + * The "SETV" message sets the filesystem locking protocol version as + * negotiated by the client. The client negotiates based on the maximum + * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major + * number from the "SETV" message must match + * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number + * must be less than or equal to ...sp_max_version.pv_minor. + * + * Once this information has been set, mounts will be allowed. From this + * point on, the "DOWN" message can be sent for node down notification. + * It has the following syntax: + * + * DOWN<32-char-cap-hex-uuid><8-char-hex-nodenum> + * + * eg: + * + * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n + * + * This is 47 characters. + */ + +/* + * Whether or not the client has done the handshake. + * For now, we have just one protocol version. + */ +#define OCFS2_CONTROL_PROTO "T01\n" +#define OCFS2_CONTROL_PROTO_LEN 4 + +/* Handshake states */ +#define OCFS2_CONTROL_HANDSHAKE_INVALID (0) +#define OCFS2_CONTROL_HANDSHAKE_READ (1) +#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2) +#define OCFS2_CONTROL_HANDSHAKE_VALID (3) + +/* Messages */ +#define OCFS2_CONTROL_MESSAGE_OP_LEN 4 +#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN" +#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14 +#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV" +#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11 +#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN" +#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47 +#define OCFS2_TEXT_UUID_LEN 32 +#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 +#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +#define VERSION_LOCK "version_lock" + +enum ocfs2_connection_type { + WITH_CONTROLD, + NO_CONTROLD +}; + +/* + * ocfs2_live_connection is refcounted because the filesystem and + * miscdevice sides can detach in different order. Let's just be safe. + */ +struct ocfs2_live_connection { + struct list_head oc_list; + struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; + atomic_t oc_this_node; + int oc_our_slot; + struct dlm_lksb oc_version_lksb; + char oc_lvb[DLM_LVB_LEN]; + struct completion oc_sync_wait; + wait_queue_head_t oc_wait; +}; + +struct ocfs2_control_private { + struct list_head op_list; + int op_state; + int op_this_node; + struct ocfs2_protocol_version op_proto; +}; + +/* SETN<8-char-hex-nodenum> */ +struct ocfs2_control_message_setn { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space; + char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; + char newline; +}; + +/* SETV<2-char-hex-major><2-char-hex-minor> */ +struct ocfs2_control_message_setv { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space1; + char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; + char space2; + char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; + char newline; +}; + +/* DOWN<32-char-cap-hex-uuid><8-char-hex-nodenum> */ +struct ocfs2_control_message_down { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space1; + char uuid[OCFS2_TEXT_UUID_LEN]; + char space2; + char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; + char newline; +}; + +union ocfs2_control_message { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + struct ocfs2_control_message_setn u_setn; + struct ocfs2_control_message_setv u_setv; + struct ocfs2_control_message_down u_down; +}; + +static struct ocfs2_stack_plugin ocfs2_user_plugin; + +static atomic_t ocfs2_control_opened; +static int ocfs2_control_this_node = -1; +static struct ocfs2_protocol_version running_proto; + +static LIST_HEAD(ocfs2_live_connection_list); +static LIST_HEAD(ocfs2_control_private_list); +static DEFINE_MUTEX(ocfs2_control_lock); + +static inline void ocfs2_control_set_handshake_state(struct file *file, + int state) +{ + struct ocfs2_control_private *p = file->private_data; + p->op_state = state; +} + +static inline int ocfs2_control_get_handshake_state(struct file *file) +{ + struct ocfs2_control_private *p = file->private_data; + return p->op_state; +} + +static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) +{ + size_t len = strlen(name); + struct ocfs2_live_connection *c; + + BUG_ON(!mutex_is_locked(&ocfs2_control_lock)); + + list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) { + if ((c->oc_conn->cc_namelen == len) && + !strncmp(c->oc_conn->cc_name, name, len)) + return c; + } + + return NULL; +} + +/* + * ocfs2_live_connection structures are created underneath the ocfs2 + * mount path. Since the VFS prevents multiple calls to + * fill_super(), we can't get dupes here. + */ +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection *c) +{ + int rc = 0; + + mutex_lock(&ocfs2_control_lock); + c->oc_conn = conn; + + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) + list_add(&c->oc_list, &ocfs2_live_connection_list); + else { + printk(KERN_ERR + "ocfs2: Userspace control daemon is not present\n"); + rc = -ESRCH; + } + + mutex_unlock(&ocfs2_control_lock); + return rc; +} + +/* + * This function disconnects the cluster connection from ocfs2_control. + * Afterwards, userspace can't affect the cluster connection. + */ +static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c) +{ + mutex_lock(&ocfs2_control_lock); + list_del_init(&c->oc_list); + c->oc_conn = NULL; + mutex_unlock(&ocfs2_control_lock); + + kfree(c); +} + +static int ocfs2_control_cfu(void *target, size_t target_len, + const char __user *buf, size_t count) +{ + /* The T01 expects write(2) calls to have exactly one command */ + if ((count != target_len) || + (count > sizeof(union ocfs2_control_message))) + return -EINVAL; + + if (copy_from_user(target, buf, target_len)) + return -EFAULT; + + return 0; +} + +static ssize_t ocfs2_control_validate_protocol(struct file *file, + const char __user *buf, + size_t count) +{ + ssize_t ret; + char kbuf[OCFS2_CONTROL_PROTO_LEN]; + + ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN, + buf, count); + if (ret) + return ret; + + if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN)) + return -EINVAL; + + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_PROTOCOL); + + return count; +} + +static void ocfs2_control_send_down(const char *uuid, + int nodenum) +{ + struct ocfs2_live_connection *c; + + mutex_lock(&ocfs2_control_lock); + + c = ocfs2_connection_find(uuid); + if (c) { + BUG_ON(c->oc_conn == NULL); + c->oc_conn->cc_recovery_handler(nodenum, + c->oc_conn->cc_recovery_data); + } + + mutex_unlock(&ocfs2_control_lock); +} + +/* + * Called whenever configuration elements are sent to /dev/ocfs2_control. + * If all configuration elements are present, try to set the global + * values. If there is a problem, return an error. Skip any missing + * elements, and only bump ocfs2_control_opened when we have all elements + * and are successful. + */ +static int ocfs2_control_install_private(struct file *file) +{ + int rc = 0; + int set_p = 1; + struct ocfs2_control_private *p = file->private_data; + + BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL); + + mutex_lock(&ocfs2_control_lock); + + if (p->op_this_node < 0) { + set_p = 0; + } else if ((ocfs2_control_this_node >= 0) && + (ocfs2_control_this_node != p->op_this_node)) { + rc = -EINVAL; + goto out_unlock; + } + + if (!p->op_proto.pv_major) { + set_p = 0; + } else if (!list_empty(&ocfs2_live_connection_list) && + ((running_proto.pv_major != p->op_proto.pv_major) || + (running_proto.pv_minor != p->op_proto.pv_minor))) { + rc = -EINVAL; + goto out_unlock; + } + + if (set_p) { + ocfs2_control_this_node = p->op_this_node; + running_proto.pv_major = p->op_proto.pv_major; + running_proto.pv_minor = p->op_proto.pv_minor; + } + +out_unlock: + mutex_unlock(&ocfs2_control_lock); + + if (!rc && set_p) { + /* We set the global values successfully */ + atomic_inc(&ocfs2_control_opened); + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_VALID); + } + + return rc; +} + +static int ocfs2_control_get_this_node(void) +{ + int rc; + + mutex_lock(&ocfs2_control_lock); + if (ocfs2_control_this_node < 0) + rc = -EINVAL; + else + rc = ocfs2_control_this_node; + mutex_unlock(&ocfs2_control_lock); + + return rc; +} + +static int ocfs2_control_do_setnode_msg(struct file *file, + struct ocfs2_control_message_setn *msg) +{ + long nodenum; + char *ptr = NULL; + struct ocfs2_control_private *p = file->private_data; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_PROTOCOL) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space != ' ') || (msg->newline != '\n')) + return -EINVAL; + msg->space = msg->newline = '\0'; + + nodenum = simple_strtol(msg->nodestr, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + + if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || + (nodenum > INT_MAX) || (nodenum < 0)) + return -ERANGE; + p->op_this_node = nodenum; + + return ocfs2_control_install_private(file); +} + +static int ocfs2_control_do_setversion_msg(struct file *file, + struct ocfs2_control_message_setv *msg) + { + long major, minor; + char *ptr = NULL; + struct ocfs2_control_private *p = file->private_data; + struct ocfs2_protocol_version *max = + &ocfs2_user_plugin.sp_max_proto; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_PROTOCOL) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space1 != ' ') || (msg->space2 != ' ') || + (msg->newline != '\n')) + return -EINVAL; + msg->space1 = msg->space2 = msg->newline = '\0'; + + major = simple_strtol(msg->major, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + minor = simple_strtol(msg->minor, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + + /* + * The major must be between 1 and 255, inclusive. The minor + * must be between 0 and 255, inclusive. The version passed in + * must be within the maximum version supported by the filesystem. + */ + if ((major == LONG_MIN) || (major == LONG_MAX) || + (major > (u8)-1) || (major < 1)) + return -ERANGE; + if ((minor == LONG_MIN) || (minor == LONG_MAX) || + (minor > (u8)-1) || (minor < 0)) + return -ERANGE; + if ((major != max->pv_major) || + (minor > max->pv_minor)) + return -EINVAL; + + p->op_proto.pv_major = major; + p->op_proto.pv_minor = minor; + + return ocfs2_control_install_private(file); +} + +static int ocfs2_control_do_down_msg(struct file *file, + struct ocfs2_control_message_down *msg) +{ + long nodenum; + char *p = NULL; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_VALID) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space1 != ' ') || (msg->space2 != ' ') || + (msg->newline != '\n')) + return -EINVAL; + msg->space1 = msg->space2 = msg->newline = '\0'; + + nodenum = simple_strtol(msg->nodestr, &p, 16); + if (!p || *p) + return -EINVAL; + + if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || + (nodenum > INT_MAX) || (nodenum < 0)) + return -ERANGE; + + ocfs2_control_send_down(msg->uuid, nodenum); + + return 0; +} + +static ssize_t ocfs2_control_message(struct file *file, + const char __user *buf, + size_t count) +{ + ssize_t ret; + union ocfs2_control_message msg; + + /* Try to catch padding issues */ + WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) != + (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1))); + + memset(&msg, 0, sizeof(union ocfs2_control_message)); + ret = ocfs2_control_cfu(&msg, count, buf, count); + if (ret) + goto out; + + if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn); + else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv); + else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_down_msg(file, &msg.u_down); + else + ret = -EINVAL; + +out: + return ret ? ret : count; +} + +static ssize_t ocfs2_control_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + ssize_t ret; + + switch (ocfs2_control_get_handshake_state(file)) { + case OCFS2_CONTROL_HANDSHAKE_INVALID: + ret = -EINVAL; + break; + + case OCFS2_CONTROL_HANDSHAKE_READ: + ret = ocfs2_control_validate_protocol(file, buf, + count); + break; + + case OCFS2_CONTROL_HANDSHAKE_PROTOCOL: + case OCFS2_CONTROL_HANDSHAKE_VALID: + ret = ocfs2_control_message(file, buf, count); + break; + + default: + BUG(); + ret = -EIO; + break; + } + + return ret; +} + +/* + * This is a naive version. If we ever have a new protocol, we'll expand + * it. Probably using seq_file. + */ +static ssize_t ocfs2_control_read(struct file *file, + char __user *buf, + size_t count, + loff_t *ppos) +{ + ssize_t ret; + + ret = simple_read_from_buffer(buf, count, ppos, + OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN); + + /* Have we read the whole protocol list? */ + if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN) + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_READ); + + return ret; +} + +static int ocfs2_control_release(struct inode *inode, struct file *file) +{ + struct ocfs2_control_private *p = file->private_data; + + mutex_lock(&ocfs2_control_lock); + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_VALID) + goto out; + + if (atomic_dec_and_test(&ocfs2_control_opened)) { + if (!list_empty(&ocfs2_live_connection_list)) { + /* XXX: Do bad things! */ + printk(KERN_ERR + "ocfs2: Unexpected release of ocfs2_control!\n" + " Loss of cluster connection requires " + "an emergency restart!\n"); + emergency_restart(); + } + /* + * Last valid close clears the node number and resets + * the locking protocol version + */ + ocfs2_control_this_node = -1; + running_proto.pv_major = 0; + running_proto.pv_minor = 0; + } + +out: + list_del_init(&p->op_list); + file->private_data = NULL; + + mutex_unlock(&ocfs2_control_lock); + + kfree(p); + + return 0; +} + +static int ocfs2_control_open(struct inode *inode, struct file *file) +{ + struct ocfs2_control_private *p; + + p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL); + if (!p) + return -ENOMEM; + p->op_this_node = -1; + + mutex_lock(&ocfs2_control_lock); + file->private_data = p; + list_add(&p->op_list, &ocfs2_control_private_list); + mutex_unlock(&ocfs2_control_lock); + + return 0; +} + +static const struct file_operations ocfs2_control_fops = { + .open = ocfs2_control_open, + .release = ocfs2_control_release, + .read = ocfs2_control_read, + .write = ocfs2_control_write, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + +static struct miscdevice ocfs2_control_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ocfs2_control", + .fops = &ocfs2_control_fops, +}; + +static int ocfs2_control_init(void) +{ + int rc; + + atomic_set(&ocfs2_control_opened, 0); + + rc = misc_register(&ocfs2_control_device); + if (rc) + printk(KERN_ERR + "ocfs2: Unable to register ocfs2_control device " + "(errno %d)\n", + -rc); + + return rc; +} + +static void ocfs2_control_exit(void) +{ + int rc; + + rc = misc_deregister(&ocfs2_control_device); + if (rc) + printk(KERN_ERR + "ocfs2: Unable to deregister ocfs2_control device " + "(errno %d)\n", + -rc); +} + +static void fsdlm_lock_ast_wrapper(void *astarg) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + int status = lksb->lksb_fsdlm.sb_status; + + /* + * For now we're punting on the issue of other non-standard errors + * where we can't tell if the unlock_ast or lock_ast should be called. + * The main "other error" that's possible is EINVAL which means the + * function was called with invalid args, which shouldn't be possible + * since the caller here is under our control. Other non-standard + * errors probably fall into the same category, or otherwise are fatal + * which means we can't carry on anyway. + */ + + if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) + lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0); + else + lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); +} + +static void fsdlm_blocking_ast_wrapper(void *astarg, int level) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); +} + +static int user_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + int ret; + + if (!lksb->lksb_fsdlm.sb_lvbptr) + lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + + sizeof(struct dlm_lksb); + + ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); + return ret; +} + +static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + int ret; + + ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); + return ret; +} + +static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return lksb->lksb_fsdlm.sb_status; +} + +static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; + + return !invalid; +} + +static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + if (!lksb->lksb_fsdlm.sb_lvbptr) + lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + + sizeof(struct dlm_lksb); + return (void *)(lksb->lksb_fsdlm.sb_lvbptr); +} + +static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ +} + +static int user_plock(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl) +{ + /* + * This more or less just demuxes the plock request into any + * one of three dlm calls. + * + * Internally, fs/dlm will pass these to a misc device, which + * a userspace daemon will read and write to. + * + * For now, cancel requests (which happen internally only), + * are turned into unlocks. Most of this function taken from + * gfs2_lock. + */ + + if (cmd == F_CANCELLK) { + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + + if (IS_GETLK(cmd)) + return dlm_posix_get(conn->cc_lockspace, ino, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); + else + return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); +} + +/* + * Compare a requested locking protocol version against the current one. + * + * If the major numbers are different, they are incompatible. + * If the current minor is greater than the request, they are incompatible. + * If the current minor is less than or equal to the request, they are + * compatible, and the requester should run at the current minor version. + */ +static int fs_protocol_compare(struct ocfs2_protocol_version *existing, + struct ocfs2_protocol_version *request) +{ + if (existing->pv_major != request->pv_major) + return 1; + + if (existing->pv_minor > request->pv_minor) + return 1; + + if (existing->pv_minor < request->pv_minor) + request->pv_minor = existing->pv_minor; + + return 0; +} + +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + ver->pv_major = pv->pv_major; + ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + pv->pv_major = ver->pv_major; + pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, + struct dlm_lksb *lksb, char *name) +{ + int error; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); + if (error) { + printk(KERN_ERR "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + printk(KERN_ERR "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, + int mode, uint32_t flags, + struct dlm_lksb *lksb, char *name) +{ + int error, status; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, + name, strlen(name), + 0, sync_wait_cb, conn, NULL); + if (error) { + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, + int flags) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_lock(conn, mode, flags, + &lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + * version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + * taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ + int ret; + struct ocfs2_live_connection *lc = conn->cc_private; + struct ocfs2_protocol_version pv; + + running_proto.pv_major = + ocfs2_user_plugin.sp_max_proto.pv_major; + running_proto.pv_minor = + ocfs2_user_plugin.sp_max_proto.pv_minor; + + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; + ret = version_lock(conn, DLM_LOCK_EX, + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); + if (!ret) { + conn->cc_version.pv_major = running_proto.pv_major; + conn->cc_version.pv_minor = running_proto.pv_minor; + version_to_lvb(&running_proto, lc->oc_lvb); + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + } else if (ret == -EAGAIN) { + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); + if (ret) + goto out; + lvb_to_version(lc->oc_lvb, &pv); + + if ((pv.pv_major != running_proto.pv_major) || + (pv.pv_minor > running_proto.pv_minor)) { + ret = -EINVAL; + goto out; + } + + conn->cc_version.pv_major = pv.pv_major; + conn->cc_version.pv_minor = pv.pv_minor; + } +out: + return ret; +} + +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct ocfs2_cluster_connection *conn = arg; + printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", + slot->nodeid, slot->slot); + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + int i; + + for (i = 0; i < num_slots; i++) + if (slots[i].slot == our_slot) { + atomic_set(&lc->oc_this_node, slots[i].nodeid); + break; + } + + lc->oc_our_slot = our_slot; + wake_up(&lc->oc_wait); +} + +static const struct dlm_lockspace_ops ocfs2_ls_ops = { + .recover_prep = user_recover_prep, + .recover_slot = user_recover_slot, + .recover_done = user_recover_done, +}; + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + version_unlock(conn); + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + +static int user_cluster_connect(struct ocfs2_cluster_connection *conn) +{ + dlm_lockspace_t *fsdlm; + struct ocfs2_live_connection *lc; + int rc, ops_rv; + + BUG_ON(conn == NULL); + + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!lc) + return -ENOMEM; + + init_waitqueue_head(&lc->oc_wait); + init_completion(&lc->oc_sync_wait); + atomic_set(&lc->oc_this_node, 0); + conn->cc_private = lc; + lc->oc_type = NO_CONTROLD; + + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, + DLM_LSFL_FS, DLM_LVB_LEN, + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); + if (rc) + goto out; + + if (ops_rv == -EOPNOTSUPP) { + lc->oc_type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { + rc = ops_rv; + goto out; + } + conn->cc_lockspace = fsdlm; + + rc = ocfs2_live_connection_attach(conn, lc); + if (rc) + goto out; + + if (lc->oc_type == NO_CONTROLD) { + rc = get_protocol_version(conn); + if (rc) { + printk(KERN_ERR "ocfs2: Could not determine" + " locking version\n"); + user_cluster_disconnect(conn); + goto out; + } + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); + } + + /* + * running_proto must have been set before we allowed any mounts + * to proceed. + */ + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { + printk(KERN_ERR + "Unable to mount with fs locking protocol version " + "%u.%u because negotiated protocol is %u.%u\n", + conn->cc_version.pv_major, conn->cc_version.pv_minor, + running_proto.pv_major, running_proto.pv_minor); + rc = -EPROTO; + ocfs2_live_connection_drop(lc); + lc = NULL; + } + +out: + if (rc) + kfree(lc); + return rc; +} + + +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *this_node) +{ + int rc; + struct ocfs2_live_connection *lc = conn->cc_private; + + if (lc->oc_type == WITH_CONTROLD) + rc = ocfs2_control_get_this_node(); + else if (lc->oc_type == NO_CONTROLD) + rc = atomic_read(&lc->oc_this_node); + else + rc = -EINVAL; + + if (rc < 0) + return rc; + + *this_node = rc; + return 0; +} + +static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { + .connect = user_cluster_connect, + .disconnect = user_cluster_disconnect, + .this_node = user_cluster_this_node, + .dlm_lock = user_dlm_lock, + .dlm_unlock = user_dlm_unlock, + .lock_status = user_dlm_lock_status, + .lvb_valid = user_dlm_lvb_valid, + .lock_lvb = user_dlm_lvb, + .plock = user_plock, + .dump_lksb = user_dlm_dump_lksb, +}; + +static struct ocfs2_stack_plugin ocfs2_user_plugin = { + .sp_name = "user", + .sp_ops = &ocfs2_user_plugin_ops, + .sp_owner = THIS_MODULE, +}; + + +static int __init ocfs2_user_plugin_init(void) +{ + int rc; + + rc = ocfs2_control_init(); + if (!rc) { + rc = ocfs2_stack_glue_register(&ocfs2_user_plugin); + if (rc) + ocfs2_control_exit(); + } + + return rc; +} + +static void __exit ocfs2_user_plugin_exit(void) +{ + ocfs2_stack_glue_unregister(&ocfs2_user_plugin); + ocfs2_control_exit(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks"); +MODULE_LICENSE("GPL"); +module_init(ocfs2_user_plugin_init); +module_exit(ocfs2_user_plugin_exit); diff --git a/kernel/fs/ocfs2/stackglue.c b/kernel/fs/ocfs2/stackglue.c new file mode 100644 index 000000000..5d965e83b --- /dev/null +++ b/kernel/fs/ocfs2/stackglue.c @@ -0,0 +1,748 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stackglue.c + * + * Code which implements an OCFS2 specific interface to underlying + * cluster stacks. + * + * Copyright (C) 2007, 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ocfs2_fs.h" + +#include "stackglue.h" + +#define OCFS2_STACK_PLUGIN_O2CB "o2cb" +#define OCFS2_STACK_PLUGIN_USER "user" +#define OCFS2_MAX_HB_CTL_PATH 256 + +static struct ocfs2_protocol_version locking_max_version; +static DEFINE_SPINLOCK(ocfs2_stack_lock); +static LIST_HEAD(ocfs2_stack_list); +static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; +static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; + +/* + * The stack currently in use. If not null, active_stack->sp_count > 0, + * the module is pinned, and the locking protocol cannot be changed. + */ +static struct ocfs2_stack_plugin *active_stack; + +static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) +{ + struct ocfs2_stack_plugin *p; + + assert_spin_locked(&ocfs2_stack_lock); + + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + if (!strcmp(p->sp_name, name)) + return p; + } + + return NULL; +} + +static int ocfs2_stack_driver_request(const char *stack_name, + const char *plugin_name) +{ + int rc; + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + + /* + * If the stack passed by the filesystem isn't the selected one, + * we can't continue. + */ + if (strcmp(stack_name, cluster_stack_name)) { + rc = -EBUSY; + goto out; + } + + if (active_stack) { + /* + * If the active stack isn't the one we want, it cannot + * be selected right now. + */ + if (!strcmp(active_stack->sp_name, plugin_name)) + rc = 0; + else + rc = -EBUSY; + goto out; + } + + p = ocfs2_stack_lookup(plugin_name); + if (!p || !try_module_get(p->sp_owner)) { + rc = -ENOENT; + goto out; + } + + active_stack = p; + rc = 0; + +out: + /* If we found it, pin it */ + if (!rc) + active_stack->sp_count++; + + spin_unlock(&ocfs2_stack_lock); + return rc; +} + +/* + * This function looks up the appropriate stack and makes it active. If + * there is no stack, it tries to load it. It will fail if the stack still + * cannot be found. It will also fail if a different stack is in use. + */ +static int ocfs2_stack_driver_get(const char *stack_name) +{ + int rc; + char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; + + /* + * Classic stack does not pass in a stack name. This is + * compatible with older tools as well. + */ + if (!stack_name || !*stack_name) + stack_name = OCFS2_STACK_PLUGIN_O2CB; + + if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { + printk(KERN_ERR + "ocfs2 passed an invalid cluster stack label: \"%s\"\n", + stack_name); + return -EINVAL; + } + + /* Anything that isn't the classic stack is a user stack */ + if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) + plugin_name = OCFS2_STACK_PLUGIN_USER; + + rc = ocfs2_stack_driver_request(stack_name, plugin_name); + if (rc == -ENOENT) { + request_module("ocfs2_stack_%s", plugin_name); + rc = ocfs2_stack_driver_request(stack_name, plugin_name); + } + + if (rc == -ENOENT) { + printk(KERN_ERR + "ocfs2: Cluster stack driver \"%s\" cannot be found\n", + plugin_name); + } else if (rc == -EBUSY) { + printk(KERN_ERR + "ocfs2: A different cluster stack is in use\n"); + } + + return rc; +} + +static void ocfs2_stack_driver_put(void) +{ + spin_lock(&ocfs2_stack_lock); + BUG_ON(active_stack == NULL); + BUG_ON(active_stack->sp_count == 0); + + active_stack->sp_count--; + if (!active_stack->sp_count) { + module_put(active_stack->sp_owner); + active_stack = NULL; + } + spin_unlock(&ocfs2_stack_lock); +} + +int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) +{ + int rc; + + spin_lock(&ocfs2_stack_lock); + if (!ocfs2_stack_lookup(plugin->sp_name)) { + plugin->sp_count = 0; + plugin->sp_max_proto = locking_max_version; + list_add(&plugin->sp_list, &ocfs2_stack_list); + printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", + plugin->sp_name); + rc = 0; + } else { + printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", + plugin->sp_name); + rc = -EEXIST; + } + spin_unlock(&ocfs2_stack_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); + +void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) +{ + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + p = ocfs2_stack_lookup(plugin->sp_name); + if (p) { + BUG_ON(p != plugin); + BUG_ON(plugin == active_stack); + BUG_ON(plugin->sp_count != 0); + list_del_init(&plugin->sp_list); + printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", + plugin->sp_name); + } else { + printk(KERN_ERR "Stack \"%s\" is not registered\n", + plugin->sp_name); + } + spin_unlock(&ocfs2_stack_lock); +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); + +void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto) +{ + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + if (memcmp(max_proto, &locking_max_version, + sizeof(struct ocfs2_protocol_version))) { + BUG_ON(locking_max_version.pv_major != 0); + + locking_max_version = *max_proto; + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + p->sp_max_proto = locking_max_version; + } + } + spin_unlock(&ocfs2_stack_lock); +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version); + + +/* + * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument + * for the ast and bast functions. They will pass the lksb to the ast + * and bast. The caller can wrap the lksb with their own structure to + * get more information. + */ +int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + if (!lksb->lksb_conn) + lksb->lksb_conn = conn; + else + BUG_ON(lksb->lksb_conn != conn); + return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, + name, namelen); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); + +int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + BUG_ON(lksb->lksb_conn == NULL); + + return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); + +int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lock_status(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); + +int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lvb_valid(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); + +void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lock_lvb(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); + +void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ + active_stack->sp_ops->dump_lksb(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); + +int ocfs2_stack_supports_plocks(void) +{ + return active_stack && active_stack->sp_ops->plock; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); + +/* + * ocfs2_plock() can only be safely called if + * ocfs2_stack_supports_plocks() returned true + */ +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl) +{ + WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); + if (active_stack->sp_ops->plock) + return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(ocfs2_plock); + +int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, + const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn) +{ + int rc = 0; + struct ocfs2_cluster_connection *new_conn; + + BUG_ON(group == NULL); + BUG_ON(conn == NULL); + BUG_ON(recovery_handler == NULL); + + if (grouplen > GROUP_NAME_MAX) { + rc = -EINVAL; + goto out; + } + + if (memcmp(&lproto->lp_max_version, &locking_max_version, + sizeof(struct ocfs2_protocol_version))) { + rc = -EINVAL; + goto out; + } + + new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), + GFP_KERNEL); + if (!new_conn) { + rc = -ENOMEM; + goto out; + } + + strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); + new_conn->cc_namelen = grouplen; + if (cluster_name_len) + strlcpy(new_conn->cc_cluster_name, cluster_name, + CLUSTER_NAME_MAX + 1); + new_conn->cc_cluster_name_len = cluster_name_len; + new_conn->cc_recovery_handler = recovery_handler; + new_conn->cc_recovery_data = recovery_data; + + new_conn->cc_proto = lproto; + /* Start the new connection at our maximum compatibility level */ + new_conn->cc_version = lproto->lp_max_version; + + /* This will pin the stack driver if successful */ + rc = ocfs2_stack_driver_get(stack_name); + if (rc) + goto out_free; + + rc = active_stack->sp_ops->connect(new_conn); + if (rc) { + ocfs2_stack_driver_put(); + goto out_free; + } + + *conn = new_conn; + +out_free: + if (rc) + kfree(new_conn); + +out: + return rc; +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); + +/* The caller will ensure all nodes have the same cluster stack */ +int ocfs2_cluster_connect_agnostic(const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn) +{ + char *stack_name = NULL; + + if (cluster_stack_name[0]) + stack_name = cluster_stack_name; + return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, + lproto, recovery_handler, recovery_data, + conn); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); + +/* If hangup_pending is 0, the stack driver will be dropped */ +int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending) +{ + int ret; + + BUG_ON(conn == NULL); + + ret = active_stack->sp_ops->disconnect(conn); + + /* XXX Should we free it anyway? */ + if (!ret) { + kfree(conn); + if (!hangup_pending) + ocfs2_stack_driver_put(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); + +/* + * Leave the group for this filesystem. This is executed by a userspace + * program (stored in ocfs2_hb_ctl_path). + */ +static void ocfs2_leave_group(const char *group) +{ + int ret; + char *argv[5], *envp[3]; + + argv[0] = ocfs2_hb_ctl_path; + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = (char *)group; + argv[4] = NULL; + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (ret < 0) { + printk(KERN_ERR + "ocfs2: Error %d running user helper " + "\"%s %s %s %s\"\n", + ret, argv[0], argv[1], argv[2], argv[3]); + } +} + +/* + * Hangup is a required post-umount. ocfs2-tools software expects the + * filesystem to call "ocfs2_hb_ctl" during unmount. This happens + * regardless of whether the DLM got started, so we can't do it + * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does + * the actual work. + */ +void ocfs2_cluster_hangup(const char *group, int grouplen) +{ + BUG_ON(group == NULL); + BUG_ON(group[grouplen] != '\0'); + + ocfs2_leave_group(group); + + /* cluster_disconnect() was called with hangup_pending==1 */ + ocfs2_stack_driver_put(); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); + +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) +{ + return active_stack->sp_ops->this_node(conn, node); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); + + +/* + * Sysfs bits + */ + +static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0; + + spin_lock(&ocfs2_stack_lock); + if (locking_max_version.pv_major) + ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", + locking_max_version.pv_major, + locking_max_version.pv_minor); + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static struct kobj_attribute ocfs2_attr_max_locking_protocol = + __ATTR(max_locking_protocol, S_IRUGO, + ocfs2_max_locking_protocol_show, NULL); + +static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + ret = snprintf(buf, remain, "%s\n", + p->sp_name); + if (ret < 0) { + total = ret; + break; + } + if (ret == remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ocfs2_stack_lock); + + return total; +} + +static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = + __ATTR(loaded_cluster_plugins, S_IRUGO, + ocfs2_loaded_cluster_plugins_show, NULL); + +static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0; + + spin_lock(&ocfs2_stack_lock); + if (active_stack) { + ret = snprintf(buf, PAGE_SIZE, "%s\n", + active_stack->sp_name); + if (ret == PAGE_SIZE) + ret = -E2BIG; + } + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static struct kobj_attribute ocfs2_attr_active_cluster_plugin = + __ATTR(active_cluster_plugin, S_IRUGO, + ocfs2_active_cluster_plugin_show, NULL); + +static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret; + spin_lock(&ocfs2_stack_lock); + ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + size_t len = count; + ssize_t ret; + + if (len == 0) + return len; + + if (buf[len - 1] == '\n') + len--; + + if ((len != OCFS2_STACK_LABEL_LEN) || + (strnlen(buf, len) != len)) + return -EINVAL; + + spin_lock(&ocfs2_stack_lock); + if (active_stack) { + if (!strncmp(buf, cluster_stack_name, len)) + ret = count; + else + ret = -EBUSY; + } else { + memcpy(cluster_stack_name, buf, len); + ret = count; + } + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + + +static struct kobj_attribute ocfs2_attr_cluster_stack = + __ATTR(cluster_stack, S_IRUGO | S_IWUSR, + ocfs2_cluster_stack_show, + ocfs2_cluster_stack_store); + + + +static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "1\n"); +} + +static struct kobj_attribute ocfs2_attr_dlm_recover_support = + __ATTR(dlm_recover_callback_support, S_IRUGO, + ocfs2_dlm_recover_show, NULL); + +static struct attribute *ocfs2_attrs[] = { + &ocfs2_attr_max_locking_protocol.attr, + &ocfs2_attr_loaded_cluster_plugins.attr, + &ocfs2_attr_active_cluster_plugin.attr, + &ocfs2_attr_cluster_stack.attr, + &ocfs2_attr_dlm_recover_support.attr, + NULL, +}; + +static struct attribute_group ocfs2_attr_group = { + .attrs = ocfs2_attrs, +}; + +static struct kset *ocfs2_kset; + +static void ocfs2_sysfs_exit(void) +{ + kset_unregister(ocfs2_kset); +} + +static int ocfs2_sysfs_init(void) +{ + int ret; + + ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); + if (!ocfs2_kset) + return -ENOMEM; + + ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); + if (ret) + goto error; + + return 0; + +error: + kset_unregister(ocfs2_kset); + return ret; +} + +/* + * Sysctl bits + * + * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't + * make as much sense in a multiple cluster stack world, but it's safer + * and easier to preserve the name. + */ + +#define FS_OCFS2_NM 1 + +static struct ctl_table ocfs2_nm_table[] = { + { + .procname = "hb_ctl_path", + .data = ocfs2_hb_ctl_path, + .maxlen = OCFS2_MAX_HB_CTL_PATH, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { } +}; + +static struct ctl_table ocfs2_mod_table[] = { + { + .procname = "nm", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_nm_table + }, + { } +}; + +static struct ctl_table ocfs2_kern_table[] = { + { + .procname = "ocfs2", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_mod_table + }, + { } +}; + +static struct ctl_table ocfs2_root_table[] = { + { + .procname = "fs", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_kern_table + }, + { } +}; + +static struct ctl_table_header *ocfs2_table_header; + + +/* + * Initialization + */ + +static int __init ocfs2_stack_glue_init(void) +{ + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); + + ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + if (!ocfs2_table_header) { + printk(KERN_ERR + "ocfs2 stack glue: unable to register sysctl\n"); + return -ENOMEM; /* or something. */ + } + + return ocfs2_sysfs_init(); +} + +static void __exit ocfs2_stack_glue_exit(void) +{ + memset(&locking_max_version, 0, + sizeof(struct ocfs2_protocol_version)); + locking_max_version.pv_major = 0; + locking_max_version.pv_minor = 0; + ocfs2_sysfs_exit(); + if (ocfs2_table_header) + unregister_sysctl_table(ocfs2_table_header); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 cluter stack glue layer"); +MODULE_LICENSE("GPL"); +module_init(ocfs2_stack_glue_init); +module_exit(ocfs2_stack_glue_exit); diff --git a/kernel/fs/ocfs2/stackglue.h b/kernel/fs/ocfs2/stackglue.h new file mode 100644 index 000000000..66334a30c --- /dev/null +++ b/kernel/fs/ocfs2/stackglue.h @@ -0,0 +1,301 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stackglue.h + * + * Glue to the underlying cluster stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#ifndef STACKGLUE_H +#define STACKGLUE_H + +#include +#include +#include + +#include "dlm/dlmapi.h" +#include + +/* Needed for plock-related prototypes */ +struct file; +struct file_lock; + +/* + * dlmconstants.h does not have a LOCAL flag. We hope to remove it + * some day, but right now we need it. Let's fake it. This value is larger + * than any flag in dlmconstants.h. + */ +#define DLM_LKF_LOCAL 0x00100000 + +/* + * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably + * wants to be in a public header. + */ +#define GROUP_NAME_MAX 64 + +/* This shadows OCFS2_CLUSTER_NAME_LEN */ +#define CLUSTER_NAME_MAX 16 + + +/* + * ocfs2_protocol_version changes when ocfs2 does something different in + * its inter-node behavior. See dlmglue.c for more information. + */ +struct ocfs2_protocol_version { + u8 pv_major; + u8 pv_minor; +}; + +/* + * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only + * has a pointer to separately allocated lvb space. This struct exists only to + * include in the lksb union to make space for a combined dlm_lksb and lvb. + */ +struct fsdlm_lksb_plus_lvb { + struct dlm_lksb lksb; + char lvb[DLM_LVB_LEN]; +}; + +/* + * A union of all lock status structures. We define it here so that the + * size of the union is known. Lock status structures are embedded in + * ocfs2 inodes. + */ +struct ocfs2_cluster_connection; +struct ocfs2_dlm_lksb { + union { + struct dlm_lockstatus lksb_o2dlm; + struct dlm_lksb lksb_fsdlm; + struct fsdlm_lksb_plus_lvb padding; + }; + struct ocfs2_cluster_connection *lksb_conn; +}; + +/* + * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf. + */ +struct ocfs2_locking_protocol { + struct ocfs2_protocol_version lp_max_version; + void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb); + void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level); + void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error); +}; + + +/* + * A cluster connection. Mostly opaque to ocfs2, the connection holds + * state for the underlying stack. ocfs2 does use cc_version to determine + * locking compatibility. + */ +struct ocfs2_cluster_connection { + char cc_name[GROUP_NAME_MAX + 1]; + int cc_namelen; + char cc_cluster_name[CLUSTER_NAME_MAX + 1]; + int cc_cluster_name_len; + struct ocfs2_protocol_version cc_version; + struct ocfs2_locking_protocol *cc_proto; + void (*cc_recovery_handler)(int node_num, void *recovery_data); + void *cc_recovery_data; + void *cc_lockspace; + void *cc_private; +}; + +/* + * Each cluster stack implements the stack operations structure. Not used + * in the ocfs2 code, the stackglue code translates generic cluster calls + * into stack operations. + */ +struct ocfs2_stack_operations { + /* + * The fs code calls ocfs2_cluster_connect() to attach a new + * filesystem to the cluster stack. The ->connect() op is passed + * an ocfs2_cluster_connection with the name and recovery field + * filled in. + * + * The stack must set up any notification mechanisms and create + * the filesystem lockspace in the DLM. The lockspace should be + * stored on cc_lockspace. Any other information can be stored on + * cc_private. + * + * ->connect() must not return until it is guaranteed that + * + * - Node down notifications for the filesystem will be received + * and passed to conn->cc_recovery_handler(). + * - Locking requests for the filesystem will be processed. + */ + int (*connect)(struct ocfs2_cluster_connection *conn); + + /* + * The fs code calls ocfs2_cluster_disconnect() when a filesystem + * no longer needs cluster services. All DLM locks have been + * dropped, and recovery notification is being ignored by the + * fs code. The stack must disengage from the DLM and discontinue + * recovery notification. + * + * Once ->disconnect() has returned, the connection structure will + * be freed. Thus, a stack must not return from ->disconnect() + * until it will no longer reference the conn pointer. + * + * Once this call returns, the stack glue will be dropping this + * connection's reference on the module. + */ + int (*disconnect)(struct ocfs2_cluster_connection *conn); + + /* + * ->this_node() returns the cluster's unique identifier for the + * local node. + */ + int (*this_node)(struct ocfs2_cluster_connection *conn, + unsigned int *node); + + /* + * Call the underlying dlm lock function. The ->dlm_lock() + * callback should convert the flags and mode as appropriate. + * + * ast and bast functions are not part of the call because the + * stack will likely want to wrap ast and bast calls before passing + * them to stack->sp_proto. There is no astarg. The lksb will + * be passed back to the ast and bast functions. The caller can + * use this to find their object. + */ + int (*dlm_lock)(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen); + + /* + * Call the underlying dlm unlock function. The ->dlm_unlock() + * function should convert the flags as appropriate. + * + * The unlock ast is not passed, as the stack will want to wrap + * it before calling stack->sp_proto->lp_unlock_ast(). There is + * no astarg. The lksb will be passed back to the unlock ast + * function. The caller can use this to find their object. + */ + int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags); + + /* + * Return the status of the current lock status block. The fs + * code should never dereference the union. The ->lock_status() + * callback pulls out the stack-specific lksb, converts the status + * to a proper errno, and returns it. + */ + int (*lock_status)(struct ocfs2_dlm_lksb *lksb); + + /* + * Return non-zero if the LVB is valid. + */ + int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb); + + /* + * Pull the lvb pointer off of the stack-specific lksb. + */ + void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb); + + /* + * Cluster-aware posix locks + * + * This is NULL for stacks which do not support posix locks. + */ + int (*plock)(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl); + + /* + * This is an optoinal debugging hook. If provided, the + * stack can dump debugging information about this lock. + */ + void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb); +}; + +/* + * Each stack plugin must describe itself by registering a + * ocfs2_stack_plugin structure. This is only seen by stackglue and the + * stack driver. + */ +struct ocfs2_stack_plugin { + char *sp_name; + struct ocfs2_stack_operations *sp_ops; + struct module *sp_owner; + + /* These are managed by the stackglue code. */ + struct list_head sp_list; + unsigned int sp_count; + struct ocfs2_protocol_version sp_max_proto; +}; + + +/* Used by the filesystem */ +int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, + const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn); +/* + * Used by callers that don't store their stack name. They must ensure + * all nodes have the same stack. + */ +int ocfs2_cluster_connect_agnostic(const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn); +int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending); +void ocfs2_cluster_hangup(const char *group, int grouplen); +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node); + +struct ocfs2_lock_res; +int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen); +int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags); + +int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb); +int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb); +void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb); +void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb); + +int ocfs2_stack_supports_plocks(void); +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl); + +void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto); + + +/* Used by stack plugins */ +int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); +void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); + +#endif /* STACKGLUE_H */ diff --git a/kernel/fs/ocfs2/suballoc.c b/kernel/fs/ocfs2/suballoc.c new file mode 100644 index 000000000..447902963 --- /dev/null +++ b/kernel/fs/ocfs2/suballoc.c @@ -0,0 +1,2919 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.c + * + * metadata alloc and free + * Inspired by ext3 block groups. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define NOT_ALLOC_NEW_GROUP 0 +#define ALLOC_NEW_GROUP 0x1 +#define ALLOC_GROUPS_FROM_GLOBAL 0x2 + +#define OCFS2_MAX_TO_STEAL 1024 + +struct ocfs2_suballoc_result { + u64 sr_bg_blkno; /* The bg we allocated from. Set + to 0 when a block group is + contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ + u64 sr_blkno; /* The first allocated block */ + unsigned int sr_bit_offset; /* The bit in the bg */ + unsigned int sr_bits; /* How many bits we claimed */ +}; + +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); +static int ocfs2_block_group_fill(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + unsigned int group_clusters, + u16 my_chain, + struct ocfs2_chain_list *cl); +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh, + u64 max_block, + u64 *last_alloc_group, + int flags); + +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res); +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res); +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res); +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr); +static int ocfs2_relink_block_group(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain); +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted); +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off); +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off); +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + int flags, + struct ocfs2_alloc_context **ac); + +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) +{ + struct inode *inode = ac->ac_inode; + + if (inode) { + if (ac->ac_which != OCFS2_AC_USE_LOCAL) + ocfs2_inode_unlock(inode, 1); + + mutex_unlock(&inode->i_mutex); + + iput(inode); + ac->ac_inode = NULL; + } + brelse(ac->ac_bh); + ac->ac_bh = NULL; + ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } +} + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +{ + ocfs2_free_ac_resource(ac); + kfree(ac); +} + +static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) +{ + return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); +} + +#define do_error(fmt, ...) \ + do{ \ + if (resize) \ + mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ + else \ + ocfs2_error(sb, fmt, ##__VA_ARGS__); \ + } while (0) + +static int ocfs2_validate_gd_self(struct super_block *sb, + struct buffer_head *bh, + int resize) +{ + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { + do_error("Group descriptor #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + gd->bg_signature); + return -EINVAL; + } + + if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { + do_error("Group descriptor #%llu has an invalid bg_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { + do_error("Group descriptor #%llu has an invalid " + "fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(gd->bg_generation)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { + do_error("Group descriptor #%llu has bit count %u but " + "claims that %u are free", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { + do_error("Group descriptor #%llu has bit count %u but " + "max bitmap bits of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_validate_gd_parent(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh, + int resize) +{ + unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + if (di->i_blkno != gd->bg_parent_dinode) { + do_error("Group descriptor #%llu has bad parent " + "pointer (%llu, expected %llu)", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + return -EINVAL; + } + + max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); + if (le16_to_cpu(gd->bg_bits) > max_bits) { + do_error("Group descriptor #%llu has bit count of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits)); + return -EINVAL; + } + + /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ + if ((le16_to_cpu(gd->bg_chain) > + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || + ((le16_to_cpu(gd->bg_chain) == + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { + do_error("Group descriptor #%llu has bad chain %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_chain)); + return -EINVAL; + } + + return 0; +} + +#undef do_error + +/* + * This version only prints errors. It does not fail the filesystem, and + * exists only for resize. + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) { + mlog(ML_ERROR, + "Checksum failed for group descriptor %llu\n", + (unsigned long long)bh->b_blocknr); + } else + rc = ocfs2_validate_gd_self(sb, bh, 1); + if (!rc) + rc = ocfs2_validate_gd_parent(sb, di, bh, 1); + + return rc; +} + +static int ocfs2_validate_group_descriptor(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + trace_ocfs2_validate_group_descriptor( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) + return rc; + + /* + * Errors after here are fatal. + */ + + return ocfs2_validate_gd_self(sb, bh, 0); +} + +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, + ocfs2_validate_group_descriptor); + if (rc) + goto out; + + rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); + if (rc) { + brelse(tmp); + goto out; + } + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc; +} + +static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, + struct ocfs2_group_desc *bg, + struct ocfs2_chain_list *cl, + u64 p_blkno, unsigned int clusters) +{ + struct ocfs2_extent_list *el = &bg->bg_list; + struct ocfs2_extent_rec *rec; + + BUG_ON(!ocfs2_supports_discontig_bg(osb)); + if (!el->l_next_free_rec) + el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb)); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)]; + rec->e_blkno = cpu_to_le64(p_blkno); + rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / + le16_to_cpu(cl->cl_bpc)); + rec->e_leaf_clusters = cpu_to_le16(clusters); + le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&bg->bg_free_bits_count, + clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&el->l_next_free_rec, 1); +} + +static int ocfs2_block_group_fill(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + unsigned int group_clusters, + u16 my_chain, + struct ocfs2_chain_list *cl) +{ + int status = 0; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct super_block * sb = alloc_inode->i_sb; + + if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { + ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " + "b_blocknr (%llu)", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bg, 0, sb->s_blocksize); + strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); + bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, + osb->s_feature_incompat)); + bg->bg_chain = cpu_to_le16(my_chain); + bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; + bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); + bg->bg_blkno = cpu_to_le64(group_blkno); + if (group_clusters == le16_to_cpu(cl->cl_cpg)) + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + else + ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno, + group_clusters); + + /* set the 1st bit in the bitmap to account for the descriptor block */ + ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); + bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); + + ocfs2_journal_dirty(handle, bg_bh); + + /* There is no need to zero out or otherwise initialize the + * other blocks in a group - All valid FS metadata in a block + * group stores the superblock fs_generation value at + * allocation time. */ + +bail: + if (status) + mlog_errno(status); + return status; +} + +static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_count)) { + if (le32_to_cpu(cl->cl_recs[best].c_total) > + le32_to_cpu(cl->cl_recs[curr].c_total)) + best = curr; + curr++; + } + return best; +} + +static struct buffer_head * +ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + struct buffer_head *bg_bh; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + + status = ocfs2_claim_clusters(handle, ac, + le16_to_cpu(cl->cl_cpg), &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_block_group_alloc_contig( + (unsigned long long)bg_blkno, alloc_rec); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + brelse(bg_bh); + mlog_errno(status); + } + +bail: + return status ? ERR_PTR(status) : bg_bh; +} + +static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + unsigned int min_bits, + u32 *bit_off, u32 *num_bits) +{ + int status = 0; + + while (min_bits) { + status = ocfs2_claim_clusters(handle, ac, min_bits, + bit_off, num_bits); + if (status != -ENOSPC) + break; + + min_bits >>= 1; + } + + return status; +} + +static int ocfs2_block_group_grow_discontig(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl, + unsigned int min_bits) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = + (struct ocfs2_group_desc *)bg_bh->b_data; + unsigned int needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + u32 p_cpos, clusters; + u64 p_blkno; + struct ocfs2_extent_list *el = &bg->bg_list; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count))) { + if (min_bits > needed) + min_bits = needed; + status = ocfs2_block_group_claim_bits(osb, handle, ac, + min_bits, &p_cpos, + &clusters); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos); + ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno, + clusters); + + min_bits = clusters; + needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + } + + if (needed > 0) { + /* + * We have used up all the extent rec but can't fill up + * the cpg. So bail out. + */ + status = -ENOSPC; + goto bail; + } + + ocfs2_journal_dirty(handle, bg_bh); + +bail: + return status; +} + +static void ocfs2_bg_alloc_cleanup(handle_t *handle, + struct ocfs2_alloc_context *cluster_ac, + struct inode *alloc_inode, + struct buffer_head *bg_bh) +{ + int i, ret; + struct ocfs2_group_desc *bg; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + if (!bg_bh) + return; + + bg = (struct ocfs2_group_desc *)bg_bh->b_data; + el = &bg->bg_list; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, + cluster_ac->ac_bh, + le64_to_cpu(rec->e_blkno), + le16_to_cpu(rec->e_leaf_clusters)); + if (ret) + mlog_errno(ret); + /* Try all the clusters to free */ + } + + ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh); + brelse(bg_bh); +} + +static struct buffer_head * +ocfs2_block_group_alloc_discontig(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1; + struct buffer_head *bg_bh = NULL; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + + if (!ocfs2_supports_discontig_bg(osb)) { + status = -ENOSPC; + goto bail; + } + + status = ocfs2_extend_trans(handle, + ocfs2_calc_bg_discontig_credits(osb->sb)); + if (status) { + mlog_errno(status); + goto bail; + } + + /* + * We're going to be grabbing from multiple cluster groups. + * We don't have enough credits to relink them all, and the + * cluster groups will be staying in cache for the duration of + * this operation. + */ + ac->ac_disable_chain_relink = 1; + + /* Claim the first region */ + status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + min_bits = num_bits; + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_block_group_alloc_discontig( + (unsigned long long)bg_blkno, alloc_rec); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_grow_discontig(handle, alloc_inode, + bg_bh, ac, cl, min_bits); + if (status) + mlog_errno(status); + +bail: + if (status) + ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh); + return status ? ERR_PTR(status) : bg_bh; +} + +/* + * We expect the block group allocator to already be locked. + */ +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh, + u64 max_block, + u64 *last_alloc_group, + int flags) +{ + int status, credits; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + struct ocfs2_chain_list *cl; + struct ocfs2_alloc_context *ac = NULL; + handle_t *handle = NULL; + u16 alloc_rec; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + + BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); + + cl = &fe->id2.i_chain; + status = ocfs2_reserve_clusters_with_limit(osb, + le16_to_cpu(cl->cl_cpg), + max_block, flags, &ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_group_alloc_credits(osb->sb, + le16_to_cpu(cl->cl_cpg)); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (last_alloc_group && *last_alloc_group != 0) { + trace_ocfs2_block_group_alloc( + (unsigned long long)*last_alloc_group); + ac->ac_last_group = *last_alloc_group; + } + + bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, + ac, cl); + if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + bg_bh = ocfs2_block_group_alloc_discontig(handle, + alloc_inode, + ac, cl); + if (IS_ERR(bg_bh)) { + status = PTR_ERR(bg_bh); + bg_bh = NULL; + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc_rec = le16_to_cpu(bg->bg_chain); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, + le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; + if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) + le16_add_cpu(&cl->cl_next_free_rec, 1); + + le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); + le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); + + ocfs2_journal_dirty(handle, bh); + + spin_lock(&OCFS2_I(alloc_inode)->ip_lock); + OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, + le32_to_cpu(fe->i_clusters))); + spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); + i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); + alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); + + status = 0; + + /* save the new last alloc group so that the caller can cache it. */ + if (last_alloc_group) + *last_alloc_group = ac->ac_last_group; + +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + + if (ac) + ocfs2_free_alloc_context(ac); + + brelse(bg_bh); + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + int type, + u32 slot, + u64 *last_alloc_group, + int flags) +{ + int status; + u32 bits_wanted = ac->ac_bits_wanted; + struct inode *alloc_inode; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe; + u32 free_bits; + + alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); + if (!alloc_inode) { + mlog_errno(-EINVAL); + return -EINVAL; + } + + mutex_lock(&alloc_inode->i_mutex); + + status = ocfs2_inode_lock(alloc_inode, &bh, 1); + if (status < 0) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + + mlog_errno(status); + return status; + } + + ac->ac_inode = alloc_inode; + ac->ac_alloc_slot = slot; + + fe = (struct ocfs2_dinode *) bh->b_data; + + /* The bh was validated by the inode read inside + * ocfs2_inode_lock(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { + ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", + (unsigned long long)le64_to_cpu(fe->i_blkno)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - + le32_to_cpu(fe->id1.bitmap1.i_used); + + if (bits_wanted > free_bits) { + /* cluster bitmap never grows */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted, + free_bits); + status = -ENOSPC; + goto bail; + } + + if (!(flags & ALLOC_NEW_GROUP)) { + trace_ocfs2_reserve_suballoc_bits_no_new_group( + slot, bits_wanted, free_bits); + status = -ENOSPC; + goto bail; + } + + status = ocfs2_block_group_alloc(osb, alloc_inode, bh, + ac->ac_max_block, + last_alloc_group, flags); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_extends); + + /* You should never ask for this much metadata */ + BUG_ON(bits_wanted > + (le32_to_cpu(fe->id1.bitmap1.i_total) + - le32_to_cpu(fe->id1.bitmap1.i_used))); + } + + get_bh(bh); + ac->ac_bh = bh; +bail: + brelse(bh); + + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_inodes_stolen, 0); +} + +static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_meta_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_meta_stolen, 0); +} + +void ocfs2_init_steal_slots(struct ocfs2_super *osb) +{ + ocfs2_init_inode_steal_slot(osb); + ocfs2_init_meta_steal_slot(osb); +} + +static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) +{ + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + osb->s_inode_steal_slot = slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + osb->s_meta_steal_slot = slot; + spin_unlock(&osb->osb_lock); +} + +static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) +{ + int slot = OCFS2_INVALID_SLOT; + + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + slot = osb->s_inode_steal_slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + slot = osb->s_meta_steal_slot; + spin_unlock(&osb->osb_lock); + + return slot; +} + +static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_resource(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + int type) +{ + int i, status = -ENOSPC; + int slot = __ocfs2_get_steal_slot(osb, type); + + /* Start to steal resource from the first slot after ours. */ + if (slot == OCFS2_INVALID_SLOT) + slot = osb->slot_num + 1; + + for (i = 0; i < osb->max_slots; i++, slot++) { + if (slot == osb->max_slots) + slot = 0; + + if (slot == osb->slot_num) + continue; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + type, + (u32)slot, NULL, + NOT_ALLOC_NEW_GROUP); + if (status >= 0) { + __ocfs2_set_steal_slot(osb, slot, type); + break; + } + + ocfs2_free_ac_resource(ac); + } + + return status; +} + +static int ocfs2_steal_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_meta(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); +} + +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac) +{ + int status; + int slot = ocfs2_get_meta_steal_slot(osb); + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = blocks; + (*ac)->ac_which = OCFS2_AC_USE_META; + (*ac)->ac_group_search = ocfs2_block_group_search; + + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) + goto extent_steal; + + atomic_set(&osb->s_num_meta_stolen, 0); + status = ocfs2_reserve_suballoc_bits(osb, (*ac), + EXTENT_ALLOC_SYSTEM_INODE, + (u32)osb->slot_num, NULL, + ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); + + + if (status >= 0) { + status = 0; + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_meta_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +extent_steal: + status = ocfs2_steal_meta(osb, *ac); + atomic_inc(&osb->s_num_meta_stolen); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_new_metadata_blocks(osb, + ocfs2_extend_meta_needed(root_el), + ac); +} + +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac) +{ + int status; + int slot = ocfs2_get_inode_steal_slot(osb); + u64 alloc_group; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = 1; + (*ac)->ac_which = OCFS2_AC_USE_INODE; + + (*ac)->ac_group_search = ocfs2_block_group_search; + + /* + * stat(2) can't handle i_ino > 32bits, so we tell the + * lower levels not to allocate us a block group past that + * limit. The 'inode64' mount option avoids this behavior. + */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) + (*ac)->ac_max_block = (u32)~0U; + + /* + * slot is set when we successfully steal inode from other nodes. + * It is reset in 3 places: + * 1. when we flush the truncate log + * 2. when we complete local alloc recovery. + * 3. when we successfully allocate from our own slot. + * After it is set, we will go on stealing inodes until we find the + * need to check our slots to see whether there is some space for us. + */ + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL) + goto inode_steal; + + atomic_set(&osb->s_num_inodes_stolen, 0); + alloc_group = osb->osb_inode_alloc_group; + status = ocfs2_reserve_suballoc_bits(osb, *ac, + INODE_ALLOC_SYSTEM_INODE, + (u32)osb->slot_num, + &alloc_group, + ALLOC_NEW_GROUP | + ALLOC_GROUPS_FROM_GLOBAL); + if (status >= 0) { + status = 0; + + spin_lock(&osb->osb_lock); + osb->osb_inode_alloc_group = alloc_group; + spin_unlock(&osb->osb_lock); + trace_ocfs2_reserve_new_inode_new_group( + (unsigned long long)alloc_group); + + /* + * Some inodes must be freed by us, so try to allocate + * from our own next time. + */ + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_inode_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +inode_steal: + status = ocfs2_steal_inode(osb, *ac); + atomic_inc(&osb->s_num_inodes_stolen); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +/* local alloc code has to do the same thing, so rather than do this + * twice.. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + + ac->ac_which = OCFS2_AC_USE_MAIN; + ac->ac_group_search = ocfs2_cluster_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, NULL, + ALLOC_NEW_GROUP); + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + +bail: + return status; +} + +/* Callers don't need to care which bitmap (local alloc or main) to + * use so we figure it out for them, but unfortunately this clutters + * things a bit. */ +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + int flags, + struct ocfs2_alloc_context **ac) +{ + int status; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_max_block = max_block; + + status = -ENOSPC; + if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && + ocfs2_alloc_should_use_local(osb, bits_wanted)) { + status = ocfs2_reserve_local_alloc_bits(osb, + bits_wanted, + *ac); + if ((status < 0) && (status != -ENOSPC)) { + mlog_errno(status); + goto bail; + } + } + + if (status == -ENOSPC) { + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, + ALLOC_NEW_GROUP, ac); +} + +/* + * More or less lifted from ext3. I'll leave their description below: + * + * "For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes." + * + * Note: OCFS2 already does this differently for metadata vs data + * allocations, as those bitmaps are separate and undo access is never + * called on a metadata group descriptor. + */ +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr) +{ + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + int ret; + + if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) + return 0; + + if (!buffer_jbd(bg_bh)) + return 1; + + jbd_lock_bh_state(bg_bh); + bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + jbd_unlock_bh_state(bg_bh); + + return ret; +} + +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + unsigned int total_bits, + struct ocfs2_suballoc_result *res) +{ + void *bitmap; + u16 best_offset, best_size; + int offset, start, found, status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + /* Callers got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + + found = start = best_offset = best_size = 0; + bitmap = bg->bg_bitmap; + + while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { + if (offset == total_bits) + break; + + if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { + /* We found a zero, but we can't use it as it + * hasn't been put to disk yet! */ + found = 0; + start = offset + 1; + } else if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_size) { + best_size = found; + best_offset = start - found; + } + /* we got everything we needed */ + if (found == bits_wanted) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + if (best_size) { + res->sr_bit_offset = best_offset; + res->sr_bits = best_size; + } else { + status = -ENOSPC; + /* No error log here -- see the comment above + * ocfs2_test_bg_bit_allocatable */ + } + + return status; +} + +int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + trace_ocfs2_block_group_set_bits(bit_off, num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + while(num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + ocfs2_journal_dirty(handle, group_bh); + +bail: + return status; +} + +/* find the one with the most empty bits */ +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + BUG_ON(!cl->cl_next_free_rec); + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_next_free_rec)) { + if (le32_to_cpu(cl->cl_recs[curr].c_free) > + le32_to_cpu(cl->cl_recs[best].c_free)) + best = curr; + curr++; + } + + BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); + return best; +} + +static int ocfs2_relink_block_group(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain) +{ + int status; + /* there is a really tiny chance the journal calls could fail, + * but we wouldn't want inconsistent blocks in *any* case. */ + u64 bg_ptr, prev_bg_ptr; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; + + /* The caller got these descriptors from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); + + trace_ocfs2_relink_block_group( + (unsigned long long)le64_to_cpu(fe->i_blkno), chain, + (unsigned long long)le64_to_cpu(bg->bg_blkno), + (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); + + bg_ptr = le64_to_cpu(bg->bg_next_group); + prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); + + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) + goto out; + + prev_bg->bg_next_group = bg->bg_next_group; + ocfs2_journal_dirty(handle, prev_bg_bh); + + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) + goto out_rollback_prev_bg; + + bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; + ocfs2_journal_dirty(handle, bg_bh); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) + goto out_rollback_bg; + + fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + ocfs2_journal_dirty(handle, fe_bh); + +out: + if (status < 0) + mlog_errno(status); + return status; + +out_rollback_bg: + bg->bg_next_group = cpu_to_le64(bg_ptr); +out_rollback_prev_bg: + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + goto out; +} + +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted) +{ + return le16_to_cpu(bg->bg_free_bits_count) > wanted; +} + +/* return 0 on success, -ENOSPC to keep searching and any other < 0 + * value on error. */ +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res) +{ + int search = -ENOSPC; + int ret; + u64 blkoff; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int max_bits, gd_cluster_off; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (gd->bg_free_bits_count) { + max_bits = le16_to_cpu(gd->bg_bits); + + /* Tail groups in cluster bitmaps which aren't cpg + * aligned are prone to partial extension by a failed + * fs resize. If the file system resize never got to + * update the dinode cluster count, then we don't want + * to trust any clusters past it, regardless of what + * the group descriptor says. */ + gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + if ((gd_cluster_off + max_bits) > + OCFS2_I(inode)->ip_clusters) { + max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; + trace_ocfs2_cluster_group_search_wrong_max_bits( + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + OCFS2_I(inode)->ip_clusters, max_bits); + } + + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + max_bits, res); + if (ret) + return ret; + + if (max_block) { + blkoff = ocfs2_clusters_to_blocks(inode->i_sb, + gd_cluster_off + + res->sr_bit_offset + + res->sr_bits); + trace_ocfs2_cluster_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + return -ENOSPC; + } + + /* ocfs2_block_group_find_clear_bits() might + * return success, but we still want to return + * -ENOSPC unless it found the minimum number + * of bits. */ + if (min_bits <= res->sr_bits) + search = 0; /* success */ + else if (res->sr_bits) { + /* + * Don't show bits which we'll be returning + * for allocation to the local alloc bitmap. + */ + ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); + } + } + + return search; +} + +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res) +{ + int ret = -ENOSPC; + u64 blkoff; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON(min_bits != 1); + BUG_ON(ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) { + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + le16_to_cpu(bg->bg_bits), + res); + if (!ret && max_block) { + blkoff = le64_to_cpu(bg->bg_blkno) + + res->sr_bit_offset + res->sr_bits; + trace_ocfs2_block_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + ret = -ENOSPC; + } + } + + return ret; +} + +int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + int ret; + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); + ocfs2_journal_dirty(handle, di_bh); + +out: + return ret; +} + +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl; + + cl = (struct ocfs2_chain_list *)&di->id2.i_chain; + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); + le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); +} + +static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, + struct ocfs2_extent_rec *rec, + struct ocfs2_chain_list *cl) +{ + unsigned int bpc = le16_to_cpu(cl->cl_bpc); + unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; + unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc; + + if (res->sr_bit_offset < bitoff) + return 0; + if (res->sr_bit_offset >= (bitoff + bitcount)) + return 0; + res->sr_blkno = le64_to_cpu(rec->e_blkno) + + (res->sr_bit_offset - bitoff); + if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount)) + res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset; + return 1; +} + +static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac, + struct ocfs2_group_desc *bg, + struct ocfs2_suballoc_result *res) +{ + int i; + u64 bg_blkno = res->sr_bg_blkno; /* Save off */ + struct ocfs2_extent_rec *rec; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = &di->id2.i_chain; + + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) { + res->sr_blkno = 0; + return; + } + + res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset; + res->sr_bg_blkno = 0; /* Clear it for contig block groups */ + if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) || + !bg->bg_list.l_next_free_rec) + return; + + for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) { + rec = &bg->bg_list.l_recs[i]; + if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) { + res->sr_bg_blkno = bg_blkno; /* Restore */ + break; + } + } +} + +static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res, + u16 *bits_left) +{ + int ret; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *gd; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct inode *alloc_inode = ac->ac_inode; + + ret = ocfs2_read_group_descriptor(alloc_inode, di, + res->sr_bg_blkno, &group_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + gd = (struct ocfs2_group_desc *) group_bh->b_data; + ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, + ac->ac_max_block, res); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + if (!ret) + ocfs2_bg_discontig_fix_result(ac, gd, res); + + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; + + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, + res->sr_bit_offset, res->sr_bits); + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); + mlog_errno(ret); + } + +out_loc_only: + *bits_left = le16_to_cpu(gd->bg_free_bits_count); + +out: + brelse(group_bh); + + return ret; +} + +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res, + u16 *bits_left) +{ + int status; + u16 chain; + u64 next_group; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *group_bh = NULL; + struct buffer_head *prev_group_bh = NULL; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + struct ocfs2_group_desc *bg; + + chain = ac->ac_chain; + trace_ocfs2_search_chain_begin( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + bits_wanted, chain); + + status = ocfs2_read_group_descriptor(alloc_inode, fe, + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + + status = -ENOSPC; + /* for now, the chain search is a bit simplistic. We just use + * the 1st group with any empty bits. */ + while ((status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, + ac->ac_max_block, + res)) == -ENOSPC) { + if (!bg->bg_next_group) + break; + + brelse(prev_group_bh); + prev_group_bh = NULL; + + next_group = le64_to_cpu(bg->bg_next_group); + prev_group_bh = group_bh; + group_bh = NULL; + status = ocfs2_read_group_descriptor(alloc_inode, fe, + next_group, &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + trace_ocfs2_search_chain_succ( + (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits); + + res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); + + BUG_ON(res->sr_bits == 0); + if (!status) + ocfs2_bg_discontig_fix_result(ac, bg, res); + + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + /* + * Keep track of previous block descriptor read. When + * we find a target, if we have read more than X + * number of descriptors, and the target is reasonably + * empty, relink him to top of his chain. + * + * We've read 0 extra blocks and only send one more to + * the transaction, yet the next guy to search has a + * much easier time. + * + * Do this *after* figuring out how many bits we're taking out + * of our target group. + */ + if (!ac->ac_disable_chain_relink && + (prev_group_bh) && + (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { + status = ocfs2_relink_block_group(handle, alloc_inode, + ac->ac_bh, group_bh, + prev_group_bh, chain); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (ac->ac_find_loc_only) + goto out_loc_only; + + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_set_bits(handle, + alloc_inode, + bg, + group_bh, + res->sr_bit_offset, + res->sr_bits); + if (status < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, + ac->ac_bh, res->sr_bits, chain); + mlog_errno(status); + goto bail; + } + + trace_ocfs2_search_chain_end( + (unsigned long long)le64_to_cpu(fe->i_blkno), + res->sr_bits); + +out_loc_only: + *bits_left = le16_to_cpu(bg->bg_free_bits_count); +bail: + brelse(group_bh); + brelse(prev_group_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* will give out up to bits_wanted contiguous bits. */ +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res) +{ + int status; + u16 victim, i; + u16 bits_left = 0; + u64 hint = ac->ac_last_group; + struct ocfs2_chain_list *cl; + struct ocfs2_dinode *fe; + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); + BUG_ON(!ac->ac_bh); + + fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* The bh was validated by the inode read during + * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= + le32_to_cpu(fe->id1.bitmap1.i_total)) { + ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used " + "bits but only %u total.", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); + status = -EIO; + goto bail; + } + + res->sr_bg_blkno = hint; + if (res->sr_bg_blkno) { + /* Attempt to short-circuit the usual search mechanism + * by jumping straight to the most recently used + * allocation group. This helps us maintain some + * contiguousness across allocations. */ + status = ocfs2_search_one_group(ac, handle, bits_wanted, + min_bits, res, &bits_left); + if (!status) + goto set_hint; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } + + cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + + victim = ocfs2_find_victim_chain(cl); + ac->ac_chain = victim; + + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); + if (!status) { + hint = ocfs2_group_from_res(res); + goto set_hint; + } + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_claim_suballoc_bits(victim); + + /* If we didn't pick a good victim, then just default to + * searching each chain in order. Don't allow chain relinking + * because we only calculate enough journal credits for one + * relink per alloc. */ + ac->ac_disable_chain_relink = 1; + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { + if (i == victim) + continue; + if (!cl->cl_recs[i].c_free) + continue; + + ac->ac_chain = i; + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); + if (!status) { + hint = ocfs2_group_from_res(res); + break; + } + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } + +set_hint: + if (status != -ENOSPC) { + /* If the next search of this group is not likely to + * yield a suitable extent, then we reset the last + * group hint so as to not waste a disk read */ + if (bits_left < min_bits) + ac->ac_last_group = 0; + else + ac->ac_last_group = hint; + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_metadata(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u64 *suballoc_loc, + u16 *suballoc_bit_start, + unsigned int *num_bits, + u64 *blkno_start) +{ + int status; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); + BUG_ON(ac->ac_which != OCFS2_AC_USE_META); + + status = ocfs2_claim_suballoc_bits(ac, + handle, + bits_wanted, + 1, + &res); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit_start = res.sr_bit_offset; + *blkno_start = res.sr_blkno; + ac->ac_bits_given += res.sr_bits; + *num_bits = res.sr_bits; + status = 0; +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_init_inode_ac_group(struct inode *dir, + struct buffer_head *parent_di_bh, + struct ocfs2_alloc_context *ac) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; + /* + * Try to allocate inodes from some specific group. + * + * If the parent dir has recorded the last group used in allocation, + * cool, use it. Otherwise if we try to allocate new inode from the + * same slot the parent dir belongs to, use the same chunk. + * + * We are very careful here to avoid the mistake of setting + * ac_last_group to a group descriptor from a different (unlocked) slot. + */ + if (OCFS2_I(dir)->ip_last_used_group && + OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) + ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; + else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { + if (di->i_suballoc_loc) + ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); + else + ac->ac_last_group = ocfs2_which_suballoc_group( + le64_to_cpu(di->i_blkno), + le16_to_cpu(di->i_suballoc_bit)); + } +} + +static inline void ocfs2_save_inode_ac_group(struct inode *dir, + struct ocfs2_alloc_context *ac) +{ + OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; + OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; +} + +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + ocfs2_update_inode_fsync_trans(handle, dir, 0); +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, + ac->ac_bh, res->sr_bits, chain); + mlog_errno(ret); + goto out; + } + + trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, + res->sr_bits); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + +int ocfs2_claim_new_inode(handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 *fe_blkno) +{ + int status; + struct ocfs2_suballoc_result res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + status = ocfs2_claim_suballoc_bits(ac, + handle, + 1, + 1, + &res); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res.sr_bits != 1); + + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit = res.sr_bit_offset; + *fe_blkno = res.sr_blkno; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + status = 0; +bail: + if (status) + mlog_errno(status); + return status; +} + +/* translate a group desc. blkno and it's bitmap offset into + * disk cluster offset. */ +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 cluster = 0; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg_blkno != osb->first_cluster_group_blkno) + cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); + cluster += (u32) bg_bit_off; + return cluster; +} + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 group_no; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + group_no = cluster / osb->bitmap_cpg; + if (!group_no) + return osb->first_cluster_group_blkno; + return ocfs2_clusters_to_blocks(inode->i_sb, + group_no * osb->bitmap_cpg); +} + +/* given the block number of a cluster start, calculate which cluster + * group and descriptor bitmap offset that corresponds to. */ +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + *bg_blkno = ocfs2_which_cluster_group(inode, + data_cluster); + + if (*bg_blkno == osb->first_cluster_group_blkno) + *bg_bit_off = (u16) data_cluster; + else + *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, + data_blkno - *bg_blkno); +} + +/* + * min_bits - minimum contiguous chunk from this total allocation we + * can handle. set to what we asked for originally for a full + * contig. allocation, set to '1' to indicate we can deal with extents + * of any size. + */ +int __ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 max_clusters, + u32 *cluster_start, + u32 *num_clusters) +{ + int status; + unsigned int bits_wanted = max_clusters; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL + && ac->ac_which != OCFS2_AC_USE_MAIN); + + if (ac->ac_which == OCFS2_AC_USE_LOCAL) { + WARN_ON(min_clusters > 1); + + status = ocfs2_claim_local_alloc_bits(osb, + handle, + ac, + bits_wanted, + cluster_start, + num_clusters); + if (!status) + atomic_inc(&osb->alloc_stats.local_data); + } else { + if (min_clusters > (osb->bitmap_cpg - 1)) { + /* The only paths asking for contiguousness + * should know about this already. */ + mlog(ML_ERROR, "minimum allocation requested %u exceeds " + "group bitmap size %u!\n", min_clusters, + osb->bitmap_cpg); + status = -ENOSPC; + goto bail; + } + /* clamp the current request down to a realistic size. */ + if (bits_wanted > (osb->bitmap_cpg - 1)) + bits_wanted = osb->bitmap_cpg - 1; + + status = ocfs2_claim_suballoc_bits(ac, + handle, + bits_wanted, + min_clusters, + &res); + if (!status) { + BUG_ON(res.sr_blkno); /* cluster alloc can't set */ + *cluster_start = + ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, + res.sr_bg_blkno, + res.sr_bit_offset); + atomic_inc(&osb->alloc_stats.bitmap_data); + *num_clusters = res.sr_bits; + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + ac->ac_bits_given += *num_clusters; + +bail: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters) +{ + unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; + + return __ocfs2_claim_clusters(handle, ac, min_clusters, + bits_wanted, cluster_start, num_clusters); +} + +static int ocfs2_block_group_clear_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits, + void (*undo_fn)(unsigned int bit, + unsigned long *bmap)) +{ + int status; + unsigned int tmp; + struct ocfs2_group_desc *undo_bg = NULL; + + /* The caller got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + + trace_ocfs2_block_group_clear_bits(bit_off, num_bits); + + BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + group_bh, + undo_fn ? + OCFS2_JOURNAL_ACCESS_UNDO : + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (undo_fn) { + jbd_lock_bh_state(group_bh); + undo_bg = (struct ocfs2_group_desc *) + bh2jh(group_bh)->b_committed_data; + BUG_ON(!undo_bg); + } + + tmp = num_bits; + while(tmp--) { + ocfs2_clear_bit((bit_off + tmp), + (unsigned long *) bg->bg_bitmap); + if (undo_fn) + undo_fn(bit_off + tmp, + (unsigned long *) undo_bg->bg_bitmap); + } + le16_add_cpu(&bg->bg_free_bits_count, num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + + if (undo_fn) + jbd_unlock_bh_state(group_bh); + + ocfs2_journal_dirty(handle, group_bh); +bail: + return status; +} + +/* + * expects the suballoc inode to already be locked. + */ +static int _ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) +{ + int status = 0; + u32 tmp_used; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group; + + /* The alloc_bh comes from ocfs2_free_dinode() or + * ocfs2_free_clusters(). The callers have all locked the + * allocator and gotten alloc_bh from the lock call. This + * validates the dinode buffer. Any corruption that has happened + * is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); + + trace_ocfs2_free_suballoc_bits( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + (unsigned long long)bg_blkno, + start_bit, count); + + status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, + &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + group = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + + status = ocfs2_block_group_clear_bits(handle, alloc_inode, + group, group_bh, + start_bit, count, undo_fn); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, + start_bit, count); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, + count); + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); + ocfs2_journal_dirty(handle, alloc_bh); + +bail: + brelse(group_bh); + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) +{ + return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, + start_bit, bg_blkno, count, NULL); +} + +int ocfs2_free_dinode(handle_t *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di) +{ + u64 blk = le64_to_cpu(di->i_blkno); + u16 bit = le16_to_cpu(di->i_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + if (di->i_suballoc_loc) + bg_blkno = le64_to_cpu(di->i_suballoc_loc); + return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, + inode_alloc_bh, bit, bg_blkno, 1); +} + +static int _ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) +{ + int status; + u16 bg_start_bit; + u64 bg_blkno; + struct ocfs2_dinode *fe; + + /* You can't ever have a contiguous set of clusters + * bigger than a block group bitmap so we never have to worry + * about looping on them. + * This is expensive. We can safely remove once this stuff has + * gotten tested really well. */ + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + + fe = (struct ocfs2_dinode *) bitmap_bh->b_data; + + ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, + &bg_start_bit); + + trace_ocfs2_free_clusters((unsigned long long)bg_blkno, + (unsigned long long)start_blk, + bg_start_bit, num_clusters); + + status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, + bg_start_bit, bg_blkno, + num_clusters, undo_fn); + if (status < 0) { + mlog_errno(status); + goto out; + } + + ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), + num_clusters); + +out: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_set_bit); +} + +/* + * Give never-used clusters back to the global bitmap. We don't need + * to protect these bits in the undo buffer. + */ +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_clear_bit); +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) +{ + printk("Block Group:\n"); + printk("bg_signature: %s\n", bg->bg_signature); + printk("bg_size: %u\n", bg->bg_size); + printk("bg_bits: %u\n", bg->bg_bits); + printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); + printk("bg_chain: %u\n", bg->bg_chain); + printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); + printk("bg_next_group: %llu\n", + (unsigned long long)bg->bg_next_group); + printk("bg_parent_dinode: %llu\n", + (unsigned long long)bg->bg_parent_dinode); + printk("bg_blkno: %llu\n", + (unsigned long long)bg->bg_blkno); +} + +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) +{ + int i; + + printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); + printk("i_signature: %s\n", fe->i_signature); + printk("i_size: %llu\n", + (unsigned long long)fe->i_size); + printk("i_clusters: %u\n", fe->i_clusters); + printk("i_generation: %u\n", + le32_to_cpu(fe->i_generation)); + printk("id1.bitmap1.i_used: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_used)); + printk("id1.bitmap1.i_total: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_total)); + printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); + printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); + printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); + printk("id2.i_chain.cl_next_free_rec: %u\n", + fe->id2.i_chain.cl_next_free_rec); + for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { + printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_free); + printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_total); + printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, + (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); + } +} + +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). + */ +int ocfs2_lock_allocators(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. + */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { + ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + if (clusters_to_add == 0) + goto out; + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} + +/* + * Read the inode specified by blkno to get suballoc_slot and + * suballoc_bit. + */ +static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) +{ + int status; + struct buffer_head *inode_bh = NULL; + struct ocfs2_dinode *inode_fe; + + trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); + + /* dirty read disk */ + status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); + if (status < 0) { + mlog(ML_ERROR, "read block %llu failed %d\n", + (unsigned long long)blkno, status); + goto bail; + } + + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { + mlog(ML_ERROR, "invalid inode %llu requested\n", + (unsigned long long)blkno); + status = -EINVAL; + goto bail; + } + + if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { + mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", + (unsigned long long)blkno, + (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); + status = -EINVAL; + goto bail; + } + + if (suballoc_slot) + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); + if (suballoc_bit) + *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); + +bail: + brelse(inode_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* + * test whether bit is SET in allocator bitmap or not. on success, 0 + * is returned and *res is 1 for SET; 0 otherwise. when fails, errno + * is returned and *res is meaningless. Call this after you have + * cluster locked against suballoc, or you may get a result based on + * non-up2date contents + */ +static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, + struct inode *suballoc, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, + u16 bit, int *res) +{ + struct ocfs2_dinode *alloc_di; + struct ocfs2_group_desc *group; + struct buffer_head *group_bh = NULL; + u64 bg_blkno; + int status; + + trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, + (unsigned int)bit); + + alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; + if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { + mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", + (unsigned int)bit, + ocfs2_bits_per_group(&alloc_di->id2.i_chain)); + status = -EINVAL; + goto bail; + } + + bg_blkno = group_blkno ? group_blkno : + ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, + &group_bh); + if (status < 0) { + mlog(ML_ERROR, "read group %llu failed %d\n", + (unsigned long long)bg_blkno, status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); + +bail: + brelse(group_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* + * Test if the bit representing this inode (blkno) is set in the + * suballocator. + * + * On success, 0 is returned and *res is 1 for SET; 0 otherwise. + * + * In the event of failure, a negative value is returned and *res is + * meaningless. + * + * Callers must make sure to hold nfs_sync_lock to prevent + * ocfs2_delete_inode() on another node from accessing the same + * suballocator concurrently. + */ +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) +{ + int status; + u64 group_blkno = 0; + u16 suballoc_bit = 0, suballoc_slot = 0; + struct inode *inode_alloc_inode; + struct buffer_head *alloc_bh = NULL; + + trace_ocfs2_test_inode_bit((unsigned long long)blkno); + + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, + &group_blkno, &suballoc_bit); + if (status < 0) { + mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); + goto bail; + } + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + suballoc_slot); + if (!inode_alloc_inode) { + /* the error code could be inaccurate, but we are not able to + * get the correct one. */ + status = -EINVAL; + mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", + (u32)suballoc_slot); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + iput(inode_alloc_inode); + mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", + (u32)suballoc_slot, status); + goto bail; + } + + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, + group_blkno, blkno, suballoc_bit, res); + if (status < 0) + mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + + ocfs2_inode_unlock(inode_alloc_inode, 0); + mutex_unlock(&inode_alloc_inode->i_mutex); + + iput(inode_alloc_inode); + brelse(alloc_bh); +bail: + if (status) + mlog_errno(status); + return status; +} diff --git a/kernel/fs/ocfs2/suballoc.h b/kernel/fs/ocfs2/suballoc.h new file mode 100644 index 000000000..2d2501767 --- /dev/null +++ b/kernel/fs/ocfs2/suballoc.h @@ -0,0 +1,237 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.h + * + * Defines sub allocator api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _CHAINALLOC_H_ +#define _CHAINALLOC_H_ + +struct ocfs2_suballoc_result; +typedef int (group_search_t)(struct inode *, + struct buffer_head *, + u32, /* bits_wanted */ + u32, /* min_bits */ + u64, /* max_block */ + struct ocfs2_suballoc_result *); + /* found bits */ + +struct ocfs2_alloc_context { + struct inode *ac_inode; /* which bitmap are we allocating from? */ + struct buffer_head *ac_bh; /* file entry bh */ + u32 ac_alloc_slot; /* which slot are we allocating from? */ + u32 ac_bits_wanted; + u32 ac_bits_given; +#define OCFS2_AC_USE_LOCAL 1 +#define OCFS2_AC_USE_MAIN 2 +#define OCFS2_AC_USE_INODE 3 +#define OCFS2_AC_USE_META 4 + u32 ac_which; + + /* these are used by the chain search */ + u16 ac_chain; + int ac_disable_chain_relink; + group_search_t *ac_group_search; + + u64 ac_last_group; + u64 ac_max_block; /* Highest block number to allocate. 0 is + is the same as ~0 - unlimited */ + + int ac_find_loc_only; /* hack for reflink operation ordering */ + struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ + + struct ocfs2_alloc_reservation *ac_resv; +}; + +void ocfs2_init_steal_slots(struct ocfs2_super *osb); +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); +static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) +{ + return ac->ac_bits_wanted - ac->ac_bits_given; +} + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac); + +int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + +int ocfs2_claim_metadata(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u64 *suballoc_loc, + u16 *suballoc_bit_start, + u32 *num_bits, + u64 *blkno_start); +int ocfs2_claim_new_inode(handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 *fe_blkno); +int ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters); +/* + * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * number of clusters smaller than the allocation reserved. + */ +int __ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 max_clusters, + u32 *cluster_start, + u32 *num_clusters); + +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count); +int ocfs2_free_dinode(handle_t *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di); +int ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); + +static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) +{ + u64 group = block - (u64) bit; + + return group; +} + +static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, + u64 bg_blkno) +{ + /* This should work for all block group descriptors as only + * the 1st group descriptor of the cluster bitmap is + * different. */ + + if (bg_blkno == osb->first_cluster_group_blkno) + return 0; + + /* the rest of the block groups are located at the beginning + * of their 1st cluster, so a direct translation just + * works. */ + return ocfs2_blocks_to_clusters(osb->sb, bg_blkno); +} + +static inline int ocfs2_is_cluster_bitmap(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno; +} + +/* This is for local alloc ONLY. Others should use the task-specific + * apis above. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); + +/* + * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it + * finds a problem. A caller that wants to check a group descriptor + * without going readonly should read the block with ocfs2_read_block[s]() + * and then checking it with this function. This is only resize, really. + * Everyone else should be using ocfs2_read_group_descriptor(). + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh); +/* + * Read a group descriptor block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The descriptor will be validated with + * ocfs2_validate_group_descriptor(). + */ +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh); + +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); + +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); + + + +/* + * The following two interfaces are for ocfs2_create_inode_in_orphan(). + */ +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno); + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno); + +#endif /* _CHAINALLOC_H_ */ diff --git a/kernel/fs/ocfs2/super.c b/kernel/fs/ocfs2/super.c new file mode 100644 index 000000000..403c5660b --- /dev/null +++ b/kernel/fs/ocfs2/super.c @@ -0,0 +1,2644 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.c + * + * load/unload driver, mount/dismount volumes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "ocfs2_trace.h" + +#include + +#include "ocfs2.h" + +/* this should be the only file to include a version 1 header */ +#include "ocfs1_fs_compat.h" + +#include "alloc.h" +#include "aops.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "export.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "namei.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "xattr.h" +#include "quota.h" +#include "refcounttree.h" +#include "suballoc.h" + +#include "buffer_head_io.h" + +static struct kmem_cache *ocfs2_inode_cachep; +struct kmem_cache *ocfs2_dquot_cachep; +struct kmem_cache *ocfs2_qf_chunk_cachep; + +/* OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own. */ +struct workqueue_struct *ocfs2_wq = NULL; + +static struct dentry *ocfs2_debugfs_root; + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster file system"); + +struct mount_options +{ + unsigned long commit_interval; + unsigned long mount_opt; + unsigned int atime_quantum; + signed short slot; + int localalloc_opt; + unsigned int resv_level; + int dir_resv_level; + char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; +}; + +static int ocfs2_parse_options(struct super_block *sb, char *options, + struct mount_options *mopt, + int is_remount); +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options); +static int ocfs2_show_options(struct seq_file *s, struct dentry *root); +static void ocfs2_put_super(struct super_block *sb); +static int ocfs2_mount_volume(struct super_block *sb); +static int ocfs2_remount(struct super_block *sb, int *flags, char *data); +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); +static int ocfs2_initialize_mem_caches(void); +static void ocfs2_free_mem_caches(void); +static void ocfs2_delete_osb(struct ocfs2_super *osb); + +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); + +static int ocfs2_sync_fs(struct super_block *sb, int wait); + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); +static void ocfs2_release_system_inodes(struct ocfs2_super *osb); +static int ocfs2_check_volume(struct ocfs2_super *osb); +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 sectsize, + struct ocfs2_blockcheck_stats *stats); +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size, + struct ocfs2_blockcheck_stats *stats); +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size); +static struct inode *ocfs2_alloc_inode(struct super_block *sb); +static void ocfs2_destroy_inode(struct inode *inode); +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); +static int ocfs2_enable_quotas(struct ocfs2_super *osb); +static void ocfs2_disable_quotas(struct ocfs2_super *osb); + +static struct dquot **ocfs2_get_dquots(struct inode *inode) +{ + return OCFS2_I(inode)->i_dquot; +} + +static const struct super_operations ocfs2_sops = { + .statfs = ocfs2_statfs, + .alloc_inode = ocfs2_alloc_inode, + .destroy_inode = ocfs2_destroy_inode, + .drop_inode = ocfs2_drop_inode, + .evict_inode = ocfs2_evict_inode, + .sync_fs = ocfs2_sync_fs, + .put_super = ocfs2_put_super, + .remount_fs = ocfs2_remount, + .show_options = ocfs2_show_options, + .quota_read = ocfs2_quota_read, + .quota_write = ocfs2_quota_write, + .get_dquots = ocfs2_get_dquots, +}; + +enum { + Opt_barrier, + Opt_err_panic, + Opt_err_ro, + Opt_intr, + Opt_nointr, + Opt_hb_none, + Opt_hb_local, + Opt_hb_global, + Opt_data_ordered, + Opt_data_writeback, + Opt_atime_quantum, + Opt_slot, + Opt_commit, + Opt_localalloc, + Opt_localflocks, + Opt_stack, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_inode64, + Opt_acl, + Opt_noacl, + Opt_usrquota, + Opt_grpquota, + Opt_coherency_buffered, + Opt_coherency_full, + Opt_resv_level, + Opt_dir_resv_level, + Opt_journal_async_commit, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier=%u"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_intr, "intr"}, + {Opt_nointr, "nointr"}, + {Opt_hb_none, OCFS2_HB_NONE}, + {Opt_hb_local, OCFS2_HB_LOCAL}, + {Opt_hb_global, OCFS2_HB_GLOBAL}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_atime_quantum, "atime_quantum=%u"}, + {Opt_slot, "preferred_slot=%u"}, + {Opt_commit, "commit=%u"}, + {Opt_localalloc, "localalloc=%d"}, + {Opt_localflocks, "localflocks"}, + {Opt_stack, "cluster_stack=%s"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_inode64, "inode64"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_coherency_buffered, "coherency=buffered"}, + {Opt_coherency_full, "coherency=full"}, + {Opt_resv_level, "resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, + {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_err, NULL} +}; + +#ifdef CONFIG_DEBUG_FS +static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) +{ + struct ocfs2_cluster_connection *cconn = osb->cconn; + struct ocfs2_recovery_map *rm = osb->recovery_map; + struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; + int i, out = 0; + + out += snprintf(buf + out, len - out, + "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", + "Device", osb->dev_str, osb->uuid_str, + osb->fs_generation, osb->vol_label); + + out += snprintf(buf + out, len - out, + "%10s => State: %d Flags: 0x%lX\n", "Volume", + atomic_read(&osb->vol_state), osb->osb_flags); + + out += snprintf(buf + out, len - out, + "%10s => Block: %lu Cluster: %d\n", "Sizes", + osb->sb->s_blocksize, osb->s_clustersize); + + out += snprintf(buf + out, len - out, + "%10s => Compat: 0x%X Incompat: 0x%X " + "ROcompat: 0x%X\n", + "Features", osb->s_feature_compat, + osb->s_feature_incompat, osb->s_feature_ro_compat); + + out += snprintf(buf + out, len - out, + "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", + osb->s_mount_opt, osb->s_atime_quantum); + + if (cconn) { + out += snprintf(buf + out, len - out, + "%10s => Stack: %s Name: %*s " + "Version: %d.%d\n", "Cluster", + (*osb->osb_cluster_stack == '\0' ? + "o2cb" : osb->osb_cluster_stack), + cconn->cc_namelen, cconn->cc_name, + cconn->cc_version.pv_major, + cconn->cc_version.pv_minor); + } + + spin_lock(&osb->dc_task_lock); + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Count: %lu WakeSeq: %lu " + "WorkSeq: %lu\n", "DownCnvt", + (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), + osb->blocked_lock_count, osb->dc_wake_sequence, + osb->dc_work_sequence); + spin_unlock(&osb->dc_task_lock); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", + "Recovery", + (osb->recovery_thread_task ? + task_pid_nr(osb->recovery_thread_task) : -1)); + if (rm->rm_used == 0) + out += snprintf(buf + out, len - out, " None\n"); + else { + for (i = 0; i < rm->rm_used; i++) + out += snprintf(buf + out, len - out, " %d", + rm->rm_entries[i]); + out += snprintf(buf + out, len - out, "\n"); + } + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Interval: %lu\n", "Commit", + (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), + osb->osb_commit_interval); + + out += snprintf(buf + out, len - out, + "%10s => State: %d TxnId: %lu NumTxns: %d\n", + "Journal", osb->journal->j_state, + osb->journal->j_trans_id, + atomic_read(&osb->journal->j_num_trans)); + + out += snprintf(buf + out, len - out, + "%10s => GlobalAllocs: %d LocalAllocs: %d " + "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", + "Stats", + atomic_read(&osb->alloc_stats.bitmap_data), + atomic_read(&osb->alloc_stats.local_data), + atomic_read(&osb->alloc_stats.bg_allocs), + atomic_read(&osb->alloc_stats.moves), + atomic_read(&osb->alloc_stats.bg_extends)); + + out += snprintf(buf + out, len - out, + "%10s => State: %u Descriptor: %llu Size: %u bits " + "Default: %u bits\n", + "LocalAlloc", osb->local_alloc_state, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_bits, osb->local_alloc_default_bits); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, + "%10s => InodeSlot: %d StolenInodes: %d, " + "MetaSlot: %d StolenMeta: %d\n", "Steal", + osb->s_inode_steal_slot, + atomic_read(&osb->s_num_inodes_stolen), + osb->s_meta_steal_slot, + atomic_read(&osb->s_num_meta_stolen)); + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, "OrphanScan => "); + out += snprintf(buf + out, len - out, "Local: %u Global: %u ", + os->os_count, os->os_seqno); + out += snprintf(buf + out, len - out, " Last Scan: "); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + out += snprintf(buf + out, len - out, "Disabled\n"); + else + out += snprintf(buf + out, len - out, "%lu seconds ago\n", + (get_seconds() - os->os_scantime.tv_sec)); + + out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", + "Slots", "Num", "RecoGen"); + for (i = 0; i < osb->max_slots; ++i) { + out += snprintf(buf + out, len - out, + "%10s %c %3d %10d\n", + " ", + (i == osb->slot_num ? '*' : ' '), + i, osb->slot_recovery_generations[i]); + } + + return out; +} + +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + struct ocfs2_super *osb = inode->i_private; + char *buf = NULL; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static const struct file_operations ocfs2_osb_debug_fops = { + .open = ocfs2_osb_debug_open, + .release = ocfs2_debug_release, + .read = ocfs2_debug_read, + .llseek = generic_file_llseek, +}; + +static int ocfs2_sync_fs(struct super_block *sb, int wait) +{ + int status; + tid_t target; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (wait) { + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + } else { + ocfs2_schedule_truncate_log_flush(osb, 0); + } + + if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + &target)) { + if (wait) + jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); + } + return 0; +} + +static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) +{ + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + && (ino == USER_QUOTA_SYSTEM_INODE + || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) + return 0; + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + && (ino == GROUP_QUOTA_SYSTEM_INODE + || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) + return 0; + return 1; +} + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->root_inode = new; + + new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->sys_root_inode = new; + + for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; + i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { + if (!ocfs2_need_system_inode(osb, i)) + continue; + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog_errno(status); + /* FIXME: Should ERROR_RO_FS */ + mlog(ML_ERROR, "Unable to load system inode %d, " + "possibly corrupt fs?", i); + goto bail; + } + // the array now has one ref, so drop this one + iput(new); + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; + i < NUM_SYSTEM_INODES; + i++) { + if (!ocfs2_need_system_inode(osb, i)) + continue; + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", + status, i, osb->slot_num); + goto bail; + } + /* the array now has one ref, so drop this one */ + iput(new); + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_release_system_inodes(struct ocfs2_super *osb) +{ + int i; + struct inode *inode; + + for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { + inode = osb->global_system_inodes[i]; + if (inode) { + iput(inode); + osb->global_system_inodes[i] = NULL; + } + } + + inode = osb->sys_root_inode; + if (inode) { + iput(inode); + osb->sys_root_inode = NULL; + } + + inode = osb->root_inode; + if (inode) { + iput(inode); + osb->root_inode = NULL; + } + + if (!osb->local_system_inodes) + return; + + for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { + if (osb->local_system_inodes[i]) { + iput(osb->local_system_inodes[i]); + osb->local_system_inodes[i] = NULL; + } + } + + kfree(osb->local_system_inodes); + osb->local_system_inodes = NULL; +} + +/* We're allocating fs objects, use GFP_NOFS */ +static struct inode *ocfs2_alloc_inode(struct super_block *sb) +{ + struct ocfs2_inode_info *oi; + + oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); + if (!oi) + return NULL; + + oi->i_sync_tid = 0; + oi->i_datasync_tid = 0; + memset(&oi->i_dquot, 0, sizeof(oi->i_dquot)); + + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); + return &oi->vfs_inode; +} + +static void ocfs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); +} + +static void ocfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ocfs2_i_callback); +} + +static unsigned long long ocfs2_max_file_offset(unsigned int bbits, + unsigned int cbits) +{ + unsigned int bytes = 1 << cbits; + unsigned int trim = bytes; + unsigned int bitshift = 32; + + /* + * i_size and all block offsets in ocfs2 are always 64 bits + * wide. i_clusters is 32 bits, in cluster-sized units. So on + * 64 bit platforms, cluster size will be the limiting factor. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBDAF) + BUILD_BUG_ON(sizeof(sector_t) != 8); + /* + * We might be limited by page cache size. + */ + if (bytes > PAGE_CACHE_SIZE) { + bytes = PAGE_CACHE_SIZE; + trim = 1; + /* + * Shift by 31 here so that we don't get larger than + * MAX_LFS_FILESIZE + */ + bitshift = 31; + } +# else + /* + * We are limited by the size of sector_t. Use block size, as + * that's what we expose to the VFS. + */ + bytes = 1 << bbits; + trim = 1; + bitshift = 31; +# endif +#endif + + /* + * Trim by a whole cluster when we can actually approach the + * on-disk limits. Otherwise we can overflow i_clusters when + * an extent start is at the max offset. + */ + return (((unsigned long long)bytes) << bitshift) - trim; +} + +static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +{ + int incompat_features; + int ret = 0; + struct mount_options parsed_options; + struct ocfs2_super *osb = OCFS2_SB(sb); + u32 tmp; + + sync_filesystem(sb); + + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || + !ocfs2_check_set_options(sb, &parsed_options)) { + ret = -EINVAL; + goto out; + } + + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != + (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change data mode on remount\n"); + goto out; + } + + /* Probably don't want this on remount; it might + * mess with other nodes */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && + (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); + goto out; + } + + /* We're going to/from readonly mode. */ + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Disable quota accounting before remounting RO */ + if (*flags & MS_RDONLY) { + ret = ocfs2_susp_quotas(osb, 0); + if (ret < 0) + goto out; + } + /* Lock here so the check of HARD_RO and the potential + * setting of SOFT_RO is atomic. */ + spin_lock(&osb->osb_lock); + if (osb->osb_flags & OCFS2_OSB_HARD_RO) { + mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); + ret = -EROFS; + goto unlock_osb; + } + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + } else { + if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { + mlog(ML_ERROR, "Cannot remount RDWR " + "filesystem due to previous errors.\n"); + ret = -EROFS; + goto unlock_osb; + } + incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); + if (incompat_features) { + mlog(ML_ERROR, "Cannot remount RDWR because " + "of unsupported optional features " + "(%x).\n", incompat_features); + ret = -EINVAL; + goto unlock_osb; + } + sb->s_flags &= ~MS_RDONLY; + osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; + } + trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); +unlock_osb: + spin_unlock(&osb->osb_lock); + /* Enable quota accounting after remounting RW */ + if (!ret && !(*flags & MS_RDONLY)) { + if (sb_any_quota_suspended(sb)) + ret = ocfs2_susp_quotas(osb, 1); + else + ret = ocfs2_enable_quotas(osb); + if (ret < 0) { + /* Return back changes... */ + spin_lock(&osb->osb_lock); + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + goto out; + } + } + } + + if (!ret) { + /* Only save off the new mount options in case of a successful + * remount. */ + osb->s_mount_opt = parsed_options.mount_opt; + osb->s_atime_quantum = parsed_options.atime_quantum; + osb->preferred_slot = parsed_options.slot; + if (parsed_options.commit_interval) + osb->osb_commit_interval = parsed_options.commit_interval; + + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); + } +out: + return ret; +} + +static int ocfs2_sb_probe(struct super_block *sb, + struct buffer_head **bh, + int *sector_size, + struct ocfs2_blockcheck_stats *stats) +{ + int status, tmpstat; + struct ocfs1_vol_disk_hdr *hdr; + struct ocfs2_dinode *di; + int blksize; + + *bh = NULL; + + /* may be > 512 */ + *sector_size = bdev_logical_block_size(sb->s_bdev); + if (*sector_size > OCFS2_MAX_BLOCKSIZE) { + mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", + *sector_size, OCFS2_MAX_BLOCKSIZE); + status = -EINVAL; + goto bail; + } + + /* Can this really happen? */ + if (*sector_size < OCFS2_MIN_BLOCKSIZE) + *sector_size = OCFS2_MIN_BLOCKSIZE; + + /* check block zero for old format */ + status = ocfs2_get_sector(sb, bh, 0, *sector_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; + if (hdr->major_version == OCFS1_MAJOR_VERSION) { + mlog(ML_ERROR, "incompatible version: %u.%u\n", + hdr->major_version, hdr->minor_version); + status = -EINVAL; + } + if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, + strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { + mlog(ML_ERROR, "incompatible volume signature: %8s\n", + hdr->signature); + status = -EINVAL; + } + brelse(*bh); + *bh = NULL; + if (status < 0) { + mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " + "upgraded before mounting with ocfs v2\n"); + goto bail; + } + + /* + * Now check at magic offset for 512, 1024, 2048, 4096 + * blocksizes. 4096 is the maximum blocksize because it is + * the minimum clustersize. + */ + status = -EINVAL; + for (blksize = *sector_size; + blksize <= OCFS2_MAX_BLOCKSIZE; + blksize <<= 1) { + tmpstat = ocfs2_get_sector(sb, bh, + OCFS2_SUPER_BLOCK_BLKNO, + blksize); + if (tmpstat < 0) { + status = tmpstat; + mlog_errno(status); + break; + } + di = (struct ocfs2_dinode *) (*bh)->b_data; + memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + spin_lock_init(&stats->b_lock); + tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); + if (tmpstat < 0) { + brelse(*bh); + *bh = NULL; + } + if (tmpstat != -EAGAIN) { + status = tmpstat; + break; + } + } + +bail: + return status; +} + +static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) +{ + u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; + + if (osb->s_mount_opt & hb_enabled) { + if (ocfs2_mount_local(osb)) { + mlog(ML_ERROR, "Cannot heartbeat on a locally " + "mounted device.\n"); + return -EINVAL; + } + if (ocfs2_userspace_stack(osb)) { + mlog(ML_ERROR, "Userspace stack expected, but " + "o2cb heartbeat arguments passed to mount\n"); + return -EINVAL; + } + if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && + !ocfs2_cluster_o2cb_global_heartbeat(osb)) || + ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && + ocfs2_cluster_o2cb_global_heartbeat(osb))) { + mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); + return -EINVAL; + } + } + + if (!(osb->s_mount_opt & hb_enabled)) { + if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && + !ocfs2_userspace_stack(osb)) { + mlog(ML_ERROR, "Heartbeat has to be started to mount " + "a read-write clustered device.\n"); + return -EINVAL; + } + } + + return 0; +} + +/* + * If we're using a userspace stack, mount should have passed + * a name that matches the disk. If not, mount should not + * have passed a stack. + */ +static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, + struct mount_options *mopt) +{ + if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { + mlog(ML_ERROR, + "cluster stack passed to mount, but this filesystem " + "does not support it\n"); + return -EINVAL; + } + + if (ocfs2_userspace_stack(osb) && + strncmp(osb->osb_cluster_stack, mopt->cluster_stack, + OCFS2_STACK_LABEL_LEN)) { + mlog(ML_ERROR, + "cluster stack passed to mount (\"%s\") does not " + "match the filesystem (\"%s\")\n", + mopt->cluster_stack, + osb->osb_cluster_stack); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) +{ + int type; + struct super_block *sb = osb->sb; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + int status = 0; + + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + if (unsuspend) + status = dquot_resume(sb, type); + else { + struct ocfs2_mem_dqinfo *oinfo; + + /* Cancel periodic syncing before suspending */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + status = dquot_suspend(sb, type); + } + if (status < 0) + break; + } + if (status < 0) + mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " + "remount (error = %d).\n", status); + return status; +} + +static int ocfs2_enable_quotas(struct ocfs2_super *osb) +{ + struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL }; + struct super_block *sb = osb->sb; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[OCFS2_MAXQUOTAS] = { + LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + int status; + int type; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + inode[type] = ocfs2_get_system_file_inode(osb, ino[type], + osb->slot_num); + if (!inode[type]) { + status = -ENOENT; + goto out_quota_off; + } + status = dquot_enable(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); + if (status < 0) + goto out_quota_off; + } + + for (type = 0; type < OCFS2_MAXQUOTAS; type++) + iput(inode[type]); + return 0; +out_quota_off: + ocfs2_disable_quotas(osb); + for (type = 0; type < OCFS2_MAXQUOTAS; type++) + iput(inode[type]); + mlog_errno(status); + return status; +} + +static void ocfs2_disable_quotas(struct ocfs2_super *osb) +{ + int type; + struct inode *inode; + struct super_block *sb = osb->sb; + struct ocfs2_mem_dqinfo *oinfo; + + /* We mostly ignore errors in this function because there's not much + * we can do when we see them */ + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { + if (!sb_has_quota_loaded(sb, type)) + continue; + /* Cancel periodic syncing before we grab dqonoff_mutex */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + inode = igrab(sb->s_dquot.files[type]); + /* Turn off quotas. This will remove all dquot structures from + * memory and so they will be automatically synced to global + * quota files */ + dquot_disable(sb, type, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (!inode) + continue; + iput(inode); + } +} + +static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct dentry *root; + int status, sector_size; + struct mount_options parsed_options; + struct inode *inode = NULL; + struct ocfs2_super *osb = NULL; + struct buffer_head *bh = NULL; + char nodestr[12]; + struct ocfs2_blockcheck_stats stats; + + trace_ocfs2_fill_super(sb, data, silent); + + if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { + status = -EINVAL; + goto read_super_error; + } + + /* probe for superblock */ + status = ocfs2_sb_probe(sb, &bh, §or_size, &stats); + if (status < 0) { + mlog(ML_ERROR, "superblock probe failed!\n"); + goto read_super_error; + } + + status = ocfs2_initialize_super(sb, bh, sector_size, &stats); + osb = OCFS2_SB(sb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + brelse(bh); + bh = NULL; + + if (!ocfs2_check_set_options(sb, &parsed_options)) { + status = -EINVAL; + goto read_super_error; + } + osb->s_mount_opt = parsed_options.mount_opt; + osb->s_atime_quantum = parsed_options.atime_quantum; + osb->preferred_slot = parsed_options.slot; + osb->osb_commit_interval = parsed_options.commit_interval; + + ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); + osb->osb_resv_level = parsed_options.resv_level; + osb->osb_dir_resv_level = parsed_options.resv_level; + if (parsed_options.dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options.resv_level; + else + osb->osb_dir_resv_level = parsed_options.dir_resv_level; + + status = ocfs2_verify_userspace_stack(osb, &parsed_options); + if (status) + goto read_super_error; + + sb->s_magic = OCFS2_SUPER_MAGIC; + + sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + * heartbeat=none */ + if (bdev_read_only(sb->s_bdev)) { + if (!(sb->s_flags & MS_RDONLY)) { + status = -EACCES; + mlog(ML_ERROR, "Readonly device detected but readonly " + "mount was not specified.\n"); + goto read_super_error; + } + + /* You should not be able to start a local heartbeat + * on a readonly device. */ + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + status = -EROFS; + mlog(ML_ERROR, "Local heartbeat specified on readonly " + "device.\n"); + goto read_super_error; + } + + status = ocfs2_check_journals_nolocks(osb); + if (status < 0) { + if (status == -EROFS) + mlog(ML_ERROR, "Recovery required on readonly " + "file system, but write access is " + "unavailable.\n"); + else + mlog_errno(status); + goto read_super_error; + } + + ocfs2_set_ro_flag(osb, 1); + + printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " + "Cluster services will not be used for this mount. " + "Recovery will be skipped.\n", osb->dev_str); + } + + if (!ocfs2_is_hard_readonly(osb)) { + if (sb->s_flags & MS_RDONLY) + ocfs2_set_ro_flag(osb, 0); + } + + status = ocfs2_verify_heartbeat(osb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + + osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, + ocfs2_debugfs_root); + if (!osb->osb_debug_root) { + status = -EINVAL; + mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); + goto read_super_error; + } + + osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_osb_debug_fops); + if (!osb->osb_ctxt) { + status = -EINVAL; + mlog_errno(status); + goto read_super_error; + } + + if (ocfs2_meta_ecc(osb)) { + status = ocfs2_blockcheck_stats_debugfs_install( + &osb->osb_ecc_stats, + osb->osb_debug_root); + if (status) { + mlog(ML_ERROR, + "Unable to create blockcheck statistics " + "files\n"); + goto read_super_error; + } + } + + status = ocfs2_mount_volume(sb); + if (status < 0) + goto read_super_error; + + if (osb->root_inode) + inode = igrab(osb->root_inode); + + if (!inode) { + status = -EIO; + mlog_errno(status); + goto read_super_error; + } + + root = d_make_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); + + printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " + "with %s data mode.\n", + osb->dev_str, nodestr, osb->slot_num, + osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : + "ordered"); + + atomic_set(&osb->vol_state, VOLUME_MOUNTED); + wake_up(&osb->osb_mount_event); + + /* Now we can initialize quotas because we can afford to wait + * for cluster locks recovery now. That also means that truncation + * log recovery can happen but that waits for proper quota setup */ + if (!(sb->s_flags & MS_RDONLY)) { + status = ocfs2_enable_quotas(osb); + if (status < 0) { + /* We have to err-out specially here because + * s_root is already set */ + mlog_errno(status); + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + return status; + } + } + + ocfs2_complete_quota_recovery(osb); + + /* Now we wake up again for processes waiting for quotas */ + atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); + wake_up(&osb->osb_mount_event); + + /* Start this when the mount is almost sure of being successful */ + ocfs2_orphan_scan_start(osb); + + return status; + +read_super_error: + brelse(bh); + + if (osb) { + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + } + + if (status) + mlog_errno(status); + return status; +} + +static struct dentry *ocfs2_mount(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); +} + +static struct file_system_type ocfs2_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2", + .mount = ocfs2_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, + .next = NULL +}; +MODULE_ALIAS_FS("ocfs2"); + +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options) +{ + if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && + !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { + mlog(ML_ERROR, "ACL support requested but extended attributes " + "feature is not enabled\n"); + return 0; + } + /* No ACL setting specified? Use XATTR feature... */ + if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | + OCFS2_MOUNT_NO_POSIX_ACL))) { + if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) + options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + else + options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + } + return 1; +} + +static int ocfs2_parse_options(struct super_block *sb, + char *options, + struct mount_options *mopt, + int is_remount) +{ + int status, user_stack = 0; + char *p; + u32 tmp; + + trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); + + mopt->commit_interval = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; + mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = -1; + mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; + + if (!options) { + status = 1; + goto bail; + } + + while ((p = strsep(&options, ",")) != NULL) { + int token, option; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_hb_local: + mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_hb_none: + mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; + break; + case Opt_hb_global: + mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; + break; + case Opt_barrier: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + mopt->mount_opt |= OCFS2_MOUNT_BARRIER; + else + mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_nointr: + mopt->mount_opt |= OCFS2_MOUNT_NOINTR; + break; + case Opt_err_panic: + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_data_ordered: + mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_data_writeback: + mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_user_xattr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_nouser_xattr: + mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_atime_quantum: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= 0) + mopt->atime_quantum = option; + break; + case Opt_slot: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + mopt->slot = (s16)option; + break; + case Opt_commit: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option < 0) + return 0; + if (option == 0) + option = JBD2_DEFAULT_MAX_COMMIT_AGE; + mopt->commit_interval = HZ * option; + break; + case Opt_localalloc: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= 0) + mopt->localalloc_opt = option; + break; + case Opt_localflocks: + /* + * Changing this during remount could race + * flock() requests, or "unbalance" existing + * ones (e.g., a lock is taken in one mode but + * dropped in the other). If users care enough + * to flip locking modes during remount, we + * could add a "local" flag to individual + * flock structures for proper tracking of + * state. + */ + if (!is_remount) + mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; + break; + case Opt_stack: + /* Check both that the option we were passed + * is of the right length and that it is a proper + * string of the right length. + */ + if (((args[0].to - args[0].from) != + OCFS2_STACK_LABEL_LEN) || + (strnlen(args[0].from, + OCFS2_STACK_LABEL_LEN) != + OCFS2_STACK_LABEL_LEN)) { + mlog(ML_ERROR, + "Invalid cluster_stack option\n"); + status = 0; + goto bail; + } + memcpy(mopt->cluster_stack, args[0].from, + OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have + * an osb to pass to + * ocfs2_userspace_stack(). + */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + user_stack = 1; + break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; + case Opt_usrquota: + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; + case Opt_coherency_buffered: + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_coherency_full: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_acl: + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; + break; + case Opt_noacl: + mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + break; + case Opt_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = option; + break; + case Opt_dir_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; + case Opt_journal_async_commit: + mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + break; + default: + mlog(ML_ERROR, + "Unrecognized mount option \"%s\" " + "or missing value\n", p); + status = 0; + goto bail; + } + } + + if (user_stack == 0) { + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } + } + + status = 1; + +bail: + return status; +} + +static int ocfs2_show_options(struct seq_file *s, struct dentry *root) +{ + struct ocfs2_super *osb = OCFS2_SB(root->d_sb); + unsigned long opts = osb->s_mount_opt; + unsigned int local_alloc_megs; + + if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { + seq_printf(s, ",_netdev"); + if (opts & OCFS2_MOUNT_HB_LOCAL) + seq_printf(s, ",%s", OCFS2_HB_LOCAL); + else + seq_printf(s, ",%s", OCFS2_HB_GLOBAL); + } else + seq_printf(s, ",%s", OCFS2_HB_NONE); + + if (opts & OCFS2_MOUNT_NOINTR) + seq_printf(s, ",nointr"); + + if (opts & OCFS2_MOUNT_DATA_WRITEBACK) + seq_printf(s, ",data=writeback"); + else + seq_printf(s, ",data=ordered"); + + if (opts & OCFS2_MOUNT_BARRIER) + seq_printf(s, ",barrier=1"); + + if (opts & OCFS2_MOUNT_ERRORS_PANIC) + seq_printf(s, ",errors=panic"); + else + seq_printf(s, ",errors=remount-ro"); + + if (osb->preferred_slot != OCFS2_INVALID_SLOT) + seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); + + seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); + + if (osb->osb_commit_interval) + seq_printf(s, ",commit=%u", + (unsigned) (osb->osb_commit_interval / HZ)); + + local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); + if (local_alloc_megs != ocfs2_la_default_mb(osb)) + seq_printf(s, ",localalloc=%d", local_alloc_megs); + + if (opts & OCFS2_MOUNT_LOCALFLOCKS) + seq_printf(s, ",localflocks,"); + + if (osb->osb_cluster_stack[0]) + seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, + osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_USRQUOTA) + seq_printf(s, ",usrquota"); + if (opts & OCFS2_MOUNT_GRPQUOTA) + seq_printf(s, ",grpquota"); + + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) + seq_printf(s, ",coherency=buffered"); + else + seq_printf(s, ",coherency=full"); + + if (opts & OCFS2_MOUNT_NOUSERXATTR) + seq_printf(s, ",nouser_xattr"); + else + seq_printf(s, ",user_xattr"); + + if (opts & OCFS2_MOUNT_INODE64) + seq_printf(s, ",inode64"); + + if (opts & OCFS2_MOUNT_POSIX_ACL) + seq_printf(s, ",acl"); + else + seq_printf(s, ",noacl"); + + if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) + seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + + if (osb->osb_dir_resv_level != osb->osb_resv_level) + seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + + if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) + seq_printf(s, ",journal_async_commit"); + + return 0; +} + +static int __init ocfs2_init(void) +{ + int status; + + status = init_ocfs2_uptodate_cache(); + if (status < 0) + goto out1; + + status = ocfs2_initialize_mem_caches(); + if (status < 0) + goto out2; + + ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + if (!ocfs2_wq) { + status = -ENOMEM; + goto out3; + } + + ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); + if (!ocfs2_debugfs_root) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); + goto out4; + } + + ocfs2_set_locking_protocol(); + + status = register_quota_format(&ocfs2_quota_format); + if (status < 0) + goto out4; + status = register_filesystem(&ocfs2_fs_type); + if (!status) + return 0; + + unregister_quota_format(&ocfs2_quota_format); +out4: + destroy_workqueue(ocfs2_wq); + debugfs_remove(ocfs2_debugfs_root); +out3: + ocfs2_free_mem_caches(); +out2: + exit_ocfs2_uptodate_cache(); +out1: + mlog_errno(status); + return status; +} + +static void __exit ocfs2_exit(void) +{ + if (ocfs2_wq) { + flush_workqueue(ocfs2_wq); + destroy_workqueue(ocfs2_wq); + } + + unregister_quota_format(&ocfs2_quota_format); + + debugfs_remove(ocfs2_debugfs_root); + + ocfs2_free_mem_caches(); + + unregister_filesystem(&ocfs2_fs_type); + + exit_ocfs2_uptodate_cache(); +} + +static void ocfs2_put_super(struct super_block *sb) +{ + trace_ocfs2_put_super(sb); + + ocfs2_sync_blockdev(sb); + ocfs2_dismount_volume(sb, 0); +} + +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ocfs2_super *osb; + u32 numbits, freebits; + int status; + struct ocfs2_dinode *bm_lock; + struct buffer_head *bh = NULL; + struct inode *inode = NULL; + + trace_ocfs2_statfs(dentry->d_sb, buf); + + osb = OCFS2_SB(dentry->d_sb); + + inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + mlog(ML_ERROR, "failed to get bitmap inode\n"); + status = -EIO; + goto bail; + } + + status = ocfs2_inode_lock(inode, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bm_lock = (struct ocfs2_dinode *) bh->b_data; + + numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); + freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); + + buf->f_type = OCFS2_SUPER_MAGIC; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_namelen = OCFS2_MAX_FILENAME_LEN; + buf->f_blocks = ((sector_t) numbits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); + buf->f_bfree = ((sector_t) freebits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); + buf->f_bavail = buf->f_bfree; + buf->f_files = numbits; + buf->f_ffree = freebits; + buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) + & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, + OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; + + brelse(bh); + + ocfs2_inode_unlock(inode, 0); + status = 0; +bail: + if (inode) + iput(inode); + + if (status) + mlog_errno(status); + + return status; +} + +static void ocfs2_inode_init_once(void *data) +{ + struct ocfs2_inode_info *oi = data; + + oi->ip_flags = 0; + oi->ip_open_count = 0; + spin_lock_init(&oi->ip_lock); + ocfs2_extent_map_init(&oi->vfs_inode); + INIT_LIST_HEAD(&oi->ip_io_markers); + oi->ip_dir_start_lookup = 0; + mutex_init(&oi->ip_unaligned_aio); + init_rwsem(&oi->ip_alloc_sem); + init_rwsem(&oi->ip_xattr_sem); + mutex_init(&oi->ip_io_mutex); + + oi->ip_blkno = 0ULL; + oi->ip_clusters = 0; + + ocfs2_resv_init_once(&oi->ip_la_data_resv); + + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); + ocfs2_lock_res_init_once(&oi->ip_inode_lockres); + ocfs2_lock_res_init_once(&oi->ip_open_lockres); + + init_waitqueue_head(&oi->append_dio_wq); + + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), + &ocfs2_inode_caching_ops); + + inode_init_once(&oi->vfs_inode); +} + +static int ocfs2_initialize_mem_caches(void) +{ + ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", + sizeof(struct ocfs2_inode_info), + 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + ocfs2_inode_init_once); + ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", + sizeof(struct ocfs2_dquot), + 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", + sizeof(struct ocfs2_quota_chunk), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + NULL); + if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || + !ocfs2_qf_chunk_cachep) { + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); + return -ENOMEM; + } + + return 0; +} + +static void ocfs2_free_mem_caches(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + ocfs2_inode_cachep = NULL; + + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + ocfs2_dquot_cachep = NULL; + + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); + ocfs2_qf_chunk_cachep = NULL; +} + +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size) +{ + if (!sb_set_blocksize(sb, sect_size)) { + mlog(ML_ERROR, "unable to set blocksize\n"); + return -EIO; + } + + *bh = sb_getblk(sb, block); + if (!*bh) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + lock_buffer(*bh); + if (!buffer_dirty(*bh)) + clear_buffer_uptodate(*bh); + unlock_buffer(*bh); + ll_rw_block(READ, 1, bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + mlog_errno(-EIO); + brelse(*bh); + *bh = NULL; + return -EIO; + } + + return 0; +} + +static int ocfs2_mount_volume(struct super_block *sb) +{ + int status = 0; + int unlock_super = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_is_hard_readonly(osb)) + goto leave; + + status = ocfs2_dlm_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + unlock_super = 1; + + /* This will load up the node map and add ourselves to it. */ + status = ocfs2_find_slot(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* load all node-local system inodes */ + status = ocfs2_init_local_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_check_volume(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_truncate_log_init(osb); + if (status < 0) + mlog_errno(status); + +leave: + if (unlock_super) + ocfs2_super_unlock(osb, 1); + + return status; +} + +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) +{ + int tmp, hangup_needed = 0; + struct ocfs2_super *osb = NULL; + char nodestr[12]; + + trace_ocfs2_dismount_volume(sb); + + BUG_ON(!sb); + osb = OCFS2_SB(sb); + BUG_ON(!osb); + + debugfs_remove(osb->osb_ctxt); + + /* Orphan scan should be stopped as early as possible */ + ocfs2_orphan_scan_stop(osb); + + ocfs2_disable_quotas(osb); + + /* All dquots should be freed by now */ + WARN_ON(!llist_empty(&osb->dquot_drop_list)); + /* Wait for worker to be done with the work structure in osb */ + cancel_work_sync(&osb->dquot_drop_work); + + ocfs2_shutdown_local_alloc(osb); + + ocfs2_truncate_log_shutdown(osb); + + /* This will disable recovery and flush any recovery work. */ + ocfs2_recovery_exit(osb); + + ocfs2_journal_shutdown(osb); + + ocfs2_sync_blockdev(sb); + + ocfs2_purge_refcount_trees(osb); + + /* No cluster connection means we've failed during mount, so skip + * all the steps which depended on that to complete. */ + if (osb->cconn) { + tmp = ocfs2_super_lock(osb, 1); + if (tmp < 0) { + mlog_errno(tmp); + return; + } + } + + if (osb->slot_num != OCFS2_INVALID_SLOT) + ocfs2_put_slot(osb); + + if (osb->cconn) + ocfs2_super_unlock(osb, 1); + + ocfs2_release_system_inodes(osb); + + /* + * If we're dismounting due to mount error, mount.ocfs2 will clean + * up heartbeat. If we're a local mount, there is no heartbeat. + * If we failed before we got a uuid_str yet, we can't stop + * heartbeat. Otherwise, do it. + */ + if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && + !ocfs2_is_hard_readonly(osb)) + hangup_needed = 1; + + if (osb->cconn) + ocfs2_dlm_shutdown(osb, hangup_needed); + + ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); + debugfs_remove(osb->osb_debug_root); + + if (hangup_needed) + ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); + + atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); + + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); + + printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", + osb->dev_str, nodestr); + + ocfs2_delete_osb(osb); + kfree(osb); + sb->s_dev = 0; + sb->s_fs_info = NULL; +} + +static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, + unsigned uuid_bytes) +{ + int i, ret; + char *ptr; + + BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); + + osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); + if (osb->uuid_str == NULL) + return -ENOMEM; + + for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { + /* print with null */ + ret = snprintf(ptr, 3, "%02X", uuid[i]); + if (ret != 2) /* drop super cleans up */ + return -EINVAL; + /* then only advance past the last char */ + ptr += 2; + } + + return 0; +} + +/* Make sure entire volume is addressable by our journal. Requires + osb_clusters_at_boot to be valid and for the journal to have been + initialized by ocfs2_journal_init(). */ +static int ocfs2_journal_addressable(struct ocfs2_super *osb) +{ + int status = 0; + u64 max_block = + ocfs2_clusters_to_blocks(osb->sb, + osb->osb_clusters_at_boot) - 1; + + /* 32-bit block number is always OK. */ + if (max_block <= (u32)~0ULL) + goto out; + + /* Volume is "huge", so see if our journal is new enough to + support it. */ + if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_JBD2_SB) && + jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT))) { + mlog(ML_ERROR, "The journal cannot address the entire volume. " + "Enable the 'block64' journal option with tunefs.ocfs2"); + status = -EFBIG; + goto out; + } + + out: + return status; +} + +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size, + struct ocfs2_blockcheck_stats *stats) +{ + int status; + int i, cbits, bbits; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + struct inode *inode = NULL; + struct ocfs2_journal *journal; + struct ocfs2_super *osb; + u64 total_blocks; + + osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); + if (!osb) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + sb->s_fs_info = osb; + sb->s_op = &ocfs2_sops; + sb->s_d_op = &ocfs2_dentry_ops; + sb->s_export_op = &ocfs2_export_ops; + sb->s_qcop = &dquot_quotactl_sysfile_ops; + sb->dq_op = &ocfs2_quota_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_xattr = ocfs2_xattr_handlers; + sb->s_time_gran = 1; + sb->s_flags |= MS_NOATIME; + /* this is needed to support O_LARGEFILE */ + cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); + bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + memcpy(sb->s_uuid, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid)); + + osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; + + for (i = 0; i < 3; i++) + osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); + osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); + + osb->sb = sb; + /* Save off for ocfs2_rw_direct */ + osb->s_sectsize_bits = blksize_bits(sector_size); + BUG_ON(!osb->s_sectsize_bits); + + spin_lock_init(&osb->dc_task_lock); + init_waitqueue_head(&osb->dc_event); + osb->dc_work_sequence = 0; + osb->dc_wake_sequence = 0; + INIT_LIST_HEAD(&osb->blocked_lock_list); + osb->blocked_lock_count = 0; + spin_lock_init(&osb->osb_lock); + spin_lock_init(&osb->osb_xattr_lock); + ocfs2_init_steal_slots(osb); + + mutex_init(&osb->system_file_mutex); + + atomic_set(&osb->alloc_stats.moves, 0); + atomic_set(&osb->alloc_stats.local_data, 0); + atomic_set(&osb->alloc_stats.bitmap_data, 0); + atomic_set(&osb->alloc_stats.bg_allocs, 0); + atomic_set(&osb->alloc_stats.bg_extends, 0); + + /* Copy the blockcheck stats from the superblock probe */ + osb->osb_ecc_stats = *stats; + + ocfs2_init_node_maps(osb); + + snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); + if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { + mlog(ML_ERROR, "Invalid number of node slots (%u)\n", + osb->max_slots); + status = -EINVAL; + goto bail; + } + + ocfs2_orphan_scan_init(osb); + + status = ocfs2_recovery_init(osb); + if (status) { + mlog(ML_ERROR, "Unable to initialize recovery state\n"); + mlog_errno(status); + goto bail; + } + + init_waitqueue_head(&osb->checkpoint_event); + + osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + + osb->slot_num = OCFS2_INVALID_SLOT; + + osb->s_xattr_inline_size = le16_to_cpu( + di->id2.i_super.s_xattr_inline_size); + + osb->local_alloc_state = OCFS2_LA_UNUSED; + osb->local_alloc_bh = NULL; + INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); + + init_waitqueue_head(&osb->osb_mount_event); + + status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); + if (status) { + mlog_errno(status); + goto bail; + } + + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); + if (!osb->vol_label) { + mlog(ML_ERROR, "unable to alloc vol label\n"); + status = -ENOMEM; + goto bail; + } + + osb->slot_recovery_generations = + kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), + GFP_KERNEL); + if (!osb->slot_recovery_generations) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + init_waitqueue_head(&osb->osb_wipe_event); + osb->osb_orphan_wipes = kcalloc(osb->max_slots, + sizeof(*osb->osb_orphan_wipes), + GFP_KERNEL); + if (!osb->osb_orphan_wipes) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + osb->osb_rf_lock_tree = RB_ROOT; + + osb->s_feature_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); + osb->s_feature_ro_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); + osb->s_feature_incompat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); + + if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount because of unsupported " + "optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + if (!(osb->sb->s_flags & MS_RDONLY) && + (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount RDWR because of " + "unsupported optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + + if (ocfs2_clusterinfo_valid(osb)) { + osb->osb_stackflags = + OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; + strlcpy(osb->osb_cluster_stack, + OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, + OCFS2_STACK_LABEL_LEN + 1); + if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { + mlog(ML_ERROR, + "couldn't mount because of an invalid " + "cluster stack label (%s) \n", + osb->osb_cluster_stack); + status = -EINVAL; + goto bail; + } + strlcpy(osb->osb_cluster_name, + OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, + OCFS2_CLUSTER_NAME_LEN + 1); + } else { + /* The empty string is identical with classic tools that + * don't know about s_cluster_info. */ + osb->osb_cluster_stack[0] = '\0'; + } + + get_random_bytes(&osb->s_next_generation, sizeof(u32)); + + /* FIXME + * This should be done in ocfs2_journal_init(), but unknown + * ordering issues will cause the filesystem to crash. + * If anyone wants to figure out what part of the code + * refers to osb->journal before ocfs2_journal_init() is run, + * be my guest. + */ + /* initialize our journal structure */ + + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto bail; + } + osb->journal = journal; + journal->j_osb = osb; + + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = (unsigned long) 1; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); + journal->j_state = OCFS2_JOURNAL_FREE; + + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); + init_llist_head(&osb->dquot_drop_list); + + /* get some pseudo constants for clustersize bits */ + osb->s_clustersize_bits = + le32_to_cpu(di->id2.i_super.s_clustersize_bits); + osb->s_clustersize = 1 << osb->s_clustersize_bits; + + if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || + osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { + mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", + osb->s_clustersize); + status = -EINVAL; + goto bail; + } + + total_blocks = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(di->i_clusters)); + + status = generic_check_addressable(osb->sb->s_blocksize_bits, + total_blocks); + if (status) { + mlog(ML_ERROR, "Volume too large " + "to mount safely on this system"); + status = -EFBIG; + goto bail; + } + + if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid))) { + mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); + status = -ENOMEM; + goto bail; + } + + strlcpy(osb->vol_label, di->id2.i_super.s_label, + OCFS2_MAX_VOL_LABEL_LEN); + osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); + osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); + osb->first_cluster_group_blkno = + le64_to_cpu(di->id2.i_super.s_first_cluster_group); + osb->fs_generation = le32_to_cpu(di->i_fs_generation); + osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); + trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, + (unsigned long long)osb->root_blkno, + (unsigned long long)osb->system_dir_blkno, + osb->s_clustersize_bits); + + osb->osb_dlm_debug = ocfs2_new_dlm_debug(); + if (!osb->osb_dlm_debug) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + atomic_set(&osb->vol_state, VOLUME_INIT); + + /* load root, system_dir, and all global system inodes */ + status = ocfs2_init_global_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* + * global bitmap + */ + inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; + iput(inode); + + osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, + osb->s_feature_incompat) * 8; + + status = ocfs2_init_slot_info(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + cleancache_init_shared_fs(sb); + +bail: + return status; +} + +/* + * will return: -EAGAIN if it is ok to keep searching for superblocks + * -EINVAL if there is a bad superblock + * 0 on success + */ +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 blksz, + struct ocfs2_blockcheck_stats *stats) +{ + int status = -EAGAIN; + + if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, + strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { + /* We have to do a raw check of the feature here */ + if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & + OCFS2_FEATURE_INCOMPAT_META_ECC) { + status = ocfs2_block_check_validate(bh->b_data, + bh->b_size, + &di->i_check, + stats); + if (status) + goto out; + } + status = -EINVAL; + if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), + blksz); + } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != + OCFS2_MAJOR_REV_LEVEL || + le16_to_cpu(di->id2.i_super.s_minor_rev_level) != + OCFS2_MINOR_REV_LEVEL) { + mlog(ML_ERROR, "found superblock with bad version: " + "found %u.%u, should be %u.%u\n", + le16_to_cpu(di->id2.i_super.s_major_rev_level), + le16_to_cpu(di->id2.i_super.s_minor_rev_level), + OCFS2_MAJOR_REV_LEVEL, + OCFS2_MINOR_REV_LEVEL); + } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { + mlog(ML_ERROR, "bad block number on superblock: " + "found %llu, should be %llu\n", + (unsigned long long)le64_to_cpu(di->i_blkno), + (unsigned long long)bh->b_blocknr); + } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || + le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { + mlog(ML_ERROR, "bad cluster size found: %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); + } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { + mlog(ML_ERROR, "bad root_blkno: 0\n"); + } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { + mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); + } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { + mlog(ML_ERROR, + "Superblock slots found greater than file system " + "maximum: found %u, max %u\n", + le16_to_cpu(di->id2.i_super.s_max_slots), + OCFS2_MAX_SLOTS); + } else { + /* found it! */ + status = 0; + } + } + +out: + if (status && status != -EAGAIN) + mlog_errno(status); + return status; +} + +static int ocfs2_check_volume(struct ocfs2_super *osb) +{ + int status; + int dirty; + int local; + struct ocfs2_dinode *local_alloc = NULL; /* only used if we + * recover + * ourselves. */ + + /* Init our journal object. */ + status = ocfs2_journal_init(osb->journal, &dirty); + if (status < 0) { + mlog(ML_ERROR, "Could not initialize journal!\n"); + goto finally; + } + + /* Now that journal has been initialized, check to make sure + entire volume is addressable. */ + status = ocfs2_journal_addressable(osb); + if (status) + goto finally; + + /* If the journal was unmounted cleanly then we don't want to + * recover anything. Otherwise, journal_load will do that + * dirty work for us :) */ + if (!dirty) { + status = ocfs2_journal_wipe(osb->journal, 0); + if (status < 0) { + mlog_errno(status); + goto finally; + } + } else { + printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " + "unmounted cleanly, recovering it.\n", osb->dev_str); + } + + local = ocfs2_mount_local(osb); + + /* will play back anything left in the journal. */ + status = ocfs2_journal_load(osb->journal, local, dirty); + if (status < 0) { + mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); + goto finally; + } + + if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) + jbd2_journal_set_features(osb->journal->j_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + else + jbd2_journal_clear_features(osb->journal->j_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + + if (dirty) { + /* recover my local alloc if we didn't unmount cleanly. */ + status = ocfs2_begin_local_alloc_recovery(osb, + osb->slot_num, + &local_alloc); + if (status < 0) { + mlog_errno(status); + goto finally; + } + /* we complete the recovery process after we've marked + * ourselves as mounted. */ + } + + status = ocfs2_load_local_alloc(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + if (dirty) { + /* Recovery will be completed after we've mounted the + * rest of the volume. */ + osb->dirty = 1; + osb->local_alloc_copy = local_alloc; + local_alloc = NULL; + } + + /* go through each journal, trylock it and if you get the + * lock, and it's marked as dirty, set the bit in the recover + * map and launch a recovery thread for it. */ + status = ocfs2_mark_dead_nodes(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + status = ocfs2_compute_replay_slots(osb); + if (status < 0) + mlog_errno(status); + +finally: + kfree(local_alloc); + + if (status) + mlog_errno(status); + return status; +} + +/* + * The routine gets called from dismount or close whenever a dismount on + * volume is requested and the osb open count becomes 1. + * It will remove the osb from the global list and also free up all the + * initialized resources and fileobject. + */ +static void ocfs2_delete_osb(struct ocfs2_super *osb) +{ + /* This function assumes that the caller has the main osb resource */ + + ocfs2_free_slot_info(osb); + + kfree(osb->osb_orphan_wipes); + kfree(osb->slot_recovery_generations); + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the start of ocfs2_initialize_osb(), + * we free it here. + */ + kfree(osb->journal); + kfree(osb->local_alloc_copy); + kfree(osb->uuid_str); + kfree(osb->vol_label); + ocfs2_put_dlm_debug(osb->osb_dlm_debug); + memset(osb, 0, sizeof(struct ocfs2_super)); +} + +/* Put OCFS2 into a readonly state, or (if the user specifies it), + * panic(). We do not support continue-on-error operation. */ +static void ocfs2_handle_error(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + + ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return; + + printk(KERN_CRIT "File system is now read-only due to the potential " + "of on-disk corruption. Please run fsck.ocfs2 once the file " + "system is unmounted.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); +} + +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + /* Not using mlog here because we want to show the actual + * function the error came from. */ + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + ocfs2_handle_error(sb); +} + +/* Handle critical errors. This is intentionally more drastic than + * ocfs2_handle_error, so we only use for things like journal errors, + * etc. */ +void __ocfs2_abort(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + /* We don't have the cluster support yet to go straight to + * hard readonly in here. Until then, we want to keep + * ocfs2_abort() so that we can at least mark critical + * errors. + * + * TODO: This should abort the journal and alert other nodes + * that our slot needs recovery. */ + + /* Force a panic(). This stinks, but it's better than letting + * things continue without having a proper hard readonly + * here. */ + if (!ocfs2_mount_local(OCFS2_SB(sb))) + OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + ocfs2_handle_error(sb); +} + +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. + */ +void ocfs2_block_signals(sigset_t *oldset) +{ + int rc; + sigset_t blocked; + + sigfillset(&blocked); + rc = sigprocmask(SIG_BLOCK, &blocked, oldset); + BUG_ON(rc); +} + +void ocfs2_unblock_signals(sigset_t *oldset) +{ + int rc = sigprocmask(SIG_SETMASK, oldset, NULL); + BUG_ON(rc); +} + +module_init(ocfs2_init); +module_exit(ocfs2_exit); diff --git a/kernel/fs/ocfs2/super.h b/kernel/fs/ocfs2/super.h new file mode 100644 index 000000000..74ff74cf7 --- /dev/null +++ b/kernel/fs/ocfs2/super.h @@ -0,0 +1,53 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SUPER_H +#define OCFS2_SUPER_H + +extern struct workqueue_struct *ocfs2_wq; + +int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, + int node_num); + +__printf(3, 4) +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...); + +#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) + +__printf(3, 4) +void __ocfs2_abort(struct super_block *sb, const char *function, + const char *fmt, ...); + +#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) + +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. + */ +void ocfs2_block_signals(sigset_t *oldset); +void ocfs2_unblock_signals(sigset_t *oldset); + +#endif /* OCFS2_SUPER_H */ diff --git a/kernel/fs/ocfs2/symlink.c b/kernel/fs/ocfs2/symlink.c new file mode 100644 index 000000000..66edce7ec --- /dev/null +++ b/kernel/fs/ocfs2/symlink.c @@ -0,0 +1,100 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * linux/cluster/ssi/cfs/symlink.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE + * or NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * Portions Copyright (C) 2001 Compaq Computer Corporation + * + * ocfs2 symlink handling code. + * + * Copyright (C) 2004, 2005 Oracle. + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "symlink.h" +#include "xattr.h" + +#include "buffer_head_io.h" + + +static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *bh = NULL; + int status = ocfs2_read_inode_block(inode, &bh); + struct ocfs2_dinode *fe; + const char *link; + void *kaddr; + size_t len; + + if (status < 0) { + mlog_errno(status); + return status; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + link = (char *) fe->id2.i_symlink; + /* will be less than a page size */ + len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb)); + kaddr = kmap_atomic(page); + memcpy(kaddr, link, len + 1); + kunmap_atomic(kaddr); + SetPageUptodate(page); + unlock_page(page); + brelse(bh); + return 0; +} + +const struct address_space_operations ocfs2_fast_symlink_aops = { + .readpage = ocfs2_fast_symlink_readpage, +}; + +const struct inode_operations ocfs2_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = ocfs2_getattr, + .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; diff --git a/kernel/fs/ocfs2/symlink.h b/kernel/fs/ocfs2/symlink.h new file mode 100644 index 000000000..71ee4245e --- /dev/null +++ b/kernel/fs/ocfs2/symlink.h @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYMLINK_H +#define OCFS2_SYMLINK_H + +extern const struct inode_operations ocfs2_symlink_inode_operations; +extern const struct address_space_operations ocfs2_fast_symlink_aops; + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) +{ + return (S_ISLNK(inode->i_mode) && + inode->i_blocks == 0); +} + + +#endif /* OCFS2_SYMLINK_H */ diff --git a/kernel/fs/ocfs2/sysfile.c b/kernel/fs/ocfs2/sysfile.c new file mode 100644 index 000000000..af155c183 --- /dev/null +++ b/kernel/fs/ocfs2/sysfile.c @@ -0,0 +1,182 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.c + * + * Initialize, read, write, etc. system files. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "inode.h" +#include "journal.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; +#endif + +static inline int is_global_system_inode(int type) +{ + return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && + type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; +} + +static struct inode **get_local_system_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + int index; + struct inode **local_system_inodes, **free = NULL; + + BUG_ON(slot == OCFS2_INVALID_SLOT); + BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE || + type > OCFS2_LAST_LOCAL_SYSTEM_INODE); + + spin_lock(&osb->osb_lock); + local_system_inodes = osb->local_system_inodes; + spin_unlock(&osb->osb_lock); + + if (unlikely(!local_system_inodes)) { + local_system_inodes = kzalloc(sizeof(struct inode *) * + NUM_LOCAL_SYSTEM_INODES * + osb->max_slots, + GFP_NOFS); + if (!local_system_inodes) { + mlog_errno(-ENOMEM); + /* + * return NULL here so that ocfs2_get_sytem_file_inodes + * will try to create an inode and use it. We will try + * to initialize local_system_inodes next time. + */ + return NULL; + } + + spin_lock(&osb->osb_lock); + if (osb->local_system_inodes) { + /* Someone has initialized it for us. */ + free = local_system_inodes; + local_system_inodes = osb->local_system_inodes; + } else + osb->local_system_inodes = local_system_inodes; + spin_unlock(&osb->osb_lock); + kfree(free); + } + + index = (slot * NUM_LOCAL_SYSTEM_INODES) + + (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE); + + return &local_system_inodes[index]; +} + +struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + struct inode *inode = NULL; + struct inode **arr = NULL; + + /* avoid the lookup if cached in local system file array */ + if (is_global_system_inode(type)) { + arr = &(osb->global_system_inodes[type]); + } else + arr = get_local_system_inode(osb, type, slot); + + mutex_lock(&osb->system_file_mutex); + if (arr && ((inode = *arr) != NULL)) { + /* get a ref in addition to the array ref */ + inode = igrab(inode); + mutex_unlock(&osb->system_file_mutex); + BUG_ON(!inode); + + return inode; + } + + /* this gets one ref thru iget */ + inode = _ocfs2_get_system_file_inode(osb, type, slot); + + /* add one more if putting into array for first time */ + if (arr && inode) { + *arr = igrab(inode); + BUG_ON(!*arr); + } + mutex_unlock(&osb->system_file_mutex); + return inode; +} + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + char namebuf[40]; + struct inode *inode = NULL; + u64 blkno; + int status = 0; + + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); + + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, + strlen(namebuf), &blkno); + if (status < 0) { + goto bail; + } + + inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type); + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + inode = NULL; + goto bail; + } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (type == LOCAL_USER_QUOTA_SYSTEM_INODE || + type == LOCAL_GROUP_QUOTA_SYSTEM_INODE || + type == JOURNAL_SYSTEM_INODE) { + /* Ignore inode lock on these inodes as the lock does not + * really belong to any process and lockdep cannot handle + * that */ + OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL; + } else { + lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres. + l_lockdep_map, + ocfs2_system_inodes[type].si_name, + &ocfs2_sysfile_cluster_lock_key[type], 0); + } +#endif +bail: + + return inode; +} + diff --git a/kernel/fs/ocfs2/sysfile.h b/kernel/fs/ocfs2/sysfile.h new file mode 100644 index 000000000..cc9ea661f --- /dev/null +++ b/kernel/fs/ocfs2/sysfile.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYSFILE_H +#define OCFS2_SYSFILE_H + +struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#endif /* OCFS2_SYSFILE_H */ diff --git a/kernel/fs/ocfs2/uptodate.c b/kernel/fs/ocfs2/uptodate.c new file mode 100644 index 000000000..82e17b076 --- /dev/null +++ b/kernel/fs/ocfs2/uptodate.c @@ -0,0 +1,638 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.c + * + * Tracking the up-to-date-ness of a local buffer_head with respect to + * the cluster. + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Standard buffer head caching flags (uptodate, etc) are insufficient + * in a clustered environment - a buffer may be marked up to date on + * our local node but could have been modified by another cluster + * member. As a result an additional (and performant) caching scheme + * is required. A further requirement is that we consume as little + * memory as possible - we never pin buffer_head structures in order + * to cache them. + * + * We track the existence of up to date buffers on the inodes which + * are associated with them. Because we don't want to pin + * buffer_heads, this is only a (strong) hint and several other checks + * are made in the I/O path to ensure that we don't use a stale or + * invalid buffer without going to disk: + * - buffer_jbd is used liberally - if a bh is in the journal on + * this node then it *must* be up to date. + * - the standard buffer_uptodate() macro is used to detect buffers + * which may be invalid (even if we have an up to date tracking + * item for them) + * + * For a full understanding of how this code works together, one + * should read the callers in dlmglue.c, the I/O functions in + * buffer_head_io.c and ocfs2_journal_access in journal.c + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "inode.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +struct ocfs2_meta_cache_item { + struct rb_node c_node; + sector_t c_block; +}; + +static struct kmem_cache *ocfs2_uptodate_cachep; + +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + return ci->ci_ops->co_owner(ci); +} + +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + return ci->ci_ops->co_get_super(ci); +} + +static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_lock(ci); +} + +static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_unlock(ci); +} + +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_lock(ci); +} + +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_unlock(ci); +} + + +static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci, + int clear) +{ + ci->ci_flags |= OCFS2_CACHE_FL_INLINE; + ci->ci_num_cached = 0; + + if (clear) { + ci->ci_created_trans = 0; + ci->ci_last_trans = 0; + } +} + +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops) +{ + BUG_ON(!ops); + + ci->ci_ops = ops; + ocfs2_metadata_cache_reset(ci, 1); +} + +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci) +{ + ocfs2_metadata_cache_purge(ci); + ocfs2_metadata_cache_reset(ci, 1); +} + + +/* No lock taken here as 'root' is not expected to be visible to other + * processes. */ +static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) +{ + unsigned int purged = 0; + struct rb_node *node; + struct ocfs2_meta_cache_item *item; + + while ((node = rb_last(root)) != NULL) { + item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); + + trace_ocfs2_purge_copied_metadata_tree( + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, root); + kmem_cache_free(ocfs2_uptodate_cachep, item); + + purged++; + } + return purged; +} + +/* Called from locking and called from ocfs2_clear_inode. Dump the + * cache for a given inode. + * + * This function is a few more lines longer than necessary due to some + * accounting done here, but I think it's worth tracking down those + * bugs sooner -- Mark */ +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci) +{ + unsigned int tree, to_purge, purged; + struct rb_root root = RB_ROOT; + + BUG_ON(!ci || !ci->ci_ops); + + ocfs2_metadata_cache_lock(ci); + tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE); + to_purge = ci->ci_num_cached; + + trace_ocfs2_metadata_cache_purge( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, tree); + + /* If we're a tree, save off the root so that we can safely + * initialize the cache. We do the work to free tree members + * without the spinlock. */ + if (tree) + root = ci->ci_cache.ci_tree; + + ocfs2_metadata_cache_reset(ci, 0); + ocfs2_metadata_cache_unlock(ci); + + purged = ocfs2_purge_copied_metadata_tree(&root); + /* If possible, track the number wiped so that we can more + * easily detect counting errors. Unfortunately, this is only + * meaningful for trees. */ + if (tree && purged != to_purge) + mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, purged); +} + +/* Returns the index in the cache array, -1 if not found. + * Requires ip_lock. */ +static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, + sector_t item) +{ + int i; + + for (i = 0; i < ci->ci_num_cached; i++) { + if (item == ci->ci_cache.ci_array[i]) + return i; + } + + return -1; +} + +/* Returns the cache item if found, otherwise NULL. + * Requires ip_lock. */ +static struct ocfs2_meta_cache_item * +ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, + sector_t block) +{ + struct rb_node * n = ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *item = NULL; + + while (n) { + item = rb_entry(n, struct ocfs2_meta_cache_item, c_node); + + if (block < item->c_block) + n = n->rb_left; + else if (block > item->c_block) + n = n->rb_right; + else + return item; + } + + return NULL; +} + +static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + int index = -1; + struct ocfs2_meta_cache_item *item = NULL; + + ocfs2_metadata_cache_lock(ci); + + trace_ocfs2_buffer_cached_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long) bh->b_blocknr, + !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); + + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) + index = ocfs2_search_cache_array(ci, bh->b_blocknr); + else + item = ocfs2_search_cache_tree(ci, bh->b_blocknr); + + ocfs2_metadata_cache_unlock(ci); + + trace_ocfs2_buffer_cached_end(index, item); + + return (index != -1) || (item != NULL); +} + +/* Warning: even if it returns true, this does *not* guarantee that + * the block is stored in our inode metadata cache. + * + * This can be called under lock_buffer() + */ +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + /* Doesn't matter if the bh is in our cache or not -- if it's + * not marked uptodate then we know it can't have correct + * data. */ + if (!buffer_uptodate(bh)) + return 0; + + /* OCFS2 does not allow multiple nodes to be changing the same + * block at the same time. */ + if (buffer_jbd(bh)) + return 1; + + /* Ok, locally the buffer is marked as up to date, now search + * our cache to see if we can trust that. */ + return ocfs2_buffer_cached(ci, bh); +} + +/* + * Determine whether a buffer is currently out on a read-ahead request. + * ci_io_sem should be held to serialize submitters with the logic here. + */ +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh); +} + +/* Requires ip_lock */ +static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, + sector_t block) +{ + BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY); + + trace_ocfs2_append_cache_array( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, ci->ci_num_cached); + + ci->ci_cache.ci_array[ci->ci_num_cached] = block; + ci->ci_num_cached++; +} + +/* By now the caller should have checked that the item does *not* + * exist in the tree. + * Requires ip_lock. */ +static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *new) +{ + sector_t block = new->c_block; + struct rb_node *parent = NULL; + struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *tmp; + + trace_ocfs2_insert_cache_tree( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, ci->ci_num_cached); + + while(*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); + + if (block < tmp->c_block) + p = &(*p)->rb_left; + else if (block > tmp->c_block) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate block %llu cached!\n", + (unsigned long long) block); + BUG(); + } + } + + rb_link_node(&new->c_node, parent, p); + rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached++; +} + +/* co_cache_lock() must be held */ +static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci) +{ + return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) && + (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY); +} + +/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the + * pointers in tree after we use them - this allows caller to detect + * when to free in case of error. + * + * The co_cache_lock() must be held. */ +static void ocfs2_expand_cache(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item **tree) +{ + int i; + + mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY, + "Owner %llu, num cached = %u, should be %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY); + mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE), + "Owner %llu not marked as inline anymore!\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci)); + + /* Be careful to initialize the tree members *first* because + * once the ci_tree is used, the array is junk... */ + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) + tree[i]->c_block = ci->ci_cache.ci_array[i]; + + ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE; + ci->ci_cache.ci_tree = RB_ROOT; + /* this will be set again by __ocfs2_insert_cache_tree */ + ci->ci_num_cached = 0; + + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { + __ocfs2_insert_cache_tree(ci, tree[i]); + tree[i] = NULL; + } + + trace_ocfs2_expand_cache( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_flags, ci->ci_num_cached); +} + +/* Slow path function - memory allocation is necessary. See the + * comment above ocfs2_set_buffer_uptodate for more information. */ +static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + sector_t block, + int expand_tree) +{ + int i; + struct ocfs2_meta_cache_item *new = NULL; + struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] = + { NULL, }; + + trace_ocfs2_set_buffer_uptodate( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, expand_tree); + + new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); + if (!new) { + mlog_errno(-ENOMEM); + return; + } + new->c_block = block; + + if (expand_tree) { + /* Do *not* allocate an array here - the removal code + * has no way of tracking that. */ + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { + tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, + GFP_NOFS); + if (!tree[i]) { + mlog_errno(-ENOMEM); + goto out_free; + } + + /* These are initialized in ocfs2_expand_cache! */ + } + } + + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { + /* Ok, items were removed from the cache in between + * locks. Detect this and revert back to the fast path */ + ocfs2_append_cache_array(ci, block); + ocfs2_metadata_cache_unlock(ci); + goto out_free; + } + + if (expand_tree) + ocfs2_expand_cache(ci, tree); + + __ocfs2_insert_cache_tree(ci, new); + ocfs2_metadata_cache_unlock(ci); + + new = NULL; +out_free: + if (new) + kmem_cache_free(ocfs2_uptodate_cachep, new); + + /* If these were used, then ocfs2_expand_cache re-set them to + * NULL for us. */ + if (tree[0]) { + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) + if (tree[i]) + kmem_cache_free(ocfs2_uptodate_cachep, + tree[i]); + } +} + +/* Item insertion is guarded by co_io_lock(), so the insertion path takes + * advantage of this by not rechecking for a duplicate insert during + * the slow case. Additionally, if the cache needs to be bumped up to + * a tree, the code will not recheck after acquiring the lock -- + * multiple paths cannot be expanding to a tree at the same time. + * + * The slow path takes into account that items can be removed + * (including the whole tree wiped and reset) when this process it out + * allocating memory. In those cases, it reverts back to the fast + * path. + * + * Note that this function may actually fail to insert the block if + * memory cannot be allocated. This is not fatal however (but may + * result in a performance penalty) + * + * Readahead buffers can be passed in here before the I/O request is + * completed. + */ +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + int expand; + + /* The block may very well exist in our cache already, so avoid + * doing any more work in that case. */ + if (ocfs2_buffer_cached(ci, bh)) + return; + + trace_ocfs2_set_buffer_uptodate_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr); + + /* No need to recheck under spinlock - insertion is guarded by + * co_io_lock() */ + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { + /* Fast case - it's an array and there's a free + * spot. */ + ocfs2_append_cache_array(ci, bh->b_blocknr); + ocfs2_metadata_cache_unlock(ci); + return; + } + + expand = 0; + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { + /* We need to bump things up to a tree. */ + expand = 1; + } + ocfs2_metadata_cache_unlock(ci); + + __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand); +} + +/* Called against a newly allocated buffer. Most likely nobody should + * be able to read this sort of metadata while it's still being + * allocated, but this is careful to take co_io_lock() anyway. */ +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + /* This should definitely *not* exist in our cache */ + BUG_ON(ocfs2_buffer_cached(ci, bh)); + + set_buffer_uptodate(bh); + + ocfs2_metadata_cache_io_lock(ci); + ocfs2_set_buffer_uptodate(ci, bh); + ocfs2_metadata_cache_io_unlock(ci); +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, + int index) +{ + sector_t *array = ci->ci_cache.ci_array; + int bytes; + + BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY); + BUG_ON(index >= ci->ci_num_cached); + BUG_ON(!ci->ci_num_cached); + + trace_ocfs2_remove_metadata_array( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + index, ci->ci_num_cached); + + ci->ci_num_cached--; + + /* don't need to copy if the array is now empty, or if we + * removed at the tail */ + if (ci->ci_num_cached && index < ci->ci_num_cached) { + bytes = sizeof(sector_t) * (ci->ci_num_cached - index); + memmove(&array[index], &array[index + 1], bytes); + } +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *item) +{ + trace_ocfs2_remove_metadata_tree( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)item->c_block); + + rb_erase(&item->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached--; +} + +static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci, + sector_t block) +{ + int index; + struct ocfs2_meta_cache_item *item = NULL; + + ocfs2_metadata_cache_lock(ci); + trace_ocfs2_remove_block_from_cache( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long) block, ci->ci_num_cached, + ci->ci_flags); + + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { + index = ocfs2_search_cache_array(ci, block); + if (index != -1) + ocfs2_remove_metadata_array(ci, index); + } else { + item = ocfs2_search_cache_tree(ci, block); + if (item) + ocfs2_remove_metadata_tree(ci, item); + } + ocfs2_metadata_cache_unlock(ci); + + if (item) + kmem_cache_free(ocfs2_uptodate_cachep, item); +} + +/* + * Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. + */ +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + sector_t block = bh->b_blocknr; + + ocfs2_remove_block_from_cache(ci, block); +} + +/* Called when we remove xattr clusters from an inode. */ +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, + sector_t block, + u32 c_len) +{ + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len; + + for (i = 0; i < b_len; i++, block++) + ocfs2_remove_block_from_cache(ci, block); +} + +int __init init_ocfs2_uptodate_cache(void) +{ + ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", + sizeof(struct ocfs2_meta_cache_item), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ocfs2_uptodate_cachep) + return -ENOMEM; + + return 0; +} + +void exit_ocfs2_uptodate_cache(void) +{ + if (ocfs2_uptodate_cachep) + kmem_cache_destroy(ocfs2_uptodate_cachep); +} diff --git a/kernel/fs/ocfs2/uptodate.h b/kernel/fs/ocfs2/uptodate.h new file mode 100644 index 000000000..0d826fe2d --- /dev/null +++ b/kernel/fs/ocfs2/uptodate.h @@ -0,0 +1,84 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.h + * + * Cluster uptodate tracking + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_UPTODATE_H +#define OCFS2_UPTODATE_H + +/* + * The caching code relies on locking provided by the user of + * struct ocfs2_caching_info. These operations connect that up. + */ +struct ocfs2_caching_operations { + /* + * A u64 representing the owning structure. Usually this + * is the block number (i_blkno or whatnot). This is used so + * that caching log messages can identify the owning structure. + */ + u64 (*co_owner)(struct ocfs2_caching_info *ci); + + /* The superblock is needed during I/O. */ + struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci); + /* + * Lock and unlock the caching data. These will not sleep, and + * should probably be spinlocks. + */ + void (*co_cache_lock)(struct ocfs2_caching_info *ci); + void (*co_cache_unlock)(struct ocfs2_caching_info *ci); + + /* + * Lock and unlock for disk I/O. These will sleep, and should + * be mutexes. + */ + void (*co_io_lock)(struct ocfs2_caching_info *ci); + void (*co_io_unlock)(struct ocfs2_caching_info *ci); +}; + +int __init init_ocfs2_uptodate_cache(void); +void exit_ocfs2_uptodate_cache(void); + +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops); +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci); + +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci); + +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, + sector_t block, + u32 c_len); +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, + struct buffer_head *bh); + +#endif /* OCFS2_UPTODATE_H */ diff --git a/kernel/fs/ocfs2/xattr.c b/kernel/fs/ocfs2/xattr.c new file mode 100644 index 000000000..d03bfbf3d --- /dev/null +++ b/kernel/fs/ocfs2/xattr.c @@ -0,0 +1,7425 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.c + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is copy from linux/fs/ext3/xattr.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "file.h" +#include "symlink.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" +#include "suballoc.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "super.h" +#include "xattr.h" +#include "refcounttree.h" +#include "acl.h" +#include "ocfs2_trace.h" + +struct ocfs2_xattr_def_value_root { + struct ocfs2_xattr_value_root xv; + struct ocfs2_extent_rec er; +}; + +struct ocfs2_xattr_bucket { + /* The inode these xattrs are associated with */ + struct inode *bu_inode; + + /* The actual buffers that make up the bucket */ + struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + + /* How many blocks make up one bucket for this filesystem */ + int bu_blocks; +}; + +struct ocfs2_xattr_set_ctxt { + handle_t *handle; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + int set_abort; +}; + +#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) +#define OCFS2_XATTR_INLINE_SIZE 80 +#define OCFS2_XATTR_HEADER_GAP 4 +#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ + - sizeof(struct ocfs2_xattr_header) \ + - OCFS2_XATTR_HEADER_GAP) +#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ + - sizeof(struct ocfs2_xattr_block) \ + - sizeof(struct ocfs2_xattr_header) \ + - OCFS2_XATTR_HEADER_GAP) + +static struct ocfs2_xattr_def_value_root def_xv = { + .xv.xr_list.l_count = cpu_to_le16(1), +}; + +const struct xattr_handler *ocfs2_xattr_handlers[] = { + &ocfs2_xattr_user_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &ocfs2_xattr_trusted_handler, + &ocfs2_xattr_security_handler, + NULL +}; + +static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] + = &posix_acl_access_xattr_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] + = &posix_acl_default_xattr_handler, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, +}; + +struct ocfs2_xattr_info { + int xi_name_index; + const char *xi_name; + int xi_name_len; + const void *xi_value; + size_t xi_value_len; +}; + +struct ocfs2_xattr_search { + struct buffer_head *inode_bh; + /* + * xattr_bh point to the block buffer head which has extended attribute + * when extended attribute in inode, xattr_bh is equal to inode_bh. + */ + struct buffer_head *xattr_bh; + struct ocfs2_xattr_header *header; + struct ocfs2_xattr_bucket *bucket; + void *base; + void *end; + struct ocfs2_xattr_entry *here; + int not_found; +}; + +/* Operations on struct ocfs2_xa_entry */ +struct ocfs2_xa_loc; +struct ocfs2_xa_loc_operations { + /* + * Journal functions + */ + int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc, + int type); + void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc); + + /* + * Return a pointer to the appropriate buffer in loc->xl_storage + * at the given offset from loc->xl_header. + */ + void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset); + + /* Can we reuse the existing entry for the new value? */ + int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi); + + /* How much space is needed for the new value? */ + int (*xlo_check_space)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi); + + /* + * Return the offset of the first name+value pair. This is + * the start of our downward-filling free space. + */ + int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc); + + /* + * Remove the name+value at this location. Do whatever is + * appropriate with the remaining name+value pairs. + */ + void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc); + + /* Fill xl_entry with a new entry */ + void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash); + + /* Add name+value storage to an entry */ + void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size); + + /* + * Initialize the value buf's access and bh fields for this entry. + * ocfs2_xa_fill_value_buf() will handle the xv pointer. + */ + void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb); +}; + +/* + * Describes an xattr entry location. This is a memory structure + * tracking the on-disk structure. + */ +struct ocfs2_xa_loc { + /* This xattr belongs to this inode */ + struct inode *xl_inode; + + /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */ + struct ocfs2_xattr_header *xl_header; + + /* Bytes from xl_header to the end of the storage */ + int xl_size; + + /* + * The ocfs2_xattr_entry this location describes. If this is + * NULL, this location describes the on-disk structure where it + * would have been. + */ + struct ocfs2_xattr_entry *xl_entry; + + /* + * Internal housekeeping + */ + + /* Buffer(s) containing this entry */ + void *xl_storage; + + /* Operations on the storage backing this location */ + const struct ocfs2_xa_loc_operations *xl_ops; +}; + +/* + * Convenience functions to calculate how much space is needed for a + * given name+value pair + */ +static int namevalue_size(int name_len, uint64_t value_len) +{ + if (value_len > OCFS2_XATTR_INLINE_SIZE) + return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + else + return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len); +} + +static int namevalue_size_xi(struct ocfs2_xattr_info *xi) +{ + return namevalue_size(xi->xi_name_len, xi->xi_value_len); +} + +static int namevalue_size_xe(struct ocfs2_xattr_entry *xe) +{ + u64 value_len = le64_to_cpu(xe->xe_value_size); + + BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) && + ocfs2_xattr_is_local(xe)); + return namevalue_size(xe->xe_name_len, value_len); +} + + +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset); + +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size); + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); + +typedef int (xattr_tree_rec_func)(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para); +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *root_bh, + xattr_tree_rec_func *rec_func, + void *para); +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para); + +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, + u32 *first_hash); +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_need, + int *credits); +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh); + +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) +{ + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; +} + +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) +{ + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); +} + +#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) +#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) +#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) + +static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode) +{ + struct ocfs2_xattr_bucket *bucket; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET); + + bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS); + if (bucket) { + bucket->bu_inode = inode; + bucket->bu_blocks = blks; + } + + return bucket; +} + +static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket) +{ + int i; + + for (i = 0; i < bucket->bu_blocks; i++) { + brelse(bucket->bu_bhs[i]); + bucket->bu_bhs[i] = NULL; + } +} + +static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) +{ + if (bucket) { + ocfs2_xattr_bucket_relse(bucket); + bucket->bu_inode = NULL; + kfree(bucket); + } +} + +/* + * A bucket that has never been written to disk doesn't need to be + * read. We just need the buffer_heads. Don't call this for + * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes + * them fully. + */ +static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno, int new) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb, + xb_blkno + i); + if (!bucket->bu_bhs[i]) { + rc = -ENOMEM; + mlog_errno(rc); + break; + } + + if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i])) { + if (new) + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + else { + set_buffer_uptodate(bucket->bu_bhs[i]); + ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + } + } + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +/* Read the xattr bucket at xb_blkno */ +static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int rc; + + rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno, + bucket->bu_blocks, bucket->bu_bhs, 0, + NULL); + if (!rc) { + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, + bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + if (rc) + mlog_errno(rc); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +static int ocfs2_xattr_bucket_journal_access(handle_t *handle, + struct ocfs2_xattr_bucket *bucket, + int type) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + rc = ocfs2_journal_access(handle, + INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i], type); + if (rc) { + mlog_errno(rc); + break; + } + } + + return rc; +} + +static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xattr_bucket *bucket) +{ + int i; + + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + + for (i = 0; i < bucket->bu_blocks; i++) + ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); +} + +static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest, + struct ocfs2_xattr_bucket *src) +{ + int i; + int blocksize = src->bu_inode->i_sb->s_blocksize; + + BUG_ON(dest->bu_blocks != src->bu_blocks); + BUG_ON(dest->bu_inode != src->bu_inode); + + for (i = 0; i < src->bu_blocks; i++) { + memcpy(bucket_block(dest, i), bucket_block(src, i), + blocksize); + } +} + +static int ocfs2_validate_xattr_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check); + if (rc) + return rc; + + /* + * Errors after here are fatal + */ + + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ocfs2_error(sb, + "Extended attribute block #%llu has bad " + "signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); + return -EINVAL; + } + + if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extended attribute block #%llu has an " + "invalid xb_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extended attribute block #%llu has an invalid " + "xb_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp, + ocfs2_validate_xattr_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static inline const char *ocfs2_xattr_prefix(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < OCFS2_XATTR_MAX) + handler = ocfs2_xattr_handler_map[name_index]; + + return handler ? handler->prefix : NULL; +} + +static u32 ocfs2_xattr_name_hash(struct inode *inode, + const char *name, + int name_len) +{ + /* Get hash value of uuid from super block */ + u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash; + int i; + + /* hash extended attribute name */ + for (i = 0; i < name_len; i++) { + hash = (hash << OCFS2_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^ + *name++; + } + + return hash; +} + +static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) +{ + return namevalue_size(name_len, value_len) + + sizeof(struct ocfs2_xattr_entry); +} + +static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi) +{ + return namevalue_size_xi(xi) + + sizeof(struct ocfs2_xattr_entry); +} + +static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe) +{ + return namevalue_size_xe(xe) + + sizeof(struct ocfs2_xattr_entry); +} + +int ocfs2_calc_security_init(struct inode *dir, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + struct ocfs2_alloc_context **xattr_ac) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * So reserve one metadata block for it is ok. + */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + s_size > OCFS2_XATTR_FREE_IN_IBODY) { + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + /* reserve clusters for xattr value which will be set in B tree*/ + if (si->value_len > OCFS2_XATTR_INLINE_SIZE) { + int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + return ret; +} + +int ocfs2_calc_xattr_init(struct inode *dir, + struct buffer_head *dir_bh, + umode_t mode, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + int *want_meta) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = 0, a_size = 0, acl_len = 0, new_clusters; + + if (si->enable) + s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + "", NULL, 0); + if (acl_len > 0) { + a_size = ocfs2_xattr_entry_real_size(0, acl_len); + if (S_ISDIR(mode)) + a_size <<= 1; + } else if (acl_len != 0 && acl_len != -ENODATA) { + mlog_errno(ret); + return ret; + } + } + + if (!(s_size + a_size)) + return ret; + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * The max space of acl xattr taken inline is + * 80(value) + 16(entry) * 2(if directory) = 192 bytes, + * when blocksize = 512, may reserve one more cluser for + * xattr bucket, otherwise reserve one metadata block + * for them is ok. + * If this is a new directory with inline data, + * we choose to reserve the entire inline area for + * directory contents and force an external xattr block. + */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || + (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { + *want_meta = *want_meta + 1; + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE && + (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) { + *want_clusters += 1; + *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb); + } + + /* + * reserve credits and clusters for xattrs which has large value + * and have to be set outside + */ + if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) { + new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL && + acl_len > OCFS2_XATTR_INLINE_SIZE) { + /* for directory, it has DEFAULT and ACCESS two types of acls */ + new_clusters = (S_ISDIR(mode) ? 2 : 1) * + ocfs2_clusters_for_bytes(dir->i_sb, acl_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + + return ret; +} + +static int ocfs2_xattr_extend_allocation(struct inode *inode, + u32 clusters_to_add, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int status = 0, credits; + handle_t *handle = ctxt->handle; + enum ocfs2_alloc_restarted why; + u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + while (clusters_to_add) { + trace_ocfs2_xattr_extend_allocation(clusters_to_add); + + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + break; + } + + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(handle, + &et, + &logical_start, + clusters_to_add, + 0, + ctxt->data_ac, + ctxt->meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + break; + } + + ocfs2_journal_dirty(handle, vb->vb_bh); + + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - + prev_clusters; + + if (why != RESTART_NONE && clusters_to_add) { + /* + * We can only fail in case the alloc file doesn't give + * up enough clusters. + */ + BUG_ON(why == RESTART_META); + + credits = ocfs2_calc_extend_credits(inode->i_sb, + &vb->vb_xv->xr_list); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + status = -ENOMEM; + mlog_errno(status); + break; + } + } + } + + return status; +} + +static int __ocfs2_remove_xattr_range(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + u32 cpos, u32 phys_cpos, u32 len, + unsigned int ext_flags, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + handle_t *handle = ctxt->handle; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac, + &ctxt->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + le32_add_cpu(&vb->vb_xv->xr_clusters, -len); + ocfs2_journal_dirty(handle, vb->vb_bh); + + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(inode->i_sb, + phys_blkno), + len, ctxt->meta_ac, &ctxt->dealloc, 1); + else + ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, + phys_blkno, len); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_xattr_shrink_size(struct inode *inode, + u32 old_clusters, + u32 new_clusters, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret = 0; + unsigned int ext_flags; + u32 trunc_len, cpos, phys_cpos, alloc_size; + u64 block; + + if (old_clusters <= new_clusters) + return 0; + + cpos = new_clusters; + trunc_len = old_clusters - new_clusters; + while (trunc_len) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, + &vb->vb_xv->xr_list, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + ret = __ocfs2_remove_xattr_range(inode, vb, cpos, + phys_cpos, alloc_size, + ext_flags, ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + + block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), + block, alloc_size); + cpos += alloc_size; + trunc_len -= alloc_size; + } + +out: + return ret; +} + +static int ocfs2_xattr_value_truncate(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); + u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + + if (new_clusters == old_clusters) + return 0; + + if (new_clusters > old_clusters) + ret = ocfs2_xattr_extend_allocation(inode, + new_clusters - old_clusters, + vb, ctxt); + else + ret = ocfs2_xattr_shrink_size(inode, + old_clusters, new_clusters, + vb, ctxt); + + return ret; +} + +static int ocfs2_xattr_list_entry(char *buffer, size_t size, + size_t *result, const char *prefix, + const char *name, int name_len) +{ + char *p = buffer + *result; + int prefix_len = strlen(prefix); + int total_len = prefix_len + name_len + 1; + + *result += total_len; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + return 0; + + if (*result > size) + return -ERANGE; + + memcpy(p, prefix, prefix_len); + memcpy(p + prefix_len, name, name_len); + p[prefix_len + name_len] = '\0'; + + return 0; +} + +static int ocfs2_xattr_list_entries(struct inode *inode, + struct ocfs2_xattr_header *header, + char *buffer, size_t buffer_size) +{ + size_t result = 0; + int i, type, ret; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); + + ret = ocfs2_xattr_list_entry(buffer, buffer_size, + &result, prefix, name, + entry->xe_name_len); + if (ret) + return ret; + } + } + + return result; +} + +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_xattr_header *xh; + int i; + + xh = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) + if (!ocfs2_xattr_is_local(&xh->xh_entries[i])) + return 1; + + return 0; +} + +static int ocfs2_xattr_ibody_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_xattr_header *header = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); + + return ret; +} + +static int ocfs2_xattr_block_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + ret = ocfs2_xattr_list_entries(inode, header, + buffer, buffer_size); + } else + ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh, + buffer, buffer_size); + + brelse(blk_bh); + + return ret; +} + +ssize_t ocfs2_listxattr(struct dentry *dentry, + char *buffer, + size_t size) +{ + int ret = 0, i_ret = 0, b_ret = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(d_inode(dentry)); + + if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return ret; + + ret = ocfs2_inode_lock(d_inode(dentry), &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + i_ret = ocfs2_xattr_ibody_list(d_inode(dentry), di, buffer, size); + if (i_ret < 0) + b_ret = 0; + else { + if (buffer) { + buffer += i_ret; + size -= i_ret; + } + b_ret = ocfs2_xattr_block_list(d_inode(dentry), di, + buffer, size); + if (b_ret < 0) + i_ret = 0; + } + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(d_inode(dentry), 0); + + brelse(di_bh); + + return i_ret + b_ret; +} + +static int ocfs2_xattr_find_entry(int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *entry; + size_t name_len; + int i, cmp = 1; + + if (name == NULL) + return -EINVAL; + + name_len = strlen(name); + entry = xs->here; + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + cmp = name_index - ocfs2_xattr_get_type(entry); + if (!cmp) + cmp = name_len - entry->xe_name_len; + if (!cmp) + cmp = memcmp(name, (xs->base + + le16_to_cpu(entry->xe_name_offset)), + name_len); + if (cmp == 0) + break; + entry += 1; + } + xs->here = entry; + + return cmp ? -ENODATA : 0; +} + +static int ocfs2_xattr_get_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + void *buffer, + size_t len) +{ + u32 cpos, p_cluster, num_clusters, bpc, clusters; + u64 blkno; + int i, ret = 0; + size_t cplen, blocksize; + struct buffer_head *bh = NULL; + struct ocfs2_extent_list *el; + + el = &xv->xr_list; + clusters = le32_to_cpu(xv->xr_clusters); + bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + blocksize = inode->i_sb->s_blocksize; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + /* Copy ocfs2_xattr_value */ + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + cplen = len >= blocksize ? blocksize : len; + memcpy(buffer, bh->b_data, cplen); + len -= cplen; + buffer += cplen; + + brelse(bh); + bh = NULL; + if (len == 0) + break; + } + cpos += num_clusters; + } +out: + return ret; +} + +static int ocfs2_xattr_ibody_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return -ENODATA; + + xs->end = (void *)di + inode->i_sb->s_blocksize; + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret) + return ret; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + if (size > buffer_size) + return -ERANGE; + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + le16_to_cpu(xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len), size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + le16_to_cpu( + xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len)); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return size; +} + +static int ocfs2_xattr_block_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = -ENODATA, name_offset, name_len, i; + int uninitialized_var(block_off); + + xs->bucket = ocfs2_xattr_bucket_new(inode); + if (!xs->bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto cleanup; + } + + ret = ocfs2_xattr_block_find(inode, name_index, name, xs); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + + if (xs->not_found) { + ret = -ENODATA; + goto cleanup; + } + + xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + ret = -ERANGE; + if (size > buffer_size) + goto cleanup; + + name_offset = le16_to_cpu(xs->here->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len); + i = xs->here - xs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xs->bucket), + i, + &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + xs->base = bucket_block(xs->bucket, block_off); + } + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + name_offset + name_len, size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + name_offset + name_len); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + goto cleanup; + } + } + } + ret = size; +cleanup: + ocfs2_xattr_bucket_free(xs->bucket); + + brelse(xs->xattr_bh); + xs->xattr_bh = NULL; + return ret; +} + +int ocfs2_xattr_get_nolock(struct inode *inode, + struct buffer_head *di_bh, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return -ENODATA; + + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size, &xis); + if (ret == -ENODATA && di->i_xattr_loc) + ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, + buffer_size, &xbs); + + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +static int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct buffer_head *di_bh = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + down_read(&OCFS2_I(inode)->ip_xattr_sem); + ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + name, buffer, buffer_size); + up_read(&OCFS2_I(inode)->ip_xattr_sem); + + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return ret; +} + +static int __ocfs2_xattr_set_value_outside(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_value_buf *vb, + const void *value, + int value_len) +{ + int ret = 0, i, cp_len; + u16 blocksize = inode->i_sb->s_blocksize; + u32 p_cluster, num_clusters; + u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); + u64 blkno; + struct buffer_head *bh = NULL; + unsigned int ext_flags; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + + BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, + INODE_CACHE(inode), + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + cp_len = value_len > blocksize ? blocksize : value_len; + memcpy(bh->b_data, value, cp_len); + value_len -= cp_len; + value += cp_len; + if (cp_len < blocksize) + memset(bh->b_data + cp_len, 0, + blocksize - cp_len); + + ocfs2_journal_dirty(handle, bh); + brelse(bh); + bh = NULL; + + /* + * XXX: do we need to empty all the following + * blocks in this cluster? + */ + if (!value_len) + break; + } + cpos += num_clusters; + } +out: + brelse(bh); + + return ret; +} + +static int ocfs2_xa_check_space_helper(int needed_space, int free_start, + int num_entries) +{ + int free_space; + + if (!needed_space) + return 0; + + free_space = free_start - + sizeof(struct ocfs2_xattr_header) - + (num_entries * sizeof(struct ocfs2_xattr_entry)) - + OCFS2_XATTR_HEADER_GAP; + if (free_space < 0) + return -EIO; + if (free_space < needed_space) + return -ENOSPC; + + return 0; +} + +static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc, + int type) +{ + return loc->xl_ops->xlo_journal_access(handle, loc, type); +} + +static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc) +{ + loc->xl_ops->xlo_journal_dirty(handle, loc); +} + +/* Give a pointer into the storage for the given offset */ +static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset) +{ + BUG_ON(offset >= loc->xl_size); + return loc->xl_ops->xlo_offset_pointer(loc, offset); +} + +/* + * Wipe the name+value pair and allow the storage to reclaim it. This + * must be followed by either removal of the entry or a call to + * ocfs2_xa_add_namevalue(). + */ +static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + loc->xl_ops->xlo_wipe_namevalue(loc); +} + +/* + * Find lowest offset to a name+value pair. This is the start of our + * downward-growing free space. + */ +static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc) +{ + return loc->xl_ops->xlo_get_free_start(loc); +} + +/* Can we reuse loc->xl_entry for xi? */ +static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return loc->xl_ops->xlo_can_reuse(loc, xi); +} + +/* How much free space is needed to set the new value */ +static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return loc->xl_ops->xlo_check_space(loc, xi); +} + +static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + loc->xl_ops->xlo_add_entry(loc, name_hash); + loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); + /* + * We can't leave the new entry's xe_name_offset at zero or + * add_namevalue() will go nuts. We set it to the size of our + * storage so that it can never be less than any other entry. + */ + loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); +} + +static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int size = namevalue_size_xi(xi); + int nameval_offset; + char *nameval_buf; + + loc->xl_ops->xlo_add_namevalue(loc, size); + loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); + loc->xl_entry->xe_name_len = xi->xi_name_len; + ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index); + ocfs2_xattr_set_local(loc->xl_entry, + xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE); + + nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); + memset(nameval_buf, 0, size); + memcpy(nameval_buf, xi->xi_name, xi->xi_name_len); +} + +static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); + + /* Value bufs are for value trees */ + BUG_ON(ocfs2_xattr_is_local(loc->xl_entry)); + BUG_ON(namevalue_size_xe(loc->xl_entry) != + (name_size + OCFS2_XATTR_ROOT_SIZE)); + + loc->xl_ops->xlo_fill_value_buf(loc, vb); + vb->vb_xv = + (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc, + nameval_offset + + name_size); +} + +static int ocfs2_xa_block_journal_access(handle_t *handle, + struct ocfs2_xa_loc *loc, int type) +{ + struct buffer_head *bh = loc->xl_storage; + ocfs2_journal_access_func access; + + if (loc->xl_size == (bh->b_size - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header))) + access = ocfs2_journal_access_xb; + else + access = ocfs2_journal_access_di; + return access(handle, INODE_CACHE(loc->xl_inode), bh, type); +} + +static void ocfs2_xa_block_journal_dirty(handle_t *handle, + struct ocfs2_xa_loc *loc) +{ + struct buffer_head *bh = loc->xl_storage; + + ocfs2_journal_dirty(handle, bh); +} + +static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc, + int offset) +{ + return (char *)loc->xl_header + offset; +} + +static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + /* + * Block storage is strict. If the sizes aren't exact, we will + * remove the old one and reinsert the new. + */ + return namevalue_size_xe(loc->xl_entry) == + namevalue_size_xi(xi); +} + +static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_header *xh = loc->xl_header; + int i, count = le16_to_cpu(xh->xh_count); + int offset, free_start = loc->xl_size; + + for (i = 0; i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset < free_start) + free_start = offset; + } + + return free_start; +} + +static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int count = le16_to_cpu(loc->xl_header->xh_count); + int free_start = ocfs2_xa_get_free_start(loc); + int needed_space = ocfs2_xi_entry_usage(xi); + + /* + * Block storage will reclaim the original entry before inserting + * the new value, so we only need the difference. If the new + * entry is smaller than the old one, we don't need anything. + */ + if (loc->xl_entry) { + /* Don't need space if we're reusing! */ + if (ocfs2_xa_can_reuse_entry(loc, xi)) + needed_space = 0; + else + needed_space -= ocfs2_xe_entry_usage(loc->xl_entry); + } + if (needed_space < 0) + needed_space = 0; + return ocfs2_xa_check_space_helper(needed_space, free_start, count); +} + +/* + * Block storage for xattrs keeps the name+value pairs compacted. When + * we remove one, we have to shift any that preceded it towards the end. + */ +static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + int i, offset; + int namevalue_offset, first_namevalue_offset, namevalue_size; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + struct ocfs2_xattr_header *xh = loc->xl_header; + int count = le16_to_cpu(xh->xh_count); + + namevalue_offset = le16_to_cpu(entry->xe_name_offset); + namevalue_size = namevalue_size_xe(entry); + first_namevalue_offset = ocfs2_xa_get_free_start(loc); + + /* Shift the name+value pairs */ + memmove((char *)xh + first_namevalue_offset + namevalue_size, + (char *)xh + first_namevalue_offset, + namevalue_offset - first_namevalue_offset); + memset((char *)xh + first_namevalue_offset, 0, namevalue_size); + + /* Now tell xh->xh_entries about it */ + for (i = 0; i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset <= namevalue_offset) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, + namevalue_size); + } + + /* + * Note that we don't update xh_free_start or xh_name_value_len + * because they're not used in block-stored xattrs. + */ +} + +static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + int count = le16_to_cpu(loc->xl_header->xh_count); + loc->xl_entry = &(loc->xl_header->xh_entries[count]); + le16_add_cpu(&loc->xl_header->xh_count, 1); + memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry)); +} + +static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size) +{ + int free_start = ocfs2_xa_get_free_start(loc); + + loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size); +} + +static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + struct buffer_head *bh = loc->xl_storage; + + if (loc->xl_size == (bh->b_size - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header))) + vb->vb_access = ocfs2_journal_access_xb; + else + vb->vb_access = ocfs2_journal_access_di; + vb->vb_bh = bh; +} + +/* + * Operations for xattrs stored in blocks. This includes inline inode + * storage and unindexed ocfs2_xattr_blocks. + */ +static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = { + .xlo_journal_access = ocfs2_xa_block_journal_access, + .xlo_journal_dirty = ocfs2_xa_block_journal_dirty, + .xlo_offset_pointer = ocfs2_xa_block_offset_pointer, + .xlo_check_space = ocfs2_xa_block_check_space, + .xlo_can_reuse = ocfs2_xa_block_can_reuse, + .xlo_get_free_start = ocfs2_xa_block_get_free_start, + .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue, + .xlo_add_entry = ocfs2_xa_block_add_entry, + .xlo_add_namevalue = ocfs2_xa_block_add_namevalue, + .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf, +}; + +static int ocfs2_xa_bucket_journal_access(handle_t *handle, + struct ocfs2_xa_loc *loc, int type) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + + return ocfs2_xattr_bucket_journal_access(handle, bucket, type); +} + +static void ocfs2_xa_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); +} + +static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, + int offset) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + int block, block_offset; + + /* The header is at the front of the bucket */ + block = offset >> loc->xl_inode->i_sb->s_blocksize_bits; + block_offset = offset % loc->xl_inode->i_sb->s_blocksize; + + return bucket_block(bucket, block) + block_offset; +} + +static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return namevalue_size_xe(loc->xl_entry) >= + namevalue_size_xi(xi); +} + +static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + return le16_to_cpu(bucket_xh(bucket)->xh_free_start); +} + +static int ocfs2_bucket_align_free_start(struct super_block *sb, + int free_start, int size) +{ + /* + * We need to make sure that the name+value pair fits within + * one block. + */ + if (((free_start - size) >> sb->s_blocksize_bits) != + ((free_start - 1) >> sb->s_blocksize_bits)) + free_start -= free_start % sb->s_blocksize; + + return free_start; +} + +static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int rc; + int count = le16_to_cpu(loc->xl_header->xh_count); + int free_start = ocfs2_xa_get_free_start(loc); + int needed_space = ocfs2_xi_entry_usage(xi); + int size = namevalue_size_xi(xi); + struct super_block *sb = loc->xl_inode->i_sb; + + /* + * Bucket storage does not reclaim name+value pairs it cannot + * reuse. They live as holes until the bucket fills, and then + * the bucket is defragmented. However, the bucket can reclaim + * the ocfs2_xattr_entry. + */ + if (loc->xl_entry) { + /* Don't need space if we're reusing! */ + if (ocfs2_xa_can_reuse_entry(loc, xi)) + needed_space = 0; + else + needed_space -= sizeof(struct ocfs2_xattr_entry); + } + BUG_ON(needed_space < 0); + + if (free_start < size) { + if (needed_space) + return -ENOSPC; + } else { + /* + * First we check if it would fit in the first place. + * Below, we align the free start to a block. This may + * slide us below the minimum gap. By checking unaligned + * first, we avoid that error. + */ + rc = ocfs2_xa_check_space_helper(needed_space, free_start, + count); + if (rc) + return rc; + free_start = ocfs2_bucket_align_free_start(sb, free_start, + size); + } + return ocfs2_xa_check_space_helper(needed_space, free_start, count); +} + +static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + le16_add_cpu(&loc->xl_header->xh_name_value_len, + -namevalue_size_xe(loc->xl_entry)); +} + +static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + struct ocfs2_xattr_header *xh = loc->xl_header; + int count = le16_to_cpu(xh->xh_count); + int low = 0, high = count - 1, tmp; + struct ocfs2_xattr_entry *tmp_xe; + + /* + * We keep buckets sorted by name_hash, so we need to find + * our insert place. + */ + while (low <= high && count) { + tmp = (low + high) / 2; + tmp_xe = &xh->xh_entries[tmp]; + + if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) + low = tmp + 1; + else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash)) + high = tmp - 1; + else { + low = tmp; + break; + } + } + + if (low != count) + memmove(&xh->xh_entries[low + 1], + &xh->xh_entries[low], + ((count - low) * sizeof(struct ocfs2_xattr_entry))); + + le16_add_cpu(&xh->xh_count, 1); + loc->xl_entry = &xh->xh_entries[low]; + memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry)); +} + +static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size) +{ + int free_start = ocfs2_xa_get_free_start(loc); + struct ocfs2_xattr_header *xh = loc->xl_header; + struct super_block *sb = loc->xl_inode->i_sb; + int nameval_offset; + + free_start = ocfs2_bucket_align_free_start(sb, free_start, size); + nameval_offset = free_start - size; + loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset); + xh->xh_free_start = cpu_to_le16(nameval_offset); + le16_add_cpu(&xh->xh_name_value_len, size); + +} + +static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + struct super_block *sb = loc->xl_inode->i_sb; + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int size = namevalue_size_xe(loc->xl_entry); + int block_offset = nameval_offset >> sb->s_blocksize_bits; + + /* Values are not allowed to straddle block boundaries */ + BUG_ON(block_offset != + ((nameval_offset + size - 1) >> sb->s_blocksize_bits)); + /* We expect the bucket to be filled in */ + BUG_ON(!bucket->bu_bhs[block_offset]); + + vb->vb_access = ocfs2_journal_access; + vb->vb_bh = bucket->bu_bhs[block_offset]; +} + +/* Operations for xattrs stored in buckets. */ +static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { + .xlo_journal_access = ocfs2_xa_bucket_journal_access, + .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty, + .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer, + .xlo_check_space = ocfs2_xa_bucket_check_space, + .xlo_can_reuse = ocfs2_xa_bucket_can_reuse, + .xlo_get_free_start = ocfs2_xa_bucket_get_free_start, + .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue, + .xlo_add_entry = ocfs2_xa_bucket_add_entry, + .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue, + .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf, +}; + +static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_value_buf vb; + + if (ocfs2_xattr_is_local(loc->xl_entry)) + return 0; + + ocfs2_xa_fill_value_buf(loc, &vb); + return le32_to_cpu(vb.vb_xv->xr_clusters); +} + +static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int trunc_rc, access_rc; + struct ocfs2_xattr_value_buf vb; + + ocfs2_xa_fill_value_buf(loc, &vb); + trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes, + ctxt); + + /* + * The caller of ocfs2_xa_value_truncate() has already called + * ocfs2_xa_journal_access on the loc. However, The truncate code + * calls ocfs2_extend_trans(). This may commit the previous + * transaction and open a new one. If this is a bucket, truncate + * could leave only vb->vb_bh set up for journaling. Meanwhile, + * the caller is expecting to dirty the entire bucket. So we must + * reset the journal work. We do this even if truncate has failed, + * as it could have failed after committing the extend. + */ + access_rc = ocfs2_xa_journal_access(ctxt->handle, loc, + OCFS2_JOURNAL_ACCESS_WRITE); + + /* Errors in truncate take precedence */ + return trunc_rc ? trunc_rc : access_rc; +} + +static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) +{ + int index, count; + struct ocfs2_xattr_header *xh = loc->xl_header; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + + ocfs2_xa_wipe_namevalue(loc); + loc->xl_entry = NULL; + + le16_add_cpu(&xh->xh_count, -1); + count = le16_to_cpu(xh->xh_count); + + /* + * Only zero out the entry if there are more remaining. This is + * important for an empty bucket, as it keeps track of the + * bucket's hash value. It doesn't hurt empty block storage. + */ + if (count) { + index = ((char *)entry - (char *)&xh->xh_entries) / + sizeof(struct ocfs2_xattr_entry); + memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1], + (count - index) * sizeof(struct ocfs2_xattr_entry)); + memset(&xh->xh_entries[count], 0, + sizeof(struct ocfs2_xattr_entry)); + } +} + +/* + * If we have a problem adjusting the size of an external value during + * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr + * in an intermediate state. For example, the value may be partially + * truncated. + * + * If the value tree hasn't changed, the extend/truncate went nowhere. + * We have nothing to do. The caller can treat it as a straight error. + * + * If the value tree got partially truncated, we now have a corrupted + * extended attribute. We're going to wipe its entry and leak the + * clusters. Better to leak some storage than leave a corrupt entry. + * + * If the value tree grew, it obviously didn't grow enough for the + * new entry. We're not going to try and reclaim those clusters either. + * If there was already an external value there (orig_clusters != 0), + * the new clusters are attached safely and we can just leave the old + * value in place. If there was no external value there, we remove + * the entry. + * + * This way, the xattr block we store in the journal will be consistent. + * If the size change broke because of the journal, no changes will hit + * disk anyway. + */ +static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc, + const char *what, + unsigned int orig_clusters) +{ + unsigned int new_clusters = ocfs2_xa_value_clusters(loc); + char *nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + + if (new_clusters < orig_clusters) { + mlog(ML_ERROR, + "Partial truncate while %s xattr %.*s. Leaking " + "%u clusters and removing the entry\n", + what, loc->xl_entry->xe_name_len, nameval_buf, + orig_clusters - new_clusters); + ocfs2_xa_remove_entry(loc); + } else if (!orig_clusters) { + mlog(ML_ERROR, + "Unable to allocate an external value for xattr " + "%.*s safely. Leaking %u clusters and removing the " + "entry\n", + loc->xl_entry->xe_name_len, nameval_buf, + new_clusters - orig_clusters); + ocfs2_xa_remove_entry(loc); + } else if (new_clusters > orig_clusters) + mlog(ML_ERROR, + "Unable to grow xattr %.*s safely. %u new clusters " + "have been added, but the value will not be " + "modified\n", + loc->xl_entry->xe_name_len, nameval_buf, + new_clusters - orig_clusters); +} + +static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + unsigned int orig_clusters; + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + /* + * Since this is remove, we can return 0 if + * ocfs2_xa_cleanup_value_truncate() is going to + * wipe the entry anyway. So we check the + * cluster count as well. + */ + if (orig_clusters != ocfs2_xa_value_clusters(loc)) + rc = 0; + ocfs2_xa_cleanup_value_truncate(loc, "removing", + orig_clusters); + if (rc) + goto out; + } + } + + ocfs2_xa_remove_entry(loc); + +out: + return rc; +} + +static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc) +{ + int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); + char *nameval_buf; + + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE); +} + +/* + * Take an existing entry and make it ready for the new value. This + * won't allocate space, but it may free space. It should be ready for + * ocfs2_xa_prepare_entry() to finish the work. + */ +static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + unsigned int orig_clusters; + char *nameval_buf; + int xe_local = ocfs2_xattr_is_local(loc->xl_entry); + int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE; + + BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) != + name_size); + + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + if (xe_local) { + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - name_size); + if (!xi_local) + ocfs2_xa_install_value_root(loc); + } else { + orig_clusters = ocfs2_xa_value_clusters(loc); + if (xi_local) { + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc < 0) + mlog_errno(rc); + else + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - + name_size); + } else if (le64_to_cpu(loc->xl_entry->xe_value_size) > + xi->xi_value_len) { + rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, + ctxt); + if (rc < 0) + mlog_errno(rc); + } + + if (rc) { + ocfs2_xa_cleanup_value_truncate(loc, "reusing", + orig_clusters); + goto out; + } + } + + loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); + ocfs2_xattr_set_local(loc->xl_entry, xi_local); + +out: + return rc; +} + +/* + * Prepares loc->xl_entry to receive the new xattr. This includes + * properly setting up the name+value pair region. If loc->xl_entry + * already exists, it will take care of modifying it appropriately. + * + * Note that this modifies the data. You did journal_access already, + * right? + */ +static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + u32 name_hash, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + unsigned int orig_clusters; + __le64 orig_value_size = 0; + + rc = ocfs2_xa_check_space(loc, xi); + if (rc) + goto out; + + if (loc->xl_entry) { + if (ocfs2_xa_can_reuse_entry(loc, xi)) { + orig_value_size = loc->xl_entry->xe_value_size; + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); + if (rc) + goto out; + goto alloc_value; + } + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + ocfs2_xa_cleanup_value_truncate(loc, + "overwriting", + orig_clusters); + goto out; + } + } + ocfs2_xa_wipe_namevalue(loc); + } else + ocfs2_xa_add_entry(loc, name_hash); + + /* + * If we get here, we have a blank entry. Fill it. We grow our + * name+value pair back from the end. + */ + ocfs2_xa_add_namevalue(loc, xi); + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) + ocfs2_xa_install_value_root(loc); + +alloc_value: + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); + if (rc < 0) { + ctxt->set_abort = 1; + ocfs2_xa_cleanup_value_truncate(loc, "growing", + orig_clusters); + /* + * If we were growing an existing value, + * ocfs2_xa_cleanup_value_truncate() won't remove + * the entry. We need to restore the original value + * size. + */ + if (loc->xl_entry) { + BUG_ON(!orig_value_size); + loc->xl_entry->xe_value_size = orig_value_size; + } + mlog_errno(rc); + } + } + +out: + return rc; +} + +/* + * Store the value portion of the name+value pair. This will skip + * values that are stored externally. Their tree roots were set up + * by ocfs2_xa_prepare_entry(). + */ +static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + char *nameval_buf; + struct ocfs2_xattr_value_buf vb; + + nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + ocfs2_xa_fill_value_buf(loc, &vb); + rc = __ocfs2_xattr_set_value_outside(loc->xl_inode, + ctxt->handle, &vb, + xi->xi_value, + xi->xi_value_len); + } else + memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len); + + return rc; +} + +static int ocfs2_xa_set(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name, + xi->xi_name_len); + + ret = ocfs2_xa_journal_access(ctxt->handle, loc, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * From here on out, everything is going to modify the buffer a + * little. Errors are going to leave the xattr header in a + * sane state. Thus, even with errors we dirty the sucker. + */ + + /* Don't worry, we are never called with !xi_value and !xl_entry */ + if (!xi->xi_value) { + ret = ocfs2_xa_remove(loc, ctxt); + goto out_dirty; + } + + ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out_dirty; + } + + ret = ocfs2_xa_store_value(loc, xi, ctxt); + if (ret) + mlog_errno(ret); + +out_dirty: + ocfs2_xa_journal_dirty(ctxt->handle, loc); + +out: + return ret; +} + +static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_entry *entry) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL)); + + loc->xl_inode = inode; + loc->xl_ops = &ocfs2_xa_block_loc_ops; + loc->xl_storage = bh; + loc->xl_entry = entry; + loc->xl_size = le16_to_cpu(di->i_xattr_inline_size); + loc->xl_header = + (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size - + loc->xl_size); +} + +static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_entry *entry) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED); + + loc->xl_inode = inode; + loc->xl_ops = &ocfs2_xa_block_loc_ops; + loc->xl_storage = bh; + loc->xl_header = &(xb->xb_attrs.xb_header); + loc->xl_entry = entry; + loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header); +} + +static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_bucket *bucket, + struct ocfs2_xattr_entry *entry) +{ + loc->xl_inode = bucket->bu_inode; + loc->xl_ops = &ocfs2_xa_bucket_loc_ops; + loc->xl_storage = bucket; + loc->xl_header = bucket_xh(bucket); + loc->xl_entry = entry; + loc->xl_size = OCFS2_XATTR_BUCKET_SIZE; +} + +/* + * In xattr remove, if it is stored outside and refcounted, we may have + * the chance to split the refcount tree. So need the allocators. + */ +static int ocfs2_lock_xattr_remove_allocators(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + int *ref_credits) +{ + int ret, meta_add = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + *ref_credits = 0; + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci, + ref_root_bh, xv, + &meta_add, ref_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_remove_value_outside(struct inode*inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + int ret = 0, i, ref_credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + void *val; + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(entry)) + continue; + + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + vb->vb_xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + + ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv, + ref_ci, ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, ref_credits + + ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } + + if (ret < 0) { + mlog_errno(ret); + break; + } + + } + + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + return ret; +} + +static int ocfs2_xattr_ibody_remove(struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_xattr_header *header; + int ret; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = di_bh, + .vb_access = ocfs2_journal_access_di, + }; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); + + return ret; +} + +struct ocfs2_rm_xattr_bucket_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; +}; + +static int ocfs2_xattr_block_remove(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + struct ocfs2_xattr_block *xb; + int ret = 0; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + struct ocfs2_rm_xattr_bucket_para args = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + }; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); + } else + ret = ocfs2_iterate_xattr_index_block(inode, + blk_bh, + ocfs2_rm_xattr_cluster, + &args); + + return ret; +} + +static int ocfs2_xattr_free_block(struct inode *inode, + u64 block, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + struct inode *xb_alloc_inode; + struct buffer_head *xb_alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + int ret = 0; + u64 blk, bg_blkno; + u16 bit; + + ret = ocfs2_read_xattr_block(inode, block, &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + blk = le64_to_cpu(xb->xb_blkno); + bit = le16_to_cpu(xb->xb_suballoc_bit); + if (xb->xb_suballoc_loc) + bg_blkno = le64_to_cpu(xb->xb_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + xb_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(xb->xb_suballoc_slot)); + if (!xb_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&xb_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh, + bit, bg_blkno, 1); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out_unlock: + ocfs2_inode_unlock(xb_alloc_inode, 1); + brelse(xb_alloc_bh); +out_mutex: + mutex_unlock(&xb_alloc_inode->i_mutex); + iput(xb_alloc_inode); +out: + brelse(blk_bh); + return ret; +} + +/* + * ocfs2_xattr_remove() + * + * Free extended attribute resources associated with this inode. + */ +int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_refcount_tree *ref_tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_caching_info *ref_ci = NULL; + handle_t *handle; + int ret; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + ref_ci = &ref_tree->rf_ci; + + } + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_ibody_remove(inode, di_bh, + ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (di->i_xattr_loc) { + ret = ocfs2_xattr_free_block(inode, + le64_to_cpu(di->i_xattr_loc), + ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + di->i_xattr_loc = 0; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 0); + + ocfs2_journal_dirty(handle, di_bh); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + if (ref_tree) + ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1); + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_xattr_has_space_inline(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; + int free; + + if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size); + } else if (ocfs2_inode_is_fast_symlink(inode)) { + free = ocfs2_fast_symlink_chars(inode->i_sb) - + le64_to_cpu(di->i_size); + } else { + struct ocfs2_extent_list *el = &di->id2.i_list; + free = (le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec)) * + sizeof(struct ocfs2_extent_rec); + } + if (free >= xattrsize) + return 1; + + return 0; +} + +/* + * ocfs2_xattr_ibody_find() + * + * Find extended attribute in inode block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_ibody_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + int has_space = 0; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + down_read(&oi->ip_alloc_sem); + has_space = ocfs2_xattr_has_space_inline(inode, di); + up_read(&oi->ip_alloc_sem); + if (!has_space) + return 0; + } + + xs->xattr_bh = xs->inode_bh; + xs->end = (void *)di + inode->i_sb->s_blocksize; + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + else + xs->header = (struct ocfs2_xattr_header *) + (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + /* Find the named attribute. */ + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret && ret != -ENODATA) + return ret; + xs->not_found = ret; + } + + return 0; +} + +static int ocfs2_xattr_ibody_init(struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int xattrsize = osb->s_xattr_inline_size; + + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Adjust extent record count or inline data size + * to reserve space for extended attribute. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + le16_add_cpu(&idata->id_count, -xattrsize); + } else if (!(ocfs2_inode_is_fast_symlink(inode))) { + struct ocfs2_extent_list *el = &di->id2.i_list; + le16_add_cpu(&el->l_count, -(xattrsize / + sizeof(struct ocfs2_extent_rec))); + } + di->i_xattr_inline_size = cpu_to_le16(xattrsize); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_journal_dirty(ctxt->handle, di_bh); + +out: + return ret; +} + +/* + * ocfs2_xattr_ibody_set() + * + * Set, replace or remove an extended attribute into inode block. + * + */ +static int ocfs2_xattr_ibody_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_xa_loc loc; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return -ENOSPC; + + down_write(&oi->ip_alloc_sem); + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + xs->here = loc.xl_entry; + +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +/* + * ocfs2_xattr_block_find() + * + * Find extended attribute in external block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret && ret != -ENODATA) { + xs->xattr_bh = NULL; + goto cleanup; + } + xs->not_found = ret; + return 0; +cleanup: + brelse(blk_bh); + + return ret; +} + +static int ocfs2_create_xattr_block(struct inode *inode, + struct buffer_head *inode_bh, + struct ocfs2_xattr_set_ctxt *ctxt, + int indexed, + struct buffer_head **ret_bh) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk; + + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), + inode_bh, OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1, + &suballoc_loc, &suballoc_bit_start, + &num_got, &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + if (!new_bh) { + ret = -ENOMEM; + mlog_errno(ret); + goto end; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode), + new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + /* Initialize ocfs2_xattr_block */ + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); + xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = + cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + if (indexed) { + struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16( + ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); + } + ocfs2_journal_dirty(ctxt->handle, new_bh); + + /* Add it to the inode */ + di->i_xattr_loc = cpu_to_le64(first_blkno); + + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ocfs2_journal_dirty(ctxt->handle, inode_bh); + + *ret_bh = new_bh; + new_bh = NULL; + +end: + brelse(new_bh); + return ret; +} + +/* + * ocfs2_xattr_block_set() + * + * Set, replace or remove an extended attribute into external block. + * + */ +static int ocfs2_xattr_block_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk = NULL; + int ret; + struct ocfs2_xa_loc loc; + + if (!xs->xattr_bh) { + ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt, + 0, &new_bh); + if (ret) { + mlog_errno(ret); + goto end; + } + + xs->xattr_bh = new_bh; + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + xs->header = &xblk->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)xblk + inode->i_sb->s_blocksize; + xs->here = xs->header->xh_entries; + } else + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + + if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { + ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh, + xs->not_found ? NULL : xs->here); + + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) + xs->here = loc.xl_entry; + else if ((ret != -ENOSPC) || ctxt->set_abort) + goto end; + else { + ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); + if (ret) + goto end; + } + } + + if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED) + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); + +end: + return ret; +} + +/* Check whether the new xattr can be inserted into the inode. */ +static int ocfs2_xattr_can_be_in_inode(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *last; + int free, i; + size_t min_offs = xs->end - xs->base; + + if (!xs->header) + return 0; + + last = xs->header->xh_entries; + + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; + if (free < 0) + return 0; + + BUG_ON(!xs->not_found); + + if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi))) + return 1; + + return 0; +} + +static int ocfs2_calc_xattr_set_need(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + int *clusters_need, + int *meta_need, + int *credits_need) +{ + int ret = 0, old_in_xb = 0; + int clusters_add = 0, meta_add = 0, credits = 0; + struct buffer_head *bh = NULL; + struct ocfs2_xattr_block *xb = NULL; + struct ocfs2_xattr_entry *xe = NULL; + struct ocfs2_xattr_value_root *xv = NULL; + char *base = NULL; + int name_offset, name_len = 0; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + xi->xi_value_len); + u64 value_size; + + /* + * Calculate the clusters we need to write. + * No matter whether we replace an old one or add a new one, + * we need this for writing. + */ + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) + credits += new_clusters * + ocfs2_clusters_to_blocks(inode->i_sb, 1); + + if (xis->not_found && xbs->not_found) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + clusters_add += new_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &def_xv.xv.xr_list); + } + + goto meta_guess; + } + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + credits += OCFS2_INODE_UPDATE_CREDITS; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + old_in_xb = 1; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + base = bucket_block(xbs->bucket, block_off); + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } else { + base = xbs->base; + credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS; + } + } + + /* + * delete a xattr doesn't need metadata and cluster allocation. + * so just calculate the credits and return. + * + * The credits for removing the value tree will be extended + * by ocfs2_remove_extent itself. + */ + if (!xi->xi_value) { + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_remove_extent_credits(inode->i_sb); + + goto out; + } + + /* do cluster allocation guess first. */ + value_size = le64_to_cpu(xe->xe_value_size); + + if (old_in_xb) { + /* + * In xattr set, we always try to set the xe in inode first, + * so if it can be inserted into inode successfully, the old + * one will be removed from the xattr block, and this xattr + * will be inserted into inode as a new xattr in inode. + */ + if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) { + clusters_add += new_clusters; + credits += ocfs2_remove_extent_credits(inode->i_sb) + + OCFS2_INODE_UPDATE_CREDITS; + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_calc_extend_credits( + inode->i_sb, + &def_xv.xv.xr_list); + goto out; + } + } + + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + /* the new values will be stored outside. */ + u32 old_clusters = 0; + + if (!ocfs2_xattr_is_local(xe)) { + old_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + value_size); + xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + value_size = OCFS2_XATTR_ROOT_SIZE; + } else + xv = &def_xv.xv; + + if (old_clusters >= new_clusters) { + credits += ocfs2_remove_extent_credits(inode->i_sb); + goto out; + } else { + meta_add += ocfs2_extend_meta_needed(&xv->xr_list); + clusters_add += new_clusters - old_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &xv->xr_list); + if (value_size >= OCFS2_XATTR_ROOT_SIZE) + goto out; + } + } else { + /* + * Now the new value will be stored inside. So if the new + * value is smaller than the size of value root or the old + * value, we don't need any allocation, otherwise we have + * to guess metadata allocation. + */ + if ((ocfs2_xattr_is_local(xe) && + (value_size >= xi->xi_value_len)) || + (!ocfs2_xattr_is_local(xe) && + OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len)) + goto out; + } + +meta_guess: + /* calculate metadata allocation. */ + if (di->i_xattr_loc) { + if (!xbs->xattr_bh) { + ret = ocfs2_read_xattr_block(inode, + le64_to_cpu(di->i_xattr_loc), + &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)bh->b_data; + } else + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + + /* + * If there is already an xattr tree, good, we can calculate + * like other b-trees. Otherwise we may have the chance of + * create a tree, the credit calculation is borrowed from + * ocfs2_calc_extend_credits with root_el = NULL. And the + * new tree will be cluster based, so no meta is needed. + */ + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + struct ocfs2_extent_list *el = + &xb->xb_attrs.xb_root.xt_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el); + } else + credits += OCFS2_SUBALLOC_ALLOC + 1; + + /* + * This cluster will be used either for new bucket or for + * new xattr block. + * If the cluster size is the same as the bucket size, one + * more is needed since we may need to extend the bucket + * also. + */ + clusters_add += 1; + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + if (OCFS2_XATTR_BUCKET_SIZE == + OCFS2_SB(inode->i_sb)->s_clustersize) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + clusters_add += 1; + } + } else { + credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + struct ocfs2_extent_list *el = &def_xv.xv.xr_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el); + } else { + meta_add += 1; + } + } +out: + if (clusters_need) + *clusters_need = clusters_add; + if (meta_need) + *meta_need = meta_add; + if (credits_need) + *credits_need = credits; + brelse(bh); + return ret; +} + +static int ocfs2_init_xattr_set_ctxt(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt, + int extra_meta, + int *credits) +{ + int clusters_add, meta_add, ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt)); + + ocfs2_init_dealloc_ctxt(&ctxt->dealloc); + + ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs, + &clusters_add, &meta_add, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + meta_add += extra_meta; + trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add, + clusters_add, *credits); + + if (meta_add) { + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, + &ctxt->meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (clusters_add) { + ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (ctxt->meta_ac) { + ocfs2_free_alloc_context(ctxt->meta_ac); + ctxt->meta_ac = NULL; + } + + /* + * We cannot have an error and a non null ctxt->data_ac. + */ + } + + return ret; +} + +static int __ocfs2_xattr_set_handle(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret = 0, credits, old_found; + + if (!xi->xi_value) { + /* Remove existing extended attribute */ + if (!xis->not_found) + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + else if (!xbs->not_found) + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + if (!ret && !xbs->not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. + */ + xi->xi_value = NULL; + xi->xi_value_len = 0; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else if ((ret == -ENOSPC) && !ctxt->set_abort) { + if (di->i_xattr_loc && !xbs->xattr_bh) { + ret = ocfs2_xattr_block_find(inode, + xi->xi_name_index, + xi->xi_name, xbs); + if (ret) + goto out; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + if (ret) + goto out; + if (!xis->not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi->xi_value = NULL; + xi->xi_value_len = 0; + xbs->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_ibody_set(inode, xi, + xis, ctxt); + } + } + } + + if (!ret) { + /* Update inode ctime. */ + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), + xis->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); + } +out: + return ret; +} + +/* + * This function only called duing creating inode + * for init security/acl xattrs of the new inode. + * All transanction credits have been reserved in mknod. + */ +int ocfs2_xattr_set_handle(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_dinode *di; + int ret; + + struct ocfs2_xattr_info xi = { + .xi_name_index = name_index, + .xi_name = name, + .xi_name_len = strlen(name), + .xi_value = value, + .xi_value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_set_ctxt ctxt = { + .handle = handle, + .meta_ac = meta_ac, + .data_ac = data_ac, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + /* + * In extreme situation, may need xattr bucket when + * block size is too small. And we have already reserved + * the credits for bucket in mknod. + */ + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) { + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + } + + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + brelse(xbs.xattr_bh); + ocfs2_xattr_bucket_free(xbs.bucket); + + return ret; +} + +/* + * ocfs2_xattr_set() + * + * Set, replace or remove an extended attribute for this inode. + * value is NULL to remove an existing extended attribute, else either + * create or replace an extended attribute. + */ +int ocfs2_xattr_set(struct inode *inode, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags) +{ + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int ret, credits, ref_meta = 0, ref_credits = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, }; + struct ocfs2_refcount_tree *ref_tree = NULL; + + struct ocfs2_xattr_info xi = { + .xi_name_index = name_index, + .xi_name = name, + .xi_name_len = strlen(name), + .xi_value = value, + .xi_value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + /* + * Only xbs will be used on indexed trees. xis doesn't need a + * bucket. + */ + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto cleanup_nolock; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + /* + * Scan inode and external block to find the same name + * extended attribute and collect search information. + */ + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + if (xis.not_found && xbs.not_found) { + ret = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + ret = 0; + if (!value) + goto cleanup; + } else { + ret = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + + /* Check whether the value is refcounted and do some preparation. */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + (!xis.not_found || !xbs.not_found)) { + ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, + &xis, &xbs, &ref_tree, + &ref_meta, &ref_credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mutex_unlock(&tl_inode->i_mutex); + mlog_errno(ret); + goto cleanup; + } + } + mutex_unlock(&tl_inode->i_mutex); + + ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, + &xbs, &ctxt, ref_meta, &credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + + /* we need to update inode's ctime field, so add credit for it. */ + credits += OCFS2_INODE_UPDATE_CREDITS; + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto out_free_ac; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0); + + ocfs2_commit_trans(osb, ctxt.handle); + +out_free_ac: + if (ctxt.data_ac) + ocfs2_free_alloc_context(ctxt.data_ac); + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + +cleanup: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + up_write(&OCFS2_I(inode)->ip_xattr_sem); + if (!value && !ret) { + ret = ocfs2_try_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); + } + ocfs2_inode_unlock(inode, 1); +cleanup_nolock: + brelse(di_bh); + brelse(xbs.xattr_bh); + ocfs2_xattr_bucket_free(xbs.bucket); + + return ret; +} + +/* + * Find the xattr extent rec which may contains name_hash. + * e_cpos will be the first name hash of the xattr rec. + * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list. + */ +static int ocfs2_xattr_get_rec(struct inode *inode, + u32 name_hash, + u64 *p_blkno, + u32 *e_cpos, + u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + u64 e_blkno = 0; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= name_hash) { + e_blkno = le64_to_cpu(rec->e_blkno); + break; + } + } + + if (!e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *p_blkno = le64_to_cpu(rec->e_blkno); + *num_clusters = le16_to_cpu(rec->e_leaf_clusters); + if (e_cpos) + *e_cpos = le32_to_cpu(rec->e_cpos); +out: + brelse(eb_bh); + return ret; +} + +typedef int (xattr_bucket_func)(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); + +static int ocfs2_find_xe_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int name_index, + const char *name, + u32 name_hash, + u16 *xe_index, + int *found) +{ + int i, ret = 0, cmp = 1, block_off, new_offset; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + size_t name_len = strlen(name); + struct ocfs2_xattr_entry *xe = NULL; + char *xe_name; + + /* + * We don't use binary search in the bucket because there + * may be multiple entries with the same name hash. + */ + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) + continue; + else if (name_hash < le32_to_cpu(xe->xe_name_hash)) + break; + + cmp = name_index - ocfs2_xattr_get_type(xe); + if (!cmp) + cmp = name_len - xe->xe_name_len; + if (cmp) + continue; + + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + xh, + i, + &block_off, + &new_offset); + if (ret) { + mlog_errno(ret); + break; + } + + + xe_name = bucket_block(bucket, block_off) + new_offset; + if (!memcmp(name, xe_name, name_len)) { + *xe_index = i; + *found = 1; + ret = 0; + break; + } + } + + return ret; +} + +/* + * Find the specified xattr entry in a series of buckets. + * This series start from p_blkno and last for num_clusters. + * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains + * the num of the valid buckets. + * + * Return the buffer_head this xattr should reside in. And if the xattr's + * hash is in the gap of 2 buckets, return the lower bucket. + */ +static int ocfs2_xattr_bucket_find(struct inode *inode, + int name_index, + const char *name, + u32 name_hash, + u64 p_blkno, + u32 first_hash, + u32 num_clusters, + struct ocfs2_xattr_search *xs) +{ + int ret, found = 0; + struct ocfs2_xattr_header *xh = NULL; + struct ocfs2_xattr_entry *xe = NULL; + u16 index = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int low_bucket = 0, bucket, high_bucket; + struct ocfs2_xattr_bucket *search; + u32 last_hash; + u64 blkno, lower_blkno = 0; + + search = ocfs2_xattr_bucket_new(inode); + if (!search) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(search, p_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(search); + high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; + while (low_bucket <= high_bucket) { + ocfs2_xattr_bucket_relse(search); + + bucket = (low_bucket + high_bucket) / 2; + blkno = p_blkno + bucket * blk_per_bucket; + ret = ocfs2_read_xattr_bucket(search, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(search); + xe = &xh->xh_entries[0]; + if (name_hash < le32_to_cpu(xe->xe_name_hash)) { + high_bucket = bucket - 1; + continue; + } + + /* + * Check whether the hash of the last entry in our + * bucket is larger than the search one. for an empty + * bucket, the last one is also the first one. + */ + if (xh->xh_count) + xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1]; + + last_hash = le32_to_cpu(xe->xe_name_hash); + + /* record lower_blkno which may be the insert place. */ + lower_blkno = blkno; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) { + low_bucket = bucket + 1; + continue; + } + + /* the searched xattr should reside in this bucket if exists. */ + ret = ocfs2_find_xe_in_bucket(inode, search, + name_index, name, name_hash, + &index, &found); + if (ret) { + mlog_errno(ret); + goto out; + } + break; + } + + /* + * Record the bucket we have found. + * When the xattr's hash value is in the gap of 2 buckets, we will + * always set it to the previous bucket. + */ + if (!lower_blkno) + lower_blkno = p_blkno; + + /* This should be in cache - we just read it during the search */ + ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (found) { + xs->here = &xs->header->xh_entries[index]; + trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno, + name, name_index, name_hash, + (unsigned long long)bucket_blkno(xs->bucket), + index); + } else + ret = -ENODATA; + +out: + ocfs2_xattr_bucket_free(search); + return ret; +} + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + int ret; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u64 p_blkno = 0; + u32 first_hash, num_clusters = 0; + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return -ENODATA; + + trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno, + name, name_index, name_hash, + (unsigned long long)root_bh->b_blocknr, + -1); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); + + trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno, + name, name_index, first_hash, + (unsigned long long)p_blkno, + num_clusters); + + ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, + p_blkno, first_hash, num_clusters, xs); + +out: + return ret; +} + +static int ocfs2_iterate_xattr_buckets(struct inode *inode, + u64 blkno, + u32 clusters, + xattr_bucket_func *func, + void *para) +{ + int i, ret = 0; + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + u32 num_buckets = clusters * bpc; + struct ocfs2_xattr_bucket *bucket; + + bucket = ocfs2_xattr_bucket_new(inode); + if (!bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + trace_ocfs2_iterate_xattr_buckets( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, clusters); + + for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { + ret = ocfs2_read_xattr_bucket(bucket, blkno); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. + */ + if (i == 0) + num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); + + trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno, + le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); + if (func) { + ret = func(inode, bucket, para); + if (ret && ret != -ERANGE) + mlog_errno(ret); + /* Fall through to bucket_relse() */ + } + + ocfs2_xattr_bucket_relse(bucket); + if (ret) + break; + } + + ocfs2_xattr_bucket_free(bucket); + return ret; +} + +struct ocfs2_xattr_tree_list { + char *buffer; + size_t buffer_size; + size_t result; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset) +{ + u16 name_offset; + + if (index < 0 || index >= le16_to_cpu(xh->xh_count)) + return -EINVAL; + + name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); + + *block_off = name_offset >> sb->s_blocksize_bits; + *new_offset = name_offset % sb->s_blocksize; + + return 0; +} + +static int ocfs2_list_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, type; + struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; + int i, block_off, new_offset; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(bucket), + i, + &block_off, + &new_offset); + if (ret) + break; + + name = (const char *)bucket_block(bucket, block_off) + + new_offset; + ret = ocfs2_xattr_list_entry(xl->buffer, + xl->buffer_size, + &xl->result, + prefix, name, + entry->xe_name_len); + if (ret) + break; + } + } + + return ret; +} + +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *blk_bh, + xattr_tree_rec_func *rec_func, + void *para) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; + u64 p_blkno = 0; + + if (!el->l_next_free_rec || !rec_func) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + break; + } + + ret = rec_func(inode, blk_bh, p_blkno, e_cpos, + num_clusters, para); + if (ret) { + if (ret != -ERANGE) + mlog_errno(ret); + break; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + + return ret; + +} + +static int ocfs2_list_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_list_xattr_bucket, para); +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_list_xattr_tree_rec, &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = xl.result; +out: + return ret; +} + +static int cmp_xe(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_hash = le32_to_cpu(l->xe_name_hash); + u32 r_hash = le32_to_cpu(r->xe_name_hash); + + if (l_hash > r_hash) + return 1; + if (l_hash < r_hash) + return -1; + return 0; +} + +static void swap_xe(void *a, void *b, int size) +{ + struct ocfs2_xattr_entry *l = a, *r = b, tmp; + + tmp = *l; + memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); + memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); +} + +/* + * When the ocfs2_xattr_block is filled up, new bucket will be created + * and all the xattr entries will be moved to the new bucket. + * The header goes at the start of the bucket, and the names+values are + * filled from the end. This is why *target starts as the last buffer. + * Note: we need to sort the entries since they are not saved in order + * in the ocfs2_xattr_block. + */ +static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct ocfs2_xattr_bucket *bucket) +{ + int i, blocksize = inode->i_sb->s_blocksize; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u16 offset, size, off_change; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u16 count = le16_to_cpu(xb_xh->xh_count); + char *src = xb_bh->b_data; + char *target = bucket_block(bucket, blks - 1); + + trace_ocfs2_cp_xattr_block_to_bucket_begin( + (unsigned long long)xb_bh->b_blocknr, + (unsigned long long)bucket_blkno(bucket)); + + for (i = 0; i < blks; i++) + memset(bucket_block(bucket, i), 0, blocksize); + + /* + * Since the xe_name_offset is based on ocfs2_xattr_header, + * there is a offset change corresponding to the change of + * ocfs2_xattr_header's position. + */ + off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + xe = &xb_xh->xh_entries[count - 1]; + offset = le16_to_cpu(xe->xe_name_offset) + off_change; + size = blocksize - offset; + + /* copy all the names and values. */ + memcpy(target + offset, src + offset, size); + + /* Init new header now. */ + xh->xh_count = xb_xh->xh_count; + xh->xh_num_buckets = cpu_to_le16(1); + xh->xh_name_value_len = cpu_to_le16(size); + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); + + /* copy all the entries. */ + target = bucket_block(bucket, 0); + offset = offsetof(struct ocfs2_xattr_header, xh_entries); + size = count * sizeof(struct ocfs2_xattr_entry); + memcpy(target + offset, (char *)xb_xh + offset, size); + + /* Change the xe offset for all the xe because of the move. */ + off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize + + offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + for (i = 0; i < count; i++) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); + + trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change); + + sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); +} + +/* + * After we move xattr from block to index btree, we have to + * update ocfs2_xattr_search to the new xe and base. + * + * When the entry is in xattr block, xattr_bh indicates the storage place. + * While if the entry is in index b-tree, "bucket" indicates the + * real place of the xattr. + */ +static void ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh) +{ + char *buf = old_bh->b_data; + struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; + struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; + int i; + + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (xs->not_found) + return; + + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; +} + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 bit_off, len; + u64 blkno; + handle_t *handle = ctxt->handle; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *xb_bh = xs->xattr_bh; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xr; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + + trace_ocfs2_xattr_create_index_block_begin( + (unsigned long long)xb_bh->b_blocknr); + + BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); + BUG_ON(!xs->bucket); + + /* + * XXX: + * We can use this lock for now, and maybe move to a dedicated mutex + * if performance becomes a problem later. + */ + down_write(&oi->ip_alloc_sem); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, + 1, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The bucket may spread in many blocks, and + * we will only touch the 1st block and the last block + * in the whole bucket(one for entry and one for data). + */ + blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); + + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); + + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh); + + /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xr = &xb->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + + xr->xt_list.l_recs[0].e_cpos = 0; + xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + + xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); + + ocfs2_journal_dirty(handle, xb_bh); + +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +static int cmp_xe_offset(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_name_offset = le16_to_cpu(l->xe_name_offset); + u32 r_name_offset = le16_to_cpu(r->xe_name_offset); + + if (l_name_offset < r_name_offset) + return 1; + if (l_name_offset > r_name_offset) + return -1; + return 0; +} + +/* + * defrag a xattr bucket if we find that the bucket has some + * holes beteen name/value pairs. + * We will move all the name/value pairs to the end of the bucket + * so that we can spare some space for insertion. + */ +static int ocfs2_defrag_xattr_bucket(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *bucket) +{ + int ret, i; + size_t end, offset, len; + struct ocfs2_xattr_header *xh; + char *entries, *buf, *bucket_buf = NULL; + u64 blkno = bucket_blkno(bucket); + u16 xh_free_start; + size_t blocksize = inode->i_sb->s_blocksize; + struct ocfs2_xattr_entry *xe; + + /* + * In order to make the operation more efficient and generic, + * we copy all the blocks into a contiguous memory and do the + * defragment there, so if anything is error, we will not touch + * the real block. + */ + bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS); + if (!bucket_buf) { + ret = -EIO; + goto out; + } + + buf = bucket_buf; + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(buf, bucket_block(bucket, i), blocksize); + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bucket_buf; + entries = (char *)xh->xh_entries; + xh_free_start = le16_to_cpu(xh->xh_free_start); + + trace_ocfs2_defrag_xattr_bucket( + (unsigned long long)blkno, le16_to_cpu(xh->xh_count), + xh_free_start, le16_to_cpu(xh->xh_name_value_len)); + + /* + * sort all the entries by their offset. + * the largest will be the first, so that we can + * move them to the end one by one. + */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe_offset, swap_xe); + + /* Move all name/values to the end of the bucket. */ + xe = xh->xh_entries; + end = OCFS2_XATTR_BUCKET_SIZE; + for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { + offset = le16_to_cpu(xe->xe_name_offset); + len = namevalue_size_xe(xe); + + /* + * We must make sure that the name/value pair + * exist in the same block. So adjust end to + * the previous block end if needed. + */ + if (((end - len) / blocksize != + (end - 1) / blocksize)) + end = end - end % blocksize; + + if (end > offset + len) { + memmove(bucket_buf + end - len, + bucket_buf + offset, len); + xe->xe_name_offset = cpu_to_le16(end - len); + } + + mlog_bug_on_msg(end < offset + len, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + end -= len; + } + + mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + if (xh_free_start == end) + goto out; + + memset(bucket_buf + xh_free_start, 0, end - xh_free_start); + xh->xh_free_start = cpu_to_le16(end); + + /* sort the entries by their name_hash. */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); + + buf = bucket_buf; + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(bucket_block(bucket, i), buf, blocksize); + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + +out: + kfree(bucket_buf); + return ret; +} + +/* + * prev_blkno points to the start of an existing extent. new_blkno + * points to a newly allocated extent. Because we know each of our + * clusters contains more than bucket, we can easily split one cluster + * at a bucket boundary. So we take the last cluster of the existing + * extent and split it down the middle. We move the last half of the + * buckets in the last cluster of the existing extent over to the new + * extent. + * + * first_bh is the buffer at prev_blkno so we can update the existing + * extent's bucket count. header_bh is the bucket were we were hoping + * to insert our xattr. If the bucket move places the target in the new + * extent, we'll update first_bh and header_bh after modifying the old + * extent. + * + * first_hash will be set as the 1st xe's name_hash in the new extent. + */ +static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u64 new_blkno, + u32 num_clusters, + u32 *first_hash) +{ + int ret; + struct super_block *sb = inode->i_sb; + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb); + int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); + int to_move = num_buckets / 2; + u64 src_blkno; + u64 last_cluster_blkno = bucket_blkno(first) + + ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1)); + + BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); + + trace_ocfs2_mv_xattr_bucket_cross_cluster( + (unsigned long long)last_cluster_blkno, + (unsigned long long)new_blkno); + + ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), + last_cluster_blkno, new_blkno, + to_move, first_hash); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This is the first bucket that got moved */ + src_blkno = last_cluster_blkno + (to_move * blks_per_bucket); + + /* + * If the target bucket was part of the moved buckets, we need to + * update first and target. + */ + if (bucket_blkno(target) >= src_blkno) { + /* Find the block for the new target bucket */ + src_blkno = new_blkno + + (bucket_blkno(target) - src_blkno); + + ocfs2_xattr_bucket_relse(first); + ocfs2_xattr_bucket_relse(target); + + /* + * These shouldn't fail - the buffers are in the + * journal from ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_read_xattr_bucket(first, new_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_read_xattr_bucket(target, src_blkno); + if (ret) + mlog_errno(ret); + + } + +out: + return ret; +} + +/* + * Find the suitable pos when we divide a bucket into 2. + * We have to make sure the xattrs with the same hash value exist + * in the same bucket. + * + * If this ocfs2_xattr_header covers more than one hash value, find a + * place where the hash value changes. Try to find the most even split. + * The most common case is that all entries have different hash values, + * and the first check we make will find a place to split. + */ +static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh) +{ + struct ocfs2_xattr_entry *entries = xh->xh_entries; + int count = le16_to_cpu(xh->xh_count); + int delta, middle = count / 2; + + /* + * We start at the middle. Each step gets farther away in both + * directions. We therefore hit the change in hash value + * nearest to the middle. Note that this loop does not execute for + * count < 2. + */ + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (cmp_xe(&entries[middle - delta - 1], + &entries[middle - delta])) + return middle - delta; + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == count) + continue; + + /* Now try delta past middle */ + if (cmp_xe(&entries[middle + delta], + &entries[middle + delta + 1])) + return middle + delta + 1; + } + + /* Every entry had the same hash */ + return count; +} + +/* + * Move some xattrs in old bucket(blk) to new bucket(new_blk). + * first_hash will record the 1st hash of the new bucket. + * + * Normally half of the xattrs will be moved. But we have to make + * sure that the xattrs with the same hash value are stored in the + * same bucket. If all the xattrs in this bucket have the same hash + * value, the new bucket will be initialized as an empty one and the + * first_hash will be initialized as (hash_value+1). + */ +static int ocfs2_divide_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 blk, + u64 new_blk, + u32 *first_hash, + int new_bucket_head) +{ + int ret, i; + int count, start, len, name_value_len = 0, name_offset = 0; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + int blocksize = inode->i_sb->s_blocksize; + + trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk, + (unsigned long long)new_blk); + + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(s_bucket, blk); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Even if !new_bucket_head, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the + * same part of ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + new_bucket_head ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(s_bucket); + count = le16_to_cpu(xh->xh_count); + start = ocfs2_xattr_find_divide_pos(xh); + + if (start == count) { + xe = &xh->xh_entries[start-1]; + + /* + * initialized a new empty bucket here. + * The hash value is set as one larger than + * that of the last entry in the previous bucket. + */ + for (i = 0; i < t_bucket->bu_blocks; i++) + memset(bucket_block(t_bucket, i), 0, blocksize); + + xh = bucket_xh(t_bucket); + xh->xh_free_start = cpu_to_le16(blocksize); + xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; + le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); + + goto set_num_buckets; + } + + /* copy the whole bucket to the new first. */ + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); + + /* update the new bucket. */ + xh = bucket_xh(t_bucket); + + /* + * Calculate the total name/value len and xh_free_start for + * the old bucket first. + */ + name_offset = OCFS2_XATTR_BUCKET_SIZE; + name_value_len = 0; + for (i = 0; i < start; i++) { + xe = &xh->xh_entries[i]; + name_value_len += namevalue_size_xe(xe); + if (le16_to_cpu(xe->xe_name_offset) < name_offset) + name_offset = le16_to_cpu(xe->xe_name_offset); + } + + /* + * Now begin the modification to the new bucket. + * + * In the new bucket, We just move the xattr entry to the beginning + * and don't touch the name/value. So there will be some holes in the + * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is + * called. + */ + xe = &xh->xh_entries[start]; + len = sizeof(struct ocfs2_xattr_entry) * (count - start); + trace_ocfs2_divide_xattr_bucket_move(len, + (int)((char *)xe - (char *)xh), + (int)((char *)xh->xh_entries - (char *)xh)); + memmove((char *)xh->xh_entries, (char *)xe, len); + xe = &xh->xh_entries[count - start]; + len = sizeof(struct ocfs2_xattr_entry) * start; + memset((char *)xe, 0, len); + + le16_add_cpu(&xh->xh_count, -start); + le16_add_cpu(&xh->xh_name_value_len, -name_value_len); + + /* Calculate xh_free_start for the new bucket. */ + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (le16_to_cpu(xe->xe_name_offset) < + le16_to_cpu(xh->xh_free_start)) + xh->xh_free_start = xe->xe_name_offset; + } + +set_num_buckets: + /* set xh->xh_num_buckets for the new xh. */ + if (new_bucket_head) + xh->xh_num_buckets = cpu_to_le16(1); + else + xh->xh_num_buckets = 0; + + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); + + /* store the first_hash of the new bucket. */ + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + + /* + * Now only update the 1st block of the old bucket. If we + * just added a new empty bucket, there is no need to modify + * it. + */ + if (start == count) + goto out; + + xh = bucket_xh(s_bucket); + memset(&xh->xh_entries[start], 0, + sizeof(struct ocfs2_xattr_entry) * (count - start)); + xh->xh_count = cpu_to_le16(start); + xh->xh_free_start = cpu_to_le16(name_offset); + xh->xh_name_value_len = cpu_to_le16(name_value_len); + + ocfs2_xattr_bucket_journal_dirty(handle, s_bucket); + +out: + ocfs2_xattr_bucket_free(s_bucket); + ocfs2_xattr_bucket_free(t_bucket); + + return ret; +} + +/* + * Copy xattr from one bucket to another bucket. + * + * The caller must make sure that the journal transaction + * has enough space for journaling. + */ +static int ocfs2_cp_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 s_blkno, + u64 t_blkno, + int t_is_new) +{ + int ret; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; + + BUG_ON(s_blkno == t_blkno); + + trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno, + (unsigned long long)t_blkno, + t_is_new); + + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno); + if (ret) + goto out; + + /* + * Even if !t_is_new, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new); + if (ret) + goto out; + + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new + * cluster to fill, we came here from + * ocfs2_mv_xattr_buckets(), and it is really new - + * ACCESS_CREATE is required. But we also might have moved data + * out of t_bucket before extending back into it. + * ocfs2_add_new_xattr_bucket() can do this - its call to + * ocfs2_add_new_xattr_cluster() may have created a new extent + * and copied out the end of the old extent. Then it re-extends + * the old extent back to create space for new xattrs. That's + * how we get here, and the bucket isn't really new. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + t_is_new ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + goto out; + + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); + +out: + ocfs2_xattr_bucket_free(t_bucket); + ocfs2_xattr_bucket_free(s_bucket); + + return ret; +} + +/* + * src_blk points to the start of an existing extent. last_blk points to + * last cluster in that extent. to_blk points to a newly allocated + * extent. We copy the buckets from the cluster at last_blk to the new + * extent. If start_bucket is non-zero, we skip that many buckets before + * we start copying. The new extent's xh_num_buckets gets set to the + * number of buckets we copied. The old extent's xh_num_buckets shrinks + * by the same amount. + */ +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + struct ocfs2_xattr_bucket *old_first, *new_first; + + trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk, + (unsigned long long)to_blk); + + BUG_ON(start_bucket >= num_buckets); + if (start_bucket) { + num_buckets -= start_bucket; + last_blk += (start_bucket * blks_per_bucket); + } + + /* The first bucket of the original extent */ + old_first = ocfs2_xattr_bucket_new(inode); + /* The first bucket of the new extent */ + new_first = ocfs2_xattr_bucket_new(inode); + if (!old_first || !new_first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(old_first, src_blk); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We need to update the first bucket of the old extent and all + * the buckets going to the new extent. + */ + credits = ((num_buckets + 1) * blks_per_bucket); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, old_first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < num_buckets; i++) { + ret = ocfs2_cp_xattr_bucket(inode, handle, + last_blk + (i * blks_per_bucket), + to_blk + (i * blks_per_bucket), + 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Get the new bucket ready before we dirty anything + * (This actually shouldn't fail, because we already dirtied + * it once in ocfs2_cp_xattr_bucket()). + */ + ret = ocfs2_read_xattr_bucket(new_first, to_blk); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_bucket_journal_access(handle, new_first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Now update the headers */ + le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, old_first); + + bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, new_first); + + if (first_hash) + *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash); + +out: + ocfs2_xattr_bucket_free(new_first); + ocfs2_xattr_bucket_free(old_first); + return ret; +} + +/* + * Move some xattrs in this cluster to the new cluster. + * This function should only be called when bucket size == cluster size. + * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead. + */ +static int ocfs2_divide_xattr_cluster(struct inode *inode, + handle_t *handle, + u64 prev_blk, + u64 new_blk, + u32 *first_hash) +{ + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits = 2 * blk_per_bucket; + + BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); + + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + return ocfs2_divide_xattr_bucket(inode, handle, prev_blk, + new_blk, first_hash, 1); +} + +/* + * Move some xattrs from the old cluster to the new one since they are not + * contiguous in ocfs2 xattr tree. + * + * new_blk starts a new separate cluster, and we will move some xattrs from + * prev_blk to it. v_start will be set as the first name hash value in this + * new cluster so that it can be used as e_cpos during tree insertion and + * don't collide with our original b-tree operations. first_bh and header_bh + * will also be updated since they will be used in ocfs2_extend_xattr_bucket + * to extend the insert bucket. + * + * The problem is how much xattr should we move to the new one and when should + * we update first_bh and header_bh? + * 1. If cluster size > bucket size, that means the previous cluster has more + * than 1 bucket, so just move half nums of bucket into the new cluster and + * update the first_bh and header_bh if the insert bucket has been moved + * to the new cluster. + * 2. If cluster_size == bucket_size: + * a) If the previous extent rec has more than one cluster and the insert + * place isn't in the last cluster, copy the entire last cluster to the + * new one. This time, we don't need to upate the first_bh and header_bh + * since they will not be moved into the new cluster. + * b) Otherwise, move the bottom half of the xattrs in the last cluster into + * the new one. And we set the extend flag to zero if the insert place is + * moved into the new allocated cluster since no extend is needed. + */ +static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u64 new_blk, + u32 prev_clusters, + u32 *v_start, + int *extend) +{ + int ret; + + trace_ocfs2_adjust_xattr_cross_cluster( + (unsigned long long)bucket_blkno(first), + (unsigned long long)new_blk, prev_clusters); + + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { + ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, + handle, + first, target, + new_blk, + prev_clusters, + v_start); + if (ret) + mlog_errno(ret); + } else { + /* The start of the last cluster in the first extent */ + u64 last_blk = bucket_blkno(first) + + ((prev_clusters - 1) * + ocfs2_clusters_to_blocks(inode->i_sb, 1)); + + if (prev_clusters > 1 && bucket_blkno(target) != last_blk) { + ret = ocfs2_mv_xattr_buckets(inode, handle, + bucket_blkno(first), + last_blk, new_blk, 0, + v_start); + if (ret) + mlog_errno(ret); + } else { + ret = ocfs2_divide_xattr_cluster(inode, handle, + last_blk, new_blk, + v_start); + if (ret) + mlog_errno(ret); + + if ((bucket_blkno(target) == last_blk) && extend) + *extend = 0; + } + } + + return ret; +} + +/* + * Add a new cluster for xattr storage. + * + * If the new cluster is contiguous with the previous one, it will be + * appended to the same extent record, and num_clusters will be updated. + * If not, we will insert a new extent for it and move some xattrs in + * the last cluster into the new allocated one. + * We also need to limit the maximum size of a btree leaf, otherwise we'll + * lose the benefits of hashing because we'll have to search large leaves. + * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize, + * if it's bigger). + * + * first_bh is the first block of the previous extent rec and header_bh + * indicates the bucket we will insert the new xattrs. They will be updated + * when the header_bh is moved into the new cluster. + */ +static int ocfs2_add_new_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u32 *num_clusters, + u32 prev_cpos, + int *extend, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 prev_clusters = *num_clusters; + u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; + u64 block; + handle_t *handle = ctxt->handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + + trace_ocfs2_add_new_xattr_cluster_begin( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)bucket_blkno(first), + prev_cpos, prev_clusters); + + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits); + + if (bucket_blkno(first) + (prev_clusters * bpc) == block && + (prev_clusters + num_bits) << osb->s_clustersize_bits <= + OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { + /* + * If this cluster is contiguous with the old one and + * adding this new cluster, we don't surpass the limit of + * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be + * initialized and used like other buckets in the previous + * cluster. + * So add it as a contiguous one. The caller will handle + * its init process. + */ + v_start = prev_cpos + prev_clusters; + *num_clusters = prev_clusters + num_bits; + } else { + ret = ocfs2_adjust_xattr_cross_cluster(inode, + handle, + first, + target, + block, + prev_clusters, + &v_start, + extend); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + + trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block, + v_start, num_bits); + ret = ocfs2_insert_extent(handle, &et, v_start, block, + num_bits, 0, ctxt->meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ocfs2_journal_dirty(handle, root_bh); + +leave: + return ret; +} + +/* + * We are given an extent. 'first' is the bucket at the very front of + * the extent. The extent has space for an additional bucket past + * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number + * of the target bucket. We wish to shift every bucket past the target + * down one, filling in that additional space. When we get back to the + * target, we split the target between itself and the now-empty bucket + * at target+1 (aka, target_blkno + blks_per_bucket). + */ +static int ocfs2_extend_xattr_bucket(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + u64 target_blk, + u32 num_clusters) +{ + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 end_blk; + u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); + + trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk, + (unsigned long long)bucket_blkno(first), + num_clusters, new_bucket); + + /* The extent must have room for an additional bucket */ + BUG_ON(new_bucket >= + (num_clusters * ocfs2_xattr_buckets_per_cluster(osb))); + + /* end_blk points to the last existing bucket */ + end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket); + + /* + * end_blk is the start of the last existing bucket. + * Thus, (end_blk - target_blk) covers the target bucket and + * every bucket after it up to, but not including, the last + * existing bucket. Then we add the last existing bucket, the + * new bucket, and the first bucket (3 * blk_per_bucket). + */ + credits = (end_blk - target_blk) + (3 * blk_per_bucket); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + while (end_blk != target_blk) { + ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, + end_blk + blk_per_bucket, 0); + if (ret) + goto out; + end_blk -= blk_per_bucket; + } + + /* Move half of the xattr in target_blkno to the next bucket. */ + ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk, + target_blk + blk_per_bucket, NULL, 0); + + le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1); + ocfs2_xattr_bucket_journal_dirty(handle, first); + +out: + return ret; +} + +/* + * Add new xattr bucket in an extent record and adjust the buckets + * accordingly. xb_bh is the ocfs2_xattr_block, and target is the + * bucket we want to insert into. + * + * In the easy case, we will move all the buckets after target down by + * one. Half of target's xattrs will be moved to the next bucket. + * + * If current cluster is full, we'll allocate a new one. This may not + * be contiguous. The underlying calls will make sure that there is + * space for the insert, shifting buckets around if necessary. + * 'target' may be moved by those calls. + */ +static int ocfs2_add_new_xattr_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct ocfs2_xattr_bucket *target, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u32 name_hash = + le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret, num_buckets, extend = 1; + u64 p_blkno; + u32 e_cpos, num_clusters; + /* The bucket at the front of the extent */ + struct ocfs2_xattr_bucket *first; + + trace_ocfs2_add_new_xattr_bucket( + (unsigned long long)bucket_blkno(target)); + + /* The first bucket of the original extent */ + first = ocfs2_xattr_bucket_new(inode); + if (!first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(first, p_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; + if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) { + /* + * This can move first+target if the target bucket moves + * to the new extent. + */ + ret = ocfs2_add_new_xattr_cluster(inode, + xb_bh, + first, + target, + &num_clusters, + e_cpos, + &extend, + ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (extend) { + ret = ocfs2_extend_xattr_bucket(inode, + ctxt->handle, + first, + bucket_blkno(target), + num_clusters); + if (ret) + mlog_errno(ret); + } + +out: + ocfs2_xattr_bucket_free(first); + + return ret; +} + +/* + * Truncate the specified xe_off entry in xattr bucket. + * bucket is indicated by header_bh and len is the new length. + * Both the ocfs2_xattr_value_root and the entry will be updated here. + * + * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. + */ +static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int xe_off, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret, offset; + u64 value_blk; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + size_t blocksize = inode->i_sb->s_blocksize; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + xe = &xh->xh_entries[xe_off]; + + BUG_ON(!xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + value_blk = offset / blocksize; + + /* We don't allow ocfs2_xattr_value to be stored in different block. */ + BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); + + vb.vb_bh = bucket->bu_bhs[value_blk]; + BUG_ON(!vb.vb_bh); + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (vb.vb_bh->b_data + offset % blocksize); + + /* + * From here on out we have to dirty the bucket. The generic + * value calls only modify one of the bucket's bhs, but we need + * to send the bucket at once. So if they error, they *could* have + * modified something. We have to assume they did, and dirty + * the whole bucket. This leaves us in a consistent state. + */ + trace_ocfs2_xattr_bucket_value_truncate( + (unsigned long long)bucket_blkno(bucket), xe_off, len); + ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xe->xe_value_size = cpu_to_le64(len); + + ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); + +out: + return ret; +} + +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ret = ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_delete_xattr_in_bucket, para); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); + + ocfs2_init_dealloc_ctxt(&dealloc); + + trace_ocfs2_rm_xattr_cluster( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, cpos, len); + + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno, + len); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); + ocfs2_journal_dirty(handle, root_bh); + + ret = ocfs2_truncate_log_append(osb, handle, blkno, len); + if (ret) + mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +/* + * check whether the xattr bucket is filled up with the same hash value. + * If we want to insert the xattr with the same hash, return -ENOSPC. + * If we want to insert a xattr with different hash value, go ahead + * and ocfs2_divide_xattr_bucket will handle this. + */ +static int ocfs2_check_xattr_bucket_collision(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + const char *name) +{ + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) + return 0; + + if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash == + xh->xh_entries[0].xe_name_hash) { + mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " + "hash = %u\n", + (unsigned long long)bucket_blkno(bucket), + le32_to_cpu(xh->xh_entries[0].xe_name_hash)); + return -ENOSPC; + } + + return 0; +} + +/* + * Try to set the entry in the current bucket. If we fail, the caller + * will handle getting us another bucket. + */ +static int ocfs2_xattr_set_entry_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_xa_loc loc; + + trace_ocfs2_xattr_set_entry_bucket(xi->xi_name); + + ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) { + xs->here = loc.xl_entry; + goto out; + } + if (ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Ok, we need space. Let's try defragmenting the bucket. */ + ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle, + xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) { + xs->here = loc.xl_entry; + goto out; + } + if (ret != -ENOSPC) + mlog_errno(ret); + + +out: + return ret; +} + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + + trace_ocfs2_xattr_set_entry_index_block(xi->xi_name); + + ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); + if (!ret) + goto out; + if (ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Ack, need more space. Let's try to get another bucket! */ + + /* + * We do not allow for overlapping ranges between buckets. And + * the maximum number of collisions we will allow for then is + * one bucket's worth, so check it here whether we need to + * add a new bucket for the insert. + */ + ret = ocfs2_check_xattr_bucket_collision(inode, + xs->bucket, + xi->xi_name); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_add_new_xattr_bucket(inode, + xs->xattr_bh, + xs->bucket, + ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_add_new_xattr_bucket() will have updated + * xs->bucket if it moved, but it will not have updated + * any of the other search fields. Thus, we drop it and + * re-search. Everything should be cached, so it'll be + * quick. + */ + ocfs2_xattr_bucket_relse(xs->bucket); + ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, + xi->xi_name_index, + xi->xi_name, xs); + if (ret && ret != -ENODATA) + goto out; + xs->not_found = ret; + + /* Ok, we have a new bucket, let's try again */ + ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); + if (ret && (ret != -ENOSPC)) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, ref_credits; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u16 i; + struct ocfs2_xattr_entry *xe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; + int credits = ocfs2_remove_extent_credits(osb->sb) + + ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct ocfs2_xattr_value_root *xv; + struct ocfs2_rm_xattr_bucket_para *args = + (struct ocfs2_rm_xattr_bucket_para *)para; + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, + i, &xv, NULL); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_lock_xattr_remove_allocators(inode, xv, + args->ref_ci, + args->ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_bucket_value_truncate(inode, bucket, + i, 0, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } + if (ret) { + mlog_errno(ret); + break; + } + } + + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + return ret; +} + +/* + * Whenever we modify a xattr value root in the bucket(e.g, CoW + * or change the extent record flag), we need to recalculate + * the metaecc for the whole bucket. So it is done here. + * + * Note: + * We have to give the extra credits for the caller. + */ +static int ocfs2_xattr_bucket_post_refcount(struct inode *inode, + handle_t *handle, + void *para) +{ + int ret; + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + + return 0; +} + +/* + * Special action we need if the xattr value is refcounted. + * + * 1. If the xattr is refcounted, lock the tree. + * 2. CoW the xattr if we are setting the new value and the value + * will be stored outside. + * 3. In other case, decrease_refcount will work for us, so just + * lock the refcount tree, calculate the meta and credits is OK. + * + * We have to do CoW before ocfs2_init_xattr_set_ctxt since + * currently CoW is a completed transaction, while this function + * will also lock the allocators and let us deadlock. So we will + * CoW the whole xattr value. + */ +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_add, + int *credits) +{ + int ret = 0; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_entry *xe; + char *base; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + int name_offset, name_len; + struct ocfs2_xattr_value_buf vb; + struct ocfs2_xattr_bucket *bucket = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_post_refcount refcount; + struct ocfs2_post_refcount *p = NULL; + struct buffer_head *ref_root_bh = NULL; + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + vb.vb_bh = xis->inode_bh; + vb.vb_access = ocfs2_journal_access_di; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + base = bucket_block(xbs->bucket, block_off); + vb.vb_bh = xbs->bucket->bu_bhs[block_off]; + vb.vb_access = ocfs2_journal_access; + + if (ocfs2_meta_ecc(osb)) { + /*create parameters for ocfs2_post_refcount. */ + bucket = xbs->bucket; + refcount.credits = bucket->bu_blocks; + refcount.para = bucket; + refcount.func = + ocfs2_xattr_bucket_post_refcount; + p = &refcount; + } + } else { + base = xbs->base; + vb.vb_bh = xbs->xattr_bh; + vb.vb_access = ocfs2_journal_access_xb; + } + } + + if (ocfs2_xattr_is_local(xe)) + goto out; + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, &vb.vb_xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We just need to check the 1st extent record, since we always + * CoW the whole xattr. So there shouldn't be a xattr with + * some REFCOUNT extent recs after the 1st one. + */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * If we are deleting the xattr or the new size will be stored inside, + * cool, leave it there, the xattr truncate process will remove them + * for us(it still needs the refcount tree lock and the meta, credits). + * And the worse case is that every cluster truncate will split the + * refcount tree, and make the original extent become 3. So we will need + * 2 * cluster more extent recs at most. + */ + if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) { + + ret = ocfs2_refcounted_xattr_delete_need(inode, + &(*ref_tree)->rf_ci, + ref_root_bh, vb.vb_xv, + meta_add, credits); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_refcount_cow_xattr(inode, di, &vb, + *ref_tree, ref_root_bh, 0, + le32_to_cpu(vb.vb_xv->xr_clusters), p); + if (ret) + mlog_errno(ret); + +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root. + * The physical clusters will be added to refcount tree. + */ +static int ocfs2_xattr_value_attach_refcount(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_extent_tree *value_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *refcount) +{ + int ret = 0; + u32 clusters = le32_to_cpu(xv->xr_clusters); + u32 cpos, p_cluster, num_clusters; + struct ocfs2_extent_list *el = &xv->xr_list; + unsigned int ext_flags; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + cpos += num_clusters; + if ((ext_flags & OCFS2_EXT_REFCOUNTED)) + continue; + + BUG_ON(!p_cluster); + + ret = ocfs2_add_refcount_flag(inode, value_et, + ref_ci, ref_root_bh, + cpos - num_clusters, + p_cluster, num_clusters, + dealloc, refcount); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +/* + * Given a normal ocfs2_xattr_header, refcount all the entries which + * have value stored outside. + * Used for xattrs stored in inode and ocfs2_xattr_block. + */ +static int ocfs2_xattr_attach_refcount_normal(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_extent_tree et; + int i, ret = 0; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + xe = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + xv = (struct ocfs2_xattr_value_root *)((void *)header + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + vb->vb_xv = xv; + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et, + ref_ci, ref_root_bh, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_xattr_inline_attach_refcount(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *) + (fe_bh->b_data + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + struct ocfs2_xattr_value_buf vb = { + .vb_bh = fe_bh, + .vb_access = ocfs2_journal_access_di, + }; + + return ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, dealloc); +} + +struct ocfs2_xattr_tree_value_refcount_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; +}; + +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh) +{ + int ret, block_off, name_offset; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + void *base; + + ret = ocfs2_xattr_bucket_get_name_value(sb, + bucket_xh(bucket), + offset, + &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + + base = bucket_block(bucket, block_off); + + *xv = (struct ocfs2_xattr_value_root *)(base + name_offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (bh) + *bh = bucket->bu_bhs[block_off]; +out: + return ret; +} + +/* + * For a given xattr bucket, refcount all the entries which + * have value stored outside. + */ +static int ocfs2_xattr_bucket_value_refcount(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int i, ret = 0; + struct ocfs2_extent_tree et; + struct ocfs2_xattr_tree_value_refcount_para *ref = + (struct ocfs2_xattr_tree_value_refcount_para *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + struct ocfs2_post_refcount refcount = { + .credits = bucket->bu_blocks, + .para = bucket, + .func = ocfs2_xattr_bucket_post_refcount, + }; + struct ocfs2_post_refcount *p = NULL; + + /* We only need post_refcount if we support metaecc. */ + if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) + p = &refcount; + + trace_ocfs2_xattr_bucket_value_refcount( + (unsigned long long)bucket_blkno(bucket), + le16_to_cpu(xh->xh_count)); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, + &vb.vb_xv, &vb.vb_bh); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_init_xattr_value_extent_tree(&et, + INODE_CACHE(inode), &vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv, + &et, ref->ref_ci, + ref->ref_root_bh, + ref->dealloc, p); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; + +} + +static int ocfs2_refcount_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_xattr_bucket_value_refcount, + para); +} + +static int ocfs2_xattr_block_attach_refcount(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, + dealloc); + } else { + struct ocfs2_xattr_tree_value_refcount_para para = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + .dealloc = dealloc, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_refcount_xattr_tree_rec, + ¶); + } + + return ret; +} + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct buffer_head *blk_bh = NULL; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh, + ref_ci, ref_root_bh, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!di->i_xattr_loc) + goto out; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci, + ref_root_bh, dealloc); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); +out: + + return ret; +} + +typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe); +/* + * Store the information we need in xattr reflink. + * old_bh and new_bh are inode bh for the old and new inode. + */ +struct ocfs2_xattr_reflink { + struct inode *old_inode; + struct inode *new_inode; + struct buffer_head *old_bh; + struct buffer_head *new_bh; + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; + should_xattr_reflinked *xattr_reflinked; +}; + +/* + * Given a xattr header and xe offset, + * return the proper xv and the corresponding bh. + * xattr in inode, block and xattr tree have different implementaions. + */ +typedef int (get_xattr_value_root)(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para); + +/* + * Calculate all the xattr value root metadata stored in this xattr header and + * credits we need if we create them from the scratch. + * We use get_xattr_value_root so that all types of xattr container can use it. + */ +static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int *metas, int *credits, + int *num_recs, + get_xattr_value_root *func, + void *para) +{ + int i, ret = 0; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + *metas += le16_to_cpu(xv->xr_list.l_tree_depth) * + le16_to_cpu(xv->xr_list.l_next_free_rec); + + *credits += ocfs2_calc_extend_credits(sb, + &def_xv.xv.xr_list); + + /* + * If the value is a tree with depth > 1, We don't go deep + * to the extent block, so just calculate a maximum record num. + */ + if (!xv->xr_list.l_tree_depth) + *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec); + else + *num_recs += ocfs2_clusters_for_bytes(sb, + XATTR_SIZE_MAX); + } + + return ret; +} + +/* Used by xattr inode and block to return the right xv and buffer_head. */ +static int ocfs2_get_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + + *xv = (struct ocfs2_xattr_value_root *)((void *)xh + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (ret_bh) + *ret_bh = bh; + + return 0; +} + +/* + * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * It is only used for inline xattr and xattr block. + */ +static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, + struct ocfs2_xattr_header *xh, + struct buffer_head *ref_root_bh, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, meta_add = 0, num_recs = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + *credits = 0; + + ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh, + &meta_add, credits, &num_recs, + ocfs2_get_xattr_value_root, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + */ + num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2; + meta_add += num_recs; + *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given a xattr header, reflink all the xattrs in this container. + * It can be used for inode, block and bucket. + * + * NOTE: + * Before we call this function, the caller has memcpy the xattr in + * old_xh to the new_xh. + * + * If args.xattr_reflinked is set, call it to decide whether the xe should + * be reflinked or not. If not, remove it from the new xattr header. + */ +static int ocfs2_reflink_xattr_header(handle_t *handle, + struct ocfs2_xattr_reflink *args, + struct buffer_head *old_bh, + struct ocfs2_xattr_header *xh, + struct buffer_head *new_bh, + struct ocfs2_xattr_header *new_xh, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_alloc_context *meta_ac, + get_xattr_value_root *func, + void *para) +{ + int ret = 0, i, j; + struct super_block *sb = args->old_inode->i_sb; + struct buffer_head *value_bh; + struct ocfs2_xattr_entry *xe, *last; + struct ocfs2_xattr_value_root *xv, *new_xv; + struct ocfs2_extent_tree data_et; + u32 clusters, cpos, p_cluster, num_clusters; + unsigned int ext_flags = 0; + + trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr, + le16_to_cpu(xh->xh_count)); + + last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; + for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { + xe = &xh->xh_entries[i]; + + if (args->xattr_reflinked && !args->xattr_reflinked(xe)) { + xe = &new_xh->xh_entries[j]; + + le16_add_cpu(&new_xh->xh_count, -1); + if (new_xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } + + /* + * We don't want j to increase in the next round since + * it is already moved ahead. + */ + j--; + continue; + } + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, old_bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * For the xattr which has l_tree_depth = 0, all the extent + * recs have already be copied to the new xh with the + * propriate OCFS2_EXT_REFCOUNTED flag we just need to + * increase the refount count int the refcount tree. + * + * For the xattr which has l_tree_depth > 0, we need + * to initialize it to the empty default value root, + * and then insert the extents one by one. + */ + if (xv->xr_list.l_tree_depth) { + memcpy(new_xv, &def_xv, sizeof(def_xv)); + vb->vb_xv = new_xv; + vb->vb_bh = value_bh; + ocfs2_init_xattr_value_extent_tree(&data_et, + INODE_CACHE(args->new_inode), vb); + } + + clusters = le32_to_cpu(xv->xr_clusters); + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(args->old_inode, + cpos, + &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!p_cluster); + + if (xv->xr_list.l_tree_depth) { + ret = ocfs2_insert_extent(handle, + &data_et, cpos, + ocfs2_clusters_to_blocks( + args->old_inode->i_sb, + p_cluster), + num_clusters, ext_flags, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_increase_refcount(handle, args->ref_ci, + args->ref_root_bh, + p_cluster, num_clusters, + meta_ac, args->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + } + } + +out: + return ret; +} + +static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data; + int inline_size = le16_to_cpu(di->i_xattr_inline_size); + int header_off = osb->sb->s_blocksize - inline_size; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *) + (args->old_bh->b_data + header_off); + struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *) + (args->new_bh->b_data + header_off); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_inode_info *new_oi; + struct ocfs2_dinode *new_di; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = args->new_bh, + .vb_access = ocfs2_journal_access_di, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode), + args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(args->new_bh->b_data + header_off, + args->old_bh->b_data + header_off, inline_size); + + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + new_di->i_xattr_inline_size = cpu_to_le16(inline_size); + + ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh, + args->new_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_oi = OCFS2_I(args->new_inode); + /* + * Adjust extent record count to reserve space for extended attribute. + * Inline data count had been adjusted in ocfs2_duplicate_inline_data(). + */ + if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) && + !(ocfs2_inode_is_fast_symlink(args->new_inode))) { + struct ocfs2_extent_list *el = &new_di->id2.i_list; + le16_add_cpu(&el->l_count, -(inline_size / + sizeof(struct ocfs2_extent_rec))); + } + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_create_empty_xattr_block(struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **ret_bh, + int indexed) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt; + + memset(&ctxt, 0, sizeof(ctxt)); + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto out; + } + + trace_ocfs2_create_empty_xattr_block( + (unsigned long long)fe_bh->b_blocknr, indexed); + ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed, + ret_bh); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, ctxt.handle); +out: + ocfs2_free_alloc_context(ctxt.meta_ac); + return ret; +} + +static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode); + struct ocfs2_dinode *new_di; + struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb); + int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_block *new_xb = + (struct ocfs2_xattr_block *)new_blk_bh->b_data; + struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = new_blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* One more credits in case we need to add xattr flags in new inode. */ + handle = ocfs2_start_trans(osb, credits + 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + ret = ocfs2_journal_access_di(handle, + INODE_CACHE(args->new_inode), + args->new_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode), + new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off, + osb->sb->s_blocksize - header_off); + + ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh, + new_blk_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_journal_dirty(handle, new_blk_bh); + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +struct ocfs2_reflink_xattr_tree_args { + struct ocfs2_xattr_reflink *reflink; + struct buffer_head *old_blk_bh; + struct buffer_head *new_blk_bh; + struct ocfs2_xattr_bucket *old_bucket; + struct ocfs2_xattr_bucket *new_bucket; +}; + +/* + * NOTE: + * We have to handle the case that both old bucket and new bucket + * will call this function to get the right ret_bh. + * So The caller must give us the right bh. + */ +static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_reflink_xattr_tree_args *args = + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_xattr_bucket *bucket; + + if (bh == args->old_bucket->bu_bhs[0]) + bucket = args->old_bucket; + else + bucket = args->new_bucket; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +struct ocfs2_value_tree_metas { + int num_metas; + int credits; + int num_recs; +}; + +static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +static int ocfs2_calc_value_tree_metas(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + struct ocfs2_value_tree_metas *metas = + (struct ocfs2_value_tree_metas *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + + /* Add the credits for this bucket first. */ + metas->credits += bucket->bu_blocks; + return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0], + xh, &metas->num_metas, + &metas->credits, &metas->num_recs, + ocfs2_value_tree_metas_in_bucket, + bucket); +} + +/* + * Given a xattr extent rec starting from blkno and having len clusters, + * iterate all the buckets calculate how much metadata we need for reflinking + * all the ocfs2_xattr_value_root and lock the allocators accordingly. + */ +static int ocfs2_lock_reflink_xattr_rec_allocators( + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *xt_et, + u64 blkno, u32 len, int *credits, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac) +{ + int ret, num_free_extents; + struct ocfs2_value_tree_metas metas; + struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb); + struct ocfs2_refcount_block *rb; + + memset(&metas, 0, sizeof(metas)); + + ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len, + ocfs2_calc_value_tree_metas, &metas); + if (ret) { + mlog_errno(ret); + goto out; + } + + *credits = metas.credits; + + /* + * Calculate we need for refcount tree change. + * + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + * In the end, we have to add credits for modifying the already + * existed refcount block. + */ + rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data; + metas.num_recs = + (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) / + ocfs2_refcount_recs_per_rb(osb->sb) * 2; + metas.num_metas += metas.num_recs; + *credits += metas.num_recs + + metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + /* count in the xattr tree change. */ + num_free_extents = ocfs2_num_free_extents(osb, xt_et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < len) + metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el); + + *credits += ocfs2_calc_extend_credits(osb->sb, + xt_et->et_root_el); + + if (metas.num_metas) { + ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (len) { + ret = ocfs2_reserve_clusters(osb, len, data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_reflink_xattr_bucket(handle_t *handle, + u64 blkno, u64 new_blkno, u32 clusters, + u32 *cpos, int num_buckets, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_reflink_xattr_tree_args *args) +{ + int i, j, ret = 0; + struct super_block *sb = args->reflink->old_inode->i_sb; + int bpb = args->old_bucket->bu_blocks; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) { + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + for (j = 0; j < bpb; j++) + memcpy(bucket_block(args->new_bucket, j), + bucket_block(args->old_bucket, j), + sb->s_blocksize); + + /* + * Record the start cpos so that we can use it to initialize + * our xattr tree we also set the xh_num_bucket for the new + * bucket. + */ + if (i == 0) { + *cpos = le32_to_cpu(bucket_xh(args->new_bucket)-> + xh_entries[0].xe_name_hash); + bucket_xh(args->new_bucket)->xh_num_buckets = + cpu_to_le16(num_buckets); + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ret = ocfs2_reflink_xattr_header(handle, args->reflink, + args->old_bucket->bu_bhs[0], + bucket_xh(args->old_bucket), + args->new_bucket->bu_bhs[0], + bucket_xh(args->new_bucket), + &vb, meta_ac, + ocfs2_get_reflink_xattr_value_root, + args); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * Re-access and dirty the bucket to calculate metaecc. + * Because we may extend the transaction in reflink_xattr_header + * which will let the already accessed block gone. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + } + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + struct inode *inode, + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + u64 blkno, u32 cpos, u32 len) +{ + int ret, first_inserted = 0; + u32 p_cluster, num_clusters, reflink_cpos = 0; + u64 new_blkno; + unsigned int num_buckets, reflink_buckets; + unsigned int bpc = + ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets); + ocfs2_xattr_bucket_relse(args->old_bucket); + + while (len && num_buckets) { + ret = ocfs2_claim_clusters(handle, data_ac, + 1, &p_cluster, &num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + reflink_buckets = min(num_buckets, bpc * num_clusters); + + ret = ocfs2_reflink_xattr_bucket(handle, blkno, + new_blkno, num_clusters, + &reflink_cpos, reflink_buckets, + meta_ac, data_ac, args); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * For the 1st allocated cluster, we make it use the same cpos + * so that the xattr tree looks the same as the original one + * in the most case. + */ + if (!first_inserted) { + reflink_cpos = cpos; + first_inserted = 1; + } + ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno, + num_clusters, 0, meta_ac); + if (ret) + mlog_errno(ret); + + trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno, + num_clusters, reflink_cpos); + + len -= num_clusters; + blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + num_buckets -= reflink_buckets; + } +out: + return ret; +} + +/* + * Create the same xattr extent record in the new inode's xattr tree. + */ +static int ocfs2_reflink_xattr_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para) +{ + int ret, credits = 0; + handle_t *handle; + struct ocfs2_reflink_xattr_tree_args *args = + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_extent_tree et; + + trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len); + + ocfs2_init_xattr_tree_extent_tree(&et, + INODE_CACHE(args->reflink->new_inode), + args->new_blk_bh); + + ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno, + len, &credits, + &meta_ac, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et, + meta_ac, data_ac, + blkno, cpos, len); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + return ret; +} + +/* + * Create reflinked xattr buckets. + * We will add bucket one by one, and refcount all the xattrs in the bucket + * if they are stored outside. + */ +static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret; + struct ocfs2_reflink_xattr_tree_args para; + + memset(¶, 0, sizeof(para)); + para.reflink = args; + para.old_blk_bh = blk_bh; + para.new_blk_bh = new_blk_bh; + + para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode); + if (!para.old_bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode); + if (!para.new_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh, + ocfs2_reflink_xattr_rec, + ¶); + if (ret) + mlog_errno(ret); + +out: + ocfs2_xattr_bucket_free(para.old_bucket); + ocfs2_xattr_bucket_free(para.new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh) +{ + int ret, indexed = 0; + struct buffer_head *new_blk_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) + indexed = 1; + + ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh, + &new_blk_bh, indexed); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!indexed) + ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); + else + ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); + if (ret) + mlog_errno(ret); + +out: + brelse(new_blk_bh); + return ret; +} + +static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe) +{ + int type = ocfs2_xattr_get_type(xe); + + return type != OCFS2_XATTR_INDEX_SECURITY && + type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS && + type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; +} + +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security) +{ + int ret; + struct ocfs2_xattr_reflink args; + struct ocfs2_inode_info *oi = OCFS2_I(old_inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh = NULL; + + ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_dealloc_ctxt(&dealloc); + + args.old_inode = old_inode; + args.new_inode = new_inode; + args.old_bh = old_bh; + args.new_bh = new_bh; + args.ref_ci = &ref_tree->rf_ci; + args.ref_root_bh = ref_root_bh; + args.dealloc = &dealloc; + if (preserve_security) + args.xattr_reflinked = NULL; + else + args.xattr_reflinked = ocfs2_reflink_xattr_no_security; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_reflink_xattr_inline(&args); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + } + + if (!di->i_xattr_loc) + goto out_unlock; + + ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_reflink_xattr_in_block(&args, blk_bh); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); + +out_unlock: + ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb), + ref_tree, 1); + brelse(ref_root_bh); + + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1); + ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc); + } + +out: + return ret; +} + +/* + * Initialize security and acl for a already created inode. + * Used for reflink a non-preserve-security file. + * + * It uses common api like ocfs2_xattr_set, so the caller + * must not hold any lock expect i_mutex. + */ +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode, + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl) +{ + struct buffer_head *dir_bh = NULL; + int ret = 0; + + ret = ocfs2_init_security_get(inode, dir, qstr, NULL); + if (ret) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_inode_lock(dir, &dir_bh, 0); + if (ret) { + mlog_errno(ret); + goto leave; + } + + if (!ret && default_acl) + ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (!ret && acl) + ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); + + ocfs2_inode_unlock(dir, 0); + brelse(dir_bh); +leave: + return ret; +} +/* + * 'security' attributes support + */ +static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, XATTR_CREATE); + if (err) + break; + } + return err; +} + +int ocfs2_init_security_get(struct inode *inode, + struct inode *dir, + const struct qstr *qstr, + struct ocfs2_security_xattr_info *si) +{ + /* check whether ocfs2 support feature xattr */ + if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) + return -EOPNOTSUPP; + if (si) + return security_old_inode_init_security(inode, dir, qstr, + &si->name, &si->value, + &si->value_len); + + return security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, NULL); +} + +int ocfs2_init_security_set(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_security_xattr_info *si, + struct ocfs2_alloc_context *xattr_ac, + struct ocfs2_alloc_context *data_ac) +{ + return ocfs2_xattr_set_handle(handle, inode, di_bh, + OCFS2_XATTR_INDEX_SECURITY, + si->name, si->value, si->value_len, 0, + xattr_ac, data_ac); +} + +const struct xattr_handler ocfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ocfs2_xattr_security_list, + .get = ocfs2_xattr_security_get, + .set = ocfs2_xattr_security_set, +}; + +/* + * 'trusted' attributes support + */ +static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, + name, value, size, flags); +} + +const struct xattr_handler ocfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ocfs2_xattr_trusted_list, + .get = ocfs2_xattr_trusted_get, + .set = ocfs2_xattr_trusted_set, +}; + +/* + * 'user' attributes support + */ +static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, + buffer, size); +} + +static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ocfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ocfs2_xattr_user_list, + .get = ocfs2_xattr_user_get, + .set = ocfs2_xattr_user_set, +}; diff --git a/kernel/fs/ocfs2/xattr.h b/kernel/fs/ocfs2/xattr.h new file mode 100644 index 000000000..f10d5b93c --- /dev/null +++ b/kernel/fs/ocfs2/xattr.h @@ -0,0 +1,100 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.h + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_XATTR_H +#define OCFS2_XATTR_H + +#include +#include + +enum ocfs2_xattr_type { + OCFS2_XATTR_INDEX_USER = 1, + OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + OCFS2_XATTR_INDEX_TRUSTED, + OCFS2_XATTR_INDEX_SECURITY, + OCFS2_XATTR_MAX +}; + +struct ocfs2_security_xattr_info { + int enable; + const char *name; + void *value; + size_t value_len; +}; + +extern const struct xattr_handler ocfs2_xattr_user_handler; +extern const struct xattr_handler ocfs2_xattr_trusted_handler; +extern const struct xattr_handler ocfs2_xattr_security_handler; +extern const struct xattr_handler *ocfs2_xattr_handlers[]; + +ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, + const char *, void *, size_t); +int ocfs2_xattr_set(struct inode *, int, const char *, const void *, + size_t, int); +int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, + int, const char *, const void *, size_t, int, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di); +int ocfs2_xattr_remove(struct inode *, struct buffer_head *); +int ocfs2_init_security_get(struct inode *, struct inode *, + const struct qstr *, + struct ocfs2_security_xattr_info *); +int ocfs2_init_security_set(handle_t *, struct inode *, + struct buffer_head *, + struct ocfs2_security_xattr_info *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); +int ocfs2_calc_security_init(struct inode *, + struct ocfs2_security_xattr_info *, + int *, int *, struct ocfs2_alloc_context **); +int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, + umode_t, struct ocfs2_security_xattr_info *, + int *, int *, int *); + +/* + * xattrs can live inside an inode, as part of an external xattr block, + * or inside an xattr bucket, which is the leaf of a tree rooted in an + * xattr block. Some of the xattr calls, especially the value setting + * functions, want to treat each of these locations as equal. Let's wrap + * them in a structure that we can pass around instead of raw buffer_heads. + */ +struct ocfs2_xattr_value_buf { + struct buffer_head *vb_bh; + ocfs2_journal_access_func vb_access; + struct ocfs2_xattr_value_root *vb_xv; +}; + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security); +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode, + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl); +#endif /* OCFS2_XATTR_H */ diff --git a/kernel/fs/omfs/Kconfig b/kernel/fs/omfs/Kconfig new file mode 100644 index 000000000..b1b9a0aba --- /dev/null +++ b/kernel/fs/omfs/Kconfig @@ -0,0 +1,13 @@ +config OMFS_FS + tristate "SonicBlue Optimized MPEG File System support" + depends on BLOCK + select CRC_ITU_T + help + This is the proprietary file system used by the Rio Karma music + player and ReplayTV DVR. Despite the name, this filesystem is not + more efficient than a standard FS for MPEG files, in fact likely + the opposite is true. Say Y if you have either of these devices + and wish to mount its disk. + + To compile this file system support as a module, choose M here: the + module will be called omfs. If unsure, say N. diff --git a/kernel/fs/omfs/Makefile b/kernel/fs/omfs/Makefile new file mode 100644 index 000000000..8b82b63f1 --- /dev/null +++ b/kernel/fs/omfs/Makefile @@ -0,0 +1,4 @@ + +obj-$(CONFIG_OMFS_FS) += omfs.o + +omfs-y := bitmap.o dir.o file.o inode.o diff --git a/kernel/fs/omfs/bitmap.c b/kernel/fs/omfs/bitmap.c new file mode 100644 index 000000000..83f4e7651 --- /dev/null +++ b/kernel/fs/omfs/bitmap.c @@ -0,0 +1,193 @@ +#include +#include +#include +#include +#include "omfs.h" + +unsigned long omfs_count_free(struct super_block *sb) +{ + unsigned int i; + unsigned long sum = 0; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int nbits = sb->s_blocksize * 8; + + for (i = 0; i < sbi->s_imap_size; i++) + sum += nbits - bitmap_weight(sbi->s_imap[i], nbits); + + return sum; +} + +/* + * Counts the run of zero bits starting at bit up to max. + * It handles the case where a run might spill over a buffer. + * Called with bitmap lock. + */ +static int count_run(unsigned long **addr, int nbits, + int addrlen, int bit, int max) +{ + int count = 0; + int x; + + for (; addrlen > 0; addrlen--, addr++) { + x = find_next_bit(*addr, nbits, bit); + count += x - bit; + + if (x < nbits || count > max) + return min(count, max); + + bit = 0; + } + return min(count, max); +} + +/* + * Sets or clears the run of count bits starting with bit. + * Called with bitmap lock. + */ +static int set_run(struct super_block *sb, int map, + int nbits, int bit, int count, int set) +{ + int i; + int err; + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + + err = -ENOMEM; + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + for (i = 0; i < count; i++, bit++) { + if (bit >= nbits) { + bit = 0; + map++; + + mark_buffer_dirty(bh); + brelse(bh); + bh = sb_bread(sb, + clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + } + if (set) { + set_bit(bit, sbi->s_imap[map]); + set_bit(bit, (unsigned long *)bh->b_data); + } else { + clear_bit(bit, sbi->s_imap[map]); + clear_bit(bit, (unsigned long *)bh->b_data); + } + } + mark_buffer_dirty(bh); + brelse(bh); + err = 0; +out: + return err; +} + +/* + * Tries to allocate exactly one block. Returns true if successful. + */ +int omfs_allocate_block(struct super_block *sb, u64 block) +{ + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + unsigned int map, bit; + int ret = 0; + u64 tmp; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + mutex_lock(&sbi->s_bitmap_lock); + if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map])) + goto out; + + if (sbi->s_bitmap_ino > 0) { + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + set_bit(bit, (unsigned long *)bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + } + ret = 1; +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + + +/* + * Tries to allocate a set of blocks. The request size depends on the + * type: for inodes, we must allocate sbi->s_mirrors blocks, and for file + * blocks, we try to allocate sbi->s_clustersize, but can always get away + * with just one block. + */ +int omfs_allocate_range(struct super_block *sb, + int min_request, + int max_request, + u64 *return_block, + int *return_size) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + int ret = 0; + int i, run, bit; + + mutex_lock(&sbi->s_bitmap_lock); + for (i = 0; i < sbi->s_imap_size; i++) { + bit = 0; + while (bit < bits_per_entry) { + bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry, + bit); + + if (bit == bits_per_entry) + break; + + run = count_run(&sbi->s_imap[i], bits_per_entry, + sbi->s_imap_size-i, bit, max_request); + + if (run >= min_request) + goto found; + bit += run; + } + } + ret = -ENOSPC; + goto out; + +found: + *return_block = (u64) i * bits_per_entry + bit; + *return_size = run; + ret = set_run(sb, i, bits_per_entry, bit, run, 1); + +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + +/* + * Clears count bits starting at a given block. + */ +int omfs_clear_range(struct super_block *sb, u64 block, int count) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + u64 tmp; + unsigned int map, bit; + int ret; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + if (map >= sbi->s_imap_size) + return 0; + + mutex_lock(&sbi->s_bitmap_lock); + ret = set_run(sb, map, bits_per_entry, bit, count, 0); + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} diff --git a/kernel/fs/omfs/dir.c b/kernel/fs/omfs/dir.c new file mode 100644 index 000000000..f833bf8d5 --- /dev/null +++ b/kernel/fs/omfs/dir.c @@ -0,0 +1,457 @@ +/* + * OMFS (as used by RIO Karma) directory operations. + * Copyright (C) 2005 Bob Copeland + * Released under GPL v2. + */ + +#include +#include +#include +#include "omfs.h" + +static int omfs_hash(const char *name, int namelen, int mod) +{ + int i, hash = 0; + for (i = 0; i < namelen; i++) + hash ^= tolower(name[i]) << (i % 24); + return hash % mod; +} + +/* + * Finds the bucket for a given name and reads the containing block; + * *ofs is set to the offset of the first list entry. + */ +static struct buffer_head *omfs_get_bucket(struct inode *dir, + const char *name, int namelen, int *ofs) +{ + int nbuckets = (dir->i_size - OMFS_DIR_START)/8; + int bucket = omfs_hash(name, namelen, nbuckets); + + *ofs = OMFS_DIR_START + bucket * 8; + return omfs_bread(dir->i_sb, dir->i_ino); +} + +static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block, + const char *name, int namelen, + u64 *prev_block) +{ + struct buffer_head *bh; + struct omfs_inode *oi; + int err = -ENOENT; + *prev_block = ~0; + + while (block != ~0) { + bh = omfs_bread(dir->i_sb, block); + if (!bh) { + err = -EIO; + goto err; + } + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) { + brelse(bh); + goto err; + } + + if (strncmp(oi->i_name, name, namelen) == 0) + return bh; + + *prev_block = block; + block = be64_to_cpu(oi->i_sibling); + brelse(bh); + } +err: + return ERR_PTR(err); +} + +static struct buffer_head *omfs_find_entry(struct inode *dir, + const char *name, int namelen) +{ + struct buffer_head *bh; + int ofs; + u64 block, dummy; + + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + return ERR_PTR(-EIO); + + block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs])); + brelse(bh); + + return omfs_scan_list(dir, block, name, namelen, &dummy); +} + +int omfs_make_empty(struct inode *inode, struct super_block *sb) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct buffer_head *bh; + struct omfs_inode *oi; + + bh = omfs_bread(sb, inode->i_ino); + if (!bh) + return -ENOMEM; + + memset(bh->b_data, 0, sizeof(struct omfs_inode)); + + if (S_ISDIR(inode->i_mode)) { + memset(&bh->b_data[OMFS_DIR_START], 0xff, + sbi->s_sys_blocksize - OMFS_DIR_START); + } else + omfs_make_empty_table(bh, OMFS_EXTENT_START); + + oi = (struct omfs_inode *) bh->b_data; + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + oi->i_sibling = ~cpu_to_be64(0ULL); + + mark_buffer_dirty(bh); + brelse(bh); + return 0; +} + +static int omfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh; + u64 block; + __be64 *entry; + int ofs; + + /* just prepend to head of queue in proper bucket */ + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + *entry = cpu_to_be64(inode->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + /* now set the sibling and parent pointers on the new inode */ + bh = omfs_bread(dir->i_sb, inode->i_ino); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + memcpy(oi->i_name, name, namelen); + memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen); + oi->i_sibling = cpu_to_be64(block); + oi->i_parent = cpu_to_be64(dir->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + dir->i_ctime = CURRENT_TIME_SEC; + + /* mark affected inodes dirty to rebuild checksums */ + mark_inode_dirty(dir); + mark_inode_dirty(inode); + return 0; +out: + return -ENOMEM; +} + +static int omfs_delete_entry(struct dentry *dentry) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct inode *dirty; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh, *bh2; + __be64 *entry, next; + u64 block, prev; + int ofs; + int err = -ENOMEM; + + /* delete the proper node in the bucket's linked list */ + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + + bh2 = omfs_scan_list(dir, block, name, namelen, &prev); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); + goto out_free_bh; + } + + oi = (struct omfs_inode *) bh2->b_data; + next = oi->i_sibling; + brelse(bh2); + + if (prev != ~0) { + /* found in middle of list, get list ptr */ + brelse(bh); + bh = omfs_bread(dir->i_sb, prev); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + entry = &oi->i_sibling; + } + + *entry = next; + mark_buffer_dirty(bh); + + if (prev != ~0) { + dirty = omfs_iget(dir->i_sb, prev); + if (!IS_ERR(dirty)) { + mark_inode_dirty(dirty); + iput(dirty); + } + } + + err = 0; +out_free_bh: + brelse(bh); +out: + return err; +} + +static int omfs_dir_is_empty(struct inode *inode) +{ + int nbuckets = (inode->i_size - OMFS_DIR_START) / 8; + struct buffer_head *bh; + u64 *ptr; + int i; + + bh = omfs_bread(inode->i_sb, inode->i_ino); + + if (!bh) + return 0; + + ptr = (u64 *) &bh->b_data[OMFS_DIR_START]; + + for (i = 0; i < nbuckets; i++, ptr++) + if (*ptr != ~0) + break; + + brelse(bh); + return *ptr != ~0; +} + +static int omfs_remove(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int ret; + + + if (S_ISDIR(inode->i_mode) && + !omfs_dir_is_empty(inode)) + return -ENOTEMPTY; + + ret = omfs_delete_entry(dentry); + if (ret) + return ret; + + clear_nlink(inode); + mark_inode_dirty(inode); + mark_inode_dirty(dir); + return 0; +} + +static int omfs_add_node(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int err; + struct inode *inode = omfs_new_inode(dir, mode); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + err = omfs_make_empty(inode, dir->i_sb); + if (err) + goto out_free_inode; + + err = omfs_add_link(dentry, inode); + if (err) + goto out_free_inode; + + d_instantiate(dentry, inode); + return 0; + +out_free_inode: + iput(inode); + return err; +} + +static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return omfs_add_node(dir, dentry, mode | S_IFDIR); +} + +static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return omfs_add_node(dir, dentry, mode | S_IFREG); +} + +static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct buffer_head *bh; + struct inode *inode = NULL; + + if (dentry->d_name.len > OMFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); + if (!IS_ERR(bh)) { + struct omfs_inode *oi = (struct omfs_inode *)bh->b_data; + ino_t ino = be64_to_cpu(oi->i_head.h_self); + brelse(bh); + inode = omfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +/* sanity check block's self pointer */ +int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock) +{ + int is_bad; + u64 ino = be64_to_cpu(header->h_self); + is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) || + (ino > sbi->s_num_blocks)); + + if (is_bad) + printk(KERN_WARNING "omfs: bad hash chain detected\n"); + + return is_bad; +} + +static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, + u64 fsblock, int hindex) +{ + /* follow chain in this bucket */ + while (fsblock != ~0) { + struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock); + struct omfs_inode *oi; + u64 self; + unsigned char d_type; + + if (!bh) + return true; + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { + brelse(bh); + return true; + } + + self = fsblock; + fsblock = be64_to_cpu(oi->i_sibling); + + /* skip visited nodes */ + if (hindex) { + hindex--; + brelse(bh); + continue; + } + + d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; + + if (!dir_emit(ctx, oi->i_name, + strnlen(oi->i_name, OMFS_NAMELEN), + self, d_type)) { + brelse(bh); + return false; + } + brelse(bh); + ctx->pos++; + } + return true; +} + +static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); + int err; + + if (new_inode) { + /* overwriting existing file/dir */ + err = omfs_remove(new_dir, new_dentry); + if (err) + goto out; + } + + /* since omfs locates files by name, we need to unlink _before_ + * adding the new link or we won't find the old one */ + err = omfs_delete_entry(old_dentry); + if (err) + goto out; + + mark_inode_dirty(old_dir); + err = omfs_add_link(new_dentry, old_inode); + if (err) + goto out; + + old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); +out: + return err; +} + +static int omfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *dir = file_inode(file); + struct buffer_head *bh; + __be64 *p; + unsigned int hchain, hindex; + int nbuckets; + + if (ctx->pos >> 32) + return -EINVAL; + + if (ctx->pos < 1 << 20) { + if (!dir_emit_dots(file, ctx)) + return 0; + ctx->pos = 1 << 20; + } + + nbuckets = (dir->i_size - OMFS_DIR_START) / 8; + + /* high 12 bits store bucket + 1 and low 20 bits store hash index */ + hchain = (ctx->pos >> 20) - 1; + hindex = ctx->pos & 0xfffff; + + bh = omfs_bread(dir->i_sb, dir->i_ino); + if (!bh) + return -EINVAL; + + p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain; + + for (; hchain < nbuckets; hchain++) { + __u64 fsblock = be64_to_cpu(*p++); + if (!omfs_fill_chain(dir, ctx, fsblock, hindex)) + break; + hindex = 0; + ctx->pos = (hchain+2) << 20; + } + brelse(bh); + return 0; +} + +const struct inode_operations omfs_dir_inops = { + .lookup = omfs_lookup, + .mkdir = omfs_mkdir, + .rename = omfs_rename, + .create = omfs_create, + .unlink = omfs_remove, + .rmdir = omfs_remove, +}; + +const struct file_operations omfs_dir_operations = { + .read = generic_read_dir, + .iterate = omfs_readdir, + .llseek = generic_file_llseek, +}; diff --git a/kernel/fs/omfs/file.c b/kernel/fs/omfs/file.c new file mode 100644 index 000000000..d9e26cfbb --- /dev/null +++ b/kernel/fs/omfs/file.c @@ -0,0 +1,383 @@ +/* + * OMFS (as used by RIO Karma) file operations. + * Copyright (C) 2005 Bob Copeland + * Released under GPL v2. + */ + +#include +#include +#include +#include +#include "omfs.h" + +static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) +{ + return (sbi->s_sys_blocksize - offset - + sizeof(struct omfs_extent)) / + sizeof(struct omfs_extent_entry) + 1; +} + +void omfs_make_empty_table(struct buffer_head *bh, int offset) +{ + struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; + + oe->e_next = ~cpu_to_be64(0ULL); + oe->e_extent_count = cpu_to_be32(1), + oe->e_fill = cpu_to_be32(0x22), + oe->e_entry.e_cluster = ~cpu_to_be64(0ULL); + oe->e_entry.e_blocks = ~cpu_to_be64(0ULL); +} + +int omfs_shrink_inode(struct inode *inode) +{ + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct buffer_head *bh; + u64 next, last; + u32 extent_count; + u32 max_extents; + int ret; + + /* traverse extent table, freeing each entry that is greater + * than inode->i_size; + */ + next = inode->i_ino; + + /* only support truncate -> 0 for now */ + ret = -EIO; + if (inode->i_size != 0) + goto out; + + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + + if (extent_count > max_extents) + goto out_brelse; + + last = next; + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + /* ignore last entry as it is the terminator */ + for (; extent_count > 1; extent_count--) { + u64 start, count; + start = be64_to_cpu(entry->e_cluster); + count = be64_to_cpu(entry->e_blocks); + + omfs_clear_range(inode->i_sb, start, (int) count); + entry++; + } + omfs_make_empty_table(bh, (char *) oe - bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + + if (last != inode->i_ino) + omfs_clear_range(inode->i_sb, last, sbi->s_mirrors); + + if (next == ~0) + break; + + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; + oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); + } + ret = 0; +out: + return ret; +out_brelse: + brelse(bh); + return ret; +} + +static void omfs_truncate(struct inode *inode) +{ + omfs_shrink_inode(inode); + mark_inode_dirty(inode); +} + +/* + * Add new blocks to the current extent, or create new entries/continuations + * as necessary. + */ +static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, + u64 *ret_block) +{ + struct omfs_extent_entry *terminator; + struct omfs_extent_entry *entry = &oe->e_entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + u32 extent_count = be32_to_cpu(oe->e_extent_count); + u64 new_block = 0; + u32 max_count; + int new_count; + int ret = 0; + + /* reached the end of the extent table with no blocks mapped. + * there are three possibilities for adding: grow last extent, + * add a new extent to the current extent table, and add a + * continuation inode. in last two cases need an allocator for + * sbi->s_cluster_size + */ + + /* TODO: handle holes */ + + /* should always have a terminator */ + if (extent_count < 1) + return -EIO; + + /* trivially grow current extent, if next block is not taken */ + terminator = entry + extent_count - 1; + if (extent_count > 1) { + entry = terminator-1; + new_block = be64_to_cpu(entry->e_cluster) + + be64_to_cpu(entry->e_blocks); + + if (omfs_allocate_block(inode->i_sb, new_block)) { + be64_add_cpu(&entry->e_blocks, 1); + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + 1)); + goto out; + } + } + max_count = omfs_max_extents(sbi, OMFS_EXTENT_START); + + /* TODO: add a continuation block here */ + if (be32_to_cpu(oe->e_extent_count) > max_count-1) + return -EIO; + + /* try to allocate a new cluster */ + ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize, + &new_block, &new_count); + if (ret) + goto out_fail; + + /* copy terminator down an entry */ + entry = terminator; + terminator++; + memcpy(terminator, entry, sizeof(struct omfs_extent_entry)); + + entry->e_cluster = cpu_to_be64(new_block); + entry->e_blocks = cpu_to_be64((u64) new_count); + + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + (u64) new_count)); + + /* write in new entry */ + be32_add_cpu(&oe->e_extent_count, 1); + +out: + *ret_block = new_block; +out_fail: + return ret; +} + +/* + * Scans across the directory table for a given file block number. + * If block not found, return 0. + */ +static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent, + sector_t block, int count, int *left) +{ + /* count > 1 because of terminator */ + sector_t searched = 0; + for (; count > 1; count--) { + int numblocks = clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_blocks)); + + if (block >= searched && + block < searched + numblocks) { + /* + * found it at cluster + (block - searched) + * numblocks - (block - searched) is remainder + */ + *left = numblocks - (block - searched); + return clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_cluster)) + + block - searched; + } + searched += numblocks; + ent++; + } + return 0; +} + +static int omfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct buffer_head *bh; + sector_t next, offset; + int ret; + u64 uninitialized_var(new_block); + u32 max_extents; + int extent_count; + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + int max_blocks = bh_result->b_size >> inode->i_blkbits; + int remain; + + ret = -EIO; + bh = omfs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); + next = inode->i_ino; + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + if (extent_count > max_extents) + goto out_brelse; + + offset = find_block(inode, entry, block, extent_count, &remain); + if (offset > 0) { + ret = 0; + map_bh(bh_result, inode->i_sb, offset); + if (remain > max_blocks) + remain = max_blocks; + bh_result->b_size = (remain << inode->i_blkbits); + goto out_brelse; + } + if (next == ~0) + break; + + brelse(bh); + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; + oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); + } + if (create) { + ret = omfs_grow_extent(inode, oe, &new_block); + if (ret == 0) { + mark_buffer_dirty(bh); + mark_inode_dirty(inode); + map_bh(bh_result, inode->i_sb, + clus_to_blk(sbi, new_block)); + } + } +out_brelse: + brelse(bh); +out: + return ret; +} + +static int omfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, omfs_get_block); +} + +static int omfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); +} + +static int omfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, omfs_get_block, wbc); +} + +static int +omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, omfs_get_block); +} + +static void omfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + omfs_truncate(inode); + } +} + +static int omfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + omfs_get_block); + if (unlikely(ret)) + omfs_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t omfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, omfs_get_block); +} + +const struct file_operations omfs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int omfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + truncate_setsize(inode, attr->ia_size); + omfs_truncate(inode); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations omfs_file_inops = { + .setattr = omfs_setattr, +}; + +const struct address_space_operations omfs_aops = { + .readpage = omfs_readpage, + .readpages = omfs_readpages, + .writepage = omfs_writepage, + .writepages = omfs_writepages, + .write_begin = omfs_write_begin, + .write_end = generic_write_end, + .bmap = omfs_bmap, +}; + diff --git a/kernel/fs/omfs/inode.c b/kernel/fs/omfs/inode.c new file mode 100644 index 000000000..3d935c817 --- /dev/null +++ b/kernel/fs/omfs/inode.c @@ -0,0 +1,596 @@ +/* + * Optimized MPEG FS - inode and super operations. + * Copyright (C) 2006 Bob Copeland + * Released under GPL v2. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "omfs.h" + +MODULE_AUTHOR("Bob Copeland "); +MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); +MODULE_LICENSE("GPL"); + +struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + if (block >= sbi->s_num_blocks) + return NULL; + + return sb_bread(sb, clus_to_blk(sbi, block)); +} + +struct inode *omfs_new_inode(struct inode *dir, umode_t mode) +{ + struct inode *inode; + u64 new_block; + int err; + int len; + struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, + &new_block, &len); + if (err) + goto fail; + + inode->i_ino = new_block; + inode_init_owner(inode, NULL, mode); + inode->i_mapping->a_ops = &omfs_aops; + + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case S_IFREG: + inode->i_op = &omfs_file_inops; + inode->i_fop = &omfs_file_operations; + inode->i_size = 0; + break; + } + + insert_inode_hash(inode); + mark_inode_dirty(inode); + return inode; +fail: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +/* + * Update the header checksums for a dirty inode based on its contents. + * Caller is expected to hold the buffer head underlying oi and mark it + * dirty. + */ +static void omfs_update_checksums(struct omfs_inode *oi) +{ + int xor, i, ofs = 0, count; + u16 crc = 0; + unsigned char *ptr = (unsigned char *) oi; + + count = be32_to_cpu(oi->i_head.h_body_size); + ofs = sizeof(struct omfs_header); + + crc = crc_itu_t(crc, ptr + ofs, count); + oi->i_head.h_crc = cpu_to_be16(crc); + + xor = ptr[0]; + for (i = 1; i < OMFS_XOR_COUNT; i++) + xor ^= ptr[i]; + + oi->i_head.h_check_xor = xor; +} + +static int __omfs_write_inode(struct inode *inode, int wait) +{ + struct omfs_inode *oi; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct buffer_head *bh, *bh2; + u64 ctime; + int i; + int ret = -EIO; + int sync_failed = 0; + + /* get current inode since we may have written sibling ptrs etc. */ + bh = omfs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + if (S_ISDIR(inode->i_mode)) + oi->i_type = OMFS_DIR; + else if (S_ISREG(inode->i_mode)) + oi->i_type = OMFS_FILE; + else { + printk(KERN_WARNING "omfs: unknown file type: %d\n", + inode->i_mode); + goto out_brelse; + } + + oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - + sizeof(struct omfs_header)); + oi->i_head.h_version = 1; + oi->i_head.h_type = OMFS_INODE_NORMAL; + oi->i_head.h_magic = OMFS_IMAGIC; + oi->i_size = cpu_to_be64(inode->i_size); + + ctime = inode->i_ctime.tv_sec * 1000LL + + ((inode->i_ctime.tv_nsec + 999)/1000); + oi->i_ctime = cpu_to_be64(ctime); + + omfs_update_checksums(oi); + + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + sync_failed = 1; + } + + /* if mirroring writes, copy to next fsblock */ + for (i = 1; i < sbi->s_mirrors; i++) { + bh2 = omfs_bread(inode->i_sb, inode->i_ino + i); + if (!bh2) + goto out_brelse; + + memcpy(bh2->b_data, bh->b_data, bh->b_size); + mark_buffer_dirty(bh2); + if (wait) { + sync_dirty_buffer(bh2); + if (buffer_req(bh2) && !buffer_uptodate(bh2)) + sync_failed = 1; + } + brelse(bh2); + } + ret = (sync_failed) ? -EIO : 0; +out_brelse: + brelse(bh); +out: + return ret; +} + +static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int omfs_sync_inode(struct inode *inode) +{ + return __omfs_write_inode(inode, 1); +} + +/* + * called when an entry is deleted, need to clear the bits in the + * bitmaps. + */ +static void omfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + if (inode->i_nlink) + return; + + if (S_ISREG(inode->i_mode)) { + inode->i_size = 0; + omfs_shrink_inode(inode); + } + + omfs_clear_range(inode->i_sb, inode->i_ino, 2); +} + +struct inode *omfs_iget(struct super_block *sb, ino_t ino) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct omfs_inode *oi; + struct buffer_head *bh; + u64 ctime; + unsigned long nsecs; + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + bh = omfs_bread(inode->i_sb, ino); + if (!bh) + goto iget_failed; + + oi = (struct omfs_inode *)bh->b_data; + + /* check self */ + if (ino != be64_to_cpu(oi->i_head.h_self)) + goto fail_bh; + + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + + ctime = be64_to_cpu(oi->i_ctime); + nsecs = do_div(ctime, 1000) * 1000L; + + inode->i_atime.tv_sec = ctime; + inode->i_mtime.tv_sec = ctime; + inode->i_ctime.tv_sec = ctime; + inode->i_atime.tv_nsec = nsecs; + inode->i_mtime.tv_nsec = nsecs; + inode->i_ctime.tv_nsec = nsecs; + + inode->i_mapping->a_ops = &omfs_aops; + + switch (oi->i_type) { + case OMFS_DIR: + inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case OMFS_FILE: + inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); + inode->i_fop = &omfs_file_operations; + inode->i_size = be64_to_cpu(oi->i_size); + break; + } + brelse(bh); + unlock_new_inode(inode); + return inode; +fail_bh: + brelse(bh); +iget_failed: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static void omfs_put_super(struct super_block *sb) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + kfree(sbi->s_imap); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct omfs_sb_info *sbi = OMFS_SB(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + + buf->f_type = OMFS_MAGIC; + buf->f_bsize = sbi->s_blocksize; + buf->f_blocks = sbi->s_num_blocks; + buf->f_files = sbi->s_num_blocks; + buf->f_namelen = OMFS_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + buf->f_bfree = buf->f_bavail = buf->f_ffree = + omfs_count_free(s); + + return 0; +} + +static const struct super_operations omfs_sops = { + .write_inode = omfs_write_inode, + .evict_inode = omfs_evict_inode, + .put_super = omfs_put_super, + .statfs = omfs_statfs, + .show_options = generic_show_options, +}; + +/* + * For Rio Karma, there is an on-disk free bitmap whose location is + * stored in the root block. For ReplayTV, there is no such free bitmap + * so we have to walk the tree. Both inodes and file data are allocated + * from the same map. This array can be big (300k) so we allocate + * in units of the blocksize. + */ +static int omfs_get_imap(struct super_block *sb) +{ + unsigned int bitmap_size, array_size; + int count; + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct buffer_head *bh; + unsigned long **ptr; + sector_t block; + + bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); + array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); + + if (sbi->s_bitmap_ino == ~0ULL) + goto out; + + sbi->s_imap_size = array_size; + sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL); + if (!sbi->s_imap) + goto nomem; + + block = clus_to_blk(sbi, sbi->s_bitmap_ino); + if (block >= sbi->s_num_blocks) + goto nomem; + + ptr = sbi->s_imap; + for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { + bh = sb_bread(sb, block++); + if (!bh) + goto nomem_free; + *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL); + if (!*ptr) { + brelse(bh); + goto nomem_free; + } + memcpy(*ptr, bh->b_data, sb->s_blocksize); + if (count < sb->s_blocksize) + memset((void *)*ptr + count, 0xff, + sb->s_blocksize - count); + brelse(bh); + ptr++; + } +out: + return 0; + +nomem_free: + for (count = 0; count < array_size; count++) + kfree(sbi->s_imap[count]); + + kfree(sbi->s_imap); +nomem: + sbi->s_imap = NULL; + sbi->s_imap_size = 0; + return -ENOMEM; +} + +enum { + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_dmask, "dmask=%o"}, + {Opt_fmask, "fmask=%o"}, + {Opt_err, NULL}, +}; + +static int parse_options(char *options, struct omfs_sb_info *sbi) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_uid = make_kuid(current_user_ns(), option); + if (!uid_valid(sbi->s_uid)) + return 0; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_gid = make_kgid(current_user_ns(), option); + if (!gid_valid(sbi->s_gid)) + return 0; + break; + case Opt_umask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = sbi->s_dmask = option; + break; + case Opt_dmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_dmask = option; + break; + case Opt_fmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = option; + break; + default: + return 0; + } + } + return 1; +} + +static int omfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh, *bh2; + struct omfs_super_block *omfs_sb; + struct omfs_root_block *omfs_rb; + struct omfs_sb_info *sbi; + struct inode *root; + int ret = -EINVAL; + + save_mount_options(sb, (char *) data); + + sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + sbi->s_uid = current_uid(); + sbi->s_gid = current_gid(); + sbi->s_dmask = sbi->s_fmask = current_umask(); + + if (!parse_options((char *) data, sbi)) + goto end; + + sb->s_maxbytes = 0xffffffff; + + sb_set_blocksize(sb, 0x200); + + bh = sb_bread(sb, 0); + if (!bh) + goto end; + + omfs_sb = (struct omfs_super_block *)bh->b_data; + + if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { + if (!silent) + printk(KERN_ERR "omfs: Invalid superblock (%x)\n", + omfs_sb->s_magic); + goto out_brelse_bh; + } + sb->s_magic = OMFS_MAGIC; + + sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); + sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); + sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); + sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); + sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); + mutex_init(&sbi->s_bitmap_lock); + + if (sbi->s_num_blocks > OMFS_MAX_BLOCKS) { + printk(KERN_ERR "omfs: sysblock number (%llx) is out of range\n", + (unsigned long long)sbi->s_num_blocks); + goto out_brelse_bh; + } + + if (sbi->s_sys_blocksize > PAGE_SIZE) { + printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", + sbi->s_sys_blocksize); + goto out_brelse_bh; + } + + if (sbi->s_blocksize < sbi->s_sys_blocksize || + sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "omfs: block size (%d) is out of range\n", + sbi->s_blocksize); + goto out_brelse_bh; + } + + /* + * Use sys_blocksize as the fs block since it is smaller than a + * page while the fs blocksize can be larger. + */ + sb_set_blocksize(sb, sbi->s_sys_blocksize); + + /* + * ...and the difference goes into a shift. sys_blocksize is always + * a power of two factor of blocksize. + */ + sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - + get_bitmask_order(sbi->s_sys_blocksize); + + bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block)); + if (!bh2) + goto out_brelse_bh; + + omfs_rb = (struct omfs_root_block *)bh2->b_data; + + sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); + sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); + + if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { + printk(KERN_ERR "omfs: block count discrepancy between " + "super and root blocks (%llx, %llx)\n", + (unsigned long long)sbi->s_num_blocks, + (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks)); + goto out_brelse_bh2; + } + + if (sbi->s_bitmap_ino != ~0ULL && + sbi->s_bitmap_ino > sbi->s_num_blocks) { + printk(KERN_ERR "omfs: free space bitmap location is corrupt " + "(%llx, total blocks %llx)\n", + (unsigned long long) sbi->s_bitmap_ino, + (unsigned long long) sbi->s_num_blocks); + goto out_brelse_bh2; + } + if (sbi->s_clustersize < 1 || + sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) { + printk(KERN_ERR "omfs: cluster size out of range (%d)", + sbi->s_clustersize); + goto out_brelse_bh2; + } + + ret = omfs_get_imap(sb); + if (ret) + goto out_brelse_bh2; + + sb->s_op = &omfs_sops; + + root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_brelse_bh2; + } + + sb->s_root = d_make_root(root); + if (!sb->s_root) { + ret = -ENOMEM; + goto out_brelse_bh2; + } + printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); + + ret = 0; +out_brelse_bh2: + brelse(bh2); +out_brelse_bh: + brelse(bh); +end: + if (ret) + kfree(sbi); + return ret; +} + +static struct dentry *omfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super); +} + +static struct file_system_type omfs_fs_type = { + .owner = THIS_MODULE, + .name = "omfs", + .mount = omfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("omfs"); + +static int __init init_omfs_fs(void) +{ + return register_filesystem(&omfs_fs_type); +} + +static void __exit exit_omfs_fs(void) +{ + unregister_filesystem(&omfs_fs_type); +} + +module_init(init_omfs_fs); +module_exit(exit_omfs_fs); diff --git a/kernel/fs/omfs/omfs.h b/kernel/fs/omfs/omfs.h new file mode 100644 index 000000000..f0f8bc75e --- /dev/null +++ b/kernel/fs/omfs/omfs.h @@ -0,0 +1,68 @@ +#ifndef _OMFS_H +#define _OMFS_H + +#include +#include + +#include "omfs_fs.h" + +/* In-memory structures */ +struct omfs_sb_info { + u64 s_num_blocks; + u64 s_bitmap_ino; + u64 s_root_ino; + u32 s_blocksize; + u32 s_mirrors; + u32 s_sys_blocksize; + u32 s_clustersize; + int s_block_shift; + unsigned long **s_imap; + int s_imap_size; + struct mutex s_bitmap_lock; + kuid_t s_uid; + kgid_t s_gid; + int s_dmask; + int s_fmask; +}; + +/* convert a cluster number to a scaled block number */ +static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block) +{ + return block << sbi->s_block_shift; +} + +static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* bitmap.c */ +extern unsigned long omfs_count_free(struct super_block *sb); +extern int omfs_allocate_block(struct super_block *sb, u64 block); +extern int omfs_allocate_range(struct super_block *sb, int min_request, + int max_request, u64 *return_block, int *return_size); +extern int omfs_clear_range(struct super_block *sb, u64 block, int count); + +/* dir.c */ +extern const struct file_operations omfs_dir_operations; +extern const struct inode_operations omfs_dir_inops; +extern int omfs_make_empty(struct inode *inode, struct super_block *sb); +extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock); + +/* file.c */ +extern const struct file_operations omfs_file_operations; +extern const struct inode_operations omfs_file_inops; +extern const struct address_space_operations omfs_aops; +extern void omfs_make_empty_table(struct buffer_head *bh, int offset); +extern int omfs_shrink_inode(struct inode *inode); + +/* inode.c */ +extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block); +extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); +extern struct inode *omfs_new_inode(struct inode *dir, umode_t mode); +extern int omfs_reserve_block(struct super_block *sb, sector_t block); +extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino); +extern int omfs_sync_inode(struct inode *inode); + +#endif diff --git a/kernel/fs/omfs/omfs_fs.h b/kernel/fs/omfs/omfs_fs.h new file mode 100644 index 000000000..83a98330e --- /dev/null +++ b/kernel/fs/omfs/omfs_fs.h @@ -0,0 +1,82 @@ +#ifndef _OMFS_FS_H +#define _OMFS_FS_H + +/* OMFS On-disk structures */ + +#define OMFS_MAGIC 0xC2993D87 +#define OMFS_IMAGIC 0xD2 + +#define OMFS_DIR 'D' +#define OMFS_FILE 'F' +#define OMFS_INODE_NORMAL 'e' +#define OMFS_INODE_CONTINUATION 'c' +#define OMFS_INODE_SYSTEM 's' +#define OMFS_NAMELEN 256 +#define OMFS_DIR_START 0x1b8 +#define OMFS_EXTENT_START 0x1d0 +#define OMFS_EXTENT_CONT 0x40 +#define OMFS_XOR_COUNT 19 +#define OMFS_MAX_BLOCK_SIZE 8192 +#define OMFS_MAX_CLUSTER_SIZE 8 +#define OMFS_MAX_BLOCKS (1ul << 31) + +struct omfs_super_block { + char s_fill1[256]; + __be64 s_root_block; /* block number of omfs_root_block */ + __be64 s_num_blocks; /* total number of FS blocks */ + __be32 s_magic; /* OMFS_MAGIC */ + __be32 s_blocksize; /* size of a block */ + __be32 s_mirrors; /* # of mirrors of system blocks */ + __be32 s_sys_blocksize; /* size of non-data blocks */ +}; + +struct omfs_header { + __be64 h_self; /* FS block where this is located */ + __be32 h_body_size; /* size of useful data after header */ + __be16 h_crc; /* crc-ccitt of body_size bytes */ + char h_fill1[2]; + u8 h_version; /* version, always 1 */ + char h_type; /* OMFS_INODE_X */ + u8 h_magic; /* OMFS_IMAGIC */ + u8 h_check_xor; /* XOR of header bytes before this */ + __be32 h_fill2; +}; + +struct omfs_root_block { + struct omfs_header r_head; /* header */ + __be64 r_fill1; + __be64 r_num_blocks; /* total number of FS blocks */ + __be64 r_root_dir; /* block # of root directory */ + __be64 r_bitmap; /* block # of free space bitmap */ + __be32 r_blocksize; /* size of a block */ + __be32 r_clustersize; /* size allocated for data blocks */ + __be64 r_mirrors; /* # of mirrors of system blocks */ + char r_name[OMFS_NAMELEN]; /* partition label */ +}; + +struct omfs_inode { + struct omfs_header i_head; /* header */ + __be64 i_parent; /* parent containing this inode */ + __be64 i_sibling; /* next inode in hash bucket */ + __be64 i_ctime; /* ctime, in milliseconds */ + char i_fill1[35]; + char i_type; /* OMFS_[DIR,FILE] */ + __be32 i_fill2; + char i_fill3[64]; + char i_name[OMFS_NAMELEN]; /* filename */ + __be64 i_size; /* size of file, in bytes */ +}; + +struct omfs_extent_entry { + __be64 e_cluster; /* start location of a set of blocks */ + __be64 e_blocks; /* number of blocks after e_cluster */ +}; + +struct omfs_extent { + __be64 e_next; /* next extent table location */ + __be32 e_extent_count; /* total # extents in this table */ + __be32 e_fill; + struct omfs_extent_entry e_entry; /* start of extent entries */ +}; + +#endif diff --git a/kernel/fs/open.c b/kernel/fs/open.c new file mode 100644 index 000000000..98e5a52dc --- /dev/null +++ b/kernel/fs/open.c @@ -0,0 +1,1143 @@ +/* + * linux/fs/open.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) +{ + int ret; + struct iattr newattrs; + + /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ + if (length < 0) + return -EINVAL; + + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | time_attrs; + if (filp) { + newattrs.ia_file = filp; + newattrs.ia_valid |= ATTR_FILE; + } + + /* Remove suid/sgid on truncate too */ + ret = should_remove_suid(dentry); + if (ret) + newattrs.ia_valid |= ret | ATTR_FORCE; + + mutex_lock(&dentry->d_inode->i_mutex); + /* Note any delegations or leases have already been broken: */ + ret = notify_change(dentry, &newattrs, NULL); + mutex_unlock(&dentry->d_inode->i_mutex); + return ret; +} + +long vfs_truncate(struct path *path, loff_t length) +{ + struct inode *inode; + long error; + + inode = path->dentry->d_inode; + + /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ + if (S_ISDIR(inode->i_mode)) + return -EISDIR; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + error = mnt_want_write(path->mnt); + if (error) + goto out; + + error = inode_permission(inode, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; + + error = -EPERM; + if (IS_APPEND(inode)) + goto mnt_drop_write_and_out; + + error = get_write_access(inode); + if (error) + goto mnt_drop_write_and_out; + + /* + * Make sure that there are no leases. get_write_access() protects + * against the truncate racing with a lease-granting setlease(). + */ + error = break_lease(inode, O_WRONLY); + if (error) + goto put_write_and_out; + + error = locks_verify_truncate(inode, NULL, length); + if (!error) + error = security_path_truncate(path); + if (!error) + error = do_truncate(path->dentry, length, 0, NULL); + +put_write_and_out: + put_write_access(inode); +mnt_drop_write_and_out: + mnt_drop_write(path->mnt); +out: + return error; +} +EXPORT_SYMBOL_GPL(vfs_truncate); + +static long do_sys_truncate(const char __user *pathname, loff_t length) +{ + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int error; + + if (length < 0) /* sorry, but loff_t says... */ + return -EINVAL; + +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + if (!error) { + error = vfs_truncate(&path, length); + path_put(&path); + } + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} + +SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) +{ + return do_sys_truncate(path, length); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) +{ + return do_sys_truncate(path, length); +} +#endif + +static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + struct inode *inode; + struct dentry *dentry; + struct fd f; + int error; + + error = -EINVAL; + if (length < 0) + goto out; + error = -EBADF; + f = fdget(fd); + if (!f.file) + goto out; + + /* explicitly opened as large or we are on 64-bit box */ + if (f.file->f_flags & O_LARGEFILE) + small = 0; + + dentry = f.file->f_path.dentry; + inode = dentry->d_inode; + error = -EINVAL; + if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) + goto out_putf; + + error = -EINVAL; + /* Cannot ftruncate over 2^31 bytes without large file support */ + if (small && length > MAX_NON_LFS) + goto out_putf; + + error = -EPERM; + if (IS_APPEND(inode)) + goto out_putf; + + sb_start_write(inode->i_sb); + error = locks_verify_truncate(inode, f.file, length); + if (!error) + error = security_path_truncate(&f.file->f_path); + if (!error) + error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); + sb_end_write(inode->i_sb); +out_putf: + fdput(f); +out: + return error; +} + +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +{ + return do_sys_ftruncate(fd, length, 1); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +{ + return do_sys_ftruncate(fd, length, 1); +} +#endif + +/* LFS versions of truncate are only needed on 32 bit machines */ +#if BITS_PER_LONG == 32 +SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) +{ + return do_sys_truncate(path, length); +} + +SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) +{ + return do_sys_ftruncate(fd, length, 0); +} +#endif /* BITS_PER_LONG == 32 */ + + +int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + long ret; + + if (offset < 0 || len <= 0) + return -EINVAL; + + /* Return error if mode is not supported */ + if (mode & ~FALLOC_FL_SUPPORTED_MASK) + return -EOPNOTSUPP; + + /* Punch hole and zero range are mutually exclusive */ + if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == + (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + /* Punch hole must have keep size set */ + if ((mode & FALLOC_FL_PUNCH_HOLE) && + !(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + + /* Collapse range should only be used exclusively. */ + if ((mode & FALLOC_FL_COLLAPSE_RANGE) && + (mode & ~FALLOC_FL_COLLAPSE_RANGE)) + return -EINVAL; + + /* Insert range should only be used exclusively. */ + if ((mode & FALLOC_FL_INSERT_RANGE) && + (mode & ~FALLOC_FL_INSERT_RANGE)) + return -EINVAL; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + /* + * We can only allow pure fallocate on append only files + */ + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) + return -EPERM; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + /* + * We cannot allow any fallocate operation on an active swapfile + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + /* + * Revalidate the write permissions, in case security policy has + * changed since the files were opened. + */ + ret = security_file_permission(file, MAY_WRITE); + if (ret) + return ret; + + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; + + /* + * Let individual file system decide if it supports preallocation + * for directories or not. + */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -ENODEV; + + /* Check for wrap through zero too */ + if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + return -EFBIG; + + if (!file->f_op->fallocate) + return -EOPNOTSUPP; + + sb_start_write(inode->i_sb); + ret = file->f_op->fallocate(file, mode, offset, len); + + /* + * Create inotify and fanotify events. + * + * To keep the logic simple always create events if fallocate succeeds. + * This implies that events are even created if the file size remains + * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. + */ + if (ret == 0) + fsnotify_modify(file); + + sb_end_write(inode->i_sb); + return ret; +} +EXPORT_SYMBOL_GPL(vfs_fallocate); + +SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) +{ + struct fd f = fdget(fd); + int error = -EBADF; + + if (f.file) { + error = vfs_fallocate(f.file, mode, offset, len); + fdput(f); + } + return error; +} + +/* + * access() needs to use the real uid/gid, not the effective uid/gid. + * We do this by temporarily clearing all FS-related capabilities and + * switching the fsuid/fsgid around to the real ones. + */ +SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +{ + const struct cred *old_cred; + struct cred *override_cred; + struct path path; + struct inode *inode; + int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + override_cred->fsuid = override_cred->uid; + override_cred->fsgid = override_cred->gid; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + /* Clear the capabilities if we switch to a non-root user */ + kuid_t root_uid = make_kuid(override_cred->user_ns, 0); + if (!uid_eq(override_cred->uid, root_uid)) + cap_clear(override_cred->cap_effective); + else + override_cred->cap_effective = + override_cred->cap_permitted; + } + + old_cred = override_creds(override_cred); +retry: + res = user_path_at(dfd, filename, lookup_flags, &path); + if (res) + goto out; + + inode = path.dentry->d_inode; + + if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { + /* + * MAY_EXEC on regular files is denied if the fs is mounted + * with the "noexec" flag. + */ + res = -EACCES; + if (path.mnt->mnt_flags & MNT_NOEXEC) + goto out_path_release; + } + + res = inode_permission(inode, mode | MAY_ACCESS); + /* SuS v2 requires we report a read only fs too */ + if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) + goto out_path_release; + /* + * This is a rare case where using __mnt_is_readonly() + * is OK without a mnt_want/drop_write() pair. Since + * no actual write to the fs is performed here, we do + * not need to telegraph to that to anyone. + * + * By doing this, we accept that this access is + * inherently racy and know that the fs may change + * state before we even see this result. + */ + if (__mnt_is_readonly(path.mnt)) + res = -EROFS; + +out_path_release: + path_put(&path); + if (retry_estale(res, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +out: + revert_creds(old_cred); + put_cred(override_cred); + return res; +} + +SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) +{ + return sys_faccessat(AT_FDCWD, filename, mode); +} + +SYSCALL_DEFINE1(chdir, const char __user *, filename) +{ + struct path path; + int error; + unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +retry: + error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); + if (error) + goto out; + + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + if (error) + goto dput_and_out; + + set_fs_pwd(current->fs, &path); + +dput_and_out: + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +out: + return error; +} + +SYSCALL_DEFINE1(fchdir, unsigned int, fd) +{ + struct fd f = fdget_raw(fd); + struct inode *inode; + int error = -EBADF; + + error = -EBADF; + if (!f.file) + goto out; + + inode = file_inode(f.file); + + error = -ENOTDIR; + if (!S_ISDIR(inode->i_mode)) + goto out_putf; + + error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); + if (!error) + set_fs_pwd(current->fs, &f.file->f_path); +out_putf: + fdput(f); +out: + return error; +} + +SYSCALL_DEFINE1(chroot, const char __user *, filename) +{ + struct path path; + int error; + unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +retry: + error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); + if (error) + goto out; + + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + if (error) + goto dput_and_out; + + error = -EPERM; + if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) + goto dput_and_out; + error = security_path_chroot(&path); + if (error) + goto dput_and_out; + + set_fs_root(current->fs, &path); + error = 0; +dput_and_out: + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +out: + return error; +} + +static int chmod_common(struct path *path, umode_t mode) +{ + struct inode *inode = path->dentry->d_inode; + struct inode *delegated_inode = NULL; + struct iattr newattrs; + int error; + + error = mnt_want_write(path->mnt); + if (error) + return error; +retry_deleg: + mutex_lock(&inode->i_mutex); + error = security_path_chmod(path, mode); + if (error) + goto out_unlock; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + error = notify_change(path->dentry, &newattrs, &delegated_inode); +out_unlock: + mutex_unlock(&inode->i_mutex); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + mnt_drop_write(path->mnt); + return error; +} + +SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) +{ + struct fd f = fdget(fd); + int err = -EBADF; + + if (f.file) { + audit_file(f.file); + err = chmod_common(&f.file->f_path, mode); + fdput(f); + } + return err; +} + +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) +{ + struct path path; + int error; + unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: + error = user_path_at(dfd, filename, lookup_flags, &path); + if (!error) { + error = chmod_common(&path, mode); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + return error; +} + +SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) +{ + return sys_fchmodat(AT_FDCWD, filename, mode); +} + +static int chown_common(struct path *path, uid_t user, gid_t group) +{ + struct inode *inode = path->dentry->d_inode; + struct inode *delegated_inode = NULL; + int error; + struct iattr newattrs; + kuid_t uid; + kgid_t gid; + + uid = make_kuid(current_user_ns(), user); + gid = make_kgid(current_user_ns(), group); + +retry_deleg: + newattrs.ia_valid = ATTR_CTIME; + if (user != (uid_t) -1) { + if (!uid_valid(uid)) + return -EINVAL; + newattrs.ia_valid |= ATTR_UID; + newattrs.ia_uid = uid; + } + if (group != (gid_t) -1) { + if (!gid_valid(gid)) + return -EINVAL; + newattrs.ia_valid |= ATTR_GID; + newattrs.ia_gid = gid; + } + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= + ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + mutex_lock(&inode->i_mutex); + error = security_path_chown(path, uid, gid); + if (!error) + error = notify_change(path->dentry, &newattrs, &delegated_inode); + mutex_unlock(&inode->i_mutex); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + return error; +} + +SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, + gid_t, group, int, flag) +{ + struct path path; + int error = -EINVAL; + int lookup_flags; + + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + goto out; + + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; +retry: + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + error = mnt_want_write(path.mnt); + if (error) + goto out_release; + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); +out_release: + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +out: + return error; +} + +SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) +{ + return sys_fchownat(AT_FDCWD, filename, user, group, 0); +} + +SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) +{ + return sys_fchownat(AT_FDCWD, filename, user, group, + AT_SYMLINK_NOFOLLOW); +} + +SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +{ + struct fd f = fdget(fd); + int error = -EBADF; + + if (!f.file) + goto out; + + error = mnt_want_write_file(f.file); + if (error) + goto out_fput; + audit_file(f.file); + error = chown_common(&f.file->f_path, user, group); + mnt_drop_write_file(f.file); +out_fput: + fdput(f); +out: + return error; +} + +int open_check_o_direct(struct file *f) +{ + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + return 0; +} + +static int do_dentry_open(struct file *f, + int (*open)(struct inode *, struct file *), + const struct cred *cred) +{ + static const struct file_operations empty_fops = {}; + struct inode *inode; + int error; + + f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | + FMODE_PREAD | FMODE_PWRITE; + + path_get(&f->f_path); + inode = f->f_inode = f->f_path.dentry->d_inode; + f->f_mapping = inode->i_mapping; + + if (unlikely(f->f_flags & O_PATH)) { + f->f_mode = FMODE_PATH; + f->f_op = &empty_fops; + return 0; + } + + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = get_write_access(inode); + if (unlikely(error)) + goto cleanup_file; + error = __mnt_want_write(f->f_path.mnt); + if (unlikely(error)) { + put_write_access(inode); + goto cleanup_file; + } + f->f_mode |= FMODE_WRITER; + } + + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + + f->f_op = fops_get(inode->i_fop); + if (unlikely(WARN_ON(!f->f_op))) { + error = -ENODEV; + goto cleanup_all; + } + + error = security_file_open(f, cred); + if (error) + goto cleanup_all; + + error = break_lease(inode, f->f_flags); + if (error) + goto cleanup_all; + + if (!open) + open = f->f_op->open; + if (open) { + error = open(inode, f); + if (error) + goto cleanup_all; + } + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(inode); + if ((f->f_mode & FMODE_READ) && + likely(f->f_op->read || f->f_op->read_iter)) + f->f_mode |= FMODE_CAN_READ; + if ((f->f_mode & FMODE_WRITE) && + likely(f->f_op->write || f->f_op->write_iter)) + f->f_mode |= FMODE_CAN_WRITE; + + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + + file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); + + return 0; + +cleanup_all: + fops_put(f->f_op); + if (f->f_mode & FMODE_WRITER) { + put_write_access(inode); + __mnt_drop_write(f->f_path.mnt); + } +cleanup_file: + path_put(&f->f_path); + f->f_path.mnt = NULL; + f->f_path.dentry = NULL; + f->f_inode = NULL; + return error; +} + +/** + * finish_open - finish opening a file + * @file: file pointer + * @dentry: pointer to dentry + * @open: open callback + * @opened: state of open + * + * This can be used to finish opening a file passed to i_op->atomic_open(). + * + * If the open callback is set to NULL, then the standard f_op->open() + * filesystem callback is substituted. + * + * NB: the dentry reference is _not_ consumed. If, for example, the dentry is + * the return value of d_splice_alias(), then the caller needs to perform dput() + * on it after finish_open(). + * + * On successful return @file is a fully instantiated open file. After this, if + * an error occurs in ->atomic_open(), it needs to clean up with fput(). + * + * Returns zero on success or -errno if the open failed. + */ +int finish_open(struct file *file, struct dentry *dentry, + int (*open)(struct inode *, struct file *), + int *opened) +{ + int error; + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + + file->f_path.dentry = dentry; + error = do_dentry_open(file, open, current_cred()); + if (!error) + *opened |= FILE_OPENED; + + return error; +} +EXPORT_SYMBOL(finish_open); + +/** + * finish_no_open - finish ->atomic_open() without opening the file + * + * @file: file pointer + * @dentry: dentry or NULL (as returned from ->lookup()) + * + * This can be used to set the result of a successful lookup in ->atomic_open(). + * + * NB: unlike finish_open() this function does consume the dentry reference and + * the caller need not dput() it. + * + * Returns "1" which must be the return value of ->atomic_open() after having + * called this function. + */ +int finish_no_open(struct file *file, struct dentry *dentry) +{ + file->f_path.dentry = dentry; + return 1; +} +EXPORT_SYMBOL(finish_no_open); + +struct file *dentry_open(const struct path *path, int flags, + const struct cred *cred) +{ + int error; + struct file *f; + + validate_creds(cred); + + /* We must always pass in a valid mount pointer. */ + BUG_ON(!path->mnt); + + f = get_empty_filp(); + if (!IS_ERR(f)) { + f->f_flags = flags; + error = vfs_open(path, f, cred); + if (!error) { + /* from now on we need fput() to dispose of f */ + error = open_check_o_direct(f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } else { + put_filp(f); + f = ERR_PTR(error); + } + } + return f; +} +EXPORT_SYMBOL(dentry_open); + +/** + * vfs_open - open the file at the given path + * @path: path to open + * @filp: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *filp, + const struct cred *cred) +{ + struct inode *inode = path->dentry->d_inode; + + if (inode->i_op->dentry_open) + return inode->i_op->dentry_open(path->dentry, filp, cred); + else { + filp->f_path = *path; + return do_dentry_open(filp, NULL, cred); + } +} +EXPORT_SYMBOL(vfs_open); + +static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) +{ + int lookup_flags = 0; + int acc_mode; + + if (flags & (O_CREAT | __O_TMPFILE)) + op->mode = (mode & S_IALLUGO) | S_IFREG; + else + op->mode = 0; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + if (flags & __O_TMPFILE) { + if ((flags & O_TMPFILE_MASK) != O_TMPFILE) + return -EINVAL; + acc_mode = MAY_OPEN | ACC_MODE(flags); + if (!(acc_mode & MAY_WRITE)) + return -EINVAL; + } else if (flags & O_PATH) { + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } + + op->open_flag = flags; + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; + + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + op->lookup_flags = lookup_flags; + return 0; +} + +/** + * file_open_name - open file and return file pointer + * + * @name: struct filename containing path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *file_open_name(struct filename *name, int flags, umode_t mode) +{ + struct open_flags op; + int err = build_open_flags(flags, mode, &op); + return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); +} + +/** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, umode_t mode) +{ + struct filename *name = getname_kernel(filename); + struct file *file = ERR_CAST(name); + + if (!IS_ERR(name)) { + file = file_open_name(name, flags, mode); + putname(name); + } + return file; +} +EXPORT_SYMBOL(filp_open); + +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int err = build_open_flags(flags, 0, &op); + if (err) + return ERR_PTR(err); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + return do_file_open_root(dentry, mnt, filename, &op); +} +EXPORT_SYMBOL(file_open_root); + +long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +{ + struct open_flags op; + int fd = build_open_flags(flags, mode, &op); + struct filename *tmp; + + if (fd) + return fd; + + tmp = getname(filename); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + fd = get_unused_fd_flags(flags); + if (fd >= 0) { + struct file *f = do_filp_open(dfd, tmp, &op); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fsnotify_open(f); + fd_install(fd, f); + } + } + putname(tmp); + return fd; +} + +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + if (force_o_largefile()) + flags |= O_LARGEFILE; + + return do_sys_open(AT_FDCWD, filename, flags, mode); +} + +SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, + umode_t, mode) +{ + if (force_o_largefile()) + flags |= O_LARGEFILE; + + return do_sys_open(dfd, filename, flags, mode); +} + +#ifndef __alpha__ + +/* + * For backward compatibility? Maybe this should be moved + * into arch/i386 instead? + */ +SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) +{ + return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); +} + +#endif + +/* + * "id" is the POSIX thread ID. We use the + * files pointer for this.. + */ +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval = 0; + + if (!file_count(filp)) { + printk(KERN_ERR "VFS: Close: file count is 0\n"); + return 0; + } + + if (filp->f_op->flush) + retval = filp->f_op->flush(filp, id); + + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } + fput(filp); + return retval; +} + +EXPORT_SYMBOL(filp_close); + +/* + * Careful here! We test whether the file pointer is NULL before + * releasing the fd. This ensures that one clone task can't release + * an fd while another clone is opening it. + */ +SYSCALL_DEFINE1(close, unsigned int, fd) +{ + int retval = __close_fd(current->files, fd); + + /* can't restart close syscall because file table entry was cleared */ + if (unlikely(retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK)) + retval = -EINTR; + + return retval; +} +EXPORT_SYMBOL(sys_close); + +/* + * This routine simulates a hangup on the tty, to arrange that users + * are given clean terminals at login time. + */ +SYSCALL_DEFINE0(vhangup) +{ + if (capable(CAP_SYS_TTY_CONFIG)) { + tty_vhangup_self(); + return 0; + } + return -EPERM; +} + +/* + * Called when an inode is about to be open. + * We use this to disallow opening large files on 32bit systems if + * the caller didn't specify O_LARGEFILE. On 64bit systems we force + * on this flag in sys_open. + */ +int generic_file_open(struct inode * inode, struct file * filp) +{ + if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; + return 0; +} + +EXPORT_SYMBOL(generic_file_open); + +/* + * This is used by subsystems that don't want seekable + * file descriptors. The function is not supposed to ever fail, the only + * reason it returns an 'int' and not 'void' is so that it can be plugged + * directly into file_operations structure. + */ +int nonseekable_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + return 0; +} + +EXPORT_SYMBOL(nonseekable_open); diff --git a/kernel/fs/openpromfs/Makefile b/kernel/fs/openpromfs/Makefile new file mode 100644 index 000000000..4563199b8 --- /dev/null +++ b/kernel/fs/openpromfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Sun Openprom filesystem routines. +# + +obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs.o + +openpromfs-objs := inode.o diff --git a/kernel/fs/openpromfs/inode.c b/kernel/fs/openpromfs/inode.c new file mode 100644 index 000000000..15e4500cd --- /dev/null +++ b/kernel/fs/openpromfs/inode.c @@ -0,0 +1,471 @@ +/* inode.c: /proc/openprom handling routines + * + * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static DEFINE_MUTEX(op_mutex); + +#define OPENPROM_ROOT_INO 0 + +enum op_inode_type { + op_inode_node, + op_inode_prop, +}; + +union op_inode_data { + struct device_node *node; + struct property *prop; +}; + +struct op_inode_info { + struct inode vfs_inode; + enum op_inode_type type; + union op_inode_data u; +}; + +static struct inode *openprom_iget(struct super_block *sb, ino_t ino); + +static inline struct op_inode_info *OP_I(struct inode *inode) +{ + return container_of(inode, struct op_inode_info, vfs_inode); +} + +static int is_string(unsigned char *p, int len) +{ + int i; + + for (i = 0; i < len; i++) { + unsigned char val = p[i]; + + if ((i && !val) || + (val >= ' ' && val <= '~')) + continue; + + return 0; + } + + return 1; +} + +static int property_show(struct seq_file *f, void *v) +{ + struct property *prop = f->private; + void *pval; + int len; + + len = prop->length; + pval = prop->value; + + if (is_string(pval, len)) { + while (len > 0) { + int n = strlen(pval); + + seq_printf(f, "%s", (char *) pval); + + /* Skip over the NULL byte too. */ + pval += n + 1; + len -= n + 1; + + if (len > 0) + seq_printf(f, " + "); + } + } else { + if (len & 3) { + while (len) { + len--; + if (len) + seq_printf(f, "%02x.", + *(unsigned char *) pval); + else + seq_printf(f, "%02x", + *(unsigned char *) pval); + pval++; + } + } else { + while (len >= 4) { + len -= 4; + + if (len) + seq_printf(f, "%08x.", + *(unsigned int *) pval); + else + seq_printf(f, "%08x", + *(unsigned int *) pval); + pval += 4; + } + } + } + seq_printf(f, "\n"); + + return 0; +} + +static void *property_start(struct seq_file *f, loff_t *pos) +{ + if (*pos == 0) + return pos; + return NULL; +} + +static void *property_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void property_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations property_op = { + .start = property_start, + .next = property_next, + .stop = property_stop, + .show = property_show +}; + +static int property_open(struct inode *inode, struct file *file) +{ + struct op_inode_info *oi = OP_I(inode); + int ret; + + BUG_ON(oi->type != op_inode_prop); + + ret = seq_open(file, &property_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = oi->u.prop; + } + return ret; +} + +static const struct file_operations openpromfs_prop_ops = { + .open = property_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int openpromfs_readdir(struct file *, struct dir_context *); + +static const struct file_operations openprom_operations = { + .read = generic_read_dir, + .iterate = openpromfs_readdir, + .llseek = generic_file_llseek, +}; + +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, unsigned int); + +static const struct inode_operations openprom_inode_operations = { + .lookup = openpromfs_lookup, +}; + +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + struct op_inode_info *ent_oi, *oi = OP_I(dir); + struct device_node *dp, *child; + struct property *prop; + enum op_inode_type ent_type; + union op_inode_data ent_data; + const char *name; + struct inode *inode; + unsigned int ino; + int len; + + BUG_ON(oi->type != op_inode_node); + + dp = oi->u.node; + + name = dentry->d_name.name; + len = dentry->d_name.len; + + mutex_lock(&op_mutex); + + child = dp->child; + while (child) { + int n = strlen(child->path_component_name); + + if (len == n && + !strncmp(child->path_component_name, name, len)) { + ent_type = op_inode_node; + ent_data.node = child; + ino = child->unique_id; + goto found; + } + child = child->sibling; + } + + prop = dp->properties; + while (prop) { + int n = strlen(prop->name); + + if (len == n && !strncmp(prop->name, name, len)) { + ent_type = op_inode_prop; + ent_data.prop = prop; + ino = prop->unique_id; + goto found; + } + + prop = prop->next; + } + + mutex_unlock(&op_mutex); + return ERR_PTR(-ENOENT); + +found: + inode = openprom_iget(dir->i_sb, ino); + mutex_unlock(&op_mutex); + if (IS_ERR(inode)) + return ERR_CAST(inode); + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_op = &openprom_inode_operations; + inode->i_fop = &openprom_operations; + set_nlink(inode, 2); + break; + case op_inode_prop: + if (!strcmp(dp->name, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) + inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + else + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &openpromfs_prop_ops; + set_nlink(inode, 1); + inode->i_size = ent_oi->u.prop->length; + break; + } + + d_add(dentry, inode); + return NULL; +} + +static int openpromfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct op_inode_info *oi = OP_I(inode); + struct device_node *dp = oi->u.node; + struct device_node *child; + struct property *prop; + int i; + + mutex_lock(&op_mutex); + + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) + goto out; + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, + (dp->parent == NULL ? + OPENPROM_ROOT_INO : + dp->parent->unique_id), DT_DIR)) + goto out; + ctx->pos = 2; + } + i = ctx->pos - 2; + + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; + i--; + } + while (child) { + if (!dir_emit(ctx, + child->path_component_name, + strlen(child->path_component_name), + child->unique_id, DT_DIR)) + goto out; + + ctx->pos++; + child = child->sibling; + } + + /* Next, the properties as files. */ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; + } + while (prop) { + if (!dir_emit(ctx, prop->name, strlen(prop->name), + prop->unique_id, DT_REG)) + goto out; + + ctx->pos++; + prop = prop->next; + } + +out: + mutex_unlock(&op_mutex); + return 0; +} + +static struct kmem_cache *op_inode_cachep; + +static struct inode *openprom_alloc_inode(struct super_block *sb) +{ + struct op_inode_info *oi; + + oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + + return &oi->vfs_inode; +} + +static void openprom_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(op_inode_cachep, OP_I(inode)); +} + +static void openprom_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, openprom_i_callback); +} + +static struct inode *openprom_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + if (inode->i_ino == OPENPROM_ROOT_INO) { + inode->i_op = &openprom_inode_operations; + inode->i_fop = &openprom_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + } + unlock_new_inode(inode); + } + return inode; +} + +static int openprom_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_NOATIME; + return 0; +} + +static const struct super_operations openprom_sops = { + .alloc_inode = openprom_alloc_inode, + .destroy_inode = openprom_destroy_inode, + .statfs = simple_statfs, + .remount_fs = openprom_remount, +}; + +static int openprom_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *root_inode; + struct op_inode_info *oi; + int ret; + + s->s_flags |= MS_NOATIME; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = OPENPROM_SUPER_MAGIC; + s->s_op = &openprom_sops; + s->s_time_gran = 1; + root_inode = openprom_iget(s, OPENPROM_ROOT_INO); + if (IS_ERR(root_inode)) { + ret = PTR_ERR(root_inode); + goto out_no_root; + } + + oi = OP_I(root_inode); + oi->type = op_inode_node; + oi->u.node = of_find_node_by_path("/"); + + s->s_root = d_make_root(root_inode); + if (!s->s_root) + goto out_no_root_dentry; + return 0; + +out_no_root_dentry: + ret = -ENOMEM; +out_no_root: + printk("openprom_fill_super: get root inode failed\n"); + return ret; +} + +static struct dentry *openprom_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, openprom_fill_super); +} + +static struct file_system_type openprom_fs_type = { + .owner = THIS_MODULE, + .name = "openpromfs", + .mount = openprom_mount, + .kill_sb = kill_anon_super, +}; +MODULE_ALIAS_FS("openpromfs"); + +static void op_inode_init_once(void *data) +{ + struct op_inode_info *oi = (struct op_inode_info *) data; + + inode_init_once(&oi->vfs_inode); +} + +static int __init init_openprom_fs(void) +{ + int err; + + op_inode_cachep = kmem_cache_create("op_inode_cache", + sizeof(struct op_inode_info), + 0, + (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + op_inode_init_once); + if (!op_inode_cachep) + return -ENOMEM; + + err = register_filesystem(&openprom_fs_type); + if (err) + kmem_cache_destroy(op_inode_cachep); + + return err; +} + +static void __exit exit_openprom_fs(void) +{ + unregister_filesystem(&openprom_fs_type); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(op_inode_cachep); +} + +module_init(init_openprom_fs) +module_exit(exit_openprom_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/overlayfs/Kconfig b/kernel/fs/overlayfs/Kconfig new file mode 100644 index 000000000..34355818a --- /dev/null +++ b/kernel/fs/overlayfs/Kconfig @@ -0,0 +1,10 @@ +config OVERLAY_FS + tristate "Overlay filesystem support" + help + An overlay filesystem combines two filesystems - an 'upper' filesystem + and a 'lower' filesystem. When a name exists in both filesystems, the + object in the 'upper' filesystem is visible while the object in the + 'lower' filesystem is either hidden or, in the case of directories, + merged with the 'upper' object. + + For more information see Documentation/filesystems/overlayfs.txt diff --git a/kernel/fs/overlayfs/Makefile b/kernel/fs/overlayfs/Makefile new file mode 100644 index 000000000..900daed3e --- /dev/null +++ b/kernel/fs/overlayfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAY_FS) += overlay.o + +overlay-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/kernel/fs/overlayfs/copy_up.c b/kernel/fs/overlayfs/copy_up.c new file mode 100644 index 000000000..84d693d37 --- /dev/null +++ b/kernel/fs/overlayfs/copy_up.c @@ -0,0 +1,416 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +int ovl_copy_xattr(struct dentry *old, struct dentry *new) +{ + ssize_t list_size, size; + char *buf, *name, *value; + int error; + + if (!old->d_inode->i_op->getxattr || + !new->d_inode->i_op->getxattr) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) { + if (list_size == -EOPNOTSUPP) + return 0; + return list_size; + } + + buf = kzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = -ENOMEM; + value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); + if (!value) + goto out; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out_free_value; + } + + for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); + if (size <= 0) { + error = size; + goto out_free_value; + } + error = vfs_setxattr(new, name, value, size, 0); + if (error) + goto out_free_value; + } + +out_free_value: + kfree(value); +out: + kfree(buf); + return error; +} + +static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +{ + struct file *old_file; + struct file *new_file; + loff_t old_pos = 0; + loff_t new_pos = 0; + int error = 0; + + if (len == 0) + return 0; + + old_file = ovl_path_open(old, O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + new_file = ovl_path_open(new, O_WRONLY); + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out_fput; + } + + /* FIXME: copy up sparse files efficiently */ + while (len) { + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) { + error = -EINTR; + break; + } + + bytes = do_splice_direct(old_file, &old_pos, + new_file, &new_pos, + this_len, SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + WARN_ON(old_pos != new_pos); + + len -= bytes; + } + + fput(new_file); +out_fput: + fput(old_file); + return error; +} + +static char *ovl_read_symlink(struct dentry *realdentry) +{ + int res; + char *buf; + struct inode *inode = realdentry->d_inode; + mm_segment_t old_fs; + + res = -EINVAL; + if (!inode->i_op->readlink) + goto err; + + res = -ENOMEM; + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + goto err; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = inode->i_op->readlink(realdentry, + (char __user *)buf, PAGE_SIZE - 1); + set_fs(old_fs); + if (res < 0) { + free_page((unsigned long) buf); + goto err; + } + buf[res] = '\0'; + + return buf; + +err: + return ERR_PTR(res); +} + +static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return notify_change(upperdentry, &attr, NULL); +} + +int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) +{ + int err = 0; + + if (!S_ISLNK(stat->mode)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat->mode, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) { + struct iattr attr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = stat->uid, + .ia_gid = stat->gid, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) + ovl_set_timestamps(upperdentry, stat); + + return err; +} + +static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, + struct dentry *dentry, struct path *lowerpath, + struct kstat *stat, struct iattr *attr, + const char *link) +{ + struct inode *wdir = workdir->d_inode; + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry = NULL; + struct dentry *upper = NULL; + umode_t mode = stat->mode; + int err; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out1; + + /* Can't properly set mode on creation because of the umask */ + stat->mode &= S_IFMT; + err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); + stat->mode = mode; + if (err) + goto out2; + + if (S_ISREG(stat->mode)) { + struct path upperpath; + ovl_path_upper(dentry, &upperpath); + BUG_ON(upperpath.dentry != NULL); + upperpath.dentry = newdentry; + + err = ovl_copy_up_data(lowerpath, &upperpath, stat->size); + if (err) + goto out_cleanup; + } + + err = ovl_copy_xattr(lowerpath->dentry, newdentry); + if (err) + goto out_cleanup; + + mutex_lock(&newdentry->d_inode->i_mutex); + err = ovl_set_attr(newdentry, stat); + if (!err && attr) + err = notify_change(newdentry, attr, NULL); + mutex_unlock(&newdentry->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + + ovl_dentry_update(dentry, newdentry); + newdentry = NULL; + + /* + * Non-directores become opaque when copied up. + */ + if (!S_ISDIR(stat->mode)) + ovl_dentry_set_opaque(dentry, true); +out2: + dput(upper); +out1: + dput(newdentry); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out; +} + +/* + * Copy up a single dentry + * + * Directory renames only allowed on "pure upper" (already created on + * upper filesystem, never copied up). Directories which are on lower or + * are merged may not be renamed. For these -EXDEV is returned and + * userspace has to deal with it. This means, when copying up a + * directory we can rely on it and ancestors being stable. + * + * Non-directory renames start with copy up of source if necessary. The + * actual rename will only proceed once the copy up was successful. Copy + * up uses upper parent i_mutex for exclusion. Since rename can change + * d_parent it is possible that the copy up will lock the old parent. At + * that point the file will have already been copied up anyway. + */ +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr) +{ + struct dentry *workdir = ovl_workdir(dentry); + int err; + struct kstat pstat; + struct path parentpath; + struct dentry *upperdir; + struct dentry *upperdentry; + const struct cred *old_cred; + struct cred *override_cred; + char *link = NULL; + + if (WARN_ON(!workdir)) + return -EROFS; + + ovl_path_upper(parent, &parentpath); + upperdir = parentpath.dentry; + + err = vfs_getattr(&parentpath, &pstat); + if (err) + return err; + + if (S_ISLNK(stat->mode)) { + link = ovl_read_symlink(lowerpath->dentry); + if (IS_ERR(link)) + return PTR_ERR(link); + } + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_free_link; + + override_cred->fsuid = stat->uid; + override_cred->fsgid = stat->gid; + /* + * CAP_SYS_ADMIN for copying up extended attributes + * CAP_DAC_OVERRIDE for create + * CAP_FOWNER for chmod, timestamp update + * CAP_FSETID for chmod + * CAP_CHOWN for chown + * CAP_MKNOD for mknod + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + cap_raise(override_cred->cap_effective, CAP_MKNOD); + old_cred = override_creds(override_cred); + + err = -EIO; + if (lock_rename(workdir, upperdir) != NULL) { + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + goto out_unlock; + } + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + unlock_rename(workdir, upperdir); + err = 0; + /* Raced with another copy-up? Do the setattr here */ + if (attr) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } + goto out_put_cred; + } + + err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath, + stat, attr, link); + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, &pstat); + } +out_unlock: + unlock_rename(workdir, upperdir); +out_put_cred: + revert_creds(old_cred); + put_cred(override_cred); + +out_free_link: + if (link) + free_page((unsigned long) link); + + return err; +} + +int ovl_copy_up(struct dentry *dentry) +{ + int err; + + err = 0; + while (!err) { + struct dentry *next; + struct dentry *parent; + struct path lowerpath; + struct kstat stat; + enum ovl_path_type type = ovl_path_type(dentry); + + if (OVL_TYPE_UPPER(type)) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (;;) { + parent = dget_parent(next); + + type = ovl_path_type(parent); + if (OVL_TYPE_UPPER(type)) + break; + + dput(next); + next = parent; + } + + ovl_path_lower(next, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (!err) + err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL); + + dput(parent); + dput(next); + } + + return err; +} diff --git a/kernel/fs/overlayfs/dir.c b/kernel/fs/overlayfs/dir.c new file mode 100644 index 000000000..692ceda3b --- /dev/null +++ b/kernel/fs/overlayfs/dir.c @@ -0,0 +1,951 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include "overlayfs.h" + +void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) +{ + int err; + + dget(wdentry); + if (d_is_dir(wdentry)) + err = ovl_do_rmdir(wdir, wdentry); + else + err = ovl_do_unlink(wdir, wdentry); + dput(wdentry); + + if (err) { + pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + wdentry, err); + } +} + +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) +{ + struct dentry *temp; + char name[20]; + + snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + + temp = lookup_one_len(name, workdir, strlen(name)); + if (!IS_ERR(temp) && temp->d_inode) { + pr_err("overlayfs: workdir/%s already exists\n", name); + dput(temp); + temp = ERR_PTR(-EIO); + } + + return temp; +} + +/* caller holds i_mutex on workdir */ +static struct dentry *ovl_whiteout(struct dentry *workdir, + struct dentry *dentry) +{ + int err; + struct dentry *whiteout; + struct inode *wdir = workdir->d_inode; + + whiteout = ovl_lookup_temp(workdir, dentry); + if (IS_ERR(whiteout)) + return whiteout; + + err = ovl_do_whiteout(wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + } + + return whiteout; +} + +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug) +{ + int err; + + if (newdentry->d_inode) + return -ESTALE; + + if (hardlink) { + err = ovl_do_link(hardlink, dir, newdentry, debug); + } else { + switch (stat->mode & S_IFMT) { + case S_IFREG: + err = ovl_do_create(dir, newdentry, stat->mode, debug); + break; + + case S_IFDIR: + err = ovl_do_mkdir(dir, newdentry, stat->mode, debug); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ovl_do_mknod(dir, newdentry, + stat->mode, stat->rdev, debug); + break; + + case S_IFLNK: + err = ovl_do_symlink(dir, newdentry, link, debug); + break; + + default: + err = -EPERM; + } + } + if (!err && WARN_ON(!newdentry->d_inode)) { + /* + * Not quite sure if non-instantiated dentry is legal or not. + * VFS doesn't seem to care so check and warn here. + */ + err = -ENOENT; + } + return err; +} + +static int ovl_set_opaque(struct dentry *upperdentry) +{ + return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); +} + +static void ovl_remove_opaque(struct dentry *upperdentry) +{ + int err; + + err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE); + if (err) { + pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", + upperdentry->d_name.name, err); + } +} + +static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + enum ovl_path_type type; + struct path realpath; + + type = ovl_path_real(dentry, &realpath); + err = vfs_getattr(&realpath, stat); + if (err) + return err; + + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (OVL_TYPE_MERGE(type)) + stat->nlink = 1; + + return 0; +} + +static int ovl_create_upper(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry; + int err; + + mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + newdentry = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + err = ovl_create_real(udir, newdentry, stat, link, hardlink, false); + if (err) + goto out_dput; + + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput: + dput(newdentry); +out_unlock: + mutex_unlock(&udir->i_mutex); + return err; +} + +static int ovl_lock_rename_workdir(struct dentry *workdir, + struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} + +static struct dentry *ovl_clear_empty(struct dentry *dentry, + struct list_head *list) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct path upperpath; + struct dentry *upper; + struct dentry *opaquedir; + struct kstat stat; + int err; + + if (WARN_ON(!workdir)) + return ERR_PTR(-EROFS); + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + ovl_path_upper(dentry, &upperpath); + err = vfs_getattr(&upperpath, &stat); + if (err) + goto out_unlock; + + err = -ESTALE; + if (!S_ISDIR(stat.mode)) + goto out_unlock; + upper = upperpath.dentry; + if (upper->d_parent->d_inode != udir) + goto out_unlock; + + opaquedir = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out_unlock; + + err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true); + if (err) + goto out_dput; + + err = ovl_copy_xattr(upper, opaquedir); + if (err) + goto out_cleanup; + + err = ovl_set_opaque(opaquedir); + if (err) + goto out_cleanup; + + mutex_lock(&opaquedir->d_inode->i_mutex); + err = ovl_set_attr(opaquedir, &stat); + mutex_unlock(&opaquedir->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup_whiteouts(upper, list); + ovl_cleanup(wdir, upper); + unlock_rename(workdir, upperdir); + + /* dentry's upper doesn't match now, get rid of it */ + d_drop(dentry); + + return opaquedir; + +out_cleanup: + ovl_cleanup(wdir, opaquedir); +out_dput: + dput(opaquedir); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return ERR_PTR(err); +} + +static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) +{ + int err; + struct dentry *ret = NULL; + LIST_HEAD(list); + + err = ovl_check_empty_dir(dentry, &list); + if (err) + ret = ERR_PTR(err); + else { + /* + * If no upperdentry then skip clearing whiteouts. + * + * Can race with copy-up, since we don't hold the upperdir + * mutex. Doesn't matter, since copy-up can't create a + * non-empty directory from an empty one. + */ + if (ovl_dentry_upper(dentry)) + ret = ovl_clear_empty(dentry, &list); + } + + ovl_cache_free(&list); + + return ret; +} + +static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *upper; + struct dentry *newdentry; + int err; + + if (WARN_ON(!workdir)) + return -EROFS; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_dput; + + err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true); + if (err) + goto out_dput2; + + if (S_ISDIR(stat->mode)) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, + RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup(wdir, upper); + } else { + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + } + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput2: + dput(upper); +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out_dput2; +} + +static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, + const char *link, struct dentry *hardlink) +{ + int err; + struct inode *inode; + struct kstat stat = { + .mode = mode, + .rdev = rdev, + }; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata); + if (!inode) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_iput; + + if (!ovl_dentry_is_opaque(dentry)) { + err = ovl_create_upper(dentry, inode, &stat, link, hardlink); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_iput; + + /* + * CAP_SYS_ADMIN for setting opaque xattr + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + old_cred = override_creds(override_cred); + + err = ovl_create_over_whiteout(dentry, inode, &stat, link, + hardlink); + + revert_creds(old_cred); + put_cred(override_cred); + } + + if (!err) + inode = NULL; +out_iput: + iput(inode); +out: + return err; +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_create_or_link(dentry, mode, rdev, link, NULL); + ovl_drop_write(dentry); + } + + return err; +} + +static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + /* Don't allow creation of "whiteout" on overlay */ + if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) + return -EPERM; + + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct inode *dir, struct dentry *dentry, + const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + struct dentry *upper; + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + upper = ovl_dentry_upper(old); + err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper); + +out_drop_write: + ovl_drop_write(old); +out: + return err; +} + +static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *whiteout; + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (WARN_ON(!workdir)) + return -EROFS; + + if (is_dir) { + if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { + opaquedir = ovl_check_empty_and_clear(dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } else { + LIST_HEAD(list); + + /* + * When removing an empty opaque directory, then it + * makes no sense to replace it with an exact replica of + * itself. But emptiness still needs to be checked. + */ + err = ovl_check_empty_dir(dentry, &list); + ovl_cache_free(&list); + if (err) + goto out; + } + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out_dput; + + whiteout = ovl_whiteout(workdir, dentry); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto out_unlock; + + upper = ovl_dentry_upper(dentry); + if (!upper) { + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto kill_whiteout; + + err = ovl_do_rename(wdir, whiteout, udir, upper, 0); + dput(upper); + if (err) + goto kill_whiteout; + } else { + int flags = 0; + + if (opaquedir) + upper = opaquedir; + err = -ESTALE; + if (upper->d_parent != upperdir) + goto kill_whiteout; + + if (is_dir) + flags |= RENAME_EXCHANGE; + + err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + if (err) + goto kill_whiteout; + + if (is_dir) + ovl_cleanup(wdir, upper); + } + ovl_dentry_version_inc(dentry->d_parent); +out_d_drop: + d_drop(dentry); + dput(whiteout); +out_unlock: + unlock_rename(workdir, upperdir); +out_dput: + dput(opaquedir); +out: + return err; + +kill_whiteout: + ovl_cleanup(wdir, whiteout); + goto out_d_drop; +} + +static int ovl_remove_upper(struct dentry *dentry, bool is_dir) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *dir = upperdir->d_inode; + struct dentry *upper = ovl_dentry_upper(dentry); + int err; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + err = -ESTALE; + if (upper->d_parent == upperdir) { + /* Don't let d_delete() think it can reset d_inode */ + dget(upper); + if (is_dir) + err = vfs_rmdir(dir, upper); + else + err = vfs_unlink(dir, upper, NULL); + dput(upper); + ovl_dentry_version_inc(dentry->d_parent); + } + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ + d_drop(dentry); + mutex_unlock(&dir->i_mutex); + + return err; +} + +static inline int ovl_check_sticky(struct dentry *dentry) +{ + struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode; + struct inode *inode = ovl_dentry_real(dentry)->d_inode; + + if (check_sticky(dir, inode)) + return -EPERM; + + return 0; +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + enum ovl_path_type type; + int err; + + err = ovl_check_sticky(dentry); + if (err) + goto out; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_drop_write; + + type = ovl_path_type(dentry); + if (OVL_TYPE_PURE_UPPER(type)) { + err = ovl_remove_upper(dentry, is_dir); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + + err = ovl_remove_and_whiteout(dentry, is_dir); + + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, true); +} + +static int ovl_rename2(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new, + unsigned int flags) +{ + int err; + enum ovl_path_type old_type; + enum ovl_path_type new_type; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool old_opaque; + bool new_opaque; + bool new_create = false; + bool cleanup_whiteout = false; + bool overwrite = !(flags & RENAME_EXCHANGE); + bool is_dir = d_is_dir(old); + bool new_is_dir = false; + struct dentry *opaquedir = NULL; + const struct cred *old_cred = NULL; + struct cred *override_cred = NULL; + + err = -EINVAL; + if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + goto out; + + flags &= ~RENAME_NOREPLACE; + + err = ovl_check_sticky(old); + if (err) + goto out; + + /* Don't copy up directory trees */ + old_type = ovl_path_type(old); + err = -EXDEV; + if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir) + goto out; + + if (new->d_inode) { + err = ovl_check_sticky(new); + if (err) + goto out; + + if (d_is_dir(new)) + new_is_dir = true; + + new_type = ovl_path_type(new); + err = -EXDEV; + if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) + goto out; + + err = 0; + if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) { + if (ovl_dentry_lower(old)->d_inode == + ovl_dentry_lower(new)->d_inode) + goto out; + } + if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) { + if (ovl_dentry_upper(old)->d_inode == + ovl_dentry_upper(new)->d_inode) + goto out; + } + } else { + if (ovl_dentry_is_opaque(new)) + new_type = __OVL_PATH_UPPER; + else + new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE; + } + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (!overwrite) { + err = ovl_copy_up(new); + if (err) + goto out_drop_write; + } + + old_opaque = !OVL_TYPE_PURE_UPPER(old_type); + new_opaque = !OVL_TYPE_PURE_UPPER(new_type); + + if (old_opaque || new_opaque) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + } + + if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { + opaquedir = ovl_check_empty_and_clear(new); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) { + opaquedir = NULL; + goto out_revert_creds; + } + } + + if (overwrite) { + if (old_opaque) { + if (new->d_inode || !new_opaque) { + /* Whiteout source */ + flags |= RENAME_WHITEOUT; + } else { + /* Switch whiteouts */ + flags |= RENAME_EXCHANGE; + } + } else if (is_dir && !new->d_inode && new_opaque) { + flags |= RENAME_EXCHANGE; + cleanup_whiteout = true; + } + } + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = ovl_dentry_upper(old); + newdentry = ovl_dentry_upper(new); + if (newdentry) { + if (opaquedir) { + newdentry = opaquedir; + opaquedir = NULL; + } else { + dget(newdentry); + } + } else { + new_create = true; + newdentry = lookup_one_len(new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + } + + err = -ESTALE; + if (olddentry->d_parent != old_upperdir) + goto out_dput; + if (newdentry->d_parent != new_upperdir) + goto out_dput; + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + if (is_dir && !old_opaque && new_opaque) { + err = ovl_set_opaque(olddentry); + if (err) + goto out_dput; + } + if (!overwrite && new_is_dir && old_opaque && !new_opaque) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_dput; + } + + if (old_opaque || new_opaque) { + err = ovl_do_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + flags); + } else { + /* No debug for the plain case */ + BUG_ON(flags & ~RENAME_EXCHANGE); + err = vfs_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + NULL, flags); + } + + if (err) { + if (is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(newdentry); + goto out_dput; + } + + if (is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(newdentry); + + if (old_opaque != new_opaque) { + ovl_dentry_set_opaque(old, new_opaque); + if (!overwrite) + ovl_dentry_set_opaque(new, old_opaque); + } + + if (cleanup_whiteout) + ovl_cleanup(old_upperdir->d_inode, newdentry); + + ovl_dentry_version_inc(old->d_parent); + ovl_dentry_version_inc(new->d_parent); + +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); +out_revert_creds: + if (old_opaque || new_opaque) { + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(old); +out: + dput(opaquedir); + return err; +} + +const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename2 = ovl_rename2, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_dir_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; diff --git a/kernel/fs/overlayfs/inode.c b/kernel/fs/overlayfs/inode.c new file mode 100644 index 000000000..04f124884 --- /dev/null +++ b/kernel/fs/overlayfs/inode.c @@ -0,0 +1,436 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include "overlayfs.h" + +static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr, + bool no_data) +{ + int err; + struct dentry *parent; + struct kstat stat; + struct path lowerpath; + + parent = dget_parent(dentry); + err = ovl_copy_up(parent); + if (err) + goto out_dput_parent; + + ovl_path_lower(dentry, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (err) + goto out_dput_parent; + + if (no_data) + stat.size = 0; + + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr); + +out_dput_parent: + dput(parent); + return err; +} + +int ovl_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } else { + err = ovl_copy_up_last(dentry, attr, false); + } + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct path realpath; + + ovl_path_real(dentry, &realpath); + return vfs_getattr(&realpath, stat); +} + +int ovl_permission(struct inode *inode, int mask) +{ + struct ovl_entry *oe; + struct dentry *alias = NULL; + struct inode *realinode; + struct dentry *realdentry; + bool is_upper; + int err; + + if (S_ISDIR(inode->i_mode)) { + oe = inode->i_private; + } else if (mask & MAY_NOT_BLOCK) { + return -ECHILD; + } else { + /* + * For non-directories find an alias and get the info + * from there. + */ + alias = d_find_any_alias(inode); + if (WARN_ON(!alias)) + return -ENOENT; + + oe = alias->d_fsdata; + } + + realdentry = ovl_entry_real(oe, &is_upper); + + /* Careful in RCU walk mode */ + realinode = ACCESS_ONCE(realdentry->d_inode); + if (!realinode) { + WARN_ON(!(mask & MAY_NOT_BLOCK)); + err = -ENOENT; + goto out_dput; + } + + if (mask & MAY_WRITE) { + umode_t mode = realinode->i_mode; + + /* + * Writes will always be redirected to upper layer, so + * ignore lower layer being read-only. + * + * If the overlay itself is read-only then proceed + * with the permission check, don't return EROFS. + * This will only happen if this is the lower layer of + * another overlayfs. + * + * If upper fs becomes read-only after the overlay was + * constructed return EROFS to prevent modification of + * upper layer. + */ + err = -EROFS; + if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + goto out_dput; + } + + err = __inode_permission(realinode, mask); +out_dput: + dput(alias); + return err; +} + + +struct ovl_link_data { + struct dentry *realdentry; + void *cookie; +}; + +static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + void *ret; + struct dentry *realdentry; + struct inode *realinode; + + realdentry = ovl_dentry_real(dentry); + realinode = realdentry->d_inode; + + if (WARN_ON(!realinode->i_op->follow_link)) + return ERR_PTR(-EPERM); + + ret = realinode->i_op->follow_link(realdentry, nd); + if (IS_ERR(ret)) + return ret; + + if (realinode->i_op->put_link) { + struct ovl_link_data *data; + + data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); + if (!data) { + realinode->i_op->put_link(realdentry, nd, ret); + return ERR_PTR(-ENOMEM); + } + data->realdentry = realdentry; + data->cookie = ret; + + return data; + } else { + return NULL; + } +} + +static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + struct inode *realinode; + struct ovl_link_data *data = c; + + if (!data) + return; + + realinode = data->realdentry->d_inode; + realinode->i_op->put_link(data->realdentry, nd, data->cookie); + kfree(data); +} + +static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + struct path realpath; + struct inode *realinode; + + ovl_path_real(dentry, &realpath); + realinode = realpath.dentry->d_inode; + + if (!realinode->i_op->readlink) + return -EINVAL; + + touch_atime(&realpath); + + return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); +} + + +static bool ovl_is_private_xattr(const char *name) +{ + return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; +} + +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = -EPERM; + if (ovl_is_private_xattr(name)) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + upperdentry = ovl_dentry_upper(dentry); + err = vfs_setxattr(upperdentry, name, value, size, flags); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static bool ovl_need_xattr_filter(struct dentry *dentry, + enum ovl_path_type type) +{ + if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER) + return S_ISDIR(dentry->d_inode->i_mode); + else + return false; +} + +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); + + if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) + return -ENODATA; + + return vfs_getxattr(realpath.dentry, name, value, size); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); + ssize_t res; + int off; + + res = vfs_listxattr(realpath.dentry, list, size); + if (res <= 0 || size == 0) + return res; + + if (!ovl_need_xattr_filter(dentry, type)) + return res; + + /* filter out private xattrs */ + for (off = 0; off < res;) { + char *s = list + off; + size_t slen = strlen(s) + 1; + + BUG_ON(off + slen > res); + + if (ovl_is_private_xattr(s)) { + res -= slen; + memmove(s, s + slen, res - off); + } else { + off += slen; + } + } + + return res; +} + +int ovl_removexattr(struct dentry *dentry, const char *name) +{ + int err; + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = -ENODATA; + if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) + goto out_drop_write; + + if (!OVL_TYPE_UPPER(type)) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_removexattr(realpath.dentry, name); +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, + struct dentry *realdentry) +{ + if (OVL_TYPE_UPPER(type)) + return false; + + if (special_file(realdentry->d_inode->i_mode)) + return false; + + if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC)) + return false; + + return true; +} + +static int ovl_dentry_open(struct dentry *dentry, struct file *file, + const struct cred *cred) +{ + int err; + struct path realpath; + enum ovl_path_type type; + bool want_write = false; + + type = ovl_path_real(dentry, &realpath); + if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { + want_write = true; + err = ovl_want_write(dentry); + if (err) + goto out; + + if (file->f_flags & O_TRUNC) + err = ovl_copy_up_last(dentry, NULL, true); + else + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_open(&realpath, file, cred); +out_drop_write: + if (want_write) + ovl_drop_write(dentry); +out: + return err; +} + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, + .dentry_open = ovl_dentry_open, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .follow_link = ovl_follow_link, + .put_link = ovl_put_link, + .readlink = ovl_readlink, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return NULL; + + mode &= S_IFMT; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + switch (mode) { + case S_IFDIR: + inode->i_private = oe; + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + break; + + case S_IFLNK: + inode->i_op = &ovl_symlink_inode_operations; + break; + + case S_IFREG: + case S_IFSOCK: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + inode->i_op = &ovl_file_inode_operations; + break; + + default: + WARN(1, "illegal file type: %i\n", mode); + iput(inode); + inode = NULL; + } + + return inode; +} diff --git a/kernel/fs/overlayfs/overlayfs.h b/kernel/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000..17ac5afc9 --- /dev/null +++ b/kernel/fs/overlayfs/overlayfs.h @@ -0,0 +1,199 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include + +struct ovl_entry; + +enum ovl_path_type { + __OVL_PATH_PURE = (1 << 0), + __OVL_PATH_UPPER = (1 << 1), + __OVL_PATH_MERGE = (1 << 2), +}; + +#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) +#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE) +#define OVL_TYPE_MERGE_OR_LOWER(type) \ + (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) + +#define OVL_XATTR_PRE_NAME "trusted.overlay." +#define OVL_XATTR_PRE_LEN 16 +#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque" + +static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_rmdir(dir, dentry); + pr_debug("rmdir(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_unlink(dir, dentry, NULL); + pr_debug("unlink(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool debug) +{ + int err = vfs_link(old_dentry, dir, new_dentry, NULL); + if (debug) { + pr_debug("link(%pd2, %pd2) = %i\n", + old_dentry, new_dentry, err); + } + return err; +} + +static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_create(dir, dentry, mode, true); + if (debug) + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_mkdir(dir, dentry, mode); + if (debug) + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev, bool debug) +{ + int err = vfs_mknod(dir, dentry, mode, dev); + if (debug) { + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", + dentry, mode, dev, err); + } + return err; +} + +static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname, bool debug) +{ + int err = vfs_symlink(dir, dentry, oldname); + if (debug) + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + return err; +} + +static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err = vfs_setxattr(dentry, name, value, size, flags); + pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n", + dentry, name, (int) size, (char *) value, flags, err); + return err; +} + +static inline int ovl_do_removexattr(struct dentry *dentry, const char *name) +{ + int err = vfs_removexattr(dentry, name); + pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); + return err; +} + +static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, + struct inode *newdir, struct dentry *newdentry, + unsigned int flags) +{ + int err; + + pr_debug("rename2(%pd2, %pd2, 0x%x)\n", + olddentry, newdentry, flags); + + err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); + + if (err) { + pr_debug("...rename2(%pd2, %pd2, ...) = %i\n", + olddentry, newdentry, err); + } + return err; +} + +static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_whiteout(dir, dentry); + pr_debug("whiteout(%pd2) = %i\n", dentry, err); + return err; +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry); +u64 ovl_dentry_version_get(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_path_upper(struct dentry *dentry, struct path *path); +void ovl_path_lower(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +struct dentry *ovl_dentry_upper(struct dentry *dentry); +struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); +struct dentry *ovl_workdir(struct dentry *dentry); +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +bool ovl_dentry_is_opaque(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); +bool ovl_is_whiteout(struct dentry *dentry); +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +struct file *ovl_path_open(struct path *path, int flags); + +struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, + struct kstat *stat, const char *link); + +/* readdir.c */ +extern const struct file_operations ovl_dir_operations; +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); +void ovl_cache_free(struct list_head *list); + +/* inode.c */ +int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_permission(struct inode *inode, int mask); +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); +int ovl_removexattr(struct dentry *dentry, const char *name); + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe); +static inline void ovl_copyattr(struct inode *from, struct inode *to) +{ + to->i_uid = from->i_uid; + to->i_gid = from->i_gid; +} + +/* dir.c */ +extern const struct inode_operations ovl_dir_inode_operations; +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug); +void ovl_cleanup(struct inode *dir, struct dentry *dentry); + +/* copy_up.c */ +int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr); +int ovl_copy_xattr(struct dentry *old, struct dentry *new); +int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/kernel/fs/overlayfs/readdir.c b/kernel/fs/overlayfs/readdir.c new file mode 100644 index 000000000..907870e81 --- /dev/null +++ b/kernel/fs/overlayfs/readdir.c @@ -0,0 +1,557 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +struct ovl_cache_entry { + unsigned int len; + unsigned int type; + u64 ino; + struct list_head l_node; + struct rb_node node; + bool is_whiteout; + char name[]; +}; + +struct ovl_dir_cache { + long refcount; + u64 version; + struct list_head entries; +}; + +struct ovl_readdir_data { + struct dir_context ctx; + bool is_merge; + struct rb_root root; + struct list_head *list; + struct list_head middle; + struct dentry *dir; + int count; + int err; +}; + +struct ovl_dir_file { + bool is_real; + bool is_upper; + struct ovl_dir_cache *cache; + struct list_head *cursor; + struct file *realfile; + struct file *upperfile; +}; + +static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) +{ + return container_of(n, struct ovl_cache_entry, node); +} + +static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, + const char *name, int len) +{ + struct rb_node *node = root->rb_node; + int cmp; + + while (node) { + struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); + + cmp = strncmp(name, p->name, len); + if (cmp > 0) + node = p->node.rb_right; + else if (cmp < 0 || len < p->len) + node = p->node.rb_left; + else + return p; + } + + return NULL; +} + +static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, + const char *name, int len, + u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); + + p = kmalloc(size, GFP_KERNEL); + if (!p) + return NULL; + + memcpy(p->name, name, len); + p->name[len] = '\0'; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + + if (d_type == DT_CHR) { + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + kfree(p); + return NULL; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + dentry = lookup_one_len(name, dir, len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + revert_creds(old_cred); + put_cred(override_cred); + } + return p; +} + +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, u64 ino, + unsigned int d_type) +{ + struct rb_node **newp = &rdd->root.rb_node; + struct rb_node *parent = NULL; + struct ovl_cache_entry *p; + + while (*newp) { + int cmp; + struct ovl_cache_entry *tmp; + + parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + return 0; + } + + p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type); + if (p == NULL) + return -ENOMEM; + + list_add_tail(&p->l_node, rdd->list); + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, &rdd->root); + + return 0; +} + +static int ovl_fill_lower(struct ovl_readdir_data *rdd, + const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(&rdd->root, name, namelen); + if (p) { + list_move_tail(&p->l_node, &rdd->middle); + } else { + p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type); + if (p == NULL) + rdd->err = -ENOMEM; + else + list_add_tail(&p->l_node, &rdd->middle); + } + + return rdd->err; +} + +void ovl_cache_free(struct list_head *list) +{ + struct ovl_cache_entry *p; + struct ovl_cache_entry *n; + + list_for_each_entry_safe(p, n, list, l_node) + kfree(p); + + INIT_LIST_HEAD(list); +} + +static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +{ + struct ovl_dir_cache *cache = od->cache; + + WARN_ON(cache->refcount <= 0); + cache->refcount--; + if (!cache->refcount) { + if (ovl_dir_cache(dentry) == cache) + ovl_set_dir_cache(dentry, NULL); + + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static int ovl_fill_merge(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + rdd->count++; + if (!rdd->is_merge) + return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + else + return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); +} + +static inline int ovl_dir_read(struct path *realpath, + struct ovl_readdir_data *rdd) +{ + struct file *realfile; + int err; + + realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + rdd->dir = realpath->dentry; + rdd->ctx.pos = 0; + do { + rdd->count = 0; + rdd->err = 0; + err = iterate_dir(realfile, &rdd->ctx); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + fput(realfile); + + return err; +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct ovl_dir_cache *cache = od->cache; + struct dentry *dentry = file->f_path.dentry; + enum ovl_path_type type = ovl_path_type(dentry); + + if (cache && ovl_dentry_version_get(dentry) != cache->version) { + ovl_cache_put(od, dentry); + od->cache = NULL; + od->cursor = NULL; + } + WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type)); + if (od->is_real && OVL_TYPE_MERGE(type)) + od->is_real = false; +} + +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) +{ + int err; + struct path realpath; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .list = list, + .root = RB_ROOT, + .is_merge = false, + }; + int idx, next; + + for (idx = 0; idx != -1; idx = next) { + next = ovl_path_next(idx, dentry, &realpath); + + if (next != -1) { + err = ovl_dir_read(&realpath, &rdd); + if (err) + break; + } else { + /* + * Insert lowest layer entries before upper ones, this + * allows offsets to be reasonably constant + */ + list_add(&rdd.middle, rdd.list); + rdd.is_merge = true; + err = ovl_dir_read(&realpath, &rdd); + list_del(&rdd.middle); + } + } + return err; +} + +static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) +{ + struct list_head *p; + loff_t off = 0; + + list_for_each(p, &od->cache->entries) { + if (off >= pos) + break; + off++; + } + /* Cursor is safe since the cache is stable */ + od->cursor = p; +} + +static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) +{ + int res; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(dentry); + if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache->refcount++; + return cache; + } + ovl_set_dir_cache(dentry, NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + cache->refcount = 1; + INIT_LIST_HEAD(&cache->entries); + + res = ovl_dir_read_merged(dentry, &cache->entries); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(dentry, cache); + + return cache; +} + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct ovl_cache_entry *p; + + if (!ctx->pos) + ovl_dir_reset(file); + + if (od->is_real) + return iterate_dir(od->realfile, ctx); + + if (!od->cache) { + struct ovl_dir_cache *cache; + + cache = ovl_cache_get(dentry); + if (IS_ERR(cache)) + return PTR_ERR(cache); + + od->cache = cache; + ovl_seek_cursor(od, ctx->pos); + } + + while (od->cursor != &od->cache->entries) { + p = list_entry(od->cursor, struct ovl_cache_entry, l_node); + if (!p->is_whiteout) + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + od->cursor = p->l_node.next; + ctx->pos++; + } + return 0; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + mutex_lock(&file_inode(file)->i_mutex); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else { + res = -EINVAL; + + switch (origin) { + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_SET: + break; + default: + goto out_unlock; + } + if (offset < 0) + goto out_unlock; + + if (offset != file->f_pos) { + file->f_pos = offset; + if (od->cache) + ovl_seek_cursor(od, offset); + } + res = offset; + } +out_unlock: + mutex_unlock(&file_inode(file)->i_mutex); + + return res; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct file *realfile = od->realfile; + + /* + * Need to check if we started out being a lower dir, but got copied up + */ + if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { + struct inode *inode = file_inode(file); + + realfile = lockless_dereference(od->upperfile); + if (!realfile) { + struct path upperpath; + + ovl_path_upper(dentry, &upperpath); + realfile = ovl_path_open(&upperpath, O_RDONLY); + smp_mb__before_spinlock(); + mutex_lock(&inode->i_mutex); + if (!od->upperfile) { + if (IS_ERR(realfile)) { + mutex_unlock(&inode->i_mutex); + return PTR_ERR(realfile); + } + od->upperfile = realfile; + } else { + /* somebody has beaten us to it */ + if (!IS_ERR(realfile)) + fput(realfile); + realfile = od->upperfile; + } + mutex_unlock(&inode->i_mutex); + } + } + + return vfs_fsync_range(realfile, start, end, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + if (od->cache) { + mutex_lock(&inode->i_mutex); + ovl_cache_put(od, file->f_path.dentry); + mutex_unlock(&inode->i_mutex); + } + fput(od->realfile); + if (od->upperfile) + fput(od->upperfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = ovl_path_open(&realpath, file->f_flags); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + od->realfile = realfile; + od->is_real = !OVL_TYPE_MERGE(type); + od->is_upper = OVL_TYPE_UPPER(type); + file->private_data = od; + + return 0; +} + +const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .iterate = ovl_iterate, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) +{ + int err; + struct ovl_cache_entry *p; + + err = ovl_dir_read_merged(dentry, list); + if (err) + return err; + + err = 0; + + list_for_each_entry(p, list, l_node) { + if (p->is_whiteout) + continue; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + err = -ENOTEMPTY; + break; + } + + return err; +} + +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) +{ + struct ovl_cache_entry *p; + + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); + list_for_each_entry(p, list, l_node) { + struct dentry *dentry; + + if (!p->is_whiteout) + continue; + + dentry = lookup_one_len(p->name, upper, p->len); + if (IS_ERR(dentry)) { + pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + upper->d_name.name, p->len, p->name, + (int) PTR_ERR(dentry)); + continue; + } + ovl_cleanup(upper->d_inode, dentry); + dput(dentry); + } + mutex_unlock(&upper->d_inode->i_mutex); +} diff --git a/kernel/fs/overlayfs/super.c b/kernel/fs/overlayfs/super.c new file mode 100644 index 000000000..bf8537c7f --- /dev/null +++ b/kernel/fs/overlayfs/super.c @@ -0,0 +1,1046 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +MODULE_AUTHOR("Miklos Szeredi "); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; +}; + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + struct vfsmount *upper_mnt; + unsigned numlower; + struct vfsmount **lower_mnt; + struct dentry *workdir; + long lower_namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; +}; + +struct ovl_dir_cache; + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + struct dentry *__upperdentry; + struct ovl_dir_cache *cache; + union { + struct { + u64 version; + bool opaque; + }; + struct rcu_head rcu; + }; + unsigned numlower; + struct path lowerstack[]; +}; + +#define OVL_MAX_STACK 500 + +static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) +{ + return oe->numlower ? oe->lowerstack[0].dentry : NULL; +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + enum ovl_path_type type = 0; + + if (oe->__upperdentry) { + type = __OVL_PATH_UPPER; + + if (oe->numlower) { + if (S_ISDIR(dentry->d_inode->i_mode)) + type |= __OVL_PATH_MERGE; + } else if (!oe->opaque) { + type |= __OVL_PATH_PURE; + } + } else { + if (oe->numlower > 1) + type |= __OVL_PATH_MERGE; + } + return type; +} + +static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + return lockless_dereference(oe->__upperdentry); +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + if (!OVL_TYPE_UPPER(type)) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return __ovl_dentry_lower(oe); +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = __ovl_dentry_lower(oe); + + return realdentry; +} + +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) +{ + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (realdentry) { + *is_upper = true; + } else { + realdentry = __ovl_dentry_lower(oe); + *is_upper = false; + } + return realdentry; +} + +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->cache; +} + +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->cache = cache; +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; +} + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ofs->upper_mnt); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ofs->upper_mnt); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) +{ + struct ovl_entry *oe = dentry->d_fsdata; + oe->opaque = opaque; +} + +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(oe->__upperdentry); + BUG_ON(!upperdentry->d_inode); + /* + * Make sure upperdentry is consistent before making it visible to + * ovl_upperdentry_dereference(). + */ + smp_wmb(); + oe->__upperdentry = upperdentry; +} + +void ovl_dentry_version_inc(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + oe->version++; +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + return oe->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + struct inode *inode = dentry->d_inode; + + if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) + return false; + + res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + unsigned int i; + + dput(oe->__upperdentry); + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); + kfree_rcu(oe, rcu); + } +} + +static const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, +}; + +static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) +{ + size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + + if (oe) + oe->numlower = numlower; + + return oe; +} + +static inline struct dentry *ovl_lookup_real(struct dentry *dir, + struct qstr *name) +{ + struct dentry *dentry; + + mutex_lock(&dir->d_inode->i_mutex); + dentry = lookup_one_len(name->name, dir, name->len); + mutex_unlock(&dir->d_inode->i_mutex); + + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) == -ENOENT) + dentry = NULL; + } else if (!dentry->d_inode) { + dput(dentry); + dentry = NULL; + } + return dentry; +} + +/* + * Returns next layer in stack starting from top. + * Returns -1 if this is the last layer. + */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + BUG_ON(idx < 0); + if (idx == 0) { + ovl_path_upper(dentry, path); + if (path->dentry) + return oe->numlower ? 1 : -1; + idx++; + } + BUG_ON(idx > oe->numlower); + *path = oe->lowerstack[idx - 1]; + + return (idx < oe->numlower) ? idx + 1 : -1; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct path *stack = NULL; + struct dentry *upperdir, *upperdentry = NULL; + unsigned int ctr = 0; + struct inode *inode = NULL; + bool upperopaque = false; + struct dentry *this, *prev = NULL; + unsigned int i; + int err; + + upperdir = ovl_upperdentry_dereference(poe); + if (upperdir) { + this = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(this); + if (IS_ERR(this)) + goto out; + + if (this) { + if (ovl_is_whiteout(this)) { + dput(this); + this = NULL; + upperopaque = true; + } else if (poe->numlower && ovl_is_opaquedir(this)) { + upperopaque = true; + } + } + upperdentry = prev = this; + } + + if (!upperopaque && poe->numlower) { + err = -ENOMEM; + stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_put_upper; + } + + for (i = 0; !upperopaque && i < poe->numlower; i++) { + bool opaque = false; + struct path lowerpath = poe->lowerstack[i]; + + this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); + err = PTR_ERR(this); + if (IS_ERR(this)) { + /* + * If it's positive, then treat ENAMETOOLONG as ENOENT. + */ + if (err == -ENAMETOOLONG && (upperdentry || ctr)) + continue; + goto out_put; + } + if (!this) + continue; + if (ovl_is_whiteout(this)) { + dput(this); + break; + } + /* + * Only makes sense to check opaque dir if this is not the + * lowermost layer. + */ + if (i < poe->numlower - 1 && ovl_is_opaquedir(this)) + opaque = true; + + if (prev && (!S_ISDIR(prev->d_inode->i_mode) || + !S_ISDIR(this->d_inode->i_mode))) { + /* + * FIXME: check for upper-opaqueness maybe better done + * in remove code. + */ + if (prev == upperdentry) + upperopaque = true; + dput(this); + break; + } + /* + * If this is a non-directory then stop here. + */ + if (!S_ISDIR(this->d_inode->i_mode)) + opaque = true; + + stack[ctr].dentry = this; + stack[ctr].mnt = lowerpath.mnt; + ctr++; + prev = this; + if (opaque) + break; + } + + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; + + if (upperdentry || ctr) { + struct dentry *realdentry; + + realdentry = upperdentry ? upperdentry : stack[0].dentry; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, + oe); + if (!inode) + goto out_free_oe; + ovl_copyattr(realdentry->d_inode, inode); + } + + oe->opaque = upperopaque; + oe->__upperdentry = upperdentry; + memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + kfree(stack); + dentry->d_fsdata = oe; + d_add(dentry, inode); + + return NULL; + +out_free_oe: + kfree(oe); +out_put: + for (i = 0; i < ctr; i++) + dput(stack[i].dentry); + kfree(stack); +out_put_upper: + dput(upperdentry); +out: + return ERR_PTR(err); +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + return dentry_open(path, flags, current_cred()); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ufs = sb->s_fs_info; + unsigned i; + + dput(ufs->workdir); + mntput(ufs->upper_mnt); + for (i = 0; i < ufs->numlower; i++) + mntput(ufs->lower_mnt[i]); + + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); + kfree(ufs); +} + +/** + * ovl_statfs + * @sb: The overlayfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. As writes always target the upper layer + * filesystem pass the statfs to the upper filesystem (if it exists) + */ +static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct dentry *root_dentry = dentry->d_sb->s_root; + struct path path; + int err; + + ovl_path_real(root_dentry, &path); + + err = vfs_statfs(&path, buf); + if (!err) { + buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen); + buf->f_type = OVERLAYFS_SUPER_MAGIC; + } + + return err; +} + +/** + * ovl_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ufs = sb->s_fs_info; + + seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + if (ufs->config.upperdir) { + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + seq_printf(m, ",workdir=%s", ufs->config.workdir); + } + return 0; +} + +static int ovl_remount(struct super_block *sb, int *flags, char *data) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + if (!(*flags & MS_RDONLY) && (!ufs->upper_mnt || !ufs->workdir)) + return -EROFS; + + return 0; +} + +static const struct super_operations ovl_super_operations = { + .put_super = ovl_put_super, + .statfs = ovl_statfs, + .show_options = ovl_show_options, + .remount_fs = ovl_remount, +}; + +enum { + OPT_LOWERDIR, + OPT_UPPERDIR, + OPT_WORKDIR, + OPT_ERR, +}; + +static const match_table_t ovl_tokens = { + {OPT_LOWERDIR, "lowerdir=%s"}, + {OPT_UPPERDIR, "upperdir=%s"}, + {OPT_WORKDIR, "workdir=%s"}, + {OPT_ERR, NULL} +}; + +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + + while ((p = ovl_next_opt(&opt)) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case OPT_UPPERDIR: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case OPT_LOWERDIR: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + case OPT_WORKDIR: + kfree(config->workdir); + config->workdir = match_strdup(&args[0]); + if (!config->workdir) + return -ENOMEM; + break; + + default: + pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); + return -EINVAL; + } + } + + /* Workdir is useless in non-upper mount */ + if (!config->upperdir && config->workdir) { + pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + + return 0; +} + +#define OVL_WORKDIR_NAME "work" + +static struct dentry *ovl_workdir_create(struct vfsmount *mnt, + struct dentry *dentry) +{ + struct inode *dir = dentry->d_inode; + struct dentry *work; + int err; + bool retried = false; + + err = mnt_want_write(mnt); + if (err) + return ERR_PTR(err); + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); +retry: + work = lookup_one_len(OVL_WORKDIR_NAME, dentry, + strlen(OVL_WORKDIR_NAME)); + + if (!IS_ERR(work)) { + struct kstat stat = { + .mode = S_IFDIR | 0, + }; + + if (work->d_inode) { + err = -EEXIST; + if (retried) + goto out_dput; + + retried = true; + ovl_cleanup(dir, work); + dput(work); + goto retry; + } + + err = ovl_create_real(dir, work, &stat, NULL, NULL, true); + if (err) + goto out_dput; + } +out_unlock: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(mnt); + + return work; + +out_dput: + dput(work); + work = ERR_PTR(err); + goto out_unlock; +} + +static void ovl_unescape(char *s) +{ + char *d = s; + + for (;; s++, d++) { + if (*s == '\\') + s++; + *d = *s; + if (!*s) + break; + } +} + +static bool ovl_is_allowed_fs_type(struct dentry *root) +{ + const struct dentry_operations *dop = root->d_op; + + /* + * We don't support: + * - automount filesystems + * - filesystems with revalidate (FIXME for lower layer) + * - filesystems with case insensitive names + */ + if (dop && + (dop->d_manage || dop->d_automount || + dop->d_revalidate || dop->d_weak_revalidate || + dop->d_compare || dop->d_hash)) { + return false; + } + return true; +} + +static int ovl_mount_dir_noesc(const char *name, struct path *path) +{ + int err = -EINVAL; + + if (!*name) { + pr_err("overlayfs: empty lowerdir\n"); + goto out; + } + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + goto out; + } + err = -EINVAL; + if (!ovl_is_allowed_fs_type(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported\n", name); + goto out_put; + } + if (!S_ISDIR(path->dentry->d_inode->i_mode)) { + pr_err("overlayfs: '%s' not a directory\n", name); + goto out_put; + } + return 0; + +out_put: + path_put(path); +out: + return err; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err = -ENOMEM; + char *tmp = kstrdup(name, GFP_KERNEL); + + if (tmp) { + ovl_unescape(tmp); + err = ovl_mount_dir_noesc(tmp, path); + kfree(tmp); + } + return err; +} + +static int ovl_lower_dir(const char *name, struct path *path, long *namelen, + int *stack_depth) +{ + int err; + struct kstatfs statfs; + + err = ovl_mount_dir_noesc(name, path); + if (err) + goto out; + + err = vfs_statfs(path, &statfs); + if (err) { + pr_err("overlayfs: statfs failed on '%s'\n", name); + goto out_put; + } + *namelen = max(*namelen, statfs.f_namelen); + *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + + return 0; + +out_put: + path_put(path); +out: + return err; +} + +/* Workdir should not be subdir of upperdir and vice versa */ +static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) +{ + bool ok = false; + + if (workdir != upperdir) { + ok = (lock_rename(workdir, upperdir) == NULL); + unlock_rename(workdir, upperdir); + } + return ok; +} + +static unsigned int ovl_split_lowerdirs(char *str) +{ + unsigned int ctr = 1; + char *s, *d; + + for (s = d = str;; s++, d++) { + if (*s == '\\') { + s++; + } else if (*s == ':') { + *d = '\0'; + ctr++; + continue; + } + *d = *s; + if (!*s) + break; + } + return ctr; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path upperpath = { NULL, NULL }; + struct path workpath = { NULL, NULL }; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ufs; + struct path *stack = NULL; + char *lowertmp; + char *lower; + unsigned int numlower; + unsigned int stacklen = 0; + unsigned int i; + int err; + + err = -ENOMEM; + ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) + goto out; + + err = ovl_parse_opt((char *) data, &ufs->config); + if (err) + goto out_free_config; + + err = -EINVAL; + if (!ufs->config.lowerdir) { + pr_err("overlayfs: missing 'lowerdir'\n"); + goto out_free_config; + } + + sb->s_stack_depth = 0; + if (ufs->config.upperdir) { + if (!ufs->config.workdir) { + pr_err("overlayfs: missing 'workdir'\n"); + goto out_free_config; + } + + err = ovl_mount_dir(ufs->config.upperdir, &upperpath); + if (err) + goto out_free_config; + + /* Upper fs should not be r/o */ + if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out_put_upperpath; + } + + err = ovl_mount_dir(ufs->config.workdir, &workpath); + if (err) + goto out_put_upperpath; + + err = -EINVAL; + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; + } + err = -ENOMEM; + lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL); + if (!lowertmp) + goto out_put_workpath; + + err = -EINVAL; + stacklen = ovl_split_lowerdirs(lowertmp); + if (stacklen > OVL_MAX_STACK) { + pr_err("overlayfs: too many lower directries, limit is %d\n", + OVL_MAX_STACK); + goto out_free_lowertmp; + } else if (!ufs->config.upperdir && stacklen == 1) { + pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + goto out_free_lowertmp; + } + + stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_free_lowertmp; + + lower = lowertmp; + for (numlower = 0; numlower < stacklen; numlower++) { + err = ovl_lower_dir(lower, &stack[numlower], + &ufs->lower_namelen, &sb->s_stack_depth); + if (err) + goto out_put_lowerpath; + + lower = strchr(lower, '\0') + 1; + } + + err = -EINVAL; + sb->s_stack_depth++; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + goto out_put_lowerpath; + } + + if (ufs->config.upperdir) { + ufs->upper_mnt = clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out_put_lowerpath; + } + + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", + ufs->config.workdir, OVL_WORKDIR_NAME, -err); + sb->s_flags |= MS_RDONLY; + ufs->workdir = NULL; + } + } + + err = -ENOMEM; + ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL); + if (ufs->lower_mnt == NULL) + goto out_put_workdir; + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt = clone_private_mount(&stack[i]); + + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_lower_mnt; + } + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY; + + ufs->lower_mnt[ufs->numlower] = mnt; + ufs->numlower++; + } + + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ufs->upper_mnt) + sb->s_flags |= MS_RDONLY; + + sb->s_d_op = &ovl_dentry_operations; + + err = -ENOMEM; + oe = ovl_alloc_entry(numlower); + if (!oe) + goto out_put_lower_mnt; + + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); + if (!root_dentry) + goto out_free_oe; + + mntput(upperpath.mnt); + for (i = 0; i < numlower; i++) + mntput(stack[i].mnt); + path_put(&workpath); + kfree(lowertmp); + + oe->__upperdentry = upperpath.dentry; + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = stack[i].dentry; + oe->lowerstack[i].mnt = ufs->lower_mnt[i]; + } + + root_dentry->d_fsdata = oe; + + sb->s_magic = OVERLAYFS_SUPER_MAGIC; + sb->s_op = &ovl_super_operations; + sb->s_root = root_dentry; + sb->s_fs_info = ufs; + + return 0; + +out_free_oe: + kfree(oe); +out_put_lower_mnt: + for (i = 0; i < ufs->numlower; i++) + mntput(ufs->lower_mnt[i]); + kfree(ufs->lower_mnt); +out_put_workdir: + dput(ufs->workdir); + mntput(ufs->upper_mnt); +out_put_lowerpath: + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); +out_free_lowertmp: + kfree(lowertmp); +out_put_workpath: + path_put(&workpath); +out_put_upperpath: + path_put(&upperpath); +out_free_config: + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); + kfree(ufs); +out: + return err; +} + +static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlay", + .mount = ovl_mount, + .kill_sb = kill_anon_super, +}; +MODULE_ALIAS_FS("overlay"); + +static int __init ovl_init(void) +{ + return register_filesystem(&ovl_fs_type); +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); +} + +module_init(ovl_init); +module_exit(ovl_exit); diff --git a/kernel/fs/pipe.c b/kernel/fs/pipe.c new file mode 100644 index 000000000..8865f7963 --- /dev/null +++ b/kernel/fs/pipe.c @@ -0,0 +1,1125 @@ +/* + * linux/fs/pipe.c + * + * Copyright (C) 1991, 1992, 1999 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +/* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-size + */ +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; + +/* + * We use a start+len construction, which provides full use of the + * allocated memory. + * -- Florian Coosmann (FGC) + * + * Reads with count = 0 should always return 0. + * -- Julian Bradfield 1999-06-07. + * + * FIFOs and Pipes now generate SIGIO for both readers and writers. + * -- Jeremy Elson 2001-08-16 + * + * pipe_read & write cleanup + * -- Manfred Spraul 2002-05-09 + */ + +static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +{ + if (pipe->files) + mutex_lock_nested(&pipe->mutex, subclass); +} + +void pipe_lock(struct pipe_inode_info *pipe) +{ + /* + * pipe_lock() nests non-pipe inode locks (for writing to a file) + */ + pipe_lock_nested(pipe, I_MUTEX_PARENT); +} +EXPORT_SYMBOL(pipe_lock); + +void pipe_unlock(struct pipe_inode_info *pipe) +{ + if (pipe->files) + mutex_unlock(&pipe->mutex); +} +EXPORT_SYMBOL(pipe_unlock); + +static inline void __pipe_lock(struct pipe_inode_info *pipe) +{ + mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); +} + +static inline void __pipe_unlock(struct pipe_inode_info *pipe) +{ + mutex_unlock(&pipe->mutex); +} + +void pipe_double_lock(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2) +{ + BUG_ON(pipe1 == pipe2); + + if (pipe1 < pipe2) { + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + } else { + pipe_lock_nested(pipe2, I_MUTEX_PARENT); + pipe_lock_nested(pipe1, I_MUTEX_CHILD); + } +} + +/* Drop the inode semaphore and wait for a pipe event, atomically */ +void pipe_wait(struct pipe_inode_info *pipe) +{ + DEFINE_WAIT(wait); + + /* + * Pipes are system-local resources, so sleeping on them + * is considered a noninteractive wait: + */ + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->wait, &wait); + pipe_lock(pipe); +} + +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * If nobody else uses this page, and we don't already have a + * temporary page, let's keep track of it as a one-deep + * allocation cache. (Otherwise just release our reference to it) + */ + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); +} + +/** + * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to attempt to steal + * + * Description: + * This function attempts to steal the &struct page attached to + * @buf. If successful, this function returns 0 and returns with + * the page locked. The caller may then reuse the page for whatever + * he wishes; the typical use is insertion into a different file + * page cache. + */ +int generic_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * A reference of one is golden, that means that the owner of this + * page is the only one holding a reference to it. lock the page + * and return OK. + */ + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(generic_pipe_buf_steal); + +/** + * generic_pipe_buf_get - get a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to get a reference to + * + * Description: + * This function grabs an extra reference to @buf. It's used in + * in the tee() system call, when we duplicate the buffers in one + * pipe into another. + */ +void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_get); + +/** + * generic_pipe_buf_confirm - verify contents of the pipe buffer + * @info: the pipe that the buffer belongs to + * @buf: the buffer to confirm + * + * Description: + * This function does nothing, because the generic pipe code uses + * pages that are always good when inserted into the pipe. + */ +int generic_pipe_buf_confirm(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + return 0; +} +EXPORT_SYMBOL(generic_pipe_buf_confirm); + +/** + * generic_pipe_buf_release - put a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to put a reference to + * + * Description: + * This function releases a reference to @buf. + */ +void generic_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_release); + +static const struct pipe_buf_operations anon_pipe_buf_ops = { + .can_merge = 1, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static const struct pipe_buf_operations packet_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t +pipe_read(struct kiocb *iocb, struct iov_iter *to) +{ + size_t total_len = iov_iter_count(to); + struct file *filp = iocb->ki_filp; + struct pipe_inode_info *pipe = filp->private_data; + int do_wakeup; + ssize_t ret; + + /* Null read succeeds. */ + if (unlikely(total_len == 0)) + return 0; + + do_wakeup = 0; + ret = 0; + __pipe_lock(pipe); + for (;;) { + int bufs = pipe->nrbufs; + if (bufs) { + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; + const struct pipe_buf_operations *ops = buf->ops; + size_t chars = buf->len; + size_t written; + int error; + + if (chars > total_len) + chars = total_len; + + error = ops->confirm(pipe, buf); + if (error) { + if (!ret) + ret = error; + break; + } + + written = copy_page_to_iter(buf->page, buf->offset, chars, to); + if (unlikely(written < chars)) { + if (!ret) + ret = -EFAULT; + break; + } + ret += chars; + buf->offset += chars; + buf->len -= chars; + + /* Was it a packet buffer? Clean up and exit */ + if (buf->flags & PIPE_BUF_FLAG_PACKET) { + total_len = chars; + buf->len = 0; + } + + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + curbuf = (curbuf + 1) & (pipe->buffers - 1); + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; + do_wakeup = 1; + } + total_len -= chars; + if (!total_len) + break; /* common path: read succeeded */ + } + if (bufs) /* More to do? */ + continue; + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + /* syscall merging: Usually we must not sleep + * if O_NONBLOCK is set, or if we got some data. + * But if a writer sleeps in kernel space, then + * we can wait for that data without violating POSIX. + */ + if (ret) + break; + if (filp->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + pipe_wait(pipe); + } + __pipe_unlock(pipe); + + /* Signal writers asynchronously that there is more room. */ + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + if (ret > 0) + file_accessed(filp); + return ret; +} + +static inline int is_packetized(struct file *file) +{ + return (file->f_flags & O_DIRECT) != 0; +} + +static ssize_t +pipe_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *filp = iocb->ki_filp; + struct pipe_inode_info *pipe = filp->private_data; + ssize_t ret = 0; + int do_wakeup = 0; + size_t total_len = iov_iter_count(from); + ssize_t chars; + + /* Null write succeeds. */ + if (unlikely(total_len == 0)) + return 0; + + __pipe_lock(pipe); + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + goto out; + } + + /* We try to merge small writes */ + chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; + const struct pipe_buf_operations *ops = buf->ops; + int offset = buf->offset + buf->len; + + if (ops->can_merge && offset + chars <= PAGE_SIZE) { + int error = ops->confirm(pipe, buf); + if (error) + goto out; + + ret = copy_page_from_iter(buf->page, offset, chars, from); + if (unlikely(ret < chars)) { + error = -EFAULT; + goto out; + } + do_wakeup = 1; + buf->len += chars; + ret = chars; + if (!iov_iter_count(from)) + goto out; + } + } + + for (;;) { + int bufs; + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + bufs = pipe->nrbufs; + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; + int copied; + + if (!page) { + page = alloc_page(GFP_HIGHUSER); + if (unlikely(!page)) { + ret = ret ? : -ENOMEM; + break; + } + pipe->tmp_page = page; + } + /* Always wake up, even if the copy fails. Otherwise + * we lock up (O_NONBLOCK-)readers that sleep due to + * syscall merging. + * FIXME! Is this really true? + */ + do_wakeup = 1; + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + if (!ret) + ret = -EFAULT; + break; + } + ret += copied; + + /* Insert it into the buffer array */ + buf->page = page; + buf->ops = &anon_pipe_buf_ops; + buf->offset = 0; + buf->len = copied; + buf->flags = 0; + if (is_packetized(filp)) { + buf->ops = &packet_pipe_buf_ops; + buf->flags = PIPE_BUF_FLAG_PACKET; + } + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; + + if (!iov_iter_count(from)) + break; + } + if (bufs < pipe->buffers) + continue; + if (filp->f_flags & O_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } +out: + __pipe_unlock(pipe); + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { + int err = file_update_time(filp); + if (err) + ret = err; + sb_end_write(file_inode(filp)->i_sb); + } + return ret; +} + +static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe = filp->private_data; + int count, buf, nrbufs; + + switch (cmd) { + case FIONREAD: + __pipe_lock(pipe); + count = 0; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; + while (--nrbufs >= 0) { + count += pipe->bufs[buf].len; + buf = (buf+1) & (pipe->buffers - 1); + } + __pipe_unlock(pipe); + + return put_user(count, (int __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + +/* No kernel lock held - fine */ +static unsigned int +pipe_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask; + struct pipe_inode_info *pipe = filp->private_data; + int nrbufs; + + poll_wait(filp, &pipe->wait, wait); + + /* Reading only -- no need for acquiring the semaphore. */ + nrbufs = pipe->nrbufs; + mask = 0; + if (filp->f_mode & FMODE_READ) { + mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; + if (!pipe->writers && filp->f_version != pipe->w_counter) + mask |= POLLHUP; + } + + if (filp->f_mode & FMODE_WRITE) { + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; + /* + * Most Unices do not set POLLERR for FIFOs but on Linux they + * behave exactly like pipes for poll(). + */ + if (!pipe->readers) + mask |= POLLERR; + } + + return mask; +} + +static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) +{ + int kill = 0; + + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + + if (kill) + free_pipe_info(pipe); +} + +static int +pipe_release(struct inode *inode, struct file *file) +{ + struct pipe_inode_info *pipe = file->private_data; + + __pipe_lock(pipe); + if (file->f_mode & FMODE_READ) + pipe->readers--; + if (file->f_mode & FMODE_WRITE) + pipe->writers--; + + if (pipe->readers || pipe->writers) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + __pipe_unlock(pipe); + + put_pipe_info(inode, pipe); + return 0; +} + +static int +pipe_fasync(int fd, struct file *filp, int on) +{ + struct pipe_inode_info *pipe = filp->private_data; + int retval = 0; + + __pipe_lock(pipe); + if (filp->f_mode & FMODE_READ) + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); + if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); + if (retval < 0 && (filp->f_mode & FMODE_READ)) + /* this can happen only if on == T */ + fasync_helper(-1, filp, 0, &pipe->fasync_readers); + } + __pipe_unlock(pipe); + return retval; +} + +struct pipe_inode_info *alloc_pipe_info(void) +{ + struct pipe_inode_info *pipe; + + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->buffers = PIPE_DEF_BUFFERS; + mutex_init(&pipe->mutex); + return pipe; + } + kfree(pipe); + } + + return NULL; +} + +void free_pipe_info(struct pipe_inode_info *pipe) +{ + int i; + + for (i = 0; i < pipe->buffers; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + if (buf->ops) + buf->ops->release(pipe, buf); + } + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe->bufs); + kfree(pipe); +} + +static struct vfsmount *pipe_mnt __read_mostly; + +/* + * pipefs_dname() is called from d_path(). + */ +static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + d_inode(dentry)->i_ino); +} + +static const struct dentry_operations pipefs_dentry_operations = { + .d_dname = pipefs_dname, +}; + +static struct inode * get_pipe_inode(void) +{ + struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; + + if (!inode) + goto fail_inode; + + inode->i_ino = get_next_ino(); + + pipe = alloc_pipe_info(); + if (!pipe) + goto fail_iput; + + inode->i_pipe = pipe; + pipe->files = 2; + pipe->readers = pipe->writers = 1; + inode->i_fop = &pipefifo_fops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because "mark_inode_dirty()" will think + * that it already _is_ on the dirty list. + */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + return inode; + +fail_iput: + iput(inode); + +fail_inode: + return NULL; +} + +int create_pipe_files(struct file **res, int flags) +{ + int err; + struct inode *inode = get_pipe_inode(); + struct file *f; + struct path path; + static struct qstr name = { .name = "" }; + + if (!inode) + return -ENFILE; + + err = -ENOMEM; + path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); + if (!path.dentry) + goto err_inode; + path.mnt = mntget(pipe_mnt); + + d_instantiate(path.dentry, inode); + + err = -ENFILE; + f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops); + if (IS_ERR(f)) + goto err_dentry; + + f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); + f->private_data = inode->i_pipe; + + res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops); + if (IS_ERR(res[0])) + goto err_file; + + path_get(&path); + res[0]->private_data = inode->i_pipe; + res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); + res[1] = f; + return 0; + +err_file: + put_filp(f); +err_dentry: + free_pipe_info(inode->i_pipe); + path_put(&path); + return err; + +err_inode: + free_pipe_info(inode->i_pipe); + iput(inode); + return err; +} + +static int __do_pipe_flags(int *fd, struct file **files, int flags) +{ + int error; + int fdw, fdr; + + if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) + return -EINVAL; + + error = create_pipe_files(files, flags); + if (error) + return error; + + error = get_unused_fd_flags(flags); + if (error < 0) + goto err_read_pipe; + fdr = error; + + error = get_unused_fd_flags(flags); + if (error < 0) + goto err_fdr; + fdw = error; + + audit_fd_pair(fdr, fdw); + fd[0] = fdr; + fd[1] = fdw; + return 0; + + err_fdr: + put_unused_fd(fdr); + err_read_pipe: + fput(files[0]); + fput(files[1]); + return error; +} + +int do_pipe_flags(int *fd, int flags) +{ + struct file *files[2]; + int error = __do_pipe_flags(fd, files, flags); + if (!error) { + fd_install(fd[0], files[0]); + fd_install(fd[1], files[1]); + } + return error; +} + +/* + * sys_pipe() is the normal C calling standard for creating + * a pipe. It's not the way Unix traditionally does this, though. + */ +SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) +{ + struct file *files[2]; + int fd[2]; + int error; + + error = __do_pipe_flags(fd, files, flags); + if (!error) { + if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { + fput(files[0]); + fput(files[1]); + put_unused_fd(fd[0]); + put_unused_fd(fd[1]); + error = -EFAULT; + } else { + fd_install(fd[0], files[0]); + fd_install(fd[1], files[1]); + } + } + return error; +} + +SYSCALL_DEFINE1(pipe, int __user *, fildes) +{ + return sys_pipe2(fildes, 0); +} + +static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) +{ + int cur = *cnt; + + while (cur == *cnt) { + pipe_wait(pipe); + if (signal_pending(current)) + break; + } + return cur == *cnt ? -ERESTARTSYS : 0; +} + +static void wake_up_partner(struct pipe_inode_info *pipe) +{ + wake_up_interruptible(&pipe->wait); +} + +static int fifo_open(struct inode *inode, struct file *filp) +{ + struct pipe_inode_info *pipe; + bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; + int ret; + + filp->f_version = 0; + + spin_lock(&inode->i_lock); + if (inode->i_pipe) { + pipe = inode->i_pipe; + pipe->files++; + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); + pipe = alloc_pipe_info(); + if (!pipe) + return -ENOMEM; + pipe->files = 1; + spin_lock(&inode->i_lock); + if (unlikely(inode->i_pipe)) { + inode->i_pipe->files++; + spin_unlock(&inode->i_lock); + free_pipe_info(pipe); + pipe = inode->i_pipe; + } else { + inode->i_pipe = pipe; + spin_unlock(&inode->i_lock); + } + } + filp->private_data = pipe; + /* OK, we have a pipe and it's pinned down */ + + __pipe_lock(pipe); + + /* We can only do regular read/write on fifos */ + filp->f_mode &= (FMODE_READ | FMODE_WRITE); + + switch (filp->f_mode) { + case FMODE_READ: + /* + * O_RDONLY + * POSIX.1 says that O_NONBLOCK means return with the FIFO + * opened, even when there is no process writing the FIFO. + */ + pipe->r_counter++; + if (pipe->readers++ == 0) + wake_up_partner(pipe); + + if (!is_pipe && !pipe->writers) { + if ((filp->f_flags & O_NONBLOCK)) { + /* suppress POLLHUP until we have + * seen a writer */ + filp->f_version = pipe->w_counter; + } else { + if (wait_for_partner(pipe, &pipe->w_counter)) + goto err_rd; + } + } + break; + + case FMODE_WRITE: + /* + * O_WRONLY + * POSIX.1 says that O_NONBLOCK means return -1 with + * errno=ENXIO when there is no process reading the FIFO. + */ + ret = -ENXIO; + if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) + goto err; + + pipe->w_counter++; + if (!pipe->writers++) + wake_up_partner(pipe); + + if (!is_pipe && !pipe->readers) { + if (wait_for_partner(pipe, &pipe->r_counter)) + goto err_wr; + } + break; + + case FMODE_READ | FMODE_WRITE: + /* + * O_RDWR + * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. + * This implementation will NEVER block on a O_RDWR open, since + * the process can at least talk to itself. + */ + + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) + wake_up_partner(pipe); + break; + + default: + ret = -EINVAL; + goto err; + } + + /* Ok! */ + __pipe_unlock(pipe); + return 0; + +err_rd: + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err_wr: + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err: + __pipe_unlock(pipe); + + put_pipe_info(inode, pipe); + return ret; +} + +const struct file_operations pipefifo_fops = { + .open = fifo_open, + .llseek = no_llseek, + .read_iter = pipe_read, + .write_iter = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, +}; + +/* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) +{ + struct pipe_buffer *bufs; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. + */ + if (nr_pages < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + unsigned int tail; + unsigned int head; + + tail = pipe->curbuf + pipe->nrbufs; + if (tail < pipe->buffers) + tail = 0; + else + tail &= (pipe->buffers - 1); + + head = pipe->nrbufs - tail; + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = nr_pages; + return nr_pages * PAGE_SIZE; +} + +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; +} + +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +struct pipe_inode_info *get_pipe_info(struct file *file) +{ + return file->f_op == &pipefifo_fops ? file->private_data : NULL; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + __pipe_lock(pipe); + + switch (cmd) { + case F_SETPIPE_SZ: { + unsigned int size, nr_pages; + + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; + + ret = -EINVAL; + if (!nr_pages) + goto out; + + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { + ret = -EPERM; + goto out; + } + ret = pipe_set_size(pipe, nr_pages); + break; + } + case F_GETPIPE_SZ: + ret = pipe->buffers * PAGE_SIZE; + break; + default: + ret = -EINVAL; + break; + } + +out: + __pipe_unlock(pipe); + return ret; +} + +static const struct super_operations pipefs_ops = { + .destroy_inode = free_inode_nonrcu, + .statfs = simple_statfs, +}; + +/* + * pipefs should _never_ be mounted by userland - too much of security hassle, + * no real gain from having the whole whorehouse mounted. So we don't need + * any operations on the root directory. However, we need a non-trivial + * d_name - pipe: will go nicely and kill the special-casing in procfs. + */ +static struct dentry *pipefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "pipe:", &pipefs_ops, + &pipefs_dentry_operations, PIPEFS_MAGIC); +} + +static struct file_system_type pipe_fs_type = { + .name = "pipefs", + .mount = pipefs_mount, + .kill_sb = kill_anon_super, +}; + +static int __init init_pipe_fs(void) +{ + int err = register_filesystem(&pipe_fs_type); + + if (!err) { + pipe_mnt = kern_mount(&pipe_fs_type); + if (IS_ERR(pipe_mnt)) { + err = PTR_ERR(pipe_mnt); + unregister_filesystem(&pipe_fs_type); + } + } + return err; +} + +fs_initcall(init_pipe_fs); diff --git a/kernel/fs/pnode.c b/kernel/fs/pnode.c new file mode 100644 index 000000000..6367e1e43 --- /dev/null +++ b/kernel/fs/pnode.c @@ -0,0 +1,452 @@ +/* + * linux/fs/pnode.c + * + * (C) Copyright IBM Corporation 2005. + * Released under GPL v2. + * Author : Ram Pai (linuxram@us.ibm.com) + * + */ +#include +#include +#include +#include +#include "internal.h" +#include "pnode.h" + +/* return the next shared peer mount of @p */ +static inline struct mount *next_peer(struct mount *p) +{ + return list_entry(p->mnt_share.next, struct mount, mnt_share); +} + +static inline struct mount *first_slave(struct mount *p) +{ + return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); +} + +static inline struct mount *next_slave(struct mount *p) +{ + return list_entry(p->mnt_slave.next, struct mount, mnt_slave); +} + +static struct mount *get_peer_under_root(struct mount *mnt, + struct mnt_namespace *ns, + const struct path *root) +{ + struct mount *m = mnt; + + do { + /* Check the namespace first for optimization */ + if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root)) + return m; + + m = next_peer(m); + } while (m != mnt); + + return NULL; +} + +/* + * Get ID of closest dominating peer group having a representative + * under the given root. + * + * Caller must hold namespace_sem + */ +int get_dominating_id(struct mount *mnt, const struct path *root) +{ + struct mount *m; + + for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { + struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root); + if (d) + return d->mnt_group_id; + } + + return 0; +} + +static int do_make_slave(struct mount *mnt) +{ + struct mount *peer_mnt = mnt, *master = mnt->mnt_master; + struct mount *slave_mnt; + + /* + * slave 'mnt' to a peer mount that has the + * same root dentry. If none is available then + * slave it to anything that is available. + */ + while ((peer_mnt = next_peer(peer_mnt)) != mnt && + peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ; + + if (peer_mnt == mnt) { + peer_mnt = next_peer(mnt); + if (peer_mnt == mnt) + peer_mnt = NULL; + } + if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && + list_empty(&mnt->mnt_share)) + mnt_release_group_id(mnt); + + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; + + if (peer_mnt) + master = peer_mnt; + + if (master) { + list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) + slave_mnt->mnt_master = master; + list_move(&mnt->mnt_slave, &master->mnt_slave_list); + list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + } else { + struct list_head *p = &mnt->mnt_slave_list; + while (!list_empty(p)) { + slave_mnt = list_first_entry(p, + struct mount, mnt_slave); + list_del_init(&slave_mnt->mnt_slave); + slave_mnt->mnt_master = NULL; + } + } + mnt->mnt_master = master; + CLEAR_MNT_SHARED(mnt); + return 0; +} + +/* + * vfsmount lock must be held for write + */ +void change_mnt_propagation(struct mount *mnt, int type) +{ + if (type == MS_SHARED) { + set_mnt_shared(mnt); + return; + } + do_make_slave(mnt); + if (type != MS_SLAVE) { + list_del_init(&mnt->mnt_slave); + mnt->mnt_master = NULL; + if (type == MS_UNBINDABLE) + mnt->mnt.mnt_flags |= MNT_UNBINDABLE; + else + mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE; + } +} + +/* + * get the next mount in the propagation tree. + * @m: the mount seen last + * @origin: the original mount from where the tree walk initiated + * + * Note that peer groups form contiguous segments of slave lists. + * We rely on that in get_source() to be able to find out if + * vfsmount found while iterating with propagation_next() is + * a peer of one we'd found earlier. + */ +static struct mount *propagation_next(struct mount *m, + struct mount *origin) +{ + /* are there any slaves of this mount? */ + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + + while (1) { + struct mount *master = m->mnt_master; + + if (master == origin->mnt_master) { + struct mount *next = next_peer(m); + return (next == origin) ? NULL : next; + } else if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + + /* back at master */ + m = master; + } +} + +static struct mount *next_group(struct mount *m, struct mount *origin) +{ + while (1) { + while (1) { + struct mount *next; + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + next = next_peer(m); + if (m->mnt_group_id == origin->mnt_group_id) { + if (next == origin) + return NULL; + } else if (m->mnt_slave.next != &next->mnt_slave) + break; + m = next; + } + /* m is the last peer */ + while (1) { + struct mount *master = m->mnt_master; + if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + m = next_peer(master); + if (master->mnt_group_id == origin->mnt_group_id) + break; + if (master->mnt_slave.next == &m->mnt_slave) + break; + m = master; + } + if (m == origin) + return NULL; + } +} + +/* all accesses are serialized by namespace_sem */ +static struct user_namespace *user_ns; +static struct mount *last_dest, *last_source, *dest_master; +static struct mountpoint *mp; +static struct hlist_head *list; + +static int propagate_one(struct mount *m) +{ + struct mount *child; + int type; + /* skip ones added by this propagate_mnt() */ + if (IS_MNT_NEW(m)) + return 0; + /* skip if mountpoint isn't covered by it */ + if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) + return 0; + if (m->mnt_group_id == last_dest->mnt_group_id) { + type = CL_MAKE_SHARED; + } else { + struct mount *n, *p; + for (n = m; ; n = p) { + p = n->mnt_master; + if (p == dest_master || IS_MNT_MARKED(p)) { + while (last_dest->mnt_master != p) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + if (n->mnt_group_id != last_dest->mnt_group_id) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + break; + } + } + type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(m)) + type |= CL_MAKE_SHARED; + } + + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(last_source, last_source->mnt.mnt_root, type); + if (IS_ERR(child)) + return PTR_ERR(child); + child->mnt.mnt_flags &= ~MNT_LOCKED; + mnt_set_mountpoint(m, mp, child); + last_dest = m; + last_source = child; + if (m->mnt_master != dest_master) { + read_seqlock_excl(&mount_lock); + SET_MNT_MARK(m->mnt_master); + read_sequnlock_excl(&mount_lock); + } + hlist_add_head(&child->mnt_hash, list); + return 0; +} + +/* + * mount 'source_mnt' under the destination 'dest_mnt' at + * dentry 'dest_dentry'. And propagate that mount to + * all the peer and slave mounts of 'dest_mnt'. + * Link all the new mounts into a propagation tree headed at + * source_mnt. Also link all the new mounts using ->mnt_list + * headed at source_mnt's ->mnt_list + * + * @dest_mnt: destination mount. + * @dest_dentry: destination dentry. + * @source_mnt: source mount. + * @tree_list : list of heads of trees to be attached. + */ +int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, + struct mount *source_mnt, struct hlist_head *tree_list) +{ + struct mount *m, *n; + int ret = 0; + + /* + * we don't want to bother passing tons of arguments to + * propagate_one(); everything is serialized by namespace_sem, + * so globals will do just fine. + */ + user_ns = current->nsproxy->mnt_ns->user_ns; + last_dest = dest_mnt; + last_source = source_mnt; + mp = dest_mp; + list = tree_list; + dest_master = dest_mnt->mnt_master; + + /* all peers of dest_mnt, except dest_mnt itself */ + for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { + ret = propagate_one(n); + if (ret) + goto out; + } + + /* all slave groups */ + for (m = next_group(dest_mnt, dest_mnt); m; + m = next_group(m, dest_mnt)) { + /* everything in that slave group */ + n = m; + do { + ret = propagate_one(n); + if (ret) + goto out; + n = next_peer(n); + } while (n != m); + } +out: + read_seqlock_excl(&mount_lock); + hlist_for_each_entry(n, tree_list, mnt_hash) { + m = n->mnt_parent; + if (m->mnt_master != dest_mnt->mnt_master) + CLEAR_MNT_MARK(m->mnt_master); + } + read_sequnlock_excl(&mount_lock); + return ret; +} + +/* + * return true if the refcount is greater than count + */ +static inline int do_refcount_check(struct mount *mnt, int count) +{ + return mnt_get_count(mnt) > count; +} + +/* + * check if the mount 'mnt' can be unmounted successfully. + * @mnt: the mount to be checked for unmount + * NOTE: unmounting 'mnt' would naturally propagate to all + * other mounts its parent propagates to. + * Check if any of these mounts that **do not have submounts** + * have more references than 'refcnt'. If so return busy. + * + * vfsmount lock must be held for write + */ +int propagate_mount_busy(struct mount *mnt, int refcnt) +{ + struct mount *m, *child; + struct mount *parent = mnt->mnt_parent; + int ret = 0; + + if (mnt == parent) + return do_refcount_check(mnt, refcnt); + + /* + * quickly check if the current mount can be unmounted. + * If not, we don't have to go checking for all other + * mounts + */ + if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) + return 1; + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + if (child && list_empty(&child->mnt_mounts) && + (ret = do_refcount_check(child, 1))) + break; + } + return ret; +} + +/* + * Clear MNT_LOCKED when it can be shown to be safe. + * + * mount_lock lock must be held for write + */ +void propagate_mount_unlock(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m, *child; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + if (child) + child->mnt.mnt_flags &= ~MNT_LOCKED; + } +} + +/* + * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. + */ +static void mark_umount_candidates(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt_last(&m->mnt, + mnt->mnt_mountpoint); + if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { + SET_MNT_MARK(child); + } + } +} + +/* + * NOTE: unmounting 'mnt' naturally propagates to all other mounts its + * parent propagates to. + */ +static void __propagate_umount(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + + struct mount *child = __lookup_mnt_last(&m->mnt, + mnt->mnt_mountpoint); + /* + * umount the child only if the child has no children + * and the child is marked safe to unmount. + */ + if (!child || !IS_MNT_MARKED(child)) + continue; + CLEAR_MNT_MARK(child); + if (list_empty(&child->mnt_mounts)) { + list_del_init(&child->mnt_child); + child->mnt.mnt_flags |= MNT_UMOUNT; + list_move_tail(&child->mnt_list, &mnt->mnt_list); + } + } +} + +/* + * collect all mounts that receive propagation from the mount in @list, + * and return these additional mounts in the same list. + * @list: the list of mounts to be unmounted. + * + * vfsmount lock must be held for write + */ +int propagate_umount(struct list_head *list) +{ + struct mount *mnt; + + list_for_each_entry_reverse(mnt, list, mnt_list) + mark_umount_candidates(mnt); + + list_for_each_entry(mnt, list, mnt_list) + __propagate_umount(mnt); + return 0; +} diff --git a/kernel/fs/pnode.h b/kernel/fs/pnode.h new file mode 100644 index 000000000..7114ce6e6 --- /dev/null +++ b/kernel/fs/pnode.h @@ -0,0 +1,57 @@ +/* + * linux/fs/pnode.h + * + * (C) Copyright IBM Corporation 2005. + * Released under GPL v2. + * + */ +#ifndef _LINUX_PNODE_H +#define _LINUX_PNODE_H + +#include +#include "mount.h" + +#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) +#define IS_MNT_SLAVE(m) ((m)->mnt_master) +#define IS_MNT_NEW(m) (!(m)->mnt_ns) +#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) +#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) +#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) +#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) +#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) +#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) +#define IS_MNT_LOCKED_AND_LAZY(m) \ + (((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED) + +#define CL_EXPIRE 0x01 +#define CL_SLAVE 0x02 +#define CL_COPY_UNBINDABLE 0x04 +#define CL_MAKE_SHARED 0x08 +#define CL_PRIVATE 0x10 +#define CL_SHARED_TO_SLAVE 0x20 +#define CL_UNPRIVILEGED 0x40 +#define CL_COPY_MNT_NS_FILE 0x80 + +#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) + +static inline void set_mnt_shared(struct mount *mnt) +{ + mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; + mnt->mnt.mnt_flags |= MNT_SHARED; +} + +void change_mnt_propagation(struct mount *, int); +int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, + struct hlist_head *); +int propagate_umount(struct list_head *); +int propagate_mount_busy(struct mount *, int); +void propagate_mount_unlock(struct mount *); +void mnt_release_group_id(struct mount *); +int get_dominating_id(struct mount *mnt, const struct path *root); +unsigned int mnt_get_count(struct mount *mnt); +void mnt_set_mountpoint(struct mount *, struct mountpoint *, + struct mount *); +struct mount *copy_tree(struct mount *, struct dentry *, int); +bool is_path_reachable(struct mount *, struct dentry *, + const struct path *root); +#endif /* _LINUX_PNODE_H */ diff --git a/kernel/fs/posix_acl.c b/kernel/fs/posix_acl.c new file mode 100644 index 000000000..84bb65b83 --- /dev/null +++ b/kernel/fs/posix_acl.c @@ -0,0 +1,905 @@ +/* + * Copyright (C) 2002,2003 by Andreas Gruenbacher + * + * Fixes from William Schumacher incorporated on 15 March 2001. + * (Reported by Charles Bertsch, ). + */ + +/* + * This file contains generic functions for manipulating + * POSIX 1003.1e draft standard 17 ACLs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct posix_acl **acl_by_type(struct inode *inode, int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return &inode->i_acl; + case ACL_TYPE_DEFAULT: + return &inode->i_default_acl; + default: + BUG(); + } +} +EXPORT_SYMBOL(acl_by_type); + +struct posix_acl *get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *acl = ACCESS_ONCE(*p); + if (acl) { + spin_lock(&inode->i_lock); + acl = *p; + if (acl != ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } + return acl; +} +EXPORT_SYMBOL(get_cached_acl); + +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) +{ + return rcu_dereference(*acl_by_type(inode, type)); +} +EXPORT_SYMBOL(get_cached_acl_rcu); + +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + rcu_assign_pointer(*p, posix_acl_dup(acl)); + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(set_cached_acl); + +void forget_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + *p = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(forget_cached_acl); + +void forget_all_cached_acls(struct inode *inode) +{ + struct posix_acl *old_access, *old_default; + spin_lock(&inode->i_lock); + old_access = inode->i_acl; + old_default = inode->i_default_acl; + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old_access != ACL_NOT_CACHED) + posix_acl_release(old_access); + if (old_default != ACL_NOT_CACHED) + posix_acl_release(old_default); +} +EXPORT_SYMBOL(forget_all_cached_acls); + +struct posix_acl *get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + /* + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. + * + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. + */ + if (!inode->i_op->get_acl) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return inode->i_op->get_acl(inode, type); +} +EXPORT_SYMBOL(get_acl); + +/* + * Init a fresh posix_acl + */ +void +posix_acl_init(struct posix_acl *acl, int count) +{ + atomic_set(&acl->a_refcount, 1); + acl->a_count = count; +} +EXPORT_SYMBOL(posix_acl_init); + +/* + * Allocate a new ACL with the specified number of entries. + */ +struct posix_acl * +posix_acl_alloc(int count, gfp_t flags) +{ + const size_t size = sizeof(struct posix_acl) + + count * sizeof(struct posix_acl_entry); + struct posix_acl *acl = kmalloc(size, flags); + if (acl) + posix_acl_init(acl, count); + return acl; +} +EXPORT_SYMBOL(posix_acl_alloc); + +/* + * Clone an ACL. + */ +static struct posix_acl * +posix_acl_clone(const struct posix_acl *acl, gfp_t flags) +{ + struct posix_acl *clone = NULL; + + if (acl) { + int size = sizeof(struct posix_acl) + acl->a_count * + sizeof(struct posix_acl_entry); + clone = kmemdup(acl, size, flags); + if (clone) + atomic_set(&clone->a_refcount, 1); + } + return clone; +} + +/* + * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. + */ +int +posix_acl_valid(const struct posix_acl *acl) +{ + const struct posix_acl_entry *pa, *pe; + int state = ACL_USER_OBJ; + int needs_mask = 0; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) + return -EINVAL; + switch (pa->e_tag) { + case ACL_USER_OBJ: + if (state == ACL_USER_OBJ) { + state = ACL_USER; + break; + } + return -EINVAL; + + case ACL_USER: + if (state != ACL_USER) + return -EINVAL; + if (!uid_valid(pa->e_uid)) + return -EINVAL; + needs_mask = 1; + break; + + case ACL_GROUP_OBJ: + if (state == ACL_USER) { + state = ACL_GROUP; + break; + } + return -EINVAL; + + case ACL_GROUP: + if (state != ACL_GROUP) + return -EINVAL; + if (!gid_valid(pa->e_gid)) + return -EINVAL; + needs_mask = 1; + break; + + case ACL_MASK: + if (state != ACL_GROUP) + return -EINVAL; + state = ACL_OTHER; + break; + + case ACL_OTHER: + if (state == ACL_OTHER || + (state == ACL_GROUP && !needs_mask)) { + state = 0; + break; + } + return -EINVAL; + + default: + return -EINVAL; + } + } + if (state == 0) + return 0; + return -EINVAL; +} +EXPORT_SYMBOL(posix_acl_valid); + +/* + * Returns 0 if the acl can be exactly represented in the traditional + * file mode permission bits, or else 1. Returns -E... on error. + */ +int +posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) +{ + const struct posix_acl_entry *pa, *pe; + umode_t mode = 0; + int not_equiv = 0; + + /* + * A null ACL can always be presented as mode bits. + */ + if (!acl) + return 0; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + mode |= (pa->e_perm & S_IRWXO) << 6; + break; + case ACL_GROUP_OBJ: + mode |= (pa->e_perm & S_IRWXO) << 3; + break; + case ACL_OTHER: + mode |= pa->e_perm & S_IRWXO; + break; + case ACL_MASK: + mode = (mode & ~S_IRWXG) | + ((pa->e_perm & S_IRWXO) << 3); + not_equiv = 1; + break; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + default: + return -EINVAL; + } + } + if (mode_p) + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} +EXPORT_SYMBOL(posix_acl_equiv_mode); + +/* + * Create an ACL representing the file mode permission bits of an inode. + */ +struct posix_acl * +posix_acl_from_mode(umode_t mode, gfp_t flags) +{ + struct posix_acl *acl = posix_acl_alloc(3, flags); + if (!acl) + return ERR_PTR(-ENOMEM); + + acl->a_entries[0].e_tag = ACL_USER_OBJ; + acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; + + acl->a_entries[1].e_tag = ACL_GROUP_OBJ; + acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; + + acl->a_entries[2].e_tag = ACL_OTHER; + acl->a_entries[2].e_perm = (mode & S_IRWXO); + return acl; +} +EXPORT_SYMBOL(posix_acl_from_mode); + +/* + * Return 0 if current is granted want access to the inode + * by the acl. Returns -E... otherwise. + */ +int +posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) +{ + const struct posix_acl_entry *pa, *pe, *mask_obj; + int found = 0; + + want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + /* (May have been checked already) */ + if (uid_eq(inode->i_uid, current_fsuid())) + goto check_perm; + break; + case ACL_USER: + if (uid_eq(pa->e_uid, current_fsuid())) + goto mask; + break; + case ACL_GROUP_OBJ: + if (in_group_p(inode->i_gid)) { + found = 1; + if ((pa->e_perm & want) == want) + goto mask; + } + break; + case ACL_GROUP: + if (in_group_p(pa->e_gid)) { + found = 1; + if ((pa->e_perm & want) == want) + goto mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (found) + return -EACCES; + else + goto check_perm; + default: + return -EIO; + } + } + return -EIO; + +mask: + for (mask_obj = pa+1; mask_obj != pe; mask_obj++) { + if (mask_obj->e_tag == ACL_MASK) { + if ((pa->e_perm & mask_obj->e_perm & want) == want) + return 0; + return -EACCES; + } + } + +check_perm: + if ((pa->e_perm & want) == want) + return 0; + return -EACCES; +} + +/* + * Modify acl when creating a new inode. The caller must ensure the acl is + * only referenced once. + * + * mode_p initially must contain the mode parameter to the open() / creat() + * system calls. All permissions that are not granted by the acl are removed. + * The permissions in the acl are changed to reflect the mode_p parameter. + */ +static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) +{ + struct posix_acl_entry *pa, *pe; + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + umode_t mode = *mode_p; + int not_equiv = 0; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm &= (mode >> 6) | ~S_IRWXO; + mode &= (pa->e_perm << 6) | ~S_IRWXU; + break; + + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm &= mode | ~S_IRWXO; + mode &= pa->e_perm | ~S_IRWXO; + break; + + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (group_obj->e_perm << 3) | ~S_IRWXG; + } + + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +/* + * Modify the ACL for the chmod syscall. + */ +static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) +{ + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + struct posix_acl_entry *pa, *pe; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm = (mode & S_IRWXU) >> 6; + break; + + case ACL_USER: + case ACL_GROUP: + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_MASK: + mask_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm = (mode & S_IRWXO); + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm = (mode & S_IRWXG) >> 3; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm = (mode & S_IRWXG) >> 3; + } + + return 0; +} + +int +__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) +{ + struct posix_acl *clone = posix_acl_clone(*acl, gfp); + int err = -ENOMEM; + if (clone) { + err = posix_acl_create_masq(clone, mode_p); + if (err < 0) { + posix_acl_release(clone); + clone = NULL; + } + } + posix_acl_release(*acl); + *acl = clone; + return err; +} +EXPORT_SYMBOL(__posix_acl_create); + +int +__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) +{ + struct posix_acl *clone = posix_acl_clone(*acl, gfp); + int err = -ENOMEM; + if (clone) { + err = __posix_acl_chmod_masq(clone, mode); + if (err) { + posix_acl_release(clone); + clone = NULL; + } + } + posix_acl_release(*acl); + *acl = clone; + return err; +} +EXPORT_SYMBOL(__posix_acl_chmod); + +int +posix_acl_chmod(struct inode *inode, umode_t mode) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(inode)) + return 0; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) { + if (acl == ERR_PTR(-EOPNOTSUPP)) + return 0; + return PTR_ERR(acl); + } + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + if (ret) + return ret; + ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + return ret; +} +EXPORT_SYMBOL(posix_acl_chmod); + +int +posix_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl) +{ + struct posix_acl *p; + int ret; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + goto no_acl; + + p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) { + if (p == ERR_PTR(-EOPNOTSUPP)) + goto apply_umask; + return PTR_ERR(p); + } + + if (!p) + goto apply_umask; + + *acl = posix_acl_clone(p, GFP_NOFS); + if (!*acl) + goto no_mem; + + ret = posix_acl_create_masq(*acl, mode); + if (ret < 0) + goto no_mem_clone; + + if (ret == 0) { + posix_acl_release(*acl); + *acl = NULL; + } + + if (!S_ISDIR(*mode)) { + posix_acl_release(p); + *default_acl = NULL; + } else { + *default_acl = p; + } + return 0; + +apply_umask: + *mode &= ~current_umask(); +no_acl: + *default_acl = NULL; + *acl = NULL; + return 0; + +no_mem_clone: + posix_acl_release(*acl); +no_mem: + posix_acl_release(p); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(posix_acl_create); + +/* + * Fix up the uids and gids in posix acl extended attributes in place. + */ +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + kuid_t uid; + kgid_t gid; + + if (!value) + return; + if (size < sizeof(posix_acl_xattr_header)) + return; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return; + + count = posix_acl_xattr_count(size); + if (count < 0) + return; + if (count == 0) + return; + + for (end = entry + count; entry != end; entry++) { + switch(le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + case ACL_GROUP: + gid = make_kgid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kgid(to, gid)); + break; + default: + break; + } + } +} + +void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); +} + +void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); +} + +/* + * Convert from extended attribute to in-memory representation. + */ +struct posix_acl * +posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + struct posix_acl *acl; + struct posix_acl_entry *acl_e; + + if (!value) + return NULL; + if (size < sizeof(posix_acl_xattr_header)) + return ERR_PTR(-EINVAL); + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return ERR_PTR(-EOPNOTSUPP); + + count = posix_acl_xattr_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + acl_e = acl->a_entries; + + for (end = entry + count; entry != end; acl_e++, entry++) { + acl_e->e_tag = le16_to_cpu(entry->e_tag); + acl_e->e_perm = le16_to_cpu(entry->e_perm); + + switch(acl_e->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + break; + + case ACL_USER: + acl_e->e_uid = + make_kuid(user_ns, + le32_to_cpu(entry->e_id)); + if (!uid_valid(acl_e->e_uid)) + goto fail; + break; + case ACL_GROUP: + acl_e->e_gid = + make_kgid(user_ns, + le32_to_cpu(entry->e_id)); + if (!gid_valid(acl_e->e_gid)) + goto fail; + break; + + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL (posix_acl_from_xattr); + +/* + * Convert from in-memory to extended attribute representation. + */ +int +posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + void *buffer, size_t size) +{ + posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; + posix_acl_xattr_entry *ext_entry; + int real_size, n; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_entry = ext_acl->a_entries; + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + ext_entry->e_id = + cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); + break; + case ACL_GROUP: + ext_entry->e_id = + cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; +} +EXPORT_SYMBOL (posix_acl_to_xattr); + +static int +posix_acl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (!IS_POSIXACL(d_backing_inode(dentry))) + return -EOPNOTSUPP; + if (d_is_symlink(dentry)) + return -EOPNOTSUPP; + + acl = get_acl(d_backing_inode(dentry), type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); + posix_acl_release(acl); + + return error; +} + +static int +posix_acl_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = d_backing_inode(dentry); + struct posix_acl *acl = NULL; + int ret; + + if (!IS_POSIXACL(inode)) + return -EOPNOTSUPP; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; + } + } + + ret = inode->i_op->set_acl(inode, acl, type); +out: + posix_acl_release(acl); + return ret; +} + +static size_t +posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const char *xname; + size_t size; + + if (!IS_POSIXACL(d_backing_inode(dentry))) + return -EOPNOTSUPP; + if (d_is_symlink(dentry)) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + xname = POSIX_ACL_XATTR_ACCESS; + else + xname = POSIX_ACL_XATTR_DEFAULT; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +const struct xattr_handler posix_acl_access_xattr_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); + +const struct xattr_handler posix_acl_default_xattr_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); + +int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return 0; + if (error == 0) + acl = NULL; + } + + inode->i_ctime = CURRENT_TIME; + set_cached_acl(inode, type, acl); + return 0; +} + +int simple_acl_create(struct inode *dir, struct inode *inode) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return 0; +} diff --git a/kernel/fs/proc/Kconfig b/kernel/fs/proc/Kconfig new file mode 100644 index 000000000..2183fcf41 --- /dev/null +++ b/kernel/fs/proc/Kconfig @@ -0,0 +1,73 @@ +config PROC_FS + bool "/proc file system support" if EXPERT + default y + help + This is a virtual file system providing information about the status + of the system. "Virtual" means that it doesn't take up any space on + your hard disk: the files are created on the fly by the kernel when + you try to access them. Also, you cannot read the files with older + version of the program less: you need to use more or cat. + + It's totally cool; for example, "cat /proc/interrupts" gives + information about what the different IRQs are used for at the moment + (there is a small number of Interrupt ReQuest lines in your computer + that are used by the attached devices to gain the CPU's attention -- + often a source of trouble if two devices are mistakenly configured + to use the same IRQ). The program procinfo to display some + information about your system gathered from the /proc file system. + + Before you can use the /proc file system, it has to be mounted, + meaning it has to be given a location in the directory hierarchy. + That location should be /proc. A command such as "mount -t proc proc + /proc" or the equivalent line in /etc/fstab does the job. + + The /proc file system is explained in the file + and on the proc(5) manpage + ("man 5 proc"). + + This option will enlarge your kernel by about 67 KB. Several + programs depend on this, so everyone should say Y here. + +config PROC_KCORE + bool "/proc/kcore support" if !ARM + depends on PROC_FS && MMU + help + Provides a virtual ELF core file of the live kernel. This can + be read with gdb and other ELF tools. No modifications can be + made using this mechanism. + +config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP + default y + help + Exports the dump image of crashed kernel in ELF format. + +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EXPERT + depends on PROC_FS + select SYSCTL + default y + ---help--- + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in . Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. + +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EXPERT + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. diff --git a/kernel/fs/proc/Makefile b/kernel/fs/proc/Makefile new file mode 100644 index 000000000..7151ea428 --- /dev/null +++ b/kernel/fs/proc/Makefile @@ -0,0 +1,32 @@ +# +# Makefile for the Linux proc filesystem routines. +# + +obj-y += proc.o + +proc-y := nommu.o task_nommu.o +proc-$(CONFIG_MMU) := task_mmu.o + +proc-y += inode.o root.o base.o generic.o array.o \ + fd.o +proc-$(CONFIG_TTY) += proc_tty.o +proc-y += cmdline.o +proc-y += consoles.o +proc-y += cpuinfo.o +proc-y += devices.o +proc-y += interrupts.o +proc-y += loadavg.o +proc-y += meminfo.o +proc-y += stat.o +proc-y += uptime.o +proc-y += version.o +proc-y += softirqs.o +proc-y += namespaces.o +proc-y += self.o +proc-y += thread_self.o +proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o +proc-$(CONFIG_NET) += proc_net.o +proc-$(CONFIG_PROC_KCORE) += kcore.o +proc-$(CONFIG_PROC_VMCORE) += vmcore.o +proc-$(CONFIG_PRINTK) += kmsg.o +proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o diff --git a/kernel/fs/proc/array.c b/kernel/fs/proc/array.c new file mode 100644 index 000000000..fd02a9ebf --- /dev/null +++ b/kernel/fs/proc/array.c @@ -0,0 +1,695 @@ +/* + * linux/fs/proc/array.c + * + * Copyright (C) 1992 by Linus Torvalds + * based on ideas by Darren Senn + * + * Fixes: + * Michael. K. Johnson: stat,statm extensions. + * + * + * Pauline Middelink : Made cmdline,envline only break at '\0's, to + * make sure SET_PROCTITLE works. Also removed + * bad '!' which forced address recalculation for + * EVERY character on the current page. + * + * + * Danny ter Haar : added cpuinfo + * + * + * Alessandro Rubini : profile extension. + * + * + * Jeff Tranter : added BogoMips field to cpuinfo + * + * + * Bruno Haible : remove 4K limit for the maps file + * + * + * Yves Arrouye : remove removal of trailing spaces in get_array. + * + * + * Jerome Forissier : added per-CPU time information to /proc/stat + * and /proc//cpu extension + * + * - Incorporation and non-SMP safe operation + * of forissier patch in 2.1.78 by + * Hans Marcus + * + * aeb@cwi.nl : /proc/partitions + * + * + * Alan Cox : security fixes. + * + * + * Al Viro : safe handling of mm_struct + * + * Gerhard Wichert : added BIGMEM support + * Siemens AG + * + * Al Viro & Jeff Garzik : moved most of the thing into base.c and + * : proc_misc.c. The rest may eventually go into + * : base.c too. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "internal.h" + +static inline void task_name(struct seq_file *m, struct task_struct *p) +{ + char *buf; + char tcomm[sizeof(p->comm)]; + + get_task_comm(tcomm, p); + + seq_puts(m, "Name:\t"); + buf = m->buf + m->count; + + /* Ignore error for now */ + buf += string_escape_str(tcomm, buf, m->size - m->count, + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + + m->count = buf - m->buf; + seq_putc(m, '\n'); +} + +/* + * The task state array is a strange "bitmap" of + * reasons to sleep. Thus "running" is zero, and + * you can test for combinations of others with + * simple bit tests. + */ +static const char * const task_state_array[] = { + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "X (dead)", /* 16 */ + "Z (zombie)", /* 32 */ +}; + +static inline const char *get_task_state(struct task_struct *tsk) +{ + unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; + + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); + + return task_state_array[fls(state)]; +} + +static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *p) +{ + struct user_namespace *user_ns = seq_user_ns(m); + struct group_info *group_info; + int g; + struct task_struct *tracer; + const struct cred *cred; + pid_t ppid, tpid = 0, tgid, ngid; + unsigned int max_fds = 0; + + rcu_read_lock(); + ppid = pid_alive(p) ? + task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; + + tracer = ptrace_parent(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); + + tgid = task_tgid_nr_ns(p, ns); + ngid = task_numa_group_id(p); + cred = get_task_cred(p); + + task_lock(p); + if (p->files) + max_fds = files_fdtable(p->files)->max_fds; + task_unlock(p); + rcu_read_unlock(); + + seq_printf(m, + "State:\t%s\n" + "Tgid:\t%d\n" + "Ngid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n" + "FDSize:\t%d\nGroups:\t", + get_task_state(p), + tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid, + from_kuid_munged(user_ns, cred->uid), + from_kuid_munged(user_ns, cred->euid), + from_kuid_munged(user_ns, cred->suid), + from_kuid_munged(user_ns, cred->fsuid), + from_kgid_munged(user_ns, cred->gid), + from_kgid_munged(user_ns, cred->egid), + from_kgid_munged(user_ns, cred->sgid), + from_kgid_munged(user_ns, cred->fsgid), + max_fds); + + group_info = cred->group_info; + for (g = 0; g < group_info->ngroups; g++) + seq_printf(m, "%d ", + from_kgid_munged(user_ns, GROUP_AT(group_info, g))); + put_cred(cred); + +#ifdef CONFIG_PID_NS + seq_puts(m, "\nNStgid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_tgid_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSpid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_pid_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSpgid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_pgrp_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSsid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_session_nr_ns(p, pid->numbers[g].ns)); +#endif + seq_putc(m, '\n'); +} + +void render_sigset_t(struct seq_file *m, const char *header, + sigset_t *set) +{ + int i; + + seq_puts(m, header); + + i = _NSIG; + do { + int x = 0; + + i -= 4; + if (sigismember(set, i+1)) x |= 1; + if (sigismember(set, i+2)) x |= 2; + if (sigismember(set, i+3)) x |= 4; + if (sigismember(set, i+4)) x |= 8; + seq_printf(m, "%x", x); + } while (i >= 4); + + seq_putc(m, '\n'); +} + +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, + sigset_t *catch) +{ + struct k_sigaction *k; + int i; + + k = p->sighand->action; + for (i = 1; i <= _NSIG; ++i, ++k) { + if (k->sa.sa_handler == SIG_IGN) + sigaddset(ign, i); + else if (k->sa.sa_handler != SIG_DFL) + sigaddset(catch, i); + } +} + +static inline void task_sig(struct seq_file *m, struct task_struct *p) +{ + unsigned long flags; + sigset_t pending, shpending, blocked, ignored, caught; + int num_threads = 0; + unsigned long qsize = 0; + unsigned long qlim = 0; + + sigemptyset(&pending); + sigemptyset(&shpending); + sigemptyset(&blocked); + sigemptyset(&ignored); + sigemptyset(&caught); + + if (lock_task_sighand(p, &flags)) { + pending = p->pending.signal; + shpending = p->signal->shared_pending.signal; + blocked = p->blocked; + collect_sigign_sigcatch(p, &ignored, &caught); + num_threads = get_nr_threads(p); + rcu_read_lock(); /* FIXME: is this correct? */ + qsize = atomic_read(&__task_cred(p)->user->sigpending); + rcu_read_unlock(); + qlim = task_rlimit(p, RLIMIT_SIGPENDING); + unlock_task_sighand(p, &flags); + } + + seq_printf(m, "Threads:\t%d\n", num_threads); + seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); + + /* render them all */ + render_sigset_t(m, "SigPnd:\t", &pending); + render_sigset_t(m, "ShdPnd:\t", &shpending); + render_sigset_t(m, "SigBlk:\t", &blocked); + render_sigset_t(m, "SigIgn:\t", &ignored); + render_sigset_t(m, "SigCgt:\t", &caught); +} + +static void render_cap_t(struct seq_file *m, const char *header, + kernel_cap_t *a) +{ + unsigned __capi; + + seq_puts(m, header); + CAP_FOR_EACH_U32(__capi) { + seq_printf(m, "%08x", + a->cap[CAP_LAST_U32 - __capi]); + } + seq_putc(m, '\n'); +} + +static inline void task_cap(struct seq_file *m, struct task_struct *p) +{ + const struct cred *cred; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + + rcu_read_lock(); + cred = __task_cred(p); + cap_inheritable = cred->cap_inheritable; + cap_permitted = cred->cap_permitted; + cap_effective = cred->cap_effective; + cap_bset = cred->cap_bset; + rcu_read_unlock(); + + render_cap_t(m, "CapInh:\t", &cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cap_permitted); + render_cap_t(m, "CapEff:\t", &cap_effective); + render_cap_t(m, "CapBnd:\t", &cap_bset); +} + +static inline void task_seccomp(struct seq_file *m, struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); +#endif +} + +static inline void task_context_switch_counts(struct seq_file *m, + struct task_struct *p) +{ + seq_printf(m, "voluntary_ctxt_switches:\t%lu\n" + "nonvoluntary_ctxt_switches:\t%lu\n", + p->nvcsw, + p->nivcsw); +} + +static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_allowed:\t%*pb\n", + cpumask_pr_args(&task->cpus_allowed)); + seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", + cpumask_pr_args(&task->cpus_allowed)); +} + +int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm = get_task_mm(task); + + task_name(m, task); + task_state(m, ns, pid, task); + + if (mm) { + task_mem(m, mm); + mmput(mm); + } + task_sig(m, task); + task_cap(m, task); + task_seccomp(m, task); + task_cpus_allowed(m, task); + cpuset_task_status_allowed(m, task); + task_context_switch_counts(m, task); + return 0; +} + +static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task, int whole) +{ + unsigned long vsize, eip, esp, wchan = ~0UL; + int priority, nice; + int tty_pgrp = -1, tty_nr = 0; + sigset_t sigign, sigcatch; + char state; + pid_t ppid = 0, pgid = -1, sid = -1; + int num_threads = 0; + int permitted; + struct mm_struct *mm; + unsigned long long start_time; + unsigned long cmin_flt = 0, cmaj_flt = 0; + unsigned long min_flt = 0, maj_flt = 0; + cputime_t cutime, cstime, utime, stime; + cputime_t cgtime, gtime; + unsigned long rsslim = 0; + char tcomm[sizeof(task->comm)]; + unsigned long flags; + + state = *get_task_state(task); + vsize = eip = esp = 0; + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + mm = get_task_mm(task); + if (mm) { + vsize = task_vsize(mm); + if (permitted) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } + } + + get_task_comm(tcomm, task); + + sigemptyset(&sigign); + sigemptyset(&sigcatch); + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + if (sig->tty) { + struct pid *pgrp = tty_get_pgrp(sig->tty); + tty_pgrp = pid_nr_ns(pgrp, ns); + put_pid(pgrp); + tty_nr = new_encode_dev(tty_devnum(sig->tty)); + } + + num_threads = get_nr_threads(task); + collect_sigign_sigcatch(task, &sigign, &sigcatch); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + + /* add up live thread stats at the group level */ + if (whole) { + struct task_struct *t = task; + do { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } while_each_thread(task, t); + + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + thread_group_cputime_adjusted(task, &utime, &stime); + gtime += sig->gtime; + } + + sid = task_session_nr_ns(task, ns); + ppid = task_tgid_nr_ns(task->real_parent, ns); + pgid = task_pgrp_nr_ns(task, ns); + + unlock_task_sighand(task, &flags); + } + + if (permitted && (!whole || num_threads < 2)) + wchan = get_wchan(task); + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; + task_cputime_adjusted(task, &utime, &stime); + gtime = task_gtime(task); + } + + /* scale priority and nice values from timeslices to -20..20 */ + /* to make it look like a "normal" Unix priority/nice value */ + priority = task_prio(task); + nice = task_nice(task); + + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(task->real_start_time); + + seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ll(m, ' ', ppid); + seq_put_decimal_ll(m, ' ', pgid); + seq_put_decimal_ll(m, ' ', sid); + seq_put_decimal_ll(m, ' ', tty_nr); + seq_put_decimal_ll(m, ' ', tty_pgrp); + seq_put_decimal_ull(m, ' ', task->flags); + seq_put_decimal_ull(m, ' ', min_flt); + seq_put_decimal_ull(m, ' ', cmin_flt); + seq_put_decimal_ull(m, ' ', maj_flt); + seq_put_decimal_ull(m, ' ', cmaj_flt); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); + seq_put_decimal_ll(m, ' ', priority); + seq_put_decimal_ll(m, ' ', nice); + seq_put_decimal_ll(m, ' ', num_threads); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', start_time); + seq_put_decimal_ull(m, ' ', vsize); + seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, ' ', rsslim); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); + seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); + seq_put_decimal_ull(m, ' ', esp); + seq_put_decimal_ull(m, ' ', eip); + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. + */ + seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', wchan); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ll(m, ' ', task->exit_signal); + seq_put_decimal_ll(m, ' ', task_cpu(task)); + seq_put_decimal_ull(m, ' ', task->rt_priority); + seq_put_decimal_ull(m, ' ', task->policy); + seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); + + if (mm && permitted) { + seq_put_decimal_ull(m, ' ', mm->start_data); + seq_put_decimal_ull(m, ' ', mm->end_data); + seq_put_decimal_ull(m, ' ', mm->start_brk); + seq_put_decimal_ull(m, ' ', mm->arg_start); + seq_put_decimal_ull(m, ' ', mm->arg_end); + seq_put_decimal_ull(m, ' ', mm->env_start); + seq_put_decimal_ull(m, ' ', mm->env_end); + } else + seq_printf(m, " 0 0 0 0 0 0 0"); + + if (permitted) + seq_put_decimal_ll(m, ' ', task->exit_code); + else + seq_put_decimal_ll(m, ' ', 0); + + seq_putc(m, '\n'); + if (mm) + mmput(mm); + return 0; +} + +int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 0); +} + +int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 1); +} + +int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; + struct mm_struct *mm = get_task_mm(task); + + if (mm) { + size = task_statm(mm, &shared, &text, &data, &resident); + mmput(mm); + } + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, 0, size); + seq_put_decimal_ull(m, ' ', resident); + seq_put_decimal_ull(m, ' ', shared); + seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', data); + seq_put_decimal_ull(m, ' ', 0); + seq_putc(m, '\n'); + + return 0; +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +static struct pid * +get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) +{ + struct task_struct *start, *task; + struct pid *pid = NULL; + + read_lock(&tasklist_lock); + + start = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!start) + goto out; + + /* + * Lets try to continue searching first, this gives + * us significant speedup on children-rich processes. + */ + if (pid_prev) { + task = pid_task(pid_prev, PIDTYPE_PID); + if (task && task->real_parent == start && + !(list_empty(&task->sibling))) { + if (list_is_last(&task->sibling, &start->children)) + goto out; + task = list_first_entry(&task->sibling, + struct task_struct, sibling); + pid = get_pid(task_pid(task)); + goto out; + } + } + + /* + * Slow search case. + * + * We might miss some children here if children + * are exited while we were not holding the lock, + * but it was never promised to be accurate that + * much. + * + * "Just suppose that the parent sleeps, but N children + * exit after we printed their tids. Now the slow paths + * skips N extra children, we miss N tasks." (c) + * + * So one need to stop or freeze the leader and all + * its children to get a precise result. + */ + list_for_each_entry(task, &start->children, sibling) { + if (pos-- == 0) { + pid = get_pid(task_pid(task)); + break; + } + } + +out: + read_unlock(&tasklist_lock); + return pid; +} + +static int children_seq_show(struct seq_file *seq, void *v) +{ + struct inode *inode = seq->private; + pid_t pid; + + pid = pid_nr_ns(v, inode->i_sb->s_fs_info); + seq_printf(seq, "%d ", pid); + + return 0; +} + +static void *children_seq_start(struct seq_file *seq, loff_t *pos) +{ + return get_children_pid(seq->private, NULL, *pos); +} + +static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct pid *pid; + + pid = get_children_pid(seq->private, v, *pos + 1); + put_pid(v); + + ++*pos; + return pid; +} + +static void children_seq_stop(struct seq_file *seq, void *v) +{ + put_pid(v); +} + +static const struct seq_operations children_seq_ops = { + .start = children_seq_start, + .next = children_seq_next, + .stop = children_seq_stop, + .show = children_seq_show, +}; + +static int children_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &children_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = inode; + + return ret; +} + +int children_seq_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + return 0; +} + +const struct file_operations proc_tid_children_operations = { + .open = children_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = children_seq_release, +}; +#endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/kernel/fs/proc/base.c b/kernel/fs/proc/base.c new file mode 100644 index 000000000..093ca14f5 --- /dev/null +++ b/kernel/fs/proc/base.c @@ -0,0 +1,3214 @@ +/* + * linux/fs/proc/base.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc base directory handling functions + * + * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. + * Instead of using magical inumbers to determine the kind of object + * we allocate and fill in-core inodes upon lookup. They don't even + * go into icache. We cache the reference to task_struct upon lookup too. + * Eventually it should become a filesystem in its own. We don't use the + * rest of procfs anymore. + * + * + * Changelog: + * 17-Jan-2005 + * Allan Bezerra + * Bruna Moreira + * Edjard Mota + * Ilias Biris + * Mauricio Lin + * + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * + * A new process specific entry (smaps) included in /proc. It shows the + * size of rss for each memory area. The maps entry lacks information + * about physical memory size (rss) for each mapped file, i.e., + * rss information for executables and library files. + * This additional information is useful for any tools that need to know + * about physical memory consumption for a process specific library. + * + * Changelog: + * 21-Feb-2005 + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * Pud inclusion in the page table walking. + * + * ChangeLog: + * 10-Mar-2005 + * 10LE Instituto Nokia de Tecnologia - INdT: + * A better way to walks through the page table as suggested by Hugh Dickins. + * + * Simo Piiroinen : + * Smaps information related to shared, private, clean and dirty pages. + * + * Paul Mundt : + * Overall revision about smaps. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_HARDWALL +#include +#endif +#include +#include "internal.h" +#include "fd.h" + +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + +struct pid_entry { + const char *name; + int len; + umode_t mode; + const struct inode_operations *iop; + const struct file_operations *fop; + union proc_op op; +}; + +#define NOD(NAME, MODE, IOP, FOP, OP) { \ + .name = (NAME), \ + .len = sizeof(NAME) - 1, \ + .mode = MODE, \ + .iop = IOP, \ + .fop = FOP, \ + .op = OP, \ +} + +#define DIR(NAME, MODE, iops, fops) \ + NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) +#define LNK(NAME, get_link) \ + NOD(NAME, (S_IFLNK|S_IRWXUGO), \ + &proc_pid_link_inode_operations, NULL, \ + { .proc_get_link = get_link } ) +#define REG(NAME, MODE, fops) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) +#define ONE(NAME, MODE, show) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_single_file_operations, \ + { .proc_show = show } ) + +/* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. + */ +static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, + unsigned int n) +{ + unsigned int i; + unsigned int count; + + count = 0; + for (i = 0; i < n; ++i) { + if (S_ISDIR(entries[i].mode)) + ++count; + } + + return count; +} + +static int get_task_root(struct task_struct *task, struct path *root) +{ + int result = -ENOENT; + + task_lock(task); + if (task->fs) { + get_fs_root(task->fs, root); + result = 0; + } + task_unlock(task); + return result; +} + +static int proc_cwd_link(struct dentry *dentry, struct path *path) +{ + struct task_struct *task = get_proc_task(d_inode(dentry)); + int result = -ENOENT; + + if (task) { + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; + } + task_unlock(task); + put_task_struct(task); + } + return result; +} + +static int proc_root_link(struct dentry *dentry, struct path *path) +{ + struct task_struct *task = get_proc_task(d_inode(dentry)); + int result = -ENOENT; + + if (task) { + result = get_task_root(task, path); + put_task_struct(task); + } + return result; +} + +static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + /* + * Rely on struct seq_operations::show() being called once + * per internal buffer allocation. See single_open(), traverse(). + */ + BUG_ON(m->size < PAGE_SIZE); + m->count += get_cmdline(task, m->buf, PAGE_SIZE); + return 0; +} + +static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + if (mm && !IS_ERR(mm)) { + unsigned int nwords = 0; + do { + nwords += 2; + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); + mmput(mm); + return 0; + } else + return PTR_ERR(mm); +} + + +#ifdef CONFIG_KALLSYMS +/* + * Provides a wchan file via kallsyms in a proper one-value-per-file format. + * Returns the resolved symbol. If that fails, simply return the address. + */ +static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long wchan; + char symname[KSYM_NAME_LEN]; + + wchan = get_wchan(task); + + if (lookup_symbol_name(wchan, symname) < 0) { + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return 0; + seq_printf(m, "%lu", wchan); + } else { + seq_printf(m, "%s", symname); + } + + return 0; +} +#endif /* CONFIG_KALLSYMS */ + +static int lock_trace(struct task_struct *task) +{ + int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return err; + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + mutex_unlock(&task->signal->cred_guard_mutex); + return -EPERM; + } + return 0; +} + +static void unlock_trace(struct task_struct *task) +{ + mutex_unlock(&task->signal->cred_guard_mutex); +} + +#ifdef CONFIG_STACKTRACE + +#define MAX_STACK_TRACE_DEPTH 64 + +static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct stack_trace trace; + unsigned long *entries; + int err; + int i; + + entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_TRACE_DEPTH; + trace.entries = entries; + trace.skip = 0; + + err = lock_trace(task); + if (!err) { + save_stack_trace_tsk(task, &trace); + + for (i = 0; i < trace.nr_entries; i++) { + seq_printf(m, "[<%pK>] %pS\n", + (void *)entries[i], (void *)entries[i]); + } + unlock_trace(task); + } + kfree(entries); + + return err; +} +#endif + +#ifdef CONFIG_SCHEDSTATS +/* + * Provides /proc/PID/schedstat + */ +static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%llu %llu %lu\n", + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + + return 0; +} +#endif + +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct inode *inode = m->private; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + seq_puts(m, "Latency Top version : v0.1\n"); + for (i = 0; i < 32; i++) { + struct latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { + int q; + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long bt = lr->backtrace[q]; + if (!bt) + break; + if (bt == ULONG_MAX) + break; + seq_printf(m, " %ps", (void *)bt); + } + seq_putc(m, '\n'); + } + + } + put_task_struct(task); + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, lstats_show_proc, inode); +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + + if (!task) + return -ESRCH; + clear_all_latency_tracing(task); + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long totalpages = totalram_pages + total_swap_pages; + unsigned long points = 0; + + read_lock(&tasklist_lock); + if (pid_alive(task)) + points = oom_badness(task, NULL, NULL, totalpages) * + 1000 / totalpages; + read_unlock(&tasklist_lock); + seq_printf(m, "%lu\n", points); + + return 0; +} + +struct limit_names { + const char *name; + const char *unit; +}; + +static const struct limit_names lnames[RLIM_NLIMITS] = { + [RLIMIT_CPU] = {"Max cpu time", "seconds"}, + [RLIMIT_FSIZE] = {"Max file size", "bytes"}, + [RLIMIT_DATA] = {"Max data size", "bytes"}, + [RLIMIT_STACK] = {"Max stack size", "bytes"}, + [RLIMIT_CORE] = {"Max core file size", "bytes"}, + [RLIMIT_RSS] = {"Max resident set", "bytes"}, + [RLIMIT_NPROC] = {"Max processes", "processes"}, + [RLIMIT_NOFILE] = {"Max open files", "files"}, + [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, + [RLIMIT_AS] = {"Max address space", "bytes"}, + [RLIMIT_LOCKS] = {"Max file locks", "locks"}, + [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, + [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, + [RLIMIT_NICE] = {"Max nice priority", NULL}, + [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, + [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, +}; + +/* Display limits for a process */ +static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned int i; + unsigned long flags; + + struct rlimit rlim[RLIM_NLIMITS]; + + if (!lock_task_sighand(task, &flags)) + return 0; + memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); + unlock_task_sighand(task, &flags); + + /* + * print the file header + */ + seq_printf(m, "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + + for (i = 0; i < RLIM_NLIMITS; i++) { + if (rlim[i].rlim_cur == RLIM_INFINITY) + seq_printf(m, "%-25s %-20s ", + lnames[i].name, "unlimited"); + else + seq_printf(m, "%-25s %-20lu ", + lnames[i].name, rlim[i].rlim_cur); + + if (rlim[i].rlim_max == RLIM_INFINITY) + seq_printf(m, "%-20s ", "unlimited"); + else + seq_printf(m, "%-20lu ", rlim[i].rlim_max); + + if (lnames[i].unit) + seq_printf(m, "%-10s\n", lnames[i].unit); + else + seq_putc(m, '\n'); + } + + return 0; +} + +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK +static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + long nr; + unsigned long args[6], sp, pc; + int res; + + res = lock_trace(task); + if (res) + return res; + + if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + seq_puts(m, "running\n"); + else if (nr < 0) + seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else + seq_printf(m, + "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", + nr, + args[0], args[1], args[2], args[3], args[4], args[5], + sp, pc); + unlock_trace(task); + + return 0; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ + +/************************************************************************/ +/* Here the fs part begins */ +/************************************************************************/ + +/* permission checks */ +static int proc_fd_access_allowed(struct inode *inode) +{ + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. + */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_access(task, PTRACE_MODE_READ); + put_task_struct(task); + } + return allowed; +} + +int proc_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = d_inode(dentry); + + if (attr->ia_valid & ATTR_MODE) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) +{ + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); +} + + +static int proc_pid_permission(struct inode *inode, int mask) +{ + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); + + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); +} + + + +static const struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, +}; + +static int proc_single_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct pid_namespace *ns; + struct pid *pid; + struct task_struct *task; + int ret; + + ns = inode->i_sb->s_fs_info; + pid = proc_pid(inode); + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); + + put_task_struct(task); + return ret; +} + +static int proc_single_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, proc_single_show, inode); +} + +static const struct file_operations proc_single_file_operations = { + .open = proc_single_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + + +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) +{ + struct task_struct *task = get_proc_task(inode); + struct mm_struct *mm = ERR_PTR(-ESRCH); + + if (task) { + mm = mm_access(task, mode); + put_task_struct(task); + + if (!IS_ERR_OR_NULL(mm)) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + } + + return mm; +} + +static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) +{ + struct mm_struct *mm = proc_mem_open(inode, mode); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + return 0; +} + +static int mem_open(struct inode *inode, struct file *file) +{ + int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); + + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; + + return ret; +} + +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct mm_struct *mm = file->private_data; + unsigned long addr = *ppos; + ssize_t copied; + char *page; + + if (!mm) + return 0; + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + + copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + + while (count > 0) { + int this_len = min_t(int, count, PAGE_SIZE); + + if (write && copy_from_user(page, buf, this_len)) { + copied = -EFAULT; + break; + } + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { + if (!copied) + copied = -EIO; + break; + } + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; + } + *ppos = addr; + + mmput(mm); +free: + free_page((unsigned long) page); + return copied; +} + +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} + +loff_t mem_lseek(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + break; + case 1: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + force_successful_syscall_return(); + return file->f_pos; +} + +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + if (mm) + mmdrop(mm); + return 0; +} + +static const struct file_operations proc_mem_operations = { + .llseek = mem_lseek, + .read = mem_read, + .write = mem_write, + .open = mem_open, + .release = mem_release, +}; + +static int environ_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ); +} + +static ssize_t environ_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *page; + unsigned long src = *ppos; + int ret = 0; + struct mm_struct *mm = file->private_data; + + if (!mm) + return 0; + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + + ret = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + while (count > 0) { + size_t this_len, max_len; + int retval; + + if (src >= (mm->env_end - mm->env_start)) + break; + + this_len = mm->env_end - (mm->env_start + src); + + max_len = min_t(size_t, PAGE_SIZE, count); + this_len = min(max_len, this_len); + + retval = access_remote_vm(mm, (mm->env_start + src), + page, this_len, 0); + + if (retval <= 0) { + ret = retval; + break; + } + + if (copy_to_user(buf, page, retval)) { + ret = -EFAULT; + break; + } + + ret += retval; + src += retval; + buf += retval; + count -= retval; + } + *ppos = src; + mmput(mm); + +free: + free_page((unsigned long) page); + return ret; +} + +static const struct file_operations proc_environ_operations = { + .open = environ_open, + .read = environ_read, + .llseek = generic_file_llseek, + .release = mem_release, +}; + +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; + size_t len; + unsigned long flags; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adj; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file_inode(file)); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + + task->signal->oom_score_adj = oom_adj; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; + +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + char buffer[PROC_NUMBUF]; + short oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + unsigned long flags; + int oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); + if (err) + goto out; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file_inode(file)); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if ((short)oom_score_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + task->signal->oom_score_adj = (short)oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = (short)oom_score_adj; + trace_oom_score_adj_update(task); + +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_AUDITSYSCALL +#define TMPBUFLEN 21 +static ssize_t proc_loginuid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + from_kuid(file->f_cred->user_ns, + audit_get_loginuid(task))); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + char *page, *tmp; + ssize_t length; + uid_t loginuid; + kuid_t kloginuid; + + rcu_read_lock(); + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + if (count >= PAGE_SIZE) + count = PAGE_SIZE - 1; + + if (*ppos != 0) { + /* No partial writes. */ + return -EINVAL; + } + page = (char*)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out_free_page; + + page[count] = '\0'; + loginuid = simple_strtoul(page, &tmp, 10); + if (tmp == page) { + length = -EINVAL; + goto out_free_page; + + } + + /* is userspace tring to explicitly UNSET the loginuid? */ + if (loginuid == AUDIT_UID_UNSET) { + kloginuid = INVALID_UID; + } else { + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } + } + + length = audit_set_loginuid(kloginuid); + if (likely(length == 0)) + length = count; + +out_free_page: + free_page((unsigned long) page); + return length; +} + +static const struct file_operations proc_loginuid_operations = { + .read = proc_loginuid_read, + .write = proc_loginuid_write, + .llseek = generic_file_llseek, +}; + +static ssize_t proc_sessionid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + audit_get_sessionid(task)); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static const struct file_operations proc_sessionid_operations = { + .read = proc_sessionid_read, + .llseek = generic_file_llseek, +}; +#endif + +#ifdef CONFIG_FAULT_INJECTION +static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + char buffer[PROC_NUMBUF]; + size_t len; + int make_it_fail; + + if (!task) + return -ESRCH; + make_it_fail = task->make_it_fail; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t proc_fault_inject_write(struct file * file, + const char __user * buf, size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; + int make_it_fail; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + make_it_fail = simple_strtol(strstrip(buffer), &end, 0); + if (*end) + return -EINVAL; + if (make_it_fail < 0 || make_it_fail > 1) + return -EINVAL; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + task->make_it_fail = make_it_fail; + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_fault_inject_operations = { + .read = proc_fault_inject_read, + .write = proc_fault_inject_write, + .llseek = generic_file_llseek, +}; +#endif + + +#ifdef CONFIG_SCHED_DEBUG +/* + * Print out various scheduling related per-task fields: + */ +static int sched_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_set_task(p); + + put_task_struct(p); + + return count; +} + +static int sched_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_show, inode); +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = proc_sched_autogroup_set_nice(p, nice); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + +static ssize_t comm_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; + + memset(buffer, 0, sizeof(buffer)); + if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (same_thread_group(current, p)) + set_task_comm(p, buffer); + else + count = -EINVAL; + + put_task_struct(p); + + return count; +} + +static int comm_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + task_lock(p); + seq_printf(m, "%s\n", p->comm); + task_unlock(p); + + put_task_struct(p); + + return 0; +} + +static int comm_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, comm_show, inode); +} + +static const struct file_operations proc_pid_set_comm_operations = { + .open = comm_open, + .read = seq_read, + .write = comm_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) +{ + struct task_struct *task; + struct mm_struct *mm; + struct file *exe_file; + + task = get_proc_task(d_inode(dentry)); + if (!task) + return -ENOENT; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ENOENT; + exe_file = get_mm_exe_file(mm); + mmput(mm); + if (exe_file) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + fput(exe_file); + return 0; + } else + return -ENOENT; +} + +static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = d_inode(dentry); + struct path path; + int error = -EACCES; + + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + + nd_jump_link(nd, &path); + return NULL; +out: + return ERR_PTR(error); +} + +static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) +{ + char *tmp = (char*)__get_free_page(GFP_TEMPORARY); + char *pathname; + int len; + + if (!tmp) + return -ENOMEM; + + pathname = d_path(path, tmp, PAGE_SIZE); + len = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + len = tmp + PAGE_SIZE - 1 - pathname; + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, pathname, len)) + len = -EFAULT; + out: + free_page((unsigned long)tmp); + return len; +} + +static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) +{ + int error = -EACCES; + struct inode *inode = d_inode(dentry); + struct path path; + + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + + error = do_proc_readlink(&path, buffer, buflen); + path_put(&path); +out: + return error; +} + +const struct inode_operations proc_pid_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, +}; + + +/* building an inode */ + +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +{ + struct inode * inode; + struct proc_inode *ei; + const struct cred *cred; + + /* We need a new inode */ + + inode = new_inode(sb); + if (!inode) + goto out; + + /* Common stuff */ + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &proc_def_inode_operations; + + /* + * grab the reference to task. + */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_unlock; + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } + security_task_to_inode(task, inode); + +out: + return inode; + +out_unlock: + iput(inode); + return NULL; +} + +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct task_struct *task; + const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; + + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = GLOBAL_ROOT_UID; + stat->gid = GLOBAL_ROOT_GID; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + cred = __task_cred(task); + stat->uid = cred->euid; + stat->gid = cred->egid; + } + } + rcu_read_unlock(); + return 0; +} + +/* dentry stuff */ + +/* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems + * due to the way we treat inodes. + * + * Rewrite the inode's ownerships here because the owning task may have + * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. + */ +int pid_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *inode; + struct task_struct *task; + const struct cred *cred; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = d_inode(dentry); + task = get_proc_task(inode); + + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + return 0; +} + +static inline bool proc_inode_is_dead(struct inode *inode) +{ + return !proc_pid(inode)->tasks[PIDTYPE_PID].first; +} + +int pid_delete_dentry(const struct dentry *dentry) +{ + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return proc_inode_is_dead(d_inode(dentry)); +} + +const struct dentry_operations pid_dentry_operations = +{ + .d_revalidate = pid_revalidate, + .d_delete = pid_delete_dentry, +}; + +/* Lookups */ + +/* + * Fill a directory entry. + * + * If possible create the dcache entry and derive our inode number and + * file type from dcache entry. + * + * Since all of the proc inode numbers are dynamically generated, the inode + * numbers do not exist until the inode is cache. This means creating the + * the dcache entry in readdir is necessary to keep the inode numbers + * reported by readdir in sync with the inode numbers reported + * by stat. + */ +bool proc_fill_cache(struct file *file, struct dir_context *ctx, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr) +{ + struct dentry *child, *dir = file->f_path.dentry; + struct qstr qname = QSTR_INIT(name, len); + struct inode *inode; + unsigned type; + ino_t ino; + + child = d_hash_and_lookup(dir, &qname); + if (!child) { + child = d_alloc(dir, &qname); + if (!child) + goto end_instantiate; + if (instantiate(d_inode(dir), child, task, ptr) < 0) { + dput(child); + goto end_instantiate; + } + } + inode = d_inode(child); + ino = inode->i_ino; + type = inode->i_mode >> 12; + dput(child); + return dir_emit(ctx, name, len, ino, type); + +end_instantiate: + return dir_emit(ctx, name, len, 1, DT_UNKNOWN); +} + +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EPERM; + goto out_notask; + } + + inode = d_inode(dentry); + task = get_proc_task(inode); + if (!task) + goto out_notask; + + mm = mm_access(task, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(mm)) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(d_inode(dentry)); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + rc = -ENOENT; + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + fmode_t mode; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static int +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + fmode_t mode = (fmode_t)(unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return -ENOENT; + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + d_add(dentry, inode); + + return 0; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + int result; + struct mm_struct *mm; + + result = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = -ENOENT; + task = get_proc_task(dir); + if (!task) + goto out; + + result = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + result = -ENOENT; + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_put_task; + + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + if (vma->vm_file) + result = proc_map_files_instantiate(dir, dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_put_task: + put_task_struct(task); +out: + return ERR_PTR(result); +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *file, struct dir_context *ctx) +{ + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + int ret; + + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(file_inode(file)); + if (!task) + goto out; + + ret = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ret = 0; + if (!dir_emit_dots(file, ctx)) + goto out_put_task; + + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > ctx->pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_put_task; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; + + info.mode = vma->vm_file->f_mode; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + if (!proc_fill_cache(file, ctx, + p->name, p->len, + proc_map_files_instantiate, + task, + (void *)(unsigned long)p->mode)) + break; + ctx->pos++; + } + if (fa) + flex_array_free(fa); + mmput(mm); + +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .iterate = proc_map_files_readdir, + .llseek = default_llseek, +}; + +struct timers_private { + struct pid *pid; + struct task_struct *task; + struct sighand_struct *sighand; + struct pid_namespace *ns; + unsigned long flags; +}; + +static void *timers_start(struct seq_file *m, loff_t *pos) +{ + struct timers_private *tp = m->private; + + tp->task = get_pid_task(tp->pid, PIDTYPE_PID); + if (!tp->task) + return ERR_PTR(-ESRCH); + + tp->sighand = lock_task_sighand(tp->task, &tp->flags); + if (!tp->sighand) + return ERR_PTR(-ESRCH); + + return seq_list_start(&tp->task->signal->posix_timers, *pos); +} + +static void *timers_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct timers_private *tp = m->private; + return seq_list_next(v, &tp->task->signal->posix_timers, pos); +} + +static void timers_stop(struct seq_file *m, void *v) +{ + struct timers_private *tp = m->private; + + if (tp->sighand) { + unlock_task_sighand(tp->task, &tp->flags); + tp->sighand = NULL; + } + + if (tp->task) { + put_task_struct(tp->task); + tp->task = NULL; + } +} + +static int show_timer(struct seq_file *m, void *v) +{ + struct k_itimer *timer; + struct timers_private *tp = m->private; + int notify; + static const char * const nstr[] = { + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", + }; + + timer = list_entry((struct list_head *)v, struct k_itimer, list); + notify = timer->it_sigev_notify; + + seq_printf(m, "ID: %d\n", timer->it_id); + seq_printf(m, "signal: %d/%p\n", + timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? "tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); + seq_printf(m, "ClockID: %d\n", timer->it_clock); + + return 0; +} + +static const struct seq_operations proc_timers_seq_ops = { + .start = timers_start, + .next = timers_next, + .stop = timers_stop, + .show = show_timer, +}; + +static int proc_timers_open(struct inode *inode, struct file *file) +{ + struct timers_private *tp; + + tp = __seq_open_private(file, &proc_timers_seq_ops, + sizeof(struct timers_private)); + if (!tp) + return -ENOMEM; + + tp->pid = proc_pid(inode); + tp->ns = inode->i_sb->s_fs_info; + return 0; +} + +static const struct file_operations proc_timers_operations = { + .open = proc_timers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_CHECKPOINT_RESTORE */ + +static int proc_pident_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + set_nlink(inode, 2); /* Use getattr to fix if necessary */ + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, 0)) + return 0; +out: + return -ENOENT; +} + +static struct dentry *proc_pident_lookup(struct inode *dir, + struct dentry *dentry, + const struct pid_entry *ents, + unsigned int nents) +{ + int error; + struct task_struct *task = get_proc_task(dir); + const struct pid_entry *p, *last; + + error = -ENOENT; + + if (!task) + goto out_no_task; + + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc// without very good reasons. + */ + last = &ents[nents - 1]; + for (p = ents; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_pident_instantiate(dir, dentry, task, p); +out: + put_task_struct(task); +out_no_task: + return ERR_PTR(error); +} + +static int proc_pident_readdir(struct file *file, struct dir_context *ctx, + const struct pid_entry *ents, unsigned int nents) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + const struct pid_entry *p; + + if (!task) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + goto out; + + if (ctx->pos >= nents + 2) + goto out; + + for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + if (!proc_fill_cache(file, ctx, p->name, p->len, + proc_pident_instantiate, task, p)) + break; + ctx->pos++; + } +out: + put_task_struct(task); + return 0; +} + +#ifdef CONFIG_SECURITY +static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + char *p = NULL; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + + length = security_getprocattr(task, + (char*)file->f_path.dentry->d_name.name, + &p); + put_task_struct(task); + if (length > 0) + length = simple_read_from_buffer(buf, count, ppos, p, length); + kfree(p); + return length; +} + +static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + char *page; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; + page = (char*)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out; + + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out_free; + + /* Guard against adverse ptrace interaction */ + length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + if (length < 0) + goto out_free; + + length = security_setprocattr(task, + (char*)file->f_path.dentry->d_name.name, + (void*)page, count); + mutex_unlock(&task->signal->cred_guard_mutex); +out_free: + free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: + return length; +} + +static const struct file_operations proc_pid_attr_operations = { + .read = proc_pid_attr_read, + .write = proc_pid_attr_write, + .llseek = generic_file_llseek, +}; + +static const struct pid_entry attr_dir_stuff[] = { + REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("prev", S_IRUGO, proc_pid_attr_operations), + REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), +}; + +static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) +{ + return proc_pident_readdir(file, ctx, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct file_operations proc_attr_dir_operations = { + .read = generic_read_dir, + .iterate = proc_attr_dir_readdir, + .llseek = default_llseek, +}; + +static struct dentry *proc_attr_dir_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + return proc_pident_lookup(dir, dentry, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct inode_operations proc_attr_dir_inode_operations = { + .lookup = proc_attr_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +#endif + +#ifdef CONFIG_ELF_CORE +static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct mm_struct *mm; + char buffer[PROC_NUMBUF]; + size_t len; + int ret; + + if (!task) + return -ESRCH; + + ret = 0; + mm = get_task_mm(task); + if (mm) { + len = snprintf(buffer, sizeof(buffer), "%08lx\n", + ((mm->flags & MMF_DUMP_FILTER_MASK) >> + MMF_DUMP_FILTER_SHIFT)); + mmput(mm); + ret = simple_read_from_buffer(buf, count, ppos, buffer, len); + } + + put_task_struct(task); + + return ret; +} + +static ssize_t proc_coredump_filter_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + struct task_struct *task; + struct mm_struct *mm; + char buffer[PROC_NUMBUF], *end; + unsigned int val; + int ret; + int i; + unsigned long mask; + + ret = -EFAULT; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + goto out_no_task; + + ret = -EINVAL; + val = (unsigned int)simple_strtoul(buffer, &end, 0); + if (*end == '\n') + end++; + if (end - buffer == 0) + goto out_no_task; + + ret = -ESRCH; + task = get_proc_task(file_inode(file)); + if (!task) + goto out_no_task; + + ret = end - buffer; + mm = get_task_mm(task); + if (!mm) + goto out_no_mm; + + for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { + if (val & mask) + set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + else + clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + } + + mmput(mm); + out_no_mm: + put_task_struct(task); + out_no_task: + return ret; +} + +static const struct file_operations proc_coredump_filter_operations = { + .read = proc_coredump_filter_read, + .write = proc_coredump_filter_write, + .llseek = generic_file_llseek, +}; +#endif + +#ifdef CONFIG_TASK_IO_ACCOUNTING +static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) +{ + struct task_io_accounting acct = task->ioac; + unsigned long flags; + int result; + + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); + } + seq_printf(m, + "rchar: %llu\n" + "wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)acct.rchar, + (unsigned long long)acct.wchar, + (unsigned long long)acct.syscr, + (unsigned long long)acct.syscw, + (unsigned long long)acct.read_bytes, + (unsigned long long)acct.write_bytes, + (unsigned long long)acct.cancelled_write_bytes); + result = 0; + +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; +} + +static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_io_accounting(task, m, 0); +} + +static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_io_accounting(task, m, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +#ifdef CONFIG_USER_NS +static int proc_id_map_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + struct seq_file *seq; + int ret = -EINVAL; + + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + ret = seq_open(file, seq_ops); + if (ret) + goto err_put_ns; + + seq = file->private_data; + seq->private = ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_id_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + put_user_ns(ns); + return seq_release(inode, file); +} + +static int proc_uid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_uid_seq_operations); +} + +static int proc_gid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_gid_seq_operations); +} + +static int proc_projid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_projid_seq_operations); +} + +static const struct file_operations proc_uid_map_operations = { + .open = proc_uid_map_open, + .write = proc_uid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_gid_map_operations = { + .open = proc_gid_map_open, + .write = proc_gid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_projid_map_operations = { + .open = proc_projid_map_open, + .write = proc_projid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static int proc_setgroups_open(struct inode *inode, struct file *file) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + int ret; + + ret = -ESRCH; + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + if (file->f_mode & FMODE_WRITE) { + ret = -EACCES; + if (!ns_capable(ns, CAP_SYS_ADMIN)) + goto err_put_ns; + } + + ret = single_open(file, &proc_setgroups_show, ns); + if (ret) + goto err_put_ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_setgroups_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + int ret = single_release(inode, file); + put_user_ns(ns); + return ret; +} + +static const struct file_operations proc_setgroups_operations = { + .open = proc_setgroups_open, + .write = proc_setgroups_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_setgroups_release, +}; +#endif /* CONFIG_USER_NS */ + +static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + int err = lock_trace(task); + if (!err) { + seq_printf(m, "%08x\n", task->personality); + unlock_trace(task); + } + return err; +} + +/* + * Thread groups + */ +static const struct file_operations proc_task_operations; +static const struct inode_operations proc_task_inode_operations; + +static const struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif + REG("environ", S_IRUSR, proc_environ_operations), + ONE("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUSR, proc_pid_personality), + ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + ONE("syscall", S_IRUSR, proc_pid_syscall), +#endif + ONE("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tgid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_pid_maps_operations), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), + REG("mountstats", S_IRUSR, proc_mountstats_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + ONE("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUSR, proc_pid_stack), +#endif +#ifdef CONFIG_SCHEDSTATS + ONE("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + ONE("cpuset", S_IRUGO, proc_cpuset_show), +#endif +#ifdef CONFIG_CGROUPS + ONE("cgroup", S_IRUGO, proc_cgroup_show), +#endif + ONE("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), +#endif +#ifdef CONFIG_ELF_CORE + REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + ONE("io", S_IRUSR, proc_tgid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + ONE("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), +#endif +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("timers", S_IRUGO, proc_timers_operations), +#endif +}; + +static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) +{ + return proc_pident_readdir(file, ctx, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct file_operations proc_tgid_base_operations = { + .read = generic_read_dir, + .iterate = proc_tgid_base_readdir, + .llseek = default_llseek, +}; + +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_pident_lookup(dir, dentry, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct inode_operations proc_tgid_base_inode_operations = { + .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, + .permission = proc_pid_permission, +}; + +static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) +{ + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", pid); + /* no ->d_hash() rejects on procfs */ + dentry = d_hash_and_lookup(mnt->mnt_root, &name); + if (dentry) { + d_invalidate(dentry); + dput(dentry); + } + + if (pid == tgid) + return; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", tgid); + leader = d_hash_and_lookup(mnt->mnt_root, &name); + if (!leader) + goto out; + + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + d_invalidate(dentry); + dput(dentry); + } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; +} + +/** + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * @task: task that should be flushed. + * + * When flushing dentries from proc, one needs to flush them from global + * proc (proc_mnt) and from all the namespaces' procs this task was seen + * in. This call is supposed to do all of this job. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present flushes it and all of it'ts children + * from the dcache. + * + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is proved to flush those useless + * dcache entries at process exit time. + * + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time it + * just makes it very unlikely that any will persist. + */ + +void proc_flush_task(struct task_struct *task) +{ + int i; + struct pid *pid, *tgid; + struct upid *upid; + + pid = task_pid(task); + tgid = task_tgid(task); + + for (i = 0; i <= pid->level; i++) { + upid = &pid->numbers[i]; + proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, + tgid->numbers[i].nr); + } +} + +static int proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, + struct task_struct *task, const void *ptr) +{ + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tgid_base_inode_operations; + inode->i_fop = &proc_tgid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff))); + + d_set_d_op(dentry, &pid_dentry_operations); + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, 0)) + return 0; +out: + return -ENOENT; +} + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +{ + int result = -ENOENT; + struct task_struct *task; + unsigned tgid; + struct pid_namespace *ns; + + tgid = name_to_int(&dentry->d_name); + if (tgid == ~0U) + goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); + task = find_task_by_pid_ns(tgid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + + result = proc_pid_instantiate(dir, dentry, task, NULL); + put_task_struct(task); +out: + return ERR_PTR(result); +} + +/* + * Find the first task with tgid >= tgid + * + */ +struct tgid_iter { + unsigned int tgid; + struct task_struct *task; +}; +static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) +{ + struct pid *pid; + + if (iter.task) + put_task_struct(iter.task); + rcu_read_lock(); +retry: + iter.task = NULL; + pid = find_ge_pid(iter.tgid, ns); + if (pid) { + iter.tgid = pid_nr_ns(pid, ns); + iter.task = pid_task(pid, PIDTYPE_PID); + /* What we to know is if the pid we have find is the + * pid of a thread_group_leader. Testing for task + * being a thread_group_leader is the obvious thing + * todo but there is a window when it fails, due to + * the pid transfer logic in de_thread. + * + * So we perform the straight forward test of seeing + * if the pid we have found is the pid of a thread + * group leader, and don't worry if the task we have + * found doesn't happen to be a thread group leader. + * As we don't care in the case of readdir. + */ + if (!iter.task || !has_group_leader_pid(iter.task)) { + iter.tgid += 1; + goto retry; + } + get_task_struct(iter.task); + } + rcu_read_unlock(); + return iter; +} + +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) + +/* for the /proc/ directory itself, after non-process stuff has been done */ +int proc_pid_readdir(struct file *file, struct dir_context *ctx) +{ + struct tgid_iter iter; + struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info; + loff_t pos = ctx->pos; + + if (pos >= PID_MAX_LIMIT + TGID_OFFSET) + return 0; + + if (pos == TGID_OFFSET - 2) { + struct inode *inode = d_inode(ns->proc_self); + if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + return 0; + ctx->pos = pos = pos + 1; + } + if (pos == TGID_OFFSET - 1) { + struct inode *inode = d_inode(ns->proc_thread_self); + if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + return 0; + ctx->pos = pos = pos + 1; + } + iter.tgid = pos - TGID_OFFSET; + iter.task = NULL; + for (iter = next_tgid(ns, iter); + iter.task; + iter.tgid += 1, iter = next_tgid(ns, iter)) { + char name[PROC_NUMBUF]; + int len; + if (!has_pid_permissions(ns, iter.task, 2)) + continue; + + len = snprintf(name, sizeof(name), "%d", iter.tgid); + ctx->pos = iter.tgid + TGID_OFFSET; + if (!proc_fill_cache(file, ctx, name, len, + proc_pid_instantiate, iter.task, NULL)) { + put_task_struct(iter.task); + return 0; + } + } + ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; + return 0; +} + +/* + * Tasks + */ +static const struct pid_entry tid_base_stuff[] = { + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif + REG("environ", S_IRUSR, proc_environ_operations), + ONE("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUSR, proc_pid_personality), + ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + ONE("syscall", S_IRUSR, proc_pid_syscall), +#endif + ONE("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_tid_maps_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("children", S_IRUGO, proc_tid_children_operations), +#endif +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_tid_smaps_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + ONE("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUSR, proc_pid_stack), +#endif +#ifdef CONFIG_SCHEDSTATS + ONE("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + ONE("cpuset", S_IRUGO, proc_cpuset_show), +#endif +#ifdef CONFIG_CGROUPS + ONE("cgroup", S_IRUGO, proc_cgroup_show), +#endif + ONE("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + ONE("io", S_IRUSR, proc_tid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + ONE("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), +#endif +}; + +static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) +{ + return proc_pident_readdir(file, ctx, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); +} + +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_pident_lookup(dir, dentry, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); +} + +static const struct file_operations proc_tid_base_operations = { + .read = generic_read_dir, + .iterate = proc_tid_base_readdir, + .llseek = default_llseek, +}; + +static const struct inode_operations proc_tid_base_inode_operations = { + .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +static int proc_task_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + struct inode *inode; + inode = proc_pid_make_inode(dir->i_sb, task); + + if (!inode) + goto out; + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tid_base_inode_operations; + inode->i_fop = &proc_tid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff))); + + d_set_d_op(dentry, &pid_dentry_operations); + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, 0)) + return 0; +out: + return -ENOENT; +} + +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +{ + int result = -ENOENT; + struct task_struct *task; + struct task_struct *leader = get_proc_task(dir); + unsigned tid; + struct pid_namespace *ns; + + if (!leader) + goto out_no_task; + + tid = name_to_int(&dentry->d_name); + if (tid == ~0U) + goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); + task = find_task_by_pid_ns(tid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + if (!same_thread_group(leader, task)) + goto out_drop_task; + + result = proc_task_instantiate(dir, dentry, task, NULL); +out_drop_task: + put_task_struct(task); +out: + put_task_struct(leader); +out_no_task: + return ERR_PTR(result); +} + +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the users + * buffer was too small or there was a seek into the middle of the + * directory we have more work todo. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. + */ +static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, + struct pid_namespace *ns) +{ + struct task_struct *pos, *task; + unsigned long nr = f_pos; + + if (nr != f_pos) /* 32bit overflow? */ + return NULL; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + if (!task) + goto fail; + + /* Attempt to start with the tid of a thread */ + if (tid && nr) { + pos = find_task_by_pid_ns(tid, ns); + if (pos && same_thread_group(pos, task)) + goto found; + } + + /* If nr exceeds the number of threads there is nothing todo */ + if (nr >= get_nr_threads(task)) + goto fail; + + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward. + */ + pos = task = task->group_leader; + do { + if (!nr--) + goto found; + } while_each_thread(task, pos); +fail: + pos = NULL; + goto out; +found: + get_task_struct(pos); +out: + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; +} + +/* for the /proc/TGID/task/ directories */ +static int proc_task_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct task_struct *task; + struct pid_namespace *ns; + int tid; + + if (proc_inode_is_dead(inode)) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + return 0; + + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. + */ + ns = inode->i_sb->s_fs_info; + tid = (int)file->f_version; + file->f_version = 0; + for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); + task; + task = next_tid(task), ctx->pos++) { + char name[PROC_NUMBUF]; + int len; + tid = task_pid_nr_ns(task, ns); + len = snprintf(name, sizeof(name), "%d", tid); + if (!proc_fill_cache(file, ctx, name, len, + proc_task_instantiate, task, NULL)) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + file->f_version = (u64)tid; + put_task_struct(task); + break; + } + } + + return 0; +} + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + stat->nlink += get_nr_threads(p); + put_task_struct(p); + } + + return 0; +} + +static const struct inode_operations proc_task_inode_operations = { + .lookup = proc_task_lookup, + .getattr = proc_task_getattr, + .setattr = proc_setattr, + .permission = proc_pid_permission, +}; + +static const struct file_operations proc_task_operations = { + .read = generic_read_dir, + .iterate = proc_task_readdir, + .llseek = default_llseek, +}; diff --git a/kernel/fs/proc/cmdline.c b/kernel/fs/proc/cmdline.c new file mode 100644 index 000000000..cbd82dff7 --- /dev/null +++ b/kernel/fs/proc/cmdline.c @@ -0,0 +1,29 @@ +#include +#include +#include +#include + +static int cmdline_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", saved_command_line); + return 0; +} + +static int cmdline_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cmdline_proc_show, NULL); +} + +static const struct file_operations cmdline_proc_fops = { + .open = cmdline_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_cmdline_init(void) +{ + proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + return 0; +} +fs_initcall(proc_cmdline_init); diff --git a/kernel/fs/proc/consoles.c b/kernel/fs/proc/consoles.c new file mode 100644 index 000000000..290ba85cb --- /dev/null +++ b/kernel/fs/proc/consoles.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2010 Werner Fink, Jiri Slaby + * + * Licensed under GPLv2 + */ + +#include +#include +#include +#include +#include + +/* + * This is handler for /proc/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + static const struct { + short flag; + char name; + } con_flags[] = { + { CON_ENABLED, 'E' }, + { CON_CONSDEV, 'C' }, + { CON_BOOT, 'B' }, + { CON_PRINTBUFFER, 'p' }, + { CON_BRL, 'b' }, + { CON_ANYTIME, 'a' }, + }; + char flags[ARRAY_SIZE(con_flags) + 1]; + struct console *con = v; + unsigned int a; + dev_t dev = 0; + + if (con->device) { + const struct tty_driver *driver; + int index; + driver = con->device(con, &index); + if (driver) { + dev = MKDEV(driver->major, driver->minor_start); + dev += index; + } + } + + for (a = 0; a < ARRAY_SIZE(con_flags); a++) + flags[a] = (con->flags & con_flags[a].flag) ? + con_flags[a].name : ' '; + flags[a] = 0; + + seq_setwidth(m, 21 - 1); + seq_printf(m, "%s%d", con->name, con->index); + seq_pad(m, ' '); + seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 'U' : '-', + flags); + if (dev) + seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); + + seq_printf(m, "\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + console_lock(); + for_each_console(con) + if (off++ == *pos) + break; + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con = v; + ++*pos; + return con->next; +} + +static void c_stop(struct seq_file *m, void *v) +{ + console_unlock(); +} + +static const struct seq_operations consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +static int consoles_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &consoles_op); +} + +static const struct file_operations proc_consoles_operations = { + .open = consoles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_consoles_init(void) +{ + proc_create("consoles", 0, NULL, &proc_consoles_operations); + return 0; +} +fs_initcall(proc_consoles_init); diff --git a/kernel/fs/proc/cpuinfo.c b/kernel/fs/proc/cpuinfo.c new file mode 100644 index 000000000..06f4d31e0 --- /dev/null +++ b/kernel/fs/proc/cpuinfo.c @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +extern const struct seq_operations cpuinfo_op; +static int cpuinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cpuinfo_op); +} + +static const struct file_operations proc_cpuinfo_operations = { + .open = cpuinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_cpuinfo_init(void) +{ + proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + return 0; +} +fs_initcall(proc_cpuinfo_init); diff --git a/kernel/fs/proc/devices.c b/kernel/fs/proc/devices.c new file mode 100644 index 000000000..50493edc3 --- /dev/null +++ b/kernel/fs/proc/devices.c @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +static int devinfo_show(struct seq_file *f, void *v) +{ + int i = *(loff_t *) v; + + if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i == 0) + seq_puts(f, "Character devices:\n"); + chrdev_show(f, i); + } +#ifdef CONFIG_BLOCK + else { + i -= CHRDEV_MAJOR_HASH_SIZE; + if (i == 0) + seq_puts(f, "\nBlock devices:\n"); + blkdev_show(f, i); + } +#endif + return 0; +} + +static void *devinfo_start(struct seq_file *f, loff_t *pos) +{ + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return pos; + return NULL; +} + +static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return NULL; + return pos; +} + +static void devinfo_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations devinfo_ops = { + .start = devinfo_start, + .next = devinfo_next, + .stop = devinfo_stop, + .show = devinfo_show +}; + +static int devinfo_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &devinfo_ops); +} + +static const struct file_operations proc_devinfo_operations = { + .open = devinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_devices_init(void) +{ + proc_create("devices", 0, NULL, &proc_devinfo_operations); + return 0; +} +fs_initcall(proc_devices_init); diff --git a/kernel/fs/proc/fd.c b/kernel/fs/proc/fd.c new file mode 100644 index 000000000..6e5fcd007 --- /dev/null +++ b/kernel/fs/proc/fd.c @@ -0,0 +1,357 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../mount.h" +#include "internal.h" +#include "fd.h" + +static int seq_show(struct seq_file *m, void *v) +{ + struct files_struct *files = NULL; + int f_flags = 0, ret = -ENOENT; + struct file *file = NULL; + struct task_struct *task; + + task = get_proc_task(m->private); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + + if (files) { + int fd = proc_fd(m->private); + + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + struct fdtable *fdt = files_fdtable(files); + + f_flags = file->f_flags; + if (close_on_exec(fd, fdt)) + f_flags |= O_CLOEXEC; + + get_file(file); + ret = 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + + if (ret) + return ret; + + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); + + show_fd_locks(m, file, files); + if (seq_has_overflowed(m)) + goto out; + + if (file->f_op->show_fdinfo) + file->f_op->show_fdinfo(m, file); + +out: + fput(file); + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_show, inode); +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = seq_fdinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct files_struct *files; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int fd; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = d_inode(dentry); + task = get_proc_task(inode); + fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + if (files) { + struct file *file; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + unsigned f_mode = file->f_mode; + + rcu_read_unlock(); + put_files_struct(files); + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + rcu_read_unlock(); + put_files_struct(files); + } + put_task_struct(task); + } + return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = { + .d_revalidate = tid_fd_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_fd_link(struct dentry *dentry, struct path *path) +{ + struct files_struct *files = NULL; + struct task_struct *task; + int ret = -ENOENT; + + task = get_proc_task(d_inode(dentry)); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + + if (files) { + int fd = proc_fd(d_inode(dentry)); + struct file *fd_file; + + spin_lock(&files->file_lock); + fd_file = fcheck_files(files, fd); + if (fd_file) { + *path = fd_file->f_path; + path_get(&fd_file->f_path); + ret = 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + + return ret; +} + +static int +proc_fd_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + unsigned fd = (unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFLNK; + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + + ei->op.proc_get_link = proc_fd_link; + + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, 0)) + return 0; + out: + return -ENOENT; +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, + struct dentry *dentry, + instantiate_t instantiate) +{ + struct task_struct *task = get_proc_task(dir); + int result = -ENOENT; + unsigned fd = name_to_int(&dentry->d_name); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); +out: + put_task_struct(task); +out_no_task: + return ERR_PTR(result); +} + +static int proc_readfd_common(struct file *file, struct dir_context *ctx, + instantiate_t instantiate) +{ + struct task_struct *p = get_proc_task(file_inode(file)); + struct files_struct *files; + unsigned int fd; + + if (!p) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + goto out; + files = get_files_struct(p); + if (!files) + goto out; + + rcu_read_lock(); + for (fd = ctx->pos - 2; + fd < files_fdtable(files)->max_fds; + fd++, ctx->pos++) { + char name[PROC_NUMBUF]; + int len; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + if (!proc_fill_cache(file, ctx, + name, len, instantiate, p, + (void *)(unsigned long)fd)) + goto out_fd_loop; + rcu_read_lock(); + } + rcu_read_unlock(); +out_fd_loop: + put_files_struct(files); +out: + put_task_struct(p); + return 0; +} + +static int proc_readfd(struct file *file, struct dir_context *ctx) +{ + return proc_readfd_common(file, ctx, proc_fd_instantiate); +} + +const struct file_operations proc_fd_operations = { + .read = generic_read_dir, + .iterate = proc_readfd, + .llseek = default_llseek, +}; + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). + */ +int proc_fd_permission(struct inode *inode, int mask) +{ + int rv = generic_permission(inode, mask); + if (rv == 0) + return 0; + if (task_tgid(current) == proc_pid(inode)) + rv = 0; + return rv; +} + +const struct inode_operations proc_fd_inode_operations = { + .lookup = proc_lookupfd, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + unsigned fd = (unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; + + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, 0)) + return 0; + out: + return -ENOENT; +} + +static struct dentry * +proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *file, struct dir_context *ctx) +{ + return proc_readfd_common(file, ctx, + proc_fdinfo_instantiate); +} + +const struct inode_operations proc_fdinfo_inode_operations = { + .lookup = proc_lookupfdinfo, + .setattr = proc_setattr, +}; + +const struct file_operations proc_fdinfo_operations = { + .read = generic_read_dir, + .iterate = proc_readfdinfo, + .llseek = default_llseek, +}; diff --git a/kernel/fs/proc/fd.h b/kernel/fs/proc/fd.h new file mode 100644 index 000000000..7c047f256 --- /dev/null +++ b/kernel/fs/proc/fd.h @@ -0,0 +1,19 @@ +#ifndef __PROCFS_FD_H__ +#define __PROCFS_FD_H__ + +#include + +extern const struct file_operations proc_fd_operations; +extern const struct inode_operations proc_fd_inode_operations; + +extern const struct file_operations proc_fdinfo_operations; +extern const struct inode_operations proc_fdinfo_inode_operations; + +extern int proc_fd_permission(struct inode *inode, int mask); + +static inline int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + +#endif /* __PROCFS_FD_H__ */ diff --git a/kernel/fs/proc/generic.c b/kernel/fs/proc/generic.c new file mode 100644 index 000000000..e5dee5c31 --- /dev/null +++ b/kernel/fs/proc/generic.c @@ -0,0 +1,645 @@ +/* + * proc/fs/generic.c --- generic routines for the proc-fs + * + * This file contains generic proc-fs routines for handling + * directories and files. + * + * Copyright (C) 1991, 1992 Linus Torvalds. + * Copyright (C) 1997 Theodore Ts'o + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static DEFINE_SPINLOCK(proc_subdir_lock); + +static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +{ + if (len < de->namelen) + return -1; + if (len > de->namelen) + return 1; + + return memcmp(name, de->name, len); +} + +static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) +{ + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); +} + +static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) +{ + return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry, + subdir_node); +} + +static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, + const char *name, + unsigned int len) +{ + struct rb_node *node = dir->subdir.rb_node; + + while (node) { + struct proc_dir_entry *de = container_of(node, + struct proc_dir_entry, + subdir_node); + int result = proc_match(len, name, de); + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return de; + } + return NULL; +} + +static bool pde_subdir_insert(struct proc_dir_entry *dir, + struct proc_dir_entry *de) +{ + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct proc_dir_entry *this = + container_of(*new, struct proc_dir_entry, subdir_node); + int result = proc_match(de->namelen, de->name, this); + + parent = *new; + if (result < 0) + new = &(*new)->rb_left; + else if (result > 0) + new = &(*new)->rb_right; + else + return false; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&de->subdir_node, parent, new); + rb_insert_color(&de->subdir_node, root); + return true; +} + +static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + struct proc_dir_entry *de = PDE(inode); + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + proc_set_user(de, inode->i_uid, inode->i_gid); + de->mode = inode->i_mode; + return 0; +} + +static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct proc_dir_entry *de = PDE(inode); + if (de && de->nlink) + set_nlink(inode, de->nlink); + + generic_fillattr(inode, stat); + return 0; +} + +static const struct inode_operations proc_file_inode_operations = { + .setattr = proc_notify_change, +}; + +/* + * This function parses a name such as "tty/driver/serial", and + * returns the struct proc_dir_entry for "/proc/tty/driver", and + * returns "serial" in residual. + */ +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + const char *cp = name, *next; + struct proc_dir_entry *de; + unsigned int len; + + de = *ret; + if (!de) + de = &proc_root; + + while (1) { + next = strchr(cp, '/'); + if (!next) + break; + + len = next - cp; + de = pde_subdir_find(de, cp, len); + if (!de) { + WARN(1, "name '%s'\n", name); + return -ENOENT; + } + cp += len + 1; + } + *residual = cp; + *ret = de; + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + spin_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); + spin_unlock(&proc_subdir_lock); + return rv; +} + +static DEFINE_IDA(proc_inum_ida); +static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ + +#define PROC_DYNAMIC_FIRST 0xF0000000U + +/* + * Return an inode number between PROC_DYNAMIC_FIRST and + * 0xffffffff, or zero on failure. + */ +int proc_alloc_inum(unsigned int *inum) +{ + unsigned int i; + int error; + +retry: + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock_irq(&proc_inum_lock); + error = ida_get_new(&proc_inum_ida, &i); + spin_unlock_irq(&proc_inum_lock); + if (error == -EAGAIN) + goto retry; + else if (error) + return error; + + if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { + spin_lock_irq(&proc_inum_lock); + ida_remove(&proc_inum_ida, i); + spin_unlock_irq(&proc_inum_lock); + return -ENOSPC; + } + *inum = PROC_DYNAMIC_FIRST + i; + return 0; +} + +void proc_free_inum(unsigned int inum) +{ + unsigned long flags; + spin_lock_irqsave(&proc_inum_lock, flags); + ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + spin_unlock_irqrestore(&proc_inum_lock, flags); +} + +/* + * Don't create negative dentries here, return -ENOENT by hand + * instead. + */ +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode; + + spin_lock(&proc_subdir_lock); + de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); + if (de) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + inode = proc_get_inode(dir->i_sb, de); + if (!inode) + return ERR_PTR(-ENOMEM); + d_set_d_op(dentry, &simple_dentry_operations); + d_add(dentry, inode); + return NULL; + } + spin_unlock(&proc_subdir_lock); + return ERR_PTR(-ENOENT); +} + +struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + return proc_lookup_de(PDE(dir), dir, dentry); +} + +/* + * This returns non-zero if at EOF, so that the /proc + * root directory can use this and check if it should + * continue with the entries.. + * + * Note that the VFS-layer doesn't care about the return + * value of the readdir() call, as long as it's non-negative + * for success.. + */ +int proc_readdir_de(struct proc_dir_entry *de, struct file *file, + struct dir_context *ctx) +{ + int i; + + if (!dir_emit_dots(file, ctx)) + return 0; + + spin_lock(&proc_subdir_lock); + de = pde_subdir_first(de); + i = ctx->pos - 2; + for (;;) { + if (!de) { + spin_unlock(&proc_subdir_lock); + return 0; + } + if (!i) + break; + de = pde_subdir_next(de); + i--; + } + + do { + struct proc_dir_entry *next; + pde_get(de); + spin_unlock(&proc_subdir_lock); + if (!dir_emit(ctx, de->name, de->namelen, + de->low_ino, de->mode >> 12)) { + pde_put(de); + return 0; + } + spin_lock(&proc_subdir_lock); + ctx->pos++; + next = pde_subdir_next(de); + pde_put(de); + de = next; + } while (de); + spin_unlock(&proc_subdir_lock); + return 1; +} + +int proc_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + + return proc_readdir_de(PDE(inode), file, ctx); +} + +/* + * These are the generic /proc directory operations. They + * use the in-memory "struct proc_dir_entry" tree to parse + * the /proc directory. + */ +static const struct file_operations proc_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = proc_readdir, +}; + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_dir_inode_operations = { + .lookup = proc_lookup, + .getattr = proc_getattr, + .setattr = proc_notify_change, +}; + +static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +{ + int ret; + + ret = proc_alloc_inum(&dp->low_ino); + if (ret) + return ret; + + spin_lock(&proc_subdir_lock); + dp->parent = dir; + if (pde_subdir_insert(dir, dp) == false) { + WARN(1, "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); + spin_unlock(&proc_subdir_lock); + proc_free_inum(dp->low_ino); + return -EEXIST; + } + spin_unlock(&proc_subdir_lock); + + return 0; +} + +static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, + const char *name, + umode_t mode, + nlink_t nlink) +{ + struct proc_dir_entry *ent = NULL; + const char *fn; + struct qstr qstr; + + if (xlate_proc_name(name, parent, &fn) != 0) + goto out; + qstr.name = fn; + qstr.len = strlen(fn); + if (qstr.len == 0 || qstr.len >= 256) { + WARN(1, "name len %u\n", qstr.len); + return NULL; + } + if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { + WARN(1, "create '/proc/%s' by hand\n", qstr.name); + return NULL; + } + if (is_empty_pde(*parent)) { + WARN(1, "attempt to add to permanently empty directory"); + return NULL; + } + + ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); + if (!ent) + goto out; + + memcpy(ent->name, fn, qstr.len + 1); + ent->namelen = qstr.len; + ent->mode = mode; + ent->nlink = nlink; + ent->subdir = RB_ROOT; + atomic_set(&ent->count, 1); + spin_lock_init(&ent->pde_unload_lock); + INIT_LIST_HEAD(&ent->pde_openers); +out: + return ent; +} + +struct proc_dir_entry *proc_symlink(const char *name, + struct proc_dir_entry *parent, const char *dest) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, + (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); + + if (ent) { + ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + if (ent->data) { + strcpy((char*)ent->data,dest); + ent->proc_iops = &proc_link_inode_operations; + if (proc_register(parent, ent) < 0) { + kfree(ent->data); + kfree(ent); + ent = NULL; + } + } else { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(proc_symlink); + +struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) +{ + struct proc_dir_entry *ent; + + if (mode == 0) + mode = S_IRUGO | S_IXUGO; + + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); + if (ent) { + ent->data = data; + ent->proc_fops = &proc_dir_operations; + ent->proc_iops = &proc_dir_inode_operations; + parent->nlink++; + if (proc_register(parent, ent) < 0) { + kfree(ent); + parent->nlink--; + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL_GPL(proc_mkdir_data); + +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, + struct proc_dir_entry *parent) +{ + return proc_mkdir_data(name, mode, parent, NULL); +} +EXPORT_SYMBOL(proc_mkdir_mode); + +struct proc_dir_entry *proc_mkdir(const char *name, + struct proc_dir_entry *parent) +{ + return proc_mkdir_data(name, 0, parent, NULL); +} +EXPORT_SYMBOL(proc_mkdir); + +struct proc_dir_entry *proc_create_mount_point(const char *name) +{ + umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO; + struct proc_dir_entry *ent, *parent = NULL; + + ent = __proc_create(&parent, name, mode, 2); + if (ent) { + ent->data = NULL; + ent->proc_fops = NULL; + ent->proc_iops = NULL; + if (proc_register(parent, ent) < 0) { + kfree(ent); + parent->nlink--; + ent = NULL; + } + } + return ent; +} + +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, + void *data) +{ + struct proc_dir_entry *pde; + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + + if (!S_ISREG(mode)) { + WARN_ON(1); /* use proc_mkdir() */ + return NULL; + } + + BUG_ON(proc_fops == NULL); + + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + pde = __proc_create(&parent, name, mode, 1); + if (!pde) + goto out; + pde->proc_fops = proc_fops; + pde->data = data; + pde->proc_iops = &proc_file_inode_operations; + if (proc_register(parent, pde) < 0) + goto out_free; + return pde; +out_free: + kfree(pde); +out: + return NULL; +} +EXPORT_SYMBOL(proc_create_data); + +void proc_set_size(struct proc_dir_entry *de, loff_t size) +{ + de->size = size; +} +EXPORT_SYMBOL(proc_set_size); + +void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) +{ + de->uid = uid; + de->gid = gid; +} +EXPORT_SYMBOL(proc_set_user); + +static void free_proc_entry(struct proc_dir_entry *de) +{ + proc_free_inum(de->low_ino); + + if (S_ISLNK(de->mode)) + kfree(de->data); + kfree(de); +} + +void pde_put(struct proc_dir_entry *pde) +{ + if (atomic_dec_and_test(&pde->count)) + free_proc_entry(pde); +} + +/* + * Remove a /proc entry and free it if it's not currently in use. + */ +void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry *de = NULL; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return; + } + len = strlen(fn); + + de = pde_subdir_find(parent, fn, len); + if (de) + rb_erase(&de->subdir_node, &parent->subdir); + spin_unlock(&proc_subdir_lock); + if (!de) { + WARN(1, "name '%s'\n", name); + return; + } + + proc_entry_rundown(de); + + if (S_ISDIR(de->mode)) + parent->nlink--; + de->nlink = 0; + WARN(pde_subdir_first(de), + "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", + __func__, de->parent->name, de->name, pde_subdir_first(de)->name); + pde_put(de); +} +EXPORT_SYMBOL(remove_proc_entry); + +int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry *root = NULL, *de, *next; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + len = strlen(fn); + + root = pde_subdir_find(parent, fn, len); + if (!root) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + rb_erase(&root->subdir_node, &parent->subdir); + + de = root; + while (1) { + next = pde_subdir_first(de); + if (next) { + rb_erase(&next->subdir_node, &de->subdir); + de = next; + continue; + } + spin_unlock(&proc_subdir_lock); + + proc_entry_rundown(de); + next = de->parent; + if (S_ISDIR(de->mode)) + next->nlink--; + de->nlink = 0; + if (de == root) + break; + pde_put(de); + + spin_lock(&proc_subdir_lock); + de = next; + } + pde_put(root); + return 0; +} +EXPORT_SYMBOL(remove_proc_subtree); + +void *proc_get_parent_data(const struct inode *inode) +{ + struct proc_dir_entry *de = PDE(inode); + return de->parent->data; +} +EXPORT_SYMBOL_GPL(proc_get_parent_data); + +void proc_remove(struct proc_dir_entry *de) +{ + if (de) + remove_proc_subtree(de->name, de->parent); +} +EXPORT_SYMBOL(proc_remove); + +void *PDE_DATA(const struct inode *inode) +{ + return __PDE_DATA(inode); +} +EXPORT_SYMBOL(PDE_DATA); diff --git a/kernel/fs/proc/inode.c b/kernel/fs/proc/inode.c new file mode 100644 index 000000000..e3eb55246 --- /dev/null +++ b/kernel/fs/proc/inode.c @@ -0,0 +1,489 @@ +/* + * linux/fs/proc/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +static void proc_evict_inode(struct inode *inode) +{ + struct proc_dir_entry *de; + struct ctl_table_header *head; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); + + /* Let go of any associated proc directory entry */ + de = PDE(inode); + if (de) + pde_put(de); + head = PROC_I(inode)->sysctl; + if (head) { + RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } +} + +static struct kmem_cache * proc_inode_cachep; + +static struct inode *proc_alloc_inode(struct super_block *sb) +{ + struct proc_inode *ei; + struct inode *inode; + + ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + ei->pid = NULL; + ei->fd = 0; + ei->op.proc_get_link = NULL; + ei->pde = NULL; + ei->sysctl = NULL; + ei->sysctl_entry = NULL; + ei->ns_ops = NULL; + inode = &ei->vfs_inode; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + return inode; +} + +static void proc_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(proc_inode_cachep, PROC_I(inode)); +} + +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + +static void init_once(void *foo) +{ + struct proc_inode *ei = (struct proc_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void __init proc_init_inodecache(void) +{ + proc_inode_cachep = kmem_cache_create("proc_inode_cache", + sizeof(struct proc_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + init_once); +} + +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + + return 0; +} + +static const struct super_operations proc_sops = { + .alloc_inode = proc_alloc_inode, + .destroy_inode = proc_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = proc_evict_inode, + .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, +}; + +enum {BIAS = -1U<<31}; + +static inline int use_pde(struct proc_dir_entry *pde) +{ + return atomic_inc_unless_negative(&pde->in_use); +} + +static void unuse_pde(struct proc_dir_entry *pde) +{ + if (atomic_dec_return(&pde->in_use) == BIAS) + complete(pde->pde_unload_completion); +} + +/* pde is locked */ +static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) +{ + if (pdeo->closing) { + /* somebody else is doing that, just wait */ + DECLARE_COMPLETION_ONSTACK(c); + pdeo->c = &c; + spin_unlock(&pde->pde_unload_lock); + wait_for_completion(&c); + spin_lock(&pde->pde_unload_lock); + } else { + struct file *file; + pdeo->closing = 1; + spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; + pde->proc_fops->release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); + list_del_init(&pdeo->lh); + if (pdeo->c) + complete(pdeo->c); + kfree(pdeo); + } +} + +void proc_entry_rundown(struct proc_dir_entry *de) +{ + DECLARE_COMPLETION_ONSTACK(c); + /* Wait until all existing callers into module are done. */ + de->pde_unload_completion = &c; + if (atomic_add_return(BIAS, &de->in_use) != BIAS) + wait_for_completion(&c); + + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + close_pdeo(de, pdeo); + } + spin_unlock(&de->pde_unload_lock); +} + +static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + loff_t rv = -EINVAL; + if (use_pde(pde)) { + loff_t (*llseek)(struct file *, loff_t, int); + llseek = pde->proc_fops->llseek; + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + unuse_pde(pde); + } + return rv; +} + +static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + struct proc_dir_entry *pde = PDE(file_inode(file)); + ssize_t rv = -EIO; + if (use_pde(pde)) { + read = pde->proc_fops->read; + if (read) + rv = read(file, buf, count, ppos); + unuse_pde(pde); + } + return rv; +} + +static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + struct proc_dir_entry *pde = PDE(file_inode(file)); + ssize_t rv = -EIO; + if (use_pde(pde)) { + write = pde->proc_fops->write; + if (write) + rv = write(file, buf, count, ppos); + unuse_pde(pde); + } + return rv; +} + +static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned int rv = DEFAULT_POLLMASK; + unsigned int (*poll)(struct file *, struct poll_table_struct *); + if (use_pde(pde)) { + poll = pde->proc_fops->poll; + if (poll) + rv = poll(file, pts); + unuse_pde(pde); + } + return rv; +} + +static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + long rv = -ENOTTY; + long (*ioctl)(struct file *, unsigned int, unsigned long); + if (use_pde(pde)) { + ioctl = pde->proc_fops->unlocked_ioctl; + if (ioctl) + rv = ioctl(file, cmd, arg); + unuse_pde(pde); + } + return rv; +} + +#ifdef CONFIG_COMPAT +static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + long rv = -ENOTTY; + long (*compat_ioctl)(struct file *, unsigned int, unsigned long); + if (use_pde(pde)) { + compat_ioctl = pde->proc_fops->compat_ioctl; + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + unuse_pde(pde); + } + return rv; +} +#endif + +static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + int rv = -EIO; + int (*mmap)(struct file *, struct vm_area_struct *); + if (use_pde(pde)) { + mmap = pde->proc_fops->mmap; + if (mmap) + rv = mmap(file, vma); + unuse_pde(pde); + } + return rv; +} + +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; + + if (use_pde(pde)) { + typeof(proc_reg_get_unmapped_area) *get_area; + + get_area = pde->proc_fops->get_unmapped_area; +#ifdef CONFIG_MMU + if (!get_area) + get_area = current->mm->get_unmapped_area; +#endif + + if (get_area) + rv = get_area(file, orig_addr, len, pgoff, flags); + else + rv = orig_addr; + unuse_pde(pde); + } + return rv; +} + +static int proc_reg_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*open)(struct inode *, struct file *); + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + /* + * What for, you ask? Well, we can have open, rmmod, remove_proc_entry + * sequence. ->release won't be called because ->proc_fops will be + * cleared. Depending on complexity of ->release, consequences vary. + * + * We can't wait for mercy when close will be done for real, it's + * deadlockable: rmmod foo release + * by hand in remove_proc_entry(). For this, save opener's credentials + * for later. + */ + pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) + return -ENOMEM; + + if (!use_pde(pde)) { + kfree(pdeo); + return -ENOENT; + } + open = pde->proc_fops->open; + release = pde->proc_fops->release; + + if (open) + rv = open(inode, file); + + if (rv == 0 && release) { + /* To know what to release. */ + pdeo->file = file; + /* Strictly for "too late" ->release in proc_reg_release(). */ + spin_lock(&pde->pde_unload_lock); + list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); + } else + kfree(pdeo); + + unuse_pde(pde); + return rv; +} + +static int proc_reg_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct pde_opener *pdeo; + spin_lock(&pde->pde_unload_lock); + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->file == file) { + close_pdeo(pde, pdeo); + break; + } + } + spin_unlock(&pde->pde_unload_lock); + return 0; +} + +static const struct file_operations proc_reg_file_ops = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = proc_reg_compat_ioctl, +#endif + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +#ifdef CONFIG_COMPAT +static const struct file_operations proc_reg_file_ops_no_compat = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; +#endif + +static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct proc_dir_entry *pde = PDE(d_inode(dentry)); + if (unlikely(!use_pde(pde))) + return ERR_PTR(-EINVAL); + nd_set_link(nd, pde->data); + return pde; +} + +static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + unuse_pde(p); +} + +const struct inode_operations proc_link_inode_operations = { + .readlink = generic_readlink, + .follow_link = proc_follow_link, + .put_link = proc_put_link, +}; + +struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) +{ + struct inode *inode = new_inode_pseudo(sb); + + if (inode) { + inode->i_ino = de->low_ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + PROC_I(inode)->pde = de; + + if (is_empty_pde(de)) { + make_empty_dir_inode(inode); + return inode; + } + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + set_nlink(inode, de->nlink); + WARN_ON(!de->proc_iops); + inode->i_op = de->proc_iops; + if (de->proc_fops) { + if (S_ISREG(inode->i_mode)) { +#ifdef CONFIG_COMPAT + if (!de->proc_fops->compat_ioctl) + inode->i_fop = + &proc_reg_file_ops_no_compat; + else +#endif + inode->i_fop = &proc_reg_file_ops; + } else { + inode->i_fop = de->proc_fops; + } + } + } else + pde_put(de); + return inode; +} + +int proc_fill_super(struct super_block *s) +{ + struct inode *root_inode; + int ret; + + s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; + } + + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); +} diff --git a/kernel/fs/proc/internal.h b/kernel/fs/proc/internal.h new file mode 100644 index 000000000..aa2781095 --- /dev/null +++ b/kernel/fs/proc/internal.h @@ -0,0 +1,305 @@ +/* Internal procfs definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +struct ctl_table_header; +struct mempolicy; + +/* + * This is not completely implemented yet. The idea is to + * create an in-memory tree (like the actual /proc filesystem + * tree) of these proc_dir_entries, so that we can dynamically + * add new files to /proc. + * + * parent/subdir are used for the directory structure (every /proc file has a + * parent, but "subdir" is empty for all non-directory entries). + * subdir_node is used to build the rb tree "subdir" of the parent. + */ +struct proc_dir_entry { + unsigned int low_ino; + umode_t mode; + nlink_t nlink; + kuid_t uid; + kgid_t gid; + loff_t size; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + struct proc_dir_entry *parent; + struct rb_root subdir; + struct rb_node subdir_node; + void *data; + atomic_t count; /* use count */ + atomic_t in_use; /* number of callers into module in progress; */ + /* negative -> it's going away RSN */ + struct completion *pde_unload_completion; + struct list_head pde_openers; /* who did ->open, but not ->release */ + spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + u8 namelen; + char name[]; +}; + +union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); +}; + +struct proc_inode { + struct pid *pid; + int fd; + union proc_op op; + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; + const struct proc_ns_operations *ns_ops; + struct inode vfs_inode; +}; + +/* + * General functions + */ +static inline struct proc_inode *PROC_I(const struct inode *inode) +{ + return container_of(inode, struct proc_inode, vfs_inode); +} + +static inline struct proc_dir_entry *PDE(const struct inode *inode) +{ + return PROC_I(inode)->pde; +} + +static inline void *__PDE_DATA(const struct inode *inode) +{ + return PDE(inode)->data; +} + +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) +{ + return get_pid_task(proc_pid(inode), PIDTYPE_PID); +} + +static inline int task_dumpable(struct task_struct *task) +{ + int dumpable = 0; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) + dumpable = get_dumpable(mm); + task_unlock(task); + if (dumpable == SUID_DUMP_USER) + return 1; + return 0; +} + +static inline unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + +/* + * Offset of the first process in the /proc root directory.. + */ +#define FIRST_PROCESS_ENTRY 256 + +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 13 + +/* + * array.c + */ +extern const struct file_operations proc_tid_children_operations; + +extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); + +/* + * base.c + */ +extern const struct dentry_operations pid_dentry_operations; +extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int proc_setattr(struct dentry *, struct iattr *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); +extern int pid_revalidate(struct dentry *, unsigned int); +extern int pid_delete_dentry(const struct dentry *); +extern int proc_pid_readdir(struct file *, struct dir_context *); +extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +extern loff_t mem_lseek(struct file *, loff_t, int); + +/* Lookups */ +typedef int instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, + instantiate_t, struct task_struct *, const void *); + +/* + * generic.c + */ +extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, + struct dentry *); +extern int proc_readdir(struct file *, struct dir_context *); +extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); + +static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +{ + atomic_inc(&pde->count); + return pde; +} +extern void pde_put(struct proc_dir_entry *); + +static inline bool is_empty_pde(const struct proc_dir_entry *pde) +{ + return S_ISDIR(pde->mode) && !pde->proc_iops; +} +struct proc_dir_entry *proc_create_mount_point(const char *name); + +/* + * inode.c + */ +struct pde_opener { + struct file *file; + struct list_head lh; + int closing; + struct completion *c; +}; +extern const struct inode_operations proc_link_inode_operations; + +extern const struct inode_operations proc_pid_link_inode_operations; + +extern void proc_init_inodecache(void); +extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +extern int proc_fill_super(struct super_block *); +extern void proc_entry_rundown(struct proc_dir_entry *); + +/* + * proc_namespaces.c + */ +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + +/* + * proc_net.c + */ +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +/* + * proc_self.c + */ +extern int proc_setup_self(struct super_block *); + +/* + * proc_thread_self.c + */ +extern int proc_setup_thread_self(struct super_block *); +extern void proc_thread_self_init(void); + +/* + * proc_sysctl.c + */ +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *); +#else +static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } +#endif + +/* + * proc_tty.c + */ +#ifdef CONFIG_TTY +extern void proc_tty_init(void); +#else +static inline void proc_tty_init(void) {} +#endif + +/* + * root.c + */ +extern struct proc_dir_entry proc_root; + +extern void proc_self_init(void); +extern int proc_remount(struct super_block *, int *, char *); + +/* + * task_[no]mmu.c + */ +struct proc_maps_private { + struct inode *inode; + struct task_struct *task; + struct mm_struct *mm; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif +}; + +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_tid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_tid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_tid_smaps_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; + +extern unsigned long task_vsize(struct mm_struct *); +extern unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +extern void task_mem(struct seq_file *, struct mm_struct *); diff --git a/kernel/fs/proc/interrupts.c b/kernel/fs/proc/interrupts.c new file mode 100644 index 000000000..a352d5703 --- /dev/null +++ b/kernel/fs/proc/interrupts.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +/* + * /proc/interrupts + */ +static void *int_seq_start(struct seq_file *f, loff_t *pos) +{ + return (*pos <= nr_irqs) ? pos : NULL; +} + +static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos > nr_irqs) + return NULL; + return pos; +} + +static void int_seq_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations int_seq_ops = { + .start = int_seq_start, + .next = int_seq_next, + .stop = int_seq_stop, + .show = show_interrupts +}; + +static int interrupts_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &int_seq_ops); +} + +static const struct file_operations proc_interrupts_operations = { + .open = interrupts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_interrupts_init(void) +{ + proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + return 0; +} +fs_initcall(proc_interrupts_init); diff --git a/kernel/fs/proc/kcore.c b/kernel/fs/proc/kcore.c new file mode 100644 index 000000000..91a4e6426 --- /dev/null +++ b/kernel/fs/proc/kcore.c @@ -0,0 +1,644 @@ +/* + * fs/proc/kcore.c kernel ELF core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge + * ELF version written by David Howells + * Modified and incorporated into 2.3.x by Tigran Aivazian + * Support to dump vmalloc'd areas (ELF only), Tigran Aivazian + * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define CORE_STR "CORE" + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +static struct proc_dir_entry *proc_root_kcore; + + +#ifndef kc_vaddr_to_offset +#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET) +#endif +#ifndef kc_offset_to_vaddr +#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) +#endif + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static LIST_HEAD(kclist_head); +static DEFINE_RWLOCK(kclist_lock); +static int kcore_need_update = 1; + +void +kclist_add(struct kcore_list *new, void *addr, size_t size, int type) +{ + new->addr = (unsigned long)addr; + new->size = size; + new->type = type; + + write_lock(&kclist_lock); + list_add_tail(&new->list, &kclist_head); + write_unlock(&kclist_lock); +} + +static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) +{ + size_t try, size; + struct kcore_list *m; + + *nphdr = 1; /* PT_NOTE */ + size = 0; + + list_for_each_entry(m, &kclist_head, list) { + try = kc_vaddr_to_offset((size_t)m->addr + m->size); + if (try > size) + size = try; + *nphdr = *nphdr + 1; + } + *elf_buflen = sizeof(struct elfhdr) + + (*nphdr + 2)*sizeof(struct elf_phdr) + + 3 * ((sizeof(struct elf_note)) + + roundup(sizeof(CORE_STR), 4)) + + roundup(sizeof(struct elf_prstatus), 4) + + roundup(sizeof(struct elf_prpsinfo), 4) + + roundup(sizeof(struct task_struct), 4); + *elf_buflen = PAGE_ALIGN(*elf_buflen); + return size + *elf_buflen; +} + +static void free_kclist_ents(struct list_head *head) +{ + struct kcore_list *tmp, *pos; + + list_for_each_entry_safe(pos, tmp, head, list) { + list_del(&pos->list); + kfree(pos); + } +} +/* + * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list. + */ +static void __kcore_update_ram(struct list_head *list) +{ + int nphdr; + size_t size; + struct kcore_list *tmp, *pos; + LIST_HEAD(garbage); + + write_lock(&kclist_lock); + if (kcore_need_update) { + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM + || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(list, &kclist_head); + } else + list_splice(list, &garbage); + kcore_need_update = 0; + proc_root_kcore->size = get_kcore_size(&nphdr, &size); + write_unlock(&kclist_lock); + + free_kclist_ents(&garbage); +} + + +#ifdef CONFIG_HIGHMEM +/* + * If no highmem, we can assume [0...max_low_pfn) continuous range of memory + * because memory hole is not as big as !HIGHMEM case. + * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) + */ +static int kcore_update_ram(void) +{ + LIST_HEAD(head); + struct kcore_list *ent; + int ret = 0; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va(0); + ent->size = max_low_pfn << PAGE_SHIFT; + ent->type = KCORE_RAM; + list_add(&ent->list, &head); + __kcore_update_ram(&head); + return ret; +} + +#else /* !CONFIG_HIGHMEM */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* calculate vmemmap's address from given system ram pfn and register it */ +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; + unsigned long nr_pages = ent->size >> PAGE_SHIFT; + unsigned long start, end; + struct kcore_list *vmm, *tmp; + + + start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; + end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; + end = PAGE_ALIGN(end); + /* overlap check (because we have to align page */ + list_for_each_entry(tmp, head, list) { + if (tmp->type != KCORE_VMEMMAP) + continue; + if (start < tmp->addr + tmp->size) + if (end > tmp->addr) + end = tmp->addr; + } + if (start < end) { + vmm = kmalloc(sizeof(*vmm), GFP_KERNEL); + if (!vmm) + return 0; + vmm->addr = start; + vmm->size = end - start; + vmm->type = KCORE_VMEMMAP; + list_add_tail(&vmm->list, head); + } + return 1; + +} +#else +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + return 1; +} + +#endif + +static int +kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + struct list_head *head = (struct list_head *)arg; + struct kcore_list *ent; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->size = nr_pages << PAGE_SHIFT; + + /* Sanity check: Can happen in 32bit arch...maybe */ + if (ent->addr < (unsigned long) __va(0)) + goto free_out; + + /* cut not-mapped area. ....from ppc-32 code. */ + if (ULONG_MAX - ent->addr < ent->size) + ent->size = ULONG_MAX - ent->addr; + + /* cut when vmalloc() area is higher than direct-map area */ + if (VMALLOC_START > (unsigned long)__va(0)) { + if (ent->addr > VMALLOC_START) + goto free_out; + if (VMALLOC_START - ent->addr < ent->size) + ent->size = VMALLOC_START - ent->addr; + } + + ent->type = KCORE_RAM; + list_add_tail(&ent->list, head); + + if (!get_sparsemem_vmemmap_info(ent, head)) { + list_del(&ent->list); + goto free_out; + } + + return 0; +free_out: + kfree(ent); + return 1; +} + +static int kcore_update_ram(void) +{ + int nid, ret; + unsigned long end_pfn; + LIST_HEAD(head); + + /* Not inialized....update now */ + /* find out "max pfn" */ + end_pfn = 0; + for_each_node_state(nid, N_MEMORY) { + unsigned long node_end; + node_end = node_end_pfn(nid); + if (end_pfn < node_end) + end_pfn = node_end; + } + /* scan 0 to max_pfn */ + ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private); + if (ret) { + free_kclist_ents(&head); + return -ENOMEM; + } + __kcore_update_ram(&head); + return ret; +} +#endif /* CONFIG_HIGHMEM */ + +/*****************************************************************************/ +/* + * determine size of ELF note + */ +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup((strlen(en->name) + 1), 4); + sz += roundup(en->datasz, 4); + + return sz; +} /* end notesize() */ + +/*****************************************************************************/ +/* + * store a note in the header buffer + */ +static char *storenote(struct memelfnote *men, char *bufp) +{ + struct elf_note en; + +#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0) + + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en)); + DUMP_WRITE(men->name, en.n_namesz); + + /* XXX - cast from long long to long to avoid need for libgcc.a */ + bufp = (char*) roundup((unsigned long)bufp,4); + DUMP_WRITE(men->data, men->datasz); + bufp = (char*) roundup((unsigned long)bufp,4); + +#undef DUMP_WRITE + + return bufp; +} /* end storenote() */ + +/* + * store an ELF coredump header in the supplied buffer + * nphdr is the number of elf_phdr to insert + */ +static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) +{ + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */ + struct elf_phdr *nhdr, *phdr; + struct elfhdr *elf; + struct memelfnote notes[3]; + off_t offset = 0; + struct kcore_list *m; + + /* setup ELF header */ + elf = (struct elfhdr *) bufp; + bufp += sizeof(struct elfhdr); + offset += sizeof(struct elfhdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION]= EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize= sizeof(struct elf_phdr); + elf->e_phnum = nphdr; + elf->e_shentsize= 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + /* setup ELF PT_NOTE program header */ + nhdr = (struct elf_phdr *) bufp; + bufp += sizeof(struct elf_phdr); + offset += sizeof(struct elf_phdr); + nhdr->p_type = PT_NOTE; + nhdr->p_offset = 0; + nhdr->p_vaddr = 0; + nhdr->p_paddr = 0; + nhdr->p_filesz = 0; + nhdr->p_memsz = 0; + nhdr->p_flags = 0; + nhdr->p_align = 0; + + /* setup ELF PT_LOAD program header for every area */ + list_for_each_entry(m, &kclist_head, list) { + phdr = (struct elf_phdr *) bufp; + bufp += sizeof(struct elf_phdr); + offset += sizeof(struct elf_phdr); + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff; + phdr->p_vaddr = (size_t)m->addr; + phdr->p_paddr = 0; + phdr->p_filesz = phdr->p_memsz = m->size; + phdr->p_align = PAGE_SIZE; + } + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + nhdr->p_offset = offset; + + /* set up the process status */ + notes[0].name = CORE_STR; + notes[0].type = NT_PRSTATUS; + notes[0].datasz = sizeof(struct elf_prstatus); + notes[0].data = &prstatus; + + memset(&prstatus, 0, sizeof(struct elf_prstatus)); + + nhdr->p_filesz = notesize(¬es[0]); + bufp = storenote(¬es[0], bufp); + + /* set up the process info */ + notes[1].name = CORE_STR; + notes[1].type = NT_PRPSINFO; + notes[1].datasz = sizeof(struct elf_prpsinfo); + notes[1].data = &prpsinfo; + + memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo)); + prpsinfo.pr_state = 0; + prpsinfo.pr_sname = 'R'; + prpsinfo.pr_zomb = 0; + + strcpy(prpsinfo.pr_fname, "vmlinux"); + strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); + + nhdr->p_filesz += notesize(¬es[1]); + bufp = storenote(¬es[1], bufp); + + /* set up the task structure */ + notes[2].name = CORE_STR; + notes[2].type = NT_TASKSTRUCT; + notes[2].datasz = sizeof(struct task_struct); + notes[2].data = current; + + nhdr->p_filesz += notesize(¬es[2]); + bufp = storenote(¬es[2], bufp); + +} /* end elf_kcore_store_hdr() */ + +/*****************************************************************************/ +/* + * read from the ELF header and then kernel memory + */ +static ssize_t +read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) +{ + ssize_t acc = 0; + size_t size, tsz; + size_t elf_buflen; + int nphdr; + unsigned long start; + + read_lock(&kclist_lock); + size = get_kcore_size(&nphdr, &elf_buflen); + + if (buflen == 0 || *fpos >= size) { + read_unlock(&kclist_lock); + return 0; + } + + /* trim buflen to not go beyond EOF */ + if (buflen > size - *fpos) + buflen = size - *fpos; + + /* construct an ELF core header if we'll need some of it */ + if (*fpos < elf_buflen) { + char * elf_buf; + + tsz = elf_buflen - *fpos; + if (buflen < tsz) + tsz = buflen; + elf_buf = kzalloc(elf_buflen, GFP_ATOMIC); + if (!elf_buf) { + read_unlock(&kclist_lock); + return -ENOMEM; + } + elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); + read_unlock(&kclist_lock); + if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { + kfree(elf_buf); + return -EFAULT; + } + kfree(elf_buf); + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } else + read_unlock(&kclist_lock); + + /* + * Check to see if our file offset matches with any of + * the addresses in the elf_phdr on our list. + */ + start = kc_offset_to_vaddr(*fpos - elf_buflen); + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + + while (buflen) { + struct kcore_list *m; + + read_lock(&kclist_lock); + list_for_each_entry(m, &kclist_head, list) { + if (start >= m->addr && start < (m->addr+m->size)) + break; + } + read_unlock(&kclist_lock); + + if (&m->list == &kclist_head) { + if (clear_user(buffer, tsz)) + return -EFAULT; + } else if (is_vmalloc_or_module_addr((void *)start)) { + char * elf_buf; + + elf_buf = kzalloc(tsz, GFP_KERNEL); + if (!elf_buf) + return -ENOMEM; + vread(elf_buf, (char *)start, tsz); + /* we have to zero-fill user buffer even if no read */ + if (copy_to_user(buffer, elf_buf, tsz)) { + kfree(elf_buf); + return -EFAULT; + } + kfree(elf_buf); + } else { + if (kern_addr_valid(start)) { + unsigned long n; + + n = copy_to_user(buffer, (char *)start, tsz); + /* + * We cannot distinguish between fault on source + * and fault on destination. When this happens + * we clear too and hope it will trigger the + * EFAULT again. + */ + if (n) { + if (clear_user(buffer + tsz - n, + n)) + return -EFAULT; + } + } else { + if (clear_user(buffer, tsz)) + return -EFAULT; + } + } + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + start += tsz; + tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); + } + + return acc; +} + + +static int open_kcore(struct inode *inode, struct file *filp) +{ + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (kcore_need_update) + kcore_update_ram(); + if (i_size_read(inode) != proc_root_kcore->size) { + mutex_lock(&inode->i_mutex); + i_size_write(inode, proc_root_kcore->size); + mutex_unlock(&inode->i_mutex); + } + return 0; +} + + +static const struct file_operations proc_kcore_operations = { + .read = read_kcore, + .open = open_kcore, + .llseek = default_llseek, +}; + +/* just remember that we have to update kcore */ +static int __meminit kcore_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + write_lock(&kclist_lock); + kcore_need_update = 1; + write_unlock(&kclist_lock); + } + return NOTIFY_OK; +} + +static struct notifier_block kcore_callback_nb __meminitdata = { + .notifier_call = kcore_callback, + .priority = 0, +}; + +static struct kcore_list kcore_vmalloc; + +#ifdef CONFIG_ARCH_PROC_KCORE_TEXT +static struct kcore_list kcore_text; +/* + * If defined, special segment is used for mapping kernel text instead of + * direct-map area. We need to create special TEXT section. + */ +static void __init proc_kcore_text_init(void) +{ + kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT); +} +#else +static void __init proc_kcore_text_init(void) +{ +} +#endif + +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) +/* + * MODULES_VADDR has no intersection with VMALLOC_ADDR. + */ +struct kcore_list kcore_modules; +static void __init add_modules_range(void) +{ + if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) { + kclist_add(&kcore_modules, (void *)MODULES_VADDR, + MODULES_END - MODULES_VADDR, KCORE_VMALLOC); + } +} +#else +static void __init add_modules_range(void) +{ +} +#endif + +static int __init proc_kcore_init(void) +{ + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, + &proc_kcore_operations); + if (!proc_root_kcore) { + pr_err("couldn't create /proc/kcore\n"); + return 0; /* Always returns 0. */ + } + /* Store text area if it's special */ + proc_kcore_text_init(); + /* Store vmalloc area */ + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END - VMALLOC_START, KCORE_VMALLOC); + add_modules_range(); + /* Store direct-map area from physical memory map */ + kcore_update_ram(); + register_hotmemory_notifier(&kcore_callback_nb); + + return 0; +} +fs_initcall(proc_kcore_init); diff --git a/kernel/fs/proc/kmsg.c b/kernel/fs/proc/kmsg.c new file mode 100644 index 000000000..05f8dcdb0 --- /dev/null +++ b/kernel/fs/proc/kmsg.c @@ -0,0 +1,64 @@ +/* + * linux/fs/proc/kmsg.c + * + * Copyright (C) 1992 by Linus Torvalds + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern wait_queue_head_t log_wait; + +static int kmsg_open(struct inode * inode, struct file * file) +{ + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); +} + +static int kmsg_release(struct inode * inode, struct file * file) +{ + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); + return 0; +} + +static ssize_t kmsg_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + if ((file->f_flags & O_NONBLOCK) && + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) + return -EAGAIN; + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); +} + +static unsigned int kmsg_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &log_wait, wait); + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) + return POLLIN | POLLRDNORM; + return 0; +} + + +static const struct file_operations proc_kmsg_operations = { + .read = kmsg_read, + .poll = kmsg_poll, + .open = kmsg_open, + .release = kmsg_release, + .llseek = generic_file_llseek, +}; + +static int __init proc_kmsg_init(void) +{ + proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); + return 0; +} +fs_initcall(proc_kmsg_init); diff --git a/kernel/fs/proc/loadavg.c b/kernel/fs/proc/loadavg.c new file mode 100644 index 000000000..aec66e6c2 --- /dev/null +++ b/kernel/fs/proc/loadavg.c @@ -0,0 +1,45 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +static int loadavg_proc_show(struct seq_file *m, void *v) +{ + unsigned long avnrun[3]; + + get_avenrun(avnrun, FIXED_1/200, 0); + + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), + nr_running(), nr_threads, + task_active_pid_ns(current)->last_pid); + return 0; +} + +static int loadavg_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, loadavg_proc_show, NULL); +} + +static const struct file_operations loadavg_proc_fops = { + .open = loadavg_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_loadavg_init(void) +{ + proc_create("loadavg", 0, NULL, &loadavg_proc_fops); + return 0; +} +fs_initcall(proc_loadavg_init); diff --git a/kernel/fs/proc/meminfo.c b/kernel/fs/proc/meminfo.c new file mode 100644 index 000000000..d3ebf2e61 --- /dev/null +++ b/kernel/fs/proc/meminfo.c @@ -0,0 +1,234 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_CMA +#include +#endif +#include +#include +#include "internal.h" + +void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) +{ +} + +static int meminfo_proc_show(struct seq_file *m, void *v) +{ + struct sysinfo i; + unsigned long committed; + struct vmalloc_info vmi; + long cached; + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; + int lru; + +/* + * display in kilobytes. + */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + si_meminfo(&i); + si_swapinfo(&i); + committed = percpu_counter_read_positive(&vm_committed_as); + + cached = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages() - i.bufferram; + if (cached < 0) + cached = 0; + + get_vmalloc_info(&vmi); + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + * + * Free memory cannot be taken below the low watermark, before the + * system starts swapping. + */ + available = i.freeram - wmark_low; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + + /* + * Tagged format, for easy grepping and expansion. + */ + seq_printf(m, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" + "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" +#ifdef CONFIG_HIGHMEM + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" +#endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" +#endif + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Shmem: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "KernelStack: %8lu kB\n" + "PageTables: %8lu kB\n" +#ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" +#endif + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n" +#ifdef CONFIG_MEMORY_FAILURE + "HardwareCorrupted: %5lu kB\n" +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif +#ifdef CONFIG_CMA + "CmaTotal: %8lu kB\n" + "CmaFree: %8lu kB\n" +#endif + , + K(i.totalram), + K(i.freeram), + K(available), + K(i.bufferram), + K(cached), + K(total_swapcache_pages()), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), + K(pages[LRU_UNEVICTABLE]), + K(global_page_state(NR_MLOCK)), +#ifdef CONFIG_HIGHMEM + K(i.totalhigh), + K(i.freehigh), + K(i.totalram-i.totalhigh), + K(i.freeram-i.freehigh), +#endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), +#endif + K(i.totalswap), + K(i.freeswap), + K(global_page_state(NR_FILE_DIRTY)), + K(global_page_state(NR_WRITEBACK)), + K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_FILE_MAPPED)), + K(i.sharedram), + K(global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)), + K(global_page_state(NR_SLAB_RECLAIMABLE)), + K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), +#endif + K(global_page_state(NR_UNSTABLE_NFS)), + K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), + K(vm_commit_limit()), + K(committed), + (unsigned long)VMALLOC_TOTAL >> 10, + vmi.used >> 10, + vmi.largest_chunk >> 10 +#ifdef CONFIG_MEMORY_FAILURE + , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif +#ifdef CONFIG_CMA + , K(totalcma_pages) + , K(global_page_state(NR_FREE_CMA_PAGES)) +#endif + ); + + hugetlb_report_meminfo(m); + + arch_report_meminfo(m); + + return 0; +#undef K +} + +static int meminfo_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, meminfo_proc_show, NULL); +} + +static const struct file_operations meminfo_proc_fops = { + .open = meminfo_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_meminfo_init(void) +{ + proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + return 0; +} +fs_initcall(proc_meminfo_init); diff --git a/kernel/fs/proc/namespaces.c b/kernel/fs/proc/namespaces.c new file mode 100644 index 000000000..e512642db --- /dev/null +++ b/kernel/fs/proc/namespaces.c @@ -0,0 +1,172 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS + &netns_operations, +#endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif +#ifdef CONFIG_IPC_NS + &ipcns_operations, +#endif +#ifdef CONFIG_PID_NS + &pidns_operations, +#endif +#ifdef CONFIG_USER_NS + &userns_operations, +#endif + &mntns_operations, +}; + +static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = d_inode(dentry); + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + struct task_struct *task; + struct path ns_path; + void *error = ERR_PTR(-EACCES); + + task = get_proc_task(inode); + if (!task) + return error; + + if (ptrace_may_access(task, PTRACE_MODE_READ)) { + error = ns_get_path(&ns_path, task, ns_ops); + if (!error) + nd_jump_link(nd, &ns_path); + } + put_task_struct(task); + return error; +} + +static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = d_inode(dentry); + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + struct task_struct *task; + char name[50]; + int res = -EACCES; + + task = get_proc_task(inode); + if (!task) + return res; + + if (ptrace_may_access(task, PTRACE_MODE_READ)) { + res = ns_get_name(name, sizeof(name), task, ns_ops); + if (res >= 0) + res = readlink_copy(buffer, buflen, name); + } + put_task_struct(task); + return res; +} + +static const struct inode_operations proc_ns_link_inode_operations = { + .readlink = proc_ns_readlink, + .follow_link = proc_ns_follow_link, + .setattr = proc_setattr, +}; + +static int proc_ns_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct proc_ns_operations *ns_ops = ptr; + struct inode *inode; + struct proc_inode *ei; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_op = &proc_ns_link_inode_operations; + ei->ns_ops = ns_ops; + + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, 0)) + return 0; +out: + return -ENOENT; +} + +static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + const struct proc_ns_operations **entry, **last; + + if (!task) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + goto out; + if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) + goto out; + entry = ns_entries + (ctx->pos - 2); + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + const struct proc_ns_operations *ops = *entry; + if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops)) + break; + ctx->pos++; + entry++; + } +out: + put_task_struct(task); + return 0; +} + +const struct file_operations proc_ns_dir_operations = { + .read = generic_read_dir, + .iterate = proc_ns_dir_readdir, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + int error; + struct task_struct *task = get_proc_task(dir); + const struct proc_ns_operations **entry, **last; + unsigned int len = dentry->d_name.len; + + error = -ENOENT; + + if (!task) + goto out_no_task; + + last = &ns_entries[ARRAY_SIZE(ns_entries)]; + for (entry = ns_entries; entry < last; entry++) { + if (strlen((*entry)->name) != len) + continue; + if (!memcmp(dentry->d_name.name, (*entry)->name, len)) + break; + } + if (entry == last) + goto out; + + error = proc_ns_instantiate(dir, dentry, task, *entry); +out: + put_task_struct(task); +out_no_task: + return ERR_PTR(error); +} + +const struct inode_operations proc_ns_dir_inode_operations = { + .lookup = proc_ns_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; diff --git a/kernel/fs/proc/nommu.c b/kernel/fs/proc/nommu.c new file mode 100644 index 000000000..d4a35746c --- /dev/null +++ b/kernel/fs/proc/nommu.c @@ -0,0 +1,134 @@ +/* nommu.c: mmu-less memory info files + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * display a single region to a sequenced file + */ +static int nommu_region_show(struct seq_file *m, struct vm_region *region) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags; + + flags = region->vm_flags; + file = region->vm_file; + + if (file) { + struct inode *inode = file_inode(region->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", + region->vm_start, + region->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino); + + if (file) { + seq_pad(m, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display a list of all the REGIONs the kernel knows about + * - nommu kernels have a single flat list + */ +static int nommu_region_list_show(struct seq_file *m, void *_p) +{ + struct rb_node *p = _p; + + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); +} + +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) +{ + struct rb_node *p; + loff_t pos = *_pos; + + down_read(&nommu_region_sem); + + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; +} + +static void nommu_region_list_stop(struct seq_file *m, void *v) +{ + up_read(&nommu_region_sem); +} + +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return rb_next((struct rb_node *) v); +} + +static const struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show +}; + +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &proc_nommu_region_list_seqop); +} + +static const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_nommu_init(void) +{ + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); + return 0; +} + +fs_initcall(proc_nommu_init); diff --git a/kernel/fs/proc/page.c b/kernel/fs/proc/page.c new file mode 100644 index 000000000..7eee2d8b9 --- /dev/null +++ b/kernel/fs/proc/page.c @@ -0,0 +1,234 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define KPMSIZE sizeof(u64) +#define KPMMASK (KPMSIZE - 1) + +/* /proc/kpagecount - an array exposing page counts + * + * Each entry is a u64 representing the corresponding + * physical page count. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 pcount; + + pfn = src / KPMSIZE; + count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + if (!ppage || PageSlab(ppage)) + pcount = 0; + else + pcount = page_mapcount(ppage); + + if (put_user(pcount, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecount_operations = { + .llseek = mem_lseek, + .read = kpagecount_read, +}; + +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ + +static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) +{ + return ((kflags >> kbit) & 1) << ubit; +} + +u64 stable_page_flags(struct page *page) +{ + u64 k; + u64 u; + + /* + * pseudo flag: KPF_NOPAGE + * it differentiates a memory hole from a page with no flags + */ + if (!page) + return 1 << KPF_NOPAGE; + + k = page->flags; + u = 0; + + /* + * pseudo flags for the well known (anonymous) memory mapped pages + * + * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * simple test in page_mapped() is not enough. + */ + if (!PageSlab(page) && page_mapped(page)) + u |= 1 << KPF_MMAP; + if (PageAnon(page)) + u |= 1 << KPF_ANON; + if (PageKsm(page)) + u |= 1 << KPF_KSM; + + /* + * compound pages: export both head/tail info + * they together define a compound page's start/end pos and order + */ + if (PageHead(page)) + u |= 1 << KPF_COMPOUND_HEAD; + if (PageTail(page)) + u |= 1 << KPF_COMPOUND_TAIL; + if (PageHuge(page)) + u |= 1 << KPF_HUGE; + /* + * PageTransCompound can be true for non-huge compound pages (slab + * pages or pages allocated by drivers with __GFP_COMP) because it + * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * to make sure a given page is a thp, not a non-huge compound page. + */ + else if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + if (PageLRU(head) || PageAnon(head)) + u |= 1 << KPF_THP; + else if (is_huge_zero_page(head)) { + u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } + } else if (is_zero_pfn(page_to_pfn(page))) + u |= 1 << KPF_ZERO_PAGE; + + + /* + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. + */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + if (PageBalloon(page)) + u |= 1 << KPF_BALLOON; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + + u |= kpf_copy_bit(k, KPF_ERROR, PG_error); + u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); + u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); + u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); + + u |= kpf_copy_bit(k, KPF_LRU, PG_lru); + u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced); + u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); + u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); + + u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); + + u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); + u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); + +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); +#endif + + u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); + u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); + u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); + u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); + u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); + + return u; +}; + +static ssize_t kpageflags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (put_user(stable_page_flags(ppage), out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpageflags_operations = { + .llseek = mem_lseek, + .read = kpageflags_read, +}; + +static int __init proc_page_init(void) +{ + proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); + proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + return 0; +} +fs_initcall(proc_page_init); diff --git a/kernel/fs/proc/proc_net.c b/kernel/fs/proc/proc_net.c new file mode 100644 index 000000000..350984a19 --- /dev/null +++ b/kernel/fs/proc/proc_net.c @@ -0,0 +1,233 @@ +/* + * linux/fs/proc/net.c + * + * Copyright (C) 2007 + * + * Author: Eric Biederman + * + * proc net directory handling functions + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static inline struct net *PDE_NET(struct proc_dir_entry *pde) +{ + return pde->parent->data; +} + +static struct net *get_proc_net(const struct inode *inode) +{ + return maybe_get_net(PDE_NET(PDE(inode))); +} + +int seq_open_net(struct inode *ino, struct file *f, + const struct seq_operations *ops, int size) +{ + struct net *net; + struct seq_net_private *p; + + BUG_ON(size < sizeof(*p)); + + net = get_proc_net(ino); + if (net == NULL) + return -ENXIO; + + p = __seq_open_private(f, ops, size); + if (p == NULL) { + put_net(net); + return -ENOMEM; + } +#ifdef CONFIG_NET_NS + p->net = net; +#endif + return 0; +} +EXPORT_SYMBOL_GPL(seq_open_net); + +int single_open_net(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} +EXPORT_SYMBOL_GPL(single_open_net); + +int seq_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq; + + seq = f->private_data; + + put_net(seq_file_net(seq)); + seq_release_private(ino, f); + return 0; +} +EXPORT_SYMBOL_GPL(seq_release_net); + +int single_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); +} +EXPORT_SYMBOL_GPL(single_release_net); + +static struct net *get_proc_task_net(struct inode *dir) +{ + struct task_struct *task; + struct nsproxy *ns; + struct net *net = NULL; + + rcu_read_lock(); + task = pid_task(proc_pid(dir), PIDTYPE_PID); + if (task != NULL) { + task_lock(task); + ns = task->nsproxy; + if (ns != NULL) + net = get_net(ns->net_ns); + task_unlock(task); + } + rcu_read_unlock(); + + return net; +} + +static struct dentry *proc_tgid_net_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + struct dentry *de; + struct net *net; + + de = ERR_PTR(-ENOENT); + net = get_proc_task_net(dir); + if (net != NULL) { + de = proc_lookup_de(net->proc_net, dir, dentry); + put_net(net); + } + return de; +} + +static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct net *net; + + net = get_proc_task_net(inode); + + generic_fillattr(inode, stat); + + if (net != NULL) { + stat->nlink = net->proc_net->nlink; + put_net(net); + } + + return 0; +} + +const struct inode_operations proc_net_inode_operations = { + .lookup = proc_tgid_net_lookup, + .getattr = proc_tgid_net_getattr, +}; + +static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) +{ + int ret; + struct net *net; + + ret = -EINVAL; + net = get_proc_task_net(file_inode(file)); + if (net != NULL) { + ret = proc_readdir_de(net->proc_net, file, ctx); + put_net(net); + } + return ret; +} + +const struct file_operations proc_net_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = proc_tgid_net_readdir, +}; + +static __net_init int proc_net_ns_init(struct net *net) +{ + struct proc_dir_entry *netd, *net_statd; + int err; + + err = -ENOMEM; + netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); + if (!netd) + goto out; + + netd->subdir = RB_ROOT; + netd->data = net; + netd->nlink = 2; + netd->namelen = 3; + netd->parent = &proc_root; + memcpy(netd->name, "net", 4); + + err = -EEXIST; + net_statd = proc_net_mkdir(net, "stat", netd); + if (!net_statd) + goto free_net; + + net->proc_net = netd; + net->proc_net_stat = net_statd; + return 0; + +free_net: + kfree(netd); +out: + return err; +} + +static __net_exit void proc_net_ns_exit(struct net *net) +{ + remove_proc_entry("stat", net->proc_net); + kfree(net->proc_net); +} + +static struct pernet_operations __net_initdata proc_net_ns_ops = { + .init = proc_net_ns_init, + .exit = proc_net_ns_exit, +}; + +int __init proc_net_init(void) +{ + proc_symlink("net", NULL, "self/net"); + + return register_pernet_subsys(&proc_net_ns_ops); +} diff --git a/kernel/fs/proc/proc_sysctl.c b/kernel/fs/proc/proc_sysctl.c new file mode 100644 index 000000000..fdda62e61 --- /dev/null +++ b/kernel/fs/proc/proc_sysctl.c @@ -0,0 +1,1619 @@ +/* + * /proc/sys support + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct dentry_operations proc_sys_dentry_operations; +static const struct file_operations proc_sys_file_operations; +static const struct inode_operations proc_sys_inode_operations; +static const struct file_operations proc_sys_dir_file_operations; +static const struct inode_operations proc_sys_dir_operations; + +/* Support for permanently empty directories */ + +struct ctl_table sysctl_mount_point[] = { + { } +}; + +static bool is_empty_dir(struct ctl_table_header *head) +{ + return head->ctl_table[0].child == sysctl_mount_point; +} + +static void set_empty_dir(struct ctl_dir *dir) +{ + dir->header.ctl_table[0].child = sysctl_mount_point; +} + +static void clear_empty_dir(struct ctl_dir *dir) + +{ + dir->header.ctl_table[0].child = NULL; +} + +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + +static struct ctl_table root_table[] = { + { + .procname = "", + .mode = S_IFDIR|S_IRUGO|S_IXUGO, + }, + { } +}; +static struct ctl_table_root sysctl_table_root = { + .default_set.dir.header = { + {{.count = 1, + .nreg = 1, + .ctl_table = root_table }}, + .ctl_table_arg = root_table, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, + }, +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +static void drop_sysctl_table(struct ctl_table_header *header); +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces); +static int insert_links(struct ctl_table_header *head); +static void put_links(struct ctl_table_header *header); + +static void sysctl_print_dir(struct ctl_dir *dir) +{ + if (dir->header.parent) + sysctl_print_dir(dir->header.parent); + pr_cont("%s/", dir->header.ctl_table[0].procname); +} + +static int namecmp(const char *name1, int len1, const char *name2, int len2) +{ + int minlen; + int cmp; + + minlen = len1; + if (minlen > len2) + minlen = len2; + + cmp = memcmp(name1, name2, minlen); + if (cmp == 0) + cmp = len1 - len2; + return cmp; +} + +/* Called under sysctl_lock */ +static struct ctl_table *find_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + struct rb_node *node = dir->root.rb_node; + + while (node) + { + struct ctl_node *ctl_node; + const char *procname; + int cmp; + + ctl_node = rb_entry(node, struct ctl_node, node); + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + procname = entry->procname; + + cmp = namecmp(name, namelen, procname, strlen(procname)); + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else { + *phead = head; + return entry; + } + } + return NULL; +} + +static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + struct rb_node **p = &head->parent->root.rb_node; + struct rb_node *parent = NULL; + const char *name = entry->procname; + int namelen = strlen(name); + + while (*p) { + struct ctl_table_header *parent_head; + struct ctl_table *parent_entry; + struct ctl_node *parent_node; + const char *parent_name; + int cmp; + + parent = *p; + parent_node = rb_entry(parent, struct ctl_node, node); + parent_head = parent_node->header; + parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; + parent_name = parent_entry->procname; + + cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + pr_err("sysctl duplicate entry: "); + sysctl_print_dir(head->parent); + pr_cont("/%s\n", entry->procname); + return -EEXIST; + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, &head->parent->root); + return 0; +} + +static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + + rb_erase(node, &head->parent->root); +} + +static void init_header(struct ctl_table_header *head, + struct ctl_table_root *root, struct ctl_table_set *set, + struct ctl_node *node, struct ctl_table *table) +{ + head->ctl_table = table; + head->ctl_table_arg = table; + head->used = 0; + head->count = 1; + head->nreg = 1; + head->unregistering = NULL; + head->root = root; + head->set = set; + head->parent = NULL; + head->node = node; + if (node) { + struct ctl_table *entry; + for (entry = table; entry->procname; entry++, node++) + node->header = head; + } +} + +static void erase_header(struct ctl_table_header *head) +{ + struct ctl_table *entry; + for (entry = head->ctl_table; entry->procname; entry++) + erase_entry(head, entry); +} + +static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) +{ + struct ctl_table *entry; + int err; + + /* Is this a permanently empty directory? */ + if (is_empty_dir(&dir->header)) + return -EROFS; + + /* Am I creating a permanently empty directory? */ + if (header->ctl_table == sysctl_mount_point) { + if (!RB_EMPTY_ROOT(&dir->root)) + return -EINVAL; + set_empty_dir(dir); + } + + dir->header.nreg++; + header->parent = dir; + err = insert_links(header); + if (err) + goto fail_links; + for (entry = header->ctl_table; entry->procname; entry++) { + err = insert_entry(header, entry); + if (err) + goto fail; + } + return 0; +fail: + erase_header(header); + put_links(header); +fail_links: + if (header->ctl_table == sysctl_mount_point) + clear_empty_dir(dir); + header->parent = NULL; + drop_sysctl_table(&dir->header); + return err; +} + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + erase_header(p); +} + +static void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + BUG_ON(!head); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + +static struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + spin_lock(&sysctl_lock); + entry = find_entry(&head, dir, name, namelen); + if (entry && use_table(head)) + *phead = head; + else + entry = NULL; + spin_unlock(&sysctl_lock); + return entry; +} + +static struct ctl_node *first_usable_entry(struct rb_node *node) +{ + struct ctl_node *ctl_node; + + for (;node; node = rb_next(node)) { + ctl_node = rb_entry(node, struct ctl_node, node); + if (use_table(ctl_node->header)) + return ctl_node; + } + return NULL; +} + +static void first_entry(struct ctl_dir *dir, + struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = NULL; + struct ctl_table *entry = NULL; + struct ctl_node *ctl_node; + + spin_lock(&sysctl_lock); + ctl_node = first_usable_entry(rb_first(&dir->root)); + spin_unlock(&sysctl_lock); + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = *phead; + struct ctl_table *entry = *pentry; + struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; + + spin_lock(&sysctl_lock); + unuse_table(head); + + ctl_node = first_usable_entry(rb_next(&ctl_node->node)); + spin_unlock(&sysctl_lock); + head = NULL; + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +void register_sysctl_root(struct ctl_table_root *root) +{ +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) + mode >>= 6; + else if (in_egroup_p(GLOBAL_ROOT_GID)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) +{ + struct ctl_table_root *root = head->root; + int mode; + + if (root->permissions) + mode = root->permissions(head, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + +static struct inode *proc_sys_make_inode(struct super_block *sb, + struct ctl_table_header *head, struct ctl_table *table) +{ + struct inode *inode; + struct proc_inode *ei; + + inode = new_inode(sb); + if (!inode) + goto out; + + inode->i_ino = get_next_ino(); + + sysctl_head_get(head); + ei = PROC_I(inode); + ei->sysctl = head; + ei->sysctl_entry = table; + + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = table->mode; + if (!S_ISDIR(table->mode)) { + inode->i_mode |= S_IFREG; + inode->i_op = &proc_sys_inode_operations; + inode->i_fop = &proc_sys_file_operations; + } else { + inode->i_mode |= S_IFDIR; + inode->i_op = &proc_sys_dir_operations; + inode->i_fop = &proc_sys_dir_file_operations; + if (is_empty_dir(head)) + make_empty_dir_inode(inode); + } +out: + return inode; +} + +static struct ctl_table_header *grab_header(struct inode *inode) +{ + struct ctl_table_header *head = PROC_I(inode)->sysctl; + if (!head) + head = &sysctl_table_root.default_set.dir.header; + return sysctl_head_grab(head); +} + +static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ctl_table_header *head = grab_header(dir); + struct ctl_table_header *h = NULL; + struct qstr *name = &dentry->d_name; + struct ctl_table *p; + struct inode *inode; + struct dentry *err = ERR_PTR(-ENOENT); + struct ctl_dir *ctl_dir; + int ret; + + if (IS_ERR(head)) + return ERR_CAST(head); + + ctl_dir = container_of(head, struct ctl_dir, header); + + p = lookup_entry(&h, ctl_dir, name->name, name->len); + if (!p) + goto out; + + if (S_ISLNK(p->mode)) { + ret = sysctl_follow_link(&h, &p, current->nsproxy); + err = ERR_PTR(ret); + if (ret) + goto out; + } + + err = ERR_PTR(-ENOMEM); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); + if (!inode) + goto out; + + err = NULL; + d_set_d_op(dentry, &proc_sys_dentry_operations); + d_add(dentry, inode); + +out: + if (h) + sysctl_head_finish(h); + sysctl_head_finish(head); + return err; +} + +static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct inode *inode = file_inode(filp); + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + ssize_t error; + size_t res; + + if (IS_ERR(head)) + return PTR_ERR(head); + + /* + * At this point we know that the sysctl was not unregistered + * and won't be until we finish. + */ + error = -EPERM; + if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) + goto out; + + /* if that can happen at all, it should be -EINVAL, not -EISDIR */ + error = -EINVAL; + if (!table->proc_handler) + goto out; + + /* careful: calling conventions are nasty here */ + res = count; + error = table->proc_handler(table, write, buf, &res, ppos); + if (!error) + error = res; +out: + sysctl_head_finish(head); + + return error; +} + +static ssize_t proc_sys_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); +} + +static ssize_t proc_sys_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); +} + +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return PTR_ERR(head); + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + sysctl_head_finish(head); + + return 0; +} + +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = file_inode(filp); + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + unsigned int ret = DEFAULT_POLLMASK; + unsigned long event; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return POLLERR | POLLHUP; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + event = (unsigned long)filp->private_data; + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + +out: + sysctl_head_finish(head); + + return ret; +} + +static bool proc_sys_fill_cache(struct file *file, + struct dir_context *ctx, + struct ctl_table_header *head, + struct ctl_table *table) +{ + struct dentry *child, *dir = file->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = table->procname; + qname.len = strlen(table->procname); + qname.hash = full_name_hash(qname.name, qname.len); + + child = d_lookup(dir, &qname); + if (!child) { + child = d_alloc(dir, &qname); + if (child) { + inode = proc_sys_make_inode(dir->d_sb, head, table); + if (!inode) { + dput(child); + return false; + } else { + d_set_d_op(child, &proc_sys_dentry_operations); + d_add(child, inode); + } + } else { + return false; + } + } + inode = d_inode(child); + ino = inode->i_ino; + type = inode->i_mode >> 12; + dput(child); + return dir_emit(ctx, qname.name, qname.len, ino, type); +} + +static bool proc_sys_link_fill_cache(struct file *file, + struct dir_context *ctx, + struct ctl_table_header *head, + struct ctl_table *table) +{ + bool ret = true; + head = sysctl_head_grab(head); + + if (S_ISLNK(table->mode)) { + /* It is not an error if we can not follow the link ignore it */ + int err = sysctl_follow_link(&head, &table, current->nsproxy); + if (err) + goto out; + } + + ret = proc_sys_fill_cache(file, ctx, head, table); +out: + sysctl_head_finish(head); + return ret; +} + +static int scan(struct ctl_table_header *head, struct ctl_table *table, + unsigned long *pos, struct file *file, + struct dir_context *ctx) +{ + bool res; + + if ((*pos)++ < ctx->pos) + return true; + + if (unlikely(S_ISLNK(table->mode))) + res = proc_sys_link_fill_cache(file, ctx, head, table); + else + res = proc_sys_fill_cache(file, ctx, head, table); + + if (res) + ctx->pos = *pos; + + return res; +} + +static int proc_sys_readdir(struct file *file, struct dir_context *ctx) +{ + struct ctl_table_header *head = grab_header(file_inode(file)); + struct ctl_table_header *h = NULL; + struct ctl_table *entry; + struct ctl_dir *ctl_dir; + unsigned long pos; + + if (IS_ERR(head)) + return PTR_ERR(head); + + ctl_dir = container_of(head, struct ctl_dir, header); + + if (!dir_emit_dots(file, ctx)) + return 0; + + pos = 2; + + for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { + if (!scan(h, entry, &pos, file, ctx)) { + sysctl_head_finish(h); + break; + } + } + sysctl_head_finish(head); + return 0; +} + +static int proc_sys_permission(struct inode *inode, int mask) +{ + /* + * sysctl entries that are not writeable, + * are _NOT_ writeable, capabilities or not. + */ + struct ctl_table_header *head; + struct ctl_table *table; + int error; + + /* Executable files are not allowed under /proc/sys/ */ + if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) + return -EACCES; + + head = grab_header(inode); + if (IS_ERR(head)) + return PTR_ERR(head); + + table = PROC_I(inode)->sysctl_entry; + if (!table) /* global root - r-xr-xr-x */ + error = mask & MAY_WRITE ? -EACCES : 0; + else /* Use the permissions on the sysctl table entry */ + error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); + + sysctl_head_finish(head); + return error; +} + +static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error; + + if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (IS_ERR(head)) + return PTR_ERR(head); + + generic_fillattr(inode, stat); + if (table) + stat->mode = (stat->mode & S_IFMT) | table->mode; + + sysctl_head_finish(head); + return 0; +} + +static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, + .read = proc_sys_read, + .write = proc_sys_write, + .llseek = default_llseek, +}; + +static const struct file_operations proc_sys_dir_file_operations = { + .read = generic_read_dir, + .iterate = proc_sys_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations proc_sys_inode_operations = { + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static const struct inode_operations proc_sys_dir_operations = { + .lookup = proc_sys_lookup, + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + return !PROC_I(d_inode(dentry))->sysctl->unregistering; +} + +static int proc_sys_delete(const struct dentry *dentry) +{ + return !!PROC_I(d_inode(dentry))->sysctl->unregistering; +} + +static int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + +static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + struct ctl_table_header *head; + struct inode *inode; + + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + /* AV: can it, indeed? */ + inode = d_inode_rcu(dentry); + if (!inode) + return 1; + if (name->len != len) + return 1; + if (memcmp(name->name, str, len)) + return 1; + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); +} + +static const struct dentry_operations proc_sys_dentry_operations = { + .d_revalidate = proc_sys_revalidate, + .d_delete = proc_sys_delete, + .d_compare = proc_sys_compare, +}; + +static struct ctl_dir *find_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + entry = find_entry(&head, dir, name, namelen); + if (!entry) + return ERR_PTR(-ENOENT); + if (!S_ISDIR(entry->mode)) + return ERR_PTR(-ENOTDIR); + return container_of(head, struct ctl_dir, header); +} + +static struct ctl_dir *new_dir(struct ctl_table_set *set, + const char *name, int namelen) +{ + struct ctl_table *table; + struct ctl_dir *new; + struct ctl_node *node; + char *new_name; + + new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + + sizeof(struct ctl_table)*2 + namelen + 1, + GFP_KERNEL); + if (!new) + return NULL; + + node = (struct ctl_node *)(new + 1); + table = (struct ctl_table *)(node + 1); + new_name = (char *)(table + 2); + memcpy(new_name, name, namelen); + new_name[namelen] = '\0'; + table[0].procname = new_name; + table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; + init_header(&new->header, set->dir.header.root, set, node, table); + + return new; +} + +/** + * get_subdir - find or create a subdir with the specified name. + * @dir: Directory to create the subdirectory in + * @name: The name of the subdirectory to find or create + * @namelen: The length of name + * + * Takes a directory with an elevated reference count so we know that + * if we drop the lock the directory will not go away. Upon success + * the reference is moved from @dir to the returned subdirectory. + * Upon error an error code is returned and the reference on @dir is + * simply dropped. + */ +static struct ctl_dir *get_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_set *set = dir->header.set; + struct ctl_dir *subdir, *new = NULL; + int err; + + spin_lock(&sysctl_lock); + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + spin_unlock(&sysctl_lock); + new = new_dir(set, name, namelen); + spin_lock(&sysctl_lock); + subdir = ERR_PTR(-ENOMEM); + if (!new) + goto failed; + + /* Was the subdir added while we dropped the lock? */ + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + /* Nope. Use the our freshly made directory entry. */ + err = insert_header(dir, &new->header); + subdir = ERR_PTR(err); + if (err) + goto failed; + subdir = new; +found: + subdir->header.nreg++; +failed: + if (unlikely(IS_ERR(subdir))) { + pr_err("sysctl could not get directory: "); + sysctl_print_dir(dir); + pr_cont("/%*.*s %ld\n", + namelen, namelen, name, PTR_ERR(subdir)); + } + drop_sysctl_table(&dir->header); + if (new) + drop_sysctl_table(&new->header); + spin_unlock(&sysctl_lock); + return subdir; +} + +static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) +{ + struct ctl_dir *parent; + const char *procname; + if (!dir->header.parent) + return &set->dir; + parent = xlate_dir(set, dir->header.parent); + if (IS_ERR(parent)) + return parent; + procname = dir->header.ctl_table[0].procname; + return find_subdir(parent, procname, strlen(procname)); +} + +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces) +{ + struct ctl_table_header *head; + struct ctl_table_root *root; + struct ctl_table_set *set; + struct ctl_table *entry; + struct ctl_dir *dir; + int ret; + + ret = 0; + spin_lock(&sysctl_lock); + root = (*pentry)->data; + set = lookup_header_set(root, namespaces); + dir = xlate_dir(set, (*phead)->parent); + if (IS_ERR(dir)) + ret = PTR_ERR(dir); + else { + const char *procname = (*pentry)->procname; + head = NULL; + entry = find_entry(&head, dir, procname, strlen(procname)); + ret = -ENOENT; + if (entry && use_table(head)) { + unuse_table(*phead); + *phead = head; + *pentry = entry; + ret = 0; + } + } + + spin_unlock(&sysctl_lock); + return ret; +} + +static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("sysctl table check failed: %s/%s %pV\n", + path, table->procname, &vaf); + + va_end(args); + return -EINVAL; +} + +static int sysctl_check_table(const char *path, struct ctl_table *table) +{ + int err = 0; + for (; table->procname; table++) { + if (table->child) + err = sysctl_err(path, table, "Not a file"); + + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + err = sysctl_err(path, table, "No data"); + if (!table->maxlen) + err = sysctl_err(path, table, "No maxlen"); + } + if (!table->proc_handler) + err = sysctl_err(path, table, "No proc_handler"); + + if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) + err = sysctl_err(path, table, "bogus .mode 0%o", + table->mode); + } + return err; +} + +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, + struct ctl_table_root *link_root) +{ + struct ctl_table *link_table, *entry, *link; + struct ctl_table_header *links; + struct ctl_node *node; + char *link_name; + int nr_entries, name_bytes; + + name_bytes = 0; + nr_entries = 0; + for (entry = table; entry->procname; entry++) { + nr_entries++; + name_bytes += strlen(entry->procname) + 1; + } + + links = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries + + sizeof(struct ctl_table)*(nr_entries + 1) + + name_bytes, + GFP_KERNEL); + + if (!links) + return NULL; + + node = (struct ctl_node *)(links + 1); + link_table = (struct ctl_table *)(node + nr_entries); + link_name = (char *)&link_table[nr_entries + 1]; + + for (link = link_table, entry = table; entry->procname; link++, entry++) { + int len = strlen(entry->procname) + 1; + memcpy(link_name, entry->procname, len); + link->procname = link_name; + link->mode = S_IFLNK|S_IRWXUGO; + link->data = link_root; + link_name += len; + } + init_header(links, dir->header.root, dir->header.set, node, link_table); + links->nreg = nr_entries; + + return links; +} + +static bool get_links(struct ctl_dir *dir, + struct ctl_table *table, struct ctl_table_root *link_root) +{ + struct ctl_table_header *head; + struct ctl_table *entry, *link; + + /* Are there links available for every entry in table? */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + if (!link) + return false; + if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) + continue; + if (S_ISLNK(link->mode) && (link->data == link_root)) + continue; + return false; + } + + /* The checks passed. Increase the registration count on the links */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + head->nreg++; + } + return true; +} + +static int insert_links(struct ctl_table_header *head) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_dir *core_parent = NULL; + struct ctl_table_header *links; + int err; + + if (head->set == root_set) + return 0; + + core_parent = xlate_dir(root_set, head->parent); + if (IS_ERR(core_parent)) + return 0; + + if (get_links(core_parent, head->ctl_table, head->root)) + return 0; + + core_parent->header.nreg++; + spin_unlock(&sysctl_lock); + + links = new_links(core_parent, head->ctl_table, head->root); + + spin_lock(&sysctl_lock); + err = -ENOMEM; + if (!links) + goto out; + + err = 0; + if (get_links(core_parent, head->ctl_table, head->root)) { + kfree(links); + goto out; + } + + err = insert_header(core_parent, links); + if (err) + kfree(links); +out: + drop_sysctl_table(&core_parent->header); + return err; +} + +/** + * __register_sysctl_table - register a leaf sysctl table + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file + * + * child - must be %NULL. + * + * proc_handler - the text handler routine (described below) + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * There must be a proc_handler routine for any terminal nodes. + * Several default handlers are available to cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_table( + struct ctl_table_set *set, + const char *path, struct ctl_table *table) +{ + struct ctl_table_root *root = set->dir.header.root; + struct ctl_table_header *header; + const char *name, *nextname; + struct ctl_dir *dir; + struct ctl_table *entry; + struct ctl_node *node; + int nr_entries = 0; + + for (entry = table; entry->procname; entry++) + nr_entries++; + + header = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); + if (!header) + return NULL; + + node = (struct ctl_node *)(header + 1); + init_header(header, root, set, node, table); + if (sysctl_check_table(path, table)) + goto fail; + + spin_lock(&sysctl_lock); + dir = &set->dir; + /* Reference moved down the diretory tree get_subdir */ + dir->header.nreg++; + spin_unlock(&sysctl_lock); + + /* Find the directory for the ctl_table */ + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + goto fail; + } + + spin_lock(&sysctl_lock); + if (insert_header(dir, header)) + goto fail_put_dir_locked; + + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); + + return header; + +fail_put_dir_locked: + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); +fail: + kfree(header); + dump_stack(); + return NULL; +} + +/** + * register_sysctl - register a sysctl table + * @path: The path to the directory the sysctl table is in. + * @table: the table structure + * + * Register a sysctl table. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +{ + return __register_sysctl_table(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl); + +static char *append_path(const char *path, char *pos, const char *name) +{ + int namelen; + namelen = strlen(name); + if (((pos - path) + namelen + 2) >= PATH_MAX) + return NULL; + memcpy(pos, name, namelen); + pos[namelen] = '/'; + pos[namelen + 1] = '\0'; + pos += namelen + 1; + return pos; +} + +static int count_subheaders(struct ctl_table *table) +{ + int has_files = 0; + int nr_subheaders = 0; + struct ctl_table *entry; + + /* special case: no directory and empty directory */ + if (!table || !table->procname) + return 1; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_subheaders += count_subheaders(entry->child); + else + has_files = 1; + } + return nr_subheaders + has_files; +} + +static int register_leaf_sysctl_tables(const char *path, char *pos, + struct ctl_table_header ***subheader, struct ctl_table_set *set, + struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = NULL; + struct ctl_table *entry, *files; + int nr_files = 0; + int nr_dirs = 0; + int err = -ENOMEM; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_dirs++; + else + nr_files++; + } + + files = table; + /* If there are mixed files and directories we need a new table */ + if (nr_dirs && nr_files) { + struct ctl_table *new; + files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1), + GFP_KERNEL); + if (!files) + goto out; + + ctl_table_arg = files; + for (new = files, entry = table; entry->procname; entry++) { + if (entry->child) + continue; + *new = *entry; + new++; + } + } + + /* Register everything except a directory full of subdirectories */ + if (nr_files || !nr_dirs) { + struct ctl_table_header *header; + header = __register_sysctl_table(set, path, files); + if (!header) { + kfree(ctl_table_arg); + goto out; + } + + /* Remember if we need to free the file table */ + header->ctl_table_arg = ctl_table_arg; + **subheader = header; + (*subheader)++; + } + + /* Recurse into the subdirectories. */ + for (entry = table; entry->procname; entry++) { + char *child_pos; + + if (!entry->child) + continue; + + err = -ENAMETOOLONG; + child_pos = append_path(path, pos, entry->procname); + if (!child_pos) + goto out; + + err = register_leaf_sysctl_tables(path, child_pos, subheader, + set, entry->child); + pos[0] = '\0'; + if (err) + goto out; + } + err = 0; +out: + /* On failure our caller will unregister all registered subheaders */ + return err; +} + +/** + * __register_sysctl_paths - register a sysctl table hierarchy + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_set *set, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = table; + int nr_subheaders = count_subheaders(table); + struct ctl_table_header *header = NULL, **subheaders, **subheader; + const struct ctl_path *component; + char *new_path, *pos; + + pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!new_path) + return NULL; + + pos[0] = '\0'; + for (component = path; component->procname; component++) { + pos = append_path(new_path, pos, component->procname); + if (!pos) + goto out; + } + while (table->procname && table->child && !table[1].procname) { + pos = append_path(new_path, pos, table->procname); + if (!pos) + goto out; + table = table->child; + } + if (nr_subheaders == 1) { + header = __register_sysctl_table(set, new_path, table); + if (header) + header->ctl_table_arg = ctl_table_arg; + } else { + header = kzalloc(sizeof(*header) + + sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); + if (!header) + goto out; + + subheaders = (struct ctl_table_header **) (header + 1); + subheader = subheaders; + header->ctl_table_arg = ctl_table_arg; + + if (register_leaf_sysctl_tables(new_path, pos, &subheader, + set, table)) + goto err_register_leaves; + } + +out: + kfree(new_path); + return header; + +err_register_leaves: + while (subheader > subheaders) { + struct ctl_table_header *subh = *(--subheader); + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + header = NULL; + goto out; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +static void put_links(struct ctl_table_header *header) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_table_root *root = header->root; + struct ctl_dir *parent = header->parent; + struct ctl_dir *core_parent; + struct ctl_table *entry; + + if (header->set == root_set) + return; + + core_parent = xlate_dir(root_set, parent); + if (IS_ERR(core_parent)) + return; + + for (entry = header->ctl_table; entry->procname; entry++) { + struct ctl_table_header *link_head; + struct ctl_table *link; + const char *name = entry->procname; + + link = find_entry(&link_head, core_parent, name, strlen(name)); + if (link && + ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || + (S_ISLNK(link->mode) && (link->data == root)))) { + drop_sysctl_table(link_head); + } + else { + pr_err("sysctl link missing during unregister: "); + sysctl_print_dir(parent); + pr_cont("/%s\n", name); + } + } +} + +static void drop_sysctl_table(struct ctl_table_header *header) +{ + struct ctl_dir *parent = header->parent; + + if (--header->nreg) + return; + + put_links(header); + start_unregistering(header); + if (!--header->count) + kfree_rcu(header, rcu); + + if (parent) + drop_sysctl_table(&parent->header); +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + int nr_subheaders; + might_sleep(); + + if (header == NULL) + return; + + nr_subheaders = count_subheaders(header->ctl_table_arg); + if (unlikely(nr_subheaders > 1)) { + struct ctl_table_header **subheaders; + int i; + + subheaders = (struct ctl_table_header **)(header + 1); + for (i = nr_subheaders -1; i >= 0; i--) { + struct ctl_table_header *subh = subheaders[i]; + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + return; + } + + spin_lock(&sysctl_lock); + drop_sysctl_table(header); + spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *set, + struct ctl_table_root *root, + int (*is_seen)(struct ctl_table_set *)) +{ + memset(set, 0, sizeof(*set)); + set->is_seen = is_seen; + init_header(&set->dir.header, root, set, NULL, root_table); +} + +void retire_sysctl_set(struct ctl_table_set *set) +{ + WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); +} + +int __init proc_sys_init(void) +{ + struct proc_dir_entry *proc_sys_root; + + proc_sys_root = proc_mkdir("sys", NULL); + proc_sys_root->proc_iops = &proc_sys_dir_operations; + proc_sys_root->proc_fops = &proc_sys_dir_file_operations; + proc_sys_root->nlink = 0; + + return sysctl_init(); +} diff --git a/kernel/fs/proc/proc_tty.c b/kernel/fs/proc/proc_tty.c new file mode 100644 index 000000000..15f327bed --- /dev/null +++ b/kernel/fs/proc/proc_tty.c @@ -0,0 +1,189 @@ +/* + * proc_tty.c -- handles /proc/tty + * + * Copyright 1997, Theodore Ts'o + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The /proc/tty directory inodes... + */ +static struct proc_dir_entry *proc_tty_driver; + +/* + * This is the handler for /proc/tty/drivers + */ +static void show_tty_range(struct seq_file *m, struct tty_driver *p, + dev_t from, int num) +{ + seq_printf(m, "%-20s ", p->driver_name ? p->driver_name : "unknown"); + seq_printf(m, "/dev/%-8s ", p->name); + if (p->num > 1) { + seq_printf(m, "%3d %d-%d ", MAJOR(from), MINOR(from), + MINOR(from) + num - 1); + } else { + seq_printf(m, "%3d %7d ", MAJOR(from), MINOR(from)); + } + switch (p->type) { + case TTY_DRIVER_TYPE_SYSTEM: + seq_puts(m, "system"); + if (p->subtype == SYSTEM_TYPE_TTY) + seq_puts(m, ":/dev/tty"); + else if (p->subtype == SYSTEM_TYPE_SYSCONS) + seq_puts(m, ":console"); + else if (p->subtype == SYSTEM_TYPE_CONSOLE) + seq_puts(m, ":vtmaster"); + break; + case TTY_DRIVER_TYPE_CONSOLE: + seq_puts(m, "console"); + break; + case TTY_DRIVER_TYPE_SERIAL: + seq_puts(m, "serial"); + break; + case TTY_DRIVER_TYPE_PTY: + if (p->subtype == PTY_TYPE_MASTER) + seq_puts(m, "pty:master"); + else if (p->subtype == PTY_TYPE_SLAVE) + seq_puts(m, "pty:slave"); + else + seq_puts(m, "pty"); + break; + default: + seq_printf(m, "type:%d.%d", p->type, p->subtype); + } + seq_putc(m, '\n'); +} + +static int show_tty_driver(struct seq_file *m, void *v) +{ + struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers); + dev_t from = MKDEV(p->major, p->minor_start); + dev_t to = from + p->num; + + if (&p->tty_drivers == tty_drivers.next) { + /* pseudo-drivers first */ + seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); + seq_puts(m, "system:/dev/tty\n"); + seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); + seq_puts(m, "system:console\n"); +#ifdef CONFIG_UNIX98_PTYS + seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); + seq_puts(m, "system\n"); +#endif +#ifdef CONFIG_VT + seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); + seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); + seq_puts(m, "system:vtmaster\n"); +#endif + } + + while (MAJOR(from) < MAJOR(to)) { + dev_t next = MKDEV(MAJOR(from)+1, 0); + show_tty_range(m, p, from, next - from); + from = next; + } + if (from != to) + show_tty_range(m, p, from, to - from); + return 0; +} + +/* iterator */ +static void *t_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&tty_mutex); + return seq_list_start(&tty_drivers, *pos); +} + +static void *t_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &tty_drivers, pos); +} + +static void t_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tty_mutex); +} + +static const struct seq_operations tty_drivers_op = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = show_tty_driver +}; + +static int tty_drivers_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &tty_drivers_op); +} + +static const struct file_operations proc_tty_drivers_operations = { + .open = tty_drivers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * This function is called by tty_register_driver() to handle + * registering the driver's /proc handler into /proc/tty/driver/ + */ +void proc_tty_register_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_fops) + return; + + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); + driver->proc_entry = ent; +} + +/* + * This function is called by tty_unregister_driver() + */ +void proc_tty_unregister_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + ent = driver->proc_entry; + if (!ent) + return; + + remove_proc_entry(driver->driver_name, proc_tty_driver); + + driver->proc_entry = NULL; +} + +/* + * Called by proc_root_init() to initialize the /proc/tty subtree + */ +void __init proc_tty_init(void) +{ + if (!proc_mkdir("tty", NULL)) + return; + proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */ + /* + * /proc/tty/driver/serial reveals the exact character counts for + * serial links which is just too easy to abuse for inferring + * password lengths and inter-keystroke timings during password + * entry. + */ + proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); + proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); + proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); +} diff --git a/kernel/fs/proc/root.c b/kernel/fs/proc/root.c new file mode 100644 index 000000000..68feb0f70 --- /dev/null +++ b/kernel/fs/proc/root.c @@ -0,0 +1,270 @@ +/* + * linux/fs/proc/root.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc root directory handling functions + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static int proc_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int proc_set_super(struct super_block *sb, void *data) +{ + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; +} + +enum { + Opt_gid, Opt_hidepid, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = make_kgid(current_user_ns(), option); + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; + + sync_filesystem(sb); + return !proc_parse_options(data, pid); +} + +static struct dentry *proc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int err; + struct super_block *sb; + struct pid_namespace *ns; + char *options; + + if (flags & MS_KERNMOUNT) { + ns = (struct pid_namespace *)data; + options = NULL; + } else { + ns = task_active_pid_ns(current); + options = data; + + /* Does the mounter have privilege over the pid namespace? */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + + sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } + + if (!sb->s_root) { + err = proc_fill_super(sb); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +static void proc_kill_sb(struct super_block *sb) +{ + struct pid_namespace *ns; + + ns = (struct pid_namespace *)sb->s_fs_info; + if (ns->proc_self) + dput(ns->proc_self); + if (ns->proc_thread_self) + dput(ns->proc_thread_self); + kill_anon_super(sb); + put_pid_ns(ns); +} + +static struct file_system_type proc_fs_type = { + .name = "proc", + .mount = proc_mount, + .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, +}; + +void __init proc_root_init(void) +{ + int err; + + proc_init_inodecache(); + err = register_filesystem(&proc_fs_type); + if (err) + return; + + proc_self_init(); + proc_thread_self_init(); + proc_symlink("mounts", NULL, "self/mounts"); + + proc_net_init(); + +#ifdef CONFIG_SYSVIPC + proc_mkdir("sysvipc", NULL); +#endif + proc_mkdir("fs", NULL); + proc_mkdir("driver", NULL); + proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ +#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) + /* just give it a mountpoint */ + proc_create_mount_point("openprom"); +#endif + proc_tty_init(); + proc_mkdir("bus", NULL); + proc_sys_init(); +} + +static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +) +{ + generic_fillattr(d_inode(dentry), stat); + stat->nlink = proc_root.nlink + nr_processes(); + return 0; +} + +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) +{ + if (!proc_pid_lookup(dir, dentry, flags)) + return NULL; + + return proc_lookup(dir, dentry, flags); +} + +static int proc_root_readdir(struct file *file, struct dir_context *ctx) +{ + if (ctx->pos < FIRST_PROCESS_ENTRY) { + int error = proc_readdir(file, ctx); + if (unlikely(error <= 0)) + return error; + ctx->pos = FIRST_PROCESS_ENTRY; + } + + return proc_pid_readdir(file, ctx); +} + +/* + * The root /proc directory is special, as it has the + * directories. Thus we don't use the generic + * directory handling functions for that.. + */ +static const struct file_operations proc_root_operations = { + .read = generic_read_dir, + .iterate = proc_root_readdir, + .llseek = default_llseek, +}; + +/* + * proc root can do almost nothing.. + */ +static const struct inode_operations proc_root_inode_operations = { + .lookup = proc_root_lookup, + .getattr = proc_root_getattr, +}; + +/* + * This is the root "inode" in the /proc tree.. + */ +struct proc_dir_entry proc_root = { + .low_ino = PROC_ROOT_INO, + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, + .count = ATOMIC_INIT(1), + .proc_iops = &proc_root_inode_operations, + .proc_fops = &proc_root_operations, + .parent = &proc_root, + .subdir = RB_ROOT, + .name = "/proc", +}; + +int pid_ns_prepare_proc(struct pid_namespace *ns) +{ + struct vfsmount *mnt; + + mnt = kern_mount_data(&proc_fs_type, ns); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + ns->proc_mnt = mnt; + return 0; +} + +void pid_ns_release_proc(struct pid_namespace *ns) +{ + kern_unmount(ns->proc_mnt); +} diff --git a/kernel/fs/proc/self.c b/kernel/fs/proc/self.c new file mode 100644 index 000000000..6195b4a7c --- /dev/null +++ b/kernel/fs/proc/self.c @@ -0,0 +1,84 @@ +#include +#include +#include +#include +#include "internal.h" + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return readlink_copy(buffer, buflen, tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = kfree_put_link, +}; + +static unsigned self_inum; + +int proc_setup_self(struct super_block *s) +{ + struct inode *root_inode = d_inode(s->s_root); + struct pid_namespace *ns = s->s_fs_info; + struct dentry *self; + + mutex_lock(&root_inode->i_mutex); + self = d_alloc_name(s->s_root, "self"); + if (self) { + struct inode *inode = new_inode_pseudo(s); + if (inode) { + inode->i_ino = self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_self_inode_operations; + d_add(self, inode); + } else { + dput(self); + self = ERR_PTR(-ENOMEM); + } + } else { + self = ERR_PTR(-ENOMEM); + } + mutex_unlock(&root_inode->i_mutex); + if (IS_ERR(self)) { + pr_err("proc_fill_super: can't allocate /proc/self\n"); + return PTR_ERR(self); + } + ns->proc_self = self; + return 0; +} + +void __init proc_self_init(void) +{ + proc_alloc_inum(&self_inum); +} diff --git a/kernel/fs/proc/softirqs.c b/kernel/fs/proc/softirqs.c new file mode 100644 index 000000000..ad8a77f94 --- /dev/null +++ b/kernel/fs/proc/softirqs.c @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +/* + * /proc/softirqs ... display the number of softirqs + */ +static int show_softirqs(struct seq_file *p, void *v) +{ + int i, j; + + seq_puts(p, " "); + for_each_possible_cpu(i) + seq_printf(p, "CPU%-8d", i); + seq_putc(p, '\n'); + + for (i = 0; i < NR_SOFTIRQS; i++) { + seq_printf(p, "%12s:", softirq_to_name[i]); + for_each_possible_cpu(j) + seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_putc(p, '\n'); + } + return 0; +} + +static int softirqs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_softirqs, NULL); +} + +static const struct file_operations proc_softirqs_operations = { + .open = softirqs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_softirqs_init(void) +{ + proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + return 0; +} +fs_initcall(proc_softirqs_init); diff --git a/kernel/fs/proc/stat.c b/kernel/fs/proc/stat.c new file mode 100644 index 000000000..510413eb2 --- /dev/null +++ b/kernel/fs/proc/stat.c @@ -0,0 +1,206 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + +#ifdef arch_idle_time + +static cputime64_t get_idle_time(int cpu) +{ + cputime64_t idle; + + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) + idle += arch_idle_time(cpu); + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + cputime64_t iowait; + + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + if (cpu_online(cpu) && nr_iowait_cpu(cpu)) + iowait += arch_idle_time(cpu); + return iowait; +} + +#else + +static u64 get_idle_time(int cpu) +{ + u64 idle, idle_time = -1ULL; + + if (cpu_online(cpu)) + idle_time = get_cpu_idle_time_us(cpu, NULL); + + if (idle_time == -1ULL) + /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + else + idle = usecs_to_cputime64(idle_time); + + return idle; +} + +static u64 get_iowait_time(int cpu) +{ + u64 iowait, iowait_time = -1ULL; + + if (cpu_online(cpu)) + iowait_time = get_cpu_iowait_time_us(cpu, NULL); + + if (iowait_time == -1ULL) + /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + else + iowait = usecs_to_cputime64(iowait_time); + + return iowait; +} + +#endif + +static int show_stat(struct seq_file *p, void *v) +{ + int i, j; + unsigned long jif; + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; + u64 sum = 0; + u64 sum_softirq = 0; + unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; + struct timespec boottime; + + user = nice = system = idle = iowait = + irq = softirq = steal = 0; + guest = guest_nice = 0; + getboottime(&boottime); + jif = boottime.tv_sec; + + for_each_possible_cpu(i) { + user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(i); + iowait += get_iowait_time(i); + irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); + + for (j = 0; j < NR_SOFTIRQS; j++) { + unsigned int softirq_stat = kstat_softirqs_cpu(j, i); + + per_softirq_sums[j] += softirq_stat; + sum_softirq += softirq_stat; + } + } + sum += arch_irq_stat(); + + seq_puts(p, "cpu "); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + + for_each_online_cpu(i) { + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ + user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(i); + iowait = get_iowait_time(i); + irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + seq_printf(p, "cpu%d", i); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + } + seq_printf(p, "intr %llu", (unsigned long long)sum); + + /* sum again ? it could be updated? */ + for_each_irq_nr(j) + seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j)); + + seq_printf(p, + "\nctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + nr_context_switches(), + (unsigned long)jif, + total_forks, + nr_running(), + nr_iowait()); + + seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); + + for (i = 0; i < NR_SOFTIRQS; i++) + seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); + seq_putc(p, '\n'); + + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + size_t size = 1024 + 128 * num_online_cpus(); + + /* minimum size to display an interrupt count : 2 bytes */ + size += 2 * nr_irqs; + return single_open_size(file, show_stat, NULL, size); +} + +static const struct file_operations proc_stat_operations = { + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_stat_init(void) +{ + proc_create("stat", 0, NULL, &proc_stat_operations); + return 0; +} +fs_initcall(proc_stat_init); diff --git a/kernel/fs/proc/task_mmu.c b/kernel/fs/proc/task_mmu.c new file mode 100644 index 000000000..6dee68d01 --- /dev/null +++ b/kernel/fs/proc/task_mmu.c @@ -0,0 +1,1625 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + unsigned long data, text, lib, swap, ptes, pmds; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. + */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = get_mm_rss(mm); + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; + + data = mm->total_vm - mm->shared_vm - mm->stack_vm; + text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; + lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); + seq_printf(m, + "VmPeak:\t%8lu kB\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmPin:\t%8lu kB\n" + "VmHWM:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB\n" + "VmPTE:\t%8lu kB\n" + "VmPMD:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", + hiwater_vm << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), + mm->locked_vm << (PAGE_SHIFT-10), + mm->pinned_vm << (PAGE_SHIFT-10), + hiwater_rss << (PAGE_SHIFT-10), + total_rss << (PAGE_SHIFT-10), + data << (PAGE_SHIFT-10), + mm->stack_vm << (PAGE_SHIFT-10), text, lib, + ptes >> 10, + pmds >> 10, + swap << (PAGE_SHIFT-10)); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + return PAGE_SIZE * mm->total_vm; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + *shared = get_mm_counter(mm, MM_FILEPAGES); + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = mm->total_vm - mm->shared_vm; + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + return mm->total_vm; +} + +#ifdef CONFIG_NUMA +/* + * Save get_task_policy() for show_numa_map(). + */ +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ + struct task_struct *task = priv->task; + + task_lock(task); + priv->task_mempolicy = get_task_policy(task); + mpol_get(priv->task_mempolicy); + task_unlock(task); +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ + mpol_put(priv->task_mempolicy); +} +#else +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ +} +#endif + +static void vma_stop(struct proc_maps_private *priv) +{ + struct mm_struct *mm = priv->mm; + + release_task_mempolicy(priv); + up_read(&mm->mmap_sem); + mmput(mm); +} + +static struct vm_area_struct * +m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) +{ + if (vma == priv->tail_vma) + return NULL; + return vma->vm_next ?: priv->tail_vma; +} + +static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) +{ + if (m->count < m->size) /* vma is copied successfully */ + m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) +{ + struct proc_maps_private *priv = m->private; + unsigned long last_addr = m->version; + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned int pos = *ppos; + + /* See m_cache_vma(). Zero at the start or after lseek. */ + if (last_addr == -1UL) + return NULL; + + priv->task = get_proc_task(priv->inode); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = priv->mm; + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + return NULL; + + down_read(&mm->mmap_sem); + hold_task_mempolicy(priv); + priv->tail_vma = get_gate_vma(mm); + + if (last_addr) { + vma = find_vma(mm, last_addr); + if (vma && (vma = m_next_vma(priv, vma))) + return vma; + } + + m->version = 0; + if (pos < mm->map_count) { + for (vma = mm->mmap; pos; pos--) { + m->version = vma->vm_start; + vma = vma->vm_next; + } + return vma; + } + + /* we do not bother to update m->version in this case */ + if (pos == mm->map_count && priv->tail_vma) + return priv->tail_vma; + + vma_stop(priv); + return NULL; +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *next; + + (*pos)++; + next = m_next_vma(priv, v); + if (!next) + vma_stop(priv); + return next; +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + + if (!IS_ERR_OR_NULL(v)) + vma_stop(priv); + if (priv->task) { + put_task_struct(priv->task); + priv->task = NULL; + } +} + +static int proc_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops, int psize) +{ + struct proc_maps_private *priv = __seq_open_private(file, ops, psize); + + if (!priv) + return -ENOMEM; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); + + seq_release_private(inode, file); + return err; + } + + return 0; +} + +static int proc_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + return seq_release_private(inode, file); +} + +static int do_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + return proc_maps_open(inode, file, ops, + sizeof(struct proc_maps_private)); +} + +static pid_t pid_of_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, bool is_pid) +{ + struct inode *inode = priv->inode; + struct task_struct *task; + pid_t ret = 0; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + task = task_of_stack(task, vma, is_pid); + if (task) + ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + } + rcu_read_unlock(); + + return ret; +} + +static void +show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) +{ + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + struct proc_maps_private *priv = m->private; + vm_flags_t flags = vma->vm_flags; + unsigned long ino = 0; + unsigned long long pgoff = 0; + unsigned long start, end; + dev_t dev = 0; + const char *name = NULL; + + if (file) { + struct inode *inode = file_inode(vma->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; + } + + /* We don't show the stack guard page in /proc/maps */ + start = vma->vm_start; + if (stack_guard_page_start(vma, start)) + start += PAGE_SIZE; + end = vma->vm_end; + if (stack_guard_page_end(vma, end)) + end -= PAGE_SIZE; + + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", + start, + end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino); + + /* + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: + */ + if (file) { + seq_pad(m, ' '); + seq_path(m, &file->f_path, "\n"); + goto done; + } + + if (vma->vm_ops && vma->vm_ops->name) { + name = vma->vm_ops->name(vma); + if (name) + goto done; + } + + name = arch_vma_name(vma); + if (!name) { + pid_t tid; + + if (!mm) { + name = "[vdso]"; + goto done; + } + + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { + name = "[heap]"; + goto done; + } + + tid = pid_of_stack(priv, vma, is_pid); + if (tid != 0) { + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) { + name = "[stack]"; + } else { + /* Thread stack in /proc/PID/maps */ + seq_pad(m, ' '); + seq_printf(m, "[stack:%d]", tid); + } + } + } + +done: + if (name) { + seq_pad(m, ' '); + seq_puts(m, name); + } + seq_putc(m, '\n'); +} + +static int show_map(struct seq_file *m, void *v, int is_pid) +{ + show_map_vma(m, v, is_pid); + m_cache_vma(m, v); + return 0; +} + +static int show_pid_map(struct seq_file *m, void *v) +{ + return show_map(m, v, 1); +} + +static int show_tid_map(struct seq_file *m, void *v) +{ + return show_map(m, v, 0); +} + +static const struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_map +}; + +static const struct seq_operations proc_tid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_map +}; + +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +static int tid_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_tid_maps_op); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + +const struct file_operations proc_tid_maps_operations = { + .open = tid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + +/* + * Proportional Set Size(PSS): my share of RSS. + * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + +#ifdef CONFIG_PROC_PAGE_MONITOR +struct mem_size_stats { + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + unsigned long anonymous; + unsigned long anonymous_thp; + unsigned long swap; + u64 pss; +}; + +static void smaps_account(struct mem_size_stats *mss, struct page *page, + unsigned long size, bool young, bool dirty) +{ + int mapcount; + + if (PageAnon(page)) + mss->anonymous += size; + + mss->resident += size; + /* Accumulate the size in pages that have been accessed. */ + if (young || PageReferenced(page)) + mss->referenced += size; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + u64 pss_delta; + + if (dirty || PageDirty(page)) + mss->shared_dirty += size; + else + mss->shared_clean += size; + pss_delta = (u64)size << PSS_SHIFT; + do_div(pss_delta, mapcount); + mss->pss += pss_delta; + } else { + if (dirty || PageDirty(page)) + mss->private_dirty += size; + else + mss->private_clean += size; + mss->pss += (u64)size << PSS_SHIFT; + } +} + +static void smaps_pte_entry(pte_t *pte, unsigned long addr, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + struct page *page = NULL; + + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); + + if (!non_swap_entry(swpent)) + mss->swap += PAGE_SIZE; + else if (is_migration_entry(swpent)) + page = migration_entry_to_page(swpent); + } + + if (!page) + return; + smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + struct page *page; + + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (IS_ERR_OR_NULL(page)) + return; + mss->anonymous_thp += HPAGE_PMD_SIZE; + smaps_account(mss, page, HPAGE_PMD_SIZE, + pmd_young(*pmd), pmd_dirty(*pmd)); +} +#else +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ +} +#endif + +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + pte_t *pte; + spinlock_t *ptl; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + smaps_pmd_entry(pmd, addr, walk); + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + /* + * The mmap_sem held all the way back in m_start() is what + * keeps khugepaged out of here and from collapsing things + * in here. + */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + smaps_pte_entry(pte, addr, walk); + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ + /* + * Don't forget to update Documentation/ on changes. + */ + static const char mnemonics[BITS_PER_LONG][2] = { + /* + * In case if we meet a flag we don't know about. + */ + [0 ... (BITS_PER_LONG-1)] = "??", + + [ilog2(VM_READ)] = "rd", + [ilog2(VM_WRITE)] = "wr", + [ilog2(VM_EXEC)] = "ex", + [ilog2(VM_SHARED)] = "sh", + [ilog2(VM_MAYREAD)] = "mr", + [ilog2(VM_MAYWRITE)] = "mw", + [ilog2(VM_MAYEXEC)] = "me", + [ilog2(VM_MAYSHARE)] = "ms", + [ilog2(VM_GROWSDOWN)] = "gd", + [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_DENYWRITE)] = "dw", +#ifdef CONFIG_X86_INTEL_MPX + [ilog2(VM_MPX)] = "mp", +#endif + [ilog2(VM_LOCKED)] = "lo", + [ilog2(VM_IO)] = "io", + [ilog2(VM_SEQ_READ)] = "sr", + [ilog2(VM_RAND_READ)] = "rr", + [ilog2(VM_DONTCOPY)] = "dc", + [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_ACCOUNT)] = "ac", + [ilog2(VM_NORESERVE)] = "nr", + [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_MEM_SOFT_DIRTY + [ilog2(VM_SOFTDIRTY)] = "sd", +#endif + [ilog2(VM_MIXEDMAP)] = "mm", + [ilog2(VM_HUGEPAGE)] = "hg", + [ilog2(VM_NOHUGEPAGE)] = "nh", + [ilog2(VM_MERGEABLE)] = "mg", + }; + size_t i; + + seq_puts(m, "VmFlags: "); + for (i = 0; i < BITS_PER_LONG; i++) { + if (vma->vm_flags & (1UL << i)) { + seq_printf(m, "%c%c ", + mnemonics[i][0], mnemonics[i][1]); + } + } + seq_putc(m, '\n'); +} + +static int show_smap(struct seq_file *m, void *v, int is_pid) +{ + struct vm_area_struct *vma = v; + struct mem_size_stats mss; + struct mm_walk smaps_walk = { + .pmd_entry = smaps_pte_range, + .mm = vma->vm_mm, + .private = &mss, + }; + + memset(&mss, 0, sizeof mss); + /* mmap_sem is held in m_start */ + walk_page_vma(vma, &smaps_walk); + + show_map_vma(m, vma, is_pid); + + seq_printf(m, + "Size: %8lu kB\n" + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + "Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" + "AnonHugePages: %8lu kB\n" + "Swap: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + mss.resident >> 10, + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), + mss.shared_clean >> 10, + mss.shared_dirty >> 10, + mss.private_clean >> 10, + mss.private_dirty >> 10, + mss.referenced >> 10, + mss.anonymous >> 10, + mss.anonymous_thp >> 10, + mss.swap >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); + + show_smap_vma_flags(m, vma); + m_cache_vma(m, vma); + return 0; +} + +static int show_pid_smap(struct seq_file *m, void *v) +{ + return show_smap(m, v, 1); +} + +static int show_tid_smap(struct seq_file *m, void *v) +{ + return show_smap(m, v, 0); +} + +static const struct seq_operations proc_pid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_smap +}; + +static const struct seq_operations proc_tid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_smap +}; + +static int pid_smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +static int tid_smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_tid_smaps_op); +} + +const struct file_operations proc_pid_smaps_operations = { + .open = pid_smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + +const struct file_operations proc_tid_smaps_operations = { + .open = tid_smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + * but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + * these flag is set to denote, that user is aware of the + * new API and those page-shift bits change their meaning. + * The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + * of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_MM_HIWATER_RSS, + CLEAR_REFS_LAST, +}; + +struct clear_refs_private { + enum clear_refs_types type; +}; + +#ifdef CONFIG_MEM_SOFT_DIRTY +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } + + set_pte_at(vma->vm_mm, addr, pte, ptent); +} + +static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + + pmd = pmd_wrprotect(pmd); + pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); + + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); +} + +#else + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +} + +static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ +} +#endif + +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = walk->vma; + pte_t *pte, ptent; + spinlock_t *ptl; + struct page *page; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty_pmd(vma, addr, pmd); + goto out; + } + + page = pmd_page(*pmd); + + /* Clear accessed and referenced bits. */ + pmdp_test_and_clear_young(vma, addr, pmd); + ClearPageReferenced(page); +out: + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* Clear accessed and referenced bits. */ + ptep_test_and_clear_young(vma, addr, pte); + ClearPageReferenced(page); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +static int clear_refs_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = walk->vma; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. + * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. + * Writing 4 to /proc/pid/clear_refs affects all pages. + */ + if (cp->type == CLEAR_REFS_ANON && vma->vm_file) + return 1; + if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) + return 1; + return 0; +} + +static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + struct mm_struct *mm; + struct vm_area_struct *vma; + enum clear_refs_types type; + int itype; + int rv; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) + return -EINVAL; + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning!" + " See the linux/Documentation/vm/pagemap.txt for " + "details.\n"); + } + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + mm = get_task_mm(task); + if (mm) { + struct clear_refs_private cp = { + .type = type, + }; + struct mm_walk clear_refs_walk = { + .pmd_entry = clear_refs_pte_range, + .test_walk = clear_refs_test_walk, + .mm = mm, + .private = &cp, + }; + + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + /* + * Writing 5 to /proc/pid/clear_refs resets the peak + * resident set size to this mm's current rss value. + */ + down_write(&mm->mmap_sem); + reset_mm_hiwater_rss(mm); + up_write(&mm->mmap_sem); + goto out_mm; + } + + down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) { + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); + } + downgrade_write(&mm->mmap_sem); + break; + } + mmu_notifier_invalidate_range_start(mm, 0, -1); + } + walk_page_range(0, ~0UL, &clear_refs_walk); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); + flush_tlb_mm(mm); + up_read(&mm->mmap_sem); +out_mm: + mmput(mm); + } + put_task_struct(task); + + return count; +} + +const struct file_operations proc_clear_refs_operations = { + .write = clear_refs_write, + .llseek = noop_llseek, +}; + +typedef struct { + u64 pme; +} pagemap_entry_t; + +struct pagemapread { + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ + pagemap_entry_t *buffer; + bool v2; +}; + +#define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) + +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) +#define PM_STATUS_BITS 3 +#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) +#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) +#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) +#define PM_PSHIFT_BITS 6 +#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) +#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) +#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) + +#define __PM_SOFT_DIRTY (1LL) +#define PM_PRESENT PM_STATUS(4LL) +#define PM_SWAP PM_STATUS(2LL) +#define PM_FILE PM_STATUS(1LL) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) +#define PM_END_OF_BUFFER 1 + +static inline pagemap_entry_t make_pme(u64 val) +{ + return (pagemap_entry_t) { .pme = val }; +} + +static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, + struct pagemapread *pm) +{ + pm->buffer[pm->pos++] = *pme; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + +static int pagemap_pte_hole(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + unsigned long addr = start; + int err = 0; + + while (addr < end) { + struct vm_area_struct *vma = find_vma(walk->mm, addr); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + /* End of address space hole, which we mark as non-present. */ + unsigned long hole_end; + + if (vma) + hole_end = min(end, vma->vm_start); + else + hole_end = end; + + for (; addr < hole_end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } + + if (!vma) + break; + + /* Addresses in the VMA. */ + if (vma->vm_flags & VM_SOFTDIRTY) + pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } + } +out: + return err; +} + +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + u64 frame, flags; + struct page *page = NULL; + int flags2 = 0; + + if (pte_present(pte)) { + frame = pte_pfn(pte); + flags = PM_PRESENT; + page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + } else if (is_swap_pte(pte)) { + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + flags = PM_SWAP; + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } else { + if (vma->vm_flags & VM_SOFTDIRTY) + flags2 |= __PM_SOFT_DIRTY; + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); + return; + } + + if (page && !PageAnon(page)) + flags |= PM_FILE; + if ((vma->vm_flags & VM_SOFTDIRTY)) + flags2 |= __PM_SOFT_DIRTY; + + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) +{ + /* + * Currently pmd for thp is always present because thp can not be + * swapped-out, migrated, or HWPOISONed (split in such cases instead.) + * This if-check is just to prepare for future implementation. + */ + if (pmd_present(pmd)) + *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); +} +#else +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) +{ +} +#endif + +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct pagemapread *pm = walk->private; + spinlock_t *ptl; + pte_t *pte, *orig_pte; + int err = 0; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + int pmd_flags2; + + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) + pmd_flags2 = __PM_SOFT_DIRTY; + else + pmd_flags2 = 0; + + for (; addr != end; addr += PAGE_SIZE) { + unsigned long offset; + pagemap_entry_t pme; + + offset = (addr & ~PAGEMAP_WALK_MASK) >> + PAGE_SHIFT; + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); + err = add_to_pagemap(addr, &pme, pm); + if (err) + break; + } + spin_unlock(ptl); + return err; + } + + if (pmd_trans_unstable(pmd)) + return 0; + + /* + * We can assume that @vma always points to a valid one and @end never + * goes beyond vma->vm_end. + */ + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr < end; pte++, addr += PAGE_SIZE) { + pagemap_entry_t pme; + + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); + err = add_to_pagemap(addr, &pme, pm); + if (err) + break; + } + pte_unmap_unlock(orig_pte, ptl); + + cond_resched(); + + return err; +} + +#ifdef CONFIG_HUGETLB_PAGE +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pte_t pte, int offset, int flags2) +{ + if (pte_present(pte)) + *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | + PM_STATUS2(pm->v2, flags2) | + PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | + PM_STATUS2(pm->v2, flags2)); +} + +/* This function walks within one hugetlb entry in the single call */ +static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + struct vm_area_struct *vma = walk->vma; + int err = 0; + int flags2; + pagemap_entry_t pme; + + if (vma->vm_flags & VM_SOFTDIRTY) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; + + for (; addr != end; addr += PAGE_SIZE) { + int offset = (addr & ~hmask) >> PAGE_SHIFT; + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); + err = add_to_pagemap(addr, &pme, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} +#endif /* HUGETLB_PAGE */ + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit entry + * consisting of the following: + * + * Bits 0-54 page frame number (PFN) if present + * Bits 0-4 swap type if swapped + * Bits 5-54 swap offset if swapped + * Bits 55-60 page shift (page size = 1<> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); + ret = -ENOMEM; + if (!pm.buffer) + goto out_task; + + mm = mm_access(task, PTRACE_MODE_READ); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; + + pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pte_hole = pagemap_pte_hole; +#ifdef CONFIG_HUGETLB_PAGE + pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; +#endif + pagemap_walk.mm = mm; + pagemap_walk.private = ± + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; + start_vaddr = svpfn << PAGE_SHIFT; + end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. + */ + ret = 0; + while (count && (start_vaddr < end_vaddr)) { + int len; + unsigned long end; + + pm.pos = 0; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + /* overflow ? */ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; + down_read(&mm->mmap_sem); + ret = walk_page_range(start_vaddr, end, &pagemap_walk); + up_read(&mm->mmap_sem); + start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); + if (copy_to_user(buf, pm.buffer, len)) { + ret = -EFAULT; + goto out_mm; + } + copied += len; + buf += len; + count -= len; + } + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) + ret = copied; + +out_mm: + mmput(mm); +out_free: + kfree(pm.buffer); +out_task: + put_task_struct(task); +out: + return ret; +} + +static int pagemap_open(struct inode *inode, struct file *file) +{ + /* do not disclose physical addresses: attack vector */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + +const struct file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, + .open = pagemap_open, +}; +#endif /* CONFIG_PROC_PAGE_MONITOR */ + +#ifdef CONFIG_NUMA + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) +{ + int count = page_mapcount(page); + + md->pages += nr_pages; + if (pte_dirty || PageDirty(page)) + md->dirty += nr_pages; + + if (PageSwapCache(page)) + md->swapcache += nr_pages; + + if (PageActive(page) || PageUnevictable(page)) + md->active += nr_pages; + + if (PageWriteback(page)) + md->writeback += nr_pages; + + if (PageAnon(page)) + md->anon += nr_pages; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)] += nr_pages; +} + +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md = walk->private; + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; + + page = can_gather_numa_stats(huge_pte, vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page = can_gather_numa_stats(*pte, vma, addr); + if (!page) + continue; + gather_stats(page, md, pte_dirty(*pte), 1); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + struct page *page; + + if (!pte_present(*pte)) + return 0; + + page = pte_page(*pte); + if (!page) + return 0; + + md = walk->private; + gather_stats(page, md, pte_dirty(*pte), 1); + return 0; +} + +#else +static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v, int is_pid) +{ + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, + .private = md, + .mm = mm, + }; + struct mempolicy *pol; + char buffer[64]; + int nid; + + if (!mm) + return 0; + + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); + + pol = __get_vma_policy(vma, vma->vm_start); + if (pol) { + mpol_to_str(buffer, sizeof(buffer), pol); + mpol_cond_put(pol); + } else { + mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); + } + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_puts(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_puts(m, " heap"); + } else { + pid_t tid = pid_of_stack(proc_priv, vma, is_pid); + if (tid != 0) { + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) + seq_puts(m, " stack"); + else + seq_printf(m, " stack:%d", tid); + } + } + + if (is_vm_hugetlb_page(vma)) + seq_puts(m, " huge"); + + /* mmap_sem is held by m_start */ + walk_page_vma(vma, &walk); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(nid, N_MEMORY) + if (md->node[nid]) + seq_printf(m, " N%d=%lu", nid, md->node[nid]); + + seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); +out: + seq_putc(m, '\n'); + m_cache_vma(m, vma); + return 0; +} + +static int show_pid_numa_map(struct seq_file *m, void *v) +{ + return show_numa_map(m, v, 1); +} + +static int show_tid_numa_map(struct seq_file *m, void *v) +{ + return show_numa_map(m, v, 0); +} + +static const struct seq_operations proc_pid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_numa_map, +}; + +static const struct seq_operations proc_tid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_numa_map, +}; + +static int numa_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + return proc_maps_open(inode, file, ops, + sizeof(struct numa_maps_private)); +} + +static int pid_numa_maps_open(struct inode *inode, struct file *file) +{ + return numa_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +static int tid_numa_maps_open(struct inode *inode, struct file *file) +{ + return numa_maps_open(inode, file, &proc_tid_numa_maps_op); +} + +const struct file_operations proc_pid_numa_maps_operations = { + .open = pid_numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + +const struct file_operations proc_tid_numa_maps_operations = { + .open = tid_numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; +#endif /* CONFIG_NUMA */ diff --git a/kernel/fs/proc/task_nommu.c b/kernel/fs/proc/task_nommu.c new file mode 100644 index 000000000..599ec2e20 --- /dev/null +++ b/kernel/fs/proc/task_nommu.c @@ -0,0 +1,345 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Logic: we've got two memory sums for each process, "shared", and + * "non-shared". Shared memory may get counted more than once, for + * each process that owns it. Non-shared memory is counted + * accurately. + */ +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long bytes = 0, sbytes = 0, slack = 0, size; + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + + bytes += kobjsize(vma); + + region = vma->vm_region; + if (region) { + size = kobjsize(region); + size += region->vm_end - region->vm_start; + } else { + size = vma->vm_end - vma->vm_start; + } + + if (atomic_read(&mm->mm_count) > 1 || + vma->vm_flags & VM_MAYSHARE) { + sbytes += size; + } else { + bytes += size; + if (region) + slack = region->vm_end - vma->vm_end; + } + } + + if (atomic_read(&mm->mm_count) > 1) + sbytes += kobjsize(mm); + else + bytes += kobjsize(mm); + + if (current->fs && current->fs->users > 1) + sbytes += kobjsize(current->fs); + else + bytes += kobjsize(current->fs); + + if (current->files && atomic_read(¤t->files->count) > 1) + sbytes += kobjsize(current->files); + else + bytes += kobjsize(current->files); + + if (current->sighand && atomic_read(¤t->sighand->count) > 1) + sbytes += kobjsize(current->sighand); + else + bytes += kobjsize(current->sighand); + + bytes += kobjsize(current); /* includes kernel stack */ + + seq_printf(m, + "Mem:\t%8lu bytes\n" + "Slack:\t%8lu bytes\n" + "Shared:\t%8lu bytes\n", + bytes, slack, sbytes); + + up_read(&mm->mmap_sem); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct rb_node *p; + unsigned long vsize = 0; + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_end - vma->vm_start; + } + up_read(&mm->mmap_sem); + return vsize; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long size = kobjsize(mm); + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + region = vma->vm_region; + if (region) { + size += kobjsize(region); + size += region->vm_end - region->vm_start; + } + } + + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) + >> PAGE_SHIFT; + up_read(&mm->mmap_sem); + size >>= PAGE_SHIFT; + size += *text + *data; + *resident = size; + return size; +} + +static pid_t pid_of_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, bool is_pid) +{ + struct inode *inode = priv->inode; + struct task_struct *task; + pid_t ret = 0; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + task = task_of_stack(task, vma, is_pid); + if (task) + ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + } + rcu_read_unlock(); + + return ret; +} + +/* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, + int is_pid) +{ + struct mm_struct *mm = vma->vm_mm; + struct proc_maps_private *priv = m->private; + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags; + unsigned long long pgoff = 0; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = file_inode(vma->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + } + + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino); + + if (file) { + seq_pad(m, ' '); + seq_path(m, &file->f_path, ""); + } else if (mm) { + pid_t tid = pid_of_stack(priv, vma, is_pid); + + if (tid != 0) { + seq_pad(m, ' '); + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) + seq_printf(m, "[stack]"); + else + seq_printf(m, "[stack:%d]", tid); + } + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display mapping lines for a particular process's /proc/pid/maps + */ +static int show_map(struct seq_file *m, void *_p, int is_pid) +{ + struct rb_node *p = _p; + + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), + is_pid); +} + +static int show_pid_map(struct seq_file *m, void *_p) +{ + return show_map(m, _p, 1); +} + +static int show_tid_map(struct seq_file *m, void *_p) +{ + return show_map(m, _p, 0); +} + +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct mm_struct *mm; + struct rb_node *p; + loff_t n = *pos; + + /* pin the task and mm whilst we play with them */ + priv->task = get_proc_task(priv->inode); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = priv->mm; + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + return NULL; + + down_read(&mm->mmap_sem); + /* start from the Nth VMA */ + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) + if (n-- == 0) + return p; + + up_read(&mm->mmap_sem); + mmput(mm); + return NULL; +} + +static void m_stop(struct seq_file *m, void *_vml) +{ + struct proc_maps_private *priv = m->private; + + if (!IS_ERR_OR_NULL(_vml)) { + up_read(&priv->mm->mmap_sem); + mmput(priv->mm); + } + if (priv->task) { + put_task_struct(priv->task); + priv->task = NULL; + } +} + +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +{ + struct rb_node *p = _p; + + (*pos)++; + return p ? rb_next(p) : NULL; +} + +static const struct seq_operations proc_pid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_map +}; + +static const struct seq_operations proc_tid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_map +}; + +static int maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct proc_maps_private *priv; + + priv = __seq_open_private(file, ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); + + seq_release_private(inode, file); + return err; + } + + return 0; +} + + +static int map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + return seq_release_private(inode, file); +} + +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_pid_maps_ops); +} + +static int tid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_tid_maps_ops); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = map_release, +}; + +const struct file_operations proc_tid_maps_operations = { + .open = tid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = map_release, +}; + diff --git a/kernel/fs/proc/thread_self.c b/kernel/fs/proc/thread_self.c new file mode 100644 index 000000000..a8371993b --- /dev/null +++ b/kernel/fs/proc/thread_self.c @@ -0,0 +1,85 @@ +#include +#include +#include +#include +#include "internal.h" + +/* + * /proc/thread_self: + */ +static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t pid = task_pid_nr_ns(current, ns); + char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF]; + if (!pid) + return -ENOENT; + sprintf(tmp, "%d/task/%d", tgid, pid); + return readlink_copy(buffer, buflen, tmp); +} + +static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t pid = task_pid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (pid) { + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d/task/%d", tgid, pid); + } + nd_set_link(nd, name); + return NULL; +} + +static const struct inode_operations proc_thread_self_inode_operations = { + .readlink = proc_thread_self_readlink, + .follow_link = proc_thread_self_follow_link, + .put_link = kfree_put_link, +}; + +static unsigned thread_self_inum; + +int proc_setup_thread_self(struct super_block *s) +{ + struct inode *root_inode = d_inode(s->s_root); + struct pid_namespace *ns = s->s_fs_info; + struct dentry *thread_self; + + mutex_lock(&root_inode->i_mutex); + thread_self = d_alloc_name(s->s_root, "thread-self"); + if (thread_self) { + struct inode *inode = new_inode_pseudo(s); + if (inode) { + inode->i_ino = thread_self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_thread_self_inode_operations; + d_add(thread_self, inode); + } else { + dput(thread_self); + thread_self = ERR_PTR(-ENOMEM); + } + } else { + thread_self = ERR_PTR(-ENOMEM); + } + mutex_unlock(&root_inode->i_mutex); + if (IS_ERR(thread_self)) { + pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); + return PTR_ERR(thread_self); + } + ns->proc_thread_self = thread_self; + return 0; +} + +void __init proc_thread_self_init(void) +{ + proc_alloc_inum(&thread_self_inum); +} diff --git a/kernel/fs/proc/uptime.c b/kernel/fs/proc/uptime.c new file mode 100644 index 000000000..33de567c2 --- /dev/null +++ b/kernel/fs/proc/uptime.c @@ -0,0 +1,52 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static int uptime_proc_show(struct seq_file *m, void *v) +{ + struct timespec uptime; + struct timespec idle; + u64 idletime; + u64 nsec; + u32 rem; + int i; + + idletime = 0; + for_each_possible_cpu(i) + idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + + get_monotonic_boottime(&uptime); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; + seq_printf(m, "%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), + (unsigned long) idle.tv_sec, + (idle.tv_nsec / (NSEC_PER_SEC / 100))); + return 0; +} + +static int uptime_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, uptime_proc_show, NULL); +} + +static const struct file_operations uptime_proc_fops = { + .open = uptime_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_uptime_init(void) +{ + proc_create("uptime", 0, NULL, &uptime_proc_fops); + return 0; +} +fs_initcall(proc_uptime_init); diff --git a/kernel/fs/proc/version.c b/kernel/fs/proc/version.c new file mode 100644 index 000000000..d2154eb6d --- /dev/null +++ b/kernel/fs/proc/version.c @@ -0,0 +1,34 @@ +#include +#include +#include +#include +#include +#include + +static int version_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, linux_proc_banner, + utsname()->sysname, + utsname()->release, + utsname()->version); + return 0; +} + +static int version_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, version_proc_show, NULL); +} + +static const struct file_operations version_proc_fops = { + .open = version_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_version_init(void) +{ + proc_create("version", 0, NULL, &version_proc_fops); + return 0; +} +fs_initcall(proc_version_init); diff --git a/kernel/fs/proc/vmcore.c b/kernel/fs/proc/vmcore.c new file mode 100644 index 000000000..4e61388ec --- /dev/null +++ b/kernel/fs/proc/vmcore.c @@ -0,0 +1,1194 @@ +/* + * fs/proc/vmcore.c Interface for accessing the crash + * dump from the system's previous life. + * Heavily borrowed from fs/proc/kcore.c + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* List representing chunks of contiguous memory areas and their offsets in + * vmcore file. + */ +static LIST_HEAD(vmcore_list); + +/* Stores the pointer to the buffer containing kernel elf core headers. */ +static char *elfcorebuf; +static size_t elfcorebuf_sz; +static size_t elfcorebuf_sz_orig; + +static char *elfnotes_buf; +static size_t elfnotes_sz; + +/* Total size of vmcore file. */ +static u64 vmcore_size; + +static struct proc_dir_entry *proc_vmcore; + +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * The called function has to take care of module refcounting. + */ +static int (*oldmem_pfn_is_ram)(unsigned long pfn); + +int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (oldmem_pfn_is_ram) + return -EBUSY; + oldmem_pfn_is_ram = fn; + return 0; +} +EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); + +void unregister_oldmem_pfn_is_ram(void) +{ + oldmem_pfn_is_ram = NULL; + wmb(); +} +EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); + +static int pfn_is_ram(unsigned long pfn) +{ + int (*fn)(unsigned long pfn); + /* pfn is ram unless fn() checks pagetype */ + int ret = 1; + + /* + * Ask hypervisor if the pfn is really ram. + * A ballooned page contains no data and reading from such a page + * will cause high load in the hypervisor. + */ + fn = oldmem_pfn_is_ram; + if (fn) + ret = fn(pfn); + + return ret; +} + +/* Reads a page from the oldmem device from given offset. */ +static ssize_t read_from_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf) +{ + unsigned long pfn, offset; + size_t nr_bytes; + ssize_t read = 0, tmp; + + if (!count) + return 0; + + offset = (unsigned long)(*ppos % PAGE_SIZE); + pfn = (unsigned long)(*ppos / PAGE_SIZE); + + do { + if (count > (PAGE_SIZE - offset)) + nr_bytes = PAGE_SIZE - offset; + else + nr_bytes = count; + + /* If pfn is not ram, return zeros for sparse dump files */ + if (pfn_is_ram(pfn) == 0) + memset(buf, 0, nr_bytes); + else { + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) + return tmp; + } + *ppos += nr_bytes; + count -= nr_bytes; + buf += nr_bytes; + read += nr_bytes; + ++pfn; + offset = 0; + } while (count); + + return read; +} + +/* + * Architectures may override this function to allocate ELF header in 2nd kernel + */ +int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) +{ + return 0; +} + +/* + * Architectures may override this function to free header + */ +void __weak elfcorehdr_free(unsigned long long addr) +{} + +/* + * Architectures may override this function to read from ELF header + */ +ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to read from notes sections + */ +ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to map oldmem + */ +int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Copy to either kernel or user space + */ +static int copy_to(void *target, void *src, size_t size, int userbuf) +{ + if (userbuf) { + if (copy_to_user((char __user *) target, src, size)) + return -EFAULT; + } else { + memcpy(target, src, size); + } + return 0; +} + +/* Read from the ELF header and then the crash dump. On error, negative value is + * returned otherwise number of bytes read are returned. + */ +static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, + int userbuf) +{ + ssize_t acc = 0, tmp; + size_t tsz; + u64 start; + struct vmcore *m = NULL; + + if (buflen == 0 || *fpos >= vmcore_size) + return 0; + + /* trim buflen to not go beyond EOF */ + if (buflen > vmcore_size - *fpos) + buflen = vmcore_size - *fpos; + + /* Read ELF core header */ + if (*fpos < elfcorebuf_sz) { + tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); + if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) + return -EFAULT; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + /* Read Elf note segment */ + if (*fpos < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + if (copy_to(buffer, kaddr, tsz, userbuf)) + return -EFAULT; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (*fpos < m->offset + m->size) { + tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); + start = m->paddr + *fpos - m->offset; + tmp = read_from_oldmem(buffer, tsz, &start, userbuf); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + } + + return acc; +} + +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + return __read_vmcore((__force char *) buffer, buflen, fpos, 1); +} + +/* + * The vmcore fault handler uses the page cache and fills data using the + * standard __vmcore_read() function. + * + * On s390 the fault handler is used for memory regions that can't be mapped + * directly with remap_pfn_range(). + */ +static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#ifdef CONFIG_S390 + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t index = vmf->pgoff; + struct page *page; + loff_t offset; + char *buf; + int rc; + + page = find_or_create_page(mapping, index, GFP_KERNEL); + if (!page) + return VM_FAULT_OOM; + if (!PageUptodate(page)) { + offset = (loff_t) index << PAGE_CACHE_SHIFT; + buf = __va((page_to_pfn(page) << PAGE_SHIFT)); + rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + if (rc < 0) { + unlock_page(page); + page_cache_release(page); + return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + } + SetPageUptodate(page); + } + unlock_page(page); + vmf->page = page; + return 0; +#else + return VM_FAULT_SIGBUS; +#endif +} + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, +}; + +/** + * alloc_elfnotes_buf - allocate buffer for ELF note segment in + * vmalloc memory + * + * @notes_sz: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *alloc_elfnotes_buf(size_t notes_sz) +{ +#ifdef CONFIG_MMU + return vmalloc_user(notes_sz); +#else + return vzalloc(notes_sz); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU +/* + * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages + * reported as not being ram with the zero page. + * + * @vma: vm_area_struct describing requested mapping + * @from: start remapping from + * @pfn: page frame number to start remapping to + * @size: remapping size + * @prot: protection bits + * + * Returns zero on success, -EAGAIN on failure. + */ +static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long map_size; + unsigned long pos_start, pos_end, pos; + unsigned long zeropage_pfn = my_zero_pfn(0); + size_t len = 0; + + pos_start = pfn; + pos_end = pfn + (size >> PAGE_SHIFT); + + for (pos = pos_start; pos < pos_end; ++pos) { + if (!pfn_is_ram(pos)) { + /* + * We hit a page which is not ram. Remap the continuous + * region between pos_start and pos-1 and replace + * the non-ram page at pos with the zero page. + */ + if (pos > pos_start) { + /* Remap continuous region */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, + pos_start, map_size, + prot)) + goto fail; + len += map_size; + } + /* Remap the zero page */ + if (remap_oldmem_pfn_range(vma, from + len, + zeropage_pfn, + PAGE_SIZE, prot)) + goto fail; + len += PAGE_SIZE; + pos_start = pos + 1; + } + } + if (pos > pos_start) { + /* Remap the rest */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, pos_start, + map_size, prot)) + goto fail; + } + return 0; +fail: + do_munmap(vma->vm_mm, from, len); + return -EAGAIN; +} + +static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + /* + * Check if oldmem_pfn_is_ram was registered to avoid + * looping over all pages without a reason. + */ + if (oldmem_pfn_is_ram) + return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range(vma, from, pfn, size, prot); +} + +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + u64 start, end, len, tsz; + struct vmcore *m; + + start = (u64)vma->vm_pgoff << PAGE_SHIFT; + end = start + size; + + if (size > vmcore_size || end > vmcore_size) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &vmcore_mmap_ops; + + len = 0; + + if (start < elfcorebuf_sz) { + u64 pfn; + + tsz = min(elfcorebuf_sz - (size_t)start, size); + pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT; + if (remap_pfn_range(vma, vma->vm_start, pfn, tsz, + vma->vm_page_prot)) + return -EAGAIN; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + if (start < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); + kaddr = elfnotes_buf + start - elfcorebuf_sz; + if (remap_vmalloc_range_partial(vma, vma->vm_start + len, + kaddr, tsz)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (start < m->offset + m->size) { + u64 paddr = 0; + + tsz = min_t(size_t, m->offset + m->size - start, size); + paddr = m->paddr + start - m->offset; + if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + } + + return 0; +fail: + do_munmap(vma->vm_mm, vma->vm_start, len); + return -EAGAIN; +} +#else +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + return -ENOSYS; +} +#endif + +static const struct file_operations proc_vmcore_operations = { + .read = read_vmcore, + .llseek = default_llseek, + .mmap = mmap_vmcore, +}; + +static struct vmcore* __init get_new_element(void) +{ + return kzalloc(sizeof(struct vmcore), GFP_KERNEL); +} + +static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) +{ + u64 size; + struct vmcore *m; + + size = elfsz + elfnotesegsz; + list_for_each_entry(m, vc_list, list) { + size += m->size; + } + return size; +} + +/** + * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) +{ + int i, rc=0; + Elf64_Phdr *phdr_ptr; + Elf64_Nhdr *nhdr_ptr; + + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + void *notes_section; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + while (nhdr_ptr->n_namesz != 0) { + sz = sizeof(Elf64_Nhdr) + + (((u64)nhdr_ptr->n_namesz + 3) & ~3) + + (((u64)nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } + real_sz += sz; + nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); + } + kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + } + } + + return 0; +} + +/** + * get_note_number_and_size_elf64 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf64_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf64 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf64_Phdr *phdr_ptr; + + phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; + } + + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + rc = update_note_header_size_elf64(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = alloc_elfnotes_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf64(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); + phdr.p_offset = roundup(note_off, PAGE_SIZE); + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf64_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/** + * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) +{ + int i, rc=0; + Elf32_Phdr *phdr_ptr; + Elf32_Nhdr *nhdr_ptr; + + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + void *notes_section; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + while (nhdr_ptr->n_namesz != 0) { + sz = sizeof(Elf32_Nhdr) + + (((u64)nhdr_ptr->n_namesz + 3) & ~3) + + (((u64)nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } + real_sz += sz; + nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); + } + kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + } + } + + return 0; +} + +/** + * get_note_number_and_size_elf32 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf32_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf32 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf32_Phdr *phdr_ptr; + + phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; + } + + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + rc = update_note_header_size_elf32(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = alloc_elfnotes_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf32(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); + phdr.p_offset = roundup(note_off, PAGE_SIZE); + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf32_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Add memory chunks represented by program headers to vmcore list. Also update + * the new offset fields of exported program headers. */ +static int __init process_ptload_program_headers_elf64(char *elfptr, + size_t elfsz, + size_t elfnotes_sz, + struct list_head *vc_list) +{ + int i; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ + + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + + if (phdr_ptr->p_type != PT_LOAD) + continue; + + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = start; + new->size = size; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset. */ + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; + } + return 0; +} + +static int __init process_ptload_program_headers_elf32(char *elfptr, + size_t elfsz, + size_t elfnotes_sz, + struct list_head *vc_list) +{ + int i; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ + + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + + if (phdr_ptr->p_type != PT_LOAD) + continue; + + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = start; + new->size = size; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset */ + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; + } + return 0; +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) +{ + loff_t vmcore_off; + struct vmcore *m; + + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +static void free_elfcorebuf(void) +{ + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = NULL; + vfree(elfnotes_buf); + elfnotes_buf = NULL; +} + +static int __init parse_crash_elf64_headers(void) +{ + int rc=0; + Elf64_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !vmcore_elf64_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS64 || + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf64_Ehdr) || + ehdr.e_phentsize != sizeof(Elf64_Phdr) || + ehdr.e_phnum == 0) { + pr_warn("Warning: Core image elf header is not sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) + + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); + if (rc < 0) + goto fail; + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; + rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + return 0; +fail: + free_elfcorebuf(); + return rc; +} + +static int __init parse_crash_elf32_headers(void) +{ + int rc=0; + Elf32_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !elf_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS32|| + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf32_Ehdr) || + ehdr.e_phentsize != sizeof(Elf32_Phdr) || + ehdr.e_phnum == 0) { + pr_warn("Warning: Core image elf header is not sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); + if (rc < 0) + goto fail; + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; + rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + return 0; +fail: + free_elfcorebuf(); + return rc; +} + +static int __init parse_crash_elf_headers(void) +{ + unsigned char e_ident[EI_NIDENT]; + u64 addr; + int rc=0; + + addr = elfcorehdr_addr; + rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr); + if (rc < 0) + return rc; + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { + pr_warn("Warning: Core image elf header not found\n"); + return -EINVAL; + } + + if (e_ident[EI_CLASS] == ELFCLASS64) { + rc = parse_crash_elf64_headers(); + if (rc) + return rc; + } else if (e_ident[EI_CLASS] == ELFCLASS32) { + rc = parse_crash_elf32_headers(); + if (rc) + return rc; + } else { + pr_warn("Warning: Core image elf header is not sane\n"); + return -EINVAL; + } + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + + return 0; +} + +/* Init function for vmcore module. */ +static int __init vmcore_init(void) +{ + int rc = 0; + + /* Allow architectures to allocate ELF header in 2nd kernel */ + rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size); + if (rc) + return rc; + /* + * If elfcorehdr= has been passed in cmdline or created in 2nd kernel, + * then capture the dump. + */ + if (!(is_vmcore_usable())) + return rc; + rc = parse_crash_elf_headers(); + if (rc) { + pr_warn("Kdump: vmcore not initialized\n"); + return rc; + } + elfcorehdr_free(elfcorehdr_addr); + elfcorehdr_addr = ELFCORE_ADDR_ERR; + + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); + if (proc_vmcore) + proc_vmcore->size = vmcore_size; + return 0; +} +fs_initcall(vmcore_init); + +/* Cleanup function for vmcore module. */ +void vmcore_cleanup(void) +{ + struct list_head *pos, *next; + + if (proc_vmcore) { + proc_remove(proc_vmcore); + proc_vmcore = NULL; + } + + /* clear the vmcore list. */ + list_for_each_safe(pos, next, &vmcore_list) { + struct vmcore *m; + + m = list_entry(pos, struct vmcore, list); + list_del(&m->list); + kfree(m); + } + free_elfcorebuf(); +} diff --git a/kernel/fs/proc_namespace.c b/kernel/fs/proc_namespace.c new file mode 100644 index 000000000..8db932da4 --- /dev/null +++ b/kernel/fs/proc_namespace.c @@ -0,0 +1,335 @@ +/* + * fs/proc_namespace.c - handling of /proc//{mounts,mountinfo,mountstats} + * + * In fact, that's a piece of procfs; it's *almost* isolated from + * the rest of fs/proc, but has rather close relationships with + * fs/namespace.c, thus here instead of fs/proc + * + */ +#include +#include +#include +#include +#include "proc/internal.h" /* only for get_proc_task() in ->open() */ + +#include "pnode.h" +#include "internal.h" + +static unsigned mounts_poll(struct file *file, poll_table *wait) +{ + struct proc_mounts *p = proc_mounts(file->private_data); + struct mnt_namespace *ns = p->ns; + unsigned res = POLLIN | POLLRDNORM; + int event; + + poll_wait(file, &p->ns->poll, wait); + + event = ACCESS_ONCE(ns->event); + if (p->m.poll_event != event) { + p->m.poll_event = event; + res |= POLLERR | POLLPRI; + } + + return res; +} + +struct proc_fs_info { + int flag; + const char *str; +}; + +static int show_sb_opts(struct seq_file *m, struct super_block *sb) +{ + static const struct proc_fs_info fs_info[] = { + { MS_SYNCHRONOUS, ",sync" }, + { MS_DIRSYNC, ",dirsync" }, + { MS_MANDLOCK, ",mand" }, + { MS_LAZYTIME, ",lazytime" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { + if (sb->s_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } + + return security_sb_show_options(m, sb); +} + +static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +{ + static const struct proc_fs_info mnt_info[] = { + { MNT_NOSUID, ",nosuid" }, + { MNT_NODEV, ",nodev" }, + { MNT_NOEXEC, ",noexec" }, + { MNT_NOATIME, ",noatime" }, + { MNT_NODIRATIME, ",nodiratime" }, + { MNT_RELATIME, ",relatime" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { + if (mnt->mnt_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static inline void mangle(struct seq_file *m, const char *s) +{ + seq_escape(m, s, " \t\n\\"); +} + +static void show_type(struct seq_file *m, struct super_block *sb) +{ + mangle(m, sb->s_type->name); + if (sb->s_subtype && sb->s_subtype[0]) { + seq_putc(m, '.'); + mangle(m, sb->s_subtype); + } +} + +static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = proc_mounts(m); + struct mount *r = real_mount(mnt); + int err = 0; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + + if (sb->s_op->show_devname) { + err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; + } else { + mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + } + seq_putc(m, ' '); + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + show_type(m, sb); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + show_mnt_opts(m, mnt); + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt_path.dentry); + seq_puts(m, " 0 0\n"); +out: + return err; +} + +static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = proc_mounts(m); + struct mount *r = real_mount(mnt); + struct super_block *sb = mnt->mnt_sb; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + int err = 0; + + seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + if (sb->s_op->show_path) + err = sb->s_op->show_path(m, mnt->mnt_root); + else + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; + + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); + show_mnt_opts(m, mnt); + + /* Tagged fields ("foo:X" or "bar") */ + if (IS_MNT_SHARED(r)) + seq_printf(m, " shared:%i", r->mnt_group_id); + if (IS_MNT_SLAVE(r)) { + int master = r->mnt_master->mnt_group_id; + int dom = get_dominating_id(r, &p->root); + seq_printf(m, " master:%i", master); + if (dom && dom != master) + seq_printf(m, " propagate_from:%i", dom); + } + if (IS_MNT_UNBINDABLE(r)) + seq_puts(m, " unbindable"); + + /* Filesystem specific data */ + seq_puts(m, " - "); + show_type(m, sb); + seq_putc(m, ' '); + if (sb->s_op->show_devname) + err = sb->s_op->show_devname(m, mnt->mnt_root); + else + mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + if (err) + goto out; + seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt->mnt_root); + seq_putc(m, '\n'); +out: + return err; +} + +static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = proc_mounts(m); + struct mount *r = real_mount(mnt); + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + int err = 0; + + /* device */ + if (sb->s_op->show_devname) { + seq_puts(m, "device "); + err = sb->s_op->show_devname(m, mnt_path.dentry); + } else { + if (r->mnt_devname) { + seq_puts(m, "device "); + mangle(m, r->mnt_devname); + } else + seq_puts(m, "no device"); + } + + /* mount point */ + seq_puts(m, " mounted on "); + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + + /* file system type */ + seq_puts(m, "with fstype "); + show_type(m, sb); + + /* optional statistics */ + if (sb->s_op->show_stats) { + seq_putc(m, ' '); + if (!err) + err = sb->s_op->show_stats(m, mnt_path.dentry); + } + + seq_putc(m, '\n'); +out: + return err; +} + +static int mounts_open_common(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, struct vfsmount *)) +{ + struct task_struct *task = get_proc_task(inode); + struct nsproxy *nsp; + struct mnt_namespace *ns = NULL; + struct path root; + struct proc_mounts *p; + int ret = -EINVAL; + + if (!task) + goto err; + + task_lock(task); + nsp = task->nsproxy; + if (!nsp || !nsp->mnt_ns) { + task_unlock(task); + put_task_struct(task); + goto err; + } + ns = nsp->mnt_ns; + get_mnt_ns(ns); + if (!task->fs) { + task_unlock(task); + put_task_struct(task); + ret = -ENOENT; + goto err_put_ns; + } + get_fs_root(task->fs, &root); + task_unlock(task); + put_task_struct(task); + + ret = -ENOMEM; + p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); + if (!p) + goto err_put_path; + + file->private_data = &p->m; + ret = seq_open(file, &mounts_op); + if (ret) + goto err_free; + + p->ns = ns; + p->root = root; + p->m.poll_event = ns->event; + p->show = show; + p->cached_event = ~0ULL; + + return 0; + + err_free: + kfree(p); + err_put_path: + path_put(&root); + err_put_ns: + put_mnt_ns(ns); + err: + return ret; +} + +static int mounts_release(struct inode *inode, struct file *file) +{ + struct proc_mounts *p = proc_mounts(file->private_data); + path_put(&p->root); + put_mnt_ns(p->ns); + return seq_release(inode, file); +} + +static int mounts_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_vfsmnt); +} + +static int mountinfo_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_mountinfo); +} + +static int mountstats_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_vfsstat); +} + +const struct file_operations proc_mounts_operations = { + .open = mounts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +const struct file_operations proc_mountinfo_operations = { + .open = mountinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +const struct file_operations proc_mountstats_operations = { + .open = mountstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, +}; diff --git a/kernel/fs/pstore/Kconfig b/kernel/fs/pstore/Kconfig new file mode 100644 index 000000000..916b8e23d --- /dev/null +++ b/kernel/fs/pstore/Kconfig @@ -0,0 +1,62 @@ +config PSTORE + bool "Persistent store support" + default n + select ZLIB_DEFLATE + select ZLIB_INFLATE + help + This option enables generic access to platform level + persistent storage via "pstore" filesystem that can + be mounted as /dev/pstore. Only useful if you have + a platform level driver that registers with pstore to + provide the data, so you probably should just go say "Y" + (or "M") to a platform specific persistent store driver + (e.g. ACPI_APEI on X86) which will select this for you. + If you don't have a platform persistent store driver, + say N. + +config PSTORE_CONSOLE + bool "Log kernel console messages" + depends on PSTORE + help + When the option is enabled, pstore will log all kernel + messages, even if no oops or panic happened. + +config PSTORE_PMSG + bool "Log user space messages" + depends on PSTORE + help + When the option is enabled, pstore will export a character + interface /dev/pmsg0 to log user space messages. On reboot + data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID]. + + If unsure, say N. + +config PSTORE_FTRACE + bool "Persistent function tracer" + depends on PSTORE + depends on FUNCTION_TRACER + depends on DEBUG_FS + help + With this option kernel traces function calls into a persistent + ram buffer that can be decoded and dumped after reboot through + pstore filesystem. It can be used to determine what function + was last called before a reset or panic. + + If unsure, say N. + +config PSTORE_RAM + tristate "Log panic/oops to a RAM buffer" + depends on PSTORE + depends on HAS_IOMEM + depends on HAVE_MEMBLOCK + select REED_SOLOMON + select REED_SOLOMON_ENC8 + select REED_SOLOMON_DEC8 + help + This enables panic and oops messages to be logged to a circular + buffer in RAM where it can be read back at some later point. + + Note that for historical reasons, the module will be named + "ramoops.ko". + + For more information, see Documentation/ramoops.txt. diff --git a/kernel/fs/pstore/Makefile b/kernel/fs/pstore/Makefile new file mode 100644 index 000000000..e647d8e81 --- /dev/null +++ b/kernel/fs/pstore/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the linux pstorefs routines. +# + +obj-y += pstore.o + +pstore-objs += inode.o platform.o +obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o + +obj-$(CONFIG_PSTORE_PMSG) += pmsg.o + +ramoops-objs += ram.o ram_core.o +obj-$(CONFIG_PSTORE_RAM) += ramoops.o diff --git a/kernel/fs/pstore/ftrace.c b/kernel/fs/pstore/ftrace.c new file mode 100644 index 000000000..76a4eeb92 --- /dev/null +++ b/kernel/fs/pstore/ftrace.c @@ -0,0 +1,131 @@ +/* + * Copyright 2012 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static void notrace pstore_ftrace_call(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *op, + struct pt_regs *regs) +{ + unsigned long flags; + struct pstore_ftrace_record rec = {}; + + if (unlikely(oops_in_progress)) + return; + + local_irq_save(flags); + + rec.ip = ip; + rec.parent_ip = parent_ip; + pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); + psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, + 0, sizeof(rec), psinfo); + + local_irq_restore(flags); +} + +static struct ftrace_ops pstore_ftrace_ops __read_mostly = { + .func = pstore_ftrace_call, +}; + +static DEFINE_MUTEX(pstore_ftrace_lock); +static bool pstore_ftrace_enabled; + +static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + u8 on; + ssize_t ret; + + ret = kstrtou8_from_user(buf, count, 2, &on); + if (ret) + return ret; + + mutex_lock(&pstore_ftrace_lock); + + if (!on ^ pstore_ftrace_enabled) + goto out; + + if (on) + ret = register_ftrace_function(&pstore_ftrace_ops); + else + ret = unregister_ftrace_function(&pstore_ftrace_ops); + if (ret) { + pr_err("%s: unable to %sregister ftrace ops: %zd\n", + __func__, on ? "" : "un", ret); + goto err; + } + + pstore_ftrace_enabled = on; +out: + ret = count; +err: + mutex_unlock(&pstore_ftrace_lock); + + return ret; +} + +static ssize_t pstore_ftrace_knob_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + char val[] = { '0' + pstore_ftrace_enabled, '\n' }; + + return simple_read_from_buffer(buf, count, ppos, val, sizeof(val)); +} + +static const struct file_operations pstore_knob_fops = { + .open = simple_open, + .read = pstore_ftrace_knob_read, + .write = pstore_ftrace_knob_write, +}; + +void pstore_register_ftrace(void) +{ + struct dentry *dir; + struct dentry *file; + + if (!psinfo->write_buf) + return; + + dir = debugfs_create_dir("pstore", NULL); + if (!dir) { + pr_err("%s: unable to create pstore directory\n", __func__); + return; + } + + file = debugfs_create_file("record_ftrace", 0600, dir, NULL, + &pstore_knob_fops); + if (!file) { + pr_err("%s: unable to create record_ftrace file\n", __func__); + goto err_file; + } + + return; +err_file: + debugfs_remove(dir); +} diff --git a/kernel/fs/pstore/inode.c b/kernel/fs/pstore/inode.c new file mode 100644 index 000000000..3adcc4669 --- /dev/null +++ b/kernel/fs/pstore/inode.c @@ -0,0 +1,483 @@ +/* + * Persistent Storage - ramfs parts. + * + * Copyright (C) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define PSTORE_NAMELEN 64 + +static DEFINE_SPINLOCK(allpstore_lock); +static LIST_HEAD(allpstore); + +struct pstore_private { + struct list_head list; + struct pstore_info *psi; + enum pstore_type_id type; + u64 id; + int count; + ssize_t size; + char data[]; +}; + +struct pstore_ftrace_seq_data { + const void *ptr; + size_t off; + size_t size; +}; + +#define REC_SIZE sizeof(struct pstore_ftrace_record) + +static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) +{ + struct pstore_private *ps = s->private; + struct pstore_ftrace_seq_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + + data->off = ps->size % REC_SIZE; + data->off += *pos * REC_SIZE; + if (data->off + REC_SIZE > ps->size) { + kfree(data); + return NULL; + } + + return data; + +} + +static void pstore_ftrace_seq_stop(struct seq_file *s, void *v) +{ + kfree(v); +} + +static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pstore_private *ps = s->private; + struct pstore_ftrace_seq_data *data = v; + + data->off += REC_SIZE; + if (data->off + REC_SIZE > ps->size) + return NULL; + + (*pos)++; + return data; +} + +static int pstore_ftrace_seq_show(struct seq_file *s, void *v) +{ + struct pstore_private *ps = s->private; + struct pstore_ftrace_seq_data *data = v; + struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); + + seq_printf(s, "%d %08lx %08lx %pf <- %pF\n", + pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip, + (void *)rec->ip, (void *)rec->parent_ip); + + return 0; +} + +static const struct seq_operations pstore_ftrace_seq_ops = { + .start = pstore_ftrace_seq_start, + .next = pstore_ftrace_seq_next, + .stop = pstore_ftrace_seq_stop, + .show = pstore_ftrace_seq_show, +}; + +static int pstore_check_syslog_permissions(struct pstore_private *ps) +{ + switch (ps->type) { + case PSTORE_TYPE_DMESG: + case PSTORE_TYPE_CONSOLE: + return check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); + default: + return 0; + } +} + +static ssize_t pstore_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct seq_file *sf = file->private_data; + struct pstore_private *ps = sf->private; + + if (ps->type == PSTORE_TYPE_FTRACE) + return seq_read(file, userbuf, count, ppos); + return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); +} + +static int pstore_file_open(struct inode *inode, struct file *file) +{ + struct pstore_private *ps = inode->i_private; + struct seq_file *sf; + int err; + const struct seq_operations *sops = NULL; + + err = pstore_check_syslog_permissions(ps); + if (err) + return err; + + if (ps->type == PSTORE_TYPE_FTRACE) + sops = &pstore_ftrace_seq_ops; + + err = seq_open(file, sops); + if (err < 0) + return err; + + sf = file->private_data; + sf->private = ps; + + return 0; +} + +static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) +{ + struct seq_file *sf = file->private_data; + + if (sf->op) + return seq_lseek(file, off, whence); + return default_llseek(file, off, whence); +} + +static const struct file_operations pstore_file_operations = { + .open = pstore_file_open, + .read = pstore_file_read, + .llseek = pstore_file_llseek, + .release = seq_release, +}; + +/* + * When a file is unlinked from our file system we call the + * platform driver to erase the record from persistent store. + */ +static int pstore_unlink(struct inode *dir, struct dentry *dentry) +{ + struct pstore_private *p = d_inode(dentry)->i_private; + int err; + + err = pstore_check_syslog_permissions(p); + if (err) + return err; + + if (p->psi->erase) + p->psi->erase(p->type, p->id, p->count, + d_inode(dentry)->i_ctime, p->psi); + else + return -EPERM; + + return simple_unlink(dir, dentry); +} + +static void pstore_evict_inode(struct inode *inode) +{ + struct pstore_private *p = inode->i_private; + unsigned long flags; + + clear_inode(inode); + if (p) { + spin_lock_irqsave(&allpstore_lock, flags); + list_del(&p->list); + spin_unlock_irqrestore(&allpstore_lock, flags); + kfree(p); + } +} + +static const struct inode_operations pstore_dir_inode_operations = { + .lookup = simple_lookup, + .unlink = pstore_unlink, +}; + +static struct inode *pstore_get_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + } + return inode; +} + +enum { + Opt_kmsg_bytes, Opt_err +}; + +static const match_table_t tokens = { + {Opt_kmsg_bytes, "kmsg_bytes=%u"}, + {Opt_err, NULL} +}; + +static void parse_options(char *options) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_kmsg_bytes: + if (!match_int(&args[0], &option)) + pstore_set_kmsg_bytes(option); + break; + } + } +} + +static int pstore_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + parse_options(data); + + return 0; +} + +static const struct super_operations pstore_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = pstore_evict_inode, + .remount_fs = pstore_remount, + .show_options = generic_show_options, +}; + +static struct super_block *pstore_sb; + +int pstore_is_mounted(void) +{ + return pstore_sb != NULL; +} + +/* + * Make a regular file in the root directory of our file system. + * Load it up with "size" bytes of data from "buf". + * Set the mtime & ctime to the date that this record was originally stored. + */ +int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, + char *data, bool compressed, size_t size, + struct timespec time, struct pstore_info *psi) +{ + struct dentry *root = pstore_sb->s_root; + struct dentry *dentry; + struct inode *inode; + int rc = 0; + char name[PSTORE_NAMELEN]; + struct pstore_private *private, *pos; + unsigned long flags; + + spin_lock_irqsave(&allpstore_lock, flags); + list_for_each_entry(pos, &allpstore, list) { + if (pos->type == type && + pos->id == id && + pos->psi == psi) { + rc = -EEXIST; + break; + } + } + spin_unlock_irqrestore(&allpstore_lock, flags); + if (rc) + return rc; + + rc = -ENOMEM; + inode = pstore_get_inode(pstore_sb); + if (!inode) + goto fail; + inode->i_mode = S_IFREG | 0444; + inode->i_fop = &pstore_file_operations; + private = kmalloc(sizeof *private + size, GFP_KERNEL); + if (!private) + goto fail_alloc; + private->type = type; + private->id = id; + private->count = count; + private->psi = psi; + + switch (type) { + case PSTORE_TYPE_DMESG: + scnprintf(name, sizeof(name), "dmesg-%s-%lld%s", + psname, id, compressed ? ".enc.z" : ""); + break; + case PSTORE_TYPE_CONSOLE: + scnprintf(name, sizeof(name), "console-%s-%lld", psname, id); + break; + case PSTORE_TYPE_FTRACE: + scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id); + break; + case PSTORE_TYPE_MCE: + scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id); + break; + case PSTORE_TYPE_PPC_RTAS: + scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id); + break; + case PSTORE_TYPE_PPC_OF: + scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld", + psname, id); + break; + case PSTORE_TYPE_PPC_COMMON: + scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", + psname, id); + break; + case PSTORE_TYPE_PMSG: + scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id); + break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, "powerpc-opal-%s-%lld", psname, id); + break; + case PSTORE_TYPE_UNKNOWN: + scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id); + break; + default: + scnprintf(name, sizeof(name), "type%d-%s-%lld", + type, psname, id); + break; + } + + mutex_lock(&d_inode(root)->i_mutex); + + dentry = d_alloc_name(root, name); + if (!dentry) + goto fail_lockedalloc; + + memcpy(private->data, data, size); + inode->i_size = private->size = size; + + inode->i_private = private; + + if (time.tv_sec) + inode->i_mtime = inode->i_ctime = time; + + d_add(dentry, inode); + + spin_lock_irqsave(&allpstore_lock, flags); + list_add(&private->list, &allpstore); + spin_unlock_irqrestore(&allpstore_lock, flags); + + mutex_unlock(&d_inode(root)->i_mutex); + + return 0; + +fail_lockedalloc: + mutex_unlock(&d_inode(root)->i_mutex); + kfree(private); +fail_alloc: + iput(inode); + +fail: + return rc; +} + +static int pstore_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + + save_mount_options(sb, data); + + pstore_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = PSTOREFS_MAGIC; + sb->s_op = &pstore_ops; + sb->s_time_gran = 1; + + parse_options(data); + + inode = pstore_get_inode(sb); + if (inode) { + inode->i_mode = S_IFDIR | 0755; + inode->i_op = &pstore_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + } + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + pstore_get_records(0); + + return 0; +} + +static struct dentry *pstore_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, pstore_fill_super); +} + +static void pstore_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + pstore_sb = NULL; +} + +static struct file_system_type pstore_fs_type = { + .name = "pstore", + .mount = pstore_mount, + .kill_sb = pstore_kill_sb, +}; + +static int __init init_pstore_fs(void) +{ + int err; + + /* Create a convenient mount point for people to access pstore */ + err = sysfs_create_mount_point(fs_kobj, "pstore"); + if (err) + goto out; + + err = register_filesystem(&pstore_fs_type); + if (err < 0) + sysfs_remove_mount_point(fs_kobj, "pstore"); + +out: + return err; +} +module_init(init_pstore_fs) + +MODULE_AUTHOR("Tony Luck "); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/pstore/internal.h b/kernel/fs/pstore/internal.h new file mode 100644 index 000000000..c36ba2cd0 --- /dev/null +++ b/kernel/fs/pstore/internal.h @@ -0,0 +1,64 @@ +#ifndef __PSTORE_INTERNAL_H__ +#define __PSTORE_INTERNAL_H__ + +#include +#include +#include + +#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB) +#define PSTORE_CPU_IN_IP 0x1 +#elif NR_CPUS <= 4 && defined(CONFIG_ARM) +#define PSTORE_CPU_IN_IP 0x3 +#endif + +struct pstore_ftrace_record { + unsigned long ip; + unsigned long parent_ip; +#ifndef PSTORE_CPU_IN_IP + unsigned int cpu; +#endif +}; + +static inline void +pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu) +{ +#ifndef PSTORE_CPU_IN_IP + rec->cpu = cpu; +#else + rec->ip |= cpu; +#endif +} + +static inline unsigned int +pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec) +{ +#ifndef PSTORE_CPU_IN_IP + return rec->cpu; +#else + return rec->ip & PSTORE_CPU_IN_IP; +#endif +} + +#ifdef CONFIG_PSTORE_FTRACE +extern void pstore_register_ftrace(void); +#else +static inline void pstore_register_ftrace(void) {} +#endif + +#ifdef CONFIG_PSTORE_PMSG +extern void pstore_register_pmsg(void); +#else +static inline void pstore_register_pmsg(void) {} +#endif + +extern struct pstore_info *psinfo; + +extern void pstore_set_kmsg_bytes(int); +extern void pstore_get_records(int); +extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, + int count, char *data, bool compressed, + size_t size, struct timespec time, + struct pstore_info *psi); +extern int pstore_is_mounted(void); + +#endif diff --git a/kernel/fs/pstore/platform.c b/kernel/fs/pstore/platform.c new file mode 100644 index 000000000..c4c9a10c5 --- /dev/null +++ b/kernel/fs/pstore/platform.c @@ -0,0 +1,547 @@ +/* + * Persistent Storage - platform driver interface parts. + * + * Copyright (C) 2007-2008 Google, Inc. + * Copyright (C) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define pr_fmt(fmt) "pstore: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * We defer making "oops" entries appear in pstore - see + * whether the system is actually still running well enough + * to let someone see the entry + */ +static int pstore_update_ms = -1; +module_param_named(update_ms, pstore_update_ms, int, 0600); +MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " + "(default is -1, which means runtime updates are disabled; " + "enabling this option is not safe, it may lead to further " + "corruption on Oopses)"); + +static int pstore_new_entry; + +static void pstore_timefunc(unsigned long); +static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); + +static void pstore_dowork(struct work_struct *); +static DECLARE_WORK(pstore_work, pstore_dowork); + +/* + * pstore_lock just protects "psinfo" during + * calls to pstore_register() + */ +static DEFINE_SPINLOCK(pstore_lock); +struct pstore_info *psinfo; + +static char *backend; + +/* Compression parameters */ +#define COMPR_LEVEL 6 +#define WINDOW_BITS 12 +#define MEM_LEVEL 4 +static struct z_stream_s stream; + +static char *big_oops_buf; +static size_t big_oops_buf_sz; + +/* How much of the console log to snapshot */ +static unsigned long kmsg_bytes = 10240; + +void pstore_set_kmsg_bytes(int bytes) +{ + kmsg_bytes = bytes; +} + +/* Tag each group of saved records with a sequence number */ +static int oopscount; + +static const char *get_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_RESTART: + return "Restart"; + case KMSG_DUMP_HALT: + return "Halt"; + case KMSG_DUMP_POWEROFF: + return "Poweroff"; + default: + return "Unknown"; + } +} + +bool pstore_cannot_block_path(enum kmsg_dump_reason reason) +{ + /* + * In case of NMI path, pstore shouldn't be blocked + * regardless of reason. + */ + if (in_nmi()) + return true; + + switch (reason) { + /* In panic case, other cpus are stopped by smp_send_stop(). */ + case KMSG_DUMP_PANIC: + /* Emergency restart shouldn't be blocked by spin lock. */ + case KMSG_DUMP_EMERG: + return true; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(pstore_cannot_block_path); + +/* Derived from logfs_compress() */ +static int pstore_compress(const void *in, void *out, size_t inlen, + size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, + MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +/* Derived from logfs_uncompress */ +static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_inflateInit2(&stream, WINDOW_BITS); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +static void allocate_buf_for_compression(void) +{ + size_t size; + size_t cmpr; + + switch (psinfo->bufsize) { + /* buffer range for efivars */ + case 1000 ... 2000: + cmpr = 56; + break; + case 2001 ... 3000: + cmpr = 54; + break; + case 3001 ... 3999: + cmpr = 52; + break; + /* buffer range for nvram, erst */ + case 4000 ... 10000: + cmpr = 45; + break; + default: + cmpr = 60; + break; + } + + big_oops_buf_sz = (psinfo->bufsize * 100) / cmpr; + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL), + zlib_inflate_workspacesize()); + stream.workspace = kmalloc(size, GFP_KERNEL); + if (!stream.workspace) { + pr_err("No memory for compression workspace; skipping compression\n"); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed data; skipping compression\n"); + stream.workspace = NULL; + } + +} + +/* + * Called when compression fails, since the printk buffer + * would be fetched for compression calling it again when + * compression fails would have moved the iterator of + * printk buffer which results in fetching old contents. + * Copy the recent messages from big_oops_buf to psinfo->buf + */ +static size_t copy_kmsg_to_buffer(int hsize, size_t len) +{ + size_t total_len; + size_t diff; + + total_len = hsize + len; + + if (total_len > psinfo->bufsize) { + diff = total_len - psinfo->bufsize + hsize; + memcpy(psinfo->buf, big_oops_buf, hsize); + memcpy(psinfo->buf + hsize, big_oops_buf + diff, + psinfo->bufsize - hsize); + total_len = psinfo->bufsize; + } else + memcpy(psinfo->buf, big_oops_buf, total_len); + + return total_len; +} + +/* + * callback from kmsg_dump. (s2,l2) has the most recently + * written bytes, older bytes are in (s1,l1). Save as much + * as we can from the end of the buffer. + */ +static void pstore_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ + unsigned long total = 0; + const char *why; + u64 id; + unsigned int part = 1; + unsigned long flags = 0; + int is_locked = 0; + int ret; + + why = get_reason_str(reason); + + if (pstore_cannot_block_path(reason)) { + is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); + if (!is_locked) { + pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" + , in_nmi() ? "NMI" : why); + } + } else + spin_lock_irqsave(&psinfo->buf_lock, flags); + oopscount++; + while (total < kmsg_bytes) { + char *dst; + unsigned long size; + int hsize; + int zipped_len = -1; + size_t len; + bool compressed; + size_t total_len; + + if (big_oops_buf) { + dst = big_oops_buf; + hsize = sprintf(dst, "%s#%d Part%u\n", why, + oopscount, part); + size = big_oops_buf_sz - hsize; + + if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, + size, &len)) + break; + + zipped_len = pstore_compress(dst, psinfo->buf, + hsize + len, psinfo->bufsize); + + if (zipped_len > 0) { + compressed = true; + total_len = zipped_len; + } else { + compressed = false; + total_len = copy_kmsg_to_buffer(hsize, len); + } + } else { + dst = psinfo->buf; + hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, + part); + size = psinfo->bufsize - hsize; + dst += hsize; + + if (!kmsg_dump_get_buffer(dumper, true, dst, + size, &len)) + break; + + compressed = false; + total_len = hsize + len; + } + + ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, + oopscount, compressed, total_len, psinfo); + if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + pstore_new_entry = 1; + + total += total_len; + part++; + } + if (pstore_cannot_block_path(reason)) { + if (is_locked) + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + } else + spin_unlock_irqrestore(&psinfo->buf_lock, flags); +} + +static struct kmsg_dumper pstore_dumper = { + .dump = pstore_dump, +}; + +#ifdef CONFIG_PSTORE_CONSOLE +static void pstore_console_write(struct console *con, const char *s, unsigned c) +{ + const char *e = s + c; + + while (s < e) { + unsigned long flags; + u64 id; + + if (c > psinfo->bufsize) + c = psinfo->bufsize; + + if (oops_in_progress) { + if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) + break; + } else { + spin_lock_irqsave(&psinfo->buf_lock, flags); + } + memcpy(psinfo->buf, s, c); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + s += c; + c = e - s; + } +} + +static struct console pstore_console = { + .name = "pstore", + .write = pstore_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME, + .index = -1, +}; + +static void pstore_register_console(void) +{ + register_console(&pstore_console); +} +#else +static void pstore_register_console(void) {} +#endif + +static int pstore_write_compat(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, int count, + bool compressed, size_t size, + struct pstore_info *psi) +{ + return psi->write_buf(type, reason, id, part, psinfo->buf, compressed, + size, psi); +} + +/* + * platform specific persistent storage driver registers with + * us here. If pstore is already mounted, call the platform + * read function right away to populate the file system. If not + * then the pstore mount code will call us later to fill out + * the file system. + * + * Register with kmsg_dump to save last part of console log on panic. + */ +int pstore_register(struct pstore_info *psi) +{ + struct module *owner = psi->owner; + + if (backend && strcmp(backend, psi->name)) + return -EPERM; + + spin_lock(&pstore_lock); + if (psinfo) { + spin_unlock(&pstore_lock); + return -EBUSY; + } + + if (!psi->write) + psi->write = pstore_write_compat; + psinfo = psi; + mutex_init(&psinfo->read_mutex); + spin_unlock(&pstore_lock); + + if (owner && !try_module_get(owner)) { + psinfo = NULL; + return -EINVAL; + } + + allocate_buf_for_compression(); + + if (pstore_is_mounted()) + pstore_get_records(0); + + kmsg_dump_register(&pstore_dumper); + + if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + pstore_register_console(); + pstore_register_ftrace(); + pstore_register_pmsg(); + } + + if (pstore_update_ms >= 0) { + pstore_timer.expires = jiffies + + msecs_to_jiffies(pstore_update_ms); + add_timer(&pstore_timer); + } + + pr_info("Registered %s as persistent store backend\n", psi->name); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_register); + +/* + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. + */ +void pstore_get_records(int quiet) +{ + struct pstore_info *psi = psinfo; + char *buf = NULL; + ssize_t size; + u64 id; + int count; + enum pstore_type_id type; + struct timespec time; + int failed = 0, rc; + bool compressed; + int unzipped_len = -1; + + if (!psi) + return; + + mutex_lock(&psi->read_mutex); + if (psi->open && psi->open(psi)) + goto out; + + while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, + psi)) > 0) { + if (compressed && (type == PSTORE_TYPE_DMESG)) { + if (big_oops_buf) + unzipped_len = pstore_decompress(buf, + big_oops_buf, size, + big_oops_buf_sz); + + if (unzipped_len > 0) { + kfree(buf); + buf = big_oops_buf; + size = unzipped_len; + compressed = false; + } else { + pr_err("decompression failed;returned %d\n", + unzipped_len); + compressed = true; + } + } + rc = pstore_mkfile(type, psi->name, id, count, buf, + compressed, (size_t)size, time, psi); + if (unzipped_len < 0) { + /* Free buffer other than big oops */ + kfree(buf); + buf = NULL; + } else + unzipped_len = -1; + if (rc && (rc != -EEXIST || !quiet)) + failed++; + } + if (psi->close) + psi->close(psi); +out: + mutex_unlock(&psi->read_mutex); + + if (failed) + pr_warn("failed to load %d record(s) from '%s'\n", + failed, psi->name); +} + +static void pstore_dowork(struct work_struct *work) +{ + pstore_get_records(1); +} + +static void pstore_timefunc(unsigned long dummy) +{ + if (pstore_new_entry) { + pstore_new_entry = 0; + schedule_work(&pstore_work); + } + + mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); +} + +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "Pstore backend to use"); diff --git a/kernel/fs/pstore/pmsg.c b/kernel/fs/pstore/pmsg.c new file mode 100644 index 000000000..feb5dd294 --- /dev/null +++ b/kernel/fs/pstore/pmsg.c @@ -0,0 +1,114 @@ +/* + * Copyright 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +static DEFINE_MUTEX(pmsg_lock); +#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) + +static ssize_t write_pmsg(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + size_t i, buffer_size; + char *buffer; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + buffer_size = count; + if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) + buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; + buffer = vmalloc(buffer_size); + + mutex_lock(&pmsg_lock); + for (i = 0; i < count; ) { + size_t c = min(count - i, buffer_size); + u64 id; + long ret; + + ret = __copy_from_user(buffer, buf + i, c); + if (unlikely(ret != 0)) { + mutex_unlock(&pmsg_lock); + vfree(buffer); + return -EFAULT; + } + psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, + psinfo); + + i += c; + } + + mutex_unlock(&pmsg_lock); + vfree(buffer); + return count; +} + +static const struct file_operations pmsg_fops = { + .owner = THIS_MODULE, + .llseek = noop_llseek, + .write = write_pmsg, +}; + +static struct class *pmsg_class; +static int pmsg_major; +#define PMSG_NAME "pmsg" +#undef pr_fmt +#define pr_fmt(fmt) PMSG_NAME ": " fmt + +static char *pmsg_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0220; + return NULL; +} + +void pstore_register_pmsg(void) +{ + struct device *pmsg_device; + + pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops); + if (pmsg_major < 0) { + pr_err("register_chrdev failed\n"); + goto err; + } + + pmsg_class = class_create(THIS_MODULE, PMSG_NAME); + if (IS_ERR(pmsg_class)) { + pr_err("device class file already in use\n"); + goto err_class; + } + pmsg_class->devnode = pmsg_devnode; + + pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0), + NULL, "%s%d", PMSG_NAME, 0); + if (IS_ERR(pmsg_device)) { + pr_err("failed to create device\n"); + goto err_device; + } + return; + +err_device: + class_destroy(pmsg_class); +err_class: + unregister_chrdev(pmsg_major, PMSG_NAME); +err: + return; +} diff --git a/kernel/fs/pstore/ram.c b/kernel/fs/pstore/ram.c new file mode 100644 index 000000000..44a549bee --- /dev/null +++ b/kernel/fs/pstore/ram.c @@ -0,0 +1,648 @@ +/* + * RAM Oops/Panic logger + * + * Copyright (C) 2010 Marco Stornelli + * Copyright (C) 2011 Kees Cook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RAMOOPS_KERNMSG_HDR "====" +#define MIN_MEM_SIZE 4096UL + +static ulong record_size = MIN_MEM_SIZE; +module_param(record_size, ulong, 0400); +MODULE_PARM_DESC(record_size, + "size of each dump done on oops/panic"); + +static ulong ramoops_console_size = MIN_MEM_SIZE; +module_param_named(console_size, ramoops_console_size, ulong, 0400); +MODULE_PARM_DESC(console_size, "size of kernel console log"); + +static ulong ramoops_ftrace_size = MIN_MEM_SIZE; +module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); +MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); + +static ulong ramoops_pmsg_size = MIN_MEM_SIZE; +module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); +MODULE_PARM_DESC(pmsg_size, "size of user space message log"); + +static ulong mem_address; +module_param(mem_address, ulong, 0400); +MODULE_PARM_DESC(mem_address, + "start of reserved RAM used to store oops/panic logs"); + +static ulong mem_size; +module_param(mem_size, ulong, 0400); +MODULE_PARM_DESC(mem_size, + "size of reserved RAM used to store oops/panic logs"); + +static unsigned int mem_type; +module_param(mem_type, uint, 0600); +MODULE_PARM_DESC(mem_type, + "set to 1 to try to use unbuffered memory (default 0)"); + +static int dump_oops = 1; +module_param(dump_oops, int, 0600); +MODULE_PARM_DESC(dump_oops, + "set to 1 to dump oopses, 0 to only dump panics (default 1)"); + +static int ramoops_ecc; +module_param_named(ecc, ramoops_ecc, int, 0600); +MODULE_PARM_DESC(ramoops_ecc, + "if non-zero, the option enables ECC support and specifies " + "ECC buffer size in bytes (1 is a special value, means 16 " + "bytes ECC)"); + +struct ramoops_context { + struct persistent_ram_zone **przs; + struct persistent_ram_zone *cprz; + struct persistent_ram_zone *fprz; + struct persistent_ram_zone *mprz; + phys_addr_t phys_addr; + unsigned long size; + unsigned int memtype; + size_t record_size; + size_t console_size; + size_t ftrace_size; + size_t pmsg_size; + int dump_oops; + struct persistent_ram_ecc_info ecc_info; + unsigned int max_dump_cnt; + unsigned int dump_write_cnt; + /* _read_cnt need clear on ramoops_pstore_open */ + unsigned int dump_read_cnt; + unsigned int console_read_cnt; + unsigned int ftrace_read_cnt; + unsigned int pmsg_read_cnt; + struct pstore_info pstore; +}; + +static struct platform_device *dummy; +static struct ramoops_platform_data *dummy_data; + +static int ramoops_pstore_open(struct pstore_info *psi) +{ + struct ramoops_context *cxt = psi->data; + + cxt->dump_read_cnt = 0; + cxt->console_read_cnt = 0; + cxt->ftrace_read_cnt = 0; + cxt->pmsg_read_cnt = 0; + return 0; +} + +static struct persistent_ram_zone * +ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, + u64 *id, + enum pstore_type_id *typep, enum pstore_type_id type, + bool update) +{ + struct persistent_ram_zone *prz; + int i = (*c)++; + + if (i >= max) + return NULL; + + prz = przs[i]; + if (!prz) + return NULL; + + /* Update old/shadowed buffer. */ + if (update) + persistent_ram_save_old(prz); + + if (!persistent_ram_old_size(prz)) + return NULL; + + *typep = type; + *id = i; + + return prz; +} + +static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time, + bool *compressed) +{ + char data_type; + int header_length = 0; + + if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n%n", &time->tv_sec, + &time->tv_nsec, &data_type, &header_length) == 3) { + if (data_type == 'C') + *compressed = true; + else + *compressed = false; + } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n%n", + &time->tv_sec, &time->tv_nsec, &header_length) == 2) { + *compressed = false; + } else { + time->tv_sec = 0; + time->tv_nsec = 0; + *compressed = false; + } + return header_length; +} + +static bool prz_ok(struct persistent_ram_zone *prz) +{ + return !!prz && !!(persistent_ram_old_size(prz) + + persistent_ram_ecc_string(prz, NULL, 0)); +} + +static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, + int *count, struct timespec *time, + char **buf, bool *compressed, + struct pstore_info *psi) +{ + ssize_t size; + ssize_t ecc_notice_size; + struct ramoops_context *cxt = psi->data; + struct persistent_ram_zone *prz; + int header_length; + + prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, + cxt->max_dump_cnt, id, type, + PSTORE_TYPE_DMESG, 1); + if (!prz_ok(prz)) + prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, + 1, id, type, PSTORE_TYPE_CONSOLE, 0); + if (!prz_ok(prz)) + prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, + 1, id, type, PSTORE_TYPE_FTRACE, 0); + if (!prz_ok(prz)) + prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, + 1, id, type, PSTORE_TYPE_PMSG, 0); + if (!prz_ok(prz)) + return 0; + + if (!persistent_ram_old(prz)) + return 0; + + size = persistent_ram_old_size(prz); + header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), time, + compressed); + size -= header_length; + + /* ECC correction notice */ + ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + + *buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL); + if (*buf == NULL) + return -ENOMEM; + + memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); + persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); + + return size + ecc_notice_size; +} + +static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, + bool compressed) +{ + char *hdr; + struct timespec timestamp; + size_t len; + + /* Report zeroed timestamp if called before timekeeping has resumed. */ + if (__getnstimeofday(×tamp)) { + timestamp.tv_sec = 0; + timestamp.tv_nsec = 0; + } + hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", + (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000), + compressed ? 'C' : 'D'); + WARN_ON_ONCE(!hdr); + len = hdr ? strlen(hdr) : 0; + persistent_ram_write(prz, hdr, len); + kfree(hdr); + + return len; +} + +static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + struct ramoops_context *cxt = psi->data; + struct persistent_ram_zone *prz; + size_t hlen; + + if (type == PSTORE_TYPE_CONSOLE) { + if (!cxt->cprz) + return -ENOMEM; + persistent_ram_write(cxt->cprz, buf, size); + return 0; + } else if (type == PSTORE_TYPE_FTRACE) { + if (!cxt->fprz) + return -ENOMEM; + persistent_ram_write(cxt->fprz, buf, size); + return 0; + } else if (type == PSTORE_TYPE_PMSG) { + if (!cxt->mprz) + return -ENOMEM; + persistent_ram_write(cxt->mprz, buf, size); + return 0; + } + + if (type != PSTORE_TYPE_DMESG) + return -EINVAL; + + /* Out of the various dmesg dump types, ramoops is currently designed + * to only store crash logs, rather than storing general kernel logs. + */ + if (reason != KMSG_DUMP_OOPS && + reason != KMSG_DUMP_PANIC) + return -EINVAL; + + /* Skip Oopes when configured to do so. */ + if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops) + return -EINVAL; + + /* Explicitly only take the first part of any new crash. + * If our buffer is larger than kmsg_bytes, this can never happen, + * and if our buffer is smaller than kmsg_bytes, we don't want the + * report split across multiple records. + */ + if (part != 1) + return -ENOSPC; + + if (!cxt->przs) + return -ENOSPC; + + prz = cxt->przs[cxt->dump_write_cnt]; + + hlen = ramoops_write_kmsg_hdr(prz, compressed); + if (size + hlen > prz->buffer_size) + size = prz->buffer_size - hlen; + persistent_ram_write(prz, buf, size); + + cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt; + + return 0; +} + +static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, + struct timespec time, struct pstore_info *psi) +{ + struct ramoops_context *cxt = psi->data; + struct persistent_ram_zone *prz; + + switch (type) { + case PSTORE_TYPE_DMESG: + if (id >= cxt->max_dump_cnt) + return -EINVAL; + prz = cxt->przs[id]; + break; + case PSTORE_TYPE_CONSOLE: + prz = cxt->cprz; + break; + case PSTORE_TYPE_FTRACE: + prz = cxt->fprz; + break; + case PSTORE_TYPE_PMSG: + prz = cxt->mprz; + break; + default: + return -EINVAL; + } + + persistent_ram_free_old(prz); + persistent_ram_zap(prz); + + return 0; +} + +static struct ramoops_context oops_cxt = { + .pstore = { + .owner = THIS_MODULE, + .name = "ramoops", + .open = ramoops_pstore_open, + .read = ramoops_pstore_read, + .write_buf = ramoops_pstore_write_buf, + .erase = ramoops_pstore_erase, + }, +}; + +static void ramoops_free_przs(struct ramoops_context *cxt) +{ + int i; + + cxt->max_dump_cnt = 0; + if (!cxt->przs) + return; + + for (i = 0; !IS_ERR_OR_NULL(cxt->przs[i]); i++) + persistent_ram_free(cxt->przs[i]); + kfree(cxt->przs); +} + +static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, + phys_addr_t *paddr, size_t dump_mem_sz) +{ + int err = -ENOMEM; + int i; + + if (!cxt->record_size) + return 0; + + if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) { + dev_err(dev, "no room for dumps\n"); + return -ENOMEM; + } + + cxt->max_dump_cnt = dump_mem_sz / cxt->record_size; + if (!cxt->max_dump_cnt) + return -ENOMEM; + + cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt, + GFP_KERNEL); + if (!cxt->przs) { + dev_err(dev, "failed to initialize a prz array for dumps\n"); + goto fail_prz; + } + + for (i = 0; i < cxt->max_dump_cnt; i++) { + size_t sz = cxt->record_size; + + cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, + &cxt->ecc_info, + cxt->memtype); + if (IS_ERR(cxt->przs[i])) { + err = PTR_ERR(cxt->przs[i]); + dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", + sz, (unsigned long long)*paddr, err); + goto fail_prz; + } + *paddr += sz; + } + + return 0; +fail_prz: + ramoops_free_przs(cxt); + return err; +} + +static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone **prz, + phys_addr_t *paddr, size_t sz, u32 sig) +{ + if (!sz) + return 0; + + if (*paddr + sz - cxt->phys_addr > cxt->size) { + dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", + sz, (unsigned long long)*paddr, + cxt->size, (unsigned long long)cxt->phys_addr); + return -ENOMEM; + } + + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype); + if (IS_ERR(*prz)) { + int err = PTR_ERR(*prz); + + dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", + sz, (unsigned long long)*paddr, err); + return err; + } + + persistent_ram_zap(*prz); + + *paddr += sz; + + return 0; +} + +static int ramoops_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct ramoops_platform_data *pdata = pdev->dev.platform_data; + struct ramoops_context *cxt = &oops_cxt; + size_t dump_mem_sz; + phys_addr_t paddr; + int err = -EINVAL; + + /* Only a single ramoops area allowed at a time, so fail extra + * probes. + */ + if (cxt->max_dump_cnt) + goto fail_out; + + if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && + !pdata->ftrace_size && !pdata->pmsg_size)) { + pr_err("The memory size and the record/console size must be " + "non-zero\n"); + goto fail_out; + } + + if (pdata->record_size && !is_power_of_2(pdata->record_size)) + pdata->record_size = rounddown_pow_of_two(pdata->record_size); + if (pdata->console_size && !is_power_of_2(pdata->console_size)) + pdata->console_size = rounddown_pow_of_two(pdata->console_size); + if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) + pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); + if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size)) + pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size); + + cxt->size = pdata->mem_size; + cxt->phys_addr = pdata->mem_address; + cxt->memtype = pdata->mem_type; + cxt->record_size = pdata->record_size; + cxt->console_size = pdata->console_size; + cxt->ftrace_size = pdata->ftrace_size; + cxt->pmsg_size = pdata->pmsg_size; + cxt->dump_oops = pdata->dump_oops; + cxt->ecc_info = pdata->ecc_info; + + paddr = cxt->phys_addr; + + dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size + - cxt->pmsg_size; + err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); + if (err) + goto fail_out; + + err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr, + cxt->console_size, 0); + if (err) + goto fail_init_cprz; + + err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size, + LINUX_VERSION_CODE); + if (err) + goto fail_init_fprz; + + err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); + if (err) + goto fail_init_mprz; + + cxt->pstore.data = cxt; + /* + * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we + * have to handle dumps, we must have at least record_size buffer. And + * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be + * ZERO_SIZE_PTR). + */ + if (cxt->console_size) + cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */ + cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize); + cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); + spin_lock_init(&cxt->pstore.buf_lock); + if (!cxt->pstore.buf) { + pr_err("cannot allocate pstore buffer\n"); + err = -ENOMEM; + goto fail_clear; + } + + err = pstore_register(&cxt->pstore); + if (err) { + pr_err("registering with pstore failed\n"); + goto fail_buf; + } + + /* + * Update the module parameter variables as well so they are visible + * through /sys/module/ramoops/parameters/ + */ + mem_size = pdata->mem_size; + mem_address = pdata->mem_address; + record_size = pdata->record_size; + dump_oops = pdata->dump_oops; + ramoops_console_size = pdata->console_size; + ramoops_pmsg_size = pdata->pmsg_size; + ramoops_ftrace_size = pdata->ftrace_size; + + pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", + cxt->size, (unsigned long long)cxt->phys_addr, + cxt->ecc_info.ecc_size, cxt->ecc_info.block_size); + + return 0; + +fail_buf: + kfree(cxt->pstore.buf); +fail_clear: + cxt->pstore.bufsize = 0; + kfree(cxt->mprz); +fail_init_mprz: + kfree(cxt->fprz); +fail_init_fprz: + kfree(cxt->cprz); +fail_init_cprz: + ramoops_free_przs(cxt); +fail_out: + return err; +} + +static int __exit ramoops_remove(struct platform_device *pdev) +{ +#if 0 + /* TODO(kees): We cannot unload ramoops since pstore doesn't support + * unregistering yet. + */ + struct ramoops_context *cxt = &oops_cxt; + + iounmap(cxt->virt_addr); + release_mem_region(cxt->phys_addr, cxt->size); + cxt->max_dump_cnt = 0; + + /* TODO(kees): When pstore supports unregistering, call it here. */ + kfree(cxt->pstore.buf); + cxt->pstore.bufsize = 0; + + return 0; +#endif + return -EBUSY; +} + +static struct platform_driver ramoops_driver = { + .probe = ramoops_probe, + .remove = __exit_p(ramoops_remove), + .driver = { + .name = "ramoops", + }, +}; + +static void ramoops_register_dummy(void) +{ + if (!mem_size) + return; + + pr_info("using module parameters\n"); + + dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL); + if (!dummy_data) { + pr_info("could not allocate pdata\n"); + return; + } + + dummy_data->mem_size = mem_size; + dummy_data->mem_address = mem_address; + dummy_data->mem_type = 0; + dummy_data->record_size = record_size; + dummy_data->console_size = ramoops_console_size; + dummy_data->ftrace_size = ramoops_ftrace_size; + dummy_data->pmsg_size = ramoops_pmsg_size; + dummy_data->dump_oops = dump_oops; + /* + * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC + * (using 1 byte for ECC isn't much of use anyway). + */ + dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; + + dummy = platform_device_register_data(NULL, "ramoops", -1, + dummy_data, sizeof(struct ramoops_platform_data)); + if (IS_ERR(dummy)) { + pr_info("could not create platform device: %ld\n", + PTR_ERR(dummy)); + } +} + +static int __init ramoops_init(void) +{ + ramoops_register_dummy(); + return platform_driver_register(&ramoops_driver); +} +postcore_initcall(ramoops_init); + +static void __exit ramoops_exit(void) +{ + platform_driver_unregister(&ramoops_driver); + platform_device_unregister(dummy); + kfree(dummy_data); +} +module_exit(ramoops_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Stornelli "); +MODULE_DESCRIPTION("RAM Oops/Panic logger/driver"); diff --git a/kernel/fs/pstore/ram_core.c b/kernel/fs/pstore/ram_core.c new file mode 100644 index 000000000..76c3f80ef --- /dev/null +++ b/kernel/fs/pstore/ram_core.c @@ -0,0 +1,539 @@ +/* + * Copyright (C) 2012 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define pr_fmt(fmt) "persistent_ram: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct persistent_ram_buffer { + uint32_t sig; + atomic_t start; + atomic_t size; + uint8_t data[0]; +}; + +#define PERSISTENT_RAM_SIG (0x43474244) /* DBGC */ + +static inline size_t buffer_size(struct persistent_ram_zone *prz) +{ + return atomic_read(&prz->buffer->size); +} + +static inline size_t buffer_start(struct persistent_ram_zone *prz) +{ + return atomic_read(&prz->buffer->start); +} + +/* increase and wrap the start pointer, returning the old value */ +static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a) +{ + int old; + int new; + + do { + old = atomic_read(&prz->buffer->start); + new = old + a; + while (unlikely(new >= prz->buffer_size)) + new -= prz->buffer_size; + } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); + + return old; +} + +/* increase the size counter until it hits the max size */ +static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a) +{ + size_t old; + size_t new; + + if (atomic_read(&prz->buffer->size) == prz->buffer_size) + return; + + do { + old = atomic_read(&prz->buffer->size); + new = old + a; + if (new > prz->buffer_size) + new = prz->buffer_size; + } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); +} + +static DEFINE_RAW_SPINLOCK(buffer_lock); + +/* increase and wrap the start pointer, returning the old value */ +static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) +{ + int old; + int new; + unsigned long flags; + + raw_spin_lock_irqsave(&buffer_lock, flags); + + old = atomic_read(&prz->buffer->start); + new = old + a; + while (unlikely(new >= prz->buffer_size)) + new -= prz->buffer_size; + atomic_set(&prz->buffer->start, new); + + raw_spin_unlock_irqrestore(&buffer_lock, flags); + + return old; +} + +/* increase the size counter until it hits the max size */ +static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a) +{ + size_t old; + size_t new; + unsigned long flags; + + raw_spin_lock_irqsave(&buffer_lock, flags); + + old = atomic_read(&prz->buffer->size); + if (old == prz->buffer_size) + goto exit; + + new = old + a; + if (new > prz->buffer_size) + new = prz->buffer_size; + atomic_set(&prz->buffer->size, new); + +exit: + raw_spin_unlock_irqrestore(&buffer_lock, flags); +} + +static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic; +static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic; + +static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, + uint8_t *data, size_t len, uint8_t *ecc) +{ + int i; + uint16_t par[prz->ecc_info.ecc_size]; + + /* Initialize the parity buffer */ + memset(par, 0, sizeof(par)); + encode_rs8(prz->rs_decoder, data, len, par, 0); + for (i = 0; i < prz->ecc_info.ecc_size; i++) + ecc[i] = par[i]; +} + +static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz, + void *data, size_t len, uint8_t *ecc) +{ + int i; + uint16_t par[prz->ecc_info.ecc_size]; + + for (i = 0; i < prz->ecc_info.ecc_size; i++) + par[i] = ecc[i]; + return decode_rs8(prz->rs_decoder, data, par, len, + NULL, 0, NULL, 0, NULL); +} + +static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz, + unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + uint8_t *buffer_end = buffer->data + prz->buffer_size; + uint8_t *block; + uint8_t *par; + int ecc_block_size = prz->ecc_info.block_size; + int ecc_size = prz->ecc_info.ecc_size; + int size = ecc_block_size; + + if (!ecc_size) + return; + + block = buffer->data + (start & ~(ecc_block_size - 1)); + par = prz->par_buffer + (start / ecc_block_size) * ecc_size; + + do { + if (block + ecc_block_size > buffer_end) + size = buffer_end - block; + persistent_ram_encode_rs8(prz, block, size, par); + block += ecc_block_size; + par += ecc_size; + } while (block < buffer->data + start + count); +} + +static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + + if (!prz->ecc_info.ecc_size) + return; + + persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer), + prz->par_header); +} + +static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + uint8_t *block; + uint8_t *par; + + if (!prz->ecc_info.ecc_size) + return; + + block = buffer->data; + par = prz->par_buffer; + while (block < buffer->data + buffer_size(prz)) { + int numerr; + int size = prz->ecc_info.block_size; + if (block + size > buffer->data + prz->buffer_size) + size = buffer->data + prz->buffer_size - block; + numerr = persistent_ram_decode_rs8(prz, block, size, par); + if (numerr > 0) { + pr_devel("error in block %p, %d\n", block, numerr); + prz->corrected_bytes += numerr; + } else if (numerr < 0) { + pr_devel("uncorrectable error in block %p\n", block); + prz->bad_blocks++; + } + block += prz->ecc_info.block_size; + par += prz->ecc_info.ecc_size; + } +} + +static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, + struct persistent_ram_ecc_info *ecc_info) +{ + int numerr; + struct persistent_ram_buffer *buffer = prz->buffer; + int ecc_blocks; + size_t ecc_total; + + if (!ecc_info || !ecc_info->ecc_size) + return 0; + + prz->ecc_info.block_size = ecc_info->block_size ?: 128; + prz->ecc_info.ecc_size = ecc_info->ecc_size ?: 16; + prz->ecc_info.symsize = ecc_info->symsize ?: 8; + prz->ecc_info.poly = ecc_info->poly ?: 0x11d; + + ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_info.ecc_size, + prz->ecc_info.block_size + + prz->ecc_info.ecc_size); + ecc_total = (ecc_blocks + 1) * prz->ecc_info.ecc_size; + if (ecc_total >= prz->buffer_size) { + pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n", + __func__, prz->ecc_info.ecc_size, + ecc_total, prz->buffer_size); + return -EINVAL; + } + + prz->buffer_size -= ecc_total; + prz->par_buffer = buffer->data + prz->buffer_size; + prz->par_header = prz->par_buffer + + ecc_blocks * prz->ecc_info.ecc_size; + + /* + * first consecutive root is 0 + * primitive element to generate roots = 1 + */ + prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly, + 0, 1, prz->ecc_info.ecc_size); + if (prz->rs_decoder == NULL) { + pr_info("init_rs failed\n"); + return -EINVAL; + } + + prz->corrected_bytes = 0; + prz->bad_blocks = 0; + + numerr = persistent_ram_decode_rs8(prz, buffer, sizeof(*buffer), + prz->par_header); + if (numerr > 0) { + pr_info("error in header, %d\n", numerr); + prz->corrected_bytes += numerr; + } else if (numerr < 0) { + pr_info("uncorrectable error in header\n"); + prz->bad_blocks++; + } + + return 0; +} + +ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, + char *str, size_t len) +{ + ssize_t ret; + + if (!prz->ecc_info.ecc_size) + return 0; + + if (prz->corrected_bytes || prz->bad_blocks) + ret = snprintf(str, len, "" + "\n%d Corrected bytes, %d unrecoverable blocks\n", + prz->corrected_bytes, prz->bad_blocks); + else + ret = snprintf(str, len, "\nNo errors detected\n"); + + return ret; +} + +static void notrace persistent_ram_update(struct persistent_ram_zone *prz, + const void *s, unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + memcpy(buffer->data + start, s, count); + persistent_ram_update_ecc(prz, start, count); +} + +void persistent_ram_save_old(struct persistent_ram_zone *prz) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + size_t size = buffer_size(prz); + size_t start = buffer_start(prz); + + if (!size) + return; + + if (!prz->old_log) { + persistent_ram_ecc_old(prz); + prz->old_log = kmalloc(size, GFP_KERNEL); + } + if (!prz->old_log) { + pr_err("failed to allocate buffer\n"); + return; + } + + prz->old_log_size = size; + memcpy(prz->old_log, &buffer->data[start], size - start); + memcpy(prz->old_log + size - start, &buffer->data[0], start); +} + +int notrace persistent_ram_write(struct persistent_ram_zone *prz, + const void *s, unsigned int count) +{ + int rem; + int c = count; + size_t start; + + if (unlikely(c > prz->buffer_size)) { + s += c - prz->buffer_size; + c = prz->buffer_size; + } + + buffer_size_add(prz, c); + + start = buffer_start_add(prz, c); + + rem = prz->buffer_size - start; + if (unlikely(rem < c)) { + persistent_ram_update(prz, s, start, rem); + s += rem; + c -= rem; + start = 0; + } + persistent_ram_update(prz, s, start, c); + + persistent_ram_update_header_ecc(prz); + + return count; +} + +size_t persistent_ram_old_size(struct persistent_ram_zone *prz) +{ + return prz->old_log_size; +} + +void *persistent_ram_old(struct persistent_ram_zone *prz) +{ + return prz->old_log; +} + +void persistent_ram_free_old(struct persistent_ram_zone *prz) +{ + kfree(prz->old_log); + prz->old_log = NULL; + prz->old_log_size = 0; +} + +void persistent_ram_zap(struct persistent_ram_zone *prz) +{ + atomic_set(&prz->buffer->start, 0); + atomic_set(&prz->buffer->size, 0); + persistent_ram_update_header_ecc(prz); +} + +static void *persistent_ram_vmap(phys_addr_t start, size_t size, + unsigned int memtype) +{ + struct page **pages; + phys_addr_t page_start; + unsigned int page_count; + pgprot_t prot; + unsigned int i; + void *vaddr; + + page_start = start - offset_in_page(start); + page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE); + + if (memtype) + prot = pgprot_noncached(PAGE_KERNEL); + else + prot = pgprot_writecombine(PAGE_KERNEL); + + pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + pr_err("%s: Failed to allocate array for %u pages\n", + __func__, page_count); + return NULL; + } + + for (i = 0; i < page_count; i++) { + phys_addr_t addr = page_start + i * PAGE_SIZE; + pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + } + vaddr = vmap(pages, page_count, VM_MAP, prot); + kfree(pages); + + return vaddr; +} + +static void *persistent_ram_iomap(phys_addr_t start, size_t size, + unsigned int memtype) +{ + void *va; + + if (!request_mem_region(start, size, "persistent_ram")) { + pr_err("request mem region (0x%llx@0x%llx) failed\n", + (unsigned long long)size, (unsigned long long)start); + return NULL; + } + + buffer_start_add = buffer_start_add_locked; + buffer_size_add = buffer_size_add_locked; + + if (memtype) + va = ioremap(start, size); + else + va = ioremap_wc(start, size); + + return va; +} + +static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, + struct persistent_ram_zone *prz, int memtype) +{ + prz->paddr = start; + prz->size = size; + + if (pfn_valid(start >> PAGE_SHIFT)) + prz->vaddr = persistent_ram_vmap(start, size, memtype); + else + prz->vaddr = persistent_ram_iomap(start, size, memtype); + + if (!prz->vaddr) { + pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__, + (unsigned long long)size, (unsigned long long)start); + return -ENOMEM; + } + + prz->buffer = prz->vaddr + offset_in_page(start); + prz->buffer_size = size - sizeof(struct persistent_ram_buffer); + + return 0; +} + +static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, + struct persistent_ram_ecc_info *ecc_info) +{ + int ret; + + ret = persistent_ram_init_ecc(prz, ecc_info); + if (ret) + return ret; + + sig ^= PERSISTENT_RAM_SIG; + + if (prz->buffer->sig == sig) { + if (buffer_size(prz) > prz->buffer_size || + buffer_start(prz) > buffer_size(prz)) + pr_info("found existing invalid buffer, size %zu, start %zu\n", + buffer_size(prz), buffer_start(prz)); + else { + pr_debug("found existing buffer, size %zu, start %zu\n", + buffer_size(prz), buffer_start(prz)); + persistent_ram_save_old(prz); + return 0; + } + } else { + pr_debug("no valid data in buffer (sig = 0x%08x)\n", + prz->buffer->sig); + } + + prz->buffer->sig = sig; + persistent_ram_zap(prz); + + return 0; +} + +void persistent_ram_free(struct persistent_ram_zone *prz) +{ + if (!prz) + return; + + if (prz->vaddr) { + if (pfn_valid(prz->paddr >> PAGE_SHIFT)) { + vunmap(prz->vaddr); + } else { + iounmap(prz->vaddr); + release_mem_region(prz->paddr, prz->size); + } + prz->vaddr = NULL; + } + persistent_ram_free_old(prz); + kfree(prz); +} + +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, struct persistent_ram_ecc_info *ecc_info, + unsigned int memtype) +{ + struct persistent_ram_zone *prz; + int ret = -ENOMEM; + + prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL); + if (!prz) { + pr_err("failed to allocate persistent ram zone\n"); + goto err; + } + + ret = persistent_ram_buffer_map(start, size, prz, memtype); + if (ret) + goto err; + + ret = persistent_ram_post_init(prz, sig, ecc_info); + if (ret) + goto err; + + return prz; +err: + persistent_ram_free(prz); + return ERR_PTR(ret); +} diff --git a/kernel/fs/qnx4/Kconfig b/kernel/fs/qnx4/Kconfig new file mode 100644 index 000000000..5f6089994 --- /dev/null +++ b/kernel/fs/qnx4/Kconfig @@ -0,0 +1,14 @@ +config QNX4FS_FS + tristate "QNX4 file system support (read only)" + depends on BLOCK + help + This is the file system used by the real-time operating systems + QNX 4 and QNX 6 (the latter is also called QNX RTP). + Further information is available at . + Say Y if you intend to mount QNX hard disks or floppies. + + To compile this file system support as a module, choose M here: the + module will be called qnx4. + + If you don't know whether you need it, then you don't need it: + answer N. diff --git a/kernel/fs/qnx4/Makefile b/kernel/fs/qnx4/Makefile new file mode 100644 index 000000000..4a283b3f8 --- /dev/null +++ b/kernel/fs/qnx4/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux qnx4-filesystem routines. +# + +obj-$(CONFIG_QNX4FS_FS) += qnx4.o + +qnx4-objs := inode.o dir.o namei.o bitmap.o diff --git a/kernel/fs/qnx4/README b/kernel/fs/qnx4/README new file mode 100644 index 000000000..1f1e320d9 --- /dev/null +++ b/kernel/fs/qnx4/README @@ -0,0 +1,9 @@ + + This is a snapshot of the QNX4 filesystem for Linux. + Please send diffs and remarks to . + +Credits : + +Richard "Scuba" A. Frowijn +Frank "Jedi/Sector One" Denis +Anders Larsen (Maintainer) diff --git a/kernel/fs/qnx4/bitmap.c b/kernel/fs/qnx4/bitmap.c new file mode 100644 index 000000000..76a7a697b --- /dev/null +++ b/kernel/fs/qnx4/bitmap.c @@ -0,0 +1,44 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 28-05-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : basic optimisations. + * 25-06-1998 by Frank Denis : qnx4_is_free, qnx4_set_bitmap, qnx4_bmap . + * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . + */ + +#include +#include +#include "qnx4.h" + +unsigned long qnx4_count_free_blocks(struct super_block *sb) +{ + int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1; + int total = 0; + int total_free = 0; + int offset = 0; + int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size); + struct buffer_head *bh; + + while (total < size) { + int bytes = min(size - total, QNX4_BLOCK_SIZE); + + if ((bh = sb_bread(sb, start + offset)) == NULL) { + printk(KERN_ERR "qnx4: I/O error in counting free blocks\n"); + break; + } + total_free += bytes * BITS_PER_BYTE - + memweight(bh->b_data, bytes); + brelse(bh); + total += bytes; + offset++; + } + + return total_free; +} diff --git a/kernel/fs/qnx4/dir.c b/kernel/fs/qnx4/dir.c new file mode 100644 index 000000000..b218f9658 --- /dev/null +++ b/kernel/fs/qnx4/dir.c @@ -0,0 +1,81 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 28-05-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. + */ + +#include +#include "qnx4.h" + +static int qnx4_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + unsigned int offset; + struct buffer_head *bh; + struct qnx4_inode_entry *de; + struct qnx4_link_info *le; + unsigned long blknum; + int ix, ino; + int size; + + QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); + QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos)); + + while (ctx->pos < inode->i_size) { + blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS); + bh = sb_bread(inode->i_sb, blknum); + if (bh == NULL) { + printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); + return 0; + } + ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; + for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) { + offset = ix * QNX4_DIR_ENTRY_SIZE; + de = (struct qnx4_inode_entry *) (bh->b_data + offset); + if (!de->di_fname[0]) + continue; + if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) + continue; + if (!(de->di_status & QNX4_FILE_LINK)) + size = QNX4_SHORT_NAME_MAX; + else + size = QNX4_NAME_MAX; + size = strnlen(de->di_fname, size); + QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); + if (!(de->di_status & QNX4_FILE_LINK)) + ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; + else { + le = (struct qnx4_link_info*)de; + ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * + QNX4_INODES_PER_BLOCK + + le->dl_inode_ndx; + } + if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) { + brelse(bh); + return 0; + } + } + brelse(bh); + } + return 0; +} + +const struct file_operations qnx4_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = qnx4_readdir, + .fsync = generic_file_fsync, +}; + +const struct inode_operations qnx4_dir_inode_operations = +{ + .lookup = qnx4_lookup, +}; diff --git a/kernel/fs/qnx4/inode.c b/kernel/fs/qnx4/inode.c new file mode 100644 index 000000000..c4bcb7788 --- /dev/null +++ b/kernel/fs/qnx4/inode.c @@ -0,0 +1,426 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 01-06-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : Linux 2.1.99+ support, boot signature, misc. + * 30-06-1998 by Frank Denis : first step to write inodes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "qnx4.h" + +#define QNX4_VERSION 4 +#define QNX4_BMNAME ".bitmap" + +static const struct super_operations qnx4_sops; + +static struct inode *qnx4_alloc_inode(struct super_block *sb); +static void qnx4_destroy_inode(struct inode *inode); +static int qnx4_remount(struct super_block *sb, int *flags, char *data); +static int qnx4_statfs(struct dentry *, struct kstatfs *); + +static const struct super_operations qnx4_sops = +{ + .alloc_inode = qnx4_alloc_inode, + .destroy_inode = qnx4_destroy_inode, + .statfs = qnx4_statfs, + .remount_fs = qnx4_remount, +}; + +static int qnx4_remount(struct super_block *sb, int *flags, char *data) +{ + struct qnx4_sb_info *qs; + + sync_filesystem(sb); + qs = qnx4_sb(sb); + qs->Version = QNX4_VERSION; + *flags |= MS_RDONLY; + return 0; +} + +static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create ) +{ + unsigned long phys; + + QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); + + phys = qnx4_block_map( inode, iblock ); + if ( phys ) { + // logical block is before EOF + map_bh(bh, inode->i_sb, phys); + } + return 0; +} + +static inline u32 try_extent(qnx4_xtnt_t *extent, u32 *offset) +{ + u32 size = le32_to_cpu(extent->xtnt_size); + if (*offset < size) + return le32_to_cpu(extent->xtnt_blk) + *offset - 1; + *offset -= size; + return 0; +} + +unsigned long qnx4_block_map( struct inode *inode, long iblock ) +{ + int ix; + long i_xblk; + struct buffer_head *bh = NULL; + struct qnx4_xblk *xblk = NULL; + struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode); + u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts); + u32 offset = iblock; + u32 block = try_extent(&qnx4_inode->di_first_xtnt, &offset); + + if (block) { + // iblock is in the first extent. This is easy. + } else { + // iblock is beyond first extent. We have to follow the extent chain. + i_xblk = le32_to_cpu(qnx4_inode->di_xblk); + ix = 0; + while ( --nxtnt > 0 ) { + if ( ix == 0 ) { + // read next xtnt block. + bh = sb_bread(inode->i_sb, i_xblk - 1); + if ( !bh ) { + QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1)); + return -EIO; + } + xblk = (struct qnx4_xblk*)bh->b_data; + if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) { + QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk)); + return -EIO; + } + } + block = try_extent(&xblk->xblk_xtnts[ix], &offset); + if (block) { + // got it! + break; + } + if ( ++ix >= xblk->xblk_num_xtnts ) { + i_xblk = le32_to_cpu(xblk->xblk_next_xblk); + ix = 0; + brelse( bh ); + bh = NULL; + } + } + if ( bh ) + brelse( bh ); + } + + QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); + return block; +} + +static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8; + buf->f_bfree = qnx4_count_free_blocks(sb); + buf->f_bavail = buf->f_bfree; + buf->f_namelen = QNX4_NAME_MAX; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +/* + * Check the root directory of the filesystem to make sure + * it really _is_ a qnx4 filesystem, and to check the size + * of the directory entry. + */ +static const char *qnx4_checkroot(struct super_block *sb, + struct qnx4_super_block *s) +{ + struct buffer_head *bh; + struct qnx4_inode_entry *rootdir; + int rd, rl; + int i, j; + + if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0') + return "no qnx4 filesystem (no root dir)."; + QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); + rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1; + rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size); + for (j = 0; j < rl; j++) { + bh = sb_bread(sb, rd + j); /* root dir, first block */ + if (bh == NULL) + return "unable to read root entry."; + rootdir = (struct qnx4_inode_entry *) bh->b_data; + for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) { + QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); + if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0) + continue; + qnx4_sb(sb)->BitMap = kmemdup(rootdir, + sizeof(struct qnx4_inode_entry), + GFP_KERNEL); + brelse(bh); + if (!qnx4_sb(sb)->BitMap) + return "not enough memory for bitmap inode"; + /* keep bitmap inode known */ + return NULL; + } + brelse(bh); + } + return "bitmap file not found."; +} + +static int qnx4_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh; + struct inode *root; + const char *errmsg; + struct qnx4_sb_info *qs; + + qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); + if (!qs) + return -ENOMEM; + s->s_fs_info = qs; + + sb_set_blocksize(s, QNX4_BLOCK_SIZE); + + s->s_op = &qnx4_sops; + s->s_magic = QNX4_SUPER_MAGIC; + s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + + /* Check the superblock signature. Since the qnx4 code is + dangerous, we should leave as quickly as possible + if we don't belong here... */ + bh = sb_bread(s, 1); + if (!bh) { + printk(KERN_ERR "qnx4: unable to read the superblock\n"); + return -EINVAL; + } + + /* check before allocating dentries, inodes, .. */ + errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); + brelse(bh); + if (errmsg != NULL) { + if (!silent) + printk(KERN_ERR "qnx4: %s\n", errmsg); + return -EINVAL; + } + + /* does root not have inode number QNX4_ROOT_INO ?? */ + root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); + if (IS_ERR(root)) { + printk(KERN_ERR "qnx4: get inode failed\n"); + return PTR_ERR(root); + } + + s->s_root = d_make_root(root); + if (s->s_root == NULL) + return -ENOMEM; + + return 0; +} + +static void qnx4_kill_sb(struct super_block *sb) +{ + struct qnx4_sb_info *qs = qnx4_sb(sb); + kill_block_super(sb); + if (qs) { + kfree(qs->BitMap); + kfree(qs); + } +} + +static int qnx4_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,qnx4_get_block); +} + +static sector_t qnx4_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,qnx4_get_block); +} +static const struct address_space_operations qnx4_aops = { + .readpage = qnx4_readpage, + .bmap = qnx4_bmap +}; + +struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) +{ + struct buffer_head *bh; + struct qnx4_inode_entry *raw_inode; + int block; + struct qnx4_inode_entry *qnx4_inode; + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + qnx4_inode = qnx4_raw_inode(inode); + inode->i_mode = 0; + + QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino)); + if (!ino) { + printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is " + "out of range\n", + sb->s_id, ino); + iget_failed(inode); + return ERR_PTR(-EIO); + } + block = ino / QNX4_INODES_PER_BLOCK; + + if (!(bh = sb_bread(sb, block))) { + printk(KERN_ERR "qnx4: major problem: unable to read inode from dev " + "%s\n", sb->s_id); + iget_failed(inode); + return ERR_PTR(-EIO); + } + raw_inode = ((struct qnx4_inode_entry *) bh->b_data) + + (ino % QNX4_INODES_PER_BLOCK); + + inode->i_mode = le16_to_cpu(raw_inode->di_mode); + i_uid_write(inode, (uid_t)le16_to_cpu(raw_inode->di_uid)); + i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid)); + set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); + inode->i_size = le32_to_cpu(raw_inode->di_size); + inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime); + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); + + memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + inode->i_mapping->a_ops = &qnx4_aops; + qnx4_i(inode)->mmu_private = inode->i_size; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &qnx4_dir_inode_operations; + inode->i_fop = &qnx4_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &qnx4_aops; + qnx4_i(inode)->mmu_private = inode->i_size; + } else { + printk(KERN_ERR "qnx4: bad inode %lu on dev %s\n", + ino, sb->s_id); + iget_failed(inode); + brelse(bh); + return ERR_PTR(-EIO); + } + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +static struct kmem_cache *qnx4_inode_cachep; + +static struct inode *qnx4_alloc_inode(struct super_block *sb) +{ + struct qnx4_inode_info *ei; + ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void qnx4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); +} + +static void qnx4_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, qnx4_i_callback); +} + +static void init_once(void *foo) +{ + struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", + sizeof(struct qnx4_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (qnx4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(qnx4_inode_cachep); +} + +static struct dentry *qnx4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); +} + +static struct file_system_type qnx4_fs_type = { + .owner = THIS_MODULE, + .name = "qnx4", + .mount = qnx4_mount, + .kill_sb = qnx4_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("qnx4"); + +static int __init init_qnx4_fs(void) +{ + int err; + + err = init_inodecache(); + if (err) + return err; + + err = register_filesystem(&qnx4_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n"); + return 0; +} + +static void __exit exit_qnx4_fs(void) +{ + unregister_filesystem(&qnx4_fs_type); + destroy_inodecache(); +} + +module_init(init_qnx4_fs) +module_exit(exit_qnx4_fs) +MODULE_LICENSE("GPL"); + diff --git a/kernel/fs/qnx4/namei.c b/kernel/fs/qnx4/namei.c new file mode 100644 index 000000000..e62c81837 --- /dev/null +++ b/kernel/fs/qnx4/namei.c @@ -0,0 +1,125 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 01-06-1998 by Richard Frowijn : first release. + * 21-06-1998 by Frank Denis : dcache support, fixed error codes. + * 04-07-1998 by Frank Denis : first step for rmdir/unlink. + */ + +#include +#include "qnx4.h" + + +/* + * check if the filename is correct. For some obscure reason, qnx writes a + * new file twice in the directory entry, first with all possible options at 0 + * and for a second time the way it is, they want us not to access the qnx + * filesystem when whe are using linux. + */ +static int qnx4_match(int len, const char *name, + struct buffer_head *bh, unsigned long *offset) +{ + struct qnx4_inode_entry *de; + int namelen, thislen; + + if (bh == NULL) { + printk(KERN_WARNING "qnx4: matching unassigned buffer !\n"); + return 0; + } + de = (struct qnx4_inode_entry *) (bh->b_data + *offset); + *offset += QNX4_DIR_ENTRY_SIZE; + if ((de->di_status & QNX4_FILE_LINK) != 0) { + namelen = QNX4_NAME_MAX; + } else { + namelen = QNX4_SHORT_NAME_MAX; + } + thislen = strlen( de->di_fname ); + if ( thislen > namelen ) + thislen = namelen; + if (len != thislen) { + return 0; + } + if (strncmp(name, de->di_fname, len) == 0) { + if ((de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)) != 0) { + return 1; + } + } + return 0; +} + +static struct buffer_head *qnx4_find_entry(int len, struct inode *dir, + const char *name, struct qnx4_inode_entry **res_dir, int *ino) +{ + unsigned long block, offset, blkofs; + struct buffer_head *bh; + + *res_dir = NULL; + bh = NULL; + block = offset = blkofs = 0; + while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) { + if (!bh) { + block = qnx4_block_map(dir, blkofs); + if (block) + bh = sb_bread(dir->i_sb, block); + if (!bh) { + blkofs++; + continue; + } + } + *res_dir = (struct qnx4_inode_entry *) (bh->b_data + offset); + if (qnx4_match(len, name, bh, &offset)) { + *ino = block * QNX4_INODES_PER_BLOCK + + (offset / QNX4_DIR_ENTRY_SIZE) - 1; + return bh; + } + if (offset < bh->b_size) { + continue; + } + brelse(bh); + bh = NULL; + offset = 0; + blkofs++; + } + brelse(bh); + *res_dir = NULL; + return NULL; +} + +struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + int ino; + struct qnx4_inode_entry *de; + struct qnx4_link_info *lnk; + struct buffer_head *bh; + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + struct inode *foundinode = NULL; + + if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino))) + goto out; + /* The entry is linked, let's get the real info */ + if ((de->di_status & QNX4_FILE_LINK) == QNX4_FILE_LINK) { + lnk = (struct qnx4_link_info *) de; + ino = (le32_to_cpu(lnk->dl_inode_blk) - 1) * + QNX4_INODES_PER_BLOCK + + lnk->dl_inode_ndx; + } + brelse(bh); + + foundinode = qnx4_iget(dir->i_sb, ino); + if (IS_ERR(foundinode)) { + QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n", + PTR_ERR(foundinode))); + return ERR_CAST(foundinode); + } +out: + d_add(dentry, foundinode); + + return NULL; +} diff --git a/kernel/fs/qnx4/qnx4.h b/kernel/fs/qnx4/qnx4.h new file mode 100644 index 000000000..c9b1be2c1 --- /dev/null +++ b/kernel/fs/qnx4/qnx4.h @@ -0,0 +1,45 @@ +#include +#include + +#define QNX4_DEBUG 0 + +#if QNX4_DEBUG +#define QNX4DEBUG(X) printk X +#else +#define QNX4DEBUG(X) (void) 0 +#endif + +struct qnx4_sb_info { + unsigned int Version; /* may be useful */ + struct qnx4_inode_entry *BitMap; /* useful */ +}; + +struct qnx4_inode_info { + struct qnx4_inode_entry raw; + loff_t mmu_private; + struct inode vfs_inode; +}; + +extern struct inode *qnx4_iget(struct super_block *, unsigned long); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); +extern unsigned long qnx4_count_free_blocks(struct super_block *sb); +extern unsigned long qnx4_block_map(struct inode *inode, long iblock); + +extern const struct inode_operations qnx4_dir_inode_operations; +extern const struct file_operations qnx4_dir_operations; +extern int qnx4_is_free(struct super_block *sb, long block); + +static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) +{ + return container_of(inode, struct qnx4_inode_info, vfs_inode); +} + +static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) +{ + return &qnx4_i(inode)->raw; +} diff --git a/kernel/fs/qnx6/Kconfig b/kernel/fs/qnx6/Kconfig new file mode 100644 index 000000000..edbba5c17 --- /dev/null +++ b/kernel/fs/qnx6/Kconfig @@ -0,0 +1,26 @@ +config QNX6FS_FS + tristate "QNX6 file system support (read only)" + depends on BLOCK && CRC32 + help + This is the file system used by the real-time operating systems + QNX 6 (also called QNX RTP). + Further information is available at . + Say Y if you intend to mount QNX hard disks or floppies formatted + with a mkqnx6fs. + However, keep in mind that this currently is a readonly driver! + + To compile this file system support as a module, choose M here: the + module will be called qnx6. + + If you don't know whether you need it, then you don't need it: + answer N. + +config QNX6FS_DEBUG + bool "QNX6 debugging information" + depends on QNX6FS_FS + help + Turns on extended debugging output. + + If you are not a developer working on the QNX6FS, you probably don't + want this: + answer N. diff --git a/kernel/fs/qnx6/Makefile b/kernel/fs/qnx6/Makefile new file mode 100644 index 000000000..5e6bae6fa --- /dev/null +++ b/kernel/fs/qnx6/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the linux qnx4-filesystem routines. +# + +obj-$(CONFIG_QNX6FS_FS) += qnx6.o + +qnx6-objs := inode.o dir.o namei.o super_mmi.o +ccflags-$(CONFIG_QNX6FS_DEBUG) += -DDEBUG diff --git a/kernel/fs/qnx6/README b/kernel/fs/qnx6/README new file mode 100644 index 000000000..116d62202 --- /dev/null +++ b/kernel/fs/qnx6/README @@ -0,0 +1,8 @@ + + This is a snapshot of the QNX6 filesystem for Linux. + Please send diffs and remarks to . + +Credits : + +Al Viro (endless patience with me & support ;)) +Kai Bankett (Maintainer) diff --git a/kernel/fs/qnx6/dir.c b/kernel/fs/qnx6/dir.c new file mode 100644 index 000000000..8d64bb536 --- /dev/null +++ b/kernel/fs/qnx6/dir.c @@ -0,0 +1,286 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. + * 16-02-2012 pagemap extension by Al Viro + * + */ + +#include "qnx6.h" + +static unsigned qnx6_lfile_checksum(char *name, unsigned size) +{ + unsigned crc = 0; + char *end = name + size; + while (name < end) { + crc = ((crc >> 1) + *(name++)) ^ + ((crc & 0x00000001) ? 0x80000000 : 0); + } + return crc; +} + +static struct page *qnx6_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) + kmap(page); + return page; +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +static unsigned last_entry(struct inode *inode, unsigned long page_nr) +{ + unsigned long last_byte = inode->i_size; + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte / QNX6_DIR_ENTRY_SIZE; +} + +static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, + struct qnx6_long_dir_entry *de, + struct page **p) +{ + struct qnx6_sb_info *sbi = QNX6_SB(sb); + u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */ + u32 n = s >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* in pages */ + /* within page */ + u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_CACHE_MASK; + struct address_space *mapping = sbi->longfile->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return ERR_CAST(page); + kmap(*p = page); + return (struct qnx6_long_filename *)(page_address(page) + offs); +} + +static int qnx6_dir_longfilename(struct inode *inode, + struct qnx6_long_dir_entry *de, + struct dir_context *ctx, + unsigned de_inode) +{ + struct qnx6_long_filename *lf; + struct super_block *s = inode->i_sb; + struct qnx6_sb_info *sbi = QNX6_SB(s); + struct page *page; + int lf_size; + + if (de->de_size != 0xff) { + /* error - long filename entries always have size 0xff + in direntry */ + pr_err("invalid direntry size (%i).\n", de->de_size); + return 0; + } + lf = qnx6_longname(s, de, &page); + if (IS_ERR(lf)) { + pr_err("Error reading longname\n"); + return 0; + } + + lf_size = fs16_to_cpu(sbi, lf->lf_size); + + if (lf_size > QNX6_LONG_NAME_MAX) { + pr_debug("file %s\n", lf->lf_fname); + pr_err("Filename too long (%i)\n", lf_size); + qnx6_put_page(page); + return 0; + } + + /* calc & validate longfilename checksum + mmi 3g filesystem does not have that checksum */ + if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) != + qnx6_lfile_checksum(lf->lf_fname, lf_size)) + pr_info("long filename checksum error.\n"); + + pr_debug("qnx6_readdir:%.*s inode:%u\n", + lf_size, lf->lf_fname, de_inode); + if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { + qnx6_put_page(page); + return 0; + } + + qnx6_put_page(page); + /* success */ + return 1; +} + +static int qnx6_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *s = inode->i_sb; + struct qnx6_sb_info *sbi = QNX6_SB(s); + loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1); + unsigned long npages = dir_pages(inode); + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; + bool done = false; + + ctx->pos = pos; + if (ctx->pos >= inode->i_size) + return 0; + + for ( ; !done && n < npages; n++, start = 0) { + struct page *page = qnx6_get_page(inode, n); + int limit = last_entry(inode, n); + struct qnx6_dir_entry *de; + int i = start; + + if (IS_ERR(page)) { + pr_err("%s(): read failed\n", __func__); + ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; + return PTR_ERR(page); + } + de = ((struct qnx6_dir_entry *)page_address(page)) + start; + for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { + int size = de->de_size; + u32 no_inode = fs32_to_cpu(sbi, de->de_inode); + + if (!no_inode || !size) + continue; + + if (size > QNX6_SHORT_NAME_MAX) { + /* long filename detected + get the filename from long filename + structure / block */ + if (!qnx6_dir_longfilename(inode, + (struct qnx6_long_dir_entry *)de, + ctx, no_inode)) { + done = true; + break; + } + } else { + pr_debug("%s():%.*s inode:%u\n", + __func__, size, de->de_fname, + no_inode); + if (!dir_emit(ctx, de->de_fname, size, + no_inode, DT_UNKNOWN)) { + done = true; + break; + } + } + } + qnx6_put_page(page); + } + return 0; +} + +/* + * check if the long filename is correct. + */ +static unsigned qnx6_long_match(int len, const char *name, + struct qnx6_long_dir_entry *de, struct inode *dir) +{ + struct super_block *s = dir->i_sb; + struct qnx6_sb_info *sbi = QNX6_SB(s); + struct page *page; + int thislen; + struct qnx6_long_filename *lf = qnx6_longname(s, de, &page); + + if (IS_ERR(lf)) + return 0; + + thislen = fs16_to_cpu(sbi, lf->lf_size); + if (len != thislen) { + qnx6_put_page(page); + return 0; + } + if (memcmp(name, lf->lf_fname, len) == 0) { + qnx6_put_page(page); + return fs32_to_cpu(sbi, de->de_inode); + } + qnx6_put_page(page); + return 0; +} + +/* + * check if the filename is correct. + */ +static unsigned qnx6_match(struct super_block *s, int len, const char *name, + struct qnx6_dir_entry *de) +{ + struct qnx6_sb_info *sbi = QNX6_SB(s); + if (memcmp(name, de->de_fname, len) == 0) + return fs32_to_cpu(sbi, de->de_inode); + return 0; +} + + +unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, + struct page **res_page) +{ + struct super_block *s = dir->i_sb; + struct qnx6_inode_info *ei = QNX6_I(dir); + struct page *page = NULL; + unsigned long start, n; + unsigned long npages = dir_pages(dir); + unsigned ino; + struct qnx6_dir_entry *de; + struct qnx6_long_dir_entry *lde; + + *res_page = NULL; + + if (npages == 0) + return 0; + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + + do { + page = qnx6_get_page(dir, n); + if (!IS_ERR(page)) { + int limit = last_entry(dir, n); + int i; + + de = (struct qnx6_dir_entry *)page_address(page); + for (i = 0; i < limit; i++, de++) { + if (len <= QNX6_SHORT_NAME_MAX) { + /* short filename */ + if (len != de->de_size) + continue; + ino = qnx6_match(s, len, name, de); + if (ino) + goto found; + } else if (de->de_size == 0xff) { + /* deal with long filename */ + lde = (struct qnx6_long_dir_entry *)de; + ino = qnx6_long_match(len, + name, lde, dir); + if (ino) + goto found; + } else + pr_err("undefined filename size in inode.\n"); + } + qnx6_put_page(page); + } + + if (++n >= npages) + n = 0; + } while (n != start); + return 0; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return ino; +} + +const struct file_operations qnx6_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = qnx6_readdir, + .fsync = generic_file_fsync, +}; + +const struct inode_operations qnx6_dir_inode_operations = { + .lookup = qnx6_lookup, +}; diff --git a/kernel/fs/qnx6/inode.c b/kernel/fs/qnx6/inode.c new file mode 100644 index 000000000..32d2e1a97 --- /dev/null +++ b/kernel/fs/qnx6/inode.c @@ -0,0 +1,685 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. + * 16-02-2012 pagemap extension by Al Viro + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qnx6.h" + +static const struct super_operations qnx6_sops; + +static void qnx6_put_super(struct super_block *sb); +static struct inode *qnx6_alloc_inode(struct super_block *sb); +static void qnx6_destroy_inode(struct inode *inode); +static int qnx6_remount(struct super_block *sb, int *flags, char *data); +static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf); +static int qnx6_show_options(struct seq_file *seq, struct dentry *root); + +static const struct super_operations qnx6_sops = { + .alloc_inode = qnx6_alloc_inode, + .destroy_inode = qnx6_destroy_inode, + .put_super = qnx6_put_super, + .statfs = qnx6_statfs, + .remount_fs = qnx6_remount, + .show_options = qnx6_show_options, +}; + +static int qnx6_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct qnx6_sb_info *sbi = QNX6_SB(sb); + + if (sbi->s_mount_opt & QNX6_MOUNT_MMI_FS) + seq_puts(seq, ",mmi_fs"); + return 0; +} + +static int qnx6_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_RDONLY; + return 0; +} + +static unsigned qnx6_get_devblock(struct super_block *sb, __fs32 block) +{ + struct qnx6_sb_info *sbi = QNX6_SB(sb); + return fs32_to_cpu(sbi, block) + sbi->s_blks_off; +} + +static unsigned qnx6_block_map(struct inode *inode, unsigned iblock); + +static int qnx6_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + unsigned phys; + + pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n", + inode->i_ino, (unsigned long)iblock); + + phys = qnx6_block_map(inode, iblock); + if (phys) { + /* logical block is before EOF */ + map_bh(bh, inode->i_sb, phys); + } + return 0; +} + +static int qnx6_check_blockptr(__fs32 ptr) +{ + if (ptr == ~(__fs32)0) { + pr_err("hit unused blockpointer.\n"); + return 0; + } + return 1; +} + +static int qnx6_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, qnx6_get_block); +} + +static int qnx6_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block); +} + +/* + * returns the block number for the no-th element in the tree + * inodebits requred as there are multiple inodes in one inode block + */ +static unsigned qnx6_block_map(struct inode *inode, unsigned no) +{ + struct super_block *s = inode->i_sb; + struct qnx6_sb_info *sbi = QNX6_SB(s); + struct qnx6_inode_info *ei = QNX6_I(inode); + unsigned block = 0; + struct buffer_head *bh; + __fs32 ptr; + int levelptr; + int ptrbits = sbi->s_ptrbits; + int bitdelta; + u32 mask = (1 << ptrbits) - 1; + int depth = ei->di_filelevels; + int i; + + bitdelta = ptrbits * depth; + levelptr = no >> bitdelta; + + if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { + pr_err("Requested file block number (%u) too big.", no); + return 0; + } + + block = qnx6_get_devblock(s, ei->di_block_ptr[levelptr]); + + for (i = 0; i < depth; i++) { + bh = sb_bread(s, block); + if (!bh) { + pr_err("Error reading block (%u)\n", block); + return 0; + } + bitdelta -= ptrbits; + levelptr = (no >> bitdelta) & mask; + ptr = ((__fs32 *)bh->b_data)[levelptr]; + + if (!qnx6_check_blockptr(ptr)) + return 0; + + block = qnx6_get_devblock(s, ptr); + brelse(bh); + } + return block; +} + +static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct qnx6_sb_info *sbi = QNX6_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = fs32_to_cpu(sbi, sbi->sb->sb_num_blocks); + buf->f_bfree = fs32_to_cpu(sbi, sbi->sb->sb_free_blocks); + buf->f_files = fs32_to_cpu(sbi, sbi->sb->sb_num_inodes); + buf->f_ffree = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes); + buf->f_bavail = buf->f_bfree; + buf->f_namelen = QNX6_LONG_NAME_MAX; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +/* + * Check the root directory of the filesystem to make sure + * it really _is_ a qnx6 filesystem, and to check the size + * of the directory entry. + */ +static const char *qnx6_checkroot(struct super_block *s) +{ + static char match_root[2][3] = {".\0\0", "..\0"}; + int i, error = 0; + struct qnx6_dir_entry *dir_entry; + struct inode *root = d_inode(s->s_root); + struct address_space *mapping = root->i_mapping; + struct page *page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + return "error reading root directory"; + kmap(page); + dir_entry = page_address(page); + for (i = 0; i < 2; i++) { + /* maximum 3 bytes - due to match_root limitation */ + if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) + error = 1; + } + qnx6_put_page(page); + if (error) + return "error reading root directory."; + return NULL; +} + +#ifdef CONFIG_QNX6FS_DEBUG +void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s) +{ + struct qnx6_sb_info *sbi = QNX6_SB(s); + + pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic)); + pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum)); + pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial)); + pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags)); + pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize)); + pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes)); + pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes)); + pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks)); + pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks)); + pr_debug("inode_levels: %02x\n", sb->Inode.levels); +} +#endif + +enum { + Opt_mmifs, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_mmifs, "mmi_fs"}, + {Opt_err, NULL} +}; + +static int qnx6_parse_options(char *options, struct super_block *sb) +{ + char *p; + struct qnx6_sb_info *sbi = QNX6_SB(sb); + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_mmifs: + set_opt(sbi->s_mount_opt, MMI_FS); + break; + default: + return 0; + } + } + return 1; +} + +static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, + int offset, int silent) +{ + struct qnx6_sb_info *sbi = QNX6_SB(s); + struct buffer_head *bh; + struct qnx6_super_block *sb; + + /* Check the superblock signatures + start with the first superblock */ + bh = sb_bread(s, offset); + if (!bh) { + pr_err("unable to read the first superblock\n"); + return NULL; + } + sb = (struct qnx6_super_block *)bh->b_data; + if (fs32_to_cpu(sbi, sb->sb_magic) != QNX6_SUPER_MAGIC) { + sbi->s_bytesex = BYTESEX_BE; + if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { + /* we got a big endian fs */ + pr_debug("fs got different endianness.\n"); + return bh; + } else + sbi->s_bytesex = BYTESEX_LE; + if (!silent) { + if (offset == 0) { + pr_err("wrong signature (magic) in superblock #1.\n"); + } else { + pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n", + offset * s->s_blocksize); + } + } + brelse(bh); + return NULL; + } + return bh; +} + +static struct inode *qnx6_private_inode(struct super_block *s, + struct qnx6_root_node *p); + +static int qnx6_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh1 = NULL, *bh2 = NULL; + struct qnx6_super_block *sb1 = NULL, *sb2 = NULL; + struct qnx6_sb_info *sbi; + struct inode *root; + const char *errmsg; + struct qnx6_sb_info *qs; + int ret = -EINVAL; + u64 offset; + int bootblock_offset = QNX6_BOOTBLOCK_SIZE; + + qs = kzalloc(sizeof(struct qnx6_sb_info), GFP_KERNEL); + if (!qs) + return -ENOMEM; + s->s_fs_info = qs; + + /* Superblock always is 512 Byte long */ + if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { + pr_err("unable to set blocksize\n"); + goto outnobh; + } + + /* parse the mount-options */ + if (!qnx6_parse_options((char *) data, s)) { + pr_err("invalid mount options.\n"); + goto outnobh; + } + if (test_opt(s, MMI_FS)) { + sb1 = qnx6_mmi_fill_super(s, silent); + if (sb1) + goto mmi_success; + else + goto outnobh; + } + sbi = QNX6_SB(s); + sbi->s_bytesex = BYTESEX_LE; + /* Check the superblock signatures + start with the first superblock */ + bh1 = qnx6_check_first_superblock(s, + bootblock_offset / QNX6_SUPERBLOCK_SIZE, silent); + if (!bh1) { + /* try again without bootblock offset */ + bh1 = qnx6_check_first_superblock(s, 0, silent); + if (!bh1) { + pr_err("unable to read the first superblock\n"); + goto outnobh; + } + /* seems that no bootblock at partition start */ + bootblock_offset = 0; + } + sb1 = (struct qnx6_super_block *)bh1->b_data; + +#ifdef CONFIG_QNX6FS_DEBUG + qnx6_superblock_debug(sb1, s); +#endif + + /* checksum check - start at byte 8 and end at byte 512 */ + if (fs32_to_cpu(sbi, sb1->sb_checksum) != + crc32_be(0, (char *)(bh1->b_data + 8), 504)) { + pr_err("superblock #1 checksum error\n"); + goto out; + } + + /* set new blocksize */ + if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { + pr_err("unable to set blocksize\n"); + goto out; + } + /* blocksize invalidates bh - pull it back in */ + brelse(bh1); + bh1 = sb_bread(s, bootblock_offset >> s->s_blocksize_bits); + if (!bh1) + goto outnobh; + sb1 = (struct qnx6_super_block *)bh1->b_data; + + /* calculate second superblock blocknumber */ + offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + + (bootblock_offset >> s->s_blocksize_bits) + + (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); + + /* set bootblock offset */ + sbi->s_blks_off = (bootblock_offset >> s->s_blocksize_bits) + + (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); + + /* next the second superblock */ + bh2 = sb_bread(s, offset); + if (!bh2) { + pr_err("unable to read the second superblock\n"); + goto out; + } + sb2 = (struct qnx6_super_block *)bh2->b_data; + if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { + if (!silent) + pr_err("wrong signature (magic) in superblock #2.\n"); + goto out; + } + + /* checksum check - start at byte 8 and end at byte 512 */ + if (fs32_to_cpu(sbi, sb2->sb_checksum) != + crc32_be(0, (char *)(bh2->b_data + 8), 504)) { + pr_err("superblock #2 checksum error\n"); + goto out; + } + + if (fs64_to_cpu(sbi, sb1->sb_serial) >= + fs64_to_cpu(sbi, sb2->sb_serial)) { + /* superblock #1 active */ + sbi->sb_buf = bh1; + sbi->sb = (struct qnx6_super_block *)bh1->b_data; + brelse(bh2); + pr_info("superblock #1 active\n"); + } else { + /* superblock #2 active */ + sbi->sb_buf = bh2; + sbi->sb = (struct qnx6_super_block *)bh2->b_data; + brelse(bh1); + pr_info("superblock #2 active\n"); + } +mmi_success: + /* sanity check - limit maximum indirect pointer levels */ + if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { + pr_err("too many inode levels (max %i, sb %i)\n", + QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); + goto out; + } + if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { + pr_err("too many longfilename levels (max %i, sb %i)\n", + QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); + goto out; + } + s->s_op = &qnx6_sops; + s->s_magic = QNX6_SUPER_MAGIC; + s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + + /* ease the later tree level calculations */ + sbi = QNX6_SB(s); + sbi->s_ptrbits = ilog2(s->s_blocksize / 4); + sbi->inodes = qnx6_private_inode(s, &sb1->Inode); + if (!sbi->inodes) + goto out; + sbi->longfile = qnx6_private_inode(s, &sb1->Longfile); + if (!sbi->longfile) + goto out1; + + /* prefetch root inode */ + root = qnx6_iget(s, QNX6_ROOT_INO); + if (IS_ERR(root)) { + pr_err("get inode failed\n"); + ret = PTR_ERR(root); + goto out2; + } + + ret = -ENOMEM; + s->s_root = d_make_root(root); + if (!s->s_root) + goto out2; + + ret = -EINVAL; + errmsg = qnx6_checkroot(s); + if (errmsg != NULL) { + if (!silent) + pr_err("%s\n", errmsg); + goto out3; + } + return 0; + +out3: + dput(s->s_root); + s->s_root = NULL; +out2: + iput(sbi->longfile); +out1: + iput(sbi->inodes); +out: + if (bh1) + brelse(bh1); + if (bh2) + brelse(bh2); +outnobh: + kfree(qs); + s->s_fs_info = NULL; + return ret; +} + +static void qnx6_put_super(struct super_block *sb) +{ + struct qnx6_sb_info *qs = QNX6_SB(sb); + brelse(qs->sb_buf); + iput(qs->longfile); + iput(qs->inodes); + kfree(qs); + sb->s_fs_info = NULL; + return; +} + +static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, qnx6_get_block); +} +static const struct address_space_operations qnx6_aops = { + .readpage = qnx6_readpage, + .readpages = qnx6_readpages, + .bmap = qnx6_bmap +}; + +static struct inode *qnx6_private_inode(struct super_block *s, + struct qnx6_root_node *p) +{ + struct inode *inode = new_inode(s); + if (inode) { + struct qnx6_inode_info *ei = QNX6_I(inode); + struct qnx6_sb_info *sbi = QNX6_SB(s); + inode->i_size = fs64_to_cpu(sbi, p->size); + memcpy(ei->di_block_ptr, p->ptr, sizeof(p->ptr)); + ei->di_filelevels = p->levels; + inode->i_mode = S_IFREG | S_IRUSR; /* probably wrong */ + inode->i_mapping->a_ops = &qnx6_aops; + } + return inode; +} + +struct inode *qnx6_iget(struct super_block *sb, unsigned ino) +{ + struct qnx6_sb_info *sbi = QNX6_SB(sb); + struct qnx6_inode_entry *raw_inode; + struct inode *inode; + struct qnx6_inode_info *ei; + struct address_space *mapping; + struct page *page; + u32 n, offs; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = QNX6_I(inode); + + inode->i_mode = 0; + + if (ino == 0) { + pr_err("bad inode number on dev %s: %u is out of range\n", + sb->s_id, ino); + iget_failed(inode); + return ERR_PTR(-EIO); + } + n = (ino - 1) >> (PAGE_CACHE_SHIFT - QNX6_INODE_SIZE_BITS); + offs = (ino - 1) & (~PAGE_CACHE_MASK >> QNX6_INODE_SIZE_BITS); + mapping = sbi->inodes->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) { + pr_err("major problem: unable to read inode from dev %s\n", + sb->s_id); + iget_failed(inode); + return ERR_CAST(page); + } + kmap(page); + raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs; + + inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); + i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); + i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid)); + inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size); + inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime); + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_ctime); + inode->i_ctime.tv_nsec = 0; + + /* calc blocks based on 512 byte blocksize */ + inode->i_blocks = (inode->i_size + 511) >> 9; + + memcpy(&ei->di_block_ptr, &raw_inode->di_block_ptr, + sizeof(raw_inode->di_block_ptr)); + ei->di_filelevels = raw_inode->di_filelevels; + + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + inode->i_mapping->a_ops = &qnx6_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &qnx6_dir_inode_operations; + inode->i_fop = &qnx6_dir_operations; + inode->i_mapping->a_ops = &qnx6_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &qnx6_aops; + } else + init_special_inode(inode, inode->i_mode, 0); + qnx6_put_page(page); + unlock_new_inode(inode); + return inode; +} + +static struct kmem_cache *qnx6_inode_cachep; + +static struct inode *qnx6_alloc_inode(struct super_block *sb) +{ + struct qnx6_inode_info *ei; + ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void qnx6_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode)); +} + +static void qnx6_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, qnx6_i_callback); +} + +static void init_once(void *foo) +{ + struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", + sizeof(struct qnx6_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (!qnx6_inode_cachep) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(qnx6_inode_cachep); +} + +static struct dentry *qnx6_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, qnx6_fill_super); +} + +static struct file_system_type qnx6_fs_type = { + .owner = THIS_MODULE, + .name = "qnx6", + .mount = qnx6_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("qnx6"); + +static int __init init_qnx6_fs(void) +{ + int err; + + err = init_inodecache(); + if (err) + return err; + + err = register_filesystem(&qnx6_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + pr_info("QNX6 filesystem 1.0.0 registered.\n"); + return 0; +} + +static void __exit exit_qnx6_fs(void) +{ + unregister_filesystem(&qnx6_fs_type); + destroy_inodecache(); +} + +module_init(init_qnx6_fs) +module_exit(exit_qnx6_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/qnx6/namei.c b/kernel/fs/qnx6/namei.c new file mode 100644 index 000000000..6c1a32313 --- /dev/null +++ b/kernel/fs/qnx6/namei.c @@ -0,0 +1,42 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. + * 16-02-2012 pagemap extension by Al Viro + * + */ + +#include "qnx6.h" + +struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + unsigned ino; + struct page *page; + struct inode *foundinode = NULL; + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + + if (len > QNX6_LONG_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + ino = qnx6_find_entry(len, dir, name, &page); + if (ino) { + foundinode = qnx6_iget(dir->i_sb, ino); + qnx6_put_page(page); + if (IS_ERR(foundinode)) { + pr_debug("lookup->iget -> error %ld\n", + PTR_ERR(foundinode)); + return ERR_CAST(foundinode); + } + } else { + pr_debug("%s(): not found %s\n", __func__, name); + return NULL; + } + d_add(dentry, foundinode); + return NULL; +} diff --git a/kernel/fs/qnx6/qnx6.h b/kernel/fs/qnx6/qnx6.h new file mode 100644 index 000000000..d3fb2b698 --- /dev/null +++ b/kernel/fs/qnx6/qnx6.h @@ -0,0 +1,135 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. + * 16-02-2012 page map extension by Al Viro + * + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +typedef __u16 __bitwise __fs16; +typedef __u32 __bitwise __fs32; +typedef __u64 __bitwise __fs64; + +#include + +struct qnx6_sb_info { + struct buffer_head *sb_buf; /* superblock buffer */ + struct qnx6_super_block *sb; /* our superblock */ + int s_blks_off; /* blkoffset fs-startpoint */ + int s_ptrbits; /* indirect pointer bitfield */ + unsigned long s_mount_opt; /* all mount options */ + int s_bytesex; /* holds endianess info */ + struct inode * inodes; + struct inode * longfile; +}; + +struct qnx6_inode_info { + __fs32 di_block_ptr[QNX6_NO_DIRECT_POINTERS]; + __u8 di_filelevels; + __u32 i_dir_start_lookup; + struct inode vfs_inode; +}; + +extern struct inode *qnx6_iget(struct super_block *sb, unsigned ino); +extern struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); + +#ifdef CONFIG_QNX6FS_DEBUG +extern void qnx6_superblock_debug(struct qnx6_super_block *, + struct super_block *); +#endif + +extern const struct inode_operations qnx6_dir_inode_operations; +extern const struct file_operations qnx6_dir_operations; + +static inline struct qnx6_sb_info *QNX6_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct qnx6_inode_info *QNX6_I(struct inode *inode) +{ + return container_of(inode, struct qnx6_inode_info, vfs_inode); +} + +#define clear_opt(o, opt) (o &= ~(QNX6_MOUNT_##opt)) +#define set_opt(o, opt) (o |= (QNX6_MOUNT_##opt)) +#define test_opt(sb, opt) (QNX6_SB(sb)->s_mount_opt & \ + QNX6_MOUNT_##opt) +enum { + BYTESEX_LE, + BYTESEX_BE, +}; + +static inline __u64 fs64_to_cpu(struct qnx6_sb_info *sbi, __fs64 n) +{ + if (sbi->s_bytesex == BYTESEX_LE) + return le64_to_cpu((__force __le64)n); + else + return be64_to_cpu((__force __be64)n); +} + +static inline __fs64 cpu_to_fs64(struct qnx6_sb_info *sbi, __u64 n) +{ + if (sbi->s_bytesex == BYTESEX_LE) + return (__force __fs64)cpu_to_le64(n); + else + return (__force __fs64)cpu_to_be64(n); +} + +static inline __u32 fs32_to_cpu(struct qnx6_sb_info *sbi, __fs32 n) +{ + if (sbi->s_bytesex == BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline __fs32 cpu_to_fs32(struct qnx6_sb_info *sbi, __u32 n) +{ + if (sbi->s_bytesex == BYTESEX_LE) + return (__force __fs32)cpu_to_le32(n); + else + return (__force __fs32)cpu_to_be32(n); +} + +static inline __u16 fs16_to_cpu(struct qnx6_sb_info *sbi, __fs16 n) +{ + if (sbi->s_bytesex == BYTESEX_LE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n) +{ + if (sbi->s_bytesex == BYTESEX_LE) + return (__force __fs16)cpu_to_le16(n); + else + return (__force __fs16)cpu_to_be16(n); +} + +extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, + int silent); + +static inline void qnx6_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, + struct page **res_page); diff --git a/kernel/fs/qnx6/super_mmi.c b/kernel/fs/qnx6/super_mmi.c new file mode 100644 index 000000000..62aaf3e31 --- /dev/null +++ b/kernel/fs/qnx6/super_mmi.c @@ -0,0 +1,148 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. + * + */ + +#include +#include +#include +#include "qnx6.h" + +static void qnx6_mmi_copy_sb(struct qnx6_super_block *qsb, + struct qnx6_mmi_super_block *sb) +{ + qsb->sb_magic = sb->sb_magic; + qsb->sb_checksum = sb->sb_checksum; + qsb->sb_serial = sb->sb_serial; + qsb->sb_blocksize = sb->sb_blocksize; + qsb->sb_num_inodes = sb->sb_num_inodes; + qsb->sb_free_inodes = sb->sb_free_inodes; + qsb->sb_num_blocks = sb->sb_num_blocks; + qsb->sb_free_blocks = sb->sb_free_blocks; + + /* the rest of the superblock is the same */ + memcpy(&qsb->Inode, &sb->Inode, sizeof(sb->Inode)); + memcpy(&qsb->Bitmap, &sb->Bitmap, sizeof(sb->Bitmap)); + memcpy(&qsb->Longfile, &sb->Longfile, sizeof(sb->Longfile)); +} + +struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) +{ + struct buffer_head *bh1, *bh2 = NULL; + struct qnx6_mmi_super_block *sb1, *sb2; + struct qnx6_super_block *qsb = NULL; + struct qnx6_sb_info *sbi; + __u64 offset; + + /* Check the superblock signatures + start with the first superblock */ + bh1 = sb_bread(s, 0); + if (!bh1) { + pr_err("Unable to read first mmi superblock\n"); + return NULL; + } + sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; + sbi = QNX6_SB(s); + if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) { + if (!silent) { + pr_err("wrong signature (magic) in superblock #1.\n"); + goto out; + } + } + + /* checksum check - start at byte 8 and end at byte 512 */ + if (fs32_to_cpu(sbi, sb1->sb_checksum) != + crc32_be(0, (char *)(bh1->b_data + 8), 504)) { + pr_err("superblock #1 checksum error\n"); + goto out; + } + + /* calculate second superblock blocknumber */ + offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + QNX6_SUPERBLOCK_AREA / + fs32_to_cpu(sbi, sb1->sb_blocksize); + + /* set new blocksize */ + if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { + pr_err("unable to set blocksize\n"); + goto out; + } + /* blocksize invalidates bh - pull it back in */ + brelse(bh1); + bh1 = sb_bread(s, 0); + if (!bh1) + goto out; + sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; + + /* read second superblock */ + bh2 = sb_bread(s, offset); + if (!bh2) { + pr_err("unable to read the second superblock\n"); + goto out; + } + sb2 = (struct qnx6_mmi_super_block *)bh2->b_data; + if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { + if (!silent) + pr_err("wrong signature (magic) in superblock #2.\n"); + goto out; + } + + /* checksum check - start at byte 8 and end at byte 512 */ + if (fs32_to_cpu(sbi, sb2->sb_checksum) + != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { + pr_err("superblock #1 checksum error\n"); + goto out; + } + + qsb = kmalloc(sizeof(*qsb), GFP_KERNEL); + if (!qsb) { + pr_err("unable to allocate memory.\n"); + goto out; + } + + if (fs64_to_cpu(sbi, sb1->sb_serial) > + fs64_to_cpu(sbi, sb2->sb_serial)) { + /* superblock #1 active */ + qnx6_mmi_copy_sb(qsb, sb1); +#ifdef CONFIG_QNX6FS_DEBUG + qnx6_superblock_debug(qsb, s); +#endif + memcpy(bh1->b_data, qsb, sizeof(struct qnx6_super_block)); + + sbi->sb_buf = bh1; + sbi->sb = (struct qnx6_super_block *)bh1->b_data; + brelse(bh2); + pr_info("superblock #1 active\n"); + } else { + /* superblock #2 active */ + qnx6_mmi_copy_sb(qsb, sb2); +#ifdef CONFIG_QNX6FS_DEBUG + qnx6_superblock_debug(qsb, s); +#endif + memcpy(bh2->b_data, qsb, sizeof(struct qnx6_super_block)); + + sbi->sb_buf = bh2; + sbi->sb = (struct qnx6_super_block *)bh2->b_data; + brelse(bh1); + pr_info("superblock #2 active\n"); + } + kfree(qsb); + + /* offset for mmi_fs is just SUPERBLOCK_AREA bytes */ + sbi->s_blks_off = QNX6_SUPERBLOCK_AREA / s->s_blocksize; + + /* success */ + return sbi->sb; + +out: + if (bh1 != NULL) + brelse(bh1); + if (bh2 != NULL) + brelse(bh2); + return NULL; +} diff --git a/kernel/fs/quota/Kconfig b/kernel/fs/quota/Kconfig new file mode 100644 index 000000000..4a09975aa --- /dev/null +++ b/kernel/fs/quota/Kconfig @@ -0,0 +1,76 @@ +# +# Quota configuration +# + +config QUOTA + bool "Quota support" + select QUOTACTL + select SRCU + help + If you say Y here, you will be able to set per user limits for disk + usage (also called disk quotas). Currently, it works for the + ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems. + Note that gfs2 and xfs use their own quota system. + Ext3, ext4 and reiserfs also support journaled quotas for which + you don't need to run quotacheck(8) after an unclean shutdown. + For further details, read the Quota mini-HOWTO, available from + , or the documentation provided + with the quota tools. Probably the quota support is only useful for + multi user systems. If unsure, say N. + +config QUOTA_NETLINK_INTERFACE + bool "Report quota messages through netlink interface" + depends on QUOTACTL && NET + help + If you say Y here, quota warnings (about exceeding softlimit, reaching + hardlimit, etc.) will be reported through netlink interface. If unsure, + say Y. + +config PRINT_QUOTA_WARNING + bool "Print quota warnings to console (OBSOLETE)" + depends on QUOTA + default y + help + If you say Y here, quota warnings (about exceeding softlimit, reaching + hardlimit, etc.) will be printed to the process' controlling terminal. + Note that this behavior is currently deprecated and may go away in + future. Please use notification via netlink socket instead. + +config QUOTA_DEBUG + bool "Additional quota sanity checks" + depends on QUOTA + default n + help + If you say Y here, quota subsystem will perform some additional + sanity checks of quota internal structures. If unsure, say N. + +# Generic support for tree structured quota files. Selected when needed. +config QUOTA_TREE + tristate + +config QFMT_V1 + tristate "Old quota format support" + depends on QUOTA + help + This quota format was (is) used by kernels earlier than 2.4.22. If + you have quota working and you don't want to convert to new quota + format say Y here. + +config QFMT_V2 + tristate "Quota format vfsv0 and vfsv1 support" + depends on QUOTA + select QUOTA_TREE + help + This config option enables kernel support for vfsv0 and vfsv1 quota + formats. Both these formats support 32-bit UIDs/GIDs and vfsv1 format + also supports 64-bit inode and block quota limits. If you need this + functionality say Y here. + +config QUOTACTL + bool + default n + +config QUOTACTL_COMPAT + bool + depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT + default y diff --git a/kernel/fs/quota/Makefile b/kernel/fs/quota/Makefile new file mode 100644 index 000000000..c66c37cda --- /dev/null +++ b/kernel/fs/quota/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_QUOTA) += dquot.o +obj-$(CONFIG_QFMT_V1) += quota_v1.o +obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-$(CONFIG_QUOTA_TREE) += quota_tree.o +obj-$(CONFIG_QUOTACTL) += quota.o kqid.o +obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o +obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/kernel/fs/quota/compat.c b/kernel/fs/quota/compat.c new file mode 100644 index 000000000..fb1892fe3 --- /dev/null +++ b/kernel/fs/quota/compat.c @@ -0,0 +1,118 @@ + +#include +#include +#include + +/* + * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) + * and is necessary due to alignment problems. + */ +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +/* XFS structures */ +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; + +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + unsigned int cmds; + struct if_dqblk __user *dqblk; + struct compat_if_dqblk __user *compat_dqblk; + struct fs_quota_stat __user *fsqstat; + struct compat_fs_quota_stat __user *compat_fsqstat; + compat_uint_t data; + u16 xdata; + long ret; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = sys_quotactl(cmd, special, id, dqblk); + if (ret) + break; + if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || + get_user(data, &dqblk->dqb_valid) || + put_user(data, &compat_dqblk->dqb_valid)) + ret = -EFAULT; + break; + case Q_SETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = -EFAULT; + if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(data, &compat_dqblk->dqb_valid) || + put_user(data, &dqblk->dqb_valid)) + break; + ret = sys_quotactl(cmd, special, id, dqblk); + break; + case Q_XGETQSTAT: + fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); + compat_fsqstat = addr; + ret = sys_quotactl(cmd, special, id, fsqstat); + if (ret) + break; + ret = -EFAULT; + /* Copying qs_version, qs_flags, qs_pad */ + if (copy_in_user(compat_fsqstat, fsqstat, + offsetof(struct compat_fs_quota_stat, qs_uquota))) + break; + /* Copying qs_uquota */ + if (copy_in_user(&compat_fsqstat->qs_uquota, + &fsqstat->qs_uquota, + sizeof(compat_fsqstat->qs_uquota)) || + get_user(data, &fsqstat->qs_uquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) + break; + /* Copying qs_gquota */ + if (copy_in_user(&compat_fsqstat->qs_gquota, + &fsqstat->qs_gquota, + sizeof(compat_fsqstat->qs_gquota)) || + get_user(data, &fsqstat->qs_gquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) + break; + /* Copying the rest */ + if (copy_in_user(&compat_fsqstat->qs_incoredqs, + &fsqstat->qs_incoredqs, + sizeof(struct compat_fs_quota_stat) - + offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || + get_user(xdata, &fsqstat->qs_iwarnlimit) || + put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) + break; + ret = 0; + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + } + return ret; +} diff --git a/kernel/fs/quota/dquot.c b/kernel/fs/quota/dquot.c new file mode 100644 index 000000000..20d1f7456 --- /dev/null +++ b/kernel/fs/quota/dquot.c @@ -0,0 +1,2889 @@ +/* + * Implementation of the diskquota system for the LINUX operating system. QUOTA + * is implemented using the BSD system call interface as the means of + * communication with the user level. This file contains the generic routines + * called by the different filesystems on allocation of an inode or block. + * These routines take care of the administration needed to have a consistent + * diskquota tracking system. The ideas of both user and group quotas are based + * on the Melbourne quota system as used on BSD derived systems. The internal + * implementation is based on one of the several variants of the LINUX + * inode-subsystem with added complexity of the diskquota system. + * + * Author: Marco van Wieringen + * + * Fixes: Dmitry Gorodchanin , 11 Feb 96 + * + * Revised list management to avoid races + * -- Bill Hawes, , 9/98 + * + * Fixed races in dquot_transfer(), dqget() and dquot_alloc_...(). + * As the consequence the locking was moved from dquot_decr_...(), + * dquot_incr_...() to calling functions. + * invalidate_dquots() now writes modified dquots. + * Serialized quota_off() and quota_on() for mount point. + * Fixed a few bugs in grow_dquots(). + * Fixed deadlock in write_dquot() - we no longer account quotas on + * quota files + * remove_dquot_ref() moved to inode.c - it now traverses through inodes + * add_dquot_ref() restarts after blocking + * Added check for bogus uid and fixed check for group in quotactl. + * Jan Kara, , sponsored by SuSE CR, 10-11/99 + * + * Used struct list_head instead of own list struct + * Invalidation of referenced dquots is no longer possible + * Improved free_dquots list management + * Quota and i_blocks are now updated in one place to avoid races + * Warnings are now delayed so we won't block in critical section + * Write updated not to require dquot lock + * Jan Kara, , 9/2000 + * + * Added dynamic quota structure allocation + * Jan Kara 12/2000 + * + * Rewritten quota interface. Implemented new quota format and + * formats registering. + * Jan Kara, , 2001,2002 + * + * New SMP locking. + * Jan Kara, , 10/2002 + * + * Added journalled quota support, fix lock inversion problems + * Jan Kara, , 2003,2004 + * + * (C) Copyright 1994 - 1997 Marco van Wieringen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../internal.h" /* ugh */ + +#include + +/* + * There are three quota SMP locks. dq_list_lock protects all lists with quotas + * and quota formats. + * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and + * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. + * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly + * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects + * modifications of quota state (on quotaon and quotaoff) and readers who care + * about latest values take it as well. + * + * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * dq_list_lock > dq_state_lock + * + * Note that some things (eg. sb pointer, type, id) doesn't change during + * the life of the dquot structure and so needn't to be protected by a lock + * + * Operation accessing dquots via inode pointers are protected by dquot_srcu. + * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and + * synchronize_srcu(&dquot_srcu) is called after clearing pointers from + * inode and before dropping dquot references to avoid use of dquots after + * they are freed. dq_data_lock is used to serialize the pointer setting and + * clearing operations. + * Special care needs to be taken about S_NOQUOTA inode flag (marking that + * inode is a quota file). Functions adding pointers from inode to dquots have + * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dq_data_lock. This makes + * sure they cannot race with quotaon which first sets S_NOQUOTA flag and + * then drops all pointers to dquots from an inode. + * + * Each dquot has its dq_lock mutex. Locked dquots might not be referenced + * from inodes (dquot_alloc_space() and such don't check the dq_lock). + * Currently dquot is locked only when it is being read to memory (or space for + * it is being allocated) on the first dqget() and when it is being released on + * the last dqput(). The allocation and release oparations are serialized by + * the dq_lock and by checking the use count in dquot_release(). Write + * operations on dquots don't hold dq_lock as they copy data under dq_data_lock + * spinlock to internal buffers before writing. + * + * Lock ordering (including related VFS locks) is the following: + * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex + * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. + */ + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); +EXPORT_SYMBOL(dq_data_lock); +DEFINE_STATIC_SRCU(dquot_srcu); + +void __quota_error(struct super_block *sb, const char *func, + const char *fmt, ...) +{ + if (printk_ratelimit()) { + va_list args; + struct va_format vaf; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "Quota error (device %s): %s: %pV\n", + sb->s_id, func, &vaf); + + va_end(args); + } +} +EXPORT_SYMBOL(__quota_error); + +#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) +static char *quotatypes[] = INITQFNAMES; +#endif +static struct quota_format_type *quota_formats; /* List of registered formats */ +static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; + +/* SLAB cache for dquot structures */ +static struct kmem_cache *dquot_cachep; + +int register_quota_format(struct quota_format_type *fmt) +{ + spin_lock(&dq_list_lock); + fmt->qf_next = quota_formats; + quota_formats = fmt; + spin_unlock(&dq_list_lock); + return 0; +} +EXPORT_SYMBOL(register_quota_format); + +void unregister_quota_format(struct quota_format_type *fmt) +{ + struct quota_format_type **actqf; + + spin_lock(&dq_list_lock); + for (actqf = "a_formats; *actqf && *actqf != fmt; + actqf = &(*actqf)->qf_next) + ; + if (*actqf) + *actqf = (*actqf)->qf_next; + spin_unlock(&dq_list_lock); +} +EXPORT_SYMBOL(unregister_quota_format); + +static struct quota_format_type *find_quota_format(int id) +{ + struct quota_format_type *actqf; + + spin_lock(&dq_list_lock); + for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; + actqf = actqf->qf_next) + ; + if (!actqf || !try_module_get(actqf->qf_owner)) { + int qm; + + spin_unlock(&dq_list_lock); + + for (qm = 0; module_names[qm].qm_fmt_id && + module_names[qm].qm_fmt_id != id; qm++) + ; + if (!module_names[qm].qm_fmt_id || + request_module(module_names[qm].qm_mod_name)) + return NULL; + + spin_lock(&dq_list_lock); + for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; + actqf = actqf->qf_next) + ; + if (actqf && !try_module_get(actqf->qf_owner)) + actqf = NULL; + } + spin_unlock(&dq_list_lock); + return actqf; +} + +static void put_quota_format(struct quota_format_type *fmt) +{ + module_put(fmt->qf_owner); +} + +/* + * Dquot List Management: + * The quota code uses three lists for dquot management: the inuse_list, + * free_dquots, and dquot_hash[] array. A single dquot structure may be + * on all three lists, depending on its current state. + * + * All dquots are placed to the end of inuse_list when first created, and this + * list is used for invalidate operation, which must look at every dquot. + * + * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, + * and this list is searched whenever we need an available dquot. Dquots are + * removed from the list as soon as they are used again, and + * dqstats.free_dquots gives the number of dquots on the list. When + * dquot is invalidated it's completely released from memory. + * + * Dquots with a specific identity (device, type and id) are placed on + * one of the dquot_hash[] hash chains. The provides an efficient search + * mechanism to locate a specific dquot. + */ + +static LIST_HEAD(inuse_list); +static LIST_HEAD(free_dquots); +static unsigned int dq_hash_bits, dq_hash_mask; +static struct hlist_head *dquot_hash; + +struct dqstats dqstats; +EXPORT_SYMBOL(dqstats); + +static qsize_t inode_get_rsv_space(struct inode *inode); +static void __dquot_initialize(struct inode *inode, int type); + +static inline unsigned int +hashfn(const struct super_block *sb, struct kqid qid) +{ + unsigned int id = from_kqid(&init_user_ns, qid); + int type = qid.type; + unsigned long tmp; + + tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type); + return (tmp + (tmp >> dq_hash_bits)) & dq_hash_mask; +} + +/* + * Following list functions expect dq_list_lock to be held + */ +static inline void insert_dquot_hash(struct dquot *dquot) +{ + struct hlist_head *head; + head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id); + hlist_add_head(&dquot->dq_hash, head); +} + +static inline void remove_dquot_hash(struct dquot *dquot) +{ + hlist_del_init(&dquot->dq_hash); +} + +static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, + struct kqid qid) +{ + struct hlist_node *node; + struct dquot *dquot; + + hlist_for_each (node, dquot_hash+hashent) { + dquot = hlist_entry(node, struct dquot, dq_hash); + if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid)) + return dquot; + } + return NULL; +} + +/* Add a dquot to the tail of the free list */ +static inline void put_dquot_last(struct dquot *dquot) +{ + list_add_tail(&dquot->dq_free, &free_dquots); + dqstats_inc(DQST_FREE_DQUOTS); +} + +static inline void remove_free_dquot(struct dquot *dquot) +{ + if (list_empty(&dquot->dq_free)) + return; + list_del_init(&dquot->dq_free); + dqstats_dec(DQST_FREE_DQUOTS); +} + +static inline void put_inuse(struct dquot *dquot) +{ + /* We add to the back of inuse list so we don't have to restart + * when traversing this list and we block */ + list_add_tail(&dquot->dq_inuse, &inuse_list); + dqstats_inc(DQST_ALLOC_DQUOTS); +} + +static inline void remove_inuse(struct dquot *dquot) +{ + dqstats_dec(DQST_ALLOC_DQUOTS); + list_del(&dquot->dq_inuse); +} +/* + * End of list functions needing dq_list_lock + */ + +static void wait_on_dquot(struct dquot *dquot) +{ + mutex_lock(&dquot->dq_lock); + mutex_unlock(&dquot->dq_lock); +} + +static inline int dquot_dirty(struct dquot *dquot) +{ + return test_bit(DQ_MOD_B, &dquot->dq_flags); +} + +static inline int mark_dquot_dirty(struct dquot *dquot) +{ + return dquot->dq_sb->dq_op->mark_dirty(dquot); +} + +/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */ +int dquot_mark_dquot_dirty(struct dquot *dquot) +{ + int ret = 1; + + /* If quota is dirty already, we don't have to acquire dq_list_lock */ + if (test_bit(DQ_MOD_B, &dquot->dq_flags)) + return 1; + + spin_lock(&dq_list_lock); + if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { + list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> + info[dquot->dq_id.type].dqi_dirty_list); + ret = 0; + } + spin_unlock(&dq_list_lock); + return ret; +} +EXPORT_SYMBOL(dquot_mark_dquot_dirty); + +/* Dirtify all the dquots - this can block when journalling */ +static inline int mark_all_dquot_dirty(struct dquot * const *dquot) +{ + int ret, err, cnt; + + ret = err = 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquot[cnt]) + /* Even in case of error we have to continue */ + ret = mark_dquot_dirty(dquot[cnt]); + if (!err) + err = ret; + } + return err; +} + +static inline void dqput_all(struct dquot **dquot) +{ + unsigned int cnt; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(dquot[cnt]); +} + +/* This function needs dq_list_lock */ +static inline int clear_dquot_dirty(struct dquot *dquot) +{ + if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) + return 0; + list_del_init(&dquot->dq_dirty); + return 1; +} + +void mark_info_dirty(struct super_block *sb, int type) +{ + set_bit(DQF_INFO_DIRTY_B, &sb_dqopt(sb)->info[type].dqi_flags); +} +EXPORT_SYMBOL(mark_info_dirty); + +/* + * Read dquot from disk and alloc space for it + */ + +int dquot_acquire(struct dquot *dquot) +{ + int ret = 0, ret2 = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dquot->dq_lock); + mutex_lock(&dqopt->dqio_mutex); + if (!test_bit(DQ_READ_B, &dquot->dq_flags)) + ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); + if (ret < 0) + goto out_iolock; + set_bit(DQ_READ_B, &dquot->dq_flags); + /* Instantiate dquot if needed */ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { + ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); + /* Write the info if needed */ + if (info_dirty(&dqopt->info[dquot->dq_id.type])) { + ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( + dquot->dq_sb, dquot->dq_id.type); + } + if (ret < 0) + goto out_iolock; + if (ret2 < 0) { + ret = ret2; + goto out_iolock; + } + } + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_iolock: + mutex_unlock(&dqopt->dqio_mutex); + mutex_unlock(&dquot->dq_lock); + return ret; +} +EXPORT_SYMBOL(dquot_acquire); + +/* + * Write dquot to disk + */ +int dquot_commit(struct dquot *dquot) +{ + int ret = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dqopt->dqio_mutex); + spin_lock(&dq_list_lock); + if (!clear_dquot_dirty(dquot)) { + spin_unlock(&dq_list_lock); + goto out_sem; + } + spin_unlock(&dq_list_lock); + /* Inactive dquot can be only if there was error during read/init + * => we have better not writing it */ + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); + else + ret = -EIO; +out_sem: + mutex_unlock(&dqopt->dqio_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_commit); + +/* + * Release dquot + */ +int dquot_release(struct dquot *dquot) +{ + int ret = 0, ret2 = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (atomic_read(&dquot->dq_count) > 1) + goto out_dqlock; + mutex_lock(&dqopt->dqio_mutex); + if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { + ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); + /* Write the info */ + if (info_dirty(&dqopt->info[dquot->dq_id.type])) { + ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( + dquot->dq_sb, dquot->dq_id.type); + } + if (ret >= 0) + ret = ret2; + } + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); + mutex_unlock(&dqopt->dqio_mutex); +out_dqlock: + mutex_unlock(&dquot->dq_lock); + return ret; +} +EXPORT_SYMBOL(dquot_release); + +void dquot_destroy(struct dquot *dquot) +{ + kmem_cache_free(dquot_cachep, dquot); +} +EXPORT_SYMBOL(dquot_destroy); + +static inline void do_destroy_dquot(struct dquot *dquot) +{ + dquot->dq_sb->dq_op->destroy_dquot(dquot); +} + +/* Invalidate all dquots on the list. Note that this function is called after + * quota is disabled and pointers from inodes removed so there cannot be new + * quota users. There can still be some users of quotas due to inodes being + * just deleted or pruned by prune_icache() (those are not attached to any + * list) or parallel quotactl call. We have to wait for such users. + */ +static void invalidate_dquots(struct super_block *sb, int type) +{ + struct dquot *dquot, *tmp; + +restart: + spin_lock(&dq_list_lock); + list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { + if (dquot->dq_sb != sb) + continue; + if (dquot->dq_id.type != type) + continue; + /* Wait for dquot users */ + if (atomic_read(&dquot->dq_count)) { + DEFINE_WAIT(wait); + + dqgrab(dquot); + prepare_to_wait(&dquot->dq_wait_unused, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&dq_list_lock); + /* Once dqput() wakes us up, we know it's time to free + * the dquot. + * IMPORTANT: we rely on the fact that there is always + * at most one process waiting for dquot to free. + * Otherwise dq_count would be > 1 and we would never + * wake up. + */ + if (atomic_read(&dquot->dq_count) > 1) + schedule(); + finish_wait(&dquot->dq_wait_unused, &wait); + dqput(dquot); + /* At this moment dquot() need not exist (it could be + * reclaimed by prune_dqcache(). Hence we must + * restart. */ + goto restart; + } + /* + * Quota now has no users and it has been written on last + * dqput() + */ + remove_dquot_hash(dquot); + remove_free_dquot(dquot); + remove_inuse(dquot); + do_destroy_dquot(dquot); + } + spin_unlock(&dq_list_lock); +} + +/* Call callback for every active dquot on given filesystem */ +int dquot_scan_active(struct super_block *sb, + int (*fn)(struct dquot *dquot, unsigned long priv), + unsigned long priv) +{ + struct dquot *dquot, *old_dquot = NULL; + int ret = 0; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + spin_lock(&dq_list_lock); + list_for_each_entry(dquot, &inuse_list, dq_inuse) { + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + continue; + if (dquot->dq_sb != sb) + continue; + /* Now we have active dquot so we can just increase use count */ + atomic_inc(&dquot->dq_count); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + dqput(old_dquot); + old_dquot = dquot; + /* + * ->release_dquot() can be racing with us. Our reference + * protects us from new calls to it so just wait for any + * outstanding call and recheck the DQ_ACTIVE_B after that. + */ + wait_on_dquot(dquot); + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + ret = fn(dquot, priv); + if (ret < 0) + goto out; + } + spin_lock(&dq_list_lock); + /* We are safe to continue now because our dquot could not + * be moved out of the inuse list while we hold the reference */ + } + spin_unlock(&dq_list_lock); +out: + dqput(old_dquot); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_scan_active); + +/* Write all dquot structures to quota files */ +int dquot_writeback_dquots(struct super_block *sb, int type) +{ + struct list_head *dirty; + struct dquot *dquot; + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int err, ret = 0; + + mutex_lock(&dqopt->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + spin_lock(&dq_list_lock); + dirty = &dqopt->info[cnt].dqi_dirty_list; + while (!list_empty(dirty)) { + dquot = list_first_entry(dirty, struct dquot, + dq_dirty); + /* Dirty and inactive can be only bad dquot... */ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + clear_dquot_dirty(dquot); + continue; + } + /* Now we have active dquot from which someone is + * holding reference so we can safely just increase + * use count */ + dqgrab(dquot); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + err = sb->dq_op->write_dquot(dquot); + if (!ret && err) + ret = err; + dqput(dquot); + spin_lock(&dq_list_lock); + } + spin_unlock(&dq_list_lock); + } + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) + && info_dirty(&dqopt->info[cnt])) + sb->dq_op->write_info(sb, cnt); + dqstats_inc(DQST_SYNCS); + mutex_unlock(&dqopt->dqonoff_mutex); + + return ret; +} +EXPORT_SYMBOL(dquot_writeback_dquots); + +/* Write all dquot structures to disk and make them visible from userspace */ +int dquot_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + return 0; + + /* This is not very clever (and fast) but currently I don't know about + * any other simple way of getting quota data to disk and we must get + * them there for userspace to be visible... */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + mutex_lock(&dqopt->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + mutex_lock(&dqopt->files[cnt]->i_mutex); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + mutex_unlock(&dqopt->files[cnt]->i_mutex); + } + mutex_unlock(&dqopt->dqonoff_mutex); + + return 0; +} +EXPORT_SYMBOL(dquot_quota_sync); + +static unsigned long +dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + struct list_head *head; + struct dquot *dquot; + unsigned long freed = 0; + + spin_lock(&dq_list_lock); + head = free_dquots.prev; + while (head != &free_dquots && sc->nr_to_scan) { + dquot = list_entry(head, struct dquot, dq_free); + remove_dquot_hash(dquot); + remove_free_dquot(dquot); + remove_inuse(dquot); + do_destroy_dquot(dquot); + sc->nr_to_scan--; + freed++; + head = free_dquots.prev; + } + spin_unlock(&dq_list_lock); + return freed; +} + +static unsigned long +dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + return vfs_pressure_ratio( + percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); +} + +static struct shrinker dqcache_shrinker = { + .count_objects = dqcache_shrink_count, + .scan_objects = dqcache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + +/* + * Put reference to dquot + */ +void dqput(struct dquot *dquot) +{ + int ret; + + if (!dquot) + return; +#ifdef CONFIG_QUOTA_DEBUG + if (!atomic_read(&dquot->dq_count)) { + quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", + quotatypes[dquot->dq_id.type], + from_kqid(&init_user_ns, dquot->dq_id)); + BUG(); + } +#endif + dqstats_inc(DQST_DROPS); +we_slept: + spin_lock(&dq_list_lock); + if (atomic_read(&dquot->dq_count) > 1) { + /* We have more than one user... nothing to do */ + atomic_dec(&dquot->dq_count); + /* Releasing dquot during quotaoff phase? */ + if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && + atomic_read(&dquot->dq_count) == 1) + wake_up(&dquot->dq_wait_unused); + spin_unlock(&dq_list_lock); + return; + } + /* Need to release dquot? */ + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { + spin_unlock(&dq_list_lock); + /* Commit dquot before releasing */ + ret = dquot->dq_sb->dq_op->write_dquot(dquot); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't write quota structure" + " (error %d). Quota may get out of sync!", + ret); + /* + * We clear dirty bit anyway, so that we avoid + * infinite loop here + */ + spin_lock(&dq_list_lock); + clear_dquot_dirty(dquot); + spin_unlock(&dq_list_lock); + } + goto we_slept; + } + /* Clear flag in case dquot was inactive (something bad happened) */ + clear_dquot_dirty(dquot); + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + dquot->dq_sb->dq_op->release_dquot(dquot); + goto we_slept; + } + atomic_dec(&dquot->dq_count); +#ifdef CONFIG_QUOTA_DEBUG + /* sanity check */ + BUG_ON(!list_empty(&dquot->dq_free)); +#endif + put_dquot_last(dquot); + spin_unlock(&dq_list_lock); +} +EXPORT_SYMBOL(dqput); + +struct dquot *dquot_alloc(struct super_block *sb, int type) +{ + return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); +} +EXPORT_SYMBOL(dquot_alloc); + +static struct dquot *get_empty_dquot(struct super_block *sb, int type) +{ + struct dquot *dquot; + + dquot = sb->dq_op->alloc_dquot(sb, type); + if(!dquot) + return NULL; + + mutex_init(&dquot->dq_lock); + INIT_LIST_HEAD(&dquot->dq_free); + INIT_LIST_HEAD(&dquot->dq_inuse); + INIT_HLIST_NODE(&dquot->dq_hash); + INIT_LIST_HEAD(&dquot->dq_dirty); + init_waitqueue_head(&dquot->dq_wait_unused); + dquot->dq_sb = sb; + dquot->dq_id = make_kqid_invalid(type); + atomic_set(&dquot->dq_count, 1); + + return dquot; +} + +/* + * Get reference to dquot + * + * Locking is slightly tricky here. We are guarded from parallel quotaoff() + * destroying our dquot by: + * a) checking for quota flags under dq_list_lock and + * b) getting a reference to dquot before we release dq_list_lock + */ +struct dquot *dqget(struct super_block *sb, struct kqid qid) +{ + unsigned int hashent = hashfn(sb, qid); + struct dquot *dquot = NULL, *empty = NULL; + + if (!sb_has_quota_active(sb, qid.type)) + return NULL; +we_slept: + spin_lock(&dq_list_lock); + spin_lock(&dq_state_lock); + if (!sb_has_quota_active(sb, qid.type)) { + spin_unlock(&dq_state_lock); + spin_unlock(&dq_list_lock); + goto out; + } + spin_unlock(&dq_state_lock); + + dquot = find_dquot(hashent, sb, qid); + if (!dquot) { + if (!empty) { + spin_unlock(&dq_list_lock); + empty = get_empty_dquot(sb, qid.type); + if (!empty) + schedule(); /* Try to wait for a moment... */ + goto we_slept; + } + dquot = empty; + empty = NULL; + dquot->dq_id = qid; + /* all dquots go on the inuse_list */ + put_inuse(dquot); + /* hash it first so it can be found */ + insert_dquot_hash(dquot); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + } else { + if (!atomic_read(&dquot->dq_count)) + remove_free_dquot(dquot); + atomic_inc(&dquot->dq_count); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_CACHE_HITS); + dqstats_inc(DQST_LOOKUPS); + } + /* Wait for dq_lock - after this we know that either dquot_release() is + * already finished or it will be canceled due to dq_count > 1 test */ + wait_on_dquot(dquot); + /* Read the dquot / allocate space in quota file */ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && + sb->dq_op->acquire_dquot(dquot) < 0) { + dqput(dquot); + dquot = NULL; + goto out; + } +#ifdef CONFIG_QUOTA_DEBUG + BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ +#endif +out: + if (empty) + do_destroy_dquot(empty); + + return dquot; +} +EXPORT_SYMBOL(dqget); + +static inline struct dquot **i_dquot(struct inode *inode) +{ + return inode->i_sb->s_op->get_dquots(inode); +} + +static int dqinit_needed(struct inode *inode, int type) +{ + struct dquot * const *dquots; + int cnt; + + if (IS_NOQUOTA(inode)) + return 0; + + dquots = i_dquot(inode); + if (type != -1) + return !dquots[type]; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (!dquots[cnt]) + return 1; + return 0; +} + +/* This routine is guarded by dqonoff_mutex mutex */ +static void add_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode, *old_inode = NULL; +#ifdef CONFIG_QUOTA_DEBUG + int reserved = 0; +#endif + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + !atomic_read(&inode->i_writecount) || + !dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + +#ifdef CONFIG_QUOTA_DEBUG + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; +#endif + iput(old_inode); + __dquot_initialize(inode, type); + + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock We cannot iput the inode now as we can be + * holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ + old_inode = inode; + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(old_inode); + +#ifdef CONFIG_QUOTA_DEBUG + if (reserved) { + quota_error(sb, "Writes happened before quota was turned on " + "thus quota information is probably inconsistent. " + "Please run quotacheck(8)"); + } +#endif +} + +/* + * Remove references to dquots from inode and add dquot to list for freeing + * if we have the last reference to dquot + */ +static void remove_inode_dquot_ref(struct inode *inode, int type, + struct list_head *tofree_head) +{ + struct dquot **dquots = i_dquot(inode); + struct dquot *dquot = dquots[type]; + + if (!dquot) + return; + + dquots[type] = NULL; + if (list_empty(&dquot->dq_free)) { + /* + * The inode still has reference to dquot so it can't be in the + * free list + */ + spin_lock(&dq_list_lock); + list_add(&dquot->dq_free, tofree_head); + spin_unlock(&dq_list_lock); + } else { + /* + * Dquot is already in a list to put so we won't drop the last + * reference here. + */ + dqput(dquot); + } +} + +/* + * Free list of dquots + * Dquots are removed from inodes and no new references can be got so we are + * the only ones holding reference + */ +static void put_dquot_list(struct list_head *tofree_head) +{ + struct list_head *act_head; + struct dquot *dquot; + + act_head = tofree_head->next; + while (act_head != tofree_head) { + dquot = list_entry(act_head, struct dquot, dq_free); + act_head = act_head->next; + /* Remove dquot from the list so we won't have problems... */ + list_del_init(&dquot->dq_free); + dqput(dquot); + } +} + +static void remove_dquot_ref(struct super_block *sb, int type, + struct list_head *tofree_head) +{ + struct inode *inode; + int reserved = 0; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + /* + * We have to scan also I_NEW inodes because they can already + * have quota pointer initialized. Luckily, we need to touch + * only quota pointers and these have separate locking + * (dq_data_lock). + */ + spin_lock(&dq_data_lock); + if (!IS_NOQUOTA(inode)) { + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; + remove_inode_dquot_ref(inode, type, tofree_head); + } + spin_unlock(&dq_data_lock); + } + spin_unlock(&inode_sb_list_lock); +#ifdef CONFIG_QUOTA_DEBUG + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened after quota" + " was disabled thus quota information is probably " + "inconsistent. Please run quotacheck(8).\n", sb->s_id); + } +#endif +} + +/* Gather all references from inodes and drop them */ +static void drop_dquot_ref(struct super_block *sb, int type) +{ + LIST_HEAD(tofree_head); + + if (sb->dq_op) { + remove_dquot_ref(sb, type, &tofree_head); + synchronize_srcu(&dquot_srcu); + put_dquot_list(&tofree_head); + } +} + +static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_curinodes += number; +} + +static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_curspace += number; +} + +static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_rsvspace += number; +} + +/* + * Claim reserved quota space + */ +static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (dquot->dq_dqb.dqb_rsvspace < number) { + WARN_ON_ONCE(1); + number = dquot->dq_dqb.dqb_rsvspace; + } + dquot->dq_dqb.dqb_curspace += number; + dquot->dq_dqb.dqb_rsvspace -= number; +} + +static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) + number = dquot->dq_dqb.dqb_curspace; + dquot->dq_dqb.dqb_rsvspace += number; + dquot->dq_dqb.dqb_curspace -= number; +} + +static inline +void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (dquot->dq_dqb.dqb_rsvspace >= number) + dquot->dq_dqb.dqb_rsvspace -= number; + else { + WARN_ON_ONCE(1); + dquot->dq_dqb.dqb_rsvspace = 0; + } +} + +static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) +{ + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curinodes >= number) + dquot->dq_dqb.dqb_curinodes -= number; + else + dquot->dq_dqb.dqb_curinodes = 0; + if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) + dquot->dq_dqb.dqb_itime = (time_t) 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); +} + +static void dquot_decr_space(struct dquot *dquot, qsize_t number) +{ + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curspace >= number) + dquot->dq_dqb.dqb_curspace -= number; + else + dquot->dq_dqb.dqb_curspace = 0; + if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_btime = (time_t) 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); +} + +struct dquot_warn { + struct super_block *w_sb; + struct kqid w_dq_id; + short w_type; +}; + +static int warning_issued(struct dquot *dquot, const int warntype) +{ + int flag = (warntype == QUOTA_NL_BHARDWARN || + warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B : + ((warntype == QUOTA_NL_IHARDWARN || + warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0); + + if (!flag) + return 0; + return test_and_set_bit(flag, &dquot->dq_flags); +} + +#ifdef CONFIG_PRINT_QUOTA_WARNING +static int flag_print_warnings = 1; + +static int need_print_warning(struct dquot_warn *warn) +{ + if (!flag_print_warnings) + return 0; + + switch (warn->w_dq_id.type) { + case USRQUOTA: + return uid_eq(current_fsuid(), warn->w_dq_id.uid); + case GRPQUOTA: + return in_group_p(warn->w_dq_id.gid); + case PRJQUOTA: + return 1; + } + return 0; +} + +/* Print warning to user which exceeded quota */ +static void print_warning(struct dquot_warn *warn) +{ + char *msg = NULL; + struct tty_struct *tty; + int warntype = warn->w_type; + + if (warntype == QUOTA_NL_IHARDBELOW || + warntype == QUOTA_NL_ISOFTBELOW || + warntype == QUOTA_NL_BHARDBELOW || + warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn)) + return; + + tty = get_current_tty(); + if (!tty) + return; + tty_write_message(tty, warn->w_sb->s_id); + if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) + tty_write_message(tty, ": warning, "); + else + tty_write_message(tty, ": write failed, "); + tty_write_message(tty, quotatypes[warn->w_dq_id.type]); + switch (warntype) { + case QUOTA_NL_IHARDWARN: + msg = " file limit reached.\r\n"; + break; + case QUOTA_NL_ISOFTLONGWARN: + msg = " file quota exceeded too long.\r\n"; + break; + case QUOTA_NL_ISOFTWARN: + msg = " file quota exceeded.\r\n"; + break; + case QUOTA_NL_BHARDWARN: + msg = " block limit reached.\r\n"; + break; + case QUOTA_NL_BSOFTLONGWARN: + msg = " block quota exceeded too long.\r\n"; + break; + case QUOTA_NL_BSOFTWARN: + msg = " block quota exceeded.\r\n"; + break; + } + tty_write_message(tty, msg); + tty_kref_put(tty); +} +#endif + +static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, + int warntype) +{ + if (warning_issued(dquot, warntype)) + return; + warn->w_type = warntype; + warn->w_sb = dquot->dq_sb; + warn->w_dq_id = dquot->dq_id; +} + +/* + * Write warnings to the console and send warning messages over netlink. + * + * Note that this function can call into tty and networking code. + */ +static void flush_warnings(struct dquot_warn *warn) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (warn[i].w_type == QUOTA_NL_NOWARN) + continue; +#ifdef CONFIG_PRINT_QUOTA_WARNING + print_warning(&warn[i]); +#endif + quota_send_warning(warn[i].w_dq_id, + warn[i].w_sb->s_dev, warn[i].w_type); + } +} + +static int ignore_hardlimit(struct dquot *dquot) +{ + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || + !(info->dqi_flags & DQF_ROOT_SQUASH)); +} + +/* needs dq_data_lock */ +static int check_idq(struct dquot *dquot, qsize_t inodes, + struct dquot_warn *warn) +{ + qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; + + if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) + return 0; + + if (dquot->dq_dqb.dqb_ihardlimit && + newinodes > dquot->dq_dqb.dqb_ihardlimit && + !ignore_hardlimit(dquot)) { + prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_isoftlimit && + newinodes > dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_itime && + get_seconds() >= dquot->dq_dqb.dqb_itime && + !ignore_hardlimit(dquot)) { + prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_isoftlimit && + newinodes > dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_itime == 0) { + prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); + dquot->dq_dqb.dqb_itime = get_seconds() + + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; + } + + return 0; +} + +/* needs dq_data_lock */ +static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, + struct dquot_warn *warn) +{ + qsize_t tspace; + struct super_block *sb = dquot->dq_sb; + + if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) + return 0; + + tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + + space; + + if (dquot->dq_dqb.dqb_bhardlimit && + tspace > dquot->dq_dqb.dqb_bhardlimit && + !ignore_hardlimit(dquot)) { + if (!prealloc) + prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_bsoftlimit && + tspace > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_btime && + get_seconds() >= dquot->dq_dqb.dqb_btime && + !ignore_hardlimit(dquot)) { + if (!prealloc) + prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_bsoftlimit && + tspace > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_btime == 0) { + if (!prealloc) { + prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); + dquot->dq_dqb.dqb_btime = get_seconds() + + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; + } + else + /* + * We don't allow preallocation to exceed softlimit so exceeding will + * be always printed + */ + return -EDQUOT; + } + + return 0; +} + +static int info_idq_free(struct dquot *dquot, qsize_t inodes) +{ + qsize_t newinodes; + + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || + !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type)) + return QUOTA_NL_NOWARN; + + newinodes = dquot->dq_dqb.dqb_curinodes - inodes; + if (newinodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_ISOFTBELOW; + if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && + newinodes < dquot->dq_dqb.dqb_ihardlimit) + return QUOTA_NL_IHARDBELOW; + return QUOTA_NL_NOWARN; +} + +static int info_bdq_free(struct dquot *dquot, qsize_t space) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_NOWARN; + + if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_BSOFTBELOW; + if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && + dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) + return QUOTA_NL_BHARDBELOW; + return QUOTA_NL_NOWARN; +} + +static int dquot_active(const struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (IS_NOQUOTA(inode)) + return 0; + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); +} + +/* + * Initialize quota pointers in inode + * + * It is better to call this function outside of any transaction as it + * might need a lot of space in journal for dquot structure allocation. + */ +static void __dquot_initialize(struct inode *inode, int type) +{ + int cnt, init_needed = 0; + struct dquot **dquots, *got[MAXQUOTAS]; + struct super_block *sb = inode->i_sb; + qsize_t rsv; + + if (!dquot_active(inode)) + return; + + dquots = i_dquot(inode); + + /* First get references to structures we might need. */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + struct kqid qid; + kprojid_t projid; + int rc; + + got[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + /* + * The i_dquot should have been initialized in most cases, + * we check it without locking here to avoid unnecessary + * dqget()/dqput() calls. + */ + if (dquots[cnt]) + continue; + + if (!sb_has_quota_active(sb, cnt)) + continue; + + init_needed = 1; + + switch (cnt) { + case USRQUOTA: + qid = make_kqid_uid(inode->i_uid); + break; + case GRPQUOTA: + qid = make_kqid_gid(inode->i_gid); + break; + case PRJQUOTA: + rc = inode->i_sb->dq_op->get_projid(inode, &projid); + if (rc) + continue; + qid = make_kqid_projid(projid); + break; + } + got[cnt] = dqget(sb, qid); + } + + /* All required i_dquot has been initialized */ + if (!init_needed) + return; + + spin_lock(&dq_data_lock); + if (IS_NOQUOTA(inode)) + goto out_err; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(sb, cnt)) + continue; + /* We could race with quotaon or dqget() could have failed */ + if (!got[cnt]) + continue; + if (!dquots[cnt]) { + dquots[cnt] = got[cnt]; + got[cnt] = NULL; + /* + * Make quota reservation system happy if someone + * did a write before quota was turned on + */ + rsv = inode_get_rsv_space(inode); + if (unlikely(rsv)) + dquot_resv_space(dquots[cnt], rsv); + } + } +out_err: + spin_unlock(&dq_data_lock); + /* Drop unused references */ + dqput_all(got); +} + +void dquot_initialize(struct inode *inode) +{ + __dquot_initialize(inode, -1); +} +EXPORT_SYMBOL(dquot_initialize); + +/* + * Release all quotas referenced by inode. + * + * This function only be called on inode free or converting + * a file to quota file, no other users for the i_dquot in + * both cases, so we needn't call synchronize_srcu() after + * clearing i_dquot. + */ +static void __dquot_drop(struct inode *inode) +{ + int cnt; + struct dquot **dquots = i_dquot(inode); + struct dquot *put[MAXQUOTAS]; + + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + put[cnt] = dquots[cnt]; + dquots[cnt] = NULL; + } + spin_unlock(&dq_data_lock); + dqput_all(put); +} + +void dquot_drop(struct inode *inode) +{ + struct dquot * const *dquots; + int cnt; + + if (IS_NOQUOTA(inode)) + return; + + /* + * Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway. + */ + dquots = i_dquot(inode); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquots[cnt]) + break; + } + + if (cnt < MAXQUOTAS) + __dquot_drop(inode); +} +EXPORT_SYMBOL(dquot_drop); + +/* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. + */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_add_rsv_space); + +void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_claim_rsv_space); + +void inode_reclaim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_reclaim_rsv_space); + +void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_sub_rsv_space); + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + +/* + * This functions updates i_blocks+i_bytes fields and quota information + * (together with appropriate checks). + * + * NOTE: We absolutely rely on the fact that caller dirties the inode + * (usually helpers in quotaops.h care about this) and holds a handle for + * the current transaction so that dquot write and inode write go into the + * same transaction. + */ + +/* + * This operation can block, but only after everything is updated + */ +int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) +{ + int cnt, ret = 0, index; + struct dquot_warn warn[MAXQUOTAS]; + int reserve = flags & DQUOT_SPACE_RESERVE; + struct dquot **dquots; + + if (!dquot_active(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + warn[cnt].w_type = QUOTA_NL_NOWARN; + + dquots = i_dquot(inode); + index = srcu_read_lock(&dquot_srcu); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!dquots[cnt]) + continue; + ret = check_bdq(dquots[cnt], number, + !(flags & DQUOT_SPACE_WARN), &warn[cnt]); + if (ret && !(flags & DQUOT_SPACE_NOFAIL)) { + spin_unlock(&dq_data_lock); + goto out_flush_warn; + } + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!dquots[cnt]) + continue; + if (reserve) + dquot_resv_space(dquots[cnt], number); + else + dquot_incr_space(dquots[cnt], number); + } + inode_incr_space(inode, number, reserve); + spin_unlock(&dq_data_lock); + + if (reserve) + goto out_flush_warn; + mark_all_dquot_dirty(dquots); +out_flush_warn: + srcu_read_unlock(&dquot_srcu, index); + flush_warnings(warn); +out: + return ret; +} +EXPORT_SYMBOL(__dquot_alloc_space); + +/* + * This operation can block, but only after everything is updated + */ +int dquot_alloc_inode(struct inode *inode) +{ + int cnt, ret = 0, index; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot * const *dquots; + + if (!dquot_active(inode)) + return 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + warn[cnt].w_type = QUOTA_NL_NOWARN; + + dquots = i_dquot(inode); + index = srcu_read_lock(&dquot_srcu); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!dquots[cnt]) + continue; + ret = check_idq(dquots[cnt], 1, &warn[cnt]); + if (ret) + goto warn_put_all; + } + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!dquots[cnt]) + continue; + dquot_incr_inodes(dquots[cnt], 1); + } + +warn_put_all: + spin_unlock(&dq_data_lock); + if (ret == 0) + mark_all_dquot_dirty(dquots); + srcu_read_unlock(&dquot_srcu, index); + flush_warnings(warn); + return ret; +} +EXPORT_SYMBOL(dquot_alloc_inode); + +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +{ + struct dquot **dquots; + int cnt, index; + + if (!dquot_active(inode)) { + inode_claim_rsv_space(inode, number); + return 0; + } + + dquots = i_dquot(inode); + index = srcu_read_lock(&dquot_srcu); + spin_lock(&dq_data_lock); + /* Claim reserved quotas to allocated quotas */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquots[cnt]) + dquot_claim_reserved_space(dquots[cnt], number); + } + /* Update inode bytes */ + inode_claim_rsv_space(inode, number); + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(dquots); + srcu_read_unlock(&dquot_srcu, index); + return 0; +} +EXPORT_SYMBOL(dquot_claim_space_nodirty); + +/* + * Convert allocated space back to in-memory reserved quotas + */ +void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) +{ + struct dquot **dquots; + int cnt, index; + + if (!dquot_active(inode)) { + inode_reclaim_rsv_space(inode, number); + return; + } + + dquots = i_dquot(inode); + index = srcu_read_lock(&dquot_srcu); + spin_lock(&dq_data_lock); + /* Claim reserved quotas to allocated quotas */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquots[cnt]) + dquot_reclaim_reserved_space(dquots[cnt], number); + } + /* Update inode bytes */ + inode_reclaim_rsv_space(inode, number); + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(dquots); + srcu_read_unlock(&dquot_srcu, index); + return; +} +EXPORT_SYMBOL(dquot_reclaim_space_nodirty); + +/* + * This operation can block, but only after everything is updated + */ +void __dquot_free_space(struct inode *inode, qsize_t number, int flags) +{ + unsigned int cnt; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot **dquots; + int reserve = flags & DQUOT_SPACE_RESERVE, index; + + if (!dquot_active(inode)) { + inode_decr_space(inode, number, reserve); + return; + } + + dquots = i_dquot(inode); + index = srcu_read_lock(&dquot_srcu); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + int wtype; + + warn[cnt].w_type = QUOTA_NL_NOWARN; + if (!dquots[cnt]) + continue; + wtype = info_bdq_free(dquots[cnt], number); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn[cnt], dquots[cnt], wtype); + if (reserve) + dquot_free_reserved_space(dquots[cnt], number); + else + dquot_decr_space(dquots[cnt], number); + } + inode_decr_space(inode, number, reserve); + spin_unlock(&dq_data_lock); + + if (reserve) + goto out_unlock; + mark_all_dquot_dirty(dquots); +out_unlock: + srcu_read_unlock(&dquot_srcu, index); + flush_warnings(warn); +} +EXPORT_SYMBOL(__dquot_free_space); + +/* + * This operation can block, but only after everything is updated + */ +void dquot_free_inode(struct inode *inode) +{ + unsigned int cnt; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot * const *dquots; + int index; + + if (!dquot_active(inode)) + return; + + dquots = i_dquot(inode); + index = srcu_read_lock(&dquot_srcu); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + int wtype; + + warn[cnt].w_type = QUOTA_NL_NOWARN; + if (!dquots[cnt]) + continue; + wtype = info_idq_free(dquots[cnt], 1); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn[cnt], dquots[cnt], wtype); + dquot_decr_inodes(dquots[cnt], 1); + } + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(dquots); + srcu_read_unlock(&dquot_srcu, index); + flush_warnings(warn); +} +EXPORT_SYMBOL(dquot_free_inode); + +/* + * Transfer the number of inode and blocks from one diskquota to an other. + * On success, dquot references in transfer_to are consumed and references + * to original dquots that need to be released are placed there. On failure, + * references are kept untouched. + * + * This operation can block, but only after everything is updated + * A transaction must be started when entering this function. + * + * We are holding reference on transfer_from & transfer_to, no need to + * protect them by srcu_read_lock(). + */ +int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) +{ + qsize_t space, cur_space; + qsize_t rsv_space = 0; + struct dquot *transfer_from[MAXQUOTAS] = {}; + int cnt, ret = 0; + char is_valid[MAXQUOTAS] = {}; + struct dquot_warn warn_to[MAXQUOTAS]; + struct dquot_warn warn_from_inodes[MAXQUOTAS]; + struct dquot_warn warn_from_space[MAXQUOTAS]; + + if (IS_NOQUOTA(inode)) + return 0; + /* Initialize the arrays */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + warn_to[cnt].w_type = QUOTA_NL_NOWARN; + warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; + warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; + } + + spin_lock(&dq_data_lock); + if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ + spin_unlock(&dq_data_lock); + return 0; + } + cur_space = inode_get_bytes(inode); + rsv_space = inode_get_rsv_space(inode); + space = cur_space + rsv_space; + /* Build the transfer_from list and check the limits */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + /* + * Skip changes for same uid or gid or for turned off quota-type. + */ + if (!transfer_to[cnt]) + continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(inode->i_sb, cnt)) + continue; + is_valid[cnt] = 1; + transfer_from[cnt] = i_dquot(inode)[cnt]; + ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]); + if (ret) + goto over_quota; + ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]); + if (ret) + goto over_quota; + } + + /* + * Finally perform the needed transfer from transfer_from to transfer_to + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!is_valid[cnt]) + continue; + /* Due to IO error we might not have transfer_from[] structure */ + if (transfer_from[cnt]) { + int wtype; + wtype = info_idq_free(transfer_from[cnt], 1); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn_from_inodes[cnt], + transfer_from[cnt], wtype); + wtype = info_bdq_free(transfer_from[cnt], space); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn_from_space[cnt], + transfer_from[cnt], wtype); + dquot_decr_inodes(transfer_from[cnt], 1); + dquot_decr_space(transfer_from[cnt], cur_space); + dquot_free_reserved_space(transfer_from[cnt], + rsv_space); + } + + dquot_incr_inodes(transfer_to[cnt], 1); + dquot_incr_space(transfer_to[cnt], cur_space); + dquot_resv_space(transfer_to[cnt], rsv_space); + + i_dquot(inode)[cnt] = transfer_to[cnt]; + } + spin_unlock(&dq_data_lock); + + mark_all_dquot_dirty(transfer_from); + mark_all_dquot_dirty(transfer_to); + flush_warnings(warn_to); + flush_warnings(warn_from_inodes); + flush_warnings(warn_from_space); + /* Pass back references to put */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (is_valid[cnt]) + transfer_to[cnt] = transfer_from[cnt]; + return 0; +over_quota: + spin_unlock(&dq_data_lock); + flush_warnings(warn_to); + return ret; +} +EXPORT_SYMBOL(__dquot_transfer); + +/* Wrapper for transferring ownership of an inode for uid/gid only + * Called from FSXXX_setattr() + */ +int dquot_transfer(struct inode *inode, struct iattr *iattr) +{ + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct super_block *sb = inode->i_sb; + int ret; + + if (!dquot_active(inode)) + return 0; + + if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) + transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)) + transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid)); + + ret = __dquot_transfer(inode, transfer_to); + dqput_all(transfer_to); + return ret; +} +EXPORT_SYMBOL(dquot_transfer); + +/* + * Write info of quota file to disk + */ +int dquot_commit_info(struct super_block *sb, int type) +{ + int ret; + struct quota_info *dqopt = sb_dqopt(sb); + + mutex_lock(&dqopt->dqio_mutex); + ret = dqopt->ops[type]->write_file_info(sb, type); + mutex_unlock(&dqopt->dqio_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_commit_info); + +/* + * Definitions of diskquota operations. + */ +const struct dquot_operations dquot_operations = { + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; +EXPORT_SYMBOL(dquot_operations); + +/* + * Generic helper for ->open on filesystems supporting disk quotas. + */ +int dquot_file_open(struct inode *inode, struct file *file) +{ + int error; + + error = generic_file_open(inode, file); + if (!error && (file->f_mode & FMODE_WRITE)) + dquot_initialize(inode); + return error; +} +EXPORT_SYMBOL(dquot_file_open); + +/* + * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) + */ +int dquot_disable(struct super_block *sb, int type, unsigned int flags) +{ + int cnt, ret = 0; + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *toputinode[MAXQUOTAS]; + + /* Cannot turn off usage accounting without turning off limits, or + * suspend quotas and simultaneously turn quotas off. */ + if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) + || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | + DQUOT_USAGE_ENABLED))) + return -EINVAL; + + /* We need to serialize quota_off() for device */ + mutex_lock(&dqopt->dqonoff_mutex); + + /* + * Skip everything if there's nothing to do. We have to do this because + * sometimes we are called when fill_super() failed and calling + * sync_fs() in such cases does no good. + */ + if (!sb_any_quota_loaded(sb)) { + mutex_unlock(&dqopt->dqonoff_mutex); + return 0; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + toputinode[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_loaded(sb, cnt)) + continue; + + if (flags & DQUOT_SUSPENDED) { + spin_lock(&dq_state_lock); + dqopt->flags |= + dquot_state_flag(DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); + } else { + spin_lock(&dq_state_lock); + dqopt->flags &= ~dquot_state_flag(flags, cnt); + /* Turning off suspended quotas? */ + if (!sb_has_quota_loaded(sb, cnt) && + sb_has_quota_suspended(sb, cnt)) { + dqopt->flags &= ~dquot_state_flag( + DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); + iput(dqopt->files[cnt]); + dqopt->files[cnt] = NULL; + continue; + } + spin_unlock(&dq_state_lock); + } + + /* We still have to keep quota loaded? */ + if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) + continue; + + /* Note: these are blocking operations */ + drop_dquot_ref(sb, cnt); + invalidate_dquots(sb, cnt); + /* + * Now all dquots should be invalidated, all writes done so we + * should be only users of the info. No locks needed. + */ + if (info_dirty(&dqopt->info[cnt])) + sb->dq_op->write_info(sb, cnt); + if (dqopt->ops[cnt]->free_file_info) + dqopt->ops[cnt]->free_file_info(sb, cnt); + put_quota_format(dqopt->info[cnt].dqi_format); + + toputinode[cnt] = dqopt->files[cnt]; + if (!sb_has_quota_loaded(sb, cnt)) + dqopt->files[cnt] = NULL; + dqopt->info[cnt].dqi_flags = 0; + dqopt->info[cnt].dqi_igrace = 0; + dqopt->info[cnt].dqi_bgrace = 0; + dqopt->ops[cnt] = NULL; + } + mutex_unlock(&dqopt->dqonoff_mutex); + + /* Skip syncing and setting flags if quota files are hidden */ + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + goto put_inodes; + + /* Sync the superblock so that buffers with quota data are written to + * disk (and so userspace sees correct data afterwards). */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + /* Now the quota files are just ordinary files and we can set the + * inode flags back. Moreover we discard the pagecache so that + * userspace sees the writes we did bypassing the pagecache. We + * must also discard the blockdev buffers so that we see the + * changes done by userspace on the next quotaon() */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (toputinode[cnt]) { + mutex_lock(&dqopt->dqonoff_mutex); + /* If quota was reenabled in the meantime, we have + * nothing to do */ + if (!sb_has_quota_loaded(sb, cnt)) { + mutex_lock(&toputinode[cnt]->i_mutex); + toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | + S_NOATIME | S_NOQUOTA); + truncate_inode_pages(&toputinode[cnt]->i_data, + 0); + mutex_unlock(&toputinode[cnt]->i_mutex); + mark_inode_dirty_sync(toputinode[cnt]); + } + mutex_unlock(&dqopt->dqonoff_mutex); + } + if (sb->s_bdev) + invalidate_bdev(sb->s_bdev); +put_inodes: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (toputinode[cnt]) { + /* On remount RO, we keep the inode pointer so that we + * can reenable quota on the subsequent remount RW. We + * have to check 'flags' variable and not use sb_has_ + * function because another quotaon / quotaoff could + * change global state before we got here. We refuse + * to suspend quotas when there is pending delete on + * the quota file... */ + if (!(flags & DQUOT_SUSPENDED)) + iput(toputinode[cnt]); + else if (!toputinode[cnt]->i_nlink) + ret = -EBUSY; + } + return ret; +} +EXPORT_SYMBOL(dquot_disable); + +int dquot_quota_off(struct super_block *sb, int type) +{ + return dquot_disable(sb, type, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); +} +EXPORT_SYMBOL(dquot_quota_off); + +/* + * Turn quotas on on a device + */ + +/* + * Helper function to turn quotas on when we already have the inode of + * quota file and no quota information is loaded. + */ +static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + struct quota_format_type *fmt = find_quota_format(format_id); + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + int error; + int oldflags = -1; + + if (!fmt) + return -ESRCH; + if (!S_ISREG(inode->i_mode)) { + error = -EACCES; + goto out_fmt; + } + if (IS_RDONLY(inode)) { + error = -EROFS; + goto out_fmt; + } + if (!sb->s_op->quota_write || !sb->s_op->quota_read || + (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { + error = -EINVAL; + goto out_fmt; + } + /* Usage always has to be set... */ + if (!(flags & DQUOT_USAGE_ENABLED)) { + error = -EINVAL; + goto out_fmt; + } + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* As we bypass the pagecache we must now flush all the + * dirty data and invalidate caches so that kernel sees + * changes from userspace. It is not enough to just flush + * the quota file since if blocksize < pagesize, invalidation + * of the cache could fail because of other unrelated dirty + * data */ + sync_filesystem(sb); + invalidate_bdev(sb->s_bdev); + } + mutex_lock(&dqopt->dqonoff_mutex); + if (sb_has_quota_loaded(sb, type)) { + error = -EBUSY; + goto out_lock; + } + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* We don't want quota and atime on quota files (deadlocks + * possible) Also nobody should write to the file - we use + * special IO operations which ignore the immutable bit. */ + mutex_lock(&inode->i_mutex); + oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | + S_NOQUOTA); + inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + mutex_unlock(&inode->i_mutex); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); + } + + error = -EIO; + dqopt->files[type] = igrab(inode); + if (!dqopt->files[type]) + goto out_lock; + error = -EINVAL; + if (!fmt->qf_ops->check_quota_file(sb, type)) + goto out_file_init; + + dqopt->ops[type] = fmt->qf_ops; + dqopt->info[type].dqi_format = fmt; + dqopt->info[type].dqi_fmt_id = format_id; + INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); + mutex_lock(&dqopt->dqio_mutex); + error = dqopt->ops[type]->read_file_info(sb, type); + if (error < 0) { + mutex_unlock(&dqopt->dqio_mutex); + goto out_file_init; + } + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + dqopt->info[type].dqi_flags |= DQF_SYS_FILE; + mutex_unlock(&dqopt->dqio_mutex); + spin_lock(&dq_state_lock); + dqopt->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); + + add_dquot_ref(sb, type); + mutex_unlock(&dqopt->dqonoff_mutex); + + return 0; + +out_file_init: + dqopt->files[type] = NULL; + iput(inode); +out_lock: + if (oldflags != -1) { + mutex_lock(&inode->i_mutex); + /* Set the flags back (in the case of accidental quotaon() + * on a wrong file we don't want to mess up the flags) */ + inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); + inode->i_flags |= oldflags; + mutex_unlock(&inode->i_mutex); + } + mutex_unlock(&dqopt->dqonoff_mutex); +out_fmt: + put_quota_format(fmt); + + return error; +} + +/* Reenable quotas on remount RW */ +int dquot_resume(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *inode; + int ret = 0, cnt; + unsigned int flags; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + + mutex_lock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_suspended(sb, cnt)) { + mutex_unlock(&dqopt->dqonoff_mutex); + continue; + } + inode = dqopt->files[cnt]; + dqopt->files[cnt] = NULL; + spin_lock(&dq_state_lock); + flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, + cnt); + dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); + spin_unlock(&dq_state_lock); + mutex_unlock(&dqopt->dqonoff_mutex); + + flags = dquot_generic_flag(flags, cnt); + ret = vfs_load_quota_inode(inode, cnt, + dqopt->info[cnt].dqi_fmt_id, flags); + iput(inode); + } + + return ret; +} +EXPORT_SYMBOL(dquot_resume); + +int dquot_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int error = security_quota_on(path->dentry); + if (error) + return error; + /* Quota file not on the same filesystem? */ + if (path->dentry->d_sb != sb) + error = -EXDEV; + else + error = vfs_load_quota_inode(d_inode(path->dentry), type, + format_id, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + return error; +} +EXPORT_SYMBOL(dquot_quota_on); + +/* + * More powerful function for turning on quotas allowing setting + * of individual quota flags + */ +int dquot_enable(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + int ret = 0; + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + + /* Just unsuspend quotas? */ + BUG_ON(flags & DQUOT_SUSPENDED); + + if (!flags) + return 0; + /* Just updating flags needed? */ + if (sb_has_quota_loaded(sb, type)) { + mutex_lock(&dqopt->dqonoff_mutex); + /* Now do a reliable test... */ + if (!sb_has_quota_loaded(sb, type)) { + mutex_unlock(&dqopt->dqonoff_mutex); + goto load_quota; + } + if (flags & DQUOT_USAGE_ENABLED && + sb_has_quota_usage_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + if (flags & DQUOT_LIMITS_ENABLED && + sb_has_quota_limits_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + spin_lock(&dq_state_lock); + sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); +out_lock: + mutex_unlock(&dqopt->dqonoff_mutex); + return ret; + } + +load_quota: + return vfs_load_quota_inode(inode, type, format_id, flags); +} +EXPORT_SYMBOL(dquot_enable); + +/* + * This function is used when filesystem needs to initialize quotas + * during mount time. + */ +int dquot_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type) +{ + struct dentry *dentry; + int error; + + mutex_lock(&d_inode(sb->s_root)->i_mutex); + dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); + mutex_unlock(&d_inode(sb->s_root)->i_mutex); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (d_really_is_negative(dentry)) { + error = -ENOENT; + goto out; + } + + error = security_quota_on(dentry); + if (!error) + error = vfs_load_quota_inode(d_inode(dentry), type, format_id, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + +out: + dput(dentry); + return error; +} +EXPORT_SYMBOL(dquot_quota_on_mount); + +static int dquot_quota_enable(struct super_block *sb, unsigned int flags) +{ + int ret; + int type; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return -ENOSYS; + /* Accounting cannot be turned on while fs is mounted */ + flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT); + if (!flags) + return -EINVAL; + for (type = 0; type < MAXQUOTAS; type++) { + if (!(flags & qtype_enforce_flag(type))) + continue; + /* Can't enforce without accounting */ + if (!sb_has_quota_usage_enabled(sb, type)) + return -EINVAL; + ret = dquot_enable(dqopt->files[type], type, + dqopt->info[type].dqi_fmt_id, + DQUOT_LIMITS_ENABLED); + if (ret < 0) + goto out_err; + } + return 0; +out_err: + /* Backout enforcement enablement we already did */ + for (type--; type >= 0; type--) { + if (flags & qtype_enforce_flag(type)) + dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); + } + /* Error code translation for better compatibility with XFS */ + if (ret == -EBUSY) + ret = -EEXIST; + return ret; +} + +static int dquot_quota_disable(struct super_block *sb, unsigned int flags) +{ + int ret; + int type; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return -ENOSYS; + /* + * We don't support turning off accounting via quotactl. In principle + * quota infrastructure can do this but filesystems don't expect + * userspace to be able to do it. + */ + if (flags & + (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT)) + return -EOPNOTSUPP; + + /* Filter out limits not enabled */ + for (type = 0; type < MAXQUOTAS; type++) + if (!sb_has_quota_limits_enabled(sb, type)) + flags &= ~qtype_enforce_flag(type); + /* Nothing left? */ + if (!flags) + return -EEXIST; + for (type = 0; type < MAXQUOTAS; type++) { + if (flags & qtype_enforce_flag(type)) { + ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); + if (ret < 0) + goto out_err; + } + } + return 0; +out_err: + /* Backout enforcement disabling we already did */ + for (type--; type >= 0; type--) { + if (flags & qtype_enforce_flag(type)) + dquot_enable(dqopt->files[type], type, + dqopt->info[type].dqi_fmt_id, + DQUOT_LIMITS_ENABLED); + } + return ret; +} + +/* Generic routine for getting common part of quota structure */ +static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) +{ + struct mem_dqblk *dm = &dquot->dq_dqb; + + memset(di, 0, sizeof(*di)); + spin_lock(&dq_data_lock); + di->d_spc_hardlimit = dm->dqb_bhardlimit; + di->d_spc_softlimit = dm->dqb_bsoftlimit; + di->d_ino_hardlimit = dm->dqb_ihardlimit; + di->d_ino_softlimit = dm->dqb_isoftlimit; + di->d_space = dm->dqb_curspace + dm->dqb_rsvspace; + di->d_ino_count = dm->dqb_curinodes; + di->d_spc_timer = dm->dqb_btime; + di->d_ino_timer = dm->dqb_itime; + spin_unlock(&dq_data_lock); +} + +int dquot_get_dqblk(struct super_block *sb, struct kqid qid, + struct qc_dqblk *di) +{ + struct dquot *dquot; + + dquot = dqget(sb, qid); + if (!dquot) + return -ESRCH; + do_get_dqblk(dquot, di); + dqput(dquot); + + return 0; +} +EXPORT_SYMBOL(dquot_get_dqblk); + +#define VFS_QC_MASK \ + (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ + QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ + QC_SPC_TIMER | QC_INO_TIMER) + +/* Generic routine for setting common part of quota structure */ +static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) +{ + struct mem_dqblk *dm = &dquot->dq_dqb; + int check_blim = 0, check_ilim = 0; + struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; + + if (di->d_fieldmask & ~VFS_QC_MASK) + return -EINVAL; + + if (((di->d_fieldmask & QC_SPC_SOFT) && + di->d_spc_softlimit > dqi->dqi_max_spc_limit) || + ((di->d_fieldmask & QC_SPC_HARD) && + di->d_spc_hardlimit > dqi->dqi_max_spc_limit) || + ((di->d_fieldmask & QC_INO_SOFT) && + (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) || + ((di->d_fieldmask & QC_INO_HARD) && + (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) + return -ERANGE; + + spin_lock(&dq_data_lock); + if (di->d_fieldmask & QC_SPACE) { + dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & QC_SPC_SOFT) + dm->dqb_bsoftlimit = di->d_spc_softlimit; + if (di->d_fieldmask & QC_SPC_HARD) + dm->dqb_bhardlimit = di->d_spc_hardlimit; + if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) { + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & QC_INO_COUNT) { + dm->dqb_curinodes = di->d_ino_count; + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & QC_INO_SOFT) + dm->dqb_isoftlimit = di->d_ino_softlimit; + if (di->d_fieldmask & QC_INO_HARD) + dm->dqb_ihardlimit = di->d_ino_hardlimit; + if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) { + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & QC_SPC_TIMER) { + dm->dqb_btime = di->d_spc_timer; + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & QC_INO_TIMER) { + dm->dqb_itime = di->d_ino_timer; + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + } + + if (check_blim) { + if (!dm->dqb_bsoftlimit || + dm->dqb_curspace < dm->dqb_bsoftlimit) { + dm->dqb_btime = 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } else if (!(di->d_fieldmask & QC_SPC_TIMER)) + /* Set grace only if user hasn't provided his own... */ + dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; + } + if (check_ilim) { + if (!dm->dqb_isoftlimit || + dm->dqb_curinodes < dm->dqb_isoftlimit) { + dm->dqb_itime = 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } else if (!(di->d_fieldmask & QC_INO_TIMER)) + /* Set grace only if user hasn't provided his own... */ + dm->dqb_itime = get_seconds() + dqi->dqi_igrace; + } + if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || + dm->dqb_isoftlimit) + clear_bit(DQ_FAKE_B, &dquot->dq_flags); + else + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + mark_dquot_dirty(dquot); + + return 0; +} + +int dquot_set_dqblk(struct super_block *sb, struct kqid qid, + struct qc_dqblk *di) +{ + struct dquot *dquot; + int rc; + + dquot = dqget(sb, qid); + if (!dquot) { + rc = -ESRCH; + goto out; + } + rc = do_set_dqblk(dquot, di); + dqput(dquot); +out: + return rc; +} +EXPORT_SYMBOL(dquot_set_dqblk); + +/* Generic routine for getting common part of quota file information */ +int dquot_get_state(struct super_block *sb, struct qc_state *state) +{ + struct mem_dqinfo *mi; + struct qc_type_state *tstate; + struct quota_info *dqopt = sb_dqopt(sb); + int type; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + memset(state, 0, sizeof(*state)); + for (type = 0; type < MAXQUOTAS; type++) { + if (!sb_has_quota_active(sb, type)) + continue; + tstate = state->s_state + type; + mi = sb_dqopt(sb)->info + type; + tstate->flags = QCI_ACCT_ENABLED; + spin_lock(&dq_data_lock); + if (mi->dqi_flags & DQF_SYS_FILE) + tstate->flags |= QCI_SYSFILE; + if (mi->dqi_flags & DQF_ROOT_SQUASH) + tstate->flags |= QCI_ROOT_SQUASH; + if (sb_has_quota_limits_enabled(sb, type)) + tstate->flags |= QCI_LIMITS_ENFORCED; + tstate->spc_timelimit = mi->dqi_bgrace; + tstate->ino_timelimit = mi->dqi_igrace; + tstate->ino = dqopt->files[type]->i_ino; + tstate->blocks = dqopt->files[type]->i_blocks; + tstate->nextents = 1; /* We don't know... */ + spin_unlock(&dq_data_lock); + } + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; +} +EXPORT_SYMBOL(dquot_get_state); + +/* Generic routine for setting common part of quota file information */ +int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) +{ + struct mem_dqinfo *mi; + int err = 0; + + if ((ii->i_fieldmask & QC_WARNS_MASK) || + (ii->i_fieldmask & QC_RT_SPC_TIMER)) + return -EINVAL; + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + if (!sb_has_quota_active(sb, type)) { + err = -ESRCH; + goto out; + } + mi = sb_dqopt(sb)->info + type; + if (ii->i_fieldmask & QC_FLAGS) { + if ((ii->i_flags & QCI_ROOT_SQUASH && + mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) { + err = -EINVAL; + goto out; + } + } + spin_lock(&dq_data_lock); + if (ii->i_fieldmask & QC_SPC_TIMER) + mi->dqi_bgrace = ii->i_spc_timelimit; + if (ii->i_fieldmask & QC_INO_TIMER) + mi->dqi_igrace = ii->i_ino_timelimit; + if (ii->i_fieldmask & QC_FLAGS) { + if (ii->i_flags & QCI_ROOT_SQUASH) + mi->dqi_flags |= DQF_ROOT_SQUASH; + else + mi->dqi_flags &= ~DQF_ROOT_SQUASH; + } + spin_unlock(&dq_data_lock); + mark_info_dirty(sb, type); + /* Force write to disk */ + sb->dq_op->write_info(sb, type); +out: + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return err; +} +EXPORT_SYMBOL(dquot_set_dqinfo); + +const struct quotactl_ops dquot_quotactl_ops = { + .quota_on = dquot_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +EXPORT_SYMBOL(dquot_quotactl_ops); + +const struct quotactl_ops dquot_quotactl_sysfile_ops = { + .quota_enable = dquot_quota_enable, + .quota_disable = dquot_quota_disable, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); + +static int do_proc_dqstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int type = (int *)table->data - dqstats.stat; + + /* Update global table */ + dqstats.stat[type] = + percpu_counter_sum_positive(&dqstats.counter[type]); + return proc_dointvec(table, write, buffer, lenp, ppos); +} + +static struct ctl_table fs_dqstats_table[] = { + { + .procname = "lookups", + .data = &dqstats.stat[DQST_LOOKUPS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "drops", + .data = &dqstats.stat[DQST_DROPS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "reads", + .data = &dqstats.stat[DQST_READS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "writes", + .data = &dqstats.stat[DQST_WRITES], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "cache_hits", + .data = &dqstats.stat[DQST_CACHE_HITS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "allocated_dquots", + .data = &dqstats.stat[DQST_ALLOC_DQUOTS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "free_dquots", + .data = &dqstats.stat[DQST_FREE_DQUOTS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "syncs", + .data = &dqstats.stat[DQST_SYNCS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, +#ifdef CONFIG_PRINT_QUOTA_WARNING + { + .procname = "warnings", + .data = &flag_print_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { }, +}; + +static struct ctl_table fs_table[] = { + { + .procname = "quota", + .mode = 0555, + .child = fs_dqstats_table, + }, + { }, +}; + +static struct ctl_table sys_table[] = { + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { }, +}; + +static int __init dquot_init(void) +{ + int i, ret; + unsigned long nr_hash, order; + + printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); + + register_sysctl_table(sys_table); + + dquot_cachep = kmem_cache_create("dquot", + sizeof(struct dquot), sizeof(unsigned long) * 4, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + NULL); + + order = 0; + dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order); + if (!dquot_hash) + panic("Cannot create dquot hash table"); + + for (i = 0; i < _DQST_DQSTAT_LAST; i++) { + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); + if (ret) + panic("Cannot create dquot stat counters"); + } + + /* Find power-of-two hlist_heads which can fit into allocation */ + nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); + dq_hash_bits = 0; + do { + dq_hash_bits++; + } while (nr_hash >> dq_hash_bits); + dq_hash_bits--; + + nr_hash = 1UL << dq_hash_bits; + dq_hash_mask = nr_hash - 1; + for (i = 0; i < nr_hash; i++) + INIT_HLIST_HEAD(dquot_hash + i); + + pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," + " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); + + register_shrinker(&dqcache_shrinker); + + return 0; +} +module_init(dquot_init); diff --git a/kernel/fs/quota/kqid.c b/kernel/fs/quota/kqid.c new file mode 100644 index 000000000..ebc5e6285 --- /dev/null +++ b/kernel/fs/quota/kqid.c @@ -0,0 +1,132 @@ +#include +#include +#include + +/** + * qid_eq - Test to see if to kquid values are the same + * @left: A qid value + * @right: Another quid value + * + * Return true if the two qid values are equal and false otherwise. + */ +bool qid_eq(struct kqid left, struct kqid right) +{ + if (left.type != right.type) + return false; + switch(left.type) { + case USRQUOTA: + return uid_eq(left.uid, right.uid); + case GRPQUOTA: + return gid_eq(left.gid, right.gid); + case PRJQUOTA: + return projid_eq(left.projid, right.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(qid_eq); + +/** + * qid_lt - Test to see if one qid value is less than another + * @left: The possibly lesser qid value + * @right: The possibly greater qid value + * + * Return true if left is less than right and false otherwise. + */ +bool qid_lt(struct kqid left, struct kqid right) +{ + if (left.type < right.type) + return true; + if (left.type > right.type) + return false; + switch (left.type) { + case USRQUOTA: + return uid_lt(left.uid, right.uid); + case GRPQUOTA: + return gid_lt(left.gid, right.gid); + case PRJQUOTA: + return projid_lt(left.projid, right.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(qid_lt); + +/** + * from_kqid - Create a qid from a kqid user-namespace pair. + * @targ: The user namespace we want a qid in. + * @kqid: The kernel internal quota identifier to start with. + * + * Map @kqid into the user-namespace specified by @targ and + * return the resulting qid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kqid has no mapping in @targ (qid_t)-1 is returned. + */ +qid_t from_kqid(struct user_namespace *targ, struct kqid kqid) +{ + switch (kqid.type) { + case USRQUOTA: + return from_kuid(targ, kqid.uid); + case GRPQUOTA: + return from_kgid(targ, kqid.gid); + case PRJQUOTA: + return from_kprojid(targ, kqid.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(from_kqid); + +/** + * from_kqid_munged - Create a qid from a kqid user-namespace pair. + * @targ: The user namespace we want a qid in. + * @kqid: The kernel internal quota identifier to start with. + * + * Map @kqid into the user-namespace specified by @targ and + * return the resulting qid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kqid from_kqid_munged never fails and always + * returns a valid projid. This makes from_kqid_munged + * appropriate for use in places where failing to provide + * a qid_t is not a good option. + * + * If @kqid has no mapping in @targ the kqid.type specific + * overflow identifier is returned. + */ +qid_t from_kqid_munged(struct user_namespace *targ, struct kqid kqid) +{ + switch (kqid.type) { + case USRQUOTA: + return from_kuid_munged(targ, kqid.uid); + case GRPQUOTA: + return from_kgid_munged(targ, kqid.gid); + case PRJQUOTA: + return from_kprojid_munged(targ, kqid.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(from_kqid_munged); + +/** + * qid_valid - Report if a valid value is stored in a kqid. + * @qid: The kernel internal quota identifier to test. + */ +bool qid_valid(struct kqid qid) +{ + switch (qid.type) { + case USRQUOTA: + return uid_valid(qid.uid); + case GRPQUOTA: + return gid_valid(qid.gid); + case PRJQUOTA: + return projid_valid(qid.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(qid_valid); diff --git a/kernel/fs/quota/netlink.c b/kernel/fs/quota/netlink.c new file mode 100644 index 000000000..bb2869f5d --- /dev/null +++ b/kernel/fs/quota/netlink.c @@ -0,0 +1,109 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct genl_multicast_group quota_mcgrps[] = { + { .name = "events", }, +}; + +/* Netlink family structure for quota */ +static struct genl_family quota_genl_family = { + /* + * Needed due to multicast group ID abuse - old code assumed + * the family ID was also a valid multicast group ID (which + * isn't true) and userspace might thus rely on it. Assign a + * static ID for this group to make dealing with that easier. + */ + .id = GENL_ID_VFS_DQUOT, + .hdrsize = 0, + .name = "VFS_DQUOT", + .version = 1, + .maxattr = QUOTA_NL_A_MAX, + .mcgrps = quota_mcgrps, + .n_mcgrps = ARRAY_SIZE(quota_mcgrps), +}; + +/** + * quota_send_warning - Send warning to userspace about exceeded quota + * @qid: The kernel internal quota identifier. + * @dev: The device on which the fs is mounted (sb->s_dev) + * @warntype: The type of the warning: QUOTA_NL_... + * + * This can be used by filesystems (including those which don't use + * dquot) to send a message to userspace relating to quota limits. + * + */ + +void quota_send_warning(struct kqid qid, dev_t dev, + const char warntype) +{ + static atomic_t seq; + struct sk_buff *skb; + void *msg_head; + int ret; + int msg_size = 4 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u64)); + + /* We have to allocate using GFP_NOFS as we are called from a + * filesystem performing write and thus further recursion into + * the fs to free some data could cause deadlocks. */ + skb = genlmsg_new(msg_size, GFP_NOFS); + if (!skb) { + printk(KERN_ERR + "VFS: Not enough memory to send quota warning.\n"); + return; + } + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), + "a_genl_family, 0, QUOTA_NL_C_WARNING); + if (!msg_head) { + printk(KERN_ERR + "VFS: Cannot store netlink header in quota warning.\n"); + goto err_out; + } + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, + from_kqid_munged(&init_user_ns, qid)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, + from_kuid_munged(&init_user_ns, current_uid())); + if (ret) + goto attr_err_out; + genlmsg_end(skb, msg_head); + + genlmsg_multicast("a_genl_family, skb, 0, 0, GFP_NOFS); + return; +attr_err_out: + printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); +err_out: + kfree_skb(skb); +} +EXPORT_SYMBOL(quota_send_warning); + +static int __init quota_init(void) +{ + if (genl_register_family("a_genl_family) != 0) + printk(KERN_ERR + "VFS: Failed to create quota netlink interface.\n"); + return 0; +}; + +module_init(quota_init); diff --git a/kernel/fs/quota/quota.c b/kernel/fs/quota/quota.c new file mode 100644 index 000000000..86ded7375 --- /dev/null +++ b/kernel/fs/quota/quota.c @@ -0,0 +1,808 @@ +/* + * Quota code necessary even when VFS quota support is not compiled + * into the kernel. The interesting stuff is over in dquot.c, here + * we have symbols for initial quotactl(2) handling, the sysctl(2) + * variables, etc - things needed even when quota support disabled. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int check_quotactl_permission(struct super_block *sb, int type, int cmd, + qid_t id) +{ + switch (cmd) { + /* these commands do not require any special privilegues */ + case Q_GETFMT: + case Q_SYNC: + case Q_GETINFO: + case Q_XGETQSTAT: + case Q_XGETQSTATV: + case Q_XQUOTASYNC: + break; + /* allow to query information for dquots we "own" */ + case Q_GETQUOTA: + case Q_XGETQUOTA: + if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) || + (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id)))) + break; + /*FALLTHROUGH*/ + default: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return security_quotactl(cmd, type, id, sb); +} + +static void quota_sync_one(struct super_block *sb, void *arg) +{ + int type = *(int *)arg; + + if (sb->s_qcop && sb->s_qcop->quota_sync && + (sb->s_quota_types & (1 << type))) + sb->s_qcop->quota_sync(sb, type); +} + +static int quota_sync_all(int type) +{ + int ret; + + if (type >= MAXQUOTAS) + return -EINVAL; + ret = security_quotactl(Q_SYNC, type, 0, NULL); + if (!ret) + iterate_supers(quota_sync_one, &type); + return ret; +} + +unsigned int qtype_enforce_flag(int type) +{ + switch (type) { + case USRQUOTA: + return FS_QUOTA_UDQ_ENFD; + case GRPQUOTA: + return FS_QUOTA_GDQ_ENFD; + case PRJQUOTA: + return FS_QUOTA_PDQ_ENFD; + } + return 0; +} + +static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, + struct path *path) +{ + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) + return -ENOSYS; + if (sb->s_qcop->quota_enable) + return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type)); + if (IS_ERR(path)) + return PTR_ERR(path); + return sb->s_qcop->quota_on(sb, type, id, path); +} + +static int quota_quotaoff(struct super_block *sb, int type) +{ + if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable) + return -ENOSYS; + if (sb->s_qcop->quota_disable) + return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type)); + return sb->s_qcop->quota_off(sb, type); +} + +static int quota_getfmt(struct super_block *sb, int type, void __user *addr) +{ + __u32 fmt; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + if (!sb_has_quota_active(sb, type)) { + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return -ESRCH; + } + fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + if (copy_to_user(addr, &fmt, sizeof(fmt))) + return -EFAULT; + return 0; +} + +static int quota_getinfo(struct super_block *sb, int type, void __user *addr) +{ + struct qc_state state; + struct qc_type_state *tstate; + struct if_dqinfo uinfo; + int ret; + + /* This checks whether qc_state has enough entries... */ + BUILD_BUG_ON(MAXQUOTAS > XQM_MAXQUOTAS); + if (!sb->s_qcop->get_state) + return -ENOSYS; + ret = sb->s_qcop->get_state(sb, &state); + if (ret) + return ret; + tstate = state.s_state + type; + if (!(tstate->flags & QCI_ACCT_ENABLED)) + return -ESRCH; + memset(&uinfo, 0, sizeof(uinfo)); + uinfo.dqi_bgrace = tstate->spc_timelimit; + uinfo.dqi_igrace = tstate->ino_timelimit; + if (tstate->flags & QCI_SYSFILE) + uinfo.dqi_flags |= DQF_SYS_FILE; + if (tstate->flags & QCI_ROOT_SQUASH) + uinfo.dqi_flags |= DQF_ROOT_SQUASH; + uinfo.dqi_valid = IIF_ALL; + if (!ret && copy_to_user(addr, &uinfo, sizeof(uinfo))) + return -EFAULT; + return ret; +} + +static int quota_setinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + struct qc_info qinfo; + + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + if (!sb->s_qcop->set_info) + return -ENOSYS; + if (info.dqi_valid & ~(IIF_FLAGS | IIF_BGRACE | IIF_IGRACE)) + return -EINVAL; + memset(&qinfo, 0, sizeof(qinfo)); + if (info.dqi_valid & IIF_FLAGS) { + if (info.dqi_flags & ~DQF_SETINFO_MASK) + return -EINVAL; + if (info.dqi_flags & DQF_ROOT_SQUASH) + qinfo.i_flags |= QCI_ROOT_SQUASH; + qinfo.i_fieldmask |= QC_FLAGS; + } + if (info.dqi_valid & IIF_BGRACE) { + qinfo.i_spc_timelimit = info.dqi_bgrace; + qinfo.i_fieldmask |= QC_SPC_TIMER; + } + if (info.dqi_valid & IIF_IGRACE) { + qinfo.i_ino_timelimit = info.dqi_igrace; + qinfo.i_fieldmask |= QC_INO_TIMER; + } + return sb->s_qcop->set_info(sb, type, &qinfo); +} + +static inline qsize_t qbtos(qsize_t blocks) +{ + return blocks << QIF_DQBLKSIZE_BITS; +} + +static inline qsize_t stoqb(qsize_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + +static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src) +{ + memset(dst, 0, sizeof(*dst)); + dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit); + dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit); + dst->dqb_curspace = src->d_space; + dst->dqb_ihardlimit = src->d_ino_hardlimit; + dst->dqb_isoftlimit = src->d_ino_softlimit; + dst->dqb_curinodes = src->d_ino_count; + dst->dqb_btime = src->d_spc_timer; + dst->dqb_itime = src->d_ino_timer; + dst->dqb_valid = QIF_ALL; +} + +static int quota_getquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct kqid qid; + struct qc_dqblk fdq; + struct if_dqblk idq; + int ret; + + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); + if (ret) + return ret; + copy_to_if_dqblk(&idq, &fdq); + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + return 0; +} + +static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) +{ + dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); + dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit); + dst->d_space = src->dqb_curspace; + dst->d_ino_hardlimit = src->dqb_ihardlimit; + dst->d_ino_softlimit = src->dqb_isoftlimit; + dst->d_ino_count = src->dqb_curinodes; + dst->d_spc_timer = src->dqb_btime; + dst->d_ino_timer = src->dqb_itime; + + dst->d_fieldmask = 0; + if (src->dqb_valid & QIF_BLIMITS) + dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD; + if (src->dqb_valid & QIF_SPACE) + dst->d_fieldmask |= QC_SPACE; + if (src->dqb_valid & QIF_ILIMITS) + dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD; + if (src->dqb_valid & QIF_INODES) + dst->d_fieldmask |= QC_INO_COUNT; + if (src->dqb_valid & QIF_BTIME) + dst->d_fieldmask |= QC_SPC_TIMER; + if (src->dqb_valid & QIF_ITIME) + dst->d_fieldmask |= QC_INO_TIMER; +} + +static int quota_setquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct qc_dqblk fdq; + struct if_dqblk idq; + struct kqid qid; + + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + copy_from_if_dqblk(&fdq, &idq); + return sb->s_qcop->set_dqblk(sb, qid, &fdq); +} + +static int quota_enable(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->quota_enable) + return -ENOSYS; + return sb->s_qcop->quota_enable(sb, flags); +} + +static int quota_disable(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->quota_disable) + return -ENOSYS; + return sb->s_qcop->quota_disable(sb, flags); +} + +static int quota_state_to_flags(struct qc_state *state) +{ + int flags = 0; + + if (state->s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) + flags |= FS_QUOTA_UDQ_ACCT; + if (state->s_state[USRQUOTA].flags & QCI_LIMITS_ENFORCED) + flags |= FS_QUOTA_UDQ_ENFD; + if (state->s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) + flags |= FS_QUOTA_GDQ_ACCT; + if (state->s_state[GRPQUOTA].flags & QCI_LIMITS_ENFORCED) + flags |= FS_QUOTA_GDQ_ENFD; + if (state->s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) + flags |= FS_QUOTA_PDQ_ACCT; + if (state->s_state[PRJQUOTA].flags & QCI_LIMITS_ENFORCED) + flags |= FS_QUOTA_PDQ_ENFD; + return flags; +} + +static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs) +{ + int type; + struct qc_state state; + int ret; + + ret = sb->s_qcop->get_state(sb, &state); + if (ret < 0) + return ret; + + memset(fqs, 0, sizeof(*fqs)); + fqs->qs_version = FS_QSTAT_VERSION; + fqs->qs_flags = quota_state_to_flags(&state); + /* No quota enabled? */ + if (!fqs->qs_flags) + return -ENOSYS; + fqs->qs_incoredqs = state.s_incoredqs; + /* + * GETXSTATE quotactl has space for just one set of time limits so + * report them for the first enabled quota type + */ + for (type = 0; type < XQM_MAXQUOTAS; type++) + if (state.s_state[type].flags & QCI_ACCT_ENABLED) + break; + BUG_ON(type == XQM_MAXQUOTAS); + fqs->qs_btimelimit = state.s_state[type].spc_timelimit; + fqs->qs_itimelimit = state.s_state[type].ino_timelimit; + fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; + fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; + fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; + if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; + fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; + fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; + } + if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; + fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; + fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; + } + if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + /* + * Q_XGETQSTAT doesn't have room for both group and project + * quotas. So, allow the project quota values to be copied out + * only if there is no group quota information available. + */ + if (!(state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)) { + fqs->qs_gquota.qfs_ino = state.s_state[PRJQUOTA].ino; + fqs->qs_gquota.qfs_nblks = + state.s_state[PRJQUOTA].blocks; + fqs->qs_gquota.qfs_nextents = + state.s_state[PRJQUOTA].nextents; + } + } + return 0; +} + +static int quota_getxstate(struct super_block *sb, void __user *addr) +{ + struct fs_quota_stat fqs; + int ret; + + if (!sb->s_qcop->get_state) + return -ENOSYS; + ret = quota_getstate(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} + +static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs) +{ + int type; + struct qc_state state; + int ret; + + ret = sb->s_qcop->get_state(sb, &state); + if (ret < 0) + return ret; + + memset(fqs, 0, sizeof(*fqs)); + fqs->qs_version = FS_QSTAT_VERSION; + fqs->qs_flags = quota_state_to_flags(&state); + /* No quota enabled? */ + if (!fqs->qs_flags) + return -ENOSYS; + fqs->qs_incoredqs = state.s_incoredqs; + /* + * GETXSTATV quotactl has space for just one set of time limits so + * report them for the first enabled quota type + */ + for (type = 0; type < XQM_MAXQUOTAS; type++) + if (state.s_state[type].flags & QCI_ACCT_ENABLED) + break; + BUG_ON(type == XQM_MAXQUOTAS); + fqs->qs_btimelimit = state.s_state[type].spc_timelimit; + fqs->qs_itimelimit = state.s_state[type].ino_timelimit; + fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; + fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; + fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; + if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; + fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; + fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; + } + if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; + fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; + fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; + } + if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino; + fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; + fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; + } + return 0; +} + +static int quota_getxstatev(struct super_block *sb, void __user *addr) +{ + struct fs_quota_statv fqs; + int ret; + + if (!sb->s_qcop->get_state) + return -ENOSYS; + + memset(&fqs, 0, sizeof(fqs)); + if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */ + return -EFAULT; + + /* If this kernel doesn't support user specified version, fail */ + switch (fqs.qs_version) { + case FS_QSTATV_VERSION1: + break; + default: + return -EINVAL; + } + ret = quota_getstatev(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} + +/* + * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them + * out of there as xfsprogs rely on definitions being in that header file. So + * just define same functions here for quota purposes. + */ +#define XFS_BB_SHIFT 9 + +static inline u64 quota_bbtob(u64 blocks) +{ + return blocks << XFS_BB_SHIFT; +} + +static inline u64 quota_btobb(u64 bytes) +{ + return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; +} + +static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) +{ + dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); + dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit); + dst->d_ino_hardlimit = src->d_ino_hardlimit; + dst->d_ino_softlimit = src->d_ino_softlimit; + dst->d_space = quota_bbtob(src->d_bcount); + dst->d_ino_count = src->d_icount; + dst->d_ino_timer = src->d_itimer; + dst->d_spc_timer = src->d_btimer; + dst->d_ino_warns = src->d_iwarns; + dst->d_spc_warns = src->d_bwarns; + dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); + dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); + dst->d_rt_space = quota_bbtob(src->d_rtbcount); + dst->d_rt_spc_timer = src->d_rtbtimer; + dst->d_rt_spc_warns = src->d_rtbwarns; + dst->d_fieldmask = 0; + if (src->d_fieldmask & FS_DQ_ISOFT) + dst->d_fieldmask |= QC_INO_SOFT; + if (src->d_fieldmask & FS_DQ_IHARD) + dst->d_fieldmask |= QC_INO_HARD; + if (src->d_fieldmask & FS_DQ_BSOFT) + dst->d_fieldmask |= QC_SPC_SOFT; + if (src->d_fieldmask & FS_DQ_BHARD) + dst->d_fieldmask |= QC_SPC_HARD; + if (src->d_fieldmask & FS_DQ_RTBSOFT) + dst->d_fieldmask |= QC_RT_SPC_SOFT; + if (src->d_fieldmask & FS_DQ_RTBHARD) + dst->d_fieldmask |= QC_RT_SPC_HARD; + if (src->d_fieldmask & FS_DQ_BTIMER) + dst->d_fieldmask |= QC_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_ITIMER) + dst->d_fieldmask |= QC_INO_TIMER; + if (src->d_fieldmask & FS_DQ_RTBTIMER) + dst->d_fieldmask |= QC_RT_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_BWARNS) + dst->d_fieldmask |= QC_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_IWARNS) + dst->d_fieldmask |= QC_INO_WARNS; + if (src->d_fieldmask & FS_DQ_RTBWARNS) + dst->d_fieldmask |= QC_RT_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_BCOUNT) + dst->d_fieldmask |= QC_SPACE; + if (src->d_fieldmask & FS_DQ_ICOUNT) + dst->d_fieldmask |= QC_INO_COUNT; + if (src->d_fieldmask & FS_DQ_RTBCOUNT) + dst->d_fieldmask |= QC_RT_SPACE; +} + +static void copy_qcinfo_from_xfs_dqblk(struct qc_info *dst, + struct fs_disk_quota *src) +{ + memset(dst, 0, sizeof(*dst)); + dst->i_spc_timelimit = src->d_btimer; + dst->i_ino_timelimit = src->d_itimer; + dst->i_rt_spc_timelimit = src->d_rtbtimer; + dst->i_ino_warnlimit = src->d_iwarns; + dst->i_spc_warnlimit = src->d_bwarns; + dst->i_rt_spc_warnlimit = src->d_rtbwarns; + if (src->d_fieldmask & FS_DQ_BWARNS) + dst->i_fieldmask |= QC_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_IWARNS) + dst->i_fieldmask |= QC_INO_WARNS; + if (src->d_fieldmask & FS_DQ_RTBWARNS) + dst->i_fieldmask |= QC_RT_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_BTIMER) + dst->i_fieldmask |= QC_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_ITIMER) + dst->i_fieldmask |= QC_INO_TIMER; + if (src->d_fieldmask & FS_DQ_RTBTIMER) + dst->i_fieldmask |= QC_RT_SPC_TIMER; +} + +static int quota_setxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + struct qc_dqblk qdq; + struct kqid qid; + + if (copy_from_user(&fdq, addr, sizeof(fdq))) + return -EFAULT; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + /* Are we actually setting timer / warning limits for all users? */ + if (from_kqid(&init_user_ns, qid) == 0 && + fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) { + struct qc_info qinfo; + int ret; + + if (!sb->s_qcop->set_info) + return -EINVAL; + copy_qcinfo_from_xfs_dqblk(&qinfo, &fdq); + ret = sb->s_qcop->set_info(sb, type, &qinfo); + if (ret) + return ret; + /* These are already done */ + fdq.d_fieldmask &= ~(FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK); + } + copy_from_xfs_dqblk(&qdq, &fdq); + return sb->s_qcop->set_dqblk(sb, qid, &qdq); +} + +static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, + int type, qid_t id) +{ + memset(dst, 0, sizeof(*dst)); + dst->d_version = FS_DQUOT_VERSION; + dst->d_id = id; + if (type == USRQUOTA) + dst->d_flags = FS_USER_QUOTA; + else if (type == PRJQUOTA) + dst->d_flags = FS_PROJ_QUOTA; + else + dst->d_flags = FS_GROUP_QUOTA; + dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit); + dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit); + dst->d_ino_hardlimit = src->d_ino_hardlimit; + dst->d_ino_softlimit = src->d_ino_softlimit; + dst->d_bcount = quota_btobb(src->d_space); + dst->d_icount = src->d_ino_count; + dst->d_itimer = src->d_ino_timer; + dst->d_btimer = src->d_spc_timer; + dst->d_iwarns = src->d_ino_warns; + dst->d_bwarns = src->d_spc_warns; + dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); + dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); + dst->d_rtbcount = quota_btobb(src->d_rt_space); + dst->d_rtbtimer = src->d_rt_spc_timer; + dst->d_rtbwarns = src->d_rt_spc_warns; +} + +static int quota_getxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + struct qc_dqblk qdq; + struct kqid qid; + int ret; + + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); + if (ret) + return ret; + copy_to_xfs_dqblk(&fdq, &qdq, type, id); + if (copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; +} + +static int quota_rmxquota(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->rm_xquota) + return -ENOSYS; + return sb->s_qcop->rm_xquota(sb, flags); +} + +/* Copy parameters and call proper function */ +static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, + void __user *addr, struct path *path) +{ + int ret; + + if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS)) + return -EINVAL; + /* + * Quota not supported on this fs? Check this before s_quota_types + * since they needn't be set if quota is not supported at all. + */ + if (!sb->s_qcop) + return -ENOSYS; + if (!(sb->s_quota_types & (1 << type))) + return -EINVAL; + + ret = check_quotactl_permission(sb, type, cmd, id); + if (ret < 0) + return ret; + + switch (cmd) { + case Q_QUOTAON: + return quota_quotaon(sb, type, cmd, id, path); + case Q_QUOTAOFF: + return quota_quotaoff(sb, type); + case Q_GETFMT: + return quota_getfmt(sb, type, addr); + case Q_GETINFO: + return quota_getinfo(sb, type, addr); + case Q_SETINFO: + return quota_setinfo(sb, type, addr); + case Q_GETQUOTA: + return quota_getquota(sb, type, id, addr); + case Q_SETQUOTA: + return quota_setquota(sb, type, id, addr); + case Q_SYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + return sb->s_qcop->quota_sync(sb, type); + case Q_XQUOTAON: + return quota_enable(sb, addr); + case Q_XQUOTAOFF: + return quota_disable(sb, addr); + case Q_XQUOTARM: + return quota_rmxquota(sb, addr); + case Q_XGETQSTAT: + return quota_getxstate(sb, addr); + case Q_XGETQSTATV: + return quota_getxstatev(sb, addr); + case Q_XSETQLIM: + return quota_setxquota(sb, type, id, addr); + case Q_XGETQUOTA: + return quota_getxquota(sb, type, id, addr); + case Q_XQUOTASYNC: + if (sb->s_flags & MS_RDONLY) + return -EROFS; + /* XFS quotas are fully coherent now, making this call a noop */ + return 0; + default: + return -EINVAL; + } +} + +#ifdef CONFIG_BLOCK + +/* Return 1 if 'cmd' will block on frozen filesystem */ +static int quotactl_cmd_write(int cmd) +{ + switch (cmd) { + case Q_GETFMT: + case Q_GETINFO: + case Q_SYNC: + case Q_XGETQSTAT: + case Q_XGETQSTATV: + case Q_XGETQUOTA: + case Q_XQUOTASYNC: + return 0; + } + return 1; +} + +#endif /* CONFIG_BLOCK */ + +/* + * look up a superblock on which quota ops will be performed + * - use the name of a block device to find the superblock thereon + */ +static struct super_block *quotactl_block(const char __user *special, int cmd) +{ +#ifdef CONFIG_BLOCK + struct block_device *bdev; + struct super_block *sb; + struct filename *tmp = getname(special); + + if (IS_ERR(tmp)) + return ERR_CAST(tmp); + bdev = lookup_bdev(tmp->name); + putname(tmp); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + if (quotactl_cmd_write(cmd)) + sb = get_super_thawed(bdev); + else + sb = get_super(bdev); + bdput(bdev); + if (!sb) + return ERR_PTR(-ENODEV); + + return sb; +#else + return ERR_PTR(-ENODEV); +#endif +} + +/* + * This is the system call interface. This communicates with + * the user-level programs. Currently this only supports diskquota + * calls. Maybe we need to add the process quotas etc. in the future, + * but we probably should use rlimits for that. + */ +SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, + qid_t, id, void __user *, addr) +{ + uint cmds, type; + struct super_block *sb = NULL; + struct path path, *pathp = NULL; + int ret; + + cmds = cmd >> SUBCMDSHIFT; + type = cmd & SUBCMDMASK; + + /* + * As a special case Q_SYNC can be called without a specific device. + * It will iterate all superblocks that have quota enabled and call + * the sync action on each of them. + */ + if (!special) { + if (cmds == Q_SYNC) + return quota_sync_all(type); + return -ENODEV; + } + + /* + * Path for quotaon has to be resolved before grabbing superblock + * because that gets s_umount sem which is also possibly needed by path + * resolution (think about autofs) and thus deadlocks could arise. + */ + if (cmds == Q_QUOTAON) { + ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + if (ret) + pathp = ERR_PTR(ret); + else + pathp = &path; + } + + sb = quotactl_block(special, cmds); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } + + ret = do_quotactl(sb, type, cmds, id, addr, pathp); + + drop_super(sb); +out: + if (pathp && !IS_ERR(pathp)) + path_put(pathp); + return ret; +} diff --git a/kernel/fs/quota/quota_tree.c b/kernel/fs/quota/quota_tree.c new file mode 100644 index 000000000..58efb83de --- /dev/null +++ b/kernel/fs/quota/quota_tree.c @@ -0,0 +1,670 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quota_tree.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Quota trie support"); +MODULE_LICENSE("GPL"); + +#define __QUOTA_QT_PARANOIA + +static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth) +{ + unsigned int epb = info->dqi_usable_bs >> 2; + qid_t id = from_kqid(&init_user_ns, qid); + + depth = info->dqi_qtree_depth - depth - 1; + while (depth--) + id /= epb; + return id % epb; +} + +/* Number of entries in one blocks */ +static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) +{ + return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) + / info->dqi_entry_size; +} + +static char *getdqbuf(size_t size) +{ + char *buf = kmalloc(size, GFP_NOFS); + if (!buf) + printk(KERN_WARNING + "VFS: Not enough memory for quota buffers.\n"); + return buf; +} + +static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) +{ + struct super_block *sb = info->dqi_sb; + + memset(buf, 0, info->dqi_usable_bs); + return sb->s_op->quota_read(sb, info->dqi_type, buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); +} + +static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) +{ + struct super_block *sb = info->dqi_sb; + ssize_t ret; + + ret = sb->s_op->quota_write(sb, info->dqi_type, buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + if (ret != info->dqi_usable_bs) { + quota_error(sb, "dquota write failed"); + if (ret >= 0) + ret = -EIO; + } + return ret; +} + +/* Remove empty block from list and return it */ +static int get_free_dqblk(struct qtree_mem_dqinfo *info) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int ret, blk; + + if (!buf) + return -ENOMEM; + if (info->dqi_free_blk) { + blk = info->dqi_free_blk; + ret = read_blk(info, blk, buf); + if (ret < 0) + goto out_buf; + info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); + } + else { + memset(buf, 0, info->dqi_usable_bs); + /* Assure block allocation... */ + ret = write_blk(info, info->dqi_blocks, buf); + if (ret < 0) + goto out_buf; + blk = info->dqi_blocks++; + } + mark_info_dirty(info->dqi_sb, info->dqi_type); + ret = blk; +out_buf: + kfree(buf); + return ret; +} + +/* Insert empty block to the list */ +static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk) +{ + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); + dh->dqdh_prev_free = cpu_to_le32(0); + dh->dqdh_entries = cpu_to_le16(0); + err = write_blk(info, blk, buf); + if (err < 0) + return err; + info->dqi_free_blk = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +} + +/* Remove given block from the list of blocks with free entries */ +static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, + uint blk) +{ + char *tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + uint nextblk = le32_to_cpu(dh->dqdh_next_free); + uint prevblk = le32_to_cpu(dh->dqdh_prev_free); + int err; + + if (!tmpbuf) + return -ENOMEM; + if (nextblk) { + err = read_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + dh->dqdh_prev_free; + err = write_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + } + if (prevblk) { + err = read_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = + dh->dqdh_next_free; + err = write_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + } else { + info->dqi_free_entry = nextblk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + } + kfree(tmpbuf); + dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); + /* No matter whether write succeeds block is out of list */ + if (write_blk(info, blk, buf) < 0) + quota_error(info->dqi_sb, "Can't write block (%u) " + "with free entries", blk); + return 0; +out_buf: + kfree(tmpbuf); + return err; +} + +/* Insert given block to the beginning of list with free entries */ +static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, + uint blk) +{ + char *tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + if (!tmpbuf) + return -ENOMEM; + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); + dh->dqdh_prev_free = cpu_to_le32(0); + err = write_blk(info, blk, buf); + if (err < 0) + goto out_buf; + if (info->dqi_free_entry) { + err = read_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + cpu_to_le32(blk); + err = write_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + } + kfree(tmpbuf); + info->dqi_free_entry = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +out_buf: + kfree(tmpbuf); + return err; +} + +/* Is the entry in the block free? */ +int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) +{ + int i; + + for (i = 0; i < info->dqi_entry_size; i++) + if (disk[i]) + return 0; + return 1; +} +EXPORT_SYMBOL(qtree_entry_unused); + +/* Find space for dquot */ +static uint find_free_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, int *err) +{ + uint blk, i; + struct qt_disk_dqdbheader *dh; + char *buf = getdqbuf(info->dqi_usable_bs); + char *ddquot; + + *err = 0; + if (!buf) { + *err = -ENOMEM; + return 0; + } + dh = (struct qt_disk_dqdbheader *)buf; + if (info->dqi_free_entry) { + blk = info->dqi_free_entry; + *err = read_blk(info, blk, buf); + if (*err < 0) + goto out_buf; + } else { + blk = get_free_dqblk(info); + if ((int)blk < 0) { + *err = blk; + kfree(buf); + return 0; + } + memset(buf, 0, info->dqi_usable_bs); + /* This is enough as the block is already zeroed and the entry + * list is empty... */ + info->dqi_free_entry = blk; + mark_info_dirty(dquot->dq_sb, dquot->dq_id.type); + } + /* Block will be full? */ + if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { + *err = remove_free_dqentry(info, buf, blk); + if (*err < 0) { + quota_error(dquot->dq_sb, "Can't remove block (%u) " + "from entry free list", blk); + goto out_buf; + } + } + le16_add_cpu(&dh->dqdh_entries, 1); + /* Find free structure in block */ + ddquot = buf + sizeof(struct qt_disk_dqdbheader); + for (i = 0; i < qtree_dqstr_in_blk(info); i++) { + if (qtree_entry_unused(info, ddquot)) + break; + ddquot += info->dqi_entry_size; + } +#ifdef __QUOTA_QT_PARANOIA + if (i == qtree_dqstr_in_blk(info)) { + quota_error(dquot->dq_sb, "Data block full but it shouldn't"); + *err = -EIO; + goto out_buf; + } +#endif + *err = write_blk(info, blk, buf); + if (*err < 0) { + quota_error(dquot->dq_sb, "Can't write quota data block %u", + blk); + goto out_buf; + } + dquot->dq_off = (blk << info->dqi_blocksize_bits) + + sizeof(struct qt_disk_dqdbheader) + + i * info->dqi_entry_size; + kfree(buf); + return blk; +out_buf: + kfree(buf); + return 0; +} + +/* Insert reference to structure into the trie */ +static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *treeblk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0, newson = 0, newact = 0; + __le32 *ref; + uint newblk; + + if (!buf) + return -ENOMEM; + if (!*treeblk) { + ret = get_free_dqblk(info); + if (ret < 0) + goto out_buf; + *treeblk = ret; + memset(buf, 0, info->dqi_usable_bs); + newact = 1; + } else { + ret = read_blk(info, *treeblk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read tree quota " + "block %u", *treeblk); + goto out_buf; + } + } + ref = (__le32 *)buf; + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!newblk) + newson = 1; + if (depth == info->dqi_qtree_depth - 1) { +#ifdef __QUOTA_QT_PARANOIA + if (newblk) { + quota_error(dquot->dq_sb, "Inserting already present " + "quota entry (block %u)", + le32_to_cpu(ref[get_index(info, + dquot->dq_id, depth)])); + ret = -EIO; + goto out_buf; + } +#endif + newblk = find_free_dqentry(info, dquot, &ret); + } else { + ret = do_insert_tree(info, dquot, &newblk, depth+1); + } + if (newson && ret >= 0) { + ref[get_index(info, dquot->dq_id, depth)] = + cpu_to_le32(newblk); + ret = write_blk(info, *treeblk, buf); + } else if (newact && ret < 0) { + put_free_dqblk(info, buf, *treeblk); + } +out_buf: + kfree(buf); + return ret; +} + +/* Wrapper for inserting quota structure into tree */ +static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, + struct dquot *dquot) +{ + int tmp = QT_TREEOFF; + +#ifdef __QUOTA_QT_PARANOIA + if (info->dqi_blocks <= QT_TREEOFF) { + quota_error(dquot->dq_sb, "Quota tree root isn't allocated!"); + return -EIO; + } +#endif + return do_insert_tree(info, dquot, &tmp, 0); +} + +/* + * We don't have to be afraid of deadlocks as we never have quotas on quota + * files... + */ +int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_id.type; + struct super_block *sb = dquot->dq_sb; + ssize_t ret; + char *ddquot = getdqbuf(info->dqi_entry_size); + + if (!ddquot) + return -ENOMEM; + + /* dq_off is guarded by dqio_mutex */ + if (!dquot->dq_off) { + ret = dq_insert_tree(info, dquot); + if (ret < 0) { + quota_error(sb, "Error %zd occurred while creating " + "quota", ret); + kfree(ddquot); + return ret; + } + } + spin_lock(&dq_data_lock); + info->dqi_ops->mem2disk_dqblk(ddquot, dquot); + spin_unlock(&dq_data_lock); + ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, + dquot->dq_off); + if (ret != info->dqi_entry_size) { + quota_error(sb, "dquota write failed"); + if (ret >= 0) + ret = -ENOSPC; + } else { + ret = 0; + } + dqstats_inc(DQST_WRITES); + kfree(ddquot); + + return ret; +} +EXPORT_SYMBOL(qtree_write_dquot); + +/* Free dquot entry in data block */ +static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint blk) +{ + struct qt_disk_dqdbheader *dh; + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + + if (!buf) + return -ENOMEM; + if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { + quota_error(dquot->dq_sb, "Quota structure has offset to " + "other block (%u) than it should (%u)", blk, + (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + goto out_buf; + } + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota data block %u", + blk); + goto out_buf; + } + dh = (struct qt_disk_dqdbheader *)buf; + le16_add_cpu(&dh->dqdh_entries, -1); + if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ + ret = remove_free_dqentry(info, buf, blk); + if (ret >= 0) + ret = put_free_dqblk(info, buf, blk); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't move quota data block " + "(%u) to free list", blk); + goto out_buf; + } + } else { + memset(buf + + (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), + 0, info->dqi_entry_size); + if (le16_to_cpu(dh->dqdh_entries) == + qtree_dqstr_in_blk(info) - 1) { + /* Insert will write block itself */ + ret = insert_free_dqentry(info, buf, blk); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't insert quota " + "data block (%u) to free entry list", blk); + goto out_buf; + } + } else { + ret = write_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't write quota " + "data block %u", blk); + goto out_buf; + } + } + } + dquot->dq_off = 0; /* Quota is now unattached */ +out_buf: + kfree(buf); + return ret; +} + +/* Remove reference to dquot from tree */ +static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *blk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + uint newblk; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, *blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota data block %u", + *blk); + goto out_buf; + } + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (depth == info->dqi_qtree_depth - 1) { + ret = free_dqentry(info, dquot, newblk); + newblk = 0; + } else { + ret = remove_tree(info, dquot, &newblk, depth+1); + } + if (ret >= 0 && !newblk) { + int i; + ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); + /* Block got empty? */ + for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++) + ; + /* Don't put the root block into the free block list */ + if (i == (info->dqi_usable_bs >> 2) + && *blk != QT_TREEOFF) { + put_free_dqblk(info, buf, *blk); + *blk = 0; + } else { + ret = write_blk(info, *blk, buf); + if (ret < 0) + quota_error(dquot->dq_sb, + "Can't write quota tree block %u", + *blk); + } + } +out_buf: + kfree(buf); + return ret; +} + +/* Delete dquot from tree */ +int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + uint tmp = QT_TREEOFF; + + if (!dquot->dq_off) /* Even not allocated? */ + return 0; + return remove_tree(info, dquot, &tmp, 0); +} +EXPORT_SYMBOL(qtree_delete_dquot); + +/* Find entry in block */ +static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + int i; + char *ddquot; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota tree " + "block %u", blk); + goto out_buf; + } + ddquot = buf + sizeof(struct qt_disk_dqdbheader); + for (i = 0; i < qtree_dqstr_in_blk(info); i++) { + if (info->dqi_ops->is_id(ddquot, dquot)) + break; + ddquot += info->dqi_entry_size; + } + if (i == qtree_dqstr_in_blk(info)) { + quota_error(dquot->dq_sb, + "Quota for id %u referenced but not present", + from_kqid(&init_user_ns, dquot->dq_id)); + ret = -EIO; + goto out_buf; + } else { + ret = (blk << info->dqi_blocksize_bits) + sizeof(struct + qt_disk_dqdbheader) + i * info->dqi_entry_size; + } +out_buf: + kfree(buf); + return ret; +} + +/* Find entry for given id in the tree */ +static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota tree block %u", + blk); + goto out_buf; + } + ret = 0; + blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!blk) /* No reference? */ + goto out_buf; + if (depth < info->dqi_qtree_depth - 1) + ret = find_tree_dqentry(info, dquot, blk, depth+1); + else + ret = find_block_dqentry(info, dquot, blk); +out_buf: + kfree(buf); + return ret; +} + +/* Find entry for given id in the tree - wrapper function */ +static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot) +{ + return find_tree_dqentry(info, dquot, QT_TREEOFF, 0); +} + +int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_id.type; + struct super_block *sb = dquot->dq_sb; + loff_t offset; + char *ddquot; + int ret = 0; + +#ifdef __QUOTA_QT_PARANOIA + /* Invalidated quota? */ + if (!sb_dqopt(dquot->dq_sb)->files[type]) { + quota_error(sb, "Quota invalidated while reading!"); + return -EIO; + } +#endif + /* Do we know offset of the dquot entry in the quota file? */ + if (!dquot->dq_off) { + offset = find_dqentry(info, dquot); + if (offset <= 0) { /* Entry not present? */ + if (offset < 0) + quota_error(sb,"Can't read quota structure " + "for id %u", + from_kqid(&init_user_ns, + dquot->dq_id)); + dquot->dq_off = 0; + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + ret = offset; + goto out; + } + dquot->dq_off = offset; + } + ddquot = getdqbuf(info->dqi_entry_size); + if (!ddquot) + return -ENOMEM; + ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size, + dquot->dq_off); + if (ret != info->dqi_entry_size) { + if (ret >= 0) + ret = -EIO; + quota_error(sb, "Error while reading quota structure for id %u", + from_kqid(&init_user_ns, dquot->dq_id)); + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + kfree(ddquot); + goto out; + } + spin_lock(&dq_data_lock); + info->dqi_ops->disk2mem_dqblk(dquot, ddquot); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && + !dquot->dq_dqb.dqb_isoftlimit) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + kfree(ddquot); +out: + dqstats_inc(DQST_READS); + return ret; +} +EXPORT_SYMBOL(qtree_read_dquot); + +/* Check whether dquot should not be deleted. We know we are + * the only one operating on dquot (thanks to dq_lock) */ +int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && + !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) + return qtree_delete_dquot(info, dquot); + return 0; +} +EXPORT_SYMBOL(qtree_release_dquot); diff --git a/kernel/fs/quota/quota_tree.h b/kernel/fs/quota/quota_tree.h new file mode 100644 index 000000000..a1ab8db81 --- /dev/null +++ b/kernel/fs/quota/quota_tree.h @@ -0,0 +1,25 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTA_TREE_H +#define _LINUX_QUOTA_TREE_H + +#include +#include + +/* + * Structure of header of block with quota structures. It is padded to 16 bytes so + * there will be space for exactly 21 quota-entries in a block + */ +struct qt_disk_dqdbheader { + __le32 dqdh_next_free; /* Number of next block with free entry */ + __le32 dqdh_prev_free; /* Number of previous block with free entry */ + __le16 dqdh_entries; /* Number of valid entries in block */ + __le16 dqdh_pad1; + __le32 dqdh_pad2; +}; + +#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ + +#endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/kernel/fs/quota/quota_v1.c b/kernel/fs/quota/quota_v1.c new file mode 100644 index 000000000..8fe79bece --- /dev/null +++ b/kernel/fs/quota/quota_v1.c @@ -0,0 +1,235 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quotaio_v1.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Old quota format support"); +MODULE_LICENSE("GPL"); + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +static inline qsize_t v1_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v1_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} + +static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) +{ + m->dqb_ihardlimit = d->dqb_ihardlimit; + m->dqb_isoftlimit = d->dqb_isoftlimit; + m->dqb_curinodes = d->dqb_curinodes; + m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit); + m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit); + m->dqb_curspace = v1_qbtos(d->dqb_curblocks); + m->dqb_itime = d->dqb_itime; + m->dqb_btime = d->dqb_btime; +} + +static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m) +{ + d->dqb_ihardlimit = m->dqb_ihardlimit; + d->dqb_isoftlimit = m->dqb_isoftlimit; + d->dqb_curinodes = m->dqb_curinodes; + d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit); + d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit); + d->dqb_curblocks = v1_stoqb(m->dqb_curspace); + d->dqb_itime = m->dqb_itime; + d->dqb_btime = m->dqb_btime; +} + +static int v1_read_dqblk(struct dquot *dquot) +{ + int type = dquot->dq_id.type; + struct v1_disk_dqblk dqblk; + + if (!sb_dqopt(dquot->dq_sb)->files[type]) + return -EINVAL; + + /* Set structure to 0s in case read fails/is after end of file */ + memset(&dqblk, 0, sizeof(struct v1_disk_dqblk)); + dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), + v1_dqoff(from_kqid(&init_user_ns, dquot->dq_id))); + + v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); + if (dquot->dq_dqb.dqb_bhardlimit == 0 && + dquot->dq_dqb.dqb_bsoftlimit == 0 && + dquot->dq_dqb.dqb_ihardlimit == 0 && + dquot->dq_dqb.dqb_isoftlimit == 0) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + dqstats_inc(DQST_READS); + + return 0; +} + +static int v1_commit_dqblk(struct dquot *dquot) +{ + short type = dquot->dq_id.type; + ssize_t ret; + struct v1_disk_dqblk dqblk; + + v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); + if (((type == USRQUOTA) && uid_eq(dquot->dq_id.uid, GLOBAL_ROOT_UID)) || + ((type == GRPQUOTA) && gid_eq(dquot->dq_id.gid, GLOBAL_ROOT_GID))) { + dqblk.dqb_btime = + sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; + dqblk.dqb_itime = + sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace; + } + ret = 0; + if (sb_dqopt(dquot->dq_sb)->files[type]) + ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, + (char *)&dqblk, sizeof(struct v1_disk_dqblk), + v1_dqoff(from_kqid(&init_user_ns, dquot->dq_id))); + if (ret != sizeof(struct v1_disk_dqblk)) { + quota_error(dquot->dq_sb, "dquota write failed"); + if (ret >= 0) + ret = -EIO; + goto out; + } + ret = 0; + +out: + dqstats_inc(DQST_WRITES); + + return ret; +} + +/* Magics of new quota format */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} + +/* Header of new quota format */ +struct v2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* File version */ +}; + +static int v1_check_quota_file(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ulong blocks; + size_t off; + struct v2_disk_dqheader dqhead; + ssize_t size; + loff_t isize; + static const uint quota_magics[] = V2_INITQMAGICS; + + isize = i_size_read(inode); + if (!isize) + return 0; + blocks = isize >> BLOCK_SIZE_BITS; + off = isize & (BLOCK_SIZE - 1); + if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % + sizeof(struct v1_disk_dqblk)) + return 0; + /* Doublecheck whether we didn't get file with new format - with old + * quotactl() this could happen */ + size = sb->s_op->quota_read(sb, type, (char *)&dqhead, + sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) + return 1; /* Probably not new format */ + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type]) + return 1; /* Definitely not new format */ + printk(KERN_INFO + "VFS: %s: Refusing to turn on old quota format on given file." + " It probably contains newer quota format.\n", sb->s_id); + return 0; /* Seems like a new format file -> refuse it */ +} + +static int v1_read_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct v1_disk_dqblk dqblk; + int ret; + + ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + ret = 0; + /* limits are stored as unsigned 32-bit data */ + dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS; + dqopt->info[type].dqi_max_ino_limit = 0xffffffff; + dqopt->info[type].dqi_igrace = + dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; + dqopt->info[type].dqi_bgrace = + dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; +out: + return ret; +} + +static int v1_write_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct v1_disk_dqblk dqblk; + int ret; + + dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; + ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + dqblk.dqb_itime = dqopt->info[type].dqi_igrace; + dqblk.dqb_btime = dqopt->info[type].dqi_bgrace; + ret = sb->s_op->quota_write(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret == sizeof(struct v1_disk_dqblk)) + ret = 0; + else if (ret > 0) + ret = -EIO; +out: + return ret; +} + +static const struct quota_format_ops v1_format_ops = { + .check_quota_file = v1_check_quota_file, + .read_file_info = v1_read_file_info, + .write_file_info = v1_write_file_info, + .free_file_info = NULL, + .read_dqblk = v1_read_dqblk, + .commit_dqblk = v1_commit_dqblk, +}; + +static struct quota_format_type v1_quota_format = { + .qf_fmt_id = QFMT_VFS_OLD, + .qf_ops = &v1_format_ops, + .qf_owner = THIS_MODULE +}; + +static int __init init_v1_quota_format(void) +{ + return register_quota_format(&v1_quota_format); +} + +static void __exit exit_v1_quota_format(void) +{ + unregister_quota_format(&v1_quota_format); +} + +module_init(init_v1_quota_format); +module_exit(exit_v1_quota_format); + diff --git a/kernel/fs/quota/quota_v2.c b/kernel/fs/quota/quota_v2.c new file mode 100644 index 000000000..2aa012a68 --- /dev/null +++ b/kernel/fs/quota/quota_v2.c @@ -0,0 +1,346 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quota_tree.h" +#include "quotaio_v2.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Quota format v2 support"); +MODULE_LICENSE("GPL"); + +#define __QUOTA_V2_PARANOIA + +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r0_is_id(void *dp, struct dquot *dquot); +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r1_is_id(void *dp, struct dquot *dquot); + +static struct qtree_fmt_operations v2r0_qtree_ops = { + .mem2disk_dqblk = v2r0_mem2diskdqb, + .disk2mem_dqblk = v2r0_disk2memdqb, + .is_id = v2r0_is_id, +}; + +static struct qtree_fmt_operations v2r1_qtree_ops = { + .mem2disk_dqblk = v2r1_mem2diskdqb, + .disk2mem_dqblk = v2r1_disk2memdqb, + .is_id = v2r1_is_id, +}; + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +static inline qsize_t v2_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v2_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} + +static int v2_read_header(struct super_block *sb, int type, + struct v2_disk_dqheader *dqhead) +{ + ssize_t size; + + size = sb->s_op->quota_read(sb, type, (char *)dqhead, + sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) { + quota_error(sb, "Failed header read: expected=%zd got=%zd", + sizeof(struct v2_disk_dqheader), size); + return 0; + } + return 1; +} + +/* Check whether given file is really vfsv0 quotafile */ +static int v2_check_quota_file(struct super_block *sb, int type) +{ + struct v2_disk_dqheader dqhead; + static const uint quota_magics[] = V2_INITQMAGICS; + static const uint quota_versions[] = V2_INITQVERSIONS; + + if (!v2_read_header(sb, type, &dqhead)) + return 0; + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || + le32_to_cpu(dqhead.dqh_version) > quota_versions[type]) + return 0; + return 1; +} + +/* Read information header from quota file */ +static int v2_read_file_info(struct super_block *sb, int type) +{ + struct v2_disk_dqinfo dinfo; + struct v2_disk_dqheader dqhead; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo; + ssize_t size; + unsigned int version; + + if (!v2_read_header(sb, type, &dqhead)) + return -1; + version = le32_to_cpu(dqhead.dqh_version); + if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) + return -1; + + size = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + quota_error(sb, "Can't read info structure"); + return -1; + } + info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); + if (!info->dqi_priv) { + printk(KERN_WARNING + "Not enough memory for quota information structure.\n"); + return -ENOMEM; + } + qinfo = info->dqi_priv; + if (version == 0) { + /* limits are stored as unsigned 32-bit data */ + info->dqi_max_spc_limit = 0xffffffffLL << QUOTABLOCK_BITS; + info->dqi_max_ino_limit = 0xffffffff; + } else { + /* + * Used space is stored as unsigned 64-bit value in bytes but + * quota core supports only signed 64-bit values so use that + * as a limit + */ + info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */ + info->dqi_max_ino_limit = 0x7fffffffffffffffLL; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + /* No flags currently supported */ + info->dqi_flags = 0; + qinfo->dqi_sb = sb; + qinfo->dqi_type = type; + qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; + qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; + qinfo->dqi_qtree_depth = qtree_depth(qinfo); + if (version == 0) { + qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk); + qinfo->dqi_ops = &v2r0_qtree_ops; + } else { + qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); + qinfo->dqi_ops = &v2r1_qtree_ops; + } + return 0; +} + +/* Write information header to quota file */ +static int v2_write_file_info(struct super_block *sb, int type) +{ + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo = info->dqi_priv; + ssize_t size; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + /* No flags currently supported */ + dinfo.dqi_flags = cpu_to_le32(0); + spin_unlock(&dq_data_lock); + dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry); + size = sb->s_op->quota_write(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + quota_error(sb, "Can't write info structure"); + return -1; + } + return 0; +} + +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct v2r0_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); + m->dqb_itime = le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit)); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2r0_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk))) + m->dqb_itime = 0; +} + +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct v2r0_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; + + d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit)); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id)); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); +} + +static int v2r0_is_id(void *dp, struct dquot *dquot) +{ + struct v2r0_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; + + if (qtree_entry_unused(info, dp)) + return 0; + return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type, + le32_to_cpu(d->dqb_id)), + dquot->dq_id); +} + +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct v2r1_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + m->dqb_itime = le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit)); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2r1_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk))) + m->dqb_itime = 0; +} + +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; + + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit)); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id)); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); +} + +static int v2r1_is_id(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; + + if (qtree_entry_unused(info, dp)) + return 0; + return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type, + le32_to_cpu(d->dqb_id)), + dquot->dq_id); +} + +static int v2_read_dquot(struct dquot *dquot) +{ + return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); +} + +static int v2_write_dquot(struct dquot *dquot) +{ + return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); +} + +static int v2_release_dquot(struct dquot *dquot) +{ + return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); +} + +static int v2_free_file_info(struct super_block *sb, int type) +{ + kfree(sb_dqinfo(sb, type)->dqi_priv); + return 0; +} + +static const struct quota_format_ops v2_format_ops = { + .check_quota_file = v2_check_quota_file, + .read_file_info = v2_read_file_info, + .write_file_info = v2_write_file_info, + .free_file_info = v2_free_file_info, + .read_dqblk = v2_read_dquot, + .commit_dqblk = v2_write_dquot, + .release_dqblk = v2_release_dquot, +}; + +static struct quota_format_type v2r0_quota_format = { + .qf_fmt_id = QFMT_VFS_V0, + .qf_ops = &v2_format_ops, + .qf_owner = THIS_MODULE +}; + +static struct quota_format_type v2r1_quota_format = { + .qf_fmt_id = QFMT_VFS_V1, + .qf_ops = &v2_format_ops, + .qf_owner = THIS_MODULE +}; + +static int __init init_v2_quota_format(void) +{ + int ret; + + ret = register_quota_format(&v2r0_quota_format); + if (ret) + return ret; + return register_quota_format(&v2r1_quota_format); +} + +static void __exit exit_v2_quota_format(void) +{ + unregister_quota_format(&v2r0_quota_format); + unregister_quota_format(&v2r1_quota_format); +} + +module_init(init_v2_quota_format); +module_exit(exit_v2_quota_format); diff --git a/kernel/fs/quota/quotaio_v1.h b/kernel/fs/quota/quotaio_v1.h new file mode 100644 index 000000000..746654b5d --- /dev/null +++ b/kernel/fs/quota/quotaio_v1.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_QUOTAIO_V1_H +#define _LINUX_QUOTAIO_V1_H + +#include + +/* + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit, it is reset when they go below their soft limit. + */ +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. + */ +struct v1_disk_dqblk { + __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ + __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ + __u32 dqb_curblocks; /* current block count */ + __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __u32 dqb_isoftlimit; /* preferred inode limit */ + __u32 dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ +}; + +#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) + +#endif /* _LINUX_QUOTAIO_V1_H */ diff --git a/kernel/fs/quota/quotaio_v2.h b/kernel/fs/quota/quotaio_v2.h new file mode 100644 index 000000000..4e9543009 --- /dev/null +++ b/kernel/fs/quota/quotaio_v2.h @@ -0,0 +1,75 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTAIO_V2_H +#define _LINUX_QUOTAIO_V2_H + +#include +#include + +/* + * Definitions of magics and versions of current quota files + */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927, /* GRPQUOTA */\ + 0xd9c03f14, /* PRJQUOTA */\ +} + +#define V2_INITQVERSIONS {\ + 1, /* USRQUOTA */\ + 1, /* GRPQUOTA */\ + 1, /* PRJQUOTA */\ +} + +/* First generic header */ +struct v2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* File version */ +}; + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is a radix tree whose leaves point + * to blocks of these structures. + */ +struct v2r0_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le32 dqb_isoftlimit; /* preferred inode limit */ + __le32 dqb_curinodes; /* current # allocated inodes */ + __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + +struct v2r1_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_pad; + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ + __le64 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + +/* Header with type and version specific information */ +struct v2_disk_dqinfo { + __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ + __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */ + __le32 dqi_flags; /* Flags for quotafile (DQF_*) */ + __le32 dqi_blocks; /* Number of blocks in file */ + __le32 dqi_free_blk; /* Number of first free block in the list */ + __le32 dqi_free_entry; /* Number of block with at least one free entry */ +}; + +#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */ +#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */ + +#endif /* _LINUX_QUOTAIO_V2_H */ diff --git a/kernel/fs/ramfs/Makefile b/kernel/fs/ramfs/Makefile new file mode 100644 index 000000000..c71e65dca --- /dev/null +++ b/kernel/fs/ramfs/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the linux ramfs routines. +# + +obj-y += ramfs.o + +file-mmu-y := file-nommu.o +file-mmu-$(CONFIG_MMU) := file-mmu.o +ramfs-objs += inode.o $(file-mmu-y) diff --git a/kernel/fs/ramfs/file-mmu.c b/kernel/fs/ramfs/file-mmu.c new file mode 100644 index 000000000..183a21269 --- /dev/null +++ b/kernel/fs/ramfs/file-mmu.c @@ -0,0 +1,46 @@ +/* file-mmu.c: ramfs MMU-based file operations + * + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * + * Usage limits added by David Gibson, Linuxcare Australia. + * This file is released under the GPL. + */ + +/* + * NOTE! This filesystem is probably most useful + * not as a real filesystem, but as an example of + * how virtual filesystems can be written. + * + * It doesn't get much simpler than this. Consider + * that this file implements the full semantics of + * a POSIX-compliant read-write filesystem. + * + * Note in particular how the filesystem does not + * need to implement any data structures of its own + * to keep track of the virtual data: using the VFS + * caches is sufficient. + */ + +#include +#include +#include + +#include "internal.h" + +const struct file_operations ramfs_file_operations = { + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations ramfs_file_inode_operations = { + .setattr = simple_setattr, + .getattr = simple_getattr, +}; diff --git a/kernel/fs/ramfs/file-nommu.c b/kernel/fs/ramfs/file-nommu.c new file mode 100644 index 000000000..ba1323a94 --- /dev/null +++ b/kernel/fs/ramfs/file-nommu.c @@ -0,0 +1,271 @@ +/* file-nommu.c: no-MMU version of ramfs + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +static int ramfs_nommu_setattr(struct dentry *, struct iattr *); +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags); +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); + +static unsigned ramfs_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ | + NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + +const struct file_operations ramfs_file_operations = { + .mmap_capabilities = ramfs_mmap_capabilities, + .mmap = ramfs_nommu_mmap, + .get_unmapped_area = ramfs_nommu_get_unmapped_area, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations ramfs_file_inode_operations = { + .setattr = ramfs_nommu_setattr, + .getattr = simple_getattr, +}; + +/*****************************************************************************/ +/* + * add a contiguous set of pages into a ramfs inode when it's truncated from + * size 0 on the assumption that it's going to be used for an mmap of shared + * memory + */ +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +{ + unsigned long npages, xpages, loop; + struct page *pages; + unsigned order; + void *data; + int ret; + + /* make various checks */ + order = get_order(newsize); + if (unlikely(order >= MAX_ORDER)) + return -EFBIG; + + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; + + i_size_write(inode, newsize); + + /* allocate enough contiguous pages to be able to satisfy the + * request */ + pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order); + if (!pages) + return -ENOMEM; + + /* split the high-order page into an array of single pages */ + xpages = 1UL << order; + npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + + split_page(pages, order); + + /* trim off any pages we don't actually require */ + for (loop = npages; loop < xpages; loop++) + __free_page(pages + loop); + + /* clear the memory we allocated */ + newsize = PAGE_SIZE * npages; + data = page_address(pages); + memset(data, 0, newsize); + + /* attach all the pages to the inode's address space */ + for (loop = 0; loop < npages; loop++) { + struct page *page = pages + loop; + + ret = add_to_page_cache_lru(page, inode->i_mapping, loop, + GFP_KERNEL); + if (ret < 0) + goto add_error; + + /* prevent the page from being discarded on memory pressure */ + SetPageDirty(page); + SetPageUptodate(page); + + unlock_page(page); + put_page(page); + } + + return 0; + +add_error: + while (loop < npages) + __free_page(pages + loop++); + return ret; +} + +/*****************************************************************************/ +/* + * + */ +static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) +{ + int ret; + + /* assume a truncate from zero size is going to be for the purposes of + * shared mmap */ + if (size == 0) { + if (unlikely(newsize >> 32)) + return -EFBIG; + + return ramfs_nommu_expand_for_mapping(inode, newsize); + } + + /* check that a decrease in size doesn't cut off any shared mappings */ + if (newsize < size) { + ret = nommu_shrink_inode_mappings(inode, size, newsize); + if (ret < 0) + return ret; + } + + truncate_setsize(inode, newsize); + return 0; +} + +/*****************************************************************************/ +/* + * handle a change of attributes + * - we're specifically interested in a change of size + */ +static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) +{ + struct inode *inode = d_inode(dentry); + unsigned int old_ia_valid = ia->ia_valid; + int ret = 0; + + /* POSIX UID/GID verification for setting inode attributes */ + ret = inode_change_ok(inode, ia); + if (ret) + return ret; + + /* pick out size-changing events */ + if (ia->ia_valid & ATTR_SIZE) { + loff_t size = inode->i_size; + + if (ia->ia_size != size) { + ret = ramfs_nommu_resize(inode, ia->ia_size, size); + if (ret < 0 || ia->ia_valid == ATTR_SIZE) + goto out; + } else { + /* we skipped the truncate but must still update + * timestamps + */ + ia->ia_valid |= ATTR_MTIME|ATTR_CTIME; + } + } + + setattr_copy(inode, ia); + out: + ia->ia_valid = old_ia_valid; + return ret; +} + +/*****************************************************************************/ +/* + * try to determine where a shared mapping can be made + * - we require that: + * - the pages to be mapped must exist + * - the pages be physically contiguous in sequence + */ +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long maxpages, lpages, nr, loop, ret; + struct inode *inode = file_inode(file); + struct page **pages = NULL, **ptr, *page; + loff_t isize; + + if (!(flags & MAP_SHARED)) + return addr; + + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + isize = i_size_read(inode); + + ret = -EINVAL; + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= maxpages) + goto out; + + if (maxpages - pgoff < lpages) + goto out; + + /* gang-find the pages */ + ret = -ENOMEM; + pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_free; + + nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + if (nr != lpages) + goto out_free_pages; /* leave if some pages were missing */ + + /* check the pages for physical adjacency */ + ptr = pages; + page = *ptr++; + page++; + for (loop = lpages; loop > 1; loop--) + if (*ptr++ != page++) + goto out_free_pages; + + /* okay - all conditions fulfilled */ + ret = (unsigned long) page_address(pages[0]); + +out_free_pages: + ptr = pages; + for (loop = nr; loop > 0; loop--) + put_page(*ptr++); +out_free: + kfree(pages); +out: + return ret; +} + +/*****************************************************************************/ +/* + * set up a mapping for shared memory segments + */ +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!(vma->vm_flags & VM_SHARED)) + return -ENOSYS; + + file_accessed(file); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} diff --git a/kernel/fs/ramfs/inode.c b/kernel/fs/ramfs/inode.c new file mode 100644 index 000000000..889d558b4 --- /dev/null +++ b/kernel/fs/ramfs/inode.c @@ -0,0 +1,266 @@ +/* + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * + * Usage limits added by David Gibson, Linuxcare Australia. + * This file is released under the GPL. + */ + +/* + * NOTE! This filesystem is probably most useful + * not as a real filesystem, but as an example of + * how virtual filesystems can be written. + * + * It doesn't get much simpler than this. Consider + * that this file implements the full semantics of + * a POSIX-compliant read-write filesystem. + * + * Note in particular how the filesystem does not + * need to implement any data structures of its own + * to keep track of the virtual data: using the VFS + * caches is sufficient. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define RAMFS_DEFAULT_MODE 0755 + +static const struct super_operations ramfs_ops; +static const struct inode_operations ramfs_dir_inode_operations; + +static const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + +struct inode *ramfs_get_inode(struct super_block *sb, + const struct inode *dir, umode_t mode, dev_t dev) +{ + struct inode * inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode_init_owner(inode, dir, mode); + inode->i_mapping->a_ops = &ramfs_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &ramfs_file_inode_operations; + inode->i_fop = &ramfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ramfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + break; + } + } + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. + */ +/* SMP-safe */ +static int +ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +{ + struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); + int error = -ENOSPC; + + if (inode) { + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + } + return error; +} + +static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +{ + int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0); + if (!retval) + inc_nlink(dir); + return retval; +} + +static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +{ + return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); + if (inode) { + int l = strlen(symname)+1; + error = page_symlink(inode, symname, l); + if (!error) { + d_instantiate(dentry, inode); + dget(dentry); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + } else + iput(inode); + } + return error; +} + +static const struct inode_operations ramfs_dir_inode_operations = { + .create = ramfs_create, + .lookup = simple_lookup, + .link = simple_link, + .unlink = simple_unlink, + .symlink = ramfs_symlink, + .mkdir = ramfs_mkdir, + .rmdir = simple_rmdir, + .mknod = ramfs_mknod, + .rename = simple_rename, +}; + +static const struct super_operations ramfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .show_options = generic_show_options, +}; + +struct ramfs_mount_opts { + umode_t mode; +}; + +enum { + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct ramfs_fs_info { + struct ramfs_mount_opts mount_opts; +}; + +static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + char *p; + + opts->mode = RAMFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* + * We might like to report bad mount options here; + * but traditionally ramfs has ignored all mount options, + * and as it is used as a !CONFIG_SHMEM simple substitute + * for tmpfs, better continue to ignore other mount options. + */ + } + } + + return 0; +} + +int ramfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ramfs_fs_info *fsi; + struct inode *inode; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; + if (!fsi) + return -ENOMEM; + + err = ramfs_parse_options(data, &fsi->mount_opts); + if (err) + return err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RAMFS_MAGIC; + sb->s_op = &ramfs_ops; + sb->s_time_gran = 1; + + inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + return 0; +} + +struct dentry *ramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, ramfs_fill_super); +} + +static void ramfs_kill_sb(struct super_block *sb) +{ + kfree(sb->s_fs_info); + kill_litter_super(sb); +} + +static struct file_system_type ramfs_fs_type = { + .name = "ramfs", + .mount = ramfs_mount, + .kill_sb = ramfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, +}; + +int __init init_ramfs_fs(void) +{ + static unsigned long once; + + if (test_and_set_bit(0, &once)) + return 0; + return register_filesystem(&ramfs_fs_type); +} +fs_initcall(init_ramfs_fs); diff --git a/kernel/fs/ramfs/internal.h b/kernel/fs/ramfs/internal.h new file mode 100644 index 000000000..a9d8ae88f --- /dev/null +++ b/kernel/fs/ramfs/internal.h @@ -0,0 +1,13 @@ +/* internal.h: ramfs internal definitions + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +extern const struct inode_operations ramfs_file_inode_operations; diff --git a/kernel/fs/read_write.c b/kernel/fs/read_write.c new file mode 100644 index 000000000..819ef3faf --- /dev/null +++ b/kernel/fs/read_write.c @@ -0,0 +1,1329 @@ +/* + * linux/fs/read_write.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#include +#include + +typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); +typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); + +const struct file_operations generic_ro_fops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .mmap = generic_file_readonly_mmap, + .splice_read = generic_file_splice_read, +}; + +EXPORT_SYMBOL(generic_ro_fops); + +static inline int unsigned_offsets(struct file *file) +{ + return file->f_mode & FMODE_UNSIGNED_OFFSET; +} + +/** + * vfs_setpos - update the file offset for lseek + * @file: file structure in question + * @offset: file offset to seek to + * @maxsize: maximum file size + * + * This is a low-level filesystem helper for updating the file offset to + * the value specified by @offset if the given offset is valid and it is + * not equal to the current file offset. + * + * Return the specified offset on success and -EINVAL on invalid offset. + */ +loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) +{ + if (offset < 0 && !unsigned_offsets(file)) + return -EINVAL; + if (offset > maxsize) + return -EINVAL; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + return offset; +} +EXPORT_SYMBOL(vfs_setpos); + +/** + * generic_file_llseek_size - generic llseek implementation for regular files + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @size: max size of this file in file system + * @eof: offset used for SEEK_END position + * + * This is a variant of generic_file_llseek that allows passing in a custom + * maximum file size and a custom EOF position, for e.g. hashed directories + * + * Synchronization: + * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) + * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. + * read/writes behave like SEEK_SET against seeks. + */ +loff_t +generic_file_llseek_size(struct file *file, loff_t offset, int whence, + loff_t maxsize, loff_t eof) +{ + switch (whence) { + case SEEK_END: + offset += eof; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) + return file->f_pos; + /* + * f_lock protects against read/modify/write race with other + * SEEK_CURs. Note that parallel writes and reads behave + * like SEEK_SET. + */ + spin_lock(&file->f_lock); + offset = vfs_setpos(file, file->f_pos + offset, maxsize); + spin_unlock(&file->f_lock); + return offset; + case SEEK_DATA: + /* + * In the generic case the entire file is data, so as long as + * offset isn't at the end of the file then the offset is data. + */ + if (offset >= eof) + return -ENXIO; + break; + case SEEK_HOLE: + /* + * There is a virtual hole at the end of the file, so as long as + * offset isn't i_size or larger, return i_size. + */ + if (offset >= eof) + return -ENXIO; + offset = eof; + break; + } + + return vfs_setpos(file, offset, maxsize); +} +EXPORT_SYMBOL(generic_file_llseek_size); + +/** + * generic_file_llseek - generic llseek implementation for regular files + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * + * This is a generic implemenation of ->llseek useable for all normal local + * filesystems. It just updates the file offset to the value specified by + * @offset and @whence. + */ +loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + + return generic_file_llseek_size(file, offset, whence, + inode->i_sb->s_maxbytes, + i_size_read(inode)); +} +EXPORT_SYMBOL(generic_file_llseek); + +/** + * fixed_size_llseek - llseek implementation for fixed-sized devices + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @size: size of the file + * + */ +loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) +{ + switch (whence) { + case SEEK_SET: case SEEK_CUR: case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + size, size); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(fixed_size_llseek); + +/** + * noop_llseek - No Operation Performed llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * + * This is an implementation of ->llseek useable for the rare special case when + * userspace expects the seek to succeed but the (device) file is actually not + * able to perform the seek. In this case you use noop_llseek() instead of + * falling back to the default implementation of ->llseek. + */ +loff_t noop_llseek(struct file *file, loff_t offset, int whence) +{ + return file->f_pos; +} +EXPORT_SYMBOL(noop_llseek); + +loff_t no_llseek(struct file *file, loff_t offset, int whence) +{ + return -ESPIPE; +} +EXPORT_SYMBOL(no_llseek); + +loff_t default_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t retval; + + mutex_lock(&inode->i_mutex); + switch (whence) { + case SEEK_END: + offset += i_size_read(inode); + break; + case SEEK_CUR: + if (offset == 0) { + retval = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + case SEEK_DATA: + /* + * In the generic case the entire file is data, so as + * long as offset isn't at the end of the file then the + * offset is data. + */ + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } + break; + case SEEK_HOLE: + /* + * There is a virtual hole at the end of the file, so + * as long as offset isn't i_size or larger, return + * i_size. + */ + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } + offset = inode->i_size; + break; + } + retval = -EINVAL; + if (offset >= 0 || unsigned_offsets(file)) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + retval = offset; + } +out: + mutex_unlock(&inode->i_mutex); + return retval; +} +EXPORT_SYMBOL(default_llseek); + +loff_t vfs_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t (*fn)(struct file *, loff_t, int); + + fn = no_llseek; + if (file->f_mode & FMODE_LSEEK) { + if (file->f_op->llseek) + fn = file->f_op->llseek; + } + return fn(file, offset, whence); +} +EXPORT_SYMBOL(vfs_llseek); + +static inline struct fd fdget_pos(int fd) +{ + return __to_fd(__fdget_pos(fd)); +} + +static inline void fdput_pos(struct fd f) +{ + if (f.flags & FDPUT_POS_UNLOCK) + mutex_unlock(&f.file->f_pos_lock); + fdput(f); +} + +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) +{ + off_t retval; + struct fd f = fdget_pos(fd); + if (!f.file) + return -EBADF; + + retval = -EINVAL; + if (whence <= SEEK_MAX) { + loff_t res = vfs_llseek(f.file, offset, whence); + retval = res; + if (res != (loff_t)retval) + retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ + } + fdput_pos(f); + return retval; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) +{ + return sys_lseek(fd, offset, whence); +} +#endif + +#ifdef __ARCH_WANT_SYS_LLSEEK +SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, + unsigned long, offset_low, loff_t __user *, result, + unsigned int, whence) +{ + int retval; + struct fd f = fdget_pos(fd); + loff_t offset; + + if (!f.file) + return -EBADF; + + retval = -EINVAL; + if (whence > SEEK_MAX) + goto out_putf; + + offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, + whence); + + retval = (int)offset; + if (offset >= 0) { + retval = -EFAULT; + if (!copy_to_user(result, &offset, sizeof(offset))) + retval = 0; + } +out_putf: + fdput_pos(f); + return retval; +} +#endif + +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + if (!file->f_op->read_iter) + return -EINVAL; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + + iter->type |= READ; + ret = file->f_op->read_iter(&kiocb, iter); + BUG_ON(ret == -EIOCBQUEUED); + if (ret > 0) + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(vfs_iter_read); + +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + if (!file->f_op->write_iter) + return -EINVAL; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + + iter->type |= WRITE; + ret = file->f_op->write_iter(&kiocb, iter); + BUG_ON(ret == -EIOCBQUEUED); + if (ret > 0) + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(vfs_iter_write); + +/* + * rw_verify_area doesn't like huge counts. We limit + * them to something that fits in "int" so that others + * won't have to do range checks all the time. + */ +int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) +{ + struct inode *inode; + loff_t pos; + int retval = -EINVAL; + + inode = file_inode(file); + if (unlikely((ssize_t) count < 0)) + return retval; + pos = *ppos; + if (unlikely(pos < 0)) { + if (!unsigned_offsets(file)) + return retval; + if (count >= -pos) /* both values are in 0..LLONG_MAX */ + return -EOVERFLOW; + } else if (unlikely((loff_t) (pos + count) < 0)) { + if (!unsigned_offsets(file)) + return retval; + } + + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { + retval = locks_mandatory_area( + read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, + inode, file, pos, count); + if (retval < 0) + return retval; + } + retval = security_file_permission(file, + read_write == READ ? MAY_READ : MAY_WRITE); + if (retval) + return retval; + return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; +} + +static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + iov_iter_init(&iter, READ, &iov, 1, len); + + ret = filp->f_op->read_iter(&kiocb, &iter); + BUG_ON(ret == -EIOCBQUEUED); + *ppos = kiocb.ki_pos; + return ret; +} + +ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, + loff_t *pos) +{ + if (file->f_op->read) + return file->f_op->read(file, buf, count, pos); + else if (file->f_op->read_iter) + return new_sync_read(file, buf, count, pos); + else + return -EINVAL; +} +EXPORT_SYMBOL(__vfs_read); + +ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) +{ + ssize_t ret; + + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) + return -EFAULT; + + ret = rw_verify_area(READ, file, pos, count); + if (ret >= 0) { + count = ret; + ret = __vfs_read(file, buf, count, pos); + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); + } + + return ret; +} + +EXPORT_SYMBOL(vfs_read); + +static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + iov_iter_init(&iter, WRITE, &iov, 1, len); + + ret = filp->f_op->write_iter(&kiocb, &iter); + BUG_ON(ret == -EIOCBQUEUED); + if (ret > 0) + *ppos = kiocb.ki_pos; + return ret; +} + +ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, + loff_t *pos) +{ + if (file->f_op->write) + return file->f_op->write(file, p, count, pos); + else if (file->f_op->write_iter) + return new_sync_write(file, p, count, pos); + else + return -EINVAL; +} +EXPORT_SYMBOL(__vfs_write); + +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + const char __user *p; + ssize_t ret; + + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + old_fs = get_fs(); + set_fs(get_ds()); + p = (__force const char __user *)buf; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + ret = __vfs_write(file, p, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + return ret; +} + +EXPORT_SYMBOL(__kernel_write); + +ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) +{ + ssize_t ret; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + if (unlikely(!access_ok(VERIFY_READ, buf, count))) + return -EFAULT; + + ret = rw_verify_area(WRITE, file, pos, count); + if (ret >= 0) { + count = ret; + file_start_write(file); + ret = __vfs_write(file, buf, count, pos); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + file_end_write(file); + } + + return ret; +} + +EXPORT_SYMBOL(vfs_write); + +static inline loff_t file_pos_read(struct file *file) +{ + return file->f_pos; +} + +static inline void file_pos_write(struct file *file, loff_t pos) +{ + file->f_pos = pos; +} + +SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) +{ + struct fd f = fdget_pos(fd); + ssize_t ret = -EBADF; + + if (f.file) { + loff_t pos = file_pos_read(f.file); + ret = vfs_read(f.file, buf, count, &pos); + if (ret >= 0) + file_pos_write(f.file, pos); + fdput_pos(f); + } + return ret; +} + +SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, + size_t, count) +{ + struct fd f = fdget_pos(fd); + ssize_t ret = -EBADF; + + if (f.file) { + loff_t pos = file_pos_read(f.file); + ret = vfs_write(f.file, buf, count, &pos); + if (ret >= 0) + file_pos_write(f.file, pos); + fdput_pos(f); + } + + return ret; +} + +SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, + size_t, count, loff_t, pos) +{ + struct fd f; + ssize_t ret = -EBADF; + + if (pos < 0) + return -EINVAL; + + f = fdget(fd); + if (f.file) { + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PREAD) + ret = vfs_read(f.file, buf, count, &pos); + fdput(f); + } + + return ret; +} + +SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, loff_t, pos) +{ + struct fd f; + ssize_t ret = -EBADF; + + if (pos < 0) + return -EINVAL; + + f = fdget(fd); + if (f.file) { + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PWRITE) + ret = vfs_write(f.file, buf, count, &pos); + fdput(f); + } + + return ret; +} + +/* + * Reduce an iovec's length in-place. Return the resulting number of segments + */ +unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) +{ + unsigned long seg = 0; + size_t len = 0; + + while (seg < nr_segs) { + seg++; + if (len + iov->iov_len >= to) { + iov->iov_len = to - len; + break; + } + len += iov->iov_len; + iov++; + } + return seg; +} +EXPORT_SYMBOL(iov_shorten); + +static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, + loff_t *ppos, iter_fn_t fn) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + + ret = fn(&kiocb, iter); + BUG_ON(ret == -EIOCBQUEUED); + *ppos = kiocb.ki_pos; + return ret; +} + +/* Do it by hand, with file-ops */ +static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, + loff_t *ppos, io_fn_t fn) +{ + ssize_t ret = 0; + + while (iov_iter_count(iter)) { + struct iovec iovec = iov_iter_iovec(iter); + ssize_t nr; + + nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos); + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != iovec.iov_len) + break; + iov_iter_advance(iter, nr); + } + + return ret; +} + +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) + +ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + unsigned long seg; + ssize_t ret; + struct iovec *iov = fast_pointer; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) { + ret = 0; + goto out; + } + + /* + * First get the "struct iovec" from user memory and + * verify all the pointers + */ + if (nr_segs > UIO_MAXIOV) { + ret = -EINVAL; + goto out; + } + if (nr_segs > fast_segs) { + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) { + ret = -ENOMEM; + goto out; + } + } + if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { + ret = -EFAULT; + goto out; + } + + /* + * According to the Single Unix Specification we should return EINVAL + * if an element length is < 0 when cast to ssize_t or if the + * total length would overflow the ssize_t return value of the + * system call. + * + * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the + * overflow case. + */ + ret = 0; + for (seg = 0; seg < nr_segs; seg++) { + void __user *buf = iov[seg].iov_base; + ssize_t len = (ssize_t)iov[seg].iov_len; + + /* see if we we're about to use an invalid len or if + * it's about to overflow ssize_t */ + if (len < 0) { + ret = -EINVAL; + goto out; + } + if (type >= 0 + && unlikely(!access_ok(vrfy_dir(type), buf, len))) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - ret) { + len = MAX_RW_COUNT - ret; + iov[seg].iov_len = len; + } + ret += len; + } +out: + *ret_pointer = iov; + return ret; +} + +static ssize_t do_readv_writev(int type, struct file *file, + const struct iovec __user * uvector, + unsigned long nr_segs, loff_t *pos) +{ + size_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; + io_fn_t fn; + iter_fn_t iter_fn; + + ret = import_iovec(type, uvector, nr_segs, + ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + if (type == READ) { + fn = file->f_op->read; + iter_fn = file->f_op->read_iter; + } else { + fn = (io_fn_t)file->f_op->write; + iter_fn = file->f_op->write_iter; + file_start_write(file); + } + + if (iter_fn) + ret = do_iter_readv_writev(file, &iter, pos, iter_fn); + else + ret = do_loop_readv_writev(file, &iter, pos, fn); + + if (type != READ) + file_end_write(file); + +out: + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + return do_readv_writev(READ, file, vec, vlen, pos); +} + +EXPORT_SYMBOL(vfs_readv); + +ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + return do_readv_writev(WRITE, file, vec, vlen, pos); +} + +EXPORT_SYMBOL(vfs_writev); + +SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen) +{ + struct fd f = fdget_pos(fd); + ssize_t ret = -EBADF; + + if (f.file) { + loff_t pos = file_pos_read(f.file); + ret = vfs_readv(f.file, vec, vlen, &pos); + if (ret >= 0) + file_pos_write(f.file, pos); + fdput_pos(f); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen) +{ + struct fd f = fdget_pos(fd); + ssize_t ret = -EBADF; + + if (f.file) { + loff_t pos = file_pos_read(f.file); + ret = vfs_writev(f.file, vec, vlen, &pos); + if (ret >= 0) + file_pos_write(f.file, pos); + fdput_pos(f); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) +{ +#define HALF_LONG_BITS (BITS_PER_LONG / 2) + return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; +} + +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + struct fd f; + ssize_t ret = -EBADF; + + if (pos < 0) + return -EINVAL; + + f = fdget(fd); + if (f.file) { + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PREAD) + ret = vfs_readv(f.file, vec, vlen, &pos); + fdput(f); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + struct fd f; + ssize_t ret = -EBADF; + + if (pos < 0) + return -EINVAL; + + f = fdget(fd); + if (f.file) { + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PWRITE) + ret = vfs_writev(f.file, vec, vlen, &pos); + fdput(f); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +#ifdef CONFIG_COMPAT + +static ssize_t compat_do_readv_writev(int type, struct file *file, + const struct compat_iovec __user *uvector, + unsigned long nr_segs, loff_t *pos) +{ + compat_ssize_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; + io_fn_t fn; + iter_fn_t iter_fn; + + ret = compat_import_iovec(type, uvector, nr_segs, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + if (type == READ) { + fn = file->f_op->read; + iter_fn = file->f_op->read_iter; + } else { + fn = (io_fn_t)file->f_op->write; + iter_fn = file->f_op->write_iter; + file_start_write(file); + } + + if (iter_fn) + ret = do_iter_readv_writev(file, &iter, pos, iter_fn); + else + ret = do_loop_readv_writev(file, &iter, pos, fn); + + if (type != READ) + file_end_write(file); + +out: + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_READ)) + goto out; + + ret = -EINVAL; + if (!(file->f_mode & FMODE_CAN_READ)) + goto out; + + ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + +out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen) +{ + struct fd f = fdget_pos(fd); + ssize_t ret; + loff_t pos; + + if (!f.file) + return -EBADF; + pos = f.file->f_pos; + ret = compat_readv(f.file, vec, vlen, &pos); + if (ret >= 0) + f.file->f_pos = pos; + fdput_pos(f); + return ret; +} + +static long __compat_sys_preadv64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) +{ + struct fd f; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PREAD) + ret = compat_readv(f.file, vec, vlen, &pos); + fdput(f); + return ret; +} + +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 +COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + return __compat_sys_preadv64(fd, vec, vlen, pos); +} +#endif + +COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + + return __compat_sys_preadv64(fd, vec, vlen, pos); +} + +static size_t compat_writev(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_WRITE)) + goto out; + + ret = -EINVAL; + if (!(file->f_mode & FMODE_CAN_WRITE)) + goto out; + + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + +out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, + const struct compat_iovec __user *, vec, + compat_ulong_t, vlen) +{ + struct fd f = fdget_pos(fd); + ssize_t ret; + loff_t pos; + + if (!f.file) + return -EBADF; + pos = f.file->f_pos; + ret = compat_writev(f.file, vec, vlen, &pos); + if (ret >= 0) + f.file->f_pos = pos; + fdput_pos(f); + return ret; +} + +static long __compat_sys_pwritev64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) +{ + struct fd f; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PWRITE) + ret = compat_writev(f.file, vec, vlen, &pos); + fdput(f); + return ret; +} + +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 +COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + return __compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif + +COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + + return __compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif + +static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, + size_t count, loff_t max) +{ + struct fd in, out; + struct inode *in_inode, *out_inode; + loff_t pos; + loff_t out_pos; + ssize_t retval; + int fl; + + /* + * Get input file, and verify that it is ok.. + */ + retval = -EBADF; + in = fdget(in_fd); + if (!in.file) + goto out; + if (!(in.file->f_mode & FMODE_READ)) + goto fput_in; + retval = -ESPIPE; + if (!ppos) { + pos = in.file->f_pos; + } else { + pos = *ppos; + if (!(in.file->f_mode & FMODE_PREAD)) + goto fput_in; + } + retval = rw_verify_area(READ, in.file, &pos, count); + if (retval < 0) + goto fput_in; + count = retval; + + /* + * Get output file, and verify that it is ok.. + */ + retval = -EBADF; + out = fdget(out_fd); + if (!out.file) + goto fput_in; + if (!(out.file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + in_inode = file_inode(in.file); + out_inode = file_inode(out.file); + out_pos = out.file->f_pos; + retval = rw_verify_area(WRITE, out.file, &out_pos, count); + if (retval < 0) + goto fput_out; + count = retval; + + if (!max) + max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); + + if (unlikely(pos + count > max)) { + retval = -EOVERFLOW; + if (pos >= max) + goto fput_out; + count = max - pos; + } + + fl = 0; +#if 0 + /* + * We need to debate whether we can enable this or not. The + * man page documents EAGAIN return for the output at least, + * and the application is arguably buggy if it doesn't expect + * EAGAIN on a non-blocking file descriptor. + */ + if (in.file->f_flags & O_NONBLOCK) + fl = SPLICE_F_NONBLOCK; +#endif + file_start_write(out.file); + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); + file_end_write(out.file); + + if (retval > 0) { + add_rchar(current, retval); + add_wchar(current, retval); + fsnotify_access(in.file); + fsnotify_modify(out.file); + out.file->f_pos = out_pos; + if (ppos) + *ppos = pos; + else + in.file->f_pos = pos; + } + + inc_syscr(current); + inc_syscw(current); + if (pos > max) + retval = -EOVERFLOW; + +fput_out: + fdput(out); +fput_in: + fdput(in); +out: + return retval; +} + +SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) +{ + loff_t pos; + ssize_t ret; + + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ret = do_sendfile(out_fd, in_fd, &pos, count, 0); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, + compat_off_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, + compat_loff_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + ssize_t ret; + + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ret = do_sendfile(out_fd, in_fd, &pos, count, 0); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} +#endif diff --git a/kernel/fs/readdir.c b/kernel/fs/readdir.c new file mode 100644 index 000000000..ced679179 --- /dev/null +++ b/kernel/fs/readdir.c @@ -0,0 +1,309 @@ +/* + * linux/fs/readdir.c + * + * Copyright (C) 1995 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +int iterate_dir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + int res = -ENOTDIR; + if (!file->f_op->iterate) + goto out; + + res = security_file_permission(file, MAY_READ); + if (res) + goto out; + + res = mutex_lock_killable(&inode->i_mutex); + if (res) + goto out; + + res = -ENOENT; + if (!IS_DEADDIR(inode)) { + ctx->pos = file->f_pos; + res = file->f_op->iterate(file, ctx); + file->f_pos = ctx->pos; + fsnotify_access(file); + file_accessed(file); + } + mutex_unlock(&inode->i_mutex); +out: + return res; +} +EXPORT_SYMBOL(iterate_dir); + +/* + * Traditional linux readdir() handling.. + * + * "count=1" is a special case, meaning that the buffer is one + * dirent-structure in size and that the code can't handle more + * anyway. Thus the special "fillonedir()" function for that + * case (the low-level handlers don't need to care about this). + */ + +#ifdef __ARCH_WANT_OLD_READDIR + +struct old_linux_dirent { + unsigned long d_ino; + unsigned long d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct readdir_callback { + struct dir_context ctx; + struct old_linux_dirent __user * dirent; + int result; +}; + +static int fillonedir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_callback *buf = + container_of(ctx, struct readdir_callback, ctx); + struct old_linux_dirent __user * dirent; + unsigned long d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct old_linux_dirent __user *, dirent, unsigned int, count) +{ + int error; + struct fd f = fdget(fd); + struct readdir_callback buf = { + .ctx.actor = fillonedir, + .dirent = dirent + }; + + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (buf.result) + error = buf.result; + + fdput(f); + return error; +} + +#endif /* __ARCH_WANT_OLD_READDIR */ + +/* + * New, all-improved, singing, dancing, iBCS2-compliant getdents() + * interface. + */ +struct linux_dirent { + unsigned long d_ino; + unsigned long d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct getdents_callback { + struct dir_context ctx; + struct linux_dirent __user * current_dir; + struct linux_dirent __user * previous; + int count; + int error; +}; + +static int filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct linux_dirent __user * dirent; + struct getdents_callback *buf = + container_of(ctx, struct getdents_callback, ctx); + unsigned long d_ino; + int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, + sizeof(long)); + + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct linux_dirent __user *, dirent, unsigned int, count) +{ + struct fd f; + struct linux_dirent __user * lastdirent; + struct getdents_callback buf = { + .ctx.actor = filldir, + .count = count, + .current_dir = dirent + }; + int error; + + if (!access_ok(VERIFY_WRITE, dirent, count)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(buf.ctx.pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fdput(f); + return error; +} + +struct getdents_callback64 { + struct dir_context ctx; + struct linux_dirent64 __user * current_dir; + struct linux_dirent64 __user * previous; + int count; + int error; +}; + +static int filldir64(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct linux_dirent64 __user *dirent; + struct getdents_callback64 *buf = + container_of(ctx, struct getdents_callback64, ctx); + int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, + sizeof(u64)); + + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(ino, &dirent->d_ino)) + goto efault; + if (__put_user(0, &dirent->d_off)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (__put_user(d_type, &dirent->d_type)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) +{ + struct fd f; + struct linux_dirent64 __user * lastdirent; + struct getdents_callback64 buf = { + .ctx.actor = filldir64, + .count = count, + .current_dir = dirent + }; + int error; + + if (!access_ok(VERIFY_WRITE, dirent, count)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + typeof(lastdirent->d_off) d_off = buf.ctx.pos; + if (__put_user(d_off, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fdput(f); + return error; +} diff --git a/kernel/fs/reiserfs/Kconfig b/kernel/fs/reiserfs/Kconfig new file mode 100644 index 000000000..7cd46666b --- /dev/null +++ b/kernel/fs/reiserfs/Kconfig @@ -0,0 +1,88 @@ +config REISERFS_FS + tristate "Reiserfs support" + select CRC32 + help + Stores not just filenames but the files themselves in a balanced + tree. Uses journalling. + + Balanced trees are more efficient than traditional file system + architectural foundations. + + In general, ReiserFS is as fast as ext2, but is very efficient with + large directories and small files. Additional patches are needed + for NFS and quotas, please see + for links. + + It is more easily extended to have features currently found in + database and keyword search systems than block allocation based file + systems are. The next version will be so extended, and will support + plugins consistent with our motto ``It takes more than a license to + make source code open.'' + + Read + to learn more about reiserfs. + + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. + + If you like it, you can pay us to add new features to it that you + need, buy a support contract, or pay us to port it to another OS. + +config REISERFS_CHECK + bool "Enable reiserfs debug mode" + depends on REISERFS_FS + help + If you set this to Y, then ReiserFS will perform every check it can + possibly imagine of its internal consistency throughout its + operation. It will also go substantially slower. More than once we + have forgotten that this was on, and then gone despondent over the + latest benchmarks.:-) Use of this option allows our team to go all + out in checking for consistency when debugging without fear of its + effect on end users. If you are on the verge of sending in a bug + report, say Y and you might get a useful error message. Almost + everyone should say N. + +config REISERFS_PROC_INFO + bool "Stats in /proc/fs/reiserfs" + depends on REISERFS_FS && PROC_FS + help + Create under /proc/fs/reiserfs a hierarchy of files, displaying + various ReiserFS statistics and internal data at the expense of + making your kernel or module slightly larger (+8 KB). This also + increases the amount of kernel memory required for each mount. + Almost everyone but ReiserFS developers and people fine-tuning + reiserfs or tracing problems should say N. + +config REISERFS_FS_XATTR + bool "ReiserFS extended attributes" + depends on REISERFS_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config REISERFS_FS_POSIX_ACL + bool "ReiserFS POSIX Access Control Lists" + depends on REISERFS_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config REISERFS_FS_SECURITY + bool "ReiserFS Security Labels" + depends on REISERFS_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ReiserFS filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/kernel/fs/reiserfs/Makefile b/kernel/fs/reiserfs/Makefile new file mode 100644 index 000000000..3c3b00165 --- /dev/null +++ b/kernel/fs/reiserfs/Makefile @@ -0,0 +1,38 @@ +# +# Makefile for the linux reiser-filesystem routines. +# + +obj-$(CONFIG_REISERFS_FS) += reiserfs.o + +reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ + super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ + hashes.o tail_conversion.o journal.o resize.o \ + item_ops.o ioctl.o xattr.o lock.o + +ifeq ($(CONFIG_REISERFS_PROC_INFO),y) +reiserfs-objs += procfs.o +endif + +ifeq ($(CONFIG_REISERFS_FS_XATTR),y) +reiserfs-objs += xattr_user.o xattr_trusted.o +endif + +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) +reiserfs-objs += xattr_security.o +endif + +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) +reiserfs-objs += xattr_acl.o +endif + +# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline +# functions are used. This causes the compiler to advance the stack +# pointer out of the available stack space, corrupting kernel space, +# and causing a panic. Since this behavior only affects ppc32, this ifeq +# will work around it. If any other architecture displays this behavior, +# add it here. +ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1) + +TAGS: + etags *.c + diff --git a/kernel/fs/reiserfs/README b/kernel/fs/reiserfs/README new file mode 100644 index 000000000..e2f7a264e --- /dev/null +++ b/kernel/fs/reiserfs/README @@ -0,0 +1,161 @@ +[LICENSING] + +ReiserFS is hereby licensed under the GNU General +Public License version 2. + +Source code files that contain the phrase "licensing governed by +reiserfs/README" are "governed files" throughout this file. Governed +files are licensed under the GPL. The portions of them owned by Hans +Reiser, or authorized to be licensed by him, have been in the past, +and likely will be in the future, licensed to other parties under +other licenses. If you add your code to governed files, and don't +want it to be owned by Hans Reiser, put your copyright label on that +code so the poor blight and his customers can keep things straight. +All portions of governed files not labeled otherwise are owned by Hans +Reiser, and by adding your code to it, widely distributing it to +others or sending us a patch, and leaving the sentence in stating that +licensing is governed by the statement in this file, you accept this. +It will be a kindness if you identify whether Hans Reiser is allowed +to license code labeled as owned by you on your behalf other than +under the GPL, because he wants to know if it is okay to do so and put +a check in the mail to you (for non-trivial improvements) when he +makes his next sale. He makes no guarantees as to the amount if any, +though he feels motivated to motivate contributors, and you can surely +discuss this with him before or after contributing. You have the +right to decline to allow him to license your code contribution other +than under the GPL. + +Further licensing options are available for commercial and/or other +interests directly from Hans Reiser: hans@reiser.to. If you interpret +the GPL as not allowing those additional licensing options, you read +it wrongly, and Richard Stallman agrees with me, when carefully read +you can see that those restrictions on additional terms do not apply +to the owner of the copyright, and my interpretation of this shall +govern for this license. + +Finally, nothing in this license shall be interpreted to allow you to +fail to fairly credit me, or to remove my credits, without my +permission, unless you are an end user not redistributing to others. +If you have doubts about how to properly do that, or about what is +fair, ask. (Last I spoke with him Richard was contemplating how best +to address the fair crediting issue in the next GPL version.) + +[END LICENSING] + +Reiserfs is a file system based on balanced tree algorithms, which is +described at https://reiser4.wiki.kernel.org/index.php/Main_Page + +Stop reading here. Go there, then return. + +Send bug reports to yura@namesys.botik.ru. + +mkreiserfs and other utilities are in reiserfs/utils, or wherever your +Linux provider put them. There is some disagreement about how useful +it is for users to get their fsck and mkreiserfs out of sync with the +version of reiserfs that is in their kernel, with many important +distributors wanting them out of sync.:-) Please try to remember to +recompile and reinstall fsck and mkreiserfs with every update of +reiserfs, this is a common source of confusion. Note that some of the +utilities cannot be compiled without accessing the balancing code +which is in the kernel code, and relocating the utilities may require +you to specify where that code can be found. + +Yes, if you update your reiserfs kernel module you do have to +recompile your kernel, most of the time. The errors you get will be +quite cryptic if your forget to do so. + +Real users, as opposed to folks who want to hack and then understand +what went wrong, will want REISERFS_CHECK off. + +Hideous Commercial Pitch: Spread your development costs across other OS +vendors. Select from the best in the world, not the best in your +building, by buying from third party OS component suppliers. Leverage +the software component development power of the internet. Be the most +aggressive in taking advantage of the commercial possibilities of +decentralized internet development, and add value through your branded +integration that you sell as an operating system. Let your competitors +be the ones to compete against the entire internet by themselves. Be +hip, get with the new economic trend, before your competitors do. Send +email to hans@reiser.to. + +To understand the code, after reading the website, start reading the +code by reading reiserfs_fs.h first. + +Hans Reiser was the project initiator, primary architect, source of all +funding for the first 5.5 years, and one of the programmers. He owns +the copyright. + +Vladimir Saveljev was one of the programmers, and he worked long hours +writing the cleanest code. He always made the effort to be the best he +could be, and to make his code the best that it could be. What resulted +was quite remarkable. I don't think that money can ever motivate someone +to work the way he did, he is one of the most selfless men I know. + +Yura helps with benchmarking, coding hashes, and block pre-allocation +code. + +Anatoly Pinchuk is a former member of our team who worked closely with +Vladimir throughout the project's development. He wrote a quite +substantial portion of the total code. He realized that there was a +space problem with packing tails of files for files larger than a node +that start on a node aligned boundary (there are reasons to want to node +align files), and he invented and implemented indirect items and +unformatted nodes as the solution. + +Konstantin Shvachko, with the help of the Russian version of a VC, +tried to put me in a position where I was forced into giving control +of the project to him. (Fortunately, as the person paying the money +for all salaries from my dayjob I owned all copyrights, and you can't +really force takeovers of sole proprietorships.) This was something +curious, because he never really understood the value of our project, +why we should do what we do, or why innovation was possible in +general, but he was sure that he ought to be controlling it. Every +innovation had to be forced past him while he was with us. He added +two years to the time required to complete reiserfs, and was a net +loss for me. Mikhail Gilula was a brilliant innovator who also left +in a destructive way that erased the value of his contributions, and +that he was shown much generosity just makes it more painful. + +Grigory Zaigralin was an extremely effective system administrator for +our group. + +Igor Krasheninnikov was wonderful at hardware procurement, repair, and +network installation. + +Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a +textbook he got the algorithm from in the code. Note that his analysis +of how we could use the hashing code in making 32 bit NFS cookies work +was probably more important than the actual algorithm. Colin Plumb also +contributed to it. + +Chris Mason dived right into our code, and in just a few months produced +the journaling code that dramatically increased the value of ReiserFS. +He is just an amazing programmer. + +Igor Zagorovsky is writing much of the new item handler and extent code +for our next major release. + +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the +resizer, and is hard at work on implementing allocate on flush. SGI +implemented allocate on flush before us for XFS, and generously took +the time to convince me we should do it also. They are great people, +and a great company. + +Yuri Shevchuk and Nikita Danilov are doing squid cache optimization. + +Vitaly Fertman is doing fsck. + +Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably +the endian safe patches which allow ReiserFS to run on any platform +supported by the Linux kernel. + +SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the +Alpha PC Company made it possible for me to not have a day job +anymore, and to dramatically increase our staffing. Ecila funded +hypertext feature development, MP3.com funded journaling, SuSE funded +core development, IntegratedLinux.com funded squid web cache +appliances, bigstorage.com funded HSM, and the alpha PC company funded +the alpha port. Many of these tasks were helped by sponsors other +than the ones just named. SuSE has helped in much more than just +funding.... + diff --git a/kernel/fs/reiserfs/acl.h b/kernel/fs/reiserfs/acl.h new file mode 100644 index 000000000..4a211f5b3 --- /dev/null +++ b/kernel/fs/reiserfs/acl.h @@ -0,0 +1,76 @@ +#include +#include + +#define REISERFS_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} reiserfs_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} reiserfs_acl_entry_short; + +typedef struct { + __le32 a_version; +} reiserfs_acl_header; + +static inline size_t reiserfs_acl_size(int count) +{ + if (count <= 4) { + return sizeof(reiserfs_acl_header) + + count * sizeof(reiserfs_acl_entry_short); + } else { + return sizeof(reiserfs_acl_header) + + 4 * sizeof(reiserfs_acl_entry_short) + + (count - 4) * sizeof(reiserfs_acl_entry); + } +} + +static inline int reiserfs_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(reiserfs_acl_header); + s = size - 4 * sizeof(reiserfs_acl_entry_short); + if (s < 0) { + if (size % sizeof(reiserfs_acl_entry_short)) + return -1; + return size / sizeof(reiserfs_acl_entry_short); + } else { + if (s % sizeof(reiserfs_acl_entry)) + return -1; + return s / sizeof(reiserfs_acl_entry) + 4; + } +} + +#ifdef CONFIG_REISERFS_FS_POSIX_ACL +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int reiserfs_acl_chmod(struct inode *inode); +int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + struct inode *dir, struct dentry *dentry, + struct inode *inode); +int reiserfs_cache_default_acl(struct inode *dir); + +#else + +#define reiserfs_cache_default_acl(inode) 0 +#define reiserfs_get_acl NULL +#define reiserfs_set_acl NULL + +static inline int reiserfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + const struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + return 0; +} +#endif diff --git a/kernel/fs/reiserfs/bitmap.c b/kernel/fs/reiserfs/bitmap.c new file mode 100644 index 000000000..dc198bc64 --- /dev/null +++ b/kernel/fs/reiserfs/bitmap.c @@ -0,0 +1,1468 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ +/* Reiserfs block (de)allocator, bitmap-based. */ + +#include +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include +#include + +#define PREALLOCATION_SIZE 9 + +/* different reiserfs block allocator options */ + +#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits) + +#define _ALLOC_concentrating_formatted_nodes 0 +#define _ALLOC_displacing_large_files 1 +#define _ALLOC_displacing_new_packing_localities 2 +#define _ALLOC_old_hashed_relocation 3 +#define _ALLOC_new_hashed_relocation 4 +#define _ALLOC_skip_busy 5 +#define _ALLOC_displace_based_on_dirid 6 +#define _ALLOC_hashed_formatted_nodes 7 +#define _ALLOC_old_way 8 +#define _ALLOC_hundredth_slices 9 +#define _ALLOC_dirid_groups 10 +#define _ALLOC_oid_groups 11 +#define _ALLOC_packing_groups 12 + +#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s)) +#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s)) +#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s)) + +#define SET_OPTION(optname) \ + do { \ + reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \ + set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \ + } while(0) +#define TEST_OPTION(optname, s) \ + test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) + +static inline void get_bit_address(struct super_block *s, + b_blocknr_t block, + unsigned int *bmap_nr, + unsigned int *offset) +{ + /* + * It is in the bitmap block number equal to the block + * number divided by the number of bits in a block. + */ + *bmap_nr = block >> (s->s_blocksize_bits + 3); + /* Within that bitmap block it is located at bit offset *offset. */ + *offset = block & ((s->s_blocksize << 3) - 1); +} + +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) +{ + unsigned int bmap, offset; + unsigned int bmap_count = reiserfs_bmap_count(s); + + if (block == 0 || block >= SB_BLOCK_COUNT(s)) { + reiserfs_error(s, "vs-4010", + "block number is out of range %lu (%u)", + block, SB_BLOCK_COUNT(s)); + return 0; + } + + get_bit_address(s, block, &bmap, &offset); + + /* + * Old format filesystem? Unlikely, but the bitmaps are all + * up front so we need to account for it. + */ + if (unlikely(test_bit(REISERFS_OLD_FORMAT, + &REISERFS_SB(s)->s_properties))) { + b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; + if (block >= bmap1 && + block <= bmap1 + bmap_count) { + reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) " + "can't be freed or reused", + block, bmap_count); + return 0; + } + } else { + if (offset == 0) { + reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) " + "can't be freed or reused", + block, bmap_count); + return 0; + } + } + + if (bmap >= bmap_count) { + reiserfs_error(s, "vs-4030", "bitmap for requested block " + "is out of range: block=%lu, bitmap_nr=%u", + block, bmap); + return 0; + } + + if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) { + reiserfs_error(s, "vs-4050", "this is root block (%u), " + "it must be busy", SB_ROOT_BLOCK(s)); + return 0; + } + + return 1; +} + +/* + * Searches in journal structures for a given block number (bmap, off). + * If block is found in reiserfs journal it suggests next free block + * candidate to test. + */ +static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, + int off, int *next) +{ + b_blocknr_t tmp; + + if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) { + if (tmp) { /* hint supplied */ + *next = tmp; + PROC_INFO_INC(s, scan_bitmap.in_journal_hint); + } else { + (*next) = off + 1; /* inc offset to avoid looping. */ + PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); + } + PROC_INFO_INC(s, scan_bitmap.retry); + return 1; + } + return 0; +} + +/* + * Searches for a window of zero bits with given minimum and maximum + * lengths in one bitmap block + */ +static int scan_bitmap_block(struct reiserfs_transaction_handle *th, + unsigned int bmap_n, int *beg, int boundary, + int min, int max, int unfm) +{ + struct super_block *s = th->t_super; + struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n]; + struct buffer_head *bh; + int end, next; + int org = *beg; + + BUG_ON(!th->t_trans_id); + RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " + "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); + PROC_INFO_INC(s, scan_bitmap.bmap); + + if (!bi) { + reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer " + "for bitmap %d", bmap_n); + return 0; + } + + bh = reiserfs_read_bitmap_block(s, bmap_n); + if (bh == NULL) + return 0; + + while (1) { +cont: + if (bi->free_count < min) { + brelse(bh); + return 0; /* No free blocks in this bitmap */ + } + + /* search for a first zero bit -- beginning of a window */ + *beg = reiserfs_find_next_zero_le_bit + ((unsigned long *)(bh->b_data), boundary, *beg); + + /* + * search for a zero bit fails or the rest of bitmap block + * cannot contain a zero window of minimum size + */ + if (*beg + min > boundary) { + brelse(bh); + return 0; + } + + if (unfm && is_block_in_journal(s, bmap_n, *beg, beg)) + continue; + /* first zero bit found; we check next bits */ + for (end = *beg + 1;; end++) { + if (end >= *beg + max || end >= boundary + || reiserfs_test_le_bit(end, bh->b_data)) { + next = end; + break; + } + + /* + * finding the other end of zero bit window requires + * looking into journal structures (in case of + * searching for free blocks for unformatted nodes) + */ + if (unfm && is_block_in_journal(s, bmap_n, end, &next)) + break; + } + + /* + * now (*beg) points to beginning of zero bits window, + * (end) points to one bit after the window end + */ + + /* found window of proper size */ + if (end - *beg >= min) { + int i; + reiserfs_prepare_for_journal(s, bh, 1); + /* + * try to set all blocks used checking are + * they still free + */ + for (i = *beg; i < end; i++) { + /* Don't check in journal again. */ + if (reiserfs_test_and_set_le_bit + (i, bh->b_data)) { + /* + * bit was set by another process while + * we slept in prepare_for_journal() + */ + PROC_INFO_INC(s, scan_bitmap.stolen); + + /* + * we can continue with smaller set + * of allocated blocks, if length of + * this set is more or equal to `min' + */ + if (i >= *beg + min) { + end = i; + break; + } + + /* + * otherwise we clear all bit + * were set ... + */ + while (--i >= *beg) + reiserfs_clear_le_bit + (i, bh->b_data); + reiserfs_restore_prepared_buffer(s, bh); + *beg = org; + + /* + * Search again in current block + * from beginning + */ + goto cont; + } + } + bi->free_count -= (end - *beg); + journal_mark_dirty(th, bh); + brelse(bh); + + /* free block count calculation */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); + + return end - (*beg); + } else { + *beg = next; + } + } +} + +static int bmap_hash_id(struct super_block *s, u32 id) +{ + char *hash_in = NULL; + unsigned long hash; + unsigned bm; + + if (id <= 2) { + bm = 1; + } else { + hash_in = (char *)(&id); + hash = keyed_hash(hash_in, 4); + bm = hash % reiserfs_bmap_count(s); + if (!bm) + bm = 1; + } + /* this can only be true when SB_BMAP_NR = 1 */ + if (bm >= reiserfs_bmap_count(s)) + bm = 0; + return bm; +} + +/* + * hashes the id and then returns > 0 if the block group for the + * corresponding hash is full + */ +static inline int block_group_used(struct super_block *s, u32 id) +{ + int bm = bmap_hash_id(s, id); + struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm]; + + /* + * If we don't have cached information on this bitmap block, we're + * going to have to load it later anyway. Loading it here allows us + * to make a better decision. This favors long-term performance gain + * with a better on-disk layout vs. a short term gain of skipping the + * read and potentially having a bad placement. + */ + if (info->free_count == UINT_MAX) { + struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm); + brelse(bh); + } + + if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) { + return 0; + } + return 1; +} + +/* + * the packing is returned in disk byte order + */ +__le32 reiserfs_choose_packing(struct inode * dir) +{ + __le32 packing; + if (TEST_OPTION(packing_groups, dir->i_sb)) { + u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); + /* + * some versions of reiserfsck expect packing locality 1 to be + * special + */ + if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir)) + packing = INODE_PKEY(dir)->k_objectid; + else + packing = INODE_PKEY(dir)->k_dir_id; + } else + packing = INODE_PKEY(dir)->k_objectid; + return packing; +} + +/* + * Tries to find contiguous zero bit window (given size) in given region of + * bitmap and place new blocks there. Returns number of allocated blocks. + */ +static int scan_bitmap(struct reiserfs_transaction_handle *th, + b_blocknr_t * start, b_blocknr_t finish, + int min, int max, int unfm, sector_t file_block) +{ + int nr_allocated = 0; + struct super_block *s = th->t_super; + unsigned int bm, off; + unsigned int end_bm, end_off; + unsigned int off_max = s->s_blocksize << 3; + + BUG_ON(!th->t_trans_id); + PROC_INFO_INC(s, scan_bitmap.call); + + /* No point in looking for more free blocks */ + if (SB_FREE_BLOCKS(s) <= 0) + return 0; + + get_bit_address(s, *start, &bm, &off); + get_bit_address(s, finish, &end_bm, &end_off); + if (bm > reiserfs_bmap_count(s)) + return 0; + if (end_bm > reiserfs_bmap_count(s)) + end_bm = reiserfs_bmap_count(s); + + /* + * When the bitmap is more than 10% free, anyone can allocate. + * When it's less than 10% free, only files that already use the + * bitmap are allowed. Once we pass 80% full, this restriction + * is lifted. + * + * We do this so that files that grow later still have space close to + * their original allocation. This improves locality, and presumably + * performance as a result. + * + * This is only an allocation policy and does not make up for getting a + * bad hint. Decent hinting must be implemented for this to work well. + */ + if (TEST_OPTION(skip_busy, s) + && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) { + for (; bm < end_bm; bm++, off = 0) { + if ((off && (!unfm || (file_block != 0))) + || SB_AP_BITMAP(s)[bm].free_count > + (s->s_blocksize << 3) / 10) + nr_allocated = + scan_bitmap_block(th, bm, &off, off_max, + min, max, unfm); + if (nr_allocated) + goto ret; + } + /* we know from above that start is a reasonable number */ + get_bit_address(s, *start, &bm, &off); + } + + for (; bm < end_bm; bm++, off = 0) { + nr_allocated = + scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); + if (nr_allocated) + goto ret; + } + + nr_allocated = + scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); + +ret: + *start = bm * off_max + off; + return nr_allocated; + +} + +static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs; + struct buffer_head *sbh, *bmbh; + struct reiserfs_bitmap_info *apbi; + unsigned int nr, offset; + + BUG_ON(!th->t_trans_id); + PROC_INFO_INC(s, free_block); + rs = SB_DISK_SUPER_BLOCK(s); + sbh = SB_BUFFER_WITH_SB(s); + apbi = SB_AP_BITMAP(s); + + get_bit_address(s, block, &nr, &offset); + + if (nr >= reiserfs_bmap_count(s)) { + reiserfs_error(s, "vs-4075", "block %lu is out of range", + block); + return; + } + + bmbh = reiserfs_read_bitmap_block(s, nr); + if (!bmbh) + return; + + reiserfs_prepare_for_journal(s, bmbh, 1); + + /* clear bit for the given block in bit map */ + if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) { + reiserfs_error(s, "vs-4080", + "block %lu: bit already cleared", block); + } + apbi[nr].free_count++; + journal_mark_dirty(th, bmbh); + brelse(bmbh); + + reiserfs_prepare_for_journal(s, sbh, 1); + /* update super block */ + set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); + + journal_mark_dirty(th, sbh); + if (for_unformatted) { + int depth = reiserfs_write_unlock_nested(s); + dquot_free_block_nodirty(inode, 1); + reiserfs_write_lock_nested(s, depth); + } +} + +void reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block *s = th->t_super; + + BUG_ON(!th->t_trans_id); + RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); + if (!is_reusable(s, block, 1)) + return; + + if (block > sb_block_count(REISERFS_SB(s)->s_rs)) { + reiserfs_error(th->t_super, "bitmap-4072", + "Trying to free block outside file system " + "boundaries (%lu > %lu)", + block, sb_block_count(REISERFS_SB(s)->s_rs)); + return; + } + /* mark it before we clear it, just in case */ + journal_mark_freed(th, s, block); + _reiserfs_free_block(th, inode, block, for_unformatted); +} + +/* preallocated blocks don't need to be run through journal_mark_freed */ +static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block) +{ + BUG_ON(!th->t_trans_id); + RFALSE(!th->t_super, + "vs-4060: trying to free block on nonexistent device"); + if (!is_reusable(th->t_super, block, 1)) + return; + _reiserfs_free_block(th, inode, block, 1); +} + +static void __discard_prealloc(struct reiserfs_transaction_handle *th, + struct reiserfs_inode_info *ei) +{ + unsigned long save = ei->i_prealloc_block; + int dirty = 0; + struct inode *inode = &ei->vfs_inode; + + BUG_ON(!th->t_trans_id); +#ifdef CONFIG_REISERFS_CHECK + if (ei->i_prealloc_count < 0) + reiserfs_error(th->t_super, "zam-4001", + "inode has negative prealloc blocks count."); +#endif + while (ei->i_prealloc_count > 0) { + reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); + ei->i_prealloc_block++; + ei->i_prealloc_count--; + dirty = 1; + } + if (dirty) + reiserfs_update_sd(th, inode); + ei->i_prealloc_block = save; + list_del_init(&ei->i_prealloc_list); +} + +/* FIXME: It should be inline function */ +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + struct reiserfs_inode_info *ei = REISERFS_I(inode); + + BUG_ON(!th->t_trans_id); + if (ei->i_prealloc_count) + __discard_prealloc(th, ei); +} + +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) +{ + struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; + + BUG_ON(!th->t_trans_id); + while (!list_empty(plist)) { + struct reiserfs_inode_info *ei; + ei = list_entry(plist->next, struct reiserfs_inode_info, + i_prealloc_list); +#ifdef CONFIG_REISERFS_CHECK + if (!ei->i_prealloc_count) { + reiserfs_error(th->t_super, "zam-4001", + "inode is in prealloc list but has " + "no preallocated blocks."); + } +#endif + __discard_prealloc(th, ei); + } +} + +void reiserfs_init_alloc_options(struct super_block *s) +{ + set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); +} + +/* block allocator related options are parsed here */ +int reiserfs_parse_alloc_options(struct super_block *s, char *options) +{ + char *this_char, *value; + + /* clear default settings */ + REISERFS_SB(s)->s_alloc_options.bits = 0; + + while ((this_char = strsep(&options, ":")) != NULL) { + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, "concentrating_formatted_nodes")) { + int temp; + SET_OPTION(concentrating_formatted_nodes); + temp = (value + && *value) ? simple_strtoul(value, &value, + 0) : 10; + if (temp <= 0 || temp > 100) { + REISERFS_SB(s)->s_alloc_options.border = 10; + } else { + REISERFS_SB(s)->s_alloc_options.border = + 100 / temp; + } + continue; + } + if (!strcmp(this_char, "displacing_large_files")) { + SET_OPTION(displacing_large_files); + REISERFS_SB(s)->s_alloc_options.large_file_size = + (value + && *value) ? simple_strtoul(value, &value, 0) : 16; + continue; + } + if (!strcmp(this_char, "displacing_new_packing_localities")) { + SET_OPTION(displacing_new_packing_localities); + continue; + } + + if (!strcmp(this_char, "old_hashed_relocation")) { + SET_OPTION(old_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "new_hashed_relocation")) { + SET_OPTION(new_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "dirid_groups")) { + SET_OPTION(dirid_groups); + continue; + } + if (!strcmp(this_char, "oid_groups")) { + SET_OPTION(oid_groups); + continue; + } + if (!strcmp(this_char, "packing_groups")) { + SET_OPTION(packing_groups); + continue; + } + if (!strcmp(this_char, "hashed_formatted_nodes")) { + SET_OPTION(hashed_formatted_nodes); + continue; + } + + if (!strcmp(this_char, "skip_busy")) { + SET_OPTION(skip_busy); + continue; + } + + if (!strcmp(this_char, "hundredth_slices")) { + SET_OPTION(hundredth_slices); + continue; + } + + if (!strcmp(this_char, "old_way")) { + SET_OPTION(old_way); + continue; + } + + if (!strcmp(this_char, "displace_based_on_dirid")) { + SET_OPTION(displace_based_on_dirid); + continue; + } + + if (!strcmp(this_char, "preallocmin")) { + REISERFS_SB(s)->s_alloc_options.preallocmin = + (value + && *value) ? simple_strtoul(value, &value, 0) : 4; + continue; + } + + if (!strcmp(this_char, "preallocsize")) { + REISERFS_SB(s)->s_alloc_options.preallocsize = + (value + && *value) ? simple_strtoul(value, &value, + 0) : + PREALLOCATION_SIZE; + continue; + } + + reiserfs_warning(s, "zam-4001", "unknown option - %s", + this_char); + return 1; + } + + reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); + return 0; +} + +static void print_sep(struct seq_file *seq, int *first) +{ + if (!*first) + seq_puts(seq, ":"); + else + *first = 0; +} + +void show_alloc_options(struct seq_file *seq, struct super_block *s) +{ + int first = 1; + + if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) | + (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups))) + return; + + seq_puts(seq, ",alloc="); + + if (TEST_OPTION(concentrating_formatted_nodes, s)) { + print_sep(seq, &first); + if (REISERFS_SB(s)->s_alloc_options.border != 10) { + seq_printf(seq, "concentrating_formatted_nodes=%d", + 100 / REISERFS_SB(s)->s_alloc_options.border); + } else + seq_puts(seq, "concentrating_formatted_nodes"); + } + if (TEST_OPTION(displacing_large_files, s)) { + print_sep(seq, &first); + if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) { + seq_printf(seq, "displacing_large_files=%lu", + REISERFS_SB(s)->s_alloc_options.large_file_size); + } else + seq_puts(seq, "displacing_large_files"); + } + if (TEST_OPTION(displacing_new_packing_localities, s)) { + print_sep(seq, &first); + seq_puts(seq, "displacing_new_packing_localities"); + } + if (TEST_OPTION(old_hashed_relocation, s)) { + print_sep(seq, &first); + seq_puts(seq, "old_hashed_relocation"); + } + if (TEST_OPTION(new_hashed_relocation, s)) { + print_sep(seq, &first); + seq_puts(seq, "new_hashed_relocation"); + } + if (TEST_OPTION(dirid_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "dirid_groups"); + } + if (TEST_OPTION(oid_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "oid_groups"); + } + if (TEST_OPTION(packing_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "packing_groups"); + } + if (TEST_OPTION(hashed_formatted_nodes, s)) { + print_sep(seq, &first); + seq_puts(seq, "hashed_formatted_nodes"); + } + if (TEST_OPTION(skip_busy, s)) { + print_sep(seq, &first); + seq_puts(seq, "skip_busy"); + } + if (TEST_OPTION(hundredth_slices, s)) { + print_sep(seq, &first); + seq_puts(seq, "hundredth_slices"); + } + if (TEST_OPTION(old_way, s)) { + print_sep(seq, &first); + seq_puts(seq, "old_way"); + } + if (TEST_OPTION(displace_based_on_dirid, s)) { + print_sep(seq, &first); + seq_puts(seq, "displace_based_on_dirid"); + } + if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) { + print_sep(seq, &first); + seq_printf(seq, "preallocmin=%d", + REISERFS_SB(s)->s_alloc_options.preallocmin); + } + if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) { + print_sep(seq, &first); + seq_printf(seq, "preallocsize=%d", + REISERFS_SB(s)->s_alloc_options.preallocsize); + } +} + +static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) +{ + char *hash_in; + + if (hint->formatted_node) { + hash_in = (char *)&hint->key.k_dir_id; + } else { + if (!hint->inode) { + /*hint->search_start = hint->beg;*/ + hash_in = (char *)&hint->key.k_dir_id; + } else + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = + (char *)(&INODE_PKEY(hint->inode)->k_objectid); + } + + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +/* + * Relocation based on dirid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a separate policy covers them + */ +static void dirid_groups(reiserfs_blocknr_hint_t * hint) +{ + unsigned long hash; + __u32 dirid = 0; + int bm = 0; + struct super_block *sb = hint->th->t_super; + + if (hint->inode) + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + else if (hint->formatted_node) + dirid = hint->key.k_dir_id; + + if (dirid) { + bm = bmap_hash_id(sb, dirid); + hash = bm * (sb->s_blocksize << 3); + /* give a portion of the block group to metadata */ + if (hint->inode) + hash += sb->s_blocksize / 2; + hint->search_start = hash; + } +} + +/* + * Relocation based on oid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a separate policy covers them + */ +static void oid_groups(reiserfs_blocknr_hint_t * hint) +{ + if (hint->inode) { + unsigned long hash; + __u32 oid; + __u32 dirid; + int bm; + + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + + /* + * keep the root dir and it's first set of subdirs close to + * the start of the disk + */ + if (dirid <= 2) + hash = (hint->inode->i_sb->s_blocksize << 3); + else { + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); + bm = bmap_hash_id(hint->inode->i_sb, oid); + hash = bm * (hint->inode->i_sb->s_blocksize << 3); + } + hint->search_start = hash; + } +} + +/* + * returns 1 if it finds an indirect item and gets valid hint info + * from it, otherwise 0 + */ +static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) +{ + struct treepath *path; + struct buffer_head *bh; + struct item_head *ih; + int pos_in_item; + __le32 *item; + int ret = 0; + + /* + * reiserfs code can call this function w/o pointer to path + * structure supplied; then we rely on supplied search_start + */ + if (!hint->path) + return 0; + + path = hint->path; + bh = get_last_bh(path); + RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); + ih = tp_item_head(path); + pos_in_item = path->pos_in_item; + item = tp_item_body(path); + + hint->search_start = bh->b_blocknr; + + /* + * for indirect item: go to left and look for the first non-hole entry + * in the indirect item + */ + if (!hint->formatted_node && is_indirect_le_ih(ih)) { + if (pos_in_item == I_UNFM_NUM(ih)) + pos_in_item--; + while (pos_in_item >= 0) { + int t = get_block_num(item, pos_in_item); + if (t) { + hint->search_start = t; + ret = 1; + break; + } + pos_in_item--; + } + } + + /* does result value fit into specified region? */ + return ret; +} + +/* + * should be, if formatted node, then try to put on first part of the device + * specified as number of percent with mount option device, else try to put + * on last of device. This is not to say it is good code to do so, + * but the effect should be measured. + */ +static inline void set_border_in_hint(struct super_block *s, + reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border = + SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; + + if (hint->formatted_node) + hint->end = border - 1; + else + hint->beg = border; +} + +static inline void displace_large_file(reiserfs_blocknr_hint_t * hint) +{ + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), + 4) % (hint->end - hint->beg); + else + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), + 4) % (hint->end - hint->beg); +} + +static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint) +{ + char *hash_in; + + if (!hint->inode) + hash_in = (char *)&hint->key.k_dir_id; + else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); + + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +static inline int +this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t * + hint) +{ + return hint->block == + REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; +} + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES +static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + + hint->th->displace_new_blocks = 0; + hint->search_start = + hint->beg + keyed_hash((char *)(&key->k_objectid), + 4) % (hint->end - hint->beg); +} +#endif + +static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + u32 hash_in; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); + border = + hint->beg + (u32) keyed_hash(((char *)(&hash_in)), + 4) % (hint->end - hint->beg - 1); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline int old_way(reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + border = + hint->beg + + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - + hint->beg); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + b_blocknr_t slice_start; + + slice_start = + (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100); + if (slice_start > hint->search_start + || slice_start + (hint->end / 100) <= hint->search_start) { + hint->search_start = slice_start; + } +} + +static void determine_search_start(reiserfs_blocknr_hint_t * hint, + int amount_needed) +{ + struct super_block *s = hint->th->t_super; + int unfm_hint; + + hint->beg = 0; + hint->end = SB_BLOCK_COUNT(s) - 1; + + /* This is former border algorithm. Now with tunable border offset */ + if (concentrating_formatted_nodes(s)) + set_border_in_hint(s, hint); + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* + * whenever we create a new directory, we displace it. At first + * we will hash for location, later we might look for a moderately + * empty place for it + */ + if (displacing_new_packing_localities(s) + && hint->th->displace_new_blocks) { + displace_new_packing_locality(hint); + + /* + * we do not continue determine_search_start, + * if new packing locality is being displaced + */ + return; + } +#endif + + /* + * all persons should feel encouraged to add more special cases + * here and test them + */ + + if (displacing_large_files(s) && !hint->formatted_node + && this_blocknr_allocation_would_make_it_a_large_file(hint)) { + displace_large_file(hint); + return; + } + + /* + * if none of our special cases is relevant, use the left + * neighbor in the tree order of the new node we are allocating for + */ + if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { + hash_formatted_node(hint); + return; + } + + unfm_hint = get_left_neighbor(hint); + + /* + * Mimic old block allocator behaviour, that is if VFS allowed for + * preallocation, new blocks are displaced based on directory ID. + * Also, if suggested search_start is less than last preallocated + * block, we start searching from it, assuming that HDD dataflow + * is faster in forward direction + */ + if (TEST_OPTION(old_way, s)) { + if (!hint->formatted_node) { + if (!reiserfs_hashed_relocation(s)) + old_way(hint); + else if (!reiserfs_no_unhashed_relocation(s)) + old_hashed_relocation(hint); + + if (hint->inode + && hint->search_start < + REISERFS_I(hint->inode)->i_prealloc_block) + hint->search_start = + REISERFS_I(hint->inode)->i_prealloc_block; + } + return; + } + + /* This is an approach proposed by Hans */ + if (TEST_OPTION(hundredth_slices, s) + && !(displacing_large_files(s) && !hint->formatted_node)) { + hundredth_slices(hint); + return; + } + + /* old_hashed_relocation only works on unformatted */ + if (!unfm_hint && !hint->formatted_node && + TEST_OPTION(old_hashed_relocation, s)) { + old_hashed_relocation(hint); + } + + /* new_hashed_relocation works with both formatted/unformatted nodes */ + if ((!unfm_hint || hint->formatted_node) && + TEST_OPTION(new_hashed_relocation, s)) { + new_hashed_relocation(hint); + } + + /* dirid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } +#endif + + /* oid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) { + oid_groups(hint); + } + return; +} + +static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) +{ + /* make minimum size a mount option and benchmark both ways */ + /* we preallocate blocks only for regular files, specific size */ + /* benchmark preallocating always and see what happens */ + + hint->prealloc_size = 0; + + if (!hint->formatted_node && hint->preallocate) { + if (S_ISREG(hint->inode->i_mode) + && hint->inode->i_size >= + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocmin * hint->inode->i_sb->s_blocksize) + hint->prealloc_size = + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocsize - 1; + } + return CARRY_ON; +} + +static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + b_blocknr_t start, + b_blocknr_t finish, int min, + int amount_needed, + int prealloc_size) +{ + int rest = amount_needed; + int nr_allocated; + + while (rest > 0 && start <= finish) { + nr_allocated = scan_bitmap(hint->th, &start, finish, min, + rest + prealloc_size, + !hint->formatted_node, hint->block); + + if (nr_allocated == 0) /* no new blocks allocated, return */ + break; + + /* fill free_blocknrs array first */ + while (rest > 0 && nr_allocated > 0) { + *new_blocknrs++ = start++; + rest--; + nr_allocated--; + } + + /* do we have something to fill prealloc. array also ? */ + if (nr_allocated > 0) { + /* + * it means prealloc_size was greater that 0 and + * we do preallocation + */ + list_add(&REISERFS_I(hint->inode)->i_prealloc_list, + &SB_JOURNAL(hint->th->t_super)-> + j_prealloc_list); + REISERFS_I(hint->inode)->i_prealloc_block = start; + REISERFS_I(hint->inode)->i_prealloc_count = + nr_allocated; + break; + } + } + + return (amount_needed - rest); +} + +static inline int blocknrs_and_prealloc_arrays_from_search_start + (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, + int amount_needed) { + struct super_block *s = hint->th->t_super; + b_blocknr_t start = hint->search_start; + b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; + int passno = 0; + int nr_allocated = 0; + int depth; + + determine_prealloc_size(hint); + if (!hint->formatted_node) { + int quota_ret; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating %d blocks id=%u", + amount_needed, hint->inode->i_uid); +#endif + depth = reiserfs_write_unlock_nested(s); + quota_ret = + dquot_alloc_block_nodirty(hint->inode, amount_needed); + if (quota_ret) { /* Quota exceeded? */ + reiserfs_write_lock_nested(s, depth); + return QUOTA_EXCEEDED; + } + if (hint->preallocate && hint->prealloc_size) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating (prealloc) %d blocks id=%u", + hint->prealloc_size, hint->inode->i_uid); +#endif + quota_ret = dquot_prealloc_block_nodirty(hint->inode, + hint->prealloc_size); + if (quota_ret) + hint->preallocate = hint->prealloc_size = 0; + } + /* for unformatted nodes, force large allocations */ + reiserfs_write_lock_nested(s, depth); + } + + do { + switch (passno++) { + case 0: /* Search from hint->search_start to end of disk */ + start = hint->search_start; + finish = SB_BLOCK_COUNT(s) - 1; + break; + case 1: /* Search from hint->beg to hint->search_start */ + start = hint->beg; + finish = hint->search_start; + break; + case 2: /* Last chance: Search from 0 to hint->beg */ + start = 0; + finish = hint->beg; + break; + default: + /* We've tried searching everywhere, not enough space */ + /* Free the blocks */ + if (!hint->formatted_node) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (nospace) %d blocks id=%u", + amount_needed + + hint->prealloc_size - + nr_allocated, + hint->inode->i_uid); +#endif + /* Free not allocated blocks */ + depth = reiserfs_write_unlock_nested(s); + dquot_free_block_nodirty(hint->inode, + amount_needed + hint->prealloc_size - + nr_allocated); + reiserfs_write_lock_nested(s, depth); + } + while (nr_allocated--) + reiserfs_free_block(hint->th, hint->inode, + new_blocknrs[nr_allocated], + !hint->formatted_node); + + return NO_DISK_SPACE; + } + } while ((nr_allocated += allocate_without_wrapping_disk(hint, + new_blocknrs + + nr_allocated, + start, finish, + 1, + amount_needed - + nr_allocated, + hint-> + prealloc_size)) + < amount_needed); + if (!hint->formatted_node && + amount_needed + hint->prealloc_size > + nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { + /* Some of preallocation blocks were not allocated */ +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (failed prealloc) %d blocks id=%u", + amount_needed + hint->prealloc_size - + nr_allocated - + REISERFS_I(hint->inode)->i_prealloc_count, + hint->inode->i_uid); +#endif + + depth = reiserfs_write_unlock_nested(s); + dquot_free_block_nodirty(hint->inode, amount_needed + + hint->prealloc_size - nr_allocated - + REISERFS_I(hint->inode)-> + i_prealloc_count); + reiserfs_write_lock_nested(s, depth); + } + + return CARRY_ON; +} + +/* grab new blocknrs from preallocated list */ +/* return amount still needed after using them */ +static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + int amount_needed) +{ + struct inode *inode = hint->inode; + + if (REISERFS_I(inode)->i_prealloc_count > 0) { + while (amount_needed) { + + *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++; + REISERFS_I(inode)->i_prealloc_count--; + + amount_needed--; + + if (REISERFS_I(inode)->i_prealloc_count <= 0) { + list_del(&REISERFS_I(inode)->i_prealloc_list); + break; + } + } + } + /* return amount still needed after using preallocated blocks */ + return amount_needed; +} + +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, + b_blocknr_t *new_blocknrs, + int amount_needed, + /* Amount of blocks we have already reserved */ + int reserved_by_us) +{ + int initial_amount_needed = amount_needed; + int ret; + struct super_block *s = hint->th->t_super; + + /* Check if there is enough space, taking into account reserved space */ + if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < + amount_needed - reserved_by_us) + return NO_DISK_SPACE; + /* should this be if !hint->inode && hint->preallocate? */ + /* do you mean hint->formatted_node can be removed ? - Zam */ + /* + * hint->formatted_node cannot be removed because we try to access + * inode information here, and there is often no inode associated with + * metadata allocations - green + */ + + if (!hint->formatted_node && hint->preallocate) { + amount_needed = use_preallocated_list_if_available + (hint, new_blocknrs, amount_needed); + + /* + * We have all the block numbers we need from the + * prealloc list + */ + if (amount_needed == 0) + return CARRY_ON; + new_blocknrs += (initial_amount_needed - amount_needed); + } + + /* find search start and save it in hint structure */ + determine_search_start(hint, amount_needed); + if (hint->search_start >= SB_BLOCK_COUNT(s)) + hint->search_start = SB_BLOCK_COUNT(s) - 1; + + /* allocation itself; fill new_blocknrs and preallocation arrays */ + ret = blocknrs_and_prealloc_arrays_from_search_start + (hint, new_blocknrs, amount_needed); + + /* + * We used prealloc. list to fill (partially) new_blocknrs array. + * If final allocation fails we need to return blocks back to + * prealloc. list or just free them. -- Zam (I chose second + * variant) + */ + if (ret != CARRY_ON) { + while (amount_needed++ < initial_amount_needed) { + reiserfs_free_block(hint->th, hint->inode, + *(--new_blocknrs), 1); + } + } + return ret; +} + +void reiserfs_cache_bitmap_metadata(struct super_block *sb, + struct buffer_head *bh, + struct reiserfs_bitmap_info *info) +{ + unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size); + + /* The first bit must ALWAYS be 1 */ + if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data)) + reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is " + "corrupted: first bit must be 1", bh->b_blocknr); + + info->free_count = 0; + + while (--cur >= (unsigned long *)bh->b_data) { + /* 0 and ~0 are special, we can optimize for them */ + if (*cur == 0) + info->free_count += BITS_PER_LONG; + else if (*cur != ~0L) /* A mix, investigate */ + info->free_count += BITS_PER_LONG - hweight_long(*cur); + } +} + +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, + unsigned int bitmap) +{ + b_blocknr_t block = (sb->s_blocksize << 3) * bitmap; + struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap; + struct buffer_head *bh; + + /* + * Way old format filesystems had the bitmaps packed up front. + * I doubt there are any of these left, but just in case... + */ + if (unlikely(test_bit(REISERFS_OLD_FORMAT, + &REISERFS_SB(sb)->s_properties))) + block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap; + else if (bitmap == 0) + block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; + + bh = sb_bread(sb, block); + if (bh == NULL) + reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " + "reading failed", __func__, block); + else { + if (buffer_locked(bh)) { + int depth; + PROC_INFO_INC(sb, scan_bitmap.wait); + depth = reiserfs_write_unlock_nested(sb); + __wait_on_buffer(bh); + reiserfs_write_lock_nested(sb, depth); + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(atomic_read(&bh->b_count) == 0); + + if (info->free_count == UINT_MAX) + reiserfs_cache_bitmap_metadata(sb, bh, info); + } + + return bh; +} + +int reiserfs_init_bitmap_cache(struct super_block *sb) +{ + struct reiserfs_bitmap_info *bitmap; + unsigned int bmap_nr = reiserfs_bmap_count(sb); + + bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); + if (bitmap == NULL) + return -ENOMEM; + + memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr); + + SB_AP_BITMAP(sb) = bitmap; + + return 0; +} + +void reiserfs_free_bitmap_cache(struct super_block *sb) +{ + if (SB_AP_BITMAP(sb)) { + vfree(SB_AP_BITMAP(sb)); + SB_AP_BITMAP(sb) = NULL; + } +} diff --git a/kernel/fs/reiserfs/dir.c b/kernel/fs/reiserfs/dir.c new file mode 100644 index 000000000..4a024e2ce --- /dev/null +++ b/kernel/fs/reiserfs/dir.c @@ -0,0 +1,346 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include +#include +#include +#include + +extern const struct reiserfs_key MIN_KEY; + +static int reiserfs_readdir(struct file *, struct dir_context *); +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync); + +const struct file_operations reiserfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = reiserfs_readdir, + .fsync = reiserfs_dir_fsync, + .unlocked_ioctl = reiserfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = reiserfs_compat_ioctl, +#endif +}; + +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + reiserfs_write_lock(inode->i_sb); + err = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&inode->i_mutex); + if (err < 0) + return err; + return 0; +} + +#define store_ih(where,what) copy_item_head (where, what) + +static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) +{ + struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; + return (d_really_is_positive(privroot) && + deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid); +} + +int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) +{ + + /* key of current position in the directory (key of directory entry) */ + struct cpu_key pos_key; + + INITIALIZE_PATH(path_to_entry); + struct buffer_head *bh; + int item_num, entry_num; + const struct reiserfs_key *rkey; + struct item_head *ih, tmp_ih; + int search_res; + char *local_buf; + loff_t next_pos; + char small_buf[32]; /* avoid kmalloc if we can */ + struct reiserfs_dir_entry de; + int ret = 0; + int depth; + + reiserfs_write_lock(inode->i_sb); + + reiserfs_check_lock_depth(inode->i_sb, "readdir"); + + /* + * form key for search the next directory entry using + * f_pos field of file structure + */ + make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); + next_pos = cpu_key_k_offset(&pos_key); + + path_to_entry.reada = PATH_READA; + while (1) { +research: + /* + * search the directory item, containing entry with + * specified key + */ + search_res = + search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, + &de); + if (search_res == IO_ERROR) { + /* + * FIXME: we could just skip part of directory + * which could not be read + */ + ret = -EIO; + goto out; + } + entry_num = de.de_entry_num; + bh = de.de_bh; + item_num = de.de_item_num; + ih = de.de_ih; + store_ih(&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + RFALSE(item_num > B_NR_ITEMS(bh) - 1, + "vs-9005 item_num == %d, item amount == %d", + item_num, B_NR_ITEMS(bh)); + + /* + * and entry must be not more than number of entries + * in the item + */ + RFALSE(ih_entry_count(ih) < entry_num, + "vs-9010: entry number is too big %d (%d)", + entry_num, ih_entry_count(ih)); + + /* + * go through all entries in the directory item beginning + * from the entry, that has been found + */ + if (search_res == POSITION_FOUND + || entry_num < ih_entry_count(ih)) { + struct reiserfs_de_head *deh = + B_I_DEH(bh, ih) + entry_num; + + for (; entry_num < ih_entry_count(ih); + entry_num++, deh++) { + int d_reclen; + char *d_name; + ino_t d_ino; + loff_t cur_pos = deh_offset(deh); + + /* it is hidden entry */ + if (!de_visible(deh)) + continue; + d_reclen = entry_length(bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); + + if (d_reclen <= 0 || + d_name + d_reclen > bh->b_data + bh->b_size) { + /* + * There is corrupted data in entry, + * We'd better stop here + */ + pathrelse(&path_to_entry); + ret = -EIO; + goto out; + } + + if (!d_name[d_reclen - 1]) + d_reclen = strlen(d_name); + + /* too big to send back to VFS */ + if (d_reclen > + REISERFS_MAX_NAME(inode->i_sb-> + s_blocksize)) { + continue; + } + + /* Ignore the .reiserfs_priv entry */ + if (is_privroot_deh(inode, deh)) + continue; + + ctx->pos = deh_offset(deh); + d_ino = deh_objectid(deh); + if (d_reclen <= 32) { + local_buf = small_buf; + } else { + local_buf = kmalloc(d_reclen, + GFP_NOFS); + if (!local_buf) { + pathrelse(&path_to_entry); + ret = -ENOMEM; + goto out; + } + if (item_moved(&tmp_ih, &path_to_entry)) { + kfree(local_buf); + goto research; + } + } + + /* + * Note, that we copy name to user space via + * temporary buffer (local_buf) because + * filldir will block if user space buffer is + * swapped out. At that time entry can move to + * somewhere else + */ + memcpy(local_buf, d_name, d_reclen); + + /* + * Since filldir might sleep, we can release + * the write lock here for other waiters + */ + depth = reiserfs_write_unlock_nested(inode->i_sb); + if (!dir_emit + (ctx, local_buf, d_reclen, d_ino, + DT_UNKNOWN)) { + reiserfs_write_lock_nested(inode->i_sb, depth); + if (local_buf != small_buf) { + kfree(local_buf); + } + goto end; + } + reiserfs_write_lock_nested(inode->i_sb, depth); + if (local_buf != small_buf) { + kfree(local_buf); + } + + /* deh_offset(deh) may be invalid now. */ + next_pos = cur_pos + 1; + + if (item_moved(&tmp_ih, &path_to_entry)) { + set_cpu_key_k_offset(&pos_key, + next_pos); + goto research; + } + } /* for */ + } + + /* end of directory has been reached */ + if (item_num != B_NR_ITEMS(bh) - 1) + goto end; + + /* + * item we went through is last item of node. Using right + * delimiting key check is it directory end + */ + rkey = get_rkey(&path_to_entry, inode->i_sb); + if (!comp_le_keys(rkey, &MIN_KEY)) { + /* + * set pos_key to key, that is the smallest and greater + * that key of the last entry in the item + */ + set_cpu_key_k_offset(&pos_key, next_pos); + continue; + } + + /* end of directory has been reached */ + if (COMP_SHORT_KEYS(rkey, &pos_key)) { + goto end; + } + + /* directory continues in the right neighboring block */ + set_cpu_key_k_offset(&pos_key, + le_key_k_offset(KEY_FORMAT_3_5, rkey)); + + } /* while */ + +end: + ctx->pos = next_pos; + pathrelse(&path_to_entry); + reiserfs_check_path(&path_to_entry); +out: + reiserfs_write_unlock(inode->i_sb); + return ret; +} + +static int reiserfs_readdir(struct file *file, struct dir_context *ctx) +{ + return reiserfs_readdir_inode(file_inode(file), ctx); +} + +/* + * compose directory item containing "." and ".." entries (entries are + * not aligned to 4 byte boundary) + */ +void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) +{ + struct reiserfs_de_head *dot, *dotdot; + + memset(body, 0, EMPTY_DIR_SIZE_V1); + dot = (struct reiserfs_de_head *)body; + dotdot = dot + 1; + + /* direntry header of "." */ + put_deh_offset(dot, DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + dot->deh_dir_id = dirid; + dot->deh_objectid = objid; + dot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen(".")); + mark_de_visible(dot); + + /* direntry header of ".." */ + put_deh_offset(dotdot, DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + dotdot->deh_dir_id = par_dirid; + dotdot->deh_objectid = par_objid; + dotdot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dotdot, deh_location(dot) - strlen("..")); + mark_de_visible(dotdot); + + /* copy ".." and "." */ + memcpy(body + deh_location(dot), ".", 1); + memcpy(body + deh_location(dotdot), "..", 2); +} + +/* compose directory item containing "." and ".." entries */ +void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) +{ + struct reiserfs_de_head *dot, *dotdot; + + memset(body, 0, EMPTY_DIR_SIZE); + dot = (struct reiserfs_de_head *)body; + dotdot = dot + 1; + + /* direntry header of "." */ + put_deh_offset(dot, DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + dot->deh_dir_id = dirid; + dot->deh_objectid = objid; + dot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); + mark_de_visible(dot); + + /* direntry header of ".." */ + put_deh_offset(dotdot, DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + dotdot->deh_dir_id = par_dirid; + dotdot->deh_objectid = par_objid; + dotdot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen(".."))); + mark_de_visible(dotdot); + + /* copy ".." and "." */ + memcpy(body + deh_location(dot), ".", 1); + memcpy(body + deh_location(dotdot), "..", 2); +} diff --git a/kernel/fs/reiserfs/do_balan.c b/kernel/fs/reiserfs/do_balan.c new file mode 100644 index 000000000..9c02d96d3 --- /dev/null +++ b/kernel/fs/reiserfs/do_balan.c @@ -0,0 +1,1911 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Now we have all buffers that must be used in balancing of the tree + * Further calculations can not cause schedule(), and thus the buffer + * tree will be stable until the balancing will be finished + * balance the tree according to the analysis made before, + * and using buffers obtained after all above. + */ + +#include +#include +#include "reiserfs.h" +#include +#include + +static inline void buffer_info_init_left(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = tb->L[0]; + bi->bi_parent = tb->FL[0]; + bi->bi_position = get_left_neighbor_position(tb, 0); +} + +static inline void buffer_info_init_right(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = tb->R[0]; + bi->bi_parent = tb->FR[0]; + bi->bi_position = get_right_neighbor_position(tb, 0); +} + +static inline void buffer_info_init_tbS0(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + bi->bi_position = PATH_H_POSITION(tb->tb_path, 1); +} + +static inline void buffer_info_init_bh(struct tree_balance *tb, + struct buffer_info *bi, + struct buffer_head *bh) +{ + bi->tb = tb; + bi->bi_bh = bh; + bi->bi_parent = NULL; + bi->bi_position = 0; +} + +inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, + struct buffer_head *bh, int flag) +{ + journal_mark_dirty(tb->transaction_handle, bh); +} + +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + +/* + * summary: + * if deleting something ( tb->insert_size[0] < 0 ) + * return(balance_leaf_when_delete()); (flag d handled here) + * else + * if lnum is larger than 0 we put items into the left node + * if rnum is larger than 0 we put items into the right node + * if snum1 is larger than 0 we put items into the new node s1 + * if snum2 is larger than 0 we put items into the new node s2 + * Note that all *num* count new items being created. + */ + +static void balance_leaf_when_delete_del(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct buffer_info bi; +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih = item_head(tbS0, item_pos); +#endif + + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], + "vs-12013: mode Delete, insert size %d, ih to be deleted %h", + -tb->insert_size[0], ih); + + buffer_info_init_tbS0(tb, &bi); + leaf_delete_items(&bi, 0, item_pos, 1, -1); + + if (!item_pos && tb->CFL[0]) { + if (B_NR_ITEMS(tbS0)) { + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + } else { + if (!PATH_H_POSITION(tb->tb_path, 1)) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + } + } + + RFALSE(!item_pos && !tb->CFL[0], + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], + tb->L[0]); +} + +/* cut item in S[0] */ +static void balance_leaf_when_delete_cut(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct item_head *ih = item_head(tbS0, item_pos); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + buffer_info_init_tbS0(tb, &bi); + + if (is_direntry_le_ih(ih)) { + /* + * UFS unlink semantics are such that you can only + * delete one directory entry at a time. + * + * when we cut a directory tb->insert_size[0] means + * number of entries to be cut (always 1) + */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], + "PAP-12030: can not change delimiting key. CFL[0]=%p", + tb->CFL[0]); + + if (!item_pos && !pos_in_item && tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + } else { + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!ih_item_len(ih), + "PAP-12035: cut must leave non-zero dynamic " + "length of item"); + } +} + +static int balance_leaf_when_delete_left(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* L[0] must be joined with S[0] */ + if (tb->lnum[0] == -1) { + /* R[0] must be also joined with S[0] */ + if (tb->rnum[0] == -1) { + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { + /* + * all contents of all the + * 3 buffers will be in L[0] + */ + if (PATH_H_POSITION(tb->tb_path, 1) == 0 && + 1 < B_NR_ITEMS(tb->FR[0])) + replace_key(tb, tb->CFL[0], + tb->lkey[0], tb->FR[0], 1); + + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1, + NULL); + leaf_move_items(LEAF_FROM_R_TO_L, tb, + B_NR_ITEMS(tb->R[0]), -1, + NULL); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->R[0]); + + return 0; + } + + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL); + leaf_move_items(LEAF_FROM_L_TO_R, tb, + B_NR_ITEMS(tb->L[0]), -1, NULL); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->L[0]); + + return -1; + } + + RFALSE(tb->rnum[0] != 0, + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + + /* + * a part of contents of S[0] will be in L[0] and + * the rest part of S[0] will be in R[0] + */ + + RFALSE((tb->lnum[0] + tb->rnum[0] < n) || + (tb->lnum[0] + tb->rnum[0] > n + 1), + "PAP-12050: rnum(%d) and lnum(%d) and item " + "number(%d) in S[0] are not consistent", + tb->rnum[0], tb->lnum[0], n); + RFALSE((tb->lnum[0] + tb->rnum[0] == n) && + (tb->lbytes != -1 || tb->rbytes != -1), + "PAP-12055: bad rbytes (%d)/lbytes (%d) " + "parameters when items are not split", + tb->rbytes, tb->lbytes); + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && + (tb->lbytes < 1 || tb->rbytes != -1), + "PAP-12060: bad rbytes (%d)/lbytes (%d) " + "parameters when items are split", + tb->rbytes, tb->lbytes); + + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; +} + +/* + * Balance leaf node in case of delete or cut: insert_size[0] < 0 + * + * lnum, rnum can have values >= -1 + * -1 means that the neighbor must be joined with S + * 0 means that nothing should be done with the neighbor + * >0 means to shift entirely or partly the specified number of items + * to the neighbor + */ +static int balance_leaf_when_delete(struct tree_balance *tb, int flag) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct buffer_info bi; + int n; + struct item_head *ih; + + RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, + "vs- 12000: level: wrong FR %z", tb->FR[0]); + RFALSE(tb->blknum[0] > 1, + "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); + RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), + "PAP-12010: tree can not be empty"); + + ih = item_head(tbS0, item_pos); + buffer_info_init_tbS0(tb, &bi); + + /* Delete or truncate the item */ + + BUG_ON(flag != M_DELETE && flag != M_CUT); + if (flag == M_DELETE) + balance_leaf_when_delete_del(tb); + else /* M_CUT */ + balance_leaf_when_delete_cut(tb); + + + /* + * the rule is that no shifting occurs unless by shifting + * a node can be freed + */ + n = B_NR_ITEMS(tbS0); + + + /* L[0] takes part in balancing */ + if (tb->lnum[0]) + return balance_leaf_when_delete_left(tb); + + if (tb->rnum[0] == -1) { + /* all contents of R[0] and S[0] will be in R[0] */ + leaf_shift_right(tb, n, -1); + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + RFALSE(tb->rnum[0], + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); + return 0; +} + +static unsigned int balance_leaf_insert_left(struct tree_balance *tb, + struct item_head *const ih, + const char * const body) +{ + int ret; + struct buffer_info bi; + int n = B_NR_ITEMS(tb->L[0]); + unsigned body_shift_bytes = 0; + + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { + /* part of new item falls into L[0] */ + int new_item_len, shift; + int version; + + ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); + + /* Calculate item length to insert to S[0] */ + new_item_len = ih_item_len(ih) - tb->lbytes; + + /* Calculate and check item length to insert to L[0] */ + put_ih_item_len(ih, ih_item_len(ih) - new_item_len); + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12080: there is nothing to insert into L[0]: " + "ih_item_len=%d", ih_item_len(ih)); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, + min_t(int, tb->zeroes_num, ih_item_len(ih))); + + version = ih_version(ih); + + /* + * Calculate key component, item length and body to + * insert into S[0] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + + add_le_ih_k_offset(ih, tb->lbytes << shift); + + put_ih_item_len(ih, new_item_len); + if (tb->lbytes > tb->zeroes_num) { + body_shift_bytes = tb->lbytes - tb->zeroes_num; + tb->zeroes_num = 0; + } else + tb->zeroes_num -= tb->lbytes; + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12085: there is nothing to insert into S[0]: " + "ih_item_len=%d", ih_item_len(ih)); + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, + tb->zeroes_num); + tb->insert_size[0] = 0; + tb->zeroes_num = 0; + } + return body_shift_bytes; +} + +static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + + RFALSE(tb->zeroes_num, + "PAP-12090: invalid parameter in case of a directory"); + + /* directory item */ + if (tb->lbytes > tb->pos_in_item) { + /* new directory entry falls into L[0] */ + struct item_head *pasted; + int ret, l_pos_in_item = tb->pos_in_item; + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 entries from given directory item + */ + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); + if (ret && !tb->item_pos) { + pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1); + l_pos_in_item += ih_entry_count(pasted) - + (tb->lbytes - 1); + } + + /* Append given directory entry to directory item */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, + l_pos_in_item, tb->insert_size[0], + body, tb->zeroes_num); + + /* + * previous string prepared space for pasting new entry, + * following string pastes this entry + */ + + /* + * when we have merge directory item, pos_in_item + * has been changed too + */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries(&bi, n + tb->item_pos - ret, + l_pos_in_item, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + tb->insert_size[0] = 0; + } else { + /* new directory item doesn't fall into L[0] */ + /* + * Shift lnum[0]-1 items in whole. Shift lbytes + * directory entries from directory item number lnum[0] + */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } + + /* Calculate new position to append in item body */ + tb->pos_in_item -= tb->lbytes; +} + +static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + int body_shift_bytes = 0; + + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { + balance_leaf_paste_left_shift_dirent(tb, ih, body); + return 0; + } + + RFALSE(tb->lbytes <= 0, + "PAP-12095: there is nothing to shift to L[0]. " + "lbytes=%d", tb->lbytes); + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), + "PAP-12100: incorrect position to paste: " + "item_len=%d, pos_in_item=%d", + ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item); + + /* appended item will be in L[0] in whole */ + if (tb->lbytes >= tb->pos_in_item) { + struct item_head *tbS0_pos_ih, *tbL0_ih; + struct item_head *tbS0_0_ih; + struct reiserfs_key *left_delim_key; + int ret, l_n, version, temp_l; + + tbS0_pos_ih = item_head(tbS0, tb->item_pos); + tbS0_0_ih = item_head(tbS0, 0); + + /* + * this bytes number must be appended + * to the last item of L[h] + */ + l_n = tb->lbytes - tb->pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= l_n; + + RFALSE(tb->insert_size[0] <= 0, + "PAP-12105: there is nothing to paste into " + "L[0]. insert_size=%d", tb->insert_size[0]); + + ret = leaf_shift_left(tb, tb->lnum[0], + ih_item_len(tbS0_pos_ih)); + + tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret); + + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, + ih_item_len(tbL0_ih), l_n, body, + min_t(int, l_n, tb->zeroes_num)); + + /* + * 0-th item in S0 can be only of DIRECT type + * when l_n != 0 + */ + temp_l = l_n; + + RFALSE(ih_item_len(tbS0_0_ih), + "PAP-12106: item length must be 0"); + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, + leaf_key(tb->L[0], n + tb->item_pos - ret)), + "PAP-12107: items must be of the same file"); + + if (is_indirect_le_ih(tbL0_ih)) { + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + temp_l = l_n << shift; + } + /* update key of first item in S0 */ + version = ih_version(tbS0_0_ih); + add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l); + + /* update left delimiting key */ + left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]); + add_le_key_k_offset(version, left_delim_key, temp_l); + + /* + * Calculate new body, position in item and + * insert_size[0] + */ + if (l_n > tb->zeroes_num) { + body_shift_bytes = l_n - tb->zeroes_num; + tb->zeroes_num = 0; + } else + tb->zeroes_num -= l_n; + tb->pos_in_item = 0; + + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, + leaf_key(tb->L[0], + B_NR_ITEMS(tb->L[0]) - 1)) || + !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) || + !op_is_left_mergeable(left_delim_key, tbS0->b_size), + "PAP-12120: item must be merge-able with left " + "neighboring item"); + } else { + /* only part of the appended item will be in L[0] */ + + /* Calculate position in item for append in S[0] */ + tb->pos_in_item -= tb->lbytes; + + RFALSE(tb->pos_in_item <= 0, + "PAP-12125: no place for paste. pos_in_item=%d", + tb->pos_in_item); + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 byte from item number lnum[0] + */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } + return body_shift_bytes; +} + + +/* appended item will be in L[0] in whole */ +static void balance_leaf_paste_left_whole(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + struct item_head *pasted; + int ret; + + /* if we paste into first item of S[0] and it is left mergable */ + if (!tb->item_pos && + op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) { + /* + * then increment pos_in_item by the size of the + * last item in L[0] + */ + pasted = item_head(tb->L[0], n - 1); + if (is_direntry_le_ih(pasted)) + tb->pos_in_item += ih_entry_count(pasted); + else + tb->pos_in_item += ih_item_len(pasted); + } + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 byte from item number lnum[0] + */ + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item, + tb->insert_size[0], body, tb->zeroes_num); + + /* if appended item is directory, paste entry */ + pasted = item_head(tb->L[0], n + tb->item_pos - ret); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, n + tb->item_pos - ret, + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + /* + * if appended item is indirect item, put unformatted node + * into un list + */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->insert_size[0] = 0; + tb->zeroes_num = 0; +} + +static unsigned int balance_leaf_paste_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + /* we must shift the part of the appended item */ + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) + return balance_leaf_paste_left_shift(tb, ih, body); + else + balance_leaf_paste_left_whole(tb, ih, body); + return 0; +} + +/* Shift lnum[0] items from S[0] to the left neighbor L[0] */ +static unsigned int balance_leaf_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) +{ + if (tb->lnum[0] <= 0) + return 0; + + /* new item or it part falls to L[0], shift it too */ + if (tb->item_pos < tb->lnum[0]) { + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + if (flag == M_INSERT) + return balance_leaf_insert_left(tb, ih, body); + else /* M_PASTE */ + return balance_leaf_paste_left(tb, ih, body); + } else + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + return 0; +} + + +static void balance_leaf_insert_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct buffer_info bi; + int ret; + + /* new item or part of it doesn't fall into R[0] */ + if (n - tb->rnum[0] >= tb->item_pos) { + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + return; + } + + /* new item or its part falls to R[0] */ + + /* part of new item falls into R[0] */ + if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { + loff_t old_key_comp, old_len, r_zeroes_number; + const char *r_body; + int version, shift; + loff_t offset; + + leaf_shift_right(tb, tb->rnum[0] - 1, -1); + + version = ih_version(ih); + + /* Remember key component and item length */ + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* + * Calculate key component and item length to insert + * into R[0] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift); + set_le_ih_k_offset(ih, offset); + put_ih_item_len(ih, tb->rbytes); + + /* Insert part of the item into R[0] */ + buffer_info_init_right(tb, &bi); + if ((old_len - tb->rbytes) > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + (old_len - tb->rbytes) - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - + (old_len - tb->rbytes); + tb->zeroes_num -= r_zeroes_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + /* + * Calculate key component and item length to + * insert into S[0] + */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, old_len - tb->rbytes); + + tb->insert_size[0] -= tb->rbytes; + + } else { + /* whole new item falls into R[0] */ + + /* Shift rnum[0]-1 items to R[0] */ + ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + + /* Insert new item into R[0] */ + buffer_info_init_right(tb, &bi); + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1, + ih, body, tb->zeroes_num); + + if (tb->item_pos - n + tb->rnum[0] - 1 == 0) + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + tb->zeroes_num = tb->insert_size[0] = 0; + } +} + + +static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + int entry_count; + + RFALSE(tb->zeroes_num, + "PAP-12145: invalid parameter in case of a directory"); + entry_count = ih_entry_count(item_head(tbS0, tb->item_pos)); + + /* new directory entry falls into R[0] */ + if (entry_count - tb->rbytes < tb->pos_in_item) { + int paste_entry_position; + + RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0], + "PAP-12150: no enough of entries to shift to R[0]: " + "rbytes=%d, entry_count=%d", tb->rbytes, entry_count); + + /* + * Shift rnum[0]-1 items in whole. + * Shift rbytes-1 directory entries from directory + * item number rnum[0] + */ + leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1); + + /* Paste given directory entry to directory item */ + paste_entry_position = tb->pos_in_item - entry_count + + tb->rbytes - 1; + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, 0, paste_entry_position, + tb->insert_size[0], body, tb->zeroes_num); + + /* paste entry */ + leaf_paste_entries(&bi, 0, paste_entry_position, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + /* change delimiting keys */ + if (paste_entry_position == 0) + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + tb->insert_size[0] = 0; + tb->pos_in_item++; + } else { + /* new directory entry doesn't fall into R[0] */ + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + } +} + +static void balance_leaf_paste_right_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n_shift, n_rem, r_zeroes_number, version; + unsigned long temp_rem; + const char *r_body; + struct buffer_info bi; + + /* we append to directory item */ + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { + balance_leaf_paste_right_shift_dirent(tb, ih, body); + return; + } + + /* regular object */ + + /* + * Calculate number of bytes which must be shifted + * from appended item + */ + n_shift = tb->rbytes - tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), + "PAP-12155: invalid position to paste. ih_item_len=%d, " + "pos_in_item=%d", tb->pos_in_item, + ih_item_len(item_head(tbS0, tb->item_pos))); + + leaf_shift_right(tb, tb->rnum[0], n_shift); + + /* + * Calculate number of bytes which must remain in body + * after appending to R[0] + */ + n_rem = tb->insert_size[0] - tb->rbytes; + if (n_rem < 0) + n_rem = 0; + + temp_rem = n_rem; + + version = ih_version(item_head(tb->R[0], 0)); + + if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) { + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + temp_rem = n_rem << shift; + } + + add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem); + add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]), + temp_rem); + + do_balance_mark_internal_dirty(tb, tb->CFR[0], 0); + + /* Append part of body into R[0] */ + buffer_info_init_right(tb, &bi); + if (n_rem > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + n_rem - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - n_rem; + tb->zeroes_num -= r_zeroes_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, + r_body, r_zeroes_number); + + if (is_indirect_le_ih(item_head(tb->R[0], 0))) + set_ih_free_space(item_head(tb->R[0], 0), 0); + + tb->insert_size[0] = n_rem; + if (!n_rem) + tb->pos_in_item++; +} + +static void balance_leaf_paste_right_whole(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct item_head *pasted; + struct buffer_info bi; + + buffer_info_init_right(tb, &bi); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + /* append item in R[0] */ + if (tb->pos_in_item >= 0) { + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0], + tb->pos_in_item, tb->insert_size[0], body, + tb->zeroes_num); + } + + /* paste new entry, if item is directory item */ + pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]); + if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) { + leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0], + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + if (!tb->pos_in_item) { + + RFALSE(tb->item_pos - n + tb->rnum[0], + "PAP-12165: directory item must be first " + "item of node when pasting is in 0th position"); + + /* update delimiting keys */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + } + } + + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + tb->zeroes_num = tb->insert_size[0] = 0; +} + +static void balance_leaf_paste_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* new item doesn't fall into R[0] */ + if (n - tb->rnum[0] > tb->item_pos) { + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + return; + } + + /* pasted item or part of it falls to R[0] */ + + if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1) + /* we must shift the part of the appended item */ + balance_leaf_paste_right_shift(tb, ih, body); + else + /* pasted item in whole falls into R[0] */ + balance_leaf_paste_right_whole(tb, ih, body); +} + +/* shift rnum[0] items from S[0] to the right neighbor R[0] */ +static void balance_leaf_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) +{ + if (tb->rnum[0] <= 0) + return; + + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + if (flag == M_INSERT) + balance_leaf_insert_right(tb, ih, body); + else /* M_PASTE */ + balance_leaf_paste_right(tb, ih, body); +} + +static void balance_leaf_new_nodes_insert(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct buffer_info bi; + int shift; + + /* new item or it part don't falls into S_new[i] */ + if (n - tb->snum[i] >= tb->item_pos) { + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i], tb->sbytes[i], tb->S_new[i]); + return; + } + + /* new item or it's part falls to first new node S_new[i] */ + + /* part of new item falls into S_new[i] */ + if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { + int old_key_comp, old_len, r_zeroes_number; + const char *r_body; + int version; + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, + tb->S_new[i]); + + /* Remember key component and item length */ + version = ih_version(ih); + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* + * Calculate key component and item length to insert + * into S_new[i] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + ((old_len - tb->sbytes[i]) << shift)); + + put_ih_item_len(ih, tb->sbytes[i]); + + /* Insert part of the item into S_new[i] before 0-th item */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + + if ((old_len - tb->sbytes[i]) > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + (old_len - tb->sbytes[i]) - + tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - (old_len - + tb->sbytes[i]); + tb->zeroes_num -= r_zeroes_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); + + /* + * Calculate key component and item length to + * insert into S[i] + */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, old_len - tb->sbytes[i]); + tb->insert_size[0] -= tb->sbytes[i]; + } else { + /* whole new item falls into S_new[i] */ + + /* + * Shift snum[0] - 1 items to S_new[i] + * (sbytes[i] of split item) + */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]); + + /* Insert new item into S_new[i] */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1, + ih, body, tb->zeroes_num); + + tb->zeroes_num = tb->insert_size[0] = 0; + } +} + +/* we append to directory item */ +static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *aux_ih = item_head(tbS0, tb->item_pos); + int entry_count = ih_entry_count(aux_ih); + struct buffer_info bi; + + if (entry_count - tb->sbytes[i] < tb->pos_in_item && + tb->pos_in_item <= entry_count) { + /* new directory entry falls into S_new[i] */ + + RFALSE(!tb->insert_size[0], + "PAP-12215: insert_size is already 0"); + RFALSE(tb->sbytes[i] - 1 >= entry_count, + "PAP-12220: there are no so much entries (%d), only %d", + tb->sbytes[i] - 1, entry_count); + + /* + * Shift snum[i]-1 items in whole. + * Shift sbytes[i] directory entries + * from directory item number snum[i] + */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i] - 1, tb->S_new[i]); + + /* + * Paste given directory entry to + * directory item + */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count + + tb->sbytes[i] - 1, tb->insert_size[0], + body, tb->zeroes_num); + + /* paste new directory entry */ + leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count + + tb->sbytes[i] - 1, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + tb->insert_size[0] = 0; + tb->pos_in_item++; + } else { + /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i], tb->S_new[i]); + } + +} + +static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *aux_ih = item_head(tbS0, tb->item_pos); + int n_shift, n_rem, r_zeroes_number, shift; + const char *r_body; + struct item_head *tmp; + struct buffer_info bi; + + RFALSE(ih, "PAP-12210: ih must be 0"); + + if (is_direntry_le_ih(aux_ih)) { + balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key, + insert_ptr, i); + return; + } + + /* regular object */ + + + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) || + tb->insert_size[0] <= 0, + "PAP-12225: item too short or insert_size <= 0"); + + /* + * Calculate number of bytes which must be shifted from appended item + */ + n_shift = tb->sbytes[i] - tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift, + tb->S_new[i]); + + /* + * Calculate number of bytes which must remain in body after + * append to S_new[i] + */ + n_rem = tb->insert_size[0] - tb->sbytes[i]; + if (n_rem < 0) + n_rem = 0; + + /* Append part of body into S_new[0] */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + if (n_rem > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + n_rem - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - n_rem; + tb->zeroes_num -= r_zeroes_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, + r_body, r_zeroes_number); + + tmp = item_head(tb->S_new[i], 0); + shift = 0; + if (is_indirect_le_ih(tmp)) { + set_ih_free_space(tmp, 0); + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + } + add_le_ih_k_offset(tmp, n_rem << shift); + + tb->insert_size[0] = n_rem; + if (!n_rem) + tb->pos_in_item++; +} + +static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) + +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + int leaf_mi; + struct item_head *pasted; + struct buffer_info bi; + +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih_check = item_head(tbS0, tb->item_pos); + + if (!is_direntry_le_ih(ih_check) && + (tb->pos_in_item != ih_item_len(ih_check) || + tb->insert_size[0] <= 0)) + reiserfs_panic(tb->tb_sb, + "PAP-12235", + "pos_in_item must be equal to ih_item_len"); +#endif + + leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i], tb->S_new[i]); + + RFALSE(leaf_mi, + "PAP-12240: unexpected value returned by leaf_move_items (%d)", + leaf_mi); + + /* paste into item */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i], + tb->pos_in_item, tb->insert_size[0], + body, tb->zeroes_num); + + pasted = item_head(tb->S_new[i], tb->item_pos - n + + tb->snum[i]); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i], + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->zeroes_num = tb->insert_size[0] = 0; + +} +static void balance_leaf_new_nodes_paste(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* pasted item doesn't fall into S_new[i] */ + if (n - tb->snum[i] > tb->item_pos) { + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i], tb->sbytes[i], tb->S_new[i]); + return; + } + + /* pasted item or part if it falls to S_new[i] */ + + if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1) + /* we must shift part of the appended item */ + balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key, + insert_ptr, i); + else + /* item falls wholly into S_new[i] */ + balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key, + insert_ptr, i); +} + +/* Fill new nodes that appear in place of S[0] */ +static void balance_leaf_new_nodes(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int flag) +{ + int i; + for (i = tb->blknum[0] - 2; i >= 0; i--) { + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + RFALSE(!tb->snum[i], + "PAP-12200: snum[%d] == %d. Must be > 0", i, + tb->snum[i]); + + /* here we shift from S to S_new nodes */ + + tb->S_new[i] = get_FEB(tb); + + /* initialized block type and tree level */ + set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL); + + if (flag == M_INSERT) + balance_leaf_new_nodes_insert(tb, ih, body, insert_key, + insert_ptr, i); + else /* M_PASTE */ + balance_leaf_new_nodes_paste(tb, ih, body, insert_key, + insert_ptr, i); + + memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE); + insert_ptr[i] = tb->S_new[i]; + + RFALSE(!buffer_journaled(tb->S_new[i]) + || buffer_journal_dirty(tb->S_new[i]) + || buffer_dirty(tb->S_new[i]), + "PAP-12247: S_new[%d] : (%b)", + i, tb->S_new[i]); + } +} + +static void balance_leaf_finish_node_insert(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + buffer_info_init_tbS0(tb, &bi); + leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num); + + /* If we insert the first key change the delimiting key */ + if (tb->item_pos == 0) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + + } +} + +static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *pasted = item_head(tbS0, tb->item_pos); + struct buffer_info bi; + + if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) { + RFALSE(!tb->insert_size[0], + "PAP-12260: insert_size is 0 already"); + + /* prepare space */ + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item, + tb->insert_size[0], body, tb->zeroes_num); + + /* paste entry */ + leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + if (!tb->item_pos && !tb->pos_in_item) { + RFALSE(!tb->CFL[0] || !tb->L[0], + "PAP-12270: CFL[0]/L[0] must be specified"); + if (tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); + } + + tb->insert_size[0] = 0; + } +} + +static void balance_leaf_finish_node_paste(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + struct item_head *pasted = item_head(tbS0, tb->item_pos); + + /* when directory, may be new entry already pasted */ + if (is_direntry_le_ih(pasted)) { + balance_leaf_finish_node_paste_dirent(tb, ih, body); + return; + } + + /* regular object */ + + if (tb->pos_in_item == ih_item_len(pasted)) { + RFALSE(tb->insert_size[0] <= 0, + "PAP-12275: insert size must not be %d", + tb->insert_size[0]); + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos, + tb->pos_in_item, tb->insert_size[0], body, + tb->zeroes_num); + + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->insert_size[0] = 0; + } +#ifdef CONFIG_REISERFS_CHECK + else if (tb->insert_size[0]) { + print_cur_tb("12285"); + reiserfs_panic(tb->tb_sb, "PAP-12285", + "insert_size must be 0 (%d)", tb->insert_size[0]); + } +#endif +} + +/* + * if the affected item was not wholly shifted then we + * perform all necessary operations on that part or whole + * of the affected item which remains in S + */ +static void balance_leaf_finish_node(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) +{ + /* if we must insert or append into buffer S[0] */ + if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { + if (flag == M_INSERT) + balance_leaf_finish_node_insert(tb, ih, body); + else /* M_PASTE */ + balance_leaf_finish_node_paste(tb, ih, body); + } +} + +/** + * balance_leaf - reiserfs tree balancing algorithm + * @tb: tree balance state + * @ih: item header of inserted item (little endian) + * @body: body of inserted item or bytes to paste + * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance) + * passed back: + * @insert_key: key to insert new nodes + * @insert_ptr: array of nodes to insert at the next level + * + * In our processing of one level we sometimes determine what must be + * inserted into the next higher level. This insertion consists of a + * key or two keys and their corresponding pointers. + */ +static int balance_leaf(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag, + struct item_head *insert_key, + struct buffer_head **insert_ptr) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + + PROC_INFO_INC(tb->tb_sb, balance_at[0]); + + /* Make balance in case insert_size[0] < 0 */ + if (tb->insert_size[0] < 0) + return balance_leaf_when_delete(tb, flag); + + tb->item_pos = PATH_LAST_POSITION(tb->tb_path), + tb->pos_in_item = tb->tb_path->pos_in_item, + tb->zeroes_num = 0; + if (flag == M_INSERT && !body) + tb->zeroes_num = ih_item_len(ih); + + /* + * for indirect item pos_in_item is measured in unformatted node + * pointers. Recalculate to bytes + */ + if (flag != M_INSERT + && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) + tb->pos_in_item *= UNFM_P_SIZE; + + body += balance_leaf_left(tb, ih, body, flag); + + /* tb->lnum[0] > 0 */ + /* Calculate new item position */ + tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); + + balance_leaf_right(tb, ih, body, flag); + + /* tb->rnum[0] > 0 */ + RFALSE(tb->blknum[0] > 3, + "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); + RFALSE(tb->blknum[0] < 0, + "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]); + + /* + * if while adding to a node we discover that it is possible to split + * it in two, and merge the left part into the left neighbor and the + * right part into the right neighbor, eliminating the node + */ + if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + + RFALSE(!tb->lnum[0] || !tb->rnum[0], + "PAP-12190: lnum and rnum must not be zero"); + /* + * if insertion was done before 0-th position in R[0], right + * delimiting key of the tb->L[0]'s and left delimiting key are + * not set correctly + */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic(tb->tb_sb, "vs-12195", + "CFR not initialized"); + copy_key(internal_key(tb->CFL[0], tb->lkey[0]), + internal_key(tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); + } + + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag); + + balance_leaf_finish_node(tb, ih, body, flag); + +#ifdef CONFIG_REISERFS_CHECK + if (flag == M_PASTE && tb->insert_size[0]) { + print_cur_tb("12290"); + reiserfs_panic(tb->tb_sb, + "PAP-12290", "insert_size is still not 0 (%d)", + tb->insert_size[0]); + } +#endif + + /* Leaf level of the tree is balanced (end of balance_leaf) */ + return 0; +} + +/* Make empty node */ +void make_empty_node(struct buffer_info *bi) +{ + struct block_head *blkh; + + RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL"); + + blkh = B_BLK_HEAD(bi->bi_bh); + set_blkh_nr_item(blkh, 0); + set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh)); + + if (bi->bi_parent) + B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */ +} + +/* Get first empty buffer */ +struct buffer_head *get_FEB(struct tree_balance *tb) +{ + int i; + struct buffer_info bi; + + for (i = 0; i < MAX_FEB_SIZE; i++) + if (tb->FEB[i] != NULL) + break; + + if (i == MAX_FEB_SIZE) + reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty"); + + buffer_info_init_bh(tb, &bi, tb->FEB[i]); + make_empty_node(&bi); + set_buffer_uptodate(tb->FEB[i]); + tb->used[i] = tb->FEB[i]; + tb->FEB[i] = NULL; + + return tb->used[i]; +} + +/* This is now used because reiserfs_free_block has to be able to schedule. */ +static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) +{ + int i; + + if (buffer_dirty(bh)) + reiserfs_warning(tb->tb_sb, "reiserfs-12320", + "called with dirty buffer"); + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) + if (!tb->thrown[i]) { + tb->thrown[i] = bh; + get_bh(bh); /* free_thrown puts this */ + return; + } + reiserfs_warning(tb->tb_sb, "reiserfs-12321", + "too many thrown buffers"); +} + +static void free_thrown(struct tree_balance *tb) +{ + int i; + b_blocknr_t blocknr; + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) { + if (tb->thrown[i]) { + blocknr = tb->thrown[i]->b_blocknr; + if (buffer_dirty(tb->thrown[i])) + reiserfs_warning(tb->tb_sb, "reiserfs-12322", + "called with dirty buffer %d", + blocknr); + brelse(tb->thrown[i]); /* incremented in store_thrown */ + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + } +} + +void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh) +{ + struct block_head *blkh; + blkh = B_BLK_HEAD(bh); + set_blkh_level(blkh, FREE_LEVEL); + set_blkh_nr_item(blkh, 0); + + clear_buffer_dirty(bh); + store_thrown(tb, bh); +} + +/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/ +void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest, + struct buffer_head *src, int n_src) +{ + + RFALSE(dest == NULL || src == NULL, + "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)", + src, dest); + RFALSE(!B_IS_KEYS_LEVEL(dest), + "vs-12310: invalid level (%z) for destination buffer. dest must be leaf", + dest); + RFALSE(n_dest < 0 || n_src < 0, + "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest); + RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src), + "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big", + n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); + + if (B_IS_ITEMS_LEVEL(src)) + /* source buffer contains leaf node */ + memcpy(internal_key(dest, n_dest), item_head(src, n_src), + KEY_SIZE); + else + memcpy(internal_key(dest, n_dest), internal_key(src, n_src), + KEY_SIZE); + + do_balance_mark_internal_dirty(tb, dest, 0); +} + +int get_left_neighbor_position(struct tree_balance *tb, int h) +{ + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); + + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL, + "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", + h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h)); + + if (Sh_position == 0) + return B_NR_ITEMS(tb->FL[h]); + else + return Sh_position - 1; +} + +int get_right_neighbor_position(struct tree_balance *tb, int h) +{ + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); + + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL, + "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", + h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]); + + if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h))) + return 0; + else + return Sh_position + 1; +} + +#ifdef CONFIG_REISERFS_CHECK + +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); +static void check_internal_node(struct super_block *s, struct buffer_head *bh, + char *mes) +{ + struct disk_child *dc; + int i; + + RFALSE(!bh, "PAP-12336: bh == 0"); + + if (!bh || !B_IS_IN_TREE(bh)) + return; + + RFALSE(!buffer_dirty(bh) && + !(buffer_journaled(bh) || buffer_journal_dirty(bh)), + "PAP-12337: buffer (%b) must be dirty", bh); + dc = B_N_CHILD(bh, 0); + + for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) { + if (!is_reusable(s, dc_block_number(dc), 1)) { + print_cur_tb(mes); + reiserfs_panic(s, "PAP-12338", + "invalid child pointer %y in %b", + dc, bh); + } + } +} + +static int locked_or_not_in_tree(struct tree_balance *tb, + struct buffer_head *bh, char *which) +{ + if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) || + !B_IS_IN_TREE(bh)) { + reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh); + return 1; + } + return 0; +} + +static int check_before_balancing(struct tree_balance *tb) +{ + int retval = 0; + + if (REISERFS_SB(tb->tb_sb)->cur_tb) { + reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " + "occurred based on cur_tb not being null at " + "this point in code. do_balance cannot properly " + "handle concurrent tree accesses on a same " + "mount point."); + } + + /* + * double check that buffers that we will modify are unlocked. + * (fix_nodes should already have prepped all of these for us). + */ + if (tb->lnum[0]) { + retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]"); + retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]"); + check_leaf(tb->L[0]); + } + if (tb->rnum[0]) { + retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]"); + retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]"); + check_leaf(tb->R[0]); + } + retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path), + "S[0]"); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); + + return retval; +} + +static void check_after_balance_leaf(struct tree_balance *tb) +{ + if (tb->lnum[0]) { + if (B_FREE_SPACE(tb->L[0]) != + MAX_CHILD_SIZE(tb->L[0]) - + dc_size(B_N_CHILD + (tb->FL[0], get_left_neighbor_position(tb, 0)))) { + print_cur_tb("12221"); + reiserfs_panic(tb->tb_sb, "PAP-12355", + "shift to left was incorrect"); + } + } + if (tb->rnum[0]) { + if (B_FREE_SPACE(tb->R[0]) != + MAX_CHILD_SIZE(tb->R[0]) - + dc_size(B_N_CHILD + (tb->FR[0], get_right_neighbor_position(tb, 0)))) { + print_cur_tb("12222"); + reiserfs_panic(tb->tb_sb, "PAP-12360", + "shift to right was incorrect"); + } + } + if (PATH_H_PBUFFER(tb->tb_path, 1) && + (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) != + (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1)))))) { + int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)); + int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, + 1)))); + print_cur_tb("12223"); + reiserfs_warning(tb->tb_sb, "reiserfs-12363", + "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " + "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", + left, + MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)), + PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1), + dc_size(B_N_CHILD + (PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1))), + right); + reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect"); + } +} + +static void check_leaf_level(struct tree_balance *tb) +{ + check_leaf(tb->L[0]); + check_leaf(tb->R[0]); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); +} + +static void check_internal_levels(struct tree_balance *tb) +{ + int h; + + /* check all internal nodes */ + for (h = 1; tb->insert_size[h]; h++) { + check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h), + "BAD BUFFER ON PATH"); + if (tb->lnum[h]) + check_internal_node(tb->tb_sb, tb->L[h], "BAD L"); + if (tb->rnum[h]) + check_internal_node(tb->tb_sb, tb->R[h], "BAD R"); + } + +} + +#endif + +/* + * Now we have all of the buffers that must be used in balancing of + * the tree. We rely on the assumption that schedule() will not occur + * while do_balance works. ( Only interrupt handlers are acceptable.) + * We balance the tree according to the analysis made before this, + * using buffers already obtained. For SMP support it will someday be + * necessary to add ordered locking of tb. + */ + +/* + * Some interesting rules of balancing: + * we delete a maximum of two nodes per level per balancing: we never + * delete R, when we delete two of three nodes L, S, R then we move + * them into R. + * + * we only delete L if we are deleting two nodes, if we delete only + * one node we delete S + * + * if we shift leaves then we shift as much as we can: this is a + * deliberate policy of extremism in node packing which results in + * higher average utilization after repeated random balance operations + * at the cost of more memory copies and more balancing as a result of + * small insertions to full nodes. + * + * if we shift internal nodes we try to evenly balance the node + * utilization, with consequent less balancing at the cost of lower + * utilization. + * + * one could argue that the policy for directories in leaves should be + * that of internal nodes, but we will wait until another day to + * evaluate this.... It would be nice to someday measure and prove + * these assumptions as to what is optimal.... + */ + +static inline void do_balance_starts(struct tree_balance *tb) +{ + /* use print_cur_tb() to see initial state of struct tree_balance */ + + /* store_print_tb (tb); */ + + /* do not delete, just comment it out */ + /* + print_tb(flag, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item, tb, "check"); + */ + RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); +#ifdef CONFIG_REISERFS_CHECK + REISERFS_SB(tb->tb_sb)->cur_tb = tb; +#endif +} + +static inline void do_balance_completed(struct tree_balance *tb) +{ + +#ifdef CONFIG_REISERFS_CHECK + check_leaf_level(tb); + check_internal_levels(tb); + REISERFS_SB(tb->tb_sb)->cur_tb = NULL; +#endif + + /* + * reiserfs_free_block is no longer schedule safe. So, we need to + * put the buffers we want freed on the thrown list during do_balance, + * and then free them now + */ + + REISERFS_SB(tb->tb_sb)->s_do_balance++; + + /* release all nodes hold to perform the balancing */ + unfix_nodes(tb); + + free_thrown(tb); +} + +/* + * do_balance - balance the tree + * + * @tb: tree_balance structure + * @ih: item header of inserted item + * @body: body of inserted item or bytes to paste + * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste + * + * Cut means delete part of an item (includes removing an entry from a + * directory). + * + * Delete means delete whole item. + * + * Insert means add a new item into the tree. + * + * Paste means to append to the end of an existing file or to + * insert a directory entry. + */ +void do_balance(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag) +{ + int child_pos; /* position of a child node in its parent */ + int h; /* level of the tree being processed */ + + /* + * in our processing of one level we sometimes determine what + * must be inserted into the next higher level. This insertion + * consists of a key or two keys and their corresponding + * pointers + */ + struct item_head insert_key[2]; + + /* inserted node-ptrs for the next level */ + struct buffer_head *insert_ptr[2]; + + tb->tb_mode = flag; + tb->need_balance_dirty = 0; + + if (FILESYSTEM_CHANGED_TB(tb)) { + reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has " + "changed"); + } + /* if we have no real work to do */ + if (!tb->insert_size[0]) { + reiserfs_warning(tb->tb_sb, "PAP-12350", + "insert_size == 0, mode == %c", flag); + unfix_nodes(tb); + return; + } + + atomic_inc(&fs_generation(tb->tb_sb)); + do_balance_starts(tb); + + /* + * balance_leaf returns 0 except if combining L R and S into + * one node. see balance_internal() for explanation of this + * line of code. + */ + child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) + + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr); + +#ifdef CONFIG_REISERFS_CHECK + check_after_balance_leaf(tb); +#endif + + /* Balance internal level of the tree. */ + for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++) + child_pos = balance_internal(tb, h, child_pos, insert_key, + insert_ptr); + + do_balance_completed(tb); +} diff --git a/kernel/fs/reiserfs/file.c b/kernel/fs/reiserfs/file.c new file mode 100644 index 000000000..96a1bcf33 --- /dev/null +++ b/kernel/fs/reiserfs/file.c @@ -0,0 +1,270 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h" +#include +#include +#include +#include +#include +#include +#include + +/* + * We pack the tails of files on file close, not at the time they are written. + * This implies an unnecessary copy of the tail and an unnecessary indirect item + * insertion/balancing, for files that are written in one write. + * It avoids unnecessary tail packings (balances) for files that are written in + * multiple writes and are small enough to have tails. + * + * file_release is called by the VFS layer when the file is closed. If + * this is the last open file descriptor, and the file + * small enough to have a tail, and the tail is currently in an + * unformatted node, the tail is converted back into a direct item. + * + * We use reiserfs_truncate_file to pack the tail, since it already has + * all the conditions coded. + */ +static int reiserfs_file_release(struct inode *inode, struct file *filp) +{ + + struct reiserfs_transaction_handle th; + int err; + int jbegin_failure = 0; + + BUG_ON(!S_ISREG(inode->i_mode)); + + if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1)) + return 0; + + mutex_lock(&REISERFS_I(inode)->tailpack); + + if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) { + mutex_unlock(&REISERFS_I(inode)->tailpack); + return 0; + } + + /* fast out for when nothing needs to be done */ + if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || + !tail_has_to_be_packed(inode)) && + REISERFS_I(inode)->i_prealloc_count <= 0) { + mutex_unlock(&REISERFS_I(inode)->tailpack); + return 0; + } + + reiserfs_write_lock(inode->i_sb); + /* + * freeing preallocation only involves relogging blocks that + * are already in the current transaction. preallocation gets + * freed at the end of each transaction, so it is impossible for + * us to log any additional blocks (including quota blocks) + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) { + /* + * uh oh, we can't allow the inode to go away while there + * is still preallocation blocks pending. Try to join the + * aborted transaction + */ + jbegin_failure = err; + err = journal_join_abort(&th, inode->i_sb); + + if (err) { + /* + * hmpf, our choices here aren't good. We can pin + * the inode which will disallow unmount from ever + * happening, we can do nothing, which will corrupt + * random memory on unmount, or we can forcibly + * remove the file from the preallocation list, which + * will leak blocks on disk. Lets pin the inode + * and let the admin know what is going on. + */ + igrab(inode); + reiserfs_warning(inode->i_sb, "clm-9001", + "pinning inode %lu because the " + "preallocation can't be freed", + inode->i_ino); + goto out; + } + } + reiserfs_update_inode_transaction(inode); + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc(&th, inode); +#endif + err = journal_end(&th); + + /* copy back the error code from journal_begin */ + if (!err) + err = jbegin_failure; + + if (!err && + (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && + tail_has_to_be_packed(inode)) { + + /* + * if regular file is released by last holder and it has been + * appended (we append by unformatted node only) or its direct + * item(s) had to be converted, then it may have to be + * indirect2direct converted + */ + err = reiserfs_truncate_file(inode, 0); + } +out: + reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&REISERFS_I(inode)->tailpack); + return err; +} + +static int reiserfs_file_open(struct inode *inode, struct file *file) +{ + int err = dquot_file_open(inode, file); + + /* somebody might be tailpacking on final close; wait for it */ + if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) { + mutex_lock(&REISERFS_I(inode)->tailpack); + atomic_inc(&REISERFS_I(inode)->openers); + mutex_unlock(&REISERFS_I(inode)->tailpack); + } + return err; +} + +void reiserfs_vfs_truncate_file(struct inode *inode) +{ + mutex_lock(&REISERFS_I(inode)->tailpack); + reiserfs_truncate_file(inode, 1); + mutex_unlock(&REISERFS_I(inode)->tailpack); +} + +/* Sync a reiserfs file. */ + +/* + * FIXME: sync_mapping_buffers() never has anything to sync. Can + * be removed... + */ + +static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + int barrier_done; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + BUG_ON(!S_ISREG(inode->i_mode)); + err = sync_mapping_buffers(inode->i_mapping); + reiserfs_write_lock(inode->i_sb); + barrier_done = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + mutex_unlock(&inode->i_mutex); + if (barrier_done < 0) + return barrier_done; + return (err < 0) ? -EIO : 0; +} + +/* taken fs/buffer.c:__block_commit_write */ +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + int new; + int logit = reiserfs_file_data_log(inode); + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + struct reiserfs_transaction_handle th; + int ret = 0; + + th.t_trans_id = 0; + blocksize = 1 << inode->i_blkbits; + + if (logit) { + reiserfs_write_lock(s); + ret = journal_begin(&th, s, bh_per_page + 1); + if (ret) + goto drop_write_lock; + reiserfs_update_inode_transaction(inode); + } + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + new = buffer_new(bh); + clear_buffer_new(bh); + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + if (logit) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, bh); + } else if (!buffer_dirty(bh)) { + mark_buffer_dirty(bh); + /* + * do data=ordered on any page past the end + * of file and any buffer marked BH_New. + */ + if (reiserfs_data_ordered(inode->i_sb) && + (new || page->index >= i_size_index)) { + reiserfs_add_ordered_list(inode, bh); + } + } + } + } + if (logit) { + ret = journal_end(&th); +drop_write_lock: + reiserfs_write_unlock(s); + } + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return ret; +} + +const struct file_operations reiserfs_file_operations = { + .unlocked_ioctl = reiserfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = reiserfs_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = reiserfs_file_open, + .release = reiserfs_file_release, + .fsync = reiserfs_sync_file, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations reiserfs_file_inode_operations = { + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, +}; diff --git a/kernel/fs/reiserfs/fix_node.c b/kernel/fs/reiserfs/fix_node.c new file mode 100644 index 000000000..6b0ddb2a9 --- /dev/null +++ b/kernel/fs/reiserfs/fix_node.c @@ -0,0 +1,2825 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include + +/* + * To make any changes in the tree we find a node that contains item + * to be changed/deleted or position in the node we insert a new item + * to. We call this node S. To do balancing we need to decide what we + * will shift to left/right neighbor, or to a new node, where new item + * will be etc. To make this analysis simpler we build virtual + * node. Virtual node is an array of items, that will replace items of + * node S. (For instance if we are going to delete an item, virtual + * node does not contain it). Virtual node keeps information about + * item sizes and types, mergeability of first and last items, sizes + * of all entries in directory item. We use this array of items when + * calculating what we can shift to neighbors and how many nodes we + * have to have if we do not any shiftings, if we shift to left/right + * neighbor or to both. + */ + +/* + * Takes item number in virtual node, returns number of item + * that it has in source buffer + */ +static inline int old_item_num(int new_num, int affected_item_num, int mode) +{ + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) + return new_num; + + if (mode == M_INSERT) { + + RFALSE(new_num == 0, + "vs-8005: for INSERT mode and item number of inserted item"); + + return new_num - 1; + } + + RFALSE(mode != M_DELETE, + "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", + mode); + /* delete mode */ + return new_num + 1; +} + +static void create_virtual_node(struct tree_balance *tb, int h) +{ + struct item_head *ih; + struct virtual_node *vn = tb->tb_vn; + int new_num; + struct buffer_head *Sh; /* this comes from tb->S[h] */ + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + + /* size of changed node */ + vn->vn_size = + MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h]; + + /* for internal nodes array if virtual items is not created */ + if (h) { + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); + return; + } + + /* number of items in virtual node */ + vn->vn_nr_item = + B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) - + ((vn->vn_mode == M_DELETE) ? 1 : 0); + + /* first virtual item */ + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); + memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item)); + vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); + + /* first item in the node */ + ih = item_head(Sh, 0); + + /* define the mergeability for 0-th item (if it is not being deleted) */ + if (op_is_left_mergeable(&ih->ih_key, Sh->b_size) + && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; + + /* + * go through all items that remain in the virtual + * node (except for the new (inserted) one) + */ + for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { + int j; + struct virtual_item *vi = vn->vn_vi + new_num; + int is_affected = + ((new_num != vn->vn_affected_item_num) ? 0 : 1); + + if (is_affected && vn->vn_mode == M_INSERT) + continue; + + /* get item number in source node */ + j = old_item_num(new_num, vn->vn_affected_item_num, + vn->vn_mode); + + vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; + vi->vi_ih = ih + j; + vi->vi_item = ih_item_body(Sh, ih + j); + vi->vi_uarea = vn->vn_free_ptr; + + /* + * FIXME: there is no check that item operation did not + * consume too much memory + */ + vn->vn_free_ptr += + op_create_vi(vn, vi, is_affected, tb->insert_size[0]); + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) + reiserfs_panic(tb->tb_sb, "vs-8030", + "virtual node space consumed"); + + if (!is_affected) + /* this is not being changed */ + continue; + + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; + /* pointer to data which is going to be pasted */ + vi->vi_new_data = vn->vn_data; + } + } + + /* virtual inserted item is not defined yet */ + if (vn->vn_mode == M_INSERT) { + struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num; + + RFALSE(vn->vn_ins_ih == NULL, + "vs-8040: item header of inserted item is not specified"); + vi->vi_item_len = tb->insert_size[0]; + vi->vi_ih = vn->vn_ins_ih; + vi->vi_item = vn->vn_data; + vi->vi_uarea = vn->vn_free_ptr; + + op_create_vi(vn, vi, 0 /*not pasted or cut */ , + tb->insert_size[0]); + } + + /* + * set right merge flag we take right delimiting key and + * check whether it is a mergeable item + */ + if (tb->CFR[0]) { + struct reiserfs_key *key; + + key = internal_key(tb->CFR[0], tb->rkey[0]); + if (op_is_left_mergeable(key, Sh->b_size) + && (vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) + vn->vn_vi[vn->vn_nr_item - 1].vi_type |= + VI_TYPE_RIGHT_MERGEABLE; + +#ifdef CONFIG_REISERFS_CHECK + if (op_is_left_mergeable(key, Sh->b_size) && + !(vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { + /* + * we delete last item and it could be merged + * with right neighbor's first item + */ + if (! + (B_NR_ITEMS(Sh) == 1 + && is_direntry_le_ih(item_head(Sh, 0)) + && ih_entry_count(item_head(Sh, 0)) == 1)) { + /* + * node contains more than 1 item, or item + * is not directory item, or this item + * contains more than 1 entry + */ + print_block(Sh, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "vs-8045", + "rdkey %k, affected item==%d " + "(mode==%c) Must be %c", + key, vn->vn_affected_item_num, + vn->vn_mode, M_DELETE); + } + } +#endif + + } +} + +/* + * Using virtual node check, how many items can be + * shifted to left neighbor + */ +static void check_left(struct tree_balance *tb, int h, int cur_free) +{ + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); + + /* internal level */ + if (h > 0) { + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space or nothing to move */ + tb->lnum[h] = 0; + tb->lbytes = -1; + return; + } + + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8055: parent does not exist or invalid"); + + vi = vn->vn_vi; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) { + /* all contents of S[0] fits into L[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8055: invalid mode or balance condition failed"); + + tb->lnum[0] = vn->vn_nr_item; + tb->lbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* first item may be merge with last item in left neighbor */ + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) + d_size = -((int)IH_SIZE), ih_size = 0; + + tb->lnum[0] = 0; + for (i = 0; i < vn->vn_nr_item; + i++, ih_size = IH_SIZE, d_size = 0, vi++) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->lnum[0]++; + continue; + } + + /* the item cannot be shifted entirely, try to split it */ + /* + * check whether L[0] can hold ih and at least one byte + * of the item body + */ + + /* cannot shift even a part of the current item */ + if (cur_free <= ih_size) { + tb->lbytes = -1; + return; + } + cur_free -= ih_size; + + tb->lbytes = op_check_left(vi, cur_free, 0, 0); + if (tb->lbytes != -1) + /* count partially shifted item */ + tb->lnum[0]++; + + break; + } + + return; +} + +/* + * Using virtual node check, how many items can be + * shifted to right neighbor + */ +static void check_right(struct tree_balance *tb, int h, int cur_free) +{ + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8070: cur_free < 0"); + + /* internal level */ + if (h > 0) { + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space */ + tb->rnum[h] = 0; + tb->rbytes = -1; + return; + } + + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8075: parent does not exist or invalid"); + + vi = vn->vn_vi + vn->vn_nr_item - 1; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) { + /* all contents of S[0] fits into R[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8080: invalid mode or balance condition failed"); + + tb->rnum[h] = vn->vn_nr_item; + tb->rbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* last item may be merge with first item in right neighbor */ + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) + d_size = -(int)IH_SIZE, ih_size = 0; + + tb->rnum[0] = 0; + for (i = vn->vn_nr_item - 1; i >= 0; + i--, d_size = 0, ih_size = IH_SIZE, vi--) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->rnum[0]++; + continue; + } + + /* + * check whether R[0] can hold ih and at least one + * byte of the item body + */ + + /* cannot shift even a part of the current item */ + if (cur_free <= ih_size) { + tb->rbytes = -1; + return; + } + + /* + * R[0] can hold the header of the item and at least + * one byte of its body + */ + cur_free -= ih_size; /* cur_free is still > 0 */ + + tb->rbytes = op_check_right(vi, cur_free); + if (tb->rbytes != -1) + /* count partially shifted item */ + tb->rnum[0]++; + + break; + } + + return; +} + +/* + * from - number of items, which are shifted to left neighbor entirely + * to - number of item, which are shifted to right neighbor entirely + * from_bytes - number of bytes of boundary item (or directory entries) + * which are shifted to left neighbor + * to_bytes - number of bytes of boundary item (or directory entries) + * which are shifted to right neighbor + */ +static int get_num_ver(int mode, struct tree_balance *tb, int h, + int from, int from_bytes, + int to, int to_bytes, short *snum012, int flow) +{ + int i; + int cur_free; + int units; + struct virtual_node *vn = tb->tb_vn; + int total_node_size, max_node_size, current_item_size; + int needed_nodes; + + /* position of item we start filling node from */ + int start_item; + + /* position of item we finish filling node by */ + int end_item; + + /* + * number of first bytes (entries for directory) of start_item-th item + * we do not include into node that is being filled + */ + int start_bytes; + + /* + * number of last bytes (entries for directory) of end_item-th item + * we do node include into node that is being filled + */ + int end_bytes; + + /* + * these are positions in virtual item of items, that are split + * between S[0] and S1new and S1new and S2new + */ + int split_item_positions[2]; + + split_item_positions[0] = -1; + split_item_positions[1] = -1; + + /* + * We only create additional nodes if we are in insert or paste mode + * or we are in replace mode at the internal level. If h is 0 and + * the mode is M_REPLACE then in fix_nodes we change the mode to + * paste or insert before we get here in the code. + */ + RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), + "vs-8100: insert_size < 0 in overflow"); + + max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); + + /* + * snum012 [0-2] - number of items, that lay + * to S[0], first new node and second new node + */ + snum012[3] = -1; /* s1bytes */ + snum012[4] = -1; /* s2bytes */ + + /* internal level */ + if (h > 0) { + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); + if (i == max_node_size) + return 1; + return (i / max_node_size + 1); + } + + /* leaf level */ + needed_nodes = 1; + total_node_size = 0; + cur_free = max_node_size; + + /* start from 'from'-th item */ + start_item = from; + /* skip its first 'start_bytes' units */ + start_bytes = ((from_bytes != -1) ? from_bytes : 0); + + /* last included item is the 'end_item'-th one */ + end_item = vn->vn_nr_item - to - 1; + /* do not count last 'end_bytes' units of 'end_item'-th item */ + end_bytes = (to_bytes != -1) ? to_bytes : 0; + + /* + * go through all item beginning from the start_item-th item + * and ending by the end_item-th item. Do not count first + * 'start_bytes' units of 'start_item'-th item and last + * 'end_bytes' of 'end_item'-th item + */ + for (i = start_item; i <= end_item; i++) { + struct virtual_item *vi = vn->vn_vi + i; + int skip_from_end = ((i == end_item) ? end_bytes : 0); + + RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed"); + + /* get size of current item */ + current_item_size = vi->vi_item_len; + + /* + * do not take in calculation head part (from_bytes) + * of from-th item + */ + current_item_size -= + op_part_size(vi, 0 /*from start */ , start_bytes); + + /* do not take in calculation tail part of last item */ + current_item_size -= + op_part_size(vi, 1 /*from end */ , skip_from_end); + + /* if item fits into current node entierly */ + if (total_node_size + current_item_size <= max_node_size) { + snum012[needed_nodes - 1]++; + total_node_size += current_item_size; + start_bytes = 0; + continue; + } + + /* + * virtual item length is longer, than max size of item in + * a node. It is impossible for direct item + */ + if (current_item_size > max_node_size) { + RFALSE(is_direct_le_ih(vi->vi_ih), + "vs-8110: " + "direct item length is %d. It can not be longer than %d", + current_item_size, max_node_size); + /* we will try to split it */ + flow = 1; + } + + /* as we do not split items, take new node and continue */ + if (!flow) { + needed_nodes++; + i--; + total_node_size = 0; + continue; + } + + /* + * calculate number of item units which fit into node being + * filled + */ + { + int free_space; + + free_space = max_node_size - total_node_size - IH_SIZE; + units = + op_check_left(vi, free_space, start_bytes, + skip_from_end); + /* + * nothing fits into current node, take new + * node and continue + */ + if (units == -1) { + needed_nodes++, i--, total_node_size = 0; + continue; + } + } + + /* something fits into the current node */ + start_bytes += units; + snum012[needed_nodes - 1 + 3] = units; + + if (needed_nodes > 2) + reiserfs_warning(tb->tb_sb, "vs-8111", + "split_item_position is out of range"); + snum012[needed_nodes - 1]++; + split_item_positions[needed_nodes - 1] = i; + needed_nodes++; + /* continue from the same item with start_bytes != -1 */ + start_item = i; + i--; + total_node_size = 0; + } + + /* + * sum012[4] (if it is not -1) contains number of units of which + * are to be in S1new, snum012[3] - to be in S0. They are supposed + * to be S1bytes and S2bytes correspondingly, so recalculate + */ + if (snum012[4] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S1new; + + split_item_num = split_item_positions[1]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S1new = + ((split_item_positions[0] == + split_item_positions[1]) ? snum012[3] : 0); + + /* s2bytes */ + snum012[4] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - + bytes_to_r - bytes_to_l - bytes_to_S1new; + + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && + vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) + reiserfs_warning(tb->tb_sb, "vs-8115", + "not directory or indirect item"); + } + + /* now we know S2bytes, calculate S1bytes */ + if (snum012[3] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S2new; + + split_item_num = split_item_positions[0]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S2new = + ((split_item_positions[0] == split_item_positions[1] + && snum012[4] != -1) ? snum012[4] : 0); + + /* s1bytes */ + snum012[3] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - + bytes_to_r - bytes_to_l - bytes_to_S2new; + } + + return needed_nodes; +} + + +/* + * Set parameters for balancing. + * Performs write of results of analysis of balancing into structure tb, + * where it will later be used by the functions that actually do the balancing. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * lnum number of items from S[h] that must be shifted to L[h]; + * rnum number of items from S[h] that must be shifted to R[h]; + * blk_num number of blocks that S[h] will be splitted into; + * s012 number of items that fall into splitted nodes. + * lbytes number of bytes which flow to the left neighbor from the + * item that is not not shifted entirely + * rbytes number of bytes which flow to the right neighbor from the + * item that is not not shifted entirely + * s1bytes number of bytes which flow to the first new node when + * S[0] splits (this number is contained in s012 array) + */ + +static void set_parameters(struct tree_balance *tb, int h, int lnum, + int rnum, int blk_num, short *s012, int lb, int rb) +{ + + tb->lnum[h] = lnum; + tb->rnum[h] = rnum; + tb->blknum[h] = blk_num; + + /* only for leaf level */ + if (h == 0) { + if (s012 != NULL) { + tb->s0num = *s012++; + tb->snum[0] = *s012++; + tb->snum[1] = *s012++; + tb->sbytes[0] = *s012++; + tb->sbytes[1] = *s012; + } + tb->lbytes = lb; + tb->rbytes = rb; + } + PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum); + PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum); + + PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb); + PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); +} + +/* + * check if node disappears if we shift tb->lnum[0] items to left + * neighbor and tb->rnum[0] to the right one. + */ +static int is_leaf_removable(struct tree_balance *tb) +{ + struct virtual_node *vn = tb->tb_vn; + int to_left, to_right; + int size; + int remain_items; + + /* + * number of items that will be shifted to left (right) neighbor + * entirely + */ + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0); + remain_items = vn->vn_nr_item; + + /* how many items remain in S[0] after shiftings to neighbors */ + remain_items -= (to_left + to_right); + + /* all content of node can be shifted to neighbors */ + if (remain_items < 1) { + set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, + NULL, -1, -1); + return 1; + } + + /* S[0] is not removable */ + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) + return 0; + + /* check whether we can divide 1 remaining item between neighbors */ + + /* get size of remaining item (in item units) */ + size = op_unit_num(&vn->vn_vi[to_left]); + + if (tb->lbytes + tb->rbytes >= size) { + set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, + tb->lbytes, -1); + return 1; + } + + return 0; +} + +/* check whether L, S, R can be joined in one node */ +static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) +{ + struct virtual_node *vn = tb->tb_vn; + int ih_size; + struct buffer_head *S0; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + + ih_size = 0; + if (vn->vn_nr_item) { + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) + ih_size += IH_SIZE; + + if (vn->vn_vi[vn->vn_nr_item - 1]. + vi_type & VI_TYPE_RIGHT_MERGEABLE) + ih_size += IH_SIZE; + } else { + /* there was only one item and it will be deleted */ + struct item_head *ih; + + RFALSE(B_NR_ITEMS(S0) != 1, + "vs-8125: item number must be 1: it is %d", + B_NR_ITEMS(S0)); + + ih = item_head(S0, 0); + if (tb->CFR[0] + && !comp_short_le_keys(&ih->ih_key, + internal_key(tb->CFR[0], + tb->rkey[0]))) + /* + * Directory must be in correct state here: that is + * somewhere at the left side should exist first + * directory item. But the item being deleted can + * not be that first one because its right neighbor + * is item of the same directory. (But first item + * always gets deleted in last turn). So, neighbors + * of deleted item can be merged, so we can save + * ih_size + */ + if (is_direntry_le_ih(ih)) { + ih_size = IH_SIZE; + + /* + * we might check that left neighbor exists + * and is of the same directory + */ + RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, + "vs-8130: first directory item can not be removed until directory is not empty"); + } + + } + + if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) { + set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1); + PROC_INFO_INC(tb->tb_sb, leaves_removable); + return 1; + } + return 0; + +} + +/* when we do not split item, lnum and rnum are numbers of entire items */ +#define SET_PAR_SHIFT_LEFT \ +if (h)\ +{\ + int to_l;\ + \ + to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\ + (MAX_NR_KEY(Sh) + 1 - lpar);\ + \ + set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (lset==LEFT_SHIFT_FLOW)\ + set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\ + tb->lbytes, -1);\ + else\ + set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\ + -1, -1);\ +} + +#define SET_PAR_SHIFT_RIGHT \ +if (h)\ +{\ + int to_r;\ + \ + to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\ + \ + set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (rset==RIGHT_SHIFT_FLOW)\ + set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\ + -1, tb->rbytes);\ + else\ + set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\ + -1, -1);\ +} + +static void free_buffers_in_tb(struct tree_balance *tb) +{ + int i; + + pathrelse(tb->tb_path); + + for (i = 0; i < MAX_HEIGHT; i++) { + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; + } +} + +/* + * Get new buffers for storing new nodes that are created while balancing. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + * NO_DISK_SPACE - no disk space. + */ +/* The function is NOT SCHEDULE-SAFE! */ +static int get_empty_nodes(struct tree_balance *tb, int h) +{ + struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h); + b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; + int counter, number_of_freeblk; + int amount_needed; /* number of needed empty blocks */ + int retval = CARRY_ON; + struct super_block *sb = tb->tb_sb; + + /* + * number_of_freeblk is the number of empty blocks which have been + * acquired for use by the balancing algorithm minus the number of + * empty blocks used in the previous levels of the analysis, + * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule + * occurs after empty blocks are acquired, and the balancing analysis + * is then restarted, amount_needed is the number needed by this + * level (h) of the balancing analysis. + * + * Note that for systems with many processes writing, it would be + * more layout optimal to calculate the total number needed by all + * levels and then to run reiserfs_new_blocks to get all of them at + * once. + */ + + /* + * Initiate number_of_freeblk to the amount acquired prior to the + * restart of the analysis or 0 if not restarted, then subtract the + * amount needed by all of the levels of the tree below h. + */ + /* blknum includes S[h], so we subtract 1 in this calculation */ + for (counter = 0, number_of_freeblk = tb->cur_blknum; + counter < h; counter++) + number_of_freeblk -= + (tb->blknum[counter]) ? (tb->blknum[counter] - + 1) : 0; + + /* Allocate missing empty blocks. */ + /* if Sh == 0 then we are getting a new root */ + amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1; + /* + * Amount_needed = the amount that we need more than the + * amount that we have. + */ + if (amount_needed > number_of_freeblk) + amount_needed -= number_of_freeblk; + else /* If we have enough already then there is nothing to do. */ + return CARRY_ON; + + /* + * No need to check quota - is not allocated for blocks used + * for formatted nodes + */ + if (reiserfs_new_form_blocknrs(tb, blocknrs, + amount_needed) == NO_DISK_SPACE) + return NO_DISK_SPACE; + + /* for each blocknumber we just got, get a buffer and stick it on FEB */ + for (blocknr = blocknrs, counter = 0; + counter < amount_needed; blocknr++, counter++) { + + RFALSE(!*blocknr, + "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); + + new_bh = sb_getblk(sb, *blocknr); + RFALSE(buffer_dirty(new_bh) || + buffer_journaled(new_bh) || + buffer_journal_dirty(new_bh), + "PAP-8140: journaled or dirty buffer %b for the new block", + new_bh); + + /* Put empty buffers into the array. */ + RFALSE(tb->FEB[tb->cur_blknum], + "PAP-8141: busy slot for new buffer"); + + set_buffer_journal_new(new_bh); + tb->FEB[tb->cur_blknum++] = new_bh; + } + + if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb)) + retval = REPEAT_SEARCH; + + return retval; +} + +/* + * Get free space of the left neighbor, which is stored in the parent + * node of the left neighbor. + */ +static int get_lfree(struct tree_balance *tb, int h) +{ + struct buffer_head *l, *f; + int order; + + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || + (l = tb->FL[h]) == NULL) + return 0; + + if (f == l) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1; + else { + order = B_NR_ITEMS(l); + f = l; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); +} + +/* + * Get free space of the right neighbor, + * which is stored in the parent node of the right neighbor. + */ +static int get_rfree(struct tree_balance *tb, int h) +{ + struct buffer_head *r, *f; + int order; + + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || + (r = tb->FR[h]) == NULL) + return 0; + + if (f == r) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1; + else { + order = 0; + f = r; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); + +} + +/* Check whether left neighbor is in memory. */ +static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) +{ + struct buffer_head *father, *left; + struct super_block *sb = tb->tb_sb; + b_blocknr_t left_neighbor_blocknr; + int left_neighbor_position; + + /* Father of the left neighbor does not exist. */ + if (!tb->FL[h]) + return 0; + + /* Calculate father of the node to be balanced. */ + father = PATH_H_PBUFFER(tb->tb_path, h + 1); + + RFALSE(!father || + !B_IS_IN_TREE(father) || + !B_IS_IN_TREE(tb->FL[h]) || + !buffer_uptodate(father) || + !buffer_uptodate(tb->FL[h]), + "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", + father, tb->FL[h]); + + /* + * Get position of the pointer to the left neighbor + * into the left father. + */ + left_neighbor_position = (father == tb->FL[h]) ? + tb->lkey[h] : B_NR_ITEMS(tb->FL[h]); + /* Get left neighbor block number. */ + left_neighbor_blocknr = + B_N_CHILD_NUM(tb->FL[h], left_neighbor_position); + /* Look for the left neighbor in the cache. */ + if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) { + + RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), + "vs-8170: left neighbor (%b %z) is not in the tree", + left, left); + put_bh(left); + return 1; + } + + return 0; +} + +#define LEFT_PARENTS 'l' +#define RIGHT_PARENTS 'r' + +static void decrement_key(struct cpu_key *key) +{ + /* call item specific function for this key */ + item_ops[cpu_key_k_type(key)]->decrement_key(key); +} + +/* + * Calculate far left/right parent of the left/right neighbor of the + * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor + * of the parent F[h]. + * Calculate left/right common parent of the current node and L[h]/R[h]. + * Calculate left/right delimiting key position. + * Returns: PATH_INCORRECT - path in the tree is not correct + * SCHEDULE_OCCURRED - schedule occurred while the function worked + * CARRY_ON - schedule didn't occur while the function + * worked + */ +static int get_far_parent(struct tree_balance *tb, + int h, + struct buffer_head **pfather, + struct buffer_head **pcom_father, char c_lr_par) +{ + struct buffer_head *parent; + INITIALIZE_PATH(s_path_to_neighbor_father); + struct treepath *path = tb->tb_path; + struct cpu_key s_lr_father_key; + int counter, + position = INT_MAX, + first_last_position = 0, + path_offset = PATH_H_PATH_OFFSET(path, h); + + /* + * Starting from F[h] go upwards in the tree, and look for the common + * ancestor of F[h], and its neighbor l/r, that should be obtained. + */ + + counter = path_offset; + + RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET, + "PAP-8180: invalid path length"); + + for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) { + /* + * Check whether parent of the current buffer in the path + * is really parent in the tree. + */ + if (!B_IS_IN_TREE + (parent = PATH_OFFSET_PBUFFER(path, counter - 1))) + return REPEAT_SEARCH; + + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(path, + counter - 1)) > + B_NR_ITEMS(parent)) + return REPEAT_SEARCH; + + /* + * Check whether parent at the path really points + * to the child. + */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(path, counter)->b_blocknr) + return REPEAT_SEARCH; + + /* + * Return delimiting key if position in the parent is not + * equal to first/last one. + */ + if (c_lr_par == RIGHT_PARENTS) + first_last_position = B_NR_ITEMS(parent); + if (position != first_last_position) { + *pcom_father = parent; + get_bh(*pcom_father); + /*(*pcom_father = parent)->b_count++; */ + break; + } + } + + /* if we are in the root of the tree, then there is no common father */ + if (counter == FIRST_PATH_ELEMENT_OFFSET) { + /* + * Check whether first buffer in the path is the + * root of the tree. + */ + if (PATH_OFFSET_PBUFFER + (tb->tb_path, + FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK(tb->tb_sb)) { + *pfather = *pcom_father = NULL; + return CARRY_ON; + } + return REPEAT_SEARCH; + } + + RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL, + "PAP-8185: (%b %z) level too small", + *pcom_father, *pcom_father); + + /* Check whether the common parent is locked. */ + + if (buffer_locked(*pcom_father)) { + + /* Release the write lock while the buffer is busy */ + int depth = reiserfs_write_unlock_nested(tb->tb_sb); + __wait_on_buffer(*pcom_father); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(*pcom_father); + return REPEAT_SEARCH; + } + } + + /* + * So, we got common parent of the current node and its + * left/right neighbor. Now we are getting the parent of the + * left/right neighbor. + */ + + /* Form key to get parent of the left/right neighbor. */ + le_key2cpu_key(&s_lr_father_key, + internal_key(*pcom_father, + (c_lr_par == + LEFT_PARENTS) ? (tb->lkey[h - 1] = + position - + 1) : (tb->rkey[h - + 1] = + position))); + + if (c_lr_par == LEFT_PARENTS) + decrement_key(&s_lr_father_key); + + if (search_by_key + (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, + h + 1) == IO_ERROR) + /* path is released */ + return IO_ERROR; + + if (FILESYSTEM_CHANGED_TB(tb)) { + pathrelse(&s_path_to_neighbor_father); + brelse(*pcom_father); + return REPEAT_SEARCH; + } + + *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); + + RFALSE(B_LEVEL(*pfather) != h + 1, + "PAP-8190: (%b %z) level too small", *pfather, *pfather); + RFALSE(s_path_to_neighbor_father.path_length < + FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); + + s_path_to_neighbor_father.path_length--; + pathrelse(&s_path_to_neighbor_father); + return CARRY_ON; +} + +/* + * Get parents of neighbors of node in the path(S[path_offset]) and + * common parents of S[path_offset] and L[path_offset]/R[path_offset]: + * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset], + * CFR[path_offset]. + * Calculate numbers of left and right delimiting keys position: + * lkey[path_offset], rkey[path_offset]. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked + * CARRY_ON - schedule didn't occur while the function worked + */ +static int get_parents(struct tree_balance *tb, int h) +{ + struct treepath *path = tb->tb_path; + int position, + ret, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); + struct buffer_head *curf, *curcf; + + /* Current node is the root of the tree or will be root of the tree */ + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + /* + * The root can not have parents. + * Release nodes which previously were obtained as + * parents of the current node neighbors. + */ + brelse(tb->FL[h]); + brelse(tb->CFL[h]); + brelse(tb->FR[h]); + brelse(tb->CFR[h]); + tb->FL[h] = NULL; + tb->CFL[h] = NULL; + tb->FR[h] = NULL; + tb->CFR[h] = NULL; + return CARRY_ON; + } + + /* Get parent FL[path_offset] of L[path_offset]. */ + position = PATH_OFFSET_POSITION(path, path_offset - 1); + if (position) { + /* Current node is not the first child of its parent. */ + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + get_bh(curf); + get_bh(curf); + tb->lkey[h] = position - 1; + } else { + /* + * Calculate current parent of L[path_offset], which is the + * left neighbor of the current node. Calculate current + * common parent of L[path_offset] and the current node. + * Note that CFL[path_offset] not equal FL[path_offset] and + * CFL[path_offset] not equal F[path_offset]. + * Calculate lkey[path_offset]. + */ + if ((ret = get_far_parent(tb, h + 1, &curf, + &curcf, + LEFT_PARENTS)) != CARRY_ON) + return ret; + } + + brelse(tb->FL[h]); + tb->FL[h] = curf; /* New initialization of FL[h]. */ + brelse(tb->CFL[h]); + tb->CFL[h] = curcf; /* New initialization of CFL[h]. */ + + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); + + /* Get parent FR[h] of R[h]. */ + + /* Current node is the last child of F[h]. FR[h] != F[h]. */ + if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) { + /* + * Calculate current parent of R[h], which is the right + * neighbor of F[h]. Calculate current common parent of + * R[h] and current node. Note that CFR[h] not equal + * FR[path_offset] and CFR[h] not equal F[h]. + */ + if ((ret = + get_far_parent(tb, h + 1, &curf, &curcf, + RIGHT_PARENTS)) != CARRY_ON) + return ret; + } else { + /* Current node is not the last child of its parent F[h]. */ + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + get_bh(curf); + get_bh(curf); + tb->rkey[h] = position; + } + + brelse(tb->FR[h]); + /* New initialization of FR[path_offset]. */ + tb->FR[h] = curf; + + brelse(tb->CFR[h]); + /* New initialization of CFR[path_offset]. */ + tb->CFR[h] = curcf; + + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf); + + return CARRY_ON; +} + +/* + * it is possible to remove node as result of shiftings to + * neighbors even when we insert or paste item. + */ +static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, + struct tree_balance *tb, int h) +{ + struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h); + int levbytes = tb->insert_size[h]; + struct item_head *ih; + struct reiserfs_key *r_key = NULL; + + ih = item_head(Sh, 0); + if (tb->CFR[h]) + r_key = internal_key(tb->CFR[h], tb->rkey[h]); + + if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes + /* shifting may merge items which might save space */ + - + ((!h + && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0) + - + ((!h && r_key + && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) + + ((h) ? KEY_SIZE : 0)) { + /* node can not be removed */ + if (sfree >= levbytes) { + /* new item fits into node S[h] without any shifting */ + if (!h) + tb->s0num = + B_NR_ITEMS(Sh) + + ((mode == M_INSERT) ? 1 : 0); + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + } + PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]); + return !NO_BALANCING_NEEDED; +} + +/* + * Check whether current node S[h] is balanced when increasing its size by + * Inserting or Pasting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +/* ip means Inserting or Pasting */ +static int ip_check_balance(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + /* + * Number of bytes that must be inserted into (value is negative + * if bytes are deleted) buffer which contains node being balanced. + * The mnemonic is that the attempted change in node space used + * level is levbytes bytes. + */ + int levbytes; + int ret; + + int lfree, sfree, rfree /* free space in L, S and R */ ; + + /* + * nver is short for number of vertixes, and lnver is the number if + * we shift to the left, rnver is the number if we shift to the + * right, and lrnver is the number if we shift in both directions. + * The goal is to minimize first the number of vertixes, and second, + * the number of vertixes whose contents are changed by shifting, + * and third the number of uncached vertixes whose contents are + * changed by shifting and must be read from disk. + */ + int nver, lnver, rnver, lrnver; + + /* + * used at leaf level only, S0 = S[0] is the node being balanced, + * sInum [ I = 0,1,2 ] is the number of items that will + * remain in node SI after balancing. S1 and S2 are new + * nodes that might be created. + */ + + /* + * we perform 8 calls to get_num_ver(). For each call we + * calculate five parameters. where 4th parameter is s1bytes + * and 5th - s2bytes + * + * s0num, s1num, s2num for 8 cases + * 0,1 - do not shift and do not shift but bottle + * 2 - shift only whole item to left + * 3 - shift to left and bottle as much as possible + * 4,5 - shift to right (whole items and as much as possible + * 6,7 - shift to both directions (whole items and as much as possible) + */ + short snum012[40] = { 0, }; + + /* Sh is the node whose balance is currently being checked */ + struct buffer_head *Sh; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + levbytes = tb->insert_size[h]; + + /* Calculate balance parameters for creating new root. */ + if (!Sh) { + if (!h) + reiserfs_panic(tb->tb_sb, "vs-8210", + "S[0] can not be 0"); + switch (ret = get_empty_nodes(tb, h)) { + /* no balancing for higher levels needed */ + case CARRY_ON: + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + + case NO_DISK_SPACE: + case REPEAT_SEARCH: + return ret; + default: + reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect " + "return value of get_empty_nodes"); + } + } + + /* get parents of S[h] neighbors. */ + ret = get_parents(tb, h); + if (ret != CARRY_ON) + return ret; + + sfree = B_FREE_SPACE(Sh); + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + /* and new item fits into node S[h] without any shifting */ + if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == + NO_BALANCING_NEEDED) + return NO_BALANCING_NEEDED; + + create_virtual_node(tb, h); + + /* + * determine maximal number of items we can shift to the left + * neighbor (in tb structure) and the maximal number of bytes + * that can flow to the left neighbor from the left most liquid + * item that cannot be shifted from S[0] entirely (returned value) + */ + check_left(tb, h, lfree); + + /* + * determine maximal number of items we can shift to the right + * neighbor (in tb structure) and the maximal number of bytes + * that can flow to the right neighbor from the right most liquid + * item that cannot be shifted from S[0] entirely (returned value) + */ + check_right(tb, h, rfree); + + /* + * all contents of internal node S[h] can be moved into its + * neighbors, S[h] will be removed after balancing + */ + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { + int to_r; + + /* + * Since we are working on internal nodes, and our internal + * nodes have fixed size entries, then we can balance by the + * number of items rather than the space they consume. In this + * routine we set the left node equal to the right node, + * allowing a difference of less than or equal to 1 child + * pointer. + */ + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } + + /* + * this checks balance condition, that any two neighboring nodes + * can not fit in one node + */ + RFALSE(h && + (tb->lnum[h] >= vn->vn_nr_item + 1 || + tb->rnum[h] >= vn->vn_nr_item + 1), + "vs-8220: tree is not balanced on internal level"); + RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), + "vs-8225: tree is not balanced on leaf level"); + + /* + * all contents of S[0] can be moved into its neighbors + * S[0] will be removed after balancing. + */ + if (!h && is_leaf_removable(tb)) + return CARRY_ON; + + /* + * why do we perform this check here rather than earlier?? + * Answer: we can win 1 node in some cases above. Moreover we + * checked it above, when we checked, that S[0] is not removable + * in principle + */ + + /* new item fits into node S[h] without any shifting */ + if (sfree >= levbytes) { + if (!h) + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + { + int lpar, rpar, nset, lset, rset, lrset; + /* regular overflowing of the node */ + + /* + * get_num_ver works in 2 modes (FLOW & NO_FLOW) + * lpar, rpar - number of items we can shift to left/right + * neighbor (including splitting item) + * nset, lset, rset, lrset - shows, whether flowing items + * give better packing + */ +#define FLOW 1 +#define NO_FLOW 0 /* do not any splitting */ + + /* we choose one of the following */ +#define NOTHING_SHIFT_NO_FLOW 0 +#define NOTHING_SHIFT_FLOW 5 +#define LEFT_SHIFT_NO_FLOW 10 +#define LEFT_SHIFT_FLOW 15 +#define RIGHT_SHIFT_NO_FLOW 20 +#define RIGHT_SHIFT_FLOW 25 +#define LR_SHIFT_NO_FLOW 30 +#define LR_SHIFT_FLOW 35 + + lpar = tb->lnum[h]; + rpar = tb->rnum[h]; + + /* + * calculate number of blocks S[h] must be split into when + * nothing is shifted to the neighbors, as well as number of + * items in each part of the split node (s012 numbers), + * and number of bytes (s1bytes) of the shared drop which + * flow to S1 if any + */ + nset = NOTHING_SHIFT_NO_FLOW; + nver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, h ? vn->vn_nr_item : 0, -1, + snum012, NO_FLOW); + + if (!h) { + int nver1; + + /* + * note, that in this case we try to bottle + * between S[0] and S1 (S1 - the first new node) + */ + nver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, 0, -1, + snum012 + NOTHING_SHIFT_FLOW, FLOW); + if (nver > nver1) + nset = NOTHING_SHIFT_FLOW, nver = nver1; + } + + /* + * calculate number of blocks S[h] must be split into when + * l_shift_num first items and l_shift_bytes of the right + * most liquid item to be shifted are shifted to the left + * neighbor, as well as number of items in each part of the + * splitted node (s012 numbers), and number of bytes + * (s1bytes) of the shared drop which flow to S1 if any + */ + lset = LEFT_SHIFT_NO_FLOW; + lnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, h ? vn->vn_nr_item : 0, -1, + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lnver1; + + lnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, 0, -1, + snum012 + LEFT_SHIFT_FLOW, FLOW); + if (lnver > lnver1) + lset = LEFT_SHIFT_FLOW, lnver = lnver1; + } + + /* + * calculate number of blocks S[h] must be split into when + * r_shift_num first items and r_shift_bytes of the left most + * liquid item to be shifted are shifted to the right neighbor, + * as well as number of items in each part of the splitted + * node (s012 numbers), and number of bytes (s1bytes) of the + * shared drop which flow to S1 if any + */ + rset = RIGHT_SHIFT_NO_FLOW; + rnver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int rnver1; + + rnver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + (rpar - + ((tb->rbytes != -1) ? 1 : 0)), + tb->rbytes, + snum012 + RIGHT_SHIFT_FLOW, FLOW); + + if (rnver > rnver1) + rset = RIGHT_SHIFT_FLOW, rnver = rnver1; + } + + /* + * calculate number of blocks S[h] must be split into when + * items are shifted in both directions, as well as number + * of items in each part of the splitted node (s012 numbers), + * and number of bytes (s1bytes) of the shared drop which + * flow to S1 if any + */ + lrset = LR_SHIFT_NO_FLOW; + lrnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lrnver1; + + lrnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, + (rpar - + ((tb->rbytes != -1) ? 1 : 0)), + tb->rbytes, + snum012 + LR_SHIFT_FLOW, FLOW); + if (lrnver > lrnver1) + lrset = LR_SHIFT_FLOW, lrnver = lrnver1; + } + + /* + * Our general shifting strategy is: + * 1) to minimized number of new nodes; + * 2) to minimized number of neighbors involved in shifting; + * 3) to minimized number of disk reads; + */ + + /* we can win TWO or ONE nodes by shifting in both directions */ + if (lrnver < lnver && lrnver < rnver) { + RFALSE(h && + (tb->lnum[h] != 1 || + tb->rnum[h] != 1 || + lrnver != 1 || rnver != 2 || lnver != 2 + || h != 1), "vs-8230: bad h"); + if (lrset == LR_SHIFT_FLOW) + set_parameters(tb, h, tb->lnum[h], tb->rnum[h], + lrnver, snum012 + lrset, + tb->lbytes, tb->rbytes); + else + set_parameters(tb, h, + tb->lnum[h] - + ((tb->lbytes == -1) ? 0 : 1), + tb->rnum[h] - + ((tb->rbytes == -1) ? 0 : 1), + lrnver, snum012 + lrset, -1, -1); + + return CARRY_ON; + } + + /* + * if shifting doesn't lead to better packing + * then don't shift + */ + if (nver == lrnver) { + set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1, + -1); + return CARRY_ON; + } + + /* + * now we know that for better packing shifting in only one + * direction either to the left or to the right is required + */ + + /* + * if shifting to the left is better than + * shifting to the right + */ + if (lnver < rnver) { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* + * if shifting to the right is better than + * shifting to the left + */ + if (lnver > rnver) { + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } + + /* + * now shifting in either direction gives the same number + * of nodes and we can make use of the cached neighbors + */ + if (is_left_neighbor_in_cache(tb, h)) { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* + * shift to the right independently on whether the + * right neighbor in cache or not + */ + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } +} + +/* + * Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting for INTERNAL node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + * + * Note: Items of internal nodes have fixed size, so the balance condition for + * the internal part of S+tree is as for the B-trees. + */ +static int dc_check_balance_internal(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + + /* + * Sh is the node whose balance is currently being checked, + * and Fh is its father. + */ + struct buffer_head *Sh, *Fh; + int maxsize, ret; + int lfree, rfree /* free space in L and R */ ; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + Fh = PATH_H_PPARENT(tb->tb_path, h); + + maxsize = MAX_CHILD_SIZE(Sh); + + /* + * using tb->insert_size[h], which is negative in this case, + * create_virtual_node calculates: + * new_nr_item = number of items node would have if operation is + * performed without balancing (new_nr_item); + */ + create_virtual_node(tb, h); + + if (!Fh) { /* S[h] is the root. */ + /* no balancing for higher levels needed */ + if (vn->vn_nr_item > 0) { + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + /* + * new_nr_item == 0. + * Current root will be deleted resulting in + * decrementing the tree height. + */ + set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) + return ret; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + /* determine maximal number of items we can fit into neighbors */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + /* + * Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. + */ + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { + /* + * Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. + */ + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { + /* All contents of S[h] can be moved to L[h]. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, + -1); + return CARRY_ON; + } + + /* All contents of S[h] can be moved to R[h]. */ + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + B_NR_ITEMS(Fh)) ? 0 : n + 1; + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, + -1); + return CARRY_ON; + } + } + + /* + * All contents of S[h] can be moved to the neighbors + * (L[h] & R[h]). + */ + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - + tb->rnum[h] + vn->vn_nr_item + 1) / 2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, + 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Balancing does not lead to better packing. */ + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + /* + * Current node contain insufficient number of items. + * Balancing is required. + */ + /* Check whether we can merge S[h] with left neighbor. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) + if (is_left_neighbor_in_cache(tb, h) + || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) { + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Check whether we can merge S[h] with right neighbor. */ + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1); + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } + + /* For internal nodes try to borrow item from a neighbor */ + RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); + + /* Borrow one or two items from caching neighbor */ + if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) { + int from_l; + + from_l = + (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + + 1) / 2 - (vn->vn_nr_item + 1); + set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1); + return CARRY_ON; + } + + set_parameters(tb, h, 0, + -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item + + 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1); + return CARRY_ON; +} + +/* + * Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Truncating for LEAF node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance_leaf(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + + /* + * Number of bytes that must be deleted from + * (value is negative if bytes are deleted) buffer which + * contains node being balanced. The mnemonic is that the + * attempted change in node space used level is levbytes bytes. + */ + int levbytes; + + /* the maximal item size */ + int maxsize, ret; + + /* + * S0 is the node whose balance is currently being checked, + * and F0 is its father. + */ + struct buffer_head *S0, *F0; + int lfree, rfree /* free space in L and R */ ; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + F0 = PATH_H_PPARENT(tb->tb_path, 0); + + levbytes = tb->insert_size[h]; + + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ + + if (!F0) { /* S[0] is the root now. */ + + RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0), + "vs-8240: attempt to create empty buffer tree"); + + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) + return ret; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + create_virtual_node(tb, h); + + /* if 3 leaves can be merge to one, set parameters and return */ + if (are_leaves_removable(tb, lfree, rfree)) + return CARRY_ON; + + /* + * determine maximal number of items we can shift to the left/right + * neighbor and the maximal number of bytes that can flow to the + * left/right neighbor from the left/right most liquid item that + * cannot be shifted from S[0] entirely + */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + /* check whether we can merge S with left neighbor. */ + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) + if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ + !tb->FR[h]) { + + RFALSE(!tb->FL[h], + "vs-8245: dc_check_balance_leaf: FL[h] must exist"); + + /* set parameter to merge S[0] with its left neighbor */ + set_parameters(tb, h, -1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* check whether we can merge S[0] with right neighbor. */ + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { + set_parameters(tb, h, 0, -1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* + * All contents of S[0] can be moved to the neighbors (L[0] & R[0]). + * Set parameters and return + */ + if (is_leaf_removable(tb)) + return CARRY_ON; + + /* Balancing is not required. */ + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; +} + +/* + * Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode d - delete, c - cut. + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance(struct tree_balance *tb, int h) +{ + RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)), + "vs-8250: S is not initialized"); + + if (h) + return dc_check_balance_internal(tb, h); + else + return dc_check_balance_leaf(tb, h); +} + +/* + * Check whether current node S[h] is balanced. + * Calculate parameters for balancing for current level h. + * Parameters: + * + * tb tree_balance structure: + * + * tb is a large structure that must be read about in the header + * file at the same time as this procedure if the reader is + * to successfully understand this procedure + * + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste, d - delete, c - cut. + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int check_balance(int mode, + struct tree_balance *tb, + int h, + int inum, + int pos_in_item, + struct item_head *ins_ih, const void *data) +{ + struct virtual_node *vn; + + vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); + vn->vn_free_ptr = (char *)(tb->tb_vn + 1); + vn->vn_mode = mode; + vn->vn_affected_item_num = inum; + vn->vn_pos_in_item = pos_in_item; + vn->vn_ins_ih = ins_ih; + vn->vn_data = data; + + RFALSE(mode == M_INSERT && !vn->vn_ins_ih, + "vs-8255: ins_ih can not be 0 in insert mode"); + + /* Calculate balance parameters when size of node is increasing. */ + if (tb->insert_size[h] > 0) + return ip_check_balance(tb, h); + + /* Calculate balance parameters when size of node is decreasing. */ + return dc_check_balance(tb, h); +} + +/* Check whether parent at the path is the really parent of the current node.*/ +static int get_direct_parent(struct tree_balance *tb, int h) +{ + struct buffer_head *bh; + struct treepath *path = tb->tb_path; + int position, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); + + /* We are in the root or in the new root. */ + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, + "PAP-8260: invalid offset in the path"); + + if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { + /* Root is not changed. */ + PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL; + PATH_OFFSET_POSITION(path, path_offset - 1) = 0; + return CARRY_ON; + } + /* Root is changed and we must recalculate the path. */ + return REPEAT_SEARCH; + } + + /* Parent in the path is not in the tree. */ + if (!B_IS_IN_TREE + (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1))) + return REPEAT_SEARCH; + + if ((position = + PATH_OFFSET_POSITION(path, + path_offset - 1)) > B_NR_ITEMS(bh)) + return REPEAT_SEARCH; + + /* Parent in the path is not parent of the current node in the tree. */ + if (B_N_CHILD_NUM(bh, position) != + PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr) + return REPEAT_SEARCH; + + if (buffer_locked(bh)) { + int depth = reiserfs_write_unlock_nested(tb->tb_sb); + __wait_on_buffer(bh); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } + + /* + * Parent in the path is unlocked and really parent + * of the current node. + */ + return CARRY_ON; +} + +/* + * Using lnum[h] and rnum[h] we should determine what neighbors + * of S[h] we + * need in order to balance S[h], and get them if necessary. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_neighbors(struct tree_balance *tb, int h) +{ + int child_position, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1); + unsigned long son_number; + struct super_block *sb = tb->tb_sb; + struct buffer_head *bh; + int depth; + + PROC_INFO_INC(sb, get_neighbors[h]); + + if (tb->lnum[h]) { + /* We need left neighbor to balance S[h]. */ + PROC_INFO_INC(sb, need_l_neighbor[h]); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); + + RFALSE(bh == tb->FL[h] && + !PATH_OFFSET_POSITION(tb->tb_path, path_offset), + "PAP-8270: invalid position in the parent"); + + child_position = + (bh == + tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> + FL[h]); + son_number = B_N_CHILD_NUM(tb->FL[h], child_position); + depth = reiserfs_write_unlock_nested(tb->tb_sb); + bh = sb_bread(sb, son_number); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (!bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(bh); + PROC_INFO_INC(sb, get_neighbors_restart[h]); + return REPEAT_SEARCH; + } + + RFALSE(!B_IS_IN_TREE(tb->FL[h]) || + child_position > B_NR_ITEMS(tb->FL[h]) || + B_N_CHILD_NUM(tb->FL[h], child_position) != + bh->b_blocknr, "PAP-8275: invalid parent"); + RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child"); + RFALSE(!h && + B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - + dc_size(B_N_CHILD(tb->FL[0], child_position)), + "PAP-8290: invalid child size of left neighbor"); + + brelse(tb->L[h]); + tb->L[h] = bh; + } + + /* We need right neighbor to balance S[path_offset]. */ + if (tb->rnum[h]) { + PROC_INFO_INC(sb, need_r_neighbor[h]); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); + + RFALSE(bh == tb->FR[h] && + PATH_OFFSET_POSITION(tb->tb_path, + path_offset) >= + B_NR_ITEMS(bh), + "PAP-8295: invalid position in the parent"); + + child_position = + (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; + son_number = B_N_CHILD_NUM(tb->FR[h], child_position); + depth = reiserfs_write_unlock_nested(tb->tb_sb); + bh = sb_bread(sb, son_number); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (!bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(bh); + PROC_INFO_INC(sb, get_neighbors_restart[h]); + return REPEAT_SEARCH; + } + brelse(tb->R[h]); + tb->R[h] = bh; + + RFALSE(!h + && B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - + dc_size(B_N_CHILD(tb->FR[0], child_position)), + "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", + B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh), + dc_size(B_N_CHILD(tb->FR[0], child_position))); + + } + return CARRY_ON; +} + +static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) +{ + int max_num_of_items; + int max_num_of_entries; + unsigned long blocksize = sb->s_blocksize; + +#define MIN_NAME_LEN 1 + + max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); + max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / + (DEH_SIZE + MIN_NAME_LEN); + + return sizeof(struct virtual_node) + + max(max_num_of_items * sizeof(struct virtual_item), + sizeof(struct virtual_item) + sizeof(struct direntry_uarea) + + (max_num_of_entries - 1) * sizeof(__u16)); +} + +/* + * maybe we should fail balancing we are going to perform when kmalloc + * fails several times. But now it will loop until kmalloc gets + * required memory + */ +static int get_mem_for_virtual_node(struct tree_balance *tb) +{ + int check_fs = 0; + int size; + char *buf; + + size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path)); + + /* we have to allocate more memory for virtual node */ + if (size > tb->vn_buf_size) { + if (tb->vn_buf) { + /* free memory allocated before */ + kfree(tb->vn_buf); + /* this is not needed if kfree is atomic */ + check_fs = 1; + } + + /* virtual node requires now more memory */ + tb->vn_buf_size = size; + + /* get memory for virtual item */ + buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!buf) { + /* + * getting memory with GFP_KERNEL priority may involve + * balancing now (due to indirect_to_direct conversion + * on dcache shrinking). So, release path and collected + * resources here + */ + free_buffers_in_tb(tb); + buf = kmalloc(size, GFP_NOFS); + if (!buf) { + tb->vn_buf_size = 0; + } + tb->vn_buf = buf; + schedule(); + return REPEAT_SEARCH; + } + + tb->vn_buf = buf; + } + + if (check_fs && FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + + return CARRY_ON; +} + +#ifdef CONFIG_REISERFS_CHECK +static void tb_buffer_sanity_check(struct super_block *sb, + struct buffer_head *bh, + const char *descr, int level) +{ + if (bh) { + if (atomic_read(&(bh->b_count)) <= 0) + + reiserfs_panic(sb, "jmacd-1", "negative or zero " + "reference counter for buffer %s[%d] " + "(%b)", descr, level, bh); + + if (!buffer_uptodate(bh)) + reiserfs_panic(sb, "jmacd-2", "buffer is not up " + "to date %s[%d] (%b)", + descr, level, bh); + + if (!B_IS_IN_TREE(bh)) + reiserfs_panic(sb, "jmacd-3", "buffer is not " + "in tree %s[%d] (%b)", + descr, level, bh); + + if (bh->b_bdev != sb->s_bdev) + reiserfs_panic(sb, "jmacd-4", "buffer has wrong " + "device %s[%d] (%b)", + descr, level, bh); + + if (bh->b_size != sb->s_blocksize) + reiserfs_panic(sb, "jmacd-5", "buffer has wrong " + "blocksize %s[%d] (%b)", + descr, level, bh); + + if (bh->b_blocknr > SB_BLOCK_COUNT(sb)) + reiserfs_panic(sb, "jmacd-6", "buffer block " + "number too high %s[%d] (%b)", + descr, level, bh); + } +} +#else +static void tb_buffer_sanity_check(struct super_block *sb, + struct buffer_head *bh, + const char *descr, int level) +{; +} +#endif + +static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) +{ + return reiserfs_prepare_for_journal(s, bh, 0); +} + +static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) +{ + struct buffer_head *locked; +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + int i; + + do { + + locked = NULL; + + for (i = tb->tb_path->path_length; + !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { + if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) { + /* + * if I understand correctly, we can only + * be sure the last buffer in the path is + * in the tree --clm + */ +#ifdef CONFIG_REISERFS_CHECK + if (PATH_PLAST_BUFFER(tb->tb_path) == + PATH_OFFSET_PBUFFER(tb->tb_path, i)) + tb_buffer_sanity_check(tb->tb_sb, + PATH_OFFSET_PBUFFER + (tb->tb_path, + i), "S", + tb->tb_path-> + path_length - i); +#endif + if (!clear_all_dirty_bits(tb->tb_sb, + PATH_OFFSET_PBUFFER + (tb->tb_path, + i))) { + locked = + PATH_OFFSET_PBUFFER(tb->tb_path, + i); + } + } + } + + for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i]; + i++) { + + if (tb->lnum[i]) { + + if (tb->L[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->L[i], + "L", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->L[i])) + locked = tb->L[i]; + } + + if (!locked && tb->FL[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FL[i], + "FL", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FL[i])) + locked = tb->FL[i]; + } + + if (!locked && tb->CFL[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFL[i], + "CFL", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->CFL[i])) + locked = tb->CFL[i]; + } + + } + + if (!locked && (tb->rnum[i])) { + + if (tb->R[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->R[i], + "R", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->R[i])) + locked = tb->R[i]; + } + + if (!locked && tb->FR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FR[i], + "FR", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FR[i])) + locked = tb->FR[i]; + } + + if (!locked && tb->CFR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFR[i], + "CFR", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->CFR[i])) + locked = tb->CFR[i]; + } + } + } + + /* + * as far as I can tell, this is not required. The FEB list + * seems to be full of newly allocated nodes, which will + * never be locked, dirty, or anything else. + * To be safe, I'm putting in the checks and waits in. + * For the moment, they are needed to keep the code in + * journal.c from complaining about the buffer. + * That code is inside CONFIG_REISERFS_CHECK as well. --clm + */ + for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FEB[i])) + locked = tb->FEB[i]; + } + } + + if (locked) { + int depth; +#ifdef CONFIG_REISERFS_CHECK + repeat_counter++; + if ((repeat_counter % 10000) == 0) { + reiserfs_warning(tb->tb_sb, "reiserfs-8200", + "too many iterations waiting " + "for buffer to unlock " + "(%b)", locked); + + /* Don't loop forever. Try to recover from possible error. */ + + return (FILESYSTEM_CHANGED_TB(tb)) ? + REPEAT_SEARCH : CARRY_ON; + } +#endif + depth = reiserfs_write_unlock_nested(tb->tb_sb); + __wait_on_buffer(locked); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } + + } while (locked); + + return CARRY_ON; +} + +/* + * Prepare for balancing, that is + * get all necessary parents, and neighbors; + * analyze what and where should be moved; + * get sufficient number of new nodes; + * Balancing will start only after all resources will be collected at a time. + * + * When ported to SMP kernels, only at the last moment after all needed nodes + * are collected in cache, will the resources be locked using the usual + * textbook ordered lock acquisition algorithms. Note that ensuring that + * this code neither write locks what it does not need to write lock nor locks + * out of order will be a pain in the butt that could have been avoided. + * Grumble grumble. -Hans + * + * fix is meant in the sense of render unchanging + * + * Latency might be improved by first gathering a list of what buffers + * are needed and then getting as many of them in parallel as possible? -Hans + * + * Parameters: + * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) + * tb tree_balance structure; + * inum item number in S[h]; + * pos_in_item - comment this if you can + * ins_ih item head of item being inserted + * data inserted item or data to be pasted + * Returns: 1 - schedule occurred while the function worked; + * 0 - schedule didn't occur while the function worked; + * -1 - if no_disk_space + */ + +int fix_nodes(int op_mode, struct tree_balance *tb, + struct item_head *ins_ih, const void *data) +{ + int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path); + int pos_in_item; + + /* + * we set wait_tb_buffers_run when we have to restore any dirty + * bits cleared during wait_tb_buffers_run + */ + int wait_tb_buffers_run = 0; + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + + ++REISERFS_SB(tb->tb_sb)->s_fix_nodes; + + pos_in_item = tb->tb_path->pos_in_item; + + tb->fs_gen = get_generation(tb->tb_sb); + + /* + * we prepare and log the super here so it will already be in the + * transaction when do_balance needs to change it. + * This way do_balance won't have to schedule when trying to prepare + * the super for logging + */ + reiserfs_prepare_for_journal(tb->tb_sb, + SB_BUFFER_WITH_SB(tb->tb_sb), 1); + journal_mark_dirty(tb->transaction_handle, + SB_BUFFER_WITH_SB(tb->tb_sb)); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + + /* if it possible in indirect_to_direct conversion */ + if (buffer_locked(tbS0)) { + int depth = reiserfs_write_unlock_nested(tb->tb_sb); + __wait_on_buffer(tbS0); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } +#ifdef CONFIG_REISERFS_CHECK + if (REISERFS_SB(tb->tb_sb)->cur_tb) { + print_cur_tb("fix_nodes"); + reiserfs_panic(tb->tb_sb, "PAP-8305", + "there is pending do_balance"); + } + + if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " + "not uptodate at the beginning of fix_nodes " + "or not in tree (mode %c)", + tbS0, tbS0, op_mode); + + /* Check parameters. */ + switch (op_mode) { + case M_INSERT: + if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect " + "item number %d (in S0 - %d) in case " + "of insert", item_num, + B_NR_ITEMS(tbS0)); + break; + case M_PASTE: + case M_DELETE: + case M_CUT: + if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) { + print_block(tbS0, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect " + "item number(%d); mode = %c " + "insert_size = %d", + item_num, op_mode, + tb->insert_size[0]); + } + break; + default: + reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode " + "of operation"); + } +#endif + + if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) + /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */ + return REPEAT_SEARCH; + + /* Starting from the leaf level; for all levels h of the tree. */ + for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) { + ret = get_direct_parent(tb, h); + if (ret != CARRY_ON) + goto repeat; + + ret = check_balance(op_mode, tb, h, item_num, + pos_in_item, ins_ih, data); + if (ret != CARRY_ON) { + if (ret == NO_BALANCING_NEEDED) { + /* No balancing for higher levels needed. */ + ret = get_neighbors(tb, h); + if (ret != CARRY_ON) + goto repeat; + if (h != MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + /* + * ok, analysis and resource gathering + * are complete + */ + break; + } + goto repeat; + } + + ret = get_neighbors(tb, h); + if (ret != CARRY_ON) + goto repeat; + + /* + * No disk space, or schedule occurred and analysis may be + * invalid and needs to be redone. + */ + ret = get_empty_nodes(tb, h); + if (ret != CARRY_ON) + goto repeat; + + /* + * We have a positive insert size but no nodes exist on this + * level, this means that we are creating a new root. + */ + if (!PATH_H_PBUFFER(tb->tb_path, h)) { + + RFALSE(tb->blknum[h] != 1, + "PAP-8350: creating new empty root"); + + if (h < MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) { + /* + * The tree needs to be grown, so this node S[h] + * which is the root node is split into two nodes, + * and a new node (S[h+1]) will be created to + * become the root node. + */ + if (tb->blknum[h] > 1) { + + RFALSE(h == MAX_HEIGHT - 1, + "PAP-8355: attempt to create too high of a tree"); + + tb->insert_size[h + 1] = + (DC_SIZE + + KEY_SIZE) * (tb->blknum[h] - 1) + + DC_SIZE; + } else if (h < MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + } else + tb->insert_size[h + 1] = + (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1); + } + + ret = wait_tb_buffers_until_unlocked(tb); + if (ret == CARRY_ON) { + if (FILESYSTEM_CHANGED_TB(tb)) { + wait_tb_buffers_run = 1; + ret = REPEAT_SEARCH; + goto repeat; + } else { + return CARRY_ON; + } + } else { + wait_tb_buffers_run = 1; + goto repeat; + } + +repeat: + /* + * fix_nodes was unable to perform its calculation due to + * filesystem got changed under us, lack of free disk space or i/o + * failure. If the first is the case - the search will be + * repeated. For now - free all resources acquired so far except + * for the new allocated nodes + */ + { + int i; + + /* Release path buffers. */ + if (wait_tb_buffers_run) { + pathrelse_and_restore(tb->tb_sb, tb->tb_path); + } else { + pathrelse(tb->tb_path); + } + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + if (wait_tb_buffers_run) { + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> + CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> + CFR[i]); + } + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; + } + + if (wait_tb_buffers_run) { + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) + reiserfs_restore_prepared_buffer + (tb->tb_sb, tb->FEB[i]); + } + } + return ret; + } + +} + +void unfix_nodes(struct tree_balance *tb) +{ + int i; + + /* Release path buffers. */ + pathrelse_and_restore(tb->tb_sb, tb->tb_path); + + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]); + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + } + + /* deal with list of allocated (used and unused) nodes */ + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; + /* + * de-allocated block which was not used by + * balancing and bforget about buffer for it + */ + brelse(tb->FEB[i]); + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + if (tb->used[i]) { + /* release used as new nodes including a new root */ + brelse(tb->used[i]); + } + } + + kfree(tb->vn_buf); + +} diff --git a/kernel/fs/reiserfs/hashes.c b/kernel/fs/reiserfs/hashes.c new file mode 100644 index 000000000..7a26c4fe6 --- /dev/null +++ b/kernel/fs/reiserfs/hashes.c @@ -0,0 +1,177 @@ + +/* + * Keyed 32-bit hash function using TEA in a Davis-Meyer function + * H0 = Key + * Hi = E Mi(Hi-1) + Hi-1 + * + * (see Applied Cryptography, 2nd edition, p448). + * + * Jeremy Fitzhardinge 1998 + * + * Jeremy has agreed to the contents of reiserfs/README. -Hans + * Yura's function is added (04/07/2000) + */ + +#include +#include "reiserfs.h" +#include + +#define DELTA 0x9E3779B9 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ +#define PARTROUNDS 6 /* 6 gets complete mixing */ + +/* a, b, c, d - data; h0, h1 - accumulated hash */ +#define TEACORE(rounds) \ + do { \ + u32 sum = 0; \ + int n = rounds; \ + u32 b0, b1; \ + \ + b0 = h0; \ + b1 = h1; \ + \ + do \ + { \ + sum += DELTA; \ + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ + } while(--n); \ + \ + h0 += b0; \ + h1 += b1; \ + } while(0) + +u32 keyed_hash(const signed char *msg, int len) +{ + u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 }; + + u32 h0 = k[0], h1 = k[1]; + u32 a, b, c, d; + u32 pad; + int i; + + /* assert(len >= 0 && len < 256); */ + + pad = (u32) len | ((u32) len << 8); + pad |= pad << 16; + + while (len >= 16) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + d = (u32) msg[12] | + (u32) msg[13] << 8 | + (u32) msg[14] << 16 | (u32) msg[15] << 24; + + TEACORE(PARTROUNDS); + + len -= 16; + msg += 16; + } + + if (len >= 12) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + + d = pad; + for (i = 12; i < len; i++) { + d <<= 8; + d |= msg[i]; + } + } else if (len >= 8) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + + c = d = pad; + for (i = 8; i < len; i++) { + c <<= 8; + c |= msg[i]; + } + } else if (len >= 4) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + + b = c = d = pad; + for (i = 4; i < len; i++) { + b <<= 8; + b |= msg[i]; + } + } else { + a = b = c = d = pad; + for (i = 0; i < len; i++) { + a <<= 8; + a |= msg[i]; + } + } + + TEACORE(FULLROUNDS); + +/* return 0;*/ + return h0 ^ h1; +} + +/* + * What follows in this file is copyright 2000 by Hans Reiser, and the + * licensing of what follows is governed by reiserfs/README + */ +u32 yura_hash(const signed char *msg, int len) +{ + int j, pow; + u32 a, c; + int i; + + for (pow = 1, i = 1; i < len; i++) + pow = pow * 10; + + if (len == 1) + a = msg[0] - 48; + else + a = (msg[0] - 48) * pow; + + for (i = 1; i < len; i++) { + c = msg[i] - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 40; i++) { + c = '0' - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 256; i++) { + c = i; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + a = a << 7; + return a; +} + +u32 r5_hash(const signed char *msg, int len) +{ + u32 a = 0; + while (*msg) { + a += *msg << 4; + a += *msg >> 4; + a *= 11; + msg++; + } + return a; +} diff --git a/kernel/fs/reiserfs/ibalance.c b/kernel/fs/reiserfs/ibalance.c new file mode 100644 index 000000000..b751eea32 --- /dev/null +++ b/kernel/fs/reiserfs/ibalance.c @@ -0,0 +1,1160 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include + +/* this is one and only function that is used outside (do_balance.c) */ +int balance_internal(struct tree_balance *, + int, int, struct item_head *, struct buffer_head **); + +/* + * modes of internal_shift_left, internal_shift_right and + * internal_insert_childs + */ +#define INTERNAL_SHIFT_FROM_S_TO_L 0 +#define INTERNAL_SHIFT_FROM_R_TO_S 1 +#define INTERNAL_SHIFT_FROM_L_TO_S 2 +#define INTERNAL_SHIFT_FROM_S_TO_R 3 +#define INTERNAL_INSERT_TO_S 4 +#define INTERNAL_INSERT_TO_L 5 +#define INTERNAL_INSERT_TO_R 6 + +static void internal_define_dest_src_infos(int shift_mode, + struct tree_balance *tb, + int h, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *d_key, struct buffer_head **cf) +{ + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + + /* used in internal_shift_left */ + case INTERNAL_SHIFT_FROM_S_TO_L: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + case INTERNAL_SHIFT_FROM_L_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->L[h]; + src_bi->bi_parent = tb->FL[h]; + src_bi->bi_position = get_left_neighbor_position(tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + /* dest position is analog of dest->b_item_order */ + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + + /* used in internal_shift_left */ + case INTERNAL_SHIFT_FROM_R_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->R[h]; + src_bi->bi_parent = tb->FR[h]; + src_bi->bi_position = get_right_neighbor_position(tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_SHIFT_FROM_S_TO_R: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_INSERT_TO_L: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + break; + + case INTERNAL_INSERT_TO_S: + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + break; + + case INTERNAL_INSERT_TO_R: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + break; + + default: + reiserfs_panic(tb->tb_sb, "ibalance-1", + "shift type is unknown (%d)", + shift_mode); + } +} + +/* + * Insert count node pointers into buffer cur before position to + 1. + * Insert count items into buffer cur before position to. + * Items and node pointers are specified by inserted and bh respectively. + */ +static void internal_insert_childs(struct buffer_info *cur_bi, + int to, int count, + struct item_head *inserted, + struct buffer_head **bh) +{ + struct buffer_head *cur = cur_bi->bi_bh; + struct block_head *blkh; + int nr; + struct reiserfs_key *ih; + struct disk_child new_dc[2]; + struct disk_child *dc; + int i; + + if (count <= 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + RFALSE(count > 2, "too many children (%d) are to be inserted", count); + RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE), + "no enough free space (%d), needed %d bytes", + B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE)); + + /* prepare space for count disk_child */ + dc = B_N_CHILD(cur, to + 1); + + memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE); + + /* copy to_be_insert disk children */ + for (i = 0; i < count; i++) { + put_dc_size(&new_dc[i], + MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); + put_dc_block_number(&new_dc[i], bh[i]->b_blocknr); + } + memcpy(dc, new_dc, DC_SIZE * count); + + /* prepare space for count items */ + ih = internal_key(cur, ((to == -1) ? 0 : to)); + + memmove(ih + count, ih, + (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); + + /* copy item headers (keys) */ + memcpy(ih, inserted, KEY_SIZE); + if (count > 1) + memcpy(ih + 1, inserted + 1, KEY_SIZE); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - count * (DC_SIZE + + KEY_SIZE)); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE))); + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } + +} + +/* + * Delete del_num items and node pointers from buffer cur starting from + * the first_i'th item and first_p'th pointers respectively. + */ +static void internal_delete_pointers_items(struct buffer_info *cur_bi, + int first_p, + int first_i, int del_num) +{ + struct buffer_head *cur = cur_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + RFALSE(cur == NULL, "buffer is 0"); + RFALSE(del_num < 0, + "negative number of items (%d) can not be deleted", del_num); + RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1 + || first_i < 0, + "first pointer order (%d) < 0 or " + "no so many pointers (%d), only (%d) or " + "first key order %d < 0", first_p, first_p + del_num, + B_NR_ITEMS(cur) + 1, first_i); + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + if (first_p == 0 && del_num == nr + 1) { + RFALSE(first_i != 0, + "1st deleted key must have order 0, not %d", first_i); + make_empty_node(cur_bi); + return; + } + + RFALSE(first_i + del_num > B_NR_ITEMS(cur), + "first_i = %d del_num = %d " + "no so many keys (%d) in the node (%b)(%z)", + first_i, del_num, first_i + del_num, cur, cur); + + /* deleting */ + dc = B_N_CHILD(cur, first_p); + + memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); + key = internal_key(cur, first_i); + memmove(key, key + del_num, + (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - + del_num) * DC_SIZE); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + + (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } +} + +/* delete n node pointers and items starting from given position */ +static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) +{ + int i_from; + + i_from = (from == 0) ? from : from - 1; + + /* + * delete n pointers starting from `from' position in CUR; + * delete n keys starting from 'i_from' position in CUR; + */ + internal_delete_pointers_items(cur_bi, from, i_from, n); +} + +/* + * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer + * dest + * last_first == FIRST_TO_LAST means that we copy first items + * from src to tail of dest + * last_first == LAST_TO_FIRST means that we copy last items + * from src to head of dest + */ +static void internal_copy_pointers_items(struct buffer_info *dest_bi, + struct buffer_head *src, + int last_first, int cpy_num) +{ + /* + * ATTENTION! Number of node pointers in DEST is equal to number + * of items in DEST as delimiting key have already inserted to + * buffer dest. + */ + struct buffer_head *dest = dest_bi->bi_bh; + int nr_dest, nr_src; + int dest_order, src_order; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + nr_src = B_NR_ITEMS(src); + + RFALSE(dest == NULL || src == NULL, + "src (%p) or dest (%p) buffer is 0", src, dest); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "invalid last_first parameter (%d)", last_first); + RFALSE(nr_src < cpy_num - 1, + "no so many items (%d) in src (%d)", cpy_num, nr_src); + RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num); + RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest), + "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)", + cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); + + if (cpy_num == 0) + return; + + /* coping */ + blkh = B_BLK_HEAD(dest); + nr_dest = blkh_nr_item(blkh); + + /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */ + /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */ + (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = + nr_src - cpy_num + 1) : (dest_order = + nr_dest, + src_order = + 0); + + /* prepare space for cpy_num pointers */ + dc = B_N_CHILD(dest, dest_order); + + memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); + + /* insert pointers */ + memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num); + + /* prepare space for cpy_num - 1 item headers */ + key = internal_key(dest, dest_order); + memmove(key + cpy_num - 1, key, + KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + + cpy_num)); + + /* insert headers */ + memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1)); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } + +} + +/* + * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to + * buffer dest. + * Delete cpy_num - del_par items and node pointers from buffer src. + * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. + * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. + */ +static void internal_move_pointers_items(struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int last_first, int cpy_num, + int del_par) +{ + int first_pointer; + int first_item; + + internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first, + cpy_num); + + if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ + first_pointer = 0; + first_item = 0; + /* + * delete cpy_num - del_par pointers and keys starting for + * pointers with first_pointer, for key - with first_item + */ + internal_delete_pointers_items(src_bi, first_pointer, + first_item, cpy_num - del_par); + } else { /* shift_right occurs */ + int i, j; + + i = (cpy_num - del_par == + (j = + B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num + + del_par; + + internal_delete_pointers_items(src_bi, + j + 1 - cpy_num + del_par, i, + cpy_num - del_par); + } +} + +/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */ +static void internal_insert_key(struct buffer_info *dest_bi, + /* insert key before key with n_dest number */ + int dest_position_before, + struct buffer_head *src, int src_position) +{ + struct buffer_head *dest = dest_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + + RFALSE(dest == NULL || src == NULL, + "source(%p) or dest(%p) buffer is 0", src, dest); + RFALSE(dest_position_before < 0 || src_position < 0, + "source(%d) or dest(%d) key number less than 0", + src_position, dest_position_before); + RFALSE(dest_position_before > B_NR_ITEMS(dest) || + src_position >= B_NR_ITEMS(src), + "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", + dest_position_before, B_NR_ITEMS(dest), + src_position, B_NR_ITEMS(src)); + RFALSE(B_FREE_SPACE(dest) < KEY_SIZE, + "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest)); + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + + /* prepare space for inserting key */ + key = internal_key(dest, dest_position_before); + memmove(key + 1, key, + (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); + + /* insert key */ + memcpy(key, internal_key(src, src_position), KEY_SIZE); + + /* Change dirt, free space, item number fields. */ + + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } +} + +/* + * Insert d_key'th (delimiting) key from buffer cfl to tail of dest. + * Copy pointer_amount node pointers and pointer_amount - 1 items from + * buffer src to buffer dest. + * Replace d_key'th key in buffer cfl. + * Delete pointer_amount items and node pointers from buffer src. + */ +/* this can be invoked both to shift from S to L and from R to S */ +static void internal_shift_left( + /* + * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S + */ + int mode, + struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + /*printk("pointer_amount = %d\n",pointer_amount); */ + + if (pointer_amount) { + /* + * insert delimiting key from common father of dest and + * src to node dest into position B_NR_ITEM(dest) + */ + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + + if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { + if (src_bi.bi_position /*src->b_item_order */ == 0) + replace_key(tb, cf, d_key_position, + src_bi. + bi_parent /*src->b_parent */ , 0); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + pointer_amount - 1); + } + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 0); + +} + +/* + * Insert delimiting key to L[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shifts from S[h] to L[h] */ +static void internal_shift1_left(struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + if (pointer_amount > 0) + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 1); +} + +/* + * Insert d_key'th (delimiting) key from buffer cfr to head of dest. + * Copy n node pointers and n - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfr. + * Delete n items and node pointers from buffer src. + */ +static void internal_shift_right( + /* + * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S + */ + int mode, + struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + int nr; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + nr = B_NR_ITEMS(src_bi.bi_bh); + + if (pointer_amount > 0) { + /* + * insert delimiting key from common father of dest + * and src to dest node into position 0 + */ + internal_insert_key(&dest_bi, 0, cf, d_key_position); + if (nr == pointer_amount - 1) { + RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ || + dest_bi.bi_bh != tb->R[h], + "src (%p) must be == tb->S[h](%p) when it disappears", + src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h)); + /* when S[h] disappers replace left delemiting key as well */ + if (tb->CFL[h]) + replace_key(tb, cf, d_key_position, tb->CFL[h], + tb->lkey[h]); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + nr - pointer_amount); + } + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 0); +} + +/* + * Insert delimiting key to R[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shift from S[h] to R[h] */ +static void internal_shift1_right(struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + /* insert rkey from CFR[h] to right neighbor R[h] */ + if (pointer_amount > 0) + internal_insert_key(&dest_bi, 0, cf, d_key_position); + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 1); +} + +/* + * Delete insert_num node pointers together with their left items + * and balance current node. + */ +static void balance_internal_when_delete(struct tree_balance *tb, + int h, int child_pos) +{ + int insert_num; + int n; + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); + + /* delete child-node-pointer(s) together with their left item(s) */ + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + internal_delete_childs(&bi, child_pos, -insert_num); + + RFALSE(tb->blknum[h] > 1, + "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); + + n = B_NR_ITEMS(tbSh); + + if (tb->lnum[h] == 0 && tb->rnum[h] == 0) { + if (tb->blknum[h] == 0) { + /* node S[h] (root of the tree) is empty now */ + struct buffer_head *new_root; + + RFALSE(n + || B_FREE_SPACE(tbSh) != + MAX_CHILD_SIZE(tbSh) - DC_SIZE, + "buffer must have only 0 keys (%d)", n); + RFALSE(bi.bi_parent, "root has parent (%p)", + bi.bi_parent); + + /* choose a new root */ + if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1])) + new_root = tb->R[h - 1]; + else + new_root = tb->L[h - 1]; + /* + * switch super block's tree root block + * number to the new value */ + PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr); + /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */ + PUT_SB_TREE_HEIGHT(tb->tb_sb, + SB_TREE_HEIGHT(tb->tb_sb) - 1); + + do_balance_mark_sb_dirty(tb, + REISERFS_SB(tb->tb_sb)->s_sbh, + 1); + /*&&&&&&&&&&&&&&&&&&&&&& */ + /* use check_internal if new root is an internal node */ + if (h > 1) + check_internal(new_root); + /*&&&&&&&&&&&&&&&&&&&&&& */ + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + return; + } + + /* join S[h] with L[h] */ + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { + + RFALSE(tb->rnum[h] != 0, + "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", + h, tb->rnum[h]); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + + /* join S[h] with R[h] */ + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", + h, tb->lnum[h]); + + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); + + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + + /* borrow from left neighbor L[h] */ + if (tb->lnum[h] < 0) { + RFALSE(tb->rnum[h] != 0, + "wrong tb->rnum[%d]==%d when borrow from L[h]", h, + tb->rnum[h]); + internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h, + -tb->lnum[h]); + return; + } + + /* borrow from right neighbor R[h] */ + if (tb->rnum[h] < 0) { + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when borrow from R[h]", + h, tb->lnum[h]); + internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */ + return; + } + + /* split S[h] into two parts and put them into neighbors */ + if (tb->lnum[h] > 0) { + RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", + h, tb->lnum[h], h, tb->rnum[h], n); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + reiserfs_panic(tb->tb_sb, "ibalance-2", + "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", + h, tb->lnum[h], h, tb->rnum[h]); +} + +/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/ +static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key) +{ + RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL, + "L[h](%p) and CFL[h](%p) must exist in replace_lkey", + tb->L[h], tb->CFL[h]); + + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) + return; + + memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty(tb, tb->CFL[h], 0); +} + +/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/ +static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key) +{ + RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL, + "R[h](%p) and CFR[h](%p) must exist in replace_rkey", + tb->R[h], tb->CFR[h]); + RFALSE(B_NR_ITEMS(tb->R[h]) == 0, + "R[h] can not be empty if it exists (item number=%d)", + B_NR_ITEMS(tb->R[h])); + + memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty(tb, tb->CFR[h], 0); +} + + +/* + * if inserting/pasting { + * child_pos is the position of the node-pointer in S[h] that + * pointed to S[h-1] before balancing of the h-1 level; + * this means that new pointers and items must be inserted AFTER + * child_pos + * } else { + * it is the position of the leftmost pointer that must be deleted + * (together with its corresponding key to the left of the pointer) + * as a result of the previous level's balancing. + * } + */ + +int balance_internal(struct tree_balance *tb, + int h, /* level of the tree */ + int child_pos, + /* key for insertion on higher level */ + struct item_head *insert_key, + /* node for insertion on higher level */ + struct buffer_head **insert_ptr) +{ + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + + /* + * we return this: it is 0 if there is no S[h], + * else it is tb->S[h]->b_item_order + */ + int order; + int insert_num, n, k; + struct buffer_head *S_new; + struct item_head new_insert_key; + struct buffer_head *new_insert_ptr = NULL; + struct item_head *new_insert_key_addr = insert_key; + + RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h); + + PROC_INFO_INC(tb->tb_sb, balance_at[h]); + + order = + (tbSh) ? PATH_H_POSITION(tb->tb_path, + h + 1) /*tb->S[h]->b_item_order */ : 0; + + /* + * Using insert_size[h] calculate the number insert_num of items + * that must be inserted to or deleted from S[h]. + */ + insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); + + /* Check whether insert_num is proper * */ + RFALSE(insert_num < -2 || insert_num > 2, + "incorrect number of items inserted to the internal node (%d)", + insert_num); + RFALSE(h > 1 && (insert_num > 1 || insert_num < -1), + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", + insert_num, h); + + /* Make balance in case insert_num < 0 */ + if (insert_num < 0) { + balance_internal_when_delete(tb, h, child_pos); + return order; + } + + k = 0; + if (tb->lnum[h] > 0) { + /* + * shift lnum[h] items from S[h] to the left neighbor L[h]. + * check how many of new items fall into L[h] or CFL[h] after + * shifting + */ + n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ + if (tb->lnum[h] <= child_pos) { + /* new items don't fall into L[h] or CFL[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h]); + child_pos -= tb->lnum[h]; + } else if (tb->lnum[h] > child_pos + insert_num) { + /* all new items fall into L[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h] - insert_num); + /* insert insert_num keys and node-pointers into L[h] */ + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next */ + n + child_pos + 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* + * some items fall into L[h] or CFL[h], + * but some don't fall + */ + internal_shift1_left(tb, h, child_pos + 1); + /* calculate number of new items that fall into L[h] */ + k = tb->lnum[h] - child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next, */ + n + child_pos + 1, k, + insert_key, insert_ptr); + + replace_lkey(tb, h, insert_key + k); + + /* + * replace the first node-ptr in S[h] by + * node-ptr to insert_ptr[k] + */ + dc = B_N_CHILD(tbSh, 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr[k]) - + B_FREE_SPACE(insert_ptr[k])); + put_dc_block_number(dc, insert_ptr[k]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + k++; + insert_key += k; + insert_ptr += k; + insert_num -= k; + child_pos = 0; + } + } + /* tb->lnum[h] > 0 */ + if (tb->rnum[h] > 0) { + /*shift rnum[h] items from S[h] to the right neighbor R[h] */ + /* + * check how many of new items fall into R or CFR + * after shifting + */ + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + if (n - tb->rnum[h] >= child_pos) + /* new items fall into S[h] */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + else if (n + insert_num - tb->rnum[h] < child_pos) { + /* all new items fall into R[h] */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h] - insert_num); + + /* insert insert_num keys and node-pointers into R[h] */ + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h],tb->S[h-1]->b_next */ + child_pos - n - insert_num + + tb->rnum[h] - 1, + insert_num, insert_key, + insert_ptr); + insert_num = 0; + } else { + struct disk_child *dc; + + /* one of the items falls into CFR[h] */ + internal_shift1_right(tb, h, n - child_pos + 1); + /* calculate number of new items that fall into R[h] */ + k = tb->rnum[h] - n + child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h], tb->R[h]->b_child, */ + 0, k, insert_key + 1, + insert_ptr + 1); + + replace_rkey(tb, h, insert_key + insert_num - k - 1); + + /* + * replace the first node-ptr in R[h] by + * node-ptr insert_ptr[insert_num-k-1] + */ + dc = B_N_CHILD(tb->R[h], 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr + [insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1])); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tb->R[h], 0); + + insert_num -= (k + 1); + } + } + + /** Fill new node that appears instead of S[h] **/ + RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); + RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); + + if (!tb->blknum[h]) { /* node S[h] is empty now */ + RFALSE(!tbSh, "S[h] is equal NULL"); + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return order; + } + + if (!tbSh) { + /* create new root */ + struct disk_child *dc; + struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1); + struct block_head *blkh; + + if (tb->blknum[h] != 1) + reiserfs_panic(NULL, "ibalance-3", "One new node " + "required for creating the new root"); + /* S[h] = empty buffer from the list FEB. */ + tbSh = get_FEB(tb); + blkh = B_BLK_HEAD(tbSh); + set_blkh_level(blkh, h + 1); + + /* Put the unique node-pointer to S[h] that points to S[h-1]. */ + + dc = B_N_CHILD(tbSh, 0); + put_dc_block_number(dc, tbSh_1->b_blocknr); + put_dc_size(dc, + (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1))); + + tb->insert_size[h] -= DC_SIZE; + set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(tbSh); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + /* put new root into path structure */ + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = + tbSh; + + /* Change root in structure super block. */ + PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr); + PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1); + do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); + } + + if (tb->blknum[h] == 2) { + int snum; + struct buffer_info dest_bi, src_bi; + + /* S_new = free buffer from list FEB */ + S_new = get_FEB(tb); + + set_blkh_level(B_BLK_HEAD(S_new), h + 1); + + dest_bi.tb = tb; + dest_bi.bi_bh = S_new; + dest_bi.bi_parent = NULL; + dest_bi.bi_position = 0; + src_bi.tb = tb; + src_bi.bi_bh = tbSh; + src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + snum = (insert_num + n + 1) / 2; + if (n - snum >= child_pos) { + /* new items don't fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n - snum)'th key in S[h] */ + memcpy(&new_insert_key, internal_key(tbSh, n - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, snum, 0); + } else if (n + insert_num - snum < child_pos) { + /* all new items fall into S_new */ + /* store the delimiting key for the next level */ + /* + * new_insert_key = (n + insert_item - snum)'th + * key in S[h] + */ + memcpy(&new_insert_key, + internal_key(tbSh, n + insert_num - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + snum - insert_num, 0); + + /* + * insert insert_num keys and node-pointers + * into S_new + */ + internal_insert_childs(&dest_bi, + /*S_new,tb->S[h-1]->b_next, */ + child_pos - n - insert_num + + snum - 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* some items fall into S_new, but some don't fall */ + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + n - child_pos + 1, 1); + /* calculate number of new items that fall into S_new */ + k = snum - n + child_pos - 1; + + internal_insert_childs(&dest_bi, /*S_new, */ 0, k, + insert_key + 1, insert_ptr + 1); + + /* new_insert_key = insert_key[insert_num - k - 1] */ + memcpy(&new_insert_key, insert_key + insert_num - k - 1, + KEY_SIZE); + /* + * replace first node-ptr in S_new by node-ptr + * to insert_ptr[insert_num-k-1] + */ + + dc = B_N_CHILD(S_new, 0); + put_dc_size(dc, + (MAX_CHILD_SIZE + (insert_ptr[insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1]))); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, S_new, 0); + + insert_num -= (k + 1); + } + /* new_insert_ptr = node_pointer to S_new */ + new_insert_ptr = S_new; + + RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new) + || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", + S_new); + + /* S_new is released in unfix_nodes */ + } + + n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ + + if (0 <= child_pos && child_pos <= n && insert_num > 0) { + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + internal_insert_childs(&bi, /*tbSh, */ + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */ + child_pos, insert_num, insert_key, + insert_ptr); + } + + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); + insert_ptr[0] = new_insert_ptr; + + return order; +} diff --git a/kernel/fs/reiserfs/inode.c b/kernel/fs/reiserfs/inode.c new file mode 100644 index 000000000..f6f2fbad9 --- /dev/null +++ b/kernel/fs/reiserfs/inode.c @@ -0,0 +1,3461 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +void reiserfs_evict_inode(struct inode *inode) +{ + /* + * We need blocks for transaction + (user+group) quota + * update (possibly delete) + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + int err; + + if (!inode->i_nlink && !is_bad_inode(inode)) + dquot_initialize(inode); + + truncate_inode_pages_final(&inode->i_data); + if (inode->i_nlink) + goto no_delete; + + /* + * The = 0 happens when we abort creating a new inode + * for some reason like lack of space.. + * also handles bad_inode case + */ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { + + reiserfs_delete_xattrs(inode); + + reiserfs_write_lock(inode->i_sb); + + if (journal_begin(&th, inode->i_sb, jbegin_count)) + goto out; + reiserfs_update_inode_transaction(inode); + + reiserfs_discard_prealloc(&th, inode); + + err = reiserfs_delete_object(&th, inode); + + /* + * Do quota update inside a transaction for journaled quotas. + * We must do that after delete_object so that quota updates + * go into the same transaction as stat data deletion + */ + if (!err) { + int depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_free_inode(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + } + + if (journal_end(&th)) + goto out; + + /* + * check return value from reiserfs_delete_object after + * ending the transaction + */ + if (err) + goto out; + + /* + * all items of file are deleted, so we can remove + * "save" link + * we can't do anything about an error here + */ + remove_save_link(inode, 0 /* not truncate */); +out: + reiserfs_write_unlock(inode->i_sb); + } else { + /* no object items are in the tree */ + ; + } + + /* note this must go after the journal_end to prevent deadlock */ + clear_inode(inode); + + dquot_drop(inode); + inode->i_blocks = 0; + return; + +no_delete: + clear_inode(inode); + dquot_drop(inode); +} + +static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, + __u32 objectid, loff_t offset, int type, int length) +{ + key->version = version; + + key->on_disk_key.k_dir_id = dirid; + key->on_disk_key.k_objectid = objectid; + set_cpu_key_k_offset(key, offset); + set_cpu_key_k_type(key, type); + key->key_length = length; +} + +/* + * take base of inode_key (it comes from inode always) (dirid, objectid) + * and version from an inode, set offset and type of key + */ +void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, + int type, int length) +{ + _make_cpu_key(key, get_inode_item_key_version(inode), + le32_to_cpu(INODE_PKEY(inode)->k_dir_id), + le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, + length); +} + +/* when key is 0, do not set version and short key */ +inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, + int version, + loff_t offset, int type, int length, + int entry_count /*or ih_free_space */ ) +{ + if (key) { + ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); + ih->ih_key.k_objectid = + cpu_to_le32(key->on_disk_key.k_objectid); + } + put_ih_version(ih, version); + set_le_ih_k_offset(ih, offset); + set_le_ih_k_type(ih, type); + put_ih_item_len(ih, length); + /* set_ih_free_space (ih, 0); */ + /* + * for directory items it is entry count, for directs and stat + * datas - 0xffff, for indirects - 0 + */ + put_ih_entry_count(ih, entry_count); +} + +/* + * FIXME: we might cache recently accessed indirect item + * Ugh. Not too eager for that.... + * I cut the code until such time as I see a convincing argument (benchmark). + * I don't want a bloated inode struct..., and I don't like code complexity.... + */ + +/* + * cutting the code is fine, since it really isn't in use yet and is easy + * to add back in. But, Vladimir has a really good idea here. Think + * about what happens for reading a file. For each page, + * The VFS layer calls reiserfs_readpage, who searches the tree to find + * an indirect item. This indirect item has X number of pointers, where + * X is a big number if we've done the block allocation right. But, + * we only use one or two of these pointers during each call to readpage, + * needlessly researching again later on. + * + * The size of the cache could be dynamic based on the size of the file. + * + * I'd also like to see us cache the location the stat data item, since + * we are needlessly researching for that frequently. + * + * --chris + */ + +/* + * If this page has a file tail in it, and + * it was read in by get_block_create_0, the page data is valid, + * but tail is still sitting in a direct item, and we can't write to + * it. So, look through this page, and check all the mapped buffers + * to make sure they have valid block numbers. Any that don't need + * to be unmapped, so that __block_write_begin will correctly call + * reiserfs_get_block to convert the tail into an unformatted node + */ +static inline void fix_tail_page_for_writing(struct page *page) +{ + struct buffer_head *head, *next, *bh; + + if (page && page_has_buffers(page)) { + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + if (buffer_mapped(bh) && bh->b_blocknr == 0) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } +} + +/* + * reiserfs_get_block does not need to allocate a block only if it has been + * done already or non-hole position has been found in the indirect item + */ +static inline int allocation_needed(int retval, b_blocknr_t allocated, + struct item_head *ih, + __le32 * item, int pos_in_item) +{ + if (allocated) + return 0; + if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && + get_block_num(item, pos_in_item)) + return 0; + return 1; +} + +static inline int indirect_item_found(int retval, struct item_head *ih) +{ + return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); +} + +static inline void set_block_dev_mapped(struct buffer_head *bh, + b_blocknr_t block, struct inode *inode) +{ + map_bh(bh, inode->i_sb, block); +} + +/* + * files which were created in the earlier version can not be longer, + * than 2 gb + */ +static int file_capable(struct inode *inode, sector_t block) +{ + /* it is new file. */ + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || + /* old file, but 'block' is inside of 2gb */ + block < (1 << (31 - inode->i_sb->s_blocksize_bits))) + return 1; + + return 0; +} + +static int restart_transaction(struct reiserfs_transaction_handle *th, + struct inode *inode, struct treepath *path) +{ + struct super_block *s = th->t_super; + int err; + + BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_refcount); + + pathrelse(path); + + /* we cannot restart while nested */ + if (th->t_refcount > 1) { + return 0; + } + reiserfs_update_sd(th, inode); + err = journal_end(th); + if (!err) { + err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); + if (!err) + reiserfs_update_inode_transaction(inode); + } + return err; +} + +/* + * it is called by get_block when create == 0. Returns block number + * for 'block'-th logical block of file. When it hits direct item it + * returns 0 (being called from bmap) or read direct item into piece + * of page (bh_result) + * Please improve the english/clarity in the comment above, as it is + * hard to understand. + */ +static int _get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int args) +{ + INITIALIZE_PATH(path); + struct cpu_key key; + struct buffer_head *bh; + struct item_head *ih, tmp_ih; + b_blocknr_t blocknr; + char *p = NULL; + int chars; + int ret; + int result; + int done = 0; + unsigned long offset; + + /* prepare the key to look for the 'block'-th block of file */ + make_cpu_key(&key, inode, + (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, + 3); + + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) { + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + if (result == IO_ERROR) + return -EIO; + /* + * We do not return -ENOENT if there is a hole but page is + * uptodate, because it means that there is some MMAPED data + * associated with it that is yet to be written to disk. + */ + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + return -ENOENT; + } + return 0; + } + + bh = get_last_bh(&path); + ih = tp_item_head(&path); + if (is_indirect_le_ih(ih)) { + __le32 *ind_item = (__le32 *) ih_item_body(bh, ih); + + /* + * FIXME: here we could cache indirect item or part of it in + * the inode to avoid search_by_key in case of subsequent + * access to file + */ + blocknr = get_block_num(ind_item, path.pos_in_item); + ret = 0; + if (blocknr) { + map_bh(bh_result, inode->i_sb, blocknr); + if (path.pos_in_item == + ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { + set_buffer_boundary(bh_result); + } + } else + /* + * We do not return -ENOENT if there is a hole but + * page is uptodate, because it means that there is + * some MMAPED data associated with it that is + * yet to be written to disk. + */ + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + ret = -ENOENT; + } + + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return ret; + } + /* requested data are in direct item(s) */ + if (!(args & GET_BLOCK_READ_DIRECT)) { + /* + * we are called by bmap. FIXME: we can not map block of file + * when it is stored in direct item(s) + */ + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return -ENOENT; + } + + /* + * if we've got a direct item, and the buffer or page was uptodate, + * we don't want to pull data off disk again. skip to the + * end, where we map the buffer and return + */ + if (buffer_uptodate(bh_result)) { + goto finished; + } else + /* + * grab_tail_page can trigger calls to reiserfs_get_block on + * up to date pages without any buffers. If the page is up + * to date, we don't want read old data off disk. Set the up + * to date bit on the buffer instead and jump to the end + */ + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { + set_buffer_uptodate(bh_result); + goto finished; + } + /* read file tail into part of page */ + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); + copy_item_head(&tmp_ih, ih); + + /* + * we only want to kmap if we are reading the tail into the page. + * this is not the common case, so we don't kmap until we are + * sure we need to. But, this means the item might move if + * kmap schedules + */ + if (!p) + p = (char *)kmap(bh_result->b_page); + + p += offset; + memset(p, 0, inode->i_sb->s_blocksize); + do { + if (!is_direct_le_ih(ih)) { + BUG(); + } + /* + * make sure we don't read more bytes than actually exist in + * the file. This can happen in odd cases where i_size isn't + * correct, and when direct item padding results in a few + * extra bytes at the end of the direct item + */ + if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) + break; + if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { + chars = + inode->i_size - (le_ih_k_offset(ih) - 1) - + path.pos_in_item; + done = 1; + } else { + chars = ih_item_len(ih) - path.pos_in_item; + } + memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars); + + if (done) + break; + + p += chars; + + /* + * we done, if read direct item is not the last item of + * node FIXME: we could try to check right delimiting key + * to see whether direct item continues in the right + * neighbor or rely on i_size + */ + if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) + break; + + /* update key to look for the next piece */ + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) + /* i/o error most likely */ + break; + bh = get_last_bh(&path); + ih = tp_item_head(&path); + } while (1); + + flush_dcache_page(bh_result->b_page); + kunmap(bh_result->b_page); + +finished: + pathrelse(&path); + + if (result == IO_ERROR) + return -EIO; + + /* + * this buffer has valid data, but isn't valid for io. mapping it to + * block #0 tells the rest of reiserfs it just has a tail in it + */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_uptodate(bh_result); + return 0; +} + +/* + * this is called to create file map. So, _get_block_create_0 will not + * read direct item + */ +static int reiserfs_bmap(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + if (!file_capable(inode, block)) + return -EFBIG; + + reiserfs_write_lock(inode->i_sb); + /* do not read the direct item */ + _get_block_create_0(inode, block, bh_result, 0); + reiserfs_write_unlock(inode->i_sb); + return 0; +} + +/* + * special version of get_block that is only used by grab_tail_page right + * now. It is sent to __block_write_begin, and when you try to get a + * block past the end of the file (or a block from a hole) it returns + * -ENOENT instead of a valid buffer. __block_write_begin expects to + * be able to do i/o on the buffers returned, unless an error value + * is also returned. + * + * So, this allows __block_write_begin to be used for reading a single block + * in a page. Where it does not produce a valid page for holes, or past the + * end of the file. This turns out to be exactly what we need for reading + * tails for conversion. + * + * The point of the wrapper is forcing a certain value for create, even + * though the VFS layer is calling this function with create==1. If you + * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, + * don't use this function. +*/ +static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, + int create) +{ + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); +} + +/* + * This is special helper for reiserfs_get_block in case we are executing + * direct_IO request. + */ +static int reiserfs_get_blocks_direct_io(struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + int ret; + + bh_result->b_page = NULL; + + /* + * We set the b_size before reiserfs_get_block call since it is + * referenced in convert_tail_for_hole() that may be called from + * reiserfs_get_block() + */ + bh_result->b_size = (1 << inode->i_blkbits); + + ret = reiserfs_get_block(inode, iblock, bh_result, + create | GET_BLOCK_NO_DANGLE); + if (ret) + goto out; + + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* + * make sure future calls to the direct io funcs for this + * offset in the file fail by unmapping the buffer + */ + clear_buffer_mapped(bh_result); + ret = -EINVAL; + } + + /* + * Possible unpacked tail. Flush the data before pages have + * disappeared + */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + int err; + + reiserfs_write_lock(inode->i_sb); + + err = reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + reiserfs_write_unlock(inode->i_sb); + + if (err < 0) + ret = err; + } +out: + return ret; +} + +/* + * helper function for when reiserfs_get_block is called for a hole + * but the file tail is still in a direct item + * bh_result is the buffer head for the hole + * tail_offset is the offset of the start of the tail in the file + * + * This calls prepare_write, which will start a new transaction + * you should not be in a transaction, or have any paths held when you + * call this. + */ +static int convert_tail_for_hole(struct inode *inode, + struct buffer_head *bh_result, + loff_t tail_offset) +{ + unsigned long index; + unsigned long tail_end; + unsigned long tail_start; + struct page *tail_page; + struct page *hole_page = bh_result->b_page; + int retval = 0; + + if ((tail_offset & (bh_result->b_size - 1)) != 1) + return -EIO; + + /* always try to read until the end of the block */ + tail_start = tail_offset & (PAGE_CACHE_SIZE - 1); + tail_end = (tail_start | (bh_result->b_size - 1)) + 1; + + index = tail_offset >> PAGE_CACHE_SHIFT; + /* + * hole_page can be zero in case of direct_io, we are sure + * that we cannot get here if we write with O_DIRECT into tail page + */ + if (!hole_page || index != hole_page->index) { + tail_page = grab_cache_page(inode->i_mapping, index); + retval = -ENOMEM; + if (!tail_page) { + goto out; + } + } else { + tail_page = hole_page; + } + + /* + * we don't have to make sure the conversion did not happen while + * we were locking the page because anyone that could convert + * must first take i_mutex. + * + * We must fix the tail page for writing because it might have buffers + * that are mapped, but have a block number of 0. This indicates tail + * data that has been read directly into the page, and + * __block_write_begin won't trigger a get_block in this case. + */ + fix_tail_page_for_writing(tail_page); + retval = __reiserfs_write_begin(tail_page, tail_start, + tail_end - tail_start); + if (retval) + goto unlock; + + /* tail conversion might change the data in the page */ + flush_dcache_page(tail_page); + + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); + +unlock: + if (tail_page != hole_page) { + unlock_page(tail_page); + page_cache_release(tail_page); + } +out: + return retval; +} + +static inline int _allocate_block(struct reiserfs_transaction_handle *th, + sector_t block, + struct inode *inode, + b_blocknr_t * allocated_block_nr, + struct treepath *path, int flags) +{ + BUG_ON(!th->t_trans_id); + +#ifdef REISERFS_PREALLOCATE + if (!(flags & GET_BLOCK_NO_IMUX)) { + return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, + path, block); + } +#endif + return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, + block); +} + +int reiserfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int repeat, retval = 0; + /* b_blocknr_t is (unsigned) 32 bit int*/ + b_blocknr_t allocated_block_nr = 0; + INITIALIZE_PATH(path); + int pos_in_item; + struct cpu_key key; + struct buffer_head *bh, *unbh = NULL; + struct item_head *ih, tmp_ih; + __le32 *item; + int done; + int fs_gen; + struct reiserfs_transaction_handle *th = NULL; + /* + * space reserved in transaction batch: + * . 3 balancings in direct->indirect conversion + * . 1 block involved into reiserfs_update_sd() + * XXX in practically impossible worst case direct2indirect() + * can incur (much) more than 3 balancings. + * quota update for user, group + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 1 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + int version; + int dangle = 1; + loff_t new_offset = + (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; + + reiserfs_write_lock(inode->i_sb); + version = get_inode_item_key_version(inode); + + if (!file_capable(inode, block)) { + reiserfs_write_unlock(inode->i_sb); + return -EFBIG; + } + + /* + * if !create, we aren't changing the FS, so we don't need to + * log anything, so we don't need to start a transaction + */ + if (!(create & GET_BLOCK_CREATE)) { + int ret; + /* find number of block-th logical block of the file */ + ret = _get_block_create_0(inode, block, bh_result, + create | GET_BLOCK_READ_DIRECT); + reiserfs_write_unlock(inode->i_sb); + return ret; + } + + /* + * if we're already in a transaction, make sure to close + * any new transactions we start in this func + */ + if ((create & GET_BLOCK_NO_DANGLE) || + reiserfs_transaction_running(inode->i_sb)) + dangle = 0; + + /* + * If file is of such a size, that it might have a tail and + * tails are enabled we should mark it as possibly needing + * tail packing on close + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size < i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size < i_block_size(inode))) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; + + /* set the key of the first byte in the 'block'-th block of file */ + make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); + if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { +start_trans: + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); + if (!th) { + retval = -ENOMEM; + goto failure; + } + reiserfs_update_inode_transaction(inode); + } +research: + + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + + bh = get_last_bh(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); + pos_in_item = path.pos_in_item; + + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (allocation_needed + (retval, allocated_block_nr, ih, item, pos_in_item)) { + /* we have to allocate block for the unformatted node */ + if (!th) { + pathrelse(&path); + goto start_trans; + } + + repeat = + _allocate_block(th, block, inode, &allocated_block_nr, + &path, create); + + /* + * restart the transaction to give the journal a chance to free + * some blocks. releases the path, so we have to go back to + * research if we succeed on the second try + */ + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + repeat = + _allocate_block(th, block, inode, + &allocated_block_nr, NULL, create); + + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { + goto research; + } + if (repeat == QUOTA_EXCEEDED) + retval = -EDQUOT; + else + retval = -ENOSPC; + goto failure; + } + + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + goto research; + } + } + + if (indirect_item_found(retval, ih)) { + b_blocknr_t unfm_ptr; + /* + * 'block'-th block is in the file already (there is + * corresponding cell in some indirect item). But it may be + * zero unformatted node pointer (hole) + */ + unfm_ptr = get_block_num(item, pos_in_item); + if (unfm_ptr == 0) { + /* use allocated block to plug the hole */ + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + set_buffer_new(bh_result); + if (buffer_dirty(bh_result) + && reiserfs_data_ordered(inode->i_sb)) + reiserfs_add_ordered_list(inode, bh_result); + put_block_num(item, pos_in_item, allocated_block_nr); + unfm_ptr = allocated_block_nr; + journal_mark_dirty(th, bh); + reiserfs_update_sd(th, inode); + } + set_block_dev_mapped(bh_result, unfm_ptr, inode); + pathrelse(&path); + retval = 0; + if (!dangle && th) + retval = reiserfs_end_persistent_transaction(th); + + reiserfs_write_unlock(inode->i_sb); + + /* + * the item was found, so new blocks were not added to the file + * there is no need to make sure the inode is updated with this + * transaction + */ + return retval; + } + + if (!th) { + pathrelse(&path); + goto start_trans; + } + + /* + * desired position is not found or is in the direct item. We have + * to append file with holes up to 'block'-th block converting + * direct items to indirect one if necessary + */ + done = 0; + do { + if (is_statdata_le_ih(ih)) { + __le32 unp = 0; + struct cpu_key tmp_key; + + /* indirect item has to be inserted */ + make_le_item_head(&tmp_ih, &key, version, 1, + TYPE_INDIRECT, UNFM_P_SIZE, + 0 /* free_space */ ); + + /* + * we are going to add 'block'-th block to the file. + * Use allocated block for that + */ + if (cpu_key_k_offset(&key) == 1) { + unp = cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } + tmp_key = key; /* ;) */ + set_cpu_key_k_offset(&tmp_key, 1); + PATH_LAST_POSITION(&path)++; + + retval = + reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, + inode, (char *)&unp); + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + /* + * retval == -ENOSPC, -EDQUOT or -EIO + * or -EEXIST + */ + goto failure; + } + } else if (is_direct_le_ih(ih)) { + /* direct item has to be converted */ + loff_t tail_offset; + + tail_offset = + ((le_ih_k_offset(ih) - + 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + + /* + * direct item we just found fits into block we have + * to map. Convert it into unformatted node: use + * bh_result for the conversion + */ + if (tail_offset == cpu_key_k_offset(&key)) { + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + unbh = bh_result; + done = 1; + } else { + /* + * we have to pad file tail stored in direct + * item(s) up to block size and convert it + * to unformatted node. FIXME: this should + * also get into page cache + */ + + pathrelse(&path); + /* + * ugly, but we can only end the transaction if + * we aren't nested + */ + BUG_ON(!th->t_refcount); + if (th->t_refcount == 1) { + retval = + reiserfs_end_persistent_transaction + (th); + th = NULL; + if (retval) + goto failure; + } + + retval = + convert_tail_for_hole(inode, bh_result, + tail_offset); + if (retval) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, + "clm-6004", + "convert tail failed " + "inode %lu, error %d", + inode->i_ino, + retval); + if (allocated_block_nr) { + /* + * the bitmap, the super, + * and the stat data == 3 + */ + if (!th) + th = reiserfs_persistent_transaction(inode->i_sb, 3); + if (th) + reiserfs_free_block(th, + inode, + allocated_block_nr, + 1); + } + goto failure; + } + goto research; + } + retval = + direct2indirect(th, inode, &path, unbh, + tail_offset); + if (retval) { + reiserfs_unmap_buffer(unbh); + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + /* + * it is important the set_buffer_uptodate is done + * after the direct2indirect. The buffer might + * contain valid data newer than the data on disk + * (read by readpage, changed, and then sent here by + * writepage). direct2indirect needs to know if unbh + * was already up to date, so it can decide if the + * data in unbh needs to be replaced with data from + * the disk + */ + set_buffer_uptodate(unbh); + + /* + * unbh->b_page == NULL in case of DIRECT_IO request, + * this means buffer will disappear shortly, so it + * should not be added to + */ + if (unbh->b_page) { + /* + * we've converted the tail, so we must + * flush unbh before the transaction commits + */ + reiserfs_add_tail_list(inode, unbh); + + /* + * mark it dirty now to prevent commit_write + * from adding this buffer to the inode's + * dirty buffer list + */ + /* + * AKPM: changed __mark_buffer_dirty to + * mark_buffer_dirty(). It's still atomic, + * but it sets the page dirty too, which makes + * it eligible for writeback at any time by the + * VM (which was also the case with + * __mark_buffer_dirty()) + */ + mark_buffer_dirty(unbh); + } + } else { + /* + * append indirect item with holes if needed, when + * appending pointer to 'block'-th block use block, + * which is already allocated + */ + struct cpu_key tmp_key; + /* + * We use this in case we need to allocate + * only one block which is a fastpath + */ + unp_t unf_single = 0; + unp_t *un; + __u64 max_to_insert = + MAX_ITEM_LEN(inode->i_sb->s_blocksize) / + UNFM_P_SIZE; + __u64 blocks_needed; + + RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, + "vs-804: invalid position for append"); + /* + * indirect item has to be appended, + * set up key of that position + * (key type is unimportant) + */ + make_cpu_key(&tmp_key, inode, + le_key_k_offset(version, + &ih->ih_key) + + op_bytes_number(ih, + inode->i_sb->s_blocksize), + TYPE_INDIRECT, 3); + + RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), + "green-805: invalid offset"); + blocks_needed = + 1 + + ((cpu_key_k_offset(&key) - + cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> + s_blocksize_bits); + + if (blocks_needed == 1) { + un = &unf_single; + } else { + un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS); + if (!un) { + un = &unf_single; + blocks_needed = 1; + max_to_insert = 0; + } + } + if (blocks_needed <= max_to_insert) { + /* + * we are going to add target block to + * the file. Use allocated block for that + */ + un[blocks_needed - 1] = + cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } else { + /* paste hole to the indirect item */ + /* + * If kmalloc failed, max_to_insert becomes + * zero and it means we only have space for + * one block + */ + blocks_needed = + max_to_insert ? max_to_insert : 1; + } + retval = + reiserfs_paste_into_item(th, &path, &tmp_key, inode, + (char *)un, + UNFM_P_SIZE * + blocks_needed); + + if (blocks_needed != 1) + kfree(un); + + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + if (!done) { + /* + * We need to mark new file size in case + * this function will be interrupted/aborted + * later on. And we may do this only for + * holes. + */ + inode->i_size += + inode->i_sb->s_blocksize * blocks_needed; + } + } + + if (done == 1) + break; + + /* + * this loop could log more blocks than we had originally + * asked for. So, we have to allow the transaction to end + * if it is too big or too full. Update the inode so things + * are consistent if we crash before the function returns + * release the path so that anybody waiting on the path before + * ending their transaction will be able to continue. + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + } + /* + * inserting indirect pointers for a hole can take a + * long time. reschedule if needed and also release the write + * lock for others. + */ + reiserfs_cond_resched(inode->i_sb); + + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, "vs-825", + "%K should not be found", &key); + retval = -EEXIST; + if (allocated_block_nr) + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + pathrelse(&path); + goto failure; + } + bh = get_last_bh(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); + pos_in_item = path.pos_in_item; + } while (1); + + retval = 0; + +failure: + if (th && (!dangle || (retval && !th->t_trans_id))) { + int err; + if (th->t_trans_id) + reiserfs_update_sd(th, inode); + err = reiserfs_end_persistent_transaction(th); + if (err) + retval = err; + } + + reiserfs_write_unlock(inode->i_sb); + reiserfs_check_path(&path); + return retval; +} + +static int +reiserfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); +} + +/* + * Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in + * stat item + */ +static int real_space_diff(struct inode *inode, int sd_size) +{ + int bytes; + loff_t blocksize = inode->i_sb->s_blocksize; + + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) + return sd_size; + + /* + * End of file is also in full block with indirect reference, so round + * up to the next block. + * + * there is just no way to know if the tail is actually packed + * on the file, so we have to assume it isn't. When we pack the + * tail, we add 4 bytes to pretend there really is an unformatted + * node pointer + */ + bytes = + ((inode->i_size + + (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + + sd_size; + return bytes; +} + +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, + int sd_size) +{ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + return inode->i_size + + (loff_t) (real_space_diff(inode, sd_size)); + } + return ((loff_t) real_space_diff(inode, sd_size)) + + (((loff_t) blocks) << 9); +} + +/* Compute number of blocks used by file in ReiserFS counting */ +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) +{ + loff_t bytes = inode_get_bytes(inode); + loff_t real_space = real_space_diff(inode, sd_size); + + /* keeps fsck and non-quota versions of reiserfs happy */ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + bytes += (loff_t) 511; + } + + /* + * files from before the quota patch might i_blocks such that + * bytes < real_space. Deal with that here to prevent it from + * going negative. + */ + if (bytes < real_space) + return 0; + return (bytes - real_space) >> 9; +} + +/* + * BAD: new directories have stat data of new type and all other items + * of old type. Version stored in the inode says about body items, so + * in update_stat_data we can not rely on inode, but have to check + * item version directly + */ + +/* called by read_locked_inode */ +static void init_inode(struct inode *inode, struct treepath *path) +{ + struct buffer_head *bh; + struct item_head *ih; + __u32 rdev; + + bh = PATH_PLAST_BUFFER(path); + ih = tp_item_head(path); + + copy_key(INODE_PKEY(inode), &ih->ih_key); + + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + reiserfs_init_xattr_rwsem(inode); + + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = + (struct stat_data_v1 *)ih_item_body(bh, ih); + unsigned long blocks; + + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + set_inode_sd_version(inode, STAT_DATA_V1); + inode->i_mode = sd_v1_mode(sd); + set_nlink(inode, sd_v1_nlink(sd)); + i_uid_write(inode, sd_v1_uid(sd)); + i_gid_write(inode, sd_v1_gid(sd)); + inode->i_size = sd_v1_size(sd); + inode->i_atime.tv_sec = sd_v1_atime(sd); + inode->i_mtime.tv_sec = sd_v1_mtime(sd); + inode->i_ctime.tv_sec = sd_v1_ctime(sd); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + + inode->i_blocks = sd_v1_blocks(sd); + inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + blocks = (inode->i_size + 511) >> 9; + blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); + + /* + * there was a bug in <=3.5.23 when i_blocks could take + * negative values. Starting from 3.5.17 this value could + * even be stored in stat data. For such files we set + * i_blocks based on file size. Just 2 notes: this can be + * wrong for sparse files. On-disk value will be only + * updated if file's inode will ever change + */ + if (inode->i_blocks > blocks) { + inode->i_blocks = blocks; + } + + rdev = sd_v1_rdev(sd); + REISERFS_I(inode)->i_first_direct_byte = + sd_v1_first_direct_byte(sd); + + /* + * an early bug in the quota code can give us an odd + * number for the block count. This is incorrect, fix it here. + */ + if (inode->i_blocks & 1) { + inode->i_blocks++; + } + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V1_SIZE)); + /* + * nopack is initially zero for v1 objects. For v2 objects, + * nopack is initialised from sd_attrs + */ + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } else { + /* + * new stat data found, but object may have old items + * (directories and symlinks) + */ + struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih); + + inode->i_mode = sd_v2_mode(sd); + set_nlink(inode, sd_v2_nlink(sd)); + i_uid_write(inode, sd_v2_uid(sd)); + inode->i_size = sd_v2_size(sd); + i_gid_write(inode, sd_v2_gid(sd)); + inode->i_mtime.tv_sec = sd_v2_mtime(sd); + inode->i_atime.tv_sec = sd_v2_atime(sd); + inode->i_ctime.tv_sec = sd_v2_ctime(sd); + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_blocks = sd_v2_blocks(sd); + rdev = sd_v2_rdev(sd); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + inode->i_generation = + le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + else + inode->i_generation = sd_v2_generation(sd); + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + REISERFS_I(inode)->i_first_direct_byte = 0; + set_inode_sd_version(inode, STAT_DATA_V2); + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V2_SIZE)); + /* + * read persistent inode attributes from sd and initialise + * generic inode flags from them + */ + REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); + sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); + } + + pathrelse(path); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else { + inode->i_blocks = 0; + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + } +} + +/* update new stat data with inode fields */ +static void inode2sd(void *sd, struct inode *inode, loff_t size) +{ + struct stat_data *sd_v2 = (struct stat_data *)sd; + __u16 flags; + + set_sd_v2_mode(sd_v2, inode->i_mode); + set_sd_v2_nlink(sd_v2, inode->i_nlink); + set_sd_v2_uid(sd_v2, i_uid_read(inode)); + set_sd_v2_size(sd_v2, size); + set_sd_v2_gid(sd_v2, i_gid_read(inode)); + set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); + set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); + set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); + else + set_sd_v2_generation(sd_v2, inode->i_generation); + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, &flags); + set_sd_v2_attrs(sd_v2, flags); +} + +/* used to copy inode's fields to old stat data */ +static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) +{ + struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; + + set_sd_v1_mode(sd_v1, inode->i_mode); + set_sd_v1_uid(sd_v1, i_uid_read(inode)); + set_sd_v1_gid(sd_v1, i_gid_read(inode)); + set_sd_v1_nlink(sd_v1, inode->i_nlink); + set_sd_v1_size(sd_v1, size); + set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); + set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); + set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); + else + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); + + /* Sigh. i_first_direct_byte is back */ + set_sd_v1_first_direct_byte(sd_v1, + REISERFS_I(inode)->i_first_direct_byte); +} + +/* + * NOTE, you must prepare the buffer head before sending it here, + * and then log it after the call + */ +static void update_stat_data(struct treepath *path, struct inode *inode, + loff_t size) +{ + struct buffer_head *bh; + struct item_head *ih; + + bh = PATH_PLAST_BUFFER(path); + ih = tp_item_head(path); + + if (!is_statdata_le_ih(ih)) + reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", + INODE_PKEY(inode), ih); + + /* path points to old stat data */ + if (stat_data_v1(ih)) { + inode2sd_v1(ih_item_body(bh, ih), inode, size); + } else { + inode2sd(ih_item_body(bh, ih), inode, size); + } + + return; +} + +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t size) +{ + struct cpu_key key; + INITIALIZE_PATH(path); + struct buffer_head *bh; + int fs_gen; + struct item_head *ih, tmp_ih; + int retval; + + BUG_ON(!th->t_trans_id); + + /* key type is unimportant */ + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); + + for (;;) { + int pos; + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-13050", + "i/o failure occurred trying to " + "update %K stat data", &key); + return; + } + if (retval == ITEM_NOT_FOUND) { + pos = PATH_LAST_POSITION(&path); + pathrelse(&path); + if (inode->i_nlink == 0) { + /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ + return; + } + reiserfs_warning(inode->i_sb, "vs-13060", + "stat data of object %k (nlink == %d) " + "not found (pos %d)", + INODE_PKEY(inode), inode->i_nlink, + pos); + reiserfs_check_path(&path); + return; + } + + /* + * sigh, prepare_for_journal might schedule. When it + * schedules the FS might change. We have to detect that, + * and loop back to the search if the stat data item has moved + */ + bh = get_last_bh(&path); + ih = tp_item_head(&path); + copy_item_head(&tmp_ih, ih); + fs_gen = get_generation(inode->i_sb); + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + /* Stat_data item has been moved after scheduling. */ + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + continue; + } + break; + } + update_stat_data(&path, inode, size); + journal_mark_dirty(th, bh); + pathrelse(&path); + return; +} + +/* + * reiserfs_read_locked_inode is called to read the inode off disk, and it + * does a make_bad_inode when things go wrong. But, we need to make sure + * and clear the key in the private portion of the inode, otherwise a + * corresponding iput might try to delete whatever object the inode last + * represented. + */ +static void reiserfs_make_bad_inode(struct inode *inode) +{ + memset(INODE_PKEY(inode), 0, KEY_SIZE); + make_bad_inode(inode); +} + +/* + * initially this function was derived from minix or ext2's analog and + * evolved as the prototype did + */ +int reiserfs_init_locked_inode(struct inode *inode, void *p) +{ + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; + inode->i_ino = args->objectid; + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); + return 0; +} + +/* + * looks for stat data in the tree, and fills up the fields of in-core + * inode stat data fields + */ +void reiserfs_read_locked_inode(struct inode *inode, + struct reiserfs_iget_args *args) +{ + INITIALIZE_PATH(path_to_sd); + struct cpu_key key; + unsigned long dirino; + int retval; + + dirino = args->dirid; + + /* + * set version 1, version 2 could be used too, because stat data + * key is the same in both versions + */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = dirino; + key.on_disk_key.k_objectid = inode->i_ino; + key.on_disk_key.k_offset = 0; + key.on_disk_key.k_type = 0; + + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path_to_sd); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-13070", + "i/o failure occurred trying to find " + "stat data of %K", &key); + reiserfs_make_bad_inode(inode); + return; + } + + /* a stale NFS handle can trigger this without it being an error */ + if (retval != ITEM_FOUND) { + pathrelse(&path_to_sd); + reiserfs_make_bad_inode(inode); + clear_nlink(inode); + return; + } + + init_inode(inode, &path_to_sd); + + /* + * It is possible that knfsd is trying to access inode of a file + * that is being removed from the disk by some other thread. As we + * update sd on unlink all that is required is to check for nlink + * here. This bug was first found by Sizif when debugging + * SquidNG/Butterfly, forgotten, and found again after Philippe + * Gramoulle reproduced it. + + * More logical fix would require changes in fs/inode.c:iput() to + * remove inode from hash-table _after_ fs cleaned disk stuff up and + * in iget() to return NULL if I_FREEING inode is found in + * hash-table. + */ + + /* + * Currently there is one place where it's ok to meet inode with + * nlink==0: processing of open-unlinked and half-truncated files + * during mount (fs/reiserfs/super.c:finish_unfinished()). + */ + if ((inode->i_nlink == 0) && + !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { + reiserfs_warning(inode->i_sb, "vs-13075", + "dead inode read from disk %K. " + "This is likely to be race with knfsd. Ignore", + &key); + reiserfs_make_bad_inode(inode); + } + + /* init inode should be relsing */ + reiserfs_check_path(&path_to_sd); + + /* + * Stat data v1 doesn't support ACLs. + */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + cache_no_acl(inode); +} + +/* + * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). + * + * @inode: inode from hash table to check + * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. + * + * This function is called by iget5_locked() to distinguish reiserfs inodes + * having the same inode numbers. Such inodes can only exist due to some + * error condition. One of them should be bad. Inodes with identical + * inode numbers (objectids) are distinguished by parent directory ids. + * + */ +int reiserfs_find_actor(struct inode *inode, void *opaque) +{ + struct reiserfs_iget_args *args; + + args = opaque; + /* args is already in CPU order */ + return (inode->i_ino == args->objectid) && + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); +} + +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) +{ + struct inode *inode; + struct reiserfs_iget_args args; + int depth; + + args.objectid = key->on_disk_key.k_objectid; + args.dirid = key->on_disk_key.k_dir_id; + depth = reiserfs_write_unlock_nested(s); + inode = iget5_locked(s, key->on_disk_key.k_objectid, + reiserfs_find_actor, reiserfs_init_locked_inode, + (void *)(&args)); + reiserfs_write_lock_nested(s, depth); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + reiserfs_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + + if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { + /* either due to i/o error or a stale NFS handle */ + iput(inode); + inode = NULL; + } + return inode; +} + +static struct dentry *reiserfs_get_dentry(struct super_block *sb, + u32 objectid, u32 dir_id, u32 generation) + +{ + struct cpu_key key; + struct inode *inode; + + key.on_disk_key.k_objectid = objectid; + key.on_disk_key.k_dir_id = dir_id; + reiserfs_write_lock(sb); + inode = reiserfs_iget(sb, &key); + if (inode && !IS_ERR(inode) && generation != 0 && + generation != inode->i_generation) { + iput(inode); + inode = NULL; + } + reiserfs_write_unlock(sb); + + return d_obtain_alias(inode); +} + +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + /* + * fhtype happens to reflect the number of u32s encoded. + * due to a bug in earlier code, fhtype might indicate there + * are more u32s then actually fitted. + * so if fhtype seems to be more than len, reduce fhtype. + * Valid types are: + * 2 - objectid + dir_id - legacy support + * 3 - objectid + dir_id + generation + * 4 - objectid + dir_id + objectid and dirid of parent - legacy + * 5 - objectid + dir_id + generation + objectid and dirid of parent + * 6 - as above plus generation of directory + * 6 does not fit in NFSv2 handles + */ + if (fh_type > fh_len) { + if (fh_type != 6 || fh_len != 5) + reiserfs_warning(sb, "reiserfs-13077", + "nfsd/reiserfs, fhtype=%d, len=%d - odd", + fh_type, fh_len); + fh_type = fh_len; + } + if (fh_len < 2) + return NULL; + + return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], + (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); +} + +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + if (fh_type > fh_len) + fh_type = fh_len; + if (fh_type < 4) + return NULL; + + return reiserfs_get_dentry(sb, + (fh_type >= 5) ? fid->raw[3] : fid->raw[2], + (fh_type >= 5) ? fid->raw[4] : fid->raw[3], + (fh_type == 6) ? fid->raw[5] : 0); +} + +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, + struct inode *parent) +{ + int maxlen = *lenp; + + if (parent && (maxlen < 5)) { + *lenp = 5; + return FILEID_INVALID; + } else if (maxlen < 3) { + *lenp = 3; + return FILEID_INVALID; + } + + data[0] = inode->i_ino; + data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + data[2] = inode->i_generation; + *lenp = 3; + if (parent) { + data[3] = parent->i_ino; + data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id); + *lenp = 5; + if (maxlen >= 6) { + data[5] = parent->i_generation; + *lenp = 6; + } + } + return *lenp; +} + +/* + * looks for stat data, then copies fields to it, marks the buffer + * containing stat data as dirty + */ +/* + * reiserfs inodes are never really dirty, since the dirty inode call + * always logs them. This call allows the VFS inode marking routines + * to properly mark inodes for datasync and such, but only actually + * does something when called for a synchronous update. + */ +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct reiserfs_transaction_handle th; + int jbegin_count = 1; + + if (inode->i_sb->s_flags & MS_RDONLY) + return -EROFS; + /* + * memory pressure can sometimes initiate write_inode calls with + * sync == 1, + * these cases are just when the system needs ram, not when the + * inode needs to reach disk for safety, and they can safely be + * ignored because the altered inode has already been logged. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { + reiserfs_write_lock(inode->i_sb); + if (!journal_begin(&th, inode->i_sb, jbegin_count)) { + reiserfs_update_sd(&th, inode); + journal_end_sync(&th); + } + reiserfs_write_unlock(inode->i_sb); + } + return 0; +} + +/* + * stat data of new object is inserted already, this inserts the item + * containing "." and ".." entries + */ +static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct item_head *ih, struct treepath *path, + struct inode *dir) +{ + struct super_block *sb = th->t_super; + char empty_dir[EMPTY_DIR_SIZE]; + char *body = empty_dir; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, + TYPE_DIRENTRY, 3 /*key length */ ); + + /* + * compose item head for new item. Directories consist of items of + * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it + * is done by reiserfs_new_inode + */ + if (old_format_only(sb)) { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); + + make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } else { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); + + make_empty_dir_item(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_error(sb, "vs-13080", + "i/o failure occurred creating new directory"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13070", + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is empty directory item */ + return reiserfs_insert_item(th, path, &key, ih, inode, body); +} + +/* + * stat data of object has been inserted, this inserts the item + * containing the body of symlink + */ +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct item_head *ih, + struct treepath *path, const char *symname, + int item_len) +{ + struct super_block *sb = th->t_super; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, + le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), + 1, TYPE_DIRECT, 3 /*key length */ ); + + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, + 0 /*free_space */ ); + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_error(sb, "vs-13080", + "i/o failure occurred creating new symlink"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13080", + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is body of symlink */ + return reiserfs_insert_item(th, path, &key, ih, inode, symname); +} + +/* + * inserts the stat data into the tree, and then calls + * reiserfs_new_directory (to insert ".", ".." item if new object is + * directory) or reiserfs_new_symlink (to insert symlink body if new + * object is symlink) or nothing (if new object is regular file) + + * NOTE! uid and gid must already be set in the inode. If we return + * non-zero due to an error, we have to drop the quota previously allocated + * for the fresh inode. This can only be done outside a transaction, so + * if we return non-zero, we also end the transaction. + * + * @th: active transaction handle + * @dir: parent directory for new inode + * @mode: mode of new inode + * @symname: symlink contents if inode is symlink + * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for + * symlinks + * @inode: inode to be filled + * @security: optional security context to associate with this inode + */ +int reiserfs_new_inode(struct reiserfs_transaction_handle *th, + struct inode *dir, umode_t mode, const char *symname, + /* 0 for regular, EMTRY_DIR_SIZE for dirs, + strlen (symname) for symlinks) */ + loff_t i_size, struct dentry *dentry, + struct inode *inode, + struct reiserfs_security_handle *security) +{ + struct super_block *sb = dir->i_sb; + struct reiserfs_iget_args args; + INITIALIZE_PATH(path_to_key); + struct cpu_key key; + struct item_head ih; + struct stat_data sd; + int retval; + int err; + int depth; + + BUG_ON(!th->t_trans_id); + + depth = reiserfs_write_unlock_nested(sb); + err = dquot_alloc_inode(inode); + reiserfs_write_lock_nested(sb, depth); + if (err) + goto out_end_trans; + if (!dir->i_nlink) { + err = -EPERM; + goto out_bad_inode; + } + + /* item head of new item */ + ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); + ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); + if (!ih.ih_key.k_objectid) { + err = -ENOMEM; + goto out_bad_inode; + } + args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); + if (old_format_only(sb)) + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); + else + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); + memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE); + args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); + + depth = reiserfs_write_unlock_nested(inode->i_sb); + err = insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args); + reiserfs_write_lock_nested(inode->i_sb, depth); + if (err) { + err = -EINVAL; + goto out_bad_inode; + } + + if (old_format_only(sb)) + /* + * not a perfect generation count, as object ids can be reused, + * but this is as good as reiserfs can do right now. + * note that the private part of inode isn't filled in yet, + * we have to use the directory. + */ + inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); + else +#if defined( USE_INODE_GENERATION_COUNTER ) + inode->i_generation = + le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); +#else + inode->i_generation = ++event; +#endif + + /* fill stat data */ + set_nlink(inode, (S_ISDIR(mode) ? 2 : 1)); + + /* uid and gid must already be set by the caller for quota init */ + + /* symlink cannot be immutable or append only, right? */ + if (S_ISLNK(inode->i_mode)) + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); + + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_size = i_size; + inode->i_blocks = 0; + inode->i_bytes = 0; + REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 : + U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; + + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_attrs = + REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; + sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); + reiserfs_init_xattr_rwsem(inode); + + /* key to search for correct place for new stat data */ + _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), + le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, + TYPE_STAT_DATA, 3 /*key length */ ); + + /* find proper place for inserting of stat data */ + retval = search_item(sb, &key, &path_to_key); + if (retval == IO_ERROR) { + err = -EIO; + goto out_bad_inode; + } + if (retval == ITEM_FOUND) { + pathrelse(&path_to_key); + err = -EEXIST; + goto out_bad_inode; + } + if (old_format_only(sb)) { + /* i_uid or i_gid is too big to be stored in stat data v3.5 */ + if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) { + pathrelse(&path_to_key); + err = -EINVAL; + goto out_bad_inode; + } + inode2sd_v1(&sd, inode, inode->i_size); + } else { + inode2sd(&sd, inode, inode->i_size); + } + /* + * store in in-core inode the key of stat data and version all + * object items will have (directory items will have old offset + * format, other new objects will consist of new items) + */ + if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + if (old_format_only(sb)) + set_inode_sd_version(inode, STAT_DATA_V1); + else + set_inode_sd_version(inode, STAT_DATA_V2); + + /* insert the stat data into the tree */ +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (REISERFS_I(dir)->new_packing_locality) + th->displace_new_blocks = 1; +#endif + retval = + reiserfs_insert_item(th, &path_to_key, &key, &ih, inode, + (char *)(&sd)); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + goto out_bad_inode; + } +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (!th->displace_new_blocks) + REISERFS_I(dir)->new_packing_locality = 0; +#endif + if (S_ISDIR(mode)) { + /* insert item with "." and ".." */ + retval = + reiserfs_new_directory(th, inode, &ih, &path_to_key, dir); + } + + if (S_ISLNK(mode)) { + /* insert body of symlink */ + if (!old_format_only(sb)) + i_size = ROUND_UP(i_size); + retval = + reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname, + i_size); + } + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th); + goto out_inserted_sd; + } + + if (reiserfs_posixacl(inode->i_sb)) { + reiserfs_write_unlock(inode->i_sb); + retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); + reiserfs_write_lock(inode->i_sb); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th); + goto out_inserted_sd; + } + } else if (inode->i_sb->s_flags & MS_POSIXACL) { + reiserfs_warning(inode->i_sb, "jdm-13090", + "ACLs aren't enabled in the fs, " + "but vfs thinks they are!"); + } else if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; + + if (security->name) { + reiserfs_write_unlock(inode->i_sb); + retval = reiserfs_security_write(th, inode, security); + reiserfs_write_lock(inode->i_sb); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + retval = journal_end(th); + if (retval) + err = retval; + goto out_inserted_sd; + } + } + + reiserfs_update_sd(th, inode); + reiserfs_check_path(&path_to_key); + + return 0; + +out_bad_inode: + /* Invalidate the object, nothing was inserted yet */ + INODE_PKEY(inode)->k_objectid = 0; + + /* Quota change must be inside a transaction for journaling */ + depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_free_inode(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + +out_end_trans: + journal_end(th); + /* + * Drop can be outside and it needs more credits so it's better + * to have it outside + */ + depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_drop(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + inode->i_flags |= S_NOQUOTA; + make_bad_inode(inode); + +out_inserted_sd: + clear_nlink(inode); + th->t_trans_id = 0; /* so the caller can't use this handle later */ + unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ + iput(inode); + return err; +} + +/* + * finds the tail page in the page cache, + * reads the last block in. + * + * On success, page_result is set to a locked, pinned page, and bh_result + * is set to an up to date buffer for the last block in the file. returns 0. + * + * tail conversion is not done, so bh_result might not be valid for writing + * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before + * trying to write the block. + * + * on failure, nonzero is returned, page_result and bh_result are untouched. + */ +static int grab_tail_page(struct inode *inode, + struct page **page_result, + struct buffer_head **bh_result) +{ + + /* + * we want the page with the last byte in the file, + * not the page that will hold the next byte for appending + */ + unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long pos = 0; + unsigned long start = 0; + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1); + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int error; + + /* + * we know that we are only called with inode->i_size > 0. + * we also know that a file tail can never be as big as a block + * If i_size % blocksize == 0, our file is currently block aligned + * and it won't need converting or zeroing after a truncate. + */ + if ((offset & (blocksize - 1)) == 0) { + return -ENOENT; + } + page = grab_cache_page(inode->i_mapping, index); + error = -ENOMEM; + if (!page) { + goto out; + } + /* start within the page of the last block in the file */ + start = (offset / blocksize) * blocksize; + + error = __block_write_begin(page, start, offset - start, + reiserfs_get_block_create_0); + if (error) + goto unlock; + + head = page_buffers(page); + bh = head; + do { + if (pos >= start) { + break; + } + bh = bh->b_this_page; + pos += blocksize; + } while (bh != head); + + if (!buffer_uptodate(bh)) { + /* + * note, this should never happen, prepare_write should be + * taking care of this for us. If the buffer isn't up to + * date, I've screwed up the code to find the buffer, or the + * code to call prepare_write + */ + reiserfs_error(inode->i_sb, "clm-6000", + "error reading block %lu", bh->b_blocknr); + error = -EIO; + goto unlock; + } + *bh_result = bh; + *page_result = page; + +out: + return error; + +unlock: + unlock_page(page); + page_cache_release(page); + return error; +} + +/* + * vfs version of truncate file. Must NOT be called with + * a transaction already started. + * + * some code taken from block_truncate_page + */ +int reiserfs_truncate_file(struct inode *inode, int update_timestamps) +{ + struct reiserfs_transaction_handle th; + /* we want the offset for the first byte after the end of the file */ + unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned length; + struct page *page = NULL; + int error; + struct buffer_head *bh = NULL; + int err2; + + reiserfs_write_lock(inode->i_sb); + + if (inode->i_size > 0) { + error = grab_tail_page(inode, &page, &bh); + if (error) { + /* + * -ENOENT means we truncated past the end of the + * file, and get_block_create_0 could not find a + * block to read in, which is ok. + */ + if (error != -ENOENT) + reiserfs_error(inode->i_sb, "clm-6001", + "grab_tail_page failed %d", + error); + page = NULL; + bh = NULL; + } + } + + /* + * so, if page != NULL, we have a buffer head for the offset at + * the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + * then we have an unformatted node. Otherwise, we have a direct item, + * and no zeroing is required on disk. We zero after the truncate, + * because the truncate might pack the item anyway + * (it will unmap bh if it packs). + * + * it is enough to reserve space in transaction for 2 balancings: + * one for "save" link adding and another for the first + * cut_from_item. 1 is for update_sd + */ + error = journal_begin(&th, inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + reiserfs_update_inode_transaction(inode); + if (update_timestamps) + /* + * we are doing real truncate: if the system crashes + * before the last transaction of truncating gets committed + * - on reboot the file either appears truncated properly + * or not truncated at all + */ + add_save_link(&th, inode, 1); + err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps); + error = journal_end(&th); + if (error) + goto out; + + /* check reiserfs_do_truncate after ending the transaction */ + if (err2) { + error = err2; + goto out; + } + + if (update_timestamps) { + error = remove_save_link(inode, 1 /* truncate */); + if (error) + goto out; + } + + if (page) { + length = offset & (blocksize - 1); + /* if we are not on a block boundary */ + if (length) { + length = blocksize - length; + zero_user(page, offset, length); + if (buffer_mapped(bh) && bh->b_blocknr != 0) { + mark_buffer_dirty(bh); + } + } + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock(inode->i_sb); + + return 0; +out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock(inode->i_sb); + + return error; +} + +static int map_block_for_writepage(struct inode *inode, + struct buffer_head *bh_result, + unsigned long block) +{ + struct reiserfs_transaction_handle th; + int fs_gen; + struct item_head tmp_ih; + struct item_head *ih; + struct buffer_head *bh; + __le32 *item; + struct cpu_key key; + INITIALIZE_PATH(path); + int pos_in_item; + int jbegin_count = JOURNAL_PER_BALANCE_CNT; + loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1; + int retval; + int use_get_block = 0; + int bytes_copied = 0; + int copy_size; + int trans_running = 0; + + /* + * catch places below that try to log something without + * starting a trans + */ + th.t_trans_id = 0; + + if (!buffer_uptodate(bh_result)) { + return -EIO; + } + + kmap(bh_result->b_page); +start_over: + reiserfs_write_lock(inode->i_sb); + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); + +research: + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval != POSITION_FOUND) { + use_get_block = 1; + goto out; + } + + bh = get_last_bh(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); + pos_in_item = path.pos_in_item; + + /* we've found an unformatted node */ + if (indirect_item_found(retval, ih)) { + if (bytes_copied > 0) { + reiserfs_warning(inode->i_sb, "clm-6002", + "bytes_copied %d", bytes_copied); + } + if (!get_block_num(item, pos_in_item)) { + /* crap, we are writing to a hole */ + use_get_block = 1; + goto out; + } + set_block_dev_mapped(bh_result, + get_block_num(item, pos_in_item), inode); + } else if (is_direct_le_ih(ih)) { + char *p; + p = page_address(bh_result->b_page); + p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1); + copy_size = ih_item_len(ih) - pos_in_item; + + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (!trans_running) { + /* vs-3050 is gone, no need to drop the path */ + retval = journal_begin(&th, inode->i_sb, jbegin_count); + if (retval) + goto out; + reiserfs_update_inode_transaction(inode); + trans_running = 1; + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + } + + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + goto research; + } + + memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied, + copy_size); + + journal_mark_dirty(&th, bh); + bytes_copied += copy_size; + set_block_dev_mapped(bh_result, 0, inode); + + /* are there still bytes left? */ + if (bytes_copied < bh_result->b_size && + (byte_offset + bytes_copied) < inode->i_size) { + set_cpu_key_k_offset(&key, + cpu_key_k_offset(&key) + + copy_size); + goto research; + } + } else { + reiserfs_warning(inode->i_sb, "clm-6003", + "bad item inode %lu", inode->i_ino); + retval = -EIO; + goto out; + } + retval = 0; + +out: + pathrelse(&path); + if (trans_running) { + int err = journal_end(&th); + if (err) + retval = err; + trans_running = 0; + } + reiserfs_write_unlock(inode->i_sb); + + /* this is where we fill in holes in the file. */ + if (use_get_block) { + retval = reiserfs_get_block(inode, block, bh_result, + GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX + | GET_BLOCK_NO_DANGLE); + if (!retval) { + if (!buffer_mapped(bh_result) + || bh_result->b_blocknr == 0) { + /* get_block failed to find a mapped unformatted node. */ + use_get_block = 0; + goto start_over; + } + } + } + kunmap(bh_result->b_page); + + if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* + * we've copied data from the page into the direct item, so the + * buffer in the page is now clean, mark it to reflect that. + */ + lock_buffer(bh_result); + clear_buffer_dirty(bh_result); + unlock_buffer(bh_result); + } + return retval; +} + +/* + * mason@suse.com: updated in 2.5.54 to follow the same general io + * start/recovery path as __block_write_full_page, along with special + * code to handle reiserfs tails. + */ +static int reiserfs_write_full_page(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int error = 0; + unsigned long block; + sector_t last_block; + struct buffer_head *head, *bh; + int partial = 0; + int nr = 0; + int checked = PageChecked(page); + struct reiserfs_transaction_handle th; + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + th.t_trans_id = 0; + + /* no logging allowed when nonblocking or from PF_MEMALLOC */ + if (checked && (current->flags & PF_MEMALLOC)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + + /* + * The page dirty bit is cleared before writepage is called, which + * means we have to tell create_empty_buffers to make dirty buffers + * The page really should be up to date at this point, so tossing + * in the BH_Uptodate is just a sanity check. + */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, s->s_blocksize, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + } + head = page_buffers(page); + + /* + * last page in the file, zero out any contents past the + * last byte in the file + */ + if (page->index >= end_index) { + unsigned last_offset; + + last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* no file contents in this page */ + if (page->index >= end_index + 1 || !last_offset) { + unlock_page(page); + return 0; + } + zero_user_segment(page, last_offset, PAGE_CACHE_SIZE); + } + bh = head; + block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + /* first map all the buffers, logging any direct items we find */ + do { + if (block > last_block) { + /* + * This can happen when the block size is less than + * the page size. The corresponding bytes in the page + * were zero filled above + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((checked || buffer_dirty(bh)) && + (!buffer_mapped(bh) || (buffer_mapped(bh) + && bh->b_blocknr == + 0))) { + /* + * not mapped yet, or it points to a direct item, search + * the btree for the mapping info, and log any direct + * items found + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail; + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* + * we start the transaction after map_block_for_writepage, + * because it can create holes in the file (an unbounded operation). + * starting it here, we can make a reliable estimate for how many + * blocks we're going to log + */ + if (checked) { + ClearPageChecked(page); + reiserfs_write_lock(s); + error = journal_begin(&th, s, bh_per_page + 1); + if (error) { + reiserfs_write_unlock(s); + goto fail; + } + reiserfs_update_inode_transaction(inode); + } + /* now go through and lock any dirty buffers on the page */ + do { + get_bh(bh); + if (!buffer_mapped(bh)) + continue; + if (buffer_mapped(bh) && bh->b_blocknr == 0) + continue; + + if (checked) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, bh); + continue; + } + /* + * from this point on, we know the buffer is mapped to a + * real block and not a direct item + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else { + if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (checked) { + error = journal_end(&th); + reiserfs_write_unlock(s); + if (error) + goto fail; + } + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + /* + * since any buffer might be the only dirty buffer on the page, + * the first submit_bh can bring the page out of writeback. + * be careful with the buffers. + */ + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + + error = 0; +done: + if (nr == 0) { + /* + * if this page only had a direct item, it is very possible for + * no io to be required without there being an error. Or, + * someone else could have locked them and sent them down the + * pipe without locking the page + */ + bh = head; + do { + if (!buffer_uptodate(bh)) { + partial = 1; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (!partial) + SetPageUptodate(page); + end_page_writeback(page); + } + return error; + +fail: + /* + * catches various errors, we need to make sure any valid dirty blocks + * get to the media. The page is currently locked and not marked for + * writeback + */ + ClearPageUptodate(page); + bh = head; + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * clear any dirty bits that might have come from + * getting attached to a dirty page + */ + clear_buffer_dirty(bh); + } + bh = bh->b_this_page; + } while (bh != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + goto done; +} + +static int reiserfs_readpage(struct file *f, struct page *page) +{ + return block_read_full_page(page, reiserfs_get_block); +} + +static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + reiserfs_wait_on_write_block(inode->i_sb); + return reiserfs_write_full_page(page, wbc); +} + +static void reiserfs_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + reiserfs_truncate_file(inode, 0); +} + +static int reiserfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode; + struct page *page; + pgoff_t index; + int ret; + int old_ref = 0; + + inode = mapping->host; + *fsdata = NULL; + if (flags & AOP_FLAG_CONT_EXPAND && + (pos & (inode->i_sb->s_blocksize - 1)) == 0) { + pos ++; + *fsdata = (void *)(unsigned long)flags; + } + + index = pos >> PAGE_CACHE_SHIFT; + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + reiserfs_wait_on_write_block(inode->i_sb); + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + ret = __block_write_begin(page, pos, len, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* + * this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close + * it, and we've got to free handle if it was a persistent + * transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested + * above. + */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + if (ret) { + unlock_page(page); + page_cache_release(page); + /* Truncate allocated blocks */ + reiserfs_truncate_failed_write(inode); + } + return ret; +} + +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) +{ + struct inode *inode = page->mapping->host; + int ret; + int old_ref = 0; + int depth; + + depth = reiserfs_write_unlock_nested(inode->i_sb); + reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); + + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + + ret = __block_write_begin(page, from, len, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* + * this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close + * it, and we've got to free handle if it was a persistent + * transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested + * above. + */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + return ret; + +} + +static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) +{ + return generic_block_bmap(as, block, reiserfs_bmap); +} + +static int reiserfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th; + unsigned start; + bool locked = false; + + if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) + pos ++; + + reiserfs_wait_on_write_block(inode->i_sb); + if (reiserfs_transaction_running(inode->i_sb)) + th = current->journal_info; + else + th = NULL; + + start = pos & (PAGE_CACHE_SIZE - 1); + if (unlikely(copied < len)) { + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start + copied, start + len); + } + flush_dcache_page(page); + + reiserfs_commit_page(inode, page, start, start + copied); + + /* + * generic_commit_write does this for us, but does not update the + * transaction tracking stuff when the size changes. So, we have + * to do the i_size updates here. + */ + if (pos + copied > inode->i_size) { + struct reiserfs_transaction_handle myth; + reiserfs_write_lock(inode->i_sb); + locked = true; + /* + * If the file have grown beyond the border where it + * can have a tail, unmark it as needing a tail + * packing + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + + reiserfs_update_inode_transaction(inode); + inode->i_size = pos + copied; + /* + * this will just nest into our transaction. It's important + * to use mark_inode_dirty so the inode gets pushed around on + * the dirty lists, and so that O_SYNC works as expected + */ + mark_inode_dirty(inode); + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth); + if (ret) + goto journal_error; + } + if (th) { + if (!locked) { + reiserfs_write_lock(inode->i_sb); + locked = true; + } + if (!update_sd) + mark_inode_dirty(inode); + ret = reiserfs_end_persistent_transaction(th); + if (ret) + goto out; + } + +out: + if (locked) + reiserfs_write_unlock(inode->i_sb); + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + reiserfs_truncate_failed_write(inode); + + return ret == 0 ? copied : ret; + +journal_error: + reiserfs_write_unlock(inode->i_sb); + locked = false; + if (th) { + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + } + goto out; +} + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th = NULL; + int depth; + + depth = reiserfs_write_unlock_nested(inode->i_sb); + reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); + + if (reiserfs_transaction_running(inode->i_sb)) { + th = current->journal_info; + } + reiserfs_commit_page(inode, page, from, to); + + /* + * generic_commit_write does this for us, but does not update the + * transaction tracking stuff when the size changes. So, we have + * to do the i_size updates here. + */ + if (pos > inode->i_size) { + struct reiserfs_transaction_handle myth; + /* + * If the file have grown beyond the border where it + * can have a tail, unmark it as needing a tail + * packing + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + + reiserfs_update_inode_transaction(inode); + inode->i_size = pos; + /* + * this will just nest into our transaction. It's important + * to use mark_inode_dirty so the inode gets pushed around + * on the dirty lists, and so that O_SYNC works as expected + */ + mark_inode_dirty(inode); + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth); + if (ret) + goto journal_error; + } + if (th) { + if (!update_sd) + mark_inode_dirty(inode); + ret = reiserfs_end_persistent_transaction(th); + if (ret) + goto out; + } + +out: + return ret; + +journal_error: + if (th) { + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + } + + return ret; +} + +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) +{ + if (reiserfs_attrs(inode->i_sb)) { + if (sd_attrs & REISERFS_SYNC_FL) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (sd_attrs & REISERFS_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (sd_attrs & REISERFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + if (sd_attrs & REISERFS_NOTAIL_FL) + REISERFS_I(inode)->i_flags |= i_nopack_mask; + else + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } +} + +void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) +{ + if (reiserfs_attrs(inode->i_sb)) { + if (inode->i_flags & S_IMMUTABLE) + *sd_attrs |= REISERFS_IMMUTABLE_FL; + else + *sd_attrs &= ~REISERFS_IMMUTABLE_FL; + if (inode->i_flags & S_SYNC) + *sd_attrs |= REISERFS_SYNC_FL; + else + *sd_attrs &= ~REISERFS_SYNC_FL; + if (inode->i_flags & S_NOATIME) + *sd_attrs |= REISERFS_NOATIME_FL; + else + *sd_attrs &= ~REISERFS_NOATIME_FL; + if (REISERFS_I(inode)->i_flags & i_nopack_mask) + *sd_attrs |= REISERFS_NOTAIL_FL; + else + *sd_attrs &= ~REISERFS_NOTAIL_FL; + } +} + +/* + * decide if this buffer needs to stay around for data logging or ordered + * write purposes + */ +static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) +{ + int ret = 1; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + + lock_buffer(bh); + spin_lock(&j->j_dirty_buffers_lock); + if (!buffer_mapped(bh)) { + goto free_jh; + } + /* + * the page is locked, and the only places that log a data buffer + * also lock the page. + */ + if (reiserfs_file_data_log(inode)) { + /* + * very conservative, leave the buffer pinned if + * anyone might need it. + */ + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + ret = 0; + } + } else if (buffer_dirty(bh)) { + struct reiserfs_journal_list *jl; + struct reiserfs_jh *jh = bh->b_private; + + /* + * why is this safe? + * reiserfs_setattr updates i_size in the on disk + * stat data before allowing vmtruncate to be called. + * + * If buffer was put onto the ordered list for this + * transaction, we know for sure either this transaction + * or an older one already has updated i_size on disk, + * and this ordered data won't be referenced in the file + * if we crash. + * + * if the buffer was put onto the ordered list for an older + * transaction, we need to leave it around + */ + if (jh && (jl = jh->jl) + && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) + ret = 0; + } +free_jh: + if (ret && bh->b_private) { + reiserfs_free_jh(bh); + } + spin_unlock(&j->j_dirty_buffers_lock); + unlock_buffer(bh); + return ret; +} + +/* clm -- taken from fs/buffer.c:block_invalidate_page */ +static void reiserfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct buffer_head *head, *bh, *next; + struct inode *inode = page->mapping->host; + unsigned int curr_off = 0; + unsigned int stop = offset + length; + int partial_page = (offset || length < PAGE_CACHE_SIZE); + int ret = 1; + + BUG_ON(!PageLocked(page)); + + if (!partial_page) + ClearPageChecked(page); + + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (next_off > stop) + goto out; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) { + if (invalidatepage_can_drop(inode, bh)) + reiserfs_unmap_buffer(bh); + else + ret = 0; + } + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (!partial_page && ret) { + ret = try_to_release_page(page, 0); + /* maybe should BUG_ON(!ret); - neilb */ + } +out: + return; +} + +static int reiserfs_set_page_dirty(struct page *page) +{ + struct inode *inode = page->mapping->host; + if (reiserfs_file_data_log(inode)) { + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); + } + return __set_page_dirty_buffers(page); +} + +/* + * Returns 1 if the page's buffers were dropped. The page is locked. + * + * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads + * in the buffers at page_buffers(page). + * + * even in -o notail mode, we can't be sure an old mount without -o notail + * didn't create files with tails. + */ +static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + struct inode *inode = page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + struct buffer_head *head; + struct buffer_head *bh; + int ret = 1; + + WARN_ON(PageChecked(page)); + spin_lock(&j->j_dirty_buffers_lock); + head = page_buffers(page); + bh = head; + do { + if (bh->b_private) { + if (!buffer_dirty(bh) && !buffer_locked(bh)) { + reiserfs_free_jh(bh); + } else { + ret = 0; + break; + } + } + bh = bh->b_this_page; + } while (bh != head); + if (ret) + ret = try_to_free_buffers(page); + spin_unlock(&j->j_dirty_buffers_lock); + return ret; +} + +/* + * We thank Mingming Cao for helping us understand in great detail what + * to do in this section of the code. + */ +static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + ret = blockdev_direct_IO(iocb, inode, iter, offset, + reiserfs_get_blocks_direct_io); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { + truncate_setsize(inode, isize); + reiserfs_vfs_truncate_file(inode); + } + } + + return ret; +} + +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + unsigned int ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + /* must be turned off for recursive notify_change calls */ + ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + reiserfs_write_lock(inode->i_sb); + if (attr->ia_valid & ATTR_SIZE) { + /* + * version 2 items will be caught by the s_maxbytes check + * done for us in vmtruncate + */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && + attr->ia_size > MAX_NON_LFS) { + reiserfs_write_unlock(inode->i_sb); + error = -EFBIG; + goto out; + } + + inode_dio_wait(inode); + + /* fill in hole pointers in the expanding truncate case. */ + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand_simple(inode, attr->ia_size); + if (REISERFS_I(inode)->i_prealloc_count > 0) { + int err; + struct reiserfs_transaction_handle th; + /* we're changing at most 2 bitmaps, inode + super */ + err = journal_begin(&th, inode->i_sb, 4); + if (!err) { + reiserfs_discard_prealloc(&th, inode); + err = journal_end(&th); + } + if (err) + error = err; + } + if (error) { + reiserfs_write_unlock(inode->i_sb); + goto out; + } + /* + * file size is changed, ctime and mtime are + * to be updated + */ + attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME); + } + } + reiserfs_write_unlock(inode->i_sb); + + if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) && + (get_inode_sd_version(inode) == STAT_DATA_V1)) { + /* stat data of format v3.5 has 16 bit uid and gid */ + error = -EINVAL; + goto out; + } + + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + struct reiserfs_transaction_handle th; + int jbegin_count = + 2 * + (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + + REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + + 2; + + error = reiserfs_chown_xattrs(inode, attr); + + if (error) + return error; + + /* + * (user+group)*(old+new) structure - we count quota + * info and , inode write (sb, inode) + */ + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); + if (error) + goto out; + error = dquot_transfer(inode, attr); + reiserfs_write_lock(inode->i_sb); + if (error) { + journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + goto out; + } + + /* + * Update corresponding info in inode so that everything + * is in one transaction + */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + mark_inode_dirty(inode); + error = journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + if (error) + goto out; + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (!error) { + /* + * Could race against reiserfs_file_release + * if called from NFS, so take tailpack mutex. + */ + mutex_lock(&REISERFS_I(inode)->tailpack); + truncate_setsize(inode, attr->ia_size); + reiserfs_truncate_file(inode, 1); + mutex_unlock(&REISERFS_I(inode)->tailpack); + } + } + + if (!error) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + } + + if (!error && reiserfs_posixacl(inode->i_sb)) { + if (attr->ia_valid & ATTR_MODE) + error = reiserfs_acl_chmod(inode); + } + +out: + return error; +} + +const struct address_space_operations reiserfs_address_space_operations = { + .writepage = reiserfs_writepage, + .readpage = reiserfs_readpage, + .readpages = reiserfs_readpages, + .releasepage = reiserfs_releasepage, + .invalidatepage = reiserfs_invalidatepage, + .write_begin = reiserfs_write_begin, + .write_end = reiserfs_write_end, + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO, + .set_page_dirty = reiserfs_set_page_dirty, +}; diff --git a/kernel/fs/reiserfs/ioctl.c b/kernel/fs/reiserfs/ioctl.c new file mode 100644 index 000000000..6ec8a30a0 --- /dev/null +++ b/kernel/fs/reiserfs/ioctl.c @@ -0,0 +1,230 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include +#include +#include +#include + +/* + * reiserfs_ioctl - handler for ioctl for inode + * supported commands: + * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect + * and prevent packing file (argument arg has t + * be non-zero) + * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION + * 3) That's all for a while ... + */ +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + unsigned int flags; + int err = 0; + + reiserfs_write_lock(inode->i_sb); + + switch (cmd) { + case REISERFS_IOC_UNPACK: + if (S_ISREG(inode->i_mode)) { + if (arg) + err = reiserfs_unpack(inode, filp); + } else + err = -ENOTTY; + break; + /* + * following two cases are taken from fs/ext2/ioctl.c by Remy + * Card (card@masi.ibp.fr) + */ + case REISERFS_IOC_GETFLAGS: + if (!reiserfs_attrs(inode->i_sb)) { + err = -ENOTTY; + break; + } + + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, (__u16 *) & flags); + err = put_user(flags, (int __user *)arg); + break; + case REISERFS_IOC_SETFLAGS:{ + if (!reiserfs_attrs(inode->i_sb)) { + err = -ENOTTY; + break; + } + + err = mnt_want_write_file(filp); + if (err) + break; + + if (!inode_owner_or_capable(inode)) { + err = -EPERM; + goto setflags_out; + } + if (get_user(flags, (int __user *)arg)) { + err = -EFAULT; + goto setflags_out; + } + /* + * Is it quota file? Do not allow user to mess with it + */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } + if (((flags ^ REISERFS_I(inode)-> + i_attrs) & (REISERFS_IMMUTABLE_FL | + REISERFS_APPEND_FL)) + && !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto setflags_out; + } + if ((flags & REISERFS_NOTAIL_FL) && + S_ISREG(inode->i_mode)) { + int result; + + result = reiserfs_unpack(inode, filp); + if (result) { + err = result; + goto setflags_out; + } + } + sd_attrs_to_i_attrs(flags, inode); + REISERFS_I(inode)->i_attrs = flags; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write_file(filp); + break; + } + case REISERFS_IOC_GETVERSION: + err = put_user(inode->i_generation, (int __user *)arg); + break; + case REISERFS_IOC_SETVERSION: + if (!inode_owner_or_capable(inode)) { + err = -EPERM; + break; + } + err = mnt_want_write_file(filp); + if (err) + break; + if (get_user(inode->i_generation, (int __user *)arg)) { + err = -EFAULT; + goto setversion_out; + } + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setversion_out: + mnt_drop_write_file(filp); + break; + default: + err = -ENOTTY; + } + + reiserfs_write_unlock(inode->i_sb); + + return err; +} + +#ifdef CONFIG_COMPAT +long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + /* + * These are just misnamed, they actually + * get/put from/to user an int + */ + switch (cmd) { + case REISERFS_IOC32_UNPACK: + cmd = REISERFS_IOC_UNPACK; + break; + case REISERFS_IOC32_GETFLAGS: + cmd = REISERFS_IOC_GETFLAGS; + break; + case REISERFS_IOC32_SETFLAGS: + cmd = REISERFS_IOC_SETFLAGS; + break; + case REISERFS_IOC32_GETVERSION: + cmd = REISERFS_IOC_GETVERSION; + break; + case REISERFS_IOC32_SETVERSION: + cmd = REISERFS_IOC_SETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); +/* + * reiserfs_unpack + * Function try to convert tail from direct item into indirect. + * It set up nopack attribute in the REISERFS_I(inode)->nopack + */ +int reiserfs_unpack(struct inode *inode, struct file *filp) +{ + int retval = 0; + int index; + struct page *page; + struct address_space *mapping; + unsigned long write_from; + unsigned long blocksize = inode->i_sb->s_blocksize; + + if (inode->i_size == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + return 0; + } + /* ioctl already done */ + if (REISERFS_I(inode)->i_flags & i_nopack_mask) { + return 0; + } + + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + + reiserfs_write_lock(inode->i_sb); + + write_from = inode->i_size & (blocksize - 1); + /* if we are on a block boundary, we are already unpacked. */ + if (write_from == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + goto out; + } + + /* + * we unpack by finding the page with the tail, and calling + * __reiserfs_write_begin on that page. This will force a + * reiserfs_get_block to unpack the tail for us. + */ + index = inode->i_size >> PAGE_CACHE_SHIFT; + mapping = inode->i_mapping; + page = grab_cache_page(mapping, index); + retval = -ENOMEM; + if (!page) { + goto out; + } + retval = __reiserfs_write_begin(page, write_from, 0); + if (retval) + goto out_unlock; + + /* conversion can change page contents, must flush */ + flush_dcache_page(page); + retval = reiserfs_commit_write(NULL, page, write_from, write_from); + REISERFS_I(inode)->i_flags |= i_nopack_mask; + +out_unlock: + unlock_page(page); + page_cache_release(page); + +out: + mutex_unlock(&inode->i_mutex); + reiserfs_write_unlock(inode->i_sb); + return retval; +} diff --git a/kernel/fs/reiserfs/item_ops.c b/kernel/fs/reiserfs/item_ops.c new file mode 100644 index 000000000..aca73dd73 --- /dev/null +++ b/kernel/fs/reiserfs/item_ops.c @@ -0,0 +1,752 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include "reiserfs.h" + +/* + * this contains item handlers for old item types: sd, direct, + * indirect, directory + */ + +/* + * and where are the comments? how about saying where we can find an + * explanation of each item handler method? -Hans + */ + +/* stat data functions */ +static int sd_bytes_number(struct item_head *ih, int block_size) +{ + return 0; +} + +static void sd_decrement_key(struct cpu_key *key) +{ + key->on_disk_key.k_objectid--; + set_cpu_key_k_type(key, TYPE_ANY); + set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1)); +} + +static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) +{ + return 0; +} + +static char *print_time(time_t t) +{ + static char timebuf[256]; + + sprintf(timebuf, "%ld", t); + return timebuf; +} + +static void sd_print_item(struct item_head *ih, char *item) +{ + printk("\tmode | size | nlinks | first direct | mtime\n"); + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = (struct stat_data_v1 *)item; + + printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd), + sd_v1_size(sd), sd_v1_nlink(sd), + sd_v1_first_direct_byte(sd), + print_time(sd_v1_mtime(sd))); + } else { + struct stat_data *sd = (struct stat_data *)item; + + printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd), + (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), + sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); + } +} + +static void sd_check_item(struct item_head *ih, char *item) +{ + /* unused */ +} + +static int sd_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_STAT_DATA; + return 0; +} + +static int sd_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + BUG_ON(start_skip || end_skip); + return -1; +} + +static int sd_check_right(struct virtual_item *vi, int free) +{ + return -1; +} + +static int sd_part_size(struct virtual_item *vi, int first, int count) +{ + BUG_ON(count); + return 0; +} + +static int sd_unit_num(struct virtual_item *vi) +{ + return vi->vi_item_len - IH_SIZE; +} + +static void sd_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16100", + "STATDATA, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations stat_data_ops = { + .bytes_number = sd_bytes_number, + .decrement_key = sd_decrement_key, + .is_left_mergeable = sd_is_left_mergeable, + .print_item = sd_print_item, + .check_item = sd_check_item, + + .create_vi = sd_create_vi, + .check_left = sd_check_left, + .check_right = sd_check_right, + .part_size = sd_part_size, + .unit_num = sd_unit_num, + .print_vi = sd_print_vi +}; + +/* direct item functions */ +static int direct_bytes_number(struct item_head *ih, int block_size) +{ + return ih_item_len(ih); +} + +/* FIXME: this should probably switch to indirect as well */ +static void direct_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +static int direct_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + int version = le_key_version(key); + return ((le_key_k_offset(version, key) & (bsize - 1)) != 1); +} + +static void direct_print_item(struct item_head *ih, char *item) +{ + int j = 0; + +/* return; */ + printk("\""); + while (j < ih_item_len(ih)) + printk("%c", item[j++]); + printk("\"\n"); +} + +static void direct_check_item(struct item_head *ih, char *item) +{ + /* unused */ +} + +static int direct_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_DIRECT; + return 0; +} + +static int direct_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % 8; + return bytes ? : -1; +} + +static int direct_check_right(struct virtual_item *vi, int free) +{ + return direct_check_left(vi, free, 0, 0); +} + +static int direct_part_size(struct virtual_item *vi, int first, int count) +{ + return count; +} + +static int direct_unit_num(struct virtual_item *vi) +{ + return vi->vi_item_len - IH_SIZE; +} + +static void direct_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16101", + "DIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations direct_ops = { + .bytes_number = direct_bytes_number, + .decrement_key = direct_decrement_key, + .is_left_mergeable = direct_is_left_mergeable, + .print_item = direct_print_item, + .check_item = direct_check_item, + + .create_vi = direct_create_vi, + .check_left = direct_check_left, + .check_right = direct_check_right, + .part_size = direct_part_size, + .unit_num = direct_unit_num, + .print_vi = direct_print_vi +}; + +/* indirect item functions */ +static int indirect_bytes_number(struct item_head *ih, int block_size) +{ + return ih_item_len(ih) / UNFM_P_SIZE * block_size; +} + +/* decrease offset, if it becomes 0, change type to stat data */ +static void indirect_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +/* if it is not first item of the body, then it is mergeable */ +static int indirect_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + int version = le_key_version(key); + return (le_key_k_offset(version, key) != 1); +} + +/* printing of indirect item */ +static void start_new_sequence(__u32 * start, int *len, __u32 new) +{ + *start = new; + *len = 1; +} + +static int sequence_finished(__u32 start, int *len, __u32 new) +{ + if (start == INT_MAX) + return 1; + + if (start == 0 && new == 0) { + (*len)++; + return 0; + } + if (start != 0 && (start + *len) == new) { + (*len)++; + return 0; + } + return 1; +} + +static void print_sequence(__u32 start, int len) +{ + if (start == INT_MAX) + return; + + if (len == 1) + printk(" %d", start); + else + printk(" %d(%d)", start, len); +} + +static void indirect_print_item(struct item_head *ih, char *item) +{ + int j; + __le32 *unp; + __u32 prev = INT_MAX; + int num = 0; + + unp = (__le32 *) item; + + if (ih_item_len(ih) % UNFM_P_SIZE) + reiserfs_warning(NULL, "reiserfs-16102", "invalid item len"); + + printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); + for (j = 0; j < I_UNFM_NUM(ih); j++) { + if (sequence_finished(prev, &num, get_block_num(unp, j))) { + print_sequence(prev, num); + start_new_sequence(&prev, &num, get_block_num(unp, j)); + } + } + print_sequence(prev, num); + printk("]\n"); +} + +static void indirect_check_item(struct item_head *ih, char *item) +{ + /* unused */ +} + +static int indirect_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_INDIRECT; + return 0; +} + +static int indirect_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % UNFM_P_SIZE; + return bytes ? : -1; +} + +static int indirect_check_right(struct virtual_item *vi, int free) +{ + return indirect_check_left(vi, free, 0, 0); +} + +/* + * return size in bytes of 'units' units. If first == 0 - calculate + * from the head (left), otherwise - from tail (right) + */ +static int indirect_part_size(struct virtual_item *vi, int first, int units) +{ + /* unit of indirect item is byte (yet) */ + return units; +} + +static int indirect_unit_num(struct virtual_item *vi) +{ + /* unit of indirect item is byte (yet) */ + return vi->vi_item_len - IH_SIZE; +} + +static void indirect_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16103", + "INDIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations indirect_ops = { + .bytes_number = indirect_bytes_number, + .decrement_key = indirect_decrement_key, + .is_left_mergeable = indirect_is_left_mergeable, + .print_item = indirect_print_item, + .check_item = indirect_check_item, + + .create_vi = indirect_create_vi, + .check_left = indirect_check_left, + .check_right = indirect_check_right, + .part_size = indirect_part_size, + .unit_num = indirect_unit_num, + .print_vi = indirect_print_vi +}; + +/* direntry functions */ +static int direntry_bytes_number(struct item_head *ih, int block_size) +{ + reiserfs_warning(NULL, "vs-16090", + "bytes number is asked for direntry"); + return 0; +} + +static void direntry_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +static int direntry_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET) + return 0; + return 1; + +} + +static void direntry_print_item(struct item_head *ih, char *item) +{ + int i; + int namelen; + struct reiserfs_de_head *deh; + char *name; + static char namebuf[80]; + + printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", + "Key of pointed object", "Hash", "Gen number", "Status"); + + deh = (struct reiserfs_de_head *)item; + + for (i = 0; i < ih_entry_count(ih); i++, deh++) { + namelen = + (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - + deh_location(deh); + name = item + deh_location(deh); + if (name[namelen - 1] == 0) + namelen = strlen(name); + namebuf[0] = '"'; + if (namelen > sizeof(namebuf) - 3) { + strncpy(namebuf + 1, name, sizeof(namebuf) - 3); + namebuf[sizeof(namebuf) - 2] = '"'; + namebuf[sizeof(namebuf) - 1] = 0; + } else { + memcpy(namebuf + 1, name, namelen); + namebuf[namelen + 1] = '"'; + namebuf[namelen + 2] = 0; + } + + printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n", + i, namebuf, + deh_dir_id(deh), deh_objectid(deh), + GET_HASH_VALUE(deh_offset(deh)), + GET_GENERATION_NUMBER((deh_offset(deh))), + (de_hidden(deh)) ? "HIDDEN" : "VISIBLE"); + } +} + +static void direntry_check_item(struct item_head *ih, char *item) +{ + int i; + struct reiserfs_de_head *deh; + + /* unused */ + deh = (struct reiserfs_de_head *)item; + for (i = 0; i < ih_entry_count(ih); i++, deh++) { + ; + } +} + +#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 + +/* + * function returns old entry number in directory item in real node + * using new entry number in virtual item in virtual node + */ +static inline int old_entry_num(int is_affected, int virtual_entry_num, + int pos_in_item, int mode) +{ + if (mode == M_INSERT || mode == M_DELETE) + return virtual_entry_num; + + if (!is_affected) + /* cut or paste is applied to another item */ + return virtual_entry_num; + + if (virtual_entry_num < pos_in_item) + return virtual_entry_num; + + if (mode == M_CUT) + return virtual_entry_num + 1; + + RFALSE(mode != M_PASTE || virtual_entry_num == 0, + "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", + mode); + + return virtual_entry_num - 1; +} + +/* + * Create an array of sizes of directory entries for virtual + * item. Return space used by an item. FIXME: no control over + * consuming of space used by this item handler + */ +static int direntry_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + int i, j; + int size = sizeof(struct direntry_uarea); + struct reiserfs_de_head *deh; + + vi->vi_index = TYPE_DIRENTRY; + + BUG_ON(!(vi->vi_ih) || !vi->vi_item); + + dir_u->flags = 0; + if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET) + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; + + deh = (struct reiserfs_de_head *)(vi->vi_item); + + /* virtual directory item have this amount of entry after */ + dir_u->entry_count = ih_entry_count(vi->vi_ih) + + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : + (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); + + for (i = 0; i < dir_u->entry_count; i++) { + j = old_entry_num(is_affected, i, vn->vn_pos_in_item, + vn->vn_mode); + dir_u->entry_sizes[i] = + (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) - + deh_location(&deh[j]) + DEH_SIZE; + } + + size += (dir_u->entry_count * sizeof(short)); + + /* set size of pasted entry */ + if (is_affected && vn->vn_mode == M_PASTE) + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; + +#ifdef CONFIG_REISERFS_CHECK + /* compare total size of entries with item length */ + { + int k, l; + + l = 0; + for (k = 0; k < dir_u->entry_count; k++) + l += dir_u->entry_sizes[k]; + + if (l + IH_SIZE != vi->vi_item_len + + ((is_affected + && (vn->vn_mode == M_PASTE + || vn->vn_mode == M_CUT)) ? insert_size : 0)) { + reiserfs_panic(NULL, "vs-8025", "(mode==%c, " + "insert_size==%d), invalid length of " + "directory item", + vn->vn_mode, insert_size); + } + } +#endif + + return size; + +} + +/* + * return number of entries which may fit into specified amount of + * free space, or -1 if free space is not enough even for 1 entry + */ +static int direntry_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; + + for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { + /* i-th entry doesn't fit into the remaining free space */ + if (dir_u->entry_sizes[i] > free) + break; + + free -= dir_u->entry_sizes[i]; + entries++; + } + + if (entries == dir_u->entry_count) { + reiserfs_panic(NULL, "item_ops-1", + "free space %d, entry_count %d", free, + dir_u->entry_count); + } + + /* "." and ".." can not be separated from each other */ + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries < 2) + entries = 0; + + return entries ? : -1; +} + +static int direntry_check_right(struct virtual_item *vi, int free) +{ + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; + + for (i = dir_u->entry_count - 1; i >= 0; i--) { + /* i-th entry doesn't fit into the remaining free space */ + if (dir_u->entry_sizes[i] > free) + break; + + free -= dir_u->entry_sizes[i]; + entries++; + } + BUG_ON(entries == dir_u->entry_count); + + /* "." and ".." can not be separated from each other */ + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries > dir_u->entry_count - 2) + entries = dir_u->entry_count - 2; + + return entries ? : -1; +} + +/* sum of entry sizes between from-th and to-th entries including both edges */ +static int direntry_part_size(struct virtual_item *vi, int first, int count) +{ + int i, retval; + int from, to; + struct direntry_uarea *dir_u = vi->vi_uarea; + + retval = 0; + if (first == 0) + from = 0; + else + from = dir_u->entry_count - count; + to = from + count - 1; + + for (i = from; i <= to; i++) + retval += dir_u->entry_sizes[i]; + + return retval; +} + +static int direntry_unit_num(struct virtual_item *vi) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + + return dir_u->entry_count; +} + +static void direntry_print_vi(struct virtual_item *vi) +{ + int i; + struct direntry_uarea *dir_u = vi->vi_uarea; + + reiserfs_warning(NULL, "reiserfs-16104", + "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); + printk("%d entries: ", dir_u->entry_count); + for (i = 0; i < dir_u->entry_count; i++) + printk("%d ", dir_u->entry_sizes[i]); + printk("\n"); +} + +static struct item_operations direntry_ops = { + .bytes_number = direntry_bytes_number, + .decrement_key = direntry_decrement_key, + .is_left_mergeable = direntry_is_left_mergeable, + .print_item = direntry_print_item, + .check_item = direntry_check_item, + + .create_vi = direntry_create_vi, + .check_left = direntry_check_left, + .check_right = direntry_check_right, + .part_size = direntry_part_size, + .unit_num = direntry_unit_num, + .print_vi = direntry_print_vi +}; + +/* Error catching functions to catch errors caused by incorrect item types. */ +static int errcatch_bytes_number(struct item_head *ih, int block_size) +{ + reiserfs_warning(NULL, "green-16001", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_decrement_key(struct cpu_key *key) +{ + reiserfs_warning(NULL, "green-16002", + "Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + reiserfs_warning(NULL, "green-16003", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_item(struct item_head *ih, char *item) +{ + reiserfs_warning(NULL, "green-16004", + "Invalid item type observed, run fsck ASAP"); +} + +static void errcatch_check_item(struct item_head *ih, char *item) +{ + reiserfs_warning(NULL, "green-16005", + "Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + reiserfs_warning(NULL, "green-16006", + "Invalid item type observed, run fsck ASAP"); + /* + * We might return -1 here as well, but it won't help as + * create_virtual_node() from where this operation is called + * from is of return type void. + */ + return 0; +} + +static int errcatch_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + reiserfs_warning(NULL, "green-16007", + "Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_check_right(struct virtual_item *vi, int free) +{ + reiserfs_warning(NULL, "green-16008", + "Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_part_size(struct virtual_item *vi, int first, int count) +{ + reiserfs_warning(NULL, "green-16009", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static int errcatch_unit_num(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "green-16010", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "green-16011", + "Invalid item type observed, run fsck ASAP"); +} + +static struct item_operations errcatch_ops = { + errcatch_bytes_number, + errcatch_decrement_key, + errcatch_is_left_mergeable, + errcatch_print_item, + errcatch_check_item, + + errcatch_create_vi, + errcatch_check_left, + errcatch_check_right, + errcatch_part_size, + errcatch_unit_num, + errcatch_print_vi +}; + +#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) +#error Item types must use disk-format assigned values. +#endif + +struct item_operations *item_ops[TYPE_ANY + 1] = { + &stat_data_ops, + &indirect_ops, + &direct_ops, + &direntry_ops, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ +}; diff --git a/kernel/fs/reiserfs/journal.c b/kernel/fs/reiserfs/journal.c new file mode 100644 index 000000000..9d6486d41 --- /dev/null +++ b/kernel/fs/reiserfs/journal.c @@ -0,0 +1,4403 @@ +/* + * Write ahead logging implementation copyright Chris Mason 2000 + * + * The background commits make this code very interrelated, and + * overly complex. I need to rethink things a bit....The major players: + * + * journal_begin -- call with the number of blocks you expect to log. + * If the current transaction is too + * old, it will block until the current transaction is + * finished, and then start a new one. + * Usually, your transaction will get joined in with + * previous ones for speed. + * + * journal_join -- same as journal_begin, but won't block on the current + * transaction regardless of age. Don't ever call + * this. Ever. There are only two places it should be + * called from, and they are both inside this file. + * + * journal_mark_dirty -- adds blocks into this transaction. clears any flags + * that might make them get sent to disk + * and then marks them BH_JDirty. Puts the buffer head + * into the current transaction hash. + * + * journal_end -- if the current transaction is batchable, it does nothing + * otherwise, it could do an async/synchronous commit, or + * a full flush of all log and real blocks in the + * transaction. + * + * flush_old_commits -- if the current transaction is too old, it is ended and + * commit blocks are sent to disk. Forces commit blocks + * to disk for all backgrounded commits that have been + * around too long. + * -- Note, if you call this as an immediate flush from + * from within kupdate, it will ignore the immediate flag + */ + +#include +#include +#include +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* gets a struct reiserfs_journal_list * from a list head */ +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_list)) +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_working_list)) + +/* must be correct to keep the desc and commit structs at 4k */ +#define JOURNAL_TRANS_HALF 1018 +#define BUFNR 64 /*read ahead */ + +/* cnode stat bits. Move these into reiserfs_fs.h */ + +/* this block was freed, and can't be written. */ +#define BLOCK_FREED 2 +/* this block was freed during this transaction, and can't be written */ +#define BLOCK_FREED_HOLDER 3 + +/* used in flush_journal_list */ +#define BLOCK_NEEDS_FLUSH 4 +#define BLOCK_DIRTIED 5 + +/* journal list state bits */ +#define LIST_TOUCHED 1 +#define LIST_DIRTY 2 +#define LIST_COMMIT_PENDING 4 /* someone will commit this list */ + +/* flags for do_journal_end */ +#define FLUSH_ALL 1 /* flush commit and real blocks */ +#define COMMIT_NOW 2 /* end and commit this transaction */ +#define WAIT 4 /* wait for the log blocks to hit the disk */ + +static int do_journal_end(struct reiserfs_transaction_handle *, int flags); +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int flush_commit_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int can_dirty(struct reiserfs_journal_cnode *cn); +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *sb); +static void release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal); +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl); +static void flush_async_commits(struct work_struct *work); +static void queue_log_writer(struct super_block *s); + +/* values for join in do_journal_begin_r */ +enum { + JBEGIN_REG = 0, /* regular journal begin */ + /* join the running transaction if at all possible */ + JBEGIN_JOIN = 1, + /* called from cleanup code, ignores aborted flag */ + JBEGIN_ABORT = 2, +}; + +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block *sb, + unsigned long nblocks, int join); + +static void init_journal_hash(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + memset(journal->j_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); +} + +/* + * clears BH_Dirty and sticks the buffer on the clean list. Called because + * I can't allow refile_buffer to make schedule happen after I've freed a + * block. Look at remove_from_transaction and journal_mark_freed for + * more details. + */ +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) +{ + if (bh) { + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + } + return 0; +} + +static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block + *sb) +{ + struct reiserfs_bitmap_node *bn; + static int id; + + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); + if (!bn) { + return NULL; + } + bn->data = kzalloc(sb->s_blocksize, GFP_NOFS); + if (!bn->data) { + kfree(bn); + return NULL; + } + bn->id = id++; + INIT_LIST_HEAD(&bn->list); + return bn; +} + +static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_bitmap_node *bn = NULL; + struct list_head *entry = journal->j_bitmap_nodes.next; + + journal->j_used_bitmap_nodes++; +repeat: + + if (entry != &journal->j_bitmap_nodes) { + bn = list_entry(entry, struct reiserfs_bitmap_node, list); + list_del(entry); + memset(bn->data, 0, sb->s_blocksize); + journal->j_free_bitmap_nodes--; + return bn; + } + bn = allocate_bitmap_node(sb); + if (!bn) { + yield(); + goto repeat; + } + return bn; +} +static inline void free_bitmap_node(struct super_block *sb, + struct reiserfs_bitmap_node *bn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + journal->j_used_bitmap_nodes--; + if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { + kfree(bn->data); + kfree(bn); + } else { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } +} + +static void allocate_bitmap_nodes(struct super_block *sb) +{ + int i; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_bitmap_node *bn = NULL; + for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { + bn = allocate_bitmap_node(sb); + if (bn) { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } else { + /* this is ok, we'll try again when more are needed */ + break; + } + } +} + +static int set_bit_in_list_bitmap(struct super_block *sb, + b_blocknr_t block, + struct reiserfs_list_bitmap *jb) +{ + unsigned int bmap_nr = block / (sb->s_blocksize << 3); + unsigned int bit_nr = block % (sb->s_blocksize << 3); + + if (!jb->bitmaps[bmap_nr]) { + jb->bitmaps[bmap_nr] = get_bitmap_node(sb); + } + set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); + return 0; +} + +static void cleanup_bitmap_list(struct super_block *sb, + struct reiserfs_list_bitmap *jb) +{ + int i; + if (jb->bitmaps == NULL) + return; + + for (i = 0; i < reiserfs_bmap_count(sb); i++) { + if (jb->bitmaps[i]) { + free_bitmap_node(sb, jb->bitmaps[i]); + jb->bitmaps[i] = NULL; + } + } +} + +/* + * only call this on FS unmount. + */ +static int free_list_bitmaps(struct super_block *sb, + struct reiserfs_list_bitmap *jb_array) +{ + int i; + struct reiserfs_list_bitmap *jb; + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + cleanup_bitmap_list(sb, jb); + vfree(jb->bitmaps); + jb->bitmaps = NULL; + } + return 0; +} + +static int free_bitmap_nodes(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct list_head *next = journal->j_bitmap_nodes.next; + struct reiserfs_bitmap_node *bn; + + while (next != &journal->j_bitmap_nodes) { + bn = list_entry(next, struct reiserfs_bitmap_node, list); + list_del(next); + kfree(bn->data); + kfree(bn); + next = journal->j_bitmap_nodes.next; + journal->j_free_bitmap_nodes--; + } + + return 0; +} + +/* + * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. + * jb_array is the array to be filled in. + */ +int reiserfs_allocate_list_bitmaps(struct super_block *sb, + struct reiserfs_list_bitmap *jb_array, + unsigned int bmap_nr) +{ + int i; + int failed = 0; + struct reiserfs_list_bitmap *jb; + int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *); + + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + jb->bitmaps = vzalloc(mem); + if (!jb->bitmaps) { + reiserfs_warning(sb, "clm-2000", "unable to " + "allocate bitmaps for journal lists"); + failed = 1; + break; + } + } + if (failed) { + free_list_bitmaps(sb, jb_array); + return -1; + } + return 0; +} + +/* + * find an available list bitmap. If you can't find one, flush a commit list + * and try again + */ +static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, + struct reiserfs_journal_list + *jl) +{ + int i, j; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_list_bitmap *jb = NULL; + + for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) { + i = journal->j_list_bitmap_index; + journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS; + jb = journal->j_list_bitmap + i; + if (journal->j_list_bitmap[i].journal_list) { + flush_commit_list(sb, + journal->j_list_bitmap[i]. + journal_list, 1); + if (!journal->j_list_bitmap[i].journal_list) { + break; + } + } else { + break; + } + } + /* double check to make sure if flushed correctly */ + if (jb->journal_list) + return NULL; + jb->journal_list = jl; + return jb; +} + +/* + * allocates a new chunk of X nodes, and links them all together as a list. + * Uses the cnode->next and cnode->prev pointers + * returns NULL on failure + */ +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) +{ + struct reiserfs_journal_cnode *head; + int i; + if (num_cnodes <= 0) { + return NULL; + } + head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + if (!head) { + return NULL; + } + head[0].prev = NULL; + head[0].next = head + 1; + for (i = 1; i < num_cnodes; i++) { + head[i].prev = head + (i - 1); + head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ + } + head[num_cnodes - 1].next = NULL; + return head; +} + +/* pulls a cnode off the free list, or returns NULL on failure */ +static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + reiserfs_check_lock_depth(sb, "get_cnode"); + + if (journal->j_cnode_free <= 0) { + return NULL; + } + journal->j_cnode_used++; + journal->j_cnode_free--; + cn = journal->j_cnode_free_list; + if (!cn) { + return cn; + } + if (cn->next) { + cn->next->prev = NULL; + } + journal->j_cnode_free_list = cn->next; + memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); + return cn; +} + +/* + * returns a cnode to the free list + */ +static void free_cnode(struct super_block *sb, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + reiserfs_check_lock_depth(sb, "free_cnode"); + + journal->j_cnode_used--; + journal->j_cnode_free++; + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ + cn->next = journal->j_cnode_free_list; + if (journal->j_cnode_free_list) { + journal->j_cnode_free_list->prev = cn; + } + cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */ + journal->j_cnode_free_list = cn; +} + +static void clear_prepared_bits(struct buffer_head *bh) +{ + clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); +} + +/* + * return a cnode with same dev, block number and size in table, + * or null if not found + */ +static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct + super_block + *sb, + struct + reiserfs_journal_cnode + **table, + long bl) +{ + struct reiserfs_journal_cnode *cn; + cn = journal_hash(table, sb, bl); + while (cn) { + if (cn->blocknr == bl && cn->sb == sb) + return cn; + cn = cn->hnext; + } + return (struct reiserfs_journal_cnode *)0; +} + +/* + * this actually means 'can this block be reallocated yet?'. If you set + * search_all, a block can only be allocated if it is not in the current + * transaction, was not freed by the current transaction, and has no chance + * of ever being overwritten by a replay after crashing. + * + * If you don't set search_all, a block can only be allocated if it is not + * in the current transaction. Since deleting a block removes it from the + * current transaction, this case should never happen. If you don't set + * search_all, make sure you never write the block without logging it. + * + * next_zero_bit is a suggestion about the next block to try for find_forward. + * when bl is rejected because it is set in a journal list bitmap, we search + * for the next zero bit in the bitmap that rejected bl. Then, we return + * that through next_zero_bit for find_forward to try. + * + * Just because we return something in next_zero_bit does not mean we won't + * reject it on the next call to reiserfs_in_journal + */ +int reiserfs_in_journal(struct super_block *sb, + unsigned int bmap_nr, int bit_nr, int search_all, + b_blocknr_t * next_zero_bit) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn; + struct reiserfs_list_bitmap *jb; + int i; + unsigned long bl; + + *next_zero_bit = 0; /* always start this at zero. */ + + PROC_INFO_INC(sb, journal.in_journal); + /* + * If we aren't doing a search_all, this is a metablock, and it + * will be logged before use. if we crash before the transaction + * that freed it commits, this transaction won't have committed + * either, and the block will never be written + */ + if (search_all) { + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + PROC_INFO_INC(sb, journal.in_journal_bitmap); + jb = journal->j_list_bitmap + i; + if (jb->journal_list && jb->bitmaps[bmap_nr] && + test_bit(bit_nr, + (unsigned long *)jb->bitmaps[bmap_nr]-> + data)) { + *next_zero_bit = + find_next_zero_bit((unsigned long *) + (jb->bitmaps[bmap_nr]-> + data), + sb->s_blocksize << 3, + bit_nr + 1); + return 1; + } + } + } + + bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr; + /* is it in any old transactions? */ + if (search_all + && (cn = + get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { + return 1; + } + + /* is it in the current transaction. This should never happen */ + if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) { + BUG(); + return 1; + } + + PROC_INFO_INC(sb, journal.in_journal_reusable); + /* safe for reuse */ + return 0; +} + +/* insert cn into table */ +static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal_cnode *cn_orig; + + cn_orig = journal_hash(table, cn->sb, cn->blocknr); + cn->hnext = cn_orig; + cn->hprev = NULL; + if (cn_orig) { + cn_orig->hprev = cn; + } + journal_hash(table, cn->sb, cn->blocknr) = cn; +} + +/* lock the current transaction */ +static inline void lock_journal(struct super_block *sb) +{ + PROC_INFO_INC(sb, journal.lock_journal); + + reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb); +} + +/* unlock the current transaction */ +static inline void unlock_journal(struct super_block *sb) +{ + mutex_unlock(&SB_JOURNAL(sb)->j_mutex); +} + +static inline void get_journal_list(struct reiserfs_journal_list *jl) +{ + jl->j_refcount++; +} + +static inline void put_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + if (jl->j_refcount < 1) { + reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d", + jl->j_trans_id, jl->j_refcount); + } + if (--jl->j_refcount == 0) + kfree(jl); +} + +/* + * this used to be much more involved, and I'm keeping it just in case + * things get ugly again. it gets called by flush_commit_list, and + * cleans up any data stored about blocks freed during a transaction. + */ +static void cleanup_freed_for_journal_list(struct super_block *sb, + struct reiserfs_journal_list *jl) +{ + + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; + if (jb) { + cleanup_bitmap_list(sb, jb); + } + jl->j_list_bitmap->journal_list = NULL; + jl->j_list_bitmap = NULL; +} + +static int journal_list_still_alive(struct super_block *s, + unsigned int trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct list_head *entry = &journal->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +/* + * If page->mapping was null, we failed to truncate this page for + * some reason. Most likely because it was truncated after being + * logged via data=journal. + * + * This does a check to see if the buffer belongs to one of these + * lost pages before doing the final put_bh. If page->mapping was + * null, it tries to free buffers on the page, which should make the + * final page_cache_release drop the page from the lru. + */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + if (!page->mapping && trylock_page(page)) { + page_cache_get(page); + put_bh(bh); + if (!page->mapping) + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + } else { + put_bh(bh); + } +} + +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (buffer_journaled(bh)) { + reiserfs_warning(NULL, "clm-2084", + "pinned buffer %lu:%s sent to disk", + bh->b_blocknr, bdevname(bh->b_bdev, b)); + } + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + + unlock_buffer(bh); + release_buffer_page(bh); +} + +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) +{ + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); + put_bh(bh); +} + +static void submit_logged_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_buffer_io_sync; + clear_buffer_journal_new(bh); + clear_buffer_dirty(bh); + if (!test_clear_buffer_journal_test(bh)) + BUG(); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh); +} + +static void submit_ordered_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh); +} + +#define CHUNK_SIZE 32 +struct buffer_chunk { + struct buffer_head *bh[CHUNK_SIZE]; + int nr; +}; + +static void write_chunk(struct buffer_chunk *chunk) +{ + int i; + for (i = 0; i < chunk->nr; i++) { + submit_logged_buffer(chunk->bh[i]); + } + chunk->nr = 0; +} + +static void write_ordered_chunk(struct buffer_chunk *chunk) +{ + int i; + for (i = 0; i < chunk->nr; i++) { + submit_ordered_buffer(chunk->bh[i]); + } + chunk->nr = 0; +} + +static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, + spinlock_t * lock, void (fn) (struct buffer_chunk *)) +{ + int ret = 0; + BUG_ON(chunk->nr >= CHUNK_SIZE); + chunk->bh[chunk->nr++] = bh; + if (chunk->nr >= CHUNK_SIZE) { + ret = 1; + if (lock) { + spin_unlock(lock); + fn(chunk); + spin_lock(lock); + } else { + fn(chunk); + } + } + return ret; +} + +static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); +static struct reiserfs_jh *alloc_jh(void) +{ + struct reiserfs_jh *jh; + while (1) { + jh = kmalloc(sizeof(*jh), GFP_NOFS); + if (jh) { + atomic_inc(&nr_reiserfs_jh); + return jh; + } + yield(); + } +} + +/* + * we want to free the jh when the buffer has been written + * and waited on + */ +void reiserfs_free_jh(struct buffer_head *bh) +{ + struct reiserfs_jh *jh; + + jh = bh->b_private; + if (jh) { + bh->b_private = NULL; + jh->bh = NULL; + list_del_init(&jh->list); + kfree(jh); + if (atomic_read(&nr_reiserfs_jh) <= 0) + BUG(); + atomic_dec(&nr_reiserfs_jh); + put_bh(bh); + } +} + +static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, + int tail) +{ + struct reiserfs_jh *jh; + + if (bh->b_private) { + spin_lock(&j->j_dirty_buffers_lock); + if (!bh->b_private) { + spin_unlock(&j->j_dirty_buffers_lock); + goto no_jh; + } + jh = bh->b_private; + list_del_init(&jh->list); + } else { +no_jh: + get_bh(bh); + jh = alloc_jh(); + spin_lock(&j->j_dirty_buffers_lock); + /* + * buffer must be locked for __add_jh, should be able to have + * two adds at the same time + */ + BUG_ON(bh->b_private); + jh->bh = bh; + bh->b_private = jh; + } + jh->jl = j->j_current_jl; + if (tail) + list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); + else { + list_add_tail(&jh->list, &jh->jl->j_bh_list); + } + spin_unlock(&j->j_dirty_buffers_lock); + return 0; +} + +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) +{ + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); +} +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) +{ + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); +} + +#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) +static int write_ordered_buffers(spinlock_t * lock, + struct reiserfs_journal *j, + struct reiserfs_journal_list *jl, + struct list_head *list) +{ + struct buffer_head *bh; + struct reiserfs_jh *jh; + int ret = j->j_errno; + struct buffer_chunk chunk; + struct list_head tmp; + INIT_LIST_HEAD(&tmp); + + chunk.nr = 0; + spin_lock(lock); + while (!list_empty(list)) { + jh = JH_ENTRY(list->next); + bh = jh->bh; + get_bh(bh); + if (!trylock_buffer(bh)) { + if (!buffer_dirty(bh)) { + list_move(&jh->list, &tmp); + goto loop_next; + } + spin_unlock(lock); + if (chunk.nr) + write_ordered_chunk(&chunk); + wait_on_buffer(bh); + cond_resched(); + spin_lock(lock); + goto loop_next; + } + /* + * in theory, dirty non-uptodate buffers should never get here, + * but the upper layer io error paths still have a few quirks. + * Handle them here as gracefully as we can + */ + if (!buffer_uptodate(bh) && buffer_dirty(bh)) { + clear_buffer_dirty(bh); + ret = -EIO; + } + if (buffer_dirty(bh)) { + list_move(&jh->list, &tmp); + add_to_chunk(&chunk, bh, lock, write_ordered_chunk); + } else { + reiserfs_free_jh(bh); + unlock_buffer(bh); + } +loop_next: + put_bh(bh); + cond_resched_lock(lock); + } + if (chunk.nr) { + spin_unlock(lock); + write_ordered_chunk(&chunk); + spin_lock(lock); + } + while (!list_empty(&tmp)) { + jh = JH_ENTRY(tmp.prev); + bh = jh->bh; + get_bh(bh); + reiserfs_free_jh(bh); + + if (buffer_locked(bh)) { + spin_unlock(lock); + wait_on_buffer(bh); + spin_lock(lock); + } + if (!buffer_uptodate(bh)) { + ret = -EIO; + } + /* + * ugly interaction with invalidatepage here. + * reiserfs_invalidate_page will pin any buffer that has a + * valid journal head from an older transaction. If someone + * else sets our buffer dirty after we write it in the first + * loop, and then someone truncates the page away, nobody + * will ever write the buffer. We're safe if we write the + * page one last time after freeing the journal header. + */ + if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { + spin_unlock(lock); + ll_rw_block(WRITE, 1, &bh); + spin_lock(lock); + } + put_bh(bh); + cond_resched_lock(lock); + } + spin_unlock(lock); + return ret; +} + +static int flush_older_commits(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct reiserfs_journal_list *other_jl; + struct reiserfs_journal_list *first_jl; + struct list_head *entry; + unsigned int trans_id = jl->j_trans_id; + unsigned int other_trans_id; + unsigned int first_trans_id; + +find_first: + /* + * first we walk backwards to find the oldest uncommitted transation + */ + first_jl = jl; + entry = jl->j_list.prev; + while (1) { + other_jl = JOURNAL_LIST_ENTRY(entry); + if (entry == &journal->j_journal_list || + atomic_read(&other_jl->j_older_commits_done)) + break; + + first_jl = other_jl; + entry = other_jl->j_list.prev; + } + + /* if we didn't find any older uncommitted transactions, return now */ + if (first_jl == jl) { + return 0; + } + + first_trans_id = first_jl->j_trans_id; + + entry = &first_jl->j_list; + while (1) { + other_jl = JOURNAL_LIST_ENTRY(entry); + other_trans_id = other_jl->j_trans_id; + + if (other_trans_id < trans_id) { + if (atomic_read(&other_jl->j_commit_left) != 0) { + flush_commit_list(s, other_jl, 0); + + /* list we were called with is gone, return */ + if (!journal_list_still_alive(s, trans_id)) + return 1; + + /* + * the one we just flushed is gone, this means + * all older lists are also gone, so first_jl + * is no longer valid either. Go back to the + * beginning. + */ + if (!journal_list_still_alive + (s, other_trans_id)) { + goto find_first; + } + } + entry = entry->next; + if (entry == &journal->j_journal_list) + return 0; + } else { + return 0; + } + } + return 0; +} + +static int reiserfs_async_progress_wait(struct super_block *s) +{ + struct reiserfs_journal *j = SB_JOURNAL(s); + + if (atomic_read(&j->j_async_throttle)) { + int depth; + + depth = reiserfs_write_unlock_nested(s); + congestion_wait(BLK_RW_ASYNC, HZ / 10); + reiserfs_write_lock_nested(s, depth); + } + + return 0; +} + +/* + * if this journal list still has commit blocks unflushed, send them to disk. + * + * log areas must be flushed in order (transaction 2 can't commit before + * transaction 1) Before the commit block can by written, every other log + * block must be safely on disk + */ +static int flush_commit_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall) +{ + int i; + b_blocknr_t bn; + struct buffer_head *tbh = NULL; + unsigned int trans_id = jl->j_trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int retval = 0; + int write_len; + int depth; + + reiserfs_check_lock_depth(s, "flush_commit_list"); + + if (atomic_read(&jl->j_older_commits_done)) { + return 0; + } + + /* + * before we can put our commit blocks on disk, we have to make + * sure everyone older than us is on disk too + */ + BUG_ON(jl->j_len <= 0); + BUG_ON(trans_id == journal->j_trans_id); + + get_journal_list(jl); + if (flushall) { + if (flush_older_commits(s, jl) == 1) { + /* + * list disappeared during flush_older_commits. + * return + */ + goto put_jl; + } + } + + /* make sure nobody is trying to flush this one at the same time */ + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s); + + if (!journal_list_still_alive(s, trans_id)) { + mutex_unlock(&jl->j_commit_mutex); + goto put_jl; + } + BUG_ON(jl->j_trans_id == 0); + + /* this commit is done, exit */ + if (atomic_read(&jl->j_commit_left) <= 0) { + if (flushall) { + atomic_set(&jl->j_older_commits_done, 1); + } + mutex_unlock(&jl->j_commit_mutex); + goto put_jl; + } + + if (!list_empty(&jl->j_bh_list)) { + int ret; + + /* + * We might sleep in numerous places inside + * write_ordered_buffers. Relax the write lock. + */ + depth = reiserfs_write_unlock_nested(s); + ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_bh_list); + if (ret < 0 && retval == 0) + retval = ret; + reiserfs_write_lock_nested(s, depth); + } + BUG_ON(!list_empty(&jl->j_bh_list)); + /* + * for the description block and all the log blocks, submit any buffers + * that haven't already reached the disk. Try to write at least 256 + * log blocks. later on, we will only wait on blocks that correspond + * to this transaction, but while we're unplugging we might as well + * get a chunk of data on there. + */ + atomic_inc(&journal->j_async_throttle); + write_len = jl->j_len + 1; + if (write_len < 256) + write_len = 256; + for (i = 0 ; i < write_len ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % + SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn); + if (tbh) { + if (buffer_dirty(tbh)) { + depth = reiserfs_write_unlock_nested(s); + ll_rw_block(WRITE, 1, &tbh); + reiserfs_write_lock_nested(s, depth); + } + put_bh(tbh) ; + } + } + atomic_dec(&journal->j_async_throttle); + + for (i = 0; i < (jl->j_len + 1); i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn); + + depth = reiserfs_write_unlock_nested(s); + __wait_on_buffer(tbh); + reiserfs_write_lock_nested(s, depth); + /* + * since we're using ll_rw_blk above, it might have skipped + * over a locked buffer. Double check here + */ + /* redundant, sync_dirty_buffer() checks */ + if (buffer_dirty(tbh)) { + depth = reiserfs_write_unlock_nested(s); + sync_dirty_buffer(tbh); + reiserfs_write_lock_nested(s, depth); + } + if (unlikely(!buffer_uptodate(tbh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-601", + "buffer write failed"); +#endif + retval = -EIO; + } + /* once for journal_find_get_block */ + put_bh(tbh); + /* once due to original getblk in do_journal_end */ + put_bh(tbh); + atomic_dec(&jl->j_commit_left); + } + + BUG_ON(atomic_read(&jl->j_commit_left) != 1); + + /* + * If there was a write error in the journal - we can't commit + * this transaction - it will be invalid and, if successful, + * will just end up propagating the write error out to + * the file system. + */ + if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; + depth = reiserfs_write_unlock_nested(s); + if (reiserfs_barrier_flush(s)) + __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + else + sync_dirty_buffer(jl->j_commit_bh); + reiserfs_write_lock_nested(s, depth); + } + + /* + * If there was a write error in the journal - we can't commit this + * transaction - it will be invalid and, if successful, will just end + * up propagating the write error out to the filesystem. + */ + if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-615", "buffer write failed"); +#endif + retval = -EIO; + } + bforget(jl->j_commit_bh); + if (journal->j_last_commit_id != 0 && + (jl->j_trans_id - journal->j_last_commit_id) != 1) { + reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu", + journal->j_last_commit_id, jl->j_trans_id); + } + journal->j_last_commit_id = jl->j_trans_id; + + /* + * now, every commit block is on the disk. It is safe to allow + * blocks freed during this transaction to be reallocated + */ + cleanup_freed_for_journal_list(s, jl); + + retval = retval ? retval : journal->j_errno; + + /* mark the metadata dirty */ + if (!retval) + dirty_one_transaction(s, jl); + atomic_dec(&jl->j_commit_left); + + if (flushall) { + atomic_set(&jl->j_older_commits_done, 1); + } + mutex_unlock(&jl->j_commit_mutex); +put_jl: + put_journal_list(s, jl); + + if (retval) + reiserfs_abort(s, retval, "Journal write error in %s", + __func__); + return retval; +} + +/* + * flush_journal_list frequently needs to find a newer transaction for a + * given block. This does that, or returns NULL if it can't find anything + */ +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct + reiserfs_journal_cnode + *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + + cn = cn->hprev; + while (cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { + return cn->jlist; + } + cn = cn->hprev; + } + return NULL; +} + +static void remove_journal_hash(struct super_block *, + struct reiserfs_journal_cnode **, + struct reiserfs_journal_list *, unsigned long, + int); + +/* + * once all the real blocks have been flushed, it is safe to remove them + * from the journal list for this transaction. Aside from freeing the + * cnode, this also allows the block to be reallocated for data blocks + * if it had been deleted. + */ +static void remove_all_from_journal_list(struct super_block *sb, + struct reiserfs_journal_list *jl, + int debug) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn, *last; + cn = jl->j_realblock; + + /* + * which is better, to lock once around the whole loop, or + * to lock for each call to remove_journal_hash? + */ + while (cn) { + if (cn->blocknr != 0) { + if (debug) { + reiserfs_warning(sb, "reiserfs-2201", + "block %u, bh is %d, state %ld", + cn->blocknr, cn->bh ? 1 : 0, + cn->state); + } + cn->state = 0; + remove_journal_hash(sb, journal->j_list_hash_table, + jl, cn->blocknr, 1); + } + last = cn; + cn = cn->next; + free_cnode(sb, last); + } + jl->j_realblock = NULL; +} + +/* + * if this timestamp is greater than the timestamp we wrote last to the + * header block, write it to the header block. once this is done, I can + * safely say the log area for this transaction won't ever be replayed, + * and I can start releasing blocks in this transaction for reuse as data + * blocks. called by flush_journal_list, before it calls + * remove_all_from_journal_list + */ +static int _update_journal_header_block(struct super_block *sb, + unsigned long offset, + unsigned int trans_id) +{ + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int depth; + + if (reiserfs_is_journal_aborted(journal)) + return -EIO; + + if (trans_id >= journal->j_last_flush_trans_id) { + if (buffer_locked((journal->j_header_bh))) { + depth = reiserfs_write_unlock_nested(sb); + __wait_on_buffer(journal->j_header_bh); + reiserfs_write_lock_nested(sb, depth); + if (unlikely(!buffer_uptodate(journal->j_header_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(sb, "journal-699", + "buffer write failed"); +#endif + return -EIO; + } + } + journal->j_last_flush_trans_id = trans_id; + journal->j_first_unflushed_offset = offset; + jh = (struct reiserfs_journal_header *)(journal->j_header_bh-> + b_data); + jh->j_last_flush_trans_id = cpu_to_le32(trans_id); + jh->j_first_unflushed_offset = cpu_to_le32(offset); + jh->j_mount_id = cpu_to_le32(journal->j_mount_id); + + set_buffer_dirty(journal->j_header_bh); + depth = reiserfs_write_unlock_nested(sb); + + if (reiserfs_barrier_flush(sb)) + __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + else + sync_dirty_buffer(journal->j_header_bh); + + reiserfs_write_lock_nested(sb, depth); + if (!buffer_uptodate(journal->j_header_bh)) { + reiserfs_warning(sb, "journal-837", + "IO error during journal replay"); + return -EIO; + } + } + return 0; +} + +static int update_journal_header_block(struct super_block *sb, + unsigned long offset, + unsigned int trans_id) +{ + return _update_journal_header_block(sb, offset, trans_id); +} + +/* +** flush any and all journal lists older than you are +** can only be called from flush_journal_list +*/ +static int flush_older_journal_lists(struct super_block *sb, + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list *other_jl; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + unsigned int trans_id = jl->j_trans_id; + + /* + * we know we are the only ones flushing things, no extra race + * protection is required. + */ +restart: + entry = journal->j_journal_list.next; + /* Did we wrap? */ + if (entry == &journal->j_journal_list) + return 0; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + BUG_ON(other_jl->j_refcount <= 0); + /* do not flush all */ + flush_journal_list(sb, other_jl, 0); + + /* other_jl is now deleted from the list */ + goto restart; + } + return 0; +} + +static void del_from_work_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + journal->j_num_work_lists--; + } +} + +/* + * flush a journal list, both commit and real blocks + * + * always set flushall to 1, unless you are calling from inside + * flush_journal_list + * + * IMPORTANT. This can only be called while there are no journal writers, + * and the journal is locked. That means it can only be called from + * do_journal_end, or by journal_release + */ +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall) +{ + struct reiserfs_journal_list *pjl; + struct reiserfs_journal_cnode *cn, *last; + int count; + int was_jwait = 0; + int was_dirty = 0; + struct buffer_head *saved_bh; + unsigned long j_len_saved = jl->j_len; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int err = 0; + int depth; + + BUG_ON(j_len_saved <= 0); + + if (atomic_read(&journal->j_wcount) != 0) { + reiserfs_warning(s, "clm-2048", "called with wcount %d", + atomic_read(&journal->j_wcount)); + } + + /* if flushall == 0, the lock is already held */ + if (flushall) { + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); + } else if (mutex_trylock(&journal->j_flush_mutex)) { + BUG(); + } + + count = 0; + if (j_len_saved > journal->j_trans_max) { + reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu", + j_len_saved, jl->j_trans_id); + return 0; + } + + /* if all the work is already done, get out of here */ + if (atomic_read(&jl->j_nonzerolen) <= 0 && + atomic_read(&jl->j_commit_left) <= 0) { + goto flush_older_and_return; + } + + /* + * start by putting the commit list on disk. This will also flush + * the commit lists of any olders transactions + */ + flush_commit_list(s, jl, 1); + + if (!(jl->j_state & LIST_DIRTY) + && !reiserfs_is_journal_aborted(journal)) + BUG(); + + /* are we done now? */ + if (atomic_read(&jl->j_nonzerolen) <= 0 && + atomic_read(&jl->j_commit_left) <= 0) { + goto flush_older_and_return; + } + + /* + * loop through each cnode, see if we need to write it, + * or wait on a more recent transaction, or just ignore it + */ + if (atomic_read(&journal->j_wcount) != 0) { + reiserfs_panic(s, "journal-844", "journal list is flushing, " + "wcount is not 0"); + } + cn = jl->j_realblock; + while (cn) { + was_jwait = 0; + was_dirty = 0; + saved_bh = NULL; + /* blocknr of 0 is no longer in the hash, ignore it */ + if (cn->blocknr == 0) { + goto free_cnode; + } + + /* + * This transaction failed commit. + * Don't write out to the disk + */ + if (!(jl->j_state & LIST_DIRTY)) + goto free_cnode; + + pjl = find_newer_jl_for_cn(cn); + /* + * the order is important here. We check pjl to make sure we + * don't clear BH_JDirty_wait if we aren't the one writing this + * block to disk + */ + if (!pjl && cn->bh) { + saved_bh = cn->bh; + + /* + * we do this to make sure nobody releases the + * buffer while we are working with it + */ + get_bh(saved_bh); + + if (buffer_journal_dirty(saved_bh)) { + BUG_ON(!can_dirty(cn)); + was_jwait = 1; + was_dirty = 1; + } else if (can_dirty(cn)) { + /* + * everything with !pjl && jwait + * should be writable + */ + BUG(); + } + } + + /* + * if someone has this block in a newer transaction, just make + * sure they are committed, and don't try writing it to disk + */ + if (pjl) { + if (atomic_read(&pjl->j_commit_left)) + flush_commit_list(s, pjl, 1); + goto free_cnode; + } + + /* + * bh == NULL when the block got to disk on its own, OR, + * the block got freed in a future transaction + */ + if (saved_bh == NULL) { + goto free_cnode; + } + + /* + * this should never happen. kupdate_one_transaction has + * this list locked while it works, so we should never see a + * buffer here that is not marked JDirty_wait + */ + if ((!was_jwait) && !buffer_locked(saved_bh)) { + reiserfs_warning(s, "journal-813", + "BAD! buffer %llu %cdirty %cjwait, " + "not in a newer tranasction", + (unsigned long long)saved_bh-> + b_blocknr, was_dirty ? ' ' : '!', + was_jwait ? ' ' : '!'); + } + if (was_dirty) { + /* + * we inc again because saved_bh gets decremented + * at free_cnode + */ + get_bh(saved_bh); + set_bit(BLOCK_NEEDS_FLUSH, &cn->state); + lock_buffer(saved_bh); + BUG_ON(cn->blocknr != saved_bh->b_blocknr); + if (buffer_dirty(saved_bh)) + submit_logged_buffer(saved_bh); + else + unlock_buffer(saved_bh); + count++; + } else { + reiserfs_warning(s, "clm-2082", + "Unable to flush buffer %llu in %s", + (unsigned long long)saved_bh-> + b_blocknr, __func__); + } +free_cnode: + last = cn; + cn = cn->next; + if (saved_bh) { + /* + * we incremented this to keep others from + * taking the buffer head away + */ + put_bh(saved_bh); + if (atomic_read(&saved_bh->b_count) < 0) { + reiserfs_warning(s, "journal-945", + "saved_bh->b_count < 0"); + } + } + } + if (count > 0) { + cn = jl->j_realblock; + while (cn) { + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { + if (!cn->bh) { + reiserfs_panic(s, "journal-1011", + "cn->bh is NULL"); + } + + depth = reiserfs_write_unlock_nested(s); + __wait_on_buffer(cn->bh); + reiserfs_write_lock_nested(s, depth); + + if (!cn->bh) { + reiserfs_panic(s, "journal-1012", + "cn->bh is NULL"); + } + if (unlikely(!buffer_uptodate(cn->bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-949", + "buffer write failed"); +#endif + err = -EIO; + } + /* + * note, we must clear the JDirty_wait bit + * after the up to date check, otherwise we + * race against our flushpage routine + */ + BUG_ON(!test_clear_buffer_journal_dirty + (cn->bh)); + + /* drop one ref for us */ + put_bh(cn->bh); + /* drop one ref for journal_mark_dirty */ + release_buffer_page(cn->bh); + } + cn = cn->next; + } + } + + if (err) + reiserfs_abort(s, -EIO, + "Write error while pushing transaction to disk in %s", + __func__); +flush_older_and_return: + + /* + * before we can update the journal header block, we _must_ flush all + * real blocks from all older transactions to disk. This is because + * once the header block is updated, this transaction will not be + * replayed after a crash + */ + if (flushall) { + flush_older_journal_lists(s, jl); + } + + err = journal->j_errno; + /* + * before we can remove everything from the hash tables for this + * transaction, we must make sure it can never be replayed + * + * since we are only called from do_journal_end, we know for sure there + * are no allocations going on while we are flushing journal lists. So, + * we only need to update the journal header block for the last list + * being flushed + */ + if (!err && flushall) { + err = + update_journal_header_block(s, + (jl->j_start + jl->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(s), + jl->j_trans_id); + if (err) + reiserfs_abort(s, -EIO, + "Write error while updating journal header in %s", + __func__); + } + remove_all_from_journal_list(s, jl, 0); + list_del_init(&jl->j_list); + journal->j_num_lists--; + del_from_work_list(s, jl); + + if (journal->j_last_flush_id != 0 && + (jl->j_trans_id - journal->j_last_flush_id) != 1) { + reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu", + journal->j_last_flush_id, jl->j_trans_id); + } + journal->j_last_flush_id = jl->j_trans_id; + + /* + * not strictly required since we are freeing the list, but it should + * help find code using dead lists later on + */ + jl->j_len = 0; + atomic_set(&jl->j_nonzerolen, 0); + jl->j_start = 0; + jl->j_realblock = NULL; + jl->j_commit_bh = NULL; + jl->j_trans_id = 0; + jl->j_state = 0; + put_journal_list(s, jl); + if (flushall) + mutex_unlock(&journal->j_flush_mutex); + return err; +} + +static int write_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl, + struct buffer_chunk *chunk) +{ + struct reiserfs_journal_cnode *cn; + int ret = 0; + + jl->j_state |= LIST_TOUCHED; + del_from_work_list(s, jl); + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { + return 0; + } + + cn = jl->j_realblock; + while (cn) { + /* + * if the blocknr == 0, this has been cleared from the hash, + * skip it + */ + if (cn->blocknr == 0) { + goto next; + } + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { + struct buffer_head *tmp_bh; + /* + * we can race against journal_mark_freed when we try + * to lock_buffer(cn->bh), so we have to inc the buffer + * count, and recheck things after locking + */ + tmp_bh = cn->bh; + get_bh(tmp_bh); + lock_buffer(tmp_bh); + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { + if (!buffer_journal_dirty(tmp_bh) || + buffer_journal_prepared(tmp_bh)) + BUG(); + add_to_chunk(chunk, tmp_bh, NULL, write_chunk); + ret++; + } else { + /* note, cn->bh might be null now */ + unlock_buffer(tmp_bh); + } + put_bh(tmp_bh); + } +next: + cn = cn->next; + cond_resched(); + } + return ret; +} + +/* used by flush_commit_list */ +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal_list *pjl; + int ret = 0; + + jl->j_state |= LIST_DIRTY; + cn = jl->j_realblock; + while (cn) { + /* + * look for a more recent transaction that logged this + * buffer. Only the most recent transaction with a buffer in + * it is allowed to send that buffer to disk + */ + pjl = find_newer_jl_for_cn(cn); + if (!pjl && cn->blocknr && cn->bh + && buffer_journal_dirty(cn->bh)) { + BUG_ON(!can_dirty(cn)); + /* + * if the buffer is prepared, it will either be logged + * or restored. If restored, we need to make sure + * it actually gets marked dirty + */ + clear_buffer_journal_new(cn->bh); + if (buffer_journal_prepared(cn->bh)) { + set_buffer_journal_restore_dirty(cn->bh); + } else { + set_buffer_journal_test(cn->bh); + mark_buffer_dirty(cn->bh); + } + } + cn = cn->next; + } + return ret; +} + +static int kupdate_transactions(struct super_block *s, + struct reiserfs_journal_list *jl, + struct reiserfs_journal_list **next_jl, + unsigned int *next_trans_id, + int num_blocks, int num_trans) +{ + int ret = 0; + int written = 0; + int transactions_flushed = 0; + unsigned int orig_trans_id = jl->j_trans_id; + struct buffer_chunk chunk; + struct list_head *entry; + struct reiserfs_journal *journal = SB_JOURNAL(s); + chunk.nr = 0; + + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); + if (!journal_list_still_alive(s, orig_trans_id)) { + goto done; + } + + /* + * we've got j_flush_mutex held, nobody is going to delete any + * of these lists out from underneath us + */ + while ((num_trans && transactions_flushed < num_trans) || + (!num_trans && written < num_blocks)) { + + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || + atomic_read(&jl->j_commit_left) + || !(jl->j_state & LIST_DIRTY)) { + del_from_work_list(s, jl); + break; + } + ret = write_one_transaction(s, jl, &chunk); + + if (ret < 0) + goto done; + transactions_flushed++; + written += ret; + entry = jl->j_list.next; + + /* did we wrap? */ + if (entry == &journal->j_journal_list) { + break; + } + jl = JOURNAL_LIST_ENTRY(entry); + + /* don't bother with older transactions */ + if (jl->j_trans_id <= orig_trans_id) + break; + } + if (chunk.nr) { + write_chunk(&chunk); + } + +done: + mutex_unlock(&journal->j_flush_mutex); + return ret; +} + +/* + * for o_sync and fsync heavy applications, they tend to use + * all the journa list slots with tiny transactions. These + * trigger lots and lots of calls to update the header block, which + * adds seeks and slows things down. + * + * This function tries to clear out a large chunk of the journal lists + * at once, which makes everything faster since only the newest journal + * list updates the header block + */ +static int flush_used_journal_lists(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + unsigned long len = 0; + unsigned long cur_len; + int ret; + int i; + int limit = 256; + struct reiserfs_journal_list *tjl; + struct reiserfs_journal_list *flush_jl; + unsigned int trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(s); + + flush_jl = tjl = jl; + + /* in data logging mode, try harder to flush a lot of blocks */ + if (reiserfs_data_log(s)) + limit = 1024; + /* flush for 256 transactions or limit blocks, whichever comes first */ + for (i = 0; i < 256 && len < limit; i++) { + if (atomic_read(&tjl->j_commit_left) || + tjl->j_trans_id < jl->j_trans_id) { + break; + } + cur_len = atomic_read(&tjl->j_nonzerolen); + if (cur_len > 0) { + tjl->j_state &= ~LIST_TOUCHED; + } + len += cur_len; + flush_jl = tjl; + if (tjl->j_list.next == &journal->j_journal_list) + break; + tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); + } + get_journal_list(jl); + get_journal_list(flush_jl); + + /* + * try to find a group of blocks we can flush across all the + * transactions, but only bother if we've actually spanned + * across multiple lists + */ + if (flush_jl != jl) { + ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + } + flush_journal_list(s, flush_jl, 1); + put_journal_list(s, flush_jl); + put_journal_list(s, jl); + return 0; +} + +/* + * removes any nodes in table with name block and dev as bh. + * only touchs the hnext and hprev pointers. + */ +void remove_journal_hash(struct super_block *sb, + struct reiserfs_journal_cnode **table, + struct reiserfs_journal_list *jl, + unsigned long block, int remove_freed) +{ + struct reiserfs_journal_cnode *cur; + struct reiserfs_journal_cnode **head; + + head = &(journal_hash(table, sb, block)); + if (!head) { + return; + } + cur = *head; + while (cur) { + if (cur->blocknr == block && cur->sb == sb + && (jl == NULL || jl == cur->jlist) + && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { + if (cur->hnext) { + cur->hnext->hprev = cur->hprev; + } + if (cur->hprev) { + cur->hprev->hnext = cur->hnext; + } else { + *head = cur->hnext; + } + cur->blocknr = 0; + cur->sb = NULL; + cur->state = 0; + /* + * anybody who clears the cur->bh will also + * dec the nonzerolen + */ + if (cur->bh && cur->jlist) + atomic_dec(&cur->jlist->j_nonzerolen); + cur->bh = NULL; + cur->jlist = NULL; + } + cur = cur->hnext; + } +} + +static void free_journal_ram(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + kfree(journal->j_current_jl); + journal->j_num_lists--; + + vfree(journal->j_cnode_free_orig); + free_list_bitmaps(sb, journal->j_list_bitmap); + free_bitmap_nodes(sb); /* must be after free_list_bitmaps */ + if (journal->j_header_bh) { + brelse(journal->j_header_bh); + } + /* + * j_header_bh is on the journal dev, make sure + * not to release the journal dev until we brelse j_header_bh + */ + release_journal_dev(sb, journal); + vfree(journal); +} + +/* + * call on unmount. Only set error to 1 if you haven't made your way out + * of read_super() yet. Any other caller must keep error at 0. + */ +static int do_journal_release(struct reiserfs_transaction_handle *th, + struct super_block *sb, int error) +{ + struct reiserfs_transaction_handle myth; + int flushed = 0; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + /* + * we only want to flush out transactions if we were + * called with error == 0 + */ + if (!error && !(sb->s_flags & MS_RDONLY)) { + /* end the current trans */ + BUG_ON(!th->t_trans_id); + do_journal_end(th, FLUSH_ALL); + + /* + * make sure something gets logged to force + * our way into the flush code + */ + if (!journal_join(&myth, sb)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, FLUSH_ALL); + flushed = 1; + } + } + + /* this also catches errors during the do_journal_end above */ + if (!error && reiserfs_is_journal_aborted(journal)) { + memset(&myth, 0, sizeof(myth)); + if (!journal_join_abort(&myth, sb)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, FLUSH_ALL); + } + } + + + /* + * We must release the write lock here because + * the workqueue job (flush_async_commit) needs this lock + */ + reiserfs_write_unlock(sb); + + /* + * Cancel flushing of old commits. Note that neither of these works + * will be requeued because superblock is being shutdown and doesn't + * have MS_ACTIVE set. + */ + cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); + /* wait for all commits to finish */ + cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); + + free_journal_ram(sb); + + reiserfs_write_lock(sb); + + return 0; +} + +/* * call on unmount. flush all journal trans, release all alloc'd ram */ +int journal_release(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + return do_journal_release(th, sb, 0); +} + +/* only call from an error condition inside reiserfs_read_super! */ +int journal_release_error(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + return do_journal_release(th, sb, 1); +} + +/* + * compares description block with commit block. + * returns 1 if they differ, 0 if they are the same + */ +static int journal_compare_desc_commit(struct super_block *sb, + struct reiserfs_journal_desc *desc, + struct reiserfs_journal_commit *commit) +{ + if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || + get_commit_trans_len(commit) != get_desc_trans_len(desc) || + get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max || + get_commit_trans_len(commit) <= 0) { + return 1; + } + return 0; +} + +/* + * returns 0 if it did not find a description block + * returns -1 if it found a corrupt commit block + * returns 1 if both desc and commit were valid + * NOTE: only called during fs mount + */ +static int journal_transaction_is_valid(struct super_block *sb, + struct buffer_head *d_bh, + unsigned int *oldest_invalid_trans_id, + unsigned long *newest_mount_id) +{ + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; + unsigned long offset; + + if (!d_bh) + return 0; + + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (get_desc_trans_len(desc) > 0 + && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { + if (oldest_invalid_trans_id && *oldest_invalid_trans_id + && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-986: transaction " + "is valid returning because trans_id %d is greater than " + "oldest_invalid %lu", + get_desc_trans_id(desc), + *oldest_invalid_trans_id); + return 0; + } + if (newest_mount_id + && *newest_mount_id > get_desc_mount_id(desc)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1087: transaction " + "is valid returning because mount_id %d is less than " + "newest_mount_id %lu", + get_desc_mount_id(desc), + *newest_mount_id); + return -1; + } + if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) { + reiserfs_warning(sb, "journal-2018", + "Bad transaction length %d " + "encountered, ignoring transaction", + get_desc_trans_len(desc)); + return -1; + } + offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + + /* + * ok, we have a journal description block, + * let's see if the transaction was valid + */ + c_bh = + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((offset + get_desc_trans_len(desc) + + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); + if (!c_bh) + return 0; + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + if (journal_compare_desc_commit(sb, desc, commit)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal_transaction_is_valid, commit offset %ld had bad " + "time %d or length %d", + c_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_commit_trans_id(commit), + get_commit_trans_len(commit)); + brelse(c_bh); + if (oldest_invalid_trans_id) { + *oldest_invalid_trans_id = + get_desc_trans_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1004: " + "transaction_is_valid setting oldest invalid trans_id " + "to %d", + get_desc_trans_id(desc)); + } + return -1; + } + brelse(c_bh); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1006: found valid " + "transaction start offset %llu, len %d id %d", + d_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_desc_trans_len(desc), + get_desc_trans_id(desc)); + return 1; + } else { + return 0; + } +} + +static void brelse_array(struct buffer_head **heads, int num) +{ + int i; + for (i = 0; i < num; i++) { + brelse(heads[i]); + } +} + +/* + * given the start, and values for the oldest acceptable transactions, + * this either reads in a replays a transaction, or returns because the + * transaction is invalid, or too old. + * NOTE: only called during fs mount + */ +static int journal_read_transaction(struct super_block *sb, + unsigned long cur_dblock, + unsigned long oldest_start, + unsigned int oldest_trans_id, + unsigned long newest_mount_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + unsigned int trans_id = 0; + struct buffer_head *c_bh; + struct buffer_head *d_bh; + struct buffer_head **log_blocks = NULL; + struct buffer_head **real_blocks = NULL; + unsigned int trans_offset; + int i; + int trans_half; + + d_bh = journal_bread(sb, cur_dblock); + if (!d_bh) + return 1; + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: " + "journal_read_transaction, offset %llu, len %d mount_id %d", + d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_desc_trans_len(desc), get_desc_mount_id(desc)); + if (get_desc_trans_id(desc) < oldest_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: " + "journal_read_trans skipping because %lu is too old", + cur_dblock - + SB_ONDISK_JOURNAL_1st_BLOCK(sb)); + brelse(d_bh); + return 1; + } + if (get_desc_mount_id(desc) != newest_mount_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: " + "journal_read_trans skipping because %d is != " + "newest_mount_id %lu", get_desc_mount_id(desc), + newest_mount_id); + brelse(d_bh); + return 1; + } + c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((trans_offset + get_desc_trans_len(desc) + 1) % + SB_ONDISK_JOURNAL_SIZE(sb))); + if (!c_bh) { + brelse(d_bh); + return 1; + } + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + if (journal_compare_desc_commit(sb, desc, commit)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal_read_transaction, " + "commit offset %llu had bad time %d or length %d", + c_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_commit_trans_id(commit), + get_commit_trans_len(commit)); + brelse(c_bh); + brelse(d_bh); + return 1; + } + + if (bdev_read_only(sb->s_bdev)) { + reiserfs_warning(sb, "clm-2076", + "device is readonly, unable to replay log"); + brelse(c_bh); + brelse(d_bh); + return -EROFS; + } + + trans_id = get_desc_trans_id(desc); + /* + * now we know we've got a good transaction, and it was + * inside the valid time ranges + */ + log_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); + real_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); + if (!log_blocks || !real_blocks) { + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + reiserfs_warning(sb, "journal-1169", + "kmalloc failed, unable to mount FS"); + return -1; + } + /* get all the buffer heads */ + trans_half = journal_trans_half(sb->s_blocksize); + for (i = 0; i < get_desc_trans_len(desc); i++) { + log_blocks[i] = + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + (trans_offset + 1 + + i) % SB_ONDISK_JOURNAL_SIZE(sb)); + if (i < trans_half) { + real_blocks[i] = + sb_getblk(sb, + le32_to_cpu(desc->j_realblock[i])); + } else { + real_blocks[i] = + sb_getblk(sb, + le32_to_cpu(commit-> + j_realblock[i - trans_half])); + } + if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) { + reiserfs_warning(sb, "journal-1207", + "REPLAY FAILURE fsck required! " + "Block to replay is outside of " + "filesystem"); + goto abort_replay; + } + /* make sure we don't try to replay onto log or reserved area */ + if (is_block_in_log_or_reserved_area + (sb, real_blocks[i]->b_blocknr)) { + reiserfs_warning(sb, "journal-1204", + "REPLAY FAILURE fsck required! " + "Trying to replay onto a log block"); +abort_replay: + brelse_array(log_blocks, i); + brelse_array(real_blocks, i); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + } + /* read in the log blocks, memcpy to the corresponding real block */ + ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); + for (i = 0; i < get_desc_trans_len(desc); i++) { + + wait_on_buffer(log_blocks[i]); + if (!buffer_uptodate(log_blocks[i])) { + reiserfs_warning(sb, "journal-1212", + "REPLAY FAILURE fsck required! " + "buffer write failed"); + brelse_array(log_blocks + i, + get_desc_trans_len(desc) - i); + brelse_array(real_blocks, get_desc_trans_len(desc)); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, + real_blocks[i]->b_size); + set_buffer_uptodate(real_blocks[i]); + brelse(log_blocks[i]); + } + /* flush out the real blocks */ + for (i = 0; i < get_desc_trans_len(desc); i++) { + set_buffer_dirty(real_blocks[i]); + write_dirty_buffer(real_blocks[i], WRITE); + } + for (i = 0; i < get_desc_trans_len(desc); i++) { + wait_on_buffer(real_blocks[i]); + if (!buffer_uptodate(real_blocks[i])) { + reiserfs_warning(sb, "journal-1226", + "REPLAY FAILURE, fsck required! " + "buffer write failed"); + brelse_array(real_blocks + i, + get_desc_trans_len(desc) - i); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + brelse(real_blocks[i]); + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((trans_offset + get_desc_trans_len(desc) + + 2) % SB_ONDISK_JOURNAL_SIZE(sb)); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1095: setting journal " "start to offset %ld", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); + + /* + * init starting values for the first transaction, in case + * this is the last transaction to be replayed. + */ + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + journal->j_last_flush_trans_id = trans_id; + journal->j_trans_id = trans_id + 1; + /* check for trans_id overflow */ + if (journal->j_trans_id == 0) + journal->j_trans_id = 10; + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return 0; +} + +/* + * This function reads blocks starting from block and to max_block of bufsize + * size (but no more than BUFNR blocks at a time). This proved to improve + * mounting speed on self-rebuilding raid5 arrays at least. + * Right now it is only used from journal code. But later we might use it + * from other places. + * Note: Do not use journal_getblk/sb_getblk functions here! + */ +static struct buffer_head *reiserfs_breada(struct block_device *dev, + b_blocknr_t block, int bufsize, + b_blocknr_t max_block) +{ + struct buffer_head *bhlist[BUFNR]; + unsigned int blocks = BUFNR; + struct buffer_head *bh; + int i, j; + + bh = __getblk(dev, block, bufsize); + if (buffer_uptodate(bh)) + return (bh); + + if (block + BUFNR > max_block) { + blocks = max_block - block; + } + bhlist[0] = bh; + j = 1; + for (i = 1; i < blocks; i++) { + bh = __getblk(dev, block + i, bufsize); + if (buffer_uptodate(bh)) { + brelse(bh); + break; + } else + bhlist[j++] = bh; + } + ll_rw_block(READ, j, bhlist); + for (i = 1; i < j; i++) + brelse(bhlist[i]); + bh = bhlist[0]; + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + brelse(bh); + return NULL; +} + +/* + * read and replay the log + * on a clean unmount, the journal header's next unflushed pointer will be + * to an invalid transaction. This tests that before finding all the + * transactions in the log, which makes normal mount times fast. + * + * After a crash, this starts with the next unflushed transaction, and + * replays until it finds one too old, or invalid. + * + * On exit, it sets things up so the first transaction will work correctly. + * NOTE: only called during fs mount + */ +static int journal_read(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_desc *desc; + unsigned int oldest_trans_id = 0; + unsigned int oldest_invalid_trans_id = 0; + time_t start; + unsigned long oldest_start = 0; + unsigned long cur_dblock = 0; + unsigned long newest_mount_id = 9; + struct buffer_head *d_bh; + struct reiserfs_journal_header *jh; + int valid_journal_header = 0; + int replay_count = 0; + int continue_replay = 1; + int ret; + char b[BDEVNAME_SIZE]; + + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); + reiserfs_info(sb, "checking transaction log (%s)\n", + bdevname(journal->j_dev_bd, b)); + start = get_seconds(); + + /* + * step 1, read in the journal header block. Check the transaction + * it says is the first unflushed, and if that transaction is not + * valid, replay is done + */ + journal->j_header_bh = journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + if (!journal->j_header_bh) { + return 1; + } + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); + if (le32_to_cpu(jh->j_first_unflushed_offset) < + SB_ONDISK_JOURNAL_SIZE(sb) + && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { + oldest_start = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + le32_to_cpu(jh->j_first_unflushed_offset); + oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; + newest_mount_id = le32_to_cpu(jh->j_mount_id); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1153: found in " + "header: first_unflushed_offset %d, last_flushed_trans_id " + "%lu", le32_to_cpu(jh->j_first_unflushed_offset), + le32_to_cpu(jh->j_last_flush_trans_id)); + valid_journal_header = 1; + + /* + * now, we try to read the first unflushed offset. If it + * is not valid, there is nothing more we can do, and it + * makes no sense to read through the whole log. + */ + d_bh = + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + le32_to_cpu(jh->j_first_unflushed_offset)); + ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL); + if (!ret) { + continue_replay = 0; + } + brelse(d_bh); + goto start_log_replay; + } + + /* + * ok, there are transactions that need to be replayed. start + * with the first log block, find all the valid transactions, and + * pick out the oldest. + */ + while (continue_replay + && cur_dblock < + (SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb))) { + /* + * Note that it is required for blocksize of primary fs + * device and journal device to be the same + */ + d_bh = + reiserfs_breada(journal->j_dev_bd, cur_dblock, + sb->s_blocksize, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + ret = + journal_transaction_is_valid(sb, d_bh, + &oldest_invalid_trans_id, + &newest_mount_id); + if (ret == 1) { + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (oldest_start == 0) { /* init all oldest_ values */ + oldest_trans_id = get_desc_trans_id(desc); + oldest_start = d_bh->b_blocknr; + newest_mount_id = get_desc_mount_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1179: Setting " + "oldest_start to offset %llu, trans_id %lu", + oldest_start - + SB_ONDISK_JOURNAL_1st_BLOCK + (sb), oldest_trans_id); + } else if (oldest_trans_id > get_desc_trans_id(desc)) { + /* one we just read was older */ + oldest_trans_id = get_desc_trans_id(desc); + oldest_start = d_bh->b_blocknr; + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1180: Resetting " + "oldest_start to offset %lu, trans_id %lu", + oldest_start - + SB_ONDISK_JOURNAL_1st_BLOCK + (sb), oldest_trans_id); + } + if (newest_mount_id < get_desc_mount_id(desc)) { + newest_mount_id = get_desc_mount_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1299: Setting " + "newest_mount_id to %d", + get_desc_mount_id(desc)); + } + cur_dblock += get_desc_trans_len(desc) + 2; + } else { + cur_dblock++; + } + brelse(d_bh); + } + +start_log_replay: + cur_dblock = oldest_start; + if (oldest_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1206: Starting replay " + "from offset %llu, trans_id %lu", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb), + oldest_trans_id); + + } + replay_count = 0; + while (continue_replay && oldest_trans_id > 0) { + ret = + journal_read_transaction(sb, cur_dblock, oldest_start, + oldest_trans_id, newest_mount_id); + if (ret < 0) { + return ret; + } else if (ret != 0) { + break; + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start; + replay_count++; + if (cur_dblock == oldest_start) + break; + } + + if (oldest_trans_id == 0) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1225: No valid " "transactions found"); + } + /* + * j_start does not get set correctly if we don't replay any + * transactions. if we had a valid journal_header, set j_start + * to the first unflushed transaction value, copy the trans_id + * from the header + */ + if (valid_journal_header && replay_count == 0) { + journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); + journal->j_trans_id = + le32_to_cpu(jh->j_last_flush_trans_id) + 1; + /* check for trans_id overflow */ + if (journal->j_trans_id == 0) + journal->j_trans_id = 10; + journal->j_last_flush_trans_id = + le32_to_cpu(jh->j_last_flush_trans_id); + journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; + } else { + journal->j_mount_id = newest_mount_id + 1; + } + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " + "newest_mount_id to %lu", journal->j_mount_id); + journal->j_first_unflushed_offset = journal->j_start; + if (replay_count > 0) { + reiserfs_info(sb, + "replayed %d transactions in %lu seconds\n", + replay_count, get_seconds() - start); + } + /* needed to satisfy the locking in _update_journal_header_block */ + reiserfs_write_lock(sb); + if (!bdev_read_only(sb->s_bdev) && + _update_journal_header_block(sb, journal->j_start, + journal->j_last_flush_trans_id)) { + reiserfs_write_unlock(sb); + /* + * replay failed, caller must call free_journal_ram and abort + * the mount + */ + return -1; + } + reiserfs_write_unlock(sb); + return 0; +} + +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) +{ + struct reiserfs_journal_list *jl; + jl = kzalloc(sizeof(struct reiserfs_journal_list), + GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&jl->j_list); + INIT_LIST_HEAD(&jl->j_working_list); + INIT_LIST_HEAD(&jl->j_tail_bh_list); + INIT_LIST_HEAD(&jl->j_bh_list); + mutex_init(&jl->j_commit_mutex); + SB_JOURNAL(s)->j_num_lists++; + get_journal_list(jl); + return jl; +} + +static void journal_list_init(struct super_block *sb) +{ + SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); +} + +static void release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal) +{ + if (journal->j_dev_bd != NULL) { + blkdev_put(journal->j_dev_bd, journal->j_dev_mode); + journal->j_dev_bd = NULL; + } +} + +static int journal_init_dev(struct super_block *super, + struct reiserfs_journal *journal, + const char *jdev_name) +{ + int result; + dev_t jdev; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; + char b[BDEVNAME_SIZE]; + + result = 0; + + journal->j_dev_bd = NULL; + jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? + new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; + + if (bdev_read_only(super->s_bdev)) + blkdev_mode = FMODE_READ; + + /* there is no "jdev" option and journal is on separate device */ + if ((!jdev_name || !jdev_name[0])) { + if (jdev == super->s_dev) + blkdev_mode &= ~FMODE_EXCL; + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, + journal); + journal->j_dev_mode = blkdev_mode; + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; + reiserfs_warning(super, "sh-458", + "cannot init journal device '%s': %i", + __bdevname(jdev, b), result); + return result; + } else if (jdev != super->s_dev) + set_blocksize(journal->j_dev_bd, super->s_blocksize); + + return 0; + } + + journal->j_dev_mode = blkdev_mode; + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; + reiserfs_warning(super, + "journal_init_dev: Cannot open '%s': %i", + jdev_name, result); + return result; + } + + set_blocksize(journal->j_dev_bd, super->s_blocksize); + reiserfs_info(super, + "journal_init_dev: journal device: %s\n", + bdevname(journal->j_dev_bd, b)); + return 0; +} + +/* + * When creating/tuning a file system user can assign some + * journal params within boundaries which depend on the ratio + * blocksize/standard_blocksize. + * + * For blocks >= standard_blocksize transaction size should + * be not less then JOURNAL_TRANS_MIN_DEFAULT, and not more + * then JOURNAL_TRANS_MAX_DEFAULT. + * + * For blocks < standard_blocksize these boundaries should be + * decreased proportionally. + */ +#define REISERFS_STANDARD_BLKSIZE (4096) + +static int check_advise_trans_params(struct super_block *sb, + struct reiserfs_journal *journal) +{ + if (journal->j_trans_max) { + /* Non-default journal params. Do sanity check for them. */ + int ratio = 1; + if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) + ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize; + + if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio || + journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio || + SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max < + JOURNAL_MIN_RATIO) { + reiserfs_warning(sb, "sh-462", + "bad transaction max size (%u). " + "FSCK?", journal->j_trans_max); + return 1; + } + if (journal->j_max_batch != (journal->j_trans_max) * + JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) { + reiserfs_warning(sb, "sh-463", + "bad transaction max batch (%u). " + "FSCK?", journal->j_max_batch); + return 1; + } + } else { + /* + * Default journal params. + * The file system was created by old version + * of mkreiserfs, so some fields contain zeros, + * and we need to advise proper values for them + */ + if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { + reiserfs_warning(sb, "sh-464", "bad blocksize (%u)", + sb->s_blocksize); + return 1; + } + journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; + journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; + journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; + } + return 0; +} + +/* must be called once on fs mount. calls journal_read for you */ +int journal_init(struct super_block *sb, const char *j_dev_name, + int old_format, unsigned int commit_max_age) +{ + int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2; + struct buffer_head *bhjh; + struct reiserfs_super_block *rs; + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal; + struct reiserfs_journal_list *jl; + char b[BDEVNAME_SIZE]; + int ret; + + journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); + if (!journal) { + reiserfs_warning(sb, "journal-1256", + "unable to get memory for journal structure"); + return 1; + } + INIT_LIST_HEAD(&journal->j_bitmap_nodes); + INIT_LIST_HEAD(&journal->j_prealloc_list); + INIT_LIST_HEAD(&journal->j_working_list); + INIT_LIST_HEAD(&journal->j_journal_list); + journal->j_persistent_trans = 0; + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb))) + goto free_and_return; + + allocate_bitmap_nodes(sb); + + /* reserved for journal area support */ + SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ? + REISERFS_OLD_DISK_OFFSET_IN_BYTES + / sb->s_blocksize + + reiserfs_bmap_count(sb) + + 1 : + REISERFS_DISK_OFFSET_IN_BYTES / + sb->s_blocksize + 2); + + /* + * Sanity check to see is the standard journal fitting + * within first bitmap (actual for small blocksizes) + */ + if (!SB_ONDISK_JOURNAL_DEVICE(sb) && + (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) { + reiserfs_warning(sb, "journal-1393", + "journal does not fit for area addressed " + "by first of bitmap blocks. It starts at " + "%u and its size is %u. Block size %ld", + SB_JOURNAL_1st_RESERVED_BLOCK(sb), + SB_ONDISK_JOURNAL_SIZE(sb), + sb->s_blocksize); + goto free_and_return; + } + + if (journal_init_dev(sb, journal, j_dev_name) != 0) { + reiserfs_warning(sb, "sh-462", + "unable to initialize journal device"); + goto free_and_return; + } + + rs = SB_DISK_SUPER_BLOCK(sb); + + /* read journal header */ + bhjh = journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + if (!bhjh) { + reiserfs_warning(sb, "sh-459", + "unable to read journal header"); + goto free_and_return; + } + jh = (struct reiserfs_journal_header *)(bhjh->b_data); + + /* make sure that journal matches to the super block */ + if (is_reiserfs_jr(rs) + && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != + sb_jp_journal_magic(rs))) { + reiserfs_warning(sb, "sh-460", + "journal header magic %x (device %s) does " + "not match to magic found in super block %x", + jh->jh_journal.jp_journal_magic, + bdevname(journal->j_dev_bd, b), + sb_jp_journal_magic(rs)); + brelse(bhjh); + goto free_and_return; + } + + journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max); + journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch); + journal->j_max_commit_age = + le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + + if (check_advise_trans_params(sb, journal) != 0) + goto free_and_return; + journal->j_default_max_commit_age = journal->j_max_commit_age; + + if (commit_max_age != 0) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } + + reiserfs_info(sb, "journal params: device %s, size %u, " + "journal first block %u, max trans len %u, max batch %u, " + "max commit age %u, max trans age %u\n", + bdevname(journal->j_dev_bd, b), + SB_ONDISK_JOURNAL_SIZE(sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + journal->j_trans_max, + journal->j_max_batch, + journal->j_max_commit_age, journal->j_max_trans_age); + + brelse(bhjh); + + journal->j_list_bitmap_index = 0; + journal_list_init(sb); + + memset(journal->j_list_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); + + INIT_LIST_HEAD(&journal->j_dirty_buffers); + spin_lock_init(&journal->j_dirty_buffers_lock); + + journal->j_start = 0; + journal->j_len = 0; + journal->j_len_alloc = 0; + atomic_set(&journal->j_wcount, 0); + atomic_set(&journal->j_async_throttle, 0); + journal->j_bcount = 0; + journal->j_trans_start_time = 0; + journal->j_last = NULL; + journal->j_first = NULL; + init_waitqueue_head(&journal->j_join_wait); + mutex_init(&journal->j_mutex); + mutex_init(&journal->j_flush_mutex); + + journal->j_trans_id = 10; + journal->j_mount_id = 10; + journal->j_state = 0; + atomic_set(&journal->j_jlock, 0); + journal->j_cnode_free_list = allocate_cnodes(num_cnodes); + journal->j_cnode_free_orig = journal->j_cnode_free_list; + journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; + journal->j_cnode_used = 0; + journal->j_must_wait = 0; + + if (journal->j_cnode_free == 0) { + reiserfs_warning(sb, "journal-2004", "Journal cnode memory " + "allocation failed (%ld bytes). Journal is " + "too large for available memory. Usually " + "this is due to a journal that is too large.", + sizeof (struct reiserfs_journal_cnode) * num_cnodes); + goto free_and_return; + } + + init_journal_hash(sb); + jl = journal->j_current_jl; + + /* + * get_list_bitmap() may call flush_commit_list() which + * requires the lock. Calling flush_commit_list() shouldn't happen + * this early but I like to be paranoid. + */ + reiserfs_write_lock(sb); + jl->j_list_bitmap = get_list_bitmap(sb, jl); + reiserfs_write_unlock(sb); + if (!jl->j_list_bitmap) { + reiserfs_warning(sb, "journal-2005", + "get_list_bitmap failed for journal list 0"); + goto free_and_return; + } + + ret = journal_read(sb); + if (ret < 0) { + reiserfs_warning(sb, "reiserfs-2006", + "Replay Failure, unable to mount"); + goto free_and_return; + } + + INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); + journal->j_work_sb = sb; + return 0; +free_and_return: + free_journal_ram(sb); + return 1; +} + +/* + * test for a polite end of the current transaction. Used by file_write, + * and should be used by delete to make sure they don't write more than + * can fit inside a single transaction + */ +int journal_transaction_should_end(struct reiserfs_transaction_handle *th, + int new_alloc) +{ + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); + time_t now = get_seconds(); + /* cannot restart while nested */ + BUG_ON(!th->t_trans_id); + if (th->t_refcount > 1) + return 0; + if (journal->j_must_wait > 0 || + (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || + atomic_read(&journal->j_jlock) || + (now - journal->j_trans_start_time) > journal->j_max_trans_age || + journal->j_cnode_free < (journal->j_trans_max * 3)) { + return 1; + } + + journal->j_len_alloc += new_alloc; + th->t_blocks_allocated += new_alloc ; + return 0; +} + +/* this must be called inside a transaction */ +void reiserfs_block_writes(struct reiserfs_transaction_handle *th) +{ + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); + BUG_ON(!th->t_trans_id); + journal->j_must_wait = 1; + set_bit(J_WRITERS_BLOCKED, &journal->j_state); + return; +} + +/* this must be called without a transaction started */ +void reiserfs_allow_writes(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + clear_bit(J_WRITERS_BLOCKED, &journal->j_state); + wake_up(&journal->j_join_wait); +} + +/* this must be called without a transaction started */ +void reiserfs_wait_on_write_block(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + wait_event(journal->j_join_wait, + !test_bit(J_WRITERS_BLOCKED, &journal->j_state)); +} + +static void queue_log_writer(struct super_block *s) +{ + wait_queue_t wait; + struct reiserfs_journal *journal = SB_JOURNAL(s); + set_bit(J_WRITERS_QUEUED, &journal->j_state); + + /* + * we don't want to use wait_event here because + * we only want to wait once. + */ + init_waitqueue_entry(&wait, current); + add_wait_queue(&journal->j_join_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { + int depth = reiserfs_write_unlock_nested(s); + schedule(); + reiserfs_write_lock_nested(s, depth); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(&journal->j_join_wait, &wait); +} + +static void wake_queued_writers(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) + wake_up(&journal->j_join_wait); +} + +static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + unsigned long bcount = journal->j_bcount; + while (1) { + int depth; + + depth = reiserfs_write_unlock_nested(sb); + schedule_timeout_uninterruptible(1); + reiserfs_write_lock_nested(sb, depth); + + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; + while ((atomic_read(&journal->j_wcount) > 0 || + atomic_read(&journal->j_jlock)) && + journal->j_trans_id == trans_id) { + queue_log_writer(sb); + } + if (journal->j_trans_id != trans_id) + break; + if (bcount == journal->j_bcount) + break; + bcount = journal->j_bcount; + } +} + +/* + * join == true if you must join an existing transaction. + * join == false if you can deal with waiting for others to finish + * + * this will block until the transaction is joinable. send the number of + * blocks you expect to use in nblocks. +*/ +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks, + int join) +{ + time_t now = get_seconds(); + unsigned int old_trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_transaction_handle myth; + int sched_count = 0; + int retval; + int depth; + + reiserfs_check_lock_depth(sb, "journal_begin"); + BUG_ON(nblocks > journal->j_trans_max); + + PROC_INFO_INC(sb, journal.journal_being); + /* set here for journal_join */ + th->t_refcount = 1; + th->t_super = sb; + +relock: + lock_journal(sb); + if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { + unlock_journal(sb); + retval = journal->j_errno; + goto out_fail; + } + journal->j_bcount++; + + if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { + unlock_journal(sb); + depth = reiserfs_write_unlock_nested(sb); + reiserfs_wait_on_write_block(sb); + reiserfs_write_lock_nested(sb, depth); + PROC_INFO_INC(sb, journal.journal_relock_writers); + goto relock; + } + now = get_seconds(); + + /* + * if there is no room in the journal OR + * if this transaction is too old, and we weren't called joinable, + * wait for it to finish before beginning we don't sleep if there + * aren't other writers + */ + + if ((!join && journal->j_must_wait > 0) || + (!join + && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) + || (!join && atomic_read(&journal->j_wcount) > 0 + && journal->j_trans_start_time > 0 + && (now - journal->j_trans_start_time) > + journal->j_max_trans_age) || (!join + && atomic_read(&journal->j_jlock)) + || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { + + old_trans_id = journal->j_trans_id; + /* allow others to finish this transaction */ + unlock_journal(sb); + + if (!join && (journal->j_len_alloc + nblocks + 2) >= + journal->j_max_batch && + ((journal->j_len + nblocks + 2) * 100) < + (journal->j_len_alloc * 75)) { + if (atomic_read(&journal->j_wcount) > 10) { + sched_count++; + queue_log_writer(sb); + goto relock; + } + } + /* + * don't mess with joining the transaction if all we + * have to do is wait for someone else to do a commit + */ + if (atomic_read(&journal->j_jlock)) { + while (journal->j_trans_id == old_trans_id && + atomic_read(&journal->j_jlock)) { + queue_log_writer(sb); + } + goto relock; + } + retval = journal_join(&myth, sb); + if (retval) + goto out_fail; + + /* someone might have ended the transaction while we joined */ + if (old_trans_id != journal->j_trans_id) { + retval = do_journal_end(&myth, 0); + } else { + retval = do_journal_end(&myth, COMMIT_NOW); + } + + if (retval) + goto out_fail; + + PROC_INFO_INC(sb, journal.journal_relock_wcount); + goto relock; + } + /* we are the first writer, set trans_id */ + if (journal->j_trans_start_time == 0) { + journal->j_trans_start_time = get_seconds(); + } + atomic_inc(&journal->j_wcount); + journal->j_len_alloc += nblocks; + th->t_blocks_logged = 0; + th->t_blocks_allocated = nblocks; + th->t_trans_id = journal->j_trans_id; + unlock_journal(sb); + INIT_LIST_HEAD(&th->t_list); + return 0; + +out_fail: + memset(th, 0, sizeof(*th)); + /* + * Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later + */ + th->t_super = sb; + return retval; +} + +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct + super_block + *s, + int nblocks) +{ + int ret; + struct reiserfs_transaction_handle *th; + + /* + * if we're nesting into an existing transaction. It will be + * persistent on its own + */ + if (reiserfs_transaction_running(s)) { + th = current->journal_info; + th->t_refcount++; + BUG_ON(th->t_refcount < 2); + + return th; + } + th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); + if (!th) + return NULL; + ret = journal_begin(th, s, nblocks); + if (ret) { + kfree(th); + return NULL; + } + + SB_JOURNAL(s)->j_persistent_trans++; + return th; +} + +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) +{ + struct super_block *s = th->t_super; + int ret = 0; + if (th->t_trans_id) + ret = journal_end(th); + else + ret = -EIO; + if (th->t_refcount == 0) { + SB_JOURNAL(s)->j_persistent_trans--; + kfree(th); + } + return ret; +} + +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* + * this keeps do_journal_end from NULLing out the + * current->journal_info pointer + */ + th->t_handle_save = cur_th; + BUG_ON(cur_th && cur_th->t_refcount > 1); + return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN); +} + +int journal_join_abort(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* + * this keeps do_journal_end from NULLing out the + * current->journal_info pointer + */ + th->t_handle_save = cur_th; + BUG_ON(cur_th && cur_th->t_refcount > 1); + return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT); +} + +int journal_begin(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + int ret; + + th->t_handle_save = NULL; + if (cur_th) { + /* we are nesting into the current transaction */ + if (cur_th->t_super == sb) { + BUG_ON(!cur_th->t_refcount); + cur_th->t_refcount++; + memcpy(th, cur_th, sizeof(*th)); + if (th->t_refcount <= 1) + reiserfs_warning(sb, "reiserfs-2005", + "BAD: refcount <= 1, but " + "journal_info != 0"); + return 0; + } else { + /* + * we've ended up with a handle from a different + * filesystem. save it and restore on journal_end. + * This should never really happen... + */ + reiserfs_warning(sb, "clm-2100", + "nesting info a different FS"); + th->t_handle_save = current->journal_info; + current->journal_info = th; + } + } else { + current->journal_info = th; + } + ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG); + BUG_ON(current->journal_info != th); + + /* + * I guess this boils down to being the reciprocal of clm-2100 above. + * If do_journal_begin_r fails, we need to put it back, since + * journal_end won't be called to do it. */ + if (ret) + current->journal_info = th->t_handle_save; + else + BUG_ON(!th->t_refcount); + + return ret; +} + +/* + * puts bh into the current transaction. If it was already there, reorders + * removes the old pointers from the hash, and puts new ones in (to make + * sure replay happen in the right order). + * + * if it was dirty, cleans and files onto the clean list. I can't let it + * be dirty again until the transaction is committed. + * + * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. + */ +int journal_mark_dirty(struct reiserfs_transaction_handle *th, + struct buffer_head *bh) +{ + struct super_block *sb = th->t_super; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn = NULL; + int count_already_incd = 0; + int prepared = 0; + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(sb, journal.mark_dirty); + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577", + "handle trans id %ld != current trans id %ld", + th->t_trans_id, journal->j_trans_id); + } + + prepared = test_clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); + /* already in this transaction, we are done */ + if (buffer_journaled(bh)) { + PROC_INFO_INC(sb, journal.mark_dirty_already); + return 0; + } + + /* + * this must be turned into a panic instead of a warning. We can't + * allow a dirty or journal_dirty or locked buffer to be logged, as + * some changes could get to disk too early. NOT GOOD. + */ + if (!prepared || buffer_dirty(bh)) { + reiserfs_warning(sb, "journal-1777", + "buffer %llu bad state " + "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", + (unsigned long long)bh->b_blocknr, + prepared ? ' ' : '!', + buffer_locked(bh) ? ' ' : '!', + buffer_dirty(bh) ? ' ' : '!', + buffer_journal_dirty(bh) ? ' ' : '!'); + } + + if (atomic_read(&journal->j_wcount) <= 0) { + reiserfs_warning(sb, "journal-1409", + "returning because j_wcount was %d", + atomic_read(&journal->j_wcount)); + return 1; + } + /* + * this error means I've screwed up, and we've overflowed + * the transaction. Nothing can be done here, except make the + * FS readonly or panic. + */ + if (journal->j_len >= journal->j_trans_max) { + reiserfs_panic(th->t_super, "journal-1413", + "j_len (%lu) is too big", + journal->j_len); + } + + if (buffer_journal_dirty(bh)) { + count_already_incd = 1; + PROC_INFO_INC(sb, journal.mark_dirty_notjournal); + clear_buffer_journal_dirty(bh); + } + + if (journal->j_len > journal->j_len_alloc) { + journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT; + } + + set_buffer_journaled(bh); + + /* now put this guy on the end */ + if (!cn) { + cn = get_cnode(sb); + if (!cn) { + reiserfs_panic(sb, "journal-4", "get_cnode failed!"); + } + + if (th->t_blocks_logged == th->t_blocks_allocated) { + th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT; + journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT; + } + th->t_blocks_logged++; + journal->j_len++; + + cn->bh = bh; + cn->blocknr = bh->b_blocknr; + cn->sb = sb; + cn->jlist = NULL; + insert_journal_hash(journal->j_hash_table, cn); + if (!count_already_incd) { + get_bh(bh); + } + } + cn->next = NULL; + cn->prev = journal->j_last; + cn->bh = bh; + if (journal->j_last) { + journal->j_last->next = cn; + journal->j_last = cn; + } else { + journal->j_first = cn; + journal->j_last = cn; + } + reiserfs_schedule_old_flush(sb); + return 0; +} + +int journal_end(struct reiserfs_transaction_handle *th) +{ + struct super_block *sb = th->t_super; + if (!current->journal_info && th->t_refcount > 1) + reiserfs_warning(sb, "REISER-NESTING", + "th NULL, refcount %d", th->t_refcount); + + if (!th->t_trans_id) { + WARN_ON(1); + return -EIO; + } + + th->t_refcount--; + if (th->t_refcount > 0) { + struct reiserfs_transaction_handle *cur_th = + current->journal_info; + + /* + * we aren't allowed to close a nested transaction on a + * different filesystem from the one in the task struct + */ + BUG_ON(cur_th->t_super != th->t_super); + + if (th != cur_th) { + memcpy(current->journal_info, th, sizeof(*th)); + th->t_trans_id = 0; + } + return 0; + } else { + return do_journal_end(th, 0); + } +} + +/* + * removes from the current transaction, relsing and descrementing any counters. + * also files the removed buffer directly onto the clean list + * + * called by journal_mark_freed when a block has been deleted + * + * returns 1 if it cleaned and relsed the buffer. 0 otherwise + */ +static int remove_from_transaction(struct super_block *sb, + b_blocknr_t blocknr, int already_cleaned) +{ + struct buffer_head *bh; + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int ret = 0; + + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); + if (!cn || !cn->bh) { + return ret; + } + bh = cn->bh; + if (cn->prev) { + cn->prev->next = cn->next; + } + if (cn->next) { + cn->next->prev = cn->prev; + } + if (cn == journal->j_first) { + journal->j_first = cn->next; + } + if (cn == journal->j_last) { + journal->j_last = cn->prev; + } + if (bh) + remove_journal_hash(sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); + clear_buffer_journaled(bh); /* don't log this one */ + + if (!already_cleaned) { + clear_buffer_journal_dirty(bh); + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + put_bh(bh); + if (atomic_read(&bh->b_count) < 0) { + reiserfs_warning(sb, "journal-1752", + "b_count < 0"); + } + ret = 1; + } + journal->j_len--; + journal->j_len_alloc--; + free_cnode(sb, cn); + return ret; +} + +/* + * for any cnode in a journal list, it can only be dirtied of all the + * transactions that include it are committed to disk. + * this checks through each transaction, and returns 1 if you are allowed + * to dirty, and 0 if you aren't + * + * it is called by dirty_journal_list, which is called after + * flush_commit_list has gotten all the log blocks for a given + * transaction on disk + * + */ +static int can_dirty(struct reiserfs_journal_cnode *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + struct reiserfs_journal_cnode *cur = cn->hprev; + int can_dirty = 1; + + /* + * first test hprev. These are all newer than cn, so any node here + * with the same block number and dev means this node can't be sent + * to disk right now. + */ + while (cur && can_dirty) { + if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && + cur->blocknr == blocknr) { + can_dirty = 0; + } + cur = cur->hprev; + } + /* + * then test hnext. These are all older than cn. As long as they + * are committed to the log, it is safe to write cn to disk + */ + cur = cn->hnext; + while (cur && can_dirty) { + if (cur->jlist && cur->jlist->j_len > 0 && + atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh && + cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { + can_dirty = 0; + } + cur = cur->hnext; + } + return can_dirty; +} + +/* + * syncs the commit blocks, but does not force the real buffers to disk + * will wait until the current transaction is done/committed before returning + */ +int journal_end_sync(struct reiserfs_transaction_handle *th) +{ + struct super_block *sb = th->t_super; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + BUG_ON(!th->t_trans_id); + /* you can sync while nested, very, very bad */ + BUG_ON(th->t_refcount > 1); + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); + } + return do_journal_end(th, COMMIT_NOW | WAIT); +} + +/* writeback the pending async commits to disk */ +static void flush_async_commits(struct work_struct *work) +{ + struct reiserfs_journal *journal = + container_of(work, struct reiserfs_journal, j_work.work); + struct super_block *sb = journal->j_work_sb; + struct reiserfs_journal_list *jl; + struct list_head *entry; + + reiserfs_write_lock(sb); + if (!list_empty(&journal->j_journal_list)) { + /* last entry is the youngest, commit it and you get everything */ + entry = journal->j_journal_list.prev; + jl = JOURNAL_LIST_ENTRY(entry); + flush_commit_list(sb, jl, 1); + } + reiserfs_write_unlock(sb); +} + +/* + * flushes any old transactions to disk + * ends the current transaction if it is too old + */ +void reiserfs_flush_old_commits(struct super_block *sb) +{ + time_t now; + struct reiserfs_transaction_handle th; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + now = get_seconds(); + /* + * safety check so we don't flush while we are replaying the log during + * mount + */ + if (list_empty(&journal->j_journal_list)) + return; + + /* + * check the current transaction. If there are no writers, and it is + * too old, finish it, and force the commit blocks to disk + */ + if (atomic_read(&journal->j_wcount) <= 0 && + journal->j_trans_start_time > 0 && + journal->j_len > 0 && + (now - journal->j_trans_start_time) > journal->j_max_trans_age) { + if (!journal_join(&th, sb)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); + + /* + * we're only being called from kreiserfsd, it makes + * no sense to do an async commit so that kreiserfsd + * can do it later + */ + do_journal_end(&th, COMMIT_NOW | WAIT); + } + } +} + +/* + * returns 0 if do_journal_end should return right away, returns 1 if + * do_journal_end should finish the commit + * + * if the current transaction is too old, but still has writers, this will + * wait on j_join_wait until all the writers are done. By the time it + * wakes up, the transaction it was called has already ended, so it just + * flushes the commit list and returns 0. + * + * Won't batch when flush or commit_now is set. Also won't batch when + * others are waiting on j_join_wait. + * + * Note, we can't allow the journal_end to proceed while there are still + * writers in the log. + */ +static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) +{ + + time_t now; + int flush = flags & FLUSH_ALL; + int commit_now = flags & COMMIT_NOW; + int wait_on_commit = flags & WAIT; + struct reiserfs_journal_list *jl; + struct super_block *sb = th->t_super; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + BUG_ON(!th->t_trans_id); + + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577", + "handle trans id %ld != current trans id %ld", + th->t_trans_id, journal->j_trans_id); + } + + journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged); + /* <= 0 is allowed. unmounting might not call begin */ + if (atomic_read(&journal->j_wcount) > 0) + atomic_dec(&journal->j_wcount); + + /* + * BUG, deal with case where j_len is 0, but people previously + * freed blocks need to be released will be dealt with by next + * transaction that actually writes something, but should be taken + * care of in this trans + */ + BUG_ON(journal->j_len == 0); + + /* + * if wcount > 0, and we are called to with flush or commit_now, + * we wait on j_join_wait. We will wake up when the last writer has + * finished the transaction, and started it on its way to the disk. + * Then, we flush the commit or journal list, and just return 0 + * because the rest of journal end was already done for this + * transaction. + */ + if (atomic_read(&journal->j_wcount) > 0) { + if (flush || commit_now) { + unsigned trans_id; + + jl = journal->j_current_jl; + trans_id = jl->j_trans_id; + if (wait_on_commit) + jl->j_state |= LIST_COMMIT_PENDING; + atomic_set(&journal->j_jlock, 1); + if (flush) { + journal->j_next_full_flush = 1; + } + unlock_journal(sb); + + /* + * sleep while the current transaction is + * still j_jlocked + */ + while (journal->j_trans_id == trans_id) { + if (atomic_read(&journal->j_jlock)) { + queue_log_writer(sb); + } else { + lock_journal(sb); + if (journal->j_trans_id == trans_id) { + atomic_set(&journal->j_jlock, + 1); + } + unlock_journal(sb); + } + } + BUG_ON(journal->j_trans_id == trans_id); + + if (commit_now + && journal_list_still_alive(sb, trans_id) + && wait_on_commit) { + flush_commit_list(sb, jl, 1); + } + return 0; + } + unlock_journal(sb); + return 0; + } + + /* deal with old transactions where we are the last writers */ + now = get_seconds(); + if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { + commit_now = 1; + journal->j_next_async_flush = 1; + } + /* don't batch when someone is waiting on j_join_wait */ + /* don't batch when syncing the commit or flushing the whole trans */ + if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock)) + && !flush && !commit_now && (journal->j_len < journal->j_max_batch) + && journal->j_len_alloc < journal->j_max_batch + && journal->j_cnode_free > (journal->j_trans_max * 3)) { + journal->j_bcount++; + unlock_journal(sb); + return 0; + } + + if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) { + reiserfs_panic(sb, "journal-003", + "j_start (%ld) is too high", + journal->j_start); + } + return 1; +} + +/* + * Does all the work that makes deleting blocks safe. + * when deleting a block mark BH_JNew, just remove it from the current + * transaction, clean it's buffer_head and move on. + * + * otherwise: + * set a bit for the block in the journal bitmap. That will prevent it from + * being allocated for unformatted nodes before this transaction has finished. + * + * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. + * That will prevent any old transactions with this block from trying to flush + * to the real location. Since we aren't removing the cnode from the + * journal_list_hash, *the block can't be reallocated yet. + * + * Then remove it from the current transaction, decrementing any counters and + * filing it on the clean list. + */ +int journal_mark_freed(struct reiserfs_transaction_handle *th, + struct super_block *sb, b_blocknr_t blocknr) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn = NULL; + struct buffer_head *bh = NULL; + struct reiserfs_list_bitmap *jb = NULL; + int cleaned = 0; + BUG_ON(!th->t_trans_id); + + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); + if (cn && cn->bh) { + bh = cn->bh; + get_bh(bh); + } + /* if it is journal new, we just remove it from this transaction */ + if (bh && buffer_journal_new(bh)) { + clear_buffer_journal_new(bh); + clear_prepared_bits(bh); + reiserfs_clean_and_file_buffer(bh); + cleaned = remove_from_transaction(sb, blocknr, cleaned); + } else { + /* + * set the bit for this block in the journal bitmap + * for this transaction + */ + jb = journal->j_current_jl->j_list_bitmap; + if (!jb) { + reiserfs_panic(sb, "journal-1702", + "journal_list_bitmap is NULL"); + } + set_bit_in_list_bitmap(sb, blocknr, jb); + + /* Note, the entire while loop is not allowed to schedule. */ + + if (bh) { + clear_prepared_bits(bh); + reiserfs_clean_and_file_buffer(bh); + } + cleaned = remove_from_transaction(sb, blocknr, cleaned); + + /* + * find all older transactions with this block, + * make sure they don't try to write it out + */ + cn = get_journal_hash_dev(sb, journal->j_list_hash_table, + blocknr); + while (cn) { + if (sb == cn->sb && blocknr == cn->blocknr) { + set_bit(BLOCK_FREED, &cn->state); + if (cn->bh) { + /* + * remove_from_transaction will brelse + * the buffer if it was in the current + * trans + */ + if (!cleaned) { + clear_buffer_journal_dirty(cn-> + bh); + clear_buffer_dirty(cn->bh); + clear_buffer_journal_test(cn-> + bh); + cleaned = 1; + put_bh(cn->bh); + if (atomic_read + (&cn->bh->b_count) < 0) { + reiserfs_warning(sb, + "journal-2138", + "cn->bh->b_count < 0"); + } + } + /* + * since we are clearing the bh, + * we MUST dec nonzerolen + */ + if (cn->jlist) { + atomic_dec(&cn->jlist-> + j_nonzerolen); + } + cn->bh = NULL; + } + } + cn = cn->hnext; + } + } + + if (bh) + release_buffer_page(bh); /* get_hash grabs the buffer */ + return 0; +} + +void reiserfs_update_inode_transaction(struct inode *inode) +{ + struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb); + REISERFS_I(inode)->i_jl = journal->j_current_jl; + REISERFS_I(inode)->i_trans_id = journal->j_trans_id; +} + +/* + * returns -1 on error, 0 if no commits/barriers were done and 1 + * if a transaction was actually committed and the barrier was done + */ +static int __commit_trans_jl(struct inode *inode, unsigned long id, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_transaction_handle th; + struct super_block *sb = inode->i_sb; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int ret = 0; + + /* + * is it from the current transaction, + * or from an unknown transaction? + */ + if (id == journal->j_trans_id) { + jl = journal->j_current_jl; + /* + * try to let other writers come in and + * grow this transaction + */ + let_transaction_grow(sb, id); + if (journal->j_trans_id != id) { + goto flush_commit_only; + } + + ret = journal_begin(&th, sb, 1); + if (ret) + return ret; + + /* someone might have ended this transaction while we joined */ + if (journal->j_trans_id != id) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); + ret = journal_end(&th); + goto flush_commit_only; + } + + ret = journal_end_sync(&th); + if (!ret) + ret = 1; + + } else { + /* + * this gets tricky, we have to make sure the journal list in + * the inode still exists. We know the list is still around + * if we've got a larger transaction id than the oldest list + */ +flush_commit_only: + if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; + flush_commit_list(sb, jl, 1); + if (journal->j_errno) + ret = journal->j_errno; + } + } + /* otherwise the list is gone, and long since committed */ + return ret; +} + +int reiserfs_commit_for_inode(struct inode *inode) +{ + unsigned int id = REISERFS_I(inode)->i_trans_id; + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; + + /* + * for the whole inode, assume unset id means it was + * changed in the current transaction. More conservative + */ + if (!id || !jl) { + reiserfs_update_inode_transaction(inode); + id = REISERFS_I(inode)->i_trans_id; + /* jl will be updated in __commit_trans_jl */ + } + + return __commit_trans_jl(inode, id, jl); +} + +void reiserfs_restore_prepared_buffer(struct super_block *sb, + struct buffer_head *bh) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + PROC_INFO_INC(sb, journal.restore_prepared); + if (!bh) { + return; + } + if (test_clear_buffer_journal_restore_dirty(bh) && + buffer_journal_dirty(bh)) { + struct reiserfs_journal_cnode *cn; + reiserfs_write_lock(sb); + cn = get_journal_hash_dev(sb, + journal->j_list_hash_table, + bh->b_blocknr); + if (cn && can_dirty(cn)) { + set_buffer_journal_test(bh); + mark_buffer_dirty(bh); + } + reiserfs_write_unlock(sb); + } + clear_buffer_journal_prepared(bh); +} + +extern struct tree_balance *cur_tb; +/* + * before we can change a metadata block, we have to make sure it won't + * be written to disk while we are altering it. So, we must: + * clean it + * wait on it. + */ +int reiserfs_prepare_for_journal(struct super_block *sb, + struct buffer_head *bh, int wait) +{ + PROC_INFO_INC(sb, journal.prepare); + + if (!trylock_buffer(bh)) { + if (!wait) + return 0; + lock_buffer(bh); + } + set_buffer_journal_prepared(bh); + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { + clear_buffer_journal_test(bh); + set_buffer_journal_restore_dirty(bh); + } + unlock_buffer(bh); + return 1; +} + +/* + * long and ugly. If flush, will not return until all commit + * blocks and all real buffers in the trans are on disk. + * If no_async, won't return until all commit blocks are on disk. + * + * keep reading, there are comments as you go along + * + * If the journal is aborted, we just clean up. Things like flushing + * journal lists, etc just won't happen. + */ +static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) +{ + struct super_block *sb = th->t_super; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn, *next, *jl_cn; + struct reiserfs_journal_cnode *last_cn = NULL; + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; /* commit bh */ + struct buffer_head *d_bh; /* desc bh */ + int cur_write_start = 0; /* start index of current log write */ + int old_start; + int i; + int flush; + int wait_on_commit; + struct reiserfs_journal_list *jl, *temp_jl; + struct list_head *entry, *safe; + unsigned long jindex; + unsigned int commit_trans_id; + int trans_half; + int depth; + + BUG_ON(th->t_refcount > 1); + BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_super); + + /* + * protect flush_older_commits from doing mistakes if the + * transaction ID counter gets overflowed. + */ + if (th->t_trans_id == ~0U) + flags |= FLUSH_ALL | COMMIT_NOW | WAIT; + flush = flags & FLUSH_ALL; + wait_on_commit = flags & WAIT; + + current->journal_info = th->t_handle_save; + reiserfs_check_lock_depth(sb, "journal end"); + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); + } + + lock_journal(sb); + if (journal->j_next_full_flush) { + flags |= FLUSH_ALL; + flush = 1; + } + if (journal->j_next_async_flush) { + flags |= COMMIT_NOW | WAIT; + wait_on_commit = 1; + } + + /* + * check_journal_end locks the journal, and unlocks if it does + * not return 1 it tells us if we should continue with the + * journal_end, or just return + */ + if (!check_journal_end(th, flags)) { + reiserfs_schedule_old_flush(sb); + wake_queued_writers(sb); + reiserfs_async_progress_wait(sb); + goto out; + } + + /* check_journal_end might set these, check again */ + if (journal->j_next_full_flush) { + flush = 1; + } + + /* + * j must wait means we have to flush the log blocks, and the + * real blocks for this transaction + */ + if (journal->j_must_wait > 0) { + flush = 1; + } +#ifdef REISERFS_PREALLOCATE + /* + * quota ops might need to nest, setup the journal_info pointer + * for them and raise the refcount so that it is > 0. + */ + current->journal_info = th; + th->t_refcount++; + + /* it should not involve new blocks into the transaction */ + reiserfs_discard_all_prealloc(th); + + th->t_refcount--; + current->journal_info = th->t_handle_save; +#endif + + /* setup description block */ + d_bh = + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + journal->j_start); + set_buffer_uptodate(d_bh); + desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; + memset(d_bh->b_data, 0, d_bh->b_size); + memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); + set_desc_trans_id(desc, journal->j_trans_id); + + /* + * setup commit block. Don't write (keep it clean too) this one + * until after everyone else is written + */ + c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((journal->j_start + journal->j_len + + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + memset(c_bh->b_data, 0, c_bh->b_size); + set_commit_trans_id(commit, journal->j_trans_id); + set_buffer_uptodate(c_bh); + + /* init this journal list */ + jl = journal->j_current_jl; + + /* + * we lock the commit before doing anything because + * we want to make sure nobody tries to run flush_commit_list until + * the new transaction is fully setup, and we've already flushed the + * ordered bh list + */ + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb); + + /* save the transaction id in case we need to commit it later */ + commit_trans_id = jl->j_trans_id; + + atomic_set(&jl->j_older_commits_done, 0); + jl->j_trans_id = journal->j_trans_id; + jl->j_timestamp = journal->j_trans_start_time; + jl->j_commit_bh = c_bh; + jl->j_start = journal->j_start; + jl->j_len = journal->j_len; + atomic_set(&jl->j_nonzerolen, journal->j_len); + atomic_set(&jl->j_commit_left, journal->j_len + 2); + jl->j_realblock = NULL; + + /* + * The ENTIRE FOR LOOP MUST not cause schedule to occur. + * for each real block, add it to the journal list hash, + * copy into real block index array in the commit or desc block + */ + trans_half = journal_trans_half(sb->s_blocksize); + for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { + if (buffer_journaled(cn->bh)) { + jl_cn = get_cnode(sb); + if (!jl_cn) { + reiserfs_panic(sb, "journal-1676", + "get_cnode returned NULL"); + } + if (i == 0) { + jl->j_realblock = jl_cn; + } + jl_cn->prev = last_cn; + jl_cn->next = NULL; + if (last_cn) { + last_cn->next = jl_cn; + } + last_cn = jl_cn; + /* + * make sure the block we are trying to log + * is not a block of journal or reserved area + */ + if (is_block_in_log_or_reserved_area + (sb, cn->bh->b_blocknr)) { + reiserfs_panic(sb, "journal-2332", + "Trying to log block %lu, " + "which is a log block", + cn->bh->b_blocknr); + } + jl_cn->blocknr = cn->bh->b_blocknr; + jl_cn->state = 0; + jl_cn->sb = sb; + jl_cn->bh = cn->bh; + jl_cn->jlist = jl; + insert_journal_hash(journal->j_list_hash_table, jl_cn); + if (i < trans_half) { + desc->j_realblock[i] = + cpu_to_le32(cn->bh->b_blocknr); + } else { + commit->j_realblock[i - trans_half] = + cpu_to_le32(cn->bh->b_blocknr); + } + } else { + i--; + } + } + set_desc_trans_len(desc, journal->j_len); + set_desc_mount_id(desc, journal->j_mount_id); + set_desc_trans_id(desc, journal->j_trans_id); + set_commit_trans_len(commit, journal->j_len); + + /* + * special check in case all buffers in the journal + * were marked for not logging + */ + BUG_ON(journal->j_len == 0); + + /* + * we're about to dirty all the log blocks, mark the description block + * dirty now too. Don't mark the commit block dirty until all the + * others are on disk + */ + mark_buffer_dirty(d_bh); + + /* + * first data block is j_start + 1, so add one to + * cur_write_start wherever you use it + */ + cur_write_start = journal->j_start; + cn = journal->j_first; + jindex = 1; /* start at one so we don't get the desc again */ + while (cn) { + clear_buffer_journal_new(cn->bh); + /* copy all the real blocks into log area. dirty log blocks */ + if (buffer_journaled(cn->bh)) { + struct buffer_head *tmp_bh; + char *addr; + struct page *page; + tmp_bh = + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((cur_write_start + + jindex) % + SB_ONDISK_JOURNAL_SIZE(sb))); + set_buffer_uptodate(tmp_bh); + page = cn->bh->b_page; + addr = kmap(page); + memcpy(tmp_bh->b_data, + addr + offset_in_page(cn->bh->b_data), + cn->bh->b_size); + kunmap(page); + mark_buffer_dirty(tmp_bh); + jindex++; + set_buffer_journal_dirty(cn->bh); + clear_buffer_journaled(cn->bh); + } else { + /* + * JDirty cleared sometime during transaction. + * don't log this one + */ + reiserfs_warning(sb, "journal-2048", + "BAD, buffer in journal hash, " + "but not JDirty!"); + brelse(cn->bh); + } + next = cn->next; + free_cnode(sb, cn); + cn = next; + reiserfs_cond_resched(sb); + } + + /* + * we are done with both the c_bh and d_bh, but + * c_bh must be written after all other commit blocks, + * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. + */ + + journal->j_current_jl = alloc_journal_list(sb); + + /* now it is safe to insert this transaction on the main list */ + list_add_tail(&jl->j_list, &journal->j_journal_list); + list_add_tail(&jl->j_working_list, &journal->j_working_list); + journal->j_num_work_lists++; + + /* reset journal values for the next transaction */ + old_start = journal->j_start; + journal->j_start = + (journal->j_start + journal->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(sb); + atomic_set(&journal->j_wcount, 0); + journal->j_bcount = 0; + journal->j_last = NULL; + journal->j_first = NULL; + journal->j_len = 0; + journal->j_trans_start_time = 0; + /* check for trans_id overflow */ + if (++journal->j_trans_id == 0) + journal->j_trans_id = 10; + journal->j_current_jl->j_trans_id = journal->j_trans_id; + journal->j_must_wait = 0; + journal->j_len_alloc = 0; + journal->j_next_full_flush = 0; + journal->j_next_async_flush = 0; + init_journal_hash(sb); + + /* + * make sure reiserfs_add_jh sees the new current_jl before we + * write out the tails + */ + smp_mb(); + + /* + * tail conversion targets have to hit the disk before we end the + * transaction. Otherwise a later transaction might repack the tail + * before this transaction commits, leaving the data block unflushed + * and clean, if we crash before the later transaction commits, the + * data block is lost. + */ + if (!list_empty(&jl->j_tail_bh_list)) { + depth = reiserfs_write_unlock_nested(sb); + write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_tail_bh_list); + reiserfs_write_lock_nested(sb, depth); + } + BUG_ON(!list_empty(&jl->j_tail_bh_list)); + mutex_unlock(&jl->j_commit_mutex); + + /* + * honor the flush wishes from the caller, simple commits can + * be done outside the journal lock, they are done below + * + * if we don't flush the commit list right now, we put it into + * the work queue so the people waiting on the async progress work + * queue don't wait for this proc to flush journal lists and such. + */ + if (flush) { + flush_commit_list(sb, jl, 1); + flush_journal_list(sb, jl, 1); + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) { + /* + * Avoid queueing work when sb is being shut down. Transaction + * will be flushed on journal shutdown. + */ + if (sb->s_flags & MS_ACTIVE) + queue_delayed_work(REISERFS_SB(sb)->commit_wq, + &journal->j_work, HZ / 10); + } + + /* + * if the next transaction has any chance of wrapping, flush + * transactions that might get overwritten. If any journal lists + * are very old flush them as well. + */ +first_jl: + list_for_each_safe(entry, safe, &journal->j_journal_list) { + temp_jl = JOURNAL_LIST_ENTRY(entry); + if (journal->j_start <= temp_jl->j_start) { + if ((journal->j_start + journal->j_trans_max + 1) >= + temp_jl->j_start) { + flush_used_journal_lists(sb, temp_jl); + goto first_jl; + } else if ((journal->j_start + + journal->j_trans_max + 1) < + SB_ONDISK_JOURNAL_SIZE(sb)) { + /* + * if we don't cross into the next + * transaction and we don't wrap, there is + * no way we can overlap any later transactions + * break now + */ + break; + } + } else if ((journal->j_start + + journal->j_trans_max + 1) > + SB_ONDISK_JOURNAL_SIZE(sb)) { + if (((journal->j_start + journal->j_trans_max + 1) % + SB_ONDISK_JOURNAL_SIZE(sb)) >= + temp_jl->j_start) { + flush_used_journal_lists(sb, temp_jl); + goto first_jl; + } else { + /* + * we don't overlap anything from out start + * to the end of the log, and our wrapped + * portion doesn't overlap anything at + * the start of the log. We can break + */ + break; + } + } + } + + journal->j_current_jl->j_list_bitmap = + get_list_bitmap(sb, journal->j_current_jl); + + if (!(journal->j_current_jl->j_list_bitmap)) { + reiserfs_panic(sb, "journal-1996", + "could not get a list bitmap"); + } + + atomic_set(&journal->j_jlock, 0); + unlock_journal(sb); + /* wake up any body waiting to join. */ + clear_bit(J_WRITERS_QUEUED, &journal->j_state); + wake_up(&journal->j_join_wait); + + if (!flush && wait_on_commit && + journal_list_still_alive(sb, commit_trans_id)) { + flush_commit_list(sb, jl, 1); + } +out: + reiserfs_check_lock_depth(sb, "journal end2"); + + memset(th, 0, sizeof(*th)); + /* + * Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later + */ + th->t_super = sb; + + return journal->j_errno; +} + +/* Send the file system read only and refuse new transactions */ +void reiserfs_abort_journal(struct super_block *sb, int errno) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + if (test_bit(J_ABORTED, &journal->j_state)) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + sb->s_flags |= MS_RDONLY; + set_bit(J_ABORTED, &journal->j_state); + +#ifdef CONFIG_REISERFS_CHECK + dump_stack(); +#endif +} diff --git a/kernel/fs/reiserfs/lbalance.c b/kernel/fs/reiserfs/lbalance.c new file mode 100644 index 000000000..249594a82 --- /dev/null +++ b/kernel/fs/reiserfs/lbalance.c @@ -0,0 +1,1427 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include + +/* + * copy copy_count entries from source directory item to dest buffer + * (creating new item if needed) + */ +static void leaf_copy_dir_entries(struct buffer_info *dest_bi, + struct buffer_head *source, int last_first, + int item_num, int from, int copy_count) +{ + struct buffer_head *dest = dest_bi->bi_bh; + /* + * either the number of target item, or if we must create a + * new item, the number of the item we will create it next to + */ + int item_num_in_dest; + + struct item_head *ih; + struct reiserfs_de_head *deh; + int copy_records_len; /* length of all records in item to be copied */ + char *records; + + ih = item_head(source, item_num); + + RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item"); + + /* + * length of all record to be copied and first byte of + * the last of them + */ + deh = B_I_DEH(source, ih); + if (copy_count) { + copy_records_len = (from ? deh_location(&deh[from - 1]) : + ih_item_len(ih)) - + deh_location(&deh[from + copy_count - 1]); + records = + source->b_data + ih_location(ih) + + deh_location(&deh[from + copy_count - 1]); + } else { + copy_records_len = 0; + records = NULL; + } + + /* when copy last to first, dest buffer can contain 0 items */ + item_num_in_dest = + (last_first == + LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest) + - 1); + + /* + * if there are no items in dest or the first/last item in + * dest is not item of the same directory + */ + if ((item_num_in_dest == -1) || + (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) || + (last_first == LAST_TO_FIRST + && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key, + leaf_key(dest, + item_num_in_dest)))) + { + /* create new item in dest */ + struct item_head new_ih; + + /* form item header */ + memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE); + put_ih_version(&new_ih, KEY_FORMAT_3_5); + /* calculate item len */ + put_ih_item_len(&new_ih, + DEH_SIZE * copy_count + copy_records_len); + put_ih_entry_count(&new_ih, 0); + + if (last_first == LAST_TO_FIRST) { + /* form key by the following way */ + if (from < ih_entry_count(ih)) { + set_le_ih_k_offset(&new_ih, + deh_offset(&deh[from])); + } else { + /* + * no entries will be copied to this + * item in this function + */ + set_le_ih_k_offset(&new_ih, U32_MAX); + /* + * this item is not yet valid, but we + * want I_IS_DIRECTORY_ITEM to return 1 + * for it, so we -1 + */ + } + set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key, + TYPE_DIRENTRY); + } + + /* insert item into dest buffer */ + leaf_insert_into_buf(dest_bi, + (last_first == + LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), + &new_ih, NULL, 0); + } else { + /* prepare space for entries */ + leaf_paste_in_buffer(dest_bi, + (last_first == + FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - + 1) : 0, MAX_US_INT, + DEH_SIZE * copy_count + copy_records_len, + records, 0); + } + + item_num_in_dest = + (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0; + + leaf_paste_entries(dest_bi, item_num_in_dest, + (last_first == + FIRST_TO_LAST) ? ih_entry_count(item_head(dest, + item_num_in_dest)) + : 0, copy_count, deh + from, records, + DEH_SIZE * copy_count + copy_records_len); +} + +/* + * Copy the first (if last_first == FIRST_TO_LAST) or last + * (last_first == LAST_TO_FIRST) item or part of it or nothing + * (see the return 0 below) from SOURCE to the end (if last_first) + * or beginning (!last_first) of the DEST + */ +/* returns 1 if anything was copied, else 0 */ +static int leaf_copy_boundary_item(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int bytes_or_entries) +{ + struct buffer_head *dest = dest_bi->bi_bh; + /* number of items in the source and destination buffers */ + int dest_nr_item, src_nr_item; + struct item_head *ih; + struct item_head *dih; + + dest_nr_item = B_NR_ITEMS(dest); + + /* + * if ( DEST is empty or first item of SOURCE and last item of + * DEST are the items of different objects or of different types ) + * then there is no need to treat this item differently from the + * other items that we copy, so we return + */ + if (last_first == FIRST_TO_LAST) { + ih = item_head(src, 0); + dih = item_head(dest, dest_nr_item - 1); + + /* there is nothing to merge */ + if (!dest_nr_item + || (!op_is_left_mergeable(&ih->ih_key, src->b_size))) + return 0; + + RFALSE(!ih_item_len(ih), + "vs-10010: item can not have empty length"); + + if (is_direntry_le_ih(ih)) { + if (bytes_or_entries == -1) + /* copy all entries to dest */ + bytes_or_entries = ih_entry_count(ih); + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0, + bytes_or_entries); + return 1; + } + + /* + * copy part of the body of the first item of SOURCE + * to the end of the body of the last item of the DEST + * part defined by 'bytes_or_entries'; if bytes_or_entries + * == -1 copy whole body; don't create new item header + */ + if (bytes_or_entries == -1) + bytes_or_entries = ih_item_len(ih); + +#ifdef CONFIG_REISERFS_CHECK + else { + if (bytes_or_entries == ih_item_len(ih) + && is_indirect_le_ih(ih)) + if (get_ih_free_space(ih)) + reiserfs_panic(sb_from_bi(dest_bi), + "vs-10020", + "last unformatted node " + "must be filled " + "entirely (%h)", ih); + } +#endif + + /* + * merge first item (or its part) of src buffer with the last + * item of dest buffer. Both are of the same file + */ + leaf_paste_in_buffer(dest_bi, + dest_nr_item - 1, ih_item_len(dih), + bytes_or_entries, ih_item_body(src, ih), 0); + + if (is_indirect_le_ih(dih)) { + RFALSE(get_ih_free_space(dih), + "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", + ih); + if (bytes_or_entries == ih_item_len(ih)) + set_ih_free_space(dih, get_ih_free_space(ih)); + } + + return 1; + } + + /* copy boundary item to right (last_first == LAST_TO_FIRST) */ + + /* + * (DEST is empty or last item of SOURCE and first item of DEST + * are the items of different object or of different types) + */ + src_nr_item = B_NR_ITEMS(src); + ih = item_head(src, src_nr_item - 1); + dih = item_head(dest, 0); + + if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size)) + return 0; + + if (is_direntry_le_ih(ih)) { + /* + * bytes_or_entries = entries number in last + * item body of SOURCE + */ + if (bytes_or_entries == -1) + bytes_or_entries = ih_entry_count(ih); + + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + src_nr_item - 1, + ih_entry_count(ih) - bytes_or_entries, + bytes_or_entries); + return 1; + } + + /* + * copy part of the body of the last item of SOURCE to the + * begin of the body of the first item of the DEST; part defined + * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; + * change first item key of the DEST; don't create new item header + */ + + RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih), + "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", + ih); + + if (bytes_or_entries == -1) { + /* bytes_or_entries = length of last item body of SOURCE */ + bytes_or_entries = ih_item_len(ih); + + RFALSE(le_ih_k_offset(dih) != + le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size), + "vs-10050: items %h and %h do not match", ih, dih); + + /* change first item key of the DEST */ + set_le_ih_k_offset(dih, le_ih_k_offset(ih)); + + /* item becomes non-mergeable */ + /* or mergeable if left item was */ + set_le_ih_k_type(dih, le_ih_k_type(ih)); + } else { + /* merge to right only part of item */ + RFALSE(ih_item_len(ih) <= bytes_or_entries, + "vs-10060: no so much bytes %lu (needed %lu)", + (unsigned long)ih_item_len(ih), + (unsigned long)bytes_or_entries); + + /* change first item key of the DEST */ + if (is_direct_le_ih(dih)) { + RFALSE(le_ih_k_offset(dih) <= + (unsigned long)bytes_or_entries, + "vs-10070: dih %h, bytes_or_entries(%d)", dih, + bytes_or_entries); + set_le_ih_k_offset(dih, + le_ih_k_offset(dih) - + bytes_or_entries); + } else { + RFALSE(le_ih_k_offset(dih) <= + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, + "vs-10080: dih %h, bytes_or_entries(%d)", + dih, + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size); + set_le_ih_k_offset(dih, + le_ih_k_offset(dih) - + ((bytes_or_entries / UNFM_P_SIZE) * + dest->b_size)); + } + } + + leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries, + ih_item_body(src, + ih) + ih_item_len(ih) - bytes_or_entries, + 0); + return 1; +} + +/* + * copy cpy_mun items from buffer src to buffer dest + * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning + * from first-th item in src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning + * from first-th item in src to head of dest + */ +static void leaf_copy_items_entirely(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int first, int cpy_num) +{ + struct buffer_head *dest; + int nr, free_space; + int dest_before; + int last_loc, last_inserted_loc, location; + int i, j; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, + "vs-10090: bad last_first parameter %d", last_first); + RFALSE(B_NR_ITEMS(src) - first < cpy_num, + "vs-10100: too few items in source %d, required %d from %d", + B_NR_ITEMS(src), cpy_num, first); + RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items"); + RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items"); + + dest = dest_bi->bi_bh; + + RFALSE(!dest, "vs-10130: can not copy negative amount of items"); + + if (cpy_num == 0) + return; + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* + * we will insert items before 0-th or nr-th item in dest buffer. + * It depends of last_first parameter + */ + dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; + + /* location of head of first new item */ + ih = item_head(dest, dest_before); + + RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE, + "vs-10140: not enough free space for headers %d (needed %d)", + B_FREE_SPACE(dest), cpy_num * IH_SIZE); + + /* prepare space for headers */ + memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE); + + /* copy item headers */ + memcpy(ih, item_head(src, first), cpy_num * IH_SIZE); + + free_space -= (IH_SIZE * cpy_num); + set_blkh_free_space(blkh, free_space); + + /* location of unmovable item */ + j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1); + for (i = dest_before; i < nr + cpy_num; i++) { + location -= ih_item_len(ih + i - dest_before); + put_ih_location(ih + i - dest_before, location); + } + + /* prepare space for items */ + last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]); + last_inserted_loc = ih_location(&ih[cpy_num - 1]); + + /* check free space */ + RFALSE(free_space < j - last_inserted_loc, + "vs-10150: not enough free space for items %d (needed %d)", + free_space, j - last_inserted_loc); + + memmove(dest->b_data + last_loc, + dest->b_data + last_loc + j - last_inserted_loc, + last_inserted_loc - last_loc); + + /* copy items */ + memcpy(dest->b_data + last_inserted_loc, + item_body(src, (first + cpy_num - 1)), + j - last_inserted_loc); + + /* sizes, item number */ + set_blkh_nr_item(blkh, nr + cpy_num); + set_blkh_free_space(blkh, free_space - (j - last_inserted_loc)); + + do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + RFALSE(dc_block_number(t_dc) != dest->b_blocknr, + "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", + (long unsigned)dest->b_blocknr, + (long unsigned)dc_block_number(t_dc)); + put_dc_size(t_dc, + dc_size(t_dc) + (j - last_inserted_loc + + IH_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } +} + +/* + * This function splits the (liquid) item into two items (useful when + * shifting part of an item into another node.) + */ +static void leaf_item_bottle(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int item_num, int cpy_bytes) +{ + struct buffer_head *dest = dest_bi->bi_bh; + struct item_head *ih; + + RFALSE(cpy_bytes == -1, + "vs-10170: bytes == - 1 means: do not split item"); + + if (last_first == FIRST_TO_LAST) { + /* + * if ( if item in position item_num in buffer SOURCE + * is directory item ) + */ + ih = item_head(src, item_num); + if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, + item_num, 0, cpy_bytes); + else { + struct item_head n_ih; + + /* + * copy part of the body of the item number 'item_num' + * of SOURCE to the end of the DEST part defined by + * 'cpy_bytes'; create new item header; change old + * item_header (????); n_ih = new item_header; + */ + memcpy(&n_ih, ih, IH_SIZE); + put_ih_item_len(&n_ih, cpy_bytes); + if (is_indirect_le_ih(ih)) { + RFALSE(cpy_bytes == ih_item_len(ih) + && get_ih_free_space(ih), + "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", + (long unsigned)get_ih_free_space(ih)); + set_ih_free_space(&n_ih, 0); + } + + RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size), + "vs-10190: bad mergeability of item %h", ih); + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, + item_body(src, item_num), 0); + } + } else { + /* + * if ( if item in position item_num in buffer + * SOURCE is directory item ) + */ + ih = item_head(src, item_num); + if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + item_num, + ih_entry_count(ih) - cpy_bytes, + cpy_bytes); + else { + struct item_head n_ih; + + /* + * copy part of the body of the item number 'item_num' + * of SOURCE to the begin of the DEST part defined by + * 'cpy_bytes'; create new item header; + * n_ih = new item_header; + */ + memcpy(&n_ih, ih, SHORT_KEY_SIZE); + + /* Endian safe, both le */ + n_ih.ih_version = ih->ih_version; + + if (is_direct_le_ih(ih)) { + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + ih_item_len(ih) - cpy_bytes); + set_le_ih_k_type(&n_ih, TYPE_DIRECT); + set_ih_free_space(&n_ih, MAX_US_INT); + } else { + /* indirect item */ + RFALSE(!cpy_bytes && get_ih_free_space(ih), + "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + (ih_item_len(ih) - + cpy_bytes) / UNFM_P_SIZE * + dest->b_size); + set_le_ih_k_type(&n_ih, TYPE_INDIRECT); + set_ih_free_space(&n_ih, get_ih_free_space(ih)); + } + + /* set item length */ + put_ih_item_len(&n_ih, cpy_bytes); + + /* Endian safe, both le */ + n_ih.ih_version = ih->ih_version; + + leaf_insert_into_buf(dest_bi, 0, &n_ih, + item_body(src, item_num) + + ih_item_len(ih) - cpy_bytes, 0); + } + } +} + +/* + * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE + * to DEST. If cpy_bytes not equal to minus one than copy cpy_num-1 whole + * items from SOURCE to DEST. From last item copy cpy_num bytes for regular + * item and cpy_num directory entries for directory item. + */ +static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, + int last_first, int cpy_num, int cpy_bytes) +{ + struct buffer_head *dest; + int pos, i, src_nr_item, bytes; + + dest = dest_bi->bi_bh; + RFALSE(!dest || !src, "vs-10210: !dest || !src"); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); + RFALSE(B_NR_ITEMS(src) < cpy_num, + "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), + cpy_num); + RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num); + + if (cpy_num == 0) + return 0; + + if (last_first == FIRST_TO_LAST) { + /* copy items to left */ + pos = 0; + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* + * copy the first item or it part or nothing to the end of + * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) + */ + i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); + cpy_num -= i; + if (cpy_num == 0) + return i; + pos += i; + if (cpy_bytes == -1) + /* + * copy first cpy_num items starting from position + * 'pos' of SOURCE to end of DEST + */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num); + else { + /* + * copy first cpy_num-1 items starting from position + * 'pos-1' of the SOURCE to the end of the DEST + */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num - 1); + + /* + * copy part of the item which number is + * cpy_num+pos-1 to the end of the DEST + */ + leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, + cpy_num + pos - 1, cpy_bytes); + } + } else { + /* copy items to right */ + src_nr_item = B_NR_ITEMS(src); + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* + * copy the last item or it part or nothing to the + * begin of the DEST + * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); + */ + i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); + + cpy_num -= i; + if (cpy_num == 0) + return i; + + pos = src_nr_item - cpy_num - i; + if (cpy_bytes == -1) { + /* + * starting from position 'pos' copy last cpy_num + * items of SOURCE to begin of DEST + */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos, cpy_num); + } else { + /* + * copy last cpy_num-1 items starting from position + * 'pos+1' of the SOURCE to the begin of the DEST; + */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos + 1, cpy_num - 1); + + /* + * copy part of the item which number is pos to + * the begin of the DEST + */ + leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, + cpy_bytes); + } + } + return i; +} + +/* + * there are types of coping: from S[0] to L[0], from S[0] to R[0], + * from R[0] to L[0]. for each of these we have to define parent and + * positions of destination and source buffers + */ +static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *first_last, + struct buffer_head *Snew) +{ + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + + /* src->b_item_order */ + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[0]; + src_bi->bi_parent = tb->FR[0]; + src_bi->bi_position = get_right_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->L[0]; + src_bi->bi_parent = tb->FL[0]; + src_bi->bi_position = get_left_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_S_TO_SNEW: + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = Snew; + dest_bi->bi_parent = NULL; + dest_bi->bi_position = 0; + *first_last = LAST_TO_FIRST; + break; + + default: + reiserfs_panic(sb_from_bi(src_bi), "vs-10250", + "shift type is unknown (%d)", shift_mode); + } + RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh, + "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", + shift_mode, src_bi->bi_bh, dest_bi->bi_bh); +} + +/* + * copy mov_num items and mov_bytes of the (mov_num-1)th item to + * neighbor. Delete them from source + */ +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, + int mov_bytes, struct buffer_head *Snew) +{ + int ret_value; + struct buffer_info dest_bi, src_bi; + int first_last; + + leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi, + &first_last, Snew); + + ret_value = + leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num, + mov_bytes); + + leaf_delete_items(&src_bi, first_last, + (first_last == + FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) - + mov_num), mov_num, mov_bytes); + + return ret_value; +} + +/* + * Shift shift_num items (and shift_bytes of last shifted item if + * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key + */ +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) +{ + struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); + int i; + + /* + * move shift_num (and shift_bytes bytes) items from S[0] + * to left neighbor L[0] + */ + i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); + + if (shift_num) { + /* number of items in S[0] == 0 */ + if (B_NR_ITEMS(S0) == 0) { + + RFALSE(shift_bytes != -1, + "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", + shift_bytes); +#ifdef CONFIG_REISERFS_CHECK + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { + print_cur_tb("vs-10275"); + reiserfs_panic(tb->tb_sb, "vs-10275", + "balance condition corrupted " + "(%c)", tb->tb_mode); + } +#endif + + if (PATH_H_POSITION(tb->tb_path, 1) == 0) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + + } else { + /* replace lkey in CFL[0] by 0-th key from S[0]; */ + replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); + + RFALSE((shift_bytes != -1 && + !(is_direntry_le_ih(item_head(S0, 0)) + && !ih_entry_count(item_head(S0, 0)))) && + (!op_is_left_mergeable + (leaf_key(S0, 0), S0->b_size)), + "vs-10280: item must be mergeable"); + } + } + + return i; +} + +/* CLEANING STOPPED HERE */ + +/* + * Shift shift_num (shift_bytes) items from S[0] to the right neighbor, + * and replace the delimiting key + */ +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) +{ + int ret_value; + + /* + * move shift_num (and shift_bytes) items from S[0] to + * right neighbor R[0] + */ + ret_value = + leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); + + /* replace rkey in CFR[0] by the 0-th key from R[0] */ + if (shift_num) { + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + } + + return ret_value; +} + +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num); +/* + * If del_bytes == -1, starting from position 'first' delete del_num + * items in whole in buffer CUR. + * If not. + * If last_first == 0. Starting from position 'first' delete del_num-1 + * items in whole. Delete part of body of the first item. Part defined by + * del_bytes. Don't delete first item header + * If last_first == 1. Starting from position 'first+1' delete del_num-1 + * items in whole. Delete part of body of the last item . Part defined by + * del_bytes. Don't delete last item header. +*/ +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, + int first, int del_num, int del_bytes) +{ + struct buffer_head *bh; + int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh); + + RFALSE(!bh, "10155: bh is not defined"); + RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d", + del_num); + RFALSE(first < 0 + || first + del_num > item_amount, + "10165: invalid number of first item to be deleted (%d) or " + "no so much items (%d) to delete (only %d)", first, + first + del_num, item_amount); + + if (del_num == 0) + return; + + if (first == 0 && del_num == item_amount && del_bytes == -1) { + make_empty_node(cur_bi); + do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0); + return; + } + + if (del_bytes == -1) + /* delete del_num items beginning from item in position first */ + leaf_delete_items_entirely(cur_bi, first, del_num); + else { + if (last_first == FIRST_TO_LAST) { + /* + * delete del_num-1 items beginning from + * item in position first + */ + leaf_delete_items_entirely(cur_bi, first, del_num - 1); + + /* + * delete the part of the first item of the bh + * do not delete item header + */ + leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); + } else { + struct item_head *ih; + int len; + + /* + * delete del_num-1 items beginning from + * item in position first+1 + */ + leaf_delete_items_entirely(cur_bi, first + 1, + del_num - 1); + + ih = item_head(bh, B_NR_ITEMS(bh) - 1); + if (is_direntry_le_ih(ih)) + /* the last item is directory */ + /* + * len = numbers of directory entries + * in this item + */ + len = ih_entry_count(ih); + else + /* len = body len of item */ + len = ih_item_len(ih); + + /* + * delete the part of the last item of the bh + * do not delete item header + */ + leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, + len - del_bytes, del_bytes); + } + } +} + +/* insert item into the leaf node in position before */ +void leaf_insert_into_buf(struct buffer_info *bi, int before, + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, + int zeros_number) +{ + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + char *to; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE, + "vs-10170: not enough free space in block %z, new item %h", + bh, inserted_item_ih); + RFALSE(zeros_number > ih_item_len(inserted_item_ih), + "vs-10172: zero number == %d, item length == %d", + zeros_number, ih_item_len(inserted_item_ih)); + + /* get item new item must be inserted before */ + ih = item_head(bh, before); + + /* prepare space for the body of new item */ + last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size; + unmoved_loc = before ? ih_location(ih - 1) : bh->b_size; + + memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), + bh->b_data + last_loc, unmoved_loc - last_loc); + + to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); + memset(to, 0, zeros_number); + to += zeros_number; + + /* copy body to prepared space */ + if (inserted_item_body) + memmove(to, inserted_item_body, + ih_item_len(inserted_item_ih) - zeros_number); + else + memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); + + /* insert item header */ + memmove(ih + 1, ih, IH_SIZE * (nr - before)); + memmove(ih, inserted_item_ih, IH_SIZE); + + /* change locations */ + for (i = before; i < nr + 1; i++) { + unmoved_loc -= ih_item_len(&ih[i - before]); + put_ih_location(&ih[i - before], unmoved_loc); + } + + /* sizes, free space, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, + free_space - (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_leaf_dirty(bi->tb, bh, 1); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* + * paste paste_size bytes to affected_item_num-th item. + * When item is a directory, this only prepare space for new entries + */ +void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, + int pos_in_item, int paste_size, + const char *body, int zeros_number) +{ + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < paste_size, + "vs-10175: not enough free space: needed %d, available %d", + paste_size, free_space); + +#ifdef CONFIG_REISERFS_CHECK + if (zeros_number > paste_size) { + struct super_block *sb = NULL; + if (bi && bi->tb) + sb = bi->tb->tb_sb; + print_cur_tb("10177"); + reiserfs_panic(sb, "vs-10177", + "zeros_number == %d, paste_size == %d", + zeros_number, paste_size); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* item to be appended */ + ih = item_head(bh, affected_item_num); + + last_loc = ih_location(&ih[nr - affected_item_num - 1]); + unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size; + + /* prepare space */ + memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc, + unmoved_loc - last_loc); + + /* change locations */ + for (i = affected_item_num; i < nr; i++) + put_ih_location(&ih[i - affected_item_num], + ih_location(&ih[i - affected_item_num]) - + paste_size); + + if (body) { + if (!is_direntry_le_ih(ih)) { + if (!pos_in_item) { + /* shift data to right */ + memmove(bh->b_data + ih_location(ih) + + paste_size, + bh->b_data + ih_location(ih), + ih_item_len(ih)); + /* paste data in the head of item */ + memset(bh->b_data + ih_location(ih), 0, + zeros_number); + memcpy(bh->b_data + ih_location(ih) + + zeros_number, body, + paste_size - zeros_number); + } else { + memset(bh->b_data + unmoved_loc - paste_size, 0, + zeros_number); + memcpy(bh->b_data + unmoved_loc - paste_size + + zeros_number, body, + paste_size - zeros_number); + } + } + } else + memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); + + put_ih_item_len(ih, ih_item_len(ih) + paste_size); + + /* change free space */ + set_blkh_free_space(blkh, free_space - paste_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + paste_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* + * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item + * does not have free space, so it moves DEHs and remaining records as + * necessary. Return value is size of removed part of directory item + * in bytes. + */ +static int leaf_cut_entries(struct buffer_head *bh, + struct item_head *ih, int from, int del_count) +{ + char *item; + struct reiserfs_de_head *deh; + int prev_record_offset; /* offset of record, that is (from-1)th */ + char *prev_record; /* */ + int cut_records_len; /* length of all removed records */ + int i; + + /* + * make sure that item is directory and there are enough entries to + * remove + */ + RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); + RFALSE(ih_entry_count(ih) < from + del_count, + "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d", + ih_entry_count(ih), from, del_count); + + if (del_count == 0) + return 0; + + /* first byte of item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* + * first byte of remaining entries, those are BEFORE cut entries + * (prev_record) and length of all removed records (cut_records_len) + */ + prev_record_offset = + (from ? deh_location(&deh[from - 1]) : ih_item_len(ih)); + cut_records_len = prev_record_offset /*from_record */ - + deh_location(&deh[from + del_count - 1]); + prev_record = item + prev_record_offset; + + /* adjust locations of remaining entries */ + for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--) + put_deh_location(&deh[i], + deh_location(&deh[i]) - + (DEH_SIZE * del_count)); + + for (i = 0; i < from; i++) + put_deh_location(&deh[i], + deh_location(&deh[i]) - (DEH_SIZE * del_count + + cut_records_len)); + + put_ih_entry_count(ih, ih_entry_count(ih) - del_count); + + /* shift entry head array and entries those are AFTER removed entries */ + memmove((char *)(deh + from), + deh + from + del_count, + prev_record - cut_records_len - (char *)(deh + from + + del_count)); + + /* shift records, those are BEFORE removed entries */ + memmove(prev_record - cut_records_len - DEH_SIZE * del_count, + prev_record, item + ih_item_len(ih) - prev_record); + + return DEH_SIZE * del_count + cut_records_len; +} + +/* + * when cut item is part of regular file + * pos_in_item - first byte that must be cut + * cut_size - number of bytes to be cut beginning from pos_in_item + * + * when cut item is part of directory + * pos_in_item - number of first deleted entry + * cut_size - count of deleted entries + */ +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, + int pos_in_item, int cut_size) +{ + int nr; + struct buffer_head *bh = bi->bi_bh; + struct block_head *blkh; + struct item_head *ih; + int last_loc, unmoved_loc; + int i; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + /* item head of truncated item */ + ih = item_head(bh, cut_item_num); + + if (is_direntry_le_ih(ih)) { + /* first cut entry () */ + cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size); + if (pos_in_item == 0) { + /* change key */ + RFALSE(cut_item_num, + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", + cut_item_num); + /* change item key by key of first entry in the item */ + set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); + } + } else { + /* item is direct or indirect */ + RFALSE(is_statdata_le_ih(ih), "10195: item is stat data"); + RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih), + "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", + (long unsigned)pos_in_item, (long unsigned)cut_size, + (long unsigned)ih_item_len(ih)); + + /* shift item body to left if cut is from the head of item */ + if (pos_in_item == 0) { + memmove(bh->b_data + ih_location(ih), + bh->b_data + ih_location(ih) + cut_size, + ih_item_len(ih) - cut_size); + + /* change key of item */ + if (is_direct_le_ih(ih)) + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + cut_size); + else { + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + (cut_size / UNFM_P_SIZE) * + bh->b_size); + RFALSE(ih_item_len(ih) == cut_size + && get_ih_free_space(ih), + "10205: invalid ih_free_space (%h)", ih); + } + } + } + + /* location of the last item */ + last_loc = ih_location(&ih[nr - cut_item_num - 1]); + + /* location of the item, which is remaining at the same place */ + unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size; + + /* shift */ + memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc, + unmoved_loc - last_loc - cut_size); + + /* change item length */ + put_ih_item_len(ih, ih_item_len(ih) - cut_size); + + if (is_indirect_le_ih(ih)) { + if (pos_in_item) + set_ih_free_space(ih, 0); + } + + /* change locations */ + for (i = cut_item_num; i < nr; i++) + put_ih_location(&ih[i - cut_item_num], + ih_location(&ih[i - cut_item_num]) + cut_size); + + /* size, free space */ + set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) - cut_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* delete del_num items from buffer starting from the first'th item */ +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num) +{ + struct buffer_head *bh = bi->bi_bh; + int nr; + int i, j; + int last_loc, last_removed_loc; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(bh == NULL, "10210: buffer is 0"); + RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num); + + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + RFALSE(first < 0 || first + del_num > nr, + "10220: first=%d, number=%d, there is %d items", first, del_num, + nr); + + if (first == 0 && del_num == nr) { + /* this does not work */ + make_empty_node(bi); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + return; + } + + ih = item_head(bh, first); + + /* location of unmovable item */ + j = (first == 0) ? bh->b_size : ih_location(ih - 1); + + /* delete items */ + last_loc = ih_location(&ih[nr - 1 - first]); + last_removed_loc = ih_location(&ih[del_num - 1]); + + memmove(bh->b_data + last_loc + j - last_removed_loc, + bh->b_data + last_loc, last_removed_loc - last_loc); + + /* delete item headers */ + memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE); + + /* change item location */ + for (i = first; i < nr - del_num; i++) + put_ih_location(&ih[i - first], + ih_location(&ih[i - first]) + (j - + last_removed_loc)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + (j - last_removed_loc + + IH_SIZE * del_num)); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (j - last_removed_loc + + IH_SIZE * del_num)); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* + * paste new_entry_count entries (new_dehs, records) into position + * before to item_num-th item + */ +void leaf_paste_entries(struct buffer_info *bi, + int item_num, + int before, + int new_entry_count, + struct reiserfs_de_head *new_dehs, + const char *records, int paste_size) +{ + struct item_head *ih; + char *item; + struct reiserfs_de_head *deh; + char *insert_point; + int i, old_entry_num; + struct buffer_head *bh = bi->bi_bh; + + if (new_entry_count == 0) + return; + + ih = item_head(bh, item_num); + + /* + * make sure, that item is directory, and there are enough + * records in it + */ + RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); + RFALSE(ih_entry_count(ih) < before, + "10230: there are no entry we paste entries before. entry_count = %d, before = %d", + ih_entry_count(ih), before); + + /* first byte of dest item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* new records will be pasted at this point */ + insert_point = + item + + (before ? deh_location(&deh[before - 1]) + : (ih_item_len(ih) - paste_size)); + + /* adjust locations of records that will be AFTER new records */ + for (i = ih_entry_count(ih) - 1; i >= before; i--) + put_deh_location(&deh[i], + deh_location(&deh[i]) + + (DEH_SIZE * new_entry_count)); + + /* adjust locations of records that will be BEFORE new records */ + for (i = 0; i < before; i++) + put_deh_location(&deh[i], + deh_location(&deh[i]) + paste_size); + + old_entry_num = ih_entry_count(ih); + put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); + + /* prepare space for pasted records */ + memmove(insert_point + paste_size, insert_point, + item + (ih_item_len(ih) - paste_size) - insert_point); + + /* copy new records */ + memcpy(insert_point + DEH_SIZE * new_entry_count, records, + paste_size - DEH_SIZE * new_entry_count); + + /* prepare space for new entry heads */ + deh += before; + memmove((char *)(deh + new_entry_count), deh, + insert_point - (char *)deh); + + /* copy new entry heads */ + deh = (struct reiserfs_de_head *)((char *)deh); + memcpy(deh, new_dehs, DEH_SIZE * new_entry_count); + + /* set locations of new records */ + for (i = 0; i < new_entry_count; i++) { + put_deh_location(&deh[i], + deh_location(&deh[i]) + + (-deh_location + (&new_dehs[new_entry_count - 1]) + + insert_point + DEH_SIZE * new_entry_count - + item)); + } + + /* change item key if necessary (when we paste before 0-th entry */ + if (!before) { + set_le_ih_k_offset(ih, deh_offset(new_dehs)); + } +#ifdef CONFIG_REISERFS_CHECK + { + int prev, next; + /* check record locations */ + deh = B_I_DEH(bh, ih); + for (i = 0; i < ih_entry_count(ih); i++) { + next = + (i < + ih_entry_count(ih) - + 1) ? deh_location(&deh[i + 1]) : 0; + prev = (i != 0) ? deh_location(&deh[i - 1]) : 0; + + if (prev && prev <= deh_location(&deh[i])) + reiserfs_error(sb_from_bi(bi), "vs-10240", + "directory item (%h) " + "corrupted (prev %a, " + "cur(%d) %a)", + ih, deh + i - 1, i, deh + i); + if (next && next >= deh_location(&deh[i])) + reiserfs_error(sb_from_bi(bi), "vs-10250", + "directory item (%h) " + "corrupted (cur(%d) %a, " + "next %a)", + ih, i, deh + i, deh + i + 1); + } + } +#endif + +} diff --git a/kernel/fs/reiserfs/lock.c b/kernel/fs/reiserfs/lock.c new file mode 100644 index 000000000..045b83ef9 --- /dev/null +++ b/kernel/fs/reiserfs/lock.c @@ -0,0 +1,100 @@ +#include "reiserfs.h" +#include + +/* + * The previous reiserfs locking scheme was heavily based on + * the tricky properties of the Bkl: + * + * - it was acquired recursively by a same task + * - the performances relied on the release-while-schedule() property + * + * Now that we replace it by a mutex, we still want to keep the same + * recursive property to avoid big changes in the code structure. + * We use our own lock_owner here because the owner field on a mutex + * is only available in SMP or mutex debugging, also we only need this field + * for this mutex, no need for a system wide mutex facility. + * + * Also this lock is often released before a call that could block because + * reiserfs performances were partially based on the release while schedule() + * property of the Bkl. + */ +void reiserfs_write_lock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + } + + /* No need to protect it, only the current task touches it */ + sb_i->lock_depth++; +} + +void reiserfs_write_unlock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* + * Are we unlocking without even holding the lock? + * Such a situation must raise a BUG() if we don't want + * to corrupt the data. + */ + BUG_ON(sb_i->lock_owner != current); + + if (--sb_i->lock_depth == -1) { + sb_i->lock_owner = NULL; + mutex_unlock(&sb_i->lock); + } +} + +int __must_check reiserfs_write_unlock_nested(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + int depth; + + /* this can happen when the lock isn't always held */ + if (sb_i->lock_owner != current) + return -1; + + depth = sb_i->lock_depth; + + sb_i->lock_depth = -1; + sb_i->lock_owner = NULL; + mutex_unlock(&sb_i->lock); + + return depth; +} + +void reiserfs_write_lock_nested(struct super_block *s, int depth) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* this can happen when the lock isn't always held */ + if (depth == -1) + return; + + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + sb_i->lock_depth = depth; +} + +/* + * Utility function to force a BUG if it is called without the superblock + * write lock held. caller is the string printed just before calling BUG() + */ +void reiserfs_check_lock_depth(struct super_block *sb, char *caller) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + WARN_ON(sb_i->lock_depth < 0); +} + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *sb) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); +} +#endif diff --git a/kernel/fs/reiserfs/namei.c b/kernel/fs/reiserfs/namei.c new file mode 100644 index 000000000..b55a07465 --- /dev/null +++ b/kernel/fs/reiserfs/namei.c @@ -0,0 +1,1659 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. + * + * NO WARRANTY + */ + +#include +#include +#include +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h" +#include + +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } +#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); + +/* + * directory item contains array of entry headers. This performs + * binary search through that array + */ +static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) +{ + struct item_head *ih = de->de_ih; + struct reiserfs_de_head *deh = de->de_deh; + int rbound, lbound, j; + + lbound = 0; + rbound = ih_entry_count(ih) - 1; + + for (j = (rbound + lbound) / 2; lbound <= rbound; + j = (rbound + lbound) / 2) { + if (off < deh_offset(deh + j)) { + rbound = j - 1; + continue; + } + if (off > deh_offset(deh + j)) { + lbound = j + 1; + continue; + } + /* this is not name found, but matched third key component */ + de->de_entry_num = j; + return NAME_FOUND; + } + + de->de_entry_num = lbound; + return NAME_NOT_FOUND; +} + +/* + * comment? maybe something like set de to point to what the path points to? + */ +static inline void set_de_item_location(struct reiserfs_dir_entry *de, + struct treepath *path) +{ + de->de_bh = get_last_bh(path); + de->de_ih = tp_item_head(path); + de->de_deh = B_I_DEH(de->de_bh, de->de_ih); + de->de_item_num = PATH_LAST_POSITION(path); +} + +/* + * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set + */ +inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) +{ + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + + de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); + de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0); + de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh); + if (de->de_name[de->de_namelen - 1] == 0) + de->de_namelen = strlen(de->de_name); +} + +/* what entry points to */ +static inline void set_de_object_key(struct reiserfs_dir_entry *de) +{ + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]); + de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]); +} + +static inline void store_de_entry_key(struct reiserfs_dir_entry *de) +{ + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + + /* store key of the found entry */ + de->de_entry_key.version = KEY_FORMAT_3_5; + de->de_entry_key.on_disk_key.k_dir_id = + le32_to_cpu(de->de_ih->ih_key.k_dir_id); + de->de_entry_key.on_disk_key.k_objectid = + le32_to_cpu(de->de_ih->ih_key.k_objectid); + set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh)); + set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY); +} + +/* + * We assign a key to each directory item, and place multiple entries in a + * single directory item. A directory item has a key equal to the key of + * the first directory entry in it. + + * This function first calls search_by_key, then, if item whose first entry + * matches is not found it looks for the entry inside directory item found + * by search_by_key. Fills the path to the entry, and to the entry position + * in the item + */ +/* The function is NOT SCHEDULE-SAFE! */ +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *path, struct reiserfs_dir_entry *de) +{ + int retval; + + retval = search_item(sb, key, path); + switch (retval) { + case ITEM_NOT_FOUND: + if (!PATH_LAST_POSITION(path)) { + reiserfs_error(sb, "vs-7000", "search_by_key " + "returned item position == 0"); + pathrelse(path); + return IO_ERROR; + } + PATH_LAST_POSITION(path)--; + + case ITEM_FOUND: + break; + + case IO_ERROR: + return retval; + + default: + pathrelse(path); + reiserfs_error(sb, "vs-7002", "no path to here"); + return IO_ERROR; + } + + set_de_item_location(de, path); + +#ifdef CONFIG_REISERFS_CHECK + if (!is_direntry_le_ih(de->de_ih) || + COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) { + print_block(de->de_bh, 0, -1, -1); + reiserfs_panic(sb, "vs-7005", "found item %h is not directory " + "item or does not belong to the same directory " + "as key %K", de->de_ih, key); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* + * binary search in directory item by third component of the + * key. sets de->de_entry_num of de + */ + retval = bin_search_in_dir_item(de, cpu_key_k_offset(key)); + path->pos_in_item = de->de_entry_num; + if (retval != NAME_NOT_FOUND) { + /* + * ugly, but rename needs de_bh, de_deh, de_name, + * de_namelen, de_objectid set + */ + set_de_name_and_namelen(de); + set_de_object_key(de); + } + return retval; +} + +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ + +/* + * The third component is hashed, and you can choose from more than + * one hash function. Per directory hashes are not yet implemented + * but are thought about. This function should be moved to hashes.c + * Jedi, please do so. -Hans + */ +static __u32 get_third_component(struct super_block *s, + const char *name, int len) +{ + __u32 res; + + if (!len || (len == 1 && name[0] == '.')) + return DOT_OFFSET; + if (len == 2 && name[0] == '.' && name[1] == '.') + return DOT_DOT_OFFSET; + + res = REISERFS_SB(s)->s_hash_function(name, len); + + /* take bits from 7-th to 30-th including both bounds */ + res = GET_HASH_VALUE(res); + if (res == 0) + /* + * needed to have no names before "." and ".." those have hash + * value == 0 and generation conters 1 and 2 accordingly + */ + res = 128; + return res + MAX_GENERATION_NUMBER; +} + +static int reiserfs_match(struct reiserfs_dir_entry *de, + const char *name, int namelen) +{ + int retval = NAME_NOT_FOUND; + + if ((namelen == de->de_namelen) && + !memcmp(de->de_name, name, de->de_namelen)) + retval = + (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND : + NAME_FOUND_INVISIBLE); + + return retval; +} + +/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ + +/* used when hash collisions exist */ + +static int linear_search_in_dir_item(struct cpu_key *key, + struct reiserfs_dir_entry *de, + const char *name, int namelen) +{ + struct reiserfs_de_head *deh = de->de_deh; + int retval; + int i; + + i = de->de_entry_num; + + if (i == ih_entry_count(de->de_ih) || + GET_HASH_VALUE(deh_offset(deh + i)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + i--; + } + + RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih), + "vs-7010: array of entry headers not found"); + + deh += i; + + for (; i >= 0; i--, deh--) { + /* hash value does not match, no need to check whole name */ + if (GET_HASH_VALUE(deh_offset(deh)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + return NAME_NOT_FOUND; + } + + /* mark that this generation number is used */ + if (de->de_gen_number_bit_string) + set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), + de->de_gen_number_bit_string); + + /* calculate pointer to name and namelen */ + de->de_entry_num = i; + set_de_name_and_namelen(de); + + /* + * de's de_name, de_namelen, de_recordlen are set. + * Fill the rest. + */ + if ((retval = + reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { + + /* key of pointed object */ + set_de_object_key(de); + + store_de_entry_key(de); + + /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */ + return retval; + } + } + + if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) + /* + * we have reached left most entry in the node. In common we + * have to go to the left neighbor, but if generation counter + * is 0 already, we know for sure, that there is no name with + * the same hash value + */ + /* + * FIXME: this work correctly only because hash value can not + * be 0. Btw, in case of Yura's hash it is probably possible, + * so, this is a bug + */ + return NAME_NOT_FOUND; + + RFALSE(de->de_item_num, + "vs-7015: two diritems of the same directory in one node?"); + + return GOTO_PREVIOUS_ITEM; +} + +/* + * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND + * FIXME: should add something like IOERROR + */ +static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, + struct treepath *path_to_entry, + struct reiserfs_dir_entry *de) +{ + struct cpu_key key_to_search; + int retval; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return NAME_NOT_FOUND; + + /* we will search for this key in the tree */ + make_cpu_key(&key_to_search, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + while (1) { + retval = + search_by_entry_key(dir->i_sb, &key_to_search, + path_to_entry, de); + if (retval == IO_ERROR) { + reiserfs_error(dir->i_sb, "zam-7001", "io error"); + return IO_ERROR; + } + + /* compare names for all entries having given hash value */ + retval = + linear_search_in_dir_item(&key_to_search, de, name, + namelen); + /* + * there is no need to scan directory anymore. + * Given entry found or does not exist + */ + if (retval != GOTO_PREVIOUS_ITEM) { + path_to_entry->pos_in_item = de->de_entry_num; + return retval; + } + + /* + * there is left neighboring item of this directory + * and given entry can be there + */ + set_cpu_key_k_offset(&key_to_search, + le_ih_k_offset(de->de_ih) - 1); + pathrelse(path_to_entry); + + } /* while (1) */ +} + +static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int retval; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + + if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) + return ERR_PTR(-ENAMETOOLONG); + + reiserfs_write_lock(dir->i_sb); + + de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval == NAME_FOUND) { + inode = reiserfs_iget(dir->i_sb, + (struct cpu_key *)&de.de_dir_id); + if (!inode || IS_ERR(inode)) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-EACCES); + } + + /* + * Propagate the private flag so we know we're + * in the priv tree + */ + if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; + } + reiserfs_write_unlock(dir->i_sb); + if (retval == IO_ERROR) { + return ERR_PTR(-EIO); + } + + return d_splice_alias(inode, dentry); +} + +/* + * looks up the dentry of the parent directory for child. + * taken from ext2_get_parent + */ +struct dentry *reiserfs_get_parent(struct dentry *child) +{ + int retval; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + struct inode *dir = d_inode(child); + + if (dir->i_nlink == 0) { + return ERR_PTR(-ENOENT); + } + de.de_gen_number_bit_string = NULL; + + reiserfs_write_lock(dir->i_sb); + retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); + } + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id); + reiserfs_write_unlock(dir->i_sb); + + return d_obtain_alias(inode); +} + +/* add entry to the directory (entry can be hidden). + +insert definition of when hidden directories are used here -Hans + + Does not mark dir inode dirty, do it after successesfull call to it */ + +static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, + struct inode *dir, const char *name, int namelen, + struct inode *inode, int visible) +{ + struct cpu_key entry_key; + struct reiserfs_de_head *deh; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); + int gen_number; + + /* + * 48 bytes now and we avoid kmalloc if we + * create file with short name + */ + char small_buf[32 + DEH_SIZE]; + + char *buffer; + int buflen, paste_size; + int retval; + + BUG_ON(!th->t_trans_id); + + /* cannot allow items to be added into a busy deleted directory */ + if (!namelen) + return -EINVAL; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return -ENAMETOOLONG; + + /* each entry has unique key. compose it */ + make_cpu_key(&entry_key, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + /* get memory for composing the entry */ + buflen = DEH_SIZE + ROUND_UP(namelen); + if (buflen > sizeof(small_buf)) { + buffer = kmalloc(buflen, GFP_NOFS); + if (!buffer) + return -ENOMEM; + } else + buffer = small_buf; + + paste_size = + (get_inode_sd_version(dir) == + STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; + + /* + * fill buffer : directory entry head, name[, dir objectid | , + * stat data | ,stat data, dir objectid ] + */ + deh = (struct reiserfs_de_head *)buffer; + deh->deh_location = 0; /* JDM Endian safe if 0 */ + put_deh_offset(deh, cpu_key_k_offset(&entry_key)); + deh->deh_state = 0; /* JDM Endian safe if 0 */ + /* put key (ino analog) to de */ + + /* safe: k_dir_id is le */ + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; + /* safe: k_objectid is le */ + deh->deh_objectid = INODE_PKEY(inode)->k_objectid; + + /* copy name */ + memcpy((char *)(deh + 1), name, namelen); + /* padd by 0s to the 4 byte boundary */ + padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen); + + /* + * entry is ready to be pasted into tree, set 'visibility' + * and 'stat data in entry' attributes + */ + mark_de_without_sd(deh); + visible ? mark_de_visible(deh) : mark_de_hidden(deh); + + /* find the proper place for the new entry */ + memset(bit_string, 0, sizeof(bit_string)); + de.de_gen_number_bit_string = bit_string; + retval = reiserfs_find_entry(dir, name, namelen, &path, &de); + if (retval != NAME_NOT_FOUND) { + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + + if (retval == IO_ERROR) { + return -EIO; + } + + if (retval != NAME_FOUND) { + reiserfs_error(dir->i_sb, "zam-7002", + "reiserfs_find_entry() returned " + "unexpected value (%d)", retval); + } + + return -EEXIST; + } + + gen_number = + find_first_zero_bit(bit_string, + MAX_GENERATION_NUMBER + 1); + if (gen_number > MAX_GENERATION_NUMBER) { + /* there is no free generation number */ + reiserfs_warning(dir->i_sb, "reiserfs-7010", + "Congratulations! we have got hash function " + "screwed up"); + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + return -EBUSY; + } + /* adjust offset of directory enrty */ + put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); + set_cpu_key_k_offset(&entry_key, deh_offset(deh)); + + /* update max-hash-collisions counter in reiserfs_sb_info */ + PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number); + + /* we need to re-search for the insertion point */ + if (gen_number != 0) { + if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != + NAME_NOT_FOUND) { + reiserfs_warning(dir->i_sb, "vs-7032", + "entry with this key (%K) already " + "exists", &entry_key); + + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + return -EBUSY; + } + } + + /* perform the insertion of the entry that we have prepared */ + retval = + reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, + paste_size); + if (buffer != small_buf) + kfree(buffer); + if (retval) { + reiserfs_check_path(&path); + return retval; + } + + dir->i_size += paste_size; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (!S_ISDIR(inode->i_mode) && visible) + /* reiserfs_mkdir or reiserfs_rename will do that by itself */ + reiserfs_update_sd(th, dir); + + reiserfs_check_path(&path); + return 0; +} + +/* + * quota utility function, call if you've had to abort after calling + * new_inode_init, and have not called reiserfs_new_inode yet. + * This should only be called on inodes that do not have stat data + * inserted into the tree yet. + */ +static int drop_new_inode(struct inode *inode) +{ + dquot_drop(inode); + make_bad_inode(inode); + inode->i_flags |= S_NOQUOTA; + iput(inode); + return 0; +} + +/* + * utility function that does setup for reiserfs_new_inode. + * dquot_initialize needs lots of credits so it's better to have it + * outside of a transaction, so we had to pull some bits of + * reiserfs_new_inode out into this func. + */ +static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) +{ + /* + * Make inode invalid - just in case we are going to drop it before + * the initialization happens + */ + INODE_PKEY(inode)->k_objectid = 0; + + /* + * the quota init calls have to know who to charge the quota to, so + * we have to set uid and gid here + */ + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + return 0; +} + +static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + int retval; + struct inode *inode; + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + + dquot_initialize(dir); + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, + inode, &security); + if (retval) + goto out_failed; + + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + drop_nlink(inode); + reiserfs_update_sd(&th, inode); + err = journal_end(&th); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th); + +out_failed: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + int retval; + struct inode *inode; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, + inode, &security); + if (retval) { + goto out_failed; + } + + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + + /* FIXME: needed for block and char devices only */ + reiserfs_update_sd(&th, inode); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + drop_nlink(inode); + reiserfs_update_sd(&th, inode); + err = journal_end(&th); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th); + +out_failed: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int retval; + struct inode *inode; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + + dquot_initialize(dir); + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* + * set flag that new packing locality created and new blocks + * for the content of that directory are not displaced yet + */ + REISERFS_I(dir)->new_packing_locality = 1; +#endif + mode = S_IFDIR | mode; + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + /* + * inc the link count now, so another writer doesn't overflow + * it while we sleep later on. + */ + INC_DIR_INODE_NLINK(dir) + + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , + old_format_only(dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode, &security); + if (retval) { + DEC_DIR_INODE_NLINK(dir) + goto out_failed; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + + /* note, _this_ add_entry will not update dir's stat data */ + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + clear_nlink(inode); + DEC_DIR_INODE_NLINK(dir); + reiserfs_update_sd(&th, inode); + err = journal_end(&th); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + /* the above add_entry did not update dir's stat data */ + reiserfs_update_sd(&th, dir); + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th); +out_failed: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static inline int reiserfs_empty_dir(struct inode *inode) +{ + /* + * we can cheat because an old format dir cannot have + * EMPTY_DIR_SIZE, and a new format dir cannot have + * EMPTY_DIR_SIZE_V1. So, if the inode is either size, + * regardless of disk format version, the directory is empty. + */ + if (inode->i_size != EMPTY_DIR_SIZE && + inode->i_size != EMPTY_DIR_SIZE_V1) { + return 0; + } + return 1; +} + +static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval, err; + struct inode *inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + + /* + * we will be doing 2 balancings and update 2 stat data, we + * change quotas of the owner of the directory and of the owner + * of the parent directory. The quota structure is possibly + * deleted only on last iput => outside of this transaction + */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + dquot_initialize(dir); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_rmdir; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_rmdir; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_rmdir; + } + + inode = d_inode(dentry); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + /* + * FIXME: compare key of an object and a key found in the entry + */ + retval = -EIO; + goto end_rmdir; + } + if (!reiserfs_empty_dir(inode)) { + retval = -ENOTEMPTY; + goto end_rmdir; + } + + /* cut entry from dir directory */ + retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key, + dir, NULL, /* page */ + 0 /*new file size - not used here */ ); + if (retval < 0) + goto end_rmdir; + + if (inode->i_nlink != 2 && inode->i_nlink != 1) + reiserfs_error(inode->i_sb, "reiserfs-7040", + "empty directory has nlink != 2 (%d)", + inode->i_nlink); + + clear_nlink(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + DEC_DIR_INODE_NLINK(dir) + dir->i_size -= (DEH_SIZE + de.de_entrylen); + reiserfs_update_sd(&th, dir); + + /* prevent empty directory from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th); + reiserfs_check_path(&path); +out_rmdir: + reiserfs_write_unlock(dir->i_sb); + return retval; + +end_rmdir: + /* + * we must release path, because we did not call + * reiserfs_cut_from_item, or reiserfs_cut_from_item does not + * release path if operation was not complete + */ + pathrelse(&path); + err = journal_end(&th); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; +} + +static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval, err; + struct inode *inode; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path); + struct reiserfs_transaction_handle th; + int jbegin_count; + unsigned long savelink; + + dquot_initialize(dir); + + inode = d_inode(dentry); + + /* + * in this transaction we can be doing at max two balancings and + * update two stat datas, we change quotas of the owner of the + * directory and of the owner of the parent directory. The quota + * structure is possibly deleted only on iput => outside of + * this transaction + */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_unlink; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_unlink; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_unlink; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + /* + * FIXME: compare key of an object and a key found in the entry + */ + retval = -EIO; + goto end_unlink; + } + + if (!inode->i_nlink) { + reiserfs_warning(inode->i_sb, "reiserfs-7042", + "deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + set_nlink(inode, 1); + } + + drop_nlink(inode); + + /* + * we schedule before doing the add_save_link call, save the link + * count so we don't race + */ + savelink = inode->i_nlink; + + retval = + reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL, + 0); + if (retval < 0) { + inc_nlink(inode); + goto end_unlink; + } + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + dir->i_size -= (de.de_entrylen + DEH_SIZE); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, dir); + + if (!savelink) + /* prevent file from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th); + reiserfs_check_path(&path); + reiserfs_write_unlock(dir->i_sb); + return retval; + +end_unlink: + pathrelse(&path); + err = journal_end(&th); + reiserfs_check_path(&path); + if (err) + retval = err; +out_unlink: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_symlink(struct inode *parent_dir, + struct dentry *dentry, const char *symname) +{ + int retval; + struct inode *inode; + char *name; + int item_len; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + int mode = S_IFLNK | S_IRWXUGO; + /* + * We need blocks for transaction + (user+group)*(quotas for + * new inode + update of quota for directory owner) + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + + dquot_initialize(parent_dir); + + if (!(inode = new_inode(parent_dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, parent_dir, mode); + + retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, + &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + + reiserfs_write_lock(parent_dir->i_sb); + item_len = ROUND_UP(strlen(symname)); + if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) { + retval = -ENAMETOOLONG; + drop_new_inode(inode); + goto out_failed; + } + + name = kmalloc(item_len, GFP_NOFS); + if (!name) { + drop_new_inode(inode); + retval = -ENOMEM; + goto out_failed; + } + memcpy(name, symname, strlen(symname)); + padd_item(name, item_len, strlen(symname)); + + retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + kfree(name); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), + dentry, inode, &security); + kfree(name); + if (retval) { /* reiserfs_new_inode iputs for us */ + goto out_failed; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(parent_dir); + + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + drop_nlink(inode); + reiserfs_update_sd(&th, inode); + err = journal_end(&th); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th); +out_failed: + reiserfs_write_unlock(parent_dir->i_sb); + return retval; +} + +static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + struct inode *inode = d_inode(old_dentry); + struct reiserfs_transaction_handle th; + /* + * We need blocks for transaction + update of quotas for + * the owners of the directory + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + dquot_initialize(dir); + + reiserfs_write_lock(dir->i_sb); + if (inode->i_nlink >= REISERFS_LINK_MAX) { + /* FIXME: sd_nlink is 32 bit for new files */ + reiserfs_write_unlock(dir->i_sb); + return -EMLINK; + } + + /* inc before scheduling so reiserfs_unlink knows we are here */ + inc_nlink(inode); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_nlink(inode); + reiserfs_write_unlock(dir->i_sb); + return retval; + } + + /* create new entry */ + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (retval) { + int err; + drop_nlink(inode); + err = journal_end(&th); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; + } + + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + ihold(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th); + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +/* de contains information pointing to an entry which */ +static int de_still_valid(const char *name, int len, + struct reiserfs_dir_entry *de) +{ + struct reiserfs_dir_entry tmp = *de; + + /* recalculate pointer to name and name length */ + set_de_name_and_namelen(&tmp); + /* FIXME: could check more */ + if (tmp.de_namelen != len || memcmp(name, de->de_name, len)) + return 0; + return 1; +} + +static int entry_points_to_object(const char *name, int len, + struct reiserfs_dir_entry *de, + struct inode *inode) +{ + if (!de_still_valid(name, len, de)) + return 0; + + if (inode) { + if (!de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(inode->i_sb, "vs-7042", + "entry must be visible"); + return (de->de_objectid == inode->i_ino) ? 1 : 0; + } + + /* this must be added hidden entry */ + if (de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(NULL, "vs-7043", "entry must be visible"); + + return 1; +} + +/* sets key of objectid the entry has to point to */ +static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, + struct reiserfs_key *key) +{ + /* JDM These operations are endian safe - both are le */ + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; +} + +/* + * process, that is going to call fix_nodes/do_balance must hold only + * one path. If it holds 2 or more, it can get into endless waiting in + * get_empty_nodes or its clones + */ +static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int retval; + INITIALIZE_PATH(old_entry_path); + INITIALIZE_PATH(new_entry_path); + INITIALIZE_PATH(dot_dot_entry_path); + struct item_head new_entry_ih, old_entry_ih, dot_dot_ih; + struct reiserfs_dir_entry old_de, new_de, dot_dot_de; + struct inode *old_inode, *new_dentry_inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + umode_t old_inode_mode; + unsigned long savelink = 1; + struct timespec ctime; + + /* + * three balancings: (1) old name removal, (2) new name insertion + * and (3) maybe "save" link insertion + * stat data updates: (1) old directory, + * (2) new directory and (3) maybe old object stat data (when it is + * directory) and (4) maybe stat data of object to which new entry + * pointed initially and (5) maybe block containing ".." of + * renamed directory + * quota updates: two parent directories + */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 5 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_inode = d_inode(old_dentry); + new_dentry_inode = d_inode(new_dentry); + + /* + * make sure that oldname still exists and points to an object we + * are going to rename + */ + old_de.de_gen_number_bit_string = NULL; + reiserfs_write_lock(old_dir->i_sb); + retval = + reiserfs_find_entry(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_entry_path, + &old_de); + pathrelse(&old_entry_path); + if (retval == IO_ERROR) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOENT; + } + + old_inode_mode = old_inode->i_mode; + if (S_ISDIR(old_inode_mode)) { + /* + * make sure that directory being renamed has correct ".." + * and that its new parent directory has not too many links + * already + */ + if (new_dentry_inode) { + if (!reiserfs_empty_dir(new_dentry_inode)) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOTEMPTY; + } + } + + /* + * directory is renamed, its parent directory will be changed, + * so find ".." entry + */ + dot_dot_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path, + &dot_dot_de); + pathrelse(&dot_dot_entry_path); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + /* inode number of .. must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + } + + retval = journal_begin(&th, old_dir->i_sb, jbegin_count); + if (retval) { + reiserfs_write_unlock(old_dir->i_sb); + return retval; + } + + /* add new entry (or find the existing one) */ + retval = + reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, old_inode, 0); + if (retval == -EEXIST) { + if (!new_dentry_inode) { + reiserfs_panic(old_dir->i_sb, "vs-7050", + "new entry is found, new inode == 0"); + } + } else if (retval) { + int err = journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return err ? err : retval; + } + + reiserfs_update_inode_transaction(old_dir); + reiserfs_update_inode_transaction(new_dir); + + /* + * this makes it so an fsync on an open fd for the old name will + * commit the rename operation + */ + reiserfs_update_inode_transaction(old_inode); + + if (new_dentry_inode) + reiserfs_update_inode_transaction(new_dentry_inode); + + while (1) { + /* + * look for old name using corresponding entry key + * (found by reiserfs_find_entry) + */ + if ((retval = + search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key, + &old_entry_path, + &old_de)) != NAME_FOUND) { + pathrelse(&old_entry_path); + journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1); + + /* look for new name by reiserfs_find_entry */ + new_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, &new_entry_path, + &new_de); + /* + * reiserfs_add_entry should not return IO_ERROR, + * because it is called with essentially same parameters from + * reiserfs_add_entry above, and we'll catch any i/o errors + * before we get here. + */ + if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); + + if (S_ISDIR(old_inode->i_mode)) { + if ((retval = + search_by_entry_key(new_dir->i_sb, + &dot_dot_de.de_entry_key, + &dot_dot_entry_path, + &dot_dot_de)) != NAME_FOUND) { + pathrelse(&dot_dot_entry_path); + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + copy_item_head(&dot_dot_ih, + tp_item_head(&dot_dot_entry_path)); + /* node containing ".." gets into transaction */ + reiserfs_prepare_for_journal(old_inode->i_sb, + dot_dot_de.de_bh, 1); + } + /* + * we should check seals here, not do + * this stuff, yes? Then, having + * gathered everything into RAM we + * should lock the buffers, yes? -Hans + */ + /* + * probably. our rename needs to hold more + * than one path at once. The seals would + * have to be written to deal with multi-path + * issues -chris + */ + /* + * sanity checking before doing the rename - avoid races many + * of the above checks could have scheduled. We have to be + * sure our items haven't been shifted by another process. + */ + if (item_moved(&new_entry_ih, &new_entry_path) || + !entry_points_to_object(new_dentry->d_name.name, + new_dentry->d_name.len, + &new_de, new_dentry_inode) || + item_moved(&old_entry_ih, &old_entry_path) || + !entry_points_to_object(old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de, old_inode)) { + reiserfs_restore_prepared_buffer(old_inode->i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode->i_sb, + old_de.de_bh); + if (S_ISDIR(old_inode_mode)) + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. + de_bh); + continue; + } + if (S_ISDIR(old_inode_mode)) { + if (item_moved(&dot_dot_ih, &dot_dot_entry_path) || + !entry_points_to_object("..", 2, &dot_dot_de, + old_dir)) { + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + old_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. + de_bh); + continue; + } + } + + RFALSE(S_ISDIR(old_inode_mode) && + !buffer_journal_prepared(dot_dot_de.de_bh), ""); + + break; + } + + /* + * ok, all the changes can be done in one fell swoop when we + * have claimed all the buffers needed. + */ + + mark_de_visible(new_de.de_deh + new_de.de_entry_num); + set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode)); + journal_mark_dirty(&th, new_de.de_bh); + + mark_de_hidden(old_de.de_deh + old_de.de_entry_num); + journal_mark_dirty(&th, old_de.de_bh); + ctime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + /* + * thanks to Alex Adriaanse for patch + * which adds ctime update of renamed object + */ + old_inode->i_ctime = ctime; + + if (new_dentry_inode) { + /* adjust link number of the victim */ + if (S_ISDIR(new_dentry_inode->i_mode)) { + clear_nlink(new_dentry_inode); + } else { + drop_nlink(new_dentry_inode); + } + new_dentry_inode->i_ctime = ctime; + savelink = new_dentry_inode->i_nlink; + } + + if (S_ISDIR(old_inode_mode)) { + /* adjust ".." of renamed directory */ + set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); + journal_mark_dirty(&th, dot_dot_de.de_bh); + + /* + * there (in new_dir) was no directory, so it got new link + * (".." of renamed directory) + */ + if (!new_dentry_inode) + INC_DIR_INODE_NLINK(new_dir); + + /* old directory lost one link - ".. " of renamed directory */ + DEC_DIR_INODE_NLINK(old_dir); + } + /* + * looks like in 2.3.99pre3 brelse is atomic. + * so we can use pathrelse + */ + pathrelse(&new_entry_path); + pathrelse(&dot_dot_entry_path); + + /* + * FIXME: this reiserfs_cut_from_item's return value may screw up + * anybody, but it will panic if will not be able to find the + * entry. This needs one more clean up + */ + if (reiserfs_cut_from_item + (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL, + 0) < 0) + reiserfs_error(old_dir->i_sb, "vs-7060", + "couldn't not cut old name. Fsck later?"); + + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; + + reiserfs_update_sd(&th, old_dir); + reiserfs_update_sd(&th, new_dir); + reiserfs_update_sd(&th, old_inode); + + if (new_dentry_inode) { + if (savelink == 0) + add_save_link(&th, new_dentry_inode, + 0 /* not truncate */ ); + reiserfs_update_sd(&th, new_dentry_inode); + } + + retval = journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return retval; +} + +/* directories can handle most operations... */ +const struct inode_operations reiserfs_dir_inode_operations = { + .create = reiserfs_create, + .lookup = reiserfs_lookup, + .link = reiserfs_link, + .unlink = reiserfs_unlink, + .symlink = reiserfs_symlink, + .mkdir = reiserfs_mkdir, + .rmdir = reiserfs_rmdir, + .mknod = reiserfs_mknod, + .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, +}; + +/* + * symlink operations.. same as page_symlink_inode_operations, with xattr + * stuff added + */ +const struct inode_operations reiserfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, +}; + +/* + * special file operations.. just xattr/acl stuff + */ +const struct inode_operations reiserfs_special_inode_operations = { + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, +}; diff --git a/kernel/fs/reiserfs/objectid.c b/kernel/fs/reiserfs/objectid.c new file mode 100644 index 000000000..99a5d5dae --- /dev/null +++ b/kernel/fs/reiserfs/objectid.c @@ -0,0 +1,217 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" + +/* find where objectid map starts */ +#define objectid_map(s,rs) (old_format_only (s) ? \ + (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ + (__le32 *)((rs) + 1)) + +#ifdef CONFIG_REISERFS_CHECK + +static void check_objectid_map(struct super_block *s, __le32 * map) +{ + if (le32_to_cpu(map[0]) != 1) + reiserfs_panic(s, "vs-15010", "map corrupted: %lx", + (long unsigned int)le32_to_cpu(map[0])); + + /* FIXME: add something else here */ +} + +#else +static void check_objectid_map(struct super_block *s, __le32 * map) +{; +} +#endif + +/* + * When we allocate objectids we allocate the first unused objectid. + * Each sequence of objectids in use (the odd sequences) is followed + * by a sequence of objectids not in use (the even sequences). We + * only need to record the last objectid in each of these sequences + * (both the odd and even sequences) in order to fully define the + * boundaries of the sequences. A consequence of allocating the first + * objectid not in use is that under most conditions this scheme is + * extremely compact. The exception is immediately after a sequence + * of operations which deletes a large number of objects of + * non-sequential objectids, and even then it will become compact + * again as soon as more objects are created. Note that many + * interesting optimizations of layout could result from complicating + * objectid assignment, but we have deferred making them for now. + */ + +/* get unique object identifier */ +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + __u32 unused_objectid; + + BUG_ON(!th->t_trans_id); + + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + /* comment needed -Hans */ + unused_objectid = le32_to_cpu(map[1]); + if (unused_objectid == U32_MAX) { + reiserfs_warning(s, "reiserfs-15100", "no more object ids"); + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); + return 0; + } + + /* + * This incrementation allocates the first unused objectid. That + * is to say, the first entry on the objectid map is the first + * unused objectid, and by incrementing it we use it. See below + * where we check to see if we eliminated a sequence of unused + * objectids.... + */ + map[1] = cpu_to_le32(unused_objectid + 1); + + /* + * Now we check to see if we eliminated the last remaining member of + * the first even sequence (and can eliminate the sequence by + * eliminating its last objectid from oids), and can collapse the + * first two odd sequences into one sequence. If so, then the net + * result is to eliminate a pair of objectids from oids. We do this + * by shifting the entire map to the left. + */ + if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { + memmove(map + 1, map + 3, + (sb_oid_cursize(rs) - 3) * sizeof(__u32)); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + } + + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); + return unused_objectid; +} + +/* makes object identifier unused */ +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, + __u32 objectid_to_release) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + int i = 0; + + BUG_ON(!th->t_trans_id); + /*return; */ + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); + + /* + * start at the beginning of the objectid map (i = 0) and go to + * the end of it (i = disk_sb->s_oid_cursize). Linear search is + * what we use, though it is possible that binary search would be + * more efficient after performing lots of deletions (which is + * when oids is large.) We only check even i's. + */ + while (i < sb_oid_cursize(rs)) { + if (objectid_to_release == le32_to_cpu(map[i])) { + /* This incrementation unallocates the objectid. */ + le32_add_cpu(&map[i], 1); + + /* + * Did we unallocate the last member of an + * odd sequence, and can shrink oids? + */ + if (map[i] == map[i + 1]) { + /* shrink objectid map */ + memmove(map + i, map + i + 2, + (sb_oid_cursize(rs) - i - + 2) * sizeof(__u32)); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + + RFALSE(sb_oid_cursize(rs) < 2 || + sb_oid_cursize(rs) > sb_oid_maxsize(rs), + "vs-15005: objectid map corrupted cur_size == %d (max == %d)", + sb_oid_cursize(rs), sb_oid_maxsize(rs)); + } + return; + } + + if (objectid_to_release > le32_to_cpu(map[i]) && + objectid_to_release < le32_to_cpu(map[i + 1])) { + /* size of objectid map is not changed */ + if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { + le32_add_cpu(&map[i + 1], -1); + return; + } + + /* + * JDM comparing two little-endian values for + * equality -- safe + */ + /* + * objectid map must be expanded, but + * there is no space + */ + if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { + PROC_INFO_INC(s, leaked_oid); + return; + } + + /* expand the objectid map */ + memmove(map + i + 3, map + i + 1, + (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); + map[i + 1] = cpu_to_le32(objectid_to_release); + map[i + 2] = cpu_to_le32(objectid_to_release + 1); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2); + return; + } + i += 2; + } + + reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)", + (long unsigned)objectid_to_release); +} + +int reiserfs_convert_objectid_map_v1(struct super_block *s) +{ + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s); + int cur_size = sb_oid_cursize(disk_sb); + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; + int old_max = sb_oid_maxsize(disk_sb); + struct reiserfs_super_block_v1 *disk_sb_v1; + __le32 *objectid_map, *new_objectid_map; + int i; + + disk_sb_v1 = + (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); + objectid_map = (__le32 *) (disk_sb_v1 + 1); + new_objectid_map = (__le32 *) (disk_sb + 1); + + if (cur_size > new_size) { + /* + * mark everyone used that was listed as free at + * the end of the objectid map + */ + objectid_map[new_size - 1] = objectid_map[cur_size - 1]; + set_sb_oid_cursize(disk_sb, new_size); + } + /* move the smaller objectid map past the end of the new super */ + for (i = new_size - 1; i >= 0; i--) { + objectid_map[i + (old_max - new_size)] = objectid_map[i]; + } + + /* set the max size so we don't overflow later */ + set_sb_oid_maxsize(disk_sb, new_size); + + /* Zero out label and generate random UUID */ + memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)); + generate_random_uuid(disk_sb->s_uuid); + + /* finally, zero out the unused chunk of the new super */ + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)); + return 0; +} diff --git a/kernel/fs/reiserfs/prints.c b/kernel/fs/reiserfs/prints.c new file mode 100644 index 000000000..ae1dc841d --- /dev/null +++ b/kernel/fs/reiserfs/prints.c @@ -0,0 +1,777 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include "reiserfs.h" +#include +#include + +#include + +static char error_buf[1024]; +static char fmt_buf[1024]; +static char off_buf[80]; + +static char *reiserfs_cpu_offset(struct cpu_key *key) +{ + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + sprintf(off_buf, "%llu(%llu)", + (unsigned long long) + GET_HASH_VALUE(cpu_key_k_offset(key)), + (unsigned long long) + GET_GENERATION_NUMBER(cpu_key_k_offset(key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)cpu_key_k_offset(key)); + return off_buf; +} + +static char *le_offset(struct reiserfs_key *key) +{ + int version; + + version = le_key_version(key); + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + sprintf(off_buf, "%llu(%llu)", + (unsigned long long) + GET_HASH_VALUE(le_key_k_offset(version, key)), + (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset(version, key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)le_key_k_offset(version, key)); + return off_buf; +} + +static char *cpu_type(struct cpu_key *key) +{ + if (cpu_key_k_type(key) == TYPE_STAT_DATA) + return "SD"; + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + return "DIR"; + if (cpu_key_k_type(key) == TYPE_DIRECT) + return "DIRECT"; + if (cpu_key_k_type(key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + +static char *le_type(struct reiserfs_key *key) +{ + int version; + + version = le_key_version(key); + + if (le_key_k_type(version, key) == TYPE_STAT_DATA) + return "SD"; + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + return "DIR"; + if (le_key_k_type(version, key) == TYPE_DIRECT) + return "DIRECT"; + if (le_key_k_type(version, key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + +/* %k */ +static void sprintf_le_key(char *buf, struct reiserfs_key *key) +{ + if (key) + sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id), + le32_to_cpu(key->k_objectid), le_offset(key), + le_type(key)); + else + sprintf(buf, "[NULL]"); +} + +/* %K */ +static void sprintf_cpu_key(char *buf, struct cpu_key *key) +{ + if (key) + sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, reiserfs_cpu_offset(key), + cpu_type(key)); + else + sprintf(buf, "[NULL]"); +} + +static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh) +{ + if (deh) + sprintf(buf, + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", + deh_offset(deh), deh_dir_id(deh), deh_objectid(deh), + deh_location(deh), deh_state(deh)); + else + sprintf(buf, "[NULL]"); + +} + +static void sprintf_item_head(char *buf, struct item_head *ih) +{ + if (ih) { + strcpy(buf, + (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); + sprintf_le_key(buf + strlen(buf), &(ih->ih_key)); + sprintf(buf + strlen(buf), ", item_len %d, item_location %d, " + "free_space(entry_count) %d", + ih_item_len(ih), ih_location(ih), ih_free_space(ih)); + } else + sprintf(buf, "[NULL]"); +} + +static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de) +{ + char name[20]; + + memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); + name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0; + sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); +} + +static void sprintf_block_head(char *buf, struct buffer_head *bh) +{ + sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); +} + +static void sprintf_buffer_head(char *buf, struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + sprintf(buf, + "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bdevname(bh->b_bdev, b), bh->b_size, + (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), + bh->b_state, bh->b_page, + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty(bh) ? "DIRTY" : "CLEAN", + buffer_locked(bh) ? "LOCKED" : "UNLOCKED"); +} + +static void sprintf_disk_child(char *buf, struct disk_child *dc) +{ + sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), + dc_size(dc)); +} + +static char *is_there_reiserfs_struct(char *fmt, int *what) +{ + char *k = fmt; + + while ((k = strchr(k, '%')) != NULL) { + if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || + k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') { + *what = k[1]; + break; + } + k++; + } + return k; +} + +/* + * debugging reiserfs we used to print out a lot of different + * variables, like keys, item headers, buffer heads etc. Values of + * most fields matter. So it took a long time just to write + * appropriative printk. With this reiserfs_warning you can use format + * specification for complex structures like you used to do with + * printfs for integers, doubles and pointers. For instance, to print + * out key structure you have to write just: + * reiserfs_warning ("bad key %k", key); + * instead of + * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + * key->k_offset, key->k_uniqueness); + */ +static DEFINE_SPINLOCK(error_lock); +static void prepare_error_buf(const char *fmt, va_list args) +{ + char *fmt1 = fmt_buf; + char *k; + char *p = error_buf; + int what; + + spin_lock(&error_lock); + + strcpy(fmt1, fmt); + + while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) { + *k = 0; + + p += vsprintf(p, fmt1, args); + + switch (what) { + case 'k': + sprintf_le_key(p, va_arg(args, struct reiserfs_key *)); + break; + case 'K': + sprintf_cpu_key(p, va_arg(args, struct cpu_key *)); + break; + case 'h': + sprintf_item_head(p, va_arg(args, struct item_head *)); + break; + case 't': + sprintf_direntry(p, + va_arg(args, + struct reiserfs_dir_entry *)); + break; + case 'y': + sprintf_disk_child(p, + va_arg(args, struct disk_child *)); + break; + case 'z': + sprintf_block_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'b': + sprintf_buffer_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'a': + sprintf_de_head(p, + va_arg(args, + struct reiserfs_de_head *)); + break; + } + + p += strlen(p); + fmt1 = k + 2; + } + vsprintf(p, fmt1, args); + spin_unlock(&error_lock); + +} + +/* + * in addition to usual conversion specifiers this accepts reiserfs + * specific conversion specifiers: + * %k to print little endian key, + * %K to print cpu key, + * %h to print item_head, + * %t to print directory entry + * %z to print block head (arg must be struct buffer_head * + * %b to print buffer_head + */ + +#define do_reiserfs_warning(fmt)\ +{\ + va_list args;\ + va_start( args, fmt );\ + prepare_error_buf( fmt, args );\ + va_end( args );\ +} + +void __reiserfs_warning(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: " + "%s\n", sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); +} + +/* No newline.. reiserfs_info calls can be followed by printk's */ +void reiserfs_info(struct super_block *sb, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_NOTICE "REISERFS (device %s): %s", + sb->s_id, error_buf); + else + printk(KERN_NOTICE "REISERFS %s:", error_buf); +} + +/* No newline.. reiserfs_printk calls can be followed by printk's */ +static void reiserfs_printk(const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + printk(error_buf); +} + +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) +{ +#ifdef CONFIG_REISERFS_CHECK + do_reiserfs_warning(fmt); + if (s) + printk(KERN_DEBUG "REISERFS debug (device %s): %s\n", + s->s_id, error_buf); + else + printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf); +#endif +} + +/* + * The format: + * + * maintainer-errorid: [function-name:] message + * + * where errorid is unique to the maintainer and function-name is + * optional, is recommended, so that anyone can easily find the bug + * with a simple grep for the short to type string + * maintainer-errorid. Don't bother with reusing errorids, there are + * lots of numbers out there. + * + * Example: + * + * reiserfs_panic( + * p_sb, "reiser-29: reiserfs_new_blocknrs: " + * "one of search_start or rn(%d) is equal to MAX_B_NUM," + * "which means that we are optimizing location based on the " + * "bogus location of a temp buffer (%p).", + * rn, bh + * ); + * + * Regular panic()s sometimes clear the screen before the message can + * be read, thus the need for the while loop. + * + * Numbering scheme for panic used by Vladimir and Anatoly( Hans completely + * ignores this scheme, and considers it pointless complexity): + * + * panics in reiserfs_fs.h have numbers from 1000 to 1999 + * super.c 2000 to 2999 + * preserve.c (unused) 3000 to 3999 + * bitmap.c 4000 to 4999 + * stree.c 5000 to 5999 + * prints.c 6000 to 6999 + * namei.c 7000 to 7999 + * fix_nodes.c 8000 to 8999 + * dir.c 9000 to 9999 + * lbalance.c 10000 to 10999 + * ibalance.c 11000 to 11999 not ready + * do_balan.c 12000 to 12999 + * inode.c 13000 to 13999 + * file.c 14000 to 14999 + * objectid.c 15000 - 15999 + * buffer.c 16000 - 16999 + * symlink.c 17000 - 17999 + * + * . */ + +void __reiserfs_panic(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + +#ifdef CONFIG_REISERFS_CHECK + dump_stack(); +#endif + if (sb) + printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", + sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); + BUG(); +} + +void __reiserfs_error(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + + BUG_ON(sb == NULL); + + if (reiserfs_error_panic(sb)) + __reiserfs_panic(sb, id, function, error_buf); + + if (id && id[0]) + printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n", + sb->s_id, id, function, error_buf); + else + printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + if (sb->s_flags & MS_RDONLY) + return; + + reiserfs_info(sb, "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, -EIO); +} + +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + + if (reiserfs_error_panic(sb)) { + panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id, + error_buf); + } + + if (reiserfs_is_journal_aborted(SB_JOURNAL(sb))) + return; + + printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, + error_buf); + + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, errno); +} + +/* + * this prints internal nodes (4 keys/items in line) (dc_number, + * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, + * dc_size)... + */ +static int print_internal(struct buffer_head *bh, int first, int last) +{ + struct reiserfs_key *key; + struct disk_child *dc; + int i; + int from, to; + + if (!B_IS_KEYS_LEVEL(bh)) + return 1; + + check_internal(bh); + + if (first == -1) { + from = 0; + to = B_NR_ITEMS(bh); + } else { + from = first; + to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh); + } + + reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + dc = B_N_CHILD(bh, from); + reiserfs_printk("PTR %d: %y ", from, dc); + + for (i = from, key = internal_key(bh, from), dc++; i < to; + i++, key++, dc++) { + reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); + if (i && i % 4 == 0) + printk("\n"); + } + printk("\n"); + return 0; +} + +static int print_leaf(struct buffer_head *bh, int print_mode, int first, + int last) +{ + struct block_head *blkh; + struct item_head *ih; + int i, nr; + int from, to; + + if (!B_IS_ITEMS_LEVEL(bh)) + return 1; + + check_leaf(bh); + + blkh = B_BLK_HEAD(bh); + ih = item_head(bh, 0); + nr = blkh_nr_item(blkh); + + printk + ("\n===================================================================\n"); + reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + if (!(print_mode & PRINT_LEAF_ITEMS)) { + reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", + &(ih->ih_key), &((ih + nr - 1)->ih_key)); + return 0; + } + + if (first < 0 || first > nr - 1) + from = 0; + else + from = first; + + if (last < 0 || last > nr) + to = nr; + else + to = last; + + ih += from; + printk + ("-------------------------------------------------------------------------------\n"); + printk + ("|##| type | key | ilen | free_space | version | loc |\n"); + for (i = from; i < to; i++, ih++) { + printk + ("-------------------------------------------------------------------------------\n"); + reiserfs_printk("|%2d| %h |\n", i, ih); + if (print_mode & PRINT_LEAF_ITEMS) + op_print_item(ih, ih_item_body(bh, ih)); + } + + printk + ("===================================================================\n"); + + return 0; +} + +char *reiserfs_hashname(int code) +{ + if (code == YURA_HASH) + return "rupasov"; + if (code == TEA_HASH) + return "tea"; + if (code == R5_HASH) + return "r5"; + + return "unknown"; +} + +/* return 1 if this is not super block */ +static int print_super_block(struct buffer_head *bh) +{ + struct reiserfs_super_block *rs = + (struct reiserfs_super_block *)(bh->b_data); + int skipped, data_blocks; + char *version; + char b[BDEVNAME_SIZE]; + + if (is_reiserfs_3_5(rs)) { + version = "3.5"; + } else if (is_reiserfs_3_6(rs)) { + version = "3.6"; + } else if (is_reiserfs_jr(rs)) { + version = ((sb_version(rs) == REISERFS_VERSION_2) ? + "3.6" : "3.5"); + } else { + return 1; + } + + printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); + printk("Reiserfs version %s\n", version); + printk("Block count %u\n", sb_block_count(rs)); + printk("Blocksize %d\n", sb_blocksize(rs)); + printk("Free blocks %u\n", sb_free_blocks(rs)); + /* + * FIXME: this would be confusing if + * someone stores reiserfs super block in some data block ;) +// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); + */ + skipped = bh->b_blocknr; + data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - + (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + + 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs); + printk + ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" + "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs), + (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) : + sb_reserved_for_journal(rs)), data_blocks); + printk("Root block %u\n", sb_root_block(rs)); + printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); + printk("Journal dev %d\n", sb_jp_journal_dev(rs)); + printk("Journal orig size %d\n", sb_jp_journal_size(rs)); + printk("FS state %d\n", sb_fs_state(rs)); + printk("Hash function \"%s\"\n", + reiserfs_hashname(sb_hash_function_code(rs))); + + printk("Tree height %d\n", sb_tree_height(rs)); + return 0; +} + +static int print_desc_block(struct buffer_head *bh) +{ + struct reiserfs_journal_desc *desc; + + if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8)) + return 1; + + desc = (struct reiserfs_journal_desc *)(bh->b_data); + printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", + (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc), + get_desc_mount_id(desc), get_desc_trans_len(desc)); + + return 0; +} +/* ..., int print_mode, int first, int last) */ +void print_block(struct buffer_head *bh, ...) +{ + va_list args; + int mode, first, last; + + if (!bh) { + printk("print_block: buffer is NULL\n"); + return; + } + + va_start(args, bh); + + mode = va_arg(args, int); + first = va_arg(args, int); + last = va_arg(args, int); + if (print_leaf(bh, mode, first, last)) + if (print_internal(bh, first, last)) + if (print_super_block(bh)) + if (print_desc_block(bh)) + printk + ("Block %llu contains unformatted data\n", + (unsigned long long)bh->b_blocknr); + + va_end(args); +} + +static char print_tb_buf[2048]; + +/* this stores initial state of tree balance in the print_tb_buf */ +void store_print_tb(struct tree_balance *tb) +{ + int h = 0; + int i; + struct buffer_head *tbSh, *tbFh; + + if (!tb) + return; + + sprintf(print_tb_buf, "\n" + "BALANCING %d\n" + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" + "=====================================================================\n" + "* h * S * L * R * F * FL * FR * CFL * CFR *\n", + REISERFS_SB(tb->tb_sb)->s_do_balance, + tb->tb_mode, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item); + + for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) { + if (PATH_H_PATH_OFFSET(tb->tb_path, h) <= + tb->tb_path->path_length + && PATH_H_PATH_OFFSET(tb->tb_path, + h) > ILLEGAL_PATH_ELEMENT_OFFSET) { + tbSh = PATH_H_PBUFFER(tb->tb_path, h); + tbFh = PATH_H_PPARENT(tb->tb_path, h); + } else { + tbSh = NULL; + tbFh = NULL; + } + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", + h, + (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), + (tbSh) ? atomic_read(&tbSh->b_count) : -1, + (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL), + (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1, + (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), + (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1, + (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), + (tb->FL[h]) ? (long long)(tb->FL[h]-> + b_blocknr) : (-1LL), + (tb->FR[h]) ? (long long)(tb->FR[h]-> + b_blocknr) : (-1LL), + (tb->CFL[h]) ? (long long)(tb->CFL[h]-> + b_blocknr) : (-1LL), + (tb->CFR[h]) ? (long long)(tb->CFR[h]-> + b_blocknr) : (-1LL)); + } + + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], + tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0], + tb->sbytes[0], tb->snum[1], tb->sbytes[1], + tb->cur_blknum, tb->lkey[0], tb->rkey[0]); + + /* this prints balance parameters for non-leaf levels */ + h = 0; + do { + h++; + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %4d * %2d * * %2d * * %2d *\n", + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], + tb->blknum[h]); + } while (tb->insert_size[h]); + + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "FEB list: "); + + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ + h = 0; + for (i = 0; i < ARRAY_SIZE(tb->FEB); i++) + sprintf(print_tb_buf + strlen(print_tb_buf), + "%p (%llu %d)%s", tb->FEB[i], + tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> + b_blocknr : 0ULL, + tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0, + (i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", "); + + sprintf(print_tb_buf + strlen(print_tb_buf), + "======================== the end ====================================\n"); +} + +void print_cur_tb(char *mes) +{ + printk("%s\n%s", mes, print_tb_buf); +} + +static void check_leaf_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + int nr; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, "vs-6010", "invalid item number %z", + bh); + if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) + reiserfs_panic(NULL, "vs-6020", "invalid free space %z", + bh); + +} + +static void check_internal_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + + blkh = B_BLK_HEAD(bh); + if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) + reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); + + if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh); + + if (B_FREE_SPACE(bh) != + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - + DC_SIZE * (B_NR_ITEMS(bh) + 1)) + reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh); + +} + +void check_leaf(struct buffer_head *bh) +{ + int i; + struct item_head *ih; + + if (!bh) + return; + check_leaf_block_head(bh); + for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) + op_check_item(ih, ih_item_body(bh, ih)); +} + +void check_internal(struct buffer_head *bh) +{ + if (!bh) + return; + check_internal_block_head(bh); +} + +void print_statistics(struct super_block *s) +{ + + /* + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ + bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", + REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, + REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search, + REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); + */ + +} diff --git a/kernel/fs/reiserfs/procfs.c b/kernel/fs/reiserfs/procfs.c new file mode 100644 index 000000000..621b9f381 --- /dev/null +++ b/kernel/fs/reiserfs/procfs.c @@ -0,0 +1,508 @@ +/* -*- linux-c -*- */ + +/* fs/reiserfs/procfs.c */ + +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* proc info support a la one created by Sizif@Botik.RU for PGC */ + +#include +#include +#include +#include +#include "reiserfs.h" +#include +#include + +/* + * LOCKING: + * + * These guys are evicted from procfs as the very first step in ->kill_sb(). + * + */ + +static int show_version(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + char *format; + + if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { + format = "3.6"; + } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) { + format = "3.5"; + } else { + format = "unknown"; + } + + seq_printf(m, "%s format\twith checks %s\n", format, +#if defined( CONFIG_REISERFS_CHECK ) + "on" +#else + "off" +#endif + ); + return 0; +} + +#define SF( x ) ( r -> x ) +#define SFP( x ) SF( s_proc_info_data.x ) +#define SFPL( x ) SFP( x[ level ] ) +#define SFPF( x ) SFP( scan_bitmap.x ) +#define SFPJ( x ) SFP( journal.x ) + +#define D2C( x ) le16_to_cpu( x ) +#define D4C( x ) le32_to_cpu( x ) +#define DF( x ) D2C( rs -> s_v1.x ) +#define DFL( x ) D4C( rs -> s_v1.x ) + +#define objectid_map( s, rs ) (old_format_only (s) ? \ + (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \ + (__le32 *)(rs + 1)) +#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) + +#define DJF( x ) le32_to_cpu( rs -> x ) +#define DJV( x ) le32_to_cpu( s_v1 -> x ) +#define DJP( x ) le32_to_cpu( jp -> x ) +#define JF( x ) ( r -> s_journal -> x ) + +static int show_super(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + struct reiserfs_sb_info *r = REISERFS_SB(sb); + + seq_printf(m, "state: \t%s\n" + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" + "gen. counter: \t%i\n" + "s_disk_reads: \t%i\n" + "s_disk_writes: \t%i\n" + "s_fix_nodes: \t%i\n" + "s_do_balance: \t%i\n" + "s_unneeded_left_neighbor: \t%i\n" + "s_good_search_by_key_reada: \t%i\n" + "s_bmaps: \t%i\n" + "s_bmaps_without_search: \t%i\n" + "s_direct2indirect: \t%i\n" + "s_indirect2direct: \t%i\n" + "\n" + "max_hash_collisions: \t%i\n" + "breads: \t%lu\n" + "bread_misses: \t%lu\n" + "search_by_key: \t%lu\n" + "search_by_key_fs_changed: \t%lu\n" + "search_by_key_restarted: \t%lu\n" + "insert_item_restarted: \t%lu\n" + "paste_into_item_restarted: \t%lu\n" + "cut_from_item_restarted: \t%lu\n" + "delete_solid_item_restarted: \t%lu\n" + "delete_item_restarted: \t%lu\n" + "leaked_oid: \t%lu\n" + "leaves_removable: \t%lu\n", + SF(s_mount_state) == REISERFS_VALID_FS ? + "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", + reiserfs_r5_hash(sb) ? "FORCE_R5 " : "", + reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "", + reiserfs_tea_hash(sb) ? "FORCE_TEA " : "", + reiserfs_hash_detect(sb) ? "DETECT_HASH " : "", + reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ", + reiserfs_no_unhashed_relocation(sb) ? + "NO_UNHASHED_RELOCATION " : "", + reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "", + reiserfs_test4(sb) ? "TEST4 " : "", + have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ? + "SMALL_TAILS " : "NO_TAILS ", + replay_only(sb) ? "REPLAY_ONLY " : "", + convert_reiserfs(sb) ? "CONV " : "", + atomic_read(&r->s_generation_counter), + SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), + SF(s_do_balance), SF(s_unneeded_left_neighbor), + SF(s_good_search_by_key_reada), SF(s_bmaps), + SF(s_bmaps_without_search), SF(s_direct2indirect), + SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads), + SFP(bread_miss), SFP(search_by_key), + SFP(search_by_key_fs_changed), SFP(search_by_key_restarted), + SFP(insert_item_restarted), SFP(paste_into_item_restarted), + SFP(cut_from_item_restarted), + SFP(delete_solid_item_restarted), SFP(delete_item_restarted), + SFP(leaked_oid), SFP(leaves_removable)); + + return 0; +} + +static int show_per_level(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + struct reiserfs_sb_info *r = REISERFS_SB(sb); + int level; + + seq_printf(m, "level\t" + " balances" + " [sbk: reads" + " fs_changed" + " restarted]" + " free space" + " items" + " can_remove" + " lnum" + " rnum" + " lbytes" + " rbytes" + " get_neig" + " get_neig_res" " need_l_neig" " need_r_neig" "\n"); + + for (level = 0; level < MAX_HEIGHT; ++level) { + seq_printf(m, "%i\t" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12li" + " %12li" + " %12li" + " %12li" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + "\n", + level, + SFPL(balance_at), + SFPL(sbk_read_at), + SFPL(sbk_fs_changed), + SFPL(sbk_restarted), + SFPL(free_at), + SFPL(items_at), + SFPL(can_node_be_removed), + SFPL(lnum), + SFPL(rnum), + SFPL(lbytes), + SFPL(rbytes), + SFPL(get_neighbors), + SFPL(get_neighbors_restart), + SFPL(need_l_neighbor), SFPL(need_r_neighbor) + ); + } + return 0; +} + +static int show_bitmap(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + struct reiserfs_sb_info *r = REISERFS_SB(sb); + + seq_printf(m, "free_block: %lu\n" + " scan_bitmap:" + " wait" + " bmap" + " retry" + " stolen" + " journal_hint" + "journal_nohint" + "\n" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + "\n", + SFP(free_block), + SFPF(call), + SFPF(wait), + SFPF(bmap), + SFPF(retry), + SFPF(stolen), + SFPF(in_journal_hint), SFPF(in_journal_nohint)); + + return 0; +} + +static int show_on_disk_super(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); + struct reiserfs_super_block *rs = sb_info->s_rs; + int hash_code = DFL(s_hash_function_code); + __u32 flags = DJF(s_flags); + + seq_printf(m, "block_count: \t%i\n" + "free_blocks: \t%i\n" + "root_block: \t%i\n" + "blocksize: \t%i\n" + "oid_maxsize: \t%i\n" + "oid_cursize: \t%i\n" + "umount_state: \t%i\n" + "magic: \t%10.10s\n" + "fs_state: \t%i\n" + "hash: \t%s\n" + "tree_height: \t%i\n" + "bmap_nr: \t%i\n" + "version: \t%i\n" + "flags: \t%x[%s]\n" + "reserved_for_journal: \t%i\n", + DFL(s_block_count), + DFL(s_free_blocks), + DFL(s_root_block), + DF(s_blocksize), + DF(s_oid_maxsize), + DF(s_oid_cursize), + DF(s_umount_state), + rs->s_v1.s_magic, + DF(s_fs_state), + hash_code == TEA_HASH ? "tea" : + (hash_code == YURA_HASH) ? "rupasov" : + (hash_code == R5_HASH) ? "r5" : + (hash_code == UNSET_HASH) ? "unset" : "unknown", + DF(s_tree_height), + DF(s_bmap_nr), + DF(s_version), flags, (flags & reiserfs_attrs_cleared) + ? "attrs_cleared" : "", DF(s_reserved_for_journal)); + + return 0; +} + +static int show_oidmap(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); + struct reiserfs_super_block *rs = sb_info->s_rs; + unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); + unsigned long total_used = 0; + int i; + + for (i = 0; i < mapsize; ++i) { + __u32 right; + + right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1); + seq_printf(m, "%s: [ %x .. %x )\n", + (i & 1) ? "free" : "used", MAP(i), right); + if (!(i & 1)) { + total_used += right - MAP(i); + } + } +#if defined( REISERFS_USE_OIDMAPF ) + if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { + loff_t size = file_inode(sb_info->oidmap.mapf)->i_size; + total_used += size / sizeof(reiserfs_oidinterval_d_t); + } +#endif + seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", + mapsize, + mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used); + return 0; +} + +static int show_journal(struct seq_file *m, void *unused) +{ + struct super_block *sb = m->private; + struct reiserfs_sb_info *r = REISERFS_SB(sb); + struct reiserfs_super_block *rs = r->s_rs; + struct journal_params *jp = &rs->s_v1.s_journal; + char b[BDEVNAME_SIZE]; + + seq_printf(m, /* on-disk fields */ + "jp_journal_1st_block: \t%i\n" + "jp_journal_dev: \t%s[%x]\n" + "jp_journal_size: \t%i\n" + "jp_journal_trans_max: \t%i\n" + "jp_journal_magic: \t%i\n" + "jp_journal_max_batch: \t%i\n" + "jp_journal_max_commit_age: \t%i\n" + "jp_journal_max_trans_age: \t%i\n" + /* incore fields */ + "j_1st_reserved_block: \t%i\n" + "j_state: \t%li\n" + "j_trans_id: \t%u\n" + "j_mount_id: \t%lu\n" + "j_start: \t%lu\n" + "j_len: \t%lu\n" + "j_len_alloc: \t%lu\n" + "j_wcount: \t%i\n" + "j_bcount: \t%lu\n" + "j_first_unflushed_offset: \t%lu\n" + "j_last_flush_trans_id: \t%u\n" + "j_trans_start_time: \t%li\n" + "j_list_bitmap_index: \t%i\n" + "j_must_wait: \t%i\n" + "j_next_full_flush: \t%i\n" + "j_next_async_flush: \t%i\n" + "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n" + /* reiserfs_proc_info_data_t.journal fields */ + "in_journal: \t%12lu\n" + "in_journal_bitmap: \t%12lu\n" + "in_journal_reusable: \t%12lu\n" + "lock_journal: \t%12lu\n" + "lock_journal_wait: \t%12lu\n" + "journal_begin: \t%12lu\n" + "journal_relock_writers: \t%12lu\n" + "journal_relock_wcount: \t%12lu\n" + "mark_dirty: \t%12lu\n" + "mark_dirty_already: \t%12lu\n" + "mark_dirty_notjournal: \t%12lu\n" + "restore_prepared: \t%12lu\n" + "prepare: \t%12lu\n" + "prepare_retry: \t%12lu\n", + DJP(jp_journal_1st_block), + bdevname(SB_JOURNAL(sb)->j_dev_bd, b), + DJP(jp_journal_dev), + DJP(jp_journal_size), + DJP(jp_journal_trans_max), + DJP(jp_journal_magic), + DJP(jp_journal_max_batch), + SB_JOURNAL(sb)->j_max_commit_age, + DJP(jp_journal_max_trans_age), + JF(j_1st_reserved_block), + JF(j_state), + JF(j_trans_id), + JF(j_mount_id), + JF(j_start), + JF(j_len), + JF(j_len_alloc), + atomic_read(&r->s_journal->j_wcount), + JF(j_bcount), + JF(j_first_unflushed_offset), + JF(j_last_flush_trans_id), + JF(j_trans_start_time), + JF(j_list_bitmap_index), + JF(j_must_wait), + JF(j_next_full_flush), + JF(j_next_async_flush), + JF(j_cnode_used), + JF(j_cnode_free), + SFPJ(in_journal), + SFPJ(in_journal_bitmap), + SFPJ(in_journal_reusable), + SFPJ(lock_journal), + SFPJ(lock_journal_wait), + SFPJ(journal_being), + SFPJ(journal_relock_writers), + SFPJ(journal_relock_wcount), + SFPJ(mark_dirty), + SFPJ(mark_dirty_already), + SFPJ(mark_dirty_notjournal), + SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry) + ); + return 0; +} + +static int r_open(struct inode *inode, struct file *file) +{ + return single_open(file, PDE_DATA(inode), + proc_get_parent_data(inode)); +} + +static const struct file_operations r_file_operations = { + .open = r_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct proc_dir_entry *proc_info_root = NULL; +static const char proc_info_root_name[] = "fs/reiserfs"; + +static void add_file(struct super_block *sb, char *name, + int (*func) (struct seq_file *, void *)) +{ + proc_create_data(name, 0, REISERFS_SB(sb)->procdir, + &r_file_operations, func); +} + +int reiserfs_proc_info_init(struct super_block *sb) +{ + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, sb->s_id, BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + + spin_lock_init(&__PINFO(sb).lock); + REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb); + if (REISERFS_SB(sb)->procdir) { + add_file(sb, "version", show_version); + add_file(sb, "super", show_super); + add_file(sb, "per-level", show_per_level); + add_file(sb, "bitmap", show_bitmap); + add_file(sb, "on-disk-super", show_on_disk_super); + add_file(sb, "oidmap", show_oidmap); + add_file(sb, "journal", show_journal); + return 0; + } + reiserfs_warning(sb, "cannot create /proc/%s/%s", + proc_info_root_name, b); + return 1; +} + +int reiserfs_proc_info_done(struct super_block *sb) +{ + struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; + if (de) { + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, sb->s_id, BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + + remove_proc_subtree(b, proc_info_root); + REISERFS_SB(sb)->procdir = NULL; + } + return 0; +} + +int reiserfs_proc_info_global_init(void) +{ + if (proc_info_root == NULL) { + proc_info_root = proc_mkdir(proc_info_root_name, NULL); + if (!proc_info_root) { + reiserfs_warning(NULL, "cannot create /proc/%s", + proc_info_root_name); + return 1; + } + } + return 0; +} + +int reiserfs_proc_info_global_done(void) +{ + if (proc_info_root != NULL) { + proc_info_root = NULL; + remove_proc_entry(proc_info_root_name, NULL); + } + return 0; +} +/* + * Revision 1.1.8.2 2001/07/15 17:08:42 god + * . use get_super() in procfs.c + * . remove remove_save_link() from reiserfs_do_truncate() + * + * I accept terms and conditions stated in the Legal Agreement + * (available at http://www.namesys.com/legalese.html) + * + * Revision 1.1.8.1 2001/07/11 16:48:50 god + * proc info support + * + * I accept terms and conditions stated in the Legal Agreement + * (available at http://www.namesys.com/legalese.html) + * + */ + +/* + * Make Linus happy. + * Local variables: + * c-indentation-style: "K&R" + * mode-name: "LC" + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/kernel/fs/reiserfs/reiserfs.h b/kernel/fs/reiserfs/reiserfs.h new file mode 100644 index 000000000..2adcde137 --- /dev/null +++ b/kernel/fs/reiserfs/reiserfs.h @@ -0,0 +1,3411 @@ +/* + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for + * licensing and copyright details + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* the 32 bit compat definitions with int argument */ +#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int) +#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION +#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION + +struct reiserfs_journal_list; + +/* bitmasks for i_flags field in reiserfs-specific part of inode */ +typedef enum { + /* + * this says what format of key do all items (but stat data) of + * an object have. If this is set, that format is 3.6 otherwise - 3.5 + */ + i_item_key_version_mask = 0x0001, + + /* + * If this is unset, object has 3.5 stat data, otherwise, + * it has 3.6 stat data with 64bit size, 32bit nlink etc. + */ + i_stat_data_version_mask = 0x0002, + + /* file might need tail packing on close */ + i_pack_on_close_mask = 0x0004, + + /* don't pack tail of file */ + i_nopack_mask = 0x0008, + + /* + * If either of these are set, "safe link" was created for this + * file during truncate or unlink. Safe link is used to avoid + * leakage of disk space on crash with some files open, but unlinked. + */ + i_link_saved_unlink_mask = 0x0010, + i_link_saved_truncate_mask = 0x0020, + + i_has_xattr_dir = 0x0040, + i_data_log = 0x0080, +} reiserfs_inode_flags; + +struct reiserfs_inode_info { + __u32 i_key[4]; /* key is still 4 32 bit integers */ + + /* + * transient inode flags that are never stored on disk. Bitmasks + * for this field are defined above. + */ + __u32 i_flags; + + /* offset of first byte stored in direct item. */ + __u32 i_first_direct_byte; + + /* copy of persistent inode flags read from sd_attrs. */ + __u32 i_attrs; + + /* first unused block of a sequence of unused blocks */ + int i_prealloc_block; + int i_prealloc_count; /* length of that sequence */ + + /* per-transaction list of inodes which have preallocated blocks */ + struct list_head i_prealloc_list; + + /* + * new_packing_locality is created; new blocks for the contents + * of this directory should be displaced + */ + unsigned new_packing_locality:1; + + /* + * we use these for fsync or O_SYNC to decide which transaction + * needs to be committed in order for this inode to be properly + * flushed + */ + unsigned int i_trans_id; + + struct reiserfs_journal_list *i_jl; + atomic_t openers; + struct mutex tailpack; +#ifdef CONFIG_REISERFS_FS_XATTR + struct rw_semaphore i_xattr_sem; +#endif +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + struct inode vfs_inode; +}; + +typedef enum { + reiserfs_attrs_cleared = 0x00000001, +} reiserfs_super_block_flags; + +/* + * struct reiserfs_super_block accessors/mutators since this is a disk + * structure, it will always be in little endian format. + */ +#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count)) +#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v)) +#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks)) +#define set_sb_free_blocks(sbp,v) ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v)) +#define sb_root_block(sbp) (le32_to_cpu((sbp)->s_v1.s_root_block)) +#define set_sb_root_block(sbp,v) ((sbp)->s_v1.s_root_block = cpu_to_le32(v)) + +#define sb_jp_journal_1st_block(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block)) +#define set_sb_jp_journal_1st_block(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v)) +#define sb_jp_journal_dev(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev)) +#define set_sb_jp_journal_dev(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v)) +#define sb_jp_journal_size(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size)) +#define set_sb_jp_journal_size(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v)) +#define sb_jp_journal_trans_max(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max)) +#define set_sb_jp_journal_trans_max(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v)) +#define sb_jp_journal_magic(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic)) +#define set_sb_jp_journal_magic(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v)) +#define sb_jp_journal_max_batch(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch)) +#define set_sb_jp_journal_max_batch(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v)) +#define sb_jp_jourmal_max_commit_age(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age)) +#define set_sb_jp_journal_max_commit_age(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v)) + +#define sb_blocksize(sbp) (le16_to_cpu((sbp)->s_v1.s_blocksize)) +#define set_sb_blocksize(sbp,v) ((sbp)->s_v1.s_blocksize = cpu_to_le16(v)) +#define sb_oid_maxsize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_maxsize)) +#define set_sb_oid_maxsize(sbp,v) ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v)) +#define sb_oid_cursize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_cursize)) +#define set_sb_oid_cursize(sbp,v) ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v)) +#define sb_umount_state(sbp) (le16_to_cpu((sbp)->s_v1.s_umount_state)) +#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v)) +#define sb_fs_state(sbp) (le16_to_cpu((sbp)->s_v1.s_fs_state)) +#define set_sb_fs_state(sbp,v) ((sbp)->s_v1.s_fs_state = cpu_to_le16(v)) +#define sb_hash_function_code(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_hash_function_code)) +#define set_sb_hash_function_code(sbp,v) \ + ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v)) +#define sb_tree_height(sbp) (le16_to_cpu((sbp)->s_v1.s_tree_height)) +#define set_sb_tree_height(sbp,v) ((sbp)->s_v1.s_tree_height = cpu_to_le16(v)) +#define sb_bmap_nr(sbp) (le16_to_cpu((sbp)->s_v1.s_bmap_nr)) +#define set_sb_bmap_nr(sbp,v) ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v)) +#define sb_version(sbp) (le16_to_cpu((sbp)->s_v1.s_version)) +#define set_sb_version(sbp,v) ((sbp)->s_v1.s_version = cpu_to_le16(v)) + +#define sb_mnt_count(sbp) (le16_to_cpu((sbp)->s_mnt_count)) +#define set_sb_mnt_count(sbp, v) ((sbp)->s_mnt_count = cpu_to_le16(v)) + +#define sb_reserved_for_journal(sbp) \ + (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal)) +#define set_sb_reserved_for_journal(sbp,v) \ + ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v)) + +/* LOGGING -- */ + +/* + * These all interelate for performance. + * + * If the journal block count is smaller than n transactions, you lose speed. + * I don't know what n is yet, I'm guessing 8-16. + * + * typical transaction size depends on the application, how often fsync is + * called, and how many metadata blocks you dirty in a 30 second period. + * The more small files (<16k) you use, the larger your transactions will + * be. + * + * If your journal fills faster than dirty buffers get flushed to disk, it + * must flush them before allowing the journal to wrap, which slows things + * down. If you need high speed meta data updates, the journal should be + * big enough to prevent wrapping before dirty meta blocks get to disk. + * + * If the batch max is smaller than the transaction max, you'll waste space + * at the end of the journal because journal_end sets the next transaction + * to start at 0 if the next transaction has any chance of wrapping. + * + * The large the batch max age, the better the speed, and the more meta + * data changes you'll lose after a crash. + */ + +/* don't mess with these for a while */ +/* we have a node size define somewhere in reiserfs_fs.h. -Hans */ +#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */ +#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */ +#define JOURNAL_HASH_SIZE 8192 + +/* number of copies of the bitmaps to have floating. Must be >= 2 */ +#define JOURNAL_NUM_BITMAPS 5 + +/* + * One of these for every block in every transaction + * Each one is in two hash tables. First, a hash of the current transaction, + * and after journal_end, a hash of all the in memory transactions. + * next and prev are used by the current transaction (journal_hash). + * hnext and hprev are used by journal_list_hash. If a block is in more + * than one transaction, the journal_list_hash links it in multiple times. + * This allows flush_journal_list to remove just the cnode belonging to a + * given transaction. + */ +struct reiserfs_journal_cnode { + struct buffer_head *bh; /* real buffer head */ + struct super_block *sb; /* dev of real buffer head */ + + /* block number of real buffer head, == 0 when buffer on disk */ + __u32 blocknr; + + unsigned long state; + + /* journal list this cnode lives in */ + struct reiserfs_journal_list *jlist; + + struct reiserfs_journal_cnode *next; /* next in transaction list */ + struct reiserfs_journal_cnode *prev; /* prev in transaction list */ + struct reiserfs_journal_cnode *hprev; /* prev in hash list */ + struct reiserfs_journal_cnode *hnext; /* next in hash list */ +}; + +struct reiserfs_bitmap_node { + int id; + char *data; + struct list_head list; +}; + +struct reiserfs_list_bitmap { + struct reiserfs_journal_list *journal_list; + struct reiserfs_bitmap_node **bitmaps; +}; + +/* + * one of these for each transaction. The most important part here is the + * j_realblock. this list of cnodes is used to hash all the blocks in all + * the commits, to mark all the real buffer heads dirty once all the commits + * hit the disk, and to make sure every real block in a transaction is on + * disk before allowing the log area to be overwritten + */ +struct reiserfs_journal_list { + unsigned long j_start; + unsigned long j_state; + unsigned long j_len; + atomic_t j_nonzerolen; + atomic_t j_commit_left; + + /* all commits older than this on disk */ + atomic_t j_older_commits_done; + + struct mutex j_commit_mutex; + unsigned int j_trans_id; + time_t j_timestamp; + struct reiserfs_list_bitmap *j_list_bitmap; + struct buffer_head *j_commit_bh; /* commit buffer head */ + struct reiserfs_journal_cnode *j_realblock; + struct reiserfs_journal_cnode *j_freedlist; /* list of buffers that were freed during this trans. free each of these on flush */ + /* time ordered list of all active transactions */ + struct list_head j_list; + + /* + * time ordered list of all transactions we haven't tried + * to flush yet + */ + struct list_head j_working_list; + + /* list of tail conversion targets in need of flush before commit */ + struct list_head j_tail_bh_list; + + /* list of data=ordered buffers in need of flush before commit */ + struct list_head j_bh_list; + int j_refcount; +}; + +struct reiserfs_journal { + struct buffer_head **j_ap_blocks; /* journal blocks on disk */ + /* newest journal block */ + struct reiserfs_journal_cnode *j_last; + + /* oldest journal block. start here for traverse */ + struct reiserfs_journal_cnode *j_first; + + struct block_device *j_dev_bd; + fmode_t j_dev_mode; + + /* first block on s_dev of reserved area journal */ + int j_1st_reserved_block; + + unsigned long j_state; + unsigned int j_trans_id; + unsigned long j_mount_id; + + /* start of current waiting commit (index into j_ap_blocks) */ + unsigned long j_start; + unsigned long j_len; /* length of current waiting commit */ + + /* number of buffers requested by journal_begin() */ + unsigned long j_len_alloc; + + atomic_t j_wcount; /* count of writers for current commit */ + + /* batch count. allows turning X transactions into 1 */ + unsigned long j_bcount; + + /* first unflushed transactions offset */ + unsigned long j_first_unflushed_offset; + + /* last fully flushed journal timestamp */ + unsigned j_last_flush_trans_id; + + struct buffer_head *j_header_bh; + + time_t j_trans_start_time; /* time this transaction started */ + struct mutex j_mutex; + struct mutex j_flush_mutex; + + /* wait for current transaction to finish before starting new one */ + wait_queue_head_t j_join_wait; + + atomic_t j_jlock; /* lock for j_join_wait */ + int j_list_bitmap_index; /* number of next list bitmap to use */ + + /* no more journal begins allowed. MUST sleep on j_join_wait */ + int j_must_wait; + + /* next journal_end will flush all journal list */ + int j_next_full_flush; + + /* next journal_end will flush all async commits */ + int j_next_async_flush; + + int j_cnode_used; /* number of cnodes on the used list */ + int j_cnode_free; /* number of cnodes on the free list */ + + /* max number of blocks in a transaction. */ + unsigned int j_trans_max; + + /* max number of blocks to batch into a trans */ + unsigned int j_max_batch; + + /* in seconds, how old can an async commit be */ + unsigned int j_max_commit_age; + + /* in seconds, how old can a transaction be */ + unsigned int j_max_trans_age; + + /* the default for the max commit age */ + unsigned int j_default_max_commit_age; + + struct reiserfs_journal_cnode *j_cnode_free_list; + + /* orig pointer returned from vmalloc */ + struct reiserfs_journal_cnode *j_cnode_free_orig; + + struct reiserfs_journal_list *j_current_jl; + int j_free_bitmap_nodes; + int j_used_bitmap_nodes; + + int j_num_lists; /* total number of active transactions */ + int j_num_work_lists; /* number that need attention from kreiserfsd */ + + /* debugging to make sure things are flushed in order */ + unsigned int j_last_flush_id; + + /* debugging to make sure things are committed in order */ + unsigned int j_last_commit_id; + + struct list_head j_bitmap_nodes; + struct list_head j_dirty_buffers; + spinlock_t j_dirty_buffers_lock; /* protects j_dirty_buffers */ + + /* list of all active transactions */ + struct list_head j_journal_list; + + /* lists that haven't been touched by writeback attempts */ + struct list_head j_working_list; + + /* hash table for real buffer heads in current trans */ + struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; + + /* hash table for all the real buffer heads in all the transactions */ + struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; + + /* array of bitmaps to record the deleted blocks */ + struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; + + /* list of inodes which have preallocated blocks */ + struct list_head j_prealloc_list; + int j_persistent_trans; + unsigned long j_max_trans_size; + unsigned long j_max_batch_size; + + int j_errno; + + /* when flushing ordered buffers, throttle new ordered writers */ + struct delayed_work j_work; + struct super_block *j_work_sb; + atomic_t j_async_throttle; +}; + +enum journal_state_bits { + J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */ + J_WRITERS_QUEUED, /* set when log is full due to too many writers */ + J_ABORTED, /* set when log is aborted */ +}; + +/* ick. magic string to find desc blocks in the journal */ +#define JOURNAL_DESC_MAGIC "ReIsErLB" + +typedef __u32(*hashf_t) (const signed char *, int); + +struct reiserfs_bitmap_info { + __u32 free_count; +}; + +struct proc_dir_entry; + +#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO ) +typedef unsigned long int stat_cnt_t; +typedef struct reiserfs_proc_info_data { + spinlock_t lock; + int exiting; + int max_hash_collisions; + + stat_cnt_t breads; + stat_cnt_t bread_miss; + stat_cnt_t search_by_key; + stat_cnt_t search_by_key_fs_changed; + stat_cnt_t search_by_key_restarted; + + stat_cnt_t insert_item_restarted; + stat_cnt_t paste_into_item_restarted; + stat_cnt_t cut_from_item_restarted; + stat_cnt_t delete_solid_item_restarted; + stat_cnt_t delete_item_restarted; + + stat_cnt_t leaked_oid; + stat_cnt_t leaves_removable; + + /* + * balances per level. + * Use explicit 5 as MAX_HEIGHT is not visible yet. + */ + stat_cnt_t balance_at[5]; /* XXX */ + /* sbk == search_by_key */ + stat_cnt_t sbk_read_at[5]; /* XXX */ + stat_cnt_t sbk_fs_changed[5]; + stat_cnt_t sbk_restarted[5]; + stat_cnt_t items_at[5]; /* XXX */ + stat_cnt_t free_at[5]; /* XXX */ + stat_cnt_t can_node_be_removed[5]; /* XXX */ + long int lnum[5]; /* XXX */ + long int rnum[5]; /* XXX */ + long int lbytes[5]; /* XXX */ + long int rbytes[5]; /* XXX */ + stat_cnt_t get_neighbors[5]; + stat_cnt_t get_neighbors_restart[5]; + stat_cnt_t need_l_neighbor[5]; + stat_cnt_t need_r_neighbor[5]; + + stat_cnt_t free_block; + struct __scan_bitmap_stats { + stat_cnt_t call; + stat_cnt_t wait; + stat_cnt_t bmap; + stat_cnt_t retry; + stat_cnt_t in_journal_hint; + stat_cnt_t in_journal_nohint; + stat_cnt_t stolen; + } scan_bitmap; + struct __journal_stats { + stat_cnt_t in_journal; + stat_cnt_t in_journal_bitmap; + stat_cnt_t in_journal_reusable; + stat_cnt_t lock_journal; + stat_cnt_t lock_journal_wait; + stat_cnt_t journal_being; + stat_cnt_t journal_relock_writers; + stat_cnt_t journal_relock_wcount; + stat_cnt_t mark_dirty; + stat_cnt_t mark_dirty_already; + stat_cnt_t mark_dirty_notjournal; + stat_cnt_t restore_prepared; + stat_cnt_t prepare; + stat_cnt_t prepare_retry; + } journal; +} reiserfs_proc_info_data_t; +#else +typedef struct reiserfs_proc_info_data { +} reiserfs_proc_info_data_t; +#endif + +/* Number of quota types we support */ +#define REISERFS_MAXQUOTAS 2 + +/* reiserfs union of in-core super block data */ +struct reiserfs_sb_info { + /* Buffer containing the super block */ + struct buffer_head *s_sbh; + + /* Pointer to the on-disk super block in the buffer */ + struct reiserfs_super_block *s_rs; + struct reiserfs_bitmap_info *s_ap_bitmap; + + /* pointer to journal information */ + struct reiserfs_journal *s_journal; + + unsigned short s_mount_state; /* reiserfs state (valid, invalid) */ + + /* Serialize writers access, replace the old bkl */ + struct mutex lock; + + /* Owner of the lock (can be recursive) */ + struct task_struct *lock_owner; + + /* Depth of the lock, start from -1 like the bkl */ + int lock_depth; + + struct workqueue_struct *commit_wq; + + /* Comment? -Hans */ + void (*end_io_handler) (struct buffer_head *, int); + + /* + * pointer to function which is used to sort names in directory. + * Set on mount + */ + hashf_t s_hash_function; + + /* reiserfs's mount options are set here */ + unsigned long s_mount_opt; + + /* This is a structure that describes block allocator options */ + struct { + /* Bitfield for enable/disable kind of options */ + unsigned long bits; + + /* + * size started from which we consider file + * to be a large one (in blocks) + */ + unsigned long large_file_size; + + int border; /* percentage of disk, border takes */ + + /* + * Minimal file size (in blocks) starting + * from which we do preallocations + */ + int preallocmin; + + /* + * Number of blocks we try to prealloc when file + * reaches preallocmin size (in blocks) or prealloc_list + is empty. + */ + int preallocsize; + } s_alloc_options; + + /* Comment? -Hans */ + wait_queue_head_t s_wait; + /* increased by one every time the tree gets re-balanced */ + atomic_t s_generation_counter; + + /* File system properties. Currently holds on-disk FS format */ + unsigned long s_properties; + + /* session statistics */ + int s_disk_reads; + int s_disk_writes; + int s_fix_nodes; + int s_do_balance; + int s_unneeded_left_neighbor; + int s_good_search_by_key_reada; + int s_bmaps; + int s_bmaps_without_search; + int s_direct2indirect; + int s_indirect2direct; + + /* + * set up when it's ok for reiserfs_read_inode2() to read from + * disk inode with nlink==0. Currently this is only used during + * finish_unfinished() processing at mount time + */ + int s_is_unlinked_ok; + + reiserfs_proc_info_data_t s_proc_info_data; + struct proc_dir_entry *procdir; + + /* amount of blocks reserved for further allocations */ + int reserved_blocks; + + + /* this lock on now only used to protect reserved_blocks variable */ + spinlock_t bitmap_lock; + struct dentry *priv_root; /* root of /.reiserfs_priv */ + struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */ + int j_errno; + + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work old_work; /* old transactions flush delayed work */ + spinlock_t old_work_lock; /* protects old_work and work_queued */ + +#ifdef CONFIG_QUOTA + char *s_qf_names[REISERFS_MAXQUOTAS]; + int s_jquota_fmt; +#endif + char *s_jdev; /* Stored jdev for mount option showing */ +#ifdef CONFIG_REISERFS_CHECK + + /* + * Detects whether more than one copy of tb exists per superblock + * as a means of checking whether do_balance is executing + * concurrently against another tree reader/writer on a same + * mount point. + */ + struct tree_balance *cur_tb; +#endif +}; + +/* Definitions of reiserfs on-disk properties: */ +#define REISERFS_3_5 0 +#define REISERFS_3_6 1 +#define REISERFS_OLD_FORMAT 2 + +/* Mount options */ +enum reiserfs_mount_options { + /* large tails will be created in a session */ + REISERFS_LARGETAIL, + /* + * small (for files less than block size) tails will + * be created in a session + */ + REISERFS_SMALLTAIL, + + /* replay journal and return 0. Use by fsck */ + REPLAYONLY, + + /* + * -o conv: causes conversion of old format super block to the + * new format. If not specified - old partition will be dealt + * with in a manner of 3.5.x + */ + REISERFS_CONVERT, + + /* + * -o hash={tea, rupasov, r5, detect} is meant for properly mounting + * reiserfs disks from 3.5.19 or earlier. 99% of the time, this + * option is not required. If the normal autodection code can't + * determine which hash to use (because both hashes had the same + * value for a file) use this option to force a specific hash. + * It won't allow you to override the existing hash on the FS, so + * if you have a tea hash disk, and mount with -o hash=rupasov, + * the mount will fail. + */ + FORCE_TEA_HASH, /* try to force tea hash on mount */ + FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */ + FORCE_R5_HASH, /* try to force rupasov hash on mount */ + FORCE_HASH_DETECT, /* try to detect hash function on mount */ + + REISERFS_DATA_LOG, + REISERFS_DATA_ORDERED, + REISERFS_DATA_WRITEBACK, + + /* + * used for testing experimental features, makes benchmarking new + * features with and without more convenient, should never be used by + * users in any code shipped to users (ideally) + */ + + REISERFS_NO_BORDER, + REISERFS_NO_UNHASHED_RELOCATION, + REISERFS_HASHED_RELOCATION, + REISERFS_ATTRS, + REISERFS_XATTRS_USER, + REISERFS_POSIXACL, + REISERFS_EXPOSE_PRIVROOT, + REISERFS_BARRIER_NONE, + REISERFS_BARRIER_FLUSH, + + /* Actions on error */ + REISERFS_ERROR_PANIC, + REISERFS_ERROR_RO, + REISERFS_ERROR_CONTINUE, + + REISERFS_USRQUOTA, /* User quota option specified */ + REISERFS_GRPQUOTA, /* Group quota option specified */ + + REISERFS_TEST1, + REISERFS_TEST2, + REISERFS_TEST3, + REISERFS_TEST4, + REISERFS_UNSUPPORTED_OPT, +}; + +#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH)) +#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH)) +#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH)) +#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT)) +#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER)) +#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION)) +#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION)) +#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4)) + +#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL)) +#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL)) +#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY)) +#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS)) +#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5)) +#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT)) +#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG)) +#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED)) +#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) +#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) +#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) +#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT)) +#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) +#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE)) +#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH)) + +#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC)) +#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO)) + +void reiserfs_file_buffer(struct buffer_head *bh, int list); +extern struct file_system_type reiserfs_fs_type; +int reiserfs_resize(struct super_block *, unsigned long); + +#define CARRY_ON 0 +#define SCHEDULE_OCCURRED 1 + +#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh) +#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal) +#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block) +#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) +#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap) + +#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) + +#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) +static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal + *journal) +{ + return test_bit(J_ABORTED, &journal->j_state); +} + +/* + * Locking primitives. The write lock is a per superblock + * special mutex that has properties close to the Big Kernel Lock + * which was used in the previous locking scheme. + */ +void reiserfs_write_lock(struct super_block *s); +void reiserfs_write_unlock(struct super_block *s); +int __must_check reiserfs_write_unlock_nested(struct super_block *s); +void reiserfs_write_lock_nested(struct super_block *s, int depth); + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *s); +#else +static inline void reiserfs_lock_check_recursive(struct super_block *s) { } +#endif + +/* + * Several mutexes depend on the write lock. + * However sometimes we want to relax the write lock while we hold + * these mutexes, according to the release/reacquire on schedule() + * properties of the Bkl that were used. + * Reiserfs performances and locking were based on this scheme. + * Now that the write lock is a mutex and not the bkl anymore, doing so + * may result in a deadlock: + * + * A acquire write_lock + * A acquire j_commit_mutex + * A release write_lock and wait for something + * B acquire write_lock + * B can't acquire j_commit_mutex and sleep + * A can't acquire write lock anymore + * deadlock + * + * What we do here is avoiding such deadlock by playing the same game + * than the Bkl: if we can't acquire a mutex that depends on the write lock, + * we release the write lock, wait a bit and then retry. + * + * The mutexes concerned by this hack are: + * - The commit mutex of a journal list + * - The flush mutex + * - The journal lock + * - The inode mutex + */ +static inline void reiserfs_mutex_lock_safe(struct mutex *m, + struct super_block *s) +{ + int depth; + + depth = reiserfs_write_unlock_nested(s); + mutex_lock(m); + reiserfs_write_lock_nested(s, depth); +} + +static inline void +reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, + struct super_block *s) +{ + int depth; + + depth = reiserfs_write_unlock_nested(s); + mutex_lock_nested(m, subclass); + reiserfs_write_lock_nested(s, depth); +} + +static inline void +reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) +{ + int depth; + depth = reiserfs_write_unlock_nested(s); + down_read(sem); + reiserfs_write_lock_nested(s, depth); +} + +/* + * When we schedule, we usually want to also release the write lock, + * according to the previous bkl based locking scheme of reiserfs. + */ +static inline void reiserfs_cond_resched(struct super_block *s) +{ + if (need_resched()) { + int depth; + + depth = reiserfs_write_unlock_nested(s); + schedule(); + reiserfs_write_lock_nested(s, depth); + } +} + +struct fid; + +/* + * in reading the #defines, it may help to understand that they employ + * the following abbreviations: + * + * B = Buffer + * I = Item header + * H = Height within the tree (should be changed to LEV) + * N = Number of the item in the node + * STAT = stat data + * DEH = Directory Entry Header + * EC = Entry Count + * E = Entry number + * UL = Unsigned Long + * BLKH = BLocK Header + * UNFM = UNForMatted node + * DC = Disk Child + * P = Path + * + * These #defines are named by concatenating these abbreviations, + * where first comes the arguments, and last comes the return value, + * of the macro. + */ + +#define USE_INODE_GENERATION_COUNTER + +#define REISERFS_PREALLOCATE +#define DISPLACE_NEW_PACKING_LOCALITIES +#define PREALLOCATION_SIZE 9 + +/* n must be power of 2 */ +#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u)) + +/* + * to be ok for alpha and others we have to align structures to 8 byte + * boundary. + * FIXME: do not change 4 by anything else: there is code which relies on that + */ +#define ROUND_UP(x) _ROUND_UP(x,8LL) + +/* + * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug + * messages. + */ +#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */ + +void __reiserfs_warning(struct super_block *s, const char *id, + const char *func, const char *fmt, ...); +#define reiserfs_warning(s, id, fmt, args...) \ + __reiserfs_warning(s, id, __func__, fmt, ##args) +/* assertions handling */ + +/* always check a condition and panic if it's false. */ +#define __RASSERT(cond, scond, format, args...) \ +do { \ + if (!(cond)) \ + reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \ + __FILE__ ":%i:%s: " format "\n", \ + __LINE__, __func__ , ##args); \ +} while (0) + +#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args) + +#if defined( CONFIG_REISERFS_CHECK ) +#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args) +#else +#define RFALSE( cond, format, args... ) do {;} while( 0 ) +#endif + +#define CONSTF __attribute_const__ +/* + * Disk Data Structures + */ + +/*************************************************************************** + * SUPER BLOCK * + ***************************************************************************/ + +/* + * Structure of super block on disk, a version of which in RAM is often + * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger + * structure containing fields never written to disk. + */ +#define UNSET_HASH 0 /* Detect hash on disk */ +#define TEA_HASH 1 +#define YURA_HASH 2 +#define R5_HASH 3 +#define DEFAULT_HASH R5_HASH + +struct journal_params { + /* where does journal start from on its * device */ + __le32 jp_journal_1st_block; + + /* journal device st_rdev */ + __le32 jp_journal_dev; + + /* size of the journal */ + __le32 jp_journal_size; + + /* max number of blocks in a transaction. */ + __le32 jp_journal_trans_max; + + /* + * random value made on fs creation + * (this was sb_journal_block_count) + */ + __le32 jp_journal_magic; + + /* max number of blocks to batch into a trans */ + __le32 jp_journal_max_batch; + + /* in seconds, how old can an async commit be */ + __le32 jp_journal_max_commit_age; + + /* in seconds, how old can a transaction be */ + __le32 jp_journal_max_trans_age; +}; + +/* this is the super from 3.5.X, where X >= 10 */ +struct reiserfs_super_block_v1 { + __le32 s_block_count; /* blocks count */ + __le32 s_free_blocks; /* free blocks count */ + __le32 s_root_block; /* root block number */ + struct journal_params s_journal; + __le16 s_blocksize; /* block size */ + + /* max size of object id array, see get_objectid() commentary */ + __le16 s_oid_maxsize; + __le16 s_oid_cursize; /* current size of object id array */ + + /* this is set to 1 when filesystem was umounted, to 2 - when not */ + __le16 s_umount_state; + + /* + * reiserfs magic string indicates that file system is reiserfs: + * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" + */ + char s_magic[10]; + + /* + * it is set to used by fsck to mark which + * phase of rebuilding is done + */ + __le16 s_fs_state; + /* + * indicate, what hash function is being use + * to sort names in a directory + */ + __le32 s_hash_function_code; + __le16 s_tree_height; /* height of disk tree */ + + /* + * amount of bitmap blocks needed to address + * each block of file system + */ + __le16 s_bmap_nr; + + /* + * this field is only reliable on filesystem with non-standard journal + */ + __le16 s_version; + + /* + * size in blocks of journal area on main device, we need to + * keep after making fs with non-standard journal + */ + __le16 s_reserved_for_journal; +} __attribute__ ((__packed__)); + +#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1)) + +/* this is the on disk super block */ +struct reiserfs_super_block { + struct reiserfs_super_block_v1 s_v1; + __le32 s_inode_generation; + + /* Right now used only by inode-attributes, if enabled */ + __le32 s_flags; + + unsigned char s_uuid[16]; /* filesystem unique identifier */ + unsigned char s_label[16]; /* filesystem volume label */ + __le16 s_mnt_count; /* Count of mounts since last fsck */ + __le16 s_max_mnt_count; /* Maximum mounts before check */ + __le32 s_lastcheck; /* Timestamp of last fsck */ + __le32 s_check_interval; /* Interval between checks */ + + /* + * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1() + * so any additions must be updated there as well. */ + char s_unused[76]; +} __attribute__ ((__packed__)); + +#define SB_SIZE (sizeof(struct reiserfs_super_block)) + +#define REISERFS_VERSION_1 0 +#define REISERFS_VERSION_2 2 + +/* on-disk super block fields converted to cpu form */ +#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs) +#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1)) +#define SB_BLOCKSIZE(s) \ + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize)) +#define SB_BLOCK_COUNT(s) \ + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count)) +#define SB_FREE_BLOCKS(s) \ + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks)) +#define SB_REISERFS_MAGIC(s) \ + (SB_V1_DISK_SUPER_BLOCK(s)->s_magic) +#define SB_ROOT_BLOCK(s) \ + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block)) +#define SB_TREE_HEIGHT(s) \ + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height)) +#define SB_REISERFS_STATE(s) \ + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state)) +#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version)) +#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr)) + +#define PUT_SB_BLOCK_COUNT(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0) +#define PUT_SB_FREE_BLOCKS(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0) +#define PUT_SB_ROOT_BLOCK(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0) +#define PUT_SB_TREE_HEIGHT(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0) +#define PUT_SB_REISERFS_STATE(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0) +#define PUT_SB_VERSION(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0) +#define PUT_SB_BMAP_NR(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0) + +#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal) +#define SB_ONDISK_JOURNAL_SIZE(s) \ + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size)) +#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \ + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block)) +#define SB_ONDISK_JOURNAL_DEVICE(s) \ + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev)) +#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \ + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal)) + +#define is_block_in_log_or_reserved_area(s, block) \ + block >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \ + && block < SB_JOURNAL_1st_RESERVED_BLOCK(s) + \ + ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \ + SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s))) + +int is_reiserfs_3_5(struct reiserfs_super_block *rs); +int is_reiserfs_3_6(struct reiserfs_super_block *rs); +int is_reiserfs_jr(struct reiserfs_super_block *rs); + +/* + * ReiserFS leaves the first 64k unused, so that partition labels have + * enough space. If someone wants to write a fancy bootloader that + * needs more than 64k, let us know, and this will be increased in size. + * This number must be larger than than the largest block size on any + * platform, or code will break. -Hans + */ +#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024) +#define REISERFS_FIRST_BLOCK unused_define +#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES + +/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */ +#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024) + +/* reiserfs internal error code (used by search_by_key and fix_nodes)) */ +#define CARRY_ON 0 +#define REPEAT_SEARCH -1 +#define IO_ERROR -2 +#define NO_DISK_SPACE -3 +#define NO_BALANCING_NEEDED (-4) +#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5) +#define QUOTA_EXCEEDED -6 + +typedef __u32 b_blocknr_t; +typedef __le32 unp_t; + +struct unfm_nodeinfo { + unp_t unfm_nodenum; + unsigned short unfm_freespace; +}; + +/* there are two formats of keys: 3.5 and 3.6 */ +#define KEY_FORMAT_3_5 0 +#define KEY_FORMAT_3_6 1 + +/* there are two stat datas */ +#define STAT_DATA_V1 0 +#define STAT_DATA_V2 1 + +static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode) +{ + return container_of(inode, struct reiserfs_inode_info, vfs_inode); +} + +static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 + * which overflows on large file systems. + */ +static inline __u32 reiserfs_bmap_count(struct super_block *sb) +{ + return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1; +} + +static inline int bmap_would_wrap(unsigned bmap_nr) +{ + return bmap_nr > ((1LL << 16) - 1); +} + +/* + * this says about version of key of all items (but stat data) the + * object consists of + */ +#define get_inode_item_key_version( inode ) \ + ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5) + +#define set_inode_item_key_version( inode, version ) \ + ({ if((version)==KEY_FORMAT_3_6) \ + REISERFS_I(inode)->i_flags |= i_item_key_version_mask; \ + else \ + REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; }) + +#define get_inode_sd_version(inode) \ + ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1) + +#define set_inode_sd_version(inode, version) \ + ({ if((version)==STAT_DATA_V2) \ + REISERFS_I(inode)->i_flags |= i_stat_data_version_mask; \ + else \ + REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; }) + +/* + * This is an aggressive tail suppression policy, I am hoping it + * improves our benchmarks. The principle behind it is that percentage + * space saving is what matters, not absolute space saving. This is + * non-intuitive, but it helps to understand it if you consider that the + * cost to access 4 blocks is not much more than the cost to access 1 + * block, if you have to do a seek and rotate. A tail risks a + * non-linear disk access that is significant as a percentage of total + * time cost for a 4 block file and saves an amount of space that is + * less significant as a percentage of space, or so goes the hypothesis. + * -Hans + */ +#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \ +(\ + (!(n_tail_size)) || \ + (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \ + ( (n_file_size) >= (n_block_size) * 4 ) || \ + ( ( (n_file_size) >= (n_block_size) * 3 ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \ + ( ( (n_file_size) >= (n_block_size) * 2 ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \ + ( ( (n_file_size) >= (n_block_size) ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \ +) + +/* + * Another strategy for tails, this one means only create a tail if all the + * file would fit into one DIRECT item. + * Primary intention for this one is to increase performance by decreasing + * seeking. +*/ +#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \ +(\ + (!(n_tail_size)) || \ + (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \ +) + +/* + * values for s_umount_state field + */ +#define REISERFS_VALID_FS 1 +#define REISERFS_ERROR_FS 2 + +/* + * there are 5 item types currently + */ +#define TYPE_STAT_DATA 0 +#define TYPE_INDIRECT 1 +#define TYPE_DIRECT 2 +#define TYPE_DIRENTRY 3 +#define TYPE_MAXTYPE 3 +#define TYPE_ANY 15 /* FIXME: comment is required */ + +/*************************************************************************** + * KEY & ITEM HEAD * + ***************************************************************************/ + +/* * directories use this key as well as old files */ +struct offset_v1 { + __le32 k_offset; + __le32 k_uniqueness; +} __attribute__ ((__packed__)); + +struct offset_v2 { + __le64 v; +} __attribute__ ((__packed__)); + +static inline __u16 offset_v2_k_type(const struct offset_v2 *v2) +{ + __u8 type = le64_to_cpu(v2->v) >> 60; + return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY; +} + +static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type) +{ + v2->v = + (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60); +} + +static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2) +{ + return le64_to_cpu(v2->v) & (~0ULL >> 4); +} + +static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset) +{ + offset &= (~0ULL >> 4); + v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset); +} + +/* + * Key of an item determines its location in the S+tree, and + * is composed of 4 components + */ +struct reiserfs_key { + /* packing locality: by default parent directory object id */ + __le32 k_dir_id; + + __le32 k_objectid; /* object identifier */ + union { + struct offset_v1 k_offset_v1; + struct offset_v2 k_offset_v2; + } __attribute__ ((__packed__)) u; +} __attribute__ ((__packed__)); + +struct in_core_key { + /* packing locality: by default parent directory object id */ + __u32 k_dir_id; + __u32 k_objectid; /* object identifier */ + __u64 k_offset; + __u8 k_type; +}; + +struct cpu_key { + struct in_core_key on_disk_key; + int version; + /* 3 in all cases but direct2indirect and indirect2direct conversion */ + int key_length; +}; + +/* + * Our function for comparing keys can compare keys of different + * lengths. It takes as a parameter the length of the keys it is to + * compare. These defines are used in determining what is to be passed + * to it as that parameter. + */ +#define REISERFS_FULL_KEY_LEN 4 +#define REISERFS_SHORT_KEY_LEN 2 + +/* The result of the key compare */ +#define FIRST_GREATER 1 +#define SECOND_GREATER -1 +#define KEYS_IDENTICAL 0 +#define KEY_FOUND 1 +#define KEY_NOT_FOUND 0 + +#define KEY_SIZE (sizeof(struct reiserfs_key)) +#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32)) + +/* return values for search_by_key and clones */ +#define ITEM_FOUND 1 +#define ITEM_NOT_FOUND 0 +#define ENTRY_FOUND 1 +#define ENTRY_NOT_FOUND 0 +#define DIRECTORY_NOT_FOUND -1 +#define REGULAR_FILE_FOUND -2 +#define DIRECTORY_FOUND -3 +#define BYTE_FOUND 1 +#define BYTE_NOT_FOUND 0 +#define FILE_NOT_FOUND -1 + +#define POSITION_FOUND 1 +#define POSITION_NOT_FOUND 0 + +/* return values for reiserfs_find_entry and search_by_entry_key */ +#define NAME_FOUND 1 +#define NAME_NOT_FOUND 0 +#define GOTO_PREVIOUS_ITEM 2 +#define NAME_FOUND_INVISIBLE 3 + +/* + * Everything in the filesystem is stored as a set of items. The + * item head contains the key of the item, its free space (for + * indirect items) and specifies the location of the item itself + * within the block. + */ + +struct item_head { + /* + * Everything in the tree is found by searching for it based on + * its key. + */ + struct reiserfs_key ih_key; + union { + /* + * The free space in the last unformatted node of an + * indirect item if this is an indirect item. This + * equals 0xFFFF iff this is a direct item or stat data + * item. Note that the key, not this field, is used to + * determine the item type, and thus which field this + * union contains. + */ + __le16 ih_free_space_reserved; + + /* + * Iff this is a directory item, this field equals the + * number of directory entries in the directory item. + */ + __le16 ih_entry_count; + } __attribute__ ((__packed__)) u; + __le16 ih_item_len; /* total size of the item body */ + + /* an offset to the item body within the block */ + __le16 ih_item_location; + + /* + * 0 for all old items, 2 for new ones. Highest bit is set by fsck + * temporary, cleaned after all done + */ + __le16 ih_version; +} __attribute__ ((__packed__)); +/* size of item header */ +#define IH_SIZE (sizeof(struct item_head)) + +#define ih_free_space(ih) le16_to_cpu((ih)->u.ih_free_space_reserved) +#define ih_version(ih) le16_to_cpu((ih)->ih_version) +#define ih_entry_count(ih) le16_to_cpu((ih)->u.ih_entry_count) +#define ih_location(ih) le16_to_cpu((ih)->ih_item_location) +#define ih_item_len(ih) le16_to_cpu((ih)->ih_item_len) + +#define put_ih_free_space(ih, val) do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0) +#define put_ih_version(ih, val) do { (ih)->ih_version = cpu_to_le16(val); } while (0) +#define put_ih_entry_count(ih, val) do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0) +#define put_ih_location(ih, val) do { (ih)->ih_item_location = cpu_to_le16(val); } while (0) +#define put_ih_item_len(ih, val) do { (ih)->ih_item_len = cpu_to_le16(val); } while (0) + +#define unreachable_item(ih) (ih_version(ih) & (1 << 15)) + +#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih)) +#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val))) + +/* + * these operate on indirect items, where you've got an array of ints + * at a possibly unaligned location. These are a noop on ia32 + * + * p is the array of __u32, i is the index into the array, v is the value + * to store there. + */ +#define get_block_num(p, i) get_unaligned_le32((p) + (i)) +#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i)) + +/* * in old version uniqueness field shows key type */ +#define V1_SD_UNIQUENESS 0 +#define V1_INDIRECT_UNIQUENESS 0xfffffffe +#define V1_DIRECT_UNIQUENESS 0xffffffff +#define V1_DIRENTRY_UNIQUENESS 500 +#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */ + +/* here are conversion routines */ +static inline int uniqueness2type(__u32 uniqueness) CONSTF; +static inline int uniqueness2type(__u32 uniqueness) +{ + switch ((int)uniqueness) { + case V1_SD_UNIQUENESS: + return TYPE_STAT_DATA; + case V1_INDIRECT_UNIQUENESS: + return TYPE_INDIRECT; + case V1_DIRECT_UNIQUENESS: + return TYPE_DIRECT; + case V1_DIRENTRY_UNIQUENESS: + return TYPE_DIRENTRY; + case V1_ANY_UNIQUENESS: + default: + return TYPE_ANY; + } +} + +static inline __u32 type2uniqueness(int type) CONSTF; +static inline __u32 type2uniqueness(int type) +{ + switch (type) { + case TYPE_STAT_DATA: + return V1_SD_UNIQUENESS; + case TYPE_INDIRECT: + return V1_INDIRECT_UNIQUENESS; + case TYPE_DIRECT: + return V1_DIRECT_UNIQUENESS; + case TYPE_DIRENTRY: + return V1_DIRENTRY_UNIQUENESS; + case TYPE_ANY: + default: + return V1_ANY_UNIQUENESS; + } +} + +/* + * key is pointer to on disk key which is stored in le, result is cpu, + * there is no way to get version of object from key, so, provide + * version to these defines + */ +static inline loff_t le_key_k_offset(int version, + const struct reiserfs_key *key) +{ + return (version == KEY_FORMAT_3_5) ? + le32_to_cpu(key->u.k_offset_v1.k_offset) : + offset_v2_k_offset(&(key->u.k_offset_v2)); +} + +static inline loff_t le_ih_k_offset(const struct item_head *ih) +{ + return le_key_k_offset(ih_version(ih), &(ih->ih_key)); +} + +static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key) +{ + if (version == KEY_FORMAT_3_5) { + loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness); + return uniqueness2type(val); + } else + return offset_v2_k_type(&(key->u.k_offset_v2)); +} + +static inline loff_t le_ih_k_type(const struct item_head *ih) +{ + return le_key_k_type(ih_version(ih), &(ih->ih_key)); +} + +static inline void set_le_key_k_offset(int version, struct reiserfs_key *key, + loff_t offset) +{ + if (version == KEY_FORMAT_3_5) + key->u.k_offset_v1.k_offset = cpu_to_le32(offset); + else + set_offset_v2_k_offset(&key->u.k_offset_v2, offset); +} + +static inline void add_le_key_k_offset(int version, struct reiserfs_key *key, + loff_t offset) +{ + set_le_key_k_offset(version, key, + le_key_k_offset(version, key) + offset); +} + +static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset) +{ + add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); +} + +static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset) +{ + set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); +} + +static inline void set_le_key_k_type(int version, struct reiserfs_key *key, + int type) +{ + if (version == KEY_FORMAT_3_5) { + type = type2uniqueness(type); + key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type); + } else + set_offset_v2_k_type(&key->u.k_offset_v2, type); +} + +static inline void set_le_ih_k_type(struct item_head *ih, int type) +{ + set_le_key_k_type(ih_version(ih), &(ih->ih_key), type); +} + +static inline int is_direntry_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_DIRENTRY; +} + +static inline int is_direct_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_DIRECT; +} + +static inline int is_indirect_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_INDIRECT; +} + +static inline int is_statdata_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_STAT_DATA; +} + +/* item header has version. */ +static inline int is_direntry_le_ih(struct item_head *ih) +{ + return is_direntry_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_direct_le_ih(struct item_head *ih) +{ + return is_direct_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_indirect_le_ih(struct item_head *ih) +{ + return is_indirect_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_statdata_le_ih(struct item_head *ih) +{ + return is_statdata_le_key(ih_version(ih), &ih->ih_key); +} + +/* key is pointer to cpu key, result is cpu */ +static inline loff_t cpu_key_k_offset(const struct cpu_key *key) +{ + return key->on_disk_key.k_offset; +} + +static inline loff_t cpu_key_k_type(const struct cpu_key *key) +{ + return key->on_disk_key.k_type; +} + +static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset) +{ + key->on_disk_key.k_offset = offset; +} + +static inline void set_cpu_key_k_type(struct cpu_key *key, int type) +{ + key->on_disk_key.k_type = type; +} + +static inline void cpu_key_k_offset_dec(struct cpu_key *key) +{ + key->on_disk_key.k_offset--; +} + +#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY) +#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT) +#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT) +#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA) + +/* are these used ? */ +#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key))) +#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key))) +#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key))) +#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key))) + +#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \ + (!COMP_SHORT_KEYS(ih, key) && \ + I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize)) + +/* maximal length of item */ +#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE) +#define MIN_ITEM_LEN 1 + +/* object identifier for root dir */ +#define REISERFS_ROOT_OBJECTID 2 +#define REISERFS_ROOT_PARENT_OBJECTID 1 + +extern struct reiserfs_key root_key; + +/* + * Picture represents a leaf of the S+tree + * ______________________________________________________ + * | | Array of | | | + * |Block | Object-Item | F r e e | Objects- | + * | head | Headers | S p a c e | Items | + * |______|_______________|___________________|___________| + */ + +/* + * Header of a disk block. More precisely, header of a formatted leaf + * or internal node, and not the header of an unformatted node. + */ +struct block_head { + __le16 blk_level; /* Level of a block in the tree. */ + __le16 blk_nr_item; /* Number of keys/items in a block. */ + __le16 blk_free_space; /* Block free space in bytes. */ + __le16 blk_reserved; + /* dump this in v4/planA */ + + /* kept only for compatibility */ + struct reiserfs_key blk_right_delim_key; +}; + +#define BLKH_SIZE (sizeof(struct block_head)) +#define blkh_level(p_blkh) (le16_to_cpu((p_blkh)->blk_level)) +#define blkh_nr_item(p_blkh) (le16_to_cpu((p_blkh)->blk_nr_item)) +#define blkh_free_space(p_blkh) (le16_to_cpu((p_blkh)->blk_free_space)) +#define blkh_reserved(p_blkh) (le16_to_cpu((p_blkh)->blk_reserved)) +#define set_blkh_level(p_blkh,val) ((p_blkh)->blk_level = cpu_to_le16(val)) +#define set_blkh_nr_item(p_blkh,val) ((p_blkh)->blk_nr_item = cpu_to_le16(val)) +#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val)) +#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val)) +#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key) +#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val) + +/* values for blk_level field of the struct block_head */ + +/* + * When node gets removed from the tree its blk_level is set to FREE_LEVEL. + * It is then used to see whether the node is still in the tree + */ +#define FREE_LEVEL 0 + +#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */ + +/* + * Given the buffer head of a formatted node, resolve to the + * block head of that node. + */ +#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data)) +/* Number of items that are in buffer. */ +#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh))) +#define B_LEVEL(bh) (blkh_level(B_BLK_HEAD(bh))) +#define B_FREE_SPACE(bh) (blkh_free_space(B_BLK_HEAD(bh))) + +#define PUT_B_NR_ITEMS(bh, val) do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0) +#define PUT_B_LEVEL(bh, val) do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0) +#define PUT_B_FREE_SPACE(bh, val) do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0) + +/* Get right delimiting key. -- little endian */ +#define B_PRIGHT_DELIM_KEY(bh) (&(blk_right_delim_key(B_BLK_HEAD(bh)))) + +/* Does the buffer contain a disk leaf. */ +#define B_IS_ITEMS_LEVEL(bh) (B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL) + +/* Does the buffer contain a disk internal node */ +#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \ + && B_LEVEL(bh) <= MAX_HEIGHT) + +/*************************************************************************** + * STAT DATA * + ***************************************************************************/ + +/* + * old stat data is 32 bytes long. We are going to distinguish new one by + * different size +*/ +struct stat_data_v1 { + __le16 sd_mode; /* file type, permissions */ + __le16 sd_nlink; /* number of hard links */ + __le16 sd_uid; /* owner */ + __le16 sd_gid; /* group */ + __le32 sd_size; /* file size */ + __le32 sd_atime; /* time of last access */ + __le32 sd_mtime; /* time file was last modified */ + + /* + * time inode (stat data) was last changed + * (except changes to sd_atime and sd_mtime) + */ + __le32 sd_ctime; + union { + __le32 sd_rdev; + __le32 sd_blocks; /* number of blocks file uses */ + } __attribute__ ((__packed__)) u; + + /* + * first byte of file which is stored in a direct item: except that if + * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no + * direct item. The existence of this field really grates on me. + * Let's replace it with a macro based on sd_size and our tail + * suppression policy. Someday. -Hans + */ + __le32 sd_first_direct_byte; +} __attribute__ ((__packed__)); + +#define SD_V1_SIZE (sizeof(struct stat_data_v1)) +#define stat_data_v1(ih) (ih_version (ih) == KEY_FORMAT_3_5) +#define sd_v1_mode(sdp) (le16_to_cpu((sdp)->sd_mode)) +#define set_sd_v1_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v)) +#define sd_v1_nlink(sdp) (le16_to_cpu((sdp)->sd_nlink)) +#define set_sd_v1_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le16(v)) +#define sd_v1_uid(sdp) (le16_to_cpu((sdp)->sd_uid)) +#define set_sd_v1_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le16(v)) +#define sd_v1_gid(sdp) (le16_to_cpu((sdp)->sd_gid)) +#define set_sd_v1_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le16(v)) +#define sd_v1_size(sdp) (le32_to_cpu((sdp)->sd_size)) +#define set_sd_v1_size(sdp,v) ((sdp)->sd_size = cpu_to_le32(v)) +#define sd_v1_atime(sdp) (le32_to_cpu((sdp)->sd_atime)) +#define set_sd_v1_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v)) +#define sd_v1_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime)) +#define set_sd_v1_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v)) +#define sd_v1_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime)) +#define set_sd_v1_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v)) +#define sd_v1_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev)) +#define set_sd_v1_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v)) +#define sd_v1_blocks(sdp) (le32_to_cpu((sdp)->u.sd_blocks)) +#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v)) +#define sd_v1_first_direct_byte(sdp) \ + (le32_to_cpu((sdp)->sd_first_direct_byte)) +#define set_sd_v1_first_direct_byte(sdp,v) \ + ((sdp)->sd_first_direct_byte = cpu_to_le32(v)) + +/* inode flags stored in sd_attrs (nee sd_reserved) */ + +/* + * we want common flags to have the same values as in ext2, + * so chattr(1) will work without problems + */ +#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL +#define REISERFS_APPEND_FL FS_APPEND_FL +#define REISERFS_SYNC_FL FS_SYNC_FL +#define REISERFS_NOATIME_FL FS_NOATIME_FL +#define REISERFS_NODUMP_FL FS_NODUMP_FL +#define REISERFS_SECRM_FL FS_SECRM_FL +#define REISERFS_UNRM_FL FS_UNRM_FL +#define REISERFS_COMPR_FL FS_COMPR_FL +#define REISERFS_NOTAIL_FL FS_NOTAIL_FL + +/* persistent flags that file inherits from the parent directory */ +#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \ + REISERFS_SYNC_FL | \ + REISERFS_NOATIME_FL | \ + REISERFS_NODUMP_FL | \ + REISERFS_SECRM_FL | \ + REISERFS_COMPR_FL | \ + REISERFS_NOTAIL_FL ) + +/* + * Stat Data on disk (reiserfs version of UFS disk inode minus the + * address blocks) + */ +struct stat_data { + __le16 sd_mode; /* file type, permissions */ + __le16 sd_attrs; /* persistent inode flags */ + __le32 sd_nlink; /* number of hard links */ + __le64 sd_size; /* file size */ + __le32 sd_uid; /* owner */ + __le32 sd_gid; /* group */ + __le32 sd_atime; /* time of last access */ + __le32 sd_mtime; /* time file was last modified */ + + /* + * time inode (stat data) was last changed + * (except changes to sd_atime and sd_mtime) + */ + __le32 sd_ctime; + __le32 sd_blocks; + union { + __le32 sd_rdev; + __le32 sd_generation; + } __attribute__ ((__packed__)) u; +} __attribute__ ((__packed__)); + +/* this is 44 bytes long */ +#define SD_SIZE (sizeof(struct stat_data)) +#define SD_V2_SIZE SD_SIZE +#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6) +#define sd_v2_mode(sdp) (le16_to_cpu((sdp)->sd_mode)) +#define set_sd_v2_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v)) +/* sd_reserved */ +/* set_sd_reserved */ +#define sd_v2_nlink(sdp) (le32_to_cpu((sdp)->sd_nlink)) +#define set_sd_v2_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le32(v)) +#define sd_v2_size(sdp) (le64_to_cpu((sdp)->sd_size)) +#define set_sd_v2_size(sdp,v) ((sdp)->sd_size = cpu_to_le64(v)) +#define sd_v2_uid(sdp) (le32_to_cpu((sdp)->sd_uid)) +#define set_sd_v2_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le32(v)) +#define sd_v2_gid(sdp) (le32_to_cpu((sdp)->sd_gid)) +#define set_sd_v2_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le32(v)) +#define sd_v2_atime(sdp) (le32_to_cpu((sdp)->sd_atime)) +#define set_sd_v2_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v)) +#define sd_v2_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime)) +#define set_sd_v2_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v)) +#define sd_v2_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime)) +#define set_sd_v2_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v)) +#define sd_v2_blocks(sdp) (le32_to_cpu((sdp)->sd_blocks)) +#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v)) +#define sd_v2_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev)) +#define set_sd_v2_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v)) +#define sd_v2_generation(sdp) (le32_to_cpu((sdp)->u.sd_generation)) +#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v)) +#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs)) +#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v)) + +/*************************************************************************** + * DIRECTORY STRUCTURE * + ***************************************************************************/ +/* + * Picture represents the structure of directory items + * ________________________________________________ + * | Array of | | | | | | + * | directory |N-1| N-2 | .... | 1st |0th| + * | entry headers | | | | | | + * |_______________|___|_____|________|_______|___| + * <---- directory entries ------> + * + * First directory item has k_offset component 1. We store "." and ".." + * in one item, always, we never split "." and ".." into differing + * items. This makes, among other things, the code for removing + * directories simpler. + */ +#define SD_OFFSET 0 +#define SD_UNIQUENESS 0 +#define DOT_OFFSET 1 +#define DOT_DOT_OFFSET 2 +#define DIRENTRY_UNIQUENESS 500 + +#define FIRST_ITEM_OFFSET 1 + +/* + * Q: How to get key of object pointed to by entry from entry? + * + * A: Each directory entry has its header. This header has deh_dir_id + * and deh_objectid fields, those are key of object, entry points to + */ + +/* + * NOT IMPLEMENTED: + * Directory will someday contain stat data of object + */ + +struct reiserfs_de_head { + __le32 deh_offset; /* third component of the directory entry key */ + + /* + * objectid of the parent directory of the object, that is referenced + * by directory entry + */ + __le32 deh_dir_id; + + /* objectid of the object, that is referenced by directory entry */ + __le32 deh_objectid; + __le16 deh_location; /* offset of name in the whole item */ + + /* + * whether 1) entry contains stat data (for future), and + * 2) whether entry is hidden (unlinked) + */ + __le16 deh_state; +} __attribute__ ((__packed__)); +#define DEH_SIZE sizeof(struct reiserfs_de_head) +#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset)) +#define deh_dir_id(p_deh) (le32_to_cpu((p_deh)->deh_dir_id)) +#define deh_objectid(p_deh) (le32_to_cpu((p_deh)->deh_objectid)) +#define deh_location(p_deh) (le16_to_cpu((p_deh)->deh_location)) +#define deh_state(p_deh) (le16_to_cpu((p_deh)->deh_state)) + +#define put_deh_offset(p_deh,v) ((p_deh)->deh_offset = cpu_to_le32((v))) +#define put_deh_dir_id(p_deh,v) ((p_deh)->deh_dir_id = cpu_to_le32((v))) +#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v))) +#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v))) +#define put_deh_state(p_deh,v) ((p_deh)->deh_state = cpu_to_le16((v))) + +/* empty directory contains two entries "." and ".." and their headers */ +#define EMPTY_DIR_SIZE \ +(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen (".."))) + +/* old format directories have this size when empty */ +#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3) + +#define DEH_Statdata 0 /* not used now */ +#define DEH_Visible 2 + +/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */ +#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__) +# define ADDR_UNALIGNED_BITS (3) +#endif + +/* + * These are only used to manipulate deh_state. + * Because of this, we'll use the ext2_ bit routines, + * since they are little endian + */ +#ifdef ADDR_UNALIGNED_BITS + +# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1))) +# define unaligned_offset(addr) (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3) + +# define set_bit_unaligned(nr, addr) \ + __test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) +# define clear_bit_unaligned(nr, addr) \ + __test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) +# define test_bit_unaligned(nr, addr) \ + test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) + +#else + +# define set_bit_unaligned(nr, addr) __test_and_set_bit_le(nr, addr) +# define clear_bit_unaligned(nr, addr) __test_and_clear_bit_le(nr, addr) +# define test_bit_unaligned(nr, addr) test_bit_le(nr, addr) + +#endif + +#define mark_de_with_sd(deh) set_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define mark_de_without_sd(deh) clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define mark_de_visible(deh) set_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#define mark_de_hidden(deh) clear_bit_unaligned (DEH_Visible, &((deh)->deh_state)) + +#define de_with_sd(deh) test_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#define de_hidden(deh) !test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) + +extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid); +extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid); + +/* two entries per block (at least) */ +#define REISERFS_MAX_NAME(block_size) 255 + +/* + * this structure is used for operations on directory entries. It is + * not a disk structure. + * + * When reiserfs_find_entry or search_by_entry_key find directory + * entry, they return filled reiserfs_dir_entry structure + */ +struct reiserfs_dir_entry { + struct buffer_head *de_bh; + int de_item_num; + struct item_head *de_ih; + int de_entry_num; + struct reiserfs_de_head *de_deh; + int de_entrylen; + int de_namelen; + char *de_name; + unsigned long *de_gen_number_bit_string; + + __u32 de_dir_id; + __u32 de_objectid; + + struct cpu_key de_entry_key; +}; + +/* + * these defines are useful when a particular member of + * a reiserfs_dir_entry is needed + */ + +/* pointer to file name, stored in entry */ +#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \ + (ih_item_body(bh, ih) + deh_location(deh)) + +/* length of name */ +#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \ +(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0)) + +/* hash value occupies bits from 7 up to 30 */ +#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL) +/* generation number occupies 7 bits starting from 0 up to 6 */ +#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL) +#define MAX_GENERATION_NUMBER 127 + +#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number)) + +/* + * Picture represents an internal node of the reiserfs tree + * ______________________________________________________ + * | | Array of | Array of | Free | + * |block | keys | pointers | space | + * | head | N | N+1 | | + * |______|_______________|___________________|___________| + */ + +/*************************************************************************** + * DISK CHILD * + ***************************************************************************/ +/* + * Disk child pointer: + * The pointer from an internal node of the tree to a node that is on disk. + */ +struct disk_child { + __le32 dc_block_number; /* Disk child's block number. */ + __le16 dc_size; /* Disk child's used space. */ + __le16 dc_reserved; +}; + +#define DC_SIZE (sizeof(struct disk_child)) +#define dc_block_number(dc_p) (le32_to_cpu((dc_p)->dc_block_number)) +#define dc_size(dc_p) (le16_to_cpu((dc_p)->dc_size)) +#define put_dc_block_number(dc_p, val) do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0) +#define put_dc_size(dc_p, val) do { (dc_p)->dc_size = cpu_to_le16(val); } while(0) + +/* Get disk child by buffer header and position in the tree node. */ +#define B_N_CHILD(bh, n_pos) ((struct disk_child *)\ +((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos))) + +/* Get disk child number by buffer header and position in the tree node. */ +#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos))) +#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \ + (put_dc_block_number(B_N_CHILD(bh, n_pos), val)) + + /* maximal value of field child_size in structure disk_child */ + /* child size is the combined size of all items and their headers */ +#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE )) + +/* amount of used space in buffer (not including block head) */ +#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur))) + +/* max and min number of keys in internal node */ +#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) ) +#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2) + +/*************************************************************************** + * PATH STRUCTURES AND DEFINES * + ***************************************************************************/ + +/* + * search_by_key fills up the path from the root to the leaf as it descends + * the tree looking for the key. It uses reiserfs_bread to try to find + * buffers in the cache given their block number. If it does not find + * them in the cache it reads them from disk. For each node search_by_key + * finds using reiserfs_bread it then uses bin_search to look through that + * node. bin_search will find the position of the block_number of the next + * node if it is looking through an internal node. If it is looking through + * a leaf node bin_search will find the position of the item which has key + * either equal to given key, or which is the maximal key less than the + * given key. + */ + +struct path_element { + /* Pointer to the buffer at the path in the tree. */ + struct buffer_head *pe_buffer; + /* Position in the tree node which is placed in the buffer above. */ + int pe_position; +}; + +/* + * maximal height of a tree. don't change this without + * changing JOURNAL_PER_BALANCE_CNT + */ +#define MAX_HEIGHT 5 + +/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ +#define EXTENDED_MAX_HEIGHT 7 + +/* Must be equal to at least 2. */ +#define FIRST_PATH_ELEMENT_OFFSET 2 + +/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ +#define ILLEGAL_PATH_ELEMENT_OFFSET 1 + +/* this MUST be MAX_HEIGHT + 1. See about FEB below */ +#define MAX_FEB_SIZE 6 + +/* + * We need to keep track of who the ancestors of nodes are. When we + * perform a search we record which nodes were visited while + * descending the tree looking for the node we searched for. This list + * of nodes is called the path. This information is used while + * performing balancing. Note that this path information may become + * invalid, and this means we must check it when using it to see if it + * is still valid. You'll need to read search_by_key and the comments + * in it, especially about decrement_counters_in_path(), to understand + * this structure. + * + * Paths make the code so much harder to work with and debug.... An + * enormous number of bugs are due to them, and trying to write or modify + * code that uses them just makes my head hurt. They are based on an + * excessive effort to avoid disturbing the precious VFS code.:-( The + * gods only know how we are going to SMP the code that uses them. + * znodes are the way! + */ + +#define PATH_READA 0x1 /* do read ahead */ +#define PATH_READA_BACK 0x2 /* read backwards */ + +struct treepath { + int path_length; /* Length of the array above. */ + int reada; + /* Array of the path elements. */ + struct path_element path_elements[EXTENDED_MAX_HEIGHT]; + int pos_in_item; +}; + +#define pos_in_item(path) ((path)->pos_in_item) + +#define INITIALIZE_PATH(var) \ +struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} + +/* Get path element by path and path position. */ +#define PATH_OFFSET_PELEMENT(path, n_offset) ((path)->path_elements + (n_offset)) + +/* Get buffer header at the path by path and path position. */ +#define PATH_OFFSET_PBUFFER(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer) + +/* Get position in the element at the path by path and path position. */ +#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position) + +#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length)) + +/* + * you know, to the person who didn't write this the macro name does not + * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or + * maybe we should just focus on dumping paths... -Hans + */ +#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length)) + +/* + * in do_balance leaf has h == 0 in contrast with path structure, + * where root has level == 0. That is why we need these defines + */ + +/* tb->S[h] */ +#define PATH_H_PBUFFER(path, h) \ + PATH_OFFSET_PBUFFER(path, path->path_length - (h)) + +/* tb->F[h] or tb->S[0]->b_parent */ +#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1) + +#define PATH_H_POSITION(path, h) \ + PATH_OFFSET_POSITION(path, path->path_length - (h)) + +/* tb->S[h]->b_item_order */ +#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) + +#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h)) + +static inline void *reiserfs_node_data(const struct buffer_head *bh) +{ + return bh->b_data + sizeof(struct block_head); +} + +/* get key from internal node */ +static inline struct reiserfs_key *internal_key(struct buffer_head *bh, + int item_num) +{ + struct reiserfs_key *key = reiserfs_node_data(bh); + + return &key[item_num]; +} + +/* get the item header from leaf node */ +static inline struct item_head *item_head(const struct buffer_head *bh, + int item_num) +{ + struct item_head *ih = reiserfs_node_data(bh); + + return &ih[item_num]; +} + +/* get the key from leaf node */ +static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh, + int item_num) +{ + return &item_head(bh, item_num)->ih_key; +} + +static inline void *ih_item_body(const struct buffer_head *bh, + const struct item_head *ih) +{ + return bh->b_data + ih_location(ih); +} + +/* get item body from leaf node */ +static inline void *item_body(const struct buffer_head *bh, int item_num) +{ + return ih_item_body(bh, item_head(bh, item_num)); +} + +static inline struct item_head *tp_item_head(const struct treepath *path) +{ + return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); +} + +static inline void *tp_item_body(const struct treepath *path) +{ + return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); +} + +#define get_last_bh(path) PATH_PLAST_BUFFER(path) +#define get_item_pos(path) PATH_LAST_POSITION(path) +#define item_moved(ih,path) comp_items(ih, path) +#define path_changed(ih,path) comp_items (ih, path) + +/* array of the entry headers */ + /* get item body */ +#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih))) + +/* + * length of the directory entry in directory item. This define + * calculates length of i-th directory entry using directory entry + * locations from dir entry head. When it calculates length of 0-th + * directory entry, it uses length of whole item in place of entry + * location of the non-existent following entry in the calculation. + * See picture above. + */ +static inline int entry_length(const struct buffer_head *bh, + const struct item_head *ih, int pos_in_item) +{ + struct reiserfs_de_head *deh; + + deh = B_I_DEH(bh, ih) + pos_in_item; + if (pos_in_item) + return deh_location(deh - 1) - deh_location(deh); + + return ih_item_len(ih) - deh_location(deh); +} + +/*************************************************************************** + * MISC * + ***************************************************************************/ + +/* Size of pointer to the unformatted node. */ +#define UNFM_P_SIZE (sizeof(unp_t)) +#define UNFM_P_SHIFT 2 + +/* in in-core inode key is stored on le form */ +#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key)) + +#define MAX_UL_INT 0xffffffff +#define MAX_INT 0x7ffffff +#define MAX_US_INT 0xffff + +// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset +static inline loff_t max_reiserfs_offset(struct inode *inode) +{ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) + return (loff_t) U32_MAX; + + return (loff_t) ((~(__u64) 0) >> 4); +} + +#define MAX_KEY_OBJECTID MAX_UL_INT + +#define MAX_B_NUM MAX_UL_INT +#define MAX_FC_NUM MAX_US_INT + +/* the purpose is to detect overflow of an unsigned short */ +#define REISERFS_LINK_MAX (MAX_US_INT - 1000) + +/* + * The following defines are used in reiserfs_insert_item + * and reiserfs_append_item + */ +#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */ +#define REISERFS_USER_MEM 1 /* user memory mode */ + +#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter) +#define get_generation(s) atomic_read (&fs_generation(s)) +#define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) +#define __fs_changed(gen,s) (gen != get_generation (s)) +#define fs_changed(gen,s) \ +({ \ + reiserfs_cond_resched(s); \ + __fs_changed(gen, s); \ +}) + +/*************************************************************************** + * FIXATE NODES * + ***************************************************************************/ + +#define VI_TYPE_LEFT_MERGEABLE 1 +#define VI_TYPE_RIGHT_MERGEABLE 2 + +/* + * To make any changes in the tree we always first find node, that + * contains item to be changed/deleted or place to insert a new + * item. We call this node S. To do balancing we need to decide what + * we will shift to left/right neighbor, or to a new node, where new + * item will be etc. To make this analysis simpler we build virtual + * node. Virtual node is an array of items, that will replace items of + * node S. (For instance if we are going to delete an item, virtual + * node does not contain it). Virtual node keeps information about + * item sizes and types, mergeability of first and last items, sizes + * of all entries in directory item. We use this array of items when + * calculating what we can shift to neighbors and how many nodes we + * have to have if we do not any shiftings, if we shift to left/right + * neighbor or to both. + */ +struct virtual_item { + int vi_index; /* index in the array of item operations */ + unsigned short vi_type; /* left/right mergeability */ + + /* length of item that it will have after balancing */ + unsigned short vi_item_len; + + struct item_head *vi_ih; + const char *vi_item; /* body of item (old or new) */ + const void *vi_new_data; /* 0 always but paste mode */ + void *vi_uarea; /* item specific area */ +}; + +struct virtual_node { + /* this is a pointer to the free space in the buffer */ + char *vn_free_ptr; + + unsigned short vn_nr_item; /* number of items in virtual node */ + + /* + * size of node , that node would have if it has + * unlimited size and no balancing is performed + */ + short vn_size; + + /* mode of balancing (paste, insert, delete, cut) */ + short vn_mode; + + short vn_affected_item_num; + short vn_pos_in_item; + + /* item header of inserted item, 0 for other modes */ + struct item_head *vn_ins_ih; + const void *vn_data; + + /* array of items (including a new one, excluding item to be deleted) */ + struct virtual_item *vn_vi; +}; + +/* used by directory items when creating virtual nodes */ +struct direntry_uarea { + int flags; + __u16 entry_count; + __u16 entry_sizes[1]; +} __attribute__ ((__packed__)); + +/*************************************************************************** + * TREE BALANCE * + ***************************************************************************/ + +/* + * This temporary structure is used in tree balance algorithms, and + * constructed as we go to the extent that its various parts are + * needed. It contains arrays of nodes that can potentially be + * involved in the balancing of node S, and parameters that define how + * each of the nodes must be balanced. Note that in these algorithms + * for balancing the worst case is to need to balance the current node + * S and the left and right neighbors and all of their parents plus + * create a new node. We implement S1 balancing for the leaf nodes + * and S0 balancing for the internal nodes (S1 and S0 are defined in + * our papers.) + */ + +/* size of the array of buffers to free at end of do_balance */ +#define MAX_FREE_BLOCK 7 + +/* maximum number of FEB blocknrs on a single level */ +#define MAX_AMOUNT_NEEDED 2 + +/* someday somebody will prefix every field in this struct with tb_ */ +struct tree_balance { + int tb_mode; + int need_balance_dirty; + struct super_block *tb_sb; + struct reiserfs_transaction_handle *transaction_handle; + struct treepath *tb_path; + + /* array of left neighbors of nodes in the path */ + struct buffer_head *L[MAX_HEIGHT]; + + /* array of right neighbors of nodes in the path */ + struct buffer_head *R[MAX_HEIGHT]; + + /* array of fathers of the left neighbors */ + struct buffer_head *FL[MAX_HEIGHT]; + + /* array of fathers of the right neighbors */ + struct buffer_head *FR[MAX_HEIGHT]; + /* array of common parents of center node and its left neighbor */ + struct buffer_head *CFL[MAX_HEIGHT]; + + /* array of common parents of center node and its right neighbor */ + struct buffer_head *CFR[MAX_HEIGHT]; + + /* + * array of empty buffers. Number of buffers in array equals + * cur_blknum. + */ + struct buffer_head *FEB[MAX_FEB_SIZE]; + struct buffer_head *used[MAX_FEB_SIZE]; + struct buffer_head *thrown[MAX_FEB_SIZE]; + + /* + * array of number of items which must be shifted to the left in + * order to balance the current node; for leaves includes item that + * will be partially shifted; for internal nodes, it is the number + * of child pointers rather than items. It includes the new item + * being created. The code sometimes subtracts one to get the + * number of wholly shifted items for other purposes. + */ + int lnum[MAX_HEIGHT]; + + /* substitute right for left in comment above */ + int rnum[MAX_HEIGHT]; + + /* + * array indexed by height h mapping the key delimiting L[h] and + * S[h] to its item number within the node CFL[h] + */ + int lkey[MAX_HEIGHT]; + + /* substitute r for l in comment above */ + int rkey[MAX_HEIGHT]; + + /* + * the number of bytes by we are trying to add or remove from + * S[h]. A negative value means removing. + */ + int insert_size[MAX_HEIGHT]; + + /* + * number of nodes that will replace node S[h] after balancing + * on the level h of the tree. If 0 then S is being deleted, + * if 1 then S is remaining and no new nodes are being created, + * if 2 or 3 then 1 or 2 new nodes is being created + */ + int blknum[MAX_HEIGHT]; + + /* fields that are used only for balancing leaves of the tree */ + + /* number of empty blocks having been already allocated */ + int cur_blknum; + + /* number of items that fall into left most node when S[0] splits */ + int s0num; + + /* + * number of bytes which can flow to the left neighbor from the left + * most liquid item that cannot be shifted from S[0] entirely + * if -1 then nothing will be partially shifted + */ + int lbytes; + + /* + * number of bytes which will flow to the right neighbor from the right + * most liquid item that cannot be shifted from S[0] entirely + * if -1 then nothing will be partially shifted + */ + int rbytes; + + + /* + * index into the array of item headers in + * S[0] of the affected item + */ + int item_pos; + + /* new nodes allocated to hold what could not fit into S */ + struct buffer_head *S_new[2]; + + /* + * number of items that will be placed into nodes in S_new + * when S[0] splits + */ + int snum[2]; + + /* + * number of bytes which flow to nodes in S_new when S[0] splits + * note: if S[0] splits into 3 nodes, then items do not need to be cut + */ + int sbytes[2]; + + int pos_in_item; + int zeroes_num; + + /* + * buffers which are to be freed after do_balance finishes + * by unfix_nodes + */ + struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; + + /* + * kmalloced memory. Used to create virtual node and keep + * map of dirtied bitmap blocks + */ + char *vn_buf; + + int vn_buf_size; /* size of the vn_buf */ + + /* VN starts after bitmap of bitmap blocks */ + struct virtual_node *tb_vn; + + /* + * saved value of `reiserfs_generation' counter see + * FILESYSTEM_CHANGED() macro in reiserfs_fs.h + */ + int fs_gen; + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* + * key pointer, to pass to block allocator or + * another low-level subsystem + */ + struct in_core_key key; +#endif +}; + +/* These are modes of balancing */ + +/* When inserting an item. */ +#define M_INSERT 'i' +/* + * When inserting into (directories only) or appending onto an already + * existent item. + */ +#define M_PASTE 'p' +/* When deleting an item. */ +#define M_DELETE 'd' +/* When truncating an item or removing an entry from a (directory) item. */ +#define M_CUT 'c' + +/* used when balancing on leaf level skipped (in reiserfsck) */ +#define M_INTERNAL 'n' + +/* + * When further balancing is not needed, then do_balance does not need + * to be called. + */ +#define M_SKIP_BALANCING 's' +#define M_CONVERT 'v' + +/* modes of leaf_move_items */ +#define LEAF_FROM_S_TO_L 0 +#define LEAF_FROM_S_TO_R 1 +#define LEAF_FROM_R_TO_L 2 +#define LEAF_FROM_L_TO_R 3 +#define LEAF_FROM_S_TO_SNEW 4 + +#define FIRST_TO_LAST 0 +#define LAST_TO_FIRST 1 + +/* + * used in do_balance for passing parent of node information that has + * been gotten from tb struct + */ +struct buffer_info { + struct tree_balance *tb; + struct buffer_head *bi_bh; + struct buffer_head *bi_parent; + int bi_position; +}; + +static inline struct super_block *sb_from_tb(struct tree_balance *tb) +{ + return tb ? tb->tb_sb : NULL; +} + +static inline struct super_block *sb_from_bi(struct buffer_info *bi) +{ + return bi ? sb_from_tb(bi->tb) : NULL; +} + +/* + * there are 4 types of items: stat data, directory item, indirect, direct. + * +-------------------+------------+--------------+------------+ + * | | k_offset | k_uniqueness | mergeable? | + * +-------------------+------------+--------------+------------+ + * | stat data | 0 | 0 | no | + * +-------------------+------------+--------------+------------+ + * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no | + * | non 1st directory | hash value | UNIQUENESS | yes | + * | item | | | | + * +-------------------+------------+--------------+------------+ + * | indirect item | offset + 1 |TYPE_INDIRECT | [1] | + * +-------------------+------------+--------------+------------+ + * | direct item | offset + 1 |TYPE_DIRECT | [2] | + * +-------------------+------------+--------------+------------+ + * + * [1] if this is not the first indirect item of the object + * [2] if this is not the first direct item of the object +*/ + +struct item_operations { + int (*bytes_number) (struct item_head * ih, int block_size); + void (*decrement_key) (struct cpu_key *); + int (*is_left_mergeable) (struct reiserfs_key * ih, + unsigned long bsize); + void (*print_item) (struct item_head *, char *item); + void (*check_item) (struct item_head *, char *item); + + int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi, + int is_affected, int insert_size); + int (*check_left) (struct virtual_item * vi, int free, + int start_skip, int end_skip); + int (*check_right) (struct virtual_item * vi, int free); + int (*part_size) (struct virtual_item * vi, int from, int to); + int (*unit_num) (struct virtual_item * vi); + void (*print_vi) (struct virtual_item * vi); +}; + +extern struct item_operations *item_ops[TYPE_ANY + 1]; + +#define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize) +#define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize) +#define op_print_item(ih,item) item_ops[le_ih_k_type (ih)]->print_item (ih, item) +#define op_check_item(ih,item) item_ops[le_ih_k_type (ih)]->check_item (ih, item) +#define op_create_vi(vn,vi,is_affected,insert_size) item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size) +#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip) +#define op_check_right(vi,free) item_ops[(vi)->vi_index]->check_right (vi, free) +#define op_part_size(vi,from,to) item_ops[(vi)->vi_index]->part_size (vi, from, to) +#define op_unit_num(vi) item_ops[(vi)->vi_index]->unit_num (vi) +#define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi) + +#define COMP_SHORT_KEYS comp_short_keys + +/* number of blocks pointed to by the indirect item */ +#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE) + +/* + * the used space within the unformatted node corresponding + * to pos within the item pointed to by ih + */ +#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size)) + +/* + * number of bytes contained by the direct item or the + * unformatted nodes the indirect item points to + */ + +/* following defines use reiserfs buffer header and item header */ + +/* get stat-data */ +#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) ) + +/* this is 3976 for size==4096 */ +#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE) + +/* + * indirect items consist of entries which contain blocknrs, pos + * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the + * blocknr contained by the entry pos points to + */ +#define B_I_POS_UNFM_POINTER(bh, ih, pos) \ + le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos))) +#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \ + (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val)) + +struct reiserfs_iget_args { + __u32 objectid; + __u32 dirid; +}; + +/*************************************************************************** + * FUNCTION DECLARATIONS * + ***************************************************************************/ + +#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) + +#define journal_trans_half(blocksize) \ + ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32)) + +/* journal.c see journal.c for all the comments here */ + +/* first block written in a commit. */ +struct reiserfs_journal_desc { + __le32 j_trans_id; /* id of commit */ + + /* length of commit. len +1 is the commit block */ + __le32 j_len; + + __le32 j_mount_id; /* mount id of this trans */ + __le32 j_realblock[1]; /* real locations for each block */ +}; + +#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id) +#define get_desc_trans_len(d) le32_to_cpu((d)->j_len) +#define get_desc_mount_id(d) le32_to_cpu((d)->j_mount_id) + +#define set_desc_trans_id(d,val) do { (d)->j_trans_id = cpu_to_le32 (val); } while (0) +#define set_desc_trans_len(d,val) do { (d)->j_len = cpu_to_le32 (val); } while (0) +#define set_desc_mount_id(d,val) do { (d)->j_mount_id = cpu_to_le32 (val); } while (0) + +/* last block written in a commit */ +struct reiserfs_journal_commit { + __le32 j_trans_id; /* must match j_trans_id from the desc block */ + __le32 j_len; /* ditto */ + __le32 j_realblock[1]; /* real locations for each block */ +}; + +#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id) +#define get_commit_trans_len(c) le32_to_cpu((c)->j_len) +#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id) + +#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0) +#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0) + +/* + * this header block gets written whenever a transaction is considered + * fully flushed, and is more recent than the last fully flushed transaction. + * fully flushed means all the log blocks and all the real blocks are on + * disk, and this transaction does not need to be replayed. + */ +struct reiserfs_journal_header { + /* id of last fully flushed transaction */ + __le32 j_last_flush_trans_id; + + /* offset in the log of where to start replay after a crash */ + __le32 j_first_unflushed_offset; + + __le32 j_mount_id; + /* 12 */ struct journal_params jh_journal; +}; + +/* biggest tunable defines are right here */ +#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */ + +/* biggest possible single transaction, don't change for now (8/3/99) */ +#define JOURNAL_TRANS_MAX_DEFAULT 1024 +#define JOURNAL_TRANS_MIN_DEFAULT 256 + +/* + * max blocks to batch into one transaction, + * don't make this any bigger than 900 + */ +#define JOURNAL_MAX_BATCH_DEFAULT 900 +#define JOURNAL_MIN_RATIO 2 +#define JOURNAL_MAX_COMMIT_AGE 30 +#define JOURNAL_MAX_TRANS_AGE 30 +#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9) +#define JOURNAL_BLOCKS_PER_OBJECT(sb) (JOURNAL_PER_BALANCE_CNT * 3 + \ + 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \ + REISERFS_QUOTA_TRANS_BLOCKS(sb))) + +#ifdef CONFIG_QUOTA +#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA)) +/* We need to update data and inode (atime) */ +#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0) +/* 1 balancing, 1 bitmap, 1 data per write + stat data update */ +#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \ +(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0) +/* same as with INIT */ +#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \ +(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0) +#else +#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0 +#define REISERFS_QUOTA_INIT_BLOCKS(s) 0 +#define REISERFS_QUOTA_DEL_BLOCKS(s) 0 +#endif + +/* + * both of these can be as low as 1, or as high as you want. The min is the + * number of 4k bitmap nodes preallocated on mount. New nodes are allocated + * as needed, and released when transactions are committed. On release, if + * the current number of nodes is > max, the node is freed, otherwise, + * it is put on a free list for faster use later. +*/ +#define REISERFS_MIN_BITMAP_NODES 10 +#define REISERFS_MAX_BITMAP_NODES 100 + +/* these are based on journal hash size of 8192 */ +#define JBH_HASH_SHIFT 13 +#define JBH_HASH_MASK 8191 + +#define _jhashfn(sb,block) \ + (((unsigned long)sb>>L1_CACHE_SHIFT) ^ \ + (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12)))) +#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) + +/* We need these to make journal.c code more readable */ +#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) +#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) +#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) + +enum reiserfs_bh_state_bits { + BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */ + BH_JDirty_wait, + /* + * disk block was taken off free list before being in a + * finished transaction, or written to disk. Can be reused immed. + */ + BH_JNew, + BH_JPrepared, + BH_JRestore_dirty, + BH_JTest, /* debugging only will go away */ +}; + +BUFFER_FNS(JDirty, journaled); +TAS_BUFFER_FNS(JDirty, journaled); +BUFFER_FNS(JDirty_wait, journal_dirty); +TAS_BUFFER_FNS(JDirty_wait, journal_dirty); +BUFFER_FNS(JNew, journal_new); +TAS_BUFFER_FNS(JNew, journal_new); +BUFFER_FNS(JPrepared, journal_prepared); +TAS_BUFFER_FNS(JPrepared, journal_prepared); +BUFFER_FNS(JRestore_dirty, journal_restore_dirty); +TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty); +BUFFER_FNS(JTest, journal_test); +TAS_BUFFER_FNS(JTest, journal_test); + +/* transaction handle which is passed around for all journal calls */ +struct reiserfs_transaction_handle { + /* + * super for this FS when journal_begin was called. saves calls to + * reiserfs_get_super also used by nested transactions to make + * sure they are nesting on the right FS _must_ be first + * in the handle + */ + struct super_block *t_super; + + int t_refcount; + int t_blocks_logged; /* number of blocks this writer has logged */ + int t_blocks_allocated; /* number of blocks this writer allocated */ + + /* sanity check, equals the current trans id */ + unsigned int t_trans_id; + + void *t_handle_save; /* save existing current->journal_info */ + + /* + * if new block allocation occurres, that block + * should be displaced from others + */ + unsigned displace_new_blocks:1; + + struct list_head t_list; +}; + +/* + * used to keep track of ordered and tail writes, attached to the buffer + * head through b_journal_head. + */ +struct reiserfs_jh { + struct reiserfs_journal_list *jl; + struct buffer_head *bh; + struct list_head list; +}; + +void reiserfs_free_jh(struct buffer_head *bh); +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh); +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh); +int journal_mark_dirty(struct reiserfs_transaction_handle *, + struct buffer_head *bh); + +static inline int reiserfs_file_data_log(struct inode *inode) +{ + if (reiserfs_data_log(inode->i_sb) || + (REISERFS_I(inode)->i_flags & i_data_log)) + return 1; + return 0; +} + +static inline int reiserfs_transaction_running(struct super_block *s) +{ + struct reiserfs_transaction_handle *th = current->journal_info; + if (th && th->t_super == s) + return 1; + if (th && th->t_super == NULL) + BUG(); + return 0; +} + +static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th) +{ + return th->t_blocks_allocated - th->t_blocks_logged; +} + +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct + super_block + *, + int count); +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); +void reiserfs_vfs_truncate_file(struct inode *inode); +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to); +void reiserfs_flush_old_commits(struct super_block *); +int reiserfs_commit_for_inode(struct inode *); +int reiserfs_inode_needs_commit(struct inode *); +void reiserfs_update_inode_transaction(struct inode *); +void reiserfs_wait_on_write_block(struct super_block *s); +void reiserfs_block_writes(struct reiserfs_transaction_handle *th); +void reiserfs_allow_writes(struct super_block *s); +void reiserfs_check_lock_depth(struct super_block *s, char *caller); +int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, + int wait); +void reiserfs_restore_prepared_buffer(struct super_block *, + struct buffer_head *bh); +int journal_init(struct super_block *, const char *j_dev_name, int old_format, + unsigned int); +int journal_release(struct reiserfs_transaction_handle *, struct super_block *); +int journal_release_error(struct reiserfs_transaction_handle *, + struct super_block *); +int journal_end(struct reiserfs_transaction_handle *); +int journal_end_sync(struct reiserfs_transaction_handle *); +int journal_mark_freed(struct reiserfs_transaction_handle *, + struct super_block *, b_blocknr_t blocknr); +int journal_transaction_should_end(struct reiserfs_transaction_handle *, int); +int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, + int bit_nr, int searchall, b_blocknr_t *next); +int journal_begin(struct reiserfs_transaction_handle *, + struct super_block *sb, unsigned long); +int journal_join_abort(struct reiserfs_transaction_handle *, + struct super_block *sb); +void reiserfs_abort_journal(struct super_block *sb, int errno); +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); +int reiserfs_allocate_list_bitmaps(struct super_block *s, + struct reiserfs_list_bitmap *, unsigned int); + +void reiserfs_schedule_old_flush(struct super_block *s); +void add_save_link(struct reiserfs_transaction_handle *th, + struct inode *inode, int truncate); +int remove_save_link(struct inode *inode, int truncate); + +/* objectid.c */ +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th); +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, + __u32 objectid_to_release); +int reiserfs_convert_objectid_map_v1(struct super_block *); + +/* stree.c */ +int B_IS_IN_TREE(const struct buffer_head *); +extern void copy_item_head(struct item_head *to, + const struct item_head *from); + +/* first key is in cpu form, second - le */ +extern int comp_short_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key); +extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from); + +/* both are in le form */ +extern int comp_le_keys(const struct reiserfs_key *, + const struct reiserfs_key *); +extern int comp_short_le_keys(const struct reiserfs_key *, + const struct reiserfs_key *); + +/* * get key version from on disk key - kludge */ +static inline int le_key_version(const struct reiserfs_key *key) +{ + int type; + + type = offset_v2_k_type(&(key->u.k_offset_v2)); + if (type != TYPE_DIRECT && type != TYPE_INDIRECT + && type != TYPE_DIRENTRY) + return KEY_FORMAT_3_5; + + return KEY_FORMAT_3_6; + +} + +static inline void copy_key(struct reiserfs_key *to, + const struct reiserfs_key *from) +{ + memcpy(to, from, KEY_SIZE); +} + +int comp_items(const struct item_head *stored_ih, const struct treepath *path); +const struct reiserfs_key *get_rkey(const struct treepath *chk_path, + const struct super_block *sb); +int search_by_key(struct super_block *, const struct cpu_key *, + struct treepath *, int); +#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL) +int search_for_position_by_key(struct super_block *sb, + const struct cpu_key *cpu_key, + struct treepath *search_path); +extern void decrement_bcount(struct buffer_head *bh); +void decrement_counters_in_path(struct treepath *search_path); +void pathrelse(struct treepath *search_path); +int reiserfs_check_path(struct treepath *p); +void pathrelse_and_restore(struct super_block *s, struct treepath *search_path); + +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + const struct cpu_key *key, + struct item_head *ih, + struct inode *inode, const char *body); + +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + const struct cpu_key *key, + struct inode *inode, + const char *body, int paste_size); + +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + struct cpu_key *key, + struct inode *inode, + struct page *page, loff_t new_file_size); + +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + const struct cpu_key *key, + struct inode *inode, struct buffer_head *un_bh); + +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, + struct inode *inode, struct reiserfs_key *key); +int reiserfs_delete_object(struct reiserfs_transaction_handle *th, + struct inode *inode); +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, + struct inode *inode, struct page *, + int update_timestamps); + +#define i_block_size(inode) ((inode)->i_sb->s_blocksize) +#define file_size(inode) ((inode)->i_size) +#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1)) + +#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\ +!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 ) + +void padd_item(char *item, int total_length, int length); + +/* inode.c */ +/* args for the create parameter of reiserfs_get_block */ +#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ +#define GET_BLOCK_CREATE 1 /* add anything you need to find block */ +#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ +#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ +#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */ +#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ + +void reiserfs_read_locked_inode(struct inode *inode, + struct reiserfs_iget_args *args); +int reiserfs_find_actor(struct inode *inode, void *p); +int reiserfs_init_locked_inode(struct inode *inode, void *p); +void reiserfs_evict_inode(struct inode *inode); +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc); +int reiserfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create); +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, + struct inode *parent); + +int reiserfs_truncate_file(struct inode *, int update_timestamps); +void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset, + int type, int key_length); +void make_le_item_head(struct item_head *ih, const struct cpu_key *key, + int version, + loff_t offset, int type, int length, int entry_count); +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key); + +struct reiserfs_security_handle; +int reiserfs_new_inode(struct reiserfs_transaction_handle *th, + struct inode *dir, umode_t mode, + const char *symname, loff_t i_size, + struct dentry *dentry, struct inode *inode, + struct reiserfs_security_handle *security); + +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t size); + +static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + reiserfs_update_sd_size(th, inode, inode->i_size); +} + +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); +void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); + +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); + +/* namei.c */ +void set_de_name_and_namelen(struct reiserfs_dir_entry *de); +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *path, struct reiserfs_dir_entry *de); +struct dentry *reiserfs_get_parent(struct dentry *); + +#ifdef CONFIG_REISERFS_PROC_INFO +int reiserfs_proc_info_init(struct super_block *sb); +int reiserfs_proc_info_done(struct super_block *sb); +int reiserfs_proc_info_global_init(void); +int reiserfs_proc_info_global_done(void); + +#define PROC_EXP( e ) e + +#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data +#define PROC_INFO_MAX( sb, field, value ) \ + __PINFO( sb ).field = \ + max( REISERFS_SB( sb ) -> s_proc_info_data.field, value ) +#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) ) +#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) ) +#define PROC_INFO_BH_STAT( sb, bh, level ) \ + PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] ); \ + PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) ); \ + PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) ) +#else +static inline int reiserfs_proc_info_init(struct super_block *sb) +{ + return 0; +} + +static inline int reiserfs_proc_info_done(struct super_block *sb) +{ + return 0; +} + +static inline int reiserfs_proc_info_global_init(void) +{ + return 0; +} + +static inline int reiserfs_proc_info_global_done(void) +{ + return 0; +} + +#define PROC_EXP( e ) +#define VOID_V ( ( void ) 0 ) +#define PROC_INFO_MAX( sb, field, value ) VOID_V +#define PROC_INFO_INC( sb, field ) VOID_V +#define PROC_INFO_ADD( sb, field, val ) VOID_V +#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V +#endif + +/* dir.c */ +extern const struct inode_operations reiserfs_dir_inode_operations; +extern const struct inode_operations reiserfs_symlink_inode_operations; +extern const struct inode_operations reiserfs_special_inode_operations; +extern const struct file_operations reiserfs_dir_operations; +int reiserfs_readdir_inode(struct inode *, struct dir_context *); + +/* tail_conversion.c */ +int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, + struct treepath *, struct buffer_head *, loff_t); +int indirect2direct(struct reiserfs_transaction_handle *, struct inode *, + struct page *, struct treepath *, const struct cpu_key *, + loff_t, char *); +void reiserfs_unmap_buffer(struct buffer_head *); + +/* file.c */ +extern const struct inode_operations reiserfs_file_inode_operations; +extern const struct file_operations reiserfs_file_operations; +extern const struct address_space_operations reiserfs_address_space_operations; + +/* fix_nodes.c */ + +int fix_nodes(int n_op_mode, struct tree_balance *tb, + struct item_head *ins_ih, const void *); +void unfix_nodes(struct tree_balance *); + +/* prints.c */ +void __reiserfs_panic(struct super_block *s, const char *id, + const char *function, const char *fmt, ...) + __attribute__ ((noreturn)); +#define reiserfs_panic(s, id, fmt, args...) \ + __reiserfs_panic(s, id, __func__, fmt, ##args) +void __reiserfs_error(struct super_block *s, const char *id, + const char *function, const char *fmt, ...); +#define reiserfs_error(s, id, fmt, args...) \ + __reiserfs_error(s, id, __func__, fmt, ##args) +void reiserfs_info(struct super_block *s, const char *fmt, ...); +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...); +void print_indirect_item(struct buffer_head *bh, int item_num); +void store_print_tb(struct tree_balance *tb); +void print_cur_tb(char *mes); +void print_de(struct reiserfs_dir_entry *de); +void print_bi(struct buffer_info *bi, char *mes); +#define PRINT_LEAF_ITEMS 1 /* print all items */ +#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */ +#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */ +void print_block(struct buffer_head *bh, ...); +void print_bmap(struct super_block *s, int silent); +void print_bmap_block(int i, char *data, int size, int silent); +/*void print_super_block (struct super_block * s, char * mes);*/ +void print_objectid_map(struct super_block *s); +void print_block_head(struct buffer_head *bh, char *mes); +void check_leaf(struct buffer_head *bh); +void check_internal(struct buffer_head *bh); +void print_statistics(struct super_block *s); +char *reiserfs_hashname(int code); + +/* lbalance.c */ +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, + int mov_bytes, struct buffer_head *Snew); +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes); +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes); +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, + int del_num, int del_bytes); +void leaf_insert_into_buf(struct buffer_info *bi, int before, + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, + int zeros_number); +void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, + int pos_in_item, int paste_size, + const char * const body, int zeros_number); +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, + int pos_in_item, int cut_size); +void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, + int new_entry_count, struct reiserfs_de_head *new_dehs, + const char *records, int paste_size); +/* ibalance.c */ +int balance_internal(struct tree_balance *, int, int, struct item_head *, + struct buffer_head **); + +/* do_balance.c */ +void do_balance_mark_leaf_dirty(struct tree_balance *tb, + struct buffer_head *bh, int flag); +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + +void do_balance(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag); +void reiserfs_invalidate_buffer(struct tree_balance *tb, + struct buffer_head *bh); + +int get_left_neighbor_position(struct tree_balance *tb, int h); +int get_right_neighbor_position(struct tree_balance *tb, int h); +void replace_key(struct tree_balance *tb, struct buffer_head *, int, + struct buffer_head *, int); +void make_empty_node(struct buffer_info *); +struct buffer_head *get_FEB(struct tree_balance *); + +/* bitmap.c */ + +/* + * structure contains hints for block allocator, and it is a container for + * arguments, such as node, search path, transaction_handle, etc. + */ +struct __reiserfs_blocknr_hint { + /* inode passed to allocator, if we allocate unf. nodes */ + struct inode *inode; + + sector_t block; /* file offset, in blocks */ + struct in_core_key key; + + /* + * search path, used by allocator to deternine search_start by + * various ways + */ + struct treepath *path; + + /* + * transaction handle is needed to log super blocks + * and bitmap blocks changes + */ + struct reiserfs_transaction_handle *th; + + b_blocknr_t beg, end; + + /* + * a field used to transfer search start value (block number) + * between different block allocator procedures + * (determine_search_start() and others) + */ + b_blocknr_t search_start; + + /* + * is set in determine_prealloc_size() function, + * used by underlayed function that do actual allocation + */ + int prealloc_size; + + /* + * the allocator uses different polices for getting disk + * space for formatted/unformatted blocks with/without preallocation + */ + unsigned formatted_node:1; + unsigned preallocate:1; +}; + +typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t; + +int reiserfs_parse_alloc_options(struct super_block *, char *); +void reiserfs_init_alloc_options(struct super_block *s); + +/* + * given a directory, this will tell you what packing locality + * to use for a new object underneat it. The locality is returned + * in disk byte order (le). + */ +__le32 reiserfs_choose_packing(struct inode *dir); + +void show_alloc_options(struct seq_file *seq, struct super_block *s); +int reiserfs_init_bitmap_cache(struct super_block *sb); +void reiserfs_free_bitmap_cache(struct super_block *sb); +void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap); +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); +void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *, + b_blocknr_t, int for_unformatted); +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int, + int); +static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb, + b_blocknr_t * new_blocknrs, + int amount_needed) +{ + reiserfs_blocknr_hint_t hint = { + .th = tb->transaction_handle, + .path = tb->tb_path, + .inode = NULL, + .key = tb->key, + .block = 0, + .formatted_node = 1 + }; + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed, + 0); +} + +static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle + *th, struct inode *inode, + b_blocknr_t * new_blocknrs, + struct treepath *path, + sector_t block) +{ + reiserfs_blocknr_hint_t hint = { + .th = th, + .path = path, + .inode = inode, + .block = block, + .formatted_node = 0, + .preallocate = 0 + }; + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0); +} + +#ifdef REISERFS_PREALLOCATE +static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle + *th, struct inode *inode, + b_blocknr_t * new_blocknrs, + struct treepath *path, + sector_t block) +{ + reiserfs_blocknr_hint_t hint = { + .th = th, + .path = path, + .inode = inode, + .block = block, + .formatted_node = 0, + .preallocate = 1 + }; + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0); +} + +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, + struct inode *inode); +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th); +#endif + +/* hashes.c */ +__u32 keyed_hash(const signed char *msg, int len); +__u32 yura_hash(const signed char *msg, int len); +__u32 r5_hash(const signed char *msg, int len); + +#define reiserfs_set_le_bit __set_bit_le +#define reiserfs_test_and_set_le_bit __test_and_set_bit_le +#define reiserfs_clear_le_bit __clear_bit_le +#define reiserfs_test_and_clear_le_bit __test_and_clear_bit_le +#define reiserfs_test_le_bit test_bit_le +#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le + +/* + * sometimes reiserfs_truncate may require to allocate few new blocks + * to perform indirect2direct conversion. People probably used to + * think, that truncate should work without problems on a filesystem + * without free disk space. They may complain that they can not + * truncate due to lack of free disk space. This spare space allows us + * to not worry about it. 500 is probably too much, but it should be + * absolutely safe + */ +#define SPARE_SPACE 500 + +/* prototypes from ioctl.c */ +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long reiserfs_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg); +int reiserfs_unpack(struct inode *inode, struct file *filp); diff --git a/kernel/fs/reiserfs/resize.c b/kernel/fs/reiserfs/resize.c new file mode 100644 index 000000000..6052d323b --- /dev/null +++ b/kernel/fs/reiserfs/resize.c @@ -0,0 +1,229 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Alexander Zarochentcev. + * + * The kernel part of the (on-line) reiserfs resizer. + */ + +#include +#include +#include +#include +#include +#include "reiserfs.h" +#include + +int reiserfs_resize(struct super_block *s, unsigned long block_count_new) +{ + int err = 0; + struct reiserfs_super_block *sb; + struct reiserfs_bitmap_info *bitmap; + struct reiserfs_bitmap_info *info; + struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s); + struct buffer_head *bh; + struct reiserfs_transaction_handle th; + unsigned int bmap_nr_new, bmap_nr; + unsigned int block_r_new, block_r; + + struct reiserfs_list_bitmap *jb; + struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; + + unsigned long int block_count, free_blocks; + int i; + int copy_size; + int depth; + + sb = SB_DISK_SUPER_BLOCK(s); + + if (SB_BLOCK_COUNT(s) >= block_count_new) { + printk("can\'t shrink filesystem on-line\n"); + return -EINVAL; + } + + /* check the device size */ + depth = reiserfs_write_unlock_nested(s); + bh = sb_bread(s, block_count_new - 1); + reiserfs_write_lock_nested(s, depth); + if (!bh) { + printk("reiserfs_resize: can\'t read last block\n"); + return -EINVAL; + } + bforget(bh); + + /* + * old disk layout detection; those partitions can be mounted, but + * cannot be resized + */ + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size + != REISERFS_DISK_OFFSET_IN_BYTES) { + printk + ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); + return -ENOTSUPP; + } + + /* count used bits in last bitmap block */ + block_r = SB_BLOCK_COUNT(s) - + (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8; + + /* count bitmap blocks in new fs */ + bmap_nr_new = block_count_new / (s->s_blocksize * 8); + block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; + if (block_r_new) + bmap_nr_new++; + else + block_r_new = s->s_blocksize * 8; + + /* save old values */ + block_count = SB_BLOCK_COUNT(s); + bmap_nr = reiserfs_bmap_count(s); + + /* resizing of reiserfs bitmaps (journal and real), if needed */ + if (bmap_nr_new > bmap_nr) { + /* reallocate journal bitmaps */ + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { + printk + ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); + return -ENOMEM; + } + /* + * the new journal bitmaps are zero filled, now we copy i + * the bitmap node pointers from the old journal bitmap + * structs, and then transfer the new data structures + * into the journal struct. + * + * using the copy_size var below allows this code to work for + * both shrinking and expanding the FS. + */ + copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr; + copy_size = + copy_size * sizeof(struct reiserfs_list_bitmap_node *); + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + struct reiserfs_bitmap_node **node_tmp; + jb = SB_JOURNAL(s)->j_list_bitmap + i; + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); + + /* + * just in case vfree schedules on us, copy the new + * pointer into the journal struct before freeing the + * old one + */ + node_tmp = jb->bitmaps; + jb->bitmaps = jbitmap[i].bitmaps; + vfree(node_tmp); + } + + /* + * allocate additional bitmap blocks, reallocate + * array of bitmap block pointers + */ + bitmap = + vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + if (!bitmap) { + /* + * Journal bitmaps are still supersized, but the + * memory isn't leaked, so I guess it's ok + */ + printk("reiserfs_resize: unable to allocate memory.\n"); + return -ENOMEM; + } + for (i = 0; i < bmap_nr; i++) + bitmap[i] = old_bitmap[i]; + + /* + * This doesn't go through the journal, but it doesn't have to. + * The changes are still atomic: We're synced up when the + * journal transaction begins, and the new bitmaps don't + * matter if the transaction fails. + */ + for (i = bmap_nr; i < bmap_nr_new; i++) { + int depth; + /* + * don't use read_bitmap_block since it will cache + * the uninitialized bitmap + */ + depth = reiserfs_write_unlock_nested(s); + bh = sb_bread(s, i * s->s_blocksize * 8); + reiserfs_write_lock_nested(s, depth); + if (!bh) { + vfree(bitmap); + return -EIO; + } + memset(bh->b_data, 0, sb_blocksize(sb)); + reiserfs_set_le_bit(0, bh->b_data); + reiserfs_cache_bitmap_metadata(s, bh, bitmap + i); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + depth = reiserfs_write_unlock_nested(s); + sync_dirty_buffer(bh); + reiserfs_write_lock_nested(s, depth); + /* update bitmap_info stuff */ + bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; + brelse(bh); + } + /* free old bitmap blocks array */ + SB_AP_BITMAP(s) = bitmap; + vfree(old_bitmap); + } + + /* + * begin transaction, if there was an error, it's fine. Yes, we have + * incorrect bitmaps now, but none of it is ever going to touch the + * disk anyway. + */ + err = journal_begin(&th, s, 10); + if (err) + return err; + + /* Extend old last bitmap block - new blocks have been made available */ + info = SB_AP_BITMAP(s) + bmap_nr - 1; + bh = reiserfs_read_bitmap_block(s, bmap_nr - 1); + if (!bh) { + int jerr = journal_end(&th); + if (jerr) + return jerr; + return -EIO; + } + + reiserfs_prepare_for_journal(s, bh, 1); + for (i = block_r; i < s->s_blocksize * 8; i++) + reiserfs_clear_le_bit(i, bh->b_data); + info->free_count += s->s_blocksize * 8 - block_r; + + journal_mark_dirty(&th, bh); + brelse(bh); + + /* Correct new last bitmap block - It may not be full */ + info = SB_AP_BITMAP(s) + bmap_nr_new - 1; + bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1); + if (!bh) { + int jerr = journal_end(&th); + if (jerr) + return jerr; + return -EIO; + } + + reiserfs_prepare_for_journal(s, bh, 1); + for (i = block_r_new; i < s->s_blocksize * 8; i++) + reiserfs_set_le_bit(i, bh->b_data); + journal_mark_dirty(&th, bh); + brelse(bh); + + info->free_count -= s->s_blocksize * 8 - block_r_new; + /* update super */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + free_blocks = SB_FREE_BLOCKS(s); + PUT_SB_FREE_BLOCKS(s, + free_blocks + (block_count_new - block_count - + (bmap_nr_new - bmap_nr))); + PUT_SB_BLOCK_COUNT(s, block_count_new); + PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); + + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + + SB_JOURNAL(s)->j_must_wait = 1; + return journal_end(&th); +} diff --git a/kernel/fs/reiserfs/stree.c b/kernel/fs/reiserfs/stree.c new file mode 100644 index 000000000..24cbe0132 --- /dev/null +++ b/kernel/fs/reiserfs/stree.c @@ -0,0 +1,2262 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Anatoly P. Pinchuk pap@namesys.botik.ru + * Programm System Institute + * Pereslavl-Zalessky Russia + */ + +#include +#include +#include +#include "reiserfs.h" +#include +#include + +/* Does the buffer contain a disk block which is in the tree. */ +inline int B_IS_IN_TREE(const struct buffer_head *bh) +{ + + RFALSE(B_LEVEL(bh) > MAX_HEIGHT, + "PAP-1010: block (%b) has too big level (%z)", bh, bh); + + return (B_LEVEL(bh) != FREE_LEVEL); +} + +/* to get item head in le form */ +inline void copy_item_head(struct item_head *to, + const struct item_head *from) +{ + memcpy(to, from, IH_SIZE); +} + +/* + * k1 is pointer to on-disk structure which is stored in little-endian + * form. k2 is pointer to cpu variable. For key of items of the same + * object this returns 0. + * Returns: -1 if key1 < key2 + * 0 if key1 == key2 + * 1 if key1 > key2 + */ +inline int comp_short_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) +{ + __u32 n; + n = le32_to_cpu(le_key->k_dir_id); + if (n < cpu_key->on_disk_key.k_dir_id) + return -1; + if (n > cpu_key->on_disk_key.k_dir_id) + return 1; + n = le32_to_cpu(le_key->k_objectid); + if (n < cpu_key->on_disk_key.k_objectid) + return -1; + if (n > cpu_key->on_disk_key.k_objectid) + return 1; + return 0; +} + +/* + * k1 is pointer to on-disk structure which is stored in little-endian + * form. k2 is pointer to cpu variable. + * Compare keys using all 4 key fields. + * Returns: -1 if key1 < key2 0 + * if key1 = key2 1 if key1 > key2 + */ +static inline int comp_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) +{ + int retval; + + retval = comp_short_keys(le_key, cpu_key); + if (retval) + return retval; + if (le_key_k_offset(le_key_version(le_key), le_key) < + cpu_key_k_offset(cpu_key)) + return -1; + if (le_key_k_offset(le_key_version(le_key), le_key) > + cpu_key_k_offset(cpu_key)) + return 1; + + if (cpu_key->key_length == 3) + return 0; + + /* this part is needed only when tail conversion is in progress */ + if (le_key_k_type(le_key_version(le_key), le_key) < + cpu_key_k_type(cpu_key)) + return -1; + + if (le_key_k_type(le_key_version(le_key), le_key) > + cpu_key_k_type(cpu_key)) + return 1; + + return 0; +} + +inline int comp_short_le_keys(const struct reiserfs_key *key1, + const struct reiserfs_key *key2) +{ + __u32 *k1_u32, *k2_u32; + int key_length = REISERFS_SHORT_KEY_LEN; + + k1_u32 = (__u32 *) key1; + k2_u32 = (__u32 *) key2; + for (; key_length--; ++k1_u32, ++k2_u32) { + if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32)) + return -1; + if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32)) + return 1; + } + return 0; +} + +inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from) +{ + int version; + to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id); + to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid); + + /* find out version of the key */ + version = le_key_version(from); + to->version = version; + to->on_disk_key.k_offset = le_key_k_offset(version, from); + to->on_disk_key.k_type = le_key_k_type(version, from); +} + +/* + * this does not say which one is bigger, it only returns 1 if keys + * are not equal, 0 otherwise + */ +inline int comp_le_keys(const struct reiserfs_key *k1, + const struct reiserfs_key *k2) +{ + return memcmp(k1, k2, sizeof(struct reiserfs_key)); +} + +/************************************************************************** + * Binary search toolkit function * + * Search for an item in the array by the item key * + * Returns: 1 if found, 0 if not found; * + * *pos = number of the searched element if found, else the * + * number of the first element that is larger than key. * + **************************************************************************/ +/* + * For those not familiar with binary search: lbound is the leftmost item + * that it could be, rbound the rightmost item that it could be. We examine + * the item halfway between lbound and rbound, and that tells us either + * that we can increase lbound, or decrease rbound, or that we have found it, + * or if lbound <= rbound that there are no possible items, and we have not + * found it. With each examination we cut the number of possible items it + * could be by one more than half rounded down, or we find it. + */ +static inline int bin_search(const void *key, /* Key to search for. */ + const void *base, /* First item in the array. */ + int num, /* Number of items in the array. */ + /* + * Item size in the array. searched. Lest the + * reader be confused, note that this is crafted + * as a general function, and when it is applied + * specifically to the array of item headers in a + * node, width is actually the item header size + * not the item size. + */ + int width, + int *pos /* Number of the searched for element. */ + ) +{ + int rbound, lbound, j; + + for (j = ((rbound = num - 1) + (lbound = 0)) / 2; + lbound <= rbound; j = (rbound + lbound) / 2) + switch (comp_keys + ((struct reiserfs_key *)((char *)base + j * width), + (struct cpu_key *)key)) { + case -1: + lbound = j + 1; + continue; + case 1: + rbound = j - 1; + continue; + case 0: + *pos = j; + return ITEM_FOUND; /* Key found in the array. */ + } + + /* + * bin_search did not find given key, it returns position of key, + * that is minimal and greater than the given one. + */ + *pos = lbound; + return ITEM_NOT_FOUND; +} + + +/* Minimal possible key. It is never in the tree. */ +const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; + +/* Maximal possible key. It is never in the tree. */ +static const struct reiserfs_key MAX_KEY = { + cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff), + {{cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff)},} +}; + +/* + * Get delimiting key of the buffer by looking for it in the buffers in the + * path, starting from the bottom of the path, and going upwards. We must + * check the path's validity at each step. If the key is not in the path, + * there is no delimiting key in the tree (buffer is first or last buffer + * in tree), and in this case we return a special key, either MIN_KEY or + * MAX_KEY. + */ +static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path, + const struct super_block *sb) +{ + int position, path_offset = chk_path->path_length; + struct buffer_head *parent; + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5010: invalid offset in the path"); + + /* While not higher in path than first element. */ + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(chk_path, path_offset)), + "PAP-5020: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (parent = + PATH_OFFSET_PBUFFER(chk_path, path_offset))) + return &MAX_KEY; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(chk_path, + path_offset)) > + B_NR_ITEMS(parent)) + return &MAX_KEY; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(chk_path, + path_offset + 1)->b_blocknr) + return &MAX_KEY; + /* + * Return delimiting key if position in the parent + * is not equal to zero. + */ + if (position) + return internal_key(parent, position - 1); + } + /* Return MIN_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(sb)) + return &MIN_KEY; + return &MAX_KEY; +} + +/* Get delimiting key of the buffer at the path and its right neighbor. */ +inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, + const struct super_block *sb) +{ + int position, path_offset = chk_path->path_length; + struct buffer_head *parent; + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5030: invalid offset in the path"); + + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(chk_path, path_offset)), + "PAP-5040: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (parent = + PATH_OFFSET_PBUFFER(chk_path, path_offset))) + return &MIN_KEY; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(chk_path, + path_offset)) > + B_NR_ITEMS(parent)) + return &MIN_KEY; + /* + * Check whether parent at the path really points + * to the child. + */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(chk_path, + path_offset + 1)->b_blocknr) + return &MIN_KEY; + + /* + * Return delimiting key if position in the parent + * is not the last one. + */ + if (position != B_NR_ITEMS(parent)) + return internal_key(parent, position); + } + + /* Return MAX_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(sb)) + return &MAX_KEY; + return &MIN_KEY; +} + +/* + * Check whether a key is contained in the tree rooted from a buffer at a path. + * This works by looking at the left and right delimiting keys for the buffer + * in the last path_element in the path. These delimiting keys are stored + * at least one level above that buffer in the tree. If the buffer is the + * first or last node in the tree order then one of the delimiting keys may + * be absent, and in this case get_lkey and get_rkey return a special key + * which is MIN_KEY or MAX_KEY. + */ +static inline int key_in_buffer( + /* Path which should be checked. */ + struct treepath *chk_path, + /* Key which should be checked. */ + const struct cpu_key *key, + struct super_block *sb + ) +{ + + RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET + || chk_path->path_length > MAX_HEIGHT, + "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", + key, chk_path->path_length); + RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev, + "PAP-5060: device must not be NODEV"); + + if (comp_keys(get_lkey(chk_path, sb), key) == 1) + /* left delimiting key is bigger, that the key we look for */ + return 0; + /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */ + if (comp_keys(get_rkey(chk_path, sb), key) != 1) + /* key must be less than right delimitiing key */ + return 0; + return 1; +} + +int reiserfs_check_path(struct treepath *p) +{ + RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, + "path not properly relsed"); + return 0; +} + +/* + * Drop the reference to each buffer in a path and restore + * dirty bits clean when preparing the buffer for the log. + * This version should only be called from fix_nodes() + */ +void pathrelse_and_restore(struct super_block *sb, + struct treepath *search_path) +{ + int path_offset = search_path->path_length; + + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "clm-4000: invalid path offset"); + + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { + struct buffer_head *bh; + bh = PATH_OFFSET_PBUFFER(search_path, path_offset--); + reiserfs_restore_prepared_buffer(sb, bh); + brelse(bh); + } + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +/* Drop the reference to each buffer in a path */ +void pathrelse(struct treepath *search_path) +{ + int path_offset = search_path->path_length; + + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "PAP-5090: invalid path offset"); + + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) + brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--)); + + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + struct item_head *ih; + int used_space; + int prev_location; + int i; + int nr; + + blkh = (struct block_head *)buf; + if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { + reiserfs_warning(NULL, "reiserfs-5080", + "this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { + /* item number is too big or too small */ + reiserfs_warning(NULL, "reiserfs-5081", + "nr_item seems wrong: %z", bh); + return 0; + } + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); + + /* free space does not match to calculated amount of use space */ + if (used_space != blocksize - blkh_free_space(blkh)) { + reiserfs_warning(NULL, "reiserfs-5082", + "free space seems wrong: %z", bh); + return 0; + } + /* + * FIXME: it is_leaf will hit performance too much - we may have + * return 1 here + */ + + /* check tables of item heads */ + ih = (struct item_head *)(buf + BLKH_SIZE); + prev_location = blocksize; + for (i = 0; i < nr; i++, ih++) { + if (le_ih_k_type(ih) == TYPE_ANY) { + reiserfs_warning(NULL, "reiserfs-5083", + "wrong item type for item %h", + ih); + return 0; + } + if (ih_location(ih) >= blocksize + || ih_location(ih) < IH_SIZE * nr) { + reiserfs_warning(NULL, "reiserfs-5084", + "item location seems wrong: %h", + ih); + return 0; + } + if (ih_item_len(ih) < 1 + || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) { + reiserfs_warning(NULL, "reiserfs-5085", + "item length seems wrong: %h", + ih); + return 0; + } + if (prev_location - ih_location(ih) != ih_item_len(ih)) { + reiserfs_warning(NULL, "reiserfs-5086", + "item location seems wrong " + "(second one): %h", ih); + return 0; + } + prev_location = ih_location(ih); + } + + /* one may imagine many more checks */ + return 1; +} + +/* returns 1 if buf looks like an internal node, 0 otherwise */ +static int is_internal(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + int nr; + int used_space; + + blkh = (struct block_head *)buf; + nr = blkh_level(blkh); + if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { + /* this level is not possible for internal nodes */ + reiserfs_warning(NULL, "reiserfs-5087", + "this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + /* for internal which is not root we might check min number of keys */ + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { + reiserfs_warning(NULL, "reiserfs-5088", + "number of key seems wrong: %z", bh); + return 0; + } + + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); + if (used_space != blocksize - blkh_free_space(blkh)) { + reiserfs_warning(NULL, "reiserfs-5089", + "free space seems wrong: %z", bh); + return 0; + } + + /* one may imagine many more checks */ + return 1; +} + +/* + * make sure that bh contains formatted node of reiserfs tree of + * 'level'-th level + */ +static int is_tree_node(struct buffer_head *bh, int level) +{ + if (B_LEVEL(bh) != level) { + reiserfs_warning(NULL, "reiserfs-5090", "node level %d does " + "not match to the expected one %d", + B_LEVEL(bh), level); + return 0; + } + if (level == DISK_LEAF_NODE_LEVEL) + return is_leaf(bh->b_data, bh->b_size, bh); + + return is_internal(bh->b_data, bh->b_size, bh); +} + +#define SEARCH_BY_KEY_READA 16 + +/* + * The function is NOT SCHEDULE-SAFE! + * It might unlock the write lock if we needed to wait for a block + * to be read. Note that in this case it won't recover the lock to avoid + * high contention resulting from too much lock requests, especially + * the caller (search_by_key) will perform other schedule-unsafe + * operations just after calling this function. + * + * @return depth of lock to be restored after read completes + */ +static int search_by_key_reada(struct super_block *s, + struct buffer_head **bh, + b_blocknr_t *b, int num) +{ + int i, j; + int depth = -1; + + for (i = 0; i < num; i++) { + bh[i] = sb_getblk(s, b[i]); + } + /* + * We are going to read some blocks on which we + * have a reference. It's safe, though we might be + * reading blocks concurrently changed if we release + * the lock. But it's still fine because we check later + * if the tree changed + */ + for (j = 0; j < i; j++) { + /* + * note, this needs attention if we are getting rid of the BKL + * you have to make sure the prepared bit isn't set on this + * buffer + */ + if (!buffer_uptodate(bh[j])) { + if (depth == -1) + depth = reiserfs_write_unlock_nested(s); + ll_rw_block(READA, 1, bh + j); + } + brelse(bh[j]); + } + return depth; +} + +/* + * This function fills up the path from the root to the leaf as it + * descends the tree looking for the key. It uses reiserfs_bread to + * try to find buffers in the cache given their block number. If it + * does not find them in the cache it reads them from disk. For each + * node search_by_key finds using reiserfs_bread it then uses + * bin_search to look through that node. bin_search will find the + * position of the block_number of the next node if it is looking + * through an internal node. If it is looking through a leaf node + * bin_search will find the position of the item which has key either + * equal to given key, or which is the maximal key less than the given + * key. search_by_key returns a path that must be checked for the + * correctness of the top of the path but need not be checked for the + * correctness of the bottom of the path + */ +/* + * search_by_key - search for key (and item) in stree + * @sb: superblock + * @key: pointer to key to search for + * @search_path: Allocated and initialized struct treepath; Returned filled + * on success. + * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to + * stop at leaf level. + * + * The function is NOT SCHEDULE-SAFE! + */ +int search_by_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *search_path, int stop_level) +{ + b_blocknr_t block_number; + int expected_level; + struct buffer_head *bh; + struct path_element *last_element; + int node_level, retval; + int right_neighbor_of_leaf_node; + int fs_gen; + struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; + b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; + int reada_count = 0; + +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + + PROC_INFO_INC(sb, search_by_key); + + /* + * As we add each node to a path we increase its count. This means + * that we must be careful to release all nodes in a path before we + * either discard the path struct or re-use the path struct, as we + * do here. + */ + + pathrelse(search_path); + + right_neighbor_of_leaf_node = 0; + + /* + * With each iteration of this loop we search through the items in the + * current node, and calculate the next current node(next path element) + * for the next iteration of this loop.. + */ + block_number = SB_ROOT_BLOCK(sb); + expected_level = -1; + while (1) { + +#ifdef CONFIG_REISERFS_CHECK + if (!(++repeat_counter % 50000)) + reiserfs_warning(sb, "PAP-5100", + "%s: there were %d iterations of " + "while loop looking for key %K", + current->comm, repeat_counter, + key); +#endif + + /* prep path to have another element added to it. */ + last_element = + PATH_OFFSET_PELEMENT(search_path, + ++search_path->path_length); + fs_gen = get_generation(sb); + + /* + * Read the next tree node, and set the last element + * in the path to have a pointer to it. + */ + if ((bh = last_element->pe_buffer = + sb_getblk(sb, block_number))) { + + /* + * We'll need to drop the lock if we encounter any + * buffers that need to be read. If all of them are + * already up to date, we don't need to drop the lock. + */ + int depth = -1; + + if (!buffer_uptodate(bh) && reada_count > 1) + depth = search_by_key_reada(sb, reada_bh, + reada_blocks, reada_count); + + if (!buffer_uptodate(bh) && depth == -1) + depth = reiserfs_write_unlock_nested(sb); + + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + + if (depth != -1) + reiserfs_write_lock_nested(sb, depth); + if (!buffer_uptodate(bh)) + goto io_error; + } else { +io_error: + search_path->path_length--; + pathrelse(search_path); + return IO_ERROR; + } + reada_count = 0; + if (expected_level == -1) + expected_level = SB_TREE_HEIGHT(sb); + expected_level--; + + /* + * It is possible that schedule occurred. We must check + * whether the key to search is still in the tree rooted + * from the current buffer. If not then repeat search + * from the root. + */ + if (fs_changed(fs_gen, sb) && + (!B_IS_IN_TREE(bh) || + B_LEVEL(bh) != expected_level || + !key_in_buffer(search_path, key, sb))) { + PROC_INFO_INC(sb, search_by_key_fs_changed); + PROC_INFO_INC(sb, search_by_key_restarted); + PROC_INFO_INC(sb, + sbk_restarted[expected_level - 1]); + pathrelse(search_path); + + /* + * Get the root block number so that we can + * repeat the search starting from the root. + */ + block_number = SB_ROOT_BLOCK(sb); + expected_level = -1; + right_neighbor_of_leaf_node = 0; + + /* repeat search from the root */ + continue; + } + + /* + * only check that the key is in the buffer if key is not + * equal to the MAX_KEY. Latter case is only possible in + * "finish_unfinished()" processing during mount. + */ + RFALSE(comp_keys(&MAX_KEY, key) && + !key_in_buffer(search_path, key, sb), + "PAP-5130: key is not in the buffer"); +#ifdef CONFIG_REISERFS_CHECK + if (REISERFS_SB(sb)->cur_tb) { + print_cur_tb("5140"); + reiserfs_panic(sb, "PAP-5140", + "schedule occurred in do_balance!"); + } +#endif + + /* + * make sure, that the node contents look like a node of + * certain level + */ + if (!is_tree_node(bh, expected_level)) { + reiserfs_error(sb, "vs-5150", + "invalid format found in block %ld. " + "Fsck?", bh->b_blocknr); + pathrelse(search_path); + return IO_ERROR; + } + + /* ok, we have acquired next formatted node in the tree */ + node_level = B_LEVEL(bh); + + PROC_INFO_BH_STAT(sb, bh, node_level - 1); + + RFALSE(node_level < stop_level, + "vs-5152: tree level (%d) is less than stop level (%d)", + node_level, stop_level); + + retval = bin_search(key, item_head(bh, 0), + B_NR_ITEMS(bh), + (node_level == + DISK_LEAF_NODE_LEVEL) ? IH_SIZE : + KEY_SIZE, + &last_element->pe_position); + if (node_level == stop_level) { + return retval; + } + + /* we are not in the stop level */ + /* + * item has been found, so we choose the pointer which + * is to the right of the found one + */ + if (retval == ITEM_FOUND) + last_element->pe_position++; + + /* + * if item was not found we choose the position which is to + * the left of the found item. This requires no code, + * bin_search did it already. + */ + + /* + * So we have chosen a position in the current node which is + * an internal node. Now we calculate child block number by + * position in the node. + */ + block_number = + B_N_CHILD_NUM(bh, last_element->pe_position); + + /* + * if we are going to read leaf nodes, try for read + * ahead as well + */ + if ((search_path->reada & PATH_READA) && + node_level == DISK_LEAF_NODE_LEVEL + 1) { + int pos = last_element->pe_position; + int limit = B_NR_ITEMS(bh); + struct reiserfs_key *le_key; + + if (search_path->reada & PATH_READA_BACK) + limit = 0; + while (reada_count < SEARCH_BY_KEY_READA) { + if (pos == limit) + break; + reada_blocks[reada_count++] = + B_N_CHILD_NUM(bh, pos); + if (search_path->reada & PATH_READA_BACK) + pos--; + else + pos++; + + /* + * check to make sure we're in the same object + */ + le_key = internal_key(bh, pos); + if (le32_to_cpu(le_key->k_objectid) != + key->on_disk_key.k_objectid) { + break; + } + } + } + } +} + +/* + * Form the path to an item and position in this item which contains + * file byte defined by key. If there is no such item + * corresponding to the key, we point the path to the item with + * maximal key less than key, and *pos_in_item is set to one + * past the last entry/byte in the item. If searching for entry in a + * directory item, and it is not found, *pos_in_item is set to one + * entry more than the entry with maximal key which is less than the + * sought key. + * + * Note that if there is no entry in this same node which is one more, + * then we point to an imaginary entry. for direct items, the + * position is in units of bytes, for indirect items the position is + * in units of blocknr entries, for directory items the position is in + * units of directory entries. + */ +/* The function is NOT SCHEDULE-SAFE! */ +int search_for_position_by_key(struct super_block *sb, + /* Key to search (cpu variable) */ + const struct cpu_key *p_cpu_key, + /* Filled up by this function. */ + struct treepath *search_path) +{ + struct item_head *p_le_ih; /* pointer to on-disk structure */ + int blk_size; + loff_t item_offset, offset; + struct reiserfs_dir_entry de; + int retval; + + /* If searching for directory entry. */ + if (is_direntry_cpu_key(p_cpu_key)) + return search_by_entry_key(sb, p_cpu_key, search_path, + &de); + + /* If not searching for directory entry. */ + + /* If item is found. */ + retval = search_item(sb, p_cpu_key, search_path); + if (retval == IO_ERROR) + return retval; + if (retval == ITEM_FOUND) { + + RFALSE(!ih_item_len + (item_head + (PATH_PLAST_BUFFER(search_path), + PATH_LAST_POSITION(search_path))), + "PAP-5165: item length equals zero"); + + pos_in_item(search_path) = 0; + return POSITION_FOUND; + } + + RFALSE(!PATH_LAST_POSITION(search_path), + "PAP-5170: position equals zero"); + + /* Item is not found. Set path to the previous item. */ + p_le_ih = + item_head(PATH_PLAST_BUFFER(search_path), + --PATH_LAST_POSITION(search_path)); + blk_size = sb->s_blocksize; + + if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key)) + return FILE_NOT_FOUND; + + /* FIXME: quite ugly this far */ + + item_offset = le_ih_k_offset(p_le_ih); + offset = cpu_key_k_offset(p_cpu_key); + + /* Needed byte is contained in the item pointed to by the path. */ + if (item_offset <= offset && + item_offset + op_bytes_number(p_le_ih, blk_size) > offset) { + pos_in_item(search_path) = offset - item_offset; + if (is_indirect_le_ih(p_le_ih)) { + pos_in_item(search_path) /= blk_size; + } + return POSITION_FOUND; + } + + /* + * Needed byte is not contained in the item pointed to by the + * path. Set pos_in_item out of the item. + */ + if (is_indirect_le_ih(p_le_ih)) + pos_in_item(search_path) = + ih_item_len(p_le_ih) / UNFM_P_SIZE; + else + pos_in_item(search_path) = ih_item_len(p_le_ih); + + return POSITION_NOT_FOUND; +} + +/* Compare given item and item pointed to by the path. */ +int comp_items(const struct item_head *stored_ih, const struct treepath *path) +{ + struct buffer_head *bh = PATH_PLAST_BUFFER(path); + struct item_head *ih; + + /* Last buffer at the path is not in the tree. */ + if (!B_IS_IN_TREE(bh)) + return 1; + + /* Last path position is invalid. */ + if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh)) + return 1; + + /* we need only to know, whether it is the same item */ + ih = tp_item_head(path); + return memcmp(stored_ih, ih, IH_SIZE); +} + +/* unformatted nodes are not logged anymore, ever. This is safe now */ +#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) + +/* block can not be forgotten as it is in I/O or held by someone */ +#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) + +/* prepare for delete or cut of direct item */ +static inline int prepare_for_direct_item(struct treepath *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, int *cut_size) +{ + loff_t round_len; + + if (new_file_length == max_reiserfs_offset(inode)) { + /* item has to be deleted */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + /* new file gets truncated */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { + round_len = ROUND_UP(new_file_length); + /* this was new_file_length < le_ih ... */ + if (round_len < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. */ + pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1); + *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); + + return M_CUT; /* Cut from this item. */ + } + + /* old file: items may have any length */ + + if (new_file_length < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + + /* Calculate first position and size for cutting from item. */ + *cut_size = -(ih_item_len(le_ih) - + (pos_in_item(path) = + new_file_length + 1 - le_ih_k_offset(le_ih))); + return M_CUT; /* Cut from this item. */ +} + +static inline int prepare_for_direntry_item(struct treepath *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, + int *cut_size) +{ + if (le_ih_k_offset(le_ih) == DOT_OFFSET && + new_file_length == max_reiserfs_offset(inode)) { + RFALSE(ih_entry_count(le_ih) != 2, + "PAP-5220: incorrect empty directory item (%h)", le_ih); + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + /* Delete the directory item containing "." and ".." entry. */ + return M_DELETE; + } + + if (ih_entry_count(le_ih) == 1) { + /* + * Delete the directory item such as there is one record only + * in this item + */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + + /* Cut one record from the directory item. */ + *cut_size = + -(DEH_SIZE + + entry_length(get_last_bh(path), le_ih, pos_in_item(path))); + return M_CUT; +} + +#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1) + +/* + * If the path points to a directory or direct item, calculate mode + * and the size cut, for balance. + * If the path points to an indirect item, remove some number of its + * unformatted nodes. + * In case of file truncate calculate whether this item must be + * deleted/truncated or last unformatted node of this item will be + * converted to a direct item. + * This function returns a determination of what balance mode the + * calling function should employ. + */ +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct treepath *path, + const struct cpu_key *item_key, + /* + * Number of unformatted nodes + * which were removed from end + * of the file. + */ + int *removed, + int *cut_size, + /* MAX_KEY_OFFSET in case of delete. */ + unsigned long long new_file_length + ) +{ + struct super_block *sb = inode->i_sb; + struct item_head *p_le_ih = tp_item_head(path); + struct buffer_head *bh = PATH_PLAST_BUFFER(path); + + BUG_ON(!th->t_trans_id); + + /* Stat_data item. */ + if (is_statdata_le_ih(p_le_ih)) { + + RFALSE(new_file_length != max_reiserfs_offset(inode), + "PAP-5210: mode must be M_DELETE"); + + *cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); + return M_DELETE; + } + + /* Directory item. */ + if (is_direntry_le_ih(p_le_ih)) + return prepare_for_direntry_item(path, p_le_ih, inode, + new_file_length, + cut_size); + + /* Direct item. */ + if (is_direct_le_ih(p_le_ih)) + return prepare_for_direct_item(path, p_le_ih, inode, + new_file_length, cut_size); + + /* Case of an indirect item. */ + { + int blk_size = sb->s_blocksize; + struct item_head s_ih; + int need_re_search; + int delete = 0; + int result = M_CUT; + int pos = 0; + + if ( new_file_length == max_reiserfs_offset (inode) ) { + /* + * prepare_for_delete_or_cut() is called by + * reiserfs_delete_item() + */ + new_file_length = 0; + delete = 1; + } + + do { + need_re_search = 0; + *cut_size = 0; + bh = PATH_PLAST_BUFFER(path); + copy_item_head(&s_ih, tp_item_head(path)); + pos = I_UNFM_NUM(&s_ih); + + while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) { + __le32 *unfm; + __u32 block; + + /* + * Each unformatted block deletion may involve + * one additional bitmap block into the transaction, + * thereby the initial journal space reservation + * might not be enough. + */ + if (!delete && (*cut_size) != 0 && + reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) + break; + + unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1; + block = get_block_num(unfm, 0); + + if (block != 0) { + reiserfs_prepare_for_journal(sb, bh, 1); + put_block_num(unfm, 0, 0); + journal_mark_dirty(th, bh); + reiserfs_free_block(th, inode, block, 1); + } + + reiserfs_cond_resched(sb); + + if (item_moved (&s_ih, path)) { + need_re_search = 1; + break; + } + + pos --; + (*removed)++; + (*cut_size) -= UNFM_P_SIZE; + + if (pos == 0) { + (*cut_size) -= IH_SIZE; + result = M_DELETE; + break; + } + } + /* + * a trick. If the buffer has been logged, this will + * do nothing. If we've broken the loop without logging + * it, it will restore the buffer + */ + reiserfs_restore_prepared_buffer(sb, bh); + } while (need_re_search && + search_for_position_by_key(sb, item_key, path) == POSITION_FOUND); + pos_in_item(path) = pos * UNFM_P_SIZE; + + if (*cut_size == 0) { + /* + * Nothing was cut. maybe convert last unformatted node to the + * direct item? + */ + result = M_CONVERT; + } + return result; + } +} + +/* Calculate number of bytes which will be deleted or cut during balance */ +static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) +{ + int del_size; + struct item_head *p_le_ih = tp_item_head(tb->tb_path); + + if (is_statdata_le_ih(p_le_ih)) + return 0; + + del_size = + (mode == + M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0]; + if (is_direntry_le_ih(p_le_ih)) { + /* + * return EMPTY_DIR_SIZE; We delete emty directories only. + * we can't use EMPTY_DIR_SIZE, as old format dirs have a + * different empty size. ick. FIXME, is this right? + */ + return del_size; + } + + if (is_indirect_le_ih(p_le_ih)) + del_size = (del_size / UNFM_P_SIZE) * + (PATH_PLAST_BUFFER(tb->tb_path)->b_size); + return del_size; +} + +static void init_tb_struct(struct reiserfs_transaction_handle *th, + struct tree_balance *tb, + struct super_block *sb, + struct treepath *path, int size) +{ + + BUG_ON(!th->t_trans_id); + + memset(tb, '\0', sizeof(struct tree_balance)); + tb->transaction_handle = th; + tb->tb_sb = sb; + tb->tb_path = path; + PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; + PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; + tb->insert_size[0] = size; +} + +void padd_item(char *item, int total_length, int length) +{ + int i; + + for (i = total_length; i > length;) + item[--i] = 0; +} + +#ifdef REISERQUOTA_DEBUG +char key2type(struct reiserfs_key *ih) +{ + if (is_direntry_le_key(2, ih)) + return 'd'; + if (is_direct_le_key(2, ih)) + return 'D'; + if (is_indirect_le_key(2, ih)) + return 'i'; + if (is_statdata_le_key(2, ih)) + return 's'; + return 'u'; +} + +char head2type(struct item_head *ih) +{ + if (is_direntry_le_ih(ih)) + return 'd'; + if (is_direct_le_ih(ih)) + return 'D'; + if (is_indirect_le_ih(ih)) + return 'i'; + if (is_statdata_le_ih(ih)) + return 's'; + return 'u'; +} +#endif + +/* + * Delete object item. + * th - active transaction handle + * path - path to the deleted item + * item_key - key to search for the deleted item + * indode - used for updating i_blocks and quotas + * un_bh - NULL or unformatted node pointer + */ +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *item_key, + struct inode *inode, struct buffer_head *un_bh) +{ + struct super_block *sb = inode->i_sb; + struct tree_balance s_del_balance; + struct item_head s_ih; + struct item_head *q_ih; + int quota_cut_bytes; + int ret_value, del_size, removed; + int depth; + +#ifdef CONFIG_REISERFS_CHECK + char mode; + int iter = 0; +#endif + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_del_balance, sb, path, + 0 /*size is unknown */ ); + + while (1) { + removed = 0; + +#ifdef CONFIG_REISERFS_CHECK + iter++; + mode = +#endif + prepare_for_delete_or_cut(th, inode, path, + item_key, &removed, + &del_size, + max_reiserfs_offset(inode)); + + RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); + + copy_item_head(&s_ih, tp_item_head(path)); + s_del_balance.insert_size[0] = del_size; + + ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); + if (ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(sb, delete_item_restarted); + + /* file system changed, repeat search */ + ret_value = + search_for_position_by_key(sb, item_key, path); + if (ret_value == IO_ERROR) + break; + if (ret_value == FILE_NOT_FOUND) { + reiserfs_warning(sb, "vs-5340", + "no items of the file %K found", + item_key); + break; + } + } /* while (1) */ + + if (ret_value != CARRY_ON) { + unfix_nodes(&s_del_balance); + return 0; + } + + /* reiserfs_delete_item returns item length when success */ + ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + q_ih = tp_item_head(path); + quota_cut_bytes = ih_item_len(q_ih); + + /* + * hack so the quota code doesn't have to guess if the file has a + * tail. On tail insert, we allocate quota for 1 unformatted node. + * We test the offset because the tail might have been + * split into multiple items, and we only want to decrement for + * the unfm node once + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) { + if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } + } + + if (un_bh) { + int off; + char *data; + + /* + * We are in direct2indirect conversion, so move tail contents + * to the unformatted node + */ + /* + * note, we do the copy before preparing the buffer because we + * don't care about the contents of the unformatted node yet. + * the only thing we really care about is the direct item's + * data is in the unformatted node. + * + * Otherwise, we would have to call + * reiserfs_prepare_for_journal on the unformatted node, + * which might schedule, meaning we'd have to loop all the + * way back up to the start of the while loop. + * + * The unformatted node must be dirtied later on. We can't be + * sure here if the entire tail has been deleted yet. + * + * un_bh is from the page cache (all unformatted nodes are + * from the page cache) and might be a highmem page. So, we + * can't use un_bh->b_data. + * -clm + */ + + data = kmap_atomic(un_bh->b_page); + off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); + memcpy(data + off, + ih_item_body(PATH_PLAST_BUFFER(path), &s_ih), + ret_value); + kunmap_atomic(data); + } + + /* Perform balancing after all resources have been collected at once. */ + do_balance(&s_del_balance, NULL, NULL, M_DELETE); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "reiserquota delete_item(): freeing %u, id=%u type=%c", + quota_cut_bytes, inode->i_uid, head2type(&s_ih)); +#endif + depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); + + /* Return deleted body length */ + return ret_value; +} + +/* + * Summary Of Mechanisms For Handling Collisions Between Processes: + * + * deletion of the body of the object is performed by iput(), with the + * result that if multiple processes are operating on a file, the + * deletion of the body of the file is deferred until the last process + * that has an open inode performs its iput(). + * + * writes and truncates are protected from collisions by use of + * semaphores. + * + * creates, linking, and mknod are protected from collisions with other + * processes by making the reiserfs_add_entry() the last step in the + * creation, and then rolling back all changes if there was a collision. + * - Hans +*/ + +/* this deletes item which never gets split */ +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, + struct inode *inode, struct reiserfs_key *key) +{ + struct super_block *sb = th->t_super; + struct tree_balance tb; + INITIALIZE_PATH(path); + int item_len = 0; + int tb_init = 0; + struct cpu_key cpu_key; + int retval; + int quota_cut_bytes = 0; + + BUG_ON(!th->t_trans_id); + + le_key2cpu_key(&cpu_key, key); + + while (1) { + retval = search_item(th->t_super, &cpu_key, &path); + if (retval == IO_ERROR) { + reiserfs_error(th->t_super, "vs-5350", + "i/o failure occurred trying " + "to delete %K", &cpu_key); + break; + } + if (retval != ITEM_FOUND) { + pathrelse(&path); + /* + * No need for a warning, if there is just no free + * space to insert '..' item into the + * newly-created subdir + */ + if (! + ((unsigned long long) + GET_HASH_VALUE(le_key_k_offset + (le_key_version(key), key)) == 0 + && (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset + (le_key_version(key), + key)) == 1)) + reiserfs_warning(th->t_super, "vs-5355", + "%k not found", key); + break; + } + if (!tb_init) { + tb_init = 1; + item_len = ih_item_len(tp_item_head(&path)); + init_tb_struct(th, &tb, th->t_super, &path, + -(IH_SIZE + item_len)); + } + quota_cut_bytes = ih_item_len(tp_item_head(&path)); + + retval = fix_nodes(M_DELETE, &tb, NULL, NULL); + if (retval == REPEAT_SEARCH) { + PROC_INFO_INC(th->t_super, delete_solid_item_restarted); + continue; + } + + if (retval == CARRY_ON) { + do_balance(&tb, NULL, NULL, M_DELETE); + /* + * Should we count quota for item? (we don't + * count quotas for save-links) + */ + if (inode) { + int depth; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota delete_solid_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, + key2type(key)); +#endif + depth = reiserfs_write_unlock_nested(sb); + dquot_free_space_nodirty(inode, + quota_cut_bytes); + reiserfs_write_lock_nested(sb, depth); + } + break; + } + + /* IO_ERROR, NO_DISK_SPACE, etc */ + reiserfs_warning(th->t_super, "vs-5360", + "could not delete %K due to fix_nodes failure", + &cpu_key); + unfix_nodes(&tb); + break; + } + + reiserfs_check_path(&path); +} + +int reiserfs_delete_object(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + int err; + inode->i_size = 0; + BUG_ON(!th->t_trans_id); + + /* for directory this deletes item containing "." and ".." */ + err = + reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ ); + if (err) + return err; + +#if defined( USE_INODE_GENERATION_COUNTER ) + if (!old_format_only(th->t_super)) { + __le32 *inode_generation; + + inode_generation = + &REISERFS_SB(th->t_super)->s_rs->s_inode_generation; + le32_add_cpu(inode_generation, 1); + } +/* USE_INODE_GENERATION_COUNTER */ +#endif + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + + return err; +} + +static void unmap_buffers(struct page *page, loff_t pos) +{ + struct buffer_head *bh; + struct buffer_head *head; + struct buffer_head *next; + unsigned long tail_index; + unsigned long cur_index; + + if (page) { + if (page_has_buffers(page)) { + tail_index = pos & (PAGE_CACHE_SIZE - 1); + cur_index = 0; + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + + /* + * we want to unmap the buffers that contain + * the tail, and all the buffers after it + * (since the tail must be at the end of the + * file). We don't want to unmap file data + * before the tail, since it might be dirty + * and waiting to reach disk + */ + cur_index += bh->b_size; + if (cur_index > tail_index) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } + } +} + +static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct page *page, + struct treepath *path, + const struct cpu_key *item_key, + loff_t new_file_size, char *mode) +{ + struct super_block *sb = inode->i_sb; + int block_size = sb->s_blocksize; + int cut_bytes; + BUG_ON(!th->t_trans_id); + BUG_ON(new_file_size != inode->i_size); + + /* + * the page being sent in could be NULL if there was an i/o error + * reading in the last block. The user will hit problems trying to + * read the file, but for now we just skip the indirect2direct + */ + if (atomic_read(&inode->i_count) > 1 || + !tail_has_to_be_packed(inode) || + !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) { + /* leave tail in an unformatted node */ + *mode = M_SKIP_BALANCING; + cut_bytes = + block_size - (new_file_size & (block_size - 1)); + pathrelse(path); + return cut_bytes; + } + + /* Perform the conversion to a direct_item. */ + return indirect2direct(th, inode, page, path, item_key, + new_file_size, mode); +} + +/* + * we did indirect_to_direct conversion. And we have inserted direct + * item successesfully, but there were no disk space to cut unfm + * pointer being converted. Therefore we have to delete inserted + * direct item(s) + */ +static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, + struct inode *inode, struct treepath *path) +{ + struct cpu_key tail_key; + int tail_len; + int removed; + BUG_ON(!th->t_trans_id); + + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); + tail_key.key_length = 4; + + tail_len = + (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; + while (tail_len) { + /* look for the last byte of the tail */ + if (search_for_position_by_key(inode->i_sb, &tail_key, path) == + POSITION_NOT_FOUND) + reiserfs_panic(inode->i_sb, "vs-5615", + "found invalid item"); + RFALSE(path->pos_in_item != + ih_item_len(tp_item_head(path)) - 1, + "vs-5616: appended bytes found"); + PATH_LAST_POSITION(path)--; + + removed = + reiserfs_delete_item(th, path, &tail_key, inode, + NULL /*unbh not needed */ ); + RFALSE(removed <= 0 + || removed > tail_len, + "vs-5617: there was tail %d bytes, removed item length %d bytes", + tail_len, removed); + tail_len -= removed; + set_cpu_key_k_offset(&tail_key, + cpu_key_k_offset(&tail_key) - removed); + } + reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct " + "conversion has been rolled back due to " + "lack of disk space"); + mark_inode_dirty(inode); +} + +/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + struct cpu_key *item_key, + struct inode *inode, + struct page *page, loff_t new_file_size) +{ + struct super_block *sb = inode->i_sb; + /* + * Every function which is going to call do_balance must first + * create a tree_balance structure. Then it must fill up this + * structure by using the init_tb_struct and fix_nodes functions. + * After that we can make tree balancing. + */ + struct tree_balance s_cut_balance; + struct item_head *p_le_ih; + int cut_size = 0; /* Amount to be cut. */ + int ret_value = CARRY_ON; + int removed = 0; /* Number of the removed unformatted nodes. */ + int is_inode_locked = 0; + char mode; /* Mode of the balance. */ + int retval2 = -1; + int quota_cut_bytes; + loff_t tail_pos = 0; + int depth; + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_cut_balance, inode->i_sb, path, + cut_size); + + /* + * Repeat this loop until we either cut the item without needing + * to balance, or we fix_nodes without schedule occurring + */ + while (1) { + /* + * Determine the balance mode, position of the first byte to + * be cut, and size to be cut. In case of the indirect item + * free unformatted nodes which are pointed to by the cut + * pointers. + */ + + mode = + prepare_for_delete_or_cut(th, inode, path, + item_key, &removed, + &cut_size, new_file_size); + if (mode == M_CONVERT) { + /* + * convert last unformatted node to direct item or + * leave tail in the unformatted node + */ + RFALSE(ret_value != CARRY_ON, + "PAP-5570: can not convert twice"); + + ret_value = + maybe_indirect_to_direct(th, inode, page, + path, item_key, + new_file_size, &mode); + if (mode == M_SKIP_BALANCING) + /* tail has been left in the unformatted node */ + return ret_value; + + is_inode_locked = 1; + + /* + * removing of last unformatted node will + * change value we have to return to truncate. + * Save it + */ + retval2 = ret_value; + + /* + * So, we have performed the first part of the + * conversion: + * inserting the new direct item. Now we are + * removing the last unformatted node pointer. + * Set key to search for it. + */ + set_cpu_key_k_type(item_key, TYPE_INDIRECT); + item_key->key_length = 4; + new_file_size -= + (new_file_size & (sb->s_blocksize - 1)); + tail_pos = new_file_size; + set_cpu_key_k_offset(item_key, new_file_size + 1); + if (search_for_position_by_key + (sb, item_key, + path) == POSITION_NOT_FOUND) { + print_block(PATH_PLAST_BUFFER(path), 3, + PATH_LAST_POSITION(path) - 1, + PATH_LAST_POSITION(path) + 1); + reiserfs_panic(sb, "PAP-5580", "item to " + "convert does not exist (%K)", + item_key); + } + continue; + } + if (cut_size == 0) { + pathrelse(path); + return 0; + } + + s_cut_balance.insert_size[0] = cut_size; + + ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL); + if (ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(sb, cut_from_item_restarted); + + ret_value = + search_for_position_by_key(sb, item_key, path); + if (ret_value == POSITION_FOUND) + continue; + + reiserfs_warning(sb, "PAP-5610", "item %K not found", + item_key); + unfix_nodes(&s_cut_balance); + return (ret_value == IO_ERROR) ? -EIO : -ENOENT; + } /* while */ + + /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */ + if (ret_value != CARRY_ON) { + if (is_inode_locked) { + /* + * FIXME: this seems to be not needed: we are always + * able to cut item + */ + indirect_to_direct_roll_back(th, inode, path); + } + if (ret_value == NO_DISK_SPACE) + reiserfs_warning(sb, "reiserfs-5092", + "NO_DISK_SPACE"); + unfix_nodes(&s_cut_balance); + return -EIO; + } + + /* go ahead and perform balancing */ + + RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode"); + + /* Calculate number of bytes that need to be cut from the item. */ + quota_cut_bytes = + (mode == + M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance. + insert_size[0]; + if (retval2 == -1) + ret_value = calc_deleted_bytes_number(&s_cut_balance, mode); + else + ret_value = retval2; + + /* + * For direct items, we only change the quota when deleting the last + * item. + */ + p_le_ih = tp_item_head(s_cut_balance.tb_path); + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (mode == M_DELETE && + (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == + 1) { + /* FIXME: this is to keep 3.5 happy */ + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } + } +#ifdef CONFIG_REISERFS_CHECK + if (is_inode_locked) { + struct item_head *le_ih = + tp_item_head(s_cut_balance.tb_path); + /* + * we are going to complete indirect2direct conversion. Make + * sure, that we exactly remove last unformatted node pointer + * of the item + */ + if (!is_indirect_le_ih(le_ih)) + reiserfs_panic(sb, "vs-5652", + "item must be indirect %h", le_ih); + + if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) + reiserfs_panic(sb, "vs-5653", "completing " + "indirect2direct conversion indirect " + "item %h being deleted must be of " + "4 byte long", le_ih); + + if (mode == M_CUT + && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { + reiserfs_panic(sb, "vs-5654", "can not complete " + "indirect2direct conversion of %h " + "(CUT, insert_size==%d)", + le_ih, s_cut_balance.insert_size[0]); + } + /* + * it would be useful to make sure, that right neighboring + * item is direct item of this file + */ + } +#endif + + do_balance(&s_cut_balance, NULL, NULL, mode); + if (is_inode_locked) { + /* + * we've done an indirect->direct conversion. when the + * data block was freed, it was removed from the list of + * blocks that must be flushed before the transaction + * commits, make sure to unmap and invalidate it + */ + unmap_buffers(page, tail_pos); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + } +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota cut_from_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, '?'); +#endif + depth = reiserfs_write_unlock_nested(sb); + dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(sb, depth); + return ret_value; +} + +static void truncate_directory(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + BUG_ON(!th->t_trans_id); + if (inode->i_nlink) + reiserfs_error(inode->i_sb, "vs-5655", "link count != 0"); + + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY); + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + reiserfs_update_sd(th, inode); + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA); +} + +/* + * Truncate file to the new size. Note, this must be called with a + * transaction already started + */ +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, + struct inode *inode, /* ->i_size contains new size */ + struct page *page, /* up to date for last block */ + /* + * when it is called by file_release to convert + * the tail - no timestamps should be updated + */ + int update_timestamps + ) +{ + INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ + struct item_head *p_le_ih; /* Pointer to an item header. */ + + /* Key to search for a previous file item. */ + struct cpu_key s_item_key; + loff_t file_size, /* Old file size. */ + new_file_size; /* New file size. */ + int deleted; /* Number of deleted or truncated bytes. */ + int retval; + int err = 0; + + BUG_ON(!th->t_trans_id); + if (! + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + return 0; + + /* deletion of directory - no need to update timestamps */ + if (S_ISDIR(inode->i_mode)) { + truncate_directory(th, inode); + return 0; + } + + /* Get new file size. */ + new_file_size = inode->i_size; + + /* FIXME: note, that key type is unimportant here */ + make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode), + TYPE_DIRECT, 3); + + retval = + search_for_position_by_key(inode->i_sb, &s_item_key, + &s_search_path); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-5657", + "i/o failure occurred trying to truncate %K", + &s_item_key); + err = -EIO; + goto out; + } + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { + reiserfs_error(inode->i_sb, "PAP-5660", + "wrong result %d of search for %K", retval, + &s_item_key); + + err = -EIO; + goto out; + } + + s_search_path.pos_in_item--; + + /* Get real file size (total length of all file items) */ + p_le_ih = tp_item_head(&s_search_path); + if (is_statdata_le_ih(p_le_ih)) + file_size = 0; + else { + loff_t offset = le_ih_k_offset(p_le_ih); + int bytes = + op_bytes_number(p_le_ih, inode->i_sb->s_blocksize); + + /* + * this may mismatch with real file size: if last direct item + * had no padding zeros and last unformatted node had no free + * space, this file would have this file size + */ + file_size = offset + bytes - 1; + } + /* + * are we doing a full truncate or delete, if so + * kick in the reada code + */ + if (new_file_size == 0) + s_search_path.reada = PATH_READA | PATH_READA_BACK; + + if (file_size == 0 || file_size < new_file_size) { + goto update_and_out; + } + + /* Update key to search for the last file item. */ + set_cpu_key_k_offset(&s_item_key, file_size); + + do { + /* Cut or delete file item. */ + deleted = + reiserfs_cut_from_item(th, &s_search_path, &s_item_key, + inode, page, new_file_size); + if (deleted < 0) { + reiserfs_warning(inode->i_sb, "vs-5665", + "reiserfs_cut_from_item failed"); + reiserfs_check_path(&s_search_path); + return 0; + } + + RFALSE(deleted > file_size, + "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", + deleted, file_size, &s_item_key); + + /* Change key to search the last file item. */ + file_size -= deleted; + + set_cpu_key_k_offset(&s_item_key, file_size); + + /* + * While there are bytes to truncate and previous + * file item is presented in the tree. + */ + + /* + * This loop could take a really long time, and could log + * many more blocks than a transaction can hold. So, we do + * a polite journal end here, and if the transaction needs + * ending, we make sure the file is consistent before ending + * the current trans and starting a new one + */ + if (journal_transaction_should_end(th, 0) || + reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { + pathrelse(&s_search_path); + + if (update_timestamps) { + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, inode); + + err = journal_end(th); + if (err) + goto out; + err = journal_begin(th, inode->i_sb, + JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ; + if (err) + goto out; + reiserfs_update_inode_transaction(inode); + } + } while (file_size > ROUND_UP(new_file_size) && + search_for_position_by_key(inode->i_sb, &s_item_key, + &s_search_path) == POSITION_FOUND); + + RFALSE(file_size > ROUND_UP(new_file_size), + "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d", + new_file_size, file_size, s_item_key.on_disk_key.k_objectid); + +update_and_out: + if (update_timestamps) { + /* this is truncate, not file closing */ + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, inode); + +out: + pathrelse(&s_search_path); + return err; +} + +#ifdef CONFIG_REISERFS_CHECK +/* this makes sure, that we __append__, not overwrite or add holes */ +static void check_research_for_paste(struct treepath *path, + const struct cpu_key *key) +{ + struct item_head *found_ih = tp_item_head(path); + + if (is_direct_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(key) + || op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + pos_in_item(path)) + reiserfs_panic(NULL, "PAP-5720", "found direct item " + "%h or position (%d) does not match " + "to key %K", found_ih, + pos_in_item(path), key); + } + if (is_indirect_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(key) + || I_UNFM_NUM(found_ih) != pos_in_item(path) + || get_ih_free_space(found_ih) != 0) + reiserfs_panic(NULL, "PAP-5730", "found indirect " + "item (%h) or position (%d) does not " + "match to key (%K)", + found_ih, pos_in_item(path), key); + } +} +#endif /* config reiserfs check */ + +/* + * Paste bytes to the existing item. + * Returns bytes number pasted into the item. + */ +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, + /* Path to the pasted item. */ + struct treepath *search_path, + /* Key to search for the needed item. */ + const struct cpu_key *key, + /* Inode item belongs to */ + struct inode *inode, + /* Pointer to the bytes to paste. */ + const char *body, + /* Size of pasted bytes. */ + int pasted_size) +{ + struct super_block *sb = inode->i_sb; + struct tree_balance s_paste_balance; + int retval; + int fs_gen; + int depth; + + BUG_ON(!th->t_trans_id); + + fs_gen = get_generation(inode->i_sb); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): allocating %u id=%u type=%c", + pasted_size, inode->i_uid, + key2type(&key->on_disk_key)); +#endif + + depth = reiserfs_write_unlock_nested(sb); + retval = dquot_alloc_space_nodirty(inode, pasted_size); + reiserfs_write_lock_nested(sb, depth); + if (retval) { + pathrelse(search_path); + return retval; + } + init_tb_struct(th, &s_paste_balance, th->t_super, search_path, + pasted_size); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_paste_balance.key = key->on_disk_key; +#endif + + /* DQUOT_* can schedule, must check before the fix_nodes */ + if (fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = + fix_nodes(M_PASTE, &s_paste_balance, NULL, + body)) == REPEAT_SEARCH) { +search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, paste_into_item_restarted); + retval = + search_for_position_by_key(th->t_super, key, + search_path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, "PAP-5710", + "entry or pasted byte (%K) exists", + key); + retval = -EEXIST; + goto error_out; + } +#ifdef CONFIG_REISERFS_CHECK + check_research_for_paste(search_path, key); +#endif + } + + /* + * Perform balancing after all resources are collected by fix_nodes, + * and accessing them will not risk triggering schedule. + */ + if (retval == CARRY_ON) { + do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE); + return 0; + } + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; +error_out: + /* this also releases the path */ + unfix_nodes(&s_paste_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): freeing %u id=%u type=%c", + pasted_size, inode->i_uid, + key2type(&key->on_disk_key)); +#endif + depth = reiserfs_write_unlock_nested(sb); + dquot_free_space_nodirty(inode, pasted_size); + reiserfs_write_lock_nested(sb, depth); + return retval; +} + +/* + * Insert new item into the buffer at the path. + * th - active transaction handle + * path - path to the inserted item + * ih - pointer to the item header to insert + * body - pointer to the bytes to insert + */ +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *key, + struct item_head *ih, struct inode *inode, + const char *body) +{ + struct tree_balance s_ins_balance; + int retval; + int fs_gen = 0; + int quota_bytes = 0; + + BUG_ON(!th->t_trans_id); + + if (inode) { /* Do we count quotas for item? */ + int depth; + fs_gen = get_generation(inode->i_sb); + quota_bytes = ih_item_len(ih); + + /* + * hack so the quota code doesn't have to guess + * if the file has a tail, links are always tails, + * so there's no guessing needed + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih)) + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): allocating %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(ih)); +#endif + /* + * We can't dirty inode here. It would be immediately + * written but appropriate stat item isn't inserted yet... + */ + depth = reiserfs_write_unlock_nested(inode->i_sb); + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); + if (retval) { + pathrelse(path); + return retval; + } + } + init_tb_struct(th, &s_ins_balance, th->t_super, path, + IH_SIZE + ih_item_len(ih)); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_ins_balance.key = key->on_disk_key; +#endif + /* + * DQUOT_* can schedule, must check to be sure calling + * fix_nodes is safe + */ + if (inode && fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = + fix_nodes(M_INSERT, &s_ins_balance, ih, + body)) == REPEAT_SEARCH) { +search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, insert_item_restarted); + retval = search_item(th->t_super, key, path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == ITEM_FOUND) { + reiserfs_warning(th->t_super, "PAP-5760", + "key %K already exists in the tree", + key); + retval = -EEXIST; + goto error_out; + } + } + + /* make balancing after all resources will be collected at a time */ + if (retval == CARRY_ON) { + do_balance(&s_ins_balance, ih, body, M_INSERT); + return 0; + } + + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; +error_out: + /* also releases the path */ + unfix_nodes(&s_ins_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): freeing %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(ih)); +#endif + if (inode) { + int depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_free_space_nodirty(inode, quota_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); + } + return retval; +} diff --git a/kernel/fs/reiserfs/super.c b/kernel/fs/reiserfs/super.c new file mode 100644 index 000000000..0111ad046 --- /dev/null +++ b/kernel/fs/reiserfs/super.c @@ -0,0 +1,2563 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to add the LFS fixes + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. + * + * NO WARRANTY + */ + +#include +#include +#include +#include +#include +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct file_system_type reiserfs_fs_type; + +static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING; +static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING; +static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING; + +int is_reiserfs_3_5(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string, + strlen(reiserfs_3_5_magic_string)); +} + +int is_reiserfs_3_6(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string, + strlen(reiserfs_3_6_magic_string)); +} + +int is_reiserfs_jr(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string, + strlen(reiserfs_jr_magic_string)); +} + +static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) +{ + return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) || + is_reiserfs_jr(rs)); +} + +static int reiserfs_remount(struct super_block *s, int *flags, char *data); +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); + +static int reiserfs_sync_fs(struct super_block *s, int wait) +{ + struct reiserfs_transaction_handle th; + + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(s, -1); + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th)) + reiserfs_flush_old_commits(s); + reiserfs_write_unlock(s); + return 0; +} + +static void flush_old_commits(struct work_struct *work) +{ + struct reiserfs_sb_info *sbi; + struct super_block *s; + + sbi = container_of(work, struct reiserfs_sb_info, old_work.work); + s = sbi->s_journal->j_work_sb; + + spin_lock(&sbi->old_work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); + + reiserfs_sync_fs(s, 1); +} + +void reiserfs_schedule_old_flush(struct super_block *s) +{ + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + unsigned long delay; + + /* + * Avoid scheduling flush when sb is being shut down. It can race + * with journal shutdown and free still queued delayed work. + */ + if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE)) + return; + + spin_lock(&sbi->old_work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->old_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->old_work_lock); +} + +static void cancel_old_flush(struct super_block *s) +{ + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + spin_lock(&sbi->old_work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); +} + +static int reiserfs_freeze(struct super_block *s) +{ + struct reiserfs_transaction_handle th; + + cancel_old_flush(s); + + reiserfs_write_lock(s); + if (!(s->s_flags & MS_RDONLY)) { + int err = journal_begin(&th, s, 1); + if (err) { + reiserfs_block_writes(&th); + } else { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + reiserfs_block_writes(&th); + journal_end_sync(&th); + } + } + reiserfs_write_unlock(s); + return 0; +} + +static int reiserfs_unfreeze(struct super_block *s) +{ + reiserfs_allow_writes(s); + return 0; +} + +extern const struct in_core_key MAX_IN_CORE_KEY; + +/* + * this is used to delete "save link" when there are no items of a + * file it points to. It can either happen if unlink is completed but + * "save unlink" removal, or if file has both unlink and truncate + * pending and as unlink completes first (because key of "save link" + * protecting unlink is bigger that a key lf "save link" which + * protects truncate), so there left no items to make truncate + * completion on + */ +static int remove_save_link_only(struct super_block *s, + struct reiserfs_key *key, int oid_free) +{ + struct reiserfs_transaction_handle th; + int err; + + /* we are going to do one balancing */ + err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + reiserfs_delete_solid_item(&th, NULL, key); + if (oid_free) + /* removals are protected by direct items */ + reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid)); + + return journal_end(&th); +} + +#ifdef CONFIG_QUOTA +static int reiserfs_quota_on_mount(struct super_block *, int); +#endif + +/* look for uncompleted unlinks and truncates and complete them */ +static int finish_unfinished(struct super_block *s) +{ + INITIALIZE_PATH(path); + struct cpu_key max_cpu_key, obj_key; + struct reiserfs_key save_link_key, last_inode_key; + int retval = 0; + struct item_head *ih; + struct buffer_head *bh; + int item_pos; + char *item; + int done; + struct inode *inode; + int truncate; +#ifdef CONFIG_QUOTA + int i; + int ms_active_set; + int quota_enabled[REISERFS_MAXQUOTAS]; +#endif + + /* compose key to look for "save" links */ + max_cpu_key.version = KEY_FORMAT_3_5; + max_cpu_key.on_disk_key.k_dir_id = ~0U; + max_cpu_key.on_disk_key.k_objectid = ~0U; + set_cpu_key_k_offset(&max_cpu_key, ~0U); + max_cpu_key.key_length = 3; + + memset(&last_inode_key, 0, sizeof(last_inode_key)); + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + if (s->s_flags & MS_ACTIVE) { + ms_active_set = 0; + } else { + ms_active_set = 1; + s->s_flags |= MS_ACTIVE; + } + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { + quota_enabled[i] = 1; + if (REISERFS_SB(s)->s_qf_names[i]) { + int ret; + + if (sb_has_quota_active(s, i)) { + quota_enabled[i] = 0; + continue; + } + ret = reiserfs_quota_on_mount(s, i); + if (ret < 0) + reiserfs_warning(s, "reiserfs-2500", + "cannot turn on journaled " + "quota: error %d", ret); + } + } +#endif + + done = 0; + REISERFS_SB(s)->s_is_unlinked_ok = 1; + while (!retval) { + int depth; + retval = search_item(s, &max_cpu_key, &path); + if (retval != ITEM_NOT_FOUND) { + reiserfs_error(s, "vs-2140", + "search_by_key returned %d", retval); + break; + } + + bh = get_last_bh(&path); + item_pos = get_item_pos(&path); + if (item_pos != B_NR_ITEMS(bh)) { + reiserfs_warning(s, "vs-2060", + "wrong position found"); + break; + } + item_pos--; + ih = item_head(bh, item_pos); + + if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) + /* there are no "save" links anymore */ + break; + + save_link_key = ih->ih_key; + if (is_indirect_le_ih(ih)) + truncate = 1; + else + truncate = 0; + + /* reiserfs_iget needs k_dirid and k_objectid only */ + item = ih_item_body(bh, ih); + obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item); + obj_key.on_disk_key.k_objectid = + le32_to_cpu(ih->ih_key.k_objectid); + obj_key.on_disk_key.k_offset = 0; + obj_key.on_disk_key.k_type = 0; + + pathrelse(&path); + + inode = reiserfs_iget(s, &obj_key); + if (!inode) { + /* + * the unlink almost completed, it just did not + * manage to remove "save" link and release objectid + */ + reiserfs_warning(s, "vs-2180", "iget failed for %K", + &obj_key); + retval = remove_save_link_only(s, &save_link_key, 1); + continue; + } + + if (!truncate && inode->i_nlink) { + /* file is not unlinked */ + reiserfs_warning(s, "vs-2185", + "file %K is not unlinked", + &obj_key); + retval = remove_save_link_only(s, &save_link_key, 0); + continue; + } + depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_initialize(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + + if (truncate && S_ISDIR(inode->i_mode)) { + /* + * We got a truncate request for a dir which + * is impossible. The only imaginable way is to + * execute unfinished truncate request then boot + * into old kernel, remove the file and create dir + * with the same key. + */ + reiserfs_warning(s, "green-2101", + "impossible truncate on a " + "directory %k. Please report", + INODE_PKEY(inode)); + retval = remove_save_link_only(s, &save_link_key, 0); + truncate = 0; + iput(inode); + continue; + } + + if (truncate) { + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + /* + * not completed truncate found. New size was + * committed together with "save" link + */ + reiserfs_info(s, "Truncating %k to %lld ..", + INODE_PKEY(inode), inode->i_size); + + /* don't update modification time */ + reiserfs_truncate_file(inode, 0); + + retval = remove_save_link(inode, truncate); + } else { + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + /* not completed unlink (rmdir) found */ + reiserfs_info(s, "Removing %k..", INODE_PKEY(inode)); + if (memcmp(&last_inode_key, INODE_PKEY(inode), + sizeof(last_inode_key))){ + last_inode_key = *INODE_PKEY(inode); + /* removal gets completed in iput */ + retval = 0; + } else { + reiserfs_warning(s, "super-2189", "Dead loop " + "in finish_unfinished " + "detected, just remove " + "save link\n"); + retval = remove_save_link_only(s, + &save_link_key, 0); + } + } + + iput(inode); + printk("done\n"); + done++; + } + REISERFS_SB(s)->s_is_unlinked_ok = 0; + +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + reiserfs_write_unlock(s); + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { + if (sb_dqopt(s)->files[i] && quota_enabled[i]) + dquot_quota_off(s, i); + } + reiserfs_write_lock(s); + if (ms_active_set) + /* Restore the flag back */ + s->s_flags &= ~MS_ACTIVE; +#endif + pathrelse(&path); + if (done) + reiserfs_info(s, "There were %d uncompleted unlinks/truncates. " + "Completed\n", done); + return retval; +} + +/* + * to protect file being unlinked from getting lost we "safe" link files + * being unlinked. This link will be deleted in the same transaction with last + * item of file. mounting the filesystem we scan all these links and remove + * files which almost got lost + */ +void add_save_link(struct reiserfs_transaction_handle *th, + struct inode *inode, int truncate) +{ + INITIALIZE_PATH(path); + int retval; + struct cpu_key key; + struct item_head ih; + __le32 link; + + BUG_ON(!th->t_trans_id); + + /* file can only get one "save link" of each kind */ + RFALSE(truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask), + "saved link already exists for truncated inode %lx", + (long)inode->i_ino); + RFALSE(!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask), + "saved link already exists for unlinked inode %lx", + (long)inode->i_ino); + + /* setup key of "save" link */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; + key.on_disk_key.k_objectid = inode->i_ino; + if (!truncate) { + /* unlink, rmdir, rename */ + set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize); + set_cpu_key_k_type(&key, TYPE_DIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, + 1 + inode->i_sb->s_blocksize, TYPE_DIRECT, + 4 /*length */ , 0xffff /*free space */ ); + } else { + /* truncate */ + if (S_ISDIR(inode->i_mode)) + reiserfs_warning(inode->i_sb, "green-2102", + "Adding a truncate savelink for " + "a directory %k! Please report", + INODE_PKEY(inode)); + set_cpu_key_k_offset(&key, 1); + set_cpu_key_k_type(&key, TYPE_INDIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT, + 4 /*length */ , 0 /*free space */ ); + } + key.key_length = 3; + + /* look for its place in the tree */ + retval = search_item(inode->i_sb, &key, &path); + if (retval != ITEM_NOT_FOUND) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, "vs-2100", + "search_by_key (%K) returned %d", &key, + retval); + pathrelse(&path); + return; + } + + /* body of "save" link */ + link = INODE_PKEY(inode)->k_dir_id; + + /* put "save" link into tree, don't charge quota to anyone */ + retval = + reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); + if (retval) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, "vs-2120", + "insert_item returned %d", retval); + } else { + if (truncate) + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + else + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + } +} + +/* this opens transaction unlike add_save_link */ +int remove_save_link(struct inode *inode, int truncate) +{ + struct reiserfs_transaction_handle th; + struct reiserfs_key key; + int err; + + /* we are going to do one balancing only */ + err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + /* setup key of "save" link */ + key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID); + key.k_objectid = INODE_PKEY(inode)->k_objectid; + if (!truncate) { + /* unlink, rmdir, rename */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, + 1 + inode->i_sb->s_blocksize); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT); + } else { + /* truncate */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT); + } + + if ((truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) || + (!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask))) + /* don't take quota bytes from anywhere */ + reiserfs_delete_solid_item(&th, NULL, &key); + if (!truncate) { + reiserfs_release_objectid(&th, inode->i_ino); + REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask; + } else + REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask; + + return journal_end(&th); +} + +static void reiserfs_kill_sb(struct super_block *s) +{ + if (REISERFS_SB(s)) { + reiserfs_proc_info_done(s); + /* + * Force any pending inode evictions to occur now. Any + * inodes to be removed that have extended attributes + * associated with them need to clean them up before + * we can release the extended attribute root dentries. + * shrink_dcache_for_umount will BUG if we don't release + * those before it's called so ->put_super is too late. + */ + shrink_dcache_sb(s); + + dput(REISERFS_SB(s)->xattr_root); + REISERFS_SB(s)->xattr_root = NULL; + dput(REISERFS_SB(s)->priv_root); + REISERFS_SB(s)->priv_root = NULL; + } + + kill_block_super(s); +} + +static void reiserfs_put_super(struct super_block *s) +{ + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + reiserfs_write_lock(s); + + /* + * change file system state to current state if it was mounted + * with read-write permissions + */ + if (!(s->s_flags & MS_RDONLY)) { + if (!journal_begin(&th, s, 10)) { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + set_sb_umount_state(SB_DISK_SUPER_BLOCK(s), + REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + } + } + + /* + * note, journal_release checks for readonly mount, and can + * decide not to do a journal_end + */ + journal_release(&th, s); + + reiserfs_free_bitmap_cache(s); + + brelse(SB_BUFFER_WITH_SB(s)); + + print_statistics(s); + + if (REISERFS_SB(s)->reserved_blocks != 0) { + reiserfs_warning(s, "green-2005", "reserved blocks left %d", + REISERFS_SB(s)->reserved_blocks); + } + + reiserfs_write_unlock(s); + mutex_destroy(&REISERFS_SB(s)->lock); + destroy_workqueue(REISERFS_SB(s)->commit_wq); + kfree(s->s_fs_info); + s->s_fs_info = NULL; +} + +static struct kmem_cache *reiserfs_inode_cachep; + +static struct inode *reiserfs_alloc_inode(struct super_block *sb) +{ + struct reiserfs_inode_info *ei; + ei = (struct reiserfs_inode_info *) + kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + atomic_set(&ei->openers, 0); + mutex_init(&ei->tailpack); +#ifdef CONFIG_QUOTA + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + + return &ei->vfs_inode; +} + +static void reiserfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); +} + +static void reiserfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, reiserfs_i_callback); +} + +static void init_once(void *foo) +{ + struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; + + INIT_LIST_HEAD(&ei->i_prealloc_list); + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", + sizeof(struct + reiserfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (reiserfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(reiserfs_inode_cachep); +} + +/* we don't mark inodes dirty, we just log them */ +static void reiserfs_dirty_inode(struct inode *inode, int flags) +{ + struct reiserfs_transaction_handle th; + + int err = 0; + + if (inode->i_sb->s_flags & MS_RDONLY) { + reiserfs_warning(inode->i_sb, "clm-6006", + "writing inode %lu on readonly FS", + inode->i_ino); + return; + } + reiserfs_write_lock(inode->i_sb); + + /* + * this is really only used for atime updates, so they don't have + * to be included in O_SYNC or fsync + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) + goto out; + + reiserfs_update_sd(&th, inode); + journal_end(&th); + +out: + reiserfs_write_unlock(inode->i_sb); +} + +static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *s = root->d_sb; + struct reiserfs_journal *journal = SB_JOURNAL(s); + long opts = REISERFS_SB(s)->s_mount_opt; + + if (opts & (1 << REISERFS_LARGETAIL)) + seq_puts(seq, ",tails=on"); + else if (!(opts & (1 << REISERFS_SMALLTAIL))) + seq_puts(seq, ",notail"); + /* tails=small is default so we don't show it */ + + if (!(opts & (1 << REISERFS_BARRIER_FLUSH))) + seq_puts(seq, ",barrier=none"); + /* barrier=flush is default so we don't show it */ + + if (opts & (1 << REISERFS_ERROR_CONTINUE)) + seq_puts(seq, ",errors=continue"); + else if (opts & (1 << REISERFS_ERROR_PANIC)) + seq_puts(seq, ",errors=panic"); + /* errors=ro is default so we don't show it */ + + if (opts & (1 << REISERFS_DATA_LOG)) + seq_puts(seq, ",data=journal"); + else if (opts & (1 << REISERFS_DATA_WRITEBACK)) + seq_puts(seq, ",data=writeback"); + /* data=ordered is default so we don't show it */ + + if (opts & (1 << REISERFS_ATTRS)) + seq_puts(seq, ",attrs"); + + if (opts & (1 << REISERFS_XATTRS_USER)) + seq_puts(seq, ",user_xattr"); + + if (opts & (1 << REISERFS_EXPOSE_PRIVROOT)) + seq_puts(seq, ",expose_privroot"); + + if (opts & (1 << REISERFS_POSIXACL)) + seq_puts(seq, ",acl"); + + if (REISERFS_SB(s)->s_jdev) + seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + + if (journal->j_max_commit_age != journal->j_default_max_commit_age) + seq_printf(seq, ",commit=%d", journal->j_max_commit_age); + +#ifdef CONFIG_QUOTA + if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + else if (opts & (1 << REISERFS_USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + else if (opts & (1 << REISERFS_GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (REISERFS_SB(s)->s_jquota_fmt) { + if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD) + seq_puts(seq, ",jqfmt=vfsold"); + else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0) + seq_puts(seq, ",jqfmt=vfsv0"); + } +#endif + + /* Block allocator options */ + if (opts & (1 << REISERFS_NO_BORDER)) + seq_puts(seq, ",block-allocator=noborder"); + if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION)) + seq_puts(seq, ",block-allocator=no_unhashed_relocation"); + if (opts & (1 << REISERFS_HASHED_RELOCATION)) + seq_puts(seq, ",block-allocator=hashed_relocation"); + if (opts & (1 << REISERFS_TEST4)) + seq_puts(seq, ",block-allocator=test4"); + show_alloc_options(seq, s); + return 0; +} + +#ifdef CONFIG_QUOTA +static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, + size_t, loff_t); +static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, + loff_t); + +static struct dquot **reiserfs_get_dquots(struct inode *inode) +{ + return REISERFS_I(inode)->i_dquot; +} +#endif + +static const struct super_operations reiserfs_sops = { + .alloc_inode = reiserfs_alloc_inode, + .destroy_inode = reiserfs_destroy_inode, + .write_inode = reiserfs_write_inode, + .dirty_inode = reiserfs_dirty_inode, + .evict_inode = reiserfs_evict_inode, + .put_super = reiserfs_put_super, + .sync_fs = reiserfs_sync_fs, + .freeze_fs = reiserfs_freeze, + .unfreeze_fs = reiserfs_unfreeze, + .statfs = reiserfs_statfs, + .remount_fs = reiserfs_remount, + .show_options = reiserfs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = reiserfs_quota_read, + .quota_write = reiserfs_quota_write, + .get_dquots = reiserfs_get_dquots, +#endif +}; + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") + +static int reiserfs_write_dquot(struct dquot *); +static int reiserfs_acquire_dquot(struct dquot *); +static int reiserfs_release_dquot(struct dquot *); +static int reiserfs_mark_dquot_dirty(struct dquot *); +static int reiserfs_write_info(struct super_block *, int); +static int reiserfs_quota_on(struct super_block *, int, int, struct path *); + +static const struct dquot_operations reiserfs_quota_operations = { + .write_dquot = reiserfs_write_dquot, + .acquire_dquot = reiserfs_acquire_dquot, + .release_dquot = reiserfs_release_dquot, + .mark_dirty = reiserfs_mark_dquot_dirty, + .write_info = reiserfs_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops reiserfs_qctl_operations = { + .quota_on = reiserfs_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#endif + +static const struct export_operations reiserfs_export_ops = { + .encode_fh = reiserfs_encode_fh, + .fh_to_dentry = reiserfs_fh_to_dentry, + .fh_to_parent = reiserfs_fh_to_parent, + .get_parent = reiserfs_get_parent, +}; + +/* + * this struct is used in reiserfs_getopt () for containing the value for + * those mount options that have values rather than being toggles. + */ +typedef struct { + char *value; + /* + * bitmask which is to set on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + */ + int setmask; + /* + * bitmask which is to clear on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + * This is applied BEFORE setmask + */ + int clrmask; +} arg_desc_t; + +/* Set this bit in arg_required to allow empty arguments */ +#define REISERFS_OPT_ALLOWEMPTY 31 + +/* + * this struct is used in reiserfs_getopt() for describing the + * set of reiserfs mount options + */ +typedef struct { + char *option_name; + + /* 0 if argument is not required, not 0 otherwise */ + int arg_required; + + /* list of values accepted by an option */ + const arg_desc_t *values; + + /* + * bitmask which is to set on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + */ + int setmask; + + /* + * bitmask which is to clear on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + * This is applied BEFORE setmask + */ + int clrmask; +} opt_desc_t; + +/* possible values for -o data= */ +static const arg_desc_t logging_mode[] = { + {"ordered", 1 << REISERFS_DATA_ORDERED, + (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)}, + {"journal", 1 << REISERFS_DATA_LOG, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)}, + {"writeback", 1 << REISERFS_DATA_WRITEBACK, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)}, + {.value = NULL} +}; + +/* possible values for -o barrier= */ +static const arg_desc_t barrier_mode[] = { + {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH}, + {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE}, + {.value = NULL} +}; + +/* + * possible values for "-o block-allocator=" and bits which are to be set in + * s_mount_opt of reiserfs specific part of in-core super block + */ +static const arg_desc_t balloc[] = { + {"noborder", 1 << REISERFS_NO_BORDER, 0}, + {"border", 0, 1 << REISERFS_NO_BORDER}, + {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0}, + {"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0}, + {"test4", 1 << REISERFS_TEST4, 0}, + {"notest4", 0, 1 << REISERFS_TEST4}, + {NULL, 0, 0} +}; + +static const arg_desc_t tails[] = { + {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL}, + {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL}, + {NULL, 0, 0} +}; + +static const arg_desc_t error_actions[] = { + {"panic", 1 << REISERFS_ERROR_PANIC, + (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, + {"ro-remount", 1 << REISERFS_ERROR_RO, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, +#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG + {"continue", 1 << REISERFS_ERROR_CONTINUE, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, +#endif + {NULL, 0, 0}, +}; + +/* + * proceed only one option from a list *cur - string containing of mount + * options + * opts - array of options which are accepted + * opt_arg - if option is found and requires an argument and if it is specifed + * in the input - pointer to the argument is stored here + * bit_flags - if option requires to set a certain bit - it is set here + * return -1 if unknown option is found, opt->arg_required otherwise + */ +static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, + char **opt_arg, unsigned long *bit_flags) +{ + char *p; + /* + * foo=bar, + * ^ ^ ^ + * | | +-- option_end + * | +-- arg_start + * +-- option_start + */ + const opt_desc_t *opt; + const arg_desc_t *arg; + + p = *cur; + + /* assume argument cannot contain commas */ + *cur = strchr(p, ','); + if (*cur) { + *(*cur) = '\0'; + (*cur)++; + } + + if (!strncmp(p, "alloc=", 6)) { + /* + * Ugly special case, probably we should redo options + * parser so that it can understand several arguments for + * some options, also so that it can fill several bitfields + * with option values. + */ + if (reiserfs_parse_alloc_options(s, p + 6)) { + return -1; + } else { + return 0; + } + } + + /* for every option in the list */ + for (opt = opts; opt->option_name; opt++) { + if (!strncmp(p, opt->option_name, strlen(opt->option_name))) { + if (bit_flags) { + if (opt->clrmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "super-6500", + "%s not supported.\n", + p); + else + *bit_flags &= ~opt->clrmask; + if (opt->setmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "super-6501", + "%s not supported.\n", + p); + else + *bit_flags |= opt->setmask; + } + break; + } + } + if (!opt->option_name) { + reiserfs_warning(s, "super-6502", + "unknown mount option \"%s\"", p); + return -1; + } + + p += strlen(opt->option_name); + switch (*p) { + case '=': + if (!opt->arg_required) { + reiserfs_warning(s, "super-6503", + "the option \"%s\" does not " + "require an argument\n", + opt->option_name); + return -1; + } + break; + + case 0: + if (opt->arg_required) { + reiserfs_warning(s, "super-6504", + "the option \"%s\" requires an " + "argument\n", opt->option_name); + return -1; + } + break; + default: + reiserfs_warning(s, "super-6505", + "head of option \"%s\" is only correct\n", + opt->option_name); + return -1; + } + + /* + * move to the argument, or to next option if argument is not + * required + */ + p++; + + if (opt->arg_required + && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY)) + && !strlen(p)) { + /* this catches "option=," if not allowed */ + reiserfs_warning(s, "super-6506", + "empty argument for \"%s\"\n", + opt->option_name); + return -1; + } + + if (!opt->values) { + /* *=NULLopt_arg contains pointer to argument */ + *opt_arg = p; + return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY); + } + + /* values possible for this option are listed in opt->values */ + for (arg = opt->values; arg->value; arg++) { + if (!strcmp(p, arg->value)) { + if (bit_flags) { + *bit_flags &= ~arg->clrmask; + *bit_flags |= arg->setmask; + } + return opt->arg_required; + } + } + + reiserfs_warning(s, "super-6506", + "bad value \"%s\" for option \"%s\"\n", p, + opt->option_name); + return -1; +} + +/* returns 0 if something is wrong in option string, 1 - otherwise */ +static int reiserfs_parse_options(struct super_block *s, + + /* string given via mount's -o */ + char *options, + + /* + * after the parsing phase, contains the + * collection of bitflags defining what + * mount options were selected. + */ + unsigned long *mount_options, + + /* strtol-ed from NNN of resize=NNN */ + unsigned long *blocks, + char **jdev_name, + unsigned int *commit_max_age, + char **qf_names, + unsigned int *qfmt) +{ + int c; + char *arg = NULL; + char *pos; + opt_desc_t opts[] = { + /* + * Compatibility stuff, so that -o notail for old + * setups still work + */ + {"tails",.arg_required = 't',.values = tails}, + {"notail",.clrmask = + (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"conv",.setmask = 1 << REISERFS_CONVERT}, + {"attrs",.setmask = 1 << REISERFS_ATTRS}, + {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, + {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, +#ifdef CONFIG_REISERFS_FS_XATTR + {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, + {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, +#else + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + {"acl",.setmask = 1 << REISERFS_POSIXACL}, + {"noacl",.clrmask = 1 << REISERFS_POSIXACL}, +#else + {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, +#endif + {.option_name = "nolog"}, + {"replayonly",.setmask = 1 << REPLAYONLY}, + {"block-allocator",.arg_required = 'a',.values = balloc}, + {"data",.arg_required = 'd',.values = logging_mode}, + {"barrier",.arg_required = 'b',.values = barrier_mode}, + {"resize",.arg_required = 'r',.values = NULL}, + {"jdev",.arg_required = 'j',.values = NULL}, + {"nolargeio",.arg_required = 'w',.values = NULL}, + {"commit",.arg_required = 'c',.values = NULL}, + {"usrquota",.setmask = 1 << REISERFS_USRQUOTA}, + {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA}, + {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA}, + {"errors",.arg_required = 'e',.values = error_actions}, + {"usrjquota",.arg_required = + 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"grpjquota",.arg_required = + 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"jqfmt",.arg_required = 'f',.values = NULL}, + {.option_name = NULL} + }; + + *blocks = 0; + if (!options || !*options) + /* + * use default configuration: create tails, journaling on, no + * conversion to newest format + */ + return 1; + + for (pos = options; pos;) { + c = reiserfs_getopt(s, &pos, opts, &arg, mount_options); + if (c == -1) + /* wrong option is given */ + return 0; + + if (c == 'r') { + char *p; + + p = NULL; + /* "resize=NNN" or "resize=auto" */ + + if (!strcmp(arg, "auto")) { + /* From JFS code, to auto-get the size. */ + *blocks = + s->s_bdev->bd_inode->i_size >> s-> + s_blocksize_bits; + } else { + *blocks = simple_strtoul(arg, &p, 0); + if (*p != '\0') { + /* NNN does not look like a number */ + reiserfs_warning(s, "super-6507", + "bad value %s for " + "-oresize\n", arg); + return 0; + } + } + } + + if (c == 'c') { + char *p = NULL; + unsigned long val = simple_strtoul(arg, &p, 0); + /* commit=NNN (time in seconds) */ + if (*p != '\0' || val >= (unsigned int)-1) { + reiserfs_warning(s, "super-6508", + "bad value %s for -ocommit\n", + arg); + return 0; + } + *commit_max_age = (unsigned int)val; + } + + if (c == 'w') { + reiserfs_warning(s, "super-6509", "nolargeio option " + "is no longer supported"); + return 0; + } + + if (c == 'j') { + if (arg && *arg && jdev_name) { + /* Hm, already assigned? */ + if (*jdev_name) { + reiserfs_warning(s, "super-6510", + "journal device was " + "already specified to " + "be %s", *jdev_name); + return 0; + } + *jdev_name = arg; + } + } +#ifdef CONFIG_QUOTA + if (c == 'u' || c == 'g') { + int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; + + if (sb_any_quota_loaded(s) && + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { + reiserfs_warning(s, "super-6511", + "cannot change journaled " + "quota options when quota " + "turned on."); + return 0; + } + if (*arg) { /* Some filename specified? */ + if (REISERFS_SB(s)->s_qf_names[qtype] + && strcmp(REISERFS_SB(s)->s_qf_names[qtype], + arg)) { + reiserfs_warning(s, "super-6512", + "%s quota file " + "already specified.", + QTYPE2NAME(qtype)); + return 0; + } + if (strchr(arg, '/')) { + reiserfs_warning(s, "super-6513", + "quotafile must be " + "on filesystem root."); + return 0; + } + qf_names[qtype] = kstrdup(arg, GFP_KERNEL); + if (!qf_names[qtype]) { + reiserfs_warning(s, "reiserfs-2502", + "not enough memory " + "for storing " + "quotafile name."); + return 0; + } + if (qtype == USRQUOTA) + *mount_options |= 1 << REISERFS_USRQUOTA; + else + *mount_options |= 1 << REISERFS_GRPQUOTA; + } else { + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; + if (qtype == USRQUOTA) + *mount_options &= ~(1 << REISERFS_USRQUOTA); + else + *mount_options &= ~(1 << REISERFS_GRPQUOTA); + } + } + if (c == 'f') { + if (!strcmp(arg, "vfsold")) + *qfmt = QFMT_VFS_OLD; + else if (!strcmp(arg, "vfsv0")) + *qfmt = QFMT_VFS_V0; + else { + reiserfs_warning(s, "super-6514", + "unknown quota format " + "specified."); + return 0; + } + if (sb_any_quota_loaded(s) && + *qfmt != REISERFS_SB(s)->s_jquota_fmt) { + reiserfs_warning(s, "super-6515", + "cannot change journaled " + "quota options when quota " + "turned on."); + return 0; + } + } +#else + if (c == 'u' || c == 'g' || c == 'f') { + reiserfs_warning(s, "reiserfs-2503", "journaled " + "quota options not supported."); + return 0; + } +#endif + } + +#ifdef CONFIG_QUOTA + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { + reiserfs_warning(s, "super-6515", + "journaled quota format not specified."); + return 0; + } + if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) && + sb_has_quota_loaded(s, USRQUOTA)) || + (!(*mount_options & (1 << REISERFS_GRPQUOTA)) && + sb_has_quota_loaded(s, GRPQUOTA))) { + reiserfs_warning(s, "super-6516", "quota options must " + "be present when quota is turned on."); + return 0; + } +#endif + + return 1; +} + +static void switch_data_mode(struct super_block *s, unsigned long mode) +{ + REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | + (1 << REISERFS_DATA_ORDERED) | + (1 << REISERFS_DATA_WRITEBACK)); + REISERFS_SB(s)->s_mount_opt |= (1 << mode); +} + +static void handle_data_mode(struct super_block *s, unsigned long mount_options) +{ + if (mount_options & (1 << REISERFS_DATA_LOG)) { + if (!reiserfs_data_log(s)) { + switch_data_mode(s, REISERFS_DATA_LOG); + reiserfs_info(s, "switching to journaled data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { + if (!reiserfs_data_ordered(s)) { + switch_data_mode(s, REISERFS_DATA_ORDERED); + reiserfs_info(s, "switching to ordered data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { + if (!reiserfs_data_writeback(s)) { + switch_data_mode(s, REISERFS_DATA_WRITEBACK); + reiserfs_info(s, "switching to writeback data mode\n"); + } + } +} + +static void handle_barrier_mode(struct super_block *s, unsigned long bits) +{ + int flush = (1 << REISERFS_BARRIER_FLUSH); + int none = (1 << REISERFS_BARRIER_NONE); + int all_barrier = flush | none; + + if (bits & all_barrier) { + REISERFS_SB(s)->s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + +static void handle_attrs(struct super_block *s) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + + if (reiserfs_attrs(s)) { + if (old_format_only(s)) { + reiserfs_warning(s, "super-6517", "cannot support " + "attributes on 3.5.x disk format"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); + return; + } + if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) { + reiserfs_warning(s, "super-6518", "cannot support " + "attributes until flag is set in " + "super-block"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); + } + } +} + +#ifdef CONFIG_QUOTA +static void handle_quota_files(struct super_block *s, char **qf_names, + unsigned int *qfmt) +{ + int i; + + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; + } + if (*qfmt) + REISERFS_SB(s)->s_jquota_fmt = *qfmt; +} +#endif + +static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) +{ + struct reiserfs_super_block *rs; + struct reiserfs_transaction_handle th; + unsigned long blocks; + unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; + unsigned long safe_mask = 0; + unsigned int commit_max_age = (unsigned int)-1; + struct reiserfs_journal *journal = SB_JOURNAL(s); + char *new_opts = kstrdup(arg, GFP_KERNEL); + int err; + char *qf_names[REISERFS_MAXQUOTAS]; + unsigned int qfmt = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + + sync_filesystem(s); + reiserfs_write_lock(s); + +#ifdef CONFIG_QUOTA + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); +#endif + + rs = SB_DISK_SUPER_BLOCK(s); + + if (!reiserfs_parse_options + (s, arg, &mount_options, &blocks, NULL, &commit_max_age, + qf_names, &qfmt)) { +#ifdef CONFIG_QUOTA + for (i = 0; i < REISERFS_MAXQUOTAS; i++) + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(qf_names[i]); +#endif + err = -EINVAL; + goto out_err_unlock; + } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif + + handle_attrs(s); + + /* Add options that are safe here */ + safe_mask |= 1 << REISERFS_SMALLTAIL; + safe_mask |= 1 << REISERFS_LARGETAIL; + safe_mask |= 1 << REISERFS_NO_BORDER; + safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; + safe_mask |= 1 << REISERFS_HASHED_RELOCATION; + safe_mask |= 1 << REISERFS_TEST4; + safe_mask |= 1 << REISERFS_ATTRS; + safe_mask |= 1 << REISERFS_XATTRS_USER; + safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; + safe_mask |= 1 << REISERFS_ERROR_RO; + safe_mask |= 1 << REISERFS_ERROR_CONTINUE; + safe_mask |= 1 << REISERFS_ERROR_PANIC; + safe_mask |= 1 << REISERFS_USRQUOTA; + safe_mask |= 1 << REISERFS_GRPQUOTA; + + /* + * Update the bitmask, taking care to keep + * the bits we're not allowed to change here + */ + REISERFS_SB(s)->s_mount_opt = + (REISERFS_SB(s)-> + s_mount_opt & ~safe_mask) | (mount_options & safe_mask); + + if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } else if (commit_max_age == 0) { + /* 0 means restore defaults. */ + journal->j_max_commit_age = journal->j_default_max_commit_age; + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + } + + if (blocks) { + err = reiserfs_resize(s, blocks); + if (err != 0) + goto out_err_unlock; + } + + if (*mount_flags & MS_RDONLY) { + reiserfs_write_unlock(s); + reiserfs_xattr_init(s, *mount_flags); + /* remount read-only */ + if (s->s_flags & MS_RDONLY) + /* it is read-only already */ + goto out_ok_unlocked; + + err = dquot_suspend(s, -1); + if (err < 0) + goto out_err; + + /* try to remount file system with read-only permissions */ + if (sb_umount_state(rs) == REISERFS_VALID_FS + || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { + goto out_ok_unlocked; + } + + reiserfs_write_lock(s); + + err = journal_begin(&th, s, 10); + if (err) + goto out_err_unlock; + + /* Mounting a rw partition read-only. */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + } else { + /* remount read-write */ + if (!(s->s_flags & MS_RDONLY)) { + reiserfs_write_unlock(s); + reiserfs_xattr_init(s, *mount_flags); + goto out_ok_unlocked; /* We are read-write already */ + } + + if (reiserfs_is_journal_aborted(journal)) { + err = journal->j_errno; + goto out_err_unlock; + } + + handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + + /* now it is safe to call journal_begin */ + s->s_flags &= ~MS_RDONLY; + err = journal_begin(&th, s, 10); + if (err) + goto out_err_unlock; + + /* Mount a partition which is read-only, read-write */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; + set_sb_umount_state(rs, REISERFS_ERROR_FS); + if (!old_format_only(s)) + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; + } + /* this will force a full flush of all journal lists */ + SB_JOURNAL(s)->j_must_wait = 1; + err = journal_end(&th); + if (err) + goto out_err_unlock; + + reiserfs_write_unlock(s); + if (!(*mount_flags & MS_RDONLY)) { + dquot_resume(s, -1); + reiserfs_write_lock(s); + finish_unfinished(s); + reiserfs_write_unlock(s); + reiserfs_xattr_init(s, *mount_flags); + } + +out_ok_unlocked: + replace_mount_options(s, new_opts); + return 0; + +out_err_unlock: + reiserfs_write_unlock(s); +out_err: + kfree(new_opts); + return err; +} + +static int read_super_block(struct super_block *s, int offset) +{ + struct buffer_head *bh; + struct reiserfs_super_block *rs; + int fs_blocksize; + + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2006", + "bread failed (dev %s, block %lu, size %lu)", + s->s_id, offset / s->s_blocksize, + s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (!is_any_reiserfs_magic_string(rs)) { + brelse(bh); + return 1; + } + /* + * ok, reiserfs signature (old or new) found in at the given offset + */ + fs_blocksize = sb_blocksize(rs); + brelse(bh); + sb_set_blocksize(s, fs_blocksize); + + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2007", + "bread failed (dev %s, block %lu, size %lu)", + s->s_id, offset / s->s_blocksize, + s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (sb_blocksize(rs) != s->s_blocksize) { + reiserfs_warning(s, "sh-2011", "can't find a reiserfs " + "filesystem on (dev %s, block %llu, size %lu)", + s->s_id, + (unsigned long long)bh->b_blocknr, + s->s_blocksize); + brelse(bh); + return 1; + } + + if (rs->s_v1.s_root_block == cpu_to_le32(-1)) { + brelse(bh); + reiserfs_warning(s, "super-6519", "Unfinished reiserfsck " + "--rebuild-tree run detected. Please run\n" + "reiserfsck --rebuild-tree and wait for a " + "completion. If that fails\n" + "get newer reiserfsprogs package"); + return 1; + } + + SB_BUFFER_WITH_SB(s) = bh; + SB_DISK_SUPER_BLOCK(s) = rs; + + /* + * magic is of non-standard journal filesystem, look at s_version to + * find which format is in use + */ + if (is_reiserfs_jr(rs)) { + if (sb_version(rs) == REISERFS_VERSION_2) + reiserfs_info(s, "found reiserfs format \"3.6\"" + " with non-standard journal\n"); + else if (sb_version(rs) == REISERFS_VERSION_1) + reiserfs_info(s, "found reiserfs format \"3.5\"" + " with non-standard journal\n"); + else { + reiserfs_warning(s, "sh-2012", "found unknown " + "format \"%u\" of reiserfs with " + "non-standard magic", sb_version(rs)); + return 1; + } + } else + /* + * s_version of standard format may contain incorrect + * information, so we just look at the magic string + */ + reiserfs_info(s, + "found reiserfs format \"%s\" with standard journal\n", + is_reiserfs_3_5(rs) ? "3.5" : "3.6"); + + s->s_op = &reiserfs_sops; + s->s_export_op = &reiserfs_export_ops; +#ifdef CONFIG_QUOTA + s->s_qcop = &reiserfs_qctl_operations; + s->dq_op = &reiserfs_quota_operations; + s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + + /* + * new format is limited by the 32 bit wide i_blocks field, want to + * be one full block below that. + */ + s->s_maxbytes = (512LL << 32) - s->s_blocksize; + return 0; +} + +/* after journal replay, reread all bitmap and super blocks */ +static int reread_meta_blocks(struct super_block *s) +{ + ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s)); + wait_on_buffer(SB_BUFFER_WITH_SB(s)); + if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + reiserfs_warning(s, "reiserfs-2504", "error reading the super"); + return 1; + } + + return 0; +} + +/* hash detection stuff */ + +/* + * if root directory is empty - we set default - Yura's - hash and + * warn about it + * FIXME: we look for only one name in a directory. If tea and yura + * both have the same value - we ask user to send report to the + * mailing list + */ +static __u32 find_hash_out(struct super_block *s) +{ + int retval; + struct inode *inode; + struct cpu_key key; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + struct reiserfs_de_head *deh; + __u32 hash = DEFAULT_HASH; + __u32 deh_hashval, teahash, r5hash, yurahash; + + inode = d_inode(s->s_root); + + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key(s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse(&path); + return UNSET_HASH; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num--; + + set_de_name_and_namelen(&de); + deh = de.de_deh + de.de_entry_num; + + if (deh_offset(deh) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) + hash = YURA_HASH; + reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n"); + goto out; + } + + deh_hashval = GET_HASH_VALUE(deh_offset(deh)); + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); + + if ((teahash == r5hash && deh_hashval == r5hash) || + (teahash == yurahash && deh_hashval == yurahash) || + (r5hash == yurahash && deh_hashval == yurahash)) { + reiserfs_warning(s, "reiserfs-2506", + "Unable to automatically detect hash " + "function. Please mount with -o " + "hash={tea,rupasov,r5}"); + hash = UNSET_HASH; + goto out; + } + + if (deh_hashval == yurahash) + hash = YURA_HASH; + else if (deh_hashval == teahash) + hash = TEA_HASH; + else if (deh_hashval == r5hash) + hash = R5_HASH; + else { + reiserfs_warning(s, "reiserfs-2506", + "Unrecognised hash function"); + hash = UNSET_HASH; + } +out: + pathrelse(&path); + return hash; +} + +/* finds out which hash names are sorted with */ +static int what_hash(struct super_block *s) +{ + __u32 code; + + code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); + + /* + * reiserfs_hash_detect() == true if any of the hash mount options + * were used. We must check them to make sure the user isn't + * using a bad hash value + */ + if (code == UNSET_HASH || reiserfs_hash_detect(s)) + code = find_hash_out(s); + + if (code != UNSET_HASH && reiserfs_hash_detect(s)) { + /* + * detection has found the hash, and we must check against the + * mount options + */ + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { + reiserfs_warning(s, "reiserfs-2507", + "Error, %s hash detected, " + "unable to force rupasov hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { + reiserfs_warning(s, "reiserfs-2508", + "Error, %s hash detected, " + "unable to force tea hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_r5_hash(s) && code != R5_HASH) { + reiserfs_warning(s, "reiserfs-2509", + "Error, %s hash detected, " + "unable to force r5 hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } + } else { + /* + * find_hash_out was not called or + * could not determine the hash + */ + if (reiserfs_rupasov_hash(s)) { + code = YURA_HASH; + } else if (reiserfs_tea_hash(s)) { + code = TEA_HASH; + } else if (reiserfs_r5_hash(s)) { + code = R5_HASH; + } + } + + /* + * if we are mounted RW, and we have a new valid hash code, update + * the super + */ + if (code != UNSET_HASH && + !(s->s_flags & MS_RDONLY) && + code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { + set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); + } + return code; +} + +/* return pointer to appropriate function */ +static hashf_t hash_function(struct super_block *s) +{ + switch (what_hash(s)) { + case TEA_HASH: + reiserfs_info(s, "Using tea hash to sort names\n"); + return keyed_hash; + case YURA_HASH: + reiserfs_info(s, "Using rupasov hash to sort names\n"); + return yura_hash; + case R5_HASH: + reiserfs_info(s, "Using r5 hash to sort names\n"); + return r5_hash; + } + return NULL; +} + +/* this is used to set up correct value for old partitions */ +static int function2code(hashf_t func) +{ + if (func == keyed_hash) + return TEA_HASH; + if (func == yura_hash) + return YURA_HASH; + if (func == r5_hash) + return R5_HASH; + + BUG(); /* should never happen */ + + return 0; +} + +#define SWARN(silent, s, id, ...) \ + if (!(silent)) \ + reiserfs_warning(s, id, __VA_ARGS__) + +static int reiserfs_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *root_inode; + struct reiserfs_transaction_handle th; + int old_format = 0; + unsigned long blocks; + unsigned int commit_max_age = 0; + int jinit_done = 0; + struct reiserfs_iget_args args; + struct reiserfs_super_block *rs; + char *jdev_name; + struct reiserfs_sb_info *sbi; + int errval = -EINVAL; + char *qf_names[REISERFS_MAXQUOTAS] = {}; + unsigned int qfmt = 0; + + save_mount_options(s, data); + + sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + /* Set default values for options: non-aggressive tails, RO on errors */ + sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL); + sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO); + sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); + /* no preallocation minimum, be smart in reiserfs_file_write instead */ + sbi->s_alloc_options.preallocmin = 0; + /* Preallocate by 16 blocks (17-1) at once */ + sbi->s_alloc_options.preallocsize = 17; + /* setup default block allocator options */ + reiserfs_init_alloc_options(s); + + spin_lock_init(&sbi->old_work_lock); + INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits); + mutex_init(&sbi->lock); + sbi->lock_depth = -1; + + sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0, + s->s_id); + if (!sbi->commit_wq) { + SWARN(silent, s, "", "Cannot allocate commit workqueue"); + errval = -ENOMEM; + goto error_unlocked; + } + + jdev_name = NULL; + if (reiserfs_parse_options + (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name, + &commit_max_age, qf_names, &qfmt) == 0) { + goto error_unlocked; + } + if (jdev_name && jdev_name[0]) { + sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL); + if (!sbi->s_jdev) { + SWARN(silent, s, "", "Cannot allocate memory for " + "journal device name"); + goto error; + } + } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif + + if (blocks) { + SWARN(silent, s, "jmacd-7", "resize option for remount only"); + goto error_unlocked; + } + + /* + * try old format (undistributed bitmap, super block in 8-th 1k + * block of a device) + */ + if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) + old_format = 1; + + /* + * try new format (64-th 1k block), which can contain reiserfs + * super block + */ + else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { + SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", + s->s_id); + goto error_unlocked; + } + + rs = SB_DISK_SUPER_BLOCK(s); + /* + * Let's do basic sanity check to verify that underlying device is not + * smaller than the filesystem. If the check fails then abort and + * scream, because bad stuff will happen otherwise. + */ + if (s->s_bdev && s->s_bdev->bd_inode + && i_size_read(s->s_bdev->bd_inode) < + sb_block_count(rs) * sb_blocksize(rs)) { + SWARN(silent, s, "", "Filesystem cannot be " + "mounted because it is bigger than the device"); + SWARN(silent, s, "", "You may need to run fsck " + "or increase size of your LVM partition"); + SWARN(silent, s, "", "Or may be you forgot to " + "reboot after fdisk when it told you to"); + goto error_unlocked; + } + + sbi->s_mount_state = SB_REISERFS_STATE(s); + sbi->s_mount_state = REISERFS_VALID_FS; + + if ((errval = reiserfs_init_bitmap_cache(s))) { + SWARN(silent, s, "jmacd-8", "unable to read bitmap"); + goto error_unlocked; + } + + errval = -EINVAL; +#ifdef CONFIG_REISERFS_CHECK + SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); + SWARN(silent, s, "", "- it is slow mode for debugging."); +#endif + + /* make data=ordered the default */ + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && + !reiserfs_data_writeback(s)) { + sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + } + + if (reiserfs_data_log(s)) { + reiserfs_info(s, "using journaled data mode\n"); + } else if (reiserfs_data_ordered(s)) { + reiserfs_info(s, "using ordered data mode\n"); + } else { + reiserfs_info(s, "using writeback data mode\n"); + } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } + + if (journal_init(s, jdev_name, old_format, commit_max_age)) { + SWARN(silent, s, "sh-2022", + "unable to initialize journal space"); + goto error_unlocked; + } else { + /* + * once this is set, journal_release must be called + * if we error out of the mount + */ + jinit_done = 1; + } + + if (reread_meta_blocks(s)) { + SWARN(silent, s, "jmacd-9", + "unable to reread meta blocks after journal init"); + goto error_unlocked; + } + + if (replay_only(s)) + goto error_unlocked; + + if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { + SWARN(silent, s, "clm-7000", + "Detected readonly device, marking FS readonly"); + s->s_flags |= MS_RDONLY; + } + args.objectid = REISERFS_ROOT_OBJECTID; + args.dirid = REISERFS_ROOT_PARENT_OBJECTID; + root_inode = + iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, + reiserfs_init_locked_inode, (void *)&args); + if (!root_inode) { + SWARN(silent, s, "jmacd-10", "get root inode failed"); + goto error_unlocked; + } + + /* + * This path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + + if (root_inode->i_state & I_NEW) { + reiserfs_read_locked_inode(root_inode, &args); + unlock_new_inode(root_inode); + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) + goto error; + /* define and initialize hash function */ + sbi->s_hash_function = hash_function(s); + if (sbi->s_hash_function == NULL) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + if (is_reiserfs_3_5(rs) + || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1)) + set_bit(REISERFS_3_5, &sbi->s_properties); + else if (old_format) + set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties); + else + set_bit(REISERFS_3_6, &sbi->s_properties); + + if (!(s->s_flags & MS_RDONLY)) { + + errval = journal_begin(&th, s, 1); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + + set_sb_umount_state(rs, REISERFS_ERROR_FS); + set_sb_fs_state(rs, 0); + + /* + * Clear out s_bmap_nr if it would wrap. We can handle this + * case, but older revisions can't. This will cause the + * file system to fail mount on those older implementations, + * avoiding corruption. -jeffm + */ + if (bmap_would_wrap(reiserfs_bmap_count(s)) && + sb_bmap_nr(rs) != 0) { + reiserfs_warning(s, "super-2030", "This file system " + "claims to use %u bitmap blocks in " + "its super block, but requires %u. " + "Clearing to zero.", sb_bmap_nr(rs), + reiserfs_bmap_count(s)); + + set_sb_bmap_nr(rs, 0); + } + + if (old_format_only(s)) { + /* + * filesystem of format 3.5 either with standard + * or non-standard journal + */ + if (convert_reiserfs(s)) { + /* and -o conv is given */ + if (!silent) + reiserfs_info(s, + "converting 3.5 filesystem to the 3.6 format"); + + if (is_reiserfs_3_5(rs)) + /* + * put magic string of 3.6 format. + * 2.2 will not be able to + * mount this filesystem anymore + */ + memcpy(rs->s_v1.s_magic, + reiserfs_3_6_magic_string, + sizeof + (reiserfs_3_6_magic_string)); + + set_sb_version(rs, REISERFS_VERSION_2); + reiserfs_convert_objectid_map_v1(s); + set_bit(REISERFS_3_6, &sbi->s_properties); + clear_bit(REISERFS_3_5, &sbi->s_properties); + } else if (!silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + } else + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + + + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + errval = journal_end(&th); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + reiserfs_write_unlock(s); + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error_unlocked; + } + reiserfs_write_lock(s); + + /* + * look for files which were to be removed in previous session + */ + finish_unfinished(s); + } else { + if (old_format_only(s) && !silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + + reiserfs_write_unlock(s); + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error_unlocked; + } + reiserfs_write_lock(s); + } + /* + * mark hash in super block: it could be unset. overwrite should be ok + */ + set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); + + handle_attrs(s); + + reiserfs_proc_info_init(s); + + init_waitqueue_head(&(sbi->s_wait)); + spin_lock_init(&sbi->bitmap_lock); + + reiserfs_write_unlock(s); + + return (0); + +error: + reiserfs_write_unlock(s); + +error_unlocked: + /* kill the commit thread, free journal ram */ + if (jinit_done) { + reiserfs_write_lock(s); + journal_release_error(NULL, s); + reiserfs_write_unlock(s); + } + + if (sbi->commit_wq) + destroy_workqueue(sbi->commit_wq); + + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + + reiserfs_free_bitmap_cache(s); + if (SB_BUFFER_WITH_SB(s)) + brelse(SB_BUFFER_WITH_SB(s)); +#ifdef CONFIG_QUOTA + { + int j; + for (j = 0; j < REISERFS_MAXQUOTAS; j++) + kfree(qf_names[j]); + } +#endif + kfree(sbi); + + s->s_fs_info = NULL; + return errval; +} + +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb); + + buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize)); + buf->f_bfree = sb_free_blocks(rs); + buf->f_bavail = buf->f_bfree; + buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; + buf->f_bsize = dentry->d_sb->s_blocksize; + /* changed to accommodate gcc folks. */ + buf->f_type = REISERFS_SUPER_MAGIC; + buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2, + sizeof(rs->s_uuid)/2); + + return 0; +} + +#ifdef CONFIG_QUOTA +static int reiserfs_write_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + int depth; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + depth = reiserfs_write_unlock_nested(dquot->dq_sb); + ret = dquot_commit(dquot); + reiserfs_write_lock_nested(dquot->dq_sb, depth); + err = journal_end(&th); + if (!ret && err) + ret = err; +out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_acquire_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + int depth; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + depth = reiserfs_write_unlock_nested(dquot->dq_sb); + ret = dquot_acquire(dquot); + reiserfs_write_lock_nested(dquot->dq_sb, depth); + err = journal_end(&th); + if (!ret && err) + ret = err; +out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_release_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + reiserfs_write_unlock(dquot->dq_sb); + if (ret) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + goto out; + } + ret = dquot_release(dquot); + reiserfs_write_lock(dquot->dq_sb); + err = journal_end(&th); + if (!ret && err) + ret = err; + reiserfs_write_unlock(dquot->dq_sb); +out: + return ret; +} + +static int reiserfs_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? */ + if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return reiserfs_write_dquot(dquot); + } else + return dquot_mark_dquot_dirty(dquot); +} + +static int reiserfs_write_info(struct super_block *sb, int type) +{ + struct reiserfs_transaction_handle th; + int ret, err; + int depth; + + /* Data block + inode block */ + reiserfs_write_lock(sb); + ret = journal_begin(&th, sb, 2); + if (ret) + goto out; + depth = reiserfs_write_unlock_nested(sb); + ret = dquot_commit_info(sb, type); + reiserfs_write_lock_nested(sb, depth); + err = journal_end(&th); + if (!ret && err) + ret = err; +out: + reiserfs_write_unlock(sb); + return ret; +} + +/* + * Turn on quotas during mount time - we need to find the quota file and such... + */ +static int reiserfs_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], + REISERFS_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + struct inode *inode; + struct reiserfs_transaction_handle th; + int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; + + reiserfs_write_lock(sb); + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) { + err = -EINVAL; + goto out; + } + + /* Quotafile not on the same filesystem? */ + if (path->dentry->d_sb != sb) { + err = -EXDEV; + goto out; + } + inode = d_inode(path->dentry); + /* + * We must not pack tails for quota files on reiserfs for quota + * IO to work + */ + if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { + err = reiserfs_unpack(inode, NULL); + if (err) { + reiserfs_warning(sb, "super-6520", + "Unpacking tail of quota file failed" + " (%d). Cannot turn on quotas.", err); + err = -EINVAL; + goto out; + } + mark_inode_dirty(inode); + } + /* Journaling quota? */ + if (REISERFS_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (path->dentry->d_parent != sb->s_root) + reiserfs_warning(sb, "super-6521", + "Quota file not on filesystem root. " + "Journalled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (reiserfs_file_data_log(inode)) { + /* Just start temporary transaction and finish it */ + err = journal_begin(&th, sb, 1); + if (err) + goto out; + err = journal_end_sync(&th); + if (err) + goto out; + } + reiserfs_write_unlock(sb); + return dquot_quota_on(sb, type, format_id, path); +out: + reiserfs_write_unlock(sb); + return err; +} + +/* + * Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races + */ +static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + size_t toread; + struct buffer_head tmp_bh, *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = + sb->s_blocksize - offset < + toread ? sb->s_blocksize - offset : toread; + tmp_bh.b_state = 0; + /* + * Quota files are without tails so we can safely + * use this function + */ + reiserfs_write_lock(sb); + err = reiserfs_get_block(inode, blk, &tmp_bh, 0); + reiserfs_write_unlock(sb); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* + * Write to quotafile (we know the transaction is already started and has + * enough credits) + */ +static ssize_t reiserfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head tmp_bh, *bh; + + if (!current->journal_info) { + printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + tmp_bh.b_state = 0; + reiserfs_write_lock(sb); + err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + reiserfs_write_unlock(sb); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data + offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + reiserfs_write_lock(sb); + reiserfs_prepare_for_journal(sb, bh, 1); + journal_mark_dirty(current->journal_info, bh); + if (!journal_quota) + reiserfs_add_ordered_list(inode, bh); + reiserfs_write_unlock(sb); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) + return err; + if (inode->i_size < off + len - towrite) + i_size_write(inode, off + len - towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + return len - towrite; +} + +#endif + +static struct dentry *get_super_block(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); +} + +static int __init init_reiserfs_fs(void) +{ + int ret; + + ret = init_inodecache(); + if (ret) + return ret; + + reiserfs_proc_info_global_init(); + + ret = register_filesystem(&reiserfs_fs_type); + if (ret) + goto out; + + return 0; +out: + reiserfs_proc_info_global_done(); + destroy_inodecache(); + + return ret; +} + +static void __exit exit_reiserfs_fs(void) +{ + reiserfs_proc_info_global_done(); + unregister_filesystem(&reiserfs_fs_type); + destroy_inodecache(); +} + +struct file_system_type reiserfs_fs_type = { + .owner = THIS_MODULE, + .name = "reiserfs", + .mount = get_super_block, + .kill_sb = reiserfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("reiserfs"); + +MODULE_DESCRIPTION("ReiserFS journaled filesystem"); +MODULE_AUTHOR("Hans Reiser "); +MODULE_LICENSE("GPL"); + +module_init(init_reiserfs_fs); +module_exit(exit_reiserfs_fs); diff --git a/kernel/fs/reiserfs/tail_conversion.c b/kernel/fs/reiserfs/tail_conversion.c new file mode 100644 index 000000000..f41e19b4b --- /dev/null +++ b/kernel/fs/reiserfs/tail_conversion.c @@ -0,0 +1,317 @@ +/* + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright + * details + */ + +#include +#include +#include +#include "reiserfs.h" + +/* + * access to tail : when one is going to read tail it must make sure, that is + * not running. direct2indirect and indirect2direct can not run concurrently + */ + +/* + * Converts direct items to an unformatted node. Panics if file has no + * tail. -ENOSPC if no disk space for conversion + */ +/* + * path points to first direct item of the file regardless of how many of + * them are there + */ +int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, + struct treepath *path, struct buffer_head *unbh, + loff_t tail_offset) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *up_to_date_bh; + struct item_head *p_le_ih = tp_item_head(path); + unsigned long total_tail = 0; + + /* Key to search for the last byte of the converted item. */ + struct cpu_key end_key; + + /* + * new indirect item to be inserted or key + * of unfm pointer to be pasted + */ + struct item_head ind_ih; + int blk_size; + /* returned value for reiserfs_insert_item and clones */ + int retval; + /* Handle on an unformatted node that will be inserted in the tree. */ + unp_t unfm_ptr; + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_direct2indirect++; + + blk_size = sb->s_blocksize; + + /* + * and key to search for append or insert pointer to the new + * unformatted node. + */ + copy_item_head(&ind_ih, p_le_ih); + set_le_ih_k_offset(&ind_ih, tail_offset); + set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); + + /* Set the key to search for the place for new unfm pointer */ + make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); + + /* FIXME: we could avoid this */ + if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { + reiserfs_error(sb, "PAP-14030", + "pasted or inserted byte exists in " + "the tree %K. Use fsck to repair.", &end_key); + pathrelse(path); + return -EIO; + } + + p_le_ih = tp_item_head(path); + + unfm_ptr = cpu_to_le32(unbh->b_blocknr); + + if (is_statdata_le_ih(p_le_ih)) { + /* Insert new indirect item. */ + set_ih_free_space(&ind_ih, 0); /* delete at nearest future */ + put_ih_item_len(&ind_ih, UNFM_P_SIZE); + PATH_LAST_POSITION(path)++; + retval = + reiserfs_insert_item(th, path, &end_key, &ind_ih, inode, + (char *)&unfm_ptr); + } else { + /* Paste into last indirect item of an object. */ + retval = reiserfs_paste_into_item(th, path, &end_key, inode, + (char *)&unfm_ptr, + UNFM_P_SIZE); + } + if (retval) { + return retval; + } + /* + * note: from here there are two keys which have matching first + * three key components. They only differ by the fourth one. + */ + + /* Set the key to search for the direct items of the file */ + make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, + 4); + + /* + * Move bytes from the direct items to the new unformatted node + * and delete them. + */ + while (1) { + int tail_size; + + /* + * end_key.k_offset is set so, that we will always have found + * last item of the file + */ + if (search_for_position_by_key(sb, &end_key, path) == + POSITION_FOUND) + reiserfs_panic(sb, "PAP-14050", + "direct item (%K) not found", &end_key); + p_le_ih = tp_item_head(path); + RFALSE(!is_direct_le_ih(p_le_ih), + "vs-14055: direct item expected(%K), found %h", + &end_key, p_le_ih); + tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1)) + + ih_item_len(p_le_ih) - 1; + + /* + * we only send the unbh pointer if the buffer is not + * up to date. this avoids overwriting good data from + * writepage() with old data from the disk or buffer cache + * Special case: unbh->b_page will be NULL if we are coming + * through DIRECT_IO handler here. + */ + if (!unbh->b_page || buffer_uptodate(unbh) + || PageUptodate(unbh->b_page)) { + up_to_date_bh = NULL; + } else { + up_to_date_bh = unbh; + } + retval = reiserfs_delete_item(th, path, &end_key, inode, + up_to_date_bh); + + total_tail += retval; + + /* done: file does not have direct items anymore */ + if (tail_size == retval) + break; + + } + /* + * if we've copied bytes from disk into the page, we need to zero + * out the unused part of the block (it was not up to date before) + */ + if (up_to_date_bh) { + unsigned pgoff = + (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); + char *kaddr = kmap_atomic(up_to_date_bh->b_page); + memset(kaddr + pgoff, 0, blk_size - total_tail); + kunmap_atomic(kaddr); + } + + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + + return 0; +} + +/* stolen from fs/buffer.c */ +void reiserfs_unmap_buffer(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + BUG(); + } + clear_buffer_dirty(bh); + /* + * Remove the buffer from whatever list it belongs to. We are mostly + * interested in removing it from per-sb j_dirty_buffers list, to avoid + * BUG() on attempt to write not mapped buffer + */ + if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { + struct inode *inode = bh->b_page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + spin_lock(&j->j_dirty_buffers_lock); + list_del_init(&bh->b_assoc_buffers); + reiserfs_free_jh(bh); + spin_unlock(&j->j_dirty_buffers_lock); + } + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + bh->b_bdev = NULL; + unlock_buffer(bh); +} + +/* + * this first locks inode (neither reads nor sync are permitted), + * reads tail through page cache, insert direct item. When direct item + * inserted successfully inode is left locked. Return value is always + * what we expect from it (number of cut bytes). But when tail remains + * in the unformatted node, we set mode to SKIP_BALANCING and unlock + * inode + */ +int indirect2direct(struct reiserfs_transaction_handle *th, + struct inode *inode, struct page *page, + struct treepath *path, /* path to the indirect item. */ + const struct cpu_key *item_key, /* Key to look for + * unformatted node + * pointer to be cut. */ + loff_t n_new_file_size, /* New file size. */ + char *mode) +{ + struct super_block *sb = inode->i_sb; + struct item_head s_ih; + unsigned long block_size = sb->s_blocksize; + char *tail; + int tail_len, round_tail_len; + loff_t pos, pos1; /* position of first byte of the tail */ + struct cpu_key key; + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_indirect2direct++; + + *mode = M_SKIP_BALANCING; + + /* store item head path points to. */ + copy_item_head(&s_ih, tp_item_head(path)); + + tail_len = (n_new_file_size & (block_size - 1)); + if (get_inode_sd_version(inode) == STAT_DATA_V2) + round_tail_len = ROUND_UP(tail_len); + else + round_tail_len = tail_len; + + pos = + le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * sb->s_blocksize; + pos1 = pos; + + /* + * we are protected by i_mutex. The tail can not disapper, not + * append can be done either + * we are in truncate or packing tail in file_release + */ + + tail = (char *)kmap(page); /* this can schedule */ + + if (path_changed(&s_ih, path)) { + /* re-search indirect item */ + if (search_for_position_by_key(sb, item_key, path) + == POSITION_NOT_FOUND) + reiserfs_panic(sb, "PAP-5520", + "item to be converted %K does not exist", + item_key); + copy_item_head(&s_ih, tp_item_head(path)); +#ifdef CONFIG_REISERFS_CHECK + pos = le_ih_k_offset(&s_ih) - 1 + + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * sb->s_blocksize; + if (pos != pos1) + reiserfs_panic(sb, "vs-5530", "tail position " + "changed while we were reading it"); +#endif + } + + /* Set direct item header to insert. */ + make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode), + pos1 + 1, TYPE_DIRECT, round_tail_len, + 0xffff /*ih_free_space */ ); + + /* + * we want a pointer to the first byte of the tail in the page. + * the page was locked and this part of the page was up to date when + * indirect2direct was called, so we know the bytes are still valid + */ + tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); + + PATH_LAST_POSITION(path)++; + + key = *item_key; + set_cpu_key_k_type(&key, TYPE_DIRECT); + key.key_length = 4; + /* Insert tail as new direct item in the tree */ + if (reiserfs_insert_item(th, path, &key, &s_ih, inode, + tail ? tail : NULL) < 0) { + /* + * No disk memory. So we can not convert last unformatted node + * to the direct item. In this case we used to adjust + * indirect items's ih_free_space. Now ih_free_space is not + * used, it would be ideal to write zeros to corresponding + * unformatted node. For now i_size is considered as guard for + * going out of file size + */ + kunmap(page); + return block_size - round_tail_len; + } + kunmap(page); + + /* make sure to get the i_blocks changes from reiserfs_insert_item */ + reiserfs_update_sd(th, inode); + + /* + * note: we have now the same as in above direct2indirect + * conversion: there are two keys which have matching first three + * key components. They only differ by the fourth one. + */ + + /* + * We have inserted new direct item and must remove last + * unformatted node. + */ + *mode = M_CUT; + + /* we store position of first direct item in the in-core inode */ + /* mark_file_with_tail (inode, pos1 + 1); */ + REISERFS_I(inode)->i_first_direct_byte = pos1 + 1; + + return block_size - round_tail_len; +} diff --git a/kernel/fs/reiserfs/xattr.c b/kernel/fs/reiserfs/xattr.c new file mode 100644 index 000000000..e87f9b52b --- /dev/null +++ b/kernel/fs/reiserfs/xattr.c @@ -0,0 +1,1064 @@ +/* + * linux/fs/reiserfs/xattr.c + * + * Copyright (c) 2002 by Jeff Mahoney, + * + */ + +/* + * In order to implement EA/ACLs in a clean, backwards compatible manner, + * they are implemented as files in a "private" directory. + * Each EA is in it's own file, with the directory layout like so (/ is assumed + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, + * directories named using the capital-hex form of the objectid and + * generation number are used. Inside each directory are individual files + * named with the name of the extended attribute. + * + * So, for objectid 12648430, we could have: + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type + * .. or similar. + * + * The file contents are the text of the EA. The size is known based on the + * stat data describing the file. + * + * In the case of system.posix_acl_access and system.posix_acl_default, since + * these are special cases for filesystem ACLs, they are interpreted by the + * kernel, in addition, they are negatively and positively cached and attached + * to the inode so that unnecessary lookups are avoided. + * + * Locking works like so: + * Directory components (xattr root, xattr dir) are protectd by their i_mutex. + * The xattrs themselves are protected by the xattr_sem. + */ + +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" +#include +#include +#include +#include +#include +#include + +#define PRIVROOT_NAME ".reiserfs_priv" +#define XAROOT_NAME "xattrs" + + +/* + * Helpers for inode ops. We do this so that we don't have all the VFS + * overhead and also for proper i_mutex annotation. + * dir->i_mutex must be held for all of them. + */ +#ifdef CONFIG_REISERFS_FS_XATTR +static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) +{ + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + return dir->i_op->create(dir, dentry, mode, true); +} +#endif + +static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + return dir->i_op->mkdir(dir, dentry, mode); +} + +/* + * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr + * mutation ops aren't called during rename or splace, which are the + * only other users of I_MUTEX_CHILD. It violates the ordering, but that's + * better than allocating another subclass just for this code. + */ +static int xattr_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + error = dir->i_op->unlink(dir, dentry); + mutex_unlock(&d_inode(dentry)->i_mutex); + + if (!error) + d_delete(dentry); + return error; +} + +static int xattr_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + error = dir->i_op->rmdir(dir, dentry); + if (!error) + d_inode(dentry)->i_flags |= S_DEAD; + mutex_unlock(&d_inode(dentry)->i_mutex); + if (!error) + d_delete(dentry); + + return error; +} + +#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) + +static struct dentry *open_xa_root(struct super_block *sb, int flags) +{ + struct dentry *privroot = REISERFS_SB(sb)->priv_root; + struct dentry *xaroot; + + if (d_really_is_negative(privroot)) + return ERR_PTR(-ENODATA); + + mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); + + xaroot = dget(REISERFS_SB(sb)->xattr_root); + if (!xaroot) + xaroot = ERR_PTR(-ENODATA); + else if (d_really_is_negative(xaroot)) { + int err = -ENODATA; + + if (xattr_may_create(flags)) + err = xattr_mkdir(d_inode(privroot), xaroot, 0700); + if (err) { + dput(xaroot); + xaroot = ERR_PTR(err); + } + } + + mutex_unlock(&d_inode(privroot)->i_mutex); + return xaroot; +} + +static struct dentry *open_xa_dir(const struct inode *inode, int flags) +{ + struct dentry *xaroot, *xadir; + char namebuf[17]; + + xaroot = open_xa_root(inode->i_sb, flags); + if (IS_ERR(xaroot)) + return xaroot; + + snprintf(namebuf, sizeof(namebuf), "%X.%X", + le32_to_cpu(INODE_PKEY(inode)->k_objectid), + inode->i_generation); + + mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); + + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); + if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { + int err = -ENODATA; + + if (xattr_may_create(flags)) + err = xattr_mkdir(d_inode(xaroot), xadir, 0700); + if (err) { + dput(xadir); + xadir = ERR_PTR(err); + } + } + + mutex_unlock(&d_inode(xaroot)->i_mutex); + dput(xaroot); + return xadir; +} + +/* + * The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. + */ +struct reiserfs_dentry_buf { + struct dir_context ctx; + struct dentry *xadir; + int count; + struct dentry *dentries[8]; +}; + +static int +fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct reiserfs_dentry_buf *dbuf = + container_of(ctx, struct reiserfs_dentry_buf, ctx); + struct dentry *dentry; + + WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); + + if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) + return -ENOSPC; + + if (name[0] == '.' && (namelen < 2 || + (namelen == 2 && name[1] == '.'))) + return 0; + + dentry = lookup_one_len(name, dbuf->xadir, namelen); + if (IS_ERR(dentry)) { + return PTR_ERR(dentry); + } else if (d_really_is_negative(dentry)) { + /* A directory entry exists, but no file? */ + reiserfs_error(dentry->d_sb, "xattr-20003", + "Corrupted directory: xattr %pd listed but " + "not found for file %pd.\n", + dentry, dbuf->xadir); + dput(dentry); + return -EIO; + } + + dbuf->dentries[dbuf->count++] = dentry; + return 0; +} + +static void +cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) +{ + int i; + + for (i = 0; i < buf->count; i++) + if (buf->dentries[i]) + dput(buf->dentries[i]); +} + +static int reiserfs_for_each_xattr(struct inode *inode, + int (*action)(struct dentry *, void *), + void *data) +{ + struct dentry *dir; + int i, err = 0; + struct reiserfs_dentry_buf buf = { + .ctx.actor = fill_with_dentries, + }; + + /* Skip out, an xattr has no xattrs associated with it */ + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) + return 0; + + dir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } else if (d_really_is_negative(dir)) { + err = 0; + goto out_dir; + } + + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + + buf.xadir = dir; + while (1) { + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); + if (err) + break; + if (!buf.count) + break; + for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { + struct dentry *dentry = buf.dentries[i]; + + if (!d_is_dir(dentry)) + err = action(dentry, data); + + dput(dentry); + buf.dentries[i] = NULL; + } + if (err) + break; + buf.count = 0; + } + mutex_unlock(&d_inode(dir)->i_mutex); + + cleanup_dentry_buf(&buf); + + if (!err) { + /* + * We start a transaction here to avoid a ABBA situation + * between the xattr root's i_mutex and the journal lock. + * This doesn't incur much additional overhead since the + * new transaction will just nest inside the + * outer transaction. + */ + int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + + reiserfs_write_lock(inode->i_sb); + err = journal_begin(&th, inode->i_sb, blocks); + reiserfs_write_unlock(inode->i_sb); + if (!err) { + int jerror; + + mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, + I_MUTEX_XATTR); + err = action(dir, data); + reiserfs_write_lock(inode->i_sb); + jerror = journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&d_inode(dir->d_parent)->i_mutex); + err = jerror ?: err; + } + } +out_dir: + dput(dir); +out: + /* -ENODATA isn't an error */ + if (err == -ENODATA) + err = 0; + return err; +} + +static int delete_one_xattr(struct dentry *dentry, void *data) +{ + struct inode *dir = d_inode(dentry->d_parent); + + /* This is the xattr dir, handle specially. */ + if (d_is_dir(dentry)) + return xattr_rmdir(dir, dentry); + + return xattr_unlink(dir, dentry); +} + +static int chown_one_xattr(struct dentry *dentry, void *data) +{ + struct iattr *attrs = data; + int ia_valid = attrs->ia_valid; + int err; + + /* + * We only want the ownership bits. Otherwise, we'll do + * things like change a directory to a regular file if + * ATTR_MODE is set. + */ + attrs->ia_valid &= (ATTR_UID|ATTR_GID); + err = reiserfs_setattr(dentry, attrs); + attrs->ia_valid = ia_valid; + + return err; +} + +/* No i_mutex, but the inode is unconnected. */ +int reiserfs_delete_xattrs(struct inode *inode) +{ + int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); + + if (err) + reiserfs_warning(inode->i_sb, "jdm-20004", + "Couldn't delete all xattrs (%d)\n", err); + return err; +} + +/* inode->i_mutex: down */ +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) +{ + int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); + + if (err) + reiserfs_warning(inode->i_sb, "jdm-20007", + "Couldn't chown all xattrs (%d)\n", err); + return err; +} + +#ifdef CONFIG_REISERFS_FS_XATTR +/* + * Returns a dentry corresponding to a specific extended attribute file + * for the inode. If flags allow, the file is created. Otherwise, a + * valid or negative dentry, or an error is returned. + */ +static struct dentry *xattr_lookup(struct inode *inode, const char *name, + int flags) +{ + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir(inode, flags); + if (IS_ERR(xadir)) + return ERR_CAST(xadir); + + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + xafile = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(xafile)) { + err = PTR_ERR(xafile); + goto out; + } + + if (d_really_is_positive(xafile) && (flags & XATTR_CREATE)) + err = -EEXIST; + + if (d_really_is_negative(xafile)) { + err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_create(d_inode(xadir), xafile, + 0700|S_IFREG); + } + + if (err) + dput(xafile); +out: + mutex_unlock(&d_inode(xadir)->i_mutex); + dput(xadir); + if (err) + return ERR_PTR(err); + return xafile; +} + +/* Internal operations on file data */ +static inline void reiserfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static struct page *reiserfs_get_page(struct inode *dir, size_t n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page; + /* + * We can deadlock if we try to free dentries, + * and an unlink/rmdir has just occurred - GFP_NOFS avoids this + */ + mapping_set_gfp_mask(mapping, GFP_NOFS); + page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + reiserfs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline __u32 xattr_hash(const char *msg, int len) +{ + return csum_partial(msg, len, 0); +} + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +static void update_ctime(struct inode *inode) +{ + struct timespec now = current_fs_time(inode->i_sb); + + if (inode_unhashed(inode) || !inode->i_nlink || + timespec_equal(&inode->i_ctime, &now)) + return; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +static int lookup_and_delete_xattr(struct inode *inode, const char *name) +{ + int err = 0; + struct dentry *dentry, *xadir; + + xadir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(xadir)) + return PTR_ERR(xadir); + + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + dentry = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_dput; + } + + if (d_really_is_positive(dentry)) { + err = xattr_unlink(d_inode(xadir), dentry); + update_ctime(inode); + } + + dput(dentry); +out_dput: + mutex_unlock(&d_inode(xadir)->i_mutex); + dput(xadir); + return err; +} + + +/* Generic extended attribute operations that can be used by xa plugins */ + +/* + * inode->i_mutex: down + */ +int +reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, + struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) +{ + int err = 0; + struct dentry *dentry; + struct page *page; + char *data; + size_t file_pos = 0; + size_t buffer_pos = 0; + size_t new_size; + __u32 xahash = 0; + + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (!buffer) { + err = lookup_and_delete_xattr(inode, name); + return err; + } + + dentry = xattr_lookup(inode, name, flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + down_write(&REISERFS_I(inode)->i_xattr_sem); + + xahash = xattr_hash(buffer, buffer_size); + while (buffer_pos < buffer_size || buffer_pos == 0) { + size_t chunk; + size_t skip = 0; + size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = buffer_size - buffer_pos; + + page = reiserfs_get_page(d_inode(dentry), file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + lock_page(page); + data = page_address(page); + + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh; + + skip = file_pos = sizeof(struct reiserfs_xattr_header); + if (chunk + skip > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE - skip; + rxh = (struct reiserfs_xattr_header *)data; + rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC); + rxh->h_hash = cpu_to_le32(xahash); + } + + reiserfs_write_lock(inode->i_sb); + err = __reiserfs_write_begin(page, page_offset, chunk + skip); + if (!err) { + if (buffer) + memcpy(data + skip, buffer + buffer_pos, chunk); + err = reiserfs_commit_write(NULL, page, page_offset, + page_offset + chunk + + skip); + } + reiserfs_write_unlock(inode->i_sb); + unlock_page(page); + reiserfs_put_page(page); + buffer_pos += chunk; + file_pos += chunk; + skip = 0; + if (err || buffer_size == 0 || !buffer) + break; + } + + new_size = buffer_size + sizeof(struct reiserfs_xattr_header); + if (!err && new_size < i_size_read(d_inode(dentry))) { + struct iattr newattrs = { + .ia_ctime = current_fs_time(inode->i_sb), + .ia_size = new_size, + .ia_valid = ATTR_SIZE | ATTR_CTIME, + }; + + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_dio_wait(d_inode(dentry)); + + err = reiserfs_setattr(dentry, &newattrs); + mutex_unlock(&d_inode(dentry)->i_mutex); + } else + update_ctime(inode); +out_unlock: + up_write(&REISERFS_I(inode)->i_xattr_sem); + dput(dentry); + return err; +} + +/* We need to start a transaction to maintain lock ordering */ +int reiserfs_xattr_set(struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) +{ + + struct reiserfs_transaction_handle th; + int error, error2; + size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size); + + if (!(flags & XATTR_REPLACE)) + jbegin_count += reiserfs_xattr_jcreate_nblocks(inode); + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); + if (error) { + return error; + } + + error = reiserfs_xattr_set_handle(&th, inode, name, + buffer, buffer_size, flags); + + reiserfs_write_lock(inode->i_sb); + error2 = journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + if (error == 0) + error = error2; + + return error; +} + +/* + * inode->i_mutex: down + */ +int +reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, + size_t buffer_size) +{ + ssize_t err = 0; + struct dentry *dentry; + size_t isize; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct page *page; + __u32 hash = 0; + + if (name == NULL) + return -EINVAL; + + /* + * We can't have xattrs attached to v1 items since they don't have + * generation numbers + */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + dentry = xattr_lookup(inode, name, XATTR_REPLACE); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + + down_read(&REISERFS_I(inode)->i_xattr_sem); + + isize = i_size_read(d_inode(dentry)); + + /* Just return the size needed */ + if (buffer == NULL) { + err = isize - sizeof(struct reiserfs_xattr_header); + goto out_unlock; + } + + if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { + err = -ERANGE; + goto out_unlock; + } + + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; + + page = reiserfs_get_page(d_inode(dentry), file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + lock_page(page); + data = page_address(page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. */ + if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { + unlock_page(page); + reiserfs_put_page(page); + reiserfs_warning(inode->i_sb, "jdm-20001", + "Invalid magic for xattr (%s) " + "associated with %k", name, + INODE_PKEY(inode)); + err = -EIO; + goto out_unlock; + } + hash = le32_to_cpu(rxh->h_hash); + } + memcpy(buffer + buffer_pos, data + skip, chunk); + unlock_page(page); + reiserfs_put_page(page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof(struct reiserfs_xattr_header); + + if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != + hash) { + reiserfs_warning(inode->i_sb, "jdm-20002", + "Invalid hash for xattr (%s) associated " + "with %k", name, INODE_PKEY(inode)); + err = -EIO; + } + +out_unlock: + up_read(&REISERFS_I(inode)->i_xattr_sem); + dput(dentry); + +out: + return err; +} + +/* + * In order to implement different sets of xattr operations for each xattr + * prefix with the generic xattr API, a filesystem should create a + * null-terminated array of struct xattr_handler (one for each prefix) and + * hang a pointer to it off of the s_xattr field of the superblock. + * + * The generic_fooxattr() functions will use this list to dispatch xattr + * operations to the correct xattr_handler. + */ +#define for_each_xattr_handler(handlers, handler) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) + +/* This is the implementation for the xattr plugin infrastructure */ +static inline const struct xattr_handler * +find_xattr_handler_prefix(const struct xattr_handler **handlers, + const char *name) +{ + const struct xattr_handler *xah; + + if (!handlers) + return NULL; + + for_each_xattr_handler(handlers, xah) { + if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0) + break; + } + + return xah; +} + + +/* + * Inode operation getxattr() + */ +ssize_t +reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, + size_t size) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->get(dentry, name, buffer, size, handler->flags); +} + +/* + * Inode operation setxattr() + * + * d_inode(dentry)->i_mutex down + */ +int +reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->set(dentry, name, value, size, flags, handler->flags); +} + +/* + * Inode operation removexattr() + * + * d_inode(dentry)->i_mutex down + */ +int reiserfs_removexattr(struct dentry *dentry, const char *name) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); +} + +struct listxattr_buf { + struct dir_context ctx; + size_t size; + size_t pos; + char *buf; + struct dentry *dentry; +}; + +static int listxattr_filler(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct listxattr_buf *b = + container_of(ctx, struct listxattr_buf, ctx); + size_t size; + + if (name[0] != '.' || + (namelen != 1 && (name[1] != '.' || namelen != 2))) { + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, + name); + if (!handler) /* Unsupported xattr name */ + return 0; + if (b->buf) { + size = handler->list(b->dentry, b->buf + b->pos, + b->size, name, namelen, + handler->flags); + if (size > b->size) + return -ERANGE; + } else { + size = handler->list(b->dentry, NULL, 0, name, + namelen, handler->flags); + } + + b->pos += size; + } + return 0; +} + +/* + * Inode operation listxattr() + * + * We totally ignore the generic listxattr here because it would be stupid + * not to. Since the xattrs are organized in a directory, we can just + * readdir to find them. + */ +ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) +{ + struct dentry *dir; + int err = 0; + struct listxattr_buf buf = { + .ctx.actor = listxattr_filler, + .dentry = dentry, + .buf = buffer, + .size = buffer ? size : 0, + }; + + if (d_really_is_negative(dentry)) + return -EINVAL; + + if (!dentry->d_sb->s_xattr || + get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + return -EOPNOTSUPP; + + dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + if (err == -ENODATA) + err = 0; /* Not an error if there aren't any xattrs */ + goto out; + } + + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); + mutex_unlock(&d_inode(dir)->i_mutex); + + if (!err) + err = buf.pos; + + dput(dir); +out: + return err; +} + +static int create_privroot(struct dentry *dentry) +{ + int err; + struct inode *inode = d_inode(dentry->d_parent); + + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + + err = xattr_mkdir(inode, dentry, 0700); + if (err || d_really_is_negative(dentry)) { + reiserfs_warning(dentry->d_sb, "jdm-20006", + "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. " + "Failing mount."); + return -EOPNOTSUPP; + } + + d_inode(dentry)->i_flags |= S_PRIVATE; + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " + "storage.\n", PRIVROOT_NAME); + + return 0; +} + +#else +int __init reiserfs_xattr_register_handlers(void) { return 0; } +void reiserfs_xattr_unregister_handlers(void) {} +static int create_privroot(struct dentry *dentry) { return 0; } +#endif + +/* Actual operations that are exported to VFS-land */ +static const struct xattr_handler *reiserfs_xattr_handlers[] = { +#ifdef CONFIG_REISERFS_FS_XATTR + &reiserfs_xattr_user_handler, + &reiserfs_xattr_trusted_handler, +#endif +#ifdef CONFIG_REISERFS_FS_SECURITY + &reiserfs_xattr_security_handler, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL +}; + +static int xattr_mount_check(struct super_block *s) +{ + /* + * We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. + */ + if (old_format_only(s)) { + if (reiserfs_xattrs_optional(s)) { + /* + * Old format filesystem, but optional xattrs have + * been enabled. Error out. + */ + reiserfs_warning(s, "jdm-2005", + "xattrs/ACLs not supported " + "on pre-v3.6 format filesystems. " + "Failing mount."); + return -EOPNOTSUPP; + } + } + + return 0; +} + +int reiserfs_permission(struct inode *inode, int mask) +{ + /* + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. + */ + if (IS_PRIVATE(inode)) + return 0; + + return generic_permission(inode, mask); +} + +static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) +{ + return -EPERM; +} + +static const struct dentry_operations xattr_lookup_poison_ops = { + .d_revalidate = xattr_hide_revalidate, +}; + +int reiserfs_lookup_privroot(struct super_block *s) +{ + struct dentry *dentry; + int err = 0; + + /* If we don't have the privroot located yet - go find it */ + mutex_lock(&d_inode(s->s_root)->i_mutex); + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, + strlen(PRIVROOT_NAME)); + if (!IS_ERR(dentry)) { + REISERFS_SB(s)->priv_root = dentry; + d_set_d_op(dentry, &xattr_lookup_poison_ops); + if (d_really_is_positive(dentry)) + d_inode(dentry)->i_flags |= S_PRIVATE; + } else + err = PTR_ERR(dentry); + mutex_unlock(&d_inode(s->s_root)->i_mutex); + + return err; +} + +/* + * We need to take a copy of the mount flags since things like + * MS_RDONLY don't get set until *after* we're called. + * mount_flags != mount_options + */ +int reiserfs_xattr_init(struct super_block *s, int mount_flags) +{ + int err = 0; + struct dentry *privroot = REISERFS_SB(s)->priv_root; + + err = xattr_mount_check(s); + if (err) + goto error; + + if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + mutex_lock(&d_inode(s->s_root)->i_mutex); + err = create_privroot(REISERFS_SB(s)->priv_root); + mutex_unlock(&d_inode(s->s_root)->i_mutex); + } + + if (d_really_is_positive(privroot)) { + s->s_xattr = reiserfs_xattr_handlers; + mutex_lock(&d_inode(privroot)->i_mutex); + if (!REISERFS_SB(s)->xattr_root) { + struct dentry *dentry; + + dentry = lookup_one_len(XAROOT_NAME, privroot, + strlen(XAROOT_NAME)); + if (!IS_ERR(dentry)) + REISERFS_SB(s)->xattr_root = dentry; + else + err = PTR_ERR(dentry); + } + mutex_unlock(&d_inode(privroot)->i_mutex); + } + +error: + if (err) { + clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt); + clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); + } + + /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + if (reiserfs_posixacl(s)) + s->s_flags |= MS_POSIXACL; + else + s->s_flags &= ~MS_POSIXACL; + + return err; +} diff --git a/kernel/fs/reiserfs/xattr.h b/kernel/fs/reiserfs/xattr.h new file mode 100644 index 000000000..15dde6262 --- /dev/null +++ b/kernel/fs/reiserfs/xattr.h @@ -0,0 +1,122 @@ +#include +#include +#include +#include + +struct inode; +struct dentry; +struct iattr; +struct super_block; + +int reiserfs_xattr_register_handlers(void) __init; +void reiserfs_xattr_unregister_handlers(void); +int reiserfs_xattr_init(struct super_block *sb, int mount_flags); +int reiserfs_lookup_privroot(struct super_block *sb); +int reiserfs_delete_xattrs(struct inode *inode); +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); +int reiserfs_permission(struct inode *inode, int mask); + +#ifdef CONFIG_REISERFS_FS_XATTR +#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) +ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +int reiserfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int reiserfs_removexattr(struct dentry *dentry, const char *name); + +int reiserfs_xattr_get(struct inode *, const char *, void *, size_t); +int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int); +int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *, + struct inode *, const char *, const void *, + size_t, int); + +extern const struct xattr_handler reiserfs_xattr_user_handler; +extern const struct xattr_handler reiserfs_xattr_trusted_handler; +extern const struct xattr_handler reiserfs_xattr_security_handler; +#ifdef CONFIG_REISERFS_FS_SECURITY +int reiserfs_security_init(struct inode *dir, struct inode *inode, + const struct qstr *qstr, + struct reiserfs_security_handle *sec); +int reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec); +void reiserfs_security_free(struct reiserfs_security_handle *sec); +#endif + +static inline int reiserfs_xattrs_initialized(struct super_block *sb) +{ + return REISERFS_SB(sb)->priv_root != NULL; +} + +#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) +static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) +{ + loff_t ret = 0; + if (reiserfs_file_data_log(inode)) { + ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize); + ret >>= inode->i_sb->s_blocksize_bits; + } + return ret; +} + +/* + * We may have to create up to 3 objects: xattr root, xattr dir, xattr file. + * Let's try to be smart about it. + * xattr root: We cache it. If it's not cached, we may need to create it. + * xattr dir: If anything has been loaded for this inode, we can set a flag + * saying so. + * xattr file: Since we don't cache xattrs, we can't tell. We always include + * blocks for it. + * + * However, since root and dir can be created between calls - YOU MUST SAVE + * THIS VALUE. + */ +static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) +{ + size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + + if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root)) + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + } + + return nblocks; +} + +static inline void reiserfs_init_xattr_rwsem(struct inode *inode) +{ + init_rwsem(&REISERFS_I(inode)->i_xattr_sem); +} + +#else + +#define reiserfs_getxattr NULL +#define reiserfs_setxattr NULL +#define reiserfs_listxattr NULL +#define reiserfs_removexattr NULL + +static inline void reiserfs_init_xattr_rwsem(struct inode *inode) +{ +} +#endif /* CONFIG_REISERFS_FS_XATTR */ + +#ifndef CONFIG_REISERFS_FS_SECURITY +static inline int reiserfs_security_init(struct inode *dir, + struct inode *inode, + const struct qstr *qstr, + struct reiserfs_security_handle *sec) +{ + return 0; +} +static inline int +reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + return 0; +} +static inline void reiserfs_security_free(struct reiserfs_security_handle *sec) +{} +#endif diff --git a/kernel/fs/reiserfs/xattr_acl.c b/kernel/fs/reiserfs/xattr_acl.c new file mode 100644 index 000000000..4b34b9dc0 --- /dev/null +++ b/kernel/fs/reiserfs/xattr_acl.c @@ -0,0 +1,407 @@ +#include +#include +#include +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" +#include + +static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, + struct inode *inode, int type, + struct posix_acl *acl); + + +int +reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error, error2; + struct reiserfs_transaction_handle th; + size_t jcreate_blocks; + int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; + + + /* + * Pessimism: We can't assume that anything from the xattr root up + * has been created. + */ + + jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, size) * 2; + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jcreate_blocks); + reiserfs_write_unlock(inode->i_sb); + if (error == 0) { + error = __reiserfs_set_acl(&th, inode, type, acl); + reiserfs_write_lock(inode->i_sb); + error2 = journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + if (error2) + error = error2; + } + + return error; +} + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(reiserfs_acl_header)) + return ERR_PTR(-EINVAL); + if (((reiserfs_acl_header *) value)->a_version != + cpu_to_le32(REISERFS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(reiserfs_acl_header); + count = reiserfs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value; + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(reiserfs_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size) +{ + reiserfs_acl_header *ext_acl; + char *e; + int n; + + *size = reiserfs_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(reiserfs_acl_header) + + acl->a_count * + sizeof(reiserfs_acl_entry), + GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); + e = (char *)ext_acl + sizeof(reiserfs_acl_header); + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(reiserfs_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(reiserfs_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(reiserfs_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_mutex: down + * BKL held [before 2.5.x] + */ +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) +{ + char *name, *value; + struct posix_acl *acl; + int size; + int retval; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = reiserfs_xattr_get(inode, name, NULL, 0); + if (size < 0) { + if (size == -ENODATA || size == -ENOSYS) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return ERR_PTR(size); + } + + value = kmalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + + retval = reiserfs_xattr_get(inode, name, value, size); + if (retval == -ENODATA || retval == -ENOSYS) { + /* + * This shouldn't actually happen as it should have + * been caught above.. but just in case + */ + acl = NULL; + } else if (retval < 0) { + acl = ERR_PTR(retval); + } else { + acl = reiserfs_posix_acl_from_disk(value, retval); + } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + kfree(value); + return acl; +} + +/* + * Inode operation set_posix_acl(). + * + * inode->i_mutex: down + * BKL held [before 2.5.x] + */ +static int +__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, + int type, struct posix_acl *acl) +{ + char *name; + void *value = NULL; + size_t size = 0; + int error; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return error; + else { + if (error == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = reiserfs_posix_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0); + + /* + * Ensure that the inode gets dirtied if we're only using + * the mode bits and an old ACL didn't exist. We don't need + * to check if the inode is hashed here since we won't get + * called by reiserfs_inherit_default_acl(). + */ + if (error == -ENODATA) { + error = 0; + if (type == ACL_TYPE_ACCESS) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + } + + kfree(value); + + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +/* + * dir->i_mutex: locked, + * inode is new and not released into the wild yet + */ +int +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct posix_acl *default_acl, *acl; + int err = 0; + + /* ACLs only get applied to files and directories */ + if (S_ISLNK(inode->i_mode)) + return 0; + + /* + * ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from + */ + if (get_inode_sd_version(dir) == STAT_DATA_V1) + goto apply_umask; + + /* + * Don't apply ACLs to objects in the .reiserfs_priv tree.. This + * would be useless since permissions are ignored, and a pain because + * it introduces locking cycles + */ + if (IS_PRIVATE(dir)) { + inode->i_flags |= S_PRIVATE; + goto apply_umask; + } + + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + + if (default_acl) { + err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!err) + err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } + + return err; + +apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current_umask(); + return err; +} + +/* This is used to cache the default acl before a new object is created. + * The biggest reason for this is to get an idea of how many blocks will + * actually be required for the create operation if we must inherit an ACL. + * An ACL write can add up to 3 object creations and an additional file write + * so we'd prefer not to reserve that many blocks in the journal if we can. + * It also has the advantage of not loading the ACL with a transaction open, + * this may seem silly, but if the owner of the directory is doing the + * creation, the ACL may not be loaded since the permissions wouldn't require + * it. + * We return the number of blocks required for the transaction. + */ +int reiserfs_cache_default_acl(struct inode *inode) +{ + struct posix_acl *acl; + int nblocks = 0; + + if (IS_PRIVATE(inode)) + return 0; + + acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); + + if (acl && !IS_ERR(acl)) { + int size = reiserfs_acl_size(acl->a_count); + + /* Other xattrs can be created during inode creation. We don't + * want to claim too many blocks, so we check to see if we + * we need to create the tree to the xattrs, and then we + * just want two files. */ + nblocks = reiserfs_xattr_jcreate_nblocks(inode); + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* We need to account for writes + bitmaps for two files */ + nblocks += reiserfs_xattr_nblocks(inode, size) * 4; + posix_acl_release(acl); + } + + return nblocks; +} + +/* + * Called under i_mutex + */ +int reiserfs_acl_chmod(struct inode *inode) +{ + if (IS_PRIVATE(inode)) + return 0; + if (get_inode_sd_version(inode) == STAT_DATA_V1 || + !reiserfs_posixacl(inode->i_sb)) + return 0; + + return posix_acl_chmod(inode, inode->i_mode); +} diff --git a/kernel/fs/reiserfs/xattr_security.c b/kernel/fs/reiserfs/xattr_security.c new file mode 100644 index 000000000..9a3b0616f --- /dev/null +++ b/kernel/fs/reiserfs/xattr_security.c @@ -0,0 +1,120 @@ +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include "xattr.h" +#include +#include + +static int +security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (IS_PRIVATE(d_inode(dentry))) + return -EPERM; + + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); +} + +static int +security_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (IS_PRIVATE(d_inode(dentry))) + return -EPERM; + + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); +} + +static size_t security_list(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t namelen, int handler_flags) +{ + const size_t len = namelen + 1; + + if (IS_PRIVATE(d_inode(dentry))) + return 0; + + if (list && len <= list_len) { + memcpy(list, name, namelen); + list[namelen] = '\0'; + } + + return len; +} + +/* Initializes the security context for a new inode and returns the number + * of blocks needed for the transaction. If successful, reiserfs_security + * must be released using reiserfs_security_free when the caller is done. */ +int reiserfs_security_init(struct inode *dir, struct inode *inode, + const struct qstr *qstr, + struct reiserfs_security_handle *sec) +{ + int blocks = 0; + int error; + + sec->name = NULL; + + /* Don't add selinux attributes on xattrs - they'll never get used */ + if (IS_PRIVATE(dir)) + return 0; + + error = security_old_inode_init_security(inode, dir, qstr, &sec->name, + &sec->value, &sec->length); + if (error) { + if (error == -EOPNOTSUPP) + error = 0; + + sec->name = NULL; + sec->value = NULL; + sec->length = 0; + return error; + } + + if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) { + blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, sec->length); + /* We don't want to count the directories twice if we have + * a default ACL. */ + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + } + return blocks; +} + +int reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + int error; + if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value, + sec->length, XATTR_CREATE); + if (error == -ENODATA || error == -EOPNOTSUPP) + error = 0; + + return error; +} + +void reiserfs_security_free(struct reiserfs_security_handle *sec) +{ + kfree(sec->name); + kfree(sec->value); + sec->name = NULL; + sec->value = NULL; +} + +const struct xattr_handler reiserfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = security_get, + .set = security_set, + .list = security_list, +}; diff --git a/kernel/fs/reiserfs/xattr_trusted.c b/kernel/fs/reiserfs/xattr_trusted.c new file mode 100644 index 000000000..e4f134371 --- /dev/null +++ b/kernel/fs/reiserfs/xattr_trusted.c @@ -0,0 +1,56 @@ +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include "xattr.h" +#include + +static int +trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + return -EPERM; + + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); +} + +static int +trusted_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + return -EPERM; + + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); +} + +static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) +{ + const size_t len = name_len + 1; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + return 0; + + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } + return len; +} + +const struct xattr_handler reiserfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = trusted_get, + .set = trusted_set, + .list = trusted_list, +}; diff --git a/kernel/fs/reiserfs/xattr_user.c b/kernel/fs/reiserfs/xattr_user.c new file mode 100644 index 000000000..d0b08d3e5 --- /dev/null +++ b/kernel/fs/reiserfs/xattr_user.c @@ -0,0 +1,52 @@ +#include "reiserfs.h" +#include +#include +#include +#include +#include "xattr.h" +#include + +static int +user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + if (!reiserfs_xattrs_user(dentry->d_sb)) + return -EOPNOTSUPP; + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); +} + +static int +user_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user(dentry->d_sb)) + return -EOPNOTSUPP; + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); +} + +static size_t user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) +{ + const size_t len = name_len + 1; + + if (!reiserfs_xattrs_user(dentry->d_sb)) + return 0; + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } + return len; +} + +const struct xattr_handler reiserfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = user_get, + .set = user_set, + .list = user_list, +}; diff --git a/kernel/fs/romfs/Kconfig b/kernel/fs/romfs/Kconfig new file mode 100644 index 000000000..ce2d6bcc6 --- /dev/null +++ b/kernel/fs/romfs/Kconfig @@ -0,0 +1,62 @@ +config ROMFS_FS + tristate "ROM file system support" + depends on BLOCK || MTD + ---help--- + This is a very small read-only file system mainly intended for + initial ram disks of installation disks, but it could be used for + other read-only media as well. Read + for details. + + To compile this file system support as a module, choose M here: the + module will be called romfs. Note that the file system of your + root partition (the one containing the directory /) cannot be a + module. + + If you don't know whether you need it, then you don't need it: + answer N. + +# +# Select the backing stores to be supported +# +choice + prompt "RomFS backing stores" + depends on ROMFS_FS + default ROMFS_BACKED_BY_BLOCK + help + Select the backing stores to be supported. + +config ROMFS_BACKED_BY_BLOCK + bool "Block device-backed ROM file system support" + depends on BLOCK + help + This permits ROMFS to use block devices buffered through the page + cache as the medium from which to retrieve data. It does not allow + direct mapping of the medium. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_MTD + bool "MTD-backed ROM file system support" + depends on MTD=y || (ROMFS_FS=m && MTD) + help + This permits ROMFS to use MTD based devices directly, without the + intercession of the block layer (which may have been disabled). It + also allows direct mapping of MTD devices through romfs files under + NOMMU conditions if the underlying device is directly addressable by + the CPU. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_BOTH + bool "Both the above" + depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD)) +endchoice + + +config ROMFS_ON_BLOCK + bool + default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + +config ROMFS_ON_MTD + bool + default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH diff --git a/kernel/fs/romfs/Makefile b/kernel/fs/romfs/Makefile new file mode 100644 index 000000000..420beb7d4 --- /dev/null +++ b/kernel/fs/romfs/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux RomFS filesystem routines. +# + +obj-$(CONFIG_ROMFS_FS) += romfs.o + +romfs-y := storage.o super.o + +ifneq ($(CONFIG_MMU),y) +romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o +endif + diff --git a/kernel/fs/romfs/internal.h b/kernel/fs/romfs/internal.h new file mode 100644 index 000000000..95217b830 --- /dev/null +++ b/kernel/fs/romfs/internal.h @@ -0,0 +1,47 @@ +/* RomFS internal definitions + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +struct romfs_inode_info { + struct inode vfs_inode; + unsigned long i_metasize; /* size of non-data area */ + unsigned long i_dataoffset; /* from the start of fs */ +}; + +static inline size_t romfs_maxsize(struct super_block *sb) +{ + return (size_t) (unsigned long) sb->s_fs_info; +} + +static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) +{ + return container_of(inode, struct romfs_inode_info, vfs_inode); +} + +/* + * mmap-nommu.c + */ +#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) +extern const struct file_operations romfs_ro_fops; +#else +#define romfs_ro_fops generic_ro_fops +#endif + +/* + * storage.c + */ +extern int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen); +extern ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen); +extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/kernel/fs/romfs/mmap-nommu.c b/kernel/fs/romfs/mmap-nommu.c new file mode 100644 index 000000000..1118a0dc6 --- /dev/null +++ b/kernel/fs/romfs/mmap-nommu.c @@ -0,0 +1,89 @@ +/* NOMMU mmap support for RomFS on MTD devices + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "internal.h" + +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't doesn't copy private + * mappings) + * - attempts to map through to the underlying MTD device + */ +static unsigned long romfs_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct inode *inode = file->f_mapping->host; + struct mtd_info *mtd = inode->i_sb->s_mtd; + unsigned long isize, offset, maxpages, lpages; + int ret; + + if (!mtd) + return (unsigned long) -ENOSYS; + + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + isize = i_size_read(inode); + offset = pgoff << PAGE_SHIFT; + + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) + return (unsigned long) -EINVAL; + + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset += ROMFS_I(inode)->i_dataoffset; + if (offset >= mtd->size) + return (unsigned long) -EINVAL; + /* the mapping mustn't extend beyond the EOF */ + if ((offset + len) > mtd->size) + len = mtd->size - offset; + + ret = mtd_get_unmapped_area(mtd, len, offset, flags); + if (ret == -EOPNOTSUPP) + ret = -ENOSYS; + return (unsigned long) ret; +} + +/* + * permit a R/O mapping to be made directly through onto an MTD device if + * possible + */ +static int romfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; +} + +static unsigned romfs_mmap_capabilities(struct file *file) +{ + struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd; + + if (!mtd) + return NOMMU_MAP_COPY; + return mtd_mmap_capabilities(mtd); +} + +const struct file_operations romfs_ro_fops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .splice_read = generic_file_splice_read, + .mmap = romfs_mmap, + .get_unmapped_area = romfs_get_unmapped_area, + .mmap_capabilities = romfs_mmap_capabilities, +}; diff --git a/kernel/fs/romfs/storage.c b/kernel/fs/romfs/storage.c new file mode 100644 index 000000000..f86f51f99 --- /dev/null +++ b/kernel/fs/romfs/storage.c @@ -0,0 +1,293 @@ +/* RomFS storage access routines + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK) +#error no ROMFS backing store interface configured +#endif + +#ifdef CONFIG_ROMFS_ON_MTD +#define ROMFS_MTD_READ(sb, ...) mtd_read((sb)->s_mtd, ##__VA_ARGS__) + +/* + * read data from an romfs image on an MTD device + */ +static int romfs_mtd_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t rlen; + int ret; + + ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf); + return (ret < 0 || rlen != buflen) ? -EIO : 0; +} + +/* + * determine the length of a string in a romfs image on an MTD device + */ +static ssize_t romfs_mtd_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + ssize_t n = 0; + size_t segment; + u_char buf[16], *p; + size_t len; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (maxlen > 0) { + segment = min_t(size_t, maxlen, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + p = memchr(buf, 0, len); + if (p) + return n + (p - buf); + maxlen -= len; + pos += len; + n += len; + } + + return n; +} + +/* + * compare a string to one in a romfs image on MTD + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + u_char buf[17]; + size_t len, segment; + int ret; + + /* scan the string up to 16 bytes at a time, and attempt to grab the + * trailing NUL whilst we're at it */ + buf[0] = 0xff; + + while (size > 0) { + segment = min_t(size_t, size + 1, 17); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + len--; + if (memcmp(buf, str, len) != 0) + return 0; + buf[0] = buf[len]; + size -= len; + pos += len; + str += len; + } + + /* check the trailing NUL was */ + if (buf[0]) + return 0; + + return 1; +} +#endif /* CONFIG_ROMFS_ON_MTD */ + +#ifdef CONFIG_ROMFS_ON_BLOCK +/* + * read data from an romfs image on a block device + */ +static int romfs_blk_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + + /* copy the string up to blocksize bytes at a time */ + while (buflen > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, buflen, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + memcpy(buf, bh->b_data + offset, segment); + brelse(bh); + buf += segment; + buflen -= segment; + pos += segment; + } + + return 0; +} + +/* + * determine the length of a string in romfs on a block device + */ +static ssize_t romfs_blk_strnlen(struct super_block *sb, + unsigned long pos, size_t limit) +{ + struct buffer_head *bh; + unsigned long offset; + ssize_t n = 0; + size_t segment; + u_char *buf, *p; + + /* scan the string up to blocksize bytes at a time */ + while (limit > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, limit, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + buf = bh->b_data + offset; + p = memchr(buf, 0, segment); + brelse(bh); + if (p) + return n + (p - buf); + limit -= segment; + pos += segment; + n += segment; + } + + return n; +} + +/* + * compare a string to one in a romfs image on a block device + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + bool matched, terminated = false; + + /* compare string up to a block at a time */ + while (size > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, size, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = (memcmp(bh->b_data + offset, str, segment) == 0); + + size -= segment; + pos += segment; + str += segment; + if (matched && size == 0 && offset + segment < ROMBSIZE) { + if (!bh->b_data[offset + segment]) + terminated = true; + else + matched = false; + } + brelse(bh); + if (!matched) + return 0; + } + + if (!terminated) { + /* the terminating NUL must be on the first byte of the next + * block */ + BUG_ON((pos & (ROMBSIZE - 1)) != 0); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = !bh->b_data[0]; + brelse(bh); + if (!matched) + return 0; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_BLOCK */ + +/* + * read data from the romfs image + */ +int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (buflen > limit - pos) + buflen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_read(sb, pos, buf, buflen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_read(sb, pos, buf, buflen); +#endif + return -EIO; +} + +/* + * determine the length of a string in romfs + */ +ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (maxlen > limit - pos) + maxlen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strnlen(sb, pos, maxlen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strnlen(sb, pos, maxlen); +#endif + return -EIO; +} + +/* + * compare a string to one in romfs + * - the string to be compared to, str, may not be NUL-terminated; instead the + * string is of the specified size + * - return 1 if matched, 0 if differ, -ve if error + */ +int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (size > ROMFS_MAXFN) + return -ENAMETOOLONG; + if (size + 1 > limit - pos) + return -EIO; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strcmp(sb, pos, str, size); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strcmp(sb, pos, str, size); +#endif + return -EIO; +} diff --git a/kernel/fs/romfs/super.c b/kernel/fs/romfs/super.c new file mode 100644 index 000000000..268733cda --- /dev/null +++ b/kernel/fs/romfs/super.c @@ -0,0 +1,659 @@ +/* Block- or MTD-based romfs + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * Derived from: ROMFS file system, Linux implementation + * + * Copyright © 1997-1999 Janos Farkas + * + * Using parts of the minix filesystem + * Copyright © 1991, 1992 Linus Torvalds + * + * and parts of the affs filesystem additionally + * Copyright © 1993 Ray Burr + * Copyright © 1996 Hans-Joachim Widmaier + * + * Changes + * Changed for 2.1.19 modules + * Jan 1997 Initial release + * Jun 1997 2.1.43+ changes + * Proper page locking in readpage + * Changed to work with 2.1.45+ fs + * Jul 1997 Fixed follow_link + * 2.1.47 + * lookup shouldn't return -ENOENT + * from Horst von Brand: + * fail on wrong checksum + * double unlock_super was possible + * correct namelen for statfs + * spotted by Bill Hawes: + * readlink shouldn't iput() + * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() + * exposed a problem in readdir + * 2.1.107 code-freeze spellchecker run + * Aug 1998 2.1.118+ VFS changes + * Sep 1998 2.1.122 another VFS change (follow_link) + * Apr 1999 2.2.7 no more EBADF checking in + * lookup/readdir, use ERR_PTR + * Jun 1999 2.3.6 d_alloc_root use changed + * 2.3.9 clean up usage of ENOENT/negative + * dentries in lookup + * clean up page flags setting + * (error, uptodate, locking) in + * in readpage + * use init_special_inode for + * fifos/sockets (and streamline) in + * read_inode, fix _ops table order + * Aug 1999 2.3.16 __initfunc() => __init change + * Oct 1999 2.3.24 page->owner hack obsoleted + * Nov 1999 2.3.27 2.3.25+ page->offset => index change + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct kmem_cache *romfs_inode_cachep; + +static const umode_t romfs_modemap[8] = { + 0, /* hard link */ + S_IFDIR | 0644, /* directory */ + S_IFREG | 0644, /* regular file */ + S_IFLNK | 0777, /* symlink */ + S_IFBLK | 0600, /* blockdev */ + S_IFCHR | 0600, /* chardev */ + S_IFSOCK | 0644, /* socket */ + S_IFIFO | 0644 /* FIFO */ +}; + +static const unsigned char romfs_dtype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO +}; + +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); + +/* + * read a page worth of data from the image + */ +static int romfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t offset, size; + unsigned long fillsize, pos; + void *buf; + int ret; + + buf = kmap(page); + if (!buf) + return -ENOMEM; + + /* 32 bit warning -- but not for us :) */ + offset = page_offset(page); + size = i_size_read(inode); + fillsize = 0; + ret = 0; + if (offset < size) { + size -= offset; + fillsize = size > PAGE_SIZE ? PAGE_SIZE : size; + + pos = ROMFS_I(inode)->i_dataoffset + offset; + + ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); + if (ret < 0) { + SetPageError(page); + fillsize = 0; + ret = -EIO; + } + } + + if (fillsize < PAGE_SIZE) + memset(buf + fillsize, 0, PAGE_SIZE - fillsize); + if (ret == 0) + SetPageUptodate(page); + + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + return ret; +} + +static const struct address_space_operations romfs_aops = { + .readpage = romfs_readpage +}; + +/* + * read the entries from a directory + */ +static int romfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *i = file_inode(file); + struct romfs_inode ri; + unsigned long offset, maxoff; + int j, ino, nextfh; + char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ + int ret; + + maxoff = romfs_maxsize(i->i_sb); + + offset = ctx->pos; + if (!offset) { + offset = i->i_ino & ROMFH_MASK; + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* Not really failsafe, but we are read-only... */ + for (;;) { + if (!offset || offset >= maxoff) { + offset = maxoff; + ctx->pos = offset; + goto out; + } + ctx->pos = offset; + + /* Fetch inode info */ + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + + j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, + sizeof(fsname) - 1); + if (j < 0) + goto out; + + ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); + if (ret < 0) + goto out; + fsname[j] = '\0'; + + ino = offset; + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) + ino = be32_to_cpu(ri.spec); + if (!dir_emit(ctx, fsname, j, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE])) + goto out; + + offset = nextfh & ROMFH_MASK; + } +out: + return 0; +} + +/* + * look up an entry in a directory + */ +static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + unsigned long offset, maxoff; + struct inode *inode; + struct romfs_inode ri; + const char *name; /* got from dentry */ + int len, ret; + + offset = dir->i_ino & ROMFH_MASK; + ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto error; + + /* search all the file entries in the list starting from the one + * pointed to by the directory's special data */ + maxoff = romfs_maxsize(dir->i_sb); + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + name = dentry->d_name.name; + len = dentry->d_name.len; + + for (;;) { + if (!offset || offset >= maxoff) + goto out0; + + ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* try to match the first 16 bytes of name */ + ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); + if (ret < 0) + goto error; + if (ret == 1) + break; + + /* next entry */ + offset = be32_to_cpu(ri.next) & ROMFH_MASK; + } + + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + inode = romfs_iget(dir->i_sb, offset); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + goto outi; + + /* + * it's a bit funky, _lookup needs to return an error code + * (negative) or a NULL, both as a dentry. ENOENT should not + * be returned, instead we need to create a negative dentry by + * d_add(dentry, NULL); and return 0 as no error. + * (Although as I see, it only matters on writable file + * systems). + */ +out0: + inode = NULL; +outi: + d_add(dentry, inode); + ret = 0; +error: + return ERR_PTR(ret); +} + +static const struct file_operations romfs_dir_operations = { + .read = generic_read_dir, + .iterate = romfs_readdir, + .llseek = default_llseek, +}; + +static const struct inode_operations romfs_dir_inode_operations = { + .lookup = romfs_lookup, +}; + +/* + * get a romfs inode based on its position in the image (which doubles as the + * inode number) + */ +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) +{ + struct romfs_inode_info *inode; + struct romfs_inode ri; + struct inode *i; + unsigned long nlen; + unsigned nextfh; + int ret; + umode_t mode; + + /* we might have to traverse a chain of "hard link" file entries to get + * to the actual file */ + for (;;) { + ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* XXX: do romfs_checksum here too (with name) */ + + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) + break; + + pos = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* determine the length of the filename */ + nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); + if (IS_ERR_VALUE(nlen)) + goto eio; + + /* get an inode for this image position */ + i = iget_locked(sb, pos); + if (!i) + return ERR_PTR(-ENOMEM); + + if (!(i->i_state & I_NEW)) + return i; + + /* precalculate the data offset */ + inode = ROMFS_I(i); + inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; + inode->i_dataoffset = pos + inode->i_metasize; + + set_nlink(i, 1); /* Hard to decide.. */ + i->i_size = be32_to_cpu(ri.size); + i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; + i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + + /* set up mode and ops */ + mode = romfs_modemap[nextfh & ROMFH_TYPE]; + + switch (nextfh & ROMFH_TYPE) { + case ROMFH_DIR: + i->i_size = ROMFS_I(i)->i_metasize; + i->i_op = &romfs_dir_inode_operations; + i->i_fop = &romfs_dir_operations; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_REG: + i->i_fop = &romfs_ro_fops; + i->i_data.a_ops = &romfs_aops; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_SYM: + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &romfs_aops; + mode |= S_IRWXUGO; + break; + default: + /* depending on MBZ for sock/fifos */ + nextfh = be32_to_cpu(ri.spec); + init_special_inode(i, mode, MKDEV(nextfh >> 16, + nextfh & 0xffff)); + break; + } + + i->i_mode = mode; + + unlock_new_inode(i); + return i; + +eio: + ret = -EIO; +error: + pr_err("read error for inode 0x%lx\n", pos); + return ERR_PTR(ret); +} + +/* + * allocate a new inode + */ +static struct inode *romfs_alloc_inode(struct super_block *sb) +{ + struct romfs_inode_info *inode; + + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + return inode ? &inode->vfs_inode : NULL; +} + +/* + * return a spent inode to the slab cache + */ +static void romfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); +} + +static void romfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, romfs_i_callback); +} + +/* + * get filesystem statistics + */ +static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ROMFS_MAGIC; + buf->f_namelen = ROMFS_MAXFN; + buf->f_bsize = ROMBSIZE; + buf->f_bfree = buf->f_bavail = buf->f_ffree; + buf->f_blocks = + (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * remounting must involve read-only + */ +static int romfs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations romfs_super_ops = { + .alloc_inode = romfs_alloc_inode, + .destroy_inode = romfs_destroy_inode, + .statfs = romfs_statfs, + .remount_fs = romfs_remount, +}; + +/* + * checksum check on part of a romfs filesystem + */ +static __u32 romfs_checksum(const void *data, int size) +{ + const __be32 *ptr = data; + __u32 sum; + + sum = 0; + size >>= 2; + while (size > 0) { + sum += be32_to_cpu(*ptr++); + size--; + } + return sum; +} + +/* + * fill in the superblock + */ +static int romfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct romfs_super_block *rsb; + struct inode *root; + unsigned long pos, img_size; + const char *storage; + size_t len; + int ret; + +#ifdef CONFIG_BLOCK + if (!sb->s_mtd) { + sb_set_blocksize(sb, ROMBSIZE); + } else { + sb->s_blocksize = ROMBSIZE; + sb->s_blocksize_bits = blksize_bits(ROMBSIZE); + } +#endif + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_magic = ROMFS_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_op = &romfs_super_ops; + + /* read the image superblock and check it */ + rsb = kmalloc(512, GFP_KERNEL); + if (!rsb) + return -ENOMEM; + + sb->s_fs_info = (void *) 512; + ret = romfs_dev_read(sb, 0, rsb, 512); + if (ret < 0) + goto error_rsb; + + img_size = be32_to_cpu(rsb->size); + + if (sb->s_mtd && img_size > sb->s_mtd->size) + goto error_rsb_inval; + + sb->s_fs_info = (void *) img_size; + + if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || + img_size < ROMFH_SIZE) { + if (!silent) + pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { + pr_err("bad initial checksum on dev %s.\n", sb->s_id); + goto error_rsb_inval; + } + + storage = sb->s_mtd ? "MTD" : "the block layer"; + + len = strnlen(rsb->name, ROMFS_MAXFN); + if (!silent) + pr_notice("Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); + + kfree(rsb); + rsb = NULL; + + /* find the root directory */ + pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; + + root = romfs_iget(sb, pos); + if (IS_ERR(root)) + return PTR_ERR(root); + + sb->s_root = d_make_root(root); + if (!sb->s_root) + return -ENOMEM; + + return 0; + +error_rsb_inval: + ret = -EINVAL; +error_rsb: + kfree(rsb); + return ret; +} + +/* + * get a superblock for mounting + */ +static struct dentry *romfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + struct dentry *ret = ERR_PTR(-EINVAL); + +#ifdef CONFIG_ROMFS_ON_MTD + ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (ret == ERR_PTR(-EINVAL)) + ret = mount_bdev(fs_type, flags, dev_name, data, + romfs_fill_super); +#endif + return ret; +} + +/* + * destroy a romfs superblock in the appropriate manner + */ +static void romfs_kill_sb(struct super_block *sb) +{ +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) { + kill_mtd_super(sb); + return; + } +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) { + kill_block_super(sb); + return; + } +#endif +} + +static struct file_system_type romfs_fs_type = { + .owner = THIS_MODULE, + .name = "romfs", + .mount = romfs_mount, + .kill_sb = romfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("romfs"); + +/* + * inode storage initialiser + */ +static void romfs_i_init_once(void *_inode) +{ + struct romfs_inode_info *inode = _inode; + + inode_init_once(&inode->vfs_inode); +} + +/* + * romfs module initialisation + */ +static int __init init_romfs_fs(void) +{ + int ret; + + pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + + romfs_inode_cachep = + kmem_cache_create("romfs_i", + sizeof(struct romfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + romfs_i_init_once); + + if (!romfs_inode_cachep) { + pr_err("Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = register_filesystem(&romfs_fs_type); + if (ret) { + pr_err("Failed to register filesystem\n"); + goto error_register; + } + return 0; + +error_register: + kmem_cache_destroy(romfs_inode_cachep); + return ret; +} + +/* + * romfs module removal + */ +static void __exit exit_romfs_fs(void) +{ + unregister_filesystem(&romfs_fs_type); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(romfs_inode_cachep); +} + +module_init(init_romfs_fs); +module_exit(exit_romfs_fs); + +MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ diff --git a/kernel/fs/select.c b/kernel/fs/select.c new file mode 100644 index 000000000..f684c750e --- /dev/null +++ b/kernel/fs/select.c @@ -0,0 +1,1040 @@ +/* + * This file contains the procedures for the handling of select and poll + * + * Created for Linux based loosely upon Mathius Lattner's minix + * patches by Peter MacDonald. Heavily edited by Linus. + * + * 4 February 1994 + * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS + * flag set in its personality we do *not* modify the given timeout + * parameter to reflect time remaining. + * + * 24 January 2000 + * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation + * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). + */ + +#include +#include +#include +#include +#include +#include +#include /* for STICKY_TIMEOUTS */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * Estimate expected accuracy in ns from a timeval. + * + * After quite a bit of churning around, we've settled on + * a simple thing of taking 0.1% of the timeout as the + * slack, with a cap of 100 msec. + * "nice" tasks get a 0.5% slack instead. + * + * Consider this comment an open invitation to come up with even + * better solutions.. + */ + +#define MAX_SLACK (100 * NSEC_PER_MSEC) + +static long __estimate_accuracy(struct timespec *tv) +{ + long slack; + int divfactor = 1000; + + if (tv->tv_sec < 0) + return 0; + + if (task_nice(current) > 0) + divfactor = divfactor / 5; + + if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) + return MAX_SLACK; + + slack = tv->tv_nsec / divfactor; + slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); + + if (slack > MAX_SLACK) + return MAX_SLACK; + + return slack; +} + +long select_estimate_accuracy(struct timespec *tv) +{ + unsigned long ret; + struct timespec now; + + /* + * Realtime tasks get a slack of 0 for obvious reasons. + */ + + if (rt_task(current)) + return 0; + + ktime_get_ts(&now); + now = timespec_sub(*tv, now); + ret = __estimate_accuracy(&now); + if (ret < current->timer_slack_ns) + return current->timer_slack_ns; + return ret; +} + + + +struct poll_table_page { + struct poll_table_page * next; + struct poll_table_entry * entry; + struct poll_table_entry entries[0]; +}; + +#define POLL_TABLE_FULL(table) \ + ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) + +/* + * Ok, Peter made a complicated, but straightforward multiple_wait() function. + * I have rewritten this, taking some shortcuts: This code may not be easy to + * follow, but it should be free of race-conditions, and it's practical. If you + * understand what I'm doing here, then you understand how the linux + * sleep/wakeup mechanism works. + * + * Two very simple procedures, poll_wait() and poll_freewait() make all the + * work. poll_wait() is an inline-function defined in , + * as all select/poll functions have to call it to add an entry to the + * poll table. + */ +static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, + poll_table *p); + +void poll_initwait(struct poll_wqueues *pwq) +{ + init_poll_funcptr(&pwq->pt, __pollwait); + pwq->polling_task = current; + pwq->triggered = 0; + pwq->error = 0; + pwq->table = NULL; + pwq->inline_index = 0; +} +EXPORT_SYMBOL(poll_initwait); + +static void free_poll_entry(struct poll_table_entry *entry) +{ + remove_wait_queue(entry->wait_address, &entry->wait); + fput(entry->filp); +} + +void poll_freewait(struct poll_wqueues *pwq) +{ + struct poll_table_page * p = pwq->table; + int i; + for (i = 0; i < pwq->inline_index; i++) + free_poll_entry(pwq->inline_entries + i); + while (p) { + struct poll_table_entry * entry; + struct poll_table_page *old; + + entry = p->entry; + do { + entry--; + free_poll_entry(entry); + } while (entry > p->entries); + old = p; + p = p->next; + free_page((unsigned long) old); + } +} +EXPORT_SYMBOL(poll_freewait); + +static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) +{ + struct poll_table_page *table = p->table; + + if (p->inline_index < N_INLINE_POLL_ENTRIES) + return p->inline_entries + p->inline_index++; + + if (!table || POLL_TABLE_FULL(table)) { + struct poll_table_page *new_table; + + new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); + if (!new_table) { + p->error = -ENOMEM; + return NULL; + } + new_table->entry = new_table->entries; + new_table->next = table; + p->table = new_table; + table = new_table; + } + + return table->entry++; +} + +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_wqueues *pwq = wait->private; + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); + + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expect write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in poll_schedule_timeout. + */ + smp_wmb(); + pwq->triggered = 1; + + /* + * Perform the default wake up operation using a dummy + * waitqueue. + * + * TODO: This is hacky but there currently is no interface to + * pass in @sync. @sync is scheduled to be removed and once + * that happens, wake_up_process() can be used directly. + */ + return default_wake_function(&dummy_wait, mode, sync, key); +} + +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + +/* Add a new entry */ +static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, + poll_table *p) +{ + struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); + struct poll_table_entry *entry = poll_get_entry(pwq); + if (!entry) + return; + entry->filp = get_file(filp); + entry->wait_address = wait_address; + entry->key = p->_key; + init_waitqueue_func_entry(&entry->wait, pollwake); + entry->wait.private = pwq; + add_wait_queue(wait_address, &entry->wait); +} + +int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack) +{ + int rc = -EINTR; + + set_current_state(state); + if (!pwq->triggered) + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + __set_current_state(TASK_RUNNING); + + /* + * Prepare for the next iteration. + * + * The following set_mb() serves two purposes. First, it's + * the counterpart rmb of the wmb in pollwake() such that data + * written before wake up is always visible after wake up. + * Second, the full barrier guarantees that triggered clearing + * doesn't pass event check of the next iteration. Note that + * this problem doesn't exist for the first iteration as + * add_wait_queue() has full barrier semantics. + */ + set_mb(pwq->triggered, 0); + + return rc; +} +EXPORT_SYMBOL(poll_schedule_timeout); + +/** + * poll_select_set_timeout - helper function to setup the timeout value + * @to: pointer to timespec variable for the final timeout + * @sec: seconds (from user space) + * @nsec: nanoseconds (from user space) + * + * Note, we do not use a timespec for the user space value here, That + * way we can use the function for timeval and compat interfaces as well. + * + * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. + */ +int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +{ + struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + + if (!timespec_valid(&ts)) + return -EINVAL; + + /* Optimize for the zero timeout value here */ + if (!sec && !nsec) { + to->tv_sec = to->tv_nsec = 0; + } else { + ktime_get_ts(to); + *to = timespec_add_safe(*to, ts); + } + return 0; +} + +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec rts; + struct timeval rtv; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&rts); + rts = timespec_sub(*end_time, rts); + if (rts.tv_sec < 0) + rts.tv_sec = rts.tv_nsec = 0; + + if (timeval) { + if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) + memset(&rtv, 0, sizeof(rtv)); + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + + } else if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +#define FDS_IN(fds, n) (fds->in + n) +#define FDS_OUT(fds, n) (fds->out + n) +#define FDS_EX(fds, n) (fds->ex + n) + +#define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) + +static int max_select_fd(unsigned long n, fd_set_bits *fds) +{ + unsigned long *open_fds; + unsigned long set; + int max; + struct fdtable *fdt; + + /* handle last in-complete long-word first */ + set = ~(~0UL << (n & (BITS_PER_LONG-1))); + n /= BITS_PER_LONG; + fdt = files_fdtable(current->files); + open_fds = fdt->open_fds + n; + max = 0; + if (set) { + set &= BITS(fds, n); + if (set) { + if (!(set & ~*open_fds)) + goto get_max; + return -EBADF; + } + } + while (n) { + open_fds--; + n--; + set = BITS(fds, n); + if (!set) + continue; + if (set & ~*open_fds) + return -EBADF; + if (max) + continue; +get_max: + do { + max++; + set >>= 1; + } while (set); + max += n * BITS_PER_LONG; + } + + return max; +} + +#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR) +#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) +#define POLLEX_SET (POLLPRI) + +static inline void wait_key_set(poll_table *wait, unsigned long in, + unsigned long out, unsigned long bit, + unsigned int ll_flag) +{ + wait->_key = POLLEX_SET | ll_flag; + if (in & bit) + wait->_key |= POLLIN_SET; + if (out & bit) + wait->_key |= POLLOUT_SET; +} + +int do_select(int n, fd_set_bits *fds, struct timespec *end_time) +{ + ktime_t expire, *to = NULL; + struct poll_wqueues table; + poll_table *wait; + int retval, i, timed_out = 0; + unsigned long slack = 0; + unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + unsigned long busy_end = 0; + + rcu_read_lock(); + retval = max_select_fd(n, fds); + rcu_read_unlock(); + + if (retval < 0) + return retval; + n = retval; + + poll_initwait(&table); + wait = &table.pt; + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { + wait->_qproc = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = select_estimate_accuracy(end_time); + + retval = 0; + for (;;) { + unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + bool can_busy_loop = false; + + inp = fds->in; outp = fds->out; exp = fds->ex; + rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; + + for (i = 0; i < n; ++rinp, ++routp, ++rexp) { + unsigned long in, out, ex, all_bits, bit = 1, mask, j; + unsigned long res_in = 0, res_out = 0, res_ex = 0; + + in = *inp++; out = *outp++; ex = *exp++; + all_bits = in | out | ex; + if (all_bits == 0) { + i += BITS_PER_LONG; + continue; + } + + for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { + struct fd f; + if (i >= n) + break; + if (!(bit & all_bits)) + continue; + f = fdget(i); + if (f.file) { + const struct file_operations *f_op; + f_op = f.file->f_op; + mask = DEFAULT_POLLMASK; + if (f_op->poll) { + wait_key_set(wait, in, out, + bit, busy_flag); + mask = (*f_op->poll)(f.file, wait); + } + fdput(f); + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait->_qproc = NULL; + } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + + } + } + if (res_in) + *rinp = res_in; + if (res_out) + *routp = res_out; + if (res_ex) + *rexp = res_ex; + cond_resched(); + } + wait->_qproc = NULL; + if (retval || timed_out || signal_pending(current)) + break; + if (table.error) { + retval = table.error; + break; + } + + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (can_busy_loop && !need_resched()) { + if (!busy_end) { + busy_end = busy_loop_end_time(); + continue; + } + if (!busy_loop_timeout(busy_end)) + continue; + } + busy_flag = 0; + + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + * pointer to the expiry value. + */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; + } + + if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, + to, slack)) + timed_out = 1; + } + + poll_freewait(&table); + + return retval; +} + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int ret, max_fds; + unsigned int size; + struct fdtable *fdt; + /* Allocate small arguments on the stack to save memory and be faster */ + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + ret = -EINVAL; + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; + bits = kmalloc(6 * size, GFP_KERNEL); + if (!bits) + goto out_nofds; + } + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; + + if ((ret = get_fd_set(n, inp, fds.in)) || + (ret = get_fd_set(n, outp, fds.out)) || + (ret = get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (set_fd_set(n, inp, fds.res_in) || + set_fd_set(n, outp, fds.res_out) || + set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; + +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timespec __user *tsp, + const sigset_t __user *sigmask, size_t sigsetsize) +{ + sigset_t ksigmask, sigsaved; + struct timespec ts, end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) + return -EFAULT; + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +/* + * Most architectures can't handle 7-argument syscalls. So we provide a + * 6-argument version where the sixth argument is a pointer to a structure + * which has a pointer to the sigset_t itself followed by a size_t containing + * the sigset size. + */ +SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timespec __user *, tsp, + void __user *, sig) +{ + size_t sigsetsize = 0; + sigset_t __user *up = NULL; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) + || __get_user(up, (sigset_t __user * __user *)sig) + || __get_user(sigsetsize, + (size_t __user *)(sig+sizeof(void *)))) + return -EFAULT; + } + + return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize); +} + +#ifdef __ARCH_WANT_SYS_OLD_SELECT +struct sel_arg_struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; +}; + +SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) +{ + struct sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); +} +#endif + +struct poll_list { + struct poll_list *next; + int len; + struct pollfd entries[0]; +}; + +#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) + +/* + * Fish for pollable events on the pollfd->fd file descriptor. We're only + * interested in events matching the pollfd->events mask, and the result + * matching that mask is both recorded in pollfd->revents and returned. The + * pwait poll_table will be used by the fd-provided poll handler for waiting, + * if pwait->_qproc is non-NULL. + */ +static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, + bool *can_busy_poll, + unsigned int busy_flag) +{ + unsigned int mask; + int fd; + + mask = 0; + fd = pollfd->fd; + if (fd >= 0) { + struct fd f = fdget(fd); + mask = POLLNVAL; + if (f.file) { + mask = DEFAULT_POLLMASK; + if (f.file->f_op->poll) { + pwait->_key = pollfd->events|POLLERR|POLLHUP; + pwait->_key |= busy_flag; + mask = f.file->f_op->poll(f.file, pwait); + if (mask & busy_flag) + *can_busy_poll = true; + } + /* Mask out unneeded events. */ + mask &= pollfd->events | POLLERR | POLLHUP; + fdput(f); + } + } + pollfd->revents = mask; + + return mask; +} + +static int do_poll(unsigned int nfds, struct poll_list *list, + struct poll_wqueues *wait, struct timespec *end_time) +{ + poll_table* pt = &wait->pt; + ktime_t expire, *to = NULL; + int timed_out = 0, count = 0; + unsigned long slack = 0; + unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + unsigned long busy_end = 0; + + /* Optimise the no-wait case */ + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { + pt->_qproc = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = select_estimate_accuracy(end_time); + + for (;;) { + struct poll_list *walk; + bool can_busy_loop = false; + + for (walk = list; walk != NULL; walk = walk->next) { + struct pollfd * pfd, * pfd_end; + + pfd = walk->entries; + pfd_end = pfd + walk->len; + for (; pfd != pfd_end; pfd++) { + /* + * Fish for events. If we found one, record it + * and kill poll_table->_qproc, so we don't + * needlessly register any other waiters after + * this. They'll get immediately deregistered + * when we break out and return. + */ + if (do_pollfd(pfd, pt, &can_busy_loop, + busy_flag)) { + count++; + pt->_qproc = NULL; + /* found something, stop busy polling */ + busy_flag = 0; + can_busy_loop = false; + } + } + } + /* + * All waiters have already been registered, so don't provide + * a poll_table->_qproc to them on the next loop iteration. + */ + pt->_qproc = NULL; + if (!count) { + count = wait->error; + if (signal_pending(current)) + count = -EINTR; + } + if (count || timed_out) + break; + + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (can_busy_loop && !need_resched()) { + if (!busy_end) { + busy_end = busy_loop_end_time(); + continue; + } + if (!busy_loop_timeout(busy_end)) + continue; + } + busy_flag = 0; + + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + * pointer to the expiry value. + */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; + } + + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) + timed_out = 1; + } + return count; +} + +#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ + sizeof(struct pollfd)) + +int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, + struct timespec *end_time) +{ + struct poll_wqueues table; + int err = -EFAULT, fdcount, len, size; + /* Allocate small arguments on the stack to save memory and be + faster - use long to make sure the buffer is aligned properly + on 64 bit archs to avoid unaligned access */ + long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; + struct poll_list *const head = (struct poll_list *)stack_pps; + struct poll_list *walk = head; + unsigned long todo = nfds; + + if (nfds > rlimit(RLIMIT_NOFILE)) + return -EINVAL; + + len = min_t(unsigned int, nfds, N_STACK_PPS); + for (;;) { + walk->next = NULL; + walk->len = len; + if (!len) + break; + + if (copy_from_user(walk->entries, ufds + nfds-todo, + sizeof(struct pollfd) * walk->len)) + goto out_fds; + + todo -= walk->len; + if (!todo) + break; + + len = min(todo, POLLFD_PER_PAGE); + size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; + walk = walk->next = kmalloc(size, GFP_KERNEL); + if (!walk) { + err = -ENOMEM; + goto out_fds; + } + } + + poll_initwait(&table); + fdcount = do_poll(nfds, head, &table, end_time); + poll_freewait(&table); + + for (walk = head; walk; walk = walk->next) { + struct pollfd *fds = walk->entries; + int j; + + for (j = 0; j < walk->len; j++, ufds++) + if (__put_user(fds[j].revents, &ufds->revents)) + goto out_fds; + } + + err = fdcount; +out_fds: + walk = head->next; + while (walk) { + struct poll_list *pos = walk; + walk = walk->next; + kfree(pos); + } + + return err; +} + +static long do_restart_poll(struct restart_block *restart_block) +{ + struct pollfd __user *ufds = restart_block->poll.ufds; + int nfds = restart_block->poll.nfds; + struct timespec *to = NULL, end_time; + int ret; + + if (restart_block->poll.has_timeout) { + end_time.tv_sec = restart_block->poll.tv_sec; + end_time.tv_nsec = restart_block->poll.tv_nsec; + to = &end_time; + } + + ret = do_sys_poll(ufds, nfds, to); + + if (ret == -EINTR) { + restart_block->fn = do_restart_poll; + ret = -ERESTART_RESTARTBLOCK; + } + return ret; +} + +SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, + int, timeout_msecs) +{ + struct timespec end_time, *to = NULL; + int ret; + + if (timeout_msecs >= 0) { + to = &end_time; + poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, + NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); + } + + ret = do_sys_poll(ufds, nfds, to); + + if (ret == -EINTR) { + struct restart_block *restart_block; + + restart_block = ¤t->restart_block; + restart_block->fn = do_restart_poll; + restart_block->poll.ufds = ufds; + restart_block->poll.nfds = nfds; + + if (timeout_msecs >= 0) { + restart_block->poll.tv_sec = end_time.tv_sec; + restart_block->poll.tv_nsec = end_time.tv_nsec; + restart_block->poll.has_timeout = 1; + } else + restart_block->poll.has_timeout = 0; + + ret = -ERESTART_RESTARTBLOCK; + } + return ret; +} + +SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, + struct timespec __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) +{ + sigset_t ksigmask, sigsaved; + struct timespec ts, end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) + return -EFAULT; + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} diff --git a/kernel/fs/seq_file.c b/kernel/fs/seq_file.c new file mode 100644 index 000000000..555f82155 --- /dev/null +++ b/kernel/fs/seq_file.c @@ -0,0 +1,968 @@ +/* + * linux/fs/seq_file.c + * + * helper functions for making synthetic files from sequences of records. + * initial implementation -- AV, Oct 2001. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static void seq_set_overflow(struct seq_file *m) +{ + m->count = m->size; +} + +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + /* + * __GFP_NORETRY to avoid oom-killings with high-order allocations - + * it's better to fall back to vmalloc() than to kill things. + */ + buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + +/** + * seq_open - initialize sequential file + * @file: file we initialize + * @op: method table describing the sequence + * + * seq_open() sets @file, associating it with a sequence described + * by @op. @op->start() sets the iterator up and returns the first + * element of sequence. @op->stop() shuts it down. @op->next() + * returns the next element of sequence. @op->show() prints element + * into the buffer. In case of error ->start() and ->next() return + * ERR_PTR(error). In the end of sequence they return %NULL. ->show() + * returns 0 in case of success and negative number in case of error. + * Returning SEQ_SKIP means "discard this element and move on". + */ +int seq_open(struct file *file, const struct seq_operations *op) +{ + struct seq_file *p = file->private_data; + + if (!p) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + file->private_data = p; + } + memset(p, 0, sizeof(*p)); + mutex_init(&p->lock); + p->op = op; +#ifdef CONFIG_USER_NS + p->user_ns = file->f_cred->user_ns; +#endif + + /* + * Wrappers around seq_open(e.g. swaps_open) need to be + * aware of this. If they set f_version themselves, they + * should call seq_open first and then set f_version. + */ + file->f_version = 0; + + /* + * seq_files support lseek() and pread(). They do not implement + * write() at all, but we clear FMODE_PWRITE here for historical + * reasons. + * + * If a client of seq_files a) implements file.write() and b) wishes to + * support pwrite() then that client will need to implement its own + * file.open() which calls seq_open() and then sets FMODE_PWRITE. + */ + file->f_mode &= ~FMODE_PWRITE; + return 0; +} +EXPORT_SYMBOL(seq_open); + +static int traverse(struct seq_file *m, loff_t offset) +{ + loff_t pos = 0, index; + int error = 0; + void *p; + + m->version = 0; + index = 0; + m->count = m->from = 0; + if (!offset) { + m->index = index; + return 0; + } + if (!m->buf) { + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); + if (!m->buf) + return -ENOMEM; + } + p = m->op->start(m, &index); + while (p) { + error = PTR_ERR(p); + if (IS_ERR(p)) + break; + error = m->op->show(m, p); + if (error < 0) + break; + if (unlikely(error)) { + error = 0; + m->count = 0; + } + if (seq_has_overflowed(m)) + goto Eoverflow; + if (pos + m->count > offset) { + m->from = offset - pos; + m->count -= m->from; + m->index = index; + break; + } + pos += m->count; + m->count = 0; + if (pos == offset) { + index++; + m->index = index; + break; + } + p = m->op->next(m, p, &index); + } + m->op->stop(m, p); + m->index = index; + return error; + +Eoverflow: + m->op->stop(m, p); + kvfree(m->buf); + m->count = 0; + m->buf = seq_buf_alloc(m->size <<= 1); + return !m->buf ? -ENOMEM : -EAGAIN; +} + +/** + * seq_read - ->read() method for sequential files. + * @file: the file to read from + * @buf: the buffer to read to + * @size: the maximum number of bytes to read + * @ppos: the current position in the file + * + * Ready-made ->f_op->read() + */ +ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + size_t copied = 0; + loff_t pos; + size_t n; + void *p; + int err = 0; + + mutex_lock(&m->lock); + + /* + * seq_file->op->..m_start/m_stop/m_next may do special actions + * or optimisations based on the file->f_version, so we want to + * pass the file->f_version to those methods. + * + * seq_file->version is just copy of f_version, and seq_file + * methods can treat it simply as file version. + * It is copied in first and copied out after all operations. + * It is convenient to have it as part of structure to avoid the + * need of passing another argument to all the seq_file methods. + */ + m->version = file->f_version; + + /* Don't assume *ppos is where we left it */ + if (unlikely(*ppos != m->read_pos)) { + while ((err = traverse(m, *ppos)) == -EAGAIN) + ; + if (err) { + /* With prejudice... */ + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + goto Done; + } else { + m->read_pos = *ppos; + } + } + + /* grab buffer if we didn't have one */ + if (!m->buf) { + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); + if (!m->buf) + goto Enomem; + } + /* if not empty - flush it first */ + if (m->count) { + n = min(m->count, size); + err = copy_to_user(buf, m->buf + m->from, n); + if (err) + goto Efault; + m->count -= n; + m->from += n; + size -= n; + buf += n; + copied += n; + if (!m->count) + m->index++; + if (!size) + goto Done; + } + /* we need at least one record in buffer */ + pos = m->index; + p = m->op->start(m, &pos); + while (1) { + err = PTR_ERR(p); + if (!p || IS_ERR(p)) + break; + err = m->op->show(m, p); + if (err < 0) + break; + if (unlikely(err)) + m->count = 0; + if (unlikely(!m->count)) { + p = m->op->next(m, p, &pos); + m->index = pos; + continue; + } + if (m->count < m->size) + goto Fill; + m->op->stop(m, p); + kvfree(m->buf); + m->count = 0; + m->buf = seq_buf_alloc(m->size <<= 1); + if (!m->buf) + goto Enomem; + m->version = 0; + pos = m->index; + p = m->op->start(m, &pos); + } + m->op->stop(m, p); + m->count = 0; + goto Done; +Fill: + /* they want more? let's try to get some more */ + while (m->count < size) { + size_t offs = m->count; + loff_t next = pos; + p = m->op->next(m, p, &next); + if (!p || IS_ERR(p)) { + err = PTR_ERR(p); + break; + } + err = m->op->show(m, p); + if (seq_has_overflowed(m) || err) { + m->count = offs; + if (likely(err <= 0)) + break; + } + pos = next; + } + m->op->stop(m, p); + n = min(m->count, size); + err = copy_to_user(buf, m->buf, n); + if (err) + goto Efault; + copied += n; + m->count -= n; + if (m->count) + m->from = n; + else + pos++; + m->index = pos; +Done: + if (!copied) + copied = err; + else { + *ppos += copied; + m->read_pos += copied; + } + file->f_version = m->version; + mutex_unlock(&m->lock); + return copied; +Enomem: + err = -ENOMEM; + goto Done; +Efault: + err = -EFAULT; + goto Done; +} +EXPORT_SYMBOL(seq_read); + +/** + * seq_lseek - ->llseek() method for sequential files. + * @file: the file in question + * @offset: new position + * @whence: 0 for absolute, 1 for relative position + * + * Ready-made ->f_op->llseek() + */ +loff_t seq_lseek(struct file *file, loff_t offset, int whence) +{ + struct seq_file *m = file->private_data; + loff_t retval = -EINVAL; + + mutex_lock(&m->lock); + m->version = file->f_version; + switch (whence) { + case SEEK_CUR: + offset += file->f_pos; + case SEEK_SET: + if (offset < 0) + break; + retval = offset; + if (offset != m->read_pos) { + while ((retval = traverse(m, offset)) == -EAGAIN) + ; + if (retval) { + /* with extreme prejudice... */ + file->f_pos = 0; + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + } else { + m->read_pos = offset; + retval = file->f_pos = offset; + } + } else { + file->f_pos = offset; + } + } + file->f_version = m->version; + mutex_unlock(&m->lock); + return retval; +} +EXPORT_SYMBOL(seq_lseek); + +/** + * seq_release - free the structures associated with sequential file. + * @file: file in question + * @inode: its inode + * + * Frees the structures associated with sequential file; can be used + * as ->f_op->release() if you don't have private data to destroy. + */ +int seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + kvfree(m->buf); + kfree(m); + return 0; +} +EXPORT_SYMBOL(seq_release); + +/** + * seq_escape - print string into buffer, escaping some characters + * @m: target buffer + * @s: string + * @esc: set of characters that need escaping + * + * Puts string into buffer, replacing each occurrence of character from + * @esc with usual octal escape. Returns 0 in case of success, -1 - in + * case of overflow. + */ +int seq_escape(struct seq_file *m, const char *s, const char *esc) +{ + char *end = m->buf + m->size; + char *p; + char c; + + for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { + if (!strchr(esc, c)) { + *p++ = c; + continue; + } + if (p + 3 < end) { + *p++ = '\\'; + *p++ = '0' + ((c & 0300) >> 6); + *p++ = '0' + ((c & 070) >> 3); + *p++ = '0' + (c & 07); + continue; + } + seq_set_overflow(m); + return -1; + } + m->count = p - m->buf; + return 0; +} +EXPORT_SYMBOL(seq_escape); + +int seq_vprintf(struct seq_file *m, const char *f, va_list args) +{ + int len; + + if (m->count < m->size) { + len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + seq_set_overflow(m); + return -1; +} +EXPORT_SYMBOL(seq_vprintf); + +int seq_printf(struct seq_file *m, const char *f, ...) +{ + int ret; + va_list args; + + va_start(args, f); + ret = seq_vprintf(m, f, args); + va_end(args); + + return ret; +} +EXPORT_SYMBOL(seq_printf); + +/** + * mangle_path - mangle and copy path to buffer beginning + * @s: buffer start + * @p: beginning of path in above buffer + * @esc: set of characters that need escaping + * + * Copy the path from @p to @s, replacing each occurrence of character from + * @esc with usual octal escape. + * Returns pointer past last written character in @s, or NULL in case of + * failure. + */ +char *mangle_path(char *s, const char *p, const char *esc) +{ + while (s <= p) { + char c = *p++; + if (!c) { + return s; + } else if (!strchr(esc, c)) { + *s++ = c; + } else if (s + 4 > p) { + break; + } else { + *s++ = '\\'; + *s++ = '0' + ((c & 0300) >> 6); + *s++ = '0' + ((c & 070) >> 3); + *s++ = '0' + (c & 07); + } + } + return NULL; +} +EXPORT_SYMBOL(mangle_path); + +/** + * seq_path - seq_file interface to print a pathname + * @m: the seq_file handle + * @path: the struct path to print + * @esc: set of characters to escape in the output + * + * return the absolute path of 'path', as represented by the + * dentry / mnt pair in the path parameter. + */ +int seq_path(struct seq_file *m, const struct path *path, const char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = d_path(path, buf, size); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + } + } + seq_commit(m, res); + + return res; +} +EXPORT_SYMBOL(seq_path); + +/* + * Same as seq_path, but relative to supplied root. + */ +int seq_path_root(struct seq_file *m, const struct path *path, + const struct path *root, const char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -ENAMETOOLONG; + + if (size) { + char *p; + + p = __d_path(path, root, buf, size); + if (!p) + return SEQ_SKIP; + res = PTR_ERR(p); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + else + res = -ENAMETOOLONG; + } + } + seq_commit(m, res); + + return res < 0 && res != -ENAMETOOLONG ? res : 0; +} + +/* + * returns the path of the 'dentry' from the root of its filesystem. + */ +int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = dentry_path(dentry, buf, size); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + } + } + seq_commit(m, res); + + return res; +} + +static void *single_start(struct seq_file *p, loff_t *pos) +{ + return NULL + (*pos == 0); +} + +static void *single_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void single_stop(struct seq_file *p, void *v) +{ +} + +int single_open(struct file *file, int (*show)(struct seq_file *, void *), + void *data) +{ + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + int res = -ENOMEM; + + if (op) { + op->start = single_start; + op->next = single_next; + op->stop = single_stop; + op->show = show; + res = seq_open(file, op); + if (!res) + ((struct seq_file *)file->private_data)->private = data; + else + kfree(op); + } + return res; +} +EXPORT_SYMBOL(single_open); + +int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), + void *data, size_t size) +{ + char *buf = seq_buf_alloc(size); + int ret; + if (!buf) + return -ENOMEM; + ret = single_open(file, show, data); + if (ret) { + kvfree(buf); + return ret; + } + ((struct seq_file *)file->private_data)->buf = buf; + ((struct seq_file *)file->private_data)->size = size; + return 0; +} +EXPORT_SYMBOL(single_open_size); + +int single_release(struct inode *inode, struct file *file) +{ + const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; + int res = seq_release(inode, file); + kfree(op); + return res; +} +EXPORT_SYMBOL(single_release); + +int seq_release_private(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + kfree(seq->private); + seq->private = NULL; + return seq_release(inode, file); +} +EXPORT_SYMBOL(seq_release_private); + +void *__seq_open_private(struct file *f, const struct seq_operations *ops, + int psize) +{ + int rc; + void *private; + struct seq_file *seq; + + private = kzalloc(psize, GFP_KERNEL); + if (private == NULL) + goto out; + + rc = seq_open(f, ops); + if (rc < 0) + goto out_free; + + seq = f->private_data; + seq->private = private; + return private; + +out_free: + kfree(private); +out: + return NULL; +} +EXPORT_SYMBOL(__seq_open_private); + +int seq_open_private(struct file *filp, const struct seq_operations *ops, + int psize) +{ + return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(seq_open_private); + +int seq_putc(struct seq_file *m, char c) +{ + if (m->count < m->size) { + m->buf[m->count++] = c; + return 0; + } + return -1; +} +EXPORT_SYMBOL(seq_putc); + +int seq_puts(struct seq_file *m, const char *s) +{ + int len = strlen(s); + if (m->count + len < m->size) { + memcpy(m->buf + m->count, s, len); + m->count += len; + return 0; + } + seq_set_overflow(m); + return -1; +} +EXPORT_SYMBOL(seq_puts); + +/* + * A helper routine for putting decimal numbers without rich format of printf(). + * only 'unsigned long long' is supported. + * This routine will put one byte delimiter + number into seq_file. + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. + */ +int seq_put_decimal_ull(struct seq_file *m, char delimiter, + unsigned long long num) +{ + int len; + + if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ + goto overflow; + + if (delimiter) + m->buf[m->count++] = delimiter; + + if (num < 10) { + m->buf[m->count++] = num + '0'; + return 0; + } + + len = num_to_str(m->buf + m->count, m->size - m->count, num); + if (!len) + goto overflow; + m->count += len; + return 0; +overflow: + seq_set_overflow(m); + return -1; +} +EXPORT_SYMBOL(seq_put_decimal_ull); + +int seq_put_decimal_ll(struct seq_file *m, char delimiter, + long long num) +{ + if (num < 0) { + if (m->count + 3 >= m->size) { + seq_set_overflow(m); + return -1; + } + if (delimiter) + m->buf[m->count++] = delimiter; + num = -num; + delimiter = '-'; + } + return seq_put_decimal_ull(m, delimiter, num); + +} +EXPORT_SYMBOL(seq_put_decimal_ll); + +/** + * seq_write - write arbitrary data to buffer + * @seq: seq_file identifying the buffer to which data should be written + * @data: data address + * @len: number of bytes + * + * Return 0 on success, non-zero otherwise. + */ +int seq_write(struct seq_file *seq, const void *data, size_t len) +{ + if (seq->count + len < seq->size) { + memcpy(seq->buf + seq->count, data, len); + seq->count += len; + return 0; + } + seq_set_overflow(seq); + return -1; +} +EXPORT_SYMBOL(seq_write); + +/** + * seq_pad - write padding spaces to buffer + * @m: seq_file identifying the buffer to which data should be written + * @c: the byte to append after padding if non-zero + */ +void seq_pad(struct seq_file *m, char c) +{ + int size = m->pad_until - m->count; + if (size > 0) + seq_printf(m, "%*s", size, ""); + if (c) + seq_putc(m, c); +} +EXPORT_SYMBOL(seq_pad); + +struct list_head *seq_list_start(struct list_head *head, loff_t pos) +{ + struct list_head *lh; + + list_for_each(lh, head) + if (pos-- == 0) + return lh; + + return NULL; +} +EXPORT_SYMBOL(seq_list_start); + +struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) +{ + if (!pos) + return head; + + return seq_list_start(head, pos - 1); +} +EXPORT_SYMBOL(seq_list_start_head); + +struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) +{ + struct list_head *lh; + + lh = ((struct list_head *)v)->next; + ++*ppos; + return lh == head ? NULL : lh; +} +EXPORT_SYMBOL(seq_list_next); + +/** + * seq_hlist_start - start an iteration of a hlist + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). + */ +struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) +{ + struct hlist_node *node; + + hlist_for_each(node, head) + if (pos-- == 0) + return node; + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start); + +/** + * seq_hlist_start_head - start an iteration of a hlist + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). Call this function if you want to + * print a header at the top of the output. + */ +struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) +{ + if (!pos) + return SEQ_START_TOKEN; + + return seq_hlist_start(head, pos - 1); +} +EXPORT_SYMBOL(seq_hlist_start_head); + +/** + * seq_hlist_next - move to the next position of the hlist + * @v: the current iterator + * @head: the head of the hlist + * @ppos: the current position + * + * Called at seq_file->op->next(). + */ +struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, + loff_t *ppos) +{ + struct hlist_node *node = v; + + ++*ppos; + if (v == SEQ_START_TOKEN) + return head->first; + else + return node->next; +} +EXPORT_SYMBOL(seq_hlist_next); + +/** + * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, + loff_t pos) +{ + struct hlist_node *node; + + __hlist_for_each_rcu(node, head) + if (pos-- == 0) + return node; + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start_rcu); + +/** + * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). Call this function if you want to + * print a header at the top of the output. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, + loff_t pos) +{ + if (!pos) + return SEQ_START_TOKEN; + + return seq_hlist_start_rcu(head, pos - 1); +} +EXPORT_SYMBOL(seq_hlist_start_head_rcu); + +/** + * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU + * @v: the current iterator + * @head: the head of the hlist + * @ppos: the current position + * + * Called at seq_file->op->next(). + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +struct hlist_node *seq_hlist_next_rcu(void *v, + struct hlist_head *head, + loff_t *ppos) +{ + struct hlist_node *node = v; + + ++*ppos; + if (v == SEQ_START_TOKEN) + return rcu_dereference(head->first); + else + return rcu_dereference(node->next); +} +EXPORT_SYMBOL(seq_hlist_next_rcu); + +/** + * seq_hlist_start_precpu - start an iteration of a percpu hlist array + * @head: pointer to percpu array of struct hlist_heads + * @cpu: pointer to cpu "cursor" + * @pos: start position of sequence + * + * Called at seq_file->op->start(). + */ +struct hlist_node * +seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos) +{ + struct hlist_node *node; + + for_each_possible_cpu(*cpu) { + hlist_for_each(node, per_cpu_ptr(head, *cpu)) { + if (pos-- == 0) + return node; + } + } + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start_percpu); + +/** + * seq_hlist_next_percpu - move to the next position of the percpu hlist array + * @v: pointer to current hlist_node + * @head: pointer to percpu array of struct hlist_heads + * @cpu: pointer to cpu "cursor" + * @pos: start position of sequence + * + * Called at seq_file->op->next(). + */ +struct hlist_node * +seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, + int *cpu, loff_t *pos) +{ + struct hlist_node *node = v; + + ++*pos; + + if (node->next) + return node->next; + + for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids; + *cpu = cpumask_next(*cpu, cpu_possible_mask)) { + struct hlist_head *bucket = per_cpu_ptr(head, *cpu); + + if (!hlist_empty(bucket)) + return bucket->first; + } + return NULL; +} +EXPORT_SYMBOL(seq_hlist_next_percpu); diff --git a/kernel/fs/signalfd.c b/kernel/fs/signalfd.c new file mode 100644 index 000000000..7e412ad74 --- /dev/null +++ b/kernel/fs/signalfd.c @@ -0,0 +1,342 @@ +/* + * fs/signalfd.c + * + * Copyright (C) 2003 Linus Torvalds + * + * Mon Mar 5, 2007: Davide Libenzi + * Changed ->read() to return a siginfo strcture instead of signal number. + * Fixed locking in ->poll(). + * Added sighand-detach notification. + * Added fd re-use in sys_signalfd() syscall. + * Now using anonymous inode source. + * Thanks to Oleg Nesterov for useful code review and suggestions. + * More comments and suggestions from Arnd Bergmann. + * Sat May 19, 2007: Davi E. M. Arnaut + * Retrieve multiple signals with one read() call + * Sun Jul 15, 2007: Davide Libenzi + * Attach to the sighand only during read() and poll(). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + /* + * The lockless check can race with remove_wait_queue() in progress, + * but in this case its caller should run under rcu_read_lock() and + * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + */ + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + +struct signalfd_ctx { + sigset_t sigmask; +}; + +static int signalfd_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static unsigned int signalfd_poll(struct file *file, poll_table *wait) +{ + struct signalfd_ctx *ctx = file->private_data; + unsigned int events = 0; + + poll_wait(file, ¤t->sighand->signalfd_wqh, wait); + + spin_lock_irq(¤t->sighand->siglock); + if (next_signal(¤t->pending, &ctx->sigmask) || + next_signal(¤t->signal->shared_pending, + &ctx->sigmask)) + events |= POLLIN; + spin_unlock_irq(¤t->sighand->siglock); + + return events; +} + +/* + * Copied from copy_siginfo_to_user() in kernel/signal.c + */ +static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, + siginfo_t const *kinfo) +{ + long err; + + BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128); + + /* + * Unused members should be zero ... + */ + err = __clear_user(uinfo, sizeof(*uinfo)); + + /* + * If you change siginfo_t structure, please be sure + * this code is fixed accordingly. + */ + err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo); + err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno); + err |= __put_user((short) kinfo->si_code, &uinfo->ssi_code); + switch (kinfo->si_code & __SI_MASK) { + case __SI_KILL: + err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); + err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); + break; + case __SI_TIMER: + err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); + err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); + err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); + break; + case __SI_POLL: + err |= __put_user(kinfo->si_band, &uinfo->ssi_band); + err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd); + break; + case __SI_FAULT: + err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr); +#ifdef __ARCH_SI_TRAPNO + err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); +#endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitly for the right codes here. + */ + if (kinfo->si_code == BUS_MCEERR_AR || + kinfo->si_code == BUS_MCEERR_AO) + err |= __put_user((short) kinfo->si_addr_lsb, + &uinfo->ssi_addr_lsb); +#endif + break; + case __SI_CHLD: + err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); + err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); + err |= __put_user(kinfo->si_status, &uinfo->ssi_status); + err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime); + err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime); + break; + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: /* But this is */ + err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); + err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); + err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); + break; + default: + /* + * This case catches also the signals queued by sigqueue(). + */ + err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); + err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); + err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); + break; + } + + return err ? -EFAULT: sizeof(*uinfo); +} + +static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info, + int nonblock) +{ + ssize_t ret; + DECLARE_WAITQUEUE(wait, current); + + spin_lock_irq(¤t->sighand->siglock); + ret = dequeue_signal(current, &ctx->sigmask, info); + switch (ret) { + case 0: + if (!nonblock) + break; + ret = -EAGAIN; + default: + spin_unlock_irq(¤t->sighand->siglock); + return ret; + } + + add_wait_queue(¤t->sighand->signalfd_wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + ret = dequeue_signal(current, &ctx->sigmask, info); + if (ret != 0) + break; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + spin_unlock_irq(¤t->sighand->siglock); + schedule(); + spin_lock_irq(¤t->sighand->siglock); + } + spin_unlock_irq(¤t->sighand->siglock); + + remove_wait_queue(¤t->sighand->signalfd_wqh, &wait); + __set_current_state(TASK_RUNNING); + + return ret; +} + +/* + * Returns a multiple of the size of a "struct signalfd_siginfo", or a negative + * error code. The "count" parameter must be at least the size of a + * "struct signalfd_siginfo". + */ +static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct signalfd_ctx *ctx = file->private_data; + struct signalfd_siginfo __user *siginfo; + int nonblock = file->f_flags & O_NONBLOCK; + ssize_t ret, total = 0; + siginfo_t info; + + count /= sizeof(struct signalfd_siginfo); + if (!count) + return -EINVAL; + + siginfo = (struct signalfd_siginfo __user *) buf; + do { + ret = signalfd_dequeue(ctx, &info, nonblock); + if (unlikely(ret <= 0)) + break; + ret = signalfd_copyinfo(siginfo, &info); + if (ret < 0) + break; + siginfo++; + total += ret; + nonblock = 1; + } while (--count); + + return total ? total: ret; +} + +#ifdef CONFIG_PROC_FS +static void signalfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct signalfd_ctx *ctx = f->private_data; + sigset_t sigmask; + + sigmask = ctx->sigmask; + signotset(&sigmask); + render_sigset_t(m, "sigmask:\t", &sigmask); +} +#endif + +static const struct file_operations signalfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = signalfd_show_fdinfo, +#endif + .release = signalfd_release, + .poll = signalfd_poll, + .read = signalfd_read, + .llseek = noop_llseek, +}; + +SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, + size_t, sizemask, int, flags) +{ + sigset_t sigmask; + struct signalfd_ctx *ctx; + + /* Check the SFD_* constants for consistency. */ + BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK)) + return -EINVAL; + + if (sizemask != sizeof(sigset_t) || + copy_from_user(&sigmask, user_mask, sizeof(sigmask))) + return -EINVAL; + sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + signotset(&sigmask); + + if (ufd == -1) { + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->sigmask = sigmask; + + /* + * When we call this, the initialization must be complete, since + * anon_inode_getfd() will install the fd. + */ + ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, + O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK))); + if (ufd < 0) + kfree(ctx); + } else { + struct fd f = fdget(ufd); + if (!f.file) + return -EBADF; + ctx = f.file->private_data; + if (f.file->f_op != &signalfd_fops) { + fdput(f); + return -EINVAL; + } + spin_lock_irq(¤t->sighand->siglock); + ctx->sigmask = sigmask; + spin_unlock_irq(¤t->sighand->siglock); + + wake_up(¤t->sighand->signalfd_wqh); + fdput(f); + } + + return ufd; +} + +SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, + size_t, sizemask) +{ + return sys_signalfd4(ufd, user_mask, sizemask, 0); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, + const compat_sigset_t __user *,sigmask, + compat_size_t, sigsetsize, + int, flags) +{ + compat_sigset_t ss32; + sigset_t tmp; + sigset_t __user *ksigmask; + + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&tmp, &ss32); + ksigmask = compat_alloc_user_space(sizeof(sigset_t)); + if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) + return -EFAULT; + + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); +} + +COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd, + const compat_sigset_t __user *,sigmask, + compat_size_t, sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} +#endif diff --git a/kernel/fs/splice.c b/kernel/fs/splice.c new file mode 100644 index 000000000..bfe62ae40 --- /dev/null +++ b/kernel/fs/splice.c @@ -0,0 +1,2034 @@ +/* + * "splice": joining two ropes together by interweaving their strands. + * + * This is the "extended pipe" functionality, where a pipe is used as + * an arbitrary in-memory buffer. Think of a pipe as a small kernel + * buffer that you can use to transfer data from one end to the other. + * + * The traditional unix read/write is extended with a "splice()" operation + * that transfers data buffers to or from a pipe buffer. + * + * Named by Larry McVoy, original implementation from Linus, extended by + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. + * + * Copyright (C) 2005-2006 Jens Axboe + * Copyright (C) 2005-2006 Linus Torvalds + * Copyright (C) 2006 Ingo Molnar + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Attempt to steal a page from a pipe buffer. This should perhaps go into + * a vm helper function, it's already simplified quite a bit by the + * addition of remove_mapping(). If success is returned, the caller may + * attempt to reuse this page for another destination. + */ +static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + struct address_space *mapping; + + lock_page(page); + + mapping = page_mapping(page); + if (mapping) { + WARN_ON(!PageUptodate(page)); + + /* + * At least for ext2 with nobh option, we need to wait on + * writeback completing on this page, since we'll remove it + * from the pagecache. Otherwise truncate wont wait on the + * page, allowing the disk blocks to be reused by someone else + * before we actually wrote our data to them. fs corruption + * ensues. + */ + wait_on_page_writeback(page); + + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + goto out_unlock; + + /* + * If we succeeded in removing the mapping, set LRU flag + * and return good. + */ + if (remove_mapping(mapping, page)) { + buf->flags |= PIPE_BUF_FLAG_LRU; + return 0; + } + } + + /* + * Raced with truncate or failed to remove page from current + * address space, unlock and return failure. + */ +out_unlock: + unlock_page(page); + return 1; +} + +static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); + buf->flags &= ~PIPE_BUF_FLAG_LRU; +} + +/* + * Check whether the contents of buf is OK to access. Since the content + * is a page cache page, IO may be in flight. + */ +static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + int err; + + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page. + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * Uh oh, read-error from disk. + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } + + /* + * Page is ok afterall, we are done. + */ + unlock_page(page); + } + + return 0; +error: + unlock_page(page); + return err; +} + +const struct pipe_buf_operations page_cache_pipe_buf_ops = { + .can_merge = 0, + .confirm = page_cache_pipe_buf_confirm, + .release = page_cache_pipe_buf_release, + .steal = page_cache_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) + return 1; + + buf->flags |= PIPE_BUF_FLAG_LRU; + return generic_pipe_buf_steal(pipe, buf); +} + +static const struct pipe_buf_operations user_page_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = page_cache_pipe_buf_release, + .steal = user_page_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static void wakeup_pipe_readers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +} + +/** + * splice_to_pipe - fill passed data into a pipe + * @pipe: pipe to fill + * @spd: data to fill + * + * Description: + * @spd contains a map of pages and len/offset tuples, along with + * the struct pipe_buf_operations associated with these pages. This + * function will link that data to the pipe. + * + */ +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + unsigned int spd_pages = spd->nr_pages; + int ret, do_wakeup, page_nr; + + ret = 0; + do_wakeup = 0; + page_nr = 0; + + pipe_lock(pipe); + + for (;;) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; + buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->files) + do_wakeup = 1; + + if (!--spd->nr_pages) + break; + if (pipe->nrbufs < pipe->buffers) + continue; + + break; + } + + if (spd->flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + pipe_unlock(pipe); + + if (do_wakeup) + wakeup_pipe_readers(pipe); + + while (page_nr < spd_pages) + spd->spd_release(spd, page_nr++); + + return ret; +} + +void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +{ + page_cache_release(spd->pages[i]); +} + +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + unsigned int buffers = ACCESS_ONCE(pipe->buffers); + + spd->nr_pages_max = buffers; + if (buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct splice_pipe_desc *spd) +{ + if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + +static int +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *page; + pgoff_t index, end_index; + loff_t isize; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, spd.nr_pages_max); + + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); + index += spd.nr_pages; + + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * readahead/allocate the rest and fill in the holes. + */ + if (spd.nr_pages < nr_pages) + page_cache_sync_readahead(mapping, &in->f_ra, in, + index, req_pages - spd.nr_pages); + + error = 0; + while (spd.nr_pages < nr_pages) { + /* + * Page could be there, find_get_pages_contig() breaks on + * the first hole. + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * page didn't exist, allocate one. + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL); + if (unlikely(error)) { + page_cache_release(page); + if (error == -EEXIST) + continue; + break; + } + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); + } + + spd.pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. + */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (PageReadahead(page)) + page_cache_async_readahead(mapping, &in->f_ra, in, + page, index, req_pages - page_nr); + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. + */ + if (!page->mapping) { + unlock_page(page); + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); + if (unlikely(error)) { + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ + if (error == AOP_TRUNCATED_PAGE) + error = 0; + + break; + } + } +fill_it: + /* + * i_size must be checked after PageUptodate. + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + unsigned int plen; + + /* + * max good bytes in this page + */ + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) + break; + + /* + * force quit after adding this page + */ + this_len = min(this_len, plen - loff); + len = this_len; + } + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; + len -= this_len; + loff = 0; + spd.nr_pages++; + index++; + } + + /* + * Release any pages at the end, if we quit early. 'page_nr' is how far + * we got, 'nr_pages' is how many pages are in the map. + */ + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(&spd); + return error; +} + +/** + * generic_file_splice_read - splice data from file to a pipe + * @in: file to splice from + * @ppos: position in @in + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given file and fill them into a pipe. Can be + * used as long as the address_space operations for the source implements + * a readpage() hook. + * + */ +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + loff_t isize, left; + int ret; + + if (IS_DAX(in->f_mapping->host)) + return default_file_splice_read(in, ppos, pipe, len, flags); + + isize = i_size_read(in->f_mapping->host); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + if (ret > 0) { + *ppos += ret; + file_accessed(in); + } + + return ret; +} +EXPORT_SYMBOL(generic_file_splice_read); + +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +/* Pipe buffer operations for a socket and similar. */ +const struct pipe_buf_operations nosteal_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_nosteal, + .get = generic_pipe_buf_get, +}; +EXPORT_SYMBOL(nosteal_pipe_buf_ops); + +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +} + +ssize_t kernel_write(struct file *file, const char *buf, size_t count, + loff_t pos) +{ + mm_segment_t old_fs; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_write(file, (__force const char __user *)buf, count, &pos); + set_fs(old_fs); + + return res; +} +EXPORT_SYMBOL(kernel_write); + +ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { + vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } + + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { + struct page *page; + + page = alloc_page(GFP_USER); + error = -ENOMEM; + if (!page) + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) page_address(page); + vec[i].iov_len = this_len; + spd.pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } + + res = kernel_readv(in, vec, spd.nr_pages, *ppos); + if (res < 0) { + error = res; + goto err; + } + + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + this_len = min_t(size_t, vec[i].iov_len, res); + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; + if (!this_len) { + __free_page(spd.pages[i]); + spd.pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; + + res = splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; + +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(&spd); + return res; + +err: + for (i = 0; i < spd.nr_pages; i++) + __free_page(spd.pages[i]); + + res = error; + goto shrink_ret; +} +EXPORT_SYMBOL(default_file_splice_read); + +/* + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' + * using sendpage(). Return the number of bytes sent. + */ +static int pipe_to_sendpage(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, struct splice_desc *sd) +{ + struct file *file = sd->u.file; + loff_t pos = sd->pos; + int more; + + if (!likely(file->f_op->sendpage)) + return -EINVAL; + + more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; + + if (sd->len < sd->total_len && pipe->nrbufs > 1) + more |= MSG_SENDPAGE_NOTLAST; + + return file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); +} + +static void wakeup_pipe_writers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +} + +/** + * splice_from_pipe_feed - feed available data from a pipe to a file + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function loops over the pipe and calls @actor to do the + * actual moving of a single struct pipe_buffer to the desired + * destination. It returns when there's no more buffers left in + * the pipe or if the requested number of bytes (@sd->total_len) + * have been copied. It returns a positive number (one) if the + * pipe needs to be filled with more data, zero if the required + * number of bytes have been copied and -errno on error. + * + * This, together with splice_from_pipe_{begin,end,next}, may be + * used to implement the functionality of __splice_from_pipe() when + * locking is required around copying the pipe buffers to the + * destination. + */ +static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + while (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + const struct pipe_buf_operations *ops = buf->ops; + + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + return ret; + } + + ret = actor(pipe, buf, sd); + if (ret <= 0) + return ret; + + buf->offset += ret; + buf->len -= ret; + + sd->num_spliced += ret; + sd->len -= ret; + sd->pos += ret; + sd->total_len -= ret; + + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->files) + sd->need_wakeup = true; + } + + if (!sd->total_len) + return 0; + } + + return 1; +} + +/** + * splice_from_pipe_next - wait for some data to splice from + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wait for some data and return a positive + * value (one) if pipe buffers are available. It will return zero + * or -errno if no more data needs to be spliced. + */ +static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + while (!pipe->nrbufs) { + if (!pipe->writers) + return 0; + + if (!pipe->waiting_writers && sd->num_spliced) + return 0; + + if (sd->flags & SPLICE_F_NONBLOCK) + return -EAGAIN; + + if (signal_pending(current)) + return -ERESTARTSYS; + + if (sd->need_wakeup) { + wakeup_pipe_writers(pipe); + sd->need_wakeup = false; + } + + pipe_wait(pipe); + } + + return 1; +} + +/** + * splice_from_pipe_begin - start splicing from pipe + * @sd: information about the splice operation + * + * Description: + * This function should be called before a loop containing + * splice_from_pipe_next() and splice_from_pipe_feed() to + * initialize the necessary fields of @sd. + */ +static void splice_from_pipe_begin(struct splice_desc *sd) +{ + sd->num_spliced = 0; + sd->need_wakeup = false; +} + +/** + * splice_from_pipe_end - finish splicing from pipe + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wake up pipe writers if necessary. It should + * be called after a loop containing splice_from_pipe_next() and + * splice_from_pipe_feed(). + */ +static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + if (sd->need_wakeup) + wakeup_pipe_writers(pipe); +} + +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * + */ +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + splice_from_pipe_begin(sd); + do { + ret = splice_from_pipe_next(pipe, sd); + if (ret > 0) + ret = splice_from_pipe_feed(pipe, sd, actor); + } while (ret > 0); + splice_from_pipe_end(pipe, sd); + + return sd->num_spliced ? sd->num_spliced : ret; +} +EXPORT_SYMBOL(__splice_from_pipe); + +/** + * splice_from_pipe - splice data from a pipe to a file + * @pipe: pipe to splice from + * @out: file to splice to + * @ppos: position in @out + * @len: how many bytes to splice + * @flags: splice modifier flags + * @actor: handler that splices the data + * + * Description: + * See __splice_from_pipe. This function locks the pipe inode, + * otherwise it's identical to __splice_from_pipe(). + * + */ +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) +{ + ssize_t ret; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, actor); + pipe_unlock(pipe); + + return ret; +} + +/** + * iter_file_splice_write - splice data from a pipe to a file + * @pipe: pipe info + * @out: file to write to + * @ppos: position in @out + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. + * This one is ->write_iter-based. + * + */ +ssize_t +iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + int nbufs = pipe->buffers; + struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); + ssize_t ret; + + if (unlikely(!array)) + return -ENOMEM; + + pipe_lock(pipe); + + splice_from_pipe_begin(&sd); + while (sd.total_len) { + struct iov_iter from; + size_t left; + int n, idx; + + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + if (unlikely(nbufs < pipe->buffers)) { + kfree(array); + nbufs = pipe->buffers; + array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); + if (!array) { + ret = -ENOMEM; + break; + } + } + + /* build the vector */ + left = sd.total_len; + for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { + struct pipe_buffer *buf = pipe->bufs + idx; + size_t this_len = buf->len; + + if (this_len > left) + this_len = left; + + if (idx == pipe->buffers - 1) + idx = -1; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + goto done; + } + + array[n].bv_page = buf->page; + array[n].bv_len = this_len; + array[n].bv_offset = buf->offset; + left -= this_len; + } + + iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, + sd.total_len - left); + ret = vfs_iter_write(out, &from, &sd.pos); + if (ret <= 0) + break; + + sd.num_spliced += ret; + sd.total_len -= ret; + *ppos = sd.pos; + + /* dismiss the fully eaten buffers, adjust the partial one */ + while (ret) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + if (ret >= buf->len) { + const struct pipe_buf_operations *ops = buf->ops; + ret -= buf->len; + buf->len = 0; + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->files) + sd.need_wakeup = true; + } else { + buf->offset += ret; + buf->len -= ret; + ret = 0; + } + } + } +done: + kfree(array); + splice_from_pipe_end(pipe, &sd); + + pipe_unlock(pipe); + + if (sd.num_spliced) + ret = sd.num_spliced; + + return ret; +} + +EXPORT_SYMBOL(iter_file_splice_write); + +static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + int ret; + void *data; + loff_t tmp = sd->pos; + + data = kmap(buf->page); + ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); + kunmap(buf->page); + + return ret; +} + +static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret; + + ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); + if (ret > 0) + *ppos += ret; + + return ret; +} + +/** + * generic_splice_sendpage - splice data from a pipe to a socket + * @pipe: pipe to splice from + * @out: socket to write to + * @ppos: position in @out + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. + * + */ +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); +} + +EXPORT_SYMBOL(generic_splice_sendpage); + +/* + * Attempt to initiate a splice from pipe to file. + */ +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); + + if (out->f_op->splice_write) + splice_write = out->f_op->splice_write; + else + splice_write = default_file_splice_write; + + return splice_write(pipe, out, ppos, len, flags); +} + +/* + * Attempt to initiate a splice from a file to a pipe. + */ +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); + int ret; + + if (unlikely(!(in->f_mode & FMODE_READ))) + return -EBADF; + + ret = rw_verify_area(READ, in, ppos, len); + if (unlikely(ret < 0)) + return ret; + + if (in->f_op->splice_read) + splice_read = in->f_op->splice_read; + else + splice_read = default_file_splice_read; + + return splice_read(in, ppos, pipe, len, flags); +} + +/** + * splice_direct_to_actor - splices data directly between two non-pipes + * @in: file to splice from + * @sd: actor information on where to splice to + * @actor: handles the data splicing + * + * Description: + * This is a special case helper to splice directly between two + * points, without requiring an explicit pipe. Internally an allocated + * pipe is cached in the process, and reused during the lifetime of + * that process. + * + */ +ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, + splice_direct_actor *actor) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + umode_t i_mode; + size_t len; + int i, flags, more; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = file_inode(in)->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (unlikely(!pipe)) { + pipe = alloc_pipe_info(); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the splice_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * Do the splice. + */ + ret = 0; + bytes = 0; + len = sd->total_len; + flags = sd->flags; + + /* + * Don't block on output, we have to drain the direct pipe. + */ + sd->flags &= ~SPLICE_F_NONBLOCK; + more = sd->flags & SPLICE_F_MORE; + + while (len) { + size_t read_len; + loff_t pos = sd->pos, prev_pos = pos; + + ret = do_splice_to(in, &pos, pipe, len, flags); + if (unlikely(ret <= 0)) + goto out_release; + + read_len = ret; + sd->total_len = read_len; + + /* + * If more data is pending, set SPLICE_F_MORE + * If this is the last data and SPLICE_F_MORE was not set + * initially, clears it. + */ + if (read_len < len) + sd->flags |= SPLICE_F_MORE; + else if (!more) + sd->flags &= ~SPLICE_F_MORE; + /* + * NOTE: nonblocking mode only applies to the input. We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = actor(pipe, sd); + if (unlikely(ret <= 0)) { + sd->pos = prev_pos; + goto out_release; + } + + bytes += ret; + len -= ret; + sd->pos = pos; + + if (ret < read_len) { + sd->pos = prev_pos + ret; + goto out_release; + } + } + +done: + pipe->nrbufs = pipe->curbuf = 0; + file_accessed(in); + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < pipe->buffers; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + + if (!bytes) + bytes = ret; + + goto done; +} +EXPORT_SYMBOL(splice_direct_to_actor); + +static int direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + + return do_splice_from(pipe, file, sd->opos, sd->total_len, + sd->flags); +} + +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + */ +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags) +{ + struct splice_desc sd = { + .len = len, + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + .opos = opos, + }; + long ret; + + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, opos, len); + if (unlikely(ret < 0)) + return ret; + + ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + if (ret > 0) + *ppos = sd.pos; + + return ret; +} +EXPORT_SYMBOL(do_splice_direct); + +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); + +/* + * Determine where to splice to/from. + */ +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; + loff_t offset; + long ret; + + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); + + if (ipipe && opipe) { + if (off_in || off_out) + return -ESPIPE; + + if (!(in->f_mode & FMODE_READ)) + return -EBADF; + + if (!(out->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Splicing to self would be fun, but... */ + if (ipipe == opipe) + return -EINVAL; + + return splice_pipe_to_pipe(ipipe, opipe, len, flags); + } + + if (ipipe) { + if (off_in) + return -ESPIPE; + if (off_out) { + if (!(out->f_mode & FMODE_PWRITE)) + return -EINVAL; + if (copy_from_user(&offset, off_out, sizeof(loff_t))) + return -EFAULT; + } else { + offset = out->f_pos; + } + + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, &offset, len); + if (unlikely(ret < 0)) + return ret; + + file_start_write(out); + ret = do_splice_from(ipipe, out, &offset, len, flags); + file_end_write(out); + + if (!off_out) + out->f_pos = offset; + else if (copy_to_user(off_out, &offset, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } + + if (opipe) { + if (off_out) + return -ESPIPE; + if (off_in) { + if (!(in->f_mode & FMODE_PREAD)) + return -EINVAL; + if (copy_from_user(&offset, off_in, sizeof(loff_t))) + return -EFAULT; + } else { + offset = in->f_pos; + } + + ret = do_splice_to(in, &offset, opipe, len, flags); + + if (!off_in) + in->f_pos = offset; + else if (copy_to_user(off_in, &offset, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } + + return -EINVAL; +} + +/* + * Map an iov into an array of pages and offset/length tupples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our ones pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial, bool aligned, + unsigned int pipe_buffers) +{ + int buffers = 0, error = 0; + + while (nr_vecs) { + unsigned long off, npages; + struct iovec entry; + void __user *base; + size_t len; + int i; + + error = -EFAULT; + if (copy_from_user(&entry, iov, sizeof(entry))) + break; + + base = entry.iov_base; + len = entry.iov_len; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + error = 0; + if (unlikely(!len)) + break; + error = -EFAULT; + if (!access_ok(VERIFY_READ, base, len)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE. + */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; + + error = get_user_pages_fast((unsigned long)base, npages, + 0, &pages[buffers]); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. + */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE - off); + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. + */ + if (error < npages || buffers == pipe_buffers) + break; + + nr_vecs--; + iov++; + } + + if (buffers) + return buffers; + + return error; +} + +static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); + return n == sd->len ? n : -EFAULT; +} + +/* + * For lack of a better implementation, implement vmsplice() to userspace + * as a simple copy of the pipes pages to the user iov. + */ +static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct splice_desc sd; + long ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + ret = import_iovec(READ, uiov, nr_segs, + ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + return ret; + + sd.total_len = iov_iter_count(&iter); + sd.len = 0; + sd.flags = flags; + sd.u.data = &iter; + sd.pos = 0; + + if (sd.total_len) { + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, pipe_to_user); + pipe_unlock(pipe); + } + + kfree(iov); + return ret; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + */ +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + .spd_release = spd_release_page, + }; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, false, + spd.nr_pages_max); + if (spd.nr_pages <= 0) + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(&spd); + return ret; +} + +/* + * Note that vmsplice only really supports true splicing _from_ user memory + * to a pipe, not the other way around. Splicing from user memory is a simple + * operation that can be supported without any funky alignment restrictions + * or nasty vm tricks. We simply map in the user memory and fill them into + * a pipe. The reverse isn't quite as easy, though. There are two possible + * solutions for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Currently we punt and implement it as a normal copy, see pipe_to_user(). + * + */ +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, + unsigned long, nr_segs, unsigned int, flags) +{ + struct fd f; + long error; + + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + error = -EBADF; + f = fdget(fd); + if (f.file) { + if (f.file->f_mode & FMODE_WRITE) + error = vmsplice_to_pipe(f.file, iov, nr_segs, flags); + else if (f.file->f_mode & FMODE_READ) + error = vmsplice_to_user(f.file, iov, nr_segs, flags); + + fdput(f); + } + + return error; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, + unsigned int, nr_segs, unsigned int, flags) +{ + unsigned i; + struct iovec __user *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} +#endif + +SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + size_t, len, unsigned int, flags) +{ + struct fd in, out; + long error; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fdget(fd_in); + if (in.file) { + if (in.file->f_mode & FMODE_READ) { + out = fdget(fd_out); + if (out.file) { + if (out.file->f_mode & FMODE_WRITE) + error = do_splice(in.file, off_in, + out.file, off_out, + len, flags); + fdput(out); + } + } + fdput(in); + } + return error; +} + +/* + * Make sure there's data to read. Wait for input if we can, otherwise + * return an appropriate error. + */ +static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs) + return 0; + + ret = 0; + pipe_lock(pipe); + + while (!pipe->nrbufs) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + pipe_wait(pipe); + } + + pipe_unlock(pipe); + return ret; +} + +/* + * Make sure there's writeable room. Wait for room if we can, otherwise + * return an appropriate error. + */ +static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs < pipe->buffers) + return 0; + + ret = 0; + pipe_lock(pipe); + + while (pipe->nrbufs >= pipe->buffers) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; + } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + pipe_unlock(pipe); + return ret; +} + +/* + * Splice contents of ipipe to opipe. + */ +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, nbuf; + bool input_wakeup = false; + + +retry: + ret = ipipe_prep(ipipe, flags); + if (ret) + return ret; + + ret = opipe_prep(opipe, flags); + if (ret) + return ret; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (!ipipe->nrbufs && !ipipe->writers) + break; + + /* + * Cannot make any progress, because either the input + * pipe is empty or the output pipe is full. + */ + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { + /* Already processed some buffers, break */ + if (ret) + break; + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + /* + * We raced with another reader/writer and haven't + * managed to process any buffers. A zero return + * value means EOF, so retry instead. + */ + pipe_unlock(ipipe); + pipe_unlock(opipe); + goto retry; + } + + ibuf = ipipe->bufs + ipipe->curbuf; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + obuf = opipe->bufs + nbuf; + + if (len >= ibuf->len) { + /* + * Simply move the whole buffer from ipipe to opipe + */ + *obuf = *ibuf; + ibuf->ops = NULL; + opipe->nrbufs++; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); + ipipe->nrbufs--; + input_wakeup = true; + } else { + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + obuf->len = len; + opipe->nrbufs++; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + ret += obuf->len; + len -= obuf->len; + } while (len); + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) + wakeup_pipe_readers(opipe); + + if (input_wakeup) + wakeup_pipe_writers(ipipe); + + return ret; +} + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, i = 0, nbuf; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + /* + * If we have iterated all input buffers or ran out of + * output room, break. + */ + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) + break; + + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + ret += obuf->len; + len -= obuf->len; + i++; + } while (len); + + /* + * return EAGAIN if we have the potential of some data in the + * future, otherwise just return 0 + */ + if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) + wakeup_pipe_readers(opipe); + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); + int ret = -EINVAL; + + /* + * Duplicate the contents of ipipe to opipe without actually + * copying the data. + */ + if (ipipe && opipe && ipipe != opipe) { + /* + * Keep going, unless we encounter an error. The ipipe/opipe + * ordering doesn't really matter. + */ + ret = ipipe_prep(ipipe, flags); + if (!ret) { + ret = opipe_prep(opipe, flags); + if (!ret) + ret = link_pipe(ipipe, opipe, len, flags); + } + } + + return ret; +} + +SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) +{ + struct fd in; + int error; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fdget(fdin); + if (in.file) { + if (in.file->f_mode & FMODE_READ) { + struct fd out = fdget(fdout); + if (out.file) { + if (out.file->f_mode & FMODE_WRITE) + error = do_tee(in.file, out.file, + len, flags); + fdput(out); + } + } + fdput(in); + } + + return error; +} diff --git a/kernel/fs/squashfs/Kconfig b/kernel/fs/squashfs/Kconfig new file mode 100644 index 000000000..ffb093e72 --- /dev/null +++ b/kernel/fs/squashfs/Kconfig @@ -0,0 +1,210 @@ +config SQUASHFS + tristate "SquashFS 4.0 - Squashed file system support" + depends on BLOCK + help + Saying Y here includes support for SquashFS 4.0 (a Compressed + Read-Only File System). Squashfs is a highly compressed read-only + filesystem for Linux. It uses zlib, lzo or xz compression to + compress both files, inodes and directories. Inodes in the system + are very small and all blocks are packed to minimise data overhead. + Block sizes greater than 4K are supported up to a maximum of 1 Mbytes + (default block size 128K). SquashFS 4.0 supports 64 bit filesystems + and files (larger than 4GB), full uid/gid information, hard links and + timestamps. + + Squashfs is intended for general read-only filesystem use, for + archival use (i.e. in cases where a .tar.gz file may be used), and in + embedded systems where low overhead is needed. Further information + and tools are available from http://squashfs.sourceforge.net. + + If you want to compile this as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here. The module will be called squashfs. Note that the root + file system (the one containing the directory /) cannot be compiled + as a module. + + If unsure, say N. + +choice + prompt "File decompression options" + depends on SQUASHFS + help + Squashfs now supports two options for decompressing file + data. Traditionally Squashfs has decompressed into an + intermediate buffer and then memcopied it into the page cache. + Squashfs now supports the ability to decompress directly into + the page cache. + + If unsure, select "Decompress file data into an intermediate buffer" + +config SQUASHFS_FILE_CACHE + bool "Decompress file data into an intermediate buffer" + help + Decompress file data into an intermediate buffer and then + memcopy it into the page cache. + +config SQUASHFS_FILE_DIRECT + bool "Decompress files directly into the page cache" + help + Directly decompress file data into the page cache. + Doing so can significantly improve performance because + it eliminates a memcpy and it also removes the lock contention + on the single buffer. + +endchoice + +choice + prompt "Decompressor parallelisation options" + depends on SQUASHFS + help + Squashfs now supports three parallelisation options for + decompression. Each one exhibits various trade-offs between + decompression performance and CPU and memory usage. + + If in doubt, select "Single threaded compression" + +config SQUASHFS_DECOMP_SINGLE + bool "Single threaded compression" + help + Traditionally Squashfs has used single-threaded decompression. + Only one block (data or metadata) can be decompressed at any + one time. This limits CPU and memory usage to a minimum. + +config SQUASHFS_DECOMP_MULTI + bool "Use multiple decompressors for parallel I/O" + help + By default Squashfs uses a single decompressor but it gives + poor performance on parallel I/O workloads when using multiple CPU + machines due to waiting on decompressor availability. + + If you have a parallel I/O workload and your system has enough memory, + using this option may improve overall I/O performance. + + This decompressor implementation uses up to two parallel + decompressors per core. It dynamically allocates decompressors + on a demand basis. + +config SQUASHFS_DECOMP_MULTI_PERCPU + bool "Use percpu multiple decompressors for parallel I/O" + help + By default Squashfs uses a single decompressor but it gives + poor performance on parallel I/O workloads when using multiple CPU + machines due to waiting on decompressor availability. + + This decompressor implementation uses a maximum of one + decompressor per core. It uses percpu variables to ensure + decompression is load-balanced across the cores. + +endchoice + +config SQUASHFS_XATTR + bool "Squashfs XATTR support" + depends on SQUASHFS + help + Saying Y here includes support for extended attributes (xattrs). + Xattrs are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page). + + If unsure, say N. + +config SQUASHFS_ZLIB + bool "Include support for ZLIB compressed file systems" + depends on SQUASHFS + select ZLIB_INFLATE + default y + help + ZLIB compression is the standard compression used by Squashfs + file systems. It offers a good trade-off between compression + achieved and the amount of CPU time and memory necessary to + compress and decompress. + + If unsure, say Y. + +config SQUASHFS_LZ4 + bool "Include support for LZ4 compressed file systems" + depends on SQUASHFS + select LZ4_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with LZ4 compression. LZ4 compression is mainly + aimed at embedded systems with slower CPUs where the overheads + of zlib are too high. + + LZ4 is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config SQUASHFS_LZO + bool "Include support for LZO compressed file systems" + depends on SQUASHFS + select LZO_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with LZO compression. LZO compression is mainly + aimed at embedded systems with slower CPUs where the overheads + of zlib are too high. + + LZO is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config SQUASHFS_XZ + bool "Include support for XZ compressed file systems" + depends on SQUASHFS + select XZ_DEC + help + Saying Y here includes support for reading Squashfs file systems + compressed with XZ compression. XZ gives better compression than + the default zlib compression, at the expense of greater CPU and + memory overhead. + + XZ is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config SQUASHFS_4K_DEVBLK_SIZE + bool "Use 4K device block size?" + depends on SQUASHFS + help + By default Squashfs sets the dev block size (sb_min_blocksize) + to 1K or the smallest block size supported by the block device + (if larger). This, because blocks are packed together and + unaligned in Squashfs, should reduce latency. + + This, however, gives poor performance on MTD NAND devices where + the optimal I/O size is 4K (even though the devices can support + smaller block sizes). + + Using a 4K device block size may also improve overall I/O + performance for some file access patterns (e.g. sequential + accesses of files in filesystem order) on all media. + + Setting this option will force Squashfs to use a 4K device block + size by default. + + If unsure, say N. + +config SQUASHFS_EMBEDDED + bool "Additional option for memory-constrained systems" + depends on SQUASHFS + help + Saying Y here allows you to specify cache size. + + If unsure, say N. + +config SQUASHFS_FRAGMENT_CACHE_SIZE + int "Number of fragments cached" if SQUASHFS_EMBEDDED + depends on SQUASHFS + default "3" + help + By default SquashFS caches the last 3 fragments read from + the filesystem. Increasing this amount may mean SquashFS + has to re-read fragments less often from disk, at the expense + of extra system memory. Decreasing this amount will mean + SquashFS uses less memory at the expense of extra reads from disk. + + Note there must be at least one cached fragment. Anything + much more than three will probably not make much difference. diff --git a/kernel/fs/squashfs/Makefile b/kernel/fs/squashfs/Makefile new file mode 100644 index 000000000..246a6f329 --- /dev/null +++ b/kernel/fs/squashfs/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the linux squashfs routines. +# + +obj-$(CONFIG_SQUASHFS) += squashfs.o +squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o +squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o +squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o +squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o +squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o +squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o +squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o diff --git a/kernel/fs/squashfs/block.c b/kernel/fs/squashfs/block.c new file mode 100644 index 000000000..0cea9b923 --- /dev/null +++ b/kernel/fs/squashfs/block.c @@ -0,0 +1,214 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * block.c + */ + +/* + * This file implements the low-level routines to read and decompress + * datablocks and metadata blocks. + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +/* + * Read the metadata block length, this is stored in the first two + * bytes of the metadata block. + */ +static struct buffer_head *get_block_length(struct super_block *sb, + u64 *cur_index, int *offset, int *length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head *bh; + + bh = sb_bread(sb, *cur_index); + if (bh == NULL) + return NULL; + + if (msblk->devblksize - *offset == 1) { + *length = (unsigned char) bh->b_data[*offset]; + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *length |= (unsigned char) bh->b_data[0] << 8; + *offset = 1; + } else { + *length = (unsigned char) bh->b_data[*offset] | + (unsigned char) bh->b_data[*offset + 1] << 8; + *offset += 2; + + if (*offset == msblk->devblksize) { + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *offset = 0; + } + } + + return bh; +} + + +/* + * Read and decompress a metadata block or datablock. Length is non-zero + * if a datablock is being read (the size is stored elsewhere in the + * filesystem), otherwise the length is obtained from the first two bytes of + * the metadata block. A bit in the length field indicates if the block + * is stored uncompressed in the filesystem (usually because compression + * generated a larger block - this does occasionally happen with compression + * algorithms). + */ +int squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head **bh; + int offset = index & ((1 << msblk->devblksize_log2) - 1); + u64 cur_index = index >> msblk->devblksize_log2; + int bytes, compressed, b = 0, k = 0, avail, i; + + bh = kcalloc(((output->length + msblk->devblksize - 1) + >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); + if (bh == NULL) + return -ENOMEM; + + if (length) { + /* + * Datablock. + */ + bytes = -offset; + compressed = SQUASHFS_COMPRESSED_BLOCK(length); + length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + if (next_index) + *next_index = index + length; + + TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", + index, compressed ? "" : "un", length, output->length); + + if (length < 0 || length > output->length || + (index + length) > msblk->bytes_used) + goto read_failure; + + for (b = 0; bytes < length; b++, cur_index++) { + bh[b] = sb_getblk(sb, cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b, bh); + } else { + /* + * Metadata block. + */ + if ((index + 2) > msblk->bytes_used) + goto read_failure; + + bh[0] = get_block_length(sb, &cur_index, &offset, &length); + if (bh[0] == NULL) + goto read_failure; + b = 1; + + bytes = msblk->devblksize - offset; + compressed = SQUASHFS_COMPRESSED(length); + length = SQUASHFS_COMPRESSED_SIZE(length); + if (next_index) + *next_index = index + length + 2; + + TRACE("Block @ 0x%llx, %scompressed size %d\n", index, + compressed ? "" : "un", length); + + if (length < 0 || length > output->length || + (index + length) > msblk->bytes_used) + goto block_release; + + for (; bytes < length; b++) { + bh[b] = sb_getblk(sb, ++cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b - 1, bh + 1); + } + + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } + + if (compressed) { + length = squashfs_decompress(msblk, bh, b, offset, length, + output); + if (length < 0) + goto read_failure; + } else { + /* + * Block is uncompressed. + */ + int in, pg_offset = 0; + void *data = squashfs_first_page(output); + + for (bytes = length; k < b; k++) { + in = min(bytes, msblk->devblksize - offset); + bytes -= in; + while (in) { + if (pg_offset == PAGE_CACHE_SIZE) { + data = squashfs_next_page(output); + pg_offset = 0; + } + avail = min_t(int, in, PAGE_CACHE_SIZE - + pg_offset); + memcpy(data + pg_offset, bh[k]->b_data + offset, + avail); + in -= avail; + pg_offset += avail; + offset += avail; + } + offset = 0; + put_bh(bh[k]); + } + squashfs_finish_page(output); + } + + kfree(bh); + return length; + +block_release: + for (; k < b; k++) + put_bh(bh[k]); + +read_failure: + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long) index); + kfree(bh); + return -EIO; +} diff --git a/kernel/fs/squashfs/cache.c b/kernel/fs/squashfs/cache.c new file mode 100644 index 000000000..1cb70a0b2 --- /dev/null +++ b/kernel/fs/squashfs/cache.c @@ -0,0 +1,458 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * cache.c + */ + +/* + * Blocks in Squashfs are compressed. To avoid repeatedly decompressing + * recently accessed data Squashfs uses two small metadata and fragment caches. + * + * This file implements a generic cache implementation used for both caches, + * plus functions layered ontop of the generic cache implementation to + * access the metadata and fragment caches. + * + * To avoid out of memory and fragmentation issues with vmalloc the cache + * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. + * + * It should be noted that the cache is not used for file datablocks, these + * are decompressed and cached in the page-cache in the normal way. The + * cache is only used to temporarily cache fragment and metadata blocks + * which have been read as as a result of a metadata (i.e. inode or + * directory) or fragment access. Because metadata and fragments are packed + * together into blocks (to gain greater compression) the read of a particular + * piece of metadata or fragment will retrieve other metadata/fragments which + * have been packed with it, these because of locality-of-reference may be read + * in the near future. Temporarily caching them ensures they are available for + * near future access without requiring an additional read and decompress. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "page_actor.h" + +/* + * Look-up block in cache, and increment usage count. If not in cache, read + * and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, + struct squashfs_cache *cache, u64 block, int length) +{ + int i, n; + struct squashfs_cache_entry *entry; + + spin_lock(&cache->lock); + + while (1) { + for (i = cache->curr_blk, n = 0; n < cache->entries; n++) { + if (cache->entry[i].block == block) { + cache->curr_blk = i; + break; + } + i = (i + 1) % cache->entries; + } + + if (n == cache->entries) { + /* + * Block not in cache, if all cache entries are used + * go to sleep waiting for one to become available. + */ + if (cache->unused == 0) { + cache->num_waiters++; + spin_unlock(&cache->lock); + wait_event(cache->wait_queue, cache->unused); + spin_lock(&cache->lock); + cache->num_waiters--; + continue; + } + + /* + * At least one unused cache entry. A simple + * round-robin strategy is used to choose the entry to + * be evicted from the cache. + */ + i = cache->next_blk; + for (n = 0; n < cache->entries; n++) { + if (cache->entry[i].refcount == 0) + break; + i = (i + 1) % cache->entries; + } + + cache->next_blk = (i + 1) % cache->entries; + entry = &cache->entry[i]; + + /* + * Initialise chosen cache entry, and fill it in from + * disk. + */ + cache->unused--; + entry->block = block; + entry->refcount = 1; + entry->pending = 1; + entry->num_waiters = 0; + entry->error = 0; + spin_unlock(&cache->lock); + + entry->length = squashfs_read_data(sb, block, length, + &entry->next_index, entry->actor); + + spin_lock(&cache->lock); + + if (entry->length < 0) + entry->error = entry->length; + + entry->pending = 0; + + /* + * While filling this entry one or more other processes + * have looked it up in the cache, and have slept + * waiting for it to become available. + */ + if (entry->num_waiters) { + spin_unlock(&cache->lock); + wake_up_all(&entry->wait_queue); + } else + spin_unlock(&cache->lock); + + goto out; + } + + /* + * Block already in cache. Increment refcount so it doesn't + * get reused until we're finished with it, if it was + * previously unused there's one less cache entry available + * for reuse. + */ + entry = &cache->entry[i]; + if (entry->refcount == 0) + cache->unused--; + entry->refcount++; + + /* + * If the entry is currently being filled in by another process + * go to sleep waiting for it to become available. + */ + if (entry->pending) { + entry->num_waiters++; + spin_unlock(&cache->lock); + wait_event(entry->wait_queue, !entry->pending); + } else + spin_unlock(&cache->lock); + + goto out; + } + +out: + TRACE("Got %s %d, start block %lld, refcount %d, error %d\n", + cache->name, i, entry->block, entry->refcount, entry->error); + + if (entry->error) + ERROR("Unable to read %s cache entry [%llx]\n", cache->name, + block); + return entry; +} + + +/* + * Release cache entry, once usage count is zero it can be reused. + */ +void squashfs_cache_put(struct squashfs_cache_entry *entry) +{ + struct squashfs_cache *cache = entry->cache; + + spin_lock(&cache->lock); + entry->refcount--; + if (entry->refcount == 0) { + cache->unused++; + /* + * If there's any processes waiting for a block to become + * available, wake one up. + */ + if (cache->num_waiters) { + spin_unlock(&cache->lock); + wake_up(&cache->wait_queue); + return; + } + } + spin_unlock(&cache->lock); +} + +/* + * Delete cache reclaiming all kmalloced buffers. + */ +void squashfs_cache_delete(struct squashfs_cache *cache) +{ + int i, j; + + if (cache == NULL) + return; + + for (i = 0; i < cache->entries; i++) { + if (cache->entry[i].data) { + for (j = 0; j < cache->pages; j++) + kfree(cache->entry[i].data[j]); + kfree(cache->entry[i].data); + } + kfree(cache->entry[i].actor); + } + + kfree(cache->entry); + kfree(cache); +} + + +/* + * Initialise cache allocating the specified number of entries, each of + * size block_size. To avoid vmalloc fragmentation issues each entry + * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers. + */ +struct squashfs_cache *squashfs_cache_init(char *name, int entries, + int block_size) +{ + int i, j; + struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); + + if (cache == NULL) { + ERROR("Failed to allocate %s cache\n", name); + return NULL; + } + + cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL); + if (cache->entry == NULL) { + ERROR("Failed to allocate %s cache\n", name); + goto cleanup; + } + + cache->curr_blk = 0; + cache->next_blk = 0; + cache->unused = entries; + cache->entries = entries; + cache->block_size = block_size; + cache->pages = block_size >> PAGE_CACHE_SHIFT; + cache->pages = cache->pages ? cache->pages : 1; + cache->name = name; + cache->num_waiters = 0; + spin_lock_init(&cache->lock); + init_waitqueue_head(&cache->wait_queue); + + for (i = 0; i < entries; i++) { + struct squashfs_cache_entry *entry = &cache->entry[i]; + + init_waitqueue_head(&cache->entry[i].wait_queue); + entry->cache = cache; + entry->block = SQUASHFS_INVALID_BLK; + entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); + if (entry->data == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } + + for (j = 0; j < cache->pages; j++) { + entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (entry->data[j] == NULL) { + ERROR("Failed to allocate %s buffer\n", name); + goto cleanup; + } + } + + entry->actor = squashfs_page_actor_init(entry->data, + cache->pages, 0); + if (entry->actor == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } + } + + return cache; + +cleanup: + squashfs_cache_delete(cache); + return NULL; +} + + +/* + * Copy up to length bytes from cache entry to buffer starting at offset bytes + * into the cache entry. If there's not length bytes then copy the number of + * bytes available. In all cases return the number of bytes copied. + */ +int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, + int offset, int length) +{ + int remaining = length; + + if (length == 0) + return 0; + else if (buffer == NULL) + return min(length, entry->length - offset); + + while (offset < entry->length) { + void *buff = entry->data[offset / PAGE_CACHE_SIZE] + + (offset % PAGE_CACHE_SIZE); + int bytes = min_t(int, entry->length - offset, + PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); + + if (bytes >= remaining) { + memcpy(buffer, buff, remaining); + remaining = 0; + break; + } + + memcpy(buffer, buff, bytes); + buffer += bytes; + remaining -= bytes; + offset += bytes; + } + + return length - remaining; +} + + +/* + * Read length bytes from metadata position (block is the + * start of the compressed block on disk, and offset is the offset into + * the block once decompressed). Data is packed into consecutive blocks, + * and length bytes may require reading more than one block. + */ +int squashfs_read_metadata(struct super_block *sb, void *buffer, + u64 *block, int *offset, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int bytes, res = length; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); + + while (length) { + entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); + if (entry->error) { + res = entry->error; + goto error; + } else if (*offset >= entry->length) { + res = -EIO; + goto error; + } + + bytes = squashfs_copy_data(buffer, entry, *offset, length); + if (buffer) + buffer += bytes; + length -= bytes; + *offset += bytes; + + if (*offset == entry->length) { + *block = entry->next_index; + *offset = 0; + } + + squashfs_cache_put(entry); + } + + return res; + +error: + squashfs_cache_put(entry); + return res; +} + + +/* + * Look-up in the fragmment cache the fragment located at in the + * filesystem. If necessary read and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->fragment_cache, start_block, + length); +} + + +/* + * Read and decompress the datablock located at in the + * filesystem. The cache is used here to avoid duplicating locking and + * read/decompress code. + */ +struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->read_page, start_block, length); +} + + +/* + * Read a filesystem table (uncompressed sequence of bytes) from disk + */ +void *squashfs_read_table(struct super_block *sb, u64 block, int length) +{ + int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int i, res; + void *table, *buffer, **data; + struct squashfs_page_actor *actor; + + table = buffer = kmalloc(length, GFP_KERNEL); + if (table == NULL) + return ERR_PTR(-ENOMEM); + + data = kcalloc(pages, sizeof(void *), GFP_KERNEL); + if (data == NULL) { + res = -ENOMEM; + goto failed; + } + + actor = squashfs_page_actor_init(data, pages, length); + if (actor == NULL) { + res = -ENOMEM; + goto failed2; + } + + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) + data[i] = buffer; + + res = squashfs_read_data(sb, block, length | + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); + + kfree(data); + kfree(actor); + + if (res < 0) + goto failed; + + return table; + +failed2: + kfree(data); +failed: + kfree(table); + return ERR_PTR(res); +} diff --git a/kernel/fs/squashfs/decompressor.c b/kernel/fs/squashfs/decompressor.c new file mode 100644 index 000000000..e9034bf6e --- /dev/null +++ b/kernel/fs/squashfs/decompressor.c @@ -0,0 +1,148 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * decompressor.c + */ + +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" +#include "page_actor.h" + +/* + * This file (and decompressor.h) implements a decompressor framework for + * Squashfs, allowing multiple decompressors to be easily supported + */ + +static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { + NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 +}; + +#ifndef CONFIG_SQUASHFS_LZ4 +static const struct squashfs_decompressor squashfs_lz4_comp_ops = { + NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0 +}; +#endif + +#ifndef CONFIG_SQUASHFS_LZO +static const struct squashfs_decompressor squashfs_lzo_comp_ops = { + NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 +}; +#endif + +#ifndef CONFIG_SQUASHFS_XZ +static const struct squashfs_decompressor squashfs_xz_comp_ops = { + NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 +}; +#endif + +#ifndef CONFIG_SQUASHFS_ZLIB +static const struct squashfs_decompressor squashfs_zlib_comp_ops = { + NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 +}; +#endif + +static const struct squashfs_decompressor squashfs_unknown_comp_ops = { + NULL, NULL, NULL, NULL, 0, "unknown", 0 +}; + +static const struct squashfs_decompressor *decompressor[] = { + &squashfs_zlib_comp_ops, + &squashfs_lz4_comp_ops, + &squashfs_lzo_comp_ops, + &squashfs_xz_comp_ops, + &squashfs_lzma_unsupported_comp_ops, + &squashfs_unknown_comp_ops +}; + + +const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) +{ + int i; + + for (i = 0; decompressor[i]->id; i++) + if (id == decompressor[i]->id) + break; + + return decompressor[i]; +} + + +static void *get_comp_opts(struct super_block *sb, unsigned short flags) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + void *buffer = NULL, *comp_opts; + struct squashfs_page_actor *actor = NULL; + int length = 0; + + /* + * Read decompressor specific options from file system if present + */ + if (SQUASHFS_COMP_OPTS(flags)) { + buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (buffer == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } + + actor = squashfs_page_actor_init(&buffer, 1, 0); + if (actor == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } + + length = squashfs_read_data(sb, + sizeof(struct squashfs_super_block), 0, NULL, actor); + + if (length < 0) { + comp_opts = ERR_PTR(length); + goto out; + } + } + + comp_opts = squashfs_comp_opts(msblk, buffer, length); + +out: + kfree(actor); + kfree(buffer); + return comp_opts; +} + + +void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + void *stream, *comp_opts = get_comp_opts(sb, flags); + + if (IS_ERR(comp_opts)) + return comp_opts; + + stream = squashfs_decompressor_create(msblk, comp_opts); + if (IS_ERR(stream)) + kfree(comp_opts); + + return stream; +} diff --git a/kernel/fs/squashfs/decompressor.h b/kernel/fs/squashfs/decompressor.h new file mode 100644 index 000000000..a25713c03 --- /dev/null +++ b/kernel/fs/squashfs/decompressor.h @@ -0,0 +1,61 @@ +#ifndef DECOMPRESSOR_H +#define DECOMPRESSOR_H +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * decompressor.h + */ + +struct squashfs_decompressor { + void *(*init)(struct squashfs_sb_info *, void *); + void *(*comp_opts)(struct squashfs_sb_info *, void *, int); + void (*free)(void *); + int (*decompress)(struct squashfs_sb_info *, void *, + struct buffer_head **, int, int, int, + struct squashfs_page_actor *); + int id; + char *name; + int supported; +}; + +static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int length) +{ + return msblk->decompressor->comp_opts ? + msblk->decompressor->comp_opts(msblk, buff, length) : NULL; +} + +#ifdef CONFIG_SQUASHFS_XZ +extern const struct squashfs_decompressor squashfs_xz_comp_ops; +#endif + +#ifdef CONFIG_SQUASHFS_LZ4 +extern const struct squashfs_decompressor squashfs_lz4_comp_ops; +#endif + +#ifdef CONFIG_SQUASHFS_LZO +extern const struct squashfs_decompressor squashfs_lzo_comp_ops; +#endif + +#ifdef CONFIG_SQUASHFS_ZLIB +extern const struct squashfs_decompressor squashfs_zlib_comp_ops; +#endif + +#endif diff --git a/kernel/fs/squashfs/decompressor_multi.c b/kernel/fs/squashfs/decompressor_multi.c new file mode 100644 index 000000000..d6008a636 --- /dev/null +++ b/kernel/fs/squashfs/decompressor_multi.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2013 + * Minchan Kim + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements multi-threaded decompression in the + * decompressor framework + */ + + +/* + * The reason that multiply two is that a CPU can request new I/O + * while it is waiting previous request. + */ +#define MAX_DECOMPRESSOR (num_online_cpus() * 2) + + +int squashfs_max_decompressors(void) +{ + return MAX_DECOMPRESSOR; +} + + +struct squashfs_stream { + void *comp_opts; + struct list_head strm_list; + struct mutex mutex; + int avail_decomp; + wait_queue_head_t wait; +}; + + +struct decomp_stream { + void *stream; + struct list_head list; +}; + + +static void put_decomp_stream(struct decomp_stream *decomp_strm, + struct squashfs_stream *stream) +{ + mutex_lock(&stream->mutex); + list_add(&decomp_strm->list, &stream->strm_list); + mutex_unlock(&stream->mutex); + wake_up(&stream->wait); +} + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + struct decomp_stream *decomp_strm = NULL; + int err = -ENOMEM; + + stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (!stream) + goto out; + + stream->comp_opts = comp_opts; + mutex_init(&stream->mutex); + INIT_LIST_HEAD(&stream->strm_list); + init_waitqueue_head(&stream->wait); + + /* + * We should have a decompressor at least as default + * so if we fail to allocate new decompressor dynamically, + * we could always fall back to default decompressor and + * file system works. + */ + decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL); + if (!decomp_strm) + goto out; + + decomp_strm->stream = msblk->decompressor->init(msblk, + stream->comp_opts); + if (IS_ERR(decomp_strm->stream)) { + err = PTR_ERR(decomp_strm->stream); + goto out; + } + + list_add(&decomp_strm->list, &stream->strm_list); + stream->avail_decomp = 1; + return stream; + +out: + kfree(decomp_strm); + kfree(stream); + return ERR_PTR(err); +} + + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream *stream = msblk->stream; + if (stream) { + struct decomp_stream *decomp_strm; + + while (!list_empty(&stream->strm_list)) { + decomp_strm = list_entry(stream->strm_list.prev, + struct decomp_stream, list); + list_del(&decomp_strm->list); + msblk->decompressor->free(decomp_strm->stream); + kfree(decomp_strm); + stream->avail_decomp--; + } + WARN_ON(stream->avail_decomp); + kfree(stream->comp_opts); + kfree(stream); + } +} + + +static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, + struct squashfs_stream *stream) +{ + struct decomp_stream *decomp_strm; + + while (1) { + mutex_lock(&stream->mutex); + + /* There is available decomp_stream */ + if (!list_empty(&stream->strm_list)) { + decomp_strm = list_entry(stream->strm_list.prev, + struct decomp_stream, list); + list_del(&decomp_strm->list); + mutex_unlock(&stream->mutex); + break; + } + + /* + * If there is no available decomp and already full, + * let's wait for releasing decomp from other users. + */ + if (stream->avail_decomp >= MAX_DECOMPRESSOR) + goto wait; + + /* Let's allocate new decomp */ + decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL); + if (!decomp_strm) + goto wait; + + decomp_strm->stream = msblk->decompressor->init(msblk, + stream->comp_opts); + if (IS_ERR(decomp_strm->stream)) { + kfree(decomp_strm); + goto wait; + } + + stream->avail_decomp++; + WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR); + + mutex_unlock(&stream->mutex); + break; +wait: + /* + * If system memory is tough, let's for other's + * releasing instead of hurting VM because it could + * make page cache thrashing. + */ + mutex_unlock(&stream->mutex); + wait_event(stream->wait, + !list_empty(&stream->strm_list)); + } + + return decomp_strm; +} + + +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) +{ + int res; + struct squashfs_stream *stream = msblk->stream; + struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); + res = msblk->decompressor->decompress(msblk, decomp_stream->stream, + bh, b, offset, length, output); + put_decomp_stream(decomp_stream, stream); + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + return res; +} diff --git a/kernel/fs/squashfs/decompressor_multi_percpu.c b/kernel/fs/squashfs/decompressor_multi_percpu.c new file mode 100644 index 000000000..23a9c28ad --- /dev/null +++ b/kernel/fs/squashfs/decompressor_multi_percpu.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements multi-threaded decompression using percpu + * variables, one thread per cpu core. + */ + +struct squashfs_stream { + void *stream; +}; + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu; + int err, cpu; + + percpu = alloc_percpu(struct squashfs_stream); + if (percpu == NULL) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + stream->stream = msblk->decompressor->init(msblk, comp_opts); + if (IS_ERR(stream->stream)) { + err = PTR_ERR(stream->stream); + goto out; + } + } + + kfree(comp_opts); + return (__force void *) percpu; + +out: + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + if (!IS_ERR_OR_NULL(stream->stream)) + msblk->decompressor->free(stream->stream); + } + free_percpu(percpu); + return ERR_PTR(err); +} + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; + struct squashfs_stream *stream; + int cpu; + + if (msblk->stream) { + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + msblk->decompressor->free(stream->stream); + } + free_percpu(percpu); + } +} + +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) +{ + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; + struct squashfs_stream *stream = get_cpu_ptr(percpu); + int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); + put_cpu_ptr(stream); + + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + + return res; +} + +int squashfs_max_decompressors(void) +{ + return num_possible_cpus(); +} diff --git a/kernel/fs/squashfs/decompressor_single.c b/kernel/fs/squashfs/decompressor_single.c new file mode 100644 index 000000000..a6c75929a --- /dev/null +++ b/kernel/fs/squashfs/decompressor_single.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements single-threaded decompression in the + * decompressor framework + */ + +struct squashfs_stream { + void *stream; + struct mutex mutex; +}; + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + int err = -ENOMEM; + + stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto out; + + stream->stream = msblk->decompressor->init(msblk, comp_opts); + if (IS_ERR(stream->stream)) { + err = PTR_ERR(stream->stream); + goto out; + } + + kfree(comp_opts); + mutex_init(&stream->mutex); + return stream; + +out: + kfree(stream); + return ERR_PTR(err); +} + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream *stream = msblk->stream; + + if (stream) { + msblk->decompressor->free(stream->stream); + kfree(stream); + } +} + +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) +{ + int res; + struct squashfs_stream *stream = msblk->stream; + + mutex_lock(&stream->mutex); + res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); + mutex_unlock(&stream->mutex); + + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + + return res; +} + +int squashfs_max_decompressors(void) +{ + return 1; +} diff --git a/kernel/fs/squashfs/dir.c b/kernel/fs/squashfs/dir.c new file mode 100644 index 000000000..d8c2d747b --- /dev/null +++ b/kernel/fs/squashfs/dir.c @@ -0,0 +1,236 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * dir.c + */ + +/* + * This file implements code to read directories from disk. + * + * See namei.c for a description of directory organisation on disk. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const unsigned char squashfs_filetype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK +}; + +/* + * Lookup offset (f_pos) in the directory index, returning the + * metadata block containing it. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_offset(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, int index_offset, + int i_count, u64 f_pos) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int err, i, index, length = 0; + unsigned int size; + struct squashfs_dir_index dir_index; + + TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n", + i_count, f_pos); + + /* + * Translate from external f_pos to the internal f_pos. This + * is offset by 3 because we invent "." and ".." entries which are + * not actually stored in the directory. + */ + if (f_pos <= 3) + return f_pos; + f_pos -= 3; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, &dir_index, &index_start, + &index_offset, sizeof(dir_index)); + if (err < 0) + break; + + index = le32_to_cpu(dir_index.index); + if (index > f_pos) + /* + * Found the index we're looking for. + */ + break; + + size = le32_to_cpu(dir_index.size) + 1; + + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + break; + + err = squashfs_read_metadata(sb, NULL, &index_start, + &index_offset, size); + if (err < 0) + break; + + length = index; + *next_block = le32_to_cpu(dir_index.start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + + /* + * Translate back from internal f_pos to external f_pos. + */ + return length + 3; +} + + +static int squashfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + u64 block = squashfs_i(inode)->start + msblk->directory_table; + int offset = squashfs_i(inode)->offset, length, err; + unsigned int inode_number, dir_count, size, type; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + + TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + goto finish; + } + + /* + * Return "." and ".." entries as the first two filenames in the + * directory. To maximise compression these two entries are not + * stored in the directory, and so we invent them here. + * + * It also means that the external f_pos is offset by 3 from the + * on-disk directory f_pos. + */ + while (ctx->pos < 3) { + char *name; + int i_ino; + + if (ctx->pos == 0) { + name = "."; + size = 1; + i_ino = inode->i_ino; + } else { + name = ".."; + size = 2; + i_ino = squashfs_i(inode)->parent; + } + + if (!dir_emit(ctx, name, size, i_ino, + squashfs_filetype_table[1])) + goto finish; + + ctx->pos += size; + } + + length = get_dir_index_using_offset(inode->i_sb, &block, &offset, + squashfs_i(inode)->dir_idx_start, + squashfs_i(inode)->dir_idx_offset, + squashfs_i(inode)->dir_idx_cnt, + ctx->pos); + + while (length < i_size_read(inode)) { + /* + * Read directory header + */ + err = squashfs_read_metadata(inode->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto failed_read; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + + if (dir_count > SQUASHFS_DIR_COUNT) + goto failed_read; + + while (dir_count--) { + /* + * Read directory entry. + */ + err = squashfs_read_metadata(inode->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto failed_read; + + size = le16_to_cpu(dire->size) + 1; + + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + goto failed_read; + + err = squashfs_read_metadata(inode->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto failed_read; + + length += sizeof(*dire) + size; + + if (ctx->pos >= length) + continue; + + dire->name[size] = '\0'; + inode_number = le32_to_cpu(dirh.inode_number) + + ((short) le16_to_cpu(dire->inode_number)); + type = le16_to_cpu(dire->type); + + if (type > SQUASHFS_MAX_DIR_TYPE) + goto failed_read; + + if (!dir_emit(ctx, dire->name, size, + inode_number, + squashfs_filetype_table[type])) + goto finish; + + ctx->pos = length; + } + } + +finish: + kfree(dire); + return 0; + +failed_read: + ERROR("Unable to read directory block [%llx:%x]\n", block, offset); + kfree(dire); + return 0; +} + + +const struct file_operations squashfs_dir_ops = { + .read = generic_read_dir, + .iterate = squashfs_readdir, + .llseek = default_llseek, +}; diff --git a/kernel/fs/squashfs/export.c b/kernel/fs/squashfs/export.c new file mode 100644 index 000000000..8073b6532 --- /dev/null +++ b/kernel/fs/squashfs/export.c @@ -0,0 +1,163 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * export.c + */ + +/* + * This file implements code to make Squashfs filesystems exportable (NFS etc.) + * + * The export code uses an inode lookup table to map inode numbers passed in + * filehandles to an inode location on disk. This table is stored compressed + * into metadata blocks. A second index table is used to locate these. This + * second index table for speed of access (and because it is small) is read at + * mount time and cached in memory. + * + * The inode lookup table is used only by the export code, inode disk + * locations are directly encoded in directories, enabling direct access + * without an intermediate lookup for all operations except the export ops. + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up inode number (ino) in table, returning the inode location. + */ +static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); + int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); + u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]); + __le64 ino; + int err; + + TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); + + err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); + if (err < 0) + return err; + + TRACE("squashfs_inode_lookup, inode = 0x%llx\n", + (u64) le64_to_cpu(ino)); + + return le64_to_cpu(ino); +} + + +static struct dentry *squashfs_export_iget(struct super_block *sb, + unsigned int ino_num) +{ + long long ino; + struct dentry *dentry = ERR_PTR(-ENOENT); + + TRACE("Entered squashfs_export_iget\n"); + + ino = squashfs_inode_lookup(sb, ino_num); + if (ino >= 0) + dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); + + return dentry; +} + + +static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) + || fh_len < 2) + return NULL; + + return squashfs_export_iget(sb, fid->i32.ino); +} + + +static struct dentry *squashfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) + return NULL; + + return squashfs_export_iget(sb, fid->i32.parent_ino); +} + + +static struct dentry *squashfs_get_parent(struct dentry *child) +{ + struct inode *inode = d_inode(child); + unsigned int parent_ino = squashfs_i(inode)->parent; + + return squashfs_export_iget(inode->i_sb, parent_ino); +} + + +/* + * Read uncompressed inode lookup table indexes off disk into memory + */ +__le64 *squashfs_read_inode_lookup_table(struct super_block *sb, + u64 lookup_table_start, u64 next_table, unsigned int inodes) +{ + unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); + __le64 *table; + + TRACE("In read_inode_lookup_table, length %d\n", length); + + /* Sanity check values */ + + /* there should always be at least one inode */ + if (inodes == 0) + return ERR_PTR(-EINVAL); + + /* length bytes should not extend into the next table - this check + * also traps instances where lookup_table_start is incorrectly larger + * than the next table start + */ + if (lookup_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, lookup_table_start, length); + + /* + * table[0] points to the first inode lookup table metadata block, + * this should be less than lookup_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} + + +const struct export_operations squashfs_export_ops = { + .fh_to_dentry = squashfs_fh_to_dentry, + .fh_to_parent = squashfs_fh_to_parent, + .get_parent = squashfs_get_parent +}; diff --git a/kernel/fs/squashfs/file.c b/kernel/fs/squashfs/file.c new file mode 100644 index 000000000..e5c968906 --- /dev/null +++ b/kernel/fs/squashfs/file.c @@ -0,0 +1,503 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * file.c + */ + +/* + * This file contains code for handling regular files. A regular file + * consists of a sequence of contiguous compressed blocks, and/or a + * compressed fragment block (tail-end packed block). The compressed size + * of each datablock is stored in a block list contained within the + * file inode (itself stored in one or more compressed metadata blocks). + * + * To speed up access to datablocks when reading 'large' files (256 Mbytes or + * larger), the code implements an index cache that caches the mapping from + * block index to datablock location on disk. + * + * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while + * retaining a simple and space-efficient block list on disk. The cache + * is split into slots, caching up to eight 224 GiB files (128 KiB blocks). + * Larger files use multiple slots, with 1.75 TiB files using all 8 slots. + * The index cache is designed to be memory efficient, and by default uses + * 16 KiB. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Locate cache slot in range [offset, index] for specified inode. If + * there's more than one return the slot closest to index. + */ +static struct meta_index *locate_meta_index(struct inode *inode, int offset, + int index) +{ + struct meta_index *meta = NULL; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("locate_meta_index: index %d, offset %d\n", index, offset); + + if (msblk->meta_index == NULL) + goto not_allocated; + + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + if (msblk->meta_index[i].inode_number == inode->i_ino && + msblk->meta_index[i].offset >= offset && + msblk->meta_index[i].offset <= index && + msblk->meta_index[i].locked == 0) { + TRACE("locate_meta_index: entry %d, offset %d\n", i, + msblk->meta_index[i].offset); + meta = &msblk->meta_index[i]; + offset = meta->offset; + } + } + + if (meta) + meta->locked = 1; + +not_allocated: + mutex_unlock(&msblk->meta_index_mutex); + + return meta; +} + + +/* + * Find and initialise an empty cache slot for index offset. + */ +static struct meta_index *empty_meta_index(struct inode *inode, int offset, + int skip) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct meta_index *meta = NULL; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip); + + if (msblk->meta_index == NULL) { + /* + * First time cache index has been used, allocate and + * initialise. The cache index could be allocated at + * mount time but doing it here means it is allocated only + * if a 'large' file is read. + */ + msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS, + sizeof(*(msblk->meta_index)), GFP_KERNEL); + if (msblk->meta_index == NULL) { + ERROR("Failed to allocate meta_index\n"); + goto failed; + } + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + msblk->meta_index[i].inode_number = 0; + msblk->meta_index[i].locked = 0; + } + msblk->next_meta_index = 0; + } + + for (i = SQUASHFS_META_SLOTS; i && + msblk->meta_index[msblk->next_meta_index].locked; i--) + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + if (i == 0) { + TRACE("empty_meta_index: failed!\n"); + goto failed; + } + + TRACE("empty_meta_index: returned meta entry %d, %p\n", + msblk->next_meta_index, + &msblk->meta_index[msblk->next_meta_index]); + + meta = &msblk->meta_index[msblk->next_meta_index]; + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + meta->inode_number = inode->i_ino; + meta->offset = offset; + meta->skip = skip; + meta->entries = 0; + meta->locked = 1; + +failed: + mutex_unlock(&msblk->meta_index_mutex); + return meta; +} + + +static void release_meta_index(struct inode *inode, struct meta_index *meta) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + mutex_lock(&msblk->meta_index_mutex); + meta->locked = 0; + mutex_unlock(&msblk->meta_index_mutex); +} + + +/* + * Read the next n blocks from the block list, starting from + * metadata block . + */ +static long long read_indexes(struct super_block *sb, int n, + u64 *start_block, int *offset) +{ + int err, i; + long long block = 0; + __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + + if (blist == NULL) { + ERROR("read_indexes: Failed to allocate block_list\n"); + return -ENOMEM; + } + + while (n) { + int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2); + + err = squashfs_read_metadata(sb, blist, start_block, + offset, blocks << 2); + if (err < 0) { + ERROR("read_indexes: reading block [%llx:%x]\n", + *start_block, *offset); + goto failure; + } + + for (i = 0; i < blocks; i++) { + int size = le32_to_cpu(blist[i]); + block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size); + } + n -= blocks; + } + + kfree(blist); + return block; + +failure: + kfree(blist); + return err; +} + + +/* + * Each cache index slot has SQUASHFS_META_ENTRIES, each of which + * can cache one index -> datablock/blocklist-block mapping. We wish + * to distribute these over the length of the file, entry[0] maps index x, + * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on. + * The larger the file, the greater the skip factor. The skip factor is + * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure + * the number of metadata blocks that need to be read fits into the cache. + * If the skip factor is limited in this way then the file will use multiple + * slots. + */ +static inline int calculate_skip(int blocks) +{ + int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) + * SQUASHFS_META_INDEXES); + return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); +} + + +/* + * Search and grow the index cache for the specified inode, returning the + * on-disk locations of the datablock and block list metadata block + * for index (scaled to nearest cache index). + */ +static int fill_meta_index(struct inode *inode, int index, + u64 *index_block, int *index_offset, u64 *data_block) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int skip = calculate_skip(i_size_read(inode) >> msblk->block_log); + int offset = 0; + struct meta_index *meta; + struct meta_entry *meta_entry; + u64 cur_index_block = squashfs_i(inode)->block_list_start; + int cur_offset = squashfs_i(inode)->offset; + u64 cur_data_block = squashfs_i(inode)->start; + int err, i; + + /* + * Scale index to cache index (cache slot entry) + */ + index /= SQUASHFS_META_INDEXES * skip; + + while (offset < index) { + meta = locate_meta_index(inode, offset + 1, index); + + if (meta == NULL) { + meta = empty_meta_index(inode, offset + 1, skip); + if (meta == NULL) + goto all_done; + } else { + offset = index < meta->offset + meta->entries ? index : + meta->offset + meta->entries - 1; + meta_entry = &meta->meta_entry[offset - meta->offset]; + cur_index_block = meta_entry->index_block + + msblk->inode_table; + cur_offset = meta_entry->offset; + cur_data_block = meta_entry->data_block; + TRACE("get_meta_index: offset %d, meta->offset %d, " + "meta->entries %d\n", offset, meta->offset, + meta->entries); + TRACE("get_meta_index: index_block 0x%llx, offset 0x%x" + " data_block 0x%llx\n", cur_index_block, + cur_offset, cur_data_block); + } + + /* + * If necessary grow cache slot by reading block list. Cache + * slot is extended up to index or to the end of the slot, in + * which case further slots will be used. + */ + for (i = meta->offset + meta->entries; i <= index && + i < meta->offset + SQUASHFS_META_ENTRIES; i++) { + int blocks = skip * SQUASHFS_META_INDEXES; + long long res = read_indexes(inode->i_sb, blocks, + &cur_index_block, &cur_offset); + + if (res < 0) { + if (meta->entries == 0) + /* + * Don't leave an empty slot on read + * error allocated to this inode... + */ + meta->inode_number = 0; + err = res; + goto failed; + } + + cur_data_block += res; + meta_entry = &meta->meta_entry[i - meta->offset]; + meta_entry->index_block = cur_index_block - + msblk->inode_table; + meta_entry->offset = cur_offset; + meta_entry->data_block = cur_data_block; + meta->entries++; + offset++; + } + + TRACE("get_meta_index: meta->offset %d, meta->entries %d\n", + meta->offset, meta->entries); + + release_meta_index(inode, meta); + } + +all_done: + *index_block = cur_index_block; + *index_offset = cur_offset; + *data_block = cur_data_block; + + /* + * Scale cache index (cache slot entry) to index + */ + return offset * SQUASHFS_META_INDEXES * skip; + +failed: + release_meta_index(inode, meta); + return err; +} + + +/* + * Get the on-disk location and compressed size of the datablock + * specified by index. Fill_meta_index() does most of the work. + */ +static int read_blocklist(struct inode *inode, int index, u64 *block) +{ + u64 start; + long long blks; + int offset; + __le32 size; + int res = fill_meta_index(inode, index, &start, &offset, block); + + TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" + " 0x%x, block 0x%llx\n", res, index, start, offset, + *block); + + if (res < 0) + return res; + + /* + * res contains the index of the mapping returned by fill_meta_index(), + * this will likely be less than the desired index (because the + * meta_index cache works at a higher granularity). Read any + * extra block indexes needed. + */ + if (res < index) { + blks = read_indexes(inode->i_sb, index - res, &start, &offset); + if (blks < 0) + return (int) blks; + *block += blks; + } + + /* + * Read length of block specified by index. + */ + res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, + sizeof(size)); + if (res < 0) + return res; + return le32_to_cpu(size); +} + +/* Copy data into page cache */ +void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, + int bytes, int offset) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + void *pageaddr; + int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int start_index = page->index & ~mask, end_index = start_index | mask; + + /* + * Loop copying datablock into pages. As the datablock likely covers + * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly + * grab the pages from the page cache, except for the page that we've + * been called to fill. + */ + for (i = start_index; i <= end_index && bytes > 0; i++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + struct page *push_page; + int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0; + + TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); + + push_page = (i == page->index) ? page : + grab_cache_page_nowait(page->mapping, i); + + if (!push_page) + continue; + + if (PageUptodate(push_page)) + goto skip_page; + + pageaddr = kmap_atomic(push_page); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr); + flush_dcache_page(push_page); + SetPageUptodate(push_page); +skip_page: + unlock_page(push_page); + if (i != page->index) + page_cache_release(push_page); + } +} + +/* Read datablock stored packed inside a fragment (tail-end packed block) */ +static int squashfs_readpage_fragment(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + else + squashfs_copy_cache(page, buffer, i_size_read(inode) & + (msblk->block_size - 1), + squashfs_i(inode)->fragment_offset); + + squashfs_cache_put(buffer); + return res; +} + +static int squashfs_readpage_sparse(struct page *page, int index, int file_end) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int bytes = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + squashfs_copy_cache(page, NULL, bytes, 0); + return 0; +} + +static int squashfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int file_end = i_size_read(inode) >> msblk->block_log; + int res; + void *pageaddr; + + TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", + page->index, squashfs_i(inode)->start); + + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; + + if (bsize == 0) + res = squashfs_readpage_sparse(page, index, file_end); + else + res = squashfs_readpage_block(page, block, bsize); + } else + res = squashfs_readpage_fragment(page); + + if (!res) + return 0; + +error_out: + SetPageError(page); +out: + pageaddr = kmap_atomic(page); + memset(pageaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(pageaddr); + flush_dcache_page(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + + +const struct address_space_operations squashfs_aops = { + .readpage = squashfs_readpage +}; diff --git a/kernel/fs/squashfs/file_cache.c b/kernel/fs/squashfs/file_cache.c new file mode 100644 index 000000000..f2310d2a2 --- /dev/null +++ b/kernel/fs/squashfs/file_cache.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* Read separately compressed datablock and memcopy into page cache */ +int squashfs_readpage_block(struct page *page, u64 block, int bsize) +{ + struct inode *i = page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + else + squashfs_copy_cache(page, buffer, buffer->length, 0); + + squashfs_cache_put(buffer); + return res; +} diff --git a/kernel/fs/squashfs/file_direct.c b/kernel/fs/squashfs/file_direct.c new file mode 100644 index 000000000..43e7a7edd --- /dev/null +++ b/kernel/fs/squashfs/file_direct.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "page_actor.h" + +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page); + +/* Read separately compressed datablock directly into page cache */ +int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) + +{ + struct inode *inode = target_page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + + int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int start_index = target_page->index & ~mask; + int end_index = start_index | mask; + int i, n, pages, missing_pages, bytes, res = -ENOMEM; + struct page **page; + struct squashfs_page_actor *actor; + void *pageaddr; + + if (end_index > file_end) + end_index = file_end; + + pages = end_index - start_index + 1; + + page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); + if (page == NULL) + return res; + + /* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ + actor = squashfs_page_actor_init_special(page, pages, 0); + if (actor == NULL) + goto out; + + /* Try to grab all the pages covered by the Squashfs block */ + for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { + page[i] = (n == target_page->index) ? target_page : + grab_cache_page_nowait(target_page->mapping, n); + + if (page[i] == NULL) { + missing_pages++; + continue; + } + + if (PageUptodate(page[i])) { + unlock_page(page[i]); + page_cache_release(page[i]); + page[i] = NULL; + missing_pages++; + } + } + + if (missing_pages) { + /* + * Couldn't get one or more pages, this page has either + * been VM reclaimed, but others are still in the page cache + * and uptodate, or we're racing with another thread in + * squashfs_readpage also trying to grab them. Fall back to + * using an intermediate buffer. + */ + res = squashfs_read_cache(target_page, block, bsize, pages, + page); + if (res < 0) + goto mark_errored; + + goto out; + } + + /* Decompress directly into the page cache buffers */ + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + if (res < 0) + goto mark_errored; + + /* Last page may have trailing bytes not filled */ + bytes = res % PAGE_CACHE_SIZE; + if (bytes) { + pageaddr = kmap_atomic(page[pages - 1]); + memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); + kunmap_atomic(pageaddr); + } + + /* Mark pages as uptodate, unlock and release */ + for (i = 0; i < pages; i++) { + flush_dcache_page(page[i]); + SetPageUptodate(page[i]); + unlock_page(page[i]); + if (page[i] != target_page) + page_cache_release(page[i]); + } + + kfree(actor); + kfree(page); + + return 0; + +mark_errored: + /* Decompression failed, mark pages as errored. Target_page is + * dealt with by the caller + */ + for (i = 0; i < pages; i++) { + if (page[i] == NULL || page[i] == target_page) + continue; + flush_dcache_page(page[i]); + SetPageError(page[i]); + unlock_page(page[i]); + page_cache_release(page[i]); + } + +out: + kfree(actor); + kfree(page); + return res; +} + + +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page) +{ + struct inode *i = target_page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int bytes = buffer->length, res = buffer->error, n, offset = 0; + void *pageaddr; + + if (res) { + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + goto out; + } + + for (n = 0; n < pages && bytes > 0; n++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + int avail = min_t(int, bytes, PAGE_CACHE_SIZE); + + if (page[n] == NULL) + continue; + + pageaddr = kmap_atomic(page[n]); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr); + flush_dcache_page(page[n]); + SetPageUptodate(page[n]); + unlock_page(page[n]); + if (page[n] != target_page) + page_cache_release(page[n]); + } + +out: + squashfs_cache_put(buffer); + return res; +} diff --git a/kernel/fs/squashfs/fragment.c b/kernel/fs/squashfs/fragment.c new file mode 100644 index 000000000..0ed6edbc5 --- /dev/null +++ b/kernel/fs/squashfs/fragment.c @@ -0,0 +1,99 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * fragment.c + */ + +/* + * This file implements code to handle compressed fragments (tail-end packed + * datablocks). + * + * Regular files contain a fragment index which is mapped to a fragment + * location on disk and compressed size using a fragment lookup table. + * Like everything in Squashfs this fragment lookup table is itself stored + * compressed into metadata blocks. A second index table is used to locate + * these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" + +/* + * Look-up fragment using the fragment index table. Return the on disk + * location of the fragment and its compressed size + */ +int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, + u64 *fragment_block) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_FRAGMENT_INDEX(fragment); + int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); + u64 start_block = le64_to_cpu(msblk->fragment_index[block]); + struct squashfs_fragment_entry fragment_entry; + int size; + + size = squashfs_read_metadata(sb, &fragment_entry, &start_block, + &offset, sizeof(fragment_entry)); + if (size < 0) + return size; + + *fragment_block = le64_to_cpu(fragment_entry.start_block); + size = le32_to_cpu(fragment_entry.size); + + return size; +} + + +/* + * Read the uncompressed fragment lookup table indexes off disk into memory + */ +__le64 *squashfs_read_fragment_index_table(struct super_block *sb, + u64 fragment_table_start, u64 next_table, unsigned int fragments) +{ + unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); + __le64 *table; + + /* + * Sanity check, length bytes should not extend into the next table - + * this check also traps instances where fragment_table_start is + * incorrectly larger than the next table start + */ + if (fragment_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, fragment_table_start, length); + + /* + * table[0] points to the first fragment table metadata block, this + * should be less than fragment_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} diff --git a/kernel/fs/squashfs/id.c b/kernel/fs/squashfs/id.c new file mode 100644 index 000000000..d38ea3dab --- /dev/null +++ b/kernel/fs/squashfs/id.c @@ -0,0 +1,102 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * id.c + */ + +/* + * This file implements code to handle uids and gids. + * + * For space efficiency regular files store uid and gid indexes, which are + * converted to 32-bit uids/gids using an id look up table. This table is + * stored compressed into metadata blocks. A second index table is used to + * locate these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" + +/* + * Map uid/gid index into real 32-bit uid/gid using the id look up table + */ +int squashfs_get_id(struct super_block *sb, unsigned int index, + unsigned int *id) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_ID_BLOCK(index); + int offset = SQUASHFS_ID_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->id_table[block]); + __le32 disk_id; + int err; + + err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, + sizeof(disk_id)); + if (err < 0) + return err; + + *id = le32_to_cpu(disk_id); + return 0; +} + + +/* + * Read uncompressed id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_id_index_table(struct super_block *sb, + u64 id_table_start, u64 next_table, unsigned short no_ids) +{ + unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); + __le64 *table; + + TRACE("In read_id_index_table, length %d\n", length); + + /* Sanity check values */ + + /* there should always be at least one id */ + if (no_ids == 0) + return ERR_PTR(-EINVAL); + + /* + * length bytes should not extend into the next table - this check + * also traps instances where id_table_start is incorrectly larger + * than the next table start + */ + if (id_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, id_table_start, length); + + /* + * table[0] points to the first id lookup table metadata block, this + * should be less than id_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} diff --git a/kernel/fs/squashfs/inode.c b/kernel/fs/squashfs/inode.c new file mode 100644 index 000000000..a1ce5ce60 --- /dev/null +++ b/kernel/fs/squashfs/inode.c @@ -0,0 +1,429 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * inode.c + */ + +/* + * This file implements code to create and read inodes from disk. + * + * Inodes in Squashfs are identified by a 48-bit inode which encodes the + * location of the compressed metadata block containing the inode, and the byte + * offset into that block where the inode is placed (). + * + * To maximise compression there are different inodes for each file type + * (regular file, directory, device, etc.), the inode contents and length + * varying with the type. + * + * To further maximise compression, two types of regular file inode and + * directory inode are defined: inodes optimised for frequently occurring + * regular files and directories, and extended types where extra + * information has to be stored. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Initialise VFS inode with the base inode information common to all + * Squashfs inode types. Sqsh_ino contains the unswapped base inode + * off disk. + */ +static int squashfs_new_inode(struct super_block *sb, struct inode *inode, + struct squashfs_base_inode *sqsh_ino) +{ + uid_t i_uid; + gid_t i_gid; + int err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); + if (err) + return err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &i_gid); + if (err) + return err; + + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); + inode->i_atime.tv_sec = inode->i_mtime.tv_sec; + inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; + inode->i_mode = le16_to_cpu(sqsh_ino->mode); + inode->i_size = 0; + + return err; +} + + +struct inode *squashfs_iget(struct super_block *sb, long long ino, + unsigned int ino_number) +{ + struct inode *inode = iget_locked(sb, ino_number); + int err; + + TRACE("Entered squashfs_iget\n"); + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = squashfs_read_inode(inode, ino); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + return inode; +} + + +/* + * Initialise VFS inode by reading inode from inode table (compressed + * metadata). The format and amount of data read depends on type. + */ +int squashfs_read_inode(struct inode *inode, long long ino) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + int err, type, offset = SQUASHFS_INODE_OFFSET(ino); + union squashfs_inode squashfs_ino; + struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; + int xattr_id = SQUASHFS_INVALID_XATTR; + + TRACE("Entered squashfs_read_inode\n"); + + /* + * Read inode base common to all inode types. + */ + err = squashfs_read_metadata(sb, sqshb_ino, &block, + &offset, sizeof(*sqshb_ino)); + if (err < 0) + goto failed_read; + + err = squashfs_new_inode(sb, inode, sqshb_ino); + if (err) + goto failed_read; + + block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + offset = SQUASHFS_INODE_OFFSET(ino); + + type = le16_to_cpu(sqshb_ino->inode_type); + switch (type) { + case SQUASHFS_REG_TYPE: { + unsigned int frag_offset, frag; + int frag_size; + u64 frag_blk; + struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + set_nlink(inode, 1); + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_LREG_TYPE: { + unsigned int frag_offset, frag; + int frag_size; + u64 frag_blk; + struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + xattr_id = le32_to_cpu(sqsh_ino->xattr); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); + inode->i_size = le64_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_inode_ops; + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = (inode->i_size - + le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; + + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_DIR_TYPE: { + struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); + inode->i_size = le16_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_cnt = 0; + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", + SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_LDIR_TYPE: { + struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + xattr_id = le32_to_cpu(sqsh_ino->xattr); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_start = block; + squashfs_i(inode)->dir_idx_offset = offset; + squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Long directory inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_SYMLINK_TYPE: + case SQUASHFS_LSYMLINK_TYPE: { + struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); + inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); + inode->i_op = &squashfs_symlink_inode_ops; + inode->i_data.a_ops = &squashfs_symlink_aops; + inode->i_mode |= S_IFLNK; + squashfs_i(inode)->start = block; + squashfs_i(inode)->offset = offset; + + if (type == SQUASHFS_LSYMLINK_TYPE) { + __le32 xattr; + + err = squashfs_read_metadata(sb, NULL, &block, + &offset, inode->i_size); + if (err < 0) + goto failed_read; + err = squashfs_read_metadata(sb, &xattr, &block, + &offset, sizeof(xattr)); + if (err < 0) + goto failed_read; + xattr_id = le32_to_cpu(xattr); + } + + TRACE("Symbolic link inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + block, offset); + break; + } + case SQUASHFS_BLKDEV_TYPE: + case SQUASHFS_CHRDEV_TYPE: { + struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_CHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); + rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } + case SQUASHFS_LBLKDEV_TYPE: + case SQUASHFS_LCHRDEV_TYPE: { + struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LCHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); + rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } + case SQUASHFS_FIFO_TYPE: + case SQUASHFS_SOCKET_TYPE: { + struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_FIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); + init_special_inode(inode, inode->i_mode, 0); + break; + } + case SQUASHFS_LFIFO_TYPE: + case SQUASHFS_LSOCKET_TYPE: { + struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LFIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); + init_special_inode(inode, inode->i_mode, 0); + break; + } + default: + ERROR("Unknown inode type %d in squashfs_iget!\n", type); + return -EINVAL; + } + + if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { + err = squashfs_xattr_lookup(sb, xattr_id, + &squashfs_i(inode)->xattr_count, + &squashfs_i(inode)->xattr_size, + &squashfs_i(inode)->xattr); + if (err < 0) + goto failed_read; + inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + + 1; + } else + squashfs_i(inode)->xattr_count = 0; + + return 0; + +failed_read: + ERROR("Unable to read inode 0x%llx\n", ino); + return err; +} + + +const struct inode_operations squashfs_inode_ops = { + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/kernel/fs/squashfs/lz4_wrapper.c b/kernel/fs/squashfs/lz4_wrapper.c new file mode 100644 index 000000000..c31e2bc9c --- /dev/null +++ b/kernel/fs/squashfs/lz4_wrapper.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2013, 2014 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +#define LZ4_LEGACY 1 + +struct lz4_comp_opts { + __le32 version; + __le32 flags; +}; + +struct squashfs_lz4 { + void *input; + void *output; +}; + + +static void *lz4_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int len) +{ + struct lz4_comp_opts *comp_opts = buff; + + /* LZ4 compressed filesystems always have compression options */ + if (comp_opts == NULL || len < sizeof(*comp_opts)) + return ERR_PTR(-EIO); + + if (le32_to_cpu(comp_opts->version) != LZ4_LEGACY) { + /* LZ4 format currently used by the kernel is the 'legacy' + * format */ + ERROR("Unknown LZ4 version\n"); + return ERR_PTR(-EINVAL); + } + + return NULL; +} + + +static void *lz4_init(struct squashfs_sb_info *msblk, void *buff) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + struct squashfs_lz4 *stream; + + stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->input = vmalloc(block_size); + if (stream->input == NULL) + goto failed2; + stream->output = vmalloc(block_size); + if (stream->output == NULL) + goto failed3; + + return stream; + +failed3: + vfree(stream->input); +failed2: + kfree(stream); +failed: + ERROR("Failed to initialise LZ4 decompressor\n"); + return ERR_PTR(-ENOMEM); +} + + +static void lz4_free(void *strm) +{ + struct squashfs_lz4 *stream = strm; + + if (stream) { + vfree(stream->input); + vfree(stream->output); + } + kfree(stream); +} + + +static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) +{ + struct squashfs_lz4 *stream = strm; + void *buff = stream->input, *data; + int avail, i, bytes = length, res; + size_t dest_len = output->length; + + for (i = 0; i < b; i++) { + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } + + res = lz4_decompress_unknownoutputsize(stream->input, length, + stream->output, &dest_len); + if (res) + return -EIO; + + bytes = dest_len; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } + squashfs_finish_page(output); + + return dest_len; +} + +const struct squashfs_decompressor squashfs_lz4_comp_ops = { + .init = lz4_init, + .comp_opts = lz4_comp_opts, + .free = lz4_free, + .decompress = lz4_uncompress, + .id = LZ4_COMPRESSION, + .name = "lz4", + .supported = 1 +}; diff --git a/kernel/fs/squashfs/lzo_wrapper.c b/kernel/fs/squashfs/lzo_wrapper.c new file mode 100644 index 000000000..244b9fbff --- /dev/null +++ b/kernel/fs/squashfs/lzo_wrapper.c @@ -0,0 +1,130 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 LG Electronics + * Chan Jeong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * lzo_wrapper.c + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +struct squashfs_lzo { + void *input; + void *output; +}; + +static void *lzo_init(struct squashfs_sb_info *msblk, void *buff) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + + struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->input = vmalloc(block_size); + if (stream->input == NULL) + goto failed; + stream->output = vmalloc(block_size); + if (stream->output == NULL) + goto failed2; + + return stream; + +failed2: + vfree(stream->input); +failed: + ERROR("Failed to allocate lzo workspace\n"); + kfree(stream); + return ERR_PTR(-ENOMEM); +} + + +static void lzo_free(void *strm) +{ + struct squashfs_lzo *stream = strm; + + if (stream) { + vfree(stream->input); + vfree(stream->output); + } + kfree(stream); +} + + +static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) +{ + struct squashfs_lzo *stream = strm; + void *buff = stream->input, *data; + int avail, i, bytes = length, res; + size_t out_len = output->length; + + for (i = 0; i < b; i++) { + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } + + res = lzo1x_decompress_safe(stream->input, (size_t)length, + stream->output, &out_len); + if (res != LZO_E_OK) + goto failed; + + res = bytes = (int)out_len; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } else { + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } + } + squashfs_finish_page(output); + + return res; + +failed: + return -EIO; +} + +const struct squashfs_decompressor squashfs_lzo_comp_ops = { + .init = lzo_init, + .free = lzo_free, + .decompress = lzo_uncompress, + .id = LZO_COMPRESSION, + .name = "lzo", + .supported = 1 +}; diff --git a/kernel/fs/squashfs/namei.c b/kernel/fs/squashfs/namei.c new file mode 100644 index 000000000..67cad77fe --- /dev/null +++ b/kernel/fs/squashfs/namei.c @@ -0,0 +1,252 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * namei.c + */ + +/* + * This file implements code to do filename lookup in directories. + * + * Like inodes, directories are packed into compressed metadata blocks, stored + * in a directory table. Directories are accessed using the start address of + * the metablock containing the directory and the offset into the + * decompressed block (). + * + * Directories are organised in a slightly complex way, and are not simply + * a list of file names. The organisation takes advantage of the + * fact that (in most cases) the inodes of the files will be in the same + * compressed metadata block, and therefore, can share the start block. + * Directories are therefore organised in a two level list, a directory + * header containing the shared start block value, and a sequence of directory + * entries, each of which share the shared start block. A new directory header + * is written once/if the inode start block changes. The directory + * header/directory entry list is repeated as many times as necessary. + * + * Directories are sorted, and can contain a directory index to speed up + * file lookup. Directory indexes store one entry per metablock, each entry + * storing the index/filename mapping to the first directory header + * in each metadata block. Directories are sorted in alphabetical order, + * and at lookup the index is scanned linearly looking for the first filename + * alphabetically larger than the filename being looked up. At this point the + * location of the metadata block the filename is in has been found. + * The general idea of the index is ensure only one metadata block needs to be + * decompressed to do a lookup irrespective of the length of the directory. + * This scheme has the advantage that it doesn't require extra memory overhead + * and doesn't require much extra storage on disk. + */ + +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Lookup name in the directory index, returning the location of the metadata + * block containing it, and the directory index this represents. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_name(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, + int index_offset, int i_count, const char *name, + int len) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int i, length = 0, err; + unsigned int size; + struct squashfs_dir_index *index; + char *str; + + TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); + + index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); + if (index == NULL) { + ERROR("Failed to allocate squashfs_dir_index\n"); + goto out; + } + + str = &index->name[SQUASHFS_NAME_LEN + 1]; + strncpy(str, name, len); + str[len] = '\0'; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, index, &index_start, + &index_offset, sizeof(*index)); + if (err < 0) + break; + + + size = le32_to_cpu(index->size) + 1; + if (size > SQUASHFS_NAME_LEN) + break; + + err = squashfs_read_metadata(sb, index->name, &index_start, + &index_offset, size); + if (err < 0) + break; + + index->name[size] = '\0'; + + if (strcmp(index->name, str) > 0) + break; + + length = le32_to_cpu(index->index); + *next_block = le32_to_cpu(index->start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + kfree(index); + +out: + /* + * Return index (f_pos) of the looked up metadata block. Translate + * from internal f_pos to external f_pos which is offset by 3 because + * we invent "." and ".." entries which are not actually stored in the + * directory. + */ + return length + 3; +} + + +static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + const unsigned char *name = dentry->d_name.name; + int len = dentry->d_name.len; + struct inode *inode = NULL; + struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + u64 block = squashfs_i(dir)->start + msblk->directory_table; + int offset = squashfs_i(dir)->offset; + int err, length; + unsigned int dir_count, size; + + TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + return ERR_PTR(-ENOMEM); + } + + if (len > SQUASHFS_NAME_LEN) { + err = -ENAMETOOLONG; + goto failed; + } + + length = get_dir_index_using_name(dir->i_sb, &block, &offset, + squashfs_i(dir)->dir_idx_start, + squashfs_i(dir)->dir_idx_offset, + squashfs_i(dir)->dir_idx_cnt, name, len); + + while (length < i_size_read(dir)) { + /* + * Read directory header. + */ + err = squashfs_read_metadata(dir->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto read_failure; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + + if (dir_count > SQUASHFS_DIR_COUNT) + goto data_error; + + while (dir_count--) { + /* + * Read directory entry. + */ + err = squashfs_read_metadata(dir->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto read_failure; + + size = le16_to_cpu(dire->size) + 1; + + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + goto data_error; + + err = squashfs_read_metadata(dir->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto read_failure; + + length += sizeof(*dire) + size; + + if (name[0] < dire->name[0]) + goto exit_lookup; + + if (len == size && !strncmp(name, dire->name, len)) { + unsigned int blk, off, ino_num; + long long ino; + blk = le32_to_cpu(dirh.start_block); + off = le16_to_cpu(dire->offset); + ino_num = le32_to_cpu(dirh.inode_number) + + (short) le16_to_cpu(dire->inode_number); + ino = SQUASHFS_MKINODE(blk, off); + + TRACE("calling squashfs_iget for directory " + "entry %s, inode %x:%x, %d\n", name, + blk, off, ino_num); + + inode = squashfs_iget(dir->i_sb, ino, ino_num); + goto exit_lookup; + } + } + } + +exit_lookup: + kfree(dire); + return d_splice_alias(inode, dentry); + +data_error: + err = -EIO; + +read_failure: + ERROR("Unable to read directory block [%llx:%x]\n", + squashfs_i(dir)->start + msblk->directory_table, + squashfs_i(dir)->offset); +failed: + kfree(dire); + return ERR_PTR(err); +} + + +const struct inode_operations squashfs_dir_inode_ops = { + .lookup = squashfs_lookup, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; diff --git a/kernel/fs/squashfs/page_actor.c b/kernel/fs/squashfs/page_actor.c new file mode 100644 index 000000000..5a1c11f56 --- /dev/null +++ b/kernel/fs/squashfs/page_actor.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include "page_actor.h" + +/* + * This file contains implementations of page_actor for decompressing into + * an intermediate buffer, and for decompressing directly into the + * page cache. + * + * Calling code should avoid sleeping between calls to squashfs_first_page() + * and squashfs_finish_page(). + */ + +/* Implementation of page_actor for decompressing into intermediate buffer */ +static void *cache_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->buffer[0]; +} + +static void *cache_next_page(struct squashfs_page_actor *actor) +{ + if (actor->next_page == actor->pages) + return NULL; + + return actor->buffer[actor->next_page++]; +} + +static void cache_finish_page(struct squashfs_page_actor *actor) +{ + /* empty */ +} + +struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->buffer = buffer; + actor->pages = pages; + actor->next_page = 0; + actor->squashfs_first_page = cache_first_page; + actor->squashfs_next_page = cache_next_page; + actor->squashfs_finish_page = cache_finish_page; + return actor; +} + +/* Implementation of page_actor for decompressing directly into page cache. */ +static void *direct_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->pageaddr = kmap_atomic(actor->page[0]); +} + +static void *direct_next_page(struct squashfs_page_actor *actor) +{ + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); + + return actor->pageaddr = actor->next_page == actor->pages ? NULL : + kmap_atomic(actor->page[actor->next_page++]); +} + +static void direct_finish_page(struct squashfs_page_actor *actor) +{ + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); +} + +struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + actor->pageaddr = NULL; + actor->squashfs_first_page = direct_first_page; + actor->squashfs_next_page = direct_next_page; + actor->squashfs_finish_page = direct_finish_page; + return actor; +} diff --git a/kernel/fs/squashfs/page_actor.h b/kernel/fs/squashfs/page_actor.h new file mode 100644 index 000000000..26dd82008 --- /dev/null +++ b/kernel/fs/squashfs/page_actor.h @@ -0,0 +1,81 @@ +#ifndef PAGE_ACTOR_H +#define PAGE_ACTOR_H +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef CONFIG_SQUASHFS_FILE_DIRECT +struct squashfs_page_actor { + void **page; + int pages; + int length; + int next_page; +}; + +static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + return actor; +} + +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->page[0]; +} + +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->next_page == actor->pages ? NULL : + actor->page[actor->next_page++]; +} + +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + /* empty */ +} +#else +struct squashfs_page_actor { + union { + void **buffer; + struct page **page; + }; + void *pageaddr; + void *(*squashfs_first_page)(struct squashfs_page_actor *); + void *(*squashfs_next_page)(struct squashfs_page_actor *); + void (*squashfs_finish_page)(struct squashfs_page_actor *); + int pages; + int length; + int next_page; +}; + +extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page + **, int, int); +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_first_page(actor); +} +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_next_page(actor); +} +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + actor->squashfs_finish_page(actor); +} +#endif +#endif diff --git a/kernel/fs/squashfs/squashfs.h b/kernel/fs/squashfs/squashfs.h new file mode 100644 index 000000000..887d6d270 --- /dev/null +++ b/kernel/fs/squashfs/squashfs.h @@ -0,0 +1,113 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs.h + */ + +#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args) + +#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) + +#define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) + +/* block.c */ +extern int squashfs_read_data(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); + +/* cache.c */ +extern struct squashfs_cache *squashfs_cache_init(char *, int, int); +extern void squashfs_cache_delete(struct squashfs_cache *); +extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *, + struct squashfs_cache *, u64, int); +extern void squashfs_cache_put(struct squashfs_cache_entry *); +extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int); +extern int squashfs_read_metadata(struct super_block *, void *, u64 *, + int *, int); +extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *, + u64, int); +extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, + u64, int); +extern void *squashfs_read_table(struct super_block *, u64, int); + +/* decompressor.c */ +extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); +extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); + +/* decompressor_xxx.c */ +extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); +extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); +extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, + int, int, int, struct squashfs_page_actor *); +extern int squashfs_max_decompressors(void); + +/* export.c */ +extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, + unsigned int); + +/* fragment.c */ +extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); +extern __le64 *squashfs_read_fragment_index_table(struct super_block *, + u64, u64, unsigned int); + +/* file.c */ +void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, + int); + +/* file_xxx.c */ +extern int squashfs_readpage_block(struct page *, u64, int); + +/* id.c */ +extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); +extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64, + unsigned short); + +/* inode.c */ +extern struct inode *squashfs_iget(struct super_block *, long long, + unsigned int); +extern int squashfs_read_inode(struct inode *, long long); + +/* xattr.c */ +extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t); + +/* + * Inodes, files, decompressor and xattr operations + */ + +/* dir.c */ +extern const struct file_operations squashfs_dir_ops; + +/* export.c */ +extern const struct export_operations squashfs_export_ops; + +/* file.c */ +extern const struct address_space_operations squashfs_aops; + +/* inode.c */ +extern const struct inode_operations squashfs_inode_ops; + +/* namei.c */ +extern const struct inode_operations squashfs_dir_inode_ops; + +/* symlink.c */ +extern const struct address_space_operations squashfs_symlink_aops; +extern const struct inode_operations squashfs_symlink_inode_ops; + +/* xattr.c */ +extern const struct xattr_handler *squashfs_xattr_handlers[]; diff --git a/kernel/fs/squashfs/squashfs_fs.h b/kernel/fs/squashfs/squashfs_fs.h new file mode 100644 index 000000000..506f4ba5b --- /dev/null +++ b/kernel/fs/squashfs/squashfs_fs.h @@ -0,0 +1,457 @@ +#ifndef SQUASHFS_FS +#define SQUASHFS_FS +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs.h + */ + +#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE +#define SQUASHFS_MAJOR 4 +#define SQUASHFS_MINOR 0 +#define SQUASHFS_START 0 + +/* size of metadata (inode and directory) blocks */ +#define SQUASHFS_METADATA_SIZE 8192 + +/* default size of block device I/O */ +#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE +#define SQUASHFS_DEVBLK_SIZE 4096 +#else +#define SQUASHFS_DEVBLK_SIZE 1024 +#endif + +#define SQUASHFS_FILE_MAX_SIZE 1048576 +#define SQUASHFS_FILE_MAX_LOG 20 + +/* Max length of filename (not 255) */ +#define SQUASHFS_NAME_LEN 256 + +/* Max value for directory header count*/ +#define SQUASHFS_DIR_COUNT 256 + +#define SQUASHFS_INVALID_FRAG (0xffffffffU) +#define SQUASHFS_INVALID_XATTR (0xffffffffU) +#define SQUASHFS_INVALID_BLK (-1LL) + +/* Filesystem flags */ +#define SQUASHFS_NOI 0 +#define SQUASHFS_NOD 1 +#define SQUASHFS_NOF 3 +#define SQUASHFS_NO_FRAG 4 +#define SQUASHFS_ALWAYS_FRAG 5 +#define SQUASHFS_DUPLICATE 6 +#define SQUASHFS_EXPORT 7 +#define SQUASHFS_COMP_OPT 10 + +#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) + +#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOI) + +#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOD) + +#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOF) + +#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NO_FRAG) + +#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_ALWAYS_FRAG) + +#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_DUPLICATE) + +#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_EXPORT) + +#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_COMP_OPT) + +/* Inode types including extended types */ +#define SQUASHFS_DIR_TYPE 1 +#define SQUASHFS_REG_TYPE 2 +#define SQUASHFS_SYMLINK_TYPE 3 +#define SQUASHFS_BLKDEV_TYPE 4 +#define SQUASHFS_CHRDEV_TYPE 5 +#define SQUASHFS_FIFO_TYPE 6 +#define SQUASHFS_SOCKET_TYPE 7 +#define SQUASHFS_LDIR_TYPE 8 +#define SQUASHFS_LREG_TYPE 9 +#define SQUASHFS_LSYMLINK_TYPE 10 +#define SQUASHFS_LBLKDEV_TYPE 11 +#define SQUASHFS_LCHRDEV_TYPE 12 +#define SQUASHFS_LFIFO_TYPE 13 +#define SQUASHFS_LSOCKET_TYPE 14 + +/* Max type value stored in directory entry */ +#define SQUASHFS_MAX_DIR_TYPE 7 + +/* Xattr types */ +#define SQUASHFS_XATTR_USER 0 +#define SQUASHFS_XATTR_TRUSTED 1 +#define SQUASHFS_XATTR_SECURITY 2 +#define SQUASHFS_XATTR_VALUE_OOL 256 +#define SQUASHFS_XATTR_PREFIX_MASK 0xff + +/* Flag whether block is compressed or uncompressed, bit is set if block is + * uncompressed */ +#define SQUASHFS_COMPRESSED_BIT (1 << 15) + +#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \ + (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT) + +#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT)) + +#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24) + +#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \ + ~SQUASHFS_COMPRESSED_BIT_BLOCK) + +#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK)) + +/* + * Inode number ops. Inodes consist of a compressed block number, and an + * uncompressed offset within that block + */ +#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff)) + +#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\ + << 16) + (B))) + +/* fragment and fragment table defines */ +#define SQUASHFS_FRAGMENT_BYTES(A) \ + ((A) * sizeof(struct squashfs_fragment_entry)) + +#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\ + sizeof(u64)) + +/* inode lookup table defines */ +#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64)) + +#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\ + sizeof(u64)) + +/* uid/gid lookup table defines */ +#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int)) + +#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ + sizeof(u64)) +/* xattr id lookup table defines */ +#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) + +#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\ + sizeof(u64)) +#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff)) + +/* cached data constants for filesystem */ +#define SQUASHFS_CACHED_BLKS 8 + +/* meta index cache */ +#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) +#define SQUASHFS_META_ENTRIES 127 +#define SQUASHFS_META_SLOTS 8 + +struct meta_entry { + u64 data_block; + unsigned int index_block; + unsigned short offset; + unsigned short pad; +}; + +struct meta_index { + unsigned int inode_number; + unsigned int offset; + unsigned short entries; + unsigned short skip; + unsigned short locked; + unsigned short pad; + struct meta_entry meta_entry[SQUASHFS_META_ENTRIES]; +}; + + +/* + * definitions for structures on disk + */ +#define ZLIB_COMPRESSION 1 +#define LZMA_COMPRESSION 2 +#define LZO_COMPRESSION 3 +#define XZ_COMPRESSION 4 +#define LZ4_COMPRESSION 5 + +struct squashfs_super_block { + __le32 s_magic; + __le32 inodes; + __le32 mkfs_time; + __le32 block_size; + __le32 fragments; + __le16 compression; + __le16 block_log; + __le16 flags; + __le16 no_ids; + __le16 s_major; + __le16 s_minor; + __le64 root_inode; + __le64 bytes_used; + __le64 id_table_start; + __le64 xattr_id_table_start; + __le64 inode_table_start; + __le64 directory_table_start; + __le64 fragment_table_start; + __le64 lookup_table_start; +}; + +struct squashfs_dir_index { + __le32 index; + __le32 start_block; + __le32 size; + unsigned char name[0]; +}; + +struct squashfs_base_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; +}; + +struct squashfs_ipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; +}; + +struct squashfs_lipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 xattr; +}; + +struct squashfs_dev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; +}; + +struct squashfs_ldev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; + __le32 xattr; +}; + +struct squashfs_symlink_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 symlink_size; + char symlink[0]; +}; + +struct squashfs_reg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 fragment; + __le32 offset; + __le32 file_size; + __le16 block_list[0]; +}; + +struct squashfs_lreg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le64 start_block; + __le64 file_size; + __le64 sparse; + __le32 nlink; + __le32 fragment; + __le32 offset; + __le32 xattr; + __le16 block_list[0]; +}; + +struct squashfs_dir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 nlink; + __le16 file_size; + __le16 offset; + __le32 parent_inode; +}; + +struct squashfs_ldir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 file_size; + __le32 start_block; + __le32 parent_inode; + __le16 i_count; + __le16 offset; + __le32 xattr; + struct squashfs_dir_index index[0]; +}; + +union squashfs_inode { + struct squashfs_base_inode base; + struct squashfs_dev_inode dev; + struct squashfs_ldev_inode ldev; + struct squashfs_symlink_inode symlink; + struct squashfs_reg_inode reg; + struct squashfs_lreg_inode lreg; + struct squashfs_dir_inode dir; + struct squashfs_ldir_inode ldir; + struct squashfs_ipc_inode ipc; + struct squashfs_lipc_inode lipc; +}; + +struct squashfs_dir_entry { + __le16 offset; + __le16 inode_number; + __le16 type; + __le16 size; + char name[0]; +}; + +struct squashfs_dir_header { + __le32 count; + __le32 start_block; + __le32 inode_number; +}; + +struct squashfs_fragment_entry { + __le64 start_block; + __le32 size; + unsigned int unused; +}; + +struct squashfs_xattr_entry { + __le16 type; + __le16 size; + char data[0]; +}; + +struct squashfs_xattr_val { + __le32 vsize; + char value[0]; +}; + +struct squashfs_xattr_id { + __le64 xattr; + __le32 count; + __le32 size; +}; + +struct squashfs_xattr_id_table { + __le64 xattr_table_start; + __le32 xattr_ids; + __le32 unused; +}; + +#endif diff --git a/kernel/fs/squashfs/squashfs_fs_i.h b/kernel/fs/squashfs/squashfs_fs_i.h new file mode 100644 index 000000000..73588e770 --- /dev/null +++ b/kernel/fs/squashfs/squashfs_fs_i.h @@ -0,0 +1,54 @@ +#ifndef SQUASHFS_FS_I +#define SQUASHFS_FS_I +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs_i.h + */ + +struct squashfs_inode_info { + u64 start; + int offset; + u64 xattr; + unsigned int xattr_size; + int xattr_count; + union { + struct { + u64 fragment_block; + int fragment_size; + int fragment_offset; + u64 block_list_start; + }; + struct { + u64 dir_idx_start; + int dir_idx_offset; + int dir_idx_cnt; + int parent; + }; + }; + struct inode vfs_inode; +}; + + +static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) +{ + return list_entry(inode, struct squashfs_inode_info, vfs_inode); +} +#endif diff --git a/kernel/fs/squashfs/squashfs_fs_sb.h b/kernel/fs/squashfs/squashfs_fs_sb.h new file mode 100644 index 000000000..1da565cb5 --- /dev/null +++ b/kernel/fs/squashfs/squashfs_fs_sb.h @@ -0,0 +1,80 @@ +#ifndef SQUASHFS_FS_SB +#define SQUASHFS_FS_SB +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs_sb.h + */ + +#include "squashfs_fs.h" + +struct squashfs_cache { + char *name; + int entries; + int curr_blk; + int next_blk; + int num_waiters; + int unused; + int block_size; + int pages; + spinlock_t lock; + wait_queue_head_t wait_queue; + struct squashfs_cache_entry *entry; +}; + +struct squashfs_cache_entry { + u64 block; + int length; + int refcount; + u64 next_index; + int pending; + int error; + int num_waiters; + wait_queue_head_t wait_queue; + struct squashfs_cache *cache; + void **data; + struct squashfs_page_actor *actor; +}; + +struct squashfs_sb_info { + const struct squashfs_decompressor *decompressor; + int devblksize; + int devblksize_log2; + struct squashfs_cache *block_cache; + struct squashfs_cache *fragment_cache; + struct squashfs_cache *read_page; + int next_meta_index; + __le64 *id_table; + __le64 *fragment_index; + __le64 *xattr_id_table; + struct mutex meta_index_mutex; + struct meta_index *meta_index; + struct squashfs_stream *stream; + __le64 *inode_lookup_table; + u64 inode_table; + u64 directory_table; + u64 xattr_table; + unsigned int block_size; + unsigned short block_log; + long long bytes_used; + unsigned int inodes; + int xattr_ids; +}; +#endif diff --git a/kernel/fs/squashfs/super.c b/kernel/fs/squashfs/super.c new file mode 100644 index 000000000..5056babe0 --- /dev/null +++ b/kernel/fs/squashfs/super.c @@ -0,0 +1,508 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * super.c + */ + +/* + * This file implements code to read the superblock, read and initialise + * in-memory structures at mount time, and all the VFS glue code to register + * the filesystem. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" +#include "xattr.h" + +static struct file_system_type squashfs_fs_type; +static const struct super_operations squashfs_super_ops; + +static const struct squashfs_decompressor *supported_squashfs_filesystem(short + major, short minor, short id) +{ + const struct squashfs_decompressor *decompressor; + + if (major < SQUASHFS_MAJOR) { + ERROR("Major/Minor mismatch, older Squashfs %d.%d " + "filesystems are unsupported\n", major, minor); + return NULL; + } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { + ERROR("Major/Minor mismatch, trying to mount newer " + "%d.%d filesystem\n", major, minor); + ERROR("Please update your kernel\n"); + return NULL; + } + + decompressor = squashfs_lookup_decompressor(id); + if (!decompressor->supported) { + ERROR("Filesystem uses \"%s\" compression. This is not " + "supported\n", decompressor->name); + return NULL; + } + + return decompressor; +} + + +static int squashfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct squashfs_sb_info *msblk; + struct squashfs_super_block *sblk = NULL; + char b[BDEVNAME_SIZE]; + struct inode *root; + long long root_inode; + unsigned short flags; + unsigned int fragments; + u64 lookup_table_start, xattr_id_table_start, next_table; + int err; + + TRACE("Entered squashfs_fill_superblock\n"); + + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); + if (sb->s_fs_info == NULL) { + ERROR("Failed to allocate squashfs_sb_info\n"); + return -ENOMEM; + } + msblk = sb->s_fs_info; + + msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); + msblk->devblksize_log2 = ffz(~msblk->devblksize); + + mutex_init(&msblk->meta_index_mutex); + + /* + * msblk->bytes_used is checked in squashfs_read_table to ensure reads + * are not beyond filesystem end. But as we're using + * squashfs_read_table here to read the superblock (including the value + * of bytes_used) we need to set it to an initial sensible dummy value + */ + msblk->bytes_used = sizeof(*sblk); + sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk)); + + if (IS_ERR(sblk)) { + ERROR("unable to read squashfs_super_block\n"); + err = PTR_ERR(sblk); + sblk = NULL; + goto failed_mount; + } + + err = -EINVAL; + + /* Check it is a SQUASHFS superblock */ + sb->s_magic = le32_to_cpu(sblk->s_magic); + if (sb->s_magic != SQUASHFS_MAGIC) { + if (!silent) + ERROR("Can't find a SQUASHFS superblock on %s\n", + bdevname(sb->s_bdev, b)); + goto failed_mount; + } + + /* Check the MAJOR & MINOR versions and lookup compression type */ + msblk->decompressor = supported_squashfs_filesystem( + le16_to_cpu(sblk->s_major), + le16_to_cpu(sblk->s_minor), + le16_to_cpu(sblk->compression)); + if (msblk->decompressor == NULL) + goto failed_mount; + + /* Check the filesystem does not extend beyond the end of the + block device */ + msblk->bytes_used = le64_to_cpu(sblk->bytes_used); + if (msblk->bytes_used < 0 || msblk->bytes_used > + i_size_read(sb->s_bdev->bd_inode)) + goto failed_mount; + + /* Check block size for sanity */ + msblk->block_size = le32_to_cpu(sblk->block_size); + if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) + goto failed_mount; + + /* + * Check the system page size is not larger than the filesystem + * block size (by default 128K). This is currently not supported. + */ + if (PAGE_CACHE_SIZE > msblk->block_size) { + ERROR("Page size > filesystem block size (%d). This is " + "currently not supported!\n", msblk->block_size); + goto failed_mount; + } + + /* Check block log for sanity */ + msblk->block_log = le16_to_cpu(sblk->block_log); + if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) + goto failed_mount; + + /* Check that block_size and block_log match */ + if (msblk->block_size != (1 << msblk->block_log)) + goto failed_mount; + + /* Check the root inode for sanity */ + root_inode = le64_to_cpu(sblk->root_inode); + if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) + goto failed_mount; + + msblk->inode_table = le64_to_cpu(sblk->inode_table_start); + msblk->directory_table = le64_to_cpu(sblk->directory_table_start); + msblk->inodes = le32_to_cpu(sblk->inodes); + flags = le16_to_cpu(sblk->flags); + + TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b)); + TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) + ? "un" : ""); + TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) + ? "un" : ""); + TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); + TRACE("Block size %d\n", msblk->block_size); + TRACE("Number of inodes %d\n", msblk->inodes); + TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments)); + TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); + TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); + TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); + TRACE("sblk->fragment_table_start %llx\n", + (u64) le64_to_cpu(sblk->fragment_table_start)); + TRACE("sblk->id_table_start %llx\n", + (u64) le64_to_cpu(sblk->id_table_start)); + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_flags |= MS_RDONLY; + sb->s_op = &squashfs_super_ops; + + err = -ENOMEM; + + msblk->block_cache = squashfs_cache_init("metadata", + SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); + if (msblk->block_cache == NULL) + goto failed_mount; + + /* Allocate read_page block */ + msblk->read_page = squashfs_cache_init("data", + squashfs_max_decompressors(), msblk->block_size); + if (msblk->read_page == NULL) { + ERROR("Failed to allocate read_page block\n"); + goto failed_mount; + } + + msblk->stream = squashfs_decompressor_setup(sb, flags); + if (IS_ERR(msblk->stream)) { + err = PTR_ERR(msblk->stream); + msblk->stream = NULL; + goto failed_mount; + } + + /* Handle xattrs */ + sb->s_xattr = squashfs_xattr_handlers; + xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); + if (xattr_id_table_start == SQUASHFS_INVALID_BLK) { + next_table = msblk->bytes_used; + goto allocate_id_index_table; + } + + /* Allocate and read xattr id lookup table */ + msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, + xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); + if (IS_ERR(msblk->xattr_id_table)) { + ERROR("unable to read xattr id index table\n"); + err = PTR_ERR(msblk->xattr_id_table); + msblk->xattr_id_table = NULL; + if (err != -ENOTSUPP) + goto failed_mount; + } + next_table = msblk->xattr_table; + +allocate_id_index_table: + /* Allocate and read id index table */ + msblk->id_table = squashfs_read_id_index_table(sb, + le64_to_cpu(sblk->id_table_start), next_table, + le16_to_cpu(sblk->no_ids)); + if (IS_ERR(msblk->id_table)) { + ERROR("unable to read id index table\n"); + err = PTR_ERR(msblk->id_table); + msblk->id_table = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->id_table[0]); + + /* Handle inode lookup table */ + lookup_table_start = le64_to_cpu(sblk->lookup_table_start); + if (lookup_table_start == SQUASHFS_INVALID_BLK) + goto handle_fragments; + + /* Allocate and read inode lookup table */ + msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, + lookup_table_start, next_table, msblk->inodes); + if (IS_ERR(msblk->inode_lookup_table)) { + ERROR("unable to read inode lookup table\n"); + err = PTR_ERR(msblk->inode_lookup_table); + msblk->inode_lookup_table = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->inode_lookup_table[0]); + + sb->s_export_op = &squashfs_export_ops; + +handle_fragments: + fragments = le32_to_cpu(sblk->fragments); + if (fragments == 0) + goto check_directory_table; + + msblk->fragment_cache = squashfs_cache_init("fragment", + SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); + if (msblk->fragment_cache == NULL) { + err = -ENOMEM; + goto failed_mount; + } + + /* Allocate and read fragment index table */ + msblk->fragment_index = squashfs_read_fragment_index_table(sb, + le64_to_cpu(sblk->fragment_table_start), next_table, fragments); + if (IS_ERR(msblk->fragment_index)) { + ERROR("unable to read fragment index table\n"); + err = PTR_ERR(msblk->fragment_index); + msblk->fragment_index = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->fragment_index[0]); + +check_directory_table: + /* Sanity check directory_table */ + if (msblk->directory_table > next_table) { + err = -EINVAL; + goto failed_mount; + } + + /* Sanity check inode_table */ + if (msblk->inode_table >= msblk->directory_table) { + err = -EINVAL; + goto failed_mount; + } + + /* allocate root */ + root = new_inode(sb); + if (!root) { + err = -ENOMEM; + goto failed_mount; + } + + err = squashfs_read_inode(root, root_inode); + if (err) { + make_bad_inode(root); + iput(root); + goto failed_mount; + } + insert_inode_hash(root); + + sb->s_root = d_make_root(root); + if (sb->s_root == NULL) { + ERROR("Root inode create failed\n"); + err = -ENOMEM; + goto failed_mount; + } + + TRACE("Leaving squashfs_fill_super\n"); + kfree(sblk); + return 0; + +failed_mount: + squashfs_cache_delete(msblk->block_cache); + squashfs_cache_delete(msblk->fragment_cache); + squashfs_cache_delete(msblk->read_page); + squashfs_decompressor_destroy(msblk); + kfree(msblk->inode_lookup_table); + kfree(msblk->fragment_index); + kfree(msblk->id_table); + kfree(msblk->xattr_id_table); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + kfree(sblk); + return err; +} + + +static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; + u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); + + TRACE("Entered squashfs_statfs\n"); + + buf->f_type = SQUASHFS_MAGIC; + buf->f_bsize = msblk->block_size; + buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; + buf->f_bfree = buf->f_bavail = 0; + buf->f_files = msblk->inodes; + buf->f_ffree = 0; + buf->f_namelen = SQUASHFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + + +static int squashfs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + *flags |= MS_RDONLY; + return 0; +} + + +static void squashfs_put_super(struct super_block *sb) +{ + if (sb->s_fs_info) { + struct squashfs_sb_info *sbi = sb->s_fs_info; + squashfs_cache_delete(sbi->block_cache); + squashfs_cache_delete(sbi->fragment_cache); + squashfs_cache_delete(sbi->read_page); + squashfs_decompressor_destroy(sbi); + kfree(sbi->id_table); + kfree(sbi->fragment_index); + kfree(sbi->meta_index); + kfree(sbi->inode_lookup_table); + kfree(sbi->xattr_id_table); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + } +} + + +static struct dentry *squashfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); +} + + +static struct kmem_cache *squashfs_inode_cachep; + + +static void init_once(void *foo) +{ + struct squashfs_inode_info *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + + +static int __init init_inodecache(void) +{ + squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", + sizeof(struct squashfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + + return squashfs_inode_cachep ? 0 : -ENOMEM; +} + + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(squashfs_inode_cachep); +} + + +static int __init init_squashfs_fs(void) +{ + int err = init_inodecache(); + + if (err) + return err; + + err = register_filesystem(&squashfs_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); + + return 0; +} + + +static void __exit exit_squashfs_fs(void) +{ + unregister_filesystem(&squashfs_fs_type); + destroy_inodecache(); +} + + +static struct inode *squashfs_alloc_inode(struct super_block *sb) +{ + struct squashfs_inode_info *ei = + kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL); + + return ei ? &ei->vfs_inode : NULL; +} + + +static void squashfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); +} + +static void squashfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, squashfs_i_callback); +} + + +static struct file_system_type squashfs_fs_type = { + .owner = THIS_MODULE, + .name = "squashfs", + .mount = squashfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV +}; +MODULE_ALIAS_FS("squashfs"); + +static const struct super_operations squashfs_super_ops = { + .alloc_inode = squashfs_alloc_inode, + .destroy_inode = squashfs_destroy_inode, + .statfs = squashfs_statfs, + .put_super = squashfs_put_super, + .remount_fs = squashfs_remount +}; + +module_init(init_squashfs_fs); +module_exit(exit_squashfs_fs); +MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); +MODULE_AUTHOR("Phillip Lougher "); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/squashfs/symlink.c b/kernel/fs/squashfs/symlink.c new file mode 100644 index 000000000..12806dffb --- /dev/null +++ b/kernel/fs/squashfs/symlink.c @@ -0,0 +1,127 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * symlink.c + */ + +/* + * This file implements code to handle symbolic links. + * + * The data contents of symbolic links are stored inside the symbolic + * link inode within the inode table. This allows the normally small symbolic + * link to be compressed as part of the inode table, achieving much greater + * compression than if the symbolic link was compressed individually. + */ + +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +static int squashfs_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + int index = page->index << PAGE_CACHE_SHIFT; + u64 block = squashfs_i(inode)->start; + int offset = squashfs_i(inode)->offset; + int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE); + int bytes, copied; + void *pageaddr; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " + "%llx, offset %x\n", page->index, block, offset); + + /* + * Skip index bytes into symlink metadata. + */ + if (index) { + bytes = squashfs_read_metadata(sb, NULL, &block, &offset, + index); + if (bytes < 0) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + goto error_out; + } + } + + /* + * Read length bytes from symlink metadata. Squashfs_read_metadata + * is not used here because it can sleep and we want to use + * kmap_atomic to map the page. Instead call the underlying + * squashfs_cache_get routine. As length bytes may overlap metadata + * blocks, we may need to call squashfs_cache_get multiple times. + */ + for (bytes = 0; bytes < length; offset = 0, bytes += copied) { + entry = squashfs_cache_get(sb, msblk->block_cache, block, 0); + if (entry->error) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + squashfs_cache_put(entry); + goto error_out; + } + + pageaddr = kmap_atomic(page); + copied = squashfs_copy_data(pageaddr + bytes, entry, offset, + length - bytes); + if (copied == length - bytes) + memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length); + else + block = entry->next_index; + kunmap_atomic(pageaddr); + squashfs_cache_put(entry); + } + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + return 0; + +error_out: + SetPageError(page); + unlock_page(page); + return 0; +} + + +const struct address_space_operations squashfs_symlink_aops = { + .readpage = squashfs_symlink_readpage +}; + +const struct inode_operations squashfs_symlink_inode_ops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/kernel/fs/squashfs/xattr.c b/kernel/fs/squashfs/xattr.c new file mode 100644 index 000000000..e5e0ddf5b --- /dev/null +++ b/kernel/fs/squashfs/xattr.c @@ -0,0 +1,324 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.c + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const struct xattr_handler *squashfs_xattr_handler(int); + +ssize_t squashfs_listxattr(struct dentry *d, char *buffer, + size_t buffer_size) +{ + struct inode *inode = d_inode(d); + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + size_t rest = buffer_size; + int err; + + /* check that the file system has xattrs */ + if (msblk->xattr_id_table == NULL) + return -EOPNOTSUPP; + + /* loop reading each xattr name */ + while (count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + const struct xattr_handler *handler; + int name_size, prefix_size = 0; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + handler = squashfs_xattr_handler(le16_to_cpu(entry.type)); + if (handler) + prefix_size = handler->list(d, buffer, rest, NULL, + name_size, handler->flags); + if (prefix_size) { + if (buffer) { + if (prefix_size + name_size + 1 > rest) { + err = -ERANGE; + goto failed; + } + buffer += prefix_size; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, name_size); + if (err < 0) + goto failed; + if (buffer) { + buffer[name_size] = '\0'; + buffer += name_size + 1; + } + rest -= prefix_size + name_size + 1; + } else { + /* no handler or insuffficient privileges, so skip */ + err = squashfs_read_metadata(sb, NULL, &start, + &offset, name_size); + if (err < 0) + goto failed; + } + + + /* skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = buffer_size - rest; + +failed: + return err; +} + + +static int squashfs_xattr_get(struct inode *inode, int name_index, + const char *name, void *buffer, size_t buffer_size) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + int name_len = strlen(name); + int err, vsize; + char *target = kmalloc(name_len, GFP_KERNEL); + + if (target == NULL) + return -ENOMEM; + + /* loop reading each xattr name */ + for (; count; count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + int type, prefix, name_size; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + type = le16_to_cpu(entry.type); + prefix = type & SQUASHFS_XATTR_PREFIX_MASK; + + if (prefix == name_index && name_size == name_len) + err = squashfs_read_metadata(sb, target, &start, + &offset, name_size); + else + err = squashfs_read_metadata(sb, NULL, &start, + &offset, name_size); + if (err < 0) + goto failed; + + if (prefix == name_index && name_size == name_len && + strncmp(target, name, name_size) == 0) { + /* found xattr */ + if (type & SQUASHFS_XATTR_VALUE_OOL) { + __le64 xattr_val; + u64 xattr; + /* val is a reference to the real location */ + err = squashfs_read_metadata(sb, &val, &start, + &offset, sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, &xattr_val, + &start, &offset, sizeof(xattr_val)); + if (err < 0) + goto failed; + xattr = le64_to_cpu(xattr_val); + start = SQUASHFS_XATTR_BLK(xattr) + + msblk->xattr_table; + offset = SQUASHFS_XATTR_OFFSET(xattr); + } + /* read xattr value */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + vsize = le32_to_cpu(val.vsize); + if (buffer) { + if (vsize > buffer_size) { + err = -ERANGE; + goto failed; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, vsize); + if (err < 0) + goto failed; + } + break; + } + + /* no match, skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = count ? vsize : -ENODATA; + +failed: + kfree(target); + return err; +} + + +/* + * User namespace support + */ +static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + if (list && XATTR_USER_PREFIX_LEN <= list_size) + memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + return XATTR_USER_PREFIX_LEN; +} + +static int squashfs_user_get(struct dentry *d, const char *name, void *buffer, + size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = squashfs_user_list, + .get = squashfs_user_get +}; + +/* + * Trusted namespace support + */ +static size_t squashfs_trusted_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size) + memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return XATTR_TRUSTED_PREFIX_LEN; +} + +static int squashfs_trusted_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = squashfs_trusted_list, + .get = squashfs_trusted_get +}; + +/* + * Security namespace support + */ +static size_t squashfs_security_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (list && XATTR_SECURITY_PREFIX_LEN <= list_size) + memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); + return XATTR_SECURITY_PREFIX_LEN; +} + +static int squashfs_security_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = squashfs_security_list, + .get = squashfs_security_get +}; + +static const struct xattr_handler *squashfs_xattr_handler(int type) +{ + if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL)) + /* ignore unrecognised type */ + return NULL; + + switch (type & SQUASHFS_XATTR_PREFIX_MASK) { + case SQUASHFS_XATTR_USER: + return &squashfs_xattr_user_handler; + case SQUASHFS_XATTR_TRUSTED: + return &squashfs_xattr_trusted_handler; + case SQUASHFS_XATTR_SECURITY: + return &squashfs_xattr_security_handler; + default: + /* ignore unrecognised type */ + return NULL; + } +} + +const struct xattr_handler *squashfs_xattr_handlers[] = { + &squashfs_xattr_user_handler, + &squashfs_xattr_trusted_handler, + &squashfs_xattr_security_handler, + NULL +}; + diff --git a/kernel/fs/squashfs/xattr.h b/kernel/fs/squashfs/xattr.h new file mode 100644 index 000000000..c83f5d9ec --- /dev/null +++ b/kernel/fs/squashfs/xattr.h @@ -0,0 +1,47 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.h + */ + +#ifdef CONFIG_SQUASHFS_XATTR +extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, + u64 *, int *); +extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, + unsigned int *, unsigned long long *); +#else +static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, + u64 start, u64 *xattr_table_start, int *xattr_ids) +{ + ERROR("Xattrs in filesystem, these will be ignored\n"); + *xattr_table_start = start; + return ERR_PTR(-ENOTSUPP); +} + +static inline int squashfs_xattr_lookup(struct super_block *sb, + unsigned int index, int *count, unsigned int *size, + unsigned long long *xattr) +{ + return 0; +} +#define squashfs_listxattr NULL +#define generic_getxattr NULL +#define squashfs_xattr_handlers NULL +#endif diff --git a/kernel/fs/squashfs/xattr_id.c b/kernel/fs/squashfs/xattr_id.c new file mode 100644 index 000000000..c89607d69 --- /dev/null +++ b/kernel/fs/squashfs/xattr_id.c @@ -0,0 +1,95 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr_id.c + */ + +/* + * This file implements code to map the 32-bit xattr id stored in the inode + * into the on disk location of the xattr data. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Map xattr id using the xattr id look up table + */ +int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, + int *count, unsigned int *size, unsigned long long *xattr) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_XATTR_BLOCK(index); + int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]); + struct squashfs_xattr_id id; + int err; + + err = squashfs_read_metadata(sb, &id, &start_block, &offset, + sizeof(id)); + if (err < 0) + return err; + + *xattr = le64_to_cpu(id.xattr); + *size = le32_to_cpu(id.size); + *count = le32_to_cpu(id.count); + return 0; +} + + +/* + * Read uncompressed xattr id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, + u64 *xattr_table_start, int *xattr_ids) +{ + unsigned int len; + struct squashfs_xattr_id_table *id_table; + + id_table = squashfs_read_table(sb, start, sizeof(*id_table)); + if (IS_ERR(id_table)) + return (__le64 *) id_table; + + *xattr_table_start = le64_to_cpu(id_table->xattr_table_start); + *xattr_ids = le32_to_cpu(id_table->xattr_ids); + kfree(id_table); + + /* Sanity check values */ + + /* there is always at least one xattr id */ + if (*xattr_ids == 0) + return ERR_PTR(-EINVAL); + + /* xattr_table should be less than start */ + if (*xattr_table_start >= start) + return ERR_PTR(-EINVAL); + + len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + + TRACE("In read_xattr_index_table, length %d\n", len); + + return squashfs_read_table(sb, start + sizeof(*id_table), len); +} diff --git a/kernel/fs/squashfs/xz_wrapper.c b/kernel/fs/squashfs/xz_wrapper.c new file mode 100644 index 000000000..c609624e4 --- /dev/null +++ b/kernel/fs/squashfs/xz_wrapper.c @@ -0,0 +1,193 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xz_wrapper.c + */ + + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +struct squashfs_xz { + struct xz_dec *state; + struct xz_buf buf; +}; + +struct disk_comp_opts { + __le32 dictionary_size; + __le32 flags; +}; + +struct comp_opts { + int dict_size; +}; + +static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int len) +{ + struct disk_comp_opts *comp_opts = buff; + struct comp_opts *opts; + int err = 0, n; + + opts = kmalloc(sizeof(*opts), GFP_KERNEL); + if (opts == NULL) { + err = -ENOMEM; + goto out2; + } + + if (comp_opts) { + /* check compressor options are the expected length */ + if (len < sizeof(*comp_opts)) { + err = -EIO; + goto out; + } + + opts->dict_size = le32_to_cpu(comp_opts->dictionary_size); + + /* the dictionary size should be 2^n or 2^n+2^(n+1) */ + n = ffs(opts->dict_size) - 1; + if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) + + (1 << (n + 1))) { + err = -EIO; + goto out; + } + } else + /* use defaults */ + opts->dict_size = max_t(int, msblk->block_size, + SQUASHFS_METADATA_SIZE); + + return opts; + +out: + kfree(opts); +out2: + return ERR_PTR(err); +} + + +static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff) +{ + struct comp_opts *comp_opts = buff; + struct squashfs_xz *stream; + int err; + + stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) { + err = -ENOMEM; + goto failed; + } + + stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size); + if (stream->state == NULL) { + kfree(stream); + err = -ENOMEM; + goto failed; + } + + return stream; + +failed: + ERROR("Failed to initialise xz decompressor\n"); + return ERR_PTR(err); +} + + +static void squashfs_xz_free(void *strm) +{ + struct squashfs_xz *stream = strm; + + if (stream) { + xz_dec_end(stream->state); + kfree(stream); + } +} + + +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) +{ + enum xz_ret xz_err; + int avail, total = 0, k = 0; + struct squashfs_xz *stream = strm; + + xz_dec_reset(stream->state); + stream->buf.in_pos = 0; + stream->buf.in_size = 0; + stream->buf.out_pos = 0; + stream->buf.out_size = PAGE_CACHE_SIZE; + stream->buf.out = squashfs_first_page(output); + + do { + if (stream->buf.in_pos == stream->buf.in_size && k < b) { + avail = min(length, msblk->devblksize - offset); + length -= avail; + stream->buf.in = bh[k]->b_data + offset; + stream->buf.in_size = avail; + stream->buf.in_pos = 0; + offset = 0; + } + + if (stream->buf.out_pos == stream->buf.out_size) { + stream->buf.out = squashfs_next_page(output); + if (stream->buf.out != NULL) { + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } + } + + xz_err = xz_dec_run(stream->state, &stream->buf); + + if (stream->buf.in_pos == stream->buf.in_size && k < b) + put_bh(bh[k++]); + } while (xz_err == XZ_OK); + + squashfs_finish_page(output); + + if (xz_err != XZ_STREAM_END || k < b) + goto out; + + return total + stream->buf.out_pos; + +out: + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_xz_comp_ops = { + .init = squashfs_xz_init, + .comp_opts = squashfs_xz_comp_opts, + .free = squashfs_xz_free, + .decompress = squashfs_xz_uncompress, + .id = XZ_COMPRESSION, + .name = "xz", + .supported = 1 +}; diff --git a/kernel/fs/squashfs/zlib_wrapper.c b/kernel/fs/squashfs/zlib_wrapper.c new file mode 100644 index 000000000..8727caba6 --- /dev/null +++ b/kernel/fs/squashfs/zlib_wrapper.c @@ -0,0 +1,135 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * zlib_wrapper.c + */ + + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +static void *zlib_init(struct squashfs_sb_info *dummy, void *buff) +{ + z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->workspace = vmalloc(zlib_inflate_workspacesize()); + if (stream->workspace == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate zlib workspace\n"); + kfree(stream); + return ERR_PTR(-ENOMEM); +} + + +static void zlib_free(void *strm) +{ + z_stream *stream = strm; + + if (stream) + vfree(stream->workspace); + kfree(stream); +} + + +static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) +{ + int zlib_err, zlib_init = 0, k = 0; + z_stream *stream = strm; + + stream->avail_out = PAGE_CACHE_SIZE; + stream->next_out = squashfs_first_page(output); + stream->avail_in = 0; + + do { + if (stream->avail_in == 0 && k < b) { + int avail = min(length, msblk->devblksize - offset); + length -= avail; + stream->next_in = bh[k]->b_data + offset; + stream->avail_in = avail; + offset = 0; + } + + if (stream->avail_out == 0) { + stream->next_out = squashfs_next_page(output); + if (stream->next_out != NULL) + stream->avail_out = PAGE_CACHE_SIZE; + } + + if (!zlib_init) { + zlib_err = zlib_inflateInit(stream); + if (zlib_err != Z_OK) { + squashfs_finish_page(output); + goto out; + } + zlib_init = 1; + } + + zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); + + if (stream->avail_in == 0 && k < b) + put_bh(bh[k++]); + } while (zlib_err == Z_OK); + + squashfs_finish_page(output); + + if (zlib_err != Z_STREAM_END) + goto out; + + zlib_err = zlib_inflateEnd(stream); + if (zlib_err != Z_OK) + goto out; + + if (k < b) + goto out; + + return stream->total_out; + +out: + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_zlib_comp_ops = { + .init = zlib_init, + .free = zlib_free, + .decompress = zlib_uncompress, + .id = ZLIB_COMPRESSION, + .name = "zlib", + .supported = 1 +}; + diff --git a/kernel/fs/stack.c b/kernel/fs/stack.c new file mode 100644 index 000000000..a54e33ed1 --- /dev/null +++ b/kernel/fs/stack.c @@ -0,0 +1,76 @@ +#include +#include +#include + +/* does _NOT_ require i_mutex to be held. + * + * This function cannot be inlined since i_size_{read,write} is rather + * heavy-weight on 32-bit systems + */ +void fsstack_copy_inode_size(struct inode *dst, struct inode *src) +{ + loff_t i_size; + blkcnt_t i_blocks; + + /* + * i_size_read() includes its own seqlocking and protection from + * preemption (see include/linux/fs.h): we need nothing extra for + * that here, and prefer to avoid nesting locks than attempt to keep + * i_size and i_blocks in sync together. + */ + i_size = i_size_read(src); + + /* + * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to + * keep the two halves of i_blocks in sync despite SMP or PREEMPT - + * though stat's generic_fillattr() doesn't bother, and we won't be + * applying quotas (where i_blocks does become important) at the + * upper level. + * + * We don't actually know what locking is used at the lower level; + * but if it's a filesystem that supports quotas, it will be using + * i_lock as in inode_add_bytes(). + */ + if (sizeof(i_blocks) > sizeof(long)) + spin_lock(&src->i_lock); + i_blocks = src->i_blocks; + if (sizeof(i_blocks) > sizeof(long)) + spin_unlock(&src->i_lock); + + /* + * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for + * fsstack_copy_inode_size() to hold some lock around + * i_size_write(), otherwise i_size_read() may spin forever (see + * include/linux/fs.h). We don't necessarily hold i_mutex when this + * is called, so take i_lock for that case. + * + * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the + * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock + * for that case too, and do both at once by combining the tests. + * + * There is none of this locking overhead in the 64-bit case. + */ + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_lock(&dst->i_lock); + i_size_write(dst, i_size); + dst->i_blocks = i_blocks; + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_unlock(&dst->i_lock); +} +EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); + +/* copy all attributes */ +void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) +{ + dest->i_mode = src->i_mode; + dest->i_uid = src->i_uid; + dest->i_gid = src->i_gid; + dest->i_rdev = src->i_rdev; + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + dest->i_blkbits = src->i_blkbits; + dest->i_flags = src->i_flags; + set_nlink(dest, src->i_nlink); +} +EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/kernel/fs/stat.c b/kernel/fs/stat.c new file mode 100644 index 000000000..cccc1aab9 --- /dev/null +++ b/kernel/fs/stat.c @@ -0,0 +1,511 @@ +/* + * linux/fs/stat.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +void generic_fillattr(struct inode *inode, struct kstat *stat) +{ + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = (1 << inode->i_blkbits); + stat->blocks = inode->i_blocks; +} + +EXPORT_SYMBOL(generic_fillattr); + +/** + * vfs_getattr_nosec - getattr without security checks + * @path: file to get attributes from + * @stat: structure to return attributes in + * + * Get attributes without calling security_inode_getattr. + * + * Currently the only caller other than vfs_getattr is internal to the + * filehandle lookup code, which uses only the inode number and returns + * no attributes to any user. Any other code probably wants + * vfs_getattr. + */ +int vfs_getattr_nosec(struct path *path, struct kstat *stat) +{ + struct inode *inode = d_backing_inode(path->dentry); + + if (inode->i_op->getattr) + return inode->i_op->getattr(path->mnt, path->dentry, stat); + + generic_fillattr(inode, stat); + return 0; +} + +EXPORT_SYMBOL(vfs_getattr_nosec); + +int vfs_getattr(struct path *path, struct kstat *stat) +{ + int retval; + + retval = security_inode_getattr(path); + if (retval) + return retval; + return vfs_getattr_nosec(path, stat); +} + +EXPORT_SYMBOL(vfs_getattr); + +int vfs_fstat(unsigned int fd, struct kstat *stat) +{ + struct fd f = fdget_raw(fd); + int error = -EBADF; + + if (f.file) { + error = vfs_getattr(&f.file->f_path, stat); + fdput(f); + } + return error; +} +EXPORT_SYMBOL(vfs_fstat); + +int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, + int flag) +{ + struct path path; + int error = -EINVAL; + unsigned int lookup_flags = 0; + + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH)) != 0) + goto out; + + if (!(flag & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; +retry: + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = vfs_getattr(&path, stat); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +out: + return error; +} +EXPORT_SYMBOL(vfs_fstatat); + +int vfs_stat(const char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, 0); +} +EXPORT_SYMBOL(vfs_stat); + +int vfs_lstat(const char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); +} +EXPORT_SYMBOL(vfs_lstat); + + +#ifdef __ARCH_WANT_OLD_STAT + +/* + * For backward compatibility? Maybe this should be moved + * into arch/i386 instead? + */ +static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf) +{ + static int warncount = 5; + struct __old_kernel_stat tmp; + + if (warncount > 0) { + warncount--; + printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n", + current->comm); + } else if (warncount < 0) { + /* it's laughable, but... */ + warncount = 0; + } + + memset(&tmp, 0, sizeof(struct __old_kernel_stat)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); + tmp.st_rdev = old_encode_dev(stat->rdev); +#if BITS_PER_LONG == 32 + if (stat->size > MAX_NON_LFS) + return -EOVERFLOW; +#endif + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_ctime = stat->ctime.tv_sec; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; +} + +SYSCALL_DEFINE2(stat, const char __user *, filename, + struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + + return cp_old_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(lstat, const char __user *, filename, + struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + + return cp_old_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_old_stat(&stat, statbuf); + + return error; +} + +#endif /* __ARCH_WANT_OLD_STAT */ + +#if BITS_PER_LONG == 32 +# define choose_32_64(a,b) a +#else +# define choose_32_64(a,b) b +#endif + +#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) +#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) + +#ifndef INIT_STRUCT_STAT_PADDING +# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) +#endif + +static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) +{ + struct stat tmp; + + if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + return -EOVERFLOW; +#if BITS_PER_LONG == 32 + if (stat->size > MAX_NON_LFS) + return -EOVERFLOW; +#endif + + INIT_STRUCT_STAT_PADDING(tmp); + tmp.st_dev = encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); + tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_ctime = stat->ctime.tv_sec; +#ifdef STAT_HAVE_NSEC + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; +#endif + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; +} + +SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_stat(filename, &stat); + + if (error) + return error; + return cp_new_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + + return cp_new_stat(&stat, statbuf); +} + +#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) +SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, + struct stat __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat(&stat, statbuf); +} +#endif + +SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_new_stat(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, + char __user *, buf, int, bufsiz) +{ + struct path path; + int error; + int empty = 0; + unsigned int lookup_flags = LOOKUP_EMPTY; + + if (bufsiz <= 0) + return -EINVAL; + +retry: + error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); + if (!error) { + struct inode *inode = d_backing_inode(path.dentry); + + error = empty ? -ENOENT : -EINVAL; + if (inode->i_op->readlink) { + error = security_inode_readlink(path.dentry); + if (!error) { + touch_atime(&path); + error = inode->i_op->readlink(path.dentry, + buf, bufsiz); + } + } + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + return error; +} + +SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, + int, bufsiz) +{ + return sys_readlinkat(AT_FDCWD, path, buf, bufsiz); +} + + +/* ---------- LFS-64 ----------- */ +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) + +#ifndef INIT_STRUCT_STAT64_PADDING +# define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st)) +#endif + +static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) +{ + struct stat64 tmp; + + INIT_STRUCT_STAT64_PADDING(tmp); +#ifdef CONFIG_MIPS + /* mips has weird padding, so we don't get 64 bits there */ + if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) + return -EOVERFLOW; + tmp.st_dev = new_encode_dev(stat->dev); + tmp.st_rdev = new_encode_dev(stat->rdev); +#else + tmp.st_dev = huge_encode_dev(stat->dev); + tmp.st_rdev = huge_encode_dev(stat->rdev); +#endif + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; +#ifdef STAT64_HAS_BROKEN_ST_INO + tmp.__st_ino = stat->ino; +#endif + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_size = stat->size; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; +} + +SYSCALL_DEFINE2(stat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_stat(filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE2(lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_lstat(filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat64(&stat, statbuf); +} +#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ + +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) +{ + inode->i_blocks += bytes >> 9; + bytes &= 511; + inode->i_bytes += bytes; + if (inode->i_bytes >= 512) { + inode->i_blocks++; + inode->i_bytes -= 512; + } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); + spin_unlock(&inode->i_lock); +} + +EXPORT_SYMBOL(inode_add_bytes); + +void __inode_sub_bytes(struct inode *inode, loff_t bytes) +{ + inode->i_blocks -= bytes >> 9; + bytes &= 511; + if (inode->i_bytes < bytes) { + inode->i_blocks--; + inode->i_bytes += 512; + } + inode->i_bytes -= bytes; +} + +EXPORT_SYMBOL(__inode_sub_bytes); + +void inode_sub_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_sub_bytes(inode, bytes); + spin_unlock(&inode->i_lock); +} + +EXPORT_SYMBOL(inode_sub_bytes); + +loff_t inode_get_bytes(struct inode *inode) +{ + loff_t ret; + + spin_lock(&inode->i_lock); + ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; + spin_unlock(&inode->i_lock); + return ret; +} + +EXPORT_SYMBOL(inode_get_bytes); + +void inode_set_bytes(struct inode *inode, loff_t bytes) +{ + /* Caller is here responsible for sufficient locking + * (ie. inode->i_lock) */ + inode->i_blocks = bytes >> 9; + inode->i_bytes = bytes & 511; +} + +EXPORT_SYMBOL(inode_set_bytes); diff --git a/kernel/fs/statfs.c b/kernel/fs/statfs.c new file mode 100644 index 000000000..083dc0ac9 --- /dev/null +++ b/kernel/fs/statfs.c @@ -0,0 +1,241 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int flags_by_mnt(int mnt_flags) +{ + int flags = 0; + + if (mnt_flags & MNT_READONLY) + flags |= ST_RDONLY; + if (mnt_flags & MNT_NOSUID) + flags |= ST_NOSUID; + if (mnt_flags & MNT_NODEV) + flags |= ST_NODEV; + if (mnt_flags & MNT_NOEXEC) + flags |= ST_NOEXEC; + if (mnt_flags & MNT_NOATIME) + flags |= ST_NOATIME; + if (mnt_flags & MNT_NODIRATIME) + flags |= ST_NODIRATIME; + if (mnt_flags & MNT_RELATIME) + flags |= ST_RELATIME; + return flags; +} + +static int flags_by_sb(int s_flags) +{ + int flags = 0; + if (s_flags & MS_SYNCHRONOUS) + flags |= ST_SYNCHRONOUS; + if (s_flags & MS_MANDLOCK) + flags |= ST_MANDLOCK; + return flags; +} + +static int calculate_f_flags(struct vfsmount *mnt) +{ + return ST_VALID | flags_by_mnt(mnt->mnt_flags) | + flags_by_sb(mnt->mnt_sb->s_flags); +} + +static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) +{ + int retval; + + if (!dentry->d_sb->s_op->statfs) + return -ENOSYS; + + memset(buf, 0, sizeof(*buf)); + retval = security_sb_statfs(dentry); + if (retval) + return retval; + retval = dentry->d_sb->s_op->statfs(dentry, buf); + if (retval == 0 && buf->f_frsize == 0) + buf->f_frsize = buf->f_bsize; + return retval; +} + +int vfs_statfs(struct path *path, struct kstatfs *buf) +{ + int error; + + error = statfs_by_dentry(path->dentry, buf); + if (!error) + buf->f_flags = calculate_f_flags(path->mnt); + return error; +} +EXPORT_SYMBOL(vfs_statfs); + +int user_statfs(const char __user *pathname, struct kstatfs *st) +{ + struct path path; + int error; + unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + if (!error) { + error = vfs_statfs(&path, st); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + return error; +} + +int fd_statfs(int fd, struct kstatfs *st) +{ + struct fd f = fdget_raw(fd); + int error = -EBADF; + if (f.file) { + error = vfs_statfs(&f.file->f_path, st); + fdput(f); + } + return error; +} + +static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) +{ + struct statfs buf; + + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); + else { + if (sizeof buf.f_blocks == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | + st->f_bsize | st->f_frsize) & + 0xffffffff00000000ULL) + return -EOVERFLOW; + /* + * f_files and f_ffree may be -1; it's okay to stuff + * that into 32 bits + */ + if (st->f_files != -1 && + (st->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (st->f_ffree != -1 && + (st->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); + } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; + return 0; +} + +static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) +{ + struct statfs64 buf; + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); + else { + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); + } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) +{ + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_native(&st, buf); + return error; +} + +SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) +{ + struct kstatfs st; + int error; + if (sz != sizeof(*buf)) + return -EINVAL; + error = user_statfs(pathname, &st); + if (!error) + error = do_statfs64(&st, buf); + return error; +} + +SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) +{ + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_native(&st, buf); + return error; +} + +SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) +{ + struct kstatfs st; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &st); + if (!error) + error = do_statfs64(&st, buf); + return error; +} + +int vfs_ustat(dev_t dev, struct kstatfs *sbuf) +{ + struct super_block *s = user_get_super(dev); + int err; + if (!s) + return -EINVAL; + + err = statfs_by_dentry(s->s_root, sbuf); + drop_super(s); + return err; +} + +SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +{ + struct ustat tmp; + struct kstatfs sbuf; + int err = vfs_ustat(new_decode_dev(dev), &sbuf); + if (err) + return err; + + memset(&tmp,0,sizeof(struct ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + + return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; +} diff --git a/kernel/fs/super.c b/kernel/fs/super.c new file mode 100644 index 000000000..928c20f47 --- /dev/null +++ b/kernel/fs/super.c @@ -0,0 +1,1396 @@ +/* + * linux/fs/super.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * super.c contains code to handle: - mount structures + * - super-block tables + * - filesystem drivers list + * - mount system call + * - umount system call + * - ustat system call + * + * GK 2/5/95 - Changed to support mounting the root fs via NFS + * + * Added kerneld support: Jacques Gelinas and Bjorn Ekwall + * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 + * Added options to /proc/mounts: + * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. + * Added devfs support: Richard Gooch , 13-JAN-1998 + * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 + */ + +#include +#include +#include +#include +#include +#include /* for the emergency remount stuff */ +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + + +static LIST_HEAD(super_blocks); +static DEFINE_SPINLOCK(sb_lock); + +static char *sb_writers_name[SB_FREEZE_LEVELS] = { + "sb_writers", + "sb_pagefaults", + "sb_internal", +}; + +/* + * One thing we have to be careful of with a per-sb shrinker is that we don't + * drop the last active reference to the superblock from within the shrinker. + * If that happens we could trigger unregistering the shrinker from within the + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * take a passive reference to the superblock to avoid this from occurring. + */ +static unsigned long super_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct super_block *sb; + long fs_objects = 0; + long total_objects; + long freed = 0; + long dentries; + long inodes; + + sb = container_of(shrink, struct super_block, s_shrink); + + /* + * Deadlock avoidance. We may hold various FS locks, and we don't want + * to recurse into the FS that called us in clear_inode() and friends.. + */ + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + if (!trylock_super(sb)) + return SHRINK_STOP; + + if (sb->s_op->nr_cached_objects) + fs_objects = sb->s_op->nr_cached_objects(sb, sc); + + inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); + dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); + total_objects = dentries + inodes + fs_objects + 1; + if (!total_objects) + total_objects = 1; + + /* proportion the scan between the caches */ + dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); + inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); + + /* + * prune the dcache first as the icache is pinned by it, then + * prune the icache, followed by the filesystem specific caches + * + * Ensure that we always scan at least one object - memcg kmem + * accounting uses this to fully empty the caches. + */ + sc->nr_to_scan = dentries + 1; + freed = prune_dcache_sb(sb, sc); + sc->nr_to_scan = inodes + 1; + freed += prune_icache_sb(sb, sc); + + if (fs_objects) { + sc->nr_to_scan = fs_objects + 1; + freed += sb->s_op->free_cached_objects(sb, sc); + } + + up_read(&sb->s_umount); + return freed; +} + +static unsigned long super_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct super_block *sb; + long total_objects = 0; + + sb = container_of(shrink, struct super_block, s_shrink); + + /* + * Don't call trylock_super as it is a potential + * scalability bottleneck. The counts could get updated + * between super_cache_count and super_cache_scan anyway. + * Call to super_cache_count with shrinker_rwsem held + * ensures the safety of call to list_lru_shrink_count() and + * s_op->nr_cached_objects(). + */ + if (sb->s_op && sb->s_op->nr_cached_objects) + total_objects = sb->s_op->nr_cached_objects(sb, sc); + + total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); + total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); + + total_objects = vfs_pressure_ratio(total_objects); + return total_objects; +} + +/** + * destroy_super - frees a superblock + * @s: superblock to free + * + * Frees a superblock. + */ +static void destroy_super(struct super_block *s) +{ + int i; + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_counter_destroy(&s->s_writers.counter[i]); + security_sb_free(s); + WARN_ON(!list_empty(&s->s_mounts)); + kfree(s->s_subtype); + kfree(s->s_options); + kfree_rcu(s, rcu); +} + +/** + * alloc_super - create new superblock + * @type: filesystem type superblock should belong to + * @flags: the mount flags + * + * Allocates and initializes a new &struct super_block. alloc_super() + * returns a pointer new superblock or %NULL if allocation had failed. + */ +static struct super_block *alloc_super(struct file_system_type *type, int flags) +{ + struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); + static const struct super_operations default_op; + int i; + + if (!s) + return NULL; + + INIT_LIST_HEAD(&s->s_mounts); + + if (security_sb_alloc(s)) + goto fail; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) { + if (percpu_counter_init(&s->s_writers.counter[i], 0, + GFP_KERNEL) < 0) + goto fail; + lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], + &type->s_writers_key[i], 0); + } + init_waitqueue_head(&s->s_writers.wait); + init_waitqueue_head(&s->s_writers.wait_unfrozen); + s->s_bdi = &noop_backing_dev_info; + s->s_flags = flags; + INIT_HLIST_NODE(&s->s_instances); + INIT_HLIST_BL_HEAD(&s->s_anon); + INIT_LIST_HEAD(&s->s_inodes); + + if (list_lru_init_memcg(&s->s_dentry_lru)) + goto fail; + if (list_lru_init_memcg(&s->s_inode_lru)) + goto fail; + + init_rwsem(&s->s_umount); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); + s->s_count = 1; + atomic_set(&s->s_active, 1); + mutex_init(&s->s_vfs_rename_mutex); + lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); + mutex_init(&s->s_dquot.dqio_mutex); + mutex_init(&s->s_dquot.dqonoff_mutex); + s->s_maxbytes = MAX_NON_LFS; + s->s_op = &default_op; + s->s_time_gran = 1000000000; + s->cleancache_poolid = CLEANCACHE_NO_POOL; + + s->s_shrink.seeks = DEFAULT_SEEKS; + s->s_shrink.scan_objects = super_cache_scan; + s->s_shrink.count_objects = super_cache_count; + s->s_shrink.batch = 1024; + s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; + return s; + +fail: + destroy_super(s); + return NULL; +} + +/* Superblock refcounting */ + +/* + * Drop a superblock's refcount. The caller must hold sb_lock. + */ +static void __put_super(struct super_block *sb) +{ + if (!--sb->s_count) { + list_del_init(&sb->s_list); + destroy_super(sb); + } +} + +/** + * put_super - drop a temporary reference to superblock + * @sb: superblock in question + * + * Drops a temporary reference, frees superblock if there's no + * references left. + */ +static void put_super(struct super_block *sb) +{ + spin_lock(&sb_lock); + __put_super(sb); + spin_unlock(&sb_lock); +} + + +/** + * deactivate_locked_super - drop an active reference to superblock + * @s: superblock to deactivate + * + * Drops an active reference to superblock, converting it into a temprory + * one if there is no other active references left. In that case we + * tell fs driver to shut it down and drop the temporary reference we + * had just acquired. + * + * Caller holds exclusive lock on superblock; that lock is released. + */ +void deactivate_locked_super(struct super_block *s) +{ + struct file_system_type *fs = s->s_type; + if (atomic_dec_and_test(&s->s_active)) { + cleancache_invalidate_fs(s); + unregister_shrinker(&s->s_shrink); + fs->kill_sb(s); + + /* + * Since list_lru_destroy() may sleep, we cannot call it from + * put_super(), where we hold the sb_lock. Therefore we destroy + * the lru lists right now. + */ + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + + put_filesystem(fs); + put_super(s); + } else { + up_write(&s->s_umount); + } +} + +EXPORT_SYMBOL(deactivate_locked_super); + +/** + * deactivate_super - drop an active reference to superblock + * @s: superblock to deactivate + * + * Variant of deactivate_locked_super(), except that superblock is *not* + * locked by caller. If we are going to drop the final active reference, + * lock will be acquired prior to that. + */ +void deactivate_super(struct super_block *s) +{ + if (!atomic_add_unless(&s->s_active, -1, 1)) { + down_write(&s->s_umount); + deactivate_locked_super(s); + } +} + +EXPORT_SYMBOL(deactivate_super); + +/** + * grab_super - acquire an active reference + * @s: reference we are trying to make active + * + * Tries to acquire an active reference. grab_super() is used when we + * had just found a superblock in super_blocks or fs_type->fs_supers + * and want to turn it into a full-blown active reference. grab_super() + * is called with sb_lock held and drops it. Returns 1 in case of + * success, 0 if we had failed (superblock contents was already dead or + * dying when grab_super() had been called). Note that this is only + * called for superblocks not in rundown mode (== ones still on ->fs_supers + * of their type), so increment of ->s_count is OK here. + */ +static int grab_super(struct super_block *s) __releases(sb_lock) +{ + s->s_count++; + spin_unlock(&sb_lock); + down_write(&s->s_umount); + if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) { + put_super(s); + return 1; + } + up_write(&s->s_umount); + put_super(s); + return 0; +} + +/* + * trylock_super - try to grab ->s_umount shared + * @sb: reference we are trying to grab + * + * Try to prevent fs shutdown. This is used in places where we + * cannot take an active reference but we need to ensure that the + * filesystem is not shut down while we are working on it. It returns + * false if we cannot acquire s_umount or if we lose the race and + * filesystem already got into shutdown, and returns true with the s_umount + * lock held in read mode in case of success. On successful return, + * the caller must drop the s_umount lock when done. + * + * Note that unlike get_super() et.al. this one does *not* bump ->s_count. + * The reason why it's safe is that we are OK with doing trylock instead + * of down_read(). There's a couple of places that are OK with that, but + * it's very much not a general-purpose interface. + */ +bool trylock_super(struct super_block *sb) +{ + if (down_read_trylock(&sb->s_umount)) { + if (!hlist_unhashed(&sb->s_instances) && + sb->s_root && (sb->s_flags & MS_BORN)) + return true; + up_read(&sb->s_umount); + } + + return false; +} + +/** + * generic_shutdown_super - common helper for ->kill_sb() + * @sb: superblock to kill + * + * generic_shutdown_super() does all fs-independent work on superblock + * shutdown. Typical ->kill_sb() should pick all fs-specific objects + * that need destruction out of superblock, call generic_shutdown_super() + * and release aforementioned objects. Note: dentries and inodes _are_ + * taken care of and do not need specific handling. + * + * Upon calling this function, the filesystem may no longer alter or + * rearrange the set of dentries belonging to this super_block, nor may it + * change the attachments of dentries to inodes. + */ +void generic_shutdown_super(struct super_block *sb) +{ + const struct super_operations *sop = sb->s_op; + + if (sb->s_root) { + shrink_dcache_for_umount(sb); + sync_filesystem(sb); + sb->s_flags &= ~MS_ACTIVE; + + fsnotify_unmount_inodes(&sb->s_inodes); + + evict_inodes(sb); + + if (sb->s_dio_done_wq) { + destroy_workqueue(sb->s_dio_done_wq); + sb->s_dio_done_wq = NULL; + } + + if (sop->put_super) + sop->put_super(sb); + + if (!list_empty(&sb->s_inodes)) { + printk("VFS: Busy inodes after unmount of %s. " + "Self-destruct in 5 seconds. Have a nice day...\n", + sb->s_id); + } + } + spin_lock(&sb_lock); + /* should be initialized for __put_super_and_need_restart() */ + hlist_del_init(&sb->s_instances); + spin_unlock(&sb_lock); + up_write(&sb->s_umount); +} + +EXPORT_SYMBOL(generic_shutdown_super); + +/** + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, + void *data) +{ + struct super_block *s = NULL; + struct super_block *old; + int err; + +retry: + spin_lock(&sb_lock); + if (test) { + hlist_for_each_entry(old, &type->fs_supers, s_instances) { + if (!test(old, data)) + continue; + if (!grab_super(old)) + goto retry; + if (s) { + up_write(&s->s_umount); + destroy_super(s); + s = NULL; + } + return old; + } + } + if (!s) { + spin_unlock(&sb_lock); + s = alloc_super(type, flags); + if (!s) + return ERR_PTR(-ENOMEM); + goto retry; + } + + err = set(s, data); + if (err) { + spin_unlock(&sb_lock); + up_write(&s->s_umount); + destroy_super(s); + return ERR_PTR(err); + } + s->s_type = type; + strlcpy(s->s_id, type->name, sizeof(s->s_id)); + list_add_tail(&s->s_list, &super_blocks); + hlist_add_head(&s->s_instances, &type->fs_supers); + spin_unlock(&sb_lock); + get_filesystem(type); + register_shrinker(&s->s_shrink); + return s; +} + +EXPORT_SYMBOL(sget); + +void drop_super(struct super_block *sb) +{ + up_read(&sb->s_umount); + put_super(sb); +} + +EXPORT_SYMBOL(drop_super); + +/** + * iterate_supers - call function for all active superblocks + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. + */ +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (hlist_unhashed(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root && (sb->s_flags & MS_BORN)) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +/** + * iterate_supers_type - call function for superblocks of given type + * @type: fs type + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. + */ +void iterate_supers_type(struct file_system_type *type, + void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + hlist_for_each_entry(sb, &type->fs_supers, s_instances) { + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root && (sb->s_flags & MS_BORN)) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +EXPORT_SYMBOL(iterate_supers_type); + +/** + * get_super - get the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. + */ + +struct super_block *get_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + + spin_lock(&sb_lock); +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (hlist_unhashed(&sb->s_instances)) + continue; + if (sb->s_bdev == bdev) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + /* still alive? */ + if (sb->s_root && (sb->s_flags & MS_BORN)) + return sb; + up_read(&sb->s_umount); + /* nope, got unmounted */ + spin_lock(&sb_lock); + __put_super(sb); + goto rescan; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +EXPORT_SYMBOL(get_super); + +/** + * get_super_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen). %NULL is returned if no match + * is found. + */ +struct super_block *get_super_thawed(struct block_device *bdev) +{ + while (1) { + struct super_block *s = get_super(bdev); + if (!s || s->s_writers.frozen == SB_UNFROZEN) + return s; + up_read(&s->s_umount); + wait_event(s->s_writers.wait_unfrozen, + s->s_writers.frozen == SB_UNFROZEN); + put_super(s); + } +} +EXPORT_SYMBOL(get_super_thawed); + +/** + * get_active_super - get an active reference to the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. Returns the superblock with an active + * reference or %NULL if none was found. + */ +struct super_block *get_active_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + +restart: + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (hlist_unhashed(&sb->s_instances)) + continue; + if (sb->s_bdev == bdev) { + if (!grab_super(sb)) + goto restart; + up_write(&sb->s_umount); + return sb; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +struct super_block *user_get_super(dev_t dev) +{ + struct super_block *sb; + + spin_lock(&sb_lock); +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (hlist_unhashed(&sb->s_instances)) + continue; + if (sb->s_dev == dev) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + /* still alive? */ + if (sb->s_root && (sb->s_flags & MS_BORN)) + return sb; + up_read(&sb->s_umount); + /* nope, got unmounted */ + spin_lock(&sb_lock); + __put_super(sb); + goto rescan; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +/** + * do_remount_sb - asks filesystem to change mount options. + * @sb: superblock in question + * @flags: numeric part of options + * @data: the rest of options + * @force: whether or not to force the change + * + * Alters the mount options of a mounted file system. + */ +int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +{ + int retval; + int remount_ro; + + if (sb->s_writers.frozen != SB_UNFROZEN) + return -EBUSY; + +#ifdef CONFIG_BLOCK + if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) + return -EACCES; +#endif + + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + + if (remount_ro) { + if (!hlist_empty(&sb->s_pins)) { + up_write(&sb->s_umount); + group_pin_kill(&sb->s_pins); + down_write(&sb->s_umount); + if (!sb->s_root) + return 0; + if (sb->s_writers.frozen != SB_UNFROZEN) + return -EBUSY; + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + } + } + shrink_dcache_sb(sb); + + /* If we are remounting RDONLY and current sb is read/write, + make sure there are no rw files opened */ + if (remount_ro) { + if (force) { + sb->s_readonly_remount = 1; + smp_wmb(); + } else { + retval = sb_prepare_remount_readonly(sb); + if (retval) + return retval; + } + } + + if (sb->s_op->remount_fs) { + retval = sb->s_op->remount_fs(sb, &flags, data); + if (retval) { + if (!force) + goto cancel_readonly; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } + } + sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + /* Needs to be ordered wrt mnt_is_readonly() */ + smp_wmb(); + sb->s_readonly_remount = 0; + + /* + * Some filesystems modify their metadata via some other path than the + * bdev buffer cache (eg. use a private mapping, or directories in + * pagecache, etc). Also file data modifications go via their own + * mappings. So If we try to mount readonly then copy the filesystem + * from bdev, we could get stale data, so invalidate it to give a best + * effort at coherency. + */ + if (remount_ro && sb->s_bdev) + invalidate_bdev(sb->s_bdev); + return 0; + +cancel_readonly: + sb->s_readonly_remount = 0; + return retval; +} + +static void do_emergency_remount(struct work_struct *work) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (hlist_unhashed(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + down_write(&sb->s_umount); + if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) && + !(sb->s_flags & MS_RDONLY)) { + /* + * What lock protects sb->s_flags?? + */ + do_remount_sb(sb, MS_RDONLY, NULL, 1); + } + up_write(&sb->s_umount); + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); + kfree(work); + printk("Emergency Remount complete\n"); +} + +void emergency_remount(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_emergency_remount); + schedule_work(work); + } +} + +/* + * Unnamed block devices are dummy devices used by virtual + * filesystems which don't use real block-devices. -- jrs + */ + +static DEFINE_IDA(unnamed_dev_ida); +static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; + +int get_anon_bdev(dev_t *p) +{ + int dev; + int error; + + retry: + if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0) + return -ENOMEM; + spin_lock(&unnamed_dev_lock); + error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev); + if (!error) + unnamed_dev_start = dev + 1; + spin_unlock(&unnamed_dev_lock); + if (error == -EAGAIN) + /* We raced and lost with another CPU. */ + goto retry; + else if (error) + return -EAGAIN; + + if (dev == (1 << MINORBITS)) { + spin_lock(&unnamed_dev_lock); + ida_remove(&unnamed_dev_ida, dev); + if (unnamed_dev_start > dev) + unnamed_dev_start = dev; + spin_unlock(&unnamed_dev_lock); + return -EMFILE; + } + *p = MKDEV(0, dev & MINORMASK); + return 0; +} +EXPORT_SYMBOL(get_anon_bdev); + +void free_anon_bdev(dev_t dev) +{ + int slot = MINOR(dev); + spin_lock(&unnamed_dev_lock); + ida_remove(&unnamed_dev_ida, slot); + if (slot < unnamed_dev_start) + unnamed_dev_start = slot; + spin_unlock(&unnamed_dev_lock); +} +EXPORT_SYMBOL(free_anon_bdev); + +int set_anon_super(struct super_block *s, void *data) +{ + return get_anon_bdev(&s->s_dev); +} + +EXPORT_SYMBOL(set_anon_super); + +void kill_anon_super(struct super_block *sb) +{ + dev_t dev = sb->s_dev; + generic_shutdown_super(sb); + free_anon_bdev(dev); +} + +EXPORT_SYMBOL(kill_anon_super); + +void kill_litter_super(struct super_block *sb) +{ + if (sb->s_root) + d_genocide(sb->s_root); + kill_anon_super(sb); +} + +EXPORT_SYMBOL(kill_litter_super); + +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +struct dentry *mount_ns(struct file_system_type *fs_type, int flags, + void *data, int (*fill_super)(struct super_block *, void *, int)) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (!sb->s_root) { + int err; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +EXPORT_SYMBOL(mount_ns); + +#ifdef CONFIG_BLOCK +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + return 0; +} + +static int test_bdev_super(struct super_block *s, void *data) +{ + return (void *)s->s_bdev == data; +} + +struct dentry *mount_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } + s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC, + bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + error = -EBUSY; + goto error_bdev; + } + + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); + blkdev_put(bdev, mode); + down_write(&s->s_umount); + } else { + char b[BDEVNAME_SIZE]; + + s->s_mode = mode; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + goto error; + } + + s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; + } + + return dget(s->s_root); + +error_s: + error = PTR_ERR(s); +error_bdev: + blkdev_put(bdev, mode); +error: + return ERR_PTR(error); +} +EXPORT_SYMBOL(mount_bdev); + +void kill_block_super(struct super_block *sb) +{ + struct block_device *bdev = sb->s_bdev; + fmode_t mode = sb->s_mode; + + bdev->bd_super = NULL; + generic_shutdown_super(sb); + sync_blockdev(bdev); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); + blkdev_put(bdev, mode | FMODE_EXCL); +} + +EXPORT_SYMBOL(kill_block_super); +#endif + +struct dentry *mount_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + int error; + struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); + + if (IS_ERR(s)) + return ERR_CAST(s); + + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); +} +EXPORT_SYMBOL(mount_nodev); + +static int compare_single(struct super_block *s, void *p) +{ + return 1; +} + +struct dentry *mount_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + struct super_block *s; + int error; + + s = sget(fs_type, compare_single, set_anon_super, flags, NULL); + if (IS_ERR(s)) + return ERR_CAST(s); + if (!s->s_root) { + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + } else { + do_remount_sb(s, flags, data, 0); + } + return dget(s->s_root); +} +EXPORT_SYMBOL(mount_single); + +struct dentry * +mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +{ + struct dentry *root; + struct super_block *sb; + char *secdata = NULL; + int error = -ENOMEM; + + if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { + secdata = alloc_secdata(); + if (!secdata) + goto out; + + error = security_sb_copy_data(data, secdata); + if (error) + goto out_free_secdata; + } + + root = type->mount(type, flags, name, data); + if (IS_ERR(root)) { + error = PTR_ERR(root); + goto out_free_secdata; + } + sb = root->d_sb; + BUG_ON(!sb); + WARN_ON(!sb->s_bdi); + sb->s_flags |= MS_BORN; + + error = security_sb_kern_mount(sb, flags, secdata); + if (error) + goto out_sb; + + /* + * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE + * but s_maxbytes was an unsigned long long for many releases. Throw + * this warning for a little while to try and catch filesystems that + * violate this rule. + */ + WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " + "negative value (%lld)\n", type->name, sb->s_maxbytes); + + up_write(&sb->s_umount); + free_secdata(secdata); + return root; +out_sb: + dput(root); + deactivate_locked_super(sb); +out_free_secdata: + free_secdata(secdata); +out: + return ERR_PTR(error); +} + +/* + * This is an internal function, please use sb_end_{write,pagefault,intwrite} + * instead. + */ +void __sb_end_write(struct super_block *sb, int level) +{ + percpu_counter_dec(&sb->s_writers.counter[level-1]); + /* + * Make sure s_writers are updated before we wake up waiters in + * freeze_super(). + */ + smp_mb(); + if (waitqueue_active(&sb->s_writers.wait)) + wake_up(&sb->s_writers.wait); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); +} +EXPORT_SYMBOL(__sb_end_write); + +#ifdef CONFIG_LOCKDEP +/* + * We want lockdep to tell us about possible deadlocks with freezing but + * it's it bit tricky to properly instrument it. Getting a freeze protection + * works as getting a read lock but there are subtle problems. XFS for example + * gets freeze protection on internal level twice in some cases, which is OK + * only because we already hold a freeze protection also on higher level. Due + * to these cases we have to tell lockdep we are doing trylock when we + * already hold a freeze protection for a higher freeze level. + */ +static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, + unsigned long ip) +{ + int i; + + if (!trylock) { + for (i = 0; i < level - 1; i++) + if (lock_is_held(&sb->s_writers.lock_map[i])) { + trylock = true; + break; + } + } + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); +} +#endif + +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +int __sb_start_write(struct super_block *sb, int level, bool wait) +{ +retry: + if (unlikely(sb->s_writers.frozen >= level)) { + if (!wait) + return 0; + wait_event(sb->s_writers.wait_unfrozen, + sb->s_writers.frozen < level); + } + +#ifdef CONFIG_LOCKDEP + acquire_freeze_lock(sb, level, !wait, _RET_IP_); +#endif + percpu_counter_inc(&sb->s_writers.counter[level-1]); + /* + * Make sure counter is updated before we check for frozen. + * freeze_super() first sets frozen and then checks the counter. + */ + smp_mb(); + if (unlikely(sb->s_writers.frozen >= level)) { + __sb_end_write(sb, level); + goto retry; + } + return 1; +} +EXPORT_SYMBOL(__sb_start_write); + +/** + * sb_wait_write - wait until all writers to given file system finish + * @sb: the super for which we wait + * @level: type of writers we wait for (normal vs page fault) + * + * This function waits until there are no writers of given type to given file + * system. Caller of this function should make sure there can be no new writers + * of type @level before calling this function. Otherwise this function can + * livelock. + */ +static void sb_wait_write(struct super_block *sb, int level) +{ + s64 writers; + + /* + * We just cycle-through lockdep here so that it does not complain + * about returning with lock to userspace + */ + rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + + do { + DEFINE_WAIT(wait); + + /* + * We use a barrier in prepare_to_wait() to separate setting + * of frozen and checking of the counter + */ + prepare_to_wait(&sb->s_writers.wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); + if (writers) + schedule(); + + finish_wait(&sb->s_writers.wait, &wait); + } while (writers); +} + +/** + * freeze_super - lock the filesystem and force it into a consistent state + * @sb: the super to lock + * + * Syncs the super to make sure the filesystem is consistent and calls the fs's + * freeze_fs. Subsequent calls to this without first thawing the fs will return + * -EBUSY. + * + * During this function, sb->s_writers.frozen goes through these values: + * + * SB_UNFROZEN: File system is normal, all writes progress as usual. + * + * SB_FREEZE_WRITE: The file system is in the process of being frozen. New + * writes should be blocked, though page faults are still allowed. We wait for + * all writes to complete and then proceed to the next stage. + * + * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked + * but internal fs threads can still modify the filesystem (although they + * should not dirty new pages or inodes), writeback can run etc. After waiting + * for all running page faults we sync the filesystem which will clean all + * dirty pages and inodes (no new dirty pages or inodes can be created when + * sync is running). + * + * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs + * modification are blocked (e.g. XFS preallocation truncation on inode + * reclaim). This is usually implemented by blocking new transactions for + * filesystems that have them and need this additional guard. After all + * internal writers are finished we call ->freeze_fs() to finish filesystem + * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is + * mostly auxiliary for filesystems to verify they do not modify frozen fs. + * + * sb->s_writers.frozen is protected by sb->s_umount. + */ +int freeze_super(struct super_block *sb) +{ + int ret; + + atomic_inc(&sb->s_active); + down_write(&sb->s_umount); + if (sb->s_writers.frozen != SB_UNFROZEN) { + deactivate_locked_super(sb); + return -EBUSY; + } + + if (!(sb->s_flags & MS_BORN)) { + up_write(&sb->s_umount); + return 0; /* sic - it's "nothing to do" */ + } + + if (sb->s_flags & MS_RDONLY) { + /* Nothing to do really... */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; + up_write(&sb->s_umount); + return 0; + } + + /* From now on, no new normal writers can start */ + sb->s_writers.frozen = SB_FREEZE_WRITE; + smp_wmb(); + + /* Release s_umount to preserve sb_start_write -> s_umount ordering */ + up_write(&sb->s_umount); + + sb_wait_write(sb, SB_FREEZE_WRITE); + + /* Now we go and block page faults... */ + down_write(&sb->s_umount); + sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; + smp_wmb(); + + sb_wait_write(sb, SB_FREEZE_PAGEFAULT); + + /* All writers are done so after syncing there won't be dirty data */ + sync_filesystem(sb); + + /* Now wait for internal filesystem counter */ + sb->s_writers.frozen = SB_FREEZE_FS; + smp_wmb(); + sb_wait_write(sb, SB_FREEZE_FS); + + if (sb->s_op->freeze_fs) { + ret = sb->s_op->freeze_fs(sb); + if (ret) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_writers.frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } + } + /* + * This is just for debugging purposes so that fs can warn if it + * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; + up_write(&sb->s_umount); + return 0; +} +EXPORT_SYMBOL(freeze_super); + +/** + * thaw_super -- unlock filesystem + * @sb: the super to thaw + * + * Unlocks the filesystem and marks it writeable again after freeze_super(). + */ +int thaw_super(struct super_block *sb) +{ + int error; + + down_write(&sb->s_umount); + if (sb->s_writers.frozen == SB_UNFROZEN) { + up_write(&sb->s_umount); + return -EINVAL; + } + + if (sb->s_flags & MS_RDONLY) + goto out; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + up_write(&sb->s_umount); + return error; + } + } + +out: + sb->s_writers.frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + + return 0; +} +EXPORT_SYMBOL(thaw_super); diff --git a/kernel/fs/sync.c b/kernel/fs/sync.c new file mode 100644 index 000000000..fbc98ee62 --- /dev/null +++ b/kernel/fs/sync.c @@ -0,0 +1,366 @@ +/* + * High-level sync()-related operations + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ + SYNC_FILE_RANGE_WAIT_AFTER) + +/* + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to + * submit IO for these buffers via __sync_blockdev(). This also speeds up the + * wait == 1 case since in that case write_inode() functions do + * sync_dirty_buffer() and thus effectively write one block at a time. + */ +static int __sync_filesystem(struct super_block *sb, int wait) +{ + if (wait) + sync_inodes_sb(sb); + else + writeback_inodes_sb(sb, WB_REASON_SYNC); + + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, wait); + return __sync_blockdev(sb->s_bdev, wait); +} + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int sync_filesystem(struct super_block *sb) +{ + int ret; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + /* + * No point in syncing out anything if the filesystem is read-only. + */ + if (sb->s_flags & MS_RDONLY) + return 0; + + ret = __sync_filesystem(sb, 0); + if (ret < 0) + return ret; + return __sync_filesystem(sb, 1); +} +EXPORT_SYMBOL(sync_filesystem); + +static void sync_inodes_one_sb(struct super_block *sb, void *arg) +{ + if (!(sb->s_flags & MS_RDONLY)) + sync_inodes_sb(sb); +} + +static void sync_fs_one_sb(struct super_block *sb, void *arg) +{ + if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, *(int *)arg); +} + +static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) +{ + filemap_fdatawrite(bdev->bd_inode->i_mapping); +} + +static void fdatawait_one_bdev(struct block_device *bdev, void *arg) +{ + filemap_fdatawait(bdev->bd_inode->i_mapping); +} + +/* + * Sync everything. We start by waking flusher threads so that most of + * writeback runs on all devices in parallel. Then we sync all inodes reliably + * which effectively also waits for all flusher threads to finish doing + * writeback. At this point all data is on disk so metadata should be stable + * and we tell filesystems to sync their metadata via ->sync_fs() calls. + * Finally, we writeout all block devices because some filesystems (e.g. ext2) + * just write metadata (such as inodes or bitmaps) to block device page cache + * and do not sync it on their own in ->sync_fs(). + */ +SYSCALL_DEFINE0(sync) +{ + int nowait = 0, wait = 1; + + wakeup_flusher_threads(0, WB_REASON_SYNC); + iterate_supers(sync_inodes_one_sb, NULL); + iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_fs_one_sb, &wait); + iterate_bdevs(fdatawrite_one_bdev, NULL); + iterate_bdevs(fdatawait_one_bdev, NULL); + if (unlikely(laptop_mode)) + laptop_sync_completion(); + return 0; +} + +static void do_sync_work(struct work_struct *work) +{ + int nowait = 0; + + /* + * Sync twice to reduce the possibility we skipped some inodes / pages + * because they were temporarily locked + */ + iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_fs_one_sb, &nowait); + iterate_bdevs(fdatawrite_one_bdev, NULL); + iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_fs_one_sb, &nowait); + iterate_bdevs(fdatawrite_one_bdev, NULL); + printk("Emergency Sync complete\n"); + kfree(work); +} + +void emergency_sync(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_sync_work); + schedule_work(work); + } +} + +/* + * sync a single super + */ +SYSCALL_DEFINE1(syncfs, int, fd) +{ + struct fd f = fdget(fd); + struct super_block *sb; + int ret; + + if (!f.file) + return -EBADF; + sb = f.file->f_path.dentry->d_sb; + + down_read(&sb->s_umount); + ret = sync_filesystem(sb); + up_read(&sb->s_umount); + + fdput(f); + return ret; +} + +/** + * vfs_fsync_range - helper to sync a range of data & metadata to disk + * @file: file to sync + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: perform only datasync + * + * Write back data in range @start..@end and metadata for @file to disk. If + * @datasync is set only metadata needed to access modified file data is + * written. + */ +int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + + if (!file->f_op->fsync) + return -EINVAL; + if (!datasync && (inode->i_state & I_DIRTY_TIME)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + mark_inode_dirty_sync(inode); + } + return file->f_op->fsync(file, start, end, datasync); +} +EXPORT_SYMBOL(vfs_fsync_range); + +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @datasync: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + */ +int vfs_fsync(struct file *file, int datasync) +{ + return vfs_fsync_range(file, 0, LLONG_MAX, datasync); +} +EXPORT_SYMBOL(vfs_fsync); + +static int do_fsync(unsigned int fd, int datasync) +{ + struct fd f = fdget(fd); + int ret = -EBADF; + + if (f.file) { + ret = vfs_fsync(f.file, datasync); + fdput(f); + } + return ret; +} + +SYSCALL_DEFINE1(fsync, unsigned int, fd) +{ + return do_fsync(fd, 0); +} + +SYSCALL_DEFINE1(fdatasync, unsigned int, fd) +{ + return do_fsync(fd, 1); +} + +/* + * sys_sync_file_range() permits finely controlled syncing over a segment of + * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is + * zero then sys_sync_file_range() will operate from offset out to EOF. + * + * The flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range + * before performing the write. + * + * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the + * range which are not presently under writeback. Note that this may block for + * significant periods due to exhaustion of disk request structures. + * + * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range + * after performing the write. + * + * Useful combinations of the flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages + * in the range which were dirty on entry to sys_sync_file_range() are placed + * under writeout. This is a start-write-for-data-integrity operation. + * + * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which + * are not presently under writeout. This is an asynchronous flush-to-disk + * operation. Not suitable for data integrity operations. + * + * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for + * completion of writeout of all pages in the range. This will be used after an + * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait + * for that operation to complete and to return the result. + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: + * a traditional sync() operation. This is a write-for-data-integrity operation + * which will ensure that all pages in the range which were dirty on entry to + * sys_sync_file_range() are committed to disk. + * + * + * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any + * I/O errors or ENOSPC conditions and will return those to the caller, after + * clearing the EIO and ENOSPC flags in the address_space. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. + */ +SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, + unsigned int, flags) +{ + int ret; + struct fd f; + struct address_space *mapping; + loff_t endbyte; /* inclusive */ + umode_t i_mode; + + ret = -EINVAL; + if (flags & ~VALID_FLAGS) + goto out; + + endbyte = offset + nbytes; + + if ((s64)offset < 0) + goto out; + if ((s64)endbyte < 0) + goto out; + if (endbyte < offset) + goto out; + + if (sizeof(pgoff_t) == 4) { + if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + /* + * The range starts outside a 32 bit machine's + * pagecache addressing capabilities. Let it "succeed" + */ + ret = 0; + goto out; + } + if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + /* + * Out to EOF + */ + nbytes = 0; + } + } + + if (nbytes == 0) + endbyte = LLONG_MAX; + else + endbyte--; /* inclusive */ + + ret = -EBADF; + f = fdget(fd); + if (!f.file) + goto out; + + i_mode = file_inode(f.file)->i_mode; + ret = -ESPIPE; + if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && + !S_ISLNK(i_mode)) + goto out_put; + + mapping = f.file->f_mapping; + if (!mapping) { + ret = -EINVAL; + goto out_put; + } + + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = filemap_fdatawait_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = filemap_fdatawrite_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) + ret = filemap_fdatawait_range(mapping, offset, endbyte); + +out_put: + fdput(f); +out: + return ret; +} + +/* It would be nice if people remember that not all the world's an i386 + when they introduce new system calls */ +SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, + loff_t, offset, loff_t, nbytes) +{ + return sys_sync_file_range(fd, offset, nbytes, flags); +} diff --git a/kernel/fs/sysfs/Kconfig b/kernel/fs/sysfs/Kconfig new file mode 100644 index 000000000..b27560145 --- /dev/null +++ b/kernel/fs/sysfs/Kconfig @@ -0,0 +1,24 @@ +config SYSFS + bool "sysfs file system support" if EXPERT + default y + select KERNFS + help + The sysfs filesystem is a virtual filesystem that the kernel uses to + export internal kernel objects, their attributes, and their + relationships to one another. + + Users can use sysfs to ascertain useful information about the running + kernel, such as the devices the kernel has discovered on each bus and + which driver each is bound to. sysfs can also be used to tune devices + and other kernel subsystems. + + Some system agents rely on the information in sysfs to operate. + /sbin/hotplug uses device and object attributes in sysfs to assist in + delegating policy decisions, like persistently naming devices. + + sysfs is currently used by the block subsystem to mount the root + partition. If sysfs is disabled you must specify the boot device on + the kernel boot command line via its major and minor numbers. For + example, "root=03:01" for /dev/hda1. + + Designers of embedded systems may wish to say N here to conserve space. diff --git a/kernel/fs/sysfs/Makefile b/kernel/fs/sysfs/Makefile new file mode 100644 index 000000000..6eff6e120 --- /dev/null +++ b/kernel/fs/sysfs/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the sysfs virtual filesystem +# + +obj-y := file.o dir.o symlink.o mount.o group.o diff --git a/kernel/fs/sysfs/dir.c b/kernel/fs/sysfs/dir.c new file mode 100644 index 000000000..94374e435 --- /dev/null +++ b/kernel/fs/sysfs/dir.c @@ -0,0 +1,157 @@ +/* + * fs/sysfs/dir.c - sysfs core and dir operation implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#undef DEBUG + +#include +#include +#include +#include "sysfs.h" + +DEFINE_SPINLOCK(sysfs_symlink_target_lock); + +void sysfs_warn_dup(struct kernfs_node *parent, const char *name) +{ + char *buf, *path = NULL; + + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (buf) + path = kernfs_path(parent, buf, PATH_MAX); + + WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", + path, name); + + kfree(buf); +} + +/** + * sysfs_create_dir_ns - create a directory for an object with a namespace tag + * @kobj: object we're creating directory for + * @ns: the namespace tag to use + */ +int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) +{ + struct kernfs_node *parent, *kn; + + BUG_ON(!kobj); + + if (kobj->parent) + parent = kobj->parent->sd; + else + parent = sysfs_root_kn; + + if (!parent) + return -ENOENT; + + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), + S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, kobject_name(kobj)); + return PTR_ERR(kn); + } + + kobj->sd = kn; + return 0; +} + +/** + * sysfs_remove_dir - remove an object's directory. + * @kobj: object. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be sysfs_rmdir() below, instead of calling separately. + */ +void sysfs_remove_dir(struct kobject *kobj) +{ + struct kernfs_node *kn = kobj->sd; + + /* + * In general, kboject owner is responsible for ensuring removal + * doesn't race with other operations and sysfs doesn't provide any + * protection; however, when @kobj is used as a symlink target, the + * symlinking entity usually doesn't own @kobj and thus has no + * control over removal. @kobj->sd may be removed anytime + * and symlink code may end up dereferencing an already freed node. + * + * sysfs_symlink_target_lock synchronizes @kobj->sd + * disassociation against symlink operations so that symlink code + * can safely dereference @kobj->sd. + */ + spin_lock(&sysfs_symlink_target_lock); + kobj->sd = NULL; + spin_unlock(&sysfs_symlink_target_lock); + + if (kn) { + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); + kernfs_remove(kn); + } +} + +int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, + const void *new_ns) +{ + struct kernfs_node *parent; + int ret; + + parent = kernfs_get_parent(kobj->sd); + ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); + kernfs_put(parent); + return ret; +} + +int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, + const void *new_ns) +{ + struct kernfs_node *kn = kobj->sd; + struct kernfs_node *new_parent; + + new_parent = new_parent_kobj && new_parent_kobj->sd ? + new_parent_kobj->sd : sysfs_root_kn; + + return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); +} + +/** + * sysfs_create_mount_point - create an always empty directory + * @parent_kobj: kobject that will contain this always empty directory + * @name: The name of the always empty directory to add + */ +int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) +{ + struct kernfs_node *kn, *parent = parent_kobj->sd; + + kn = kernfs_create_empty_dir(parent, name); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_create_mount_point); + +/** + * sysfs_remove_mount_point - remove an always empty directory. + * @parent_kobj: kobject that will contain this always empty directory + * @name: The name of the always empty directory to remove + * + */ +void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) +{ + struct kernfs_node *parent = parent_kobj->sd; + + kernfs_remove_by_name_ns(parent, name, NULL); +} +EXPORT_SYMBOL_GPL(sysfs_remove_mount_point); diff --git a/kernel/fs/sysfs/file.c b/kernel/fs/sysfs/file.c new file mode 100644 index 000000000..7c2867b44 --- /dev/null +++ b/kernel/fs/sysfs/file.c @@ -0,0 +1,497 @@ +/* + * fs/sysfs/file.c - sysfs regular (text) file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sysfs.h" +#include "../kernfs/kernfs-internal.h" + +/* + * Determine ktype->sysfs_ops for the given kernfs_node. This function + * must be called while holding an active reference. + */ +static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) +{ + struct kobject *kobj = kn->parent->priv; + + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); + return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; +} + +/* + * Reads on sysfs are handled through seq_file, which takes care of hairy + * details like buffering and seeking. The following function pipes + * sysfs_ops->show() result through seq_file. + */ +static int sysfs_kf_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + struct kobject *kobj = of->kn->parent->priv; + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + ssize_t count; + char *buf; + + /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ + count = seq_get_buf(sf, &buf); + if (count < PAGE_SIZE) { + seq_commit(sf, -1); + return 0; + } + memset(buf, 0, PAGE_SIZE); + + /* + * Invoke show(). Control may reach here via seq file lseek even + * if @ops->show() isn't implemented. + */ + if (ops->show) { + count = ops->show(kobj, of->kn->priv, buf); + if (count < 0) + return count; + } + + /* + * The code works fine with PAGE_SIZE return but it's likely to + * indicate truncated result or overflow in normal use cases. + */ + if (count >= (ssize_t)PAGE_SIZE) { + print_symbol("fill_read_buffer: %s returned bad count\n", + (unsigned long)ops->show); + /* Try to struggle along */ + count = PAGE_SIZE - 1; + } + seq_commit(sf, count); + return 0; +} + +static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) +{ + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; + + if (!count) + return 0; + + if (size) { + if (pos > size) + return 0; + if (pos + count > size) + count = size - pos; + } + + if (!battr->read) + return -EIO; + + return battr->read(of->file, kobj, battr, buf, pos, count); +} + +/* kernfs read callback for regular sysfs files with pre-alloc */ +static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) +{ + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; + + /* + * If buf != of->prealloc_buf, we don't know how + * large it is, so cannot safely pass it to ->show + */ + if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + return 0; + return ops->show(kobj, of->kn->priv, buf); +} + +/* kernfs write callback for regular sysfs files */ +static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) +{ + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; + + if (!count) + return 0; + + return ops->store(kobj, of->kn->priv, buf, count); +} + +/* kernfs write callback for bin sysfs files */ +static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) +{ + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; + + if (size) { + if (size <= pos) + return -EFBIG; + count = min_t(ssize_t, count, size - pos); + } + if (!count) + return 0; + + if (!battr->write) + return -EIO; + + return battr->write(of->file, kobj, battr, buf, pos, count); +} + +static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, + struct vm_area_struct *vma) +{ + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + + return battr->mmap(of->file, kobj, battr, vma); +} + +void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) +{ + struct kernfs_node *kn = kobj->sd, *tmp; + + if (kn && dir) + kn = kernfs_find_and_get(kn, dir); + else + kernfs_get(kn); + + if (kn && attr) { + tmp = kernfs_find_and_get(kn, attr); + kernfs_put(kn); + kn = tmp; + } + + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); + } +} +EXPORT_SYMBOL_GPL(sysfs_notify); + +static const struct kernfs_ops sysfs_file_kfops_empty = { +}; + +static const struct kernfs_ops sysfs_file_kfops_ro = { + .seq_show = sysfs_kf_seq_show, +}; + +static const struct kernfs_ops sysfs_file_kfops_wo = { + .write = sysfs_kf_write, +}; + +static const struct kernfs_ops sysfs_file_kfops_rw = { + .seq_show = sysfs_kf_seq_show, + .write = sysfs_kf_write, +}; + +static const struct kernfs_ops sysfs_prealloc_kfops_ro = { + .read = sysfs_kf_read, + .prealloc = true, +}; + +static const struct kernfs_ops sysfs_prealloc_kfops_wo = { + .write = sysfs_kf_write, + .prealloc = true, +}; + +static const struct kernfs_ops sysfs_prealloc_kfops_rw = { + .read = sysfs_kf_read, + .write = sysfs_kf_write, + .prealloc = true, +}; + +static const struct kernfs_ops sysfs_bin_kfops_ro = { + .read = sysfs_kf_bin_read, +}; + +static const struct kernfs_ops sysfs_bin_kfops_wo = { + .write = sysfs_kf_bin_write, +}; + +static const struct kernfs_ops sysfs_bin_kfops_rw = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, +}; + +static const struct kernfs_ops sysfs_bin_kfops_mmap = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, + .mmap = sysfs_kf_bin_mmap, +}; + +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, + umode_t mode, const void *ns) +{ + struct lock_class_key *key = NULL; + const struct kernfs_ops *ops; + struct kernfs_node *kn; + loff_t size; + + if (!is_bin) { + struct kobject *kobj = parent->priv; + const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; + + /* every kobject with an attribute needs a ktype assigned */ + if (WARN(!sysfs_ops, KERN_ERR + "missing sysfs attribute operations for kobject: %s\n", + kobject_name(kobj))) + return -EINVAL; + + if (sysfs_ops->show && sysfs_ops->store) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_rw; + else + ops = &sysfs_file_kfops_rw; + } else if (sysfs_ops->show) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_ro; + else + ops = &sysfs_file_kfops_ro; + } else if (sysfs_ops->store) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_wo; + else + ops = &sysfs_file_kfops_wo; + } else + ops = &sysfs_file_kfops_empty; + + size = PAGE_SIZE; + } else { + struct bin_attribute *battr = (void *)attr; + + if (battr->mmap) + ops = &sysfs_bin_kfops_mmap; + else if (battr->read && battr->write) + ops = &sysfs_bin_kfops_rw; + else if (battr->read) + ops = &sysfs_bin_kfops_ro; + else if (battr->write) + ops = &sysfs_bin_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = battr->size; + } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!attr->ignore_lockdep) + key = attr->key ?: (struct lock_class_key *)&attr->skey; +#endif + kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, + (void *)attr, ns, key); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, attr->name); + return PTR_ERR(kn); + } + return 0; +} + +int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, + bool is_bin) +{ + return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL); +} + +/** + * sysfs_create_file_ns - create an attribute file for an object with custom ns + * @kobj: object we're creating for + * @attr: attribute descriptor + * @ns: namespace the new file should belong to + */ +int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, + const void *ns) +{ + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns); + +} +EXPORT_SYMBOL_GPL(sysfs_create_file_ns); + +int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) +{ + int err = 0; + int i; + + for (i = 0; ptr[i] && !err; i++) + err = sysfs_create_file(kobj, ptr[i]); + if (err) + while (--i >= 0) + sysfs_remove_file(kobj, ptr[i]); + return err; +} +EXPORT_SYMBOL_GPL(sysfs_create_files); + +/** + * sysfs_add_file_to_group - add an attribute file to a pre-existing group. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @group: group name. + */ +int sysfs_add_file_to_group(struct kobject *kobj, + const struct attribute *attr, const char *group) +{ + struct kernfs_node *parent; + int error; + + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } + + if (!parent) + return -ENOENT; + + error = sysfs_add_file(parent, attr, false); + kernfs_put(parent); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); + +/** + * sysfs_chmod_file - update the modified mode value on an object attribute. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @mode: file permissions. + * + */ +int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, + umode_t mode) +{ + struct kernfs_node *kn; + struct iattr newattrs; + int rc; + + kn = kernfs_find_and_get(kobj->sd, attr->name); + if (!kn) + return -ENOENT; + + newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE; + + rc = kernfs_setattr(kn, &newattrs); + + kernfs_put(kn); + return rc; +} +EXPORT_SYMBOL_GPL(sysfs_chmod_file); + +/** + * sysfs_remove_file_ns - remove an object attribute with a custom ns tag + * @kobj: object we're acting for + * @attr: attribute descriptor + * @ns: namespace tag of the file to remove + * + * Hash the attribute name and namespace tag and kill the victim. + */ +void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, + const void *ns) +{ + struct kernfs_node *parent = kobj->sd; + + kernfs_remove_by_name_ns(parent, attr->name, ns); +} +EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); + +/** + * sysfs_remove_file_self - remove an object attribute from its own method + * @kobj: object we're acting for + * @attr: attribute descriptor + * + * See kernfs_remove_self() for details. + */ +bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) +{ + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; + bool ret; + + kn = kernfs_find_and_get(parent, attr->name); + if (WARN_ON_ONCE(!kn)) + return false; + + ret = kernfs_remove_self(kn); + + kernfs_put(kn); + return ret; +} + +void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) +{ + int i; + for (i = 0; ptr[i]; i++) + sysfs_remove_file(kobj, ptr[i]); +} +EXPORT_SYMBOL_GPL(sysfs_remove_files); + +/** + * sysfs_remove_file_from_group - remove an attribute file from a group. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @group: group name. + */ +void sysfs_remove_file_from_group(struct kobject *kobj, + const struct attribute *attr, const char *group) +{ + struct kernfs_node *parent; + + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } + + if (parent) { + kernfs_remove_by_name(parent, attr->name); + kernfs_put(parent); + } +} +EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); + +/** + * sysfs_create_bin_file - create binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) +{ + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, &attr->attr, true); +} +EXPORT_SYMBOL_GPL(sysfs_create_bin_file); + +/** + * sysfs_remove_bin_file - remove binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) +{ + kernfs_remove_by_name(kobj->sd, attr->attr.name); +} +EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/kernel/fs/sysfs/group.c b/kernel/fs/sysfs/group.c new file mode 100644 index 000000000..b400c0437 --- /dev/null +++ b/kernel/fs/sysfs/group.c @@ -0,0 +1,354 @@ +/* + * fs/sysfs/group.c - Operations for adding/removing multiple files at once. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2013 Greg Kroah-Hartman + * Copyright (c) 2013 The Linux Foundation + * + * This file is released undert the GPL v2. + * + */ + +#include +#include +#include +#include +#include +#include "sysfs.h" + + +static void remove_files(struct kernfs_node *parent, + const struct attribute_group *grp) +{ + struct attribute *const *attr; + struct bin_attribute *const *bin_attr; + + if (grp->attrs) + for (attr = grp->attrs; *attr; attr++) + kernfs_remove_by_name(parent, (*attr)->name); + if (grp->bin_attrs) + for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) + kernfs_remove_by_name(parent, (*bin_attr)->attr.name); +} + +static int create_files(struct kernfs_node *parent, struct kobject *kobj, + const struct attribute_group *grp, int update) +{ + struct attribute *const *attr; + struct bin_attribute *const *bin_attr; + int error = 0, i; + + if (grp->attrs) { + for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { + umode_t mode = (*attr)->mode; + + /* + * In update mode, we're changing the permissions or + * visibility. Do this by first removing then + * re-adding (if required) the file. + */ + if (update) + kernfs_remove_by_name(parent, (*attr)->name); + if (grp->is_visible) { + mode = grp->is_visible(kobj, *attr, i); + if (!mode) + continue; + } + + WARN(mode & ~(SYSFS_PREALLOC | 0664), + "Attribute %s: Invalid permissions 0%o\n", + (*attr)->name, mode); + + mode &= SYSFS_PREALLOC | 0664; + error = sysfs_add_file_mode_ns(parent, *attr, false, + mode, NULL); + if (unlikely(error)) + break; + } + if (error) { + remove_files(parent, grp); + goto exit; + } + } + + if (grp->bin_attrs) { + for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { + if (update) + kernfs_remove_by_name(parent, + (*bin_attr)->attr.name); + error = sysfs_add_file_mode_ns(parent, + &(*bin_attr)->attr, true, + (*bin_attr)->attr.mode, NULL); + if (error) + break; + } + if (error) + remove_files(parent, grp); + } +exit: + return error; +} + + +static int internal_create_group(struct kobject *kobj, int update, + const struct attribute_group *grp) +{ + struct kernfs_node *kn; + int error; + + BUG_ON(!kobj || (!update && !kobj->sd)); + + /* Updates may happen before the object has been instantiated */ + if (unlikely(update && !kobj->sd)) + return -EINVAL; + if (!grp->attrs && !grp->bin_attrs) { + WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n", + kobj->name, grp->name ?: ""); + return -EINVAL; + } + if (grp->name) { + kn = kernfs_create_dir(kobj->sd, grp->name, + S_IRWXU | S_IRUGO | S_IXUGO, kobj); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(kobj->sd, grp->name); + return PTR_ERR(kn); + } + } else + kn = kobj->sd; + kernfs_get(kn); + error = create_files(kn, kobj, grp, update); + if (error) { + if (grp->name) + kernfs_remove(kn); + } + kernfs_put(kn); + return error; +} + +/** + * sysfs_create_group - given a directory kobject, create an attribute group + * @kobj: The kobject to create the group on + * @grp: The attribute group to create + * + * This function creates a group for the first time. It will explicitly + * warn and error if any of the attribute files being created already exist. + * + * Returns 0 on success or error. + */ +int sysfs_create_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + return internal_create_group(kobj, 0, grp); +} +EXPORT_SYMBOL_GPL(sysfs_create_group); + +/** + * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups + * @kobj: The kobject to create the group on + * @groups: The attribute groups to create, NULL terminated + * + * This function creates a bunch of attribute groups. If an error occurs when + * creating a group, all previously created groups will be removed, unwinding + * everything back to the original state when this function was called. + * It will explicitly warn and error if any of the attribute files being + * created already exist. + * + * Returns 0 on success or error code from sysfs_create_group on error. + */ +int sysfs_create_groups(struct kobject *kobj, + const struct attribute_group **groups) +{ + int error = 0; + int i; + + if (!groups) + return 0; + + for (i = 0; groups[i]; i++) { + error = sysfs_create_group(kobj, groups[i]); + if (error) { + while (--i >= 0) + sysfs_remove_group(kobj, groups[i]); + break; + } + } + return error; +} +EXPORT_SYMBOL_GPL(sysfs_create_groups); + +/** + * sysfs_update_group - given a directory kobject, update an attribute group + * @kobj: The kobject to update the group on + * @grp: The attribute group to update + * + * This function updates an attribute group. Unlike + * sysfs_create_group(), it will explicitly not warn or error if any + * of the attribute files being created already exist. Furthermore, + * if the visibility of the files has changed through the is_visible() + * callback, it will update the permissions and add or remove the + * relevant files. + * + * The primary use for this function is to call it after making a change + * that affects group visibility. + * + * Returns 0 on success or error. + */ +int sysfs_update_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + return internal_create_group(kobj, 1, grp); +} +EXPORT_SYMBOL_GPL(sysfs_update_group); + +/** + * sysfs_remove_group: remove a group from a kobject + * @kobj: kobject to remove the group from + * @grp: group to remove + * + * This function removes a group of attributes from a kobject. The attributes + * previously have to have been created for this group, otherwise it will fail. + */ +void sysfs_remove_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; + + if (grp->name) { + kn = kernfs_find_and_get(parent, grp->name); + if (!kn) { + WARN(!kn, KERN_WARNING + "sysfs group %p not found for kobject '%s'\n", + grp, kobject_name(kobj)); + return; + } + } else { + kn = parent; + kernfs_get(kn); + } + + remove_files(kn, grp); + if (grp->name) + kernfs_remove(kn); + + kernfs_put(kn); +} +EXPORT_SYMBOL_GPL(sysfs_remove_group); + +/** + * sysfs_remove_groups - remove a list of groups + * + * @kobj: The kobject for the groups to be removed from + * @groups: NULL terminated list of groups to be removed + * + * If groups is not NULL, remove the specified groups from the kobject. + */ +void sysfs_remove_groups(struct kobject *kobj, + const struct attribute_group **groups) +{ + int i; + + if (!groups) + return; + for (i = 0; groups[i]; i++) + sysfs_remove_group(kobj, groups[i]); +} +EXPORT_SYMBOL_GPL(sysfs_remove_groups); + +/** + * sysfs_merge_group - merge files into a pre-existing attribute group. + * @kobj: The kobject containing the group. + * @grp: The files to create and the attribute group they belong to. + * + * This function returns an error if the group doesn't exist or any of the + * files already exist in that group, in which case none of the new files + * are created. + */ +int sysfs_merge_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + struct kernfs_node *parent; + int error = 0; + struct attribute *const *attr; + int i; + + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (!parent) + return -ENOENT; + + for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) + error = sysfs_add_file(parent, *attr, false); + if (error) { + while (--i >= 0) + kernfs_remove_by_name(parent, (*--attr)->name); + } + kernfs_put(parent); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_merge_group); + +/** + * sysfs_unmerge_group - remove files from a pre-existing attribute group. + * @kobj: The kobject containing the group. + * @grp: The files to remove and the attribute group they belong to. + */ +void sysfs_unmerge_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + struct kernfs_node *parent; + struct attribute *const *attr; + + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (parent) { + for (attr = grp->attrs; *attr; ++attr) + kernfs_remove_by_name(parent, (*attr)->name); + kernfs_put(parent); + } +} +EXPORT_SYMBOL_GPL(sysfs_unmerge_group); + +/** + * sysfs_add_link_to_group - add a symlink to an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @target: The target kobject of the symlink to create. + * @link_name: The name of the symlink to create. + */ +int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, + struct kobject *target, const char *link_name) +{ + struct kernfs_node *parent; + int error = 0; + + parent = kernfs_find_and_get(kobj->sd, group_name); + if (!parent) + return -ENOENT; + + error = sysfs_create_link_sd(parent, target, link_name); + kernfs_put(parent); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); + +/** + * sysfs_remove_link_from_group - remove a symlink from an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @link_name: The name of the symlink to remove. + */ +void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, + const char *link_name) +{ + struct kernfs_node *parent; + + parent = kernfs_find_and_get(kobj->sd, group_name); + if (parent) { + kernfs_remove_by_name(parent, link_name); + kernfs_put(parent); + } +} +EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); diff --git a/kernel/fs/sysfs/mount.c b/kernel/fs/sysfs/mount.c new file mode 100644 index 000000000..1c6ac6fce --- /dev/null +++ b/kernel/fs/sysfs/mount.c @@ -0,0 +1,79 @@ +/* + * fs/sysfs/symlink.c - operations for initializing and mounting sysfs + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#define DEBUG + +#include +#include +#include +#include +#include + +#include "sysfs.h" + +static struct kernfs_root *sysfs_root; +struct kernfs_node *sysfs_root_kn; + +static struct dentry *sysfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct dentry *root; + void *ns; + bool new_sb; + + if (!(flags & MS_KERNMOUNT)) { + if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) + return ERR_PTR(-EPERM); + } + + ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, + SYSFS_MAGIC, &new_sb, ns); + if (IS_ERR(root) || !new_sb) + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + return root; +} + +static void sysfs_kill_sb(struct super_block *sb) +{ + void *ns = (void *)kernfs_super_ns(sb); + + kernfs_kill_sb(sb); + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); +} + +static struct file_system_type sysfs_fs_type = { + .name = "sysfs", + .mount = sysfs_mount, + .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, +}; + +int __init sysfs_init(void) +{ + int err; + + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); + if (IS_ERR(sysfs_root)) + return PTR_ERR(sysfs_root); + + sysfs_root_kn = sysfs_root->kn; + + err = register_filesystem(&sysfs_fs_type); + if (err) { + kernfs_destroy_root(sysfs_root); + return err; + } + + return 0; +} diff --git a/kernel/fs/sysfs/symlink.c b/kernel/fs/sysfs/symlink.c new file mode 100644 index 000000000..aecb15f84 --- /dev/null +++ b/kernel/fs/sysfs/symlink.c @@ -0,0 +1,197 @@ +/* + * fs/sysfs/symlink.c - sysfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#include +#include +#include +#include +#include + +#include "sysfs.h" + +static int sysfs_do_create_link_sd(struct kernfs_node *parent, + struct kobject *target_kobj, + const char *name, int warn) +{ + struct kernfs_node *kn, *target = NULL; + + BUG_ON(!name || !parent); + + /* + * We don't own @target_kobj and it may be removed at any time. + * Synchronize using sysfs_symlink_target_lock. See + * sysfs_remove_dir() for details. + */ + spin_lock(&sysfs_symlink_target_lock); + if (target_kobj->sd) { + target = target_kobj->sd; + kernfs_get(target); + } + spin_unlock(&sysfs_symlink_target_lock); + + if (!target) + return -ENOENT; + + kn = kernfs_create_link(parent, name, target); + kernfs_put(target); + + if (!IS_ERR(kn)) + return 0; + + if (warn && PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); +} + +/** + * sysfs_create_link_sd - create symlink to a given object. + * @kn: directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link_sd(kn, target, name, 1); +} + +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) +{ + struct kernfs_node *parent = NULL; + + if (!kobj) + parent = sysfs_root_kn; + else + parent = kobj->sd; + + if (!parent) + return -EFAULT; + + return sysfs_do_create_link_sd(parent, target, name, warn); +} + +/** + * sysfs_create_link - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 1); +} +EXPORT_SYMBOL_GPL(sysfs_create_link); + +/** + * sysfs_create_link_nowarn - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + * + * This function does the same as sysfs_create_link(), but it + * doesn't warn if the link already exists. + */ +int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 0); +} + +/** + * sysfs_delete_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @name: name of the symlink to remove. + * + * Unlike sysfs_remove_link sysfs_delete_link has enough information + * to successfully delete symlinks in tagged directories. + */ +void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, + const char *name) +{ + const void *ns = NULL; + + /* + * We don't own @target and it may be removed at any time. + * Synchronize using sysfs_symlink_target_lock. See + * sysfs_remove_dir() for details. + */ + spin_lock(&sysfs_symlink_target_lock); + if (targ->sd && kernfs_ns_enabled(kobj->sd)) + ns = targ->sd->ns; + spin_unlock(&sysfs_symlink_target_lock); + kernfs_remove_by_name_ns(kobj->sd, name, ns); +} + +/** + * sysfs_remove_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @name: name of the symlink to remove. + */ +void sysfs_remove_link(struct kobject *kobj, const char *name) +{ + struct kernfs_node *parent = NULL; + + if (!kobj) + parent = sysfs_root_kn; + else + parent = kobj->sd; + + kernfs_remove_by_name(parent, name); +} +EXPORT_SYMBOL_GPL(sysfs_remove_link); + +/** + * sysfs_rename_link_ns - rename symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @old: previous name of the symlink. + * @new: new name of the symlink. + * @new_ns: new namespace of the symlink. + * + * A helper function for the common rename symlink idiom. + */ +int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, + const char *old, const char *new, const void *new_ns) +{ + struct kernfs_node *parent, *kn = NULL; + const void *old_ns = NULL; + int result; + + if (!kobj) + parent = sysfs_root_kn; + else + parent = kobj->sd; + + if (targ->sd) + old_ns = targ->sd->ns; + + result = -ENOENT; + kn = kernfs_find_and_get_ns(parent, old, old_ns); + if (!kn) + goto out; + + result = -EINVAL; + if (kernfs_type(kn) != KERNFS_LINK) + goto out; + if (kn->symlink.target_kn->priv != targ) + goto out; + + result = kernfs_rename_ns(kn, parent, new, new_ns); + +out: + kernfs_put(kn); + return result; +} +EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); diff --git a/kernel/fs/sysfs/sysfs.h b/kernel/fs/sysfs/sysfs.h new file mode 100644 index 000000000..0e2f1cccb --- /dev/null +++ b/kernel/fs/sysfs/sysfs.h @@ -0,0 +1,43 @@ +/* + * fs/sysfs/sysfs.h - sysfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + */ + +#ifndef __SYSFS_INTERNAL_H +#define __SYSFS_INTERNAL_H + +#include + +/* + * mount.c + */ +extern struct kernfs_node *sysfs_root_kn; + +/* + * dir.c + */ +extern spinlock_t sysfs_symlink_target_lock; + +void sysfs_warn_dup(struct kernfs_node *parent, const char *name); + +/* + * file.c + */ +int sysfs_add_file(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin); +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, + umode_t amode, const void *ns); + +/* + * symlink.c + */ +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, + const char *name); + +#endif /* __SYSFS_INTERNAL_H */ diff --git a/kernel/fs/sysv/Kconfig b/kernel/fs/sysv/Kconfig new file mode 100644 index 000000000..33aeb4b75 --- /dev/null +++ b/kernel/fs/sysv/Kconfig @@ -0,0 +1,36 @@ +config SYSV_FS + tristate "System V/Xenix/V7/Coherent file system support" + depends on BLOCK + help + SCO, Xenix and Coherent are commercial Unix systems for Intel + machines, and Version 7 was used on the DEC PDP-11. Saying Y + here would allow you to read from their floppies and hard disk + partitions. + + If you have floppies or hard disk partitions like that, it is likely + that they contain binaries from those other Unix systems; in order + to run these binaries, you will want to install linux-abi which is + a set of kernel modules that lets you run SCO, Xenix, Wyse, + UnixWare, Dell Unix and System V programs under Linux. It is + available via FTP (user: ftp) from + ). + NOTE: that will work only for binaries from Intel-based systems; + PDP ones will have to wait until somebody ports Linux to -11 ;-) + + If you only intend to mount files from some other Unix over the + network using NFS, you don't need the System V file system support + (but you need NFS file system support obviously). + + Note that this option is generally not needed for floppies, since a + good portable way to transport files and directories between unixes + (and even other operating systems) is given by the tar program ("man + tar" or preferably "info tar"). Note also that this option has + nothing whatsoever to do with the option "System V IPC". Read about + the System V file system in + . + Saying Y here will enlarge your kernel by about 27 KB. + + To compile this as a module, choose M here: the module will be called + sysv. + + If you haven't heard about all of this before, it's safe to say N. diff --git a/kernel/fs/sysv/Makefile b/kernel/fs/sysv/Makefile new file mode 100644 index 000000000..3591f9d7a --- /dev/null +++ b/kernel/fs/sysv/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux SystemV/Coherent filesystem routines. +# + +obj-$(CONFIG_SYSV_FS) += sysv.o + +sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \ + namei.o super.o symlink.o diff --git a/kernel/fs/sysv/balloc.c b/kernel/fs/sysv/balloc.c new file mode 100644 index 000000000..921c053fc --- /dev/null +++ b/kernel/fs/sysv/balloc.c @@ -0,0 +1,239 @@ +/* + * linux/fs/sysv/balloc.c + * + * minix/bitmap.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext/freelists.c + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * + * xenix/alloc.c + * Copyright (C) 1992 Doug Evans + * + * coh/alloc.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/balloc.c + * Copyright (C) 1993 Bruno Haible + * + * This file contains code for allocating/freeing blocks. + */ + +#include +#include +#include "sysv.h" + +/* We don't trust the value of + sb->sv_sbd2->s_tfree = *sb->sv_free_blocks + but we nevertheless keep it up to date. */ + +static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh) +{ + char *bh_data = bh->b_data; + + if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4) + return (sysv_zone_t*)(bh_data+4); + else + return (sysv_zone_t*)(bh_data+2); +} + +/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */ + +void sysv_free_block(struct super_block * sb, sysv_zone_t nr) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + sysv_zone_t *blocks = sbi->s_bcache; + unsigned count; + unsigned block = fs32_to_cpu(sbi, nr); + + /* + * This code does not work at all for AFS (it has a bitmap + * free list). As AFS is supposed to be read-only no one + * should call this for an AFS filesystem anyway... + */ + if (sbi->s_type == FSTYPE_AFS) + return; + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("sysv_free_block: trying to free block not in datazone\n"); + return; + } + + mutex_lock(&sbi->s_lock); + count = fs16_to_cpu(sbi, *sbi->s_bcache_count); + + if (count > sbi->s_flc_size) { + printk("sysv_free_block: flc_count > flc_size\n"); + mutex_unlock(&sbi->s_lock); + return; + } + /* If the free list head in super-block is full, it is copied + * into this block being freed, ditto if it's completely empty + * (applies only on Coherent). + */ + if (count == sbi->s_flc_size || count == 0) { + block += sbi->s_block_base; + bh = sb_getblk(sb, block); + if (!bh) { + printk("sysv_free_block: getblk() failed\n"); + mutex_unlock(&sbi->s_lock); + return; + } + memset(bh->b_data, 0, sb->s_blocksize); + *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count); + memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t)); + mark_buffer_dirty(bh); + set_buffer_uptodate(bh); + brelse(bh); + count = 0; + } + sbi->s_bcache[count++] = nr; + + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + fs32_add(sbi, sbi->s_free_blocks, 1); + dirty_sb(sb); + mutex_unlock(&sbi->s_lock); +} + +sysv_zone_t sysv_new_block(struct super_block * sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned int block; + sysv_zone_t nr; + struct buffer_head * bh; + unsigned count; + + mutex_lock(&sbi->s_lock); + count = fs16_to_cpu(sbi, *sbi->s_bcache_count); + + if (count == 0) /* Applies only to Coherent FS */ + goto Enospc; + nr = sbi->s_bcache[--count]; + if (nr == 0) /* Applies only to Xenix FS, SystemV FS */ + goto Enospc; + + block = fs32_to_cpu(sbi, nr); + + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("sysv_new_block: new block %d is not in data zone\n", + block); + goto Enospc; + } + + if (count == 0) { /* the last block continues the free list */ + unsigned count; + + block += sbi->s_block_base; + if (!(bh = sb_bread(sb, block))) { + printk("sysv_new_block: cannot read free-list block\n"); + /* retry this same block next time */ + *sbi->s_bcache_count = cpu_to_fs16(sbi, 1); + goto Enospc; + } + count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); + if (count > sbi->s_flc_size) { + printk("sysv_new_block: free-list block with >flc_size entries\n"); + brelse(bh); + goto Enospc; + } + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + memcpy(sbi->s_bcache, get_chunk(sb, bh), + count * sizeof(sysv_zone_t)); + brelse(bh); + } + /* Now the free list head in the superblock is valid again. */ + fs32_add(sbi, sbi->s_free_blocks, -1); + dirty_sb(sb); + mutex_unlock(&sbi->s_lock); + return nr; + +Enospc: + mutex_unlock(&sbi->s_lock); + return 0; +} + +unsigned long sysv_count_free_blocks(struct super_block * sb) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + int sb_count; + int count; + struct buffer_head * bh = NULL; + sysv_zone_t *blocks; + unsigned block; + int n; + + /* + * This code does not work at all for AFS (it has a bitmap + * free list). As AFS is supposed to be read-only we just + * lie and say it has no free block at all. + */ + if (sbi->s_type == FSTYPE_AFS) + return 0; + + mutex_lock(&sbi->s_lock); + sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); + + if (0) + goto trust_sb; + + /* this causes a lot of disk traffic ... */ + count = 0; + n = fs16_to_cpu(sbi, *sbi->s_bcache_count); + blocks = sbi->s_bcache; + while (1) { + sysv_zone_t zone; + if (n > sbi->s_flc_size) + goto E2big; + zone = 0; + while (n && (zone = blocks[--n]) != 0) + count++; + if (zone == 0) + break; + + block = fs32_to_cpu(sbi, zone); + if (bh) + brelse(bh); + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) + goto Einval; + block += sbi->s_block_base; + bh = sb_bread(sb, block); + if (!bh) + goto Eio; + n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); + blocks = get_chunk(sb, bh); + } + if (bh) + brelse(bh); + if (count != sb_count) + goto Ecount; +done: + mutex_unlock(&sbi->s_lock); + return count; + +Einval: + printk("sysv_count_free_blocks: new block %d is not in data zone\n", + block); + goto trust_sb; +Eio: + printk("sysv_count_free_blocks: cannot read free-list block\n"); + goto trust_sb; +E2big: + printk("sysv_count_free_blocks: >flc_size entries in free-list block\n"); + if (bh) + brelse(bh); +trust_sb: + count = sb_count; + goto done; +Ecount: + printk("sysv_count_free_blocks: free block count was %d, " + "correcting to %d\n", sb_count, count); + if (!(sb->s_flags & MS_RDONLY)) { + *sbi->s_free_blocks = cpu_to_fs32(sbi, count); + dirty_sb(sb); + } + goto done; +} diff --git a/kernel/fs/sysv/dir.c b/kernel/fs/sysv/dir.c new file mode 100644 index 000000000..8f3555f00 --- /dev/null +++ b/kernel/fs/sysv/dir.c @@ -0,0 +1,372 @@ +/* + * linux/fs/sysv/dir.c + * + * minix/dir.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/dir.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/dir.c + * Copyright (C) 1993 Bruno Haible + * + * SystemV/Coherent directory handling functions + */ + +#include +#include +#include +#include "sysv.h" + +static int sysv_readdir(struct file *, struct dir_context *); + +const struct file_operations sysv_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = sysv_readdir, + .fsync = generic_file_fsync, +}; + +static inline void dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static struct page * dir_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) + kmap(page); + return page; +} + +static int sysv_readdir(struct file *file, struct dir_context *ctx) +{ + unsigned long pos = ctx->pos; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + unsigned long npages = dir_pages(inode); + unsigned offset; + unsigned long n; + + ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); + if (pos >= inode->i_size) + return 0; + + offset = pos & ~PAGE_CACHE_MASK; + n = pos >> PAGE_CACHE_SHIFT; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct sysv_dir_entry *de; + struct page *page = dir_get_page(inode, n); + + if (IS_ERR(page)) + continue; + kaddr = (char *)page_address(page); + de = (struct sysv_dir_entry *)(kaddr+offset); + limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; + for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { + char *name = de->name; + + if (!de->inode) + continue; + + if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), + fs16_to_cpu(SYSV_SB(sb), de->inode), + DT_UNKNOWN)) { + dir_put_page(page); + return 0; + } + } + dir_put_page(page); + } + return 0; +} + +/* compare strings: name[0..len-1] (not zero-terminated) and + * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1]) + */ +static inline int namecompare(int len, int maxlen, + const char * name, const char * buffer) +{ + if (len < maxlen && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +/* + * sysv_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + */ +struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page) +{ + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct inode * dir = d_inode(dentry->d_parent); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct sysv_dir_entry *de; + + *res_page = NULL; + + start = SYSV_I(dir)->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + + do { + char *kaddr; + page = dir_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = (char*)page_address(page); + de = (struct sysv_dir_entry *) kaddr; + kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE; + for ( ; (char *) de <= kaddr ; de++) { + if (!de->inode) + continue; + if (namecompare(namelen, SYSV_NAMELEN, + name, de->name)) + goto found; + } + dir_put_page(page); + } + + if (++n >= npages) + n = 0; + } while (n != start); + + return NULL; + +found: + SYSV_I(dir)->i_dir_start_lookup = n; + *res_page = page; + return de; +} + +int sysv_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct page *page = NULL; + struct sysv_dir_entry * de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + /* We take care of directory expansion in the same loop */ + for (n = 0; n <= npages; n++) { + page = dir_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + kaddr = (char*)page_address(page); + de = (struct sysv_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE; + while ((char *)de <= kaddr) { + if (!de->inode) + goto got_it; + err = -EEXIST; + if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) + goto out_page; + de++; + } + dir_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + if (err) + goto out_unlock; + memcpy (de->name, name, namelen); + memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +out_page: + dir_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_page; +} + +int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr = (char*)page_address(page); + loff_t pos = page_offset(page) + (char *)de - kaddr; + int err; + + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + BUG_ON(err); + de->inode = 0; + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_put_page(page); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return err; +} + +int sysv_make_empty(struct inode *inode, struct inode *dir) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + struct sysv_dir_entry * de; + char *base; + int err; + + if (!page) + return -ENOMEM; + err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE); + if (err) { + unlock_page(page); + goto fail; + } + kmap(page); + + base = (char*)page_address(page); + memset(base, 0, PAGE_CACHE_SIZE); + + de = (struct sysv_dir_entry *) base; + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + strcpy(de->name,"."); + de++; + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); + strcpy(de->name,".."); + + kunmap(page); + err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int sysv_empty_dir(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct sysv_dir_entry * de; + page = dir_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = (char *)page_address(page); + de = (struct sysv_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE-SYSV_DIRSIZE; + + for ( ;(char *)de <= kaddr; de++) { + if (!de->inode) + continue; + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (!de->name[1]) { + if (de->inode == cpu_to_fs16(SYSV_SB(sb), + inode->i_ino)) + continue; + goto not_empty; + } + if (de->name[1] != '.' || de->name[2]) + goto not_empty; + } + dir_put_page(page); + } + return 1; + +not_empty: + dir_put_page(page); + return 0; +} + +/* Releases the page */ +void sysv_set_link(struct sysv_dir_entry *de, struct page *page, + struct inode *inode) +{ + struct inode *dir = page->mapping->host; + loff_t pos = page_offset(page) + + (char *)de-(char*)page_address(page); + int err; + + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + BUG_ON(err); + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + +struct sysv_dir_entry * sysv_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = dir_get_page(dir, 0); + struct sysv_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = (struct sysv_dir_entry*) page_address(page) + 1; + *p = page; + } + return de; +} + +ino_t sysv_inode_by_name(struct dentry *dentry) +{ + struct page *page; + struct sysv_dir_entry *de = sysv_find_entry (dentry, &page); + ino_t res = 0; + + if (de) { + res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); + dir_put_page(page); + } + return res; +} diff --git a/kernel/fs/sysv/file.c b/kernel/fs/sysv/file.c new file mode 100644 index 000000000..82ddc0906 --- /dev/null +++ b/kernel/fs/sysv/file.c @@ -0,0 +1,57 @@ +/* + * linux/fs/sysv/file.c + * + * minix/file.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/file.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/file.c + * Copyright (C) 1993 Bruno Haible + * + * SystemV/Coherent regular file handling primitives + */ + +#include "sysv.h" + +/* + * We have mostly NULLs here: the current defaults are OK for + * the coh filesystem. + */ +const struct file_operations sysv_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int sysv_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + truncate_setsize(inode, attr->ia_size); + sysv_truncate(inode); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations sysv_file_inode_operations = { + .setattr = sysv_setattr, + .getattr = sysv_getattr, +}; diff --git a/kernel/fs/sysv/ialloc.c b/kernel/fs/sysv/ialloc.c new file mode 100644 index 000000000..f9db4eb31 --- /dev/null +++ b/kernel/fs/sysv/ialloc.c @@ -0,0 +1,234 @@ +/* + * linux/fs/sysv/ialloc.c + * + * minix/bitmap.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext/freelists.c + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * + * xenix/alloc.c + * Copyright (C) 1992 Doug Evans + * + * coh/alloc.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/ialloc.c + * Copyright (C) 1993 Bruno Haible + * + * This file contains code for allocating/freeing inodes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "sysv.h" + +/* We don't trust the value of + sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes + but we nevertheless keep it up to date. */ + +/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. */ + +/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */ +static inline sysv_ino_t * +sv_sb_fic_inode(struct super_block * sb, unsigned int i) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + if (sbi->s_bh1 == sbi->s_bh2) + return &sbi->s_sb_fic_inodes[i]; + else { + /* 512 byte Xenix FS */ + unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]); + if (offset < 512) + return (sysv_ino_t*)(sbi->s_sbd1 + offset); + else + return (sysv_ino_t*)(sbi->s_sbd2 + offset); + } +} + +struct sysv_inode * +sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct sysv_inode *res; + int block = sbi->s_firstinodezone + sbi->s_block_base; + + block += (ino-1) >> sbi->s_inodes_per_block_bits; + *bh = sb_bread(sb, block); + if (!*bh) + return NULL; + res = (struct sysv_inode *)(*bh)->b_data; + return res + ((ino-1) & sbi->s_inodes_per_block_1); +} + +static int refill_free_cache(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + int i = 0, ino; + + ino = SYSV_ROOT_INO+1; + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto out; + while (ino <= sbi->s_ninodes) { + if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) { + *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino); + if (i == sbi->s_fic_size) + break; + } + if ((ino++ & sbi->s_inodes_per_block_1) == 0) { + brelse(bh); + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto out; + } else + raw_inode++; + } + brelse(bh); +out: + return i; +} + +void sysv_free_inode(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned int ino; + struct buffer_head * bh; + struct sysv_inode * raw_inode; + unsigned count; + + sb = inode->i_sb; + ino = inode->i_ino; + if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) { + printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n"); + return; + } + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("sysv_free_inode: unable to read inode block on device " + "%s\n", inode->i_sb->s_id); + return; + } + mutex_lock(&sbi->s_lock); + count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); + if (count < sbi->s_fic_size) { + *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); + *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); + } + fs16_add(sbi, sbi->s_sb_total_free_inodes, 1); + dirty_sb(sb); + memset(raw_inode, 0, sizeof(struct sysv_inode)); + mark_buffer_dirty(bh); + mutex_unlock(&sbi->s_lock); + brelse(bh); +} + +struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct inode *inode; + sysv_ino_t ino; + unsigned count; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE + }; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + mutex_lock(&sbi->s_lock); + count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); + if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { + count = refill_free_cache(sb); + if (count == 0) { + iput(inode); + mutex_unlock(&sbi->s_lock); + return ERR_PTR(-ENOSPC); + } + } + /* Now count > 0. */ + ino = *sv_sb_fic_inode(sb,--count); + *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); + fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); + dirty_sb(sb); + inode_init_owner(inode, dir, mode); + inode->i_ino = fs16_to_cpu(sbi, ino); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); + SYSV_I(inode)->i_dir_start_lookup = 0; + insert_inode_hash(inode); + mark_inode_dirty(inode); + + sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ + mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ + /* That's it. */ + mutex_unlock(&sbi->s_lock); + return inode; +} + +unsigned long sysv_count_free_inodes(struct super_block * sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + int ino, count, sb_count; + + mutex_lock(&sbi->s_lock); + + sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); + + if (0) + goto trust_sb; + + /* this causes a lot of disk traffic ... */ + count = 0; + ino = SYSV_ROOT_INO+1; + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto Eio; + while (ino <= sbi->s_ninodes) { + if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) + count++; + if ((ino++ & sbi->s_inodes_per_block_1) == 0) { + brelse(bh); + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto Eio; + } else + raw_inode++; + } + brelse(bh); + if (count != sb_count) + goto Einval; +out: + mutex_unlock(&sbi->s_lock); + return count; + +Einval: + printk("sysv_count_free_inodes: " + "free inode count was %d, correcting to %d\n", + sb_count, count); + if (!(sb->s_flags & MS_RDONLY)) { + *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count); + dirty_sb(sb); + } + goto out; + +Eio: + printk("sysv_count_free_inodes: unable to read inode table\n"); +trust_sb: + count = sb_count; + goto out; +} diff --git a/kernel/fs/sysv/inode.c b/kernel/fs/sysv/inode.c new file mode 100644 index 000000000..88956309c --- /dev/null +++ b/kernel/fs/sysv/inode.c @@ -0,0 +1,370 @@ +/* + * linux/fs/sysv/inode.c + * + * minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * xenix/inode.c + * Copyright (C) 1992 Doug Evans + * + * coh/inode.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/inode.c + * Copyright (C) 1993 Paul B. Monday + * + * sysv/inode.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + * + * This file contains code for allocating/freeing inodes and for read/writing + * the superblock. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sysv.h" + +static int sysv_sync_fs(struct super_block *sb, int wait) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned long time = get_seconds(), old_time; + + mutex_lock(&sbi->s_lock); + + /* + * If we are going to write out the super block, + * then attach current time stamp. + * But if the filesystem was marked clean, keep it clean. + */ + old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); + if (sbi->s_type == FSTYPE_SYSV4) { + if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) + *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38 - time); + *sbi->s_sb_time = cpu_to_fs32(sbi, time); + mark_buffer_dirty(sbi->s_bh2); + } + + mutex_unlock(&sbi->s_lock); + + return 0; +} + +static int sysv_remount(struct super_block *sb, int *flags, char *data) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + sync_filesystem(sb); + if (sbi->s_forced_ro) + *flags |= MS_RDONLY; + return 0; +} + +static void sysv_put_super(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + /* XXX ext2 also updates the state here */ + mark_buffer_dirty(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + mark_buffer_dirty(sbi->s_bh2); + } + + brelse(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + brelse(sbi->s_bh2); + + kfree(sbi); +} + +static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_ndatazones; + buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb); + buf->f_files = sbi->s_ninodes; + buf->f_ffree = sysv_count_free_inodes(sb); + buf->f_namelen = SYSV_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32 + */ +static inline void read3byte(struct sysv_sb_info *sbi, + unsigned char * from, unsigned char * to) +{ + if (sbi->s_bytesex == BYTESEX_PDP) { + to[0] = from[0]; + to[1] = 0; + to[2] = from[1]; + to[3] = from[2]; + } else if (sbi->s_bytesex == BYTESEX_LE) { + to[0] = from[0]; + to[1] = from[1]; + to[2] = from[2]; + to[3] = 0; + } else { + to[0] = 0; + to[1] = from[0]; + to[2] = from[1]; + to[3] = from[2]; + } +} + +static inline void write3byte(struct sysv_sb_info *sbi, + unsigned char * from, unsigned char * to) +{ + if (sbi->s_bytesex == BYTESEX_PDP) { + to[0] = from[0]; + to[1] = from[2]; + to[2] = from[3]; + } else if (sbi->s_bytesex == BYTESEX_LE) { + to[0] = from[0]; + to[1] = from[1]; + to[2] = from[2]; + } else { + to[0] = from[1]; + to[1] = from[2]; + to[2] = from[3]; + } +} + +static const struct inode_operations sysv_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = sysv_getattr, +}; + +void sysv_set_inode(struct inode *inode, dev_t rdev) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &sysv_file_inode_operations; + inode->i_fop = &sysv_file_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &sysv_dir_inode_operations; + inode->i_fop = &sysv_dir_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (inode->i_blocks) { + inode->i_op = &sysv_symlink_inode_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else { + inode->i_op = &sysv_fast_symlink_inode_operations; + nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size, + sizeof(SYSV_I(inode)->i_data) - 1); + } + } else + init_special_inode(inode, inode->i_mode, rdev); +} + +struct inode *sysv_iget(struct super_block *sb, unsigned int ino) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + struct sysv_inode_info * si; + struct inode *inode; + unsigned int block; + + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %d is out of range\n", + sb->s_id, ino); + return ERR_PTR(-EIO); + } + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("Major problem: unable to read inode from dev %s\n", + inode->i_sb->s_id); + goto bad_inode; + } + /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */ + inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); + i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid)); + i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid)); + set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); + inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); + inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); + inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); + inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime); + inode->i_ctime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_blocks = 0; + + si = SYSV_I(inode); + for (block = 0; block < 10+1+1+1; block++) + read3byte(sbi, &raw_inode->i_data[3*block], + (u8 *)&si->i_data[block]); + brelse(bh); + si->i_dir_start_lookup = 0; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + sysv_set_inode(inode, + old_decode_dev(fs32_to_cpu(sbi, si->i_data[0]))); + else + sysv_set_inode(inode, 0); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static int __sysv_write_inode(struct inode *inode, int wait) +{ + struct super_block * sb = inode->i_sb; + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + struct sysv_inode_info * si; + unsigned int ino, block; + int err = 0; + + ino = inode->i_ino; + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %d is out of range\n", + inode->i_sb->s_id, ino); + return -EIO; + } + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("unable to read i-node block\n"); + return -EIO; + } + + raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); + raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode))); + raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode))); + raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); + raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); + raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); + raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec); + raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec); + + si = SYSV_I(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev)); + for (block = 0; block < 10+1+1+1; block++) + write3byte(sbi, (u8 *)&si->i_data[block], + &raw_inode->i_data[3*block]); + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing sysv inode [%s:%08x]\n", + sb->s_id, ino); + err = -EIO; + } + } + brelse(bh); + return 0; +} + +int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int sysv_sync_inode(struct inode *inode) +{ + return __sysv_write_inode(inode, 1); +} + +static void sysv_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + if (!inode->i_nlink) { + inode->i_size = 0; + sysv_truncate(inode); + } + invalidate_inode_buffers(inode); + clear_inode(inode); + if (!inode->i_nlink) + sysv_free_inode(inode); +} + +static struct kmem_cache *sysv_inode_cachep; + +static struct inode *sysv_alloc_inode(struct super_block *sb) +{ + struct sysv_inode_info *si; + + si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL); + if (!si) + return NULL; + return &si->vfs_inode; +} + +static void sysv_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); +} + +static void sysv_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, sysv_i_callback); +} + +static void init_once(void *p) +{ + struct sysv_inode_info *si = (struct sysv_inode_info *)p; + + inode_init_once(&si->vfs_inode); +} + +const struct super_operations sysv_sops = { + .alloc_inode = sysv_alloc_inode, + .destroy_inode = sysv_destroy_inode, + .write_inode = sysv_write_inode, + .evict_inode = sysv_evict_inode, + .put_super = sysv_put_super, + .sync_fs = sysv_sync_fs, + .remount_fs = sysv_remount, + .statfs = sysv_statfs, +}; + +int __init sysv_init_icache(void) +{ + sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", + sizeof(struct sysv_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (!sysv_inode_cachep) + return -ENOMEM; + return 0; +} + +void sysv_destroy_icache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(sysv_inode_cachep); +} diff --git a/kernel/fs/sysv/itree.c b/kernel/fs/sysv/itree.c new file mode 100644 index 000000000..2fde40acf --- /dev/null +++ b/kernel/fs/sysv/itree.c @@ -0,0 +1,501 @@ +/* + * linux/fs/sysv/itree.c + * + * Handling of indirect blocks' trees. + * AV, Sep--Dec 2000 + */ + +#include +#include +#include +#include "sysv.h" + +enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */ + +static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode) +{ + mark_buffer_dirty_inode(bh, inode); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); +} + +static int block_to_path(struct inode *inode, long block, int offsets[DEPTH]) +{ + struct super_block *sb = inode->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + int ptrs_bits = sbi->s_ind_per_block_bits; + unsigned long indirect_blocks = sbi->s_ind_per_block, + double_blocks = sbi->s_ind_per_block_2; + int n = 0; + + if (block < 0) { + printk("sysv_block_map: block < 0\n"); + } else if (block < DIRECT) { + offsets[n++] = block; + } else if ( (block -= DIRECT) < indirect_blocks) { + offsets[n++] = DIRECT; + offsets[n++] = block; + } else if ((block -= indirect_blocks) < double_blocks) { + offsets[n++] = DIRECT+1; + offsets[n++] = block >> ptrs_bits; + offsets[n++] = block & (indirect_blocks - 1); + } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) { + offsets[n++] = DIRECT+2; + offsets[n++] = block >> (ptrs_bits * 2); + offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1); + offsets[n++] = block & (indirect_blocks - 1); + } else { + /* nothing */; + } + return n; +} + +static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr) +{ + return sbi->s_block_base + fs32_to_cpu(sbi, nr); +} + +typedef struct { + sysv_zone_t *p; + sysv_zone_t key; + struct buffer_head *bh; +} Indirect; + +static DEFINE_RWLOCK(pointers_lock); + +static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +static inline sysv_zone_t *block_end(struct buffer_head *bh) +{ + return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); +} + +/* + * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock) + */ +static Indirect *get_branch(struct inode *inode, + int depth, + int offsets[], + Indirect chain[], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + int block = block_to_cpu(SYSV_SB(sb), p->key); + bh = sb_bread(sb, block); + if (!bh) + goto failure; + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +static int alloc_branch(struct inode *inode, + int num, + int *offsets, + Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int n = 0; + int i; + + branch[0].key = sysv_new_block(inode->i_sb); + if (branch[0].key) for (n = 1; n < num; n++) { + struct buffer_head *bh; + int parent; + /* Allocate the next block */ + branch[n].key = sysv_new_block(inode->i_sb); + if (!branch[n].key) + break; + /* + * Get buffer_head for parent block, zero it out and set + * the pointer to new one, then send parent to disk. + */ + parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); + bh = sb_getblk(inode->i_sb, parent); + lock_buffer(bh); + memset(bh->b_data, 0, blocksize); + branch[n].bh = bh; + branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + set_buffer_uptodate(bh); + unlock_buffer(bh); + dirty_indirect(bh, inode); + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < n; i++) + sysv_free_block(inode->i_sb, branch[i].key); + return -ENOSPC; +} + +static inline int splice_branch(struct inode *inode, + Indirect chain[], + Indirect *where, + int num) +{ + int i; + + /* Verify that place we are splicing to is still there and vacant */ + write_lock(&pointers_lock); + if (!verify_chain(chain, where-1) || *where->p) + goto changed; + *where->p = where->key; + write_unlock(&pointers_lock); + + inode->i_ctime = CURRENT_TIME_SEC; + + /* had we spliced it onto indirect block? */ + if (where->bh) + dirty_indirect(where->bh, inode); + + if (IS_SYNC(inode)) + sysv_sync_inode(inode); + else + mark_inode_dirty(inode); + return 0; + +changed: + write_unlock(&pointers_lock); + for (i = 1; i < num; i++) + bforget(where[i].bh); + for (i = 0; i < num; i++) + sysv_free_block(inode->i_sb, where[i].key); + return -EAGAIN; +} + +static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + struct super_block *sb = inode->i_sb; + Indirect *partial; + int left; + int depth = block_to_path(inode, iblock, offsets); + + if (depth == 0) + goto out; + +reread: + read_lock(&pointers_lock); + partial = get_branch(inode, depth, offsets, chain, &err); + read_unlock(&pointers_lock); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { +got_it: + map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb), + chain[depth-1].key)); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + left = (chain + depth) - partial; + err = alloc_branch(inode, left, offsets+(partial-chain), partial); + if (err) + goto cleanup; + + if (splice_branch(inode, chain, partial, left) < 0) + goto changed; + + set_buffer_new(bh_result); + goto got_it; + +changed: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + goto reread; +} + +static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +static Indirect *find_shared(struct inode *inode, + int depth, + int offsets[], + Indirect chain[], + sysv_zone_t *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + + write_lock(&pointers_lock); + partial = get_branch(inode, k, offsets, chain, &err); + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) { + write_unlock(&pointers_lock); + goto no_top; + } + for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&pointers_lock); + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q) +{ + for ( ; p < q ; p++) { + sysv_zone_t nr = *p; + if (nr) { + *p = 0; + sysv_free_block(inode->i_sb, nr); + mark_inode_dirty(inode); + } + } +} + +static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth) +{ + struct buffer_head * bh; + struct super_block *sb = inode->i_sb; + + if (depth--) { + for ( ; p < q ; p++) { + int block; + sysv_zone_t nr = *p; + if (!nr) + continue; + *p = 0; + block = block_to_cpu(SYSV_SB(sb), nr); + bh = sb_bread(sb, block); + if (!bh) + continue; + free_branches(inode, (sysv_zone_t*)bh->b_data, + block_end(bh), depth); + bforget(bh); + sysv_free_block(sb, nr); + mark_inode_dirty(inode); + } + } else + free_data(inode, p, q); +} + +void sysv_truncate (struct inode * inode) +{ + sysv_zone_t *i_data = SYSV_I(inode)->i_data; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + sysv_zone_t nr = 0; + int n; + long iblock; + unsigned blocksize; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + + blocksize = inode->i_sb->s_blocksize; + iblock = (inode->i_size + blocksize-1) + >> inode->i_sb->s_blocksize_bits; + + block_truncate_page(inode->i_mapping, inode->i_size, get_block); + + n = block_to_path(inode, iblock, offsets); + if (n == 0) + return; + + if (n == 1) { + free_data(inode, i_data+offsets[0], i_data + DIRECT); + goto do_indirects; + } + + partial = find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (already detached) */ + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + dirty_indirect(partial->bh, inode); + free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + free_branches(inode, partial->p + 1, block_end(partial->bh), + (chain+n-1) - partial); + dirty_indirect(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees (== subtrees deeper than...) */ + while (n < DEPTH) { + nr = i_data[DIRECT + n - 1]; + if (nr) { + i_data[DIRECT + n - 1] = 0; + mark_inode_dirty(inode); + free_branches(inode, &nr, &nr+1, n); + } + n++; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) + sysv_sync_inode (inode); + else + mark_inode_dirty(inode); +} + +static unsigned sysv_nblocks(struct super_block *s, loff_t size) +{ + struct sysv_sb_info *sbi = SYSV_SB(s); + int ptrs_bits = sbi->s_ind_per_block_bits; + unsigned blocks, res, direct = DIRECT, i = DEPTH; + blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits; + res = blocks; + while (--i && blocks > direct) { + blocks = ((blocks - direct - 1) >> ptrs_bits) + 1; + res += blocks; + direct = 1; + } + return blocks; +} + +int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct super_block *s = dentry->d_sb; + generic_fillattr(d_inode(dentry), stat); + stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); + stat->blksize = s->s_blocksize; + return 0; +} + +static int sysv_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,get_block,wbc); +} + +static int sysv_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,get_block); +} + +int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, get_block); +} + +static void sysv_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + sysv_truncate(inode); + } +} + +static int sysv_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, get_block); + if (unlikely(ret)) + sysv_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t sysv_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,get_block); +} + +const struct address_space_operations sysv_aops = { + .readpage = sysv_readpage, + .writepage = sysv_writepage, + .write_begin = sysv_write_begin, + .write_end = generic_write_end, + .bmap = sysv_bmap +}; diff --git a/kernel/fs/sysv/namei.c b/kernel/fs/sysv/namei.c new file mode 100644 index 000000000..11e83ed0b --- /dev/null +++ b/kernel/fs/sysv/namei.c @@ -0,0 +1,290 @@ +/* + * linux/fs/sysv/namei.c + * + * minix/namei.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/namei.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/namei.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + */ + +#include +#include "sysv.h" + +static int add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = sysv_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static int sysv_hash(const struct dentry *dentry, struct qstr *qstr) +{ + /* Truncate the name in place, avoids having to define a compare + function. */ + if (qstr->len > SYSV_NAMELEN) { + qstr->len = SYSV_NAMELEN; + qstr->hash = full_name_hash(qstr->name, qstr->len); + } + return 0; +} + +const struct dentry_operations sysv_dentry_operations = { + .d_hash = sysv_hash, +}; + +static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > SYSV_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + ino = sysv_inode_by_name(dentry); + + if (ino) { + inode = sysv_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev) +{ + struct inode * inode; + int err; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + inode = sysv_new_inode(dir, mode); + err = PTR_ERR(inode); + + if (!IS_ERR(inode)) { + sysv_set_inode(inode, rdev); + mark_inode_dirty(inode); + err = add_nondir(dentry, inode); + } + return err; +} + +static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) +{ + return sysv_mknod(dir, dentry, mode, 0); +} + +static int sysv_symlink(struct inode * dir, struct dentry * dentry, + const char * symname) +{ + int err = -ENAMETOOLONG; + int l = strlen(symname)+1; + struct inode * inode; + + if (l > dir->i_sb->s_blocksize) + goto out; + + inode = sysv_new_inode(dir, S_IFLNK|0777); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + sysv_set_inode(inode, 0); + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + mark_inode_dirty(inode); + err = add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int sysv_link(struct dentry * old_dentry, struct inode * dir, + struct dentry * dentry) +{ + struct inode *inode = d_inode(old_dentry); + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + return add_nondir(dentry, inode); +} + +static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) +{ + struct inode * inode; + int err; + + inode_inc_link_count(dir); + + inode = sysv_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + sysv_set_inode(inode, 0); + + inode_inc_link_count(inode); + + err = sysv_make_empty(inode, dir); + if (err) + goto out_fail; + + err = sysv_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int sysv_unlink(struct inode * dir, struct dentry * dentry) +{ + struct inode * inode = d_inode(dentry); + struct page * page; + struct sysv_dir_entry * de; + int err = -ENOENT; + + de = sysv_find_entry(dentry, &page); + if (!de) + goto out; + + err = sysv_delete_entry (de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); +out: + return err; +} + +static int sysv_rmdir(struct inode * dir, struct dentry * dentry) +{ + struct inode *inode = d_inode(dentry); + int err = -ENOTEMPTY; + + if (sysv_empty_dir(inode)) { + err = sysv_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, + struct inode * new_dir, struct dentry * new_dentry) +{ + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); + struct page * dir_page = NULL; + struct sysv_dir_entry * dir_de = NULL; + struct page * old_page; + struct sysv_dir_entry * old_de; + int err = -ENOENT; + + old_de = sysv_find_entry(old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = sysv_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page * new_page; + struct sysv_dir_entry * new_de; + + err = -ENOTEMPTY; + if (dir_de && !sysv_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = sysv_find_entry(new_dentry, &new_page); + if (!new_de) + goto out_dir; + sysv_set_link(new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + err = sysv_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + sysv_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + sysv_set_link(dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations sysv_dir_inode_operations = { + .create = sysv_create, + .lookup = sysv_lookup, + .link = sysv_link, + .unlink = sysv_unlink, + .symlink = sysv_symlink, + .mkdir = sysv_mkdir, + .rmdir = sysv_rmdir, + .mknod = sysv_mknod, + .rename = sysv_rename, + .getattr = sysv_getattr, +}; diff --git a/kernel/fs/sysv/super.c b/kernel/fs/sysv/super.c new file mode 100644 index 000000000..eda109597 --- /dev/null +++ b/kernel/fs/sysv/super.c @@ -0,0 +1,593 @@ +/* + * linux/fs/sysv/inode.c + * + * minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * xenix/inode.c + * Copyright (C) 1992 Doug Evans + * + * coh/inode.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/inode.c + * Copyright (C) 1993 Paul B. Monday + * + * sysv/inode.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + * + * This file contains code for read/parsing the superblock. + */ + +#include +#include +#include +#include +#include "sysv.h" + +/* + * The following functions try to recognize specific filesystems. + * + * We recognize: + * - Xenix FS by its magic number. + * - SystemV FS by its magic number. + * - Coherent FS by its funny fname/fpack field. + * - SCO AFS by s_nfree == 0xffff + * - V7 FS has no distinguishing features. + * + * We discriminate among SystemV4 and SystemV2 FS by the assumption that + * the time stamp is not < 01-01-1980. + */ + +enum { + JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60 +}; + +static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links) +{ + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + struct xenix_super_block * sbd1; + struct xenix_super_block * sbd2; + + if (bh1 != bh2) + sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data; + else { + /* block size = 512, so bh1 != bh2 */ + sbd1 = (struct xenix_super_block *) bh1->b_data; + sbd2 = (struct xenix_super_block *) (bh2->b_data - 512); + } + + *max_links = XENIX_LINK_MAX; + sbi->s_fic_size = XENIX_NICINOD; + sbi->s_flc_size = XENIX_NICFREE; + sbi->s_sbd1 = (char *)sbd1; + sbi->s_sbd2 = (char *)sbd2; + sbi->s_sb_fic_count = &sbd1->s_ninode; + sbi->s_sb_fic_inodes = &sbd1->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd2->s_tinode; + sbi->s_bcache_count = &sbd1->s_nfree; + sbi->s_bcache = &sbd1->s_free[0]; + sbi->s_free_blocks = &sbd2->s_tfree; + sbi->s_sb_time = &sbd2->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize); +} + +static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links) +{ + struct sysv4_super_block * sbd; + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + + if (bh1 == bh2) + sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2); + else + sbd = (struct sysv4_super_block *) bh2->b_data; + + *max_links = SYSV_LINK_MAX; + sbi->s_fic_size = SYSV_NICINOD; + sbi->s_flc_size = SYSV_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_sb_state = &sbd->s_state; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links) +{ + struct sysv2_super_block *sbd; + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + + if (bh1 == bh2) + sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2); + else + sbd = (struct sysv2_super_block *) bh2->b_data; + + *max_links = SYSV_LINK_MAX; + sbi->s_fic_size = SYSV_NICINOD; + sbi->s_flc_size = SYSV_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_sb_state = &sbd->s_state; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links) +{ + struct coh_super_block * sbd; + struct buffer_head *bh1 = sbi->s_bh1; + + sbd = (struct coh_super_block *) bh1->b_data; + + *max_links = COH_LINK_MAX; + sbi->s_fic_size = COH_NICINOD; + sbi->s_flc_size = COH_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links) +{ + struct buffer_head *bh2 = sbi->s_bh2; + struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; + + *max_links = V7_LINK_MAX; + sbi->s_fic_size = V7_NICINOD; + sbi->s_flc_size = V7_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data; + if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544)) + sbi->s_bytesex = BYTESEX_LE; + else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544)) + sbi->s_bytesex = BYTESEX_BE; + else + return 0; + switch (fs32_to_cpu(sbi, sbd->s_type)) { + case 1: + sbi->s_type = FSTYPE_XENIX; + return 1; + case 2: + sbi->s_type = FSTYPE_XENIX; + return 2; + default: + return 0; + } +} + +static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct super_block *sb = sbi->s_sb; + /* All relevant fields are at the same offsets in R2 and R4 */ + struct sysv4_super_block * sbd; + u32 type; + + sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2); + if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20)) + sbi->s_bytesex = BYTESEX_LE; + else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20)) + sbi->s_bytesex = BYTESEX_BE; + else + return 0; + + type = fs32_to_cpu(sbi, sbd->s_type); + + if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { + sbi->s_type = FSTYPE_AFS; + sbi->s_forced_ro = 1; + if (!(sb->s_flags & MS_RDONLY)) { + printk("SysV FS: SCO EAFS on %s detected, " + "forcing read-only mode.\n", + sb->s_id); + } + return type; + } + + if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) { + /* this is likely to happen on SystemV2 FS */ + if (type > 3 || type < 1) + return 0; + sbi->s_type = FSTYPE_SYSV2; + return type; + } + if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10)) + return 0; + + /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10, + 0x20 or 0x30 indicates that symbolic links and the 14-character + filename limit is gone. Due to lack of information about this + feature read-only mode seems to be a reasonable approach... -KGB */ + + if (type >= 0x10) { + printk("SysV FS: can't handle long file names on %s, " + "forcing read-only mode.\n", sb->s_id); + sbi->s_forced_ro = 1; + } + + sbi->s_type = FSTYPE_SYSV4; + return type >= 0x10 ? type >> 4 : type; +} + +static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct coh_super_block * sbd; + + sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2); + if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6)) + || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6))) + return 0; + sbi->s_bytesex = BYTESEX_PDP; + sbi->s_type = FSTYPE_COH; + return 1; +} + +static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + int size = detect_sysv(sbi, bh); + + return size>2 ? 0 : size; +} + +static struct { + int block; + int (*test)(struct sysv_sb_info *, struct buffer_head *); +} flavours[] = { + {1, detect_xenix}, + {0, detect_sysv}, + {0, detect_coherent}, + {9, detect_sysv_odd}, + {15,detect_sysv_odd}, + {18,detect_sysv}, +}; + +static char *flavour_names[] = { + [FSTYPE_XENIX] = "Xenix", + [FSTYPE_SYSV4] = "SystemV", + [FSTYPE_SYSV2] = "SystemV Release 2", + [FSTYPE_COH] = "Coherent", + [FSTYPE_V7] = "V7", + [FSTYPE_AFS] = "AFS", +}; + +static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = { + [FSTYPE_XENIX] = detected_xenix, + [FSTYPE_SYSV4] = detected_sysv4, + [FSTYPE_SYSV2] = detected_sysv2, + [FSTYPE_COH] = detected_coherent, + [FSTYPE_V7] = detected_v7, + [FSTYPE_AFS] = detected_sysv4, +}; + +static int complete_read_super(struct super_block *sb, int silent, int size) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct inode *root_inode; + char *found = flavour_names[sbi->s_type]; + u_char n_bits = size+8; + int bsize = 1 << n_bits; + int bsize_4 = bsize >> 2; + + sbi->s_firstinodezone = 2; + + flavour_setup[sbi->s_type](sbi, &sb->s_max_links); + + sbi->s_truncate = 1; + sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; + sbi->s_inodes_per_block = bsize >> 6; + sbi->s_inodes_per_block_1 = (bsize >> 6)-1; + sbi->s_inodes_per_block_bits = n_bits-6; + sbi->s_ind_per_block = bsize_4; + sbi->s_ind_per_block_2 = bsize_4*bsize_4; + sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); + sbi->s_ind_per_block_bits = n_bits-2; + + sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone) + << sbi->s_inodes_per_block_bits; + + if (!silent) + printk("VFS: Found a %s FS (block size = %ld) on device %s\n", + found, sb->s_blocksize, sb->s_id); + + sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; + /* set up enough so that it can read an inode */ + sb->s_op = &sysv_sops; + if (sbi->s_forced_ro) + sb->s_flags |= MS_RDONLY; + if (sbi->s_truncate) + sb->s_d_op = &sysv_dentry_operations; + root_inode = sysv_iget(sb, SYSV_ROOT_INO); + if (IS_ERR(root_inode)) { + printk("SysV FS: get root inode failed\n"); + return 0; + } + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) { + printk("SysV FS: get root dentry failed\n"); + return 0; + } + return 1; +} + +static int sysv_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh1, *bh = NULL; + struct sysv_sb_info *sbi; + unsigned long blocknr; + int size = 0, i; + + BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); + BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); + BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block)); + BUILD_BUG_ON(500 != sizeof (struct coh_super_block)); + BUILD_BUG_ON(64 != sizeof (struct sysv_inode)); + + sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_sb = sb; + sbi->s_block_base = 0; + mutex_init(&sbi->s_lock); + sb->s_fs_info = sbi; + + sb_set_blocksize(sb, BLOCK_SIZE); + + for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { + brelse(bh); + bh = sb_bread(sb, flavours[i].block); + if (!bh) + continue; + size = flavours[i].test(SYSV_SB(sb), bh); + } + + if (!size) + goto Eunknown; + + switch (size) { + case 1: + blocknr = bh->b_blocknr << 1; + brelse(bh); + sb_set_blocksize(sb, 512); + bh1 = sb_bread(sb, blocknr); + bh = sb_bread(sb, blocknr + 1); + break; + case 2: + bh1 = bh; + break; + case 3: + blocknr = bh->b_blocknr >> 1; + brelse(bh); + sb_set_blocksize(sb, 2048); + bh1 = bh = sb_bread(sb, blocknr); + break; + default: + goto Ebadsize; + } + + if (bh && bh1) { + sbi->s_bh1 = bh1; + sbi->s_bh2 = bh; + if (complete_read_super(sb, silent, size)) + return 0; + } + + brelse(bh1); + brelse(bh); + sb_set_blocksize(sb, BLOCK_SIZE); + printk("oldfs: cannot read superblock\n"); +failed: + kfree(sbi); + return -EINVAL; + +Eunknown: + brelse(bh); + if (!silent) + printk("VFS: unable to find oldfs superblock on device %s\n", + sb->s_id); + goto failed; +Ebadsize: + brelse(bh); + if (!silent) + printk("VFS: oldfs: unsupported block size (%dKb)\n", + 1<<(size-2)); + goto failed; +} + +static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) +{ + struct v7_super_block *v7sb; + struct sysv_inode *v7i; + struct buffer_head *bh2; + struct sysv_sb_info *sbi; + + sbi = sb->s_fs_info; + + /* plausibility check on superblock */ + v7sb = (struct v7_super_block *) bh->b_data; + if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || + fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || + fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE) + return 0; + + /* plausibility check on root inode: it is a directory, + with a nonzero size that is a multiple of 16 */ + bh2 = sb_bread(sb, 2); + if (bh2 == NULL) + return 0; + + v7i = (struct sysv_inode *)(bh2->b_data + 64); + if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR || + (fs32_to_cpu(sbi, v7i->i_size) == 0) || + (fs32_to_cpu(sbi, v7i->i_size) & 017) || + (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES * + sizeof(struct sysv_dir_entry))) { + brelse(bh2); + return 0; + } + + brelse(bh2); + return 1; +} + +static int v7_fill_super(struct super_block *sb, void *data, int silent) +{ + struct sysv_sb_info *sbi; + struct buffer_head *bh; + + if (440 != sizeof (struct v7_super_block)) + panic("V7 FS: bad super-block size"); + if (64 != sizeof (struct sysv_inode)) + panic("sysv fs: bad i-node size"); + + sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_sb = sb; + sbi->s_block_base = 0; + sbi->s_type = FSTYPE_V7; + mutex_init(&sbi->s_lock); + sb->s_fs_info = sbi; + + sb_set_blocksize(sb, 512); + + if ((bh = sb_bread(sb, 1)) == NULL) { + if (!silent) + printk("VFS: unable to read V7 FS superblock on " + "device %s.\n", sb->s_id); + goto failed; + } + + /* Try PDP-11 UNIX */ + sbi->s_bytesex = BYTESEX_PDP; + if (v7_sanity_check(sb, bh)) + goto detected; + + /* Try PC/IX, v7/x86 */ + sbi->s_bytesex = BYTESEX_LE; + if (v7_sanity_check(sb, bh)) + goto detected; + + goto failed; + +detected: + sbi->s_bh1 = bh; + sbi->s_bh2 = bh; + if (complete_read_super(sb, silent, 1)) + return 0; + +failed: + printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n", + sb->s_id); + brelse(bh); + kfree(sbi); + return -EINVAL; +} + +/* Every kernel module contains stuff like this. */ + +static struct dentry *sysv_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); +} + +static struct dentry *v7_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); +} + +static struct file_system_type sysv_fs_type = { + .owner = THIS_MODULE, + .name = "sysv", + .mount = sysv_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("sysv"); + +static struct file_system_type v7_fs_type = { + .owner = THIS_MODULE, + .name = "v7", + .mount = v7_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("v7"); +MODULE_ALIAS("v7"); + +static int __init init_sysv_fs(void) +{ + int error; + + error = sysv_init_icache(); + if (error) + goto out; + error = register_filesystem(&sysv_fs_type); + if (error) + goto destroy_icache; + error = register_filesystem(&v7_fs_type); + if (error) + goto unregister; + return 0; + +unregister: + unregister_filesystem(&sysv_fs_type); +destroy_icache: + sysv_destroy_icache(); +out: + return error; +} + +static void __exit exit_sysv_fs(void) +{ + unregister_filesystem(&sysv_fs_type); + unregister_filesystem(&v7_fs_type); + sysv_destroy_icache(); +} + +module_init(init_sysv_fs) +module_exit(exit_sysv_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/sysv/symlink.c b/kernel/fs/sysv/symlink.c new file mode 100644 index 000000000..d3fa0d703 --- /dev/null +++ b/kernel/fs/sysv/symlink.c @@ -0,0 +1,20 @@ +/* + * linux/fs/sysv/symlink.c + * + * Handling of System V filesystem fast symlinks extensions. + * Aug 2001, Christoph Hellwig (hch@infradead.org) + */ + +#include "sysv.h" +#include + +static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, (char *)SYSV_I(d_inode(dentry))->i_data); + return NULL; +} + +const struct inode_operations sysv_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = sysv_follow_link, +}; diff --git a/kernel/fs/sysv/sysv.h b/kernel/fs/sysv/sysv.h new file mode 100644 index 000000000..69d488986 --- /dev/null +++ b/kernel/fs/sysv/sysv.h @@ -0,0 +1,247 @@ +#ifndef _SYSV_H +#define _SYSV_H + +#include + +typedef __u16 __bitwise __fs16; +typedef __u32 __bitwise __fs32; + +#include + +/* + * SystemV/V7/Coherent super-block data in memory + * + * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified + * while the system is running). This is in contrast to the Minix and Berkeley + * filesystems (where the superblock is never modified). This affects the + * sync() operation: we must keep the superblock in a disk buffer and use this + * one as our "working copy". + */ + +struct sysv_sb_info { + struct super_block *s_sb; /* VFS superblock */ + int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */ + char s_bytesex; /* bytesex (le/be/pdp) */ + char s_truncate; /* if 1: names > SYSV_NAMELEN chars are truncated */ + /* if 0: they are disallowed (ENAMETOOLONG) */ + unsigned int s_inodes_per_block; /* number of inodes per block */ + unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */ + unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */ + unsigned int s_ind_per_block; /* number of indirections per block */ + unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */ + unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */ + unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */ + unsigned int s_block_base; /* physical block number of block 0 */ + unsigned short s_fic_size; /* free inode cache size, NICINOD */ + unsigned short s_flc_size; /* free block list chunk size, NICFREE */ + /* The superblock is kept in one or two disk buffers: */ + struct buffer_head *s_bh1; + struct buffer_head *s_bh2; + /* These are pointers into the disk buffer, to compensate for + different superblock layout. */ + char * s_sbd1; /* entire superblock data, for part 1 */ + char * s_sbd2; /* entire superblock data, for part 2 */ + __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */ + sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */ + __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */ + __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */ + sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */ + __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */ + __fs32 *s_sb_time; /* pointer to s_sbd->s_time */ + __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */ + /* We keep those superblock entities that don't change here; + this saves us an indirection and perhaps a conversion. */ + u32 s_firstinodezone; /* index of first inode zone */ + u32 s_firstdatazone; /* same as s_sbd->s_isize */ + u32 s_ninodes; /* total number of inodes */ + u32 s_ndatazones; /* total number of data zones */ + u32 s_nzones; /* same as s_sbd->s_fsize */ + u16 s_namelen; /* max length of dir entry */ + int s_forced_ro; + struct mutex s_lock; +}; + +/* + * SystemV/V7/Coherent FS inode data in memory + */ +struct sysv_inode_info { + __fs32 i_data[13]; + u32 i_dir_start_lookup; + struct inode vfs_inode; +}; + + +static inline struct sysv_inode_info *SYSV_I(struct inode *inode) +{ + return list_entry(inode, struct sysv_inode_info, vfs_inode); +} + +static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + + +/* identify the FS in memory */ +enum { + FSTYPE_NONE = 0, + FSTYPE_XENIX, + FSTYPE_SYSV4, + FSTYPE_SYSV2, + FSTYPE_COH, + FSTYPE_V7, + FSTYPE_AFS, + FSTYPE_END, +}; + +#define SYSV_MAGIC_BASE 0x012FF7B3 + +#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX) +#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4) +#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2) +#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH) + + +/* Admissible values for i_nlink: 0.._LINK_MAX */ +enum { + XENIX_LINK_MAX = 126, /* ?? */ + SYSV_LINK_MAX = 126, /* 127? 251? */ + V7_LINK_MAX = 126, /* ?? */ + COH_LINK_MAX = 10000, +}; + + +static inline void dirty_sb(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + mark_buffer_dirty(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + mark_buffer_dirty(sbi->s_bh2); +} + + +/* ialloc.c */ +extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned, + struct buffer_head **); +extern struct inode * sysv_new_inode(const struct inode *, umode_t); +extern void sysv_free_inode(struct inode *); +extern unsigned long sysv_count_free_inodes(struct super_block *); + +/* balloc.c */ +extern sysv_zone_t sysv_new_block(struct super_block *); +extern void sysv_free_block(struct super_block *, sysv_zone_t); +extern unsigned long sysv_count_free_blocks(struct super_block *); + +/* itree.c */ +extern void sysv_truncate(struct inode *); +extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +/* inode.c */ +extern struct inode *sysv_iget(struct super_block *, unsigned int); +extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); +extern int sysv_sync_inode(struct inode *); +extern void sysv_set_inode(struct inode *, dev_t); +extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int sysv_init_icache(void); +extern void sysv_destroy_icache(void); + + +/* dir.c */ +extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); +extern int sysv_add_link(struct dentry *, struct inode *); +extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *); +extern int sysv_make_empty(struct inode *, struct inode *); +extern int sysv_empty_dir(struct inode *); +extern void sysv_set_link(struct sysv_dir_entry *, struct page *, + struct inode *); +extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); +extern ino_t sysv_inode_by_name(struct dentry *); + + +extern const struct inode_operations sysv_file_inode_operations; +extern const struct inode_operations sysv_dir_inode_operations; +extern const struct inode_operations sysv_fast_symlink_inode_operations; +extern const struct file_operations sysv_file_operations; +extern const struct file_operations sysv_dir_operations; +extern const struct address_space_operations sysv_aops; +extern const struct super_operations sysv_sops; +extern const struct dentry_operations sysv_dentry_operations; + + +enum { + BYTESEX_LE, + BYTESEX_PDP, + BYTESEX_BE, +}; + +static inline u32 PDP_swab(u32 x) +{ +#ifdef __LITTLE_ENDIAN + return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16); +#else +#ifdef __BIG_ENDIAN + return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8); +#else +#error BYTESEX +#endif +#endif +} + +static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + return PDP_swab((__force __u32)n); + else if (sbi->s_bytesex == BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + return (__force __fs32)PDP_swab(n); + else if (sbi->s_bytesex == BYTESEX_LE) + return (__force __fs32)cpu_to_le32(n); + else + return (__force __fs32)cpu_to_be32(n); +} + +static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d); + else if (sbi->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, d); + else + be32_add_cpu((__be32 *)n, d); + return *n; +} + +static inline __u16 fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n) +{ + if (sbi->s_bytesex != BYTESEX_BE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n) +{ + if (sbi->s_bytesex != BYTESEX_BE) + return (__force __fs16)cpu_to_le16(n); + else + return (__force __fs16)cpu_to_be16(n); +} + +static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d) +{ + if (sbi->s_bytesex != BYTESEX_BE) + le16_add_cpu((__le16 *)n, d); + else + be16_add_cpu((__be16 *)n, d); + return *n; +} + +#endif /* _SYSV_H */ diff --git a/kernel/fs/timerfd.c b/kernel/fs/timerfd.c new file mode 100644 index 000000000..64fb86066 --- /dev/null +++ b/kernel/fs/timerfd.c @@ -0,0 +1,571 @@ +/* + * fs/timerfd.c + * + * Copyright (C) 2007 Davide Libenzi + * + * + * Thanks to Thomas Gleixner for code reviews and useful comments. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct timerfd_ctx { + union { + struct hrtimer tmr; + struct alarm alarm; + } t; + ktime_t tintv; + ktime_t moffs; + wait_queue_head_t wqh; + u64 ticks; + int clockid; + short unsigned expired; + short unsigned settime_flags; /* to show in fdinfo */ + struct rcu_head rcu; + struct list_head clist; + bool might_cancel; +}; + +static LIST_HEAD(cancel_list); +static DEFINE_SPINLOCK(cancel_lock); + +static inline bool isalarm(struct timerfd_ctx *ctx) +{ + return ctx->clockid == CLOCK_REALTIME_ALARM || + ctx->clockid == CLOCK_BOOTTIME_ALARM; +} + +/* + * This gets called when the timer event triggers. We set the "expired" + * flag, but we do not re-arm the timer (in case it's necessary, + * tintv.tv64 != 0) until the timer is accessed. + */ +static void timerfd_triggered(struct timerfd_ctx *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + ctx->expired = 1; + ctx->ticks++; + wake_up_locked(&ctx->wqh); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); +} + +static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +{ + struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, + t.tmr); + timerfd_triggered(ctx); + return HRTIMER_NORESTART; +} + +static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, + ktime_t now) +{ + struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, + t.alarm); + timerfd_triggered(ctx); + return ALARMTIMER_NORESTART; +} + +/* + * Called when the clock was set to cancel the timers in the cancel + * list. This will wake up processes waiting on these timers. The + * wake-up requires ctx->ticks to be non zero, therefore we increment + * it before calling wake_up_locked(). + */ +void timerfd_clock_was_set(void) +{ + ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + struct timerfd_ctx *ctx; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &cancel_list, clist) { + if (!ctx->might_cancel) + continue; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->moffs.tv64 != moffs.tv64) { + ctx->moffs.tv64 = KTIME_MAX; + ctx->ticks++; + wake_up_locked(&ctx->wqh); + } + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + } + rcu_read_unlock(); +} + +static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +{ + if (ctx->might_cancel) { + ctx->might_cancel = false; + spin_lock(&cancel_lock); + list_del_rcu(&ctx->clist); + spin_unlock(&cancel_lock); + } +} + +static bool timerfd_canceled(struct timerfd_ctx *ctx) +{ + if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) + return false; + ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + return true; +} + +static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) +{ + if ((ctx->clockid == CLOCK_REALTIME || + ctx->clockid == CLOCK_REALTIME_ALARM) && + (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { + if (!ctx->might_cancel) { + ctx->might_cancel = true; + spin_lock(&cancel_lock); + list_add_rcu(&ctx->clist, &cancel_list); + spin_unlock(&cancel_lock); + } + } else if (ctx->might_cancel) { + timerfd_remove_cancel(ctx); + } +} + +static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) +{ + ktime_t remaining; + + if (isalarm(ctx)) + remaining = alarm_expires_remaining(&ctx->t.alarm); + else + remaining = hrtimer_expires_remaining(&ctx->t.tmr); + + return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; +} + +static int timerfd_setup(struct timerfd_ctx *ctx, int flags, + const struct itimerspec *ktmr) +{ + enum hrtimer_mode htmode; + ktime_t texp; + int clockid = ctx->clockid; + + htmode = (flags & TFD_TIMER_ABSTIME) ? + HRTIMER_MODE_ABS: HRTIMER_MODE_REL; + + texp = timespec_to_ktime(ktmr->it_value); + ctx->expired = 0; + ctx->ticks = 0; + ctx->tintv = timespec_to_ktime(ktmr->it_interval); + + if (isalarm(ctx)) { + alarm_init(&ctx->t.alarm, + ctx->clockid == CLOCK_REALTIME_ALARM ? + ALARM_REALTIME : ALARM_BOOTTIME, + timerfd_alarmproc); + } else { + hrtimer_init(&ctx->t.tmr, clockid, htmode); + hrtimer_set_expires(&ctx->t.tmr, texp); + ctx->t.tmr.function = timerfd_tmrproc; + } + + if (texp.tv64 != 0) { + if (isalarm(ctx)) { + if (flags & TFD_TIMER_ABSTIME) + alarm_start(&ctx->t.alarm, texp); + else + alarm_start_relative(&ctx->t.alarm, texp); + } else { + hrtimer_start(&ctx->t.tmr, texp, htmode); + } + + if (timerfd_canceled(ctx)) + return -ECANCELED; + } + + ctx->settime_flags = flags & TFD_SETTIME_FLAGS; + return 0; +} + +static int timerfd_release(struct inode *inode, struct file *file) +{ + struct timerfd_ctx *ctx = file->private_data; + + timerfd_remove_cancel(ctx); + + if (isalarm(ctx)) + alarm_cancel(&ctx->t.alarm); + else + hrtimer_cancel(&ctx->t.tmr); + kfree_rcu(ctx, rcu); + return 0; +} + +static unsigned int timerfd_poll(struct file *file, poll_table *wait) +{ + struct timerfd_ctx *ctx = file->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(file, &ctx->wqh, wait); + + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->ticks) + events |= POLLIN; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return events; +} + +static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct timerfd_ctx *ctx = file->private_data; + ssize_t res; + u64 ticks = 0; + + if (count < sizeof(ticks)) + return -EINVAL; + spin_lock_irq(&ctx->wqh.lock); + if (file->f_flags & O_NONBLOCK) + res = -EAGAIN; + else + res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); + + /* + * If clock has changed, we do not care about the + * ticks and we do not rearm the timer. Userspace must + * reevaluate anyway. + */ + if (timerfd_canceled(ctx)) { + ctx->ticks = 0; + ctx->expired = 0; + res = -ECANCELED; + } + + if (ctx->ticks) { + ticks = ctx->ticks; + + if (ctx->expired && ctx->tintv.tv64) { + /* + * If tintv.tv64 != 0, this is a periodic timer that + * needs to be re-armed. We avoid doing it in the timer + * callback to avoid DoS attacks specifying a very + * short timer period. + */ + if (isalarm(ctx)) { + ticks += alarm_forward_now( + &ctx->t.alarm, ctx->tintv) - 1; + alarm_restart(&ctx->t.alarm); + } else { + ticks += hrtimer_forward_now(&ctx->t.tmr, + ctx->tintv) - 1; + hrtimer_restart(&ctx->t.tmr); + } + } + ctx->expired = 0; + ctx->ticks = 0; + } + spin_unlock_irq(&ctx->wqh.lock); + if (ticks) + res = put_user(ticks, (u64 __user *) buf) ? -EFAULT: sizeof(ticks); + return res; +} + +#ifdef CONFIG_PROC_FS +static void timerfd_show(struct seq_file *m, struct file *file) +{ + struct timerfd_ctx *ctx = file->private_data; + struct itimerspec t; + + spin_lock_irq(&ctx->wqh.lock); + t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + t.it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + + seq_printf(m, + "clockid: %d\n" + "ticks: %llu\n" + "settime flags: 0%o\n" + "it_value: (%llu, %llu)\n" + "it_interval: (%llu, %llu)\n", + ctx->clockid, + (unsigned long long)ctx->ticks, + ctx->settime_flags, + (unsigned long long)t.it_value.tv_sec, + (unsigned long long)t.it_value.tv_nsec, + (unsigned long long)t.it_interval.tv_sec, + (unsigned long long)t.it_interval.tv_nsec); +} +#else +#define timerfd_show NULL +#endif + +#ifdef CONFIG_CHECKPOINT_RESTORE +static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct timerfd_ctx *ctx = file->private_data; + int ret = 0; + + switch (cmd) { + case TFD_IOC_SET_TICKS: { + u64 ticks; + + if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks))) + return -EFAULT; + if (!ticks) + return -EINVAL; + + spin_lock_irq(&ctx->wqh.lock); + if (!timerfd_canceled(ctx)) { + ctx->ticks = ticks; + wake_up_locked(&ctx->wqh); + } else + ret = -ECANCELED; + spin_unlock_irq(&ctx->wqh.lock); + break; + } + default: + ret = -ENOTTY; + break; + } + + return ret; +} +#else +#define timerfd_ioctl NULL +#endif + +static const struct file_operations timerfd_fops = { + .release = timerfd_release, + .poll = timerfd_poll, + .read = timerfd_read, + .llseek = noop_llseek, + .show_fdinfo = timerfd_show, + .unlocked_ioctl = timerfd_ioctl, +}; + +static int timerfd_fget(int fd, struct fd *p) +{ + struct fd f = fdget(fd); + if (!f.file) + return -EBADF; + if (f.file->f_op != &timerfd_fops) { + fdput(f); + return -EINVAL; + } + *p = f; + return 0; +} + +SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) +{ + int ufd; + struct timerfd_ctx *ctx; + + /* Check the TFD_* constants for consistency. */ + BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); + + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_REALTIME_ALARM && + clockid != CLOCK_BOOTTIME && + clockid != CLOCK_BOOTTIME_ALARM)) + return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + init_waitqueue_head(&ctx->wqh); + ctx->clockid = clockid; + + if (isalarm(ctx)) + alarm_init(&ctx->t.alarm, + ctx->clockid == CLOCK_REALTIME_ALARM ? + ALARM_REALTIME : ALARM_BOOTTIME, + timerfd_alarmproc); + else + hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); + + ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); + if (ufd < 0) + kfree(ctx); + + return ufd; +} + +static int do_timerfd_settime(int ufd, int flags, + const struct itimerspec *new, + struct itimerspec *old) +{ + struct fd f; + struct timerfd_ctx *ctx; + int ret; + + if ((flags & ~TFD_SETTIME_FLAGS) || + !timespec_valid(&new->it_value) || + !timespec_valid(&new->it_interval)) + return -EINVAL; + + ret = timerfd_fget(ufd, &f); + if (ret) + return ret; + ctx = f.file->private_data; + + timerfd_setup_cancel(ctx, flags); + + /* + * We need to stop the existing timer before reprogramming + * it to the new values. + */ + for (;;) { + spin_lock_irq(&ctx->wqh.lock); + + if (isalarm(ctx)) { + if (alarm_try_to_cancel(&ctx->t.alarm) >= 0) + break; + } else { + if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0) + break; + } + spin_unlock_irq(&ctx->wqh.lock); + if (isalarm(ctx)) + hrtimer_wait_for_timer(&ctx->t.alarm.timer); + else + hrtimer_wait_for_timer(&ctx->t.tmr); + } + + /* + * If the timer is expired and it's periodic, we need to advance it + * because the caller may want to know the previous expiration time. + * We do not update "ticks" and "expired" since the timer will be + * re-programmed again in the following timerfd_setup() call. + */ + if (ctx->expired && ctx->tintv.tv64) { + if (isalarm(ctx)) + alarm_forward_now(&ctx->t.alarm, ctx->tintv); + else + hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); + } + + old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + old->it_interval = ktime_to_timespec(ctx->tintv); + + /* + * Re-program the timer to the new value ... + */ + ret = timerfd_setup(ctx, flags, new); + + spin_unlock_irq(&ctx->wqh.lock); + fdput(f); + return ret; +} + +static int do_timerfd_gettime(int ufd, struct itimerspec *t) +{ + struct fd f; + struct timerfd_ctx *ctx; + int ret = timerfd_fget(ufd, &f); + if (ret) + return ret; + ctx = f.file->private_data; + + spin_lock_irq(&ctx->wqh.lock); + if (ctx->expired && ctx->tintv.tv64) { + ctx->expired = 0; + + if (isalarm(ctx)) { + ctx->ticks += + alarm_forward_now( + &ctx->t.alarm, ctx->tintv) - 1; + alarm_restart(&ctx->t.alarm); + } else { + ctx->ticks += + hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) + - 1; + hrtimer_restart(&ctx->t.tmr); + } + } + t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + t->it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + fdput(f); + return 0; +} + +SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct itimerspec __user *, utmr, + struct itimerspec __user *, otmr) +{ + struct itimerspec new, old; + int ret; + + if (copy_from_user(&new, utmr, sizeof(new))) + return -EFAULT; + ret = do_timerfd_settime(ufd, flags, &new, &old); + if (ret) + return ret; + if (otmr && copy_to_user(otmr, &old, sizeof(old))) + return -EFAULT; + + return ret; +} + +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +{ + struct itimerspec kotmr; + int ret = do_timerfd_gettime(ufd, &kotmr); + if (ret) + return ret; + return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct compat_itimerspec __user *, utmr, + struct compat_itimerspec __user *, otmr) +{ + struct itimerspec new, old; + int ret; + + if (get_compat_itimerspec(&new, utmr)) + return -EFAULT; + ret = do_timerfd_settime(ufd, flags, &new, &old); + if (ret) + return ret; + if (otmr && put_compat_itimerspec(otmr, &old)) + return -EFAULT; + return ret; +} + +COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, + struct compat_itimerspec __user *, otmr) +{ + struct itimerspec kotmr; + int ret = do_timerfd_gettime(ufd, &kotmr); + if (ret) + return ret; + return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0; +} +#endif diff --git a/kernel/fs/tracefs/Makefile b/kernel/fs/tracefs/Makefile new file mode 100644 index 000000000..82fa35b65 --- /dev/null +++ b/kernel/fs/tracefs/Makefile @@ -0,0 +1,4 @@ +tracefs-objs := inode.o + +obj-$(CONFIG_TRACING) += tracefs.o + diff --git a/kernel/fs/tracefs/inode.c b/kernel/fs/tracefs/inode.c new file mode 100644 index 000000000..a43df11a1 --- /dev/null +++ b/kernel/fs/tracefs/inode.c @@ -0,0 +1,648 @@ +/* + * inode.c - part of tracefs, a pseudo file system for activating tracing + * + * Based on debugfs by: Greg Kroah-Hartman + * + * Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * tracefs is the file system that is used by the tracing infrastructure. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TRACEFS_DEFAULT_MODE 0700 + +static struct vfsmount *tracefs_mount; +static int tracefs_mount_count; +static bool tracefs_registered; + +static ssize_t default_read_file(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t default_write_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return count; +} + +static const struct file_operations tracefs_file_operations = { + .read = default_read_file, + .write = default_write_file, + .open = simple_open, + .llseek = noop_llseek, +}; + +static struct tracefs_dir_ops { + int (*mkdir)(const char *name); + int (*rmdir)(const char *name); +} tracefs_ops; + +static char *get_dname(struct dentry *dentry) +{ + const char *dname; + char *name; + int len = dentry->d_name.len; + + dname = dentry->d_name.name; + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return NULL; + memcpy(name, dname, len); + name[len] = 0; + return name; +} + +static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The mkdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * mkdir routine to handle races. + */ + mutex_unlock(&inode->i_mutex); + ret = tracefs_ops.mkdir(name); + mutex_lock(&inode->i_mutex); + + kfree(name); + + return ret; +} + +static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The rmdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * rmdir routine to handle races. + * This time we need to unlock not only the parent (inode) but + * also the directory that is being deleted. + */ + mutex_unlock(&inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + + ret = tracefs_ops.rmdir(name); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + kfree(name); + + return ret; +} + +static const struct inode_operations tracefs_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = tracefs_syscall_mkdir, + .rmdir = tracefs_syscall_rmdir, +}; + +static struct inode *tracefs_get_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + } + return inode; +} + +struct tracefs_mount_opts { + kuid_t uid; + kgid_t gid; + umode_t mode; +}; + +enum { + Opt_uid, + Opt_gid, + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct tracefs_fs_info { + struct tracefs_mount_opts mount_opts; +}; + +static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + kuid_t uid; + kgid_t gid; + char *p; + + opts->mode = TRACEFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) + return -EINVAL; + opts->uid = uid; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return -EINVAL; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) + return -EINVAL; + opts->gid = gid; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* + * We might like to report bad mount options here; + * but traditionally tracefs has ignored all mount options + */ + } + } + + return 0; +} + +static int tracefs_apply_options(struct super_block *sb) +{ + struct tracefs_fs_info *fsi = sb->s_fs_info; + struct inode *inode = sb->s_root->d_inode; + struct tracefs_mount_opts *opts = &fsi->mount_opts; + + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; + + return 0; +} + +static int tracefs_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct tracefs_fs_info *fsi = sb->s_fs_info; + + sync_filesystem(sb); + err = tracefs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + tracefs_apply_options(sb); + +fail: + return err; +} + +static int tracefs_show_options(struct seq_file *m, struct dentry *root) +{ + struct tracefs_fs_info *fsi = root->d_sb->s_fs_info; + struct tracefs_mount_opts *opts = &fsi->mount_opts; + + if (!uid_eq(opts->uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); + if (!gid_eq(opts->gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); + if (opts->mode != TRACEFS_DEFAULT_MODE) + seq_printf(m, ",mode=%o", opts->mode); + + return 0; +} + +static const struct super_operations tracefs_super_operations = { + .statfs = simple_statfs, + .remount_fs = tracefs_remount, + .show_options = tracefs_show_options, +}; + +static int trace_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr trace_files[] = {{""}}; + struct tracefs_fs_info *fsi; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; + if (!fsi) { + err = -ENOMEM; + goto fail; + } + + err = tracefs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files); + if (err) + goto fail; + + sb->s_op = &tracefs_super_operations; + + tracefs_apply_options(sb); + + return 0; + +fail: + kfree(fsi); + sb->s_fs_info = NULL; + return err; +} + +static struct dentry *trace_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_single(fs_type, flags, data, trace_fill_super); +} + +static struct file_system_type trace_fs_type = { + .owner = THIS_MODULE, + .name = "tracefs", + .mount = trace_mount, + .kill_sb = kill_litter_super, +}; +MODULE_ALIAS_FS("tracefs"); + +static struct dentry *start_creating(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + int error; + + pr_debug("tracefs: creating file '%s'\n",name); + + error = simple_pin_fs(&trace_fs_type, &tracefs_mount, + &tracefs_mount_count); + if (error) + return ERR_PTR(error); + + /* If the parent is not specified, we create it in the root. + * We need the root dentry to do this, which is in the super + * block. A pointer to that is in the struct vfsmount that we + * have around. + */ + if (!parent) + parent = tracefs_mount->mnt_root; + + mutex_lock(&parent->d_inode->i_mutex); + dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(dentry) && dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + } + if (IS_ERR(dentry)) + mutex_unlock(&parent->d_inode->i_mutex); + return dentry; +} + +static struct dentry *failed_creating(struct dentry *dentry) +{ + mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + dput(dentry); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + return NULL; +} + +static struct dentry *end_creating(struct dentry *dentry) +{ + mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + return dentry; +} + +/** + * tracefs_create_file - create a file in the tracefs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the tracefs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * + * This is the basic "create a file" function for tracefs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the tracefs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the tracefs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If tracefs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *tracefs_create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) +{ + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + BUG_ON(!S_ISREG(mode)); + dentry = start_creating(name, parent); + + if (IS_ERR(dentry)) + return NULL; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = mode; + inode->i_fop = fops ? fops : &tracefs_file_operations; + inode->i_private = data; + d_instantiate(dentry, inode); + fsnotify_create(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); +} + +static struct dentry *__create_dir(const char *name, struct dentry *parent, + const struct inode_operations *ops) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = ops; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); +} + +/** + * tracefs_create_dir - create a directory in the tracefs filesystem + * @name: a pointer to a string containing the name of the directory to + * create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * directory will be created in the root of the tracefs filesystem. + * + * This function creates a directory in tracefs with the given name. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the tracefs_remove() function when the file is + * to be removed. If an error occurs, %NULL will be returned. + * + * If tracing is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) +{ + return __create_dir(name, parent, &simple_dir_inode_operations); +} + +/** + * tracefs_create_instance_dir - create the tracing instances directory + * @name: The name of the instances directory to create + * @parent: The parent directory that the instances directory will exist + * @mkdir: The function to call when a mkdir is performed. + * @rmdir: The function to call when a rmdir is performed. + * + * Only one instances directory is allowed. + * + * The instances directory is special as it allows for mkdir and rmdir to + * to be done by userspace. When a mkdir or rmdir is performed, the inode + * locks are released and the methhods passed in (@mkdir and @rmdir) are + * called without locks and with the name of the directory being created + * within the instances directory. + * + * Returns the dentry of the instances directory. + */ +struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, + int (*mkdir)(const char *name), + int (*rmdir)(const char *name)) +{ + struct dentry *dentry; + + /* Only allow one instance of the instances directory. */ + if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir)) + return NULL; + + dentry = __create_dir(name, parent, &tracefs_dir_inode_operations); + if (!dentry) + return NULL; + + tracefs_ops.mkdir = mkdir; + tracefs_ops.rmdir = rmdir; + + return dentry; +} + +static inline int tracefs_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) +{ + int ret = 0; + + if (tracefs_positive(dentry)) { + if (dentry->d_inode) { + dget(dentry); + switch (dentry->d_inode->i_mode & S_IFMT) { + case S_IFDIR: + ret = simple_rmdir(parent->d_inode, dentry); + break; + default: + simple_unlink(parent->d_inode, dentry); + break; + } + if (!ret) + d_delete(dentry); + dput(dentry); + } + } + return ret; +} + +/** + * tracefs_remove - removes a file or directory from the tracefs filesystem + * @dentry: a pointer to a the dentry of the file or directory to be + * removed. + * + * This function removes a file or directory in tracefs that was previously + * created with a call to another tracefs function (like + * tracefs_create_file() or variants thereof.) + */ +void tracefs_remove(struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + if (IS_ERR_OR_NULL(dentry)) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + mutex_lock(&parent->d_inode->i_mutex); + ret = __tracefs_remove(dentry, parent); + mutex_unlock(&parent->d_inode->i_mutex); + if (!ret) + simple_release_fs(&tracefs_mount, &tracefs_mount_count); +} + +/** + * tracefs_remove_recursive - recursively removes a directory + * @dentry: a pointer to a the dentry of the directory to be removed. + * + * This function recursively removes a directory tree in tracefs that + * was previously created with a call to another tracefs function + * (like tracefs_create_file() or variants thereof.) + */ +void tracefs_remove_recursive(struct dentry *dentry) +{ + struct dentry *child, *parent; + + if (IS_ERR_OR_NULL(dentry)) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + parent = dentry; + down: + mutex_lock(&parent->d_inode->i_mutex); + loop: + /* + * The parent->d_subdirs is protected by the d_lock. Outside that + * lock, the child can be unlinked and set to be freed which can + * use the d_u.d_child as the rcu head and corrupt this list. + */ + spin_lock(&parent->d_lock); + list_for_each_entry(child, &parent->d_subdirs, d_child) { + if (!tracefs_positive(child)) + continue; + + /* perhaps simple_empty(child) makes more sense */ + if (!list_empty(&child->d_subdirs)) { + spin_unlock(&parent->d_lock); + mutex_unlock(&parent->d_inode->i_mutex); + parent = child; + goto down; + } + + spin_unlock(&parent->d_lock); + + if (!__tracefs_remove(child, parent)) + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + + /* + * The parent->d_lock protects agaist child from unlinking + * from d_subdirs. When releasing the parent->d_lock we can + * no longer trust that the next pointer is valid. + * Restart the loop. We'll skip this one with the + * tracefs_positive() check. + */ + goto loop; + } + spin_unlock(&parent->d_lock); + + mutex_unlock(&parent->d_inode->i_mutex); + child = parent; + parent = parent->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + + if (child != dentry) + /* go up */ + goto loop; + + if (!__tracefs_remove(child, parent)) + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + mutex_unlock(&parent->d_inode->i_mutex); +} + +/** + * tracefs_initialized - Tells whether tracefs has been registered + */ +bool tracefs_initialized(void) +{ + return tracefs_registered; +} + +static int __init tracefs_init(void) +{ + int retval; + + retval = sysfs_create_mount_point(kernel_kobj, "tracing"); + if (retval) + return -EINVAL; + + retval = register_filesystem(&trace_fs_type); + if (!retval) + tracefs_registered = true; + + return retval; +} +core_initcall(tracefs_init); diff --git a/kernel/fs/ubifs/Kconfig b/kernel/fs/ubifs/Kconfig new file mode 100644 index 000000000..ba66d5080 --- /dev/null +++ b/kernel/fs/ubifs/Kconfig @@ -0,0 +1,37 @@ +config UBIFS_FS + tristate "UBIFS file system support" + select CRC16 + select CRC32 + select CRYPTO if UBIFS_FS_ADVANCED_COMPR + select CRYPTO if UBIFS_FS_LZO + select CRYPTO if UBIFS_FS_ZLIB + select CRYPTO_LZO if UBIFS_FS_LZO + select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + depends on MTD_UBI + help + UBIFS is a file system for flash devices which works on top of UBI. + +config UBIFS_FS_ADVANCED_COMPR + bool "Advanced compression options" + depends on UBIFS_FS + help + This option allows to explicitly choose which compressions, if any, + are enabled in UBIFS. Removing compressors means inability to read + existing file systems. + + If unsure, say 'N'. + +config UBIFS_FS_LZO + bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + LZO compressor is generally faster than zlib but compresses worse. + Say 'Y' if unsure. + +config UBIFS_FS_ZLIB + bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. diff --git a/kernel/fs/ubifs/Makefile b/kernel/fs/ubifs/Makefile new file mode 100644 index 000000000..2c6f0cb81 --- /dev/null +++ b/kernel/fs/ubifs/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_UBIFS_FS) += ubifs.o + +ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o +ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o +ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o diff --git a/kernel/fs/ubifs/budget.c b/kernel/fs/ubifs/budget.c new file mode 100644 index 000000000..11a11b32a --- /dev/null +++ b/kernel/fs/ubifs/budget.c @@ -0,0 +1,730 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the budgeting sub-system which is responsible for UBIFS + * space management. + * + * Factors such as compression, wasted space at the ends of LEBs, space in other + * journal heads, the effect of updates on the index, and so on, make it + * impossible to accurately predict the amount of space needed. Consequently + * approximations are used. + */ + +#include "ubifs.h" +#include +#include + +/* + * When pessimistic budget calculations say that there is no enough space, + * UBIFS starts writing back dirty inodes and pages, doing garbage collection, + * or committing. The below constant defines maximum number of times UBIFS + * repeats the operations. + */ +#define MAX_MKSPC_RETRIES 3 + +/* + * The below constant defines amount of dirty pages which should be written + * back at when trying to shrink the liability. + */ +#define NR_TO_WRITE 16 + +/** + * shrink_liability - write-back some dirty pages/inodes. + * @c: UBIFS file-system description object + * @nr_to_write: how many dirty pages to write-back + * + * This function shrinks UBIFS liability by means of writing back some amount + * of dirty inodes and their pages. + * + * Note, this function synchronizes even VFS inodes which are locked + * (@i_mutex) by the caller of the budgeting function, because write-back does + * not touch @i_mutex. + */ +static void shrink_liability(struct ubifs_info *c, int nr_to_write) +{ + down_read(&c->vfs_sb->s_umount); + writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); + up_read(&c->vfs_sb->s_umount); +} + +/** + * run_gc - run garbage collector. + * @c: UBIFS file-system description object + * + * This function runs garbage collector to make some more free space. Returns + * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a + * negative error code in case of failure. + */ +static int run_gc(struct ubifs_info *c) +{ + int err, lnum; + + /* Make some free space by garbage-collecting dirty space */ + down_read(&c->commit_sem); + lnum = ubifs_garbage_collect(c, 1); + up_read(&c->commit_sem); + if (lnum < 0) + return lnum; + + /* GC freed one LEB, return it to lprops */ + dbg_budg("GC freed LEB %d", lnum); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + return 0; +} + +/** + * get_liability - calculate current liability. + * @c: UBIFS file-system description object + * + * This function calculates and returns current UBIFS liability, i.e. the + * amount of bytes UBIFS has "promised" to write to the media. + */ +static long long get_liability(struct ubifs_info *c) +{ + long long liab; + + spin_lock(&c->space_lock); + liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth; + spin_unlock(&c->space_lock); + return liab; +} + +/** + * make_free_space - make more free space on the file-system. + * @c: UBIFS file-system description object + * + * This function is called when an operation cannot be budgeted because there + * is supposedly no free space. But in most cases there is some free space: + * o budgeting is pessimistic, so it always budgets more than it is actually + * needed, so shrinking the liability is one way to make free space - the + * cached data will take less space then it was budgeted for; + * o GC may turn some dark space into free space (budgeting treats dark space + * as not available); + * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs. + * + * So this function tries to do the above. Returns %-EAGAIN if some free space + * was presumably made and the caller has to re-try budgeting the operation. + * Returns %-ENOSPC if it couldn't do more free space, and other negative error + * codes on failures. + */ +static int make_free_space(struct ubifs_info *c) +{ + int err, retries = 0; + long long liab1, liab2; + + do { + liab1 = get_liability(c); + /* + * We probably have some dirty pages or inodes (liability), try + * to write them back. + */ + dbg_budg("liability %lld, run write-back", liab1); + shrink_liability(c, NR_TO_WRITE); + + liab2 = get_liability(c); + if (liab2 < liab1) + return -EAGAIN; + + dbg_budg("new liability %lld (not shrunk)", liab2); + + /* Liability did not shrink again, try GC */ + dbg_budg("Run GC"); + err = run_gc(c); + if (!err) + return -EAGAIN; + + if (err != -EAGAIN && err != -ENOSPC) + /* Some real error happened */ + return err; + + dbg_budg("Run commit (retries %d)", retries); + err = ubifs_run_commit(c); + if (err) + return err; + } while (retries++ < MAX_MKSPC_RETRIES); + + return -ENOSPC; +} + +/** + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. + * @c: UBIFS file-system description object + * + * This function calculates and returns the number of LEBs which should be kept + * for index usage. + */ +int ubifs_calc_min_idx_lebs(struct ubifs_info *c) +{ + int idx_lebs; + long long idx_size; + + idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx; + /* And make sure we have thrice the index size of space reserved */ + idx_size += idx_size << 1; + /* + * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' + * pair, nor similarly the two variables for the new index size, so we + * have to do this costly 64-bit division on fast-path. + */ + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); + /* + * The index head is not available for the in-the-gaps method, so add an + * extra LEB to compensate. + */ + idx_lebs += 1; + if (idx_lebs < MIN_INDEX_LEBS) + idx_lebs = MIN_INDEX_LEBS; + return idx_lebs; +} + +/** + * ubifs_calc_available - calculate available FS space. + * @c: UBIFS file-system description object + * @min_idx_lebs: minimum number of LEBs reserved for the index + * + * This function calculates and returns amount of FS space available for use. + */ +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) +{ + int subtract_lebs; + long long available; + + available = c->main_bytes - c->lst.total_used; + + /* + * Now 'available' contains theoretically available flash space + * assuming there is no index, so we have to subtract the space which + * is reserved for the index. + */ + subtract_lebs = min_idx_lebs; + + /* Take into account that GC reserves one LEB for its own needs */ + subtract_lebs += 1; + + /* + * The GC journal head LEB is not really accessible. And since + * different write types go to different heads, we may count only on + * one head's space. + */ + subtract_lebs += c->jhead_cnt - 1; + + /* We also reserve one LEB for deletions, which bypass budgeting */ + subtract_lebs += 1; + + available -= (long long)subtract_lebs * c->leb_size; + + /* Subtract the dead space which is not available for use */ + available -= c->lst.total_dead; + + /* + * Subtract dark space, which might or might not be usable - it depends + * on the data which we have on the media and which will be written. If + * this is a lot of uncompressed or not-compressible data, the dark + * space cannot be used. + */ + available -= c->lst.total_dark; + + /* + * However, there is more dark space. The index may be bigger than + * @min_idx_lebs. Those extra LEBs are assumed to be available, but + * their dark space is not included in total_dark, so it is subtracted + * here. + */ + if (c->lst.idx_lebs > min_idx_lebs) { + subtract_lebs = c->lst.idx_lebs - min_idx_lebs; + available -= subtract_lebs * c->dark_wm; + } + + /* The calculations are rough and may end up with a negative number */ + return available > 0 ? available : 0; +} + +/** + * can_use_rp - check whether the user is allowed to use reserved pool. + * @c: UBIFS file-system description object + * + * UBIFS has so-called "reserved pool" which is flash space reserved + * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock. + * This function checks whether current user is allowed to use reserved pool. + * Returns %1 current user is allowed to use reserved pool and %0 otherwise. + */ +static int can_use_rp(struct ubifs_info *c) +{ + if (uid_eq(current_fsuid(), c->rp_uid) || capable(CAP_SYS_RESOURCE) || + (!gid_eq(c->rp_gid, GLOBAL_ROOT_GID) && in_group_p(c->rp_gid))) + return 1; + return 0; +} + +/** + * do_budget_space - reserve flash space for index and data growth. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. + * + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index + * would take if it was consolidated and written to the flash. This guarantees + * that the "in-the-gaps" commit method always succeeds and UBIFS will always + * be able to commit dirty index. So this function basically adds amount of + * budgeted index space to the size of the current index, multiplies this by 3, + * and makes sure this does not exceed the amount of free LEBs. + * + * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables: + * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might + * be large, because UBIFS does not do any index consolidation as long as + * there is free space. IOW, the index may take a lot of LEBs, but the LEBs + * will contain a lot of dirt. + * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs. + * + * This function returns zero in case of success, and %-ENOSPC in case of + * failure. + */ +static int do_budget_space(struct ubifs_info *c) +{ + long long outstanding, available; + int lebs, rsvd_idx_lebs, min_idx_lebs; + + /* First budget index space */ + min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + /* Now 'min_idx_lebs' contains number of LEBs to reserve */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + + /* + * The number of LEBs that are available to be used by the index is: + * + * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - + * @c->lst.taken_empty_lebs + * + * @c->lst.empty_lebs are available because they are empty. + * @c->freeable_cnt are available because they contain only free and + * dirty space, @c->idx_gc_cnt are available because they are index + * LEBs that have been garbage collected and are awaiting the commit + * before they can be used. And the in-the-gaps method will grab these + * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have + * already been allocated for some purpose. + * + * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because + * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they + * are taken until after the commit). + * + * Note, @c->lst.taken_empty_lebs may temporarily be higher by one + * because of the way we serialize LEB allocations and budgeting. See a + * comment in 'ubifs_find_free_space()'. + */ + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (unlikely(rsvd_idx_lebs > lebs)) { + dbg_budg("out of indexing space: min_idx_lebs %d (old %d), rsvd_idx_lebs %d", + min_idx_lebs, c->bi.min_idx_lebs, rsvd_idx_lebs); + return -ENOSPC; + } + + available = ubifs_calc_available(c, min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; + + if (unlikely(available < outstanding)) { + dbg_budg("out of data space: available %lld, outstanding %lld", + available, outstanding); + return -ENOSPC; + } + + if (available - outstanding <= c->rp_size && !can_use_rp(c)) + return -ENOSPC; + + c->bi.min_idx_lebs = min_idx_lebs; + return 0; +} + +/** + * calc_idx_growth - calculate approximate index growth from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + * + * For now we assume each new node adds one znode. But this is rather poor + * approximation, though. + */ +static int calc_idx_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int znodes; + + znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) + + req->new_dent; + return znodes * c->max_idx_node_sz; +} + +/** + * calc_data_growth - calculate approximate amount of new data from budgeting + * request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_data_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int data_growth; + + data_growth = req->new_ino ? c->bi.inode_budget : 0; + if (req->new_page) + data_growth += c->bi.page_budget; + if (req->new_dent) + data_growth += c->bi.dent_budget; + data_growth += req->new_ino_d; + return data_growth; +} + +/** + * calc_dd_growth - calculate approximate amount of data which makes other data + * dirty from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_dd_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int dd_growth; + + dd_growth = req->dirtied_page ? c->bi.page_budget : 0; + + if (req->dirtied_ino) + dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); + if (req->mod_dent) + dd_growth += c->bi.dent_budget; + dd_growth += req->dirtied_ino_d; + return dd_growth; +} + +/** + * ubifs_budget_space - ensure there is enough space to complete an operation. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function allocates budget for an operation. It uses pessimistic + * approximation of how much flash space the operation needs. The goal of this + * function is to make sure UBIFS always has flash space to flush all dirty + * pages, dirty inodes, and dirty znodes (liability). This function may force + * commit, garbage-collection or write-back. Returns zero in case of success, + * %-ENOSPC if there is no free space and other negative error codes in case of + * failures. + */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + int err, idx_growth, data_growth, dd_growth, retried = 0; + + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + + data_growth = calc_data_growth(c, req); + dd_growth = calc_dd_growth(c, req); + if (!data_growth && !dd_growth) + return 0; + idx_growth = calc_idx_growth(c, req); + +again: + spin_lock(&c->space_lock); + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + + if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) { + dbg_budg("no space"); + spin_unlock(&c->space_lock); + return -ENOSPC; + } + + c->bi.idx_growth += idx_growth; + c->bi.data_growth += data_growth; + c->bi.dd_growth += dd_growth; + + err = do_budget_space(c); + if (likely(!err)) { + req->idx_growth = idx_growth; + req->data_growth = data_growth; + req->dd_growth = dd_growth; + spin_unlock(&c->space_lock); + return 0; + } + + /* Restore the old values */ + c->bi.idx_growth -= idx_growth; + c->bi.data_growth -= data_growth; + c->bi.dd_growth -= dd_growth; + spin_unlock(&c->space_lock); + + if (req->fast) { + dbg_budg("no space for fast budgeting"); + return err; + } + + err = make_free_space(c); + cond_resched(); + if (err == -EAGAIN) { + dbg_budg("try again"); + goto again; + } else if (err == -ENOSPC) { + if (!retried) { + retried = 1; + dbg_budg("-ENOSPC, but anyway try once again"); + goto again; + } + dbg_budg("FS is full, -ENOSPC"); + c->bi.nospace = 1; + if (can_use_rp(c) || c->rp_size == 0) + c->bi.nospace_rp = 1; + smp_wmb(); + } else + ubifs_err(c, "cannot budget space, error %d", err); + return err; +} + +/** + * ubifs_release_budget - release budgeted free space. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function releases the space budgeted by 'ubifs_budget_space()'. Note, + * since the index changes (which were budgeted for in @req->idx_growth) will + * only be written to the media on commit, this function moves the index budget + * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed + * by the commit operation. + */ +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + if (!req->recalculate) { + ubifs_assert(req->idx_growth >= 0); + ubifs_assert(req->data_growth >= 0); + ubifs_assert(req->dd_growth >= 0); + } + + if (req->recalculate) { + req->data_growth = calc_data_growth(c, req); + req->dd_growth = calc_dd_growth(c, req); + req->idx_growth = calc_idx_growth(c, req); + } + + if (!req->data_growth && !req->dd_growth) + return; + + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + + spin_lock(&c->space_lock); + c->bi.idx_growth -= req->idx_growth; + c->bi.uncommitted_idx += req->idx_growth; + c->bi.data_growth -= req->data_growth; + c->bi.dd_growth -= req->dd_growth; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + ubifs_assert(c->bi.min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->bi.idx_growth & 7)); + ubifs_assert(!(c->bi.data_growth & 7)); + ubifs_assert(!(c->bi.dd_growth & 7)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_convert_page_budget - convert budget of a new page. + * @c: UBIFS file-system description object + * + * This function converts budget which was allocated for a new page of data to + * the budget of changing an existing page of data. The latter is smaller than + * the former, so this function only does simple re-calculation and does not + * involve any write-back. + */ +void ubifs_convert_page_budget(struct ubifs_info *c) +{ + spin_lock(&c->space_lock); + /* Release the index growth reservation */ + c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; + /* Release the data growth reservation */ + c->bi.data_growth -= c->bi.page_budget; + /* Increase the dirty data growth reservation instead */ + c->bi.dd_growth += c->bi.page_budget; + /* And re-calculate the indexing space reservation */ + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_release_dirty_inode_budget - release dirty inode budget. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to release the budget for + * + * This function releases budget corresponding to a dirty inode. It is usually + * called when after the inode has been written to the media and marked as + * clean. It also causes the "no space" flags to be cleared. + */ +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui) +{ + struct ubifs_budget_req req; + + memset(&req, 0, sizeof(struct ubifs_budget_req)); + /* The "no space" flags will be cleared because dd_growth is > 0 */ + req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8); + ubifs_release_budget(c, &req); +} + +/** + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexing nodes as well. This introduces additional + * overhead, and UBIFS has to report slightly less free space to meet the above + * expectations. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, long long free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reserves thrice as more space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + return div_u64(free, divisor); +} + +/** + * ubifs_get_free_space_nolock - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * bread user expectations about what free space is. Users seem to accustomed + * to assume that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. + */ +long long ubifs_get_free_space_nolock(struct ubifs_info *c) +{ + int rsvd_idx_lebs, lebs; + long long available, outstanding, free; + + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + outstanding = c->bi.data_growth + c->bi.dd_growth; + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); + + if (available > outstanding) + free = ubifs_reported_space(c, available - outstanding); + else + free = 0; + return free; +} + +/** + * ubifs_get_free_space - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates and returns amount of free space to report to + * user-space. + */ +long long ubifs_get_free_space(struct ubifs_info *c) +{ + long long free; + + spin_lock(&c->space_lock); + free = ubifs_get_free_space_nolock(c); + spin_unlock(&c->space_lock); + + return free; +} diff --git a/kernel/fs/ubifs/commit.c b/kernel/fs/ubifs/commit.c new file mode 100644 index 000000000..63f566199 --- /dev/null +++ b/kernel/fs/ubifs/commit.c @@ -0,0 +1,734 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions that manage the running of the commit process. + * Each affected module has its own functions to accomplish their part in the + * commit and those functions are called here. + * + * The commit is the process whereby all updates to the index and LEB properties + * are written out together and the journal becomes empty. This keeps the + * file system consistent - at all times the state can be recreated by reading + * the index and LEB properties and then replaying the journal. + * + * The commit is split into two parts named "commit start" and "commit end". + * During commit start, the commit process has exclusive access to the journal + * by holding the commit semaphore down for writing. As few I/O operations as + * possible are performed during commit start, instead the nodes that are to be + * written are merely identified. During commit end, the commit semaphore is no + * longer held and the journal is again in operation, allowing users to continue + * to use the file system while the bulk of the commit I/O is performed. The + * purpose of this two-step approach is to prevent the commit from causing any + * latency blips. Note that in any case, the commit does not prevent lookups + * (as permitted by the TNC mutex), or access to VFS data structures e.g. page + * cache. + */ + +#include +#include +#include +#include "ubifs.h" + +/* + * nothing_to_commit - check if there is nothing to commit. + * @c: UBIFS file-system description object + * + * This is a helper function which checks if there is anything to commit. It is + * used as an optimization to avoid starting the commit if it is not really + * necessary. Indeed, the commit operation always assumes flash I/O (e.g., + * writing the commit start node to the log), and it is better to avoid doing + * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is + * nothing to commit, it is more optimal to avoid any flash I/O. + * + * This function has to be called with @c->commit_sem locked for writing - + * this function does not take LPT/TNC locks because the @c->commit_sem + * guarantees that we have exclusive access to the TNC and LPT data structures. + * + * This function returns %1 if there is nothing to commit and %0 otherwise. + */ +static int nothing_to_commit(struct ubifs_info *c) +{ + /* + * During mounting or remounting from R/O mode to R/W mode we may + * commit for various recovery-related reasons. + */ + if (c->mounting || c->remounting_rw) + return 0; + + /* + * If the root TNC node is dirty, we definitely have something to + * commit. + */ + if (c->zroot.znode && ubifs_zn_dirty(c->zroot.znode)) + return 0; + + /* + * Even though the TNC is clean, the LPT tree may have dirty nodes. For + * example, this may happen if the budgeting subsystem invoked GC to + * make some free space, and the GC found an LEB with only dirty and + * free space. In this case GC would just change the lprops of this + * LEB (by turning all space into free space) and unmap it. + */ + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + ubifs_assert(c->dirty_pn_cnt == 0); + ubifs_assert(c->dirty_nn_cnt == 0); + + return 1; +} + +/** + * do_commit - commit the journal. + * @c: UBIFS file-system description object + * + * This function implements UBIFS commit. It has to be called with commit lock + * locked. Returns zero in case of success and a negative error code in case of + * failure. + */ +static int do_commit(struct ubifs_info *c) +{ + int err, new_ltail_lnum, old_ltail_lnum, i; + struct ubifs_zbranch zroot; + struct ubifs_lp_stats lst; + + dbg_cmt("start"); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (c->ro_error) { + err = -EROFS; + goto out_up; + } + + if (nothing_to_commit(c)) { + up_write(&c->commit_sem); + err = 0; + goto out_cancel; + } + + /* Sync all write buffers (necessary for recovery) */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + goto out_up; + } + + c->cmt_no += 1; + err = ubifs_gc_start_commit(c); + if (err) + goto out_up; + err = dbg_check_lprops(c); + if (err) + goto out_up; + err = ubifs_log_start_commit(c, &new_ltail_lnum); + if (err) + goto out_up; + err = ubifs_tnc_start_commit(c, &zroot); + if (err) + goto out_up; + err = ubifs_lpt_start_commit(c); + if (err) + goto out_up; + err = ubifs_orphan_start_commit(c); + if (err) + goto out_up; + + ubifs_get_lp_stats(c, &lst); + + up_write(&c->commit_sem); + + err = ubifs_tnc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_end_commit(c); + if (err) + goto out; + err = ubifs_orphan_end_commit(c); + if (err) + goto out; + err = dbg_check_old_index(c, &zroot); + if (err) + goto out; + + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); + c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); + c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); + c->mst_node->root_offs = cpu_to_le32(zroot.offs); + c->mst_node->root_len = cpu_to_le32(zroot.len); + c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); + c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); + c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz); + c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); + c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); + c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); + c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs); + c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum); + c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs); + c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum); + c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs); + c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum); + c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs); + c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs); + c->mst_node->total_free = cpu_to_le64(lst.total_free); + c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty); + c->mst_node->total_used = cpu_to_le64(lst.total_used); + c->mst_node->total_dead = cpu_to_le64(lst.total_dead); + c->mst_node->total_dark = cpu_to_le64(lst.total_dark); + if (c->no_orphs) + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + else + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); + + old_ltail_lnum = c->ltail_lnum; + err = ubifs_log_end_commit(c, new_ltail_lnum); + if (err) + goto out; + + err = ubifs_log_post_commit(c, old_ltail_lnum); + if (err) + goto out; + err = ubifs_gc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_post_commit(c); + if (err) + goto out; + +out_cancel: + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_RESTING; + wake_up(&c->cmt_wq); + dbg_cmt("commit end"); + spin_unlock(&c->cs_lock); + return 0; + +out_up: + up_write(&c->commit_sem); +out: + ubifs_err(c, "commit failed, error %d", err); + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_BROKEN; + wake_up(&c->cmt_wq); + spin_unlock(&c->cs_lock); + ubifs_ro_mode(c, err); + return err; +} + +/** + * run_bg_commit - run background commit if it is needed. + * @c: UBIFS file-system description object + * + * This function runs background commit if it is needed. Returns zero in case + * of success and a negative error code in case of failure. + */ +static int run_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + /* + * Run background commit only if background commit was requested or if + * commit is required. + */ + if (c->cmt_state != COMMIT_BACKGROUND && + c->cmt_state != COMMIT_REQUIRED) + goto out; + spin_unlock(&c->cs_lock); + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_REQUIRED) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + else if (c->cmt_state == COMMIT_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_BACKGROUND; + else + goto out_cmt_unlock; + spin_unlock(&c->cs_lock); + + return do_commit(c); + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return 0; +} + +/** + * ubifs_bg_thread - UBIFS background thread function. + * @info: points to the file-system description object + * + * This function implements various file-system background activities: + * o when a write-buffer timer expires it synchronizes the appropriate + * write-buffer; + * o when the journal is about to be full, it starts in-advance commit. + * + * Note, other stuff like background garbage collection may be added here in + * future. + */ +int ubifs_bg_thread(void *info) +{ + int err; + struct ubifs_info *c = info; + + ubifs_msg(c, "background thread \"%s\" started, PID %d", + c->bgt_name, current->pid); + set_freezable(); + + while (1) { + if (kthread_should_stop()) + break; + + if (try_to_freeze()) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + /* Check if there is something to do */ + if (!c->need_bgt) { + /* + * Nothing prevents us from going sleep now and + * be never woken up and block the task which + * could wait in 'kthread_stop()' forever. + */ + if (kthread_should_stop()) + break; + schedule(); + continue; + } else + __set_current_state(TASK_RUNNING); + + c->need_bgt = 0; + err = ubifs_bg_wbufs_sync(c); + if (err) + ubifs_ro_mode(c, err); + + run_bg_commit(c); + cond_resched(); + } + + ubifs_msg(c, "background thread \"%s\" stops", c->bgt_name); + return 0; +} + +/** + * ubifs_commit_required - set commit state to "required". + * @c: UBIFS file-system description object + * + * This function is called if a commit is required but cannot be done from the + * calling function, so it is just flagged instead. + */ +void ubifs_commit_required(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + switch (c->cmt_state) { + case COMMIT_RESTING: + case COMMIT_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_REQUIRED)); + c->cmt_state = COMMIT_REQUIRED; + break; + case COMMIT_RUNNING_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_RUNNING_REQUIRED)); + c->cmt_state = COMMIT_RUNNING_REQUIRED; + break; + case COMMIT_REQUIRED: + case COMMIT_RUNNING_REQUIRED: + case COMMIT_BROKEN: + break; + } + spin_unlock(&c->cs_lock); +} + +/** + * ubifs_request_bg_commit - notify the background thread to do a commit. + * @c: UBIFS file-system description object + * + * This function is called if the journal is full enough to make a commit + * worthwhile, so background thread is kicked to start it. + */ +void ubifs_request_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_RESTING) { + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_BACKGROUND)); + c->cmt_state = COMMIT_BACKGROUND; + spin_unlock(&c->cs_lock); + ubifs_wake_up_bgt(c); + } else + spin_unlock(&c->cs_lock); +} + +/** + * wait_for_commit - wait for commit. + * @c: UBIFS file-system description object + * + * This function sleeps until the commit operation is no longer running. + */ +static int wait_for_commit(struct ubifs_info *c) +{ + dbg_cmt("pid %d goes sleep", current->pid); + + /* + * The following sleeps if the condition is false, and will be woken + * when the commit ends. It is possible, although very unlikely, that we + * will wake up and see the subsequent commit running, rather than the + * one we were waiting for, and go back to sleep. However, we will be + * woken again, so there is no danger of sleeping forever. + */ + wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND && + c->cmt_state != COMMIT_RUNNING_REQUIRED); + dbg_cmt("commit finished, pid %d woke up", current->pid); + return 0; +} + +/** + * ubifs_run_commit - run or wait for commit. + * @c: UBIFS file-system description object + * + * This function runs commit and returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_run_commit(struct ubifs_info *c) +{ + int err = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BROKEN) { + err = -EROFS; + goto out; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + /* + * We set the commit state to 'running required' to indicate + * that we want it to complete as quickly as possible. + */ + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + spin_unlock(&c->cs_lock); + + /* Ok, the commit is indeed needed */ + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + /* + * Since we unlocked 'c->cs_lock', the state may have changed, so + * re-check it. + */ + if (c->cmt_state == COMMIT_BROKEN) { + err = -EROFS; + goto out_cmt_unlock; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + up_write(&c->commit_sem); + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + c->cmt_state = COMMIT_RUNNING_REQUIRED; + spin_unlock(&c->cs_lock); + + err = do_commit(c); + return err; + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return err; +} + +/** + * ubifs_gc_should_commit - determine if it is time for GC to run commit. + * @c: UBIFS file-system description object + * + * This function is called by garbage collection to determine if commit should + * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal + * is full enough to start commit, this function returns true. It is not + * absolutely necessary to commit yet, but it feels like this should be better + * then to keep doing GC. This function returns %1 if GC has to initiate commit + * and %0 if not. + */ +int ubifs_gc_should_commit(struct ubifs_info *c) +{ + int ret = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BACKGROUND) { + dbg_cmt("commit required now"); + c->cmt_state = COMMIT_REQUIRED; + } else + dbg_cmt("commit not requested"); + if (c->cmt_state == COMMIT_REQUIRED) + ret = 1; + spin_unlock(&c->cs_lock); + return ret; +} + +/* + * Everything below is related to debugging. + */ + +/** + * struct idx_node - hold index nodes during index tree traversal. + * @list: list + * @iip: index in parent (slot number of this indexing node in the parent + * indexing node) + * @upper_key: all keys in this indexing node have to be less or equivalent to + * this key + * @idx: index node (8-byte aligned because all node structures must be 8-byte + * aligned) + */ +struct idx_node { + struct list_head list; + int iip; + union ubifs_key upper_key; + struct ubifs_idx_node idx __aligned(8); +}; + +/** + * dbg_old_index_check_init - get information for the next old index check. + * @c: UBIFS file-system description object + * @zroot: root of the index + * + * This function records information about the index that will be needed for the + * next old index check i.e. 'dbg_check_old_index()'. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + struct ubifs_idx_node *idx; + int lnum, offs, len, err = 0; + struct ubifs_debug_info *d = c->dbg; + + d->old_zroot = *zroot; + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out; + + d->old_zroot_level = le16_to_cpu(idx->level); + d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); +out: + kfree(idx); + return err; +} + +/** + * dbg_check_old_index - check the old copy of the index. + * @c: UBIFS file-system description object + * @zroot: root of the new index + * + * In order to be able to recover from an unclean unmount, a complete copy of + * the index must exist on flash. This is the "old" index. The commit process + * must write the "new" index to flash without overwriting or destroying any + * part of the old index. This function is run at commit end in order to check + * that the old index does indeed exist completely intact. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; + int first = 1, iip; + struct ubifs_debug_info *d = c->dbg; + union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key; + unsigned long long uninitialized_var(last_sqnum); + struct ubifs_idx_node *idx; + struct list_head list; + struct idx_node *i; + size_t sz; + + if (!dbg_is_chk_index(c)) + return 0; + + INIT_LIST_HEAD(&list); + + sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) - + UBIFS_IDX_NODE_SZ; + + /* Start at the old zroot */ + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; + iip = 0; + + /* + * Traverse the index tree preorder depth-first i.e. do a node and then + * its subtrees from left to right. + */ + while (1) { + struct ubifs_branch *br; + + /* Get the next index node */ + i = kmalloc(sz, GFP_NOFS); + if (!i) { + err = -ENOMEM; + goto out_free; + } + i->iip = iip; + /* Keep the index nodes on our path in a linked list */ + list_add_tail(&i->list, &list); + /* Read the index node */ + idx = &i->idx; + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out_free; + /* Validate index node */ + child_cnt = le16_to_cpu(idx->child_cnt); + if (child_cnt < 1 || child_cnt > c->fanout) { + err = 1; + goto out_dump; + } + if (first) { + first = 0; + /* Check root level and sqnum */ + if (le16_to_cpu(idx->level) != d->old_zroot_level) { + err = 2; + goto out_dump; + } + if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) { + err = 3; + goto out_dump; + } + /* Set last values as though root had a parent */ + last_level = le16_to_cpu(idx->level) + 1; + last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1; + key_read(c, ubifs_idx_key(c, idx), &lower_key); + highest_ino_key(c, &upper_key, INUM_WATERMARK); + } + key_copy(c, &upper_key, &i->upper_key); + if (le16_to_cpu(idx->level) != last_level - 1) { + err = 3; + goto out_dump; + } + /* + * The index is always written bottom up hence a child's sqnum + * is always less than the parents. + */ + if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) { + err = 4; + goto out_dump; + } + /* Check key range */ + key_read(c, ubifs_idx_key(c, idx), &l_key); + br = ubifs_idx_branch(c, idx, child_cnt - 1); + key_read(c, &br->key, &u_key); + if (keys_cmp(c, &lower_key, &l_key) > 0) { + err = 5; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) < 0) { + err = 6; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) == 0) + if (!is_hash_key(c, &u_key)) { + err = 7; + goto out_dump; + } + /* Go to next index node */ + if (le16_to_cpu(idx->level) == 0) { + /* At the bottom, so go up until can go right */ + while (1) { + /* Drop the bottom of the list */ + list_del(&i->list); + kfree(i); + /* No more list means we are done */ + if (list_empty(&list)) + goto out; + /* Look at the new bottom */ + i = list_entry(list.prev, struct idx_node, + list); + idx = &i->idx; + /* Can we go right */ + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + iip = iip + 1; + break; + } else + /* Nope, so go up again */ + iip = i->iip; + } + } else + /* Go down left */ + iip = 0; + /* + * We have the parent in 'idx' and now we set up for reading the + * child pointed to by slot 'iip'. + */ + last_level = le16_to_cpu(idx->level); + last_sqnum = le64_to_cpu(idx->ch.sqnum); + br = ubifs_idx_branch(c, idx, iip); + lnum = le32_to_cpu(br->lnum); + offs = le32_to_cpu(br->offs); + len = le32_to_cpu(br->len); + key_read(c, &br->key, &lower_key); + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + br = ubifs_idx_branch(c, idx, iip + 1); + key_read(c, &br->key, &upper_key); + } else + key_copy(c, &i->upper_key, &upper_key); + } +out: + err = dbg_old_index_check_init(c, zroot); + if (err) + goto out_free; + + return 0; + +out_dump: + ubifs_err(c, "dumping index node (iip=%d)", i->iip); + ubifs_dump_node(c, idx); + list_del(&i->list); + kfree(i); + if (!list_empty(&list)) { + i = list_entry(list.prev, struct idx_node, list); + ubifs_err(c, "dumping parent index node"); + ubifs_dump_node(c, &i->idx); + } +out_free: + while (!list_empty(&list)) { + i = list_entry(list.next, struct idx_node, list); + list_del(&i->list); + kfree(i); + } + ubifs_err(c, "failed, error %d", err); + if (err > 0) + err = -EINVAL; + return err; +} diff --git a/kernel/fs/ubifs/compress.c b/kernel/fs/ubifs/compress.c new file mode 100644 index 000000000..565cb56d7 --- /dev/null +++ b/kernel/fs/ubifs/compress.c @@ -0,0 +1,250 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + * Zoltan Sogor + */ + +/* + * This file provides a single place to access to compression and + * decompression. + */ + +#include +#include "ubifs.h" + +/* Fake description object for the "none" compressor */ +static struct ubifs_compressor none_compr = { + .compr_type = UBIFS_COMPR_NONE, + .name = "none", + .capi_name = "", +}; + +#ifdef CONFIG_UBIFS_FS_LZO +static DEFINE_MUTEX(lzo_mutex); + +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .comp_mutex = &lzo_mutex, + .name = "lzo", + .capi_name = "lzo", +}; +#else +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .name = "lzo", +}; +#endif + +#ifdef CONFIG_UBIFS_FS_ZLIB +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); + +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .comp_mutex = &deflate_mutex, + .decomp_mutex = &inflate_mutex, + .name = "zlib", + .capi_name = "deflate", +}; +#else +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .name = "zlib", +}; +#endif + +/* All UBIFS compressors */ +struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/** + * ubifs_compress - compress data. + * @in_buf: data to compress + * @in_len: length of the data to compress + * @out_buf: output buffer where compressed data should be stored + * @out_len: output buffer length is returned here + * @compr_type: type of compression to use on enter, actually used compression + * type on exit + * + * This function compresses input buffer @in_buf of length @in_len and stores + * the result in the output buffer @out_buf and the resulting length in + * @out_len. If the input buffer does not compress, it is just copied to the + * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if + * compression error occurred. + * + * Note, if the input buffer was not compressed, it is copied to the output + * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. + */ +void ubifs_compress(const struct ubifs_info *c, const void *in_buf, + int in_len, void *out_buf, int *out_len, int *compr_type) +{ + int err; + struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; + + if (*compr_type == UBIFS_COMPR_NONE) + goto no_compr; + + /* If the input data is small, do not even try to compress it */ + if (in_len < UBIFS_MIN_COMPR_LEN) + goto no_compr; + + if (compr->comp_mutex) + mutex_lock(compr->comp_mutex); + err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, + (unsigned int *)out_len); + if (compr->comp_mutex) + mutex_unlock(compr->comp_mutex); + if (unlikely(err)) { + ubifs_warn(c, "cannot compress %d bytes, compressor %s, error %d, leave data uncompressed", + in_len, compr->name, err); + goto no_compr; + } + + /* + * If the data compressed only slightly, it is better to leave it + * uncompressed to improve read speed. + */ + if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF) + goto no_compr; + + return; + +no_compr: + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + *compr_type = UBIFS_COMPR_NONE; +} + +/** + * ubifs_decompress - decompress data. + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_buf: output buffer where decompressed data should + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into buffer @out_buf. + * The length of the uncompressed data is returned in @out_len. This functions + * returns %0 on success or a negative error code on failure. + */ +int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, + int in_len, void *out_buf, int *out_len, int compr_type) +{ + int err; + struct ubifs_compressor *compr; + + if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { + ubifs_err(c, "invalid compression type %d", compr_type); + return -EINVAL; + } + + compr = ubifs_compressors[compr_type]; + + if (unlikely(!compr->capi_name)) { + ubifs_err(c, "%s compression is not compiled in", compr->name); + return -EINVAL; + } + + if (compr_type == UBIFS_COMPR_NONE) { + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + return 0; + } + + if (compr->decomp_mutex) + mutex_lock(compr->decomp_mutex); + err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, + (unsigned int *)out_len); + if (compr->decomp_mutex) + mutex_unlock(compr->decomp_mutex); + if (err) + ubifs_err(c, "cannot decompress %d bytes, compressor %s, error %d", + in_len, compr->name, err); + + return err; +} + +/** + * compr_init - initialize a compressor. + * @compr: compressor description object + * + * This function initializes the requested compressor and returns zero in case + * of success or a negative error code in case of failure. + */ +static int __init compr_init(struct ubifs_compressor *compr) +{ + if (compr->capi_name) { + compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); + if (IS_ERR(compr->cc)) { + pr_err("UBIFS error (pid %d): cannot initialize compressor %s, error %ld", + current->pid, compr->name, PTR_ERR(compr->cc)); + return PTR_ERR(compr->cc); + } + } + + ubifs_compressors[compr->compr_type] = compr; + return 0; +} + +/** + * compr_exit - de-initialize a compressor. + * @compr: compressor description object + */ +static void compr_exit(struct ubifs_compressor *compr) +{ + if (compr->capi_name) + crypto_free_comp(compr->cc); + return; +} + +/** + * ubifs_compressors_init - initialize UBIFS compressors. + * + * This function initializes the compressor which were compiled in. Returns + * zero in case of success and a negative error code in case of failure. + */ +int __init ubifs_compressors_init(void) +{ + int err; + + err = compr_init(&lzo_compr); + if (err) + return err; + + err = compr_init(&zlib_compr); + if (err) + goto out_lzo; + + ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr; + return 0; + +out_lzo: + compr_exit(&lzo_compr); + return err; +} + +/** + * ubifs_compressors_exit - de-initialize UBIFS compressors. + */ +void ubifs_compressors_exit(void) +{ + compr_exit(&lzo_compr); + compr_exit(&zlib_compr); +} diff --git a/kernel/fs/ubifs/debug.c b/kernel/fs/ubifs/debug.c new file mode 100644 index 000000000..4c46a9865 --- /dev/null +++ b/kernel/fs/ubifs/debug.c @@ -0,0 +1,3104 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements most of the debugging stuff which is compiled in only + * when it is enabled. But some debugging check functions are implemented in + * corresponding subsystem, just because they are closely related and utilize + * various local functions of those subsystems. + */ + +#include +#include +#include +#include +#include +#include "ubifs.h" + +static DEFINE_SPINLOCK(dbg_lock); + +static const char *get_key_fmt(int fmt) +{ + switch (fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return "simple"; + default: + return "unknown/invalid format"; + } +} + +static const char *get_key_hash(int hash) +{ + switch (hash) { + case UBIFS_KEY_HASH_R5: + return "R5"; + case UBIFS_KEY_HASH_TEST: + return "test"; + default: + return "unknown/invalid name hash"; + } +} + +static const char *get_key_type(int type) +{ + switch (type) { + case UBIFS_INO_KEY: + return "inode"; + case UBIFS_DENT_KEY: + return "direntry"; + case UBIFS_XENT_KEY: + return "xentry"; + case UBIFS_DATA_KEY: + return "data"; + case UBIFS_TRUN_KEY: + return "truncate"; + default: + return "unknown/invalid key"; + } +} + +static const char *get_dent_type(int type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return "file"; + case UBIFS_ITYPE_DIR: + return "dir"; + case UBIFS_ITYPE_LNK: + return "symlink"; + case UBIFS_ITYPE_BLK: + return "blkdev"; + case UBIFS_ITYPE_CHR: + return "char dev"; + case UBIFS_ITYPE_FIFO: + return "fifo"; + case UBIFS_ITYPE_SOCK: + return "socket"; + default: + return "unknown/invalid type"; + } +} + +const char *dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, int len) +{ + char *p = buffer; + int type = key_type(c, key); + + if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { + switch (type) { + case UBIFS_INO_KEY: + len -= snprintf(p, len, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); + break; + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + len -= snprintf(p, len, "(%lu, %s, %#08x)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_hash(c, key)); + break; + case UBIFS_DATA_KEY: + len -= snprintf(p, len, "(%lu, %s, %u)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_block(c, key)); + break; + case UBIFS_TRUN_KEY: + len -= snprintf(p, len, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); + break; + default: + len -= snprintf(p, len, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); + } + } else + len -= snprintf(p, len, "bad key format %d", c->key_fmt); + ubifs_assert(len > 0); + return p; +} + +const char *dbg_ntype(int type) +{ + switch (type) { + case UBIFS_PAD_NODE: + return "padding node"; + case UBIFS_SB_NODE: + return "superblock node"; + case UBIFS_MST_NODE: + return "master node"; + case UBIFS_REF_NODE: + return "reference node"; + case UBIFS_INO_NODE: + return "inode node"; + case UBIFS_DENT_NODE: + return "direntry node"; + case UBIFS_XENT_NODE: + return "xentry node"; + case UBIFS_DATA_NODE: + return "data node"; + case UBIFS_TRUN_NODE: + return "truncate node"; + case UBIFS_IDX_NODE: + return "indexing node"; + case UBIFS_CS_NODE: + return "commit start node"; + case UBIFS_ORPH_NODE: + return "orphan node"; + default: + return "unknown node"; + } +} + +static const char *dbg_gtype(int type) +{ + switch (type) { + case UBIFS_NO_NODE_GROUP: + return "no node group"; + case UBIFS_IN_NODE_GROUP: + return "in node group"; + case UBIFS_LAST_OF_NODE_GROUP: + return "last of node group"; + default: + return "unknown"; + } +} + +const char *dbg_cstate(int cmt_state) +{ + switch (cmt_state) { + case COMMIT_RESTING: + return "commit resting"; + case COMMIT_BACKGROUND: + return "background commit requested"; + case COMMIT_REQUIRED: + return "commit required"; + case COMMIT_RUNNING_BACKGROUND: + return "BACKGROUND commit running"; + case COMMIT_RUNNING_REQUIRED: + return "commit running and required"; + case COMMIT_BROKEN: + return "broken commit"; + default: + return "unknown commit state"; + } +} + +const char *dbg_jhead(int jhead) +{ + switch (jhead) { + case GCHD: + return "0 (GC)"; + case BASEHD: + return "1 (base)"; + case DATAHD: + return "2 (data)"; + default: + return "unknown journal head"; + } +} + +static void dump_ch(const struct ubifs_ch *ch) +{ + pr_err("\tmagic %#x\n", le32_to_cpu(ch->magic)); + pr_err("\tcrc %#x\n", le32_to_cpu(ch->crc)); + pr_err("\tnode_type %d (%s)\n", ch->node_type, + dbg_ntype(ch->node_type)); + pr_err("\tgroup_type %d (%s)\n", ch->group_type, + dbg_gtype(ch->group_type)); + pr_err("\tsqnum %llu\n", + (unsigned long long)le64_to_cpu(ch->sqnum)); + pr_err("\tlen %u\n", le32_to_cpu(ch->len)); +} + +void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) +{ + const struct ubifs_inode *ui = ubifs_inode(inode); + struct qstr nm = { .name = NULL }; + union ubifs_key key; + struct ubifs_dent_node *dent, *pdent = NULL; + int count = 2; + + pr_err("Dump in-memory inode:"); + pr_err("\tinode %lu\n", inode->i_ino); + pr_err("\tsize %llu\n", + (unsigned long long)i_size_read(inode)); + pr_err("\tnlink %u\n", inode->i_nlink); + pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode)); + pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode)); + pr_err("\tatime %u.%u\n", + (unsigned int)inode->i_atime.tv_sec, + (unsigned int)inode->i_atime.tv_nsec); + pr_err("\tmtime %u.%u\n", + (unsigned int)inode->i_mtime.tv_sec, + (unsigned int)inode->i_mtime.tv_nsec); + pr_err("\tctime %u.%u\n", + (unsigned int)inode->i_ctime.tv_sec, + (unsigned int)inode->i_ctime.tv_nsec); + pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum); + pr_err("\txattr_size %u\n", ui->xattr_size); + pr_err("\txattr_cnt %u\n", ui->xattr_cnt); + pr_err("\txattr_names %u\n", ui->xattr_names); + pr_err("\tdirty %u\n", ui->dirty); + pr_err("\txattr %u\n", ui->xattr); + pr_err("\tbulk_read %u\n", ui->xattr); + pr_err("\tsynced_i_size %llu\n", + (unsigned long long)ui->synced_i_size); + pr_err("\tui_size %llu\n", + (unsigned long long)ui->ui_size); + pr_err("\tflags %d\n", ui->flags); + pr_err("\tcompr_type %d\n", ui->compr_type); + pr_err("\tlast_page_read %lu\n", ui->last_page_read); + pr_err("\tread_in_a_row %lu\n", ui->read_in_a_row); + pr_err("\tdata_len %d\n", ui->data_len); + + if (!S_ISDIR(inode->i_mode)) + return; + + pr_err("List of directory entries:\n"); + ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); + + lowest_dent_key(c, &key, inode->i_ino); + while (1) { + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + if (PTR_ERR(dent) != -ENOENT) + pr_err("error %ld\n", PTR_ERR(dent)); + break; + } + + pr_err("\t%d: %s (%s)\n", + count++, dent->name, get_dent_type(dent->type)); + + nm.name = dent->name; + nm.len = le16_to_cpu(dent->nlen); + kfree(pdent); + pdent = dent; + key_read(c, &dent->key, &key); + } + kfree(pdent); +} + +void ubifs_dump_node(const struct ubifs_info *c, const void *node) +{ + int i, n; + union ubifs_key key; + const struct ubifs_ch *ch = node; + char key_buf[DBG_KEY_BUF_LEN]; + + /* If the magic is incorrect, just hexdump the first bytes */ + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { + pr_err("Not a node, first %zu bytes:", UBIFS_CH_SZ); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node, UBIFS_CH_SZ, 1); + return; + } + + spin_lock(&dbg_lock); + dump_ch(node); + + switch (ch->node_type) { + case UBIFS_PAD_NODE: + { + const struct ubifs_pad_node *pad = node; + + pr_err("\tpad_len %u\n", le32_to_cpu(pad->pad_len)); + break; + } + case UBIFS_SB_NODE: + { + const struct ubifs_sb_node *sup = node; + unsigned int sup_flags = le32_to_cpu(sup->flags); + + pr_err("\tkey_hash %d (%s)\n", + (int)sup->key_hash, get_key_hash(sup->key_hash)); + pr_err("\tkey_fmt %d (%s)\n", + (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); + pr_err("\tflags %#x\n", sup_flags); + pr_err("\tbig_lpt %u\n", + !!(sup_flags & UBIFS_FLG_BIGLPT)); + pr_err("\tspace_fixup %u\n", + !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); + pr_err("\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size)); + pr_err("\tleb_size %u\n", le32_to_cpu(sup->leb_size)); + pr_err("\tleb_cnt %u\n", le32_to_cpu(sup->leb_cnt)); + pr_err("\tmax_leb_cnt %u\n", le32_to_cpu(sup->max_leb_cnt)); + pr_err("\tmax_bud_bytes %llu\n", + (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); + pr_err("\tlog_lebs %u\n", le32_to_cpu(sup->log_lebs)); + pr_err("\tlpt_lebs %u\n", le32_to_cpu(sup->lpt_lebs)); + pr_err("\torph_lebs %u\n", le32_to_cpu(sup->orph_lebs)); + pr_err("\tjhead_cnt %u\n", le32_to_cpu(sup->jhead_cnt)); + pr_err("\tfanout %u\n", le32_to_cpu(sup->fanout)); + pr_err("\tlsave_cnt %u\n", le32_to_cpu(sup->lsave_cnt)); + pr_err("\tdefault_compr %u\n", + (int)le16_to_cpu(sup->default_compr)); + pr_err("\trp_size %llu\n", + (unsigned long long)le64_to_cpu(sup->rp_size)); + pr_err("\trp_uid %u\n", le32_to_cpu(sup->rp_uid)); + pr_err("\trp_gid %u\n", le32_to_cpu(sup->rp_gid)); + pr_err("\tfmt_version %u\n", le32_to_cpu(sup->fmt_version)); + pr_err("\ttime_gran %u\n", le32_to_cpu(sup->time_gran)); + pr_err("\tUUID %pUB\n", sup->uuid); + break; + } + case UBIFS_MST_NODE: + { + const struct ubifs_mst_node *mst = node; + + pr_err("\thighest_inum %llu\n", + (unsigned long long)le64_to_cpu(mst->highest_inum)); + pr_err("\tcommit number %llu\n", + (unsigned long long)le64_to_cpu(mst->cmt_no)); + pr_err("\tflags %#x\n", le32_to_cpu(mst->flags)); + pr_err("\tlog_lnum %u\n", le32_to_cpu(mst->log_lnum)); + pr_err("\troot_lnum %u\n", le32_to_cpu(mst->root_lnum)); + pr_err("\troot_offs %u\n", le32_to_cpu(mst->root_offs)); + pr_err("\troot_len %u\n", le32_to_cpu(mst->root_len)); + pr_err("\tgc_lnum %u\n", le32_to_cpu(mst->gc_lnum)); + pr_err("\tihead_lnum %u\n", le32_to_cpu(mst->ihead_lnum)); + pr_err("\tihead_offs %u\n", le32_to_cpu(mst->ihead_offs)); + pr_err("\tindex_size %llu\n", + (unsigned long long)le64_to_cpu(mst->index_size)); + pr_err("\tlpt_lnum %u\n", le32_to_cpu(mst->lpt_lnum)); + pr_err("\tlpt_offs %u\n", le32_to_cpu(mst->lpt_offs)); + pr_err("\tnhead_lnum %u\n", le32_to_cpu(mst->nhead_lnum)); + pr_err("\tnhead_offs %u\n", le32_to_cpu(mst->nhead_offs)); + pr_err("\tltab_lnum %u\n", le32_to_cpu(mst->ltab_lnum)); + pr_err("\tltab_offs %u\n", le32_to_cpu(mst->ltab_offs)); + pr_err("\tlsave_lnum %u\n", le32_to_cpu(mst->lsave_lnum)); + pr_err("\tlsave_offs %u\n", le32_to_cpu(mst->lsave_offs)); + pr_err("\tlscan_lnum %u\n", le32_to_cpu(mst->lscan_lnum)); + pr_err("\tleb_cnt %u\n", le32_to_cpu(mst->leb_cnt)); + pr_err("\tempty_lebs %u\n", le32_to_cpu(mst->empty_lebs)); + pr_err("\tidx_lebs %u\n", le32_to_cpu(mst->idx_lebs)); + pr_err("\ttotal_free %llu\n", + (unsigned long long)le64_to_cpu(mst->total_free)); + pr_err("\ttotal_dirty %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dirty)); + pr_err("\ttotal_used %llu\n", + (unsigned long long)le64_to_cpu(mst->total_used)); + pr_err("\ttotal_dead %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dead)); + pr_err("\ttotal_dark %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dark)); + break; + } + case UBIFS_REF_NODE: + { + const struct ubifs_ref_node *ref = node; + + pr_err("\tlnum %u\n", le32_to_cpu(ref->lnum)); + pr_err("\toffs %u\n", le32_to_cpu(ref->offs)); + pr_err("\tjhead %u\n", le32_to_cpu(ref->jhead)); + break; + } + case UBIFS_INO_NODE: + { + const struct ubifs_ino_node *ino = node; + + key_read(c, &ino->key, &key); + pr_err("\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + pr_err("\tcreat_sqnum %llu\n", + (unsigned long long)le64_to_cpu(ino->creat_sqnum)); + pr_err("\tsize %llu\n", + (unsigned long long)le64_to_cpu(ino->size)); + pr_err("\tnlink %u\n", le32_to_cpu(ino->nlink)); + pr_err("\tatime %lld.%u\n", + (long long)le64_to_cpu(ino->atime_sec), + le32_to_cpu(ino->atime_nsec)); + pr_err("\tmtime %lld.%u\n", + (long long)le64_to_cpu(ino->mtime_sec), + le32_to_cpu(ino->mtime_nsec)); + pr_err("\tctime %lld.%u\n", + (long long)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); + pr_err("\tuid %u\n", le32_to_cpu(ino->uid)); + pr_err("\tgid %u\n", le32_to_cpu(ino->gid)); + pr_err("\tmode %u\n", le32_to_cpu(ino->mode)); + pr_err("\tflags %#x\n", le32_to_cpu(ino->flags)); + pr_err("\txattr_cnt %u\n", le32_to_cpu(ino->xattr_cnt)); + pr_err("\txattr_size %u\n", le32_to_cpu(ino->xattr_size)); + pr_err("\txattr_names %u\n", le32_to_cpu(ino->xattr_names)); + pr_err("\tcompr_type %#x\n", + (int)le16_to_cpu(ino->compr_type)); + pr_err("\tdata len %u\n", le32_to_cpu(ino->data_len)); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + const struct ubifs_dent_node *dent = node; + int nlen = le16_to_cpu(dent->nlen); + + key_read(c, &dent->key, &key); + pr_err("\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + pr_err("\tinum %llu\n", + (unsigned long long)le64_to_cpu(dent->inum)); + pr_err("\ttype %d\n", (int)dent->type); + pr_err("\tnlen %d\n", nlen); + pr_err("\tname "); + + if (nlen > UBIFS_MAX_NLEN) + pr_err("(bad name length, not printing, bad or corrupted node)"); + else { + for (i = 0; i < nlen && dent->name[i]; i++) + pr_cont("%c", dent->name[i]); + } + pr_cont("\n"); + + break; + } + case UBIFS_DATA_NODE: + { + const struct ubifs_data_node *dn = node; + int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; + + key_read(c, &dn->key, &key); + pr_err("\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + pr_err("\tsize %u\n", le32_to_cpu(dn->size)); + pr_err("\tcompr_typ %d\n", + (int)le16_to_cpu(dn->compr_type)); + pr_err("\tdata size %d\n", dlen); + pr_err("\tdata:\n"); + print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, + (void *)&dn->data, dlen, 0); + break; + } + case UBIFS_TRUN_NODE: + { + const struct ubifs_trun_node *trun = node; + + pr_err("\tinum %u\n", le32_to_cpu(trun->inum)); + pr_err("\told_size %llu\n", + (unsigned long long)le64_to_cpu(trun->old_size)); + pr_err("\tnew_size %llu\n", + (unsigned long long)le64_to_cpu(trun->new_size)); + break; + } + case UBIFS_IDX_NODE: + { + const struct ubifs_idx_node *idx = node; + + n = le16_to_cpu(idx->child_cnt); + pr_err("\tchild_cnt %d\n", n); + pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level)); + pr_err("\tBranches:\n"); + + for (i = 0; i < n && i < c->fanout - 1; i++) { + const struct ubifs_branch *br; + + br = ubifs_idx_branch(c, idx, i); + key_read(c, &br->key, &key); + pr_err("\t%d: LEB %d:%d len %d key %s\n", + i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), + le32_to_cpu(br->len), + dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + } + break; + } + case UBIFS_CS_NODE: + break; + case UBIFS_ORPH_NODE: + { + const struct ubifs_orph_node *orph = node; + + pr_err("\tcommit number %llu\n", + (unsigned long long) + le64_to_cpu(orph->cmt_no) & LLONG_MAX); + pr_err("\tlast node flag %llu\n", + (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); + n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + pr_err("\t%d orphan inode numbers:\n", n); + for (i = 0; i < n; i++) + pr_err("\t ino %llu\n", + (unsigned long long)le64_to_cpu(orph->inos[i])); + break; + } + default: + pr_err("node type %d was not recognized\n", + (int)ch->node_type); + } + spin_unlock(&dbg_lock); +} + +void ubifs_dump_budget_req(const struct ubifs_budget_req *req) +{ + spin_lock(&dbg_lock); + pr_err("Budgeting request: new_ino %d, dirtied_ino %d\n", + req->new_ino, req->dirtied_ino); + pr_err("\tnew_ino_d %d, dirtied_ino_d %d\n", + req->new_ino_d, req->dirtied_ino_d); + pr_err("\tnew_page %d, dirtied_page %d\n", + req->new_page, req->dirtied_page); + pr_err("\tnew_dent %d, mod_dent %d\n", + req->new_dent, req->mod_dent); + pr_err("\tidx_growth %d\n", req->idx_growth); + pr_err("\tdata_growth %d dd_growth %d\n", + req->data_growth, req->dd_growth); + spin_unlock(&dbg_lock); +} + +void ubifs_dump_lstats(const struct ubifs_lp_stats *lst) +{ + spin_lock(&dbg_lock); + pr_err("(pid %d) Lprops statistics: empty_lebs %d, idx_lebs %d\n", + current->pid, lst->empty_lebs, lst->idx_lebs); + pr_err("\ttaken_empty_lebs %d, total_free %lld, total_dirty %lld\n", + lst->taken_empty_lebs, lst->total_free, lst->total_dirty); + pr_err("\ttotal_used %lld, total_dark %lld, total_dead %lld\n", + lst->total_used, lst->total_dark, lst->total_dead); + spin_unlock(&dbg_lock); +} + +void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) +{ + int i; + struct rb_node *rb; + struct ubifs_bud *bud; + struct ubifs_gced_idx_leb *idx_gc; + long long available, outstanding, free; + + spin_lock(&c->space_lock); + spin_lock(&dbg_lock); + pr_err("(pid %d) Budgeting info: data budget sum %lld, total budget sum %lld\n", + current->pid, bi->data_growth + bi->dd_growth, + bi->data_growth + bi->dd_growth + bi->idx_growth); + pr_err("\tbudg_data_growth %lld, budg_dd_growth %lld, budg_idx_growth %lld\n", + bi->data_growth, bi->dd_growth, bi->idx_growth); + pr_err("\tmin_idx_lebs %d, old_idx_sz %llu, uncommitted_idx %lld\n", + bi->min_idx_lebs, bi->old_idx_sz, bi->uncommitted_idx); + pr_err("\tpage_budget %d, inode_budget %d, dent_budget %d\n", + bi->page_budget, bi->inode_budget, bi->dent_budget); + pr_err("\tnospace %u, nospace_rp %u\n", bi->nospace, bi->nospace_rp); + pr_err("\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + + if (bi != &c->bi) + /* + * If we are dumping saved budgeting data, do not print + * additional information which is about the current state, not + * the old one which corresponded to the saved budgeting data. + */ + goto out_unlock; + + pr_err("\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", + c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); + pr_err("\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, clean_zn_cnt %ld\n", + atomic_long_read(&c->dirty_pg_cnt), + atomic_long_read(&c->dirty_zn_cnt), + atomic_long_read(&c->clean_zn_cnt)); + pr_err("\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum); + + /* If we are in R/O mode, journal heads do not exist */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) + pr_err("\tjhead %s\t LEB %d\n", + dbg_jhead(c->jheads[i].wbuf.jhead), + c->jheads[i].wbuf.lnum); + for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + pr_err("\tbud LEB %d\n", bud->lnum); + } + list_for_each_entry(bud, &c->old_buds, list) + pr_err("\told bud LEB %d\n", bud->lnum); + list_for_each_entry(idx_gc, &c->idx_gc, list) + pr_err("\tGC'ed idx LEB %d unmap %d\n", + idx_gc->lnum, idx_gc->unmap); + pr_err("\tcommit state %d\n", c->cmt_state); + + /* Print budgeting predictions */ + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; + free = ubifs_get_free_space_nolock(c); + pr_err("Budgeting predictions:\n"); + pr_err("\tavailable: %lld, outstanding %lld, free %lld\n", + available, outstanding, free); +out_unlock: + spin_unlock(&dbg_lock); + spin_unlock(&c->space_lock); +} + +void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +{ + int i, spc, dark = 0, dead = 0; + struct rb_node *rb; + struct ubifs_bud *bud; + + spc = lp->free + lp->dirty; + if (spc < c->dead_wm) + dead = spc; + else + dark = ubifs_calc_dark(c, spc); + + if (lp->flags & LPROPS_INDEX) + pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d flags %#x (", + lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc, + lp->flags); + else + pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d flags %#-4x (", + lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc, + dark, dead, (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags); + + if (lp->flags & LPROPS_TAKEN) { + if (lp->flags & LPROPS_INDEX) + pr_cont("index, taken"); + else + pr_cont("taken"); + } else { + const char *s; + + if (lp->flags & LPROPS_INDEX) { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_DIRTY_IDX: + s = "dirty index"; + break; + case LPROPS_FRDI_IDX: + s = "freeable index"; + break; + default: + s = "index"; + } + } else { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_UNCAT: + s = "not categorized"; + break; + case LPROPS_DIRTY: + s = "dirty"; + break; + case LPROPS_FREE: + s = "free"; + break; + case LPROPS_EMPTY: + s = "empty"; + break; + case LPROPS_FREEABLE: + s = "freeable"; + break; + default: + s = NULL; + break; + } + } + pr_cont("%s", s); + } + + for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + if (bud->lnum == lp->lnum) { + int head = 0; + for (i = 0; i < c->jhead_cnt; i++) { + /* + * Note, if we are in R/O mode or in the middle + * of mounting/re-mounting, the write-buffers do + * not exist. + */ + if (c->jheads && + lp->lnum == c->jheads[i].wbuf.lnum) { + pr_cont(", jhead %s", dbg_jhead(i)); + head = 1; + } + } + if (!head) + pr_cont(", bud of jhead %s", + dbg_jhead(bud->jhead)); + } + } + if (lp->lnum == c->gc_lnum) + pr_cont(", GC LEB"); + pr_cont(")\n"); +} + +void ubifs_dump_lprops(struct ubifs_info *c) +{ + int lnum, err; + struct ubifs_lprops lp; + struct ubifs_lp_stats lst; + + pr_err("(pid %d) start dumping LEB properties\n", current->pid); + ubifs_get_lp_stats(c, &lst); + ubifs_dump_lstats(&lst); + + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) { + ubifs_err(c, "cannot read lprops for LEB %d", lnum); + continue; + } + + ubifs_dump_lprop(c, &lp); + } + pr_err("(pid %d) finish dumping LEB properties\n", current->pid); +} + +void ubifs_dump_lpt_info(struct ubifs_info *c) +{ + int i; + + spin_lock(&dbg_lock); + pr_err("(pid %d) dumping LPT information\n", current->pid); + pr_err("\tlpt_sz: %lld\n", c->lpt_sz); + pr_err("\tpnode_sz: %d\n", c->pnode_sz); + pr_err("\tnnode_sz: %d\n", c->nnode_sz); + pr_err("\tltab_sz: %d\n", c->ltab_sz); + pr_err("\tlsave_sz: %d\n", c->lsave_sz); + pr_err("\tbig_lpt: %d\n", c->big_lpt); + pr_err("\tlpt_hght: %d\n", c->lpt_hght); + pr_err("\tpnode_cnt: %d\n", c->pnode_cnt); + pr_err("\tnnode_cnt: %d\n", c->nnode_cnt); + pr_err("\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + pr_err("\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + pr_err("\tlsave_cnt: %d\n", c->lsave_cnt); + pr_err("\tspace_bits: %d\n", c->space_bits); + pr_err("\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + pr_err("\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + pr_err("\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + pr_err("\tpcnt_bits: %d\n", c->pcnt_bits); + pr_err("\tlnum_bits: %d\n", c->lnum_bits); + pr_err("\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + pr_err("\tLPT head is at %d:%d\n", + c->nhead_lnum, c->nhead_offs); + pr_err("\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + pr_err("\tLPT lsave is at %d:%d\n", + c->lsave_lnum, c->lsave_offs); + for (i = 0; i < c->lpt_lebs; i++) + pr_err("\tLPT LEB %d free %d dirty %d tgc %d cmt %d\n", + i + c->lpt_first, c->ltab[i].free, c->ltab[i].dirty, + c->ltab[i].tgc, c->ltab[i].cmt); + spin_unlock(&dbg_lock); +} + +void ubifs_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) +{ + struct ubifs_scan_node *snod; + + pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n", + current->pid, sleb->lnum, offs); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + pr_err("Dumping node at LEB %d:%d len %d\n", + sleb->lnum, snod->offs, snod->len); + ubifs_dump_node(c, snod->node); + } +} + +void ubifs_dump_leb(const struct ubifs_info *c, int lnum) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + void *buf; + + pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); + return; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + ubifs_err(c, "scan error %d", (int)PTR_ERR(sleb)); + goto out; + } + + pr_err("LEB %d has %d nodes ending at %d\n", lnum, + sleb->nodes_cnt, sleb->endpt); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + pr_err("Dumping node at LEB %d:%d len %d\n", lnum, + snod->offs, snod->len); + ubifs_dump_node(c, snod->node); + } + + pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); + ubifs_scan_destroy(sleb); + +out: + vfree(buf); + return; +} + +void ubifs_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) +{ + int n; + const struct ubifs_zbranch *zbr; + char key_buf[DBG_KEY_BUF_LEN]; + + spin_lock(&dbg_lock); + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + + pr_err("znode %p, LEB %d:%d len %d parent %p iip %d level %d child_cnt %d flags %lx\n", + znode, zbr->lnum, zbr->offs, zbr->len, znode->parent, znode->iip, + znode->level, znode->child_cnt, znode->flags); + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + spin_unlock(&dbg_lock); + return; + } + + pr_err("zbranches:\n"); + for (n = 0; n < znode->child_cnt; n++) { + zbr = &znode->zbranch[n]; + if (znode->level > 0) + pr_err("\t%d: znode %p LEB %d:%d len %d key %s\n", + n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, + dbg_snprintf_key(c, &zbr->key, key_buf, + DBG_KEY_BUF_LEN)); + else + pr_err("\t%d: LNC %p LEB %d:%d len %d key %s\n", + n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, + dbg_snprintf_key(c, &zbr->key, key_buf, + DBG_KEY_BUF_LEN)); + } + spin_unlock(&dbg_lock); +} + +void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +{ + int i; + + pr_err("(pid %d) start dumping heap cat %d (%d elements)\n", + current->pid, cat, heap->cnt); + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + + pr_err("\t%d. LEB %d hpos %d free %d dirty %d flags %d\n", + i, lprops->lnum, lprops->hpos, lprops->free, + lprops->dirty, lprops->flags); + } + pr_err("(pid %d) finish dumping heap\n", current->pid); +} + +void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + pr_err("(pid %d) dumping pnode:\n", current->pid); + pr_err("\taddress %zx parent %zx cnext %zx\n", + (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); + pr_err("\tflags %lu iip %d level %d num %d\n", + pnode->flags, iip, pnode->level, pnode->num); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp = &pnode->lprops[i]; + + pr_err("\t%d: free %d dirty %d flags %d lnum %d\n", + i, lp->free, lp->dirty, lp->flags, lp->lnum); + } +} + +void ubifs_dump_tnc(struct ubifs_info *c) +{ + struct ubifs_znode *znode; + int level; + + pr_err("\n"); + pr_err("(pid %d) start dumping TNC tree\n", current->pid); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + level = znode->level; + pr_err("== Level %d ==\n", level); + while (znode) { + if (level != znode->level) { + level = znode->level; + pr_err("== Level %d ==\n", level); + } + ubifs_dump_znode(c, znode); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + } + pr_err("(pid %d) finish dumping TNC tree\n", current->pid); +} + +static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, + void *priv) +{ + ubifs_dump_znode(c, znode); + return 0; +} + +/** + * ubifs_dump_index - dump the on-flash index. + * @c: UBIFS file-system description object + * + * This function dumps whole UBIFS indexing B-tree, unlike 'ubifs_dump_tnc()' + * which dumps only in-memory znodes and does not read znodes which from flash. + */ +void ubifs_dump_index(struct ubifs_info *c) +{ + dbg_walk_index(c, NULL, dump_znode, NULL); +} + +/** + * dbg_save_space_info - save information about flash space. + * @c: UBIFS file-system description object + * + * This function saves information about UBIFS free space, dirty space, etc, in + * order to check it later. + */ +void dbg_save_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + int freeable_cnt; + + spin_lock(&c->space_lock); + memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); + memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info)); + d->saved_idx_gc_cnt = c->idx_gc_cnt; + + /* + * We use a dirty hack here and zero out @c->freeable_cnt, because it + * affects the free space calculations, and UBIFS might not know about + * all freeable eraseblocks. Indeed, we know about freeable eraseblocks + * only when we read their lprops, and we do this only lazily, upon the + * need. So at any given point of time @c->freeable_cnt might be not + * exactly accurate. + * + * Just one example about the issue we hit when we did not zero + * @c->freeable_cnt. + * 1. The file-system is mounted R/O, c->freeable_cnt is %0. We save the + * amount of free space in @d->saved_free + * 2. We re-mount R/W, which makes UBIFS to read the "lsave" + * information from flash, where we cache LEBs from various + * categories ('ubifs_remount_fs()' -> 'ubifs_lpt_init()' + * -> 'lpt_init_wr()' -> 'read_lsave()' -> 'ubifs_lpt_lookup()' + * -> 'ubifs_get_pnode()' -> 'update_cats()' + * -> 'ubifs_add_to_cat()'). + * 3. Lsave contains a freeable eraseblock, and @c->freeable_cnt + * becomes %1. + * 4. We calculate the amount of free space when the re-mount is + * finished in 'dbg_check_space_info()' and it does not match + * @d->saved_free. + */ + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + d->saved_free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; + spin_unlock(&c->space_lock); +} + +/** + * dbg_check_space_info - check flash space information. + * @c: UBIFS file-system description object + * + * This function compares current flash space information with the information + * which was saved when the 'dbg_save_space_info()' function was called. + * Returns zero if the information has not changed, and %-EINVAL it it has + * changed. + */ +int dbg_check_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + struct ubifs_lp_stats lst; + long long free; + int freeable_cnt; + + spin_lock(&c->space_lock); + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; + spin_unlock(&c->space_lock); + + if (free != d->saved_free) { + ubifs_err(c, "free space changed from %lld to %lld", + d->saved_free, free); + goto out; + } + + return 0; + +out: + ubifs_msg(c, "saved lprops statistics dump"); + ubifs_dump_lstats(&d->saved_lst); + ubifs_msg(c, "saved budgeting info dump"); + ubifs_dump_budg(c, &d->saved_bi); + ubifs_msg(c, "saved idx_gc_cnt %d", d->saved_idx_gc_cnt); + ubifs_msg(c, "current lprops statistics dump"); + ubifs_get_lp_stats(c, &lst); + ubifs_dump_lstats(&lst); + ubifs_msg(c, "current budgeting info dump"); + ubifs_dump_budg(c, &c->bi); + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_synced_i_size - check synchronized inode size. + * @c: UBIFS file-system description object + * @inode: inode to check + * + * If inode is clean, synchronized inode size has to be equivalent to current + * inode size. This function has to be called only for locked inodes (@i_mutex + * has to be locked). Returns %0 if synchronized inode size if correct, and + * %-EINVAL if not. + */ +int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) +{ + int err = 0; + struct ubifs_inode *ui = ubifs_inode(inode); + + if (!dbg_is_chk_gen(c)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + + mutex_lock(&ui->ui_mutex); + spin_lock(&ui->ui_lock); + if (ui->ui_size != ui->synced_i_size && !ui->dirty) { + ubifs_err(c, "ui_size is %lld, synced_i_size is %lld, but inode is clean", + ui->ui_size, ui->synced_i_size); + ubifs_err(c, "i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, + inode->i_mode, i_size_read(inode)); + dump_stack(); + err = -EINVAL; + } + spin_unlock(&ui->ui_lock); + mutex_unlock(&ui->ui_mutex); + return err; +} + +/* + * dbg_check_dir - check directory inode size and link count. + * @c: UBIFS file-system description object + * @dir: the directory to calculate size for + * @size: the result is returned here + * + * This function makes sure that directory size and link count are correct. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, it is good idea to make sure the @dir->i_mutex is locked before + * calling this function. + */ +int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) +{ + unsigned int nlink = 2; + union ubifs_key key; + struct ubifs_dent_node *dent, *pdent = NULL; + struct qstr nm = { .name = NULL }; + loff_t size = UBIFS_INO_NODE_SZ; + + if (!dbg_is_chk_gen(c)) + return 0; + + if (!S_ISDIR(dir->i_mode)) + return 0; + + lowest_dent_key(c, &key, dir->i_ino); + while (1) { + int err; + + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + break; + return err; + } + + nm.name = dent->name; + nm.len = le16_to_cpu(dent->nlen); + size += CALC_DENT_SIZE(nm.len); + if (dent->type == UBIFS_ITYPE_DIR) + nlink += 1; + kfree(pdent); + pdent = dent; + key_read(c, &dent->key, &key); + } + kfree(pdent); + + if (i_size_read(dir) != size) { + ubifs_err(c, "directory inode %lu has size %llu, but calculated size is %llu", + dir->i_ino, (unsigned long long)i_size_read(dir), + (unsigned long long)size); + ubifs_dump_inode(c, dir); + dump_stack(); + return -EINVAL; + } + if (dir->i_nlink != nlink) { + ubifs_err(c, "directory inode %lu has nlink %u, but calculated nlink is %u", + dir->i_ino, dir->i_nlink, nlink); + ubifs_dump_inode(c, dir); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * dbg_check_key_order - make sure that colliding keys are properly ordered. + * @c: UBIFS file-system description object + * @zbr1: first zbranch + * @zbr2: following zbranch + * + * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of + * names of the direntries/xentries which are referred by the keys. This + * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes + * sure the name of direntry/xentry referred by @zbr1 is less than + * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not, + * and a negative error code in case of failure. + */ +static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, + struct ubifs_zbranch *zbr2) +{ + int err, nlen1, nlen2, cmp; + struct ubifs_dent_node *dent1, *dent2; + union ubifs_key key; + char key_buf[DBG_KEY_BUF_LEN]; + + ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); + dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent1) + return -ENOMEM; + dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent2) { + err = -ENOMEM; + goto out_free; + } + + err = ubifs_tnc_read_node(c, zbr1, dent1); + if (err) + goto out_free; + err = ubifs_validate_entry(c, dent1); + if (err) + goto out_free; + + err = ubifs_tnc_read_node(c, zbr2, dent2); + if (err) + goto out_free; + err = ubifs_validate_entry(c, dent2); + if (err) + goto out_free; + + /* Make sure node keys are the same as in zbranch */ + err = 1; + key_read(c, &dent1->key, &key); + if (keys_cmp(c, &zbr1->key, &key)) { + ubifs_err(c, "1st entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_err(c, "but it should have key %s according to tnc", + dbg_snprintf_key(c, &zbr1->key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_dump_node(c, dent1); + goto out_free; + } + + key_read(c, &dent2->key, &key); + if (keys_cmp(c, &zbr2->key, &key)) { + ubifs_err(c, "2nd entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_err(c, "but it should have key %s according to tnc", + dbg_snprintf_key(c, &zbr2->key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_dump_node(c, dent2); + goto out_free; + } + + nlen1 = le16_to_cpu(dent1->nlen); + nlen2 = le16_to_cpu(dent2->nlen); + + cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2)); + if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) { + err = 0; + goto out_free; + } + if (cmp == 0 && nlen1 == nlen2) + ubifs_err(c, "2 xent/dent nodes with the same name"); + else + ubifs_err(c, "bad order of colliding key %s", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + + ubifs_msg(c, "first node at %d:%d\n", zbr1->lnum, zbr1->offs); + ubifs_dump_node(c, dent1); + ubifs_msg(c, "second node at %d:%d\n", zbr2->lnum, zbr2->offs); + ubifs_dump_node(c, dent2); + +out_free: + kfree(dent2); + kfree(dent1); + return err; +} + +/** + * dbg_check_znode - check if znode is all right. + * @c: UBIFS file-system description object + * @zbr: zbranch which points to this znode + * + * This function makes sure that znode referred to by @zbr is all right. + * Returns zero if it is, and %-EINVAL if it is not. + */ +static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zp = znode->parent; + int n, err, cmp; + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + err = 1; + goto out; + } + if (znode->level < 0) { + err = 2; + goto out; + } + if (znode->iip < 0 || znode->iip >= c->fanout) { + err = 3; + goto out; + } + + if (zbr->len == 0) + /* Only dirty zbranch may have no on-flash nodes */ + if (!ubifs_zn_dirty(znode)) { + err = 4; + goto out; + } + + if (ubifs_zn_dirty(znode)) { + /* + * If znode is dirty, its parent has to be dirty as well. The + * order of the operation is important, so we have to have + * memory barriers. + */ + smp_mb(); + if (zp && !ubifs_zn_dirty(zp)) { + /* + * The dirty flag is atomic and is cleared outside the + * TNC mutex, so znode's dirty flag may now have + * been cleared. The child is always cleared before the + * parent, so we just need to check again. + */ + smp_mb(); + if (ubifs_zn_dirty(znode)) { + err = 5; + goto out; + } + } + } + + if (zp) { + const union ubifs_key *min, *max; + + if (znode->level != zp->level - 1) { + err = 6; + goto out; + } + + /* Make sure the 'parent' pointer in our znode is correct */ + err = ubifs_search_zbranch(c, zp, &zbr->key, &n); + if (!err) { + /* This zbranch does not exist in the parent */ + err = 7; + goto out; + } + + if (znode->iip >= zp->child_cnt) { + err = 8; + goto out; + } + + if (znode->iip != n) { + /* This may happen only in case of collisions */ + if (keys_cmp(c, &zp->zbranch[n].key, + &zp->zbranch[znode->iip].key)) { + err = 9; + goto out; + } + n = znode->iip; + } + + /* + * Make sure that the first key in our znode is greater than or + * equal to the key in the pointing zbranch. + */ + min = &zbr->key; + cmp = keys_cmp(c, min, &znode->zbranch[0].key); + if (cmp == 1) { + err = 10; + goto out; + } + + if (n + 1 < zp->child_cnt) { + max = &zp->zbranch[n + 1].key; + + /* + * Make sure the last key in our znode is less or + * equivalent than the key in the zbranch which goes + * after our pointing zbranch. + */ + cmp = keys_cmp(c, max, + &znode->zbranch[znode->child_cnt - 1].key); + if (cmp == -1) { + err = 11; + goto out; + } + } + } else { + /* This may only be root znode */ + if (zbr != &c->zroot) { + err = 12; + goto out; + } + } + + /* + * Make sure that next key is greater or equivalent then the previous + * one. + */ + for (n = 1; n < znode->child_cnt; n++) { + cmp = keys_cmp(c, &znode->zbranch[n - 1].key, + &znode->zbranch[n].key); + if (cmp > 0) { + err = 13; + goto out; + } + if (cmp == 0) { + /* This can only be keys with colliding hash */ + if (!is_hash_key(c, &znode->zbranch[n].key)) { + err = 14; + goto out; + } + + if (znode->level != 0 || c->replaying) + continue; + + /* + * Colliding keys should follow binary order of + * corresponding xentry/dentry names. + */ + err = dbg_check_key_order(c, &znode->zbranch[n - 1], + &znode->zbranch[n]); + if (err < 0) + return err; + if (err) { + err = 15; + goto out; + } + } + } + + for (n = 0; n < znode->child_cnt; n++) { + if (!znode->zbranch[n].znode && + (znode->zbranch[n].lnum == 0 || + znode->zbranch[n].len == 0)) { + err = 16; + goto out; + } + + if (znode->zbranch[n].lnum != 0 && + znode->zbranch[n].len == 0) { + err = 17; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].len != 0) { + err = 18; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].offs != 0) { + err = 19; + goto out; + } + + if (znode->level != 0 && znode->zbranch[n].znode) + if (znode->zbranch[n].znode->parent != znode) { + err = 20; + goto out; + } + } + + return 0; + +out: + ubifs_err(c, "failed, error %d", err); + ubifs_msg(c, "dump of the znode"); + ubifs_dump_znode(c, znode); + if (zp) { + ubifs_msg(c, "dump of the parent znode"); + ubifs_dump_znode(c, zp); + } + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_tnc - check TNC tree. + * @c: UBIFS file-system description object + * @extra: do extra checks that are possible at start commit + * + * This function traverses whole TNC tree and checks every znode. Returns zero + * if everything is all right and %-EINVAL if something is wrong with TNC. + */ +int dbg_check_tnc(struct ubifs_info *c, int extra) +{ + struct ubifs_znode *znode; + long clean_cnt = 0, dirty_cnt = 0; + int err, last; + + if (!dbg_is_chk_index(c)) + return 0; + + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + if (!c->zroot.znode) + return 0; + + znode = ubifs_tnc_postorder_first(c->zroot.znode); + while (1) { + struct ubifs_znode *prev; + struct ubifs_zbranch *zbr; + + if (!znode->parent) + zbr = &c->zroot; + else + zbr = &znode->parent->zbranch[znode->iip]; + + err = dbg_check_znode(c, zbr); + if (err) + return err; + + if (extra) { + if (ubifs_zn_dirty(znode)) + dirty_cnt += 1; + else + clean_cnt += 1; + } + + prev = znode; + znode = ubifs_tnc_postorder_next(znode); + if (!znode) + break; + + /* + * If the last key of this znode is equivalent to the first key + * of the next znode (collision), then check order of the keys. + */ + last = prev->child_cnt - 1; + if (prev->level == 0 && znode->level == 0 && !c->replaying && + !keys_cmp(c, &prev->zbranch[last].key, + &znode->zbranch[0].key)) { + err = dbg_check_key_order(c, &prev->zbranch[last], + &znode->zbranch[0]); + if (err < 0) + return err; + if (err) { + ubifs_msg(c, "first znode"); + ubifs_dump_znode(c, prev); + ubifs_msg(c, "second znode"); + ubifs_dump_znode(c, znode); + return -EINVAL; + } + } + } + + if (extra) { + if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) { + ubifs_err(c, "incorrect clean_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->clean_zn_cnt), + clean_cnt); + return -EINVAL; + } + if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) { + ubifs_err(c, "incorrect dirty_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->dirty_zn_cnt), + dirty_cnt); + return -EINVAL; + } + } + + return 0; +} + +/** + * dbg_walk_index - walk the on-flash index. + * @c: UBIFS file-system description object + * @leaf_cb: called for each leaf node + * @znode_cb: called for each indexing node + * @priv: private data which is passed to callbacks + * + * This function walks the UBIFS index and calls the @leaf_cb for each leaf + * node and @znode_cb for each indexing node. Returns zero in case of success + * and a negative error code in case of failure. + * + * It would be better if this function removed every znode it pulled to into + * the TNC, so that the behavior more closely matched the non-debugging + * behavior. + */ +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv) +{ + int err; + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *child; + + mutex_lock(&c->tnc_mutex); + /* If the root indexing node is not in TNC - pull it */ + if (!c->zroot.znode) { + c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(c->zroot.znode)) { + err = PTR_ERR(c->zroot.znode); + c->zroot.znode = NULL; + goto out_unlock; + } + } + + /* + * We are going to traverse the indexing tree in the postorder manner. + * Go down and find the leftmost indexing node where we are going to + * start from. + */ + znode = c->zroot.znode; + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + + znode = child; + } + + /* Iterate over all indexing nodes */ + while (1) { + int idx; + + cond_resched(); + + if (znode_cb) { + err = znode_cb(c, znode, priv); + if (err) { + ubifs_err(c, "znode checking function returned error %d", + err); + ubifs_dump_znode(c, znode); + goto out_dump; + } + } + if (leaf_cb && znode->level == 0) { + for (idx = 0; idx < znode->child_cnt; idx++) { + zbr = &znode->zbranch[idx]; + err = leaf_cb(c, zbr, priv); + if (err) { + ubifs_err(c, "leaf checking function returned error %d, for leaf at LEB %d:%d", + err, zbr->lnum, zbr->offs); + goto out_dump; + } + } + } + + if (!znode->parent) + break; + + idx = znode->iip + 1; + znode = znode->parent; + if (idx < znode->child_cnt) { + /* Switch to the next index in the parent */ + zbr = &znode->zbranch[idx]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, idx); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } else + /* + * This is the last child, switch to the parent and + * continue. + */ + continue; + + /* Go to the lowest leftmost znode in the new sub-tree */ + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } + } + + mutex_unlock(&c->tnc_mutex); + return 0; + +out_dump: + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + ubifs_msg(c, "dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); + ubifs_dump_znode(c, znode); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * add_size - add znode size to partially calculated index size. + * @c: UBIFS file-system description object + * @znode: znode to add size for + * @priv: partially calculated index size + * + * This is a helper function for 'dbg_check_idx_size()' which is called for + * every indexing node and adds its size to the 'long long' variable pointed to + * by @priv. + */ +static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) +{ + long long *idx_size = priv; + int add; + + add = ubifs_idx_node_sz(c, znode->child_cnt); + add = ALIGN(add, 8); + *idx_size += add; + return 0; +} + +/** + * dbg_check_idx_size - check index size. + * @c: UBIFS file-system description object + * @idx_size: size to check + * + * This function walks the UBIFS index, calculates its size and checks that the + * size is equivalent to @idx_size. Returns zero in case of success and a + * negative error code in case of failure. + */ +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) +{ + int err; + long long calc = 0; + + if (!dbg_is_chk_index(c)) + return 0; + + err = dbg_walk_index(c, NULL, add_size, &calc); + if (err) { + ubifs_err(c, "error %d while walking the index", err); + return err; + } + + if (calc != idx_size) { + ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld", + calc, idx_size); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * struct fsck_inode - information about an inode used when checking the file-system. + * @rb: link in the RB-tree of inodes + * @inum: inode number + * @mode: inode type, permissions, etc + * @nlink: inode link count + * @xattr_cnt: count of extended attributes + * @references: how many directory/xattr entries refer this inode (calculated + * while walking the index) + * @calc_cnt: for directory inode count of child directories + * @size: inode size (read from on-flash inode) + * @xattr_sz: summary size of all extended attributes (read from on-flash + * inode) + * @calc_sz: for directories calculated directory size + * @calc_xcnt: count of extended attributes + * @calc_xsz: calculated summary size of all extended attributes + * @xattr_nms: sum of lengths of all extended attribute names belonging to this + * inode (read from on-flash inode) + * @calc_xnms: calculated sum of lengths of all extended attribute names + */ +struct fsck_inode { + struct rb_node rb; + ino_t inum; + umode_t mode; + unsigned int nlink; + unsigned int xattr_cnt; + int references; + int calc_cnt; + long long size; + unsigned int xattr_sz; + long long calc_sz; + long long calc_xcnt; + long long calc_xsz; + unsigned int xattr_nms; + long long calc_xnms; +}; + +/** + * struct fsck_data - private FS checking information. + * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects) + */ +struct fsck_data { + struct rb_root inodes; +}; + +/** + * add_inode - add inode information to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @ino: raw UBIFS inode to add + * + * This is a helper function for 'check_leaf()' which adds information about + * inode @ino to the RB-tree of inodes. Returns inode information pointer in + * case of success and a negative error code in case of failure. + */ +static struct fsck_inode *add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, + struct ubifs_ino_node *ino) +{ + struct rb_node **p, *parent = NULL; + struct fsck_inode *fscki; + ino_t inum = key_inum_flash(c, &ino->key); + struct inode *inode; + struct ubifs_inode *ui; + + p = &fsckd->inodes.rb_node; + while (*p) { + parent = *p; + fscki = rb_entry(parent, struct fsck_inode, rb); + if (inum < fscki->inum) + p = &(*p)->rb_left; + else if (inum > fscki->inum) + p = &(*p)->rb_right; + else + return fscki; + } + + if (inum > c->highest_inum) { + ubifs_err(c, "too high inode number, max. is %lu", + (unsigned long)c->highest_inum); + return ERR_PTR(-EINVAL); + } + + fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS); + if (!fscki) + return ERR_PTR(-ENOMEM); + + inode = ilookup(c->vfs_sb, inum); + + fscki->inum = inum; + /* + * If the inode is present in the VFS inode cache, use it instead of + * the on-flash inode which might be out-of-date. E.g., the size might + * be out-of-date. If we do not do this, the following may happen, for + * example: + * 1. A power cut happens + * 2. We mount the file-system R/O, the replay process fixes up the + * inode size in the VFS cache, but on on-flash. + * 3. 'check_leaf()' fails because it hits a data node beyond inode + * size. + */ + if (!inode) { + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + } else { + ui = ubifs_inode(inode); + fscki->nlink = inode->i_nlink; + fscki->size = inode->i_size; + fscki->xattr_cnt = ui->xattr_cnt; + fscki->xattr_sz = ui->xattr_size; + fscki->xattr_nms = ui->xattr_names; + fscki->mode = inode->i_mode; + iput(inode); + } + + if (S_ISDIR(fscki->mode)) { + fscki->calc_sz = UBIFS_INO_NODE_SZ; + fscki->calc_cnt = 2; + } + + rb_link_node(&fscki->rb, parent, p); + rb_insert_color(&fscki->rb, &fsckd->inodes); + + return fscki; +} + +/** + * search_inode - search inode in the RB-tree of inodes. + * @fsckd: FS checking information + * @inum: inode number to search + * + * This is a helper function for 'check_leaf()' which searches inode @inum in + * the RB-tree of inodes and returns an inode information pointer or %NULL if + * the inode was not found. + */ +static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum) +{ + struct rb_node *p; + struct fsck_inode *fscki; + + p = fsckd->inodes.rb_node; + while (p) { + fscki = rb_entry(p, struct fsck_inode, rb); + if (inum < fscki->inum) + p = p->rb_left; + else if (inum > fscki->inum) + p = p->rb_right; + else + return fscki; + } + return NULL; +} + +/** + * read_add_inode - read inode node and add it to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @inum: inode number to read + * + * This is a helper function for 'check_leaf()' which finds inode node @inum in + * the index, reads it, and adds it to the RB-tree of inodes. Returns inode + * information pointer in case of success and a negative error code in case of + * failure. + */ +static struct fsck_inode *read_add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, ino_t inum) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + + fscki = search_inode(fsckd, inum); + if (fscki) + return fscki; + + ino_key_init(c, &key, inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err(c, "inode %lu not found in index", (unsigned long)inum); + return ERR_PTR(-ENOENT); + } else if (err < 0) { + ubifs_err(c, "error %d while looking up inode %lu", + err, (unsigned long)inum); + return ERR_PTR(err); + } + + zbr = &znode->zbranch[n]; + if (zbr->len < UBIFS_INO_NODE_SZ) { + ubifs_err(c, "bad node %lu node length %d", + (unsigned long)inum, zbr->len); + return ERR_PTR(-EINVAL); + } + + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return ERR_PTR(-ENOMEM); + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err(c, "cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return ERR_PTR(err); + } + + fscki = add_inode(c, fsckd, ino); + kfree(ino); + if (IS_ERR(fscki)) { + ubifs_err(c, "error %ld while adding inode %lu node", + PTR_ERR(fscki), (unsigned long)inum); + return fscki; + } + + return fscki; +} + +/** + * check_leaf - check leaf node. + * @c: UBIFS file-system description object + * @zbr: zbranch of the leaf node to check + * @priv: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which is called for + * every single leaf node while walking the indexing tree. It checks that the + * leaf node referred from the indexing tree exists, has correct CRC, and does + * some other basic validation. This function is also responsible for building + * an RB-tree of inodes - it adds all inodes into the RB-tree. It also + * calculates reference count, size, etc for each inode in order to later + * compare them to the information stored inside the inodes and detect possible + * inconsistencies. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + ino_t inum; + void *node; + struct ubifs_ch *ch; + int err, type = key_type(c, &zbr->key); + struct fsck_inode *fscki; + + if (zbr->len < UBIFS_CH_SZ) { + ubifs_err(c, "bad leaf length %d (LEB %d:%d)", + zbr->len, zbr->lnum, zbr->offs); + return -EINVAL; + } + + node = kmalloc(zbr->len, GFP_NOFS); + if (!node) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) { + ubifs_err(c, "cannot read leaf node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + goto out_free; + } + + /* If this is an inode node, add it to RB-tree of inodes */ + if (type == UBIFS_INO_KEY) { + fscki = add_inode(c, priv, node); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err(c, "error %d while adding inode node", err); + goto out_dump; + } + goto out; + } + + if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY && + type != UBIFS_DATA_KEY) { + ubifs_err(c, "unexpected node type %d at LEB %d:%d", + type, zbr->lnum, zbr->offs); + err = -EINVAL; + goto out_free; + } + + ch = node; + if (le64_to_cpu(ch->sqnum) > c->max_sqnum) { + ubifs_err(c, "too high sequence number, max. is %llu", + c->max_sqnum); + err = -EINVAL; + goto out_dump; + } + + if (type == UBIFS_DATA_KEY) { + long long blk_offs; + struct ubifs_data_node *dn = node; + + ubifs_assert(zbr->len >= UBIFS_DATA_NODE_SZ); + + /* + * Search the inode node this data node belongs to and insert + * it to the RB-tree of inodes. + */ + inum = key_inum_flash(c, &dn->key); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err(c, "error %d while processing data node and trying to find inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + /* Make sure the data node is within inode size */ + blk_offs = key_block_flash(c, &dn->key); + blk_offs <<= UBIFS_BLOCK_SHIFT; + blk_offs += le32_to_cpu(dn->size); + if (blk_offs > fscki->size) { + ubifs_err(c, "data node at LEB %d:%d is not within inode size %lld", + zbr->lnum, zbr->offs, fscki->size); + err = -EINVAL; + goto out_dump; + } + } else { + int nlen; + struct ubifs_dent_node *dent = node; + struct fsck_inode *fscki1; + + ubifs_assert(zbr->len >= UBIFS_DENT_NODE_SZ); + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + /* + * Search the inode node this entry refers to and the parent + * inode node and insert them to the RB-tree of inodes. + */ + inum = le64_to_cpu(dent->inum); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err(c, "error %d while processing entry node and trying to find inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + /* Count how many direntries or xentries refers this inode */ + fscki->references += 1; + + inum = key_inum_flash(c, &dent->key); + fscki1 = read_add_inode(c, priv, inum); + if (IS_ERR(fscki1)) { + err = PTR_ERR(fscki1); + ubifs_err(c, "error %d while processing entry node and trying to find parent inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + nlen = le16_to_cpu(dent->nlen); + if (type == UBIFS_XENT_KEY) { + fscki1->calc_xcnt += 1; + fscki1->calc_xsz += CALC_DENT_SIZE(nlen); + fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size); + fscki1->calc_xnms += nlen; + } else { + fscki1->calc_sz += CALC_DENT_SIZE(nlen); + if (dent->type == UBIFS_ITYPE_DIR) + fscki1->calc_cnt += 1; + } + } + +out: + kfree(node); + return 0; + +out_dump: + ubifs_msg(c, "dump of node at LEB %d:%d", zbr->lnum, zbr->offs); + ubifs_dump_node(c, node); +out_free: + kfree(node); + return err; +} + +/** + * free_inodes - free RB-tree of inodes. + * @fsckd: FS checking information + */ +static void free_inodes(struct fsck_data *fsckd) +{ + struct fsck_inode *fscki, *n; + + rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb) + kfree(fscki); +} + +/** + * check_inodes - checks all inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which walks the + * RB-tree of inodes after the index scan has been finished, and checks that + * inode nlink, size, etc are correct. Returns zero if inodes are fine, + * %-EINVAL if not, and a negative error code in case of failure. + */ +static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + struct rb_node *this = rb_first(&fsckd->inodes); + + while (this) { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_next(this); + + if (S_ISDIR(fscki->mode)) { + /* + * Directories have to have exactly one reference (they + * cannot have hardlinks), although root inode is an + * exception. + */ + if (fscki->inum != UBIFS_ROOT_INO && + fscki->references != 1) { + ubifs_err(c, "directory inode %lu has %d direntries which refer it, but should be 1", + (unsigned long)fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->inum == UBIFS_ROOT_INO && + fscki->references != 0) { + ubifs_err(c, "root inode %lu has non-zero (%d) direntries which refer it", + (unsigned long)fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->calc_sz != fscki->size) { + ubifs_err(c, "directory inode %lu size is %lld, but calculated size is %lld", + (unsigned long)fscki->inum, + fscki->size, fscki->calc_sz); + goto out_dump; + } + if (fscki->calc_cnt != fscki->nlink) { + ubifs_err(c, "directory inode %lu nlink is %d, but calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->calc_cnt); + goto out_dump; + } + } else { + if (fscki->references != fscki->nlink) { + ubifs_err(c, "inode %lu nlink is %d, but calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->references); + goto out_dump; + } + } + if (fscki->xattr_sz != fscki->calc_xsz) { + ubifs_err(c, "inode %lu has xattr size %u, but calculated size is %lld", + (unsigned long)fscki->inum, fscki->xattr_sz, + fscki->calc_xsz); + goto out_dump; + } + if (fscki->xattr_cnt != fscki->calc_xcnt) { + ubifs_err(c, "inode %lu has %u xattrs, but calculated count is %lld", + (unsigned long)fscki->inum, + fscki->xattr_cnt, fscki->calc_xcnt); + goto out_dump; + } + if (fscki->xattr_nms != fscki->calc_xnms) { + ubifs_err(c, "inode %lu has xattr names' size %u, but calculated names' size is %lld", + (unsigned long)fscki->inum, fscki->xattr_nms, + fscki->calc_xnms); + goto out_dump; + } + } + + return 0; + +out_dump: + /* Read the bad inode and dump it */ + ino_key_init(c, &key, fscki->inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err(c, "inode %lu not found in index", + (unsigned long)fscki->inum); + return -ENOENT; + } else if (err < 0) { + ubifs_err(c, "error %d while looking up inode %lu", + err, (unsigned long)fscki->inum); + return err; + } + + zbr = &znode->zbranch[n]; + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err(c, "cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return err; + } + + ubifs_msg(c, "dump of the inode %lu sitting in LEB %d:%d", + (unsigned long)fscki->inum, zbr->lnum, zbr->offs); + ubifs_dump_node(c, ino); + kfree(ino); + return -EINVAL; +} + +/** + * dbg_check_filesystem - check the file-system. + * @c: UBIFS file-system description object + * + * This function checks the file system, namely: + * o makes sure that all leaf nodes exist and their CRCs are correct; + * o makes sure inode nlink, size, xattr size/count are correct (for all + * inodes). + * + * The function reads whole indexing tree and all nodes, so it is pretty + * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if + * not, and a negative error code in case of failure. + */ +int dbg_check_filesystem(struct ubifs_info *c) +{ + int err; + struct fsck_data fsckd; + + if (!dbg_is_chk_fs(c)) + return 0; + + fsckd.inodes = RB_ROOT; + err = dbg_walk_index(c, check_leaf, NULL, &fsckd); + if (err) + goto out_free; + + err = check_inodes(c, &fsckd); + if (err) + goto out_free; + + free_inodes(&fsckd); + return 0; + +out_free: + ubifs_err(c, "file-system check failed with error %d", err); + dump_stack(); + free_inodes(&fsckd); + return err; +} + +/** + * dbg_check_data_nodes_order - check that list of data nodes is sorted. + * @c: UBIFS file-system description object + * @head: the list of nodes ('struct ubifs_scan_node' objects) + * + * This function returns zero if the list of data nodes is sorted correctly, + * and %-EINVAL if not. + */ +int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) +{ + struct list_head *cur; + struct ubifs_scan_node *sa, *sb; + + if (!dbg_is_chk_gen(c)) + return 0; + + for (cur = head->next; cur->next != head; cur = cur->next) { + ino_t inuma, inumb; + uint32_t blka, blkb; + + cond_resched(); + sa = container_of(cur, struct ubifs_scan_node, list); + sb = container_of(cur->next, struct ubifs_scan_node, list); + + if (sa->type != UBIFS_DATA_NODE) { + ubifs_err(c, "bad node type %d", sa->type); + ubifs_dump_node(c, sa->node); + return -EINVAL; + } + if (sb->type != UBIFS_DATA_NODE) { + ubifs_err(c, "bad node type %d", sb->type); + ubifs_dump_node(c, sb->node); + return -EINVAL; + } + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma < inumb) + continue; + if (inuma > inumb) { + ubifs_err(c, "larger inum %lu goes before inum %lu", + (unsigned long)inuma, (unsigned long)inumb); + goto error_dump; + } + + blka = key_block(c, &sa->key); + blkb = key_block(c, &sb->key); + + if (blka > blkb) { + ubifs_err(c, "larger block %u goes before %u", blka, blkb); + goto error_dump; + } + if (blka == blkb) { + ubifs_err(c, "two data nodes for the same block"); + goto error_dump; + } + } + + return 0; + +error_dump: + ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sb->node); + return -EINVAL; +} + +/** + * dbg_check_nondata_nodes_order - check that list of data nodes is sorted. + * @c: UBIFS file-system description object + * @head: the list of nodes ('struct ubifs_scan_node' objects) + * + * This function returns zero if the list of non-data nodes is sorted correctly, + * and %-EINVAL if not. + */ +int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) +{ + struct list_head *cur; + struct ubifs_scan_node *sa, *sb; + + if (!dbg_is_chk_gen(c)) + return 0; + + for (cur = head->next; cur->next != head; cur = cur->next) { + ino_t inuma, inumb; + uint32_t hasha, hashb; + + cond_resched(); + sa = container_of(cur, struct ubifs_scan_node, list); + sb = container_of(cur->next, struct ubifs_scan_node, list); + + if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && + sa->type != UBIFS_XENT_NODE) { + ubifs_err(c, "bad node type %d", sa->type); + ubifs_dump_node(c, sa->node); + return -EINVAL; + } + if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && + sa->type != UBIFS_XENT_NODE) { + ubifs_err(c, "bad node type %d", sb->type); + ubifs_dump_node(c, sb->node); + return -EINVAL; + } + + if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) { + ubifs_err(c, "non-inode node goes before inode node"); + goto error_dump; + } + + if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE) + continue; + + if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) { + /* Inode nodes are sorted in descending size order */ + if (sa->len < sb->len) { + ubifs_err(c, "smaller inode node goes first"); + goto error_dump; + } + continue; + } + + /* + * This is either a dentry or xentry, which should be sorted in + * ascending (parent ino, hash) order. + */ + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma < inumb) + continue; + if (inuma > inumb) { + ubifs_err(c, "larger inum %lu goes before inum %lu", + (unsigned long)inuma, (unsigned long)inumb); + goto error_dump; + } + + hasha = key_block(c, &sa->key); + hashb = key_block(c, &sb->key); + + if (hasha > hashb) { + ubifs_err(c, "larger hash %u goes before %u", + hasha, hashb); + goto error_dump; + } + } + + return 0; + +error_dump: + ubifs_msg(c, "dumping first node"); + ubifs_dump_node(c, sa->node); + ubifs_msg(c, "dumping second node"); + ubifs_dump_node(c, sb->node); + return -EINVAL; + return 0; +} + +static inline int chance(unsigned int n, unsigned int out_of) +{ + return !!((prandom_u32() % out_of) + 1 <= n); + +} + +static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) +{ + struct ubifs_debug_info *d = c->dbg; + + ubifs_assert(dbg_is_tst_rcvry(c)); + + if (!d->pc_cnt) { + /* First call - decide delay to the power cut */ + if (chance(1, 2)) { + unsigned long delay; + + if (chance(1, 2)) { + d->pc_delay = 1; + /* Fail within 1 minute */ + delay = prandom_u32() % 60000; + d->pc_timeout = jiffies; + d->pc_timeout += msecs_to_jiffies(delay); + ubifs_warn(c, "failing after %lums", delay); + } else { + d->pc_delay = 2; + delay = prandom_u32() % 10000; + /* Fail within 10000 operations */ + d->pc_cnt_max = delay; + ubifs_warn(c, "failing after %lu calls", delay); + } + } + + d->pc_cnt += 1; + } + + /* Determine if failure delay has expired */ + if (d->pc_delay == 1 && time_before(jiffies, d->pc_timeout)) + return 0; + if (d->pc_delay == 2 && d->pc_cnt++ < d->pc_cnt_max) + return 0; + + if (lnum == UBIFS_SB_LNUM) { + if (write && chance(1, 2)) + return 0; + if (chance(19, 20)) + return 0; + ubifs_warn(c, "failing in super block LEB %d", lnum); + } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { + if (chance(19, 20)) + return 0; + ubifs_warn(c, "failing in master LEB %d", lnum); + } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { + if (write && chance(99, 100)) + return 0; + if (chance(399, 400)) + return 0; + ubifs_warn(c, "failing in log LEB %d", lnum); + } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { + if (write && chance(7, 8)) + return 0; + if (chance(19, 20)) + return 0; + ubifs_warn(c, "failing in LPT LEB %d", lnum); + } else if (lnum >= c->orph_first && lnum <= c->orph_last) { + if (write && chance(1, 2)) + return 0; + if (chance(9, 10)) + return 0; + ubifs_warn(c, "failing in orphan LEB %d", lnum); + } else if (lnum == c->ihead_lnum) { + if (chance(99, 100)) + return 0; + ubifs_warn(c, "failing in index head LEB %d", lnum); + } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { + if (chance(9, 10)) + return 0; + ubifs_warn(c, "failing in GC head LEB %d", lnum); + } else if (write && !RB_EMPTY_ROOT(&c->buds) && + !ubifs_search_bud(c, lnum)) { + if (chance(19, 20)) + return 0; + ubifs_warn(c, "failing in non-bud LEB %d", lnum); + } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || + c->cmt_state == COMMIT_RUNNING_REQUIRED) { + if (chance(999, 1000)) + return 0; + ubifs_warn(c, "failing in bud LEB %d commit running", lnum); + } else { + if (chance(9999, 10000)) + return 0; + ubifs_warn(c, "failing in bud LEB %d commit not running", lnum); + } + + d->pc_happened = 1; + ubifs_warn(c, "========== Power cut emulated =========="); + dump_stack(); + return 1; +} + +static int corrupt_data(const struct ubifs_info *c, const void *buf, + unsigned int len) +{ + unsigned int from, to, ffs = chance(1, 2); + unsigned char *p = (void *)buf; + + from = prandom_u32() % len; + /* Corruption span max to end of write unit */ + to = min(len, ALIGN(from + 1, c->max_write_size)); + + ubifs_warn(c, "filled bytes %u-%u with %s", from, to - 1, + ffs ? "0xFFs" : "random data"); + + if (ffs) + memset(p + from, 0xFF, to - from); + else + prandom_bytes(p + from, to - from); + + return to; +} + +int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, + int offs, int len) +{ + int err, failing; + + if (c->dbg->pc_happened) + return -EROFS; + + failing = power_cut_emulated(c, lnum, 1); + if (failing) { + len = corrupt_data(c, buf, len); + ubifs_warn(c, "actually write %d bytes to LEB %d:%d (the buffer was corrupted)", + len, lnum, offs); + } + err = ubi_leb_write(c->ubi, lnum, buf, offs, len); + if (err) + return err; + if (failing) + return -EROFS; + return 0; +} + +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, + int len) +{ + int err; + + if (c->dbg->pc_happened) + return -EROFS; + if (power_cut_emulated(c, lnum, 1)) + return -EROFS; + err = ubi_leb_change(c->ubi, lnum, buf, len); + if (err) + return err; + if (power_cut_emulated(c, lnum, 1)) + return -EROFS; + return 0; +} + +int dbg_leb_unmap(struct ubifs_info *c, int lnum) +{ + int err; + + if (c->dbg->pc_happened) + return -EROFS; + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + err = ubi_leb_unmap(c->ubi, lnum); + if (err) + return err; + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + return 0; +} + +int dbg_leb_map(struct ubifs_info *c, int lnum) +{ + int err; + + if (c->dbg->pc_happened) + return -EROFS; + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + err = ubi_leb_map(c->ubi, lnum); + if (err) + return err; + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + return 0; +} + +/* + * Root directory for UBIFS stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular file-system mounts. + */ +static struct dentry *dfs_rootdir; + +static int dfs_file_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +/** + * provide_user_output - provide output to the user reading a debugfs file. + * @val: boolean value for the answer + * @u: the buffer to store the answer at + * @count: size of the buffer + * @ppos: position in the @u output buffer + * + * This is a simple helper function which stores @val boolean value in the user + * buffer when the user reads one of UBIFS debugfs files. Returns amount of + * bytes written to @u in case of success and a negative error code in case of + * failure. + */ +static int provide_user_output(int val, char __user *u, size_t count, + loff_t *ppos) +{ + char buf[3]; + + if (val) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + + return simple_read_from_buffer(u, count, ppos, buf, 2); +} + +static ssize_t dfs_file_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + int val; + + if (dent == d->dfs_chk_gen) + val = d->chk_gen; + else if (dent == d->dfs_chk_index) + val = d->chk_index; + else if (dent == d->dfs_chk_orph) + val = d->chk_orph; + else if (dent == d->dfs_chk_lprops) + val = d->chk_lprops; + else if (dent == d->dfs_chk_fs) + val = d->chk_fs; + else if (dent == d->dfs_tst_rcvry) + val = d->tst_rcvry; + else if (dent == d->dfs_ro_error) + val = c->ro_error; + else + return -EINVAL; + + return provide_user_output(val, u, count, ppos); +} + +/** + * interpret_user_input - interpret user debugfs file input. + * @u: user-provided buffer with the input + * @count: buffer size + * + * This is a helper function which interpret user input to a boolean UBIFS + * debugfs file. Returns %0 or %1 in case of success and a negative error code + * in case of failure. + */ +static int interpret_user_input(const char __user *u, size_t count) +{ + size_t buf_size; + char buf[8]; + + buf_size = min_t(size_t, count, (sizeof(buf) - 1)); + if (copy_from_user(buf, u, buf_size)) + return -EFAULT; + + if (buf[0] == '1') + return 1; + else if (buf[0] == '0') + return 0; + + return -EINVAL; +} + +static ssize_t dfs_file_write(struct file *file, const char __user *u, + size_t count, loff_t *ppos) +{ + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + struct dentry *dent = file->f_path.dentry; + int val; + + /* + * TODO: this is racy - the file-system might have already been + * unmounted and we'd oops in this case. The plan is to fix it with + * help of 'iterate_supers_type()' which we should have in v3.0: when + * a debugfs opened, we rember FS's UUID in file->private_data. Then + * whenever we access the FS via a debugfs file, we iterate all UBIFS + * superblocks and fine the one with the same UUID, and take the + * locking right. + * + * The other way to go suggested by Al Viro is to create a separate + * 'ubifs-debug' file-system instead. + */ + if (file->f_path.dentry == d->dfs_dump_lprops) { + ubifs_dump_lprops(c); + return count; + } + if (file->f_path.dentry == d->dfs_dump_budg) { + ubifs_dump_budg(c, &c->bi); + return count; + } + if (file->f_path.dentry == d->dfs_dump_tnc) { + mutex_lock(&c->tnc_mutex); + ubifs_dump_tnc(c); + mutex_unlock(&c->tnc_mutex); + return count; + } + + val = interpret_user_input(u, count); + if (val < 0) + return val; + + if (dent == d->dfs_chk_gen) + d->chk_gen = val; + else if (dent == d->dfs_chk_index) + d->chk_index = val; + else if (dent == d->dfs_chk_orph) + d->chk_orph = val; + else if (dent == d->dfs_chk_lprops) + d->chk_lprops = val; + else if (dent == d->dfs_chk_fs) + d->chk_fs = val; + else if (dent == d->dfs_tst_rcvry) + d->tst_rcvry = val; + else if (dent == d->dfs_ro_error) + c->ro_error = !!val; + else + return -EINVAL; + + return count; +} + +static const struct file_operations dfs_fops = { + .open = dfs_file_open, + .read = dfs_file_read, + .write = dfs_file_write, + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +/** + * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance. + * @c: UBIFS file-system description object + * + * This function creates all debugfs files for this instance of UBIFS. Returns + * zero in case of success and a negative error code in case of failure. + * + * Note, the only reason we have not merged this function with the + * 'ubifs_debugging_init()' function is because it is better to initialize + * debugfs interfaces at the very end of the mount process, and remove them at + * the very beginning of the mount process. + */ +int dbg_debugfs_init_fs(struct ubifs_info *c) +{ + int err, n; + const char *fname; + struct dentry *dent; + struct ubifs_debug_info *d = c->dbg; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return 0; + + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + c->vi.ubi_num, c->vi.vol_id); + if (n == UBIFS_DFS_DIR_LEN) { + /* The array size is too small */ + fname = UBIFS_DFS_DIR_NAME; + dent = ERR_PTR(-EINVAL); + goto out; + } + + fname = d->dfs_dir_name; + dent = debugfs_create_dir(fname, dfs_rootdir); + if (IS_ERR_OR_NULL(dent)) + goto out; + d->dfs_dir = dent; + + fname = "dump_lprops"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_lprops = dent; + + fname = "dump_budg"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_budg = dent; + + fname = "dump_tnc"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_tnc = dent; + + fname = "chk_general"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_gen = dent; + + fname = "chk_index"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_index = dent; + + fname = "chk_orphans"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_orph = dent; + + fname = "chk_lprops"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_lprops = dent; + + fname = "chk_fs"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_fs = dent; + + fname = "tst_recovery"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_tst_rcvry = dent; + + fname = "ro_error"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_ro_error = dent; + + return 0; + +out_remove: + debugfs_remove_recursive(d->dfs_dir); +out: + err = dent ? PTR_ERR(dent) : -ENODEV; + ubifs_err(c, "cannot create \"%s\" debugfs file or directory, error %d\n", + fname, err); + return err; +} + +/** + * dbg_debugfs_exit_fs - remove all debugfs files. + * @c: UBIFS file-system description object + */ +void dbg_debugfs_exit_fs(struct ubifs_info *c) +{ + if (IS_ENABLED(CONFIG_DEBUG_FS)) + debugfs_remove_recursive(c->dbg->dfs_dir); +} + +struct ubifs_global_debug_info ubifs_dbg; + +static struct dentry *dfs_chk_gen; +static struct dentry *dfs_chk_index; +static struct dentry *dfs_chk_orph; +static struct dentry *dfs_chk_lprops; +static struct dentry *dfs_chk_fs; +static struct dentry *dfs_tst_rcvry; + +static ssize_t dfs_global_file_read(struct file *file, char __user *u, + size_t count, loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + int val; + + if (dent == dfs_chk_gen) + val = ubifs_dbg.chk_gen; + else if (dent == dfs_chk_index) + val = ubifs_dbg.chk_index; + else if (dent == dfs_chk_orph) + val = ubifs_dbg.chk_orph; + else if (dent == dfs_chk_lprops) + val = ubifs_dbg.chk_lprops; + else if (dent == dfs_chk_fs) + val = ubifs_dbg.chk_fs; + else if (dent == dfs_tst_rcvry) + val = ubifs_dbg.tst_rcvry; + else + return -EINVAL; + + return provide_user_output(val, u, count, ppos); +} + +static ssize_t dfs_global_file_write(struct file *file, const char __user *u, + size_t count, loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + int val; + + val = interpret_user_input(u, count); + if (val < 0) + return val; + + if (dent == dfs_chk_gen) + ubifs_dbg.chk_gen = val; + else if (dent == dfs_chk_index) + ubifs_dbg.chk_index = val; + else if (dent == dfs_chk_orph) + ubifs_dbg.chk_orph = val; + else if (dent == dfs_chk_lprops) + ubifs_dbg.chk_lprops = val; + else if (dent == dfs_chk_fs) + ubifs_dbg.chk_fs = val; + else if (dent == dfs_tst_rcvry) + ubifs_dbg.tst_rcvry = val; + else + return -EINVAL; + + return count; +} + +static const struct file_operations dfs_global_fops = { + .read = dfs_global_file_read, + .write = dfs_global_file_write, + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +/** + * dbg_debugfs_init - initialize debugfs file-system. + * + * UBIFS uses debugfs file-system to expose various debugging knobs to + * user-space. This function creates "ubifs" directory in the debugfs + * file-system. Returns zero in case of success and a negative error code in + * case of failure. + */ +int dbg_debugfs_init(void) +{ + int err; + const char *fname; + struct dentry *dent; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return 0; + + fname = "ubifs"; + dent = debugfs_create_dir(fname, NULL); + if (IS_ERR_OR_NULL(dent)) + goto out; + dfs_rootdir = dent; + + fname = "chk_general"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_gen = dent; + + fname = "chk_index"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_index = dent; + + fname = "chk_orphans"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_orph = dent; + + fname = "chk_lprops"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_lprops = dent; + + fname = "chk_fs"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_fs = dent; + + fname = "tst_recovery"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_tst_rcvry = dent; + + return 0; + +out_remove: + debugfs_remove_recursive(dfs_rootdir); +out: + err = dent ? PTR_ERR(dent) : -ENODEV; + pr_err("UBIFS error (pid %d): cannot create \"%s\" debugfs file or directory, error %d\n", + current->pid, fname, err); + return err; +} + +/** + * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. + */ +void dbg_debugfs_exit(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_FS)) + debugfs_remove_recursive(dfs_rootdir); +} + +/** + * ubifs_debugging_init - initialize UBIFS debugging. + * @c: UBIFS file-system description object + * + * This function initializes debugging-related data for the file system. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_debugging_init(struct ubifs_info *c) +{ + c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); + if (!c->dbg) + return -ENOMEM; + + return 0; +} + +/** + * ubifs_debugging_exit - free debugging data. + * @c: UBIFS file-system description object + */ +void ubifs_debugging_exit(struct ubifs_info *c) +{ + kfree(c->dbg); +} diff --git a/kernel/fs/ubifs/debug.h b/kernel/fs/ubifs/debug.h new file mode 100644 index 000000000..e03d51797 --- /dev/null +++ b/kernel/fs/ubifs/debug.h @@ -0,0 +1,315 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_DEBUG_H__ +#define __UBIFS_DEBUG_H__ + +/* Checking helper functions */ +typedef int (*dbg_leaf_callback)(struct ubifs_info *c, + struct ubifs_zbranch *zbr, void *priv); +typedef int (*dbg_znode_callback)(struct ubifs_info *c, + struct ubifs_znode *znode, void *priv); + +/* + * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi" + * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. + */ +#define UBIFS_DFS_DIR_NAME "ubi%d_%d" +#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) + +/** + * ubifs_debug_info - per-FS debugging information. + * @old_zroot: old index root - used by 'dbg_check_old_index()' + * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' + * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' + * + * @pc_happened: non-zero if an emulated power cut happened + * @pc_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @pc_timeout: time in jiffies when delay of failure mode expires + * @pc_cnt: current number of calls to failure mode I/O functions + * @pc_cnt_max: number of calls by which to delay failure mode + * + * @chk_lpt_sz: used by LPT tree size checker + * @chk_lpt_sz2: used by LPT tree size checker + * @chk_lpt_wastage: used by LPT tree size checker + * @chk_lpt_lebs: used by LPT tree size checker + * @new_nhead_offs: used by LPT tree size checker + * @new_ihead_lnum: used by debugging to check @c->ihead_lnum + * @new_ihead_offs: used by debugging to check @c->ihead_offs + * + * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') + * @saved_bi: saved budgeting information + * @saved_free: saved amount of free space + * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt + * + * @chk_gen: if general extra checks are enabled + * @chk_index: if index xtra checks are enabled + * @chk_orph: if orphans extra checks are enabled + * @chk_lprops: if lprops extra checks are enabled + * @chk_fs: if UBIFS contents extra checks are enabled + * @tst_rcvry: if UBIFS recovery testing mode enabled + * + * @dfs_dir_name: name of debugfs directory containing this file-system's files + * @dfs_dir: direntry object of the file-system debugfs directory + * @dfs_dump_lprops: "dump lprops" debugfs knob + * @dfs_dump_budg: "dump budgeting information" debugfs knob + * @dfs_dump_tnc: "dump TNC" debugfs knob + * @dfs_chk_gen: debugfs knob to enable UBIFS general extra checks + * @dfs_chk_index: debugfs knob to enable UBIFS index extra checks + * @dfs_chk_orph: debugfs knob to enable UBIFS orphans extra checks + * @dfs_chk_lprops: debugfs knob to enable UBIFS LEP properties extra checks + * @dfs_chk_fs: debugfs knob to enable UBIFS contents extra checks + * @dfs_tst_rcvry: debugfs knob to enable UBIFS recovery testing + * @dfs_ro_error: debugfs knob to switch UBIFS to R/O mode (different to + * re-mounting to R/O mode because it does not flush any buffers + * and UBIFS just starts returning -EROFS on all write + * operations) + */ +struct ubifs_debug_info { + struct ubifs_zbranch old_zroot; + int old_zroot_level; + unsigned long long old_zroot_sqnum; + + int pc_happened; + int pc_delay; + unsigned long pc_timeout; + unsigned int pc_cnt; + unsigned int pc_cnt_max; + + long long chk_lpt_sz; + long long chk_lpt_sz2; + long long chk_lpt_wastage; + int chk_lpt_lebs; + int new_nhead_offs; + int new_ihead_lnum; + int new_ihead_offs; + + struct ubifs_lp_stats saved_lst; + struct ubifs_budg_info saved_bi; + long long saved_free; + int saved_idx_gc_cnt; + + unsigned int chk_gen:1; + unsigned int chk_index:1; + unsigned int chk_orph:1; + unsigned int chk_lprops:1; + unsigned int chk_fs:1; + unsigned int tst_rcvry:1; + + char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1]; + struct dentry *dfs_dir; + struct dentry *dfs_dump_lprops; + struct dentry *dfs_dump_budg; + struct dentry *dfs_dump_tnc; + struct dentry *dfs_chk_gen; + struct dentry *dfs_chk_index; + struct dentry *dfs_chk_orph; + struct dentry *dfs_chk_lprops; + struct dentry *dfs_chk_fs; + struct dentry *dfs_tst_rcvry; + struct dentry *dfs_ro_error; +}; + +/** + * ubifs_global_debug_info - global (not per-FS) UBIFS debugging information. + * + * @chk_gen: if general extra checks are enabled + * @chk_index: if index xtra checks are enabled + * @chk_orph: if orphans extra checks are enabled + * @chk_lprops: if lprops extra checks are enabled + * @chk_fs: if UBIFS contents extra checks are enabled + * @tst_rcvry: if UBIFS recovery testing mode enabled + */ +struct ubifs_global_debug_info { + unsigned int chk_gen:1; + unsigned int chk_index:1; + unsigned int chk_orph:1; + unsigned int chk_lprops:1; + unsigned int chk_fs:1; + unsigned int tst_rcvry:1; +}; + +#define ubifs_assert(expr) do { \ + if (unlikely(!(expr))) { \ + pr_crit("UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ + dump_stack(); \ + } \ +} while (0) + +#define ubifs_assert_cmt_locked(c) do { \ + if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ + up_write(&(c)->commit_sem); \ + pr_crit("commit lock is not locked!\n"); \ + ubifs_assert(0); \ + } \ +} while (0) + +#define ubifs_dbg_msg(type, fmt, ...) \ + pr_debug("UBIFS DBG " type " (pid %d): " fmt "\n", current->pid, \ + ##__VA_ARGS__) + +#define DBG_KEY_BUF_LEN 48 +#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \ + char __tmp_key_buf[DBG_KEY_BUF_LEN]; \ + pr_debug("UBIFS DBG " type " (pid %d): " fmt "%s\n", current->pid, \ + ##__VA_ARGS__, \ + dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \ +} while (0) + +/* General messages */ +#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) +/* Additional journal messages */ +#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) +#define dbg_jnlk(key, fmt, ...) \ + ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__) +/* Additional TNC messages */ +#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) +#define dbg_tnck(key, fmt, ...) \ + ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__) +/* Additional lprops messages */ +#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) +/* Additional LEB find messages */ +#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) +/* Additional mount messages */ +#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) +#define dbg_mntk(key, fmt, ...) \ + ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__) +/* Additional I/O messages */ +#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) +/* Additional commit messages */ +#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__) +/* Additional budgeting messages */ +#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__) +/* Additional log messages */ +#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__) +/* Additional gc messages */ +#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__) +/* Additional scan messages */ +#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__) +/* Additional recovery messages */ +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) + +extern struct ubifs_global_debug_info ubifs_dbg; + +static inline int dbg_is_chk_gen(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_gen || c->dbg->chk_gen); +} +static inline int dbg_is_chk_index(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_index || c->dbg->chk_index); +} +static inline int dbg_is_chk_orph(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_orph || c->dbg->chk_orph); +} +static inline int dbg_is_chk_lprops(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_lprops || c->dbg->chk_lprops); +} +static inline int dbg_is_chk_fs(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_fs || c->dbg->chk_fs); +} +static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.tst_rcvry || c->dbg->tst_rcvry); +} +static inline int dbg_is_power_cut(const struct ubifs_info *c) +{ + return !!c->dbg->pc_happened; +} + +int ubifs_debugging_init(struct ubifs_info *c); +void ubifs_debugging_exit(struct ubifs_info *c); + +/* Dump functions */ +const char *dbg_ntype(int type); +const char *dbg_cstate(int cmt_state); +const char *dbg_jhead(int jhead); +const char *dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key); +const char *dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, int len); +void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode); +void ubifs_dump_node(const struct ubifs_info *c, const void *node); +void ubifs_dump_budget_req(const struct ubifs_budget_req *req); +void ubifs_dump_lstats(const struct ubifs_lp_stats *lst); +void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); +void ubifs_dump_lprop(const struct ubifs_info *c, + const struct ubifs_lprops *lp); +void ubifs_dump_lprops(struct ubifs_info *c); +void ubifs_dump_lpt_info(struct ubifs_info *c); +void ubifs_dump_leb(const struct ubifs_info *c, int lnum); +void ubifs_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); +void ubifs_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + int cat); +void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void ubifs_dump_tnc(struct ubifs_info *c); +void ubifs_dump_index(struct ubifs_info *c); +void ubifs_dump_lpt_lebs(const struct ubifs_info *c); + +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv); + +/* Checking functions */ +void dbg_save_space_info(struct ubifs_info *c); +int dbg_check_space_info(struct ubifs_info *c); +int dbg_check_lprops(struct ubifs_info *c); +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_cats(struct ubifs_info *c); +int dbg_check_ltab(struct ubifs_info *c); +int dbg_chk_lpt_free_spc(struct ubifs_info *c); +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); +int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode); +int dbg_check_dir(struct ubifs_info *c, const struct inode *dir); +int dbg_check_tnc(struct ubifs_info *c, int extra); +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); +int dbg_check_filesystem(struct ubifs_info *c); +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos); +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col); +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size); +int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); +int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); + +int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len); +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); +int dbg_leb_unmap(struct ubifs_info *c, int lnum); +int dbg_leb_map(struct ubifs_info *c, int lnum); + +/* Debugfs-related stuff */ +int dbg_debugfs_init(void); +void dbg_debugfs_exit(void); +int dbg_debugfs_init_fs(struct ubifs_info *c); +void dbg_debugfs_exit_fs(struct ubifs_info *c); + +#endif /* !__UBIFS_DEBUG_H__ */ diff --git a/kernel/fs/ubifs/dir.c b/kernel/fs/ubifs/dir.c new file mode 100644 index 000000000..27060fc85 --- /dev/null +++ b/kernel/fs/ubifs/dir.c @@ -0,0 +1,1202 @@ +/* * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements directory operations. + * + * All FS operations in this file allocate budget before writing anything to the + * media. If they fail to allocate it, the error is returned. The only + * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even + * if they unable to allocate the budget, because deletion %-ENOSPC failure is + * not what users are usually ready to get. UBIFS budgeting subsystem has some + * space reserved for these purposes. + * + * All operations in this file write all inodes which they change straight + * away, instead of marking them dirty. For example, 'ubifs_link()' changes + * @i_size of the parent inode and writes the parent inode together with the + * target inode. This was done to simplify file-system recovery which would + * otherwise be very difficult to do. The only exception is rename which marks + * the re-named inode dirty (because its @i_ctime is updated) but does not + * write it, but just marks it as dirty. + */ + +#include "ubifs.h" + +/** + * inherit_flags - inherit flags of the parent inode. + * @dir: parent inode + * @mode: new inode mode flags + * + * This is a helper function for 'ubifs_new_inode()' which inherits flag of the + * parent directory inode @dir. UBIFS inodes inherit the following flags: + * o %UBIFS_COMPR_FL, which is useful to switch compression on/of on + * sub-directory basis; + * o %UBIFS_SYNC_FL - useful for the same reasons; + * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories. + * + * This function returns the inherited flags. + */ +static int inherit_flags(const struct inode *dir, umode_t mode) +{ + int flags; + const struct ubifs_inode *ui = ubifs_inode(dir); + + if (!S_ISDIR(dir->i_mode)) + /* + * The parent is not a directory, which means that an extended + * attribute inode is being created. No flags. + */ + return 0; + + flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL); + if (!S_ISDIR(mode)) + /* The "DIRSYNC" flag only applies to directories */ + flags &= ~UBIFS_DIRSYNC_FL; + return flags; +} + +/** + * ubifs_new_inode - allocate new UBIFS inode object. + * @c: UBIFS file-system description object + * @dir: parent directory inode + * @mode: inode mode flags + * + * This function finds an unused inode number, allocates new inode and + * initializes it. Returns new inode in case of success and an error code in + * case of failure. + */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + umode_t mode) +{ + struct inode *inode; + struct ubifs_inode *ui; + + inode = new_inode(c->vfs_sb); + ui = ubifs_inode(inode); + if (!inode) + return ERR_PTR(-ENOMEM); + + /* + * Set 'S_NOCMTIME' to prevent VFS form updating [mc]time of inodes and + * marking them dirty in file write path (see 'file_update_time()'). + * UBIFS has to fully control "clean <-> dirty" transitions of inodes + * to make budgeting work. + */ + inode->i_flags |= S_NOCMTIME; + + inode_init_owner(inode, dir, mode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + ubifs_current_time(inode); + inode->i_mapping->nrpages = 0; + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ; + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + break; + case S_IFSOCK: + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + inode->i_op = &ubifs_file_inode_operations; + break; + default: + BUG(); + } + + ui->flags = inherit_flags(dir, mode); + ubifs_set_inode_flags(inode); + if (S_ISREG(mode)) + ui->compr_type = c->default_compr; + else + ui->compr_type = UBIFS_COMPR_NONE; + ui->synced_i_size = 0; + + spin_lock(&c->cnt_lock); + /* Inode number overflow is currently not supported */ + if (c->highest_inum >= INUM_WARN_WATERMARK) { + if (c->highest_inum >= INUM_WATERMARK) { + spin_unlock(&c->cnt_lock); + ubifs_err(c, "out of inode numbers"); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } + ubifs_warn(c, "running out of inode numbers (current %lu, max %u)", + (unsigned long)c->highest_inum, INUM_WATERMARK); + } + + inode->i_ino = ++c->highest_inum; + /* + * The creation sequence number remains with this inode for its + * lifetime. All nodes for this inode have a greater sequence number, + * and so it is possible to distinguish obsolete nodes belonging to a + * previous incarnation of the same inode number - for example, for the + * purpose of rebuilding the index. + */ + ui->creat_sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + return inode; +} + +static int dbg_check_name(const struct ubifs_info *c, + const struct ubifs_dent_node *dent, + const struct qstr *nm) +{ + if (!dbg_is_chk_gen(c)) + return 0; + if (le16_to_cpu(dent->nlen) != nm->len) + return -EINVAL; + if (memcmp(dent->name, nm->name, nm->len)) + return -EINVAL; + return 0; +} + +static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int err; + union ubifs_key key; + struct inode *inode = NULL; + struct ubifs_dent_node *dent; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); + + if (dentry->d_name.len > UBIFS_MAX_NLEN) + return ERR_PTR(-ENAMETOOLONG); + + dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent) + return ERR_PTR(-ENOMEM); + + dent_key_init(c, &key, dir->i_ino, &dentry->d_name); + + err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); + if (err) { + if (err == -ENOENT) { + dbg_gen("not found"); + goto done; + } + goto out; + } + + if (dbg_check_name(c, dent, &dentry->d_name)) { + err = -EINVAL; + goto out; + } + + inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); + if (IS_ERR(inode)) { + /* + * This should not happen. Probably the file-system needs + * checking. + */ + err = PTR_ERR(inode); + ubifs_err(c, "dead directory entry '%pd', error %d", + dentry, err); + ubifs_ro_mode(c, err); + goto out; + } + +done: + kfree(dent); + /* + * Note, d_splice_alias() would be required instead if we supported + * NFS. + */ + d_add(dentry, inode); + return NULL; + +out: + kfree(dent); + return ERR_PTR(err); +} + +static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1 }; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + + /* + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + ubifs_err(c, "cannot create regular file, error %d", err); + return err; +} + +/** + * vfs_dent_type - get VFS directory entry type. + * @type: UBIFS directory entry type + * + * This function converts UBIFS directory entry type into VFS directory entry + * type. + */ +static unsigned int vfs_dent_type(uint8_t type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return DT_REG; + case UBIFS_ITYPE_DIR: + return DT_DIR; + case UBIFS_ITYPE_LNK: + return DT_LNK; + case UBIFS_ITYPE_BLK: + return DT_BLK; + case UBIFS_ITYPE_CHR: + return DT_CHR; + case UBIFS_ITYPE_FIFO: + return DT_FIFO; + case UBIFS_ITYPE_SOCK: + return DT_SOCK; + default: + BUG(); + } + return 0; +} + +/* + * The classical Unix view for directory is that it is a linear array of + * (name, inode number) entries. Linux/VFS assumes this model as well. + * Particularly, 'readdir()' call wants us to return a directory entry offset + * which later may be used to continue 'readdir()'ing the directory or to + * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this + * model because directory entries are identified by keys, which may collide. + * + * UBIFS uses directory entry hash value for directory offsets, so + * 'seekdir()'/'telldir()' may not always work because of possible key + * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work + * properly by means of saving full directory entry name in the private field + * of the file description object. + * + * This means that UBIFS cannot support NFS which requires full + * 'seekdir()'/'telldir()' support. + */ +static int ubifs_readdir(struct file *file, struct dir_context *ctx) +{ + int err; + struct qstr nm; + union ubifs_key key; + struct ubifs_dent_node *dent; + struct inode *dir = file_inode(file); + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); + + if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2) + /* + * The directory was seek'ed to a senseless position or there + * are no more entries. + */ + return 0; + + if (file->f_version == 0) { + /* + * The file was seek'ed, which means that @file->private_data + * is now invalid. This may also be just the first + * 'ubifs_readdir()' invocation, in which case + * @file->private_data is NULL, and the below code is + * basically a no-op. + */ + kfree(file->private_data); + file->private_data = NULL; + } + + /* + * 'generic_file_llseek()' unconditionally sets @file->f_version to + * zero, and we use this for detecting whether the file was seek'ed. + */ + file->f_version = 1; + + /* File positions 0 and 1 correspond to "." and ".." */ + if (ctx->pos < 2) { + ubifs_assert(!file->private_data); + if (!dir_emit_dots(file, ctx)) + return 0; + + /* Find the first entry in TNC and save it */ + lowest_dent_key(c, &key, dir->i_ino); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + ctx->pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + dent = file->private_data; + if (!dent) { + /* + * The directory was seek'ed to and is now readdir'ed. + * Find the entry corresponding to @ctx->pos or the closest one. + */ + dent_key_init_hash(c, &key, dir->i_ino, ctx->pos); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + ctx->pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + while (1) { + dbg_gen("feed '%s', ino %llu, new f_pos %#x", + dent->name, (unsigned long long)le64_to_cpu(dent->inum), + key_hash_flash(c, &dent->key)); + ubifs_assert(le64_to_cpu(dent->ch.sqnum) > + ubifs_inode(dir)->creat_sqnum); + + nm.len = le16_to_cpu(dent->nlen); + if (!dir_emit(ctx, dent->name, nm.len, + le64_to_cpu(dent->inum), + vfs_dent_type(dent->type))) + return 0; + + /* Switch to the next entry */ + key_read(c, &dent->key, &key); + nm.name = dent->name; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + kfree(file->private_data); + ctx->pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + cond_resched(); + } + +out: + if (err != -ENOENT) { + ubifs_err(c, "cannot find next direntry, error %d", err); + return err; + } + + kfree(file->private_data); + file->private_data = NULL; + /* 2 is a special value indicating that there are no more direntries */ + ctx->pos = 2; + return 0; +} + +/* Free saved readdir() state when the directory is closed */ +static int ubifs_dir_release(struct inode *dir, struct file *file) +{ + kfree(file->private_data); + file->private_data = NULL; + return 0; +} + +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = d_inode(old_dentry); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + /* + * Budget request settings: new direntry, changing the target inode, + * changing the parent inode. + */ + + dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", + dentry, inode->i_ino, + inode->i_nlink, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + + err = dbg_check_synced_i_size(c, inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + lock_2_inodes(dir, inode); + inc_nlink(inode); + ihold(inode); + inode->i_ctime = ubifs_current_time(inode); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(inode); + unlock_2_inodes(dir, inode); + ubifs_release_budget(c, &req); + iput(inode); + return err; +} + +static int ubifs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = d_inode(dentry); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + unsigned int saved_nlink = inode->i_nlink; + + /* + * Budget request settings: deletion direntry, deletion inode (+1 for + * @dirtied_ino), changing the parent directory inode. If budgeting + * fails, go ahead anyway because we have extra space reserved for + * deletions. + */ + + dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", + dentry, inode->i_ino, + inode->i_nlink, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + err = dbg_check_synced_i_size(c, inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + drop_nlink(inode); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + set_nlink(inode, saved_nlink); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_dir_empty - check if a directory is empty or not. + * @c: UBIFS file-system description object + * @dir: VFS inode object of the directory to check + * + * This function checks if directory @dir is empty. Returns zero if the + * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes + * in case of of errors. + */ +static int check_dir_empty(struct ubifs_info *c, struct inode *dir) +{ + struct qstr nm = { .name = NULL }; + struct ubifs_dent_node *dent; + union ubifs_key key; + int err; + + lowest_dent_key(c, &key, dir->i_ino); + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + err = 0; + } else { + kfree(dent); + err = -ENOTEMPTY; + } + return err; +} + +static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = d_inode(dentry); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode and + * changing the parent inode. If budgeting fails, go ahead anyway + * because we have extra space reserved for deletions. + */ + + dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, + inode->i_ino, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + err = check_dir_empty(c, d_inode(dentry)); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + clear_nlink(inode); + drop_nlink(dir); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(dir); + set_nlink(inode, 2); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + mutex_lock(&dir_ui->ui_mutex); + insert_inode_hash(inode); + inc_nlink(inode); + inc_nlink(dir); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) { + ubifs_err(c, "cannot create directory, error %d", err); + goto out_cancel; + } + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(dir); + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + union ubifs_dev_desc *dev = NULL; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, devlen = 0; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(devlen, 8), + .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + if (S_ISBLK(mode) || S_ISCHR(mode)) { + dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!dev) + return -ENOMEM; + devlen = ubifs_encode_dev(dev, rdev); + } + + err = ubifs_budget_space(c, &req); + if (err) { + kfree(dev); + return err; + } + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + kfree(dev); + err = PTR_ERR(inode); + goto out_budg; + } + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_size = ubifs_inode(inode)->ui_size = devlen; + ui = ubifs_inode(inode); + ui->data = dev; + ui->data_len = devlen; + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, len = strlen(symname); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(len, 8), + .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, + symname, dir->i_ino); + + if (len > UBIFS_MAX_INO_DATA) + return -ENAMETOOLONG; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + ui = ubifs_inode(inode); + ui->data = kmalloc(len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_inode; + } + + memcpy(ui->data, symname, len); + ((char *)ui->data)[len] = '\0'; + /* + * The terminating zero byte is not written to the flash media and it + * is put just to make later in-memory string processing simpler. Thus, + * data length is @len, not @len + %1. + */ + ui->data_len = len; + inode->i_size = ubifs_inode(inode)->ui_size = len; + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * lock_3_inodes - a wrapper for locking three UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + * + * This function is used for 'ubifs_rename()' and @inode1 may be the same as + * @inode2 whereas @inode3 may be %NULL. + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + if (inode2 != inode1) + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); + if (inode3) + mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); +} + +/** + * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + */ +static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + if (inode3) + mutex_unlock(&ubifs_inode(inode3)->ui_mutex); + if (inode1 != inode2) + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ubifs_info *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); + int err, release, sync = 0, move = (new_dir != old_dir); + int is_dir = S_ISDIR(old_inode->i_mode); + int unlink = !!new_inode; + int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); + int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, + .dirtied_ino = 3 }; + struct ubifs_budget_req ino_req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct timespec time; + unsigned int uninitialized_var(saved_nlink); + + /* + * Budget request settings: deletion direntry, new direntry, removing + * the old inode, and changing old and new parent directory inodes. + * + * However, this operation also marks the target inode as dirty and + * does not write it, so we allocate budget for the target inode + * separately. + */ + + dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", + old_dentry, old_inode->i_ino, old_dir->i_ino, + new_dentry, new_dir->i_ino); + ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); + ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + if (unlink) + ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + + + if (unlink && is_dir) { + err = check_dir_empty(c, new_inode); + if (err) + return err; + } + + err = ubifs_budget_space(c, &req); + if (err) + return err; + err = ubifs_budget_space(c, &ino_req); + if (err) { + ubifs_release_budget(c, &req); + return err; + } + + lock_3_inodes(old_dir, new_dir, new_inode); + + /* + * Like most other Unix systems, set the @i_ctime for inodes on a + * rename. + */ + time = ubifs_current_time(old_dir); + old_inode->i_ctime = time; + + /* We must adjust parent link count when renaming directories */ + if (is_dir) { + if (move) { + /* + * @old_dir loses a link because we are moving + * @old_inode to a different directory. + */ + drop_nlink(old_dir); + /* + * @new_dir only gains a link if we are not also + * overwriting an existing directory. + */ + if (!unlink) + inc_nlink(new_dir); + } else { + /* + * @old_inode is not moving to a different directory, + * but @old_dir still loses a link if we are + * overwriting an existing directory. + */ + if (unlink) + drop_nlink(old_dir); + } + } + + old_dir->i_size -= old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + old_dir->i_mtime = old_dir->i_ctime = time; + new_dir->i_mtime = new_dir->i_ctime = time; + + /* + * And finally, if we unlinked a direntry which happened to have the + * same name as the moved direntry, we have to decrement @i_nlink of + * the unlinked inode and change its ctime. + */ + if (unlink) { + /* + * Directories cannot have hard-links, so if this is a + * directory, just clear @i_nlink. + */ + saved_nlink = new_inode->i_nlink; + if (is_dir) + clear_nlink(new_inode); + else + drop_nlink(new_inode); + new_inode->i_ctime = time; + } else { + new_dir->i_size += new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + + /* + * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode + * is dirty, because this will be done later on at the end of + * 'ubifs_rename()'. + */ + if (IS_SYNC(old_inode)) { + sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); + if (unlink && IS_SYNC(new_inode)) + sync = 1; + } + err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, + sync); + if (err) + goto out_cancel; + + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &req); + + mutex_lock(&old_inode_ui->ui_mutex); + release = old_inode_ui->dirty; + mark_inode_dirty_sync(old_inode); + mutex_unlock(&old_inode_ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &ino_req); + if (IS_SYNC(old_inode)) + err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + return err; + +out_cancel: + if (unlink) { + set_nlink(new_inode, saved_nlink); + } else { + new_dir->i_size -= new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + old_dir->i_size += old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + if (is_dir) { + if (move) { + inc_nlink(old_dir); + if (!unlink) + drop_nlink(new_dir); + } else { + if (unlink) + inc_nlink(old_dir); + } + } + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &ino_req); + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + loff_t size; + struct inode *inode = d_inode(dentry); + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + generic_fillattr(inode, stat); + stat->blksize = UBIFS_BLOCK_SIZE; + stat->size = ui->ui_size; + + /* + * Unfortunately, the 'stat()' system call was designed for block + * device based file systems, and it is not appropriate for UBIFS, + * because UBIFS does not have notion of "block". For example, it is + * difficult to tell how many block a directory takes - it actually + * takes less than 300 bytes, but we have to round it to block size, + * which introduces large mistake. This makes utilities like 'du' to + * report completely senseless numbers. This is the reason why UBIFS + * goes the same way as JFFS2 - it reports zero blocks for everything + * but regular files, which makes more sense than reporting completely + * wrong sizes. + */ + if (S_ISREG(inode->i_mode)) { + size = ui->xattr_size; + size += stat->size; + size = ALIGN(size, UBIFS_BLOCK_SIZE); + /* + * Note, user-space expects 512-byte blocks count irrespectively + * of what was reported in @stat->size. + */ + stat->blocks = size >> 9; + } else + stat->blocks = 0; + mutex_unlock(&ui->ui_mutex); + return 0; +} + +const struct inode_operations ubifs_dir_inode_operations = { + .lookup = ubifs_lookup, + .create = ubifs_create, + .link = ubifs_link, + .symlink = ubifs_symlink, + .unlink = ubifs_unlink, + .mkdir = ubifs_mkdir, + .rmdir = ubifs_rmdir, + .mknod = ubifs_mknod, + .rename = ubifs_rename, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +}; + +const struct file_operations ubifs_dir_operations = { + .llseek = generic_file_llseek, + .release = ubifs_dir_release, + .read = generic_read_dir, + .iterate = ubifs_readdir, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/kernel/fs/ubifs/file.c b/kernel/fs/ubifs/file.c new file mode 100644 index 000000000..35efc103c --- /dev/null +++ b/kernel/fs/ubifs/file.c @@ -0,0 +1,1594 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements VFS file and inode operations for regular files, device + * nodes and symlinks as well as address space operations. + * + * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if + * the page is dirty and is used for optimization purposes - dirty pages are + * not budgeted so the flag shows that 'ubifs_write_end()' should not release + * the budget for this page. The @PG_checked flag is set if full budgeting is + * required for the page e.g., when it corresponds to a file hole or it is + * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because + * it is OK to fail in this function, and the budget is released in + * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry + * information about how the page was budgeted, to make it possible to release + * the budget properly. + * + * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we + * implement. However, this is not true for 'ubifs_writepage()', which may be + * called with @i_mutex unlocked. For example, when flusher thread is doing + * background write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. + * At "normal" work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. + * in the "sys_write -> alloc_pages -> direct reclaim path". So, in + * 'ubifs_writepage()' we are only guaranteed that the page is locked. + * + * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the + * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> + * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not + * set as well. However, UBIFS disables readahead. + */ + +#include "ubifs.h" +#include +#include +#include + +static int read_block(struct inode *inode, void *addr, unsigned int block, + struct ubifs_data_node *dn) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err, len, out_len; + union ubifs_key key; + unsigned int dlen; + + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_tnc_lookup(c, &key, dn); + if (err) { + if (err == -ENOENT) + /* Not found, so it must be a hole */ + memset(addr, 0, UBIFS_BLOCK_SIZE); + return err; + } + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto dump; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto dump; + + /* + * Data length can be less than a full block, even for blocks that are + * not the last in the file (e.g., as a result of making a hole and + * appending data). Ensure that the remainder is zeroed out. + */ + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + return 0; + +dump: + ubifs_err(c, "bad data node (block %u, inode %lu)", + block, inode->i_ino); + ubifs_dump_node(c, dn); + return -EINVAL; +} + +static int do_readpage(struct page *page) +{ + void *addr; + int err = 0, i; + unsigned int block, beyond; + struct ubifs_data_node *dn; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + ubifs_assert(!PageChecked(page)); + ubifs_assert(!PagePrivate(page)); + + addr = kmap(page); + + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + if (block >= beyond) { + /* Reading beyond inode */ + SetPageChecked(page); + memset(addr, 0, PAGE_CACHE_SIZE); + goto out; + } + + dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); + if (!dn) { + err = -ENOMEM; + goto error; + } + + i = 0; + while (1) { + int ret; + + if (block >= beyond) { + /* Reading beyond inode */ + err = -ENOENT; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else { + ret = read_block(inode, addr, block, dn); + if (ret) { + err = ret; + if (err != -ENOENT) + break; + } else if (block + 1 == beyond) { + int dlen = le32_to_cpu(dn->size); + int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); + + if (ilen && ilen < dlen) + memset(addr + ilen, 0, dlen - ilen); + } + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += UBIFS_BLOCK_SIZE; + } + if (err) { + struct ubifs_info *c = inode->i_sb->s_fs_info; + if (err == -ENOENT) { + /* Not found, so it must be a hole */ + SetPageChecked(page); + dbg_gen("hole"); + goto out_free; + } + ubifs_err(c, "cannot read page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + goto error; + } + +out_free: + kfree(dn); +out: + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + return 0; + +error: + kfree(dn); + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + return err; +} + +/** + * release_new_page_budget - release budget of a new page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of one new page of data. + */ +static void release_new_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 }; + + ubifs_release_budget(c, &req); +} + +/** + * release_existing_page_budget - release budget of an existing page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of changing one one page of data which already exists on the flash media. + */ +static void release_existing_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget}; + + ubifs_release_budget(c, &req); +} + +static int write_begin_slow(struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep, + unsigned flags) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct ubifs_budget_req req = { .new_page = 1 }; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + struct page *page; + + dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", + inode->i_ino, pos, len, inode->i_size); + + /* + * At the slow path we have to budget before locking the page, because + * budgeting may force write-back, which would wait on locked pages and + * deadlock if we had the page locked. At this point we do not know + * anything about the page, so assume that this is a new page which is + * written to a hole. This corresponds to largest budget. Later the + * budget will be amended if this is not true. + */ + if (appending) + /* We are appending data, budget for inode change */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) + return err; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (unlikely(!page)) { + ubifs_release_budget(c, &req); + return -ENOMEM; + } + + if (!PageUptodate(page)) { + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + SetPageChecked(page); + else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + ubifs_release_budget(c, &req); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + if (PagePrivate(page)) + /* + * The page is dirty, which means it was budgeted twice: + * o first time the budget was allocated by the task which + * made the page dirty and set the PG_private flag; + * o and then we budgeted for it for the second time at the + * very beginning of this function. + * + * So what we have to do is to release the page budget we + * allocated. + */ + release_new_page_budget(c); + else if (!PageChecked(page)) + /* + * We are changing a page which already exists on the media. + * This means that changing the page does not make the amount + * of indexing information larger, and this part of the budget + * which we have already acquired may be released. + */ + ubifs_convert_page_budget(c); + + if (appending) { + struct ubifs_inode *ui = ubifs_inode(inode); + + /* + * 'ubifs_write_end()' is optimized from the fast-path part of + * 'ubifs_write_begin()' and expects the @ui_mutex to be locked + * if data is appended. + */ + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The inode is dirty already, so we may free the + * budget we allocated. + */ + ubifs_release_dirty_inode_budget(c, ui); + } + + *pagep = page; + return 0; +} + +/** + * allocate_budget - allocate budget for 'ubifs_write_begin()'. + * @c: UBIFS file-system description object + * @page: page to allocate budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for 'ubifs_write_begin()' which allocates budget + * for the operation. The budget is allocated differently depending on whether + * this is appending, whether the page is dirty or not, and so on. This + * function leaves the @ui->ui_mutex locked in case of appending. Returns zero + * in case of success and %-ENOSPC in case of failure. + */ +static int allocate_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + struct ubifs_budget_req req = { .fast = 1 }; + + if (PagePrivate(page)) { + if (!appending) + /* + * The page is dirty and we are not appending, which + * means no budget is needed at all. + */ + return 0; + + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The page is dirty and we are appending, so the inode + * has to be marked as dirty. However, it is already + * dirty, so we do not need any budget. We may return, + * but @ui->ui_mutex hast to be left locked because we + * should prevent write-back from flushing the inode + * and freeing the budget. The lock will be released in + * 'ubifs_write_end()'. + */ + return 0; + + /* + * The page is dirty, we are appending, the inode is clean, so + * we need to budget the inode change. + */ + req.dirtied_ino = 1; + } else { + if (PageChecked(page)) + /* + * The page corresponds to a hole and does not + * exist on the media. So changing it makes + * make the amount of indexing information + * larger, and we have to budget for a new + * page. + */ + req.new_page = 1; + else + /* + * Not a hole, the change will not add any new + * indexing information, budget for page + * change. + */ + req.dirtied_page = 1; + + if (appending) { + mutex_lock(&ui->ui_mutex); + if (!ui->dirty) + /* + * The inode is clean but we will have to mark + * it as dirty because we are appending. This + * needs a budget. + */ + req.dirtied_ino = 1; + } + } + + return ubifs_budget_space(c, &req); +} + +/* + * This function is called when a page of data is going to be written. Since + * the page of data will not necessarily go to the flash straight away, UBIFS + * has to reserve space on the media for it, which is done by means of + * budgeting. + * + * This is the hot-path of the file-system and we are trying to optimize it as + * much as possible. For this reasons it is split on 2 parts - slow and fast. + * + * There many budgeting cases: + * o a new page is appended - we have to budget for a new page and for + * changing the inode; however, if the inode is already dirty, there is + * no need to budget for it; + * o an existing clean page is changed - we have budget for it; if the page + * does not exist on the media (a hole), we have to budget for a new + * page; otherwise, we may budget for changing an existing page; the + * difference between these cases is that changing an existing page does + * not introduce anything new to the FS indexing information, so it does + * not grow, and smaller budget is acquired in this case; + * o an existing dirty page is changed - no need to budget at all, because + * the page budget has been acquired by earlier, when the page has been + * marked dirty. + * + * UBIFS budgeting sub-system may force write-back if it thinks there is no + * space to reserve. This imposes some locking restrictions and makes it + * impossible to take into account the above cases, and makes it impossible to + * optimize budgeting. + * + * The solution for this is that the fast path of 'ubifs_write_begin()' assumes + * there is a plenty of flash space and the budget will be acquired quickly, + * without forcing write-back. The slow path does not make this assumption. + */ +static int ubifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; + struct page *page; + + ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (unlikely(c->ro_error)) + return -EROFS; + + /* Try out the fast-path part first */ + page = grab_cache_page_write_begin(mapping, index, flags); + if (unlikely(!page)) + return -ENOMEM; + + if (!PageUptodate(page)) { + /* The page is not loaded from the flash */ + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { + /* + * We change whole page so no need to load it. But we + * do not know whether this page exists on the media or + * not, so we assume the latter because it requires + * larger budget. The assumption is that it is better + * to budget a bit more than to read the page from the + * media. Thus, we are setting the @PG_checked flag + * here. + */ + SetPageChecked(page); + skipped_read = 1; + } else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + err = allocate_budget(c, page, ui, appending); + if (unlikely(err)) { + ubifs_assert(err == -ENOSPC); + /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } + /* + * Budgeting failed which means it would have to force + * write-back but didn't, because we set the @fast flag in the + * request. Write-back cannot be done now, while we have the + * page locked, because it would deadlock. Unlock and free + * everything and fall-back to slow-path. + */ + if (appending) { + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + unlock_page(page); + page_cache_release(page); + + return write_begin_slow(mapping, pos, len, pagep, flags); + } + + /* + * Whee, we acquired budgeting quickly - without involving + * garbage-collection, committing or forcing write-back. We return + * with @ui->ui_mutex locked if we are appending pages, and unlocked + * otherwise. This is an optimization (slightly hacky though). + */ + *pagep = page; + return 0; + +} + +/** + * cancel_budget - cancel budget. + * @c: UBIFS file-system description object + * @page: page to cancel budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for a page write operation. It unlocks the + * @ui->ui_mutex in case of appending. + */ +static void cancel_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + if (appending) { + if (!ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + mutex_unlock(&ui->ui_mutex); + } + if (!PagePrivate(page)) { + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + } +} + +static int ubifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + loff_t end_pos = pos + len; + int appending = !!(end_pos > inode->i_size); + + dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", + inode->i_ino, pos, page->index, len, copied, inode->i_size); + + if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { + /* + * VFS copied less data to the page that it intended and + * declared in its '->write_begin()' call via the @len + * argument. If the page was not up-to-date, and @len was + * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did + * not load it from the media (for optimization reasons). This + * means that part of the page contains garbage. So read the + * page now. + */ + dbg_gen("copied %d instead of %d, read page and repeat", + copied, len); + cancel_budget(c, page, ui, appending); + ClearPageChecked(page); + + /* + * Return 0 to force VFS to repeat the whole operation, or the + * error code if 'do_readpage()' fails. + */ + copied = do_readpage(page); + goto out; + } + + if (!PagePrivate(page)) { + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (appending) { + i_size_write(inode, end_pos); + ui->ui_size = end_pos; + /* + * Note, we do not set @I_DIRTY_PAGES (which means that the + * inode has dirty pages), this has been done in + * '__set_page_dirty_nobuffers()'. + */ + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + +out: + unlock_page(page); + page_cache_release(page); + return copied; +} + +/** + * populate_page - copy data nodes into a page for bulk-read. + * @c: UBIFS file-system description object + * @page: page + * @bu: bulk-read information + * @n: next zbranch slot + * + * This function returns %0 on success and a negative error code on failure. + */ +static int populate_page(struct ubifs_info *c, struct page *page, + struct bu_info *bu, int *n) +{ + int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + unsigned int page_block; + void *addr, *zaddr; + pgoff_t end_index; + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + + addr = zaddr = kmap(page); + + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (!i_size || page->index > end_index) { + hole = 1; + memset(addr, 0, PAGE_CACHE_SIZE); + goto out_hole; + } + + page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + while (1) { + int err, len, out_len, dlen; + + if (nn >= bu->cnt) { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { + struct ubifs_data_node *dn; + + dn = bu->buf + (bu->zbranch[nn].offs - offs); + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto out_err; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto out_err; + + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + nn += 1; + read = (i << UBIFS_BLOCK_SHIFT) + len; + } else if (key_block(c, &bu->zbranch[nn].key) < page_block) { + nn += 1; + continue; + } else { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + addr += UBIFS_BLOCK_SIZE; + page_block += 1; + } + + if (end_index == page->index) { + int len = i_size & (PAGE_CACHE_SIZE - 1); + + if (len && len < read) + memset(zaddr + len, 0, read - len); + } + +out_hole: + if (hole) { + SetPageChecked(page); + dbg_gen("hole"); + } + + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + *n = nn; + return 0; + +out_err: + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + ubifs_err(c, "bad data node (block %u, inode %lu)", + page_block, inode->i_ino); + return -EINVAL; +} + +/** + * ubifs_do_bulk_read - do bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read information + * @page1: first page to read + * + * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + */ +static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, + struct page *page1) +{ + pgoff_t offset = page1->index, end_index; + struct address_space *mapping = page1->mapping; + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + int err, page_idx, page_cnt, ret = 0, n = 0; + int allocate = bu->buf ? 0 : 1; + loff_t isize; + + err = ubifs_tnc_get_bu_keys(c, bu); + if (err) + goto out_warn; + + if (bu->eof) { + /* Turn off bulk-read at the end of the file */ + ui->read_in_a_row = 1; + ui->bulk_read = 0; + } + + page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT; + if (!page_cnt) { + /* + * This happens when there are multiple blocks per page and the + * blocks for the first page we are looking for, are not + * together. If all the pages were like this, bulk-read would + * reduce performance, so we turn it off for a while. + */ + goto out_bu_off; + } + + if (bu->cnt) { + if (allocate) { + /* + * Allocate bulk-read buffer depending on how many data + * nodes we are going to read. + */ + bu->buf_len = bu->zbranch[bu->cnt - 1].offs + + bu->zbranch[bu->cnt - 1].len - + bu->zbranch[0].offs; + ubifs_assert(bu->buf_len > 0); + ubifs_assert(bu->buf_len <= c->leb_size); + bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN); + if (!bu->buf) + goto out_bu_off; + } + + err = ubifs_tnc_bulk_read(c, bu); + if (err) + goto out_warn; + } + + err = populate_page(c, page1, bu, &n); + if (err) + goto out_warn; + + unlock_page(page1); + ret = 1; + + isize = i_size_read(inode); + if (isize == 0) + goto out_free; + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + for (page_idx = 1; page_idx < page_cnt; page_idx++) { + pgoff_t page_offset = offset + page_idx; + struct page *page; + + if (page_offset > end_index) + break; + page = find_or_create_page(mapping, page_offset, + GFP_NOFS | __GFP_COLD); + if (!page) + break; + if (!PageUptodate(page)) + err = populate_page(c, page, bu, &n); + unlock_page(page); + page_cache_release(page); + if (err) + break; + } + + ui->last_page_read = offset + page_idx - 1; + +out_free: + if (allocate) + kfree(bu->buf); + return ret; + +out_warn: + ubifs_warn(c, "ignoring error %d and skipping bulk-read", err); + goto out_free; + +out_bu_off: + ui->read_in_a_row = ui->bulk_read = 0; + goto out_free; +} + +/** + * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. + * @page: page from which to start bulk-read. + * + * Some flash media are capable of reading sequentially at faster rates. UBIFS + * bulk-read facility is designed to take advantage of that, by reading in one + * go consecutive data nodes that are also located consecutively in the same + * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + */ +static int ubifs_bulk_read(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = page->index, last_page_read = ui->last_page_read; + struct bu_info *bu; + int err = 0, allocated = 0; + + ui->last_page_read = index; + if (!c->bulk_read) + return 0; + + /* + * Bulk-read is protected by @ui->ui_mutex, but it is an optimization, + * so don't bother if we cannot lock the mutex. + */ + if (!mutex_trylock(&ui->ui_mutex)) + return 0; + + if (index != last_page_read + 1) { + /* Turn off bulk-read if we stop reading sequentially */ + ui->read_in_a_row = 1; + if (ui->bulk_read) + ui->bulk_read = 0; + goto out_unlock; + } + + if (!ui->bulk_read) { + ui->read_in_a_row += 1; + if (ui->read_in_a_row < 3) + goto out_unlock; + /* Three reads in a row, so switch on bulk-read */ + ui->bulk_read = 1; + } + + /* + * If possible, try to use pre-allocated bulk-read information, which + * is protected by @c->bu_mutex. + */ + if (mutex_trylock(&c->bu_mutex)) + bu = &c->bu; + else { + bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN); + if (!bu) + goto out_unlock; + + bu->buf = NULL; + allocated = 1; + } + + bu->buf_len = c->max_bu_buf_len; + data_key_init(c, &bu->key, inode->i_ino, + page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, page); + + if (!allocated) + mutex_unlock(&c->bu_mutex); + else + kfree(bu); + +out_unlock: + mutex_unlock(&ui->ui_mutex); + return err; +} + +static int ubifs_readpage(struct file *file, struct page *page) +{ + if (ubifs_bulk_read(page)) + return 0; + do_readpage(page); + unlock_page(page); + return 0; +} + +static int do_writepage(struct page *page, int len) +{ + int err = 0, i, blen; + unsigned int block; + void *addr; + union ubifs_key key; + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + +#ifdef UBIFS_DEBUG + struct ubifs_inode *ui = ubifs_inode(inode); + spin_lock(&ui->ui_lock); + ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT); + spin_unlock(&ui->ui_lock); +#endif + + /* Update radix tree tags */ + set_page_writeback(page); + + addr = kmap(page); + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + i = 0; + while (len) { + blen = min_t(int, len, UBIFS_BLOCK_SIZE); + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_jnl_write_data(c, inode, &key, addr, blen); + if (err) + break; + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += blen; + len -= blen; + } + if (err) { + SetPageError(page); + ubifs_err(c, "cannot write page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + ubifs_ro_mode(c, err); + } + + ubifs_assert(PagePrivate(page)); + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); + + kunmap(page); + unlock_page(page); + end_page_writeback(page); + return err; +} + +/* + * When writing-back dirty inodes, VFS first writes-back pages belonging to the + * inode, then the inode itself. For UBIFS this may cause a problem. Consider a + * situation when a we have an inode with size 0, then a megabyte of data is + * appended to the inode, then write-back starts and flushes some amount of the + * dirty pages, the journal becomes full, commit happens and finishes, and then + * an unclean reboot happens. When the file system is mounted next time, the + * inode size would still be 0, but there would be many pages which are beyond + * the inode size, they would be indexed and consume flash space. Because the + * journal has been committed, the replay would not be able to detect this + * situation and correct the inode size. This means UBIFS would have to scan + * whole index and correct all inode sizes, which is long an unacceptable. + * + * To prevent situations like this, UBIFS writes pages back only if they are + * within the last synchronized inode size, i.e. the size which has been + * written to the flash media last time. Otherwise, UBIFS forces inode + * write-back, thus making sure the on-flash inode contains current inode size, + * and then keeps writing pages back. + * + * Some locking issues explanation. 'ubifs_writepage()' first is called with + * the page locked, and it locks @ui_mutex. However, write-back does take inode + * @i_mutex, which means other VFS operations may be run on this inode at the + * same time. And the problematic one is truncation to smaller size, from where + * we have to call 'truncate_setsize()', which first changes @inode->i_size, + * then drops the truncated pages. And while dropping the pages, it takes the + * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' + * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. + * This means that @inode->i_size is changed while @ui_mutex is unlocked. + * + * XXX(truncate): with the new truncate sequence this is not true anymore, + * and the calls to truncate_setsize can be move around freely. They should + * be moved to the very end of the truncate sequence. + * + * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond + * inode size. How do we do this if @inode->i_size may became smaller while we + * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the + * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size + * internally and updates it under @ui_mutex. + * + * Q: why we do not worry that if we race with truncation, we may end up with a + * situation when the inode is truncated while we are in the middle of + * 'do_writepage()', so we do write beyond inode size? + * A: If we are in the middle of 'do_writepage()', truncation would be locked + * on the page lock and it would not write the truncated inode node to the + * journal before we have finished. + */ +static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + loff_t i_size = i_size_read(inode), synced_i_size; + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + int err, len = i_size & (PAGE_CACHE_SIZE - 1); + void *kaddr; + + dbg_gen("ino %lu, pg %lu, pg flags %#lx", + inode->i_ino, page->index, page->flags); + ubifs_assert(PagePrivate(page)); + + /* Is the page fully outside @i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !len)) { + err = 0; + goto out_unlock; + } + + spin_lock(&ui->ui_lock); + synced_i_size = ui->synced_i_size; + spin_unlock(&ui->ui_lock); + + /* Is the page fully inside @i_size? */ + if (page->index < end_index) { + if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + goto out_unlock; + /* + * The inode has been written, but the write-buffer has + * not been synchronized, so in case of an unclean + * reboot we may end up with some pages beyond inode + * size, but they would be in the journal (because + * commit flushes write buffers) and recovery would deal + * with this. + */ + } + return do_writepage(page, PAGE_CACHE_SIZE); + } + + /* + * The page straddles @i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + kaddr = kmap_atomic(page); + memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); + flush_dcache_page(page); + kunmap_atomic(kaddr); + + if (i_size > synced_i_size) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + goto out_unlock; + } + + return do_writepage(page, len); + +out_unlock: + unlock_page(page); + return err; +} + +/** + * do_attr_changes - change inode attributes. + * @inode: inode to change attributes for + * @attr: describes attributes to change + */ +static void do_attr_changes(struct inode *inode, const struct iattr *attr) +{ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (attr->ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} + +/** + * do_truncation - truncate an inode. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call when the inode is truncated + * to a smaller size. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int do_truncation(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err; + struct ubifs_budget_req req; + loff_t old_size = inode->i_size, new_size = attr->ia_size; + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); + memset(&req, 0, sizeof(struct ubifs_budget_req)); + + /* + * If this is truncation to a smaller size, and we do not truncate on a + * block boundary, budget for changing one data block, because the last + * block will be re-written. + */ + if (new_size & (UBIFS_BLOCK_SIZE - 1)) + req.dirtied_page = 1; + + req.dirtied_ino = 1; + /* A funny way to budget for truncation node */ + req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; + err = ubifs_budget_space(c, &req); + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } + + truncate_setsize(inode, new_size); + + if (offset) { + pgoff_t index = new_size >> PAGE_CACHE_SHIFT; + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + if (PageDirty(page)) { + /* + * 'ubifs_jnl_truncate()' will try to truncate + * the last data node, but it contains + * out-of-date data because the page is dirty. + * Write the page now, so that + * 'ubifs_jnl_truncate()' will see an already + * truncated (and up to date) data node. + */ + ubifs_assert(PagePrivate(page)); + + clear_page_dirty_for_io(page); + if (UBIFS_BLOCKS_PER_PAGE_SHIFT) + offset = new_size & + (PAGE_CACHE_SIZE - 1); + err = do_writepage(page, offset); + page_cache_release(page); + if (err) + goto out_budg; + /* + * We could now tell 'ubifs_jnl_truncate()' not + * to read the last block. + */ + } else { + /* + * We could 'kmap()' the page and pass the data + * to 'ubifs_jnl_truncate()' to save it from + * having to read it. + */ + unlock_page(page); + page_cache_release(page); + } + } + } + + mutex_lock(&ui->ui_mutex); + ui->ui_size = inode->i_size; + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* Other attributes may be changed at the same time as well */ + do_attr_changes(inode, attr); + err = ubifs_jnl_truncate(c, inode, old_size, new_size); + mutex_unlock(&ui->ui_mutex); + +out_budg: + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return err; +} + +/** + * do_setattr - change inode attributes. + * @c: UBIFS file-system description object + * @inode: inode to change attributes for + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call for all cases except + * truncations to smaller size. Returns zero in case of success and a negative + * error code in case of failure. + */ +static int do_setattr(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err, release; + loff_t new_size = attr->ia_size; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + dbg_gen("size %lld -> %lld", inode->i_size, new_size); + truncate_setsize(inode, new_size); + } + + mutex_lock(&ui->ui_mutex); + if (attr->ia_valid & ATTR_SIZE) { + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* 'truncate_setsize()' changed @i_size, update @ui_size */ + ui->ui_size = inode->i_size; + } + + do_attr_changes(inode, attr); + + release = ui->dirty; + if (attr->ia_valid & ATTR_SIZE) + /* + * Inode length changed, so we have to make sure + * @I_DIRTY_DATASYNC is set. + */ + __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + else + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = inode->i_sb->s_op->write_inode(inode, NULL); + return err; +} + +int ubifs_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct inode *inode = d_inode(dentry); + struct ubifs_info *c = inode->i_sb->s_fs_info; + + dbg_gen("ino %lu, mode %#x, ia_valid %#x", + inode->i_ino, inode->i_mode, attr->ia_valid); + err = inode_change_ok(inode, attr); + if (err) + return err; + + err = dbg_check_synced_i_size(c, inode); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) + /* Truncation to a smaller size */ + err = do_truncation(c, inode, attr); + else + err = do_setattr(c, inode, attr); + + return err; +} + +static void ubifs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + ubifs_assert(PagePrivate(page)); + if (offset || length < PAGE_CACHE_SIZE) + /* Partial page remains dirty */ + return; + + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); +} + +static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ubifs_inode *ui = ubifs_inode(d_inode(dentry)); + + nd_set_link(nd, ui->data); + return NULL; +} + +int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + + dbg_gen("syncing inode %lu", inode->i_ino); + + if (c->ro_mount) + /* + * For some really strange reasons VFS does not filter out + * 'fsync()' for R/O mounted file-systems as per 2.6.39. + */ + return 0; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + mutex_lock(&inode->i_mutex); + + /* Synchronize the inode unless this is a 'datasync()' call. */ + if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + goto out; + } + + /* + * Nodes related to this inode may still sit in a write-buffer. Flush + * them. + */ + err = ubifs_sync_wbufs_by_inode(c, inode); +out: + mutex_unlock(&inode->i_mutex); + return err; +} + +/** + * mctime_update_needed - check if mtime or ctime update is needed. + * @inode: the inode to do the check for + * @now: current time + * + * This helper function checks if the inode mtime/ctime should be updated or + * not. If current values of the time-stamps are within the UBIFS inode time + * granularity, they are not updated. This is an optimization. + */ +static inline int mctime_update_needed(const struct inode *inode, + const struct timespec *now) +{ + if (!timespec_equal(&inode->i_mtime, now) || + !timespec_equal(&inode->i_ctime, now)) + return 1; + return 0; +} + +/** + * update_ctime - update mtime and ctime of an inode. + * @inode: inode to update + * + * This function updates mtime and ctime of the inode if it is not equivalent to + * current time. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int update_mctime(struct inode *inode) +{ + struct timespec now = ubifs_current_time(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + + if (mctime_update_needed(inode, &now)) { + int err, release; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_budget(c, &req); + } + + return 0; +} + +static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + int err = update_mctime(file_inode(iocb->ki_filp)); + if (err) + return err; + + return generic_file_write_iter(iocb, from); +} + +static int ubifs_set_page_dirty(struct page *page) +{ + int ret; + + ret = __set_page_dirty_nobuffers(page); + /* + * An attempt to dirty a page without budgeting for it - should not + * happen. + */ + ubifs_assert(ret == 0); + return ret; +} + +static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + /* + * An attempt to release a dirty page without budgeting for it - should + * not happen. + */ + if (PageWriteback(page)) + return 0; + ubifs_assert(PagePrivate(page)); + ubifs_assert(0); + ClearPagePrivate(page); + ClearPageChecked(page); + return 1; +} + +/* + * mmap()d file has taken write protection fault and is being made writable. + * UBIFS must ensure page is budgeted for. + */ +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct timespec now = ubifs_current_time(inode); + struct ubifs_budget_req req = { .new_page = 1 }; + int err, update_time; + + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + i_size_read(inode)); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (unlikely(c->ro_error)) + return VM_FAULT_SIGBUS; /* -EROFS */ + + /* + * We have not locked @page so far so we may budget for changing the + * page. Note, we cannot do this after we locked the page, because + * budgeting may cause write-back which would cause deadlock. + * + * At the moment we do not know whether the page is dirty or not, so we + * assume that it is not and budget for a new page. We could look at + * the @PG_private flag and figure this out, but we may race with write + * back and the page state may change by the time we lock it, so this + * would need additional care. We do not bother with this at the + * moment, although it might be good idea to do. Instead, we allocate + * budget for a new page and amend it later on if the page was in fact + * dirty. + * + * The budgeting-related logic of this function is similar to what we + * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there + * for more comments. + */ + update_time = mctime_update_needed(inode, &now); + if (update_time) + /* + * We have to change inode time stamp which requires extra + * budgeting. + */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) { + if (err == -ENOSPC) + ubifs_warn(c, "out of space for mmapped file (inode number %lu)", + inode->i_ino); + return VM_FAULT_SIGBUS; + } + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode))) { + /* Page got truncated out from underneath us */ + err = -EINVAL; + goto out_unlock; + } + + if (PagePrivate(page)) + release_new_page_budget(c); + else { + if (!PageChecked(page)) + ubifs_convert_page_budget(c); + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (update_time) { + int release; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_dirty_inode_budget(c, ui); + } + + wait_for_stable_page(page); + return VM_FAULT_LOCKED; + +out_unlock: + unlock_page(page); + ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; + return err; +} + +static const struct vm_operations_struct ubifs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = ubifs_vm_page_mkwrite, +}; + +static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err; + + err = generic_file_mmap(file, vma); + if (err) + return err; + vma->vm_ops = &ubifs_file_vm_ops; + return 0; +} + +const struct address_space_operations ubifs_file_address_operations = { + .readpage = ubifs_readpage, + .writepage = ubifs_writepage, + .write_begin = ubifs_write_begin, + .write_end = ubifs_write_end, + .invalidatepage = ubifs_invalidatepage, + .set_page_dirty = ubifs_set_page_dirty, + .releasepage = ubifs_releasepage, +}; + +const struct inode_operations ubifs_file_inode_operations = { + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +}; + +const struct inode_operations ubifs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ubifs_follow_link, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +}; + +const struct file_operations ubifs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = ubifs_write_iter, + .mmap = ubifs_file_mmap, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/kernel/fs/ubifs/find.c b/kernel/fs/ubifs/find.c new file mode 100644 index 000000000..2dcf3d473 --- /dev/null +++ b/kernel/fs/ubifs/find.c @@ -0,0 +1,985 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains functions for finding LEBs for various purposes e.g. + * garbage collection. In general, lprops category heaps and lists are used + * for fast access, falling back on scanning the LPT as a last resort. + */ + +#include +#include "ubifs.h" + +/** + * struct scan_data - data provided to scan callback functions + * @min_space: minimum number of bytes for which to scan + * @pick_free: whether it is OK to scan for empty LEBs + * @lnum: LEB number found is returned here + * @exclude_index: whether to exclude index LEBs + */ +struct scan_data { + int min_space; + int pick_free; + int lnum; + int exclude_index; +}; + +/** + * valuable - determine whether LEB properties are valuable. + * @c: the UBIFS file-system description object + * @lprops: LEB properties + * + * This function return %1 if the LEB properties should be added to the LEB + * properties tree in memory. Otherwise %0 is returned. + */ +static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) +{ + int n, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (heap->cnt < heap->max_cnt) + return 1; + if (lprops->free + lprops->dirty >= c->dark_wm) + return 1; + return 0; + case LPROPS_EMPTY: + n = c->lst.empty_lebs + c->freeable_cnt - + c->lst.taken_empty_lebs; + if (n < c->lsave_cnt) + return 1; + return 0; + case LPROPS_FREEABLE: + return 1; + case LPROPS_FRDI_IDX: + return 1; + } + return 0; +} + +/** + * scan_for_dirty_cb - dirty space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_dirty_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < data->min_space) + return ret; + /* If specified, exclude index LEBs */ + if (data->exclude_index && lprops->flags & LPROPS_INDEX) + return ret; + /* If specified, exclude empty or freeable LEBs */ + if (lprops->free + lprops->dirty == c->leb_size) { + if (!data->pick_free) + return ret; + /* Exclude LEBs with too little dirty space (unless it is empty) */ + } else if (lprops->dirty < c->dead_wm) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_dirty - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: if it is OK to return a free or freeable LEB + * @exclude_index: whether to exclude index LEBs + * + * This function returns a pointer to the LEB properties found or a negative + * error code. + */ +static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, + int min_space, int pick_free, + int exclude_index) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + /* There may be an LEB with enough dirty space on the free heap */ + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free + lprops->dirty < min_space) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the dirty heap, and ended + * up as uncategorized even though it has enough dirty space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->free + lprops->dirty < min_space) + continue; + if (exclude_index && (lprops->flags & LPROPS_INDEX)) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + data.exclude_index = exclude_index; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_dirty_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= min_space); + ubifs_assert(lprops->dirty >= c->dead_wm || + (pick_free && + lprops->free + lprops->dirty == c->leb_size)); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector. + * @c: the UBIFS file-system description object + * @ret_lp: LEB properties are returned here on exit + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: controls whether it is OK to pick empty or index LEBs + * + * This function tries to find a dirty logical eraseblock which has at least + * @min_space free and dirty space. It prefers to take an LEB from the dirty or + * dirty index heap, and it falls-back to LPT scanning if the heaps are empty + * or do not have an LEB which satisfies the @min_space criteria. + * + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. + * + * The additional @pick_free argument controls if this function has to return a + * free or freeable LEB if one is present. For example, GC must to set it to %1, + * when called from the journal space reservation function, because the + * appearance of free space may coincide with the loss of enough dirty space + * for GC to succeed anyway. + * + * In contrast, if the Garbage Collector is called from budgeting, it should + * just make free space, not return LEBs which are already free or freeable. + * + * In addition @pick_free is set to %2 by the recovery process in order to + * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". + */ +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free) +{ + int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0; + const struct ubifs_lprops *lp = NULL, *idx_lp = NULL; + struct ubifs_lpt_heap *heap, *idx_heap; + + ubifs_get_lprops(c); + + if (pick_free) { + int lebs, rsvd_idx_lebs = 0; + + spin_lock(&c->space_lock); + lebs = c->lst.empty_lebs + c->idx_gc_cnt; + lebs += c->freeable_cnt - c->lst.taken_empty_lebs; + + /* + * Note, the index may consume more LEBs than have been reserved + * for it. It is OK because it might be consolidated by GC. + * But if the index takes fewer LEBs than it is reserved for it, + * this function must avoid picking those reserved LEBs. + */ + if (c->bi.min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + exclude_index = 1; + } + spin_unlock(&c->space_lock); + + /* Check if there are enough free LEBs for the index */ + if (rsvd_idx_lebs < lebs) { + /* OK, try to find an empty LEB */ + lp = ubifs_fast_find_empty(c); + if (lp) + goto found; + + /* Or a freeable LEB */ + lp = ubifs_fast_find_freeable(c); + if (lp) + goto found; + } else + /* + * We cannot pick free/freeable LEBs in the below code. + */ + pick_free = 0; + } else { + spin_lock(&c->space_lock); + exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs); + spin_unlock(&c->space_lock); + } + + /* Look on the dirty and dirty index heaps */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + + if (idx_heap->cnt && !exclude_index) { + idx_lp = idx_heap->arr[0]; + sum = idx_lp->free + idx_lp->dirty; + /* + * Since we reserve thrice as much space for the index than it + * actually takes, it does not make sense to pick indexing LEBs + * with less than, say, half LEB of dirty space. May be half is + * not the optimal boundary - this should be tested and + * checked. This boundary should determine how much we use + * in-the-gaps to consolidate the index comparing to how much + * we use garbage collector to consolidate it. The "half" + * criteria just feels to be fine. + */ + if (sum < min_space || sum < c->half_leb_size) + idx_lp = NULL; + } + + if (heap->cnt) { + lp = heap->arr[0]; + if (lp->dirty + lp->free < min_space) + lp = NULL; + } + + /* Pick the LEB with most space */ + if (idx_lp && lp) { + if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty) + lp = idx_lp; + } else if (idx_lp && !lp) + lp = idx_lp; + + if (lp) { + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); + goto found; + } + + /* Did not find a dirty LEB on the dirty heaps, have to scan */ + dbg_find("scanning LPT for a dirty LEB"); + lp = scan_for_dirty(c, min_space, pick_free, exclude_index); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(lp->dirty >= c->dead_wm || + (pick_free && lp->free + lp->dirty == c->leb_size)); + +found: + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lp->lnum, lp->free, lp->dirty, lp->flags); + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + memcpy(ret_lp, lp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_free_cb - free space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_free_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBs */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free < data->min_space) + return ret; + /* If specified, exclude empty LEBs */ + if (!data->pick_free && lprops->free == c->leb_size) + return ret; + /* + * LEBs that have only free and dirty space must not be allocated + * because they may have been unmapped already or they may have data + * that is obsolete only because of nodes that are still sitting in a + * wbuf. + */ + if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * do_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of free space required + * @pick_free: whether it is OK to scan for empty LEBs + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function returns a pointer to the LEB properties found or a negative + * error code. + */ +static +const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, + int min_space, int pick_free, + int squeeze) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + if (squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + if (pick_free) { + lprops = ubifs_fast_find_empty(c); + if (lprops) + return lprops; + } + if (!squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + /* There may be an LEB with enough free space on the dirty heap */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free >= min_space) + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the free heap, and ended + * up as uncategorized even though it has enough free space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->flags & LPROPS_INDEX) + continue; + if (lprops->free >= min_space) + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_free_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free >= min_space); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of required free space + * @offs: contains offset of where free space starts on exit + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function looks for an LEB with at least @min_space bytes of free space. + * It tries to find an empty LEB if possible. If no empty LEBs are available, + * this function searches for a non-empty data LEB. The returned LEB is marked + * as "taken". + * + * This function returns found LEB number in case of success, %-ENOSPC if it + * failed to find a LEB with @min_space bytes of free space and other a negative + * error codes in case of failure. + */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, + int squeeze) +{ + const struct ubifs_lprops *lprops; + int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags; + + dbg_find("min_space %d", min_space); + ubifs_get_lprops(c); + + /* Check if there are enough empty LEBs for commit */ + spin_lock(&c->space_lock); + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (rsvd_idx_lebs < lebs) + /* + * OK to allocate an empty LEB, but we still don't want to go + * looking for one if there aren't any. + */ + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + pick_free = 1; + /* + * Because we release the space lock, we must account + * for this allocation here. After the LEB properties + * flags have been updated, we subtract one. Note, the + * result of this is that lprops also decreases + * @taken_empty_lebs in 'ubifs_change_lp()', so it is + * off by one for a short period of time which may + * introduce a small disturbance to budgeting + * calculations, but this is harmless because at the + * worst case this would make the budgeting subsystem + * be more pessimistic than needed. + * + * Fundamentally, this is about serialization of the + * budgeting and lprops subsystems. We could make the + * @space_lock a mutex and avoid dropping it before + * calling 'ubifs_change_lp()', but mutex is more + * heavy-weight, and we want budgeting to be as fast as + * possible. + */ + c->lst.taken_empty_lebs += 1; + } + spin_unlock(&c->space_lock); + + lprops = do_find_free_space(c, min_space, pick_free, squeeze); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + lnum = lprops->lnum; + flags = lprops->flags | LPROPS_TAKEN; + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + + *offs = c->leb_size - lprops->free; + ubifs_release_lprops(c); + + if (*offs == 0) { + /* + * Ensure that empty LEBs have been unmapped. They may not have + * been, for example, because of an unclean unmount. Also + * LEBs that were freeable LEBs (free + dirty == leb_size) will + * not have been unmapped. + */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); + ubifs_assert(*offs <= c->leb_size - min_space); + return lnum; + +out: + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_idx_cb - callback used by the scan for a free LEB for the index. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBS */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs that cannot be made empty */ + if (lprops->free + lprops->dirty != c->leb_size) + return ret; + /* + * We are allocating for the index so it is safe to allocate LEBs with + * only free and dirty space, because write buffers are sync'd at commit + * start. + */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_leb_for_idx - scan for a free LEB for the index. + * @c: the UBIFS file-system description object + */ +static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct scan_data data; + int err; + + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_idx_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_leb_for_idx - find a free LEB for the index. + * @c: the UBIFS file-system description object + * + * This function looks for a free LEB and returns that LEB number. The returned + * LEB is marked as "taken", "index". + * + * Only empty LEBs are allocated. This is for two reasons. First, the commit + * calculates the number of LEBs to allocate based on the assumption that they + * will be empty. Secondly, free space at the end of an index LEB is not + * guaranteed to be empty because it may have been used by the in-the-gaps + * method prior to an unclean unmount. + * + * If no LEB is found %-ENOSPC is returned. For other failures another negative + * error code is returned. + */ +int ubifs_find_free_leb_for_idx(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + int lnum = -1, err, flags; + + ubifs_get_lprops(c); + + lprops = ubifs_fast_find_empty(c); + if (!lprops) { + lprops = ubifs_fast_find_freeable(c); + if (!lprops) { + /* + * The first condition means the following: go scan the + * LPT if there are uncategorized lprops, which means + * there may be freeable LEBs there (UBIFS does not + * store the information about freeable LEBs in the + * master node). + */ + if (c->in_a_category_cnt != c->main_lebs || + c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + ubifs_assert(c->freeable_cnt == 0); + lprops = scan_for_leb_for_idx(c); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + } + } + } + + if (!lprops) { + err = -ENOSPC; + goto out; + } + + lnum = lprops->lnum; + + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lnum, lprops->free, lprops->dirty, lprops->flags); + + flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX; + lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + ubifs_release_lprops(c); + + /* + * Ensure that empty LEBs have been unmapped. They may not have been, + * for example, because of an unclean unmount. Also LEBs that were + * freeable LEBs (free + dirty == leb_size) will not have been unmapped. + */ + err = ubifs_leb_unmap(c, lnum); + if (err) { + ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN | LPROPS_INDEX, 0); + return err; + } + + return lnum; + +out: + ubifs_release_lprops(c); + return err; +} + +static int cmp_dirty_idx(const struct ubifs_lprops **a, + const struct ubifs_lprops **b) +{ + const struct ubifs_lprops *lpa = *a; + const struct ubifs_lprops *lpb = *b; + + return lpa->dirty + lpa->free - lpb->dirty - lpb->free; +} + +static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b, + int size) +{ + struct ubifs_lprops *t = *a; + + *a = *b; + *b = t; +} + +/** + * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos. + * @c: the UBIFS file-system description object + * + * This function is called each commit to create an array of LEB numbers of + * dirty index LEBs sorted in order of dirty and free space. This is used by + * the in-the-gaps method of TNC commit. + */ +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) +{ + int i; + + ubifs_get_lprops(c); + /* Copy the LPROPS_DIRTY_IDX heap */ + c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt; + memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr, + sizeof(void *) * c->dirty_idx.cnt); + /* Sort it so that the dirtiest is now at the end */ + sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), + (int (*)(const void *, const void *))cmp_dirty_idx, + (void (*)(void *, void *, int))swap_dirty_idx); + dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); + if (c->dirty_idx.cnt) + dbg_find("dirtiest index LEB is %d with dirty %d and free %d", + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free); + /* Replace the lprops pointers with LEB numbers */ + for (i = 0; i < c->dirty_idx.cnt; i++) + c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum; + ubifs_release_lprops(c); + return 0; +} + +/** + * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_dirty_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude non-index LEBs */ + if (!(lprops->flags & LPROPS_INDEX)) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < c->min_idx_node_sz) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * find_dirty_idx_leb - find a dirty index LEB. + * @c: the UBIFS file-system description object + * + * This function returns LEB number upon success and a negative error code upon + * failure. In particular, -ENOSPC is returned if a dirty index LEB is not + * found. + * + * Note that this function scans the entire LPT but it is called very rarely. + */ +static int find_dirty_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i, ret; + + /* Check all structures in memory first */ + data.lnum = -1; + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->uncat_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return -ENOSPC; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_dirty_idx_cb, + &data); + if (err) + return err; +found: + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + + dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x", + lprops->lnum, lprops->free, lprops->dirty, lprops->flags); + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, + lprops->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + + return lprops->lnum; +} + +/** + * get_idx_gc_leb - try to get a LEB number from trivial GC. + * @c: the UBIFS file-system description object + */ +static int get_idx_gc_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, lnum; + + err = ubifs_get_idx_gc_leb(c); + if (err < 0) + return err; + lnum = err; + /* + * The LEB was due to be unmapped after the commit but + * it is needed now for this commit. + */ + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) + return PTR_ERR(lp); + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_INDEX, -1); + if (IS_ERR(lp)) + return PTR_ERR(lp); + dbg_find("LEB %d, dirty %d and free %d flags %#x", + lp->lnum, lp->dirty, lp->free, lp->flags); + return lnum; +} + +/** + * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array. + * @c: the UBIFS file-system description object + */ +static int find_dirtiest_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int lnum; + + while (1) { + if (!c->dirty_idx.cnt) + return -ENOSPC; + /* The lprops pointers were replaced by LEB numbers */ + lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt]; + lp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lp)) + return PTR_ERR(lp); + if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX)) + continue; + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) + return PTR_ERR(lp); + break; + } + dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, + lp->free, lp->flags); + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(lp->flags & LPROPS_INDEX); + return lnum; +} + +/** + * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit. + * @c: the UBIFS file-system description object + * + * This function attempts to find an untaken index LEB with the most free and + * dirty space that can be used without overwriting index nodes that were in the + * last index committed. + */ +int ubifs_find_dirty_idx_leb(struct ubifs_info *c) +{ + int err; + + ubifs_get_lprops(c); + + /* + * We made an array of the dirtiest index LEB numbers as at the start of + * last commit. Try that array first. + */ + err = find_dirtiest_idx_leb(c); + + /* Next try scanning the entire LPT */ + if (err == -ENOSPC) + err = find_dirty_idx_leb(c); + + /* Finally take any index LEBs awaiting trivial GC */ + if (err == -ENOSPC) + err = get_idx_gc_leb(c); + + ubifs_release_lprops(c); + return err; +} diff --git a/kernel/fs/ubifs/gc.c b/kernel/fs/ubifs/gc.c new file mode 100644 index 000000000..9718da86a --- /dev/null +++ b/kernel/fs/ubifs/gc.c @@ -0,0 +1,984 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements garbage collection. The procedure for garbage collection + * is different depending on whether a LEB as an index LEB (contains index + * nodes) or not. For non-index LEBs, garbage collection finds a LEB which + * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete + * nodes to the journal, at which point the garbage-collected LEB is free to be + * reused. For index LEBs, garbage collection marks the non-obsolete index nodes + * dirty in the TNC, and after the next commit, the garbage-collected LEB is + * to be reused. Garbage collection will cause the number of dirty index nodes + * to grow, however sufficient space is reserved for the index to ensure the + * commit will never run out of space. + * + * Notes about dead watermark. At current UBIFS implementation we assume that + * LEBs which have less than @c->dead_wm bytes of free + dirty space are full + * and not worth garbage-collecting. The dead watermark is one min. I/O unit + * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS + * Garbage Collector has to synchronize the GC head's write buffer before + * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can + * actually reclaim even very small pieces of dirty space by garbage collecting + * enough dirty LEBs, but we do not bother doing this at this implementation. + * + * Notes about dark watermark. The results of GC work depends on how big are + * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed, + * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would + * have to waste large pieces of free space at the end of LEB B, because nodes + * from LEB A would not fit. And the worst situation is when all nodes are of + * maximum size. So dark watermark is the amount of free + dirty space in LEB + * which are guaranteed to be reclaimable. If LEB has less space, the GC might + * be unable to reclaim it. So, LEBs with free + dirty greater than dark + * watermark are "good" LEBs from GC's point of few. The other LEBs are not so + * good, and GC takes extra care when moving them. + */ + +#include +#include +#include +#include "ubifs.h" + +/* + * GC may need to move more than one LEB to make progress. The below constants + * define "soft" and "hard" limits on the number of LEBs the garbage collector + * may move. + */ +#define SOFT_LEBS_LIMIT 4 +#define HARD_LEBS_LIMIT 32 + +/** + * switch_gc_head - switch the garbage collection journal head. + * @c: UBIFS file-system description object + * @buf: buffer to write + * @len: length of the buffer to write + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * + * This function switch the GC head to the next LEB which is reserved in + * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required, + * and other negative error code in case of failures. + */ +static int switch_gc_head(struct ubifs_info *c) +{ + int err, gc_lnum = c->gc_lnum; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert(gc_lnum != -1); + dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)", + wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum, + c->leb_size - wbuf->offs - wbuf->used); + + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + return err; + + /* + * The GC write-buffer was synchronized, we may safely unmap + * 'c->gc_lnum'. + */ + err = ubifs_leb_unmap(c, gc_lnum); + if (err) + return err; + + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + return err; + + err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); + if (err) + return err; + + c->gc_lnum = -1; + err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0); + return err; +} + +/** + * data_nodes_cmp - compare 2 data nodes. + * @priv: UBIFS file-system description object + * @a: first data node + * @a: second data node + * + * This function compares data nodes @a and @b. Returns %1 if @a has greater + * inode or block number, and %-1 otherwise. + */ +static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + if (a == b) + return 0; + + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + + ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); + ubifs_assert(sa->type == UBIFS_DATA_NODE); + ubifs_assert(sb->type == UBIFS_DATA_NODE); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + unsigned int blka = key_block(c, &sa->key); + unsigned int blkb = key_block(c, &sb->key); + + if (blka <= blkb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/* + * nondata_nodes_cmp - compare 2 non-data nodes. + * @priv: UBIFS file-system description object + * @a: first node + * @a: second node + * + * This function compares nodes @a and @b. It makes sure that inode nodes go + * first and sorted by length in descending order. Directory entry nodes go + * after inode nodes and are sorted in ascending hash valuer order. + */ +static int nondata_nodes_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + if (a == b) + return 0; + + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + + ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY && + key_type(c, &sb->key) != UBIFS_DATA_KEY); + ubifs_assert(sa->type != UBIFS_DATA_NODE && + sb->type != UBIFS_DATA_NODE); + + /* Inodes go before directory entries */ + if (sa->type == UBIFS_INO_NODE) { + if (sb->type == UBIFS_INO_NODE) + return sb->len - sa->len; + return -1; + } + if (sb->type == UBIFS_INO_NODE) + return 1; + + ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY || + key_type(c, &sa->key) == UBIFS_XENT_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY || + key_type(c, &sb->key) == UBIFS_XENT_KEY); + ubifs_assert(sa->type == UBIFS_DENT_NODE || + sa->type == UBIFS_XENT_NODE); + ubifs_assert(sb->type == UBIFS_DENT_NODE || + sb->type == UBIFS_XENT_NODE); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + uint32_t hasha = key_hash(c, &sa->key); + uint32_t hashb = key_hash(c, &sb->key); + + if (hasha <= hashb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/** + * sort_nodes - sort nodes for GC. + * @c: UBIFS file-system description object + * @sleb: describes nodes to sort and contains the result on exit + * @nondata: contains non-data nodes on exit + * @min: minimum node size is returned here + * + * This function sorts the list of inodes to garbage collect. First of all, it + * kills obsolete nodes and separates data and non-data nodes to the + * @sleb->nodes and @nondata lists correspondingly. + * + * Data nodes are then sorted in block number order - this is important for + * bulk-read; data nodes with lower inode number go before data nodes with + * higher inode number, and data nodes with lower block number go before data + * nodes with higher block number; + * + * Non-data nodes are sorted as follows. + * o First go inode nodes - they are sorted in descending length order. + * o Then go directory entry nodes - they are sorted in hash order, which + * should supposedly optimize 'readdir()'. Direntry nodes with lower parent + * inode number go before direntry nodes with higher parent inode number, + * and direntry nodes with lower name hash values go before direntry nodes + * with higher name hash values. + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct list_head *nondata, int *min) +{ + int err; + struct ubifs_scan_node *snod, *tmp; + + *min = INT_MAX; + + /* Separate data nodes and non-data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + ubifs_assert(snod->type == UBIFS_INO_NODE || + snod->type == UBIFS_DATA_NODE || + snod->type == UBIFS_DENT_NODE || + snod->type == UBIFS_XENT_NODE || + snod->type == UBIFS_TRUN_NODE); + + if (snod->type != UBIFS_INO_NODE && + snod->type != UBIFS_DATA_NODE && + snod->type != UBIFS_DENT_NODE && + snod->type != UBIFS_XENT_NODE) { + /* Probably truncation node, zap it */ + list_del(&snod->list); + kfree(snod); + continue; + } + + ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY || + key_type(c, &snod->key) == UBIFS_INO_KEY || + key_type(c, &snod->key) == UBIFS_DENT_KEY || + key_type(c, &snod->key) == UBIFS_XENT_KEY); + + err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, + snod->offs, 0); + if (err < 0) + return err; + + if (!err) { + /* The node is obsolete, remove it from the list */ + list_del(&snod->list); + kfree(snod); + continue; + } + + if (snod->len < *min) + *min = snod->len; + + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) + list_move_tail(&snod->list, nondata); + } + + /* Sort data and non-data nodes */ + list_sort(c, &sleb->nodes, &data_nodes_cmp); + list_sort(c, nondata, &nondata_nodes_cmp); + + err = dbg_check_data_nodes_order(c, &sleb->nodes); + if (err) + return err; + err = dbg_check_nondata_nodes_order(c, nondata); + if (err) + return err; + return 0; +} + +/** + * move_node - move a node. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * @snod: the mode to move + * @wbuf: write-buffer to move node to + * + * This function moves node @snod to @wbuf, changes TNC correspondingly, and + * destroys @snod. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) +{ + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; + + cond_resched(); + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); + if (err) + return err; + + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + list_del(&snod->list); + kfree(snod); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. This function returns zero in case of success, %-EAGAIN if + * commit is required, and other negative error codes in case of other + * failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + int err, min; + LIST_HEAD(nondata); + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + if (wbuf->lnum == -1) { + /* + * The GC journal head is not set, because it is the first GC + * invocation since mount. + */ + err = switch_gc_head(c); + if (err) + return err; + } + + err = sort_nodes(c, sleb, &nondata, &min); + if (err) + goto out; + + /* Write nodes to their new location. Use the first-fit strategy */ + while (1) { + int avail; + struct ubifs_scan_node *snod, *tmp; + + /* Move data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (snod->len > avail) + /* + * Do not skip data nodes in order to optimize + * bulk-read. + */ + break; + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + + /* Move non-data nodes */ + list_for_each_entry_safe(snod, tmp, &nondata, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (avail < min) + break; + + if (snod->len > avail) { + /* + * Keep going only if this is an inode with + * some data. Otherwise stop and switch the GC + * head. IOW, we assume that data-less inode + * nodes and direntry nodes are roughly of the + * same size. + */ + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || + snod->len == UBIFS_INO_NODE_SZ) + break; + continue; + } + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + + if (list_empty(&sleb->nodes) && list_empty(&nondata)) + break; + + /* + * Waste the rest of the space in the LEB and switch to the + * next LEB. + */ + err = switch_gc_head(c); + if (err) + goto out; + } + + return 0; + +out: + list_splice_tail(&nondata, &sleb->nodes); + return err; +} + +/** + * gc_sync_wbufs - sync write-buffers for GC. + * @c: UBIFS file-system description object + * + * We must guarantee that obsoleting nodes are on flash. Unfortunately they may + * be in a write-buffer instead. That is, a node could be written to a + * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is + * erased before the write-buffer is sync'd and then there is an unclean + * unmount, then an existing node is lost. To avoid this, we sync all + * write-buffers. + * + * This function returns %0 on success or a negative error code on failure. + */ +static int gc_sync_wbufs(struct ubifs_info *c) +{ + int err, i; + + for (i = 0; i < c->jhead_cnt; i++) { + if (i == GCHD) + continue; + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + return 0; +} + +/** + * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock. + * @c: UBIFS file-system description object + * @lp: describes the LEB to garbage collect + * + * This function garbage-collects an LEB and returns one of the @LEB_FREED, + * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is + * required, and other negative error codes in case of failures. + */ +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + int err = 0, lnum = lp->lnum; + + ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 || + c->need_recovery); + ubifs_assert(c->gc_lnum != lnum); + ubifs_assert(wbuf->lnum != lnum); + + if (lp->free + lp->dirty == c->leb_size) { + /* Special case - a free LEB */ + dbg_gc("LEB %d is free, return it", lp->lnum); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + + if (lp->free != c->leb_size) { + /* + * Write buffers must be sync'd before unmapping + * freeable LEBs, because one of them may contain data + * which obsoletes something in 'lp->pnum'. + */ + err = gc_sync_wbufs(c); + if (err) + return err; + err = ubifs_change_one_lp(c, lp->lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + return err; + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + return LEB_RETAINED; + } + + return LEB_FREED; + } + + /* + * We scan the entire LEB even though we only really need to scan up to + * (c->leb_size - lp->free). + */ + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + ubifs_assert(!list_empty(&sleb->nodes)); + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_gced_idx_leb *idx_gc; + + dbg_gc("indexing LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx = snod->node; + int level = le16_to_cpu(idx->level); + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + key_read(c, ubifs_idx_key(c, idx), &snod->key); + err = ubifs_dirty_idx_node(c, &snod->key, level, lnum, + snod->offs); + if (err) + goto out; + } + + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + + idx_gc->lnum = lnum; + idx_gc->unmap = 0; + list_add(&idx_gc->list, &c->idx_gc); + + /* + * Don't release the LEB until after the next commit, because + * it may contain data which is needed for recovery. So + * although we freed this LEB, it will become usable only after + * the commit. + */ + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, + LPROPS_INDEX, 1); + if (err) + goto out; + err = LEB_FREED_IDX; + } else { + dbg_gc("data LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + + err = move_nodes(c, sleb); + if (err) + goto out_inc_seq; + + err = gc_sync_wbufs(c); + if (err) + goto out_inc_seq; + + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); + if (err) + goto out_inc_seq; + + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + err = LEB_RETAINED; + } else { + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out; + + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + + err = LEB_FREED; + } + } + +out: + ubifs_scan_destroy(sleb); + return err; + +out_inc_seq: + /* We may have moved at least some nodes so allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + goto out; +} + +/** + * ubifs_garbage_collect - UBIFS garbage collector. + * @c: UBIFS file-system description object + * @anyway: do GC even if there are free LEBs + * + * This function does out-of-place garbage collection. The return codes are: + * o positive LEB number if the LEB has been freed and may be used; + * o %-EAGAIN if the caller has to run commit; + * o %-ENOSPC if GC failed to make any progress; + * o other negative error codes in case of other errors. + * + * Garbage collector writes data to the journal when GC'ing data LEBs, and just + * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point + * commit may be required. But commit cannot be run from inside GC, because the + * caller might be holding the commit lock, so %-EAGAIN is returned instead; + * And this error code means that the caller has to run commit, and re-run GC + * if there is still no free space. + * + * There are many reasons why this function may return %-EAGAIN: + * o the log is full and there is no space to write an LEB reference for + * @c->gc_lnum; + * o the journal is too large and exceeds size limitations; + * o GC moved indexing LEBs, but they can be used only after the commit; + * o the shrinker fails to find clean znodes to free and requests the commit; + * o etc. + * + * Note, if the file-system is close to be full, this function may return + * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of + * the function. E.g., this happens if the limits on the journal size are too + * tough and GC writes too much to the journal before an LEB is freed. This + * might also mean that the journal is too large, and the TNC becomes to big, + * so that the shrinker is constantly called, finds not clean znodes to free, + * and requests commit. Well, this may also happen if the journal is all right, + * but another kernel process consumes too much memory. Anyway, infinite + * %-EAGAIN may happen, but in some extreme/misconfiguration cases. + */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway) +{ + int i, err, ret, min_space = c->dead_wm; + struct ubifs_lprops lp; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert_cmt_locked(c); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (ubifs_gc_should_commit(c)) + return -EAGAIN; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_error) { + ret = -EROFS; + goto out_unlock; + } + + /* We expect the write-buffer to be empty on entry */ + ubifs_assert(!wbuf->used); + + for (i = 0; ; i++) { + int space_before, space_after; + + cond_resched(); + + /* Give the commit an opportunity to run */ + if (ubifs_gc_should_commit(c)) { + ret = -EAGAIN; + break; + } + + if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) { + /* + * We've done enough iterations. Indexing LEBs were + * moved and will be available after the commit. + */ + dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + break; + } + + if (i > HARD_LEBS_LIMIT) { + /* + * We've moved too many LEBs and have not made + * progress, give up. + */ + dbg_gc("hard limit, -ENOSPC"); + ret = -ENOSPC; + break; + } + + /* + * Empty and freeable LEBs can turn up while we waited for + * the wbuf lock, or while we have been running GC. In that + * case, we should just return one of those instead of + * continuing to GC dirty LEBs. Hence we request + * 'ubifs_find_dirty_leb()' to return an empty LEB if it can. + */ + ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1); + if (ret) { + if (ret == -ENOSPC) + dbg_gc("no more dirty LEBs"); + break; + } + + dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)", + lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty, + min_space); + + space_before = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum == -1) + space_before = 0; + + ret = ubifs_garbage_collect_leb(c, &lp); + if (ret < 0) { + if (ret == -EAGAIN) { + /* + * This is not error, so we have to return the + * LEB to lprops. But if 'ubifs_return_leb()' + * fails, its failure code is propagated to the + * caller instead of the original '-EAGAIN'. + */ + err = ubifs_return_leb(c, lp.lnum); + if (err) + ret = err; + break; + } + goto out; + } + + if (ret == LEB_FREED) { + /* An LEB has been freed and is ready for use */ + dbg_gc("LEB %d freed, return", lp.lnum); + ret = lp.lnum; + break; + } + + if (ret == LEB_FREED_IDX) { + /* + * This was an indexing LEB and it cannot be + * immediately used. And instead of requesting the + * commit straight away, we try to garbage collect some + * more. + */ + dbg_gc("indexing LEB %d freed, continue", lp.lnum); + continue; + } + + ubifs_assert(ret == LEB_RETAINED); + space_after = c->leb_size - wbuf->offs - wbuf->used; + dbg_gc("LEB %d retained, freed %d bytes", lp.lnum, + space_after - space_before); + + if (space_after > space_before) { + /* GC makes progress, keep working */ + min_space >>= 1; + if (min_space < c->dead_wm) + min_space = c->dead_wm; + continue; + } + + dbg_gc("did not make progress"); + + /* + * GC moved an LEB bud have not done any progress. This means + * that the previous GC head LEB contained too few free space + * and the LEB which was GC'ed contained only large nodes which + * did not fit that space. + * + * We can do 2 things: + * 1. pick another LEB in a hope it'll contain a small node + * which will fit the space we have at the end of current GC + * head LEB, but there is no guarantee, so we try this out + * unless we have already been working for too long; + * 2. request an LEB with more dirty space, which will force + * 'ubifs_find_dirty_leb()' to start scanning the lprops + * table, instead of just picking one from the heap + * (previously it already picked the dirtiest LEB). + */ + if (i < SOFT_LEBS_LIMIT) { + dbg_gc("try again"); + continue; + } + + min_space <<= 1; + if (min_space > c->dark_wm) + min_space = c->dark_wm; + dbg_gc("set min. space to %d", min_space); + } + + if (ret == -ENOSPC && !list_empty(&c->idx_gc)) { + dbg_gc("no space, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + if (!err) + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) { + ret = err; + goto out; + } +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return ret; + +out: + ubifs_assert(ret < 0); + ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); + ubifs_wbuf_sync_nolock(wbuf); + ubifs_ro_mode(c, ret); + mutex_unlock(&wbuf->io_mutex); + ubifs_return_leb(c, lp.lnum); + return ret; +} + +/** + * ubifs_gc_start_commit - garbage collection at start of commit. + * @c: UBIFS file-system description object + * + * If a LEB has only dirty and free space, then we may safely unmap it and make + * it free. Note, we cannot do this with indexing LEBs because dirty space may + * correspond index nodes that are required for recovery. In that case, the + * LEB cannot be unmapped until after the next commit. + * + * This function returns %0 upon success and a negative error code upon failure. + */ +int ubifs_gc_start_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + const struct ubifs_lprops *lp; + int err = 0, flags; + + ubifs_get_lprops(c); + + /* + * Unmap (non-index) freeable LEBs. Note that recovery requires that all + * wbufs are sync'd before this, which is done in 'do_commit()'. + */ + while (1) { + lp = ubifs_fast_find_freeable(c); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + goto out; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + } + + /* Mark GC'd index LEBs OK to unmap after this commit finishes */ + list_for_each_entry(idx_gc, &c->idx_gc, list) + idx_gc->unmap = 1; + + /* Record index freeable LEBs for unmapping after commit */ + while (1) { + lp = ubifs_fast_find_frdi_idx(c); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(lp->flags & LPROPS_INDEX); + /* Don't release the LEB until after the next commit */ + flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + kfree(idx_gc); + goto out; + } + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + idx_gc->lnum = lp->lnum; + idx_gc->unmap = 1; + list_add(&idx_gc->list, &c->idx_gc); + } +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_gc_end_commit - garbage collection at end of commit. + * @c: UBIFS file-system description object + * + * This function completes out-of-place garbage collection of index LEBs. + */ +int ubifs_gc_end_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc, *tmp; + struct ubifs_wbuf *wbuf; + int err = 0; + + wbuf = &c->jheads[GCHD].wbuf; + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list) + if (idx_gc->unmap) { + dbg_gc("LEB %d", idx_gc->lnum); + err = ubifs_leb_unmap(c, idx_gc->lnum); + if (err) + goto out; + err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC, + LPROPS_NC, 0, LPROPS_TAKEN, -1); + if (err) + goto out; + list_del(&idx_gc->list); + kfree(idx_gc); + } +out: + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_destroy_idx_gc - destroy idx_gc list. + * @c: UBIFS file-system description object + * + * This function destroys the @c->idx_gc list. It is called when unmounting + * so locks are not needed. Returns zero in case of success and a negative + * error code in case of failure. + */ +void ubifs_destroy_idx_gc(struct ubifs_info *c) +{ + while (!list_empty(&c->idx_gc)) { + struct ubifs_gced_idx_leb *idx_gc; + + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, + list); + c->idx_gc_cnt -= 1; + list_del(&idx_gc->list); + kfree(idx_gc); + } +} + +/** + * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list. + * @c: UBIFS file-system description object + * + * Called during start commit so locks are not needed. + */ +int ubifs_get_idx_gc_leb(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + int lnum; + + if (list_empty(&c->idx_gc)) + return -ENOSPC; + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list); + lnum = idx_gc->lnum; + /* c->idx_gc_cnt is updated by the caller when lprops are updated */ + list_del(&idx_gc->list); + kfree(idx_gc); + return lnum; +} diff --git a/kernel/fs/ubifs/io.c b/kernel/fs/ubifs/io.c new file mode 100644 index 000000000..97be41215 --- /dev/null +++ b/kernel/fs/ubifs/io.c @@ -0,0 +1,1150 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements UBIFS I/O subsystem which provides various I/O-related + * helper functions (reading/writing/checking/validating nodes) and implements + * write-buffering support. Write buffers help to save space which otherwise + * would have been wasted for padding to the nearest minimal I/O unit boundary. + * Instead, data first goes to the write-buffer and is flushed when the + * buffer is full or when it is not used for some time (by timer). This is + * similar to the mechanism is used by JFFS2. + * + * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum + * write size (@c->max_write_size). The latter is the maximum amount of bytes + * the underlying flash is able to program at a time, and writing in + * @c->max_write_size units should presumably be faster. Obviously, + * @c->min_io_size <= @c->max_write_size. Write-buffers are of + * @c->max_write_size bytes in size for maximum performance. However, when a + * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size + * boundary) which contains data is written, not the whole write-buffer, + * because this is more space-efficient. + * + * This optimization adds few complications to the code. Indeed, on the one + * hand, we want to write in optimal @c->max_write_size bytes chunks, which + * also means aligning writes at the @c->max_write_size bytes offsets. On the + * other hand, we do not want to waste space when synchronizing the write + * buffer, so during synchronization we writes in smaller chunks. And this makes + * the next write offset to be not aligned to @c->max_write_size bytes. So the + * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned + * to @c->max_write_size bytes again. We do this by temporarily shrinking + * write-buffer size (@wbuf->size). + * + * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by + * mutexes defined inside these objects. Since sometimes upper-level code + * has to lock the write-buffer (e.g. journal space reservation code), many + * functions related to write-buffers have "nolock" suffix which means that the + * caller has to lock the write-buffer before calling this function. + * + * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not + * aligned, UBIFS starts the next node from the aligned address, and the padded + * bytes may contain any rubbish. In other words, UBIFS does not put padding + * bytes in those small gaps. Common headers of nodes store real node lengths, + * not aligned lengths. Indexing nodes also store real lengths in branches. + * + * UBIFS uses padding when it pads to the next min. I/O unit. In this case it + * uses padding nodes or padding bytes, if the padding node does not fit. + * + * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when + * they are read from the flash media. + */ + +#include +#include +#include "ubifs.h" + +/** + * ubifs_ro_mode - switch UBIFS to read read-only mode. + * @c: UBIFS file-system description object + * @err: error code which is the reason of switching to R/O mode + */ +void ubifs_ro_mode(struct ubifs_info *c, int err) +{ + if (!c->ro_error) { + c->ro_error = 1; + c->no_chk_data_crc = 0; + c->vfs_sb->s_flags |= MS_RDONLY; + ubifs_warn(c, "switched to read-only mode, error %d", err); + dump_stack(); + } +} + +/* + * Below are simple wrappers over UBI I/O functions which include some + * additional checks and UBIFS debugging stuff. See corresponding UBI function + * for more information. + */ + +int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, + int len, int even_ebadmsg) +{ + int err; + + err = ubi_read(c->ubi, lnum, buf, offs, len); + /* + * In case of %-EBADMSG print the error message only if the + * @even_ebadmsg is true. + */ + if (err && (err != -EBADMSG || even_ebadmsg)) { + ubifs_err(c, "reading %d bytes from LEB %d:%d failed, error %d", + len, lnum, offs, err); + dump_stack(); + } + return err; +} + +int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_write(c->ubi, lnum, buf, offs, len); + else + err = dbg_leb_write(c, lnum, buf, offs, len); + if (err) { + ubifs_err(c, "writing %d bytes to LEB %d:%d failed, error %d", + len, lnum, offs, err); + ubifs_ro_mode(c, err); + dump_stack(); + } + return err; +} + +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_change(c->ubi, lnum, buf, len); + else + err = dbg_leb_change(c, lnum, buf, len); + if (err) { + ubifs_err(c, "changing %d bytes in LEB %d failed, error %d", + len, lnum, err); + ubifs_ro_mode(c, err); + dump_stack(); + } + return err; +} + +int ubifs_leb_unmap(struct ubifs_info *c, int lnum) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_unmap(c->ubi, lnum); + else + err = dbg_leb_unmap(c, lnum); + if (err) { + ubifs_err(c, "unmap LEB %d failed, error %d", lnum, err); + ubifs_ro_mode(c, err); + dump_stack(); + } + return err; +} + +int ubifs_leb_map(struct ubifs_info *c, int lnum) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_map(c->ubi, lnum); + else + err = dbg_leb_map(c, lnum); + if (err) { + ubifs_err(c, "mapping LEB %d failed, error %d", lnum, err); + ubifs_ro_mode(c, err); + dump_stack(); + } + return err; +} + +int ubifs_is_mapped(const struct ubifs_info *c, int lnum) +{ + int err; + + err = ubi_is_mapped(c->ubi, lnum); + if (err < 0) { + ubifs_err(c, "ubi_is_mapped failed for LEB %d, error %d", + lnum, err); + dump_stack(); + } + return err; +} + +/** + * ubifs_check_node - check node. + * @c: UBIFS file-system description object + * @buf: node to check + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @quiet: print no messages + * @must_chk_crc: indicates whether to always check the CRC + * + * This function checks node magic number and CRC checksum. This function also + * validates node length to prevent UBIFS from becoming crazy when an attacker + * feeds it a file-system image with incorrect nodes. For example, too large + * node length in the common header could cause UBIFS to read memory outside of + * allocated buffer when checking the CRC checksum. + * + * This function may skip data nodes CRC checking if @c->no_chk_data_crc is + * true, which is controlled by corresponding UBIFS mount option. However, if + * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is + * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are + * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC + * is checked. This is because during mounting or re-mounting from R/O mode to + * R/W mode we may read journal nodes (when replying the journal or doing the + * recovery) and the journal nodes may potentially be corrupted, so checking is + * required. + * + * This function returns zero in case of success and %-EUCLEAN in case of bad + * CRC or magic. + */ +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet, int must_chk_crc) +{ + int err = -EINVAL, type, node_len; + uint32_t crc, node_crc, magic; + const struct ubifs_ch *ch = buf; + + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + + magic = le32_to_cpu(ch->magic); + if (magic != UBIFS_NODE_MAGIC) { + if (!quiet) + ubifs_err(c, "bad magic %#08x, expected %#08x", + magic, UBIFS_NODE_MAGIC); + err = -EUCLEAN; + goto out; + } + + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + if (!quiet) + ubifs_err(c, "bad node type %d", type); + goto out; + } + + node_len = le32_to_cpu(ch->len); + if (node_len + offs > c->leb_size) + goto out_len; + + if (c->ranges[type].max_len == 0) { + if (node_len != c->ranges[type].len) + goto out_len; + } else if (node_len < c->ranges[type].min_len || + node_len > c->ranges[type].max_len) + goto out_len; + + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting && + !c->remounting_rw && c->no_chk_data_crc) + return 0; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) { + if (!quiet) + ubifs_err(c, "bad CRC: calculated %#08x, read %#08x", + crc, node_crc); + err = -EUCLEAN; + goto out; + } + + return 0; + +out_len: + if (!quiet) + ubifs_err(c, "bad node length %d", node_len); +out: + if (!quiet) { + ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); + ubifs_dump_node(c, buf); + dump_stack(); + } + return err; +} + +/** + * ubifs_pad - pad flash space. + * @c: UBIFS file-system description object + * @buf: buffer to put padding to + * @pad: how many bytes to pad + * + * The flash media obliges us to write only in chunks of %c->min_io_size and + * when we have to write less data we add padding node to the write-buffer and + * pad it to the next minimal I/O unit's boundary. Padding nodes help when the + * media is being scanned. If the amount of wasted space is not enough to fit a + * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes + * pattern (%UBIFS_PADDING_BYTE). + * + * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is + * used. + */ +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad) +{ + uint32_t crc; + + ubifs_assert(pad >= 0 && !(pad & 7)); + + if (pad >= UBIFS_PAD_NODE_SZ) { + struct ubifs_ch *ch = buf; + struct ubifs_pad_node *pad_node = buf; + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->node_type = UBIFS_PAD_NODE; + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->padding[0] = ch->padding[1] = 0; + ch->sqnum = 0; + ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ); + pad -= UBIFS_PAD_NODE_SZ; + pad_node->pad_len = cpu_to_le32(pad); + crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8); + ch->crc = cpu_to_le32(crc); + memset(buf + UBIFS_PAD_NODE_SZ, 0, pad); + } else if (pad > 0) + /* Too little space, padding node won't fit */ + memset(buf, UBIFS_PADDING_BYTE, pad); +} + +/** + * next_sqnum - get next sequence number. + * @c: UBIFS file-system description object + */ +static unsigned long long next_sqnum(struct ubifs_info *c) +{ + unsigned long long sqnum; + + spin_lock(&c->cnt_lock); + sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + + if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) { + if (sqnum >= SQNUM_WATERMARK) { + ubifs_err(c, "sequence number overflow %llu, end of life", + sqnum); + ubifs_ro_mode(c, -EINVAL); + } + ubifs_warn(c, "running out of sequence numbers, end of life soon"); + } + + return sqnum; +} + +/** + * ubifs_prepare_node - prepare node to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @pad: if the buffer has to be padded + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC, fills the common header, and adds proper padding up to + * the next minimum I/O unit if @pad is not zero. + */ +void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); + + if (pad) { + len = ALIGN(len, 8); + pad = ALIGN(len, c->min_io_size) - len; + ubifs_pad(c, node + len, pad); + } +} + +/** + * ubifs_prep_grp_node - prepare node of a group to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @last: indicates the last node of the group + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC and fills the common header. + */ +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + if (last) + ch->group_type = UBIFS_LAST_OF_NODE_GROUP; + else + ch->group_type = UBIFS_IN_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); +} + +/** + * wbuf_timer_callback - write-buffer timer callback function. + * @timer: timer data (write-buffer descriptor) + * + * This function is called when the write-buffer timer expires. + */ +static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) +{ + struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); + + dbg_io("jhead %s", dbg_jhead(wbuf->jhead)); + wbuf->need_sync = 1; + wbuf->c->need_wbuf_sync = 1; + ubifs_wake_up_bgt(wbuf->c); + return HRTIMER_NORESTART; +} + +/** + * new_wbuf_timer - start new write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + ubifs_assert(!hrtimer_active(&wbuf->timer)); + + if (wbuf->no_timer) + return; + dbg_io("set timer for jhead %s, %llu-%llu millisecs", + dbg_jhead(wbuf->jhead), + div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), + div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, + USEC_PER_SEC)); + hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + HRTIMER_MODE_REL); +} + +/** + * cancel_wbuf_timer - cancel write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + if (wbuf->no_timer) + return; + wbuf->need_sync = 0; + hrtimer_cancel(&wbuf->timer); +} + +/** + * ubifs_wbuf_sync_nolock - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This function synchronizes write-buffer @buf and returns zero in case of + * success or a negative error code in case of failure. + * + * Note, although write-buffers are of @c->max_write_size, this function does + * not necessarily writes all @c->max_write_size bytes to the flash. Instead, + * if the write-buffer is only partially filled with data, only the used part + * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized. + * This way we waste less space. + */ +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) +{ + struct ubifs_info *c = wbuf->c; + int err, dirt, sync_len; + + cancel_wbuf_timer_nolock(wbuf); + if (!wbuf->used || wbuf->lnum == -1) + /* Write-buffer is empty or not seeked */ + return 0; + + dbg_io("LEB %d:%d, %d bytes, jhead %s", + wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); + ubifs_assert(!(wbuf->avail & 7)); + ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); + + if (c->ro_error) + return -EROFS; + + /* + * Do not write whole write buffer but write only the minimum necessary + * amount of min. I/O units. + */ + sync_len = ALIGN(wbuf->used, c->min_io_size); + dirt = sync_len - wbuf->used; + if (dirt) + ubifs_pad(c, wbuf->buf + wbuf->used, dirt); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len); + if (err) + return err; + + spin_lock(&wbuf->lock); + wbuf->offs += sync_len; + /* + * Now @wbuf->offs is not necessarily aligned to @c->max_write_size. + * But our goal is to optimize writes and make sure we write in + * @c->max_write_size chunks and to @c->max_write_size-aligned offset. + * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make + * sure that @wbuf->offs + @wbuf->size is aligned to + * @c->max_write_size. This way we make sure that after next + * write-buffer flush we are again at the optimal offset (aligned to + * @c->max_write_size). + */ + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + + if (wbuf->sync_callback) + err = wbuf->sync_callback(c, wbuf->lnum, + c->leb_size - wbuf->offs, dirt); + return err; +} + +/** + * ubifs_wbuf_seek_nolock - seek write-buffer. + * @wbuf: write-buffer + * @lnum: logical eraseblock number to seek to + * @offs: logical eraseblock offset to seek to + * + * This function targets the write-buffer to logical eraseblock @lnum:@offs. + * The write-buffer has to be empty. Returns zero in case of success and a + * negative error code in case of failure. + */ +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs) +{ + const struct ubifs_info *c = wbuf->c; + + dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead)); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); + ubifs_assert(offs >= 0 && offs <= c->leb_size); + ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); + ubifs_assert(lnum != wbuf->lnum); + ubifs_assert(wbuf->used == 0); + + spin_lock(&wbuf->lock); + wbuf->lnum = lnum; + wbuf->offs = offs; + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; + wbuf->used = 0; + spin_unlock(&wbuf->lock); + + return 0; +} + +/** + * ubifs_bg_wbufs_sync - synchronize write-buffers. + * @c: UBIFS file-system description object + * + * This function is called by background thread to synchronize write-buffers. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_bg_wbufs_sync(struct ubifs_info *c) +{ + int err, i; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (!c->need_wbuf_sync) + return 0; + c->need_wbuf_sync = 0; + + if (c->ro_error) { + err = -EROFS; + goto out_timers; + } + + dbg_io("synchronize"); + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + cond_resched(); + + /* + * If the mutex is locked then wbuf is being changed, so + * synchronization is not necessary. + */ + if (mutex_is_locked(&wbuf->io_mutex)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (!wbuf->need_sync) { + mutex_unlock(&wbuf->io_mutex); + continue; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + if (err) { + ubifs_err(c, "cannot sync write-buffer, error %d", err); + ubifs_ro_mode(c, err); + goto out_timers; + } + } + + return 0; + +out_timers: + /* Cancel all timers to prevent repeated errors */ + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + cancel_wbuf_timer_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + } + return err; +} + +/** + * ubifs_wbuf_write_nolock - write data to flash via write-buffer. + * @wbuf: write-buffer + * @buf: node to write + * @len: node length + * + * This function writes data to flash via write-buffer @wbuf. This means that + * the last piece of the node won't reach the flash media immediately if it + * does not take whole max. write unit (@c->max_write_size). Instead, the node + * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or + * because more data are appended to the write-buffer). + * + * This function returns zero in case of success and a negative error code in + * case of failure. If the node cannot be written because there is no more + * space in this logical eraseblock, %-ENOSPC is returned. + */ +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) +{ + struct ubifs_info *c = wbuf->c; + int err, written, n, aligned_len = ALIGN(len, 8); + + dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, + dbg_ntype(((struct ubifs_ch *)buf)->node_type), + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used); + ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); + ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); + ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); + ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); + ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); + + if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { + err = -ENOSPC; + goto out; + } + + cancel_wbuf_timer_nolock(wbuf); + + if (c->ro_error) + return -EROFS; + + if (aligned_len <= wbuf->avail) { + /* + * The node is not very large and fits entirely within + * write-buffer. + */ + memcpy(wbuf->buf + wbuf->used, buf, len); + + if (aligned_len == wbuf->avail) { + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, + wbuf->offs, wbuf->size); + if (err) + goto out; + + spin_lock(&wbuf->lock); + wbuf->offs += wbuf->size; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + } else { + spin_lock(&wbuf->lock); + wbuf->avail -= aligned_len; + wbuf->used += aligned_len; + spin_unlock(&wbuf->lock); + } + + goto exit; + } + + written = 0; + + if (wbuf->used) { + /* + * The node is large enough and does not fit entirely within + * current available space. We have to fill and flush + * write-buffer and switch to the next max. write unit. + */ + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size); + if (err) + goto out; + + wbuf->offs += wbuf->size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written += wbuf->avail; + } else if (wbuf->offs & (c->max_write_size - 1)) { + /* + * The write-buffer offset is not aligned to + * @c->max_write_size and @wbuf->size is less than + * @c->max_write_size. Write @wbuf->size bytes to make sure the + * following writes are done in optimal @c->max_write_size + * chunks. + */ + dbg_io("write %d bytes to LEB %d:%d", + wbuf->size, wbuf->lnum, wbuf->offs); + err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs, + wbuf->size); + if (err) + goto out; + + wbuf->offs += wbuf->size; + len -= wbuf->size; + aligned_len -= wbuf->size; + written += wbuf->size; + } + + /* + * The remaining data may take more whole max. write units, so write the + * remains multiple to max. write unit size directly to the flash media. + * We align node length to 8-byte boundary because we anyway flash wbuf + * if the remaining space is less than 8 bytes. + */ + n = aligned_len >> c->max_write_shift; + if (n) { + n <<= c->max_write_shift; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, + wbuf->offs); + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, n); + if (err) + goto out; + wbuf->offs += n; + aligned_len -= n; + len -= n; + written += n; + } + + spin_lock(&wbuf->lock); + if (aligned_len) + /* + * And now we have what's left and what does not take whole + * max. write unit, so write it to the write-buffer and we are + * done. + */ + memcpy(wbuf->buf, buf + written, len); + + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size - aligned_len; + wbuf->used = aligned_len; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + +exit: + if (wbuf->sync_callback) { + int free = c->leb_size - wbuf->offs - wbuf->used; + + err = wbuf->sync_callback(c, wbuf->lnum, free, 0); + if (err) + goto out; + } + + if (wbuf->used) + new_wbuf_timer_nolock(wbuf); + + return 0; + +out: + ubifs_err(c, "cannot write %d bytes to LEB %d:%d, error %d", + len, wbuf->lnum, wbuf->offs, err); + ubifs_dump_node(c, buf); + dump_stack(); + ubifs_dump_leb(c, wbuf->lnum); + return err; +} + +/** + * ubifs_write_node - write node to the media. + * @c: UBIFS file-system description object + * @buf: the node to write + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function automatically fills node magic number, assigns sequence + * number, and calculates node CRC checksum. The length of the @buf buffer has + * to be aligned to the minimal I/O unit size. This function automatically + * appends padding node and padding bytes if needed. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, + int offs) +{ + int err, buf_len = ALIGN(len, c->min_io_size); + + dbg_io("LEB %d:%d, %s, length %d (aligned %d)", + lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len, + buf_len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); + ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); + + if (c->ro_error) + return -EROFS; + + ubifs_prepare_node(c, buf, len, 1); + err = ubifs_leb_write(c, lnum, buf, offs, buf_len); + if (err) + ubifs_dump_node(c, buf); + + return err; +} + +/** + * ubifs_read_node_wbuf - read node from the media or write-buffer. + * @wbuf: wbuf to check for un-written data + * @buf: buffer to read to + * @type: node type + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and length, checks it and stores + * in @buf. If the node partially or fully sits in the write-buffer, this + * function takes data from the buffer, otherwise it reads the flash media. + * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative + * error code in case of failure. + */ +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs) +{ + const struct ubifs_info *c = wbuf->c; + int err, rlen, overlap; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs, + dbg_ntype(type), len, dbg_jhead(wbuf->jhead)); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubifs_read_node(c, buf, type, len, lnum, offs); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) { + /* Read everything that goes before write-buffer */ + err = ubifs_leb_read(c, lnum, buf, offs, rlen, 0); + if (err && err != -EBADMSG) + return err; + } + + if (type != ch->node_type) { + ubifs_err(c, "bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + if (err) { + ubifs_err(c, "expected node type %d", type); + return err; + } + + rlen = le32_to_cpu(ch->len); + if (rlen != len) { + ubifs_err(c, "bad node length %d, expected %d", rlen, len); + goto out; + } + + return 0; + +out: + ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); + ubifs_dump_node(c, buf); + dump_stack(); + return -EINVAL; +} + +/** + * ubifs_read_node - read node. + * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and and length, checks it and + * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched + * and a negative error code in case of failure. + */ +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs) +{ + int err, l; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + err = ubifs_leb_read(c, lnum, buf, offs, len, 0); + if (err && err != -EBADMSG) + return err; + + if (type != ch->node_type) { + ubifs_errc(c, "bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + if (err) { + ubifs_errc(c, "expected node type %d", type); + return err; + } + + l = le32_to_cpu(ch->len); + if (l != len) { + ubifs_errc(c, "bad node length %d, expected %d", l, len); + goto out; + } + + return 0; + +out: + ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum, + offs, ubi_is_mapped(c->ubi, lnum)); + if (!c->probing) { + ubifs_dump_node(c, buf); + dump_stack(); + } + return -EINVAL; +} + +/** + * ubifs_wbuf_init - initialize write-buffer. + * @c: UBIFS file-system description object + * @wbuf: write-buffer to initialize + * + * This function initializes write-buffer. Returns zero in case of success + * %-ENOMEM in case of failure. + */ +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) +{ + size_t size; + + wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL); + if (!wbuf->buf) + return -ENOMEM; + + size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + wbuf->inodes = kmalloc(size, GFP_KERNEL); + if (!wbuf->inodes) { + kfree(wbuf->buf); + wbuf->buf = NULL; + return -ENOMEM; + } + + wbuf->used = 0; + wbuf->lnum = wbuf->offs = -1; + /* + * If the LEB starts at the max. write size aligned address, then + * write-buffer size has to be set to @c->max_write_size. Otherwise, + * set it to something smaller so that it ends at the closest max. + * write size boundary. + */ + size = c->max_write_size - (c->leb_start % c->max_write_size); + wbuf->avail = wbuf->size = size; + wbuf->sync_callback = NULL; + mutex_init(&wbuf->io_mutex); + spin_lock_init(&wbuf->lock); + wbuf->c = c; + wbuf->next_ino = 0; + + hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + wbuf->timer.function = wbuf_timer_callback_nolock; + wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0); + wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT; + wbuf->delta *= 1000000000ULL; + ubifs_assert(wbuf->delta <= ULONG_MAX); + return 0; +} + +/** + * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. + * @wbuf: the write-buffer where to add + * @inum: the inode number + * + * This function adds an inode number to the inode array of the write-buffer. + */ +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum) +{ + if (!wbuf->buf) + /* NOR flash or something similar */ + return; + + spin_lock(&wbuf->lock); + if (wbuf->used) + wbuf->inodes[wbuf->next_ino++] = inum; + spin_unlock(&wbuf->lock); +} + +/** + * wbuf_has_ino - returns if the wbuf contains data from the inode. + * @wbuf: the write-buffer + * @inum: the inode number + * + * This function returns with %1 if the write-buffer contains some data from the + * given inode otherwise it returns with %0. + */ +static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum) +{ + int i, ret = 0; + + spin_lock(&wbuf->lock); + for (i = 0; i < wbuf->next_ino; i++) + if (inum == wbuf->inodes[i]) { + ret = 1; + break; + } + spin_unlock(&wbuf->lock); + + return ret; +} + +/** + * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode. + * @c: UBIFS file-system description object + * @inode: inode to synchronize + * + * This function synchronizes write-buffers which contain nodes belonging to + * @inode. Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode) +{ + int i, err = 0; + + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + if (i == GCHD) + /* + * GC head is special, do not look at it. Even if the + * head contains something related to this inode, it is + * a _copy_ of corresponding on-flash node which sits + * somewhere else. + */ + continue; + + if (!wbuf_has_ino(wbuf, inode->i_ino)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (wbuf_has_ino(wbuf, inode->i_ino)) + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + + if (err) { + ubifs_ro_mode(c, err); + return err; + } + } + return 0; +} diff --git a/kernel/fs/ubifs/ioctl.c b/kernel/fs/ubifs/ioctl.c new file mode 100644 index 000000000..3c7b29de0 --- /dev/null +++ b/kernel/fs/ubifs/ioctl.c @@ -0,0 +1,205 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Zoltan Sogor + * Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements EXT2-compatible extended attribute ioctl() calls */ + +#include +#include +#include "ubifs.h" + +/** + * ubifs_set_inode_flags - set VFS inode flags. + * @inode: VFS inode to set flags for + * + * This function propagates flags from UBIFS inode object to VFS inode object. + */ +void ubifs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = ubifs_inode(inode)->flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + if (flags & UBIFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & UBIFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & UBIFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & UBIFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* + * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. + * @ioctl_flags: flags to convert + * + * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * (@UBIFS_COMPR_FL, etc). + */ +static int ioctl2ubifs(int ioctl_flags) +{ + int ubifs_flags = 0; + + if (ioctl_flags & FS_COMPR_FL) + ubifs_flags |= UBIFS_COMPR_FL; + if (ioctl_flags & FS_SYNC_FL) + ubifs_flags |= UBIFS_SYNC_FL; + if (ioctl_flags & FS_APPEND_FL) + ubifs_flags |= UBIFS_APPEND_FL; + if (ioctl_flags & FS_IMMUTABLE_FL) + ubifs_flags |= UBIFS_IMMUTABLE_FL; + if (ioctl_flags & FS_DIRSYNC_FL) + ubifs_flags |= UBIFS_DIRSYNC_FL; + + return ubifs_flags; +} + +/* + * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. + * @ubifs_flags: flags to convert + * + * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags + * (@FS_COMPR_FL, etc). + */ +static int ubifs2ioctl(int ubifs_flags) +{ + int ioctl_flags = 0; + + if (ubifs_flags & UBIFS_COMPR_FL) + ioctl_flags |= FS_COMPR_FL; + if (ubifs_flags & UBIFS_SYNC_FL) + ioctl_flags |= FS_SYNC_FL; + if (ubifs_flags & UBIFS_APPEND_FL) + ioctl_flags |= FS_APPEND_FL; + if (ubifs_flags & UBIFS_IMMUTABLE_FL) + ioctl_flags |= FS_IMMUTABLE_FL; + if (ubifs_flags & UBIFS_DIRSYNC_FL) + ioctl_flags |= FS_DIRSYNC_FL; + + return ioctl_flags; +} + +static int setflags(struct inode *inode, int flags) +{ + int oldflags, err, release; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ui->data_len }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + mutex_lock(&ui->ui_mutex); + oldflags = ubifs2ioctl(ui->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock; + } + } + + ui->flags = ioctl2ubifs(flags); + ubifs_set_inode_flags(inode); + inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = write_inode_now(inode, 1); + return err; + +out_unlock: + ubifs_err(c, "can't modify inode %lu attributes", inode->i_ino); + mutex_unlock(&ui->ui_mutex); + ubifs_release_budget(c, &req); + return err; +} + +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int flags, err; + struct inode *inode = file_inode(file); + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = ubifs2ioctl(ubifs_inode(inode)->flags); + + dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags); + return put_user(flags, (int __user *) arg); + + case FS_IOC_SETFLAGS: { + if (IS_RDONLY(inode)) + return -EROFS; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~FS_DIRSYNC_FL; + + /* + * Make sure the file-system is read-write and make sure it + * will not become read-only while we are changing the flags. + */ + err = mnt_want_write_file(file); + if (err) + return err; + dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); + err = setflags(inode, flags); + mnt_drop_write_file(file); + return err; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff --git a/kernel/fs/ubifs/journal.c b/kernel/fs/ubifs/journal.c new file mode 100644 index 000000000..0b9da5b6e --- /dev/null +++ b/kernel/fs/ubifs/journal.c @@ -0,0 +1,1466 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS journal. + * + * The journal consists of 2 parts - the log and bud LEBs. The log has fixed + * length and position, while a bud logical eraseblock is any LEB in the main + * area. Buds contain file system data - data nodes, inode nodes, etc. The log + * contains only references to buds and some other stuff like commit + * start node. The idea is that when we commit the journal, we do + * not copy the data, the buds just become indexed. Since after the commit the + * nodes in bud eraseblocks become leaf nodes of the file system index tree, we + * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will + * become leafs in the future. + * + * The journal is multi-headed because we want to write data to the journal as + * optimally as possible. It is nice to have nodes belonging to the same inode + * in one LEB, so we may write data owned by different inodes to different + * journal heads, although at present only one data head is used. + * + * For recovery reasons, the base head contains all inode nodes, all directory + * entry nodes and all truncate nodes. This means that the other heads contain + * only data nodes. + * + * Bud LEBs may be half-indexed. For example, if the bud was not full at the + * time of commit, the bud is retained to continue to be used in the journal, + * even though the "front" of the LEB is now indexed. In that case, the log + * reference contains the offset where the bud starts for the purposes of the + * journal. + * + * The journal size has to be limited, because the larger is the journal, the + * longer it takes to mount UBIFS (scanning the journal) and the more memory it + * takes (indexing in the TNC). + * + * All the journal write operations like 'ubifs_jnl_update()' here, which write + * multiple UBIFS nodes to the journal at one go, are atomic with respect to + * unclean reboots. Should the unclean reboot happen, the recovery code drops + * all the nodes. + */ + +#include "ubifs.h" + +/** + * zero_ino_node_unused - zero out unused fields of an on-flash inode node. + * @ino: the inode to zero out + */ +static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) +{ + memset(ino->padding1, 0, 4); + memset(ino->padding2, 0, 26); +} + +/** + * zero_dent_node_unused - zero out unused fields of an on-flash directory + * entry node. + * @dent: the directory entry to zero out + */ +static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) +{ + dent->padding1 = 0; + memset(dent->padding2, 0, 4); +} + +/** + * zero_data_node_unused - zero out unused fields of an on-flash data node. + * @data: the data node to zero out + */ +static inline void zero_data_node_unused(struct ubifs_data_node *data) +{ + memset(data->padding, 0, 2); +} + +/** + * zero_trun_node_unused - zero out unused fields of an on-flash truncation + * node. + * @trun: the truncation node to zero out + */ +static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) +{ + memset(trun->padding, 0, 12); +} + +/** + * reserve_space - reserve space in the journal. + * @c: UBIFS file-system description object + * @jhead: journal head number + * @len: node length + * + * This function reserves space in journal head @head. If the reservation + * succeeded, the journal head stays locked and later has to be unlocked using + * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock + * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and + * other negative error codes in case of other failures. + */ +static int reserve_space(struct ubifs_info *c, int jhead, int len) +{ + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + /* + * Typically, the base head has smaller nodes written to it, so it is + * better to try to allocate space at the ends of eraseblocks. This is + * what the squeeze parameter does. + */ + ubifs_assert(!c->ro_media && !c->ro_mount); + squeeze = (jhead == BASEHD); +again: + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_error) { + err = -EROFS; + goto out_unlock; + } + + avail = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum != -1 && avail >= len) + return 0; + + /* + * Write buffer wasn't seek'ed or there is no enough space - look for an + * LEB with some empty space. + */ + lnum = ubifs_find_free_space(c, len, &offs, squeeze); + if (lnum >= 0) + goto out; + + err = lnum; + if (err != -ENOSPC) + goto out_unlock; + + /* + * No free space, we have to run garbage collector to make + * some. But the write-buffer mutex has to be unlocked because + * GC also takes it. + */ + dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead)); + mutex_unlock(&wbuf->io_mutex); + + lnum = ubifs_garbage_collect(c, 0); + if (lnum < 0) { + err = lnum; + if (err != -ENOSPC) + return err; + + /* + * GC could not make a free LEB. But someone else may + * have allocated new bud for this journal head, + * because we dropped @wbuf->io_mutex, so try once + * again. + */ + dbg_jnl("GC couldn't make a free LEB for jhead %s", + dbg_jhead(jhead)); + if (retries++ < 2) { + dbg_jnl("retry (%d)", retries); + goto again; + } + + dbg_jnl("return -ENOSPC"); + return err; + } + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead)); + avail = c->leb_size - wbuf->offs - wbuf->used; + + if (wbuf->lnum != -1 && avail >= len) { + /* + * Someone else has switched the journal head and we have + * enough space now. This happens when more than one process is + * trying to write to the same journal head at the same time. + */ + dbg_jnl("return LEB %d back, already have LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs + wbuf->used); + err = ubifs_return_leb(c, lnum); + if (err) + goto out_unlock; + return 0; + } + + offs = 0; + +out: + /* + * Make sure we synchronize the write-buffer before we add the new bud + * to the log. Otherwise we may have a power cut after the log + * reference node for the last bud (@lnum) is written but before the + * write-buffer data are written to the next-to-last bud + * (@wbuf->lnum). And the effect would be that the recovery would see + * that there is corruption in the next-to-last bud. + */ + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out_return; + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs); + if (err) + goto out_unlock; + + return 0; + +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return err; + +out_return: + /* An error occurred and the LEB has to be returned to lprops */ + ubifs_assert(err < 0); + err1 = ubifs_return_leb(c, lnum); + if (err1 && err == -EAGAIN) + /* + * Return original error code only if it is not %-EAGAIN, + * which is not really an error. Otherwise, return the error + * code of 'ubifs_return_leb()'. + */ + err = err1; + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * write_node - write node to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @node: node to write + * @len: node length + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * + * This function writes a node to reserved space of journal head @jhead. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int write_node(struct ubifs_info *c, int jhead, void *node, int len, + int *lnum, int *offs) +{ + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); + ubifs_prepare_node(c, node, len, 0); + + return ubifs_wbuf_write_nolock(wbuf, node, len); +} + +/** + * write_head - write data to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @buf: buffer to write + * @len: length to write + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * @sync: non-zero if the write-buffer has to by synchronized + * + * This function is the same as 'write_node()' but it does not assume the + * buffer it is writing is a node, so it does not prepare it (which means + * initializing common header and calculating CRC). + */ +static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, + int *lnum, int *offs, int sync) +{ + int err; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); + + err = ubifs_wbuf_write_nolock(wbuf, buf, len); + if (err) + return err; + if (sync) + err = ubifs_wbuf_sync_nolock(wbuf); + return err; +} + +/** + * make_reservation - reserve journal space. + * @c: UBIFS file-system description object + * @jhead: journal head + * @len: how many bytes to reserve + * + * This function makes space reservation in journal head @jhead. The function + * takes the commit lock and locks the journal head, and the caller has to + * unlock the head and finish the reservation with 'finish_reservation()'. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, the journal head may be unlocked as soon as the data is written, while + * the commit lock has to be released after the data has been added to the + * TNC. + */ +static int make_reservation(struct ubifs_info *c, int jhead, int len) +{ + int err, cmt_retries = 0, nospc_retries = 0; + +again: + down_read(&c->commit_sem); + err = reserve_space(c, jhead, len); + if (!err) + return 0; + up_read(&c->commit_sem); + + if (err == -ENOSPC) { + /* + * GC could not make any progress. We should try to commit + * once because it could make some dirty space and GC would + * make progress, so make the error -EAGAIN so that the below + * will commit and re-try. + */ + if (nospc_retries++ < 2) { + dbg_jnl("no space, retry"); + err = -EAGAIN; + } + + /* + * This means that the budgeting is incorrect. We always have + * to be able to write to the media, because all operations are + * budgeted. Deletions are not budgeted, though, but we reserve + * an extra LEB for them. + */ + } + + if (err != -EAGAIN) + goto out; + + /* + * -EAGAIN means that the journal is full or too large, or the above + * code wants to do one commit. Do this and re-try. + */ + if (cmt_retries > 128) { + /* + * This should not happen unless the journal size limitations + * are too tough. + */ + ubifs_err(c, "stuck in space allocation"); + err = -ENOSPC; + goto out; + } else if (cmt_retries > 32) + ubifs_warn(c, "too many space allocation re-tries (%d)", + cmt_retries); + + dbg_jnl("-EAGAIN, commit and retry (retried %d times)", + cmt_retries); + cmt_retries += 1; + + err = ubifs_run_commit(c); + if (err) + return err; + goto again; + +out: + ubifs_err(c, "cannot reserve %d bytes in jhead %d, error %d", + len, jhead, err); + if (err == -ENOSPC) { + /* This are some budgeting problems, print useful information */ + down_write(&c->commit_sem); + dump_stack(); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); + cmt_retries = dbg_check_lprops(c); + up_write(&c->commit_sem); + } + return err; +} + +/** + * release_head - release a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * + * This function releases journal head @jhead which was locked by + * the 'make_reservation()' function. It has to be called after each successful + * 'make_reservation()' invocation. + */ +static inline void release_head(struct ubifs_info *c, int jhead) +{ + mutex_unlock(&c->jheads[jhead].wbuf.io_mutex); +} + +/** + * finish_reservation - finish a reservation. + * @c: UBIFS file-system description object + * + * This function finishes journal space reservation. It must be called after + * 'make_reservation()'. + */ +static void finish_reservation(struct ubifs_info *c) +{ + up_read(&c->commit_sem); +} + +/** + * get_dent_type - translate VFS inode mode to UBIFS directory entry type. + * @mode: inode mode + */ +static int get_dent_type(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return UBIFS_ITYPE_REG; + case S_IFDIR: + return UBIFS_ITYPE_DIR; + case S_IFLNK: + return UBIFS_ITYPE_LNK; + case S_IFBLK: + return UBIFS_ITYPE_BLK; + case S_IFCHR: + return UBIFS_ITYPE_CHR; + case S_IFIFO: + return UBIFS_ITYPE_FIFO; + case S_IFSOCK: + return UBIFS_ITYPE_SOCK; + default: + BUG(); + } + return 0; +} + +/** + * pack_inode - pack an inode node. + * @c: UBIFS file-system description object + * @ino: buffer in which to pack inode node + * @inode: inode to pack + * @last: indicates the last node of the group + */ +static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, + const struct inode *inode, int last) +{ + int data_len = 0, last_reference = !inode->i_nlink; + struct ubifs_inode *ui = ubifs_inode(inode); + + ino->ch.node_type = UBIFS_INO_NODE; + ino_key_init_flash(c, &ino->key, inode->i_ino); + ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); + ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); + ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); + ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); + ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ino->uid = cpu_to_le32(i_uid_read(inode)); + ino->gid = cpu_to_le32(i_gid_read(inode)); + ino->mode = cpu_to_le32(inode->i_mode); + ino->flags = cpu_to_le32(ui->flags); + ino->size = cpu_to_le64(ui->ui_size); + ino->nlink = cpu_to_le32(inode->i_nlink); + ino->compr_type = cpu_to_le16(ui->compr_type); + ino->data_len = cpu_to_le32(ui->data_len); + ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt); + ino->xattr_size = cpu_to_le32(ui->xattr_size); + ino->xattr_names = cpu_to_le32(ui->xattr_names); + zero_ino_node_unused(ino); + + /* + * Drop the attached data if this is a deletion inode, the data is not + * needed anymore. + */ + if (!last_reference) { + memcpy(ino->data, ui->data, ui->data_len); + data_len = ui->data_len; + } + + ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last); +} + +/** + * mark_inode_clean - mark UBIFS inode as clean. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to mark as clean + * + * This helper function marks UBIFS inode @ui as clean by cleaning the + * @ui->dirty flag and releasing its budget. Note, VFS may still treat the + * inode as dirty and try to write it back, but 'ubifs_write_inode()' would + * just do nothing. + */ +static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) +{ + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + ui->dirty = 0; +} + +/** + * ubifs_jnl_update - update inode. + * @c: UBIFS file-system description object + * @dir: parent inode or host inode in case of extended attributes + * @nm: directory entry name + * @inode: inode to update + * @deletion: indicates a directory entry deletion i.e unlink or rmdir + * @xent: non-zero if the directory entry is an extended attribute entry + * + * This function updates an inode by writing a directory entry (or extended + * attribute entry), the inode itself, and the parent directory inode (or the + * host inode) to the journal. + * + * The function writes the host inode @dir last, which is important in case of + * extended attributes. Indeed, then we guarantee that if the host inode gets + * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed, + * the extended attribute inode gets flushed too. And this is exactly what the + * user expects - synchronizing the host inode synchronizes its extended + * attributes. Similarly, this guarantees that if @dir is synchronized, its + * directory entry corresponding to @nm gets synchronized too. + * + * If the inode (@inode) or the parent directory (@dir) are synchronous, this + * function synchronizes the write-buffer. + * + * This function marks the @dir and @inode inodes as clean and returns zero on + * success. In case of failure, a negative error code is returned. + */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent) +{ + int err, dlen, ilen, len, lnum, ino_offs, dent_offs; + int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); + int last_reference = !!(deletion && inode->i_nlink == 0); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *host_ui = ubifs_inode(dir); + struct ubifs_dent_node *dent; + struct ubifs_ino_node *ino; + union ubifs_key dent_key, ino_key; + + dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + ilen = UBIFS_INO_NODE_SZ; + + /* + * If the last reference to the inode is being deleted, then there is + * no need to attach and write inode data, it is being deleted anyway. + * And if the inode is being deleted, no need to synchronize + * write-buffer even if the inode is synchronous. + */ + if (!last_reference) { + ilen += ui->data_len; + sync |= IS_SYNC(inode); + } + + aligned_dlen = ALIGN(dlen, 8); + aligned_ilen = ALIGN(ilen, 8); + + len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; + /* Make sure to also account for extended attributes */ + len += host_ui->data_len; + + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + if (!xent) { + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init(c, &dent_key, dir->i_ino, nm); + } else { + dent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &dent_key, dir->i_ino, nm); + } + + key_write(c, &dent_key, dent->key); + dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); + dent->type = get_dent_type(inode->i_mode); + dent->nlen = cpu_to_le16(nm->len); + memcpy(dent->name, nm->name, nm->len); + dent->name[nm->len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen, 0); + + ino = (void *)dent + aligned_dlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + aligned_ilen; + pack_inode(c, ino, dir, 1); + + if (last_reference) { + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino); + } + release_head(c, BASEHD); + kfree(dent); + + if (deletion) { + err = ubifs_tnc_remove_nm(c, &dent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, dlen); + } else + err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm); + if (err) + goto out_ro; + + /* + * Note, we do not remove the inode from TNC even if the last reference + * to it has just been deleted, because the inode may still be opened. + * Instead, the inode has been added to orphan lists and the orphan + * subsystem will take further care about it. + */ + ino_key_init(c, &ino_key, inode->i_ino); + ino_offs = dent_offs + aligned_dlen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen); + if (err) + goto out_ro; + + ino_key_init(c, &ino_key, dir->i_ino); + ino_offs += aligned_ilen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, + UBIFS_INO_NODE_SZ + host_ui->data_len); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + mark_inode_clean(c, host_ui); + return 0; + +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; + +out_release: + release_head(c, BASEHD); + kfree(dent); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_write_data - write a data node to the journal. + * @c: UBIFS file-system description object + * @inode: inode the data node belongs to + * @key: node key + * @buf: buffer to write + * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) + * + * This function writes a data node to the journal. Returns %0 if the data node + * was successfully written, and a negative error code in case of failure. + */ +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len) +{ + struct ubifs_data_node *data; + int err, lnum, offs, compr_type, out_len; + int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", + (unsigned long)key_inum(c, key), key_block(c, key), len); + ubifs_assert(len <= UBIFS_BLOCK_SIZE); + + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); + if (!data) { + /* + * Fall-back to the write reserve buffer. Note, we might be + * currently on the memory reclaim path, when the kernel is + * trying to free some memory by writing out dirty pages. The + * write reserve buffer helps us to guarantee that we are + * always able to write the data. + */ + allocated = 0; + mutex_lock(&c->write_reserve_mutex); + data = c->write_reserve_buf; + } + + data->ch.node_type = UBIFS_DATA_NODE; + key_write(c, key, &data->key); + data->size = cpu_to_le32(len); + zero_data_node_unused(data); + + if (!(ui->flags & UBIFS_COMPR_FL)) + /* Compression is disabled for this inode */ + compr_type = UBIFS_COMPR_NONE; + else + compr_type = ui->compr_type; + + out_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + + dlen = UBIFS_DATA_NODE_SZ + out_len; + data->compr_type = cpu_to_le16(compr_type); + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, DATAHD, dlen); + if (err) + goto out_free; + + err = write_node(c, DATAHD, data, dlen, &lnum, &offs); + if (err) + goto out_release; + ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key)); + release_head(c, DATAHD); + + err = ubifs_tnc_add(c, key, lnum, offs, dlen); + if (err) + goto out_ro; + + finish_reservation(c); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); + return 0; + +out_release: + release_head(c, DATAHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); + return err; +} + +/** + * ubifs_jnl_write_inode - flush inode to the journal. + * @c: UBIFS file-system description object + * @inode: inode to flush + * + * This function writes inode @inode to the journal. If the inode is + * synchronous, it also synchronizes the write-buffer. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err, lnum, offs; + struct ubifs_ino_node *ino; + struct ubifs_inode *ui = ubifs_inode(inode); + int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; + + dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); + + /* + * If the inode is being deleted, do not write the attached data. No + * need to synchronize the write-buffer either. + */ + if (!last_reference) { + len += ui->data_len; + sync = IS_SYNC(inode); + } + ino = kmalloc(len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 1); + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + inode->i_ino); + release_head(c, BASEHD); + + if (last_reference) { + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + goto out_ro; + ubifs_delete_orphan(c, inode->i_ino); + err = ubifs_add_dirt(c, lnum, len); + } else { + union ubifs_key key; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len); + } + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +/** + * ubifs_jnl_delete_inode - delete an inode. + * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. + * + * When regular file inodes are unlinked or a directory inode is removed, the + * 'ubifs_jnl_update()' function writes a corresponding deletion inode and + * direntry to the media, and adds the inode to orphans. After this, when the + * last reference to this inode has been dropped, this function is called. In + * general, it has to write one more deletion inode to the media, because if + * a commit happened between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal + * anymore, and in fact it might not be on the flash anymore, because it might + * have been garbage-collected already. And for optimization reasons UBIFS does + * not read the orphan area if it has been unmounted cleanly, so it would have + * no indication in the journal that there is a deleted inode which has to be + * removed from TNC. + * + * However, if there was no commit between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion + * inode to the media for the second time. And this is quite a typical case. + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(inode->i_nlink == 0); + + if (ui->del_cmtno != c->cmt_no) + /* A commit happened for sure */ + return ubifs_jnl_write_inode(c, inode); + + down_read(&c->commit_sem); + /* + * Check commit number again, because the first test has been done + * without @c->commit_sem, so a commit might have happened. + */ + if (ui->del_cmtno != c->cmt_no) { + up_read(&c->commit_sem); + return ubifs_jnl_write_inode(c, inode); + } + + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + ubifs_ro_mode(c, err); + else + ubifs_delete_orphan(c, inode->i_ino); + up_read(&c->commit_sem); + return err; +} + +/** + * ubifs_jnl_rename - rename a directory entry. + * @c: UBIFS file-system description object + * @old_dir: parent inode of directory entry to rename + * @old_dentry: directory entry to rename + * @new_dir: parent inode of directory entry to rename + * @new_dentry: new directory entry (or directory entry to replace) + * @sync: non-zero if the write-buffer has to be synchronized + * + * This function implements the re-name operation which may involve writing up + * to 3 inodes and 2 directory entries. It marks the written inodes as clean + * and returns zero on success. In case of failure, a negative error code is + * returned. + */ +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync) +{ + void *p; + union ubifs_key key; + struct ubifs_dent_node *dent, *dent2; + int err, dlen1, dlen2, ilen, lnum, offs, len; + const struct inode *old_inode = d_inode(old_dentry); + const struct inode *new_inode = d_inode(new_dentry); + int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; + int last_reference = !!(new_inode && new_inode->i_nlink == 0); + int move = (old_dir != new_dir); + struct ubifs_inode *uninitialized_var(new_ui); + + dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu", + old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino); + ubifs_assert(ubifs_inode(old_dir)->data_len == 0); + ubifs_assert(ubifs_inode(new_dir)->data_len == 0); + ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); + ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex)); + + dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1; + dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1; + if (new_inode) { + new_ui = ubifs_inode(new_inode); + ubifs_assert(mutex_is_locked(&new_ui->ui_mutex)); + ilen = UBIFS_INO_NODE_SZ; + if (!last_reference) + ilen += new_ui->data_len; + } else + ilen = 0; + + aligned_dlen1 = ALIGN(dlen1, 8); + aligned_dlen2 = ALIGN(dlen2, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + if (old_dir != new_dir) + len += plen; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + /* Make new dent */ + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name); + dent->inum = cpu_to_le64(old_inode->i_ino); + dent->type = get_dent_type(old_inode->i_mode); + dent->nlen = cpu_to_le16(new_dentry->d_name.len); + memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len); + dent->name[new_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen1, 0); + + /* Make deletion dent */ + dent2 = (void *)dent + aligned_dlen1; + dent2->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent2->key, old_dir->i_ino, + &old_dentry->d_name); + dent2->inum = 0; + dent2->type = DT_UNKNOWN; + dent2->nlen = cpu_to_le16(old_dentry->d_name.len); + memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); + dent2->name[old_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent2); + ubifs_prep_grp_node(c, dent2, dlen2, 0); + + p = (void *)dent2 + aligned_dlen2; + if (new_inode) { + pack_inode(c, p, new_inode, 0); + p += ALIGN(ilen, 8); + } + + if (!move) + pack_inode(c, p, old_dir, 1); + else { + pack_inode(c, p, old_dir, 0); + p += ALIGN(plen, 8); + pack_inode(c, p, new_dir, 1); + } + + if (last_reference) { + err = ubifs_add_orphan(c, new_inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + new_ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino); + if (new_inode) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + new_inode->i_ino); + } + release_head(c, BASEHD); + + dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, dlen2); + if (err) + goto out_ro; + + dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); + err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + if (err) + goto out_ro; + + offs += aligned_dlen1 + aligned_dlen2; + if (new_inode) { + ino_key_init(c, &key, new_inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, ilen); + if (err) + goto out_ro; + offs += ALIGN(ilen, 8); + } + + ino_key_init(c, &key, old_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + + if (old_dir != new_dir) { + offs += ALIGN(plen, 8); + ino_key_init(c, &key, new_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + } + + finish_reservation(c); + if (new_inode) { + mark_inode_clean(c, new_ui); + spin_lock(&new_ui->ui_lock); + new_ui->synced_i_size = new_ui->ui_size; + spin_unlock(&new_ui->ui_lock); + } + mark_inode_clean(c, ubifs_inode(old_dir)); + if (move) + mark_inode_clean(c, ubifs_inode(new_dir)); + kfree(dent); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, new_inode->i_ino); +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; +} + +/** + * recomp_data_node - re-compress a truncated data node. + * @dn: data node to re-compress + * @new_len: new length + * + * This function is used when an inode is truncated and the last data node of + * the inode has to be re-compressed and re-written. + */ +static int recomp_data_node(const struct ubifs_info *c, + struct ubifs_data_node *dn, int *new_len) +{ + void *buf; + int err, len, compr_type, out_len; + + out_len = le32_to_cpu(dn->size); + buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + compr_type = le16_to_cpu(dn->compr_type); + err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + dn->compr_type = cpu_to_le16(compr_type); + dn->size = cpu_to_le32(*new_len); + *new_len = UBIFS_DATA_NODE_SZ + out_len; +out: + kfree(buf); + return err; +} + +/** + * ubifs_jnl_truncate - update the journal for a truncation. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @old_size: old size + * @new_size: new size + * + * When the size of a file decreases due to truncation, a truncation node is + * written, the journal tree is updated, and the last data block is re-written + * if it has been affected. The inode is also updated in order to synchronize + * the new inode size. + * + * This function marks the inode as clean and returns zero on success. In case + * of failure, a negative error code is returned. + */ +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size) +{ + union ubifs_key key, to_key; + struct ubifs_ino_node *ino; + struct ubifs_trun_node *trun; + struct ubifs_data_node *uninitialized_var(dn); + int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + ino_t inum = inode->i_ino; + unsigned int blk; + + dbg_jnl("ino %lu, size %lld -> %lld", + (unsigned long)inum, old_size, new_size); + ubifs_assert(!ui->data_len); + ubifs_assert(S_ISREG(inode->i_mode)); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + ino = kmalloc(sz, GFP_NOFS); + if (!ino) + return -ENOMEM; + + trun = (void *)ino + UBIFS_INO_NODE_SZ; + trun->ch.node_type = UBIFS_TRUN_NODE; + trun->inum = cpu_to_le32(inum); + trun->old_size = cpu_to_le64(old_size); + trun->new_size = cpu_to_le64(new_size); + zero_trun_node_unused(trun); + + dlen = new_size & (UBIFS_BLOCK_SIZE - 1); + if (dlen) { + /* Get last data block so it can be truncated */ + dn = (void *)trun + UBIFS_TRUN_NODE_SZ; + blk = new_size >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &key, inum, blk); + dbg_jnlk(&key, "last block key "); + err = ubifs_tnc_lookup(c, &key, dn); + if (err == -ENOENT) + dlen = 0; /* Not found (so it is a hole) */ + else if (err) + goto out_free; + else { + if (le32_to_cpu(dn->size) <= dlen) + dlen = 0; /* Nothing to do */ + else { + int compr_type = le16_to_cpu(dn->compr_type); + + if (compr_type != UBIFS_COMPR_NONE) { + err = recomp_data_node(c, dn, &dlen); + if (err) + goto out_free; + } else { + dn->size = cpu_to_le32(dlen); + dlen += UBIFS_DATA_NODE_SZ; + } + zero_data_node_unused(dn); + } + } + } + + /* Must make reservation before allocating sequence numbers */ + len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ; + if (dlen) + len += dlen; + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 0); + ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); + if (dlen) + ubifs_prep_grp_node(c, dn, dlen, 1); + + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum); + release_head(c, BASEHD); + + if (dlen) { + sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ; + err = ubifs_tnc_add(c, &key, lnum, sz, dlen); + if (err) + goto out_ro; + } + + ino_key_init(c, &key, inum); + err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ); + if (err) + goto out_ro; + + bit = new_size & (UBIFS_BLOCK_SIZE - 1); + blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0); + data_key_init(c, &key, inum, blk); + + bit = old_size & (UBIFS_BLOCK_SIZE - 1); + blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1); + data_key_init(c, &to_key, inum, blk); + + err = ubifs_tnc_remove_range(c, &key, &to_key); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + + +/** + * ubifs_jnl_delete_xattr - delete an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @nm: extended attribute entry name + * + * This function delete an extended attribute which is very similar to + * un-linking regular files - it writes a deletion xentry, a deletion inode and + * updates the target inode. Returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm) +{ + int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; + struct ubifs_dent_node *xent; + struct ubifs_ino_node *ino; + union ubifs_key xent_key, key1, key2; + int sync = IS_DIRSYNC(host); + struct ubifs_inode *host_ui = ubifs_inode(host); + + dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", + host->i_ino, inode->i_ino, nm->name, + ubifs_inode(inode)->data_len); + ubifs_assert(inode->i_nlink == 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + /* + * Since we are deleting the inode, we do not bother to attach any data + * to it and assume its length is %UBIFS_INO_NODE_SZ. + */ + xlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + aligned_xlen = ALIGN(xlen, 8); + hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; + len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); + + xent = kmalloc(len, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) { + kfree(xent); + return err; + } + + xent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &xent_key, host->i_ino, nm); + key_write(c, &xent_key, xent->key); + xent->inum = 0; + xent->type = get_dent_type(inode->i_mode); + xent->nlen = cpu_to_le16(nm->len); + memcpy(xent->name, nm->name, nm->len); + xent->name[nm->len] = '\0'; + zero_dent_node_unused(xent); + ubifs_prep_grp_node(c, xent, xlen, 0); + + ino = (void *)xent + aligned_xlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + UBIFS_INO_NODE_SZ; + pack_inode(c, ino, host, 1); + + err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); + if (!sync && !err) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino); + release_head(c, BASEHD); + kfree(xent); + if (err) + goto out_ro; + + /* Remove the extended attribute entry from TNC */ + err = ubifs_tnc_remove_nm(c, &xent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, xlen); + if (err) + goto out_ro; + + /* + * Remove all nodes belonging to the extended attribute inode from TNC. + * Well, there actually must be only one node - the inode itself. + */ + lowest_ino_key(c, &key1, inode->i_ino); + highest_ino_key(c, &key2, inode->i_ino); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + /* And update TNC with the new host inode position */ + ino_key_init(c, &key1, host->i_ino); + err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @inode: extended attribute inode + * @host: host inode + * + * This function writes the updated version of an extended attribute inode and + * the host inode to the journal (to the base head). The host inode is written + * after the extended attribute inode in order to guarantee that the extended + * attribute will be flushed when the inode is synchronized by 'fsync()' and + * consequently, the write-buffer is synchronized. This function returns zero + * in case of success and a negative error code in case of failure. + */ +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, + const struct inode *host) +{ + int err, len1, len2, aligned_len, aligned_len1, lnum, offs; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_ino_node *ino; + union ubifs_key key; + int sync = IS_DIRSYNC(host); + + dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); + ubifs_assert(host->i_nlink > 0); + ubifs_assert(inode->i_nlink > 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + len1 = UBIFS_INO_NODE_SZ + host_ui->data_len; + len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len; + aligned_len1 = ALIGN(len1, 8); + aligned_len = aligned_len1 + ALIGN(len2, 8); + + ino = kmalloc(aligned_len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, aligned_len); + if (err) + goto out_free; + + pack_inode(c, ino, host, 0); + pack_inode(c, (void *)ino + aligned_len1, inode, 1); + + err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); + if (!sync && !err) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + } + release_head(c, BASEHD); + if (err) + goto out_ro; + + ino_key_init(c, &key, host->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len1); + if (err) + goto out_ro; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + kfree(ino); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + diff --git a/kernel/fs/ubifs/key.h b/kernel/fs/ubifs/key.h new file mode 100644 index 000000000..92a8491a8 --- /dev/null +++ b/kernel/fs/ubifs/key.h @@ -0,0 +1,548 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This header contains various key-related definitions and helper function. + * UBIFS allows several key schemes, so we access key fields only via these + * helpers. At the moment only one key scheme is supported. + * + * Simple key scheme + * ~~~~~~~~~~~~~~~~~ + * + * Keys are 64-bits long. First 32-bits are inode number (parent inode number + * in case of direntry key). Next 3 bits are node type. The last 29 bits are + * 4KiB offset in case of inode node, and direntry hash in case of a direntry + * node. We use "r5" hash borrowed from reiserfs. + */ + +#ifndef __UBIFS_KEY_H__ +#define __UBIFS_KEY_H__ + +/** + * key_mask_hash - mask a valid hash value. + * @val: value to be masked + * + * We use hash values as offset in directories, so values %0 and %1 are + * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This + * function makes sure the reserved values are not used. + */ +static inline uint32_t key_mask_hash(uint32_t hash) +{ + hash &= UBIFS_S_KEY_HASH_MASK; + if (unlikely(hash <= 2)) + hash += 3; + return hash; +} + +/** + * key_r5_hash - R5 hash function (borrowed from reiserfs). + * @s: direntry name + * @len: name length + */ +static inline uint32_t key_r5_hash(const char *s, int len) +{ + uint32_t a = 0; + const signed char *str = (const signed char *)s; + + while (*str) { + a += *str << 4; + a += *str >> 4; + a *= 11; + str++; + } + + return key_mask_hash(a); +} + +/** + * key_test_hash - testing hash function. + * @str: direntry name + * @len: name length + */ +static inline uint32_t key_test_hash(const char *str, int len) +{ + uint32_t a = 0; + + len = min_t(uint32_t, len, 4); + memcpy(&a, str, len); + return key_mask_hash(a); +} + +/** + * ino_key_init - initialize inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void ino_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * ino_key_init_flash - initialize on-flash inode key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: inode number + */ +static inline void ino_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum) +{ + union ubifs_key *key = k; + + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_ino_key - get the lowest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void lowest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0; +} + +/** + * highest_ino_key - get the highest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0xffffffff; +} + +/** + * dent_key_init - initialize directory entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_hash - initialize directory entry key without re-calculating + * hash function. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @hash: direntry name hash + */ +static inline void dent_key_init_hash(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + uint32_t hash) +{ + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_flash - initialize on-flash directory entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_dent_key - get the lowest possible directory entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: parent inode number + */ +static inline void lowest_dent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * xent_key_init - initialize extended attribute entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * xent_key_init_flash - initialize on-flash extended attribute entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_xent_key - get the lowest possible extended attribute entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: host inode number + */ +static inline void lowest_xent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * data_key_init - initialize data key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * @block: block number + */ +static inline void data_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + unsigned int block) +{ + ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); + key->u32[0] = inum; + key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS); +} + +/** + * highest_data_key - get the highest possible data key for an inode. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_data_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK); +} + +/** + * trun_key_init - initialize truncation node key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * + * Note, UBIFS does not have truncation keys on the media and this function is + * only used for purposes of replay. + */ +static inline void trun_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * invalid_key_init - initialize invalid node key. + * @c: UBIFS file-system description object + * @key: key to initialize + * + * This is a helper function which marks a @key object as invalid. + */ +static inline void invalid_key_init(const struct ubifs_info *c, + union ubifs_key *key) +{ + key->u32[0] = 0xDEADBEAF; + key->u32[1] = UBIFS_INVALID_KEY; +} + +/** + * key_type - get key type. + * @c: UBIFS file-system description object + * @key: key to get type of + */ +static inline int key_type(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_type_flash - get type of a on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to get type of + */ +static inline int key_type_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_inum - fetch inode number from key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return key->u32[0]; +} + +/** + * key_inum_flash - fetch inode number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[0]); +} + +/** + * key_hash - get directory entry hash. + * @c: UBIFS file-system description object + * @key: the key to get hash from + */ +static inline uint32_t key_hash(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_hash_flash - get directory entry hash from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get hash from + */ +static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_block - get data block number. + * @c: UBIFS file-system description object + * @key: the key to get the block number from + */ +static inline unsigned int key_block(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_block_flash - get data block number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get the block number from + */ +static inline unsigned int key_block_flash(const struct ubifs_info *c, + const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_read - transform a key to in-memory format. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_read(const struct ubifs_info *c, const void *from, + union ubifs_key *to) +{ + const union ubifs_key *f = from; + + to->u32[0] = le32_to_cpu(f->j32[0]); + to->u32[1] = le32_to_cpu(f->j32[1]); +} + +/** + * key_write - transform a key from in-memory format. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); + memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * key_write_idx - transform a key from in-memory format for the index. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write_idx(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); +} + +/** + * key_copy - copy a key. + * @c: UBIFS file-system description object + * @from: the key to copy from + * @to: the key to copy to + */ +static inline void key_copy(const struct ubifs_info *c, + const union ubifs_key *from, union ubifs_key *to) +{ + to->u64[0] = from->u64[0]; +} + +/** + * keys_cmp - compare keys. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %-1 if @key1 is less than + * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2. + */ +static inline int keys_cmp(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] < key2->u32[0]) + return -1; + if (key1->u32[0] > key2->u32[0]) + return 1; + if (key1->u32[1] < key2->u32[1]) + return -1; + if (key1->u32[1] > key2->u32[1]) + return 1; + + return 0; +} + +/** + * keys_eq - determine if keys are equivalent. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and + * %0 if not. + */ +static inline int keys_eq(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] != key2->u32[0]) + return 0; + if (key1->u32[1] != key2->u32[1]) + return 0; + return 1; +} + +/** + * is_hash_key - is a key vulnerable to hash collisions. + * @c: UBIFS file-system description object + * @key: key + * + * This function returns %1 if @key is a hashed key or %0 otherwise. + */ +static inline int is_hash_key(const struct ubifs_info *c, + const union ubifs_key *key) +{ + int type = key_type(c, key); + + return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY; +} + +/** + * key_max_inode_size - get maximum file size allowed by current key format. + * @c: UBIFS file-system description object + */ +static inline unsigned long long key_max_inode_size(const struct ubifs_info *c) +{ + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE; + default: + return 0; + } +} + +#endif /* !__UBIFS_KEY_H__ */ diff --git a/kernel/fs/ubifs/log.c b/kernel/fs/ubifs/log.c new file mode 100644 index 000000000..8c795e639 --- /dev/null +++ b/kernel/fs/ubifs/log.c @@ -0,0 +1,753 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file is a part of UBIFS journal implementation and contains various + * functions which manipulate the log. The log is a fixed area on the flash + * which does not contain any data but refers to buds. The log is a part of the + * journal. + */ + +#include "ubifs.h" + +static int dbg_check_bud_bytes(struct ubifs_info *c); + +/** + * ubifs_search_bud - search bud LEB. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This function searches bud LEB @lnum. Returns bud description object in case + * of success and %NULL if there is no bud with this LEB number. + */ +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + spin_unlock(&c->buds_lock); + return bud; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This functions returns the wbuf for @lnum or %NULL if there is not one. + */ +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + int jhead; + + if (!c->jheads) + return NULL; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + jhead = bud->jhead; + spin_unlock(&c->buds_lock); + return &c->jheads[jhead].wbuf; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * empty_log_bytes - calculate amount of empty space in the log. + * @c: UBIFS file-system description object + */ +static inline long long empty_log_bytes(const struct ubifs_info *c) +{ + long long h, t; + + h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs; + t = (long long)c->ltail_lnum * c->leb_size; + + if (h > t) + return c->log_bytes - h + t; + else if (h != t) + return t - h; + else if (c->lhead_lnum != c->ltail_lnum) + return 0; + else + return c->log_bytes; +} + +/** + * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list. + * @c: UBIFS file-system description object + * @bud: the bud to add + */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct rb_node **p, *parent = NULL; + struct ubifs_bud *b; + struct ubifs_jhead *jhead; + + spin_lock(&c->buds_lock); + p = &c->buds.rb_node; + while (*p) { + parent = *p; + b = rb_entry(parent, struct ubifs_bud, rb); + ubifs_assert(bud->lnum != b->lnum); + if (bud->lnum < b->lnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&bud->rb, parent, p); + rb_insert_color(&bud->rb, &c->buds); + if (c->jheads) { + jhead = &c->jheads[bud->jhead]; + list_add_tail(&bud->list, &jhead->buds_list); + } else + ubifs_assert(c->replaying && c->ro_mount); + + /* + * Note, although this is a new bud, we anyway account this space now, + * before any data has been written to it, because this is about to + * guarantee fixed mount time, and this bud will anyway be read and + * scanned. + */ + c->bud_bytes += c->leb_size - bud->start; + + dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum, + bud->start, dbg_jhead(bud->jhead), c->bud_bytes); + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_add_bud_to_log - add a new bud to the log. + * @c: UBIFS file-system description object + * @jhead: journal head the bud belongs to + * @lnum: LEB number of the bud + * @offs: starting offset of the bud + * + * This function writes reference node for the new bud LEB @lnum it to the log, + * and adds it to the buds tress. It also makes sure that log size does not + * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success, + * %-EAGAIN if commit is required, and a negative error codes in case of + * failure. + */ +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) +{ + int err; + struct ubifs_bud *bud; + struct ubifs_ref_node *ref; + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS); + if (!bud) + return -ENOMEM; + ref = kzalloc(c->ref_node_alsz, GFP_NOFS); + if (!ref) { + kfree(bud); + return -ENOMEM; + } + + mutex_lock(&c->log_mutex); + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) { + err = -EROFS; + goto out_unlock; + } + + /* Make sure we have enough space in the log */ + if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) { + dbg_log("not enough log space - %lld, required %d", + empty_log_bytes(c), c->min_log_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * Make sure the amount of space in buds will not exceed the + * 'c->max_bud_bytes' limit, because we want to guarantee mount time + * limits. + * + * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes + * because we are holding @c->log_mutex. All @c->bud_bytes take place + * when both @c->log_mutex and @c->bud_bytes are locked. + */ + if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) { + dbg_log("bud bytes %lld (%lld max), require commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * If the journal is full enough - start background commit. Note, it is + * OK to read 'c->cmt_state' without spinlock because integer reads + * are atomic in the kernel. + */ + if (c->bud_bytes >= c->bg_bud_bytes && + c->cmt_state == COMMIT_RESTING) { + dbg_log("bud bytes %lld (%lld max), initiate BG commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_request_bg_commit(c); + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(bud->lnum); + ref->offs = cpu_to_le32(bud->start); + ref->jhead = cpu_to_le32(jhead); + + if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + ubifs_assert(c->lhead_lnum != c->ltail_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next log LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out_unlock; + } + + if (bud->start == 0) { + /* + * Before writing the LEB reference which refers an empty LEB + * to the log, we have to make sure it is mapped, because + * otherwise we'd risk to refer an LEB with garbage in case of + * an unclean reboot, because the target LEB might have been + * unmapped, but not yet physically erased. + */ + err = ubifs_leb_map(c, bud->lnum); + if (err) + goto out_unlock; + } + + dbg_log("write ref LEB %d:%d", + c->lhead_lnum, c->lhead_offs); + err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum, + c->lhead_offs); + if (err) + goto out_unlock; + + c->lhead_offs += c->ref_node_alsz; + + ubifs_add_bud(c, bud); + + mutex_unlock(&c->log_mutex); + kfree(ref); + return 0; + +out_unlock: + mutex_unlock(&c->log_mutex); + kfree(ref); + kfree(bud); + return err; +} + +/** + * remove_buds - remove used buds. + * @c: UBIFS file-system description object + * + * This function removes use buds from the buds tree. It does not remove the + * buds which are pointed to by journal heads. + */ +static void remove_buds(struct ubifs_info *c) +{ + struct rb_node *p; + + ubifs_assert(list_empty(&c->old_buds)); + c->cmt_bud_bytes = 0; + spin_lock(&c->buds_lock); + p = rb_first(&c->buds); + while (p) { + struct rb_node *p1 = p; + struct ubifs_bud *bud; + struct ubifs_wbuf *wbuf; + + p = rb_next(p); + bud = rb_entry(p1, struct ubifs_bud, rb); + wbuf = &c->jheads[bud->jhead].wbuf; + + if (wbuf->lnum == bud->lnum) { + /* + * Do not remove buds which are pointed to by journal + * heads (non-closed buds). + */ + c->cmt_bud_bytes += wbuf->offs - bud->start; + dbg_log("preserve %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld", + bud->lnum, bud->start, dbg_jhead(bud->jhead), + wbuf->offs - bud->start, c->cmt_bud_bytes); + bud->start = wbuf->offs; + } else { + c->cmt_bud_bytes += c->leb_size - bud->start; + dbg_log("remove %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld", + bud->lnum, bud->start, dbg_jhead(bud->jhead), + c->leb_size - bud->start, c->cmt_bud_bytes); + rb_erase(p1, &c->buds); + /* + * If the commit does not finish, the recovery will need + * to replay the journal, in which case the old buds + * must be unchanged. Do not release them until post + * commit i.e. do not allow them to be garbage + * collected. + */ + list_move(&bud->list, &c->old_buds); + } + } + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_log_start_commit - start commit. + * @c: UBIFS file-system description object + * @ltail_lnum: return new log tail LEB number + * + * The commit operation starts with writing "commit start" node to the log and + * reference nodes for all journal heads which will define new journal after + * the commit has been finished. The commit start and reference nodes are + * written in one go to the nearest empty log LEB (hence, when commit is + * finished UBIFS may safely unmap all the previous log LEBs). This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) +{ + void *buf; + struct ubifs_cs_node *cs; + struct ubifs_ref_node *ref; + int err, i, max_len, len; + + err = dbg_check_bud_bytes(c); + if (err) + return err; + + max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ; + max_len = ALIGN(max_len, c->min_io_size); + buf = cs = kmalloc(max_len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + cs->cmt_no = cpu_to_le64(c->cmt_no); + ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); + + /* + * Note, we do not lock 'c->log_mutex' because this is the commit start + * phase and we are exclusively using the log. And we do not lock + * write-buffer because nobody can write to the file-system at this + * phase. + */ + + len = UBIFS_CS_NODE_SZ; + for (i = 0; i < c->jhead_cnt; i++) { + int lnum = c->jheads[i].wbuf.lnum; + int offs = c->jheads[i].wbuf.offs; + + if (lnum == -1 || offs == c->leb_size) + continue; + + dbg_log("add ref to LEB %d:%d for jhead %s", + lnum, offs, dbg_jhead(i)); + ref = buf + len; + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(lnum); + ref->offs = cpu_to_le32(offs); + ref->jhead = cpu_to_le32(i); + + ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0); + len += UBIFS_REF_NODE_SZ; + } + + ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len); + + /* Switch to the next log LEB */ + if (c->lhead_offs) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + ubifs_assert(c->lhead_lnum != c->ltail_lnum); + c->lhead_offs = 0; + } + + /* Must ensure next LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out; + + len = ALIGN(len, c->min_io_size); + dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); + err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len); + if (err) + goto out; + + *ltail_lnum = c->lhead_lnum; + + c->lhead_offs += len; + if (c->lhead_offs == c->leb_size) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + remove_buds(c); + + /* + * We have started the commit and now users may use the rest of the log + * for new writes. + */ + c->min_log_bytes = 0; + +out: + kfree(buf); + return err; +} + +/** + * ubifs_log_end_commit - end commit. + * @c: UBIFS file-system description object + * @ltail_lnum: new log tail LEB number + * + * This function is called on when the commit operation was finished. It + * moves log tail to new position and updates the master node so that it stores + * the new log tail LEB number. Returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum) +{ + int err; + + /* + * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS + * writes during commit. Its only short "commit" start phase when + * writers are blocked. + */ + mutex_lock(&c->log_mutex); + + dbg_log("old tail was LEB %d:0, new tail is LEB %d:0", + c->ltail_lnum, ltail_lnum); + + c->ltail_lnum = ltail_lnum; + /* + * The commit is finished and from now on it must be guaranteed that + * there is always enough space for the next commit. + */ + c->min_log_bytes = c->leb_size; + + spin_lock(&c->buds_lock); + c->bud_bytes -= c->cmt_bud_bytes; + spin_unlock(&c->buds_lock); + + err = dbg_check_bud_bytes(c); + if (err) + goto out; + + err = ubifs_write_master(c); + +out: + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * ubifs_log_post_commit - things to do after commit is completed. + * @c: UBIFS file-system description object + * @old_ltail_lnum: old log tail LEB number + * + * Release buds only after commit is completed, because they must be unchanged + * if recovery is needed. + * + * Unmap log LEBs only after commit is completed, because they may be needed for + * recovery. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) +{ + int lnum, err = 0; + + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + err = ubifs_return_leb(c, bud->lnum); + if (err) + return err; + list_del(&bud->list); + kfree(bud); + } + mutex_lock(&c->log_mutex); + for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; + lnum = ubifs_next_log_lnum(c, lnum)) { + dbg_log("unmap log LEB %d", lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + } +out: + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * struct done_ref - references that have been done. + * @rb: rb-tree node + * @lnum: LEB number + */ +struct done_ref { + struct rb_node rb; + int lnum; +}; + +/** + * done_already - determine if a reference has been done already. + * @done_tree: rb-tree to store references that have been done + * @lnum: LEB number of reference + * + * This function returns %1 if the reference has been done, %0 if not, otherwise + * a negative error code is returned. + */ +static int done_already(struct rb_root *done_tree, int lnum) +{ + struct rb_node **p = &done_tree->rb_node, *parent = NULL; + struct done_ref *dr; + + while (*p) { + parent = *p; + dr = rb_entry(parent, struct done_ref, rb); + if (lnum < dr->lnum) + p = &(*p)->rb_left; + else if (lnum > dr->lnum) + p = &(*p)->rb_right; + else + return 1; + } + + dr = kzalloc(sizeof(struct done_ref), GFP_NOFS); + if (!dr) + return -ENOMEM; + + dr->lnum = lnum; + + rb_link_node(&dr->rb, parent, p); + rb_insert_color(&dr->rb, done_tree); + + return 0; +} + +/** + * destroy_done_tree - destroy the done tree. + * @done_tree: done tree to destroy + */ +static void destroy_done_tree(struct rb_root *done_tree) +{ + struct done_ref *dr, *n; + + rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb) + kfree(dr); +} + +/** + * add_node - add a node to the consolidated log. + * @c: UBIFS file-system description object + * @buf: buffer to which to add + * @lnum: LEB number to which to write is passed and returned here + * @offs: offset to where to write is passed and returned here + * @node: node to add + * + * This function returns %0 on success and a negative error code on failure. + */ +static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, + void *node) +{ + struct ubifs_ch *ch = node; + int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs; + + if (len > remains) { + int sz = ALIGN(*offs, c->min_io_size), err; + + ubifs_pad(c, buf + *offs, sz - *offs); + err = ubifs_leb_change(c, *lnum, buf, sz); + if (err) + return err; + *lnum = ubifs_next_log_lnum(c, *lnum); + *offs = 0; + } + memcpy(buf + *offs, node, len); + *offs += ALIGN(len, 8); + return 0; +} + +/** + * ubifs_consolidate_log - consolidate the log. + * @c: UBIFS file-system description object + * + * Repeated failed commits could cause the log to be full, but at least 1 LEB is + * needed for commit. This function rewrites the reference nodes in the log + * omitting duplicates, and failed CS nodes, and leaving no gaps. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_consolidate_log(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct rb_root done_tree = RB_ROOT; + int lnum, err, first = 1, write_lnum, offs = 0; + void *buf; + + dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum, + c->lhead_lnum); + buf = vmalloc(c->leb_size); + if (!buf) + return -ENOMEM; + lnum = c->ltail_lnum; + write_lnum = lnum; + while (1) { + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + goto out_free; + } + list_for_each_entry(snod, &sleb->nodes, list) { + switch (snod->type) { + case UBIFS_REF_NODE: { + struct ubifs_ref_node *ref = snod->node; + int ref_lnum = le32_to_cpu(ref->lnum); + + err = done_already(&done_tree, ref_lnum); + if (err < 0) + goto out_scan; + if (err != 1) { + err = add_node(c, buf, &write_lnum, + &offs, snod->node); + if (err) + goto out_scan; + } + break; + } + case UBIFS_CS_NODE: + if (!first) + break; + err = add_node(c, buf, &write_lnum, &offs, + snod->node); + if (err) + goto out_scan; + first = 0; + break; + } + } + ubifs_scan_destroy(sleb); + if (lnum == c->lhead_lnum) + break; + lnum = ubifs_next_log_lnum(c, lnum); + } + if (offs) { + int sz = ALIGN(offs, c->min_io_size); + + ubifs_pad(c, buf + offs, sz - offs); + err = ubifs_leb_change(c, write_lnum, buf, sz); + if (err) + goto out_free; + offs = ALIGN(offs, c->min_io_size); + } + destroy_done_tree(&done_tree); + vfree(buf); + if (write_lnum == c->lhead_lnum) { + ubifs_err(c, "log is too full"); + return -EINVAL; + } + /* Unmap remaining LEBs */ + lnum = write_lnum; + do { + lnum = ubifs_next_log_lnum(c, lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } while (lnum != c->lhead_lnum); + c->lhead_lnum = write_lnum; + c->lhead_offs = offs; + dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs); + return 0; + +out_scan: + ubifs_scan_destroy(sleb); +out_free: + destroy_done_tree(&done_tree); + vfree(buf); + return err; +} + +/** + * dbg_check_bud_bytes - make sure bud bytes calculation are all right. + * @c: UBIFS file-system description object + * + * This function makes sure the amount of flash space used by closed buds + * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in + * case of failure. + */ +static int dbg_check_bud_bytes(struct ubifs_info *c) +{ + int i, err = 0; + struct ubifs_bud *bud; + long long bud_bytes = 0; + + if (!dbg_is_chk_gen(c)) + return 0; + + spin_lock(&c->buds_lock); + for (i = 0; i < c->jhead_cnt; i++) + list_for_each_entry(bud, &c->jheads[i].buds_list, list) + bud_bytes += c->leb_size - bud->start; + + if (c->bud_bytes != bud_bytes) { + ubifs_err(c, "bad bud_bytes %lld, calculated %lld", + c->bud_bytes, bud_bytes); + err = -EINVAL; + } + spin_unlock(&c->buds_lock); + + return err; +} diff --git a/kernel/fs/ubifs/lprops.c b/kernel/fs/ubifs/lprops.c new file mode 100644 index 000000000..a0011aa3a --- /dev/null +++ b/kernel/fs/ubifs/lprops.c @@ -0,0 +1,1321 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the functions that access LEB properties and their + * categories. LEBs are categorized based on the needs of UBIFS, and the + * categories are stored as either heaps or lists to provide a fast way of + * finding a LEB in a particular category. For example, UBIFS may need to find + * an empty LEB for the journal, or a very dirty LEB for garbage collection. + */ + +#include "ubifs.h" + +/** + * get_heap_comp_val - get the LEB properties value for heap comparisons. + * @lprops: LEB properties + * @cat: LEB category + */ +static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_FREE: + return lprops->free; + case LPROPS_DIRTY_IDX: + return lprops->free + lprops->dirty; + default: + return lprops->dirty; + } +} + +/** + * move_up_lpt_heap - move a new heap entry up as far as possible. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @cat: LEB category + * + * New entries to a heap are added at the bottom and then moved up until the + * parent's value is greater. In the case of LPT's category heaps, the value + * is either the amount of free space or the amount of dirty space, depending + * on the category. + */ +static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int cat) +{ + int val1, val2, hpos; + + hpos = lprops->hpos; + if (!hpos) + return; /* Already top of the heap */ + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater, move up the heap */ + do { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val2 >= val1) + return; + /* Greater than parent so move up */ + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + } while (hpos); +} + +/** + * adjust_lpt_heap - move a changed heap entry up or down the heap. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @hpos: heap position of @lprops + * @cat: LEB category + * + * Changed entries in a heap are moved up or down until the parent's value is + * greater. In the case of LPT's category heaps, the value is either the amount + * of free space or the amount of dirty space, depending on the category. + */ +static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int hpos, int cat) +{ + int val1, val2, val3, cpos; + + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater than parent, move up the heap */ + if (hpos) { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 > val2) { + /* Greater than parent so move up */ + while (1) { + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + if (!hpos) + return; + ppos = (hpos - 1) / 2; + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 <= val2) + return; + /* Still greater than parent so keep going */ + } + } + } + + /* Not greater than parent, so compare to children */ + while (1) { + /* Compare to left child */ + cpos = hpos * 2 + 1; + if (cpos >= heap->cnt) + return; + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val2) { + /* Less than left child, so promote biggest child */ + if (cpos + 1 < heap->cnt) { + val3 = get_heap_comp_val(heap->arr[cpos + 1], + cat); + if (val3 > val2) + cpos += 1; /* Right child is bigger */ + } + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + /* Compare to right child */ + cpos += 1; + if (cpos >= heap->cnt) + return; + val3 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val3) { + /* Less than right child, so promote right child */ + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + return; + } +} + +/** + * add_to_lpt_heap - add LEB properties to a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category + * + * This function returns %1 if @lprops is added to the heap for LEB category + * @cat, otherwise %0 is returned because the heap is full. + */ +static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if (heap->cnt >= heap->max_cnt) { + const int b = LPT_HEAP_SZ / 2 - 1; + int cpos, val1, val2; + + /* Compare to some other LEB on the bottom of heap */ + /* Pick a position kind of randomly */ + cpos = (((size_t)lprops >> 4) & b) + b; + ubifs_assert(cpos >= b); + ubifs_assert(cpos < LPT_HEAP_SZ); + ubifs_assert(cpos < heap->cnt); + + val1 = get_heap_comp_val(lprops, cat); + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 > val2) { + struct ubifs_lprops *lp; + + lp = heap->arr[cpos]; + lp->flags &= ~LPROPS_CAT_MASK; + lp->flags |= LPROPS_UNCAT; + list_add(&lp->list, &c->uncat_list); + lprops->hpos = cpos; + heap->arr[cpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } + dbg_check_heap(c, heap, cat, -1); + return 0; /* Not added to heap */ + } else { + lprops->hpos = heap->cnt++; + heap->arr[lprops->hpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } +} + +/** + * remove_from_lpt_heap - remove LEB properties from a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category + */ +static void remove_from_lpt_heap(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + ubifs_assert(hpos >= 0 && hpos < heap->cnt); + ubifs_assert(heap->arr[hpos] == lprops); + heap->cnt -= 1; + if (hpos < heap->cnt) { + heap->arr[hpos] = heap->arr[heap->cnt]; + heap->arr[hpos]->hpos = hpos; + adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat); + } + dbg_check_heap(c, heap, cat, -1); +} + +/** + * lpt_heap_replace - replace lprops in a category heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * @cat: LEB category + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * the category heaps to those lprops must be updated to point to the new + * lprops. This function does that. + */ +static void lpt_heap_replace(struct ubifs_info *c, + struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = new_lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + heap->arr[hpos] = new_lprops; +} + +/** + * ubifs_add_to_cat - add LEB properties to a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category to which to add + * + * LEB properties are categorized to enable fast find operations. + */ +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + if (add_to_lpt_heap(c, lprops, cat)) + break; + /* No more room on heap so make it un-categorized */ + cat = LPROPS_UNCAT; + /* Fall through */ + case LPROPS_UNCAT: + list_add(&lprops->list, &c->uncat_list); + break; + case LPROPS_EMPTY: + list_add(&lprops->list, &c->empty_list); + break; + case LPROPS_FREEABLE: + list_add(&lprops->list, &c->freeable_list); + c->freeable_cnt += 1; + break; + case LPROPS_FRDI_IDX: + list_add(&lprops->list, &c->frdi_idx_list); + break; + default: + ubifs_assert(0); + } + + lprops->flags &= ~LPROPS_CAT_MASK; + lprops->flags |= cat; + c->in_a_category_cnt += 1; + ubifs_assert(c->in_a_category_cnt <= c->main_lebs); +} + +/** + * ubifs_remove_from_cat - remove LEB properties from a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category from which to remove + * + * LEB properties are categorized to enable fast find operations. + */ +static void ubifs_remove_from_cat(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + remove_from_lpt_heap(c, lprops, cat); + break; + case LPROPS_FREEABLE: + c->freeable_cnt -= 1; + ubifs_assert(c->freeable_cnt >= 0); + /* Fall through */ + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FRDI_IDX: + ubifs_assert(!list_empty(&lprops->list)); + list_del(&lprops->list); + break; + default: + ubifs_assert(0); + } + + c->in_a_category_cnt -= 1; + ubifs_assert(c->in_a_category_cnt >= 0); +} + +/** + * ubifs_replace_cat - replace lprops in a category list or heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops) +{ + int cat; + + cat = new_lprops->flags & LPROPS_CAT_MASK; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + lpt_heap_replace(c, old_lprops, new_lprops, cat); + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_replace(&old_lprops->list, &new_lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_ensure_cat - ensure LEB properties are categorized. + * @c: UBIFS file-system description object + * @lprops: LEB properties + * + * A LEB may have fallen off of the bottom of a heap, and ended up as + * un-categorized even though it has enough space for us now. If that is the + * case this function will put the LEB back onto a heap. + */ +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int cat = lprops->flags & LPROPS_CAT_MASK; + + if (cat != LPROPS_UNCAT) + return; + cat = ubifs_categorize_lprops(c, lprops); + if (cat == LPROPS_UNCAT) + return; + ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT); + ubifs_add_to_cat(c, lprops, cat); +} + +/** + * ubifs_categorize_lprops - categorize LEB properties. + * @c: UBIFS file-system description object + * @lprops: LEB properties to categorize + * + * LEB properties are categorized to enable fast find operations. This function + * returns the LEB category to which the LEB properties belong. Note however + * that if the LEB category is stored as a heap and the heap is full, the + * LEB properties may have their category changed to %LPROPS_UNCAT. + */ +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops) +{ + if (lprops->flags & LPROPS_TAKEN) + return LPROPS_UNCAT; + + if (lprops->free == c->leb_size) { + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return LPROPS_EMPTY; + } + + if (lprops->free + lprops->dirty == c->leb_size) { + if (lprops->flags & LPROPS_INDEX) + return LPROPS_FRDI_IDX; + else + return LPROPS_FREEABLE; + } + + if (lprops->flags & LPROPS_INDEX) { + if (lprops->dirty + lprops->free >= c->min_idx_node_sz) + return LPROPS_DIRTY_IDX; + } else { + if (lprops->dirty >= c->dead_wm && + lprops->dirty > lprops->free) + return LPROPS_DIRTY; + if (lprops->free > 0) + return LPROPS_FREE; + } + + return LPROPS_UNCAT; +} + +/** + * change_category - change LEB properties category. + * @c: UBIFS file-system description object + * @lprops: LEB properties to re-categorize + * + * LEB properties are categorized to enable fast find operations. When the LEB + * properties change they must be re-categorized. + */ +static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int old_cat = lprops->flags & LPROPS_CAT_MASK; + int new_cat = ubifs_categorize_lprops(c, lprops); + + if (old_cat == new_cat) { + struct ubifs_lpt_heap *heap; + + /* lprops on a heap now must be moved up or down */ + if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) + return; /* Not on a heap */ + heap = &c->lpt_heap[new_cat - 1]; + adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat); + } else { + ubifs_remove_from_cat(c, lprops, old_cat); + ubifs_add_to_cat(c, lprops, new_cat); + } +} + +/** + * ubifs_calc_dark - calculate LEB dark space size. + * @c: the UBIFS file-system description object + * @spc: amount of free and dirty space in the LEB + * + * This function calculates and returns amount of dark space in an LEB which + * has @spc bytes of free and dirty space. + * + * UBIFS is trying to account the space which might not be usable, and this + * space is called "dark space". For example, if an LEB has only %512 free + * bytes, it is dark space, because it cannot fit a large data node. + */ +int ubifs_calc_dark(const struct ubifs_info *c, int spc) +{ + ubifs_assert(!(spc & 7)); + + if (spc < c->dark_wm) + return spc; + + /* + * If we have slightly more space then the dark space watermark, we can + * anyway safely assume it we'll be able to write a node of the + * smallest size there. + */ + if (spc - c->dark_wm < MIN_WRITE_SZ) + return spc - MIN_WRITE_SZ; + + return c->dark_wm; +} + +/** + * is_lprops_dirty - determine if LEB properties are dirty. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to test + */ +static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + struct ubifs_pnode *pnode; + int pos; + + pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1); + pnode = (struct ubifs_pnode *)container_of(lprops - pos, + struct ubifs_pnode, + lprops[0]); + return !test_bit(COW_CNODE, &pnode->flags) && + test_bit(DIRTY_CNODE, &pnode->flags); +} + +/** + * ubifs_change_lp - change LEB properties. + * @c: the UBIFS file-system description object + * @lp: LEB properties to change + * @free: new free space amount + * @dirty: new dirty space amount + * @flags: new flags + * @idx_gc_cnt: change to the count of @idx_gc list + * + * This function changes LEB properties (@free, @dirty or @flag). However, the + * property which has the %LPROPS_NC value is not changed. Returns a pointer to + * the updated LEB properties on success and a negative error code on failure. + * + * Note, the LEB properties may have had to be copied (due to COW) and + * consequently the pointer returned may not be the same as the pointer + * passed. + */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt) +{ + /* + * This is the only function that is allowed to change lprops, so we + * discard the "const" qualifier. + */ + struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; + + dbg_lp("LEB %d, free %d, dirty %d, flags %d", + lprops->lnum, free, dirty, flags); + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + ubifs_assert(c->freeable_cnt >= 0); + ubifs_assert(c->freeable_cnt <= c->main_lebs); + ubifs_assert(c->lst.taken_empty_lebs >= 0); + ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs); + ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7)); + ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7)); + ubifs_assert(!(c->lst.total_used & 7)); + ubifs_assert(free == LPROPS_NC || free >= 0); + ubifs_assert(dirty == LPROPS_NC || dirty >= 0); + + if (!is_lprops_dirty(c, lprops)) { + lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum); + if (IS_ERR(lprops)) + return lprops; + } else + ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum)); + + ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); + + spin_lock(&c->space_lock); + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs -= 1; + + if (!(lprops->flags & LPROPS_INDEX)) { + int old_spc; + + old_spc = lprops->free + lprops->dirty; + if (old_spc < c->dead_wm) + c->lst.total_dead -= old_spc; + else + c->lst.total_dark -= ubifs_calc_dark(c, old_spc); + + c->lst.total_used -= c->leb_size - old_spc; + } + + if (free != LPROPS_NC) { + free = ALIGN(free, 8); + c->lst.total_free += free - lprops->free; + + /* Increase or decrease empty LEBs counter if needed */ + if (free == c->leb_size) { + if (lprops->free != c->leb_size) + c->lst.empty_lebs += 1; + } else if (lprops->free == c->leb_size) + c->lst.empty_lebs -= 1; + lprops->free = free; + } + + if (dirty != LPROPS_NC) { + dirty = ALIGN(dirty, 8); + c->lst.total_dirty += dirty - lprops->dirty; + lprops->dirty = dirty; + } + + if (flags != LPROPS_NC) { + /* Take care about indexing LEBs counter if needed */ + if ((lprops->flags & LPROPS_INDEX)) { + if (!(flags & LPROPS_INDEX)) + c->lst.idx_lebs -= 1; + } else if (flags & LPROPS_INDEX) + c->lst.idx_lebs += 1; + lprops->flags = flags; + } + + if (!(lprops->flags & LPROPS_INDEX)) { + int new_spc; + + new_spc = lprops->free + lprops->dirty; + if (new_spc < c->dead_wm) + c->lst.total_dead += new_spc; + else + c->lst.total_dark += ubifs_calc_dark(c, new_spc); + + c->lst.total_used += c->leb_size - new_spc; + } + + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs += 1; + + change_category(c, lprops); + c->idx_gc_cnt += idx_gc_cnt; + spin_unlock(&c->space_lock); + return lprops; +} + +/** + * ubifs_get_lp_stats - get lprops statistics. + * @c: UBIFS file-system description object + * @st: return statistics + */ +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst) +{ + spin_lock(&c->space_lock); + memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_change_one_lp - change LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space + * @flags_set: flags to set + * @flags_clean: flags to clean + * @idx_gc_cnt: change to the count of idx_gc list + * + * This function changes properties of LEB @lnum. It is a helper wrapper over + * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the + * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and + * a negative error code in case of failure. + */ +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + if (err) + ubifs_err(c, "cannot change properties of LEB %d, error %d", + lnum, err); + return err; +} + +/** + * ubifs_update_one_lp - update LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space to add + * @flags_set: flags to set + * @flags_clean: flags to clean + * + * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to + * current dirty space, not substitutes it. + */ +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + if (err) + ubifs_err(c, "cannot update properties of LEB %d, error %d", + lnum, err); + return err; +} + +/** + * ubifs_read_one_lp - read LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to read properties for + * @lp: where to store read properties + * + * This helper function reads properties of a LEB @lnum and stores them in @lp. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) +{ + int err = 0; + const struct ubifs_lprops *lpp; + + ubifs_get_lprops(c); + + lpp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lpp)) { + err = PTR_ERR(lpp); + ubifs_err(c, "cannot read properties of LEB %d, error %d", + lnum, err); + goto out; + } + + memcpy(lp, lpp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fast_find_free - try to find a LEB with free space quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a LEB with free space or %NULL if + * the function is unable to find a LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + heap = &c->lpt_heap[LPROPS_FREE - 1]; + if (heap->cnt == 0) + return NULL; + + lprops = heap->arr[0]; + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_fast_find_empty - try to find an empty LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for an empty LEB or %NULL if the + * function is unable to find an empty LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->empty_list)) + return NULL; + + lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free == c->leb_size); + return lprops; +} + +/** + * ubifs_fast_find_freeable - try to find a freeable LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable LEB or %NULL if the + * function is unable to find a freeable LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->freeable_list)) + return NULL; + + lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(c->freeable_cnt > 0); + return lprops; +} + +/** + * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable index LEB or %NULL if the + * function is unable to find a freeable index LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->frdi_idx_list)) + return NULL; + + lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + return lprops; +} + +/* + * Everything below is related to debugging. + */ + +/** + * dbg_check_cats - check category heaps and lists. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_cats(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct list_head *pos; + int i, cat; + + if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c)) + return 0; + + list_for_each_entry(lprops, &c->empty_list, list) { + if (lprops->free != c->leb_size) { + ubifs_err(c, "non-empty LEB %d on empty list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err(c, "taken LEB %d on empty list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); + return -EINVAL; + } + } + + i = 0; + list_for_each_entry(lprops, &c->freeable_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err(c, "non-freeable LEB %d on freeable list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err(c, "taken LEB %d on freeable list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); + return -EINVAL; + } + i += 1; + } + if (i != c->freeable_cnt) { + ubifs_err(c, "freeable list count %d expected %d", i, + c->freeable_cnt); + return -EINVAL; + } + + i = 0; + list_for_each(pos, &c->idx_gc) + i += 1; + if (i != c->idx_gc_cnt) { + ubifs_err(c, "idx_gc list count %d expected %d", i, + c->idx_gc_cnt); + return -EINVAL; + } + + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err(c, "non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err(c, "taken LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); + return -EINVAL; + } + if (!(lprops->flags & LPROPS_INDEX)) { + ubifs_err(c, "non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); + return -EINVAL; + } + } + + for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (!lprops) { + ubifs_err(c, "null ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->hpos != i) { + ubifs_err(c, "bad ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err(c, "taken LEB in LPT heap cat %d", cat); + return -EINVAL; + } + } + } + + return 0; +} + +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos) +{ + int i = 0, j, err = 0; + + if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c)) + return; + + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + struct ubifs_lprops *lp; + + if (i != add_pos) + if ((lprops->flags & LPROPS_CAT_MASK) != cat) { + err = 1; + goto out; + } + if (lprops->hpos != i) { + err = 2; + goto out; + } + lp = ubifs_lpt_lookup(c, lprops->lnum); + if (IS_ERR(lp)) { + err = 3; + goto out; + } + if (lprops != lp) { + ubifs_err(c, "lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + (size_t)lprops, (size_t)lp, lprops->lnum, + lp->lnum); + err = 4; + goto out; + } + for (j = 0; j < i; j++) { + lp = heap->arr[j]; + if (lp == lprops) { + err = 5; + goto out; + } + if (lp->lnum == lprops->lnum) { + err = 6; + goto out; + } + } + } +out: + if (err) { + ubifs_err(c, "failed cat %d hpos %d err %d", cat, i, err); + dump_stack(); + ubifs_dump_heap(c, heap, cat); + } +} + +/** + * scan_check_cb - scan callback. + * @c: the UBIFS file-system description object + * @lp: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @lst: lprops statistics to update + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_check_cb(struct ubifs_info *c, + const struct ubifs_lprops *lp, int in_tree, + struct ubifs_lp_stats *lst) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; + void *buf = NULL; + + cat = lp->flags & LPROPS_CAT_MASK; + if (cat != LPROPS_UNCAT) { + cat = ubifs_categorize_lprops(c, lp); + if (cat != (lp->flags & LPROPS_CAT_MASK)) { + ubifs_err(c, "bad LEB category %d expected %d", + (lp->flags & LPROPS_CAT_MASK), cat); + return -EINVAL; + } + } + + /* Check lp is on its category list (if it has one) */ + if (in_tree) { + struct list_head *list = NULL; + + switch (cat) { + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + } + if (list) { + struct ubifs_lprops *lprops; + int found = 0; + + list_for_each_entry(lprops, list, list) { + if (lprops == lp) { + found = 1; + break; + } + } + if (!found) { + ubifs_err(c, "bad LPT list (category %d)", cat); + return -EINVAL; + } + } + } + + /* Check lp is on its category heap (if it has one) */ + if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || + lp != heap->arr[lp->hpos]) { + ubifs_err(c, "bad LPT heap (category %d)", cat); + return -EINVAL; + } + } + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage - do not scan them. + */ + if (lp->free == c->leb_size) { + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + ret = PTR_ERR(sleb); + if (ret == -EUCLEAN) { + ubifs_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); + } + goto out; + } + + is_idx = -1; + list_for_each_entry(snod, &sleb->nodes, list) { + int found, level = 0; + + cond_resched(); + + if (is_idx == -1) + is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0; + + if (is_idx && snod->type != UBIFS_IDX_NODE) { + ubifs_err(c, "indexing node in data LEB %d:%d", + lnum, snod->offs); + goto out_destroy; + } + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_idx_node *idx = snod->node; + + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + } + + found = ubifs_tnc_has_node(c, &snod->key, level, lnum, + snod->offs, is_idx); + if (found) { + if (found < 0) + goto out_destroy; + used += ALIGN(snod->len, 8); + } + } + + free = c->leb_size - sleb->endpt; + dirty = sleb->endpt - used; + + if (free > c->leb_size || free < 0 || dirty > c->leb_size || + dirty < 0) { + ubifs_err(c, "bad calculated accounting for LEB %d: free %d, dirty %d", + lnum, free, dirty); + goto out_destroy; + } + + if (lp->free + lp->dirty == c->leb_size && + free + dirty == c->leb_size) + if ((is_idx && !(lp->flags & LPROPS_INDEX)) || + (!is_idx && free == c->leb_size) || + lp->free == c->leb_size) { + /* + * Empty or freeable LEBs could contain index + * nodes from an uncompleted commit due to an + * unclean unmount. Or they could be empty for + * the same reason. Or it may simply not have been + * unmapped. + */ + free = lp->free; + dirty = lp->dirty; + is_idx = 0; + } + + if (is_idx && lp->free + lp->dirty == free + dirty && + lnum != c->ihead_lnum) { + /* + * After an unclean unmount, an index LEB could have a different + * amount of free space than the value recorded by lprops. That + * is because the in-the-gaps method may use free space or + * create free space (as a side-effect of using ubi_leb_change + * and not writing the whole LEB). The incorrect free space + * value is not a problem because the index is only ever + * allocated empty LEBs, so there will never be an attempt to + * write to the free space at the end of an index LEB - except + * by the in-the-gaps method for which it is not a problem. + */ + free = lp->free; + dirty = lp->dirty; + } + + if (lp->free != free || lp->dirty != dirty) + goto out_print; + + if (is_idx && !(lp->flags & LPROPS_INDEX)) { + if (free == c->leb_size) + /* Free but not unmapped LEB, it's fine */ + is_idx = 0; + else { + ubifs_err(c, "indexing node without indexing flag"); + goto out_print; + } + } + + if (!is_idx && (lp->flags & LPROPS_INDEX)) { + ubifs_err(c, "data node with indexing flag"); + goto out_print; + } + + if (free == c->leb_size) + lst->empty_lebs += 1; + + if (is_idx) + lst->idx_lebs += 1; + + if (!(lp->flags & LPROPS_INDEX)) + lst->total_used += c->leb_size - free - dirty; + lst->total_free += free; + lst->total_dirty += dirty; + + if (!(lp->flags & LPROPS_INDEX)) { + int spc = free + dirty; + + if (spc < c->dead_wm) + lst->total_dead += spc; + else + lst->total_dark += ubifs_calc_dark(c, spc); + } + + ubifs_scan_destroy(sleb); + vfree(buf); + return LPT_SCAN_CONTINUE; + +out_print: + ubifs_err(c, "bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d", + lnum, lp->free, lp->dirty, lp->flags, free, dirty); + ubifs_dump_leb(c, lnum); +out_destroy: + ubifs_scan_destroy(sleb); + ret = -EINVAL; +out: + vfree(buf); + return ret; +} + +/** + * dbg_check_lprops - check all LEB properties. + * @c: UBIFS file-system description object + * + * This function checks all LEB properties and makes sure they are all correct. + * It returns zero if everything is fine, %-EINVAL if there is an inconsistency + * and other negative error codes in case of other errors. This function is + * called while the file system is locked (because of commit start), so no + * additional locking is required. Note that locking the LPT mutex would cause + * a circular lock dependency with the TNC mutex. + */ +int dbg_check_lprops(struct ubifs_info *c) +{ + int i, err; + struct ubifs_lp_stats lst; + + if (!dbg_is_chk_lprops(c)) + return 0; + + /* + * As we are going to scan the media, the write buffers have to be + * synchronized. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + memset(&lst, 0, sizeof(struct ubifs_lp_stats)); + err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, + (ubifs_lpt_scan_callback)scan_check_cb, + &lst); + if (err && err != -ENOSPC) + goto out; + + if (lst.empty_lebs != c->lst.empty_lebs || + lst.idx_lebs != c->lst.idx_lebs || + lst.total_free != c->lst.total_free || + lst.total_dirty != c->lst.total_dirty || + lst.total_used != c->lst.total_used) { + ubifs_err(c, "bad overall accounting"); + ubifs_err(c, "calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", + lst.empty_lebs, lst.idx_lebs, lst.total_free, + lst.total_dirty, lst.total_used); + ubifs_err(c, "read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", + c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, + c->lst.total_dirty, c->lst.total_used); + err = -EINVAL; + goto out; + } + + if (lst.total_dead != c->lst.total_dead || + lst.total_dark != c->lst.total_dark) { + ubifs_err(c, "bad dead/dark space accounting"); + ubifs_err(c, "calculated: total_dead %lld, total_dark %lld", + lst.total_dead, lst.total_dark); + ubifs_err(c, "read from lprops: total_dead %lld, total_dark %lld", + c->lst.total_dead, c->lst.total_dark); + err = -EINVAL; + goto out; + } + + err = dbg_check_cats(c); +out: + return err; +} diff --git a/kernel/fs/ubifs/lpt.c b/kernel/fs/ubifs/lpt.c new file mode 100644 index 000000000..dc9f27e9d --- /dev/null +++ b/kernel/fs/ubifs/lpt.c @@ -0,0 +1,2278 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the LEB properties tree (LPT) area. The LPT area + * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and + * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits + * between the log and the orphan area. + * + * The LPT area is like a miniature self-contained file system. It is required + * that it never runs out of space, is fast to access and update, and scales + * logarithmically. The LEB properties tree is implemented as a wandering tree + * much like the TNC, and the LPT area has its own garbage collection. + * + * The LPT has two slightly different forms called the "small model" and the + * "big model". The small model is used when the entire LEB properties table + * can be written into a single eraseblock. In that case, garbage collection + * consists of just writing the whole table, which therefore makes all other + * eraseblocks reusable. In the case of the big model, dirty eraseblocks are + * selected for garbage collection, which consists of marking the clean nodes in + * that LEB as dirty, and then only the dirty nodes are written out. Also, in + * the case of the big model, a table of LEB numbers is saved so that the entire + * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first + * mounted. + */ + +#include "ubifs.h" +#include +#include +#include + +/** + * do_calc_lpt_geom - calculate sizes for the LPT area. + * @c: the UBIFS file-system description object + * + * Calculate the sizes of LPT bit fields, nodes, and tree, based on the + * properties of the flash and whether LPT is "big" (c->big_lpt). + */ +static void do_calc_lpt_geom(struct ubifs_info *c) +{ + int i, n, bits, per_leb_wastage, max_pnode_cnt; + long long sz, tot_wastage; + + n = c->main_lebs + c->max_leb_cnt - c->leb_cnt; + max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT); + + c->lpt_hght = 1; + n = UBIFS_LPT_FANOUT; + while (n < max_pnode_cnt) { + c->lpt_hght += 1; + n <<= UBIFS_LPT_FANOUT_SHIFT; + } + + c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + + n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT); + c->nnode_cnt = n; + for (i = 1; i < c->lpt_hght; i++) { + n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT); + c->nnode_cnt += n; + } + + c->space_bits = fls(c->leb_size) - 3; + c->lpt_lnum_bits = fls(c->lpt_lebs); + c->lpt_offs_bits = fls(c->leb_size - 1); + c->lpt_spc_bits = fls(c->leb_size); + + n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT); + c->pcnt_bits = fls(n - 1); + + c->lnum_bits = fls(c->max_leb_cnt - 1); + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + (c->big_lpt ? c->pcnt_bits : 0) + + (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT; + c->pnode_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + (c->big_lpt ? c->pcnt_bits : 0) + + (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT; + c->nnode_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + c->lpt_lebs * c->lpt_spc_bits * 2; + c->ltab_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + c->lnum_bits * c->lsave_cnt; + c->lsave_sz = (bits + 7) / 8; + + /* Calculate the minimum LPT size */ + c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + c->lpt_sz += c->ltab_sz; + if (c->big_lpt) + c->lpt_sz += c->lsave_sz; + + /* Add wastage */ + sz = c->lpt_sz; + per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz); + sz += per_leb_wastage; + tot_wastage = per_leb_wastage; + while (sz > c->leb_size) { + sz += per_leb_wastage; + sz -= c->leb_size; + tot_wastage += per_leb_wastage; + } + tot_wastage += ALIGN(sz, c->min_io_size) - sz; + c->lpt_sz += tot_wastage; +} + +/** + * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_calc_lpt_geom(struct ubifs_info *c) +{ + int lebs_needed; + long long sz; + + do_calc_lpt_geom(c); + + /* Verify that lpt_lebs is big enough */ + sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); + if (lebs_needed > c->lpt_lebs) { + ubifs_err(c, "too few LPT LEBs"); + return -EINVAL; + } + + /* Verify that ltab fits in a single LEB (since ltab is a single node */ + if (c->ltab_sz > c->leb_size) { + ubifs_err(c, "LPT ltab too big"); + return -EINVAL; + } + + c->check_lpt_free = c->big_lpt; + return 0; +} + +/** + * calc_dflt_lpt_geom - calculate default LPT geometry. + * @c: the UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @big_lpt: whether the LPT area is "big" is returned here + * + * The size of the LPT area depends on parameters that themselves are dependent + * on the size of the LPT area. This function, successively recalculates the LPT + * area geometry until the parameters and resultant geometry are consistent. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, + int *big_lpt) +{ + int i, lebs_needed; + long long sz; + + /* Start by assuming the minimum number of LPT LEBs */ + c->lpt_lebs = UBIFS_MIN_LPT_LEBS; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + + /* And assume we will use the small LPT model */ + c->big_lpt = 0; + + /* + * Calculate the geometry based on assumptions above and then see if it + * makes sense + */ + do_calc_lpt_geom(c); + + /* Small LPT model must have lpt_sz < leb_size */ + if (c->lpt_sz > c->leb_size) { + /* Nope, so try again using big LPT model */ + c->big_lpt = 1; + do_calc_lpt_geom(c); + } + + /* Now check there are enough LPT LEBs */ + for (i = 0; i < 64 ; i++) { + sz = c->lpt_sz * 4; /* Allow 4 times the size */ + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); + if (lebs_needed > c->lpt_lebs) { + /* Not enough LPT LEBs so try again with more */ + c->lpt_lebs = lebs_needed; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + do_calc_lpt_geom(c); + continue; + } + if (c->ltab_sz > c->leb_size) { + ubifs_err(c, "LPT ltab too big"); + return -EINVAL; + } + *main_lebs = c->main_lebs; + *big_lpt = c->big_lpt; + return 0; + } + return -EINVAL; +} + +/** + * pack_bits - pack bit fields end-to-end. + * @addr: address at which to pack (passed and next address returned) + * @pos: bit position at which to pack (passed and next position returned) + * @val: value to pack + * @nrbits: number of bits of value to pack (1-32) + */ +static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits) +{ + uint8_t *p = *addr; + int b = *pos; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + ubifs_assert((val >> nrbits) == 0 || nrbits == 32); + if (b) { + *p |= ((uint8_t)val) << b; + nrbits += b; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= (8 - b)); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 32) + *++p = (uint8_t)(val >>= 8); + } + } + } + } else { + *p = (uint8_t)val; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) + *++p = (uint8_t)(val >>= 8); + } + } + } + b = nrbits & 7; + if (b == 0) + p++; + *addr = p; + *pos = b; +} + +/** + * ubifs_unpack_bits - unpack bit fields. + * @addr: address at which to unpack (passed and next address returned) + * @pos: bit position at which to unpack (passed and next position returned) + * @nrbits: number of bits of value to unpack (1-32) + * + * This functions returns the value unpacked. + */ +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) +{ + const int k = 32 - nrbits; + uint8_t *p = *addr; + int b = *pos; + uint32_t uninitialized_var(val); + const int bytes = (nrbits + b + 7) >> 3; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + if (b) { + switch (bytes) { + case 2: + val = p[1]; + break; + case 3: + val = p[1] | ((uint32_t)p[2] << 8); + break; + case 4: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16); + break; + case 5: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + } + val <<= (8 - b); + val |= *p >> b; + nrbits += b; + } else { + switch (bytes) { + case 1: + val = p[0]; + break; + case 2: + val = p[0] | ((uint32_t)p[1] << 8); + break; + case 3: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16); + break; + case 4: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + break; + } + } + val <<= k; + val >>= k; + b = nrbits & 7; + p += nrbits >> 3; + *addr = p; + *pos = b; + ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); + return val; +} + +/** + * ubifs_pack_pnode - pack all the bit fields of a pnode. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @pnode: pnode to pack + */ +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, pnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + pack_bits(&addr, &pos, pnode->lprops[i].free >> 3, + c->space_bits); + pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3, + c->space_bits); + if (pnode->lprops[i].flags & LPROPS_INDEX) + pack_bits(&addr, &pos, 1, 1); + else + pack_bits(&addr, &pos, 0, 1); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->pnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_nnode - pack all the bit fields of a nnode. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @nnode: nnode to pack + */ +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, nnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + + if (lnum == 0) + lnum = c->lpt_last + 1; + pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits); + pack_bits(&addr, &pos, nnode->nbranch[i].offs, + c->lpt_offs_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->nnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_ltab - pack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @ltab: LPT's own lprops table to pack + */ +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lpt_lebs; i++) { + pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits); + pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->ltab_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_lsave - pack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @lsave: LPT's save table to pack + */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lsave_cnt; i++) + pack_bits(&addr, &pos, lsave[i], c->lnum_bits); + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->lsave_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number to which to add dirty space + * @dirty: amount of dirty space to add + */ +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + if (!dirty || !lnum) + return; + dbg_lp("LEB %d add %d to %d", + lnum, dirty, c->ltab[lnum - c->lpt_first].dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * set_ltab - set LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space + */ +static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d %d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty = dirty; +} + +/** + * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @nnode: nnode for which to add dirt + */ +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *np = nnode->parent; + + if (np) + ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum, + c->nnode_sz); + else { + ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz); + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + } +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * calc_nnode_num - calculate nnode number. + * @row: the row in the tree (root is zero) + * @col: the column in the row (leftmost is zero) + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number for the nnode at @row + * and @col. + */ +static int calc_nnode_num(int row, int col) +{ + int num, bits; + + num = 1; + while (row--) { + bits = (col & (UBIFS_LPT_FANOUT - 1)); + col >>= UBIFS_LPT_FANOUT_SHIFT; + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= bits; + } + return num; +} + +/** + * calc_nnode_num_from_parent - calculate nnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_nnode_num_from_parent(const struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int num, shft; + + if (!parent) + return 1; + shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT; + num = parent->num ^ (1 << shft); + num |= (UBIFS_LPT_FANOUT + iip) << shft; + return num; +} + +/** + * calc_pnode_num_from_parent - calculate pnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The pnode number is a number that uniquely identifies a pnode and can be used + * easily to traverse the tree from the root to that pnode. + * + * This function calculates and returns the pnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_pnode_num_from_parent(const struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; + + for (i = 0; i < n; i++) { + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= pnum & (UBIFS_LPT_FANOUT - 1); + pnum >>= UBIFS_LPT_FANOUT_SHIFT; + } + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= iip; + return num; +} + +/** + * ubifs_create_dflt_lpt - create default LPT. + * @c: UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @lpt_first: LEB number of first LPT LEB + * @lpt_lebs: number of LEBs for LPT is passed and returned here + * @big_lpt: use big LPT model is passed and returned here + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt) +{ + int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row; + int blnum, boffs, bsz, bcnt; + struct ubifs_pnode *pnode = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = NULL, *p; + struct ubifs_lpt_lprops *ltab = NULL; + int *lsave = NULL; + + err = calc_dflt_lpt_geom(c, main_lebs, big_lpt); + if (err) + return err; + *lpt_lebs = c->lpt_lebs; + + /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */ + c->lpt_first = lpt_first; + /* Needed by 'set_ltab()' */ + c->lpt_last = lpt_first + c->lpt_lebs - 1; + /* Needed by 'ubifs_pack_lsave()' */ + c->main_first = c->leb_cnt - *main_lebs; + + lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL); + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL); + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL); + buf = vmalloc(c->leb_size); + ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!pnode || !nnode || !buf || !ltab || !lsave) { + err = -ENOMEM; + goto out; + } + + ubifs_assert(!c->ltab); + c->ltab = ltab; /* Needed by set_ltab */ + + /* Initialize LPT's own lprops */ + for (i = 0; i < c->lpt_lebs; i++) { + ltab[i].free = c->leb_size; + ltab[i].dirty = 0; + ltab[i].tgc = 0; + ltab[i].cmt = 0; + } + + lnum = lpt_first; + p = buf; + /* Number of leaf nodes (pnodes) */ + cnt = c->pnode_cnt; + + /* + * The first pnode contains the LEB properties for the LEBs that contain + * the root inode node and the root index node of the index tree. + */ + node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8); + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[0].free = c->leb_size - iopos; + pnode->lprops[0].dirty = iopos - node_sz; + pnode->lprops[0].flags = LPROPS_INDEX; + + node_sz = UBIFS_INO_NODE_SZ; + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[1].free = c->leb_size - iopos; + pnode->lprops[1].dirty = iopos - node_sz; + + for (i = 2; i < UBIFS_LPT_FANOUT; i++) + pnode->lprops[i].free = c->leb_size; + + /* Add first pnode */ + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len = c->pnode_sz; + pnode->num += 1; + + /* Reset pnode values for remaining pnodes */ + pnode->lprops[0].free = c->leb_size; + pnode->lprops[0].dirty = 0; + pnode->lprops[0].flags = 0; + + pnode->lprops[1].free = c->leb_size; + pnode->lprops[1].dirty = 0; + + /* + * To calculate the internal node branches, we keep information about + * the level below. + */ + blnum = lnum; /* LEB number of level below */ + boffs = 0; /* Offset of level below */ + bcnt = cnt; /* Number of nodes in level below */ + bsz = c->pnode_sz; /* Size of nodes in level below */ + + /* Add all remaining pnodes */ + for (i = 1; i < cnt; i++) { + if (len + c->pnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubifs_leb_change(c, lnum++, buf, alen); + if (err) + goto out; + p = buf; + len = 0; + } + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len += c->pnode_sz; + /* + * pnodes are simply numbered left to right starting at zero, + * which means the pnode number can be used easily to traverse + * down the tree to the corresponding pnode. + */ + pnode->num += 1; + } + + row = 0; + for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT) + row += 1; + /* Add all nnodes, one level at a time */ + while (1) { + /* Number of internal nodes (nnodes) at next level */ + cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + if (len + c->nnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, + alen - len); + memset(p, 0xff, alen - len); + err = ubifs_leb_change(c, lnum++, buf, alen); + if (err) + goto out; + p = buf; + len = 0; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) { + c->lpt_lnum = lnum; + c->lpt_offs = len; + } + /* Set branches to the level below */ + for (j = 0; j < UBIFS_LPT_FANOUT; j++) { + if (bcnt) { + if (boffs + bsz > c->leb_size) { + blnum += 1; + boffs = 0; + } + nnode->nbranch[j].lnum = blnum; + nnode->nbranch[j].offs = boffs; + boffs += bsz; + bcnt--; + } else { + nnode->nbranch[j].lnum = 0; + nnode->nbranch[j].offs = 0; + } + } + nnode->num = calc_nnode_num(row, i); + ubifs_pack_nnode(c, p, nnode); + p += c->nnode_sz; + len += c->nnode_sz; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) + break; + /* Update the information about the level below */ + bcnt = cnt; + bsz = c->nnode_sz; + row -= 1; + } + + if (*big_lpt) { + /* Need to add LPT's save table */ + if (len + c->lsave_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubifs_leb_change(c, lnum++, buf, alen); + if (err) + goto out; + p = buf; + len = 0; + } + + c->lsave_lnum = lnum; + c->lsave_offs = len; + + for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++) + lsave[i] = c->main_first + i; + for (; i < c->lsave_cnt; i++) + lsave[i] = c->main_first; + + ubifs_pack_lsave(c, p, lsave); + p += c->lsave_sz; + len += c->lsave_sz; + } + + /* Need to add LPT's own LEB properties table */ + if (len + c->ltab_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubifs_leb_change(c, lnum++, buf, alen); + if (err) + goto out; + p = buf; + len = 0; + } + + c->ltab_lnum = lnum; + c->ltab_offs = len; + + /* Update ltab before packing it */ + len += c->ltab_sz; + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + + ubifs_pack_ltab(c, p, ltab); + p += c->ltab_sz; + + /* Write remaining buffer */ + memset(p, 0xff, alen - len); + err = ubifs_leb_change(c, lnum, buf, alen); + if (err) + goto out; + + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(len, c->min_io_size); + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); +out: + c->ltab = NULL; + kfree(lsave); + vfree(ltab); + vfree(buf); + kfree(nnode); + kfree(pnode); + return err; +} + +/** + * update_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * When a pnode is loaded into memory, the LEB properties it contains are added, + * by this function, to the LEB category lists and heaps. + */ +static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK; + int lnum = pnode->lprops[i].lnum; + + if (!lnum) + return; + ubifs_add_to_cat(c, &pnode->lprops[i], cat); + } +} + +/** + * replace_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @old_pnode: pnode copied + * @new_pnode: pnode copy + * + * During commit it is sometimes necessary to copy a pnode + * (see dirty_cow_pnode). When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode, + struct ubifs_pnode *new_pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (!new_pnode->lprops[i].lnum) + return; + ubifs_replace_cat(c, &old_pnode->lprops[i], + &new_pnode->lprops[i]); + } +} + +/** + * check_lpt_crc - check LPT node crc is correct. + * @c: UBIFS file-system description object + * @buf: buffer containing node + * @len: length of node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_crc(const struct ubifs_info *c, void *buf, int len) +{ + int pos = 0; + uint8_t *addr = buf; + uint16_t crc, calc_crc; + + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) { + ubifs_err(c, "invalid crc in LPT node: crc %hx calc %hx", + crc, calc_crc); + dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * check_lpt_type - check LPT node type is correct. + * @c: UBIFS file-system description object + * @addr: address of type bit field is passed and returned updated here + * @pos: position of type bit field is passed and returned updated here + * @type: expected type + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_type(const struct ubifs_info *c, uint8_t **addr, + int *pos, int type) +{ + int node_type; + + node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS); + if (node_type != type) { + ubifs_err(c, "invalid type (%d) in LPT node type %d", + node_type, type); + dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * unpack_pnode - unpack a pnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed pnode to unpack + * @pnode: pnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_pnode(const struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_PNODE); + if (err) + return err; + if (c->big_lpt) + pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->free <<= 3; + lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->dirty <<= 3; + + if (ubifs_unpack_bits(&addr, &pos, 1)) + lprops->flags = LPROPS_INDEX; + else + lprops->flags = 0; + lprops->flags |= ubifs_categorize_lprops(c, lprops); + } + err = check_lpt_crc(c, buf, c->pnode_sz); + return err; +} + +/** + * ubifs_unpack_nnode - unpack a nnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed nnode to unpack + * @nnode: nnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_NNODE); + if (err) + return err; + if (c->big_lpt) + nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum; + + lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) + + c->lpt_first; + if (lnum == c->lpt_last + 1) + lnum = 0; + nnode->nbranch[i].lnum = lnum; + nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos, + c->lpt_offs_bits); + } + err = check_lpt_crc(c, buf, c->nnode_sz); + return err; +} + +/** + * unpack_ltab - unpack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_ltab(const struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_LTAB); + if (err) + return err; + for (i = 0; i < c->lpt_lebs; i++) { + int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + + if (free < 0 || free > c->leb_size || dirty < 0 || + dirty > c->leb_size || free + dirty > c->leb_size) + return -EINVAL; + + c->ltab[i].free = free; + c->ltab[i].dirty = dirty; + c->ltab[i].tgc = 0; + c->ltab[i].cmt = 0; + } + err = check_lpt_crc(c, buf, c->ltab_sz); + return err; +} + +/** + * unpack_lsave - unpack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_lsave(const struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_LSAVE); + if (err) + return err; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits); + + if (lnum < c->main_first || lnum >= c->leb_cnt) + return -EINVAL; + c->lsave[i] = lnum; + } + err = check_lpt_crc(c, buf, c->lsave_sz); + return err; +} + +/** + * validate_nnode - validate a nnode. + * @c: UBIFS file-system description object + * @nnode: nnode to validate + * @parent: parent nnode (or NULL for the root nnode) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode, + struct ubifs_nnode *parent, int iip) +{ + int i, lvl, max_offs; + + if (c->big_lpt) { + int num = calc_nnode_num_from_parent(c, parent, iip); + + if (nnode->num != num) + return -EINVAL; + } + lvl = parent ? parent->level - 1 : c->lpt_hght; + if (lvl < 1) + return -EINVAL; + if (lvl == 1) + max_offs = c->leb_size - c->pnode_sz; + else + max_offs = c->leb_size - c->nnode_sz; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + int offs = nnode->nbranch[i].offs; + + if (lnum == 0) { + if (offs != 0) + return -EINVAL; + continue; + } + if (lnum < c->lpt_first || lnum > c->lpt_last) + return -EINVAL; + if (offs < 0 || offs > max_offs) + return -EINVAL; + } + return 0; +} + +/** + * validate_pnode - validate a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to validate + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + if (c->big_lpt) { + int num = calc_pnode_num_from_parent(c, parent, iip); + + if (pnode->num != num) + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int free = pnode->lprops[i].free; + int dirty = pnode->lprops[i].dirty; + + if (free < 0 || free > c->leb_size || free % c->min_io_size || + (free & 7)) + return -EINVAL; + if (dirty < 0 || dirty > c->leb_size || (dirty & 7)) + return -EINVAL; + if (dirty + free > c->leb_size) + return -EINVAL; + } + return 0; +} + +/** + * set_pnode_lnum - set LEB numbers on a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to update + * + * This function calculates the LEB numbers for the LEB properties it contains + * based on the pnode number. + */ +static void set_pnode_lnum(const struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + int i, lnum; + + lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (lnum >= c->leb_cnt) + return; + pnode->lprops[i].lnum = lnum++; + } +} + +/** + * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + if (parent) { + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + } else { + lnum = c->lpt_lnum; + offs = c->lpt_offs; + } + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + if (lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. + */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubifs_leb_read(c, lnum, buf, offs, c->nnode_sz, 1); + if (err) + goto out; + err = ubifs_unpack_nnode(c, buf, nnode); + if (err) + goto out; + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + if (parent) { + branch->nnode = nnode; + nnode->level = parent->level - 1; + } else { + c->nroot = nnode; + nnode->level = c->lpt_hght; + } + nnode->parent = parent; + nnode->iip = iip; + return 0; + +out: + ubifs_err(c, "error %d reading nnode at %d:%d", err, lnum, offs); + dump_stack(); + kfree(nnode); + return err; +} + +/** + * read_pnode - read a pnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (!pnode) + return -ENOMEM; + + if (lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + err = ubifs_leb_read(c, lnum, buf, offs, c->pnode_sz, 1); + if (err) + goto out; + err = unpack_pnode(c, buf, pnode); + if (err) + goto out; + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + branch->pnode = pnode; + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + c->pnodes_have += 1; + return 0; + +out: + ubifs_err(c, "error %d reading pnode at %d:%d", err, lnum, offs); + ubifs_dump_pnode(c, pnode, parent, iip); + dump_stack(); + ubifs_err(c, "calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); + kfree(pnode); + return err; +} + +/** + * read_ltab - read LPT's own lprops table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_ltab(struct ubifs_info *c) +{ + int err; + void *buf; + + buf = vmalloc(c->ltab_sz); + if (!buf) + return -ENOMEM; + err = ubifs_leb_read(c, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz, 1); + if (err) + goto out; + err = unpack_ltab(c, buf); +out: + vfree(buf); + return err; +} + +/** + * read_lsave - read LPT's save table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_lsave(struct ubifs_info *c) +{ + int err, i; + void *buf; + + buf = vmalloc(c->lsave_sz); + if (!buf) + return -ENOMEM; + err = ubifs_leb_read(c, c->lsave_lnum, buf, c->lsave_offs, + c->lsave_sz, 1); + if (err) + goto out; + err = unpack_lsave(c, buf); + if (err) + goto out; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = c->lsave[i]; + struct ubifs_lprops *lprops; + + /* + * Due to automatic resizing, the values in the lsave table + * could be beyond the volume size - just ignore them. + */ + if (lnum >= c->leb_cnt) + continue; + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + } +out: + vfree(buf); + return err; +} + +/** + * ubifs_get_nnode - get a nnode. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) + return nnode; + err = ubifs_read_nnode(c, parent, iip); + if (err) + return ERR_PTR(err); + return branch->nnode; +} + +/** + * ubifs_get_pnode - get a pnode. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) + return pnode; + err = read_pnode(c, parent, iip); + if (err) + return ERR_PTR(err); + update_cats(c, branch->pnode); + return branch->pnode; +} + +/** + * ubifs_lpt_lookup - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. + */ +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + return &pnode->lprops[iip]; +} + +/** + * dirty_cow_nnode - ensure a nnode is not being committed. + * @c: UBIFS file-system description object + * @nnode: nnode to check + * + * Returns dirtied nnode on success or negative error code on failure. + */ +static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *n; + int i; + + if (!test_bit(COW_CNODE, &nnode->flags)) { + /* nnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + } + return nnode; + } + + /* nnode is being committed, so copy it */ + n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (unlikely(!n)) + return ERR_PTR(-ENOMEM); + + memcpy(n, nnode, sizeof(struct ubifs_nnode)); + n->cnext = NULL; + __set_bit(DIRTY_CNODE, &n->flags); + __clear_bit(COW_CNODE, &n->flags); + + /* The children now have new parent */ + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_nbranch *branch = &n->nbranch[i]; + + if (branch->cnode) + branch->cnode->parent = n; + } + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags)); + __set_bit(OBSOLETE_CNODE, &nnode->flags); + + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + if (nnode->parent) + nnode->parent->nbranch[n->iip].nnode = n; + else + c->nroot = n; + return n; +} + +/** + * dirty_cow_pnode - ensure a pnode is not being committed. + * @c: UBIFS file-system description object + * @pnode: pnode to check + * + * Returns dirtied pnode on success or negative error code on failure. + */ +static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_pnode *p; + + if (!test_bit(COW_CNODE, &pnode->flags)) { + /* pnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + } + return pnode; + } + + /* pnode is being committed, so copy it */ + p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (unlikely(!p)) + return ERR_PTR(-ENOMEM); + + memcpy(p, pnode, sizeof(struct ubifs_pnode)); + p->cnext = NULL; + __set_bit(DIRTY_CNODE, &p->flags); + __clear_bit(COW_CNODE, &p->flags); + replace_cats(c, pnode, p); + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags)); + __set_bit(OBSOLETE_CNODE, &pnode->flags); + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + pnode->parent->nbranch[p->iip].pnode = p; + return p; +} + +/** + * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. + */ +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + pnode = dirty_cow_pnode(c, pnode); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags)); + return &pnode->lprops[iip]; +} + +/** + * lpt_init_rd - initialize the LPT for reading. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_init_rd(struct ubifs_info *c) +{ + int err, i; + + c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab) + return -ENOMEM; + + i = max_t(int, c->nnode_sz, c->pnode_sz); + c->lpt_nod_buf = kmalloc(i, GFP_KERNEL); + if (!c->lpt_nod_buf) + return -ENOMEM; + + for (i = 0; i < LPROPS_HEAP_CNT; i++) { + c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, + GFP_KERNEL); + if (!c->lpt_heap[i].arr) + return -ENOMEM; + c->lpt_heap[i].cnt = 0; + c->lpt_heap[i].max_cnt = LPT_HEAP_SZ; + } + + c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL); + if (!c->dirty_idx.arr) + return -ENOMEM; + c->dirty_idx.cnt = 0; + c->dirty_idx.max_cnt = LPT_HEAP_SZ; + + err = read_ltab(c); + if (err) + return err; + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + + return 0; +} + +/** + * lpt_init_wr - initialize the LPT for writing. + * @c: UBIFS file-system description object + * + * 'lpt_init_rd()' must have been called already. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_init_wr(struct ubifs_info *c) +{ + int err, i; + + c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab_cmt) + return -ENOMEM; + + c->lpt_buf = vmalloc(c->leb_size); + if (!c->lpt_buf) + return -ENOMEM; + + if (c->big_lpt) { + c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS); + if (!c->lsave) + return -ENOMEM; + err = read_lsave(c); + if (err) + return err; + } + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].free == c->leb_size) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + } + + return 0; +} + +/** + * ubifs_lpt_init - initialize the LPT. + * @c: UBIFS file-system description object + * @rd: whether to initialize lpt for reading + * @wr: whether to initialize lpt for writing + * + * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true + * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is + * true. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr) +{ + int err; + + if (rd) { + err = lpt_init_rd(c); + if (err) + goto out_err; + } + + if (wr) { + err = lpt_init_wr(c); + if (err) + goto out_err; + } + + return 0; + +out_err: + if (wr) + ubifs_lpt_free(c, 1); + if (rd) + ubifs_lpt_free(c, 0); + return err; +} + +/** + * struct lpt_scan_node - somewhere to put nodes while we scan LPT. + * @nnode: where to keep a nnode + * @pnode: where to keep a pnode + * @cnode: where to keep a cnode + * @in_tree: is the node in the tree in memory + * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in + * the tree + * @ptr.pnode: ditto for pnode + * @ptr.cnode: ditto for cnode + */ +struct lpt_scan_node { + union { + struct ubifs_nnode nnode; + struct ubifs_pnode pnode; + struct ubifs_cnode cnode; + }; + int in_tree; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + } ptr; +}; + +/** + * scan_get_nnode - for the scan, get a nnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the nnode + * @parent: parent of the nnode + * @iip: index in parent of the nnode + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) { + path->in_tree = 1; + path->ptr.nnode = nnode; + return nnode; + } + nnode = &path->nnode; + path->in_tree = 0; + path->ptr.nnode = nnode; + memset(nnode, 0, sizeof(struct ubifs_nnode)); + if (branch->lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. + */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubifs_leb_read(c, branch->lnum, buf, branch->offs, + c->nnode_sz, 1); + if (err) + return ERR_PTR(err); + err = ubifs_unpack_nnode(c, buf, nnode); + if (err) + return ERR_PTR(err); + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + nnode->level = parent->level - 1; + nnode->parent = parent; + nnode->iip = iip; + return nnode; +} + +/** + * scan_get_pnode - for the scan, get a pnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the pnode + * @parent: parent of the pnode + * @iip: index in parent of the pnode + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) { + path->in_tree = 1; + path->ptr.pnode = pnode; + return pnode; + } + pnode = &path->pnode; + path->in_tree = 0; + path->ptr.pnode = pnode; + memset(pnode, 0, sizeof(struct ubifs_pnode)); + if (branch->lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + ubifs_assert(branch->lnum >= c->lpt_first && + branch->lnum <= c->lpt_last); + ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size); + err = ubifs_leb_read(c, branch->lnum, buf, branch->offs, + c->pnode_sz, 1); + if (err) + return ERR_PTR(err); + err = unpack_pnode(c, buf, pnode); + if (err) + return ERR_PTR(err); + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + return pnode; +} + +/** + * ubifs_lpt_scan_nolock - scan the LPT. + * @c: the UBIFS file-system description object + * @start_lnum: LEB number from which to start scanning + * @end_lnum: LEB number at which to stop scanning + * @scan_cb: callback function called for each lprops + * @data: data to be passed to the callback function + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data) +{ + int err = 0, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct lpt_scan_node *path; + + if (start_lnum == -1) { + start_lnum = end_lnum + 1; + if (start_lnum >= c->leb_cnt) + start_lnum = c->main_first; + } + + ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt); + ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt); + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return err; + } + + path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1), + GFP_NOFS); + if (!path) + return -ENOMEM; + + path[0].ptr.nnode = c->nroot; + path[0].in_tree = 1; +again: + /* Descend to the pnode containing start_lnum */ + nnode = c->nroot; + i = start_lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = (i & (UBIFS_LPT_FANOUT - 1)); + + /* Loop for each lprops */ + while (1) { + struct ubifs_lprops *lprops = &pnode->lprops[iip]; + int ret, lnum = lprops->lnum; + + ret = scan_cb(c, lprops, path[h].in_tree, data); + if (ret < 0) { + err = ret; + goto out; + } + if (ret & LPT_SCAN_ADD) { + /* Add all the nodes in path to the tree in memory */ + for (h = 1; h < c->lpt_hght; h++) { + const size_t sz = sizeof(struct ubifs_nnode); + struct ubifs_nnode *parent; + + if (path[h].in_tree) + continue; + nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + parent = nnode->parent; + parent->nbranch[nnode->iip].nnode = nnode; + path[h].ptr.nnode = nnode; + path[h].in_tree = 1; + path[h + 1].cnode.parent = nnode; + } + if (path[h].in_tree) + ubifs_ensure_cat(c, lprops); + else { + const size_t sz = sizeof(struct ubifs_pnode); + struct ubifs_nnode *parent; + + pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS); + if (!pnode) { + err = -ENOMEM; + goto out; + } + parent = pnode->parent; + parent->nbranch[pnode->iip].pnode = pnode; + path[h].ptr.pnode = pnode; + path[h].in_tree = 1; + update_cats(c, pnode); + c->pnodes_have += 1; + } + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *) + c->nroot, 0, 0); + if (err) + goto out; + err = dbg_check_cats(c); + if (err) + goto out; + } + if (ret & LPT_SCAN_STOP) { + err = 0; + break; + } + /* Get the next lprops */ + if (lnum == end_lnum) { + /* + * We got to the end without finding what we were + * looking for + */ + err = -ENOSPC; + goto out; + } + if (lnum + 1 >= c->leb_cnt) { + /* Wrap-around to the beginning */ + start_lnum = c->main_first; + goto again; + } + if (iip + 1 < UBIFS_LPT_FANOUT) { + /* Next lprops is in the same pnode */ + iip += 1; + continue; + } + /* We need to get the next pnode. Go up until we can go right */ + iip = pnode->iip; + while (1) { + h -= 1; + ubifs_assert(h >= 0); + nnode = path[h].ptr.nnode; + if (iip + 1 < UBIFS_LPT_FANOUT) + break; + iip = nnode->iip; + } + /* Go right */ + iip += 1; + /* Descend to the pnode */ + h += 1; + for (; h < c->lpt_hght; h++) { + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + iip = 0; + } + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = 0; + } +out: + kfree(path); + return err; +} + +/** + * dbg_chk_pnode - check a pnode. + * @c: the UBIFS file-system description object + * @pnode: pnode to check + * @col: pnode column + * + * This function returns %0 on success and a negative error code on failure. + */ +static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + int col) +{ + int i; + + if (pnode->num != col) { + ubifs_err(c, "pnode num %d expected %d parent num %d iip %d", + pnode->num, col, pnode->parent->num, pnode->iip); + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp, *lprops = &pnode->lprops[i]; + int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i + + c->main_first; + int found, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + struct list_head *list = NULL; + + if (lnum >= c->leb_cnt) + continue; + if (lprops->lnum != lnum) { + ubifs_err(c, "bad LEB number %d expected %d", + lprops->lnum, lnum); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + if (cat != LPROPS_UNCAT) { + ubifs_err(c, "LEB %d taken but not uncat %d", + lprops->lnum, cat); + return -EINVAL; + } + continue; + } + if (lprops->flags & LPROPS_INDEX) { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY_IDX: + case LPROPS_FRDI_IDX: + break; + default: + ubifs_err(c, "LEB %d index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } else { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY: + case LPROPS_FREE: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + break; + default: + ubifs_err(c, "LEB %d not index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } + switch (cat) { + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + } + found = 0; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (lprops->hpos < heap->cnt && + heap->arr[lprops->hpos] == lprops) + found = 1; + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_for_each_entry(lp, list, list) + if (lprops == lp) { + found = 1; + break; + } + break; + } + if (!found) { + ubifs_err(c, "LEB %d cat %d not found in cat heap/list", + lprops->lnum, cat); + return -EINVAL; + } + switch (cat) { + case LPROPS_EMPTY: + if (lprops->free != c->leb_size) { + ubifs_err(c, "LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + break; + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err(c, "LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + break; + } + } + return 0; +} + +/** + * dbg_check_lpt_nodes - check nnodes and pnodes. + * @c: the UBIFS file-system description object + * @cnode: next cnode (nnode or pnode) to check + * @row: row of cnode (root is zero) + * @col: column of cnode (leftmost is zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col) +{ + struct ubifs_nnode *nnode, *nn; + struct ubifs_cnode *cn; + int num, iip = 0, err; + + if (!dbg_is_chk_lprops(c)) + return 0; + + while (cnode) { + ubifs_assert(row >= 0); + nnode = cnode->parent; + if (cnode->level) { + /* cnode is a nnode */ + num = calc_nnode_num(row, col); + if (cnode->num != num) { + ubifs_err(c, "nnode num %d expected %d parent num %d iip %d", + cnode->num, num, + (nnode ? nnode->num : 0), cnode->iip); + return -EINVAL; + } + nn = (struct ubifs_nnode *)cnode; + while (iip < UBIFS_LPT_FANOUT) { + cn = nn->nbranch[iip].cnode; + if (cn) { + /* Go down */ + row += 1; + col <<= UBIFS_LPT_FANOUT_SHIFT; + col += iip; + iip = 0; + cnode = cn; + break; + } + /* Go right */ + iip += 1; + } + if (iip < UBIFS_LPT_FANOUT) + continue; + } else { + struct ubifs_pnode *pnode; + + /* cnode is a pnode */ + pnode = (struct ubifs_pnode *)cnode; + err = dbg_chk_pnode(c, pnode, col); + if (err) + return err; + } + /* Go up and to the right */ + row -= 1; + col >>= UBIFS_LPT_FANOUT_SHIFT; + iip = cnode->iip + 1; + cnode = (struct ubifs_cnode *)nnode; + } + return 0; +} diff --git a/kernel/fs/ubifs/lpt_commit.c b/kernel/fs/ubifs/lpt_commit.c new file mode 100644 index 000000000..ce89bdc3e --- /dev/null +++ b/kernel/fs/ubifs/lpt_commit.c @@ -0,0 +1,2037 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements commit-related functionality of the LEB properties + * subsystem. + */ + +#include +#include +#include +#include "ubifs.h" + +static int dbg_populate_lsave(struct ubifs_info *c); + +/** + * first_dirty_cnode - find first dirty cnode. + * @c: UBIFS file-system description object + * @nnode: nnode at which to start + * + * This function returns the first dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode) +{ + ubifs_assert(nnode); + while (1) { + int i, cont = 0; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_cnode *cnode; + + cnode = nnode->nbranch[i].cnode; + if (cnode && + test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; + nnode = (struct ubifs_nnode *)cnode; + cont = 1; + break; + } + } + if (!cont) + return (struct ubifs_cnode *)nnode; + } +} + +/** + * next_dirty_cnode - find next dirty cnode. + * @cnode: cnode from which to begin searching + * + * This function returns the next dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode) +{ + struct ubifs_nnode *nnode; + int i; + + ubifs_assert(cnode); + nnode = cnode->parent; + if (!nnode) + return NULL; + for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) { + cnode = nnode->nbranch[i].cnode; + if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; /* cnode is a pnode */ + /* cnode is a nnode */ + return first_dirty_cnode((struct ubifs_nnode *)cnode); + } + } + return (struct ubifs_cnode *)nnode; +} + +/** + * get_cnodes_to_commit - create list of dirty cnodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of cnodes to commit. + */ +static int get_cnodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + int cnt = 0; + + if (!c->nroot) + return 0; + + if (!test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + c->lpt_cnext = first_dirty_cnode(c->nroot); + cnode = c->lpt_cnext; + if (!cnode) + return 0; + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_CNODE, &cnode->flags)); + __set_bit(COW_CNODE, &cnode->flags); + cnext = next_dirty_cnode(cnode); + if (!cnext) { + cnode->cnext = c->lpt_cnext; + break; + } + cnode->cnext = cnext; + cnode = cnext; + cnt += 1; + } + dbg_cmt("committing %d cnodes", cnt); + dbg_lp("committing %d cnodes", cnt); + ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt); + return cnt; +} + +/** + * upd_ltab - update LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space to add + */ +static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d +%d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * alloc_lpt_leb - allocate an LPT LEB that is empty. + * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function finds the next empty LEB in the ltab starting from @lnum. If a + * an empty LEB is found it is returned in @lnum and the function returns %0. + * Otherwise the function returns -ENOSPC. Note however, that LPT is designed + * never to run out of space. + */ +static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + + for (i = 0; i < n; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + return -ENOSPC; +} + +/** + * layout_cnodes - layout cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, alen, done_lsave, done_ltab, err; + struct ubifs_cnode *cnode; + + err = dbg_chk_lpt_sz(c, 0, 0); + if (err) + return err; + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + do { + if (cnode->level) { + len = c->nnode_sz; + c->dirty_nn_cnt -= 1; + } else { + len = c->pnode_sz; + c->dirty_pn_cnt -= 1; + } + while (offs + len > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + continue; + } + if (!done_ltab) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + continue; + } + break; + } + if (cnode->parent) { + cnode->parent->nbranch[cnode->iip].lnum = lnum; + cnode->parent->nbranch[cnode->iip].offs = offs; + } else { + c->lpt_lnum = lnum; + c->lpt_offs = offs; + } + offs += len; + dbg_chk_lpt_sz(c, 1, len); + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 4, alen - offs); + err = dbg_chk_lpt_sz(c, 3, alen); + if (err) + return err; + return 0; + +no_space: + ubifs_err(c, "LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", + lnum, offs, len, done_ltab, done_lsave); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); + dump_stack(); + return err; +} + +/** + * realloc_lpt_leb - allocate an LPT LEB that is empty. + * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function duplicates exactly the results of the function alloc_lpt_leb. + * It is used during end commit to reallocate the same LEB numbers that were + * allocated by alloc_lpt_leb during start commit. + * + * This function finds the next LEB that was allocated by the alloc_lpt_leb + * function starting from @lnum. If a LEB is found it is returned in @lnum and + * the function returns %0. Otherwise the function returns -ENOSPC. + * Note however, that LPT is designed never to run out of space. + */ +static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + + for (i = 0; i < n; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + return -ENOSPC; +} + +/** + * write_cnodes - write cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave; + struct ubifs_cnode *cnode; + void *buf = c->lpt_buf; + + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + from = offs; + /* Ensure empty LEB is unmapped */ + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + /* Loop for each cnode */ + do { + if (cnode->level) + len = c->nnode_sz; + else + len = c->pnode_sz; + while (offs + len > c->leb_size) { + wlen = offs - from; + if (wlen) { + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, + alen); + if (err) + return err; + } + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + continue; + } + if (!done_ltab) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + continue; + } + break; + } + if (cnode->level) + ubifs_pack_nnode(c, buf + offs, + (struct ubifs_nnode *)cnode); + else + ubifs_pack_pnode(c, buf + offs, + (struct ubifs_pnode *)cnode); + /* + * The reason for the barriers is the same as in case of TNC. + * See comment in 'write_index()'. 'dirty_cow_nnode()' and + * 'dirty_cow_pnode()' are the functions for which this is + * important. + */ + clear_bit(DIRTY_CNODE, &cnode->flags); + smp_mb__before_atomic(); + clear_bit(COW_CNODE, &cnode->flags); + smp_mb__after_atomic(); + offs += len; + dbg_chk_lpt_sz(c, 1, len); + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); + if (err) + return err; + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); + if (err) + return err; + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + /* Write remaining data in buffer */ + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); + if (err) + return err; + + dbg_chk_lpt_sz(c, 4, alen - wlen); + err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size)); + if (err) + return err; + + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(offs, c->min_io_size); + + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + + return 0; + +no_space: + ubifs_err(c, "LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", + lnum, offs, len, done_ltab, done_lsave); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); + dump_stack(); + return err; +} + +/** + * next_pnode_to_dirty - find next pnode to dirty. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * This function returns the next pnode to dirty or %NULL if there are no more + * pnodes. Note that pnodes that have never been written (lnum == 0) are + * skipped. + */ +static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_nnode *nnode; + int iip; + + /* Try to go right */ + nnode = pnode->parent; + for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + return ubifs_get_pnode(c, nnode, iip); + } + + /* Go up while can't go right */ + do { + iip = nnode->iip + 1; + nnode = nnode->parent; + if (!nnode) + return NULL; + for (; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + } while (iip >= UBIFS_LPT_FANOUT); + + /* Go right */ + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return (void *)nnode; + + /* Go down to level 1 */ + while (nnode->level > 1) { + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + if (iip >= UBIFS_LPT_FANOUT) { + /* + * Should not happen, but we need to keep going + * if it does. + */ + iip = 0; + } + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return (void *)nnode; + } + + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) + if (nnode->nbranch[iip].lnum) + break; + if (iip >= UBIFS_LPT_FANOUT) + /* Should not happen, but we need to keep going if it does */ + iip = 0; + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * pnode_lookup - lookup a pnode in the LPT. + * @c: UBIFS file-system description object + * @i: pnode number (0 to main_lebs - 1) + * + * This function returns a pointer to the pnode on success or a negative + * error code on failure. + */ +static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) +{ + int err, h, iip, shft; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + i <<= UBIFS_LPT_FANOUT_SHIFT; + nnode = c->nroot; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * do_make_pnode_dirty - mark a pnode dirty. + * @c: UBIFS file-system description object + * @pnode: pnode to mark dirty + */ +static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + struct ubifs_nnode *nnode; + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + /* Mark parent and ancestors dirty too */ + nnode = pnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } +} + +/** + * make_tree_dirty - mark the entire LEB properties tree dirty. + * @c: UBIFS file-system description object + * + * This function is used by the "small" LPT model to cause the entire LEB + * properties tree to be written. The "small" LPT model does not use LPT + * garbage collection because it is more efficient to write the entire tree + * (because it is small). + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_tree_dirty(struct ubifs_info *c) +{ + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, 0); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + + while (pnode) { + do_make_pnode_dirty(c, pnode); + pnode = next_pnode_to_dirty(c, pnode); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + } + return 0; +} + +/** + * need_write_all - determine if the LPT area is running out of free space. + * @c: UBIFS file-system description object + * + * This function returns %1 if the LPT area is running out of free space and %0 + * if it is not. + */ +static int need_write_all(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + free += c->leb_size; + } + /* Less than twice the size left */ + if (free <= c->lpt_sz * 2) + return 1; + return 0; +} + +/** + * lpt_tgc_start - start trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called during start commit to mark LPT LEBs for trivial GC. + */ +static void lpt_tgc_start(struct ubifs_info *c) +{ + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + continue; + if (c->ltab[i].dirty > 0 && + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) { + c->ltab[i].tgc = 1; + c->ltab[i].free = c->leb_size; + c->ltab[i].dirty = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + } +} + +/** + * lpt_tgc_end - end trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called after the commit is completed (master node has been + * written) and un-maps LPT LEBs that were marked for trivial GC. + */ +static int lpt_tgc_end(struct ubifs_info *c) +{ + int i, err; + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].tgc) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + c->ltab[i].tgc = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + return 0; +} + +/** + * populate_lsave - fill the lsave array with important LEB numbers. + * @c: the UBIFS file-system description object + * + * This function is only called for the "big" model. It records a small number + * of LEB numbers of important LEBs. Important LEBs are ones that are (from + * most important to least important): empty, freeable, freeable index, dirty + * index, dirty or free. Upon mount, we read this list of LEB numbers and bring + * their pnodes into memory. That will stop us from having to scan the LPT + * straight away. For the "small" model we assume that scanning the LPT is no + * big deal. + */ +static void populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i, cnt = 0; + + ubifs_assert(c->big_lpt); + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + + if (dbg_populate_lsave(c)) + return; + + list_for_each_entry(lprops, &c->empty_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->freeable_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + /* Fill it up completely */ + while (cnt < c->lsave_cnt) + c->lsave[cnt++] = c->main_first; +} + +/** + * nnode_lookup - lookup a nnode in the LPT. + * @c: UBIFS file-system description object + * @i: nnode number + * + * This function returns a pointer to the nnode on success or a negative + * error code on failure. + */ +static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i) +{ + int err, iip; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + while (1) { + iip = i & (UBIFS_LPT_FANOUT - 1); + i >>= UBIFS_LPT_FANOUT_SHIFT; + if (!i) + break; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return nnode; + } + return nnode; +} + +/** + * make_nnode_dirty - find a nnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: nnode number of nnode to make dirty + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_nnode *nnode; + + nnode = nnode_lookup(c, node_num); + if (IS_ERR(nnode)) + return PTR_ERR(nnode); + if (nnode->parent) { + struct ubifs_nbranch *branch; + + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; /* nnode is obsolete */ + } else if (c->lpt_lnum != lnum || c->lpt_offs != offs) + return 0; /* nnode is obsolete */ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + /* Mark parent and ancestors dirty too */ + nnode = nnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } + return 0; +} + +/** + * make_pnode_dirty - find a pnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: pnode number of pnode to make dirty + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + pnode = pnode_lookup(c, node_num); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; + do_make_pnode_dirty(c, pnode); + return 0; +} + +/** + * make_ltab_dirty - make ltab node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where ltab was written + * @offs: offset where ltab was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 0; /* This ltab node is obsolete */ + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + return 0; +} + +/** + * make_lsave_dirty - make lsave node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where lsave was written + * @offs: offset where lsave was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 0; /* This lsave node is obsolete */ + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + return 0; +} + +/** + * make_node_dirty - make node dirty. + * @c: UBIFS file-system description object + * @node_type: LPT node type + * @node_num: node number + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, + int lnum, int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return make_nnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_PNODE: + return make_pnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_LTAB: + return make_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return make_lsave_dirty(c, lnum, offs); + } + return -EINVAL; +} + +/** + * get_lpt_node_len - return the length of a node based on its type. + * @c: UBIFS file-system description object + * @node_type: LPT node type + */ +static int get_lpt_node_len(const struct ubifs_info *c, int node_type) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return c->nnode_sz; + case UBIFS_LPT_PNODE: + return c->pnode_sz; + case UBIFS_LPT_LTAB: + return c->ltab_sz; + case UBIFS_LPT_LSAVE: + return c->lsave_sz; + } + return 0; +} + +/** + * get_pad_len - return the length of padding in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + */ +static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len) +{ + int offs, pad_len; + + if (c->min_io_size == 1) + return 0; + offs = c->leb_size - len; + pad_len = ALIGN(offs, c->min_io_size) - offs; + return pad_len; +} + +/** + * get_lpt_node_type - return type (and node number) of a node in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @node_num: node number is returned here + */ +static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf, + int *node_num) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type; + + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + return node_type; +} + +/** + * is_a_node - determine if a buffer contains a node. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + * + * This function returns %1 if the buffer contains a node or %0 if it does not. + */ +static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type, node_len; + uint16_t crc, calc_crc; + + if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8) + return 0; + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + if (node_type == UBIFS_LPT_NOT_A_NODE) + return 0; + node_len = get_lpt_node_len(c, node_type); + if (!node_len || node_len > len) + return 0; + pos = 0; + addr = buf; + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + node_len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) + return 0; + return 1; +} + +/** + * lpt_gc_lnum - garbage collect a LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to garbage collect + * + * LPT garbage collection is used only for the "big" LPT model + * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes + * in the LEB being garbage-collected as dirty. The dirty nodes are written + * next commit, after which the LEB is free to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_gc_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf = c->lpt_buf; + + dbg_lp("LEB %d", lnum); + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) + return err; + + while (1) { + if (!is_a_node(c, buf, len)) { + int pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + continue; + } + return 0; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + offs = c->leb_size - len; + ubifs_assert(node_len != 0); + mutex_lock(&c->lp_mutex); + err = make_node_dirty(c, node_type, node_num, lnum, offs); + mutex_unlock(&c->lp_mutex); + if (err) + return err; + buf += node_len; + len -= node_len; + } + return 0; +} + +/** + * lpt_gc - LPT garbage collection. + * @c: UBIFS file-system description object + * + * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'. + * Returns %0 on success and a negative error code on failure. + */ +static int lpt_gc(struct ubifs_info *c) +{ + int i, lnum = -1, dirty = 0; + + mutex_lock(&c->lp_mutex); + for (i = 0; i < c->lpt_lebs; i++) { + ubifs_assert(!c->ltab[i].tgc); + if (i + c->lpt_first == c->nhead_lnum || + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + continue; + if (c->ltab[i].dirty > dirty) { + dirty = c->ltab[i].dirty; + lnum = i + c->lpt_first; + } + } + mutex_unlock(&c->lp_mutex); + if (lnum == -1) + return -ENOSPC; + return lpt_gc_lnum(c, lnum); +} + +/** + * ubifs_lpt_start_commit - UBIFS commit starts. + * @c: the UBIFS file-system description object + * + * This function has to be called when UBIFS starts the commit operation. + * This function "freezes" all currently dirty LEB properties and does not + * change them anymore. Further changes are saved and tracked separately + * because they are not part of this commit. This function returns zero in case + * of success and a negative error code in case of failure. + */ +int ubifs_lpt_start_commit(struct ubifs_info *c) +{ + int err, cnt; + + dbg_lp(""); + + mutex_lock(&c->lp_mutex); + err = dbg_chk_lpt_free_spc(c); + if (err) + goto out; + err = dbg_check_ltab(c); + if (err) + goto out; + + if (c->check_lpt_free) { + /* + * We ensure there is enough free space in + * ubifs_lpt_post_commit() by marking nodes dirty. That + * information is lost when we unmount, so we also need + * to check free space once after mounting also. + */ + c->check_lpt_free = 0; + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } + } + + lpt_tgc_start(c); + + if (!c->dirty_pn_cnt) { + dbg_cmt("no cnodes to commit"); + err = 0; + goto out; + } + + if (!c->big_lpt && need_write_all(c)) { + /* If needed, write everything */ + err = make_tree_dirty(c); + if (err) + goto out; + lpt_tgc_start(c); + } + + if (c->big_lpt) + populate_lsave(c); + + cnt = get_cnodes_to_commit(c); + ubifs_assert(cnt != 0); + + err = layout_cnodes(c); + if (err) + goto out; + + /* Copy the LPT's own lprops for end commit to write */ + memcpy(c->ltab_cmt, c->ltab, + sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY); + +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * free_obsolete_cnodes - free obsolete cnodes for commit end. + * @c: UBIFS file-system description object + */ +static void free_obsolete_cnodes(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + + cnext = c->lpt_cnext; + if (!cnext) + return; + do { + cnode = cnext; + cnext = cnode->cnext; + if (test_bit(OBSOLETE_CNODE, &cnode->flags)) + kfree(cnode); + else + cnode->cnext = NULL; + } while (cnext != c->lpt_cnext); + c->lpt_cnext = NULL; +} + +/** + * ubifs_lpt_end_commit - finish the commit operation. + * @c: the UBIFS file-system description object + * + * This function has to be called when the commit operation finishes. It + * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to + * the media. Returns zero in case of success and a negative error code in case + * of failure. + */ +int ubifs_lpt_end_commit(struct ubifs_info *c) +{ + int err; + + dbg_lp(""); + + if (!c->lpt_cnext) + return 0; + + err = write_cnodes(c); + if (err) + return err; + + mutex_lock(&c->lp_mutex); + free_obsolete_cnodes(c); + mutex_unlock(&c->lp_mutex); + + return 0; +} + +/** + * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC. + * @c: UBIFS file-system description object + * + * LPT trivial GC is completed after a commit. Also LPT GC is done after a + * commit for the "big" LPT model. + */ +int ubifs_lpt_post_commit(struct ubifs_info *c) +{ + int err; + + mutex_lock(&c->lp_mutex); + err = lpt_tgc_end(c); + if (err) + goto out; + if (c->big_lpt) + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * first_nnode - find the first nnode in memory. + * @c: UBIFS file-system description object + * @hght: height of tree where nnode found is returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght) +{ + struct ubifs_nnode *nnode; + int h, i, found; + + nnode = c->nroot; + *hght = 0; + if (!nnode) + return NULL; + for (h = 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * next_nnode - find the next nnode in memory. + * @c: UBIFS file-system description object + * @nnode: nnode from which to start. + * @hght: height of tree where nnode is, is passed and returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *next_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode, int *hght) +{ + struct ubifs_nnode *parent; + int iip, h, i, found; + + parent = nnode->parent; + if (!parent) + return NULL; + if (nnode->iip == UBIFS_LPT_FANOUT - 1) { + *hght -= 1; + return parent; + } + for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + nnode = parent->nbranch[iip].nnode; + if (nnode) + break; + } + if (!nnode) { + *hght -= 1; + return parent; + } + for (h = *hght + 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * ubifs_lpt_free - free resources owned by the LPT. + * @c: UBIFS file-system description object + * @wr_only: free only resources used for writing + */ +void ubifs_lpt_free(struct ubifs_info *c, int wr_only) +{ + struct ubifs_nnode *nnode; + int i, hght; + + /* Free write-only things first */ + + free_obsolete_cnodes(c); /* Leftover from a failed commit */ + + vfree(c->ltab_cmt); + c->ltab_cmt = NULL; + vfree(c->lpt_buf); + c->lpt_buf = NULL; + kfree(c->lsave); + c->lsave = NULL; + + if (wr_only) + return; + + /* Now free the rest */ + + nnode = first_nnode(c, &hght); + while (nnode) { + for (i = 0; i < UBIFS_LPT_FANOUT; i++) + kfree(nnode->nbranch[i].nnode); + nnode = next_nnode(c, nnode, &hght); + } + for (i = 0; i < LPROPS_HEAP_CNT; i++) + kfree(c->lpt_heap[i].arr); + kfree(c->dirty_idx.arr); + kfree(c->nroot); + vfree(c->ltab); + kfree(c->lpt_nod_buf); +} + +/* + * Everything below is related to debugging. + */ + +/** + * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes. + * @buf: buffer + * @len: buffer length + */ +static int dbg_is_all_ff(uint8_t *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + return 0; + return 1; +} + +/** + * dbg_is_nnode_dirty - determine if a nnode is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + */ +static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_nnode *nnode; + int hght; + + /* Entire tree is in memory so first_nnode / next_nnode are OK */ + nnode = first_nnode(c, &hght); + for (; nnode; nnode = next_nnode(c, nnode, &hght)) { + struct ubifs_nbranch *branch; + + cond_resched(); + if (nnode->parent) { + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } else { + if (c->lpt_lnum != lnum || c->lpt_offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } + } + return 1; +} + +/** + * dbg_is_pnode_dirty - determine if a pnode is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + */ +static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + int i, cnt; + + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + cond_resched(); + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &pnode->flags)) + return 1; + return 0; + } + return 1; +} + +/** + * dbg_is_ltab_dirty - determine if a ltab node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where ltab node was written + * @offs: offset where ltab node was written + */ +static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 1; + return (c->lpt_drty_flgs & LTAB_DIRTY) != 0; +} + +/** + * dbg_is_lsave_dirty - determine if a lsave node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where lsave node was written + * @offs: offset where lsave node was written + */ +static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 1; + return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0; +} + +/** + * dbg_is_node_dirty - determine if a node is dirty. + * @c: the UBIFS file-system description object + * @node_type: node type + * @lnum: LEB number where node was written + * @offs: offset where node was written + */ +static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum, + int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return dbg_is_nnode_dirty(c, lnum, offs); + case UBIFS_LPT_PNODE: + return dbg_is_pnode_dirty(c, lnum, offs); + case UBIFS_LPT_LTAB: + return dbg_is_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return dbg_is_lsave_dirty(c, lnum, offs); + } + return 1; +} + +/** + * dbg_check_ltab_lnum - check the ltab for a LPT LEB number. + * @c: the UBIFS file-system description object + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function returns %0 on success and a negative error code on failure. + */ +static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; + int ret; + void *buf, *p; + + if (!dbg_is_chk_lprops(c)) + return 0; + + buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err(c, "cannot allocate memory for ltab checking"); + return 0; + } + + dbg_lp("LEB %d", lnum); + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) + goto out; + + while (1) { + if (!is_a_node(c, p, len)) { + int i, pad_len; + + pad_len = get_pad_len(c, p, len); + if (pad_len) { + p += pad_len; + len -= pad_len; + dirty += pad_len; + continue; + } + if (!dbg_is_all_ff(p, len)) { + ubifs_err(c, "invalid empty space in LEB %d at %d", + lnum, c->leb_size - len); + err = -EINVAL; + } + i = lnum - c->lpt_first; + if (len != c->ltab[i].free) { + ubifs_err(c, "invalid free space in LEB %d (free %d, expected %d)", + lnum, len, c->ltab[i].free); + err = -EINVAL; + } + if (dirty != c->ltab[i].dirty) { + ubifs_err(c, "invalid dirty space in LEB %d (dirty %d, expected %d)", + lnum, dirty, c->ltab[i].dirty); + err = -EINVAL; + } + goto out; + } + node_type = get_lpt_node_type(c, p, &node_num); + node_len = get_lpt_node_len(c, node_type); + ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); + if (ret == 1) + dirty += node_len; + p += node_len; + len -= node_len; + } + + err = 0; +out: + vfree(buf); + return err; +} + +/** + * dbg_check_ltab - check the free and dirty space in the ltab. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_ltab(struct ubifs_info *c) +{ + int lnum, err, i, cnt; + + if (!dbg_is_chk_lprops(c)) + return 0; + + /* Bring the entire tree into memory */ + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + cond_resched(); + } + + /* Check nodes */ + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0); + if (err) + return err; + + /* Check each LEB */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + err = dbg_check_ltab_lnum(c, lnum); + if (err) { + ubifs_err(c, "failed at LEB %d", lnum); + return err; + } + } + + dbg_lp("succeeded"); + return 0; +} + +/** + * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_chk_lpt_free_spc(struct ubifs_info *c) +{ + long long free = 0; + int i; + + if (!dbg_is_chk_lprops(c)) + return 0; + + for (i = 0; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + } + if (free < c->lpt_sz) { + ubifs_err(c, "LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); + dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * dbg_chk_lpt_sz - check LPT does not write more than LPT size. + * @c: the UBIFS file-system description object + * @action: what to do + * @len: length written + * + * This function returns %0 on success and a negative error code on failure. + * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. + * o %4 - wasted @len bytes; + */ +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) +{ + struct ubifs_debug_info *d = c->dbg; + long long chk_lpt_sz, lpt_sz; + int err = 0; + + if (!dbg_is_chk_lprops(c)) + return 0; + + switch (action) { + case 0: + d->chk_lpt_sz = 0; + d->chk_lpt_sz2 = 0; + d->chk_lpt_lebs = 0; + d->chk_lpt_wastage = 0; + if (c->dirty_pn_cnt > c->pnode_cnt) { + ubifs_err(c, "dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); + err = -EINVAL; + } + if (c->dirty_nn_cnt > c->nnode_cnt) { + ubifs_err(c, "dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); + err = -EINVAL; + } + return err; + case 1: + d->chk_lpt_sz += len; + return 0; + case 2: + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + d->chk_lpt_lebs += 1; + return 0; + case 3: + chk_lpt_sz = c->leb_size; + chk_lpt_sz *= d->chk_lpt_lebs; + chk_lpt_sz += len - c->nhead_offs; + if (d->chk_lpt_sz != chk_lpt_sz) { + ubifs_err(c, "LPT wrote %lld but space used was %lld", + d->chk_lpt_sz, chk_lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz > c->lpt_sz) { + ubifs_err(c, "LPT wrote %lld but lpt_sz is %lld", + d->chk_lpt_sz, c->lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) { + ubifs_err(c, "LPT layout size %lld but wrote %lld", + d->chk_lpt_sz, d->chk_lpt_sz2); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { + ubifs_err(c, "LPT new nhead offs: expected %d was %d", + d->new_nhead_offs, len); + err = -EINVAL; + } + lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + lpt_sz += c->ltab_sz; + if (c->big_lpt) + lpt_sz += c->lsave_sz; + if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) { + ubifs_err(c, "LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); + err = -EINVAL; + } + if (err) { + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); + dump_stack(); + } + d->chk_lpt_sz2 = d->chk_lpt_sz; + d->chk_lpt_sz = 0; + d->chk_lpt_wastage = 0; + d->chk_lpt_lebs = 0; + d->new_nhead_offs = len; + return err; + case 4: + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + return 0; + default: + return -EINVAL; + } +} + +/** + * ubifs_dump_lpt_leb - dump an LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to dump + * + * This function dumps an LEB from LPT area. Nodes in this area are very + * different to nodes in the main area (e.g., they do not have common headers, + * they do not have 8-byte alignments, etc), so we have a separate function to + * dump LPT area LEBs. Note, LPT has to be locked by the caller. + */ +static void dump_lpt_leb(const struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf, *p; + + pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); + buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err(c, "cannot allocate memory to dump LPT"); + return; + } + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) + goto out; + + while (1) { + offs = c->leb_size - len; + if (!is_a_node(c, p, len)) { + int pad_len; + + pad_len = get_pad_len(c, p, len); + if (pad_len) { + pr_err("LEB %d:%d, pad %d bytes\n", + lnum, offs, pad_len); + p += pad_len; + len -= pad_len; + continue; + } + if (len) + pr_err("LEB %d:%d, free %d bytes\n", + lnum, offs, len); + break; + } + + node_type = get_lpt_node_type(c, p, &node_num); + switch (node_type) { + case UBIFS_LPT_PNODE: + { + node_len = c->pnode_sz; + if (c->big_lpt) + pr_err("LEB %d:%d, pnode num %d\n", + lnum, offs, node_num); + else + pr_err("LEB %d:%d, pnode\n", lnum, offs); + break; + } + case UBIFS_LPT_NNODE: + { + int i; + struct ubifs_nnode nnode; + + node_len = c->nnode_sz; + if (c->big_lpt) + pr_err("LEB %d:%d, nnode num %d, ", + lnum, offs, node_num); + else + pr_err("LEB %d:%d, nnode, ", + lnum, offs); + err = ubifs_unpack_nnode(c, p, &nnode); + if (err) { + pr_err("failed to unpack_node, error %d\n", + err); + break; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + pr_cont("%d:%d", nnode.nbranch[i].lnum, + nnode.nbranch[i].offs); + if (i != UBIFS_LPT_FANOUT - 1) + pr_cont(", "); + } + pr_cont("\n"); + break; + } + case UBIFS_LPT_LTAB: + node_len = c->ltab_sz; + pr_err("LEB %d:%d, ltab\n", lnum, offs); + break; + case UBIFS_LPT_LSAVE: + node_len = c->lsave_sz; + pr_err("LEB %d:%d, lsave len\n", lnum, offs); + break; + default: + ubifs_err(c, "LPT node type %d not recognized", node_type); + goto out; + } + + p += node_len; + len -= node_len; + } + + pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); +out: + vfree(buf); + return; +} + +/** + * ubifs_dump_lpt_lebs - dump LPT lebs. + * @c: UBIFS file-system description object + * + * This function dumps all LPT LEBs. The caller has to make sure the LPT is + * locked. + */ +void ubifs_dump_lpt_lebs(const struct ubifs_info *c) +{ + int i; + + pr_err("(pid %d) start dumping all LPT LEBs\n", current->pid); + for (i = 0; i < c->lpt_lebs; i++) + dump_lpt_leb(c, i + c->lpt_first); + pr_err("(pid %d) finish dumping all LPT LEBs\n", current->pid); +} + +/** + * dbg_populate_lsave - debugging version of 'populate_lsave()' + * @c: UBIFS file-system description object + * + * This is a debugging version for 'populate_lsave()' which populates lsave + * with random LEBs instead of useful LEBs, which is good for test coverage. + * Returns zero if lsave has not been populated (this debugging feature is + * disabled) an non-zero if lsave has been populated. + */ +static int dbg_populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i; + + if (!dbg_is_chk_gen(c)) + return 0; + if (prandom_u32() & 3) + return 0; + + for (i = 0; i < c->lsave_cnt; i++) + c->lsave[i] = c->main_first; + + list_for_each_entry(lprops, &c->empty_list, list) + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->freeable_list, list) + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->frdi_idx_list, list) + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + + return 1; +} diff --git a/kernel/fs/ubifs/master.c b/kernel/fs/ubifs/master.c new file mode 100644 index 000000000..c6a5e39e2 --- /dev/null +++ b/kernel/fs/ubifs/master.c @@ -0,0 +1,395 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements reading and writing the master node */ + +#include "ubifs.h" + +/** + * scan_for_master - search the valid master node. + * @c: UBIFS file-system description object + * + * This function scans the master node LEBs and search for the latest master + * node. Returns zero in case of success, %-EUCLEAN if there master area is + * corrupted and requires recovery, and a negative error code in case of + * failure. + */ +static int scan_for_master(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, offs = 0, nodes_cnt; + + lnum = UBIFS_MST_LNUM; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + nodes_cnt = sleb->nodes_cnt; + if (nodes_cnt > 0) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + if (snod->type != UBIFS_MST_NODE) + goto out_dump; + memcpy(c->mst_node, snod->node, snod->len); + offs = snod->offs; + } + ubifs_scan_destroy(sleb); + + lnum += 1; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + if (sleb->nodes_cnt != nodes_cnt) + goto out; + if (!sleb->nodes_cnt) + goto out; + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); + if (snod->type != UBIFS_MST_NODE) + goto out_dump; + if (snod->offs != offs) + goto out; + if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, + (void *)snod->node + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out; + c->mst_offs = offs; + ubifs_scan_destroy(sleb); + return 0; + +out: + ubifs_scan_destroy(sleb); + return -EUCLEAN; + +out_dump: + ubifs_err(c, "unexpected node type %d master LEB %d:%d", + snod->type, lnum, snod->offs); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * validate_master - validate master node. + * @c: UBIFS file-system description object + * + * This function validates data which was read from master node. Returns zero + * if the data is all right and %-EINVAL if not. + */ +static int validate_master(const struct ubifs_info *c) +{ + long long main_sz; + int err; + + if (c->max_sqnum >= SQNUM_WATERMARK) { + err = 1; + goto out; + } + + if (c->cmt_no >= c->max_sqnum) { + err = 2; + goto out; + } + + if (c->highest_inum >= INUM_WATERMARK) { + err = 3; + goto out; + } + + if (c->lhead_lnum < UBIFS_LOG_LNUM || + c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs || + c->lhead_offs < 0 || c->lhead_offs >= c->leb_size || + c->lhead_offs & (c->min_io_size - 1)) { + err = 4; + goto out; + } + + if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first || + c->zroot.offs >= c->leb_size || c->zroot.offs & 7) { + err = 5; + goto out; + } + + if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len || + c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) { + err = 6; + goto out; + } + + if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) { + err = 7; + goto out; + } + + if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first || + c->ihead_offs % c->min_io_size || c->ihead_offs < 0 || + c->ihead_offs > c->leb_size || c->ihead_offs & 7) { + err = 8; + goto out; + } + + main_sz = (long long)c->main_lebs * c->leb_size; + if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) { + err = 9; + goto out; + } + + if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last || + c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) { + err = 10; + goto out; + } + + if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last || + c->nhead_offs < 0 || c->nhead_offs % c->min_io_size || + c->nhead_offs > c->leb_size) { + err = 11; + goto out; + } + + if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last || + c->ltab_offs < 0 || + c->ltab_offs + c->ltab_sz > c->leb_size) { + err = 12; + goto out; + } + + if (c->big_lpt && (c->lsave_lnum < c->lpt_first || + c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 || + c->lsave_offs + c->lsave_sz > c->leb_size)) { + err = 13; + goto out; + } + + if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) { + err = 14; + goto out; + } + + if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) { + err = 15; + goto out; + } + + if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) { + err = 16; + goto out; + } + + if (c->lst.total_free < 0 || c->lst.total_free > main_sz || + c->lst.total_free & 7) { + err = 17; + goto out; + } + + if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) { + err = 18; + goto out; + } + + if (c->lst.total_used < 0 || (c->lst.total_used & 7)) { + err = 19; + goto out; + } + + if (c->lst.total_free + c->lst.total_dirty + + c->lst.total_used > main_sz) { + err = 20; + goto out; + } + + if (c->lst.total_dead + c->lst.total_dark + + c->lst.total_used + c->bi.old_idx_sz > main_sz) { + err = 21; + goto out; + } + + if (c->lst.total_dead < 0 || + c->lst.total_dead > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dead & 7) { + err = 22; + goto out; + } + + if (c->lst.total_dark < 0 || + c->lst.total_dark > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dark & 7) { + err = 23; + goto out; + } + + return 0; + +out: + ubifs_err(c, "bad master node at offset %d error %d", c->mst_offs, err); + ubifs_dump_node(c, c->mst_node); + return -EINVAL; +} + +/** + * ubifs_read_master - read master node. + * @c: UBIFS file-system description object + * + * This function finds and reads the master node during file-system mount. If + * the flash is empty, it creates default master node as well. Returns zero in + * case of success and a negative error code in case of failure. + */ +int ubifs_read_master(struct ubifs_info *c) +{ + int err, old_leb_cnt; + + c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!c->mst_node) + return -ENOMEM; + + err = scan_for_master(c); + if (err) { + if (err == -EUCLEAN) + err = ubifs_recover_master_node(c); + if (err) + /* + * Note, we do not free 'c->mst_node' here because the + * unmount routine will take care of this. + */ + return err; + } + + /* Make sure that the recovery flag is clear */ + c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY); + + c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum); + c->highest_inum = le64_to_cpu(c->mst_node->highest_inum); + c->cmt_no = le64_to_cpu(c->mst_node->cmt_no); + c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum); + c->zroot.offs = le32_to_cpu(c->mst_node->root_offs); + c->zroot.len = le32_to_cpu(c->mst_node->root_len); + c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum); + c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); + c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); + c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); + c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); + c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); + c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); + c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs); + c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum); + c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs); + c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum); + c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs); + c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum); + c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs); + c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs); + old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt); + c->lst.total_free = le64_to_cpu(c->mst_node->total_free); + c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty); + c->lst.total_used = le64_to_cpu(c->mst_node->total_used); + c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); + c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); + + c->calc_idx_sz = c->bi.old_idx_sz; + + if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) + c->no_orphs = 1; + + if (old_leb_cnt != c->leb_cnt) { + /* The file system has been resized */ + int growth = c->leb_cnt - old_leb_cnt; + + if (c->leb_cnt < old_leb_cnt || + c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err(c, "bad leb_cnt on master node"); + ubifs_dump_node(c, c->mst_node); + return -EINVAL; + } + + dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs", + old_leb_cnt, c->leb_cnt); + c->lst.empty_lebs += growth; + c->lst.total_free += growth * (long long)c->leb_size; + c->lst.total_dark += growth * (long long)c->dark_wm; + + /* + * Reflect changes back onto the master node. N.B. the master + * node gets written immediately whenever mounting (or + * remounting) in read-write mode, so we do not need to write it + * here. + */ + c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt); + c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs); + c->mst_node->total_free = cpu_to_le64(c->lst.total_free); + c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark); + } + + err = validate_master(c); + if (err) + return err; + + err = dbg_old_index_check_init(c, &c->zroot); + + return err; +} + +/** + * ubifs_write_master - write master node. + * @c: UBIFS file-system description object + * + * This function writes the master node. Returns zero in case of success and a + * negative error code in case of failure. The master node is written twice to + * enable recovery. + */ +int ubifs_write_master(struct ubifs_info *c) +{ + int err, lnum, offs, len; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + + lnum = UBIFS_MST_LNUM; + offs = c->mst_offs + c->mst_node_alsz; + len = UBIFS_MST_NODE_SZ; + + if (offs + UBIFS_MST_NODE_SZ > c->leb_size) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + offs = 0; + } + + c->mst_offs = offs; + c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); + + err = ubifs_write_node(c, c->mst_node, len, lnum, offs); + if (err) + return err; + + lnum += 1; + + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->mst_node, len, lnum, offs); + + return err; +} diff --git a/kernel/fs/ubifs/misc.h b/kernel/fs/ubifs/misc.h new file mode 100644 index 000000000..ee7cb5ebb --- /dev/null +++ b/kernel/fs/ubifs/misc.h @@ -0,0 +1,303 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains miscellaneous helper functions. + */ + +#ifndef __UBIFS_MISC_H__ +#define __UBIFS_MISC_H__ + +/** + * ubifs_zn_dirty - check if znode is dirty. + * @znode: znode to check + * + * This helper function returns %1 if @znode is dirty and %0 otherwise. + */ +static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) +{ + return !!test_bit(DIRTY_ZNODE, &znode->flags); +} + +/** + * ubifs_zn_obsolete - check if znode is obsolete. + * @znode: znode to check + * + * This helper function returns %1 if @znode is obsolete and %0 otherwise. + */ +static inline int ubifs_zn_obsolete(const struct ubifs_znode *znode) +{ + return !!test_bit(OBSOLETE_ZNODE, &znode->flags); +} + +/** + * ubifs_zn_cow - check if znode has to be copied on write. + * @znode: znode to check + * + * This helper function returns %1 if @znode is has COW flag set and %0 + * otherwise. + */ +static inline int ubifs_zn_cow(const struct ubifs_znode *znode) +{ + return !!test_bit(COW_ZNODE, &znode->flags); +} + +/** + * ubifs_wake_up_bgt - wake up background thread. + * @c: UBIFS file-system description object + */ +static inline void ubifs_wake_up_bgt(struct ubifs_info *c) +{ + if (c->bgt && !c->need_bgt) { + c->need_bgt = 1; + wake_up_process(c->bgt); + } +} + +/** + * ubifs_tnc_find_child - find next child in znode. + * @znode: znode to search at + * @start: the zbranch index to start at + * + * This helper function looks for znode child starting at index @start. Returns + * the child or %NULL if no children were found. + */ +static inline struct ubifs_znode * +ubifs_tnc_find_child(struct ubifs_znode *znode, int start) +{ + while (start < znode->child_cnt) { + if (znode->zbranch[start].znode) + return znode->zbranch[start].znode; + start += 1; + } + + return NULL; +} + +/** + * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object. + * @inode: the VFS 'struct inode' pointer + */ +static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) +{ + return container_of(inode, struct ubifs_inode, vfs_inode); +} + +/** + * ubifs_compr_present - check if compressor was compiled in. + * @compr_type: compressor type to check + * + * This function returns %1 of compressor of type @compr_type is present, and + * %0 if not. + */ +static inline int ubifs_compr_present(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return !!ubifs_compressors[compr_type]->capi_name; +} + +/** + * ubifs_compr_name - get compressor name string by its type. + * @compr_type: compressor type + * + * This function returns compressor type string. + */ +static inline const char *ubifs_compr_name(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return ubifs_compressors[compr_type]->name; +} + +/** + * ubifs_wbuf_sync - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume + * that the write-buffer is already locked. + */ +static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) +{ + int err; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_encode_dev - encode device node IDs. + * @dev: UBIFS device node information + * @rdev: device IDs to encode + * + * This is a helper function which encodes major/minor numbers of a device node + * into UBIFS device node description. We use standard Linux "new" and "huge" + * encodings. + */ +static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev) +{ + if (new_valid_dev(rdev)) { + dev->new = cpu_to_le32(new_encode_dev(rdev)); + return sizeof(dev->new); + } else { + dev->huge = cpu_to_le64(huge_encode_dev(rdev)); + return sizeof(dev->huge); + } +} + +/** + * ubifs_add_dirt - add dirty space to LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to add dirty space for + * @dirty: dirty space to add + * + * This is a helper function which increased amount of dirty LEB space. Returns + * zero in case of success and a negative error code in case of failure. + */ +static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0); +} + +/** + * ubifs_return_leb - return LEB to lprops. + * @c: the UBIFS file-system description object + * @lnum: LEB to return + * + * This helper function cleans the "taken" flag of a logical eraseblock in the + * lprops. Returns zero in case of success and a negative error code in case of + * failure. + */ +static inline int ubifs_return_leb(struct ubifs_info *c, int lnum) +{ + return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); +} + +/** + * ubifs_idx_node_sz - return index node size. + * @c: the UBIFS file-system description object + * @child_cnt: number of children of this index node + */ +static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt) +{ + return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt; +} + +/** + * ubifs_idx_branch - return pointer to an index branch. + * @c: the UBIFS file-system description object + * @idx: index node + * @bnum: branch number + */ +static inline +struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c, + const struct ubifs_idx_node *idx, + int bnum) +{ + return (struct ubifs_branch *)((void *)idx->branches + + (UBIFS_BRANCH_SZ + c->key_len) * bnum); +} + +/** + * ubifs_idx_key - return pointer to an index key. + * @c: the UBIFS file-system description object + * @idx: index node + */ +static inline void *ubifs_idx_key(const struct ubifs_info *c, + const struct ubifs_idx_node *idx) +{ + return (void *)((struct ubifs_branch *)idx->branches)->key; +} + +/** + * ubifs_current_time - round current time to time granularity. + * @inode: inode + */ +static inline struct timespec ubifs_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops. Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +static inline void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. + */ +static inline void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + mutex_unlock(&c->lp_mutex); +} + +/** + * ubifs_next_log_lnum - switch to the next log LEB. + * @c: UBIFS file-system description object + * @lnum: current log LEB + * + * This helper function returns the log LEB number which goes next after LEB + * 'lnum'. + */ +static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum) +{ + lnum += 1; + if (lnum > c->log_last) + lnum = UBIFS_LOG_LNUM; + + return lnum; +} + +#endif /* __UBIFS_MISC_H__ */ diff --git a/kernel/fs/ubifs/orphan.c b/kernel/fs/ubifs/orphan.c new file mode 100644 index 000000000..caf2d123e --- /dev/null +++ b/kernel/fs/ubifs/orphan.c @@ -0,0 +1,956 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Author: Adrian Hunter + */ + +#include "ubifs.h" + +/* + * An orphan is an inode number whose inode node has been committed to the index + * with a link count of zero. That happens when an open file is deleted + * (unlinked) and then a commit is run. In the normal course of events the inode + * would be deleted when the file is closed. However in the case of an unclean + * unmount, orphans need to be accounted for. After an unclean unmount, the + * orphans' inodes must be deleted which means either scanning the entire index + * looking for them, or keeping a list on flash somewhere. This unit implements + * the latter approach. + * + * The orphan area is a fixed number of LEBs situated between the LPT area and + * the main area. The number of orphan area LEBs is specified when the file + * system is created. The minimum number is 1. The size of the orphan area + * should be so that it can hold the maximum number of orphans that are expected + * to ever exist at one time. + * + * The number of orphans that can fit in a LEB is: + * + * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) + * + * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough. + * + * Orphans are accumulated in a rb-tree. When an inode's link count drops to + * zero, the inode number is added to the rb-tree. It is removed from the tree + * when the inode is deleted. Any new orphans that are in the orphan tree when + * the commit is run, are written to the orphan area in 1 or more orphan nodes. + * If the orphan area is full, it is consolidated to make space. There is + * always enough space because validation prevents the user from creating more + * than the maximum number of orphans allowed. + */ + +static int dbg_check_orphans(struct ubifs_info *c); + +/** + * ubifs_add_orphan - add an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. This function is called when an inodes link count drops to + * zero. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + orphan->new = 1; + + spin_lock(&c->orphan_lock); + if (c->tot_orphans >= c->max_orphans) { + spin_unlock(&c->orphan_lock); + kfree(orphan); + return -ENFILE; + } + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + ubifs_err(c, "orphaned twice"); + spin_unlock(&c->orphan_lock); + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + c->new_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + list_add_tail(&orphan->new_list, &c->orph_new); + spin_unlock(&c->orphan_lock); + dbg_gen("ino %lu", (unsigned long)inum); + return 0; +} + +/** + * ubifs_delete_orphan - delete an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Delete an orphan. This function is called when an inode is deleted. + */ +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + if (o->del) { + spin_unlock(&c->orphan_lock); + dbg_gen("deleted twice ino %lu", + (unsigned long)inum); + return; + } + if (o->cmt) { + o->del = 1; + o->dnext = c->orph_dnext; + c->orph_dnext = o; + spin_unlock(&c->orphan_lock); + dbg_gen("delete later ino %lu", + (unsigned long)inum); + return; + } + rb_erase(p, &c->orph_tree); + list_del(&o->list); + c->tot_orphans -= 1; + if (o->new) { + list_del(&o->new_list); + c->new_orphans -= 1; + } + spin_unlock(&c->orphan_lock); + kfree(o); + dbg_gen("inum %lu", (unsigned long)inum); + return; + } + } + spin_unlock(&c->orphan_lock); + ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum); + dump_stack(); +} + +/** + * ubifs_orphan_start_commit - start commit of orphans. + * @c: UBIFS file-system description object + * + * Start commit of orphans. + */ +int ubifs_orphan_start_commit(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, **last; + + spin_lock(&c->orphan_lock); + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_new, new_list) { + ubifs_assert(orphan->new); + ubifs_assert(!orphan->cmt); + orphan->new = 0; + orphan->cmt = 1; + *last = orphan; + last = &orphan->cnext; + } + *last = NULL; + c->cmt_orphans = c->new_orphans; + c->new_orphans = 0; + dbg_cmt("%d orphans to commit", c->cmt_orphans); + INIT_LIST_HEAD(&c->orph_new); + if (c->tot_orphans == 0) + c->no_orphs = 1; + else + c->no_orphs = 0; + spin_unlock(&c->orphan_lock); + return 0; +} + +/** + * avail_orphs - calculate available space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in the + * available space. + */ +static int avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail, gap; + + avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + gap = c->leb_size - c->ohead_offs; + if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64)) + avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + return avail; +} + +/** + * tot_avail_orphs - calculate total space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in half + * the total space. That leaves half the space for adding new orphans. + */ +static int tot_avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail; + + avail_lebs = c->orph_lebs; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + return avail / 2; +} + +/** + * do_write_orph_node - write a node to the orphan head. + * @c: UBIFS file-system description object + * @len: length of node + * @atomic: write atomically + * + * This function writes a node to the orphan head from the orphan buffer. If + * %atomic is not zero, then the write is done atomically. On success, %0 is + * returned, otherwise a negative error code is returned. + */ +static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) +{ + int err = 0; + + if (atomic) { + ubifs_assert(c->ohead_offs == 0); + ubifs_prepare_node(c, c->orph_buf, len, 1); + len = ALIGN(len, c->min_io_size); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len); + } else { + if (c->ohead_offs == 0) { + /* Ensure LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->ohead_lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, + c->ohead_offs); + } + return err; +} + +/** + * write_orph_node - write an orphan node. + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function builds an orphan node from the cnext list and writes it to the + * orphan head. On success, %0 is returned, otherwise a negative error code + * is returned. + */ +static int write_orph_node(struct ubifs_info *c, int atomic) +{ + struct ubifs_orphan *orphan, *cnext; + struct ubifs_orph_node *orph; + int gap, err, len, cnt, i; + + ubifs_assert(c->cmt_orphans > 0); + gap = c->leb_size - c->ohead_offs; + if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) { + c->ohead_lnum += 1; + c->ohead_offs = 0; + gap = c->leb_size; + if (c->ohead_lnum > c->orph_last) { + /* + * We limit the number of orphans so that this should + * never happen. + */ + ubifs_err(c, "out of space in orphan area"); + return -EINVAL; + } + } + cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + if (cnt > c->cmt_orphans) + cnt = c->cmt_orphans; + len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64); + ubifs_assert(c->orph_buf); + orph = c->orph_buf; + orph->ch.node_type = UBIFS_ORPH_NODE; + spin_lock(&c->orphan_lock); + cnext = c->orph_cnext; + for (i = 0; i < cnt; i++) { + orphan = cnext; + ubifs_assert(orphan->cmt); + orph->inos[i] = cpu_to_le64(orphan->inum); + orphan->cmt = 0; + cnext = orphan->cnext; + orphan->cnext = NULL; + } + c->orph_cnext = cnext; + c->cmt_orphans -= cnt; + spin_unlock(&c->orphan_lock); + if (c->cmt_orphans) + orph->cmt_no = cpu_to_le64(c->cmt_no); + else + /* Mark the last node of the commit */ + orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); + ubifs_assert(c->ohead_offs + len <= c->leb_size); + ubifs_assert(c->ohead_lnum >= c->orph_first); + ubifs_assert(c->ohead_lnum <= c->orph_last); + err = do_write_orph_node(c, len, atomic); + c->ohead_offs += ALIGN(len, c->min_io_size); + c->ohead_offs = ALIGN(c->ohead_offs, 8); + return err; +} + +/** + * write_orph_nodes - write orphan nodes until there are no more to commit. + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function writes orphan nodes for all the orphans to commit. On success, + * %0 is returned, otherwise a negative error code is returned. + */ +static int write_orph_nodes(struct ubifs_info *c, int atomic) +{ + int err; + + while (c->cmt_orphans > 0) { + err = write_orph_node(c, atomic); + if (err) + return err; + } + if (atomic) { + int lnum; + + /* Unmap any unused LEBs after consolidation */ + for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + } + return 0; +} + +/** + * consolidate - consolidate the orphan area. + * @c: UBIFS file-system description object + * + * This function enables consolidation by putting all the orphans into the list + * to commit. The list is in the order that the orphans were added, and the + * LEBs are written atomically in order, so at no time can orphans be lost by + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int consolidate(struct ubifs_info *c) +{ + int tot_avail = tot_avail_orphs(c), err = 0; + + spin_lock(&c->orphan_lock); + dbg_cmt("there is space for %d orphans and there are %d", + tot_avail, c->tot_orphans); + if (c->tot_orphans - c->new_orphans <= tot_avail) { + struct ubifs_orphan *orphan, **last; + int cnt = 0; + + /* Change the cnext list to include all non-new orphans */ + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_list, list) { + if (orphan->new) + continue; + orphan->cmt = 1; + *last = orphan; + last = &orphan->cnext; + cnt += 1; + } + *last = NULL; + ubifs_assert(cnt == c->tot_orphans - c->new_orphans); + c->cmt_orphans = cnt; + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + } else { + /* + * We limit the number of orphans so that this should + * never happen. + */ + ubifs_err(c, "out of space in orphan area"); + err = -EINVAL; + } + spin_unlock(&c->orphan_lock); + return err; +} + +/** + * commit_orphans - commit orphans. + * @c: UBIFS file-system description object + * + * This function commits orphans to flash. On success, %0 is returned, + * otherwise a negative error code is returned. + */ +static int commit_orphans(struct ubifs_info *c) +{ + int avail, atomic = 0, err; + + ubifs_assert(c->cmt_orphans > 0); + avail = avail_orphs(c); + if (avail < c->cmt_orphans) { + /* Not enough space to write new orphans, so consolidate */ + err = consolidate(c); + if (err) + return err; + atomic = 1; + } + err = write_orph_nodes(c, atomic); + return err; +} + +/** + * erase_deleted - erase the orphans marked for deletion. + * @c: UBIFS file-system description object + * + * During commit, the orphans being committed cannot be deleted, so they are + * marked for deletion and deleted by this function. Also, the recovery + * adds killed orphans to the deletion list, and therefore they are deleted + * here too. + */ +static void erase_deleted(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, *dnext; + + spin_lock(&c->orphan_lock); + dnext = c->orph_dnext; + while (dnext) { + orphan = dnext; + dnext = orphan->dnext; + ubifs_assert(!orphan->new); + ubifs_assert(orphan->del); + rb_erase(&orphan->rb, &c->orph_tree); + list_del(&orphan->list); + c->tot_orphans -= 1; + dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum); + kfree(orphan); + } + c->orph_dnext = NULL; + spin_unlock(&c->orphan_lock); +} + +/** + * ubifs_orphan_end_commit - end commit of orphans. + * @c: UBIFS file-system description object + * + * End commit of orphans. + */ +int ubifs_orphan_end_commit(struct ubifs_info *c) +{ + int err; + + if (c->cmt_orphans != 0) { + err = commit_orphans(c); + if (err) + return err; + } + erase_deleted(c); + err = dbg_check_orphans(c); + return err; +} + +/** + * ubifs_clear_orphans - erase all LEBs used for orphans. + * @c: UBIFS file-system description object + * + * If recovery is not required, then the orphans from the previous session + * are not needed. This function locates the LEBs used to record + * orphans, and un-maps them. + */ +int ubifs_clear_orphans(struct ubifs_info *c) +{ + int lnum, err; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + return 0; +} + +/** + * insert_dead_orphan - insert an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * This function is a helper to the 'do_kill_orphans()' function. The orphan + * must be kept until the next commit, so it is added to the rb-tree and the + * deletion list. + */ +static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + /* Already added - no problem */ + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + orphan->del = 1; + orphan->dnext = c->orph_dnext; + c->orph_dnext = orphan; + dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, + c->new_orphans, c->tot_orphans); + return 0; +} + +/** + * do_kill_orphans - remove orphan inodes from the index. + * @c: UBIFS file-system description object + * @sleb: scanned LEB + * @last_cmt_no: cmt_no of last orphan node read is passed and returned here + * @outofdate: whether the LEB is out of date is returned here + * @last_flagged: whether the end orphan node is encountered + * + * This function is a helper to the 'kill_orphans()' function. It goes through + * every orphan node in a LEB and for every inode number recorded, removes + * all keys for that inode from the TNC. + */ +static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + unsigned long long *last_cmt_no, int *outofdate, + int *last_flagged) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + unsigned long long cmt_no; + ino_t inum; + int i, n, err, first = 1; + + list_for_each_entry(snod, &sleb->nodes, list) { + if (snod->type != UBIFS_ORPH_NODE) { + ubifs_err(c, "invalid node type %d in orphan area at %d:%d", + snod->type, sleb->lnum, snod->offs); + ubifs_dump_node(c, snod->node); + return -EINVAL; + } + + orph = snod->node; + + /* Check commit number */ + cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX; + /* + * The commit number on the master node may be less, because + * of a failed commit. If there are several failed commits in a + * row, the commit number written on orphan nodes will continue + * to increase (because the commit number is adjusted here) even + * though the commit number on the master node stays the same + * because the master node has not been re-written. + */ + if (cmt_no > c->cmt_no) + c->cmt_no = cmt_no; + if (cmt_no < *last_cmt_no && *last_flagged) { + /* + * The last orphan node had a higher commit number and + * was flagged as the last written for that commit + * number. That makes this orphan node, out of date. + */ + if (!first) { + ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d", + cmt_no, sleb->lnum, snod->offs); + ubifs_dump_node(c, snod->node); + return -EINVAL; + } + dbg_rcvry("out of date LEB %d", sleb->lnum); + *outofdate = 1; + return 0; + } + + if (first) + first = 0; + + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + dbg_rcvry("deleting orphaned inode %lu", + (unsigned long)inum); + err = ubifs_tnc_remove_ino(c, inum); + if (err) + return err; + err = insert_dead_orphan(c, inum); + if (err) + return err; + } + + *last_cmt_no = cmt_no; + if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) { + dbg_rcvry("last orph node for commit %llu at %d:%d", + cmt_no, sleb->lnum, snod->offs); + *last_flagged = 1; + } else + *last_flagged = 0; + } + + return 0; +} + +/** + * kill_orphans - remove all orphan inodes from the index. + * @c: UBIFS file-system description object + * + * If recovery is required, then orphan inodes recorded during the previous + * session (which ended with an unclean unmount) must be deleted from the index. + * This is done by updating the TNC, but since the index is not updated until + * the next commit, the LEBs where the orphan information is recorded are not + * erased until the next commit. + */ +static int kill_orphans(struct ubifs_info *c) +{ + unsigned long long last_cmt_no = 0; + int lnum, err = 0, outofdate = 0, last_flagged = 0; + + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) { + dbg_rcvry("no orphans"); + return 0; + } + /* + * Orph nodes always start at c->orph_first and are written to each + * successive LEB in turn. Generally unused LEBs will have been unmapped + * but may contain out of date orphan nodes if the unmap didn't go + * through. In addition, the last orphan node written for each commit is + * marked (top bit of orph->cmt_no is set to 1). It is possible that + * there are orphan nodes from the next commit (i.e. the commit did not + * complete successfully). In that case, no orphans will have been lost + * due to the way that orphans are written, and any orphans added will + * be valid orphans anyway and so can be deleted. + */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + dbg_rcvry("LEB %d", lnum); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) { + if (PTR_ERR(sleb) == -EUCLEAN) + sleb = ubifs_recover_leb(c, lnum, 0, + c->sbuf, -1); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + } + err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate, + &last_flagged); + if (err || outofdate) { + ubifs_scan_destroy(sleb); + break; + } + if (sleb->endpt) { + c->ohead_lnum = lnum; + c->ohead_offs = sleb->endpt; + } + ubifs_scan_destroy(sleb); + } + return err; +} + +/** + * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them. + * @c: UBIFS file-system description object + * @unclean: indicates recovery from unclean unmount + * @read_only: indicates read only mount + * + * This function is called when mounting to erase orphans from the previous + * session. If UBIFS was not unmounted cleanly, then the inodes recorded as + * orphans are deleted. + */ +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) +{ + int err = 0; + + c->max_orphans = tot_avail_orphs(c); + + if (!read_only) { + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) + return -ENOMEM; + } + + if (unclean) + err = kill_orphans(c); + else if (!read_only) + err = ubifs_clear_orphans(c); + + return err; +} + +/* + * Everything below is related to debugging. + */ + +struct check_orphan { + struct rb_node rb; + ino_t inum; +}; + +struct check_info { + unsigned long last_ino; + unsigned long tot_inos; + unsigned long missing; + unsigned long long leaf_cnt; + struct ubifs_ino_node *node; + struct rb_root root; +}; + +static int dbg_find_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + spin_unlock(&c->orphan_lock); + return 1; + } + } + spin_unlock(&c->orphan_lock); + return 0; +} + +static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &root->rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct check_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + kfree(orphan); + return 0; + } + } + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, root); + return 0; +} + +static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *o; + struct rb_node *p; + + p = root->rb_node; + while (p) { + o = rb_entry(p, struct check_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else + return 1; + } + return 0; +} + +static void dbg_free_check_tree(struct rb_root *root) +{ + struct check_orphan *o, *n; + + rbtree_postorder_for_each_entry_safe(o, n, root, rb) + kfree(o); +} + +static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + struct check_info *ci = priv; + ino_t inum; + int err; + + inum = key_inum(c, &zbr->key); + if (inum != ci->last_ino) { + /* Lowest node type is the inode node, so it comes first */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + ubifs_err(c, "found orphan node ino %lu, type %d", + (unsigned long)inum, key_type(c, &zbr->key)); + ci->last_ino = inum; + ci->tot_inos += 1; + err = ubifs_tnc_read_node(c, zbr, ci->node); + if (err) { + ubifs_err(c, "node read failed, error %d", err); + return err; + } + if (ci->node->nlink == 0) + /* Must be recorded as an orphan */ + if (!dbg_find_check_orphan(&ci->root, inum) && + !dbg_find_orphan(c, inum)) { + ubifs_err(c, "missing orphan, ino %lu", + (unsigned long)inum); + ci->missing += 1; + } + } + ci->leaf_cnt += 1; + return 0; +} + +static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + ino_t inum; + int i, n, err; + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + if (snod->type != UBIFS_ORPH_NODE) + continue; + orph = snod->node; + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + err = dbg_ins_check_orphan(&ci->root, inum); + if (err) + return err; + } + } + return 0; +} + +static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) +{ + int lnum, err = 0; + void *buf; + + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) + return 0; + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err(c, "cannot allocate memory to check orphans"); + return 0; + } + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + + err = dbg_read_orphans(ci, sleb); + ubifs_scan_destroy(sleb); + if (err) + break; + } + + vfree(buf); + return err; +} + +static int dbg_check_orphans(struct ubifs_info *c) +{ + struct check_info ci; + int err; + + if (!dbg_is_chk_orph(c)) + return 0; + + ci.last_ino = 0; + ci.tot_inos = 0; + ci.missing = 0; + ci.leaf_cnt = 0; + ci.root = RB_ROOT; + ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ci.node) { + ubifs_err(c, "out of memory"); + return -ENOMEM; + } + + err = dbg_scan_orphans(c, &ci); + if (err) + goto out; + + err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci); + if (err) { + ubifs_err(c, "cannot scan TNC, error %d", err); + goto out; + } + + if (ci.missing) { + ubifs_err(c, "%lu missing orphan(s)", ci.missing); + err = -EINVAL; + goto out; + } + + dbg_cmt("last inode number is %lu", ci.last_ino); + dbg_cmt("total number of inodes is %lu", ci.tot_inos); + dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt); + +out: + dbg_free_check_tree(&ci.root); + kfree(ci.node); + return err; +} diff --git a/kernel/fs/ubifs/recovery.c b/kernel/fs/ubifs/recovery.c new file mode 100644 index 000000000..695fc71d5 --- /dev/null +++ b/kernel/fs/ubifs/recovery.c @@ -0,0 +1,1547 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions needed to recover from unclean un-mounts. + * When UBIFS is mounted, it checks a flag on the master node to determine if + * an un-mount was completed successfully. If not, the process of mounting + * incorporates additional checking and fixing of on-flash data structures. + * UBIFS always cleans away all remnants of an unclean un-mount, so that + * errors do not accumulate. However UBIFS defers recovery if it is mounted + * read-only, and the flash is not modified in that case. + * + * The general UBIFS approach to the recovery is that it recovers from + * corruptions which could be caused by power cuts, but it refuses to recover + * from corruption caused by other reasons. And UBIFS tries to distinguish + * between these 2 reasons of corruptions and silently recover in the former + * case and loudly complain in the latter case. + * + * UBIFS writes only to erased LEBs, so it writes only to the flash space + * containing only 0xFFs. UBIFS also always writes strictly from the beginning + * of the LEB to the end. And UBIFS assumes that the underlying flash media + * writes in @c->max_write_size bytes at a time. + * + * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. + * I/O unit corresponding to offset X to contain corrupted data, all the + * following min. I/O units have to contain empty space (all 0xFFs). If this is + * not true, the corruption cannot be the result of a power cut, and UBIFS + * refuses to mount. + */ + +#include +#include +#include "ubifs.h" + +/** + * is_empty - determine whether a buffer is empty (contains all 0xff). + * @buf: buffer to clean + * @len: length of buffer + * + * This function returns %1 if the buffer is empty (contains all 0xff) otherwise + * %0 is returned. + */ +static int is_empty(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return 0; + return 1; +} + +/** + * first_non_ff - find offset of the first non-0xff byte. + * @buf: buffer to search in + * @len: length of buffer + * + * This function returns offset of the first non-0xff byte in @buf or %-1 if + * the buffer contains only 0xff bytes. + */ +static int first_non_ff(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return i; + return -1; +} + +/** + * get_master_node - get the last valid master node allowing for corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @pbuf: buffer containing the LEB read, is returned here + * @mst: master node, if found, is returned here + * @cor: corruption, if found, is returned here + * + * This function allocates a buffer, reads the LEB into it, and finds and + * returns the last valid master node allowing for one area of corruption. + * The corrupt area, if there is one, must be consistent with the assumption + * that it is the result of an unclean unmount while the master node was being + * written. Under those circumstances, it is valid to use the previously written + * master node. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, + struct ubifs_mst_node **mst, void **cor) +{ + const int sz = c->mst_node_alsz; + int err, offs, len; + void *sbuf, *buf; + + sbuf = vmalloc(c->leb_size); + if (!sbuf) + return -ENOMEM; + + err = ubifs_leb_read(c, lnum, sbuf, 0, c->leb_size, 0); + if (err && err != -EBADMSG) + goto out_free; + + /* Find the first position that is definitely not a node */ + offs = 0; + buf = sbuf; + len = c->leb_size; + while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) { + struct ubifs_ch *ch = buf; + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + break; + offs += sz; + buf += sz; + len -= sz; + } + /* See if there was a valid master node before that */ + if (offs) { + int ret; + + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE && offs) { + /* Could have been corruption so check one place back */ + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE) + /* + * We accept only one area of corruption because + * we are assuming that it was caused while + * trying to write a master node. + */ + goto out_err; + } + if (ret == SCANNED_A_NODE) { + struct ubifs_ch *ch = buf; + + if (ch->node_type != UBIFS_MST_NODE) + goto out_err; + dbg_rcvry("found a master node at %d:%d", lnum, offs); + *mst = buf; + offs += sz; + buf += sz; + len -= sz; + } + } + /* Check for corruption */ + if (offs < c->leb_size) { + if (!is_empty(buf, min_t(int, len, sz))) { + *cor = buf; + dbg_rcvry("found corruption at %d:%d", lnum, offs); + } + offs += sz; + buf += sz; + len -= sz; + } + /* Check remaining empty space */ + if (offs < c->leb_size) + if (!is_empty(buf, len)) + goto out_err; + *pbuf = sbuf; + return 0; + +out_err: + err = -EINVAL; +out_free: + vfree(sbuf); + *mst = NULL; + *cor = NULL; + return err; +} + +/** + * write_rcvrd_mst_node - write recovered master node. + * @c: UBIFS file-system description object + * @mst: master node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_rcvrd_mst_node(struct ubifs_info *c, + struct ubifs_mst_node *mst) +{ + int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz; + __le32 save_flags; + + dbg_rcvry("recovery"); + + save_flags = mst->flags; + mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); + + ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); + err = ubifs_leb_change(c, lnum, mst, sz); + if (err) + goto out; + err = ubifs_leb_change(c, lnum + 1, mst, sz); + if (err) + goto out; +out: + mst->flags = save_flags; + return err; +} + +/** + * ubifs_recover_master_node - recover the master node. + * @c: UBIFS file-system description object + * + * This function recovers the master node from corruption that may occur due to + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_master_node(struct ubifs_info *c) +{ + void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL; + struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst; + const int sz = c->mst_node_alsz; + int err, offs1, offs2; + + dbg_rcvry("recovery"); + + err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1); + if (err) + goto out_free; + + err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2); + if (err) + goto out_free; + + if (mst1) { + offs1 = (void *)mst1 - buf1; + if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) && + (offs1 == 0 && !cor1)) { + /* + * mst1 was written by recovery at offset 0 with no + * corruption. + */ + dbg_rcvry("recovery recovery"); + mst = mst1; + } else if (mst2) { + offs2 = (void *)mst2 - buf2; + if (offs1 == offs2) { + /* Same offset, so must be the same */ + if (memcmp((void *)mst1 + UBIFS_CH_SZ, + (void *)mst2 + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out_err; + mst = mst1; + } else if (offs2 + sz == offs1) { + /* 1st LEB was written, 2nd was not */ + if (cor1) + goto out_err; + mst = mst1; + } else if (offs1 == 0 && + c->leb_size - offs2 - sz < sz) { + /* 1st LEB was unmapped and written, 2nd not */ + if (cor1) + goto out_err; + mst = mst1; + } else + goto out_err; + } else { + /* + * 2nd LEB was unmapped and about to be written, so + * there must be only one master node in the first LEB + * and no corruption. + */ + if (offs1 != 0 || cor1) + goto out_err; + mst = mst1; + } + } else { + if (!mst2) + goto out_err; + /* + * 1st LEB was unmapped and about to be written, so there must + * be no room left in 2nd LEB. + */ + offs2 = (void *)mst2 - buf2; + if (offs2 + sz + sz <= c->leb_size) + goto out_err; + mst = mst2; + } + + ubifs_msg(c, "recovered master node from LEB %d", + (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); + + memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); + + if (c->ro_mount) { + /* Read-only mode. Keep a copy for switching to rw mode */ + c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); + if (!c->rcvrd_mst_node) { + err = -ENOMEM; + goto out_free; + } + memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + + /* + * We had to recover the master node, which means there was an + * unclean reboot. However, it is possible that the master node + * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set. + * E.g., consider the following chain of events: + * + * 1. UBIFS was cleanly unmounted, so the master node is clean + * 2. UBIFS is being mounted R/W and starts changing the master + * node in the first (%UBIFS_MST_LNUM). A power cut happens, + * so this LEB ends up with some amount of garbage at the + * end. + * 3. UBIFS is being mounted R/O. We reach this place and + * recover the master node from the second LEB + * (%UBIFS_MST_LNUM + 1). But we cannot update the media + * because we are being mounted R/O. We have to defer the + * operation. + * 4. However, this master node (@c->mst_node) is marked as + * clean (since the step 1). And if we just return, the + * mount code will be confused and won't recover the master + * node when it is re-mounter R/W later. + * + * Thus, to force the recovery by marking the master node as + * dirty. + */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + } else { + /* Write the recovered master node */ + c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; + err = write_rcvrd_mst_node(c, c->mst_node); + if (err) + goto out_free; + } + + vfree(buf2); + vfree(buf1); + + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err(c, "failed to recover master node"); + if (mst1) { + ubifs_err(c, "dumping first master node"); + ubifs_dump_node(c, mst1); + } + if (mst2) { + ubifs_err(c, "dumping second master node"); + ubifs_dump_node(c, mst2); + } + vfree(buf2); + vfree(buf1); + return err; +} + +/** + * ubifs_write_rcvrd_mst_node - write the recovered master node. + * @c: UBIFS file-system description object + * + * This function writes the master node that was recovered during mounting in + * read-only mode and must now be written because we are remounting rw. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) +{ + int err; + + if (!c->rcvrd_mst_node) + return 0; + c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = write_rcvrd_mst_node(c, c->rcvrd_mst_node); + if (err) + return err; + kfree(c->rcvrd_mst_node); + c->rcvrd_mst_node = NULL; + return 0; +} + +/** + * is_last_write - determine if an offset was in the last write to a LEB. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @offs: offset to check + * + * This function returns %1 if @offs was in the last write to the LEB whose data + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next @c->max_write_size + * boundary. + */ +static int is_last_write(const struct ubifs_info *c, void *buf, int offs) +{ + int empty_offs, check_len; + uint8_t *p; + + /* + * Round up to the next @c->max_write_size boundary i.e. @offs is in + * the last wbuf written. After that should be empty space. + */ + empty_offs = ALIGN(offs + 1, c->max_write_size); + check_len = c->leb_size - empty_offs; + p = buf + empty_offs - offs; + return is_empty(p, check_len); +} + +/** + * clean_buf - clean the data from an LEB sitting in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to clean + * @lnum: LEB number to clean + * @offs: offset from which to clean + * @len: length of buffer + * + * This function pads up to the next min_io_size boundary (if there is one) and + * sets empty space to all 0xff. @buf, @offs and @len are updated to the next + * @c->min_io_size boundary. + */ +static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, + int *offs, int *len) +{ + int empty_offs, pad_len; + + lnum = lnum; + dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); + + ubifs_assert(!(*offs & 7)); + empty_offs = ALIGN(*offs, c->min_io_size); + pad_len = empty_offs - *offs; + ubifs_pad(c, *buf, pad_len); + *offs += pad_len; + *buf += pad_len; + *len -= pad_len; + memset(*buf, 0xff, c->leb_size - empty_offs); +} + +/** + * no_more_nodes - determine if there are no more nodes in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @len: length of buffer + * @lnum: LEB number of the LEB from which @buf was read + * @offs: offset from which @buf was read + * + * This function ensures that the corrupted node at @offs is the last thing + * written to a LEB. This function returns %1 if more data is not found and + * %0 if more data is found. + */ +static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, + int lnum, int offs) +{ + struct ubifs_ch *ch = buf; + int skip, dlen = le32_to_cpu(ch->len); + + /* Check for empty space after the corrupt node's common header */ + skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs; + if (is_empty(buf + skip, len - skip)) + return 1; + /* + * The area after the common header size is not empty, so the common + * header must be intact. Check it. + */ + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); + return 0; + } + /* Now we know the corrupt node's length we can skip over it */ + skip = ALIGN(offs + dlen, c->max_write_size) - offs; + /* After which there should be empty space */ + if (is_empty(buf + skip, len - skip)) + return 1; + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); + return 0; +} + +/** + * fix_unclean_leb - fix an unclean LEB. + * @c: UBIFS file-system description object + * @sleb: scanned LEB information + * @start: offset where scan started + */ +static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int start) +{ + int lnum = sleb->lnum, endpt = start; + + /* Get the end offset of the last node we are keeping */ + if (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + + snod = list_entry(sleb->nodes.prev, + struct ubifs_scan_node, list); + endpt = snod->offs + snod->len; + } + + if (c->ro_mount && !c->remounting_rw) { + /* Add to recovery list */ + struct ubifs_unclean_leb *ucleb; + + dbg_rcvry("need to fix LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS); + if (!ucleb) + return -ENOMEM; + ucleb->lnum = lnum; + ucleb->endpt = endpt; + list_add_tail(&ucleb->list, &c->unclean_leb_list); + } else { + /* Write the fixed LEB back to flash */ + int err; + + dbg_rcvry("fixing LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + if (endpt == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } else { + int len = ALIGN(endpt, c->min_io_size); + + if (start) { + err = ubifs_leb_read(c, lnum, sleb->buf, 0, + start, 1); + if (err) + return err; + } + /* Pad to min_io_size */ + if (len > endpt) { + int pad_len = len - ALIGN(endpt, 8); + + if (pad_len > 0) { + void *buf = sleb->buf + len - pad_len; + + ubifs_pad(c, buf, pad_len); + } + } + err = ubifs_leb_change(c, lnum, sleb->buf, len); + if (err) + return err; + } + } + return 0; +} + +/** + * drop_last_group - drop the last group of nodes. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * group of nodes of the scanned LEB. + */ +static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs) +{ + while (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + struct ubifs_ch *ch; + + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + ch = snod->node; + if (ch->group_type != UBIFS_IN_NODE_GROUP) + break; + + dbg_rcvry("dropping grouped node at %d:%d", + sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + } +} + +/** + * drop_last_node - drop the last node. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * node of the scanned LEB. + */ +static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs) +{ + struct ubifs_scan_node *snod; + + if (!list_empty(&sleb->nodes)) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + + dbg_rcvry("dropping last node at %d:%d", + sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + } +} + +/** + * ubifs_recover_leb - scan and recover a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not + * belong to any journal head) + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * Returns the scanned information on success and a negative error code on + * failure. + */ +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int jhead) +{ + int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; + int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped; + struct ubifs_scan_leb *sleb; + void *buf = sbuf + offs; + + dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped); + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + ubifs_assert(len >= 8); + while (len >= 8) { + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + /* + * Scan quietly until there is an error from which we cannot + * recover + */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + } else if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + } else if (ret == SCANNED_EMPTY_SPACE || + ret == SCANNED_GARBAGE || + ret == SCANNED_A_BAD_PAD_NODE || + ret == SCANNED_A_CORRUPT_NODE) { + dbg_rcvry("found corruption (%d) at %d:%d", + ret, lnum, offs); + break; + } else { + ubifs_err(c, "unexpected return value %d", ret); + err = -EINVAL; + goto error; + } + } + + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) { + if (!is_last_write(c, buf, offs)) + goto corrupted_rescan; + } else if (ret == SCANNED_A_CORRUPT_NODE) { + if (!no_more_nodes(c, buf, len, lnum, offs)) + goto corrupted_rescan; + } else if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) { + int corruption = first_non_ff(buf, len); + + /* + * See header comment for this file for more + * explanations about the reasons we have this check. + */ + ubifs_err(c, "corrupt empty space LEB %d:%d, corruption starts at %d", + lnum, offs, corruption); + /* Make sure we dump interesting non-0xFF data */ + offs += corruption; + buf += corruption; + goto corrupted; + } + } + + min_io_unit = round_down(offs, c->min_io_size); + if (grouped) + /* + * If nodes are grouped, always drop the incomplete group at + * the end. + */ + drop_last_group(sleb, &offs); + + if (jhead == GCHD) { + /* + * If this LEB belongs to the GC head then while we are in the + * middle of the same min. I/O unit keep dropping nodes. So + * basically, what we want is to make sure that the last min. + * I/O unit where we saw the corruption is dropped completely + * with all the uncorrupted nodes which may possibly sit there. + * + * In other words, let's name the min. I/O unit where the + * corruption starts B, and the previous min. I/O unit A. The + * below code tries to deal with a situation when half of B + * contains valid nodes or the end of a valid node, and the + * second half of B contains corrupted data or garbage. This + * means that UBIFS had been writing to B just before the power + * cut happened. I do not know how realistic is this scenario + * that half of the min. I/O unit had been written successfully + * and the other half not, but this is possible in our 'failure + * mode emulation' infrastructure at least. + * + * So what is the problem, why we need to drop those nodes? Why + * can't we just clean-up the second half of B by putting a + * padding node there? We can, and this works fine with one + * exception which was reproduced with power cut emulation + * testing and happens extremely rarely. + * + * Imagine the file-system is full, we run GC which starts + * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is + * the current GC head LEB). The @c->gc_lnum is -1, which means + * that GC will retain LEB X and will try to continue. Imagine + * that LEB X is currently the dirtiest LEB, and the amount of + * used space in LEB Y is exactly the same as amount of free + * space in LEB X. + * + * And a power cut happens when nodes are moved from LEB X to + * LEB Y. We are here trying to recover LEB Y which is the GC + * head LEB. We find the min. I/O unit B as described above. + * Then we clean-up LEB Y by padding min. I/O unit. And later + * 'ubifs_rcvry_gc_commit()' function fails, because it cannot + * find a dirty LEB which could be GC'd into LEB Y! Even LEB X + * does not match because the amount of valid nodes there does + * not fit the free space in LEB Y any more! And this is + * because of the padding node which we added to LEB Y. The + * user-visible effect of this which I once observed and + * analysed is that we cannot mount the file-system with + * -ENOSPC error. + * + * So obviously, to make sure that situation does not happen we + * should free min. I/O unit B in LEB Y completely and the last + * used min. I/O unit in LEB Y should be A. This is basically + * what the below code tries to do. + */ + while (offs > min_io_unit) + drop_last_node(sleb, &offs); + } + + buf = sbuf + offs; + len = c->leb_size - offs; + + clean_buf(c, &buf, lnum, &offs, &len); + ubifs_end_scan(c, sleb, lnum, offs); + + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; + + return sleb; + +corrupted_rescan: + /* Re-scan the corrupted data with verbose messages */ + ubifs_err(c, "corruption %d", ret); + ubifs_scan_a_node(c, buf, len, lnum, offs, 1); +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err(c, "LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * get_cs_sqnum - get commit start sequence number. + * @c: UBIFS file-system description object + * @lnum: LEB number of commit start node + * @offs: offset of commit start node + * @cs_sqnum: commit start sequence number is returned here + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, + unsigned long long *cs_sqnum) +{ + struct ubifs_cs_node *cs_node = NULL; + int err, ret; + + dbg_rcvry("at %d:%d", lnum, offs); + cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL); + if (!cs_node) + return -ENOMEM; + if (c->leb_size - offs < UBIFS_CS_NODE_SZ) + goto out_err; + err = ubifs_leb_read(c, lnum, (void *)cs_node, offs, + UBIFS_CS_NODE_SZ, 0); + if (err && err != -EBADMSG) + goto out_free; + ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); + if (ret != SCANNED_A_NODE) { + ubifs_err(c, "Not a valid node"); + goto out_err; + } + if (cs_node->ch.node_type != UBIFS_CS_NODE) { + ubifs_err(c, "Node a CS node, type is %d", cs_node->ch.node_type); + goto out_err; + } + if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { + ubifs_err(c, "CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); + goto out_err; + } + *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); + dbg_rcvry("commit start sqnum %llu", *cs_sqnum); + kfree(cs_node); + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err(c, "failed to get CS sqnum"); + kfree(cs_node); + return err; +} + +/** + * ubifs_recover_log_leb - scan and recover a log LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by unclean reboots from which we are attempting to recover + * (assume that only the last log LEB can be corrupted by an unclean reboot). + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int next_lnum; + + dbg_rcvry("LEB %d", lnum); + next_lnum = lnum + 1; + if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs) + next_lnum = UBIFS_LOG_LNUM; + if (next_lnum != c->ltail_lnum) { + /* + * We can only recover at the end of the log, so check that the + * next log LEB is empty or out of date. + */ + sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0); + if (IS_ERR(sleb)) + return sleb; + if (sleb->nodes_cnt) { + struct ubifs_scan_node *snod; + unsigned long long cs_sqnum = c->cs_sqnum; + + snod = list_entry(sleb->nodes.next, + struct ubifs_scan_node, list); + if (cs_sqnum == 0) { + int err; + + err = get_cs_sqnum(c, lnum, offs, &cs_sqnum); + if (err) { + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + } + } + if (snod->sqnum > cs_sqnum) { + ubifs_err(c, "unrecoverable log corruption in LEB %d", + lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(-EUCLEAN); + } + } + ubifs_scan_destroy(sleb); + } + return ubifs_recover_leb(c, lnum, offs, sbuf, -1); +} + +/** + * recover_head - recover a head. + * @c: UBIFS file-system description object + * @lnum: LEB number of head to recover + * @offs: offset of head to recover + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at a head location. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf) +{ + int len = c->max_write_size, err; + + if (offs + len > c->leb_size) + len = c->leb_size - offs; + + if (!len) + return 0; + + /* Read at the head location and check it is empty flash */ + err = ubifs_leb_read(c, lnum, sbuf, offs, len, 1); + if (err || !is_empty(sbuf, len)) { + dbg_rcvry("cleaning head at %d:%d", lnum, offs); + if (offs == 0) + return ubifs_leb_unmap(c, lnum); + err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1); + if (err) + return err; + return ubifs_leb_change(c, lnum, sbuf, offs); + } + + return 0; +} + +/** + * ubifs_recover_inl_heads - recover index and LPT heads. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at the index and + * LPT head locations. + * + * This deals with the recovery of a half-completed journal commit. UBIFS is + * careful never to overwrite the last version of the index or the LPT. Because + * the index and LPT are wandering trees, data from a half-completed commit will + * not be referenced anywhere in UBIFS. The data will be either in LEBs that are + * assumed to be empty and will be unmapped anyway before use, or in the index + * and LPT heads. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) +{ + int err; + + ubifs_assert(!c->ro_mount || c->remounting_rw); + + dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); + err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); + if (err) + return err; + + dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs); + + return recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); +} + +/** + * clean_an_unclean_leb - read and write a LEB to remove corruption. + * @c: UBIFS file-system description object + * @ucleb: unclean LEB information + * @sbuf: LEB-sized buffer to use + * + * This function reads a LEB up to a point pre-determined by the mount recovery, + * checks the nodes, and writes the result back to the flash, thereby cleaning + * off any following corruption, or non-fatal ECC errors. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int clean_an_unclean_leb(struct ubifs_info *c, + struct ubifs_unclean_leb *ucleb, void *sbuf) +{ + int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; + void *buf = sbuf; + + dbg_rcvry("LEB %d len %d", lnum, len); + + if (len == 0) { + /* Nothing to read, just unmap it */ + return ubifs_leb_unmap(c, lnum); + } + + err = ubifs_leb_read(c, lnum, buf, offs, len, 0); + if (err && err != -EBADMSG) + return err; + + while (len >= 8) { + int ret; + + cond_resched(); + + /* Scan quietly until there is an error */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + ubifs_err(c, "unexpected empty space at %d:%d", + lnum, offs); + return -EUCLEAN; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + ubifs_scanned_corruption(c, lnum, offs, buf); + return -EUCLEAN; + } + + /* Pad to min_io_size */ + len = ALIGN(ucleb->endpt, c->min_io_size); + if (len > ucleb->endpt) { + int pad_len = len - ALIGN(ucleb->endpt, 8); + + if (pad_len > 0) { + buf = c->sbuf + len - pad_len; + ubifs_pad(c, buf, pad_len); + } + } + + /* Write back the LEB atomically */ + err = ubifs_leb_change(c, lnum, sbuf, len); + if (err) + return err; + + dbg_rcvry("cleaned LEB %d", lnum); + + return 0; +} + +/** + * ubifs_clean_lebs - clean LEBs recovered during read-only mount. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function cleans a LEB identified during recovery that needs to be + * written but was not because UBIFS was mounted read-only. This happens when + * remounting to read-write mode. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf) +{ + dbg_rcvry("recovery"); + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + int err; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + err = clean_an_unclean_leb(c, ucleb, sbuf); + if (err) + return err; + list_del(&ucleb->list); + kfree(ucleb); + } + return 0; +} + +/** + * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit. + * @c: UBIFS file-system description object + * + * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty + * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns + * zero in case of success and a negative error code in case of failure. + */ +static int grab_empty_leb(struct ubifs_info *c) +{ + int lnum, err; + + /* + * Note, it is very important to first search for an empty LEB and then + * run the commit, not vice-versa. The reason is that there might be + * only one empty LEB at the moment, the one which has been the + * @c->gc_lnum just before the power cut happened. During the regular + * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no + * one but GC can grab it. But at this moment this single empty LEB is + * not marked as taken, so if we run commit - what happens? Right, the + * commit will grab it and write the index there. Remember that the + * index always expands as long as there is free space, and it only + * starts consolidating when we run out of space. + * + * IOW, if we run commit now, we might not be able to find a free LEB + * after this. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + ubifs_err(c, "could not find an empty LEB"); + ubifs_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); + return lnum; + } + + /* Reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + + c->gc_lnum = lnum; + dbg_rcvry("found empty LEB %d, run commit", lnum); + + return ubifs_run_commit(c); +} + +/** + * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. + * @c: UBIFS file-system description object + * + * Out-of-place garbage collection requires always one empty LEB with which to + * start garbage collection. The LEB number is recorded in c->gc_lnum and is + * written to the master node on unmounting. In the case of an unclean unmount + * the value of gc_lnum recorded in the master node is out of date and cannot + * be used. Instead, recovery must allocate an empty LEB for this purpose. + * However, there may not be enough empty space, in which case it must be + * possible to GC the dirtiest LEB into the GC head LEB. + * + * This function also runs the commit which causes the TNC updates from + * size-recovery and orphans to be written to the flash. That is important to + * ensure correct replay order for subsequent mounts. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_rcvry_gc_commit(struct ubifs_info *c) +{ + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + struct ubifs_lprops lp; + int err; + + dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs); + + c->gc_lnum = -1; + if (wbuf->lnum == -1 || wbuf->offs == c->leb_size) + return grab_empty_leb(c); + + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); + if (err) { + if (err != -ENOSPC) + return err; + + dbg_rcvry("could not find a dirty LEB"); + return grab_empty_leb(c); + } + + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + ubifs_assert(lp.free + lp.dirty >= wbuf->offs); + + /* + * We run the commit before garbage collection otherwise subsequent + * mounts will see the GC and orphan deletion in a different order. + */ + dbg_rcvry("committing"); + err = ubifs_run_commit(c); + if (err) + return err; + + dbg_rcvry("GC'ing LEB %d", lp.lnum); + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_garbage_collect_leb(c, &lp); + if (err >= 0) { + int err2 = ubifs_wbuf_sync_nolock(wbuf); + + if (err2) + err = err2; + } + mutex_unlock(&wbuf->io_mutex); + if (err < 0) { + ubifs_err(c, "GC failed, error %d", err); + if (err == -EAGAIN) + err = -EINVAL; + return err; + } + + ubifs_assert(err == LEB_RETAINED); + if (err != LEB_RETAINED) + return -EINVAL; + + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + + dbg_rcvry("allocated LEB %d for GC", lp.lnum); + return 0; +} + +/** + * struct size_entry - inode size information for recovery. + * @rb: link in the RB-tree of sizes + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + * @inode: inode if pinned in memory awaiting rw mode to fix it + */ +struct size_entry { + struct rb_node rb; + ino_t inum; + loff_t i_size; + loff_t d_size; + int exists; + struct inode *inode; +}; + +/** + * add_ino - add an entry to the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + */ +static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size, + loff_t d_size, int exists) +{ + struct rb_node **p = &c->size_tree.rb_node, *parent = NULL; + struct size_entry *e; + + while (*p) { + parent = *p; + e = rb_entry(parent, struct size_entry, rb); + if (inum < e->inum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + e = kzalloc(sizeof(struct size_entry), GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->inum = inum; + e->i_size = i_size; + e->d_size = d_size; + e->exists = exists; + + rb_link_node(&e->rb, parent, p); + rb_insert_color(&e->rb, &c->size_tree); + + return 0; +} + +/** + * find_ino - find an entry on the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum) +{ + struct rb_node *p = c->size_tree.rb_node; + struct size_entry *e; + + while (p) { + e = rb_entry(p, struct size_entry, rb); + if (inum < e->inum) + p = p->rb_left; + else if (inum > e->inum) + p = p->rb_right; + else + return e; + } + return NULL; +} + +/** + * remove_ino - remove an entry from the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static void remove_ino(struct ubifs_info *c, ino_t inum) +{ + struct size_entry *e = find_ino(c, inum); + + if (!e) + return; + rb_erase(&e->rb, &c->size_tree); + kfree(e); +} + +/** + * ubifs_destroy_size_tree - free resources related to the size tree. + * @c: UBIFS file-system description object + */ +void ubifs_destroy_size_tree(struct ubifs_info *c) +{ + struct size_entry *e, *n; + + rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) { + if (e->inode) + iput(e->inode); + kfree(e); + } + + c->size_tree = RB_ROOT; +} + +/** + * ubifs_recover_size_accum - accumulate inode sizes for recovery. + * @c: UBIFS file-system description object + * @key: node key + * @deletion: node is for a deletion + * @new_size: inode size + * + * This function has two purposes: + * 1) to ensure there are no data nodes that fall outside the inode size + * 2) to ensure there are no data nodes for inodes that do not exist + * To accomplish those purposes, a rb-tree is constructed containing an entry + * for each inode number in the journal that has not been deleted, and recording + * the size from the inode node, the maximum size of any data node (also altered + * by truncations) and a flag indicating a inode number for which no inode node + * was present in the journal. + * + * Note that there is still the possibility that there are data nodes that have + * been committed that are beyond the inode size, however the only way to find + * them would be to scan the entire index. Alternatively, some provision could + * be made to record the size of inodes at the start of commit, which would seem + * very cumbersome for a scenario that is quite unlikely and the only negative + * consequence of which is wasted space. + * + * This functions returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size) +{ + ino_t inum = key_inum(c, key); + struct size_entry *e; + int err; + + switch (key_type(c, key)) { + case UBIFS_INO_KEY: + if (deletion) + remove_ino(c, inum); + else { + e = find_ino(c, inum); + if (e) { + e->i_size = new_size; + e->exists = 1; + } else { + err = add_ino(c, inum, new_size, 0, 1); + if (err) + return err; + } + } + break; + case UBIFS_DATA_KEY: + e = find_ino(c, inum); + if (e) { + if (new_size > e->d_size) + e->d_size = new_size; + } else { + err = add_ino(c, inum, 0, new_size, 0); + if (err) + return err; + } + break; + case UBIFS_TRUN_KEY: + e = find_ino(c, inum); + if (e) + e->d_size = new_size; + break; + } + return 0; +} + +/** + * fix_size_in_place - fix inode size in place on flash. + * @c: UBIFS file-system description object + * @e: inode size information for recovery + */ +static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) +{ + struct ubifs_ino_node *ino = c->sbuf; + unsigned char *p; + union ubifs_key key; + int err, lnum, offs, len; + loff_t i_size; + uint32_t crc; + + /* Locate the inode node LEB number and offset */ + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs); + if (err) + goto out; + /* + * If the size recorded on the inode node is greater than the size that + * was calculated from nodes in the journal then don't change the inode. + */ + i_size = le64_to_cpu(ino->size); + if (i_size >= e->d_size) + return 0; + /* Read the LEB */ + err = ubifs_leb_read(c, lnum, c->sbuf, 0, c->leb_size, 1); + if (err) + goto out; + /* Change the size field and recalculate the CRC */ + ino = c->sbuf + offs; + ino->size = cpu_to_le64(e->d_size); + len = le32_to_cpu(ino->ch.len); + crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8); + ino->ch.crc = cpu_to_le32(crc); + /* Work out where data in the LEB ends and free space begins */ + p = c->sbuf; + len = c->leb_size - 1; + while (p[len] == 0xff) + len -= 1; + len = ALIGN(len + 1, c->min_io_size); + /* Atomically write the fixed LEB back again */ + err = ubifs_leb_change(c, lnum, c->sbuf, len); + if (err) + goto out; + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", + (unsigned long)e->inum, lnum, offs, i_size, e->d_size); + return 0; + +out: + ubifs_warn(c, "inode %lu failed to fix size %lld -> %lld error %d", + (unsigned long)e->inum, e->i_size, e->d_size, err); + return err; +} + +/** + * ubifs_recover_size - recover inode size. + * @c: UBIFS file-system description object + * + * This function attempts to fix inode size discrepancies identified by the + * 'ubifs_recover_size_accum()' function. + * + * This functions returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_size(struct ubifs_info *c) +{ + struct rb_node *this = rb_first(&c->size_tree); + + while (this) { + struct size_entry *e; + int err; + + e = rb_entry(this, struct size_entry, rb); + if (!e->exists) { + union ubifs_key key; + + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_lookup(c, &key, c->sbuf); + if (err && err != -ENOENT) + return err; + if (err == -ENOENT) { + /* Remove data nodes that have no inode */ + dbg_rcvry("removing ino %lu", + (unsigned long)e->inum); + err = ubifs_tnc_remove_ino(c, e->inum); + if (err) + return err; + } else { + struct ubifs_ino_node *ino = c->sbuf; + + e->exists = 1; + e->i_size = le64_to_cpu(ino->size); + } + } + + if (e->exists && e->i_size < e->d_size) { + if (c->ro_mount) { + /* Fix the inode size and pin it in memory */ + struct inode *inode; + struct ubifs_inode *ui; + + ubifs_assert(!e->inode); + + inode = ubifs_iget(c->vfs_sb, e->inum); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ui = ubifs_inode(inode); + if (inode->i_size < e->d_size) { + dbg_rcvry("ino %lu size %lld -> %lld", + (unsigned long)e->inum, + inode->i_size, e->d_size); + inode->i_size = e->d_size; + ui->ui_size = e->d_size; + ui->synced_i_size = e->d_size; + e->inode = inode; + this = rb_next(this); + continue; + } + iput(inode); + } else { + /* Fix the size in place */ + err = fix_size_in_place(c, e); + if (err) + return err; + if (e->inode) + iput(e->inode); + } + } + + this = rb_next(this); + rb_erase(&e->rb, &c->size_tree); + kfree(e); + } + + return 0; +} diff --git a/kernel/fs/ubifs/replay.c b/kernel/fs/ubifs/replay.c new file mode 100644 index 000000000..3ca454013 --- /dev/null +++ b/kernel/fs/ubifs/replay.c @@ -0,0 +1,1082 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains journal replay code. It runs when the file-system is being + * mounted and requires no locking. + * + * The larger is the journal, the longer it takes to scan it, so the longer it + * takes to mount UBIFS. This is why the journal has limited size which may be + * changed depending on the system requirements. But a larger journal gives + * faster I/O speed because it writes the index less frequently. So this is a + * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the + * larger is the journal, the more memory its index may consume. + */ + +#include "ubifs.h" +#include + +/** + * struct replay_entry - replay list entry. + * @lnum: logical eraseblock number of the node + * @offs: node offset + * @len: node length + * @deletion: non-zero if this entry corresponds to a node deletion + * @sqnum: node sequence number + * @list: links the replay list + * @key: node key + * @nm: directory entry name + * @old_size: truncation old size + * @new_size: truncation new size + * + * The replay process first scans all buds and builds the replay list, then + * sorts the replay list in nodes sequence number order, and then inserts all + * the replay entries to the TNC. + */ +struct replay_entry { + int lnum; + int offs; + int len; + unsigned int deletion:1; + unsigned long long sqnum; + struct list_head list; + union ubifs_key key; + union { + struct qstr nm; + struct { + loff_t old_size; + loff_t new_size; + }; + }; +}; + +/** + * struct bud_entry - entry in the list of buds to replay. + * @list: next bud in the list + * @bud: bud description object + * @sqnum: reference node sequence number + * @free: free bytes in the bud + * @dirty: dirty bytes in the bud + */ +struct bud_entry { + struct list_head list; + struct ubifs_bud *bud; + unsigned long long sqnum; + int free; + int dirty; +}; + +/** + * set_bud_lprops - set free and dirty space used by a bud. + * @c: UBIFS file-system description object + * @b: bud entry which describes the bud + * + * This function makes sure the LEB properties of bud @b are set correctly + * after the replay. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) +{ + const struct ubifs_lprops *lp; + int err = 0, dirty; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + dirty = lp->dirty; + if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { + /* + * The LEB was added to the journal with a starting offset of + * zero which means the LEB must have been empty. The LEB + * property values should be @lp->free == @c->leb_size and + * @lp->dirty == 0, but that is not the case. The reason is that + * the LEB had been garbage collected before it became the bud, + * and there was not commit inbetween. The garbage collector + * resets the free and dirty space without recording it + * anywhere except lprops, so if there was no commit then + * lprops does not have that information. + * + * We do not need to adjust free space because the scan has told + * us the exact value which is recorded in the replay entry as + * @b->free. + * + * However we do need to subtract from the dirty space the + * amount of space that the garbage collector reclaimed, which + * is the whole LEB minus the amount of space that was free. + */ + dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, + lp->free, lp->dirty); + dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, + lp->free, lp->dirty); + dirty -= c->leb_size - lp->free; + /* + * If the replay order was perfect the dirty space would now be + * zero. The order is not perfect because the journal heads + * race with each other. This is not a problem but is does mean + * that the dirty space may temporarily exceed c->leb_size + * during the replay. + */ + if (dirty != 0) + dbg_mnt("LEB %d lp: %d free %d dirty replay: %d free %d dirty", + b->bud->lnum, lp->free, lp->dirty, b->free, + b->dirty); + } + lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + /* Make sure the journal head points to the latest bud */ + err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, + b->bud->lnum, c->leb_size - b->free); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * set_buds_lprops - set free and dirty space for all replayed buds. + * @c: UBIFS file-system description object + * + * This function sets LEB properties for all replayed buds. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int set_buds_lprops(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + + list_for_each_entry(b, &c->replay_buds, list) { + err = set_bud_lprops(c, b); + if (err) + return err; + } + + return 0; +} + +/** + * trun_remove_range - apply a replay entry for a truncation to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry of truncation + */ +static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) +{ + unsigned min_blk, max_blk; + union ubifs_key min_key, max_key; + ino_t ino; + + min_blk = r->new_size / UBIFS_BLOCK_SIZE; + if (r->new_size & (UBIFS_BLOCK_SIZE - 1)) + min_blk += 1; + + max_blk = r->old_size / UBIFS_BLOCK_SIZE; + if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0) + max_blk -= 1; + + ino = key_inum(c, &r->key); + + data_key_init(c, &min_key, ino, min_blk); + data_key_init(c, &max_key, ino, max_blk); + + return ubifs_tnc_remove_range(c, &min_key, &max_key); +} + +/** + * apply_replay_entry - apply a replay entry to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry to apply + * + * Apply a replay entry to the TNC. + */ +static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) +{ + int err; + + dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ", + r->lnum, r->offs, r->len, r->deletion, r->sqnum); + + /* Set c->replay_sqnum to help deal with dangling branches. */ + c->replay_sqnum = r->sqnum; + + if (is_hash_key(c, &r->key)) { + if (r->deletion) + err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); + else + err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, + r->len, &r->nm); + } else { + if (r->deletion) + switch (key_type(c, &r->key)) { + case UBIFS_INO_KEY: + { + ino_t inum = key_inum(c, &r->key); + + err = ubifs_tnc_remove_ino(c, inum); + break; + } + case UBIFS_TRUN_KEY: + err = trun_remove_range(c, r); + break; + default: + err = ubifs_tnc_remove(c, &r->key); + break; + } + else + err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs, + r->len); + if (err) + return err; + + if (c->need_recovery) + err = ubifs_recover_size_accum(c, &r->key, r->deletion, + r->new_size); + } + + return err; +} + +/** + * replay_entries_cmp - compare 2 replay entries. + * @priv: UBIFS file-system description object + * @a: first replay entry + * @a: second replay entry + * + * This is a comparios function for 'list_sort()' which compares 2 replay + * entries @a and @b by comparing their sequence numer. Returns %1 if @a has + * greater sequence number and %-1 otherwise. + */ +static int replay_entries_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct replay_entry *ra, *rb; + + cond_resched(); + if (a == b) + return 0; + + ra = list_entry(a, struct replay_entry, list); + rb = list_entry(b, struct replay_entry, list); + ubifs_assert(ra->sqnum != rb->sqnum); + if (ra->sqnum > rb->sqnum) + return 1; + return -1; +} + +/** + * apply_replay_list - apply the replay list to the TNC. + * @c: UBIFS file-system description object + * + * Apply all entries in the replay list to the TNC. Returns zero in case of + * success and a negative error code in case of failure. + */ +static int apply_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r; + int err; + + list_sort(c, &c->replay_list, &replay_entries_cmp); + + list_for_each_entry(r, &c->replay_list, list) { + cond_resched(); + + err = apply_replay_entry(c, r); + if (err) + return err; + } + + return 0; +} + +/** + * destroy_replay_list - destroy the replay. + * @c: UBIFS file-system description object + * + * Destroy the replay list. + */ +static void destroy_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r, *tmp; + + list_for_each_entry_safe(r, tmp, &c->replay_list, list) { + if (is_hash_key(c, &r->key)) + kfree(r->nm.name); + list_del(&r->list); + kfree(r); + } +} + +/** + * insert_node - insert a node to the replay list + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * @old_size: truncation old size + * @new_size: truncation new size + * + * This function inserts a scanned non-direntry node to the replay list. The + * replay list contains @struct replay_entry elements, and we sort this list in + * sequence number order before applying it. The replay list is applied at the + * very end of the replay process. Since the list is sorted in sequence number + * order, the older modifications are applied first. This function returns zero + * in case of success and a negative error code in case of failure. + */ +static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, unsigned long long sqnum, + int deletion, int *used, loff_t old_size, + loff_t new_size) +{ + struct replay_entry *r; + + dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs); + + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->deletion = !!deletion; + r->sqnum = sqnum; + key_copy(c, key, &r->key); + r->old_size = old_size; + r->new_size = new_size; + + list_add_tail(&r->list, &c->replay_list); + return 0; +} + +/** + * insert_dent - insert a directory entry node into the replay list. + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @name: directory entry name + * @nlen: directory entry name length + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * + * This function inserts a scanned directory entry node or an extended + * attribute entry to the replay list. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, const char *name, int nlen, + unsigned long long sqnum, int deletion, int *used) +{ + struct replay_entry *r; + char *nbuf; + + dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs); + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + nbuf = kmalloc(nlen + 1, GFP_KERNEL); + if (!nbuf) { + kfree(r); + return -ENOMEM; + } + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->deletion = !!deletion; + r->sqnum = sqnum; + key_copy(c, key, &r->key); + r->nm.len = nlen; + memcpy(nbuf, name, nlen); + nbuf[nlen] = '\0'; + r->nm.name = nbuf; + + list_add_tail(&r->list, &c->replay_list); + return 0; +} + +/** + * ubifs_validate_entry - validate directory or extended attribute entry node. + * @c: UBIFS file-system description object + * @dent: the node to validate + * + * This function validates directory or extended attribute entry node @dent. + * Returns zero if the node is all right and a %-EINVAL if not. + */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent) +{ + int key_type = key_type_flash(c, dent->key); + int nlen = le16_to_cpu(dent->nlen); + + if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || + dent->type >= UBIFS_ITYPES_CNT || + nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || + strnlen(dent->name, nlen) != nlen || + le64_to_cpu(dent->inum) > MAX_INUM) { + ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ? + "directory entry" : "extended attribute entry"); + return -EINVAL; + } + + if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) { + ubifs_err(c, "bad key type %d", key_type); + return -EINVAL; + } + + return 0; +} + +/** + * is_last_bud - check if the bud is the last in the journal head. + * @c: UBIFS file-system description object + * @bud: bud description object + * + * This function checks if bud @bud is the last bud in its journal head. This + * information is then used by 'replay_bud()' to decide whether the bud can + * have corruptions or not. Indeed, only last buds can be corrupted by power + * cuts. Returns %1 if this is the last bud, and %0 if not. + */ +static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct ubifs_jhead *jh = &c->jheads[bud->jhead]; + struct ubifs_bud *next; + uint32_t data; + int err; + + if (list_is_last(&bud->list, &jh->buds_list)) + return 1; + + /* + * The following is a quirk to make sure we work correctly with UBIFS + * images used with older UBIFS. + * + * Normally, the last bud will be the last in the journal head's list + * of bud. However, there is one exception if the UBIFS image belongs + * to older UBIFS. This is fairly unlikely: one would need to use old + * UBIFS, then have a power cut exactly at the right point, and then + * try to mount this image with new UBIFS. + * + * The exception is: it is possible to have 2 buds A and B, A goes + * before B, and B is the last, bud B is contains no data, and bud A is + * corrupted at the end. The reason is that in older versions when the + * journal code switched the next bud (from A to B), it first added a + * log reference node for the new bud (B), and only after this it + * synchronized the write-buffer of current bud (A). But later this was + * changed and UBIFS started to always synchronize the write-buffer of + * the bud (A) before writing the log reference for the new bud (B). + * + * But because older UBIFS always synchronized A's write-buffer before + * writing to B, we can recognize this exceptional situation but + * checking the contents of bud B - if it is empty, then A can be + * treated as the last and we can recover it. + * + * TODO: remove this piece of code in a couple of years (today it is + * 16.05.2011). + */ + next = list_entry(bud->list.next, struct ubifs_bud, list); + if (!list_is_last(&next->list, &jh->buds_list)) + return 0; + + err = ubifs_leb_read(c, next->lnum, (char *)&data, next->start, 4, 1); + if (err) + return 0; + + return data == 0xFFFFFFFF; +} + +/** + * replay_bud - replay a bud logical eraseblock. + * @c: UBIFS file-system description object + * @b: bud entry which describes the bud + * + * This function replays bud @bud, recovers it if needed, and adds all nodes + * from this bud to the replay list. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int replay_bud(struct ubifs_info *c, struct bud_entry *b) +{ + int is_last = is_last_bud(c, b->bud); + int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + + dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d", + lnum, b->bud->jhead, offs, is_last); + + if (c->need_recovery && is_last) + /* + * Recover only last LEBs in the journal heads, because power + * cuts may cause corruptions only in these LEBs, because only + * these LEBs could possibly be written to at the power cut + * time. + */ + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead); + else + sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + /* + * The bud does not have to start from offset zero - the beginning of + * the 'lnum' LEB may contain previously committed data. One of the + * things we have to do in replay is to correctly update lprops with + * newer information about this LEB. + * + * At this point lprops thinks that this LEB has 'c->leb_size - offs' + * bytes of free space because it only contain information about + * committed data. + * + * But we know that real amount of free space is 'c->leb_size - + * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and + * 'sleb->endpt' is used by bud data. We have to correctly calculate + * how much of these data are dirty and update lprops with this + * information. + * + * The dirt in that LEB region is comprised of padding nodes, deletion + * nodes, truncation nodes and nodes which are obsoleted by subsequent + * nodes in this LEB. So instead of calculating clean space, we + * calculate used space ('used' variable). + */ + + list_for_each_entry(snod, &sleb->nodes, list) { + int deletion = 0; + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err(c, "file system's life ended"); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_INO_NODE: + { + struct ubifs_ino_node *ino = snod->node; + loff_t new_size = le64_to_cpu(ino->size); + + if (le32_to_cpu(ino->nlink) == 0) + deletion = 1; + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DATA_NODE: + { + struct ubifs_data_node *dn = snod->node; + loff_t new_size = le32_to_cpu(dn->size) + + key_block(c, &snod->key) * + UBIFS_BLOCK_SIZE; + + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + struct ubifs_dent_node *dent = snod->node; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + err = insert_dent(c, lnum, snod->offs, snod->len, + &snod->key, dent->name, + le16_to_cpu(dent->nlen), snod->sqnum, + !le64_to_cpu(dent->inum), &used); + break; + } + case UBIFS_TRUN_NODE: + { + struct ubifs_trun_node *trun = snod->node; + loff_t old_size = le64_to_cpu(trun->old_size); + loff_t new_size = le64_to_cpu(trun->new_size); + union ubifs_key key; + + /* Validate truncation node */ + if (old_size < 0 || old_size > c->max_inode_sz || + new_size < 0 || new_size > c->max_inode_sz || + old_size <= new_size) { + ubifs_err(c, "bad truncation node"); + goto out_dump; + } + + /* + * Create a fake truncation key just to use the same + * functions which expect nodes to have keys. + */ + trun_key_init(c, &key, le32_to_cpu(trun->inum)); + err = insert_node(c, lnum, snod->offs, snod->len, + &key, snod->sqnum, 1, &used, + old_size, new_size); + break; + } + default: + ubifs_err(c, "unexpected node type %d in bud LEB %d:%d", + snod->type, lnum, snod->offs); + err = -EINVAL; + goto out_dump; + } + if (err) + goto out; + } + + ubifs_assert(ubifs_search_bud(c, lnum)); + ubifs_assert(sleb->endpt - offs >= used); + ubifs_assert(sleb->endpt % c->min_io_size == 0); + + b->dirty = sleb->endpt - offs - used; + b->free = c->leb_size - sleb->endpt; + dbg_mnt("bud LEB %d replied: dirty %d, free %d", + lnum, b->dirty, b->free); + +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err(c, "bad node is at LEB %d:%d", lnum, snod->offs); + ubifs_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * replay_buds - replay all buds. + * @c: UBIFS file-system description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int replay_buds(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + unsigned long long prev_sqnum = 0; + + list_for_each_entry(b, &c->replay_buds, list) { + err = replay_bud(c, b); + if (err) + return err; + + ubifs_assert(b->sqnum > prev_sqnum); + prev_sqnum = b->sqnum; + } + + return 0; +} + +/** + * destroy_bud_list - destroy the list of buds to replay. + * @c: UBIFS file-system description object + */ +static void destroy_bud_list(struct ubifs_info *c) +{ + struct bud_entry *b; + + while (!list_empty(&c->replay_buds)) { + b = list_entry(c->replay_buds.next, struct bud_entry, list); + list_del(&b->list); + kfree(b); + } +} + +/** + * add_replay_bud - add a bud to the list of buds to replay. + * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @sqnum: reference node sequence number + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + unsigned long long sqnum) +{ + struct ubifs_bud *bud; + struct bud_entry *b; + + dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead); + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL); + if (!bud) + return -ENOMEM; + + b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL); + if (!b) { + kfree(bud); + return -ENOMEM; + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + ubifs_add_bud(c, bud); + + b->bud = bud; + b->sqnum = sqnum; + list_add_tail(&b->list, &c->replay_buds); + + return 0; +} + +/** + * validate_ref - validate a reference node. + * @c: UBIFS file-system description object + * @ref: the reference node to validate + * @ref_lnum: LEB number of the reference node + * @ref_offs: reference node offset + * + * This function returns %1 if a bud reference already exists for the LEB. %0 is + * returned if the reference node is new, otherwise %-EINVAL is returned if + * validation failed. + */ +static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref) +{ + struct ubifs_bud *bud; + int lnum = le32_to_cpu(ref->lnum); + unsigned int offs = le32_to_cpu(ref->offs); + unsigned int jhead = le32_to_cpu(ref->jhead); + + /* + * ref->offs may point to the end of LEB when the journal head points + * to the end of LEB and we write reference node for it during commit. + * So this is why we require 'offs > c->leb_size'. + */ + if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt || + lnum < c->main_first || offs > c->leb_size || + offs & (c->min_io_size - 1)) + return -EINVAL; + + /* Make sure we have not already looked at this bud */ + bud = ubifs_search_bud(c, lnum); + if (bud) { + if (bud->jhead == jhead && bud->start <= offs) + return 1; + ubifs_err(c, "bud at LEB %d:%d was already referred", lnum, offs); + return -EINVAL; + } + + return 0; +} + +/** + * replay_log_leb - replay a log logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: log logical eraseblock to replay + * @offs: offset to start replaying from + * @sbuf: scan buffer + * + * This function replays a log LEB and returns zero in case of success, %1 if + * this is the last LEB in the log, and a negative error code in case of + * failure. + */ +static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) +{ + int err; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + const struct ubifs_cs_node *node; + + dbg_mnt("replay log LEB %d:%d", lnum, offs); + sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery); + if (IS_ERR(sleb)) { + if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) + return PTR_ERR(sleb); + /* + * Note, the below function will recover this log LEB only if + * it is the last, because unclean reboots can possibly corrupt + * only the tail of the log. + */ + sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + } + + if (sleb->nodes_cnt == 0) { + err = 1; + goto out; + } + + node = sleb->buf; + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + if (c->cs_sqnum == 0) { + /* + * This is the first log LEB we are looking at, make sure that + * the first node is a commit start node. Also record its + * sequence number so that UBIFS can determine where the log + * ends, because all nodes which were have higher sequence + * numbers. + */ + if (snod->type != UBIFS_CS_NODE) { + ubifs_err(c, "first log node at LEB %d:%d is not CS node", + lnum, offs); + goto out_dump; + } + if (le64_to_cpu(node->cmt_no) != c->cmt_no) { + ubifs_err(c, "first CS node at LEB %d:%d has wrong commit number %llu expected %llu", + lnum, offs, + (unsigned long long)le64_to_cpu(node->cmt_no), + c->cmt_no); + goto out_dump; + } + + c->cs_sqnum = le64_to_cpu(node->ch.sqnum); + dbg_mnt("commit start sqnum %llu", c->cs_sqnum); + } + + if (snod->sqnum < c->cs_sqnum) { + /* + * This means that we reached end of log and now + * look to the older log data, which was already + * committed but the eraseblock was not erased (UBIFS + * only un-maps it). So this basically means we have to + * exit with "end of log" code. + */ + err = 1; + goto out; + } + + /* Make sure the first node sits at offset zero of the LEB */ + if (snod->offs != 0) { + ubifs_err(c, "first node is not at zero offset"); + goto out_dump; + } + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err(c, "file system's life ended"); + goto out_dump; + } + + if (snod->sqnum < c->cs_sqnum) { + ubifs_err(c, "bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_REF_NODE: { + const struct ubifs_ref_node *ref = snod->node; + + err = validate_ref(c, ref); + if (err == 1) + break; /* Already have this bud */ + if (err) + goto out_dump; + + err = add_replay_bud(c, le32_to_cpu(ref->lnum), + le32_to_cpu(ref->offs), + le32_to_cpu(ref->jhead), + snod->sqnum); + if (err) + goto out; + + break; + } + case UBIFS_CS_NODE: + /* Make sure it sits at the beginning of LEB */ + if (snod->offs != 0) { + ubifs_err(c, "unexpected node in log"); + goto out_dump; + } + break; + default: + ubifs_err(c, "unexpected node in log"); + goto out_dump; + } + } + + if (sleb->endpt || c->lhead_offs >= c->leb_size) { + c->lhead_lnum = lnum; + c->lhead_offs = sleb->endpt; + } + + err = !sleb->endpt; +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err(c, "log error detected while replaying the log at LEB %d:%d", + lnum, offs + snod->offs); + ubifs_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * take_ihead - update the status of the index head in lprops to 'taken'. + * @c: UBIFS file-system description object + * + * This function returns the amount of free space in the index head LEB or a + * negative error code. + */ +static int take_ihead(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, free; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + free = lp->free; + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + err = free; +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_replay_journal - replay journal. + * @c: UBIFS file-system description object + * + * This function scans the journal, replays and cleans it up. It makes sure all + * memory data structures related to uncommitted journal are built (dirty TNC + * tree, tree of buds, modified lprops, etc). + */ +int ubifs_replay_journal(struct ubifs_info *c) +{ + int err, lnum, free; + + BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); + + /* Update the status of the index head in lprops to 'taken' */ + free = take_ihead(c); + if (free < 0) + return free; /* Error code */ + + if (c->ihead_offs != c->leb_size - free) { + ubifs_err(c, "bad index head LEB %d:%d", c->ihead_lnum, + c->ihead_offs); + return -EINVAL; + } + + dbg_mnt("start replaying the journal"); + c->replaying = 1; + lnum = c->ltail_lnum = c->lhead_lnum; + + do { + err = replay_log_leb(c, lnum, 0, c->sbuf); + if (err == 1) { + if (lnum != c->lhead_lnum) + /* We hit the end of the log */ + break; + + /* + * The head of the log must always start with the + * "commit start" node on a properly formatted UBIFS. + * But we found no nodes at all, which means that + * someting went wrong and we cannot proceed mounting + * the file-system. + */ + ubifs_err(c, "no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted", + lnum, 0); + err = -EINVAL; + } + if (err) + goto out; + lnum = ubifs_next_log_lnum(c, lnum); + } while (lnum != c->ltail_lnum); + + err = replay_buds(c); + if (err) + goto out; + + err = apply_replay_list(c); + if (err) + goto out; + + err = set_buds_lprops(c); + if (err) + goto out; + + /* + * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable + * to roughly estimate index growth. Things like @c->bi.min_idx_lebs + * depend on it. This means we have to initialize it to make sure + * budgeting works properly. + */ + c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); + c->bi.uncommitted_idx *= c->max_idx_node_sz; + + ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); + dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, highest_inum %lu", + c->lhead_lnum, c->lhead_offs, c->max_sqnum, + (unsigned long)c->highest_inum); +out: + destroy_replay_list(c); + destroy_bud_list(c); + c->replaying = 0; + return err; +} diff --git a/kernel/fs/ubifs/sb.c b/kernel/fs/ubifs/sb.c new file mode 100644 index 000000000..f4fbc7b6b --- /dev/null +++ b/kernel/fs/ubifs/sb.c @@ -0,0 +1,809 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS superblock. The superblock is stored at the first + * LEB of the volume and is never changed by UBIFS. Only user-space tools may + * change it. The superblock node mostly contains geometry information. + */ + +#include "ubifs.h" +#include +#include +#include + +/* + * Default journal size in logical eraseblocks as a percent of total + * flash size. + */ +#define DEFAULT_JNL_PERCENT 5 + +/* Default maximum journal size in bytes */ +#define DEFAULT_MAX_JNL (32*1024*1024) + +/* Default indexing tree fanout */ +#define DEFAULT_FANOUT 8 + +/* Default number of data journal heads */ +#define DEFAULT_JHEADS_CNT 1 + +/* Default positions of different LEBs in the main area */ +#define DEFAULT_IDX_LEB 0 +#define DEFAULT_DATA_LEB 1 +#define DEFAULT_GC_LEB 2 + +/* Default number of LEB numbers in LPT's save table */ +#define DEFAULT_LSAVE_CNT 256 + +/* Default reserved pool size as a percent of maximum free space */ +#define DEFAULT_RP_PERCENT 5 + +/* The default maximum size of reserved pool in bytes */ +#define DEFAULT_MAX_RP_SIZE (5*1024*1024) + +/* Default time granularity in nanoseconds */ +#define DEFAULT_TIME_GRAN 1000000000 + +/** + * create_default_filesystem - format empty UBI volume. + * @c: UBIFS file-system description object + * + * This function creates default empty file-system. Returns zero in case of + * success and a negative error code in case of failure. + */ +static int create_default_filesystem(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + struct ubifs_mst_node *mst; + struct ubifs_idx_node *idx; + struct ubifs_branch *br; + struct ubifs_ino_node *ino; + struct ubifs_cs_node *cs; + union ubifs_key key; + int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; + int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; + int min_leb_cnt = UBIFS_MIN_LEB_CNT; + long long tmp64, main_bytes; + __le64 tmp_le64; + + /* Some functions called from here depend on the @c->key_len filed */ + c->key_len = UBIFS_SK_LEN; + + /* + * First of all, we have to calculate default file-system geometry - + * log size, journal size, etc. + */ + if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT) + /* We can first multiply then divide and have no overflow */ + jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100; + else + jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT; + + if (jnl_lebs < UBIFS_MIN_JNL_LEBS) + jnl_lebs = UBIFS_MIN_JNL_LEBS; + if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL) + jnl_lebs = DEFAULT_MAX_JNL / c->leb_size; + + /* + * The log should be large enough to fit reference nodes for all bud + * LEBs. Because buds do not have to start from the beginning of LEBs + * (half of the LEB may contain committed data), the log should + * generally be larger, make it twice as large. + */ + tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1; + log_lebs = tmp / c->leb_size; + /* Plus one LEB reserved for commit */ + log_lebs += 1; + if (c->leb_cnt - min_leb_cnt > 8) { + /* And some extra space to allow writes while committing */ + log_lebs += 1; + min_leb_cnt += 1; + } + + max_buds = jnl_lebs - log_lebs; + if (max_buds < UBIFS_MIN_BUD_LEBS) + max_buds = UBIFS_MIN_BUD_LEBS; + + /* + * Orphan nodes are stored in a separate area. One node can store a lot + * of orphan inode numbers, but when new orphan comes we just add a new + * orphan node. At some point the nodes are consolidated into one + * orphan node. + */ + orph_lebs = UBIFS_MIN_ORPH_LEBS; + if (c->leb_cnt - min_leb_cnt > 1) + /* + * For debugging purposes it is better to have at least 2 + * orphan LEBs, because the orphan subsystem would need to do + * consolidations and would be stressed more. + */ + orph_lebs += 1; + + main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; + main_lebs -= orph_lebs; + + lpt_first = UBIFS_LOG_LNUM + log_lebs; + c->lsave_cnt = DEFAULT_LSAVE_CNT; + c->max_leb_cnt = c->leb_cnt; + err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs, + &big_lpt); + if (err) + return err; + + dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first, + lpt_first + lpt_lebs - 1); + + main_first = c->leb_cnt - main_lebs; + + /* Create default superblock */ + tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + sup = kzalloc(tmp, GFP_KERNEL); + if (!sup) + return -ENOMEM; + + tmp64 = (long long)max_buds * c->leb_size; + if (big_lpt) + sup_flags |= UBIFS_FLG_BIGLPT; + + sup->ch.node_type = UBIFS_SB_NODE; + sup->key_hash = UBIFS_KEY_HASH_R5; + sup->flags = cpu_to_le32(sup_flags); + sup->min_io_size = cpu_to_le32(c->min_io_size); + sup->leb_size = cpu_to_le32(c->leb_size); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt); + sup->max_bud_bytes = cpu_to_le64(tmp64); + sup->log_lebs = cpu_to_le32(log_lebs); + sup->lpt_lebs = cpu_to_le32(lpt_lebs); + sup->orph_lebs = cpu_to_le32(orph_lebs); + sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT); + sup->fanout = cpu_to_le32(DEFAULT_FANOUT); + sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); + sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); + sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); + if (c->mount_opts.override_compr) + sup->default_compr = cpu_to_le16(c->mount_opts.compr_type); + else + sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); + + generate_random_uuid(sup->uuid); + + main_bytes = (long long)main_lebs * c->leb_size; + tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100); + if (tmp64 > DEFAULT_MAX_RP_SIZE) + tmp64 = DEFAULT_MAX_RP_SIZE; + sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); + + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0); + kfree(sup); + if (err) + return err; + + dbg_gen("default superblock created at LEB 0:0"); + + /* Create default master node */ + mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!mst) + return -ENOMEM; + + mst->ch.node_type = UBIFS_MST_NODE; + mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM); + mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO); + mst->cmt_no = 0; + mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->root_offs = 0; + tmp = ubifs_idx_node_sz(c, 1); + mst->root_len = cpu_to_le32(tmp); + mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB); + mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size)); + mst->index_size = cpu_to_le64(ALIGN(tmp, 8)); + mst->lpt_lnum = cpu_to_le32(c->lpt_lnum); + mst->lpt_offs = cpu_to_le32(c->lpt_offs); + mst->nhead_lnum = cpu_to_le32(c->nhead_lnum); + mst->nhead_offs = cpu_to_le32(c->nhead_offs); + mst->ltab_lnum = cpu_to_le32(c->ltab_lnum); + mst->ltab_offs = cpu_to_le32(c->ltab_offs); + mst->lsave_lnum = cpu_to_le32(c->lsave_lnum); + mst->lsave_offs = cpu_to_le32(c->lsave_offs); + mst->lscan_lnum = cpu_to_le32(main_first); + mst->empty_lebs = cpu_to_le32(main_lebs - 2); + mst->idx_lebs = cpu_to_le32(1); + mst->leb_cnt = cpu_to_le32(c->leb_cnt); + + /* Calculate lprops statistics */ + tmp64 = main_bytes; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + mst->total_free = cpu_to_le64(tmp64); + + tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) - + UBIFS_INO_NODE_SZ; + tmp64 += ino_waste; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8); + mst->total_dirty = cpu_to_le64(tmp64); + + /* The indexing LEB does not contribute to dark space */ + tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); + mst->total_dark = cpu_to_le64(tmp64); + + mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); + + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0); + if (err) { + kfree(mst); + return err; + } + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, + 0); + kfree(mst); + if (err) + return err; + + dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM); + + /* Create the root indexing node */ + tmp = ubifs_idx_node_sz(c, 1); + idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + if (!idx) + return -ENOMEM; + + c->key_fmt = UBIFS_SIMPLE_KEY_FMT; + c->key_hash = key_r5_hash; + + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(1); + ino_key_init(c, &key, UBIFS_ROOT_INO); + br = ubifs_idx_branch(c, idx, 0); + key_write_idx(c, &key, &br->key); + br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); + br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0); + kfree(idx); + if (err) + return err; + + dbg_gen("default root indexing node created LEB %d:0", + main_first + DEFAULT_IDX_LEB); + + /* Create default root inode */ + tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + ino = kzalloc(tmp, GFP_KERNEL); + if (!ino) + return -ENOMEM; + + ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO); + ino->ch.node_type = UBIFS_INO_NODE; + ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); + ino->nlink = cpu_to_le32(2); + tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + ino->atime_sec = tmp_le64; + ino->ctime_sec = tmp_le64; + ino->mtime_sec = tmp_le64; + ino->atime_nsec = 0; + ino->ctime_nsec = 0; + ino->mtime_nsec = 0; + ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); + + /* Set compression enabled by default */ + ino->flags = cpu_to_le32(UBIFS_COMPR_FL); + + err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, + main_first + DEFAULT_DATA_LEB, 0); + kfree(ino); + if (err) + return err; + + dbg_gen("root inode created at LEB %d:0", + main_first + DEFAULT_DATA_LEB); + + /* + * The first node in the log has to be the commit start node. This is + * always the case during normal file-system operation. Write a fake + * commit start node to the log. + */ + tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size); + cs = kzalloc(tmp, GFP_KERNEL); + if (!cs) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0); + kfree(cs); + if (err) + return err; + + ubifs_msg(c, "default file-system created"); + return 0; +} + +/** + * validate_sb - validate superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node + * + * This function validates superblock node @sup. Since most of data was read + * from the superblock and stored in @c, the function validates fields in @c + * instead. Returns zero in case of success and %-EINVAL in case of validation + * failure. + */ +static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + long long max_bytes; + int err = 1, min_leb_cnt; + + if (!c->key_hash) { + err = 2; + goto failed; + } + + if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) { + err = 3; + goto failed; + } + + if (le32_to_cpu(sup->min_io_size) != c->min_io_size) { + ubifs_err(c, "min. I/O unit mismatch: %d in superblock, %d real", + le32_to_cpu(sup->min_io_size), c->min_io_size); + goto failed; + } + + if (le32_to_cpu(sup->leb_size) != c->leb_size) { + ubifs_err(c, "LEB size mismatch: %d in superblock, %d real", + le32_to_cpu(sup->leb_size), c->leb_size); + goto failed; + } + + if (c->log_lebs < UBIFS_MIN_LOG_LEBS || + c->lpt_lebs < UBIFS_MIN_LPT_LEBS || + c->orph_lebs < UBIFS_MIN_ORPH_LEBS || + c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 4; + goto failed; + } + + /* + * Calculate minimum allowed amount of main area LEBs. This is very + * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we + * have just read from the superblock. + */ + min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs; + min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; + + if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { + ubifs_err(c, "bad LEB count: %d in superblock, %d on UBI volume, %d minimum required", + c->leb_cnt, c->vi.size, min_leb_cnt); + goto failed; + } + + if (c->max_leb_cnt < c->leb_cnt) { + ubifs_err(c, "max. LEB count %d less than LEB count %d", + c->max_leb_cnt, c->leb_cnt); + goto failed; + } + + if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + ubifs_err(c, "too few main LEBs count %d, must be at least %d", + c->main_lebs, UBIFS_MIN_MAIN_LEBS); + goto failed; + } + + max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS; + if (c->max_bud_bytes < max_bytes) { + ubifs_err(c, "too small journal (%lld bytes), must be at least %lld bytes", + c->max_bud_bytes, max_bytes); + goto failed; + } + + max_bytes = (long long)c->leb_size * c->main_lebs; + if (c->max_bud_bytes > max_bytes) { + ubifs_err(c, "too large journal size (%lld bytes), only %lld bytes available in the main area", + c->max_bud_bytes, max_bytes); + goto failed; + } + + if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 || + c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) { + err = 9; + goto failed; + } + + if (c->fanout < UBIFS_MIN_FANOUT || + ubifs_idx_node_sz(c, c->fanout) > c->leb_size) { + err = 10; + goto failed; + } + + if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT && + c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - + c->log_lebs - c->lpt_lebs - c->orph_lebs)) { + err = 11; + goto failed; + } + + if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs + + c->orph_lebs + c->main_lebs != c->leb_cnt) { + err = 12; + goto failed; + } + + if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) { + err = 13; + goto failed; + } + + if (c->rp_size < 0 || max_bytes < c->rp_size) { + err = 14; + goto failed; + } + + if (le32_to_cpu(sup->time_gran) > 1000000000 || + le32_to_cpu(sup->time_gran) < 1) { + err = 15; + goto failed; + } + + return 0; + +failed: + ubifs_err(c, "bad superblock, error %d", err); + ubifs_dump_node(c, sup); + return -EINVAL; +} + +/** + * ubifs_read_sb_node - read superblock node. + * @c: UBIFS file-system description object + * + * This function returns a pointer to the superblock node or a negative error + * code. Note, the user of this function is responsible of kfree()'ing the + * returned superblock buffer. + */ +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + int err; + + sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS); + if (!sup) + return ERR_PTR(-ENOMEM); + + err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ, + UBIFS_SB_LNUM, 0); + if (err) { + kfree(sup); + return ERR_PTR(err); + } + + return sup; +} + +/** + * ubifs_write_sb_node - write superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node read with 'ubifs_read_sb_node()' + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + + ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len); +} + +/** + * ubifs_read_superblock - read superblock. + * @c: UBIFS file-system description object + * + * This function finds, reads and checks the superblock. If an empty UBI volume + * is being mounted, this function creates default superblock. Returns zero in + * case of success, and a negative error code in case of failure. + */ +int ubifs_read_superblock(struct ubifs_info *c) +{ + int err, sup_flags; + struct ubifs_sb_node *sup; + + if (c->empty) { + err = create_default_filesystem(c); + if (err) + return err; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + + /* + * The software supports all previous versions but not future versions, + * due to the unavailability of time-travelling equipment. + */ + if (c->fmt_version > UBIFS_FORMAT_VERSION) { + ubifs_assert(!c->ro_media || c->ro_mount); + if (!c->ro_mount || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg(c, "only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. + */ + c->rw_incompat = 1; + } + + if (c->fmt_version < 3) { + ubifs_err(c, "on-flash format version %d is not supported", + c->fmt_version); + err = -EINVAL; + goto out; + } + + switch (sup->key_hash) { + case UBIFS_KEY_HASH_R5: + c->key_hash = key_r5_hash; + c->key_hash_type = UBIFS_KEY_HASH_R5; + break; + + case UBIFS_KEY_HASH_TEST: + c->key_hash = key_test_hash; + c->key_hash_type = UBIFS_KEY_HASH_TEST; + break; + }; + + c->key_fmt = sup->key_fmt; + + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + c->key_len = UBIFS_SK_LEN; + break; + default: + ubifs_err(c, "unsupported key format"); + err = -EINVAL; + goto out; + } + + c->leb_cnt = le32_to_cpu(sup->leb_cnt); + c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt); + c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes); + c->log_lebs = le32_to_cpu(sup->log_lebs); + c->lpt_lebs = le32_to_cpu(sup->lpt_lebs); + c->orph_lebs = le32_to_cpu(sup->orph_lebs); + c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; + c->fanout = le32_to_cpu(sup->fanout); + c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); + c->rp_size = le64_to_cpu(sup->rp_size); + c->rp_uid = make_kuid(&init_user_ns, le32_to_cpu(sup->rp_uid)); + c->rp_gid = make_kgid(&init_user_ns, le32_to_cpu(sup->rp_gid)); + sup_flags = le32_to_cpu(sup->flags); + if (!c->mount_opts.override_compr) + c->default_compr = le16_to_cpu(sup->default_compr); + + c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); + memcpy(&c->uuid, &sup->uuid, 16); + c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); + + /* Automatically increase file system size to the maximum size */ + c->old_leb_cnt = c->leb_cnt; + if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { + c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); + if (c->ro_mount) + dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + else { + dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + c->old_leb_cnt = c->leb_cnt; + } + } + + c->log_bytes = (long long)c->log_lebs * c->leb_size; + c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1; + c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs; + c->lpt_last = c->lpt_first + c->lpt_lebs - 1; + c->orph_first = c->lpt_last + 1; + c->orph_last = c->orph_first + c->orph_lebs - 1; + c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; + c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; + c->main_first = c->leb_cnt - c->main_lebs; + + err = validate_sb(c, sup); +out: + kfree(sup); + return err; +} + +/** + * fixup_leb - fixup/unmap an LEB containing free space. + * @c: UBIFS file-system description object + * @lnum: the LEB number to fix up + * @len: number of used bytes in LEB (starting at offset 0) + * + * This function reads the contents of the given LEB number @lnum, then fixes + * it up, so that empty min. I/O units in the end of LEB are actually erased on + * flash (rather than being just all-0xff real data). If the LEB is completely + * empty, it is simply unmapped. + */ +static int fixup_leb(struct ubifs_info *c, int lnum, int len) +{ + int err; + + ubifs_assert(len >= 0); + ubifs_assert(len % c->min_io_size == 0); + ubifs_assert(len < c->leb_size); + + if (len == 0) { + dbg_mnt("unmap empty LEB %d", lnum); + return ubifs_leb_unmap(c, lnum); + } + + dbg_mnt("fixup LEB %d, data len %d", lnum, len); + err = ubifs_leb_read(c, lnum, c->sbuf, 0, len, 1); + if (err) + return err; + + return ubifs_leb_change(c, lnum, c->sbuf, len); +} + +/** + * fixup_free_space - find & remap all LEBs containing free space. + * @c: UBIFS file-system description object + * + * This function walks through all LEBs in the filesystem and fiexes up those + * containing free/empty space. + */ +static int fixup_free_space(struct ubifs_info *c) +{ + int lnum, err = 0; + struct ubifs_lprops *lprops; + + ubifs_get_lprops(c); + + /* Fixup LEBs in the master area */ + for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) { + err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz); + if (err) + goto out; + } + + /* Unmap unused log LEBs */ + lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + while (lnum != c->ltail_lnum) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + lnum = ubifs_next_log_lnum(c, lnum); + } + + /* + * Fixup the log head which contains the only a CS node at the + * beginning. + */ + err = fixup_leb(c, c->lhead_lnum, + ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size)); + if (err) + goto out; + + /* Fixup LEBs in the LPT area */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + int free = c->ltab[lnum - c->lpt_first].free; + + if (free > 0) { + err = fixup_leb(c, lnum, c->leb_size - free); + if (err) + goto out; + } + } + + /* Unmap LEBs in the orphans area */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + } + + /* Fixup LEBs in the main area */ + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (lprops->free > 0) { + err = fixup_leb(c, lnum, c->leb_size - lprops->free); + if (err) + goto out; + } + } + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fixup_free_space - find & fix all LEBs with free space. + * @c: UBIFS file-system description object + * + * This function fixes up LEBs containing free space on first mount, if the + * appropriate flag was set when the FS was created. Each LEB with one or more + * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure + * the free space is actually erased. E.g., this is necessary for some NAND + * chips, since the free space may have been programmed like real "0xff" data + * (generating a non-0xff ECC), causing future writes to the not-really-erased + * NAND pages to behave badly. After the space is fixed up, the superblock flag + * is cleared, so that this is skipped for all future mounts. + */ +int ubifs_fixup_free_space(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + ubifs_assert(c->space_fixup); + ubifs_assert(!c->ro_mount); + + ubifs_msg(c, "start fixing up free space"); + + err = fixup_free_space(c); + if (err) + return err; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + /* Free-space fixup is no longer required */ + c->space_fixup = 0; + sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); + + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + return err; + + ubifs_msg(c, "free space fixup complete"); + return err; +} diff --git a/kernel/fs/ubifs/scan.c b/kernel/fs/ubifs/scan.c new file mode 100644 index 000000000..aab87340d --- /dev/null +++ b/kernel/fs/ubifs/scan.c @@ -0,0 +1,379 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the scan which is a general-purpose function for + * determining what nodes are in an eraseblock. The scan is used to replay the + * journal, to do garbage collection. for the TNC in-the-gaps method, and by + * debugging functions. + */ + +#include "ubifs.h" + +/** + * scan_padding_bytes - scan for padding bytes. + * @buf: buffer to scan + * @len: length of buffer + * + * This function returns the number of padding bytes on success and + * %SCANNED_GARBAGE on failure. + */ +static int scan_padding_bytes(void *buf, int len) +{ + int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len); + uint8_t *p = buf; + + dbg_scan("not a node"); + + while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE) + pad_len += 1; + + if (!pad_len || (pad_len & 7)) + return SCANNED_GARBAGE; + + dbg_scan("%d padding bytes", pad_len); + + return pad_len; +} + +/** + * ubifs_scan_a_node - scan for a node or padding. + * @c: UBIFS file-system description object + * @buf: buffer to scan + * @len: length of buffer + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @quiet: print no messages + * + * This function returns a scanning code to indicate what was scanned. + */ +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet) +{ + struct ubifs_ch *ch = buf; + uint32_t magic; + + magic = le32_to_cpu(ch->magic); + + if (magic == 0xFFFFFFFF) { + dbg_scan("hit empty space at LEB %d:%d", lnum, offs); + return SCANNED_EMPTY_SPACE; + } + + if (magic != UBIFS_NODE_MAGIC) + return scan_padding_bytes(buf, len); + + if (len < UBIFS_CH_SZ) + return SCANNED_GARBAGE; + + dbg_scan("scanning %s at LEB %d:%d", + dbg_ntype(ch->node_type), lnum, offs); + + if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) + return SCANNED_A_CORRUPT_NODE; + + if (ch->node_type == UBIFS_PAD_NODE) { + struct ubifs_pad_node *pad = buf; + int pad_len = le32_to_cpu(pad->pad_len); + int node_len = le32_to_cpu(ch->len); + + /* Validate the padding node */ + if (pad_len < 0 || + offs + node_len + pad_len > c->leb_size) { + if (!quiet) { + ubifs_err(c, "bad pad node at LEB %d:%d", + lnum, offs); + ubifs_dump_node(c, pad); + } + return SCANNED_A_BAD_PAD_NODE; + } + + /* Make the node pads to 8-byte boundary */ + if ((node_len + pad_len) & 7) { + if (!quiet) + ubifs_err(c, "bad padding length %d - %d", + offs, offs + node_len + pad_len); + return SCANNED_A_BAD_PAD_NODE; + } + + dbg_scan("%d bytes padded at LEB %d:%d, offset now %d", pad_len, + lnum, offs, ALIGN(offs + node_len + pad_len, 8)); + + return node_len + pad_len; + } + + return SCANNED_A_NODE; +} + +/** + * ubifs_start_scan - create LEB scanning information at start of scan. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function returns the scanned information on success and a negative error + * code on failure. + */ +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int err; + + dbg_scan("scan LEB %d:%d", lnum, offs); + + sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS); + if (!sleb) + return ERR_PTR(-ENOMEM); + + sleb->lnum = lnum; + INIT_LIST_HEAD(&sleb->nodes); + sleb->buf = sbuf; + + err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0); + if (err && err != -EBADMSG) { + ubifs_err(c, "cannot read %d bytes from LEB %d:%d, error %d", + c->leb_size - offs, lnum, offs, err); + kfree(sleb); + return ERR_PTR(err); + } + + /* + * Note, we ignore integrity errors (EBASMSG) because all the nodes are + * protected by CRC checksums. + */ + return sleb; +} + +/** + * ubifs_end_scan - update LEB scanning information at end of scan. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + */ +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs) +{ + lnum = lnum; + dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); + ubifs_assert(offs % c->min_io_size == 0); + + sleb->endpt = ALIGN(offs, c->min_io_size); +} + +/** + * ubifs_add_snod - add a scanned node to LEB scanning information. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @buf: buffer containing node + * @offs: offset of node on flash + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs) +{ + struct ubifs_ch *ch = buf; + struct ubifs_ino_node *ino = buf; + struct ubifs_scan_node *snod; + + snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); + if (!snod) + return -ENOMEM; + + snod->sqnum = le64_to_cpu(ch->sqnum); + snod->type = ch->node_type; + snod->offs = offs; + snod->len = le32_to_cpu(ch->len); + snod->node = buf; + + switch (ch->node_type) { + case UBIFS_INO_NODE: + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + case UBIFS_DATA_NODE: + /* + * The key is in the same place in all keyed + * nodes. + */ + key_read(c, &ino->key, &snod->key); + break; + default: + invalid_key_init(c, &snod->key); + break; + } + list_add_tail(&snod->list, &sleb->nodes); + sleb->nodes_cnt += 1; + return 0; +} + +/** + * ubifs_scanned_corruption - print information after UBIFS scanned corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number of corruption + * @offs: offset of corruption + * @buf: buffer containing corruption + */ +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf) +{ + int len; + + ubifs_err(c, "corruption at LEB %d:%d", lnum, offs); + len = c->leb_size - offs; + if (len > 8192) + len = 8192; + ubifs_err(c, "first %d bytes from LEB %d:%d", len, lnum, offs); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); +} + +/** + * ubifs_scan - scan a logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be of @c->leb_size bytes in size) + * @quiet: print no messages + * + * This function scans LEB number @lnum and returns complete information about + * its contents. Returns the scanned information in case of success and, + * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case + * of failure. + * + * If @quiet is non-zero, this function does not print large and scary + * error messages and flash dumps in case of errors. + */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf, int quiet) +{ + void *buf = sbuf + offs; + int err, len = c->leb_size - offs; + struct ubifs_scan_leb *sleb; + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + while (len >= 8) { + struct ubifs_ch *ch = buf; + int node_len, ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) + /* Empty space is checked later */ + break; + + switch (ret) { + case SCANNED_GARBAGE: + ubifs_err(c, "garbage"); + goto corrupted; + case SCANNED_A_NODE: + break; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + ubifs_err(c, "bad node"); + goto corrupted; + default: + ubifs_err(c, "unknown"); + err = -EINVAL; + goto error; + } + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + } + + if (offs % c->min_io_size) { + if (!quiet) + ubifs_err(c, "empty space starts at non-aligned offset %d", + offs); + goto corrupted; + } + + ubifs_end_scan(c, sleb, lnum, offs); + + for (; len > 4; offs += 4, buf = buf + 4, len -= 4) + if (*(uint32_t *)buf != 0xffffffff) + break; + for (; len; offs++, buf++, len--) + if (*(uint8_t *)buf != 0xff) { + if (!quiet) + ubifs_err(c, "corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + + return sleb; + +corrupted: + if (!quiet) { + ubifs_scanned_corruption(c, lnum, offs, buf); + ubifs_err(c, "LEB %d scanning failed", lnum); + } + err = -EUCLEAN; + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + +error: + ubifs_err(c, "LEB %d scanning failed, error %d", lnum, err); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * ubifs_scan_destroy - destroy LEB scanning information. + * @sleb: scanning information to free + */ +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *node; + struct list_head *head; + + head = &sleb->nodes; + while (!list_empty(head)) { + node = list_entry(head->next, struct ubifs_scan_node, list); + list_del(&node->list); + kfree(node); + } + kfree(sleb); +} diff --git a/kernel/fs/ubifs/shrinker.c b/kernel/fs/ubifs/shrinker.c new file mode 100644 index 000000000..9a9fb94a4 --- /dev/null +++ b/kernel/fs/ubifs/shrinker.c @@ -0,0 +1,331 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS shrinker which evicts clean znodes from the TNC + * tree when Linux VM needs more RAM. + * + * We do not implement any LRU lists to find oldest znodes to free because it + * would add additional overhead to the file system fast paths. So the shrinker + * just walks the TNC tree when searching for znodes to free. + * + * If the root of a TNC sub-tree is clean and old enough, then the children are + * also clean and old enough. So the shrinker walks the TNC in level order and + * dumps entire sub-trees. + * + * The age of znodes is just the time-stamp when they were last looked at. + * The current shrinker first tries to evict old znodes, then young ones. + * + * Since the shrinker is global, it has to protect against races with FS + * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. + */ + +#include "ubifs.h" + +/* List of all UBIFS file-system instances */ +LIST_HEAD(ubifs_infos); + +/* + * We number each shrinker run and record the number on the ubifs_info structure + * so that we can easily work out which ubifs_info structures have already been + * done by the current run. + */ +static unsigned int shrinker_run_no; + +/* Protects 'ubifs_infos' list */ +DEFINE_SPINLOCK(ubifs_infos_lock); + +/* Global clean znode counter (for all mounted UBIFS instances) */ +atomic_long_t ubifs_clean_zn_cnt; + +/** + * shrink_tnc - shrink TNC tree. + * @c: UBIFS file-system description object + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function traverses TNC tree and frees clean znodes. It does not free + * clean znodes which younger then @age. Returns number of freed znodes. + */ +static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) +{ + int total_freed = 0; + struct ubifs_znode *znode, *zprev; + int time = get_seconds(); + + ubifs_assert(mutex_is_locked(&c->umount_mutex)); + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + + if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) + return 0; + + /* + * Traverse the TNC tree in levelorder manner, so that it is possible + * to destroy large sub-trees. Indeed, if a znode is old, then all its + * children are older or of the same age. + * + * Note, we are holding 'c->tnc_mutex', so we do not have to lock the + * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is + * changed only when the 'c->tnc_mutex' is held. + */ + zprev = NULL; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + while (znode && total_freed < nr && + atomic_long_read(&c->clean_zn_cnt) > 0) { + int freed; + + /* + * If the znode is clean, but it is in the 'c->cnext' list, this + * means that this znode has just been written to flash as a + * part of commit and was marked clean. They will be removed + * from the list at end commit. We cannot change the list, + * because it is not protected by any mutex (design decision to + * make commit really independent and parallel to main I/O). So + * we just skip these znodes. + * + * Note, the 'clean_zn_cnt' counters are not updated until + * after the commit, so the UBIFS shrinker does not report + * the znodes which are in the 'c->cnext' list as freeable. + * + * Also note, if the root of a sub-tree is not in 'c->cnext', + * then the whole sub-tree is not in 'c->cnext' as well, so it + * is safe to dump whole sub-tree. + */ + + if (znode->cnext) { + /* + * Very soon these znodes will be removed from the list + * and become freeable. + */ + *contention = 1; + } else if (!ubifs_zn_dirty(znode) && + abs(time - znode->time) >= age) { + if (znode->parent) + znode->parent->zbranch[znode->iip].znode = NULL; + else + c->zroot.znode = NULL; + + freed = ubifs_destroy_tnc_subtree(znode); + atomic_long_sub(freed, &ubifs_clean_zn_cnt); + atomic_long_sub(freed, &c->clean_zn_cnt); + total_freed += freed; + znode = zprev; + } + + if (unlikely(!c->zroot.znode)) + break; + + zprev = znode; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + cond_resched(); + } + + return total_freed; +} + +/** + * shrink_tnc_trees - shrink UBIFS TNC trees. + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function walks the list of mounted UBIFS file-systems and frees clean + * znodes which are older than @age, until at least @nr znodes are freed. + * Returns the number of freed znodes. + */ +static int shrink_tnc_trees(int nr, int age, int *contention) +{ + struct ubifs_info *c; + struct list_head *p; + unsigned int run_no; + int freed = 0; + + spin_lock(&ubifs_infos_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + /* Iterate over all mounted UBIFS file-systems and try to shrink them */ + p = ubifs_infos.next; + while (p != &ubifs_infos) { + c = list_entry(p, struct ubifs_info, infos_list); + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (c->shrinker_run_no == run_no) + break; + if (!mutex_trylock(&c->umount_mutex)) { + /* Some un-mount is in progress, try next FS */ + *contention = 1; + p = p->next; + continue; + } + /* + * We're holding 'c->umount_mutex', so the file-system won't go + * away. + */ + if (!mutex_trylock(&c->tnc_mutex)) { + mutex_unlock(&c->umount_mutex); + *contention = 1; + p = p->next; + continue; + } + spin_unlock(&ubifs_infos_lock); + /* + * OK, now we have TNC locked, the file-system cannot go away - + * it is safe to reap the cache. + */ + c->shrinker_run_no = run_no; + freed += shrink_tnc(c, nr, age, contention); + mutex_unlock(&c->tnc_mutex); + spin_lock(&ubifs_infos_lock); + /* Get the next list element before we move this one */ + p = p->next; + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&c->infos_list, &ubifs_infos); + mutex_unlock(&c->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&ubifs_infos_lock); + return freed; +} + +/** + * kick_a_thread - kick a background thread to start commit. + * + * This function kicks a background thread to start background commit. Returns + * %-1 if a thread was kicked or there is another reason to assume the memory + * will soon be freed or become freeable. If there are no dirty znodes, returns + * %0. + */ +static int kick_a_thread(void) +{ + int i; + struct ubifs_info *c; + + /* + * Iterate over all mounted UBIFS file-systems and find out if there is + * already an ongoing commit operation there. If no, then iterate for + * the second time and initiate background commit. + */ + spin_lock(&ubifs_infos_lock); + for (i = 0; i < 2; i++) { + list_for_each_entry(c, &ubifs_infos, infos_list) { + long dirty_zn_cnt; + + if (!mutex_trylock(&c->umount_mutex)) { + /* + * Some un-mount is in progress, it will + * certainly free memory, so just return. + */ + spin_unlock(&ubifs_infos_lock); + return -1; + } + + dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); + + if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || + c->ro_mount || c->ro_error) { + mutex_unlock(&c->umount_mutex); + continue; + } + + if (c->cmt_state != COMMIT_RESTING) { + spin_unlock(&ubifs_infos_lock); + mutex_unlock(&c->umount_mutex); + return -1; + } + + if (i == 1) { + list_move_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + ubifs_request_bg_commit(c); + mutex_unlock(&c->umount_mutex); + return -1; + } + mutex_unlock(&c->umount_mutex); + } + } + spin_unlock(&ubifs_infos_lock); + + return 0; +} + +unsigned long ubifs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); + + /* + * Due to the way UBIFS updates the clean znode counter it may + * temporarily be negative. + */ + return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; +} + +unsigned long ubifs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + int contention = 0; + unsigned long freed; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); + + if (!clean_zn_cnt) { + /* + * No clean znodes, nothing to reap. All we can do in this case + * is to kick background threads to start commit, which will + * probably make clean znodes which, in turn, will be freeable. + * And we return -1 which means will make VM call us again + * later. + */ + dbg_tnc("no clean znodes, kick a thread"); + return kick_a_thread(); + } + + freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough old znodes, try to free young ones"); + freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough young znodes, free all"); + freed += shrink_tnc_trees(nr - freed, 0, &contention); + + if (!freed && contention) { + dbg_tnc("freed nothing, but contention"); + return SHRINK_STOP; + } + +out: + dbg_tnc("%lu znodes were freed, requested %lu", freed, nr); + return freed; +} diff --git a/kernel/fs/ubifs/super.c b/kernel/fs/ubifs/super.c new file mode 100644 index 000000000..75e6f04bb --- /dev/null +++ b/kernel/fs/ubifs/super.c @@ -0,0 +1,2300 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS initialization and VFS superblock operations. Some + * initialization stuff which is rather large and complex is placed at + * corresponding subsystems, but most of it is here. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs.h" + +/* + * Maximum amount of memory we may 'kmalloc()' without worrying that we are + * allocating too much. + */ +#define UBIFS_KMALLOC_OK (128*1024) + +/* Slab cache for UBIFS inodes */ +struct kmem_cache *ubifs_inode_slab; + +/* UBIFS TNC shrinker description */ +static struct shrinker ubifs_shrinker_info = { + .scan_objects = ubifs_shrink_scan, + .count_objects = ubifs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +/** + * validate_inode - validate inode. + * @c: UBIFS file-system description object + * @inode: the inode to validate + * + * This is a helper function for 'ubifs_iget()' which validates various fields + * of a newly built inode to make sure they contain sane values and prevent + * possible vulnerabilities. Returns zero if the inode is all right and + * a non-zero error code if not. + */ +static int validate_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + const struct ubifs_inode *ui = ubifs_inode(inode); + + if (inode->i_size > c->max_inode_sz) { + ubifs_err(c, "inode is too large (%lld)", + (long long)inode->i_size); + return 1; + } + + if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { + ubifs_err(c, "unknown compression type %d", ui->compr_type); + return 2; + } + + if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) + return 3; + + if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) + return 4; + + if (ui->xattr && !S_ISREG(inode->i_mode)) + return 5; + + if (!ubifs_compr_present(ui->compr_type)) { + ubifs_warn(c, "inode %lu uses '%s' compression, but it was not compiled in", + inode->i_ino, ubifs_compr_name(ui->compr_type)); + } + + err = dbg_check_dir(c, inode); + return err; +} + +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) +{ + int err; + union ubifs_key key; + struct ubifs_ino_node *ino; + struct ubifs_info *c = sb->s_fs_info; + struct inode *inode; + struct ubifs_inode *ui; + + dbg_gen("inode %lu", inum); + + inode = iget_locked(sb, inum); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + ui = ubifs_inode(inode); + + ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ino) { + err = -ENOMEM; + goto out; + } + + ino_key_init(c, &key, inode->i_ino); + + err = ubifs_tnc_lookup(c, &key, ino); + if (err) + goto out_ino; + + inode->i_flags |= (S_NOCMTIME | S_NOATIME); + set_nlink(inode, le32_to_cpu(ino->nlink)); + i_uid_write(inode, le32_to_cpu(ino->uid)); + i_gid_write(inode, le32_to_cpu(ino->gid)); + inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); + inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); + inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); + inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); + inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); + inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode->i_mode = le32_to_cpu(ino->mode); + inode->i_size = le64_to_cpu(ino->size); + + ui->data_len = le32_to_cpu(ino->data_len); + ui->flags = le32_to_cpu(ino->flags); + ui->compr_type = le16_to_cpu(ino->compr_type); + ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); + ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + ui->xattr_size = le32_to_cpu(ino->xattr_size); + ui->xattr_names = le32_to_cpu(ino->xattr_names); + ui->synced_i_size = ui->ui_size = inode->i_size; + + ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0; + + err = validate_inode(c, inode); + if (err) + goto out_invalid; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + if (ui->xattr) { + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + } else if (ui->data_len != 0) { + err = 10; + goto out_invalid; + } + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + if (ui->data_len != 0) { + err = 11; + goto out_invalid; + } + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { + err = 12; + goto out_invalid; + } + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + break; + case S_IFBLK: + case S_IFCHR: + { + dev_t rdev; + union ubifs_dev_desc *dev; + + ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + + dev = (union ubifs_dev_desc *)ino->data; + if (ui->data_len == sizeof(dev->new)) + rdev = new_decode_dev(le32_to_cpu(dev->new)); + else if (ui->data_len == sizeof(dev->huge)) + rdev = huge_decode_dev(le64_to_cpu(dev->huge)); + else { + err = 13; + goto out_invalid; + } + memcpy(ui->data, ino->data, ui->data_len); + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, 0); + if (ui->data_len != 0) { + err = 14; + goto out_invalid; + } + break; + default: + err = 15; + goto out_invalid; + } + + kfree(ino); + ubifs_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +out_invalid: + ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); + ubifs_dump_node(c, ino); + ubifs_dump_inode(c, inode); + err = -EINVAL; +out_ino: + kfree(ino); +out: + ubifs_err(c, "failed to read inode %lu, error %d", inode->i_ino, err); + iget_failed(inode); + return ERR_PTR(err); +} + +static struct inode *ubifs_alloc_inode(struct super_block *sb) +{ + struct ubifs_inode *ui; + + ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); + if (!ui) + return NULL; + + memset((void *)ui + sizeof(struct inode), 0, + sizeof(struct ubifs_inode) - sizeof(struct inode)); + mutex_init(&ui->ui_mutex); + spin_lock_init(&ui->ui_lock); + return &ui->vfs_inode; +}; + +static void ubifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ubifs_inode *ui = ubifs_inode(inode); + kmem_cache_free(ubifs_inode_slab, ui); +} + +static void ubifs_destroy_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + kfree(ui->data); + call_rcu(&inode->i_rcu, ubifs_i_callback); +} + +/* + * Note, Linux write-back code calls this without 'i_mutex'. + */ +static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err = 0; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(!ui->xattr); + if (is_bad_inode(inode)) + return 0; + + mutex_lock(&ui->ui_mutex); + /* + * Due to races between write-back forced by budgeting + * (see 'sync_some_inodes()') and background write-back, the inode may + * have already been synchronized, do not do this again. This might + * also happen if it was synchronized in an VFS operation, e.g. + * 'ubifs_link()'. + */ + if (!ui->dirty) { + mutex_unlock(&ui->ui_mutex); + return 0; + } + + /* + * As an optimization, do not write orphan inodes to the media just + * because this is not needed. + */ + dbg_gen("inode %lu, mode %#x, nlink %u", + inode->i_ino, (int)inode->i_mode, inode->i_nlink); + if (inode->i_nlink) { + err = ubifs_jnl_write_inode(c, inode); + if (err) + ubifs_err(c, "can't write inode %lu, error %d", + inode->i_ino, err); + else + err = dbg_check_inode_size(c, inode, ui->ui_size); + } + + ui->dirty = 0; + mutex_unlock(&ui->ui_mutex); + ubifs_release_dirty_inode_budget(c, ui); + return err; +} + +static void ubifs_evict_inode(struct inode *inode) +{ + int err; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + if (ui->xattr) + /* + * Extended attribute inode deletions are fully handled in + * 'ubifs_removexattr()'. These inodes are special and have + * limited usage, so there is nothing to do here. + */ + goto out; + + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); + ubifs_assert(!atomic_read(&inode->i_count)); + + truncate_inode_pages_final(&inode->i_data); + + if (inode->i_nlink) + goto done; + + if (is_bad_inode(inode)) + goto out; + + ui->ui_size = inode->i_size = 0; + err = ubifs_jnl_delete_inode(c, inode); + if (err) + /* + * Worst case we have a lost orphan inode wasting space, so a + * simple error message is OK here. + */ + ubifs_err(c, "can't delete inode %lu, error %d", + inode->i_ino, err); + +out: + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } +done: + clear_inode(inode); +} + +static void ubifs_dirty_inode(struct inode *inode, int flags) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + if (!ui->dirty) { + ui->dirty = 1; + dbg_gen("inode %lu", inode->i_ino); + } +} + +static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ubifs_info *c = dentry->d_sb->s_fs_info; + unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; + + free = ubifs_get_free_space(c); + dbg_gen("free space %lld bytes (%lld blocks)", + free, free >> UBIFS_BLOCK_SHIFT); + + buf->f_type = UBIFS_SUPER_MAGIC; + buf->f_bsize = UBIFS_BLOCK_SIZE; + buf->f_blocks = c->block_cnt; + buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; + if (free > c->report_rp_size) + buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; + else + buf->f_bavail = 0; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = UBIFS_MAX_NLEN; + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); + ubifs_assert(buf->f_bfree <= c->block_cnt); + return 0; +} + +static int ubifs_show_options(struct seq_file *s, struct dentry *root) +{ + struct ubifs_info *c = root->d_sb->s_fs_info; + + if (c->mount_opts.unmount_mode == 2) + seq_puts(s, ",fast_unmount"); + else if (c->mount_opts.unmount_mode == 1) + seq_puts(s, ",norm_unmount"); + + if (c->mount_opts.bulk_read == 2) + seq_puts(s, ",bulk_read"); + else if (c->mount_opts.bulk_read == 1) + seq_puts(s, ",no_bulk_read"); + + if (c->mount_opts.chk_data_crc == 2) + seq_puts(s, ",chk_data_crc"); + else if (c->mount_opts.chk_data_crc == 1) + seq_puts(s, ",no_chk_data_crc"); + + if (c->mount_opts.override_compr) { + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); + } + + return 0; +} + +static int ubifs_sync_fs(struct super_block *sb, int wait) +{ + int i, err; + struct ubifs_info *c = sb->s_fs_info; + + /* + * Zero @wait is just an advisory thing to help the file system shove + * lots of data into the queues, and there will be the second + * '->sync_fs()' call, with non-zero @wait. + */ + if (!wait) + return 0; + + /* + * Synchronize write buffers, because 'ubifs_run_commit()' does not + * do this if it waits for an already running commit. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + /* + * Strictly speaking, it is not necessary to commit the journal here, + * synchronizing write-buffers would be enough. But committing makes + * UBIFS free space predictions much more accurate, so we want to let + * the user be able to get more accurate results of 'statfs()' after + * they synchronize the file system. + */ + err = ubifs_run_commit(c); + if (err) + return err; + + return ubi_sync(c->vi.ubi_num); +} + +/** + * init_constants_early - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This function initialize UBIFS constants which do not need the superblock to + * be read. It also checks that the UBI volume satisfies basic UBIFS + * requirements. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int init_constants_early(struct ubifs_info *c) +{ + if (c->vi.corrupted) { + ubifs_warn(c, "UBI volume is corrupted - read-only mode"); + c->ro_media = 1; + } + + if (c->di.ro_mode) { + ubifs_msg(c, "read-only UBI device"); + c->ro_media = 1; + } + + if (c->vi.vol_type == UBI_STATIC_VOLUME) { + ubifs_msg(c, "static UBI volume - read-only mode"); + c->ro_media = 1; + } + + c->leb_cnt = c->vi.size; + c->leb_size = c->vi.usable_leb_size; + c->leb_start = c->di.leb_start; + c->half_leb_size = c->leb_size / 2; + c->min_io_size = c->di.min_io_size; + c->min_io_shift = fls(c->min_io_size) - 1; + c->max_write_size = c->di.max_write_size; + c->max_write_shift = fls(c->max_write_size) - 1; + + if (c->leb_size < UBIFS_MIN_LEB_SZ) { + ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes", + c->leb_size, UBIFS_MIN_LEB_SZ); + return -EINVAL; + } + + if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err(c, "too few LEBs (%d), min. is %d", + c->leb_cnt, UBIFS_MIN_LEB_CNT); + return -EINVAL; + } + + if (!is_power_of_2(c->min_io_size)) { + ubifs_err(c, "bad min. I/O size %d", c->min_io_size); + return -EINVAL; + } + + /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (c->max_write_size < c->min_io_size || + c->max_write_size % c->min_io_size || + !is_power_of_2(c->max_write_size)) { + ubifs_err(c, "bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); + return -EINVAL; + } + + /* + * UBIFS aligns all node to 8-byte boundary, so to make function in + * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is + * less than 8. + */ + if (c->min_io_size < 8) { + c->min_io_size = 8; + c->min_io_shift = 3; + if (c->max_write_size < c->min_io_size) { + c->max_write_size = c->min_io_size; + c->max_write_shift = c->min_io_shift; + } + } + + c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); + c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size); + + /* + * Initialize node length ranges which are mostly needed for node + * length validation. + */ + c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ; + c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ; + c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ; + c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ; + c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ; + c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ; + + c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; + c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; + c->ranges[UBIFS_ORPH_NODE].min_len = + UBIFS_ORPH_NODE_SZ + sizeof(__le64); + c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size; + c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ; + c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ; + /* + * Minimum indexing node size is amended later when superblock is + * read and the key length is known. + */ + c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ; + /* + * Maximum indexing node size is amended later when superblock is + * read and the fanout is known. + */ + c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; + + /* + * Initialize dead and dark LEB space watermarks. See gc.c for comments + * about these values. + */ + c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); + c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + + /* Buffer size for bulk-reads */ + c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; + if (c->max_bu_buf_len > c->leb_size) + c->max_bu_buf_len = c->leb_size; + return 0; +} + +/** + * bud_wbuf_callback - bud LEB write-buffer synchronization call-back. + * @c: UBIFS file-system description object + * @lnum: LEB the write-buffer was synchronized to + * @free: how many free bytes left in this LEB + * @pad: how many bytes were padded + * + * This is a callback function which is called by the I/O unit when the + * write-buffer is synchronized. We need this to correctly maintain space + * accounting in bud logical eraseblocks. This function returns zero in case of + * success and a negative error code in case of failure. + * + * This function actually belongs to the journal, but we keep it here because + * we want to keep it static. + */ +static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) +{ + return ubifs_update_one_lp(c, lnum, free, pad, 0, 0); +} + +/* + * init_constants_sb - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the superblock has been read. It also checks various UBIFS parameters and + * makes sure they are all right. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int init_constants_sb(struct ubifs_info *c) +{ + int tmp, err; + long long tmp64; + + c->main_bytes = (long long)c->main_lebs * c->leb_size; + c->max_znode_sz = sizeof(struct ubifs_znode) + + c->fanout * sizeof(struct ubifs_zbranch); + + tmp = ubifs_idx_node_sz(c, 1); + c->ranges[UBIFS_IDX_NODE].min_len = tmp; + c->min_idx_node_sz = ALIGN(tmp, 8); + + tmp = ubifs_idx_node_sz(c, c->fanout); + c->ranges[UBIFS_IDX_NODE].max_len = tmp; + c->max_idx_node_sz = ALIGN(tmp, 8); + + /* Make sure LEB size is large enough to fit full commit */ + tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; + tmp = ALIGN(tmp, c->min_io_size); + if (tmp > c->leb_size) { + ubifs_err(c, "too small LEB size %d, at least %d needed", + c->leb_size, tmp); + return -EINVAL; + } + + /* + * Make sure that the log is large enough to fit reference nodes for + * all buds plus one reserved LEB. + */ + tmp64 = c->max_bud_bytes + c->leb_size - 1; + c->max_bud_cnt = div_u64(tmp64, c->leb_size); + tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); + tmp /= c->leb_size; + tmp += 1; + if (c->log_lebs < tmp) { + ubifs_err(c, "too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); + return -EINVAL; + } + + /* + * When budgeting we assume worst-case scenarios when the pages are not + * be compressed and direntries are of the maximum size. + * + * Note, data, which may be stored in inodes is budgeted separately, so + * it is not included into 'c->bi.inode_budget'. + */ + c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->bi.inode_budget = UBIFS_INO_NODE_SZ; + c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; + + /* + * When the amount of flash space used by buds becomes + * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit. + * The writers are unblocked when the commit is finished. To avoid + * writers to be blocked UBIFS initiates background commit in advance, + * when number of bud bytes becomes above the limit defined below. + */ + c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; + + /* + * Ensure minimum journal size. All the bytes in the journal heads are + * considered to be used, when calculating the current journal usage. + * Consequently, if the journal is too small, UBIFS will treat it as + * always full. + */ + tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1; + if (c->bg_bud_bytes < tmp64) + c->bg_bud_bytes = tmp64; + if (c->max_bud_bytes < tmp64 + c->leb_size) + c->max_bud_bytes = tmp64 + c->leb_size; + + err = ubifs_calc_lpt_geom(c); + if (err) + return err; + + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; + return 0; +} + +/* + * init_constants_master - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the master node has been read. It also checks various UBIFS parameters and + * makes sure they are all right. + */ +static void init_constants_master(struct ubifs_info *c) +{ + long long tmp64; + + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); + + /* + * Calculate total amount of FS blocks. This number is not used + * internally because it does not make much sense for UBIFS, but it is + * necessary to report something for the 'statfs()' call. + * + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, minimum LEBs for the index, and assume only one journal + * head is available. + */ + tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1; + tmp64 *= (long long)c->leb_size - c->leb_overhead; + tmp64 = ubifs_reported_space(c, tmp64); + c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; +} + +/** + * take_gc_lnum - reserve GC LEB. + * @c: UBIFS file-system description object + * + * This function ensures that the LEB reserved for garbage collection is marked + * as "taken" in lprops. We also have to set free space to LEB size and dirty + * space to zero, because lprops may contain out-of-date information if the + * file-system was un-mounted before it has been committed. This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +static int take_gc_lnum(struct ubifs_info *c) +{ + int err; + + if (c->gc_lnum == -1) { + ubifs_err(c, "no LEB for GC"); + return -EINVAL; + } + + /* And we have to tell lprops that this LEB is taken */ + err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, + LPROPS_TAKEN, 0, 0); + return err; +} + +/** + * alloc_wbufs - allocate write-buffers. + * @c: UBIFS file-system description object + * + * This helper function allocates and initializes UBIFS write-buffers. Returns + * zero in case of success and %-ENOMEM in case of failure. + */ +static int alloc_wbufs(struct ubifs_info *c) +{ + int i, err; + + c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead), + GFP_KERNEL); + if (!c->jheads) + return -ENOMEM; + + /* Initialize journal heads */ + for (i = 0; i < c->jhead_cnt; i++) { + INIT_LIST_HEAD(&c->jheads[i].buds_list); + err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); + if (err) + return err; + + c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; + c->jheads[i].wbuf.jhead = i; + c->jheads[i].grouped = 1; + } + + /* + * Garbage Collector head does not need to be synchronized by timer. + * Also GC head nodes are not grouped. + */ + c->jheads[GCHD].wbuf.no_timer = 1; + c->jheads[GCHD].grouped = 0; + + return 0; +} + +/** + * free_wbufs - free write-buffers. + * @c: UBIFS file-system description object + */ +static void free_wbufs(struct ubifs_info *c) +{ + int i; + + if (c->jheads) { + for (i = 0; i < c->jhead_cnt; i++) { + kfree(c->jheads[i].wbuf.buf); + kfree(c->jheads[i].wbuf.inodes); + } + kfree(c->jheads); + c->jheads = NULL; + } +} + +/** + * free_orphans - free orphans. + * @c: UBIFS file-system description object + */ +static void free_orphans(struct ubifs_info *c) +{ + struct ubifs_orphan *orph; + + while (c->orph_dnext) { + orph = c->orph_dnext; + c->orph_dnext = orph->dnext; + list_del(&orph->list); + kfree(orph); + } + + while (!list_empty(&c->orph_list)) { + orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); + list_del(&orph->list); + kfree(orph); + ubifs_err(c, "orphan list not empty at unmount"); + } + + vfree(c->orph_buf); + c->orph_buf = NULL; +} + +/** + * free_buds - free per-bud objects. + * @c: UBIFS file-system description object + */ +static void free_buds(struct ubifs_info *c) +{ + struct ubifs_bud *bud, *n; + + rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) + kfree(bud); +} + +/** + * check_volume_empty - check if the UBI volume is empty. + * @c: UBIFS file-system description object + * + * This function checks if the UBIFS volume is empty by looking if its LEBs are + * mapped or not. The result of checking is stored in the @c->empty variable. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int check_volume_empty(struct ubifs_info *c) +{ + int lnum, err; + + c->empty = 1; + for (lnum = 0; lnum < c->leb_cnt; lnum++) { + err = ubifs_is_mapped(c, lnum); + if (unlikely(err < 0)) + return err; + if (err == 1) { + c->empty = 0; + break; + } + + cond_resched(); + } + + return 0; +} + +/* + * UBIFS mount options. + * + * Opt_fast_unmount: do not run a journal commit before un-mounting + * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_bulk_read: enable bulk-reads + * Opt_no_bulk_read: disable bulk-reads + * Opt_chk_data_crc: check CRCs when reading data nodes + * Opt_no_chk_data_crc: do not check CRCs when reading data nodes + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_fast_unmount, + Opt_norm_unmount, + Opt_bulk_read, + Opt_no_bulk_read, + Opt_chk_data_crc, + Opt_no_chk_data_crc, + Opt_override_compr, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_fast_unmount, "fast_unmount"}, + {Opt_norm_unmount, "norm_unmount"}, + {Opt_bulk_read, "bulk_read"}, + {Opt_no_bulk_read, "no_bulk_read"}, + {Opt_chk_data_crc, "chk_data_crc"}, + {Opt_no_chk_data_crc, "no_chk_data_crc"}, + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +/** + * parse_standard_option - parse a standard mount option. + * @option: the option to parse + * + * Normally, standard mount options like "sync" are passed to file-systems as + * flags. However, when a "rootflags=" kernel boot parameter is used, they may + * be present in the options string. This function tries to deal with this + * situation and parse standard options. Returns 0 if the option was not + * recognized, and the corresponding integer flag if it was. + * + * UBIFS is only interested in the "sync" option, so do not check for anything + * else. + */ +static int parse_standard_option(const char *option) +{ + + pr_notice("UBIFS: parse %s\n", option); + if (!strcmp(option, "sync")) + return MS_SYNCHRONOUS; + return 0; +} + +/** + * ubifs_parse_options - parse mount parameters. + * @c: UBIFS file-system description object + * @options: parameters to parse + * @is_remount: non-zero if this is FS re-mount + * + * This function parses UBIFS mount options and returns zero in case success + * and a negative error code in case of failure. + */ +static int ubifs_parse_options(struct ubifs_info *c, char *options, + int is_remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + /* + * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. + * We accept them in order to be backward-compatible. But this + * should be removed at some point. + */ + case Opt_fast_unmount: + c->mount_opts.unmount_mode = 2; + break; + case Opt_norm_unmount: + c->mount_opts.unmount_mode = 1; + break; + case Opt_bulk_read: + c->mount_opts.bulk_read = 2; + c->bulk_read = 1; + break; + case Opt_no_bulk_read: + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + break; + case Opt_chk_data_crc: + c->mount_opts.chk_data_crc = 2; + c->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + c->mount_opts.chk_data_crc = 1; + c->no_chk_data_crc = 1; + break; + case Opt_override_compr: + { + char *name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr_type = UBIFS_COMPR_NONE; + else if (!strcmp(name, "lzo")) + c->mount_opts.compr_type = UBIFS_COMPR_LZO; + else if (!strcmp(name, "zlib")) + c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; + else { + ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready? + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = 1; + c->default_compr = c->mount_opts.compr_type; + break; + } + default: + { + unsigned long flag; + struct super_block *sb = c->vfs_sb; + + flag = parse_standard_option(p); + if (!flag) { + ubifs_err(c, "unrecognized mount option \"%s\" or missing value", + p); + return -EINVAL; + } + sb->s_flags |= flag; + break; + } + } + } + + return 0; +} + +/** + * destroy_journal - destroy journal data structures. + * @c: UBIFS file-system description object + * + * This function destroys journal data structures including those that may have + * been created by recovery functions. + */ +static void destroy_journal(struct ubifs_info *c) +{ + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + list_del(&ucleb->list); + kfree(ucleb); + } + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + list_del(&bud->list); + kfree(bud); + } + ubifs_destroy_idx_gc(c); + ubifs_destroy_size_tree(c); + ubifs_tnc_close(c); + free_buds(c); +} + +/** + * bu_init - initialize bulk-read information. + * @c: UBIFS file-system description object + */ +static void bu_init(struct ubifs_info *c) +{ + ubifs_assert(c->bulk_read == 1); + + if (c->bu.buf) + return; /* Already initialized */ + +again: + c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN); + if (!c->bu.buf) { + if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) { + c->max_bu_buf_len = UBIFS_KMALLOC_OK; + goto again; + } + + /* Just disable bulk-read */ + ubifs_warn(c, "cannot allocate %d bytes of memory for bulk-read, disabling it", + c->max_bu_buf_len); + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + return; + } +} + +/** + * check_free_space - check if there is enough free space to mount. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free space to be mounted in + * read/write mode. UBIFS must always have some free space to allow deletions. + */ +static int check_free_space(struct ubifs_info *c) +{ + ubifs_assert(c->dark_wm > 0); + if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { + ubifs_err(c, "insufficient free space to mount in R/W mode"); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); + return -ENOSPC; + } + return 0; +} + +/** + * mount_ubifs - mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * This function mounts UBIFS file system. Returns zero in case of success and + * a negative error code in case of failure. + */ +static int mount_ubifs(struct ubifs_info *c) +{ + int err; + long long x, y; + size_t sz; + + c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); + /* Suppress error messages while probing if MS_SILENT is set */ + c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); + + err = init_constants_early(c); + if (err) + return err; + + err = ubifs_debugging_init(c); + if (err) + return err; + + err = check_volume_empty(c); + if (err) + goto out_free; + + if (c->empty && (c->ro_mount || c->ro_media)) { + /* + * This UBI volume is empty, and read-only, or the file system + * is mounted read-only - we cannot format it. + */ + ubifs_err(c, "can't format empty UBI volume: read-only %s", + c->ro_media ? "UBI volume" : "mount"); + err = -EROFS; + goto out_free; + } + + if (c->ro_media && !c->ro_mount) { + ubifs_err(c, "cannot mount read-write - read-only media"); + err = -EROFS; + goto out_free; + } + + /* + * The requirement for the buffer is that it should fit indexing B-tree + * height amount of integers. We assume the height if the TNC tree will + * never exceed 64. + */ + err = -ENOMEM; + c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL); + if (!c->bottom_up_buf) + goto out_free; + + c->sbuf = vmalloc(c->leb_size); + if (!c->sbuf) + goto out_free; + + if (!c->ro_mount) { + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) + goto out_free; + } + + if (c->bulk_read == 1) + bu_init(c); + + if (!c->ro_mount) { + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + GFP_KERNEL); + if (!c->write_reserve_buf) + goto out_free; + } + + c->mounting = 1; + + err = ubifs_read_superblock(c); + if (err) + goto out_free; + + c->probing = 0; + + /* + * Make sure the compressor which is set as default in the superblock + * or overridden by mount options is actually compiled in. + */ + if (!ubifs_compr_present(c->default_compr)) { + ubifs_err(c, "'compressor \"%s\" is not compiled in", + ubifs_compr_name(c->default_compr)); + err = -ENOTSUPP; + goto out_free; + } + + err = init_constants_sb(c); + if (err) + goto out_free; + + sz = ALIGN(c->max_idx_node_sz, c->min_io_size); + sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); + c->cbuf = kmalloc(sz, GFP_NOFS); + if (!c->cbuf) { + err = -ENOMEM; + goto out_free; + } + + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); + if (!c->ro_mount) { + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err(c, "cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out_wbufs; + } + wake_up_process(c->bgt); + } + + err = ubifs_read_master(c); + if (err) + goto out_master; + + init_constants_master(c); + + if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { + ubifs_msg(c, "recovery needed"); + c->need_recovery = 1; + } + + if (c->need_recovery && !c->ro_mount) { + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out_master; + } + + err = ubifs_lpt_init(c, 1, !c->ro_mount); + if (err) + goto out_master; + + if (!c->ro_mount && c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out_lpt; + } + + if (!c->ro_mount && !c->need_recovery) { + /* + * Set the "dirty" flag so that if we reboot uncleanly we + * will notice this immediately on the next mount. + */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out_lpt; + } + + err = dbg_check_idx_size(c, c->bi.old_idx_sz); + if (err) + goto out_lpt; + + err = ubifs_replay_journal(c); + if (err) + goto out_journal; + + /* Calculate 'min_idx_lebs' after journal replay */ + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); + if (err) + goto out_orphans; + + if (!c->ro_mount) { + int lnum; + + err = check_free_space(c); + if (err) + goto out_orphans; + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out_orphans; + } + + if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + err = ubifs_rcvry_gc_commit(c); + if (err) + goto out_orphans; + } else { + err = take_gc_lnum(c); + if (err) + goto out_orphans; + + /* + * GC LEB may contain garbage if there was an unclean + * reboot, and it should be un-mapped. + */ + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + goto out_orphans; + } + + err = dbg_check_lprops(c); + if (err) + goto out_orphans; + } else if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + } else { + /* + * Even if we mount read-only, we have to set space in GC LEB + * to proper value because this affects UBIFS free space + * reporting. We do not want to have a situation when + * re-mounting from R/O to R/W changes amount of free space. + */ + err = take_gc_lnum(c); + if (err) + goto out_orphans; + } + + spin_lock(&ubifs_infos_lock); + list_add_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + if (c->need_recovery) { + if (c->ro_mount) + ubifs_msg(c, "recovery deferred"); + else { + c->need_recovery = 0; + ubifs_msg(c, "recovery completed"); + /* + * GC LEB has to be empty and taken at this point. But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. + */ + ubifs_assert(c->lst.taken_empty_lebs > 0); + } + } else + ubifs_assert(c->lst.taken_empty_lebs > 0); + + err = dbg_check_filesystem(c); + if (err) + goto out_infos; + + err = dbg_debugfs_init_fs(c); + if (err) + goto out_infos; + + c->mounting = 0; + + ubifs_msg(c, "UBIFS: mounted UBI device %d, volume %d, name \"%s\"%s", + c->vi.ubi_num, c->vi.vol_id, c->vi.name, + c->ro_mount ? ", R/O mode" : ""); + x = (long long)c->main_lebs * c->leb_size; + y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", + c->leb_size, c->leb_size >> 10, c->min_io_size, + c->max_write_size); + ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)", + x, x >> 20, c->main_lebs, + y, y >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); + ubifs_msg(c, "media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid, + c->big_lpt ? ", big LPT model" : ", small LPT model"); + + dbg_gen("default compressor: %s", ubifs_compr_name(c->default_compr)); + dbg_gen("data journal heads: %d", + c->jhead_cnt - NONDATA_JHEADS_CNT); + dbg_gen("log LEBs: %d (%d - %d)", + c->log_lebs, UBIFS_LOG_LNUM, c->log_last); + dbg_gen("LPT area LEBs: %d (%d - %d)", + c->lpt_lebs, c->lpt_first, c->lpt_last); + dbg_gen("orphan area LEBs: %d (%d - %d)", + c->orph_lebs, c->orph_first, c->orph_last); + dbg_gen("main area LEBs: %d (%d - %d)", + c->main_lebs, c->main_first, c->leb_cnt - 1); + dbg_gen("index LEBs: %d", c->lst.idx_lebs); + dbg_gen("total index bytes: %lld (%lld KiB, %lld MiB)", + c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, + c->bi.old_idx_sz >> 20); + dbg_gen("key hash type: %d", c->key_hash_type); + dbg_gen("tree fanout: %d", c->fanout); + dbg_gen("reserved GC LEB: %d", c->gc_lnum); + dbg_gen("max. znode size %d", c->max_znode_sz); + dbg_gen("max. index node size %d", c->max_idx_node_sz); + dbg_gen("node sizes: data %zu, inode %zu, dentry %zu", + UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); + dbg_gen("node sizes: trun %zu, sb %zu, master %zu", + UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); + dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu", + UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); + dbg_gen("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", + UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, + UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); + dbg_gen("dead watermark: %d", c->dead_wm); + dbg_gen("dark watermark: %d", c->dark_wm); + dbg_gen("LEB overhead: %d", c->leb_overhead); + x = (long long)c->main_lebs * c->dark_wm; + dbg_gen("max. dark space: %lld (%lld KiB, %lld MiB)", + x, x >> 10, x >> 20); + dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + c->max_bud_bytes, c->max_bud_bytes >> 10, + c->max_bud_bytes >> 20); + dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + c->bg_bud_bytes, c->bg_bud_bytes >> 10, + c->bg_bud_bytes >> 20); + dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)", + c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); + dbg_gen("max. seq. number: %llu", c->max_sqnum); + dbg_gen("commit number: %llu", c->cmt_no); + + return 0; + +out_infos: + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); +out_orphans: + free_orphans(c); +out_journal: + destroy_journal(c); +out_lpt: + ubifs_lpt_free(c, 0); +out_master: + kfree(c->mst_node); + kfree(c->rcvrd_mst_node); + if (c->bgt) + kthread_stop(c->bgt); +out_wbufs: + free_wbufs(c); +out_cbuf: + kfree(c->cbuf); +out_free: + kfree(c->write_reserve_buf); + kfree(c->bu.buf); + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + ubifs_debugging_exit(c); + return err; +} + +/** + * ubifs_umount - un-mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * Note, this function is called to free allocated resourced when un-mounting, + * as well as free resources when an error occurred while we were half way + * through mounting (error path cleanup function). So it has to make sure the + * resource was actually allocated before freeing it. + */ +static void ubifs_umount(struct ubifs_info *c) +{ + dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + + dbg_debugfs_exit_fs(c); + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); + + if (c->bgt) + kthread_stop(c->bgt); + + destroy_journal(c); + free_wbufs(c); + free_orphans(c); + ubifs_lpt_free(c, 0); + + kfree(c->cbuf); + kfree(c->rcvrd_mst_node); + kfree(c->mst_node); + kfree(c->write_reserve_buf); + kfree(c->bu.buf); + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + ubifs_debugging_exit(c); +} + +/** + * ubifs_remount_rw - re-mount in read-write mode. + * @c: UBIFS file-system description object + * + * UBIFS avoids allocating many unnecessary resources when mounted in read-only + * mode. This function allocates the needed resources and re-mounts UBIFS in + * read-write mode. + */ +static int ubifs_remount_rw(struct ubifs_info *c) +{ + int err, lnum; + + if (c->rw_incompat) { + ubifs_err(c, "the file-system is not R/W-compatible"); + ubifs_msg(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } + + mutex_lock(&c->umount_mutex); + dbg_save_space_info(c); + c->remounting_rw = 1; + c->ro_mount = 0; + + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out; + } + + err = check_free_space(c); + if (err) + goto out; + + if (c->old_leb_cnt != c->leb_cnt) { + struct ubifs_sb_node *sup; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) { + err = PTR_ERR(sup); + goto out; + } + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + goto out; + } + + if (c->need_recovery) { + ubifs_msg(c, "completing deferred recovery"); + err = ubifs_write_rcvrd_mst_node(c); + if (err) + goto out; + err = ubifs_recover_size(c); + if (err) + goto out; + err = ubifs_clean_lebs(c, c->sbuf); + if (err) + goto out; + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out; + } else { + /* A readonly mount is not allowed to have orphans */ + ubifs_assert(c->tot_orphans == 0); + err = ubifs_clear_orphans(c); + if (err) + goto out; + } + + if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out; + } + + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) { + err = -ENOMEM; + goto out; + } + + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + if (!c->write_reserve_buf) { + err = -ENOMEM; + goto out; + } + + err = ubifs_lpt_init(c, 0, 1); + if (err) + goto out; + + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err(c, "cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out; + } + wake_up_process(c->bgt); + + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) { + err = -ENOMEM; + goto out; + } + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out; + } + + if (c->need_recovery) + err = ubifs_rcvry_gc_commit(c); + else + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + goto out; + + dbg_gen("re-mounted read-write"); + c->remounting_rw = 0; + + if (c->need_recovery) { + c->need_recovery = 0; + ubifs_msg(c, "deferred recovery completed"); + } else { + /* + * Do not run the debugging space check if the were doing + * recovery, because when we saved the information we had the + * file-system in a state where the TNC and lprops has been + * modified in memory, but all the I/O operations (including a + * commit) were deferred. So the file-system was in + * "non-committed" state. Now the file-system is in committed + * state, and of course the amount of free space will change + * because, for example, the old index size was imprecise. + */ + err = dbg_check_space_info(c); + } + + mutex_unlock(&c->umount_mutex); + return err; + +out: + c->ro_mount = 1; + vfree(c->orph_buf); + c->orph_buf = NULL; + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + free_wbufs(c); + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->remounting_rw = 0; + mutex_unlock(&c->umount_mutex); + return err; +} + +/** + * ubifs_remount_ro - re-mount in read-only mode. + * @c: UBIFS file-system description object + * + * We assume VFS has stopped writing. Possibly the background thread could be + * running a commit, however kthread_stop will wait in that case. + */ +static void ubifs_remount_ro(struct ubifs_info *c) +{ + int i, err; + + ubifs_assert(!c->need_recovery); + ubifs_assert(!c->ro_mount); + + mutex_lock(&c->umount_mutex); + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + dbg_save_space_info(c); + + for (i = 0; i < c->jhead_cnt; i++) + ubifs_wbuf_sync(&c->jheads[i].wbuf); + + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + ubifs_ro_mode(c, err); + + vfree(c->orph_buf); + c->orph_buf = NULL; + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->ro_mount = 1; + err = dbg_check_space_info(c); + if (err) + ubifs_ro_mode(c, err); + mutex_unlock(&c->umount_mutex); +} + +static void ubifs_put_super(struct super_block *sb) +{ + int i; + struct ubifs_info *c = sb->s_fs_info; + + ubifs_msg(c, "un-mount UBI device %d", c->vi.ubi_num); + + /* + * The following asserts are only valid if there has not been a failure + * of the media. For example, there will be dirty inodes if we failed + * to write them back because of I/O errors. + */ + if (!c->ro_error) { + ubifs_assert(c->bi.idx_growth == 0); + ubifs_assert(c->bi.dd_growth == 0); + ubifs_assert(c->bi.data_growth == 0); + } + + /* + * The 'c->umount_lock' prevents races between UBIFS memory shrinker + * and file system un-mount. Namely, it prevents the shrinker from + * picking this superblock for shrinking - it will be just skipped if + * the mutex is locked. + */ + mutex_lock(&c->umount_mutex); + if (!c->ro_mount) { + /* + * First of all kill the background thread to make sure it does + * not interfere with un-mounting and freeing resources. + */ + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + /* + * On fatal errors c->ro_error is set to 1, in which case we do + * not write the master node. + */ + if (!c->ro_error) { + int err; + + /* Synchronize write-buffers */ + for (i = 0; i < c->jhead_cnt; i++) + ubifs_wbuf_sync(&c->jheads[i].wbuf); + + /* + * We are being cleanly unmounted which means the + * orphans were killed - indicate this in the master + * node. Also save the reserved GC LEB number. + */ + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + /* + * Recovery will attempt to fix the master area + * next mount, so we just print a message and + * continue to unmount normally. + */ + ubifs_err(c, "failed to write master node, error %d", + err); + } else { + for (i = 0; i < c->jhead_cnt; i++) + /* Make sure write-buffer timers are canceled */ + hrtimer_cancel(&c->jheads[i].wbuf.timer); + } + } + + ubifs_umount(c); + bdi_destroy(&c->bdi); + ubi_close_volume(c->ubi); + mutex_unlock(&c->umount_mutex); +} + +static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + int err; + struct ubifs_info *c = sb->s_fs_info; + + sync_filesystem(sb); + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + + err = ubifs_parse_options(c, data, 1); + if (err) { + ubifs_err(c, "invalid or unknown remount parameter"); + return err; + } + + if (c->ro_mount && !(*flags & MS_RDONLY)) { + if (c->ro_error) { + ubifs_msg(c, "cannot re-mount R/W due to prior errors"); + return -EROFS; + } + if (c->ro_media) { + ubifs_msg(c, "cannot re-mount R/W - UBI volume is R/O"); + return -EROFS; + } + err = ubifs_remount_rw(c); + if (err) + return err; + } else if (!c->ro_mount && (*flags & MS_RDONLY)) { + if (c->ro_error) { + ubifs_msg(c, "cannot re-mount R/O due to prior errors"); + return -EROFS; + } + ubifs_remount_ro(c); + } + + if (c->bulk_read == 1) + bu_init(c); + else { + dbg_gen("disable bulk-read"); + kfree(c->bu.buf); + c->bu.buf = NULL; + } + + ubifs_assert(c->lst.taken_empty_lebs > 0); + return 0; +} + +const struct super_operations ubifs_super_operations = { + .alloc_inode = ubifs_alloc_inode, + .destroy_inode = ubifs_destroy_inode, + .put_super = ubifs_put_super, + .write_inode = ubifs_write_inode, + .evict_inode = ubifs_evict_inode, + .statfs = ubifs_statfs, + .dirty_inode = ubifs_dirty_inode, + .remount_fs = ubifs_remount_fs, + .show_options = ubifs_show_options, + .sync_fs = ubifs_sync_fs, +}; + +/** + * open_ubi - parse UBI device name string and open the UBI device. + * @name: UBI volume name + * @mode: UBI volume open mode + * + * The primary method of mounting UBIFS is by specifying the UBI volume + * character device node path. However, UBIFS may also be mounted withoug any + * character device node using one of the following methods: + * + * o ubiX_Y - mount UBI device number X, volume Y; + * o ubiY - mount UBI device number 0, volume Y; + * o ubiX:NAME - mount UBI device X, volume with name NAME; + * o ubi:NAME - mount UBI device 0, volume with name NAME. + * + * Alternative '!' separator may be used instead of ':' (because some shells + * like busybox may interpret ':' as an NFS host name separator). This function + * returns UBI volume description object in case of success and a negative + * error code in case of failure. + */ +static struct ubi_volume_desc *open_ubi(const char *name, int mode) +{ + struct ubi_volume_desc *ubi; + int dev, vol; + char *endptr; + + /* First, try to open using the device node path method */ + ubi = ubi_open_volume_path(name, mode); + if (!IS_ERR(ubi)) + return ubi; + + /* Try the "nodev" method */ + if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') + return ERR_PTR(-EINVAL); + + /* ubi:NAME method */ + if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') + return ubi_open_volume_nm(0, name + 4, mode); + + if (!isdigit(name[3])) + return ERR_PTR(-EINVAL); + + dev = simple_strtoul(name + 3, &endptr, 0); + + /* ubiY method */ + if (*endptr == '\0') + return ubi_open_volume(0, dev, mode); + + /* ubiX_Y method */ + if (*endptr == '_' && isdigit(endptr[1])) { + vol = simple_strtoul(endptr + 1, &endptr, 0); + if (*endptr != '\0') + return ERR_PTR(-EINVAL); + return ubi_open_volume(dev, vol, mode); + } + + /* ubiX:NAME method */ + if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') + return ubi_open_volume_nm(dev, ++endptr, mode); + + return ERR_PTR(-EINVAL); +} + +static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) +{ + struct ubifs_info *c; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (c) { + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + } + return c; +} + +static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ubifs_info *c = sb->s_fs_info; + struct inode *root; + int err; + + c->vfs_sb = sb; + /* Re-open the UBI device in read-write mode */ + c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); + if (IS_ERR(c->ubi)) { + err = PTR_ERR(c->ubi); + goto out; + } + + /* + * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For + * UBIFS, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the read-ahead I/O as well i.e. completely pointless. + * + * Read-ahead will be disabled because @c->bdi.ra_pages is 0. + */ + c->bdi.name = "ubifs", + c->bdi.capabilities = 0; + err = bdi_init(&c->bdi); + if (err) + goto out_close; + err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d", + c->vi.ubi_num, c->vi.vol_id); + if (err) + goto out_bdi; + + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_bdi; + + sb->s_bdi = &c->bdi; + sb->s_fs_info = c; + sb->s_magic = UBIFS_SUPER_MAGIC; + sb->s_blocksize = UBIFS_BLOCK_SIZE; + sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; + sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); + if (c->max_inode_sz > MAX_LFS_FILESIZE) + sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; + sb->s_op = &ubifs_super_operations; + sb->s_xattr = ubifs_xattr_handlers; + + mutex_lock(&c->umount_mutex); + err = mount_ubifs(c); + if (err) { + ubifs_assert(err < 0); + goto out_unlock; + } + + /* Read the root inode */ + root = ubifs_iget(sb, UBIFS_ROOT_INO); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out_umount; + } + + sb->s_root = d_make_root(root); + if (!sb->s_root) { + err = -ENOMEM; + goto out_umount; + } + + mutex_unlock(&c->umount_mutex); + return 0; + +out_umount: + ubifs_umount(c); +out_unlock: + mutex_unlock(&c->umount_mutex); +out_bdi: + bdi_destroy(&c->bdi); +out_close: + ubi_close_volume(c->ubi); +out: + return err; +} + +static int sb_test(struct super_block *sb, void *data) +{ + struct ubifs_info *c1 = data; + struct ubifs_info *c = sb->s_fs_info; + + return c->vi.cdev == c1->vi.cdev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, + const char *name, void *data) +{ + struct ubi_volume_desc *ubi; + struct ubifs_info *c; + struct super_block *sb; + int err; + + dbg_gen("name %s, flags %#x", name, flags); + + /* + * Get UBI device number and volume ID. Mount it read-only so far + * because this might be a new mount point, and UBI allows only one + * read-write user at a time. + */ + ubi = open_ubi(name, UBI_READONLY); + if (IS_ERR(ubi)) { + pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", + current->pid, name, (int)PTR_ERR(ubi)); + return ERR_CAST(ubi); + } + + c = alloc_ubifs_info(ubi); + if (!c) { + err = -ENOMEM; + goto out_close; + } + + dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + + sb = sget(fs_type, sb_test, sb_set, flags, c); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + kfree(c); + goto out_close; + } + + if (sb->s_root) { + struct ubifs_info *c1 = sb->s_fs_info; + kfree(c); + /* A new mount point for already mounted UBIFS */ + dbg_gen("this ubi volume is already mounted"); + if (!!(flags & MS_RDONLY) != c1->ro_mount) { + err = -EBUSY; + goto out_deact; + } + } else { + err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto out_deact; + /* We do not support atime */ + sb->s_flags |= MS_ACTIVE | MS_NOATIME; + } + + /* 'fill_super()' opens ubi again so we must close it here */ + ubi_close_volume(ubi); + + return dget(sb->s_root); + +out_deact: + deactivate_locked_super(sb); +out_close: + ubi_close_volume(ubi); + return ERR_PTR(err); +} + +static void kill_ubifs_super(struct super_block *s) +{ + struct ubifs_info *c = s->s_fs_info; + kill_anon_super(s); + kfree(c); +} + +static struct file_system_type ubifs_fs_type = { + .name = "ubifs", + .owner = THIS_MODULE, + .mount = ubifs_mount, + .kill_sb = kill_ubifs_super, +}; +MODULE_ALIAS_FS("ubifs"); + +/* + * Inode slab cache constructor. + */ +static void inode_slab_ctor(void *obj) +{ + struct ubifs_inode *ui = obj; + inode_init_once(&ui->vfs_inode); +} + +static int __init ubifs_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); + + /* Make sure node sizes are 8-byte aligned */ + BUILD_BUG_ON(UBIFS_CH_SZ & 7); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); + BUILD_BUG_ON(MIN_WRITE_SZ & 7); + + /* Check min. node size */ + BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); + + /* Defined node sizes */ + BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); + + /* + * We use 2 bit wide bit-fields to store compression type, which should + * be amended if more compressors are added. The bit-fields are: + * @compr_type in 'struct ubifs_inode', @default_compr in + * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'. + */ + BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); + + /* + * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to + * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. + */ + if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { + pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", + current->pid, (unsigned int)PAGE_CACHE_SIZE); + return -EINVAL; + } + + ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", + sizeof(struct ubifs_inode), 0, + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, + &inode_slab_ctor); + if (!ubifs_inode_slab) + return -ENOMEM; + + register_shrinker(&ubifs_shrinker_info); + + err = ubifs_compressors_init(); + if (err) + goto out_shrinker; + + err = dbg_debugfs_init(); + if (err) + goto out_compr; + + err = register_filesystem(&ubifs_fs_type); + if (err) { + pr_err("UBIFS error (pid %d): cannot register file system, error %d", + current->pid, err); + goto out_dbg; + } + return 0; + +out_dbg: + dbg_debugfs_exit(); +out_compr: + ubifs_compressors_exit(); +out_shrinker: + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); + return err; +} +/* late_initcall to let compressors initialize first */ +late_initcall(ubifs_init); + +static void __exit ubifs_exit(void) +{ + ubifs_assert(list_empty(&ubifs_infos)); + ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + + dbg_debugfs_exit(); + ubifs_compressors_exit(); + unregister_shrinker(&ubifs_shrinker_info); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ubifs_inode_slab); + unregister_filesystem(&ubifs_fs_type); +} +module_exit(ubifs_exit); + +MODULE_LICENSE("GPL"); +MODULE_VERSION(__stringify(UBIFS_VERSION)); +MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); +MODULE_DESCRIPTION("UBIFS - UBI File System"); diff --git a/kernel/fs/ubifs/tnc.c b/kernel/fs/ubifs/tnc.c new file mode 100644 index 000000000..957f5757f --- /dev/null +++ b/kernel/fs/ubifs/tnc.c @@ -0,0 +1,3327 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements TNC (Tree Node Cache) which caches indexing nodes of + * the UBIFS B-tree. + * + * At the moment the locking rules of the TNC tree are quite simple and + * straightforward. We just have a mutex and lock it when we traverse the + * tree. If a znode is not in memory, we read it from flash while still having + * the mutex locked. + */ + +#include +#include +#include "ubifs.h" + +/* + * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. + * @NAME_LESS: name corresponding to the first argument is less than second + * @NAME_MATCHES: names match + * @NAME_GREATER: name corresponding to the second argument is greater than + * first + * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media + * + * These constants were introduce to improve readability. + */ +enum { + NAME_LESS = 0, + NAME_MATCHES = 1, + NAME_GREATER = 2, + NOT_ON_MEDIA = 3, +}; + +/** + * insert_old_idx - record an index node obsoleted since the last commit start. + * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + * + * For recovery, there must always be a complete intact version of the index on + * flash at all times. That is called the "old index". It is the index as at the + * time of the last successful commit. Many of the index nodes in the old index + * may be dirty, but they must not be erased until the next successful commit + * (at which point that index becomes the old index). + * + * That means that the garbage collection and the in-the-gaps method of + * committing must be able to determine if an index node is in the old index. + * Most of the old index nodes can be found by looking up the TNC using the + * 'lookup_znode()' function. However, some of the old index nodes may have + * been deleted from the current index or may have been changed so much that + * they cannot be easily found. In those cases, an entry is added to an RB-tree. + * That is what this function does. The RB-tree is ordered by LEB number and + * offset because they uniquely identify the old index node. + */ +static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *old_idx, *o; + struct rb_node **p, *parent = NULL; + + old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); + if (unlikely(!old_idx)) + return -ENOMEM; + old_idx->lnum = lnum; + old_idx->offs = offs; + + p = &c->old_idx.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = &(*p)->rb_left; + else if (lnum > o->lnum) + p = &(*p)->rb_right; + else if (offs < o->offs) + p = &(*p)->rb_left; + else if (offs > o->offs) + p = &(*p)->rb_right; + else { + ubifs_err(c, "old idx added twice!"); + kfree(old_idx); + return 0; + } + } + rb_link_node(&old_idx->rb, parent, p); + rb_insert_color(&old_idx->rb, &c->old_idx); + return 0; +} + +/** + * insert_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode) +{ + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) + return insert_old_idx(c, zbr->lnum, zbr->offs); + } else + if (c->zroot.len) + return insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + return 0; +} + +/** + * ins_clr_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +static int ins_clr_old_idx_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int err; + + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (err) + return err; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + } + } else + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs); + if (err) + return err; + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + } + return 0; +} + +/** + * destroy_old_idx - destroy the old_idx RB-tree. + * @c: UBIFS file-system description object + * + * During start commit, the old_idx RB-tree is used to avoid overwriting index + * nodes that were in the index last commit but have since been deleted. This + * is necessary for recovery i.e. the old index must be kept intact until the + * new index is successfully written. The old-idx RB-tree is used for the + * in-the-gaps method of writing index nodes and is destroyed every commit. + */ +void destroy_old_idx(struct ubifs_info *c) +{ + struct ubifs_old_idx *old_idx, *n; + + rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb) + kfree(old_idx); + + c->old_idx = RB_ROOT; +} + +/** + * copy_znode - copy a dirty znode. + * @c: UBIFS file-system description object + * @znode: znode to copy + * + * A dirty znode being committed may not be changed, so it is copied. + */ +static struct ubifs_znode *copy_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + zn = kmalloc(c->max_znode_sz, GFP_NOFS); + if (unlikely(!zn)) + return ERR_PTR(-ENOMEM); + + memcpy(zn, znode, c->max_znode_sz); + zn->cnext = NULL; + __set_bit(DIRTY_ZNODE, &zn->flags); + __clear_bit(COW_ZNODE, &zn->flags); + + ubifs_assert(!ubifs_zn_obsolete(znode)); + __set_bit(OBSOLETE_ZNODE, &znode->flags); + + if (znode->level != 0) { + int i; + const int n = zn->child_cnt; + + /* The children now have new parent */ + for (i = 0; i < n; i++) { + struct ubifs_zbranch *zbr = &zn->zbranch[i]; + + if (zbr->znode) + zbr->znode->parent = zn; + } + } + + atomic_long_inc(&c->dirty_zn_cnt); + return zn; +} + +/** + * add_idx_dirt - add dirt due to a dirty znode. + * @c: UBIFS file-system description object + * @lnum: LEB number of index node + * @dirt: size of index node + * + * This function updates lprops dirty space and the new size of the index. + */ +static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt) +{ + c->calc_idx_sz -= ALIGN(dirt, 8); + return ubifs_add_dirt(c, lnum, dirt); +} + +/** + * dirty_cow_znode - ensure a znode is not being committed. + * @c: UBIFS file-system description object + * @zbr: branch of znode to check + * + * Returns dirtied znode on success or negative error code on failure. + */ +static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zn; + int err; + + if (!ubifs_zn_cow(znode)) { + /* znode is not being committed */ + if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { + atomic_long_inc(&c->dirty_zn_cnt); + atomic_long_dec(&c->clean_zn_cnt); + atomic_long_dec(&ubifs_clean_zn_cnt); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + if (unlikely(err)) + return ERR_PTR(err); + } + return znode; + } + + zn = copy_znode(c, znode); + if (IS_ERR(zn)) + return zn; + + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (unlikely(err)) + return ERR_PTR(err); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + } else + err = 0; + + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + + if (unlikely(err)) + return ERR_PTR(err); + return zn; +} + +/** + * lnc_add - add a leaf node to the leaf node cache. + * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * Leaf nodes are non-index nodes directory entry nodes or data nodes. The + * purpose of the leaf node cache is to save re-reading the same leaf node over + * and over again. Most things are cached by VFS, however the file system must + * cache directory entries for readdir and for resolving hash collisions. The + * present implementation of the leaf node cache is extremely simple, and + * allows for error returns that are not used but that may be needed if a more + * complex implementation is created. + * + * Note, this function does not add the @node object to LNC directly, but + * allocates a copy of the object and adds the copy to LNC. The reason for this + * is that @node has been allocated outside of the TNC subsystem and will be + * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC + * may be changed at any time, e.g. freed by the shrinker. + */ +static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const void *node) +{ + int err; + void *lnc_node; + const struct ubifs_dent_node *dent = node; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + ubifs_assert(is_hash_key(c, &zbr->key)); + + err = ubifs_validate_entry(c, dent); + if (err) { + dump_stack(); + ubifs_dump_node(c, dent); + return err; + } + + lnc_node = kmemdup(node, zbr->len, GFP_NOFS); + if (!lnc_node) + /* We don't have to have the cache, so no error */ + return 0; + + zbr->leaf = lnc_node; + return 0; +} + + /** + * lnc_add_directly - add a leaf node to the leaf-node-cache. + * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * This function is similar to 'lnc_add()', but it does not create a copy of + * @node but inserts @node to TNC directly. + */ +static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + + err = ubifs_validate_entry(c, node); + if (err) { + dump_stack(); + ubifs_dump_node(c, node); + return err; + } + + zbr->leaf = node; + return 0; +} + +/** + * lnc_free - remove a leaf node from the leaf node cache. + * @zbr: zbranch of leaf node + * @node: leaf node + */ +static void lnc_free(struct ubifs_zbranch *zbr) +{ + if (!zbr->leaf) + return; + kfree(zbr->leaf); + zbr->leaf = NULL; +} + +/** + * tnc_read_node_nm - read a "hashed" leaf node. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a "hashed" node defined by @zbr from the leaf node cache + * (in it is there) or from the hash media, in which case the node is also + * added to LNC. Returns zero in case of success or a negative negative error + * code in case of failure. + */ +static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(is_hash_key(c, &zbr->key)); + + if (zbr->leaf) { + /* Read from the leaf node cache */ + ubifs_assert(zbr->len != 0); + memcpy(node, zbr->leaf, zbr->len); + return 0; + } + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) + return err; + + /* Add the node to the leaf node cache */ + err = lnc_add(c, zbr, node); + return err; +} + +/** + * try_read_node - read a node if it is a node. + * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: LEB number of node to read + * @offs: offset of node to read + * + * This function tries to read a node of known type and length, checks it and + * stores it in @buf. This function returns %1 if a node is present and %0 if + * a node is not present. A negative error code is returned for I/O errors. + * This function performs that same function as ubifs_read_node except that + * it does not require that there is actually a node present and instead + * the return code indicates if a node was read. + * + * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc + * is true (it is controlled by corresponding mount option). However, if + * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to + * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is + * because during mounting or re-mounting from R/O mode to R/W mode we may read + * journal nodes (when replying the journal or doing the recovery) and the + * journal nodes may potentially be corrupted, so checking is required. + */ +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs) +{ + int err, node_len; + struct ubifs_ch *ch = buf; + uint32_t crc, node_crc; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + + err = ubifs_leb_read(c, lnum, buf, offs, len, 1); + if (err) { + ubifs_err(c, "cannot read node type %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + return 0; + + if (ch->node_type != type) + return 0; + + node_len = le32_to_cpu(ch->len); + if (node_len != len) + return 0; + + if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting && + !c->remounting_rw) + return 1; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) + return 0; + + return 1; +} + +/** + * fallible_read_node - try to read a leaf node. + * @c: UBIFS file-system description object + * @key: key of node to read + * @zbr: position of node + * @node: node returned + * + * This function tries to read a node and returns %1 if the node is read, %0 + * if the node is not present, and a negative error code in the case of error. + */ +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node) +{ + int ret; + + dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs); + + ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, + zbr->offs); + if (ret == 1) { + union ubifs_key node_key; + struct ubifs_dent_node *dent = node; + + /* All nodes have key in the same place */ + key_read(c, &dent->key, &node_key); + if (keys_cmp(c, key, &node_key) != 0) + ret = 0; + } + if (ret == 0 && c->replaying) + dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ", + zbr->lnum, zbr->offs, zbr->len); + return ret; +} + +/** + * matches_name - determine if a direntry or xattr entry matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by + * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case + * of failure, a negative error code is returned. + */ +static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, dent); + if (err) + goto out_free; + + /* Add the node to the leaf node cache */ + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * get_znode - get a TNC znode that may not be loaded yet. + * @c: UBIFS file-system description object + * @znode: parent znode + * @n: znode branch slot number + * + * This function returns the znode or a negative error code. + */ +static struct ubifs_znode *get_znode(struct ubifs_info *c, + struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + + zbr = &znode->zbranch[n]; + if (zbr->znode) + znode = zbr->znode; + else + znode = ubifs_load_znode(c, zbr, znode, n); + return znode; +} + +/** + * tnc_next - find next TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is passed and returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the next TNC entry is found, %-ENOENT if there is + * no next entry, or a negative error code otherwise. + */ +static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + nn += 1; + if (nn < znode->child_cnt) { + *n = nn; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip + 1; + znode = zp; + if (nn < znode->child_cnt) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = 0; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * tnc_prev - find previous TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the previous TNC entry is found, %-ENOENT if + * there is no next entry, or a negative error code otherwise. + */ +static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + if (nn > 0) { + *n = nn - 1; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip - 1; + znode = zp; + if (nn >= 0) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + nn = znode->child_cnt - 1; + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = znode->child_cnt - 1; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * resolve_collision - resolve a collision. + * @c: UBIFS file-system description object + * @key: key of a directory or extended attribute entry + * @zn: znode is returned here + * @n: zbranch number is passed and returned here + * @nm: name of the entry + * + * This function is called for "hashed" keys to make sure that the found key + * really corresponds to the looked up node (directory or extended attribute + * entry). It returns %1 and sets @zn and @n if the collision is resolved. + * %0 is returned if @nm is not found and @zn and @n are set to the previous + * entry, i.e. to the entry after which @nm could follow if it were in TNC. + * This means that @n may be set to %-1 if the leftmost key in @zn is the + * previous one. A negative error code is returned on failures. + */ +static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm) +{ + int err; + + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (unlikely(err < 0)) + return err; + if (err == NAME_MATCHES) + return 1; + + if (err == NAME_GREATER) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + return 0; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* + * We have found the branch after which we would + * like to insert, but inserting in this znode + * may still be wrong. Consider the following 3 + * znodes, in the case where we are resolving a + * collision with Key2. + * + * znode zp + * ---------------------- + * level 1 | Key0 | Key1 | + * ----------------------- + * | | + * znode za | | znode zb + * ------------ ------------ + * level 0 | Key0 | | Key2 | + * ------------ ------------ + * + * The lookup finds Key2 in znode zb. Lets say + * there is no match and the name is greater so + * we look left. When we find Key0, we end up + * here. If we return now, we will insert into + * znode za at slot n = 1. But that is invalid + * according to the parent's keys. Key2 must + * be inserted into znode zb. + * + * Note, this problem is not relevant for the + * case when we go right, because + * 'tnc_insert()' would correct the parent key. + */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + return 0; + } + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_LESS) + return 0; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_GREATER); + } + } else { + int nn = *n; + struct ubifs_znode *znode = *zn; + + /* Look right */ + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + err = matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + return 0; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_LESS); + } + } +} + +/** + * fallible_matches_name - determine if a dent matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This is a "fallible" version of 'matches_name()' function which does not + * panic if the direntry/xentry referred by @zbr does not exist on the media. + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr + * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA + * if xentry/direntry referred by @zbr does not exist on the media. A negative + * error code is returned in case of failure. + */ +static int fallible_matches_name(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = fallible_read_node(c, &zbr->key, zbr, dent); + if (err < 0) + goto out_free; + if (err == 0) { + /* The node was not present */ + err = NOT_ON_MEDIA; + goto out_free; + } + ubifs_assert(err == 1); + + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * fallible_resolve_collision - resolve a collision even if nodes are missing. + * @c: UBIFS file-system description object + * @key: key + * @zn: znode is returned here + * @n: branch number is passed and returned here + * @nm: name of directory entry + * @adding: indicates caller is adding a key to the TNC + * + * This is a "fallible" version of the 'resolve_collision()' function which + * does not panic if one of the nodes referred to by TNC does not exist on the + * media. This may happen when replaying the journal if a deleted node was + * Garbage-collected and the commit was not done. A branch that refers to a node + * that is not present is called a dangling branch. The following are the return + * codes for this function: + * o if @nm was found, %1 is returned and @zn and @n are set to the found + * branch; + * o if we are @adding and @nm was not found, %0 is returned; + * o if we are not @adding and @nm was not found, but a dangling branch was + * found, then %1 is returned and @zn and @n are set to the dangling branch; + * o a negative error code is returned in case of failure. + */ +static int fallible_resolve_collision(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm, int adding) +{ + struct ubifs_znode *o_znode = NULL, *znode = *zn; + int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; + + cmp = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (unlikely(cmp < 0)) + return cmp; + if (cmp == NAME_MATCHES) + return 1; + if (cmp == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + /* + * We are unlucky and hit a dangling branch straight away. + * Now we do not really know where to go to find the needed + * branch - to the left or to the right. Well, let's try left. + */ + unsure = 1; + } else if (!adding) + unsure = 1; /* Remove a dangling branch wherever it is */ + + if (cmp == NAME_GREATER || unsure) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + break; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* See comments in 'resolve_collision()' */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + break; + } + err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = *zn; + o_n = *n; + continue; + } + if (!adding) + continue; + if (err == NAME_LESS) + break; + else + unsure = 0; + } + } + + if (cmp == NAME_LESS || unsure) { + /* Look right */ + *zn = znode; + *n = nn; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + err = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + break; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + } + } + } + + /* Never match a dangling branch when adding */ + if (adding || !o_znode) + return 0; + + dbg_mntk(key, "dangling match LEB %d:%d len %d key ", + o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, + o_znode->zbranch[o_n].len); + *zn = o_znode; + *n = o_n; + return 1; +} + +/** + * matches_position - determine if a zbranch matches a given position. + * @zbr: zbranch of dent + * @lnum: LEB number of dent to match + * @offs: offset of dent to match + * + * This function returns %1 if @lnum:@offs matches, and %0 otherwise. + */ +static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs) +{ + if (zbr->lnum == lnum && zbr->offs == offs) + return 1; + else + return 0; +} + +/** + * resolve_collision_directly - resolve a collision directly. + * @c: UBIFS file-system description object + * @key: key of directory entry + * @zn: znode is passed and returned here + * @n: zbranch number is passed and returned here + * @lnum: LEB number of dent node to match + * @offs: offset of dent node to match + * + * This function is used for "hashed" keys to make sure the found directory or + * extended attribute entry node is what was looked for. It is used when the + * flash address of the right node is known (@lnum:@offs) which makes it much + * easier to resolve collisions (no need to read entries and match full + * names). This function returns %1 and sets @zn and @n if the collision is + * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the + * previous directory entry. Otherwise a negative error code is returned. + */ +static int resolve_collision_directly(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int nn, err; + + znode = *zn; + nn = *n; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + if (matches_position(&znode->zbranch[nn], lnum, offs)) { + *zn = znode; + *n = nn; + return 1; + } + } + + /* Look right */ + znode = *zn; + nn = *n; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + *zn = znode; + *n = nn; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + } +} + +/** + * dirty_cow_bottom_up - dirty a znode and its ancestors. + * @c: UBIFS file-system description object + * @znode: znode to dirty + * + * If we do not have a unique key that resides in a znode, then we cannot + * dirty that znode from the top down (i.e. by using lookup_level0_dirty) + * This function records the path back to the last dirty ancestor, and then + * dirties the znodes on that path. + */ +static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zp; + int *path = c->bottom_up_buf, p = 0; + + ubifs_assert(c->zroot.znode); + ubifs_assert(znode); + if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) { + kfree(c->bottom_up_buf); + c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int), + GFP_NOFS); + if (!c->bottom_up_buf) + return ERR_PTR(-ENOMEM); + path = c->bottom_up_buf; + } + if (c->zroot.znode->level) { + /* Go up until parent is dirty */ + while (1) { + int n; + + zp = znode->parent; + if (!zp) + break; + n = znode->iip; + ubifs_assert(p < c->zroot.znode->level); + path[p++] = n; + if (!zp->cnext && ubifs_zn_dirty(znode)) + break; + znode = zp; + } + } + + /* Come back down, dirtying as we go */ + while (1) { + struct ubifs_zbranch *zbr; + + zp = znode->parent; + if (zp) { + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < zp->child_cnt); + zbr = &zp->zbranch[path[--p]]; + znode = dirty_cow_znode(c, zbr); + } else { + ubifs_assert(znode == c->zroot.znode); + znode = dirty_cow_znode(c, &c->zroot); + } + if (IS_ERR(znode) || !p) + break; + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < znode->child_cnt); + znode = znode->zbranch[path[p - 1]].znode; + } + + return znode; +} + +/** + * ubifs_lookup_level0 - search for zero-level znode. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain + * @key, then %0 is returned and slot number of the closest branch is stored + * in @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %0 is stored in @n. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnck(key, "search key "); + ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = zbr->znode; + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * Here is a tricky place. We have not found the key and this is a + * "hashed" key, which may collide. The rest of the code deals with + * situations like this: + * + * | 3 | 5 | + * / \ + * | 3 | 5 | | 6 | 7 | (x) + * + * Or more a complex example: + * + * | 1 | 5 | + * / \ + * | 1 | 3 | | 5 | 8 | + * \ / + * | 5 | 5 | | 6 | 7 | (x) + * + * In the examples, if we are looking for key "5", we may reach nodes + * marked with "(x)". In this case what we have do is to look at the + * left and see if there is "5" key there. If there is, we have to + * return it. + * + * Note, this whole situation is possible because we allow to have + * elements which are equivalent to the next key in the parent in the + * children of current znode. For example, this happens if we split a + * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something + * like this: + * | 3 | 5 | + * / \ + * | 3 | 5 | | 5 | 6 | 7 | + * ^ + * And this becomes what is at the first "picture" after key "5" marked + * with "^" is removed. What could be done is we could prohibit + * splitting in the middle of the colliding sequence. Also, when + * removing the leftmost key, we would have to correct the key of the + * parent node, which would introduce additional complications. Namely, + * if we changed the leftmost key of the parent znode, the garbage + * collector would be unable to find it (GC is doing this when GC'ing + * indexing LEBs). Although we already have an additional RB-tree where + * we save such changed znodes (see 'ins_clr_old_idx_znode()') until + * after the commit. But anyway, this does not look easy to implement + * so we did not try this. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * lookup_level0_dirty - search for zero-level znode dirtying. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain @key + * then %0 is returned and slot number of the closed branch is stored in + * @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %-1 is stored in @n. + * + * Additionally all znodes in the path from the root to the located zero-level + * znode are marked as dirty. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnck(key, "search and dirty key "); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode = dirty_cow_znode(c, &c->zroot); + if (IS_ERR(znode)) + return PTR_ERR(znode); + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * See huge comment at 'lookup_level0_dirty()' what is the rest of the + * code. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * maybe_leb_gced - determine if a LEB may have been garbage collected. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number + * + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. + */ +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) +{ + int gc_seq2, gced_lnum; + + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; +} + +/** + * ubifs_tnc_locate - look up a file-system node and return it and its location. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @lnum: LEB number is returned here + * @offs: offset is returned here + * + * This function looks up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. + */ +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs) +{ + int found, n, err, safely = 0, gc_seq1; + struct ubifs_znode *znode; + struct ubifs_zbranch zbr, *zt; + +again: + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out; + } else if (found < 0) { + err = found; + goto out; + } + zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } + if (is_hash_key(c, key)) { + /* + * In this case the leaf node cache gets used, so we pass the + * address of the zbranch and keep the mutex locked + */ + err = tnc_read_node_nm(c, zt, node); + goto out; + } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ + zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } + + err = fallible_read_node(c, key, &zbr, node); + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; + +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_get_bu_keys - lookup keys for bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * Lookup consecutive data node keys for the same inode that reside + * consecutively in the same LEB. This function returns zero in case of success + * and a negative error code in case of failure. + * + * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function + * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares + * maximum possible amount of nodes for bulk-read. + */ +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) +{ + int n, err = 0, lnum = -1, uninitialized_var(offs); + int uninitialized_var(len); + unsigned int block = key_block(c, &bu->key); + struct ubifs_znode *znode; + + bu->cnt = 0; + bu->blk_cnt = 0; + bu->eof = 0; + + mutex_lock(&c->tnc_mutex); + /* Find first key */ + err = ubifs_lookup_level0(c, &bu->key, &znode, &n); + if (err < 0) + goto out; + if (err) { + /* Key found */ + len = znode->zbranch[n].len; + /* The buffer must be big enough for at least 1 node */ + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + /* Add this key */ + bu->zbranch[bu->cnt++] = znode->zbranch[n]; + bu->blk_cnt += 1; + lnum = znode->zbranch[n].lnum; + offs = ALIGN(znode->zbranch[n].offs + len, 8); + } + while (1) { + struct ubifs_zbranch *zbr; + union ubifs_key *key; + unsigned int next_block; + + /* Find next key */ + err = tnc_next(c, &znode, &n); + if (err) + goto out; + zbr = &znode->zbranch[n]; + key = &zbr->key; + /* See if there is another data key for this file */ + if (key_inum(c, key) != key_inum(c, &bu->key) || + key_type(c, key) != UBIFS_DATA_KEY) { + err = -ENOENT; + goto out; + } + if (lnum < 0) { + /* First key found */ + lnum = zbr->lnum; + offs = ALIGN(zbr->offs + zbr->len, 8); + len = zbr->len; + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + } else { + /* + * The data nodes must be in consecutive positions in + * the same LEB. + */ + if (zbr->lnum != lnum || zbr->offs != offs) + goto out; + offs += ALIGN(zbr->len, 8); + len = ALIGN(len, 8) + zbr->len; + /* Must not exceed buffer length */ + if (len > bu->buf_len) + goto out; + } + /* Allow for holes */ + next_block = key_block(c, key); + bu->blk_cnt += (next_block - block - 1); + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + block = next_block; + /* Add this key */ + bu->zbranch[bu->cnt++] = *zbr; + bu->blk_cnt += 1; + /* See if we have room for more */ + if (bu->cnt >= UBIFS_MAX_BULK_READ) + goto out; + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + } +out: + if (err == -ENOENT) { + bu->eof = 1; + err = 0; + } + bu->gc_seq = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + /* + * An enormous hole could cause bulk-read to encompass too many + * page cache pages, so limit the number here. + */ + if (bu->blk_cnt > UBIFS_MAX_BULK_READ) + bu->blk_cnt = UBIFS_MAX_BULK_READ; + /* + * Ensure that bulk-read covers a whole number of page cache + * pages. + */ + if (UBIFS_BLOCKS_PER_PAGE == 1 || + !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1))) + return 0; + if (bu->eof) { + /* At the end of file we can round up */ + bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1; + return 0; + } + /* Exclude data nodes that do not make up a whole page cache page */ + block = key_block(c, &bu->key) + bu->blk_cnt; + block &= ~(UBIFS_BLOCKS_PER_PAGE - 1); + while (bu->cnt) { + if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block) + break; + bu->cnt -= 1; + } + return 0; +} + +/** + * read_wbuf - bulk-read from a LEB with a wbuf. + * @wbuf: wbuf that may overlap the read + * @buf: buffer into which to read + * @len: read length + * @lnum: LEB number from which to read + * @offs: offset from which to read + * + * This functions returns %0 on success or a negative error code on failure. + */ +static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, + int offs) +{ + const struct ubifs_info *c = wbuf->c; + int rlen, overlap; + + dbg_io("LEB %d:%d, length %d", lnum, offs, len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(offs + len <= c->leb_size); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubifs_leb_read(c, lnum, buf, offs, len, 0); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) + /* Read everything that goes before write-buffer */ + return ubifs_leb_read(c, lnum, buf, offs, rlen, 0); + + return 0; +} + +/** + * validate_data_node - validate data nodes for bulk-read. + * @c: UBIFS file-system description object + * @buf: buffer containing data node to validate + * @zbr: zbranch of data node to validate + * + * This functions returns %0 on success or a negative error code on failure. + */ +static int validate_data_node(struct ubifs_info *c, void *buf, + struct ubifs_zbranch *zbr) +{ + union ubifs_key key1; + struct ubifs_ch *ch = buf; + int err, len; + + if (ch->node_type != UBIFS_DATA_NODE) { + ubifs_err(c, "bad node type (%d but expected %d)", + ch->node_type, UBIFS_DATA_NODE); + goto out_err; + } + + err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + if (err) { + ubifs_err(c, "expected node type %d", UBIFS_DATA_NODE); + goto out; + } + + len = le32_to_cpu(ch->len); + if (len != zbr->len) { + ubifs_err(c, "bad node length %d, expected %d", len, zbr->len); + goto out_err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, buf + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, &zbr->key, &key1)) { + ubifs_err(c, "bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnck(&zbr->key, "looked for key "); + dbg_tnck(&key1, "found node's key "); + goto out_err; + } + + return 0; + +out_err: + err = -EINVAL; +out: + ubifs_err(c, "bad node at LEB %d:%d", zbr->lnum, zbr->offs); + ubifs_dump_node(c, buf); + dump_stack(); + return err; +} + +/** + * ubifs_tnc_bulk_read - read a number of data nodes in one go. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * This functions reads and validates the data nodes that were identified by the + * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success, + * -EAGAIN to indicate a race with GC, or another negative error code on + * failure. + */ +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) +{ + int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i; + struct ubifs_wbuf *wbuf; + void *buf; + + len = bu->zbranch[bu->cnt - 1].offs; + len += bu->zbranch[bu->cnt - 1].len - offs; + if (len > bu->buf_len) { + ubifs_err(c, "buffer too small %d vs %d", bu->buf_len, len); + return -EINVAL; + } + + /* Do the read */ + wbuf = ubifs_get_wbuf(c, lnum); + if (wbuf) + err = read_wbuf(wbuf, bu->buf, len, lnum, offs); + else + err = ubifs_leb_read(c, lnum, bu->buf, offs, len, 0); + + /* Check for a race with GC */ + if (maybe_leb_gced(c, lnum, bu->gc_seq)) + return -EAGAIN; + + if (err && err != -EBADMSG) { + ubifs_err(c, "failed to read from LEB %d:%d, error %d", + lnum, offs, err); + dump_stack(); + dbg_tnck(&bu->key, "key "); + return err; + } + + /* Validate the nodes read */ + buf = bu->buf; + for (i = 0; i < bu->cnt; i++) { + err = validate_data_node(c, buf, &bu->zbranch[i]); + if (err) + return err; + buf = buf + ALIGN(bu->zbranch[i].len, 8); + } + + return 0; +} + +/** + * do_lookup_nm- look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int found, n, err; + struct ubifs_znode *znode; + + dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out_unlock; + } else if (found < 0) { + err = found; + goto out_unlock; + } + + ubifs_assert(n >= 0); + + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + if (err == 0) { + err = -ENOENT; + goto out_unlock; + } + + err = tnc_read_node_nm(c, &znode->zbranch[n], node); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_nm - look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int err, len; + const struct ubifs_dent_node *dent = node; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. + */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + len = le16_to_cpu(dent->nlen); + if (nm->len == len && !memcmp(dent->name, nm->name, len)) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them look at each direntry with colliding name hash sequentially. + */ + return do_lookup_nm(c, key, node, nm); +} + +/** + * correct_parent_keys - correct parent znodes' keys. + * @c: UBIFS file-system description object + * @znode: znode to correct parent znodes for + * + * This is a helper function for 'tnc_insert()'. When the key of the leftmost + * zbranch changes, keys of parent znodes have to be corrected. This helper + * function is called in such situations and corrects the keys if needed. + */ +static void correct_parent_keys(const struct ubifs_info *c, + struct ubifs_znode *znode) +{ + union ubifs_key *key, *key1; + + ubifs_assert(znode->parent); + ubifs_assert(znode->iip == 0); + + key = &znode->zbranch[0].key; + key1 = &znode->parent->zbranch[0].key; + + while (keys_cmp(c, key, key1) < 0) { + key_copy(c, key, key1); + znode = znode->parent; + znode->alt = 1; + if (!znode->parent || znode->iip) + break; + key1 = &znode->parent->zbranch[0].key; + } +} + +/** + * insert_zbranch - insert a zbranch into a znode. + * @znode: znode into which to insert + * @zbr: zbranch to insert + * @n: slot number to insert to + * + * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in + * znode's array of zbranches and keeps zbranches consolidated, so when a new + * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th + * slot, zbranches starting from @n have to be moved right. + */ +static void insert_zbranch(struct ubifs_znode *znode, + const struct ubifs_zbranch *zbr, int n) +{ + int i; + + ubifs_assert(ubifs_zn_dirty(znode)); + + if (znode->level) { + for (i = znode->child_cnt; i > n; i--) { + znode->zbranch[i] = znode->zbranch[i - 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + if (zbr->znode) + zbr->znode->iip = n; + } else + for (i = znode->child_cnt; i > n; i--) + znode->zbranch[i] = znode->zbranch[i - 1]; + + znode->zbranch[n] = *zbr; + znode->child_cnt += 1; + + /* + * After inserting at slot zero, the lower bound of the key range of + * this znode may have changed. If this znode is subsequently split + * then the upper bound of the key range may change, and furthermore + * it could change to be lower than the original lower bound. If that + * happens, then it will no longer be possible to find this znode in the + * TNC using the key from the index node on flash. That is bad because + * if it is not found, we will assume it is obsolete and may overwrite + * it. Then if there is an unclean unmount, we will start using the + * old index which will be broken. + * + * So we first mark znodes that have insertions at slot zero, and then + * if they are split we add their lnum/offs to the old_idx tree. + */ + if (n == 0) + znode->alt = 1; +} + +/** + * tnc_insert - insert a node into TNC. + * @c: UBIFS file-system description object + * @znode: znode to insert into + * @zbr: branch to insert + * @n: slot number to insert new zbranch to + * + * This function inserts a new node described by @zbr into znode @znode. If + * znode does not have a free slot for new zbranch, it is split. Parent znodes + * are splat as well if needed. Returns zero in case of success or a negative + * error code in case of failure. + */ +static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, + struct ubifs_zbranch *zbr, int n) +{ + struct ubifs_znode *zn, *zi, *zp; + int i, keep, move, appending = 0; + union ubifs_key *key = &zbr->key, *key1; + + ubifs_assert(n >= 0 && n <= c->fanout); + + /* Implement naive insert for now */ +again: + zp = znode->parent; + if (znode->child_cnt < c->fanout) { + ubifs_assert(n != c->fanout); + dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level); + + insert_zbranch(znode, zbr, n); + + /* Ensure parent's key is correct */ + if (n == 0 && zp && znode->iip == 0) + correct_parent_keys(c, znode); + + return 0; + } + + /* + * Unfortunately, @znode does not have more empty slots and we have to + * split it. + */ + dbg_tnck(key, "splitting level %d, key ", znode->level); + + if (znode->alt) + /* + * We can no longer be sure of finding this znode by key, so we + * record it in the old_idx tree. + */ + ins_clr_old_idx_znode(c, znode); + + zn = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zn) + return -ENOMEM; + zn->parent = zp; + zn->level = znode->level; + + /* Decide where to split */ + if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) { + /* Try not to split consecutive data keys */ + if (n == c->fanout) { + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) + appending = 1; + } else + goto check_split; + } else if (appending && n != c->fanout) { + /* Try not to split consecutive data keys */ + appending = 0; +check_split: + if (n >= (c->fanout + 1) / 2) { + key1 = &znode->zbranch[0].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) { + key1 = &znode->zbranch[n].key; + if (key_inum(c, key1) != key_inum(c, key) || + key_type(c, key1) != UBIFS_DATA_KEY) { + keep = n; + move = c->fanout - keep; + zi = znode; + goto do_split; + } + } + } + } + + if (appending) { + keep = c->fanout; + move = 0; + } else { + keep = (c->fanout + 1) / 2; + move = c->fanout - keep; + } + + /* + * Although we don't at present, we could look at the neighbors and see + * if we can move some zbranches there. + */ + + if (n < keep) { + /* Insert into existing znode */ + zi = znode; + move += 1; + keep -= 1; + } else { + /* Insert into new znode */ + zi = zn; + n -= keep; + /* Re-parent */ + if (zn->level != 0) + zbr->znode->parent = zn; + } + +do_split: + + __set_bit(DIRTY_ZNODE, &zn->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zn->child_cnt = move; + znode->child_cnt = keep; + + dbg_tnc("moving %d, keeping %d", move, keep); + + /* Move zbranch */ + for (i = 0; i < move; i++) { + zn->zbranch[i] = znode->zbranch[keep + i]; + /* Re-parent */ + if (zn->level != 0) + if (zn->zbranch[i].znode) { + zn->zbranch[i].znode->parent = zn; + zn->zbranch[i].znode->iip = i; + } + } + + /* Insert new key and branch */ + dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level); + + insert_zbranch(zi, zbr, n); + + /* Insert new znode (produced by spitting) into the parent */ + if (zp) { + if (n == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + + /* Locate insertion point */ + n = znode->iip + 1; + + /* Tail recursion */ + zbr->key = zn->zbranch[0].key; + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + znode = zp; + + goto again; + } + + /* We have to split root znode */ + dbg_tnc("creating new zroot at level %d", znode->level + 1); + + zi = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zi) + return -ENOMEM; + + zi->child_cnt = 2; + zi->level = znode->level + 1; + + __set_bit(DIRTY_ZNODE, &zi->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zi->zbranch[0].key = znode->zbranch[0].key; + zi->zbranch[0].znode = znode; + zi->zbranch[0].lnum = c->zroot.lnum; + zi->zbranch[0].offs = c->zroot.offs; + zi->zbranch[0].len = c->zroot.len; + zi->zbranch[1].key = zn->zbranch[0].key; + zi->zbranch[1].znode = zn; + + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + c->zroot.znode = zi; + + zn->parent = zi; + zn->iip = 1; + znode->parent = zi; + znode->iip = 0; + + return 0; +} + +/** + * ubifs_tnc_add - add a node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function adds a node with key @key to TNC. The node may be new or it may + * obsolete some existing one. Returns %0 on success or negative error code on + * failure. + */ +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len); + found = lookup_level0_dirty(c, key, &znode, &n); + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + } else if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else + err = found; + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + + return err; +} + +/** + * ubifs_tnc_replace - replace a node in the TNC only if the old node is found. + * @c: UBIFS file-system description object + * @key: key to add + * @old_lnum: LEB number of old node + * @old_offs: old node offset + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function replaces a node with key @key in the TNC only if the old node + * is found. This function is called by garbage collection when node are moved. + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum, + old_offs, lnum, offs, len); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + found = 0; + if (zbr->lnum == old_lnum && zbr->offs == old_offs) { + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + found = 1; + } else if (is_hash_key(c, key)) { + found = resolve_collision_directly(c, key, &znode, &n, + old_lnum, old_offs); + dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d", + found, znode, n, old_lnum, old_offs); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + zbr = &znode->zbranch[n]; + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, + zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } + } + } + + if (!found) + err = ubifs_add_dirt(c, lnum, len); + + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_add_nm - add a "hashed" node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * @nm: node name + * + * This is the same as 'ubifs_tnc_add()' but it should be used with keys which + * may have collisions, like directory entry keys. + */ +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", + lnum, offs, nm->len, nm->name); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + if (c->replaying) + found = fallible_resolve_collision(c, key, &znode, &n, + nm, 1); + else + found = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n); + if (found < 0) { + err = found; + goto out_unlock; + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + goto out_unlock; + } + } + + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + if (err) + goto out_unlock; + if (c->replaying) { + /* + * We did not find it in the index so there may be a + * dangling branch still in the index. So we remove it + * by passing 'ubifs_tnc_remove_nm()' the same key but + * an unmatchable name. + */ + struct qstr noname = { .name = "" }; + + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + return ubifs_tnc_remove_nm(c, key, &noname); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * tnc_delete - delete a znode form TNC. + * @c: UBIFS file-system description object + * @znode: znode to delete from + * @n: zbranch slot number to delete + * + * This function deletes a leaf node from @n-th slot of @znode. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *zp; + int i, err; + + /* Delete without merge for now */ + ubifs_assert(znode->level == 0); + ubifs_assert(n >= 0 && n < c->fanout); + dbg_tnck(&znode->zbranch[n].key, "deleting key "); + + zbr = &znode->zbranch[n]; + lnc_free(zbr); + + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) { + ubifs_dump_znode(c, znode); + return err; + } + + /* We do not "gap" zbranch slots */ + for (i = n; i < znode->child_cnt - 1; i++) + znode->zbranch[i] = znode->zbranch[i + 1]; + znode->child_cnt -= 1; + + if (znode->child_cnt > 0) + return 0; + + /* + * This was the last zbranch, we have to delete this znode from the + * parent. + */ + + do { + ubifs_assert(!ubifs_zn_obsolete(znode)); + ubifs_assert(ubifs_zn_dirty(znode)); + + zp = znode->parent; + n = znode->iip; + + atomic_long_dec(&c->dirty_zn_cnt); + + err = insert_old_idx_znode(c, znode); + if (err) + return err; + + if (znode->cnext) { + __set_bit(OBSOLETE_ZNODE, &znode->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(znode); + znode = zp; + } while (znode->child_cnt == 1); /* while removing last child */ + + /* Remove from znode, entry n - 1 */ + znode->child_cnt -= 1; + ubifs_assert(znode->level != 0); + for (i = n; i < znode->child_cnt; i++) { + znode->zbranch[i] = znode->zbranch[i + 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + + /* + * If this is the root and it has only 1 child then + * collapse the tree. + */ + if (!znode->parent) { + while (znode->child_cnt == 1 && znode->level != 0) { + zp = znode; + zbr = &znode->zbranch[0]; + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode->parent = NULL; + znode->iip = 0; + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + if (err) + return err; + } + c->zroot.lnum = zbr->lnum; + c->zroot.offs = zbr->offs; + c->zroot.len = zbr->len; + c->zroot.znode = znode; + ubifs_assert(!ubifs_zn_obsolete(zp)); + ubifs_assert(ubifs_zn_dirty(zp)); + atomic_long_dec(&c->dirty_zn_cnt); + + if (zp->cnext) { + __set_bit(OBSOLETE_ZNODE, &zp->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(zp); + } + } + + return 0; +} + +/** + * ubifs_tnc_remove - remove an index entry of a node. + * @c: UBIFS file-system description object + * @key: key of node + * + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnck(key, "key "); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + if (found == 1) + err = tnc_delete(c, znode, n); + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node. + * @c: UBIFS file-system description object + * @key: key of node + * @nm: directory entry name + * + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm) +{ + int n, err; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnck(key, "%.*s, key ", nm->len, nm->name); + err = lookup_level0_dirty(c, key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (err < 0) + goto out_unlock; + if (err) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + err = tnc_delete(c, znode, n); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * key_in_range - determine if a key falls within a range of keys. + * @c: UBIFS file-system description object + * @key: key to check + * @from_key: lowest key in range + * @to_key: highest key in range + * + * This function returns %1 if the key is in range and %0 otherwise. + */ +static int key_in_range(struct ubifs_info *c, union ubifs_key *key, + union ubifs_key *from_key, union ubifs_key *to_key) +{ + if (keys_cmp(c, key, from_key) < 0) + return 0; + if (keys_cmp(c, key, to_key) > 0) + return 0; + return 1; +} + +/** + * ubifs_tnc_remove_range - remove index entries in range. + * @c: UBIFS file-system description object + * @from_key: lowest key to remove + * @to_key: highest key to remove + * + * This function removes index entries starting at @from_key and ending at + * @to_key. This function returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key) +{ + int i, n, k, err = 0; + struct ubifs_znode *znode; + union ubifs_key *key; + + mutex_lock(&c->tnc_mutex); + while (1) { + /* Find first level 0 znode that contains keys to remove */ + err = ubifs_lookup_level0(c, from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) + key = from_key; + else { + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, from_key, to_key)) { + err = 0; + goto out_unlock; + } + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + /* Remove all keys in range except the first */ + for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) { + key = &znode->zbranch[i].key; + if (!key_in_range(c, key, from_key, to_key)) + break; + lnc_free(&znode->zbranch[i]); + err = ubifs_add_dirt(c, znode->zbranch[i].lnum, + znode->zbranch[i].len); + if (err) { + ubifs_dump_znode(c, znode); + goto out_unlock; + } + dbg_tnck(key, "removing key "); + } + if (k) { + for (i = n + 1 + k; i < znode->child_cnt; i++) + znode->zbranch[i - k] = znode->zbranch[i]; + znode->child_cnt -= k; + } + + /* Now delete the first */ + err = tnc_delete(c, znode, n); + if (err) + goto out_unlock; + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_ino - remove an inode from TNC. + * @c: UBIFS file-system description object + * @inum: inode number to remove + * + * This function remove inode @inum and all the extended attributes associated + * with the anode from TNC and returns zero in case of success or a negative + * error code in case of failure. + */ +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) +{ + union ubifs_key key1, key2; + struct ubifs_dent_node *xent, *pxent = NULL; + struct qstr nm = { .name = NULL }; + + dbg_tnc("ino %lu", (unsigned long)inum); + + /* + * Walk all extended attribute entries and remove them together with + * corresponding extended attribute inodes. + */ + lowest_xent_key(c, &key1, inum); + while (1) { + ino_t xattr_inum; + int err; + + xent = ubifs_tnc_next_ent(c, &key1, &nm); + if (IS_ERR(xent)) { + err = PTR_ERR(xent); + if (err == -ENOENT) + break; + return err; + } + + xattr_inum = le64_to_cpu(xent->inum); + dbg_tnc("xent '%s', ino %lu", xent->name, + (unsigned long)xattr_inum); + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + err = ubifs_tnc_remove_nm(c, &key1, &nm); + if (err) { + kfree(xent); + return err; + } + + lowest_ino_key(c, &key1, xattr_inum); + highest_ino_key(c, &key2, xattr_inum); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) { + kfree(xent); + return err; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key1); + } + + kfree(pxent); + lowest_ino_key(c, &key1, inum); + highest_ino_key(c, &key2, inum); + + return ubifs_tnc_remove_range(c, &key1, &key2); +} + +/** + * ubifs_tnc_next_ent - walk directory or extended attribute entries. + * @c: UBIFS file-system description object + * @key: key of last entry + * @nm: name of last entry found or %NULL + * + * This function finds and reads the next directory or extended attribute entry + * after the given key (@key) if there is one. @nm is used to resolve + * collisions. + * + * If the name of the current entry is not known and only the key is known, + * @nm->name has to be %NULL. In this case the semantics of this function is a + * little bit different and it returns the entry corresponding to this key, not + * the next one. If the key was not found, the closest "right" entry is + * returned. + * + * If the fist entry has to be found, @key has to contain the lowest possible + * key value for this inode and @name has to be %NULL. + * + * This function returns the found directory or extended attribute entry node + * in case of success, %-ENOENT is returned if no entry was found, and a + * negative error code is returned in case of failure. + */ +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_dent_node *dent; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey; + + dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)"); + ubifs_assert(is_hash_key(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + if (nm->name) { + if (err) { + /* Handle collisions */ + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", + err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + } + + /* Now find next entry */ + err = tnc_next(c, &znode, &n); + if (unlikely(err)) + goto out_unlock; + } else { + /* + * The full name of the entry was not given, in which case the + * behavior of this function is a little different and it + * returns current entry, not the next one. + */ + if (!err) { + /* + * However, the given key does not exist in the TNC + * tree and @znode/@n variables contain the closest + * "preceding" element. Switch to the next one. + */ + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + } + + zbr = &znode->zbranch[n]; + dent = kmalloc(zbr->len, GFP_NOFS); + if (unlikely(!dent)) { + err = -ENOMEM; + goto out_unlock; + } + + /* + * The above 'tnc_next()' call could lead us to the next inode, check + * this. + */ + dkey = &zbr->key; + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_free; + } + + err = tnc_read_node_nm(c, zbr, dent); + if (unlikely(err)) + goto out_free; + + mutex_unlock(&c->tnc_mutex); + return dent; + +out_free: + kfree(dent); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return ERR_PTR(err); +} + +/** + * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit. + * @c: UBIFS file-system description object + * + * Destroy left-over obsolete znodes from a failed commit. + */ +static void tnc_destroy_cnext(struct ubifs_info *c) +{ + struct ubifs_znode *cnext; + + if (!c->cnext) + return; + ubifs_assert(c->cmt_state == COMMIT_BROKEN); + cnext = c->cnext; + do { + struct ubifs_znode *znode = cnext; + + cnext = cnext->cnext; + if (ubifs_zn_obsolete(znode)) + kfree(znode); + } while (cnext && cnext != c->cnext); +} + +/** + * ubifs_tnc_close - close TNC subsystem and free all related resources. + * @c: UBIFS file-system description object + */ +void ubifs_tnc_close(struct ubifs_info *c) +{ + tnc_destroy_cnext(c); + if (c->zroot.znode) { + long n, freed; + + n = atomic_long_read(&c->clean_zn_cnt); + freed = ubifs_destroy_tnc_subtree(c->zroot.znode); + ubifs_assert(freed == n); + atomic_long_sub(n, &ubifs_clean_zn_cnt); + } + kfree(c->gap_lebs); + kfree(c->ilebs); + destroy_old_idx(c); +} + +/** + * left_znode - get the znode to the left. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the left of @znode or NULL if + * there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *left_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip - 1; + + /* Go up until we can go left */ + znode = znode->parent; + if (!znode) + return NULL; + if (n >= 0) { + /* Now go down the rightmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + n = znode->child_cnt - 1; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * right_znode - get the znode to the right. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the right of @znode or NULL + * if there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *right_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip + 1; + + /* Go up until we can go right */ + znode = znode->parent; + if (!znode) + return NULL; + if (n < znode->child_cnt) { + /* Now go down the leftmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * lookup_znode - find a particular indexing node from TNC. + * @c: UBIFS file-system description object + * @key: index node key to lookup + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function searches an indexing node by its first key @key and its + * address @lnum:@offs. It looks up the indexing tree by pulling all indexing + * nodes it traverses to TNC. This function is called for indexing nodes which + * were found on the media by scanning, for example when garbage-collecting or + * when doing in-the-gaps commit. This means that the indexing node which is + * looked for does not have to have exactly the same leftmost key @key, because + * the leftmost key may have been changed, in which case TNC will contain a + * dirty znode which still refers the same @lnum:@offs. This function is clever + * enough to recognize such indexing nodes. + * + * Note, if a znode was deleted or changed too much, then this function will + * not find it. For situations like this UBIFS has the old index RB-tree + * (indexed by @lnum:@offs). + * + * This function returns a pointer to the znode found or %NULL if it is not + * found. A negative error code is returned on failure. + */ +static struct ubifs_znode *lookup_znode(struct ubifs_info *c, + union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode, *zn; + int n, nn; + + ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); + + /* + * The arguments have probably been read off flash, so don't assume + * they are valid. + */ + if (level < 0) + return ERR_PTR(-EINVAL); + + /* Get the root znode */ + znode = c->zroot.znode; + if (!znode) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return znode; + } + /* Check if it is the one we are looking for */ + if (c->zroot.lnum == lnum && c->zroot.offs == offs) + return znode; + /* Descend to the parent level i.e. (level + 1) */ + if (level >= znode->level) + return NULL; + while (1) { + ubifs_search_zbranch(c, znode, key, &n); + if (n < 0) { + /* + * We reached a znode where the leftmost key is greater + * than the key we are searching for. This is the same + * situation as the one described in a huge comment at + * the end of the 'ubifs_lookup_level0()' function. And + * for exactly the same reasons we have to try to look + * left before giving up. + */ + znode = left_znode(c, znode); + if (!znode) + return NULL; + if (IS_ERR(znode)) + return znode; + ubifs_search_zbranch(c, znode, key, &n); + ubifs_assert(n >= 0); + } + if (znode->level == level + 1) + break; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + /* Check if the child is the one we are looking for */ + if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* If the key is unique, there is nowhere else to look */ + if (!is_hash_key(c, key)) + return NULL; + /* + * The key is not unique and so may be also in the znodes to either + * side. + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + /* Move one branch to the left */ + if (n) + n -= 1; + else { + znode = left_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = znode->child_cnt - 1; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is less than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) < 0) + break; + } + /* Back to the middle */ + znode = zn; + n = nn; + /* Look right */ + while (1) { + /* Move one branch to the right */ + if (++n >= znode->child_cnt) { + znode = right_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = 0; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is greater than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) > 0) + break; + } + return NULL; +} + +/** + * is_idx_node_in_tnc - determine if an index node is in the TNC. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * This function returns %0 if the index node is not referred to in the TNC, %1 + * if the index node is referred to in the TNC and the corresponding znode is + * dirty, %2 if an index node is referred to in the TNC and the corresponding + * znode is clean, and a negative error code in case of failure. + * + * Note, the @key argument has to be the key of the first child. Also note, + * this function relies on the fact that 0:0 is never a valid LEB number and + * offset for a main-area node. + */ +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + return 0; + if (IS_ERR(znode)) + return PTR_ERR(znode); + + return ubifs_zn_dirty(znode) ? 1 : 2; +} + +/** + * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @lnum: node LEB number + * @offs: node offset + * + * This function returns %1 if the node is referred to in the TNC, %0 if it is + * not, and a negative error code in case of failure. + * + * Note, this function relies on the fact that 0:0 is never a valid LEB number + * and offset for a main-area node. + */ +static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, + int lnum, int offs) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *zn; + int n, found, err, nn; + const int unique = !is_hash_key(c, key); + + found = ubifs_lookup_level0(c, key, &znode, &n); + if (found < 0) + return found; /* Error code */ + if (!found) + return 0; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + if (unique) + return 0; + /* + * Because the key is not unique, we have to look left + * and right as well + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &n); + if (err == -ENOENT) + break; + if (err) + return err; + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + /* Look right */ + znode = zn; + n = nn; + while (1) { + err = tnc_next(c, &znode, &n); + if (err) { + if (err == -ENOENT) + return 0; + return err; + } + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + return 0; +} + +/** + * ubifs_tnc_has_node - determine whether a node is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @level: index node level (if it is an index node) + * @lnum: node LEB number + * @offs: node offset + * @is_idx: non-zero if the node is an index node + * + * This function returns %1 if the node is in the TNC, %0 if it is not, and a + * negative error code in case of failure. For index nodes, @key has to be the + * key of the first child. An index node is considered to be in the TNC only if + * the corresponding znode is clean or has not been loaded. + */ +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx) +{ + int err; + + mutex_lock(&c->tnc_mutex); + if (is_idx) { + err = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (err < 0) + goto out_unlock; + if (err == 1) + /* The index node was found but it was dirty */ + err = 0; + else if (err == 2) + /* The index node was found and it was clean */ + err = 1; + else + BUG_ON(err != 0); + } else + err = is_leaf_node_in_tnc(c, key, lnum, offs); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_dirty_idx_node - dirty an index node. + * @c: UBIFS file-system description object + * @key: index node key + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function loads and dirties an index node so that it can be garbage + * collected. The @key argument has to be the key of the first child. This + * function relies on the fact that 0:0 is never a valid LEB number and offset + * for a main-area node. Returns %0 on success and a negative error code on + * failure. + */ +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int err = 0; + + mutex_lock(&c->tnc_mutex); + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + goto out_unlock; + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * dbg_check_inode_size - check if inode size is correct. + * @c: UBIFS file-system description object + * @inum: inode number + * @size: inode size + * + * This function makes sure that the inode size (@size) is correct and it does + * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL + * if it has a data page beyond @size, and other negative error code in case of + * other errors. + */ +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size) +{ + int err, n; + union ubifs_key from_key, to_key, *key; + struct ubifs_znode *znode; + unsigned int block; + + if (!S_ISREG(inode->i_mode)) + return 0; + if (!dbg_is_chk_gen(c)) + return 0; + + block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &from_key, inode->i_ino, block); + highest_data_key(c, &to_key, inode->i_ino); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, &from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + key = &from_key; + goto out_dump; + } + + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + + ubifs_assert(err == 0); + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, &from_key, &to_key)) + goto out_unlock; + +out_dump: + block = key_block(c, key); + ubifs_err(c, "inode %lu has size %lld, but there are data at offset %lld", + (unsigned long)inode->i_ino, size, + ((loff_t)block) << UBIFS_BLOCK_SHIFT); + mutex_unlock(&c->tnc_mutex); + ubifs_dump_inode(c, inode); + dump_stack(); + return -EINVAL; + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} diff --git a/kernel/fs/ubifs/tnc_commit.c b/kernel/fs/ubifs/tnc_commit.c new file mode 100644 index 000000000..b45345d70 --- /dev/null +++ b/kernel/fs/ubifs/tnc_commit.c @@ -0,0 +1,1071 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* This file implements TNC functions for committing */ + +#include +#include "ubifs.h" + +/** + * make_idx_node - make an index node for fill-the-gaps method of TNC commit. + * @c: UBIFS file-system description object + * @idx: buffer in which to place new index node + * @znode: znode from which to make new index node + * @lnum: LEB number where new index node will be written + * @offs: offset where new index node will be written + * @len: length of new index node + */ +static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, + struct ubifs_znode *znode, int lnum, int offs, int len) +{ + struct ubifs_znode *zp; + int i, err; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err(c, "bad ref in znode"); + ubifs_dump_znode(c, znode); + if (zbr->znode) + ubifs_dump_znode(c, zbr->znode); + } + } + ubifs_prepare_node(c, idx, len, 0); + + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; + + err = insert_old_idx_znode(c, znode); + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + + zbr = &zp->zbranch[znode->iip]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + atomic_long_dec(&c->dirty_zn_cnt); + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(ubifs_zn_cow(znode)); + + /* + * Note, unlike 'write_index()' we do not add memory barriers here + * because this function is called with @c->tnc_mutex locked. + */ + __clear_bit(DIRTY_ZNODE, &znode->flags); + __clear_bit(COW_ZNODE, &znode->flags); + + return err; +} + +/** + * fill_gap - make index nodes in gaps in dirty index LEBs. + * @c: UBIFS file-system description object + * @lnum: LEB number that gap appears in + * @gap_start: offset of start of gap + * @gap_end: offset of end of gap + * @dirt: adds dirty space to this + * + * This function returns the number of index nodes written into the gap. + */ +static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end, + int *dirt) +{ + int len, gap_remains, gap_pos, written, pad_len; + + ubifs_assert((gap_start & 7) == 0); + ubifs_assert((gap_end & 7) == 0); + ubifs_assert(gap_end >= gap_start); + + gap_remains = gap_end - gap_start; + if (!gap_remains) + return 0; + gap_pos = gap_start; + written = 0; + while (c->enext) { + len = ubifs_idx_node_sz(c, c->enext->child_cnt); + if (len < gap_remains) { + struct ubifs_znode *znode = c->enext; + const int alen = ALIGN(len, 8); + int err; + + ubifs_assert(alen <= gap_remains); + err = make_idx_node(c, c->ileb_buf + gap_pos, znode, + lnum, gap_pos, len); + if (err) + return err; + gap_remains -= alen; + gap_pos += alen; + c->enext = znode->cnext; + if (c->enext == c->cnext) + c->enext = NULL; + written += 1; + } else + break; + } + if (gap_end == c->leb_size) { + c->ileb_len = ALIGN(gap_pos, c->min_io_size); + /* Pad to end of min_io_size */ + pad_len = c->ileb_len - gap_pos; + } else + /* Pad to end of gap */ + pad_len = gap_remains; + dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d", + lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len); + ubifs_pad(c, c->ileb_buf + gap_pos, pad_len); + *dirt += pad_len; + return written; +} + +/** + * find_old_idx - find an index node obsoleted since the last commit start. + * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %1 if found and %0 otherwise. + */ +static int find_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *o; + struct rb_node *p; + + p = c->old_idx.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = p->rb_left; + else if (lnum > o->lnum) + p = p->rb_right; + else if (offs < o->offs) + p = p->rb_left; + else if (offs > o->offs) + p = p->rb_right; + else + return 1; + } + return 0; +} + +/** + * is_idx_node_in_use - determine if an index node can be overwritten. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * If @key / @lnum / @offs identify an index node that was not part of the old + * index, then this function returns %0 (obsolete). Else if the index node was + * part of the old index but is now dirty %1 is returned, else if it is clean %2 + * is returned. A negative error code is returned on failure. + */ +static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key, + int level, int lnum, int offs) +{ + int ret; + + ret = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (ret < 0) + return ret; /* Error code */ + if (ret == 0) + if (find_old_idx(c, lnum, offs)) + return 1; + return ret; +} + +/** + * layout_leb_in_gaps - layout index nodes using in-the-gaps method. + * @c: UBIFS file-system description object + * @p: return LEB number here + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * This function merely puts the next znode into the next gap, making no attempt + * to try to maximise the number of znodes that fit. + * This function returns the number of index nodes written into the gaps, or a + * negative error code on failure. + */ +static int layout_leb_in_gaps(struct ubifs_info *c, int *p) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written; + + tot_written = 0; + /* Get an index LEB with lots of obsolete index nodes */ + lnum = ubifs_find_dirty_idx_leb(c); + if (lnum < 0) + /* + * There also may be dirt in the index head that could be + * filled, however we do not check there at present. + */ + return lnum; /* Error code */ + *p = lnum; + dbg_gc("LEB %d", lnum); + /* + * Scan the index LEB. We use the generic scan for this even though + * it is more comprehensive and less efficient than is needed for this + * purpose. + */ + sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0); + c->ileb_len = 0; + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + gap_start = 0; + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx; + int in_use, level; + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + idx = snod->node; + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + /* Determine if the index node is in use (not obsolete) */ + in_use = is_idx_node_in_use(c, &snod->key, level, lnum, + snod->offs); + if (in_use < 0) { + ubifs_scan_destroy(sleb); + return in_use; /* Error code */ + } + if (in_use) { + if (in_use == 1) + dirt += ALIGN(snod->len, 8); + /* + * The obsolete index nodes form gaps that can be + * overwritten. This gap has ended because we have + * found an index node that is still in use + * i.e. not obsolete + */ + gap_end = snod->offs; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) { + ubifs_scan_destroy(sleb); + return written; /* Error code */ + } + tot_written += written; + gap_start = ALIGN(snod->offs + snod->len, 8); + } + } + ubifs_scan_destroy(sleb); + c->ileb_len = c->leb_size; + gap_end = c->leb_size; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) + return written; /* Error code */ + tot_written += written; + if (tot_written == 0) { + struct ubifs_lprops lp; + + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + return err; + if (lp.free == c->leb_size) { + /* + * We must have snatched this LEB from the idx_gc list + * so we need to correct the free and dirty space. + */ + err = ubifs_change_one_lp(c, lnum, + c->leb_size - c->ileb_len, + dirt, 0, 0, 0); + if (err) + return err; + } + return 0; + } + err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt, + 0, 0, 0); + if (err) + return err; + err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len); + if (err) + return err; + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + return tot_written; +} + +/** + * get_leb_cnt - calculate the number of empty LEBs needed to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns the number of empty LEBs needed to commit @cnt znodes + * to the current index head. The number is not exact and may be more than + * needed. + */ +static int get_leb_cnt(struct ubifs_info *c, int cnt) +{ + int d; + + /* Assume maximum index node size (i.e. overestimate space needed) */ + cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz; + if (cnt < 0) + cnt = 0; + d = c->leb_size / c->max_idx_node_sz; + return DIV_ROUND_UP(cnt, d); +} + +/** + * layout_in_gaps - in-the-gaps method of committing TNC. + * @c: UBIFS file-system description object + * @cnt: number of dirty znodes to commit. + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_gaps(struct ubifs_info *c, int cnt) +{ + int err, leb_needed_cnt, written, *p; + + dbg_gc("%d znodes to write", cnt); + + c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS); + if (!c->gap_lebs) + return -ENOMEM; + + p = c->gap_lebs; + do { + ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + written = layout_leb_in_gaps(c, p); + if (written < 0) { + err = written; + if (err != -ENOSPC) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; + } + if (!dbg_is_chk_index(c)) { + /* + * Do not print scary warnings if the debugging + * option which forces in-the-gaps is enabled. + */ + ubifs_warn(c, "out of space"); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); + } + /* Try to commit anyway */ + break; + } + p++; + cnt -= written; + leb_needed_cnt = get_leb_cnt(c, cnt); + dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt, + leb_needed_cnt, c->ileb_cnt); + } while (leb_needed_cnt > c->ileb_cnt); + + *p = -1; + return 0; +} + +/** + * layout_in_empty_space - layout index nodes in empty space. + * @c: UBIFS file-system description object + * + * This function lays out new index nodes for dirty znodes using empty LEBs. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_empty_space(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext, *zp; + int lnum, offs, len, next_len, buf_len, buf_offs, used, avail; + int wlen, blen, err; + + cnext = c->enext; + if (!cnext) + return 0; + + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + buf_len = ubifs_idx_node_sz(c, c->fanout); + buf_len = ALIGN(buf_len, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) + lnum = -1; + + while (1) { + znode = cnext; + + len = ubifs_idx_node_sz(c, znode->child_cnt); + + /* Determine the index node position */ + if (lnum == -1) { + if (c->ileb_nxt >= c->ileb_cnt) { + ubifs_err(c, "out of space"); + return -ENOSPC; + } + lnum = c->ilebs[c->ileb_nxt++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + + offs = buf_offs + used; + + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + int i; + + i = znode->iip; + zbr = &zp->zbranch[i]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + /* + * Once lprops is updated, we can decrease the dirty znode count + * but it is easier to just do it here. + */ + atomic_long_dec(&c->dirty_zn_cnt); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + cnext = znode->cnext; + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + if (next_len != 0 && + buf_offs + used + next_len <= c->leb_size && + avail > 0) + continue; + + if (avail <= 0 && next_len && + buf_offs + used + next_len <= c->leb_size) + blen = buf_len; + else + blen = ALIGN(wlen, c->min_io_size); + + /* The buffer is full or there are no more znodes to do */ + buf_offs += blen; + if (next_len) { + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, blen - used, + 0, 0); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + continue; + } + err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs, + blen - used, 0, 0); + if (err) + return err; + break; + } + + c->dbg->new_ihead_lnum = lnum; + c->dbg->new_ihead_offs = buf_offs; + + return 0; +} + +/** + * layout_commit - determine positions of index nodes to commit. + * @c: UBIFS file-system description object + * @no_space: indicates that insufficient empty LEBs were allocated + * @cnt: number of znodes to commit + * + * Calculate and update the positions of index nodes to commit. If there were + * an insufficient number of empty LEBs allocated, then index nodes are placed + * into the gaps created by obsolete index nodes in non-empty index LEBs. For + * this purpose, an obsolete index node is one that was not in the index as at + * the end of the last commit. To write "in-the-gaps" requires that those index + * LEBs are updated atomically in-place. + */ +static int layout_commit(struct ubifs_info *c, int no_space, int cnt) +{ + int err; + + if (no_space) { + err = layout_in_gaps(c, cnt); + if (err) + return err; + } + err = layout_in_empty_space(c); + return err; +} + +/** + * find_first_dirty - find first dirty znode. + * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode) +{ + int i, cont; + + if (!znode) + return NULL; + + while (1) { + if (znode->level == 0) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + cont = 0; + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) { + znode = zbr->znode; + cont = 1; + break; + } + } + if (!cont) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + } +} + +/** + * find_next_dirty - find next dirty znode. + * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode) +{ + int n = znode->iip + 1; + + znode = znode->parent; + if (!znode) + return NULL; + for (; n < znode->child_cnt; n++) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) + return find_first_dirty(zbr->znode); + } + return znode; +} + +/** + * get_znodes_to_commit - create list of dirty znodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of znodes to commit. + */ +static int get_znodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + int cnt = 0; + + c->cnext = find_first_dirty(c->zroot.znode); + znode = c->enext = c->cnext; + if (!znode) { + dbg_cmt("no znodes to commit"); + return 0; + } + cnt += 1; + while (1) { + ubifs_assert(!ubifs_zn_cow(znode)); + __set_bit(COW_ZNODE, &znode->flags); + znode->alt = 0; + cnext = find_next_dirty(znode); + if (!cnext) { + znode->cnext = c->cnext; + break; + } + znode->cnext = cnext; + znode = cnext; + cnt += 1; + } + dbg_cmt("committing %d znodes", cnt); + ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt)); + return cnt; +} + +/** + * alloc_idx_lebs - allocate empty LEBs to be used to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns %-ENOSPC if it cannot allocate a sufficient number of + * empty LEBs. %0 is returned on success, otherwise a negative error code + * is returned. + */ +static int alloc_idx_lebs(struct ubifs_info *c, int cnt) +{ + int i, leb_cnt, lnum; + + c->ileb_cnt = 0; + c->ileb_nxt = 0; + leb_cnt = get_leb_cnt(c, cnt); + dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt); + if (!leb_cnt) + return 0; + c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS); + if (!c->ilebs) + return -ENOMEM; + for (i = 0; i < leb_cnt; i++) { + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) + return lnum; + c->ilebs[c->ileb_cnt++] = lnum; + dbg_cmt("LEB %d", lnum); + } + if (dbg_is_chk_index(c) && !(prandom_u32() & 7)) + return -ENOSPC; + return 0; +} + +/** + * free_unused_idx_lebs - free unused LEBs that were allocated for the commit. + * @c: UBIFS file-system description object + * + * It is possible that we allocate more empty LEBs for the commit than we need. + * This functions frees the surplus. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_unused_idx_lebs(struct ubifs_info *c) +{ + int i, err = 0, lnum, er; + + for (i = c->ileb_nxt; i < c->ileb_cnt; i++) { + lnum = c->ilebs[i]; + dbg_cmt("LEB %d", lnum); + er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX | LPROPS_TAKEN, 0); + if (!err) + err = er; + } + return err; +} + +/** + * free_idx_lebs - free unused LEBs after commit end. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_idx_lebs(struct ubifs_info *c) +{ + int err; + + err = free_unused_idx_lebs(c); + kfree(c->ilebs); + c->ilebs = NULL; + return err; +} + +/** + * ubifs_tnc_start_commit - start TNC commit. + * @c: UBIFS file-system description object + * @zroot: new index root position is returned here + * + * This function prepares the list of indexing nodes to commit and lays out + * their positions on flash. If there is not enough free space it uses the + * in-gap commit method. Returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int err = 0, cnt; + + mutex_lock(&c->tnc_mutex); + err = dbg_check_tnc(c, 1); + if (err) + goto out; + cnt = get_znodes_to_commit(c); + if (cnt != 0) { + int no_space = 0; + + err = alloc_idx_lebs(c, cnt); + if (err == -ENOSPC) + no_space = 1; + else if (err) + goto out_free; + err = layout_commit(c, no_space, cnt); + if (err) + goto out_free; + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + err = free_unused_idx_lebs(c); + if (err) + goto out; + } + destroy_old_idx(c); + memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch)); + + err = ubifs_save_dirty_idx_lnums(c); + if (err) + goto out; + + spin_lock(&c->space_lock); + /* + * Although we have not finished committing yet, update size of the + * committed index ('c->bi.old_idx_sz') and zero out the index growth + * budget. It is OK to do this now, because we've reserved all the + * space which is needed to commit the index, and it is save for the + * budgeting subsystem to assume the index is already committed, + * even though it is not. + */ + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + c->bi.old_idx_sz = c->calc_idx_sz; + c->bi.uncommitted_idx = 0; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); + mutex_unlock(&c->tnc_mutex); + + dbg_cmt("number of index LEBs %d", c->lst.idx_lebs); + dbg_cmt("size of index %llu", c->calc_idx_sz); + return err; + +out_free: + free_idx_lebs(c); +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * write_index - write index nodes. + * @c: UBIFS file-system description object + * + * This function writes the index nodes whose positions were laid out in the + * layout_in_empty_space function. + */ +static int write_index(struct ubifs_info *c) +{ + struct ubifs_idx_node *idx; + struct ubifs_znode *znode, *cnext; + int i, lnum, offs, len, next_len, buf_len, buf_offs, used; + int avail, wlen, err, lnum_pos = 0, blen, nxt_offs; + + cnext = c->enext; + if (!cnext) + return 0; + + /* + * Always write index nodes to the index head so that index nodes and + * other types of nodes are never mixed in the same erase block. + */ + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + /* Allocate commit buffer */ + buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0, + LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + + while (1) { + cond_resched(); + + znode = cnext; + idx = c->cbuf + used; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err(c, "bad ref in znode"); + ubifs_dump_znode(c, znode); + if (zbr->znode) + ubifs_dump_znode(c, zbr->znode); + } + } + len = ubifs_idx_node_sz(c, znode->child_cnt); + ubifs_prepare_node(c, idx, len, 0); + + /* Determine the index node position */ + if (lnum == -1) { + lnum = c->ilebs[lnum_pos++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + offs = buf_offs + used; + + if (lnum != znode->lnum || offs != znode->offs || + len != znode->len) { + ubifs_err(c, "inconsistent znode posn"); + return -EINVAL; + } + + /* Grab some stuff from znode while we still can */ + cnext = znode->cnext; + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(ubifs_zn_cow(znode)); + + /* + * It is important that other threads should see %DIRTY_ZNODE + * flag cleared before %COW_ZNODE. Specifically, it matters in + * the 'dirty_cow_znode()' function. This is the reason for the + * first barrier. Also, we want the bit changes to be seen to + * other threads ASAP, to avoid unnecesarry copying, which is + * the reason for the second barrier. + */ + clear_bit(DIRTY_ZNODE, &znode->flags); + smp_mb__before_atomic(); + clear_bit(COW_ZNODE, &znode->flags); + smp_mb__after_atomic(); + + /* + * We have marked the znode as clean but have not updated the + * @c->clean_zn_cnt counter. If this znode becomes dirty again + * before 'free_obsolete_znodes()' is called, then + * @c->clean_zn_cnt will be decremented before it gets + * incremented (resulting in 2 decrements for the same znode). + * This means that @c->clean_zn_cnt may become negative for a + * while. + * + * Q: why we cannot increment @c->clean_zn_cnt? + * A: because we do not have the @c->tnc_mutex locked, and the + * following code would be racy and buggy: + * + * if (!ubifs_zn_obsolete(znode)) { + * atomic_long_inc(&c->clean_zn_cnt); + * atomic_long_inc(&ubifs_clean_zn_cnt); + * } + * + * Thus, we just delay the @c->clean_zn_cnt update until we + * have the mutex locked. + */ + + /* Do not access znode from this point on */ + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + nxt_offs = buf_offs + used + next_len; + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) + continue; + else + blen = buf_len; + } else { + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + + /* The buffer is full or there are no more znodes to do */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, + 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; + } + break; + } + + if (lnum != c->dbg->new_ihead_lnum || + buf_offs != c->dbg->new_ihead_offs) { + ubifs_err(c, "inconsistent ihead"); + return -EINVAL; + } + + c->ihead_lnum = lnum; + c->ihead_offs = buf_offs; + + return 0; +} + +/** + * free_obsolete_znodes - free obsolete znodes. + * @c: UBIFS file-system description object + * + * At the end of commit end, obsolete znodes are freed. + */ +static void free_obsolete_znodes(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + + cnext = c->cnext; + do { + znode = cnext; + cnext = znode->cnext; + if (ubifs_zn_obsolete(znode)) + kfree(znode); + else { + znode->cnext = NULL; + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } + } while (cnext != c->cnext); +} + +/** + * return_gap_lebs - return LEBs used by the in-gap commit method. + * @c: UBIFS file-system description object + * + * This function clears the "taken" flag for the LEBs which were used by the + * "commit in-the-gaps" method. + */ +static int return_gap_lebs(struct ubifs_info *c) +{ + int *p, err; + + if (!c->gap_lebs) + return 0; + + dbg_cmt(""); + for (p = c->gap_lebs; *p != -1; p++) { + err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); + if (err) + return err; + } + + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return 0; +} + +/** + * ubifs_tnc_end_commit - update the TNC for commit end. + * @c: UBIFS file-system description object + * + * Write the dirty znodes. + */ +int ubifs_tnc_end_commit(struct ubifs_info *c) +{ + int err; + + if (!c->cnext) + return 0; + + err = return_gap_lebs(c); + if (err) + return err; + + err = write_index(c); + if (err) + return err; + + mutex_lock(&c->tnc_mutex); + + dbg_cmt("TNC height is %d", c->zroot.znode->level + 1); + + free_obsolete_znodes(c); + + c->cnext = NULL; + kfree(c->ilebs); + c->ilebs = NULL; + + mutex_unlock(&c->tnc_mutex); + + return 0; +} diff --git a/kernel/fs/ubifs/tnc_misc.c b/kernel/fs/ubifs/tnc_misc.c new file mode 100644 index 000000000..93f5b7859 --- /dev/null +++ b/kernel/fs/ubifs/tnc_misc.c @@ -0,0 +1,494 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains miscelanious TNC-related functions shared betweend + * different files. This file does not form any logically separate TNC + * sub-system. The file was created because there is a lot of TNC code and + * putting it all in one file would make that file too big and unreadable. + */ + +#include "ubifs.h" + +/** + * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal. + * @zr: root of the subtree to traverse + * @znode: previous znode + * + * This function implements levelorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode) +{ + int level, iip, level_search = 0; + struct ubifs_znode *zn; + + ubifs_assert(zr); + + if (unlikely(!znode)) + return zr; + + if (unlikely(znode == zr)) { + if (znode->level == 0) + return NULL; + return ubifs_tnc_find_child(zr, 0); + } + + level = znode->level; + + iip = znode->iip; + while (1) { + ubifs_assert(znode->level <= zr->level); + + /* + * First walk up until there is a znode with next branch to + * look at. + */ + while (znode->parent != zr && iip >= znode->parent->child_cnt) { + znode = znode->parent; + iip = znode->iip; + } + + if (unlikely(znode->parent == zr && + iip >= znode->parent->child_cnt)) { + /* This level is done, switch to the lower one */ + level -= 1; + if (level_search || level < 0) + /* + * We were already looking for znode at lower + * level ('level_search'). As we are here + * again, it just does not exist. Or all levels + * were finished ('level < 0'). + */ + return NULL; + + level_search = 1; + iip = -1; + znode = ubifs_tnc_find_child(zr, 0); + ubifs_assert(znode); + } + + /* Switch to the next index */ + zn = ubifs_tnc_find_child(znode->parent, iip + 1); + if (!zn) { + /* No more children to look at, we have walk up */ + iip = znode->parent->child_cnt; + continue; + } + + /* Walk back down to the level we came from ('level') */ + while (zn->level != level) { + znode = zn; + zn = ubifs_tnc_find_child(zn, 0); + if (!zn) { + /* + * This path is not too deep so it does not + * reach 'level'. Try next path. + */ + iip = znode->iip; + break; + } + } + + if (zn) { + ubifs_assert(zn->level >= 0); + return zn; + } + } +} + +/** + * ubifs_search_zbranch - search znode branch. + * @c: UBIFS file-system description object + * @znode: znode to search in + * @key: key to search for + * @n: znode branch slot number is returned here + * + * This is a helper function which search branch with key @key in @znode using + * binary search. The result of the search may be: + * o exact match, then %1 is returned, and the slot number of the branch is + * stored in @n; + * o no exact match, then %0 is returned and the slot number of the left + * closest branch is returned in @n; the slot if all keys in this znode are + * greater than @key, then %-1 is returned in @n. + */ +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n) +{ + int beg = 0, end = znode->child_cnt, uninitialized_var(mid); + int uninitialized_var(cmp); + const struct ubifs_zbranch *zbr = &znode->zbranch[0]; + + ubifs_assert(end > beg); + + while (end > beg) { + mid = (beg + end) >> 1; + cmp = keys_cmp(c, key, &zbr[mid].key); + if (cmp > 0) + beg = mid + 1; + else if (cmp < 0) + end = mid; + else { + *n = mid; + return 1; + } + } + + *n = end - 1; + + /* The insert point is after *n */ + ubifs_assert(*n >= -1 && *n < znode->child_cnt); + if (*n == -1) + ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0); + else + ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0); + if (*n + 1 < znode->child_cnt) + ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0); + + return 0; +} + +/** + * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal. + * @znode: znode to start at (root of the sub-tree to traverse) + * + * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is + * ignored. + */ +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode) +{ + if (unlikely(!znode)) + return NULL; + + while (znode->level > 0) { + struct ubifs_znode *child; + + child = ubifs_tnc_find_child(znode, 0); + if (!child) + return znode; + znode = child; + } + + return znode; +} + +/** + * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal. + * @znode: previous znode + * + * This function implements postorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + ubifs_assert(znode); + if (unlikely(!znode->parent)) + return NULL; + + /* Switch to the next index in the parent */ + zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1); + if (!zn) + /* This is in fact the last child, return parent */ + return znode->parent; + + /* Go to the first znode in this new subtree */ + return ubifs_tnc_postorder_first(zn); +} + +/** + * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree. + * @znode: znode defining subtree to destroy + * + * This function destroys subtree of the TNC tree. Returns number of clean + * znodes in the subtree. + */ +long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode); + long clean_freed = 0; + int n; + + ubifs_assert(zn); + while (1) { + for (n = 0; n < zn->child_cnt; n++) { + if (!zn->zbranch[n].znode) + continue; + + if (zn->level > 0 && + !ubifs_zn_dirty(zn->zbranch[n].znode)) + clean_freed += 1; + + cond_resched(); + kfree(zn->zbranch[n].znode); + } + + if (zn == znode) { + if (!ubifs_zn_dirty(zn)) + clean_freed += 1; + kfree(zn); + return clean_freed; + } + + zn = ubifs_tnc_postorder_next(zn); + } +} + +/** + * read_znode - read an indexing node from flash and fill znode. + * @c: UBIFS file-system description object + * @lnum: LEB of the indexing node to read + * @offs: node offset + * @len: node length + * @znode: znode to read to + * + * This function reads an indexing node from the flash media and fills znode + * with the read data. Returns zero in case of success and a negative error + * code in case of failure. The read indexing node is validated and if anything + * is wrong with it, this function prints complaint messages and returns + * %-EINVAL. + */ +static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, + struct ubifs_znode *znode) +{ + int i, err, type, cmp; + struct ubifs_idx_node *idx; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err < 0) { + kfree(idx); + return err; + } + + znode->child_cnt = le16_to_cpu(idx->child_cnt); + znode->level = le16_to_cpu(idx->level); + + dbg_tnc("LEB %d:%d, level %d, %d branch", + lnum, offs, znode->level, znode->child_cnt); + + if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { + ubifs_err(c, "current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + ubifs_err(c, "max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); + err = 1; + goto out_dump; + } + + for (i = 0; i < znode->child_cnt; i++) { + const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_read(c, &br->key, &zbr->key); + zbr->lnum = le32_to_cpu(br->lnum); + zbr->offs = le32_to_cpu(br->offs); + zbr->len = le32_to_cpu(br->len); + zbr->znode = NULL; + + /* Validate branch */ + + if (zbr->lnum < c->main_first || + zbr->lnum >= c->leb_cnt || zbr->offs < 0 || + zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { + ubifs_err(c, "bad branch %d", i); + err = 2; + goto out_dump; + } + + switch (key_type(c, &zbr->key)) { + case UBIFS_INO_KEY: + case UBIFS_DATA_KEY: + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + break; + default: + ubifs_err(c, "bad key type at slot %d: %d", + i, key_type(c, &zbr->key)); + err = 3; + goto out_dump; + } + + if (znode->level) + continue; + + type = key_type(c, &zbr->key); + if (c->ranges[type].max_len == 0) { + if (zbr->len != c->ranges[type].len) { + ubifs_err(c, "bad target node (type %d) length (%d)", + type, zbr->len); + ubifs_err(c, "have to be %d", c->ranges[type].len); + err = 4; + goto out_dump; + } + } else if (zbr->len < c->ranges[type].min_len || + zbr->len > c->ranges[type].max_len) { + ubifs_err(c, "bad target node (type %d) length (%d)", + type, zbr->len); + ubifs_err(c, "have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); + err = 5; + goto out_dump; + } + } + + /* + * Ensure that the next key is greater or equivalent to the + * previous one. + */ + for (i = 0; i < znode->child_cnt - 1; i++) { + const union ubifs_key *key1, *key2; + + key1 = &znode->zbranch[i].key; + key2 = &znode->zbranch[i + 1].key; + + cmp = keys_cmp(c, key1, key2); + if (cmp > 0) { + ubifs_err(c, "bad key order (keys %d and %d)", i, i + 1); + err = 6; + goto out_dump; + } else if (cmp == 0 && !is_hash_key(c, key1)) { + /* These can only be keys with colliding hash */ + ubifs_err(c, "keys %d and %d are not hashed but equivalent", + i, i + 1); + err = 7; + goto out_dump; + } + } + + kfree(idx); + return 0; + +out_dump: + ubifs_err(c, "bad indexing node at LEB %d:%d, error %d", lnum, offs, err); + ubifs_dump_node(c, idx); + kfree(idx); + return -EINVAL; +} + +/** + * ubifs_load_znode - load znode to TNC cache. + * @c: UBIFS file-system description object + * @zbr: znode branch + * @parent: znode's parent + * @iip: index in parent + * + * This function loads znode pointed to by @zbr into the TNC cache and + * returns pointer to it in case of success and a negative error code in case + * of failure. + */ +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip) +{ + int err; + struct ubifs_znode *znode; + + ubifs_assert(!zbr->znode); + /* + * A slab cache is not presently used for znodes because the znode size + * depends on the fanout which is stored in the superblock. + */ + znode = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!znode) + return ERR_PTR(-ENOMEM); + + err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode); + if (err) + goto out; + + atomic_long_inc(&c->clean_zn_cnt); + + /* + * Increment the global clean znode counter as well. It is OK that + * global and per-FS clean znode counters may be inconsistent for some + * short time (because we might be preempted at this point), the global + * one is only used in shrinker. + */ + atomic_long_inc(&ubifs_clean_zn_cnt); + + zbr->znode = znode; + znode->parent = parent; + znode->time = get_seconds(); + znode->iip = iip; + + return znode; + +out: + kfree(znode); + return ERR_PTR(err); +} + +/** + * ubifs_tnc_read_node - read a leaf node from the flash media. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a node defined by @zbr from the flash media. Returns + * zero in case of success or a negative negative error code in case of + * failure. + */ +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + union ubifs_key key1, *key = &zbr->key; + int err, type = key_type(c, key); + struct ubifs_wbuf *wbuf; + + /* + * 'zbr' has to point to on-flash node. The node may sit in a bud and + * may even be in a write buffer, so we have to take care about this. + */ + wbuf = ubifs_get_wbuf(c, zbr->lnum); + if (wbuf) + err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len, + zbr->lnum, zbr->offs); + else + err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum, + zbr->offs); + + if (err) { + dbg_tnck(key, "key "); + return err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, node + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, key, &key1)) { + ubifs_err(c, "bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnck(key, "looked for key "); + dbg_tnck(&key1, "but found node's key "); + ubifs_dump_node(c, node); + return -EINVAL; + } + + return 0; +} diff --git a/kernel/fs/ubifs/ubifs-media.h b/kernel/fs/ubifs/ubifs-media.h new file mode 100644 index 000000000..e24380cf4 --- /dev/null +++ b/kernel/fs/ubifs/ubifs-media.h @@ -0,0 +1,784 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file describes UBIFS on-flash format and contains definitions of all the + * relevant data structures and constants. + * + * All UBIFS on-flash objects are stored in the form of nodes. All nodes start + * with the UBIFS node magic number and have the same common header. Nodes + * always sit at 8-byte aligned positions on the media and node header sizes are + * also 8-byte aligned (except for the indexing node and the padding node). + */ + +#ifndef __UBIFS_MEDIA_H__ +#define __UBIFS_MEDIA_H__ + +/* UBIFS node magic number (must not have the padding byte first or last) */ +#define UBIFS_NODE_MAGIC 0x06101831 + +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format is changing. If this happens, UBIFS is will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature. + * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ +#define UBIFS_FORMAT_VERSION 4 + +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. + */ +#define UBIFS_RO_COMPAT_VERSION 0 + +/* Minimum logical eraseblock size in bytes */ +#define UBIFS_MIN_LEB_SZ (15*1024) + +/* Initial CRC32 value used when calculating CRC checksums */ +#define UBIFS_CRC32_INIT 0xFFFFFFFFU + +/* + * UBIFS does not try to compress data if its length is less than the below + * constant. + */ +#define UBIFS_MIN_COMPR_LEN 128 + +/* + * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes + * shorter than uncompressed data length, UBIFS prefers to leave this data + * node uncompress, because it'll be read faster. + */ +#define UBIFS_MIN_COMPRESS_DIFF 64 + +/* Root inode number */ +#define UBIFS_ROOT_INO 1 + +/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */ +#define UBIFS_FIRST_INO 64 + +/* + * Maximum file name and extended attribute length (must be a multiple of 8, + * minus 1). + */ +#define UBIFS_MAX_NLEN 255 + +/* Maximum number of data journal heads */ +#define UBIFS_MAX_JHEADS 1 + +/* + * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system, + * which means that it does not treat the underlying media as consisting of + * blocks like in case of hard drives. Do not be confused. UBIFS block is just + * the maximum amount of data which one data node can have or which can be + * attached to an inode node. + */ +#define UBIFS_BLOCK_SIZE 4096 +#define UBIFS_BLOCK_SHIFT 12 + +/* UBIFS padding byte pattern (must not be first or last byte of node magic) */ +#define UBIFS_PADDING_BYTE 0xCE + +/* Maximum possible key length */ +#define UBIFS_MAX_KEY_LEN 16 + +/* Key length ("simple" format) */ +#define UBIFS_SK_LEN 8 + +/* Minimum index tree fanout */ +#define UBIFS_MIN_FANOUT 3 + +/* Maximum number of levels in UBIFS indexing B-tree */ +#define UBIFS_MAX_LEVELS 512 + +/* Maximum amount of data attached to an inode in bytes */ +#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE + +/* LEB Properties Tree fanout (must be power of 2) and fanout shift */ +#define UBIFS_LPT_FANOUT 4 +#define UBIFS_LPT_FANOUT_SHIFT 2 + +/* LEB Properties Tree bit field sizes */ +#define UBIFS_LPT_CRC_BITS 16 +#define UBIFS_LPT_CRC_BYTES 2 +#define UBIFS_LPT_TYPE_BITS 4 + +/* The key is always at the same position in all keyed nodes */ +#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) + +/* Garbage collector journal head number */ +#define UBIFS_GC_HEAD 0 +/* Base journal head number */ +#define UBIFS_BASE_HEAD 1 +/* Data journal head number */ +#define UBIFS_DATA_HEAD 2 + +/* + * LEB Properties Tree node types. + * + * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties) + * UBIFS_LPT_NNODE: LPT internal node + * UBIFS_LPT_LTAB: LPT's own lprops table + * UBIFS_LPT_LSAVE: LPT's save table (big model only) + * UBIFS_LPT_NODE_CNT: count of LPT node types + * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type + */ +enum { + UBIFS_LPT_PNODE, + UBIFS_LPT_NNODE, + UBIFS_LPT_LTAB, + UBIFS_LPT_LSAVE, + UBIFS_LPT_NODE_CNT, + UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1, +}; + +/* + * UBIFS inode types. + * + * UBIFS_ITYPE_REG: regular file + * UBIFS_ITYPE_DIR: directory + * UBIFS_ITYPE_LNK: soft link + * UBIFS_ITYPE_BLK: block device node + * UBIFS_ITYPE_CHR: character device node + * UBIFS_ITYPE_FIFO: fifo + * UBIFS_ITYPE_SOCK: socket + * UBIFS_ITYPES_CNT: count of supported file types + */ +enum { + UBIFS_ITYPE_REG, + UBIFS_ITYPE_DIR, + UBIFS_ITYPE_LNK, + UBIFS_ITYPE_BLK, + UBIFS_ITYPE_CHR, + UBIFS_ITYPE_FIFO, + UBIFS_ITYPE_SOCK, + UBIFS_ITYPES_CNT, +}; + +/* + * Supported key hash functions. + * + * UBIFS_KEY_HASH_R5: R5 hash + * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name + */ +enum { + UBIFS_KEY_HASH_R5, + UBIFS_KEY_HASH_TEST, +}; + +/* + * Supported key formats. + * + * UBIFS_SIMPLE_KEY_FMT: simple key format + */ +enum { + UBIFS_SIMPLE_KEY_FMT, +}; + +/* + * The simple key format uses 29 bits for storing UBIFS block number and hash + * value. + */ +#define UBIFS_S_KEY_BLOCK_BITS 29 +#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF +#define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS +#define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK + +/* + * Key types. + * + * UBIFS_INO_KEY: inode node key + * UBIFS_DATA_KEY: data node key + * UBIFS_DENT_KEY: directory entry node key + * UBIFS_XENT_KEY: extended attribute entry key + * UBIFS_KEY_TYPES_CNT: number of supported key types + */ +enum { + UBIFS_INO_KEY, + UBIFS_DATA_KEY, + UBIFS_DENT_KEY, + UBIFS_XENT_KEY, + UBIFS_KEY_TYPES_CNT, +}; + +/* Count of LEBs reserved for the superblock area */ +#define UBIFS_SB_LEBS 1 +/* Count of LEBs reserved for the master area */ +#define UBIFS_MST_LEBS 2 + +/* First LEB of the superblock area */ +#define UBIFS_SB_LNUM 0 +/* First LEB of the master area */ +#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS) +/* First LEB of the log area */ +#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS) + +/* + * The below constants define the absolute minimum values for various UBIFS + * media areas. Many of them actually depend of flash geometry and the FS + * configuration (number of journal heads, orphan LEBs, etc). This means that + * the smallest volume size which can be used for UBIFS cannot be pre-defined + * by these constants. The file-system that meets the below limitation will not + * necessarily mount. UBIFS does run-time calculations and validates the FS + * size. + */ + +/* Minimum number of logical eraseblocks in the log */ +#define UBIFS_MIN_LOG_LEBS 2 +/* Minimum number of bud logical eraseblocks (one for each head) */ +#define UBIFS_MIN_BUD_LEBS 3 +/* Minimum number of journal logical eraseblocks */ +#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS) +/* Minimum number of LPT area logical eraseblocks */ +#define UBIFS_MIN_LPT_LEBS 2 +/* Minimum number of orphan area logical eraseblocks */ +#define UBIFS_MIN_ORPH_LEBS 1 +/* + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 + * for GC, 1 for deletions, and at least 1 for committed data). + */ +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) + +/* Minimum number of logical eraseblocks */ +#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ + UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \ + UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS) + +/* Node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_CH_SZ sizeof(struct ubifs_ch) +#define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node) +#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node) +#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node) +#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node) +#define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node) +#define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node) +#define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node) +#define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node) +#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node) +#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) +#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node) +/* Extended attribute entry nodes are identical to directory entry nodes */ +#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ +/* Only this does not have to be multiple of 8 bytes */ +#define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch) + +/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) +#define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA) +#define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1) +#define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ + +/* The largest UBIFS node */ +#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ + +/* + * On-flash inode flags. + * + * UBIFS_COMPR_FL: use compression for this inode + * UBIFS_SYNC_FL: I/O on this inode has to be synchronous + * UBIFS_IMMUTABLE_FL: inode is immutable + * UBIFS_APPEND_FL: writes to the inode may only append data + * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous + * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * + * Note, these are on-flash flags which correspond to ioctl flags + * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not + * have to be the same. + */ +enum { + UBIFS_COMPR_FL = 0x01, + UBIFS_SYNC_FL = 0x02, + UBIFS_IMMUTABLE_FL = 0x04, + UBIFS_APPEND_FL = 0x08, + UBIFS_DIRSYNC_FL = 0x10, + UBIFS_XATTR_FL = 0x20, +}; + +/* Inode flag bits used by UBIFS */ +#define UBIFS_FL_MASK 0x0000001F + +/* + * UBIFS compression algorithms. + * + * UBIFS_COMPR_NONE: no compression + * UBIFS_COMPR_LZO: LZO compression + * UBIFS_COMPR_ZLIB: ZLIB compression + * UBIFS_COMPR_TYPES_CNT: count of supported compression types + */ +enum { + UBIFS_COMPR_NONE, + UBIFS_COMPR_LZO, + UBIFS_COMPR_ZLIB, + UBIFS_COMPR_TYPES_CNT, +}; + +/* + * UBIFS node types. + * + * UBIFS_INO_NODE: inode node + * UBIFS_DATA_NODE: data node + * UBIFS_DENT_NODE: directory entry node + * UBIFS_XENT_NODE: extended attribute node + * UBIFS_TRUN_NODE: truncation node + * UBIFS_PAD_NODE: padding node + * UBIFS_SB_NODE: superblock node + * UBIFS_MST_NODE: master node + * UBIFS_REF_NODE: LEB reference node + * UBIFS_IDX_NODE: index node + * UBIFS_CS_NODE: commit start node + * UBIFS_ORPH_NODE: orphan node + * UBIFS_NODE_TYPES_CNT: count of supported node types + * + * Note, we index arrays by these numbers, so keep them low and contiguous. + * Node type constants for inodes, direntries and so on have to be the same as + * corresponding key type constants. + */ +enum { + UBIFS_INO_NODE, + UBIFS_DATA_NODE, + UBIFS_DENT_NODE, + UBIFS_XENT_NODE, + UBIFS_TRUN_NODE, + UBIFS_PAD_NODE, + UBIFS_SB_NODE, + UBIFS_MST_NODE, + UBIFS_REF_NODE, + UBIFS_IDX_NODE, + UBIFS_CS_NODE, + UBIFS_ORPH_NODE, + UBIFS_NODE_TYPES_CNT, +}; + +/* + * Master node flags. + * + * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty + * UBIFS_MST_NO_ORPHS: no orphan inodes present + * UBIFS_MST_RCVRY: written by recovery + */ +enum { + UBIFS_MST_DIRTY = 1, + UBIFS_MST_NO_ORPHS = 2, + UBIFS_MST_RCVRY = 4, +}; + +/* + * Node group type (used by recovery to recover whole group or none). + * + * UBIFS_NO_NODE_GROUP: this node is not part of a group + * UBIFS_IN_NODE_GROUP: this node is a part of a group + * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group + */ +enum { + UBIFS_NO_NODE_GROUP = 0, + UBIFS_IN_NODE_GROUP, + UBIFS_LAST_OF_NODE_GROUP, +}; + +/* + * Superblock flags. + * + * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set + * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed + */ +enum { + UBIFS_FLG_BIGLPT = 0x02, + UBIFS_FLG_SPACE_FIXUP = 0x04, +}; + +/** + * struct ubifs_ch - common header node. + * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) + * @crc: CRC-32 checksum of the node header + * @sqnum: sequence number + * @len: full node length + * @node_type: node type + * @group_type: node group type + * @padding: reserved for future, zeroes + * + * Every UBIFS node starts with this common part. If the node has a key, the + * key always goes next. + */ +struct ubifs_ch { + __le32 magic; + __le32 crc; + __le64 sqnum; + __le32 len; + __u8 node_type; + __u8 group_type; + __u8 padding[2]; +} __packed; + +/** + * union ubifs_dev_desc - device node descriptor. + * @new: new type device descriptor + * @huge: huge type device descriptor + * + * This data structure describes major/minor numbers of a device node. In an + * inode is a device node then its data contains an object of this type. UBIFS + * uses standard Linux "new" and "huge" device node encodings. + */ +union ubifs_dev_desc { + __le32 new; + __le64 huge; +} __packed; + +/** + * struct ubifs_ino_node - inode node. + * @ch: common header + * @key: node key + * @creat_sqnum: sequence number at time of creation + * @size: inode size in bytes (amount of uncompressed data) + * @atime_sec: access time seconds + * @ctime_sec: creation time seconds + * @mtime_sec: modification time seconds + * @atime_nsec: access time nanoseconds + * @ctime_nsec: creation time nanoseconds + * @mtime_nsec: modification time nanoseconds + * @nlink: number of hard links + * @uid: owner ID + * @gid: group ID + * @mode: access flags + * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc) + * @data_len: inode data length + * @xattr_cnt: count of extended attributes this inode has + * @xattr_size: summarized size of all extended attributes in bytes + * @padding1: reserved for future, zeroes + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @compr_type: compression type used for this inode + * @padding2: reserved for future, zeroes + * @data: data attached to the inode + * + * Note, even though inode compression type is defined by @compr_type, some + * nodes of this inode may be compressed with different compressor - this + * happens if compression type is changed while the inode already has data + * nodes. But @compr_type will be use for further writes to the inode. + * + * Note, do not forget to amend 'zero_ino_node_unused()' function when changing + * the padding fields. + */ +struct ubifs_ino_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 creat_sqnum; + __le64 size; + __le64 atime_sec; + __le64 ctime_sec; + __le64 mtime_sec; + __le32 atime_nsec; + __le32 ctime_nsec; + __le32 mtime_nsec; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le32 flags; + __le32 data_len; + __le32 xattr_cnt; + __le32 xattr_size; + __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */ + __le32 xattr_names; + __le16 compr_type; + __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ + __u8 data[]; +} __packed; + +/** + * struct ubifs_dent_node - directory entry node. + * @ch: common header + * @key: node key + * @inum: target inode number + * @padding1: reserved for future, zeroes + * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) + * @nlen: name length + * @padding2: reserved for future, zeroes + * @name: zero-terminated name + * + * Note, do not forget to amend 'zero_dent_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_dent_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 inum; + __u8 padding1; + __u8 type; + __le16 nlen; + __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ + __u8 name[]; +} __packed; + +/** + * struct ubifs_data_node - data node. + * @ch: common header + * @key: node key + * @size: uncompressed data size in bytes + * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) + * @padding: reserved for future, zeroes + * @data: data + * + * Note, do not forget to amend 'zero_data_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_data_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le32 size; + __le16 compr_type; + __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __u8 data[]; +} __packed; + +/** + * struct ubifs_trun_node - truncation node. + * @ch: common header + * @inum: truncated inode number + * @padding: reserved for future, zeroes + * @old_size: size before truncation + * @new_size: size after truncation + * + * This node exists only in the journal and never goes to the main area. Note, + * do not forget to amend 'zero_trun_node_unused()' function when changing the + * padding fields. + */ +struct ubifs_trun_node { + struct ubifs_ch ch; + __le32 inum; + __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ + __le64 old_size; + __le64 new_size; +} __packed; + +/** + * struct ubifs_pad_node - padding node. + * @ch: common header + * @pad_len: how many bytes after this node are unused (because padded) + * @padding: reserved for future, zeroes + */ +struct ubifs_pad_node { + struct ubifs_ch ch; + __le32 pad_len; +} __packed; + +/** + * struct ubifs_sb_node - superblock node. + * @ch: common header + * @padding: reserved for future, zeroes + * @key_hash: type of hash function used in keys + * @key_fmt: format of the key + * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc) + * @min_io_size: minimal input/output unit size + * @leb_size: logical eraseblock size in bytes + * @leb_cnt: count of LEBs used by file-system + * @max_leb_cnt: maximum count of LEBs used by file-system + * @max_bud_bytes: maximum amount of data stored in buds + * @log_lebs: log size in logical eraseblocks + * @lpt_lebs: number of LEBs used for lprops table + * @orph_lebs: number of LEBs used for recording orphans + * @jhead_cnt: count of journal heads + * @fanout: tree fanout (max. number of links per indexing node) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @fmt_version: UBIFS on-flash format version + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @padding1: reserved for future, zeroes + * @rp_uid: reserve pool UID + * @rp_gid: reserve pool GID + * @rp_size: size of the reserved pool in bytes + * @padding2: reserved for future, zeroes + * @time_gran: time granularity in nanoseconds + * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version + */ +struct ubifs_sb_node { + struct ubifs_ch ch; + __u8 padding[2]; + __u8 key_hash; + __u8 key_fmt; + __le32 flags; + __le32 min_io_size; + __le32 leb_size; + __le32 leb_cnt; + __le32 max_leb_cnt; + __le64 max_bud_bytes; + __le32 log_lebs; + __le32 lpt_lebs; + __le32 orph_lebs; + __le32 jhead_cnt; + __le32 fanout; + __le32 lsave_cnt; + __le32 fmt_version; + __le16 default_compr; + __u8 padding1[2]; + __le32 rp_uid; + __le32 rp_gid; + __le64 rp_size; + __le32 time_gran; + __u8 uuid[16]; + __le32 ro_compat_version; + __u8 padding2[3968]; +} __packed; + +/** + * struct ubifs_mst_node - master node. + * @ch: common header + * @highest_inum: highest inode number in the committed index + * @cmt_no: commit number + * @flags: various flags (%UBIFS_MST_DIRTY, etc) + * @log_lnum: start of the log + * @root_lnum: LEB number of the root indexing node + * @root_offs: offset within @root_lnum + * @root_len: root indexing node length + * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was + * not reserved and should be reserved on mount) + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @index_size: size of index on flash + * @total_free: total free space in bytes + * @total_dirty: total dirty space in bytes + * @total_used: total used space in bytes (includes only data LEBs) + * @total_dead: total dead space in bytes (includes only data LEBs) + * @total_dark: total dark space in bytes (includes only data LEBs) + * @lpt_lnum: LEB number of LPT root nnode + * @lpt_offs: offset of LPT root nnode + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @lsave_lnum: LEB number of LPT's save table (big model only) + * @lsave_offs: offset of LPT's save table (big model only) + * @lscan_lnum: LEB number of last LPT scan + * @empty_lebs: number of empty logical eraseblocks + * @idx_lebs: number of indexing logical eraseblocks + * @leb_cnt: count of LEBs used by file-system + * @padding: reserved for future, zeroes + */ +struct ubifs_mst_node { + struct ubifs_ch ch; + __le64 highest_inum; + __le64 cmt_no; + __le32 flags; + __le32 log_lnum; + __le32 root_lnum; + __le32 root_offs; + __le32 root_len; + __le32 gc_lnum; + __le32 ihead_lnum; + __le32 ihead_offs; + __le64 index_size; + __le64 total_free; + __le64 total_dirty; + __le64 total_used; + __le64 total_dead; + __le64 total_dark; + __le32 lpt_lnum; + __le32 lpt_offs; + __le32 nhead_lnum; + __le32 nhead_offs; + __le32 ltab_lnum; + __le32 ltab_offs; + __le32 lsave_lnum; + __le32 lsave_offs; + __le32 lscan_lnum; + __le32 empty_lebs; + __le32 idx_lebs; + __le32 leb_cnt; + __u8 padding[344]; +} __packed; + +/** + * struct ubifs_ref_node - logical eraseblock reference node. + * @ch: common header + * @lnum: the referred logical eraseblock number + * @offs: start offset in the referred LEB + * @jhead: journal head number + * @padding: reserved for future, zeroes + */ +struct ubifs_ref_node { + struct ubifs_ch ch; + __le32 lnum; + __le32 offs; + __le32 jhead; + __u8 padding[28]; +} __packed; + +/** + * struct ubifs_branch - key/reference/length branch + * @lnum: LEB number of the target node + * @offs: offset within @lnum + * @len: target node length + * @key: key + */ +struct ubifs_branch { + __le32 lnum; + __le32 offs; + __le32 len; + __u8 key[]; +} __packed; + +/** + * struct ubifs_idx_node - indexing node. + * @ch: common header + * @child_cnt: number of child index nodes + * @level: tree level + * @branches: LEB number / offset / length / key branches + */ +struct ubifs_idx_node { + struct ubifs_ch ch; + __le16 child_cnt; + __le16 level; + __u8 branches[]; +} __packed; + +/** + * struct ubifs_cs_node - commit start node. + * @ch: common header + * @cmt_no: commit number + */ +struct ubifs_cs_node { + struct ubifs_ch ch; + __le64 cmt_no; +} __packed; + +/** + * struct ubifs_orph_node - orphan node. + * @ch: common header + * @cmt_no: commit number (also top bit is set on the last node of the commit) + * @inos: inode numbers of orphans + */ +struct ubifs_orph_node { + struct ubifs_ch ch; + __le64 cmt_no; + __le64 inos[]; +} __packed; + +#endif /* __UBIFS_MEDIA_H__ */ diff --git a/kernel/fs/ubifs/ubifs.h b/kernel/fs/ubifs/ubifs.h new file mode 100644 index 000000000..de759022f --- /dev/null +++ b/kernel/fs/ubifs/ubifs.h @@ -0,0 +1,1803 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_H__ +#define __UBIFS_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs-media.h" + +/* Version of this UBIFS implementation */ +#define UBIFS_VERSION 1 + +/* Normal UBIFS messages */ +#define ubifs_msg(c, fmt, ...) \ + pr_notice("UBIFS (ubi%d:%d): " fmt "\n", \ + (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__) +/* UBIFS error messages */ +#define ubifs_err(c, fmt, ...) \ + pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n", \ + (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \ + __func__, ##__VA_ARGS__) +/* UBIFS warning messages */ +#define ubifs_warn(c, fmt, ...) \ + pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n", \ + (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \ + __func__, ##__VA_ARGS__) +/* + * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description + * object as an argument. + */ +#define ubifs_errc(c, fmt, ...) \ + do { \ + if (!(c)->probing) \ + ubifs_err(c, fmt, ##__VA_ARGS__); \ + } while (0) + +/* UBIFS file system VFS magic number */ +#define UBIFS_SUPER_MAGIC 0x24051905 + +/* Number of UBIFS blocks per VFS page */ +#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE) +#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT) + +/* "File system end of life" sequence number watermark */ +#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL +#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL + +/* + * Minimum amount of LEBs reserved for the index. At present the index needs at + * least 2 LEBs: one for the index head and one for in-the-gaps method (which + * currently does not cater for the index head and so excludes it from + * consideration). + */ +#define MIN_INDEX_LEBS 2 + +/* Minimum amount of data UBIFS writes to the flash */ +#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) + +/* + * Currently we do not support inode number overlapping and re-using, so this + * watermark defines dangerous inode number level. This should be fixed later, + * although it is difficult to exceed current limit. Another option is to use + * 64-bit inode numbers, but this means more overhead. + */ +#define INUM_WARN_WATERMARK 0xFFF00000 +#define INUM_WATERMARK 0xFFFFFF00 + +/* Maximum number of entries in each LPT (LEB category) heap */ +#define LPT_HEAP_SZ 256 + +/* + * Background thread name pattern. The numbers are UBI device and volume + * numbers. + */ +#define BGT_NAME_PATTERN "ubifs_bgt%d_%d" + +/* Write-buffer synchronization timeout interval in seconds */ +#define WBUF_TIMEOUT_SOFTLIMIT 3 +#define WBUF_TIMEOUT_HARDLIMIT 5 + +/* Maximum possible inode number (only 32-bit inodes are supported now) */ +#define MAX_INUM 0xFFFFFFFF + +/* Number of non-data journal heads */ +#define NONDATA_JHEADS_CNT 2 + +/* Shorter names for journal head numbers for internal usage */ +#define GCHD UBIFS_GC_HEAD +#define BASEHD UBIFS_BASE_HEAD +#define DATAHD UBIFS_DATA_HEAD + +/* 'No change' value for 'ubifs_change_lp()' */ +#define LPROPS_NC 0x80000001 + +/* + * There is no notion of truncation key because truncation nodes do not exist + * in TNC. However, when replaying, it is handy to introduce fake "truncation" + * keys for truncation nodes because the code becomes simpler. So we define + * %UBIFS_TRUN_KEY type. + * + * But otherwise, out of the journal reply scope, the truncation keys are + * invalid. + */ +#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT +#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT + +/* + * How much a directory entry/extended attribute entry adds to the parent/host + * inode. + */ +#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8) + +/* How much an extended attribute adds to the host inode */ +#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8) + +/* + * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered + * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are + * considered "young". This is used by shrinker when selecting znode to trim + * off. + */ +#define OLD_ZNODE_AGE 20 +#define YOUNG_ZNODE_AGE 5 + +/* + * Some compressors, like LZO, may end up with more data then the input buffer. + * So UBIFS always allocates larger output buffer, to be sure the compressor + * will not corrupt memory in case of worst case compression. + */ +#define WORST_COMPR_FACTOR 2 + +/* + * How much memory is needed for a buffer where we compress a data node. + */ +#define COMPRESSED_DATA_NODE_BUF_SZ \ + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + +/* Maximum expected tree height for use by bottom_up_buf */ +#define BOTTOM_UP_HEIGHT 64 + +/* Maximum number of data nodes to bulk-read */ +#define UBIFS_MAX_BULK_READ 32 + +/* + * Lockdep classes for UBIFS inode @ui_mutex. + */ +enum { + WB_MUTEX_1 = 0, + WB_MUTEX_2 = 1, + WB_MUTEX_3 = 2, +}; + +/* + * Znode flags (actually, bit numbers which store the flags). + * + * DIRTY_ZNODE: znode is dirty + * COW_ZNODE: znode is being committed and a new instance of this znode has to + * be created before changing this znode + * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is + * still in the commit list and the ongoing commit operation + * will commit it, and delete this znode after it is done + */ +enum { + DIRTY_ZNODE = 0, + COW_ZNODE = 1, + OBSOLETE_ZNODE = 2, +}; + +/* + * Commit states. + * + * COMMIT_RESTING: commit is not wanted + * COMMIT_BACKGROUND: background commit has been requested + * COMMIT_REQUIRED: commit is required + * COMMIT_RUNNING_BACKGROUND: background commit is running + * COMMIT_RUNNING_REQUIRED: commit is running and it is required + * COMMIT_BROKEN: commit failed + */ +enum { + COMMIT_RESTING = 0, + COMMIT_BACKGROUND, + COMMIT_REQUIRED, + COMMIT_RUNNING_BACKGROUND, + COMMIT_RUNNING_REQUIRED, + COMMIT_BROKEN, +}; + +/* + * 'ubifs_scan_a_node()' return values. + * + * SCANNED_GARBAGE: scanned garbage + * SCANNED_EMPTY_SPACE: scanned empty space + * SCANNED_A_NODE: scanned a valid node + * SCANNED_A_CORRUPT_NODE: scanned a corrupted node + * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length + * + * Greater than zero means: 'scanned that number of padding bytes' + */ +enum { + SCANNED_GARBAGE = 0, + SCANNED_EMPTY_SPACE = -1, + SCANNED_A_NODE = -2, + SCANNED_A_CORRUPT_NODE = -3, + SCANNED_A_BAD_PAD_NODE = -4, +}; + +/* + * LPT cnode flag bits. + * + * DIRTY_CNODE: cnode is dirty + * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), + * so it can (and must) be freed when the commit is finished + * COW_CNODE: cnode is being committed and must be copied before writing + */ +enum { + DIRTY_CNODE = 0, + OBSOLETE_CNODE = 1, + COW_CNODE = 2, +}; + +/* + * Dirty flag bits (lpt_drty_flgs) for LPT special nodes. + * + * LTAB_DIRTY: ltab node is dirty + * LSAVE_DIRTY: lsave node is dirty + */ +enum { + LTAB_DIRTY = 1, + LSAVE_DIRTY = 2, +}; + +/* + * Return codes used by the garbage collector. + * @LEB_FREED: the logical eraseblock was freed and is ready to use + * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit + * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes + */ +enum { + LEB_FREED, + LEB_FREED_IDX, + LEB_RETAINED, +}; + +/** + * struct ubifs_old_idx - index node obsoleted since last commit start. + * @rb: rb-tree node + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + */ +struct ubifs_old_idx { + struct rb_node rb; + int lnum; + int offs; +}; + +/* The below union makes it easier to deal with keys */ +union ubifs_key { + uint8_t u8[UBIFS_SK_LEN]; + uint32_t u32[UBIFS_SK_LEN/4]; + uint64_t u64[UBIFS_SK_LEN/8]; + __le32 j32[UBIFS_SK_LEN/4]; +}; + +/** + * struct ubifs_scan_node - UBIFS scanned node information. + * @list: list of scanned nodes + * @key: key of node scanned (if it has one) + * @sqnum: sequence number + * @type: type of node scanned + * @offs: offset with LEB of node scanned + * @len: length of node scanned + * @node: raw node + */ +struct ubifs_scan_node { + struct list_head list; + union ubifs_key key; + unsigned long long sqnum; + int type; + int offs; + int len; + void *node; +}; + +/** + * struct ubifs_scan_leb - UBIFS scanned LEB information. + * @lnum: logical eraseblock number + * @nodes_cnt: number of nodes scanned + * @nodes: list of struct ubifs_scan_node + * @endpt: end point (and therefore the start of empty space) + * @buf: buffer containing entire LEB scanned + */ +struct ubifs_scan_leb { + int lnum; + int nodes_cnt; + struct list_head nodes; + int endpt; + void *buf; +}; + +/** + * struct ubifs_gced_idx_leb - garbage-collected indexing LEB. + * @list: list + * @lnum: LEB number + * @unmap: OK to unmap this LEB + * + * This data structure is used to temporary store garbage-collected indexing + * LEBs - they are not released immediately, but only after the next commit. + * This is needed to guarantee recoverability. + */ +struct ubifs_gced_idx_leb { + struct list_head list; + int lnum; + int unmap; +}; + +/** + * struct ubifs_inode - UBIFS in-memory inode description. + * @vfs_inode: VFS inode description object + * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + * protected by @c->commit_sem; + * @xattr_size: summarized size of all extended attributes in bytes + * @xattr_cnt: count of extended attributes this inode has + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @dirty: non-zero if the inode is dirty + * @xattr: non-zero if this is an extended attribute inode + * @bulk_read: non-zero if bulk-read should be used + * @ui_mutex: serializes inode write-back with the rest of VFS operations, + * serializes "clean <-> dirty" state changes, serializes bulk-read, + * protects @dirty, @bulk_read, @ui_size, and @xattr_size + * @ui_lock: protects @synced_i_size + * @synced_i_size: synchronized size of inode, i.e. the value of inode size + * currently stored on the flash; used only for regular file + * inodes + * @ui_size: inode size used by UBIFS when writing to flash + * @flags: inode flags (@UBIFS_COMPR_FL, etc) + * @compr_type: default compression type used for this inode + * @last_page_read: page number of last page read (for bulk read) + * @read_in_a_row: number of consecutive pages read in a row (for bulk read) + * @data_len: length of the data attached to the inode + * @data: inode's data + * + * @ui_mutex exists for two main reasons. At first it prevents inodes from + * being written back while UBIFS changing them, being in the middle of an VFS + * operation. This way UBIFS makes sure the inode fields are consistent. For + * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * write-back must not write any of them before we have finished. + * + * The second reason is budgeting - UBIFS has to budget all operations. If an + * operation is going to mark an inode dirty, it has to allocate budget for + * this. It cannot just mark it dirty because there is no guarantee there will + * be enough flash space to write the inode back later. This means UBIFS has + * to have full control over inode "clean <-> dirty" transitions (and pages + * actually). But unfortunately, VFS marks inodes dirty in many places, and it + * does not ask the file-system if it is allowed to do so (there is a notifier, + * but it is not enough), i.e., there is no mechanism to synchronize with this. + * So UBIFS has its own inode dirty flag and its own mutex to serialize + * "clean <-> dirty" transitions. + * + * The @synced_i_size field is used to make sure we never write pages which are + * beyond last synchronized inode size. See 'ubifs_writepage()' for more + * information. + * + * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses + * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot + * make sure @inode->i_size is always changed under @ui_mutex, because it + * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would + * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields + * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * could consider to rework locking and base it on "shadow" fields. + */ +struct ubifs_inode { + struct inode vfs_inode; + unsigned long long creat_sqnum; + unsigned long long del_cmtno; + unsigned int xattr_size; + unsigned int xattr_cnt; + unsigned int xattr_names; + unsigned int dirty:1; + unsigned int xattr:1; + unsigned int bulk_read:1; + unsigned int compr_type:2; + struct mutex ui_mutex; + spinlock_t ui_lock; + loff_t synced_i_size; + loff_t ui_size; + int flags; + pgoff_t last_page_read; + pgoff_t read_in_a_row; + int data_len; + void *data; +}; + +/** + * struct ubifs_unclean_leb - records a LEB recovered under read-only mode. + * @list: list + * @lnum: LEB number of recovered LEB + * @endpt: offset where recovery ended + * + * This structure records a LEB identified during recovery that needs to be + * cleaned but was not because UBIFS was mounted read-only. The information + * is used to clean the LEB when remounting to read-write mode. + */ +struct ubifs_unclean_leb { + struct list_head list; + int lnum; + int endpt; +}; + +/* + * LEB properties flags. + * + * LPROPS_UNCAT: not categorized + * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index + * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index + * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index + * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs + * LPROPS_EMPTY: LEB is empty, not taken + * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken + * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken + * LPROPS_CAT_MASK: mask for the LEB categories above + * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media) + * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash) + */ +enum { + LPROPS_UNCAT = 0, + LPROPS_DIRTY = 1, + LPROPS_DIRTY_IDX = 2, + LPROPS_FREE = 3, + LPROPS_HEAP_CNT = 3, + LPROPS_EMPTY = 4, + LPROPS_FREEABLE = 5, + LPROPS_FRDI_IDX = 6, + LPROPS_CAT_MASK = 15, + LPROPS_TAKEN = 16, + LPROPS_INDEX = 32, +}; + +/** + * struct ubifs_lprops - logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @flags: LEB properties flags (see above) + * @lnum: LEB number + * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE) + * @hpos: heap position in heap of same-category lprops (other categories) + */ +struct ubifs_lprops { + int free; + int dirty; + int flags; + int lnum; + union { + struct list_head list; + int hpos; + }; +}; + +/** + * struct ubifs_lpt_lprops - LPT logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @tgc: trivial GC flag (1 => unmap after commit end) + * @cmt: commit flag (1 => reserved for commit) + */ +struct ubifs_lpt_lprops { + int free; + int dirty; + unsigned tgc:1; + unsigned cmt:1; +}; + +/** + * struct ubifs_lp_stats - statistics of eraseblocks in the main area. + * @empty_lebs: number of empty LEBs + * @taken_empty_lebs: number of taken LEBs + * @idx_lebs: number of indexing LEBs + * @total_free: total free space in bytes (includes all LEBs) + * @total_dirty: total dirty space in bytes (includes all LEBs) + * @total_used: total used space in bytes (does not include index LEBs) + * @total_dead: total dead space in bytes (does not include index LEBs) + * @total_dark: total dark space in bytes (does not include index LEBs) + * + * The @taken_empty_lebs field counts the LEBs that are in the transient state + * of having been "taken" for use but not yet written to. @taken_empty_lebs is + * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be + * used by itself (in which case 'unused_lebs' would be a better name). In the + * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained + * by GC, but unlike other empty LEBs that are "taken", it may not be written + * straight away (i.e. before the next commit start or unmount), so either + * @gc_lnum must be specially accounted for, or the current approach followed + * i.e. count it under @taken_empty_lebs. + * + * @empty_lebs includes @taken_empty_lebs. + * + * @total_used, @total_dead and @total_dark fields do not account indexing + * LEBs. + */ +struct ubifs_lp_stats { + int empty_lebs; + int taken_empty_lebs; + int idx_lebs; + long long total_free; + long long total_dirty; + long long total_used; + long long total_dead; + long long total_dark; +}; + +struct ubifs_nnode; + +/** + * struct ubifs_cnode - LEB Properties Tree common node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (zero for pnodes, greater than zero for nnodes) + * @num: node number + */ +struct ubifs_cnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; +}; + +/** + * struct ubifs_pnode - LEB Properties Tree leaf node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always zero for pnodes) + * @num: node number + * @lprops: LEB properties array + */ +struct ubifs_pnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_lprops lprops[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_nbranch - LEB Properties Tree internal node branch. + * @lnum: LEB number of child + * @offs: offset of child + * @nnode: nnode child + * @pnode: pnode child + * @cnode: cnode child + */ +struct ubifs_nbranch { + int lnum; + int offs; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + }; +}; + +/** + * struct ubifs_nnode - LEB Properties Tree internal node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always greater than zero for nnodes) + * @num: node number + * @nbranch: branches to child nodes + */ +struct ubifs_nnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_lpt_heap - heap of categorized lprops. + * @arr: heap array + * @cnt: number in heap + * @max_cnt: maximum number allowed in heap + * + * There are %LPROPS_HEAP_CNT heaps. + */ +struct ubifs_lpt_heap { + struct ubifs_lprops **arr; + int cnt; + int max_cnt; +}; + +/* + * Return codes for LPT scan callback function. + * + * LPT_SCAN_CONTINUE: continue scanning + * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory + * LPT_SCAN_STOP: stop scanning + */ +enum { + LPT_SCAN_CONTINUE = 0, + LPT_SCAN_ADD = 1, + LPT_SCAN_STOP = 2, +}; + +struct ubifs_info; + +/* Callback used by the 'ubifs_lpt_scan_nolock()' function */ +typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, + const struct ubifs_lprops *lprops, + int in_tree, void *data); + +/** + * struct ubifs_wbuf - UBIFS write-buffer. + * @c: UBIFS file-system description object + * @buf: write-buffer (of min. flash I/O unit size) + * @lnum: logical eraseblock number the write-buffer points to + * @offs: write-buffer offset in this logical eraseblock + * @avail: number of bytes available in the write-buffer + * @used: number of used bytes in the write-buffer + * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) + * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep + * up by 'mutex_lock_nested()). + * @sync_callback: write-buffer synchronization callback + * @io_mutex: serializes write-buffer I/O + * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes + * fields + * @softlimit: soft write-buffer timeout interval + * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit + * and @softlimit + @delta) + * @timer: write-buffer timer + * @no_timer: non-zero if this write-buffer does not have a timer + * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing + * @next_ino: points to the next position of the following inode number + * @inodes: stores the inode numbers of the nodes which are in wbuf + * + * The write-buffer synchronization callback is called when the write-buffer is + * synchronized in order to notify how much space was wasted due to + * write-buffer padding and how much free space is left in the LEB. + * + * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under + * spin-lock or mutex because they are written under both mutex and spin-lock. + * @buf is appended to under mutex but overwritten under both mutex and + * spin-lock. Thus the data between @buf and @buf + @used can be read under + * spinlock. + */ +struct ubifs_wbuf { + struct ubifs_info *c; + void *buf; + int lnum; + int offs; + int avail; + int used; + int size; + int jhead; + int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); + struct mutex io_mutex; + spinlock_t lock; + ktime_t softlimit; + unsigned long long delta; + struct hrtimer timer; + unsigned int no_timer:1; + unsigned int need_sync:1; + int next_ino; + ino_t *inodes; +}; + +/** + * struct ubifs_bud - bud logical eraseblock. + * @lnum: logical eraseblock number + * @start: where the (uncommitted) bud data starts + * @jhead: journal head number this bud belongs to + * @list: link in the list buds belonging to the same journal head + * @rb: link in the tree of all buds + */ +struct ubifs_bud { + int lnum; + int start; + int jhead; + struct list_head list; + struct rb_node rb; +}; + +/** + * struct ubifs_jhead - journal head. + * @wbuf: head's write-buffer + * @buds_list: list of bud LEBs belonging to this journal head + * @grouped: non-zero if UBIFS groups nodes when writing to this journal head + * + * Note, the @buds list is protected by the @c->buds_lock. + */ +struct ubifs_jhead { + struct ubifs_wbuf wbuf; + struct list_head buds_list; + unsigned int grouped:1; +}; + +/** + * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. + * @key: key + * @znode: znode address in memory + * @lnum: LEB number of the target node (indexing node or data node) + * @offs: target node offset within @lnum + * @len: target node length + */ +struct ubifs_zbranch { + union ubifs_key key; + union { + struct ubifs_znode *znode; + void *leaf; + }; + int lnum; + int offs; + int len; +}; + +/** + * struct ubifs_znode - in-memory representation of an indexing node. + * @parent: parent znode or NULL if it is the root + * @cnext: next znode to commit + * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE) + * @time: last access time (seconds) + * @level: level of the entry in the TNC tree + * @child_cnt: count of child znodes + * @iip: index in parent's zbranch array + * @alt: lower bound of key range has altered i.e. child inserted at slot 0 + * @lnum: LEB number of the corresponding indexing node + * @offs: offset of the corresponding indexing node + * @len: length of the corresponding indexing node + * @zbranch: array of znode branches (@c->fanout elements) + * + * Note! The @lnum, @offs, and @len fields are not really needed - we have them + * only for internal consistency check. They could be removed to save some RAM. + */ +struct ubifs_znode { + struct ubifs_znode *parent; + struct ubifs_znode *cnext; + unsigned long flags; + unsigned long time; + int level; + int child_cnt; + int iip; + int alt; + int lnum; + int offs; + int len; + struct ubifs_zbranch zbranch[]; +}; + +/** + * struct bu_info - bulk-read information. + * @key: first data node key + * @zbranch: zbranches of data nodes to bulk read + * @buf: buffer to read into + * @buf_len: buffer length + * @gc_seq: GC sequence number to detect races with GC + * @cnt: number of data nodes for bulk read + * @blk_cnt: number of data blocks including holes + * @oef: end of file reached + */ +struct bu_info { + union ubifs_key key; + struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ]; + void *buf; + int buf_len; + int gc_seq; + int cnt; + int blk_cnt; + int eof; +}; + +/** + * struct ubifs_node_range - node length range description data structure. + * @len: fixed node length + * @min_len: minimum possible node length + * @max_len: maximum possible node length + * + * If @max_len is %0, the node has fixed length @len. + */ +struct ubifs_node_range { + union { + int len; + int min_len; + }; + int max_len; +}; + +/** + * struct ubifs_compressor - UBIFS compressor description structure. + * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) + * @cc: cryptoapi compressor handle + * @comp_mutex: mutex used during compression + * @decomp_mutex: mutex used during decompression + * @name: compressor name + * @capi_name: cryptoapi compressor name + */ +struct ubifs_compressor { + int compr_type; + struct crypto_comp *cc; + struct mutex *comp_mutex; + struct mutex *decomp_mutex; + const char *name; + const char *capi_name; +}; + +/** + * struct ubifs_budget_req - budget requirements of an operation. + * + * @fast: non-zero if the budgeting should try to acquire budget quickly and + * should not try to call write-back + * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields + * have to be re-calculated + * @new_page: non-zero if the operation adds a new page + * @dirtied_page: non-zero if the operation makes a page dirty + * @new_dent: non-zero if the operation adds a new directory entry + * @mod_dent: non-zero if the operation removes or modifies an existing + * directory entry + * @new_ino: non-zero if the operation adds a new inode + * @new_ino_d: now much data newly created inode contains + * @dirtied_ino: how many inodes the operation makes dirty + * @dirtied_ino_d: now much data dirtied inode contains + * @idx_growth: how much the index will supposedly grow + * @data_growth: how much new data the operation will supposedly add + * @dd_growth: how much data that makes other data dirty the operation will + * supposedly add + * + * @idx_growth, @data_growth and @dd_growth are not used in budget request. The + * budgeting subsystem caches index and data growth values there to avoid + * re-calculating them when the budget is released. However, if @idx_growth is + * %-1, it is calculated by the release function using other fields. + * + * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d + * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made + * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned. + */ +struct ubifs_budget_req { + unsigned int fast:1; + unsigned int recalculate:1; +#ifndef UBIFS_DEBUG + unsigned int new_page:1; + unsigned int dirtied_page:1; + unsigned int new_dent:1; + unsigned int mod_dent:1; + unsigned int new_ino:1; + unsigned int new_ino_d:13; + unsigned int dirtied_ino:4; + unsigned int dirtied_ino_d:15; +#else + /* Not bit-fields to check for overflows */ + unsigned int new_page; + unsigned int dirtied_page; + unsigned int new_dent; + unsigned int mod_dent; + unsigned int new_ino; + unsigned int new_ino_d; + unsigned int dirtied_ino; + unsigned int dirtied_ino_d; +#endif + int idx_growth; + int data_growth; + int dd_growth; +}; + +/** + * struct ubifs_orphan - stores the inode number of an orphan. + * @rb: rb-tree node of rb-tree of orphans sorted by inode number + * @list: list head of list of orphans in order added + * @new_list: list head of list of orphans added since the last commit + * @cnext: next orphan to commit + * @dnext: next orphan to delete + * @inum: inode number + * @new: %1 => added since the last commit, otherwise %0 + * @cmt: %1 => commit pending, otherwise %0 + * @del: %1 => delete pending, otherwise %0 + */ +struct ubifs_orphan { + struct rb_node rb; + struct list_head list; + struct list_head new_list; + struct ubifs_orphan *cnext; + struct ubifs_orphan *dnext; + ino_t inum; + unsigned new:1; + unsigned cmt:1; + unsigned del:1; +}; + +/** + * struct ubifs_mount_opts - UBIFS-specific mount options information. + * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable) + * @chk_data_crc: enable/disable CRC data checking when reading data nodes + * (%0 default, %1 disable, %2 enable) + * @override_compr: override default compressor (%0 - do not override and use + * superblock compressor, %1 - override and use compressor + * specified in @compr_type) + * @compr_type: compressor type to override the superblock compressor with + * (%UBIFS_COMPR_NONE, etc) + */ +struct ubifs_mount_opts { + unsigned int unmount_mode:2; + unsigned int bulk_read:2; + unsigned int chk_data_crc:2; + unsigned int override_compr:1; + unsigned int compr_type:2; +}; + +/** + * struct ubifs_budg_info - UBIFS budgeting information. + * @idx_growth: amount of bytes budgeted for index growth + * @data_growth: amount of bytes budgeted for cached data + * @dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but + * which still have to be taken into account because the index + * has not been committed so far + * @old_idx_sz: size of index on flash + * @min_idx_lebs: minimum number of LEBs required for the index + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * @page_budget: budget for a page (constant, never changed after mount) + * @inode_budget: budget for an inode (constant, never changed after mount) + * @dent_budget: budget for a directory entry (constant, never changed after + * mount) + */ +struct ubifs_budg_info { + long long idx_growth; + long long data_growth; + long long dd_growth; + long long uncommitted_idx; + unsigned long long old_idx_sz; + int min_idx_lebs; + unsigned int nospace:1; + unsigned int nospace_rp:1; + int page_budget; + int inode_budget; + int dent_budget; +}; + +struct ubifs_debug_info; + +/** + * struct ubifs_info - UBIFS file-system description data structure + * (per-superblock). + * @vfs_sb: VFS @struct super_block object + * @bdi: backing device info object to make VFS happy and disable read-ahead + * + * @highest_inum: highest used inode number + * @max_sqnum: current global sequence number + * @cmt_no: commit number of the last successfully completed commit, protected + * by @commit_sem + * @cnt_lock: protects @highest_inum and @max_sqnum counters + * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version + * @uuid: UUID from super block + * + * @lhead_lnum: log head logical eraseblock number + * @lhead_offs: log head offset + * @ltail_lnum: log tail logical eraseblock number (offset is always 0) + * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and + * @bud_bytes + * @min_log_bytes: minimum required number of bytes in the log + * @cmt_bud_bytes: used during commit to temporarily amount of bytes in + * committed buds + * + * @buds: tree of all buds indexed by bud LEB number + * @bud_bytes: how many bytes of flash is used by buds + * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud + * lists + * @jhead_cnt: count of journal heads + * @jheads: journal heads (head zero is base head) + * @max_bud_bytes: maximum number of bytes allowed in buds + * @bg_bud_bytes: number of bud bytes when background commit is initiated + * @old_buds: buds to be released after commit ends + * @max_bud_cnt: maximum number of buds + * + * @commit_sem: synchronizes committer with other processes + * @cmt_state: commit state + * @cs_lock: commit state lock + * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * + * @big_lpt: flag that LPT is too big to write whole during commit + * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up + * @no_chk_data_crc: do not check CRCs when reading data nodes (except during + * recovery) + * @bulk_read: enable bulk-reads + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible + * + * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and + * @calc_idx_sz + * @zroot: zbranch which points to the root index node and znode + * @cnext: next znode to commit + * @enext: next znode to commit to empty space + * @gap_lebs: array of LEBs used by the in-gaps commit method + * @cbuf: commit buffer + * @ileb_buf: buffer for commit in-the-gaps method + * @ileb_len: length of data in ileb_buf + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @ilebs: pre-allocated index LEBs + * @ileb_cnt: number of pre-allocated index LEBs + * @ileb_nxt: next pre-allocated index LEBs + * @old_idx: tree of index nodes obsoleted since the last commit start + * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c + * + * @mst_node: master node + * @mst_offs: offset of valid master node + * + * @max_bu_buf_len: maximum bulk-read buffer length + * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu + * @bu: pre-allocated bulk-read information + * + * @write_reserve_mutex: protects @write_reserve_buf + * @write_reserve_buf: on the write path we allocate memory, which might + * sometimes be unavailable, in which case we use this + * write reserve buffer + * + * @log_lebs: number of logical eraseblocks in the log + * @log_bytes: log size in bytes + * @log_last: last LEB of the log + * @lpt_lebs: number of LEBs used for lprops table + * @lpt_first: first LEB of the lprops table area + * @lpt_last: last LEB of the lprops table area + * @orph_lebs: number of LEBs used for the orphan area + * @orph_first: first LEB of the orphan area + * @orph_last: last LEB of the orphan area + * @main_lebs: count of LEBs in the main area + * @main_first: first LEB of the main area + * @main_bytes: main area size in bytes + * + * @key_hash_type: type of the key hash + * @key_hash: direntry key hash function + * @key_fmt: key format + * @key_len: key length + * @fanout: fanout of the index tree (number of links per indexing node) + * + * @min_io_size: minimal input/output unit size + * @min_io_shift: number of bits in @min_io_size minus one + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) + * @max_write_shift: number of bits in @max_write_size minus one + * @leb_size: logical eraseblock size in bytes + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks + * @half_leb_size: half LEB size + * @idx_leb_size: how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) + * @leb_cnt: count of logical eraseblocks + * @max_leb_cnt: maximum count of logical eraseblocks + * @old_leb_cnt: count of logical eraseblocks before re-size + * @ro_media: the underlying UBI volume is read-only + * @ro_mount: the file-system was mounted as read-only + * @ro_error: UBIFS switched to R/O mode because an error happened + * + * @dirty_pg_cnt: number of dirty pages (not used) + * @dirty_zn_cnt: number of dirty znodes + * @clean_zn_cnt: number of clean znodes + * + * @space_lock: protects @bi and @lst + * @lst: lprops statistics + * @bi: budgeting information + * @calc_idx_sz: temporary variable which is used to calculate new index size + * (contains accurate new index size at end of TNC commit start) + * + * @ref_node_alsz: size of the LEB reference node aligned to the min. flash + * I/O unit + * @mst_node_alsz: master node aligned size + * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary + * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary + * @max_inode_sz: maximum possible inode size in bytes + * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting + * @dead_wm: LEB dead space watermark + * @dark_wm: LEB dark space watermark + * @block_cnt: count of 4KiB blocks on the FS + * + * @ranges: UBIFS node length ranges + * @ubi: UBI volume descriptor + * @di: UBI device information + * @vi: UBI volume information + * + * @orph_tree: rb-tree of orphan inode numbers + * @orph_list: list of orphan inode numbers in order added + * @orph_new: list of orphan inode numbers added since last commit + * @orph_cnext: next orphan to commit + * @orph_dnext: next orphan to delete + * @orphan_lock: lock for orph_tree and orph_new + * @orph_buf: buffer for orphan nodes + * @new_orphans: number of orphans since last commit + * @cmt_orphans: number of orphans being committed + * @tot_orphans: number of orphans in the rb_tree + * @max_orphans: maximum number of orphans allowed + * @ohead_lnum: orphan head LEB number + * @ohead_offs: orphan head offset + * @no_orphs: non-zero if there are no orphans + * + * @bgt: UBIFS background thread + * @bgt_name: background thread name + * @need_bgt: if background thread should run + * @need_wbuf_sync: if write-buffers have to be synchronized + * + * @gc_lnum: LEB number used for garbage collection + * @sbuf: a buffer of LEB size used by GC and replay for scanning + * @idx_gc: list of index LEBs that have been garbage collected + * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected + * + * @infos_list: links all 'ubifs_info' objects + * @umount_mutex: serializes shrinker and un-mount + * @shrinker_run_no: shrinker run number + * + * @space_bits: number of bits needed to record free or dirty space + * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT + * @lpt_offs_bits: number of bits needed to record an offset in the LPT + * @lpt_spc_bits: number of bits needed to space in the LPT + * @pcnt_bits: number of bits needed to record pnode or nnode number + * @lnum_bits: number of bits needed to record LEB number + * @nnode_sz: size of on-flash nnode + * @pnode_sz: size of on-flash pnode + * @ltab_sz: size of on-flash LPT lprops table + * @lsave_sz: size of on-flash LPT save table + * @pnode_cnt: number of pnodes + * @nnode_cnt: number of nnodes + * @lpt_hght: height of the LPT + * @pnodes_have: number of pnodes in memory + * + * @lp_mutex: protects lprops table and all the other lprops-related fields + * @lpt_lnum: LEB number of the root nnode of the LPT + * @lpt_offs: offset of the root nnode of the LPT + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab + * @dirty_nn_cnt: number of dirty nnodes + * @dirty_pn_cnt: number of dirty pnodes + * @check_lpt_free: flag that indicates LPT GC may be needed + * @lpt_sz: LPT size + * @lpt_nod_buf: buffer for an on-flash nnode or pnode + * @lpt_buf: buffer of LEB size used by LPT + * @nroot: address in memory of the root nnode of the LPT + * @lpt_cnext: next LPT node to commit + * @lpt_heap: array of heaps of categorized lprops + * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at + * previous commit start + * @uncat_list: list of un-categorized LEBs + * @empty_list: list of empty LEBs + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) + * @freeable_cnt: number of freeable LEBs in @freeable_list + * @in_a_category_cnt: count of lprops which are in a certain category, which + * basically meants that they were loaded from the flash + * + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @ltab: LPT's own lprops table + * @ltab_cmt: LPT's own lprops table (commit copy) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @lsave_lnum: LEB number of LPT's save table + * @lsave_offs: offset of LPT's save table + * @lsave: LPT's save table + * @lscan_lnum: LEB number of last LPT scan + * + * @rp_size: size of the reserved pool in bytes + * @report_rp_size: size of the reserved pool reported to user-space + * @rp_uid: reserved pool user ID + * @rp_gid: reserved pool group ID + * + * @empty: %1 if the UBI device is empty + * @need_recovery: %1 if the file-system needs recovery + * @replaying: %1 during journal replay + * @mounting: %1 while mounting + * @probing: %1 while attempting to mount if MS_SILENT mount flag is set + * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode + * @replay_list: temporary list used during journal replay + * @replay_buds: list of buds to replay + * @cs_sqnum: sequence number of first node in the log (commit start node) + * @replay_sqnum: sequence number of node currently being replayed + * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W + * mode + * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted + * FS to R/W mode + * @size_tree: inode size information for recovery + * @mount_opts: UBIFS-specific mount options + * + * @dbg: debugging-related information + */ +struct ubifs_info { + struct super_block *vfs_sb; + struct backing_dev_info bdi; + + ino_t highest_inum; + unsigned long long max_sqnum; + unsigned long long cmt_no; + spinlock_t cnt_lock; + int fmt_version; + int ro_compat_version; + unsigned char uuid[16]; + + int lhead_lnum; + int lhead_offs; + int ltail_lnum; + struct mutex log_mutex; + int min_log_bytes; + long long cmt_bud_bytes; + + struct rb_root buds; + long long bud_bytes; + spinlock_t buds_lock; + int jhead_cnt; + struct ubifs_jhead *jheads; + long long max_bud_bytes; + long long bg_bud_bytes; + struct list_head old_buds; + int max_bud_cnt; + + struct rw_semaphore commit_sem; + int cmt_state; + spinlock_t cs_lock; + wait_queue_head_t cmt_wq; + + unsigned int big_lpt:1; + unsigned int space_fixup:1; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; + unsigned int default_compr:2; + unsigned int rw_incompat:1; + + struct mutex tnc_mutex; + struct ubifs_zbranch zroot; + struct ubifs_znode *cnext; + struct ubifs_znode *enext; + int *gap_lebs; + void *cbuf; + void *ileb_buf; + int ileb_len; + int ihead_lnum; + int ihead_offs; + int *ilebs; + int ileb_cnt; + int ileb_nxt; + struct rb_root old_idx; + int *bottom_up_buf; + + struct ubifs_mst_node *mst_node; + int mst_offs; + + int max_bu_buf_len; + struct mutex bu_mutex; + struct bu_info bu; + + struct mutex write_reserve_mutex; + void *write_reserve_buf; + + int log_lebs; + long long log_bytes; + int log_last; + int lpt_lebs; + int lpt_first; + int lpt_last; + int orph_lebs; + int orph_first; + int orph_last; + int main_lebs; + int main_first; + long long main_bytes; + + uint8_t key_hash_type; + uint32_t (*key_hash)(const char *str, int len); + int key_fmt; + int key_len; + int fanout; + + int min_io_size; + int min_io_shift; + int max_write_size; + int max_write_shift; + int leb_size; + int leb_start; + int half_leb_size; + int idx_leb_size; + int leb_cnt; + int max_leb_cnt; + int old_leb_cnt; + unsigned int ro_media:1; + unsigned int ro_mount:1; + unsigned int ro_error:1; + + atomic_long_t dirty_pg_cnt; + atomic_long_t dirty_zn_cnt; + atomic_long_t clean_zn_cnt; + + spinlock_t space_lock; + struct ubifs_lp_stats lst; + struct ubifs_budg_info bi; + unsigned long long calc_idx_sz; + + int ref_node_alsz; + int mst_node_alsz; + int min_idx_node_sz; + int max_idx_node_sz; + long long max_inode_sz; + int max_znode_sz; + + int leb_overhead; + int dead_wm; + int dark_wm; + int block_cnt; + + struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT]; + struct ubi_volume_desc *ubi; + struct ubi_device_info di; + struct ubi_volume_info vi; + + struct rb_root orph_tree; + struct list_head orph_list; + struct list_head orph_new; + struct ubifs_orphan *orph_cnext; + struct ubifs_orphan *orph_dnext; + spinlock_t orphan_lock; + void *orph_buf; + int new_orphans; + int cmt_orphans; + int tot_orphans; + int max_orphans; + int ohead_lnum; + int ohead_offs; + int no_orphs; + + struct task_struct *bgt; + char bgt_name[sizeof(BGT_NAME_PATTERN) + 9]; + int need_bgt; + int need_wbuf_sync; + + int gc_lnum; + void *sbuf; + struct list_head idx_gc; + int idx_gc_cnt; + int gc_seq; + int gced_lnum; + + struct list_head infos_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + int space_bits; + int lpt_lnum_bits; + int lpt_offs_bits; + int lpt_spc_bits; + int pcnt_bits; + int lnum_bits; + int nnode_sz; + int pnode_sz; + int ltab_sz; + int lsave_sz; + int pnode_cnt; + int nnode_cnt; + int lpt_hght; + int pnodes_have; + + struct mutex lp_mutex; + int lpt_lnum; + int lpt_offs; + int nhead_lnum; + int nhead_offs; + int lpt_drty_flgs; + int dirty_nn_cnt; + int dirty_pn_cnt; + int check_lpt_free; + long long lpt_sz; + void *lpt_nod_buf; + void *lpt_buf; + struct ubifs_nnode *nroot; + struct ubifs_cnode *lpt_cnext; + struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT]; + struct ubifs_lpt_heap dirty_idx; + struct list_head uncat_list; + struct list_head empty_list; + struct list_head freeable_list; + struct list_head frdi_idx_list; + int freeable_cnt; + int in_a_category_cnt; + + int ltab_lnum; + int ltab_offs; + struct ubifs_lpt_lprops *ltab; + struct ubifs_lpt_lprops *ltab_cmt; + int lsave_cnt; + int lsave_lnum; + int lsave_offs; + int *lsave; + int lscan_lnum; + + long long rp_size; + long long report_rp_size; + kuid_t rp_uid; + kgid_t rp_gid; + + /* The below fields are used only during mounting and re-mounting */ + unsigned int empty:1; + unsigned int need_recovery:1; + unsigned int replaying:1; + unsigned int mounting:1; + unsigned int remounting_rw:1; + unsigned int probing:1; + struct list_head replay_list; + struct list_head replay_buds; + unsigned long long cs_sqnum; + unsigned long long replay_sqnum; + struct list_head unclean_leb_list; + struct ubifs_mst_node *rcvrd_mst_node; + struct rb_root size_tree; + struct ubifs_mount_opts mount_opts; + + struct ubifs_debug_info *dbg; +}; + +extern struct list_head ubifs_infos; +extern spinlock_t ubifs_infos_lock; +extern atomic_long_t ubifs_clean_zn_cnt; +extern struct kmem_cache *ubifs_inode_slab; +extern const struct super_operations ubifs_super_operations; +extern const struct xattr_handler *ubifs_xattr_handlers[]; +extern const struct address_space_operations ubifs_file_address_operations; +extern const struct file_operations ubifs_file_operations; +extern const struct inode_operations ubifs_file_inode_operations; +extern const struct file_operations ubifs_dir_operations; +extern const struct inode_operations ubifs_dir_inode_operations; +extern const struct inode_operations ubifs_symlink_inode_operations; +extern struct backing_dev_info ubifs_backing_dev_info; +extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/* io.c */ +void ubifs_ro_mode(struct ubifs_info *c, int err); +int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, + int len, int even_ebadmsg); +int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len); +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); +int ubifs_leb_unmap(struct ubifs_info *c, int lnum); +int ubifs_leb_map(struct ubifs_info *c, int lnum); +int ubifs_is_mapped(const struct ubifs_info *c, int lnum); +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs); +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs); +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs); +int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, + int offs); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet, int must_chk_crc); +void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); +int ubifs_io_init(struct ubifs_info *c); +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad); +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf); +int ubifs_bg_wbufs_sync(struct ubifs_info *c); +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum); +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); + +/* scan.c */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf, int quiet); +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet); +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs); +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs); +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf); + +/* log.c */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud); +void ubifs_create_buds_lists(struct ubifs_info *c); +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs); +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum); +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum); +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum); +int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum); +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum); +int ubifs_consolidate_log(struct ubifs_info *c); + +/* journal.c */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent); +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync); +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size); +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm); +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, + const struct inode *inode2); + +/* budget.c */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui); +int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +long long ubifs_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space_nolock(struct ubifs_info *c); +int ubifs_calc_min_idx_lebs(struct ubifs_info *c); +void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, long long free); +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); + +/* find.c */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, + int squeeze); +int ubifs_find_free_leb_for_idx(struct ubifs_info *c); +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free); +int ubifs_find_dirty_idx_leb(struct ubifs_info *c); +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); + +/* tnc.c */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n); +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm); +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs); +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len); +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len); +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm); +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm); +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key); +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm); +void ubifs_tnc_close(struct ubifs_info *c); +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx); +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +/* Shared by tnc.c for tnc_commit.c */ +void destroy_old_idx(struct ubifs_info *c); +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu); +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu); + +/* tnc_misc.c */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode); +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n); +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode); +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode); +long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr); +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip); +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node); + +/* tnc_commit.c */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int ubifs_tnc_end_commit(struct ubifs_info *c); + +/* shrinker.c */ +unsigned long ubifs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long ubifs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); + +/* commit.c */ +int ubifs_bg_thread(void *info); +void ubifs_commit_required(struct ubifs_info *c); +void ubifs_request_bg_commit(struct ubifs_info *c); +int ubifs_run_commit(struct ubifs_info *c); +void ubifs_recovery_commit(struct ubifs_info *c); +int ubifs_gc_should_commit(struct ubifs_info *c); +void ubifs_wait_for_commit(struct ubifs_info *c); + +/* master.c */ +int ubifs_read_master(struct ubifs_info *c); +int ubifs_write_master(struct ubifs_info *c); + +/* sb.c */ +int ubifs_read_superblock(struct ubifs_info *c); +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); +int ubifs_fixup_free_space(struct ubifs_info *c); + +/* replay.c */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent); +int ubifs_replay_journal(struct ubifs_info *c); + +/* gc.c */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway); +int ubifs_gc_start_commit(struct ubifs_info *c); +int ubifs_gc_end_commit(struct ubifs_info *c); +void ubifs_destroy_idx_gc(struct ubifs_info *c); +int ubifs_get_idx_gc_leb(struct ubifs_info *c); +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp); + +/* orphan.c */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum); +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); +int ubifs_orphan_start_commit(struct ubifs_info *c); +int ubifs_orphan_end_commit(struct ubifs_info *c); +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); +int ubifs_clear_orphans(struct ubifs_info *c); + +/* lpt.c */ +int ubifs_calc_lpt_geom(struct ubifs_info *c); +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt); +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr); +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum); +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum); +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data); + +/* Shared by lpt.c for lpt_commit.c */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave); +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab); +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode); +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); +struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); +/* Needed only in debugging code in lpt_commit.c */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); + +/* lpt_commit.c */ +int ubifs_lpt_start_commit(struct ubifs_info *c); +int ubifs_lpt_end_commit(struct ubifs_info *c); +int ubifs_lpt_post_commit(struct ubifs_info *c); +void ubifs_lpt_free(struct ubifs_info *c, int wr_only); + +/* lprops.c */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt); +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst); +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat); +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops); +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops); +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops); +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt); +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean); +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp); +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); +int ubifs_calc_dark(const struct ubifs_info *c, int spc); + +/* file.c */ +int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); +int ubifs_setattr(struct dentry *dentry, struct iattr *attr); + +/* dir.c */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + umode_t mode); +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ubifs_removexattr(struct dentry *dentry, const char *name); +int ubifs_init_security(struct inode *dentry, struct inode *inode, + const struct qstr *qstr); + +/* super.c */ +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); + +/* recovery.c */ +int ubifs_recover_master_node(struct ubifs_info *c); +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int jhead); +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf); +int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf); +int ubifs_rcvry_gc_commit(struct ubifs_info *c); +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size); +int ubifs_recover_size(struct ubifs_info *c); +void ubifs_destroy_size_tree(struct ubifs_info *c); + +/* ioctl.c */ +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void ubifs_set_inode_flags(struct inode *inode); +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#endif + +/* compressor.c */ +int __init ubifs_compressors_init(void); +void ubifs_compressors_exit(void); +void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len, + void *out_buf, int *out_len, int *compr_type); +int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, + void *out, int *out_len, int compr_type); + +#include "debug.h" +#include "misc.h" +#include "key.h" + +#endif /* !__UBIFS_H__ */ diff --git a/kernel/fs/ubifs/xattr.c b/kernel/fs/ubifs/xattr.c new file mode 100644 index 000000000..96f3448b6 --- /dev/null +++ b/kernel/fs/ubifs/xattr.c @@ -0,0 +1,666 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS extended attributes support. + * + * Extended attributes are implemented as regular inodes with attached data, + * which limits extended attribute size to UBIFS block size (4KiB). Names of + * extended attributes are described by extended attribute entries (xentries), + * which are almost identical to directory entries, but have different key type. + * + * In other words, the situation with extended attributes is very similar to + * directories. Indeed, any inode (but of course not xattr inodes) may have a + * number of associated xentries, just like directory inodes have associated + * directory entries. Extended attribute entries store the name of the extended + * attribute, the host inode number, and the extended attribute inode number. + * Similarly, direntries store the name, the parent and the target inode + * numbers. Thus, most of the common UBIFS mechanisms may be re-used for + * extended attributes. + * + * The number of extended attributes is not limited, but there is Linux + * limitation on the maximum possible size of the list of all extended + * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure + * the sum of all extended attribute names of the inode does not exceed that + * limit. + * + * Extended attributes are synchronous, which means they are written to the + * flash media synchronously and there is no write-back for extended attribute + * inodes. The extended attribute values are not stored in compressed form on + * the media. + * + * Since extended attributes are represented by regular inodes, they are cached + * in the VFS inode cache. The xentries are cached in the LNC cache (see + * tnc.c). + * + * ACL support is not implemented. + */ + +#include "ubifs.h" +#include +#include +#include +#include + +/* + * Limit the number of extended attributes per inode so that the total size + * (@xattr_size) is guaranteeded to fit in an 'unsigned int'. + */ +#define MAX_XATTRS_PER_INODE 65535 + +/* + * Extended attribute type constants. + * + * USER_XATTR: user extended attribute ("user.*") + * TRUSTED_XATTR: trusted extended attribute ("trusted.*) + * SECURITY_XATTR: security extended attribute ("security.*") + */ +enum { + USER_XATTR, + TRUSTED_XATTR, + SECURITY_XATTR, +}; + +static const struct inode_operations empty_iops; +static const struct file_operations empty_fops; + +/** + * create_xattr - create an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @nm: extended attribute name + * @value: extended attribute value + * @size: size of extended attribute value + * + * This is a helper function which creates an extended attribute of name @nm + * and value @value for inode @host. The host inode is also updated on flash + * because the ctime and extended attribute accounting data changes. This + * function returns zero in case of success and a negative error code in case + * of failure. + */ +static int create_xattr(struct ubifs_info *c, struct inode *host, + const struct qstr *nm, const void *value, int size) +{ + int err, names_len; + struct inode *inode; + struct ubifs_inode *ui, *host_ui = ubifs_inode(host); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; + + if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) { + ubifs_err(c, "inode %lu already has too many xattrs (%d), cannot create more", + host->i_ino, host_ui->xattr_cnt); + return -ENOSPC; + } + /* + * Linux limits the maximum size of the extended attribute names list + * to %XATTR_LIST_MAX. This means we should not allow creating more + * extended attributes if the name list becomes larger. This limitation + * is artificial for UBIFS, though. + */ + names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1; + if (names_len > XATTR_LIST_MAX) { + ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", + host->i_ino, names_len, XATTR_LIST_MAX); + return -ENOSPC; + } + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + /* Re-define all operations to be "nothing" */ + inode->i_mapping->a_ops = &empty_aops; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + + inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; + ui = ubifs_inode(inode); + ui->xattr = 1; + ui->flags |= UBIFS_XATTR_FL; + ui->data = kmemdup(value, size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_free; + } + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + host_ui->xattr_names += nm->len; + + err = ubifs_jnl_update(c, host, nm, inode, 0, 1); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + iput(inode); + return 0; + +out_cancel: + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + mutex_unlock(&host_ui->ui_mutex); +out_free: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @value: extended attribute value + * @size: size of extended attribute value + * + * This helper function changes the value of extended attribute @inode with new + * data from @value. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int change_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const void *value, int size) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, + .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; + + ubifs_assert(ui->data_len == inode->i_size); + err = ubifs_budget_space(c, &req); + if (err) + return err; + + kfree(ui->data); + ui->data = kmemdup(value, size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_free; + } + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + + /* + * It is important to write the host inode after the xattr inode + * because if the host inode gets synchronized (via 'fsync()'), then + * the extended attribute inode gets synchronized, because it goes + * before the host inode in the write-buffer. + */ + err = ubifs_jnl_change_xattr(c, inode, host); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + make_bad_inode(inode); +out_free: + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_namespace - check extended attribute name-space. + * @nm: extended attribute name + * + * This function makes sure the extended attribute name belongs to one of the + * supported extended attribute name-spaces. Returns name-space index in case + * of success and a negative error code in case of failure. + */ +static int check_namespace(const struct qstr *nm) +{ + int type; + + if (nm->len > UBIFS_MAX_NLEN) + return -ENAMETOOLONG; + + if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0') + return -EINVAL; + type = TRUSTED_XATTR; + } else if (!strncmp(nm->name, XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN)) { + if (nm->name[XATTR_USER_PREFIX_LEN] == '\0') + return -EINVAL; + type = USER_XATTR; + } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0') + return -EINVAL; + type = SECURITY_XATTR; + } else + return -EOPNOTSUPP; + + return type; +} + +static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) +{ + struct inode *inode; + + inode = ubifs_iget(c->vfs_sb, inum); + if (IS_ERR(inode)) { + ubifs_err(c, "dead extended attribute entry, error %d", + (int)PTR_ERR(inode)); + return inode; + } + if (ubifs_inode(inode)->xattr) + return inode; + ubifs_err(c, "corrupt extended attribute entry"); + iput(inode); + return ERR_PTR(-EINVAL); +} + +static int setxattr(struct inode *host, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = QSTR_INIT(name, strlen(name)); + struct ubifs_dent_node *xent; + union ubifs_key key; + int err, type; + + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + if (size > UBIFS_MAX_INO_DATA) + return -ERANGE; + + type = check_namespace(&nm); + if (type < 0) + return type; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* + * The extended attribute entries are stored in LNC, so multiple + * look-ups do not involve reading the flash. + */ + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err != -ENOENT) + goto out_free; + + if (flags & XATTR_REPLACE) + /* We are asked not to create the xattr */ + err = -ENODATA; + else + err = create_xattr(c, host, &nm, value, size); + goto out_free; + } + + if (flags & XATTR_CREATE) { + /* We are asked not to replace the xattr */ + err = -EEXIST; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + err = change_xattr(c, host, inode, value, size); + iput(inode); + +out_free: + kfree(xent); + return err; +} + +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", + name, d_inode(dentry)->i_ino, dentry, size); + + return setxattr(d_inode(dentry), name, value, size, flags); +} + +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct inode *inode, *host = d_inode(dentry); + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = QSTR_INIT(name, strlen(name)); + struct ubifs_inode *ui; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, + host->i_ino, dentry, size); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_unlock; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + ui = ubifs_inode(inode); + ubifs_assert(inode->i_size == ui->data_len); + ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len); + + if (buf) { + /* If @buf is %NULL we are supposed to return the length */ + if (ui->data_len > size) { + ubifs_err(c, "buffer size %zd, xattr len %d", + size, ui->data_len); + err = -ERANGE; + goto out_iput; + } + + memcpy(buf, ui->data, ui->data_len); + } + err = ui->data_len; + +out_iput: + iput(inode); +out_unlock: + kfree(xent); + return err; +} + +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + union ubifs_key key; + struct inode *host = d_inode(dentry); + struct ubifs_info *c = host->i_sb->s_fs_info; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_dent_node *xent, *pxent = NULL; + int err, len, written = 0; + struct qstr nm = { .name = NULL }; + + dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino, + dentry, size); + + len = host_ui->xattr_names + host_ui->xattr_cnt; + if (!buffer) + /* + * We should return the minimum buffer size which will fit a + * null-terminated list of all the extended attribute names. + */ + return len; + + if (len > size) + return -ERANGE; + + lowest_xent_key(c, &key, host->i_ino); + while (1) { + int type; + + xent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(xent)) { + err = PTR_ERR(xent); + break; + } + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + + type = check_namespace(&nm); + if (unlikely(type < 0)) { + err = type; + break; + } + + /* Show trusted namespace only for "power" users */ + if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) { + memcpy(buffer + written, nm.name, nm.len + 1); + written += nm.len + 1; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key); + } + + kfree(pxent); + if (err != -ENOENT) { + ubifs_err(c, "cannot find next direntry, error %d", err); + return err; + } + + ubifs_assert(written <= size); + return written; +} + +static int remove_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const struct qstr *nm) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; + + ubifs_assert(ui->data_len == inode->i_size); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names -= nm->len; + + err = ubifs_jnl_delete_xattr(c, host, inode, nm); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + ubifs_release_budget(c, &req); + make_bad_inode(inode); + return err; +} + +int ubifs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode, *host = d_inode(dentry); + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = QSTR_INIT(name, strlen(name)); + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%pd')", name, + host->i_ino, dentry); + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + ubifs_assert(inode->i_nlink == 1); + clear_nlink(inode); + err = remove_xattr(c, host, inode, &nm); + if (err) + set_nlink(inode, 1); + + /* If @i_nlink is 0, 'iput()' will delete the inode */ + iput(inode); + +out_free: + kfree(xent); + return err; +} + +static size_t security_listxattr(struct dentry *d, char *list, size_t list_size, + const char *name, size_t name_len, int flags) +{ + const int prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + + return total_len; +} + +static int security_getxattr(struct dentry *d, const char *name, void *buffer, + size_t size, int flags) +{ + return ubifs_getxattr(d, name, buffer, size); +} + +static int security_setxattr(struct dentry *d, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + return ubifs_setxattr(d, name, value, size, flags); +} + +static const struct xattr_handler ubifs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = security_listxattr, + .get = security_getxattr, + .set = security_setxattr, +}; + +const struct xattr_handler *ubifs_xattr_handlers[] = { + &ubifs_xattr_security_handler, + NULL, +}; + +static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + char *name; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = setxattr(inode, name, xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; + } + + return err; +} + +int ubifs_init_security(struct inode *dentry, struct inode *inode, + const struct qstr *qstr) +{ + int err; + + mutex_lock(&inode->i_mutex); + err = security_inode_init_security(inode, dentry, qstr, + &init_xattrs, 0); + mutex_unlock(&inode->i_mutex); + + if (err) { + struct ubifs_info *c = dentry->i_sb->s_fs_info; + ubifs_err(c, "cannot initialize security for inode %lu, error %d", + inode->i_ino, err); + } + return err; +} diff --git a/kernel/fs/udf/Kconfig b/kernel/fs/udf/Kconfig new file mode 100644 index 000000000..c6e17a744 --- /dev/null +++ b/kernel/fs/udf/Kconfig @@ -0,0 +1,20 @@ +config UDF_FS + tristate "UDF file system support" + select CRC_ITU_T + help + This is a file system used on some CD-ROMs and DVDs. Since the + file system is supported by multiple operating systems and is more + compatible with standard unix file systems, it is also suitable for + removable USB disks. Say Y if you intend to mount DVD discs or CDRW's + written in packet mode, or if you want to use UDF for removable USB + disks. Please read . + + To compile this file system support as a module, choose M here: the + module will be called udf. + + If unsure, say N. + +config UDF_NLS + bool + default y + depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y) diff --git a/kernel/fs/udf/Makefile b/kernel/fs/udf/Makefile new file mode 100644 index 000000000..eb880f66c --- /dev/null +++ b/kernel/fs/udf/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the linux udf-filesystem routines. +# + +obj-$(CONFIG_UDF_FS) += udf.o + +udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ + partition.o super.o truncate.o symlink.o \ + directory.o misc.o udftime.o unicode.o diff --git a/kernel/fs/udf/balloc.c b/kernel/fs/udf/balloc.c new file mode 100644 index 000000000..6d6a96b4e --- /dev/null +++ b/kernel/fs/udf/balloc.c @@ -0,0 +1,821 @@ +/* + * balloc.c + * + * PURPOSE + * Block allocation handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2001 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 02/24/99 blf Created. + * + */ + +#include "udfdecl.h" + +#include + +#include "udf_i.h" +#include "udf_sb.h" + +#define udf_clear_bit __test_and_clear_bit_le +#define udf_set_bit __test_and_set_bit_le +#define udf_test_bit test_bit_le +#define udf_find_next_one_bit find_next_bit_le + +static int read_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, unsigned int block, + unsigned long bitmap_nr) +{ + struct buffer_head *bh = NULL; + int retval = 0; + struct kernel_lb_addr loc; + + loc.logicalBlockNum = bitmap->s_extPosition; + loc.partitionReferenceNum = UDF_SB(sb)->s_partition; + + bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block)); + if (!bh) + retval = -EIO; + + bitmap->s_block_bitmap[bitmap_nr] = bh; + return retval; +} + +static int __load_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, + unsigned int block_group) +{ + int retval = 0; + int nr_groups = bitmap->s_nr_groups; + + if (block_group >= nr_groups) { + udf_debug("block_group (%d) > nr_groups (%d)\n", + block_group, nr_groups); + } + + if (bitmap->s_block_bitmap[block_group]) + return block_group; + + retval = read_block_bitmap(sb, bitmap, block_group, block_group); + if (retval < 0) + return retval; + + return block_group; +} + +static inline int load_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, + unsigned int block_group) +{ + int slot; + + slot = __load_block_bitmap(sb, bitmap, block_group); + + if (slot < 0) + return slot; + + if (!bitmap->s_block_bitmap[slot]) + return -EIO; + + return slot; +} + +static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + + if (!sbi->s_lvid_bh) + return; + + lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; + le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); + udf_updated_lvid(sb); +} + +static void udf_bitmap_free_blocks(struct super_block *sb, + struct udf_bitmap *bitmap, + struct kernel_lb_addr *bloc, + uint32_t offset, + uint32_t count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = NULL; + struct udf_part_map *partmap; + unsigned long block; + unsigned long block_group; + unsigned long bit; + unsigned long i; + int bitmap_nr; + unsigned long overflow; + + mutex_lock(&sbi->s_alloc_mutex); + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum + count < count || + (bloc->logicalBlockNum + count) > partmap->s_partition_len) { + udf_debug("%d < %d || %d + %d > %d\n", + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, + partmap->s_partition_len); + goto error_return; + } + + block = bloc->logicalBlockNum + offset + + (sizeof(struct spaceBitmapDesc) << 3); + + do { + overflow = 0; + block_group = block >> (sb->s_blocksize_bits + 3); + bit = block % (sb->s_blocksize << 3); + + /* + * Check to see if we are freeing blocks across a group boundary. + */ + if (bit + count > (sb->s_blocksize << 3)) { + overflow = bit + count - (sb->s_blocksize << 3); + count -= overflow; + } + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + + bh = bitmap->s_block_bitmap[bitmap_nr]; + for (i = 0; i < count; i++) { + if (udf_set_bit(bit + i, bh->b_data)) { + udf_debug("bit %ld already set\n", bit + i); + udf_debug("byte=%2x\n", + ((char *)bh->b_data)[(bit + i) >> 3]); + } + } + udf_add_free_space(sb, sbi->s_partition, count); + mark_buffer_dirty(bh); + if (overflow) { + block += count; + count = overflow; + } + } while (overflow); + +error_return: + mutex_unlock(&sbi->s_alloc_mutex); +} + +static int udf_bitmap_prealloc_blocks(struct super_block *sb, + struct udf_bitmap *bitmap, + uint16_t partition, uint32_t first_block, + uint32_t block_count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int alloc_count = 0; + int bit, block, block_group, group_start; + int nr_groups, bitmap_nr; + struct buffer_head *bh; + __u32 part_len; + + mutex_lock(&sbi->s_alloc_mutex); + part_len = sbi->s_partmaps[partition].s_partition_len; + if (first_block >= part_len) + goto out; + + if (first_block + block_count > part_len) + block_count = part_len - first_block; + + do { + nr_groups = udf_compute_nr_groups(sb, partition); + block = first_block + (sizeof(struct spaceBitmapDesc) << 3); + block_group = block >> (sb->s_blocksize_bits + 3); + group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto out; + bh = bitmap->s_block_bitmap[bitmap_nr]; + + bit = block % (sb->s_blocksize << 3); + + while (bit < (sb->s_blocksize << 3) && block_count > 0) { + if (!udf_clear_bit(bit, bh->b_data)) + goto out; + block_count--; + alloc_count++; + bit++; + block++; + } + mark_buffer_dirty(bh); + } while (block_count > 0); + +out: + udf_add_free_space(sb, partition, -alloc_count); + mutex_unlock(&sbi->s_alloc_mutex); + return alloc_count; +} + +static int udf_bitmap_new_block(struct super_block *sb, + struct udf_bitmap *bitmap, uint16_t partition, + uint32_t goal, int *err) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int newbit, bit = 0, block, block_group, group_start; + int end_goal, nr_groups, bitmap_nr, i; + struct buffer_head *bh = NULL; + char *ptr; + int newblock = 0; + + *err = -ENOSPC; + mutex_lock(&sbi->s_alloc_mutex); + +repeat: + if (goal >= sbi->s_partmaps[partition].s_partition_len) + goal = 0; + + nr_groups = bitmap->s_nr_groups; + block = goal + (sizeof(struct spaceBitmapDesc) << 3); + block_group = block >> (sb->s_blocksize_bits + 3); + group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + bh = bitmap->s_block_bitmap[bitmap_nr]; + ptr = memscan((char *)bh->b_data + group_start, 0xFF, + sb->s_blocksize - group_start); + + if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) { + bit = block % (sb->s_blocksize << 3); + if (udf_test_bit(bit, bh->b_data)) + goto got_block; + + end_goal = (bit + 63) & ~63; + bit = udf_find_next_one_bit(bh->b_data, end_goal, bit); + if (bit < end_goal) + goto got_block; + + ptr = memscan((char *)bh->b_data + (bit >> 3), 0xFF, + sb->s_blocksize - ((bit + 7) >> 3)); + newbit = (ptr - ((char *)bh->b_data)) << 3; + if (newbit < sb->s_blocksize << 3) { + bit = newbit; + goto search_back; + } + + newbit = udf_find_next_one_bit(bh->b_data, + sb->s_blocksize << 3, bit); + if (newbit < sb->s_blocksize << 3) { + bit = newbit; + goto got_block; + } + } + + for (i = 0; i < (nr_groups * 2); i++) { + block_group++; + if (block_group >= nr_groups) + block_group = 0; + group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + bh = bitmap->s_block_bitmap[bitmap_nr]; + if (i < nr_groups) { + ptr = memscan((char *)bh->b_data + group_start, 0xFF, + sb->s_blocksize - group_start); + if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) { + bit = (ptr - ((char *)bh->b_data)) << 3; + break; + } + } else { + bit = udf_find_next_one_bit(bh->b_data, + sb->s_blocksize << 3, + group_start << 3); + if (bit < sb->s_blocksize << 3) + break; + } + } + if (i >= (nr_groups * 2)) { + mutex_unlock(&sbi->s_alloc_mutex); + return newblock; + } + if (bit < sb->s_blocksize << 3) + goto search_back; + else + bit = udf_find_next_one_bit(bh->b_data, sb->s_blocksize << 3, + group_start << 3); + if (bit >= sb->s_blocksize << 3) { + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } + +search_back: + i = 0; + while (i < 7 && bit > (group_start << 3) && + udf_test_bit(bit - 1, bh->b_data)) { + ++i; + --bit; + } + +got_block: + newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - + (sizeof(struct spaceBitmapDesc) << 3); + + if (!udf_clear_bit(bit, bh->b_data)) { + udf_debug("bit already cleared for block %d\n", bit); + goto repeat; + } + + mark_buffer_dirty(bh); + + udf_add_free_space(sb, partition, -1); + mutex_unlock(&sbi->s_alloc_mutex); + *err = 0; + return newblock; + +error_return: + *err = -EIO; + mutex_unlock(&sbi->s_alloc_mutex); + return 0; +} + +static void udf_table_free_blocks(struct super_block *sb, + struct inode *table, + struct kernel_lb_addr *bloc, + uint32_t offset, + uint32_t count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *partmap; + uint32_t start, end; + uint32_t elen; + struct kernel_lb_addr eloc; + struct extent_position oepos, epos; + int8_t etype; + struct udf_inode_info *iinfo; + + mutex_lock(&sbi->s_alloc_mutex); + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum + count < count || + (bloc->logicalBlockNum + count) > partmap->s_partition_len) { + udf_debug("%d < %d || %d + %d > %d\n", + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, + partmap->s_partition_len); + goto error_return; + } + + iinfo = UDF_I(table); + udf_add_free_space(sb, sbi->s_partition, count); + + start = bloc->logicalBlockNum + offset; + end = bloc->logicalBlockNum + offset + count - 1; + + epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry); + elen = 0; + epos.block = oepos.block = iinfo->i_location; + epos.bh = oepos.bh = NULL; + + while (count && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + if (((eloc.logicalBlockNum + + (elen >> sb->s_blocksize_bits)) == start)) { + if ((0x3FFFFFFF - elen) < + (count << sb->s_blocksize_bits)) { + uint32_t tmp = ((0x3FFFFFFF - elen) >> + sb->s_blocksize_bits); + count -= tmp; + start += tmp; + elen = (etype << 30) | + (0x40000000 - sb->s_blocksize); + } else { + elen = (etype << 30) | + (elen + + (count << sb->s_blocksize_bits)); + start += count; + count = 0; + } + udf_write_aext(table, &oepos, &eloc, elen, 1); + } else if (eloc.logicalBlockNum == (end + 1)) { + if ((0x3FFFFFFF - elen) < + (count << sb->s_blocksize_bits)) { + uint32_t tmp = ((0x3FFFFFFF - elen) >> + sb->s_blocksize_bits); + count -= tmp; + end -= tmp; + eloc.logicalBlockNum -= tmp; + elen = (etype << 30) | + (0x40000000 - sb->s_blocksize); + } else { + eloc.logicalBlockNum = start; + elen = (etype << 30) | + (elen + + (count << sb->s_blocksize_bits)); + end -= count; + count = 0; + } + udf_write_aext(table, &oepos, &eloc, elen, 1); + } + + if (epos.bh != oepos.bh) { + oepos.block = epos.block; + brelse(oepos.bh); + get_bh(epos.bh); + oepos.bh = epos.bh; + oepos.offset = 0; + } else { + oepos.offset = epos.offset; + } + } + + if (count) { + /* + * NOTE: we CANNOT use udf_add_aext here, as it can try to + * allocate a new block, and since we hold the super block + * lock already very bad things would happen :) + * + * We copy the behavior of udf_add_aext, but instead of + * trying to allocate a new block close to the existing one, + * we just steal a block from the extent we are trying to add. + * + * It would be nice if the blocks were close together, but it + * isn't required. + */ + + int adsize; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; + struct allocExtDesc *aed; + + eloc.logicalBlockNum = start; + elen = EXT_RECORDED_ALLOCATED | + (count << sb->s_blocksize_bits); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else { + brelse(oepos.bh); + brelse(epos.bh); + goto error_return; + } + + if (epos.offset + (2 * adsize) > sb->s_blocksize) { + unsigned char *sptr, *dptr; + int loffset; + + brelse(oepos.bh); + oepos = epos; + + /* Steal a block from the extent being free'd */ + epos.block.logicalBlockNum = eloc.logicalBlockNum; + eloc.logicalBlockNum++; + elen -= sb->s_blocksize; + + epos.bh = udf_tread(sb, + udf_get_lb_pblock(sb, &epos.block, 0)); + if (!epos.bh) { + brelse(oepos.bh); + goto error_return; + } + aed = (struct allocExtDesc *)(epos.bh->b_data); + aed->previousAllocExtLocation = + cpu_to_le32(oepos.block.logicalBlockNum); + if (epos.offset + adsize > sb->s_blocksize) { + loffset = epos.offset; + aed->lengthAllocDescs = cpu_to_le32(adsize); + sptr = iinfo->i_ext.i_data + epos.offset + - adsize; + dptr = epos.bh->b_data + + sizeof(struct allocExtDesc); + memcpy(dptr, sptr, adsize); + epos.offset = sizeof(struct allocExtDesc) + + adsize; + } else { + loffset = epos.offset + adsize; + aed->lengthAllocDescs = cpu_to_le32(0); + if (oepos.bh) { + sptr = oepos.bh->b_data + epos.offset; + aed = (struct allocExtDesc *) + oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, + adsize); + } else { + sptr = iinfo->i_ext.i_data + + epos.offset; + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(table); + } + epos.offset = sizeof(struct allocExtDesc); + } + if (sbi->s_udfrev >= 0x0200) + udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, + 3, 1, epos.block.logicalBlockNum, + sizeof(struct tag)); + else + udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, + 2, 1, epos.block.logicalBlockNum, + sizeof(struct tag)); + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)sptr; + sad->extLength = cpu_to_le32( + EXT_NEXT_EXTENT_ALLOCDECS | + sb->s_blocksize); + sad->extPosition = + cpu_to_le32(epos.block.logicalBlockNum); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)sptr; + lad->extLength = cpu_to_le32( + EXT_NEXT_EXTENT_ALLOCDECS | + sb->s_blocksize); + lad->extLocation = + cpu_to_lelb(epos.block); + break; + } + if (oepos.bh) { + udf_update_tag(oepos.bh->b_data, loffset); + mark_buffer_dirty(oepos.bh); + } else { + mark_inode_dirty(table); + } + } + + /* It's possible that stealing the block emptied the extent */ + if (elen) { + udf_write_aext(table, &epos, &eloc, elen, 1); + + if (!epos.bh) { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(table); + } else { + aed = (struct allocExtDesc *)epos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + udf_update_tag(epos.bh->b_data, epos.offset); + mark_buffer_dirty(epos.bh); + } + } + } + + brelse(epos.bh); + brelse(oepos.bh); + +error_return: + mutex_unlock(&sbi->s_alloc_mutex); + return; +} + +static int udf_table_prealloc_blocks(struct super_block *sb, + struct inode *table, uint16_t partition, + uint32_t first_block, uint32_t block_count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int alloc_count = 0; + uint32_t elen, adsize; + struct kernel_lb_addr eloc; + struct extent_position epos; + int8_t etype = -1; + struct udf_inode_info *iinfo; + + if (first_block >= sbi->s_partmaps[partition].s_partition_len) + return 0; + + iinfo = UDF_I(table); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return 0; + + mutex_lock(&sbi->s_alloc_mutex); + epos.offset = sizeof(struct unallocSpaceEntry); + epos.block = iinfo->i_location; + epos.bh = NULL; + eloc.logicalBlockNum = 0xFFFFFFFF; + + while (first_block != eloc.logicalBlockNum && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + udf_debug("eloc=%d, elen=%d, first_block=%d\n", + eloc.logicalBlockNum, elen, first_block); + ; /* empty loop body */ + } + + if (first_block == eloc.logicalBlockNum) { + epos.offset -= adsize; + + alloc_count = (elen >> sb->s_blocksize_bits); + if (alloc_count > block_count) { + alloc_count = block_count; + eloc.logicalBlockNum += alloc_count; + elen -= (alloc_count << sb->s_blocksize_bits); + udf_write_aext(table, &epos, &eloc, + (etype << 30) | elen, 1); + } else + udf_delete_aext(table, epos, eloc, + (etype << 30) | elen); + } else { + alloc_count = 0; + } + + brelse(epos.bh); + + if (alloc_count) + udf_add_free_space(sb, partition, -alloc_count); + mutex_unlock(&sbi->s_alloc_mutex); + return alloc_count; +} + +static int udf_table_new_block(struct super_block *sb, + struct inode *table, uint16_t partition, + uint32_t goal, int *err) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; + uint32_t newblock = 0, adsize; + uint32_t elen, goal_elen = 0; + struct kernel_lb_addr eloc, uninitialized_var(goal_eloc); + struct extent_position epos, goal_epos; + int8_t etype; + struct udf_inode_info *iinfo = UDF_I(table); + + *err = -ENOSPC; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return newblock; + + mutex_lock(&sbi->s_alloc_mutex); + if (goal >= sbi->s_partmaps[partition].s_partition_len) + goal = 0; + + /* We search for the closest matching block to goal. If we find + a exact hit, we stop. Otherwise we keep going till we run out + of extents. We store the buffer_head, bloc, and extoffset + of the current closest match and use that when we are done. + */ + epos.offset = sizeof(struct unallocSpaceEntry); + epos.block = iinfo->i_location; + epos.bh = goal_epos.bh = NULL; + + while (spread && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + if (goal >= eloc.logicalBlockNum) { + if (goal < eloc.logicalBlockNum + + (elen >> sb->s_blocksize_bits)) + nspread = 0; + else + nspread = goal - eloc.logicalBlockNum - + (elen >> sb->s_blocksize_bits); + } else { + nspread = eloc.logicalBlockNum - goal; + } + + if (nspread < spread) { + spread = nspread; + if (goal_epos.bh != epos.bh) { + brelse(goal_epos.bh); + goal_epos.bh = epos.bh; + get_bh(goal_epos.bh); + } + goal_epos.block = epos.block; + goal_epos.offset = epos.offset - adsize; + goal_eloc = eloc; + goal_elen = (etype << 30) | elen; + } + } + + brelse(epos.bh); + + if (spread == 0xFFFFFFFF) { + brelse(goal_epos.bh); + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } + + /* Only allocate blocks from the beginning of the extent. + That way, we only delete (empty) extents, never have to insert an + extent because of splitting */ + /* This works, but very poorly.... */ + + newblock = goal_eloc.logicalBlockNum; + goal_eloc.logicalBlockNum++; + goal_elen -= sb->s_blocksize; + + if (goal_elen) + udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); + else + udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); + brelse(goal_epos.bh); + + udf_add_free_space(sb, partition, -1); + + mutex_unlock(&sbi->s_alloc_mutex); + *err = 0; + return newblock; +} + +void udf_free_blocks(struct super_block *sb, struct inode *inode, + struct kernel_lb_addr *bloc, uint32_t offset, + uint32_t count) +{ + uint16_t partition = bloc->partitionReferenceNum; + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { + udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { + udf_table_free_blocks(sb, map->s_uspace.s_table, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { + udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { + udf_table_free_blocks(sb, map->s_fspace.s_table, + bloc, offset, count); + } + + if (inode) { + inode_sub_bytes(inode, + ((sector_t)count) << sb->s_blocksize_bits); + } +} + +inline int udf_prealloc_blocks(struct super_block *sb, + struct inode *inode, + uint16_t partition, uint32_t first_block, + uint32_t block_count) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + int allocated; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + allocated = udf_bitmap_prealloc_blocks(sb, + map->s_uspace.s_bitmap, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + allocated = udf_table_prealloc_blocks(sb, + map->s_uspace.s_table, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + allocated = udf_bitmap_prealloc_blocks(sb, + map->s_fspace.s_bitmap, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + allocated = udf_table_prealloc_blocks(sb, + map->s_fspace.s_table, + partition, first_block, + block_count); + else + return 0; + + if (inode && allocated > 0) + inode_add_bytes(inode, allocated << sb->s_blocksize_bits); + return allocated; +} + +inline int udf_new_block(struct super_block *sb, + struct inode *inode, + uint16_t partition, uint32_t goal, int *err) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + int block; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + block = udf_bitmap_new_block(sb, + map->s_uspace.s_bitmap, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + block = udf_table_new_block(sb, + map->s_uspace.s_table, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + block = udf_bitmap_new_block(sb, + map->s_fspace.s_bitmap, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + block = udf_table_new_block(sb, + map->s_fspace.s_table, + partition, goal, err); + else { + *err = -EIO; + return 0; + } + if (inode && block) + inode_add_bytes(inode, sb->s_blocksize); + return block; +} diff --git a/kernel/fs/udf/dir.c b/kernel/fs/udf/dir.c new file mode 100644 index 000000000..541a12b57 --- /dev/null +++ b/kernel/fs/udf/dir.c @@ -0,0 +1,199 @@ +/* + * dir.c + * + * PURPOSE + * Directory handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2004 Ben Fennema + * + * HISTORY + * + * 10/05/98 dgb Split directory operations into its own file + * Implemented directory reads via do_udf_readdir + * 10/06/98 Made directory operations work! + * 11/17/98 Rewrote directory to support ICBTAG_FLAG_AD_LONG + * 11/25/98 blf Rewrote directory handling (readdir+lookup) to support reading + * across blocks. + * 12/12/98 Split out the lookup code to namei.c. bulk of directory + * code now in directory.c:udf_fileident_read. + */ + +#include "udfdecl.h" + +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + + +static int udf_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *dir = file_inode(file); + struct udf_inode_info *iinfo = UDF_I(dir); + struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; + struct fileIdentDesc *fi = NULL; + struct fileIdentDesc cfi; + int block, iblock; + loff_t nf_pos; + int flen; + unsigned char *fname = NULL; + unsigned char *nameptr; + uint16_t liu; + uint8_t lfi; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + struct buffer_head *tmp, *bha[16]; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + int i, num, ret = 0; + struct extent_position epos = { NULL, 0, {0, 0} }; + struct super_block *sb = dir->i_sb; + + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + return 0; + ctx->pos = 1; + } + nf_pos = (ctx->pos - 1) << 2; + if (nf_pos >= size) + goto out; + + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!fname) { + ret = -ENOMEM; + goto out; + } + + if (nf_pos == 0) + nf_pos = udf_ext0_offset(dir); + + fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits, + &epos, &eloc, &elen, &offset) + != (EXT_RECORDED_ALLOCATED >> 30)) { + ret = -ENOENT; + goto out; + } + block = udf_get_lb_pblock(sb, &eloc, offset); + if ((++offset << sb->s_blocksize_bits) < elen) { + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == + ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else { + offset = 0; + } + + if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) { + ret = -EIO; + goto out; + } + + if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) { + i = 16 >> (sb->s_blocksize_bits - 9); + if (i + offset > (elen >> sb->s_blocksize_bits)) + i = (elen >> sb->s_blocksize_bits) - offset; + for (num = 0; i > 0; i--) { + block = udf_get_lb_pblock(sb, &eloc, offset + i); + tmp = udf_tgetblk(sb, block); + if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + ll_rw_block(READA, num, bha); + for (i = 0; i < num; i++) + brelse(bha[i]); + } + } + } + + while (nf_pos < size) { + struct kernel_lb_addr tloc; + + ctx->pos = (nf_pos >> 2) + 1; + + fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, + &elen, &offset); + if (!fi) + goto out; + + liu = le16_to_cpu(cfi.lengthOfImpUse); + lfi = cfi.lengthFileIdent; + + if (fibh.sbh == fibh.ebh) { + nameptr = fi->fileIdent + liu; + } else { + int poffset; /* Unpaded ending offset */ + + poffset = fibh.soffset + sizeof(struct fileIdentDesc) + liu + lfi; + + if (poffset >= lfi) { + nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); + } else { + nameptr = fname; + memcpy(nameptr, fi->fileIdent + liu, + lfi - poffset); + memcpy(nameptr + lfi - poffset, + fibh.ebh->b_data, poffset); + } + } + + if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) + continue; + } + + if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) + continue; + } + + if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { + if (!dir_emit_dotdot(file, ctx)) + goto out; + continue; + } + + flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); + if (!flen) + continue; + + tloc = lelb_to_cpu(cfi.icb.extLocation); + iblock = udf_get_lb_pblock(sb, &tloc, 0); + if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) + goto out; + } /* end while */ + + ctx->pos = (nf_pos >> 2) + 1; + +out: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + kfree(fname); + + return ret; +} + +/* readdir and lookup functions */ +const struct file_operations udf_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = udf_readdir, + .unlocked_ioctl = udf_ioctl, + .fsync = generic_file_fsync, +}; diff --git a/kernel/fs/udf/directory.c b/kernel/fs/udf/directory.c new file mode 100644 index 000000000..c763fda25 --- /dev/null +++ b/kernel/fs/udf/directory.c @@ -0,0 +1,240 @@ +/* + * directory.c + * + * PURPOSE + * Directory related functions + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + */ + +#include "udfdecl.h" +#include "udf_i.h" + +#include +#include + +struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi, + struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, + sector_t *offset) +{ + struct fileIdentDesc *fi; + int i, num, block; + struct buffer_head *tmp, *bha[16]; + struct udf_inode_info *iinfo = UDF_I(dir); + + fibh->soffset = fibh->eoffset; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + fi = udf_get_fileident(iinfo->i_ext.i_data - + (iinfo->i_efe ? + sizeof(struct extendedFileEntry) : + sizeof(struct fileEntry)), + dir->i_sb->s_blocksize, + &(fibh->eoffset)); + if (!fi) + return NULL; + + *nf_pos += fibh->eoffset - fibh->soffset; + + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + + return fi; + } + + if (fibh->eoffset == dir->i_sb->s_blocksize) { + int lextoffset = epos->offset; + unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits; + + if (udf_next_aext(dir, epos, eloc, elen, 1) != + (EXT_RECORDED_ALLOCATED >> 30)) + return NULL; + + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + + (*offset)++; + + if ((*offset << blocksize_bits) >= *elen) + *offset = 0; + else + epos->offset = lextoffset; + + brelse(fibh->sbh); + fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->sbh) + return NULL; + fibh->soffset = fibh->eoffset = 0; + + if (!(*offset & ((16 >> (blocksize_bits - 9)) - 1))) { + i = 16 >> (blocksize_bits - 9); + if (i + *offset > (*elen >> blocksize_bits)) + i = (*elen >> blocksize_bits)-*offset; + for (num = 0; i > 0; i--) { + block = udf_get_lb_pblock(dir->i_sb, eloc, + *offset + i); + tmp = udf_tgetblk(dir->i_sb, block); + if (tmp && !buffer_uptodate(tmp) && + !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + ll_rw_block(READA, num, bha); + for (i = 0; i < num; i++) + brelse(bha[i]); + } + } + } else if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + fi = udf_get_fileident(fibh->sbh->b_data, dir->i_sb->s_blocksize, + &(fibh->eoffset)); + + if (!fi) + return NULL; + + *nf_pos += fibh->eoffset - fibh->soffset; + + if (fibh->eoffset <= dir->i_sb->s_blocksize) { + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + } else if (fibh->eoffset > dir->i_sb->s_blocksize) { + int lextoffset = epos->offset; + + if (udf_next_aext(dir, epos, eloc, elen, 1) != + (EXT_RECORDED_ALLOCATED >> 30)) + return NULL; + + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + + (*offset)++; + + if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen) + *offset = 0; + else + epos->offset = lextoffset; + + fibh->soffset -= dir->i_sb->s_blocksize; + fibh->eoffset -= dir->i_sb->s_blocksize; + + fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->ebh) + return NULL; + + if (sizeof(struct fileIdentDesc) > -fibh->soffset) { + int fi_len; + + memcpy((uint8_t *)cfi, (uint8_t *)fi, -fibh->soffset); + memcpy((uint8_t *)cfi - fibh->soffset, + fibh->ebh->b_data, + sizeof(struct fileIdentDesc) + fibh->soffset); + + fi_len = (sizeof(struct fileIdentDesc) + + cfi->lengthFileIdent + + le16_to_cpu(cfi->lengthOfImpUse) + 3) & ~3; + + *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); + fibh->eoffset = fibh->soffset + fi_len; + } else { + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + } + } + return fi; +} + +struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) +{ + struct fileIdentDesc *fi; + int lengthThisIdent; + uint8_t *ptr; + int padlen; + + if ((!buffer) || (!offset)) { + udf_debug("invalidparms, buffer=%p, offset=%p\n", + buffer, offset); + return NULL; + } + + ptr = buffer; + + if ((*offset > 0) && (*offset < bufsize)) + ptr += *offset; + fi = (struct fileIdentDesc *)ptr; + if (fi->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { + udf_debug("0x%x != TAG_IDENT_FID\n", + le16_to_cpu(fi->descTag.tagIdent)); + udf_debug("offset: %u sizeof: %lu bufsize: %u\n", + *offset, (unsigned long)sizeof(struct fileIdentDesc), + bufsize); + return NULL; + } + if ((*offset + sizeof(struct fileIdentDesc)) > bufsize) + lengthThisIdent = sizeof(struct fileIdentDesc); + else + lengthThisIdent = sizeof(struct fileIdentDesc) + + fi->lengthFileIdent + le16_to_cpu(fi->lengthOfImpUse); + + /* we need to figure padding, too! */ + padlen = lengthThisIdent % UDF_NAME_PAD; + if (padlen) + lengthThisIdent += (UDF_NAME_PAD - padlen); + *offset = *offset + lengthThisIdent; + + return fi; +} + +struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, + int inc) +{ + struct short_ad *sa; + + if ((!ptr) || (!offset)) { + pr_err("%s: invalidparms\n", __func__); + return NULL; + } + + if ((*offset + sizeof(struct short_ad)) > maxoffset) + return NULL; + else { + sa = (struct short_ad *)ptr; + if (sa->extLength == 0) + return NULL; + } + + if (inc) + *offset += sizeof(struct short_ad); + return sa; +} + +struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) +{ + struct long_ad *la; + + if ((!ptr) || (!offset)) { + pr_err("%s: invalidparms\n", __func__); + return NULL; + } + + if ((*offset + sizeof(struct long_ad)) > maxoffset) + return NULL; + else { + la = (struct long_ad *)ptr; + if (la->extLength == 0) + return NULL; + } + + if (inc) + *offset += sizeof(struct long_ad); + return la; +} diff --git a/kernel/fs/udf/ecma_167.h b/kernel/fs/udf/ecma_167.h new file mode 100644 index 000000000..4792b771a --- /dev/null +++ b/kernel/fs/udf/ecma_167.h @@ -0,0 +1,796 @@ +/* + * ecma_167.h + * + * This file is based on ECMA-167 3rd edition (June 1997) + * http://www.ecma.ch + * + * Copyright (c) 2001-2002 Ben Fennema + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#ifndef _ECMA_167_H +#define _ECMA_167_H 1 + +/* Character set specification (ECMA 167r3 1/7.2.1) */ +struct charspec { + uint8_t charSetType; + uint8_t charSetInfo[63]; +} __attribute__ ((packed)); + +/* Character Set Type (ECMA 167r3 1/7.2.1.1) */ +#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ +#define CHARSPEC_TYPE_CS1 0x01 /* (1/7.2.3) */ +#define CHARSPEC_TYPE_CS2 0x02 /* (1/7.2.4) */ +#define CHARSPEC_TYPE_CS3 0x03 /* (1/7.2.5) */ +#define CHARSPEC_TYPE_CS4 0x04 /* (1/7.2.6) */ +#define CHARSPEC_TYPE_CS5 0x05 /* (1/7.2.7) */ +#define CHARSPEC_TYPE_CS6 0x06 /* (1/7.2.8) */ +#define CHARSPEC_TYPE_CS7 0x07 /* (1/7.2.9) */ +#define CHARSPEC_TYPE_CS8 0x08 /* (1/7.2.10) */ + +typedef uint8_t dstring; + +/* Timestamp (ECMA 167r3 1/7.3) */ +struct timestamp { + __le16 typeAndTimezone; + __le16 year; + uint8_t month; + uint8_t day; + uint8_t hour; + uint8_t minute; + uint8_t second; + uint8_t centiseconds; + uint8_t hundredsOfMicroseconds; + uint8_t microseconds; +} __attribute__ ((packed)); + +/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ +#define TIMESTAMP_TYPE_MASK 0xF000 +#define TIMESTAMP_TYPE_CUT 0x0000 +#define TIMESTAMP_TYPE_LOCAL 0x1000 +#define TIMESTAMP_TYPE_AGREEMENT 0x2000 +#define TIMESTAMP_TIMEZONE_MASK 0x0FFF + +/* Entity identifier (ECMA 167r3 1/7.4) */ +struct regid { + uint8_t flags; + uint8_t ident[23]; + uint8_t identSuffix[8]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 1/7.4.1) */ +#define ENTITYID_FLAGS_DIRTY 0x00 +#define ENTITYID_FLAGS_PROTECTED 0x01 + +/* Volume Structure Descriptor (ECMA 167r3 2/9.1) */ +#define VSD_STD_ID_LEN 5 +struct volStructDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Standard Identifier (EMCA 167r2 2/9.1.2) */ +#define VSD_STD_ID_NSR02 "NSR02" /* (3/9.1) */ + +/* Standard Identifier (ECMA 167r3 2/9.1.2) */ +#define VSD_STD_ID_BEA01 "BEA01" /* (2/9.2) */ +#define VSD_STD_ID_BOOT2 "BOOT2" /* (2/9.4) */ +#define VSD_STD_ID_CD001 "CD001" /* (ECMA-119) */ +#define VSD_STD_ID_CDW02 "CDW02" /* (ECMA-168) */ +#define VSD_STD_ID_NSR03 "NSR03" /* (3/9.1) */ +#define VSD_STD_ID_TEA01 "TEA01" /* (2/9.3) */ + +/* Beginning Extended Area Descriptor (ECMA 167r3 2/9.2) */ +struct beginningExtendedAreaDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Terminating Extended Area Descriptor (ECMA 167r3 2/9.3) */ +struct terminatingExtendedAreaDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Boot Descriptor (ECMA 167r3 2/9.4) */ +struct bootDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t reserved1; + struct regid archType; + struct regid bootIdent; + __le32 bootExtLocation; + __le32 bootExtLength; + __le64 loadAddress; + __le64 startAddress; + struct timestamp descCreationDateAndTime; + __le16 flags; + uint8_t reserved2[32]; + uint8_t bootUse[1906]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 2/9.4.12) */ +#define BOOT_FLAGS_ERASE 0x01 + +/* Extent Descriptor (ECMA 167r3 3/7.1) */ +struct extent_ad { + __le32 extLength; + __le32 extLocation; +} __attribute__ ((packed)); + +struct kernel_extent_ad { + uint32_t extLength; + uint32_t extLocation; +}; + +/* Descriptor Tag (ECMA 167r3 3/7.2) */ +struct tag { + __le16 tagIdent; + __le16 descVersion; + uint8_t tagChecksum; + uint8_t reserved; + __le16 tagSerialNum; + __le16 descCRC; + __le16 descCRCLength; + __le32 tagLocation; +} __attribute__ ((packed)); + +/* Tag Identifier (ECMA 167r3 3/7.2.1) */ +#define TAG_IDENT_PVD 0x0001 +#define TAG_IDENT_AVDP 0x0002 +#define TAG_IDENT_VDP 0x0003 +#define TAG_IDENT_IUVD 0x0004 +#define TAG_IDENT_PD 0x0005 +#define TAG_IDENT_LVD 0x0006 +#define TAG_IDENT_USD 0x0007 +#define TAG_IDENT_TD 0x0008 +#define TAG_IDENT_LVID 0x0009 + +/* NSR Descriptor (ECMA 167r3 3/9.1) */ +struct NSRDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t reserved; + uint8_t structData[2040]; +} __attribute__ ((packed)); + +/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ +struct primaryVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le32 primaryVolDescNum; + dstring volIdent[32]; + __le16 volSeqNum; + __le16 maxVolSeqNum; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + dstring volSetIdent[128]; + struct charspec descCharSet; + struct charspec explanatoryCharSet; + struct extent_ad volAbstract; + struct extent_ad volCopyright; + struct regid appIdent; + struct timestamp recordingDateAndTime; + struct regid impIdent; + uint8_t impUse[64]; + __le32 predecessorVolDescSeqLocation; + __le16 flags; + uint8_t reserved[22]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 3/10.1.21) */ +#define PVD_FLAGS_VSID_COMMON 0x0001 + +/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ +struct anchorVolDescPtr { + struct tag descTag; + struct extent_ad mainVolDescSeqExt; + struct extent_ad reserveVolDescSeqExt; + uint8_t reserved[480]; +} __attribute__ ((packed)); + +/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ +struct volDescPtr { + struct tag descTag; + __le32 volDescSeqNum; + struct extent_ad nextVolDescSeqExt; + uint8_t reserved[484]; +} __attribute__ ((packed)); + +/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ +struct impUseVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + struct regid impIdent; + uint8_t impUse[460]; +} __attribute__ ((packed)); + +/* Partition Descriptor (ECMA 167r3 3/10.5) */ +struct partitionDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le16 partitionFlags; + __le16 partitionNumber; + struct regid partitionContents; + uint8_t partitionContentsUse[128]; + __le32 accessType; + __le32 partitionStartingLocation; + __le32 partitionLength; + struct regid impIdent; + uint8_t impUse[128]; + uint8_t reserved[156]; +} __attribute__ ((packed)); + +/* Partition Flags (ECMA 167r3 3/10.5.3) */ +#define PD_PARTITION_FLAGS_ALLOC 0x0001 + +/* Partition Contents (ECMA 167r2 3/10.5.3) */ +#define PD_PARTITION_CONTENTS_NSR02 "+NSR02" + +/* Partition Contents (ECMA 167r3 3/10.5.5) */ +#define PD_PARTITION_CONTENTS_FDC01 "+FDC01" +#define PD_PARTITION_CONTENTS_CD001 "+CD001" +#define PD_PARTITION_CONTENTS_CDW02 "+CDW02" +#define PD_PARTITION_CONTENTS_NSR03 "+NSR03" + +/* Access Type (ECMA 167r3 3/10.5.7) */ +#define PD_ACCESS_TYPE_NONE 0x00000000 +#define PD_ACCESS_TYPE_READ_ONLY 0x00000001 +#define PD_ACCESS_TYPE_WRITE_ONCE 0x00000002 +#define PD_ACCESS_TYPE_REWRITABLE 0x00000003 +#define PD_ACCESS_TYPE_OVERWRITABLE 0x00000004 + +/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ +struct logicalVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + struct charspec descCharSet; + dstring logicalVolIdent[128]; + __le32 logicalBlockSize; + struct regid domainIdent; + uint8_t logicalVolContentsUse[16]; + __le32 mapTableLength; + __le32 numPartitionMaps; + struct regid impIdent; + uint8_t impUse[128]; + struct extent_ad integritySeqExt; + uint8_t partitionMaps[0]; +} __attribute__ ((packed)); + +/* Generic Partition Map (ECMA 167r3 3/10.7.1) */ +struct genericPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t partitionMapping[0]; +} __attribute__ ((packed)); + +/* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ +#define GP_PARTITION_MAP_TYPE_UNDEF 0x00 +#define GP_PARTIITON_MAP_TYPE_1 0x01 +#define GP_PARTITION_MAP_TYPE_2 0x02 + +/* Type 1 Partition Map (ECMA 167r3 3/10.7.2) */ +struct genericPartitionMap1 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + __le16 volSeqNum; + __le16 partitionNum; +} __attribute__ ((packed)); + +/* Type 2 Partition Map (ECMA 167r3 3/10.7.3) */ +struct genericPartitionMap2 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t partitionIdent[62]; +} __attribute__ ((packed)); + +/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ +struct unallocSpaceDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le32 numAllocDescs; + struct extent_ad allocDescs[0]; +} __attribute__ ((packed)); + +/* Terminating Descriptor (ECMA 167r3 3/10.9) */ +struct terminatingDesc { + struct tag descTag; + uint8_t reserved[496]; +} __attribute__ ((packed)); + +/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ +struct logicalVolIntegrityDesc { + struct tag descTag; + struct timestamp recordingDateAndTime; + __le32 integrityType; + struct extent_ad nextIntegrityExt; + uint8_t logicalVolContentsUse[32]; + __le32 numOfPartitions; + __le32 lengthOfImpUse; + __le32 freeSpaceTable[0]; + __le32 sizeTable[0]; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Integrity Type (ECMA 167r3 3/10.10.3) */ +#define LVID_INTEGRITY_TYPE_OPEN 0x00000000 +#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001 + +/* Recorded Address (ECMA 167r3 4/7.1) */ +struct lb_addr { + __le32 logicalBlockNum; + __le16 partitionReferenceNum; +} __attribute__ ((packed)); + +/* ... and its in-core analog */ +struct kernel_lb_addr { + uint32_t logicalBlockNum; + uint16_t partitionReferenceNum; +}; + +/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ +struct short_ad { + __le32 extLength; + __le32 extPosition; +} __attribute__ ((packed)); + +/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ +struct long_ad { + __le32 extLength; + struct lb_addr extLocation; + uint8_t impUse[6]; +} __attribute__ ((packed)); + +struct kernel_long_ad { + uint32_t extLength; + struct kernel_lb_addr extLocation; + uint8_t impUse[6]; +}; + +/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ +struct ext_ad { + __le32 extLength; + __le32 recordedLength; + __le32 informationLength; + struct lb_addr extLocation; +} __attribute__ ((packed)); + +struct kernel_ext_ad { + uint32_t extLength; + uint32_t recordedLength; + uint32_t informationLength; + struct kernel_lb_addr extLocation; +}; + +/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */ + +/* Tag Identifier (ECMA 167r3 4/7.2.1) */ +#define TAG_IDENT_FSD 0x0100 +#define TAG_IDENT_FID 0x0101 +#define TAG_IDENT_AED 0x0102 +#define TAG_IDENT_IE 0x0103 +#define TAG_IDENT_TE 0x0104 +#define TAG_IDENT_FE 0x0105 +#define TAG_IDENT_EAHD 0x0106 +#define TAG_IDENT_USE 0x0107 +#define TAG_IDENT_SBD 0x0108 +#define TAG_IDENT_PIE 0x0109 +#define TAG_IDENT_EFE 0x010A + +/* File Set Descriptor (ECMA 167r3 4/14.1) */ +struct fileSetDesc { + struct tag descTag; + struct timestamp recordingDateAndTime; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + __le32 fileSetNum; + __le32 fileSetDescNum; + struct charspec logicalVolIdentCharSet; + dstring logicalVolIdent[128]; + struct charspec fileSetCharSet; + dstring fileSetIdent[32]; + dstring copyrightFileIdent[32]; + dstring abstractFileIdent[32]; + struct long_ad rootDirectoryICB; + struct regid domainIdent; + struct long_ad nextExt; + struct long_ad streamDirectoryICB; + uint8_t reserved[32]; +} __attribute__ ((packed)); + +/* Partition Header Descriptor (ECMA 167r3 4/14.3) */ +struct partitionHeaderDesc { + struct short_ad unallocSpaceTable; + struct short_ad unallocSpaceBitmap; + struct short_ad partitionIntegrityTable; + struct short_ad freedSpaceTable; + struct short_ad freedSpaceBitmap; + uint8_t reserved[88]; +} __attribute__ ((packed)); + +/* File Identifier Descriptor (ECMA 167r3 4/14.4) */ +struct fileIdentDesc { + struct tag descTag; + __le16 fileVersionNum; + uint8_t fileCharacteristics; + uint8_t lengthFileIdent; + struct long_ad icb; + __le16 lengthOfImpUse; + uint8_t impUse[0]; + uint8_t fileIdent[0]; + uint8_t padding[0]; +} __attribute__ ((packed)); + +/* File Characteristics (ECMA 167r3 4/14.4.3) */ +#define FID_FILE_CHAR_HIDDEN 0x01 +#define FID_FILE_CHAR_DIRECTORY 0x02 +#define FID_FILE_CHAR_DELETED 0x04 +#define FID_FILE_CHAR_PARENT 0x08 +#define FID_FILE_CHAR_METADATA 0x10 + +/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */ +struct allocExtDesc { + struct tag descTag; + __le32 previousAllocExtLocation; + __le32 lengthAllocDescs; +} __attribute__ ((packed)); + +/* ICB Tag (ECMA 167r3 4/14.6) */ +struct icbtag { + __le32 priorRecordedNumDirectEntries; + __le16 strategyType; + __le16 strategyParameter; + __le16 numEntries; + uint8_t reserved; + uint8_t fileType; + struct lb_addr parentICBLocation; + __le16 flags; +} __attribute__ ((packed)); + +/* Strategy Type (ECMA 167r3 4/14.6.2) */ +#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 +#define ICBTAG_STRATEGY_TYPE_1 0x0001 +#define ICBTAG_STRATEGY_TYPE_2 0x0002 +#define ICBTAG_STRATEGY_TYPE_3 0x0003 +#define ICBTAG_STRATEGY_TYPE_4 0x0004 + +/* File Type (ECMA 167r3 4/14.6.6) */ +#define ICBTAG_FILE_TYPE_UNDEF 0x00 +#define ICBTAG_FILE_TYPE_USE 0x01 +#define ICBTAG_FILE_TYPE_PIE 0x02 +#define ICBTAG_FILE_TYPE_IE 0x03 +#define ICBTAG_FILE_TYPE_DIRECTORY 0x04 +#define ICBTAG_FILE_TYPE_REGULAR 0x05 +#define ICBTAG_FILE_TYPE_BLOCK 0x06 +#define ICBTAG_FILE_TYPE_CHAR 0x07 +#define ICBTAG_FILE_TYPE_EA 0x08 +#define ICBTAG_FILE_TYPE_FIFO 0x09 +#define ICBTAG_FILE_TYPE_SOCKET 0x0A +#define ICBTAG_FILE_TYPE_TE 0x0B +#define ICBTAG_FILE_TYPE_SYMLINK 0x0C +#define ICBTAG_FILE_TYPE_STREAMDIR 0x0D + +/* Flags (ECMA 167r3 4/14.6.8) */ +#define ICBTAG_FLAG_AD_MASK 0x0007 +#define ICBTAG_FLAG_AD_SHORT 0x0000 +#define ICBTAG_FLAG_AD_LONG 0x0001 +#define ICBTAG_FLAG_AD_EXTENDED 0x0002 +#define ICBTAG_FLAG_AD_IN_ICB 0x0003 +#define ICBTAG_FLAG_SORTED 0x0008 +#define ICBTAG_FLAG_NONRELOCATABLE 0x0010 +#define ICBTAG_FLAG_ARCHIVE 0x0020 +#define ICBTAG_FLAG_SETUID 0x0040 +#define ICBTAG_FLAG_SETGID 0x0080 +#define ICBTAG_FLAG_STICKY 0x0100 +#define ICBTAG_FLAG_CONTIGUOUS 0x0200 +#define ICBTAG_FLAG_SYSTEM 0x0400 +#define ICBTAG_FLAG_TRANSFORMED 0x0800 +#define ICBTAG_FLAG_MULTIVERSIONS 0x1000 +#define ICBTAG_FLAG_STREAM 0x2000 + +/* Indirect Entry (ECMA 167r3 4/14.7) */ +struct indirectEntry { + struct tag descTag; + struct icbtag icbTag; + struct long_ad indirectICB; +} __attribute__ ((packed)); + +/* Terminal Entry (ECMA 167r3 4/14.8) */ +struct terminalEntry { + struct tag descTag; + struct icbtag icbTag; +} __attribute__ ((packed)); + +/* File Entry (ECMA 167r3 4/14.9) */ +struct fileEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp attrTime; + __le32 checkpoint; + struct long_ad extendedAttrICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +/* Permissions (ECMA 167r3 4/14.9.5) */ +#define FE_PERM_O_EXEC 0x00000001U +#define FE_PERM_O_WRITE 0x00000002U +#define FE_PERM_O_READ 0x00000004U +#define FE_PERM_O_CHATTR 0x00000008U +#define FE_PERM_O_DELETE 0x00000010U +#define FE_PERM_G_EXEC 0x00000020U +#define FE_PERM_G_WRITE 0x00000040U +#define FE_PERM_G_READ 0x00000080U +#define FE_PERM_G_CHATTR 0x00000100U +#define FE_PERM_G_DELETE 0x00000200U +#define FE_PERM_U_EXEC 0x00000400U +#define FE_PERM_U_WRITE 0x00000800U +#define FE_PERM_U_READ 0x00001000U +#define FE_PERM_U_CHATTR 0x00002000U +#define FE_PERM_U_DELETE 0x00004000U + +/* Record Format (ECMA 167r3 4/14.9.7) */ +#define FE_RECORD_FMT_UNDEF 0x00 +#define FE_RECORD_FMT_FIXED_PAD 0x01 +#define FE_RECORD_FMT_FIXED 0x02 +#define FE_RECORD_FMT_VARIABLE8 0x03 +#define FE_RECORD_FMT_VARIABLE16 0x04 +#define FE_RECORD_FMT_VARIABLE16_MSB 0x05 +#define FE_RECORD_FMT_VARIABLE32 0x06 +#define FE_RECORD_FMT_PRINT 0x07 +#define FE_RECORD_FMT_LF 0x08 +#define FE_RECORD_FMT_CR 0x09 +#define FE_RECORD_FMT_CRLF 0x0A +#define FE_RECORD_FMT_LFCR 0x0B + +/* Record Display Attributes (ECMA 167r3 4/14.9.8) */ +#define FE_RECORD_DISPLAY_ATTR_UNDEF 0x00 +#define FE_RECORD_DISPLAY_ATTR_1 0x01 +#define FE_RECORD_DISPLAY_ATTR_2 0x02 +#define FE_RECORD_DISPLAY_ATTR_3 0x03 + +/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */ +struct extendedAttrHeaderDesc { + struct tag descTag; + __le32 impAttrLocation; + __le32 appAttrLocation; +} __attribute__ ((packed)); + +/* Generic Format (ECMA 167r3 4/14.10.2) */ +struct genericFormat { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + uint8_t attrData[0]; +} __attribute__ ((packed)); + +/* Character Set Information (ECMA 167r3 4/14.10.3) */ +struct charSetInfo { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 escapeSeqLength; + uint8_t charSetType; + uint8_t escapeSeq[0]; +} __attribute__ ((packed)); + +/* Alternate Permissions (ECMA 167r3 4/14.10.4) */ +struct altPerms { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le16 ownerIdent; + __le16 groupIdent; + __le16 permission; +} __attribute__ ((packed)); + +/* File Times Extended Attribute (ECMA 167r3 4/14.10.5) */ +struct fileTimesExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 dataLength; + __le32 fileTimeExistence; + uint8_t fileTimes; +} __attribute__ ((packed)); + +/* FileTimeExistence (ECMA 167r3 4/14.10.5.6) */ +#define FTE_CREATION 0x00000001 +#define FTE_DELETION 0x00000004 +#define FTE_EFFECTIVE 0x00000008 +#define FTE_BACKUP 0x00000002 + +/* Information Times Extended Attribute (ECMA 167r3 4/14.10.6) */ +struct infoTimesExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 dataLength; + __le32 infoTimeExistence; + uint8_t infoTimes[0]; +} __attribute__ ((packed)); + +/* Device Specification (ECMA 167r3 4/14.10.7) */ +struct deviceSpec { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 impUseLength; + __le32 majorDeviceIdent; + __le32 minorDeviceIdent; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */ +struct impUseExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 impUseLength; + struct regid impIdent; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */ +struct appUseExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 appUseLength; + struct regid appIdent; + uint8_t appUse[0]; +} __attribute__ ((packed)); + +#define EXTATTR_CHAR_SET 1 +#define EXTATTR_ALT_PERMS 3 +#define EXTATTR_FILE_TIMES 5 +#define EXTATTR_INFO_TIMES 6 +#define EXTATTR_DEV_SPEC 12 +#define EXTATTR_IMP_USE 2048 +#define EXTATTR_APP_USE 65536 + +/* Unallocated Space Entry (ECMA 167r3 4/14.11) */ +struct unallocSpaceEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 lengthAllocDescs; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ +struct spaceBitmapDesc { + struct tag descTag; + __le32 numOfBits; + __le32 numOfBytes; + uint8_t bitmap[0]; +} __attribute__ ((packed)); + +/* Partition Integrity Entry (ECMA 167r3 4/14.13) */ +struct partitionIntegrityEntry { + struct tag descTag; + struct icbtag icbTag; + struct timestamp recordingDateAndTime; + uint8_t integrityType; + uint8_t reserved[175]; + struct regid impIdent; + uint8_t impUse[256]; +} __attribute__ ((packed)); + +/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ + +/* Extent Length (ECMA 167r3 4/14.14.1.1) */ +#define EXT_RECORDED_ALLOCATED 0x00000000 +#define EXT_NOT_RECORDED_ALLOCATED 0x40000000 +#define EXT_NOT_RECORDED_NOT_ALLOCATED 0x80000000 +#define EXT_NEXT_EXTENT_ALLOCDECS 0xC0000000 + +/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ + +/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ + +/* Logical Volume Header Descriptor (ECMA 167r3 4/14.15) */ +struct logicalVolHeaderDesc { + __le64 uniqueID; + uint8_t reserved[24]; +} __attribute__ ((packed)); + +/* Path Component (ECMA 167r3 4/14.16.1) */ +struct pathComponent { + uint8_t componentType; + uint8_t lengthComponentIdent; + __le16 componentFileVersionNum; + dstring componentIdent[0]; +} __attribute__ ((packed)); + +/* File Entry (ECMA 167r3 4/14.17) */ +struct extendedFileEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 objectSize; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp createTime; + struct timestamp attrTime; + __le32 checkpoint; + __le32 reserved; + struct long_ad extendedAttrICB; + struct long_ad streamDirectoryICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +#endif /* _ECMA_167_H */ diff --git a/kernel/fs/udf/file.c b/kernel/fs/udf/file.c new file mode 100644 index 000000000..7a95b8fed --- /dev/null +++ b/kernel/fs/udf/file.c @@ -0,0 +1,273 @@ +/* + * file.c + * + * PURPOSE + * File handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-1999 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 10/02/98 dgb Attempt to integrate into udf.o + * 10/07/98 Switched to using generic_readpage, etc., like isofs + * And it works! + * 12/06/98 blf Added udf_file_read. uses generic_file_read for all cases but + * ICBTAG_FLAG_AD_IN_ICB. + * 04/06/99 64 bit file handling on 32 bit systems taken from ext2 file.c + * 05/12/99 Preliminary file write support + */ + +#include "udfdecl.h" +#include +#include +#include +#include /* memset */ +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +static void __udf_adinicb_readpage(struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + + kaddr = kmap(page); + memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); + memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap(page); +} + +static int udf_adinicb_readpage(struct file *file, struct page *page) +{ + BUG_ON(!PageLocked(page)); + __udf_adinicb_readpage(page); + unlock_page(page); + + return 0; +} + +static int udf_adinicb_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + + BUG_ON(!PageLocked(page)); + + kaddr = kmap(page); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); + mark_inode_dirty(inode); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + + return 0; +} + +static int udf_adinicb_write_begin(struct file *file, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned flags, struct page **pagep, + void **fsdata) +{ + struct page *page; + + if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE)) + return -EIO; + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) + __udf_adinicb_readpage(page); + return 0; +} + +static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + /* Fallback to buffered I/O. */ + return 0; +} + +const struct address_space_operations udf_adinicb_aops = { + .readpage = udf_adinicb_readpage, + .writepage = udf_adinicb_writepage, + .write_begin = udf_adinicb_write_begin, + .write_end = simple_write_end, + .direct_IO = udf_adinicb_direct_IO, +}; + +static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t retval; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct udf_inode_info *iinfo = UDF_I(inode); + int err; + + mutex_lock(&inode->i_mutex); + + retval = generic_write_checks(iocb, from); + if (retval <= 0) + goto out; + + down_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + loff_t end = iocb->ki_pos + iov_iter_count(from); + + if (inode->i_sb->s_blocksize < + (udf_file_entry_alloc_offset(inode) + end)) { + err = udf_expand_file_adinicb(inode); + if (err) { + mutex_unlock(&inode->i_mutex); + udf_debug("udf_expand_adinicb: err=%d\n", err); + return err; + } + } else { + iinfo->i_lenAlloc = max(end, inode->i_size); + up_write(&iinfo->i_data_sem); + } + } else + up_write(&iinfo->i_data_sem); + + retval = __generic_file_write_iter(iocb, from); +out: + mutex_unlock(&inode->i_mutex); + + if (retval > 0) { + ssize_t err; + + mark_inode_dirty(inode); + err = generic_write_sync(file, iocb->ki_pos - retval, retval); + if (err < 0) + retval = err; + } + + return retval; +} + +long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + long old_block, new_block; + int result = -EINVAL; + + if (inode_permission(inode, MAY_READ) != 0) { + udf_debug("no permission to access inode %lu\n", inode->i_ino); + result = -EPERM; + goto out; + } + + if (!arg) { + udf_debug("invalid argument to udf_ioctl\n"); + result = -EINVAL; + goto out; + } + + switch (cmd) { + case UDF_GETVOLIDENT: + if (copy_to_user((char __user *)arg, + UDF_SB(inode->i_sb)->s_volume_ident, 32)) + result = -EFAULT; + else + result = 0; + goto out; + case UDF_RELOCATE_BLOCKS: + if (!capable(CAP_SYS_ADMIN)) { + result = -EPERM; + goto out; + } + if (get_user(old_block, (long __user *)arg)) { + result = -EFAULT; + goto out; + } + result = udf_relocate_blocks(inode->i_sb, + old_block, &new_block); + if (result == 0) + result = put_user(new_block, (long __user *)arg); + goto out; + case UDF_GETEASIZE: + result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); + goto out; + case UDF_GETEABLOCK: + result = copy_to_user((char __user *)arg, + UDF_I(inode)->i_ext.i_data, + UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; + goto out; + } + +out: + return result; +} + +static int udf_release_file(struct inode *inode, struct file *filp) +{ + if (filp->f_mode & FMODE_WRITE && + atomic_read(&inode->i_writecount) == 1) { + /* + * Grab i_mutex to avoid races with writes changing i_size + * while we are running. + */ + mutex_lock(&inode->i_mutex); + down_write(&UDF_I(inode)->i_data_sem); + udf_discard_prealloc(inode); + udf_truncate_tail_extent(inode); + up_write(&UDF_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); + } + return 0; +} + +const struct file_operations udf_file_operations = { + .read_iter = generic_file_read_iter, + .unlocked_ioctl = udf_ioctl, + .open = generic_file_open, + .mmap = generic_file_mmap, + .write_iter = udf_file_write_iter, + .release = udf_release_file, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, +}; + +static int udf_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = udf_setsize(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations udf_file_inode_operations = { + .setattr = udf_setattr, +}; diff --git a/kernel/fs/udf/ialloc.c b/kernel/fs/udf/ialloc.c new file mode 100644 index 000000000..e77db621e --- /dev/null +++ b/kernel/fs/udf/ialloc.c @@ -0,0 +1,133 @@ +/* + * ialloc.c + * + * PURPOSE + * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * + * HISTORY + * + * 02/24/99 blf Created. + * + */ + +#include "udfdecl.h" +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +void udf_free_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); + + if (lvidiu) { + mutex_lock(&sbi->s_alloc_mutex); + if (S_ISDIR(inode->i_mode)) + le32_add_cpu(&lvidiu->numDirs, -1); + else + le32_add_cpu(&lvidiu->numFiles, -1); + udf_updated_lvid(sb); + mutex_unlock(&sbi->s_alloc_mutex); + } + + udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); +} + +struct inode *udf_new_inode(struct inode *dir, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + struct inode *inode; + int block; + uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; + struct udf_inode_info *iinfo; + struct udf_inode_info *dinfo = UDF_I(dir); + struct logicalVolIntegrityDescImpUse *lvidiu; + int err; + + inode = new_inode(sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + iinfo = UDF_I(inode); + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { + iinfo->i_efe = 1; + if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) + sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry), + GFP_KERNEL); + } else { + iinfo->i_efe = 0; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct fileEntry), + GFP_KERNEL); + } + if (!iinfo->i_ext.i_data) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + + err = -ENOSPC; + block = udf_new_block(dir->i_sb, NULL, + dinfo->i_location.partitionReferenceNum, + start, &err); + if (err) { + iput(inode); + return ERR_PTR(err); + } + + lvidiu = udf_sb_lvidiu(sb); + if (lvidiu) { + iinfo->i_unique = lvid_get_unique_id(sb); + inode->i_generation = iinfo->i_unique; + mutex_lock(&sbi->s_alloc_mutex); + if (S_ISDIR(mode)) + le32_add_cpu(&lvidiu->numDirs, 1); + else + le32_add_cpu(&lvidiu->numFiles, 1); + udf_updated_lvid(sb); + mutex_unlock(&sbi->s_alloc_mutex); + } + + inode_init_owner(inode, dir, mode); + + iinfo->i_location.logicalBlockNum = block; + iinfo->i_location.partitionReferenceNum = + dinfo->i_location.partitionReferenceNum; + inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0); + inode->i_blocks = 0; + iinfo->i_lenEAttr = 0; + iinfo->i_lenAlloc = 0; + iinfo->i_use = 0; + iinfo->i_checkpoint = 1; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + inode->i_mtime = inode->i_atime = inode->i_ctime = + iinfo->i_crtime = current_fs_time(inode->i_sb); + if (unlikely(insert_inode_locked(inode) < 0)) { + make_bad_inode(inode); + iput(inode); + return ERR_PTR(-EIO); + } + mark_inode_dirty(inode); + + return inode; +} diff --git a/kernel/fs/udf/inode.c b/kernel/fs/udf/inode.c new file mode 100644 index 000000000..6afac3d56 --- /dev/null +++ b/kernel/fs/udf/inode.c @@ -0,0 +1,2291 @@ +/* + * inode.c + * + * PURPOSE + * Inode handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 10/04/98 dgb Added rudimentary directory functions + * 10/07/98 Fully working udf_block_map! It works! + * 11/25/98 bmap altered to better support extents + * 12/06/98 blf partition support in udf_iget, udf_block_map + * and udf_read_inode + * 12/12/98 rewrote udf_block_map to handle next extents and descs across + * block boundaries (which is not actually allowed) + * 12/20/98 added support for strategy 4096 + * 03/07/99 rewrote udf_block_map (again) + * New funcs, inode_bmap, udf_next_aext + * 04/19/99 Support for writing device EA's for major/minor # + */ + +#include "udfdecl.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +MODULE_AUTHOR("Ben Fennema"); +MODULE_DESCRIPTION("Universal Disk Format Filesystem"); +MODULE_LICENSE("GPL"); + +#define EXTENT_MERGE_SIZE 5 + +static umode_t udf_convert_permissions(struct fileEntry *); +static int udf_update_inode(struct inode *, int); +static int udf_sync_inode(struct inode *inode); +static int udf_alloc_i_data(struct inode *inode, size_t size); +static sector_t inode_getblk(struct inode *, sector_t, int *, int *); +static int8_t udf_insert_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); +static void udf_split_extents(struct inode *, int *, int, int, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_prealloc_extents(struct inode *, int, int, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_merge_extents(struct inode *, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_update_extents(struct inode *, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int, + struct extent_position *); +static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); + +static void __udf_clear_extent_cache(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->cached_extent.lstart != -1) { + brelse(iinfo->cached_extent.epos.bh); + iinfo->cached_extent.lstart = -1; + } +} + +/* Invalidate extent cache */ +static void udf_clear_extent_cache(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + spin_lock(&iinfo->i_extent_cache_lock); + __udf_clear_extent_cache(inode); + spin_unlock(&iinfo->i_extent_cache_lock); +} + +/* Return contents of extent cache */ +static int udf_read_extent_cache(struct inode *inode, loff_t bcount, + loff_t *lbcount, struct extent_position *pos) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + int ret = 0; + + spin_lock(&iinfo->i_extent_cache_lock); + if ((iinfo->cached_extent.lstart <= bcount) && + (iinfo->cached_extent.lstart != -1)) { + /* Cache hit */ + *lbcount = iinfo->cached_extent.lstart; + memcpy(pos, &iinfo->cached_extent.epos, + sizeof(struct extent_position)); + if (pos->bh) + get_bh(pos->bh); + ret = 1; + } + spin_unlock(&iinfo->i_extent_cache_lock); + return ret; +} + +/* Add extent to extent cache */ +static void udf_update_extent_cache(struct inode *inode, loff_t estart, + struct extent_position *pos, int next_epos) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + spin_lock(&iinfo->i_extent_cache_lock); + /* Invalidate previously cached extent */ + __udf_clear_extent_cache(inode); + if (pos->bh) + get_bh(pos->bh); + memcpy(&iinfo->cached_extent.epos, pos, + sizeof(struct extent_position)); + iinfo->cached_extent.lstart = estart; + if (next_epos) + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + iinfo->cached_extent.epos.offset -= + sizeof(struct short_ad); + break; + case ICBTAG_FLAG_AD_LONG: + iinfo->cached_extent.epos.offset -= + sizeof(struct long_ad); + } + spin_unlock(&iinfo->i_extent_cache_lock); +} + +void udf_evict_inode(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) { + want_delete = 1; + udf_setsize(inode, 0); + udf_update_inode(inode, IS_SYNC(inode)); + } + truncate_inode_pages_final(&inode->i_data); + invalidate_inode_buffers(inode); + clear_inode(inode); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && + inode->i_size != iinfo->i_lenExtents) { + udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", + inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); + } + kfree(iinfo->i_ext.i_data); + iinfo->i_ext.i_data = NULL; + udf_clear_extent_cache(inode); + if (want_delete) { + udf_free_inode(inode); + } +} + +static void udf_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = inode->i_size; + + if (to > isize) { + truncate_pagecache(inode, isize); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } + } +} + +static int udf_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, udf_get_block, wbc); +} + +static int udf_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, udf_get_block); +} + +static int udf_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, udf_get_block); +} + +static int udf_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, udf_get_block); +} + +static int udf_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); + if (unlikely(ret)) + udf_write_failed(mapping, pos + len); + return ret; +} + +static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block); + if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) + udf_write_failed(mapping, offset + count); + return ret; +} + +static sector_t udf_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, udf_get_block); +} + +const struct address_space_operations udf_aops = { + .readpage = udf_readpage, + .readpages = udf_readpages, + .writepage = udf_writepage, + .writepages = udf_writepages, + .write_begin = udf_write_begin, + .write_end = generic_write_end, + .direct_IO = udf_direct_IO, + .bmap = udf_bmap, +}; + +/* + * Expand file stored in ICB to a normal one-block-file + * + * This function requires i_data_sem for writing and releases it. + * This function requires i_mutex held + */ +int udf_expand_file_adinicb(struct inode *inode) +{ + struct page *page; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + int err; + struct writeback_control udf_wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + }; + + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + if (!iinfo->i_lenAlloc) { + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); + mark_inode_dirty(inode); + return 0; + } + /* + * Release i_data_sem so that we can lock a page - page lock ranks + * above i_data_sem. i_mutex still protects us against file changes. + */ + up_write(&iinfo->i_data_sem); + + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + if (!page) + return -ENOMEM; + + if (!PageUptodate(page)) { + kaddr = kmap(page); + memset(kaddr + iinfo->i_lenAlloc, 0x00, + PAGE_CACHE_SIZE - iinfo->i_lenAlloc); + memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, + iinfo->i_lenAlloc); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap(page); + } + down_write(&iinfo->i_data_sem); + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, + iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); + err = inode->i_data.a_ops->writepage(page, &udf_wbc); + if (err) { + /* Restore everything back so that we don't lose data... */ + lock_page(page); + kaddr = kmap(page); + down_write(&iinfo->i_data_sem); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, + inode->i_size); + kunmap(page); + unlock_page(page); + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + inode->i_data.a_ops = &udf_adinicb_aops; + up_write(&iinfo->i_data_sem); + } + page_cache_release(page); + mark_inode_dirty(inode); + + return err; +} + +struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, + int *err) +{ + int newblock; + struct buffer_head *dbh = NULL; + struct kernel_lb_addr eloc; + uint8_t alloctype; + struct extent_position epos; + + struct udf_fileident_bh sfibh, dfibh; + loff_t f_pos = udf_ext0_offset(inode); + int size = udf_ext0_offset(inode) + inode->i_size; + struct fileIdentDesc cfi, *sfi, *dfi; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + alloctype = ICBTAG_FLAG_AD_SHORT; + else + alloctype = ICBTAG_FLAG_AD_LONG; + + if (!inode->i_size) { + iinfo->i_alloc_type = alloctype; + mark_inode_dirty(inode); + return NULL; + } + + /* alloc block, and copy data to it */ + *block = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, err); + if (!(*block)) + return NULL; + newblock = udf_get_pblock(inode->i_sb, *block, + iinfo->i_location.partitionReferenceNum, + 0); + if (!newblock) + return NULL; + dbh = udf_tgetblk(inode->i_sb, newblock); + if (!dbh) + return NULL; + lock_buffer(dbh); + memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(dbh); + unlock_buffer(dbh); + mark_buffer_dirty_inode(dbh, inode); + + sfibh.soffset = sfibh.eoffset = + f_pos & (inode->i_sb->s_blocksize - 1); + sfibh.sbh = sfibh.ebh = NULL; + dfibh.soffset = dfibh.eoffset = 0; + dfibh.sbh = dfibh.ebh = dbh; + while (f_pos < size) { + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, + NULL, NULL, NULL); + if (!sfi) { + brelse(dbh); + return NULL; + } + iinfo->i_alloc_type = alloctype; + sfi->descTag.tagLocation = cpu_to_le32(*block); + dfibh.soffset = dfibh.eoffset; + dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); + dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); + if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, + sfi->fileIdent + + le16_to_cpu(sfi->lengthOfImpUse))) { + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + brelse(dbh); + return NULL; + } + } + mark_buffer_dirty_inode(dbh, inode); + + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0, + iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + eloc.logicalBlockNum = *block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + iinfo->i_lenExtents = inode->i_size; + epos.bh = NULL; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); + /* UniqueID stuff */ + + brelse(epos.bh); + mark_inode_dirty(inode); + return dbh; +} + +static int udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int err, new; + sector_t phys = 0; + struct udf_inode_info *iinfo; + + if (!create) { + phys = udf_block_map(inode, block); + if (phys) + map_bh(bh_result, inode->i_sb, phys); + return 0; + } + + err = -EIO; + new = 0; + iinfo = UDF_I(inode); + + down_write(&iinfo->i_data_sem); + if (block == iinfo->i_next_alloc_block + 1) { + iinfo->i_next_alloc_block++; + iinfo->i_next_alloc_goal++; + } + + udf_clear_extent_cache(inode); + phys = inode_getblk(inode, block, &err, &new); + if (!phys) + goto abort; + + if (new) + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, phys); + +abort: + up_write(&iinfo->i_data_sem); + return err; +} + +static struct buffer_head *udf_getblk(struct inode *inode, long block, + int create, int *err) +{ + struct buffer_head *bh; + struct buffer_head dummy; + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + *err = udf_get_block(inode, block, &dummy, create); + if (!*err && buffer_mapped(&dummy)) { + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (buffer_new(&dummy)) { + lock_buffer(bh); + memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + } + return bh; + } + + return NULL; +} + +/* Extend the file by 'blocks' blocks, return the number of extents added */ +static int udf_do_extend_file(struct inode *inode, + struct extent_position *last_pos, + struct kernel_long_ad *last_ext, + sector_t blocks) +{ + sector_t add; + int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); + struct super_block *sb = inode->i_sb; + struct kernel_lb_addr prealloc_loc = {}; + int prealloc_len = 0; + struct udf_inode_info *iinfo; + int err; + + /* The previous extent is fake and we should not extend by anything + * - there's nothing to do... */ + if (!blocks && fake) + return 0; + + iinfo = UDF_I(inode); + /* Round the last extent up to a multiple of block size */ + if (last_ext->extLength & (sb->s_blocksize - 1)) { + last_ext->extLength = + (last_ext->extLength & UDF_EXTENT_FLAG_MASK) | + (((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) + + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1)); + iinfo->i_lenExtents = + (iinfo->i_lenExtents + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + } + + /* Last extent are just preallocated blocks? */ + if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == + EXT_NOT_RECORDED_ALLOCATED) { + /* Save the extent so that we can reattach it to the end */ + prealloc_loc = last_ext->extLocation; + prealloc_len = last_ext->extLength; + /* Mark the extent as a hole */ + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); + last_ext->extLocation.logicalBlockNum = 0; + last_ext->extLocation.partitionReferenceNum = 0; + } + + /* Can we merge with the previous extent? */ + if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == + EXT_NOT_RECORDED_NOT_ALLOCATED) { + add = ((1 << 30) - sb->s_blocksize - + (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) >> + sb->s_blocksize_bits; + if (add > blocks) + add = blocks; + blocks -= add; + last_ext->extLength += add << sb->s_blocksize_bits; + } + + if (fake) { + udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + count++; + } else + udf_write_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + + /* Managed to do everything necessary? */ + if (!blocks) + goto out; + + /* All further extents will be NOT_RECORDED_NOT_ALLOCATED */ + last_ext->extLocation.logicalBlockNum = 0; + last_ext->extLocation.partitionReferenceNum = 0; + add = (1 << (30-sb->s_blocksize_bits)) - 1; + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (add << sb->s_blocksize_bits); + + /* Create enough extents to cover the whole hole */ + while (blocks > add) { + blocks -= add; + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; + count++; + } + if (blocks) { + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (blocks << sb->s_blocksize_bits); + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; + count++; + } + +out: + /* Do we have some preallocated blocks saved? */ + if (prealloc_len) { + err = udf_add_aext(inode, last_pos, &prealloc_loc, + prealloc_len, 1); + if (err) + return err; + last_ext->extLocation = prealloc_loc; + last_ext->extLength = prealloc_len; + count++; + } + + /* last_pos should point to the last written extent... */ + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + last_pos->offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + last_pos->offset -= sizeof(struct long_ad); + else + return -EIO; + + return count; +} + +static int udf_extend_file(struct inode *inode, loff_t newsize) +{ + + struct extent_position epos; + struct kernel_lb_addr eloc; + uint32_t elen; + int8_t etype; + struct super_block *sb = inode->i_sb; + sector_t first_block = newsize >> sb->s_blocksize_bits, offset; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + struct kernel_long_ad extent; + int err; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + + /* File has extent covering the new size (could happen when extending + * inside a block)? */ + if (etype != -1) + return 0; + if (newsize & (sb->s_blocksize - 1)) + offset++; + /* Extended file just to the boundary of the last file block? */ + if (offset == 0) + return 0; + + /* Truncate is extending the file by 'offset' blocks */ + if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || + (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { + /* File has no extents at all or has empty last + * indirect extent! Create a fake extent... */ + extent.extLocation.logicalBlockNum = 0; + extent.extLocation.partitionReferenceNum = 0; + extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; + } else { + epos.offset -= adsize; + etype = udf_next_aext(inode, &epos, &extent.extLocation, + &extent.extLength, 0); + extent.extLength |= etype << 30; + } + err = udf_do_extend_file(inode, &epos, &extent, offset); + if (err < 0) + goto out; + err = 0; + iinfo->i_lenExtents = newsize; +out: + brelse(epos.bh); + return err; +} + +static sector_t inode_getblk(struct inode *inode, sector_t block, + int *err, int *new) +{ + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; + struct extent_position prev_epos, cur_epos, next_epos; + int count = 0, startnum = 0, endnum = 0; + uint32_t elen = 0, tmpelen; + struct kernel_lb_addr eloc, tmpeloc; + int c = 1; + loff_t lbcount = 0, b_off = 0; + uint32_t newblocknum, newblock; + sector_t offset = 0; + int8_t etype; + struct udf_inode_info *iinfo = UDF_I(inode); + int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; + int lastblock = 0; + bool isBeyondEOF; + + *err = 0; + *new = 0; + prev_epos.offset = udf_file_entry_alloc_offset(inode); + prev_epos.block = iinfo->i_location; + prev_epos.bh = NULL; + cur_epos = next_epos = prev_epos; + b_off = (loff_t)block << inode->i_sb->s_blocksize_bits; + + /* find the extent which contains the block we are looking for. + alternate between laarr[0] and laarr[1] for locations of the + current extent, and the previous extent */ + do { + if (prev_epos.bh != cur_epos.bh) { + brelse(prev_epos.bh); + get_bh(cur_epos.bh); + prev_epos.bh = cur_epos.bh; + } + if (cur_epos.bh != next_epos.bh) { + brelse(cur_epos.bh); + get_bh(next_epos.bh); + cur_epos.bh = next_epos.bh; + } + + lbcount += elen; + + prev_epos.block = cur_epos.block; + cur_epos.block = next_epos.block; + + prev_epos.offset = cur_epos.offset; + cur_epos.offset = next_epos.offset; + + etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 1); + if (etype == -1) + break; + + c = !c; + + laarr[c].extLength = (etype << 30) | elen; + laarr[c].extLocation = eloc; + + if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + pgoal = eloc.logicalBlockNum + + ((elen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + + count++; + } while (lbcount + elen <= b_off); + + b_off -= lbcount; + offset = b_off >> inode->i_sb->s_blocksize_bits; + /* + * Move prev_epos and cur_epos into indirect extent if we are at + * the pointer to it + */ + udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, 0); + udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, 0); + + /* if the extent is allocated and recorded, return the block + if the extent is not a multiple of the blocksize, round up */ + + if (etype == (EXT_RECORDED_ALLOCATED >> 30)) { + if (elen & (inode->i_sb->s_blocksize - 1)) { + elen = EXT_RECORDED_ALLOCATED | + ((elen + inode->i_sb->s_blocksize - 1) & + ~(inode->i_sb->s_blocksize - 1)); + udf_write_aext(inode, &cur_epos, &eloc, elen, 1); + } + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); + newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + return newblock; + } + + /* Are we beyond EOF? */ + if (etype == -1) { + int ret; + isBeyondEOF = true; + if (count) { + if (c) + laarr[0] = laarr[1]; + startnum = 1; + } else { + /* Create a fake extent when there's not one */ + memset(&laarr[0].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; + /* Will udf_do_extend_file() create real extent from + a fake one? */ + startnum = (offset > 0); + } + /* Create extents for the hole between EOF and offset */ + ret = udf_do_extend_file(inode, &prev_epos, laarr, offset); + if (ret < 0) { + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); + *err = ret; + return 0; + } + c = 0; + offset = 0; + count += ret; + /* We are not covered by a preallocated extent? */ + if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) != + EXT_NOT_RECORDED_ALLOCATED) { + /* Is there any real extent? - otherwise we overwrite + * the fake one... */ + if (count) + c = !c; + laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + inode->i_sb->s_blocksize; + memset(&laarr[c].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + count++; + } + endnum = c + 1; + lastblock = 1; + } else { + isBeyondEOF = false; + endnum = startnum = ((count > 2) ? 2 : count); + + /* if the current extent is in position 0, + swap it with the previous */ + if (!c && count != 1) { + laarr[2] = laarr[0]; + laarr[0] = laarr[1]; + laarr[1] = laarr[2]; + c = 1; + } + + /* if the current block is located in an extent, + read the next extent */ + etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 0); + if (etype != -1) { + laarr[c + 1].extLength = (etype << 30) | elen; + laarr[c + 1].extLocation = eloc; + count++; + startnum++; + endnum++; + } else + lastblock = 1; + } + + /* if the current extent is not recorded but allocated, get the + * block in the extent corresponding to the requested block */ + if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) + newblocknum = laarr[c].extLocation.logicalBlockNum + offset; + else { /* otherwise, allocate a new block */ + if (iinfo->i_next_alloc_block == block) + goal = iinfo->i_next_alloc_goal; + + if (!goal) { + if (!(goal = pgoal)) /* XXX: what was intended here? */ + goal = iinfo->i_location.logicalBlockNum + 1; + } + + newblocknum = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + goal, err); + if (!newblocknum) { + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); + *err = -ENOSPC; + return 0; + } + if (isBeyondEOF) + iinfo->i_lenExtents += inode->i_sb->s_blocksize; + } + + /* if the extent the requsted block is located in contains multiple + * blocks, split the extent into at most three extents. blocks prior + * to requested block, requested block, and blocks after requested + * block */ + udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); + +#ifdef UDF_PREALLOCATE + /* We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. */ + if (S_ISREG(inode->i_mode)) + udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); +#endif + + /* merge any continuous blocks in laarr */ + udf_merge_extents(inode, laarr, &endnum); + + /* write back the new extents, inserting new extents if the new number + * of extents is greater than the old number, and deleting extents if + * the new number of extents is less than the old number */ + udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); + + newblock = udf_get_pblock(inode->i_sb, newblocknum, + iinfo->i_location.partitionReferenceNum, 0); + if (!newblock) { + *err = -EIO; + return 0; + } + *new = 1; + iinfo->i_next_alloc_block = block; + iinfo->i_next_alloc_goal = newblocknum; + inode->i_ctime = current_fs_time(inode->i_sb); + + if (IS_SYNC(inode)) + udf_sync_inode(inode); + else + mark_inode_dirty(inode); + + return newblock; +} + +static void udf_split_extents(struct inode *inode, int *c, int offset, + int newblocknum, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + + if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) || + (laarr[*c].extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { + int curr = *c; + int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits; + int8_t etype = (laarr[curr].extLength >> 30); + + if (blen == 1) + ; + else if (!offset || blen == offset + 1) { + laarr[curr + 2] = laarr[curr + 1]; + laarr[curr + 1] = laarr[curr]; + } else { + laarr[curr + 3] = laarr[curr + 1]; + laarr[curr + 2] = laarr[curr + 1] = laarr[curr]; + } + + if (offset) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, + &laarr[curr].extLocation, + 0, offset); + laarr[curr].extLength = + EXT_NOT_RECORDED_NOT_ALLOCATED | + (offset << blocksize_bits); + laarr[curr].extLocation.logicalBlockNum = 0; + laarr[curr].extLocation. + partitionReferenceNum = 0; + } else + laarr[curr].extLength = (etype << 30) | + (offset << blocksize_bits); + curr++; + (*c)++; + (*endnum)++; + } + + laarr[curr].extLocation.logicalBlockNum = newblocknum; + if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + laarr[curr].extLocation.partitionReferenceNum = + UDF_I(inode)->i_location.partitionReferenceNum; + laarr[curr].extLength = EXT_RECORDED_ALLOCATED | + blocksize; + curr++; + + if (blen != offset + 1) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) + laarr[curr].extLocation.logicalBlockNum += + offset + 1; + laarr[curr].extLength = (etype << 30) | + ((blen - (offset + 1)) << blocksize_bits); + curr++; + (*endnum)++; + } + } +} + +static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + int start, length = 0, currlength = 0, i; + + if (*endnum >= (c + 1)) { + if (!lastblock) + return; + else + start = c; + } else { + if ((laarr[c + 1].extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + start = c + 1; + length = currlength = + (((laarr[c + 1].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + } else + start = c; + } + + for (i = start + 1; i <= *endnum; i++) { + if (i == *endnum) { + if (lastblock) + length += UDF_DEFAULT_PREALLOC_BLOCKS; + } else if ((laarr[i].extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { + length += (((laarr[i].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + } else + break; + } + + if (length) { + int next = laarr[start].extLocation.logicalBlockNum + + (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + int numalloc = udf_prealloc_blocks(inode->i_sb, inode, + laarr[start].extLocation.partitionReferenceNum, + next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ? + length : UDF_DEFAULT_PREALLOC_BLOCKS) - + currlength); + if (numalloc) { + if (start == (c + 1)) + laarr[start].extLength += + (numalloc << + inode->i_sb->s_blocksize_bits); + else { + memmove(&laarr[c + 2], &laarr[c + 1], + sizeof(struct long_ad) * (*endnum - (c + 1))); + (*endnum)++; + laarr[c + 1].extLocation.logicalBlockNum = next; + laarr[c + 1].extLocation.partitionReferenceNum = + laarr[c].extLocation. + partitionReferenceNum; + laarr[c + 1].extLength = + EXT_NOT_RECORDED_ALLOCATED | + (numalloc << + inode->i_sb->s_blocksize_bits); + start = c + 1; + } + + for (i = start + 1; numalloc && i < *endnum; i++) { + int elen = ((laarr[i].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + + if (elen > numalloc) { + laarr[i].extLength -= + (numalloc << + inode->i_sb->s_blocksize_bits); + numalloc = 0; + } else { + numalloc -= elen; + if (*endnum > (i + 1)) + memmove(&laarr[i], + &laarr[i + 1], + sizeof(struct long_ad) * + (*endnum - (i + 1))); + i--; + (*endnum)--; + } + } + UDF_I(inode)->i_lenExtents += + numalloc << inode->i_sb->s_blocksize_bits; + } + } +} + +static void udf_merge_extents(struct inode *inode, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + int i; + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + + for (i = 0; i < (*endnum - 1); i++) { + struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; + struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; + + if (((li->extLength >> 30) == (lip1->extLength >> 30)) && + (((li->extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) || + ((lip1->extLocation.logicalBlockNum - + li->extLocation.logicalBlockNum) == + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits)))) { + + if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { + lip1->extLength = (lip1->extLength - + (li->extLength & + UDF_EXTENT_LENGTH_MASK) + + UDF_EXTENT_LENGTH_MASK) & + ~(blocksize - 1); + li->extLength = (li->extLength & + UDF_EXTENT_FLAG_MASK) + + (UDF_EXTENT_LENGTH_MASK + 1) - + blocksize; + lip1->extLocation.logicalBlockNum = + li->extLocation.logicalBlockNum + + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) >> + blocksize_bits); + } else { + li->extLength = lip1->extLength + + (((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~(blocksize - 1)); + if (*endnum > (i + 2)) + memmove(&laarr[i + 1], &laarr[i + 2], + sizeof(struct long_ad) * + (*endnum - (i + 2))); + i--; + (*endnum)--; + } + } else if (((li->extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) && + ((lip1->extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { + udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits); + li->extLocation.logicalBlockNum = 0; + li->extLocation.partitionReferenceNum = 0; + + if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { + lip1->extLength = (lip1->extLength - + (li->extLength & + UDF_EXTENT_LENGTH_MASK) + + UDF_EXTENT_LENGTH_MASK) & + ~(blocksize - 1); + li->extLength = (li->extLength & + UDF_EXTENT_FLAG_MASK) + + (UDF_EXTENT_LENGTH_MASK + 1) - + blocksize; + } else { + li->extLength = lip1->extLength + + (((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~(blocksize - 1)); + if (*endnum > (i + 2)) + memmove(&laarr[i + 1], &laarr[i + 2], + sizeof(struct long_ad) * + (*endnum - (i + 2))); + i--; + (*endnum)--; + } + } else if ((li->extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, + &li->extLocation, 0, + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits); + li->extLocation.logicalBlockNum = 0; + li->extLocation.partitionReferenceNum = 0; + li->extLength = (li->extLength & + UDF_EXTENT_LENGTH_MASK) | + EXT_NOT_RECORDED_NOT_ALLOCATED; + } + } +} + +static void udf_update_extents(struct inode *inode, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int startnum, int endnum, + struct extent_position *epos) +{ + int start = 0, i; + struct kernel_lb_addr tmploc; + uint32_t tmplen; + + if (startnum > endnum) { + for (i = 0; i < (startnum - endnum); i++) + udf_delete_aext(inode, *epos, laarr[i].extLocation, + laarr[i].extLength); + } else if (startnum < endnum) { + for (i = 0; i < (endnum - startnum); i++) { + udf_insert_aext(inode, *epos, laarr[i].extLocation, + laarr[i].extLength); + udf_next_aext(inode, epos, &laarr[i].extLocation, + &laarr[i].extLength, 1); + start++; + } + } + + for (i = start; i < endnum; i++) { + udf_next_aext(inode, epos, &tmploc, &tmplen, 0); + udf_write_aext(inode, epos, &laarr[i].extLocation, + laarr[i].extLength, 1); + } +} + +struct buffer_head *udf_bread(struct inode *inode, int block, + int create, int *err) +{ + struct buffer_head *bh = NULL; + + bh = udf_getblk(inode, block, create, err); + if (!bh) + return NULL; + + if (buffer_uptodate(bh)) + return bh; + + ll_rw_block(READ, 1, &bh); + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + + brelse(bh); + *err = -EIO; + return NULL; +} + +int udf_setsize(struct inode *inode, loff_t newsize) +{ + int err; + struct udf_inode_info *iinfo; + int bsize = 1 << inode->i_blkbits; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + iinfo = UDF_I(inode); + if (newsize > inode->i_size) { + down_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (bsize < + (udf_file_entry_alloc_offset(inode) + newsize)) { + err = udf_expand_file_adinicb(inode); + if (err) + return err; + down_write(&iinfo->i_data_sem); + } else { + iinfo->i_lenAlloc = newsize; + goto set_size; + } + } + err = udf_extend_file(inode, newsize); + if (err) { + up_write(&iinfo->i_data_sem); + return err; + } +set_size: + truncate_setsize(inode, newsize); + up_write(&iinfo->i_data_sem); + } else { + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, + 0x00, bsize - newsize - + udf_file_entry_alloc_offset(inode)); + iinfo->i_lenAlloc = newsize; + truncate_setsize(inode, newsize); + up_write(&iinfo->i_data_sem); + goto update_time; + } + err = block_truncate_page(inode->i_mapping, newsize, + udf_get_block); + if (err) + return err; + down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); + truncate_setsize(inode, newsize); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } +update_time: + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); + if (IS_SYNC(inode)) + udf_sync_inode(inode); + else + mark_inode_dirty(inode); + return 0; +} + +/* + * Maximum length of linked list formed by ICB hierarchy. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_ICB_NESTING 1024 + +static int udf_read_inode(struct inode *inode, bool hidden_inode) +{ + struct buffer_head *bh = NULL; + struct fileEntry *fe; + struct extendedFileEntry *efe; + uint16_t ident; + struct udf_inode_info *iinfo = UDF_I(inode); + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + struct kernel_lb_addr *iloc = &iinfo->i_location; + unsigned int link_count; + unsigned int indirections = 0; + int bs = inode->i_sb->s_blocksize; + int ret = -EIO; + +reread: + if (iloc->logicalBlockNum >= + sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { + udf_debug("block=%d, partition=%d out of range\n", + iloc->logicalBlockNum, iloc->partitionReferenceNum); + return -EIO; + } + + /* + * Set defaults, but the inode is still incomplete! + * Note: get_new_inode() sets the following on a new inode: + * i_sb = sb + * i_no = ino + * i_flags = sb->s_flags + * i_state = 0 + * clean_inode(): zero fills and sets + * i_count = 1 + * i_nlink = 1 + * i_op = NULL; + */ + bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); + if (!bh) { + udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); + return -EIO; + } + + if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && + ident != TAG_IDENT_USE) { + udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", + inode->i_ino, ident); + goto out; + } + + fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; + + if (fe->icbTag.strategyType == cpu_to_le16(4096)) { + struct buffer_head *ibh; + + ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); + if (ident == TAG_IDENT_IE && ibh) { + struct kernel_lb_addr loc; + struct indirectEntry *ie; + + ie = (struct indirectEntry *)ibh->b_data; + loc = lelb_to_cpu(ie->indirectICB.extLocation); + + if (ie->indirectICB.extLength) { + brelse(ibh); + memcpy(&iinfo->i_location, &loc, + sizeof(struct kernel_lb_addr)); + if (++indirections > UDF_MAX_ICB_NESTING) { + udf_err(inode->i_sb, + "too many ICBs in ICB hierarchy" + " (max %d supported)\n", + UDF_MAX_ICB_NESTING); + goto out; + } + brelse(bh); + goto reread; + } + } + brelse(ibh); + } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { + udf_err(inode->i_sb, "unsupported strategy type: %d\n", + le16_to_cpu(fe->icbTag.strategyType)); + goto out; + } + if (fe->icbTag.strategyType == cpu_to_le16(4)) + iinfo->i_strat4096 = 0; + else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ + iinfo->i_strat4096 = 1; + + iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & + ICBTAG_FLAG_AD_MASK; + iinfo->i_unique = 0; + iinfo->i_lenEAttr = 0; + iinfo->i_lenExtents = 0; + iinfo->i_lenAlloc = 0; + iinfo->i_next_alloc_block = 0; + iinfo->i_next_alloc_goal = 0; + if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { + iinfo->i_efe = 1; + iinfo->i_use = 0; + ret = udf_alloc_i_data(inode, bs - + sizeof(struct extendedFileEntry)); + if (ret) + goto out; + memcpy(iinfo->i_ext.i_data, + bh->b_data + sizeof(struct extendedFileEntry), + bs - sizeof(struct extendedFileEntry)); + } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { + iinfo->i_efe = 0; + iinfo->i_use = 0; + ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry)); + if (ret) + goto out; + memcpy(iinfo->i_ext.i_data, + bh->b_data + sizeof(struct fileEntry), + bs - sizeof(struct fileEntry)); + } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { + iinfo->i_efe = 0; + iinfo->i_use = 1; + iinfo->i_lenAlloc = le32_to_cpu( + ((struct unallocSpaceEntry *)bh->b_data)-> + lengthAllocDescs); + ret = udf_alloc_i_data(inode, bs - + sizeof(struct unallocSpaceEntry)); + if (ret) + goto out; + memcpy(iinfo->i_ext.i_data, + bh->b_data + sizeof(struct unallocSpaceEntry), + bs - sizeof(struct unallocSpaceEntry)); + return 0; + } + + ret = -EIO; + read_lock(&sbi->s_cred_lock); + i_uid_write(inode, le32_to_cpu(fe->uid)); + if (!uid_valid(inode->i_uid) || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) + inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + + i_gid_write(inode, le32_to_cpu(fe->gid)); + if (!gid_valid(inode->i_gid) || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) + inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + + if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_fmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_fmode; + else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_dmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_dmode; + else + inode->i_mode = udf_convert_permissions(fe); + inode->i_mode &= ~sbi->s_umask; + read_unlock(&sbi->s_cred_lock); + + link_count = le16_to_cpu(fe->fileLinkCount); + if (!link_count) { + if (!hidden_inode) { + ret = -ESTALE; + goto out; + } + link_count = 1; + } + set_nlink(inode, link_count); + + inode->i_size = le64_to_cpu(fe->informationLength); + iinfo->i_lenExtents = inode->i_size; + + if (iinfo->i_efe == 0) { + inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << + (inode->i_sb->s_blocksize_bits - 9); + + if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime)) + inode->i_atime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_mtime, + fe->modificationTime)) + inode->i_mtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime)) + inode->i_ctime = sbi->s_record_time; + + iinfo->i_unique = le64_to_cpu(fe->uniqueID); + iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); + iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); + iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint); + } else { + inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << + (inode->i_sb->s_blocksize_bits - 9); + + if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime)) + inode->i_atime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_mtime, + efe->modificationTime)) + inode->i_mtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime)) + iinfo->i_crtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime)) + inode->i_ctime = sbi->s_record_time; + + iinfo->i_unique = le64_to_cpu(efe->uniqueID); + iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); + iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); + iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); + } + inode->i_generation = iinfo->i_unique; + + /* + * Sanity check length of allocation descriptors and extended attrs to + * avoid integer overflows + */ + if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs) + goto out; + /* Now do exact checks */ + if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs) + goto out; + /* Sanity checks for files in ICB so that we don't get confused later */ + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + /* + * For file in ICB data is stored in allocation descriptor + * so sizes should match + */ + if (iinfo->i_lenAlloc != inode->i_size) + goto out; + /* File in ICB has to fit in there... */ + if (inode->i_size > bs - udf_file_entry_alloc_offset(inode)) + goto out; + } + + switch (fe->icbTag.fileType) { + case ICBTAG_FILE_TYPE_DIRECTORY: + inode->i_op = &udf_dir_inode_operations; + inode->i_fop = &udf_dir_operations; + inode->i_mode |= S_IFDIR; + inc_nlink(inode); + break; + case ICBTAG_FILE_TYPE_REALTIME: + case ICBTAG_FILE_TYPE_REGULAR: + case ICBTAG_FILE_TYPE_UNDEF: + case ICBTAG_FILE_TYPE_VAT20: + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + inode->i_mode |= S_IFREG; + break; + case ICBTAG_FILE_TYPE_BLOCK: + inode->i_mode |= S_IFBLK; + break; + case ICBTAG_FILE_TYPE_CHAR: + inode->i_mode |= S_IFCHR; + break; + case ICBTAG_FILE_TYPE_FIFO: + init_special_inode(inode, inode->i_mode | S_IFIFO, 0); + break; + case ICBTAG_FILE_TYPE_SOCKET: + init_special_inode(inode, inode->i_mode | S_IFSOCK, 0); + break; + case ICBTAG_FILE_TYPE_SYMLINK: + inode->i_data.a_ops = &udf_symlink_aops; + inode->i_op = &udf_symlink_inode_operations; + inode->i_mode = S_IFLNK | S_IRWXUGO; + break; + case ICBTAG_FILE_TYPE_MAIN: + udf_debug("METADATA FILE-----\n"); + break; + case ICBTAG_FILE_TYPE_MIRROR: + udf_debug("METADATA MIRROR FILE-----\n"); + break; + case ICBTAG_FILE_TYPE_BITMAP: + udf_debug("METADATA BITMAP FILE-----\n"); + break; + default: + udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", + inode->i_ino, fe->icbTag.fileType); + goto out; + } + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + struct deviceSpec *dsea = + (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); + if (dsea) { + init_special_inode(inode, inode->i_mode, + MKDEV(le32_to_cpu(dsea->majorDeviceIdent), + le32_to_cpu(dsea->minorDeviceIdent))); + /* Developer ID ??? */ + } else + goto out; + } + ret = 0; +out: + brelse(bh); + return ret; +} + +static int udf_alloc_i_data(struct inode *inode, size_t size) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); + + if (!iinfo->i_ext.i_data) { + udf_err(inode->i_sb, "(ino %ld) no free memory\n", + inode->i_ino); + return -ENOMEM; + } + + return 0; +} + +static umode_t udf_convert_permissions(struct fileEntry *fe) +{ + umode_t mode; + uint32_t permissions; + uint32_t flags; + + permissions = le32_to_cpu(fe->permissions); + flags = le16_to_cpu(fe->icbTag.flags); + + mode = ((permissions) & S_IRWXO) | + ((permissions >> 2) & S_IRWXG) | + ((permissions >> 4) & S_IRWXU) | + ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | + ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | + ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); + + return mode; +} + +int udf_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +static int udf_sync_inode(struct inode *inode) +{ + return udf_update_inode(inode, 1); +} + +static int udf_update_inode(struct inode *inode, int do_sync) +{ + struct buffer_head *bh = NULL; + struct fileEntry *fe; + struct extendedFileEntry *efe; + uint64_t lb_recorded; + uint32_t udfperms; + uint16_t icbflags; + uint16_t crclen; + int err = 0; + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + struct udf_inode_info *iinfo = UDF_I(inode); + + bh = udf_tgetblk(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); + if (!bh) { + udf_debug("getblk failure\n"); + return -EIO; + } + + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; + + if (iinfo->i_use) { + struct unallocSpaceEntry *use = + (struct unallocSpaceEntry *)bh->b_data; + + use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), + iinfo->i_ext.i_data, inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry)); + use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); + use->descTag.tagLocation = + cpu_to_le32(iinfo->i_location.logicalBlockNum); + crclen = sizeof(struct unallocSpaceEntry) + + iinfo->i_lenAlloc - sizeof(struct tag); + use->descTag.descCRCLength = cpu_to_le16(crclen); + use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + + sizeof(struct tag), + crclen)); + use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); + + goto out; + } + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) + fe->uid = cpu_to_le32(-1); + else + fe->uid = cpu_to_le32(i_uid_read(inode)); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) + fe->gid = cpu_to_le32(-1); + else + fe->gid = cpu_to_le32(i_gid_read(inode)); + + udfperms = ((inode->i_mode & S_IRWXO)) | + ((inode->i_mode & S_IRWXG) << 2) | + ((inode->i_mode & S_IRWXU) << 4); + + udfperms |= (le32_to_cpu(fe->permissions) & + (FE_PERM_O_DELETE | FE_PERM_O_CHATTR | + FE_PERM_G_DELETE | FE_PERM_G_CHATTR | + FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); + fe->permissions = cpu_to_le32(udfperms); + + if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) + fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); + else + fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + + fe->informationLength = cpu_to_le64(inode->i_size); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + struct regid *eid; + struct deviceSpec *dsea = + (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); + if (!dsea) { + dsea = (struct deviceSpec *) + udf_add_extendedattr(inode, + sizeof(struct deviceSpec) + + sizeof(struct regid), 12, 0x3); + dsea->attrType = cpu_to_le32(12); + dsea->attrSubtype = 1; + dsea->attrLength = cpu_to_le32( + sizeof(struct deviceSpec) + + sizeof(struct regid)); + dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); + } + eid = (struct regid *)dsea->impUse; + memset(eid, 0, sizeof(struct regid)); + strcpy(eid->ident, UDF_ID_DEVELOPER); + eid->identSuffix[0] = UDF_OS_CLASS_UNIX; + eid->identSuffix[1] = UDF_OS_ID_LINUX; + dsea->majorDeviceIdent = cpu_to_le32(imajor(inode)); + dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); + } + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + lb_recorded = 0; /* No extents => no blocks! */ + else + lb_recorded = + (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> + (blocksize_bits - 9); + + if (iinfo->i_efe == 0) { + memcpy(bh->b_data + sizeof(struct fileEntry), + iinfo->i_ext.i_data, + inode->i_sb->s_blocksize - sizeof(struct fileEntry)); + fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); + + udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); + udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); + memset(&(fe->impIdent), 0, sizeof(struct regid)); + strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); + fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + fe->uniqueID = cpu_to_le64(iinfo->i_unique); + fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); + fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); + fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); + crclen = sizeof(struct fileEntry); + } else { + memcpy(bh->b_data + sizeof(struct extendedFileEntry), + iinfo->i_ext.i_data, + inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry)); + efe->objectSize = cpu_to_le64(inode->i_size); + efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); + + if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_atime.tv_nsec)) + iinfo->i_crtime = inode->i_atime; + + if (iinfo->i_crtime.tv_sec > inode->i_mtime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_mtime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_mtime.tv_nsec)) + iinfo->i_crtime = inode->i_mtime; + + if (iinfo->i_crtime.tv_sec > inode->i_ctime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_ctime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec)) + iinfo->i_crtime = inode->i_ctime; + + udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); + udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); + udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); + + memset(&(efe->impIdent), 0, sizeof(struct regid)); + strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); + efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + efe->uniqueID = cpu_to_le64(iinfo->i_unique); + efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); + efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); + efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); + crclen = sizeof(struct extendedFileEntry); + } + if (iinfo->i_strat4096) { + fe->icbTag.strategyType = cpu_to_le16(4096); + fe->icbTag.strategyParameter = cpu_to_le16(1); + fe->icbTag.numEntries = cpu_to_le16(2); + } else { + fe->icbTag.strategyType = cpu_to_le16(4); + fe->icbTag.numEntries = cpu_to_le16(1); + } + + if (S_ISDIR(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; + else if (S_ISREG(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; + else if (S_ISLNK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK; + else if (S_ISBLK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK; + else if (S_ISCHR(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR; + else if (S_ISFIFO(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO; + else if (S_ISSOCK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET; + + icbflags = iinfo->i_alloc_type | + ((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) | + ((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) | + ((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) | + (le16_to_cpu(fe->icbTag.flags) & + ~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID | + ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY)); + + fe->icbTag.flags = cpu_to_le16(icbflags); + if (sbi->s_udfrev >= 0x0200) + fe->descTag.descVersion = cpu_to_le16(3); + else + fe->descTag.descVersion = cpu_to_le16(2); + fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); + fe->descTag.tagLocation = cpu_to_le32( + iinfo->i_location.logicalBlockNum); + crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag); + fe->descTag.descCRCLength = cpu_to_le16(crclen); + fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), + crclen)); + fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); + +out: + set_buffer_uptodate(bh); + unlock_buffer(bh); + + /* write the data blocks */ + mark_buffer_dirty(bh); + if (do_sync) { + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", + inode->i_ino); + err = -EIO; + } + } + brelse(bh); + + return err; +} + +struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, + bool hidden_inode) +{ + unsigned long block = udf_get_lb_pblock(sb, ino, 0); + struct inode *inode = iget_locked(sb, block); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); + err = udf_read_inode(inode, hidden_inode); + if (err < 0) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + + return inode; +} + +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; + struct allocExtDesc *aed; + uint8_t *ptr; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + else + ptr = epos->bh->b_data + epos->offset; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return -EIO; + + if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { + unsigned char *sptr, *dptr; + struct buffer_head *nbh; + int err, loffset; + struct kernel_lb_addr obloc = epos->block; + + epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, + obloc.partitionReferenceNum, + obloc.logicalBlockNum, &err); + if (!epos->block.logicalBlockNum) + return -ENOSPC; + nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, + &epos->block, + 0)); + if (!nbh) + return -EIO; + lock_buffer(nbh); + memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(nbh); + unlock_buffer(nbh); + mark_buffer_dirty_inode(nbh, inode); + + aed = (struct allocExtDesc *)(nbh->b_data); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) + aed->previousAllocExtLocation = + cpu_to_le32(obloc.logicalBlockNum); + if (epos->offset + adsize > inode->i_sb->s_blocksize) { + loffset = epos->offset; + aed->lengthAllocDescs = cpu_to_le32(adsize); + sptr = ptr - adsize; + dptr = nbh->b_data + sizeof(struct allocExtDesc); + memcpy(dptr, sptr, adsize); + epos->offset = sizeof(struct allocExtDesc) + adsize; + } else { + loffset = epos->offset + adsize; + aed->lengthAllocDescs = cpu_to_le32(0); + sptr = ptr; + epos->offset = sizeof(struct allocExtDesc); + + if (epos->bh) { + aed = (struct allocExtDesc *)epos->bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + } else { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(inode); + } + } + if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) + udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, + epos->block.logicalBlockNum, sizeof(struct tag)); + else + udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, + epos->block.logicalBlockNum, sizeof(struct tag)); + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)sptr; + sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | + inode->i_sb->s_blocksize); + sad->extPosition = + cpu_to_le32(epos->block.logicalBlockNum); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)sptr; + lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | + inode->i_sb->s_blocksize); + lad->extLocation = cpu_to_lelb(epos->block); + memset(lad->impUse, 0x00, sizeof(lad->impUse)); + break; + } + if (epos->bh) { + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos->bh->b_data, loffset); + else + udf_update_tag(epos->bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos->bh, inode); + brelse(epos->bh); + } else { + mark_inode_dirty(inode); + } + epos->bh = nbh; + } + + udf_write_aext(inode, epos, eloc, elen, inc); + + if (!epos->bh) { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)epos->bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos->bh->b_data, + epos->offset + (inc ? 0 : adsize)); + else + udf_update_tag(epos->bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos->bh, inode); + } + + return 0; +} + +void udf_write_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + uint8_t *ptr; + struct short_ad *sad; + struct long_ad *lad; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + else + ptr = epos->bh->b_data + epos->offset; + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)ptr; + sad->extLength = cpu_to_le32(elen); + sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); + adsize = sizeof(struct short_ad); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)ptr; + lad->extLength = cpu_to_le32(elen); + lad->extLocation = cpu_to_lelb(*eloc); + memset(lad->impUse, 0x00, sizeof(lad->impUse)); + adsize = sizeof(struct long_ad); + break; + default: + return; + } + + if (epos->bh) { + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) { + struct allocExtDesc *aed = + (struct allocExtDesc *)epos->bh->b_data; + udf_update_tag(epos->bh->b_data, + le32_to_cpu(aed->lengthAllocDescs) + + sizeof(struct allocExtDesc)); + } + mark_buffer_dirty_inode(epos->bh, inode); + } else { + mark_inode_dirty(inode); + } + + if (inc) + epos->offset += adsize; +} + +int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) +{ + int8_t etype; + + while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == + (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + int block; + epos->block = *eloc; + epos->offset = sizeof(struct allocExtDesc); + brelse(epos->bh); + block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); + epos->bh = udf_tread(inode->i_sb, block); + if (!epos->bh) { + udf_debug("reading block %d failed!\n", block); + return -1; + } + } + + return etype; +} + +int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) +{ + int alen; + int8_t etype; + uint8_t *ptr; + struct short_ad *sad; + struct long_ad *lad; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) { + if (!epos->offset) + epos->offset = udf_file_entry_alloc_offset(inode); + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + alen = udf_file_entry_alloc_offset(inode) + + iinfo->i_lenAlloc; + } else { + if (!epos->offset) + epos->offset = sizeof(struct allocExtDesc); + ptr = epos->bh->b_data + epos->offset; + alen = sizeof(struct allocExtDesc) + + le32_to_cpu(((struct allocExtDesc *)epos->bh->b_data)-> + lengthAllocDescs); + } + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc); + if (!sad) + return -1; + etype = le32_to_cpu(sad->extLength) >> 30; + eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); + eloc->partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; + break; + case ICBTAG_FLAG_AD_LONG: + lad = udf_get_filelongad(ptr, alen, &epos->offset, inc); + if (!lad) + return -1; + etype = le32_to_cpu(lad->extLength) >> 30; + *eloc = lelb_to_cpu(lad->extLocation); + *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; + break; + default: + udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type); + return -1; + } + + return etype; +} + +static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr neloc, uint32_t nelen) +{ + struct kernel_lb_addr oeloc; + uint32_t oelen; + int8_t etype; + + if (epos.bh) + get_bh(epos.bh); + + while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) { + udf_write_aext(inode, &epos, &neloc, nelen, 1); + neloc = oeloc; + nelen = (etype << 30) | oelen; + } + udf_add_aext(inode, &epos, &neloc, nelen, 1); + brelse(epos.bh); + + return (nelen >> 30); +} + +int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr eloc, uint32_t elen) +{ + struct extent_position oepos; + int adsize; + int8_t etype; + struct allocExtDesc *aed; + struct udf_inode_info *iinfo; + + if (epos.bh) { + get_bh(epos.bh); + get_bh(epos.bh); + } + + iinfo = UDF_I(inode); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + adsize = 0; + + oepos = epos; + if (udf_next_aext(inode, &epos, &eloc, &elen, 1) == -1) + return -1; + + while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); + if (oepos.bh != epos.bh) { + oepos.block = epos.block; + brelse(oepos.bh); + get_bh(epos.bh); + oepos.bh = epos.bh; + oepos.offset = epos.offset - adsize; + } + } + memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); + elen = 0; + + if (epos.bh != oepos.bh) { + udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); + if (!oepos.bh) { + iinfo->i_lenAlloc -= (adsize * 2); + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize)); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(oepos.bh->b_data, + oepos.offset - (2 * adsize)); + else + udf_update_tag(oepos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(oepos.bh, inode); + } + } else { + udf_write_aext(inode, &oepos, &eloc, elen, 1); + if (!oepos.bh) { + iinfo->i_lenAlloc -= adsize; + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, -adsize); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(oepos.bh->b_data, + epos.offset - adsize); + else + udf_update_tag(oepos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(oepos.bh, inode); + } + } + + brelse(epos.bh); + brelse(oepos.bh); + + return (elen >> 30); +} + +int8_t inode_bmap(struct inode *inode, sector_t block, + struct extent_position *pos, struct kernel_lb_addr *eloc, + uint32_t *elen, sector_t *offset) +{ + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + loff_t lbcount = 0, bcount = + (loff_t) block << blocksize_bits; + int8_t etype; + struct udf_inode_info *iinfo; + + iinfo = UDF_I(inode); + if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) { + pos->offset = 0; + pos->block = iinfo->i_location; + pos->bh = NULL; + } + *elen = 0; + do { + etype = udf_next_aext(inode, pos, eloc, elen, 1); + if (etype == -1) { + *offset = (bcount - lbcount) >> blocksize_bits; + iinfo->i_lenExtents = lbcount; + return -1; + } + lbcount += *elen; + } while (lbcount <= bcount); + /* update extent cache */ + udf_update_extent_cache(inode, lbcount - *elen, pos, 1); + *offset = (bcount + *elen - lbcount) >> blocksize_bits; + + return etype; +} + +long udf_block_map(struct inode *inode, sector_t block) +{ + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + int ret; + + down_read(&UDF_I(inode)->i_data_sem); + + if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == + (EXT_RECORDED_ALLOCATED >> 30)) + ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + else + ret = 0; + + up_read(&UDF_I(inode)->i_data_sem); + brelse(epos.bh); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) + return udf_fixed_to_variable(ret); + else + return ret; +} diff --git a/kernel/fs/udf/lowlevel.c b/kernel/fs/udf/lowlevel.c new file mode 100644 index 000000000..6ad5a453a --- /dev/null +++ b/kernel/fs/udf/lowlevel.c @@ -0,0 +1,67 @@ +/* + * lowlevel.c + * + * PURPOSE + * Low Level Device Routines for the UDF filesystem + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2001 Ben Fennema + * + * HISTORY + * + * 03/26/99 blf Created. + */ + +#include "udfdecl.h" + +#include +#include +#include + +#include "udf_sb.h" + +unsigned int udf_get_last_session(struct super_block *sb) +{ + struct cdrom_multisession ms_info; + unsigned int vol_desc_start; + struct block_device *bdev = sb->s_bdev; + int i; + + vol_desc_start = 0; + ms_info.addr_format = CDROM_LBA; + i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); + + if (i == 0) { + udf_debug("XA disk: %s, vol_desc_start=%d\n", + ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba); + if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ + vol_desc_start = ms_info.addr.lba; + } else { + udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); + } + return vol_desc_start; +} + +unsigned long udf_get_last_block(struct super_block *sb) +{ + struct block_device *bdev = sb->s_bdev; + unsigned long lblock = 0; + + /* + * ioctl failed or returned obviously bogus value? + * Try using the device size... + */ + if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) || + lblock == 0) + lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + if (lblock) + return lblock - 1; + else + return 0; +} diff --git a/kernel/fs/udf/misc.c b/kernel/fs/udf/misc.c new file mode 100644 index 000000000..71d1c25f3 --- /dev/null +++ b/kernel/fs/udf/misc.c @@ -0,0 +1,298 @@ +/* + * misc.c + * + * PURPOSE + * Miscellaneous routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 04/19/99 blf partial support for reading/writing specific EA's + */ + +#include "udfdecl.h" + +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +struct buffer_head *udf_tgetblk(struct super_block *sb, int block) +{ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) + return sb_getblk(sb, udf_fixed_to_variable(block)); + else + return sb_getblk(sb, block); +} + +struct buffer_head *udf_tread(struct super_block *sb, int block) +{ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) + return sb_bread(sb, udf_fixed_to_variable(block)); + else + return sb_bread(sb, block); +} + +struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, + uint32_t type, uint8_t loc) +{ + uint8_t *ea = NULL, *ad = NULL; + int offset; + uint16_t crclen; + struct udf_inode_info *iinfo = UDF_I(inode); + + ea = iinfo->i_ext.i_data; + if (iinfo->i_lenEAttr) { + ad = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + } else { + ad = ea; + size += sizeof(struct extendedAttrHeaderDesc); + } + + offset = inode->i_sb->s_blocksize - udf_file_entry_alloc_offset(inode) - + iinfo->i_lenAlloc; + + /* TODO - Check for FreeEASpace */ + + if (loc & 0x01 && offset >= size) { + struct extendedAttrHeaderDesc *eahd; + eahd = (struct extendedAttrHeaderDesc *)ea; + + if (iinfo->i_lenAlloc) + memmove(&ad[size], ad, iinfo->i_lenAlloc); + + if (iinfo->i_lenEAttr) { + /* check checksum/crc */ + if (eahd->descTag.tagIdent != + cpu_to_le16(TAG_IDENT_EAHD) || + le32_to_cpu(eahd->descTag.tagLocation) != + iinfo->i_location.logicalBlockNum) + return NULL; + } else { + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + + size -= sizeof(struct extendedAttrHeaderDesc); + iinfo->i_lenEAttr += + sizeof(struct extendedAttrHeaderDesc); + eahd->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EAHD); + if (sbi->s_udfrev >= 0x0200) + eahd->descTag.descVersion = cpu_to_le16(3); + else + eahd->descTag.descVersion = cpu_to_le16(2); + eahd->descTag.tagSerialNum = + cpu_to_le16(sbi->s_serial_number); + eahd->descTag.tagLocation = cpu_to_le32( + iinfo->i_location.logicalBlockNum); + eahd->impAttrLocation = cpu_to_le32(0xFFFFFFFF); + eahd->appAttrLocation = cpu_to_le32(0xFFFFFFFF); + } + + offset = iinfo->i_lenEAttr; + if (type < 2048) { + if (le32_to_cpu(eahd->appAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t aal = + le32_to_cpu(eahd->appAttrLocation); + memmove(&ea[offset - aal + size], + &ea[aal], offset - aal); + offset -= aal; + eahd->appAttrLocation = + cpu_to_le32(aal + size); + } + if (le32_to_cpu(eahd->impAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t ial = + le32_to_cpu(eahd->impAttrLocation); + memmove(&ea[offset - ial + size], + &ea[ial], offset - ial); + offset -= ial; + eahd->impAttrLocation = + cpu_to_le32(ial + size); + } + } else if (type < 65536) { + if (le32_to_cpu(eahd->appAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t aal = + le32_to_cpu(eahd->appAttrLocation); + memmove(&ea[offset - aal + size], + &ea[aal], offset - aal); + offset -= aal; + eahd->appAttrLocation = + cpu_to_le32(aal + size); + } + } + /* rewrite CRC + checksum of eahd */ + crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag); + eahd->descTag.descCRCLength = cpu_to_le16(crclen); + eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + + sizeof(struct tag), crclen)); + eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); + iinfo->i_lenEAttr += size; + return (struct genericFormat *)&ea[offset]; + } + if (loc & 0x02) + ; + + return NULL; +} + +struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, + uint8_t subtype) +{ + struct genericFormat *gaf; + uint8_t *ea = NULL; + uint32_t offset; + struct udf_inode_info *iinfo = UDF_I(inode); + + ea = iinfo->i_ext.i_data; + + if (iinfo->i_lenEAttr) { + struct extendedAttrHeaderDesc *eahd; + eahd = (struct extendedAttrHeaderDesc *)ea; + + /* check checksum/crc */ + if (eahd->descTag.tagIdent != + cpu_to_le16(TAG_IDENT_EAHD) || + le32_to_cpu(eahd->descTag.tagLocation) != + iinfo->i_location.logicalBlockNum) + return NULL; + + if (type < 2048) + offset = sizeof(struct extendedAttrHeaderDesc); + else if (type < 65536) + offset = le32_to_cpu(eahd->impAttrLocation); + else + offset = le32_to_cpu(eahd->appAttrLocation); + + while (offset < iinfo->i_lenEAttr) { + gaf = (struct genericFormat *)&ea[offset]; + if (le32_to_cpu(gaf->attrType) == type && + gaf->attrSubtype == subtype) + return gaf; + else + offset += le32_to_cpu(gaf->attrLength); + } + } + + return NULL; +} + +/* + * udf_read_tagged + * + * PURPOSE + * Read the first block of a tagged descriptor. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. + */ +struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, + uint32_t location, uint16_t *ident) +{ + struct tag *tag_p; + struct buffer_head *bh = NULL; + u8 checksum; + + /* Read the block */ + if (block == 0xFFFFFFFF) + return NULL; + + bh = udf_tread(sb, block); + if (!bh) { + udf_err(sb, "read failed, block=%u, location=%d\n", + block, location); + return NULL; + } + + tag_p = (struct tag *)(bh->b_data); + + *ident = le16_to_cpu(tag_p->tagIdent); + + if (location != le32_to_cpu(tag_p->tagLocation)) { + udf_debug("location mismatch block %u, tag %u != %u\n", + block, le32_to_cpu(tag_p->tagLocation), location); + goto error_out; + } + + /* Verify the tag checksum */ + checksum = udf_tag_checksum(tag_p); + if (checksum != tag_p->tagChecksum) { + udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n", + block, checksum, tag_p->tagChecksum); + goto error_out; + } + + /* Verify the tag version */ + if (tag_p->descVersion != cpu_to_le16(0x0002U) && + tag_p->descVersion != cpu_to_le16(0x0003U)) { + udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", + le16_to_cpu(tag_p->descVersion), block); + goto error_out; + } + + /* Verify the descriptor CRC */ + if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize || + le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, + bh->b_data + sizeof(struct tag), + le16_to_cpu(tag_p->descCRCLength))) + return bh; + + udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, + le16_to_cpu(tag_p->descCRC), + le16_to_cpu(tag_p->descCRCLength)); +error_out: + brelse(bh); + return NULL; +} + +struct buffer_head *udf_read_ptagged(struct super_block *sb, + struct kernel_lb_addr *loc, + uint32_t offset, uint16_t *ident) +{ + return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), + loc->logicalBlockNum + offset, ident); +} + +void udf_update_tag(char *data, int length) +{ + struct tag *tptr = (struct tag *)data; + length -= sizeof(struct tag); + + tptr->descCRCLength = cpu_to_le16(length); + tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length)); + tptr->tagChecksum = udf_tag_checksum(tptr); +} + +void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, + uint32_t loc, int length) +{ + struct tag *tptr = (struct tag *)data; + tptr->tagIdent = cpu_to_le16(ident); + tptr->descVersion = cpu_to_le16(version); + tptr->tagSerialNum = cpu_to_le16(snum); + tptr->tagLocation = cpu_to_le32(loc); + udf_update_tag(data, length); +} + +u8 udf_tag_checksum(const struct tag *t) +{ + u8 *data = (u8 *)t; + u8 checksum = 0; + int i; + for (i = 0; i < sizeof(struct tag); ++i) + if (i != 4) /* position of checksum */ + checksum += data[i]; + return checksum; +} diff --git a/kernel/fs/udf/namei.c b/kernel/fs/udf/namei.c new file mode 100644 index 000000000..5c03f0dfb --- /dev/null +++ b/kernel/fs/udf/namei.c @@ -0,0 +1,1300 @@ +/* + * namei.c + * + * PURPOSE + * Inode name handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 12/12/98 blf Created. Split out the lookup code from dir.c + * 04/19/99 blf link, mknod, symlink support + */ + +#include "udfdecl.h" + +#include "udf_i.h" +#include "udf_sb.h" +#include +#include +#include +#include +#include +#include +#include + +static inline int udf_match(int len1, const unsigned char *name1, int len2, + const unsigned char *name2) +{ + if (len1 != len2) + return 0; + + return !memcmp(name1, name2, len1); +} + +int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, + struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, + uint8_t *impuse, uint8_t *fileident) +{ + uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag); + uint16_t crc; + int offset; + uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); + uint8_t lfi = cfi->lengthFileIdent; + int padlen = fibh->eoffset - fibh->soffset - liu - lfi - + sizeof(struct fileIdentDesc); + int adinicb = 0; + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + adinicb = 1; + + offset = fibh->soffset + sizeof(struct fileIdentDesc); + + if (impuse) { + if (adinicb || (offset + liu < 0)) { + memcpy((uint8_t *)sfi->impUse, impuse, liu); + } else if (offset >= 0) { + memcpy(fibh->ebh->b_data + offset, impuse, liu); + } else { + memcpy((uint8_t *)sfi->impUse, impuse, -offset); + memcpy(fibh->ebh->b_data, impuse - offset, + liu + offset); + } + } + + offset += liu; + + if (fileident) { + if (adinicb || (offset + lfi < 0)) { + memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi); + } else if (offset >= 0) { + memcpy(fibh->ebh->b_data + offset, fileident, lfi); + } else { + memcpy((uint8_t *)sfi->fileIdent + liu, fileident, + -offset); + memcpy(fibh->ebh->b_data, fileident - offset, + lfi + offset); + } + } + + offset += lfi; + + if (adinicb || (offset + padlen < 0)) { + memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen); + } else if (offset >= 0) { + memset(fibh->ebh->b_data + offset, 0x00, padlen); + } else { + memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset); + memset(fibh->ebh->b_data, 0x00, padlen + offset); + } + + crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag), + sizeof(struct fileIdentDesc) - sizeof(struct tag)); + + if (fibh->sbh == fibh->ebh) { + crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, + crclen + sizeof(struct tag) - + sizeof(struct fileIdentDesc)); + } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { + crc = crc_itu_t(crc, fibh->ebh->b_data + + sizeof(struct fileIdentDesc) + + fibh->soffset, + crclen + sizeof(struct tag) - + sizeof(struct fileIdentDesc)); + } else { + crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, + -fibh->soffset - sizeof(struct fileIdentDesc)); + crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset); + } + + cfi->descTag.descCRC = cpu_to_le16(crc); + cfi->descTag.descCRCLength = cpu_to_le16(crclen); + cfi->descTag.tagChecksum = udf_tag_checksum(&cfi->descTag); + + if (adinicb || (sizeof(struct fileIdentDesc) <= -fibh->soffset)) { + memcpy((uint8_t *)sfi, (uint8_t *)cfi, + sizeof(struct fileIdentDesc)); + } else { + memcpy((uint8_t *)sfi, (uint8_t *)cfi, -fibh->soffset); + memcpy(fibh->ebh->b_data, (uint8_t *)cfi - fibh->soffset, + sizeof(struct fileIdentDesc) + fibh->soffset); + } + + if (adinicb) { + mark_inode_dirty(inode); + } else { + if (fibh->sbh != fibh->ebh) + mark_buffer_dirty_inode(fibh->ebh, inode); + mark_buffer_dirty_inode(fibh->sbh, inode); + } + return 0; +} + +static struct fileIdentDesc *udf_find_entry(struct inode *dir, + const struct qstr *child, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi) +{ + struct fileIdentDesc *fi = NULL; + loff_t f_pos; + int block, flen; + unsigned char *fname = NULL; + unsigned char *nameptr; + uint8_t lfi; + uint16_t liu; + loff_t size; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo = UDF_I(dir); + int isdotdot = child->len == 2 && + child->name[0] == '.' && child->name[1] == '.'; + struct super_block *sb = dir->i_sb; + + size = udf_ext0_offset(dir) + dir->i_size; + f_pos = udf_ext0_offset(dir); + + fibh->sbh = fibh->ebh = NULL; + fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1); + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos, + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) + goto out_err; + block = udf_get_lb_pblock(sb, &eloc, offset); + if ((++offset << sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh->sbh = fibh->ebh = udf_tread(sb, block); + if (!fibh->sbh) + goto out_err; + } + + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!fname) + goto out_err; + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, + &elen, &offset); + if (!fi) + goto out_err; + + liu = le16_to_cpu(cfi->lengthOfImpUse); + lfi = cfi->lengthFileIdent; + + if (fibh->sbh == fibh->ebh) { + nameptr = fi->fileIdent + liu; + } else { + int poffset; /* Unpaded ending offset */ + + poffset = fibh->soffset + sizeof(struct fileIdentDesc) + + liu + lfi; + + if (poffset >= lfi) + nameptr = (uint8_t *)(fibh->ebh->b_data + + poffset - lfi); + else { + nameptr = fname; + memcpy(nameptr, fi->fileIdent + liu, + lfi - poffset); + memcpy(nameptr + lfi - poffset, + fibh->ebh->b_data, poffset); + } + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) + continue; + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) + continue; + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && + isdotdot) + goto out_ok; + + if (!lfi) + continue; + + flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); + if (flen && udf_match(flen, fname, child->len, child->name)) + goto out_ok; + } + +out_err: + fi = NULL; + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); +out_ok: + brelse(epos.bh); + kfree(fname); + + return fi; +} + +static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + struct fileIdentDesc cfi; + struct udf_fileident_bh fibh; + + if (dentry->d_name.len > UDF_NAME_LEN - 2) + return ERR_PTR(-ENAMETOOLONG); + +#ifdef UDF_RECOVERY + /* temporary shorthand for specifying files by inode number */ + if (!strncmp(dentry->d_name.name, ".B=", 3)) { + struct kernel_lb_addr lb = { + .logicalBlockNum = 0, + .partitionReferenceNum = + simple_strtoul(dentry->d_name.name + 3, + NULL, 0), + }; + inode = udf_iget(dir->i_sb, lb); + if (IS_ERR(inode)) + return inode; + } else +#endif /* UDF_RECOVERY */ + + if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { + struct kernel_lb_addr loc; + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + + loc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(dir->i_sb, &loc); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + + return d_splice_alias(inode, dentry); +} + +static struct fileIdentDesc *udf_add_entry(struct inode *dir, + struct dentry *dentry, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi, int *err) +{ + struct super_block *sb = dir->i_sb; + struct fileIdentDesc *fi = NULL; + unsigned char *name = NULL; + int namelen; + loff_t f_pos; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + int nfidlen; + uint8_t lfi; + uint16_t liu; + int block; + struct kernel_lb_addr eloc; + uint32_t elen = 0; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo; + + fibh->sbh = fibh->ebh = NULL; + name = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!name) { + *err = -ENOMEM; + goto out_err; + } + + if (dentry) { + if (!dentry->d_name.len) { + *err = -EINVAL; + goto out_err; + } + namelen = udf_put_filename(sb, dentry->d_name.name, name, + dentry->d_name.len); + if (!namelen) { + *err = -ENAMETOOLONG; + goto out_err; + } + } else { + namelen = 0; + } + + nfidlen = (sizeof(struct fileIdentDesc) + namelen + 3) & ~3; + + f_pos = udf_ext0_offset(dir); + + fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + dinfo = UDF_I(dir); + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { + block = udf_get_lb_pblock(dir->i_sb, + &dinfo->i_location, 0); + fibh->soffset = fibh->eoffset = sb->s_blocksize; + goto add; + } + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->sbh) { + *err = -EIO; + goto out_err; + } + + block = dinfo->i_location.logicalBlockNum; + } + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, + &elen, &offset); + + if (!fi) { + *err = -EIO; + goto out_err; + } + + liu = le16_to_cpu(cfi->lengthOfImpUse); + lfi = cfi->lengthFileIdent; + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (((sizeof(struct fileIdentDesc) + + liu + lfi + 3) & ~3) == nfidlen) { + cfi->descTag.tagSerialNum = cpu_to_le16(1); + cfi->fileVersionNum = cpu_to_le16(1); + cfi->fileCharacteristics = 0; + cfi->lengthFileIdent = namelen; + cfi->lengthOfImpUse = cpu_to_le16(0); + if (!udf_write_fi(dir, cfi, fi, fibh, NULL, + name)) + goto out_ok; + else { + *err = -EIO; + goto out_err; + } + } + } + } + +add: + f_pos += nfidlen; + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && + sb->s_blocksize - fibh->eoffset < nfidlen) { + brelse(epos.bh); + epos.bh = NULL; + fibh->soffset -= udf_ext0_offset(dir); + fibh->eoffset -= udf_ext0_offset(dir); + f_pos -= udf_ext0_offset(dir); + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); + fibh->sbh = fibh->ebh = + udf_expand_dir_adinicb(dir, &block, err); + if (!fibh->sbh) + goto out_err; + epos.block = dinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(dir); + /* Load extent udf_expand_dir_adinicb() has created */ + udf_current_aext(dir, &epos, &eloc, &elen, 1); + } + + /* Entry fits into current block? */ + if (sb->s_blocksize - fibh->eoffset >= nfidlen) { + fibh->soffset = fibh->eoffset; + fibh->eoffset += nfidlen; + if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + block = dinfo->i_location.logicalBlockNum; + fi = (struct fileIdentDesc *) + (dinfo->i_ext.i_data + + fibh->soffset - + udf_ext0_offset(dir) + + dinfo->i_lenEAttr); + } else { + block = eloc.logicalBlockNum + + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + fi = (struct fileIdentDesc *) + (fibh->sbh->b_data + fibh->soffset); + } + } else { + /* Round up last extent in the file */ + elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize + - 1) & ~(sb->s_blocksize - 1); + + fibh->soffset = fibh->eoffset - sb->s_blocksize; + fibh->eoffset += nfidlen - sb->s_blocksize; + if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + block = eloc.logicalBlockNum + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + fibh->ebh = udf_bread(dir, + f_pos >> dir->i_sb->s_blocksize_bits, 1, err); + if (!fibh->ebh) + goto out_err; + /* Extents could have been merged, invalidate our position */ + brelse(epos.bh); + epos.bh = NULL; + epos.block = dinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(dir); + + if (!fibh->soffset) { + /* Find the freshly allocated block */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + (EXT_RECORDED_ALLOCATED >> 30)) + ; + block = eloc.logicalBlockNum + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + fi = (struct fileIdentDesc *)(fibh->sbh->b_data); + } else { + fi = (struct fileIdentDesc *) + (fibh->sbh->b_data + sb->s_blocksize + + fibh->soffset); + } + } + + memset(cfi, 0, sizeof(struct fileIdentDesc)); + if (UDF_SB(sb)->s_udfrev >= 0x0200) + udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, + sizeof(struct tag)); + else + udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, + sizeof(struct tag)); + cfi->fileVersionNum = cpu_to_le16(1); + cfi->lengthFileIdent = namelen; + cfi->lengthOfImpUse = cpu_to_le16(0); + if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) { + dir->i_size += nfidlen; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + dinfo->i_lenAlloc += nfidlen; + else { + /* Find the last extent and truncate it to proper size */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + (EXT_RECORDED_ALLOCATED >> 30)) + ; + elen -= dinfo->i_lenExtents - dir->i_size; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = dir->i_size; + } + + mark_inode_dirty(dir); + goto out_ok; + } else { + *err = -EIO; + goto out_err; + } + +out_err: + fi = NULL; + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); +out_ok: + brelse(epos.bh); + kfree(name); + return fi; +} + +static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi) +{ + cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) + memset(&(cfi->icb), 0x00, sizeof(struct long_ad)); + + return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); +} + +static int udf_add_nondir(struct dentry *dentry, struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + struct inode *dir = d_inode(dentry->d_parent); + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (unlikely(!fi)) { + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + return err; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + unlock_new_inode(inode); + d_instantiate(dentry, inode); + + return 0; +} + +static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode = udf_new_inode(dir, mode); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); + + return udf_add_nondir(dentry, inode); +} + +static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode = udf_new_inode(dir, mode); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; +} + +static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + struct inode *inode; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + inode = udf_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + return udf_add_nondir(dentry, inode); +} + +static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + struct udf_inode_info *dinfo = UDF_I(dir); + struct udf_inode_info *iinfo; + + inode = udf_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + iinfo = UDF_I(inode); + inode->i_op = &udf_dir_inode_operations; + inode->i_fop = &udf_dir_operations; + fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); + if (!fi) { + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + goto out; + } + set_nlink(inode, 2); + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL); + cfi.fileCharacteristics = + FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; + udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); + brelse(fibh.sbh); + mark_inode_dirty(inode); + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + clear_nlink(inode); + mark_inode_dirty(inode); + unlock_new_inode(inode); + iput(inode); + goto out; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); + cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + inc_nlink(dir); + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + unlock_new_inode(inode); + d_instantiate(dentry, inode); + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + err = 0; + +out: + return err; +} + +static int empty_dir(struct inode *dir) +{ + struct fileIdentDesc *fi, cfi; + struct udf_fileident_bh fibh; + loff_t f_pos; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + int block; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo = UDF_I(dir); + + f_pos = udf_ext0_offset(dir); + fibh.soffset = fibh.eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + fibh.sbh = fibh.ebh = NULL; + else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, + &epos, &eloc, &elen, &offset) == + (EXT_RECORDED_ALLOCATED >> 30)) { + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block); + if (!fibh.sbh) { + brelse(epos.bh); + return 0; + } + } else { + brelse(epos.bh); + return 0; + } + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &epos, &eloc, + &elen, &offset); + if (!fi) { + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + return 0; + } + + if (cfi.lengthFileIdent && + (cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) == 0) { + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + return 0; + } + } + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + + return 1; +} + +static int udf_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode = d_inode(dentry); + struct udf_fileident_bh fibh; + struct fileIdentDesc *fi, cfi; + struct kernel_lb_addr tloc; + + retval = -ENOENT; + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); + if (!fi) + goto out; + + retval = -EIO; + tloc = lelb_to_cpu(cfi.icb.extLocation); + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) + goto end_rmdir; + retval = -ENOTEMPTY; + if (!empty_dir(inode)) + goto end_rmdir; + retval = udf_delete_entry(dir, fi, &fibh, &cfi); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n", + inode->i_nlink); + clear_nlink(inode); + inode->i_size = 0; + inode_dec_link_count(dir); + inode->i_ctime = dir->i_ctime = dir->i_mtime = + current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + +end_rmdir: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + +out: + return retval; +} + +static int udf_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode = d_inode(dentry); + struct udf_fileident_bh fibh; + struct fileIdentDesc *fi; + struct fileIdentDesc cfi; + struct kernel_lb_addr tloc; + + retval = -ENOENT; + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); + if (!fi) + goto out; + + retval = -EIO; + tloc = lelb_to_cpu(cfi.icb.extLocation); + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + udf_debug("Deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + set_nlink(inode, 1); + } + retval = udf_delete_entry(dir, fi, &fibh, &cfi); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + inode_dec_link_count(inode); + inode->i_ctime = dir->i_ctime; + retval = 0; + +end_unlink: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + +out: + return retval; +} + +static int udf_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO); + struct pathComponent *pc; + const char *compstart; + struct extent_position epos = {}; + int eoffset, elen = 0; + uint8_t *ea; + int err; + int block; + unsigned char *name = NULL; + int namelen; + struct udf_inode_info *iinfo; + struct super_block *sb = dir->i_sb; + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + iinfo = UDF_I(inode); + down_write(&iinfo->i_data_sem); + name = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto out_no_entry; + } + + inode->i_data.a_ops = &udf_symlink_aops; + inode->i_op = &udf_symlink_inode_operations; + + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + struct kernel_lb_addr eloc; + uint32_t bsize; + + block = udf_new_block(sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, &err); + if (!block) + goto out_no_entry; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + epos.bh = NULL; + eloc.logicalBlockNum = block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + bsize = sb->s_blocksize; + iinfo->i_lenExtents = bsize; + udf_add_aext(inode, &epos, &eloc, bsize, 0); + brelse(epos.bh); + + block = udf_get_pblock(sb, block, + iinfo->i_location.partitionReferenceNum, + 0); + epos.bh = udf_tgetblk(sb, block); + lock_buffer(epos.bh); + memset(epos.bh->b_data, 0x00, bsize); + set_buffer_uptodate(epos.bh); + unlock_buffer(epos.bh); + mark_buffer_dirty_inode(epos.bh, inode); + ea = epos.bh->b_data + udf_ext0_offset(inode); + } else + ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + + eoffset = sb->s_blocksize - udf_ext0_offset(inode); + pc = (struct pathComponent *)ea; + + if (*symname == '/') { + do { + symname++; + } while (*symname == '/'); + + pc->componentType = 1; + pc->lengthComponentIdent = 0; + pc->componentFileVersionNum = 0; + elen += sizeof(struct pathComponent); + } + + err = -ENAMETOOLONG; + + while (*symname) { + if (elen + sizeof(struct pathComponent) > eoffset) + goto out_no_entry; + + pc = (struct pathComponent *)(ea + elen); + + compstart = symname; + + do { + symname++; + } while (*symname && *symname != '/'); + + pc->componentType = 5; + pc->lengthComponentIdent = 0; + pc->componentFileVersionNum = 0; + if (compstart[0] == '.') { + if ((symname - compstart) == 1) + pc->componentType = 4; + else if ((symname - compstart) == 2 && + compstart[1] == '.') + pc->componentType = 3; + } + + if (pc->componentType == 5) { + namelen = udf_put_filename(sb, compstart, name, + symname - compstart); + if (!namelen) + goto out_no_entry; + + if (elen + sizeof(struct pathComponent) + namelen > + eoffset) + goto out_no_entry; + else + pc->lengthComponentIdent = namelen; + + memcpy(pc->componentIdent, name, namelen); + } + + elen += sizeof(struct pathComponent) + pc->lengthComponentIdent; + + if (*symname) { + do { + symname++; + } while (*symname == '/'); + } + } + + brelse(epos.bh); + inode->i_size = elen; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + iinfo->i_lenAlloc = inode->i_size; + else + udf_truncate_tail_extent(inode); + mark_inode_dirty(inode); + up_write(&iinfo->i_data_sem); + + err = udf_add_nondir(dentry, inode); +out: + kfree(name); + return err; + +out_no_entry: + up_write(&iinfo->i_data_sem); + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + goto out; +} + +static int udf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + return err; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(lvid_get_unique_id(inode->i_sb)); + } + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + inc_nlink(inode); + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty(inode); + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + ihold(inode); + d_instantiate(dentry, inode); + + return 0; +} + +/* Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct udf_fileident_bh ofibh, nfibh; + struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; + struct fileIdentDesc ocfi, ncfi; + struct buffer_head *dir_bh = NULL; + int retval = -ENOENT; + struct kernel_lb_addr tloc; + struct udf_inode_info *old_iinfo = UDF_I(old_inode); + + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); + if (ofi) { + if (ofibh.sbh != ofibh.ebh) + brelse(ofibh.ebh); + brelse(ofibh.sbh); + } + tloc = lelb_to_cpu(ocfi.icb.extLocation); + if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) + != old_inode->i_ino) + goto end_rename; + + nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); + if (nfi) { + if (!new_inode) { + if (nfibh.sbh != nfibh.ebh) + brelse(nfibh.ebh); + brelse(nfibh.sbh); + nfi = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + int offset = udf_ext0_offset(old_inode); + + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir(new_inode)) + goto end_rename; + } + retval = -EIO; + if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + dir_fi = udf_get_fileident( + old_iinfo->i_ext.i_data - + (old_iinfo->i_efe ? + sizeof(struct extendedFileEntry) : + sizeof(struct fileEntry)), + old_inode->i_sb->s_blocksize, &offset); + } else { + dir_bh = udf_bread(old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + dir_fi = udf_get_fileident(dir_bh->b_data, + old_inode->i_sb->s_blocksize, &offset); + } + if (!dir_fi) + goto end_rename; + tloc = lelb_to_cpu(dir_fi->icb.extLocation); + if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != + old_dir->i_ino) + goto end_rename; + } + if (!nfi) { + nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi, + &retval); + if (!nfi) + goto end_rename; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = current_fs_time(old_inode->i_sb); + mark_inode_dirty(old_inode); + + /* + * ok, that's it + */ + ncfi.fileVersionNum = ocfi.fileVersionNum; + ncfi.fileCharacteristics = ocfi.fileCharacteristics; + memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad)); + udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); + + /* The old fid may have moved - find it again */ + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); + udf_delete_entry(old_dir, ofi, &ofibh, &ocfi); + + if (new_inode) { + new_inode->i_ctime = current_fs_time(new_inode->i_sb); + inode_dec_link_count(new_inode); + } + old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb); + new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb); + mark_inode_dirty(old_dir); + mark_inode_dirty(new_dir); + + if (dir_fi) { + dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); + udf_update_tag((char *)dir_fi, + (sizeof(struct fileIdentDesc) + + le16_to_cpu(dir_fi->lengthOfImpUse) + 3) & ~3); + if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(old_inode); + else + mark_buffer_dirty_inode(dir_bh, old_inode); + + inode_dec_link_count(old_dir); + if (new_inode) + inode_dec_link_count(new_inode); + else { + inc_nlink(new_dir); + mark_inode_dirty(new_dir); + } + } + + if (ofi) { + if (ofibh.sbh != ofibh.ebh) + brelse(ofibh.ebh); + brelse(ofibh.sbh); + } + + retval = 0; + +end_rename: + brelse(dir_bh); + if (nfi) { + if (nfibh.sbh != nfibh.ebh) + brelse(nfibh.ebh); + brelse(nfibh.sbh); + } + + return retval; +} + +static struct dentry *udf_get_parent(struct dentry *child) +{ + struct kernel_lb_addr tloc; + struct inode *inode = NULL; + struct qstr dotdot = QSTR_INIT("..", 2); + struct fileIdentDesc cfi; + struct udf_fileident_bh fibh; + + if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi)) + return ERR_PTR(-EACCES); + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + + tloc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(d_inode(child)->i_sb, &tloc); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(inode); +} + + +static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, + u16 partref, __u32 generation) +{ + struct inode *inode; + struct kernel_lb_addr loc; + + if (block == 0) + return ERR_PTR(-ESTALE); + + loc.logicalBlockNum = block; + loc.partitionReferenceNum = partref; + inode = udf_iget(sb, &loc); + + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +static struct dentry *udf_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_len != 3 && fh_len != 5) || + (fh_type != FILEID_UDF_WITH_PARENT && + fh_type != FILEID_UDF_WITHOUT_PARENT)) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.block, fid->udf.partref, + fid->udf.generation); +} + +static struct dentry *udf_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.parent_block, + fid->udf.parent_partref, + fid->udf.parent_generation); +} +static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp, + struct inode *parent) +{ + int len = *lenp; + struct kernel_lb_addr location = UDF_I(inode)->i_location; + struct fid *fid = (struct fid *)fh; + int type = FILEID_UDF_WITHOUT_PARENT; + + if (parent && (len < 5)) { + *lenp = 5; + return FILEID_INVALID; + } else if (len < 3) { + *lenp = 3; + return FILEID_INVALID; + } + + *lenp = 3; + fid->udf.block = location.logicalBlockNum; + fid->udf.partref = location.partitionReferenceNum; + fid->udf.parent_partref = 0; + fid->udf.generation = inode->i_generation; + + if (parent) { + location = UDF_I(parent)->i_location; + fid->udf.parent_block = location.logicalBlockNum; + fid->udf.parent_partref = location.partitionReferenceNum; + fid->udf.parent_generation = inode->i_generation; + *lenp = 5; + type = FILEID_UDF_WITH_PARENT; + } + + return type; +} + +const struct export_operations udf_export_ops = { + .encode_fh = udf_encode_fh, + .fh_to_dentry = udf_fh_to_dentry, + .fh_to_parent = udf_fh_to_parent, + .get_parent = udf_get_parent, +}; + +const struct inode_operations udf_dir_inode_operations = { + .lookup = udf_lookup, + .create = udf_create, + .link = udf_link, + .unlink = udf_unlink, + .symlink = udf_symlink, + .mkdir = udf_mkdir, + .rmdir = udf_rmdir, + .mknod = udf_mknod, + .rename = udf_rename, + .tmpfile = udf_tmpfile, +}; +const struct inode_operations udf_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; diff --git a/kernel/fs/udf/osta_udf.h b/kernel/fs/udf/osta_udf.h new file mode 100644 index 000000000..fbff74654 --- /dev/null +++ b/kernel/fs/udf/osta_udf.h @@ -0,0 +1,279 @@ +/* + * osta_udf.h + * + * This file is based on OSTA UDF(tm) 2.50 (April 30, 2003) + * http://www.osta.org + * + * Copyright (c) 2001-2004 Ben Fennema + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "ecma_167.h" + +#ifndef _OSTA_UDF_H +#define _OSTA_UDF_H 1 + +/* OSTA CS0 Charspec (UDF 2.50 2.1.2) */ +#define UDF_CHAR_SET_TYPE 0 +#define UDF_CHAR_SET_INFO "OSTA Compressed Unicode" + +/* Entity Identifier (UDF 2.50 2.1.5) */ +/* Identifiers (UDF 2.50 2.1.5.2) */ +#define UDF_ID_DEVELOPER "*Linux UDFFS" +#define UDF_ID_COMPLIANT "*OSTA UDF Compliant" +#define UDF_ID_LV_INFO "*UDF LV Info" +#define UDF_ID_FREE_EA "*UDF FreeEASpace" +#define UDF_ID_FREE_APP_EA "*UDF FreeAppEASpace" +#define UDF_ID_DVD_CGMS "*UDF DVD CGMS Info" +#define UDF_ID_OS2_EA "*UDF OS/2 EA" +#define UDF_ID_OS2_EA_LENGTH "*UDF OS/2 EALength" +#define UDF_ID_MAC_VOLUME "*UDF Mac VolumeInfo" +#define UDF_ID_MAC_FINDER "*UDF Mac FinderInfo" +#define UDF_ID_MAC_UNIQUE "*UDF Mac UniqueIDTable" +#define UDF_ID_MAC_RESOURCE "*UDF Mac ResourceFork" +#define UDF_ID_VIRTUAL "*UDF Virtual Partition" +#define UDF_ID_SPARABLE "*UDF Sparable Partition" +#define UDF_ID_ALLOC "*UDF Virtual Alloc Tbl" +#define UDF_ID_SPARING "*UDF Sparing Table" +#define UDF_ID_METADATA "*UDF Metadata Partition" + +/* Identifier Suffix (UDF 2.50 2.1.5.3) */ +#define IS_DF_HARD_WRITE_PROTECT 0x01 +#define IS_DF_SOFT_WRITE_PROTECT 0x02 + +struct UDFIdentSuffix { + __le16 UDFRevision; + uint8_t OSClass; + uint8_t OSIdentifier; + uint8_t reserved[4]; +} __attribute__ ((packed)); + +struct impIdentSuffix { + uint8_t OSClass; + uint8_t OSIdentifier; + uint8_t reserved[6]; +} __attribute__ ((packed)); + +struct appIdentSuffix { + uint8_t impUse[8]; +} __attribute__ ((packed)); + +/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ +/* Implementation Use (UDF 2.50 2.2.6.4) */ +struct logicalVolIntegrityDescImpUse { + struct regid impIdent; + __le32 numFiles; + __le32 numDirs; + __le16 minUDFReadRev; + __le16 minUDFWriteRev; + __le16 maxUDFWriteRev; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ +/* Implementation Use (UDF 2.50 2.2.7.2) */ +struct impUseVolDescImpUse { + struct charspec LVICharset; + dstring logicalVolIdent[128]; + dstring LVInfo1[36]; + dstring LVInfo2[36]; + dstring LVInfo3[36]; + struct regid impIdent; + uint8_t impUse[128]; +} __attribute__ ((packed)); + +struct udfPartitionMap2 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; +} __attribute__ ((packed)); + +/* Virtual Partition Map (UDF 2.50 2.2.8) */ +struct virtualPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + uint8_t reserved2[24]; +} __attribute__ ((packed)); + +/* Sparable Partition Map (UDF 2.50 2.2.9) */ +struct sparablePartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + __le16 packetLength; + uint8_t numSparingTables; + uint8_t reserved2[1]; + __le32 sizeSparingTable; + __le32 locSparingTable[4]; +} __attribute__ ((packed)); + +/* Metadata Partition Map (UDF 2.4.0 2.2.10) */ +struct metadataPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + __le32 metadataFileLoc; + __le32 metadataMirrorFileLoc; + __le32 metadataBitmapFileLoc; + __le32 allocUnitSize; + __le16 alignUnitSize; + uint8_t flags; + uint8_t reserved2[5]; +} __attribute__ ((packed)); + +/* Virtual Allocation Table (UDF 1.5 2.2.10) */ +struct virtualAllocationTable15 { + __le32 VirtualSector[0]; + struct regid vatIdent; + __le32 previousVATICBLoc; +} __attribute__ ((packed)); + +#define ICBTAG_FILE_TYPE_VAT15 0x00U + +/* Virtual Allocation Table (UDF 2.50 2.2.11) */ +struct virtualAllocationTable20 { + __le16 lengthHeader; + __le16 lengthImpUse; + dstring logicalVolIdent[128]; + __le32 previousVATICBLoc; + __le32 numFiles; + __le32 numDirs; + __le16 minReadRevision; + __le16 minWriteRevision; + __le16 maxWriteRevision; + __le16 reserved; + uint8_t impUse[0]; + __le32 vatEntry[0]; +} __attribute__ ((packed)); + +#define ICBTAG_FILE_TYPE_VAT20 0xF8U + +/* Sparing Table (UDF 2.50 2.2.12) */ +struct sparingEntry { + __le32 origLocation; + __le32 mappedLocation; +} __attribute__ ((packed)); + +struct sparingTable { + struct tag descTag; + struct regid sparingIdent; + __le16 reallocationTableLen; + __le16 reserved; + __le32 sequenceNum; + struct sparingEntry + mapEntry[0]; +} __attribute__ ((packed)); + +/* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */ +#define ICBTAG_FILE_TYPE_MAIN 0xFA +#define ICBTAG_FILE_TYPE_MIRROR 0xFB +#define ICBTAG_FILE_TYPE_BITMAP 0xFC + +/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ +struct allocDescImpUse { + __le16 flags; + uint8_t impUse[4]; +} __attribute__ ((packed)); + +#define AD_IU_EXT_ERASED 0x0001 + +/* Real-Time Files (UDF 2.50 6.11) */ +#define ICBTAG_FILE_TYPE_REALTIME 0xF9U + +/* Implementation Use Extended Attribute (UDF 2.50 3.3.4.5) */ +/* FreeEASpace (UDF 2.50 3.3.4.5.1.1) */ +struct freeEaSpace { + __le16 headerChecksum; + uint8_t freeEASpace[0]; +} __attribute__ ((packed)); + +/* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */ +struct DVDCopyrightImpUse { + __le16 headerChecksum; + uint8_t CGMSInfo; + uint8_t dataType; + uint8_t protectionSystemInfo[4]; +} __attribute__ ((packed)); + +/* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */ +/* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */ +struct freeAppEASpace { + __le16 headerChecksum; + uint8_t freeEASpace[0]; +} __attribute__ ((packed)); + +/* UDF Defined System Stream (UDF 2.50 3.3.7) */ +#define UDF_ID_UNIQUE_ID "*UDF Unique ID Mapping Data" +#define UDF_ID_NON_ALLOC "*UDF Non-Allocatable Space" +#define UDF_ID_POWER_CAL "*UDF Power Cal Table" +#define UDF_ID_BACKUP "*UDF Backup" + +/* Operating System Identifiers (UDF 2.50 6.3) */ +#define UDF_OS_CLASS_UNDEF 0x00U +#define UDF_OS_CLASS_DOS 0x01U +#define UDF_OS_CLASS_OS2 0x02U +#define UDF_OS_CLASS_MAC 0x03U +#define UDF_OS_CLASS_UNIX 0x04U +#define UDF_OS_CLASS_WIN9X 0x05U +#define UDF_OS_CLASS_WINNT 0x06U +#define UDF_OS_CLASS_OS400 0x07U +#define UDF_OS_CLASS_BEOS 0x08U +#define UDF_OS_CLASS_WINCE 0x09U + +#define UDF_OS_ID_UNDEF 0x00U +#define UDF_OS_ID_DOS 0x00U +#define UDF_OS_ID_OS2 0x00U +#define UDF_OS_ID_MAC 0x00U +#define UDF_OS_ID_MAX_OSX 0x01U +#define UDF_OS_ID_UNIX 0x00U +#define UDF_OS_ID_AIX 0x01U +#define UDF_OS_ID_SOLARIS 0x02U +#define UDF_OS_ID_HPUX 0x03U +#define UDF_OS_ID_IRIX 0x04U +#define UDF_OS_ID_LINUX 0x05U +#define UDF_OS_ID_MKLINUX 0x06U +#define UDF_OS_ID_FREEBSD 0x07U +#define UDF_OS_ID_WIN9X 0x00U +#define UDF_OS_ID_WINNT 0x00U +#define UDF_OS_ID_OS400 0x00U +#define UDF_OS_ID_BEOS 0x00U +#define UDF_OS_ID_WINCE 0x00U + +#endif /* _OSTA_UDF_H */ diff --git a/kernel/fs/udf/partition.c b/kernel/fs/udf/partition.c new file mode 100644 index 000000000..5f861ed28 --- /dev/null +++ b/kernel/fs/udf/partition.c @@ -0,0 +1,338 @@ +/* + * partition.c + * + * PURPOSE + * Partition handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * + * HISTORY + * + * 12/06/98 blf Created file. + * + */ + +#include "udfdecl.h" +#include "udf_sb.h" +#include "udf_i.h" + +#include +#include +#include + +uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + if (partition >= sbi->s_partitions) { + udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n", + block, partition, offset); + return 0xFFFFFFFF; + } + map = &sbi->s_partmaps[partition]; + if (map->s_partition_func) + return map->s_partition_func(sb, block, partition, offset); + else + return map->s_partition_root + block + offset; +} + +uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct buffer_head *bh = NULL; + uint32_t newblock; + uint32_t index; + uint32_t loc; + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_virtual_data *vdata; + struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode); + + map = &sbi->s_partmaps[partition]; + vdata = &map->s_type_specific.s_virtual; + + if (block > vdata->s_num_entries) { + udf_debug("Trying to access block beyond end of VAT (%d max %d)\n", + block, vdata->s_num_entries); + return 0xFFFFFFFF; + } + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data + + vdata->s_start_offset))[block]); + goto translate; + } + index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t); + if (block >= index) { + block -= index; + newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t))); + index = block % (sb->s_blocksize / sizeof(uint32_t)); + } else { + newblock = 0; + index = vdata->s_start_offset / sizeof(uint32_t) + block; + } + + loc = udf_block_map(sbi->s_vat_inode, newblock); + + bh = sb_bread(sb, loc); + if (!bh) { + udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%d,%d) VAT: %d[%d]\n", + sb, block, partition, loc, index); + return 0xFFFFFFFF; + } + + loc = le32_to_cpu(((__le32 *)bh->b_data)[index]); + + brelse(bh); + +translate: + if (iinfo->i_location.partitionReferenceNum == partition) { + udf_debug("recursive call to udf_get_pblock!\n"); + return 0xFFFFFFFF; + } + + return udf_get_pblock(sb, loc, + iinfo->i_location.partitionReferenceNum, + offset); +} + +inline uint32_t udf_get_pblock_virt20(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + return udf_get_pblock_virt15(sb, block, partition, offset); +} + +uint32_t udf_get_pblock_spar15(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + int i; + struct sparingTable *st = NULL; + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + uint32_t packet; + struct udf_sparing_data *sdata; + + map = &sbi->s_partmaps[partition]; + sdata = &map->s_type_specific.s_sparing; + packet = (block + offset) & ~(sdata->s_packet_len - 1); + + for (i = 0; i < 4; i++) { + if (sdata->s_spar_map[i] != NULL) { + st = (struct sparingTable *) + sdata->s_spar_map[i]->b_data; + break; + } + } + + if (st) { + for (i = 0; i < le16_to_cpu(st->reallocationTableLen); i++) { + struct sparingEntry *entry = &st->mapEntry[i]; + u32 origLoc = le32_to_cpu(entry->origLocation); + if (origLoc >= 0xFFFFFFF0) + break; + else if (origLoc == packet) + return le32_to_cpu(entry->mappedLocation) + + ((block + offset) & + (sdata->s_packet_len - 1)); + else if (origLoc > packet) + break; + } + } + + return map->s_partition_root + block + offset; +} + +int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) +{ + struct udf_sparing_data *sdata; + struct sparingTable *st = NULL; + struct sparingEntry mapEntry; + uint32_t packet; + int i, j, k, l; + struct udf_sb_info *sbi = UDF_SB(sb); + u16 reallocationTableLen; + struct buffer_head *bh; + int ret = 0; + + mutex_lock(&sbi->s_alloc_mutex); + for (i = 0; i < sbi->s_partitions; i++) { + struct udf_part_map *map = &sbi->s_partmaps[i]; + if (old_block > map->s_partition_root && + old_block < map->s_partition_root + map->s_partition_len) { + sdata = &map->s_type_specific.s_sparing; + packet = (old_block - map->s_partition_root) & + ~(sdata->s_packet_len - 1); + + for (j = 0; j < 4; j++) + if (sdata->s_spar_map[j] != NULL) { + st = (struct sparingTable *) + sdata->s_spar_map[j]->b_data; + break; + } + + if (!st) { + ret = 1; + goto out; + } + + reallocationTableLen = + le16_to_cpu(st->reallocationTableLen); + for (k = 0; k < reallocationTableLen; k++) { + struct sparingEntry *entry = &st->mapEntry[k]; + u32 origLoc = le32_to_cpu(entry->origLocation); + + if (origLoc == 0xFFFFFFFF) { + for (; j < 4; j++) { + int len; + bh = sdata->s_spar_map[j]; + if (!bh) + continue; + + st = (struct sparingTable *) + bh->b_data; + entry->origLocation = + cpu_to_le32(packet); + len = + sizeof(struct sparingTable) + + reallocationTableLen * + sizeof(struct sparingEntry); + udf_update_tag((char *)st, len); + mark_buffer_dirty(bh); + } + *new_block = le32_to_cpu( + entry->mappedLocation) + + ((old_block - + map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } else if (origLoc == packet) { + *new_block = le32_to_cpu( + entry->mappedLocation) + + ((old_block - + map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } else if (origLoc > packet) + break; + } + + for (l = k; l < reallocationTableLen; l++) { + struct sparingEntry *entry = &st->mapEntry[l]; + u32 origLoc = le32_to_cpu(entry->origLocation); + + if (origLoc != 0xFFFFFFFF) + continue; + + for (; j < 4; j++) { + bh = sdata->s_spar_map[j]; + if (!bh) + continue; + + st = (struct sparingTable *)bh->b_data; + mapEntry = st->mapEntry[l]; + mapEntry.origLocation = + cpu_to_le32(packet); + memmove(&st->mapEntry[k + 1], + &st->mapEntry[k], + (l - k) * + sizeof(struct sparingEntry)); + st->mapEntry[k] = mapEntry; + udf_update_tag((char *)st, + sizeof(struct sparingTable) + + reallocationTableLen * + sizeof(struct sparingEntry)); + mark_buffer_dirty(bh); + } + *new_block = + le32_to_cpu( + st->mapEntry[k].mappedLocation) + + ((old_block - map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } + + ret = 1; + goto out; + } /* if old_block */ + } + + if (i == sbi->s_partitions) { + /* outside of partitions */ + /* for now, fail =) */ + ret = 1; + } + +out: + mutex_unlock(&sbi->s_alloc_mutex); + return ret; +} + +static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct super_block *sb = inode->i_sb; + struct udf_part_map *map; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t ext_offset; + struct extent_position epos = {}; + uint32_t phyblock; + + if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) != + (EXT_RECORDED_ALLOCATED >> 30)) + phyblock = 0xFFFFFFFF; + else { + map = &UDF_SB(sb)->s_partmaps[partition]; + /* map to sparable/physical partition desc */ + phyblock = udf_get_pblock(sb, eloc.logicalBlockNum, + map->s_partition_num, ext_offset + offset); + } + + brelse(epos.bh); + return phyblock; +} + +uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_meta_data *mdata; + uint32_t retblk; + struct inode *inode; + + udf_debug("READING from METADATA\n"); + + map = &sbi->s_partmaps[partition]; + mdata = &map->s_type_specific.s_metadata; + inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe; + + /* We shouldn't mount such media... */ + BUG_ON(!inode); + retblk = udf_try_read_meta(inode, block, partition, offset); + if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) { + udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); + if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) { + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); + mdata->s_flags |= MF_MIRROR_FE_LOADED; + } + + inode = mdata->s_mirror_fe; + if (!inode) + return 0xFFFFFFFF; + retblk = udf_try_read_meta(inode, block, partition, offset); + } + + return retblk; +} diff --git a/kernel/fs/udf/super.c b/kernel/fs/udf/super.c new file mode 100644 index 000000000..6299f3419 --- /dev/null +++ b/kernel/fs/udf/super.c @@ -0,0 +1,2469 @@ +/* + * super.c + * + * PURPOSE + * Super block routines for the OSTA-UDF(tm) filesystem. + * + * DESCRIPTION + * OSTA-UDF(tm) = Optical Storage Technology Association + * Universal Disk Format. + * + * This code is based on version 2.00 of the UDF specification, + * and revision 3 of the ECMA 167 standard [equivalent to ISO 13346]. + * http://www.osta.org/ + * http://www.ecma.ch/ + * http://www.iso.org/ + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 2000 Stelias Computing Inc + * + * HISTORY + * + * 09/24/98 dgb changed to allow compiling outside of kernel, and + * added some debugging. + * 10/01/98 dgb updated to allow (some) possibility of compiling w/2.0.34 + * 10/16/98 attempting some multi-session support + * 10/17/98 added freespace count for "df" + * 11/11/98 gr added novrs option + * 11/26/98 dgb added fileset,anchor mount options + * 12/06/98 blf really hosed things royally. vat/sparing support. sequenced + * vol descs. rewrote option handling based on isofs + * 12/20/98 find the free space bitmap (if it exists) + */ + +#include "udfdecl.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "udf_sb.h" +#include "udf_i.h" + +#include +#include + +#define VDS_POS_PRIMARY_VOL_DESC 0 +#define VDS_POS_UNALLOC_SPACE_DESC 1 +#define VDS_POS_LOGICAL_VOL_DESC 2 +#define VDS_POS_PARTITION_DESC 3 +#define VDS_POS_IMP_USE_VOL_DESC 4 +#define VDS_POS_VOL_DESC_PTR 5 +#define VDS_POS_TERMINATING_DESC 6 +#define VDS_POS_LENGTH 7 + +#define UDF_DEFAULT_BLOCKSIZE 2048 + +#define VSD_FIRST_SECTOR_OFFSET 32768 +#define VSD_MAX_SECTOR_OFFSET 0x800000 + +enum { UDF_MAX_LINKS = 0xffff }; + +/* These are the "meat" - everything else is stuffing */ +static int udf_fill_super(struct super_block *, void *, int); +static void udf_put_super(struct super_block *); +static int udf_sync_fs(struct super_block *, int); +static int udf_remount_fs(struct super_block *, int *, char *); +static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad); +static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *, + struct kernel_lb_addr *); +static void udf_load_fileset(struct super_block *, struct buffer_head *, + struct kernel_lb_addr *); +static void udf_open_lvid(struct super_block *); +static void udf_close_lvid(struct super_block *); +static unsigned int udf_count_free(struct super_block *); +static int udf_statfs(struct dentry *, struct kstatfs *); +static int udf_show_options(struct seq_file *, struct dentry *); + +struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb) +{ + struct logicalVolIntegrityDesc *lvid; + unsigned int partnum; + unsigned int offset; + + if (!UDF_SB(sb)->s_lvid_bh) + return NULL; + lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data; + partnum = le32_to_cpu(lvid->numOfPartitions); + if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) - + offsetof(struct logicalVolIntegrityDesc, impUse)) / + (2 * sizeof(uint32_t)) < partnum) { + udf_err(sb, "Logical volume integrity descriptor corrupted " + "(numOfPartitions = %u)!\n", partnum); + return NULL; + } + /* The offset is to skip freeSpaceTable and sizeTable arrays */ + offset = partnum * 2 * sizeof(uint32_t); + return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]); +} + +/* UDF filesystem type */ +static struct dentry *udf_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super); +} + +static struct file_system_type udf_fstype = { + .owner = THIS_MODULE, + .name = "udf", + .mount = udf_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("udf"); + +static struct kmem_cache *udf_inode_cachep; + +static struct inode *udf_alloc_inode(struct super_block *sb) +{ + struct udf_inode_info *ei; + ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + + ei->i_unique = 0; + ei->i_lenExtents = 0; + ei->i_next_alloc_block = 0; + ei->i_next_alloc_goal = 0; + ei->i_strat4096 = 0; + init_rwsem(&ei->i_data_sem); + ei->cached_extent.lstart = -1; + spin_lock_init(&ei->i_extent_cache_lock); + + return &ei->vfs_inode; +} + +static void udf_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(udf_inode_cachep, UDF_I(inode)); +} + +static void udf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, udf_i_callback); +} + +static void init_once(void *foo) +{ + struct udf_inode_info *ei = (struct udf_inode_info *)foo; + + ei->i_ext.i_data = NULL; + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + udf_inode_cachep = kmem_cache_create("udf_inode_cache", + sizeof(struct udf_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + init_once); + if (!udf_inode_cachep) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(udf_inode_cachep); +} + +/* Superblock operations */ +static const struct super_operations udf_sb_ops = { + .alloc_inode = udf_alloc_inode, + .destroy_inode = udf_destroy_inode, + .write_inode = udf_write_inode, + .evict_inode = udf_evict_inode, + .put_super = udf_put_super, + .sync_fs = udf_sync_fs, + .statfs = udf_statfs, + .remount_fs = udf_remount_fs, + .show_options = udf_show_options, +}; + +struct udf_options { + unsigned char novrs; + unsigned int blocksize; + unsigned int session; + unsigned int lastblock; + unsigned int anchor; + unsigned int volume; + unsigned short partition; + unsigned int fileset; + unsigned int rootdir; + unsigned int flags; + umode_t umask; + kgid_t gid; + kuid_t uid; + umode_t fmode; + umode_t dmode; + struct nls_table *nls_map; +}; + +static int __init init_udf_fs(void) +{ + int err; + + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&udf_fstype); + if (err) + goto out; + + return 0; + +out: + destroy_inodecache(); + +out1: + return err; +} + +static void __exit exit_udf_fs(void) +{ + unregister_filesystem(&udf_fstype); + destroy_inodecache(); +} + +module_init(init_udf_fs) +module_exit(exit_udf_fs) + +static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + + sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), + GFP_KERNEL); + if (!sbi->s_partmaps) { + udf_err(sb, "Unable to allocate space for %d partition maps\n", + count); + sbi->s_partitions = 0; + return -ENOMEM; + } + + sbi->s_partitions = count; + return 0; +} + +static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) +{ + int i; + int nr_groups = bitmap->s_nr_groups; + int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * + nr_groups); + + for (i = 0; i < nr_groups; i++) + if (bitmap->s_block_bitmap[i]) + brelse(bitmap->s_block_bitmap[i]); + + if (size <= PAGE_SIZE) + kfree(bitmap); + else + vfree(bitmap); +} + +static void udf_free_partition(struct udf_part_map *map) +{ + int i; + struct udf_meta_data *mdata; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + iput(map->s_uspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + iput(map->s_fspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + udf_sb_free_bitmap(map->s_uspace.s_bitmap); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + udf_sb_free_bitmap(map->s_fspace.s_bitmap); + if (map->s_partition_type == UDF_SPARABLE_MAP15) + for (i = 0; i < 4; i++) + brelse(map->s_type_specific.s_sparing.s_spar_map[i]); + else if (map->s_partition_type == UDF_METADATA_MAP25) { + mdata = &map->s_type_specific.s_metadata; + iput(mdata->s_metadata_fe); + mdata->s_metadata_fe = NULL; + + iput(mdata->s_mirror_fe); + mdata->s_mirror_fe = NULL; + + iput(mdata->s_bitmap_fe); + mdata->s_bitmap_fe = NULL; + } +} + +static void udf_sb_free_partitions(struct super_block *sb) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int i; + if (sbi->s_partmaps == NULL) + return; + for (i = 0; i < sbi->s_partitions; i++) + udf_free_partition(&sbi->s_partmaps[i]); + kfree(sbi->s_partmaps); + sbi->s_partmaps = NULL; +} + +static int udf_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) + seq_puts(seq, ",nostrict"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET)) + seq_printf(seq, ",bs=%lu", sb->s_blocksize); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) + seq_puts(seq, ",unhide"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) + seq_puts(seq, ",undelete"); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB)) + seq_puts(seq, ",noadinicb"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_USE_SHORT_AD)) + seq_puts(seq, ",shortad"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET)) + seq_puts(seq, ",uid=forget"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_IGNORE)) + seq_puts(seq, ",uid=ignore"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET)) + seq_puts(seq, ",gid=forget"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE)) + seq_puts(seq, ",gid=ignore"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) + seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->s_uid)); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) + seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->s_gid)); + if (sbi->s_umask != 0) + seq_printf(seq, ",umask=%ho", sbi->s_umask); + if (sbi->s_fmode != UDF_INVALID_MODE) + seq_printf(seq, ",mode=%ho", sbi->s_fmode); + if (sbi->s_dmode != UDF_INVALID_MODE) + seq_printf(seq, ",dmode=%ho", sbi->s_dmode); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) + seq_printf(seq, ",session=%u", sbi->s_session); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) + seq_printf(seq, ",lastblock=%u", sbi->s_last_block); + if (sbi->s_anchor != 0) + seq_printf(seq, ",anchor=%u", sbi->s_anchor); + /* + * volume, partition, fileset and rootdir seem to be ignored + * currently + */ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) + seq_puts(seq, ",utf8"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map) + seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset); + + return 0; +} + +/* + * udf_parse_options + * + * PURPOSE + * Parse mount options. + * + * DESCRIPTION + * The following mount options are supported: + * + * gid= Set the default group. + * umask= Set the default umask. + * mode= Set the default file permissions. + * dmode= Set the default directory permissions. + * uid= Set the default user. + * bs= Set the block size. + * unhide Show otherwise hidden files. + * undelete Show deleted files in lists. + * adinicb Embed data in the inode (default) + * noadinicb Don't embed data in the inode + * shortad Use short ad's + * longad Use long ad's (default) + * nostrict Unset strict conformance + * iocharset= Set the NLS character set + * + * The remaining are for debugging and disaster recovery: + * + * novrs Skip volume sequence recognition + * + * The following expect a offset from 0. + * + * session= Set the CDROM session (default= last session) + * anchor= Override standard anchor location. (default= 256) + * volume= Override the VolumeDesc location. (unused) + * partition= Override the PartitionDesc location. (unused) + * lastblock= Set the last block of the filesystem/ + * + * The following expect a offset from the partition root. + * + * fileset= Override the fileset block location. (unused) + * rootdir= Override the root directory location. (unused) + * WARNING: overriding the rootdir to a non-directory may + * yield highly unpredictable results. + * + * PRE-CONDITIONS + * options Pointer to mount options string. + * uopts Pointer to mount options variable. + * + * POST-CONDITIONS + * 1 Mount options parsed okay. + * 0 Error parsing mount options. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. + */ + +enum { + Opt_novrs, Opt_nostrict, Opt_bs, Opt_unhide, Opt_undelete, + Opt_noadinicb, Opt_adinicb, Opt_shortad, Opt_longad, + Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, + Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, + Opt_rootdir, Opt_utf8, Opt_iocharset, + Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore, + Opt_fmode, Opt_dmode +}; + +static const match_table_t tokens = { + {Opt_novrs, "novrs"}, + {Opt_nostrict, "nostrict"}, + {Opt_bs, "bs=%u"}, + {Opt_unhide, "unhide"}, + {Opt_undelete, "undelete"}, + {Opt_noadinicb, "noadinicb"}, + {Opt_adinicb, "adinicb"}, + {Opt_shortad, "shortad"}, + {Opt_longad, "longad"}, + {Opt_uforget, "uid=forget"}, + {Opt_uignore, "uid=ignore"}, + {Opt_gforget, "gid=forget"}, + {Opt_gignore, "gid=ignore"}, + {Opt_gid, "gid=%u"}, + {Opt_uid, "uid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_session, "session=%u"}, + {Opt_lastblock, "lastblock=%u"}, + {Opt_anchor, "anchor=%u"}, + {Opt_volume, "volume=%u"}, + {Opt_partition, "partition=%u"}, + {Opt_fileset, "fileset=%u"}, + {Opt_rootdir, "rootdir=%u"}, + {Opt_utf8, "utf8"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_fmode, "mode=%o"}, + {Opt_dmode, "dmode=%o"}, + {Opt_err, NULL} +}; + +static int udf_parse_options(char *options, struct udf_options *uopt, + bool remount) +{ + char *p; + int option; + + uopt->novrs = 0; + uopt->partition = 0xFFFF; + uopt->session = 0xFFFFFFFF; + uopt->lastblock = 0; + uopt->anchor = 0; + uopt->volume = 0xFFFFFFFF; + uopt->rootdir = 0xFFFFFFFF; + uopt->fileset = 0xFFFFFFFF; + uopt->nls_map = NULL; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + unsigned n; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_novrs: + uopt->novrs = 1; + break; + case Opt_bs: + if (match_int(&args[0], &option)) + return 0; + n = option; + if (n != 512 && n != 1024 && n != 2048 && n != 4096) + return 0; + uopt->blocksize = n; + uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); + break; + case Opt_unhide: + uopt->flags |= (1 << UDF_FLAG_UNHIDE); + break; + case Opt_undelete: + uopt->flags |= (1 << UDF_FLAG_UNDELETE); + break; + case Opt_noadinicb: + uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB); + break; + case Opt_adinicb: + uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB); + break; + case Opt_shortad: + uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD); + break; + case Opt_longad: + uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD); + break; + case Opt_gid: + if (match_int(args, &option)) + return 0; + uopt->gid = make_kgid(current_user_ns(), option); + if (!gid_valid(uopt->gid)) + return 0; + uopt->flags |= (1 << UDF_FLAG_GID_SET); + break; + case Opt_uid: + if (match_int(args, &option)) + return 0; + uopt->uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uopt->uid)) + return 0; + uopt->flags |= (1 << UDF_FLAG_UID_SET); + break; + case Opt_umask: + if (match_octal(args, &option)) + return 0; + uopt->umask = option; + break; + case Opt_nostrict: + uopt->flags &= ~(1 << UDF_FLAG_STRICT); + break; + case Opt_session: + if (match_int(args, &option)) + return 0; + uopt->session = option; + if (!remount) + uopt->flags |= (1 << UDF_FLAG_SESSION_SET); + break; + case Opt_lastblock: + if (match_int(args, &option)) + return 0; + uopt->lastblock = option; + if (!remount) + uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET); + break; + case Opt_anchor: + if (match_int(args, &option)) + return 0; + uopt->anchor = option; + break; + case Opt_volume: + if (match_int(args, &option)) + return 0; + uopt->volume = option; + break; + case Opt_partition: + if (match_int(args, &option)) + return 0; + uopt->partition = option; + break; + case Opt_fileset: + if (match_int(args, &option)) + return 0; + uopt->fileset = option; + break; + case Opt_rootdir: + if (match_int(args, &option)) + return 0; + uopt->rootdir = option; + break; + case Opt_utf8: + uopt->flags |= (1 << UDF_FLAG_UTF8); + break; +#ifdef CONFIG_UDF_NLS + case Opt_iocharset: + uopt->nls_map = load_nls(args[0].from); + uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + break; +#endif + case Opt_uignore: + uopt->flags |= (1 << UDF_FLAG_UID_IGNORE); + break; + case Opt_uforget: + uopt->flags |= (1 << UDF_FLAG_UID_FORGET); + break; + case Opt_gignore: + uopt->flags |= (1 << UDF_FLAG_GID_IGNORE); + break; + case Opt_gforget: + uopt->flags |= (1 << UDF_FLAG_GID_FORGET); + break; + case Opt_fmode: + if (match_octal(args, &option)) + return 0; + uopt->fmode = option & 0777; + break; + case Opt_dmode: + if (match_octal(args, &option)) + return 0; + uopt->dmode = option & 0777; + break; + default: + pr_err("bad mount option \"%s\" or missing value\n", p); + return 0; + } + } + return 1; +} + +static int udf_remount_fs(struct super_block *sb, int *flags, char *options) +{ + struct udf_options uopt; + struct udf_sb_info *sbi = UDF_SB(sb); + int error = 0; + struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); + + sync_filesystem(sb); + if (lvidiu) { + int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); + if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) + return -EACCES; + } + + uopt.flags = sbi->s_flags; + uopt.uid = sbi->s_uid; + uopt.gid = sbi->s_gid; + uopt.umask = sbi->s_umask; + uopt.fmode = sbi->s_fmode; + uopt.dmode = sbi->s_dmode; + + if (!udf_parse_options(options, &uopt, true)) + return -EINVAL; + + write_lock(&sbi->s_cred_lock); + sbi->s_flags = uopt.flags; + sbi->s_uid = uopt.uid; + sbi->s_gid = uopt.gid; + sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; + write_unlock(&sbi->s_cred_lock); + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out_unlock; + + if (*flags & MS_RDONLY) + udf_close_lvid(sb); + else + udf_open_lvid(sb); + +out_unlock: + return error; +} + +/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ +/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ +static loff_t udf_check_vsd(struct super_block *sb) +{ + struct volStructDesc *vsd = NULL; + loff_t sector = VSD_FIRST_SECTOR_OFFSET; + int sectorsize; + struct buffer_head *bh = NULL; + int nsr02 = 0; + int nsr03 = 0; + struct udf_sb_info *sbi; + + sbi = UDF_SB(sb); + if (sb->s_blocksize < sizeof(struct volStructDesc)) + sectorsize = sizeof(struct volStructDesc); + else + sectorsize = sb->s_blocksize; + + sector += (sbi->s_session << sb->s_blocksize_bits); + + udf_debug("Starting at sector %u (%ld byte sectors)\n", + (unsigned int)(sector >> sb->s_blocksize_bits), + sb->s_blocksize); + /* Process the sequence (if applicable). The hard limit on the sector + * offset is arbitrary, hopefully large enough so that all valid UDF + * filesystems will be recognised. There is no mention of an upper + * bound to the size of the volume recognition area in the standard. + * The limit will prevent the code to read all the sectors of a + * specially crafted image (like a bluray disc full of CD001 sectors), + * potentially causing minutes or even hours of uninterruptible I/O + * activity. This actually happened with uninitialised SSD partitions + * (all 0xFF) before the check for the limit and all valid IDs were + * added */ + for (; !nsr02 && !nsr03 && sector < VSD_MAX_SECTOR_OFFSET; + sector += sectorsize) { + /* Read a block */ + bh = udf_tread(sb, sector >> sb->s_blocksize_bits); + if (!bh) + break; + + /* Look for ISO descriptors */ + vsd = (struct volStructDesc *)(bh->b_data + + (sector & (sb->s_blocksize - 1))); + + if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, + VSD_STD_ID_LEN)) { + switch (vsd->structType) { + case 0: + udf_debug("ISO9660 Boot Record found\n"); + break; + case 1: + udf_debug("ISO9660 Primary Volume Descriptor found\n"); + break; + case 2: + udf_debug("ISO9660 Supplementary Volume Descriptor found\n"); + break; + case 3: + udf_debug("ISO9660 Volume Partition Descriptor found\n"); + break; + case 255: + udf_debug("ISO9660 Volume Descriptor Set Terminator found\n"); + break; + default: + udf_debug("ISO9660 VRS (%u) found\n", + vsd->structType); + break; + } + } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BEA01, + VSD_STD_ID_LEN)) + ; /* nothing */ + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_TEA01, + VSD_STD_ID_LEN)) { + brelse(bh); + break; + } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR02, + VSD_STD_ID_LEN)) + nsr02 = sector; + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03, + VSD_STD_ID_LEN)) + nsr03 = sector; + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BOOT2, + VSD_STD_ID_LEN)) + ; /* nothing */ + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CDW02, + VSD_STD_ID_LEN)) + ; /* nothing */ + else { + /* invalid id : end of volume recognition area */ + brelse(bh); + break; + } + brelse(bh); + } + + if (nsr03) + return nsr03; + else if (nsr02) + return nsr02; + else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) == + VSD_FIRST_SECTOR_OFFSET) + return -1; + else + return 0; +} + +static int udf_find_fileset(struct super_block *sb, + struct kernel_lb_addr *fileset, + struct kernel_lb_addr *root) +{ + struct buffer_head *bh = NULL; + long lastblock; + uint16_t ident; + struct udf_sb_info *sbi; + + if (fileset->logicalBlockNum != 0xFFFFFFFF || + fileset->partitionReferenceNum != 0xFFFF) { + bh = udf_read_ptagged(sb, fileset, 0, &ident); + + if (!bh) { + return 1; + } else if (ident != TAG_IDENT_FSD) { + brelse(bh); + return 1; + } + + } + + sbi = UDF_SB(sb); + if (!bh) { + /* Search backwards through the partitions */ + struct kernel_lb_addr newfileset; + +/* --> cvg: FIXME - is it reasonable? */ + return 1; + + for (newfileset.partitionReferenceNum = sbi->s_partitions - 1; + (newfileset.partitionReferenceNum != 0xFFFF && + fileset->logicalBlockNum == 0xFFFFFFFF && + fileset->partitionReferenceNum == 0xFFFF); + newfileset.partitionReferenceNum--) { + lastblock = sbi->s_partmaps + [newfileset.partitionReferenceNum] + .s_partition_len; + newfileset.logicalBlockNum = 0; + + do { + bh = udf_read_ptagged(sb, &newfileset, 0, + &ident); + if (!bh) { + newfileset.logicalBlockNum++; + continue; + } + + switch (ident) { + case TAG_IDENT_SBD: + { + struct spaceBitmapDesc *sp; + sp = (struct spaceBitmapDesc *) + bh->b_data; + newfileset.logicalBlockNum += 1 + + ((le32_to_cpu(sp->numOfBytes) + + sizeof(struct spaceBitmapDesc) + - 1) >> sb->s_blocksize_bits); + brelse(bh); + break; + } + case TAG_IDENT_FSD: + *fileset = newfileset; + break; + default: + newfileset.logicalBlockNum++; + brelse(bh); + bh = NULL; + break; + } + } while (newfileset.logicalBlockNum < lastblock && + fileset->logicalBlockNum == 0xFFFFFFFF && + fileset->partitionReferenceNum == 0xFFFF); + } + } + + if ((fileset->logicalBlockNum != 0xFFFFFFFF || + fileset->partitionReferenceNum != 0xFFFF) && bh) { + udf_debug("Fileset at block=%d, partition=%d\n", + fileset->logicalBlockNum, + fileset->partitionReferenceNum); + + sbi->s_partition = fileset->partitionReferenceNum; + udf_load_fileset(sb, bh, root); + brelse(bh); + return 0; + } + return 1; +} + +/* + * Load primary Volume Descriptor Sequence + * + * Return <0 on error, 0 on success. -EAGAIN is special meaning next sequence + * should be tried. + */ +static int udf_load_pvoldesc(struct super_block *sb, sector_t block) +{ + struct primaryVolDesc *pvoldesc; + struct ustr *instr, *outstr; + struct buffer_head *bh; + uint16_t ident; + int ret = -ENOMEM; + + instr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!instr) + return -ENOMEM; + + outstr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!outstr) + goto out1; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) { + ret = -EAGAIN; + goto out2; + } + + if (ident != TAG_IDENT_PVD) { + ret = -EIO; + goto out_bh; + } + + pvoldesc = (struct primaryVolDesc *)bh->b_data; + + if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, + pvoldesc->recordingDateAndTime)) { +#ifdef UDFFS_DEBUG + struct timestamp *ts = &pvoldesc->recordingDateAndTime; + udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n", + le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, + ts->minute, le16_to_cpu(ts->typeAndTimezone)); +#endif + } + + if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) + if (udf_CS0toUTF8(outstr, instr)) { + strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, + outstr->u_len > 31 ? 31 : outstr->u_len); + udf_debug("volIdent[] = '%s'\n", + UDF_SB(sb)->s_volume_ident); + } + + if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) + if (udf_CS0toUTF8(outstr, instr)) + udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); + + ret = 0; +out_bh: + brelse(bh); +out2: + kfree(outstr); +out1: + kfree(instr); + return ret; +} + +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num) +{ + struct kernel_lb_addr addr; + struct inode *metadata_fe; + + addr.logicalBlockNum = meta_file_loc; + addr.partitionReferenceNum = partition_num; + + metadata_fe = udf_iget_special(sb, &addr); + + if (IS_ERR(metadata_fe)) { + udf_warn(sb, "metadata inode efe not found\n"); + return metadata_fe; + } + if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { + udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); + iput(metadata_fe); + return ERR_PTR(-EIO); + } + + return metadata_fe; +} + +static int udf_load_metadata_files(struct super_block *sb, int partition) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_meta_data *mdata; + struct kernel_lb_addr addr; + struct inode *fe; + + map = &sbi->s_partmaps[partition]; + mdata = &map->s_type_specific.s_metadata; + + /* metadata address */ + udf_debug("Metadata file location: block = %d part = %d\n", + mdata->s_meta_file_loc, map->s_partition_num); + + fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc, + map->s_partition_num); + if (IS_ERR(fe)) { + /* mirror file entry */ + udf_debug("Mirror metadata file location: block = %d part = %d\n", + mdata->s_mirror_file_loc, map->s_partition_num); + + fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, + map->s_partition_num); + + if (IS_ERR(fe)) { + udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); + return PTR_ERR(fe); + } + mdata->s_mirror_fe = fe; + } else + mdata->s_metadata_fe = fe; + + + /* + * bitmap file entry + * Note: + * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102) + */ + if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) { + addr.logicalBlockNum = mdata->s_bitmap_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Bitmap file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + fe = udf_iget_special(sb, &addr); + if (IS_ERR(fe)) { + if (sb->s_flags & MS_RDONLY) + udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); + else { + udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); + return PTR_ERR(fe); + } + } else + mdata->s_bitmap_fe = fe; + } + + udf_debug("udf_load_metadata_files Ok\n"); + return 0; +} + +static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, + struct kernel_lb_addr *root) +{ + struct fileSetDesc *fset; + + fset = (struct fileSetDesc *)bh->b_data; + + *root = lelb_to_cpu(fset->rootDirectoryICB.extLocation); + + UDF_SB(sb)->s_serial_number = le16_to_cpu(fset->descTag.tagSerialNum); + + udf_debug("Rootdir at block=%d, partition=%d\n", + root->logicalBlockNum, root->partitionReferenceNum); +} + +int udf_compute_nr_groups(struct super_block *sb, u32 partition) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + return DIV_ROUND_UP(map->s_partition_len + + (sizeof(struct spaceBitmapDesc) << 3), + sb->s_blocksize * 8); +} + +static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) +{ + struct udf_bitmap *bitmap; + int nr_groups; + int size; + + nr_groups = udf_compute_nr_groups(sb, index); + size = sizeof(struct udf_bitmap) + + (sizeof(struct buffer_head *) * nr_groups); + + if (size <= PAGE_SIZE) + bitmap = kzalloc(size, GFP_KERNEL); + else + bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ + + if (bitmap == NULL) + return NULL; + + bitmap->s_nr_groups = nr_groups; + return bitmap; +} + +static int udf_fill_partdesc_info(struct super_block *sb, + struct partitionDesc *p, int p_index) +{ + struct udf_part_map *map; + struct udf_sb_info *sbi = UDF_SB(sb); + struct partitionHeaderDesc *phd; + + map = &sbi->s_partmaps[p_index]; + + map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */ + map->s_partition_root = le32_to_cpu(p->partitionStartingLocation); + + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY)) + map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE)) + map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE)) + map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) + map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; + + udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n", + p_index, map->s_partition_type, + map->s_partition_root, map->s_partition_len); + + if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && + strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) + return 0; + + phd = (struct partitionHeaderDesc *)p->partitionContentsUse; + if (phd->unallocSpaceTable.extLength) { + struct kernel_lb_addr loc = { + .logicalBlockNum = le32_to_cpu( + phd->unallocSpaceTable.extPosition), + .partitionReferenceNum = p_index, + }; + struct inode *inode; + + inode = udf_iget_special(sb, &loc); + if (IS_ERR(inode)) { + udf_debug("cannot load unallocSpaceTable (part %d)\n", + p_index); + return PTR_ERR(inode); + } + map->s_uspace.s_table = inode; + map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; + udf_debug("unallocSpaceTable (part %d) @ %ld\n", + p_index, map->s_uspace.s_table->i_ino); + } + + if (phd->unallocSpaceBitmap.extLength) { + struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); + if (!bitmap) + return -ENOMEM; + map->s_uspace.s_bitmap = bitmap; + bitmap->s_extPosition = le32_to_cpu( + phd->unallocSpaceBitmap.extPosition); + map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; + udf_debug("unallocSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); + } + + if (phd->partitionIntegrityTable.extLength) + udf_debug("partitionIntegrityTable (part %d)\n", p_index); + + if (phd->freedSpaceTable.extLength) { + struct kernel_lb_addr loc = { + .logicalBlockNum = le32_to_cpu( + phd->freedSpaceTable.extPosition), + .partitionReferenceNum = p_index, + }; + struct inode *inode; + + inode = udf_iget_special(sb, &loc); + if (IS_ERR(inode)) { + udf_debug("cannot load freedSpaceTable (part %d)\n", + p_index); + return PTR_ERR(inode); + } + map->s_fspace.s_table = inode; + map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; + udf_debug("freedSpaceTable (part %d) @ %ld\n", + p_index, map->s_fspace.s_table->i_ino); + } + + if (phd->freedSpaceBitmap.extLength) { + struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); + if (!bitmap) + return -ENOMEM; + map->s_fspace.s_bitmap = bitmap; + bitmap->s_extPosition = le32_to_cpu( + phd->freedSpaceBitmap.extPosition); + map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; + udf_debug("freedSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); + } + return 0; +} + +static void udf_find_vat_block(struct super_block *sb, int p_index, + int type1_index, sector_t start_block) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; + sector_t vat_block; + struct kernel_lb_addr ino; + struct inode *inode; + + /* + * VAT file entry is in the last recorded block. Some broken disks have + * it a few blocks before so try a bit harder... + */ + ino.partitionReferenceNum = type1_index; + for (vat_block = start_block; + vat_block >= map->s_partition_root && + vat_block >= start_block - 3; vat_block--) { + ino.logicalBlockNum = vat_block - map->s_partition_root; + inode = udf_iget_special(sb, &ino); + if (!IS_ERR(inode)) { + sbi->s_vat_inode = inode; + break; + } + } +} + +static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; + struct buffer_head *bh = NULL; + struct udf_inode_info *vati; + uint32_t pos; + struct virtualAllocationTable20 *vat20; + sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); + if (!sbi->s_vat_inode && + sbi->s_last_block != blocks - 1) { + pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); + udf_find_vat_block(sb, p_index, type1_index, blocks - 1); + } + if (!sbi->s_vat_inode) + return -EIO; + + if (map->s_partition_type == UDF_VIRTUAL_MAP15) { + map->s_type_specific.s_virtual.s_start_offset = 0; + map->s_type_specific.s_virtual.s_num_entries = + (sbi->s_vat_inode->i_size - 36) >> 2; + } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { + vati = UDF_I(sbi->s_vat_inode); + if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + pos = udf_block_map(sbi->s_vat_inode, 0); + bh = sb_bread(sb, pos); + if (!bh) + return -EIO; + vat20 = (struct virtualAllocationTable20 *)bh->b_data; + } else { + vat20 = (struct virtualAllocationTable20 *) + vati->i_ext.i_data; + } + + map->s_type_specific.s_virtual.s_start_offset = + le16_to_cpu(vat20->lengthHeader); + map->s_type_specific.s_virtual.s_num_entries = + (sbi->s_vat_inode->i_size - + map->s_type_specific.s_virtual. + s_start_offset) >> 2; + brelse(bh); + } + return 0; +} + +/* + * Load partition descriptor block + * + * Returns <0 on error, 0 on success, -EAGAIN is special - try next descriptor + * sequence. + */ +static int udf_load_partdesc(struct super_block *sb, sector_t block) +{ + struct buffer_head *bh; + struct partitionDesc *p; + struct udf_part_map *map; + struct udf_sb_info *sbi = UDF_SB(sb); + int i, type1_idx; + uint16_t partitionNumber; + uint16_t ident; + int ret; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return -EAGAIN; + if (ident != TAG_IDENT_PD) { + ret = 0; + goto out_bh; + } + + p = (struct partitionDesc *)bh->b_data; + partitionNumber = le16_to_cpu(p->partitionNumber); + + /* First scan for TYPE1, SPARABLE and METADATA partitions */ + for (i = 0; i < sbi->s_partitions; i++) { + map = &sbi->s_partmaps[i]; + udf_debug("Searching map: (%d == %d)\n", + map->s_partition_num, partitionNumber); + if (map->s_partition_num == partitionNumber && + (map->s_partition_type == UDF_TYPE1_MAP15 || + map->s_partition_type == UDF_SPARABLE_MAP15)) + break; + } + + if (i >= sbi->s_partitions) { + udf_debug("Partition (%d) not found in partition map\n", + partitionNumber); + ret = 0; + goto out_bh; + } + + ret = udf_fill_partdesc_info(sb, p, i); + if (ret < 0) + goto out_bh; + + /* + * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and + * PHYSICAL partitions are already set up + */ + type1_idx = i; +#ifdef UDFFS_DEBUG + map = NULL; /* supress 'maybe used uninitialized' warning */ +#endif + for (i = 0; i < sbi->s_partitions; i++) { + map = &sbi->s_partmaps[i]; + + if (map->s_partition_num == partitionNumber && + (map->s_partition_type == UDF_VIRTUAL_MAP15 || + map->s_partition_type == UDF_VIRTUAL_MAP20 || + map->s_partition_type == UDF_METADATA_MAP25)) + break; + } + + if (i >= sbi->s_partitions) { + ret = 0; + goto out_bh; + } + + ret = udf_fill_partdesc_info(sb, p, i); + if (ret < 0) + goto out_bh; + + if (map->s_partition_type == UDF_METADATA_MAP25) { + ret = udf_load_metadata_files(sb, i); + if (ret < 0) { + udf_err(sb, "error loading MetaData partition map %d\n", + i); + goto out_bh; + } + } else { + /* + * If we have a partition with virtual map, we don't handle + * writing to it (we overwrite blocks instead of relocating + * them). + */ + if (!(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; + goto out_bh; + } + ret = udf_load_vat(sb, i, type1_idx); + if (ret < 0) + goto out_bh; + } + ret = 0; +out_bh: + /* In case loading failed, we handle cleanup in udf_fill_super */ + brelse(bh); + return ret; +} + +static int udf_load_sparable_map(struct super_block *sb, + struct udf_part_map *map, + struct sparablePartitionMap *spm) +{ + uint32_t loc; + uint16_t ident; + struct sparingTable *st; + struct udf_sparing_data *sdata = &map->s_type_specific.s_sparing; + int i; + struct buffer_head *bh; + + map->s_partition_type = UDF_SPARABLE_MAP15; + sdata->s_packet_len = le16_to_cpu(spm->packetLength); + if (!is_power_of_2(sdata->s_packet_len)) { + udf_err(sb, "error loading logical volume descriptor: " + "Invalid packet length %u\n", + (unsigned)sdata->s_packet_len); + return -EIO; + } + if (spm->numSparingTables > 4) { + udf_err(sb, "error loading logical volume descriptor: " + "Too many sparing tables (%d)\n", + (int)spm->numSparingTables); + return -EIO; + } + + for (i = 0; i < spm->numSparingTables; i++) { + loc = le32_to_cpu(spm->locSparingTable[i]); + bh = udf_read_tagged(sb, loc, loc, &ident); + if (!bh) + continue; + + st = (struct sparingTable *)bh->b_data; + if (ident != 0 || + strncmp(st->sparingIdent.ident, UDF_ID_SPARING, + strlen(UDF_ID_SPARING)) || + sizeof(*st) + le16_to_cpu(st->reallocationTableLen) > + sb->s_blocksize) { + brelse(bh); + continue; + } + + sdata->s_spar_map[i] = bh; + } + map->s_partition_func = udf_get_pblock_spar15; + return 0; +} + +static int udf_load_logicalvol(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) +{ + struct logicalVolDesc *lvd; + int i, offset; + uint8_t type; + struct udf_sb_info *sbi = UDF_SB(sb); + struct genericPartitionMap *gpm; + uint16_t ident; + struct buffer_head *bh; + unsigned int table_len; + int ret; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return -EAGAIN; + BUG_ON(ident != TAG_IDENT_LVD); + lvd = (struct logicalVolDesc *)bh->b_data; + table_len = le32_to_cpu(lvd->mapTableLength); + if (table_len > sb->s_blocksize - sizeof(*lvd)) { + udf_err(sb, "error loading logical volume descriptor: " + "Partition table too long (%u > %lu)\n", table_len, + sb->s_blocksize - sizeof(*lvd)); + ret = -EIO; + goto out_bh; + } + + ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); + if (ret) + goto out_bh; + + for (i = 0, offset = 0; + i < sbi->s_partitions && offset < table_len; + i++, offset += gpm->partitionMapLength) { + struct udf_part_map *map = &sbi->s_partmaps[i]; + gpm = (struct genericPartitionMap *) + &(lvd->partitionMaps[offset]); + type = gpm->partitionMapType; + if (type == 1) { + struct genericPartitionMap1 *gpm1 = + (struct genericPartitionMap1 *)gpm; + map->s_partition_type = UDF_TYPE1_MAP15; + map->s_volumeseqnum = le16_to_cpu(gpm1->volSeqNum); + map->s_partition_num = le16_to_cpu(gpm1->partitionNum); + map->s_partition_func = NULL; + } else if (type == 2) { + struct udfPartitionMap2 *upm2 = + (struct udfPartitionMap2 *)gpm; + if (!strncmp(upm2->partIdent.ident, UDF_ID_VIRTUAL, + strlen(UDF_ID_VIRTUAL))) { + u16 suf = + le16_to_cpu(((__le16 *)upm2->partIdent. + identSuffix)[0]); + if (suf < 0x0200) { + map->s_partition_type = + UDF_VIRTUAL_MAP15; + map->s_partition_func = + udf_get_pblock_virt15; + } else { + map->s_partition_type = + UDF_VIRTUAL_MAP20; + map->s_partition_func = + udf_get_pblock_virt20; + } + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_SPARABLE, + strlen(UDF_ID_SPARABLE))) { + ret = udf_load_sparable_map(sb, map, + (struct sparablePartitionMap *)gpm); + if (ret < 0) + goto out_bh; + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_METADATA, + strlen(UDF_ID_METADATA))) { + struct udf_meta_data *mdata = + &map->s_type_specific.s_metadata; + struct metadataPartitionMap *mdm = + (struct metadataPartitionMap *) + &(lvd->partitionMaps[offset]); + udf_debug("Parsing Logical vol part %d type %d id=%s\n", + i, type, UDF_ID_METADATA); + + map->s_partition_type = UDF_METADATA_MAP25; + map->s_partition_func = udf_get_pblock_meta25; + + mdata->s_meta_file_loc = + le32_to_cpu(mdm->metadataFileLoc); + mdata->s_mirror_file_loc = + le32_to_cpu(mdm->metadataMirrorFileLoc); + mdata->s_bitmap_file_loc = + le32_to_cpu(mdm->metadataBitmapFileLoc); + mdata->s_alloc_unit_size = + le32_to_cpu(mdm->allocUnitSize); + mdata->s_align_unit_size = + le16_to_cpu(mdm->alignUnitSize); + if (mdm->flags & 0x01) + mdata->s_flags |= MF_DUPLICATE_MD; + + udf_debug("Metadata Ident suffix=0x%x\n", + le16_to_cpu(*(__le16 *) + mdm->partIdent.identSuffix)); + udf_debug("Metadata part num=%d\n", + le16_to_cpu(mdm->partitionNum)); + udf_debug("Metadata part alloc unit size=%d\n", + le32_to_cpu(mdm->allocUnitSize)); + udf_debug("Metadata file loc=%d\n", + le32_to_cpu(mdm->metadataFileLoc)); + udf_debug("Mirror file loc=%d\n", + le32_to_cpu(mdm->metadataMirrorFileLoc)); + udf_debug("Bitmap file loc=%d\n", + le32_to_cpu(mdm->metadataBitmapFileLoc)); + udf_debug("Flags: %d %d\n", + mdata->s_flags, mdm->flags); + } else { + udf_debug("Unknown ident: %s\n", + upm2->partIdent.ident); + continue; + } + map->s_volumeseqnum = le16_to_cpu(upm2->volSeqNum); + map->s_partition_num = le16_to_cpu(upm2->partitionNum); + } + udf_debug("Partition (%d:%d) type %d on volume %d\n", + i, map->s_partition_num, type, map->s_volumeseqnum); + } + + if (fileset) { + struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); + + *fileset = lelb_to_cpu(la->extLocation); + udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n", + fileset->logicalBlockNum, + fileset->partitionReferenceNum); + } + if (lvd->integritySeqExt.extLength) + udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); + ret = 0; +out_bh: + brelse(bh); + return ret; +} + +/* + * udf_load_logicalvolint + * + */ +static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc) +{ + struct buffer_head *bh = NULL; + uint16_t ident; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + + while (loc.extLength > 0 && + (bh = udf_read_tagged(sb, loc.extLocation, + loc.extLocation, &ident)) && + ident == TAG_IDENT_LVID) { + sbi->s_lvid_bh = bh; + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + + if (lvid->nextIntegrityExt.extLength) + udf_load_logicalvolint(sb, + leea_to_cpu(lvid->nextIntegrityExt)); + + if (sbi->s_lvid_bh != bh) + brelse(bh); + loc.extLength -= sb->s_blocksize; + loc.extLocation++; + } + if (sbi->s_lvid_bh != bh) + brelse(bh); +} + +/* + * Process a main/reserve volume descriptor sequence. + * @block First block of first extent of the sequence. + * @lastblock Lastblock of first extent of the sequence. + * @fileset There we store extent containing root fileset + * + * Returns <0 on error, 0 on success. -EAGAIN is special - try next descriptor + * sequence + */ +static noinline int udf_process_sequence( + struct super_block *sb, + sector_t block, sector_t lastblock, + struct kernel_lb_addr *fileset) +{ + struct buffer_head *bh = NULL; + struct udf_vds_record vds[VDS_POS_LENGTH]; + struct udf_vds_record *curr; + struct generic_desc *gd; + struct volDescPtr *vdp; + bool done = false; + uint32_t vdsn; + uint16_t ident; + long next_s = 0, next_e = 0; + int ret; + + memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); + + /* + * Read the main descriptor sequence and find which descriptors + * are in it. + */ + for (; (!done && block <= lastblock); block++) { + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) { + udf_err(sb, + "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", + (unsigned long long)block); + return -EAGAIN; + } + + /* Process each descriptor (ISO 13346 3/8.3-8.4) */ + gd = (struct generic_desc *)bh->b_data; + vdsn = le32_to_cpu(gd->volDescSeqNum); + switch (ident) { + case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ + curr = &vds[VDS_POS_PRIMARY_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */ + curr = &vds[VDS_POS_VOL_DESC_PTR]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + + vdp = (struct volDescPtr *)bh->b_data; + next_s = le32_to_cpu( + vdp->nextVolDescSeqExt.extLocation); + next_e = le32_to_cpu( + vdp->nextVolDescSeqExt.extLength); + next_e = next_e >> sb->s_blocksize_bits; + next_e += next_s; + } + break; + case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ + curr = &vds[VDS_POS_IMP_USE_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ + curr = &vds[VDS_POS_PARTITION_DESC]; + if (!curr->block) + curr->block = block; + break; + case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ + curr = &vds[VDS_POS_LOGICAL_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ + curr = &vds[VDS_POS_UNALLOC_SPACE_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ + vds[VDS_POS_TERMINATING_DESC].block = block; + if (next_e) { + block = next_s; + lastblock = next_e; + next_s = next_e = 0; + } else + done = true; + break; + } + brelse(bh); + } + /* + * Now read interesting descriptors again and process them + * in a suitable order + */ + if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { + udf_err(sb, "Primary Volume Descriptor not found!\n"); + return -EAGAIN; + } + ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block); + if (ret < 0) + return ret; + + if (vds[VDS_POS_LOGICAL_VOL_DESC].block) { + ret = udf_load_logicalvol(sb, + vds[VDS_POS_LOGICAL_VOL_DESC].block, + fileset); + if (ret < 0) + return ret; + } + + if (vds[VDS_POS_PARTITION_DESC].block) { + /* + * We rescan the whole descriptor sequence to find + * partition descriptor blocks and process them. + */ + for (block = vds[VDS_POS_PARTITION_DESC].block; + block < vds[VDS_POS_TERMINATING_DESC].block; + block++) { + ret = udf_load_partdesc(sb, block); + if (ret < 0) + return ret; + } + } + + return 0; +} + +/* + * Load Volume Descriptor Sequence described by anchor in bh + * + * Returns <0 on error, 0 on success + */ +static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, + struct kernel_lb_addr *fileset) +{ + struct anchorVolDescPtr *anchor; + sector_t main_s, main_e, reserve_s, reserve_e; + int ret; + + anchor = (struct anchorVolDescPtr *)bh->b_data; + + /* Locate the main sequence */ + main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); + main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); + main_e = main_e >> sb->s_blocksize_bits; + main_e += main_s; + + /* Locate the reserve sequence */ + reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation); + reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength); + reserve_e = reserve_e >> sb->s_blocksize_bits; + reserve_e += reserve_s; + + /* Process the main & reserve sequences */ + /* responsible for finding the PartitionDesc(s) */ + ret = udf_process_sequence(sb, main_s, main_e, fileset); + if (ret != -EAGAIN) + return ret; + udf_sb_free_partitions(sb); + ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset); + if (ret < 0) { + udf_sb_free_partitions(sb); + /* No sequence was OK, return -EIO */ + if (ret == -EAGAIN) + ret = -EIO; + } + return ret; +} + +/* + * Check whether there is an anchor block in the given block and + * load Volume Descriptor Sequence if so. + * + * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor + * block + */ +static int udf_check_anchor_block(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) +{ + struct buffer_head *bh; + uint16_t ident; + int ret; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && + udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return -EAGAIN; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return -EAGAIN; + if (ident != TAG_IDENT_AVDP) { + brelse(bh); + return -EAGAIN; + } + ret = udf_load_sequence(sb, bh, fileset); + brelse(bh); + return ret; +} + +/* + * Search for an anchor volume descriptor pointer. + * + * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set + * of anchors. + */ +static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, + struct kernel_lb_addr *fileset) +{ + sector_t last[6]; + int i; + struct udf_sb_info *sbi = UDF_SB(sb); + int last_count = 0; + int ret; + + /* First try user provided anchor */ + if (sbi->s_anchor) { + ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset); + if (ret != -EAGAIN) + return ret; + } + /* + * according to spec, anchor is in either: + * block 256 + * lastblock-256 + * lastblock + * however, if the disc isn't closed, it could be 512. + */ + ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset); + if (ret != -EAGAIN) + return ret; + /* + * The trouble is which block is the last one. Drives often misreport + * this so we try various possibilities. + */ + last[last_count++] = *lastblock; + if (*lastblock >= 1) + last[last_count++] = *lastblock - 1; + last[last_count++] = *lastblock + 1; + if (*lastblock >= 2) + last[last_count++] = *lastblock - 2; + if (*lastblock >= 150) + last[last_count++] = *lastblock - 150; + if (*lastblock >= 152) + last[last_count++] = *lastblock - 152; + + for (i = 0; i < last_count; i++) { + if (last[i] >= sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits) + continue; + ret = udf_check_anchor_block(sb, last[i], fileset); + if (ret != -EAGAIN) { + if (!ret) + *lastblock = last[i]; + return ret; + } + if (last[i] < 256) + continue; + ret = udf_check_anchor_block(sb, last[i] - 256, fileset); + if (ret != -EAGAIN) { + if (!ret) + *lastblock = last[i]; + return ret; + } + } + + /* Finally try block 512 in case media is open */ + return udf_check_anchor_block(sb, sbi->s_session + 512, fileset); +} + +/* + * Find an anchor volume descriptor and load Volume Descriptor Sequence from + * area specified by it. The function expects sbi->s_lastblock to be the last + * block on the media. + * + * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor + * was not found. + */ +static int udf_find_anchor(struct super_block *sb, + struct kernel_lb_addr *fileset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + sector_t lastblock = sbi->s_last_block; + int ret; + + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret != -EAGAIN) + goto out; + + /* No anchor found? Try VARCONV conversion of block numbers */ + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + lastblock = udf_variable_to_fixed(sbi->s_last_block); + /* Firstly, we try to not convert number of the last block */ + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret != -EAGAIN) + goto out; + + lastblock = sbi->s_last_block; + /* Secondly, we try with converted number of the last block */ + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret < 0) { + /* VARCONV didn't help. Clear it. */ + UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); + } +out: + if (ret == 0) + sbi->s_last_block = lastblock; + return ret; +} + +/* + * Check Volume Structure Descriptor, find Anchor block and load Volume + * Descriptor Sequence. + * + * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor + * block was not found. + */ +static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, + int silent, struct kernel_lb_addr *fileset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + loff_t nsr_off; + int ret; + + if (!sb_set_blocksize(sb, uopt->blocksize)) { + if (!silent) + udf_warn(sb, "Bad block size\n"); + return -EINVAL; + } + sbi->s_last_block = uopt->lastblock; + if (!uopt->novrs) { + /* Check that it is NSR02 compliant */ + nsr_off = udf_check_vsd(sb); + if (!nsr_off) { + if (!silent) + udf_warn(sb, "No VRS found\n"); + return 0; + } + if (nsr_off == -1) + udf_debug("Failed to read sector at offset %d. " + "Assuming open disc. Skipping validity " + "check\n", VSD_FIRST_SECTOR_OFFSET); + if (!sbi->s_last_block) + sbi->s_last_block = udf_get_last_block(sb); + } else { + udf_debug("Validity check skipped because of novrs option\n"); + } + + /* Look for anchor block and load Volume Descriptor Sequence */ + sbi->s_anchor = uopt->anchor; + ret = udf_find_anchor(sb, fileset); + if (ret < 0) { + if (!silent && ret == -EAGAIN) + udf_warn(sb, "No anchor found\n"); + return ret; + } + return 0; +} + +static void udf_open_lvid(struct super_block *sb) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = sbi->s_lvid_bh; + struct logicalVolIntegrityDesc *lvid; + struct logicalVolIntegrityDescImpUse *lvidiu; + + if (!bh) + return; + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvidiu = udf_sb_lvidiu(sb); + if (!lvidiu) + return; + + mutex_lock(&sbi->s_alloc_mutex); + lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, + CURRENT_TIME); + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); + + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(struct tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); + /* Make opening of filesystem visible on the media immediately */ + sync_dirty_buffer(bh); +} + +static void udf_close_lvid(struct super_block *sb) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = sbi->s_lvid_bh; + struct logicalVolIntegrityDesc *lvid; + struct logicalVolIntegrityDescImpUse *lvidiu; + + if (!bh) + return; + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvidiu = udf_sb_lvidiu(sb); + if (!lvidiu) + return; + + mutex_lock(&sbi->s_alloc_mutex); + lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME); + if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) + lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); + if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) + lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); + if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) + lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); + + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(struct tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + /* + * We set buffer uptodate unconditionally here to avoid spurious + * warnings from mark_buffer_dirty() when previous EIO has marked + * the buffer as !uptodate + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); + /* Make closing of filesystem visible on the media immediately */ + sync_dirty_buffer(bh); +} + +u64 lvid_get_unique_id(struct super_block *sb) +{ + struct buffer_head *bh; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + struct logicalVolHeaderDesc *lvhd; + u64 uniqueID; + u64 ret; + + bh = sbi->s_lvid_bh; + if (!bh) + return 0; + + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse; + + mutex_lock(&sbi->s_alloc_mutex); + ret = uniqueID = le64_to_cpu(lvhd->uniqueID); + if (!(++uniqueID & 0xFFFFFFFF)) + uniqueID += 16; + lvhd->uniqueID = cpu_to_le64(uniqueID); + mutex_unlock(&sbi->s_alloc_mutex); + mark_buffer_dirty(bh); + + return ret; +} + +static int udf_fill_super(struct super_block *sb, void *options, int silent) +{ + int ret = -EINVAL; + struct inode *inode = NULL; + struct udf_options uopt; + struct kernel_lb_addr rootdir, fileset; + struct udf_sb_info *sbi; + + uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); + uopt.uid = INVALID_UID; + uopt.gid = INVALID_GID; + uopt.umask = 0; + uopt.fmode = UDF_INVALID_MODE; + uopt.dmode = UDF_INVALID_MODE; + + sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + mutex_init(&sbi->s_alloc_mutex); + + if (!udf_parse_options((char *)options, &uopt, false)) + goto parse_options_failure; + + if (uopt.flags & (1 << UDF_FLAG_UTF8) && + uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { + udf_err(sb, "utf8 cannot be combined with iocharset\n"); + goto parse_options_failure; + } +#ifdef CONFIG_UDF_NLS + if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { + uopt.nls_map = load_nls_default(); + if (!uopt.nls_map) + uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP); + else + udf_debug("Using default NLS map\n"); + } +#endif + if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) + uopt.flags |= (1 << UDF_FLAG_UTF8); + + fileset.logicalBlockNum = 0xFFFFFFFF; + fileset.partitionReferenceNum = 0xFFFF; + + sbi->s_flags = uopt.flags; + sbi->s_uid = uopt.uid; + sbi->s_gid = uopt.gid; + sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; + sbi->s_nls_map = uopt.nls_map; + rwlock_init(&sbi->s_cred_lock); + + if (uopt.session == 0xFFFFFFFF) + sbi->s_session = udf_get_last_session(sb); + else + sbi->s_session = uopt.session; + + udf_debug("Multi-session=%d\n", sbi->s_session); + + /* Fill in the rest of the superblock */ + sb->s_op = &udf_sb_ops; + sb->s_export_op = &udf_export_ops; + + sb->s_magic = UDF_SUPER_MAGIC; + sb->s_time_gran = 1000; + + if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } else { + uopt.blocksize = bdev_logical_block_size(sb->s_bdev); + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { + if (!silent) + pr_notice("Rescanning with blocksize %d\n", + UDF_DEFAULT_BLOCKSIZE); + brelse(sbi->s_lvid_bh); + sbi->s_lvid_bh = NULL; + uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } + } + if (ret < 0) { + if (ret == -EAGAIN) { + udf_warn(sb, "No partition found (1)\n"); + ret = -EINVAL; + } + goto error_out; + } + + udf_debug("Lastblock=%d\n", sbi->s_last_block); + + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDescImpUse *lvidiu = + udf_sb_lvidiu(sb); + uint16_t minUDFReadRev; + uint16_t minUDFWriteRev; + + if (!lvidiu) { + ret = -EINVAL; + goto error_out; + } + minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev); + minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev); + if (minUDFReadRev > UDF_MAX_READ_VERSION) { + udf_err(sb, "minUDFReadRev=%x (max is %x)\n", + minUDFReadRev, + UDF_MAX_READ_VERSION); + ret = -EINVAL; + goto error_out; + } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION && + !(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; + goto error_out; + } + + sbi->s_udfrev = minUDFWriteRev; + + if (minUDFReadRev >= UDF_VERS_USE_EXTENDED_FE) + UDF_SET_FLAG(sb, UDF_FLAG_USE_EXTENDED_FE); + if (minUDFReadRev >= UDF_VERS_USE_STREAMS) + UDF_SET_FLAG(sb, UDF_FLAG_USE_STREAMS); + } + + if (!sbi->s_partitions) { + udf_warn(sb, "No partition found (2)\n"); + ret = -EINVAL; + goto error_out; + } + + if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & + UDF_PART_FLAG_READ_ONLY && + !(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; + goto error_out; + } + + if (udf_find_fileset(sb, &fileset, &rootdir)) { + udf_warn(sb, "No fileset found\n"); + ret = -EINVAL; + goto error_out; + } + + if (!silent) { + struct timestamp ts; + udf_time_to_disk_stamp(&ts, sbi->s_record_time); + udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n", + sbi->s_volume_ident, + le16_to_cpu(ts.year), ts.month, ts.day, + ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); + } + if (!(sb->s_flags & MS_RDONLY)) + udf_open_lvid(sb); + + /* Assign the root inode */ + /* assign inodes by physical block number */ + /* perhaps it's not extensible enough, but for now ... */ + inode = udf_iget(sb, &rootdir); + if (IS_ERR(inode)) { + udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", + rootdir.logicalBlockNum, rootdir.partitionReferenceNum); + ret = PTR_ERR(inode); + goto error_out; + } + + /* Allocate a dentry for the root inode */ + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + udf_err(sb, "Couldn't allocate root dentry\n"); + ret = -ENOMEM; + goto error_out; + } + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_max_links = UDF_MAX_LINKS; + return 0; + +error_out: + iput(sbi->s_vat_inode); +parse_options_failure: +#ifdef CONFIG_UDF_NLS + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + unload_nls(sbi->s_nls_map); +#endif + if (!(sb->s_flags & MS_RDONLY)) + udf_close_lvid(sb); + brelse(sbi->s_lvid_bh); + udf_sb_free_partitions(sb); + kfree(sbi); + sb->s_fs_info = NULL; + + return ret; +} + +void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf); + + va_end(args); +} + +void _udf_warn(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf); + + va_end(args); +} + +static void udf_put_super(struct super_block *sb) +{ + struct udf_sb_info *sbi; + + sbi = UDF_SB(sb); + + iput(sbi->s_vat_inode); +#ifdef CONFIG_UDF_NLS + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + unload_nls(sbi->s_nls_map); +#endif + if (!(sb->s_flags & MS_RDONLY)) + udf_close_lvid(sb); + brelse(sbi->s_lvid_bh); + udf_sb_free_partitions(sb); + mutex_destroy(&sbi->s_alloc_mutex); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int udf_sync_fs(struct super_block *sb, int wait) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + + mutex_lock(&sbi->s_alloc_mutex); + if (sbi->s_lvid_dirty) { + /* + * Blockdevice will be synced later so we don't have to submit + * the buffer for IO + */ + mark_buffer_dirty(sbi->s_lvid_bh); + sbi->s_lvid_dirty = 0; + } + mutex_unlock(&sbi->s_alloc_mutex); + + return 0; +} + +static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDescImpUse *lvidiu; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + lvidiu = udf_sb_lvidiu(sb); + buf->f_type = UDF_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len; + buf->f_bfree = udf_count_free(sb); + buf->f_bavail = buf->f_bfree; + buf->f_files = (lvidiu != NULL ? (le32_to_cpu(lvidiu->numFiles) + + le32_to_cpu(lvidiu->numDirs)) : 0) + + buf->f_bfree; + buf->f_ffree = buf->f_bfree; + buf->f_namelen = UDF_NAME_LEN - 2; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static unsigned int udf_count_free_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap) +{ + struct buffer_head *bh = NULL; + unsigned int accum = 0; + int index; + int block = 0, newblock; + struct kernel_lb_addr loc; + uint32_t bytes; + uint8_t *ptr; + uint16_t ident; + struct spaceBitmapDesc *bm; + + loc.logicalBlockNum = bitmap->s_extPosition; + loc.partitionReferenceNum = UDF_SB(sb)->s_partition; + bh = udf_read_ptagged(sb, &loc, 0, &ident); + + if (!bh) { + udf_err(sb, "udf_count_free failed\n"); + goto out; + } else if (ident != TAG_IDENT_SBD) { + brelse(bh); + udf_err(sb, "udf_count_free failed\n"); + goto out; + } + + bm = (struct spaceBitmapDesc *)bh->b_data; + bytes = le32_to_cpu(bm->numOfBytes); + index = sizeof(struct spaceBitmapDesc); /* offset in first block only */ + ptr = (uint8_t *)bh->b_data; + + while (bytes > 0) { + u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index); + accum += bitmap_weight((const unsigned long *)(ptr + index), + cur_bytes * 8); + bytes -= cur_bytes; + if (bytes) { + brelse(bh); + newblock = udf_get_lb_pblock(sb, &loc, ++block); + bh = udf_tread(sb, newblock); + if (!bh) { + udf_debug("read failed\n"); + goto out; + } + index = 0; + ptr = (uint8_t *)bh->b_data; + } + } + brelse(bh); +out: + return accum; +} + +static unsigned int udf_count_free_table(struct super_block *sb, + struct inode *table) +{ + unsigned int accum = 0; + uint32_t elen; + struct kernel_lb_addr eloc; + int8_t etype; + struct extent_position epos; + + mutex_lock(&UDF_SB(sb)->s_alloc_mutex); + epos.block = UDF_I(table)->i_location; + epos.offset = sizeof(struct unallocSpaceEntry); + epos.bh = NULL; + + while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) + accum += (elen >> table->i_sb->s_blocksize_bits); + + brelse(epos.bh); + mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); + + return accum; +} + +static unsigned int udf_count_free(struct super_block *sb) +{ + unsigned int accum = 0; + struct udf_sb_info *sbi; + struct udf_part_map *map; + + sbi = UDF_SB(sb); + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDesc *lvid = + (struct logicalVolIntegrityDesc *) + sbi->s_lvid_bh->b_data; + if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) { + accum = le32_to_cpu( + lvid->freeSpaceTable[sbi->s_partition]); + if (accum == 0xFFFFFFFF) + accum = 0; + } + } + + if (accum) + return accum; + + map = &sbi->s_partmaps[sbi->s_partition]; + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { + accum += udf_count_free_bitmap(sb, + map->s_uspace.s_bitmap); + } + if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { + accum += udf_count_free_bitmap(sb, + map->s_fspace.s_bitmap); + } + if (accum) + return accum; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { + accum += udf_count_free_table(sb, + map->s_uspace.s_table); + } + if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { + accum += udf_count_free_table(sb, + map->s_fspace.s_table); + } + + return accum; +} diff --git a/kernel/fs/udf/symlink.c b/kernel/fs/udf/symlink.c new file mode 100644 index 000000000..8dfbc4025 --- /dev/null +++ b/kernel/fs/udf/symlink.c @@ -0,0 +1,159 @@ +/* + * symlink.c + * + * PURPOSE + * Symlink handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 04/16/99 blf Created. + * + */ + +#include "udfdecl.h" +#include +#include +#include +#include +#include +#include +#include +#include "udf_i.h" + +static int udf_pc_to_char(struct super_block *sb, unsigned char *from, + int fromlen, unsigned char *to, int tolen) +{ + struct pathComponent *pc; + int elen = 0; + int comp_len; + unsigned char *p = to; + + /* Reserve one byte for terminating \0 */ + tolen--; + while (elen < fromlen) { + pc = (struct pathComponent *)(from + elen); + elen += sizeof(struct pathComponent); + switch (pc->componentType) { + case 1: + /* + * Symlink points to some place which should be agreed + * upon between originator and receiver of the media. Ignore. + */ + if (pc->lengthComponentIdent > 0) { + elen += pc->lengthComponentIdent; + break; + } + /* Fall through */ + case 2: + if (tolen == 0) + return -ENAMETOOLONG; + p = to; + *p++ = '/'; + tolen--; + break; + case 3: + if (tolen < 3) + return -ENAMETOOLONG; + memcpy(p, "../", 3); + p += 3; + tolen -= 3; + break; + case 4: + if (tolen < 2) + return -ENAMETOOLONG; + memcpy(p, "./", 2); + p += 2; + tolen -= 2; + /* that would be . - just ignore */ + break; + case 5: + elen += pc->lengthComponentIdent; + if (elen > fromlen) + return -EIO; + comp_len = udf_get_filename(sb, pc->componentIdent, + pc->lengthComponentIdent, + p, tolen); + p += comp_len; + tolen -= comp_len; + if (tolen == 0) + return -ENAMETOOLONG; + *p++ = '/'; + tolen--; + break; + } + } + if (p > to + 1) + p[-1] = '\0'; + else + p[0] = '\0'; + return 0; +} + +static int udf_symlink_filler(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *bh = NULL; + unsigned char *symlink; + int err; + unsigned char *p = kmap(page); + struct udf_inode_info *iinfo; + uint32_t pos; + + /* We don't support symlinks longer than one block */ + if (inode->i_size > inode->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto out_unmap; + } + + iinfo = UDF_I(inode); + pos = udf_block_map(inode, 0); + + down_read(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + } else { + bh = sb_bread(inode->i_sb, pos); + + if (!bh) { + err = -EIO; + goto out_unlock_inode; + } + + symlink = bh->b_data; + } + + err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE); + brelse(bh); + if (err) + goto out_unlock_inode; + + up_read(&iinfo->i_data_sem); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +out_unlock_inode: + up_read(&iinfo->i_data_sem); + SetPageError(page); +out_unmap: + kunmap(page); + unlock_page(page); + return err; +} + +/* + * symlinks can't do much... + */ +const struct address_space_operations udf_symlink_aops = { + .readpage = udf_symlink_filler, +}; diff --git a/kernel/fs/udf/truncate.c b/kernel/fs/udf/truncate.c new file mode 100644 index 000000000..42b8c5779 --- /dev/null +++ b/kernel/fs/udf/truncate.c @@ -0,0 +1,286 @@ +/* + * truncate.c + * + * PURPOSE + * Truncate handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2004 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 02/24/99 blf Created. + * + */ + +#include "udfdecl.h" +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +static void extent_trunc(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, + uint32_t nelen) +{ + struct kernel_lb_addr neloc = {}; + int last_block = (elen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + + if (nelen) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, eloc, 0, + last_block); + etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); + } else + neloc = *eloc; + nelen = (etype << 30) | nelen; + } + + if (elen != nelen) { + udf_write_aext(inode, epos, &neloc, nelen, 0); + if (last_block - first_block > 0) { + if (etype == (EXT_RECORDED_ALLOCATED >> 30)) + mark_inode_dirty(inode); + + if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + udf_free_blocks(inode->i_sb, inode, eloc, + first_block, + last_block - first_block); + } + } +} + +/* + * Truncate the last extent to match i_size. This function assumes + * that preallocation extent is already truncated. + */ +void udf_truncate_tail_extent(struct inode *inode) +{ + struct extent_position epos = {}; + struct kernel_lb_addr eloc; + uint32_t elen, nelen; + uint64_t lbcount = 0; + int8_t etype = -1, netype; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || + inode->i_size == iinfo->i_lenExtents) + return; + /* Are we going to delete the file anyway? */ + if (inode->i_nlink == 0) + return; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + /* Find the last extent in the file */ + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + etype = netype; + lbcount += elen; + if (lbcount > inode->i_size) { + if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) + udf_warn(inode->i_sb, + "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n", + (unsigned)inode->i_ino, + (long long)inode->i_size, + (long long)lbcount, + (unsigned)eloc.logicalBlockNum, + (unsigned)elen); + nelen = elen - (lbcount - inode->i_size); + epos.offset -= adsize; + extent_trunc(inode, &epos, &eloc, etype, elen, nelen); + epos.offset += adsize; + if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) + udf_err(inode->i_sb, + "Extent after EOF in inode %u\n", + (unsigned)inode->i_ino); + break; + } + } + /* This inode entry is in-memory only and thus we don't have to mark + * the inode dirty */ + iinfo->i_lenExtents = inode->i_size; + brelse(epos.bh); +} + +void udf_discard_prealloc(struct inode *inode) +{ + struct extent_position epos = { NULL, 0, {0, 0} }; + struct kernel_lb_addr eloc; + uint32_t elen; + uint64_t lbcount = 0; + int8_t etype = -1, netype; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || + inode->i_size == iinfo->i_lenExtents) + return; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + adsize = 0; + + epos.block = iinfo->i_location; + + /* Find the last extent in the file */ + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + etype = netype; + lbcount += elen; + } + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + epos.offset -= adsize; + lbcount -= elen; + extent_trunc(inode, &epos, &eloc, etype, elen, 0); + if (!epos.bh) { + iinfo->i_lenAlloc = + epos.offset - + udf_file_entry_alloc_offset(inode); + mark_inode_dirty(inode); + } else { + struct allocExtDesc *aed = + (struct allocExtDesc *)(epos.bh->b_data); + aed->lengthAllocDescs = + cpu_to_le32(epos.offset - + sizeof(struct allocExtDesc)); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos.bh->b_data, epos.offset); + else + udf_update_tag(epos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos.bh, inode); + } + } + /* This inode entry is in-memory only and thus we don't have to mark + * the inode dirty */ + iinfo->i_lenExtents = lbcount; + brelse(epos.bh); +} + +static void udf_update_alloc_ext_desc(struct inode *inode, + struct extent_position *epos, + u32 lenalloc) +{ + struct super_block *sb = inode->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data); + int len = sizeof(struct allocExtDesc); + + aed->lengthAllocDescs = cpu_to_le32(lenalloc); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201) + len += lenalloc; + + udf_update_tag(epos->bh->b_data, len); + mark_buffer_dirty_inode(epos->bh, inode); +} + +/* + * Truncate extents of inode to inode->i_size. This function can be used only + * for making file shorter. For making file longer, udf_extend_file() has to + * be used. + */ +void udf_truncate_extents(struct inode *inode) +{ + struct extent_position epos; + struct kernel_lb_addr eloc, neloc = {}; + uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; + int8_t etype; + struct super_block *sb = inode->i_sb; + sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; + loff_t byte_offset; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + byte_offset = (offset << sb->s_blocksize_bits) + + (inode->i_size & (sb->s_blocksize - 1)); + if (etype == -1) { + /* We should extend the file? */ + WARN_ON(byte_offset); + return; + } + epos.offset -= adsize; + extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); + epos.offset += adsize; + if (byte_offset) + lenalloc = epos.offset; + else + lenalloc = epos.offset - adsize; + + if (!epos.bh) + lenalloc -= udf_file_entry_alloc_offset(inode); + else + lenalloc -= sizeof(struct allocExtDesc); + + while ((etype = udf_current_aext(inode, &epos, &eloc, + &elen, 0)) != -1) { + if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + udf_write_aext(inode, &epos, &neloc, nelen, 0); + if (indirect_ext_len) { + /* We managed to free all extents in the + * indirect extent - free it too */ + BUG_ON(!epos.bh); + udf_free_blocks(sb, NULL, &epos.block, + 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, + &epos, lenalloc); + brelse(epos.bh); + epos.offset = sizeof(struct allocExtDesc); + epos.block = eloc; + epos.bh = udf_tread(sb, + udf_get_lb_pblock(sb, &eloc, 0)); + if (elen) + indirect_ext_len = + (elen + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + else + indirect_ext_len = 1; + } else { + extent_trunc(inode, &epos, &eloc, etype, elen, 0); + epos.offset += adsize; + } + } + + if (indirect_ext_len) { + BUG_ON(!epos.bh); + udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, &epos, lenalloc); + iinfo->i_lenExtents = inode->i_size; + + brelse(epos.bh); +} diff --git a/kernel/fs/udf/udf_i.h b/kernel/fs/udf/udf_i.h new file mode 100644 index 000000000..b5cd8ed2a --- /dev/null +++ b/kernel/fs/udf/udf_i.h @@ -0,0 +1,62 @@ +#ifndef _UDF_I_H +#define _UDF_I_H + +struct extent_position { + struct buffer_head *bh; + uint32_t offset; + struct kernel_lb_addr block; +}; + +struct udf_ext_cache { + /* Extent position */ + struct extent_position epos; + /* Start logical offset in bytes */ + loff_t lstart; +}; + +/* + * The i_data_sem and i_mutex serve for protection of allocation information + * of a regular files and symlinks. This includes all extents belonging to + * the file/symlink, a fact whether data are in-inode or in external data + * blocks, preallocation, goal block information... When extents are read, + * i_mutex or i_data_sem must be held (for reading is enough in case of + * i_data_sem). When extents are changed, i_data_sem must be held for writing + * and also i_mutex must be held. + * + * For directories i_mutex is used for all the necessary protection. + */ + +struct udf_inode_info { + struct timespec i_crtime; + /* Physical address of inode */ + struct kernel_lb_addr i_location; + __u64 i_unique; + __u32 i_lenEAttr; + __u32 i_lenAlloc; + __u64 i_lenExtents; + __u32 i_next_alloc_block; + __u32 i_next_alloc_goal; + __u32 i_checkpoint; + unsigned i_alloc_type : 3; + unsigned i_efe : 1; /* extendedFileEntry */ + unsigned i_use : 1; /* unallocSpaceEntry */ + unsigned i_strat4096 : 1; + unsigned reserved : 26; + union { + struct short_ad *i_sad; + struct long_ad *i_lad; + __u8 *i_data; + } i_ext; + struct rw_semaphore i_data_sem; + struct udf_ext_cache cached_extent; + /* Spinlock for protecting extent cache */ + spinlock_t i_extent_cache_lock; + struct inode vfs_inode; +}; + +static inline struct udf_inode_info *UDF_I(struct inode *inode) +{ + return list_entry(inode, struct udf_inode_info, vfs_inode); +} + +#endif /* _UDF_I_H) */ diff --git a/kernel/fs/udf/udf_sb.h b/kernel/fs/udf/udf_sb.h new file mode 100644 index 000000000..1f32c7bd9 --- /dev/null +++ b/kernel/fs/udf/udf_sb.h @@ -0,0 +1,184 @@ +#ifndef __LINUX_UDF_SB_H +#define __LINUX_UDF_SB_H + +#include +#include + +/* Since UDF 2.01 is ISO 13346 based... */ +#define UDF_SUPER_MAGIC 0x15013346 + +#define UDF_MAX_READ_VERSION 0x0250 +#define UDF_MAX_WRITE_VERSION 0x0201 + +#define UDF_FLAG_USE_EXTENDED_FE 0 +#define UDF_VERS_USE_EXTENDED_FE 0x0200 +#define UDF_FLAG_USE_STREAMS 1 +#define UDF_VERS_USE_STREAMS 0x0200 +#define UDF_FLAG_USE_SHORT_AD 2 +#define UDF_FLAG_USE_AD_IN_ICB 3 +#define UDF_FLAG_USE_FILE_CTIME_EA 4 +#define UDF_FLAG_STRICT 5 +#define UDF_FLAG_UNDELETE 6 +#define UDF_FLAG_UNHIDE 7 +#define UDF_FLAG_VARCONV 8 +#define UDF_FLAG_NLS_MAP 9 +#define UDF_FLAG_UTF8 10 +#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ +#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */ +#define UDF_FLAG_GID_FORGET 13 +#define UDF_FLAG_GID_IGNORE 14 +#define UDF_FLAG_UID_SET 15 +#define UDF_FLAG_GID_SET 16 +#define UDF_FLAG_SESSION_SET 17 +#define UDF_FLAG_LASTBLOCK_SET 18 +#define UDF_FLAG_BLOCKSIZE_SET 19 + +#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 +#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 +#define UDF_PART_FLAG_FREED_BITMAP 0x0004 +#define UDF_PART_FLAG_FREED_TABLE 0x0008 +#define UDF_PART_FLAG_READ_ONLY 0x0010 +#define UDF_PART_FLAG_WRITE_ONCE 0x0020 +#define UDF_PART_FLAG_REWRITABLE 0x0040 +#define UDF_PART_FLAG_OVERWRITABLE 0x0080 + +#define UDF_MAX_BLOCK_LOADED 8 + +#define UDF_TYPE1_MAP15 0x1511U +#define UDF_VIRTUAL_MAP15 0x1512U +#define UDF_VIRTUAL_MAP20 0x2012U +#define UDF_SPARABLE_MAP15 0x1522U +#define UDF_METADATA_MAP25 0x2511U + +#define UDF_INVALID_MODE ((umode_t)-1) + +#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ + +#define MF_DUPLICATE_MD 0x01 +#define MF_MIRROR_FE_LOADED 0x02 + +struct udf_meta_data { + __u32 s_meta_file_loc; + __u32 s_mirror_file_loc; + __u32 s_bitmap_file_loc; + __u32 s_alloc_unit_size; + __u16 s_align_unit_size; + int s_flags; + struct inode *s_metadata_fe; + struct inode *s_mirror_fe; + struct inode *s_bitmap_fe; +}; + +struct udf_sparing_data { + __u16 s_packet_len; + struct buffer_head *s_spar_map[4]; +}; + +struct udf_virtual_data { + __u32 s_num_entries; + __u16 s_start_offset; +}; + +struct udf_bitmap { + __u32 s_extPosition; + int s_nr_groups; + struct buffer_head *s_block_bitmap[0]; +}; + +struct udf_part_map { + union { + struct udf_bitmap *s_bitmap; + struct inode *s_table; + } s_uspace; + union { + struct udf_bitmap *s_bitmap; + struct inode *s_table; + } s_fspace; + __u32 s_partition_root; + __u32 s_partition_len; + __u16 s_partition_type; + __u16 s_partition_num; + union { + struct udf_sparing_data s_sparing; + struct udf_virtual_data s_virtual; + struct udf_meta_data s_metadata; + } s_type_specific; + __u32 (*s_partition_func)(struct super_block *, __u32, __u16, __u32); + __u16 s_volumeseqnum; + __u16 s_partition_flags; +}; + +#pragma pack() + +struct udf_sb_info { + struct udf_part_map *s_partmaps; + __u8 s_volume_ident[32]; + + /* Overall info */ + __u16 s_partitions; + __u16 s_partition; + + /* Sector headers */ + __s32 s_session; + __u32 s_anchor; + __u32 s_last_block; + + struct buffer_head *s_lvid_bh; + + /* Default permissions */ + umode_t s_umask; + kgid_t s_gid; + kuid_t s_uid; + umode_t s_fmode; + umode_t s_dmode; + /* Lock protecting consistency of above permission settings */ + rwlock_t s_cred_lock; + + /* Root Info */ + struct timespec s_record_time; + + /* Fileset Info */ + __u16 s_serial_number; + + /* highest UDF revision we have recorded to this media */ + __u16 s_udfrev; + + /* Miscellaneous flags */ + unsigned long s_flags; + + /* Encoding info */ + struct nls_table *s_nls_map; + + /* VAT inode */ + struct inode *s_vat_inode; + + struct mutex s_alloc_mutex; + /* Protected by s_alloc_mutex */ + unsigned int s_lvid_dirty; +}; + +static inline struct udf_sb_info *UDF_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb); + +int udf_compute_nr_groups(struct super_block *sb, u32 partition); + +static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag) +{ + return test_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_SET_FLAG(struct super_block *sb, int flag) +{ + set_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag) +{ + clear_bit(flag, &UDF_SB(sb)->s_flags); +} + +#endif /* __LINUX_UDF_SB_H */ diff --git a/kernel/fs/udf/udfdecl.h b/kernel/fs/udf/udfdecl.h new file mode 100644 index 000000000..47bb3f5ca --- /dev/null +++ b/kernel/fs/udf/udfdecl.h @@ -0,0 +1,255 @@ +#ifndef __UDF_DECL_H +#define __UDF_DECL_H + +#define pr_fmt(fmt) "UDF-fs: " fmt + +#include "ecma_167.h" +#include "osta_udf.h" + +#include +#include +#include +#include + +#include "udf_sb.h" +#include "udfend.h" +#include "udf_i.h" + +#define UDF_PREALLOCATE +#define UDF_DEFAULT_PREALLOC_BLOCKS 8 + +extern __printf(3, 4) void _udf_err(struct super_block *sb, + const char *function, const char *fmt, ...); +#define udf_err(sb, fmt, ...) \ + _udf_err(sb, __func__, fmt, ##__VA_ARGS__) + +extern __printf(3, 4) void _udf_warn(struct super_block *sb, + const char *function, const char *fmt, ...); +#define udf_warn(sb, fmt, ...) \ + _udf_warn(sb, __func__, fmt, ##__VA_ARGS__) + +#define udf_info(fmt, ...) \ + pr_info("INFO " fmt, ##__VA_ARGS__) + +#undef UDFFS_DEBUG + +#ifdef UDFFS_DEBUG +#define udf_debug(fmt, ...) \ + printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) +#else +#define udf_debug(fmt, ...) \ + no_printk(fmt, ##__VA_ARGS__) +#endif + +#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) +#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) + +#define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF +#define UDF_EXTENT_FLAG_MASK 0xC0000000 + +#define UDF_NAME_PAD 4 +#define UDF_NAME_LEN 256 +#define UDF_PATH_LEN 1023 + +static inline size_t udf_file_entry_alloc_offset(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + if (iinfo->i_use) + return sizeof(struct unallocSpaceEntry); + else if (iinfo->i_efe) + return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr; + else + return sizeof(struct fileEntry) + iinfo->i_lenEAttr; +} + +static inline size_t udf_ext0_offset(struct inode *inode) +{ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return udf_file_entry_alloc_offset(inode); + else + return 0; +} + +/* computes tag checksum */ +u8 udf_tag_checksum(const struct tag *t); + +struct dentry; +struct inode; +struct task_struct; +struct buffer_head; +struct super_block; + +extern const struct export_operations udf_export_ops; +extern const struct inode_operations udf_dir_inode_operations; +extern const struct file_operations udf_dir_operations; +extern const struct inode_operations udf_file_inode_operations; +extern const struct file_operations udf_file_operations; +extern const struct inode_operations udf_symlink_inode_operations; +extern const struct address_space_operations udf_aops; +extern const struct address_space_operations udf_adinicb_aops; +extern const struct address_space_operations udf_symlink_aops; + +struct udf_fileident_bh { + struct buffer_head *sbh; + struct buffer_head *ebh; + int soffset; + int eoffset; +}; + +struct udf_vds_record { + uint32_t block; + uint32_t volDescSeqNum; +}; + +struct generic_desc { + struct tag descTag; + __le32 volDescSeqNum; +}; + +struct ustr { + uint8_t u_cmpID; + uint8_t u_name[UDF_NAME_LEN - 2]; + uint8_t u_len; +}; + + +/* super.c */ + +static inline void udf_updated_lvid(struct super_block *sb) +{ + struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; + + BUG_ON(!bh); + WARN_ON_ONCE(((struct logicalVolIntegrityDesc *) + bh->b_data)->integrityType != + cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN)); + UDF_SB(sb)->s_lvid_dirty = 1; +} +extern u64 lvid_get_unique_id(struct super_block *sb); +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num); + +/* namei.c */ +extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, + struct fileIdentDesc *, struct udf_fileident_bh *, + uint8_t *, uint8_t *); + +/* file.c */ +extern long udf_ioctl(struct file *, unsigned int, unsigned long); +/* inode.c */ +extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *, + bool hidden_inode); +static inline struct inode *udf_iget_special(struct super_block *sb, + struct kernel_lb_addr *ino) +{ + return __udf_iget(sb, ino, true); +} +static inline struct inode *udf_iget(struct super_block *sb, + struct kernel_lb_addr *ino) +{ + return __udf_iget(sb, ino, false); +} +extern int udf_expand_file_adinicb(struct inode *); +extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); +extern struct buffer_head *udf_bread(struct inode *, int, int, int *); +extern int udf_setsize(struct inode *, loff_t); +extern void udf_evict_inode(struct inode *); +extern int udf_write_inode(struct inode *, struct writeback_control *wbc); +extern long udf_block_map(struct inode *, sector_t); +extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, sector_t *); +extern int udf_add_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t, int); +extern void udf_write_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t, int); +extern int8_t udf_delete_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); +extern int8_t udf_next_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, int); +extern int8_t udf_current_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, int); + +/* misc.c */ +extern struct buffer_head *udf_tgetblk(struct super_block *, int); +extern struct buffer_head *udf_tread(struct super_block *, int); +extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, + uint32_t, uint8_t); +extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, + uint8_t); +extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, + uint32_t, uint16_t *); +extern struct buffer_head *udf_read_ptagged(struct super_block *, + struct kernel_lb_addr *, uint32_t, + uint16_t *); +extern void udf_update_tag(char *, int); +extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); + +/* lowlevel.c */ +extern unsigned int udf_get_last_session(struct super_block *); +extern unsigned long udf_get_last_block(struct super_block *); + +/* partition.c */ +extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_virt15(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern int udf_relocate_blocks(struct super_block *, long, long *); + +static inline uint32_t +udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc, + uint32_t offset) +{ + return udf_get_pblock(sb, loc->logicalBlockNum, + loc->partitionReferenceNum, offset); +} + +/* unicode.c */ +extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *, + int); +extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, + int); +extern int udf_build_ustr(struct ustr *, dstring *, int); +extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); + +/* ialloc.c */ +extern void udf_free_inode(struct inode *); +extern struct inode *udf_new_inode(struct inode *, umode_t); + +/* truncate.c */ +extern void udf_truncate_tail_extent(struct inode *); +extern void udf_discard_prealloc(struct inode *); +extern void udf_truncate_extents(struct inode *); + +/* balloc.c */ +extern void udf_free_blocks(struct super_block *, struct inode *, + struct kernel_lb_addr *, uint32_t, uint32_t); +extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, + uint32_t, uint32_t); +extern int udf_new_block(struct super_block *, struct inode *, uint16_t, + uint32_t, int *); + +/* directory.c */ +extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, + struct udf_fileident_bh *, + struct fileIdentDesc *, + struct extent_position *, + struct kernel_lb_addr *, uint32_t *, + sector_t *); +extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, + int *offset); +extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); +extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); + +/* udftime.c */ +extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, + struct timestamp src); +extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src); + +#endif /* __UDF_DECL_H */ diff --git a/kernel/fs/udf/udfend.h b/kernel/fs/udf/udfend.h new file mode 100644 index 000000000..6a9f3a9cc --- /dev/null +++ b/kernel/fs/udf/udfend.h @@ -0,0 +1,77 @@ +#ifndef __UDF_ENDIAN_H +#define __UDF_ENDIAN_H + +#include +#include + +static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in) +{ + struct kernel_lb_addr out; + + out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); + out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); + + return out; +} + +static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in) +{ + struct lb_addr out; + + out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); + out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); + + return out; +} + +static inline struct short_ad lesa_to_cpu(struct short_ad in) +{ + struct short_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extPosition = le32_to_cpu(in.extPosition); + + return out; +} + +static inline struct short_ad cpu_to_lesa(struct short_ad in) +{ + struct short_ad out; + + out.extLength = cpu_to_le32(in.extLength); + out.extPosition = cpu_to_le32(in.extPosition); + + return out; +} + +static inline struct kernel_long_ad lela_to_cpu(struct long_ad in) +{ + struct kernel_long_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extLocation = lelb_to_cpu(in.extLocation); + + return out; +} + +static inline struct long_ad cpu_to_lela(struct kernel_long_ad in) +{ + struct long_ad out; + + out.extLength = cpu_to_le32(in.extLength); + out.extLocation = cpu_to_lelb(in.extLocation); + + return out; +} + +static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in) +{ + struct kernel_extent_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extLocation = le32_to_cpu(in.extLocation); + + return out; +} + +#endif /* __UDF_ENDIAN_H */ diff --git a/kernel/fs/udf/udftime.c b/kernel/fs/udf/udftime.c new file mode 100644 index 000000000..77c331f1a --- /dev/null +++ b/kernel/fs/udf/udftime.c @@ -0,0 +1,170 @@ +/* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Paul Eggert (eggert@twinsun.com). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* + * dgb 10/02/98: ripped this from glibc source to help convert timestamps + * to unix time + * 10/04/98: added new table-based lookup after seeing how ugly + * the gnu code is + * blf 09/27/99: ripped out all the old code and inserted new table from + * John Brockmeyer (without leap second corrections) + * rewrote udf_stamp_to_time and fixed timezone accounting in + * udf_time_to_stamp. + */ + +/* + * We don't take into account leap seconds. This may be correct or incorrect. + * For more NIST information (especially dealing with leap seconds), see: + * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm + */ + +#include "udfdecl.h" + +#include +#include + +#define EPOCH_YEAR 1970 + +#ifndef __isleap +/* Nonzero if YEAR is a leap year (every 4 years, + except every 100th isn't, and every 400th is). */ +#define __isleap(year) \ + ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0)) +#endif + +/* How many days come before each month (0-12). */ +static const unsigned short int __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define MAX_YEAR_SECONDS 69 +#define SPD 0x15180 /*3600*24 */ +#define SPY(y, l, s) (SPD * (365 * y + l) + s) + +static time_t year_seconds[MAX_YEAR_SECONDS] = { +/*1970*/ SPY(0, 0, 0), SPY(1, 0, 0), SPY(2, 0, 0), SPY(3, 1, 0), +/*1974*/ SPY(4, 1, 0), SPY(5, 1, 0), SPY(6, 1, 0), SPY(7, 2, 0), +/*1978*/ SPY(8, 2, 0), SPY(9, 2, 0), SPY(10, 2, 0), SPY(11, 3, 0), +/*1982*/ SPY(12, 3, 0), SPY(13, 3, 0), SPY(14, 3, 0), SPY(15, 4, 0), +/*1986*/ SPY(16, 4, 0), SPY(17, 4, 0), SPY(18, 4, 0), SPY(19, 5, 0), +/*1990*/ SPY(20, 5, 0), SPY(21, 5, 0), SPY(22, 5, 0), SPY(23, 6, 0), +/*1994*/ SPY(24, 6, 0), SPY(25, 6, 0), SPY(26, 6, 0), SPY(27, 7, 0), +/*1998*/ SPY(28, 7, 0), SPY(29, 7, 0), SPY(30, 7, 0), SPY(31, 8, 0), +/*2002*/ SPY(32, 8, 0), SPY(33, 8, 0), SPY(34, 8, 0), SPY(35, 9, 0), +/*2006*/ SPY(36, 9, 0), SPY(37, 9, 0), SPY(38, 9, 0), SPY(39, 10, 0), +/*2010*/ SPY(40, 10, 0), SPY(41, 10, 0), SPY(42, 10, 0), SPY(43, 11, 0), +/*2014*/ SPY(44, 11, 0), SPY(45, 11, 0), SPY(46, 11, 0), SPY(47, 12, 0), +/*2018*/ SPY(48, 12, 0), SPY(49, 12, 0), SPY(50, 12, 0), SPY(51, 13, 0), +/*2022*/ SPY(52, 13, 0), SPY(53, 13, 0), SPY(54, 13, 0), SPY(55, 14, 0), +/*2026*/ SPY(56, 14, 0), SPY(57, 14, 0), SPY(58, 14, 0), SPY(59, 15, 0), +/*2030*/ SPY(60, 15, 0), SPY(61, 15, 0), SPY(62, 15, 0), SPY(63, 16, 0), +/*2034*/ SPY(64, 16, 0), SPY(65, 16, 0), SPY(66, 16, 0), SPY(67, 17, 0), +/*2038*/ SPY(68, 17, 0) +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +struct timespec * +udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src) +{ + int yday; + u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); + u16 year = le16_to_cpu(src.year); + uint8_t type = typeAndTimezone >> 12; + int16_t offset; + + if (type == 1) { + offset = typeAndTimezone << 4; + /* sign extent offset */ + offset = (offset >> 4); + if (offset == -2047) /* unspecified offset */ + offset = 0; + } else + offset = 0; + + if ((year < EPOCH_YEAR) || + (year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) { + return NULL; + } + dest->tv_sec = year_seconds[year - EPOCH_YEAR]; + dest->tv_sec -= offset * 60; + + yday = ((__mon_yday[__isleap(year)][src.month - 1]) + src.day - 1); + dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second; + dest->tv_nsec = 1000 * (src.centiseconds * 10000 + + src.hundredsOfMicroseconds * 100 + src.microseconds); + return dest; +} + +struct timestamp * +udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts) +{ + long int days, rem, y; + const unsigned short int *ip; + int16_t offset; + + offset = -sys_tz.tz_minuteswest; + + if (!dest) + return NULL; + + dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF)); + + ts.tv_sec += offset * 60; + days = ts.tv_sec / SECS_PER_DAY; + rem = ts.tv_sec % SECS_PER_DAY; + dest->hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + dest->minute = rem / 60; + dest->second = rem % 60; + y = 1970; + +#define DIV(a, b) ((a) / (b) - ((a) % (b) < 0)) +#define LEAPS_THRU_END_OF(y) (DIV (y, 4) - DIV (y, 100) + DIV (y, 400)) + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + long int yg = y + days / 365 - (days % 365 < 0); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= ((yg - y) * 365 + + LEAPS_THRU_END_OF(yg - 1) + - LEAPS_THRU_END_OF(y - 1)); + y = yg; + } + dest->year = cpu_to_le16(y); + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < (long int)ip[y]; --y) + continue; + days -= ip[y]; + dest->month = y + 1; + dest->day = days + 1; + + dest->centiseconds = ts.tv_nsec / 10000000; + dest->hundredsOfMicroseconds = (ts.tv_nsec / 1000 - + dest->centiseconds * 10000) / 100; + dest->microseconds = (ts.tv_nsec / 1000 - dest->centiseconds * 10000 - + dest->hundredsOfMicroseconds * 100); + return dest; +} + +/* EOF */ diff --git a/kernel/fs/udf/unicode.c b/kernel/fs/udf/unicode.c new file mode 100644 index 000000000..b84fee372 --- /dev/null +++ b/kernel/fs/udf/unicode.c @@ -0,0 +1,496 @@ +/* + * unicode.c + * + * PURPOSE + * Routines for converting between UTF-8 and OSTA Compressed Unicode. + * Also handles filename mangling + * + * DESCRIPTION + * OSTA Compressed Unicode is explained in the OSTA UDF specification. + * http://www.osta.org/ + * UTF-8 is explained in the IETF RFC XXXX. + * ftp://ftp.internic.net/rfc/rfcxxxx.txt + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + */ + +#include "udfdecl.h" + +#include +#include /* for memset */ +#include +#include +#include + +#include "udf_sb.h" + +static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *, + int); + +static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) +{ + if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2)) + return 0; + + memset(dest, 0, sizeof(struct ustr)); + memcpy(dest->u_name, src, strlen); + dest->u_cmpID = 0x08; + dest->u_len = strlen; + + return strlen; +} + +/* + * udf_build_ustr + */ +int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) +{ + int usesize; + + if (!dest || !ptr || !size) + return -1; + BUG_ON(size < 2); + + usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name)); + usesize = min(usesize, size - 2); + dest->u_cmpID = ptr[0]; + dest->u_len = usesize; + memcpy(dest->u_name, ptr + 1, usesize); + memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize); + + return 0; +} + +/* + * udf_build_ustr_exact + */ +static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) +{ + if ((!dest) || (!ptr) || (!exactsize)) + return -1; + + memset(dest, 0, sizeof(struct ustr)); + dest->u_cmpID = ptr[0]; + dest->u_len = exactsize - 1; + memcpy(dest->u_name, ptr + 1, exactsize - 1); + + return 0; +} + +/* + * udf_ocu_to_utf8 + * + * PURPOSE + * Convert OSTA Compressed Unicode to the UTF-8 equivalent. + * + * PRE-CONDITIONS + * utf Pointer to UTF-8 output buffer. + * ocu Pointer to OSTA Compressed Unicode input buffer + * of size UDF_NAME_LEN bytes. + * both of type "struct ustr *" + * + * POST-CONDITIONS + * Zero on success. + * + * HISTORY + * November 12, 1997 - Andrew E. Mileski + * Written, tested, and released. + */ +int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) +{ + const uint8_t *ocu; + uint8_t cmp_id, ocu_len; + int i; + + ocu_len = ocu_i->u_len; + if (ocu_len == 0) { + memset(utf_o, 0, sizeof(struct ustr)); + return 0; + } + + cmp_id = ocu_i->u_cmpID; + if (cmp_id != 8 && cmp_id != 16) { + memset(utf_o, 0, sizeof(struct ustr)); + pr_err("unknown compression code (%d) stri=%s\n", + cmp_id, ocu_i->u_name); + return 0; + } + + ocu = ocu_i->u_name; + utf_o->u_len = 0; + for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { + + /* Expand OSTA compressed Unicode to Unicode */ + uint32_t c = ocu[i++]; + if (cmp_id == 16) + c = (c << 8) | ocu[i++]; + + /* Compress Unicode to UTF-8 */ + if (c < 0x80U) + utf_o->u_name[utf_o->u_len++] = (uint8_t)c; + else if (c < 0x800U) { + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0xc0 | (c >> 6)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | (c & 0x3f)); + } else { + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0xe0 | (c >> 12)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | + ((c >> 6) & 0x3f)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | (c & 0x3f)); + } + } + utf_o->u_cmpID = 8; + + return utf_o->u_len; +} + +/* + * + * udf_utf8_to_ocu + * + * PURPOSE + * Convert UTF-8 to the OSTA Compressed Unicode equivalent. + * + * DESCRIPTION + * This routine is only called by udf_lookup(). + * + * PRE-CONDITIONS + * ocu Pointer to OSTA Compressed Unicode output + * buffer of size UDF_NAME_LEN bytes. + * utf Pointer to UTF-8 input buffer. + * utf_len Length of UTF-8 input buffer in bytes. + * + * POST-CONDITIONS + * Zero on success. + * + * HISTORY + * November 12, 1997 - Andrew E. Mileski + * Written, tested, and released. + */ +static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) +{ + unsigned c, i, max_val, utf_char; + int utf_cnt, u_len; + + memset(ocu, 0, sizeof(dstring) * length); + ocu[0] = 8; + max_val = 0xffU; + +try_again: + u_len = 0U; + utf_char = 0U; + utf_cnt = 0U; + for (i = 0U; i < utf->u_len; i++) { + c = (uint8_t)utf->u_name[i]; + + /* Complete a multi-byte UTF-8 character */ + if (utf_cnt) { + utf_char = (utf_char << 6) | (c & 0x3fU); + if (--utf_cnt) + continue; + } else { + /* Check for a multi-byte UTF-8 character */ + if (c & 0x80U) { + /* Start a multi-byte UTF-8 character */ + if ((c & 0xe0U) == 0xc0U) { + utf_char = c & 0x1fU; + utf_cnt = 1; + } else if ((c & 0xf0U) == 0xe0U) { + utf_char = c & 0x0fU; + utf_cnt = 2; + } else if ((c & 0xf8U) == 0xf0U) { + utf_char = c & 0x07U; + utf_cnt = 3; + } else if ((c & 0xfcU) == 0xf8U) { + utf_char = c & 0x03U; + utf_cnt = 4; + } else if ((c & 0xfeU) == 0xfcU) { + utf_char = c & 0x01U; + utf_cnt = 5; + } else { + goto error_out; + } + continue; + } else { + /* Single byte UTF-8 character (most common) */ + utf_char = c; + } + } + + /* Choose no compression if necessary */ + if (utf_char > max_val) { + if (max_val == 0xffU) { + max_val = 0xffffU; + ocu[0] = (uint8_t)0x10U; + goto try_again; + } + goto error_out; + } + + if (max_val == 0xffffU) + ocu[++u_len] = (uint8_t)(utf_char >> 8); + ocu[++u_len] = (uint8_t)(utf_char & 0xffU); + } + + if (utf_cnt) { +error_out: + ocu[++u_len] = '?'; + printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n")); + } + + ocu[length - 1] = (uint8_t)u_len + 1; + + return u_len + 1; +} + +static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, + const struct ustr *ocu_i) +{ + const uint8_t *ocu; + uint8_t cmp_id, ocu_len; + int i, len; + + + ocu_len = ocu_i->u_len; + if (ocu_len == 0) { + memset(utf_o, 0, sizeof(struct ustr)); + return 0; + } + + cmp_id = ocu_i->u_cmpID; + if (cmp_id != 8 && cmp_id != 16) { + memset(utf_o, 0, sizeof(struct ustr)); + pr_err("unknown compression code (%d) stri=%s\n", + cmp_id, ocu_i->u_name); + return 0; + } + + ocu = ocu_i->u_name; + utf_o->u_len = 0; + for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { + /* Expand OSTA compressed Unicode to Unicode */ + uint32_t c = ocu[i++]; + if (cmp_id == 16) + c = (c << 8) | ocu[i++]; + + len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], + UDF_NAME_LEN - utf_o->u_len); + /* Valid character? */ + if (len >= 0) + utf_o->u_len += len; + else + utf_o->u_name[utf_o->u_len++] = '?'; + } + utf_o->u_cmpID = 8; + + return utf_o->u_len; +} + +static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, + int length) +{ + int len; + unsigned i, max_val; + uint16_t uni_char; + int u_len; + + memset(ocu, 0, sizeof(dstring) * length); + ocu[0] = 8; + max_val = 0xffU; + +try_again: + u_len = 0U; + for (i = 0U; i < uni->u_len; i++) { + len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); + if (!len) + continue; + /* Invalid character, deal with it */ + if (len < 0) { + len = 1; + uni_char = '?'; + } + + if (uni_char > max_val) { + max_val = 0xffffU; + ocu[0] = (uint8_t)0x10U; + goto try_again; + } + + if (max_val == 0xffffU) + ocu[++u_len] = (uint8_t)(uni_char >> 8); + ocu[++u_len] = (uint8_t)(uni_char & 0xffU); + i += len - 1; + } + + ocu[length - 1] = (uint8_t)u_len + 1; + return u_len + 1; +} + +int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, + uint8_t *dname, int dlen) +{ + struct ustr *filename, *unifilename; + int len = 0; + + filename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!filename) + return 0; + + unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!unifilename) + goto out1; + + if (udf_build_ustr_exact(unifilename, sname, slen)) + goto out2; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { + if (!udf_CS0toUTF8(filename, unifilename)) { + udf_debug("Failed in udf_get_filename: sname = %s\n", + sname); + goto out2; + } + } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { + if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, + unifilename)) { + udf_debug("Failed in udf_get_filename: sname = %s\n", + sname); + goto out2; + } + } else + goto out2; + + len = udf_translate_to_linux(dname, dlen, + filename->u_name, filename->u_len, + unifilename->u_name, unifilename->u_len); +out2: + kfree(unifilename); +out1: + kfree(filename); + return len; +} + +int udf_put_filename(struct super_block *sb, const uint8_t *sname, + uint8_t *dname, int flen) +{ + struct ustr unifilename; + int namelen; + + if (!udf_char_to_ustr(&unifilename, sname, flen)) + return 0; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { + namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN); + if (!namelen) + return 0; + } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { + namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, + &unifilename, UDF_NAME_LEN); + if (!namelen) + return 0; + } else + return 0; + + return namelen; +} + +#define ILLEGAL_CHAR_MARK '_' +#define EXT_MARK '.' +#define CRC_MARK '#' +#define EXT_SIZE 5 +/* Number of chars we need to store generated CRC to make filename unique */ +#define CRC_LEN 5 + +static int udf_translate_to_linux(uint8_t *newName, int newLen, + uint8_t *udfName, int udfLen, + uint8_t *fidName, int fidNameLen) +{ + int index, newIndex = 0, needsCRC = 0; + int extIndex = 0, newExtIndex = 0, hasExt = 0; + unsigned short valueCRC; + uint8_t curr; + + if (udfName[0] == '.' && + (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { + needsCRC = 1; + newIndex = udfLen; + memcpy(newName, udfName, udfLen); + } else { + for (index = 0; index < udfLen; index++) { + curr = udfName[index]; + if (curr == '/' || curr == 0) { + needsCRC = 1; + curr = ILLEGAL_CHAR_MARK; + while (index + 1 < udfLen && + (udfName[index + 1] == '/' || + udfName[index + 1] == 0)) + index++; + } + if (curr == EXT_MARK && + (udfLen - index - 1) <= EXT_SIZE) { + if (udfLen == index + 1) + hasExt = 0; + else { + hasExt = 1; + extIndex = index; + newExtIndex = newIndex; + } + } + if (newIndex < newLen) + newName[newIndex++] = curr; + else + needsCRC = 1; + } + } + if (needsCRC) { + uint8_t ext[EXT_SIZE]; + int localExtIndex = 0; + + if (hasExt) { + int maxFilenameLen; + for (index = 0; + index < EXT_SIZE && extIndex + index + 1 < udfLen; + index++) { + curr = udfName[extIndex + index + 1]; + + if (curr == '/' || curr == 0) { + needsCRC = 1; + curr = ILLEGAL_CHAR_MARK; + while (extIndex + index + 2 < udfLen && + (index + 1 < EXT_SIZE && + (udfName[extIndex + index + 2] == '/' || + udfName[extIndex + index + 2] == 0))) + index++; + } + ext[localExtIndex++] = curr; + } + maxFilenameLen = newLen - CRC_LEN - localExtIndex; + if (newIndex > maxFilenameLen) + newIndex = maxFilenameLen; + else + newIndex = newExtIndex; + } else if (newIndex > newLen - CRC_LEN) + newIndex = newLen - CRC_LEN; + newName[newIndex++] = CRC_MARK; + valueCRC = crc_itu_t(0, fidName, fidNameLen); + newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8); + newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8); + newName[newIndex++] = hex_asc_upper_hi(valueCRC); + newName[newIndex++] = hex_asc_upper_lo(valueCRC); + + if (hasExt) { + newName[newIndex++] = EXT_MARK; + for (index = 0; index < localExtIndex; index++) + newName[newIndex++] = ext[index]; + } + } + + return newIndex; +} diff --git a/kernel/fs/ufs/Kconfig b/kernel/fs/ufs/Kconfig new file mode 100644 index 000000000..0bf6e16f8 --- /dev/null +++ b/kernel/fs/ufs/Kconfig @@ -0,0 +1,43 @@ +config UFS_FS + tristate "UFS file system support (read only)" + depends on BLOCK + help + BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, + OpenBSD and NeXTstep) use a file system called UFS. Some System V + Unixes can create and mount hard disk partitions and diskettes using + this file system as well. Saying Y here will allow you to read from + these partitions; if you also want to write to them, say Y to the + experimental "UFS file system write support", below. Please read the + file for more information. + + The recently released UFS2 variant (used in FreeBSD 5.x) is + READ-ONLY supported. + + Note that this option is generally not needed for floppies, since a + good portable way to transport files and directories between unixes + (and even other operating systems) is given by the tar program ("man + tar" or preferably "info tar"). + + When accessing NeXTstep files, you may need to convert them from the + NeXT character set to the Latin1 character set; use the program + recode ("info recode") for this purpose. + + To compile the UFS file system support as a module, choose M here: the + module will be called ufs. + + If you haven't heard about all of this before, it's safe to say N. + +config UFS_FS_WRITE + bool "UFS file system write support (DANGEROUS)" + depends on UFS_FS + help + Say Y here if you want to try writing to UFS partitions. This is + experimental, so you should back up your UFS partitions beforehand. + +config UFS_DEBUG + bool "UFS debugging" + depends on UFS_FS + help + If you are experiencing any problems with the UFS filesystem, say + Y here. This will result in _many_ additional debugging messages to be + written to the system log. diff --git a/kernel/fs/ufs/Makefile b/kernel/fs/ufs/Makefile new file mode 100644 index 000000000..4d0e02b02 --- /dev/null +++ b/kernel/fs/ufs/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux ufs filesystem routines. +# + +obj-$(CONFIG_UFS_FS) += ufs.o + +ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ + namei.o super.o symlink.o truncate.o util.o +ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/kernel/fs/ufs/balloc.c b/kernel/fs/ufs/balloc.c new file mode 100644 index 000000000..a7106eda5 --- /dev/null +++ b/kernel/fs/ufs/balloc.c @@ -0,0 +1,938 @@ +/* + * linux/fs/ufs/balloc.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * UFS2 write support Evgeniy Dushistov , 2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +#define INVBLOCK ((u64)-1L) + +static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned); +static u64 ufs_alloc_fragments(struct inode *, unsigned, u64, unsigned, int *); +static u64 ufs_alloccg_block(struct inode *, struct ufs_cg_private_info *, u64, int *); +static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, u64, unsigned); +static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[]; +static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int); + +/* + * Free 'count' fragments from fragment number 'fragment' + */ +void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned cgno, bit, end_bit, bbase, blkmap, i; + u64 blkno; + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + UFSD("ENTER, fragment %llu, count %u\n", + (unsigned long long)fragment, count); + + if (ufs_fragnum(fragment) + count > uspi->s_fpg) + ufs_error (sb, "ufs_free_fragments", "internal error"); + + mutex_lock(&UFS_SB(sb)->s_lock); + + cgno = ufs_dtog(uspi, fragment); + bit = ufs_dtogd(uspi, fragment); + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_free_fragments", "freeing blocks are outside device"); + goto failed; + } + + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + goto failed; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno); + goto failed; + } + + end_bit = bit + count; + bbase = ufs_blknum (bit); + blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase); + ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1); + for (i = bit; i < end_bit; i++) { + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i)) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i); + else + ufs_error (sb, "ufs_free_fragments", + "bit already cleared for fragment %u", i); + } + + fs32_add(sb, &ucg->cg_cs.cs_nffree, count); + uspi->cs_total.cs_nffree += count; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase); + ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1); + + /* + * Trying to reassemble free fragments into block + */ + blkno = ufs_fragstoblks (bbase); + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb); + uspi->cs_total.cs_nffree -= uspi->s_fpb; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, 1); + fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno (bbase); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(bbase)), 1); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + ufs_mark_sb_dirty(sb); + + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT\n"); + return; + +failed: + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT (FAILED)\n"); + return; +} + +/* + * Free 'count' fragments from fragment number 'fragment' (free whole blocks) + */ +void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned overflow, cgno, bit, end_bit, i; + u64 blkno; + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + UFSD("ENTER, fragment %llu, count %u\n", + (unsigned long long)fragment, count); + + if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) { + ufs_error (sb, "ufs_free_blocks", "internal error, " + "fragment %llu, count %u\n", + (unsigned long long)fragment, count); + goto failed; + } + + mutex_lock(&UFS_SB(sb)->s_lock); + +do_more: + overflow = 0; + cgno = ufs_dtog(uspi, fragment); + bit = ufs_dtogd(uspi, fragment); + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device"); + goto failed_unlock; + } + end_bit = bit + count; + if (end_bit > uspi->s_fpg) { + overflow = bit + count - uspi->s_fpg; + count -= overflow; + end_bit -= overflow; + } + + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + goto failed_unlock; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno); + goto failed_unlock; + } + + for (i = bit; i < end_bit; i += uspi->s_fpb) { + blkno = ufs_fragstoblks(i); + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); + } + ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, 1); + + fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno(i); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(i)), 1); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + if (overflow) { + fragment += count; + count = overflow; + goto do_more; + } + + ufs_mark_sb_dirty(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT\n"); + return; + +failed_unlock: + mutex_unlock(&UFS_SB(sb)->s_lock); +failed: + UFSD("EXIT (FAILED)\n"); + return; +} + +/* + * Modify inode page cache in such way: + * have - blocks with b_blocknr equal to oldb...oldb+count-1 + * get - blocks with b_blocknr equal to newb...newb+count-1 + * also we suppose that oldb...oldb+count-1 blocks + * situated at the end of file. + * + * We can come here from ufs_writepage or ufs_prepare_write, + * locked_page is argument of these functions, so we already lock it. + */ +static void ufs_change_blocknr(struct inode *inode, sector_t beg, + unsigned int count, sector_t oldb, + sector_t newb, struct page *locked_page) +{ + const unsigned blks_per_page = + 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); + const unsigned mask = blks_per_page - 1; + struct address_space * const mapping = inode->i_mapping; + pgoff_t index, cur_index, last_index; + unsigned pos, j, lblock; + sector_t end, i; + struct page *page; + struct buffer_head *head, *bh; + + UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", + inode->i_ino, count, + (unsigned long long)oldb, (unsigned long long)newb); + + BUG_ON(!locked_page); + BUG_ON(!PageLocked(locked_page)); + + cur_index = locked_page->index; + end = count + beg; + last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + for (i = beg; i < end; i = (i | mask) + 1) { + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (likely(cur_index != index)) { + page = ufs_get_locked_page(mapping, index); + if (!page)/* it was truncated */ + continue; + if (IS_ERR(page)) {/* or EIO */ + ufs_error(inode->i_sb, __func__, + "read of page %llu failed\n", + (unsigned long long)index); + continue; + } + } else + page = locked_page; + + head = page_buffers(page); + bh = head; + pos = i & mask; + for (j = 0; j < pos; ++j) + bh = bh->b_this_page; + + + if (unlikely(index == last_index)) + lblock = end & mask; + else + lblock = blks_per_page; + + do { + if (j >= lblock) + break; + pos = (i - beg) + j; + + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, oldb + pos); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ufs_error(inode->i_sb, __func__, + "read of block failed\n"); + break; + } + } + + UFSD(" change from %llu to %llu, pos %u\n", + (unsigned long long)(pos + oldb), + (unsigned long long)(pos + newb), pos); + + bh->b_blocknr = newb + pos; + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + mark_buffer_dirty(bh); + ++j; + bh = bh->b_this_page; + } while (bh != head); + + if (likely(cur_index != index)) + ufs_put_locked_page(page); + } + UFSD("EXIT\n"); +} + +static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n, + int sync) +{ + struct buffer_head *bh; + sector_t end = beg + n; + + for (; beg < end; ++beg) { + bh = sb_getblk(inode->i_sb, beg); + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (IS_SYNC(inode) || sync) + sync_dirty_buffer(bh); + brelse(bh); + } +} + +u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, + u64 goal, unsigned count, int *err, + struct page *locked_page) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + unsigned cgno, oldcount, newcount; + u64 tmp, request, result; + + UFSD("ENTER, ino %lu, fragment %llu, goal %llu, count %u\n", + inode->i_ino, (unsigned long long)fragment, + (unsigned long long)goal, count); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + *err = -ENOSPC; + + mutex_lock(&UFS_SB(sb)->s_lock); + tmp = ufs_data_ptr_to_cpu(sb, p); + + if (count + ufs_fragnum(fragment) > uspi->s_fpb) { + ufs_warning(sb, "ufs_new_fragments", "internal warning" + " fragment %llu, count %u", + (unsigned long long)fragment, count); + count = uspi->s_fpb - ufs_fragnum(fragment); + } + oldcount = ufs_fragnum (fragment); + newcount = oldcount + count; + + /* + * Somebody else has just allocated our fragments + */ + if (oldcount) { + if (!tmp) { + ufs_error(sb, "ufs_new_fragments", "internal error, " + "fragment %llu, tmp %llu\n", + (unsigned long long)fragment, + (unsigned long long)tmp); + mutex_unlock(&UFS_SB(sb)->s_lock); + return INVBLOCK; + } + if (fragment < UFS_I(inode)->i_lastfrag) { + UFSD("EXIT (ALREADY ALLOCATED)\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); + return 0; + } + } + else { + if (tmp) { + UFSD("EXIT (ALREADY ALLOCATED)\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); + return 0; + } + } + + /* + * There is not enough space for user on the device + */ + if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT (FAILED)\n"); + return 0; + } + + if (goal >= uspi->s_size) + goal = 0; + if (goal == 0) + cgno = ufs_inotocg (inode->i_ino); + else + cgno = ufs_dtog(uspi, goal); + + /* + * allocate new fragment + */ + if (oldcount == 0) { + result = ufs_alloc_fragments (inode, cgno, goal, count, err); + if (result) { + ufs_cpu_to_data_ptr(sb, p, result); + *err = 0; + UFS_I(inode)->i_lastfrag = + max(UFS_I(inode)->i_lastfrag, fragment + count); + ufs_clear_frags(inode, result + oldcount, + newcount - oldcount, locked_page != NULL); + } + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + /* + * resize block + */ + result = ufs_add_fragments(inode, tmp, oldcount, newcount); + if (result) { + *err = 0; + UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, + fragment + count); + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + /* + * allocate new block and move data + */ + switch (fs32_to_cpu(sb, usb1->fs_optim)) { + case UFS_OPTSPACE: + request = newcount; + if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree + > uspi->s_dsize * uspi->s_minfree / (2 * 100)) + break; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + break; + default: + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + + case UFS_OPTTIME: + request = uspi->s_fpb; + if (uspi->cs_total.cs_nffree < uspi->s_dsize * + (uspi->s_minfree - 2) / 100) + break; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + break; + } + result = ufs_alloc_fragments (inode, cgno, goal, request, err); + if (result) { + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); + ufs_change_blocknr(inode, fragment - oldcount, oldcount, + uspi->s_sbbase + tmp, + uspi->s_sbbase + result, locked_page); + ufs_cpu_to_data_ptr(sb, p, result); + *err = 0; + UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, + fragment + count); + mutex_unlock(&UFS_SB(sb)->s_lock); + if (newcount < request) + ufs_free_fragments (inode, result + newcount, request - newcount); + ufs_free_fragments (inode, tmp, oldcount); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT (FAILED)\n"); + return 0; +} + +static u64 ufs_add_fragments(struct inode *inode, u64 fragment, + unsigned oldcount, unsigned newcount) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned cgno, fragno, fragoff, count, fragsize, i; + + UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", + (unsigned long long)fragment, oldcount, newcount); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + count = newcount - oldcount; + + cgno = ufs_dtog(uspi, fragment); + if (fs32_to_cpu(sb, UFS_SB(sb)->fs_cs(cgno).cs_nffree) < count) + return 0; + if ((ufs_fragnum (fragment) + newcount) > uspi->s_fpb) + return 0; + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + return 0; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_add_fragments", + "internal error, bad magic number on cg %u", cgno); + return 0; + } + + fragno = ufs_dtogd(uspi, fragment); + fragoff = ufs_fragnum (fragno); + for (i = oldcount; i < newcount; i++) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) + return 0; + /* + * Block can be extended + */ + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + for (i = newcount; i < (uspi->s_fpb - fragoff); i++) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) + break; + fragsize = i - oldcount; + if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize])) + ufs_panic (sb, "ufs_add_fragments", + "internal error or corrupted bitmap on cg %u", cgno); + fs32_sub(sb, &ucg->cg_frsum[fragsize], 1); + if (fragsize != count) + fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); + for (i = oldcount; i < newcount; i++) + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); + + fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + uspi->cs_total.cs_nffree -= count; + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + ufs_mark_sb_dirty(sb); + + UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment); + + return fragment; +} + +#define UFS_TEST_FREE_SPACE_CG \ + ucg = (struct ufs_cylinder_group *) UFS_SB(sb)->s_ucg[cgno]->b_data; \ + if (fs32_to_cpu(sb, ucg->cg_cs.cs_nbfree)) \ + goto cg_found; \ + for (k = count; k < uspi->s_fpb; k++) \ + if (fs32_to_cpu(sb, ucg->cg_frsum[k])) \ + goto cg_found; + +static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, + u64 goal, unsigned count, int *err) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned oldcg, i, j, k, allocsize; + u64 result; + + UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", + inode->i_ino, cgno, (unsigned long long)goal, count); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + oldcg = cgno; + + /* + * 1. searching on preferred cylinder group + */ + UFS_TEST_FREE_SPACE_CG + + /* + * 2. quadratic rehash + */ + for (j = 1; j < uspi->s_ncg; j *= 2) { + cgno += j; + if (cgno >= uspi->s_ncg) + cgno -= uspi->s_ncg; + UFS_TEST_FREE_SPACE_CG + } + + /* + * 3. brute force search + * We start at i = 2 ( 0 is checked at 1.step, 1 at 2.step ) + */ + cgno = (oldcg + 1) % uspi->s_ncg; + for (j = 2; j < uspi->s_ncg; j++) { + cgno++; + if (cgno >= uspi->s_ncg) + cgno = 0; + UFS_TEST_FREE_SPACE_CG + } + + UFSD("EXIT (FAILED)\n"); + return 0; + +cg_found: + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + return 0; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_alloc_fragments", + "internal error, bad magic number on cg %u", cgno); + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + + if (count == uspi->s_fpb) { + result = ufs_alloccg_block (inode, ucpi, goal, err); + if (result == INVBLOCK) + return 0; + goto succed; + } + + for (allocsize = count; allocsize < uspi->s_fpb; allocsize++) + if (fs32_to_cpu(sb, ucg->cg_frsum[allocsize]) != 0) + break; + + if (allocsize == uspi->s_fpb) { + result = ufs_alloccg_block (inode, ucpi, goal, err); + if (result == INVBLOCK) + return 0; + goal = ufs_dtogd(uspi, result); + for (i = count; i < uspi->s_fpb; i++) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); + i = uspi->s_fpb - count; + + fs32_add(sb, &ucg->cg_cs.cs_nffree, i); + uspi->cs_total.cs_nffree += i; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); + fs32_add(sb, &ucg->cg_frsum[i], 1); + goto succed; + } + + result = ufs_bitmap_search (sb, ucpi, goal, allocsize); + if (result == INVBLOCK) + return 0; + for (i = 0; i < count; i++) + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); + + fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); + uspi->cs_total.cs_nffree -= count; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + fs32_sub(sb, &ucg->cg_frsum[allocsize], 1); + + if (count != allocsize) + fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1); + +succed: + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + ufs_mark_sb_dirty(sb); + + result += cgno * uspi->s_fpg; + UFSD("EXIT3, result %llu\n", (unsigned long long)result); + return result; +} + +static u64 ufs_alloccg_block(struct inode *inode, + struct ufs_cg_private_info *ucpi, + u64 goal, int *err) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_cylinder_group * ucg; + u64 result, blkno; + + UFSD("ENTER, goal %llu\n", (unsigned long long)goal); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + + if (goal == 0) { + goal = ucpi->c_rotor; + goto norot; + } + goal = ufs_blknum (goal); + goal = ufs_dtogd(uspi, goal); + + /* + * If the requested block is available, use it. + */ + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) { + result = goal; + goto gotit; + } + +norot: + result = ufs_bitmap_search (sb, ucpi, goal, uspi->s_fpb); + if (result == INVBLOCK) + return INVBLOCK; + ucpi->c_rotor = result; +gotit: + blkno = ufs_fragstoblks(result); + ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, -1); + + fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree--; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno((unsigned)result); + + fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos((unsigned)result)), 1); + fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + + UFSD("EXIT, result %llu\n", (unsigned long long)result); + + return result; +} + +static unsigned ubh_scanc(struct ufs_sb_private_info *uspi, + struct ufs_buffer_head *ubh, + unsigned begin, unsigned size, + unsigned char *table, unsigned char mask) +{ + unsigned rest, offset; + unsigned char *cp; + + + offset = begin & ~uspi->s_fmask; + begin >>= uspi->s_fshift; + for (;;) { + if ((offset + size) < uspi->s_fsize) + rest = size; + else + rest = uspi->s_fsize - offset; + size -= rest; + cp = ubh->bh[begin]->b_data + offset; + while ((table[*cp++] & mask) == 0 && --rest) + ; + if (rest || !size) + break; + begin++; + offset = 0; + } + return (size + rest); +} + +/* + * Find a block of the specified size in the specified cylinder group. + * @sp: pointer to super block + * @ucpi: pointer to cylinder group info + * @goal: near which block we want find new one + * @count: specified size + */ +static u64 ufs_bitmap_search(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + u64 goal, unsigned count) +{ + /* + * Bit patterns for identifying fragments in the block map + * used as ((map & mask_arr) == want_arr) + */ + static const int mask_arr[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff + }; + static const int want_arr[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe + }; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned start, length, loc; + unsigned pos, want, blockmap, mask, end; + u64 result; + + UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, + (unsigned long long)goal, count); + + if (goal) + start = ufs_dtogd(uspi, goal) >> 3; + else + start = ucpi->c_frotor >> 3; + + length = ((uspi->s_fpg + 7) >> 3) - start; + loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length, + (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, + 1 << (count - 1 + (uspi->s_fpb & 7))); + if (loc == 0) { + length = start + 1; + loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length, + (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : + ufs_fragtable_other, + 1 << (count - 1 + (uspi->s_fpb & 7))); + if (loc == 0) { + ufs_error(sb, "ufs_bitmap_search", + "bitmap corrupted on cg %u, start %u," + " length %u, count %u, freeoff %u\n", + ucpi->c_cgx, start, length, count, + ucpi->c_freeoff); + return INVBLOCK; + } + start = 0; + } + result = (start + length - loc) << 3; + ucpi->c_frotor = result; + + /* + * found the byte in the map + */ + + for (end = result + 8; result < end; result += uspi->s_fpb) { + blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result); + blockmap <<= 1; + mask = mask_arr[count]; + want = want_arr[count]; + for (pos = 0; pos <= uspi->s_fpb - count; pos++) { + if ((blockmap & mask) == want) { + UFSD("EXIT, result %llu\n", + (unsigned long long)result); + return result + pos; + } + mask <<= 1; + want <<= 1; + } + } + + ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n", + ucpi->c_cgx); + UFSD("EXIT (FAILED)\n"); + return INVBLOCK; +} + +static void ufs_clusteracct(struct super_block * sb, + struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt) +{ + struct ufs_sb_private_info * uspi; + int i, start, end, forw, back; + + uspi = UFS_SB(sb)->s_uspi; + if (uspi->s_contigsumsize <= 0) + return; + + if (cnt > 0) + ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno); + else + ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno); + + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + uspi->s_contigsumsize; + if ( end >= ucpi->c_nclusterblks) + end = ucpi->c_nclusterblks; + i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start); + if (i > end) + i = end; + forw = i - start; + + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - uspi->s_contigsumsize; + if (end < 0 ) + end = -1; + i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end); + if ( i < end) + i = end; + back = start - i; + + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > uspi->s_contigsumsize) + i = uspi->s_contigsumsize; + fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt); + if (back > 0) + fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt); + if (forw > 0) + fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt); +} + + +static unsigned char ufs_fragtable_8fpb[] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0A, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x08, 0x09, 0x09, 0x0A, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0A, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0A, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0C, + 0x08, 0x09, 0x09, 0x0A, 0x09, 0x09, 0x0A, 0x0C, 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +static unsigned char ufs_fragtable_other[] = { + 0x00, 0x16, 0x16, 0x2A, 0x16, 0x16, 0x26, 0x4E, 0x16, 0x16, 0x16, 0x3E, 0x2A, 0x3E, 0x4E, 0x8A, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x2A, 0x3E, 0x3E, 0x2A, 0x3E, 0x3E, 0x2E, 0x6E, 0x3E, 0x3E, 0x3E, 0x3E, 0x2A, 0x3E, 0x6E, 0xAA, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x26, 0x36, 0x36, 0x2E, 0x36, 0x36, 0x26, 0x6E, 0x36, 0x36, 0x36, 0x3E, 0x2E, 0x3E, 0x6E, 0xAE, + 0x4E, 0x5E, 0x5E, 0x6E, 0x5E, 0x5E, 0x6E, 0x4E, 0x5E, 0x5E, 0x5E, 0x7E, 0x6E, 0x7E, 0x4E, 0xCE, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0xBE, + 0x2A, 0x3E, 0x3E, 0x2A, 0x3E, 0x3E, 0x2E, 0x6E, 0x3E, 0x3E, 0x3E, 0x3E, 0x2A, 0x3E, 0x6E, 0xAA, + 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0xBE, + 0x4E, 0x5E, 0x5E, 0x6E, 0x5E, 0x5E, 0x6E, 0x4E, 0x5E, 0x5E, 0x5E, 0x7E, 0x6E, 0x7E, 0x4E, 0xCE, + 0x8A, 0x9E, 0x9E, 0xAA, 0x9E, 0x9E, 0xAE, 0xCE, 0x9E, 0x9E, 0x9E, 0xBE, 0xAA, 0xBE, 0xCE, 0x8A, +}; diff --git a/kernel/fs/ufs/cylinder.c b/kernel/fs/ufs/cylinder.c new file mode 100644 index 000000000..b4676322d --- /dev/null +++ b/kernel/fs/ufs/cylinder.c @@ -0,0 +1,201 @@ +/* + * linux/fs/ufs/cylinder.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * ext2 - inode (block) bitmap caching inspired + */ + +#include +#include +#include +#include +#include + +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * Read cylinder group into cache. The memory space for ufs_cg_private_info + * structure is already allocated during ufs_read_super. + */ +static void ufs_read_cylinder (struct super_block * sb, + unsigned cgno, unsigned bitmap_nr) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned i, j; + + UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr); + uspi = sbi->s_uspi; + ucpi = sbi->s_ucpi[bitmap_nr]; + ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data; + + UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno); + UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits; + /* + * We have already the first fragment of cylinder group block in buffer + */ + UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno]; + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) + if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i))) + goto failed; + sbi->s_cgno[bitmap_nr] = cgno; + + ucpi->c_cgx = fs32_to_cpu(sb, ucg->cg_cgx); + ucpi->c_ncyl = fs16_to_cpu(sb, ucg->cg_ncyl); + ucpi->c_niblk = fs16_to_cpu(sb, ucg->cg_niblk); + ucpi->c_ndblk = fs32_to_cpu(sb, ucg->cg_ndblk); + ucpi->c_rotor = fs32_to_cpu(sb, ucg->cg_rotor); + ucpi->c_frotor = fs32_to_cpu(sb, ucg->cg_frotor); + ucpi->c_irotor = fs32_to_cpu(sb, ucg->cg_irotor); + ucpi->c_btotoff = fs32_to_cpu(sb, ucg->cg_btotoff); + ucpi->c_boff = fs32_to_cpu(sb, ucg->cg_boff); + ucpi->c_iusedoff = fs32_to_cpu(sb, ucg->cg_iusedoff); + ucpi->c_freeoff = fs32_to_cpu(sb, ucg->cg_freeoff); + ucpi->c_nextfreeoff = fs32_to_cpu(sb, ucg->cg_nextfreeoff); + ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff); + ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff); + ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks); + UFSD("EXIT\n"); + return; + +failed: + for (j = 1; j < i; j++) + brelse (sbi->s_ucg[j]); + sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; + ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno); +} + +/* + * Remove cylinder group from cache, doesn't release memory + * allocated for cylinder group (this is done at ufs_put_super only). + */ +void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned i; + + UFSD("ENTER, bitmap_nr %u\n", bitmap_nr); + + uspi = sbi->s_uspi; + if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) { + UFSD("EXIT\n"); + return; + } + ucpi = sbi->s_ucpi[bitmap_nr]; + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + + if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) { + ufs_panic (sb, "ufs_put_cylinder", "internal error"); + return; + } + /* + * rotor is not so important data, so we put it to disk + * at the end of working with cylinder + */ + ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor); + ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor); + ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) { + brelse (UCPI_UBH(ucpi)->bh[i]); + } + + sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; + UFSD("EXIT\n"); +} + +/* + * Find cylinder group in cache and return it as pointer. + * If cylinder group is not in cache, we will load it from disk. + * + * The cache is managed by LRU algorithm. + */ +struct ufs_cg_private_info * ufs_load_cylinder ( + struct super_block * sb, unsigned cgno) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + unsigned cg, i, j; + + UFSD("ENTER, cgno %u\n", cgno); + + uspi = sbi->s_uspi; + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_load_cylinder", "internal error, high number of cg"); + return NULL; + } + /* + * Cylinder group number cg it in cache and it was last used + */ + if (sbi->s_cgno[0] == cgno) { + UFSD("EXIT\n"); + return sbi->s_ucpi[0]; + } + /* + * Number of cylinder groups is not higher than UFS_MAX_GROUP_LOADED + */ + if (uspi->s_ncg <= UFS_MAX_GROUP_LOADED) { + if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) { + if (sbi->s_cgno[cgno] != cgno) { + ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache"); + UFSD("EXIT (FAILED)\n"); + return NULL; + } + else { + UFSD("EXIT\n"); + return sbi->s_ucpi[cgno]; + } + } else { + ufs_read_cylinder (sb, cgno, cgno); + UFSD("EXIT\n"); + return sbi->s_ucpi[cgno]; + } + } + /* + * Cylinder group number cg is in cache but it was not last used, + * we will move to the first position + */ + for (i = 0; i < sbi->s_cg_loaded && sbi->s_cgno[i] != cgno; i++); + if (i < sbi->s_cg_loaded && sbi->s_cgno[i] == cgno) { + cg = sbi->s_cgno[i]; + ucpi = sbi->s_ucpi[i]; + for (j = i; j > 0; j--) { + sbi->s_cgno[j] = sbi->s_cgno[j-1]; + sbi->s_ucpi[j] = sbi->s_ucpi[j-1]; + } + sbi->s_cgno[0] = cg; + sbi->s_ucpi[0] = ucpi; + /* + * Cylinder group number cg is not in cache, we will read it from disk + * and put it to the first position + */ + } else { + if (sbi->s_cg_loaded < UFS_MAX_GROUP_LOADED) + sbi->s_cg_loaded++; + else + ufs_put_cylinder (sb, UFS_MAX_GROUP_LOADED-1); + ucpi = sbi->s_ucpi[sbi->s_cg_loaded - 1]; + for (j = sbi->s_cg_loaded - 1; j > 0; j--) { + sbi->s_cgno[j] = sbi->s_cgno[j-1]; + sbi->s_ucpi[j] = sbi->s_ucpi[j-1]; + } + sbi->s_ucpi[0] = ucpi; + ufs_read_cylinder (sb, cgno, 0); + } + UFSD("EXIT\n"); + return sbi->s_ucpi[0]; +} diff --git a/kernel/fs/ufs/dir.c b/kernel/fs/ufs/dir.c new file mode 100644 index 000000000..1bfe8cabf --- /dev/null +++ b/kernel/fs/ufs/dir.c @@ -0,0 +1,662 @@ +/* + * linux/fs/ufs/ufs_dir.c + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * swab support by Francois-Rene Rideau 19970406 + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * Migration to usage of "page cache" on May 2006 by + * Evgeniy Dushistov based on ext2 code base. + */ + +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure. + * + * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. + */ +static inline int ufs_match(struct super_block *sb, int len, + const unsigned char *name, struct ufs_dir_entry *de) +{ + if (len != ufs_get_de_namlen(sb, de)) + return 0; + if (!de->d_ino) + return 0; + return !memcmp(name, de->d_name, len); +} + +static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static inline void ufs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long ufs_dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) +{ + ino_t res = 0; + struct ufs_dir_entry *de; + struct page *page; + + de = ufs_find_entry(dir, qstr, &page); + if (de) { + res = fs32_to_cpu(dir->i_sb, de->d_ino); + ufs_put_page(page); + } + return res; +} + + +/* Releases the page */ +void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct page *page, struct inode *inode) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen); + int err; + + lock_page(page); + err = ufs_prepare_chunk(page, pos, len); + BUG_ON(err); + + de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); + ufs_set_de_type(dir->i_sb, de, inode->i_mode); + + err = ufs_commit_chunk(page, pos, len); + ufs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + + +static void ufs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; + struct ufs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & chunk_mask) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct ufs_dir_entry *)(kaddr + offs); + rec_len = fs16_to_cpu(sb, p->d_reclen); + + if (rec_len < UFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p))) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~chunk_mask) + goto Espan; + if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg * + UFS_SB(sb)->s_uspi->s_ncg)) + goto Einumber; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + ufs_error(sb, "ufs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "inode out of bounds"; +bad_entry: + ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<i_ino, (page->index<i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + ufs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + ufs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned +ufs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static inline struct ufs_dir_entry * +ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p) +{ + return (struct ufs_dir_entry *)((char *)p + + fs16_to_cpu(sb, p->d_reclen)); +} + +struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = ufs_get_page(dir, 0); + struct ufs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = ufs_next_entry(dir->i_sb, + (struct ufs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +/* + * ufs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, + struct page **res_page) +{ + struct super_block *sb = dir->i_sb; + const unsigned char *name = qstr->name; + int namelen = qstr->len; + unsigned reclen = UFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = ufs_dir_pages(dir); + struct page *page = NULL; + struct ufs_inode_info *ui = UFS_I(dir); + struct ufs_dir_entry *de; + + UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen); + + if (npages == 0 || namelen > UFS_MAXNAMLEN) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ui->i_dir_start_lookup; + + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = ufs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct ufs_dir_entry *) kaddr; + kaddr += ufs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->d_reclen == 0) { + ufs_error(dir->i_sb, __func__, + "zero-length directory entry"); + ufs_put_page(page); + goto out; + } + if (ufs_match(sb, namelen, name, de)) + goto found; + de = ufs_next_entry(sb, de); + } + ufs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ui->i_dir_start_lookup = n; + return de; +} + +/* + * Parent is locked. + */ +int ufs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct super_block *sb = dir->i_sb; + unsigned reclen = UFS_DIR_REC_LEN(namelen); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; + unsigned short rec_len, name_len; + struct page *page = NULL; + struct ufs_dir_entry *de; + unsigned long npages = ufs_dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + UFSD("ENTER, name %s, namelen %u\n", name, namelen); + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = ufs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + ufs_last_byte(dir, n); + de = (struct ufs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->d_reclen = cpu_to_fs16(sb, chunk_size); + de->d_ino = 0; + goto got_it; + } + if (de->d_reclen == 0) { + ufs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (ufs_match(sb, namelen, name, de)) + goto out_unlock; + name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)); + rec_len = fs16_to_cpu(sb, de->d_reclen); + if (!de->d_ino && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct ufs_dir_entry *) ((char *) de + rec_len); + } + unlock_page(page); + ufs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + err = ufs_prepare_chunk(page, pos, rec_len); + if (err) + goto out_unlock; + if (de->d_ino) { + struct ufs_dir_entry *de1 = + (struct ufs_dir_entry *) ((char *) de + name_len); + de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len); + de->d_reclen = cpu_to_fs16(sb, name_len); + + de = de1; + } + + ufs_set_de_namlen(sb, de, namelen); + memcpy(de->d_name, name, namelen + 1); + de->d_ino = cpu_to_fs32(sb, inode->i_ino); + ufs_set_de_type(sb, de, inode->i_mode); + + err = ufs_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + ufs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +static inline unsigned +ufs_validate_entry(struct super_block *sb, char *base, + unsigned offset, unsigned mask) +{ + struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset); + struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask)); + while ((char*)p < (char*)de) { + if (p->d_reclen == 0) + break; + p = ufs_next_entry(sb, p); + } + return (char *)p - base; +} + + +/* + * This is blatantly stolen from ext2fs + */ +static int +ufs_readdir(struct file *file, struct dir_context *ctx) +{ + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = ufs_dir_pages(inode); + unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + int need_revalidate = file->f_version != inode->i_version; + unsigned flags = UFS_SB(sb)->s_flags; + + UFSD("BEGIN\n"); + + if (pos > inode->i_size - UFS_DIR_REC_LEN(1)) + return 0; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct ufs_dir_entry *de; + + struct page *page = ufs_get_page(inode, n); + + if (IS_ERR(page)) { + ufs_error(sb, __func__, + "bad page in #%lu", + inode->i_ino); + ctx->pos += PAGE_CACHE_SIZE - offset; + return -EIO; + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); + ctx->pos = (n<f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct ufs_dir_entry *)(kaddr+offset); + limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1); + for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) { + if (de->d_reclen == 0) { + ufs_error(sb, __func__, + "zero-length directory entry"); + ufs_put_page(page); + return -EIO; + } + if (de->d_ino) { + unsigned char d_type = DT_UNKNOWN; + + UFSD("filldir(%s,%u)\n", de->d_name, + fs32_to_cpu(sb, de->d_ino)); + UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); + + if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) + d_type = de->d_u.d_44.d_type; + + if (!dir_emit(ctx, de->d_name, + ufs_get_de_namlen(sb, de), + fs32_to_cpu(sb, de->d_ino), + d_type)) { + ufs_put_page(page); + return 0; + } + } + ctx->pos += fs16_to_cpu(sb, de->d_reclen); + } + ufs_put_page(page); + } + return 0; +} + + +/* + * ufs_delete_entry deletes a directory entry by merging it with the + * previous entry. + */ +int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, + struct page * page) +{ + struct super_block *sb = inode->i_sb; + char *kaddr = page_address(page); + unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); + loff_t pos; + struct ufs_dir_entry *pde = NULL; + struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); + int err; + + UFSD("ENTER\n"); + + UFSD("ino %u, reclen %u, namlen %u, name %s\n", + fs32_to_cpu(sb, de->d_ino), + fs16_to_cpu(sb, de->d_reclen), + ufs_get_de_namlen(sb, de), de->d_name); + + while ((char*)de < (char*)dir) { + if (de->d_reclen == 0) { + ufs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = ufs_next_entry(sb, de); + } + if (pde) + from = (char*)pde - (char*)page_address(page); + + pos = page_offset(page) + from; + lock_page(page); + err = ufs_prepare_chunk(page, pos, to - from); + BUG_ON(err); + if (pde) + pde->d_reclen = cpu_to_fs16(sb, to - from); + dir->d_ino = 0; + err = ufs_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + ufs_put_page(page); + UFSD("EXIT\n"); + return err; +} + +int ufs_make_empty(struct inode * inode, struct inode *dir) +{ + struct super_block * sb = dir->i_sb; + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; + struct ufs_dir_entry * de; + char *base; + int err; + + if (!page) + return -ENOMEM; + + err = ufs_prepare_chunk(page, 0, chunk_size); + if (err) { + unlock_page(page); + goto fail; + } + + kmap(page); + base = (char*)page_address(page); + memset(base, 0, PAGE_CACHE_SIZE); + + de = (struct ufs_dir_entry *) base; + + de->d_ino = cpu_to_fs32(sb, inode->i_ino); + ufs_set_de_type(sb, de, inode->i_mode); + ufs_set_de_namlen(sb, de, 1); + de->d_reclen = cpu_to_fs16(sb, UFS_DIR_REC_LEN(1)); + strcpy (de->d_name, "."); + de = (struct ufs_dir_entry *) + ((char *)de + fs16_to_cpu(sb, de->d_reclen)); + de->d_ino = cpu_to_fs32(sb, dir->i_ino); + ufs_set_de_type(sb, de, dir->i_mode); + de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1)); + ufs_set_de_namlen(sb, de, 2); + strcpy (de->d_name, ".."); + kunmap(page); + + err = ufs_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ufs_empty_dir(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct page *page = NULL; + unsigned long i, npages = ufs_dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct ufs_dir_entry *de; + page = ufs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct ufs_dir_entry *)kaddr; + kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->d_reclen == 0) { + ufs_error(inode->i_sb, __func__, + "zero-length directory entry: " + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->d_ino) { + u16 namelen=ufs_get_de_namlen(sb, de); + /* check for . and .. */ + if (de->d_name[0] != '.') + goto not_empty; + if (namelen > 2) + goto not_empty; + if (namelen < 2) { + if (inode->i_ino != + fs32_to_cpu(sb, de->d_ino)) + goto not_empty; + } else if (de->d_name[1] != '.') + goto not_empty; + } + de = ufs_next_entry(sb, de); + } + ufs_put_page(page); + } + return 1; + +not_empty: + ufs_put_page(page); + return 0; +} + +const struct file_operations ufs_dir_operations = { + .read = generic_read_dir, + .iterate = ufs_readdir, + .fsync = generic_file_fsync, + .llseek = generic_file_llseek, +}; diff --git a/kernel/fs/ufs/file.c b/kernel/fs/ufs/file.c new file mode 100644 index 000000000..042ddbf11 --- /dev/null +++ b/kernel/fs/ufs/file.c @@ -0,0 +1,44 @@ +/* + * linux/fs/ufs/file.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 fs regular file handling primitives + */ + +#include + +#include "ufs_fs.h" +#include "ufs.h" + +/* + * We have mostly NULL's here: the current defaults are ok for + * the ufs filesystem. + */ + +const struct file_operations ufs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .open = generic_file_open, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; diff --git a/kernel/fs/ufs/ialloc.c b/kernel/fs/ufs/ialloc.c new file mode 100644 index 000000000..fd0203ce1 --- /dev/null +++ b/kernel/fs/ufs/ialloc.c @@ -0,0 +1,353 @@ +/* + * linux/fs/ufs/ialloc.c + * + * Copyright (c) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * UFS2 write support added by + * Evgeniy Dushistov , 2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ufs_free_inode (struct inode * inode) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + int is_directory; + unsigned ino, cg, bit; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + ino = inode->i_ino; + + mutex_lock(&UFS_SB(sb)->s_lock); + + if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { + ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); + mutex_unlock(&UFS_SB(sb)->s_lock); + return; + } + + cg = ufs_inotocg (ino); + bit = ufs_inotocgoff (ino); + ucpi = ufs_load_cylinder (sb, cg); + if (!ucpi) { + mutex_unlock(&UFS_SB(sb)->s_lock); + return; + } + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number"); + + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + + is_directory = S_ISDIR(inode->i_mode); + + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) + ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); + else { + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); + if (ino < ucpi->c_irotor) + ucpi->c_irotor = ino; + fs32_add(sb, &ucg->cg_cs.cs_nifree, 1); + uspi->cs_total.cs_nifree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1); + + if (is_directory) { + fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1); + uspi->cs_total.cs_ndir--; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + ufs_mark_sb_dirty(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT\n"); +} + +/* + * Nullify new chunk of inodes, + * BSD people also set ui_gen field of inode + * during nullification, but we not care about + * that because of linux ufs do not support NFS + */ +static void ufs2_init_inodes_chunk(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + struct ufs_cylinder_group *ucg) +{ + struct buffer_head *bh; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + sector_t beg = uspi->s_sbbase + + ufs_inotofsba(ucpi->c_cgx * uspi->s_ipg + + fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk)); + sector_t end = beg + uspi->s_fpb; + + UFSD("ENTER cgno %d\n", ucpi->c_cgx); + + for (; beg < end; ++beg) { + bh = sb_getblk(sb, beg); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bh); + brelse(bh); + } + + fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); + ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + UFSD("EXIT\n"); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ufs_new_inode(struct inode *dir, umode_t mode) +{ + struct super_block * sb; + struct ufs_sb_info * sbi; + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + struct inode * inode; + unsigned cg, bit, i, j, start; + struct ufs_inode_info *ufsi; + int err = -ENOSPC; + + UFSD("ENTER\n"); + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ufsi = UFS_I(inode); + sbi = UFS_SB(sb); + uspi = sbi->s_uspi; + + mutex_lock(&sbi->s_lock); + + /* + * Try to place the inode in its parent directory + */ + i = ufs_inotocg(dir->i_ino); + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + + /* + * Use a quadratic hash to find a group with a free inode + */ + for ( j = 1; j < uspi->s_ncg; j <<= 1 ) { + i += j; + if (i >= uspi->s_ncg) + i -= uspi->s_ncg; + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + } + + /* + * That failed: try linear search for a free inode + */ + i = ufs_inotocg(dir->i_ino) + 1; + for (j = 2; j < uspi->s_ncg; j++) { + i++; + if (i >= uspi->s_ncg) + i = 0; + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + } + + goto failed; + +cg_found: + ucpi = ufs_load_cylinder (sb, cg); + if (!ucpi) { + err = -EIO; + goto failed; + } + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number"); + + start = ucpi->c_irotor; + bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start); + if (!(bit < uspi->s_ipg)) { + bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start); + if (!(bit < start)) { + ufs_error (sb, "ufs_new_inode", + "cylinder group %u corrupted - error in inode bitmap\n", cg); + err = -EIO; + goto failed; + } + } + UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg); + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); + else { + ufs_panic (sb, "ufs_new_inode", "internal error"); + err = -EIO; + goto failed; + } + + if (uspi->fs_magic == UFS2_MAGIC) { + u32 initediblk = fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk); + + if (bit + uspi->s_inopb > initediblk && + initediblk < fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_niblk)) + ufs2_init_inodes_chunk(sb, ucpi, ucg); + } + + fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1); + uspi->cs_total.cs_nifree--; + fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1); + + if (S_ISDIR(mode)) { + fs32_add(sb, &ucg->cg_cs.cs_ndir, 1); + uspi->cs_total.cs_ndir++; + fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1); + } + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + ufs_mark_sb_dirty(sb); + + inode->i_ino = cg * uspi->s_ipg + bit; + inode_init_owner(inode, dir, mode); + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_flags = UFS_I(dir)->i_flags; + ufsi->i_lastfrag = 0; + ufsi->i_shadow = 0; + ufsi->i_osync = 0; + ufsi->i_oeftflag = 0; + ufsi->i_dir_start_lookup = 0; + memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); + if (insert_inode_locked(inode) < 0) { + err = -EIO; + goto failed; + } + mark_inode_dirty(inode); + + if (uspi->fs_magic == UFS2_MAGIC) { + struct buffer_head *bh; + struct ufs2_inode *ufs2_inode; + + /* + * setup birth date, we do it here because of there is no sense + * to hold it in struct ufs_inode_info, and lose 64 bit + */ + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", + "unable to read inode %lu\n", + inode->i_ino); + err = -EIO; + goto fail_remove_inode; + } + lock_buffer(bh); + ufs2_inode = (struct ufs2_inode *)bh->b_data; + ufs2_inode += ufs_inotofsbo(inode->i_ino); + ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec); + ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bh); + brelse(bh); + } + mutex_unlock(&sbi->s_lock); + + UFSD("allocating inode %lu\n", inode->i_ino); + UFSD("EXIT\n"); + return inode; + +fail_remove_inode: + mutex_unlock(&sbi->s_lock); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + UFSD("EXIT (FAILED): err %d\n", err); + return ERR_PTR(err); +failed: + mutex_unlock(&sbi->s_lock); + make_bad_inode(inode); + iput (inode); + UFSD("EXIT (FAILED): err %d\n", err); + return ERR_PTR(err); +} diff --git a/kernel/fs/ufs/inode.c b/kernel/fs/ufs/inode.c new file mode 100644 index 000000000..2d93ab07d --- /dev/null +++ b/kernel/fs/ufs/inode.c @@ -0,0 +1,910 @@ +/* + * linux/fs/ufs/inode.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); + +static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) +{ + struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; + int ptrs = uspi->s_apb; + int ptrs_bits = uspi->s_apbshift; + const long direct_blocks = UFS_NDADDR, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + + + UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks); + if (i_block < direct_blocks) { + offsets[n++] = i_block; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = UFS_IND_BLOCK; + offsets[n++] = i_block; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = UFS_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = UFS_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + } else { + ufs_warning(inode->i_sb, "ufs_block_to_path", "block > big"); + } + return n; +} + +/* + * Returns the location of the fragment from + * the beginning of the filesystem. + */ + +static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; + int shift = uspi->s_apbshift-uspi->s_fpbshift; + sector_t offsets[4], *p; + int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); + u64 ret = 0L; + __fs32 block; + __fs64 u2_block = 0L; + unsigned flags = UFS_SB(sb)->s_flags; + u64 temp = 0L; + + UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); + UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", + uspi->s_fpbshift, uspi->s_apbmask, + (unsigned long long)mask); + + if (depth == 0) + return 0; + + p = offsets; + + if (needs_lock) + lock_ufs(sb); + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + goto ufs2; + + block = ufsi->i_u1.i_data[*p++]; + if (!block) + goto out; + while (--depth) { + struct buffer_head *bh; + sector_t n = *p++; + + bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift)); + if (!bh) + goto out; + block = ((__fs32 *) bh->b_data)[n & mask]; + brelse (bh); + if (!block) + goto out; + } + ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask)); + goto out; +ufs2: + u2_block = ufsi->i_u1.u2_i_data[*p++]; + if (!u2_block) + goto out; + + + while (--depth) { + struct buffer_head *bh; + sector_t n = *p++; + + + temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block); + bh = sb_bread(sb, temp +(u64) (n>>shift)); + if (!bh) + goto out; + u2_block = ((__fs64 *)bh->b_data)[n & mask]; + brelse(bh); + if (!u2_block) + goto out; + } + temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block); + ret = temp + (u64) (frag & uspi->s_fpbmask); + +out: + if (needs_lock) + unlock_ufs(sb); + return ret; +} + +/** + * ufs_inode_getfrag() - allocate new fragment(s) + * @inode: pointer to inode + * @fragment: number of `fragment' which hold pointer + * to new allocated fragment(s) + * @new_fragment: number of new allocated fragment(s) + * @required: how many fragment(s) we require + * @err: we set it if something wrong + * @phys: pointer to where we save physical number of new allocated fragments, + * NULL if we allocate not data(indirect blocks for example). + * @new: we set it if we allocate new block + * @locked_page: for ufs_new_fragments() + */ +static struct buffer_head * +ufs_inode_getfrag(struct inode *inode, u64 fragment, + sector_t new_fragment, unsigned int required, int *err, + long *phys, int *new, struct page *locked_page) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * result; + unsigned blockoff, lastblockoff; + u64 tmp, goal, lastfrag, block, lastblock; + void *p, *p2; + + UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, " + "metadata %d\n", inode->i_ino, (unsigned long long)fragment, + (unsigned long long)new_fragment, required, !phys); + + /* TODO : to be done for write support + if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + goto ufs2; + */ + + block = ufs_fragstoblks (fragment); + blockoff = ufs_fragnum (fragment); + p = ufs_get_direct_data_ptr(uspi, ufsi, block); + + goal = 0; + +repeat: + tmp = ufs_data_ptr_to_cpu(sb, p); + + lastfrag = ufsi->i_lastfrag; + if (tmp && fragment < lastfrag) { + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + if (tmp == ufs_data_ptr_to_cpu(sb, p)) { + UFSD("EXIT, result %llu\n", + (unsigned long long)tmp + blockoff); + return result; + } + brelse (result); + goto repeat; + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + return NULL; + } + } + + lastblock = ufs_fragstoblks (lastfrag); + lastblockoff = ufs_fragnum (lastfrag); + /* + * We will extend file into new block beyond last allocated block + */ + if (lastblock < block) { + /* + * We must reallocate last allocated block + */ + if (lastblockoff) { + p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock); + tmp = ufs_new_fragments(inode, p2, lastfrag, + ufs_data_ptr_to_cpu(sb, p2), + uspi->s_fpb - lastblockoff, + err, locked_page); + if (!tmp) { + if (lastfrag != ufsi->i_lastfrag) + goto repeat; + else + return NULL; + } + lastfrag = ufsi->i_lastfrag; + + } + tmp = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, + lastblock)); + if (tmp) + goal = tmp + uspi->s_fpb; + tmp = ufs_new_fragments (inode, p, fragment - blockoff, + goal, required + blockoff, + err, + phys != NULL ? locked_page : NULL); + } else if (lastblock == block) { + /* + * We will extend last allocated block + */ + tmp = ufs_new_fragments(inode, p, fragment - + (blockoff - lastblockoff), + ufs_data_ptr_to_cpu(sb, p), + required + (blockoff - lastblockoff), + err, phys != NULL ? locked_page : NULL); + } else /* (lastblock > block) */ { + /* + * We will allocate new block before last allocated block + */ + if (block) { + tmp = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); + if (tmp) + goal = tmp + uspi->s_fpb; + } + tmp = ufs_new_fragments(inode, p, fragment - blockoff, + goal, uspi->s_fpb, err, + phys != NULL ? locked_page : NULL); + } + if (!tmp) { + if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) || + (blockoff && lastfrag != ufsi->i_lastfrag)) + goto repeat; + *err = -ENOSPC; + return NULL; + } + + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + result = NULL; + *err = 0; + *new = 1; + } + + inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) + ufs_sync_inode (inode); + mark_inode_dirty(inode); + UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff); + return result; + + /* This part : To be implemented .... + Required only for writing, not required for READ-ONLY. +ufs2: + + u2_block = ufs_fragstoblks(fragment); + u2_blockoff = ufs_fragnum(fragment); + p = ufsi->i_u1.u2_i_data + block; + goal = 0; + +repeat2: + tmp = fs32_to_cpu(sb, *p); + lastfrag = ufsi->i_lastfrag; + + */ +} + +/** + * ufs_inode_getblock() - allocate new block + * @inode: pointer to inode + * @bh: pointer to block which hold "pointer" to new allocated block + * @fragment: number of `fragment' which hold pointer + * to new allocated block + * @new_fragment: number of new allocated fragment + * (block will hold this fragment and also uspi->s_fpb-1) + * @err: see ufs_inode_getfrag() + * @phys: see ufs_inode_getfrag() + * @new: see ufs_inode_getfrag() + * @locked_page: see ufs_inode_getfrag() + */ +static struct buffer_head * +ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, + u64 fragment, sector_t new_fragment, int *err, + long *phys, int *new, struct page *locked_page) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * result; + unsigned blockoff; + u64 tmp, goal, block; + void *p; + + block = ufs_fragstoblks (fragment); + blockoff = ufs_fragnum (fragment); + + UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", + inode->i_ino, (unsigned long long)fragment, + (unsigned long long)new_fragment, !phys); + + result = NULL; + if (!bh) + goto out; + if (!buffer_uptodate(bh)) { + ll_rw_block (READ, 1, &bh); + wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + goto out; + } + if (uspi->fs_magic == UFS2_MAGIC) + p = (__fs64 *)bh->b_data + block; + else + p = (__fs32 *)bh->b_data + block; +repeat: + tmp = ufs_data_ptr_to_cpu(sb, p); + if (tmp) { + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + if (tmp == ufs_data_ptr_to_cpu(sb, p)) + goto out; + brelse (result); + goto repeat; + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + goto out; + } + } + + if (block && (uspi->fs_magic == UFS2_MAGIC ? + (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : + (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1])))) + goal = tmp + uspi->s_fpb; + else + goal = bh->b_blocknr + uspi->s_fpb; + tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, + uspi->s_fpb, err, locked_page); + if (!tmp) { + if (ufs_data_ptr_to_cpu(sb, p)) + goto repeat; + goto out; + } + + + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + *new = 1; + } + + mark_buffer_dirty(bh); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + UFSD("result %llu\n", (unsigned long long)tmp + blockoff); +out: + brelse (bh); + UFSD("EXIT\n"); + return result; +} + +/** + * ufs_getfrag_block() - `get_block_t' function, interface between UFS and + * readpage, writepage and so on + */ + +int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +{ + struct super_block * sb = inode->i_sb; + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi = sbi->s_uspi; + struct buffer_head * bh; + int ret, err, new; + unsigned long ptr,phys; + u64 phys64 = 0; + bool needs_lock = (sbi->mutex_owner != current); + + if (!create) { + phys64 = ufs_frag_map(inode, fragment, needs_lock); + UFSD("phys64 = %llu\n", (unsigned long long)phys64); + if (phys64) + map_bh(bh_result, sb, phys64); + return 0; + } + + /* This code entered only while writing ....? */ + + err = -EIO; + new = 0; + ret = 0; + bh = NULL; + + if (needs_lock) + lock_ufs(sb); + + UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); + if (fragment > + ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) + << uspi->s_fpbshift)) + goto abort_too_big; + + err = 0; + ptr = fragment; + + /* + * ok, these macros clean the logic up a bit and make + * it much more readable: + */ +#define GET_INODE_DATABLOCK(x) \ + ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\ + bh_result->b_page) +#define GET_INODE_PTR(x) \ + ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\ + bh_result->b_page) +#define GET_INDIRECT_DATABLOCK(x) \ + ufs_inode_getblock(inode, bh, x, fragment, \ + &err, &phys, &new, bh_result->b_page) +#define GET_INDIRECT_PTR(x) \ + ufs_inode_getblock(inode, bh, x, fragment, \ + &err, NULL, NULL, NULL) + + if (ptr < UFS_NDIR_FRAGMENT) { + bh = GET_INODE_DATABLOCK(ptr); + goto out; + } + ptr -= UFS_NDIR_FRAGMENT; + if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) { + bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift)); + goto get_indirect; + } + ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); + if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) { + bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift)); + goto get_double; + } + ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); + bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift)); + bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask); +get_double: + bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask); +get_indirect: + bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask); + +#undef GET_INODE_DATABLOCK +#undef GET_INODE_PTR +#undef GET_INDIRECT_DATABLOCK +#undef GET_INDIRECT_PTR + +out: + if (err) + goto abort; + if (new) + set_buffer_new(bh_result); + map_bh(bh_result, sb, phys); +abort: + if (needs_lock) + unlock_ufs(sb); + + return err; + +abort_too_big: + ufs_warning(sb, "ufs_get_block", "block > big"); + goto abort; +} + +static int ufs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,ufs_getfrag_block,wbc); +} + +static int ufs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,ufs_getfrag_block); +} + +int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, ufs_getfrag_block); +} + +static void ufs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) + truncate_pagecache(inode, inode->i_size); +} + +static int ufs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + ufs_getfrag_block); + if (unlikely(ret)) + ufs_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t ufs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,ufs_getfrag_block); +} + +const struct address_space_operations ufs_aops = { + .readpage = ufs_readpage, + .writepage = ufs_writepage, + .write_begin = ufs_write_begin, + .write_end = generic_write_end, + .bmap = ufs_bmap +}; + +static void ufs_set_inode_ops(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ufs_dir_inode_operations; + inode->i_fop = &ufs_dir_operations; + inode->i_mapping->a_ops = &ufs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (!inode->i_blocks) + inode->i_op = &ufs_fast_symlink_inode_operations; + else { + inode->i_op = &ufs_symlink_inode_operations; + inode->i_mapping->a_ops = &ufs_aops; + } + } else + init_special_inode(inode, inode->i_mode, + ufs_get_inode_dev(inode->i_sb, UFS_I(inode))); +} + +static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + umode_t mode; + + /* + * Copy data to the in-core inode. + */ + inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); + set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink)); + if (inode->i_nlink == 0) { + ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); + return -1; + } + + /* + * Linux now has 32-bit uid and gid, so we can support EFT. + */ + i_uid_write(inode, ufs_get_inode_uid(sb, ufs_inode)); + i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode)); + + inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); + inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); + inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); + inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); + inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen); + ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); + ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); + ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); + + + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { + memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr, + sizeof(ufs_inode->ui_u2.ui_addr)); + } else { + memcpy(ufsi->i_u1.i_symlink, ufs_inode->ui_u2.ui_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink) - 1); + ufsi->i_u1.i_symlink[sizeof(ufs_inode->ui_u2.ui_symlink) - 1] = 0; + } + return 0; +} + +static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + umode_t mode; + + UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); + /* + * Copy data to the in-core inode. + */ + inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); + set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink)); + if (inode->i_nlink == 0) { + ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); + return -1; + } + + /* + * Linux now has 32-bit uid and gid, so we can support EFT. + */ + i_uid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_uid)); + i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid)); + + inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); + inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); + inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime); + inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime); + inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec); + inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec); + inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec); + inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); + inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen); + ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); + /* + ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); + ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); + */ + + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { + memcpy(ufsi->i_u1.u2_i_data, &ufs2_inode->ui_u2.ui_addr, + sizeof(ufs2_inode->ui_u2.ui_addr)); + } else { + memcpy(ufsi->i_u1.i_symlink, ufs2_inode->ui_u2.ui_symlink, + sizeof(ufs2_inode->ui_u2.ui_symlink) - 1); + ufsi->i_u1.i_symlink[sizeof(ufs2_inode->ui_u2.ui_symlink) - 1] = 0; + } + return 0; +} + +struct inode *ufs_iget(struct super_block *sb, unsigned long ino) +{ + struct ufs_inode_info *ufsi; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * bh; + struct inode *inode; + int err; + + UFSD("ENTER, ino %lu\n", ino); + + if (ino < UFS_ROOTINO || ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n", + ino); + return ERR_PTR(-EIO); + } + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ufsi = UFS_I(inode); + + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n", + inode->i_ino); + goto bad_inode; + } + if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + err = ufs2_read_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data; + + err = ufs1_read_inode(inode, + ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + if (err) + goto bad_inode; + inode->i_version++; + ufsi->i_lastfrag = + (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; + ufsi->i_dir_start_lookup = 0; + ufsi->i_osync = 0; + + ufs_set_inode_ops(inode); + + brelse(bh); + + UFSD("EXIT\n"); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_inode_info *ufsi = UFS_I(inode); + + ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); + ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); + + ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode)); + ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode)); + + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); + ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atime.tv_usec = 0; + ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctime.tv_usec = 0; + ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtime.tv_usec = 0; + ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks); + ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); + ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation); + + if ((UFS_SB(sb)->s_flags & UFS_UID_MASK) == UFS_UID_EFT) { + ufs_inode->ui_u3.ui_sun.ui_shadow = cpu_to_fs32(sb, ufsi->i_shadow); + ufs_inode->ui_u3.ui_sun.ui_oeftflag = cpu_to_fs32(sb, ufsi->i_oeftflag); + } + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ + ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.i_data[0]; + } else if (inode->i_blocks) { + memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.i_data, + sizeof(ufs_inode->ui_u2.ui_addr)); + } + else { + memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink)); + } + + if (!inode->i_nlink) + memset (ufs_inode, 0, sizeof(struct ufs_inode)); +} + +static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_inode_info *ufsi = UFS_I(inode); + + UFSD("ENTER\n"); + ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); + ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); + + ufs_inode->ui_uid = cpu_to_fs32(sb, i_uid_read(inode)); + ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode)); + + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); + ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec); + ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec); + ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec); + + ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks); + ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); + ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ + ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0]; + } else if (inode->i_blocks) { + memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.u2_i_data, + sizeof(ufs_inode->ui_u2.ui_addr)); + } else { + memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink)); + } + + if (!inode->i_nlink) + memset (ufs_inode, 0, sizeof(struct ufs2_inode)); + UFSD("EXIT\n"); +} + +static int ufs_update_inode(struct inode * inode, int do_sync) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * bh; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + if (inode->i_ino < UFS_ROOTINO || + inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); + return -1; + } + + bh = sb_bread(sb, ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); + return -1; + } + if (uspi->fs_magic == UFS2_MAGIC) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + ufs2_update_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *) bh->b_data; + + ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + mark_buffer_dirty(bh); + if (do_sync) + sync_dirty_buffer(bh); + brelse (bh); + + UFSD("EXIT\n"); + return 0; +} + +int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + lock_ufs(inode->i_sb); + ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); + unlock_ufs(inode->i_sb); + return ret; +} + +int ufs_sync_inode (struct inode *inode) +{ + return ufs_update_inode (inode, 1); +} + +void ufs_evict_inode(struct inode * inode) +{ + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) + want_delete = 1; + + truncate_inode_pages_final(&inode->i_data); + if (want_delete) { + loff_t old_i_size; + /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ + lock_ufs(inode->i_sb); + mark_inode_dirty(inode); + ufs_update_inode(inode, IS_SYNC(inode)); + old_i_size = inode->i_size; + inode->i_size = 0; + if (inode->i_blocks && ufs_truncate(inode, old_i_size)) + ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); + unlock_ufs(inode->i_sb); + } + + invalidate_inode_buffers(inode); + clear_inode(inode); + + if (want_delete) { + lock_ufs(inode->i_sb); + ufs_free_inode(inode); + unlock_ufs(inode->i_sb); + } +} diff --git a/kernel/fs/ufs/namei.c b/kernel/fs/ufs/namei.c new file mode 100644 index 000000000..60ee32249 --- /dev/null +++ b/kernel/fs/ufs/namei.c @@ -0,0 +1,356 @@ +/* + * linux/fs/ufs/namei.c + * + * Migration to usage of "page cache" on May 2006 by + * Evgeniy Dushistov based on ext2 code base. + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "util.h" + +static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = ufs_add_link(dentry, inode); + if (!err) { + unlock_new_inode(inode); + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > UFS_MAXNAMLEN) + return ERR_PTR(-ENAMETOOLONG); + + lock_ufs(dir->i_sb); + ino = ufs_inode_by_name(dir, &dentry->d_name); + if (ino) + inode = ufs_iget(dir->i_sb, ino); + unlock_ufs(dir->i_sb); + return d_splice_alias(inode, dentry); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + int err; + + UFSD("BEGIN\n"); + + inode = ufs_new_inode(dir, mode); + err = PTR_ERR(inode); + + if (!IS_ERR(inode)) { + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + mark_inode_dirty(inode); + lock_ufs(dir->i_sb); + err = ufs_add_nondir(dentry, inode); + unlock_ufs(dir->i_sb); + } + UFSD("END: err=%d\n", err); + return err; +} + +static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct inode *inode; + int err; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + inode = ufs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); + ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); + mark_inode_dirty(inode); + lock_ufs(dir->i_sb); + err = ufs_add_nondir(dentry, inode); + unlock_ufs(dir->i_sb); + } + return err; +} + +static int ufs_symlink (struct inode * dir, struct dentry * dentry, + const char * symname) +{ + struct super_block * sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode * inode; + + if (l > sb->s_blocksize) + goto out_notlocked; + + lock_ufs(dir->i_sb); + inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { + /* slow symlink */ + inode->i_op = &ufs_symlink_inode_operations; + inode->i_mapping->a_ops = &ufs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &ufs_fast_symlink_inode_operations; + memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = ufs_add_nondir(dentry, inode); +out: + unlock_ufs(dir->i_sb); +out_notlocked: + return err; + +out_fail: + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + goto out; +} + +static int ufs_link (struct dentry * old_dentry, struct inode * dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + int error; + + lock_ufs(dir->i_sb); + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + error = ufs_add_link(dentry, inode); + if (error) { + inode_dec_link_count(inode); + iput(inode); + } else + d_instantiate(dentry, inode); + unlock_ufs(dir->i_sb); + return error; +} + +static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +{ + struct inode * inode; + int err; + + lock_ufs(dir->i_sb); + inode_inc_link_count(dir); + + inode = ufs_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &ufs_dir_inode_operations; + inode->i_fop = &ufs_dir_operations; + inode->i_mapping->a_ops = &ufs_aops; + + inode_inc_link_count(inode); + + err = ufs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = ufs_add_link(dentry, inode); + if (err) + goto out_fail; + unlock_ufs(dir->i_sb); + + unlock_new_inode(inode); + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput (inode); +out_dir: + inode_dec_link_count(dir); + unlock_ufs(dir->i_sb); + goto out; +} + +static int ufs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode * inode = d_inode(dentry); + struct ufs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = ufs_find_entry(dir, &dentry->d_name, &page); + if (!de) + goto out; + + err = ufs_delete_entry(dir, de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int ufs_rmdir (struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = d_inode(dentry); + int err= -ENOTEMPTY; + + lock_ufs(dir->i_sb); + if (ufs_empty_dir (inode)) { + err = ufs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + unlock_ufs(dir->i_sb); + return err; +} + +static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct page *dir_page = NULL; + struct ufs_dir_entry * dir_de = NULL; + struct page *old_page; + struct ufs_dir_entry *old_de; + int err = -ENOENT; + + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = ufs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct ufs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !ufs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + ufs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + err = ufs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + + ufs_delete_entry(old_dir, old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + ufs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations ufs_dir_inode_operations = { + .create = ufs_create, + .lookup = ufs_lookup, + .link = ufs_link, + .unlink = ufs_unlink, + .symlink = ufs_symlink, + .mkdir = ufs_mkdir, + .rmdir = ufs_rmdir, + .mknod = ufs_mknod, + .rename = ufs_rename, +}; diff --git a/kernel/fs/ufs/super.c b/kernel/fs/ufs/super.c new file mode 100644 index 000000000..dc33f9416 --- /dev/null +++ b/kernel/fs/ufs/super.c @@ -0,0 +1,1524 @@ +/* + * linux/fs/ufs/super.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +/* Derived from + * + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +/* + * Inspired by + * + * linux/fs/ufs/super.c + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) + * + * Kernel module support added on 96/04/26 by + * Stefan Reinauer + * + * Module usage counts added on 96/04/29 by + * Gertjan van Wingerde + * + * Clean swab support on 19970406 by + * Francois-Rene Rideau + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * NeXTstep support added on February 5th 1998 by + * Niels Kristian Bech Jensen . + * + * write support Daniel Pirkl 1998 + * + * HP/UX hfs filesystem support added by + * Martin K. Petersen , August 1999 + * + * UFS2 (of FreeBSD 5.x) support added by + * Niraj Kumar , Jan 2004 + * + * UFS2 write support added by + * Evgeniy Dushistov , 2007 + */ + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +void lock_ufs(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + + mutex_lock(&sbi->mutex); + sbi->mutex_owner = current; +} + +void unlock_ufs(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + + sbi->mutex_owner = NULL; + mutex_unlock(&sbi->mutex); +} + +static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct inode *inode; + + if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg) + return ERR_PTR(-ESTALE); + + inode = ufs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); +} + +static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); +} + +static struct dentry *ufs_get_parent(struct dentry *child) +{ + struct qstr dot_dot = QSTR_INIT("..", 2); + ino_t ino; + + ino = ufs_inode_by_name(d_inode(child), &dot_dot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino)); +} + +static const struct export_operations ufs_export_ops = { + .fh_to_dentry = ufs_fh_to_dentry, + .fh_to_parent = ufs_fh_to_parent, + .get_parent = ufs_get_parent, +}; + +#ifdef CONFIG_UFS_DEBUG +/* + * Print contents of ufs_super_block, useful for debugging + */ +static void ufs_print_super_stuff(struct super_block *sb, + struct ufs_super_block_first *usb1, + struct ufs_super_block_second *usb2, + struct ufs_super_block_third *usb3) +{ + u32 magic = fs32_to_cpu(sb, usb3->fs_magic); + + pr_debug("ufs_print_super_stuff\n"); + pr_debug(" magic: 0x%x\n", magic); + if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { + pr_debug(" fs_size: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); + pr_debug(" fs_dsize: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); + pr_debug(" bsize: %u\n", + fs32_to_cpu(sb, usb1->fs_bsize)); + pr_debug(" fsize: %u\n", + fs32_to_cpu(sb, usb1->fs_fsize)); + pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); + pr_debug(" fs_sblockloc: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); + pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); + pr_debug(" cs_nbfree(No of free blocks): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); + pr_info(" cs_nifree(Num of free inodes): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); + pr_info(" cs_nffree(Num of free frags): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); + pr_info(" fs_maxsymlinklen: %u\n", + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); + } else { + pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); + pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); + pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); + pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); + pr_debug(" cgoffset: %u\n", + fs32_to_cpu(sb, usb1->fs_cgoffset)); + pr_debug(" ~cgmask: 0x%x\n", + ~fs32_to_cpu(sb, usb1->fs_cgmask)); + pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); + pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); + pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); + pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); + pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); + pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); + pr_debug(" fragshift: %u\n", + fs32_to_cpu(sb, usb1->fs_fragshift)); + pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); + pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); + pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); + pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); + pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); + pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); + pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); + pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); + pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); + pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); + pr_debug(" fstodb: %u\n", + fs32_to_cpu(sb, usb1->fs_fsbtodb)); + pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); + pr_debug(" ndir %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); + pr_debug(" nifree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); + pr_debug(" nbfree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); + pr_debug(" nffree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); + } + pr_debug("\n"); +} + +/* + * Print contents of ufs_cylinder_group, useful for debugging + */ +static void ufs_print_cylinder_stuff(struct super_block *sb, + struct ufs_cylinder_group *cg) +{ + pr_debug("\nufs_print_cylinder_stuff\n"); + pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); + pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); + pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); + pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); + pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); + pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); + pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); + pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); + pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); + pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); + pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); + pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); + pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); + pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); + pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", + fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), + fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), + fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), + fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); + pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); + pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); + pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); + pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); + pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); + pr_debug(" clustersumoff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); + pr_debug(" clusteroff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); + pr_debug(" nclusterblks %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); + pr_debug("\n"); +} +#else +# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ +# define ufs_print_cylinder_stuff(sb, cg) /**/ +#endif /* CONFIG_UFS_DEBUG */ + +static const struct super_operations ufs_super_ops; + +void ufs_error (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct va_format vaf; + va_list args; + + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + if (!(sb->s_flags & MS_RDONLY)) { + usb1->fs_clean = UFS_FSBAD; + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + ufs_mark_sb_dirty(sb); + sb->s_flags |= MS_RDONLY; + } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { + case UFS_MOUNT_ONERROR_PANIC: + panic("panic (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + case UFS_MOUNT_ONERROR_LOCK: + case UFS_MOUNT_ONERROR_UMOUNT: + case UFS_MOUNT_ONERROR_REPAIR: + pr_crit("error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + } + va_end(args); +} + +void ufs_panic (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct va_format vaf; + va_list args; + + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + if (!(sb->s_flags & MS_RDONLY)) { + usb1->fs_clean = UFS_FSBAD; + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + ufs_mark_sb_dirty(sb); + } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + sb->s_flags |= MS_RDONLY; + pr_crit("panic (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); +} + +void ufs_warning (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); +} + +enum { + Opt_type_old = UFS_MOUNT_UFSTYPE_OLD, + Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86, + Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN, + Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS, + Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD, + Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2, + Opt_type_hp = UFS_MOUNT_UFSTYPE_HP, + Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD, + Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP, + Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP, + Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC, + Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK, + Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT, + Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_type_old, "ufstype=old"}, + {Opt_type_sunx86, "ufstype=sunx86"}, + {Opt_type_sun, "ufstype=sun"}, + {Opt_type_sunos, "ufstype=sunos"}, + {Opt_type_44bsd, "ufstype=44bsd"}, + {Opt_type_ufs2, "ufstype=ufs2"}, + {Opt_type_ufs2, "ufstype=5xbsd"}, + {Opt_type_hp, "ufstype=hp"}, + {Opt_type_nextstepcd, "ufstype=nextstep-cd"}, + {Opt_type_nextstep, "ufstype=nextstep"}, + {Opt_type_openstep, "ufstype=openstep"}, +/*end of possible ufs types */ + {Opt_onerror_panic, "onerror=panic"}, + {Opt_onerror_lock, "onerror=lock"}, + {Opt_onerror_umount, "onerror=umount"}, + {Opt_onerror_repair, "onerror=repair"}, + {Opt_err, NULL} +}; + +static int ufs_parse_options (char * options, unsigned * mount_options) +{ + char * p; + + UFSD("ENTER\n"); + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_type_old: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_OLD); + break; + case Opt_type_sunx86: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_SUNx86); + break; + case Opt_type_sun: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_SUN); + break; + case Opt_type_sunos: + ufs_clear_opt(*mount_options, UFSTYPE); + ufs_set_opt(*mount_options, UFSTYPE_SUNOS); + break; + case Opt_type_44bsd: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_44BSD); + break; + case Opt_type_ufs2: + ufs_clear_opt(*mount_options, UFSTYPE); + ufs_set_opt(*mount_options, UFSTYPE_UFS2); + break; + case Opt_type_hp: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_HP); + break; + case Opt_type_nextstepcd: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP_CD); + break; + case Opt_type_nextstep: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP); + break; + case Opt_type_openstep: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_OPENSTEP); + break; + case Opt_onerror_panic: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_PANIC); + break; + case Opt_onerror_lock: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_LOCK); + break; + case Opt_onerror_umount: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_UMOUNT); + break; + case Opt_onerror_repair: + pr_err("Unable to do repair on error, will lock lock instead\n"); + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_REPAIR); + break; + default: + pr_err("Invalid option: \"%s\" or missing value\n", p); + return 0; + } + } + return 1; +} + +/* + * Different types of UFS hold fs_cstotal in different + * places, and use different data structure for it. + * To make things simpler we just copy fs_cstotal to ufs_sb_private_info + */ +static void ufs_setup_cstotal(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + + UFSD("ENTER, mtype=%u\n", mtype); + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + mtype == UFS_MOUNT_UFSTYPE_UFS2) { + /*we have statistic in different place, then usual*/ + uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); + uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree); + uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree); + uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree); + } else { + uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir); + uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree); + uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree); + uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree); + } + UFSD("EXIT\n"); +} + +/* + * Read on-disk structures associated with cylinder groups + */ +static int ufs_read_cylinder_structures(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_buffer_head * ubh; + unsigned char * base, * space; + unsigned size, blks, i; + + UFSD("ENTER\n"); + + /* + * Read cs structures from (usually) first data block + * on the device. + */ + size = uspi->s_cssize; + blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + base = space = kmalloc(size, GFP_NOFS); + if (!base) + goto failed; + sbi->s_csp = (struct ufs_csum *)space; + for (i = 0; i < blks; i += uspi->s_fpb) { + size = uspi->s_bsize; + if (i + uspi->s_fpb > blks) + size = (blks - i) * uspi->s_fsize; + + ubh = ubh_bread(sb, uspi->s_csaddr + i, size); + + if (!ubh) + goto failed; + + ubh_ubhcpymem (space, ubh, size); + + space += size; + ubh_brelse (ubh); + ubh = NULL; + } + + /* + * Read cylinder group (we read only first fragment from block + * at this time) and prepare internal data structures for cg caching. + */ + if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS))) + goto failed; + for (i = 0; i < uspi->s_ncg; i++) + sbi->s_ucg[i] = NULL; + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { + sbi->s_ucpi[i] = NULL; + sbi->s_cgno[i] = UFS_CGNO_EMPTY; + } + for (i = 0; i < uspi->s_ncg; i++) { + UFSD("read cg %u\n", i); + if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i)))) + goto failed; + if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data)) + goto failed; + + ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); + } + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { + if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS))) + goto failed; + sbi->s_cgno[i] = UFS_CGNO_EMPTY; + } + sbi->s_cg_loaded = 0; + UFSD("EXIT\n"); + return 1; + +failed: + kfree (base); + if (sbi->s_ucg) { + for (i = 0; i < uspi->s_ncg; i++) + if (sbi->s_ucg[i]) + brelse (sbi->s_ucg[i]); + kfree (sbi->s_ucg); + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) + kfree (sbi->s_ucpi[i]); + } + UFSD("EXIT (FAILED)\n"); + return 0; +} + +/* + * Sync our internal copy of fs_cstotal with disk + */ +static void ufs_put_cstotal(struct super_block *sb) +{ + unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + + UFSD("ENTER\n"); + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + mtype == UFS_MOUNT_UFSTYPE_UFS2) { + /*we have statistic in different place, then usual*/ + usb2->fs_un.fs_u2.cs_ndir = + cpu_to_fs64(sb, uspi->cs_total.cs_ndir); + usb2->fs_un.fs_u2.cs_nbfree = + cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); + usb3->fs_un1.fs_u2.cs_nifree = + cpu_to_fs64(sb, uspi->cs_total.cs_nifree); + usb3->fs_un1.fs_u2.cs_nffree = + cpu_to_fs64(sb, uspi->cs_total.cs_nffree); + } else { + usb1->fs_cstotal.cs_ndir = + cpu_to_fs32(sb, uspi->cs_total.cs_ndir); + usb1->fs_cstotal.cs_nbfree = + cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); + usb1->fs_cstotal.cs_nifree = + cpu_to_fs32(sb, uspi->cs_total.cs_nifree); + usb1->fs_cstotal.cs_nffree = + cpu_to_fs32(sb, uspi->cs_total.cs_nffree); + } + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + ufs_print_super_stuff(sb, usb1, usb2, usb3); + UFSD("EXIT\n"); +} + +/** + * ufs_put_super_internal() - put on-disk intrenal structures + * @sb: pointer to super_block structure + * Put on-disk structures associated with cylinder groups + * and write them back to disk, also update cs_total on disk + */ +static void ufs_put_super_internal(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_buffer_head * ubh; + unsigned char * base, * space; + unsigned blks, size, i; + + + UFSD("ENTER\n"); + + ufs_put_cstotal(sb); + size = uspi->s_cssize; + blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + base = space = (char*) sbi->s_csp; + for (i = 0; i < blks; i += uspi->s_fpb) { + size = uspi->s_bsize; + if (i + uspi->s_fpb > blks) + size = (blks - i) * uspi->s_fsize; + + ubh = ubh_bread(sb, uspi->s_csaddr + i, size); + + ubh_memcpyubh (ubh, space, size); + space += size; + ubh_mark_buffer_uptodate (ubh, 1); + ubh_mark_buffer_dirty (ubh); + ubh_brelse (ubh); + } + for (i = 0; i < sbi->s_cg_loaded; i++) { + ufs_put_cylinder (sb, i); + kfree (sbi->s_ucpi[i]); + } + for (; i < UFS_MAX_GROUP_LOADED; i++) + kfree (sbi->s_ucpi[i]); + for (i = 0; i < uspi->s_ncg; i++) + brelse (sbi->s_ucg[i]); + kfree (sbi->s_ucg); + kfree (base); + + UFSD("EXIT\n"); +} + +static int ufs_sync_fs(struct super_block *sb, int wait) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_third * usb3; + unsigned flags; + + lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); + + UFSD("ENTER\n"); + + flags = UFS_SB(sb)->s_flags; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + usb3 = ubh_get_usb_third(uspi); + + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN || + (flags & UFS_ST_MASK) == UFS_ST_SUNOS || + (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ufs_put_cstotal(sb); + + UFSD("EXIT\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); + + return 0; +} + +static void delayed_sync_fs(struct work_struct *work) +{ + struct ufs_sb_info *sbi; + + sbi = container_of(work, struct ufs_sb_info, sync_work.work); + + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + ufs_sync_fs(sbi->sb, 1); +} + +void ufs_mark_sb_dirty(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + unsigned long delay; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->sync_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); +} + +static void ufs_put_super(struct super_block *sb) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + + UFSD("ENTER\n"); + + if (!(sb->s_flags & MS_RDONLY)) + ufs_put_super_internal(sb); + cancel_delayed_work_sync(&sbi->sync_work); + + ubh_brelse_uspi (sbi->s_uspi); + kfree (sbi->s_uspi); + mutex_destroy(&sbi->mutex); + kfree (sbi); + sb->s_fs_info = NULL; + UFSD("EXIT\n"); + return; +} + +static int ufs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ufs_sb_info * sbi; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_second * usb2; + struct ufs_super_block_third * usb3; + struct ufs_buffer_head * ubh; + struct inode *inode; + unsigned block_size, super_block_size; + unsigned flags; + unsigned super_block_offset; + unsigned maxsymlen; + int ret = -EINVAL; + + uspi = NULL; + ubh = NULL; + flags = 0; + + UFSD("ENTER\n"); + +#ifndef CONFIG_UFS_FS_WRITE + if (!(sb->s_flags & MS_RDONLY)) { + pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); + return -EROFS; + } +#endif + + sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); + if (!sbi) + goto failed_nomem; + sb->s_fs_info = sbi; + sbi->sb = sb; + + UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); + + mutex_init(&sbi->mutex); + mutex_init(&sbi->s_lock); + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); + /* + * Set default mount options + * Parse mount options + */ + sbi->s_mount_opt = 0; + ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); + if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { + pr_err("wrong mount options\n"); + goto failed; + } + if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { + if (!silent) + pr_err("You didn't specify the type of your ufs filesystem\n\n" + "mount -t ufs -o ufstype=" + "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" + ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " + "default is ufstype=old\n"); + ufs_set_opt (sbi->s_mount_opt, UFSTYPE_OLD); + } + + uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL); + sbi->s_uspi = uspi; + if (!uspi) + goto failed; + uspi->s_dirblksize = UFS_SECTOR_SIZE; + super_block_offset=UFS_SBLOCK; + + /* Keep 2Gig file limit. Some UFS variants need to override + this but as I don't know which I'll let those in the know loosen + the rules */ + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { + case UFS_MOUNT_UFSTYPE_44BSD: + UFSD("ufstype=44bsd\n"); + uspi->s_fsize = block_size = 512; + uspi->s_fmask = ~(512 - 1); + uspi->s_fshift = 9; + uspi->s_sbsize = super_block_size = 1536; + uspi->s_sbbase = 0; + flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; + break; + case UFS_MOUNT_UFSTYPE_UFS2: + UFSD("ufstype=ufs2\n"); + super_block_offset=SBLOCK_UFS2; + uspi->s_fsize = block_size = 512; + uspi->s_fmask = ~(512 - 1); + uspi->s_fshift = 9; + uspi->s_sbsize = super_block_size = 1536; + uspi->s_sbbase = 0; + flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; + break; + + case UFS_MOUNT_UFSTYPE_SUN: + UFSD("ufstype=sun\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUN | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_SUNOS: + UFSD("ufstype=sunos\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = 2048; + super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_SUNOS | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_SUNx86: + UFSD("ufstype=sunx86\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUNx86 | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_OLD: + UFSD("ufstype=old\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + pr_info("ufstype=old is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_NEXTSTEP: + UFSD("ufstype=nextstep\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + pr_info("ufstype=nextstep is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: + UFSD("ufstype=nextstep-cd\n"); + uspi->s_fsize = block_size = 2048; + uspi->s_fmask = ~(2048 - 1); + uspi->s_fshift = 11; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + pr_info("ufstype=nextstep-cd is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_OPENSTEP: + UFSD("ufstype=openstep\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + pr_info("ufstype=openstep is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_HP: + UFSD("ufstype=hp\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + pr_info("ufstype=hp is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + default: + if (!silent) + pr_err("unknown ufstype\n"); + goto failed; + } + +again: + if (!sb_set_blocksize(sb, block_size)) { + pr_err("failed to set blocksize\n"); + goto failed; + } + + /* + * read ufs super block from device + */ + + ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + super_block_offset/block_size, super_block_size); + + if (!ubh) + goto failed; + + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + /* Sort out mod used on SunOS 4.1.3 for fs_state */ + uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat); + if (((flags & UFS_ST_MASK) == UFS_ST_SUNOS) && + (uspi->s_postblformat != UFS_42POSTBLFMT)) { + flags &= ~UFS_ST_MASK; + flags |= UFS_ST_SUN; + } + + /* + * Check ufs magic number + */ + sbi->s_bytesex = BYTESEX_LE; + switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { + case UFS_MAGIC: + case UFS_MAGIC_BW: + case UFS2_MAGIC: + case UFS_MAGIC_LFN: + case UFS_MAGIC_FEA: + case UFS_MAGIC_4GB: + goto magic_found; + } + sbi->s_bytesex = BYTESEX_BE; + switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { + case UFS_MAGIC: + case UFS_MAGIC_BW: + case UFS2_MAGIC: + case UFS_MAGIC_LFN: + case UFS_MAGIC_FEA: + case UFS_MAGIC_4GB: + goto magic_found; + } + + if ((((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP) + || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD) + || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_OPENSTEP)) + && uspi->s_sbbase < 256) { + ubh_brelse_uspi(uspi); + ubh = NULL; + uspi->s_sbbase += 8; + goto again; + } + if (!silent) + pr_err("%s(): bad magic number\n", __func__); + goto failed; + +magic_found: + /* + * Check block and fragment sizes + */ + uspi->s_bsize = fs32_to_cpu(sb, usb1->fs_bsize); + uspi->s_fsize = fs32_to_cpu(sb, usb1->fs_fsize); + uspi->s_sbsize = fs32_to_cpu(sb, usb1->fs_sbsize); + uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); + uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); + + if (!is_power_of_2(uspi->s_fsize)) { + pr_err("%s(): fragment size %u is not a power of 2\n", + __func__, uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize < 512) { + pr_err("%s(): fragment size %u is too small\n", + __func__, uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize > 4096) { + pr_err("%s(): fragment size %u is too large\n", + __func__, uspi->s_fsize); + goto failed; + } + if (!is_power_of_2(uspi->s_bsize)) { + pr_err("%s(): block size %u is not a power of 2\n", + __func__, uspi->s_bsize); + goto failed; + } + if (uspi->s_bsize < 4096) { + pr_err("%s(): block size %u is too small\n", + __func__, uspi->s_bsize); + goto failed; + } + if (uspi->s_bsize / uspi->s_fsize > 8) { + pr_err("%s(): too many fragments per block (%u)\n", + __func__, uspi->s_bsize / uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) { + ubh_brelse_uspi(uspi); + ubh = NULL; + block_size = uspi->s_fsize; + super_block_size = uspi->s_sbsize; + UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size); + goto again; + } + + sbi->s_flags = flags;/*after that line some functions use s_flags*/ + ufs_print_super_stuff(sb, usb1, usb2, usb3); + + /* + * Check, if file system was correctly unmounted. + * If not, make it read only. + */ + if (((flags & UFS_ST_MASK) == UFS_ST_44BSD) || + ((flags & UFS_ST_MASK) == UFS_ST_OLD) || + (((flags & UFS_ST_MASK) == UFS_ST_SUN || + (flags & UFS_ST_MASK) == UFS_ST_SUNOS || + (flags & UFS_ST_MASK) == UFS_ST_SUNx86) && + (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) { + switch(usb1->fs_clean) { + case UFS_FSCLEAN: + UFSD("fs is clean\n"); + break; + case UFS_FSSTABLE: + UFSD("fs is stable\n"); + break; + case UFS_FSLOG: + UFSD("fs is logging fs\n"); + break; + case UFS_FSOSF1: + UFSD("fs is DEC OSF/1\n"); + break; + case UFS_FSACTIVE: + pr_err("%s(): fs is active\n", __func__); + sb->s_flags |= MS_RDONLY; + break; + case UFS_FSBAD: + pr_err("%s(): fs is bad\n", __func__); + sb->s_flags |= MS_RDONLY; + break; + default: + pr_err("%s(): can't grok fs_clean 0x%x\n", + __func__, usb1->fs_clean); + sb->s_flags |= MS_RDONLY; + break; + } + } else { + pr_err("%s(): fs needs fsck\n", __func__); + sb->s_flags |= MS_RDONLY; + } + + /* + * Read ufs_super_block into internal data structures + */ + sb->s_op = &ufs_super_ops; + sb->s_export_op = &ufs_export_ops; + + sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); + + uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); + uspi->s_cblkno = fs32_to_cpu(sb, usb1->fs_cblkno); + uspi->s_iblkno = fs32_to_cpu(sb, usb1->fs_iblkno); + uspi->s_dblkno = fs32_to_cpu(sb, usb1->fs_dblkno); + uspi->s_cgoffset = fs32_to_cpu(sb, usb1->fs_cgoffset); + uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); + + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); + uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + } else { + uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); + uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); + } + + uspi->s_ncg = fs32_to_cpu(sb, usb1->fs_ncg); + /* s_bsize already set */ + /* s_fsize already set */ + uspi->s_fpb = fs32_to_cpu(sb, usb1->fs_frag); + uspi->s_minfree = fs32_to_cpu(sb, usb1->fs_minfree); + uspi->s_bmask = fs32_to_cpu(sb, usb1->fs_bmask); + uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); + uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift); + uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); + UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, + uspi->s_fshift); + uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift); + uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb); + /* s_sbsize already set */ + uspi->s_csmask = fs32_to_cpu(sb, usb1->fs_csmask); + uspi->s_csshift = fs32_to_cpu(sb, usb1->fs_csshift); + uspi->s_nindir = fs32_to_cpu(sb, usb1->fs_nindir); + uspi->s_inopb = fs32_to_cpu(sb, usb1->fs_inopb); + uspi->s_nspf = fs32_to_cpu(sb, usb1->fs_nspf); + uspi->s_npsect = ufs_get_fs_npsect(sb, usb1, usb3); + uspi->s_interleave = fs32_to_cpu(sb, usb1->fs_interleave); + uspi->s_trackskew = fs32_to_cpu(sb, usb1->fs_trackskew); + + if (uspi->fs_magic == UFS2_MAGIC) + uspi->s_csaddr = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr); + else + uspi->s_csaddr = fs32_to_cpu(sb, usb1->fs_csaddr); + + uspi->s_cssize = fs32_to_cpu(sb, usb1->fs_cssize); + uspi->s_cgsize = fs32_to_cpu(sb, usb1->fs_cgsize); + uspi->s_ntrak = fs32_to_cpu(sb, usb1->fs_ntrak); + uspi->s_nsect = fs32_to_cpu(sb, usb1->fs_nsect); + uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc); + uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg); + uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg); + uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc); + uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize); + uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3); + uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3); + uspi->s_nrpos = fs32_to_cpu(sb, usb3->fs_nrpos); + uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff); + uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff); + + /* + * Compute another frequently used values + */ + uspi->s_fpbmask = uspi->s_fpb - 1; + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + uspi->s_apbshift = uspi->s_bshift - 3; + else + uspi->s_apbshift = uspi->s_bshift - 2; + + uspi->s_2apbshift = uspi->s_apbshift * 2; + uspi->s_3apbshift = uspi->s_apbshift * 3; + uspi->s_apb = 1 << uspi->s_apbshift; + uspi->s_2apb = 1 << uspi->s_2apbshift; + uspi->s_3apb = 1 << uspi->s_3apbshift; + uspi->s_apbmask = uspi->s_apb - 1; + uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS; + uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift; + uspi->s_inopf = uspi->s_inopb >> uspi->s_fpbshift; + uspi->s_bpf = uspi->s_fsize << 3; + uspi->s_bpfshift = uspi->s_fshift + 3; + uspi->s_bpfmask = uspi->s_bpf - 1; + if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD || + (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_UFS2) + uspi->s_maxsymlinklen = + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); + + if (uspi->fs_magic == UFS2_MAGIC) + maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR); + else + maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR); + if (uspi->s_maxsymlinklen > maxsymlen) { + ufs_warning(sb, __func__, "ufs_read_super: excessive maximum " + "fast symlink size (%u)\n", uspi->s_maxsymlinklen); + uspi->s_maxsymlinklen = maxsymlen; + } + sb->s_max_links = UFS_LINK_MAX; + + inode = ufs_iget(sb, UFS_ROOTINO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto failed; + } + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + ret = -ENOMEM; + goto failed; + } + + ufs_setup_cstotal(sb); + /* + * Read cylinder group structures + */ + if (!(sb->s_flags & MS_RDONLY)) + if (!ufs_read_cylinder_structures(sb)) + goto failed; + + UFSD("EXIT\n"); + return 0; + +failed: + mutex_destroy(&sbi->mutex); + if (ubh) + ubh_brelse_uspi (uspi); + kfree (uspi); + kfree(sbi); + sb->s_fs_info = NULL; + UFSD("EXIT (FAILED)\n"); + return ret; + +failed_nomem: + UFSD("EXIT (NOMEM)\n"); + return -ENOMEM; +} + +static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_third * usb3; + unsigned new_mount_opt, ufstype; + unsigned flags; + + sync_filesystem(sb); + lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); + uspi = UFS_SB(sb)->s_uspi; + flags = UFS_SB(sb)->s_flags; + usb1 = ubh_get_usb_first(uspi); + usb3 = ubh_get_usb_third(uspi); + + /* + * Allow the "check" option to be passed as a remount option. + * It is not possible to change ufstype option during remount + */ + ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + new_mount_opt = 0; + ufs_set_opt (new_mount_opt, ONERROR_LOCK); + if (!ufs_parse_options (data, &new_mount_opt)) { + mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); + return -EINVAL; + } + if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { + new_mount_opt |= ufstype; + } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { + pr_err("ufstype can't be changed during remount\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); + return -EINVAL; + } + + if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + UFS_SB(sb)->s_mount_opt = new_mount_opt; + mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); + return 0; + } + + /* + * fs was mouted as rw, remounting ro + */ + if (*mount_flags & MS_RDONLY) { + ufs_put_super_internal(sb); + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN + || (flags & UFS_ST_MASK) == UFS_ST_SUNOS + || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + sb->s_flags |= MS_RDONLY; + } else { + /* + * fs was mounted as ro, remounting rw + */ +#ifndef CONFIG_UFS_FS_WRITE + pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); + return -EINVAL; +#else + if (ufstype != UFS_MOUNT_UFSTYPE_SUN && + ufstype != UFS_MOUNT_UFSTYPE_SUNOS && + ufstype != UFS_MOUNT_UFSTYPE_44BSD && + ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && + ufstype != UFS_MOUNT_UFSTYPE_UFS2) { + pr_err("this ufstype is read-only supported\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); + return -EINVAL; + } + if (!ufs_read_cylinder_structures(sb)) { + pr_err("failed during remounting\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); + return -EPERM; + } + sb->s_flags &= ~MS_RDONLY; +#endif + } + UFS_SB(sb)->s_mount_opt = new_mount_opt; + mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); + return 0; +} + +static int ufs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct ufs_sb_info *sbi = UFS_SB(root->d_sb); + unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + const struct match_token *tp = tokens; + + while (tp->token != Opt_onerror_panic && tp->token != mval) + ++tp; + BUG_ON(tp->token == Opt_onerror_panic); + seq_printf(seq, ",%s", tp->pattern); + + mval = sbi->s_mount_opt & UFS_MOUNT_ONERROR; + while (tp->token != Opt_err && tp->token != mval) + ++tp; + BUG_ON(tp->token == Opt_err); + seq_printf(seq, ",%s", tp->pattern); + + return 0; +} + +static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; + unsigned flags = UFS_SB(sb)->s_flags; + struct ufs_super_block_third *usb3; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + lock_ufs(sb); + + usb3 = ubh_get_usb_third(uspi); + + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + buf->f_type = UFS2_MAGIC; + buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + } else { + buf->f_type = UFS_MAGIC; + buf->f_blocks = uspi->s_dsize; + } + buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) + + uspi->cs_total.cs_nffree; + buf->f_ffree = uspi->cs_total.cs_nifree; + buf->f_bsize = sb->s_blocksize; + buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) + ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; + buf->f_files = uspi->s_ncg * uspi->s_ipg; + buf->f_namelen = UFS_MAXNAMLEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + unlock_ufs(sb); + + return 0; +} + +static struct kmem_cache * ufs_inode_cachep; + +static struct inode *ufs_alloc_inode(struct super_block *sb) +{ + struct ufs_inode_info *ei; + + ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void ufs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); +} + +static void ufs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ufs_i_callback); +} + +static void init_once(void *foo) +{ + struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int __init init_inodecache(void) +{ + ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", + sizeof(struct ufs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ufs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ufs_inode_cachep); +} + +static const struct super_operations ufs_super_ops = { + .alloc_inode = ufs_alloc_inode, + .destroy_inode = ufs_destroy_inode, + .write_inode = ufs_write_inode, + .evict_inode = ufs_evict_inode, + .put_super = ufs_put_super, + .sync_fs = ufs_sync_fs, + .statfs = ufs_statfs, + .remount_fs = ufs_remount, + .show_options = ufs_show_options, +}; + +static struct dentry *ufs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super); +} + +static struct file_system_type ufs_fs_type = { + .owner = THIS_MODULE, + .name = "ufs", + .mount = ufs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ufs"); + +static int __init init_ufs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ufs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_ufs_fs(void) +{ + unregister_filesystem(&ufs_fs_type); + destroy_inodecache(); +} + +module_init(init_ufs_fs) +module_exit(exit_ufs_fs) +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/ufs/swab.h b/kernel/fs/ufs/swab.h new file mode 100644 index 000000000..8d974c4fd --- /dev/null +++ b/kernel/fs/ufs/swab.h @@ -0,0 +1,115 @@ +/* + * linux/fs/ufs/swab.h + * + * Copyright (C) 1997, 1998 Francois-Rene Rideau + * Copyright (C) 1998 Jakub Jelinek + * Copyright (C) 2001 Christoph Hellwig + */ + +#ifndef _UFS_SWAB_H +#define _UFS_SWAB_H + +/* + * Notes: + * HERE WE ASSUME EITHER BIG OR LITTLE ENDIAN UFSes + * in case there are ufs implementations that have strange bytesexes, + * you'll need to modify code here as well as in ufs_super.c and ufs_fs.h + * to support them. + */ + +enum { + BYTESEX_LE, + BYTESEX_BE +}; + +static inline u64 +fs64_to_cpu(struct super_block *sbp, __fs64 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le64_to_cpu((__force __le64)n); + else + return be64_to_cpu((__force __be64)n); +} + +static inline __fs64 +cpu_to_fs64(struct super_block *sbp, u64 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs64)cpu_to_le64(n); + else + return (__force __fs64)cpu_to_be64(n); +} + +static inline u32 +fs32_to_cpu(struct super_block *sbp, __fs32 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline __fs32 +cpu_to_fs32(struct super_block *sbp, u32 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs32)cpu_to_le32(n); + else + return (__force __fs32)cpu_to_be32(n); +} + +static inline void +fs32_add(struct super_block *sbp, __fs32 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, d); + else + be32_add_cpu((__be32 *)n, d); +} + +static inline void +fs32_sub(struct super_block *sbp, __fs32 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, -d); + else + be32_add_cpu((__be32 *)n, -d); +} + +static inline u16 +fs16_to_cpu(struct super_block *sbp, __fs16 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline __fs16 +cpu_to_fs16(struct super_block *sbp, u16 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs16)cpu_to_le16(n); + else + return (__force __fs16)cpu_to_be16(n); +} + +static inline void +fs16_add(struct super_block *sbp, __fs16 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le16_add_cpu((__le16 *)n, d); + else + be16_add_cpu((__be16 *)n, d); +} + +static inline void +fs16_sub(struct super_block *sbp, __fs16 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le16_add_cpu((__le16 *)n, -d); + else + be16_add_cpu((__be16 *)n, -d); +} + +#endif /* _UFS_SWAB_H */ diff --git a/kernel/fs/ufs/symlink.c b/kernel/fs/ufs/symlink.c new file mode 100644 index 000000000..5b537e2fd --- /dev/null +++ b/kernel/fs/ufs/symlink.c @@ -0,0 +1,53 @@ +/* + * linux/fs/ufs/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/symlink.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 symlink handling code + */ + +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" + + +static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ufs_inode_info *p = UFS_I(d_inode(dentry)); + nd_set_link(nd, (char*)p->i_u1.i_symlink); + return NULL; +} + +const struct inode_operations ufs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ufs_follow_link, + .setattr = ufs_setattr, +}; + +const struct inode_operations ufs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ufs_setattr, +}; diff --git a/kernel/fs/ufs/truncate.c b/kernel/fs/ufs/truncate.c new file mode 100644 index 000000000..21154704c --- /dev/null +++ b/kernel/fs/ufs/truncate.c @@ -0,0 +1,523 @@ +/* + * linux/fs/ufs/truncate.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/truncate.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/truncate.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +/* + * Real random numbers for secure rm added 94/02/18 + * Idea from Pierre del Perugia + */ + +/* + * Adoptation to use page cache and UFS2 write support by + * Evgeniy Dushistov , 2006-2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * Secure deletion currently doesn't work. It interacts very badly + * with buffers shared with memory mappings, and for that reason + * can't be done in the truncate() routines. It should instead be + * done separately in "release()" before calling the truncate routines + * that will release the actual file blocks. + * + * Linus + */ + +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) + + +static int ufs_trunc_direct(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + void *p; + u64 frag1, frag2, frag3, frag4, block1, block2; + unsigned frag_to_free, free_count; + unsigned i, tmp; + int retry; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag_to_free = 0; + free_count = 0; + retry = 0; + + frag1 = DIRECT_FRAGMENT; + frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); + frag3 = frag4 & ~uspi->s_fpbmask; + block1 = block2 = 0; + if (frag2 > frag3) { + frag2 = frag4; + frag3 = frag4 = 0; + } else if (frag2 < frag3) { + block1 = ufs_fragstoblks (frag2); + block2 = ufs_fragstoblks (frag3); + } + + UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," + " frag3 %llu, frag4 %llu\n", inode->i_ino, + (unsigned long long)frag1, (unsigned long long)frag2, + (unsigned long long)block1, (unsigned long long)block2, + (unsigned long long)frag3, (unsigned long long)frag4); + + if (frag1 >= frag2) + goto next1; + + /* + * Free first free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag2 -= frag1; + frag1 = ufs_fragnum (frag1); + + ufs_free_fragments(inode, tmp + frag1, frag2); + mark_inode_dirty(inode); + frag_to_free = tmp + frag1; + +next1: + /* + * Free whole blocks + */ + for (i = block1 ; i < block2; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + ufs_data_ptr_clear(uspi, p); + + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + mark_inode_dirty(inode); + } + + if (free_count > 0) + ufs_free_blocks (inode, frag_to_free, free_count); + + if (frag3 >= frag4) + goto next3; + + /* + * Free last free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic(sb, "ufs_truncate_direct", "internal error"); + frag4 = ufs_fragnum (frag4); + ufs_data_ptr_clear(uspi, p); + + ufs_free_fragments (inode, tmp, frag4); + mark_inode_dirty(inode); + next3: + + UFSD("EXIT: ino %lu\n", inode->i_ino); + return retry; +} + + +static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head * ind_ubh; + void *ind; + u64 tmp, indirect_block, i, frag_to_free; + unsigned free_count; + int retry; + + UFSD("ENTER: ino %lu, offset %llu, p: %p\n", + inode->i_ino, (unsigned long long)offset, p); + + BUG_ON(!p); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag_to_free = 0; + free_count = 0; + retry = 0; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return 0; + ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (ind_ubh); + return 1; + } + if (!ind_ubh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; + for (i = indirect_block; i < uspi->s_apb; i++) { + ind = ubh_get_data_ptr(uspi, ind_ubh, i); + tmp = ufs_data_ptr_to_cpu(sb, ind); + if (!tmp) + continue; + + ufs_data_ptr_clear(uspi, ind); + ubh_mark_buffer_dirty(ind_ubh); + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + + mark_inode_dirty(inode); + } + + if (free_count > 0) { + ufs_free_blocks (inode, frag_to_free, free_count); + } + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, ind_ubh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks (inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(ind_ubh); + ind_ubh = NULL; + } + if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) + ubh_sync_block(ind_ubh); + ubh_brelse (ind_ubh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + + return retry; +} + +static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head *dind_bh; + u64 i, tmp, dindirect_block; + void *dind; + int retry = 0; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + dindirect_block = (DIRECT_BLOCK > offset) + ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; + retry = 0; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return 0; + dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (dind_bh); + return 1; + } + if (!dind_bh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + for (i = dindirect_block ; i < uspi->s_apb ; i++) { + dind = ubh_get_data_ptr(uspi, dind_bh, i); + tmp = ufs_data_ptr_to_cpu(sb, dind); + if (!tmp) + continue; + retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); + ubh_mark_buffer_dirty(dind_bh); + } + + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, dind_bh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(dind_bh); + dind_bh = NULL; + } + if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) + ubh_sync_block(dind_bh); + ubh_brelse (dind_bh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + + return retry; +} + +static int ufs_trunc_tindirect(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_inode_info *ufsi = UFS_I(inode); + struct ufs_buffer_head * tind_bh; + u64 tindirect_block, tmp, i; + void *tind, *p; + int retry; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + retry = 0; + + tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) + ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; + + p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); + if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) + return 0; + tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (tind_bh); + return 1; + } + if (!tind_bh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + for (i = tindirect_block ; i < uspi->s_apb ; i++) { + tind = ubh_get_data_ptr(uspi, tind_bh, i); + retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + + uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); + ubh_mark_buffer_dirty(tind_bh); + } + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, tind_bh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(tind_bh); + tind_bh = NULL; + } + if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) + ubh_sync_block(tind_bh); + ubh_brelse (tind_bh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + return retry; +} + +static int ufs_alloc_lastblock(struct inode *inode) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i, end; + sector_t lastfrag; + struct page *lastpage; + struct buffer_head *bh; + u64 phys64; + + lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; + + if (!lastfrag) + goto out; + + lastfrag--; + + lastpage = ufs_get_locked_page(mapping, lastfrag >> + (PAGE_CACHE_SHIFT - inode->i_blkbits)); + if (IS_ERR(lastpage)) { + err = -EIO; + goto out; + } + + end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); + bh = page_buffers(lastpage); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; + + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); + } + + if (lastfrag >= UFS_IND_FRAGMENT) { + end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; + phys64 = bh->b_blocknr + 1; + for (i = 0; i < end; ++i) { + bh = sb_getblk(sb, i + phys64); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +out_unlock: + ufs_put_locked_page(lastpage); +out: + return err; +} + +int ufs_truncate(struct inode *inode, loff_t old_i_size) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int retry, err = 0; + + UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + inode->i_ino, (unsigned long long)i_size_read(inode), + (unsigned long long)old_i_size); + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + err = ufs_alloc_lastblock(inode); + + if (err) { + i_size_write(inode, old_i_size); + goto out; + } + + block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); + + while (1) { + retry = ufs_trunc_direct(inode); + retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_IND_BLOCK)); + retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_DIND_BLOCK)); + retry |= ufs_trunc_tindirect (inode); + if (!retry) + break; + if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) + ufs_sync_inode (inode); + yield(); + } + + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_lastfrag = DIRECT_FRAGMENT; + mark_inode_dirty(inode); +out: + UFSD("EXIT: err %d\n", err); + return err; +} + +int ufs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + loff_t old_i_size = inode->i_size; + + /* XXX(truncate): truncate_setsize should be called last */ + truncate_setsize(inode, attr->ia_size); + + lock_ufs(inode->i_sb); + error = ufs_truncate(inode, old_i_size); + unlock_ufs(inode->i_sb); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations ufs_file_inode_operations = { + .setattr = ufs_setattr, +}; diff --git a/kernel/fs/ufs/ufs.h b/kernel/fs/ufs/ufs.h new file mode 100644 index 000000000..cf6368d42 --- /dev/null +++ b/kernel/fs/ufs/ufs.h @@ -0,0 +1,176 @@ +#ifndef _UFS_UFS_H +#define _UFS_UFS_H 1 + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define UFS_MAX_GROUP_LOADED 8 +#define UFS_CGNO_EMPTY ((unsigned)-1) + +struct ufs_sb_private_info; +struct ufs_cg_private_info; +struct ufs_csum; + +struct ufs_sb_info { + struct ufs_sb_private_info * s_uspi; + struct ufs_csum * s_csp; + unsigned s_bytesex; + unsigned s_flags; + struct buffer_head ** s_ucg; + struct ufs_cg_private_info * s_ucpi[UFS_MAX_GROUP_LOADED]; + unsigned s_cgno[UFS_MAX_GROUP_LOADED]; + unsigned short s_cg_loaded; + unsigned s_mount_opt; + struct mutex mutex; + struct task_struct *mutex_owner; + struct super_block *sb; + int work_queued; /* non-zero if the delayed work is queued */ + struct delayed_work sync_work; /* FS sync delayed work */ + spinlock_t work_lock; /* protects sync_work and work_queued */ + struct mutex s_lock; +}; + +struct ufs_inode_info { + union { + __fs32 i_data[15]; + __u8 i_symlink[2 * 4 * 15]; + __fs64 u2_i_data[15]; + } i_u1; + __u32 i_flags; + __u32 i_shadow; + __u32 i_unused1; + __u32 i_unused2; + __u32 i_oeftflag; + __u16 i_osync; + __u64 i_lastfrag; + __u32 i_dir_start_lookup; + struct inode vfs_inode; +}; + +/* mount options */ +#define UFS_MOUNT_ONERROR 0x0000000F +#define UFS_MOUNT_ONERROR_PANIC 0x00000001 +#define UFS_MOUNT_ONERROR_LOCK 0x00000002 +#define UFS_MOUNT_ONERROR_UMOUNT 0x00000004 +#define UFS_MOUNT_ONERROR_REPAIR 0x00000008 + +#define UFS_MOUNT_UFSTYPE 0x0000FFF0 +#define UFS_MOUNT_UFSTYPE_OLD 0x00000010 +#define UFS_MOUNT_UFSTYPE_44BSD 0x00000020 +#define UFS_MOUNT_UFSTYPE_SUN 0x00000040 +#define UFS_MOUNT_UFSTYPE_NEXTSTEP 0x00000080 +#define UFS_MOUNT_UFSTYPE_NEXTSTEP_CD 0x00000100 +#define UFS_MOUNT_UFSTYPE_OPENSTEP 0x00000200 +#define UFS_MOUNT_UFSTYPE_SUNx86 0x00000400 +#define UFS_MOUNT_UFSTYPE_HP 0x00000800 +#define UFS_MOUNT_UFSTYPE_UFS2 0x00001000 +#define UFS_MOUNT_UFSTYPE_SUNOS 0x00002000 + +#define ufs_clear_opt(o,opt) o &= ~UFS_MOUNT_##opt +#define ufs_set_opt(o,opt) o |= UFS_MOUNT_##opt +#define ufs_test_opt(o,opt) ((o) & UFS_MOUNT_##opt) + +/* + * Debug code + */ +#ifdef CONFIG_UFS_DEBUG +# define UFSD(f, a...) { \ + pr_debug("UFSD (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + pr_debug(f, ## a); \ + } +#else +# define UFSD(f, a...) /**/ +#endif + +/* balloc.c */ +extern void ufs_free_fragments (struct inode *, u64, unsigned); +extern void ufs_free_blocks (struct inode *, u64, unsigned); +extern u64 ufs_new_fragments(struct inode *, void *, u64, u64, + unsigned, int *, struct page *); + +/* cylinder.c */ +extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned); +extern void ufs_put_cylinder (struct super_block *, unsigned); + +/* dir.c */ +extern const struct inode_operations ufs_dir_inode_operations; +extern int ufs_add_link (struct dentry *, struct inode *); +extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *); +extern int ufs_make_empty(struct inode *, struct inode *); +extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **); +extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); +extern int ufs_empty_dir (struct inode *); +extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); +extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct page *page, struct inode *inode); + +/* file.c */ +extern const struct inode_operations ufs_file_inode_operations; +extern const struct file_operations ufs_file_operations; +extern const struct address_space_operations ufs_aops; + +/* ialloc.c */ +extern void ufs_free_inode (struct inode *inode); +extern struct inode * ufs_new_inode (struct inode *, umode_t); + +/* inode.c */ +extern struct inode *ufs_iget(struct super_block *, unsigned long); +extern int ufs_write_inode (struct inode *, struct writeback_control *); +extern int ufs_sync_inode (struct inode *); +extern void ufs_evict_inode (struct inode *); +extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); + +/* namei.c */ +extern const struct file_operations ufs_dir_operations; + +/* super.c */ +extern __printf(3, 4) +void ufs_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_panic(struct super_block *, const char *, const char *, ...); +void ufs_mark_sb_dirty(struct super_block *sb); + +/* symlink.c */ +extern const struct inode_operations ufs_fast_symlink_inode_operations; +extern const struct inode_operations ufs_symlink_inode_operations; + +/* truncate.c */ +extern int ufs_truncate (struct inode *, loff_t); +extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); + +static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct ufs_inode_info *UFS_I(struct inode *inode) +{ + return container_of(inode, struct ufs_inode_info, vfs_inode); +} + +/* + * Give cylinder group number for a file system block. + * Give cylinder group block number for a file system block. + */ +/* #define ufs_dtog(d) ((d) / uspi->s_fpg) */ +static inline u64 ufs_dtog(struct ufs_sb_private_info * uspi, u64 b) +{ + do_div(b, uspi->s_fpg); + return b; +} +/* #define ufs_dtogd(d) ((d) % uspi->s_fpg) */ +static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b) +{ + return do_div(b, uspi->s_fpg); +} + +extern void lock_ufs(struct super_block *sb); +extern void unlock_ufs(struct super_block *sb); + +#endif /* _UFS_UFS_H */ diff --git a/kernel/fs/ufs/ufs_fs.h b/kernel/fs/ufs/ufs_fs.h new file mode 100644 index 000000000..0cbd5d340 --- /dev/null +++ b/kernel/fs/ufs/ufs_fs.h @@ -0,0 +1,960 @@ +/* + * linux/include/linux/ufs_fs.h + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * Clean swab support by Fare + * just hope no one is using NNUUXXI on __?64 structure elements + * 64-bit clean thanks to Maciej W. Rozycki + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * NeXTstep support added on February 5th 1998 by + * Niels Kristian Bech Jensen . + * + * Write support by Daniel Pirkl + * + * HP/UX hfs filesystem support added by + * Martin K. Petersen , August 1999 + * + * UFS2 (of FreeBSD 5.x) support added by + * Niraj Kumar , Jan 2004 + * + */ + +#ifndef __LINUX_UFS_FS_H +#define __LINUX_UFS_FS_H + +#include +#include +#include +#include +#include + +#include +typedef __u64 __bitwise __fs64; +typedef __u32 __bitwise __fs32; +typedef __u16 __bitwise __fs16; + +#define UFS_BBLOCK 0 +#define UFS_BBSIZE 8192 +#define UFS_SBLOCK 8192 +#define UFS_SBSIZE 8192 + +#define UFS_SECTOR_SIZE 512 +#define UFS_SECTOR_BITS 9 +#define UFS_MAGIC 0x00011954 +#define UFS_MAGIC_BW 0x0f242697 +#define UFS2_MAGIC 0x19540119 +#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ + +/* Copied from FreeBSD */ +/* + * Each disk drive contains some number of filesystems. + * A filesystem consists of a number of cylinder groups. + * Each cylinder group has inodes and data. + * + * A filesystem is described by its super-block, which in turn + * describes the cylinder groups. The super-block is critical + * data and is replicated in each cylinder group to protect against + * catastrophic loss. This is done at `newfs' time and the critical + * super-block data does not change, so the copies need not be + * referenced further unless disaster strikes. + * + * For filesystem fs, the offsets of the various blocks of interest + * are given in the super block as: + * [fs->fs_sblkno] Super-block + * [fs->fs_cblkno] Cylinder group block + * [fs->fs_iblkno] Inode blocks + * [fs->fs_dblkno] Data blocks + * The beginning of cylinder group cg in fs, is given by + * the ``cgbase(fs, cg)'' macro. + * + * Depending on the architecture and the media, the superblock may + * reside in any one of four places. For tiny media where every block + * counts, it is placed at the very front of the partition. Historically, + * UFS1 placed it 8K from the front to leave room for the disk label and + * a small bootstrap. For UFS2 it got moved to 64K from the front to leave + * room for the disk label and a bigger bootstrap, and for really piggy + * systems we check at 256K from the front if the first three fail. In + * all cases the size of the superblock will be SBLOCKSIZE. All values are + * given in byte-offset form, so they do not imply a sector size. The + * SBLOCKSEARCH specifies the order in which the locations should be searched. + */ +#define SBLOCK_FLOPPY 0 +#define SBLOCK_UFS1 8192 +#define SBLOCK_UFS2 65536 +#define SBLOCK_PIGGY 262144 +#define SBLOCKSIZE 8192 +#define SBLOCKSEARCH \ + { SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 } + + +/* HP specific MAGIC values */ + +#define UFS_MAGIC_LFN 0x00095014 /* fs supports filenames > 14 chars */ +#define UFS_CIGAM_LFN 0x14500900 /* srahc 41 < semanelif stroppus sf */ + +#define UFS_MAGIC_SEC 0x00612195 /* B1 security fs */ +#define UFS_CIGAM_SEC 0x95216100 + +#define UFS_MAGIC_FEA 0x00195612 /* fs_featurebits supported */ +#define UFS_CIGAM_FEA 0x12561900 + +#define UFS_MAGIC_4GB 0x05231994 /* fs > 4 GB && fs_featurebits */ +#define UFS_CIGAM_4GB 0x94192305 + +/* Seems somebody at HP goofed here. B1 and lfs are both 0x2 !?! */ +#define UFS_FSF_LFN 0x00000001 /* long file names */ +#define UFS_FSF_B1 0x00000002 /* B1 security */ +#define UFS_FSF_LFS 0x00000002 /* large files */ +#define UFS_FSF_LUID 0x00000004 /* large UIDs */ + +/* End of HP stuff */ + + +#define UFS_BSIZE 8192 +#define UFS_MINBSIZE 4096 +#define UFS_FSIZE 1024 +#define UFS_MAXFRAG (UFS_BSIZE / UFS_FSIZE) + +#define UFS_NDADDR 12 +#define UFS_NINDIR 3 + +#define UFS_IND_BLOCK (UFS_NDADDR + 0) +#define UFS_DIND_BLOCK (UFS_NDADDR + 1) +#define UFS_TIND_BLOCK (UFS_NDADDR + 2) + +#define UFS_NDIR_FRAGMENT (UFS_NDADDR << uspi->s_fpbshift) +#define UFS_IND_FRAGMENT (UFS_IND_BLOCK << uspi->s_fpbshift) +#define UFS_DIND_FRAGMENT (UFS_DIND_BLOCK << uspi->s_fpbshift) +#define UFS_TIND_FRAGMENT (UFS_TIND_BLOCK << uspi->s_fpbshift) + +#define UFS_ROOTINO 2 +#define UFS_FIRST_INO (UFS_ROOTINO + 1) + +#define UFS_USEEFT ((__u16)65535) + +/* fs_clean values */ +#define UFS_FSOK 0x7c269d38 +#define UFS_FSACTIVE ((__s8)0x00) +#define UFS_FSCLEAN ((__s8)0x01) +#define UFS_FSSTABLE ((__s8)0x02) +#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ +#define UFS_FSBAD ((__s8)0xff) + +/* Solaris-specific fs_clean values */ +#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */ +#define UFS_FSLOG ((__s8)0xfd) /* logging fs */ +#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */ + +/* From here to next blank line, s_flags for ufs_sb_info */ +/* directory entry encoding */ +#define UFS_DE_MASK 0x00000010 /* mask for the following */ +#define UFS_DE_OLD 0x00000000 +#define UFS_DE_44BSD 0x00000010 +/* uid encoding */ +#define UFS_UID_MASK 0x00000060 /* mask for the following */ +#define UFS_UID_OLD 0x00000000 +#define UFS_UID_44BSD 0x00000020 +#define UFS_UID_EFT 0x00000040 +/* superblock state encoding */ +#define UFS_ST_MASK 0x00000700 /* mask for the following */ +#define UFS_ST_OLD 0x00000000 +#define UFS_ST_44BSD 0x00000100 +#define UFS_ST_SUN 0x00000200 /* Solaris */ +#define UFS_ST_SUNOS 0x00000300 +#define UFS_ST_SUNx86 0x00000400 /* Solaris x86 */ +/*cylinder group encoding */ +#define UFS_CG_MASK 0x00003000 /* mask for the following */ +#define UFS_CG_OLD 0x00000000 +#define UFS_CG_44BSD 0x00002000 +#define UFS_CG_SUN 0x00001000 +/* filesystem type encoding */ +#define UFS_TYPE_MASK 0x00010000 /* mask for the following */ +#define UFS_TYPE_UFS1 0x00000000 +#define UFS_TYPE_UFS2 0x00010000 + + +/* fs_inodefmt options */ +#define UFS_42INODEFMT -1 +#define UFS_44INODEFMT 2 + +/* + * MINFREE gives the minimum acceptable percentage of file system + * blocks which may be free. If the freelist drops below this level + * only the superuser may continue to allocate blocks. This may + * be set to 0 if no reserve of free blocks is deemed necessary, + * however throughput drops by fifty percent if the file system + * is run at between 95% and 100% full; thus the minimum default + * value of fs_minfree is 5%. However, to get good clustering + * performance, 10% is a better choice. hence we use 10% as our + * default value. With 10% free space, fragmentation is not a + * problem, so we choose to optimize for time. + */ +#define UFS_MINFREE 5 +#define UFS_DEFAULTOPT UFS_OPTTIME + +/* + * Turn file system block numbers into disk block addresses. + * This maps file system blocks to device size blocks. + */ +#define ufs_fsbtodb(uspi, b) ((b) << (uspi)->s_fsbtodb) +#define ufs_dbtofsb(uspi, b) ((b) >> (uspi)->s_fsbtodb) + +/* + * Cylinder group macros to locate things in cylinder groups. + * They calc file system addresses of cylinder group data structures. + */ +#define ufs_cgbase(c) (uspi->s_fpg * (c)) +#define ufs_cgstart(c) ((uspi)->fs_magic == UFS2_MAGIC ? ufs_cgbase(c) : \ + (ufs_cgbase(c) + uspi->s_cgoffset * ((c) & ~uspi->s_cgmask))) +#define ufs_cgsblock(c) (ufs_cgstart(c) + uspi->s_sblkno) /* super blk */ +#define ufs_cgcmin(c) (ufs_cgstart(c) + uspi->s_cblkno) /* cg block */ +#define ufs_cgimin(c) (ufs_cgstart(c) + uspi->s_iblkno) /* inode blk */ +#define ufs_cgdmin(c) (ufs_cgstart(c) + uspi->s_dblkno) /* 1st data */ + +/* + * Macros for handling inode numbers: + * inode number to file system block offset. + * inode number to cylinder group number. + * inode number to file system block address. + */ +#define ufs_inotocg(x) ((x) / uspi->s_ipg) +#define ufs_inotocgoff(x) ((x) % uspi->s_ipg) +#define ufs_inotofsba(x) (((u64)ufs_cgimin(ufs_inotocg(x))) + ufs_inotocgoff(x) / uspi->s_inopf) +#define ufs_inotofsbo(x) ((x) % uspi->s_inopf) + +/* + * Compute the cylinder and rotational position of a cyl block addr. + */ +#define ufs_cbtocylno(bno) \ + ((bno) * uspi->s_nspf / uspi->s_spc) +#define ufs_cbtorpos(bno) \ + ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \ + (((((bno) * uspi->s_nspf % uspi->s_spc) % \ + uspi->s_nsect) * \ + uspi->s_nrpos) / uspi->s_nsect) \ + : \ + ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ + * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ + % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ + * uspi->s_nrpos) / uspi->s_npsect)) + +/* + * The following macros optimize certain frequently calculated + * quantities by using shifts and masks in place of divisions + * modulos and multiplications. + */ +#define ufs_blkoff(loc) ((loc) & uspi->s_qbmask) +#define ufs_fragoff(loc) ((loc) & uspi->s_qfmask) +#define ufs_lblktosize(blk) ((blk) << uspi->s_bshift) +#define ufs_lblkno(loc) ((loc) >> uspi->s_bshift) +#define ufs_numfrags(loc) ((loc) >> uspi->s_fshift) +#define ufs_blkroundup(size) (((size) + uspi->s_qbmask) & uspi->s_bmask) +#define ufs_fragroundup(size) (((size) + uspi->s_qfmask) & uspi->s_fmask) +#define ufs_fragstoblks(frags) ((frags) >> uspi->s_fpbshift) +#define ufs_blkstofrags(blks) ((blks) << uspi->s_fpbshift) +#define ufs_fragnum(fsb) ((fsb) & uspi->s_fpbmask) +#define ufs_blknum(fsb) ((fsb) & ~uspi->s_fpbmask) + +#define UFS_MAXNAMLEN 255 +#define UFS_MAXMNTLEN 512 +#define UFS2_MAXMNTLEN 468 +#define UFS2_MAXVOLLEN 32 +#define UFS_MAXCSBUFS 31 +#define UFS_LINK_MAX 32000 +/* +#define UFS2_NOCSPTRS ((128 / sizeof(void *)) - 4) +*/ +#define UFS2_NOCSPTRS 28 + +/* + * UFS_DIR_PAD defines the directory entries boundaries + * (must be a multiple of 4) + */ +#define UFS_DIR_PAD 4 +#define UFS_DIR_ROUND (UFS_DIR_PAD - 1) +#define UFS_DIR_REC_LEN(name_len) (((name_len) + 1 + 8 + UFS_DIR_ROUND) & ~UFS_DIR_ROUND) + +struct ufs_timeval { + __fs32 tv_sec; + __fs32 tv_usec; +}; + +struct ufs_dir_entry { + __fs32 d_ino; /* inode number of this entry */ + __fs16 d_reclen; /* length of this entry */ + union { + __fs16 d_namlen; /* actual length of d_name */ + struct { + __u8 d_type; /* file type */ + __u8 d_namlen; /* length of string in d_name */ + } d_44; + } d_u; + __u8 d_name[UFS_MAXNAMLEN + 1]; /* file name */ +}; + +struct ufs_csum { + __fs32 cs_ndir; /* number of directories */ + __fs32 cs_nbfree; /* number of free blocks */ + __fs32 cs_nifree; /* number of free inodes */ + __fs32 cs_nffree; /* number of free frags */ +}; +struct ufs2_csum_total { + __fs64 cs_ndir; /* number of directories */ + __fs64 cs_nbfree; /* number of free blocks */ + __fs64 cs_nifree; /* number of free inodes */ + __fs64 cs_nffree; /* number of free frags */ + __fs64 cs_numclusters; /* number of free clusters */ + __fs64 cs_spare[3]; /* future expansion */ +}; + +struct ufs_csum_core { + __u64 cs_ndir; /* number of directories */ + __u64 cs_nbfree; /* number of free blocks */ + __u64 cs_nifree; /* number of free inodes */ + __u64 cs_nffree; /* number of free frags */ + __u64 cs_numclusters; /* number of free clusters */ +}; + +/* + * File system flags + */ +#define UFS_UNCLEAN 0x01 /* file system not clean at mount (unused) */ +#define UFS_DOSOFTDEP 0x02 /* file system using soft dependencies */ +#define UFS_NEEDSFSCK 0x04 /* needs sync fsck (FreeBSD compat, unused) */ +#define UFS_INDEXDIRS 0x08 /* kernel supports indexed directories */ +#define UFS_ACLS 0x10 /* file system has ACLs enabled */ +#define UFS_MULTILABEL 0x20 /* file system is MAC multi-label */ +#define UFS_FLAGS_UPDATED 0x80 /* flags have been moved to new location */ + +#if 0 +/* + * This is the actual superblock, as it is laid out on the disk. + * Do NOT use this structure, because of sizeof(ufs_super_block) > 512 and + * it may occupy several blocks, use + * struct ufs_super_block_(first,second,third) instead. + */ +struct ufs_super_block { + union { + struct { + __fs32 fs_link; /* UNUSED */ + } fs_42; + struct { + __fs32 fs_state; /* file system state flag */ + } fs_sun; + } fs_u0; + __fs32 fs_rlink; /* UNUSED */ + __fs32 fs_sblkno; /* addr of super-block in filesys */ + __fs32 fs_cblkno; /* offset of cyl-block in filesys */ + __fs32 fs_iblkno; /* offset of inode-blocks in filesys */ + __fs32 fs_dblkno; /* offset of first data after cg */ + __fs32 fs_cgoffset; /* cylinder group offset in cylinder */ + __fs32 fs_cgmask; /* used to calc mod fs_ntrak */ + __fs32 fs_time; /* last time written -- time_t */ + __fs32 fs_size; /* number of blocks in fs */ + __fs32 fs_dsize; /* number of data blocks in fs */ + __fs32 fs_ncg; /* number of cylinder groups */ + __fs32 fs_bsize; /* size of basic blocks in fs */ + __fs32 fs_fsize; /* size of frag blocks in fs */ + __fs32 fs_frag; /* number of frags in a block in fs */ +/* these are configuration parameters */ + __fs32 fs_minfree; /* minimum percentage of free blocks */ + __fs32 fs_rotdelay; /* num of ms for optimal next block */ + __fs32 fs_rps; /* disk revolutions per second */ +/* these fields can be computed from the others */ + __fs32 fs_bmask; /* ``blkoff'' calc of blk offsets */ + __fs32 fs_fmask; /* ``fragoff'' calc of frag offsets */ + __fs32 fs_bshift; /* ``lblkno'' calc of logical blkno */ + __fs32 fs_fshift; /* ``numfrags'' calc number of frags */ +/* these are configuration parameters */ + __fs32 fs_maxcontig; /* max number of contiguous blks */ + __fs32 fs_maxbpg; /* max number of blks per cyl group */ +/* these fields can be computed from the others */ + __fs32 fs_fragshift; /* block to frag shift */ + __fs32 fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + __fs32 fs_sbsize; /* actual size of super block */ + __fs32 fs_csmask; /* csum block offset */ + __fs32 fs_csshift; /* csum block number */ + __fs32 fs_nindir; /* value of NINDIR */ + __fs32 fs_inopb; /* value of INOPB */ + __fs32 fs_nspf; /* value of NSPF */ +/* yet another configuration parameter */ + __fs32 fs_optim; /* optimization preference, see below */ +/* these fields are derived from the hardware */ + union { + struct { + __fs32 fs_npsect; /* # sectors/track including spares */ + } fs_sun; + struct { + __fs32 fs_state; /* file system state time stamp */ + } fs_sunx86; + } fs_u1; + __fs32 fs_interleave; /* hardware sector interleave */ + __fs32 fs_trackskew; /* sector 0 skew, per track */ +/* a unique id for this filesystem (currently unused and unmaintained) */ +/* In 4.3 Tahoe this space is used by fs_headswitch and fs_trkseek */ +/* Neither of those fields is used in the Tahoe code right now but */ +/* there could be problems if they are. */ + __fs32 fs_id[2]; /* file system id */ +/* sizes determined by number of cylinder groups and their sizes */ + __fs32 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs32 fs_cssize; /* size of cyl grp summary area */ + __fs32 fs_cgsize; /* cylinder group size */ +/* these fields are derived from the hardware */ + __fs32 fs_ntrak; /* tracks per cylinder */ + __fs32 fs_nsect; /* sectors per track */ + __fs32 fs_spc; /* sectors per cylinder */ +/* this comes from the disk driver partitioning */ + __fs32 fs_ncyl; /* cylinders in file system */ +/* these fields can be computed from the others */ + __fs32 fs_cpg; /* cylinders per group */ + __fs32 fs_ipg; /* inodes per cylinder group */ + __fs32 fs_fpg; /* blocks per group * fs_frag */ +/* this data must be re-computed after crashes */ + struct ufs_csum fs_cstotal; /* cylinder summary information */ +/* these fields are cleared at mount time */ + __s8 fs_fmod; /* super block modified flag */ + __s8 fs_clean; /* file system is clean flag */ + __s8 fs_ronly; /* mounted read-only flag */ + __s8 fs_flags; + union { + struct { + __s8 fs_fsmnt[UFS_MAXMNTLEN];/* name mounted on */ + __fs32 fs_cgrotor; /* last cg searched */ + __fs32 fs_csp[UFS_MAXCSBUFS];/*list of fs_cs info buffers */ + __fs32 fs_maxcluster; + __fs32 fs_cpc; /* cyl per cycle in postbl */ + __fs16 fs_opostbl[16][8]; /* old rotation block list head */ + } fs_u1; + struct { + __s8 fs_fsmnt[UFS2_MAXMNTLEN]; /* name mounted on */ + __u8 fs_volname[UFS2_MAXVOLLEN]; /* volume name */ + __fs64 fs_swuid; /* system-wide uid */ + __fs32 fs_pad; /* due to alignment of fs_swuid */ + __fs32 fs_cgrotor; /* last cg searched */ + __fs32 fs_ocsp[UFS2_NOCSPTRS]; /*list of fs_cs info buffers */ + __fs32 fs_contigdirs;/*# of contiguously allocated dirs */ + __fs32 fs_csp; /* cg summary info buffer for fs_cs */ + __fs32 fs_maxcluster; + __fs32 fs_active;/* used by snapshots to track fs */ + __fs32 fs_old_cpc; /* cyl per cycle in postbl */ + __fs32 fs_maxbsize;/*maximum blocking factor permitted */ + __fs64 fs_sparecon64[17];/*old rotation block list head */ + __fs64 fs_sblockloc; /* byte offset of standard superblock */ + struct ufs2_csum_total fs_cstotal;/*cylinder summary information*/ + struct ufs_timeval fs_time; /* last time written */ + __fs64 fs_size; /* number of blocks in fs */ + __fs64 fs_dsize; /* number of data blocks in fs */ + __fs64 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs64 fs_pendingblocks;/* blocks in process of being freed */ + __fs32 fs_pendinginodes;/*inodes in process of being freed */ + } fs_u2; + } fs_u11; + union { + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_state; /* file system state time stamp */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sun; + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_npsect; /* # sectors/track including spares */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sunx86; + struct { + __fs32 fs_sparecon[50];/* reserved for future constants */ + __fs32 fs_contigsumsize;/* size of cluster summary array */ + __fs32 fs_maxsymlinklen;/* max length of an internal symlink */ + __fs32 fs_inodefmt; /* format of on-disk inodes */ + __fs32 fs_maxfilesize[2]; /* max representable file size */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + __fs32 fs_state; /* file system state time stamp */ + } fs_44; + } fs_u2; + __fs32 fs_postblformat; /* format of positional layout tables */ + __fs32 fs_nrpos; /* number of rotational positions */ + __fs32 fs_postbloff; /* (__s16) rotation block list head */ + __fs32 fs_rotbloff; /* (__u8) blocks for each rotation */ + __fs32 fs_magic; /* magic number */ + __u8 fs_space[1]; /* list of blocks for each rotation */ +}; +#endif/*struct ufs_super_block*/ + +/* + * Preference for optimization. + */ +#define UFS_OPTTIME 0 /* minimize allocation time */ +#define UFS_OPTSPACE 1 /* minimize disk fragmentation */ + +/* + * Rotational layout table format types + */ +#define UFS_42POSTBLFMT -1 /* 4.2BSD rotational table format */ +#define UFS_DYNAMICPOSTBLFMT 1 /* dynamic rotational table format */ + +/* + * Convert cylinder group to base address of its global summary info. + */ +#define fs_cs(indx) s_csp[(indx)] + +/* + * Cylinder group block for a file system. + * + * Writable fields in the cylinder group are protected by the associated + * super block lock fs->fs_lock. + */ +#define CG_MAGIC 0x090255 +#define ufs_cg_chkmagic(sb, ucg) \ + (fs32_to_cpu((sb), (ucg)->cg_magic) == CG_MAGIC) +/* + * Macros for access to old cylinder group array structures + */ +#define ufs_ocg_blktot(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_btot) +#define ufs_ocg_blks(sb, ucg, cylno) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_b[cylno]) +#define ufs_ocg_inosused(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_iused) +#define ufs_ocg_blksfree(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_free) +#define ufs_ocg_chkmagic(sb, ucg) \ + (fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_magic) == CG_MAGIC) + +/* + * size of this structure is 172 B + */ +struct ufs_cylinder_group { + __fs32 cg_link; /* linked list of cyl groups */ + __fs32 cg_magic; /* magic number */ + __fs32 cg_time; /* time last written */ + __fs32 cg_cgx; /* we are the cgx'th cylinder group */ + __fs16 cg_ncyl; /* number of cyl's this cg */ + __fs16 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_ndblk; /* number of data blocks this cg */ + struct ufs_csum cg_cs; /* cylinder summary information */ + __fs32 cg_rotor; /* position of last used block */ + __fs32 cg_frotor; /* position of last used frag */ + __fs32 cg_irotor; /* position of last used inode */ + __fs32 cg_frsum[UFS_MAXFRAG]; /* counts of available frags */ + __fs32 cg_btotoff; /* (__u32) block totals per cylinder */ + __fs32 cg_boff; /* (short) free block positions */ + __fs32 cg_iusedoff; /* (char) used inode map */ + __fs32 cg_freeoff; /* (u_char) free block map */ + __fs32 cg_nextfreeoff; /* (u_char) next available space */ + union { + struct { + __fs32 cg_clustersumoff; /* (u_int32) counts of avail clusters */ + __fs32 cg_clusteroff; /* (u_int8) free cluster map */ + __fs32 cg_nclusterblks; /* number of clusters this cg */ + __fs32 cg_sparecon[13]; /* reserved for future use */ + } cg_44; + struct { + __fs32 cg_clustersumoff;/* (u_int32) counts of avail clusters */ + __fs32 cg_clusteroff; /* (u_int8) free cluster map */ + __fs32 cg_nclusterblks;/* number of clusters this cg */ + __fs32 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_initediblk; /* last initialized inode */ + __fs32 cg_sparecon32[3];/* reserved for future use */ + __fs64 cg_time; /* time last written */ + __fs64 cg_sparecon[3]; /* reserved for future use */ + } cg_u2; + __fs32 cg_sparecon[16]; /* reserved for future use */ + } cg_u; + __u8 cg_space[1]; /* space for cylinder group maps */ +/* actually longer */ +}; + +/* Historic Cylinder group info */ +struct ufs_old_cylinder_group { + __fs32 cg_link; /* linked list of cyl groups */ + __fs32 cg_rlink; /* for incore cyl groups */ + __fs32 cg_time; /* time last written */ + __fs32 cg_cgx; /* we are the cgx'th cylinder group */ + __fs16 cg_ncyl; /* number of cyl's this cg */ + __fs16 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_ndblk; /* number of data blocks this cg */ + struct ufs_csum cg_cs; /* cylinder summary information */ + __fs32 cg_rotor; /* position of last used block */ + __fs32 cg_frotor; /* position of last used frag */ + __fs32 cg_irotor; /* position of last used inode */ + __fs32 cg_frsum[8]; /* counts of available frags */ + __fs32 cg_btot[32]; /* block totals per cylinder */ + __fs16 cg_b[32][8]; /* positions of free blocks */ + __u8 cg_iused[256]; /* used inode map */ + __fs32 cg_magic; /* magic number */ + __u8 cg_free[1]; /* free block map */ +/* actually longer */ +}; + +/* + * structure of an on-disk inode + */ +struct ufs_inode { + __fs16 ui_mode; /* 0x0 */ + __fs16 ui_nlink; /* 0x2 */ + union { + struct { + __fs16 ui_suid; /* 0x4 */ + __fs16 ui_sgid; /* 0x6 */ + } oldids; + __fs32 ui_inumber; /* 0x4 lsf: inode number */ + __fs32 ui_author; /* 0x4 GNU HURD: author */ + } ui_u1; + __fs64 ui_size; /* 0x8 */ + struct ufs_timeval ui_atime; /* 0x10 access */ + struct ufs_timeval ui_mtime; /* 0x18 modification */ + struct ufs_timeval ui_ctime; /* 0x20 creation */ + union { + struct { + __fs32 ui_db[UFS_NDADDR];/* 0x28 data blocks */ + __fs32 ui_ib[UFS_NINDIR];/* 0x58 indirect blocks */ + } ui_addr; + __u8 ui_symlink[4*(UFS_NDADDR+UFS_NINDIR)];/* 0x28 fast symlink */ + } ui_u2; + __fs32 ui_flags; /* 0x64 immutable, append-only... */ + __fs32 ui_blocks; /* 0x68 blocks in use */ + __fs32 ui_gen; /* 0x6c like ext2 i_version, for NFS support */ + union { + struct { + __fs32 ui_shadow; /* 0x70 shadow inode with security data */ + __fs32 ui_uid; /* 0x74 long EFT version of uid */ + __fs32 ui_gid; /* 0x78 long EFT version of gid */ + __fs32 ui_oeftflag; /* 0x7c reserved */ + } ui_sun; + struct { + __fs32 ui_uid; /* 0x70 File owner */ + __fs32 ui_gid; /* 0x74 File group */ + __fs32 ui_spare[2]; /* 0x78 reserved */ + } ui_44; + struct { + __fs32 ui_uid; /* 0x70 */ + __fs32 ui_gid; /* 0x74 */ + __fs16 ui_modeh; /* 0x78 mode high bits */ + __fs16 ui_spare; /* 0x7A unused */ + __fs32 ui_trans; /* 0x7c filesystem translator */ + } ui_hurd; + } ui_u3; +}; + +#define UFS_NXADDR 2 /* External addresses in inode. */ +struct ufs2_inode { + __fs16 ui_mode; /* 0: IFMT, permissions; see below. */ + __fs16 ui_nlink; /* 2: File link count. */ + __fs32 ui_uid; /* 4: File owner. */ + __fs32 ui_gid; /* 8: File group. */ + __fs32 ui_blksize; /* 12: Inode blocksize. */ + __fs64 ui_size; /* 16: File byte count. */ + __fs64 ui_blocks; /* 24: Bytes actually held. */ + __fs64 ui_atime; /* 32: Last access time. */ + __fs64 ui_mtime; /* 40: Last modified time. */ + __fs64 ui_ctime; /* 48: Last inode change time. */ + __fs64 ui_birthtime; /* 56: Inode creation time. */ + __fs32 ui_mtimensec; /* 64: Last modified time. */ + __fs32 ui_atimensec; /* 68: Last access time. */ + __fs32 ui_ctimensec; /* 72: Last inode change time. */ + __fs32 ui_birthnsec; /* 76: Inode creation time. */ + __fs32 ui_gen; /* 80: Generation number. */ + __fs32 ui_kernflags; /* 84: Kernel flags. */ + __fs32 ui_flags; /* 88: Status flags (chflags). */ + __fs32 ui_extsize; /* 92: External attributes block. */ + __fs64 ui_extb[UFS_NXADDR];/* 96: External attributes block. */ + union { + struct { + __fs64 ui_db[UFS_NDADDR]; /* 112: Direct disk blocks. */ + __fs64 ui_ib[UFS_NINDIR];/* 208: Indirect disk blocks.*/ + } ui_addr; + __u8 ui_symlink[2*4*(UFS_NDADDR+UFS_NINDIR)];/* 0x28 fast symlink */ + } ui_u2; + __fs64 ui_spare[3]; /* 232: Reserved; currently unused */ +}; + + +/* FreeBSD has these in sys/stat.h */ +/* ui_flags that can be set by a file owner */ +#define UFS_UF_SETTABLE 0x0000ffff +#define UFS_UF_NODUMP 0x00000001 /* do not dump */ +#define UFS_UF_IMMUTABLE 0x00000002 /* immutable (can't "change") */ +#define UFS_UF_APPEND 0x00000004 /* append-only */ +#define UFS_UF_OPAQUE 0x00000008 /* directory is opaque (unionfs) */ +#define UFS_UF_NOUNLINK 0x00000010 /* can't be removed or renamed */ +/* ui_flags that only root can set */ +#define UFS_SF_SETTABLE 0xffff0000 +#define UFS_SF_ARCHIVED 0x00010000 /* archived */ +#define UFS_SF_IMMUTABLE 0x00020000 /* immutable (can't "change") */ +#define UFS_SF_APPEND 0x00040000 /* append-only */ +#define UFS_SF_NOUNLINK 0x00100000 /* can't be removed or renamed */ + +/* + * This structure is used for reading disk structures larger + * than the size of fragment. + */ +struct ufs_buffer_head { + __u64 fragment; /* first fragment */ + __u64 count; /* number of fragments */ + struct buffer_head * bh[UFS_MAXFRAG]; /* buffers */ +}; + +struct ufs_cg_private_info { + struct ufs_buffer_head c_ubh; + __u32 c_cgx; /* number of cylidner group */ + __u16 c_ncyl; /* number of cyl's this cg */ + __u16 c_niblk; /* number of inode blocks this cg */ + __u32 c_ndblk; /* number of data blocks this cg */ + __u32 c_rotor; /* position of last used block */ + __u32 c_frotor; /* position of last used frag */ + __u32 c_irotor; /* position of last used inode */ + __u32 c_btotoff; /* (__u32) block totals per cylinder */ + __u32 c_boff; /* (short) free block positions */ + __u32 c_iusedoff; /* (char) used inode map */ + __u32 c_freeoff; /* (u_char) free block map */ + __u32 c_nextfreeoff; /* (u_char) next available space */ + __u32 c_clustersumoff;/* (u_int32) counts of avail clusters */ + __u32 c_clusteroff; /* (u_int8) free cluster map */ + __u32 c_nclusterblks; /* number of clusters this cg */ +}; + + +struct ufs_sb_private_info { + struct ufs_buffer_head s_ubh; /* buffer containing super block */ + struct ufs_csum_core cs_total; + __u32 s_sblkno; /* offset of super-blocks in filesys */ + __u32 s_cblkno; /* offset of cg-block in filesys */ + __u32 s_iblkno; /* offset of inode-blocks in filesys */ + __u32 s_dblkno; /* offset of first data after cg */ + __u32 s_cgoffset; /* cylinder group offset in cylinder */ + __u32 s_cgmask; /* used to calc mod fs_ntrak */ + __u32 s_size; /* number of blocks (fragments) in fs */ + __u32 s_dsize; /* number of data blocks in fs */ + __u64 s_u2_size; /* ufs2: number of blocks (fragments) in fs */ + __u64 s_u2_dsize; /*ufs2: number of data blocks in fs */ + __u32 s_ncg; /* number of cylinder groups */ + __u32 s_bsize; /* size of basic blocks */ + __u32 s_fsize; /* size of fragments */ + __u32 s_fpb; /* fragments per block */ + __u32 s_minfree; /* minimum percentage of free blocks */ + __u32 s_bmask; /* `blkoff'' calc of blk offsets */ + __u32 s_fmask; /* s_fsize mask */ + __u32 s_bshift; /* `lblkno'' calc of logical blkno */ + __u32 s_fshift; /* s_fsize shift */ + __u32 s_fpbshift; /* fragments per block shift */ + __u32 s_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + __u32 s_sbsize; /* actual size of super block */ + __u32 s_csmask; /* csum block offset */ + __u32 s_csshift; /* csum block number */ + __u32 s_nindir; /* value of NINDIR */ + __u32 s_inopb; /* value of INOPB */ + __u32 s_nspf; /* value of NSPF */ + __u32 s_npsect; /* # sectors/track including spares */ + __u32 s_interleave; /* hardware sector interleave */ + __u32 s_trackskew; /* sector 0 skew, per track */ + __u64 s_csaddr; /* blk addr of cyl grp summary area */ + __u32 s_cssize; /* size of cyl grp summary area */ + __u32 s_cgsize; /* cylinder group size */ + __u32 s_ntrak; /* tracks per cylinder */ + __u32 s_nsect; /* sectors per track */ + __u32 s_spc; /* sectors per cylinder */ + __u32 s_ipg; /* inodes per cylinder group */ + __u32 s_fpg; /* fragments per group */ + __u32 s_cpc; /* cyl per cycle in postbl */ + __s32 s_contigsumsize;/* size of cluster summary array, 44bsd */ + __s64 s_qbmask; /* ~usb_bmask */ + __s64 s_qfmask; /* ~usb_fmask */ + __s32 s_postblformat; /* format of positional layout tables */ + __s32 s_nrpos; /* number of rotational positions */ + __s32 s_postbloff; /* (__s16) rotation block list head */ + __s32 s_rotbloff; /* (__u8) blocks for each rotation */ + + __u32 s_fpbmask; /* fragments per block mask */ + __u32 s_apb; /* address per block */ + __u32 s_2apb; /* address per block^2 */ + __u32 s_3apb; /* address per block^3 */ + __u32 s_apbmask; /* address per block mask */ + __u32 s_apbshift; /* address per block shift */ + __u32 s_2apbshift; /* address per block shift * 2 */ + __u32 s_3apbshift; /* address per block shift * 3 */ + __u32 s_nspfshift; /* number of sector per fragment shift */ + __u32 s_nspb; /* number of sector per block */ + __u32 s_inopf; /* inodes per fragment */ + __u32 s_sbbase; /* offset of NeXTstep superblock */ + __u32 s_bpf; /* bits per fragment */ + __u32 s_bpfshift; /* bits per fragment shift*/ + __u32 s_bpfmask; /* bits per fragment mask */ + + __u32 s_maxsymlinklen;/* upper limit on fast symlinks' size */ + __s32 fs_magic; /* filesystem magic */ + unsigned int s_dirblksize; +}; + +/* + * Sizes of this structures are: + * ufs_super_block_first 512 + * ufs_super_block_second 512 + * ufs_super_block_third 356 + */ +struct ufs_super_block_first { + union { + struct { + __fs32 fs_link; /* UNUSED */ + } fs_42; + struct { + __fs32 fs_state; /* file system state flag */ + } fs_sun; + } fs_u0; + __fs32 fs_rlink; + __fs32 fs_sblkno; + __fs32 fs_cblkno; + __fs32 fs_iblkno; + __fs32 fs_dblkno; + __fs32 fs_cgoffset; + __fs32 fs_cgmask; + __fs32 fs_time; + __fs32 fs_size; + __fs32 fs_dsize; + __fs32 fs_ncg; + __fs32 fs_bsize; + __fs32 fs_fsize; + __fs32 fs_frag; + __fs32 fs_minfree; + __fs32 fs_rotdelay; + __fs32 fs_rps; + __fs32 fs_bmask; + __fs32 fs_fmask; + __fs32 fs_bshift; + __fs32 fs_fshift; + __fs32 fs_maxcontig; + __fs32 fs_maxbpg; + __fs32 fs_fragshift; + __fs32 fs_fsbtodb; + __fs32 fs_sbsize; + __fs32 fs_csmask; + __fs32 fs_csshift; + __fs32 fs_nindir; + __fs32 fs_inopb; + __fs32 fs_nspf; + __fs32 fs_optim; + union { + struct { + __fs32 fs_npsect; + } fs_sun; + struct { + __fs32 fs_state; + } fs_sunx86; + } fs_u1; + __fs32 fs_interleave; + __fs32 fs_trackskew; + __fs32 fs_id[2]; + __fs32 fs_csaddr; + __fs32 fs_cssize; + __fs32 fs_cgsize; + __fs32 fs_ntrak; + __fs32 fs_nsect; + __fs32 fs_spc; + __fs32 fs_ncyl; + __fs32 fs_cpg; + __fs32 fs_ipg; + __fs32 fs_fpg; + struct ufs_csum fs_cstotal; + __s8 fs_fmod; + __s8 fs_clean; + __s8 fs_ronly; + __s8 fs_flags; + __s8 fs_fsmnt[UFS_MAXMNTLEN - 212]; + +}; + +struct ufs_super_block_second { + union { + struct { + __s8 fs_fsmnt[212]; + __fs32 fs_cgrotor; + __fs32 fs_csp[UFS_MAXCSBUFS]; + __fs32 fs_maxcluster; + __fs32 fs_cpc; + __fs16 fs_opostbl[82]; + } fs_u1; + struct { + __s8 fs_fsmnt[UFS2_MAXMNTLEN - UFS_MAXMNTLEN + 212]; + __u8 fs_volname[UFS2_MAXVOLLEN]; + __fs64 fs_swuid; + __fs32 fs_pad; + __fs32 fs_cgrotor; + __fs32 fs_ocsp[UFS2_NOCSPTRS]; + __fs32 fs_contigdirs; + __fs32 fs_csp; + __fs32 fs_maxcluster; + __fs32 fs_active; + __fs32 fs_old_cpc; + __fs32 fs_maxbsize; + __fs64 fs_sparecon64[17]; + __fs64 fs_sblockloc; + __fs64 cs_ndir; + __fs64 cs_nbfree; + } fs_u2; + } fs_un; +}; + +struct ufs_super_block_third { + union { + struct { + __fs16 fs_opostbl[46]; + } fs_u1; + struct { + __fs64 cs_nifree; /* number of free inodes */ + __fs64 cs_nffree; /* number of free frags */ + __fs64 cs_numclusters; /* number of free clusters */ + __fs64 cs_spare[3]; /* future expansion */ + struct ufs_timeval fs_time; /* last time written */ + __fs64 fs_size; /* number of blocks in fs */ + __fs64 fs_dsize; /* number of data blocks in fs */ + __fs64 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs64 fs_pendingblocks;/* blocks in process of being freed */ + __fs32 fs_pendinginodes;/*inodes in process of being freed */ + } __attribute__ ((packed)) fs_u2; + } fs_un1; + union { + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_state; /* file system state time stamp */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sun; + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_npsect; /* # sectors/track including spares */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sunx86; + struct { + __fs32 fs_sparecon[50];/* reserved for future constants */ + __fs32 fs_contigsumsize;/* size of cluster summary array */ + __fs32 fs_maxsymlinklen;/* max length of an internal symlink */ + __fs32 fs_inodefmt; /* format of on-disk inodes */ + __fs32 fs_maxfilesize[2]; /* max representable file size */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + __fs32 fs_state; /* file system state time stamp */ + } fs_44; + } fs_un2; + __fs32 fs_postblformat; + __fs32 fs_nrpos; + __fs32 fs_postbloff; + __fs32 fs_rotbloff; + __fs32 fs_magic; + __u8 fs_space[1]; +}; + +#endif /* __LINUX_UFS_FS_H */ diff --git a/kernel/fs/ufs/util.c b/kernel/fs/ufs/util.c new file mode 100644 index 000000000..b6c2f94e0 --- /dev/null +++ b/kernel/fs/ufs/util.c @@ -0,0 +1,282 @@ +/* + * linux/fs/ufs/util.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, + struct super_block *sb, u64 fragment, u64 size) +{ + struct ufs_buffer_head * ubh; + unsigned i, j ; + u64 count = 0; + if (size & ~uspi->s_fmask) + return NULL; + count = size >> uspi->s_fshift; + if (count > UFS_MAXFRAG) + return NULL; + ubh = kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); + if (!ubh) + return NULL; + ubh->fragment = fragment; + ubh->count = count; + for (i = 0; i < count; i++) + if (!(ubh->bh[i] = sb_bread(sb, fragment + i))) + goto failed; + for (; i < UFS_MAXFRAG; i++) + ubh->bh[i] = NULL; + return ubh; +failed: + for (j = 0; j < i; j++) + brelse (ubh->bh[j]); + kfree(ubh); + return NULL; +} + +struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi, + struct super_block *sb, u64 fragment, u64 size) +{ + unsigned i, j; + u64 count = 0; + if (size & ~uspi->s_fmask) + return NULL; + count = size >> uspi->s_fshift; + if (count <= 0 || count > UFS_MAXFRAG) + return NULL; + USPI_UBH(uspi)->fragment = fragment; + USPI_UBH(uspi)->count = count; + for (i = 0; i < count; i++) + if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i))) + goto failed; + for (; i < UFS_MAXFRAG; i++) + USPI_UBH(uspi)->bh[i] = NULL; + return USPI_UBH(uspi); +failed: + for (j = 0; j < i; j++) + brelse (USPI_UBH(uspi)->bh[j]); + return NULL; +} + +void ubh_brelse (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for (i = 0; i < ubh->count; i++) + brelse (ubh->bh[i]); + kfree (ubh); +} + +void ubh_brelse_uspi (struct ufs_sb_private_info * uspi) +{ + unsigned i; + if (!USPI_UBH(uspi)) + return; + for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) { + brelse (USPI_UBH(uspi)->bh[i]); + USPI_UBH(uspi)->bh[i] = NULL; + } +} + +void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for ( i = 0; i < ubh->count; i++ ) + mark_buffer_dirty (ubh->bh[i]); +} + +void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag) +{ + unsigned i; + if (!ubh) + return; + if (flag) { + for ( i = 0; i < ubh->count; i++ ) + set_buffer_uptodate (ubh->bh[i]); + } else { + for ( i = 0; i < ubh->count; i++ ) + clear_buffer_uptodate (ubh->bh[i]); + } +} + +void ubh_sync_block(struct ufs_buffer_head *ubh) +{ + if (ubh) { + unsigned i; + + for (i = 0; i < ubh->count; i++) + write_dirty_buffer(ubh->bh[i], WRITE); + + for (i = 0; i < ubh->count; i++) + wait_on_buffer(ubh->bh[i]); + } +} + +void ubh_bforget (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for ( i = 0; i < ubh->count; i++ ) if ( ubh->bh[i] ) + bforget (ubh->bh[i]); +} + +int ubh_buffer_dirty (struct ufs_buffer_head * ubh) +{ + unsigned i; + unsigned result = 0; + if (!ubh) + return 0; + for ( i = 0; i < ubh->count; i++ ) + result |= buffer_dirty(ubh->bh[i]); + return result; +} + +void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi, + unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size) +{ + unsigned len, bhno; + if (size > (ubh->count << uspi->s_fshift)) + size = ubh->count << uspi->s_fshift; + bhno = 0; + while (size) { + len = min_t(unsigned int, size, uspi->s_fsize); + memcpy (mem, ubh->bh[bhno]->b_data, len); + mem += uspi->s_fsize; + size -= len; + bhno++; + } +} + +void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size) +{ + unsigned len, bhno; + if (size > (ubh->count << uspi->s_fshift)) + size = ubh->count << uspi->s_fshift; + bhno = 0; + while (size) { + len = min_t(unsigned int, size, uspi->s_fsize); + memcpy (ubh->bh[bhno]->b_data, mem, len); + mem += uspi->s_fsize; + size -= len; + bhno++; + } +} + +dev_t +ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi) +{ + __u32 fs32; + dev_t dev; + + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]); + else + fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]); + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNx86: + case UFS_ST_SUN: + if ((fs32 & 0xffff0000) == 0 || + (fs32 & 0xffff0000) == 0xffff0000) + dev = old_decode_dev(fs32 & 0x7fff); + else + dev = MKDEV(sysv_major(fs32), sysv_minor(fs32)); + break; + + default: + dev = old_decode_dev(fs32); + break; + } + return dev; +} + +void +ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev) +{ + __u32 fs32; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNx86: + case UFS_ST_SUN: + fs32 = sysv_encode_dev(dev); + if ((fs32 & 0xffff8000) == 0) { + fs32 = old_encode_dev(dev); + } + break; + + default: + fs32 = old_encode_dev(dev); + break; + } + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32); + else + ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32); +} + +/** + * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist + * read it from disk. + * @mapping: the address_space to search + * @index: the page index + * + * Locates the desired pagecache page, if not exist we'll read it, + * locks it, increments its reference + * count and returns its address. + * + */ + +struct page *ufs_get_locked_page(struct address_space *mapping, + pgoff_t index) +{ + struct page *page; + + page = find_lock_page(mapping, index); + if (!page) { + page = read_mapping_page(mapping, index, NULL); + + if (IS_ERR(page)) { + printk(KERN_ERR "ufs_change_blocknr: " + "read_mapping_page error: ino %lu, index: %lu\n", + mapping->host->i_ino, index); + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping == NULL)) { + /* Truncate got there first */ + unlock_page(page); + page_cache_release(page); + page = NULL; + goto out; + } + + if (!PageUptodate(page) || PageError(page)) { + unlock_page(page); + page_cache_release(page); + + printk(KERN_ERR "ufs_change_blocknr: " + "can not read page: ino %lu, index: %lu\n", + mapping->host->i_ino, index); + + page = ERR_PTR(-EIO); + } + } +out: + return page; +} diff --git a/kernel/fs/ufs/util.h b/kernel/fs/ufs/util.h new file mode 100644 index 000000000..954175928 --- /dev/null +++ b/kernel/fs/ufs/util.h @@ -0,0 +1,592 @@ +/* + * linux/fs/ufs/util.h + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +#include +#include +#include "swab.h" + + +/* + * some useful macros + */ +#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) + +/* + * functions used for retyping + */ +static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi) +{ + return &cpi->c_ubh; +} +static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi) +{ + return &spi->s_ubh; +} + + + +/* + * macros used for accessing structures + */ +static inline s32 +ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3) +{ + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) + return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state); + /* Fall Through to UFS_ST_SUN */ + case UFS_ST_SUN: + return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); + case UFS_ST_SUNx86: + return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state); + case UFS_ST_44BSD: + default: + return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state); + } +} + +static inline void +ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3, s32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) { + usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value); + break; + } + /* Fall Through to UFS_ST_SUN */ + case UFS_ST_SUN: + usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); + break; + case UFS_ST_SUNx86: + usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value); + break; + case UFS_ST_44BSD: + usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value); + break; + } +} + +static inline u32 +ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3) +{ + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect); + else + return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect); +} + +static inline u64 +ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3) +{ + __fs64 tmp; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + case UFS_ST_SUN: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1]; + break; + case UFS_ST_SUNx86: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1]; + break; + case UFS_ST_44BSD: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1]; + break; + } + + return fs64_to_cpu(sb, tmp); +} + +static inline u64 +ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3) +{ + __fs64 tmp; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + case UFS_ST_SUN: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1]; + break; + case UFS_ST_SUNx86: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1]; + break; + case UFS_ST_44BSD: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1]; + break; + } + + return fs64_to_cpu(sb, tmp); +} + +static inline u16 +ufs_get_de_namlen(struct super_block *sb, struct ufs_dir_entry *de) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) + return fs16_to_cpu(sb, de->d_u.d_namlen); + else + return de->d_u.d_44.d_namlen; /* XXX this seems wrong */ +} + +static inline void +ufs_set_de_namlen(struct super_block *sb, struct ufs_dir_entry *de, u16 value) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) + de->d_u.d_namlen = cpu_to_fs16(sb, value); + else + de->d_u.d_44.d_namlen = value; /* XXX this seems wrong */ +} + +static inline void +ufs_set_de_type(struct super_block *sb, struct ufs_dir_entry *de, int mode) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) != UFS_DE_44BSD) + return; + + /* + * TODO turn this into a table lookup + */ + switch (mode & S_IFMT) { + case S_IFSOCK: + de->d_u.d_44.d_type = DT_SOCK; + break; + case S_IFLNK: + de->d_u.d_44.d_type = DT_LNK; + break; + case S_IFREG: + de->d_u.d_44.d_type = DT_REG; + break; + case S_IFBLK: + de->d_u.d_44.d_type = DT_BLK; + break; + case S_IFDIR: + de->d_u.d_44.d_type = DT_DIR; + break; + case S_IFCHR: + de->d_u.d_44.d_type = DT_CHR; + break; + case S_IFIFO: + de->d_u.d_44.d_type = DT_FIFO; + break; + default: + de->d_u.d_44.d_type = DT_UNKNOWN; + } +} + +static inline u32 +ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_uid); + case UFS_UID_EFT: + if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_uid); + /* Fall through */ + default: + return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_suid); + } +} + +static inline void +ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + inode->ui_u3.ui_44.ui_uid = cpu_to_fs32(sb, value); + inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); + break; + case UFS_UID_EFT: + inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value); + if (value > 0xFFFF) + value = 0xFFFF; + /* Fall through */ + default: + inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); + break; + } +} + +static inline u32 +ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); + case UFS_UID_EFT: + if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); + /* Fall through */ + default: + return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid); + } +} + +static inline void +ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + inode->ui_u3.ui_44.ui_gid = cpu_to_fs32(sb, value); + inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); + break; + case UFS_UID_EFT: + inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value); + if (value > 0xFFFF) + value = 0xFFFF; + /* Fall through */ + default: + inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); + break; + } +} + +extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); +extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); +extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +/* + * These functions manipulate ufs buffers + */ +#define ubh_bread(sb,fragment,size) _ubh_bread_(uspi,sb,fragment,size) +extern struct ufs_buffer_head * _ubh_bread_(struct ufs_sb_private_info *, struct super_block *, u64 , u64); +extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, struct super_block *, u64, u64); +extern void ubh_brelse (struct ufs_buffer_head *); +extern void ubh_brelse_uspi (struct ufs_sb_private_info *); +extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); +extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); +extern void ubh_sync_block(struct ufs_buffer_head *); +extern void ubh_bforget (struct ufs_buffer_head *); +extern int ubh_buffer_dirty (struct ufs_buffer_head *); +#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) +extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned); +#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size) +extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); + +/* This functions works with cache pages*/ +extern struct page *ufs_get_locked_page(struct address_space *mapping, + pgoff_t index); +static inline void ufs_put_locked_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + + +/* + * macros and inline function to get important structures from ufs_sb_private_info + */ + +static inline void *get_usb_offset(struct ufs_sb_private_info *uspi, + unsigned int offset) +{ + unsigned int index; + + index = offset >> uspi->s_fshift; + offset &= ~uspi->s_fmask; + return uspi->s_ubh.bh[index]->b_data + offset; +} + +#define ubh_get_usb_first(uspi) \ + ((struct ufs_super_block_first *)get_usb_offset((uspi), 0)) + +#define ubh_get_usb_second(uspi) \ + ((struct ufs_super_block_second *)get_usb_offset((uspi), UFS_SECTOR_SIZE)) + +#define ubh_get_usb_third(uspi) \ + ((struct ufs_super_block_third *)get_usb_offset((uspi), 2*UFS_SECTOR_SIZE)) + + +#define ubh_get_ucg(ubh) \ + ((struct ufs_cylinder_group *)((ubh)->bh[0]->b_data)) + + +/* + * Extract byte from ufs_buffer_head + * Extract the bits for a block from a map inside ufs_buffer_head + */ +#define ubh_get_addr8(ubh,begin) \ + ((u8*)(ubh)->bh[(begin) >> uspi->s_fshift]->b_data + \ + ((begin) & ~uspi->s_fmask)) + +#define ubh_get_addr16(ubh,begin) \ + (((__fs16*)((ubh)->bh[(begin) >> (uspi->s_fshift-1)]->b_data)) + \ + ((begin) & ((uspi->fsize>>1) - 1))) + +#define ubh_get_addr32(ubh,begin) \ + (((__fs32*)((ubh)->bh[(begin) >> (uspi->s_fshift-2)]->b_data)) + \ + ((begin) & ((uspi->s_fsize>>2) - 1))) + +#define ubh_get_addr64(ubh,begin) \ + (((__fs64*)((ubh)->bh[(begin) >> (uspi->s_fshift-3)]->b_data)) + \ + ((begin) & ((uspi->s_fsize>>3) - 1))) + +#define ubh_get_addr ubh_get_addr8 + +static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, + struct ufs_buffer_head *ubh, + u64 blk) +{ + if (uspi->fs_magic == UFS2_MAGIC) + return ubh_get_addr64(ubh, blk); + else + return ubh_get_addr32(ubh, blk); +} + +#define ubh_blkmap(ubh,begin,bit) \ + ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) + +/* + * Determine the number of available frags given a + * percentage to hold in reserve. + */ +static inline u64 +ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved) +{ + return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + + uspi->cs_total.cs_nffree - + (uspi->s_dsize * (percentreserved) / 100); +} + +/* + * Macros to access cylinder group array structures + */ +#define ubh_cg_blktot(ucpi,cylno) \ + (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2)))) + +#define ubh_cg_blks(ucpi,cylno,rpos) \ + (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \ + (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 )))) + +/* + * Bitmap operations + * These functions work like classical bitmap operations. + * The difference is that we don't have the whole bitmap + * in one contiguous chunk of memory, but in several buffers. + * The parameters of each function are super_block, ufs_buffer_head and + * position of the beginning of the bitmap. + */ +#define ubh_setbit(ubh,begin,bit) \ + (*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) |= (1 << ((bit) & 7))) + +#define ubh_clrbit(ubh,begin,bit) \ + (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) &= ~(1 << ((bit) & 7))) + +#define ubh_isset(ubh,begin,bit) \ + (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) & (1 << ((bit) & 7))) + +#define ubh_isclr(ubh,begin,bit) (!ubh_isset(ubh,begin,bit)) + +#define ubh_find_first_zero_bit(ubh,begin,size) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,0) + +#define ubh_find_next_zero_bit(ubh,begin,size,offset) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,offset) +static inline unsigned _ubh_find_next_zero_bit_( + struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, + unsigned begin, unsigned size, unsigned offset) +{ + unsigned base, count, pos; + + size -= offset; + begin <<= 3; + offset += begin; + base = offset >> uspi->s_bpfshift; + offset &= uspi->s_bpfmask; + for (;;) { + count = min_t(unsigned int, size + offset, uspi->s_bpf); + size -= count - offset; + pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset); + if (pos < count || !size) + break; + base++; + offset = 0; + } + return (base << uspi->s_bpfshift) + pos - begin; +} + +static inline unsigned find_last_zero_bit (unsigned char * bitmap, + unsigned size, unsigned offset) +{ + unsigned bit, i; + unsigned char * mapp; + unsigned char map; + + mapp = bitmap + (size >> 3); + map = *mapp--; + bit = 1 << (size & 7); + for (i = size; i > offset; i--) { + if ((map & bit) == 0) + break; + if ((i & 7) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << 7; + } + } + return i; +} + +#define ubh_find_last_zero_bit(ubh,begin,size,offset) _ubh_find_last_zero_bit_(uspi,ubh,begin,size,offset) +static inline unsigned _ubh_find_last_zero_bit_( + struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, + unsigned begin, unsigned start, unsigned end) +{ + unsigned base, count, pos, size; + + size = start - end; + begin <<= 3; + start += begin; + base = start >> uspi->s_bpfshift; + start &= uspi->s_bpfmask; + for (;;) { + count = min_t(unsigned int, + size + (uspi->s_bpf - start), uspi->s_bpf) + - (uspi->s_bpf - start); + size -= count; + pos = find_last_zero_bit (ubh->bh[base]->b_data, + start, start - count); + if (pos > start - count || !size) + break; + base--; + start = uspi->s_bpf; + } + return (base << uspi->s_bpfshift) + pos - begin; +} + +#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block)) + +#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block) +static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + return (*ubh_get_addr (ubh, begin + block) == 0xff); + case 4: + return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + case 2: + return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + case 1: + return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + } + return 0; +} + +#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block) +static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + *ubh_get_addr (ubh, begin + block) = 0x00; + return; + case 4: + *ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2)); + return; + case 2: + *ubh_get_addr (ubh, begin + (block >> 2)) &= ~(0x03 << ((block & 0x03) << 1)); + return; + case 1: + *ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07))); + return; + } +} + +#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block) +static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + *ubh_get_addr(ubh, begin + block) = 0xff; + return; + case 4: + *ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2)); + return; + case 2: + *ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1)); + return; + case 1: + *ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07))); + return; + } +} + +static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap, + __fs32 * fraglist, int cnt) +{ + struct ufs_sb_private_info * uspi; + unsigned fragsize, pos; + + uspi = UFS_SB(sb)->s_uspi; + + fragsize = 0; + for (pos = 0; pos < uspi->s_fpb; pos++) { + if (blockmap & (1 << pos)) { + fragsize++; + } + else if (fragsize > 0) { + fs32_add(sb, &fraglist[fragsize], cnt); + fragsize = 0; + } + } + if (fragsize > 0 && fragsize < uspi->s_fpb) + fs32_add(sb, &fraglist[fragsize], cnt); +} + +static inline void *ufs_get_direct_data_ptr(struct ufs_sb_private_info *uspi, + struct ufs_inode_info *ufsi, + unsigned blk) +{ + BUG_ON(blk > UFS_TIND_BLOCK); + return uspi->fs_magic == UFS2_MAGIC ? + (void *)&ufsi->i_u1.u2_i_data[blk] : + (void *)&ufsi->i_u1.i_data[blk]; +} + +static inline u64 ufs_data_ptr_to_cpu(struct super_block *sb, void *p) +{ + return UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC ? + fs64_to_cpu(sb, *(__fs64 *)p) : + fs32_to_cpu(sb, *(__fs32 *)p); +} + +static inline void ufs_cpu_to_data_ptr(struct super_block *sb, void *p, u64 val) +{ + if (UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC) + *(__fs64 *)p = cpu_to_fs64(sb, val); + else + *(__fs32 *)p = cpu_to_fs32(sb, val); +} + +static inline void ufs_data_ptr_clear(struct ufs_sb_private_info *uspi, + void *p) +{ + if (uspi->fs_magic == UFS2_MAGIC) + *(__fs64 *)p = 0; + else + *(__fs32 *)p = 0; +} + +static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi, + void *p) +{ + if (uspi->fs_magic == UFS2_MAGIC) + return *(__fs64 *)p == 0; + else + return *(__fs32 *)p == 0; +} diff --git a/kernel/fs/utimes.c b/kernel/fs/utimes.c new file mode 100644 index 000000000..aa138d645 --- /dev/null +++ b/kernel/fs/utimes.c @@ -0,0 +1,235 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __ARCH_WANT_SYS_UTIME + +/* + * sys_utime() can be implemented in user-level using sys_utimes(). + * Is this for backwards compatibility? If so, why not move it + * into the appropriate arch directory (for those architectures that + * need it). + */ + +/* If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. + */ +SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) +{ + struct timespec tv[2]; + + if (times) { + if (get_user(tv[0].tv_sec, ×->actime) || + get_user(tv[1].tv_sec, ×->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0); +} + +#endif + +static bool nsec_valid(long nsec) +{ + if (nsec == UTIME_OMIT || nsec == UTIME_NOW) + return true; + + return nsec >= 0 && nsec <= 999999999; +} + +static int utimes_common(struct path *path, struct timespec *times) +{ + int error; + struct iattr newattrs; + struct inode *inode = path->dentry->d_inode; + struct inode *delegated_inode = NULL; + + error = mnt_want_write(path->mnt); + if (error) + goto out; + + if (times && times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_NOW) + times = NULL; + + newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; + if (times) { + if (times[0].tv_nsec == UTIME_OMIT) + newattrs.ia_valid &= ~ATTR_ATIME; + else if (times[0].tv_nsec != UTIME_NOW) { + newattrs.ia_atime.tv_sec = times[0].tv_sec; + newattrs.ia_atime.tv_nsec = times[0].tv_nsec; + newattrs.ia_valid |= ATTR_ATIME_SET; + } + + if (times[1].tv_nsec == UTIME_OMIT) + newattrs.ia_valid &= ~ATTR_MTIME; + else if (times[1].tv_nsec != UTIME_NOW) { + newattrs.ia_mtime.tv_sec = times[1].tv_sec; + newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; + newattrs.ia_valid |= ATTR_MTIME_SET; + } + /* + * Tell inode_change_ok(), that this is an explicit time + * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET + * were used. + */ + newattrs.ia_valid |= ATTR_TIMES_SET; + } else { + /* + * If times is NULL (or both times are UTIME_NOW), + * then we need to check permissions, because + * inode_change_ok() won't do it. + */ + error = -EACCES; + if (IS_IMMUTABLE(inode)) + goto mnt_drop_write_and_out; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; + } + } +retry_deleg: + mutex_lock(&inode->i_mutex); + error = notify_change(path->dentry, &newattrs, &delegated_inode); + mutex_unlock(&inode->i_mutex); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + +mnt_drop_write_and_out: + mnt_drop_write(path->mnt); +out: + return error; +} + +/* + * do_utimes - change times on filename or file descriptor + * @dfd: open file descriptor, -1 or AT_FDCWD + * @filename: path name or NULL + * @times: new times or NULL + * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) + * + * If filename is NULL and dfd refers to an open file, then operate on + * the file. Otherwise look up filename, possibly using dfd as a + * starting point. + * + * If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. + */ +long do_utimes(int dfd, const char __user *filename, struct timespec *times, + int flags) +{ + int error = -EINVAL; + + if (times && (!nsec_valid(times[0].tv_nsec) || + !nsec_valid(times[1].tv_nsec))) { + goto out; + } + + if (flags & ~AT_SYMLINK_NOFOLLOW) + goto out; + + if (filename == NULL && dfd != AT_FDCWD) { + struct fd f; + + if (flags & AT_SYMLINK_NOFOLLOW) + goto out; + + f = fdget(dfd); + error = -EBADF; + if (!f.file) + goto out; + + error = utimes_common(&f.file->f_path, times); + fdput(f); + } else { + struct path path; + int lookup_flags = 0; + + if (!(flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; +retry: + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = utimes_common(&path, times); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + +out: + return error; +} + +SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, + struct timespec __user *, utimes, int, flags) +{ + struct timespec tstimes[2]; + + if (utimes) { + if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) + return -EFAULT; + + /* Nothing to do, we must not even check the path. */ + if (tstimes[0].tv_nsec == UTIME_OMIT && + tstimes[1].tv_nsec == UTIME_OMIT) + return 0; + } + + return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); +} + +SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, + struct timeval __user *, utimes) +{ + struct timeval times[2]; + struct timespec tstimes[2]; + + if (utimes) { + if (copy_from_user(×, utimes, sizeof(times))) + return -EFAULT; + + /* This test is needed to catch all invalid values. If we + would test only in do_utimes we would miss those invalid + values truncated by the multiplication with 1000. Note + that we also catch UTIME_{NOW,OMIT} here which are only + valid for utimensat. */ + if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 || + times[1].tv_usec >= 1000000 || times[1].tv_usec < 0) + return -EINVAL; + + tstimes[0].tv_sec = times[0].tv_sec; + tstimes[0].tv_nsec = 1000 * times[0].tv_usec; + tstimes[1].tv_sec = times[1].tv_sec; + tstimes[1].tv_nsec = 1000 * times[1].tv_usec; + } + + return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); +} + +SYSCALL_DEFINE2(utimes, char __user *, filename, + struct timeval __user *, utimes) +{ + return sys_futimesat(AT_FDCWD, filename, utimes); +} diff --git a/kernel/fs/xattr.c b/kernel/fs/xattr.c new file mode 100644 index 000000000..4ef698549 --- /dev/null +++ b/kernel/fs/xattr.c @@ -0,0 +1,972 @@ +/* + File: fs/xattr.c + + Extended attribute handling. + + Copyright (C) 2001 by Andreas Gruenbacher + Copyright (C) 2001 SGI - Silicon Graphics, Inc + Copyright (c) 2004 Red Hat, Inc., James Morris + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Check permissions for extended attribute access. This is a bit complicated + * because different namespaces have very different rules. + */ +static int +xattr_permission(struct inode *inode, const char *name, int mask) +{ + /* + * We can never set or remove an extended attribute on a read-only + * filesystem or on an immutable / append-only inode. + */ + if (mask & MAY_WRITE) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + /* + * No restriction for security.* and system.* from the VFS. Decision + * on these is left to the underlying filesystem / security module. + */ + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return 0; + + /* + * The trusted.* namespace can only be accessed by privileged users. + */ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + if (!capable(CAP_SYS_ADMIN)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + return 0; + } + + /* + * In the user.* namespace, only regular files and directories can have + * extended attributes. For sticky directories, only the owner and + * privileged users can write attributes. + */ + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && + (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) + return -EPERM; + } + + return inode_permission(inode, mask); +} + +/** + * __vfs_setxattr_noperm - perform setxattr operation without performing + * permission checks. + * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * + * returns the result of the internal setxattr or setsecurity operations. + * + * This function requires the caller to lock the inode's i_mutex before it + * is executed. It also assumes that the caller will make the appropriate + * permission checks. + */ +int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error = -EOPNOTSUPP; + int issec = !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); + + if (issec) + inode->i_flags &= ~S_NOSEC; + if (inode->i_op->setxattr) { + error = inode->i_op->setxattr(dentry, name, value, size, flags); + if (!error) { + fsnotify_xattr(dentry); + security_inode_post_setxattr(dentry, name, value, + size, flags); + } + } else if (issec) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(inode, suffix, value, + size, flags); + if (!error) + fsnotify_xattr(dentry); + } + + return error; +} + + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + +out: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(vfs_setxattr); + +ssize_t +xattr_getsecurity(struct inode *inode, const char *name, void *value, + size_t size) +{ + void *buffer = NULL; + ssize_t len; + + if (!value || !size) { + len = security_inode_getsecurity(inode, name, &buffer, false); + goto out_noalloc; + } + + len = security_inode_getsecurity(inode, name, &buffer, true); + if (len < 0) + return len; + if (size < len) { + len = -ERANGE; + goto out; + } + memcpy(value, buffer, len); +out: + security_release_secctx(buffer, len); +out_noalloc: + return len; +} +EXPORT_SYMBOL_GPL(xattr_getsecurity); + +/* + * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr + * + * Allocate memory, if not already allocated, or re-allocate correct size, + * before retrieving the extended attribute. + * + * Returns the result of alloc, if failed, or the getxattr operation. + */ +ssize_t +vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, + size_t xattr_size, gfp_t flags) +{ + struct inode *inode = dentry->d_inode; + char *value = *xattr_value; + int error; + + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + + if (!inode->i_op->getxattr) + return -EOPNOTSUPP; + + error = inode->i_op->getxattr(dentry, name, NULL, 0); + if (error < 0) + return error; + + if (!value || (error > xattr_size)) { + value = krealloc(*xattr_value, error + 1, flags); + if (!value) + return -ENOMEM; + memset(value, 0, error + 1); + } + + error = inode->i_op->getxattr(dentry, name, value, error); + *xattr_value = value; + return error; +} + +/* Compare an extended attribute value with the given value */ +int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, + const char *value, size_t size, gfp_t flags) +{ + char *xattr_value = NULL; + int rc; + + rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags); + if (rc < 0) + return rc; + + if ((rc != size) || (memcmp(xattr_value, value, rc) != 0)) + rc = -EINVAL; + else + rc = 0; + kfree(xattr_value); + return rc; +} + +ssize_t +vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + + error = security_inode_getxattr(dentry, name); + if (error) + return error; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + int ret = xattr_getsecurity(inode, suffix, value, size); + /* + * Only overwrite the return value if a security module + * is actually active. + */ + if (ret == -EOPNOTSUPP) + goto nolsm; + return ret; + } +nolsm: + if (inode->i_op->getxattr) + error = inode->i_op->getxattr(dentry, name, value, size); + else + error = -EOPNOTSUPP; + + return error; +} +EXPORT_SYMBOL_GPL(vfs_getxattr); + +ssize_t +vfs_listxattr(struct dentry *d, char *list, size_t size) +{ + ssize_t error; + + error = security_inode_listxattr(d); + if (error) + return error; + error = -EOPNOTSUPP; + if (d->d_inode->i_op->listxattr) { + error = d->d_inode->i_op->listxattr(d, list, size); + } else { + error = security_inode_listsecurity(d->d_inode, list, size); + if (size && error > size) + error = -ERANGE; + } + return error; +} +EXPORT_SYMBOL_GPL(vfs_listxattr); + +int +vfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (!inode->i_op->removexattr) + return -EOPNOTSUPP; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_removexattr(dentry, name); + if (error) { + mutex_unlock(&inode->i_mutex); + return error; + } + + error = inode->i_op->removexattr(dentry, name); + mutex_unlock(&inode->i_mutex); + + if (!error) { + fsnotify_xattr(dentry); + evm_inode_post_removexattr(dentry, name); + } + return error; +} +EXPORT_SYMBOL_GPL(vfs_removexattr); + + +/* + * Extended attribute SET operations + */ +static long +setxattr(struct dentry *d, const char __user *name, const void __user *value, + size_t size, int flags) +{ + int error; + void *kvalue = NULL; + void *vvalue = NULL; /* If non-NULL, we used vmalloc() */ + char kname[XATTR_NAME_MAX + 1]; + + if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) + return -EINVAL; + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + if (size) { + if (size > XATTR_SIZE_MAX) + return -E2BIG; + kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!kvalue) { + vvalue = vmalloc(size); + if (!vvalue) + return -ENOMEM; + kvalue = vvalue; + } + if (copy_from_user(kvalue, value, size)) { + error = -EFAULT; + goto out; + } + if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) + posix_acl_fix_xattr_from_user(kvalue, size); + } + + error = vfs_setxattr(d, kname, kvalue, size, flags); +out: + if (vvalue) + vfree(vvalue); + else + kfree(kvalue); + return error; +} + +static int path_setxattr(const char __user *pathname, + const char __user *name, const void __user *value, + size_t size, int flags, unsigned int lookup_flags) +{ + struct path path; + int error; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); + } + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} + +SYSCALL_DEFINE5(setxattr, const char __user *, pathname, + const char __user *, name, const void __user *, value, + size_t, size, int, flags) +{ + return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW); +} + +SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, + const char __user *, name, const void __user *, value, + size_t, size, int, flags) +{ + return path_setxattr(pathname, name, value, size, flags, 0); +} + +SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, + const void __user *,value, size_t, size, int, flags) +{ + struct fd f = fdget(fd); + int error = -EBADF; + + if (!f.file) + return error; + audit_file(f.file); + error = mnt_want_write_file(f.file); + if (!error) { + error = setxattr(f.file->f_path.dentry, name, value, size, flags); + mnt_drop_write_file(f.file); + } + fdput(f); + return error; +} + +/* + * Extended attribute GET operations + */ +static ssize_t +getxattr(struct dentry *d, const char __user *name, void __user *value, + size_t size) +{ + ssize_t error; + void *kvalue = NULL; + void *vvalue = NULL; + char kname[XATTR_NAME_MAX + 1]; + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + if (size) { + if (size > XATTR_SIZE_MAX) + size = XATTR_SIZE_MAX; + kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!kvalue) { + vvalue = vmalloc(size); + if (!vvalue) + return -ENOMEM; + kvalue = vvalue; + } + } + + error = vfs_getxattr(d, kname, kvalue, size); + if (error > 0) { + if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) + posix_acl_fix_xattr_to_user(kvalue, size); + if (size && copy_to_user(value, kvalue, error)) + error = -EFAULT; + } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { + /* The file system tried to returned a value bigger + than XATTR_SIZE_MAX bytes. Not possible. */ + error = -E2BIG; + } + if (vvalue) + vfree(vvalue); + else + kfree(kvalue); + return error; +} + +static ssize_t path_getxattr(const char __user *pathname, + const char __user *name, void __user *value, + size_t size, unsigned int lookup_flags) +{ + struct path path; + ssize_t error; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + if (error) + return error; + error = getxattr(path.dentry, name, value, size); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} + +SYSCALL_DEFINE4(getxattr, const char __user *, pathname, + const char __user *, name, void __user *, value, size_t, size) +{ + return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW); +} + +SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, + const char __user *, name, void __user *, value, size_t, size) +{ + return path_getxattr(pathname, name, value, size, 0); +} + +SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, + void __user *, value, size_t, size) +{ + struct fd f = fdget(fd); + ssize_t error = -EBADF; + + if (!f.file) + return error; + audit_file(f.file); + error = getxattr(f.file->f_path.dentry, name, value, size); + fdput(f); + return error; +} + +/* + * Extended attribute LIST operations + */ +static ssize_t +listxattr(struct dentry *d, char __user *list, size_t size) +{ + ssize_t error; + char *klist = NULL; + char *vlist = NULL; /* If non-NULL, we used vmalloc() */ + + if (size) { + if (size > XATTR_LIST_MAX) + size = XATTR_LIST_MAX; + klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL); + if (!klist) { + vlist = vmalloc(size); + if (!vlist) + return -ENOMEM; + klist = vlist; + } + } + + error = vfs_listxattr(d, klist, size); + if (error > 0) { + if (size && copy_to_user(list, klist, error)) + error = -EFAULT; + } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { + /* The file system tried to returned a list bigger + than XATTR_LIST_MAX bytes. Not possible. */ + error = -E2BIG; + } + if (vlist) + vfree(vlist); + else + kfree(klist); + return error; +} + +static ssize_t path_listxattr(const char __user *pathname, char __user *list, + size_t size, unsigned int lookup_flags) +{ + struct path path; + ssize_t error; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + if (error) + return error; + error = listxattr(path.dentry, list, size); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} + +SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, + size_t, size) +{ + return path_listxattr(pathname, list, size, LOOKUP_FOLLOW); +} + +SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, + size_t, size) +{ + return path_listxattr(pathname, list, size, 0); +} + +SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) +{ + struct fd f = fdget(fd); + ssize_t error = -EBADF; + + if (!f.file) + return error; + audit_file(f.file); + error = listxattr(f.file->f_path.dentry, list, size); + fdput(f); + return error; +} + +/* + * Extended attribute REMOVE operations + */ +static long +removexattr(struct dentry *d, const char __user *name) +{ + int error; + char kname[XATTR_NAME_MAX + 1]; + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + return vfs_removexattr(d, kname); +} + +static int path_removexattr(const char __user *pathname, + const char __user *name, unsigned int lookup_flags) +{ + struct path path; + int error; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); + } + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} + +SYSCALL_DEFINE2(removexattr, const char __user *, pathname, + const char __user *, name) +{ + return path_removexattr(pathname, name, LOOKUP_FOLLOW); +} + +SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, + const char __user *, name) +{ + return path_removexattr(pathname, name, 0); +} + +SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) +{ + struct fd f = fdget(fd); + int error = -EBADF; + + if (!f.file) + return error; + audit_file(f.file); + error = mnt_want_write_file(f.file); + if (!error) { + error = removexattr(f.file->f_path.dentry, name); + mnt_drop_write_file(f.file); + } + fdput(f); + return error; +} + + +static const char * +strcmp_prefix(const char *a, const char *a_prefix) +{ + while (*a_prefix && *a == *a_prefix) { + a++; + a_prefix++; + } + return *a_prefix ? NULL : a; +} + +/* + * In order to implement different sets of xattr operations for each xattr + * prefix with the generic xattr API, a filesystem should create a + * null-terminated array of struct xattr_handler (one for each prefix) and + * hang a pointer to it off of the s_xattr field of the superblock. + * + * The generic_fooxattr() functions will use this list to dispatch xattr + * operations to the correct xattr_handler. + */ +#define for_each_xattr_handler(handlers, handler) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) + +/* + * Find the xattr_handler with the matching prefix. + */ +static const struct xattr_handler * +xattr_resolve_name(const struct xattr_handler **handlers, const char **name) +{ + const struct xattr_handler *handler; + + if (!*name) + return NULL; + + for_each_xattr_handler(handlers, handler) { + const char *n = strcmp_prefix(*name, handler->prefix); + if (n) { + *name = n; + break; + } + } + return handler; +} + +/* + * Find the handler for the prefix and dispatch its get() operation. + */ +ssize_t +generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) +{ + const struct xattr_handler *handler; + + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->get(dentry, name, buffer, size, handler->flags); +} + +/* + * Combine the results of the list() operation from every xattr_handler in the + * list. + */ +ssize_t +generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; + unsigned int size = 0; + + if (!buffer) { + for_each_xattr_handler(handlers, handler) { + size += handler->list(dentry, NULL, 0, NULL, 0, + handler->flags); + } + } else { + char *buf = buffer; + + for_each_xattr_handler(handlers, handler) { + size = handler->list(dentry, buf, buffer_size, + NULL, 0, handler->flags); + if (size > buffer_size) + return -ERANGE; + buf += size; + buffer_size -= size; + } + size = buf - buffer; + } + return size; +} + +/* + * Find the handler for the prefix and dispatch its set() operation. + */ +int +generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) +{ + const struct xattr_handler *handler; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->set(dentry, name, value, size, flags, handler->flags); +} + +/* + * Find the handler for the prefix and dispatch its set() operation to remove + * any associated extended attribute. + */ +int +generic_removexattr(struct dentry *dentry, const char *name) +{ + const struct xattr_handler *handler; + + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->set(dentry, name, NULL, 0, + XATTR_REPLACE, handler->flags); +} + +EXPORT_SYMBOL(generic_getxattr); +EXPORT_SYMBOL(generic_listxattr); +EXPORT_SYMBOL(generic_setxattr); +EXPORT_SYMBOL(generic_removexattr); + +/* + * Allocate new xattr and copy in the value; but leave the name to callers. + */ +struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) +{ + struct simple_xattr *new_xattr; + size_t len; + + /* wrap around? */ + len = sizeof(*new_xattr) + size; + if (len < sizeof(*new_xattr)) + return NULL; + + new_xattr = kmalloc(len, GFP_KERNEL); + if (!new_xattr) + return NULL; + + new_xattr->size = size; + memcpy(new_xattr->value, value, size); + return new_xattr; +} + +/* + * xattr GET operation for in-memory/pseudo filesystems + */ +int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, + void *buffer, size_t size) +{ + struct simple_xattr *xattr; + int ret = -ENODATA; + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + if (strcmp(name, xattr->name)) + continue; + + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, xattr->size); + } + break; + } + spin_unlock(&xattrs->lock); + return ret; +} + +static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) +{ + struct simple_xattr *xattr; + struct simple_xattr *new_xattr = NULL; + int err = 0; + + /* value == NULL means remove */ + if (value) { + new_xattr = simple_xattr_alloc(value, size); + if (!new_xattr) + return -ENOMEM; + + new_xattr->name = kstrdup(name, GFP_KERNEL); + if (!new_xattr->name) { + kfree(new_xattr); + return -ENOMEM; + } + } + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + if (!strcmp(name, xattr->name)) { + if (flags & XATTR_CREATE) { + xattr = new_xattr; + err = -EEXIST; + } else if (new_xattr) { + list_replace(&xattr->list, &new_xattr->list); + } else { + list_del(&xattr->list); + } + goto out; + } + } + if (flags & XATTR_REPLACE) { + xattr = new_xattr; + err = -ENODATA; + } else { + list_add(&new_xattr->list, &xattrs->head); + xattr = NULL; + } +out: + spin_unlock(&xattrs->lock); + if (xattr) { + kfree(xattr->name); + kfree(xattr); + } + return err; + +} + +/** + * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems + * @xattrs: target simple_xattr list + * @name: name of the new extended attribute + * @value: value of the new xattr. If %NULL, will remove the attribute + * @size: size of the new xattr + * @flags: %XATTR_{CREATE|REPLACE} + * + * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails + * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; + * otherwise, fails with -ENODATA. + * + * Returns 0 on success, -errno on failure. + */ +int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) +{ + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __simple_xattr_set(xattrs, name, value, size, flags); +} + +/* + * xattr REMOVE operation for in-memory/pseudo filesystems + */ +int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name) +{ + return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE); +} + +static bool xattr_is_trusted(const char *name) +{ + return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); +} + +/* + * xattr LIST operation for in-memory/pseudo filesystems + */ +ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, + size_t size) +{ + bool trusted = capable(CAP_SYS_ADMIN); + struct simple_xattr *xattr; + size_t used = 0; + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + size_t len; + + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; + + len = strlen(xattr->name) + 1; + used += len; + if (buffer) { + if (size < used) { + used = -ERANGE; + break; + } + memcpy(buffer, xattr->name, len); + buffer += len; + } + } + spin_unlock(&xattrs->lock); + + return used; +} + +/* + * Adds an extended attribute to the list + */ +void simple_xattr_list_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr) +{ + spin_lock(&xattrs->lock); + list_add(&new_xattr->list, &xattrs->head); + spin_unlock(&xattrs->lock); +} diff --git a/kernel/fs/xfs/Kconfig b/kernel/fs/xfs/Kconfig new file mode 100644 index 000000000..5d47b4df6 --- /dev/null +++ b/kernel/fs/xfs/Kconfig @@ -0,0 +1,97 @@ +config XFS_FS + tristate "XFS filesystem support" + depends on BLOCK + depends on (64BIT || LBDAF) + select EXPORTFS + select LIBCRC32C + help + XFS is a high performance journaling filesystem which originated + on the SGI IRIX platform. It is completely multi-threaded, can + support large files and large filesystems, extended attributes, + variable block sizes, is extent based, and makes extensive use of + Btrees (directories, extents, free space) to aid both performance + and scalability. + + Refer to the documentation at + for complete details. This implementation is on-disk compatible + with the IRIX version of XFS. + + To compile this file system support as a module, choose M here: the + module will be called xfs. Be aware, however, that if the file + system of your root partition is compiled as a module, you'll need + to use an initial ramdisk (initrd) to boot. + +config XFS_QUOTA + bool "XFS Quota support" + depends on XFS_FS + select QUOTACTL + help + If you say Y here, you will be able to set limits for disk usage on + a per user and/or a per group basis under XFS. XFS considers quota + information as filesystem metadata and uses journaling to provide a + higher level guarantee of consistency. The on-disk data format for + quota is also compatible with the IRIX version of XFS, allowing a + filesystem to be migrated between Linux and IRIX without any need + for conversion. + + If unsure, say N. More comprehensive documentation can be found in + README.quota in the xfsprogs package. XFS quota can be used either + with or without the generic quota support enabled (CONFIG_QUOTA) - + they are completely independent subsystems. + +config XFS_POSIX_ACL + bool "XFS POSIX ACL support" + depends on XFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. + +config XFS_RT + bool "XFS Realtime subvolume support" + depends on XFS_FS + help + If you say Y here you will be able to mount and use XFS filesystems + which contain a realtime subvolume. The realtime subvolume is a + separate area of disk space where only file data is stored. It was + originally designed to provide deterministic data rates suitable + for media streaming applications, but is also useful as a generic + mechanism for ensuring data and metadata/log I/Os are completely + separated. Regular file I/Os are isolated to a separate device + from all other requests, and this can be done quite transparently + to applications via the inherit-realtime directory inode flag. + + See the xfs man page in section 5 for additional information. + + If unsure, say N. + +config XFS_WARN + bool "XFS Verbose Warnings" + depends on XFS_FS && !XFS_DEBUG + help + Say Y here to get an XFS build with many additional warnings. + It converts ASSERT checks to WARN, so will log any out-of-bounds + conditions that occur that would otherwise be missed. It is much + lighter weight than XFS_DEBUG and does not modify algorithms and will + not cause the kernel to panic on non-fatal errors. + + However, similar to XFS_DEBUG, it is only advisable to use this if you + are debugging a particular problem. + +config XFS_DEBUG + bool "XFS Debugging support" + depends on XFS_FS + help + Say Y here to get an XFS build with many debugging features, + including ASSERT checks, function wrappers around macros, + and extra sanity-checking functions in various code paths. + + Note that the resulting code will be HUGE and SLOW, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. diff --git a/kernel/fs/xfs/Makefile b/kernel/fs/xfs/Makefile new file mode 100644 index 000000000..df6828570 --- /dev/null +++ b/kernel/fs/xfs/Makefile @@ -0,0 +1,124 @@ +# +# Copyright (c) 2000-2005 Silicon Graphics, Inc. +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# + +ccflags-y += -I$(src) # needed for trace events +ccflags-y += -I$(src)/libxfs + +ccflags-$(CONFIG_XFS_DEBUG) += -g + +obj-$(CONFIG_XFS_FS) += xfs.o + +# this one should be compiled first, as the tracing macros can easily blow up +xfs-y += xfs_trace.o + +# build the libxfs code first +xfs-y += $(addprefix libxfs/, \ + xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_leaf.o \ + xfs_attr_remote.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_da_btree.o \ + xfs_da_format.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_dquot_buf.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_inode_fork.o \ + xfs_inode_buf.o \ + xfs_log_rlimit.o \ + xfs_sb.o \ + xfs_symlink_remote.o \ + xfs_trans_resv.o \ + ) +# xfs_rtbitmap is shared with libxfs +xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ + xfs_rtbitmap.o \ + ) + +# highlevel code +xfs-y += xfs_aops.o \ + xfs_attr_inactive.o \ + xfs_attr_list.o \ + xfs_bit.o \ + xfs_bmap_util.o \ + xfs_buf.o \ + xfs_dir2_readdir.o \ + xfs_discard.o \ + xfs_error.o \ + xfs_export.o \ + xfs_extent_busy.o \ + xfs_file.o \ + xfs_filestream.o \ + xfs_fsops.o \ + xfs_globals.o \ + xfs_icache.o \ + xfs_ioctl.o \ + xfs_iomap.o \ + xfs_iops.o \ + xfs_inode.o \ + xfs_itable.o \ + xfs_message.o \ + xfs_mount.o \ + xfs_mru_cache.o \ + xfs_super.o \ + xfs_symlink.o \ + xfs_sysfs.o \ + xfs_trans.o \ + xfs_xattr.o \ + kmem.o \ + uuid.o + +# low-level transaction/log code +xfs-y += xfs_log.o \ + xfs_log_cil.o \ + xfs_buf_item.o \ + xfs_extfree_item.o \ + xfs_icreate_item.o \ + xfs_inode_item.o \ + xfs_log_recover.o \ + xfs_trans_ail.o \ + xfs_trans_buf.o \ + xfs_trans_extfree.o \ + xfs_trans_inode.o \ + +# optional features +xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o \ + xfs_quotaops.o + +# xfs_rtbitmap is shared with libxfs +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o + +xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o +xfs-$(CONFIG_PROC_FS) += xfs_stats.o +xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o +xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o diff --git a/kernel/fs/xfs/kmem.c b/kernel/fs/xfs/kmem.c new file mode 100644 index 000000000..a7a3a63bb --- /dev/null +++ b/kernel/fs/xfs/kmem.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include "kmem.h" +#include "xfs_message.h" + +/* + * Greedy allocation. May fail and may return vmalloced memory. + */ +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) +{ + void *ptr; + size_t kmsize = maxsize; + + while (!(ptr = vzalloc(kmsize))) { + if ((kmsize >>= 1) <= minsize) + kmsize = minsize; + } + if (ptr) + *size = kmsize; + return ptr; +} + +void * +kmem_alloc(size_t size, xfs_km_flags_t flags) +{ + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmalloc(size, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); +} + +void * +kmem_zalloc_large(size_t size, xfs_km_flags_t flags) +{ + unsigned noio_flag = 0; + void *ptr; + gfp_t lflags; + + ptr = kmem_zalloc(size, flags | KM_MAYFAIL); + if (ptr) + return ptr; + + /* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context + * here. Hence we need to tell memory reclaim that we are in such a + * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * the filesystem here and potentially deadlocking. + */ + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + noio_flag = memalloc_noio_save(); + + lflags = kmem_flags_convert(flags); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + memalloc_noio_restore(noio_flag); + + return ptr; +} + +void * +kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, + xfs_km_flags_t flags) +{ + void *new; + + new = kmem_alloc(newsize, flags); + if (ptr) { + if (new) + memcpy(new, ptr, + ((oldsize < newsize) ? oldsize : newsize)); + kmem_free(ptr); + } + return new; +} + +void * +kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) +{ + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmem_cache_alloc(zone, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); +} diff --git a/kernel/fs/xfs/kmem.h b/kernel/fs/xfs/kmem.h new file mode 100644 index 000000000..cc6b768fc --- /dev/null +++ b/kernel/fs/xfs/kmem.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_KMEM_H__ +#define __XFS_SUPPORT_KMEM_H__ + +#include +#include +#include +#include + +/* + * General memory allocation interfaces + */ + +typedef unsigned __bitwise xfs_km_flags_t; +#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) +#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) +#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) +#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) +#define KM_ZERO ((__force xfs_km_flags_t)0x0010u) + +/* + * We use a special process flag to avoid recursive callbacks into + * the filesystem during transactions. We will also issue our own + * warnings, so we explicitly skip any generic ones (silly of us). + */ +static inline gfp_t +kmem_flags_convert(xfs_km_flags_t flags) +{ + gfp_t lflags; + + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); + + if (flags & KM_NOSLEEP) { + lflags = GFP_ATOMIC | __GFP_NOWARN; + } else { + lflags = GFP_KERNEL | __GFP_NOWARN; + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + lflags &= ~__GFP_FS; + } + + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; + + return lflags; +} + +extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); +static inline void kmem_free(const void *ptr) +{ + kvfree(ptr); +} + + +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); + +static inline void * +kmem_zalloc(size_t size, xfs_km_flags_t flags) +{ + return kmem_alloc(size, flags | KM_ZERO); +} + +/* + * Zone interfaces + */ + +#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN +#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT +#define KM_ZONE_SPREAD SLAB_MEM_SPREAD + +#define kmem_zone kmem_cache +#define kmem_zone_t struct kmem_cache + +static inline kmem_zone_t * +kmem_zone_init(int size, char *zone_name) +{ + return kmem_cache_create(zone_name, size, 0, 0, NULL); +} + +static inline kmem_zone_t * +kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, + void (*construct)(void *)) +{ + return kmem_cache_create(zone_name, size, 0, flags, construct); +} + +static inline void +kmem_zone_free(kmem_zone_t *zone, void *ptr) +{ + kmem_cache_free(zone, ptr); +} + +static inline void +kmem_zone_destroy(kmem_zone_t *zone) +{ + if (zone) + kmem_cache_destroy(zone); +} + +extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); + +static inline void * +kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) +{ + return kmem_zone_alloc(zone, flags | KM_ZERO); +} + +#endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_alloc.c b/kernel/fs/xfs/libxfs/xfs_alloc.c new file mode 100644 index 000000000..516162be1 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_alloc.c @@ -0,0 +1,2639 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_extent_busy.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" + +struct workqueue_struct *xfs_alloc_wq; + +#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) + +#define XFSA_FIXUP_BNO_OK 1 +#define XFSA_FIXUP_CNT_OK 2 + +STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); + +/* + * Lookup the record equal to [bno, len] in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_alloc_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len) /* length of extent */ +{ + union xfs_btree_rec rec; + + rec.alloc.ar_startblock = cpu_to_be32(bno); + rec.alloc.ar_blockcount = cpu_to_be32(len); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + *bno = be32_to_cpu(rec->alloc.ar_startblock); + *len = be32_to_cpu(rec->alloc.ar_blockcount); + } + return error; +} + +/* + * Compute aligned version of the found extent. + * Takes alignment and min length into account. + */ +STATIC void +xfs_alloc_compute_aligned( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_agblock_t foundbno, /* starting block in found extent */ + xfs_extlen_t foundlen, /* length in found extent */ + xfs_agblock_t *resbno, /* result block number */ + xfs_extlen_t *reslen) /* result length */ +{ + xfs_agblock_t bno; + xfs_extlen_t len; + + /* Trim busy sections out of found extent */ + xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + + if (args->alignment > 1 && len >= args->minlen) { + xfs_agblock_t aligned_bno = roundup(bno, args->alignment); + xfs_extlen_t diff = aligned_bno - bno; + + *resbno = aligned_bno; + *reslen = diff >= len ? 0 : len - diff; + } else { + *resbno = bno; + *reslen = len; + } +} + +/* + * Compute best start block and diff for "near" allocations. + * freelen >= wantlen already checked by caller. + */ +STATIC xfs_extlen_t /* difference value (absolute) */ +xfs_alloc_compute_diff( + xfs_agblock_t wantbno, /* target starting block */ + xfs_extlen_t wantlen, /* target length */ + xfs_extlen_t alignment, /* target alignment */ + char userdata, /* are we allocating data? */ + xfs_agblock_t freebno, /* freespace's starting block */ + xfs_extlen_t freelen, /* freespace's length */ + xfs_agblock_t *newbnop) /* result: best start block from free */ +{ + xfs_agblock_t freeend; /* end of freespace extent */ + xfs_agblock_t newbno1; /* return block number */ + xfs_agblock_t newbno2; /* other new block number */ + xfs_extlen_t newlen1=0; /* length with newbno1 */ + xfs_extlen_t newlen2=0; /* length with newbno2 */ + xfs_agblock_t wantend; /* end of target extent */ + + ASSERT(freelen >= wantlen); + freeend = freebno + freelen; + wantend = wantbno + wantlen; + /* + * We want to allocate from the start of a free extent if it is past + * the desired block or if we are allocating user data and the free + * extent is before desired block. The second case is there to allow + * for contiguous allocation from the remaining free space if the file + * grows in the short term. + */ + if (freebno >= wantbno || (userdata && freeend < wantend)) { + if ((newbno1 = roundup(freebno, alignment)) >= freeend) + newbno1 = NULLAGBLOCK; + } else if (freeend >= wantend && alignment > 1) { + newbno1 = roundup(wantbno, alignment); + newbno2 = newbno1 - alignment; + if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + else + newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); + if (newbno2 < freebno) + newbno2 = NULLAGBLOCK; + else + newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); + if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { + if (newlen1 < newlen2 || + (newlen1 == newlen2 && + XFS_ABSDIFF(newbno1, wantbno) > + XFS_ABSDIFF(newbno2, wantbno))) + newbno1 = newbno2; + } else if (newbno2 != NULLAGBLOCK) + newbno1 = newbno2; + } else if (freeend >= wantend) { + newbno1 = wantbno; + } else if (alignment > 1) { + newbno1 = roundup(freeend - wantlen, alignment); + if (newbno1 > freeend - wantlen && + newbno1 - alignment >= freebno) + newbno1 -= alignment; + else if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + } else + newbno1 = freeend - wantlen; + *newbnop = newbno1; + return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); +} + +/* + * Fix up the length, based on mod and prod. + * len should be k * prod + mod for some k. + * If len is too small it is returned unchanged. + * If len hits maxlen it is left alone. + */ +STATIC void +xfs_alloc_fix_len( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_extlen_t k; + xfs_extlen_t rlen; + + ASSERT(args->mod < args->prod); + rlen = args->len; + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || + (args->mod == 0 && rlen < args->prod)) + return; + k = rlen % args->prod; + if (k == args->mod) + return; + if (k > args->mod) + rlen = rlen - (k - args->mod); + else + rlen = rlen - args->prod + (args->mod - k); + /* casts to (int) catch length underflows */ + if ((int)rlen < (int)args->minlen) + return; + ASSERT(rlen >= args->minlen && rlen <= args->maxlen); + ASSERT(rlen % args->prod == args->mod); + args->len = rlen; +} + +/* + * Fix up length if there is too little space left in the a.g. + * Return 1 if ok, 0 if too little, should give up. + */ +STATIC int +xfs_alloc_fix_minleft( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agf_t *agf; /* a.g. freelist header */ + int diff; /* free space difference */ + + if (args->minleft == 0) + return 1; + agf = XFS_BUF_TO_AGF(args->agbp); + diff = be32_to_cpu(agf->agf_freeblks) + - args->len - args->minleft; + if (diff >= 0) + return 1; + args->len += diff; /* shrink the allocated space */ + /* casts to (int) catch length underflows */ + if ((int)args->len >= (int)args->minlen) + return 1; + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Update the two btrees, logically removing from freespace the extent + * starting at rbno, rlen blocks. The extent is contained within the + * actual (current) free extent fbno for flen blocks. + * Flags are passed in indicating whether the cursors are set to the + * relevant records. + */ +STATIC int /* error code */ +xfs_alloc_fixup_trees( + xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ + xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ + xfs_agblock_t fbno, /* starting block of free extent */ + xfs_extlen_t flen, /* length of free extent */ + xfs_agblock_t rbno, /* starting block of returned extent */ + xfs_extlen_t rlen, /* length of returned extent */ + int flags) /* flags, XFSA_FIXUP_... */ +{ + int error; /* error code */ + int i; /* operation results */ + xfs_agblock_t nfbno1; /* first new free startblock */ + xfs_agblock_t nfbno2; /* second new free startblock */ + xfs_extlen_t nflen1=0; /* first new free length */ + xfs_extlen_t nflen2=0; /* second new free length */ + struct xfs_mount *mp; + + mp = cnt_cur->bc_mp; + + /* + * Look up the record in the by-size tree if necessary. + */ + if (flags & XFSA_FIXUP_CNT_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + } + /* + * Look up the record in the by-block tree if necessary. + */ + if (flags & XFSA_FIXUP_BNO_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + } + +#ifdef DEBUG + if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { + struct xfs_btree_block *bnoblock; + struct xfs_btree_block *cntblock; + + bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); + cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); + + XFS_WANT_CORRUPTED_RETURN(mp, + bnoblock->bb_numrecs == cntblock->bb_numrecs); + } +#endif + + /* + * Deal with all four cases: the allocated record is contained + * within the freespace record, so we can have new freespace + * at either (or both) end, or no freespace remaining. + */ + if (rbno == fbno && rlen == flen) + nfbno1 = nfbno2 = NULLAGBLOCK; + else if (rbno == fbno) { + nfbno1 = rbno + rlen; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else if (rbno + rlen == fbno + flen) { + nfbno1 = fbno; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else { + nfbno1 = fbno; + nflen1 = rbno - fbno; + nfbno2 = rbno + rlen; + nflen2 = (fbno + flen) - nfbno2; + } + /* + * Delete the entry from the by-size btree. + */ + if ((error = xfs_btree_delete(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + /* + * Add new by-size btree entry(s). + */ + if (nfbno1 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + } + if (nfbno2 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + } + /* + * Fix up the by-block btree entry(s). + */ + if (nfbno1 == NULLAGBLOCK) { + /* + * No remaining freespace, just delete the by-block tree entry. + */ + if ((error = xfs_btree_delete(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + } else { + /* + * Update the by-block entry to start later|be shorter. + */ + if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) + return error; + } + if (nfbno2 != NULLAGBLOCK) { + /* + * 2 resulting free entries, need to add one. + */ + if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if ((error = xfs_btree_insert(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + } + return 0; +} + +static bool +xfs_agfl_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + int i; + + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + return false; + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + return false; + + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && + be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + return false; + } + return true; +} + +static void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* + * There is no verification of non-crc AGFLs because mkfs does not + * initialise the AGFL to zero or NULL. Hence the only valid part of the + * AGFL is what the AGF says is active. We can't get to the AGF, so we + * can't verify just those entries are valid. + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_agfl_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_agfl_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc AGFLs */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_agfl_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (bip) + XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); +} + +const struct xfs_buf_ops xfs_agfl_buf_ops = { + .verify_read = xfs_agfl_read_verify, + .verify_write = xfs_agfl_write_verify, +}; + +/* + * Read in the allocation group free block array. + */ +STATIC int /* error */ +xfs_alloc_read_agfl( + xfs_mount_t *mp, /* mount point structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_buf_t **bpp) /* buffer for the ag free block array */ +{ + xfs_buf_t *bp; /* return value */ + int error; + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); + if (error) + return error; + xfs_buf_set_ref(bp, XFS_AGFL_REF); + *bpp = bp; + return 0; +} + +STATIC int +xfs_alloc_update_counters( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agbp, + long len) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + + pag->pagf_freeblks += len; + be32_add_cpu(&agf->agf_freeblks, len); + + xfs_trans_agblocks_delta(tp, len); + if (unlikely(be32_to_cpu(agf->agf_freeblks) > + be32_to_cpu(agf->agf_length))) + return -EFSCORRUPTED; + + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); + return 0; +} + +/* + * Allocation group level functions. + */ + +/* + * Allocate a variable extent in the allocation group agno. + * Type and bno are used to determine where in the allocation group the + * extent will start. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent( + xfs_alloc_arg_t *args) /* argument structure for allocation */ +{ + int error=0; + + ASSERT(args->minlen > 0); + ASSERT(args->maxlen > 0); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->mod < args->prod); + ASSERT(args->alignment > 0); + /* + * Branch to correct routine based on the type. + */ + args->wasfromfl = 0; + switch (args->type) { + case XFS_ALLOCTYPE_THIS_AG: + error = xfs_alloc_ag_vextent_size(args); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_alloc_ag_vextent_near(args); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_alloc_ag_vextent_exact(args); + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + + if (error || args->agbno == NULLAGBLOCK) + return error; + + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(!args->wasfromfl || !args->isfl); + ASSERT(args->agbno % args->alignment == 0); + + if (!args->wasfromfl) { + error = xfs_alloc_update_counters(args->tp, args->pag, + args->agbp, + -((long)(args->len))); + if (error) + return error; + + ASSERT(!xfs_extent_busy_search(args->mp, args->agno, + args->agbno, args->len)); + } + + if (!args->isfl) { + xfs_trans_mod_sb(args->tp, args->wasdel ? + XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS, + -((long)(args->len))); + } + + XFS_STATS_INC(xs_allocx); + XFS_STATS_ADD(xs_allocb, args->len); + return error; +} + +/* + * Allocate a variable extent at exactly agno/bno. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_exact( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ + xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ + int error; + xfs_agblock_t fbno; /* start block of found extent */ + xfs_extlen_t flen; /* length of found extent */ + xfs_agblock_t tbno; /* start block of trimmed extent */ + xfs_extlen_t tlen; /* length of trimmed extent */ + xfs_agblock_t tend; /* end block of trimmed extent */ + int i; /* success/failure of operation */ + + ASSERT(args->alignment == 1); + + /* + * Allocate/initialize a cursor for the by-number freespace btree. + */ + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + + /* + * Lookup bno and minlen in the btree (minlen is irrelevant, really). + * Look for the closest free block <= bno, it must contain bno + * if any free block does. + */ + error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); + if (error) + goto error0; + if (!i) + goto not_found; + + /* + * Grab the freespace record. + */ + error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + ASSERT(fbno <= args->agbno); + + /* + * Check for overlapping busy extents. + */ + xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); + + /* + * Give up if the start of the extent is busy, or the freespace isn't + * long enough for the minimum request. + */ + if (tbno > args->agbno) + goto not_found; + if (tlen < args->minlen) + goto not_found; + tend = tbno + tlen; + if (tend < args->agbno + args->minlen) + goto not_found; + + /* + * End of extent will be smaller of the freespace end and the + * maximal requested end. + * + * Fix the length according to mod and prod if given. + */ + args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) + - args->agbno; + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) + goto not_found; + + ASSERT(args->agbno + args->len <= tend); + + /* + * We are allocating agbno for args->len + * Allocate/initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + ASSERT(args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, + args->len, XFSA_FIXUP_BNO_OK); + if (error) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + goto error0; + } + + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + + args->wasfromfl = 0; + trace_xfs_alloc_exact_done(args); + return 0; + +not_found: + /* Didn't find it, return null. */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_exact_notfound(args); + return 0; + +error0: + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + trace_xfs_alloc_exact_error(args); + return error; +} + +/* + * Search the btree in a given direction via the search cursor and compare + * the records found against the good extent we've already found. + */ +STATIC int +xfs_alloc_find_best_extent( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur **gcur, /* good cursor */ + struct xfs_btree_cur **scur, /* searching cursor */ + xfs_agblock_t gdiff, /* difference for search comparison */ + xfs_agblock_t *sbno, /* extent found by search */ + xfs_extlen_t *slen, /* extent length */ + xfs_agblock_t *sbnoa, /* aligned extent found by search */ + xfs_extlen_t *slena, /* aligned extent length */ + int dir) /* 0 = search right, 1 = search left */ +{ + xfs_agblock_t new; + xfs_agblock_t sdiff; + int error; + int i; + + /* The good extent is perfect, no need to search. */ + if (!gdiff) + goto out_use_good; + + /* + * Look until we find a better one, run out of space or run off the end. + */ + do { + error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); + + /* + * The good extent is closer than this one. + */ + if (!dir) { + if (*sbnoa >= args->agbno + gdiff) + goto out_use_good; + } else { + if (*sbnoa <= args->agbno - gdiff) + goto out_use_good; + } + + /* + * Same distance, compare length and pick the best. + */ + if (*slena >= args->minlen) { + args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); + xfs_alloc_fix_len(args); + + sdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, + args->userdata, *sbnoa, + *slena, &new); + + /* + * Choose closer size and invalidate other cursor. + */ + if (sdiff < gdiff) + goto out_use_search; + goto out_use_good; + } + + if (!dir) + error = xfs_btree_increment(*scur, 0, &i); + else + error = xfs_btree_decrement(*scur, 0, &i); + if (error) + goto error0; + } while (i); + +out_use_good: + xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); + *scur = NULL; + return 0; + +out_use_search: + xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); + *gcur = NULL; + return 0; + +error0: + /* caller invalidates cursors */ + return error; +} + +/* + * Allocate a variable extent near bno in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_near( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ + xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ + xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ + xfs_agblock_t gtbno; /* start bno of right side entry */ + xfs_agblock_t gtbnoa; /* aligned ... */ + xfs_extlen_t gtdiff; /* difference to right side entry */ + xfs_extlen_t gtlen; /* length of right side entry */ + xfs_extlen_t gtlena; /* aligned ... */ + xfs_agblock_t gtnew; /* useful start bno of right side */ + int error; /* error code */ + int i; /* result code, temporary */ + int j; /* result code, temporary */ + xfs_agblock_t ltbno; /* start bno of left side entry */ + xfs_agblock_t ltbnoa; /* aligned ... */ + xfs_extlen_t ltdiff; /* difference to left side entry */ + xfs_extlen_t ltlen; /* length of left side entry */ + xfs_extlen_t ltlena; /* aligned ... */ + xfs_agblock_t ltnew; /* useful start bno of left side */ + xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; +#ifdef DEBUG + /* + * Randomly don't execute the first algorithm. + */ + int dofirst; /* set to do first algorithm */ + + dofirst = prandom_u32() & 1; +#endif + +restart: + bno_cur_lt = NULL; + bno_cur_gt = NULL; + ltlen = 0; + gtlena = 0; + ltlena = 0; + + /* + * Get a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + + /* + * See if there are any free extents as big as maxlen. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) + goto error0; + /* + * If none, then pick up the last entry in the tree unless the + * tree is empty. + */ + if (!i) { + if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, <bno, + <len, &i))) + goto error0; + if (i == 0 || ltlen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_noentry(args); + return 0; + } + ASSERT(i == 1); + } + args->wasfromfl = 0; + + /* + * First algorithm. + * If the requested extent is large wrt the freespaces available + * in this a.g., then the cursor will be pointing to a btree entry + * near the right edge of the tree. If it's in the last btree leaf + * block, then we just examine all the entries in that block + * that are big enough, and pick the best one. + * This is written as a while loop so we can break out of it, + * but we never loop back to the top. + */ + while (xfs_btree_islastblock(cnt_cur, 0)) { + xfs_extlen_t bdiff; + int besti=0; + xfs_extlen_t blen=0; + xfs_agblock_t bnew=0; + +#ifdef DEBUG + if (dofirst) + break; +#endif + /* + * Start from the entry that lookup found, sequence through + * all larger free blocks. If we're actually pointing at a + * record smaller than maxlen, go to the start of this block, + * and skip all those smaller than minlen. + */ + if (ltlen || args->alignment > 1) { + cnt_cur->bc_ptrs[0] = 1; + do { + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (ltlen >= args->minlen) + break; + if ((error = xfs_btree_increment(cnt_cur, 0, &i))) + goto error0; + } while (i); + ASSERT(ltlen >= args->minlen); + if (!i) + break; + } + i = cnt_cur->bc_ptrs[0]; + for (j = 1, blen = 0, bdiff = 0; + !error && j && (blen < args->maxlen || bdiff > 0); + error = xfs_btree_increment(cnt_cur, 0, &j)) { + /* + * For each entry, decide if it's better than + * the previous best entry. + */ + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena < args->minlen) + continue; + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ASSERT(args->len >= args->minlen); + if (args->len < blen) + continue; + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->userdata, ltbnoa, + ltlena, <new); + if (ltnew != NULLAGBLOCK && + (args->len > blen || ltdiff < bdiff)) { + bdiff = ltdiff; + bnew = ltnew; + blen = args->len; + besti = cnt_cur->bc_ptrs[0]; + } + } + /* + * It didn't work. We COULD be in a case where + * there's a good record somewhere, so try again. + */ + if (blen == 0) + break; + /* + * Point at the best entry, and retrieve it again. + */ + cnt_cur->bc_ptrs[0] = besti; + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->len = blen; + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_nominleft(args); + return 0; + } + blen = args->len; + /* + * We are allocating starting at bnew for blen blocks. + */ + args->agbno = bnew; + ASSERT(bnew >= ltbno); + ASSERT(bnew + blen <= ltbno + ltlen); + /* + * Set up a cursor for the by-bno tree. + */ + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); + /* + * Fix up the btree entries. + */ + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, + ltlen, bnew, blen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + + trace_xfs_alloc_near_first(args); + return 0; + } + /* + * Second algorithm. + * Search in the by-bno tree to the left and to the right + * simultaneously, until in each case we find a space big enough, + * or run into the edge of the tree. When we run into the edge, + * we deallocate that cursor. + * If both searches succeed, we compare the two spaces and pick + * the better one. + * With alignment, it's possible for both to fail; the upper + * level algorithm that picks allocation groups for allocations + * is not supposed to do this. + */ + /* + * Allocate and initialize the cursor for the leftward search. + */ + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + /* + * Lookup <= bno to find the leftward search's starting point. + */ + if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i))) + goto error0; + if (!i) { + /* + * Didn't find anything; use this cursor for the rightward + * search. + */ + bno_cur_gt = bno_cur_lt; + bno_cur_lt = NULL; + } + /* + * Found something. Duplicate the cursor for the rightward search. + */ + else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt))) + goto error0; + /* + * Increment the cursor, so we will point at the entry just right + * of the leftward entry if any, or to the leftmost entry. + */ + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + /* + * It failed, there are no rightward entries. + */ + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Loop going left with the leftward cursor, right with the + * rightward cursor, until either both directions give up or + * we find an entry at least as big as minlen. + */ + do { + if (bno_cur_lt) { + if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena >= args->minlen) + break; + if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + } + if (bno_cur_gt) { + if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + xfs_alloc_compute_aligned(args, gtbno, gtlen, + >bnoa, >lena); + if (gtlena >= args->minlen) + break; + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + } while (bno_cur_lt || bno_cur_gt); + + /* + * Got both cursors still active, need to find better entry. + */ + if (bno_cur_lt && bno_cur_gt) { + if (ltlena >= args->minlen) { + /* + * Left side is good, look for a right side entry. + */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->userdata, ltbnoa, + ltlena, <new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_lt, &bno_cur_gt, + ltdiff, >bno, >len, + >bnoa, >lena, + 0 /* search right */); + } else { + ASSERT(gtlena >= args->minlen); + + /* + * Right side is good, look for a left side entry. + */ + args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); + xfs_alloc_fix_len(args); + gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->userdata, gtbnoa, + gtlena, >new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_gt, &bno_cur_lt, + gtdiff, <bno, <len, + <bnoa, <lena, + 1 /* search left */); + } + + if (error) + goto error0; + } + + /* + * If we couldn't get anything, give up. + */ + if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + + if (!forced++) { + trace_xfs_alloc_near_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + trace_xfs_alloc_size_neither(args); + args->agbno = NULLAGBLOCK; + return 0; + } + + /* + * At this point we have selected a freespace entry, either to the + * left or to the right. If it's on the right, copy all the + * useful variables to the "left" set so we only have one + * copy of this code. + */ + if (bno_cur_gt) { + bno_cur_lt = bno_cur_gt; + bno_cur_gt = NULL; + ltbno = gtbno; + ltbnoa = gtbnoa; + ltlen = gtlen; + ltlena = gtlena; + j = 1; + } else + j = 0; + + /* + * Fix up the length and compute the useful address. + */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) { + trace_xfs_alloc_near_nominleft(args); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + return 0; + } + rlen = args->len; + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, + args->userdata, ltbnoa, ltlena, <new); + ASSERT(ltnew >= ltbno); + ASSERT(ltnew + rlen <= ltbnoa + ltlena); + ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->agbno = ltnew; + + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, + ltnew, rlen, XFSA_FIXUP_BNO_OK))) + goto error0; + + if (j) + trace_xfs_alloc_near_greater(args); + else + trace_xfs_alloc_near_lesser(args); + + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + return 0; + + error0: + trace_xfs_alloc_near_error(args); + if (cnt_cur != NULL) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur_lt != NULL) + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR); + if (bno_cur_gt != NULL) + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR); + return error; +} + +/* + * Allocate a variable extent anywhere in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_size( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ + int error; /* error result */ + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ + int i; /* temp status variable */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; + +restart: + /* + * Allocate and initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + bno_cur = NULL; + + /* + * Look for an entry >= maxlen+alignment-1 blocks. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, + args->maxlen + args->alignment - 1, &i))) + goto error0; + + /* + * If none or we have busy extents that we cannot allocate from, then + * we have to settle for a smaller extent. In the case that there are + * no large extents, this will return the last entry in the tree unless + * the tree is empty. In the case that there are only busy large + * extents, this will return the largest small extent unless there + * are no smaller extents available. + */ + if (!i || forced > 1) { + error = xfs_alloc_ag_vextent_small(args, cnt_cur, + &fbno, &flen, &i); + if (error) + goto error0; + if (i == 0 || flen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_noentry(args); + return 0; + } + ASSERT(i == 1); + xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + } else { + /* + * Search for a non-busy extent that is large enough. + * If we are at low space, don't check, or if we fall of + * the end of the btree, turn off the busy check and + * restart. + */ + for (;;) { + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + + if (rlen >= args->maxlen) + break; + + error = xfs_btree_increment(cnt_cur, 0, &i); + if (error) + goto error0; + if (i == 0) { + /* + * Our only valid extents must have been busy. + * Make it unbusy by forcing the log out and + * retrying. If we've been here before, forcing + * the log isn't making the extents available, + * which means they have probably been freed in + * this transaction. In that case, we have to + * give up on them and we'll attempt a minlen + * allocation the next time around. + */ + xfs_btree_del_cursor(cnt_cur, + XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + if (!forced++) + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + } + } + + /* + * In the first case above, we got the last entry in the + * by-size btree. Now we check to see if the space hits maxlen + * once aligned; if not, we search left for something better. + * This can't happen in the second case above. + */ + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (rlen < args->maxlen) { + xfs_agblock_t bestfbno; + xfs_extlen_t bestflen; + xfs_agblock_t bestrbno; + xfs_extlen_t bestrlen; + + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + for (;;) { + if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) + goto error0; + if (i == 0) + break; + if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (flen < bestrlen) + break; + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), + error0); + if (rlen > bestrlen) { + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + if (rlen == args->maxlen) + break; + } + } + if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + rlen = bestrlen; + rbno = bestrbno; + flen = bestflen; + fbno = bestfbno; + } + args->wasfromfl = 0; + /* + * Fix up the length. + */ + args->len = rlen; + if (rlen < args->minlen) { + if (!forced++) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + goto out_nominleft; + } + xfs_alloc_fix_len(args); + + if (!xfs_alloc_fix_minleft(args)) + goto out_nominleft; + rlen = args->len; + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); + /* + * Allocate and initialize a cursor for the by-block tree. + */ + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + rbno, rlen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + cnt_cur = bno_cur = NULL; + args->len = rlen; + args->agbno = rbno; + XFS_WANT_CORRUPTED_GOTO(args->mp, + args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error0); + trace_xfs_alloc_size_done(args); + return 0; + +error0: + trace_xfs_alloc_size_error(args); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + return error; + +out_nominleft: + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_nominleft(args); + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Deal with the case where only small freespaces remain. + * Either return the contents of the last freespace record, + * or allocate space from the freelist if there is nothing in the tree. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_small( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_btree_cur_t *ccur, /* by-size cursor */ + xfs_agblock_t *fbnop, /* result block number */ + xfs_extlen_t *flenp, /* result length */ + int *stat) /* status: 0-freelist, 1-normal/none */ +{ + int error; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int i; + + if ((error = xfs_btree_decrement(ccur, 0, &i))) + goto error0; + if (i) { + if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + } + /* + * Nothing in the btree, try the freelist. Make sure + * to respect minleft even when pulling from the + * freelist. + */ + else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) + > args->minleft)) { + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + if (error) + goto error0; + if (fbno != NULLAGBLOCK) { + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, + args->userdata); + + if (args->userdata) { + xfs_buf_t *bp; + + bp = xfs_btree_get_bufs(args->mp, args->tp, + args->agno, fbno, 0); + xfs_trans_binval(args->tp, bp); + } + args->len = 1; + args->agbno = fbno; + XFS_WANT_CORRUPTED_GOTO(args->mp, + args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error0); + args->wasfromfl = 1; + trace_xfs_alloc_small_freelist(args); + *stat = 0; + return 0; + } + /* + * Nothing in the freelist. + */ + else + flen = 0; + } + /* + * Can't allocate from the freelist for some reason. + */ + else { + fbno = NULLAGBLOCK; + flen = 0; + } + /* + * Can't do the allocation, give up. + */ + if (flen < args->minlen) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_small_notenough(args); + flen = 0; + } + *fbnop = fbno; + *flenp = flen; + *stat = 1; + trace_xfs_alloc_small_done(args); + return 0; + +error0: + trace_xfs_alloc_small_error(args); + return error; +} + +/* + * Free the extent starting at agno/bno for length. + */ +STATIC int /* error */ +xfs_free_ag_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. freelist header */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t bno, /* starting block number */ + xfs_extlen_t len, /* length of extent */ + int isfl) /* set if is freelist blocks - no sb acctg */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ + int error; /* error return value */ + xfs_agblock_t gtbno; /* start of right neighbor block */ + xfs_extlen_t gtlen; /* length of right neighbor block */ + int haveleft; /* have a left neighbor block */ + int haveright; /* have a right neighbor block */ + int i; /* temp, result code */ + xfs_agblock_t ltbno; /* start of left neighbor block */ + xfs_extlen_t ltlen; /* length of left neighbor block */ + xfs_mount_t *mp; /* mount point struct for filesystem */ + xfs_agblock_t nbno; /* new starting block of freespace */ + xfs_extlen_t nlen; /* new length of freespace */ + xfs_perag_t *pag; /* per allocation group data */ + + mp = tp->t_mountp; + /* + * Allocate and initialize a cursor for the by-block btree. + */ + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); + cnt_cur = NULL; + /* + * Look for a neighboring block on the left (lower block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) + goto error0; + if (haveleft) { + /* + * There is a block to our left. + */ + if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * It's not contiguous, though. + */ + if (ltbno + ltlen < bno) + haveleft = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(mp, + ltbno + ltlen <= bno, error0); + } + } + /* + * Look for a neighboring block on the right (higher block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) + goto error0; + if (haveright) { + /* + * There is a block to our right. + */ + if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * It's not contiguous, though. + */ + if (bno + len < gtbno) + haveright = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0); + } + } + /* + * Now allocate and initialize a cursor for the by-size tree. + */ + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); + /* + * Have both left and right contiguous neighbors. + * Merge all three into a single free block. + */ + if (haveleft && haveright) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Delete the old by-block entry for the right block. + */ + if ((error = xfs_btree_delete(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Move the by-block cursor back to the left neighbor. + */ + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); +#ifdef DEBUG + /* + * Check that this is the right record: delete didn't + * mangle the cursor. + */ + { + xfs_agblock_t xxbno; + xfs_extlen_t xxlen; + + if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, + i == 1 && xxbno == ltbno && xxlen == ltlen, + error0); + } +#endif + /* + * Update remaining by-block entry to the new, joined block. + */ + nbno = ltbno; + nlen = len + ltlen + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a left contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveleft) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Back up the by-block cursor to the left neighbor, and + * update its length. + */ + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + nbno = ltbno; + nlen = len + ltlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a right contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveright) { + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Update the starting block and length of the right + * neighbor in the by-block tree. + */ + nbno = bno; + nlen = len + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * No contiguous neighbors. + * Insert the new freespace into the by-block tree. + */ + else { + nbno = bno; + nlen = len; + if ((error = xfs_btree_insert(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + bno_cur = NULL; + /* + * In all cases we need to insert the new freespace in the by-size tree. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + cnt_cur = NULL; + + /* + * Update the freespace totals in the ag and superblock. + */ + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_perag_put(pag); + if (error) + goto error0; + + if (!isfl) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); + XFS_STATS_INC(xs_freex); + XFS_STATS_ADD(xs_freeb, len); + + trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + + return 0; + + error0: + trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Visible (exported) allocation/free functions. + * Some of these are used just by xfs_alloc_btree.c and this file. + */ + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_ag_maxlevels = level; +} + +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + xfs_extlen_t need, delta = 0; + + need = XFS_MIN_FREELIST_PAG(pag, mp); + if (need > pag->pagf_flcount) + delta = need - pag->pagf_flcount; + + if (pag->pagf_longest > delta) + return pag->pagf_longest - delta; + return pag->pagf_flcount > 0 || pag->pagf_longest > 0; +} + +/* + * Decide whether to use this allocation group for this allocation. + * If so, fix up the btree freelist's size. + */ +STATIC int /* error */ +xfs_alloc_fix_freelist( + xfs_alloc_arg_t *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ +{ + xfs_buf_t *agbp; /* agf buffer pointer */ + xfs_agf_t *agf; /* a.g. freespace structure pointer */ + xfs_buf_t *agflbp;/* agfl buffer pointer */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t delta; /* new blocks needed in freelist */ + int error; /* error result code */ + xfs_extlen_t longest;/* longest extent in allocation group */ + xfs_mount_t *mp; /* file system mount point structure */ + xfs_extlen_t need; /* total blocks needed in freelist */ + xfs_perag_t *pag; /* per-ag information structure */ + xfs_alloc_arg_t targs; /* local allocation arguments */ + xfs_trans_t *tp; /* transaction pointer */ + + mp = args->mp; + + pag = args->pag; + tp = args->tp; + if (!pag->pagf_init) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (!pag->pagf_init) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + } else + agbp = NULL; + + /* + * If this is a metadata preferred pag and we are user data + * then try somewhere else if we are not being asked to + * try harder at this point + */ + if (pag->pagf_metadata && args->userdata && + (flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + /* + * If it looks like there isn't a long enough extent, or enough + * total blocks, reject it. + */ + need = XFS_MIN_FREELIST_PAG(pag, mp); + longest = xfs_alloc_longest_free_extent(mp, pag); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(pag->pagf_freeblks + pag->pagf_flcount - + need - args->total) < (int)args->minleft)) { + if (agbp) + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + } + + /* + * Get the a.g. freespace buffer. + * Can fail if we're not blocking on locks, and it's held. + */ + if (agbp == NULL) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (agbp == NULL) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + } + /* + * Figure out how many blocks we should have in the freelist. + */ + agf = XFS_BUF_TO_AGF(agbp); + need = XFS_MIN_FREELIST(agf, mp); + /* + * If there isn't enough total or single-extent, reject it. + */ + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + delta = need > be32_to_cpu(agf->agf_flcount) ? + (need - be32_to_cpu(agf->agf_flcount)) : 0; + longest = be32_to_cpu(agf->agf_longest); + longest = (longest > delta) ? (longest - delta) : + (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(be32_to_cpu(agf->agf_freeblks) + + be32_to_cpu(agf->agf_flcount) - need - args->total) < + (int)args->minleft)) { + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + } + /* + * Make the freelist shorter if it's too long. + */ + while (be32_to_cpu(agf->agf_flcount) > need) { + xfs_buf_t *bp; + + error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + if (error) + return error; + if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) + return error; + bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + xfs_trans_binval(tp, bp); + } + /* + * Initialize the args structure. + */ + memset(&targs, 0, sizeof(targs)); + targs.tp = tp; + targs.mp = mp; + targs.agbp = agbp; + targs.agno = args->agno; + targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.type = XFS_ALLOCTYPE_THIS_AG; + targs.pag = pag; + if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) + return error; + /* + * Make the freelist longer if it's too short. + */ + while (be32_to_cpu(agf->agf_flcount) < need) { + targs.agbno = 0; + targs.maxlen = need - be32_to_cpu(agf->agf_flcount); + /* + * Allocate as many blocks as possible at once. + */ + if ((error = xfs_alloc_ag_vextent(&targs))) { + xfs_trans_brelse(tp, agflbp); + return error; + } + /* + * Stop if we run out. Won't happen if callers are obeying + * the restrictions correctly. Can happen for free calls + * on a completely full ag. + */ + if (targs.agbno == NULLAGBLOCK) { + if (flags & XFS_ALLOC_FLAG_FREEING) + break; + xfs_trans_brelse(tp, agflbp); + args->agbp = NULL; + return 0; + } + /* + * Put each allocated block on the list. + */ + for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { + error = xfs_alloc_put_freelist(tp, agbp, + agflbp, bno, 0); + if (error) + return error; + } + } + xfs_trans_brelse(tp, agflbp); + args->agbp = agbp; + return 0; +} + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk) /* destination is a AGF btree */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ + xfs_agblock_t bno; /* block number returned */ + __be32 *agfl_bno; + int error; + int logflags; + xfs_mount_t *mp = tp->t_mountp; + xfs_perag_t *pag; /* per allocation group data */ + + /* + * Freelist is empty, give up. + */ + agf = XFS_BUF_TO_AGF(agbp); + if (!agf->agf_flcount) { + *bnop = NULLAGBLOCK; + return 0; + } + /* + * Read the array of free blocks. + */ + error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), + &agflbp); + if (error) + return error; + + + /* + * Get the block number and update the data structures. + */ + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + be32_add_cpu(&agf->agf_flfirst, 1); + xfs_trans_brelse(tp, agflbp); + if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) + agf->agf_flfirst = 0; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, -1); + xfs_trans_agflist_delta(tp, -1); + pag->pagf_flcount--; + xfs_perag_put(pag); + + logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, 1); + pag->pagf_btreeblks++; + logflags |= XFS_AGF_BTREEBLKS; + } + + xfs_alloc_log_agf(tp, agbp, logflags); + *bnop = bno; + + return 0; +} + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer for a.g. freelist header */ + int fields) /* mask of fields to be logged (XFS_AGF_...) */ +{ + int first; /* first byte offset */ + int last; /* last byte offset */ + static const short offsets[] = { + offsetof(xfs_agf_t, agf_magicnum), + offsetof(xfs_agf_t, agf_versionnum), + offsetof(xfs_agf_t, agf_seqno), + offsetof(xfs_agf_t, agf_length), + offsetof(xfs_agf_t, agf_roots[0]), + offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_flfirst), + offsetof(xfs_agf_t, agf_fllast), + offsetof(xfs_agf_t, agf_flcount), + offsetof(xfs_agf_t, agf_freeblks), + offsetof(xfs_agf_t, agf_longest), + offsetof(xfs_agf_t, agf_btreeblks), + offsetof(xfs_agf_t, agf_uuid), + sizeof(xfs_agf_t) + }; + + trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); + + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); +} + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags) /* XFS_ALLOC_FLAGS_... */ +{ + xfs_buf_t *bp; + int error; + + if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. freelist header */ + xfs_buf_t *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk) /* block came from a AGF btree */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + __be32 *blockp;/* pointer to array entry */ + int error; + int logflags; + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + __be32 *agfl_bno; + int startoff; + + agf = XFS_BUF_TO_AGF(agbp); + mp = tp->t_mountp; + + if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, + be32_to_cpu(agf->agf_seqno), &agflbp))) + return error; + be32_add_cpu(&agf->agf_fllast, 1); + if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) + agf->agf_fllast = 0; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, 1); + xfs_trans_agflist_delta(tp, 1); + pag->pagf_flcount++; + + logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, -1); + pag->pagf_btreeblks--; + logflags |= XFS_AGF_BTREEBLKS; + } + xfs_perag_put(pag); + + xfs_alloc_log_agf(tp, agbp, logflags); + + ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; + *blockp = cpu_to_be32(bno); + startoff = (char *)blockp - (char *)agflbp->b_addr; + + xfs_alloc_log_agf(tp, agbp, logflags); + + xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); + xfs_trans_log_buf(tp, agflbp, startoff, + startoff + sizeof(xfs_agblock_t) - 1); + return 0; +} + +static bool +xfs_agf_verify( + struct xfs_mount *mp, + struct xfs_buf *bp) + { + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + return false; + + if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + return false; + + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) + return false; + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) + return false; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) + return false; + + return true;; + +} + +static void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, + XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_agf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agf_verify(mp, bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); +} + +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_BUF_ */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + int error; + + trace_xfs_read_agf(mp, agno); + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); + if (error) + return error; + if (!*bpp) + return 0; + + ASSERT(!(*bpp)->b_error); + xfs_buf_set_ref(*bpp, XFS_AGF_REF); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + struct xfs_agf *agf; /* ag freelist header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + trace_xfs_alloc_read_agf(mp, agno); + + ASSERT(agno != NULLAGNUMBER); + error = xfs_read_agf(mp, tp, agno, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, + bpp); + if (error) + return error; + if (!*bpp) + return 0; + ASSERT(!(*bpp)->b_error); + + agf = XFS_BUF_TO_AGF(*bpp); + pag = xfs_perag_get(mp, agno); + if (!pag->pagf_init) { + pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); + pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); + pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); + pag->pagf_longest = be32_to_cpu(agf->agf_longest); + pag->pagf_levels[XFS_BTNUM_BNOi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); + pag->pagf_levels[XFS_BTNUM_CNTi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); + spin_lock_init(&pag->pagb_lock); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; + pag->pagf_init = 1; + } +#ifdef DEBUG + else if (!XFS_FORCED_SHUTDOWN(mp)) { + ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); + ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); + ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); + ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); + ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); + ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); + } +#endif + xfs_perag_put(pag); + return 0; +} + +/* + * Allocate an extent (variable-size). + * Depending on the allocation type, we either look in a single allocation + * group or loop over the allocation groups to find the result. + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agblock_t agsize; /* allocation group size */ + int error; + int flags; /* XFS_ALLOC_FLAG_... locking flags */ + xfs_extlen_t minleft;/* minimum left value, temp copy */ + xfs_mount_t *mp; /* mount structure pointer */ + xfs_agnumber_t sagno; /* starting allocation group number */ + xfs_alloctype_t type; /* input allocation type */ + int bump_rotor = 0; + int no_min = 0; + xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ + + mp = args->mp; + type = args->otype = args->type; + args->agbno = NULLAGBLOCK; + /* + * Just fix this up, for the case where the last a.g. is shorter + * (or there's only one a.g.) and the caller couldn't easily figure + * that out (xfs_bmap_alloc). + */ + agsize = mp->m_sb.sb_agblocks; + if (args->maxlen > agsize) + args->maxlen = agsize; + if (args->alignment == 0) + args->alignment = 1; + ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->minlen <= agsize); + ASSERT(args->mod < args->prod); + if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || + XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || + args->minlen > args->maxlen || args->minlen > agsize || + args->mod >= args->prod) { + args->fsbno = NULLFSBLOCK; + trace_xfs_alloc_vextent_badargs(args); + return 0; + } + minleft = args->minleft; + + switch (type) { + case XFS_ALLOCTYPE_THIS_AG: + case XFS_ALLOCTYPE_NEAR_BNO: + case XFS_ALLOCTYPE_THIS_BNO: + /* + * These three force us into a single a.g. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->pag = xfs_perag_get(mp, args->agno); + args->minleft = 0; + error = xfs_alloc_fix_freelist(args, 0); + args->minleft = minleft; + if (error) { + trace_xfs_alloc_vextent_nofix(args); + goto error0; + } + if (!args->agbp) { + trace_xfs_alloc_vextent_noagbp(args); + break; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + case XFS_ALLOCTYPE_START_BNO: + /* + * Try near allocation first, then anywhere-in-ag after + * the first a.g. fails. + */ + if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) && + (mp->m_flags & XFS_MOUNT_32BITINODES)) { + args->fsbno = XFS_AGB_TO_FSB(mp, + ((mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount), 0); + bump_rotor = 1; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + /* FALLTHROUGH */ + case XFS_ALLOCTYPE_ANY_AG: + case XFS_ALLOCTYPE_START_AG: + case XFS_ALLOCTYPE_FIRST_AG: + /* + * Rotate through the allocation groups looking for a winner. + */ + if (type == XFS_ALLOCTYPE_ANY_AG) { + /* + * Start with the last place we left off. + */ + args->agno = sagno = (mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount; + args->type = XFS_ALLOCTYPE_THIS_AG; + flags = XFS_ALLOC_FLAG_TRYLOCK; + } else if (type == XFS_ALLOCTYPE_FIRST_AG) { + /* + * Start with allocation group given by bno. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_THIS_AG; + sagno = 0; + flags = 0; + } else { + if (type == XFS_ALLOCTYPE_START_AG) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * Start with the given allocation group. + */ + args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); + flags = XFS_ALLOC_FLAG_TRYLOCK; + } + /* + * Loop over allocation groups twice; first time with + * trylock set, second time without. + */ + for (;;) { + args->pag = xfs_perag_get(mp, args->agno); + if (no_min) args->minleft = 0; + error = xfs_alloc_fix_freelist(args, flags); + args->minleft = minleft; + if (error) { + trace_xfs_alloc_vextent_nofix(args); + goto error0; + } + /* + * If we get a buffer back then the allocation will fly. + */ + if (args->agbp) { + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + } + + trace_xfs_alloc_vextent_loopfailed(args); + + /* + * Didn't work, figure out the next iteration. + */ + if (args->agno == sagno && + type == XFS_ALLOCTYPE_START_BNO) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * For the first allocation, we can try any AG to get + * space. However, if we already have allocated a + * block, we don't want to try AGs whose number is below + * sagno. Otherwise, we may end up with out-of-order + * locking of AGF, which might cause deadlock. + */ + if (++(args->agno) == mp->m_sb.sb_agcount) { + if (args->firstblock != NULLFSBLOCK) + args->agno = sagno; + else + args->agno = 0; + } + /* + * Reached the starting a.g., must either be done + * or switch to non-trylock mode. + */ + if (args->agno == sagno) { + if (no_min == 1) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_vextent_allfailed(args); + break; + } + if (flags == 0) { + no_min = 1; + } else { + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + } + } + } + xfs_perag_put(args->pag); + } + if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { + if (args->agno == sagno) + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + else + mp->m_agfrotor = (args->agno * rotorstep + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + if (args->agbno == NULLAGBLOCK) + args->fsbno = NULLFSBLOCK; + else { + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); +#ifdef DEBUG + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(args->agbno % args->alignment == 0); + XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), + args->len); +#endif + } + xfs_perag_put(args->pag); + return 0; +error0: + xfs_perag_put(args->pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_alloc_arg_t args; + int error; + + ASSERT(len != 0); + memset(&args, 0, sizeof(xfs_alloc_arg_t)); + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ + args.agno = XFS_FSB_TO_AGNO(args.mp, bno); + if (args.agno >= args.mp->m_sb.sb_agcount) + return -EFSCORRUPTED; + + args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); + if (args.agbno >= args.mp->m_sb.sb_agblocks) + return -EFSCORRUPTED; + + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); + + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) + goto error0; + + /* validate the extent size is legal now we have the agf locked */ + if (args.agbno + len > + be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { + error = -EFSCORRUPTED; + goto error0; + } + + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); + if (!error) + xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); +error0: + xfs_perag_put(args.pag); + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_alloc.h b/kernel/fs/xfs/libxfs/xfs_alloc.h new file mode 100644 index 000000000..d1b4b6a5c --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_alloc.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ALLOC_H__ +#define __XFS_ALLOC_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xfs_perag; +struct xfs_trans; + +extern struct workqueue_struct *xfs_alloc_wq; + +/* + * Freespace allocation types. Argument to xfs_alloc_[v]extent. + */ +#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ +#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ +#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ +#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ +#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ +#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ +#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ + +/* this should become an enum again when the tracing code is fixed */ +typedef unsigned int xfs_alloctype_t; + +#define XFS_ALLOC_TYPES \ + { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ + { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ + { XFS_ALLOCTYPE_START_AG, "START_AG" }, \ + { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ + { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ + { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ + { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" } + +/* + * Flags for xfs_alloc_fix_freelist. + */ +#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ +#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ + +/* + * In order to avoid ENOSPC-related deadlock caused by + * out-of-order locking of AGF buffer (PV 947395), we place + * constraints on the relationship among actual allocations for + * data blocks, freelist blocks, and potential file data bmap + * btree blocks. However, these restrictions may result in no + * actual space allocated for a delayed extent, for example, a data + * block in a certain AG is allocated but there is no additional + * block for the additional bmap btree block due to a split of the + * bmap btree of the file. The result of this may lead to an + * infinite loop in xfssyncd when the file gets flushed to disk and + * all delayed extents need to be actually allocated. To get around + * this, we explicitly set aside a few blocks which will not be + * reserved in delayed allocation. Considering the minimum number of + * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap + * btree requires 1 fsb, so we set the number of set-aside blocks + * to 4 + 4*agcount. + */ +#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) + +/* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks + * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +#define XFS_ALLOC_AG_MAX_USABLE(mp) \ + ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) + + +/* + * Argument structure for xfs_alloc routines. + * This is turned into a structure to avoid having 20 arguments passed + * down several levels of the stack. + */ +typedef struct xfs_alloc_arg { + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for a.g. freelist header */ + struct xfs_perag *pag; /* per-ag struct for this agno */ + xfs_fsblock_t fsbno; /* file system block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agblock_t agbno; /* allocation group-relative block # */ + xfs_extlen_t minlen; /* minimum size of extent */ + xfs_extlen_t maxlen; /* maximum size of extent */ + xfs_extlen_t mod; /* mod value for extent size */ + xfs_extlen_t prod; /* prod value for extent size */ + xfs_extlen_t minleft; /* min blocks must be left after us */ + xfs_extlen_t total; /* total blocks needed in xaction */ + xfs_extlen_t alignment; /* align answer to multiple of this */ + xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_extlen_t len; /* output: actual size of extent */ + xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ + xfs_alloctype_t otype; /* original allocation type */ + char wasdel; /* set if allocation was prev delayed */ + char wasfromfl; /* set if allocation is from freelist */ + char isfl; /* set if is freelist blocks - !acctg */ + char userdata; /* set if this is user data */ + xfs_fsblock_t firstblock; /* io first block allocated */ +} xfs_alloc_arg_t; + +/* + * Defines for userdata + */ +#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ + +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag); + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk); /* destination is a AGF btree */ + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* buffer for a.g. freelist header */ + int fields);/* mask of fields to be logged (XFS_AGF_...) */ + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags); /* XFS_ALLOC_FLAGS_... */ + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for a.g. freelist header */ + struct xfs_buf *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk); /* owner was a AGF btree */ + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp); /* buffer for the ag freelist header */ + +/* + * Allocate an extent (variable-size). + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args); /* allocation argument structure */ + +/* + * Free an extent. + */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len); /* length of extent */ + +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + +int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); + +#endif /* __XFS_ALLOC_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_alloc_btree.c b/kernel/fs/xfs/libxfs/xfs_alloc_btree.c new file mode 100644 index 000000000..59d521c09 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_alloc_btree.c @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_extent_busy.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" + + +STATIC struct xfs_btree_cur * +xfs_allocbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} + +STATIC void +xfs_allocbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); +} + +STATIC int +xfs_allocbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + int error; + xfs_agblock_t bno; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* Allocate the new block from the freelist. If we can't, give up. */ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +STATIC int +xfs_allocbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + if (error) + return error; + + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); + return 0; +} + +/* + * Update the longest extent in the AGF + */ +STATIC void +xfs_allocbt_update_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, + int reason) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag; + __be32 len; + int numrecs; + + ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); + + switch (reason) { + case LASTREC_UPDATE: + /* + * If this is the last leaf block and it's the last record, + * then update the size of the longest extent in the AG. + */ + if (ptr != xfs_btree_get_numrecs(block)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_INSREC: + if (be32_to_cpu(rec->alloc.ar_blockcount) <= + be32_to_cpu(agf->agf_longest)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_DELREC: + numrecs = xfs_btree_get_numrecs(block); + if (ptr <= numrecs) + return; + ASSERT(ptr == numrecs + 1); + + if (numrecs) { + xfs_alloc_rec_t *rrp; + + rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); + len = rrp->ar_blockcount; + } else { + len = 0; + } + + break; + default: + ASSERT(0); + return; + } + + agf->agf_longest = len; + pag = xfs_perag_get(cur->bc_mp, seqno); + pag->pagf_longest = be32_to_cpu(len); + xfs_perag_put(pag); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); +} + +STATIC int +xfs_allocbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mnr[level != 0]; +} + +STATIC int +xfs_allocbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mxr[level != 0]; +} + +STATIC void +xfs_allocbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(rec->alloc.ar_startblock != 0); + + key->alloc.ar_startblock = rec->alloc.ar_startblock; + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; +} + +STATIC void +xfs_allocbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->alloc.ar_startblock != 0); + + rec->alloc.ar_startblock = key->alloc.ar_startblock; + rec->alloc.ar_blockcount = key->alloc.ar_blockcount; +} + +STATIC void +xfs_allocbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + ASSERT(cur->bc_rec.a.ar_startblock != 0); + + rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); + rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); +} + +STATIC void +xfs_allocbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); + + ptr->s = agf->agf_roots[cur->bc_btnum]; +} + +STATIC __int64_t +xfs_allocbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; + + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return (__int64_t)be32_to_cpu(kp->ar_startblock) - + rec->ar_startblock; + } + + diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; + if (diff) + return diff; + + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; +} + +static bool +xfs_allocbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. + */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; + break; + case cpu_to_be32(XFS_ABTC_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; + break; + default: + return false; + } + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} + +static void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_allocbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_allocbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; + + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_allocbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); + } else { + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); + } +} + +STATIC int +xfs_allocbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); + } else { + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); + } +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_allocbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .alloc_block = xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_rec_from_key = xfs_allocbt_init_rec_from_key, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_allocbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_allocbt_keys_inorder, + .recs_inorder = xfs_allocbt_recs_inorder, +#endif +}; + +/* + * Allocate a new allocation btree cursor. + */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = btnum; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_allocbt_ops; + + if (btnum == XFS_BTNUM_CNT) { + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; + } else { + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + } + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + return cur; +} + +/* + * Calculate number of records in an alloc btree block. + */ +int +xfs_allocbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_ALLOC_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_alloc_rec_t); + return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); +} diff --git a/kernel/fs/xfs/libxfs/xfs_alloc_btree.h b/kernel/fs/xfs/libxfs/xfs_alloc_btree.h new file mode 100644 index 000000000..45e189e7e --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_alloc_btree.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ALLOC_BTREE_H__ +#define __XFS_ALLOC_BTREE_H__ + +/* + * Freespace on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size depends on a superblock flag. + */ +#define XFS_ALLOC_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_ALLOC_REC_ADDR(mp, block, index) \ + ((xfs_alloc_rec_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_alloc_rec_t)))) + +#define XFS_ALLOC_KEY_ADDR(mp, block, index) \ + ((xfs_alloc_key_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_alloc_key_t))) + +#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_alloc_ptr_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_alloc_key_t) + \ + ((index) - 1) * sizeof(xfs_alloc_ptr_t))) + +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, + xfs_agnumber_t, xfs_btnum_t); +extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); + +#endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_attr.c b/kernel/fs/xfs/libxfs/xfs_attr.c new file mode 100644 index 000000000..0a472fbe0 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr.c @@ -0,0 +1,1456 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" + +/* + * xfs_attr.c + * + * Provide the external interfaces to manage attribute lists. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute list fits inside the inode. + */ +STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); + +/* + * Internal routines when attribute list is one block. + */ +STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); + +/* + * Internal routines when attribute list is more than one block. + */ +STATIC int xfs_attr_node_get(xfs_da_args_t *args); +STATIC int xfs_attr_node_addname(xfs_da_args_t *args); +STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC int xfs_attr_fillstate(xfs_da_state_t *state); +STATIC int xfs_attr_refillstate(xfs_da_state_t *state); + + +STATIC int +xfs_attr_args_init( + struct xfs_da_args *args, + struct xfs_inode *dp, + const unsigned char *name, + int flags) +{ + + if (!name) + return -EINVAL; + + memset(args, 0, sizeof(*args)); + args->geo = dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->dp = dp; + args->flags = flags; + args->name = name; + args->namelen = strlen((const char *)name); + if (args->namelen >= MAXNAMELEN) + return -EFAULT; /* match IRIX behaviour */ + + args->hashval = xfs_da_hashname(args->name, args->namelen); + return 0; +} + +int +xfs_inode_hasattr( + struct xfs_inode *ip) +{ + if (!XFS_IFORK_Q(ip) || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) + return 0; + return 1; +} + +/*======================================================================== + * Overall external interface routines. + *========================================================================*/ + +int +xfs_attr_get( + struct xfs_inode *ip, + const unsigned char *name, + unsigned char *value, + int *valuelenp, + int flags) +{ + struct xfs_da_args args; + uint lock_mode; + int error; + + XFS_STATS_INC(xs_attr_get); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (!xfs_inode_hasattr(ip)) + return -ENOATTR; + + error = xfs_attr_args_init(&args, ip, name, flags); + if (error) + return error; + + args.value = value; + args.valuelen = *valuelenp; + + lock_mode = xfs_ilock_attr_map_shared(ip); + if (!xfs_inode_hasattr(ip)) + error = -ENOATTR; + else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + error = xfs_attr_shortform_getvalue(&args); + else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) + error = xfs_attr_leaf_get(&args); + else + error = xfs_attr_node_get(&args); + xfs_iunlock(ip, lock_mode); + + *valuelenp = args.valuelen; + return error == -EEXIST ? 0 : error; +} + +/* + * Calculate how many blocks we need for the new attribute, + */ +STATIC int +xfs_attr_calc_size( + struct xfs_da_args *args, + int *local) +{ + struct xfs_mount *mp = args->dp->i_mount; + int size; + int nblks; + + /* + * Determine space new attribute will use, and if it would be + * "local" or "remote" (note: local != inline). + */ + size = xfs_attr_leaf_newentsize(args, local); + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (*local) { + if (size > (args->geo->blksize / 2)) { + /* Double split possible */ + nblks *= 2; + } + } else { + /* + * Out of line attribute, cannot double split, but + * make room for the attribute value itself. + */ + uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen); + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } + + return nblks; +} + +int +xfs_attr_set( + struct xfs_inode *dp, + const unsigned char *name, + unsigned char *value, + int valuelen, + int flags) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + struct xfs_trans_res tres; + xfs_fsblock_t firstblock; + int rsvd = (flags & ATTR_ROOT) != 0; + int error, err2, committed, local; + + XFS_STATS_INC(xs_attr_set); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; + + args.value = value; + args.valuelen = valuelen; + args.firstblock = &firstblock; + args.flist = &flist; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + args.total = xfs_attr_calc_size(&args, &local); + + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; + + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + int sf_size = sizeof(xfs_attr_sf_hdr_t) + + XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); + + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; + } + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (rsvd) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(args.trans, &tres, args.total, 0); + if (error) { + xfs_trans_cancel(args.trans, 0); + return error; + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, + rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + + xfs_trans_ijoin(args.trans, dp, 0); + + /* + * If the attribute list is non-existent or a shortform list, + * upgrade it to a single-leaf-block attribute list. + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(&args); + + /* + * Try to add the attr to the attribute list in + * the inode. + */ + error = xfs_attr_shortform_addname(&args); + if (error != -ENOSPC) { + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). + */ + ASSERT(args.trans != NULL); + + /* + * If this is a synchronous mount, make sure that + * the transaction goes to disk before returning + * to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args.trans); + + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_trans_ichgtime(args.trans, dp, + XFS_ICHGTIME_CHG); + } + err2 = xfs_trans_commit(args.trans, + XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return error ? error : err2; + } + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. + */ + xfs_bmap_init(args.flist, args.firstblock); + error = xfs_attr_shortform_to_leaf(&args); + if (!error) { + error = xfs_bmap_finish(&args.trans, args.flist, + &committed); + } + if (error) { + ASSERT(committed); + args.trans = NULL; + xfs_bmap_cancel(&flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args.trans, dp, 0); + + /* + * Commit the leaf transformation. We'll need another (linked) + * transaction to add the new attribute to the leaf. + */ + + error = xfs_trans_roll(&args.trans, dp); + if (error) + goto out; + + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + error = xfs_attr_leaf_addname(&args); + else + error = xfs_attr_node_addname(&args); + if (error) + goto out; + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args.trans); + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return error; + +out: + if (args.trans) { + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ +int +xfs_attr_remove( + struct xfs_inode *dp, + const unsigned char *name, + int flags) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + xfs_fsblock_t firstblock; + int error; + + XFS_STATS_INC(xs_attr_remove); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + if (!xfs_inode_hasattr(dp)) + return -ENOATTR; + + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; + + args.firstblock = &firstblock; + args.flist = &flist; + + /* + * we have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail. + */ + args.op_flags = XFS_DA_OP_OKNOENT; + + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (flags & ATTR_ROOT) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0); + if (error) { + xfs_trans_cancel(args.trans, 0); + return error; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + /* + * No need to make quota reservations here. We expect to release some + * blocks not allocate in the common case. + */ + xfs_trans_ijoin(args.trans, dp, 0); + + if (!xfs_inode_hasattr(dp)) { + error = -ENOATTR; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(&args); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(&args); + } else { + error = xfs_attr_node_removename(&args); + } + + if (error) + goto out; + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args.trans); + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return error; + +out: + if (args.trans) { + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +/*======================================================================== + * External routines when attribute list is inside the inode + *========================================================================*/ + +/* + * Add a name to the shortform attribute list structure + * This is the external routine. + */ +STATIC int +xfs_attr_shortform_addname(xfs_da_args_t *args) +{ + int newsize, forkoff, retval; + + trace_xfs_attr_sf_addname(args); + + retval = xfs_attr_shortform_lookup(args); + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + return retval; + } else if (retval == -EEXIST) { + if (args->flags & ATTR_CREATE) + return retval; + retval = xfs_attr_shortform_remove(args); + ASSERT(retval == 0); + } + + if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || + args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return -ENOSPC; + + newsize = XFS_ATTR_SF_TOTSIZE(args->dp); + newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + + forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); + if (!forkoff) + return -ENOSPC; + + xfs_attr_shortform_add(args, forkoff); + return 0; +} + + +/*======================================================================== + * External routines when attribute list is one block + *========================================================================*/ + +/* + * Add a name to the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_addname(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + struct xfs_buf *bp; + int retval, error, committed, forkoff; + + trace_xfs_attr_leaf_addname(args); + + /* + * Read the (only) block in the attribute list in. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + /* + * Look up the given attribute in the leaf block. Figure out if + * the given flags produce an error or call for an atomic rename. + */ + retval = xfs_attr3_leaf_lookup_int(bp, args); + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + xfs_trans_brelse(args->trans, bp); + return retval; + } else if (retval == -EEXIST) { + if (args->flags & ATTR_CREATE) { /* pure create op */ + xfs_trans_brelse(args->trans, bp); + return retval; + } + + trace_xfs_attr_leaf_replace(args); + + /* save the attribute state for later removal*/ + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; + } + + /* + * Add the attribute to the leaf block, transitioning to a Btree + * if required. + */ + retval = xfs_attr3_leaf_add(bp, args); + if (retval == -ENOSPC) { + /* + * Promote the attribute list to the Btree format, then + * Commit that transaction so that the node_addname() call + * can manage its own transactions. + */ + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + /* + * Commit the current trans (including the inode) and start + * a new one. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return error; + + /* + * Fob the whole rest of the problem off on the Btree code. + */ + error = xfs_attr_node_addname(args); + return error; + } + + /* + * Commit the transaction that added the attr name so that + * later routines can manage their own transactions. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return error; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return error; + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->op_flags & XFS_DA_OP_RENAME) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + return error; + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + } + + /* + * Read in the block containing the "old" attr, then + * remove the "old" attr from that block (neat, huh!) + */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + -1, &bp); + if (error) + return error; + + xfs_attr3_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } + + /* + * Commit the remove and start the next trans in series. + */ + error = xfs_trans_roll(&args->trans, dp); + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr3_leaf_clearflag(args); + } + return error; +} + +/* + * Remove a name from the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_removename(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + struct xfs_buf *bp; + int error, committed, forkoff; + + trace_xfs_attr_leaf_removename(args); + + /* + * Remove the attribute. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + error = xfs_attr3_leaf_lookup_int(bp, args); + if (error == -ENOATTR) { + xfs_trans_brelse(args->trans, bp); + return error; + } + + xfs_attr3_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } + return 0; +} + +/* + * Look up a name in a leaf attribute list structure. + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_get(xfs_da_args_t *args) +{ + struct xfs_buf *bp; + int error; + + trace_xfs_attr_leaf_get(args); + + args->blkno = 0; + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + error = xfs_attr3_leaf_lookup_int(bp, args); + if (error != -EEXIST) { + xfs_trans_brelse(args->trans, bp); + return error; + } + error = xfs_attr3_leaf_getvalue(bp, args); + xfs_trans_brelse(args->trans, bp); + if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { + error = xfs_attr_rmtval_get(args); + } + return error; +} + +/*======================================================================== + * External routines when attribute list size > geo->blksize + *========================================================================*/ + +/* + * Add a name to a Btree-format attribute list. + * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). + * + * "Remote" attribute values confuse the issue and atomic rename operations + * add a whole extra layer of confusion on top of that. + */ +STATIC int +xfs_attr_node_addname(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + xfs_mount_t *mp; + int committed, retval, error; + + trace_xfs_attr_node_addname(args); + + /* + * Fill in bucket of arguments/results/context to carry around. + */ + dp = args->dp; + mp = dp->i_mount; +restart: + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + + /* + * Search to see if name already exists, and get back a pointer + * to where it should go. + */ + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + goto out; + } else if (retval == -EEXIST) { + if (args->flags & ATTR_CREATE) + goto out; + + trace_xfs_attr_node_replace(args); + + /* save the attribute state for later removal*/ + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; + } + + retval = xfs_attr3_leaf_add(blk->bp, state->args); + if (retval == -ENOSPC) { + if (state->path.active == 1) { + /* + * Its really a single leaf node, but it had + * out-of-line values so it looked like it *might* + * have been a b-tree. + */ + xfs_da_state_free(state); + state = NULL; + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + /* + * Commit the node conversion and start the next + * trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + goto restart; + } + + /* + * Split as many Btree elements as required. + * This code tracks the new and old attr's location + * in the index/blkno/rmtblkno/rmtblkcnt fields and + * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. + */ + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_split(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } else { + /* + * Addition succeeded, update Btree hashvals. + */ + xfs_da3_fixhashpath(state, &state->path); + } + + /* + * Kill the state structure, we're done with it and need to + * allow the buffers to come back later. + */ + xfs_da_state_free(state); + state = NULL; + + /* + * Commit the leaf addition or btree split and start the next + * trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return error; + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->op_flags & XFS_DA_OP_RENAME) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + } + + /* + * Re-find the "old" attribute entry after any split ops. + * The INCOMPLETE flag means that we will find the "old" + * attr, not the "new" one. + */ + args->flags |= XFS_ATTR_INCOMPLETE; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + state->inleaf = 0; + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } + + /* + * Commit and start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr3_leaf_clearflag(args); + if (error) + goto out; + } + retval = error = 0; + +out: + if (state) + xfs_da_state_free(state); + if (error) + return error; + return retval; +} + +/* + * Remove a name from a B-tree attribute list. + * + * This will involve walking down the Btree, and may involve joining + * leaf nodes and even joining intermediate nodes up to and including + * the root node (a special case of an intermediate node). + */ +STATIC int +xfs_attr_node_removename(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + struct xfs_buf *bp; + int retval, error, committed, forkoff; + + trace_xfs_attr_node_removename(args); + + /* + * Tie a string around our finger to remind us where we are. + */ + dp = args->dp; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = dp->i_mount; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da3_node_lookup_int(state, &retval); + if (error || (retval != -EEXIST)) { + if (error == 0) + error = retval; + goto out; + } + + /* + * If there is an out-of-line value, de-allocate the blocks. + * This is done before we remove the attribute so that we don't + * overflow the maximum size of a transaction and/or hit a deadlock. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if (args->rmtblkno > 0) { + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + goto out; + + /* + * Mark the attribute as INCOMPLETE, then bunmapi() the + * remote value. + */ + error = xfs_attr3_leaf_setflag(args); + if (error) + goto out; + error = xfs_attr_rmtval_remove(args); + if (error) + goto out; + + /* + * Refill the state structure with buffers, the prior calls + * released our buffers. + */ + error = xfs_attr_refillstate(state); + if (error) + goto out; + } + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + /* + * Commit the Btree join operation and start a new trans. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + } + + /* + * If the result is small enough, push it all into the inode. + */ + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + /* + * Have to get rid of the copy of this dabuf in the state. + */ + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); + if (error) + goto out; + + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } else + xfs_trans_brelse(args->trans, bp); + } + error = 0; + +out: + xfs_da_state_free(state); + return error; +} + +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +STATIC int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + trace_xfs_attr_fillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return 0; +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +STATIC int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + trace_xfs_attr_refillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + return 0; +} + +/* + * Look up a filename in a node attribute list. + * + * This routine gets called for any attribute fork that has more than one + * block, ie: both true Btree attr lists and for single-leaf-blocks with + * "remote" values taking up more blocks. + */ +STATIC int +xfs_attr_node_get(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + int error, retval; + int i; + + trace_xfs_attr_node_get(args); + + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da3_node_lookup_int(state, &retval); + if (error) { + retval = error; + } else if (retval == -EEXIST) { + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + /* + * Get the value, local or "remote" + */ + retval = xfs_attr3_leaf_getvalue(blk->bp, args); + if (!retval && (args->rmtblkno > 0) + && !(args->flags & ATTR_KERNOVAL)) { + retval = xfs_attr_rmtval_get(args); + } + } + + /* + * If not in a transaction, we have to release all the buffers. + */ + for (i = 0; i < state->path.active; i++) { + xfs_trans_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + + xfs_da_state_free(state); + return retval; +} diff --git a/kernel/fs/xfs/libxfs/xfs_attr_leaf.c b/kernel/fs/xfs/libxfs/xfs_attr_leaf.c new file mode 100644 index 000000000..e9d401ce9 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_leaf.c @@ -0,0 +1,2773 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" +#include "xfs_dir2.h" + + +/* + * xfs_attr_leaf.c + * + * Routines to implement leaf blocks of attributes as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, struct xfs_buf **bpp); +STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, int freemap_index); +STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_buf *leaf_buffer); +STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + struct xfs_attr3_icleaf_hdr *ichdr1, + xfs_da_state_blk_t *leaf_blk_2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); + +/* + * Utility routines. + */ +STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, + struct xfs_attr_leafblock *src_leaf, + struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, + struct xfs_attr_leafblock *dst_leaf, + struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, + int move_count); +STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); + +/* + * attr3 block 'firstused' conversion helpers. + * + * firstused refers to the offset of the first used byte of the nameval region + * of an attr leaf block. The region starts at the tail of the block and expands + * backwards towards the middle. As such, firstused is initialized to the block + * size for an empty leaf block and is reduced from there. + * + * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k. + * The in-core firstused field is 32-bit and thus supports the maximum fsb size. + * The on-disk field is only 16-bit, however, and overflows at 64k. Since this + * only occurs at exactly 64k, we use zero as a magic on-disk value to represent + * the attr block size. The following helpers manage the conversion between the + * in-core and on-disk formats. + */ + +static void +xfs_attr3_leaf_firstused_from_disk( + struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + hdr3 = (struct xfs_attr3_leaf_hdr *) from; + to->firstused = be16_to_cpu(hdr3->firstused); + } else { + to->firstused = be16_to_cpu(from->hdr.firstused); + } + + /* + * Convert from the magic fsb size value to actual blocksize. This + * should only occur for empty blocks when the block size overflows + * 16-bits. + */ + if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) { + ASSERT(!to->count && !to->usedbytes); + ASSERT(geo->blksize > USHRT_MAX); + to->firstused = geo->blksize; + } +} + +static void +xfs_attr3_leaf_firstused_to_disk( + struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + uint32_t firstused; + + /* magic value should only be seen on disk */ + ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF); + + /* + * Scale down the 32-bit in-core firstused value to the 16-bit on-disk + * value. This only overflows at the max supported value of 64k. Use the + * magic on-disk value to represent block size in this case. + */ + firstused = from->firstused; + if (firstused > USHRT_MAX) { + ASSERT(from->firstused == geo->blksize); + firstused = XFS_ATTR3_LEAF_NULLOFF; + } + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + hdr3 = (struct xfs_attr3_leaf_hdr *) to; + hdr3->firstused = cpu_to_be16(firstused); + } else { + to->hdr.firstused = cpu_to_be16(firstused); + } +} + +void +xfs_attr3_leaf_hdr_from_disk( + struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + int i; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->usedbytes = be16_to_cpu(hdr3->usedbytes); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); + to->holes = hdr3->holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base); + to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size); + } + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->usedbytes = be16_to_cpu(from->hdr.usedbytes); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); + to->holes = from->hdr.holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base); + to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size); + } +} + +void +xfs_attr3_leaf_hdr_to_disk( + struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + int i; + + ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || + from->magic == XFS_ATTR3_LEAF_MAGIC); + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->usedbytes = cpu_to_be16(from->usedbytes); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); + hdr3->holes = from->holes; + hdr3->pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base); + hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size); + } + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.usedbytes = cpu_to_be16(from->usedbytes); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); + to->hdr.holes = from->holes; + to->hdr.pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base); + to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size); + } +} + +static bool +xfs_attr3_leaf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr3_icleaf_hdr ichdr; + + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) + return false; + } + if (ichdr.count == 0) + return false; + + /* XXX: need to range check rest of attr header values */ + /* XXX: hash order check? */ + + return true; +} + +static void +xfs_attr3_leaf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_attr3_leaf_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF); +} + +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ +static void +xfs_attr3_leaf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_attr3_leaf_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .verify_read = xfs_attr3_leaf_read_verify, + .verify_write = xfs_attr3_leaf_write_verify, +}; + +int +xfs_attr3_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; +} + +/*======================================================================== + * Namespace helper routines + *========================================================================*/ + +/* + * If namespace bits don't match return 0. + * If all match then return 1. + */ +STATIC int +xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +{ + return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); +} + + +/*======================================================================== + * External routines when attribute fork size < XFS_LITINO(mp). + *========================================================================*/ + +/* + * Query whether the requested number of additional bytes of extended + * attribute space will be able to fit inline. + * + * Returns zero if not, else the di_forkoff fork offset to be used in the + * literal area for attribute data once the new bytes have been added. + * + * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value; + * special case for dev/uuid inodes, they have fixed size data forks. + */ +int +xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) +{ + int offset; + int minforkoff; /* lower limit on valid forkoff locations */ + int maxforkoff; /* upper limit on valid forkoff locations */ + int dsize; + xfs_mount_t *mp = dp->i_mount; + + /* rounded down */ + offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + return (offset >= minforkoff) ? minforkoff : 0; + case XFS_DINODE_FMT_UUID: + minforkoff = roundup(sizeof(uuid_t), 8) >> 3; + return (offset >= minforkoff) ? minforkoff : 0; + } + + /* + * If the requested numbers of bytes is smaller or equal to the + * current attribute fork size we can always proceed. + * + * Note that if_bytes in the data fork might actually be larger than + * the current data fork size is due to delalloc extents. In that + * case either the extent count will go down when they are converted + * to real extents, or the delalloc conversion will take care of the + * literal area rebalancing. + */ + if (bytes <= XFS_IFORK_ASIZE(dp)) + return dp->i_d.di_forkoff; + + /* + * For attr2 we can try to move the forkoff if there is space in the + * literal area, but for the old format we are done if there is no + * space in the fixed attribute fork. + */ + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) + return 0; + + dsize = dp->i_df.if_bytes; + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* + * If there is no attr fork and the data fork is extents, + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for + * the btree root. + */ + if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > + xfs_default_attroffset(dp)) + dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + break; + case XFS_DINODE_FMT_BTREE: + /* + * If we have a data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have + * plenty of room for attrs + */ + if (dp->i_d.di_forkoff) { + if (offset < dp->i_d.di_forkoff) + return 0; + return dp->i_d.di_forkoff; + } + dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); + break; + } + + /* + * A data fork btree root must have space for at least + * MINDBTPTRS key/ptr pairs if the data fork is small or empty. + */ + minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = roundup(minforkoff, 8) >> 3; + + /* attr fork btree root can have at least this many key/ptr pairs */ + maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = maxforkoff >> 3; /* rounded down */ + + if (offset >= maxforkoff) + return maxforkoff; + if (offset >= minforkoff) + return offset; + return 0; +} + +/* + * Switch on the ATTR2 superblock bit (implies also FEATURES2) + */ +STATIC void +xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) +{ + if ((mp->m_flags & XFS_MOUNT_ATTR2) && + !(xfs_sb_version_hasattr2(&mp->m_sb))) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr2(&mp->m_sb)) { + xfs_sb_version_addattr2(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + } else + spin_unlock(&mp->m_sb_lock); + } +} + +/* + * Create the initial contents of a shortform attribute list. + */ +void +xfs_attr_shortform_create(xfs_da_args_t *args) +{ + xfs_attr_sf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + trace_xfs_attr_sf_create(args); + + dp = args->dp; + ASSERT(dp != NULL); + ifp = dp->i_afp; + ASSERT(ifp != NULL); + ASSERT(ifp->if_bytes == 0); + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { + ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; + ifp->if_flags |= XFS_IFINLINE; + } else { + ASSERT(ifp->if_flags & XFS_IFINLINE); + } + xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); + hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; + hdr->count = 0; + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); +} + +/* + * Add a name/value pair to the shortform attribute list. + * Overflow from the inode has already been checked for. + */ +void +xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i, offset, size; + xfs_mount_t *mp; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + trace_xfs_attr_sf_add(args); + + dp = args->dp; + mp = dp->i_mount; + dp->i_d.di_forkoff = forkoff; + + ifp = dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { +#ifdef DEBUG + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + ASSERT(0); +#endif + } + + offset = (char *)sfe - (char *)sf; + size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + + sfe->namelen = args->namelen; + sfe->valuelen = args->valuelen; + sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + memcpy(sfe->nameval, args->name, args->namelen); + memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); + sf->hdr.count++; + be16_add_cpu(&sf->hdr.totsize, size); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + xfs_sbversion_add_attr2(mp, args->trans); +} + +/* + * After the last attribute is removed revert to original inode format, + * making all literal area available to the data fork once more. + */ +void +xfs_attr_fork_remove( + struct xfs_inode *ip, + struct xfs_trans *tp) +{ + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + ip->i_d.di_forkoff = 0; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_afp == NULL); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * Remove an attribute from the shortform attribute list structure. + */ +int +xfs_attr_shortform_remove(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int base, size=0, end, totsize, i; + xfs_mount_t *mp; + xfs_inode_t *dp; + + trace_xfs_attr_sf_remove(args); + + dp = args->dp; + mp = dp->i_mount; + base = sizeof(xfs_attr_sf_hdr_t); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + end = sf->hdr.count; + for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + base += size, i++) { + size = XFS_ATTR_SF_ENTSIZE(sfe); + if (sfe->namelen != args->namelen) + continue; + if (memcmp(sfe->nameval, args->name, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + break; + } + if (i == end) + return -ENOATTR; + + /* + * Fix up the attribute fork data, covering the hole + */ + end = base + size; + totsize = be16_to_cpu(sf->hdr.totsize); + if (end != totsize) + memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); + sf->hdr.count--; + be16_add_cpu(&sf->hdr.totsize, -size); + + /* + * Fix up the start offset of the attribute fork + */ + totsize -= size; + if (totsize == sizeof(xfs_attr_sf_hdr_t) && + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + !(args->op_flags & XFS_DA_OP_ADDNAME)) { + xfs_attr_fork_remove(dp, args->trans); + } else { + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); + ASSERT(dp->i_d.di_forkoff); + ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + (args->op_flags & XFS_DA_OP_ADDNAME) || + !(mp->m_flags & XFS_MOUNT_ATTR2) || + dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + xfs_trans_log_inode(args->trans, dp, + XFS_ILOG_CORE | XFS_ILOG_ADATA); + } + + xfs_sbversion_add_attr2(mp, args->trans); + + return 0; +} + +/* + * Look up a name in a shortform attribute list structure. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_lookup(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + xfs_ifork_t *ifp; + + trace_xfs_attr_sf_lookup(args); + + ifp = args->dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + return -EEXIST; + } + return -ENOATTR; +} + +/* + * Look up a name in a shortform attribute list structure. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_getvalue(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + + ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = sfe->valuelen; + return -EEXIST; + } + if (args->valuelen < sfe->valuelen) { + args->valuelen = sfe->valuelen; + return -ERANGE; + } + args->valuelen = sfe->valuelen; + memcpy(args->value, &sfe->nameval[args->namelen], + args->valuelen); + return -EEXIST; + } + return -ENOATTR; +} + +/* + * Convert from using the shortform to the leaf. + */ +int +xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_da_args_t nargs; + char *tmpbuffer; + int error, i, size; + xfs_dablk_t blkno; + struct xfs_buf *bp; + xfs_ifork_t *ifp; + + trace_xfs_attr_sf_to_leaf(args); + + dp = args->dp; + ifp = dp->i_afp; + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + size = be16_to_cpu(sf->hdr.totsize); + tmpbuffer = kmem_alloc(size, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memcpy(tmpbuffer, ifp->if_u1.if_data, size); + sf = (xfs_attr_shortform_t *)tmpbuffer; + + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); + + bp = NULL; + error = xfs_da_grow_inode(args, &blkno); + if (error) { + /* + * If we hit an IO error middle of the transaction inside + * grow_inode(), we may have inconsistent data. Bail out. + */ + if (error == -EIO) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + goto out; + } + + ASSERT(blkno == 0); + error = xfs_attr3_leaf_create(args, blkno, &bp); + if (error) { + error = xfs_da_shrink_inode(args, 0, bp); + bp = NULL; + if (error) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + goto out; + } + + memset((char *)&nargs, 0, sizeof(nargs)); + nargs.dp = dp; + nargs.geo = args->geo; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.op_flags = XFS_DA_OP_OKNOENT; + + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; i++) { + nargs.name = sfe->nameval; + nargs.namelen = sfe->namelen; + nargs.value = &sfe->nameval[nargs.namelen]; + nargs.valuelen = sfe->valuelen; + nargs.hashval = xfs_da_hashname(sfe->nameval, + sfe->namelen); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ + ASSERT(error == -ENOATTR); + error = xfs_attr3_leaf_add(bp, &nargs); + ASSERT(error != -ENOSPC); + if (error) + goto out; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + error = 0; + +out: + kmem_free(tmpbuffer); + return error; +} + +/* + * Check a leaf attribute block to see if all the entries would fit into + * a shortform attribute list. + */ +int +xfs_attr_shortform_allfit( + struct xfs_buf *bp, + struct xfs_inode *dp) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + xfs_attr_leaf_name_local_t *name_loc; + struct xfs_attr3_icleaf_hdr leafhdr; + int bytes; + int i; + struct xfs_mount *mp = bp->b_target->bt_mount; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + bytes = sizeof(struct xfs_attr_sf_hdr); + for (i = 0; i < leafhdr.count; entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!(entry->flags & XFS_ATTR_LOCAL)) + return 0; + name_loc = xfs_attr3_leaf_name_local(leaf, i); + if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return 0; + if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) + return 0; + bytes += sizeof(struct xfs_attr_sf_entry) - 1 + + name_loc->namelen + + be16_to_cpu(name_loc->valuelen); + } + if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (bytes == sizeof(struct xfs_attr_sf_hdr))) + return -1; + return xfs_attr_shortform_bytesfit(dp, bytes); +} + +/* + * Convert a leaf attribute list to shortform attribute list + */ +int +xfs_attr3_leaf_to_shortform( + struct xfs_buf *bp, + struct xfs_da_args *args, + int forkoff) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_da_args nargs; + struct xfs_inode *dp = args->dp; + char *tmpbuffer; + int error; + int i; + + trace_xfs_attr_leaf_to_sf(args); + + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + if (!tmpbuffer) + return -ENOMEM; + + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); + + leaf = (xfs_attr_leafblock_t *)tmpbuffer; + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + /* XXX (dgc): buffer is about to be marked stale - why zero it? */ + memset(bp->b_addr, 0, args->geo->blksize); + + /* + * Clean out the prior contents of the attribute list. + */ + error = xfs_da_shrink_inode(args, 0, bp); + if (error) + goto out; + + if (forkoff == -1) { + ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); + ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_remove(dp, args->trans); + goto out; + } + + xfs_attr_shortform_create(args); + + /* + * Copy the attributes + */ + memset((char *)&nargs, 0, sizeof(nargs)); + nargs.geo = args->geo; + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.op_flags = XFS_DA_OP_OKNOENT; + + for (i = 0; i < ichdr.count; entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!entry->nameidx) + continue; + ASSERT(entry->flags & XFS_ATTR_LOCAL); + name_loc = xfs_attr3_leaf_name_local(leaf, i); + nargs.name = name_loc->nameval; + nargs.namelen = name_loc->namelen; + nargs.value = &name_loc->nameval[nargs.namelen]; + nargs.valuelen = be16_to_cpu(name_loc->valuelen); + nargs.hashval = be32_to_cpu(entry->hashval); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); + xfs_attr_shortform_add(&nargs, forkoff); + } + error = 0; + +out: + kmem_free(tmpbuffer); + return error; +} + +/* + * Convert from using a single leaf to a root node and a leaf. + */ +int +xfs_attr3_leaf_to_node( + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr icleafhdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr icnodehdr; + struct xfs_da_intnode *node; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp1 = NULL; + struct xfs_buf *bp2 = NULL; + xfs_dablk_t blkno; + int error; + + trace_xfs_attr_leaf_to_node(args); + + error = xfs_da_grow_inode(args, &blkno); + if (error) + goto out; + error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1); + if (error) + goto out; + + error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK); + if (error) + goto out; + + /* copy leaf to new buffer, update identifiers */ + xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); + bp2->b_ops = bp1->b_ops; + memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; + hdr3->blkno = cpu_to_be64(bp2->b_bn); + } + xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); + + /* + * Set up the new root node. + */ + error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + if (error) + goto out; + node = bp1->b_addr; + dp->d_ops->node_hdr_from_disk(&icnodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + leaf = bp2->b_addr; + xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + + /* both on-disk, don't endian-flip twice */ + btree[0].hashval = entries[icleafhdr.count - 1].hashval; + btree[0].before = cpu_to_be32(blkno); + icnodehdr.count = 1; + dp->d_ops->node_hdr_to_disk(node, &icnodehdr); + xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1); + error = 0; +out: + return error; +} + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of a leaf attribute list + * or a leaf in a node attribute list. + */ +STATIC int +xfs_attr3_leaf_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + + trace_xfs_attr_leaf_create(args); + + error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return error; + bp->b_ops = &xfs_attr3_leaf_buf_ops; + xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); + leaf = bp->b_addr; + memset(leaf, 0, args->geo->blksize); + + memset(&ichdr, 0, sizeof(ichdr)); + ichdr.firstused = args->geo->blksize; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + ichdr.magic = XFS_ATTR3_LEAF_MAGIC; + + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); + } else { + ichdr.magic = XFS_ATTR_LEAF_MAGIC; + ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr); + } + ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; + + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); + + *bpp = bp; + return 0; +} + +/* + * Split the leaf node, rebalance, then add the new entry. + */ +int +xfs_attr3_leaf_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) +{ + xfs_dablk_t blkno; + int error; + + trace_xfs_attr_leaf_split(state->args); + + /* + * Allocate space for a new leaf node. + */ + ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return error; + error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); + if (error) + return error; + newblk->blkno = blkno; + newblk->magic = XFS_ATTR_LEAF_MAGIC; + + /* + * Rebalance the entries across the two leaves. + * NOTE: rebalance() currently depends on the 2nd block being empty. + */ + xfs_attr3_leaf_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); + if (error) + return error; + + /* + * Save info on "old" attribute for "atomic rename" ops, leaf_add() + * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the + * "new" attrs info. Will need the "old" info to remove it later. + * + * Insert the "new" entry in the correct block. + */ + if (state->inleaf) { + trace_xfs_attr_leaf_add_old(state->args); + error = xfs_attr3_leaf_add(oldblk->bp, state->args); + } else { + trace_xfs_attr_leaf_add_new(state->args); + error = xfs_attr3_leaf_add(newblk->bp, state->args); + } + + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); + return error; +} + +/* + * Add a name to the leaf attribute list structure. + */ +int +xfs_attr3_leaf_add( + struct xfs_buf *bp, + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + int tablesize; + int entsize; + int sum; + int tmp; + int i; + + trace_xfs_attr_leaf_add(args); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + ASSERT(args->index >= 0 && args->index <= ichdr.count); + entsize = xfs_attr_leaf_newentsize(args, NULL); + + /* + * Search through freemap for first-fit on new name length. + * (may need to figure in size of entry struct too) + */ + tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) { + if (tablesize > ichdr.firstused) { + sum += ichdr.freemap[i].size; + continue; + } + if (!ichdr.freemap[i].size) + continue; /* no space in this map */ + tmp = entsize; + if (ichdr.freemap[i].base < ichdr.firstused) + tmp += sizeof(xfs_attr_leaf_entry_t); + if (ichdr.freemap[i].size >= tmp) { + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + goto out_log_hdr; + } + sum += ichdr.freemap[i].size; + } + + /* + * If there are no holes in the address space of the block, + * and we don't have enough freespace, then compaction will do us + * no good and we should just give up. + */ + if (!ichdr.holes && sum < entsize) + return -ENOSPC; + + /* + * Compact the entries to coalesce free space. + * This may change the hdr->count via dropping INCOMPLETE entries. + */ + xfs_attr3_leaf_compact(args, &ichdr, bp); + + /* + * After compaction, the block is guaranteed to have only one + * free region, in freemap[0]. If it is not big enough, give up. + */ + if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { + tmp = -ENOSPC; + goto out_log_hdr; + } + + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); + +out_log_hdr: + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); + return tmp; +} + +/* + * Add a name to a leaf attribute list structure. + */ +STATIC int +xfs_attr3_leaf_add_work( + struct xfs_buf *bp, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, + int mapindex) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_mount *mp; + int tmp; + int i; + + trace_xfs_attr_leaf_add_work(args); + + leaf = bp->b_addr; + ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE); + ASSERT(args->index >= 0 && args->index <= ichdr->count); + + /* + * Force open some space in the entry array and fill it in. + */ + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (args->index < ichdr->count) { + tmp = ichdr->count - args->index; + tmp *= sizeof(xfs_attr_leaf_entry_t); + memmove(entry + 1, entry, tmp); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + } + ichdr->count++; + + /* + * Allocate space for the new string (at the end of the run). + */ + mp = args->trans->t_mountp; + ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize); + ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); + ASSERT(ichdr->freemap[mapindex].size >= + xfs_attr_leaf_newentsize(args, NULL)); + ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize); + ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); + + ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp); + + entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + + ichdr->freemap[mapindex].size); + entry->hashval = cpu_to_be32(args->hashval); + entry->flags = tmp ? XFS_ATTR_LOCAL : 0; + entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + if (args->op_flags & XFS_DA_OP_RENAME) { + entry->flags |= XFS_ATTR_INCOMPLETE; + if ((args->blkno2 == args->blkno) && + (args->index2 <= args->index)) { + args->index2++; + } + } + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + ASSERT((args->index == 0) || + (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); + ASSERT((args->index == ichdr->count - 1) || + (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); + + /* + * For "remote" attribute values, simply note that we need to + * allocate space for the "remote" value. We can't actually + * allocate the extents in this transaction, and we can't decide + * which blocks they should be as we might allocate more blocks + * as part of this transaction (a split operation for example). + */ + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); + name_loc->namelen = args->namelen; + name_loc->valuelen = cpu_to_be16(args->valuelen); + memcpy((char *)name_loc->nameval, args->name, args->namelen); + memcpy((char *)&name_loc->nameval[args->namelen], args->value, + be16_to_cpu(name_loc->valuelen)); + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + name_rmt->namelen = args->namelen; + memcpy((char *)name_rmt->name, args->name, args->namelen); + entry->flags |= XFS_ATTR_INCOMPLETE; + /* just in case */ + name_rmt->valuelen = 0; + name_rmt->valueblk = 0; + args->rmtblkno = 1; + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; + } + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), + xfs_attr_leaf_entsize(leaf, args->index))); + + /* + * Update the control info for this leaf node + */ + if (be16_to_cpu(entry->nameidx) < ichdr->firstused) + ichdr->firstused = be16_to_cpu(entry->nameidx); + + ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf)); + tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (ichdr->freemap[i].base == tmp) { + ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); + } + } + ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); + return 0; +} + +/* + * Garbage collect a leaf attribute list block by copying it to a new buffer. + */ +STATIC void +xfs_attr3_leaf_compact( + struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr_dst, + struct xfs_buf *bp) +{ + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; + struct xfs_trans *trans = args->trans; + char *tmpbuffer; + + trace_xfs_attr_leaf_compact(args); + + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); + memset(bp->b_addr, 0, args->geo->blksize); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; + + /* + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. + */ + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = args->geo->blksize; + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst); + + /* + * Copy all entry's in the same (sorted) order, + * but allocate name/value pairs packed and in sequence. + */ + xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0, + leaf_dst, ichdr_dst, 0, ichdr_src.count); + /* + * this logs the entire buffer, but the caller must write the header + * back to the buffer when it is finished modifying it. + */ + xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); + + kmem_free(tmpbuffer); +} + +/* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +static int +xfs_attr3_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_attr3_icleaf_hdr *leaf1hdr, + struct xfs_buf *leaf2_bp, + struct xfs_attr3_icleaf_hdr *leaf2hdr) +{ + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + + entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); + entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); + if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && + ((be32_to_cpu(entries2[0].hashval) < + be32_to_cpu(entries1[0].hashval)) || + (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < + be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { + return 1; + } + return 0; +} + +int +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) +{ + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_mount *mp = leaf1_bp->b_target->bt_mount; + + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); + return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); +} + +/* + * Redistribute the attribute list entries between two leaf nodes, + * taking into account the size of the new entry. + * + * NOTE: if new block is empty, then it will get the upper half of the + * old block. At present, all (one) callers pass in an empty second block. + * + * This code adjusts the args->index/blkno and args->index2/blkno2 fields + * to match what it is doing in splitting the attribute leaf block. Those + * values are used in "atomic rename" operations on attributes. Note that + * the "new" and "old" values can end up in different blocks. + */ +STATIC void +xfs_attr3_leaf_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) +{ + struct xfs_da_args *args; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + int count; + int totallen; + int max; + int space; + int swap; + + /* + * Set up environment. + */ + ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2); + ASSERT(ichdr2.count == 0); + args = state->args; + + trace_xfs_attr_leaf_rebalance(args); + + /* + * Check ordering of blocks, reverse if it makes things simpler. + * + * NOTE: Given that all (current) callers pass in an empty + * second block, this code should never set "swap". + */ + swap = 0; + if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) { + struct xfs_da_state_blk *tmp_blk; + struct xfs_attr3_icleaf_hdr tmp_ichdr; + + tmp_blk = blk1; + blk1 = blk2; + blk2 = tmp_blk; + + /* struct copies to swap them rather than reconverting */ + tmp_ichdr = ichdr1; + ichdr1 = ichdr2; + ichdr2 = tmp_ichdr; + + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; + swap = 1; + } + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. Then get + * the direction to copy and the number of elements to move. + * + * "inleaf" is true if the new entry should be inserted into blk1. + * If "swap" is also true, then reverse the sense of "inleaf". + */ + state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1, + blk2, &ichdr2, + &count, &totallen); + if (swap) + state->inleaf = !state->inleaf; + + /* + * Move any entries required from leaf to leaf: + */ + if (count < ichdr1.count) { + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count = ichdr1.count - count; + space = ichdr1.usedbytes - totallen; + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf2 is the destination, compact it if it looks tight. + */ + max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t); + if (space > max) + xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp); + + /* + * Move high entries from leaf1 to low end of leaf2. + */ + xfs_attr3_leaf_moveents(args, leaf1, &ichdr1, + ichdr1.count - count, leaf2, &ichdr2, 0, count); + + } else if (count > ichdr1.count) { + /* + * I assert that since all callers pass in an empty + * second buffer, this code should never execute. + */ + ASSERT(0); + + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count -= ichdr1.count; + space = totallen - ichdr1.usedbytes; + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf1 is the destination, compact it if it looks tight. + */ + max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t); + if (space > max) + xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp); + + /* + * Move low entries from leaf2 to high end of leaf1. + */ + xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count); + } + + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2); + xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); + + /* + * Copy out last hashval in each block for B-tree code. + */ + entries1 = xfs_attr3_leaf_entryp(leaf1); + entries2 = xfs_attr3_leaf_entryp(leaf2); + blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval); + + /* + * Adjust the expected index for insertion. + * NOTE: this code depends on the (current) situation that the + * second block was originally empty. + * + * If the insertion point moved to the 2nd block, we must adjust + * the index. We must also track the entry just following the + * new entry for use in an "atomic rename" operation, that entry + * is always the "old" entry and the "new" entry is what we are + * inserting. The index/blkno fields refer to the "old" entry, + * while the index2/blkno2 fields refer to the "new" entry. + */ + if (blk1->index > ichdr1.count) { + ASSERT(state->inleaf == 0); + blk2->index = blk1->index - ichdr1.count; + args->index = args->index2 = blk2->index; + args->blkno = args->blkno2 = blk2->blkno; + } else if (blk1->index == ichdr1.count) { + if (state->inleaf) { + args->index = blk1->index; + args->blkno = blk1->blkno; + args->index2 = 0; + args->blkno2 = blk2->blkno; + } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it overwise we corrupt the tree. + */ + blk2->index = blk1->index - ichdr1.count; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. + */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } + } + } else { + ASSERT(state->inleaf == 1); + args->index = args->index2 = blk1->index; + args->blkno = args->blkno2 = blk1->blkno; + } +} + +/* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + * GROT: Is this really necessary? With other than a 512 byte blocksize, + * GROT: there will always be enough room in either block for a new entry. + * GROT: Do a double-split for this case? + */ +STATIC int +xfs_attr3_leaf_figure_balance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_attr3_icleaf_hdr *ichdr1, + struct xfs_da_state_blk *blk2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *countarg, + int *usedbytesarg) +{ + struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr; + struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr; + struct xfs_attr_leaf_entry *entry; + int count; + int max; + int index; + int totallen = 0; + int half; + int lastdelta; + int foundit = 0; + int tmp; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + */ + max = ichdr1->count + ichdr2->count; + half = (max + 1) * sizeof(*entry); + half += ichdr1->usedbytes + ichdr2->usedbytes + + xfs_attr_leaf_newentsize(state->args, NULL); + half /= 2; + lastdelta = state->args->geo->blksize; + entry = xfs_attr3_leaf_entryp(leaf1); + for (count = index = 0; count < max; entry++, index++, count++) { + +#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) + /* + * The new entry is in the first block, account for it. + */ + if (count == blk1->index) { + tmp = totallen + sizeof(*entry) + + xfs_attr_leaf_newentsize(state->args, NULL); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; + foundit = 1; + } + + /* + * Wrap around into the second block if necessary. + */ + if (count == ichdr1->count) { + leaf1 = leaf2; + entry = xfs_attr3_leaf_entryp(leaf1); + index = 0; + } + + /* + * Figure out if next leaf entry would be too much. + */ + tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1, + index); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; +#undef XFS_ATTR_ABS + } + + /* + * Calculate the number of usedbytes that will end up in lower block. + * If new entry not in lower block, fix up the count. + */ + totallen -= count * sizeof(*entry); + if (foundit) { + totallen -= sizeof(*entry) + + xfs_attr_leaf_newentsize(state->args, NULL); + } + + *countarg = count; + *usedbytesarg = totallen; + return foundit; +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + * + * GROT: allow for INCOMPLETE entries in calculation. + */ +int +xfs_attr3_leaf_toosmall( + struct xfs_da_state *state, + int *action) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_da_state_blk *blk; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_buf *bp; + xfs_dablk_t blkno; + int bytes; + int forward; + int error; + int retval; + int i; + + trace_xfs_attr_leaf_toosmall(state->args); + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + leaf = blk->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf); + bytes = xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + ichdr.usedbytes; + if (bytes > (state->args->geo->blksize >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return 0; + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (ichdr.count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (ichdr.forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da3_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return error; + if (retval) { + *action = 0; + } else { + *action = 2; + } + return 0; + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink an attribute list over time. + */ + /* start with smaller blk num */ + forward = ichdr.forw < ichdr.back; + for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_attr3_icleaf_hdr ichdr2; + if (forward) + blkno = ichdr.forw; + else + blkno = ichdr.back; + if (blkno == 0) + continue; + error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, + blkno, -1, &bp); + if (error) + return error; + + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr); + + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2) - + ichdr.usedbytes - ichdr2.usedbytes - + ((ichdr.count + ichdr2.count) * + sizeof(xfs_attr_leaf_entry_t)) - + xfs_attr3_leaf_hdr_size(leaf); + + xfs_trans_brelse(state->args->trans, bp); + if (bytes >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return 0; + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da3_path_shift(state, &state->altpath, forward, + 0, &retval); + } else { + error = xfs_da3_path_shift(state, &state->path, forward, + 0, &retval); + } + if (error) + return error; + if (retval) { + *action = 0; + } else { + *action = 1; + } + return 0; +} + +/* + * Remove a name from the leaf attribute list structure. + * + * Return 1 if leaf is less than 37% full, 0 if >= 37% full. + * If two leaves are 37% full, when combined they will leave 25% free. + */ +int +xfs_attr3_leaf_remove( + struct xfs_buf *bp, + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + int before; + int after; + int smallest; + int entsize; + int tablesize; + int tmp; + int i; + + trace_xfs_attr_leaf_remove(args); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + + ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); + ASSERT(args->index >= 0 && args->index < ichdr.count); + ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + + xfs_attr3_leaf_hdr_size(leaf)); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); + + /* + * Scan through free region table: + * check for adjacency of free'd entry with an existing one, + * find smallest free region in case we need to replace it, + * adjust any map that borders the entry table, + */ + tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + tmp = ichdr.freemap[0].size; + before = after = -1; + smallest = XFS_ATTR_LEAF_MAPSIZE - 1; + entsize = xfs_attr_leaf_entsize(leaf, args->index); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + ASSERT(ichdr.freemap[i].base < args->geo->blksize); + ASSERT(ichdr.freemap[i].size < args->geo->blksize); + if (ichdr.freemap[i].base == tablesize) { + ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); + ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); + } + + if (ichdr.freemap[i].base + ichdr.freemap[i].size == + be16_to_cpu(entry->nameidx)) { + before = i; + } else if (ichdr.freemap[i].base == + (be16_to_cpu(entry->nameidx) + entsize)) { + after = i; + } else if (ichdr.freemap[i].size < tmp) { + tmp = ichdr.freemap[i].size; + smallest = i; + } + } + + /* + * Coalesce adjacent freemap regions, + * or replace the smallest region. + */ + if ((before >= 0) || (after >= 0)) { + if ((before >= 0) && (after >= 0)) { + ichdr.freemap[before].size += entsize; + ichdr.freemap[before].size += ichdr.freemap[after].size; + ichdr.freemap[after].base = 0; + ichdr.freemap[after].size = 0; + } else if (before >= 0) { + ichdr.freemap[before].size += entsize; + } else { + ichdr.freemap[after].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[after].size += entsize; + } + } else { + /* + * Replace smallest region (if it is smaller than free'd entry) + */ + if (ichdr.freemap[smallest].size < entsize) { + ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[smallest].size = entsize; + } + } + + /* + * Did we remove the first entry? + */ + if (be16_to_cpu(entry->nameidx) == ichdr.firstused) + smallest = 1; + else + smallest = 0; + + /* + * Compress the remaining entries and zero out the removed stuff. + */ + memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize); + ichdr.usedbytes -= entsize; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), + entsize)); + + tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t); + memmove(entry, entry + 1, tmp); + ichdr.count--; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t))); + + entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count]; + memset(entry, 0, sizeof(xfs_attr_leaf_entry_t)); + + /* + * If we removed the first entry, re-find the first used byte + * in the name area. Note that if the entry was the "firstused", + * then we don't have a "hole" in our block resulting from + * removing the name. + */ + if (smallest) { + tmp = args->geo->blksize; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = ichdr.count - 1; i >= 0; entry++, i--) { + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); + + if (be16_to_cpu(entry->nameidx) < tmp) + tmp = be16_to_cpu(entry->nameidx); + } + ichdr.firstused = tmp; + ASSERT(ichdr.firstused != 0); + } else { + ichdr.holes = 1; /* mark as needing compaction */ + } + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); + + /* + * Check if leaf is less than 50% full, caller may want to + * "join" the leaf with a sibling if so. + */ + tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t); + + return tmp < args->geo->magicpct; /* leaf is < 37% full */ +} + +/* + * Move all the attribute list entries from drop_leaf into save_leaf. + */ +void +xfs_attr3_leaf_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) +{ + struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr; + struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr; + struct xfs_attr3_icleaf_hdr drophdr; + struct xfs_attr3_icleaf_hdr savehdr; + struct xfs_attr_leaf_entry *entry; + + trace_xfs_attr_leaf_unbalance(state->args); + + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); + entry = xfs_attr3_leaf_entryp(drop_leaf); + + /* + * Save last hashval from dying block for later Btree fixup. + */ + drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval); + + /* + * Check if we need a temp buffer, or can we do it in place. + * Note that we don't check "leaf" for holes because we will + * always be dropping it, toosmall() decided that for us already. + */ + if (savehdr.holes == 0) { + /* + * dest leaf has no holes, so we add there. May need + * to make some room in the entry array. + */ + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + save_leaf, &savehdr, 0, + drophdr.count); + } else { + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + save_leaf, &savehdr, + savehdr.count, drophdr.count); + } + } else { + /* + * Destination has holes, so we make a temporary copy + * of the leaf and add them both to that. + */ + struct xfs_attr_leafblock *tmp_leaf; + struct xfs_attr3_icleaf_hdr tmphdr; + + tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); + + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + + memset(&tmphdr, 0, sizeof(tmphdr)); + tmphdr.magic = savehdr.magic; + tmphdr.forw = savehdr.forw; + tmphdr.back = savehdr.back; + tmphdr.firstused = state->args->geo->blksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr); + + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, 0, + drophdr.count); + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + savehdr.count); + } else { + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, 0, + savehdr.count); + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + drophdr.count); + } + memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); + savehdr = tmphdr; /* struct copy */ + kmem_free(tmp_leaf); + } + + xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); + xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, + state->args->geo->blksize - 1); + + /* + * Copy out last hashval in each block for B-tree code. + */ + entry = xfs_attr3_leaf_entryp(save_leaf); + save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Look up a name in a leaf attribute list structure. + * This is the internal routine, it uses the caller's buffer. + * + * Note that duplicate keys are allowed, but only check within the + * current leaf node. The Btree code must check in adjacent leaf nodes. + * + * Return in args->index the index into the entry[] array of either + * the found entry, or where the entry should have been (insert before + * that entry). + * + * Don't change the args->value unless we find the attribute. + */ +int +xfs_attr3_leaf_lookup_int( + struct xfs_buf *bp, + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + xfs_dahash_t hashval; + int probe; + int span; + + trace_xfs_attr_leaf_lookup(args); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + ASSERT(ichdr.count < args->geo->blksize / 8); + + /* + * Binary search. (note: small blocks will skip this loop) + */ + hashval = args->hashval; + probe = span = ichdr.count / 2; + for (entry = &entries[probe]; span > 4; entry = &entries[probe]) { + span /= 2; + if (be32_to_cpu(entry->hashval) < hashval) + probe += span; + else if (be32_to_cpu(entry->hashval) > hashval) + probe -= span; + else + break; + } + ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count)); + ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval); + + /* + * Since we may have duplicate hashval's, find the first matching + * hashval in the leaf. + */ + while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) { + entry--; + probe--; + } + while (probe < ichdr.count && + be32_to_cpu(entry->hashval) < hashval) { + entry++; + probe++; + } + if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { + args->index = probe; + return -ENOATTR; + } + + /* + * Duplicate keys may be present, so search all of them for a match. + */ + for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval); + entry++, probe++) { +/* + * GROT: Add code to remove incomplete entries. + */ + /* + * If we are looking for INCOMPLETE entries, show only those. + * If we are looking for complete entries, show only those. + */ + if ((args->flags & XFS_ATTR_INCOMPLETE) != + (entry->flags & XFS_ATTR_INCOMPLETE)) { + continue; + } + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, probe); + if (name_loc->namelen != args->namelen) + continue; + if (memcmp(args->name, name_loc->nameval, + args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, entry->flags)) + continue; + args->index = probe; + return -EEXIST; + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); + if (name_rmt->namelen != args->namelen) + continue; + if (memcmp(args->name, name_rmt->name, + args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, entry->flags)) + continue; + args->index = probe; + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->rmtvaluelen); + return -EEXIST; + } + } + args->index = probe; + return -ENOATTR; +} + +/* + * Get the value associated with an attribute name from a leaf attribute + * list structure. + */ +int +xfs_attr3_leaf_getvalue( + struct xfs_buf *bp, + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + int valuelen; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + ASSERT(ichdr.count < args->geo->blksize / 8); + ASSERT(args->index < ichdr.count); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); + ASSERT(name_loc->namelen == args->namelen); + ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); + valuelen = be16_to_cpu(name_loc->valuelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return 0; + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return -ERANGE; + } + args->valuelen = valuelen; + memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + args->rmtvaluelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = args->rmtvaluelen; + return 0; + } + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; + return -ERANGE; + } + args->valuelen = args->rmtvaluelen; + } + return 0; +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Move the indicated entries from one leaf to another. + * NOTE: this routine modifies both source and destination leaves. + */ +/*ARGSUSED*/ +STATIC void +xfs_attr3_leaf_moveents( + struct xfs_da_args *args, + struct xfs_attr_leafblock *leaf_s, + struct xfs_attr3_icleaf_hdr *ichdr_s, + int start_s, + struct xfs_attr_leafblock *leaf_d, + struct xfs_attr3_icleaf_hdr *ichdr_d, + int start_d, + int count) +{ + struct xfs_attr_leaf_entry *entry_s; + struct xfs_attr_leaf_entry *entry_d; + int desti; + int tmp; + int i; + + /* + * Check for nothing to do. + */ + if (count == 0) + return; + + /* + * Set up environment. + */ + ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || + ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); + ASSERT(ichdr_s->magic == ichdr_d->magic); + ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8); + ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + + xfs_attr3_leaf_hdr_size(leaf_s)); + ASSERT(ichdr_d->count < args->geo->blksize / 8); + ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + + xfs_attr3_leaf_hdr_size(leaf_d)); + + ASSERT(start_s < ichdr_s->count); + ASSERT(start_d <= ichdr_d->count); + ASSERT(count <= ichdr_s->count); + + + /* + * Move the entries in the destination leaf up to make a hole? + */ + if (start_d < ichdr_d->count) { + tmp = ichdr_d->count - start_d; + tmp *= sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count]; + memmove(entry_d, entry_s, tmp); + } + + /* + * Copy all entry's in the same (sorted) order, + * but allocate attribute info packed and in sequence. + */ + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; + desti = start_d; + for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { + ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused); + tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); +#ifdef GROT + /* + * Code to drop INCOMPLETE entries. Difficult to use as we + * may also need to change the insertion index. Code turned + * off for 6.2, should be revisited later. + */ + if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_s->count -= 1; + entry_d--; /* to compensate for ++ in loop hdr */ + desti--; + if ((start_s + i) < offset) + result++; /* insertion index adjustment */ + } else { +#endif /* GROT */ + ichdr_d->firstused -= tmp; + /* both on-disk, don't endian flip twice */ + entry_d->hashval = entry_s->hashval; + entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); + entry_d->flags = entry_s->flags; + ASSERT(be16_to_cpu(entry_d->nameidx) + tmp + <= args->geo->blksize); + memmove(xfs_attr3_leaf_name(leaf_d, desti), + xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); + ASSERT(be16_to_cpu(entry_s->nameidx) + tmp + <= args->geo->blksize); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_d->usedbytes += tmp; + ichdr_s->count -= 1; + ichdr_d->count += 1; + tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf_d); + ASSERT(ichdr_d->firstused >= tmp); +#ifdef GROT + } +#endif /* GROT */ + } + + /* + * Zero out the entries we just copied. + */ + if (start_s == ichdr_s->count) { + tmp = count * sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + ASSERT(((char *)entry_s + tmp) <= + ((char *)leaf_s + args->geo->blksize)); + memset(entry_s, 0, tmp); + } else { + /* + * Move the remaining entries down to fill the hole, + * then zero the entries at the top. + */ + tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count]; + entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + memmove(entry_d, entry_s, tmp); + + tmp = count * sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; + ASSERT(((char *)entry_s + tmp) <= + ((char *)leaf_s + args->geo->blksize)); + memset(entry_s, 0, tmp); + } + + /* + * Fill in the freemap information + */ + ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d); + ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t); + ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + ichdr_d->freemap[1].base = 0; + ichdr_d->freemap[2].base = 0; + ichdr_d->freemap[1].size = 0; + ichdr_d->freemap[2].size = 0; + ichdr_s->holes = 1; /* leaf may not be compact */ +} + +/* + * Pick up the last hashvalue from a leaf block. + */ +xfs_dahash_t +xfs_attr_leaf_lasthash( + struct xfs_buf *bp, + int *count) +{ + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_mount *mp = bp->b_target->bt_mount; + + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr); + entries = xfs_attr3_leaf_entryp(bp->b_addr); + if (count) + *count = ichdr.count; + if (!ichdr.count) + return 0; + return be32_to_cpu(entries[ichdr.count - 1].hashval); +} + +/* + * Calculate the number of bytes used to store the indicated attribute + * (whether local or remote only calculate bytes in this block). + */ +STATIC int +xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) +{ + struct xfs_attr_leaf_entry *entries; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int size; + + entries = xfs_attr3_leaf_entryp(leaf); + if (entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, index); + size = xfs_attr_leaf_entsize_local(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, index); + size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); + } + return size; +} + +/* + * Calculate the number of bytes that would be required to store the new + * attribute (whether local or remote only calculate bytes in this block). + * This routine decides as a side effect whether the attribute will be + * a "local" or a "remote" attribute. + */ +int +xfs_attr_leaf_newentsize( + struct xfs_da_args *args, + int *local) +{ + int size; + + size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen); + if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) { + if (local) + *local = 1; + return size; + } + if (local) + *local = 0; + return xfs_attr_leaf_entsize_remote(args->namelen); +} + + +/*======================================================================== + * Manage the INCOMPLETE flag in a leaf entry + *========================================================================*/ + +/* + * Clear the INCOMPLETE flag on an entry in a leaf block. + */ +int +xfs_attr3_leaf_clearflag( + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; + xfs_attr_leaf_name_local_t *name_loc; + int namelen; + char *name; +#endif /* DEBUG */ + + trace_xfs_attr_leaf_clearflag(args); + /* + * Set up the operation. + */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + leaf = bp->b_addr; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); + +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); + + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); + namelen = name_loc->namelen; + name = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + namelen = name_rmt->namelen; + name = (char *)name_rmt->name; + } + ASSERT(be32_to_cpu(entry->hashval) == args->hashval); + ASSERT(namelen == args->namelen); + ASSERT(memcmp(name, args->name, namelen) == 0); +#endif /* DEBUG */ + + entry->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + + if (args->rmtblkno) { + ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + name_rmt->valueblk = cpu_to_be32(args->rmtblkno); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + + /* + * Commit the flag value change and start the next trans in series. + */ + return xfs_trans_roll(&args->trans, args->dp); +} + +/* + * Set the INCOMPLETE flag on an entry in a leaf block. + */ +int +xfs_attr3_leaf_setflag( + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; +#endif + + trace_xfs_attr_leaf_setflag(args); + + /* + * Set up the operation. + */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + leaf = bp->b_addr; +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); +#endif + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + + ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); + entry->flags |= XFS_ATTR_INCOMPLETE; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + if ((entry->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + + /* + * Commit the flag value change and start the next trans in series. + */ + return xfs_trans_roll(&args->trans, args->dp); +} + +/* + * In a single transaction, clear the INCOMPLETE flag on the leaf entry + * given by args->blkno/index and set the INCOMPLETE flag on the leaf + * entry given by args->blkno2/index2. + * + * Note that they could be in different blocks, or in the same block. + */ +int +xfs_attr3_leaf_flipflags( + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr_leaf_entry *entry1; + struct xfs_attr_leaf_entry *entry2; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp1; + struct xfs_buf *bp2; + int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + xfs_attr_leaf_name_local_t *name_loc; + int namelen1, namelen2; + char *name1, *name2; +#endif /* DEBUG */ + + trace_xfs_attr_leaf_flipflags(args); + + /* + * Read the block containing the "old" attr + */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + if (error) + return error; + + /* + * Read the block containing the "new" attr, if it is different + */ + if (args->blkno2 != args->blkno) { + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, + -1, &bp2); + if (error) + return error; + } else { + bp2 = bp1; + } + + leaf1 = bp1->b_addr; + entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index]; + + leaf2 = bp2->b_addr; + entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; + +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1); + ASSERT(args->index < ichdr1.count); + ASSERT(args->index >= 0); + + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2); + ASSERT(args->index2 < ichdr2.count); + ASSERT(args->index2 >= 0); + + if (entry1->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf1, args->index); + namelen1 = name_loc->namelen; + name1 = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); + namelen1 = name_rmt->namelen; + name1 = (char *)name_rmt->name; + } + if (entry2->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2); + namelen2 = name_loc->namelen; + name2 = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); + namelen2 = name_rmt->namelen; + name2 = (char *)name_rmt->name; + } + ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval)); + ASSERT(namelen1 == namelen2); + ASSERT(memcmp(name1, name2, namelen1) == 0); +#endif /* DEBUG */ + + ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE); + ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); + + entry1->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_trans_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); + if (args->rmtblkno) { + ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); + name_rmt->valueblk = cpu_to_be32(args->rmtblkno); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); + xfs_trans_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); + } + + entry2->flags |= XFS_ATTR_INCOMPLETE; + xfs_trans_log_buf(args->trans, bp2, + XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); + if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_trans_log_buf(args->trans, bp2, + XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); + } + + /* + * Commit the flag value change and start the next trans in series. + */ + error = xfs_trans_roll(&args->trans, args->dp); + + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_attr_leaf.h b/kernel/fs/xfs/libxfs/xfs_attr_leaf.h new file mode 100644 index 000000000..882c8d338 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_leaf.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_LEAF_H__ +#define __XFS_ATTR_LEAF_H__ + +struct attrlist; +struct attrlist_cursor_kern; +struct xfs_attr_list_context; +struct xfs_da_args; +struct xfs_da_state; +struct xfs_da_state_blk; +struct xfs_inode; +struct xfs_trans; + +/* + * Used to keep a list of "remote value" extents when unlinking an inode. + */ +typedef struct xfs_attr_inactive_list { + xfs_dablk_t valueblk; /* block number of value bytes */ + int valuelen; /* number of bytes in value */ +} xfs_attr_inactive_list_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute fork size < XFS_LITINO(mp). + */ +void xfs_attr_shortform_create(struct xfs_da_args *args); +void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); +int xfs_attr_shortform_lookup(struct xfs_da_args *args); +int xfs_attr_shortform_getvalue(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_remove(struct xfs_da_args *args); +int xfs_attr_shortform_list(struct xfs_attr_list_context *context); +int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); +int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); +void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); + +/* + * Internal routines when attribute fork size == XFS_LBSIZE(mp). + */ +int xfs_attr3_leaf_to_node(struct xfs_da_args *args); +int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp, + struct xfs_da_args *args, int forkoff); +int xfs_attr3_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr3_leaf_setflag(struct xfs_da_args *args); +int xfs_attr3_leaf_flipflags(struct xfs_da_args *args); + +/* + * Routines used for growing the Btree. + */ +int xfs_attr3_leaf_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); +int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, + struct xfs_da_args *args); +int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, + struct xfs_attr_list_context *context); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); + +/* + * Utility routines. + */ +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); +int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); +int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); +void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from); +void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from); + +#endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_attr_remote.c b/kernel/fs/xfs/libxfs/xfs_attr_remote.c new file mode 100644 index 000000000..20de88d1b --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_remote.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" +#include "xfs_error.h" + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +/* + * Each contiguous block has a header, so it is not just a simple attribute + * length to FSB conversion. + */ +int +xfs_attr3_rmt_blocks( + struct xfs_mount *mp, + int attrlen) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_attr3_rmt_verify( + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + return false; + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) + return false; + if (be32_to_cpu(rmt->rm_offset) + + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) + return false; + if (rmt->rm_owner == 0) + return false; + + return true; +} + +static void +xfs_attr3_rmt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= blksize); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { + xfs_buf_ioerror(bp, -EFSBADCRC); + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + break; + } + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); + } + + if (bp->b_error) + xfs_verifier_error(bp); + else + ASSERT(len == 0); +} + +static void +xfs_attr3_rmt_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= blksize); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; + + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); + + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); + } + ASSERT(len == 0); +} + +const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .verify_read = xfs_attr3_rmt_read_verify, + .verify_write = xfs_attr3_rmt_write_verify, +}; + +STATIC int +xfs_attr3_rmt_hdr_set( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); + rmt->rm_offset = cpu_to_be32(offset); + rmt->rm_bytes = cpu_to_be32(size); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + rmt->rm_owner = cpu_to_be64(ino); + rmt->rm_blkno = cpu_to_be64(bno); + + return sizeof(struct xfs_attr3_rmt_hdr); +} + +/* + * Helper functions to copy attribute data in and out of the one disk extents + */ +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + __uint8_t **dst) +{ + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; + + ASSERT(len >= blksize); + + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + + byte_cnt = min(*valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return -EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= blksize; + src += blksize; + bno += BTOBB(blksize); + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + __uint8_t **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; + + ASSERT(len >= blksize); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < blksize) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == blksize); + memset(dst + hdr_size + byte_cnt, 0, + blksize - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= blksize; + dst += blksize; + bno += BTOBB(blksize); + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } +} + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +int +xfs_attr_rmtval_get( + struct xfs_da_args *args) +{ + struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_buf *bp; + xfs_dablk_t lblkno = args->rmtblkno; + __uint8_t *dst = args->value; + int valuelen; + int nmap; + int error; + int blkcnt = args->rmtblkcnt; + int i; + int offset = 0; + + trace_xfs_attr_rmtval_get(args); + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + + valuelen = args->rmtvaluelen; + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, dblkcnt, 0, &bp, + &xfs_attr3_rmt_buf_ops); + if (error) + return error; + + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); + xfs_buf_relse(bp); + if (error) + return error; + + /* roll attribute extent map forwards */ + lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + xfs_fileoff_t lfileoff = 0; + __uint8_t *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; + + trace_xfs_attr_rmtval_set(args); + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. Because CRC enable + * attributes have headers, we can't just do a straight byte to FSB + * conversion and have to take the header space into account. + */ + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) + return error; + + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + int committed; + + /* + * Allocate a single extent, up to the size of the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return error; + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + valuelen = args->rmtvaluelen; + while (valuelen > 0) { + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); + + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); + if (!bp) + return -ENOMEM; + bp->b_ops = &xfs_attr3_rmt_buf_ops; + + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); + + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; + + + /* roll attribute extent map forwards */ + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; + + trace_xfs_attr_rmtval_remove(args); + + /* + * Roll through the "value", invalidating the attribute value's blocks. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; + + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + int committed; + + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp, 0); + + /* + * Close out trans and start the next one in the chain. + */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return error; + } + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_attr_remote.h b/kernel/fs/xfs/libxfs/xfs_attr_remote.h new file mode 100644 index 000000000..5a9acfa15 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_remote.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_REMOTE_H__ +#define __XFS_ATTR_REMOTE_H__ + +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_rmtval_set(struct xfs_da_args *args); +int xfs_attr_rmtval_remove(struct xfs_da_args *args); + +#endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_attr_sf.h b/kernel/fs/xfs/libxfs/xfs_attr_sf.h new file mode 100644 index 000000000..919756e3b --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_sf.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_SF_H__ +#define __XFS_ATTR_SF_H__ + +/* + * Attribute storage when stored inside the inode. + * + * Small attribute lists are packed as tightly as possible so as + * to fit into the literal area of the inode. + */ + +/* + * Entries are packed toward the top as tight as possible. + */ +typedef struct xfs_attr_shortform { + struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + } hdr; + struct xfs_attr_sf_entry { + __uint8_t namelen; /* actual length of name (no NULL) */ + __uint8_t valuelen; /* actual length of value (no NULL) */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + __uint8_t nameval[1]; /* name & value bytes concatenated */ + } list[1]; /* variable sized array */ +} xfs_attr_shortform_t; +typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; +typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; + +/* + * We generate this then sort it, attr_list() must return things in hash-order. + */ +typedef struct xfs_attr_sf_sort { + __uint8_t entno; /* entry number in original list */ + __uint8_t namelen; /* length of name value (no null) */ + __uint8_t valuelen; /* length of value */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + xfs_dahash_t hash; /* this entry's hash value */ + unsigned char *name; /* name value, pointer into buffer */ +} xfs_attr_sf_sort_t; + +#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ + (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) +#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ + ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1) +#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ + ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) +#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ + ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) +#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ + (be16_to_cpu(((xfs_attr_shortform_t *) \ + ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) + +#endif /* __XFS_ATTR_SF_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_bit.h b/kernel/fs/xfs/libxfs/xfs_bit.h new file mode 100644 index 000000000..e1649c0d3 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_bit.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BIT_H__ +#define __XFS_BIT_H__ + +/* + * XFS bit manipulation routines. + */ + +/* + * masks with n high/low bits set, 64-bit values + */ +static inline __uint64_t xfs_mask64hi(int n) +{ + return (__uint64_t)-1 << (64 - (n)); +} +static inline __uint32_t xfs_mask32lo(int n) +{ + return ((__uint32_t)1 << (n)) - 1; +} +static inline __uint64_t xfs_mask64lo(int n) +{ + return ((__uint64_t)1 << (n)) - 1; +} + +/* Get high bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_highbit32(__uint32_t v) +{ + return fls(v) - 1; +} + +/* Get high bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_highbit64(__uint64_t v) +{ + return fls64(v) - 1; +} + +/* Get low bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_lowbit32(__uint32_t v) +{ + return ffs(v) - 1; +} + +/* Get low bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_lowbit64(__uint64_t v) +{ + __uint32_t w = (__uint32_t)v; + int n = 0; + + if (w) { /* lower bits */ + n = ffs(w); + } else { /* upper bits */ + w = (__uint32_t)(v >> 32); + if (w) { + n = ffs(w); + if (n) + n += 32; + } + } + return n - 1; +} + +/* Return whether bitmap is empty (1 == empty) */ +extern int xfs_bitmap_empty(uint *map, uint size); + +/* Count continuous one bits in map starting with start_bit */ +extern int xfs_contig_bits(uint *map, uint size, uint start_bit); + +/* Find next set bit in map */ +extern int xfs_next_bit(uint *map, uint size, uint start_bit); + +#endif /* __XFS_BIT_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_bmap.c b/kernel/fs/xfs/libxfs/xfs_bmap.c new file mode 100644 index 000000000..f1026e86d --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_bmap.c @@ -0,0 +1,5945 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_extfree_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_buf_item.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_attr_leaf.h" +#include "xfs_filestream.h" + + +kmem_zone_t *xfs_bmap_free_item_zone; + +/* + * Miscellaneous helper functions + */ + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. + */ +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ + + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. + */ + if (whichfork == XFS_DATA_FORK) { + maxleafents = MAXEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + } else { + maxleafents = MAXAEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} + +STATIC int /* error */ +xfs_bmbt_lookup_eq( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +STATIC int /* error */ +xfs_bmbt_lookup_ge( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Check if the inode needs to be converted to btree format. + */ +static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Check if the inode should be converted to extent format. + */ +static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Update the record referred to by cur to the value given + * by [off, bno, len, state]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_bmbt_update( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + xfs_exntst_t state) +{ + union xfs_btree_rec rec; + + xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state); + return xfs_btree_update(cur, &rec); +} + +/* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} + +/* + * Calculate the default attribute fork offset for newly created inodes. + */ +uint +xfs_default_attroffset( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint offset; + + if (mp->m_sb.sb_inodesize == 256) { + offset = XFS_LITINO(mp, ip->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + } else { + offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + } + + ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + return offset; +} + +/* + * Helper routine to reset inode di_forkoff field when switching + * attribute fork from local to extent format - we reset it where + * possible to make space available for inline data fork extents. + */ +STATIC void +xfs_bmap_forkoff_reset( + xfs_inode_t *ip, + int whichfork) +{ + if (whichfork == XFS_ATTR_FORK && + ip->i_d.di_format != XFS_DINODE_FMT_DEV && + ip->i_d.di_format != XFS_DINODE_FMT_UUID && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; + + if (dfl_forkoff > ip->i_d.di_forkoff) + ip->i_d.di_forkoff = dfl_forkoff; + } +} + +#ifdef DEBUG +STATIC struct xfs_buf * +xfs_bmap_get_bp( + struct xfs_btree_cur *cur, + xfs_fsblock_t bno) +{ + struct xfs_log_item_desc *lidp; + int i; + + if (!cur) + return NULL; + + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } + + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; + } + + return NULL; +} + +STATIC void +xfs_check_block( + struct xfs_btree_block *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + __be64 *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(be16_to_cpu(block->bb_level) > 0); + + prevp = NULL; + for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { + dmxr = mp->m_bmap_dmxr[0]; + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + + if (prevp) { + ASSERT(be64_to_cpu(prevp->br_startoff) < + be64_to_cpu(keyp->br_startoff)); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + + for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + if (*thispa == *pp) { + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); + panic("%s: ptrs are equal in node\n", + __func__); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. + */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep; /* pointer to current extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ + xfs_bmbt_rec_t *nextp; /* pointer to next extent */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLFSBLOCK); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + for (;;) { + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = xfs_btree_get_numrecs(block); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + + /* + * Check all the extents to make sure they are OK. + * If we had a previous block, the last entry should + * conform with the first entry in this one. + */ + + ep = XFS_BMBT_REC_ADDR(mp, block, 1); + if (i) { + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); + } + for (j = 1; j < num_recs; j++) { + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); + ep = nextp; + } + + last = *ep; + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + xfs_warn(mp, "%s: at error0", __func__); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + return; +} + +/* + * Add bmap trace insert entries for all the contents of the extent records. + */ +void +xfs_bmap_trace_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) +{ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); +} + +/* + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the caller's original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extend beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} + +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +/* + * bmap free list manipulation functions + */ + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + +/* + * Free up any items left in the list. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Inode fork format manipulation functions + */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + struct xfs_btree_block *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + __be64 *pp; /* ptr to block address */ + struct xfs_btree_block *rblock;/* root btree block */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(be16_to_cpu(rblock->bb_level) == 1); + ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) + return error; +#endif + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + return 0; +} + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. + */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *ablock; /* allocated (child) bt block */ + xfs_buf_t *abp; /* buffer for ablock */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_rec_t *arp; /* child record pointer */ + struct xfs_btree_block *block; /* btree root block */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t i, cnt; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_key_t *kp; /* root block key pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_ptr_t *pp; /* root block address pointer */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + + /* + * Make space in the inode incore. + */ + xfs_iroot_realloc(ip, 1, whichfork); + ifp->if_flags |= XFS_IFBROOT; + + /* + * Fill in the root. + */ + block = ifp->if_broot; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + /* + * Need a cursor. Can't allocate until bb_level is filled in. + */ + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + /* + * Convert to a btree with two levels, one record in root. + */ + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = mp; + args.firstblock = *firstblock; + if (*firstblock == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = *firstblock; + } + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = wasdel; + *logflagsp = 0; + if ((error = xfs_alloc_vextent(&args))) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + /* + * Allocation can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || + (flist->xbf_low && + args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + /* + * Fill in the child block. + */ + abp->b_ops = &xfs_bmbt_buf_ops; + ablock = XFS_BUF_TO_BLOCK(abp); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (cnt = i = 0; i < nextents; i++) { + ep = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { + arp->l0 = cpu_to_be64(ep->l0); + arp->l1 = cpu_to_be64(ep->l1); + arp++; cnt++; + } + } + ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + xfs_btree_set_numrecs(ablock, cnt); + + /* + * Fill in the root key and pointer. + */ + kp = XFS_BMBT_KEY_ADDR(mp, block, 1); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + be16_to_cpu(block->bb_level))); + *pp = cpu_to_be64(args.fsbno); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); + ASSERT(*curp == NULL); + *curp = cur; + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); + return 0; +} + +/* + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). + */ +void +xfs_bmap_local_to_extents_empty( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_bytes == 0); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + + xfs_bmap_forkoff_reset(ip, whichfork); + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); +} + + +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork, + void (*init_fn)(struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) +{ + int error = 0; + int flags; /* logging flags returned */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent block */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. + */ + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + + if (!ifp->if_bytes) { + xfs_bmap_local_to_extents_empty(ip, whichfork); + flags = XFS_ILOG_CORE; + goto done; + } + + flags = 0; + error = 0; + ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == + XFS_IFINLINE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = ip->i_mount; + args.firstblock = *firstblock; + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.minlen = args.maxlen = args.prod = 1; + error = xfs_alloc_vextent(&args); + if (error) + goto done; + + /* Can't fail, the space was reserved. */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + + /* + * Initialise the block and copy the data + * + * Note: init_fn must set the buffer log item type correctly! + */ + init_fn(tp, bp, ip, ifp); + + /* account for the change in fork size and log everything */ + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_bmap_local_to_extents_empty(ip, whichfork); + flags |= XFS_ILOG_CORE; + + xfs_iext_add(ifp, 0, 1); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, + _THIS_IP_); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + xfs_trans_mod_dquot_byino(tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= xfs_ilog_fext(whichfork); + +done: + *logflagsp = flags; + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle btree format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + int error; /* error return value */ + xfs_mount_t *mp; /* file system mount struct */ + int stat; /* newroot status */ + + mp = ip->i_mount; + if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) + *flags |= XFS_ILOG_DBROOT; + else { + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur->bc_private.b.flist = flist; + cur->bc_private.b.firstblock = *firstblock; + if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) + goto error0; + /* must be at least one entry */ + XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); + if ((error = xfs_btree_new_iroot(cur, flags, &stat))) + goto error0; + if (stat == 0) { + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return -ENOSPC; + } + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + } + return 0; +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + int error; /* error return value */ + + if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + return 0; + cur = NULL; + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, + flags, XFS_DATA_FORK); + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. Each + * different data fork content type needs a different callout to do the + * conversion. Some are basic and only require special block initialisation + * callouts for the data formating, others (directories) are so specialised they + * handle everything themselves. + * + * XXX (dgc): investigate whether directory conversion can use the generic + * formatting callout. It should be possible - it's just a very complex + * formatter. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_da_args_t dargs; /* args for dir/attr code */ + + if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + return 0; + + if (S_ISDIR(ip->i_d.di_mode)) { + memset(&dargs, 0, sizeof(dargs)); + dargs.geo = ip->i_mount->m_dir_geo; + dargs.dp = ip; + dargs.firstblock = firstblock; + dargs.flist = flist; + dargs.total = dargs.geo->fsbcount; + dargs.whichfork = XFS_DATA_FORK; + dargs.trans = tp; + return xfs_dir2_sf_to_block(&dargs); + } + + if (S_ISLNK(ip->i_d.di_mode)) + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, + flags, XFS_DATA_FORK, + xfs_symlink_local_to_remote); + + /* should only be called for types that support local format data */ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ +{ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent records */ + xfs_mount_t *mp; /* mount structure */ + xfs_trans_t *tp; /* transaction pointer */ + int blks; /* space reservation */ + int version = 1; /* superblock attr version */ + int committed; /* xaction was committed */ + int logflags; /* logging flags */ + int error; /* error return value */ + int cancel_flags = 0; + + ASSERT(XFS_IFORK_Q(ip) == 0); + + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) + goto trans_cancel; + cancel_flags |= XFS_TRANS_ABORT; + if (XFS_IFORK_Q(ip)) + goto trans_cancel; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. + */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if (mp->m_flags & XFS_MOUNT_ATTR2) + version = 2; + break; + default: + ASSERT(0); + error = -EINVAL; + goto trans_cancel; + } + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + xfs_bmap_init(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto bmap_cancel; + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + bool log_sb = false; + + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); + log_sb = true; + } + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); + log_sb = true; + } + spin_unlock(&mp->m_sb_lock); + if (log_sb) + xfs_log_sb(tp); + } + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +bmap_cancel: + xfs_bmap_cancel(&flist); +trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Internal and external extent tree search functions. + */ + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLFSBLOCK); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + if (level == 0) + break; + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_trans_brelse(tp, bp); + } + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent records. + */ + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + xfs_extnum_t start; + + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, (btree extents).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, ip->i_mount, block); + goto error0; + } + /* + * Read-ahead the next leaf block, if any. + */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1, + &xfs_bmbt_buf_ops); + /* + * Copy records into the extent records. + */ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + start = i; + for (j = 0; j < num_recs; j++, i++, frp++) { + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(ifp, + start, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + } + xfs_trans_brelse(tp, bp); + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + } + ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return -EFSCORRUPTED; +} + + +/* + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. + */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; + prevp->br_startoff = NULLFILEOFF; + + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int fork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, fork); + + ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); + + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). + */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block - 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int /* error */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return -EIO; + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; + } + /* + * Otherwise *last_block is already the right answer. + */ + return 0; +} + +int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); + if (error) + return error; + + if (is_empty) { + bma->aeof = 1; + return 0; + } + + /* + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. + */ + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int +xfs_bmap_last_offset( + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return -EIO; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) + return error; + + *last_block = rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); + return rval; +} + +/* + * Extent tree manipulation functions used during allocation. + */ + +/* + * Convert a delayed allocation to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + struct xfs_bmalloca *bma) +{ + struct xfs_bmbt_irec *new = &bma->got; + int diff; /* temp value */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + xfs_filblks_t temp=0; /* value for da_new calculations */ + xfs_filblks_t temp2=0;/* value for da_new calculations */ + int tmp_rval; /* partial logging flags */ + struct xfs_mount *mp; + + mp = bma->tp ? bma->tp->t_mountp : NULL; + ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] + + /* + * Set up a bunch of variables to make the tests simpler. + */ + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_get_all(ep, &PREV); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + da_old = startblockval(PREV.br_startblock); + da_new = 0; + + /* + * Set flags determining what part of the previous delayed allocation + * extent is being replaced by a real allocation. + */ + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (bma->idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == new->br_state && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); + + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + new->br_state == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + error = 0; + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The left and right neighbors are both contiguous with new. + */ + bma->idx--; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + bma->ip->i_d.di_nextents--; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_delete(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The left neighbor is contiguous, the right is not. + */ + bma->idx--; + + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + LEFT.br_blockcount + PREV.br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount, LEFT.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The right neighbor is contiguous, the left is not. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, PREV.br_startoff, + new->br_startblock, + PREV.br_blockcount + + RIGHT.br_blockcount, PREV.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: + /* + * Filling in all of a previously delayed allocation extent. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is contiguous. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (bma->cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + new->br_blockcount, + LEFT.br_state); + if (error) + goto done; + } + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx--; + break; + + case BMAP_LEFT_FILLING: + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is not contiguous. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, new_endoff); + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, + &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock) - + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx + 1); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + break; + + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is contiguous with the new allocation. + */ + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), + new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + RIGHT.br_state); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + if (bma->cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + RIGHT.br_blockcount, + RIGHT.br_state); + if (error) + goto done; + } + + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock)); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx++; + break; + + case BMAP_RIGHT_FILLING: + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is not contiguous. + */ + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, 1, + &tmp_rval, XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock) - + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx++; + break; + + case 0: + /* + * Filling in the middle part of a previous delayed allocation. + * Contiguity is impossible here. + * This case is avoided almost all the time. + * + * We start with a delayed allocation: + * + * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ + * PREV @ idx + * + * and we are allocating: + * +rrrrrrrrrrrrrrrrr+ + * new + * + * and we set it up for insertion as: + * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ + * new + * PREV @ idx LEFT RIGHT + * inserted at idx + 1 + */ + temp = new->br_startoff - PREV.br_startoff; + temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + LEFT = *new; + RIGHT.br_state = PREV.br_state; + RIGHT.br_startblock = nullstartblock( + (int)xfs_bmap_worst_indlen(bma->ip, temp2)); + RIGHT.br_startoff = new_endoff; + RIGHT.br_blockcount = temp2; + /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ + xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 1, &tmp_rval, XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); + diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + if (diff > 0) { + error = xfs_mod_fdblocks(bma->ip->i_mount, + -((int64_t)diff), false); + ASSERT(!error); + if (error) + goto done; + } + + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), + nullstartblock((int)temp2)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + + bma->idx++; + da_new = temp + temp2; + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + da_old > 0, &tmp_logflags, XFS_DATA_FORK); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* adjust for changes in reserved delayed indirect blocks */ + if (da_old || da_new) { + temp = da_new; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; + ASSERT(temp <= da_old); + if (temp < da_old) + xfs_mod_fdblocks(bma->ip->i_mount, + (int64_t)(da_old - temp), false); + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); +done: + bma->logflags |= rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +} + +/* + * Convert an unwritten allocation to a real allocation or vice versa. + */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_exntst_t newext; /* new extent state */ + xfs_exntst_t oldext; /* old extent state */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + struct xfs_mount *mp = tp->t_mountp; + + *logflagsp = 0; + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + + XFS_STATS_INC(xs_add_exlist); + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] + + /* + * Set up a bunch of variables to make the tests simpler. + */ + error = 0; + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &PREV); + newext = new->br_state; + oldext = (newext == XFS_EXT_UNWRITTEN) ? + XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + ASSERT(PREV.br_state == oldext); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == newext && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + newext == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. + */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); + ip->i_d.di_nextents -= 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount, + LEFT.br_state))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + new->br_blockcount, + LEFT.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); + xfs_bmbt_set_startoff(ep, new_endoff); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_insert(ip, *idx, 1, new, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + cur->bc_rec.b = *new; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, new, state); + + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + new->br_startoff - PREV.br_startoff); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + r[0] = *new; + r[1].br_startoff = new_endoff; + r[1].br_blockcount = + PREV.br_startoff + PREV.br_blockcount - new_endoff; + r[1].br_startblock = new->br_startblock + new->br_blockcount; + r[1].br_state = oldext; + + ++*idx; + xfs_iext_insert(ip, *idx, 2, &r[0], state); + + ip->i_d.di_nextents += 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + /* new right extent - oldext */ + if ((error = xfs_bmbt_update(cur, r[1].br_startoff, + r[1].br_startblock, r[1].br_blockcount, + r[1].br_state))) + goto done; + /* new left extent - oldext */ + cur->bc_rec.b = PREV; + cur->bc_rec.b.br_blockcount = + new->br_startoff - PREV.br_startoff; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. + */ + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + /* new middle extent - newext */ + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + + xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); +done: + *logflagsp |= rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +} + +/* + * Convert a hole to a delayed allocation. + */ +STATIC void +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_bmbt_irec_t *new) /* new data to add to file extents */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int state; /* state bits, accessed thru macros */ + xfs_filblks_t temp=0; /* temp for indirect calculations */ + + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + state = 0; + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. + */ + --*idx; + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + --*idx; + temp = left.br_blockcount + new->br_blockcount; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, + nullstartblock((int)newlen), temp, right.br_state); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iext_insert(ip, *idx, 1, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), + false); + /* + * Nothing to do for disk quota accounting here. + */ + } +} + +/* + * Convert a hole to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec *new = &bma->got; + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int rval=0; /* return value (logging flags) */ + int state; /* state bits, accessed thru macros */ + struct xfs_mount *mp; + + mp = bma->tp ? bma->tp->t_mountp : NULL; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + + state = 0; + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + /* + * Check and set flags if this segment has a left neighbor. + */ + if (bma->idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if this segment has a current value. + * Not true if we're inserting into the "hole" at eof. + */ + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * We're inserting a real allocation between "left" and "right". + * Set the contiguity flags. Don't let extents get too large. + */ + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_startblock + left.br_blockcount == new->br_startblock && + left.br_state == new->br_state && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_startblock + new->br_blockcount == right.br_startblock && + new->br_state == right.br_state && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + error = 0; + /* + * Select which case we're in here, and implement it. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with real allocations on the + * left and on the right. + * Merge all three into a single extent record. + */ + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + left.br_blockcount + new->br_blockcount + + right.br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); + if (bma->cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + right.br_startblock, right.br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_delete(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + + new->br_blockcount + + right.br_blockcount, + left.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a real allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + left.br_blockcount + new->br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + if (bma->cur == NULL) { + rval = xfs_ilog_fext(whichfork); + } else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + left.br_startblock, left.br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + + new->br_blockcount, + left.br_state); + if (error) + goto done; + } + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a real allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), + new->br_startoff, new->br_startblock, + new->br_blockcount + right.br_blockcount, + right.br_state); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + if (bma->cur == NULL) { + rval = xfs_ilog_fext(whichfork); + } else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, + right.br_startoff, + right.br_startblock, + right.br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + right.br_blockcount, + right.br_state); + if (error) + goto done; + } + break; + + case 0: + /* + * New allocation is not contiguous with another + * real allocation. + * Insert a new entry. + */ + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); + if (bma->cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, + new->br_startoff, + new->br_startblock, + new->br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + } + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 0, &tmp_logflags, whichfork); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); +done: + bma->logflags |= rval; + return error; +} + +/* + * Functions used in the extent read, allocate and remove paths + */ + +/* + * Adjust the size of the new extent based on di_extsize and rt extsize. + */ +int +xfs_bmap_extsize_align( + xfs_mount_t *mp, + xfs_bmbt_irec_t *gotp, /* next extent pointer */ + xfs_bmbt_irec_t *prevp, /* previous extent pointer */ + xfs_extlen_t extsz, /* align to this extent size */ + int rt, /* is this a realtime inode? */ + int eof, /* is extent at end-of-file? */ + int delay, /* creating delalloc extent? */ + int convert, /* overwriting unwritten extent? */ + xfs_fileoff_t *offp, /* in/out: aligned offset */ + xfs_extlen_t *lenp) /* in/out: aligned length */ +{ + xfs_fileoff_t orig_off; /* original offset */ + xfs_extlen_t orig_alen; /* original length */ + xfs_fileoff_t orig_end; /* original off+len */ + xfs_fileoff_t nexto; /* next file offset */ + xfs_fileoff_t prevo; /* previous file offset */ + xfs_fileoff_t align_off; /* temp for offset */ + xfs_extlen_t align_alen; /* temp for length */ + xfs_extlen_t temp; /* temp for calculations */ + + if (convert) + return 0; + + orig_off = align_off = *offp; + orig_alen = align_alen = *lenp; + orig_end = orig_off + orig_alen; + + /* + * If this request overlaps an existing extent, then don't + * attempt to perform any additional alignment. + */ + if (!delay && !eof && + (orig_off >= gotp->br_startoff) && + (orig_end <= gotp->br_startoff + gotp->br_blockcount)) { + return 0; + } + + /* + * If the file offset is unaligned vs. the extent size + * we need to align it. This will be possible unless + * the file was previously written with a kernel that didn't + * perform this alignment, or if a truncate shot us in the + * foot. + */ + temp = do_mod(orig_off, extsz); + if (temp) { + align_alen += temp; + align_off -= temp; + } + + /* Same adjustment for the end of the requested area. */ + temp = (align_alen % extsz); + if (temp) + align_alen += extsz - temp; + + /* + * For large extent hint sizes, the aligned extent might be larger than + * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls + * the length back under MAXEXTLEN. The outer allocation loops handle + * short allocation just fine, so it is safe to do this. We only want to + * do it when we are forced to, though, because it means more allocation + * operations are required. + */ + while (align_alen > MAXEXTLEN) + align_alen -= extsz; + ASSERT(align_alen <= MAXEXTLEN); + + /* + * If the previous block overlaps with this proposed allocation + * then move the start forward without adjusting the length. + */ + if (prevp->br_startoff != NULLFILEOFF) { + if (prevp->br_startblock == HOLESTARTBLOCK) + prevo = prevp->br_startoff; + else + prevo = prevp->br_startoff + prevp->br_blockcount; + } else + prevo = 0; + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + /* + * If the next block overlaps with this proposed allocation + * then move the start back without adjusting the length, + * but not before offset 0. + * This may of course make the start overlap previous block, + * and if we hit the offset 0 limit then the next block + * can still overlap too. + */ + if (!eof && gotp->br_startoff != NULLFILEOFF) { + if ((delay && gotp->br_startblock == HOLESTARTBLOCK) || + (!delay && gotp->br_startblock == DELAYSTARTBLOCK)) + nexto = gotp->br_startoff + gotp->br_blockcount; + else + nexto = gotp->br_startoff; + } else + nexto = NULLFILEOFF; + if (!eof && + align_off + align_alen != orig_end && + align_off + align_alen > nexto) + align_off = nexto > align_alen ? nexto - align_alen : 0; + /* + * If we're now overlapping the next or previous extent that + * means we can't fit an extsz piece in this hole. Just move + * the start forward to the first valid spot and set + * the length so we hit the end. + */ + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + if (align_off + align_alen != orig_end && + align_off + align_alen > nexto && + nexto != NULLFILEOFF) { + ASSERT(nexto > prevo); + align_alen = nexto - align_off; + } + + /* + * If realtime, and the result isn't a multiple of the realtime + * extent size we need to remove blocks until it is. + */ + if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + /* + * We're not covering the original request, or + * we won't be able to once we fix the length. + */ + if (orig_off < align_off || + orig_end > align_off + align_alen || + align_alen - temp < orig_alen) + return -EINVAL; + /* + * Try to fix it by moving the start up. + */ + if (align_off + temp <= orig_off) { + align_alen -= temp; + align_off += temp; + } + /* + * Try to fix it by moving the end in. + */ + else if (align_off + align_alen - temp >= orig_end) + align_alen -= temp; + /* + * Set the start to the minimum then trim the length. + */ + else { + align_alen -= orig_off - align_off; + align_off = orig_off; + align_alen -= align_alen % mp->m_sb.sb_rextsize; + } + /* + * Result doesn't cover the request, fail it. + */ + if (orig_off < align_off || orig_end > align_off + align_alen) + return -EINVAL; + } else { + ASSERT(orig_off >= align_off); + /* see MAXEXTLEN handling above */ + ASSERT(orig_end <= align_off + align_alen || + align_alen + extsz > MAXEXTLEN); + } + +#ifdef DEBUG + if (!eof && gotp->br_startoff != NULLFILEOFF) + ASSERT(align_off + align_alen <= gotp->br_startoff); + if (prevp->br_startoff != NULLFILEOFF) + ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount); +#endif + + *lenp = align_alen; + *offp = align_off; + return 0; +} + +#define XFS_ALLOC_GAP_UNITS 4 + +void +xfs_bmap_adjacent( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + xfs_fsblock_t adjust; /* adjustment to block numbers */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_mount_t *mp; /* mount point structure */ + int nullfb; /* true if ap->firstblock isn't set */ + int rt; /* true if inode is realtime */ + +#define ISVALID(x,y) \ + (rt ? \ + (x) < mp->m_sb.sb_rblocks : \ + XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) + + mp = ap->ip->i_mount; + nullfb = *ap->firstblock == NULLFSBLOCK; + rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); + /* + * If allocating at eof, and there's a previous real block, + * try to use its last block as our starting point. + */ + if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { + ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; + /* + * Adjust for the gap between prevp and us. + */ + adjust = ap->offset - + (ap->prev.br_startoff + ap->prev.br_blockcount); + if (adjust && + ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + ap->blkno += adjust; + } + /* + * If not at eof, then compare the two neighbor blocks. + * Figure out whether either one gives us a good starting point, + * and pick the better one. + */ + else if (!ap->eof) { + xfs_fsblock_t gotbno; /* right side block number */ + xfs_fsblock_t gotdiff=0; /* right side difference */ + xfs_fsblock_t prevbno; /* left side block number */ + xfs_fsblock_t prevdiff=0; /* left side difference */ + + /* + * If there's a previous (left) block, select a requested + * start block based on it. + */ + if (ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + (prevbno = ap->prev.br_startblock + + ap->prev.br_blockcount) && + ISVALID(prevbno, ap->prev.br_startblock)) { + /* + * Calculate gap to end of previous block. + */ + adjust = prevdiff = ap->offset - + (ap->prev.br_startoff + + ap->prev.br_blockcount); + /* + * Figure the startblock based on the previous block's + * end and the gap size. + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the end of the previous block. + */ + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && + ISVALID(prevbno + prevdiff, + ap->prev.br_startblock)) + prevbno += adjust; + else + prevdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno) + prevbno = NULLFSBLOCK; + } + /* + * No previous block or can't follow it, just default. + */ + else + prevbno = NULLFSBLOCK; + /* + * If there's a following (right) block, select a requested + * start block based on it. + */ + if (!isnullstartblock(ap->got.br_startblock)) { + /* + * Calculate gap to start of next block. + */ + adjust = gotdiff = ap->got.br_startoff - ap->offset; + /* + * Figure the startblock based on the next block's + * start and the gap size. + */ + gotbno = ap->got.br_startblock; + /* + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the start of the next block + * offset by our length. + */ + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && + ISVALID(gotbno - gotdiff, gotbno)) + gotbno -= adjust; + else if (ISVALID(gotbno - ap->length, gotbno)) { + gotbno -= ap->length; + gotdiff += adjust - ap->length; + } else + gotdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno) + gotbno = NULLFSBLOCK; + } + /* + * No next block, just default. + */ + else + gotbno = NULLFSBLOCK; + /* + * If both valid, pick the better one, else the only good + * one, else ap->blkno is already set (to 0 or the inode block). + */ + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; + else if (prevbno != NULLFSBLOCK) + ap->blkno = prevbno; + else if (gotbno != NULLFSBLOCK) + ap->blkno = gotbno; + } +#undef ISVALID +} + +static int +xfs_bmap_longest_free_extent( + struct xfs_trans *tp, + xfs_agnumber_t ag, + xfs_extlen_t *blen, + int *notinit) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + xfs_extlen_t longest; + int error = 0; + + pag = xfs_perag_get(mp, ag); + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + if (error) + goto out; + + if (!pag->pagf_init) { + *notinit = 1; + goto out; + } + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + +out: + xfs_perag_put(pag); + return error; +} + +static void +xfs_bmap_select_minlen( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen, + int notinit) +{ + if (notinit || *blen < ap->minlen) { + /* + * Since we did a BUF_TRYLOCK above, it is possible that + * there is space for this request. + */ + args->minlen = ap->minlen; + } else if (*blen < args->maxlen) { + /* + * If the best seen length is less than the request length, + * use the best as the minimum. + */ + args->minlen = *blen; + } else { + /* + * Otherwise we've seen an extent as big as maxlen, use that + * as the minimum. + */ + args->minlen = args->maxlen; + } +} + +STATIC int +xfs_bmap_btalloc_nullfb( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag, startag; + int notinit = 0; + int error; + + args->type = XFS_ALLOCTYPE_START_BNO; + args->total = ap->total; + + startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; + + while (*blen < args->maxlen) { + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + } + + xfs_bmap_select_minlen(ap, args, blen, notinit); + return 0; +} + +STATIC int +xfs_bmap_btalloc_filestreams( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag; + int notinit = 0; + int error; + + args->type = XFS_ALLOCTYPE_NEAR_BNO; + args->total = ap->total; + + ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (ag == NULLAGNUMBER) + ag = 0; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); + if (error) + return error; + + if (*blen < args->maxlen) { + error = xfs_filestream_new_ag(ap, &ag); + if (error) + return error; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + + } + + xfs_bmap_select_minlen(ap, args, blen, notinit); + + /* + * Set the failure fallback case to look in the selected AG as stream + * may have moved. + */ + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + return 0; +} + +STATIC int +xfs_bmap_btalloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + xfs_mount_t *mp; /* mount point structure */ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_agnumber_t ag; + xfs_alloc_arg_t args; + xfs_extlen_t blen; + xfs_extlen_t nextminlen = 0; + int nullfb; /* true if ap->firstblock isn't set */ + int isaligned; + int tryagain; + int error; + int stripe_align; + + ASSERT(ap->length); + + mp = ap->ip->i_mount; + + /* stripe alignment for allocation is determined by mount parameters */ + stripe_align = 0; + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + stripe_align = mp->m_swidth; + else if (mp->m_dalign) + stripe_align = mp->m_dalign; + + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; + if (unlikely(align)) { + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 0, ap->eof, 0, ap->conv, + &ap->offset, &ap->length); + ASSERT(!error); + ASSERT(ap->length); + } + + + nullfb = *ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); + if (nullfb) { + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + ag = xfs_filestream_lookup_ag(ap->ip); + ag = (ag != NULLAGNUMBER) ? ag : 0; + ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); + } else { + ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + } + } else + ap->blkno = *ap->firstblock; + + xfs_bmap_adjacent(ap); + + /* + * If allowed, use ap->blkno; otherwise must use firstblock since + * it's in the right allocation group. + */ + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) + ; + else + ap->blkno = *ap->firstblock; + /* + * Normal allocation, done through xfs_alloc_vextent. + */ + tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); + args.tp = ap->tp; + args.mp = mp; + args.fsbno = ap->blkno; + + /* Trim the allocation back to the maximum an AG can fit. */ + args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.firstblock = *ap->firstblock; + blen = 0; + if (nullfb) { + /* + * Search for an allocation group with a single extent large + * enough for the request. If one isn't found, then adjust + * the minimum allocation size to the largest space found. + */ + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); + else + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + if (error) + return error; + } else if (ap->flist->xbf_low) { + if (xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_START_BNO; + args.total = args.minlen = ap->minlen; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.total = ap->total; + args.minlen = ap->minlen; + } + /* apply extent size hints if obtained earlier */ + if (unlikely(align)) { + args.prod = align; + if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { + args.prod = 1; + args.mod = 0; + } else { + args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; + if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } + /* + * If we are not low on available data blocks, and the + * underlying logical volume manager is a stripe, and + * the file offset is zero then try to allocate data + * blocks on stripe unit boundary. + * NOTE: ap->aeof is only set if the allocation length + * is >= the stripe unit and the allocation offset is + * at the end of file. + */ + if (!ap->flist->xbf_low && ap->aeof) { + if (!ap->offset) { + args.alignment = stripe_align; + atype = args.type; + isaligned = 1; + /* + * Adjust for alignment + */ + if (blen > args.alignment && blen <= args.maxlen) + args.minlen = blen - args.alignment; + args.minalignslop = 0; + } else { + /* + * First try an exact bno allocation. + * If it fails then do a near or start bno + * allocation with alignment turned on. + */ + atype = args.type; + tryagain = 1; + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.alignment = 1; + /* + * Compute the minlen+alignment for the + * next case. Set slop so that the value + * of minlen+alignment+slop doesn't go up + * between the calls. + */ + if (blen > stripe_align && blen <= args.maxlen) + nextminlen = blen - stripe_align; + else + nextminlen = args.minlen; + if (nextminlen + stripe_align > args.minlen + 1) + args.minalignslop = + nextminlen + stripe_align - + args.minlen - 1; + else + args.minalignslop = 0; + } + } else { + args.alignment = 1; + args.minalignslop = 0; + } + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.isfl = 0; + args.userdata = ap->userdata; + if ((error = xfs_alloc_vextent(&args))) + return error; + if (tryagain && args.fsbno == NULLFSBLOCK) { + /* + * Exact allocation failed. Now try with alignment + * turned on. + */ + args.type = atype; + args.fsbno = ap->blkno; + args.alignment = stripe_align; + args.minlen = nextminlen; + args.minalignslop = 0; + isaligned = 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (isaligned && args.fsbno == NULLFSBLOCK) { + /* + * allocation failed, so turn off alignment and + * try again. + */ + args.type = atype; + args.fsbno = ap->blkno; + args.alignment = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb && + args.minlen > ap->minlen) { + args.minlen = ap->minlen; + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = ap->blkno; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb) { + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = ap->minlen; + args.minleft = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + ap->flist->xbf_low = 1; + } + if (args.fsbno != NULLFSBLOCK) { + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *ap->firstblock) == + XFS_FSB_TO_AGNO(mp, args.fsbno) || + (ap->flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *ap->firstblock) < + XFS_FSB_TO_AGNO(mp, args.fsbno))); + + ap->blkno = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; + ASSERT(nullfb || fb_agno == args.agno || + (ap->flist->xbf_low && fb_agno < args.agno)); + ap->length = args.len; + ap->ip->i_d.di_nblocks += args.len; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= args.len; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT, + (long) args.len); + } else { + ap->blkno = NULLFSBLOCK; + ap->length = 0; + } + return 0; +} + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + */ +STATIC int +xfs_bmap_alloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) + return xfs_bmap_rtalloc(ap); + return xfs_bmap_btalloc(ap); +} + +/* + * Trim the returned map to the required bounds + */ +STATIC void +xfs_bmapi_trim_map( + struct xfs_bmbt_irec *mval, + struct xfs_bmbt_irec *got, + xfs_fileoff_t *bno, + xfs_filblks_t len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int n, + int flags) +{ + if ((flags & XFS_BMAPI_ENTIRE) || + got->br_startoff + got->br_blockcount <= obno) { + *mval = *got; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + return; + } + + if (obno > *bno) + *bno = obno; + ASSERT((*bno >= obno) || (n == 0)); + ASSERT(*bno < end); + mval->br_startoff = *bno; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + else + mval->br_startblock = got->br_startblock + + (*bno - got->br_startoff); + /* + * Return the minimum of what we got and what we asked for for + * the length. We can use the len variable here because it is + * modified below and we could have been there before coming + * here if the first part of the allocation didn't overlap what + * was asked for. + */ + mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, + got->br_blockcount - (*bno - got->br_startoff)); + mval->br_state = got->br_state; + ASSERT(mval->br_blockcount <= len); + return; +} + +/* + * Update and validate the extent map to return + */ +STATIC void +xfs_bmapi_update_map( + struct xfs_bmbt_irec **map, + xfs_fileoff_t *bno, + xfs_filblks_t *len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int *n, + int flags) +{ + xfs_bmbt_irec_t *mval = *map; + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || + (mval->br_startoff < obno)); + + *bno = mval->br_startoff + mval->br_blockcount; + *len = end - *bno; + if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { + /* update previous map with new information */ + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == mval[-1].br_startblock + + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (*n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((*n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + (*n)++; + } + *map = mval; +} + +/* + * Map file blocks to filesystem blocks without allocation. + */ +int +xfs_bmapi_read( + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + struct xfs_bmbt_irec *mval, + int *nmap, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_fileoff_t obno; + xfs_fileoff_t end; + xfs_extnum_t lastx; + int error; + int eof; + int n = 0; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| + XFS_BMAPI_IGSTATE))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + XFS_STATS_INC(xs_blk_mapr); + + ifp = XFS_IFORK_PTR(ip, whichfork); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + /* Reading past eof, act as though there's a hole up to end. */ + if (eof) + got.br_startoff = end; + if (got.br_startoff > bno) { + /* Reading in a hole. */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + *nmap = n; + return 0; +} + +STATIC int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + xfs_fileoff_t aoff, + xfs_filblks_t len, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t alen; + xfs_extlen_t indlen; + char rt = XFS_IS_REALTIME_INODE(ip); + xfs_extlen_t extsz; + int error; + + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_frextents(mp, -((int64_t)extsz)); + } else { + error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); + } + + if (error) + goto out_unreserve_quota; + + error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); + if (error) + goto out_unreserve_blocks; + + + ip->i_delayed_blks += alen; + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, lastx, got); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay + * might have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + + ASSERT(got->br_startoff <= aoff); + ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); + ASSERT(isnullstartblock(got->br_startblock)); + ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + +out_unreserve_blocks: + if (rt) + xfs_mod_frextents(mp, extsz); + else + xfs_mod_fdblocks(mp, alen, false); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + return error; +} + +/* + * Map file blocks to filesystem blocks, adding delayed allocations as needed. + */ +int +xfs_bmapi_delay( + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + int flags) /* XFS_BMAPI_... */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_bmbt_irec got; /* current file extent record */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_fileoff_t end; /* end of mapped file region */ + xfs_extnum_t lastx; /* last useful extent number */ + int eof; /* we've hit the end of extents */ + int n = 0; /* current extent index */ + int error = 0; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + XFS_STATS_INC(xs_blk_mapw); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + if (eof || got.br_startoff > bno) { + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); + if (error) { + if (n == 0) { + *nmap = 0; + return error; + } + break; + } + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + prev = got; + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + + *nmap = n; + return 0; +} + + +static int +xfs_bmapi_allocate( + struct xfs_bmalloca *bma) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + ASSERT(bma->length > 0); + + /* + * For the wasdelay case, we could also just allocate the stuff asked + * for in this bmap call but that wouldn't be as good. + */ + if (bma->wasdel) { + bma->length = (xfs_extlen_t)bma->got.br_blockcount; + bma->offset = bma->got.br_startoff; + if (bma->idx != NULLEXTNUM && bma->idx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), + &bma->prev); + } + } else { + bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + if (!bma->eof) + bma->length = XFS_FILBLKS_MIN(bma->length, + bma->got.br_startoff - bma->offset); + } + + /* + * Indicate if this is the first user data in the file, or just any + * user data. + */ + if (!(bma->flags & XFS_BMAPI_METADATA)) { + bma->userdata = (bma->offset == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + } + + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + + /* + * Only want to do the alignment at the eof if it is userdata and + * allocation length is larger than a stripe unit. + */ + if (mp->m_dalign && bma->length >= mp->m_dalign && + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + + error = xfs_bmap_alloc(bma); + if (error) + return error; + + if (bma->flist->xbf_low) + bma->minleft = 0; + if (bma->cur) + bma->cur->bc_private.b.firstblock = *bma->firstblock; + if (bma->blkno == NULLFSBLOCK) + return 0; + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + bma->nallocs++; + + if (bma->cur) + bma->cur->bc_private.b.flags = + bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + + bma->got.br_startoff = bma->offset; + bma->got.br_startblock = bma->blkno; + bma->got.br_blockcount = bma->length; + bma->got.br_state = XFS_EXT_NORM; + + /* + * A wasdelay extent has been initialized, so shouldn't be flagged + * as unwritten. + */ + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->got.br_state = XFS_EXT_UNWRITTEN; + + if (bma->wasdel) + error = xfs_bmap_add_extent_delay_real(bma); + else + error = xfs_bmap_add_extent_hole_real(bma, whichfork); + + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_delay_real + * or xfs_bmap_add_extent_hole_real might have merged it into one of + * the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + ASSERT(bma->got.br_startoff <= bma->offset); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= + bma->offset + bma->length); + ASSERT(bma->got.br_state == XFS_EXT_NORM || + bma->got.br_state == XFS_EXT_UNWRITTEN); + return 0; +} + +STATIC int +xfs_bmapi_convert_unwritten( + struct xfs_bmalloca *bma, + struct xfs_bmbt_irec *mval, + xfs_filblks_t len, + int flags) +{ + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + /* check if we need to do unwritten->real conversion */ + if (mval->br_state == XFS_EXT_UNWRITTEN && + (flags & XFS_BMAPI_PREALLOC)) + return 0; + + /* check if we need to do real->unwritten conversion */ + if (mval->br_state == XFS_EXT_NORM && + (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, + &bma->cur, mval, bma->firstblock, bma->flist, + &tmp_logflags); + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that + * xfs_bmap_add_extent_unwritten_real might have merged it into one + * of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + /* + * We may have combined previously unwritten space with written space, + * so generate another request. + */ + if (mval->br_blockcount < len) + return -EAGAIN; + return 0; +} + +/* + * Map file blocks to filesystem blocks, and allocate blocks or convert the + * extent state if necessary. Details behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. + * + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* after the end of extents */ + int error; /* error return */ + int n; /* current extent index */ + xfs_fileoff_t obno; /* old block number (offset) */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + +#ifdef DEBUG + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + struct xfs_bmbt_irec *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ + + orig_bno = bno; + orig_len = len; + orig_flags = flags; + orig_mval = mval; + orig_nmap = *nmap; +#endif + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); + ASSERT(tp != NULL); + ASSERT(len > 0); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ifp = XFS_IFORK_PTR(ip, whichfork); + + XFS_STATS_INC(xs_blk_mapw); + + if (*firstblock == NULLFSBLOCK) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) + bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + else + bma.minleft = 1; + } else { + bma.minleft = 0; + } + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, + &bma.prev); + n = 0; + end = bno + len; + obno = bno; + + bma.tp = tp; + bma.ip = ip; + bma.total = total; + bma.userdata = 0; + bma.flist = flist; + bma.firstblock = firstblock; + + while (bno < end && n < *nmap) { + inhole = eof || bma.got.br_startoff > bno; + wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); + + /* + * First, deal with the hole before the allocated space + * that we found, if any. + */ + if (inhole || wasdelay) { + bma.eof = eof; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.offset = bno; + bma.flags = flags; + + /* + * There's a 32/64 bit type mismatch between the + * allocation length request (which can be 64 bits in + * length) and the bma length request, which is + * xfs_extlen_t and therefore 32 bits. Hence we have to + * check for 32-bit overflows and handle them here. + */ + if (len > (xfs_filblks_t)MAXEXTLEN) + bma.length = MAXEXTLEN; + else + bma.length = len; + + ASSERT(len > 0); + ASSERT(bma.length > 0); + error = xfs_bmapi_allocate(&bma); + if (error) + goto error0; + if (bma.blkno == NULLFSBLOCK) + break; + } + + /* Deal with the allocated space we found. */ + xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, + end, n, flags); + + /* Execute unwritten extent conversion if necessary */ + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); + if (error == -EAGAIN) + continue; + if (error) + goto error0; + + /* update the extent map to return */ + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* + * If we're done, stop now. Stop when we've allocated + * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise + * the transaction may get too big. + */ + if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) + break; + + /* Else go on to the next record. */ + bma.prev = bma.got; + if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), + &bma.got); + } else + eof = 1; + } + *nmap = n; + + /* + * Transform from btree to extents, give it cur. + */ + if (xfs_bmap_wants_extents(ip, whichfork)) { + int tmp_logflags = 0; + + ASSERT(bma.cur); + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, + &tmp_logflags, whichfork); + bma.logflags |= tmp_logflags; + if (error) + goto error0; + } + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork)); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. + */ + if ((bma.logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma.logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma.logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log whatever the flags say, even if error. Otherwise we might miss + * detecting a case where the data is changed, there's an error, + * and it's not logged so we don't shutdown when we should. + */ + if (bma.logflags) + xfs_trans_log_inode(tp, ip, bma.logflags); + + if (bma.cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, + bma.cur->bc_private.b.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, + bma.cur->bc_private.b.firstblock))); + *firstblock = bma.cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(bma.cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + if (!error) + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return error; +} + +/* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; + + XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. + */ + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + --*idx; + if (delay) + break; + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. + */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != -ENOSPC) + goto done; + /* + * If get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == -ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, + i == 1, done); + /* + * Update the btree record back + * to the original value. + */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. + */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = -ENOSPC; + goto done; + } + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); +done: + *logflagsp = flags; + return error; +} + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. + */ +int /* error */ +xfs_bunmapi( + xfs_trans_t *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* misc flags */ + xfs_extnum_t nexts, /* number of extents max */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done) /* set if not done yet */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_irec_t del; /* extent being deleted */ + int eof; /* is deleting at eof */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t extno; /* extent number in list */ + xfs_bmbt_irec_t got; /* current extent record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int isrt; /* freeing in rt area */ + xfs_extnum_t lastx; /* last extent index used */ + int logflags; /* transaction logging flags */ + xfs_extlen_t mod; /* rt extent offset */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_irec_t prev; /* previous extent record */ + xfs_fileoff_t start; /* first file offset deleted */ + int tmp_logflags; /* partial logging flags */ + int wasdel; /* was a delayed alloc extent */ + int whichfork; /* data or attribute fork */ + xfs_fsblock_t sum; + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ifp = XFS_IFORK_PTR(ip, whichfork); + if (unlikely( + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, + ip->i_mount); + return -EFSCORRUPTED; + } + mp = ip->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(len > 0); + ASSERT(nexts >= 0); + + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *done = 1; + return 0; + } + XFS_STATS_INC(xs_blk_unmap); + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + start = bno; + bno = start + len - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + + /* + * Check to see if the given block number is past the end of the + * file, back up to the last block if so... + */ + if (eof) { + ep = xfs_iext_get_ext(ifp, --lastx); + xfs_bmbt_get_all(ep, &got); + bno = got.br_startoff + got.br_blockcount - 1; + } + logflags = 0; + if (ifp->if_flags & XFS_IFBROOT) { + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else + cur = NULL; + + if (isrt) { + /* + * Synchronize by locking the bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + } + + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && + (nexts == 0 || extno < nexts)) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. + */ + if (got.br_startoff > bno) { + if (--lastx < 0) + break; + ep = xfs_iext_get_ext(ifp, lastx); + xfs_bmbt_get_all(ep, &got); + } + /* + * Is the last block of this extent before the range + * we're supposed to delete? If so, we're done. + */ + bno = XFS_FILEOFF_MIN(bno, + got.br_startoff + got.br_blockcount - 1); + if (bno < start) + break; + /* + * Then deal with the (possibly delayed) allocated space + * we found. + */ + ASSERT(ep != NULL); + del = got; + wasdel = isnullstartblock(del.br_startblock); + if (got.br_startoff < start) { + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; + if (!wasdel) + del.br_startblock += start - got.br_startoff; + } + if (del.br_startoff + del.br_blockcount > bno + 1) + del.br_blockcount = bno + 1 - del.br_startoff; + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent not lined up at the end. + * The extent could have been split into written + * and unwritten pieces, or we could just be + * unmapping part of it. But we can't really + * get rid of part of a realtime extent. + */ + if (del.br_state == XFS_EXT_UNWRITTEN || + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + /* + * This piece is unwritten, or we're not + * using unwritten extents. Skip over it. + */ + ASSERT(bno >= mod); + bno -= mod > del.br_blockcount ? + del.br_blockcount : mod; + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(xfs_iext_get_ext( + ifp, lastx), &got); + } + continue; + } + /* + * It's written, turn it unwritten. + * This is better than zeroing it. + */ + ASSERT(del.br_state == XFS_EXT_NORM); + ASSERT(xfs_trans_get_block_res(tp) > 0); + /* + * If this spans a realtime extent boundary, + * chop it back to the start of the one we end at. + */ + if (del.br_blockcount > mod) { + del.br_startoff += del.br_blockcount - mod; + del.br_startblock += del.br_blockcount - mod; + del.br_blockcount = mod; + } + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent_unwritten_real(tp, ip, + &lastx, &cur, &del, firstblock, flist, + &logflags); + if (error) + goto error0; + goto nodelete; + } + if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent is lined up at the end but not + * at the front. We'll get rid of full extents if + * we can. + */ + mod = mp->m_sb.sb_rextsize - mod; + if (del.br_blockcount > mod) { + del.br_blockcount -= mod; + del.br_startoff += mod; + del.br_startblock += mod; + } else if ((del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + xfs_trans_get_block_res(tp) == 0)) || + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + /* + * Can't make it unwritten. There isn't + * a full extent here so just skip it. + */ + ASSERT(bno >= del.br_blockcount); + bno -= del.br_blockcount; + if (got.br_startoff > bno) { + if (--lastx >= 0) { + ep = xfs_iext_get_ext(ifp, + lastx); + xfs_bmbt_get_all(ep, &got); + } + } + continue; + } else if (del.br_state == XFS_EXT_UNWRITTEN) { + /* + * This one is already unwritten. + * It must have a written left neighbor. + * Unwrite the killed part of that one and + * try again. + */ + ASSERT(lastx > 0); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, + lastx - 1), &prev); + ASSERT(prev.br_state == XFS_EXT_NORM); + ASSERT(!isnullstartblock(prev.br_startblock)); + ASSERT(del.br_startblock == + prev.br_startblock + prev.br_blockcount); + if (prev.br_startoff < start) { + mod = start - prev.br_startoff; + prev.br_blockcount -= mod; + prev.br_startblock += mod; + prev.br_startoff = start; + } + prev.br_state = XFS_EXT_UNWRITTEN; + lastx--; + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &prev, + firstblock, flist, &logflags); + if (error) + goto error0; + goto nodelete; + } else { + ASSERT(del.br_state == XFS_EXT_NORM); + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &del, + firstblock, flist, &logflags); + if (error) + goto error0; + goto nodelete; + } + } + if (wasdel) { + ASSERT(startblockval(del.br_startblock) > 0); + /* Update realtime/data freespace, unreserve quota */ + if (isrt) { + xfs_filblks_t rtexts; + + rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_frextents(mp, (int64_t)rtexts); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_RTBLKS); + } else { + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, + false); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_REGBLKS); + } + ip->i_delayed_blks -= del.br_blockcount; + if (cur) + cur->bc_private.b.flags |= + XFS_BTCUR_BPRV_WASDEL; + } else if (cur) + cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; + /* + * If it's the case where the directory code is running + * with no block reservation, and the deleted block is in + * the middle of its extent, and the resulting insert + * of an extent would cause transformation to btree format, + * then reject it. The calling code will then swap + * blocks around instead. + * We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction + * will be dirty. + */ + if (!wasdel && xfs_trans_get_block_res(tp) == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ + XFS_IFORK_MAXEXT(ip, whichfork) && + del.br_startoff > got.br_startoff && + del.br_startoff + del.br_blockcount < + got.br_startoff + got.br_blockcount) { + error = -ENOSPC; + goto error0; + } + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, + &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + bno = del.br_startoff - 1; +nodelete: + /* + * If not done go on to the next (previous) record. + */ + if (bno != (xfs_fileoff_t)-1 && bno >= start) { + if (lastx >= 0) { + ep = xfs_iext_get_ext(ifp, lastx); + if (xfs_bmbt_get_startoff(ep) > bno) { + if (--lastx >= 0) + ep = xfs_iext_get_ext(ifp, + lastx); + } + xfs_bmbt_get_all(ep, &got); + } + extno++; + } + } + *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + + /* + * Convert to a btree if necessary. + */ + if (xfs_bmap_needs_btree(ip, whichfork)) { + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from btree to extents, give it cur + */ + else if (xfs_bmap_wants_extents(ip, whichfork)) { + ASSERT(cur != NULL); + error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from extents to local? + */ + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. + */ + if ((logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log inode even in the error case, if the transaction + * is dirty we'll need to shut down the filesystem. + */ + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + if (!error) { + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Determine whether an extent shift can be accomplished by a merge with the + * extent that precedes the target hole of the shift. + */ +STATIC bool +xfs_bmse_can_merge( + struct xfs_bmbt_irec *left, /* preceding extent */ + struct xfs_bmbt_irec *got, /* current extent to shift */ + xfs_fileoff_t shift) /* shift fsb */ +{ + xfs_fileoff_t startoff; + + startoff = got->br_startoff - shift; + + /* + * The extent, once shifted, must be adjacent in-file and on-disk with + * the preceding extent. + */ + if ((left->br_startoff + left->br_blockcount != startoff) || + (left->br_startblock + left->br_blockcount != got->br_startblock) || + (left->br_state != got->br_state) || + (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + return false; + + return true; +} + +/* + * A bmap extent shift adjusts the file offset of an extent to fill a preceding + * hole in the file. If an extent shift would result in the extent being fully + * adjacent to the extent that currently precedes the hole, we can merge with + * the preceding extent rather than do the shift. + * + * This function assumes the caller has verified a shift-by-merge is possible + * with the provided extents via xfs_bmse_can_merge(). + */ +STATIC int +xfs_bmse_merge( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t shift, /* shift fsb */ + int current_ext, /* idx of gotp */ + struct xfs_bmbt_rec_host *gotp, /* extent to shift */ + struct xfs_bmbt_rec_host *leftp, /* preceding extent */ + struct xfs_btree_cur *cur, + int *logflags) /* output */ +{ + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + xfs_filblks_t blockcount; + int error, i; + struct xfs_mount *mp = ip->i_mount; + + xfs_bmbt_get_all(gotp, &got); + xfs_bmbt_get_all(leftp, &left); + blockcount = left.br_blockcount + got.br_blockcount; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_bmse_can_merge(&left, &got, shift)); + + /* + * Merge the in-core extents. Note that the host record pointers and + * current_ext index are invalid once the extent has been removed via + * xfs_iext_remove(). + */ + xfs_bmbt_set_blockcount(leftp, blockcount); + xfs_iext_remove(ip, current_ext, 1, 0); + + /* + * Update the on-disk extent count, the btree if necessary and log the + * inode. + */ + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + *logflags |= XFS_ILOG_CORE; + if (!cur) { + *logflags |= XFS_ILOG_DEXT; + return 0; + } + + /* lookup and remove the extent to merge */ + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + + error = xfs_btree_delete(cur, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + + /* lookup and update size of the previous extent */ + error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, + left.br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + + left.br_blockcount = blockcount; + + return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, + left.br_blockcount, left.br_state); +} + +/* + * Shift a single extent. + */ +STATIC int +xfs_bmse_shift_one( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t offset_shift_fsb, + int *current_ext, + struct xfs_bmbt_rec_host *gotp, + struct xfs_btree_cur *cur, + int *logflags, + enum shift_direction direction) +{ + struct xfs_ifork *ifp; + struct xfs_mount *mp; + xfs_fileoff_t startoff; + struct xfs_bmbt_rec_host *adj_irecp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec adj_irec; + int error; + int i; + int total_extents; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + + xfs_bmbt_get_all(gotp, &got); + + /* delalloc extents should be prevented by caller */ + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + + if (direction == SHIFT_LEFT) { + startoff = got.br_startoff - offset_shift_fsb; + + /* + * Check for merge if we've got an extent to the left, + * otherwise make sure there's enough room at the start + * of the file for the shift. + */ + if (!*current_ext) { + if (got.br_startoff < offset_shift_fsb) + return -EINVAL; + goto update_current_ext; + } + /* + * grab the left extent and check for a large + * enough hole. + */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); + + if (startoff < + adj_irec.br_startoff + adj_irec.br_blockcount) + return -EINVAL; + + /* check whether to merge the extent or shift it down */ + if (xfs_bmse_can_merge(&adj_irec, &got, + offset_shift_fsb)) { + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, gotp, adj_irecp, + cur, logflags); + } + } else { + startoff = got.br_startoff + offset_shift_fsb; + /* nothing to move if this is the last extent */ + if (*current_ext >= (total_extents - 1)) + goto update_current_ext; + /* + * If this is not the last extent in the file, make sure there + * is enough room between current extent and next extent for + * accommodating the shift. + */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); + if (startoff + got.br_blockcount > adj_irec.br_startoff) + return -EINVAL; + /* + * Unlike a left shift (which involves a hole punch), + * a right shift does not modify extent neighbors + * in any way. We should never find mergeable extents + * in this scenario. Check anyways and warn if we + * encounter two extents that could be one. + */ + if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + WARN_ON_ONCE(1); + } + /* + * Increment the extent index for the next iteration, update the start + * offset of the in-core extent and update the btree if applicable. + */ +update_current_ext: + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; + xfs_bmbt_set_startoff(gotp, startoff); + *logflags |= XFS_ILOG_CORE; + if (!cur) { + *logflags |= XFS_ILOG_DEXT; + return 0; + } + + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + + got.br_startoff = startoff; + return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, got.br_state); +} + +/* + * Shift extent records to the left/right to cover/create a hole. + * + * The maximum number of extents to be shifted in a single operation is + * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the + * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb + * is the length by which each extent is shifted. If there is no hole to shift + * the extents into, this will be considered invalid operation and we abort + * immediately. + */ +int +xfs_bmap_shift_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *next_fsb, + xfs_fileoff_t offset_shift_fsb, + int *done, + xfs_fileoff_t stop_fsb, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + enum shift_direction direction, + int num_exts) +{ + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_extnum_t nexts = 0; + xfs_extnum_t current_ext; + xfs_extnum_t total_extents; + xfs_extnum_t stop_extent; + int error = 0; + int whichfork = XFS_DATA_FORK; + int logflags = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_shift_extents", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } + + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot use the count of real extents here. + * Instead we have to calculate it from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (total_extents == 0) { + *done = 1; + goto del_cursor; + } + + /* + * In case of first right shift, we need to initialize next_fsb + */ + if (*next_fsb == NULLFSBLOCK) { + gotp = xfs_iext_get_ext(ifp, total_extents - 1); + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + if (stop_fsb > *next_fsb) { + *done = 1; + goto del_cursor; + } + } + + /* Lookup the extent index at which we have to stop */ + if (direction == SHIFT_RIGHT) { + gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + /* Make stop_extent exclusive of shift range */ + stop_extent--; + } else + stop_extent = total_extents; + + /* + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * gotp can be null in 2 cases: 1) if there are no extents or 2) + * *next_fsb lies in a hole beyond which there are no extents. Either + * way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); + if (!gotp) { + *done = 1; + goto del_cursor; + } + + /* some sanity checking before we finally start shifting extents */ + if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || + (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { + error = -EIO; + goto del_cursor; + } + + while (nexts++ < num_exts) { + error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, + ¤t_ext, gotp, cur, &logflags, + direction); + if (error) + goto del_cursor; + /* + * If there was an extent merge during the shift, the extent + * count can change. Update the total and grade the next record. + */ + if (direction == SHIFT_LEFT) { + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + stop_extent = total_extents; + } + + if (current_ext == stop_extent) { + *done = 1; + *next_fsb = NULLFSBLOCK; + break; + } + gotp = xfs_iext_get_ext(ifp, current_ext); + } + + if (!*done) { + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + } + +del_cursor: + if (cur) + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + + return error; +} + +/* + * Splits an extent into two extents at split_fsb block such that it is + * the first block of the current_ext. @current_ext is a target extent + * to be split. @split_fsb is a block where the extents is split. + * If split_fsb lies in a hole or the first block of extents, just return 0. + */ +STATIC int +xfs_bmap_split_extent_at( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t split_fsb, + xfs_fsblock_t *firstfsb, + struct xfs_bmap_free *free_list) +{ + int whichfork = XFS_DATA_FORK; + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec new; /* split extent */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_fsblock_t gotblkcnt; /* new block count for got */ + xfs_extnum_t current_ext; + int error = 0; + int logflags = 0; + int i = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_split_extent_at", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) split_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); + if (!gotp) + return 0; + + xfs_bmbt_get_all(gotp, &got); + + /* + * Check split_fsb lies in a hole or the start boundary offset + * of the extent. + */ + if (got.br_startoff >= split_fsb) + return 0; + + gotblkcnt = split_fsb - got.br_startoff; + new.br_startoff = split_fsb; + new.br_startblock = got.br_startblock + gotblkcnt; + new.br_blockcount = got.br_blockcount - gotblkcnt; + new.br_state = got.br_state; + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstfsb; + cur->bc_private.b.flist = free_list; + cur->bc_private.b.flags = 0; + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + xfs_bmbt_set_blockcount(gotp, gotblkcnt); + got.br_blockcount = gotblkcnt; + + logflags = XFS_ILOG_CORE; + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; + } else + logflags |= XFS_ILOG_DEXT; + + /* Add new extent */ + current_ext++; + xfs_iext_insert(ip, current_ext, 1, &new, 0); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); + cur->bc_rec.b.br_state = new.br_state; + + error = xfs_btree_insert(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + /* + * Convert to a btree if necessary. + */ + if (xfs_bmap_needs_btree(ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + } + +del_cursor: + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + return error; +} + +int +xfs_bmap_split_extent( + struct xfs_inode *ip, + xfs_fileoff_t split_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + int committed; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&free_list, &firstfsb); + + error = xfs_bmap_split_extent_at(tp, ip, split_fsb, + &firstfsb, &free_list); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_bmap.h b/kernel/fs/xfs/libxfs/xfs_bmap.h new file mode 100644 index 000000000..6aaa0c1c7 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_bmap.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_H__ +#define __XFS_BMAP_H__ + +struct getbmap; +struct xfs_bmbt_irec; +struct xfs_ifork; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +extern kmem_zone_t *xfs_bmap_free_item_zone; + +/* + * Argument structure for xfs_bmap_alloc. + */ +struct xfs_bmalloca { + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ + + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* minimum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + bool eof; /* set if allocating past last extent */ + bool wasdel; /* replacing a delayed allocation */ + bool userdata;/* set if is user data */ + bool aeof; /* allocated space at eof */ + bool conv; /* overwriting unwritten extents */ + int flags; +}; + +/* + * List of extents to be free "later". + * The list is kept sorted on xbf_startblock. + */ +typedef struct xfs_bmap_free_item +{ + xfs_fsblock_t xbfi_startblock;/* starting fs block number */ + xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ + struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ +} xfs_bmap_free_item_t; + +/* + * Header for free extent list. + * + * xbf_low is used by the allocator to activate the lowspace algorithm - + * when free space is running low the extent allocator may choose to + * allocate an extent from an AG without leaving sufficient space for + * a btree split when inserting the new extent. In this case the allocator + * will enable the lowspace algorithm which is supposed to allow further + * allocations (such as btree splits and newroots) to allocate from + * sequential AGs. In order to avoid locking AGs out of order the lowspace + * algorithm will start searching for free space from AG 0. If the correct + * transaction reservations have been made then this algorithm will eventually + * find all the space it needs. + */ +typedef struct xfs_bmap_free +{ + xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + int xbf_count; /* count of items on list */ + int xbf_low; /* alloc in low mode */ +} xfs_bmap_free_t; + +#define XFS_BMAP_MAX_NMAP 4 + +/* + * Flags for xfs_bmapi_* + */ +#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */ + /* combine contig. space */ +#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ +/* + * unwritten extent conversion - this needs write cache flushing and no additional + * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts + * from written to unwritten, otherwise convert from unwritten to written. + */ +#define XFS_BMAPI_CONVERT 0x040 + +#define XFS_BMAPI_FLAGS \ + { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ + { XFS_BMAPI_METADATA, "METADATA" }, \ + { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ + { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ + { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ + { XFS_BMAPI_CONTIG, "CONTIG" }, \ + { XFS_BMAPI_CONVERT, "CONVERT" } + + +static inline int xfs_bmapi_aflag(int w) +{ + return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); +} + +/* + * Special values for xfs_bmbt_irec_t br_startblock field. + */ +#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) +#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) + +static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) +{ + ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ + (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); +} + +/* + * Flags for xfs_bmap_add_extent*. + */ +#define BMAP_LEFT_CONTIG (1 << 0) +#define BMAP_RIGHT_CONTIG (1 << 1) +#define BMAP_LEFT_FILLING (1 << 2) +#define BMAP_RIGHT_FILLING (1 << 3) +#define BMAP_LEFT_DELAY (1 << 4) +#define BMAP_RIGHT_DELAY (1 << 5) +#define BMAP_LEFT_VALID (1 << 6) +#define BMAP_RIGHT_VALID (1 << 7) +#define BMAP_ATTRFORK (1 << 8) + +#define XFS_BMAP_EXT_FLAGS \ + { BMAP_LEFT_CONTIG, "LC" }, \ + { BMAP_RIGHT_CONTIG, "RC" }, \ + { BMAP_LEFT_FILLING, "LF" }, \ + { BMAP_RIGHT_FILLING, "RF" }, \ + { BMAP_ATTRFORK, "ATTR" } + + +/* + * This macro is used to determine how many extents will be shifted + * in one write transaction. We could require two splits, + * an extent move on the first and an extent merge on the second, + * So it is proper that one extent is shifted inside write transaction + * at a time. + */ +#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 + +enum shift_direction { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +#ifdef DEBUG +void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, + int whichfork, unsigned long caller_ip); +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ + xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) +#else +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) +#endif + +int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); +void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_cancel(struct xfs_bmap_free *flist); +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); +int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *last_block, int whichfork); +int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, + int whichfork); +int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); +int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork); +int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fsblock_t *firstblock, xfs_extlen_t total, + struct xfs_bmbt_irec *mval, int *nmap, + struct xfs_bmap_free *flist); +int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int *done); +int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, + xfs_extnum_t num); +uint xfs_default_attroffset(struct xfs_inode *ip); +int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, + int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, enum shift_direction direction, + int num_exts); +int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); + +#endif /* __XFS_BMAP_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_bmap_btree.c b/kernel/fs/xfs/libxfs/xfs_bmap_btree.c new file mode 100644 index 000000000..2c44c8e50 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_bmap_btree.c @@ -0,0 +1,883 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" + +/* + * Determine the extent state. + */ +/* ARGSUSED */ +STATIC xfs_exntst_t +xfs_extent_state( + xfs_filblks_t blks, + int extent_flag) +{ + if (extent_flag) { + ASSERT(blks != 0); /* saved for DMIG */ + return XFS_EXT_UNWRITTEN; + } + return XFS_EXT_NORM; +} + +/* + * Convert on-disk form of btree root to in-memory form. + */ +void +xfs_bmdr_to_bmbt( + struct xfs_inode *ip, + xfs_bmdr_block_t *dblock, + int dblocklen, + struct xfs_btree_block *rblock, + int rblocklen) +{ + struct xfs_mount *mp = ip->i_mount; + int dmxr; + xfs_bmbt_key_t *fkp; + __be64 *fpp; + xfs_bmbt_key_t *tkp; + __be64 *tpp; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + rblock->bb_level = dblock->bb_level; + ASSERT(be16_to_cpu(rblock->bb_level) > 0); + rblock->bb_numrecs = dblock->bb_numrecs; + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); + fkp = XFS_BMDR_KEY_ADDR(dblock, 1); + tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + dmxr = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); +} + +/* + * Convert a compressed bmap extent record to an uncompressed form. + * This code must be in sync with the routines xfs_bmbt_get_startoff, + * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. + */ +STATIC void +__xfs_bmbt_get_all( + __uint64_t l0, + __uint64_t l1, + xfs_bmbt_irec_t *s) +{ + int ext_flag; + xfs_exntst_t st; + + ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); + s->br_startoff = ((xfs_fileoff_t)l0 & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; + s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | + (((xfs_fsblock_t)l1) >> 21); + s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); + /* This is xfs_extent_state() in-line */ + if (ext_flag) { + ASSERT(s->br_blockcount != 0); /* saved for DMIG */ + st = XFS_EXT_UNWRITTEN; + } else + st = XFS_EXT_NORM; + s->br_state = st; +} + +void +xfs_bmbt_get_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(r->l0, r->l1, s); +} + +/* + * Extract the blockcount field from an in memory bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_get_blockcount( + xfs_bmbt_rec_host_t *r) +{ + return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21)); +} + +/* + * Extract the startblock field from an in memory bmap extent record. + */ +xfs_fsblock_t +xfs_bmbt_get_startblock( + xfs_bmbt_rec_host_t *r) +{ + return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | + (((xfs_fsblock_t)r->l1) >> 21); +} + +/* + * Extract the startoff field from an in memory bmap extent record. + */ +xfs_fileoff_t +xfs_bmbt_get_startoff( + xfs_bmbt_rec_host_t *r) +{ + return ((xfs_fileoff_t)r->l0 & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + +xfs_exntst_t +xfs_bmbt_get_state( + xfs_bmbt_rec_host_t *r) +{ + int ext_flag; + + ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN)); + return xfs_extent_state(xfs_bmbt_get_blockcount(r), + ext_flag); +} + +/* + * Extract the blockcount field from an on disk bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_disk_get_blockcount( + xfs_bmbt_rec_t *r) +{ + return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21)); +} + +/* + * Extract the startoff field from a disk format bmap extent record. + */ +xfs_fileoff_t +xfs_bmbt_disk_get_startoff( + xfs_bmbt_rec_t *r) +{ + return ((xfs_fileoff_t)be64_to_cpu(r->l0) & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + + +/* + * Set all the fields in a bmap extent record from the arguments. + */ +void +xfs_bmbt_set_allf( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); + + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +void +xfs_bmbt_set_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} + + +/* + * Set all the fields in a disk format bmap extent record from the arguments. + */ +void +xfs_bmbt_disk_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +STATIC void +xfs_bmbt_disk_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} + +/* + * Set the blockcount field in a bmap extent record. + */ +void +xfs_bmbt_set_blockcount( + xfs_bmbt_rec_host_t *r, + xfs_filblks_t v) +{ + ASSERT((v & xfs_mask64hi(43)) == 0); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) | + (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21)); +} + +/* + * Set the startblock field in a bmap extent record. + */ +void +xfs_bmbt_set_startblock( + xfs_bmbt_rec_host_t *r, + xfs_fsblock_t v) +{ + ASSERT((v & xfs_mask64hi(12)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | + (xfs_bmbt_rec_base_t)(v >> 43); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | + (xfs_bmbt_rec_base_t)(v << 21); +} + +/* + * Set the startoff field in a bmap extent record. + */ +void +xfs_bmbt_set_startoff( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t v) +{ + ASSERT((v & xfs_mask64hi(9)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) | + ((xfs_bmbt_rec_base_t)v << 9) | + (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); +} + +/* + * Set the extent state field in a bmap extent record. + */ +void +xfs_bmbt_set_state( + xfs_bmbt_rec_host_t *r, + xfs_exntst_t v) +{ + ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); + if (v == XFS_EXT_NORM) + r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN); + else + r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN); +} + +/* + * Convert in-memory form of btree root to on-disk form. + */ +void +xfs_bmbt_to_bmdr( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + xfs_bmdr_block_t *dblock, + int dblocklen) +{ + int dmxr; + xfs_bmbt_key_t *fkp; + __be64 *fpp; + xfs_bmbt_key_t *tkp; + __be64 *tpp; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == + cpu_to_be64(XFS_BUF_DADDR_NULL)); + } else + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_level != 0); + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); + fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + tkp = XFS_BMDR_KEY_ADDR(dblock, 1); + fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + dmxr = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); +} + +/* + * Check extent records, which have just been read, for + * any bit in the extent flag field. ASSERT on debug + * kernels, as this condition should not occur. + * Return an error condition (1) if any flags found, + * otherwise return 0. + */ + +int +xfs_check_nostate_extents( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + xfs_extnum_t num) +{ + for (; num > 0; num--, idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + if ((ep->l0 >> + (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { + ASSERT(0); + return 1; + } + } + return 0; +} + + +STATIC struct xfs_btree_cur * +xfs_bmbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *new; + + new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.b.ip, cur->bc_private.b.whichfork); + + /* + * Copy the firstblock, flist, and flags values, + * since init cursor doesn't get them. + */ + new->bc_private.b.firstblock = cur->bc_private.b.firstblock; + new->bc_private.b.flist = cur->bc_private.b.flist; + new->bc_private.b.flags = cur->bc_private.b.flags; + + return new; +} + +STATIC void +xfs_bmbt_update_cursor( + struct xfs_btree_cur *src, + struct xfs_btree_cur *dst) +{ + ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) || + (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist); + + dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_private.b.firstblock = src->bc_private.b.firstblock; + + src->bc_private.b.allocated = 0; +} + +STATIC int +xfs_bmbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = cur->bc_private.b.firstblock; + args.firstblock = args.fsbno; + + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = be64_to_cpu(start->l); + args.type = XFS_ALLOCTYPE_START_BNO; + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. If + * we are converting the middle part of an extent then + * we may need space for two tree splits. + * + * We are relying on the caller to make the correct block + * reservation for this operation to succeed. If the + * reservation amount is insufficient then we may fail a + * block allocation here and corrupt the filesystem. + */ + args.minleft = xfs_trans_get_block_res(args.tp); + } else if (cur->bc_private.b.flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + error = -ENOSPC; + goto error0; + } + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + + if (args.fsbno == NULLFSBLOCK && args.minleft) { + /* + * Could not find an AG with enough free space to satisfy + * a full btree split. Try again without minleft and if + * successful activate the lowspace algorithm. + */ + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.minleft = 0; + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + cur->bc_private.b.flist->xbf_low = 1; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + + new->l = cpu_to_be64(args.fsbno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + + error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_bmbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + + xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, bp); + return 0; +} + +STATIC int +xfs_bmbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0) / 2; + } + + return cur->bc_mp->m_bmap_dmnr[level != 0]; +} + +int +xfs_bmbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0); + } + + return cur->bc_mp->m_bmap_dmxr[level != 0]; + +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. After a + * resize to the maximum size this function returns the same value + * as xfs_bmbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_bmbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_bmap_dmxr[level != 0]; + return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); +} + +STATIC void +xfs_bmbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->bmbt.br_startoff = + cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); +} + +STATIC void +xfs_bmbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->bmbt.br_startoff != 0); + + xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff), + 0, 0, XFS_EXT_NORM); +} + +STATIC void +xfs_bmbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); +} + +STATIC void +xfs_bmbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC __int64_t +xfs_bmbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) - + cur->bc_rec.b.br_startoff; +} + +static bool +xfs_bmbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + + switch (block->bb_magic) { + case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) + return false; + /* + * XXX: need a better way of verifying the owner here. Right now + * just make sure there has been one set. + */ + if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) + return false; + /* fall through */ + case cpu_to_be32(XFS_BMAP_MAGIC): + break; + default: + return false; + } + + /* + * numrecs and level verification. + * + * We don't know what fork we belong to, so just verify that the level + * is less than the maximum of the two. Later checks will be more + * precise. + */ + level = be16_to_cpu(block->bb_level); + if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.l.bb_leftsib || + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) + return false; + if (!block->bb_u.l.bb_rightsib || + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) + return false; + + return true; +} + +static void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_lblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_bmbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_bmbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_bmbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_lblock_calc_crc(bp); +} + +const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .verify_read = xfs_bmbt_read_verify, + .verify_write = xfs_bmbt_write_verify, +}; + + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_bmbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be64_to_cpu(k1->bmbt.br_startoff) < + be64_to_cpu(k2->bmbt.br_startoff); +} + +STATIC int +xfs_bmbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return xfs_bmbt_disk_get_startoff(&r1->bmbt) + + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= + xfs_bmbt_disk_get_startoff(&r2->bmbt); +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_bmbt_ops = { + .rec_len = sizeof(xfs_bmbt_rec_t), + .key_len = sizeof(xfs_bmbt_key_t), + + .dup_cursor = xfs_bmbt_dup_cursor, + .update_cursor = xfs_bmbt_update_cursor, + .alloc_block = xfs_bmbt_alloc_block, + .free_block = xfs_bmbt_free_block, + .get_maxrecs = xfs_bmbt_get_maxrecs, + .get_minrecs = xfs_bmbt_get_minrecs, + .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, + .init_key_from_rec = xfs_bmbt_init_key_from_rec, + .init_rec_from_key = xfs_bmbt_init_rec_from_key, + .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, + .key_diff = xfs_bmbt_key_diff, + .buf_ops = &xfs_bmbt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_bmbt_keys_inorder, + .recs_inorder = xfs_bmbt_recs_inorder, +#endif +}; + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * /* new bmap btree cursor */ +xfs_bmbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* inode owning the btree */ + int whichfork) /* data or attr fork */ +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_btnum = XFS_BTNUM_BMAP; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_bmbt_ops; + cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_private.b.ip = ip; + cur->bc_private.b.firstblock = NULLFSBLOCK; + cur->bc_private.b.flist = NULL; + cur->bc_private.b.allocated = 0; + cur->bc_private.b.flags = 0; + cur->bc_private.b.whichfork = whichfork; + + return cur; +} + +/* + * Calculate number of records in a bmap btree block. + */ +int +xfs_bmbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_BMBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_bmbt_rec_t); + return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Calculate number of records in a bmap btree inode root. + */ +int +xfs_bmdr_maxrecs( + int blocklen, + int leaf) +{ + blocklen -= sizeof(xfs_bmdr_block_t); + + if (leaf) + return blocklen / sizeof(xfs_bmdr_rec_t); + return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); +} + +/* + * Change the owner of a btree format fork fo the inode passed in. Change it to + * the owner of that is passed in so that we can change owners before or after + * we switch forks between inodes. The operation that the caller is doing will + * determine whether is needs to change owner before or after the switch. + * + * For demand paged transactional modification, the fork switch should be done + * after reading in all the blocks, modifying them and pinning them in the + * transaction. For modification when the buffers are already pinned in memory, + * the fork switch can be done before changing the owner as we won't need to + * validate the owner until the btree buffers are unpinned and writes can occur + * again. + * + * For recovery based ownership change, there is no transactional context and + * so a buffer list must be supplied so that we can record the buffers that we + * modified for the caller to issue IO on. + */ +int +xfs_bmbt_change_owner( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_ino_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_cur *cur; + int error; + + ASSERT(tp || buffer_list); + ASSERT(!(tp && buffer_list)); + if (whichfork == XFS_DATA_FORK) + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); + else + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + + cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); + if (!cur) + return -ENOMEM; + + error = xfs_btree_change_owner(cur, new_owner, buffer_list); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_bmap_btree.h b/kernel/fs/xfs/libxfs/xfs_bmap_btree.h new file mode 100644 index 000000000..819a8a4de --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_bmap_btree.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_BTREE_H__ +#define __XFS_BMAP_BTREE_H__ + +struct xfs_btree_cur; +struct xfs_btree_block; +struct xfs_mount; +struct xfs_inode; +struct xfs_trans; + +/* + * Extent state and extent format macros. + */ +#define XFS_EXTFMT_INODE(x) \ + (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \ + XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) +#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) + +/* + * Btree block header size depends on a superblock flag. + */ +#define XFS_BMBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) + +#define XFS_BMBT_REC_ADDR(mp, block, index) \ + ((xfs_bmbt_rec_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_rec_t))) + +#define XFS_BMBT_KEY_ADDR(mp, block, index) \ + ((xfs_bmbt_key_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_key_t))) + +#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_bmbt_ptr_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_bmbt_key_t) + \ + ((index) - 1) * sizeof(xfs_bmbt_ptr_t))) + +#define XFS_BMDR_REC_ADDR(block, index) \ + ((xfs_bmdr_rec_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_rec_t))) + +#define XFS_BMDR_KEY_ADDR(block, index) \ + ((xfs_bmdr_key_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_key_t))) + +#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \ + ((xfs_bmdr_ptr_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + (maxrecs) * sizeof(xfs_bmdr_key_t) + \ + ((index) - 1) * sizeof(xfs_bmdr_ptr_t))) + +/* + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ + XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) + +#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ + (int)(XFS_BMBT_BLOCK_LEN(mp) + \ + ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) + +#define XFS_BMAP_BROOT_SPACE(mp, bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMDR_SPACE_CALC(nrecs) \ + (int)(sizeof(xfs_bmdr_block_t) + \ + ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) +#define XFS_BMAP_BMDR_SPACE(bb) \ + (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) + +/* + * Maximum number of bmap btree levels. + */ +#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) + +/* + * Prototypes for xfs_bmap.c to call. + */ +extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, + struct xfs_btree_block *, int); +extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); +extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); +extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); +extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); + +extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); +extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); + +extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, + xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); +extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v); +extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); +extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); +extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); + +extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, + xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); + +extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, + xfs_bmdr_block_t *, int); + +extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); +extern int xfs_bmdr_maxrecs(int blocklen, int leaf); +extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); + +extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, xfs_ino_t new_owner, + struct list_head *buffer_list); + +extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_inode *, int); + +#endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_btree.c b/kernel/fs/xfs/libxfs/xfs_btree.c new file mode 100644 index 000000000..c72283dd8 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_btree.c @@ -0,0 +1,4067 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_buf_item.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_alloc.h" + +/* + * Cursor allocation zone. + */ +kmem_zone_t *xfs_btree_cur_zone; + +/* + * Btree magic numbers. + */ +static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, + XFS_FIBT_MAGIC }, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } +}; +#define xfs_btree_magic(cur) \ + xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] + + +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree long form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer for block, if any */ +{ + int lblock_ok = 1; /* block passes checks */ + struct xfs_mount *mp; /* file system mount point */ + + mp = cur->bc_mp; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + lblock_ok = lblock_ok && + uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.l.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + lblock_ok = lblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + block->bb_u.l.bb_leftsib && + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_LBLOCK, + XFS_RANDOM_BTREE_CHECK_LBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + return 0; +} + +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree short form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block */ +{ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for ag. freespace struct */ + struct xfs_agf *agf; /* ag. freespace structure */ + xfs_agblock_t agflen; /* native ag. freespace length */ + int sblock_ok = 1; /* block passes checks */ + + mp = cur->bc_mp; + agbp = cur->bc_private.a.agbp; + agf = XFS_BUF_TO_AGF(agbp); + agflen = be32_to_cpu(agf->agf_length); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + sblock_ok = sblock_ok && + uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.s.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + sblock_ok = sblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && + block->bb_u.s.bb_rightsib; + + if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_SBLOCK, + XFS_RANDOM_BTREE_CHECK_SBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + return 0; +} + +/* + * Debug routine: check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block, if any */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_check_lblock(cur, block, level, bp); + else + return xfs_btree_check_sblock(cur, block, level, bp); +} + +/* + * Check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_fsblock_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + level > 0 && + bno != NULLFSBLOCK && + XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); + return 0; +} + +#ifdef DEBUG +/* + * Check that (short) pointer is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; + + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + level > 0 && + bno != NULLAGBLOCK && + bno != 0 && + bno < agblocks); + return 0; +} + +/* + * Check that block ptr is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_ptr( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_ptr *ptr, /* btree block disk address */ + int index, /* offset from ptr to check */ + int level) /* btree block level */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + return xfs_btree_check_lptr(cur, + be64_to_cpu((&ptr->l)[index]), level); + } else { + return xfs_btree_check_sptr(cur, + be32_to_cpu((&ptr->s)[index]), level); + } +} +#endif + +/* + * Calculate CRC on the whole btree block and stuff it into the + * long-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_lblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); +} + +bool +xfs_btree_lblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); + + return true; +} + +/* + * Calculate CRC on the whole btree block and stuff it into the + * short-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_sblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); +} + +bool +xfs_btree_sblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); + + return true; +} + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error) /* del because of error */ +{ + int i; /* btree level */ + + /* + * Clear the buffer pointers, and release the buffers. + * If we're doing this in the face of an error, we + * need to make sure to inspect all of the entries + * in the bc_bufs array for buffers to be unlocked. + * This is because some of the btree code works from + * level n down to 0, and if we get an error along + * the way we won't have initialized all the entries + * down to 0. + */ + for (i = 0; i < cur->bc_nlevels; i++) { + if (cur->bc_bufs[i]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); + else if (!error) + break; + } + /* + * Can't free a bmap cursor without having dealt with the + * allocated indirect blocks' accounting. + */ + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_private.b.allocated == 0); + /* + * Free the cursor. + */ + kmem_zone_free(xfs_btree_cur_zone, cur); +} + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur) /* output cursor */ +{ + xfs_buf_t *bp; /* btree block's buffer pointer */ + int error; /* error return value */ + int i; /* level number of btree block */ + xfs_mount_t *mp; /* mount structure for filesystem */ + xfs_btree_cur_t *new; /* new cursor value */ + xfs_trans_t *tp; /* transaction pointer, can be NULL */ + + tp = cur->bc_tp; + mp = cur->bc_mp; + + /* + * Allocate a new cursor like the old one. + */ + new = cur->bc_ops->dup_cursor(cur); + + /* + * Copy the record currently in the cursor. + */ + new->bc_rec = cur->bc_rec; + + /* + * For each level current, re-get the buffer and copy the ptr value. + */ + for (i = 0; i < new->bc_nlevels; i++) { + new->bc_ptrs[i] = cur->bc_ptrs[i]; + new->bc_ra[i] = cur->bc_ra[i]; + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, + cur->bc_ops->buf_ops); + if (error) { + xfs_btree_del_cursor(new, error); + *ncur = NULL; + return error; + } + } + new->bc_bufs[i] = bp; + } + *ncur = new; + return 0; +} + +/* + * XFS btree block layout and addressing: + * + * There are two types of blocks in the btree: leaf and non-leaf blocks. + * + * The leaf record start with a header then followed by records containing + * the values. A non-leaf block also starts with the same header, and + * then first contains lookup keys followed by an equal number of pointers + * to the btree blocks at the previous level. + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * The header is called struct xfs_btree_block for reasons better left unknown + * and comes in different versions for short (32bit) and long (64bit) block + * pointers. The record and key structures are defined by the btree instances + * and opaque to the btree core. The block pointers are simple disk endian + * integers, available in a short (32bit) and long (64bit) variant. + * + * The helpers below calculate the offset of a given record, key or pointer + * into a btree block (xfs_btree_*_offset) or return a pointer to the given + * record, key or pointer (xfs_btree_*_addr). Note that all addressing + * inside the btree block is done using indices starting at one, not zero! + */ + +/* + * Return size of the btree block header for this btree instance. + */ +static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_LBLOCK_CRC_LEN; + return XFS_BTREE_LBLOCK_LEN; + } + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_SBLOCK_CRC_LEN; + return XFS_BTREE_SBLOCK_LEN; +} + +/* + * Return size of btree block pointers for this btree instance. + */ +static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + sizeof(__be64) : sizeof(__be32); +} + +/* + * Calculate offset of the n-th record in a btree block. + */ +STATIC size_t +xfs_btree_rec_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->rec_len; +} + +/* + * Calculate offset of the n-th key in a btree block. + */ +STATIC size_t +xfs_btree_key_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->key_len; +} + +/* + * Calculate offset of the n-th block pointer in a btree block. + */ +STATIC size_t +xfs_btree_ptr_offset( + struct xfs_btree_cur *cur, + int n, + int level) +{ + return xfs_btree_block_len(cur) + + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + + (n - 1) * xfs_btree_ptr_len(cur); +} + +/* + * Return a pointer to the n-th record in the btree block. + */ +STATIC union xfs_btree_rec * +xfs_btree_rec_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_rec *) + ((char *)block + xfs_btree_rec_offset(cur, n)); +} + +/* + * Return a pointer to the n-th key in the btree block. + */ +STATIC union xfs_btree_key * +xfs_btree_key_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_key *) + ((char *)block + xfs_btree_key_offset(cur, n)); +} + +/* + * Return a pointer to the n-th block pointer in the btree block. + */ +STATIC union xfs_btree_ptr * +xfs_btree_ptr_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + int level = xfs_btree_get_level(block); + + ASSERT(block->bb_level != 0); + + return (union xfs_btree_ptr *) + ((char *)block + xfs_btree_ptr_offset(cur, n, level)); +} + +/* + * Get the root block which is stored in the inode. + * + * For now this btree implementation assumes the btree root is always + * stored in the if_broot field of an inode fork. + */ +STATIC struct xfs_btree_block * +xfs_btree_get_iroot( + struct xfs_btree_cur *cur) +{ + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; +} + +/* + * Retrieve the block pointer from the cursor at the given level. + * This may be an inode btree root or from a buffer. + */ +STATIC struct xfs_btree_block * /* generic btree block pointer */ +xfs_btree_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree */ + struct xfs_buf **bpp) /* buffer containing the block */ +{ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *bpp = NULL; + return xfs_btree_get_iroot(cur); + } + + *bpp = cur->bc_bufs[level]; + return XFS_BUF_TO_BLOCK(*bpp); +} + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +xfs_buf_t * /* buffer for fsbno */ +xfs_btree_get_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); +} + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +xfs_buf_t * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); +} + +/* + * Check for the cursor referring to the last block at the given level. + */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to check */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); + else + return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); +} + +/* + * Change the cursor to point to the first record at the given level. + * Other levels are unaffected. + */ +STATIC int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to 1, that's the first record/key. + */ + cur->bc_ptrs[level] = 1; + return 1; +} + +/* + * Change the cursor to point to the last record in the current block + * at the given level. Other levels are unaffected. + */ +STATIC int /* success=1, failure=0 */ +xfs_btree_lastrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to numrecs, that's the last record/key. + */ + cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); + return 1; +} + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets, /* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last) /* output: last byte offset */ +{ + int i; /* current bit number */ + __int64_t imask; /* mask for current bit number */ + + ASSERT(fields != 0); + /* + * Find the lowest bit, so the first byte offset. + */ + for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + if (imask & fields) { + *first = offsets[i]; + break; + } + } + /* + * Find the highest bit, so the last byte offset. + */ + for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + if (imask & fields) { + *last = offsets[i + 1] - 1; + break; + } + } +} + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int +xfs_btree_read_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; /* return value */ + xfs_daddr_t d; /* real disk block address */ + int error; + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, ops); + if (error) + return error; + if (bp) + xfs_buf_set_ref(bp, refval); + *bpp = bp; + return 0; +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufl( + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) +{ + xfs_daddr_t d; + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufs( + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) +{ + xfs_daddr_t d; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); +} + +STATIC int +xfs_btree_readahead_lblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { + xfs_btree_reada_bufl(cur->bc_mp, left, 1, + cur->bc_ops->buf_ops); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { + xfs_btree_reada_bufl(cur->bc_mp, right, 1, + cur->bc_ops->buf_ops); + rval++; + } + + return rval; +} + +STATIC int +xfs_btree_readahead_sblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); + xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); + + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + left, 1, cur->bc_ops->buf_ops); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + right, 1, cur->bc_ops->buf_ops); + rval++; + } + + return rval; +} + +/* + * Read-ahead btree blocks, at the given level. + * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + */ +STATIC int +xfs_btree_readahead( + struct xfs_btree_cur *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr) /* left/right bits */ +{ + struct xfs_btree_block *block; + + /* + * No readahead needed if we are at the root level and the + * btree root is stored in the inode. + */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (lev == cur->bc_nlevels - 1)) + return 0; + + if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + return 0; + + cur->bc_ra[lev] |= lr; + block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_readahead_lblock(cur, lr, block); + return xfs_btree_readahead_sblock(cur, lr, block); +} + +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK)); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); + } +} + +/* + * Readahead @count btree blocks at the given @ptr location. + * + * We don't need to care about long or short form btrees here as we have a + * method of converting the ptr directly to a daddr available to us. + */ +STATIC void +xfs_btree_readahead_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + xfs_extlen_t count) +{ + xfs_buf_readahead(cur->bc_mp->m_ddev_targp, + xfs_btree_ptr_to_daddr(cur, ptr), + cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); +} + +/* + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. + */ +STATIC void +xfs_btree_setbuf( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + xfs_buf_t *bp) /* new buffer to set */ +{ + struct xfs_btree_block *b; /* btree block */ + + if (cur->bc_bufs[lev]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); + cur->bc_bufs[lev] = bp; + cur->bc_ra[lev] = 0; + + b = XFS_BUF_TO_BLOCK(bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } else { + if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } +} + +STATIC int +xfs_btree_ptr_is_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return ptr->l == cpu_to_be64(NULLFSBLOCK); + else + return ptr->s == cpu_to_be32(NULLAGBLOCK); +} + +STATIC void +xfs_btree_set_ptr_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(NULLFSBLOCK); + else + ptr->s = cpu_to_be32(NULLAGBLOCK); +} + +/* + * Get/set/init sibling pointers + */ +STATIC void +xfs_btree_get_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + ptr->l = block->bb_u.l.bb_rightsib; + else + ptr->l = block->bb_u.l.bb_leftsib; + } else { + if (lr == XFS_BB_RIGHTSIB) + ptr->s = block->bb_u.s.bb_rightsib; + else + ptr->s = block->bb_u.s.bb_leftsib; + } +} + +STATIC void +xfs_btree_set_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.l.bb_rightsib = ptr->l; + else + block->bb_u.l.bb_leftsib = ptr->l; + } else { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.s.bb_rightsib = ptr->s; + else + block->bb_u.s.bb_leftsib = ptr->s; + } +} + +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + buf->bb_magic = cpu_to_be32(magic); + buf->bb_level = cpu_to_be16(level); + buf->bb_numrecs = cpu_to_be16(numrecs); + + if (flags & XFS_BTREE_LONG_PTRS) { + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.l.bb_owner = cpu_to_be64(owner); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.l.bb_pad = 0; + buf->bb_u.l.bb_lsn = 0; + } + } else { + /* owner is a 32 bit value on short blocks */ + __u32 __owner = (__u32)owner; + + buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.s.bb_lsn = 0; + } + } +} + +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + magic, level, numrecs, owner, flags); +} + +STATIC void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + int numrecs) +{ + __u64 owner; + + /* + * we can pull the owner from the cursor right now as the different + * owners align directly with the pointer size of the btree. This may + * change in future, but is safe for current users of the generic btree + * code. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + owner = cur->bc_private.b.ip->i_ino; + else + owner = cur->bc_private.a.agno; + + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_magic(cur), level, numrecs, + owner, cur->bc_flags); +} + +/* + * Return true if ptr is the last record in the btree and + * we need to track updates to this record. The decision + * will be further refined in the update_lastrec method. + */ +STATIC int +xfs_btree_is_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level) +{ + union xfs_btree_ptr ptr; + + if (level > 0) + return 0; + if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + return 0; + + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &ptr)) + return 0; + return 1; +} + +STATIC void +xfs_btree_buf_to_ptr( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + XFS_BUF_ADDR(bp))); + else { + ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, + XFS_BUF_ADDR(bp))); + } +} + +STATIC void +xfs_btree_set_refs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); + break; + case XFS_BTNUM_INO: + case XFS_BTNUM_FINO: + xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); + break; + case XFS_BTNUM_BMAP: + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); + break; + default: + ASSERT(0); + } +} + +STATIC int +xfs_btree_get_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags); + + if (!*bpp) + return -ENOMEM; + + (*bpp)->b_ops = cur->bc_ops->buf_ops; + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Read in the buffer at the given ptr and return the buffer and + * the block pointer within the buffer. + */ +STATIC int +xfs_btree_read_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + int error; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags, bpp, + cur->bc_ops->buf_ops); + if (error) + return error; + + xfs_btree_set_refs(cur, *bpp); + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Copy keys from one btree block to another. + */ +STATIC void +xfs_btree_copy_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + union xfs_btree_key *src_key, + int numkeys) +{ + ASSERT(numkeys >= 0); + memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); +} + +/* + * Copy records from one btree block to another. + */ +STATIC void +xfs_btree_copy_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *dst_rec, + union xfs_btree_rec *src_rec, + int numrecs) +{ + ASSERT(numrecs >= 0); + memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Copy block pointers from one btree block to another. + */ +STATIC void +xfs_btree_copy_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + union xfs_btree_ptr *src_ptr, + int numptrs) +{ + ASSERT(numptrs >= 0); + memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Shift keys one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + int dir, + int numkeys) +{ + char *dst_key; + + ASSERT(numkeys >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_key = (char *)key + (dir * cur->bc_ops->key_len); + memmove(dst_key, key, numkeys * cur->bc_ops->key_len); +} + +/* + * Shift records one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int dir, + int numrecs) +{ + char *dst_rec; + + ASSERT(numrecs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); + memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Shift block pointers one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int dir, + int numptrs) +{ + char *dst_ptr; + + ASSERT(numptrs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); + memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Log key values from the btree block. + */ +STATIC void +xfs_btree_log_keys( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_key_offset(cur, first), + xfs_btree_key_offset(cur, last + 1) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log record values from the btree block. + */ +void +xfs_btree_log_recs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_rec_offset(cur, first), + xfs_btree_rec_offset(cur, last + 1) - 1); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log block pointer fields from a btree block (nonleaf). + */ +STATIC void +xfs_btree_log_ptrs( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int first, /* index of first pointer to log */ + int last) /* index of last pointer to log */ +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + int level = xfs_btree_get_level(block); + + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_ptr_offset(cur, first, level), + xfs_btree_ptr_offset(cur, last + 1, level) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log fields from a btree block header. + */ +void +xfs_btree_log_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int fields) /* mask of fields: XFS_BB_... */ +{ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + static const short soffsets[] = { /* table of offsets (short) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), + offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.s.bb_owner), + offsetof(struct xfs_btree_block, bb_u.s.bb_crc), + XFS_BTREE_SBLOCK_CRC_LEN + }; + static const short loffsets[] = { /* table of offsets (long) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), + offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.l.bb_owner), + offsetof(struct xfs_btree_block, bb_u.l.bb_crc), + offsetof(struct xfs_btree_block, bb_u.l.bb_pad), + XFS_BTREE_LBLOCK_CRC_LEN + }; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBI(cur, bp, fields); + + if (bp) { + int nbits; + + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + /* + * We don't log the CRC when updating a btree + * block but instead recreate it during log + * recovery. As the log buffers have checksums + * of their own this is safe and avoids logging a crc + * update in a lot of places. + */ + if (fields == XFS_BB_ALL_BITS) + fields = XFS_BB_ALL_BITS_CRC; + nbits = XFS_BB_NUM_BITS_CRC; + } else { + nbits = XFS_BB_NUM_BITS; + } + xfs_btree_offsets(fields, + (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + loffsets : soffsets, + nbits, &first, &last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_increment( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + int error; /* error return value */ + int lev; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the right at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* We're done if we remain in the block after the increment. */ + if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block)) + goto out1; + + /* Fail if we just went off the right edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, increment); + + /* + * March up the tree incrementing pointers. + * Stop when we don't go off the right edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + block = xfs_btree_get_block(cur, lev, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, lev, bp); + if (error) + goto error0; +#endif + + if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block)) + break; + + /* Read-ahead the right block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + + /* + * If we went off the root then we are either seriously + * confused or have the tree root in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = -EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); + if (error) + goto error0; + + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = 1; + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_decrement( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + xfs_buf_t *bp; + int error; /* error return value */ + int lev; + union xfs_btree_ptr ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the left at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + + /* We're done if we remain in the block after the decrement. */ + if (--cur->bc_ptrs[level] > 0) + goto out1; + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we just went off the left edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, decrement); + + /* + * March up the tree decrementing pointers. + * Stop when we don't go off the left edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + /* Read-ahead the left block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + + /* + * If we went off the root then we are seriously confused. + * or the root of the tree is in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = -EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); + if (error) + goto error0; + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_btree_lookup_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in the btree */ + union xfs_btree_ptr *pp, /* ptr to btree block */ + struct xfs_btree_block **blkp) /* return btree block */ +{ + struct xfs_buf *bp; /* buffer pointer for btree block */ + int error = 0; + + /* special case the root block if in an inode */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *blkp = xfs_btree_get_iroot(cur); + return 0; + } + + /* + * If the old buffer at this level for the disk address we are + * looking for re-use it. + * + * Otherwise throw it away and get a new one. + */ + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) { + *blkp = XFS_BUF_TO_BLOCK(bp); + return 0; + } + + error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); + if (error) + return error; + + xfs_btree_setbuf(cur, level, bp); + return 0; +} + +/* + * Get current search key. For level 0 we don't actually have a key + * structure so we make one up from the record. For all other levels + * we just return the right key. + */ +STATIC union xfs_btree_key * +xfs_lookup_get_search_key( + struct xfs_btree_cur *cur, + int level, + int keyno, + struct xfs_btree_block *block, + union xfs_btree_key *kp) +{ + if (level == 0) { + cur->bc_ops->init_key_from_rec(kp, + xfs_btree_rec_addr(cur, keyno, block)); + return kp; + } + + return xfs_btree_key_addr(cur, keyno, block); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + * stat is set to 0 if can't find any such record, 1 for success. + */ +int /* error */ +xfs_btree_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_lookup_t dir, /* <=, ==, or >= */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* current btree block */ + __int64_t diff; /* difference for the current key */ + int error; /* error return value */ + int keyno; /* current key number */ + int level; /* level in the btree */ + union xfs_btree_ptr *pp; /* ptr to btree block */ + union xfs_btree_ptr ptr; /* ptr to btree block */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, dir); + + XFS_BTREE_STATS_INC(cur, lookup); + + block = NULL; + keyno = 0; + + /* initialise start pointer from cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + pp = &ptr; + + /* + * Iterate over each level in the btree, starting at the root. + * For each level above the leaves, find the key we need, based + * on the lookup record, then follow the corresponding block + * pointer down to the next level. + */ + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + /* Get the block we need to do the lookup on. */ + error = xfs_btree_lookup_get_block(cur, level, pp, &block); + if (error) + goto error0; + + if (diff == 0) { + /* + * If we already had a key match at a higher level, we + * know we need to use the first entry in this block. + */ + keyno = 1; + } else { + /* Otherwise search this block. Do a binary search. */ + + int high; /* high entry number */ + int low; /* low entry number */ + + /* Set low and high entry numbers, 1-based. */ + low = 1; + high = xfs_btree_get_numrecs(block); + if (!high) { + /* Block is empty, must be an empty leaf. */ + ASSERT(level == 0 && cur->bc_nlevels == 1); + + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Binary search the block. */ + while (low <= high) { + union xfs_btree_key key; + union xfs_btree_key *kp; + + XFS_BTREE_STATS_INC(cur, compare); + + /* keyno is average of low and high. */ + keyno = (low + high) >> 1; + + /* Get current search key */ + kp = xfs_lookup_get_search_key(cur, level, + keyno, block, &key); + + /* + * Compute difference to get next direction: + * - less than, move right + * - greater than, move left + * - equal, we're done + */ + diff = cur->bc_ops->key_diff(cur, kp); + if (diff < 0) + low = keyno + 1; + else if (diff > 0) + high = keyno - 1; + else + break; + } + } + + /* + * If there are more levels, set up for the next level + * by getting the block number and filling in the cursor. + */ + if (level > 0) { + /* + * If we moved left, need the previous key number, + * unless there isn't one. + */ + if (diff > 0 && --keyno < 1) + keyno = 1; + pp = xfs_btree_ptr_addr(cur, keyno, block); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, pp, 0, level); + if (error) + goto error0; +#endif + cur->bc_ptrs[level] = keyno; + } + } + + /* Done with the search. See if we need to adjust the results. */ + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (dir == XFS_LOOKUP_GE && + keyno > xfs_btree_get_numrecs(block) && + !xfs_btree_ptr_is_null(cur, &ptr)) { + int i; + + cur->bc_ptrs[0] = keyno; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + + /* Return if we succeeded or not. */ + if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) + *stat = 0; + else if (dir != XFS_LOOKUP_EQ || diff == 0) + *stat = 1; + else + *stat = 0; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Update keys at all levels from here to the root along the cursor's path. + */ +STATIC int +xfs_btree_updkey( + struct xfs_btree_cur *cur, + union xfs_btree_key *keyp, + int level) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_key *kp; + int ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIK(cur, level, keyp); + + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1); + + /* + * Go up the tree from this level toward the root. + * At each level, update the key value to the value input. + * Stop when we reach a level where the cursor isn't pointing + * at the first entry in the block. + */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { +#ifdef DEBUG + int error; +#endif + block = xfs_btree_get_block(cur, level, &bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + kp = xfs_btree_key_addr(cur, ptr, block); + xfs_btree_copy_keys(cur, kp, keyp, 1); + xfs_btree_log_keys(cur, bp, ptr, ptr); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Update the record referred to by cur to the value in the + * given record. This either works (return 0) or gets an + * EFSCORRUPTED error. + */ +int +xfs_btree_update( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + int error; + int ptr; + union xfs_btree_rec *rp; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGR(cur, rec); + + /* Pick up the current block. */ + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + goto error0; +#endif + /* Get the address of the rec to be updated. */ + ptr = cur->bc_ptrs[0]; + rp = xfs_btree_rec_addr(cur, ptr, block); + + /* Fill in the new contents and log them. */ + xfs_btree_copy_recs(cur, rp, rec, 1); + xfs_btree_log_recs(cur, bp, ptr, ptr); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, 0)) { + cur->bc_ops->update_lastrec(cur, block, rec, + ptr, LASTREC_UPDATE); + } + + /* Updating first rec in leaf. Pass new key value up to our parent. */ + if (ptr == 1) { + union xfs_btree_key key; + + cur->bc_ops->init_key_from_rec(&key, rec); + error = xfs_btree_updkey(cur, &key, 1); + if (error) + goto error0; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_lshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs; /* left record count */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + int rrecs; /* right record count */ + union xfs_btree_ptr lptr; /* left btree pointer */ + union xfs_btree_key *rkp = NULL; /* right btree key */ + union xfs_btree_ptr *rpp = NULL; /* right address pointer */ + union xfs_btree_rec *rrp = NULL; /* right record pointer */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) + goto out0; + + /* Set up variables for this block as "right". */ + right = xfs_btree_get_block(cur, level, &rbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, right, level, rbp); + if (error) + goto error0; +#endif + + /* If we've got no left sibling then we can't shift an entry left. */ + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &lptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] <= 1) + goto out0; + + /* Set up the left neighbor as "left". */ + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + lrecs = xfs_btree_get_numrecs(left); + if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + rrecs = xfs_btree_get_numrecs(right); + + /* + * We add one entry to the left side and remove one for the right side. + * Account for it here, the changes will be updated on disk and logged + * later. + */ + lrecs++; + rrecs--; + + XFS_BTREE_STATS_INC(cur, lshift); + XFS_BTREE_STATS_ADD(cur, moves, 1); + + /* + * If non-leaf, copy a key and a ptr to the left block. + * Log the changes to the left block. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, rpp, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_keys(cur, lkp, rkp, 1); + xfs_btree_copy_ptrs(cur, lpp, rpp, 1); + + xfs_btree_log_keys(cur, lbp, lrecs, lrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->keys_inorder(cur, + xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, 1); + xfs_btree_log_recs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->recs_inorder(cur, + xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); + } + + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Slide the contents of right down one entry. + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ +#ifdef DEBUG + int i; /* loop index */ + + for (i = 0; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i + 1, level); + if (error) + goto error0; + } +#endif + xfs_btree_shift_keys(cur, + xfs_btree_key_addr(cur, 2, right), + -1, rrecs); + xfs_btree_shift_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, right), + -1, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + } else { + /* It's a leaf. operate on records */ + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, 2, right), + -1, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, right)); + rkp = &key; + } + + /* Update the parent key values of right. */ + error = xfs_btree_updkey(cur, rkp, level + 1); + if (error) + goto error0; + + /* Slide the cursor value left one. */ + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_rshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + union xfs_btree_ptr rptr; /* right block pointer */ + union xfs_btree_key *rkp; /* right btree key */ + int rrecs; /* right record count */ + int lrecs; /* left record count */ + int error; /* error return value */ + int i; /* loop counter */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) + goto out0; + + /* Set up variables for this block as "left". */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + /* If we've got no right sibling then we can't shift an entry right. */ + xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + lrecs = xfs_btree_get_numrecs(left); + if (cur->bc_ptrs[level] >= lrecs) + goto out0; + + /* Set up the right neighbor as "right". */ + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + rrecs = xfs_btree_get_numrecs(right); + if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, rshift); + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole. + */ + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + union xfs_btree_ptr *rpp; + + lkp = xfs_btree_key_addr(cur, lrecs, left); + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = rrecs - 1; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_shift_keys(cur, rkp, 1, rrecs); + xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, lpp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, and log it. */ + xfs_btree_copy_keys(cur, rkp, lkp, 1); + xfs_btree_copy_ptrs(cur, rpp, lpp, 1); + + xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); + + ASSERT(cur->bc_ops->keys_inorder(cur, rkp, + xfs_btree_key_addr(cur, 2, right))); + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *lrp; + union xfs_btree_rec *rrp; + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_shift_recs(cur, rrp, 1, rrecs); + + /* Now put the new data in, and log it. */ + xfs_btree_copy_recs(cur, rrp, lrp, 1); + xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); + + cur->bc_ops->init_key_from_rec(&key, rrp); + rkp = &key; + + ASSERT(cur->bc_ops->recs_inorder(cur, rrp, + xfs_btree_rec_addr(cur, 2, right))); + } + + /* + * Decrement and log left's numrecs, bump and log right's numrecs. + */ + xfs_btree_set_numrecs(left, --lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, ++rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Using a temporary cursor, update the parent key values of the + * block on the right. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error1; + + error = xfs_btree_updkey(tcur, rkp, level + 1); + if (error) + goto error1; + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + +error1: + XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Split cur/level block in half. + * Return new block number and the key to its first + * record (to be inserted into parent). + */ +STATIC int /* error */ +__xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rrptr; /* right-right sibling ptr */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + int lrecs; + int rrecs; + int src_index; + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); + + XFS_BTREE_STATS_INC(cur, split); + + /* Set up left block (current one). */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block as "right". */ + error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* Fill in the btree header for the new right block. */ + xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); + + /* + * Split the entries between the old and the new block evenly. + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries. + */ + lrecs = xfs_btree_get_numrecs(left); + rrecs = lrecs / 2; + if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) + rrecs++; + src_index = (lrecs - rrecs + 1); + + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Copy btree block entries from the left block over to the + * new block, the right. Update the right block and log the + * changes. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, src_index, left); + lpp = xfs_btree_ptr_addr(cur, src_index, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = src_index; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_copy_keys(cur, rkp, lkp, rrecs); + xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + + /* Grab the keys to the entries moved to the right block */ + xfs_btree_copy_keys(cur, key, rkp, 1); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, src_index, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, rrp, lrp, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + cur->bc_ops->init_key_from_rec(key, + xfs_btree_rec_addr(cur, 1, right)); + } + + + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers. + */ + xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + + lrecs -= rrecs; + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); + + xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. + */ + if (!xfs_btree_ptr_is_null(cur, &rrptr)) { + error = xfs_btree_read_buf_block(cur, &rrptr, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > lrecs + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= lrecs; + } + /* + * If there are more levels, we'll need another cursor which refers + * the right block, no matter where this cursor was. + */ + if (level + 1 < cur->bc_nlevels) { + error = xfs_btree_dup_cursor(cur, curp); + if (error) + goto error0; + (*curp)->bc_ptrs[level + 1]++; + } + *ptrp = rptr; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +struct xfs_btree_split_args { + struct xfs_btree_cur *cur; + int level; + union xfs_btree_ptr *ptrp; + union xfs_btree_key *key; + struct xfs_btree_cur **curp; + int *stat; /* success/failure */ + int result; + bool kswapd; /* allocation in kswapd context */ + struct completion *done; + struct work_struct work; +}; + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_btree_split_worker( + struct work_struct *work) +{ + struct xfs_btree_split_args *args = container_of(work, + struct xfs_btree_split_args, work); + unsigned long pflags; + unsigned long new_pflags = PF_FSTRANS; + + /* + * we are in a transaction context here, but may also be doing work + * in kswapd context, and hence we may need to inherit that state + * temporarily to ensure that we don't block waiting for memory reclaim + * in any way. + */ + if (args->kswapd) + new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + current_set_flags_nested(&pflags, new_pflags); + + args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, + args->key, args->curp, args->stat); + complete(args->done); + + current_restore_flags_nested(&pflags, new_pflags); +} + +/* + * BMBT split requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + + if (cur->bc_btnum != XFS_BTNUM_BMAP) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; + args.level = level; + args.ptrp = ptrp; + args.key = key; + args.curp = curp; + args.stat = stat; + args.done = &done; + args.kswapd = current_is_kswapd(); + INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); + queue_work(xfs_alloc_wq, &args.work); + wait_for_completion(&done); + destroy_work_on_stack(&args.work); + return args.result; +} + + +/* + * Copy the old inode root contents into a real block and make the + * broot point to it. + */ +int /* error */ +xfs_btree_new_iroot( + struct xfs_btree_cur *cur, /* btree cursor */ + int *logflags, /* logging flags for inode */ + int *stat) /* return status - 0 fail */ +{ + struct xfs_buf *cbp; /* buffer for cblock */ + struct xfs_btree_block *block; /* btree block */ + struct xfs_btree_block *cblock; /* child btree block */ + union xfs_btree_key *ckp; /* child key pointer */ + union xfs_btree_ptr *cpp; /* child ptr pointer */ + union xfs_btree_key *kp; /* pointer to btree key */ + union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr nptr; /* new block addr */ + int level; /* btree level */ + int error; /* error return code */ +#ifdef DEBUG + int i; /* loop counter */ +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + level = cur->bc_nlevels - 1; + + block = xfs_btree_get_iroot(cur); + pp = xfs_btree_ptr_addr(cur, 1, block); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); + if (error) + goto error0; + if (*stat == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + } + XFS_BTREE_STATS_INC(cur, alloc); + + /* Copy the root into a real block. */ + error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp); + if (error) + goto error0; + + /* + * we can't just memcpy() the root in for CRC enabled btree blocks. + * In that case have to also ensure the blkno remains correct + */ + memcpy(cblock, block, xfs_btree_block_len(cur)); + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + else + cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + } + + be16_add_cpu(&block->bb_level, 1); + xfs_btree_set_numrecs(block, 1); + cur->bc_nlevels++; + cur->bc_ptrs[level + 1] = 1; + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); + + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, &nptr, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_ptrs(cur, pp, &nptr, 1); + + xfs_iroot_realloc(cur->bc_private.b.ip, + 1 - xfs_btree_get_numrecs(cblock), + cur->bc_private.b.whichfork); + + xfs_btree_setbuf(cur, level, cbp); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + + *logflags |= + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + *stat = 1; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Allocate a new root block, fill it in. + */ +STATIC int /* error */ +xfs_btree_new_root( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* one half of the old root block */ + struct xfs_buf *bp; /* buffer containing block */ + int error; /* error return value */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *nbp; /* new (root) buffer */ + struct xfs_btree_block *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rptr; + union xfs_btree_ptr lptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + /* initialise our start point from the cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &rptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block. */ + error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp); + if (error) + goto error0; + + /* Set the root in the holding structure increasing the level by 1. */ + cur->bc_ops->set_root(cur, &lptr, 1); + + /* + * At the previous root level there are now two blocks: the old root, + * and the new block generated when it was split. We don't know which + * one the cursor is pointing at, so we set up variables "left" and + * "right" for each case. + */ + block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); + if (error) + goto error0; +#endif + + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* Our block is left, pick up the right block. */ + lbp = bp; + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + left = block; + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + bp = rbp; + nptr = 1; + } else { + /* Our block is right, pick up the left block. */ + rbp = bp; + xfs_btree_buf_to_ptr(cur, rbp, &rptr); + right = block; + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + bp = lbp; + nptr = 2; + } + /* Fill in the new block's btree header and log it. */ + xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); + xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); + ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && + !xfs_btree_ptr_is_null(cur, &rptr)); + + /* Fill in the key data in the new root. */ + if (xfs_btree_get_level(left) > 0) { + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 1, new), + xfs_btree_key_addr(cur, 1, left), 1); + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 2, new), + xfs_btree_key_addr(cur, 1, right), 1); + } else { + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 1, new), + xfs_btree_rec_addr(cur, 1, left)); + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 2, new), + xfs_btree_rec_addr(cur, 1, right)); + } + xfs_btree_log_keys(cur, nbp, 1, 2); + + /* Fill in the pointer data in the new root. */ + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); + xfs_btree_log_ptrs(cur, nbp, 1, 2); + + /* Fix up the cursor. */ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; +} + +STATIC int +xfs_btree_make_block_unfull( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* btree level */ + int numrecs,/* # of recs in block */ + int *oindex,/* old tree index */ + int *index, /* new tree index */ + union xfs_btree_ptr *nptr, /* new btree ptr */ + struct xfs_btree_cur **ncur, /* new btree cursor */ + union xfs_btree_rec *nrec, /* new record */ + int *stat) +{ + union xfs_btree_key key; /* new btree key value */ + int error = 0; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_inode *ip = cur->bc_private.b.ip; + + if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { + /* A root block that can be made bigger. */ + xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + } else { + /* A root block that needs replacing */ + int logflags = 0; + + error = xfs_btree_new_iroot(cur, &logflags, stat); + if (error || *stat == 0) + return error; + + xfs_trans_log_inode(cur->bc_tp, ip, logflags); + } + + return 0; + } + + /* First, try shifting an entry to the right neighbor. */ + error = xfs_btree_rshift(cur, level, stat); + if (error || *stat) + return error; + + /* Next, try shifting an entry to the left neighbor. */ + error = xfs_btree_lshift(cur, level, stat); + if (error) + return error; + + if (*stat) { + *oindex = *index = cur->bc_ptrs[level]; + return 0; + } + + /* + * Next, try splitting the current block in half. + * + * If this works we have to re-set our variables because we + * could be in a different block now. + */ + error = xfs_btree_split(cur, level, nptr, &key, ncur, stat); + if (error || *stat == 0) + return error; + + + *index = cur->bc_ptrs[level]; + cur->bc_ops->init_rec_from_key(&key, nrec); + return 0; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int +xfs_btree_insrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level to insert record at */ + union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ + union xfs_btree_rec *recp, /* i/o: record data inserted */ + struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer for block */ + union xfs_btree_key key; /* btree key */ + union xfs_btree_ptr nptr; /* new block ptr */ + struct xfs_btree_cur *ncur; /* new btree cursor */ + union xfs_btree_rec nrec; /* new record count */ + int optr; /* old key/record index */ + int ptr; /* key/record index */ + int numrecs;/* number of records */ + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp); + + ncur = NULL; + + /* + * If we have an external root pointer, and we've made it to the + * root level, allocate a new root block and we're done. + */ + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level >= cur->bc_nlevels)) { + error = xfs_btree_new_root(cur, stat); + xfs_btree_set_ptr_null(cur, ptrp); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return error; + } + + /* If we're off the left edge, return failure. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Make a key out of the record data to be inserted, and save it. */ + cur->bc_ops->init_key_from_rec(&key, recp); + + optr = ptr; + + XFS_BTREE_STATS_INC(cur, insrec); + + /* Get pointers to the btree buffer and block. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; + + /* Check that the new entry is being inserted in the right place. */ + if (ptr <= numrecs) { + if (level == 0) { + ASSERT(cur->bc_ops->recs_inorder(cur, recp, + xfs_btree_rec_addr(cur, ptr, block))); + } else { + ASSERT(cur->bc_ops->keys_inorder(cur, &key, + xfs_btree_key_addr(cur, ptr, block))); + } + } +#endif + + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + xfs_btree_set_ptr_null(cur, &nptr); + if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { + error = xfs_btree_make_block_unfull(cur, level, numrecs, + &optr, &ptr, &nptr, &ncur, &nrec, stat); + if (error || *stat == 0) + goto error0; + } + + /* + * The current block may have changed if the block was + * previously full and we have just made space in it. + */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + return error; +#endif + + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); + + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + + kp = xfs_btree_key_addr(cur, ptr, block); + pp = xfs_btree_ptr_addr(cur, ptr, block); + +#ifdef DEBUG + for (i = numrecs - ptr; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + return error; + } +#endif + + xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); + xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, ptrp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_keys(cur, kp, &key, 1); + xfs_btree_copy_ptrs(cur, pp, ptrp, 1); + numrecs++; + xfs_btree_set_numrecs(block, numrecs); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs); + xfs_btree_log_keys(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->keys_inorder(cur, kp, + xfs_btree_key_addr(cur, ptr + 1, block))); + } +#endif + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *rp; + + rp = xfs_btree_rec_addr(cur, ptr, block); + + xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_recs(cur, rp, recp, 1); + xfs_btree_set_numrecs(block, ++numrecs); + xfs_btree_log_recs(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->recs_inorder(cur, rp, + xfs_btree_rec_addr(cur, ptr + 1, block))); + } +#endif + } + + /* Log the new number of records in the btree header. */ + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* If we inserted at the start of a block, update the parents' keys. */ + if (optr == 1) { + error = xfs_btree_updkey(cur, &key, level + 1); + if (error) + goto error0; + } + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, recp, + ptr, LASTREC_INSREC); + } + + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *ptrp = nptr; + if (!xfs_btree_ptr_is_null(cur, &nptr)) { + *recp = nrec; + *curp = ncur; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Insert the record at the point referenced by cur. + * + * A multi-level split of the tree on insert will invalidate the original + * cursor. All callers of this function should assume that the cursor is + * no longer valid and revalidate it. + */ +int +xfs_btree_insert( + struct xfs_btree_cur *cur, + int *stat) +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + union xfs_btree_ptr nptr; /* new block number (split result) */ + struct xfs_btree_cur *ncur; /* new cursor (split result) */ + struct xfs_btree_cur *pcur; /* previous level's cursor */ + union xfs_btree_rec rec; /* record to insert */ + + level = 0; + ncur = NULL; + pcur = cur; + + xfs_btree_set_ptr_null(cur, &nptr); + cur->bc_ops->init_rec_from_cur(cur, &rec); + + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nptr into this level of the tree. + * Note if we fail, nptr will be null. + */ + error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i); + if (error) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + goto error0; + } + + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + level++; + + /* + * See if the cursor we just used is trash. + * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. + */ + if (pcur != cur && + (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { + /* Save the state from the cursor before we trash it */ + if (cur->bc_ops->update_cursor) + cur->bc_ops->update_cursor(pcur, cur); + cur->bc_nlevels = pcur->bc_nlevels; + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* If we got a new cursor, switch to it. */ + if (ncur) { + pcur = ncur; + ncur = NULL; + } + } while (!xfs_btree_ptr_is_null(cur, &nptr)); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Try to merge a non-leaf block back into the inode root. + * + * Note: the killroot names comes from the fact that we're effectively + * killing the old root block. But because we can't just delete the + * inode we have to copy the single block it was pointing to into the + * inode. + */ +STATIC int +xfs_btree_kill_iroot( + struct xfs_btree_cur *cur) +{ + int whichfork = cur->bc_private.b.whichfork; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_block *block; + struct xfs_btree_block *cblock; + union xfs_btree_key *kp; + union xfs_btree_key *ckp; + union xfs_btree_ptr *pp; + union xfs_btree_ptr *cpp; + struct xfs_buf *cbp; + int level; + int index; + int numrecs; +#ifdef DEBUG + union xfs_btree_ptr ptr; + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_nlevels > 1); + + /* + * Don't deal with the root block needs to be a leaf case. + * We're just going to turn the thing back into extents anyway. + */ + level = cur->bc_nlevels - 1; + if (level == 1) + goto out0; + + /* + * Give up if the root has multiple children. + */ + block = xfs_btree_get_iroot(cur); + if (xfs_btree_get_numrecs(block) != 1) + goto out0; + + cblock = xfs_btree_get_block(cur, level - 1, &cbp); + numrecs = xfs_btree_get_numrecs(cblock); + + /* + * Only do this if the next level will fit. + * Then the data must be copied up to the inode, + * instead of freeing the root you free the next level. + */ + if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, killroot); + +#ifdef DEBUG + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); +#endif + + index = numrecs - cur->bc_ops->get_maxrecs(cur, level); + if (index) { + xfs_iroot_realloc(cur->bc_private.b.ip, index, + cur->bc_private.b.whichfork); + block = ifp->if_broot; + } + + be16_add_cpu(&block->bb_numrecs, index); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < numrecs; i++) { + int error; + + error = xfs_btree_check_ptr(cur, cpp, i, level - 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + } +#endif + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + cur->bc_ops->free_block(cur, cbp); + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level - 1] = NULL; + be16_add_cpu(&block->bb_level, -1); + xfs_trans_log_inode(cur->bc_tp, ip, + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + cur->bc_nlevels--; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Kill the current root node, and replace it with it's only child node. + */ +STATIC int +xfs_btree_kill_root( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + union xfs_btree_ptr *newroot) +{ + int error; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, killroot); + + /* + * Update the root pointer, decreasing the level by 1 and then + * free the old root. + */ + cur->bc_ops->set_root(cur, newroot, -1); + + error = cur->bc_ops->free_block(cur, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level] = NULL; + cur->bc_ra[level] = 0; + cur->bc_nlevels--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +STATIC int +xfs_btree_dec_cursor( + struct xfs_btree_cur *cur, + int level, + int *stat) +{ + int error; + int i; + + if (level > 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + return error; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +/* + * Single level of the btree record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. + */ +STATIC int /* error */ +xfs_btree_delrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* fail/done/go-on */ +{ + struct xfs_btree_block *block; /* btree block */ + union xfs_btree_ptr cptr; /* current block ptr */ + struct xfs_buf *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop counter */ + union xfs_btree_key key; /* storage for keyp */ + union xfs_btree_key *keyp = &key; /* passed to the next level */ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs = 0; /* left record count */ + int ptr; /* key/record index */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + int rrecs = 0; /* right record count */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + int numrecs; /* temporary numrec count */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + tcur = NULL; + + /* Get the index of the entry being deleted, check for nothing there. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Get the buffer & block containing the record or key/ptr. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we're off the end of the block. */ + if (ptr > numrecs) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + XFS_BTREE_STATS_INC(cur, delrec); + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); + + /* Excise the entries being deleted. */ + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + + lkp = xfs_btree_key_addr(cur, ptr + 1, block); + lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); + +#ifdef DEBUG + for (i = 0; i < numrecs - ptr; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + if (ptr < numrecs) { + xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); + xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); + xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need to pass a + * key up to the next level (updkey). + */ + if (ptr == 1) + keyp = xfs_btree_key_addr(cur, 1, block); + } else { + /* It's a leaf. operate on records */ + if (ptr < numrecs) { + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, ptr + 1, block), + -1, numrecs - ptr); + xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, block)); + keyp = &key; + } + } + + /* + * Decrement and log the number of entries in the block. + */ + xfs_btree_set_numrecs(block, --numrecs); + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, NULL, + ptr, LASTREC_DELREC); + } + + /* + * We're at the root level. First, shrink the root block in-memory. + * Try to get rid of the next level down. If we can't then there's + * nothing left to do. + */ + if (level == cur->bc_nlevels - 1) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + xfs_iroot_realloc(cur->bc_private.b.ip, -1, + cur->bc_private.b.whichfork); + + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } + + /* + * If this is the root level, and there's only one entry left, + * and it's NOT the leaf level, then we can get rid of this + * level. + */ + if (numrecs == 1 && level > 0) { + union xfs_btree_ptr *pp; + /* + * pp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + pp = xfs_btree_ptr_addr(cur, 1, block); + error = xfs_btree_kill_root(cur, bp, level, pp); + if (error) + goto error0; + } else if (level > 0) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + } + *stat = 1; + return 0; + } + + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1) { + error = xfs_btree_updkey(cur, keyp, level + 1); + if (error) + goto error0; + } + + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. + */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * One child of root, need to get a chance to copy its contents + * into the root and delete it. Can't go up to next level, + * there's nothing to delete there. + */ + if (xfs_btree_ptr_is_null(cur, &rptr) && + xfs_btree_ptr_is_null(cur, &lptr) && + level == cur->bc_nlevels - 2) { + error = xfs_btree_kill_iroot(cur); + if (!error) + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || + !xfs_btree_ptr_is_null(cur, &lptr)); + + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice. + */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + /* Grab a pointer to the block. */ + right = xfs_btree_get_block(tcur, level, &rbp); +#ifdef DEBUG + error = xfs_btree_check_block(tcur, right, level, rbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); + + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. + */ + if (xfs_btree_get_numrecs(right) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_lshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = xfs_btree_get_numrecs(right); + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + } + } + + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + /* + * Move the temp cursor to the first entry in the + * previous block. + */ + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + /* Grab a pointer to the block. */ + left = xfs_btree_get_block(tcur, level, &lbp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); + + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. + */ + if (xfs_btree_get_numrecs(left) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_rshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level == 0) + cur->bc_ptrs[0]++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference. + */ + lrecs = xfs_btree_get_numrecs(left); + } + + /* Delete the temp cursor, we're done with it. */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + /* If here, we need to do a join to keep the tree balanced. */ + ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); + + if (!xfs_btree_ptr_is_null(cur, &lptr) && + lrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor. + */ + rptr = cptr; + right = block; + rbp = bp; + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + + /* + * If that won't work, see if we can join with the right neighbor block. + */ + } else if (!xfs_btree_ptr_is_null(cur, &rptr) && + rrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lptr = cptr; + left = block; + lbp = bp; + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. + */ + } else { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + rrecs = xfs_btree_get_numrecs(right); + lrecs = xfs_btree_get_numrecs(left); + + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs + 1, left); + lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + for (i = 1; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_keys(cur, lkp, rkp, rrecs); + xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); + + xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, rrecs); + xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); + } + + XFS_BTREE_STATS_INC(cur, join); + + /* + * Fix up the number of records and right block pointer in the + * surviving block, and log it. + */ + xfs_btree_set_numrecs(left, lrecs + rrecs); + xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB), + xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* If there is a right sibling, point it to the remaining block. */ + xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &cptr)) { + error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + + /* Free the deleted block. */ + error = cur->bc_ops->free_block(cur, rbp); + if (error) + goto error0; + XFS_BTREE_STATS_INC(cur, free); + + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + cur->bc_bufs[level] = lbp; + cur->bc_ptrs[level] += lrecs; + cur->bc_ra[level] = 0; + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || + (level + 1 < cur->bc_nlevels)) { + error = xfs_btree_increment(cur, level + 1, &i); + if (error) + goto error0; + } + + /* + * Readjust the ptr at this level if it's not a leaf, since it's + * still pointing at the deletion point, which makes the cursor + * inconsistent. If this makes the ptr 0, the caller fixes it up. + * We can't use decrement because it would change the next level up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + /* Return value means the next level up has something to do. */ + *stat = 2; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (tcur) + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_btree_delete( + struct xfs_btree_cur *cur, + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int level; + int i; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* + * Go up the tree, starting at leaf level. + * + * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done. + */ + for (level = 0, i = 2; i == 2; level++) { + error = xfs_btree_delrec(cur, level, &i); + if (error) + goto error0; + } + + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + goto error0; + break; + } + } + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_btree_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_rec **recp, /* output: btree record */ + int *stat) /* output: success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer pointer */ + int ptr; /* record number */ +#ifdef DEBUG + int error; /* error return value */ +#endif + + ptr = cur->bc_ptrs[0]; + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + return error; +#endif + + /* + * Off the right end or left end, return failure. + */ + if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { + *stat = 0; + return 0; + } + + /* + * Point to the record and extract its data. + */ + *recp = xfs_btree_rec_addr(cur, ptr, block); + *stat = 1; + return 0; +} + +/* + * Change the owner of a btree. + * + * The mechanism we use here is ordered buffer logging. Because we don't know + * how many buffers were are going to need to modify, we don't really want to + * have to make transaction reservations for the worst case of every buffer in a + * full size btree as that may be more space that we can fit in the log.... + * + * We do the btree walk in the most optimal manner possible - we have sibling + * pointers so we can just walk all the blocks on each level from left to right + * in a single pass, and then move to the next level and do the same. We can + * also do readahead on the sibling pointers to get IO moving more quickly, + * though for slow disks this is unlikely to make much difference to performance + * as the amount of CPU work we have to do before moving to the next block is + * relatively small. + * + * For each btree block that we load, modify the owner appropriately, set the + * buffer as an ordered buffer and log it appropriately. We need to ensure that + * we mark the region we change dirty so that if the buffer is relogged in + * a subsequent transaction the changes we make here as an ordered buffer are + * correctly relogged in that transaction. If we are in recovery context, then + * just queue the modified buffer as delayed write buffer so the transaction + * recovery completion writes the changes to disk. + */ +static int +xfs_btree_block_change_owner( + struct xfs_btree_cur *cur, + int level, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_ptr rptr; + + /* do right sibling readahead */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* modify the owner */ + block = xfs_btree_get_block(cur, level, &bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + block->bb_u.l.bb_owner = cpu_to_be64(new_owner); + else + block->bb_u.s.bb_owner = cpu_to_be32(new_owner); + + /* + * If the block is a root block hosted in an inode, we might not have a + * buffer pointer here and we shouldn't attempt to log the change as the + * information is already held in the inode and discarded when the root + * block is formatted into the on-disk inode fork. We still change it, + * though, so everything is consistent in memory. + */ + if (bp) { + if (cur->bc_tp) { + xfs_trans_ordered_buf(cur->bc_tp, bp); + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); + } else { + xfs_buf_delwri_queue(bp, buffer_list); + } + } else { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + } + + /* now read rh sibling block for next iteration */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + return -ENOENT; + + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); +} + +int +xfs_btree_change_owner( + struct xfs_btree_cur *cur, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + union xfs_btree_ptr lptr; + int level; + struct xfs_btree_block *block = NULL; + int error = 0; + + cur->bc_ops->init_ptr_from_cur(cur, &lptr); + + /* for each level */ + for (level = cur->bc_nlevels - 1; level >= 0; level--) { + /* grab the left hand block */ + error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); + if (error) + return error; + + /* readahead the left most block for the next level down */ + if (level > 0) { + union xfs_btree_ptr *ptr; + + ptr = xfs_btree_ptr_addr(cur, 1, block); + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ + lptr = *ptr; + } + + /* for each buffer in the level */ + do { + error = xfs_btree_block_change_owner(cur, level, + new_owner, + buffer_list); + } while (!error); + + if (error != -ENOENT) + return error; + } + + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_btree.h b/kernel/fs/xfs/libxfs/xfs_btree.h new file mode 100644 index 000000000..8f18bab73 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_btree.h @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BTREE_H__ +#define __XFS_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +extern kmem_zone_t *xfs_btree_cur_zone; + +/* + * Generic key, ptr and record wrapper structures. + * + * These are disk format structures, and are converted where necessary + * by the btree specific code that needs to interpret them. + */ +union xfs_btree_ptr { + __be32 s; /* short form ptr */ + __be64 l; /* long form ptr */ +}; + +union xfs_btree_key { + xfs_bmbt_key_t bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + xfs_inobt_key_t inobt; +}; + +union xfs_btree_rec { + xfs_bmbt_rec_t bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + xfs_alloc_rec_t alloc; + xfs_inobt_rec_t inobt; +}; + +/* + * This nonsense is to make -wlint happy. + */ +#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) +#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) +#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) + +#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) +#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) +#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) +#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) +#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) + +/* + * For logging record fields. + */ +#define XFS_BB_MAGIC (1 << 0) +#define XFS_BB_LEVEL (1 << 1) +#define XFS_BB_NUMRECS (1 << 2) +#define XFS_BB_LEFTSIB (1 << 3) +#define XFS_BB_RIGHTSIB (1 << 4) +#define XFS_BB_BLKNO (1 << 5) +#define XFS_BB_LSN (1 << 6) +#define XFS_BB_UUID (1 << 7) +#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_NUM_BITS 5 +#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_NUM_BITS_CRC 9 +#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) + +/* + * Generic stats interface + */ +#define __XFS_BTREE_STATS_INC(type, stat) \ + XFS_STATS_INC(xs_ ## type ## _2_ ## stat) +#define XFS_BTREE_STATS_INC(cur, stat) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) + +#define __XFS_BTREE_STATS_ADD(type, stat, val) \ + XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) + +#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ + +struct xfs_btree_ops { + /* size of the key and record structures */ + size_t key_len; + size_t rec_len; + + /* cursor operations */ + struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); + void (*update_cursor)(struct xfs_btree_cur *src, + struct xfs_btree_cur *dst); + + /* update btree root pointer */ + void (*set_root)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, int level_change); + + /* block allocation / freeing */ + int (*alloc_block)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat); + int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); + + /* update last record information */ + void (*update_lastrec)(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, int reason); + + /* records in block/level */ + int (*get_minrecs)(struct xfs_btree_cur *cur, int level); + int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); + + /* records on disk. Matter for the root in inode case. */ + int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); + + /* init values of btree structures */ + void (*init_key_from_rec)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_key)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_rec *rec); + void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); + + /* difference between key value and cursor value */ + __int64_t (*key_diff)(struct xfs_btree_cur *cur, + union xfs_btree_key *key); + + const struct xfs_buf_ops *buf_ops; + +#if defined(DEBUG) || defined(XFS_WARN) + /* check that k1 is lower than k2 */ + int (*keys_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2); + + /* check that r1 is lower than r2 */ + int (*recs_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2); +#endif +}; + +/* + * Reasons for the update_lastrec method to be called. + */ +#define LASTREC_UPDATE 0 +#define LASTREC_INSREC 1 +#define LASTREC_DELREC 2 + + +/* + * Btree cursor structure. + * This collects all information needed by the btree code in one place. + */ +typedef struct xfs_btree_cur +{ + struct xfs_trans *bc_tp; /* transaction we're in, if any */ + struct xfs_mount *bc_mp; /* file system mount struct */ + const struct xfs_btree_ops *bc_ops; + uint bc_flags; /* btree features - below */ + union { + xfs_alloc_rec_incore_t a; + xfs_bmbt_irec_t b; + xfs_inobt_rec_incore_t i; + } bc_rec; /* current insert/search record value */ + struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ + int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ + __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ +#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ +#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ + __uint8_t bc_nlevels; /* number of levels in the tree */ + __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ + xfs_btnum_t bc_btnum; /* identifies which btree type */ + union { + struct { /* needed for BNO, CNT, INO */ + struct xfs_buf *agbp; /* agf/agi buffer pointer */ + xfs_agnumber_t agno; /* ag number */ + } a; + struct { /* needed for BMAP */ + struct xfs_inode *ip; /* pointer to our inode */ + struct xfs_bmap_free *flist; /* list to free after */ + xfs_fsblock_t firstblock; /* 1st blk allocated */ + int allocated; /* count of alloced */ + short forksize; /* fork's inode space */ + char whichfork; /* data or attr fork */ + char flags; /* flags */ +#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ + } b; + } bc_private; /* per-btree type data */ +} xfs_btree_cur_t; + +/* cursor flags */ +#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ +#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ +#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ +#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ + + +#define XFS_BTREE_NOERROR 0 +#define XFS_BTREE_ERROR 1 + +/* + * Convert from buffer to btree block header. + */ +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) + + +/* + * Check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block, if any */ + +/* + * Check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_fsblock_t ptr, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error); /* del because of error */ + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur);/* output cursor */ + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +struct xfs_buf * /* buffer for fsbno */ +xfs_btree_get_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +struct xfs_buf * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Check for the cursor referring to the last block at the given level. + */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to check */ + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets,/* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last); /* output: last byte offset */ + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int /* error */ +xfs_btree_read_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops); + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +void /* error */ +xfs_btree_reada_bufl( + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing. + */ +void /* error */ +xfs_btree_reada_bufs( + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); + +/* + * Initialise a new btree block header + */ +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + +/* + * Common btree core entry points. + */ +int xfs_btree_increment(struct xfs_btree_cur *, int, int *); +int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); +int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); +int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); +int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); +int xfs_btree_insert(struct xfs_btree_cur *, int *); +int xfs_btree_delete(struct xfs_btree_cur *, int *); +int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); +int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner, + struct list_head *buffer_list); + +/* + * btree block CRC helpers + */ +void xfs_btree_lblock_calc_crc(struct xfs_buf *); +bool xfs_btree_lblock_verify_crc(struct xfs_buf *); +void xfs_btree_sblock_calc_crc(struct xfs_buf *); +bool xfs_btree_sblock_verify_crc(struct xfs_buf *); + +/* + * Internal btree helpers also used by xfs_bmap.c. + */ +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); + +/* + * Helpers. + */ +static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_numrecs); +} + +static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, + __uint16_t numrecs) +{ + block->bb_numrecs = cpu_to_be16(numrecs); +} + +static inline int xfs_btree_get_level(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_level); +} + + +/* + * Min and max functions for extlen, agblock, fileoff, and filblks types. + */ +#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) +#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) +#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) +#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) +#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) +#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) + +#define XFS_FSB_SANITY_CHECK(mp,fsb) \ + (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) + +/* + * Trace hooks. Currently not implemented as they need to be ported + * over to the generic tracing functionality, which is some effort. + * + * i,j = integer (32 bit) + * b = btree block buffer (xfs_buf_t) + * p = btree ptr + * r = btree record + * k = btree key + */ +#define XFS_BTREE_TRACE_ARGBI(c, b, i) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) +#define XFS_BTREE_TRACE_ARGI(c, i) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) +#define XFS_BTREE_TRACE_ARGR(c, r) +#define XFS_BTREE_TRACE_CURSOR(c, t) + +#endif /* __XFS_BTREE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_cksum.h b/kernel/fs/xfs/libxfs/xfs_cksum.h new file mode 100644 index 000000000..fad1676ad --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_cksum.h @@ -0,0 +1,63 @@ +#ifndef _XFS_CKSUM_H +#define _XFS_CKSUM_H 1 + +#define XFS_CRC_SEED (~(__uint32_t)0) + +/* + * Calculate the intermediate checksum for a buffer that has the CRC field + * inside it. The offset of the 32bit crc fields is passed as the + * cksum_offset parameter. + */ +static inline __uint32_t +xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t zero = 0; + __uint32_t crc; + + /* Calculate CRC up to the checksum. */ + crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); + + /* Skip checksum field */ + crc = crc32c(crc, &zero, sizeof(__u32)); + + /* Calculate the rest of the CRC. */ + return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], + length - (cksum_offset + sizeof(__be32))); +} + +/* + * Convert the intermediate checksum to the final ondisk format. + * + * The CRC32c calculation uses LE format even on BE machines, but returns the + * result in host endian format. Hence we need to byte swap it back to LE format + * so that it is consistent on disk. + */ +static inline __le32 +xfs_end_cksum(__uint32_t crc) +{ + return ~cpu_to_le32(crc); +} + +/* + * Helper to generate the checksum for a buffer. + */ +static inline void +xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); +} + +/* + * Helper to verify the checksum for a buffer. + */ +static inline int +xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); +} + +#endif /* _XFS_CKSUM_H */ diff --git a/kernel/fs/xfs/libxfs/xfs_da_btree.c b/kernel/fs/xfs/libxfs/xfs_da_btree.c new file mode 100644 index 000000000..2385f8cd0 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_da_btree.c @@ -0,0 +1,2660 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + +/* + * xfs_da_btree.c + * + * Routines to implement directories as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_da3_root_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_root, + xfs_da_state_blk_t *new_child); +STATIC int xfs_da3_node_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_blk, + xfs_da_state_blk_t *split_blk, + xfs_da_state_blk_t *blk_to_add, + int treelevel, + int *result); +STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *node_blk_1, + xfs_da_state_blk_t *node_blk_2); +STATIC void xfs_da3_node_add(xfs_da_state_t *state, + xfs_da_state_blk_t *old_node_blk, + xfs_da_state_blk_t *new_node_blk); + +/* + * Routines used for shrinking the Btree. + */ +STATIC int xfs_da3_root_join(xfs_da_state_t *state, + xfs_da_state_blk_t *root_blk); +STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da3_node_remove(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk); +STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state, + xfs_da_state_blk_t *src_node_blk, + xfs_da_state_blk_t *dst_node_blk); + +/* + * Utility routines. + */ +STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk); + + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + xfs_da_state_kill_altpath(state); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +static bool +xfs_da3_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_intnode *hdr = bp->b_addr; + struct xfs_da3_icnode_hdr ichdr; + const struct xfs_dir_ops *ops; + + ops = xfs_dir_get_ops(mp, NULL); + + ops->node_hdr_from_disk(&ichdr, hdr); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_DA3_NODE_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_DA_NODE_MAGIC) + return false; + } + if (ichdr.level == 0) + return false; + if (ichdr.level > XFS_DA_NODE_MAXDEPTH) + return false; + if (ichdr.count == 0) + return false; + + /* + * we don't know if the node is for and attribute or directory tree, + * so only fail if the count is outside both bounds + */ + if (ichdr.count > mp->m_dir_geo->node_ents && + ichdr.count > mp->m_attr_geo->node_ents) + return false; + + /* XXX: hash order check? */ + + return true; +} + +static void +xfs_da3_node_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF); +} + +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ +static void +xfs_da3_node_read_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { + xfs_buf_ioerror(bp, -EFSBADCRC); + break; + } + /* fall through */ + case XFS_DA_NODE_MAGIC: + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + break; + } + return; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + bp->b_ops = &xfs_attr3_leaf_buf_ops; + bp->b_ops->verify_read(bp); + return; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; + bp->b_ops->verify_read(bp); + return; + default: + break; + } + + /* corrupt block */ + xfs_verifier_error(bp); +} + +const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .verify_read = xfs_da3_node_read_verify, + .verify_write = xfs_da3_node_write_verify, +}; + +int +xfs_da3_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int which_fork) +{ + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); + if (!err && tp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + type = XFS_BLFT_DA_NODE_BUF; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + type = XFS_BLFT_ATTR_LEAF_BUF; + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + type = XFS_BLFT_DIR_LEAFN_BUF; + break; + default: + type = 0; + ASSERT(0); + break; + } + xfs_trans_buf_set_type(tp, *bpp, type); + } + return err; +} + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of an intermediate node. + */ +int +xfs_da3_node_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + int level, + struct xfs_buf **bpp, + int whichfork) +{ + struct xfs_da_intnode *node; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_da3_icnode_hdr ichdr = {0}; + struct xfs_buf *bp; + int error; + struct xfs_inode *dp = args->dp; + + trace_xfs_da_node_create(args); + ASSERT(level <= XFS_DA_NODE_MAXDEPTH); + + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); + if (error) + return error; + bp->b_ops = &xfs_da3_node_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); + node = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + ichdr.magic = XFS_DA3_NODE_MAGIC; + hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + } else { + ichdr.magic = XFS_DA_NODE_MAGIC; + } + ichdr.level = level; + + dp->d_ops->node_hdr_to_disk(node, &ichdr); + xfs_trans_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + + *bpp = bp; + return 0; +} + +/* + * Split a leaf node, rebalance, then possibly split + * intermediate nodes, rebalance, etc. + */ +int /* error */ +xfs_da3_split( + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *oldblk; + struct xfs_da_state_blk *newblk; + struct xfs_da_state_blk *addblk; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + int max; + int action = 0; + int error; + int i; + + trace_xfs_da_split(state->args); + + /* + * Walk back up the tree splitting/inserting/adjusting as necessary. + * If we need to insert and there isn't room, split the node, then + * decide which fragment to insert the new block from below into. + * Note that we may split the root this way, but we need more fixup. + */ + max = state->path.active - 1; + ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH)); + ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + + addblk = &state->path.blk[max]; /* initial dummy value */ + for (i = max; (i >= 0) && addblk; state->path.active--, i--) { + oldblk = &state->path.blk[i]; + newblk = &state->altpath.blk[i]; + + /* + * If a leaf node then + * Allocate a new leaf node, then rebalance across them. + * else if an intermediate node then + * We split on the last layer, must we split the node? + */ + switch (oldblk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr3_leaf_split(state, oldblk, newblk); + if ((error != 0) && (error != -ENOSPC)) { + return error; /* GROT: attr is inconsistent */ + } + if (!error) { + addblk = newblk; + break; + } + /* + * Entry wouldn't fit, split the leaf again. + */ + state->extravalid = 1; + if (state->inleaf) { + state->extraafter = 0; /* before newblk */ + trace_xfs_attr_leaf_split_before(state->args); + error = xfs_attr3_leaf_split(state, oldblk, + &state->extrablk); + } else { + state->extraafter = 1; /* after newblk */ + trace_xfs_attr_leaf_split_after(state->args); + error = xfs_attr3_leaf_split(state, newblk, + &state->extrablk); + } + if (error) + return error; /* GROT: attr inconsistent */ + addblk = newblk; + break; + case XFS_DIR2_LEAFN_MAGIC: + error = xfs_dir2_leafn_split(state, oldblk, newblk); + if (error) + return error; + addblk = newblk; + break; + case XFS_DA_NODE_MAGIC: + error = xfs_da3_node_split(state, oldblk, newblk, addblk, + max - i, &action); + addblk->bp = NULL; + if (error) + return error; /* GROT: dir is inconsistent */ + /* + * Record the newly split block for the next time thru? + */ + if (action) + addblk = newblk; + else + addblk = NULL; + break; + } + + /* + * Update the btree to show the new hashval for this child. + */ + xfs_da3_fixhashpath(state, &state->path); + } + if (!addblk) + return 0; + + /* + * Split the root node. + */ + ASSERT(state->path.active == 0); + oldblk = &state->path.blk[0]; + error = xfs_da3_root_split(state, oldblk, addblk); + if (error) { + addblk->bp = NULL; + return error; /* GROT: dir is inconsistent */ + } + + /* + * Update pointers to the node which used to be block 0 and + * just got bumped because of the addition of a new root node. + * There might be three blocks involved if a double split occurred, + * and the original block 0 could be at any position in the list. + * + * Note: the magic numbers and sibling pointers are in the same + * physical place for both v2 and v3 headers (by design). Hence it + * doesn't matter which version of the xfs_da_intnode structure we use + * here as the result will be the same using either structure. + */ + node = oldblk->bp->b_addr; + if (node->hdr.info.forw) { + if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->b_addr; + node->hdr.info.back = cpu_to_be32(oldblk->blkno); + xfs_trans_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + node = oldblk->bp->b_addr; + if (node->hdr.info.back) { + if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->b_addr; + node->hdr.info.forw = cpu_to_be32(oldblk->blkno); + xfs_trans_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + addblk->bp = NULL; + return 0; +} + +/* + * Split the root. We have to create a new root and point to the two + * parts (the split old root) that we just created. Copy block zero to + * the EOF, extending the inode in process. + */ +STATIC int /* error */ +xfs_da3_root_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) +{ + struct xfs_da_intnode *node; + struct xfs_da_intnode *oldroot; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + struct xfs_buf *bp; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_dir2_leaf *leaf; + xfs_dablk_t blkno; + int level; + int error; + int size; + + trace_xfs_da_root_split(state->args); + + /* + * Copy the existing (incorrect) block from the root node position + * to a free space somewhere. + */ + args = state->args; + error = xfs_da_grow_inode(args, &blkno); + if (error) + return error; + + dp = args->dp; + tp = args->trans; + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); + if (error) + return error; + node = bp->b_addr; + oldroot = blk1->bp->b_addr; + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_icnode_hdr icnodehdr; + + dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot); + btree = dp->d_ops->node_tree_p(oldroot); + size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); + level = icnodehdr.level; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); + } else { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + leaf = (xfs_dir2_leaf_t *)oldroot; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + level = 0; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + } + + /* + * we can copy most of the information in the node from one block to + * another, but for CRC enabled headers we have to make sure that the + * block specific identifiers are kept intact. We update the buffer + * directly for this. + */ + memcpy(node, oldroot, size); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; + + node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + } + xfs_trans_log_buf(tp, bp, 0, size - 1); + + bp->b_ops = blk1->bp->b_ops; + xfs_trans_buf_copy_type(bp, blk1->bp); + blk1->bp = bp; + blk1->blkno = blkno; + + /* + * Set up the new root node. + */ + error = xfs_da3_node_create(args, + (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0, + level + 1, &bp, args->whichfork); + if (error) + return error; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + btree[0].hashval = cpu_to_be32(blk1->hashval); + btree[0].before = cpu_to_be32(blk1->blkno); + btree[1].hashval = cpu_to_be32(blk2->hashval); + btree[1].before = cpu_to_be32(blk2->blkno); + nodehdr.count = 2; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); + +#ifdef DEBUG + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + ASSERT(blk1->blkno >= args->geo->leafblk && + blk1->blkno < args->geo->freeblk); + ASSERT(blk2->blkno >= args->geo->leafblk && + blk2->blkno < args->geo->freeblk); + } +#endif + + /* Header is already logged by xfs_da_node_create */ + xfs_trans_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2)); + + return 0; +} + +/* + * Split the node, rebalance, then add the new entry. + */ +STATIC int /* error */ +xfs_da3_node_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk, + struct xfs_da_state_blk *addblk, + int treelevel, + int *result) +{ + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno; + int newcount; + int error; + int useextra; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_split(state->args); + + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + + /* + * With V2 dirs the extra block is data or freespace. + */ + useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK; + newcount = 1 + useextra; + /* + * Do we have to split the node? + */ + if (nodehdr.count + newcount > state->args->geo->node_ents) { + /* + * Allocate a new node, add to the doubly linked chain of + * nodes, then move some of our excess entries into it. + */ + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return error; /* GROT: dir is inconsistent */ + + error = xfs_da3_node_create(state->args, blkno, treelevel, + &newblk->bp, state->args->whichfork); + if (error) + return error; /* GROT: dir is inconsistent */ + newblk->blkno = blkno; + newblk->magic = XFS_DA_NODE_MAGIC; + xfs_da3_node_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); + if (error) + return error; + *result = 1; + } else { + *result = 0; + } + + /* + * Insert the new entry(s) into the correct block + * (updating last hashval in the process). + * + * xfs_da3_node_add() inserts BEFORE the given index, + * and as a result of using node_lookup_int() we always + * point to a valid entry (not after one), but a split + * operation always results in a new block whose hashvals + * FOLLOW the current block. + * + * If we had double-split op below us, then add the extra block too. + */ + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (oldblk->index <= nodehdr.count) { + oldblk->index++; + xfs_da3_node_add(state, oldblk, addblk); + if (useextra) { + if (state->extraafter) + oldblk->index++; + xfs_da3_node_add(state, oldblk, &state->extrablk); + state->extravalid = 0; + } + } else { + newblk->index++; + xfs_da3_node_add(state, newblk, addblk); + if (useextra) { + if (state->extraafter) + newblk->index++; + xfs_da3_node_add(state, newblk, &state->extrablk); + state->extravalid = 0; + } + } + + return 0; +} + +/* + * Balance the btree elements between two intermediate nodes, + * usually one full and one empty. + * + * NOTE: if blk2 is empty, then it will get the upper half of blk1. + */ +STATIC void +xfs_da3_node_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_intnode *tmpnode; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da_node_entry *btree_s; + struct xfs_da_node_entry *btree_d; + struct xfs_da3_icnode_hdr nodehdr1; + struct xfs_da3_icnode_hdr nodehdr2; + struct xfs_trans *tp; + int count; + int tmp; + int swap = 0; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_rebalance(state->args); + + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + + /* + * Figure out how many entries need to move, and in which direction. + * Swap the nodes around if that makes it simpler. + */ + if (nodehdr1.count > 0 && nodehdr2.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) < + be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) { + tmpnode = node1; + node1 = node2; + node2 = tmpnode; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + swap = 1; + } + + count = (nodehdr1.count - nodehdr2.count) / 2; + if (count == 0) + return; + tp = state->args->trans; + /* + * Two cases: high-to-low and low-to-high. + */ + if (count > 0) { + /* + * Move elements in node2 up to make a hole. + */ + tmp = nodehdr2.count; + if (tmp > 0) { + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &btree2[0]; + btree_d = &btree2[count]; + memmove(btree_d, btree_s, tmp); + } + + /* + * Move the req'd B-tree elements from high in node1 to + * low in node2. + */ + nodehdr2.count += count; + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &btree1[nodehdr1.count - count]; + btree_d = &btree2[0]; + memcpy(btree_d, btree_s, tmp); + nodehdr1.count -= count; + } else { + /* + * Move the req'd B-tree elements from low in node2 to + * high in node1. + */ + count = -count; + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &btree2[0]; + btree_d = &btree1[nodehdr1.count]; + memcpy(btree_d, btree_s, tmp); + nodehdr1.count += count; + + xfs_trans_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, btree_d, tmp)); + + /* + * Move elements in node2 down to fill the hole. + */ + tmp = nodehdr2.count - count; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &btree2[count]; + btree_d = &btree2[0]; + memmove(btree_d, btree_s, tmp); + nodehdr2.count -= count; + } + + /* + * Log header of node 1 and all current bits of node 2. + */ + dp->d_ops->node_hdr_to_disk(node1, &nodehdr1); + xfs_trans_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size)); + + dp->d_ops->node_hdr_to_disk(node2, &nodehdr2); + xfs_trans_log_buf(tp, blk2->bp, + XFS_DA_LOGRANGE(node2, &node2->hdr, + dp->d_ops->node_hdr_size + + (sizeof(btree2[0]) * nodehdr2.count))); + + /* + * Record the last hashval from each block for upward propagation. + * (note: don't use the swapped node pointers) + */ + if (swap) { + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + } + blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); + + /* + * Adjust the expected index for insertion. + */ + if (blk1->index >= nodehdr1.count) { + blk2->index = blk1->index - nodehdr1.count; + blk1->index = nodehdr1.count + 1; /* make it invalid */ + } +} + +/* + * Add a new entry to an intermediate node. + */ +STATIC void +xfs_da3_node_add( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) +{ + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int tmp; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_add(state->args); + + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); + ASSERT(newblk->blkno != 0); + if (state->args->whichfork == XFS_DATA_FORK) + ASSERT(newblk->blkno >= state->args->geo->leafblk && + newblk->blkno < state->args->geo->freeblk); + + /* + * We may need to make some room before we insert the new node. + */ + tmp = 0; + if (oldblk->index < nodehdr.count) { + tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree); + memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp); + } + btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval); + btree[oldblk->index].before = cpu_to_be32(newblk->blkno); + xfs_trans_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &btree[oldblk->index], + tmp + sizeof(*btree))); + + nodehdr.count += 1; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_trans_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + + /* + * Copy the last hash value from the oldblk to propagate upwards. + */ + oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Deallocate an empty leaf node, remove it from its parent, + * possibly deallocating that block, etc... + */ +int +xfs_da3_join( + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *drop_blk; + struct xfs_da_state_blk *save_blk; + int action = 0; + int error; + + trace_xfs_da_join(state->args); + + drop_blk = &state->path.blk[ state->path.active-1 ]; + save_blk = &state->altpath.blk[ state->path.active-1 ]; + ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC || + drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + + /* + * Walk back up the tree joining/deallocating as necessary. + * When we stop dropping blocks, break out. + */ + for ( ; state->path.active >= 2; drop_blk--, save_blk--, + state->path.active--) { + /* + * See if we can combine the block with a neighbor. + * (action == 0) => no options, just leave + * (action == 1) => coalesce, then unlink + * (action == 2) => block empty, unlink it + */ + switch (drop_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr3_leaf_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); + break; + case XFS_DIR2_LEAFN_MAGIC: + error = xfs_dir2_leafn_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_dir2_leafn_unbalance(state, drop_blk, save_blk); + break; + case XFS_DA_NODE_MAGIC: + /* + * Remove the offending node, fixup hashvals, + * check for a toosmall neighbor. + */ + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_node_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_da3_node_unbalance(state, drop_blk, save_blk); + break; + } + xfs_da3_fixhashpath(state, &state->altpath); + error = xfs_da3_blk_unlink(state, drop_blk, save_blk); + xfs_da_state_kill_altpath(state); + if (error) + return error; + error = xfs_da_shrink_inode(state->args, drop_blk->blkno, + drop_blk->bp); + drop_blk->bp = NULL; + if (error) + return error; + } + /* + * We joined all the way to the top. If it turns out that + * we only have one entry in the root, make the child block + * the new root. + */ + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_root_join(state, &state->path.blk[0]); + return error; +} + +#ifdef DEBUG +static void +xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) +{ + __be16 magic = blkinfo->magic; + + if (level == 1) { + ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + } else { + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + } + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); +} +#else /* !DEBUG */ +#define xfs_da_blkinfo_onlychild_validate(blkinfo, level) +#endif /* !DEBUG */ + +/* + * We have only one entry in the root. Copy the only remaining child of + * the old root to block 0 as the new root node. + */ +STATIC int +xfs_da3_root_join( + struct xfs_da_state *state, + struct xfs_da_state_blk *root_blk) +{ + struct xfs_da_intnode *oldroot; + struct xfs_da_args *args; + xfs_dablk_t child; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr oldroothdr; + struct xfs_da_node_entry *btree; + int error; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_root_join(state->args); + + ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + + args = state->args; + oldroot = root_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot); + ASSERT(oldroothdr.forw == 0); + ASSERT(oldroothdr.back == 0); + + /* + * If the root has more than one child, then don't do anything. + */ + if (oldroothdr.count > 1) + return 0; + + /* + * Read in the (only) child block, then copy those bytes into + * the root block's buffer and free the original child block. + */ + btree = dp->d_ops->node_tree_p(oldroot); + child = be32_to_cpu(btree[0].before); + ASSERT(child != 0); + error = xfs_da3_node_read(args->trans, dp, child, -1, &bp, + args->whichfork); + if (error) + return error; + xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); + + /* + * This could be copying a leaf back into the root block in the case of + * there only being a single leaf block left in the tree. Hence we have + * to update the b_ops pointer as well to match the buffer type change + * that could occur. For dir3 blocks we also need to update the block + * number in the buffer header. + */ + memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); + root_blk->bp->b_ops = bp->b_ops; + xfs_trans_buf_copy_type(root_blk->bp, bp); + if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; + da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + } + xfs_trans_log_buf(args->trans, root_blk->bp, 0, + args->geo->blksize - 1); + error = xfs_da_shrink_inode(args, child, bp); + return error; +} + +/* + * Check a node block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +STATIC int +xfs_da3_node_toosmall( + struct xfs_da_state *state, + int *action) +{ + struct xfs_da_intnode *node; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + xfs_dablk_t blkno; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr nodehdr; + int count; + int forward; + int error; + int retval; + int i; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_toosmall(state->args); + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->b_addr; + node = (xfs_da_intnode_t *)info; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (nodehdr.count > (state->args->geo->node_ents >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return 0; /* blk over 50%, don't try to join */ + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (nodehdr.count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (info->forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da3_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return error; + if (retval) { + *action = 0; + } else { + *action = 2; + } + return 0; + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. + */ + count = state->args->geo->node_ents; + count -= state->args->geo->node_ents >> 2; + count -= nodehdr.count; + + /* start with smaller blk num */ + forward = nodehdr.forw < nodehdr.back; + for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_da3_icnode_hdr thdr; + if (forward) + blkno = nodehdr.forw; + else + blkno = nodehdr.back; + if (blkno == 0) + continue; + error = xfs_da3_node_read(state->args->trans, dp, + blkno, -1, &bp, state->args->whichfork); + if (error) + return error; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&thdr, node); + xfs_trans_brelse(state->args->trans, bp); + + if (count - thdr.count >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return 0; + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da3_path_shift(state, &state->altpath, forward, + 0, &retval); + } else { + error = xfs_da3_path_shift(state, &state->path, forward, + 0, &retval); + } + if (error) + return error; + if (retval) { + *action = 0; + return 0; + } + *action = 1; + return 0; +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da3_node_lasthash( + struct xfs_inode *dp, + struct xfs_buf *bp, + int *count) +{ + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (count) + *count = nodehdr.count; + if (!nodehdr.count) + return 0; + btree = dp->d_ops->node_tree_p(node); + return be32_to_cpu(btree[nodehdr.count - 1].hashval); +} + +/* + * Walk back up the tree adjusting hash values as necessary, + * when we stop making changes, return. + */ +void +xfs_da3_fixhashpath( + struct xfs_da_state *state, + struct xfs_da_state_path *path) +{ + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + xfs_dahash_t lasthash=0; + int level; + int count; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_fixhashpath(state->args); + + level = path->active-1; + blk = &path->blk[ level ]; + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + lasthash = xfs_attr_leaf_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DIR2_LEAFN_MAGIC: + lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DA_NODE_MAGIC: + lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count); + if (count == 0) + return; + break; + } + for (blk--, level--; level >= 0; blk--, level--) { + struct xfs_da3_icnode_hdr nodehdr; + + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + if (be32_to_cpu(btree[blk->index].hashval) == lasthash) + break; + blk->hashval = lasthash; + btree[blk->index].hashval = cpu_to_be32(lasthash); + xfs_trans_log_buf(state->args->trans, blk->bp, + XFS_DA_LOGRANGE(node, &btree[blk->index], + sizeof(*btree))); + + lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval); + } +} + +/* + * Remove an entry from an intermediate node. + */ +STATIC void +xfs_da3_node_remove( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk) +{ + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int index; + int tmp; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_remove(state->args); + + node = drop_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + ASSERT(drop_blk->index < nodehdr.count); + ASSERT(drop_blk->index >= 0); + + /* + * Copy over the offending entry, or just zero it out. + */ + index = drop_blk->index; + btree = dp->d_ops->node_tree_p(node); + if (index < nodehdr.count - 1) { + tmp = nodehdr.count - index - 1; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + memmove(&btree[index], &btree[index + 1], tmp); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &btree[index], tmp)); + index = nodehdr.count - 1; + } + memset(&btree[index], 0, sizeof(xfs_da_node_entry_t)); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); + nodehdr.count -= 1; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + + /* + * Copy the last hash value from the block to propagate upwards. + */ + drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval); +} + +/* + * Unbalance the elements between two intermediate nodes, + * move all Btree elements from one node into another. + */ +STATIC void +xfs_da3_node_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) +{ + struct xfs_da_intnode *drop_node; + struct xfs_da_intnode *save_node; + struct xfs_da_node_entry *drop_btree; + struct xfs_da_node_entry *save_btree; + struct xfs_da3_icnode_hdr drop_hdr; + struct xfs_da3_icnode_hdr save_hdr; + struct xfs_trans *tp; + int sindex; + int tmp; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_unbalance(state->args); + + drop_node = drop_blk->bp->b_addr; + save_node = save_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node); + dp->d_ops->node_hdr_from_disk(&save_hdr, save_node); + drop_btree = dp->d_ops->node_tree_p(drop_node); + save_btree = dp->d_ops->node_tree_p(save_node); + tp = state->args->trans; + + /* + * If the dying block has lower hashvals, then move all the + * elements in the remaining block up to make a hole. + */ + if ((be32_to_cpu(drop_btree[0].hashval) < + be32_to_cpu(save_btree[0].hashval)) || + (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) < + be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) { + /* XXX: check this - is memmove dst correct? */ + tmp = save_hdr.count * sizeof(xfs_da_node_entry_t); + memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp); + + sindex = 0; + xfs_trans_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_btree[0], + (save_hdr.count + drop_hdr.count) * + sizeof(xfs_da_node_entry_t))); + } else { + sindex = save_hdr.count; + xfs_trans_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_btree[sindex], + drop_hdr.count * sizeof(xfs_da_node_entry_t))); + } + + /* + * Move all the B-tree elements from drop_blk to save_blk. + */ + tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t); + memcpy(&save_btree[sindex], &drop_btree[0], tmp); + save_hdr.count += drop_hdr.count; + + dp->d_ops->node_hdr_to_disk(save_node, &save_hdr); + xfs_trans_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_node->hdr, + dp->d_ops->node_hdr_size)); + + /* + * Save the last hashval in the remaining block for upward propagation. + */ + save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Walk down the Btree looking for a particular filename, filling + * in the state structure as we go. + * + * We will set the state structure to point to each of the elements + * in each of the nodes where either the hashval is or should be. + * + * We support duplicate hashval's so for each entry in the current + * node that could contain the desired hashval, descend. This is a + * pruned depth-first tree search. + */ +int /* error */ +xfs_da3_node_lookup_int( + struct xfs_da_state *state, + int *result) +{ + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *curr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + xfs_dablk_t blkno; + xfs_dahash_t hashval; + xfs_dahash_t btreehashval; + int probe; + int span; + int max; + int error; + int retval; + struct xfs_inode *dp = state->args->dp; + + args = state->args; + + /* + * Descend thru the B-tree searching each level for the right + * node to use, until the right hashval is found. + */ + blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0; + for (blk = &state->path.blk[0], state->path.active = 1; + state->path.active <= XFS_DA_NODE_MAXDEPTH; + blk++, state->path.active++) { + /* + * Read the next node down in the tree. + */ + blk->blkno = blkno; + error = xfs_da3_node_read(args->trans, args->dp, blkno, + -1, &blk->bp, args->whichfork); + if (error) { + blk->blkno = 0; + state->path.active--; + return error; + } + curr = blk->bp->b_addr; + blk->magic = be16_to_cpu(curr->magic); + + if (blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_ATTR3_LEAF_MAGIC) { + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } + + if (blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_DIR3_LEAFN_MAGIC) { + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); + break; + } + + blk->magic = XFS_DA_NODE_MAGIC; + + + /* + * Search an intermediate node for a match. + */ + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + max = nodehdr.count; + blk->hashval = be32_to_cpu(btree[max - 1].hashval); + + /* + * Binary search. (note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + while (span > 4) { + span /= 2; + btreehashval = be32_to_cpu(btree[probe].hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || + (be32_to_cpu(btree[probe].hashval) == hashval)); + + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while (probe > 0 && + be32_to_cpu(btree[probe].hashval) >= hashval) { + probe--; + } + while (probe < max && + be32_to_cpu(btree[probe].hashval) < hashval) { + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { + blk->index = max - 1; + blkno = be32_to_cpu(btree[max - 1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree[probe].before); + } + } + + /* + * A leaf block that ends in the hashval that we are interested in + * (final hashval == search hashval) means that the next block may + * contain more entries with the same hashval, shift upward to the + * next leaf and keep searching. + */ + for (;;) { + if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { + retval = xfs_dir2_leafn_lookup_int(blk->bp, args, + &blk->index, state); + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + retval = xfs_attr3_leaf_lookup_int(blk->bp, args); + blk->index = args->index; + args->blkno = blk->blkno; + } else { + ASSERT(0); + return -EFSCORRUPTED; + } + if (((retval == -ENOENT) || (retval == -ENOATTR)) && + (blk->hashval == args->hashval)) { + error = xfs_da3_path_shift(state, &state->path, 1, 1, + &retval); + if (error) + return error; + if (retval == 0) { + continue; + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + /* path_shift() gives ENOENT */ + retval = -ENOATTR; + } + } + break; + } + *result = retval; + return 0; +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da3_node_order( + struct xfs_inode *dp, + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da3_icnode_hdr node1hdr; + struct xfs_da3_icnode_hdr node2hdr; + + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; + dp->d_ops->node_hdr_from_disk(&node1hdr, node1); + dp->d_ops->node_hdr_from_disk(&node2hdr, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + + if (node1hdr.count > 0 && node2hdr.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[node2hdr.count - 1].hashval) < + be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) { + return 1; + } + return 0; +} + +/* + * Link a new block into a doubly linked list of blocks (of whatever type). + */ +int /* error */ +xfs_da3_blk_link( + struct xfs_da_state *state, + struct xfs_da_state_blk *old_blk, + struct xfs_da_state_blk *new_blk) +{ + struct xfs_da_blkinfo *old_info; + struct xfs_da_blkinfo *new_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int before = 0; + int error; + struct xfs_inode *dp = state->args->dp; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + old_info = old_blk->bp->b_addr; + new_info = new_blk->bp->b_addr; + ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || + old_blk->magic == XFS_DIR2_LEAFN_MAGIC || + old_blk->magic == XFS_ATTR_LEAF_MAGIC); + + switch (old_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); + break; + case XFS_DIR2_LEAFN_MAGIC: + before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp); + break; + case XFS_DA_NODE_MAGIC: + before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp); + break; + } + + /* + * Link blocks in appropriate order. + */ + if (before) { + /* + * Link new block in before existing block. + */ + trace_xfs_da_link_before(args); + new_info->forw = cpu_to_be32(old_blk->blkno); + new_info->back = old_info->back; + if (old_info->back) { + error = xfs_da3_node_read(args->trans, dp, + be32_to_cpu(old_info->back), + -1, &bp, args->whichfork); + if (error) + return error; + ASSERT(bp != NULL); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == old_info->magic); + ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); + tmp_info->forw = cpu_to_be32(new_blk->blkno); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + } + old_info->back = cpu_to_be32(new_blk->blkno); + } else { + /* + * Link new block in after existing block. + */ + trace_xfs_da_link_after(args); + new_info->forw = old_info->forw; + new_info->back = cpu_to_be32(old_blk->blkno); + if (old_info->forw) { + error = xfs_da3_node_read(args->trans, dp, + be32_to_cpu(old_info->forw), + -1, &bp, args->whichfork); + if (error) + return error; + ASSERT(bp != NULL); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == old_info->magic); + ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno); + tmp_info->back = cpu_to_be32(new_blk->blkno); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + } + old_info->forw = cpu_to_be32(new_blk->blkno); + } + + xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + return 0; +} + +/* + * Unlink a block from a doubly linked list of blocks. + */ +STATIC int /* error */ +xfs_da3_blk_unlink( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) +{ + struct xfs_da_blkinfo *drop_info; + struct xfs_da_blkinfo *save_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int error; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + save_info = save_blk->bp->b_addr; + drop_info = drop_blk->bp->b_addr; + ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || + save_blk->magic == XFS_DIR2_LEAFN_MAGIC || + save_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == drop_blk->magic); + ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || + (be32_to_cpu(save_info->back) == drop_blk->blkno)); + ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) || + (be32_to_cpu(drop_info->back) == save_blk->blkno)); + + /* + * Unlink the leaf block from the doubly linked chain of leaves. + */ + if (be32_to_cpu(save_info->back) == drop_blk->blkno) { + trace_xfs_da_unlink_back(args); + save_info->back = drop_info->back; + if (drop_info->back) { + error = xfs_da3_node_read(args->trans, args->dp, + be32_to_cpu(drop_info->back), + -1, &bp, args->whichfork); + if (error) + return error; + ASSERT(bp != NULL); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == save_info->magic); + ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno); + tmp_info->forw = cpu_to_be32(save_blk->blkno); + xfs_trans_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + } + } else { + trace_xfs_da_unlink_forward(args); + save_info->forw = drop_info->forw; + if (drop_info->forw) { + error = xfs_da3_node_read(args->trans, args->dp, + be32_to_cpu(drop_info->forw), + -1, &bp, args->whichfork); + if (error) + return error; + ASSERT(bp != NULL); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == save_info->magic); + ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno); + tmp_info->back = cpu_to_be32(save_blk->blkno); + xfs_trans_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + } + } + + xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + return 0; +} + +/* + * Move a path "forward" or "!forward" one block at the current level. + * + * This routine will adjust a "path" to point to the next block + * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the + * Btree, including updating pointers to the intermediate nodes between + * the new bottom and the root. + */ +int /* error */ +xfs_da3_path_shift( + struct xfs_da_state *state, + struct xfs_da_state_path *path, + int forward, + int release, + int *result) +{ + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + struct xfs_da_intnode *node; + struct xfs_da_args *args; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno = 0; + int level; + int error; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_path_shift(state->args); + + /* + * Roll up the Btree looking for the first block where our + * current index is not at the edge of the block. Note that + * we skip the bottom layer because we want the sibling block. + */ + args = state->args; + ASSERT(args != NULL); + ASSERT(path != NULL); + ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + level = (path->active-1) - 1; /* skip bottom layer in path */ + for (blk = &path->blk[level]; level >= 0; blk--, level--) { + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + if (forward && (blk->index < nodehdr.count - 1)) { + blk->index++; + blkno = be32_to_cpu(btree[blk->index].before); + break; + } else if (!forward && (blk->index > 0)) { + blk->index--; + blkno = be32_to_cpu(btree[blk->index].before); + break; + } + } + if (level < 0) { + *result = -ENOENT; /* we're out of our tree */ + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + return 0; + } + + /* + * Roll down the edge of the subtree until we reach the + * same depth we were at originally. + */ + for (blk++, level++; level < path->active; blk++, level++) { + /* + * Release the old block. + * (if it's dirty, trans won't actually let go) + */ + if (release) + xfs_trans_brelse(args->trans, blk->bp); + + /* + * Read the next child block. + */ + blk->blkno = blkno; + error = xfs_da3_node_read(args->trans, dp, blkno, -1, + &blk->bp, args->whichfork); + if (error) + return error; + info = blk->bp->b_addr; + ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + + /* + * Note: we flatten the magic number to a single type so we + * don't have to compare against crc/non-crc types elsewhere. + */ + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + blk->magic = XFS_DA_NODE_MAGIC; + node = (xfs_da_intnode_t *)info; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); + if (forward) + blk->index = 0; + else + blk->index = nodehdr.count - 1; + blkno = be32_to_cpu(btree[blk->index].before); + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + blk->magic = XFS_ATTR_LEAF_MAGIC; + ASSERT(level == path->active-1); + blk->index = 0; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + blk->magic = XFS_DIR2_LEAFN_MAGIC; + ASSERT(level == path->active-1); + blk->index = 0; + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); + break; + default: + ASSERT(0); + break; + } + } + *result = 0; + return 0; +} + + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Implement a simple hash on a character string. + * Rotate the hash value by 7 bits, then XOR each character in. + * This is implemented with some source-level loop unrolling. + */ +xfs_dahash_t +xfs_da_hashname(const __uint8_t *name, int namelen) +{ + xfs_dahash_t hash; + + /* + * Do four characters at a time as long as we can. + */ + for (hash = 0; namelen >= 4; namelen -= 4, name += 4) + hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^ + (name[3] << 0) ^ rol32(hash, 7 * 4); + + /* + * Now do the rest of the characters. + */ + switch (namelen) { + case 3: + return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^ + rol32(hash, 7 * 3); + case 2: + return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2); + case 1: + return (name[0] << 0) ^ rol32(hash, 7 * 1); + default: /* case 0: */ + return hash; + } +} + +enum xfs_dacmp +xfs_da_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + return (args->namelen == len && memcmp(args->name, name, len) == 0) ? + XFS_CMP_EXACT : XFS_CMP_DIFFERENT; +} + +static xfs_dahash_t +xfs_default_hashname( + struct xfs_name *name) +{ + return xfs_da_hashname(name->name, name->len); +} + +const struct xfs_nameops xfs_default_nameops = { + .hashname = xfs_default_hashname, + .compname = xfs_da_compname +}; + +int +xfs_da_grow_inode_int( + struct xfs_da_args *args, + xfs_fileoff_t *bno, + int count) +{ + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + int w = args->whichfork; + xfs_rfsblock_t nblks = dp->i_d.di_nblocks; + struct xfs_bmbt_irec map, *mapp; + int nmap, error, got, i, mapi; + + /* + * Find a spot in the file space to put the new block. + */ + error = xfs_bmap_first_unused(tp, dp, count, bno, w); + if (error) + return error; + + /* + * Try mapping it in one filesystem block. + */ + nmap = 1; + ASSERT(args->firstblock != NULL); + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (error) + return error; + + ASSERT(nmap <= 1); + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } else if (nmap == 0 && count > 1) { + xfs_fileoff_t b; + int c; + + /* + * If we didn't get it and the block might work if fragmented, + * try without the CONTIG flag. Loop until we get it all. + */ + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + for (b = *bno, mapi = 0; b < *bno + count; ) { + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(*bno + count - b); + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist); + if (error) + goto out_free_map; + if (nmap < 1) + break; + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } else { + mapi = 0; + mapp = NULL; + } + + /* + * Count the blocks we got, make sure it matches the total. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + if (got != count || mapp[0].br_startoff != *bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + *bno + count) { + error = -ENOSPC; + goto out_free_map; + } + + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; + +out_free_map: + if (mapp != &map) + kmem_free(mapp); + return error; +} + +/* + * Add a block to the btree ahead of the file. + * Return the new block number to the caller. + */ +int +xfs_da_grow_inode( + struct xfs_da_args *args, + xfs_dablk_t *new_blkno) +{ + xfs_fileoff_t bno; + int error; + + trace_xfs_da_grow_inode(args); + + bno = args->geo->leafblk; + error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount); + if (!error) + *new_blkno = (xfs_dablk_t)bno; + return error; +} + +/* + * Ick. We need to always be able to remove a btree block, even + * if there's no space reservation because the filesystem is full. + * This is called if xfs_bunmapi on a btree block fails due to ENOSPC. + * It swaps the target block with the last block in the file. The + * last block in the file can always be removed since it can't cause + * a bmap btree split to do that. + */ +STATIC int +xfs_da3_swap_lastblock( + struct xfs_da_args *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) +{ + struct xfs_da_blkinfo *dead_info; + struct xfs_da_blkinfo *sib_info; + struct xfs_da_intnode *par_node; + struct xfs_da_intnode *dead_node; + struct xfs_dir2_leaf *dead_leaf2; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr par_hdr; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_buf *dead_buf; + struct xfs_buf *last_buf; + struct xfs_buf *sib_buf; + struct xfs_buf *par_buf; + xfs_dahash_t dead_hash; + xfs_fileoff_t lastoff; + xfs_dablk_t dead_blkno; + xfs_dablk_t last_blkno; + xfs_dablk_t sib_blkno; + xfs_dablk_t par_blkno; + int error; + int w; + int entno; + int level; + int dead_level; + + trace_xfs_da_swap_lastblock(args); + + dead_buf = *dead_bufp; + dead_blkno = *dead_blknop; + tp = args->trans; + dp = args->dp; + w = args->whichfork; + ASSERT(w == XFS_DATA_FORK); + mp = dp->i_mount; + lastoff = args->geo->freeblk; + error = xfs_bmap_last_before(tp, dp, &lastoff, w); + if (error) + return error; + if (unlikely(lastoff == 0)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, + mp); + return -EFSCORRUPTED; + } + /* + * Read the last block in the btree space. + */ + last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount; + error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w); + if (error) + return error; + /* + * Copy the last block into the dead buffer and log it. + */ + memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); + dead_info = dead_buf->b_addr; + /* + * Get values from the moved block. + */ + if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2); + ents = dp->d_ops->leaf_ents_p(dead_leaf2); + dead_level = 0; + dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); + } else { + struct xfs_da3_icnode_hdr deadhdr; + + dead_node = (xfs_da_intnode_t *)dead_info; + dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node); + btree = dp->d_ops->node_tree_p(dead_node); + dead_level = deadhdr.level; + dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); + } + sib_buf = par_buf = NULL; + /* + * If the moved block has a left sibling, fix up the pointers. + */ + if ((sib_blkno = be32_to_cpu(dead_info->back))) { + error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); + if (error) + goto done; + sib_info = sib_buf->b_addr; + if (unlikely( + be32_to_cpu(sib_info->forw) != last_blkno || + sib_info->magic != dead_info->magic)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto done; + } + sib_info->forw = cpu_to_be32(dead_blkno); + xfs_trans_log_buf(tp, sib_buf, + XFS_DA_LOGRANGE(sib_info, &sib_info->forw, + sizeof(sib_info->forw))); + sib_buf = NULL; + } + /* + * If the moved block has a right sibling, fix up the pointers. + */ + if ((sib_blkno = be32_to_cpu(dead_info->forw))) { + error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); + if (error) + goto done; + sib_info = sib_buf->b_addr; + if (unlikely( + be32_to_cpu(sib_info->back) != last_blkno || + sib_info->magic != dead_info->magic)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto done; + } + sib_info->back = cpu_to_be32(dead_blkno); + xfs_trans_log_buf(tp, sib_buf, + XFS_DA_LOGRANGE(sib_info, &sib_info->back, + sizeof(sib_info->back))); + sib_buf = NULL; + } + par_blkno = args->geo->leafblk; + level = -1; + /* + * Walk down the tree looking for the parent of the moved block. + */ + for (;;) { + error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); + if (error) + goto done; + par_node = par_buf->b_addr; + dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); + if (level >= 0 && level != par_hdr.level + 1) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto done; + } + level = par_hdr.level; + btree = dp->d_ops->node_tree_p(par_node); + for (entno = 0; + entno < par_hdr.count && + be32_to_cpu(btree[entno].hashval) < dead_hash; + entno++) + continue; + if (entno == par_hdr.count) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto done; + } + par_blkno = be32_to_cpu(btree[entno].before); + if (level == dead_level + 1) + break; + xfs_trans_brelse(tp, par_buf); + par_buf = NULL; + } + /* + * We're in the right parent block. + * Look for the right entry. + */ + for (;;) { + for (; + entno < par_hdr.count && + be32_to_cpu(btree[entno].before) != last_blkno; + entno++) + continue; + if (entno < par_hdr.count) + break; + par_blkno = par_hdr.forw; + xfs_trans_brelse(tp, par_buf); + par_buf = NULL; + if (unlikely(par_blkno == 0)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto done; + } + error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); + if (error) + goto done; + par_node = par_buf->b_addr; + dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); + if (par_hdr.level != level) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto done; + } + btree = dp->d_ops->node_tree_p(par_node); + entno = 0; + } + /* + * Update the parent entry pointing to the moved block. + */ + btree[entno].before = cpu_to_be32(dead_blkno); + xfs_trans_log_buf(tp, par_buf, + XFS_DA_LOGRANGE(par_node, &btree[entno].before, + sizeof(btree[entno].before))); + *dead_blknop = last_blkno; + *dead_bufp = last_buf; + return 0; +done: + if (par_buf) + xfs_trans_brelse(tp, par_buf); + if (sib_buf) + xfs_trans_brelse(tp, sib_buf); + xfs_trans_brelse(tp, last_buf); + return error; +} + +/* + * Remove a btree block from a directory or attribute. + */ +int +xfs_da_shrink_inode( + xfs_da_args_t *args, + xfs_dablk_t dead_blkno, + struct xfs_buf *dead_buf) +{ + xfs_inode_t *dp; + int done, error, w, count; + xfs_trans_t *tp; + + trace_xfs_da_shrink_inode(args); + + dp = args->dp; + w = args->whichfork; + tp = args->trans; + count = args->geo->fsbcount; + for (;;) { + /* + * Remove extents. If we get ENOSPC for a dir we have to move + * the last block to the place we want to kill. + */ + error = xfs_bunmapi(tp, dp, dead_blkno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, &done); + if (error == -ENOSPC) { + if (w != XFS_DATA_FORK) + break; + error = xfs_da3_swap_lastblock(args, &dead_blkno, + &dead_buf); + if (error) + break; + } else { + break; + } + } + xfs_trans_binval(tp, dead_buf); + return error; +} + +/* + * See if the mapping(s) for this btree block are valid, i.e. + * don't contain holes, are logically contiguous, and cover the whole range. + */ +STATIC int +xfs_da_map_covers_blocks( + int nmap, + xfs_bmbt_irec_t *mapp, + xfs_dablk_t bno, + int count) +{ + int i; + xfs_fileoff_t off; + + for (i = 0, off = bno; i < nmap; i++) { + if (mapp[i].br_startblock == HOLESTARTBLOCK || + mapp[i].br_startblock == DELAYSTARTBLOCK) { + return 0; + } + if (off != mapp[i].br_startoff) { + return 0; + } + off += mapp[i].br_blockcount; + } + return off == bno + count; +} + +/* + * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map. + * + * For the single map case, it is assumed that the caller has provided a pointer + * to a valid xfs_buf_map. For the multiple map case, this function will + * allocate the xfs_buf_map to hold all the maps and replace the caller's single + * map pointer with the allocated map. + */ +static int +xfs_buf_map_from_irec( + struct xfs_mount *mp, + struct xfs_buf_map **mapp, + int *nmaps, + struct xfs_bmbt_irec *irecs, + int nirecs) +{ + struct xfs_buf_map *map; + int i; + + ASSERT(*nmaps == 1); + ASSERT(nirecs >= 1); + + if (nirecs > 1) { + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); + if (!map) + return -ENOMEM; + *mapp = map; + } + + *nmaps = nirecs; + map = *mapp; + for (i = 0; i < *nmaps; i++) { + ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK && + irecs[i].br_startblock != HOLESTARTBLOCK); + map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock); + map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount); + } + return 0; +} + +/* + * Map the block we are given ready for reading. There are three possible return + * values: + * -1 - will be returned if we land in a hole and mappedbno == -2 so the + * caller knows not to execute a subsequent read. + * 0 - if we mapped the block successfully + * >0 - positive error number if there was an error. + */ +static int +xfs_dabuf_map( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + struct xfs_buf_map **map, + int *nmaps) +{ + struct xfs_mount *mp = dp->i_mount; + int nfsb; + int error = 0; + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec *irecs = &irec; + int nirecs; + + ASSERT(map && *map); + ASSERT(*nmaps == 1); + + if (whichfork == XFS_DATA_FORK) + nfsb = mp->m_dir_geo->fsbcount; + else + nfsb = mp->m_attr_geo->fsbcount; + + /* + * Caller doesn't have a mapping. -2 means don't complain + * if we land in a hole. + */ + if (mappedbno == -1 || mappedbno == -2) { + /* + * Optimize the one-block case. + */ + if (nfsb != 1) + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); + + nirecs = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, + &nirecs, xfs_bmapi_aflag(whichfork)); + if (error) + goto out; + } else { + irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + irecs->br_startoff = (xfs_fileoff_t)bno; + irecs->br_blockcount = nfsb; + irecs->br_state = 0; + nirecs = 1; + } + + if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { + error = mappedbno == -2 ? -1 : -EFSCORRUPTED; + if (unlikely(error == -EFSCORRUPTED)) { + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + int i; + xfs_alert(mp, "%s: bno %lld dir: inode %lld", + __func__, (long long)bno, + (long long)dp->i_ino); + for (i = 0; i < *nmaps; i++) { + xfs_alert(mp, +"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", + i, + (long long)irecs[i].br_startoff, + (long long)irecs[i].br_startblock, + (long long)irecs[i].br_blockcount, + irecs[i].br_state); + } + } + XFS_ERROR_REPORT("xfs_da_do_buf(1)", + XFS_ERRLEVEL_LOW, mp); + } + goto out; + } + error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs); +out: + if (irecs != &irec) + kmem_free(irecs); + return error; +} + +/* + * Get a buffer for the dir/attr block. + */ +int +xfs_da_get_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, + mapp, nmap, 0); + error = bp ? bp->b_error : -EIO; + if (error) { + if (bp) + xfs_trans_brelse(trans, bp); + goto out_free; + } + + *bpp = bp; + +out_free: + if (mapp != &map) + kmem_free(mapp); + + return error; +} + +/* + * Get a buffer for the dir/attr block, fill in the contents. + */ +int +xfs_da_read_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + error = xfs_trans_read_buf_map(dp->i_mount, trans, + dp->i_mount->m_ddev_targp, + mapp, nmap, 0, &bp, ops); + if (error) + goto out_free; + + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); + *bpp = bp; +out_free: + if (mapp != &map) + kmem_free(mapp); + + return error; +} + +/* + * Readahead the dir/attr block. + */ +xfs_daddr_t +xfs_da_reada_buf( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + mappedbno = mapp[0].bm_bn; + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); + +out_free: + if (mapp != &map) + kmem_free(mapp); + + if (error) + return -1; + return mappedbno; +} diff --git a/kernel/fs/xfs/libxfs/xfs_da_btree.h b/kernel/fs/xfs/libxfs/xfs_da_btree.h new file mode 100644 index 000000000..6e153e399 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_da_btree.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DA_BTREE_H__ +#define __XFS_DA_BTREE_H__ + +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_trans; +struct zone; +struct xfs_dir_ops; + +/* + * Directory/attribute geometry information. There will be one of these for each + * data fork type, and it will be passed around via the xfs_da_args. Global + * structures will be attached to the xfs_mount. + */ +struct xfs_da_geometry { + int blksize; /* da block size in bytes */ + int fsbcount; /* da block size in filesystem blocks */ + uint8_t fsblog; /* log2 of _filesystem_ block size */ + uint8_t blklog; /* log2 of da block size */ + uint node_ents; /* # of entries in a danode */ + int magicpct; /* 37% of block size in bytes */ + xfs_dablk_t datablk; /* blockno of dir data v2 */ + xfs_dablk_t leafblk; /* blockno of leaf data v2 */ + xfs_dablk_t freeblk; /* blockno of free data v2 */ +}; + +/*======================================================================== + * Btree searching and modification structure definitions. + *========================================================================*/ + +/* + * Search comparison results + */ +enum xfs_dacmp { + XFS_CMP_DIFFERENT, /* names are completely different */ + XFS_CMP_EXACT, /* names are exactly the same */ + XFS_CMP_CASE /* names are same but differ in case */ +}; + +/* + * Structure to ease passing around component names. + */ +typedef struct xfs_da_args { + struct xfs_da_geometry *geo; /* da block geometry */ + const __uint8_t *name; /* string (maybe not NULL terminated) */ + int namelen; /* length of string (maybe no NULL) */ + __uint8_t filetype; /* filetype of inode for directories */ + __uint8_t *value; /* set of bytes (maybe contain NULLs) */ + int valuelen; /* length of value */ + int flags; /* argument flags (eg: ATTR_NOCREATE) */ + xfs_dahash_t hashval; /* hash value of name */ + xfs_ino_t inumber; /* input/output inode number */ + struct xfs_inode *dp; /* directory inode to manipulate */ + xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */ + struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */ + struct xfs_trans *trans; /* current trans (changes over time) */ + xfs_extlen_t total; /* total blocks needed, for 1st bmap */ + int whichfork; /* data or attribute fork */ + xfs_dablk_t blkno; /* blkno of attr leaf of interest */ + int index; /* index of attr of interest in blk */ + xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ + int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ + xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ + int index2; /* index of 2nd attr in blk */ + xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ + int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ + int op_flags; /* operation flags */ + enum xfs_dacmp cmpresult; /* name compare result for lookups */ +} xfs_da_args_t; + +/* + * Operation flags: + */ +#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ +#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ +#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ +#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ + +#define XFS_DA_OP_FLAGS \ + { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ + { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ + { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ + { XFS_DA_OP_CILOOKUP, "CILOOKUP" } + +/* + * Storage for holding state during Btree searches and split/join ops. + * + * Only need space for 5 intermediate nodes. With a minimum of 62-way + * fanout to the Btree, we can support over 900 million directory blocks, + * which is slightly more than enough. + */ +typedef struct xfs_da_state_blk { + struct xfs_buf *bp; /* buffer containing block */ + xfs_dablk_t blkno; /* filesystem blkno of buffer */ + xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */ + int index; /* relevant index into block */ + xfs_dahash_t hashval; /* last hash value in block */ + int magic; /* blk's magic number, ie: blk type */ +} xfs_da_state_blk_t; + +typedef struct xfs_da_state_path { + int active; /* number of active levels */ + xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH]; +} xfs_da_state_path_t; + +typedef struct xfs_da_state { + xfs_da_args_t *args; /* filename arguments */ + struct xfs_mount *mp; /* filesystem mount point */ + xfs_da_state_path_t path; /* search/split paths */ + xfs_da_state_path_t altpath; /* alternate path for join */ + unsigned char inleaf; /* insert into 1->lf, 0->splf */ + unsigned char extravalid; /* T/F: extrablk is in use */ + unsigned char extraafter; /* T/F: extrablk is after new */ + xfs_da_state_blk_t extrablk; /* for double-splits on leaves */ + /* for dirv2 extrablk is data */ +} xfs_da_state_t; + +/* + * Utility macros to aid in logging changed structure fields. + */ +#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE)) +#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \ + (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ + (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) + +/* + * Name ops for directory and/or attr name operations + */ +struct xfs_nameops { + xfs_dahash_t (*hashname)(struct xfs_name *); + enum xfs_dacmp (*compname)(struct xfs_da_args *, + const unsigned char *, int); +}; + + +/*======================================================================== + * Function prototypes. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno, + int level, struct xfs_buf **bpp, int whichfork); +int xfs_da3_split(xfs_da_state_t *state); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_da3_join(xfs_da_state_t *state); +void xfs_da3_fixhashpath(struct xfs_da_state *state, + struct xfs_da_state_path *path_to_to_fix); + +/* + * Routines used for finding things in the Btree. + */ +int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, + int forward, int release, int *result); +/* + * Utility routines. + */ +int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, + xfs_da_state_blk_t *new_blk); +int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int which_fork); + +/* + * Utility routines. + */ +int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); +int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, + int count); +int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bp, int whichfork); +int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int whichfork, + const struct xfs_buf_ops *ops); +xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno, int whichfork, + const struct xfs_buf_ops *ops); +int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, + struct xfs_buf *dead_buf); + +uint xfs_da_hashname(const __uint8_t *name_string, int name_length); +enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, + const unsigned char *name, int len); + + +xfs_da_state_t *xfs_da_state_alloc(void); +void xfs_da_state_free(xfs_da_state_t *state); + +extern struct kmem_zone *xfs_da_state_zone; +extern const struct xfs_nameops xfs_default_nameops; + +#endif /* __XFS_DA_BTREE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_da_format.c b/kernel/fs/xfs/libxfs/xfs_da_format.c new file mode 100644 index 000000000..9d624a622 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_da_format.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" + +/* + * Shortform directory ops + */ +static int +xfs_dir2_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ + + count += len; /* name */ + count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : + sizeof(xfs_dir2_ino4_t); /* ino # */ + return count; +} + +static int +xfs_dir3_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t); +} + +static struct xfs_dir2_sf_entry * +xfs_dir2_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); +} + +static struct xfs_dir2_sf_entry * +xfs_dir3_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen)); +} + + +/* + * For filetype enabled shortform directories, the file type field is stored at + * the end of the name. Because it's only a single byte, endian conversion is + * not necessary. For non-filetype enable directories, the type is always + * unknown and we never store the value. + */ +static __uint8_t +xfs_dir2_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + __uint8_t ftype; + + ftype = sfep->name[sfep->namelen]; + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + + sfep->name[sfep->namelen] = ftype; +} + +/* + * Inode numbers in short-form directories can come in two versions, + * either 4 bytes or 8 bytes wide. These helpers deal with the + * two forms transparently by looking at the headers i8count field. + * + * For 64-bit inode number the most significant byte must be zero. + */ +static xfs_ino_t +xfs_dir2_sf_get_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *from) +{ + if (hdr->i8count) + return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + else + return get_unaligned_be32(&from->i4.i); +} + +static void +xfs_dir2_sf_put_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *to, + xfs_ino_t ino) +{ + ASSERT((ino & 0xff00000000000000ULL) == 0); + + if (hdr->i8count) + put_unaligned_be64(ino, &to->i8.i); + else + put_unaligned_be32(ino, &to->i4.i); +} + +static xfs_ino_t +xfs_dir2_sf_get_parent_ino( + struct xfs_dir2_sf_hdr *hdr) +{ + return xfs_dir2_sf_get_ino(hdr, &hdr->parent); +} + +static void +xfs_dir2_sf_put_parent_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); +} + +/* + * In short-form directory entries the inode numbers are stored at variable + * offset behind the entry name. If the entry stores a filetype value, then it + * sits between the name and the inode number. Hence the inode numbers may only + * be accessed through the helpers below. + */ +static xfs_ino_t +xfs_dir2_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); +} + +static void +xfs_dir2_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); +} + +static xfs_ino_t +xfs_dir3_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); +} + +static void +xfs_dir3_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); +} + + +/* + * Directory data block operations + */ + +/* + * For special situations, the dirent size ends up fixed because we always know + * what the size of the entry is. That's true for the "." and "..", and + * therefore we know that they are a fixed size and hence their offsets are + * constant, as is the first entry. + * + * Hence, this calculation is written as a macro to be able to be calculated at + * compile time and so certain offsets can be calculated directly in the + * structure initaliser via the macro. There are two macros - one for dirents + * with ftype and without so there are no unresolvable conditionals in the + * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power + * of 2 and the compiler doesn't reject it (unlike roundup()). + */ +#define XFS_DIR2_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN) + +#define XFS_DIR3_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \ + XFS_DIR2_DATA_ALIGN) + +static int +xfs_dir2_data_entsize( + int n) +{ + return XFS_DIR2_DATA_ENTSIZE(n); +} + +static int +xfs_dir3_data_entsize( + int n) +{ + return XFS_DIR3_DATA_ENTSIZE(n); +} + +static __uint8_t +xfs_dir2_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + __uint8_t ftype = dep->name[dep->namelen]; + + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t type) +{ + ASSERT(type < XFS_DIR3_FT_MAX); + ASSERT(dep->namelen != 0); + + dep->name[dep->namelen] = type; +} + +/* + * Pointer to an entry's tag word. + */ +static __be16 * +xfs_dir2_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); +} + +static __be16 * +xfs_dir3_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16)); +} + +/* + * location of . and .. in data space (always block 0) + */ +static struct xfs_dir2_data_entry * +xfs_dir2_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_free * +xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return hdr->bestfree; +} + +static struct xfs_dir2_data_free * +xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir3_data_hdr *)hdr)->best_free; +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + + +/* + * Directory Leaf block operations + */ +static int +xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return lp->__ents; +} + +static int +xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return ((struct xfs_dir3_leaf *)lp)->__ents; +} + +static void +xfs_dir2_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC); +} + +static void +xfs_dir2_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC); + + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); +} + +static void +xfs_dir3_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->stale = be16_to_cpu(hdr3->stale); + + ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); +} + +static void +xfs_dir3_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->stale = cpu_to_be16(from->stale); +} + + +/* + * Directory/Attribute Node block operations + */ +static struct xfs_da_node_entry * +xfs_da2_node_tree_p(struct xfs_da_intnode *dap) +{ + return dap->__btree; +} + +static struct xfs_da_node_entry * +xfs_da3_node_tree_p(struct xfs_da_intnode *dap) +{ + return ((struct xfs_da3_intnode *)dap)->__btree; +} + +static void +xfs_da2_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); +} + +static void +xfs_da2_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + ASSERT(from->magic == XFS_DA_NODE_MAGIC); + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); +} + +static void +xfs_da3_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->__count); + to->level = be16_to_cpu(hdr3->__level); +} + +static void +xfs_da3_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; + + ASSERT(from->magic == XFS_DA3_NODE_MAGIC); + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->__count = cpu_to_be16(from->count); + hdr3->__level = cpu_to_be16(from->level); +} + + +/* + * Directory free space block operations + */ +static int +xfs_dir2_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir2_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir2_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir2_free_max_bests(geo); +} + +static int +xfs_dir3_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir3_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir3_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir3_free_max_bests(geo); +} + +static void +xfs_dir2_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); +} + +static void +xfs_dir2_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); + + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); +} + +static void +xfs_dir3_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; + + to->magic = be32_to_cpu(hdr3->hdr.magic); + to->firstdb = be32_to_cpu(hdr3->firstdb); + to->nvalid = be32_to_cpu(hdr3->nvalid); + to->nused = be32_to_cpu(hdr3->nused); + + ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); +} + +static void +xfs_dir3_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); + + hdr3->hdr.magic = cpu_to_be32(from->magic); + hdr3->firstdb = cpu_to_be32(from->firstdb); + hdr3->nvalid = cpu_to_be32(from->nvalid); + hdr3->nused = cpu_to_be32(from->nused); +} + +static const struct xfs_dir_ops xfs_dir2_ops = { + .sf_entsize = xfs_dir2_sf_entsize, + .sf_nextentry = xfs_dir2_sf_nextentry, + .sf_get_ftype = xfs_dir2_sfe_get_ftype, + .sf_put_ftype = xfs_dir2_sfe_put_ftype, + .sf_get_ino = xfs_dir2_sfe_get_ino, + .sf_put_ino = xfs_dir2_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir2_data_entsize, + .data_get_ftype = xfs_dir2_data_get_ftype, + .data_put_ftype = xfs_dir2_data_put_ftype, + .data_entry_tag_p = xfs_dir2_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_ftype_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir3_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir3_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir3_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir3_data_hdr), + + .data_dot_entry_p = xfs_dir3_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir3_data_first_entry_p, + .data_entry_p = xfs_dir3_data_entry_p, + .data_unused_p = xfs_dir3_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir3_max_leaf_ents, + .leaf_ents_p = xfs_dir3_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir3_free_hdr), + .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk, + .free_max_bests = xfs_dir3_free_max_bests, + .free_bests_p = xfs_dir3_free_bests_p, + .db_to_fdb = xfs_dir3_db_to_fdb, + .db_to_fdindex = xfs_dir3_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, +}; + +static const struct xfs_dir_ops xfs_dir3_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, +}; + +/* + * Return the ops structure according to the current config. If we are passed + * an inode, then that overrides the default config we use which is based on + * feature bits. + */ +const struct xfs_dir_ops * +xfs_dir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_dir_inode_ops) + return mp->m_dir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_ops; + if (xfs_sb_version_hasftype(&mp->m_sb)) + return &xfs_dir2_ftype_ops; + return &xfs_dir2_ops; +} + +const struct xfs_dir_ops * +xfs_nondir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_nondir_inode_ops) + return mp->m_nondir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_nondir_ops; + return &xfs_dir2_nondir_ops; +} diff --git a/kernel/fs/xfs/libxfs/xfs_da_format.h b/kernel/fs/xfs/libxfs/xfs_da_format.h new file mode 100644 index 000000000..74bcbabfa --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_da_format.h @@ -0,0 +1,873 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DA_FORMAT_H__ +#define __XFS_DA_FORMAT_H__ + +/* + * This structure is common to both leaf nodes and non-leaf nodes in the Btree. + * + * It is used to manage a doubly linked list of all blocks at the same + * level in the Btree, and to identify which type of block this is. + */ +#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ +#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ + +typedef struct xfs_da_blkinfo { + __be32 forw; /* previous block in list */ + __be32 back; /* following block in list */ + __be16 magic; /* validity check on block */ + __be16 pad; /* unused */ +} xfs_da_blkinfo_t; + +/* + * CRC enabled directory structure types + * + * The headers change size for the additional verification information, but + * otherwise the tree layouts and contents are unchanged. Hence the da btree + * code can use the struct xfs_da_blkinfo for manipulating the tree links and + * magic numbers without modification for both v2 and v3 nodes. + */ +#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ +#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ + +struct xfs_da3_blkinfo { + /* + * the node link manipulation code relies on the fact that the first + * element of this structure is the struct xfs_da_blkinfo so it can + * ignore the differences in the rest of the structures. + */ + struct xfs_da_blkinfo hdr; + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +/* + * This is the structure of the root and intermediate nodes in the Btree. + * The leaf nodes are defined above. + * + * Entries are not packed. + * + * Since we have duplicate keys, use a binary search but always follow + * all match in the block, not just the first match found. + */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ + +typedef struct xfs_da_node_hdr { + struct xfs_da_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ +} xfs_da_node_hdr_t; + +struct xfs_da3_node_hdr { + struct xfs_da3_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ + __be32 __pad32; +}; + +#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) + +typedef struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ +} xfs_da_node_entry_t; + +typedef struct xfs_da_intnode { + struct xfs_da_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +} xfs_da_intnode_t; + +struct xfs_da3_intnode { + struct xfs_da3_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +}; + +/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t level; +}; + +/* + * Directory version 2. + * + * There are 4 possible formats: + * - shortform - embedded into the inode + * - single block - data with embedded leaf at the end + * - multiple data blocks, single leaf+freeindex block + * - data blocks, node and leaf blocks (btree), freeindex blocks + * + * Note: many node blocks structures and constants are shared with the attr + * code and defined in xfs_da_btree.h. + */ + +#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */ +#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */ +#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ + +/* + * Directory Version 3 With CRCs. + * + * The tree formats are the same as for version 2 directories. The difference + * is in the block header and dirent formats. In many cases the v3 structures + * use v2 definitions as they are no different and this makes code sharing much + * easier. + * + * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the + * format is v2 then they switch to the existing v2 code, or the format is v3 + * they implement the v3 functionality. This means the existing dir2 is a mix of + * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called + * where there is a difference in the formats, otherwise the code is unchanged. + * + * Where it is possible, the code decides what to do based on the magic numbers + * in the blocks rather than feature bits in the superblock. This means the code + * is as independent of the external XFS code as possible as doesn't require + * passing struct xfs_mount pointers into places where it isn't really + * necessary. + * + * Version 3 includes: + * + * - a larger block header for CRC and identification purposes and so the + * offsets of all the structures inside the blocks are different. + * + * - new magic numbers to be able to detect the v2/v3 types on the fly. + */ + +#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */ +#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */ +#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ + +/* + * Dirents in version 3 directories have a file type field. Additions to this + * list are an on-disk format change, requiring feature bits. Valid values + * are as follows: + */ +#define XFS_DIR3_FT_UNKNOWN 0 +#define XFS_DIR3_FT_REG_FILE 1 +#define XFS_DIR3_FT_DIR 2 +#define XFS_DIR3_FT_CHRDEV 3 +#define XFS_DIR3_FT_BLKDEV 4 +#define XFS_DIR3_FT_FIFO 5 +#define XFS_DIR3_FT_SOCK 6 +#define XFS_DIR3_FT_SYMLINK 7 +#define XFS_DIR3_FT_WHT 8 + +#define XFS_DIR3_FT_MAX 9 + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. + * Only need 16 bits, this is the byte offset into the single block form. + */ +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; + +/* + * Offset in data space of a data entry. + */ +typedef __uint32_t xfs_dir2_dataptr_t; +#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) +#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Inode number stored as 8 8-bit values. + */ +typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; + +/* + * Inode number stored as 4 8-bit values. + * Works a lot of the time, when all the inode numbers in a directory + * fit in 32 bits. + */ +typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; + +typedef union { + xfs_dir2_ino8_t i8; + xfs_dir2_ino4_t i4; +} xfs_dir2_inou_t; +#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to fit into the + * literal area of the inode. These "shortform" directories consist of a + * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry + * structures. Due the different inode number storage size and the variable + * length name field in the xfs_dir2_sf_entry all these structure are + * variable length, and the accessors in this file should be used to iterate + * over them. + */ +typedef struct xfs_dir2_sf_hdr { + __uint8_t count; /* count of entries */ + __uint8_t i8count; /* count of 8-byte inode #s */ + xfs_dir2_inou_t parent; /* parent dir inode number */ +} __arch_pack xfs_dir2_sf_hdr_t; + +typedef struct xfs_dir2_sf_entry { + __u8 namelen; /* actual name length */ + xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 name[]; /* name, variable size */ + /* + * A single byte containing the file type field follows the inode + * number for version 3 directory entries. + * + * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a + * variable offset after the name. + */ +} __arch_pack xfs_dir2_sf_entry_t; + +static inline int xfs_dir2_sf_hdr_size(int i8count) +{ + return sizeof(struct xfs_dir2_sf_hdr) - + (i8count == 0) * + (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) +{ + return get_unaligned_be16(&sfep->offset.i); +} + +static inline void +xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) +{ + put_unaligned_be16(off, &sfep->offset.i); +} + +static inline struct xfs_dir2_sf_entry * +xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); +} + +/* + * Data block structures. + * + * A pure data block looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + * + * In addition to the pure data blocks for the data and node formats, + * most structures are also used for the combined data/freespace "block" + * format below. + */ + +#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ +#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) +#define XFS_DIR2_DATA_FREE_TAG 0xffff +#define XFS_DIR2_DATA_FD_COUNT 3 + +/* + * Directory address space divided into sections, + * spaces separated by 32GB. + */ +#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) +#define XFS_DIR2_DATA_SPACE 0 +#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) + +/* + * Describe a free area in the data block. + * + * The freespace will be formatted as a xfs_dir2_data_unused_t. + */ +typedef struct xfs_dir2_data_free { + __be16 offset; /* start of freespace */ + __be16 length; /* length of freespace */ +} xfs_dir2_data_free_t; + +/* + * Header for the data blocks. + * + * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. + */ +typedef struct xfs_dir2_data_hdr { + __be32 magic; /* XFS_DIR2_DATA_MAGIC or */ + /* XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; +} xfs_dir2_data_hdr_t; + +/* + * define a structure for all the verification fields we are adding to the + * directory block structures. This will be used in several structures. + * The magic number must be the first entry to align with all the dir2 + * structures so we determine how to decode them just by the magic number. + */ +struct xfs_dir3_blk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +struct xfs_dir3_data_hdr { + struct xfs_dir3_blk_hdr hdr; + xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ +}; + +#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) + +/* + * Active entry in a data block. + * + * Aligned to 8 bytes. After the variable length name field there is a + * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p. + * + * For dir3 structures, there is file type field between the name and the tag. + * This can only be manipulated by helper functions. It is packed hard against + * the end of the name so any padding for rounding is between the file type and + * the tag. + */ +typedef struct xfs_dir2_data_entry { + __be64 inumber; /* inode number */ + __u8 namelen; /* name length */ + __u8 name[]; /* name bytes, no null */ + /* __u8 filetype; */ /* type of inode we point to */ + /* __be16 tag; */ /* starting offset of us */ +} xfs_dir2_data_entry_t; + +/* + * Unused entry in a data block. + * + * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed + * using xfs_dir2_data_unused_tag_p. + */ +typedef struct xfs_dir2_data_unused { + __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ + __be16 length; /* total free length */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_unused_t; + +/* + * Pointer to a freespace's tag word. + */ +static inline __be16 * +xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) +{ + return (__be16 *)((char *)dup + + be16_to_cpu(dup->length) - sizeof(__be16)); +} + +/* + * Leaf block structures. + * + * A pure leaf block looks like the following drawing on disk: + * + * +---------------------------+ + * | xfs_dir2_leaf_hdr_t | + * +---------------------------+ + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | ... | + * +---------------------------+ + * | xfs_dir2_data_off_t | + * | xfs_dir2_data_off_t | + * | xfs_dir2_data_off_t | + * | ... | + * +---------------------------+ + * | xfs_dir2_leaf_tail_t | + * +---------------------------+ + * + * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block + * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present + * for directories with separate leaf nodes and free space blocks + * (magic = XFS_DIR2_LEAFN_MAGIC). + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + */ + +/* + * Offset of the leaf/node space. First block in this space + * is the btree root. + */ +#define XFS_DIR2_LEAF_SPACE 1 +#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) + +/* + * Leaf block header. + */ +typedef struct xfs_dir2_leaf_hdr { + xfs_da_blkinfo_t info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ +} xfs_dir2_leaf_hdr_t; + +struct xfs_dir3_leaf_hdr { + struct xfs_da3_blkinfo info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t stale; +}; + +/* + * Leaf block entry. + */ +typedef struct xfs_dir2_leaf_entry { + __be32 hashval; /* hash value of name */ + __be32 address; /* address of data entry */ +} xfs_dir2_leaf_entry_t; + +/* + * Leaf block tail. + */ +typedef struct xfs_dir2_leaf_tail { + __be32 bestcount; +} xfs_dir2_leaf_tail_t; + +/* + * Leaf block. + */ +typedef struct xfs_dir2_leaf { + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t __ents[]; /* entries */ +} xfs_dir2_leaf_t; + +struct xfs_dir3_leaf { + struct xfs_dir3_leaf_hdr hdr; /* leaf header */ + struct xfs_dir2_leaf_entry __ents[]; /* entries */ +}; + +#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) + +/* + * Get address of the bests array in the single-leaf block. + */ +static inline __be16 * +xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) +{ + return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); +} + +/* + * Free space block defintions for the node format. + */ + +/* + * Offset of the freespace index. + */ +#define XFS_DIR2_FREE_SPACE 2 +#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) + +typedef struct xfs_dir2_free_hdr { + __be32 magic; /* XFS_DIR2_FREE_MAGIC */ + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +} xfs_dir2_free_hdr_t; + +typedef struct xfs_dir2_free { + xfs_dir2_free_hdr_t hdr; /* block header */ + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +} xfs_dir2_free_t; + +struct xfs_dir3_free_hdr { + struct xfs_dir3_blk_hdr hdr; + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_free { + struct xfs_dir3_free_hdr hdr; + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +}; + +#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) + +/* + * In core version of the free block header, abstracted away from on-disk format + * differences. Use this in the code, and convert to/from the disk version using + * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. + */ +struct xfs_dir3_icfree_hdr { + __uint32_t magic; + __uint32_t firstdb; + __uint32_t nvalid; + __uint32_t nused; + +}; + +/* + * Single block format. + * + * The single block format looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t : + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * | ... | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * +-------------------------------------------------+ + * | xfs_dir2_block_tail_t | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + */ + +typedef struct xfs_dir2_block_tail { + __be32 count; /* count of leaf entries */ + __be32 stale; /* count of stale lf entries */ +} xfs_dir2_block_tail_t; + +/* + * Pointer to the leaf entries embedded in a data block (1-block format) + */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) +{ + return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); +} + + +/* + * Attribute storage layout + * + * Attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + * + * Struct leaf_entry's are packed from the top. Name/values grow from the + * bottom but are not packed. The freemap contains run-length-encoded entries + * for the free bytes after the leaf_entry's, but only the N largest such, + * smaller runs are dropped. When the freemap doesn't show enough space + * for an allocation, we compact the name/value area and try again. If we + * still don't have enough space, then we have to split the block. The + * name/value structs (both local and remote versions) must be 32bit aligned. + * + * Since we have duplicate hash keys, for each key that matches, compare + * the actual name string. The root and intermediate node search always + * takes the first-in-the-block key match found, so we should only have + * to work "forw"ard. If none matches, continue with the "forw"ard leaf + * nodes until the hash key changes or the attribute name is found. + * + * We store the fact that an attribute is a ROOT/USER/SECURE attribute in + * the leaf_entry. The namespaces are independent only because we also look + * at the namespace bit when we are looking for a matching attribute name. + * + * We also store an "incomplete" bit in the leaf_entry. It shows that an + * attribute is in the middle of being created and should not be shown to + * the user if we crash during the time that the bit is set. We clear the + * bit when we have finished setting up the attribute. We do this because + * we cannot create some large attributes inside a single transaction, and we + * need some indication that we weren't finished if we crash in the middle. + */ +#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ + +typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __be16 base; /* base of free region */ + __be16 size; /* length of free region */ +} xfs_attr_leaf_map_t; + +typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __be16 count; /* count of active leaf_entry's */ + __be16 usedbytes; /* num bytes of names/values stored */ + __be16 firstused; /* first used byte in name area */ + __u8 holes; /* != 0 if blk needs compaction */ + __u8 pad1; + xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; + /* N largest free regions */ +} xfs_attr_leaf_hdr_t; + +typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ + __be32 hashval; /* hash value of name */ + __be16 nameidx; /* index into buffer of name/value */ + __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __u8 pad2; /* unused pad byte */ +} xfs_attr_leaf_entry_t; + +typedef struct xfs_attr_leaf_name_local { + __be16 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 nameval[1]; /* name/value bytes */ +} xfs_attr_leaf_name_local_t; + +typedef struct xfs_attr_leaf_name_remote { + __be32 valueblk; /* block number of value bytes */ + __be32 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 name[1]; /* name bytes */ +} xfs_attr_leaf_name_remote_t; + +typedef struct xfs_attr_leafblock { + xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ + xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ +} xfs_attr_leafblock_t; + +/* + * CRC enabled leaf structures. Called "version 3" structures to match the + * version number of the directory and dablk structures for this feature, and + * attr2 is already taken by the variable inode attribute fork size feature. + */ +struct xfs_attr3_leaf_hdr { + struct xfs_da3_blkinfo info; + __be16 count; + __be16 usedbytes; + __be16 firstused; + __u8 holes; + __u8 pad1; + struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ +}; + +#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) + +struct xfs_attr3_leafblock { + struct xfs_attr3_leaf_hdr hdr; + struct xfs_attr_leaf_entry entries[1]; + + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced, the locations accessed purely from helper functions. + * + * struct xfs_attr_leaf_name_local + * struct xfs_attr_leaf_name_remote + */ +}; + +/* + * incore, neutral version of the attribute leaf header + */ +struct xfs_attr3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t usedbytes; + /* + * firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + __uint32_t firstused; + __u8 holes; + struct { + __uint16_t base; + __uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +/* + * Special value to represent fs block size in the leaf header firstused field. + * Only used when block size overflows the 2-bytes available on disk. + */ +#define XFS_ATTR3_LEAF_NULLOFF 0 + +/* + * Flags used in the leaf_entry[i].flags field. + * NOTE: the INCOMPLETE bit must not collide with the flags bits specified + * on the system call, they are "or"ed together for various operations. + */ +#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ +#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ +#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) + +/* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. + */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) + */ +#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) + +static inline int +xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return sizeof(struct xfs_attr3_leaf_hdr); + return sizeof(struct xfs_attr_leaf_hdr); +} + +static inline struct xfs_attr_leaf_entry * +xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; + return &leafp->entries[0]; +} + +/* + * Cast typed pointers for "local" and "remote" name/value structs. + */ +static inline char * +xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +{ + struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp); + + return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)]; +} + +static inline xfs_attr_leaf_name_remote_t * +xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx); +} + +static inline xfs_attr_leaf_name_local_t * +xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx); +} + +/* + * Calculate total bytes used (including trailing pad for alignment) for + * a "local" name/value structure, a "remote" name/value structure, and + * a pointer which might be either. + */ +static inline int xfs_attr_leaf_entsize_remote(int nlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local_max(int bsize) +{ + return (((bsize) >> 1) + ((bsize) >> 2)); +} + + + +/* + * Remote attribute block format definition + * + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ +#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ + +struct xfs_attr3_rmt_hdr { + __be32 rm_magic; + __be32 rm_offset; + __be32 rm_bytes; + __be32 rm_crc; + uuid_t rm_uuid; + __be64 rm_owner; + __be64 rm_blkno; + __be64 rm_lsn; +}; + +#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) + +#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_attr3_rmt_hdr) : 0)) + +#endif /* __XFS_DA_FORMAT_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_dir2.c b/kernel/fs/xfs/libxfs/xfs_dir2.c new file mode 100644 index 000000000..a69fb3a1e --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2.c @@ -0,0 +1,731 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; + +/* + * @mode, if set, indicates that the type field needs to be set up. + * This uses the transformation from file mode to DT_* as defined in linux/fs.h + * for file type specification. This will be propagated into the directory + * structure if appropriate for the given operation and filesystem config. + */ +const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { + [0] = XFS_DIR3_FT_UNKNOWN, + [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, + [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, +}; + +/* + * ASCII case-insensitive (ie. A-Z) support for directories that was + * used in IRIX. + */ +STATIC xfs_dahash_t +xfs_ascii_ci_hashname( + struct xfs_name *name) +{ + xfs_dahash_t hash; + int i; + + for (i = 0, hash = 0; i < name->len; i++) + hash = tolower(name->name[i]) ^ rol32(hash, 7); + + return hash; +} + +STATIC enum xfs_dacmp +xfs_ascii_ci_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + enum xfs_dacmp result; + int i; + + if (args->namelen != len) + return XFS_CMP_DIFFERENT; + + result = XFS_CMP_EXACT; + for (i = 0; i < len; i++) { + if (args->name[i] == name[i]) + continue; + if (tolower(args->name[i]) != tolower(name[i])) + return XFS_CMP_DIFFERENT; + result = XFS_CMP_CASE; + } + + return result; +} + +static struct xfs_nameops xfs_ascii_ci_nameops = { + .hashname = xfs_ascii_ci_hashname, + .compname = xfs_ascii_ci_compname, +}; + +int +xfs_da_mount( + struct xfs_mount *mp) +{ + struct xfs_da_geometry *dageo; + int nodehdr_size; + + + ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); + ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= + XFS_MAX_BLOCKSIZE); + + mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); + mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); + + nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; + mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + if (!mp->m_dir_geo || !mp->m_attr_geo) { + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); + return -ENOMEM; + } + + /* set up directory geometry */ + dageo = mp->m_dir_geo; + dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; + + /* + * Now we've set up the block conversion variables, we can calculate the + * segment block constants using the geometry structure. + */ + dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); + dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); + dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); + dageo->node_ents = (dageo->blksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; + + /* set up attribute geometry - single fsb only */ + dageo = mp->m_attr_geo; + dageo->blklog = mp->m_sb.sb_blocklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1; + dageo->node_ents = (dageo->blksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; + + if (xfs_sb_version_hasasciici(&mp->m_sb)) + mp->m_dirnameops = &xfs_ascii_ci_nameops; + else + mp->m_dirnameops = &xfs_default_nameops; + + return 0; +} + +void +xfs_da_unmount( + struct xfs_mount *mp) +{ + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); +} + +/* + * Return 1 if directory contains only "." and "..". + */ +int +xfs_dir_isempty( + xfs_inode_t *dp) +{ + xfs_dir2_sf_hdr_t *sfp; + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + if (dp->i_d.di_size == 0) /* might happen during shutdown. */ + return 1; + if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + return 0; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + return !sfp->count; +} + +/* + * Validate a given inode number. + */ +int +xfs_dir_ino_validate( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + xfs_agblock_t agblkno; + xfs_agino_t agino; + xfs_agnumber_t agno; + int ino_ok; + int ioff; + + agno = XFS_INO_TO_AGNO(mp, ino); + agblkno = XFS_INO_TO_AGBNO(mp, ino); + ioff = XFS_INO_TO_OFFSET(mp, ino); + agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff); + ino_ok = + agno < mp->m_sb.sb_agcount && + agblkno < mp->m_sb.sb_agblocks && + agblkno != 0 && + ioff < (1 << mp->m_sb.sb_inopblog) && + XFS_AGINO_TO_INO(mp, agno, agino) == ino; + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, + XFS_RANDOM_DIR_INO_VALIDATE))) { + xfs_warn(mp, "Invalid inode number 0x%Lx", + (unsigned long long) ino); + XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + return 0; +} + +/* + * Initialize a directory with its "." and ".." entries. + */ +int +xfs_dir_init( + xfs_trans_t *tp, + xfs_inode_t *dp, + xfs_inode_t *pdp) +{ + struct xfs_da_args *args; + int error; + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); + if (error) + return error; + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return -ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->dp = dp; + args->trans = tp; + error = xfs_dir2_sf_create(args, pdp->i_ino); + kmem_free(args); + return error; +} + +/* + * Enter a name in a directory, or check for available space. + * If inum is 0, only the available space test is performed. + */ +int +xfs_dir_createname( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t inum, /* new entry inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + if (inum) { + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) + return rval; + XFS_STATS_INC(xs_dir_create); + } + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return -ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + if (!inum) + args->op_flags |= XFS_DA_OP_JUSTCHECK; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); + else + rval = xfs_dir2_node_addname(args); + +out_free: + kmem_free(args); + return rval; +} + +/* + * If doing a CI lookup and case-insensitive match, dup actual name into + * args.value. Return EEXIST for success (ie. name found) or an error. + */ +int +xfs_dir_cilookup_result( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (args->cmpresult == XFS_CMP_DIFFERENT) + return -ENOENT; + if (args->cmpresult != XFS_CMP_CASE || + !(args->op_flags & XFS_DA_OP_CILOOKUP)) + return -EEXIST; + + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); + if (!args->value) + return -ENOMEM; + + memcpy(args->value, name, len); + args->valuelen = len; + return -EEXIST; +} + +/* + * Lookup a name in a directory, give back the inode number. + * If ci_name is not NULL, returns the actual name in ci_name if it differs + * to name, or ci_name->name is set to NULL for an exact match. + */ + +int +xfs_dir_lookup( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ +{ + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_lookup); + + /* + * We need to use KM_NOFS here so that lockdep will not throw false + * positive deadlock warnings on a non-transactional lookup path. It is + * safe to recurse into inode recalim in that case, but lockdep can't + * easily be taught about it. Hence KM_NOFS avoids having to add more + * lockdep Doing this avoids having to add a bunch of lockdep class + * annotations into the reclaim path for the ilock. + */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_OKNOENT; + if (ci_name) + args->op_flags |= XFS_DA_OP_CILOOKUP; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_lookup(args); + else + rval = xfs_dir2_node_lookup(args); + +out_check_rval: + if (rval == -EEXIST) + rval = 0; + if (!rval) { + *inum = args->inumber; + if (ci_name) { + ci_name->name = args->value; + ci_name->len = args->valuelen; + } + } +out_free: + kmem_free(args); + return rval; +} + +/* + * Remove an entry from a directory. + */ +int +xfs_dir_removename( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t ino, + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_remove); + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return -ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = ino; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_removename(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_removename(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_removename(args); + else + rval = xfs_dir2_node_removename(args); +out_free: + kmem_free(args); + return rval; +} + +/* + * Replace the inode number of a directory entry. + */ +int +xfs_dir_replace( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, /* name of entry to replace */ + xfs_ino_t inum, /* new inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) + return rval; + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return -ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_replace(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_replace(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_replace(args); + else + rval = xfs_dir2_node_replace(args); +out_free: + kmem_free(args); + return rval; +} + +/* + * See if this entry can be added to the directory without allocating space. + */ +int +xfs_dir_canenter( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name) /* name of entry to add */ +{ + return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0); +} + +/* + * Utility routines. + */ + +/* + * Add a block to the directory. + * + * This routine is for data and free blocks, not leaf/node blocks which are + * handled by xfs_da_grow_inode. + */ +int +xfs_dir2_grow_inode( + struct xfs_da_args *args, + int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ + xfs_dir2_db_t *dbp) /* out: block number added */ +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + xfs_fileoff_t bno; /* directory offset of new block */ + int count; /* count of filesystem blocks */ + int error; + + trace_xfs_dir2_grow_inode(args, space); + + /* + * Set lowest possible block in the space requested. + */ + bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); + count = args->geo->fsbcount; + + error = xfs_da_grow_inode_int(args, &bno, count); + if (error) + return error; + + *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); + + /* + * Update file's size if this is the data space and it grew. + */ + if (space == XFS_DIR2_DATA_SPACE) { + xfs_fsize_t size; /* directory file (data) size */ + + size = XFS_FSB_TO_B(mp, bno + count); + if (size > dp->i_d.di_size) { + dp->i_d.di_size = size; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + } + } + return 0; +} + +/* + * See if the directory is a single-block form directory. + */ +int +xfs_dir2_isblock( + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ +{ + xfs_fileoff_t last; /* last file offset */ + int rval; + + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) + return rval; + rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; + ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); + *vp = rval; + return 0; +} + +/* + * See if the directory is a single-leaf form directory. + */ +int +xfs_dir2_isleaf( + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ +{ + xfs_fileoff_t last; /* last file offset */ + int rval; + + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) + return rval; + *vp = last == args->geo->leafblk + args->geo->fsbcount; + return 0; +} + +/* + * Remove the given block from the directory. + * This routine is used for data and free blocks, leaf/node are done + * by xfs_da_shrink_inode. + */ +int +xfs_dir2_shrink_inode( + xfs_da_args_t *args, + xfs_dir2_db_t db, + struct xfs_buf *bp) +{ + xfs_fileoff_t bno; /* directory file offset */ + xfs_dablk_t da; /* directory file offset */ + int done; /* bunmap is finished */ + xfs_inode_t *dp; + int error; + xfs_mount_t *mp; + xfs_trans_t *tp; + + trace_xfs_dir2_shrink_inode(args, db); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + da = xfs_dir2_db_to_da(args->geo, db); + /* + * Unmap the fsblock(s). + */ + if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, + XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, + &done))) { + /* + * ENOSPC actually can happen if we're in a removename with + * no space reservation, and the resulting block removal + * would cause a bmap btree split or conversion from extents + * to btree. This can only happen for un-fragmented + * directory blocks, since you need to be punching out + * the middle of an extent. + * In this case we need to leave the block in the file, + * and not binval it. + * So the block has to be in a consistent empty state + * and appropriately logged. + * We don't free up the buffer, the caller can tell it + * hasn't happened since it got an error back. + */ + return error; + } + ASSERT(done); + /* + * Invalidate the buffer from the transaction. + */ + xfs_trans_binval(tp, bp); + /* + * If it's not a data block, we're done. + */ + if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)) + return 0; + /* + * If the block isn't the last one in the directory, we're done. + */ + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) + return 0; + bno = da; + if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { + /* + * This can't really happen unless there's kernel corruption. + */ + return error; + } + if (db == args->geo->datablk) + ASSERT(bno == 0); + else + ASSERT(bno > 0); + /* + * Set the size to the new last block. + */ + dp->i_d.di_size = XFS_FSB_TO_B(mp, bno); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_dir2.h b/kernel/fs/xfs/libxfs/xfs_dir2.h new file mode 100644 index 000000000..e55353651 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2.h @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_H__ +#define __XFS_DIR2_H__ + +struct xfs_bmap_free; +struct xfs_da_args; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; +struct xfs_dir2_sf_hdr; +struct xfs_dir2_sf_entry; +struct xfs_dir2_data_hdr; +struct xfs_dir2_data_entry; +struct xfs_dir2_data_unused; + +extern struct xfs_name xfs_name_dotdot; + +/* + * directory filetype conversion tables. + */ +#define S_SHIFT 12 +extern const unsigned char xfs_mode_to_ftype[]; + +/* + * directory operations vector for encode/decode routines + */ +struct xfs_dir_ops { + int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len); + struct xfs_dir2_sf_entry * + (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); + __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); + void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype); + xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); + void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino); + xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr); + void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino); + + int (*data_entsize)(int len); + __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); + void (*data_put_ftype)(struct xfs_dir2_data_entry *dep, + __uint8_t ftype); + __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep); + struct xfs_dir2_data_free * + (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr); + + xfs_dir2_data_aoff_t data_dot_offset; + xfs_dir2_data_aoff_t data_dotdot_offset; + xfs_dir2_data_aoff_t data_first_offset; + size_t data_entry_offset; + + struct xfs_dir2_data_entry * + (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_unused * + (*data_unused_p)(struct xfs_dir2_data_hdr *hdr); + + int leaf_hdr_size; + void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); + void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from); + int (*leaf_max_ents)(struct xfs_da_geometry *geo); + struct xfs_dir2_leaf_entry * + (*leaf_ents_p)(struct xfs_dir2_leaf *lp); + + int node_hdr_size; + void (*node_hdr_to_disk)(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); + void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); + struct xfs_da_node_entry * + (*node_tree_p)(struct xfs_da_intnode *dap); + + int free_hdr_size; + void (*free_hdr_to_disk)(struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from); + void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from); + int (*free_max_bests)(struct xfs_da_geometry *geo); + __be16 * (*free_bests_p)(struct xfs_dir2_free *free); + xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); + int (*db_to_fdindex)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); +}; + +extern const struct xfs_dir_ops * + xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); +extern const struct xfs_dir_ops * + xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); + +/* + * Generic directory interface routines + */ +extern void xfs_dir_startup(void); +extern int xfs_da_mount(struct xfs_mount *mp); +extern void xfs_da_unmount(struct xfs_mount *mp); + +extern int xfs_dir_isempty(struct xfs_inode *dp); +extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_inode *pdp); +extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t inum, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t *inum, + struct xfs_name *ci_name); +extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t ino, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t inum, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name); + +/* + * Direct call from the bmap code, bypassing the generic directory layer. + */ +extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); + +/* + * Interface routines used by userspace utilities + */ +extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); +extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, + struct xfs_buf *bp); + +extern void xfs_dir2_data_freescan(struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, int *loghead); +extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); +extern void xfs_dir2_data_log_header(struct xfs_da_args *args, + struct xfs_buf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup); +extern void xfs_dir2_data_make_free(struct xfs_da_args *args, + struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); +extern void xfs_dir2_data_use_free(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, + xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, + int *needlogp, int *needscanp); + +extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( + struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup); + +extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; + +/* + * Directory offset/block conversion functions. + * + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)(by >> geo->blklog); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << geo->blklog) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); +} + +/* + * Directory tail pointer accessor functions. Based on block geometry. + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + geo->blksize)) - 1; +} + +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + geo->blksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + +#endif /* __XFS_DIR2_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_block.c b/kernel/fs/xfs/libxfs/xfs_dir2_block.c new file mode 100644 index 000000000..9354e190b --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_block.c @@ -0,0 +1,1254 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_buf_item.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" + +/* + * Local function prototypes. + */ +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp, + int first, int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp, + int *entno); +static int xfs_dir2_block_sort(const void *a, const void *b); + +static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; + +/* + * One-time startup routine called from xfs_init(). + */ +void +xfs_dir_startup(void) +{ + xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1); + xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); +} + +static bool +xfs_dir3_block_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + return false; + } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; +} + +static void +xfs_dir3_block_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dir3_block_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_dir3_block_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_block_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); +} + +const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .verify_read = xfs_dir3_block_read_verify, + .verify_write = xfs_dir3_block_write_verify, +}; + +int +xfs_dir3_block_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + int err; + + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; +} + +static void +xfs_dir3_block_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *dp) +{ + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + bp->b_ops = &xfs_dir3_block_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + return; + + } + hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); +} + +static void +xfs_dir2_block_need_space( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + __be16 **tagpp, + struct xfs_dir2_data_unused **dupp, + struct xfs_dir2_data_unused **enddupp, + int *compact, + int len) +{ + struct xfs_dir2_data_free *bf; + __be16 *tagp = NULL; + struct xfs_dir2_data_unused *dup = NULL; + struct xfs_dir2_data_unused *enddup = NULL; + + *compact = 0; + bf = dp->d_ops->data_bestfree_p(hdr); + + /* + * If there are stale entries we'll use one for the leaf. + */ + if (btp->stale) { + if (be16_to_cpu(bf[0].length) >= len) { + /* + * The biggest entry enough to avoid compaction. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + goto out; + } + + /* + * Will need to compact to make this work. + * Tag just before the first leaf entry. + */ + *compact = 1; + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + goto out; + } + + /* + * no stale entries, so just use free space. + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + /* + * Check out the biggest freespace and see if it's the same one. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + if (dup != enddup) { + /* + * Not the same free entry, just check its length. + */ + if (be16_to_cpu(dup->length) < len) + dup = NULL; + goto out; + } + + /* + * It is the biggest freespace, can it hold the leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, use the second-largest entry instead if it works. + */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } +out: + *tagpp = tagp; + *dupp = dup; + *enddupp = enddup; +} + +/* + * compact the leaf entries. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. + */ +static void +xfs_dir2_block_compact( + struct xfs_da_args *args, + struct xfs_buf *bp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + int *needlog, + int *lfloghigh, + int *lfloglow) +{ + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + int needscan = 0; + int highstale; /* high stale index */ + + fromidx = toidx = be32_to_cpu(btp->count) - 1; + highstale = *lfloghigh = -1; + for (; fromidx >= 0; fromidx--) { + if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (highstale == -1) + highstale = toidx; + else { + if (*lfloghigh == -1) + *lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + *lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(args, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + needlog, &needscan); + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) + xfs_dir2_data_freescan(args->dp, hdr, needlog); +} + +/* + * Add an entry to a block directory. + */ +int /* error */ +xfs_dir2_block_addname( + xfs_da_args_t *args) /* directory op arguments */ +{ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + struct xfs_buf *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + int compact; /* need to compact leaf ents */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* directory inode */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + int error; /* error return value */ + xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */ + xfs_dahash_t hash; /* hash value of found entry */ + int high; /* high index for binary srch */ + int highstale; /* high stale index */ + int lfloghigh=0; /* last final leaf to log */ + int lfloglow=0; /* first final leaf to log */ + int len; /* length of the new entry */ + int low; /* low index for binary srch */ + int lowstale; /* low stale index */ + int mid=0; /* midpoint for binary srch */ + int needlog; /* need to log header */ + int needscan; /* need to rescan freespace */ + __be16 *tagp; /* pointer to tag value */ + xfs_trans_t *tp; /* transaction structure */ + + trace_xfs_dir2_block_addname(args); + + dp = args->dp; + tp = args->trans; + + /* Read the (one and only) directory block into bp. */ + error = xfs_dir3_block_read(tp, dp, &bp); + if (error) + return error; + + len = dp->d_ops->data_entsize(args->namelen); + + /* + * Set up pointers to parts of the block. + */ + hdr = bp->b_addr; + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + + /* + * Find out if we can reuse stale entries or whether we need extra + * space for entry and new leaf. + */ + xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup, + &enddup, &compact, len); + + /* + * Done everything we need for a space check now. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + xfs_trans_brelse(tp, bp); + if (!dup) + return -ENOSPC; + return 0; + } + + /* + * If we don't have space for the new entry & leaf ... + */ + if (!dup) { + /* Don't have a space reservation: return no-space. */ + if (args->total == 0) + return -ENOSPC; + /* + * Convert to the next larger format. + * Then add the new entry in that format. + */ + error = xfs_dir2_block_to_leaf(args, bp); + if (error) + return error; + return xfs_dir2_leaf_addname(args); + } + + needlog = needscan = 0; + + /* + * If need to compact the leaf entries, do it now. + */ + if (compact) { + xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog, + &lfloghigh, &lfloglow); + /* recalculate blp post-compaction */ + blp = xfs_dir2_block_leaf_p(btp); + } else if (btp->stale) { + /* + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. + */ + lfloglow = be32_to_cpu(btp->count); + lfloghigh = -1; + } + + /* + * Find the slot that's first lower than our hash value, -1 if none. + */ + for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) { + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) + break; + if (hash < args->hashval) + low = mid + 1; + else + high = mid - 1; + } + while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) { + mid--; + } + /* + * No stale entries, will use enddup space to hold new leaf. + */ + if (!btp->stale) { + /* + * Mark the space needed for the new leaf entry, now in use. + */ + xfs_dir2_data_use_free(args, bp, enddup, + (xfs_dir2_data_aoff_t) + ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - + sizeof(*blp)), + (xfs_dir2_data_aoff_t)sizeof(*blp), + &needlog, &needscan); + /* + * Update the tail (entry count). + */ + be32_add_cpu(&btp->count, 1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) { + xfs_dir2_data_freescan(dp, hdr, &needlog); + needscan = 0; + } + /* + * Adjust pointer to the first leaf entry, we're about to move + * the table up one to open up space for the new leaf entry. + * Then adjust our index to match. + */ + blp--; + mid++; + if (mid) + memmove(blp, &blp[1], mid * sizeof(*blp)); + lfloglow = 0; + lfloghigh = mid; + } + /* + * Use a stale leaf for our new entry. + */ + else { + for (lowstale = mid; + lowstale >= 0 && + blp[lowstale].address != + cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + lowstale--) + continue; + for (highstale = mid + 1; + highstale < be32_to_cpu(btp->count) && + blp[highstale].address != + cpu_to_be32(XFS_DIR2_NULL_DATAPTR) && + (lowstale < 0 || mid - lowstale > highstale - mid); + highstale++) + continue; + /* + * Move entries toward the low-numbered stale entry. + */ + if (lowstale >= 0 && + (highstale == be32_to_cpu(btp->count) || + mid - lowstale <= highstale - mid)) { + if (mid - lowstale) + memmove(&blp[lowstale], &blp[lowstale + 1], + (mid - lowstale) * sizeof(*blp)); + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(mid, lfloghigh); + } + /* + * Move entries toward the high-numbered stale entry. + */ + else { + ASSERT(highstale < be32_to_cpu(btp->count)); + mid++; + if (highstale - mid) + memmove(&blp[mid + 1], &blp[mid], + (highstale - mid) * sizeof(*blp)); + lfloglow = MIN(mid, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + be32_add_cpu(&btp->stale, -1); + } + /* + * Point to the new data entry. + */ + dep = (xfs_dir2_data_entry_t *)dup; + /* + * Fill in the leaf entry. + */ + blp[mid].hashval = cpu_to_be32(args->hashval); + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( + (char *)dep - (char *)hdr)); + xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); + /* + * Mark space for the data entry used. + */ + xfs_dir2_data_use_free(args, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), + (xfs_dir2_data_aoff_t)len, &needlog, &needscan); + /* + * Create the new data entry. + */ + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, args->namelen); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + /* + * Clean up the bestfree array and log the header, tail, and entry. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + if (needlog) + xfs_dir2_data_log_header(args, bp); + xfs_dir2_block_log_tail(tp, bp); + xfs_dir2_data_log_entry(args, bp, dep); + xfs_dir3_data_check(dp, bp); + return 0; +} + +/* + * Log leaf entries from the block. + */ +static void +xfs_dir2_block_log_leaf( + xfs_trans_t *tp, /* transaction structure */ + struct xfs_buf *bp, /* block buffer */ + int first, /* index of first logged leaf */ + int last) /* index of last logged leaf */ +{ + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + xfs_dir2_leaf_entry_t *blp; + xfs_dir2_block_tail_t *btp; + + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), + (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); +} + +/* + * Log the block tail. + */ +static void +xfs_dir2_block_log_tail( + xfs_trans_t *tp, /* transaction structure */ + struct xfs_buf *bp) /* block buffer */ +{ + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + xfs_dir2_block_tail_t *btp; + + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); + xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), + (uint)((char *)(btp + 1) - (char *)hdr - 1)); +} + +/* + * Look up an entry in the block. This is the external routine, + * xfs_dir2_block_lookup_int does the real work. + */ +int /* error */ +xfs_dir2_block_lookup( + xfs_da_args_t *args) /* dir lookup arguments */ +{ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + struct xfs_buf *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* entry index */ + int error; /* error return value */ + + trace_xfs_dir2_block_lookup(args); + + /* + * Get the buffer, look up the entry. + * If not found (ENOENT) then return, have no buffer. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) + return error; + dp = args->dp; + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Get the offset from the leaf entry, to point to the data. + */ + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); + /* + * Fill in inode number, CI name if appropriate, release the block. + */ + args->inumber = be64_to_cpu(dep->inumber); + args->filetype = dp->d_ops->data_get_ftype(dep); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_trans_brelse(args->trans, bp); + return error; +} + +/* + * Internal block lookup routine. + */ +static int /* error */ +xfs_dir2_block_lookup_int( + xfs_da_args_t *args, /* dir lookup arguments */ + struct xfs_buf **bpp, /* returned block buffer */ + int *entno) /* returned entry number */ +{ + xfs_dir2_dataptr_t addr; /* data entry address */ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + struct xfs_buf *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int error; /* error return value */ + xfs_dahash_t hash; /* found hash value */ + int high; /* binary search high index */ + int low; /* binary search low index */ + int mid; /* binary search current idx */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + + error = xfs_dir3_block_read(tp, dp, &bp); + if (error) + return error; + + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Loop doing a binary search for our hash value. + * Find our entry, ENOENT if it's not there. + */ + for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) { + ASSERT(low <= high); + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) + break; + if (hash < args->hashval) + low = mid + 1; + else + high = mid - 1; + if (low > high) { + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + xfs_trans_brelse(tp, bp); + return -ENOENT; + } + } + /* + * Back up to the first one with the right hash value. + */ + while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) { + mid--; + } + /* + * Now loop forward through all the entries with the + * right hash value looking for our name. + */ + do { + if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get pointer to the entry from the leaf. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr)); + /* + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + *bpp = bp; + *entno = mid; + if (cmp == XFS_CMP_EXACT) + return 0; + } + } while (++mid < be32_to_cpu(btp->count) && + be32_to_cpu(blp[mid].hashval) == hash); + + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was found earlier, return success. + */ + if (args->cmpresult == XFS_CMP_CASE) + return 0; + /* + * No match, release the buffer and return ENOENT. + */ + xfs_trans_brelse(tp, bp); + return -ENOENT; +} + +/* + * Remove an entry from a block format directory. + * If that makes the block small enough to fit in shortform, transform it. + */ +int /* error */ +xfs_dir2_block_removename( + xfs_da_args_t *args) /* directory operation args */ +{ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ + struct xfs_buf *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* block leaf entry index */ + int error; /* error return value */ + int needlog; /* need to log block header */ + int needscan; /* need to fixup bestfree */ + xfs_dir2_sf_hdr_t sfh; /* shortform header */ + int size; /* shortform size */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_block_removename(args); + + /* + * Look up the entry in the block. Gets the buffer and entry index. + * It will always be there, the vnodeops level does a lookup first. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { + return error; + } + dp = args->dp; + tp = args->trans; + hdr = bp->b_addr; + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Point to the data entry using the leaf entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); + /* + * Mark the data entry's space free. + */ + needlog = needscan = 0; + xfs_dir2_data_make_free(args, bp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + /* + * Fix up the block tail. + */ + be32_add_cpu(&btp->stale, 1); + xfs_dir2_block_log_tail(tp, bp); + /* + * Remove the leaf entry by marking it stale. + */ + blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir2_block_log_leaf(tp, bp, ent, ent); + /* + * Fix up bestfree, log the header if necessary. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + if (needlog) + xfs_dir2_data_log_header(args, bp); + xfs_dir3_data_check(dp, bp); + /* + * See if the size as a shortform is good enough. + */ + size = xfs_dir2_block_sfsize(dp, hdr, &sfh); + if (size > XFS_IFORK_DSIZE(dp)) + return 0; + + /* + * If it works, do the conversion. + */ + return xfs_dir2_block_to_sf(args, bp, size, &sfh); +} + +/* + * Replace an entry in a V2 block directory. + * Change the inode number to the new value. + */ +int /* error */ +xfs_dir2_block_replace( + xfs_da_args_t *args) /* directory operation args */ +{ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + struct xfs_buf *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* leaf entry index */ + int error; /* error return value */ + + trace_xfs_dir2_block_replace(args); + + /* + * Lookup the entry in the directory. Get buffer and entry index. + * This will always succeed since the caller has already done a lookup. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { + return error; + } + dp = args->dp; + hdr = bp->b_addr; + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Point to the data entry we need to change. + */ + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); + ASSERT(be64_to_cpu(dep->inumber) != args->inumber); + /* + * Change the inode number to the new value. + */ + dep->inumber = cpu_to_be64(args->inumber); + dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, bp, dep); + xfs_dir3_data_check(dp, bp); + return 0; +} + +/* + * Qsort comparison routine for the block leaf entries. + */ +static int /* sort order */ +xfs_dir2_block_sort( + const void *a, /* first leaf entry */ + const void *b) /* second leaf entry */ +{ + const xfs_dir2_leaf_entry_t *la; /* first leaf entry */ + const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */ + + la = a; + lb = b; + return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 : + (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0); +} + +/* + * Convert a V2 leaf directory to a V2 block directory if possible. + */ +int /* error */ +xfs_dir2_leaf_to_block( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *lbp, /* leaf buffer */ + struct xfs_buf *dbp) /* data buffer */ +{ + __be16 *bestsp; /* leaf bests table */ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data entry */ + int error; /* error return value */ + int from; /* leaf from index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* file system mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to scan for bestfree */ + xfs_dir2_sf_hdr_t sfh; /* shortform header */ + int size; /* bytes used */ + __be16 *tagp; /* end of entry (tag) */ + int to; /* block/leaf to index */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_leaf_to_block(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = lbp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || + leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); + /* + * If there are data blocks other than the first one, take this + * opportunity to remove trailing empty data blocks that may have + * been left behind during no-space-reservation operations. + * These will show up in the leaf bests table. + */ + while (dp->i_d.di_size > args->geo->blksize) { + int hdrsz; + + hdrsz = dp->d_ops->data_entry_offset; + bestsp = xfs_dir2_leaf_bests_p(ltp); + if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == + args->geo->blksize - hdrsz) { + if ((error = + xfs_dir2_leaf_trim_data(args, lbp, + (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) + return error; + } else + return 0; + } + /* + * Read the data block if we don't already have it, give up if it fails. + */ + if (!dbp) { + error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp); + if (error) + return error; + } + hdr = dbp->b_addr; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + + /* + * Size of the "leaf" area in the block. + */ + size = (uint)sizeof(xfs_dir2_block_tail_t) + + (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale); + /* + * Look at the last data entry. + */ + tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1; + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + /* + * If it's not free or is too short we can't do it. + */ + if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG || + be16_to_cpu(dup->length) < size) + return 0; + + /* + * Start converting it to block form. + */ + xfs_dir3_block_init(mp, tp, dbp, dp); + + needlog = 1; + needscan = 0; + /* + * Use up the space at the end of the block (blp/btp). + */ + xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size, + &needlog, &needscan); + /* + * Initialize the block tail. + */ + btp = xfs_dir2_block_tail_p(args->geo, hdr); + btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); + btp->stale = 0; + xfs_dir2_block_log_tail(tp, dbp); + /* + * Initialize the block leaf area. We compact out stale entries. + */ + lep = xfs_dir2_block_leaf_p(btp); + for (from = to = 0; from < leafhdr.count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + continue; + lep[to++] = ents[from]; + } + ASSERT(to == be32_to_cpu(btp->count)); + xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); + /* + * Scan the bestfree if we need it and log the data block header. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + if (needlog) + xfs_dir2_data_log_header(args, dbp); + /* + * Pitch the old leaf block. + */ + error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp); + if (error) + return error; + + /* + * Now see if the resulting block can be shrunken to shortform. + */ + size = xfs_dir2_block_sfsize(dp, hdr, &sfh); + if (size > XFS_IFORK_DSIZE(dp)) + return 0; + + return xfs_dir2_block_to_sf(args, dbp, size, &sfh); +} + +/* + * Convert the shortform directory to block form. + */ +int /* error */ +xfs_dir2_sf_to_block( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dir2_db_t blkno; /* dir-relative block # (0) */ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + struct xfs_buf *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + int dummy; /* trash */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + int endoffset; /* end of data objects */ + int error; /* error return value */ + int i; /* index */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to scan block freespc */ + int newoffset; /* offset from current entry */ + int offset; /* target block offset */ + xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ + xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ + xfs_dir2_sf_hdr_t *sfp; /* shortform header */ + __be16 *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_name name; + struct xfs_ifork *ifp; + + trace_xfs_dir2_sf_to_block(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + ASSERT(ifp->if_flags & XFS_IFINLINE); + /* + * Bomb out if the shortform directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return -EIO; + } + + oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; + + ASSERT(ifp->if_bytes == dp->i_d.di_size); + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); + ASSERT(dp->i_d.di_nextents == 0); + + /* + * Copy the directory into a temporary buffer. + * Then pitch the incore inode data so we can make extents. + */ + sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); + memcpy(sfp, oldsfp, ifp->if_bytes); + + xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); + xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK); + dp->i_d.di_size = 0; + + /* + * Add block 0 to the inode. + */ + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); + if (error) { + kmem_free(sfp); + return error; + } + /* + * Initialize the data block, then convert it to block format. + */ + error = xfs_dir3_data_init(args, blkno, &bp); + if (error) { + kmem_free(sfp); + return error; + } + xfs_dir3_block_init(mp, tp, bp, dp); + hdr = bp->b_addr; + + /* + * Compute size of block "tail" area. + */ + i = (uint)sizeof(*btp) + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); + /* + * The whole thing is initialized to free by the init routine. + * Say we're using the leaf and tail area. + */ + dup = dp->d_ops->data_unused_p(hdr); + needlog = needscan = 0; + xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, + i, &needlog, &needscan); + ASSERT(needscan == 0); + /* + * Fill in the tail. + */ + btp = xfs_dir2_block_tail_p(args->geo, hdr); + btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */ + btp->stale = 0; + blp = xfs_dir2_block_leaf_p(btp); + endoffset = (uint)((char *)blp - (char *)hdr); + /* + * Remove the freespace, we'll manage it. + */ + xfs_dir2_data_use_free(args, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), + be16_to_cpu(dup->length), &needlog, &needscan); + /* + * Create entry for . + */ + dep = dp->d_ops->data_dot_entry_p(hdr); + dep->inumber = cpu_to_be64(dp->i_ino); + dep->namelen = 1; + dep->name[0] = '.'; + dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, bp, dep); + blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( + (char *)dep - (char *)hdr)); + /* + * Create entry for .. + */ + dep = dp->d_ops->data_dotdot_entry_p(hdr); + dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp)); + dep->namelen = 2; + dep->name[0] = dep->name[1] = '.'; + dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, bp, dep); + blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( + (char *)dep - (char *)hdr)); + offset = dp->d_ops->data_first_offset; + /* + * Loop over existing entries, stuff them in. + */ + i = 0; + if (!sfp->count) + sfep = NULL; + else + sfep = xfs_dir2_sf_firstentry(sfp); + /* + * Need to preserve the existing offset values in the sf directory. + * Insert holes (unused entries) where necessary. + */ + while (offset < endoffset) { + /* + * sfep is null when we reach the end of the list. + */ + if (sfep == NULL) + newoffset = endoffset; + else + newoffset = xfs_dir2_sf_get_offset(sfep); + /* + * There should be a hole here, make one. + */ + if (offset < newoffset) { + dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); + dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + dup->length = cpu_to_be16(newoffset - offset); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( + ((char *)dup - (char *)hdr)); + xfs_dir2_data_log_unused(args, bp, dup); + xfs_dir2_data_freeinsert(hdr, + dp->d_ops->data_bestfree_p(hdr), + dup, &dummy); + offset += be16_to_cpu(dup->length); + continue; + } + /* + * Copy a real entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); + dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep)); + dep->namelen = sfep->namelen; + dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep)); + memcpy(dep->name, sfep->name, dep->namelen); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, bp, dep); + name.name = sfep->name; + name.len = sfep->namelen; + blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> + hashname(&name)); + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( + (char *)dep - (char *)hdr)); + offset = (int)((char *)(tagp + 1) - (char *)hdr); + if (++i == sfp->count) + sfep = NULL; + else + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + /* Done with the temporary buffer */ + kmem_free(sfp); + /* + * Sort the leaf entries by hash value. + */ + xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort); + /* + * Log the leaf entry area and tail. + * Already logged the header in data_init, ignore needlog. + */ + ASSERT(needscan == 0); + xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); + xfs_dir2_block_log_tail(tp, bp); + xfs_dir3_data_check(dp, bp); + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_data.c b/kernel/fs/xfs/libxfs/xfs_dir2_data.c new file mode 100644 index 000000000..de1ea16f5 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_data.c @@ -0,0 +1,1049 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + +/* + * Check the consistency of the data block. + * The input can also be a block-format directory. + * Return 0 is the buffer is good, otherwise an error. + */ +int +__xfs_dir3_data_check( + struct xfs_inode *dp, /* incore inode pointer */ + struct xfs_buf *bp) /* data block's buffer */ +{ + xfs_dir2_dataptr_t addr; /* addr for leaf lookup */ + xfs_dir2_data_free_t *bf; /* bestfree table */ + xfs_dir2_block_tail_t *btp=NULL; /* block tail */ + int count; /* count of entries found */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + char *endp; /* end of useful data */ + int freeseen; /* mask of bestfrees seen */ + xfs_dahash_t hash; /* hash of current name */ + int i; /* leaf index */ + int lastfree; /* last entry was unused */ + xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */ + xfs_mount_t *mp; /* filesystem mount point */ + char *p; /* current data position */ + int stale; /* count of stale leaves */ + struct xfs_name name; + const struct xfs_dir_ops *ops; + struct xfs_da_geometry *geo; + + mp = bp->b_target->bt_mount; + geo = mp->m_dir_geo; + + /* + * We can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + + hdr = bp->b_addr; + p = (char *)ops->data_entry_p(hdr); + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + btp = xfs_dir2_block_tail_p(geo, hdr); + lep = xfs_dir2_block_leaf_p(btp); + endp = (char *)lep; + + /* + * The number of leaf entries is limited by the size of the + * block and the amount of space used by the data entries. + * We don't know how much space is used by the data entries yet, + * so just ensure that the count falls somewhere inside the + * block right now. + */ + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) < + ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); + break; + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + endp = (char *)hdr + geo->blksize; + break; + default: + XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + /* + * Account for zero bestfree entries. + */ + bf = ops->data_bestfree_p(hdr); + count = lastfree = freeseen = 0; + if (!bf[0].length) { + XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset); + freeseen |= 1 << 0; + } + if (!bf[1].length) { + XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset); + freeseen |= 1 << 1; + } + if (!bf[2].length) { + XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset); + freeseen |= 1 << 2; + } + + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >= + be16_to_cpu(bf[1].length)); + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >= + be16_to_cpu(bf[2].length)); + /* + * Loop over the data/unused entries. + */ + while (p < endp) { + dup = (xfs_dir2_data_unused_t *)p; + /* + * If it's unused, look for the space in the bestfree table. + * If we find it, account for that, else make sure it + * doesn't need to be there. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); + XFS_WANT_CORRUPTED_RETURN(mp, + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)hdr); + dfp = xfs_dir2_data_freefind(hdr, bf, dup); + if (dfp) { + i = (int)(dfp - bf); + XFS_WANT_CORRUPTED_RETURN(mp, + (freeseen & (1 << i)) == 0); + freeseen |= 1 << i; + } else { + XFS_WANT_CORRUPTED_RETURN(mp, + be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); + } + p += be16_to_cpu(dup->length); + lastfree = 1; + continue; + } + /* + * It's a real entry. Validate the fields. + * If this is a block directory then make sure it's + * in the leaf section of the block. + * The linear search is crude but this is DEBUG code. + */ + dep = (xfs_dir2_data_entry_t *)p; + XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN(mp, + !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN(mp, + be16_to_cpu(*ops->data_entry_tag_p(dep)) == + (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(mp, + ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); + count++; + lastfree = 0; + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)hdr)); + name.name = dep->name; + name.len = dep->namelen; + hash = mp->m_dirnameops->hashname(&name); + for (i = 0; i < be32_to_cpu(btp->count); i++) { + if (be32_to_cpu(lep[i].address) == addr && + be32_to_cpu(lep[i].hashval) == hash) + break; + } + XFS_WANT_CORRUPTED_RETURN(mp, + i < be32_to_cpu(btp->count)); + } + p += ops->data_entsize(dep->namelen); + } + /* + * Need to have seen all the entries and all the bestfree slots. + */ + XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { + if (lep[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + if (i > 0) + XFS_WANT_CORRUPTED_RETURN(mp, + be32_to_cpu(lep[i].hashval) >= + be32_to_cpu(lep[i - 1].hashval)); + } + XFS_WANT_CORRUPTED_RETURN(mp, count == + be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale)); + } + return 0; +} + +static bool +xfs_dir3_data_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + return false; + } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; +} + +/* + * Readahead of the first block of the directory when it is opened is completely + * oblivious to the format of the directory. Hence we can either get a block + * format buffer or a data format buffer on readahead. + */ +static void +xfs_dir3_data_reada_verify( + struct xfs_buf *bp) +{ + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + bp->b_ops = &xfs_dir3_block_buf_ops; + bp->b_ops->verify_read(bp); + return; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + xfs_dir3_data_verify(bp); + return; + default: + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + break; + } +} + +static void +xfs_dir3_data_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dir3_data_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_dir3_data_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_data_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); +} + +const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .verify_read = xfs_dir3_data_read_verify, + .verify_write = xfs_dir3_data_write_verify, +}; + +static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .verify_read = xfs_dir3_data_reada_verify, + .verify_write = xfs_dir3_data_write_verify, +}; + + +int +xfs_dir3_data_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir3_data_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + return err; +} + +int +xfs_dir3_data_readahead( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno) +{ + return xfs_da_reada_buf(dp, bno, mapped_bno, + XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); +} + +/* + * Given a data block and an unused entry from that block, + * return the bestfree entry if any that corresponds to it. + */ +xfs_dir2_data_free_t * +xfs_dir2_data_freefind( + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup) /* unused space */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_aoff_t off; /* offset value needed */ +#ifdef DEBUG + int matched; /* matched the value */ + int seenzero; /* saw a 0 bestfree entry */ +#endif + + off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + +#ifdef DEBUG + /* + * Validate some consistency in the bestfree table. + * Check order, non-overlapping entries, and if we find the + * one we're looking for it has to be exact. + */ + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + for (dfp = &bf[0], seenzero = matched = 0; + dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; + dfp++) { + if (!dfp->offset) { + ASSERT(!dfp->length); + seenzero = 1; + continue; + } + ASSERT(seenzero == 0); + if (be16_to_cpu(dfp->offset) == off) { + matched = 1; + ASSERT(dfp->length == dup->length); + } else if (off < be16_to_cpu(dfp->offset)) + ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset)); + else + ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); + ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); + if (dfp > &bf[0]) + ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); + } +#endif + /* + * If this is smaller than the smallest bestfree entry, + * it can't be there since they're sorted. + */ + if (be16_to_cpu(dup->length) < + be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) + return NULL; + /* + * Look at the three bestfree entries for our guy. + */ + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { + if (!dfp->offset) + return NULL; + if (be16_to_cpu(dfp->offset) == off) + return dfp; + } + /* + * Didn't find it. This only happens if there are duplicate lengths. + */ + return NULL; +} + +/* + * Insert an unused-space entry into the bestfree table. + */ +xfs_dir2_data_free_t * /* entry inserted */ +xfs_dir2_data_freeinsert( + struct xfs_dir2_data_hdr *hdr, /* data block pointer */ + struct xfs_dir2_data_free *dfp, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup, /* unused space */ + int *loghead) /* log the data header (out) */ +{ + xfs_dir2_data_free_t new; /* new bestfree entry */ + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + new.length = dup->length; + new.offset = cpu_to_be16((char *)dup - (char *)hdr); + + /* + * Insert at position 0, 1, or 2; or not at all. + */ + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) { + dfp[2] = dfp[1]; + dfp[1] = dfp[0]; + dfp[0] = new; + *loghead = 1; + return &dfp[0]; + } + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) { + dfp[2] = dfp[1]; + dfp[1] = new; + *loghead = 1; + return &dfp[1]; + } + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) { + dfp[2] = new; + *loghead = 1; + return &dfp[2]; + } + return NULL; +} + +/* + * Remove a bestfree entry from the table. + */ +STATIC void +xfs_dir2_data_freeremove( + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */ + int *loghead) /* out: log data header */ +{ + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + /* + * It's the first entry, slide the next 2 up. + */ + if (dfp == &bf[0]) { + bf[0] = bf[1]; + bf[1] = bf[2]; + } + /* + * It's the second entry, slide the 3rd entry up. + */ + else if (dfp == &bf[1]) + bf[1] = bf[2]; + /* + * Must be the last entry. + */ + else + ASSERT(dfp == &bf[2]); + /* + * Clear the 3rd entry, must be zero now. + */ + bf[2].length = 0; + bf[2].offset = 0; + *loghead = 1; +} + +/* + * Given a data block, reconstruct its bestfree map. + */ +void +xfs_dir2_data_freescan( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) +{ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* active data entry */ + xfs_dir2_data_unused_t *dup; /* unused data entry */ + struct xfs_dir2_data_free *bf; + char *endp; /* end of block's data */ + char *p; /* current entry pointer */ + struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + /* + * Start by clearing the table. + */ + bf = dp->d_ops->data_bestfree_p(hdr); + memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); + *loghead = 1; + /* + * Set up pointers. + */ + p = (char *)dp->d_ops->data_entry_p(hdr); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + btp = xfs_dir2_block_tail_p(geo, hdr); + endp = (char *)xfs_dir2_block_leaf_p(btp); + } else + endp = (char *)hdr + geo->blksize; + /* + * Loop over the block's entries. + */ + while (p < endp) { + dup = (xfs_dir2_data_unused_t *)p; + /* + * If it's a free entry, insert it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ASSERT((char *)dup - (char *)hdr == + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + xfs_dir2_data_freeinsert(hdr, bf, dup, loghead); + p += be16_to_cpu(dup->length); + } + /* + * For active entries, check their tags and skip them. + */ + else { + dep = (xfs_dir2_data_entry_t *)p; + ASSERT((char *)dep - (char *)hdr == + be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); + p += dp->d_ops->data_entsize(dep->namelen); + } + } +} + +/* + * Initialize a data block at the given block number in the directory. + * Give back the buffer for the created block. + */ +int /* error */ +xfs_dir3_data_init( + xfs_da_args_t *args, /* directory operation args */ + xfs_dir2_db_t blkno, /* logical dir block number */ + struct xfs_buf **bpp) /* output block buffer */ +{ + struct xfs_buf *bp; /* block buffer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + struct xfs_dir2_data_free *bf; + int error; /* error return value */ + int i; /* bestfree index */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + int t; /* temp */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Get the buffer set up for the block. + */ + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + bp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); + + /* + * Initialize the header. + */ + hdr = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + } else + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + + bf = dp->d_ops->data_bestfree_p(hdr); + bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset); + for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { + bf[i].length = 0; + bf[i].offset = 0; + } + + /* + * Set up an unused entry for the block's body. + */ + dup = dp->d_ops->data_unused_p(hdr); + dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + + t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset; + bf[0].length = cpu_to_be16(t); + dup->length = cpu_to_be16(t); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); + /* + * Log it and return it. + */ + xfs_dir2_data_log_header(args, bp); + xfs_dir2_data_log_unused(args, bp, dup); + *bpp = bp; + return 0; +} + +/* + * Log an active data entry from the block. + */ +void +xfs_dir2_data_log_entry( + struct xfs_da_args *args, + struct xfs_buf *bp, + xfs_dir2_data_entry_t *dep) /* data entry pointer */ +{ + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr), + (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) - + (char *)hdr - 1)); +} + +/* + * Log a data block header. + */ +void +xfs_dir2_data_log_header( + struct xfs_da_args *args, + struct xfs_buf *bp) +{ +#ifdef DEBUG + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); +#endif + + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->data_entry_offset - 1); +} + +/* + * Log a data unused entry. + */ +void +xfs_dir2_data_log_unused( + struct xfs_da_args *args, + struct xfs_buf *bp, + xfs_dir2_data_unused_t *dup) /* data unused pointer */ +{ + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + /* + * Log the first part of the unused entry. + */ + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr), + (uint)((char *)&dup->length + sizeof(dup->length) - + 1 - (char *)hdr)); + /* + * Log the end (tag) of the unused entry. + */ + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + + sizeof(xfs_dir2_data_off_t) - 1)); +} + +/* + * Make a byte range in the data block unused. + * Its current contents are unimportant. + */ +void +xfs_dir2_data_make_free( + struct xfs_da_args *args, + struct xfs_buf *bp, + xfs_dir2_data_aoff_t offset, /* starting byte offset */ + xfs_dir2_data_aoff_t len, /* length in bytes */ + int *needlogp, /* out: log header */ + int *needscanp) /* out: regen bestfree */ +{ + xfs_dir2_data_hdr_t *hdr; /* data block pointer */ + xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + char *endptr; /* end of data area */ + int needscan; /* need to regen bestfree */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *postdup; /* unused entry after us */ + xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + struct xfs_dir2_data_free *bf; + + hdr = bp->b_addr; + + /* + * Figure out where the end of the data area is. + */ + if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + endptr = (char *)hdr + args->geo->blksize; + else { + xfs_dir2_block_tail_t *btp; /* block tail */ + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + } + /* + * If this isn't the start of the block, then back up to + * the previous entry and see if it's free. + */ + if (offset > args->dp->d_ops->data_entry_offset) { + __be16 *tagp; /* tag just before us */ + + tagp = (__be16 *)((char *)hdr + offset) - 1; + prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG) + prevdup = NULL; + } else + prevdup = NULL; + /* + * If this isn't the end of the block, see if the entry after + * us is free. + */ + if ((char *)hdr + offset + len < endptr) { + postdup = + (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); + if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG) + postdup = NULL; + } else + postdup = NULL; + ASSERT(*needscanp == 0); + needscan = 0; + /* + * Previous and following entries are both free, + * merge everything into a single free entry. + */ + bf = args->dp->d_ops->data_bestfree_p(hdr); + if (prevdup && postdup) { + xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ + + /* + * See if prevdup and/or postdup are in bestfree table. + */ + dfp = xfs_dir2_data_freefind(hdr, bf, prevdup); + dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup); + /* + * We need a rescan unless there are exactly 2 free entries + * namely our two. Then we know what's happening, otherwise + * since the third bestfree is there, there might be more + * entries. + */ + needscan = (bf[2].length != 0); + /* + * Fix up the new big freespace. + */ + be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); + *xfs_dir2_data_unused_tag_p(prevdup) = + cpu_to_be16((char *)prevdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, prevdup); + if (!needscan) { + /* + * Has to be the case that entries 0 and 1 are + * dfp and dfp2 (don't know which is which), and + * entry 2 is empty. + * Remove entry 1 first then entry 0. + */ + ASSERT(dfp && dfp2); + if (dfp == &bf[1]) { + dfp = &bf[0]; + ASSERT(dfp2 == dfp); + dfp2 = &bf[1]; + } + xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + /* + * Now insert the new entry. + */ + dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup, + needlogp); + ASSERT(dfp == &bf[0]); + ASSERT(dfp->length == prevdup->length); + ASSERT(!dfp[1].length); + ASSERT(!dfp[2].length); + } + } + /* + * The entry before us is free, merge with it. + */ + else if (prevdup) { + dfp = xfs_dir2_data_freefind(hdr, bf, prevdup); + be16_add_cpu(&prevdup->length, len); + *xfs_dir2_data_unused_tag_p(prevdup) = + cpu_to_be16((char *)prevdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, prevdup); + /* + * If the previous entry was in the table, the new entry + * is longer, so it will be in the table too. Remove + * the old one and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp); + } + /* + * Otherwise we need a scan if the new entry is big enough. + */ + else { + needscan = be16_to_cpu(prevdup->length) > + be16_to_cpu(bf[2].length); + } + } + /* + * The following entry is free, merge with it. + */ + else if (postdup) { + dfp = xfs_dir2_data_freefind(hdr, bf, postdup); + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); + /* + * If the following entry was in the table, the new entry + * is longer, so it will be in the table too. Remove + * the old one and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); + } + /* + * Otherwise we need a scan if the new entry is big enough. + */ + else { + needscan = be16_to_cpu(newdup->length) > + be16_to_cpu(bf[2].length); + } + } + /* + * Neither neighbor is free. Make a new entry. + */ + else { + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(len); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); + xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); + } + *needscanp = needscan; +} + +/* + * Take a byte range out of an existing unused space and make it un-free. + */ +void +xfs_dir2_data_use_free( + struct xfs_da_args *args, + struct xfs_buf *bp, + xfs_dir2_data_unused_t *dup, /* unused entry */ + xfs_dir2_data_aoff_t offset, /* starting offset to use */ + xfs_dir2_data_aoff_t len, /* length to use */ + int *needlogp, /* out: need to log header */ + int *needscanp) /* out: need regen bestfree */ +{ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + int matchback; /* matches end of freespace */ + int matchfront; /* matches start of freespace */ + int needscan; /* need to regen bestfree */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ + int oldlen; /* old unused entry's length */ + struct xfs_dir2_data_free *bf; + + hdr = bp->b_addr; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); + ASSERT(offset >= (char *)dup - (char *)hdr); + ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); + ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + /* + * Look up the entry in the bestfree table. + */ + oldlen = be16_to_cpu(dup->length); + bf = args->dp->d_ops->data_bestfree_p(hdr); + dfp = xfs_dir2_data_freefind(hdr, bf, dup); + ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); + /* + * Check for alignment with front and back of the entry. + */ + matchfront = (char *)dup - (char *)hdr == offset; + matchback = (char *)dup + oldlen - (char *)hdr == offset + len; + ASSERT(*needscanp == 0); + needscan = 0; + /* + * If we matched it exactly we just need to get rid of it from + * the bestfree table. + */ + if (matchfront && matchback) { + if (dfp) { + needscan = (bf[2].offset != 0); + if (!needscan) + xfs_dir2_data_freeremove(hdr, bf, dfp, + needlogp); + } + } + /* + * We match the first part of the entry. + * Make a new entry with the remaining freespace. + */ + else if (matchfront) { + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(oldlen - len); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); + /* + * If it was in the table, remove it and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); + ASSERT(dfp != NULL); + ASSERT(dfp->length == newdup->length); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); + /* + * If we got inserted at the last slot, + * that means we don't know if there was a better + * choice for the last slot, or not. Rescan. + */ + needscan = dfp == &bf[2]; + } + } + /* + * We match the last part of the entry. + * Trim the allocated space off the tail of the entry. + */ + else if (matchback) { + newdup = dup; + newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); + /* + * If it was in the table, remove it and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); + ASSERT(dfp != NULL); + ASSERT(dfp->length == newdup->length); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); + /* + * If we got inserted at the last slot, + * that means we don't know if there was a better + * choice for the last slot, or not. Rescan. + */ + needscan = dfp == &bf[2]; + } + } + /* + * Poking out the middle of an entry. + * Make two new entries. + */ + else { + newdup = dup; + newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); + newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); + newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); + *xfs_dir2_data_unused_tag_p(newdup2) = + cpu_to_be16((char *)newdup2 - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup2); + /* + * If the old entry was in the table, we need to scan + * if the 3rd entry was valid, since these entries + * are smaller than the old one. + * If we don't need to scan that means there were 1 or 2 + * entries in the table, and removing the old and adding + * the 2 new will work. + */ + if (dfp) { + needscan = (bf[2].length != 0); + if (!needscan) { + xfs_dir2_data_freeremove(hdr, bf, dfp, + needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup2, + needlogp); + } + } + } + *needscanp = needscan; +} diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c b/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c new file mode 100644 index 000000000..106119955 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -0,0 +1,1819 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + +/* + * Local function declarations. + */ +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, + int *indexp, struct xfs_buf **dbpp); +static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, + struct xfs_buf *bp); + +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. + */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(dp, bp) \ +do { \ + if (!xfs_dir3_leaf1_check((dp), (bp))) \ + ASSERT(0); \ +} while (0); + +STATIC bool +xfs_dir3_leaf1_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(dp, bp) +#endif + +bool +xfs_dir3_leaf_check_int( + struct xfs_mount *mp, + struct xfs_inode *dp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) +{ + struct xfs_dir2_leaf_entry *ents; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; + const struct xfs_dir_ops *ops; + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_da_geometry *geo = mp->m_dir_geo; + + /* + * we can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + + if (!hdr) { + ops->leaf_hdr_from_disk(&leafhdr, leaf); + hdr = &leafhdr; + } + + ents = ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); + + /* + * XXX (dgc): This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + if (hdr->count > ops->leaf_max_ents(geo)) + return false; + + /* Leaves and bests don't overlap in leaf format. */ + if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + return false; + + /* Check hash value order, count stale entries. */ + for (i = stale = 0; i < hdr->count; i++) { + if (i + 1 < hdr->count) { + if (be32_to_cpu(ents[i].hashval) > + be32_to_cpu(ents[i + 1].hashval)) + return false; + } + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (hdr->stale != stale) + return false; + return true; +} + +/* + * We verify the magic numbers before decoding the leaf header so that on debug + * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due + * to incorrect magic numbers. + */ +static bool +xfs_dir3_leaf_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + __uint16_t magic3; + + magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC + : XFS_DIR3_LEAFN_MAGIC; + + if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) + return false; + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else { + if (leaf->hdr.info.magic != cpu_to_be16(magic)) + return false; + } + + return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); +} + +static void +__read_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dir3_leaf_verify(bp, magic)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +__write_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_leaf_verify(bp, magic)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); +} + +static void +xfs_dir3_leaf1_read_verify( + struct xfs_buf *bp) +{ + __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); +} + +static void +xfs_dir3_leaf1_write_verify( + struct xfs_buf *bp) +{ + __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); +} + +static void +xfs_dir3_leafn_read_verify( + struct xfs_buf *bp) +{ + __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); +} + +static void +xfs_dir3_leafn_write_verify( + struct xfs_buf *bp) +{ + __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); +} + +const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .verify_read = xfs_dir3_leaf1_read_verify, + .verify_write = xfs_dir3_leaf1_write_verify, +}; + +const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .verify_read = xfs_dir3_leafn_read_verify, + .verify_write = xfs_dir3_leafn_write_verify, +}; + +static int +xfs_dir3_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; +} + +int +xfs_dir3_leafn_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +static void +xfs_dir3_leaf_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + xfs_ino_t owner, + __uint16_t type) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + memset(leaf3, 0, sizeof(*leaf3)); + + leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) + ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) + : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.owner = cpu_to_be64(owner); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + } else { + memset(leaf, 0, sizeof(*leaf)); + leaf->hdr.info.magic = cpu_to_be16(type); + } + + /* + * If it's a leaf-format directory initialize the tail. + * Caller is responsible for initialising the bests table. + */ + if (type == XFS_DIR2_LEAF1_MAGIC) { + struct xfs_dir2_leaf_tail *ltp; + + ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf); + ltp->bestcount = 0; + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF); + } else { + bp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + } +} + +int +xfs_dir3_leaf_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t bno, + struct xfs_buf **bpp, + __uint16_t magic) +{ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) && + bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_log_header(args, bp); + if (magic == XFS_DIR2_LEAF1_MAGIC) + xfs_dir3_leaf_log_tail(args, bp); + *bpp = bp; + return 0; +} + +/* + * Convert a block form directory to a leaf form directory. + */ +int /* error */ +xfs_dir2_block_to_leaf( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *dbp) /* input block's buffer */ +{ + __be16 *bestsp; /* leaf's bestsp entries */ + xfs_dablk_t blkno; /* leaf block's bno */ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */ + xfs_dir2_block_tail_t *btp; /* block's tail */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + struct xfs_buf *lbp; /* leaf block's buffer */ + xfs_dir2_db_t ldb; /* leaf block's bno */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ + int needlog; /* need to log block header */ + int needscan; /* need to rescan bestfree */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_block_to_leaf(args); + + dp = args->dp; + tp = args->trans; + /* + * Add the leaf block to the inode. + * This interface will only put blocks in the leaf/node range. + * Since that's empty now, we'll get the root (block 0 in range). + */ + if ((error = xfs_da_grow_inode(args, &blkno))) { + return error; + } + ldb = xfs_dir2_da_to_db(args->geo, blkno); + ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)); + /* + * Initialize the leaf block, get a buffer for it. + */ + error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC); + if (error) + return error; + + leaf = lbp->b_addr; + hdr = dbp->b_addr; + xfs_dir3_data_check(dp, dbp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + bf = dp->d_ops->data_bestfree_p(hdr); + ents = dp->d_ops->leaf_ents_p(leaf); + + /* + * Set the counts in the leaf header. + */ + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + leafhdr.count = be32_to_cpu(btp->count); + leafhdr.stale = be32_to_cpu(btp->stale); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + + /* + * Could compact these but I think we always do the conversion + * after squeezing out stale entries. + */ + memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1); + needscan = 0; + needlog = 1; + /* + * Make the space formerly occupied by the leaf entries and block + * tail be free. + */ + xfs_dir2_data_make_free(args, dbp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize - + (char *)blp), + &needlog, &needscan); + /* + * Fix up the block header, make it a data block. + */ + dbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + else + hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + /* + * Set up leaf tail and bests table. + */ + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp->bestcount = cpu_to_be32(1); + bestsp = xfs_dir2_leaf_bests_p(ltp); + bestsp[0] = bf[0].length; + /* + * Log the data header and leaf bests table. + */ + if (needlog) + xfs_dir2_data_log_header(args, dbp); + xfs_dir3_leaf_check(dp, lbp); + xfs_dir3_data_check(dp, dbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, 0); + return 0; +} + +STATIC void +xfs_dir3_leaf_find_stale( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, + int index, + int *lowstale, + int *highstale) +{ + /* + * Find the first stale entry before our index, if any. + */ + for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) { + if (ents[*lowstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + break; + } + + /* + * Find the first stale entry at or after our index, if any. + * Stop if the result would require moving more entries than using + * lowstale. + */ + for (*highstale = index; *highstale < leafhdr->count; ++*highstale) { + if (ents[*highstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + break; + if (*lowstale >= 0 && index - *lowstale <= *highstale - index) + break; + } +} + +struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_find_entry( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, + int index, /* leaf table position */ + int compact, /* need to compact leaves */ + int lowstale, /* index of prev stale leaf */ + int highstale, /* index of next stale leaf */ + int *lfloglow, /* low leaf logging index */ + int *lfloghigh) /* high leaf logging index */ +{ + if (!leafhdr->stale) { + xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ + + /* + * Now we need to make room to insert the leaf entry. + * + * If there are no stale entries, just insert a hole at index. + */ + lep = &ents[index]; + if (index < leafhdr->count) + memmove(lep + 1, lep, + (leafhdr->count - index) * sizeof(*lep)); + + /* + * Record low and high logging indices for the leaf. + */ + *lfloglow = index; + *lfloghigh = leafhdr->count++; + return lep; + } + + /* + * There are stale entries. + * + * We will use one of them for the new entry. It's probably not at + * the right location, so we'll have to shift some up or down first. + * + * If we didn't compact before, we need to find the nearest stale + * entries before and after our insertion point. + */ + if (compact == 0) + xfs_dir3_leaf_find_stale(leafhdr, ents, index, + &lowstale, &highstale); + + /* + * If the low one is better, use it. + */ + if (lowstale >= 0 && + (highstale == leafhdr->count || + index - lowstale - 1 < highstale - index)) { + ASSERT(index - lowstale - 1 >= 0); + ASSERT(ents[lowstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + + /* + * Copy entries up to cover the stale entry and make room + * for the new entry. + */ + if (index - lowstale - 1 > 0) { + memmove(&ents[lowstale], &ents[lowstale + 1], + (index - lowstale - 1) * + sizeof(xfs_dir2_leaf_entry_t)); + } + *lfloglow = MIN(lowstale, *lfloglow); + *lfloghigh = MAX(index - 1, *lfloghigh); + leafhdr->stale--; + return &ents[index - 1]; + } + + /* + * The high one is better, so use that one. + */ + ASSERT(highstale - index >= 0); + ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + + /* + * Copy entries down to cover the stale entry and make room for the + * new entry. + */ + if (highstale - index > 0) { + memmove(&ents[index + 1], &ents[index], + (highstale - index) * sizeof(xfs_dir2_leaf_entry_t)); + } + *lfloglow = MIN(index, *lfloglow); + *lfloghigh = MAX(highstale, *lfloghigh); + leafhdr->stale--; + return &ents[index]; +} + +/* + * Add an entry to a leaf form directory. + */ +int /* error */ +xfs_dir2_leaf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + __be16 *bestsp; /* freespace table in leaf */ + int compact; /* need to compact leaves */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry */ + int error; /* error return value */ + int grown; /* allocated new data block */ + int highstale; /* index of next stale leaf */ + int i; /* temporary, index */ + int index; /* leaf table position */ + struct xfs_buf *lbp; /* leaf's buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length; /* length of new entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ + int lfloglow; /* low leaf logging index */ + int lfloghigh; /* high leaf logging index */ + int lowstale; /* index of prev stale leaf */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + int needbytes; /* leaf block bytes needed */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data free */ + __be16 *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t use_block; /* data block number */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_leaf_addname(args); + + dp = args->dp; + tp = args->trans; + + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + if (error) + return error; + + /* + * Look up the entry by hash value and name. + * We know it's not there, our caller has already done a lookup. + * So the index is of the entry to insert in front of. + * But if there are dup hash values the index is of the first of those. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + length = dp->d_ops->data_entsize(args->namelen); + + /* + * See if there are any entries with the same hash value + * and space in their block for the new entry. + * This is good because it puts multiple same-hash value entries + * in a data block, improving the lookup of those entries. + */ + for (use_block = -1, lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + index++, lep++) { + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + ASSERT(i < be32_to_cpu(ltp->bestcount)); + ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF)); + if (be16_to_cpu(bestsp[i]) >= length) { + use_block = i; + break; + } + } + /* + * Didn't find a block yet, linear search all the data blocks. + */ + if (use_block == -1) { + for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) { + /* + * Remember a block we see that's missing. + */ + if (bestsp[i] == cpu_to_be16(NULLDATAOFF) && + use_block == -1) + use_block = i; + else if (be16_to_cpu(bestsp[i]) >= length) { + use_block = i; + break; + } + } + } + /* + * How many bytes do we need in the leaf block? + */ + needbytes = 0; + if (!leafhdr.stale) + needbytes += sizeof(xfs_dir2_leaf_entry_t); + if (use_block == -1) + needbytes += sizeof(xfs_dir2_data_off_t); + + /* + * Now kill use_block if it refers to a missing block, so we + * can use it as an indication of allocation needed. + */ + if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF)) + use_block = -1; + /* + * If we don't have enough free bytes but we can make enough + * by compacting out stale entries, we'll do that. + */ + if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes && + leafhdr.stale > 1) + compact = 1; + + /* + * Otherwise if we don't have enough free bytes we need to + * convert to node form. + */ + else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) { + /* + * Just checking or no space reservation, give up. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { + xfs_trans_brelse(tp, lbp); + return -ENOSPC; + } + /* + * Convert to node form. + */ + error = xfs_dir2_leaf_to_node(args, lbp); + if (error) + return error; + /* + * Then add the new entry. + */ + return xfs_dir2_node_addname(args); + } + /* + * Otherwise it will fit without compaction. + */ + else + compact = 0; + /* + * If just checking, then it will fit unless we needed to allocate + * a new data block. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + xfs_trans_brelse(tp, lbp); + return use_block == -1 ? -ENOSPC : 0; + } + /* + * If no allocations are allowed, return now before we've + * changed anything. + */ + if (args->total == 0 && use_block == -1) { + xfs_trans_brelse(tp, lbp); + return -ENOSPC; + } + /* + * Need to compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and we'll shift that one to our insertion + * point later. + */ + if (compact) { + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + } + /* + * There are stale entries, so we'll need log-low and log-high + * impossibly bad values later. + */ + else if (leafhdr.stale) { + lfloglow = leafhdr.count; + lfloghigh = -1; + } + /* + * If there was no data block space found, we need to allocate + * a new one. + */ + if (use_block == -1) { + /* + * Add the new data block. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, + &use_block))) { + xfs_trans_brelse(tp, lbp); + return error; + } + /* + * Initialize the block. + */ + if ((error = xfs_dir3_data_init(args, use_block, &dbp))) { + xfs_trans_brelse(tp, lbp); + return error; + } + /* + * If we're adding a new data block on the end we need to + * extend the bests table. Copy it up one entry. + */ + if (use_block >= be32_to_cpu(ltp->bestcount)) { + bestsp--; + memmove(&bestsp[0], &bestsp[1], + be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); + be32_add_cpu(<p->bestcount, 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); + } + /* + * If we're filling in a previously empty block just log it. + */ + else + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + bestsp[use_block] = bf[0].length; + grown = 1; + } else { + /* + * Already had space in some data block. + * Just read that one in. + */ + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, use_block), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + grown = 0; + } + /* + * Point to the biggest freespace in our data block. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + ASSERT(be16_to_cpu(dup->length) >= length); + needscan = needlog = 0; + /* + * Mark the initial part of our freespace in use for the new entry. + */ + xfs_dir2_data_use_free(args, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, + &needlog, &needscan); + /* + * Initialize our new entry (at last). + */ + dep = (xfs_dir2_data_entry_t *)dup; + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, dep->namelen); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + /* + * Need to scan fix up the bestfree table. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + /* + * Need to log the data block's header. + */ + if (needlog) + xfs_dir2_data_log_header(args, dbp); + xfs_dir2_data_log_entry(args, dbp, dep); + /* + * If the bests table needs to be changed, do it. + * Log the change unless we've already done that. + */ + if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { + bestsp[use_block] = bf[0].length; + if (!grown) + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); + } + + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, + highstale, &lfloglow, &lfloghigh); + + /* + * Fill in the new leaf entry. + */ + lep->hashval = cpu_to_be32(args->hashval); + lep->address = cpu_to_be32( + xfs_dir2_db_off_to_dataptr(args->geo, use_block, + be16_to_cpu(*tagp))); + /* + * Log the leaf fields and give up the buffers. + */ + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, lbp); + xfs_dir3_data_check(dp, dbp); + return 0; +} + +/* + * Compact out any stale entries in the leaf. + * Log the header and changed leaf entries, if any. + */ +void +xfs_dir3_leaf_compact( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_buf *bp) /* leaf buffer */ +{ + int from; /* source leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int loglow; /* first leaf entry to log */ + int to; /* target leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = args->dp; + + leaf = bp->b_addr; + if (!leafhdr->stale) + return; + + /* + * Compress out the stale entries in place. + */ + ents = dp->d_ops->leaf_ents_p(leaf); + for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + continue; + /* + * Only actually copy the entries that are different. + */ + if (from > to) { + if (loglow == -1) + loglow = to; + ents[to] = ents[from]; + } + to++; + } + /* + * Update and log the header, log the leaf entries. + */ + ASSERT(leafhdr->stale == from - to); + leafhdr->count -= leafhdr->stale; + leafhdr->stale = 0; + + dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir3_leaf_log_header(args, bp); + if (loglow != -1) + xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1); +} + +/* + * Compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and the caller will shift that one to our insertion + * point later. + * Return new insertion index, where the remaining stale entry is, + * and leaf logging indices. + */ +void +xfs_dir3_leaf_compact_x1( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, + int *indexp, /* insertion index */ + int *lowstalep, /* out: stale entry before us */ + int *highstalep, /* out: stale entry after us */ + int *lowlogp, /* out: low log index */ + int *highlogp) /* out: high log index */ +{ + int from; /* source copy index */ + int highstale; /* stale entry at/after index */ + int index; /* insertion index */ + int keepstale; /* source index of kept stale */ + int lowstale; /* stale entry before index */ + int newindex=0; /* new insertion index */ + int to; /* destination copy index */ + + ASSERT(leafhdr->stale > 1); + index = *indexp; + + xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale); + + /* + * Pick the better of lowstale and highstale. + */ + if (lowstale >= 0 && + (highstale == leafhdr->count || + index - lowstale <= highstale - index)) + keepstale = lowstale; + else + keepstale = highstale; + /* + * Copy the entries in place, removing all the stale entries + * except keepstale. + */ + for (from = to = 0; from < leafhdr->count; from++) { + /* + * Notice the new value of index. + */ + if (index == from) + newindex = to; + if (from != keepstale && + ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (from == to) + *lowlogp = to; + continue; + } + /* + * Record the new keepstale value for the insertion. + */ + if (from == keepstale) + lowstale = highstale = to; + /* + * Copy only the entries that have moved. + */ + if (from > to) + ents[to] = ents[from]; + to++; + } + ASSERT(from > to); + /* + * If the insertion point was past the last entry, + * set the new insertion point accordingly. + */ + if (index == from) + newindex = to; + *indexp = newindex; + /* + * Adjust the leaf header values. + */ + leafhdr->count -= from - to; + leafhdr->stale = 1; + /* + * Remember the low/high stale value only in the "right" + * direction. + */ + if (lowstale >= newindex) + lowstale = -1; + else + highstale = leafhdr->count; + *highlogp = leafhdr->count - 1; + *lowstalep = lowstale; + *highstalep = highstale; +} + +/* + * Log the bests entries indicated from a leaf1 block. + */ +static void +xfs_dir3_leaf_log_bests( + struct xfs_da_args *args, + struct xfs_buf *bp, /* leaf buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + __be16 *firstb; /* pointer to first entry */ + __be16 *lastb; /* pointer to last entry */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); + + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + firstb = xfs_dir2_leaf_bests_p(ltp) + first; + lastb = xfs_dir2_leaf_bests_p(ltp) + last; + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstb - (char *)leaf), + (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); +} + +/* + * Log the leaf entries indicated from a leaf1 or leafn block. + */ +void +xfs_dir3_leaf_log_ents( + struct xfs_da_args *args, + struct xfs_buf *bp, + int first, + int last) +{ + xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ + xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ents = args->dp->d_ops->leaf_ents_p(leaf); + firstlep = &ents[first]; + lastlep = &ents[last]; + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstlep - (char *)leaf), + (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); +} + +/* + * Log the header of the leaf1 or leafn block. + */ +void +xfs_dir3_leaf_log_header( + struct xfs_da_args *args, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&leaf->hdr - (char *)leaf), + args->dp->d_ops->leaf_hdr_size - 1); +} + +/* + * Log the tail of the leaf1 block. + */ +STATIC void +xfs_dir3_leaf_log_tail( + struct xfs_da_args *args, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(args->geo->blksize - 1)); +} + +/* + * Look up the entry referred to by args in the leaf format directory. + * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which + * is also used by the node-format code. + */ +int +xfs_dir2_leaf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* found entry index */ + struct xfs_buf *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leaf_lookup(args); + + /* + * Look up name in the leaf block, returning both buffers and index. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + tp = args->trans; + dp = args->dp; + xfs_dir3_leaf_check(dp, lbp); + leaf = lbp->b_addr; + ents = dp->d_ops->leaf_ents_p(leaf); + /* + * Get to the leaf entry and contained data entry address. + */ + lep = &ents[index]; + + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); + /* + * Return the found inode number & CI name if appropriate + */ + args->inumber = be64_to_cpu(dep->inumber); + args->filetype = dp->d_ops->data_get_ftype(dep); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); + return error; +} + +/* + * Look up name/hash in the leaf block. + * Fill in indexp with the found index, and dbpp with the data buffer. + * If not found dbpp will be NULL, and ENOENT comes back. + * lbpp will always be filled in with the leaf buffer unless there's an error. + */ +static int /* error */ +xfs_dir2_leaf_lookup_int( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf **lbpp, /* out: leaf buffer */ + int *indexp, /* out: index in leaf block */ + struct xfs_buf **dbpp) /* out: data buffer */ +{ + xfs_dir2_db_t curdb = -1; /* current data block number */ + struct xfs_buf *dbp = NULL; /* data buffer */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* index in leaf block */ + struct xfs_buf *lbp; /* leaf buffer */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t cidb = -1; /* case match data block no. */ + enum xfs_dacmp cmp; /* name compare result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + if (error) + return error; + + *lbpp = lbp; + leaf = lbp->b_addr; + xfs_dir3_leaf_check(dp, lbp); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + /* + * Look for the first leaf entry with our hash value. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + /* + * Loop over all the entries with the right hash value + * looking to match the name. + */ + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip over stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get the new data block number. + */ + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); + /* + * If it's not the same as the old data block number, + * need to pitch the old one and read the new one. + */ + if (newdb != curdb) { + if (dbp) + xfs_trans_brelse(tp, dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, newdb), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } + curdb = newdb; + } + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); + /* + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + *indexp = index; + /* case exact match: return the current buffer. */ + if (cmp == XFS_CMP_EXACT) { + *dbpp = dbp; + return 0; + } + cidb = curdb; + } + } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or remove). + * If a case-insensitive match was found earlier, re-read the + * appropriate data block if required and return it. + */ + if (args->cmpresult == XFS_CMP_CASE) { + ASSERT(cidb != -1); + if (cidb != curdb) { + xfs_trans_brelse(tp, dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, cidb), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } + } + *dbpp = dbp; + return 0; + } + /* + * No match found, return -ENOENT. + */ + ASSERT(cidb == -1); + if (dbp) + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); + return -ENOENT; +} + +/* + * Remove an entry from a leaf format directory. + */ +int /* error */ +xfs_dir2_leaf_removename( + xfs_da_args_t *args) /* operation arguments */ +{ + __be16 *bestsp; /* leaf block best freespace */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_db_t db; /* data block number */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry structure */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_db_t i; /* temporary data block # */ + int index; /* index into leaf entries */ + struct xfs_buf *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_dir2_data_off_t oldbest; /* old value of best free */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_leaf_removename(args); + + /* + * Lookup the leaf entry, get the leaf and data blocks read in. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + dp = args->dp; + leaf = lbp->b_addr; + hdr = dbp->b_addr; + xfs_dir3_data_check(dp, dbp); + bf = dp->d_ops->data_bestfree_p(hdr); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + /* + * Point to the leaf entry, use that to point to the data entry. + */ + lep = &ents[index]; + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); + needscan = needlog = 0; + oldbest = be16_to_cpu(bf[0].length); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + ASSERT(be16_to_cpu(bestsp[db]) == oldbest); + /* + * Mark the former data entry unused. + */ + xfs_dir2_data_make_free(args, dbp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + /* + * We just mark the leaf entry stale by putting a null in it. + */ + leafhdr.stale++; + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir3_leaf_log_ents(args, lbp, index, index); + + /* + * Scan the freespace in the data block again if necessary, + * log the data block header if necessary. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + if (needlog) + xfs_dir2_data_log_header(args, dbp); + /* + * If the longest freespace in the data block has changed, + * put the new value in the bests table and log that. + */ + if (be16_to_cpu(bf[0].length) != oldbest) { + bestsp[db] = bf[0].length; + xfs_dir3_leaf_log_bests(args, lbp, db, db); + } + xfs_dir3_data_check(dp, dbp); + /* + * If the data block is now empty then get rid of the data block. + */ + if (be16_to_cpu(bf[0].length) == + args->geo->blksize - dp->d_ops->data_entry_offset) { + ASSERT(db != args->geo->datablk); + if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { + /* + * Nope, can't get rid of it because it caused + * allocation of a bmap btree block to do so. + * Just go on, returning success, leaving the + * empty block in place. + */ + if (error == -ENOSPC && args->total == 0) + error = 0; + xfs_dir3_leaf_check(dp, lbp); + return error; + } + dbp = NULL; + /* + * If this is the last data block then compact the + * bests table by getting rid of entries. + */ + if (db == be32_to_cpu(ltp->bestcount) - 1) { + /* + * Look for the last active entry (i). + */ + for (i = db - 1; i > 0; i--) { + if (bestsp[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + /* + * Copy the table down so inactive entries at the + * end are removed. + */ + memmove(&bestsp[db - i], bestsp, + (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); + be32_add_cpu(<p->bestcount, -(db - i)); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); + } else + bestsp[db] = cpu_to_be16(NULLDATAOFF); + } + /* + * If the data block was not the first one, drop it. + */ + else if (db != args->geo->datablk) + dbp = NULL; + + xfs_dir3_leaf_check(dp, lbp); + /* + * See if we can convert to block form. + */ + return xfs_dir2_leaf_to_block(args, lbp, dbp); +} + +/* + * Replace the inode number in a leaf format directory entry. + */ +int /* error */ +xfs_dir2_leaf_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* index of leaf entry */ + struct xfs_buf *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leaf_replace(args); + + /* + * Look up the entry. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + dp = args->dp; + leaf = lbp->b_addr; + ents = dp->d_ops->leaf_ents_p(leaf); + /* + * Point to the leaf entry, get data address from it. + */ + lep = &ents[index]; + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); + ASSERT(args->inumber != be64_to_cpu(dep->inumber)); + /* + * Put the new inode number in, log it. + */ + dep->inumber = cpu_to_be64(args->inumber); + dp->d_ops->data_put_ftype(dep, args->filetype); + tp = args->trans; + xfs_dir2_data_log_entry(args, dbp, dep); + xfs_dir3_leaf_check(dp, lbp); + xfs_trans_brelse(tp, lbp); + return 0; +} + +/* + * Return index in the leaf block (lbp) which is either the first + * one with this hash value, or if there are none, the insert point + * for that hash value. + */ +int /* index value */ +xfs_dir2_leaf_search_hash( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *lbp) /* leaf buffer */ +{ + xfs_dahash_t hash=0; /* hash from this entry */ + xfs_dahash_t hashwant; /* hash value looking for */ + int high; /* high leaf index */ + int low; /* low leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int mid=0; /* current leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + leaf = lbp->b_addr; + ents = args->dp->d_ops->leaf_ents_p(leaf); + args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + /* + * Note, the table cannot be empty, so we have to go through the loop. + * Binary search the leaf entries looking for our hash value. + */ + for (lep = ents, low = 0, high = leafhdr.count - 1, + hashwant = args->hashval; + low <= high; ) { + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant) + break; + if (hash < hashwant) + low = mid + 1; + else + high = mid - 1; + } + /* + * Found one, back up through all the equal hash values. + */ + if (hash == hashwant) { + while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) { + mid--; + } + } + /* + * Need to point to an entry higher than ours. + */ + else if (hash < hashwant) + mid++; + return mid; +} + +/* + * Trim off a trailing data block. We know it's empty since the leaf + * freespace table says so. + */ +int /* error */ +xfs_dir2_leaf_trim_data( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *lbp, /* leaf buffer */ + xfs_dir2_db_t db) /* data block number */ +{ + __be16 *bestsp; /* leaf bests table */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + tp = args->trans; + /* + * Read the offending data block. We need its buffer. + */ + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db), + -1, &dbp); + if (error) + return error; + + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + +#ifdef DEBUG +{ + struct xfs_dir2_data_hdr *hdr = dbp->b_addr; + struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr); + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + ASSERT(be16_to_cpu(bf[0].length) == + args->geo->blksize - dp->d_ops->data_entry_offset); + ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); +} +#endif + + /* + * Get rid of the data block. + */ + if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { + ASSERT(error != -ENOSPC); + xfs_trans_brelse(tp, dbp); + return error; + } + /* + * Eliminate the last bests entry from the table. + */ + bestsp = xfs_dir2_leaf_bests_p(ltp); + be32_add_cpu(<p->bestcount, -1); + memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + return 0; +} + +static inline size_t +xfs_dir3_leaf_size( + struct xfs_dir3_icleaf_hdr *hdr, + int counts) +{ + int entries; + int hdrsize; + + entries = hdr->count - hdr->stale; + if (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR2_LEAFN_MAGIC) + hdrsize = sizeof(struct xfs_dir2_leaf_hdr); + else + hdrsize = sizeof(struct xfs_dir3_leaf_hdr); + + return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t) + + counts * sizeof(xfs_dir2_data_off_t) + + sizeof(xfs_dir2_leaf_tail_t); +} + +/* + * Convert node form directory to leaf form directory. + * The root of the node form dir needs to already be a LEAFN block. + * Just return if we can't do anything. + */ +int /* error */ +xfs_dir2_node_to_leaf( + xfs_da_state_t *state) /* directory operation state */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + struct xfs_buf *fbp; /* buffer for freespace block */ + xfs_fileoff_t fo; /* freespace file offset */ + xfs_dir2_free_t *free; /* freespace structure */ + struct xfs_buf *lbp; /* buffer for leaf block */ + xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int rval; /* successful free trim? */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir3_icfree_hdr freehdr; + + /* + * There's more than a leaf level in the btree, so there must + * be multiple leafn blocks. Give up. + */ + if (state->path.active > 1) + return 0; + args = state->args; + + trace_xfs_dir2_node_to_leaf(args); + + mp = state->mp; + dp = args->dp; + tp = args->trans; + /* + * Get the last offset in the file. + */ + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) { + return error; + } + fo -= args->geo->fsbcount; + /* + * If there are freespace blocks other than the first one, + * take this opportunity to remove trailing empty freespace blocks + * that may have been left behind during no-space-reservation + * operations. + */ + while (fo > args->geo->freeblk) { + if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) { + return error; + } + if (rval) + fo -= args->geo->fsbcount; + else + return 0; + } + /* + * Now find the block just before the freespace block. + */ + if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) { + return error; + } + /* + * If it's not the single leaf block, give up. + */ + if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize) + return 0; + lbp = state->path.blk[0].bp; + leaf = lbp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + + /* + * Read the freespace block. + */ + error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); + if (error) + return error; + free = fbp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + ASSERT(!freehdr.firstdb); + + /* + * Now see if the leafn and free data will fit in a leaf1. + * If not, release the buffer and give up. + */ + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) { + xfs_trans_brelse(tp, fbp); + return 0; + } + + /* + * If the leaf has any stale entries in it, compress them out. + */ + if (leafhdr.stale) + xfs_dir3_leaf_compact(args, &leafhdr, lbp); + + lbp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF); + leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC) + ? XFS_DIR2_LEAF1_MAGIC + : XFS_DIR3_LEAF1_MAGIC; + + /* + * Set up the leaf tail from the freespace block. + */ + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp->bestcount = cpu_to_be32(freehdr.nvalid); + + /* + * Set up the leaf bests table. + */ + memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free), + freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); + + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_check(dp, lbp); + + /* + * Get rid of the freespace block. + */ + error = xfs_dir2_shrink_inode(args, + xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET), + fbp); + if (error) { + /* + * This can't fail here because it can only happen when + * punching out the middle of an extent, and this is an + * isolated block. + */ + ASSERT(error != -ENOSPC); + return error; + } + fbp = NULL; + /* + * Now see if we can convert the single-leaf directory + * down to a block form directory. + * This routine always kills the dabuf for the leaf, so + * eliminate it from the path. + */ + error = xfs_dir2_leaf_to_block(args, lbp, NULL); + state->path.blk[0].bp = NULL; + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_node.c b/kernel/fs/xfs/libxfs/xfs_dir2_node.c new file mode 100644 index 000000000..41b80d3d3 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_node.c @@ -0,0 +1,2270 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + +/* + * Function declarations. + */ +static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, + int index); +static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, + int index, xfs_da_state_blk_t *dblk, + int *rval); +static int xfs_dir2_node_addname_int(xfs_da_args_t *args, + xfs_da_state_blk_t *fblk); + +/* + * Check internal consistency of a leafn block. + */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(dp, bp) \ +do { \ + if (!xfs_dir3_leafn_check((dp), (bp))) \ + ASSERT(0); \ +} while (0); + +static bool +xfs_dir3_leafn_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(dp, bp) +#endif + +static bool +xfs_dir3_free_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) + return false; + } + + /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ + + return true; +} + +static void +xfs_dir3_free_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dir3_free_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_dir3_free_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_free_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF); +} + +const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .verify_read = xfs_dir3_free_read_verify, + .verify_write = xfs_dir3_free_write_verify, +}; + + +static int +__xfs_dir3_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + + /* try read returns without an error or *bpp if it lands in a hole */ + if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); + return err; +} + +int +xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); +} + +static int +xfs_dir2_free_try_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); +} + +static int +xfs_dir3_free_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t fbno, + struct xfs_buf **bpp) +{ + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + struct xfs_dir3_icfree_hdr hdr; + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF); + bp->b_ops = &xfs_dir3_free_buf_ops; + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. + */ + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + hdr.magic = XFS_DIR3_FREE_MAGIC; + + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; + dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); + *bpp = bp; + return 0; +} + +/* + * Log entries from a freespace block. + */ +STATIC void +xfs_dir2_free_log_bests( + struct xfs_da_args *args, + struct xfs_buf *bp, + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_free_t *free; /* freespace structure */ + __be16 *bests; + + free = bp->b_addr; + bests = args->dp->d_ops->free_bests_p(free); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&bests[first] - (char *)free), + (uint)((char *)&bests[last] - (char *)free + + sizeof(bests[0]) - 1)); +} + +/* + * Log header from a freespace block. + */ +static void +xfs_dir2_free_log_header( + struct xfs_da_args *args, + struct xfs_buf *bp) +{ +#ifdef DEBUG + xfs_dir2_free_t *free; /* freespace structure */ + + free = bp->b_addr; + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); +#endif + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->free_hdr_size - 1); +} + +/* + * Convert a leaf-format directory to a node-format directory. + * We need to change the magic number of the leaf block, and copy + * the freespace table out of the leaf block into its own block. + */ +int /* error */ +xfs_dir2_leaf_to_node( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *lbp) /* leaf buffer */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + struct xfs_buf *fbp; /* freespace buffer */ + xfs_dir2_db_t fdb; /* freespace block number */ + xfs_dir2_free_t *free; /* freespace structure */ + __be16 *from; /* pointer to freespace entry */ + int i; /* leaf freespace index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + int n; /* count of live freespc ents */ + xfs_dir2_data_off_t off; /* freespace entry value */ + __be16 *to; /* pointer to freespace entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; + + trace_xfs_dir2_leaf_to_node(args); + + dp = args->dp; + tp = args->trans; + /* + * Add a freespace block to the directory. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { + return error; + } + ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); + /* + * Get the buffer for the new freespace block. + */ + error = xfs_dir3_free_get_buf(args, fdb, &fbp); + if (error) + return error; + + free = fbp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ASSERT(be32_to_cpu(ltp->bestcount) <= + (uint)dp->i_d.di_size / args->geo->blksize); + + /* + * Copy freespace entries from the leaf block to the new block. + * Count active entries. + */ + from = xfs_dir2_leaf_bests_p(ltp); + to = dp->d_ops->free_bests_p(free); + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + if ((off = be16_to_cpu(*from)) != NULLDATAOFF) + n++; + *to = cpu_to_be16(off); + } + + /* + * Now initialize the freespace block header. + */ + freehdr.nused = n; + freehdr.nvalid = be32_to_cpu(ltp->bestcount); + + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1); + xfs_dir2_free_log_header(args, fbp); + + /* + * Converting the leaf to a leafnode is just a matter of changing the + * magic number and the ops. Do the change directly to the buffer as + * it's less work (and less code) than decoding the header to host + * format and back again. + */ + if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)) + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + else + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + lbp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_check(dp, lbp); + return 0; +} + +/* + * Add a leaf entry to a leaf block in a node-form directory. + * The other work necessary is done from the caller. + */ +static int /* error */ +xfs_dir2_leafn_add( + struct xfs_buf *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int index) /* insertion pt for new entry */ +{ + int compact; /* compacting stale leaves */ + xfs_inode_t *dp; /* incore directory inode */ + int highstale; /* next stale entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int lfloghigh; /* high leaf entry logging */ + int lfloglow; /* low leaf entry logging */ + int lowstale; /* previous stale entry */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leafn_add(args, index); + + dp = args->dp; + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + /* + * Quick check just to make sure we are not going to index + * into other peoples memory + */ + if (index < 0) + return -EFSCORRUPTED; + + /* + * If there are already the maximum number of leaf entries in + * the block, if there are no stale entries it won't fit. + * Caller will do a split. If there are stale entries we'll do + * a compact. + */ + + if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { + if (!leafhdr.stale) + return -ENOSPC; + compact = leafhdr.stale > 1; + } else + compact = 0; + ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval); + ASSERT(index == leafhdr.count || + be32_to_cpu(ents[index].hashval) >= args->hashval); + + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + + /* + * Compact out all but one stale leaf entry. Leaves behind + * the entry closest to index. + */ + if (compact) + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + else if (leafhdr.stale) { + /* + * Set impossible logging indices for this case. + */ + lfloglow = leafhdr.count; + lfloghigh = -1; + } + + /* + * Insert the new entry, log everything. + */ + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, + highstale, &lfloglow, &lfloghigh); + + lep->hashval = cpu_to_be32(args->hashval); + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo, + args->blkno, args->index)); + + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, bp); + return 0; +} + +#ifdef DEBUG +static void +xfs_dir2_free_hdr_check( + struct xfs_inode *dp, + struct xfs_buf *bp, + xfs_dir2_db_t db) +{ + struct xfs_dir3_icfree_hdr hdr; + + dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr); + + ASSERT((hdr.firstdb % + dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0); + ASSERT(hdr.firstdb <= db); + ASSERT(db < hdr.firstdb + hdr.nvalid); +} +#else +#define xfs_dir2_free_hdr_check(dp, bp, db) +#endif /* DEBUG */ + +/* + * Return the last hash value in the leaf. + * Stale entries are ok. + */ +xfs_dahash_t /* hash value */ +xfs_dir2_leafn_lasthash( + struct xfs_inode *dp, + struct xfs_buf *bp, /* leaf buffer */ + int *count) /* count of entries in leaf */ +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + + if (count) + *count = leafhdr.count; + if (!leafhdr.count) + return 0; + + ents = dp->d_ops->leaf_ents_p(leaf); + return be32_to_cpu(ents[leafhdr.count - 1].hashval); +} + +/* + * Look up a leaf entry for space to add a name in a node-format leaf block. + * The extrablk in state is a freespace block. + */ +STATIC int +xfs_dir2_leafn_lookup_for_addname( + struct xfs_buf *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_db_t curfdb = -1; /* current free block number */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int fi; /* free entry index */ + xfs_dir2_free_t *free = NULL; /* free block structure */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length; /* length of new data entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_dir2_db_t newfdb; /* new free block number */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + xfs_dir3_leaf_check(dp, bp); + ASSERT(leafhdr.count > 0); + + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + /* If so, it's a free block buffer, get the block number. */ + curbp = state->extrablk.bp; + curfdb = state->extrablk.blkno; + free = curbp->b_addr; + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); + } + length = dp->d_ops->data_entsize(args->namelen); + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); + /* + * For addname, we're looking for a place to put the new entry. + * We want to use a data block with an entry of equal + * hash value to ours if there is one with room. + * + * If this block isn't the data block we already have + * in hand, take a look at it. + */ + if (newdb != curdb) { + __be16 *bests; + + curdb = newdb; + /* + * Convert the data block to the free block + * holding its freespace information. + */ + newfdb = dp->d_ops->db_to_fdb(args->geo, newdb); + /* + * If it's not the one we have in hand, read it in. + */ + if (newfdb != curfdb) { + /* + * If we had one before, drop it. + */ + if (curbp) + xfs_trans_brelse(tp, curbp); + + error = xfs_dir2_free_read(tp, dp, + xfs_dir2_db_to_da(args->geo, + newfdb), + &curbp); + if (error) + return error; + free = curbp->b_addr; + + xfs_dir2_free_hdr_check(dp, curbp, curdb); + } + /* + * Get the index for our entry. + */ + fi = dp->d_ops->db_to_fdindex(args->geo, curdb); + /* + * If it has room, return it. + */ + bests = dp->d_ops->free_bests_p(free); + if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { + XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", + XFS_ERRLEVEL_LOW, mp); + if (curfdb != newfdb) + xfs_trans_brelse(tp, curbp); + return -EFSCORRUPTED; + } + curfdb = newfdb; + if (be16_to_cpu(bests[fi]) >= length) + goto out; + } + } + /* Didn't find any space */ + fi = -1; +out: + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + if (curbp) { + /* Giving back a free block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = fi; + state->extrablk.blkno = curfdb; + + /* + * Important: this magic number is not in the buffer - it's for + * buffer type information and therefore only the free/data type + * matters here, not whether CRCs are enabled or not. + */ + state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + } else { + state->extravalid = 0; + } + /* + * Return the index, that will be the insertion point. + */ + *indexp = index; + return -ENOENT; +} + +/* + * Look up a leaf entry in a node-format leaf block. + * The extrablk in state a data block. + */ +STATIC int +xfs_dir2_leafn_lookup_for_entry( + struct xfs_buf *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + xfs_dir3_leaf_check(dp, bp); + ASSERT(leafhdr.count > 0); + + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + curbp = state->extrablk.bp; + curdb = state->extrablk.blkno; + } + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); + /* + * Not adding a new entry, so we really want to find + * the name given to us. + * + * If it's a different data block, go get it. + */ + if (newdb != curdb) { + /* + * If we had a block before that we aren't saving + * for a CI name, drop it + */ + if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || + curdb != state->extrablk.blkno)) + xfs_trans_brelse(tp, curbp); + /* + * If needing the block that is saved with a CI match, + * use it otherwise read in the new data block. + */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + newdb == state->extrablk.blkno) { + ASSERT(state->extravalid); + curbp = state->extrablk.bp; + } else { + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, + newdb), + -1, &curbp); + if (error) + return error; + } + xfs_dir3_data_check(dp, curbp); + curdb = newdb; + } + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); + /* + * Compare the entry and if it's an exact match, return + * EEXIST immediately. If it's the first case-insensitive + * match, store the block & inode number and continue looking. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + /* If there is a CI match block, drop it */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + curdb != state->extrablk.blkno) + xfs_trans_brelse(tp, state->extrablk.bp); + args->cmpresult = cmp; + args->inumber = be64_to_cpu(dep->inumber); + args->filetype = dp->d_ops->data_get_ftype(dep); + *indexp = index; + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.blkno = curdb; + state->extrablk.index = (int)((char *)dep - + (char *)curbp->b_addr); + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); + if (cmp == XFS_CMP_EXACT) + return -EEXIST; + } + } + ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); + if (curbp) { + if (args->cmpresult == XFS_CMP_DIFFERENT) { + /* Giving back last used data block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = -1; + state->extrablk.blkno = curdb; + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); + } else { + /* If the curbp is not the CI match block, drop it */ + if (state->extrablk.bp != curbp) + xfs_trans_brelse(tp, curbp); + } + } else { + state->extravalid = 0; + } + *indexp = index; + return -ENOENT; +} + +/* + * Look up a leaf entry in a node-format leaf block. + * If this is an addname then the extrablk in state is a freespace block, + * otherwise it's a data block. + */ +int +xfs_dir2_leafn_lookup_int( + struct xfs_buf *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + if (args->op_flags & XFS_DA_OP_ADDNAME) + return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, + state); + return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state); +} + +/* + * Move count leaf entries from source to destination leaf. + * Log entries and headers. Stale entries are preserved. + */ +static void +xfs_dir3_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp_s, /* source */ + struct xfs_dir3_icleaf_hdr *shdr, + struct xfs_dir2_leaf_entry *sents, + int start_s,/* source leaf index */ + struct xfs_buf *bp_d, /* destination */ + struct xfs_dir3_icleaf_hdr *dhdr, + struct xfs_dir2_leaf_entry *dents, + int start_d,/* destination leaf index */ + int count) /* count of leaves to copy */ +{ + int stale; /* count stale leaves copied */ + + trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); + + /* + * Silently return if nothing to do. + */ + if (count == 0) + return; + + /* + * If the destination index is not the end of the current + * destination leaf entries, open up a hole in the destination + * to hold the new entries. + */ + if (start_d < dhdr->count) { + memmove(&dents[start_d + count], &dents[start_d], + (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, bp_d, start_d + count, + count + dhdr->count - 1); + } + /* + * If the source has stale leaves, count the ones in the copy range + * so we can update the header correctly. + */ + if (shdr->stale) { + int i; /* temp leaf index */ + + for (i = start_s, stale = 0; i < start_s + count; i++) { + if (sents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + } else + stale = 0; + /* + * Copy the leaf entries from source to destination. + */ + memcpy(&dents[start_d], &sents[start_s], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1); + + /* + * If there are source entries after the ones we copied, + * delete the ones we copied by sliding the next ones down. + */ + if (start_s + count < shdr->count) { + memmove(&sents[start_s], &sents[start_s + count], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1); + } + + /* + * Update the headers and log them. + */ + shdr->count -= count; + shdr->stale -= stale; + dhdr->count += count; + dhdr->stale += stale; +} + +/* + * Determine the sort order of two leaf blocks. + * Returns 1 if both are valid and leaf2 should be before leaf1, else 0. + */ +int /* sort order */ +xfs_dir2_leafn_order( + struct xfs_inode *dp, + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ +{ + struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr; + struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr; + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); + + if (hdr1.count > 0 && hdr2.count > 0 && + (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || + be32_to_cpu(ents2[hdr2.count - 1].hashval) < + be32_to_cpu(ents1[hdr1.count - 1].hashval))) + return 1; + return 0; +} + +/* + * Rebalance leaf entries between two leaf blocks. + * This is actually only called when the second block is new, + * though the code deals with the general case. + * A new entry will be inserted in one of the blocks, and that + * entry is taken into account when balancing. + */ +static void +xfs_dir2_leafn_rebalance( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *blk1, /* first btree block */ + xfs_da_state_blk_t *blk2) /* second btree block */ +{ + xfs_da_args_t *args; /* operation arguments */ + int count; /* count (& direction) leaves */ + int isleft; /* new goes in left leaf */ + xfs_dir2_leaf_t *leaf1; /* first leaf structure */ + xfs_dir2_leaf_t *leaf2; /* second leaf structure */ + int mid; /* midpoint leaf index */ +#if defined(DEBUG) || defined(XFS_WARN) + int oldstale; /* old count of stale leaves */ +#endif + int oldsum; /* old total leaf count */ + int swap; /* swapped leaf blocks */ + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + struct xfs_inode *dp = state->args->dp; + + args = state->args; + /* + * If the block order is wrong, swap the arguments. + */ + if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) { + xfs_da_state_blk_t *tmp; /* temp for block swap */ + + tmp = blk1; + blk1 = blk2; + blk2 = tmp; + } + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); + + oldsum = hdr1.count + hdr2.count; +#if defined(DEBUG) || defined(XFS_WARN) + oldstale = hdr1.stale + hdr2.stale; +#endif + mid = oldsum >> 1; + + /* + * If the old leaf count was odd then the new one will be even, + * so we need to divide the new count evenly. + */ + if (oldsum & 1) { + xfs_dahash_t midhash; /* middle entry hash value */ + + if (mid >= hdr1.count) + midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval); + else + midhash = be32_to_cpu(ents1[mid].hashval); + isleft = args->hashval <= midhash; + } + /* + * If the old count is even then the new count is odd, so there's + * no preferred side for the new entry. + * Pick the left one. + */ + else + isleft = 1; + /* + * Calculate moved entry count. Positive means left-to-right, + * negative means right-to-left. Then move the entries. + */ + count = hdr1.count - mid + (isleft == 0); + if (count > 0) + xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1, + hdr1.count - count, blk2->bp, + &hdr2, ents2, 0, count); + else if (count < 0) + xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0, + blk1->bp, &hdr1, ents1, + hdr1.count, count); + + ASSERT(hdr1.count + hdr2.count == oldsum); + ASSERT(hdr1.stale + hdr2.stale == oldstale); + + /* log the changes made when moving the entries */ + dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1); + dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir3_leaf_log_header(args, blk1->bp); + xfs_dir3_leaf_log_header(args, blk2->bp); + + xfs_dir3_leaf_check(dp, blk1->bp); + xfs_dir3_leaf_check(dp, blk2->bp); + + /* + * Mark whether we're inserting into the old or new leaf. + */ + if (hdr1.count < hdr2.count) + state->inleaf = swap; + else if (hdr1.count > hdr2.count) + state->inleaf = !swap; + else + state->inleaf = swap ^ (blk1->index <= hdr1.count); + /* + * Adjust the expected index for insertion. + */ + if (!state->inleaf) + blk2->index = blk1->index - hdr1.count; + + /* + * Finally sanity check just to make sure we are not returning a + * negative index + */ + if (blk2->index < 0) { + state->inleaf = 1; + blk2->index = 0; + xfs_alert(dp->i_mount, + "%s: picked the wrong leaf? reverting original leaf: blk1->index %d", + __func__, blk1->index); + } +} + +static int +xfs_dir3_data_block_free( + xfs_da_args_t *args, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_free *free, + xfs_dir2_db_t fdb, + int findex, + struct xfs_buf *fbp, + int longest) +{ + int logfree = 0; + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_inode *dp = args->dp; + + dp->d_ops->free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); + if (hdr) { + /* + * Data block is not empty, just set the free entry to the new + * value. + */ + bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(args, fbp, findex, findex); + return 0; + } + + /* One less used entry in the free table. */ + freehdr.nused--; + + /* + * If this was the last entry in the table, we can trim the table size + * back. There might be other entries at the end referring to + * non-existent data blocks, get those too. + */ + if (findex == freehdr.nvalid - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (bests[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + freehdr.nvalid = i + 1; + logfree = 0; + } else { + /* Not the last entry, just punch it out. */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + + dp->d_ops->free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_log_header(args, fbp); + + /* + * If there are no useful entries left in the block, get rid of the + * block if we can. + */ + if (!freehdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != -ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + + /* Log the free entry that changed, unless we got rid of it. */ + if (logfree) + xfs_dir2_free_log_bests(args, fbp, findex, findex); + return 0; +} + +/* + * Remove an entry from a node directory. + * This removes the leaf entry and the data entry, + * and updates the free block if necessary. + */ +static int /* error */ +xfs_dir2_leafn_remove( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp, /* leaf buffer */ + int index, /* leaf entry index */ + xfs_da_state_blk_t *dblk, /* data block */ + int *rval) /* resulting block needs join */ +{ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_db_t db; /* data block number */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int longest; /* longest data free entry */ + int off; /* data block entry offset */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leafn_remove(args, index); + + dp = args->dp; + tp = args->trans; + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + /* + * Point to the entry we're removing. + */ + lep = &ents[index]; + + /* + * Extract the data block and offset from the entry. + */ + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + ASSERT(dblk->blkno == db); + off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)); + ASSERT(dblk->index == off); + + /* + * Kill the leaf entry by marking it stale. + * Log the leaf block changes. + */ + leafhdr.stale++; + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir3_leaf_log_ents(args, bp, index, index); + + /* + * Make the data entry free. Keep track of the longest freespace + * in the data block in case it changes. + */ + dbp = dblk->bp; + hdr = dbp->b_addr; + dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); + bf = dp->d_ops->data_bestfree_p(hdr); + longest = be16_to_cpu(bf[0].length); + needlog = needscan = 0; + xfs_dir2_data_make_free(args, dbp, off, + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + /* + * Rescan the data block freespaces for bestfree. + * Log the data block header if needed. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + if (needlog) + xfs_dir2_data_log_header(args, dbp); + xfs_dir3_data_check(dp, dbp); + /* + * If the longest data block freespace changes, need to update + * the corresponding freeblock entry. + */ + if (longest < be16_to_cpu(bf[0].length)) { + int error; /* error return value */ + struct xfs_buf *fbp; /* freeblock buffer */ + xfs_dir2_db_t fdb; /* freeblock block number */ + int findex; /* index in freeblock entries */ + xfs_dir2_free_t *free; /* freeblock structure */ + + /* + * Convert the data block number to a free block, + * read in the free block. + */ + fdb = dp->d_ops->db_to_fdb(args->geo, db); + error = xfs_dir2_free_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fdb), + &fbp); + if (error) + return error; + free = fbp->b_addr; +#ifdef DEBUG + { + struct xfs_dir3_icfree_hdr freehdr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) * + (fdb - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET))); + } +#endif + /* + * Calculate which entry we need to fix. + */ + findex = dp->d_ops->db_to_fdindex(args->geo, db); + longest = be16_to_cpu(bf[0].length); + /* + * If the data block is now empty we can get rid of it + * (usually). + */ + if (longest == args->geo->blksize - + dp->d_ops->data_entry_offset) { + /* + * Try to punch out the data block. + */ + error = xfs_dir2_shrink_inode(args, db, dbp); + if (error == 0) { + dblk->bp = NULL; + hdr = NULL; + } + /* + * We can get ENOSPC if there's no space reservation. + * In this case just drop the buffer and some one else + * will eventually get rid of the empty block. + */ + else if (!(error == -ENOSPC && args->total == 0)) + return error; + } + /* + * If we got rid of the data block, we can eliminate that entry + * in the free block. + */ + error = xfs_dir3_data_block_free(args, hdr, free, + fdb, findex, fbp, longest); + if (error) + return error; + } + + xfs_dir3_leaf_check(dp, bp); + /* + * Return indication of whether this leaf block is empty enough + * to justify trying to join it with a neighbor. + */ + *rval = (dp->d_ops->leaf_hdr_size + + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < + args->geo->magicpct; + return 0; +} + +/* + * Split the leaf entries in the old block into old and new blocks. + */ +int /* error */ +xfs_dir2_leafn_split( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *oldblk, /* original block */ + xfs_da_state_blk_t *newblk) /* newly created block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dablk_t blkno; /* new leaf block number */ + int error; /* error return value */ + struct xfs_inode *dp; + + /* + * Allocate space for a new leaf node. + */ + args = state->args; + dp = args->dp; + ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); + error = xfs_da_grow_inode(args, &blkno); + if (error) { + return error; + } + /* + * Initialize the new leaf block. + */ + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) + return error; + + newblk->blkno = blkno; + newblk->magic = XFS_DIR2_LEAFN_MAGIC; + /* + * Rebalance the entries across the two leaves, link the new + * block into the leaves. + */ + xfs_dir2_leafn_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); + if (error) { + return error; + } + /* + * Insert the new entry in the correct block. + */ + if (state->inleaf) + error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index); + else + error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index); + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL); + xfs_dir3_leaf_check(dp, oldblk->bp); + xfs_dir3_leaf_check(dp, newblk->bp); + return error; +} + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +int /* error */ +xfs_dir2_leafn_toosmall( + xfs_da_state_t *state, /* btree cursor */ + int *action) /* resulting action to take */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dablk_t blkno; /* leaf block number */ + struct xfs_buf *bp; /* leaf buffer */ + int bytes; /* bytes in use */ + int count; /* leaf live entry count */ + int error; /* error return value */ + int forward; /* sibling block direction */ + int i; /* sibling counter */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int rval; /* result from path_shift */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = state->args->dp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[state->path.active - 1]; + leaf = blk->bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir3_leaf_check(dp, blk->bp); + + count = leafhdr.count - leafhdr.stale; + bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]); + if (bytes > (state->args->geo->blksize >> 1)) { + /* + * Blk over 50%, don't try to join. + */ + *action = 0; + return 0; + } + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (leafhdr.forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, + &rval); + if (error) + return error; + *action = rval ? 2 : 0; + return 0; + } + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. + */ + forward = leafhdr.forw < leafhdr.back; + for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { + struct xfs_dir3_icleaf_hdr hdr2; + + blkno = forward ? leafhdr.forw : leafhdr.back; + if (blkno == 0) + continue; + /* + * Read the sibling leaf block. + */ + error = xfs_dir3_leafn_read(state->args->trans, dp, + blkno, -1, &bp); + if (error) + return error; + + /* + * Count bytes in the two blocks combined. + */ + count = leafhdr.count - leafhdr.stale; + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2); + + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + count += hdr2.count - hdr2.stale; + bytes -= count * sizeof(ents[0]); + + /* + * Fits with at least 25% to spare. + */ + if (bytes >= 0) + break; + xfs_trans_brelse(state->args->trans, bp); + } + /* + * Didn't like either block, give up. + */ + if (i >= 2) { + *action = 0; + return 0; + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, + &rval); + else + error = xfs_da3_path_shift(state, &state->path, forward, 0, + &rval); + if (error) { + return error; + } + *action = rval ? 0 : 1; + return 0; +} + +/* + * Move all the leaf entries from drop_blk to save_blk. + * This is done as part of a join operation. + */ +void +xfs_dir2_leafn_unbalance( + xfs_da_state_t *state, /* cursor */ + xfs_da_state_blk_t *drop_blk, /* dead block */ + xfs_da_state_blk_t *save_blk) /* surviving block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ + xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + struct xfs_dir3_icleaf_hdr savehdr; + struct xfs_dir3_icleaf_hdr drophdr; + struct xfs_dir2_leaf_entry *sents; + struct xfs_dir2_leaf_entry *dents; + struct xfs_inode *dp = state->args->dp; + + args = state->args; + ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; + + dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf); + dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf); + sents = dp->d_ops->leaf_ents_p(save_leaf); + dents = dp->d_ops->leaf_ents_p(drop_leaf); + + /* + * If there are any stale leaf entries, take this opportunity + * to purge them. + */ + if (drophdr.stale) + xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp); + if (savehdr.stale) + xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp); + + /* + * Move the entries from drop to the appropriate end of save. + */ + drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval); + if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp)) + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, 0, + drophdr.count); + else + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, + savehdr.count, drophdr.count); + save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); + + /* log the changes made when moving the entries */ + dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr); + dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir3_leaf_log_header(args, save_blk->bp); + xfs_dir3_leaf_log_header(args, drop_blk->bp); + + xfs_dir3_leaf_check(dp, save_blk->bp); + xfs_dir3_leaf_check(dp, drop_blk->bp); +} + +/* + * Top-level node form directory addname routine. + */ +int /* error */ +xfs_dir2_node_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block for insert */ + int error; /* error return value */ + int rval; /* sub-return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_addname(args); + + /* + * Allocate and initialize the state (btree cursor). + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + /* + * Look up the name. We're not supposed to find it, but + * this gives us the insertion point. + */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + rval = error; + if (rval != -ENOENT) { + goto done; + } + /* + * Add the data entry to a data block. + * Extravalid is set to a freeblock found by lookup. + */ + rval = xfs_dir2_node_addname_int(args, + state->extravalid ? &state->extrablk : NULL); + if (rval) { + goto done; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + /* + * Add the new leaf entry. + */ + rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); + if (rval == 0) { + /* + * It worked, fix the hash values up the btree. + */ + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) + xfs_da3_fixhashpath(state, &state->path); + } else { + /* + * It didn't work, we need to split the leaf block. + */ + if (args->total == 0) { + ASSERT(rval == -ENOSPC); + goto done; + } + /* + * Split the leaf block and insert the new entry. + */ + rval = xfs_da3_split(state); + } +done: + xfs_da_state_free(state); + return rval; +} + +/* + * Add the data entry for a node-format directory name addition. + * The leaf entry is added in xfs_dir2_leafn_add. + * We may enter with a freespace block that the lookup found. + */ +static int /* error */ +xfs_dir2_node_addname_int( + xfs_da_args_t *args, /* operation arguments */ + xfs_da_state_blk_t *fblk) /* optional freespace block */ +{ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_db_t dbno; /* data block number */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ + int error; /* error return value */ + xfs_dir2_db_t fbno; /* freespace block number */ + struct xfs_buf *fbp; /* freespace buffer */ + int findex; /* freespace entry index */ + xfs_dir2_free_t *free=NULL; /* freespace block structure */ + xfs_dir2_db_t ifbno; /* initial freespace block no */ + xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ + int length; /* length of the new entry */ + int logfree; /* need to log free entry */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + __be16 *tagp; /* data entry tag pointer */ + xfs_trans_t *tp; /* transaction pointer */ + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + length = dp->d_ops->data_entsize(args->namelen); + /* + * If we came in with a freespace block that means that lookup + * found an entry with our hash value. This is the freespace + * block for that data entry. + */ + if (fblk) { + fbp = fblk->bp; + /* + * Remember initial freespace block number. + */ + ifbno = fblk->blkno; + free = fbp->b_addr; + findex = fblk->index; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + /* + * This means the free entry showed that the data block had + * space for our entry, so we remembered it. + * Use that data block. + */ + if (findex >= 0) { + ASSERT(findex < freehdr.nvalid); + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(bests[findex]) >= length); + dbno = freehdr.firstdb + findex; + } else { + /* + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. + */ + dbno = -1; + findex = 0; + } + } else { + /* + * Didn't come in with a freespace block, so no data block. + */ + ifbno = dbno = -1; + fbp = NULL; + findex = 0; + } + + /* + * If we don't have a data block yet, we're going to scan the + * freespace blocks looking for one. Figure out what the + * highest freespace block number is. + */ + if (dbno == -1) { + xfs_fileoff_t fo; /* freespace block number */ + + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) + return error; + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); + fbno = ifbno; + } + /* + * While we haven't identified a data block, search the freeblock + * data for a good data block. If we find a null freeblock entry, + * indicating a hole in the data blocks, remember that. + */ + while (dbno == -1) { + /* + * If we don't have a freeblock in hand, get the next one. + */ + if (fbp == NULL) { + /* + * Happens the first time through unless lookup gave + * us a freespace block to start with. + */ + if (++fbno == 0) + fbno = xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET); + /* + * If it's ifbno we already looked at it. + */ + if (fbno == ifbno) + fbno++; + /* + * If it's off the end we're done. + */ + if (fbno >= lastfbno) + break; + /* + * Read the block. There can be holes in the + * freespace blocks, so this might not succeed. + * This should be really rare, so there's no reason + * to avoid it. + */ + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); + if (error) + return error; + if (!fbp) + continue; + free = fbp->b_addr; + findex = 0; + } + /* + * Look at the current free entry. Is it good enough? + * + * The bests initialisation should be where the bufer is read in + * the above branch. But gcc is too stupid to realise that bests + * and the freehdr are actually initialised if they are placed + * there, so we have to do it here to avoid warnings. Blech. + */ + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) + dbno = freehdr.firstdb + findex; + else { + /* + * Are we done with the freeblock? + */ + if (++findex == freehdr.nvalid) { + /* + * Drop the block. + */ + xfs_trans_brelse(tp, fbp); + fbp = NULL; + if (fblk && fblk->bp) + fblk->bp = NULL; + } + } + } + /* + * If we don't have a data block, we need to allocate one and make + * the freespace entries refer to it. + */ + if (unlikely(dbno == -1)) { + /* + * Not allowed to allocate, return failure. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + return -ENOSPC; + + /* + * Allocate and initialize the new data block. + */ + if (unlikely((error = xfs_dir2_grow_inode(args, + XFS_DIR2_DATA_SPACE, + &dbno)) || + (error = xfs_dir3_data_init(args, dbno, &dbp)))) + return error; + + /* + * If (somehow) we have a freespace block, get rid of it. + */ + if (fbp) + xfs_trans_brelse(tp, fbp); + if (fblk && fblk->bp) + fblk->bp = NULL; + + /* + * Get the freespace block corresponding to the data block + * that was just allocated. + */ + fbno = dp->d_ops->db_to_fdb(args->geo, dbno); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); + if (error) + return error; + + /* + * If there wasn't a freespace block, the read will + * return a NULL fbp. Allocate and initialize a new one. + */ + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno); + if (error) + return error; + + if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { + xfs_alert(mp, + "%s: dir ino %llu needed freesp block %lld for\n" + " data block %lld, got %lld ifbno %llu lastfbno %d", + __func__, (unsigned long long)dp->i_ino, + (long long)dp->d_ops->db_to_fdb( + args->geo, dbno), + (long long)dbno, (long long)fbno, + (unsigned long long)ifbno, lastfbno); + if (fblk) { + xfs_alert(mp, + " fblk 0x%p blkno %llu index %d magic 0x%x", + fblk, + (unsigned long long)fblk->blkno, + fblk->index, + fblk->magic); + } else { + xfs_alert(mp, " ... fblk is NULL"); + } + XFS_ERROR_REPORT("xfs_dir2_node_addname_int", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + /* + * Get a buffer for the new block. + */ + error = xfs_dir3_free_get_buf(args, fbno, &fbp); + if (error) + return error; + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + /* + * Remember the first slot as our empty slot. + */ + freehdr.firstdb = + (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + dp->d_ops->free_max_bests(args->geo); + } else { + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + } + + /* + * Set the freespace block index from the data block number. + */ + findex = dp->d_ops->db_to_fdindex(args->geo, dbno); + /* + * If it's after the end of the current entries in the + * freespace block, extend that table. + */ + if (findex >= freehdr.nvalid) { + ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); + freehdr.nvalid = findex + 1; + /* + * Tag new entry so nused will go up. + */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + } + /* + * If this entry was for an empty data block + * (this should always be true) then update the header. + */ + if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_header(args, fbp); + } + /* + * Update the real value in the table. + * We haven't allocated the data entry yet so this will + * change again. + */ + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + bests[findex] = bf[0].length; + logfree = 1; + } + /* + * We had a data block so we don't have to make a new one. + */ + else { + /* + * If just checking, we succeeded. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + + /* + * Read the data block in. + */ + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, dbno), + -1, &dbp); + if (error) + return error; + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + logfree = 0; + } + ASSERT(be16_to_cpu(bf[0].length) >= length); + /* + * Point to the existing unused space. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + needscan = needlog = 0; + /* + * Mark the first part of the unused space, inuse for us. + */ + xfs_dir2_data_use_free(args, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, + &needlog, &needscan); + /* + * Fill in the new entry and log it. + */ + dep = (xfs_dir2_data_entry_t *)dup; + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, dep->namelen); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, dbp, dep); + /* + * Rescan the block for bestfree if needed. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + /* + * Log the data block header if needed. + */ + if (needlog) + xfs_dir2_data_log_header(args, dbp); + /* + * If the freespace entry is now wrong, update it. + */ + bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ + if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + bests[findex] = bf[0].length; + logfree = 1; + } + /* + * Log the freespace entry if needed. + */ + if (logfree) + xfs_dir2_free_log_bests(args, fbp, findex, findex); + /* + * Return the data block and offset in args, then drop the data block. + */ + args->blkno = (xfs_dablk_t)dbno; + args->index = be16_to_cpu(*tagp); + return 0; +} + +/* + * Lookup an entry in a node-format directory. + * All the real work happens in xfs_da3_node_lookup_int. + * The only real output is the inode number of the entry. + */ +int /* error */ +xfs_dir2_node_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + int error; /* error return value */ + int i; /* btree level */ + int rval; /* operation return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_lookup(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + /* + * Fill in the path to the entry in the cursor. + */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + rval = error; + else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) { + /* If a CI match, dup the actual name and return -EEXIST */ + xfs_dir2_data_entry_t *dep; + + dep = (xfs_dir2_data_entry_t *) + ((char *)state->extrablk.bp->b_addr + + state->extrablk.index); + rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + } + /* + * Release the btree blocks and leaf block. + */ + for (i = 0; i < state->path.active; i++) { + xfs_trans_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + /* + * Release the data block if we have it. + */ + if (state->extravalid && state->extrablk.bp) { + xfs_trans_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Remove an entry from a node-format directory. + */ +int /* error */ +xfs_dir2_node_removename( + struct xfs_da_args *args) /* operation arguments */ +{ + struct xfs_da_state_blk *blk; /* leaf block */ + int error; /* error return value */ + int rval; /* operation return value */ + struct xfs_da_state *state; /* btree cursor */ + + trace_xfs_dir2_node_removename(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + + /* Look up the entry we're deleting, set up the cursor. */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + goto out_free; + + /* Didn't find it, upper layer screwed up. */ + if (rval != -EEXIST) { + error = rval; + goto out_free; + } + + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(state->extravalid); + /* + * Remove the leaf and data entries. + * Extrablk refers to the data block. + */ + error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, + &state->extrablk, &rval); + if (error) + goto out_free; + /* + * Fix the hash values up the btree. + */ + xfs_da3_fixhashpath(state, &state->path); + /* + * If we need to join leaf blocks, do it. + */ + if (rval && state->path.active > 1) + error = xfs_da3_join(state); + /* + * If no errors so far, try conversion to leaf format. + */ + if (!error) + error = xfs_dir2_node_to_leaf(state); +out_free: + xfs_da_state_free(state); + return error; +} + +/* + * Replace an entry's inode number in a node-format directory. + */ +int /* error */ +xfs_dir2_node_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_entry_t *dep; /* data entry changed */ + int error; /* error return value */ + int i; /* btree level */ + xfs_ino_t inum; /* new inode number */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ + int rval; /* internal return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_replace(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + inum = args->inumber; + /* + * Lookup the entry to change in the btree. + */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) { + rval = error; + } + /* + * It should be found, since the vnodeops layer has looked it up + * and locked it. But paranoia is good. + */ + if (rval == -EEXIST) { + struct xfs_dir2_leaf_entry *ents; + /* + * Find the leaf entry. + */ + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + leaf = blk->bp->b_addr; + ents = args->dp->d_ops->leaf_ents_p(leaf); + lep = &ents[blk->index]; + ASSERT(state->extravalid); + /* + * Point to the data entry. + */ + hdr = state->extrablk.bp->b_addr; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + dep = (xfs_dir2_data_entry_t *) + ((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); + ASSERT(inum != be64_to_cpu(dep->inumber)); + /* + * Fill in the new inode number and log the entry. + */ + dep->inumber = cpu_to_be64(inum); + args->dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); + rval = 0; + } + /* + * Didn't find it, and we're holding a data block. Drop it. + */ + else if (state->extravalid) { + xfs_trans_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + /* + * Release all the buffers in the cursor. + */ + for (i = 0; i < state->path.active; i++) { + xfs_trans_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Trim off a trailing empty freespace block. + * Return (in rvalp) 1 if we did it, 0 if not. + */ +int /* error */ +xfs_dir2_node_trim_free( + xfs_da_args_t *args, /* operation arguments */ + xfs_fileoff_t fo, /* free block number */ + int *rvalp) /* out: did something */ +{ + struct xfs_buf *bp; /* freespace buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_free_t *free; /* freespace structure */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; + + dp = args->dp; + tp = args->trans; + /* + * Read the freespace block. + */ + error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + if (error) + return error; + /* + * There can be holes in freespace. If fo is a hole, there's + * nothing to do. + */ + if (!bp) + return 0; + free = bp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + /* + * If there are used entries, there's nothing to do. + */ + if (freehdr.nused > 0) { + xfs_trans_brelse(tp, bp); + *rvalp = 0; + return 0; + } + /* + * Blow the block away. + */ + error = xfs_dir2_shrink_inode(args, + xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp); + if (error) { + /* + * Can't fail with ENOSPC since that only happens with no + * space reservation, when breaking up an extent into two + * pieces. This is the last block of an extent. + */ + ASSERT(error != -ENOSPC); + xfs_trans_brelse(tp, bp); + return error; + } + /* + * Return that we succeeded. + */ + *rvalp = 1; + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_priv.h b/kernel/fs/xfs/libxfs/xfs_dir2_priv.h new file mode 100644 index 000000000..ef9f6ead9 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_priv.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_PRIV_H__ +#define __XFS_DIR2_PRIV_H__ + +struct dir_context; + +/* xfs_dir2.c */ +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); +extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, + xfs_dir2_db_t *dbp); +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, + const unsigned char *name, int len); + + +/* xfs_dir2_block.c */ +extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_buf **bpp); +extern int xfs_dir2_block_addname(struct xfs_da_args *args); +extern int xfs_dir2_block_lookup(struct xfs_da_args *args); +extern int xfs_dir2_block_removename(struct xfs_da_args *args); +extern int xfs_dir2_block_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, + struct xfs_buf *lbp, struct xfs_buf *dbp); + +/* xfs_dir2_data.c */ +#ifdef DEBUG +#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); +#else +#define xfs_dir3_data_check(dp,bp) +#endif + +extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); +extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno); + +extern struct xfs_dir2_data_free * +xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup, + int *loghead); +extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, + struct xfs_buf **bpp); + +/* xfs_dir2_leaf.c */ +extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); +extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, + struct xfs_buf *dbp); +extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); +extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp); +extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, + int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); +extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_buf **bpp, __uint16_t magic); +extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, + struct xfs_buf *bp); +extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); +extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, + struct xfs_buf *lbp); +extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, + struct xfs_buf *lbp, xfs_dir2_db_t db); +extern struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int compact, + int lowstale, int highstale, int *lfloglow, int *lfloghigh); +extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); + +extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + +/* xfs_dir2_node.c */ +extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, + struct xfs_buf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp, + struct xfs_buf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, + struct xfs_da_args *args, int *indexp, + struct xfs_da_state *state); +extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); +extern int xfs_dir2_leafn_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); +extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); +extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +extern int xfs_dir2_node_addname(struct xfs_da_args *args); +extern int xfs_dir2_node_lookup(struct xfs_da_args *args); +extern int xfs_dir2_node_removename(struct xfs_da_args *args); +extern int xfs_dir2_node_replace(struct xfs_da_args *args); +extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, + int *rvalp); +extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); + +/* xfs_dir2_sf.c */ +extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, + struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, + int size, xfs_dir2_sf_hdr_t *sfhp); +extern int xfs_dir2_sf_addname(struct xfs_da_args *args); +extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); +extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_sf_removename(struct xfs_da_args *args); +extern int xfs_dir2_sf_replace(struct xfs_da_args *args); + +/* xfs_dir2_readdir.c */ +extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, + size_t bufsize); + +#endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_sf.c b/kernel/fs/xfs/libxfs/xfs_dir2_sf.c new file mode 100644 index 000000000..974d62e67 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_sf.c @@ -0,0 +1,1142 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_trace.h" + +/* + * Prototypes for internal functions. + */ +static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args, + xfs_dir2_sf_entry_t *sfep, + xfs_dir2_data_aoff_t offset, + int new_isize); +static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange, + int new_isize); +static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange, + xfs_dir2_sf_entry_t **sfepp, + xfs_dir2_data_aoff_t *offsetp); +#ifdef DEBUG +static void xfs_dir2_sf_check(xfs_da_args_t *args); +#else +#define xfs_dir2_sf_check(args) +#endif /* DEBUG */ + +static void xfs_dir2_sf_toino4(xfs_da_args_t *args); +static void xfs_dir2_sf_toino8(xfs_da_args_t *args); + +/* + * Given a block directory (dp/block), calculate its size as a shortform (sf) + * directory and a header for the sf directory, if it will fit it the + * space currently present in the inode. If it won't fit, the output + * size is too big (but not accurate). + */ +int /* size for sf form */ +xfs_dir2_block_sfsize( + xfs_inode_t *dp, /* incore inode pointer */ + xfs_dir2_data_hdr_t *hdr, /* block directory data */ + xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */ +{ + xfs_dir2_dataptr_t addr; /* data entry address */ + xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */ + xfs_dir2_block_tail_t *btp; /* tail area of the block */ + int count; /* shortform entry count */ + xfs_dir2_data_entry_t *dep; /* data entry in the block */ + int i; /* block entry index */ + int i8count; /* count of big-inode entries */ + int isdot; /* entry is "." */ + int isdotdot; /* entry is ".." */ + xfs_mount_t *mp; /* mount structure pointer */ + int namelen; /* total name bytes */ + xfs_ino_t parent = 0; /* parent inode number */ + int size=0; /* total computed size */ + int has_ftype; + struct xfs_da_geometry *geo; + + mp = dp->i_mount; + geo = mp->m_dir_geo; + + /* + * if there is a filetype field, add the extra byte to the namelen + * for each entry that we see. + */ + has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; + + count = i8count = namelen = 0; + btp = xfs_dir2_block_tail_p(geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + + /* + * Iterate over the block's data entries by using the leaf pointers. + */ + for (i = 0; i < be32_to_cpu(btp->count); i++) { + if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Calculate the pointer to the entry at hand. + */ + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(geo, addr)); + /* + * Detect . and .., so we can special-case them. + * . is not included in sf directories. + * .. is included by just the parent inode number. + */ + isdot = dep->namelen == 1 && dep->name[0] == '.'; + isdotdot = + dep->namelen == 2 && + dep->name[0] == '.' && dep->name[1] == '.'; + + if (!isdot) + i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; + + /* take into account the file type field */ + if (!isdot && !isdotdot) { + count++; + namelen += dep->namelen + has_ftype; + } else if (isdotdot) + parent = be64_to_cpu(dep->inumber); + /* + * Calculate the new size, see if we should give up yet. + */ + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ + count + /* namelen */ + count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ + namelen + /* name */ + (i8count ? /* inumber */ + (uint)sizeof(xfs_dir2_ino8_t) * count : + (uint)sizeof(xfs_dir2_ino4_t) * count); + if (size > XFS_IFORK_DSIZE(dp)) + return size; /* size value is a failure */ + } + /* + * Create the output header, if it worked. + */ + sfhp->count = count; + sfhp->i8count = i8count; + dp->d_ops->sf_put_parent_ino(sfhp, parent); + return size; +} + +/* + * Convert a block format directory to shortform. + * Caller has already checked that it will fit, and built us a header. + */ +int /* error */ +xfs_dir2_block_to_sf( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp, + int size, /* shortform directory size */ + xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ +{ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data pointer */ + char *endptr; /* end of data entries */ + int error; /* error return value */ + int logflags; /* inode logging flags */ + xfs_mount_t *mp; /* filesystem mount point */ + char *ptr; /* current data pointer */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ + xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ + + trace_xfs_dir2_block_to_sf(args); + + dp = args->dp; + mp = dp->i_mount; + + /* + * allocate a temporary destination buffer the size of the inode + * to format the data into. Once we have formatted the data, we + * can free the block and copy the formatted data into the inode literal + * area. + */ + dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + hdr = bp->b_addr; + + /* + * Copy the header into the newly allocate local space. + */ + sfp = (xfs_dir2_sf_hdr_t *)dst; + memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); + + /* + * Set up to loop over the block's entries. + */ + btp = xfs_dir2_block_tail_p(args->geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + sfep = xfs_dir2_sf_firstentry(sfp); + /* + * Loop over the active and unused entries. + * Stop when we reach the leaf/tail portion of the block. + */ + while (ptr < endptr) { + /* + * If it's unused, just skip over it. + */ + dup = (xfs_dir2_data_unused_t *)ptr; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + /* + * Skip . + */ + if (dep->namelen == 1 && dep->name[0] == '.') + ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino); + /* + * Skip .., but make sure the inode number is right. + */ + else if (dep->namelen == 2 && + dep->name[0] == '.' && dep->name[1] == '.') + ASSERT(be64_to_cpu(dep->inumber) == + dp->d_ops->sf_get_parent_ino(sfp)); + /* + * Normal entry, copy it into shortform. + */ + else { + sfep->namelen = dep->namelen; + xfs_dir2_sf_put_offset(sfep, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)hdr)); + memcpy(sfep->name, dep->name, dep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, + be64_to_cpu(dep->inumber)); + dp->d_ops->sf_put_ftype(sfep, + dp->d_ops->data_get_ftype(dep)); + + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + ptr += dp->d_ops->data_entsize(dep->namelen); + } + ASSERT((char *)sfep - (char *)sfp == size); + + /* now we are done with the block, we can shrink the inode */ + logflags = XFS_ILOG_CORE; + error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); + if (error) { + ASSERT(error != -ENOSPC); + goto out; + } + + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format and copy the data in. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + + logflags |= XFS_ILOG_DDATA; + memcpy(dp->i_df.if_u1.if_data, dst, size); + dp->i_d.di_size = size; + xfs_dir2_sf_check(args); +out: + xfs_trans_log_inode(args->trans, dp, logflags); + kmem_free(dst); + return error; +} + +/* + * Add a name to a shortform directory. + * There are two algorithms, "easy" and "hard" which we decide on + * before changing anything. + * Convert to block form if necessary, if the new entry won't fit. + */ +int /* error */ +xfs_dir2_sf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int incr_isize; /* total change in size */ + int new_isize; /* di_size after adding name */ + int objchange; /* changing to 8-byte inodes */ + xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ + int pick; /* which algorithm to use */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ + + trace_xfs_dir2_sf_addname(args); + + ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Make sure the shortform value has some of its header. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + /* + * Compute entry (and change in) size. + */ + incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); + objchange = 0; + + /* + * Do we have to change to 8 byte inodes? + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { + /* + * Yes, adjust the inode size. old count + (parent + new) + */ + incr_isize += + (sfp->count + 2) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + objchange = 1; + } + + new_isize = (int)dp->i_d.di_size + incr_isize; + /* + * Won't fit as shortform any more (due to size), + * or the pick routine says it won't (due to offset values). + */ + if (new_isize > XFS_IFORK_DSIZE(dp) || + (pick = + xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { + /* + * Just checking or no space reservation, it doesn't fit. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + return -ENOSPC; + /* + * Convert to block form then add the name. + */ + error = xfs_dir2_sf_to_block(args); + if (error) + return error; + return xfs_dir2_block_addname(args); + } + /* + * Just checking, it fits. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + /* + * Do it the easy way - just add it at the end. + */ + if (pick == 1) + xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize); + /* + * Do it the hard way - look for a place to insert the new entry. + * Convert to 8 byte inode numbers first if necessary. + */ + else { + ASSERT(pick == 2); + if (objchange) + xfs_dir2_sf_toino8(args); + xfs_dir2_sf_addname_hard(args, objchange, new_isize); + } + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Add the new entry the "easy" way. + * This is copying the old directory and adding the new entry at the end. + * Since it's sorted by "offset" we need room after the last offset + * that's already there, and then room to convert to a block directory. + * This is already checked by the pick routine. + */ +static void +xfs_dir2_sf_addname_easy( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */ + xfs_dir2_data_aoff_t offset, /* offset to use for new ent */ + int new_isize) /* new directory size */ +{ + int byteoff; /* byte offset in sf dir */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + byteoff = (int)((char *)sfep - (char *)sfp); + /* + * Grow the in-inode space. + */ + xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen), + XFS_DATA_FORK); + /* + * Need to set up again due to realloc of the inode data. + */ + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); + /* + * Fill in the new entry. + */ + sfep->namelen = args->namelen; + xfs_dir2_sf_put_offset(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + + /* + * Update the header and inode. + */ + sfp->count++; + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) + sfp->i8count++; + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Add the new entry the "hard" way. + * The caller has already converted to 8 byte inode numbers if necessary, + * in which case we need to leave the i8count at 1. + * Find a hole that the new entry will fit into, and copy + * the first part of the entries, the new entry, and the last part of + * the entries. + */ +/* ARGSUSED */ +static void +xfs_dir2_sf_addname_hard( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* changing inode number size */ + int new_isize) /* new directory size */ +{ + int add_datasize; /* data size need for new ent */ + char *buf; /* buffer for old */ + xfs_inode_t *dp; /* incore directory inode */ + int eof; /* reached end of old dir */ + int nbytes; /* temp for byte copies */ + xfs_dir2_data_aoff_t new_offset; /* next offset value */ + xfs_dir2_data_aoff_t offset; /* current offset value */ + int old_isize; /* previous di_size */ + xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ + xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ + xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ + xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ + + /* + * Copy the old directory to the stack buffer. + */ + dp = args->dp; + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + old_isize = (int)dp->i_d.di_size; + buf = kmem_alloc(old_isize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + memcpy(oldsfp, sfp, old_isize); + /* + * Loop over the old directory finding the place we're going + * to insert the new entry. + * If it's going to end up at the end then oldsfep will point there. + */ + for (offset = dp->d_ops->data_first_offset, + oldsfep = xfs_dir2_sf_firstentry(oldsfp), + add_datasize = dp->d_ops->data_entsize(args->namelen), + eof = (char *)oldsfep == &buf[old_isize]; + !eof; + offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep), + eof = (char *)oldsfep == &buf[old_isize]) { + new_offset = xfs_dir2_sf_get_offset(oldsfep); + if (offset + add_datasize <= new_offset) + break; + } + /* + * Get rid of the old directory, then allocate space for + * the new one. We do this so xfs_idata_realloc won't copy + * the data. + */ + xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); + xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* + * Reset the pointer since the buffer was reallocated. + */ + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + /* + * Copy the first part of the directory, including the header. + */ + nbytes = (int)((char *)oldsfep - (char *)oldsfp); + memcpy(sfp, oldsfp, nbytes); + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes); + /* + * Fill in the new entry, and update the header counts. + */ + sfep->namelen = args->namelen; + xfs_dir2_sf_put_offset(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + sfp->count++; + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) + sfp->i8count++; + /* + * If there's more left to copy, do that. + */ + if (!eof) { + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + memcpy(sfep, oldsfep, old_isize - nbytes); + } + kmem_free(buf); + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Decide if the new entry will fit at all. + * If it will fit, pick between adding the new entry to the end (easy) + * or somewhere else (hard). + * Return 0 (won't fit), 1 (easy), 2 (hard). + */ +/*ARGSUSED*/ +static int /* pick result */ +xfs_dir2_sf_addname_pick( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* inode # size changes */ + xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */ + xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int holefit; /* found hole it will fit in */ + int i; /* entry number */ + xfs_dir2_data_aoff_t offset; /* data block offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + int size; /* entry's data size */ + int used; /* data bytes used */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + size = dp->d_ops->data_entsize(args->namelen); + offset = dp->d_ops->data_first_offset; + sfep = xfs_dir2_sf_firstentry(sfp); + holefit = 0; + /* + * Loop over sf entries. + * Keep track of data offset and whether we've seen a place + * to insert the new entry. + */ + for (i = 0; i < sfp->count; i++) { + if (!holefit) + holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); + offset = xfs_dir2_sf_get_offset(sfep) + + dp->d_ops->data_entsize(sfep->namelen); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + /* + * Calculate data bytes used excluding the new entry, if this + * was a data block (block form directory). + */ + used = offset + + (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t); + /* + * If it won't fit in a block form then we can't insert it, + * we'll go back, convert to block, then try the insert and convert + * to leaf. + */ + if (used + (holefit ? 0 : size) > args->geo->blksize) + return 0; + /* + * If changing the inode number size, do it the hard way. + */ + if (objchange) + return 2; + /* + * If it won't fit at the end then do it the hard way (use the hole). + */ + if (used + size > args->geo->blksize) + return 2; + /* + * Do it the easy way. + */ + *sfepp = sfep; + *offsetp = offset; + return 1; +} + +#ifdef DEBUG +/* + * Check consistency of shortform directory, assert if bad. + */ +static void +xfs_dir2_sf_check( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry number */ + int i8count; /* number of big inode#s */ + xfs_ino_t ino; /* entry inode number */ + int offset; /* data offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + offset = dp->d_ops->data_first_offset; + ino = dp->d_ops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); + ino = dp->d_ops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + offset = + xfs_dir2_sf_get_offset(sfep) + + dp->d_ops->data_entsize(sfep->namelen); + ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); + } + ASSERT(i8count == sfp->i8count); + ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); + ASSERT(offset + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize); +} +#endif /* DEBUG */ + +/* + * Create a new (shortform) directory. + */ +int /* error, always 0 */ +xfs_dir2_sf_create( + xfs_da_args_t *args, /* operation arguments */ + xfs_ino_t pino) /* parent inode number */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i8count; /* parent inode is an 8-byte number */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + int size; /* directory size */ + + trace_xfs_dir2_sf_create(args); + + dp = args->dp; + + ASSERT(dp != NULL); + ASSERT(dp->i_d.di_size == 0); + /* + * If it's currently a zero-length extent file, + * convert it to local format. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + dp->i_df.if_flags |= XFS_IFINLINE; + } + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ASSERT(dp->i_df.if_bytes == 0); + i8count = pino > XFS_DIR2_MAX_SHORT_INUM; + size = xfs_dir2_sf_hdr_size(i8count); + /* + * Make a buffer for the data. + */ + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + /* + * Fill in the header, + */ + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp->i8count = i8count; + /* + * Now can put in the inode number, since i8count is set. + */ + dp->d_ops->sf_put_parent_ino(sfp, pino); + sfp->count = 0; + dp->i_d.di_size = size; + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Lookup an entry in a shortform directory. + * Returns EEXIST if found, ENOENT if not found. + */ +int /* error */ +xfs_dir2_sf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + int error; + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + enum xfs_dacmp cmp; /* comparison result */ + xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ + + trace_xfs_dir2_sf_lookup(args); + + xfs_dir2_sf_check(args); + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + /* + * Special case for . + */ + if (args->namelen == 1 && args->name[0] == '.') { + args->inumber = dp->i_ino; + args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; + return -EEXIST; + } + /* + * Special case for .. + */ + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { + args->inumber = dp->d_ops->sf_get_parent_ino(sfp); + args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; + return -EEXIST; + } + /* + * Loop over all the entries trying to match ours. + */ + ci_sfep = NULL; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + /* + * Compare name and if it's an exact match, return the inode + * number. If it's the first case-insensitive match, store the + * inode number and continue looking for an exact match. + */ + cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, + sfep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); + args->filetype = dp->d_ops->sf_get_ftype(sfep); + if (cmp == XFS_CMP_EXACT) + return -EEXIST; + ci_sfep = sfep; + } + } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was not found, return -ENOENT. + */ + if (!ci_sfep) + return -ENOENT; + /* otherwise process the CI match as required by the caller */ + error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); + return error; +} + +/* + * Remove an entry from a shortform directory. + */ +int /* error */ +xfs_dir2_sf_removename( + xfs_da_args_t *args) +{ + int byteoff; /* offset of removed entry */ + xfs_inode_t *dp; /* incore directory inode */ + int entsize; /* this entry's size */ + int i; /* shortform entry index */ + int newsize; /* new inode size */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_removename(args); + + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + oldsize = (int)dp->i_d.di_size; + /* + * Bail out if the directory is way too short. + */ + if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + ASSERT(dp->i_df.if_bytes == oldsize); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); + /* + * Loop over the old directory entries. + * Find the one we're deleting. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { + ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) == + args->inumber); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->count) + return -ENOENT; + /* + * Calculate sizes. + */ + byteoff = (int)((char *)sfep - (char *)sfp); + entsize = dp->d_ops->sf_entsize(sfp, args->namelen); + newsize = oldsize - entsize; + /* + * Copy the part if any after the removed entry, sliding it down. + */ + if (byteoff + entsize < oldsize) + memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize, + oldsize - (byteoff + entsize)); + /* + * Fix up the header and file size. + */ + sfp->count--; + dp->i_d.di_size = newsize; + /* + * Reallocate, making it smaller. + */ + xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + /* + * Are we changing inode number size? + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + if (sfp->i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->i8count--; + } + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Replace the inode number of an entry in a shortform directory. + */ +int /* error */ +xfs_dir2_sf_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_ino_t ino=0; /* entry old inode number */ + int i8elevated; /* sf_toino8 set i8count=1 */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_replace(args); + + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the shortform directory is way too small. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + + /* + * New inode number is large, and need to convert to 8-byte inodes. + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { + int error; /* error return value */ + int newsize; /* new inode size */ + + newsize = + dp->i_df.if_bytes + + (sfp->count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + /* + * Won't fit as shortform, convert to block then do replace. + */ + if (newsize > XFS_IFORK_DSIZE(dp)) { + error = xfs_dir2_sf_to_block(args); + if (error) { + return error; + } + return xfs_dir2_block_replace(args); + } + /* + * Still fits, convert to 8-byte now. + */ + xfs_dir2_sf_toino8(args); + i8elevated = 1; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + } else + i8elevated = 0; + + ASSERT(args->namelen != 1 || args->name[0] != '.'); + /* + * Replace ..'s entry. + */ + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { + ino = dp->d_ops->sf_get_parent_ino(sfp); + ASSERT(args->inumber != ino); + dp->d_ops->sf_put_parent_ino(sfp, args->inumber); + } + /* + * Normal entry, look for the name. + */ + else { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { + ino = dp->d_ops->sf_get_ino(sfp, sfep); + ASSERT(args->inumber != ino); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->count) { + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + if (i8elevated) + xfs_dir2_sf_toino4(args); + return -ENOENT; + } + } + /* + * See if the old number was large, the new number is small. + */ + if (ino > XFS_DIR2_MAX_SHORT_INUM && + args->inumber <= XFS_DIR2_MAX_SHORT_INUM) { + /* + * And the old count was one, so need to convert to small. + */ + if (sfp->i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->i8count--; + } + /* + * See if the old number was small, the new number is large. + */ + if (ino <= XFS_DIR2_MAX_SHORT_INUM && + args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + /* + * add to the i8count unless we just converted to 8-byte + * inodes (which does an implied i8count = 1) + */ + ASSERT(sfp->i8count != 0); + if (!i8elevated) + sfp->i8count++; + } + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); + return 0; +} + +/* + * Convert from 8-byte inode numbers to 4-byte inode numbers. + * The last 8-byte inode number is gone, but the count is still 1. + */ +static void +xfs_dir2_sf_toino4( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + + trace_xfs_dir2_sf_toino4(args); + + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->i8count == 1); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size. + */ + newsize = + oldsize - + (oldsfp->count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->count = oldsfp->count; + sfp->i8count = 0; + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} + +/* + * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers. + * The new entry w/ an 8-byte inode number is not there yet; we leave with + * i8count set to 1, but no corresponding 8-byte entry. + */ +static void +xfs_dir2_sf_toino8( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + + trace_xfs_dir2_sf_toino8(args); + + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->i8count == 0); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size (nb: entry count + 1 for parent) + */ + newsize = + oldsize + + (oldsfp->count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->count = oldsfp->count; + sfp->i8count = 1; + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} diff --git a/kernel/fs/xfs/libxfs/xfs_dquot_buf.c b/kernel/fs/xfs/libxfs/xfs_dquot_buf.c new file mode 100644 index 000000000..6fbf2d853 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dquot_buf.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" + +int +xfs_calc_dquots_per_chunk( + unsigned int nbblks) /* basic block units */ +{ + unsigned int ndquots; + + ASSERT(nbblks > 0); + ndquots = BBTOB(nbblks); + do_div(ndquots, sizeof(xfs_dqblk_t)); + + return ndquots; +} + +/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_dqcheck( + struct xfs_mount *mp, + xfs_disk_dquot_t *ddq, + xfs_dqid_t id, + uint type, /* used only when IO_dorepair is true */ + uint flags, + char *str) +{ + xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; + int errs = 0; + + /* + * We can encounter an uninitialized dquot buffer for 2 reasons: + * 1. If we crash while deleting the quotainode(s), and those blks got + * used for user data. This is because we take the path of regular + * file deletion; however, the size field of quotainodes is never + * updated, so all the tricks that we play in itruncate_finish + * don't quite matter. + * + * 2. We don't play the quota buffers when there's a quotaoff logitem. + * But the allocation will be replayed so we'll end up with an + * uninitialized quota block. + * + * This is all fine; things are still consistent, and we haven't lost + * any quota information. Just don't complain about bad dquot blks. + */ + if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", + str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); + errs++; + } + if (ddq->d_version != XFS_DQUOT_VERSION) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", + str, id, ddq->d_version, XFS_DQUOT_VERSION); + errs++; + } + + if (ddq->d_flags != XFS_DQ_USER && + ddq->d_flags != XFS_DQ_PROJ && + ddq->d_flags != XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, unknown flags 0x%x", + str, id, ddq->d_flags); + errs++; + } + + if (id != -1 && id != be32_to_cpu(ddq->d_id)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : ondisk-dquot 0x%p, ID mismatch: " + "0x%x expected, found id 0x%x", + str, ddq, id, be32_to_cpu(ddq->d_id)); + errs++; + } + + if (!errs && ddq->d_id) { + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) > + be64_to_cpu(ddq->d_blk_softlimit)) { + if (!ddq->d_btimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) > + be64_to_cpu(ddq->d_ino_softlimit)) { + if (!ddq->d_itimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) > + be64_to_cpu(ddq->d_rtb_softlimit)) { + if (!ddq->d_rtbtimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + } + + if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) + return errs; + + if (flags & XFS_QMOPT_DOWARN) + xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); + + /* + * Typically, a repair is only requested by quotacheck. + */ + ASSERT(id != -1); + ASSERT(flags & XFS_QMOPT_DQREPAIR); + memset(d, 0, sizeof(xfs_dqblk_t)); + + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_id = cpu_to_be32(id); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + return errs; +} + +STATIC bool +xfs_dquot_buf_verify_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int ndquots; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return true; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk( + XFS_BB_TO_FSB(mp, bp->b_length)); + + for (i = 0; i < ndquots; i++, d++) { + if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF)) + return false; + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + return false; + } + return true; +} + +STATIC bool +xfs_dquot_buf_verify( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + xfs_dqid_t id = 0; + int ndquots; + int i; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < ndquots; i++) { + struct xfs_disk_dquot *ddq; + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_buf_verify"); + if (error) + return false; + } + return true; +} + +static void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dquot_buf_verify(mp, bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify(mp, bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } +} + +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; + diff --git a/kernel/fs/xfs/libxfs/xfs_format.h b/kernel/fs/xfs/libxfs/xfs_format.h new file mode 100644 index 000000000..4daaa6623 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_format.h @@ -0,0 +1,1461 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FORMAT_H__ +#define __XFS_FORMAT_H__ + +/* + * XFS On Disk Format Definitions + * + * This header file defines all the on-disk format definitions for + * general XFS objects. Directory and attribute related objects are defined in + * xfs_da_format.h, which log and log item formats are defined in + * xfs_log_format.h. Everything else goes here. + */ + +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; +struct xfs_buf; +struct xfs_ifork; + +/* + * Super block + * Fits into a sector-sized buffer at address 0 of each allocation group. + * Only the first of these is ever updated except during growfs. + */ +#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ +#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ +#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ +#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ +#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ +#define XFS_SB_VERSION_NUMBITS 0x000f +#define XFS_SB_VERSION_ALLFBITS 0xfff0 +#define XFS_SB_VERSION_ATTRBIT 0x0010 +#define XFS_SB_VERSION_NLINKBIT 0x0020 +#define XFS_SB_VERSION_QUOTABIT 0x0040 +#define XFS_SB_VERSION_ALIGNBIT 0x0080 +#define XFS_SB_VERSION_DALIGNBIT 0x0100 +#define XFS_SB_VERSION_SHAREDBIT 0x0200 +#define XFS_SB_VERSION_LOGV2BIT 0x0400 +#define XFS_SB_VERSION_SECTORBIT 0x0800 +#define XFS_SB_VERSION_EXTFLGBIT 0x1000 +#define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ +#define XFS_SB_VERSION_MOREBITSBIT 0x8000 + +/* + * Supported feature bit list is just all bits in the versionnum field because + * we've used them all up and understand them all. Except, of course, for the + * shared superblock bit, which nobody knows what it does and so is unsupported. + */ +#define XFS_SB_VERSION_OKBITS \ + ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ + ~XFS_SB_VERSION_SHAREDBIT) + +/* + * There are two words to hold XFS "feature" bits: the original + * word, sb_versionnum, and sb_features2. Whenever a bit is set in + * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. + * + * These defines represent bits in sb_features2. + */ +#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ +#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 +#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ +#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ +#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ +#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ + +#define XFS_SB_VERSION2_OKBITS \ + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_FTYPE) + +/* + * Superblock - in core version. Must match the ondisk version below. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_sb { + __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __uint32_t sb_blocksize; /* logical block size, bytes */ + xfs_rfsblock_t sb_dblocks; /* number of data blocks */ + xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ + xfs_rtblock_t sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + xfs_fsblock_t sb_logstart; /* starting block of log if internal */ + xfs_ino_t sb_rootino; /* root inode number */ + xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ + xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ + xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ + xfs_agblock_t sb_agblocks; /* size of an allocation group */ + xfs_agnumber_t sb_agcount; /* number of allocation groups */ + xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ + xfs_extlen_t sb_logblocks; /* number of log blocks */ + __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + __uint16_t sb_sectsize; /* volume sector size, bytes */ + __uint16_t sb_inodesize; /* inode size, bytes */ + __uint16_t sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __uint8_t sb_blocklog; /* log2 of sb_blocksize */ + __uint8_t sb_sectlog; /* log2 of sb_sectsize */ + __uint8_t sb_inodelog; /* log2 of sb_inodesize */ + __uint8_t sb_inopblog; /* log2 of sb_inopblock */ + __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __uint8_t sb_rextslog; /* log2 of sb_rextents */ + __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + __uint8_t sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __uint64_t sb_icount; /* allocated inodes */ + __uint64_t sb_ifree; /* free inodes */ + __uint64_t sb_fdblocks; /* free data blocks */ + __uint64_t sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + xfs_ino_t sb_uquotino; /* user quota inode */ + xfs_ino_t sb_gquotino; /* group quota inode */ + __uint16_t sb_qflags; /* quota flags */ + __uint8_t sb_flags; /* misc. flags */ + __uint8_t sb_shared_vn; /* shared version number */ + xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __uint32_t sb_unit; /* stripe or raid unit */ + __uint32_t sb_width; /* stripe or raid width */ + __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + __uint8_t sb_logsectlog; /* log2 of the log sector size */ + __uint16_t sb_logsectsize; /* sector size for the log, bytes */ + __uint32_t sb_logsunit; /* stripe unit size for the log */ + __uint32_t sb_features2; /* additional feature bits */ + + /* + * bad features2 field as a result of failing to pad the sb structure to + * 64 bits. Some machines will be using this field for features2 bits. + * Easiest just to mark it bad and not use it for anything else. + * + * This is not kept up to date in memory; it is always overwritten by + * the value in sb_features2 when formatting the incore superblock to + * the disk buffer. + */ + __uint32_t sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_sb_t; + +#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) + +/* + * Superblock - on disk version. Must match the in core version above. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_dsb { + __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __be32 sb_blocksize; /* logical block size, bytes */ + __be64 sb_dblocks; /* number of data blocks */ + __be64 sb_rblocks; /* number of realtime blocks */ + __be64 sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + __be64 sb_logstart; /* starting block of log if internal */ + __be64 sb_rootino; /* root inode number */ + __be64 sb_rbmino; /* bitmap inode for realtime extents */ + __be64 sb_rsumino; /* summary inode for rt bitmap */ + __be32 sb_rextsize; /* realtime extent size, blocks */ + __be32 sb_agblocks; /* size of an allocation group */ + __be32 sb_agcount; /* number of allocation groups */ + __be32 sb_rbmblocks; /* number of rt bitmap blocks */ + __be32 sb_logblocks; /* number of log blocks */ + __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ + __be16 sb_sectsize; /* volume sector size, bytes */ + __be16 sb_inodesize; /* inode size, bytes */ + __be16 sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __u8 sb_blocklog; /* log2 of sb_blocksize */ + __u8 sb_sectlog; /* log2 of sb_sectsize */ + __u8 sb_inodelog; /* log2 of sb_inodesize */ + __u8 sb_inopblog; /* log2 of sb_inopblock */ + __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __u8 sb_rextslog; /* log2 of sb_rextents */ + __u8 sb_inprogress; /* mkfs is in progress, don't mount */ + __u8 sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __be64 sb_icount; /* allocated inodes */ + __be64 sb_ifree; /* free inodes */ + __be64 sb_fdblocks; /* free data blocks */ + __be64 sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + __be64 sb_uquotino; /* user quota inode */ + __be64 sb_gquotino; /* group quota inode */ + __be16 sb_qflags; /* quota flags */ + __u8 sb_flags; /* misc. flags */ + __u8 sb_shared_vn; /* shared version number */ + __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __be32 sb_unit; /* stripe or raid unit */ + __be32 sb_width; /* stripe or raid width */ + __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ + __u8 sb_logsectlog; /* log2 of the log sector size */ + __be16 sb_logsectsize; /* sector size for the log, bytes */ + __be32 sb_logsunit; /* stripe unit size for the log */ + __be32 sb_features2; /* additional feature bits */ + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_dsb_t; + + +/* + * Misc. Flags - warning - these will be cleared by xfs_repair unless + * a feature bit is set when the flag is used. + */ +#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ +#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ + +/* + * define max. shared version we can interoperate with + */ +#define XFS_SB_MAX_SHARED_VN 0 + +#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) + +/* + * The first XFS version we support is a v4 superblock with V2 directories. + */ +static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) +{ + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + + /* check for unknown features in the fs */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + return true; +} + +static inline bool xfs_sb_good_version(struct xfs_sb *sbp) +{ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return true; + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + return xfs_sb_good_v4_features(sbp); + return false; +} + +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) +{ + return sbp->sb_bad_features2 != sbp->sb_features2; +} + +static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); +} + +static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; +} + +static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); +} + +static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; +} + +static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); +} + +static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); +} + +static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); +} + +static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); +} + +static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); +} + +static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); +} + +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); +} + +/* + * sb_features2 bit version macros. + */ +static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); +} + +static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); +} + +static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; +} + +static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) +{ + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; +} + +static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); +} + +static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; +} + +/* + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. + * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. + * + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. + * + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. + */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_compat & feature) != 0; +} + +#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_ALL \ + (XFS_SB_FEAT_RO_COMPAT_FINOBT) +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_ro_compat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_ALL \ + (XFS_SB_FEAT_INCOMPAT_FTYPE) + +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_incompat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_log_incompat & feature) != 0; +} + +/* + * V5 superblock specific feature checks + */ +static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); +} + +static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); +} + +/* + * end of superblock version macros + */ + +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || + ino == sbp->sb_gquotino || + ino == sbp->sb_pquotino); +} + +#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ +#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) + +#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) +#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) +#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ + XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) + +/* + * File system sector to basic block conversions. + */ +#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) + +/* + * File system block to basic block conversions. + */ +#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) +#define XFS_BB_TO_FSB(mp,bb) \ + (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) +#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) + +/* + * File system block to byte conversions. + */ +#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSB(mp,b) \ + ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) + +/* + * Allocation group header + * + * This is divided into three structures, placed in sequential 512-byte + * buffers after a copy of the superblock (also in a 512-byte buffer). + */ +#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ +#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ +#define XFS_AGF_VERSION 1 +#define XFS_AGI_VERSION 1 + +#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) +#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) + +/* + * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * arrays below. + */ +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) + +/* + * The second word of agf_levels in the first a.g. overlaps the EFS + * superblock's magic number. Since the magic numbers valid for EFS + * are > 64k, our value cannot be confused for an EFS superblock's. + */ + +typedef struct xfs_agf { + /* + * Common allocation group header information + */ + __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ + __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ + __be32 agf_seqno; /* sequence # starting from 0 */ + __be32 agf_length; /* size in blocks of a.g. */ + /* + * Freespace information + */ + __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ + __be32 agf_spare0; /* spare field */ + __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_spare1; /* spare field */ + + __be32 agf_flfirst; /* first freelist block's index */ + __be32 agf_fllast; /* last freelist block's index */ + __be32 agf_flcount; /* count of blocks in freelist */ + __be32 agf_freeblks; /* total free blocks */ + + __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. */ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ +} xfs_agf_t; + +#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) + +#define XFS_AGF_MAGICNUM 0x00000001 +#define XFS_AGF_VERSIONNUM 0x00000002 +#define XFS_AGF_SEQNO 0x00000004 +#define XFS_AGF_LENGTH 0x00000008 +#define XFS_AGF_ROOTS 0x00000010 +#define XFS_AGF_LEVELS 0x00000020 +#define XFS_AGF_FLFIRST 0x00000040 +#define XFS_AGF_FLLAST 0x00000080 +#define XFS_AGF_FLCOUNT 0x00000100 +#define XFS_AGF_FREEBLKS 0x00000200 +#define XFS_AGF_LONGEST 0x00000400 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 +#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) + +#define XFS_AGF_FLAGS \ + { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ + { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ + { XFS_AGF_SEQNO, "SEQNO" }, \ + { XFS_AGF_LENGTH, "LENGTH" }, \ + { XFS_AGF_ROOTS, "ROOTS" }, \ + { XFS_AGF_LEVELS, "LEVELS" }, \ + { XFS_AGF_FLFIRST, "FLFIRST" }, \ + { XFS_AGF_FLLAST, "FLLAST" }, \ + { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ + { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ + { XFS_AGF_LONGEST, "LONGEST" }, \ + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) +#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) + +/* + * Size of the unlinked inode hash table in the agi. + */ +#define XFS_AGI_UNLINKED_BUCKETS 64 + +typedef struct xfs_agi { + /* + * Common allocation group header information + */ + __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ + __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ + __be32 agi_seqno; /* sequence # starting from 0 */ + __be32 agi_length; /* size in blocks of a.g. */ + /* + * Inode information + * Inodes are mapped by interpreting the inode number, so no + * mapping data is needed here. + */ + __be32 agi_count; /* count of allocated inodes */ + __be32 agi_root; /* root of inode btree */ + __be32 agi_level; /* levels in inode btree */ + __be32 agi_freecount; /* number of free inodes */ + + __be32 agi_newino; /* new inode just allocated */ + __be32 agi_dirino; /* last directory inode chunk */ + /* + * Hash table of inodes which have been unlinked but are + * still being referenced. + */ + __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + /* + * This marks the end of logging region 1 and start of logging region 2. + */ + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + __be32 agi_free_root; /* root of the free inode btree */ + __be32 agi_free_level;/* levels in free inode btree */ + + /* structure must be padded to 64 bit alignment */ +} xfs_agi_t; + +#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) + +#define XFS_AGI_MAGICNUM (1 << 0) +#define XFS_AGI_VERSIONNUM (1 << 1) +#define XFS_AGI_SEQNO (1 << 2) +#define XFS_AGI_LENGTH (1 << 3) +#define XFS_AGI_COUNT (1 << 4) +#define XFS_AGI_ROOT (1 << 5) +#define XFS_AGI_LEVEL (1 << 6) +#define XFS_AGI_FREECOUNT (1 << 7) +#define XFS_AGI_NEWINO (1 << 8) +#define XFS_AGI_DIRINO (1 << 9) +#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ +#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1 << 11) +#define XFS_AGI_FREE_LEVEL (1 << 12) +#define XFS_AGI_NUM_BITS_R2 13 + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) +#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) + +/* + * The third a.g. block contains the a.g. freelist, an array + * of block pointers to blocks owned by the allocation btree code. + */ +#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) +#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) + +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) + +/* + * Size of the AGFL. For CRC-enabled filesystes we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC. + */ +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) + +typedef struct xfs_agfl { + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) + + +#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ + (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) +#define XFS_MIN_FREELIST(a,mp) \ + (XFS_MIN_FREELIST_RAW( \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) +#define XFS_MIN_FREELIST_PAG(pag,mp) \ + (XFS_MIN_FREELIST_RAW( \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + +#define XFS_AGB_TO_FSB(mp,agno,agbno) \ + (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) +#define XFS_FSB_TO_AGNO(mp,fsbno) \ + ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) +#define XFS_FSB_TO_AGBNO(mp,fsbno) \ + ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) +#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ + ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ + (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) +#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) + +/* + * For checking for bad ranges of xfs_daddr_t's, covering multiple + * allocation groups or a single xfs_daddr_t that's a superblock copy. + */ +#define XFS_AG_CHECK_DADDR(mp,d,len) \ + ((len) == 1 ? \ + ASSERT((d) == XFS_SB_DADDR || \ + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) + +typedef struct xfs_timestamp { + __be32 t_sec; /* timestamp seconds */ + __be32 t_nsec; /* timestamp nanoseconds */ +} xfs_timestamp_t; + +/* + * On-disk inode structure. + * + * This is just the header or "dinode core", the inode is expanded to fill a + * variable size the leftover area split into a data and an attribute fork. + * The format of the data and attribute fork depends on the format of the + * inode as indicated by di_format and di_aformat. To access the data and + * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros + * below. + * + * There is a very similar struct icdinode in xfs_inode which matches the + * layout of the first 96 bytes of this structure, but is kept in native + * format instead of big endian. + * + * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed + * padding field for v3 inodes. + */ +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) +typedef struct xfs_dinode { + __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __be16 di_mode; /* mode and type of file */ + __u8 di_version; /* inode version */ + __u8 di_format; /* format of di_c data */ + __be16 di_onlink; /* old number of links to file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ + __u8 di_pad[6]; /* unused, zeroed space */ + __be16 di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ + xfs_timestamp_t di_ctime; /* time created/inode modified */ + __be64 di_size; /* number of bytes in file */ + __be64 di_nblocks; /* # of direct & btree blocks used */ + __be32 di_extsize; /* basic/minimum extent size for file */ + __be32 di_nextents; /* number of extents in data fork */ + __be16 di_anextents; /* number of extents in attribute fork*/ + __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ + __s8 di_aformat; /* format of attr fork's data */ + __be32 di_dmevmask; /* DMIG event mask */ + __be16 di_dmstate; /* DMIG state info */ + __be16 di_flags; /* random flags, XFS_DIFLAG_... */ + __be32 di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; + +#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) + +#define DI_MAX_FLUSH 0xffff + +/* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + +/* + * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. + * Since the pathconf interface is signed, we use 2^31 - 1 instead. + * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. + */ +#define XFS_MAXLINK ((1U << 31) - 1U) +#define XFS_MAXLINK_1 65535U + +/* + * Values for di_format + */ +typedef enum xfs_dinode_fmt { + XFS_DINODE_FMT_DEV, /* xfs_dev_t */ + XFS_DINODE_FMT_LOCAL, /* bulk data */ + XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ + XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ + XFS_DINODE_FMT_UUID /* uuid_t */ +} xfs_dinode_fmt_t; + +/* + * Inode minimum and maximum sizes. + */ +#define XFS_DINODE_MIN_LOG 8 +#define XFS_DINODE_MAX_LOG 11 +#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) +#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) + +/* + * Inode size for given fs. + */ +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) + +/* + * Inode data & attribute fork sizes, per inode. + */ +#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) + +#define XFS_DFORK_DSIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version)) +#define XFS_DFORK_ASIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ + 0) +#define XFS_DFORK_SIZE(dip,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_DFORK_DSIZE(dip, mp) : \ + XFS_DFORK_ASIZE(dip, mp)) + +/* + * Return pointers to the data or attribute forks. + */ +#define XFS_DFORK_DPTR(dip) \ + ((char *)dip + xfs_dinode_size(dip->di_version)) +#define XFS_DFORK_APTR(dip) \ + (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) +#define XFS_DFORK_PTR(dip,w) \ + ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) + +#define XFS_DFORK_FORMAT(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + (dip)->di_format : \ + (dip)->di_aformat) +#define XFS_DFORK_NEXTENTS(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + be32_to_cpu((dip)->di_nextents) : \ + be16_to_cpu((dip)->di_anextents)) + +/* + * For block and character special files the 32bit dev_t is stored at the + * beginning of the data fork. + */ +static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) +{ + return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); +} + +static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) +{ + *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); +} + +/* + * Values for di_flags + * There should be a one-to-one correspondence between these flags and the + * XFS_XFLAG_s. + */ +#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ +#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ +#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ +#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ +#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ +#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ +#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ +#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ +#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ +#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ +#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) +#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) +#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) +#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) +#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) +#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) +#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) +#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) +#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) +#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) +#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) +#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) +#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) + +#define XFS_DIFLAG_ANY \ + (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) + +/* + * Inode number format: + * low inopblog bits - offset in block + * next agblklog bits - block number in ag + * next agno_log bits - ag number + * high agno_log-agblklog-inopblog bits - 0 + */ +#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) +#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog +#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog +#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log +#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log +#define XFS_INO_BITS(mp) \ + XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) +#define XFS_INO_TO_AGNO(mp,i) \ + ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGINO(mp,i) \ + ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGBNO(mp,i) \ + (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ + XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) +#define XFS_INO_TO_OFFSET(mp,i) \ + ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_INO_TO_FSB(mp,i) \ + XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) +#define XFS_AGINO_TO_INO(mp,a,i) \ + (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) +#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) +#define XFS_AGINO_TO_OFFSET(mp,i) \ + ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ + ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) + +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) +#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) + +/* + * RealTime Device format definitions + */ + +/* Min and max rt extent sizes, specified in bytes */ +#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ +#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ + +#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) +#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) +#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) +#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) + +/* + * RT Summary and bit manipulation macros. + */ +#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) +#define XFS_SUMOFFSTOBLOCK(mp,s) \ + (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) +#define XFS_SUMPTR(mp,bp,so) \ + ((xfs_suminfo_t *)((bp)->b_addr + \ + (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) + +#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) +#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) +#define XFS_BITTOWORD(mp,bi) \ + ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) + +#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) +#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) + +#define XFS_RTLOBIT(w) xfs_lowbit32(w) +#define XFS_RTHIBIT(w) xfs_highbit32(w) + +#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) + +/* + * Dquot and dquot block format definitions + */ +#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ +#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ + +/* + * This is the main portion of the on-disk representation of quota + * information for a user. This is the q_core of the xfs_dquot_t that + * is kept in kernel memory. We pad this with some more expansion room + * to construct the on disk structure. + */ +typedef struct xfs_disk_dquot { + __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ + __u8 d_version; /* dquot version */ + __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __be32 d_id; /* user,project,group id */ + __be64 d_blk_hardlimit;/* absolute limit on disk blks */ + __be64 d_blk_softlimit;/* preferred limit on disk blks */ + __be64 d_ino_hardlimit;/* maximum # allocated inodes */ + __be64 d_ino_softlimit;/* preferred inode limit */ + __be64 d_bcount; /* disk blocks owned by the user */ + __be64 d_icount; /* inodes owned by the user */ + __be32 d_itimer; /* zero if within inode limits if not, + this is when we refuse service */ + __be32 d_btimer; /* similar to above; for disk blocks */ + __be16 d_iwarns; /* warnings issued wrt num inodes */ + __be16 d_bwarns; /* warnings issued wrt disk blocks */ + __be32 d_pad0; /* 64 bit align */ + __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ + __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ + __be64 d_rtbcount; /* realtime blocks owned */ + __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ + __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ + __be16 d_pad; +} xfs_disk_dquot_t; + +/* + * This is what goes on disk. This is separated from the xfs_disk_dquot because + * carrying the unnecessary padding would be a waste of memory. + */ +typedef struct xfs_dqblk { + xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ + char dd_fill[4]; /* filling for posterity */ + + /* + * These two are only present on filesystems with the CRC bits set. + */ + __be32 dd_crc; /* checksum */ + __be64 dd_lsn; /* last modification in log */ + uuid_t dd_uuid; /* location information */ +} xfs_dqblk_t; + +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + +/* + * Remote symlink format and access functions. + */ +#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ + +struct xfs_dsymlink_hdr { + __be32 sl_magic; + __be32 sl_offset; + __be32 sl_bytes; + __be32 sl_crc; + uuid_t sl_uuid; + __be64 sl_owner; + __be64 sl_blkno; + __be64 sl_lsn; +}; + +#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 3 extents back from + * bmapi when crc headers are taken into account. + */ +#define XFS_SYMLINK_MAPS 3 + +#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_dsymlink_hdr) : 0)) + + +/* + * Allocation Btree format definitions + * + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. + */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ + +/* + * Data record/key structure + */ +typedef struct xfs_alloc_rec { + __be32 ar_startblock; /* starting block number */ + __be32 ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef struct xfs_alloc_rec_incore { + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_incore_t; + +/* btree pointer type */ +typedef __be32 xfs_alloc_ptr_t; + +/* + * Block numbers in the AG: + * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. + */ +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) + + +/* + * Inode Allocation Btree format definitions + * + * There is a btree for the inode map per allocation group. + */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ +#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ +#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ + +typedef __uint64_t xfs_inofree_t; +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) + +static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) +{ + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; +} + +/* + * Data record structure + */ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { + xfs_agino_t ir_startino; /* starting inode number */ + __int32_t ir_freecount; /* count of free inodes (set bits) */ + xfs_inofree_t ir_free; /* free inode mask */ +} xfs_inobt_rec_incore_t; + + +/* + * Key structure + */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ +} xfs_inobt_key_t; + +/* btree pointer type */ +typedef __be32 xfs_inobt_ptr_t; + +/* + * block numbers in the AG. + */ +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) + +/* + * The first data block of an AG depends on whether the filesystem was formatted + * with the finobt feature. If so, account for the finobt reserved root btree + * block. + */ +#define XFS_PREALLOC_BLOCKS(mp) \ + (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + XFS_FIBT_BLOCK(mp) + 1 : \ + XFS_IBT_BLOCK(mp) + 1) + + + +/* + * BMAP Btree format definitions + * + * This includes both the root block definition that sits inside an inode fork + * and the record/pointer formats for the leaf/node in the blocks. + */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ + +/* + * Bmap root header, on-disk form only. + */ +typedef struct xfs_bmdr_block { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITLEN 21 + +typedef struct xfs_bmbt_rec { + __be64 l0, l1; +} xfs_bmbt_rec_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; + +typedef struct xfs_bmbt_rec_host { + __uint64_t l0, l1; +} xfs_bmbt_rec_host_t; + +/* + * Values and macros for delayed-allocation startblock fields. + */ +#define STARTBLOCKVALBITS 17 +#define STARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASK \ + (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) + +static inline int isnullstartblock(xfs_fsblock_t x) +{ + return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; +} + +static inline xfs_fsblock_t nullstartblock(int k) +{ + ASSERT(k < (1 << STARTBLOCKVALBITS)); + return STARTBLOCKMASK | (k); +} + +static inline xfs_filblks_t startblockval(xfs_fsblock_t x) +{ + return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); +} + +/* + * Possible extent formats. + */ +typedef enum { + XFS_EXTFMT_NOSTATE = 0, + XFS_EXTFMT_HASSTATE +} xfs_exntfmt_t; + +/* + * Possible extent states. + */ +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, + XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID +} xfs_exntst_t; + +/* + * Incore version of above. + */ +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; + +/* + * Key structure for non-leaf levels of the tree. + */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ +} xfs_bmbt_key_t, xfs_bmdr_key_t; + +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + + +/* + * Generic Btree block format definitions + * + * This is a combination of the actual format used on disk for short and long + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. + * + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. + */ +struct xfs_btree_block { + __be32 bb_magic; /* magic number for block type */ + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ + union { + struct { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; + } s; /* short form pointers */ + struct { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ + } l; /* long form pointers */ + } bb_u; /* rest */ +}; + +#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ +#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ + +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) + +/* + * On-disk XFS access control list structure. + */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + +struct xfs_acl { + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; +}; + +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + +/* On-disk XFS extended attribute names */ +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) +#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) + +#endif /* __XFS_FORMAT_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_fs.h b/kernel/fs/xfs/libxfs/xfs_fs.h new file mode 100644 index 000000000..18dc721ca --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_fs.h @@ -0,0 +1,576 @@ +/* + * Copyright (c) 1995-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FS_H__ +#define __XFS_FS_H__ + +/* + * SGI's XFS filesystem's major stuff (constants, structures) + */ + +/* + * Direct I/O attribute record used with XFS_IOC_DIOINFO + * d_miniosz is the min xfer size, xfer size multiple and file seek offset + * alignment. + */ +#ifndef HAVE_DIOATTR +struct dioattr { + __u32 d_mem; /* data buffer memory alignment */ + __u32 d_miniosz; /* min xfer size */ + __u32 d_maxiosz; /* max xfer size */ +}; +#endif + +/* + * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. + */ +#ifndef HAVE_FSXATTR +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#endif + +/* + * Flags for the bs_xflags/fsx_xflags field + * There should be a one-to-one correspondence between these flags and the + * XFS_DIFLAG_s. + */ +#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + +/* + * Structure for XFS_IOC_GETBMAP. + * On input, fill in bmv_offset and bmv_length of the first structure + * to indicate the area of interest in the file, and bmv_entries with + * the number of array elements given back. The first structure is + * updated on return to give the offset and length for the next call. + */ +#ifndef HAVE_GETBMAP +struct getbmap { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output) */ +}; +#endif + +/* + * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries + * are used exactly as in the getbmap structure. The getbmapx structure + * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field + * is only used for the first structure. It contains input flags + * specifying XFS_IOC_GETBMAPX actions. The bmv_oflags field is filled + * in by the XFS_IOC_GETBMAPX command for each returned structure after + * the first. + */ +#ifndef HAVE_GETBMAPX +struct getbmapx { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output). */ + __s32 bmv_iflags; /* input flags (1st structure) */ + __s32 bmv_oflags; /* output flags (after 1st structure)*/ + __s32 bmv_unused1; /* future use */ + __s32 bmv_unused2; /* future use */ +}; +#endif + +/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */ +#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ +#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ +#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ +#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ +#define BMV_IF_VALID \ + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) + +/* bmv_oflags values - returned for each non-header segment */ +#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ +#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ +#define BMV_OF_LAST 0x4 /* segment is the last in the file */ + +/* + * Structure for XFS_IOC_FSSETDM. + * For use by backup and restore programs to set the XFS on-disk inode + * fields di_dmevmask and di_dmstate. These must be set to exactly and + * only values previously obtained via xfs_bulkstat! (Specifically the + * xfs_bstat_t fields bs_dmevmask and bs_dmstate.) + */ +#ifndef HAVE_FSDMIDATA +struct fsdmidata { + __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ + __u16 fsd_padding; + __u16 fsd_dmstate; /* corresponds to di_dmstate */ +}; +#endif + +/* + * File segment locking set data type for 64 bit access. + * Also used for all the RESV/FREE interfaces. + */ +typedef struct xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} xfs_flock64_t; + +/* + * Output for XFS_IOC_FSGEOMETRY_V1 + */ +typedef struct xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} xfs_fsop_geom_v1_t; + +/* + * Output for XFS_IOC_FSGEOMETRY + */ +typedef struct xfs_fsop_geom { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ + __u32 logsunit; /* log stripe unit, bytes */ +} xfs_fsop_geom_t; + +/* Output for XFS_FS_COUNTS */ +typedef struct xfs_fsop_counts { + __u64 freedata; /* free data section blocks */ + __u64 freertx; /* free rt extents */ + __u64 freeino; /* free inodes */ + __u64 allocino; /* total allocated inodes */ +} xfs_fsop_counts_t; + +/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */ +typedef struct xfs_fsop_resblks { + __u64 resblks; + __u64 resblks_avail; +} xfs_fsop_resblks_t; + +#define XFS_FSOP_GEOM_VERSION 0 + +#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */ +#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */ +#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */ +#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */ +#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */ +#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */ +#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */ +#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ +#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ +#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ +#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ +#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ + +/* + * Minimum and maximum sizes need for growth checks. + * + * Block counts are in units of filesystem blocks, not basic blocks. + */ +#define XFS_MIN_AG_BLOCKS 64 +#define XFS_MIN_LOG_BLOCKS 512ULL +#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL) +#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL) + +/* keep the maximum size under 2^31 by a small amount */ +#define XFS_MAX_LOG_BYTES \ + ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) + +/* Used for sanity checks on superblock */ +#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks) +#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) * \ + (s)->sb_agblocks + XFS_MIN_AG_BLOCKS) + +/* + * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT + */ +typedef struct xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} xfs_growfs_data_t; + +typedef struct xfs_growfs_log { + __u32 newblocks; /* new log size, fsblocks */ + __u32 isint; /* 1 if new log is internal */ +} xfs_growfs_log_t; + +typedef struct xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} xfs_growfs_rt_t; + + +/* + * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE + */ +typedef struct xfs_bstime { + time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} xfs_bstime_t; + +typedef struct xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + xfs_bstime_t bs_atime; /* access time */ + xfs_bstime_t bs_mtime; /* modify time */ + xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* higher part of project id */ + unsigned char bs_pad[10]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} xfs_bstat_t; + +/* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was choosen + * to retain compatibility with "old" filesystems). + */ +static inline __uint32_t +bstat_get_projid(struct xfs_bstat *bs) +{ + return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; +} + +/* + * The user-level BulkStat Request interface structure. + */ +typedef struct xfs_fsop_bulkreq { + __u64 __user *lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + void __user *ubuffer;/* user buffer for inode desc. */ + __s32 __user *ocount; /* output count pointer */ +} xfs_fsop_bulkreq_t; + + +/* + * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS). + */ +typedef struct xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} xfs_inogrp_t; + + +/* + * Error injection. + */ +typedef struct xfs_error_injection { + __s32 fd; + __s32 errtag; +} xfs_error_injection_t; + + +/* + * Speculative preallocation trimming. + */ +#define XFS_EOFBLOCKS_VERSION 1 +struct xfs_fs_eofblocks { + __u32 eof_version; + __u32 eof_flags; + uid_t eof_uid; + gid_t eof_gid; + prid_t eof_prid; + __u32 pad32; + __u64 eof_min_file_size; + __u64 pad64[12]; +}; + +/* eof_flags values */ +#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ +#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ +#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ +#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ +#define XFS_EOF_FLAGS_UNION (1 << 5) /* union filter algorithm; + * kernel only, not included in + * valid mask */ +#define XFS_EOF_FLAGS_VALID \ + (XFS_EOF_FLAGS_SYNC | \ + XFS_EOF_FLAGS_UID | \ + XFS_EOF_FLAGS_GID | \ + XFS_EOF_FLAGS_PRID | \ + XFS_EOF_FLAGS_MINFILESIZE) + + +/* + * The user-level Handle Request interface structure. + */ +typedef struct xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + void __user *path; /* user pathname */ + __u32 oflags; /* open flags */ + void __user *ihandle;/* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + void __user *ohandle;/* user buffer for handle */ + __u32 __user *ohandlen;/* user buffer length */ +} xfs_fsop_handlereq_t; + +/* + * Compound structures for passing args through Handle Request interfaces + * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and + * XFS_IOC_ATTRMULTI_BY_HANDLE + */ + +typedef struct xfs_fsop_setdm_handlereq { + struct xfs_fsop_handlereq hreq; /* handle information */ + struct fsdmidata __user *data; /* DMAPI data */ +} xfs_fsop_setdm_handlereq_t; + +typedef struct xfs_attrlist_cursor { + __u32 opaque[4]; +} xfs_attrlist_cursor_t; + +typedef struct xfs_fsop_attrlist_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to use */ + __u32 buflen; /* length of buffer supplied */ + void __user *buffer; /* returned names */ +} xfs_fsop_attrlist_handlereq_t; + +typedef struct xfs_attr_multiop { + __u32 am_opcode; +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ + __s32 am_error; + void __user *am_attrname; + void __user *am_attrvalue; + __u32 am_length; + __u32 am_flags; +} xfs_attr_multiop_t; + +typedef struct xfs_fsop_attrmulti_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + struct xfs_attr_multiop __user *ops; /* attr_multi data */ +} xfs_fsop_attrmulti_handlereq_t; + +/* + * per machine unique filesystem identifier types. + */ +typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ + +typedef struct xfs_fid { + __u16 fid_len; /* length of remainder */ + __u16 fid_pad; + __u32 fid_gen; /* generation number */ + __u64 fid_ino; /* 64 bits inode number */ +} xfs_fid_t; + +typedef struct xfs_handle { + union { + __s64 align; /* force alignment of ha_fid */ + xfs_fsid_t _ha_fsid; /* unique file system identifier */ + } ha_u; + xfs_fid_t ha_fid; /* file system specific file ID */ +} xfs_handle_t; +#define ha_fsid ha_u._ha_fsid + +#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \ + - (char *) &(handle)) \ + + (handle).ha_fid.fid_len) + +/* + * Structure passed to XFS_IOC_SWAPEXT + */ +typedef struct xfs_swapext +{ + __int64_t sx_version; /* version */ +#define XFS_SX_VERSION 0 + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} xfs_swapext_t; + +/* + * Flags for going down operation + */ +#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * ioctl commands that are used by Linux filesystems + */ +#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS +#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS +#define XFS_IOC_GETVERSION FS_IOC_GETVERSION + +/* + * ioctl commands that replace IRIX fcntl()'s + * For 'documentation' purposed more than anything else, + * the "cmd #" field reflects the IRIX fcntl number. + */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) +#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) +#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) +#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) +#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) +#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) +#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64) +#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap) +#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr) +/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ +/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ +#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) +#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) + +/* + * ioctl commands that replace IRIX syssgi()'s + */ +#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1) +#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq) +#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq) +#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext) +#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data) +#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log) +#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt) +#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts) +#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks) +#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks) +#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) +#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) +/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ + +/* XFS_IOC_FREEZE -- FIFREEZE 119 */ +/* XFS_IOC_THAW -- FITHAW 120 */ +#ifndef FIFREEZE +#define XFS_IOC_FREEZE _IOWR('X', 119, int) +#define XFS_IOC_THAW _IOWR('X', 120, int) +#endif + +#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) +#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) +#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) +#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t) +/* XFS_IOC_GETFSUUID ---------- deprecated 140 */ + + +#ifndef HAVE_BBMACROS +/* + * Block I/O parameterization. A basic block (BB) is the lowest size of + * filesystem allocation, and must equal 512. Length units given to bio + * routines are in BB's. + */ +#define BBSHIFT 9 +#define BBSIZE (1<> BBSHIFT) +#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT) +#define BBTOB(bbs) ((bbs) << BBSHIFT) +#endif + +#endif /* __XFS_FS_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc.c b/kernel/fs/xfs/libxfs/xfs_ialloc.c new file mode 100644 index 000000000..1c9e75521 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_ialloc.c @@ -0,0 +1,2202 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_icreate_item.h" +#include "xfs_icache.h" +#include "xfs_trace.h" + + +/* + * Allocation group level functions. + */ +static inline int +xfs_ialloc_cluster_alignment( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + return mp->m_sb.sb_inoalignmt; + return 1; +} + +/* + * Lookup a record by ino in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + xfs_lookup_t dir, /* <=, >=, == */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = 0; + cur->bc_rec.i.ir_free = 0; + return xfs_btree_lookup(cur, dir, stat); +} + +/* + * Update the record referred to by cur to the value given. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_inobt_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec) /* btree record */ +{ + union xfs_btree_rec rec; + + rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); + rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + rec.inobt.ir_free = cpu_to_be64(irec->ir_free); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec, /* btree record */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + } + return error; +} + +/* + * Insert a single inobt record. Cursor must already point to desired location. + */ +STATIC int +xfs_inobt_insert_rec( + struct xfs_btree_cur *cur, + __int32_t freecount, + xfs_inofree_t free, + int *stat) +{ + cur->bc_rec.i.ir_freecount = freecount; + cur->bc_rec.i.ir_free = free; + return xfs_btree_insert(cur, stat); +} + +/* + * Insert records describing a newly allocated inode chunk into the inobt. + */ +STATIC int +xfs_inobt_insert( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t newino, + xfs_agino_t newlen, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agino_t thisino; + int i; + int error; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + + error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + XFS_INOBT_ALL_FREE, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + return 0; +} + +/* + * Verify that the number of free inodes in the AGI is correct. + */ +#ifdef DEBUG +STATIC int +xfs_check_agi_freecount( + struct xfs_btree_cur *cur, + struct xfs_agi *agi) +{ + if (cur->bc_nlevels == 1) { + xfs_inobt_rec_incore_t rec; + int freecount = 0; + int error; + int i; + + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + + do { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + + if (i) { + freecount += rec.ir_freecount; + error = xfs_btree_increment(cur, 0, &i); + if (error) + return error; + } + } while (i == 1); + + if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + } + return 0; +} +#else +#define xfs_check_agi_freecount(cur, agi) 0 +#endif + +/* + * Initialise a new set of inodes. When called without a transaction context + * (e.g. from recovery) we initiate a delayed write of the inode buffers rather + * than logging them (which in a transaction context puts them into the AIL + * for writeback rather than the xfsbufd queue). + */ +int +xfs_ialloc_inode_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t length, + unsigned int gen) +{ + struct xfs_buf *fbuf; + struct xfs_dinode *free; + int nbufs, blks_per_cluster, inodes_per_cluster; + int version; + int i, j; + xfs_daddr_t d; + xfs_ino_t ino = 0; + + /* + * Loop over the new block(s), filling in the inodes. For small block + * sizes, manipulate the inodes in buffers which are multiples of the + * blocks size. + */ + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = length / blks_per_cluster; + + /* + * Figure out what version number to use in the inodes we create. If + * the superblock version has caught up to the one that supports the new + * inode format, then use the new inode version. Otherwise use the old + * version so that old kernels will continue to be able to use the file + * system. + * + * For v3 inodes, we also need to write the inode number into the inode, + * so calculate the first inode number of the chunk here as + * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * across multiple filesystem blocks (such as a cluster) and so cannot + * be used in the cluster buffer loop below. + * + * Further, because we are writing the inode directly into the buffer + * and calculating a CRC on the entire inode, we have ot log the entire + * inode so that the entire range the CRC covers is present in the log. + * That means for v3 inode we log the entire buffer rather than just the + * inode cores. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + version = 3; + ino = XFS_AGINO_TO_INO(mp, agno, + XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + + /* + * log the initialisation that is about to take place as an + * logical operation. This means the transaction does not + * need to log the physical changes to the inode buffers as log + * recovery will know what initialisation is actually needed. + * Hence we only need to log the buffers as "ordered" buffers so + * they track in the AIL as if they were physically logged. + */ + if (tp) + xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + mp->m_sb.sb_inodesize, length, gen); + } else + version = 2; + + for (j = 0; j < nbufs; j++) { + /* + * Get the block. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); + if (!fbuf) + return -ENOMEM; + + /* Initialize the inode buffers and log them appropriately. */ + fbuf->b_ops = &xfs_inode_buf_ops; + xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); + for (i = 0; i < inodes_per_cluster; i++) { + int ioffset = i << mp->m_sb.sb_inodelog; + uint isize = xfs_dinode_size(version); + + free = xfs_make_iptr(mp, fbuf, i); + free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_version = version; + free->di_gen = cpu_to_be32(gen); + free->di_next_unlinked = cpu_to_be32(NULLAGINO); + + if (version == 3) { + free->di_ino = cpu_to_be64(ino); + ino++; + uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + xfs_dinode_calc_crc(mp, free); + } else if (tp) { + /* just log the inode core */ + xfs_trans_log_buf(tp, fbuf, ioffset, + ioffset + isize - 1); + } + } + + if (tp) { + /* + * Mark the buffer as an inode allocation buffer so it + * sticks in AIL at the point of this allocation + * transaction. This ensures the they are on disk before + * the tail of the log can be moved past this + * transaction (i.e. by preventing relogging from moving + * it forward in the log). + */ + xfs_trans_inode_alloc_buf(tp, fbuf); + if (version == 3) { + /* + * Mark the buffer as ordered so that they are + * not physically logged in the transaction but + * still tracked in the AIL as part of the + * transaction and pin the log appropriately. + */ + xfs_trans_ordered_buf(tp, fbuf); + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); + } + } else { + fbuf->b_flags |= XBF_DONE; + xfs_buf_delwri_queue(fbuf, buffer_list); + xfs_buf_relse(fbuf); + } + } + return 0; +} + +/* + * Allocate new inodes in the allocation group specified by agbp. + * Return 0 for success, else error code. + */ +STATIC int /* error code or 0 */ +xfs_ialloc_ag_alloc( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* alloc group buffer */ + int *alloc) +{ + xfs_agi_t *agi; /* allocation group header */ + xfs_alloc_arg_t args; /* allocation argument structure */ + xfs_agnumber_t agno; + int error; + xfs_agino_t newino; /* new first inode's number */ + xfs_agino_t newlen; /* new number of inodes */ + int isaligned = 0; /* inode allocation at stripe unit */ + /* boundary */ + struct xfs_perag *pag; + + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * Locking will ensure that we don't have two callers in here + * at one time. + */ + newlen = args.mp->m_ialloc_inos; + if (args.mp->m_maxicount && + percpu_counter_read_positive(&args.mp->m_icount) + newlen > + args.mp->m_maxicount) + return -ENOSPC; + args.minlen = args.maxlen = args.mp->m_ialloc_blks; + /* + * First try to allocate inodes contiguous with the last-allocated + * chunk of inodes. If the filesystem is striped, this will fill + * an entire stripe unit with inodes. + */ + agi = XFS_BUF_TO_AGI(agbp); + newino = be32_to_cpu(agi->agi_newino); + agno = be32_to_cpu(agi->agi_seqno); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + args.mp->m_ialloc_blks; + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.prod = 1; + + /* + * We need to take into account alignment here to ensure that + * we don't modify the free list if we fail to have an exact + * block. If we don't have an exact match, and every oher + * attempt allocation attempt fails, we'll end up cancelling + * a dirty transaction and shutting down. + * + * For an exact allocation, alignment must be 1, + * however we need to take cluster alignment into account when + * fixing up the freelist. Use the minalignslop field to + * indicate that extra blocks might be required for alignment, + * but not to use them in the actual exact allocation. + */ + args.alignment = 1; + args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1; + + /* Allow space for the inode btree to split. */ + args.minleft = args.mp->m_in_maxlevels - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + + /* + * This request might have dirtied the transaction if the AG can + * satisfy the request, but the exact block was not available. + * If the allocation did fail, subsequent requests will relax + * the exact agbno requirement and increase the alignment + * instead. It is critical that the total size of the request + * (len + alignment + slop) does not increase from this point + * on, so reset minalignslop to ensure it is not included in + * subsequent requests. + */ + args.minalignslop = 0; + } else + args.fsbno = NULLFSBLOCK; + + if (unlikely(args.fsbno == NULLFSBLOCK)) { + /* + * Set the alignment for the allocation. + * If stripe alignment is turned on then align at stripe unit + * boundary. + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size + * pieces, so don't need alignment anyway. + */ + isaligned = 0; + if (args.mp->m_sinoalign) { + ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + args.alignment = args.mp->m_dalign; + isaligned = 1; + } else + args.alignment = xfs_ialloc_cluster_alignment(args.mp); + /* + * Need to figure out where to allocate the inode blocks. + * Ideally they should be spaced out through the a.g. + * For now, just allocate blocks up front. + */ + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + /* + * Allocate a fixed-size extent of inodes. + */ + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.prod = 1; + /* + * Allow space for the inode btree to split. + */ + args.minleft = args.mp->m_in_maxlevels - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + /* + * If stripe alignment is turned on, then try again with cluster + * alignment. + */ + if (isaligned && args.fsbno == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = xfs_ialloc_cluster_alignment(args.mp); + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + if (args.fsbno == NULLFSBLOCK) { + *alloc = 0; + return 0; + } + ASSERT(args.len == args.minlen); + + /* + * Stamp and write the inode buffers. + * + * Seed the new inode cluster with a random generation number. This + * prevents short-term reuse of generation numbers if a chunk is + * freed and then immediately reallocated. We use random numbers + * rather than a linear progression to prevent the next generation + * number from being easily guessable. + */ + error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, + args.len, prandom_u32()); + + if (error) + return error; + /* + * Convert the results. + */ + newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + be32_add_cpu(&agi->agi_count, newlen); + be32_add_cpu(&agi->agi_freecount, newlen); + pag = xfs_perag_get(args.mp, agno); + pag->pagi_freecount += newlen; + xfs_perag_put(pag); + agi->agi_newino = cpu_to_be32(newino); + + /* + * Insert records describing the new inode chunk into the btrees. + */ + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_FINO); + if (error) + return error; + } + /* + * Log allocation group header fields + */ + xfs_ialloc_log_agi(tp, agbp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO); + /* + * Modify/log superblock values for inode count and inode free count. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen); + *alloc = 1; + return 0; +} + +STATIC xfs_agnumber_t +xfs_ialloc_next_ag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + + spin_lock(&mp->m_agirotor_lock); + agno = mp->m_agirotor; + if (++mp->m_agirotor >= mp->m_maxagi) + mp->m_agirotor = 0; + spin_unlock(&mp->m_agirotor_lock); + + return agno; +} + +/* + * Select an allocation group to look for a free inode in, based on the parent + * inode and the mode. Return the allocation group buffer. + */ +STATIC xfs_agnumber_t +xfs_ialloc_ag_select( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent directory inode number */ + umode_t mode, /* bits set to indicate file type */ + int okalloc) /* ok to allocate more space */ +{ + xfs_agnumber_t agcount; /* number of ag's in the filesystem */ + xfs_agnumber_t agno; /* current ag number */ + int flags; /* alloc buffer locking flags */ + xfs_extlen_t ineed; /* blocks needed for inode allocation */ + xfs_extlen_t longest = 0; /* longest extent available */ + xfs_mount_t *mp; /* mount point structure */ + int needspace; /* file mode implies space allocated */ + xfs_perag_t *pag; /* per allocation group data */ + xfs_agnumber_t pagno; /* parent (starting) ag number */ + int error; + + /* + * Files of these types need at least one block if length > 0 + * (and they won't fit in the inode, but that's hard to figure out). + */ + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + mp = tp->t_mountp; + agcount = mp->m_maxagi; + if (S_ISDIR(mode)) + pagno = xfs_ialloc_next_ag(mp); + else { + pagno = XFS_INO_TO_AGNO(mp, parent); + if (pagno >= agcount) + pagno = 0; + } + + ASSERT(pagno < agcount); + + /* + * Loop through allocation groups, looking for one with a little + * free space in it. Note we don't look for free inodes, exactly. + * Instead, we include whether there is a need to allocate inodes + * to mean that blocks must be allocated for them, + * if none are currently free. + */ + agno = pagno; + flags = XFS_ALLOC_FLAG_TRYLOCK; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) + goto nextag; + } + + if (pag->pagi_freecount) { + xfs_perag_put(pag); + return agno; + } + + if (!okalloc) + goto nextag; + + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, agno, flags); + if (error) + goto nextag; + } + + /* + * Check that there is enough free space for the file plus a + * chunk of inodes if we need to allocate some. If this is the + * first pass across the AGs, take into account the potential + * space needed for alignment of inode chunks when checking the + * longest contiguous free space in the AG - this prevents us + * from getting ENOSPC because we have free space larger than + * m_ialloc_blks but alignment constraints prevent us from using + * it. + * + * If we can't find an AG with space for full alignment slack to + * be taken into account, we must be near ENOSPC in all AGs. + * Hence we don't include alignment for the second pass and so + * if we fail allocation due to alignment issues then it is most + * likely a real ENOSPC condition. + */ + ineed = mp->m_ialloc_blks; + if (flags && ineed > 1) + ineed += xfs_ialloc_cluster_alignment(mp); + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + + if (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed) { + xfs_perag_put(pag); + return agno; + } +nextag: + xfs_perag_put(pag); + /* + * No point in iterating over the rest, if we're shutting + * down. + */ + if (XFS_FORCED_SHUTDOWN(mp)) + return NULLAGNUMBER; + agno++; + if (agno >= agcount) + agno = 0; + if (agno == pagno) { + if (flags == 0) + return NULLAGNUMBER; + flags = 0; + } + } +} + +/* + * Try to retrieve the next record to the left/right from the current one. + */ +STATIC int +xfs_ialloc_next_rec( + struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + if (left) + error = xfs_btree_decrement(cur, 0, &i); + else + error = xfs_btree_increment(cur, 0, &i); + + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + } + + return 0; +} + +STATIC int +xfs_ialloc_get_rec( + struct xfs_btree_cur *cur, + xfs_agino_t agino, + xfs_inobt_rec_incore_t *rec, + int *done) +{ + int error; + int i; + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + } + + return 0; +} + +/* + * Allocate an inode using the inobt-only algorithm. + */ +STATIC int +xfs_dialloc_ag_inobt( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur, *tcur; + struct xfs_inobt_rec_incore rec, trec; + xfs_ino_t ino; + int error; + int offset; + int i, j; + + pag = xfs_perag_get(mp, agno); + + ASSERT(pag->pagi_init); + ASSERT(pag->pagi_inodeok); + ASSERT(pag->pagi_freecount > 0); + + restart_pagno: + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = be32_to_cpu(agi->agi_newino); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + /* + * If in the same AG as the parent, try to get near the parent. + */ + if (pagno == agno) { + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ + int searchdistance = 10; + + error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0); + + if (rec.ir_freecount > 0) { + /* + * Found a free inode in the same chunk + * as the parent, done. + */ + goto alloc_inode; + } + + + /* + * In the same AG as parent, but parent's chunk is full. + */ + + /* duplicate the cursor, search left & right simultaneously */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * Skip to last blocks looked up if same parent inode. + */ + if (pagino != NULLAGINO && + pag->pagl_pagino == pagino && + pag->pagl_leftrec != NULLAGINO && + pag->pagl_rightrec != NULLAGINO) { + error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, + &trec, &doneleft); + if (error) + goto error1; + + error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, + &rec, &doneright); + if (error) + goto error1; + } else { + /* search left with tcur, back up 1 record */ + error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); + if (error) + goto error1; + + /* search right with cur, go forward 1 record. */ + error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); + if (error) + goto error1; + } + + /* + * Loop until we find an inode chunk with a free inode. + */ + while (!doneleft || !doneright) { + int useleft; /* using left inode chunk this time */ + + if (!--searchdistance) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto newino; + } + + /* figure out the closer block if both are valid. */ + if (!doneleft && !doneright) { + useleft = pagino - + (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < + rec.ir_startino - pagino; + } else { + useleft = !doneleft; + } + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { + rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* free inodes to the right? */ + if (!useleft && rec.ir_freecount) { + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* get next record to check */ + if (useleft) { + error = xfs_ialloc_next_rec(tcur, &trec, + &doneleft, 1); + } else { + error = xfs_ialloc_next_rec(cur, &rec, + &doneright, 0); + } + if (error) + goto error1; + } + + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } + + /* + * In a different AG from the parent. + * See if the most recently allocated block has any free. + */ +newino: + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); + if (error) + goto error0; + + if (i == 1) { + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) + goto error0; + + if (j == 1 && rec.ir_freecount > 0) { + /* + * The last chunk allocated in the group + * still has a free inode. + */ + goto alloc_inode; + } + } + } + + /* + * None left in the last group, search the whole AG + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + + for (;;) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (rec.ir_freecount > 0) + break; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + } + +alloc_inode: + offset = xfs_lowbit64(rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + error = xfs_inobt_update(cur, &rec); + if (error) + goto error0; + be32_add_cpu(&agi->agi_freecount, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag->pagi_freecount--; + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + xfs_perag_put(pag); + *inop = ino; + return 0; +error1: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); + return error; +} + +/* + * Use the free inode btree to allocate an inode based on distance from the + * parent. Note that the provided cursor may be deleted and replaced. + */ +STATIC int +xfs_dialloc_ag_finobt_near( + xfs_agino_t pagino, + struct xfs_btree_cur **ocur, + struct xfs_inobt_rec_incore *rec) +{ + struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ + struct xfs_btree_cur *rcur; /* right search cursor */ + struct xfs_inobt_rec_incore rrec; + int error; + int i, j; + + error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); + if (error) + return error; + + if (i == 1) { + error = xfs_inobt_get_rec(lcur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1); + + /* + * See if we've landed in the parent inode record. The finobt + * only tracks chunks with at least one free inode, so record + * existence is enough. + */ + if (pagino >= rec->ir_startino && + pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) + return 0; + } + + error = xfs_btree_dup_cursor(lcur, &rcur); + if (error) + return error; + + error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); + if (error) + goto error_rcur; + if (j == 1) { + error = xfs_inobt_get_rec(rcur, &rrec, &j); + if (error) + goto error_rcur; + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur); + } + + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur); + if (i == 1 && j == 1) { + /* + * Both the left and right records are valid. Choose the closer + * inode chunk to the target. + */ + if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > + (rrec.ir_startino - pagino)) { + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else { + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + } else if (j == 1) { + /* only the right record is valid */ + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else if (i == 1) { + /* only the left record is valid */ + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + + return 0; + +error_rcur: + xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Use the free inode btree to find a free inode based on a newino hint. If + * the hint is NULL, find the first free inode in the AG. + */ +STATIC int +xfs_dialloc_ag_finobt_newino( + struct xfs_agi *agi, + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *rec) +{ + int error; + int i; + + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); + if (error) + return error; + if (i == 1) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + return 0; + } + } + + /* + * Find the first inode available in the AG. + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + + return 0; +} + +/* + * Update the inobt based on a modification made to the finobt. Also ensure that + * the records from both trees are equivalent post-modification. + */ +STATIC int +xfs_dialloc_ag_update_inobt( + struct xfs_btree_cur *cur, /* inobt cursor */ + struct xfs_inobt_rec_incore *frec, /* finobt record */ + int offset) /* inode offset */ +{ + struct xfs_inobt_rec_incore rec; + int error; + int i; + + error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) && + (rec.ir_freecount == frec->ir_freecount)); + + return xfs_inobt_update(cur, &rec); +} + +/* + * Allocate an inode using the free inode btree, if available. Otherwise, fall + * back to the inobt search algorithm. + * + * The caller selected an AG for us, and made sure that free inodes are + * available. + */ +STATIC int +xfs_dialloc_ag( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; /* finobt cursor */ + struct xfs_btree_cur *icur; /* inobt cursor */ + struct xfs_inobt_rec_incore rec; + xfs_ino_t ino; + int error; + int offset; + int i; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); + + pag = xfs_perag_get(mp, agno); + + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = be32_to_cpu(agi->agi_newino); + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_cur; + + /* + * The search algorithm depends on whether we're in the same AG as the + * parent. If so, find the closest available inode to the parent. If + * not, consider the agi hint or find the first free inode in the AG. + */ + if (agno == pagno) + error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); + else + error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); + if (error) + goto error_cur; + + offset = xfs_lowbit64(rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + + /* + * Modify or remove the finobt record. + */ + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + if (rec.ir_freecount) + error = xfs_inobt_update(cur, &rec); + else + error = xfs_btree_delete(cur, &i); + if (error) + goto error_cur; + + /* + * The finobt has now been updated appropriately. We haven't updated the + * agi and superblock yet, so we can create an inobt cursor and validate + * the original freecount. If all is well, make the equivalent update to + * the inobt using the finobt record and offset information. + */ + icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + + error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); + if (error) + goto error_icur; + + /* + * Both trees have now been updated. We must update the perag and + * superblock before we can check the freecount for each btree. + */ + be32_add_cpu(&agi->agi_freecount, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag->pagi_freecount--; + + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_icur; + + xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_perag_put(pag); + *inop = ino; + return 0; + +error_icur: + xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); +error_cur: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); + return error; +} + +/* + * Allocate an inode on disk. + * + * Mode is used to tell whether the new inode will need space, and whether it + * is a directory. + * + * This function is designed to be called twice if it has to do an allocation + * to make more free inodes. On the first call, *IO_agbp should be set to NULL. + * If an inode is available without having to performn an allocation, an inode + * number is returned. In this case, *IO_agbp is set to NULL. If an allocation + * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. + * The caller should then commit the current transaction, allocate a + * new transaction, and call xfs_dialloc() again, passing in the previous value + * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI + * buffer is locked across the two calls, the second call is guaranteed to have + * a free inode available. + * + * Once we successfully pick an inode its number is returned and the on-disk + * data structures are updated. The inode itself is not read in, since doing so + * would break ordering constraints with xfs_reclaim. + */ +int +xfs_dialloc( + struct xfs_trans *tp, + xfs_ino_t parent, + umode_t mode, + int okalloc, + struct xfs_buf **IO_agbp, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno; + int error; + int ialloced; + int noroom = 0; + xfs_agnumber_t start_agno; + struct xfs_perag *pag; + + if (*IO_agbp) { + /* + * If the caller passes in a pointer to the AGI buffer, + * continue where we left off before. In this case, we + * know that the allocation group has free inodes. + */ + agbp = *IO_agbp; + goto out_alloc; + } + + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + if (start_agno == NULLAGNUMBER) { + *inop = NULLFSINO; + return 0; + } + + /* + * If we have already hit the ceiling of inode blocks then clear + * okalloc so we scan all available agi structures for a free + * inode. + * + * Read rough value of mp->m_icount by percpu_counter_read_positive, + * which will sacrifice the preciseness but improve the performance. + */ + if (mp->m_maxicount && + percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos + > mp->m_maxicount) { + noroom = 1; + okalloc = 0; + } + + /* + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. + */ + agno = start_agno; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) + goto out_error; + } + + /* + * Do a first racy fast path check if this AG is usable. + */ + if (!pag->pagi_freecount && !okalloc) + goto nextag; + + /* + * Then read in the AGI buffer and recheck with the AGI buffer + * lock held. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + goto out_error; + + if (pag->pagi_freecount) { + xfs_perag_put(pag); + goto out_alloc; + } + + if (!okalloc) + goto nextag_relse_buffer; + + + error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); + if (error) { + xfs_trans_brelse(tp, agbp); + + if (error != -ENOSPC) + goto out_error; + + xfs_perag_put(pag); + *inop = NULLFSINO; + return 0; + } + + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(pag->pagi_freecount > 0); + xfs_perag_put(pag); + + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; + } + +nextag_relse_buffer: + xfs_trans_brelse(tp, agbp); +nextag: + xfs_perag_put(pag); + if (++agno == mp->m_sb.sb_agcount) + agno = 0; + if (agno == start_agno) { + *inop = NULLFSINO; + return noroom ? -ENOSPC : 0; + } + } + +out_alloc: + *IO_agbp = NULL; + return xfs_dialloc_ag(tp, agbp, parent, inop); +out_error: + xfs_perag_put(pag); + return error; +} + +STATIC int +xfs_difree_inobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_bmap_free *flist, + int *deleted, + xfs_ino_t *first_ino, + struct xfs_inobt_rec_incore *orec) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int ilen; + int error; + int i; + int off; + + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); + ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); + + /* + * Initialize the cursor. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + /* + * Look for the entry describing this inode. + */ + if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { + xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.", + __func__, error); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) { + xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", + __func__, error); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Get the offset in the inode chunk. + */ + off = agino - rec.ir_startino; + ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); + ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off))); + /* + * Mark the inode free & increment the count. + */ + rec.ir_free |= XFS_INOBT_MASK(off); + rec.ir_freecount++; + + /* + * When an inode cluster is free, it becomes eligible for removal + */ + if (!(mp->m_flags & XFS_MOUNT_IKEEP) && + (rec.ir_freecount == mp->m_ialloc_inos)) { + + *deleted = 1; + *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + + /* + * Remove the inode cluster from the AGI B+Tree, adjust the + * AGI and Superblock inode counts, and mark the disk space + * to be freed when the transaction is committed. + */ + ilen = mp->m_ialloc_inos; + be32_add_cpu(&agi->agi_count, -ilen); + be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount -= ilen - 1; + xfs_perag_put(pag); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); + + if ((error = xfs_btree_delete(cur, &i))) { + xfs_warn(mp, "%s: xfs_btree_delete returned error %d.", + __func__, error); + goto error0; + } + + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), + mp->m_ialloc_blks, flist, mp); + } else { + *deleted = 0; + + error = xfs_inobt_update(cur, &rec); + if (error) { + xfs_warn(mp, "%s: xfs_inobt_update returned error %d.", + __func__, error); + goto error0; + } + + /* + * Change the inode free counts and log the ag/sb changes. + */ + be32_add_cpu(&agi->agi_freecount, 1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount++; + xfs_perag_put(pag); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); + } + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + *orec = rec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Free an inode in the free inode btree. + */ +STATIC int +xfs_difree_finobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int offset = agino - ibtrec->ir_startino; + int error; + int i; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + if (i == 0) { + /* + * If the record does not exist in the finobt, we must have just + * freed an inode in a previously fully allocated chunk. If not, + * something is out of sync. + */ + XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); + + error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + ibtrec->ir_free, &i); + if (error) + goto error; + ASSERT(i == 1); + + goto out; + } + + /* + * Read and update the existing record. We could just copy the ibtrec + * across here, but that would defeat the purpose of having redundant + * metadata. By making the modifications independently, we can catch + * corruptions that we wouldn't see if we just copied from one record + * to another. + */ + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + + rec.ir_free |= XFS_INOBT_MASK(offset); + rec.ir_freecount++; + + XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) && + (rec.ir_freecount == ibtrec->ir_freecount), + error); + + /* + * The content of inobt records should always match between the inobt + * and finobt. The lifecycle of records in the finobt is different from + * the inobt in that the finobt only tracks records with at least one + * free inode. Hence, if all of the inodes are free and we aren't + * keeping inode chunks permanently on disk, remove the record. + * Otherwise, update the record with the new information. + */ + if (rec.ir_freecount == mp->m_ialloc_inos && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + error = xfs_btree_delete(cur, &i); + if (error) + goto error; + ASSERT(i == 1); + } else { + error = xfs_inobt_update(cur, &rec); + if (error) + goto error; + } + +out: + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *deleted,/* set if inode cluster was deleted */ + xfs_ino_t *first_ino)/* first inode in deleted cluster */ +{ + /* REFERENCED */ + xfs_agblock_t agbno; /* block number containing inode */ + struct xfs_buf *agbp; /* buffer for allocation group header */ + xfs_agino_t agino; /* allocation group inode number */ + xfs_agnumber_t agno; /* allocation group number */ + int error; /* error return value */ + struct xfs_mount *mp; /* mount structure for filesystem */ + struct xfs_inobt_rec_incore rec;/* btree record */ + + mp = tp->t_mountp; + + /* + * Break up inode number into its components. + */ + agno = XFS_INO_TO_AGNO(mp, inode); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", + __func__, agno, mp->m_sb.sb_agcount); + ASSERT(0); + return -EINVAL; + } + agino = XFS_INO_TO_AGINO(mp, inode); + if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + __func__, (unsigned long long)inode, + (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + ASSERT(0); + return -EINVAL; + } + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", + __func__, agbno, mp->m_sb.sb_agblocks); + ASSERT(0); + return -EINVAL; + } + /* + * Get the allocation group header. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", + __func__, error); + return error; + } + + /* + * Fix up the inode allocation btree. + */ + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, + &rec); + if (error) + goto error0; + + /* + * Fix up the free inode btree. + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); + if (error) + goto error0; + } + + return 0; + +error0: + return error; +} + +STATIC int +xfs_imap_lookup( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + xfs_agblock_t agbno, + xfs_agblock_t *chunk_agbno, + xfs_agblock_t *offset_agbno, + int flags) +{ + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + int error; + int i; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_alert(mp, + "%s: xfs_ialloc_read_agi() returned error %d, agno %d", + __func__, error, agno); + return error; + } + + /* + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); + if (!error) { + if (i) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (!error && i == 0) + error = -EINVAL; + } + + xfs_trans_brelse(tp, agbp); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + if (error) + return error; + + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + mp->m_ialloc_inos <= agino) + return -EINVAL; + + /* for untrusted inodes check it is allocated first */ + if ((flags & XFS_IGET_UNTRUSTED) && + (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) + return -EINVAL; + + *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); + *offset_agbno = agbno - *chunk_agbno; + return 0; +} + +/* + * Return the location of the inode in imap, for mapping it into a buffer. + */ +int +xfs_imap( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags) /* flags for inode btree lookup */ +{ + xfs_agblock_t agbno; /* block number of inode in the alloc group */ + xfs_agino_t agino; /* inode number within alloc group */ + xfs_agnumber_t agno; /* allocation group number */ + int blks_per_cluster; /* num blocks per inode cluster */ + xfs_agblock_t chunk_agbno; /* first block in inode chunk */ + xfs_agblock_t cluster_agbno; /* first block in inode cluster */ + int error; /* error code */ + int offset; /* index of inode in its buffer */ + xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ + + ASSERT(ino != NULLFSINO); + + /* + * Split up the inode number into its parts. + */ + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || + ino != XFS_AGINO_TO_INO(mp, agno, agino)) { +#ifdef DEBUG + /* + * Don't output diagnostic information for untrusted inodes + * as they can be invalid without implying corruption. + */ + if (flags & XFS_IGET_UNTRUSTED) + return -EINVAL; + if (agno >= mp->m_sb.sb_agcount) { + xfs_alert(mp, + "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", + __func__, agno, mp->m_sb.sb_agcount); + } + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_alert(mp, + "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", + __func__, (unsigned long long)agbno, + (unsigned long)mp->m_sb.sb_agblocks); + } + if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_alert(mp, + "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + __func__, ino, + XFS_AGINO_TO_INO(mp, agno, agino)); + } + xfs_stack_trace(); +#endif /* DEBUG */ + return -EINVAL; + } + + blks_per_cluster = xfs_icluster_size_fsb(mp); + + /* + * For bulkstat and handle lookups, we have an untrusted inode number + * that we have to verify is valid. We cannot do this just by reading + * the inode buffer as it may have been unlinked and removed leaving + * inodes in stale state on disk. Hence we have to do a btree lookup + * in all cases where an untrusted inode number is passed. + */ + if (flags & XFS_IGET_UNTRUSTED) { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + goto out_map; + } + + /* + * If the inode cluster size is the same as the blocksize or + * smaller we get to the buffer by simple arithmetics. + */ + if (blks_per_cluster == 1) { + offset = XFS_INO_TO_OFFSET(mp, ino); + ASSERT(offset < mp->m_sb.sb_inopblock); + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap->im_len = XFS_FSB_TO_BB(mp, 1); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + return 0; + } + + /* + * If the inode chunks are aligned then use simple maths to + * find the location. Otherwise we have to do a btree + * lookup to find the location. + */ + if (mp->m_inoalign_mask) { + offset_agbno = agbno & mp->m_inoalign_mask; + chunk_agbno = agbno - offset_agbno; + } else { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + } + +out_map: + ASSERT(agbno >= chunk_agbno); + cluster_agbno = chunk_agbno + + ((offset_agbno / blks_per_cluster) * blks_per_cluster); + offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + + XFS_INO_TO_OFFSET(mp, ino); + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); + imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap->im_blkno + imap->im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + xfs_alert(mp, + "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)", + __func__, (unsigned long long) imap->im_blkno, + (unsigned long long) imap->im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); + return -EINVAL; + } + return 0; +} + +/* + * Compute and fill in value of m_in_maxlevels. + */ +void +xfs_ialloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> + XFS_INODES_PER_CHUNK_LOG; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_in_maxlevels = level; +} + +/* + * Log specified fields for the ag hdr (inode section). The growth of the agi + * structure over time requires that we interpret the buffer as two logical + * regions delineated by the end of the unlinked list. This is due to the size + * of the hash table and its location in the middle of the agi. + * + * For example, a request to log a field before agi_unlinked and a field after + * agi_unlinked could cause us to log the entire hash table and use an excessive + * amount of log space. To avoid this behavior, log the region up through + * agi_unlinked in one call and the region after agi_unlinked through the end of + * the structure in another. + */ +void +xfs_ialloc_log_agi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* allocation group header buffer */ + int fields) /* bitmask of fields to log */ +{ + int first; /* first byte number */ + int last; /* last byte number */ + static const short offsets[] = { /* field starting offsets */ + /* keep in sync with bit definitions */ + offsetof(xfs_agi_t, agi_magicnum), + offsetof(xfs_agi_t, agi_versionnum), + offsetof(xfs_agi_t, agi_seqno), + offsetof(xfs_agi_t, agi_length), + offsetof(xfs_agi_t, agi_count), + offsetof(xfs_agi_t, agi_root), + offsetof(xfs_agi_t, agi_level), + offsetof(xfs_agi_t, agi_freecount), + offsetof(xfs_agi_t, agi_newino), + offsetof(xfs_agi_t, agi_dirino), + offsetof(xfs_agi_t, agi_unlinked), + offsetof(xfs_agi_t, agi_free_root), + offsetof(xfs_agi_t, agi_free_level), + sizeof(xfs_agi_t) + }; +#ifdef DEBUG + xfs_agi_t *agi; /* allocation group header */ + + agi = XFS_BUF_TO_AGI(bp); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); +#endif + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); + + /* + * Compute byte offsets for the first and last fields in the first + * region and log the agi buffer. This only logs up through + * agi_unlinked. + */ + if (fields & XFS_AGI_ALL_BITS_R1) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } + + /* + * Mask off the bits in the first region and calculate the first and + * last field offsets for any bits in the second region. + */ + fields &= ~XFS_AGI_ALL_BITS_R1; + if (fields) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } +} + +#ifdef DEBUG +STATIC void +xfs_check_agi_unlinked( + struct xfs_agi *agi) +{ + int i; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ASSERT(agi->agi_unlinked[i]); +} +#else +#define xfs_check_agi_unlinked(agi) +#endif + +static bool +xfs_agi_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + return false; + /* + * Validate the magic number of the agi block. + */ + if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + return false; + if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) + return false; + + if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + return false; + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) + return false; + + xfs_check_agi_unlinked(agi); + return true; +} + +static void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, + XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_agi_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agi_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); +} + +const struct xfs_buf_ops xfs_agi_buf_ops = { + .verify_read = xfs_agi_read_verify, + .verify_write = xfs_agi_write_verify, +}; + +/* + * Read in the allocation group header (inode allocation section) + */ +int +xfs_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + int error; + + trace_xfs_read_agi(mp, agno); + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); + if (error) + return error; + + xfs_buf_set_ref(*bpp, XFS_AGI_REF); + return 0; +} + +int +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + struct xfs_agi *agi; /* allocation group header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + trace_xfs_ialloc_read_agi(mp, agno); + + error = xfs_read_agi(mp, tp, agno, bpp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(*bpp); + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_init) { + pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); + pag->pagi_count = be32_to_cpu(agi->agi_count); + pag->pagi_init = 1; + } + + /* + * It's possible for these to be out of sync if + * we are in the middle of a forced shutdown. + */ + ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || + XFS_FORCED_SHUTDOWN(mp)); + xfs_perag_put(pag); + return 0; +} + +/* + * Read in the agi to initialise the per-ag data in the mount structure + */ +int +xfs_ialloc_pagi_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno) /* allocation group number */ +{ + xfs_buf_t *bp = NULL; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc.h b/kernel/fs/xfs/libxfs/xfs_ialloc.h new file mode 100644 index 000000000..100007d56 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_ialloc.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IALLOC_H__ +#define __XFS_IALLOC_H__ + +struct xfs_buf; +struct xfs_dinode; +struct xfs_imap; +struct xfs_mount; +struct xfs_trans; +struct xfs_btree_cur; + +/* Move inodes in clusters of this size */ +#define XFS_INODE_BIG_CLUSTER_SIZE 8192 + +/* Calculate and return the number of filesystem blocks per inode cluster */ +static inline int +xfs_icluster_size_fsb( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) + return 1; + return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; +} + +/* + * Make an inode pointer out of the buffer/offset. + */ +static inline struct xfs_dinode * +xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) +{ + return (struct xfs_dinode *) + (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); +} + +/* + * Allocate an inode on disk. + * Mode is used to tell whether the new inode will need space, and whether + * it is a directory. + * + * To work within the constraint of one allocation per transaction, + * xfs_dialloc() is designed to be called twice if it has to do an + * allocation to make more free inodes. If an inode is + * available without an allocation, agbp would be set to the current + * agbp and alloc_done set to false. + * If an allocation needed to be done, agbp would be set to the + * inode header of the allocation group and alloc_done set to true. + * The caller should then commit the current transaction and allocate a new + * transaction. xfs_dialloc() should then be called again with + * the agbp value returned from the previous call. + * + * Once we successfully pick an inode its number is returned and the + * on-disk data structures are updated. The inode itself is not read + * in, since doing so would break ordering constraints with xfs_reclaim. + * + * *agbp should be set to NULL on the first call, *alloc_done set to FALSE. + */ +int /* error */ +xfs_dialloc( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent inode (directory) */ + umode_t mode, /* mode bits for new inode */ + int okalloc, /* ok to allocate more space */ + struct xfs_buf **agbp, /* buf for a.g. inode header */ + xfs_ino_t *inop); /* inode number allocated */ + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int /* error */ +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *deleted, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino); /* first inode in deleted cluster */ + +/* + * Return the location of the inode in imap, for mapping it into a buffer. + */ +int +xfs_imap( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags); /* flags for inode btree lookup */ + +/* + * Compute and fill in value of m_in_maxlevels. + */ +void +xfs_ialloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Log specified fields for the ag hdr (inode section) + */ +void +xfs_ialloc_log_agi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* allocation group header buffer */ + int fields); /* bitmask of fields to log */ + +/* + * Read in the allocation group header (inode allocation section) + */ +int /* error */ +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp); /* allocation group hdr buf */ + +/* + * Read in the allocation group header to initialise the per-ag data + * in the mount structure + */ +int +xfs_ialloc_pagi_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno); /* allocation group number */ + +/* + * Lookup a record by ino in the btree given by cur. + */ +int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, + xfs_lookup_t dir, int *stat); + +/* + * Get the data from the pointed-to record. + */ +int xfs_inobt_get_rec(struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, int *stat); + +/* + * Inode chunk initialisation routine + */ +int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_agblock_t length, unsigned int gen); + +int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); + + +#endif /* __XFS_IALLOC_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c new file mode 100644 index 000000000..964c465ca --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" + + +STATIC int +xfs_inobt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_inobt_mnr[level != 0]; +} + +STATIC struct xfs_btree_cur * +xfs_inobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} + +STATIC void +xfs_inobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + + agi->agi_root = nptr->s; + be32_add_cpu(&agi->agi_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); +} + +STATIC void +xfs_finobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + + agi->agi_free_root = nptr->s; + be32_add_cpu(&agi->agi_free_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, + XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); +} + +STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + xfs_agblock_t sbno = be32_to_cpu(start->s); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + + error = xfs_alloc_vextent(&args); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + + new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); + *stat = 1; + return 0; +} + +STATIC int +xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + xfs_fsblock_t fsbno; + int error; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_free_extent(cur->bc_tp, fsbno, 1); + if (error) + return error; + + xfs_trans_binval(cur->bc_tp, bp); + return error; +} + +STATIC int +xfs_inobt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_inobt_mxr[level != 0]; +} + +STATIC void +xfs_inobt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->inobt.ir_startino = rec->inobt.ir_startino; +} + +STATIC void +xfs_inobt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + rec->inobt.ir_startino = key->inobt.ir_startino; +} + +STATIC void +xfs_inobt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); + rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); +} + +/* + * initial value of ptr for lookup + */ +STATIC void +xfs_inobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + + ptr->s = agi->agi_root; +} + +STATIC void +xfs_finobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; +} + +STATIC __int64_t +xfs_inobt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be32_to_cpu(key->inobt.ir_startino) - + cur->bc_rec.i.ir_startino; +} + +static int +xfs_inobt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * During growfs operations, we can't verify the exact owner as the + * perag is not fully initialised and hence not attached to the buffer. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agi information will not yet have been initialised + * from the on disk AGI. We don't currently use any of this information, + * but beware of the landmine (i.e. need to check pag->pagi_init) if we + * ever do. + */ + switch (block->bb_magic) { + case cpu_to_be32(XFS_IBT_CRC_MAGIC): + case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_IBT_MAGIC): + case cpu_to_be32(XFS_FIBT_MAGIC): + break; + default: + return 0; + } + + /* numrecs and level verification */ + level = be16_to_cpu(block->bb_level); + if (level >= mp->m_in_maxlevels) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} + +static void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_inobt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_inobt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_inobt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_inobt_buf_ops = { + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_inobt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->inobt.ir_startino) < + be32_to_cpu(k2->inobt.ir_startino); +} + +STATIC int +xfs_inobt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= + be32_to_cpu(r2->inobt.ir_startino); +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_inobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_inobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif +}; + +static const struct xfs_btree_ops xfs_finobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_finobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif +}; + +/* + * Allocate a new inode btree cursor. + */ +struct xfs_btree_cur * /* new inode btree cursor */ +xfs_inobt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agi structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* ialloc or free ino btree */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = btnum; + if (btnum == XFS_BTNUM_INO) { + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_ops = &xfs_inobt_ops; + } else { + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ops = &xfs_finobt_ops; + } + + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an inobt btree block. + */ +int +xfs_inobt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_INOBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_inobt_rec_t); + return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); +} diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h new file mode 100644 index 000000000..d7ebea72c --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IALLOC_BTREE_H__ +#define __XFS_IALLOC_BTREE_H__ + +/* + * Inode map on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size depends on a superblock flag. + */ +#define XFS_INOBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_INOBT_REC_ADDR(mp, block, index) \ + ((xfs_inobt_rec_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_inobt_rec_t)))) + +#define XFS_INOBT_KEY_ADDR(mp, block, index) \ + ((xfs_inobt_key_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_inobt_key_t))) + +#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_inobt_ptr_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_inobt_key_t) + \ + ((index) - 1) * sizeof(xfs_inobt_ptr_t))) + +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, + xfs_btnum_t); +extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); + +#endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_inode_buf.c b/kernel/fs/xfs/libxfs/xfs_inode_buf.c new file mode 100644 index 000000000..002b6b3a1 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_inode_buf.c @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_icache.h" +#include "xfs_trans.h" +#include "xfs_ialloc.h" + +/* + * Check that none of the inode's in the buffer have a next + * unlinked field of 0. + */ +#if defined(DEBUG) +void +xfs_inobp_check( + xfs_mount_t *mp, + xfs_buf_t *bp) +{ + int i; + int j; + xfs_dinode_t *dip; + + j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + + for (i = 0; i < j; i++) { + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + i * mp->m_sb.sb_inodesize); + if (!dip->di_next_unlinked) { + xfs_alert(mp, + "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", + i, (long long)bp->b_bn); + } + } +} +#endif + +/* + * If we are doing readahead on an inode buffer, we might be in log recovery + * reading an inode allocation buffer that hasn't yet been replayed, and hence + * has not had the inode cores stamped into it. Hence for readahead, the buffer + * may be potentially invalid. + * + * If the readahead buffer is invalid, we don't want to mark it with an error, + * but we do want to clear the DONE status of the buffer so that a followup read + * will re-read it from disk. This will ensure that we don't get an unnecessary + * warnings during log recovery and we don't get unnecssary panics on debug + * kernels. + */ +static void +xfs_inode_buf_verify( + struct xfs_buf *bp, + bool readahead) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int i; + int ni; + + /* + * Validate the magic number and version of every inode in the buffer + */ + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (struct xfs_dinode *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (readahead) { + bp->b_flags &= ~XBF_DONE; + return; + } + + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); +#ifdef DEBUG + xfs_alert(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)bp->b_bn, i, + be16_to_cpu(dip->di_magic)); +#endif + } + } + xfs_inobp_check(mp, bp); +} + + +static void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, false); +} + +static void +xfs_inode_buf_readahead_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, true); +} + +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, false); +} + +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + +const struct xfs_buf_ops xfs_inode_buf_ra_ops = { + .verify_read = xfs_inode_buf_readahead_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + + +/* + * This routine is called to map an inode to the buffer containing the on-disk + * version of the inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dipp parameter it returns a + * pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and dipp are + * undefined. + */ +int +xfs_imap_to_bp( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + uint buf_flags, + uint iget_flags) +{ + struct xfs_buf *bp; + int error; + + buf_flags |= XBF_UNMAPPED; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp, + &xfs_inode_buf_ops); + if (error) { + if (error == -EAGAIN) { + ASSERT(buf_flags & XBF_TRYLOCK); + return error; + } + + if (error == -EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return -EINVAL; + + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + return error; + } + + *bpp = bp; + *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + return 0; +} + +void +xfs_dinode_from_disk( + xfs_icdinode_t *to, + xfs_dinode_t *from) +{ + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); + to->di_uid = be32_to_cpu(from->di_uid); + to->di_gid = be32_to_cpu(from->di_gid); + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = be16_to_cpu(from->di_flushiter); + to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); + to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); + to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); + to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); + to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); + to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + to->di_size = be64_to_cpu(from->di_size); + to->di_nblocks = be64_to_cpu(from->di_nblocks); + to->di_extsize = be32_to_cpu(from->di_extsize); + to->di_nextents = be32_to_cpu(from->di_nextents); + to->di_anextents = be16_to_cpu(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); + to->di_gen = be32_to_cpu(from->di_gen); + + if (to->di_version == 3) { + to->di_changecount = be64_to_cpu(from->di_changecount); + to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_ino = be64_to_cpu(from->di_ino); + to->di_lsn = be64_to_cpu(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } +} + +void +xfs_dinode_to_disk( + xfs_dinode_t *to, + xfs_icdinode_t *from) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); + to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); + to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); + to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); + to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); + to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + +static bool +xfs_dinode_verify( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) + return false; + + /* only version 3 or greater inodes are extensively verified here */ + if (dip->di_version < 3) + return true; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + return false; + if (be64_to_cpu(dip->di_ino) != ip->i_ino) + return false; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + return false; + return true; +} + +void +xfs_dinode_calc_crc( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + __uint32_t crc; + + if (dip->di_version < 3) + return; + + ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF); + dip->di_crc = xfs_end_cksum(crc); +} + +/* + * Read the disk inode attributes into the in-core inode structure. + * + * For version 5 superblocks, if we are initialising a new inode and we are not + * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new + * inode core with a random generation number. If we are keeping inodes around, + * we need to read the inode cluster to get the existing generation number off + * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode + * format) then log recovery is dependent on the di_flushiter field being + * initialised from the current on-disk value and hence we must also read the + * inode off disk. + */ +int +xfs_iread( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + uint iget_flags) +{ + xfs_buf_t *bp; + xfs_dinode_t *dip; + int error; + + /* + * Fill in the location information in the in-core inode. + */ + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); + if (error) + return error; + + /* shortcut IO on inode allocation if possible */ + if ((iget_flags & XFS_IGET_CREATE) && + xfs_sb_version_hascrc(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + /* initialise the on-disk inode core */ + memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_d.di_magic = XFS_DINODE_MAGIC; + ip->i_d.di_gen = prandom_u32(); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ip->i_d.di_version = 3; + ip->i_d.di_ino = ip->i_ino; + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + } else + ip->i_d.di_version = 2; + return 0; + } + + /* + * Get pointers to the on-disk inode and the buffer containing it. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); + if (error) + return error; + + /* even unallocated inodes are verified */ + if (!xfs_dinode_verify(mp, ip, dip)) { + xfs_alert(mp, "%s: validation failed for inode %lld failed", + __func__, ip->i_ino); + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + error = -EFSCORRUPTED; + goto out_brelse; + } + + /* + * If the on-disk inode is already linked to a directory + * entry, copy all of the inode into the in-core inode. + * xfs_iformat_fork() handles copying in the inode format + * specific information. + * Otherwise, just get the truly permanent information. + */ + if (dip->di_mode) { + xfs_dinode_from_disk(&ip->i_d, dip); + error = xfs_iformat_fork(ip, dip); + if (error) { +#ifdef DEBUG + xfs_alert(mp, "%s: xfs_iformat() returned error %d", + __func__, error); +#endif /* DEBUG */ + goto out_brelse; + } + } else { + /* + * Partial initialisation of the in-core inode. Just the bits + * that xfs_ialloc won't overwrite or relies on being correct. + */ + ip->i_d.di_magic = be16_to_cpu(dip->di_magic); + ip->i_d.di_version = dip->di_version; + ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + + if (dip->di_version == 3) { + ip->i_d.di_ino = be64_to_cpu(dip->di_ino); + uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); + } + + /* + * Make sure to pull in the mode here as well in + * case the inode is released without being used. + * This ensures that xfs_inactive() will see that + * the inode is already free and not try to mess + * with the uninitialized part of it. + */ + ip->i_d.di_mode = 0; + } + + /* + * Automatically convert version 1 inode formats in memory to version 2 + * inode format. If the inode is modified, it will get logged and + * rewritten as a version 2 inode. We can do this because we set the + * superblock feature bit for v2 inodes unconditionally during mount + * and it means the reast of the code can assume the inode version is 2 + * or higher. + */ + if (ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + ip->i_d.di_nlink = ip->i_d.di_onlink; + ip->i_d.di_onlink = 0; + xfs_set_projid(ip, 0); + } + + ip->i_delayed_blks = 0; + + /* + * Mark the buffer containing the inode as something to keep + * around for a while. This helps to keep recently accessed + * meta-data in-core longer. + */ + xfs_buf_set_ref(bp, XFS_INO_REF); + + /* + * Use xfs_trans_brelse() to release the buffer containing the on-disk + * inode, because it was acquired with xfs_trans_read_buf() in + * xfs_imap_to_bp() above. If tp is NULL, this is just a normal + * brelse(). If we're within a transaction, then xfs_trans_brelse() + * will only release the buffer if it is not dirty within the + * transaction. It will be OK to release the buffer in this case, + * because inodes on disk are never destroyed and we will be locking the + * new in-core inode before putting it in the cache where other + * processes can find it. Thus we don't have to worry about the inode + * being changed just because we released the buffer. + */ + out_brelse: + xfs_trans_brelse(tp, bp); + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_inode_buf.h b/kernel/fs/xfs/libxfs/xfs_inode_buf.h new file mode 100644 index 000000000..9308c47f2 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_inode_buf.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_BUF_H__ +#define __XFS_INODE_BUF_H__ + +struct xfs_inode; +struct xfs_dinode; +struct xfs_icdinode; + +/* + * Inode location information. Stored in the inode and passed to + * xfs_imap_to_bp() to get a buffer and dinode for a given inode. + */ +struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + ushort im_len; /* length in BBs of inode chunk */ + ushort im_boffset; /* inode offset in block in bytes */ +}; + +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); +void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); + +#if defined(DEBUG) +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +#else +#define xfs_inobp_check(mp, bp) +#endif /* DEBUG */ + +#endif /* __XFS_INODE_BUF_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_inode_fork.c b/kernel/fs/xfs/libxfs/xfs_inode_fork.c new file mode 100644 index 000000000..0defbd02f --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_inode_fork.c @@ -0,0 +1,1902 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_attr_sf.h" + +kmem_zone_t *xfs_ifork_zone; + +STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); +STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +#ifdef DEBUG +/* + * Make sure that the extents in the given memory buffer + * are valid. + */ +void +xfs_validate_extents( + xfs_ifork_t *ifp, + int nrecs, + xfs_exntfmt_t fmt) +{ + xfs_bmbt_irec_t irec; + xfs_bmbt_rec_host_t rec; + int i; + + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + rec.l0 = get_unaligned(&ep->l0); + rec.l1 = get_unaligned(&ep->l1); + xfs_bmbt_get_all(&rec, &irec); + if (fmt == XFS_EXTFMT_NOSTATE) + ASSERT(irec.br_state == XFS_EXT_NORM); + } +} +#else /* DEBUG */ +#define xfs_validate_extents(ifp, nrecs, fmt) +#endif /* DEBUG */ + + +/* + * Move inode type and inode format specific information from the + * on-disk inode to the in-core inode. For fifos, devs, and sockets + * this means set if_rdev to the proper value. For files, directories, + * and symlinks this means to bring in the in-line data or extent + * pointers. For a file in B-tree format, only the root is immediately + * brought in-core. The rest will be in-lined in if_extents when it + * is first referenced (see xfs_iread_extents()). + */ +int +xfs_iformat_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip) +{ + xfs_attr_shortform_t *atp; + int size; + int error = 0; + xfs_fsize_t di_size; + + if (unlikely(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks))) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", + (unsigned long long)ip->i_ino, + (int)(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents)), + (unsigned long long) + be64_to_cpu(dip->di_nblocks)); + XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { + xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", + (unsigned long long)ip->i_ino, + dip->di_forkoff); + XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { + XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + ip->i_d.di_size = 0; + ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + break; + + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (local format for regular file).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(4)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + di_size = be64_to_cpu(dip->di_size); + if (unlikely(di_size < 0 || + di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %Ld for local inode).", + (unsigned long long) ip->i_ino, + (long long) di_size); + XFS_CORRUPTION_ERROR("xfs_iformat(5)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + size = (int)di_size; + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + break; + default: + XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, + ip->i_mount); + return -EFSCORRUPTED; + } + break; + + default: + XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); + return -EFSCORRUPTED; + } + if (error) { + return error; + } + if (!XFS_DFORK_Q(dip)) + return 0; + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); + size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + error = -EFSCORRUPTED; + break; + } + if (error) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + xfs_idestroy_fork(ip, XFS_DATA_FORK); + } + return error; +} + +/* + * The file is in-lined in the on-disk inode. + * If it fits into if_inline_data, then copy + * it there, otherwise allocate a buffer for it + * and copy the data there. Either way, set + * if_data to point at the data. + * If we allocate a buffer for the data, make + * sure that its size is a multiple of 4 and + * record the real size in i_real_bytes. + */ +STATIC int +xfs_iformat_local( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork, + int size) +{ + xfs_ifork_t *ifp; + int real_size; + + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %d for local fork, size = %d).", + (unsigned long long) ip->i_ino, size, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); + XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + real_size = 0; + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) + memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFINLINE; + return 0; +} + +/* + * The file consists of a set of extents all + * of which fit into the on-disk inode. + * If there are few enough extents to fit into + * the if_inline_ext, then copy them there. + * Otherwise allocate a buffer for them and copy + * them into it. Either way, set if_extents + * to point at the extents. + */ +STATIC int +xfs_iformat_extents( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmbt_rec_t *dp; + xfs_ifork_t *ifp; + int nex; + int size; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + nex = XFS_DFORK_NEXTENTS(dip, whichfork); + size = nex * (uint)sizeof(xfs_bmbt_rec_t); + + /* + * If the number of extents is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", + (unsigned long long) ip->i_ino, nex); + XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + ifp->if_real_bytes = 0; + if (nex == 0) + ifp->if_u1.if_extents = NULL; + else if (nex <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + else + xfs_iext_add(ifp, 0, nex); + + ifp->if_bytes = size; + if (size) { + dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); + for (i = 0; i < nex; i++, dp++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); + } + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); + if (whichfork != XFS_DATA_FORK || + XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) + if (unlikely(xfs_check_nostate_extents( + ifp, 0, nex))) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + return -EFSCORRUPTED; + } + } + ifp->if_flags |= XFS_IFEXTENTS; + return 0; +} + +/* + * The file has too many extents to fit into + * the inode, so they are in B-tree format. + * Allocate a buffer for the root of the B-tree + * and copy the root into it. The i_extents + * field will remain NULL until all of the + * extents are read in (when they are needed). + */ +STATIC int +xfs_iformat_btree( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_bmdr_block_t *dfp; + xfs_ifork_t *ifp; + /* REFERENCED */ + int nrecs; + int size; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); + + /* + * blow out if -- fork has less extents than can fit in + * fork (fork shouldn't be a btree format), root btree + * block has more records than can fit into the fork, + * or the number of extents is greater than the number of + * blocks. + */ + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, mp, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, + mp, dip); + return -EFSCORRUPTED; + } + + ifp->if_broot_bytes = size; + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ASSERT(ifp->if_broot != NULL); + /* + * Copy and convert from the on-disk structure + * to the in-memory structure. + */ + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFBROOT; + + return 0; +} + +/* + * Read in extents from a btree-format inode. + * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. + */ +int +xfs_iread_extents( + xfs_trans_t *tp, + xfs_inode_t *ip, + int whichfork) +{ + int error; + xfs_ifork_t *ifp; + xfs_extnum_t nextents; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, + ip->i_mount); + return -EFSCORRUPTED; + } + nextents = XFS_IFORK_NEXTENTS(ip, whichfork); + ifp = XFS_IFORK_PTR(ip, whichfork); + + /* + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_bytes = ifp->if_real_bytes = 0; + ifp->if_flags |= XFS_IFEXTENTS; + xfs_iext_add(ifp, 0, nextents); + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + xfs_iext_destroy(ifp); + ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + return 0; +} +/* + * Reallocate the space for if_broot based on the number of records + * being added or deleted as indicated in rec_diff. Move the records + * and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by + * the caller. When growing this will create holes to be filled in + * by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then + * if we are adding records, one will be allocated. The caller must also + * not request that the number of records go below zero, although + * it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * ext_diff -- the change in the number of records, positive or negative, + * requested for the if_broot array. + */ +void +xfs_iroot_realloc( + xfs_inode_t *ip, + int rec_diff, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + int cur_max; + xfs_ifork_t *ifp; + struct xfs_btree_block *new_broot; + int new_max; + size_t new_size; + char *np; + char *op; + + /* + * Handle the degenerate case quietly. + */ + if (rec_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (rec_diff > 0) { + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (ifp->if_broot_bytes == 0) { + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot_bytes = (int)new_size; + return; + } + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), + KM_SLEEP | KM_NOFS); + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + (int)new_size); + ifp->if_broot_bytes = (int)new_size; + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); + return; + } + + /* + * rec_diff is less than 0. In this case, we are shrinking the + * if_broot buffer. It must already exist. If we go to zero + * records, just get rid of the root and clear the status bit. + */ + ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + ASSERT(new_max >= 0); + if (new_max > 0) + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + else + new_size = 0; + if (new_size > 0) { + new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + /* + * First copy over the btree block header. + */ + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); + } else { + new_broot = NULL; + ifp->if_flags &= ~XFS_IFBROOT; + } + + /* + * Only copy the records and pointers if there are any. + */ + if (new_max > 0) { + /* + * First copy the records. + */ + op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); + np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + + /* + * Then copy the pointers. + */ + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, + (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); + } + kmem_free(ifp->if_broot); + ifp->if_broot = new_broot; + ifp->if_broot_bytes = (int)new_size; + if (ifp->if_broot) + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + return; +} + + +/* + * This is called when the amount of space needed for if_data + * is increased or decreased. The change in size is indicated by + * the number of bytes that need to be added or deleted in the + * byte_diff parameter. + * + * If the amount of space needed has decreased below the size of the + * inline buffer, then switch to using the inline buffer. Otherwise, + * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * to what is needed. + * + * ip -- the inode whose if_data area is changing + * byte_diff -- the change in the number of bytes, positive or negative, + * requested for the if_data array. + */ +void +xfs_idata_realloc( + xfs_inode_t *ip, + int byte_diff, + int whichfork) +{ + xfs_ifork_t *ifp; + int new_size; + int real_size; + + if (byte_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + new_size = (int)ifp->if_bytes + byte_diff; + ASSERT(new_size >= 0); + + if (new_size == 0) { + if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + kmem_free(ifp->if_u1.if_data); + } + ifp->if_u1.if_data = NULL; + real_size = 0; + } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { + /* + * If the valid extents/data can fit in if_inline_ext/data, + * copy them from the malloc'd vector and free it. + */ + if (ifp->if_u1.if_data == NULL) { + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + ASSERT(ifp->if_real_bytes != 0); + memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, + new_size); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } + real_size = 0; + } else { + /* + * Stuck with malloc/realloc. + * For inline data, the underlying buffer must be + * a multiple of 4 bytes in size so that it can be + * logged and stay on word boundaries. We enforce + * that here. + */ + real_size = roundup(new_size, 4); + if (ifp->if_u1.if_data == NULL) { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + /* + * Only do the realloc if the underlying size + * is really changing. + */ + if (ifp->if_real_bytes != real_size) { + ifp->if_u1.if_data = + kmem_realloc(ifp->if_u1.if_data, + real_size, + ifp->if_real_bytes, + KM_SLEEP | KM_NOFS); + } + } else { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, + ifp->if_bytes); + } + } + ifp->if_real_bytes = real_size; + ifp->if_bytes = new_size; + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); +} + +void +xfs_idestroy_fork( + xfs_inode_t *ip, + int whichfork) +{ + xfs_ifork_t *ifp; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (ifp->if_broot != NULL) { + kmem_free(ifp->if_broot); + ifp->if_broot = NULL; + } + + /* + * If the format is local, then we can't have an extents + * array so just look for an inline data array. If we're + * not local then we may or may not have an extents list, + * so check and free it up if we do. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && + (ifp->if_u1.if_data != NULL)) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + ifp->if_real_bytes = 0; + } + } else if ((ifp->if_flags & XFS_IFEXTENTS) && + ((ifp->if_flags & XFS_IFEXTIREC) || + ((ifp->if_u1.if_extents != NULL) && + (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { + ASSERT(ifp->if_real_bytes != 0); + xfs_iext_destroy(ifp); + } + ASSERT(ifp->if_u1.if_extents == NULL || + ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } +} + +/* + * Convert in-core extents to on-disk form + * + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. + * + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only copy on-disk extents + * here, so callers must always use the physical fork size to determine the + * size of the buffer passed to this routine. We will return the size actually + * used. + */ +int +xfs_iextents_copy( + xfs_inode_t *ip, + xfs_bmbt_rec_t *dp, + int whichfork) +{ + int copied; + int i; + xfs_ifork_t *ifp; + int nrecs; + xfs_fsblock_t start_block; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(ifp->if_bytes > 0); + + nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); + ASSERT(nrecs > 0); + + /* + * There are some delayed allocation extents in the + * inode, so copy the extents one at a time and skip + * the delayed ones. There must be at least one + * non-delayed extent. + */ + copied = 0; + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + start_block = xfs_bmbt_get_startblock(ep); + if (isnullstartblock(start_block)) { + /* + * It's a delayed allocation extent, so skip it. + */ + continue; + } + + /* Translate to on disk format */ + put_unaligned_be64(ep->l0, &dp->l0); + put_unaligned_be64(ep->l1, &dp->l1); + dp++; + copied++; + } + ASSERT(copied != 0); + xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); + + return (copied * (uint)sizeof(xfs_bmbt_rec_t)); +} + +/* + * Each of the following cases stores data into the same region + * of the on-disk inode, so only one of them can be valid at + * any given time. While it is possible to have conflicting formats + * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is + * in EXTENTS format, this can only happen when the fork has + * changed formats after being modified but before being flushed. + * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. + */ +void +xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, + xfs_inode_log_item_t *iip, + int whichfork) +{ + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = + { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; + static const short extflag[2] = + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (!iip) + return; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, + * for the attribute fork. + */ + if (!ifp) { + ASSERT(whichfork == XFS_ATTR_FORK); + return; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + } + break; + + case XFS_DINODE_FMT_EXTENTS: + ASSERT((ifp->if_flags & XFS_IFEXTENTS) || + !(iip->ili_fields & extflag[whichfork])); + if ((iip->ili_fields & extflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(xfs_iext_get_ext(ifp, 0)); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, + whichfork); + } + break; + + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_fields & brootflag[whichfork]) && + (ifp->if_broot_bytes > 0)) { + ASSERT(ifp->if_broot != NULL); + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, + (xfs_bmdr_block_t *)cp, + XFS_DFORK_SIZE(dip, mp, whichfork)); + } + break; + + case XFS_DINODE_FMT_DEV: + if (iip->ili_fields & XFS_ILOG_DEV) { + ASSERT(whichfork == XFS_DATA_FORK); + xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + } + break; + + case XFS_DINODE_FMT_UUID: + if (iip->ili_fields & XFS_ILOG_UUID) { + ASSERT(whichfork == XFS_DATA_FORK); + memcpy(XFS_DFORK_DPTR(dip), + &ip->i_df.if_u2.if_uuid, + sizeof(uuid_t)); + } + break; + + default: + ASSERT(0); + break; + } +} + +/* + * Return a pointer to the extent record at file index idx. + */ +xfs_bmbt_rec_host_t * +xfs_iext_get_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx) /* index of target extent */ +{ + ASSERT(idx >= 0); + ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + + if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { + return ifp->if_u1.if_ext_irec->er_extbuf; + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_ext_irec_t *erp; /* irec pointer */ + int erp_idx = 0; /* irec index */ + xfs_extnum_t page_idx = idx; /* ext index in target list */ + + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + return &erp->er_extbuf[page_idx]; + } else if (ifp->if_bytes) { + return &ifp->if_u1.if_extents[idx]; + } else { + return NULL; + } +} + +/* + * Insert new item(s) into the extent records for incore inode + * fork 'ifp'. 'count' new items are inserted at index 'idx'. + */ +void +xfs_iext_insert( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t i; /* extent record index */ + + trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); + + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_iext_add(ifp, idx, count); + for (i = idx; i < idx + count; i++, new++) + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be increased. The ext_diff parameter stores the + * number of new extents being added and the idx parameter contains + * the extent index where the new extents will be added. If the new + * extents are being appended, then we just need to (re)allocate and + * initialize the space. Otherwise, if the new extents are being + * inserted into the middle of the existing entries, a bit more work + * is required to make room for the new extents to be inserted. The + * caller is responsible for filling in the new extent entries upon + * return. + */ +void +xfs_iext_add( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin adding exts */ + int ext_diff) /* number of extents to add */ +{ + int byte_diff; /* new bytes being added */ + int new_size; /* size of extents after adding */ + xfs_extnum_t nextents; /* number of extents in file */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT((idx >= 0) && (idx <= nextents)); + byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); + new_size = ifp->if_bytes + byte_diff; + /* + * If the new number of extents (nextents + ext_diff) + * fits inside the inode, then continue to use the inline + * extent buffer. + */ + if (nextents + ext_diff <= XFS_INLINE_EXTS) { + if (idx < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], + &ifp->if_u2.if_inline_ext[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); + } + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; + } + /* + * Otherwise use a linear (direct) extent list. + * If the extents are currently inside the inode, + * xfs_iext_realloc_direct will switch us from + * inline to direct extent allocation mode. + */ + else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, new_size); + if (idx < nextents) { + memmove(&ifp->if_u1.if_extents[idx + ext_diff], + &ifp->if_u1.if_extents[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); + } + } + /* Indirection array */ + else { + xfs_ext_irec_t *erp; + int erp_idx = 0; + int page_idx = idx; + + ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); + if (ifp->if_flags & XFS_IFEXTIREC) { + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); + } else { + xfs_iext_irec_init(ifp); + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = ifp->if_u1.if_ext_irec; + } + /* Extents fit in target extent page */ + if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { + if (page_idx < erp->er_extcount) { + memmove(&erp->er_extbuf[page_idx + ext_diff], + &erp->er_extbuf[page_idx], + (erp->er_extcount - page_idx) * + sizeof(xfs_bmbt_rec_t)); + memset(&erp->er_extbuf[page_idx], 0, byte_diff); + } + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + } + /* Insert a new extent page */ + else if (erp) { + xfs_iext_add_indirect_multi(ifp, + erp_idx, page_idx, ext_diff); + } + /* + * If extent(s) are being appended to the last page in + * the indirection array and the new extent(s) don't fit + * in the page, then erp is NULL and erp_idx is set to + * the next index needed in the indirection array. + */ + else { + uint count = ext_diff; + + while (count) { + erp = xfs_iext_irec_new(ifp, erp_idx); + erp->er_extcount = min(count, XFS_LINEAR_EXTS); + count -= erp->er_extcount; + if (count) + erp_idx++; + } + } + } + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being added to the indirection + * array and the new extents do not fit in the target extent list. The + * erp_idx parameter contains the irec index for the target extent list + * in the indirection array, and the idx parameter contains the extent + * index within the list. The number of extents being added is stored + * in the count parameter. + * + * |-------| |-------| + * | | | | idx - number of extents before idx + * | idx | | count | + * | | | | count - number of extents being inserted at idx + * |-------| |-------| + * | count | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_add_indirect_multi( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* target extent irec index */ + xfs_extnum_t idx, /* index within target list */ + int count) /* new extents being added */ +{ + int byte_diff; /* new bytes being added */ + xfs_ext_irec_t *erp; /* pointer to irec entry */ + xfs_extnum_t ext_diff; /* number of extents to add */ + xfs_extnum_t ext_cnt; /* new extents still needed */ + xfs_extnum_t nex2; /* extents after idx + count */ + xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ + int nlists; /* number of irec's (lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex2 = erp->er_extcount - idx; + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* + * Save second part of target extent list + * (all extents past */ + if (nex2) { + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); + memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); + erp->er_extcount -= nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); + memset(&erp->er_extbuf[idx], 0, byte_diff); + } + + /* + * Add the new extents to the end of the target + * list, then allocate new irec record(s) and + * extent buffer(s) as needed to store the rest + * of the new extents. + */ + ext_cnt = count; + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); + if (ext_diff) { + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + while (ext_cnt) { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); + erp->er_extcount = ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + + /* Add nex2 extents back to indirection array */ + if (nex2) { + xfs_extnum_t ext_avail; + int i; + + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; + i = 0; + /* + * If nex2 extents fit in the current page, append + * nex2_ep after the new extents. + */ + if (nex2 <= ext_avail) { + i = erp->er_extcount; + } + /* + * Otherwise, check if space is available in the + * next page. + */ + else if ((erp_idx < nlists - 1) && + (nex2 <= (ext_avail = XFS_LINEAR_EXTS - + ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { + erp_idx++; + erp++; + /* Create a hole for nex2 extents */ + memmove(&erp->er_extbuf[nex2], erp->er_extbuf, + erp->er_extcount * sizeof(xfs_bmbt_rec_t)); + } + /* + * Final choice, create a new extent page for + * nex2 extents. + */ + else { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + } + memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); + kmem_free(nex2_ep); + erp->er_extcount += nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); + } +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be decreased. The ext_diff parameter stores the + * number of extents to be removed and the idx parameter contains + * the extent index where the extents will be removed from. + * + * If the amount of space needed has decreased below the linear + * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous + * extent array. Otherwise, use kmem_realloc() to adjust the + * size to what is needed. + */ +void +xfs_iext_remove( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff, /* number of extents to remove */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + trace_xfs_iext_remove(ip, idx, state, _RET_IP_); + + ASSERT(ext_diff > 0); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_iext_remove_indirect(ifp, idx, ext_diff); + } else if (ifp->if_real_bytes) { + xfs_iext_remove_direct(ifp, idx, ext_diff); + } else { + xfs_iext_remove_inline(ifp, idx, ext_diff); + } + ifp->if_bytes = new_size; +} + +/* + * This removes ext_diff extents from the inline buffer, beginning + * at extent index idx. + */ +void +xfs_iext_remove_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + int nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + ASSERT(idx < XFS_INLINE_EXTS); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(((nextents - ext_diff) > 0) && + (nextents - ext_diff) < XFS_INLINE_EXTS); + + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx], + &ifp->if_u2.if_inline_ext[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + } else { + memset(&ifp->if_u2.if_inline_ext[idx], 0, + ext_diff * sizeof(xfs_bmbt_rec_t)); + } +} + +/* + * This removes ext_diff extents from a linear (direct) extent list, + * beginning at extent index idx. If the extents are being removed + * from the end of the list (ie. truncate) then we just need to re- + * allocate the list to remove the extra space. Otherwise, if the + * extents are being removed from the middle of the existing extent + * entries, then we first need to move the extent records beginning + * at idx + ext_diff up in the list to overwrite the records being + * removed, then remove the extra space via kmem_realloc. + */ +void +xfs_iext_remove_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + new_size = ifp->if_bytes - + (ext_diff * sizeof(xfs_bmbt_rec_t)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + return; + } + /* Move extents up in the list (if needed) */ + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u1.if_extents[idx], + &ifp->if_u1.if_extents[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + } + memset(&ifp->if_u1.if_extents[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + /* + * Reallocate the direct extent list. If the extents + * will fit inside the inode then xfs_iext_realloc_direct + * will switch from direct to inline extent allocation + * mode for us. + */ + xfs_iext_realloc_direct(ifp, new_size); + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being removed from the + * indirection array and the extents being removed span multiple extent + * buffers. The idx parameter contains the file extent index where we + * want to begin removing extents, and the count parameter contains + * how many extents need to be removed. + * + * |-------| |-------| + * | nex1 | | | nex1 - number of extents before idx + * |-------| | count | + * | | | | count - number of extents being removed at idx + * | count | |-------| + * | | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_remove_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing extents */ + int count) /* number of extents to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int erp_idx = 0; /* indirection array index */ + xfs_extnum_t ext_cnt; /* extents left to remove */ + xfs_extnum_t ext_diff; /* extents to remove in current list */ + xfs_extnum_t nex1; /* number of extents before idx */ + xfs_extnum_t nex2; /* extents after idx + count */ + int page_idx = idx; /* index in target extent list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + ASSERT(erp != NULL); + nex1 = page_idx; + ext_cnt = count; + while (ext_cnt) { + nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); + ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); + /* + * Check for deletion of entire list; + * xfs_iext_irec_remove() updates extent offsets. + */ + if (ext_diff == erp->er_extcount) { + xfs_iext_irec_remove(ifp, erp_idx); + ext_cnt -= ext_diff; + nex1 = 0; + if (ext_cnt) { + ASSERT(erp_idx < ifp->if_real_bytes / + XFS_IEXT_BUFSZ); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex1 = 0; + continue; + } else { + break; + } + } + /* Move extents up (if needed) */ + if (nex2) { + memmove(&erp->er_extbuf[nex1], + &erp->er_extbuf[nex1 + ext_diff], + nex2 * sizeof(xfs_bmbt_rec_t)); + } + /* Zero out rest of page */ + memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - + ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); + /* Update remaining counters */ + erp->er_extcount -= ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); + ext_cnt -= ext_diff; + nex1 = 0; + erp_idx++; + erp++; + } + ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); + xfs_iext_irec_compact(ifp); +} + +/* + * Create, destroy, or resize a linear (direct) block of extents. + */ +void +xfs_iext_realloc_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new size of extents after adding */ +{ + int rnew_size; /* real new size of extents */ + + rnew_size = new_size; + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || + ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && + (new_size != ifp->if_real_bytes))); + + /* Free extent records */ + if (new_size == 0) { + xfs_iext_destroy(ifp); + } + /* Resize direct extent list and zero any new bytes */ + else if (ifp->if_real_bytes) { + /* Check if extents will fit inside the inode */ + if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { + xfs_iext_direct_to_inline(ifp, new_size / + (uint)sizeof(xfs_bmbt_rec_t)); + ifp->if_bytes = new_size; + return; + } + if (!is_power_of_2(new_size)){ + rnew_size = roundup_pow_of_two(new_size); + } + if (rnew_size != ifp->if_real_bytes) { + ifp->if_u1.if_extents = + kmem_realloc(ifp->if_u1.if_extents, + rnew_size, + ifp->if_real_bytes, KM_NOFS); + } + if (rnew_size > ifp->if_real_bytes) { + memset(&ifp->if_u1.if_extents[ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)], 0, + rnew_size - ifp->if_real_bytes); + } + } + /* Switch from the inline extent buffer to a direct extent list */ + else { + if (!is_power_of_2(new_size)) { + rnew_size = roundup_pow_of_two(new_size); + } + xfs_iext_inline_to_direct(ifp, rnew_size); + } + ifp->if_real_bytes = rnew_size; + ifp->if_bytes = new_size; +} + +/* + * Switch from linear (direct) extent records to inline buffer. + */ +void +xfs_iext_direct_to_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t nextents) /* number of extents in file */ +{ + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(nextents <= XFS_INLINE_EXTS); + /* + * The inline buffer was zeroed when we switched + * from inline to direct extent allocation mode, + * so we don't need to clear it here. + */ + memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, + nextents * sizeof(xfs_bmbt_rec_t)); + kmem_free(ifp->if_u1.if_extents); + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; +} + +/* + * Switch from inline buffer to linear (direct) extent records. + * new_size should already be rounded up to the next power of 2 + * by the caller (when appropriate), so use new_size as it is. + * However, since new_size may be rounded up, we can't update + * if_bytes here. It is the caller's responsibility to update + * if_bytes upon return. + */ +void +xfs_iext_inline_to_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* number of extents in file */ +{ + ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); + memset(ifp->if_u1.if_extents, 0, new_size); + if (ifp->if_bytes) { + memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, + ifp->if_bytes); + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_real_bytes = new_size; +} + +/* + * Resize an extent indirection array to new_size bytes. + */ +STATIC void +xfs_iext_realloc_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new indirection array size */ +{ + int nlists; /* number of irec's (ex lists) */ + int size; /* current indirection array size */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + size = nlists * sizeof(xfs_ext_irec_t); + ASSERT(ifp->if_real_bytes); + ASSERT((new_size >= 0) && (new_size != size)); + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else { + ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) + kmem_realloc(ifp->if_u1.if_ext_irec, + new_size, size, KM_NOFS); + } +} + +/* + * Switch from indirection array to linear (direct) extent allocations. + */ +STATIC void +xfs_iext_indirect_to_direct( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + int size; /* size of file extents */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + size = nextents * sizeof(xfs_bmbt_rec_t); + + xfs_iext_irec_compact_pages(ifp); + ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); + + ep = ifp->if_u1.if_ext_irec->er_extbuf; + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; + ifp->if_u1.if_extents = ep; + ifp->if_bytes = size; + if (nextents < XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, size); + } +} + +/* + * Free incore file extents. + */ +void +xfs_iext_destroy( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + if (ifp->if_flags & XFS_IFEXTIREC) { + int erp_idx; + int nlists; + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { + xfs_iext_irec_remove(ifp, erp_idx); + } + ifp->if_flags &= ~XFS_IFEXTIREC; + } else if (ifp->if_real_bytes) { + kmem_free(ifp->if_u1.if_extents); + } else if (ifp->if_bytes) { + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_u1.if_extents = NULL; + ifp->if_real_bytes = 0; + ifp->if_bytes = 0; +} + +/* + * Return a pointer to the extent record for file system block bno. + */ +xfs_bmbt_rec_host_t * /* pointer to found extent record */ +xfs_iext_bno_to_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + xfs_extnum_t *idxp) /* index of target extent */ +{ + xfs_bmbt_rec_host_t *base; /* pointer to first extent */ + xfs_filblks_t blockcount = 0; /* number of blocks in extent */ + xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + int high; /* upper boundary in search */ + xfs_extnum_t idx = 0; /* index of target extent */ + int low; /* lower boundary in search */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_fileoff_t startoff = 0; /* start offset of extent */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *idxp = 0; + return NULL; + } + low = 0; + if (ifp->if_flags & XFS_IFEXTIREC) { + /* Find target extent list */ + int erp_idx = 0; + erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); + base = erp->er_extbuf; + high = erp->er_extcount - 1; + } else { + base = ifp->if_u1.if_extents; + high = nextents - 1; + } + /* Binary search extent records */ + while (low <= high) { + idx = (low + high) >> 1; + ep = base + idx; + startoff = xfs_bmbt_get_startoff(ep); + blockcount = xfs_bmbt_get_blockcount(ep); + if (bno < startoff) { + high = idx - 1; + } else if (bno >= startoff + blockcount) { + low = idx + 1; + } else { + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + *idxp = idx; + return ep; + } + } + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + if (bno >= startoff + blockcount) { + if (++idx == nextents) { + ep = NULL; + } else { + ep = xfs_iext_get_ext(ifp, idx); + } + } + *idxp = idx; + return ep; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record for filesystem block bno. Store the index of the + * target irec in *erp_idxp. + */ +xfs_ext_irec_t * /* pointer to found extent record */ +xfs_iext_bno_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + int *erp_idxp) /* irec index of target ext list */ +{ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + xfs_ext_irec_t *erp_next; /* next indirection array entry */ + int erp_idx; /* indirection array index */ + int nlists; /* number of extent irec's (lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; + if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { + high = erp_idx - 1; + } else if (erp_next && bno >= + xfs_bmbt_get_startoff(erp_next->er_extbuf)) { + low = erp_idx + 1; + } else { + break; + } + } + *erp_idxp = erp_idx; + return erp; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record at file extent index *idxp. Store the index of the + * target irec in *erp_idxp and store the page index of the target + * extent record in *idxp. + */ +xfs_ext_irec_t * +xfs_iext_idx_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t *idxp, /* extent index (file -> page) */ + int *erp_idxp, /* pointer to target irec */ + int realloc) /* new bytes were just added */ +{ + xfs_ext_irec_t *prev; /* pointer to previous irec */ + xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ + int erp_idx; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + xfs_extnum_t page_idx = *idxp; /* extent index in target list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + ASSERT(page_idx >= 0); + ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + + /* Binary search extent irec's */ + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + prev = erp_idx > 0 ? erp - 1 : NULL; + if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && + realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { + high = erp_idx - 1; + } else if (page_idx > erp->er_extoff + erp->er_extcount || + (page_idx == erp->er_extoff + erp->er_extcount && + !realloc)) { + low = erp_idx + 1; + } else if (page_idx == erp->er_extoff + erp->er_extcount && + erp->er_extcount == XFS_LINEAR_EXTS) { + ASSERT(realloc); + page_idx = 0; + erp_idx++; + erp = erp_idx < nlists ? erp + 1 : NULL; + break; + } else { + page_idx -= erp->er_extoff; + break; + } + } + *idxp = page_idx; + *erp_idxp = erp_idx; + return erp; +} + +/* + * Allocate and initialize an indirection array once the space needed + * for incore extents increases above XFS_IEXT_BUFSZ. + */ +void +xfs_iext_irec_init( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + + erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); + + if (nextents == 0) { + ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + } else if (!ifp->if_real_bytes) { + xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); + } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { + xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); + } + erp->er_extbuf = ifp->if_u1.if_extents; + erp->er_extcount = nextents; + erp->er_extoff = 0; + + ifp->if_flags |= XFS_IFEXTIREC; + ifp->if_real_bytes = XFS_IEXT_BUFSZ; + ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); + ifp->if_u1.if_ext_irec = erp; + + return; +} + +/* + * Allocate and initialize a new entry in the indirection array. + */ +xfs_ext_irec_t * +xfs_iext_irec_new( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* index for new irec */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* Resize indirection array */ + xfs_iext_realloc_indirect(ifp, ++nlists * + sizeof(xfs_ext_irec_t)); + /* + * Move records down in the array so the + * new page can use erp_idx. + */ + erp = ifp->if_u1.if_ext_irec; + for (i = nlists - 1; i > erp_idx; i--) { + memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); + } + ASSERT(i == erp_idx); + + /* Initialize new extent record */ + erp = ifp->if_u1.if_ext_irec; + erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; + memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); + erp[erp_idx].er_extcount = 0; + erp[erp_idx].er_extoff = erp_idx > 0 ? + erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; + return (&erp[erp_idx]); +} + +/* + * Remove a record from the indirection array. + */ +void +xfs_iext_irec_remove( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* irec index to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + if (erp->er_extbuf) { + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, + -erp->er_extcount); + kmem_free(erp->er_extbuf); + } + /* Compact extent records */ + erp = ifp->if_u1.if_ext_irec; + for (i = erp_idx; i < nlists - 1; i++) { + memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); + } + /* + * Manually free the last extent record from the indirection + * array. A call to xfs_iext_realloc_indirect() with a size + * of zero would result in a call to xfs_iext_destroy() which + * would in turn call this function again, creating a nasty + * infinite loop. + */ + if (--nlists) { + xfs_iext_realloc_indirect(ifp, + nlists * sizeof(xfs_ext_irec_t)); + } else { + kmem_free(ifp->if_u1.if_ext_irec); + } + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; +} + +/* + * This is called to clean up large amounts of unused memory allocated + * by the indirection array. Before compacting anything though, verify + * that the indirection array is still needed and switch back to the + * linear extent list (or even the inline buffer) if possible. The + * compaction policy is as follows: + * + * Full Compaction: Extents fit into a single page (or inline buffer) + * Partial Compaction: Extents occupy less than 50% of allocated space + * No Compaction: Extents occupy at least 50% of allocated space + */ +void +xfs_iext_irec_compact( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (nextents == 0) { + xfs_iext_destroy(ifp); + } else if (nextents <= XFS_INLINE_EXTS) { + xfs_iext_indirect_to_direct(ifp); + xfs_iext_direct_to_inline(ifp, nextents); + } else if (nextents <= XFS_LINEAR_EXTS) { + xfs_iext_indirect_to_direct(ifp); + } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { + xfs_iext_irec_compact_pages(ifp); + } +} + +/* + * Combine extents from neighboring extent pages. + */ +void +xfs_iext_irec_compact_pages( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ + int erp_idx = 0; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + while (erp_idx < nlists - 1) { + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp + 1; + if (erp_next->er_extcount <= + (XFS_LINEAR_EXTS - erp->er_extcount)) { + memcpy(&erp->er_extbuf[erp->er_extcount], + erp_next->er_extbuf, erp_next->er_extcount * + sizeof(xfs_bmbt_rec_t)); + erp->er_extcount += erp_next->er_extcount; + /* + * Free page before removing extent record + * so er_extoffs don't get modified in + * xfs_iext_irec_remove. + */ + kmem_free(erp_next->er_extbuf); + erp_next->er_extbuf = NULL; + xfs_iext_irec_remove(ifp, erp_idx + 1); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + } else { + erp_idx++; + } + } +} + +/* + * This is called to update the er_extoff field in the indirection + * array when extents have been added or removed from one of the + * extent lists. erp_idx contains the irec index to begin updating + * at and ext_diff contains the number of extents that were added + * or removed. + */ +void +xfs_iext_irec_update_extoffs( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* irec index to update */ + int ext_diff) /* number of new extents */ +{ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = erp_idx; i < nlists; i++) { + ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; + } +} diff --git a/kernel/fs/xfs/libxfs/xfs_inode_fork.h b/kernel/fs/xfs/libxfs/xfs_inode_fork.h new file mode 100644 index 000000000..7d3b1ed6d --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_inode_fork.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_FORK_H__ +#define __XFS_INODE_FORK_H__ + +struct xfs_inode_log_item; +struct xfs_dinode; + +/* + * The following xfs_ext_irec_t struct introduces a second (top) level + * to the in-core extent allocation scheme. These structs are allocated + * in a contiguous block, creating an indirection array where each entry + * (irec) contains a pointer to a buffer of in-core extent records which + * it manages. Each extent buffer is 4k in size, since 4k is the system + * page size on Linux i386 and systems with larger page sizes don't seem + * to gain much, if anything, by using their native page size as the + * extent buffer size. Also, using 4k extent buffers everywhere provides + * a consistent interface for CXFS across different platforms. + * + * There is currently no limit on the number of irec's (extent lists) + * allowed, so heavily fragmented files may require an indirection array + * which spans multiple system pages of memory. The number of extents + * which would require this amount of contiguous memory is very large + * and should not cause problems in the foreseeable future. However, + * if the memory needed for the contiguous array ever becomes a problem, + * it is possible that a third level of indirection may be required. + */ +typedef struct xfs_ext_irec { + xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ + xfs_extnum_t er_extoff; /* extent offset in file */ + xfs_extnum_t er_extcount; /* number of extents in page/block */ +} xfs_ext_irec_t; + +/* + * File incore extent information, present for each of data & attr forks. + */ +#define XFS_IEXT_BUFSZ 4096 +#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) +#define XFS_INLINE_EXTS 2 +#define XFS_INLINE_DATA 32 +typedef struct xfs_ifork { + int if_bytes; /* bytes in if_u1 */ + int if_real_bytes; /* bytes allocated in if_u1 */ + struct xfs_btree_block *if_broot; /* file's incore btree root */ + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ + union { + xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ + xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + char *if_data; /* inline file data */ + } if_u1; + union { + xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; + /* very small file extents */ + char if_inline_data[XFS_INLINE_DATA]; + /* very small file data */ + xfs_dev_t if_rdev; /* dev number if special */ + uuid_t if_uuid; /* mount point value */ + } if_u2; +} xfs_ifork_t; + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x01 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ + +/* + * Fork handling. + */ + +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ + XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + +int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, + struct xfs_inode_log_item *, int); +void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_iroot_realloc(struct xfs_inode *, int, int); +int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); +int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, + int); + +struct xfs_bmbt_rec_host * + xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, + struct xfs_bmbt_irec *, int); +void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_add_indirect_multi(struct xfs_ifork *, int, + xfs_extnum_t, int); +void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int); +void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_realloc_direct(struct xfs_ifork *, int); +void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_inline_to_direct(struct xfs_ifork *, int); +void xfs_iext_destroy(struct xfs_ifork *); +struct xfs_bmbt_rec_host * + xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *, + int); +void xfs_iext_irec_init(struct xfs_ifork *); +struct xfs_ext_irec * + xfs_iext_irec_new(struct xfs_ifork *, int); +void xfs_iext_irec_remove(struct xfs_ifork *, int); +void xfs_iext_irec_compact(struct xfs_ifork *); +void xfs_iext_irec_compact_pages(struct xfs_ifork *); +void xfs_iext_irec_compact_full(struct xfs_ifork *); +void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); + +extern struct kmem_zone *xfs_ifork_zone; + +#endif /* __XFS_INODE_FORK_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_log_format.h b/kernel/fs/xfs/libxfs/xfs_log_format.h new file mode 100644 index 000000000..265314690 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_log_format.h @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_FORMAT_H__ +#define __XFS_LOG_FORMAT_H__ + +struct xfs_mount; +struct xfs_trans_res; + +/* + * On-disk Log Format definitions. + * + * This file contains all the on-disk format definitions used within the log. It + * includes the physical log structure itself, as well as all the log item + * format structures that are written into the log and intepreted by log + * recovery. We start with the physical log format definitions, and then work + * through all the log items definitions and everything they encode into the + * log. + */ +typedef __uint32_t xlog_tid_t; + +#define XLOG_MIN_ICLOGS 2 +#define XLOG_MAX_ICLOGS 8 +#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ +#define XLOG_VERSION_1 1 +#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ +#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) +#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ +#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ +#define XLOG_MAX_RECORD_BSIZE (256*1024) +#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ +#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ +#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ +#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ + (log)->l_mp->m_sb.sb_logsunit) +#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) + +#define XLOG_HEADER_SIZE 512 + +/* Minimum number of transactions that must fit in the log (defined by mkfs) */ +#define XFS_MIN_LOG_FACTOR 3 + +#define XLOG_REC_SHIFT(log) \ + BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) +#define XLOG_TOTAL_REC_SHIFT(log) \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + +/* get lsn fields */ +#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) +#define BLOCK_LSN(lsn) ((uint)(lsn)) + +/* this is used in a spot where we might otherwise double-endian-flip */ +#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) + +static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) +{ + return ((xfs_lsn_t)cycle << 32) | block; +} + +static inline uint xlog_get_cycle(char *ptr) +{ + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + return be32_to_cpu(*((__be32 *)ptr + 1)); + else + return be32_to_cpu(*(__be32 *)ptr); +} + +/* Log Clients */ +#define XFS_TRANSACTION 0x69 +#define XFS_VOLUME 0x2 +#define XFS_LOG 0xaa + +#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ + +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_MAX 20 + +/* + * Flags to log operation header + * + * The first write of a new transaction will be preceded with a start + * record, XLOG_START_TRANS. Once a transaction is committed, a commit + * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into + * the remainder of the current active in-core log, it is split up into + * multiple regions. Each partial region will be marked with a + * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. + * + */ +#define XLOG_START_TRANS 0x01 /* Start a new transaction */ +#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ +#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ +#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ +#define XLOG_END_TRANS 0x10 /* End a continued transaction */ +#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ + + +typedef struct xlog_op_header { + __be32 oh_tid; /* transaction id of operation : 4 b */ + __be32 oh_len; /* bytes in data region : 4 b */ + __u8 oh_clientid; /* who sent me this : 1 b */ + __u8 oh_flags; /* : 1 b */ + __u16 oh_res2; /* 32 bit align : 2 b */ +} xlog_op_header_t; + +/* valid values for h_fmt */ +#define XLOG_FMT_UNKNOWN 0 +#define XLOG_FMT_LINUX_LE 1 +#define XLOG_FMT_LINUX_BE 2 +#define XLOG_FMT_IRIX_BE 3 + +/* our fmt */ +#ifdef XFS_NATIVE_HOST +#define XLOG_FMT XLOG_FMT_LINUX_BE +#else +#define XLOG_FMT XLOG_FMT_LINUX_LE +#endif + +typedef struct xlog_rec_header { + __be32 h_magicno; /* log record (LR) identifier : 4 */ + __be32 h_cycle; /* write cycle of log : 4 */ + __be32 h_version; /* LR version : 4 */ + __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ + __be64 h_lsn; /* lsn of this LR : 8 */ + __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + __le32 h_crc; /* crc of log record : 4 */ + __be32 h_prev_block; /* block number to previous LR : 4 */ + __be32 h_num_logops; /* number of log operations in this LR : 4 */ + __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + /* new fields */ + __be32 h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + __be32 h_size; /* iclog size : 4 */ +} xlog_rec_header_t; + +typedef struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log : 4 */ + __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ +} xlog_rec_ext_header_t; + +/* + * Quite misnamed, because this union lays out the actual on-disk log buffer. + */ +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + +/* not an on-disk structure, but needed by log recovery in userspace */ +typedef struct xfs_log_iovec { + void *i_addr; /* beginning address of region */ + int i_len; /* length in bytes of region */ + uint i_type; /* type of region */ +} xfs_log_iovec_t; + + +/* + * Transaction Header definitions. + * + * This is the structure written in the log at the head of every transaction. It + * identifies the type and id of the transaction, and contains the number of + * items logged by the transaction so we know how many to expect during + * recovery. + * + * Do not change the below structure without redoing the code in + * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). + */ +typedef struct xfs_trans_header { + uint th_magic; /* magic number */ + uint th_type; /* transaction type */ + __int32_t th_tid; /* transaction id (unused) */ + uint th_num_items; /* num items logged by trans */ +} xfs_trans_header_t; + +#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ + +/* + * Log item types. + */ +#define XFS_LI_EFI 0x1236 +#define XFS_LI_EFD 0x1237 +#define XFS_LI_IUNLINK 0x1238 +#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ +#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ +#define XFS_LI_DQUOT 0x123d +#define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_ICREATE 0x123f + +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ + { XFS_LI_ICREATE, "XFS_LI_ICREATE" } + +/* + * Inode Log Item Format definitions. + * + * This is the structure used to lay out an inode log item in the + * log. The size of the inline data/extents/b-tree root to be logged + * (if any) is indicated in the ilf_dsize field. Changes to this structure + * must be added on to the end. + */ +typedef struct xfs_inode_log_format { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_t; + +typedef struct xfs_inode_log_format_32 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} __attribute__((packed)) xfs_inode_log_format_32_t; + +typedef struct xfs_inode_log_format_64 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_64_t; + +/* + * Flags for xfs_trans_log_inode flags field. + */ +#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ +#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ +#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ +#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ +#define XFS_ILOG_DEV 0x010 /* log the dev field */ +#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ +#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ +#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ +#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ +#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ + + +/* + * The timestamps are dirty, but not necessarily anything else in the inode + * core. Unlike the other fields above this one must never make it to disk + * in the ilf_fields of the inode_log_format, but is purely store in-memory in + * ili_fields in the inode_log_item. + */ +#define XFS_ILOG_TIMESTAMP 0x4000 + +#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_UUID | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + +#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT) + +#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ + XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ + XFS_ILOG_DEV | XFS_ILOG_UUID | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + +static inline int xfs_ilog_fbroot(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); +} + +static inline int xfs_ilog_fext(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); +} + +static inline int xfs_ilog_fdata(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); +} + +/* + * Incore version of the on-disk inode core structures. We log this directly + * into the journal in host CPU format (for better or worse) and as such + * directly mirrors the xfs_dinode structure as it must contain all the same + * information. + */ +typedef struct xfs_ictimestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_ictimestamp_t; + +/* + * NOTE: This structure must be kept identical to struct xfs_dinode + * except for the endianness annotations. + */ +typedef struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_icdinode_t; + +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + +/* + * Buffer Log Format defintions + * + * These are the physical dirty bitmap defintions for the log format structure. + */ +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 +#define BIT_TO_WORD_SHIFT 5 +#define NBWORD (NBBY * sizeof(unsigned int)) + +/* + * This flag indicates that the buffer contains on disk inodes + * and requires special recovery handling. + */ +#define XFS_BLF_INODE_BUF (1<<0) + +/* + * This flag indicates that the buffer should not be replayed + * during recovery because its blocks are being freed. + */ +#define XFS_BLF_CANCEL (1<<1) + +/* + * This flag indicates that the buffer contains on disk + * user or group dquots and may require special recovery handling. + */ +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + */ +#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* used size of data bitmap in words */ + unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ +} xfs_buf_log_format_t; + +/* + * All buffers now need to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + * + * The type value is held in the upper 5 bits of the blf_flags field, which is + * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. + */ +#define XFS_BLFT_BITS 5 +#define XFS_BLFT_SHIFT 11 +#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) + +enum xfs_blft { + XFS_BLFT_UNKNOWN_BUF = 0, + XFS_BLFT_UDQUOT_BUF, + XFS_BLFT_PDQUOT_BUF, + XFS_BLFT_GDQUOT_BUF, + XFS_BLFT_BTREE_BUF, + XFS_BLFT_AGF_BUF, + XFS_BLFT_AGFL_BUF, + XFS_BLFT_AGI_BUF, + XFS_BLFT_DINO_BUF, + XFS_BLFT_SYMLINK_BUF, + XFS_BLFT_DIR_BLOCK_BUF, + XFS_BLFT_DIR_DATA_BUF, + XFS_BLFT_DIR_FREE_BUF, + XFS_BLFT_DIR_LEAF1_BUF, + XFS_BLFT_DIR_LEAFN_BUF, + XFS_BLFT_DA_NODE_BUF, + XFS_BLFT_ATTR_LEAF_BUF, + XFS_BLFT_ATTR_RMT_BUF, + XFS_BLFT_SB_BUF, + XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), +}; + +static inline void +xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) +{ + ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); + blf->blf_flags &= ~XFS_BLFT_MASK; + blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); +} + +static inline __uint16_t +xfs_blft_from_flags(struct xfs_buf_log_format *blf) +{ + return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; +} + +/* + * EFI/EFD log format definitions + */ +typedef struct xfs_extent { + xfs_fsblock_t ext_start; + xfs_extlen_t ext_len; +} xfs_extent_t; + +/* + * Since an xfs_extent_t has types (start:64, len: 32) + * there are different alignments on 32 bit and 64 bit kernels. + * So we provide the different variants for use by a + * conversion routine. + */ +typedef struct xfs_extent_32 { + __uint64_t ext_start; + __uint32_t ext_len; +} __attribute__((packed)) xfs_extent_32_t; + +typedef struct xfs_extent_64 { + __uint64_t ext_start; + __uint32_t ext_len; + __uint32_t ext_pad; +} xfs_extent_64_t; + +/* + * This is the structure used to lay out an efi log item in the + * log. The efi_extents field is a variable size array whose + * size is given by efi_nextents. + */ +typedef struct xfs_efi_log_format { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_t; + +typedef struct xfs_efi_log_format_32 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_32_t efi_extents[1]; /* array of extents to free */ +} __attribute__((packed)) xfs_efi_log_format_32_t; + +typedef struct xfs_efi_log_format_64 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_64_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_64_t; + +/* + * This is the structure used to lay out an efd log item in the + * log. The efd_extents array is a variable size array whose + * size is given by efd_nextents; + */ +typedef struct xfs_efd_log_format { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_t; + +typedef struct xfs_efd_log_format_32 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_32_t efd_extents[1]; /* array of extents freed */ +} __attribute__((packed)) xfs_efd_log_format_32_t; + +typedef struct xfs_efd_log_format_64 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_64_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_64_t; + +/* + * Dquot Log format definitions. + * + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + */ +typedef struct xfs_dq_logformat { + __uint16_t qlf_type; /* dquot log item type */ + __uint16_t qlf_size; /* size of this item */ + xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ + __int64_t qlf_blkno; /* blkno of dquot buffer */ + __int32_t qlf_len; /* len of dquot buffer */ + __uint32_t qlf_boffset; /* off of dquot in buffer */ +} xfs_dq_logformat_t; + +/* + * log format struct for QUOTAOFF records. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer + * to the first and ensures that the first logitem is taken out of the AIL + * only when the last one is securely committed. + */ +typedef struct xfs_qoff_logformat { + unsigned short qf_type; /* quotaoff log item type */ + unsigned short qf_size; /* size of this item */ + unsigned int qf_flags; /* USR and/or GRP */ + char qf_pad[12]; /* padding for future */ +} xfs_qoff_logformat_t; + +/* + * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. + */ +#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ +#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ +#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ +#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ +#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ +#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ + +/* + * Conversion to and from the combined OQUOTA flag (if necessary) + * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() + */ +#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ +#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ +#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ +#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ + +#define XFS_ALL_QUOTA_ACCT \ + (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD \ + (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD \ + (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ + XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ + XFS_PQUOTA_CHKD) + +/* + * Inode create log item structure + * + * Log recovery assumes the first two entries are the type and size and they fit + * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so + * decoding can be done correctly. + */ +struct xfs_icreate_log { + __uint16_t icl_type; /* type of log format structure */ + __uint16_t icl_size; /* size of log format structure */ + __be32 icl_ag; /* ag being allocated in */ + __be32 icl_agbno; /* start block of inode range */ + __be32 icl_count; /* number of inodes to initialise */ + __be32 icl_isize; /* size of inodes */ + __be32 icl_length; /* length of extent to initialise */ + __be32 icl_gen; /* inode generation number to use */ +}; + +#endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_log_recover.h b/kernel/fs/xfs/libxfs/xfs_log_recover.h new file mode 100644 index 000000000..1c55ccbb3 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_log_recover.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_RECOVER_H__ +#define __XFS_LOG_RECOVER_H__ + +/* + * Macros, structures, prototypes for internal log manager use. + */ + +#define XLOG_RHASH_BITS 4 +#define XLOG_RHASH_SIZE 16 +#define XLOG_RHASH_SHIFT 2 +#define XLOG_RHASH(tid) \ + ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) + +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) + + +/* + * item headers are in ri_buf[0]. Additional buffers follow. + */ +typedef struct xlog_recover_item { + struct list_head ri_list; + int ri_type; + int ri_cnt; /* count of regions found */ + int ri_total; /* total regions */ + xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ +} xlog_recover_item_t; + +struct xlog_tid; +typedef struct xlog_recover { + struct hlist_node r_list; + xlog_tid_t r_log_tid; /* log's transaction id */ + xfs_trans_header_t r_theader; /* trans header for partial */ + int r_state; /* not needed */ + xfs_lsn_t r_lsn; /* xact lsn */ + struct list_head r_itemq; /* q for items */ +} xlog_recover_t; + +#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) + +/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_RECOVER_PASS1 1 +#define XLOG_RECOVER_PASS2 2 + +#endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_log_rlimit.c b/kernel/fs/xfs/libxfs/xfs_log_rlimit.c new file mode 100644 index 000000000..c10597973 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_log_rlimit.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2013 Jie Liu. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_trans_space.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" +#include "xfs_bmap_btree.h" + +/* + * Calculate the maximum length in bytes that would be required for a local + * attribute value as large attributes out of line are not logged. + */ +STATIC int +xfs_log_calc_max_attrsetm_res( + struct xfs_mount *mp) +{ + int size; + int nblks; + + size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) - + MAXNAMELEN - 1; + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + nblks += XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); + + return M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * nblks; +} + +/* + * Iterate over the log space reservation table to figure out and return + * the maximum one in terms of the pre-calculated values which were done + * at mount time. + */ +STATIC void +xfs_log_get_max_trans_res( + struct xfs_mount *mp, + struct xfs_trans_res *max_resp) +{ + struct xfs_trans_res *resp; + struct xfs_trans_res *end_resp; + int log_space = 0; + int attr_space; + + attr_space = xfs_log_calc_max_attrsetm_res(mp); + + resp = (struct xfs_trans_res *)M_RES(mp); + end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); + for (; resp < end_resp; resp++) { + int tmp = resp->tr_logcount > 1 ? + resp->tr_logres * resp->tr_logcount : + resp->tr_logres; + if (log_space < tmp) { + log_space = tmp; + *max_resp = *resp; /* struct copy */ + } + } + + if (attr_space > log_space) { + *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + max_resp->tr_logres = attr_space; + } +} + +/* + * Calculate the minimum valid log size for the given superblock configuration. + * Used to calculate the minimum log size at mkfs time, and to determine if + * the log is large enough or not at mount time. Returns the minimum size in + * filesystem block size units. + */ +int +xfs_log_calc_minimum_size( + struct xfs_mount *mp) +{ + struct xfs_trans_res tres = {0}; + int max_logres; + int min_logblks = 0; + int lsunit = 0; + + xfs_log_get_max_trans_res(mp, &tres); + + max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres); + if (tres.tr_logcount > 1) + max_logres *= tres.tr_logcount; + + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + lsunit = BTOBB(mp->m_sb.sb_logsunit); + + /* + * Two factors should be taken into account for calculating the minimum + * log space. + * 1) The fundamental limitation is that no single transaction can be + * larger than half size of the log. + * + * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR + * define, which is set to 3. That means we can definitely fit + * maximally sized 2 transactions in the log. We'll use this same + * value here. + * + * 2) If the lsunit option is specified, a transaction requires 2 LSU + * for the reservation because there are two log writes that can + * require padding - the transaction data and the commit record which + * are written separately and both can require padding to the LSU. + * Consider that we can have an active CIL reservation holding 2*LSU, + * but the CIL is not over a push threshold, in this case, if we + * don't have enough log space for at one new transaction, which + * includes another 2*LSU in the reservation, we will run into dead + * loop situation in log space grant procedure. i.e. + * xlog_grant_head_wait(). + * + * Hence the log size needs to be able to contain two maximally sized + * and padded transactions, which is (2 * (2 * LSU + maxlres)). + * + * Also, the log size should be a multiple of the log stripe unit, round + * it up to lsunit boundary if lsunit is specified. + */ + if (lsunit) { + min_logblks = roundup_64(BTOBB(max_logres), lsunit) + + 2 * lsunit; + } else + min_logblks = BTOBB(max_logres) + 2 * BBSIZE; + min_logblks *= XFS_MIN_LOG_FACTOR; + + return XFS_BB_TO_FSB(mp, min_logblks); +} diff --git a/kernel/fs/xfs/libxfs/xfs_quota_defs.h b/kernel/fs/xfs/libxfs/xfs_quota_defs.h new file mode 100644 index 000000000..1b0a08379 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_quota_defs.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_DEFS_H__ +#define __XFS_QUOTA_DEFS_H__ + +/* + * Quota definitions shared between user and kernel source trees. + */ + +/* + * Even though users may not have quota limits occupying all 64-bits, + * they may need 64-bit accounting. Hence, 64-bit quota-counters, + * and quota-limits. This is a waste in the common case, but hey ... + */ +typedef __uint64_t xfs_qcnt_t; +typedef __uint16_t xfs_qwarncnt_t; + +/* + * flags for q_flags field in the dquot. + */ +#define XFS_DQ_USER 0x0001 /* a user quota */ +#define XFS_DQ_PROJ 0x0002 /* project quota */ +#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ + +#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) + +#define XFS_DQ_FLAGS \ + { XFS_DQ_USER, "USER" }, \ + { XFS_DQ_PROJ, "PROJ" }, \ + { XFS_DQ_GROUP, "GROUP" }, \ + { XFS_DQ_DIRTY, "DIRTY" }, \ + { XFS_DQ_FREEING, "FREEING" } + +/* + * We have the possibility of all three quota types being active at once, and + * hence free space modification requires modification of all three current + * dquots in a single transaction. For this case we need to have a reservation + * of at least 3 dquots. + * + * However, a chmod operation can change both UID and GID in a single + * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be + * modified. Hence for this case we need to reserve space for at least 4 dquots. + * + * And in the worst case, there's a rename operation that can be modifying up to + * 4 inodes with dquots attached to them. In reality, the only inodes that can + * have their dquots modified are the source and destination directory inodes + * due to directory name creation and removal. That can require space allocation + * and/or freeing on both directory inodes, and hence all three dquots on each + * inode can be modified. And if the directories are world writeable, all the + * dquots can be unique and so 6 dquots can be modified.... + * + * And, of course, we also need to take into account the dquot log format item + * used to describe each dquot. + */ +#define XFS_DQUOT_LOGRES(mp) \ + ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) + +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) +#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) + +/* + * Incore only flags for quotaoff - these bits get cleared when quota(s) + * are in the process of getting turned off. These flags are in m_qflags but + * never in sb_qflags. + */ +#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ +#define XFS_ALL_QUOTA_ACTIVE \ + (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) + +/* + * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees + * quota will be not be switched off as long as that inode lock is held. + */ +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) + +/* + * Flags to tell various functions what to do. Not all of these are meaningful + * to a single function. None of these XFS_QMOPT_* flags are meant to have + * persistent values (ie. their values can and will change between versions) + */ +#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ +#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ +#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ +#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ + +/* + * flags to xfs_trans_mod_dquot to indicate which field needs to be + * modified. + */ +#define XFS_QMOPT_RES_REGBLKS 0x0010000 +#define XFS_QMOPT_RES_RTBLKS 0x0020000 +#define XFS_QMOPT_BCOUNT 0x0040000 +#define XFS_QMOPT_ICOUNT 0x0080000 +#define XFS_QMOPT_RTBCOUNT 0x0100000 +#define XFS_QMOPT_DELBCOUNT 0x0200000 +#define XFS_QMOPT_DELRTBCOUNT 0x0400000 +#define XFS_QMOPT_RES_INOS 0x0800000 + +/* + * flags for dqalloc. + */ +#define XFS_QMOPT_INHERIT 0x1000000 + +/* + * flags to xfs_trans_mod_dquot. + */ +#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS +#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS +#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS +#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT +#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT +#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT +#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT +#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT + + +#define XFS_QMOPT_QUOTALL \ + (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + +extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, + xfs_dqid_t id, uint type, uint flags, char *str); +extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); + +#endif /* __XFS_QUOTA_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_rtbitmap.c b/kernel/fs/xfs/libxfs/xfs_rtbitmap.c new file mode 100644 index 000000000..9b59ffa1f --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_rtbitmap.c @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_buf.h" +#include "xfs_icache.h" +#include "xfs_rtalloc.h" + + +/* + * Realtime allocator bitmap functions shared with userspace. + */ + +/* + * Get a buffer for the bitmap or summary file block specified. + * The buffer is returned read and locked. + */ +int +xfs_rtbuf_get( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t block, /* block number in bitmap or summary */ + int issum, /* is summary not bitmap */ + xfs_buf_t **bpp) /* output: buffer for the block */ +{ + xfs_buf_t *bp; /* block buffer, result */ + xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap = 1; + int error; /* error value */ + + ip = issum ? mp->m_rsumip : mp->m_rbmip; + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) + return error; + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp, NULL); + if (error) + return error; + *bpp = bp; + return 0; +} + +/* + * Searching backward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_back( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t firstbit; /* first useful bit in the word */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = start - limit + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit < XFS_NBWORD - 1) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << + firstbit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = bit - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i = bit - firstbit + 1; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the previous one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if (len - i) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_NBWORD - (len - i); + mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start - i + 1; + return 0; +} + +/* + * Searching forward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_forw( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in the word */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = limit - start + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit) { + /* + * Calculate last (rightmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *rtblock = start + i - 1; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the previous word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Calculate mask for all the relevant bits in this word. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start + i - 1; + return 0; +} + +/* + * Read and/or modify the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + * + * Summary information is returned in *sum if specified. + * If no delta is specified, returns summary only. + */ +int +xfs_rtmodify_summary_int( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + xfs_buf_t *bp; /* buffer for the summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (*rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (*rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + *rbpp = bp; + *rsb = sb; + } + /* + * Point to the summary information, modify/log it, and/or copy it out. + */ + sp = XFS_SUMPTR(mp, bp, so); + if (delta) { + uint first = (uint)((char *)sp - (char *)bp->b_addr); + + *sp += delta; + xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); + } + if (sum) + *sum = *sp; + return 0; +} + +int +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + return xfs_rtmodify_summary_int(mp, tp, log, bbno, + delta, rbpp, rsb, NULL); +} + +/* + * Set the given range of bitmap bits to the given value. + * Do whatever I/O and logging is required. + */ +int +xfs_rtmodify_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to modify */ + xfs_extlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtword_t *first; /* first used word in the buffer */ + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask o frelevant bits for value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number. + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block, and point to its data. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + first = b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zeroes; 1 (free) => all ones. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not changed and mask of relevant bits. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + i = lastbit - bit; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Set the word value correctly. + */ + *b = val; + i += XFS_NBWORD; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Compute a mask of relevant bits. + */ + bit = 0; + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + b++; + } + /* + * Log any remaining changed bytes. + */ + if (b > first) + xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp - 1)); + return 0; +} + +/* + * Mark an extent specified by start and len freed. + * Updates all the summary information as well as the bitmap. + */ +int +xfs_rtfree_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to free */ + xfs_extlen_t len, /* length to free */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block freed > end */ + xfs_rtblock_t preblock; /* first block freed < start */ + + end = start + len - 1; + /* + * Modify the bitmap to mark this extent freed. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 1); + if (error) { + return error; + } + /* + * Assume we're freeing out of the middle of an allocated extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of allocated extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) + return error; + /* + * If there are blocks not being freed at the front of the + * old extent, add summary data for them to be allocated. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being freed at the end of the + * old extent, add summary data for them to be allocated. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Increment the summary information corresponding to the entire + * (new) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + return error; +} + +/* + * Check that the given range is either all allocated (val = 0) or + * all free (val = 1). + */ +int +xfs_rtcheck_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtblock_t *new, /* out: first block not matching */ + int *stat) /* out: 1 for matches, 0 for not */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zero's; 1 (free) => all one's. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not examined. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + /* + * Mask of relevant bits. + */ + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *new = start + i; + *stat = 0; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ val)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Mask of relevant bits. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } else + i = len; + } + /* + * Successful, return. + */ + xfs_trans_brelse(tp, bp); + *new = start + i; + *stat = 1; + return 0; +} + +#ifdef DEBUG +/* + * Check that the given extent (block range) is allocated already. + */ +STATIC int /* error */ +xfs_rtcheck_alloc_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + int stat; + int error; + + error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat); + if (error) + return error; + ASSERT(stat); + return 0; +} +#else +#define xfs_rtcheck_alloc_range(m,t,b,l) (0) +#endif +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len) /* length of extent freed */ +{ + int error; /* error value */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp = NULL; /* summary file block buffer */ + + mp = tp->t_mountp; + + ASSERT(mp->m_rbmip->i_itemp != NULL); + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + error = xfs_rtcheck_alloc_range(mp, tp, bno, len); + if (error) + return error; + + /* + * Free the range of realtime blocks. + */ + error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); + if (error) { + return error; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* + * If we've now freed all the blocks, reset the file sequence + * number to 0. + */ + if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + mp->m_sb.sb_rextents) { + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + } + return 0; +} + diff --git a/kernel/fs/xfs/libxfs/xfs_sb.c b/kernel/fs/xfs/libxfs/xfs_sb.c new file mode 100644 index 000000000..dc4bfc5d8 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_sb.c @@ -0,0 +1,803 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" + +/* + * Physical superblock buffer manipulations. Shared with libxfs in userspace. + */ + +/* + * Reference counting access wrappers to the perag structures. + * Because we never free per-ag structures, the only thing we + * have to protect against changes is the tree structure itself. + */ +struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put( + struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* + * Check the validity of the SB found. + */ +STATIC int +xfs_mount_validate_sb( + xfs_mount_t *mp, + xfs_sb_t *sbp, + bool check_inprogress, + bool check_version) +{ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + xfs_warn(mp, "bad magic number"); + return -EWRONGFS; + } + + + if (!xfs_sb_good_version(sbp)) { + xfs_warn(mp, "bad version"); + return -EWRONGFS; + } + + /* + * Version 5 superblock feature mask validation. Reject combinations the + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. + */ + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + if (xfs_sb_has_compat_feature(sbp, + XFS_SB_FEAT_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown compatible features (0x%x) enabled.\n" +"Using a more recent kernel is recommended.", + (sbp->sb_features_compat & + XFS_SB_FEAT_COMPAT_UNKNOWN)); + } + + if (xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, +"Superblock has unknown read-only compatible features (0x%x) enabled.", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, +"Attempted to mount read-only compatible filesystem read-write.\n" +"Filesystem can only be safely mounted read only."); + return -EINVAL; + } + } + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown incompatible features (0x%x) enabled.\n" +"Filesystem can not be safely mounted by this kernel.", + (sbp->sb_features_incompat & + XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + return -EINVAL; + } + } + + if (xfs_sb_version_has_pquotino(sbp)) { + if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { + xfs_notice(mp, + "Version 5 of Super block has XFS_OQUOTA bits."); + return -EFSCORRUPTED; + } + } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | + XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { + xfs_notice(mp, +"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); + return -EFSCORRUPTED; + } + + if (unlikely( + sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an external log; " + "specify logdev on the mount command line."); + return -EINVAL; + } + + if (unlikely( + sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an internal log; " + "do not specify logdev on the mount command line."); + return -EINVAL; + } + + /* + * More sanity checking. Most of these were stolen directly from + * xfs_repair. + */ + if (unlikely( + sbp->sb_agcount <= 0 || + sbp->sb_sectsize < XFS_MIN_SECTORSIZE || + sbp->sb_sectsize > XFS_MAX_SECTORSIZE || + sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || + sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || + sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || + sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || + sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || + sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || + sbp->sb_inodelog < XFS_DINODE_MIN_LOG || + sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || + sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || + (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || + (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || + (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || + sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || + sbp->sb_shared_vn != 0)) { + xfs_notice(mp, "SB sanity check failed"); + return -EFSCORRUPTED; + } + + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + xfs_warn(mp, + "File system with blocksize %d bytes. " + "Only pagesize (%ld) or less will currently work.", + sbp->sb_blocksize, PAGE_SIZE); + return -ENOSYS; + } + + /* + * Currently only very few inode sizes are supported. + */ + switch (sbp->sb_inodesize) { + case 256: + case 512: + case 1024: + case 2048: + break; + default: + xfs_warn(mp, "inode size of %d bytes not supported", + sbp->sb_inodesize); + return -ENOSYS; + } + + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { + xfs_warn(mp, + "file system too large to be mounted on this system."); + return -EFBIG; + } + + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); + return -EFSCORRUPTED; + } + return 0; +} + +void +xfs_sb_quota_from_disk(struct xfs_sb *sbp) +{ + /* + * older mkfs doesn't initialize quota inodes to NULLFSINO. This + * leads to in-core values having two different values for a quota + * inode to be invalid: 0 and NULLFSINO. Change it to a single value + * NULLFSINO. + * + * Note that this change affect only the in-core values. These + * values are not written back to disk unless any quota information + * is written to the disk. Even in that case, sb_pquotino field is + * not written to disk unless the superblock supports pquotino. + */ + if (sbp->sb_uquotino == 0) + sbp->sb_uquotino = NULLFSINO; + if (sbp->sb_gquotino == 0) + sbp->sb_gquotino = NULLFSINO; + if (sbp->sb_pquotino == 0) + sbp->sb_pquotino = NULLFSINO; + + /* + * We need to do these manipilations only if we are working + * with an older version of on-disk superblock. + */ + if (xfs_sb_version_has_pquotino(sbp)) + return; + + if (sbp->sb_qflags & XFS_OQUOTA_ENFD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; + if (sbp->sb_qflags & XFS_OQUOTA_CHKD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; + sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); + + if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + /* + * In older version of superblock, on-disk superblock only + * has sb_gquotino, and in-core superblock has both sb_gquotino + * and sb_pquotino. But, only one of them is supported at any + * point of time. So, if PQUOTA is set in disk superblock, + * copy over sb_gquotino to sb_pquotino. + */ + sbp->sb_pquotino = sbp->sb_gquotino; + sbp->sb_gquotino = NULLFSINO; + } +} + +static void +__xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from, + bool convert_xquota) +{ + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); + to->sb_blocksize = be32_to_cpu(from->sb_blocksize); + to->sb_dblocks = be64_to_cpu(from->sb_dblocks); + to->sb_rblocks = be64_to_cpu(from->sb_rblocks); + to->sb_rextents = be64_to_cpu(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = be64_to_cpu(from->sb_logstart); + to->sb_rootino = be64_to_cpu(from->sb_rootino); + to->sb_rbmino = be64_to_cpu(from->sb_rbmino); + to->sb_rsumino = be64_to_cpu(from->sb_rsumino); + to->sb_rextsize = be32_to_cpu(from->sb_rextsize); + to->sb_agblocks = be32_to_cpu(from->sb_agblocks); + to->sb_agcount = be32_to_cpu(from->sb_agcount); + to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); + to->sb_logblocks = be32_to_cpu(from->sb_logblocks); + to->sb_versionnum = be16_to_cpu(from->sb_versionnum); + to->sb_sectsize = be16_to_cpu(from->sb_sectsize); + to->sb_inodesize = be16_to_cpu(from->sb_inodesize); + to->sb_inopblock = be16_to_cpu(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = be64_to_cpu(from->sb_icount); + to->sb_ifree = be64_to_cpu(from->sb_ifree); + to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); + to->sb_frextents = be64_to_cpu(from->sb_frextents); + to->sb_uquotino = be64_to_cpu(from->sb_uquotino); + to->sb_gquotino = be64_to_cpu(from->sb_gquotino); + to->sb_qflags = be16_to_cpu(from->sb_qflags); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); + to->sb_unit = be32_to_cpu(from->sb_unit); + to->sb_width = be32_to_cpu(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); + to->sb_logsunit = be32_to_cpu(from->sb_logsunit); + to->sb_features2 = be32_to_cpu(from->sb_features2); + to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); + to->sb_features_compat = be32_to_cpu(from->sb_features_compat); + to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); + to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); + to->sb_features_log_incompat = + be32_to_cpu(from->sb_features_log_incompat); + /* crc is only used on disk, not in memory; just init to 0 here. */ + to->sb_crc = 0; + to->sb_pad = 0; + to->sb_pquotino = be64_to_cpu(from->sb_pquotino); + to->sb_lsn = be64_to_cpu(from->sb_lsn); + /* Convert on-disk flags to in-memory flags? */ + if (convert_xquota) + xfs_sb_quota_from_disk(to); +} + +void +xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from) +{ + __xfs_sb_from_disk(to, from, true); +} + +static void +xfs_sb_quota_to_disk( + struct xfs_dsb *to, + struct xfs_sb *from) +{ + __uint16_t qflags = from->sb_qflags; + + to->sb_uquotino = cpu_to_be64(from->sb_uquotino); + if (xfs_sb_version_has_pquotino(from)) { + to->sb_qflags = cpu_to_be16(from->sb_qflags); + to->sb_gquotino = cpu_to_be64(from->sb_gquotino); + to->sb_pquotino = cpu_to_be64(from->sb_pquotino); + return; + } + + /* + * The in-core version of sb_qflags do not have XFS_OQUOTA_* + * flags, whereas the on-disk version does. So, convert incore + * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. + */ + qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | + XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); + + if (from->sb_qflags & + (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) + qflags |= XFS_OQUOTA_ENFD; + if (from->sb_qflags & + (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) + qflags |= XFS_OQUOTA_CHKD; + to->sb_qflags = cpu_to_be16(qflags); + + /* + * GQUOTINO and PQUOTINO cannot be used together in versions + * of superblock that do not have pquotino. from->sb_flags + * tells us which quota is active and should be copied to + * disk. If neither are active, we should NULL the inode. + * + * In all cases, the separate pquotino must remain 0 because it + * it beyond the "end" of the valid non-pquotino superblock. + */ + if (from->sb_qflags & XFS_GQUOTA_ACCT) + to->sb_gquotino = cpu_to_be64(from->sb_gquotino); + else if (from->sb_qflags & XFS_PQUOTA_ACCT) + to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + else { + /* + * We can't rely on just the fields being logged to tell us + * that it is safe to write NULLFSINO - we should only do that + * if quotas are not actually enabled. Hence only write + * NULLFSINO if both in-core quota inodes are NULL. + */ + if (from->sb_gquotino == NULLFSINO && + from->sb_pquotino == NULLFSINO) + to->sb_gquotino = cpu_to_be64(NULLFSINO); + } + + to->sb_pquotino = 0; +} + +void +xfs_sb_to_disk( + struct xfs_dsb *to, + struct xfs_sb *from) +{ + xfs_sb_quota_to_disk(to, from); + + to->sb_magicnum = cpu_to_be32(from->sb_magicnum); + to->sb_blocksize = cpu_to_be32(from->sb_blocksize); + to->sb_dblocks = cpu_to_be64(from->sb_dblocks); + to->sb_rblocks = cpu_to_be64(from->sb_rblocks); + to->sb_rextents = cpu_to_be64(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = cpu_to_be64(from->sb_logstart); + to->sb_rootino = cpu_to_be64(from->sb_rootino); + to->sb_rbmino = cpu_to_be64(from->sb_rbmino); + to->sb_rsumino = cpu_to_be64(from->sb_rsumino); + to->sb_rextsize = cpu_to_be32(from->sb_rextsize); + to->sb_agblocks = cpu_to_be32(from->sb_agblocks); + to->sb_agcount = cpu_to_be32(from->sb_agcount); + to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks); + to->sb_logblocks = cpu_to_be32(from->sb_logblocks); + to->sb_versionnum = cpu_to_be16(from->sb_versionnum); + to->sb_sectsize = cpu_to_be16(from->sb_sectsize); + to->sb_inodesize = cpu_to_be16(from->sb_inodesize); + to->sb_inopblock = cpu_to_be16(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = cpu_to_be64(from->sb_icount); + to->sb_ifree = cpu_to_be64(from->sb_ifree); + to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks); + to->sb_frextents = cpu_to_be64(from->sb_frextents); + + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt); + to->sb_unit = cpu_to_be32(from->sb_unit); + to->sb_width = cpu_to_be32(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize); + to->sb_logsunit = cpu_to_be32(from->sb_logsunit); + + /* + * We need to ensure that bad_features2 always matches features2. + * Hence we enforce that here rather than having to remember to do it + * everywhere else that updates features2. + */ + from->sb_bad_features2 = from->sb_features2; + to->sb_features2 = cpu_to_be32(from->sb_features2); + to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); + + if (xfs_sb_version_hascrc(from)) { + to->sb_features_compat = cpu_to_be32(from->sb_features_compat); + to->sb_features_ro_compat = + cpu_to_be32(from->sb_features_ro_compat); + to->sb_features_incompat = + cpu_to_be32(from->sb_features_incompat); + to->sb_features_log_incompat = + cpu_to_be32(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_lsn = cpu_to_be64(from->sb_lsn); + } +} + +static int +xfs_sb_verify( + struct xfs_buf *bp, + bool check_version) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + + /* + * Use call variant which doesn't convert quota flags from disk + * format, because xfs_mount_validate_sb checks the on-disk flags. + */ + __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); +} + +/* + * If the superblock has the CRC feature bit set or the CRC field is non-null, + * check that the CRC is valid. We check the CRC field is non-null because a + * single bit error could clear the feature bit and unused parts of the + * superblock are supposed to be zero. Hence a non-null crc field indicates that + * we've potentially lost a feature bit and we should check it anyway. + * + * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the + * last field in V4 secondary superblocks. So for secondary superblocks, + * we are more forgiving, and ignore CRC failures if the primary doesn't + * indicate that the fs version is V5. + */ +static void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + int error; + + /* + * open code the version check to avoid needing to convert the entire + * superblock from disk order just to check the version number + */ + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && + (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == + XFS_SB_VERSION_5) || + dsb->sb_crc != 0)) { + + if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { + /* Only fail bad secondaries on a known V5 filesystem */ + if (bp->b_bn == XFS_SB_DADDR || + xfs_sb_version_hascrc(&mp->m_sb)) { + error = -EFSBADCRC; + goto out_error; + } + } + } + error = xfs_sb_verify(bp, true); + +out_error: + if (error) { + xfs_buf_ioerror(bp, error); + if (error == -EFSCORRUPTED || error == -EFSBADCRC) + xfs_verifier_error(bp); + } +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, then run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +static void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, -EWRONGFS); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + int error; + + error = xfs_sb_verify(bp, false); + if (error) { + xfs_buf_ioerror(bp, error); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +/* + * xfs_mount_common + * + * Mount initialization code establishing various mount + * fields from the superblock associated with the given + * mount structure + */ +void +xfs_sb_mount_common( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + mp->m_agfrotor = mp->m_agirotor = 0; + spin_lock_init(&mp->m_agirotor_lock); + mp->m_maxagi = mp->m_sb.sb_agcount; + mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; + mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; + mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; + mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; + mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + mp->m_blockmask = sbp->sb_blocksize - 1; + mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; + mp->m_blockwmask = mp->m_blockwsize - 1; + + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; + mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; + + mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; + mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; + + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; + mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); + mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +int +xfs_initialize_perag_data( + struct xfs_mount *mp, + xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. + */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + + /* Overwrite incore superblock counters with just-read data */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + spin_unlock(&mp->m_sb_lock); + + xfs_reinit_percpu_counters(mp); + + return 0; +} + +/* + * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock + * into the superblock buffer to be logged. It does not provide the higher + * level of locking that is needed to protect the in-core superblock from + * concurrent access. + */ +void +xfs_log_sb( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); + + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); +} + +/* + * xfs_sync_sb + * + * Sync the superblock to disk. + * + * Note that the caller is responsible for checking the frozen state of the + * filesystem. This procedure uses the non-blocking transaction allocator and + * thus will allow modifications to a frozen fs. This is required because this + * code can be called during the process of freezing where use of the high-level + * allocator would deadlock. + */ +int +xfs_sync_sb( + struct xfs_mount *mp, + bool wait) +{ + struct xfs_trans *tp; + int error; + + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_log_sb(tp); + if (wait) + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); +} diff --git a/kernel/fs/xfs/libxfs/xfs_sb.h b/kernel/fs/xfs/libxfs/xfs_sb.h new file mode 100644 index 000000000..b25bb9a34 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_sb.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SB_H__ +#define __XFS_SB_H__ + +/* + * perag get/put wrappers for ref counting + */ +extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); +extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, + int tag); +extern void xfs_perag_put(struct xfs_perag *pag); +extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); + +extern void xfs_sb_calc_crc(struct xfs_buf *bp); +extern void xfs_log_sb(struct xfs_trans *tp); +extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); +extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); +extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); +extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); +extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); + +#endif /* __XFS_SB_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_shared.h b/kernel/fs/xfs/libxfs/xfs_shared.h new file mode 100644 index 000000000..8dda4b321 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_shared.h @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SHARED_H__ +#define __XFS_SHARED_H__ + +/* + * Definitions shared between kernel and userspace that don't fit into any other + * header file that is shared with userspace. + */ +struct xfs_ifork; +struct xfs_buf; +struct xfs_buf_ops; +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; + +/* + * Buffer verifier operations are widely used, including userspace tools + */ +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_da3_node_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_sb_buf_ops; +extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +/* + * Transaction types. Used to distinguish types of buffers. These never reach + * the log. + */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +/* 17 was XFS_TRANS_WRITE_SYNC */ +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_SB_CHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_FSYNC_TS 35 +#define XFS_TRANS_GROWFSRT_ALLOC 36 +#define XFS_TRANS_GROWFSRT_ZERO 37 +#define XFS_TRANS_GROWFSRT_FREE 38 +#define XFS_TRANS_SWAPEXT 39 +#define XFS_TRANS_CHECKPOINT 40 +#define XFS_TRANS_ICREATE 41 +#define XFS_TRANS_CREATE_TMPFILE 42 +#define XFS_TRANS_TYPE_MAX 43 +/* new transaction types need to be reflected in xfs_logprint(8) */ + +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" }, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ + { XFS_TRANS_ICREATE, "ICREATE" }, \ + { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + +/* + * This structure is used to track log items associated with + * a transaction. It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). + */ +struct xfs_log_item_desc { + struct xfs_log_item *lid_item; + struct list_head lid_trans; + unsigned char lid_flags; +}; + +#define XFS_LID_DIRTY 0x1 + +/* log size calculation functions */ +int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); +int xfs_log_calc_minimum_size(struct xfs_mount *); + + +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + +/* + * Here we centralize the specification of XFS meta-data buffer reference count + * values. This determines how hard the buffer cache tries to hold onto the + * buffer. + */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_DQUOT_REF 1 + +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ + + +/* + * Symlink decoding/encoding functions + */ +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +#endif /* __XFS_SHARED_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_symlink_remote.c b/kernel/fs/xfs/libxfs/xfs_symlink_remote.c new file mode 100644 index 000000000..e7e26bd64 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_symlink_remote.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" + + +/* + * Each contiguous block has a header, so it is not just a simple pathlen + * to FSB conversion. + */ +int +xfs_symlink_blocks( + struct xfs_mount *mp, + int pathlen) +{ + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + + return (pathlen + buflen - 1) / buflen; +} + +int +xfs_symlink_hdr_set( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); + dsl->sl_offset = cpu_to_be32(offset); + dsl->sl_bytes = cpu_to_be32(size); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + dsl->sl_owner = cpu_to_be64(ino); + dsl->sl_blkno = cpu_to_be64(bp->b_bn); + bp->b_ops = &xfs_symlink_buf_ops; + + return sizeof(struct xfs_dsymlink_hdr); +} + +/* + * Checking of the symlink header is split into two parts. the verifier does + * CRC, location and bounds checking, the unpacking function checks the path + * parameters and owner. + */ +bool +xfs_symlink_hdr_ok( + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (offset != be32_to_cpu(dsl->sl_offset)) + return false; + if (size != be32_to_cpu(dsl->sl_bytes)) + return false; + if (ino != be64_to_cpu(dsl->sl_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_symlink_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + return false; + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + return false; + if (be32_to_cpu(dsl->sl_offset) + + be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + return false; + if (dsl->sl_owner == 0) + return false; + + return true; +} + +static void +xfs_symlink_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_symlink_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_symlink_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_symlink_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (bip) { + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF); +} + +const struct xfs_buf_ops xfs_symlink_buf_ops = { + .verify_read = xfs_symlink_read_verify, + .verify_write = xfs_symlink_write_verify, +}; + +void +xfs_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + struct xfs_mount *mp = ip->i_mount; + char *buf; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + return; + } + + /* + * As this symlink fits in an inode literal area, it must also fit in + * the smallest buffer the filesystem supports. + */ + ASSERT(BBTOB(bp->b_length) >= + ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); + + bp->b_ops = &xfs_symlink_buf_ops; + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); + memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); +} diff --git a/kernel/fs/xfs/libxfs/xfs_trans_resv.c b/kernel/fs/xfs/libxfs/xfs_trans_resv.c new file mode 100644 index 000000000..68cb1e7bf --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_trans_resv.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" + +/* + * A buffer has a format structure overhead in the log in addition + * to the data, so we need to take this into account when reserving + * space in a transaction for a buffer. Round the space required up + * to a multiple of 128 bytes so that we don't change the historical + * reservation that has been used for this overhead. + */ +STATIC uint +xfs_buf_log_overhead(void) +{ + return round_up(sizeof(struct xlog_op_header) + + sizeof(struct xfs_buf_log_format), 128); +} + +/* + * Calculate out transaction log reservation per item in bytes. + * + * The nbufs argument is used to indicate the number of items that + * will be changed in a transaction. size is used to tell how many + * bytes should be reserved per item. + */ +STATIC uint +xfs_calc_buf_res( + uint nbufs, + uint size) +{ + return nbufs * (size + xfs_buf_log_overhead()); +} + +/* + * Logging inodes is really tricksy. They are logged in memory format, + * which means that what we write into the log doesn't directly translate into + * the amount of space they use on disk. + * + * Case in point - btree format forks in memory format use more space than the + * on-disk format. In memory, the buffer contains a normal btree block header so + * the btree code can treat it as though it is just another generic buffer. + * However, when we write it to the inode fork, we don't write all of this + * header as it isn't needed. e.g. the root is only ever in the inode, so + * there's no need for sibling pointers which would waste 16 bytes of space. + * + * Hence when we have an inode with a maximally sized btree format fork, then + * amount of information we actually log is greater than the size of the inode + * on disk. Hence we need an inode reservation function that calculates all this + * correctly. So, we log: + * + * - 4 log op headers for object + * - for the ilf, the inode core and 2 forks + * - inode log format object + * - the inode core + * - two inode forks containing bmap btree root blocks. + * - the btree data contained by both forks will fit into the inode size, + * hence when combined with the inode core above, we have a total of the + * actual inode size. + * - the BMBT headers need to be accounted separately, as they are + * additional to the records and pointers that fit inside the inode + * forks. + */ +STATIC uint +xfs_calc_inode_res( + struct xfs_mount *mp, + uint ninodes) +{ + return ninodes * + (4 * sizeof(struct xlog_op_header) + + sizeof(struct xfs_inode_log_format) + + mp->m_sb.sb_inodesize + + 2 * XFS_BMBT_BLOCK_LEN(mp)); +} + +/* + * The free inode btree is a conditional feature and the log reservation + * requirements differ slightly from that of the traditional inode allocation + * btree. The finobt tracks records for inode chunks with at least one free + * inode. A record can be removed from the tree for an inode allocation + * or free and thus the finobt reservation is unconditional across: + * + * - inode allocation + * - inode free + * - inode chunk allocation + * + * The 'modify' param indicates to include the record modification scenario. The + * 'alloc' param indicates to include the reservation for free space btree + * modifications on behalf of finobt modifications. This is required only for + * transactions that do not already account for free space btree modifications. + * + * the free inode btree: max depth * block size + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the free inode btree entry: block size + */ +STATIC uint +xfs_calc_finobt_res( + struct xfs_mount *mp, + int alloc, + int modify) +{ + uint res; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); + if (alloc) + res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); + if (modify) + res += (uint)XFS_FSB_TO_B(mp, 1); + + return res; +} + +/* + * Various log reservation values. + * + * These are based on the size of the file system block because that is what + * most transactions manipulate. Each adds in an additional 128 bytes per + * item logged to try to account for the overhead of the transaction mechanism. + * + * Note: Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() call. + * This is because the number in the worst case is quite high and quite + * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * extents in only a single AG at a time. This will require changes to the + * EFI code as well, however, so that the EFI for the extents not freed is + * logged again in each transaction. See SGI PV #261917. + * + * Reservation functions here avoid a huge stack in xfs_trans_init due to + * register overflow from temporaries in the calculations. + */ + + +/* + * In a write transaction we can allocate a maximum of 2 + * extents. This gives: + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_write_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * In truncating a file we free up to two extents at once. We can modify: + * the inode being truncated: inode size + * the inode's bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_itruncate_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(5, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + + mp->m_in_maxlevels, 0))); +} + +/* + * In renaming a files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_rename_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 4) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For removing an inode from unlinked list at first, we can modify: + * the agi hash list and counters: sector size + * the on disk inode before ours in the agi hash list: inode cluster size + */ +STATIC uint +xfs_calc_iunlink_remove_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); +} + +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_link_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_remove_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For adding an inode to unlinked list we can modify: + * the agi hash list: sector size + * the unlinked inode: inode size + */ +STATIC uint +xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 1); +} + +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_remove_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_add_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For create, break it in to the two cases that the transaction + * covers. We start with the modify case - allocation done by modification + * of the state of existing inodes - and the allocation case. + */ + +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * the finobt (record modification and allocation btrees) + */ +STATIC uint +xfs_calc_create_resv_modify( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 1, 1); +} + +/* + * For create we can allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: mp->m_ialloc_blks * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_create_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +STATIC uint +__xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX(xfs_calc_create_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +/* + * For icreate we can allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion) + */ +STATIC uint +xfs_calc_icreate_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 0); +} + +STATIC uint +xfs_calc_icreate_reservation(xfs_mount_t *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX(xfs_calc_icreate_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +STATIC uint +xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return xfs_calc_icreate_reservation(mp); + return __xfs_calc_create_reservation(mp); + +} + +STATIC uint +xfs_calc_create_tmpfile_reservation( + struct xfs_mount *mp) +{ + uint res = XFS_DQUOT_LOGRES(mp); + + if (xfs_sb_version_hascrc(&mp->m_sb)) + res += xfs_calc_icreate_resv_alloc(mp); + else + res += xfs_calc_create_resv_alloc(mp); + + return res + xfs_calc_iunlink_add_reservation(mp); +} + +/* + * Making a new directory is the same as creating a new file. + */ +STATIC uint +xfs_calc_mkdir_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp); +} + + +/* + * Making a new symplink is the same as creating a new file, but + * with the added blocks for remote symlink data which can be up to 1kB in + * length (MAXPATHLEN). + */ +STATIC uint +xfs_calc_symlink_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp) + + xfs_calc_buf_res(1, MAXPATHLEN); +} + +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion, removal or modification) + */ +STATIC uint +xfs_calc_ifree_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_iunlink_remove_reservation(mp) + + xfs_calc_buf_res(1, 0) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + + mp->m_in_maxlevels, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 1); +} + +/* + * When only changing the inode we log the inode and possibly the superblock + * We also add a bit of slop for the transaction stuff. + */ +STATIC uint +xfs_calc_ichange_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); + +} + +/* + * Growing the data section of the filesystem. + * superblock + * agi and agf + * allocation btrees + */ +STATIC uint +xfs_calc_growdata_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. + * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ +STATIC uint +xfs_calc_growrtalloc_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ +STATIC uint +xfs_calc_growrtzero_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); +} + +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. + * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ +STATIC uint +xfs_calc_growrtfree_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(1, mp->m_rsumsize); +} + +/* + * Logging the inode modification timestamp on a synchronous write. + * inode + */ +STATIC uint +xfs_calc_swrite_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 1); +} + +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ +STATIC uint +xfs_calc_writeid_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 1); +} + +/* + * Converting the inode from non-attributed to attributed. + * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ +STATIC uint +xfs_calc_addafork_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode's bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrinval_reservation( + struct xfs_mount *mp) +{ + return MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Setting an attribute at mount time. + * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime(see + * below). + */ +STATIC uint +xfs_calc_attrsetm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); +} + +/* + * Setting an attribute at runtime, transaction space unit per block. + * the superblock for allocations: sector size + * the inode bmap btree could join or split: max depth * block size + * Since the runtime attribute transaction space is dependent on the total + * blocks needed for the 1st bmap, here we calculate out the space unit for + * one block so that the caller could figure out the total space according + * to the attibute extent length in blocks by: + * ext * M_RES(mp)->tr_attrsetrt.tr_logres + */ +STATIC uint +xfs_calc_attrsetrt_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Removing an attribute. + * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, + XFS_FSB_TO_B(mp, 1)) + + (uint)XFS_FSB_TO_B(mp, + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Clearing a bad agino number in an agi hash bucket. + */ +STATIC uint +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + * the write transaction log space for quota file extent allocation + * the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp) + + xfs_calc_buf_res(1, + XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2 + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. + * the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. + */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); + resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); + resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); + resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); + resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create.tr_logres = xfs_calc_create_reservation(mp); + resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create_tmpfile.tr_logres = + xfs_calc_create_tmpfile_reservation(mp); + resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; + resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); + resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; + resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp); + resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT; + resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp); + resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT; + resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp); + resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT; + resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp); + resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT; + resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp); + resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; + resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + /* + * The following transactions are logged in logical format with + * a default log count. + */ + resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); + resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_equotaoff.tr_logres = + xfs_calc_qm_quotaoff_end_reservation(mp); + resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); + resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + /* The following transaction are logged in logical format */ + resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); + resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); + resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); + resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); + resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); + resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp); + resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); + resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); +} diff --git a/kernel/fs/xfs/libxfs/xfs_trans_resv.h b/kernel/fs/xfs/libxfs/xfs_trans_resv.h new file mode 100644 index 000000000..2d5bdfce6 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_trans_resv.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_RESV_H__ +#define __XFS_TRANS_RESV_H__ + +struct xfs_mount; + +/* + * structure for maintaining pre-calculated transaction reservations. + */ +struct xfs_trans_res { + uint tr_logres; /* log space unit in bytes per log ticket */ + int tr_logcount; /* number of log operations per log ticket */ + int tr_logflags; /* log flags, currently only used for indicating + * a reservation request is permanent or not */ +}; + +struct xfs_trans_resv { + struct xfs_trans_res tr_write; /* extent alloc trans */ + struct xfs_trans_res tr_itruncate; /* truncate trans */ + struct xfs_trans_res tr_rename; /* rename trans */ + struct xfs_trans_res tr_link; /* link trans */ + struct xfs_trans_res tr_remove; /* unlink trans */ + struct xfs_trans_res tr_symlink; /* symlink trans */ + struct xfs_trans_res tr_create; /* create trans */ + struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */ + struct xfs_trans_res tr_mkdir; /* mkdir trans */ + struct xfs_trans_res tr_ifree; /* inode free trans */ + struct xfs_trans_res tr_ichange; /* inode update trans */ + struct xfs_trans_res tr_growdata; /* fs data section grow trans */ + struct xfs_trans_res tr_addafork; /* add inode attr fork trans */ + struct xfs_trans_res tr_writeid; /* write setuid/setgid file */ + struct xfs_trans_res tr_attrinval; /* attr fork buffer + * invalidation */ + struct xfs_trans_res tr_attrsetm; /* set/create an attribute at + * mount time */ + struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at + * runtime */ + struct xfs_trans_res tr_attrrm; /* remove an attribute */ + struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */ + struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */ + struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */ + struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ + struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ + struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ + struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ + struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ + struct xfs_trans_res tr_sb; /* modify superblock */ + struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ +}; + +/* shorthand way of accessing reservation structure */ +#define M_RES(mp) (&(mp)->m_resv) + +/* + * Per-extent log reservation for the allocation btree changes + * involved in freeing or allocating an extent. + * 2 trees * (2 blocks/level * max depth - 1) * block size + */ +#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) +#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ + ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + +/* + * Per-directory log reservation for any directory change. + * dir blocks: (1 btree block per level + data block + free block) * dblock size + * bmap btree: (levels + 2) * max depth * block size + * v2 directory blocks can be fragmented below the dirblksize down to the fsb + * size, so account for that in the DAENTER macros. + */ +#define XFS_DIROP_LOG_RES(mp) \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) +#define XFS_DIROP_LOG_COUNT(mp) \ + (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ + XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) + +/* + * Various log count values. + */ +#define XFS_DEFAULT_LOG_COUNT 1 +#define XFS_DEFAULT_PERM_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_INACTIVE_LOG_COUNT 2 +#define XFS_CREATE_LOG_COUNT 2 +#define XFS_CREATE_TMPFILE_LOG_COUNT 2 +#define XFS_MKDIR_LOG_COUNT 3 +#define XFS_SYMLINK_LOG_COUNT 3 +#define XFS_REMOVE_LOG_COUNT 2 +#define XFS_LINK_LOG_COUNT 2 +#define XFS_RENAME_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT 2 +#define XFS_ADDAFORK_LOG_COUNT 2 +#define XFS_ATTRINVAL_LOG_COUNT 1 +#define XFS_ATTRSET_LOG_COUNT 3 +#define XFS_ATTRRM_LOG_COUNT 3 + +void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); + +#endif /* __XFS_TRANS_RESV_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_trans_space.h b/kernel/fs/xfs/libxfs/xfs_trans_space.h new file mode 100644 index 000000000..bf9c45793 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_trans_space.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_SPACE_H__ +#define __XFS_TRANS_SPACE_H__ + +/* + * Components of space reservations. + */ +#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ + (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) +#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) +#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\ + (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_DAENTER_1B(mp,w) \ + ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) +#define XFS_DAENTER_DBS(mp,w) \ + (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0)) +#define XFS_DAENTER_BLOCKS(mp,w) \ + (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w)) +#define XFS_DAENTER_BMAP1B(mp,w) \ + XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w) +#define XFS_DAENTER_BMAPS(mp,w) \ + (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w)) +#define XFS_DAENTER_SPACE_RES(mp,w) \ + (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w)) +#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w) +#define XFS_DIRENTER_MAX_SPLIT(mp,nl) 1 +#define XFS_DIRENTER_SPACE_RES(mp,nl) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \ + XFS_DIRENTER_MAX_SPLIT(mp,nl)) +#define XFS_DIRREMOVE_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) +#define XFS_IALLOC_SPACE_RES(mp) \ + ((mp)->m_ialloc_blks + \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((mp)->m_in_maxlevels - 1))) + +/* + * Space reservation values for various transactions. + */ +#define XFS_ADDAFORK_SPACE_RES(mp) \ + ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) +#define XFS_ATTRRM_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) +/* This macro is not used - see inline code in xfs_attr_set */ +#define XFS_ATTRSET_SPACE_RES(mp, v) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) +#define XFS_CREATE_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_DIOSTRAT_SPACE_RES(mp, v) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) +#define XFS_GROWFS_SPACE_RES(mp) \ + (2 * XFS_AG_MAXLEVELS(mp)) +#define XFS_GROWFSRT_SPACE_RES(mp,b) \ + ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) +#define XFS_LINK_SPACE_RES(mp,nl) \ + XFS_DIRENTER_SPACE_RES(mp,nl) +#define XFS_MKDIR_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_QM_DQALLOC_SPACE_RES(mp) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ + XFS_DQUOT_CLUSTER_SIZE_FSB) +#define XFS_QM_QINOCREATE_SPACE_RES(mp) \ + XFS_IALLOC_SPACE_RES(mp) +#define XFS_REMOVE_SPACE_RES(mp) \ + XFS_DIRREMOVE_SPACE_RES(mp) +#define XFS_RENAME_SPACE_RES(mp,nl) \ + (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) +#define XFS_IFREE_SPACE_RES(mp) \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) + + +#endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_types.h b/kernel/fs/xfs/libxfs/xfs_types.h new file mode 100644 index 000000000..b79dc66b2 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_types.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TYPES_H__ +#define __XFS_TYPES_H__ + +typedef __uint32_t prid_t; /* project ID */ + +typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ +typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef __uint32_t xfs_agnumber_t; /* allocation group number */ +typedef __int32_t xfs_extnum_t; /* # of extents in a file */ +typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef __int64_t xfs_fsize_t; /* bytes in a file */ +typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ + +typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */ + +typedef __int64_t xfs_lsn_t; /* log sequence number */ +typedef __int32_t xfs_tid_t; /* transaction identifier */ + +typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ +typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ + +typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef __uint64_t xfs_fileoff_t; /* block number in a file */ +typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ + +typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ + +/* + * Null values for the types. + */ +#define NULLFSBLOCK ((xfs_fsblock_t)-1) +#define NULLRFSBLOCK ((xfs_rfsblock_t)-1) +#define NULLRTBLOCK ((xfs_rtblock_t)-1) +#define NULLFILEOFF ((xfs_fileoff_t)-1) + +#define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLEXTNUM ((xfs_extnum_t)-1) + +#define NULLCOMMITLSN ((xfs_lsn_t)-1) + +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + +/* + * Max values for extlen, extnum, aextnum. + */ +#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ +#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ +#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ + +/* + * Minimum and maximum blocksize and sectorsize. + * The blocksize upper limit is pretty much arbitrary. + * The sectorsize upper limit is due to sizeof(sb_sectsize). + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ +#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) +#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) + +/* + * Inode fork identifiers. + */ +#define XFS_DATA_FORK 0 +#define XFS_ATTR_FORK 1 + +/* + * Min numbers of data/attr fork btree root pointers. + */ +#define MINDBTPTRS 3 +#define MINABTPTRS 2 + +/* + * MAXNAMELEN is the length (including the terminating null) of + * the longest permissible file (component) name. + */ +#define MAXNAMELEN 256 + +typedef enum { + XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi +} xfs_lookup_t; + +typedef enum { + XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, + XFS_BTNUM_FINOi, XFS_BTNUM_MAX +} xfs_btnum_t; + +struct xfs_name { + const unsigned char *name; + int len; + int type; +}; + +/* + * uid_t and gid_t are hard-coded to 32 bits in the inode. + * Hence, an 'id' in a dquot is 32 bits.. + */ +typedef __uint32_t xfs_dqid_t; + +/* + * Constants for bit manipulations. + */ +#define XFS_NBBYLOG 3 /* log2(NBBY) */ +#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) +#define XFS_NBWORD (1 << XFS_NBWORDLOG) +#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) + + +#endif /* __XFS_TYPES_H__ */ diff --git a/kernel/fs/xfs/mrlock.h b/kernel/fs/xfs/mrlock.h new file mode 100644 index 000000000..e3c92d19e --- /dev/null +++ b/kernel/fs/xfs/mrlock.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_MRLOCK_H__ +#define __XFS_SUPPORT_MRLOCK_H__ + +#include + +typedef struct { + struct rw_semaphore mr_lock; +#if defined(DEBUG) || defined(XFS_WARN) + int mr_writer; +#endif +} mrlock_t; + +#if defined(DEBUG) || defined(XFS_WARN) +#define mrinit(mrp, name) \ + do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) +#else +#define mrinit(mrp, name) \ + do { init_rwsem(&(mrp)->mr_lock); } while (0) +#endif + +#define mrlock_init(mrp, t,n,s) mrinit(mrp, n) +#define mrfree(mrp) do { } while (0) + +static inline void mraccess_nested(mrlock_t *mrp, int subclass) +{ + down_read_nested(&mrp->mr_lock, subclass); +} + +static inline void mrupdate_nested(mrlock_t *mrp, int subclass) +{ + down_write_nested(&mrp->mr_lock, subclass); +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 1; +#endif +} + +static inline int mrtryaccess(mrlock_t *mrp) +{ + return down_read_trylock(&mrp->mr_lock); +} + +static inline int mrtryupdate(mrlock_t *mrp) +{ + if (!down_write_trylock(&mrp->mr_lock)) + return 0; +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 1; +#endif + return 1; +} + +static inline void mrunlock_excl(mrlock_t *mrp) +{ +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 0; +#endif + up_write(&mrp->mr_lock); +} + +static inline void mrunlock_shared(mrlock_t *mrp) +{ + up_read(&mrp->mr_lock); +} + +static inline void mrdemote(mrlock_t *mrp) +{ +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 0; +#endif + downgrade_write(&mrp->mr_lock); +} + +#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/kernel/fs/xfs/uuid.c b/kernel/fs/xfs/uuid.c new file mode 100644 index 000000000..b83f76b6d --- /dev/null +++ b/kernel/fs/xfs/uuid.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +/* IRIX interpretation of an uuid_t */ +typedef struct { + __be32 uu_timelow; + __be16 uu_timemid; + __be16 uu_timehi; + __be16 uu_clockseq; + __be16 uu_node[3]; +} xfs_uu_t; + +/* + * uuid_getnodeuniq - obtain the node unique fields of a UUID. + * + * This is not in any way a standard or condoned UUID function; + * it just something that's needed for user-level file handles. + */ +void +uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) +{ + xfs_uu_t *uup = (xfs_uu_t *)uuid; + + fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | + be16_to_cpu(uup->uu_timemid); + fsid[1] = be32_to_cpu(uup->uu_timelow); +} + +int +uuid_is_nil(uuid_t *uuid) +{ + int i; + char *cp = (char *)uuid; + + if (uuid == NULL) + return 0; + /* implied check of version number here... */ + for (i = 0; i < sizeof *uuid; i++) + if (*cp++) return 0; /* not nil */ + return 1; /* is nil */ +} + +int +uuid_equal(uuid_t *uuid1, uuid_t *uuid2) +{ + return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; +} diff --git a/kernel/fs/xfs/uuid.h b/kernel/fs/xfs/uuid.h new file mode 100644 index 000000000..104db0f3b --- /dev/null +++ b/kernel/fs/xfs/uuid.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_UUID_H__ +#define __XFS_SUPPORT_UUID_H__ + +typedef struct { + unsigned char __u_bits[16]; +} uuid_t; + +extern int uuid_is_nil(uuid_t *uuid); +extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); +extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); + +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + +#endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/kernel/fs/xfs/xfs.h b/kernel/fs/xfs/xfs.h new file mode 100644 index 000000000..a742c47f7 --- /dev/null +++ b/kernel/fs/xfs/xfs.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_H__ +#define __XFS_H__ + +#ifdef CONFIG_XFS_DEBUG +#define STATIC +#define DEBUG 1 +#define XFS_BUF_LOCK_TRACKING 1 +#endif + +#ifdef CONFIG_XFS_WARN +#define XFS_WARN 1 +#endif + + +#include "xfs_linux.h" + +#endif /* __XFS_H__ */ diff --git a/kernel/fs/xfs/xfs_acl.c b/kernel/fs/xfs/xfs_acl.c new file mode 100644 index 000000000..4b641676f --- /dev/null +++ b/kernel/fs/xfs/xfs_acl.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_acl.h" +#include "xfs_attr.h" +#include "xfs_trace.h" +#include +#include +#include + + +/* + * Locking scheme: + * - all ACL updates are protected by inode->i_mutex, which is taken before + * calling into this file. + */ + +STATIC struct posix_acl * +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) +{ + struct posix_acl_entry *acl_e; + struct posix_acl *acl; + struct xfs_acl_entry *ace; + unsigned int count, i; + + count = be32_to_cpu(aclp->acl_cnt); + if (count > max_entries) + return ERR_PTR(-EFSCORRUPTED); + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + acl_e = &acl->a_entries[i]; + ace = &aclp->acl_entry[i]; + + /* + * The tag is 32 bits on disk and 16 bits in core. + * + * Because every access to it goes through the core + * format first this is not a problem. + */ + acl_e->e_tag = be32_to_cpu(ace->ae_tag); + acl_e->e_perm = be16_to_cpu(ace->ae_perm); + + switch (acl_e->e_tag) { + case ACL_USER: + acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + break; + case ACL_GROUP: + acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + break; + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +STATIC void +xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) +{ + const struct posix_acl_entry *acl_e; + struct xfs_acl_entry *ace; + int i; + + aclp->acl_cnt = cpu_to_be32(acl->a_count); + for (i = 0; i < acl->a_count; i++) { + ace = &aclp->acl_entry[i]; + acl_e = &acl->a_entries[i]; + + ace->ae_tag = cpu_to_be32(acl_e->e_tag); + switch (acl_e->e_tag) { + case ACL_USER: + ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + break; + case ACL_GROUP: + ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + break; + default: + ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); + break; + } + + ace->ae_perm = cpu_to_be16(acl_e->e_perm); + } +} + +struct posix_acl * +xfs_get_acl(struct inode *inode, int type) +{ + struct xfs_inode *ip = XFS_I(inode); + struct posix_acl *acl = NULL; + struct xfs_acl *xfs_acl; + unsigned char *ea_name; + int error; + int len; + + trace_xfs_get_acl(ip); + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + ea_name = SGI_ACL_DEFAULT; + break; + default: + BUG(); + } + + /* + * If we have a cached ACLs value just return it, not need to + * go out to the disk. + */ + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + if (!xfs_acl) + return ERR_PTR(-ENOMEM); + + error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + &len, ATTR_ROOT); + if (error) { + /* + * If the attribute doesn't exist make sure we have a negative + * cache entry, for any other error assume it is transient and + * leave the cache entry as ACL_NOT_CACHED. + */ + if (error == -ENOATTR) + goto out_update_cache; + goto out; + } + + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); + if (IS_ERR(acl)) + goto out; + +out_update_cache: + set_cached_acl(inode, type, acl); +out: + kmem_free(xfs_acl); + return acl; +} + +STATIC int +__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct xfs_inode *ip = XFS_I(inode); + unsigned char *ea_name; + int error; + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + ea_name = SGI_ACL_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + struct xfs_acl *xfs_acl; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); + + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + if (!xfs_acl) + return -ENOMEM; + + xfs_acl_to_disk(xfs_acl, acl); + + /* subtract away the unused acl entries */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); + + error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, + len, ATTR_ROOT); + + kmem_free(xfs_acl); + } else { + /* + * A NULL ACL argument means we want to remove the ACL. + */ + error = xfs_attr_remove(ip, ea_name, ATTR_ROOT); + + /* + * If the attribute didn't exist to start with that's fine. + */ + if (error == -ENOATTR) + error = 0; + } + + if (!error) + set_cached_acl(inode, type, acl); + return error; +} + +static int +xfs_set_mode(struct inode *inode, umode_t mode) +{ + int error = 0; + + if (mode != inode->i_mode) { + struct iattr iattr; + + iattr.ia_valid = ATTR_MODE | ATTR_CTIME; + iattr.ia_mode = mode; + iattr.ia_ctime = current_fs_time(inode->i_sb); + + error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL); + } + + return error; +} + +static int +xfs_acl_exists(struct inode *inode, unsigned char *name) +{ + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); + + return (xfs_attr_get(XFS_I(inode), name, NULL, &len, + ATTR_ROOT|ATTR_KERNOVAL) == 0); +} + +int +posix_acl_access_exists(struct inode *inode) +{ + return xfs_acl_exists(inode, SGI_ACL_FILE); +} + +int +posix_acl_default_exists(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode)) + return 0; + return xfs_acl_exists(inode, SGI_ACL_DEFAULT); +} + +int +xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error = 0; + + if (!acl) + goto set_acl; + + error = -E2BIG; + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) + return error; + + if (type == ACL_TYPE_ACCESS) { + umode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + + if (error <= 0) { + acl = NULL; + + if (error < 0) + return error; + } + + error = xfs_set_mode(inode, mode); + if (error) + return error; + } + + set_acl: + return __xfs_set_acl(inode, type, acl); +} diff --git a/kernel/fs/xfs/xfs_acl.h b/kernel/fs/xfs/xfs_acl.h new file mode 100644 index 000000000..3841b07f2 --- /dev/null +++ b/kernel/fs/xfs/xfs_acl.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ACL_H__ +#define __XFS_ACL_H__ + +struct inode; +struct posix_acl; +struct xfs_inode; + +#ifdef CONFIG_XFS_POSIX_ACL +extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); +extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int posix_acl_access_exists(struct inode *inode); +extern int posix_acl_default_exists(struct inode *inode); +#else +static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) +{ + return NULL; +} +# define xfs_set_acl NULL +# define posix_acl_access_exists(inode) 0 +# define posix_acl_default_exists(inode) 0 +#endif /* CONFIG_XFS_POSIX_ACL */ +#endif /* __XFS_ACL_H__ */ diff --git a/kernel/fs/xfs/xfs_aops.c b/kernel/fs/xfs/xfs_aops.c new file mode 100644 index 000000000..a56960dd1 --- /dev/null +++ b/kernel/fs/xfs/xfs_aops.c @@ -0,0 +1,1931 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_iomap.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include +#include +#include +#include + +void +xfs_count_page_state( + struct page *page, + int *delalloc, + int *unwritten) +{ + struct buffer_head *bh, *head; + + *delalloc = *unwritten = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) + (*unwritten) = 1; + else if (buffer_delay(bh)) + (*delalloc) = 1; + } while ((bh = bh->b_this_page) != head); +} + +STATIC struct block_device * +xfs_find_bdev_for_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_bdev; + else + return mp->m_ddev_targp->bt_bdev; +} + +/* + * We're now finished for good with this ioend structure. + * Update the page state via the associated buffer_heads, + * release holds on the inode and bio, and finally free + * up memory. Do not use the ioend after this. + */ +STATIC void +xfs_destroy_ioend( + xfs_ioend_t *ioend) +{ + struct buffer_head *bh, *next; + + for (bh = ioend->io_buffer_head; bh; bh = next) { + next = bh->b_private; + bh->b_end_io(bh, !ioend->io_error); + } + + mempool_free(ioend, xfs_ioend_pool); +} + +/* + * Fast and loose check if this write could update the on-disk inode size. + */ +static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +{ + return ioend->io_offset + ioend->io_size > + XFS_I(ioend->io_inode)->i_d.di_size; +} + +STATIC int +xfs_setfilesize_trans_alloc( + struct xfs_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + ioend->io_append_trans = tp; + + /* + * We may pass freeze protection with a transaction. So tell lockdep + * we released it. + */ + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + /* + * We hand off the transaction to the completion thread now, so + * clear the flag here. + */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + return 0; +} + +/* + * Update on-disk file size now that data has been written to disk. + */ +STATIC int +xfs_setfilesize( + struct xfs_inode *ip, + struct xfs_trans *tp, + xfs_off_t offset, + size_t size) +{ + xfs_fsize_t isize; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + isize = xfs_new_eof(ip, offset + size); + if (!isize) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, 0); + return 0; + } + + trace_xfs_setfilesize(ip, offset, size); + + ip->i_d.di_size = isize; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return xfs_trans_commit(tp, 0); +} + +STATIC int +xfs_setfilesize_ioend( + struct xfs_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_trans *tp = ioend->io_append_trans; + + /* + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as being in a transaction manually. + * Similarly for freeze protection. + */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + + return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); +} + +/* + * Schedule IO completion handling on the final put of an ioend. + * + * If there is no work to do we might as well call it a day and free the + * ioend right now. + */ +STATIC void +xfs_finish_ioend( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + + if (ioend->io_type == XFS_IO_UNWRITTEN) + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans) + queue_work(mp->m_data_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend); + } +} + +/* + * IO write completion. + */ +STATIC void +xfs_end_io( + struct work_struct *work) +{ + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ioend->io_error = -EIO; + goto done; + } + if (ioend->io_error) + goto done; + + /* + * For unwritten extents we need to issue transactions to convert a + * range to normal written extens after the data I/O has finished. + */ + if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_append_trans) { + error = xfs_setfilesize_ioend(ioend); + } else { + ASSERT(!xfs_ioend_is_append(ioend)); + } + +done: + if (error) + ioend->io_error = error; + xfs_destroy_ioend(ioend); +} + +/* + * Allocate and initialise an IO completion structure. + * We need to track unwritten extent write completion here initially. + * We'll need to extend this for updating the ondisk inode size later + * (vs. incore size). + */ +STATIC xfs_ioend_t * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type) +{ + xfs_ioend_t *ioend; + + ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); + + /* + * Set the count to 1 initially, which will prevent an I/O + * completion callback from happening before we have started + * all the I/O from calling the completion routine too early. + */ + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_list = NULL; + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_buffer_head = NULL; + ioend->io_buffer_tail = NULL; + ioend->io_offset = 0; + ioend->io_size = 0; + ioend->io_append_trans = NULL; + + INIT_WORK(&ioend->io_work, xfs_end_io); + return ioend; +} + +STATIC int +xfs_map_blocks( + struct inode *inode, + loff_t offset, + struct xfs_bmbt_irec *imap, + int type, + int nonblocking) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t count = 1 << inode->i_blkbits; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (type == XFS_IO_UNWRITTEN) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (nonblocking) + return -EAGAIN; + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(offset <= mp->m_super->s_maxbytes); + + if (offset + count > mp->m_super->s_maxbytes) + count = mp->m_super->s_maxbytes - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + imap, &nimaps, bmapi_flags); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (error) + return error; + + if (type == XFS_IO_DELALLOC && + (!nimaps || isnullstartblock(imap->br_startblock))) { + error = xfs_iomap_write_allocate(ip, offset, imap); + if (!error) + trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); + return error; + } + +#ifdef DEBUG + if (type == XFS_IO_UNWRITTEN) { + ASSERT(nimaps); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + } +#endif + if (nimaps) + trace_xfs_map_blocks_found(ip, offset, count, type, imap); + return 0; +} + +STATIC int +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; +} + +/* + * BIO completion handler for buffered IO. + */ +STATIC void +xfs_end_bio( + struct bio *bio, + int error) +{ + xfs_ioend_t *ioend = bio->bi_private; + + ASSERT(atomic_read(&bio->bi_cnt) >= 1); + ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; + + /* Toss bio and pass work off to an xfsdatad thread */ + bio->bi_private = NULL; + bio->bi_end_io = NULL; + bio_put(bio); + + xfs_finish_ioend(ioend); +} + +STATIC void +xfs_submit_ioend_bio( + struct writeback_control *wbc, + xfs_ioend_t *ioend, + struct bio *bio) +{ + atomic_inc(&ioend->io_remaining); + bio->bi_private = ioend; + bio->bi_end_io = xfs_end_bio; + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); +} + +STATIC struct bio * +xfs_alloc_ioend_bio( + struct buffer_head *bh) +{ + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio = bio_alloc(GFP_NOIO, nvecs); + + ASSERT(bio->bi_private == NULL); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + return bio; +} + +STATIC void +xfs_start_buffer_writeback( + struct buffer_head *bh) +{ + ASSERT(buffer_mapped(bh)); + ASSERT(buffer_locked(bh)); + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + + mark_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); +} + +STATIC void +xfs_start_page_writeback( + struct page *page, + int clear_dirty, + int buffers) +{ + ASSERT(PageLocked(page)); + ASSERT(!PageWriteback(page)); + + /* + * if the page was not fully cleaned, we need to ensure that the higher + * layers come back to it correctly. That means we need to keep the page + * dirty, and for WB_SYNC_ALL writeback we need to ensure the + * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to + * write this page in this writeback sweep will be made. + */ + if (clear_dirty) { + clear_page_dirty_for_io(page); + set_page_writeback(page); + } else + set_page_writeback_keepwrite(page); + + unlock_page(page); + + /* If no buffers on the page are to be written, finish it here */ + if (!buffers) + end_page_writeback(page); +} + +static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) +{ + return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); +} + +/* + * Submit all of the bios for all of the ioends we have saved up, covering the + * initial writepage page and also any probed pages. + * + * Because we may have multiple ioends spanning a page, we need to start + * writeback on all the buffers before we submit them for I/O. If we mark the + * buffers as we got, then we can end up with a page that only has buffers + * marked async write and I/O complete on can occur before we mark the other + * buffers async write. + * + * The end result of this is that we trip a bug in end_page_writeback() because + * we call it twice for the one page as the code in end_buffer_async_write() + * assumes that all buffers on the page are started at the same time. + * + * The fix is two passes across the ioend list - one to start writeback on the + * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. + */ +STATIC void +xfs_submit_ioend( + struct writeback_control *wbc, + xfs_ioend_t *ioend, + int fail) +{ + xfs_ioend_t *head = ioend; + xfs_ioend_t *next; + struct buffer_head *bh; + struct bio *bio; + sector_t lastblock = 0; + + /* Pass 1 - start writeback */ + do { + next = ioend->io_list; + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) + xfs_start_buffer_writeback(bh); + } while ((ioend = next) != NULL); + + /* Pass 2 - submit I/O */ + ioend = head; + do { + next = ioend->io_list; + bio = NULL; + + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = fail; + xfs_finish_ioend(ioend); + continue; + } + + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + + if (!bio) { + retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(wbc, ioend, bio); + xfs_finish_ioend(ioend); + } while ((ioend = next) != NULL); +} + +/* + * Cancel submission of all buffer_heads so far in this endio. + * Toss the endio too. Only ever called for the initial page + * in a writepage request, so only ever one page. + */ +STATIC void +xfs_cancel_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh, *next_bh; + + do { + next = ioend->io_list; + bh = ioend->io_buffer_head; + do { + next_bh = bh->b_private; + clear_buffer_async_write(bh); + /* + * The unwritten flag is cleared when added to the + * ioend. We're not submitting for I/O so mark the + * buffer unwritten again for next time around. + */ + if (ioend->io_type == XFS_IO_UNWRITTEN) + set_buffer_unwritten(bh); + unlock_buffer(bh); + } while ((bh = next_bh) != NULL); + + mempool_free(ioend, xfs_ioend_pool); + } while ((ioend = next) != NULL); +} + +/* + * Test to see if we've been building up a completion structure for + * earlier buffers -- if so, we try to append to this ioend if we + * can, otherwise we finish off any current ioend and start another. + * Return true if we've finished the given ioend. + */ +STATIC void +xfs_add_to_ioend( + struct inode *inode, + struct buffer_head *bh, + xfs_off_t offset, + unsigned int type, + xfs_ioend_t **result, + int need_ioend) +{ + xfs_ioend_t *ioend = *result; + + if (!ioend || need_ioend || type != ioend->io_type) { + xfs_ioend_t *previous = *result; + + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_buffer_head = bh; + ioend->io_buffer_tail = bh; + if (previous) + previous->io_list = ioend; + *result = ioend; + } else { + ioend->io_buffer_tail->b_private = bh; + ioend->io_buffer_tail = bh; + } + + bh->b_private = NULL; + ioend->io_size += bh->b_size; +} + +STATIC void +xfs_map_buffer( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); + + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); + + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); + + bh->b_blocknr = bn; + set_buffer_mapped(bh); +} + +STATIC void +xfs_map_at_offset( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + xfs_map_buffer(inode, bh, imap, offset); + set_buffer_mapped(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); +} + +/* + * Test if a given page contains at least one buffer of a given @type. + * If @check_all_buffers is true, then we walk all the buffers in the page to + * try to find one of the type passed in. If it is not set, then the caller only + * needs to check the first buffer on the page for a match. + */ +STATIC bool +xfs_check_page_type( + struct page *page, + unsigned int type, + bool check_all_buffers) +{ + struct buffer_head *bh; + struct buffer_head *head; + + if (PageWriteback(page)) + return false; + if (!page->mapping) + return false; + if (!page_has_buffers(page)) + return false; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) { + if (type == XFS_IO_UNWRITTEN) + return true; + } else if (buffer_delay(bh)) { + if (type == XFS_IO_DELALLOC) + return true; + } else if (buffer_dirty(bh) && buffer_mapped(bh)) { + if (type == XFS_IO_OVERWRITE) + return true; + } + + /* If we are only checking the first buffer, we are done now. */ + if (!check_all_buffers) + break; + } while ((bh = bh->b_this_page) != head); + + return false; +} + +/* + * Allocate & map buffers for page given the extent map. Write it out. + * except for the original page of a writepage, this is called on + * delalloc/unwritten pages only, for the original page it is possible + * that the page has no mapping at all. + */ +STATIC int +xfs_convert_page( + struct inode *inode, + struct page *page, + loff_t tindex, + struct xfs_bmbt_irec *imap, + xfs_ioend_t **ioendp, + struct writeback_control *wbc) +{ + struct buffer_head *bh, *head; + xfs_off_t end_offset; + unsigned long p_offset; + unsigned int type; + int len, page_dirty; + int count = 0, done = 0, uptodate = 1; + xfs_off_t offset = page_offset(page); + + if (page->index != tindex) + goto fail; + if (!trylock_page(page)) + goto fail; + if (PageWriteback(page)) + goto fail_unlock_page; + if (page->mapping != inode->i_mapping) + goto fail_unlock_page; + if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) + goto fail_unlock_page; + + /* + * page_dirty is initially a count of buffers on the page before + * EOF and is decremented as we move each into a cleanable state. + * + * Derivation: + * + * End offset is the highest offset that this page should represent. + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and + * hence give us the correct page_dirty count. On any other page, + * it will be zero and in that case we need page_dirty to be the + * count of buffers on the page. + */ + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + i_size_read(inode)); + + /* + * If the current map does not span the entire page we are about to try + * to write, then give up. The only way we can write a page that spans + * multiple mappings in a single writeback iteration is via the + * xfs_vm_writepage() function. Data integrity writeback requires the + * entire page to be written in a single attempt, otherwise the part of + * the page we don't write here doesn't get written as part of the data + * integrity sync. + * + * For normal writeback, we also don't attempt to write partial pages + * here as it simply means that write_cache_pages() will see it under + * writeback and ignore the page until some point in the future, at + * which time this will be the only page in the file that needs + * writeback. Hence for more optimal IO patterns, we should always + * avoid partial page writeback due to multiple mappings on a page here. + */ + if (!xfs_imap_valid(inode, imap, end_offset)) + goto fail_unlock_page; + + len = 1 << inode->i_blkbits; + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), + PAGE_CACHE_SIZE); + p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; + page_dirty = p_offset / len; + + /* + * The moment we find a buffer that doesn't match our current type + * specification or can't be written, abort the loop and start + * writeback. As per the above xfs_imap_valid() check, only + * xfs_vm_writepage() can handle partial page writeback fully - we are + * limited here to the buffers that are contiguous with the current + * ioend, and hence a buffer we can't write breaks that contiguity and + * we have to defer the rest of the IO to xfs_vm_writepage(). + */ + bh = head = page_buffers(page); + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + if (!(PageUptodate(page) || buffer_uptodate(bh))) { + done = 1; + break; + } + + if (buffer_unwritten(bh) || buffer_delay(bh) || + buffer_mapped(bh)) { + if (buffer_unwritten(bh)) + type = XFS_IO_UNWRITTEN; + else if (buffer_delay(bh)) + type = XFS_IO_DELALLOC; + else + type = XFS_IO_OVERWRITE; + + /* + * imap should always be valid because of the above + * partial page end_offset check on the imap. + */ + ASSERT(xfs_imap_valid(inode, imap, offset)); + + lock_buffer(bh); + if (type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, + ioendp, done); + + page_dirty--; + count++; + } else { + done = 1; + break; + } + } while (offset += len, (bh = bh->b_this_page) != head); + + if (uptodate && bh == head) + SetPageUptodate(page); + + if (count) { + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) + done = 1; + } + xfs_start_page_writeback(page, !page_dirty, count); + + return done; + fail_unlock_page: + unlock_page(page); + fail: + return 1; +} + +/* + * Convert & write out a cluster of pages in the same extent as defined + * by mp and following the start page. + */ +STATIC void +xfs_cluster_write( + struct inode *inode, + pgoff_t tindex, + struct xfs_bmbt_irec *imap, + xfs_ioend_t **ioendp, + struct writeback_control *wbc, + pgoff_t tlast) +{ + struct pagevec pvec; + int done = 0, i; + + pagevec_init(&pvec, 0); + while (!done && tindex <= tlast) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + done = xfs_convert_page(inode, pvec.pages[i], tindex++, + imap, ioendp, wbc); + if (done) + break; + } + + pagevec_release(&pvec); + cond_resched(); + } +} + +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned int offset, + unsigned int length) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset, + length); + block_invalidatepage(page, offset, length); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. + */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + + if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) + goto out_invalidate; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_invalidate; + + xfs_alert(ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int error; + xfs_fileoff_t start_fsb; + + if (!buffer_delay(bh)) + goto next_buffer; + + start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "page discard unable to remove delalloc mapping."); + } + break; + } +next_buffer: + offset += 1 << inode->i_blkbits; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); + return; +} + +/* + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + * For any other dirty buffer heads on the page we should flush them. + */ +STATIC int +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *bh, *head; + struct xfs_bmbt_irec imap; + xfs_ioend_t *ioend = NULL, *iohead = NULL; + loff_t offset; + unsigned int type; + __uint64_t end_offset; + pgoff_t end_index, last_index; + ssize_t len; + int err, imap_valid = 0, uptodate = 1; + int count = 0; + int nonblocking = 0; + + trace_xfs_writepage(inode, page, 0, 0); + + ASSERT(page_has_buffers(page)); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should never happen except in the case of a VM regression so + * warn about it. + */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC)) + goto redirty; + + /* + * Given that we do not allow direct reclaim to call us, we should + * never be called while in a filesystem transaction. + */ + if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + goto redirty; + + /* Is this page beyond the end of the file? */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; + last_index = (offset - 1) >> PAGE_CACHE_SHIFT; + + /* + * The page index is less than the end_index, adjust the end_offset + * to the highest offset that this page should represent. + * ----------------------------------------------------- + * | file mapping | | + * ----------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | | + * ^--------------------------------^----------|-------- + * | desired writeback range | see else | + * ---------------------------------^------------------| + */ + if (page->index < end_index) + end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; + else { + /* + * Check whether the page to write out is beyond or straddles + * i_size or not. + * ------------------------------------------------------- + * | file mapping | | + * ------------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | + * ^--------------------------------^-----------|--------- + * | | Straddles | + * ---------------------------------^-----------|--------| + */ + unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); + + /* + * Skip the page if it is fully outside i_size, e.g. due to a + * truncate operation that is in progress. We must redirty the + * page so that reclaim stops reclaiming it. Otherwise + * xfs_vm_releasepage() is called on it and gets confused. + * + * Note that the end_index is unsigned long, it would overflow + * if the given offset is greater than 16TB on 32-bit system + * and if we do check the page is fully outside i_size or not + * via "if (page->index >= end_index + 1)" as "end_index + 1" + * will be evaluated to 0. Hence this page will be redirtied + * and be written out repeatedly which would result in an + * infinite loop, the user program that perform this operation + * will hang. Instead, we can verify this situation by checking + * if the page to write is totally beyond the i_size or if it's + * offset is just equal to the EOF. + */ + if (page->index > end_index || + (page->index == end_index && offset_into_page == 0)) + goto redirty; + + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining + * memory is zeroed when mapped, and writes to that region are + * not written out to the file." + */ + zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); + + /* Adjust the end_offset to the end of file */ + end_offset = offset; + } + + len = 1 << inode->i_blkbits; + + bh = head = page_buffers(page); + offset = page_offset(page); + type = XFS_IO_OVERWRITE; + + if (wbc->sync_mode == WB_SYNC_NONE) + nonblocking = 1; + + do { + int new_ioend = 0; + + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + + /* + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + imap_valid = 0; + continue; + } + + if (buffer_unwritten(bh)) { + if (type != XFS_IO_UNWRITTEN) { + type = XFS_IO_UNWRITTEN; + imap_valid = 0; + } + } else if (buffer_delay(bh)) { + if (type != XFS_IO_DELALLOC) { + type = XFS_IO_DELALLOC; + imap_valid = 0; + } + } else if (buffer_uptodate(bh)) { + if (type != XFS_IO_OVERWRITE) { + type = XFS_IO_OVERWRITE; + imap_valid = 0; + } + } else { + if (PageUptodate(page)) + ASSERT(buffer_mapped(bh)); + /* + * This buffer is not uptodate and will not be + * written to disk. Ensure that we will put any + * subsequent writeable buffers into a new + * ioend. + */ + imap_valid = 0; + continue; + } + + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); + if (!imap_valid) { + /* + * If we didn't have a valid mapping then we need to + * put the new mapping into a separate ioend structure. + * This ensures non-contiguous extents always have + * separate ioends, which is particularly important + * for unwritten extent conversion at I/O completion + * time. + */ + new_ioend = 1; + err = xfs_map_blocks(inode, offset, &imap, type, + nonblocking); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, offset); + } + if (imap_valid) { + lock_buffer(bh); + if (type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, &ioend, + new_ioend); + count++; + } + + if (!iohead) + iohead = ioend; + + } while (offset += len, ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + xfs_start_page_writeback(page, 1, count); + + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need tobe reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { + xfs_off_t end_index; + + end_index = imap.br_startoff + imap.br_blockcount; + + /* to bytes */ + end_index <<= inode->i_blkbits; + + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; + + /* check against file size */ + if (end_index > last_index) + end_index = last_index; + + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, end_index); + } + + + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); + + return 0; + +error: + if (iohead) + xfs_cancel_ioend(iohead); + + if (err == -EAGAIN) + goto redirty; + + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + return err; + +redirty: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +STATIC int +xfs_vm_writepages( + struct address_space *mapping, + struct writeback_control *wbc) +{ + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return generic_writepages(mapping, wbc); +} + +/* + * Called to move a page into cleanable state - and from there + * to be released. The page should already be clean. We always + * have buffer heads in this call. + * + * Returns 1 if the page is ok to release, 0 otherwise. + */ +STATIC int +xfs_vm_releasepage( + struct page *page, + gfp_t gfp_mask) +{ + int delalloc, unwritten; + + trace_xfs_releasepage(page->mapping->host, page, 0, 0); + + xfs_count_page_state(page, &delalloc, &unwritten); + + if (WARN_ON_ONCE(delalloc)) + return 0; + if (WARN_ON_ONCE(unwritten)) + return 0; + + return try_to_free_buffers(page); +} + +/* + * When we map a DIO buffer, we may need to attach an ioend that describes the + * type of write IO we are doing. This passes to the completion function the + * operations it needs to perform. If the mapping is for an overwrite wholly + * within the EOF then we don't need an ioend and so we don't allocate one. + * This avoids the unnecessary overhead of allocating and freeing ioends for + * workloads that don't require transactions on IO completion. + * + * If we get multiple mappings in a single IO, we might be mapping different + * types. But because the direct IO can only have a single private pointer, we + * need to ensure that: + * + * a) i) the ioend spans the entire region of unwritten mappings; or + * ii) the ioend spans all the mappings that cross or are beyond EOF; and + * b) if it contains unwritten extents, it is *permanently* marked as such + * + * We could do this by chaining ioends like buffered IO does, but we only + * actually get one IO completion callback from the direct IO, and that spans + * the entire IO regardless of how many mappings and IOs are needed to complete + * the DIO. There is only going to be one reference to the ioend and its life + * cycle is constrained by the DIO completion code. hence we don't need + * reference counting here. + */ +static void +xfs_map_direct( + struct inode *inode, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + struct xfs_ioend *ioend; + xfs_off_t size = bh_result->b_size; + int type; + + if (ISUNWRITTEN(imap)) + type = XFS_IO_UNWRITTEN; + else + type = XFS_IO_OVERWRITE; + + trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); + + if (bh_result->b_private) { + ioend = bh_result->b_private; + ASSERT(ioend->io_size > 0); + ASSERT(offset >= ioend->io_offset); + if (offset + size > ioend->io_offset + ioend->io_size) + ioend->io_size = offset - ioend->io_offset + size; + + if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) + ioend->io_type = XFS_IO_UNWRITTEN; + + trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, + ioend->io_size, ioend->io_type, + imap); + } else if (type == XFS_IO_UNWRITTEN || + offset + size > i_size_read(inode)) { + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_size = size; + + bh_result->b_private = ioend; + set_buffer_defer_completion(bh_result); + + trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, + imap); + } else { + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, + imap); + } +} + +/* + * If this is O_DIRECT or the mpage code calling tell them how large the mapping + * is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the mapping + * for blocks beyond EOF must be marked new so that sub block regions can be + * correctly zeroed. We can't do this for mappings within EOF unless the mapping + * was just allocated or is unwritten, otherwise the callers would overwrite + * existing data with zeros. Hence we have to split the mapping into a range up + * to and including EOF, and a second mapping for beyond EOF. + */ +static void +xfs_map_trim_size( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset, + ssize_t size) +{ + xfs_off_t mapping_size; + + mapping_size = imap->br_startoff + imap->br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; +} + +STATIC int +__xfs_get_blocks( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create, + int direct) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + struct xfs_bmbt_irec imap; + int nimaps = 1; + xfs_off_t offset; + ssize_t size; + int new = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + offset = (xfs_off_t)iblock << inode->i_blkbits; + ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); + size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + + /* + * Direct I/O is usually done on preallocated files, so try getting + * a block mapping without an exclusive lock first. For buffered + * writes we already have the exclusive iolock anyway, so avoiding + * a lock roundtrip here by taking the ilock exclusive from the + * beginning is a useful micro optimization. + */ + if (create && !direct) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); + } else { + lockmode = xfs_ilock_data_map_shared(ip); + } + + ASSERT(offset <= mp->m_super->s_maxbytes); + if (offset + size > mp->m_super->s_maxbytes) + size = mp->m_super->s_maxbytes - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + if (create && + (!nimaps || + (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK))) { + if (direct || xfs_get_extsz_hint(ip)) { + /* + * Drop the ilock in preparation for starting the block + * allocation transaction. It will be retaken + * exclusively inside xfs_iomap_write_direct for the + * actual allocation. + */ + xfs_iunlock(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return error; + new = 1; + } else { + /* + * Delalloc reservations do not require a transaction, + * we can go on without dropping the lock here. If we + * are allocating a new delalloc block, make sure that + * we set the new flag so that we mark the buffer new so + * that we know that it is newly allocated if the write + * fails. + */ + if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + new = 1; + error = xfs_iomap_write_delay(ip, offset, size, &imap); + if (error) + goto out_unlock; + + xfs_iunlock(ip, lockmode); + } + trace_xfs_get_blocks_alloc(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_DELALLOC, &imap); + } else if (nimaps) { + trace_xfs_get_blocks_found(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_OVERWRITE, &imap); + xfs_iunlock(ip, lockmode); + } else { + trace_xfs_get_blocks_notfound(ip, offset, size); + goto out_unlock; + } + + /* trim mapping down to size requested */ + if (direct || size > (1 << inode->i_blkbits)) + xfs_map_trim_size(inode, iblock, bh_result, + &imap, offset, size); + + /* + * For unwritten extents do not report a disk address in the buffered + * read case (treat as if we're reading into a hole). + */ + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK && + (create || !ISUNWRITTEN(&imap))) { + xfs_map_buffer(inode, bh_result, &imap, offset); + if (ISUNWRITTEN(&imap)) + set_buffer_unwritten(bh_result); + /* direct IO needs special help */ + if (create && direct) + xfs_map_direct(inode, bh_result, &imap, offset); + } + + /* + * If this is a realtime file, data may be on a different device. + * to that pointed to from the buffer_head b_bdev currently. + */ + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); + + /* + * If we previously allocated a block out beyond eof and we are now + * coming back to use it then we will need to flag it as new even if it + * has a disk address. + * + * With sub-block writes into unwritten extents we also need to mark + * the buffer as new so that the unwritten parts of the buffer gets + * correctly zeroed. + */ + if (create && + ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || + (offset >= i_size_read(inode)) || + (new || ISUNWRITTEN(&imap)))) + set_buffer_new(bh_result); + + if (imap.br_startblock == DELAYSTARTBLOCK) { + BUG_ON(direct); + if (create) { + set_buffer_uptodate(bh_result); + set_buffer_mapped(bh_result); + set_buffer_delay(bh_result); + } + } + + return 0; + +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + +int +xfs_get_blocks( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, 0); +} + +STATIC int +xfs_get_blocks_direct( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, 1); +} + +/* + * Complete a direct I/O write request. + * + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_ioend *ioend = private; + + trace_xfs_gbmap_direct_endio(ip, offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); + return; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_end_io; + + /* + * dio completion end_io functions are only called on writes if more + * than 0 bytes was written. + */ + ASSERT(size > 0); + + /* + * The ioend only maps whole blocks, while the IO may be sector aligned. + * Hence the ioend offset/size may not match the IO offset/size exactly. + * Because we don't map overwrites within EOF into the ioend, the offset + * may not match, but only if the endio spans EOF. Either way, write + * the IO sizes into the ioend so that completion processing does the + * right thing. + */ + ASSERT(offset + size <= ioend->io_offset + ioend->io_size); + ioend->io_size = size; + ioend->io_offset = offset; + + /* + * The ioend tells us whether we are doing unwritten extent conversion + * or an append transaction that updates the on-disk file size. These + * cases are the only cases where we should *potentially* be needing + * to update the VFS inode size. + * + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&ip->i_flags_lock); + if (offset + size > i_size_read(inode)) + i_size_write(inode, offset + size); + spin_unlock(&ip->i_flags_lock); + + /* + * If we are doing an append IO that needs to update the EOF on disk, + * do the transaction reserve now so we can use common end io + * processing. Stashing the error (if there is one) in the ioend will + * result in the ioend processing passing on the error if it is + * possible as we can't return it from here. + */ + if (ioend->io_type == XFS_IO_OVERWRITE) + ioend->io_error = xfs_setfilesize_trans_alloc(ioend); + +out_end_io: + xfs_end_io(&ioend->io_work); + return; +} + +STATIC ssize_t +xfs_vm_direct_IO( + struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct block_device *bdev = xfs_find_bdev_for_inode(inode); + + if (iov_iter_rw(iter) == WRITE) { + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, + xfs_end_io_direct_write, NULL, + DIO_ASYNC_EXTEND); + } + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, NULL, NULL, 0); +} + +/* + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have made it to disk yet + * as the page is still locked at this point. + */ +STATIC void +xfs_vm_kill_delalloc_range( + struct inode *inode, + loff_t start, + loff_t end) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + start_fsb = XFS_B_TO_FSB(ip->i_mount, start); + end_fsb = XFS_B_TO_FSB(ip->i_mount, end); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + +STATIC void +xfs_vm_write_failed( + struct inode *inode, + struct page *page, + loff_t pos, + unsigned len) +{ + loff_t block_offset; + loff_t block_start; + loff_t block_end; + loff_t from = pos & (PAGE_CACHE_SIZE - 1); + loff_t to = from + len; + struct buffer_head *bh, *head; + + /* + * The request pos offset might be 32 or 64 bit, this is all fine + * on 64-bit platform. However, for 64-bit pos request on 32-bit + * platform, the high 32-bit will be masked off if we evaluate the + * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is + * 0xfffff000 as an unsigned long, hence the result is incorrect + * which could cause the following ASSERT failed in most cases. + * In order to avoid this, we can evaluate the block_offset of the + * start of the page by using shifts rather than masks the mismatch + * problem. + */ + block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + + ASSERT(block_offset + from == pos); + + head = page_buffers(page); + block_start = 0; + for (bh = head; bh != head || !block_start; + bh = bh->b_this_page, block_start = block_end, + block_offset += bh->b_size) { + block_end = block_start + bh->b_size; + + /* skip buffers before the write */ + if (block_end <= from) + continue; + + /* if the buffer is after the write, we're done */ + if (block_start >= to) + break; + + if (!buffer_delay(bh)) + continue; + + if (!buffer_new(bh) && block_offset < i_size_read(inode)) + continue; + + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); + } + +} + +/* + * This used to call block_write_begin(), but it unlocks and releases the page + * on error, and we need that page to be able to punch stale delalloc blocks out + * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at + * the appropriate point. + */ +STATIC int +xfs_vm_write_begin( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; + + ASSERT(len <= PAGE_CACHE_SIZE); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (unlikely(status)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + + xfs_vm_write_failed(inode, page, pos, len); + unlock_page(page); + + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } + + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; +} + +/* + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. + */ +STATIC int +xfs_vm_write_end( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned copied, + struct page *page, + void *fsdata) +{ + int ret; + + ASSERT(len <= PAGE_CACHE_SIZE); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (unlikely(ret < len)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + loff_t to = pos + len; + + if (to > isize) { + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; + xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); + } + } + return ret; +} + +STATIC sector_t +xfs_vm_bmap( + struct address_space *mapping, + sector_t block) +{ + struct inode *inode = (struct inode *)mapping->host; + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_vm_bmap(XFS_I(inode)); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + filemap_write_and_wait(mapping); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return generic_block_bmap(mapping, block, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpage( + struct file *unused, + struct page *page) +{ + return mpage_readpage(page, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpages( + struct file *unused, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); +} + +/* + * This is basically a copy of __set_page_dirty_buffers() with one + * small tweak: buffers beyond EOF do not get marked dirty. If we mark them + * dirty, we'll never be able to clean them because we don't write buffers + * beyond EOF, and that means we can't invalidate pages that span EOF + * that have been marked dirty. Further, the dirty state can leak into + * the file interior if the file is extended, resulting in all sorts of + * bad things happening as the state does not match the underlying data. + * + * XXX: this really indicates that bufferheads in XFS need to die. Warts like + * this only exist because of bufferheads and how the generic code manages them. + */ +STATIC int +xfs_vm_set_page_dirty( + struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + loff_t end_offset; + loff_t offset; + int newly_dirty; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + end_offset = i_size_read(inode); + offset = page_offset(page); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + if (offset < end_offset) + set_buffer_dirty(bh); + bh = bh->b_this_page; + offset += 1 << inode->i_blkbits; + } while (bh != head); + } + newly_dirty = !TestSetPageDirty(page); + spin_unlock(&mapping->private_lock); + + if (newly_dirty) { + /* sigh - __set_page_dirty() is static, so copy it here, too */ + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return newly_dirty; +} + +const struct address_space_operations xfs_address_space_operations = { + .readpage = xfs_vm_readpage, + .readpages = xfs_vm_readpages, + .writepage = xfs_vm_writepage, + .writepages = xfs_vm_writepages, + .set_page_dirty = xfs_vm_set_page_dirty, + .releasepage = xfs_vm_releasepage, + .invalidatepage = xfs_vm_invalidatepage, + .write_begin = xfs_vm_write_begin, + .write_end = xfs_vm_write_end, + .bmap = xfs_vm_bmap, + .direct_IO = xfs_vm_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; diff --git a/kernel/fs/xfs/xfs_aops.h b/kernel/fs/xfs/xfs_aops.h new file mode 100644 index 000000000..ac644e013 --- /dev/null +++ b/kernel/fs/xfs/xfs_aops.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2005-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_AOPS_H__ +#define __XFS_AOPS_H__ + +extern mempool_t *xfs_ioend_pool; + +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + XFS_IO_DELALLOC, /* covers delalloc region */ + XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ + XFS_IO_OVERWRITE, /* covers already allocated extent */ +}; + +#define XFS_IO_TYPES \ + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" } + +/* + * xfs_ioend struct manages large extent writes for XFS. + * It can manage several multi-page bio's at once. + */ +typedef struct xfs_ioend { + struct xfs_ioend *io_list; /* next ioend in chain */ + unsigned int io_type; /* delalloc / unwritten */ + int io_error; /* I/O error code */ + atomic_t io_remaining; /* hold count */ + struct inode *io_inode; /* file being written to */ + struct buffer_head *io_buffer_head;/* buffer linked list head */ + struct buffer_head *io_buffer_tail;/* buffer linked list tail */ + size_t io_size; /* size of the extent */ + xfs_off_t io_offset; /* offset in the file */ + struct work_struct io_work; /* xfsdatad work queue */ + struct xfs_trans *io_append_trans;/* xact. for size update */ +} xfs_ioend_t; + +extern const struct address_space_operations xfs_address_space_operations; +extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +extern void xfs_count_page_state(struct page *, int *, int *); + +#endif /* __XFS_AOPS_H__ */ diff --git a/kernel/fs/xfs/xfs_attr.h b/kernel/fs/xfs/xfs_attr.h new file mode 100644 index 000000000..dd4824589 --- /dev/null +++ b/kernel/fs/xfs/xfs_attr.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_H__ +#define __XFS_ATTR_H__ + +struct xfs_inode; +struct xfs_da_args; +struct xfs_attr_list_context; + +/* + * Large attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. + * The internal links in the Btree are logical block offsets into the file. + * + * Small attribute lists use a different format and are packed as tightly + * as possible so as to fit into the literal area of the inode. + */ + +/*======================================================================== + * External interfaces + *========================================================================*/ + + +#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ +#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ +#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ +#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ + +#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ +#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ + +#define XFS_ATTR_FLAGS \ + { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ + { ATTR_ROOT, "ROOT" }, \ + { ATTR_TRUST, "TRUST" }, \ + { ATTR_SECURE, "SECURE" }, \ + { ATTR_CREATE, "CREATE" }, \ + { ATTR_REPLACE, "REPLACE" }, \ + { ATTR_KERNOTIME, "KERNOTIME" }, \ + { ATTR_KERNOVAL, "KERNOVAL" } + +/* + * The maximum size (into the kernel or returned from the kernel) of an + * attribute value or the buffer used for an attr_list() call. Larger + * sizes will result in an ERANGE return code. + */ +#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ + +/* + * Define how lists of attribute names are returned to the user from + * the attr_list() call. A large, 32bit aligned, buffer is passed in + * along with its size. We put an array of offsets at the top that each + * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. + */ +typedef struct attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +} attrlist_t; + +/* + * Show the interesting info about one attribute. This is what the + * al_offset[i] entry points to. + */ +typedef struct attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +} attrlist_ent_t; + +/* + * Given a pointer to the (char*) buffer containing the attr_list() result, + * and an index, return a pointer to the indicated attribute in the buffer. + */ +#define ATTR_ENTRY(buffer, index) \ + ((attrlist_ent_t *) \ + &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) + +/* + * Kernel-internal version of the attrlist cursor. + */ +typedef struct attrlist_cursor_kern { + __u32 hashval; /* hash value of next entry to add */ + __u32 blkno; /* block containing entry (suggestion) */ + __u32 offset; /* offset in list of equal-hashvals */ + __u16 pad1; /* padding to match user-level */ + __u8 pad2; /* padding to match user-level */ + __u8 initted; /* T/F: cursor has been initialized */ +} attrlist_cursor_kern_t; + + +/*======================================================================== + * Structure used to pass context around among the routines. + *========================================================================*/ + + +typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, + unsigned char *, int, int, unsigned char *); + +typedef struct xfs_attr_list_context { + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + char *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + int put_value; /* T/F: need value for listent */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +} xfs_attr_list_context_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Overall external interface routines. + */ +int xfs_attr_inactive(struct xfs_inode *dp); +int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_inode_hasattr(struct xfs_inode *ip); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); + + +#endif /* __XFS_ATTR_H__ */ diff --git a/kernel/fs/xfs/xfs_attr_inactive.c b/kernel/fs/xfs/xfs_attr_inactive.c new file mode 100644 index 000000000..3fbf167cf --- /dev/null +++ b/kernel/fs/xfs/xfs_attr_inactive.c @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_attr_remote.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_dir2.h" + +/* + * Look at all the extents for this logical region, + * invalidate any buffers that are incore/in transactions. + */ +STATIC int +xfs_attr3_leaf_freextent( + struct xfs_trans **trans, + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) +{ + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_dablk_t tblkno; + xfs_daddr_t dblkno; + int tblkcnt; + int dblkcnt; + int nmap; + int error; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + tblkno = blkno; + tblkcnt = blkcnt; + while (tblkcnt > 0) { + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) { + return error; + } + ASSERT(nmap == 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* + * If it's a hole, these are already unmapped + * so there's nothing to invalidate. + */ + if (map.br_startblock != HOLESTARTBLOCK) { + + dblkno = XFS_FSB_TO_DADDR(dp->i_mount, + map.br_startblock); + dblkcnt = XFS_FSB_TO_BB(dp->i_mount, + map.br_blockcount); + bp = xfs_trans_get_buf(*trans, + dp->i_mount->m_ddev_targp, + dblkno, dblkcnt, 0); + if (!bp) + return -ENOMEM; + xfs_trans_binval(*trans, bp); + /* + * Roll to next transaction. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return error; + } + + tblkno += map.br_blockcount; + tblkcnt -= map.br_blockcount; + } + + return 0; +} + +/* + * Invalidate all of the "remote" value regions pointed to by a particular + * leaf block. + * Note that we must release the lock on the buffer so that we are not + * caught holding something that the logging code wants to flush to disk. + */ +STATIC int +xfs_attr3_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_attr_inactive_list *list; + struct xfs_attr_inactive_list *lp; + int error; + int count; + int size; + int tmp; + int i; + struct xfs_mount *mp = bp->b_target->bt_mount; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + + /* + * Count the number of "remote" value extents. + */ + count = 0; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) + count++; + } + } + + /* + * If there are no "remote" values, we're done. + */ + if (count == 0) { + xfs_trans_brelse(*trans, bp); + return 0; + } + + /* + * Allocate storage for a list of all the "remote" value extents. + */ + size = count * sizeof(xfs_attr_inactive_list_t); + list = kmem_alloc(size, KM_SLEEP); + + /* + * Identify each of the "remote" value extents. + */ + lp = list; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) { + lp->valueblk = be32_to_cpu(name_rmt->valueblk); + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + lp++; + } + } + } + xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + + /* + * Invalidate each of the "remote" value extents. + */ + error = 0; + for (lp = list, i = 0; i < count; i++, lp++) { + tmp = xfs_attr3_leaf_freextent(trans, dp, + lp->valueblk, lp->valuelen); + + if (error == 0) + error = tmp; /* save only the 1st errno */ + } + + kmem_free(list); + return error; +} + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +STATIC int +xfs_attr3_node_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) +{ + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + int error, i; + struct xfs_buf *child_bp; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr ichdr; + + /* + * Since this code is recursive (gasp!) we must protect ourselves. + */ + if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + return -EIO; + } + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&ichdr, node); + parent_blkno = bp->b_bn; + if (!ichdr.count) { + xfs_trans_brelse(*trans, bp); + return 0; + } + btree = dp->d_ops->node_tree_p(node); + child_fsb = be32_to_cpu(btree[0].before); + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + + /* + * If this is the node level just above the leaves, simply loop + * over the leaves removing all of them. If this is higher up + * in the tree, recurse downward. + */ + for (i = 0; i < ichdr.count; i++) { + /* + * Read the subsidiary block to see what we have to work with. + * Don't do this in a transaction. This is a depth-first + * traversal of the tree so we may deal with many blocks + * before we come back to this one. + */ + error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); + if (error) + return error; + if (child_bp) { + /* save for re-read later */ + child_blkno = XFS_BUF_ADDR(child_bp); + + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, + child_bp, level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, + child_bp); + break; + default: + error = -EIO; + xfs_trans_brelse(*trans, child_bp); + break; + } + if (error) + return error; + + /* + * Remove the subsidiary block from the cache + * and from the log. + */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, + &child_bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, child_bp); + } + + /* + * If we're not done, re-read the parent to get the next + * child block number. + */ + if (i + 1 < ichdr.count) { + error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); + if (error) + return error; + child_fsb = be32_to_cpu(btree[i + 1].before); + xfs_trans_brelse(*trans, bp); + } + /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return error; + } + + return 0; +} + +/* + * Indiscriminately delete the entire attribute fork + * + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr3_root_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + xfs_daddr_t blkno; + int error; + + /* + * Read block 0 to see what we have to work with. + * We only get here if we have extents, since we remove + * the extents in reverse order the extent containing + * block 0 must still be there. + */ + error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return error; + blkno = bp->b_bn; + + /* + * Invalidate the tree, even if the "tree" is only a single leaf block. + * This is a depth-first traversal! + */ + info = bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, bp, 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, bp); + break; + default: + error = -EIO; + xfs_trans_brelse(*trans, bp); + break; + } + if (error) + return error; + + /* + * Invalidate the incore copy of the root block. + */ + error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, bp); /* remove from cache */ + /* + * Commit the invalidate and start the next transaction. + */ + error = xfs_trans_roll(trans, dp); + + return error; +} + +/* + * xfs_attr_inactive kills all traces of an attribute fork on an inode. It + * removes both the on-disk and in-memory inode fork. Note that this also has to + * handle the condition of inodes without attributes but with an attribute fork + * configured, so we can't use xfs_inode_hasattr() here. + * + * The in-memory attribute fork is removed even on error. + */ +int +xfs_attr_inactive( + struct xfs_inode *dp) +{ + struct xfs_trans *trans; + struct xfs_mount *mp; + int cancel_flags = 0; + int lock_mode = XFS_ILOCK_SHARED; + int error = 0; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, lock_mode); + if (!XFS_IFORK_Q(dp)) + goto out_destroy_fork; + xfs_iunlock(dp, lock_mode); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + lock_mode = 0; + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + if (error) + goto out_cancel; + + lock_mode = XFS_ILOCK_EXCL; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; + xfs_ilock(dp, lock_mode); + + if (!XFS_IFORK_Q(dp)) + goto out_cancel; + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(trans, dp, 0); + + /* invalidate and truncate the attribute fork extents */ + if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out_cancel; + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out_cancel; + } + + /* Reset the attribute fork - this also destroys the in-core fork */ + xfs_attr_fork_remove(dp, trans); + + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, lock_mode); + return error; + +out_cancel: + xfs_trans_cancel(trans, cancel_flags); +out_destroy_fork: + /* kill the in-core attr fork before we drop the inode lock */ + if (dp->i_afp) + xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (lock_mode) + xfs_iunlock(dp, lock_mode); + return error; +} diff --git a/kernel/fs/xfs/xfs_attr_list.c b/kernel/fs/xfs/xfs_attr_list.c new file mode 100644 index 000000000..65fb37a18 --- /dev/null +++ b/kernel/fs/xfs/xfs_attr_list.c @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" +#include "xfs_dir2.h" + +STATIC int +xfs_attr_shortform_compare(const void *a, const void *b) +{ + xfs_attr_sf_sort_t *sa, *sb; + + sa = (xfs_attr_sf_sort_t *)a; + sb = (xfs_attr_sf_sort_t *)b; + if (sa->hash < sb->hash) { + return -1; + } else if (sa->hash > sb->hash) { + return 1; + } else { + return sa->entno - sb->entno; + } +} + +#define XFS_ISRESET_CURSOR(cursor) \ + (!((cursor)->initted) && !((cursor)->hashval) && \ + !((cursor)->blkno) && !((cursor)->offset)) +/* + * Copy out entries of shortform attribute lists for attr_list(). + * Shortform attribute lists are not stored in hashval sorted order. + * If the output buffer is not large enough to hold them all, then we + * we have to calculate each entries' hashvalue and sort them before + * we can begin returning them to the user. + */ +int +xfs_attr_shortform_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_sf_sort_t *sbuf, *sbp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_inode_t *dp; + int sbsize, nsbuf, count, i; + int error; + + ASSERT(context != NULL); + dp = context->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_afp != NULL); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + ASSERT(sf != NULL); + if (!sf->hdr.count) + return 0; + cursor = context->cursor; + ASSERT(cursor != NULL); + + trace_xfs_attr_list_sf(context); + + /* + * If the buffer is large enough and the cursor is at the start, + * do not bother with sorting since we will return everything in + * one buffer and another call using the cursor won't need to be + * made. + * Note the generous fudge factor of 16 overhead bytes per entry. + * If bufsize is zero then put_listent must be a search function + * and can just scan through what we have. + */ + if (context->bufsize == 0 || + (XFS_ISRESET_CURSOR(cursor) && + (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + error = context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen, + &sfe->nameval[sfe->namelen]); + + /* + * Either search callback finished early or + * didn't fit it all in the buffer after all. + */ + if (context->seen_enough) + break; + + if (error) + return error; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + trace_xfs_attr_list_sf_all(context); + return 0; + } + + /* do no more for a search callback */ + if (context->bufsize == 0) + return 0; + + /* + * It didn't all fit, so we have to sort everything on hashval. + */ + sbsize = sf->hdr.count * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. + */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe); + kmem_free(sbuf); + return -EFSCORRUPTED; + } + + sbp->entno = i; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. + */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (sbp->hash == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (sbp->hash > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf); + return 0; + } + + /* + * Loop putting entries into the user buffer. + */ + for ( ; i < nsbuf; i++, sbp++) { + if (cursor->hashval != sbp->hash) { + cursor->hashval = sbp->hash; + cursor->offset = 0; + } + error = context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; + cursor->offset++; + } + + kmem_free(sbuf); + return 0; +} + +STATIC int +xfs_attr_node_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int error, i; + struct xfs_buf *bp; + struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; + + trace_xfs_attr_node_list(context); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Do all sorts of validation on the passed-in cursor structure. + * If anything is amiss, ignore the cursor and look up the hashval + * starting from the btree root. + */ + bp = NULL; + if (cursor->blkno > 0) { + error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if ((error != 0) && (error != -EFSCORRUPTED)) + return error; + if (bp) { + struct xfs_attr_leaf_entry *entries; + + node = bp->b_addr; + switch (be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, + &leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + } + } + + /* + * We did not find what we expected given the cursor's contents, + * so we start from the top and work down based on the hash value. + * Note that start of node block is same as start of leaf block. + */ + if (bp == NULL) { + cursor->blkno = 0; + for (;;) { + __uint16_t magic; + + error = xfs_da3_node_read(NULL, dp, + cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return error; + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, + node); + xfs_trans_brelse(NULL, bp); + return -EFSCORRUPTED; + } + + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { + if (cursor->hashval + <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + if (i == nodehdr.count) { + xfs_trans_brelse(NULL, bp); + return 0; + } + xfs_trans_brelse(NULL, bp); + } + } + ASSERT(bp != NULL); + + /* + * Roll upward through the blocks, processing each leaf block in + * order. As long as there is space in the result buffer, keep + * adding the information. + */ + for (;;) { + leaf = bp->b_addr; + error = xfs_attr3_leaf_list_int(bp, context); + if (error) { + xfs_trans_brelse(NULL, bp); + return error; + } + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + if (context->seen_enough || leafhdr.forw == 0) + break; + cursor->blkno = leafhdr.forw; + xfs_trans_brelse(NULL, bp); + error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp); + if (error) + return error; + } + xfs_trans_brelse(NULL, bp); + return 0; +} + +/* + * Copy out attribute list entries for attr_list(), for leaf attribute lists. + */ +int +xfs_attr3_leaf_list_int( + struct xfs_buf *bp, + struct xfs_attr_list_context *context) +{ + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *entry; + int retval; + int i; + struct xfs_mount *mp = context->dp->i_mount; + + trace_xfs_attr_list_leaf(context); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Re-find our place in the leaf block if this is a new syscall. + */ + if (context->resynch) { + entry = &entries[0]; + for (i = 0; i < ichdr.count; entry++, i++) { + if (be32_to_cpu(entry->hashval) == cursor->hashval) { + if (cursor->offset == context->dupcnt) { + context->dupcnt = 0; + break; + } + context->dupcnt++; + } else if (be32_to_cpu(entry->hashval) > + cursor->hashval) { + context->dupcnt = 0; + break; + } + } + if (i == ichdr.count) { + trace_xfs_attr_list_notfound(context); + return 0; + } + } else { + entry = &entries[0]; + i = 0; + } + context->resynch = 0; + + /* + * We have found our place, start copying out the new attributes. + */ + retval = 0; + for (; i < ichdr.count; entry++, i++) { + if (be32_to_cpu(entry->hashval) != cursor->hashval) { + cursor->hashval = be32_to_cpu(entry->hashval); + cursor->offset = 0; + } + + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* skip incomplete entries */ + + if (entry->flags & XFS_ATTR_LOCAL) { + xfs_attr_leaf_name_local_t *name_loc = + xfs_attr3_leaf_name_local(leaf, i); + + retval = context->put_listent(context, + entry->flags, + name_loc->nameval, + (int)name_loc->namelen, + be16_to_cpu(name_loc->valuelen), + &name_loc->nameval[name_loc->namelen]); + if (retval) + return retval; + } else { + xfs_attr_leaf_name_remote_t *name_rmt = + xfs_attr3_leaf_name_remote(leaf, i); + + int valuelen = be32_to_cpu(name_rmt->valuelen); + + if (context->put_value) { + xfs_da_args_t args; + + memset((char *)&args, 0, sizeof(args)); + args.geo = context->dp->i_mount->m_attr_geo; + args.dp = context->dp; + args.whichfork = XFS_ATTR_FORK; + args.valuelen = valuelen; + args.rmtvaluelen = valuelen; + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); + args.rmtblkno = be32_to_cpu(name_rmt->valueblk); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); + retval = xfs_attr_rmtval_get(&args); + if (retval) + return retval; + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); + kmem_free(args.value); + } else { + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + NULL); + } + if (retval) + return retval; + } + if (context->seen_enough) + break; + cursor->offset++; + } + trace_xfs_attr_list_leaf_end(context); + return retval; +} + +/* + * Copy out attribute entries for attr_list(), for leaf attribute lists. + */ +STATIC int +xfs_attr_leaf_list(xfs_attr_list_context_t *context) +{ + int error; + struct xfs_buf *bp; + + trace_xfs_attr_leaf_list(context); + + context->cursor->blkno = 0; + error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); + if (error) + return error; + + error = xfs_attr3_leaf_list_int(bp, context); + xfs_trans_brelse(NULL, bp); + return error; +} + +int +xfs_attr_list_int( + xfs_attr_list_context_t *context) +{ + int error; + xfs_inode_t *dp = context->dp; + uint lock_mode; + + XFS_STATS_INC(xs_attr_list); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + /* + * Decide on what work routines to call based on the inode size. + */ + lock_mode = xfs_ilock_attr_map_shared(dp); + if (!xfs_inode_hasattr(dp)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(context); + } else { + error = xfs_attr_node_list(context); + } + xfs_iunlock(dp, lock_mode); + return error; +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +STATIC int +xfs_attr_put_listent( + xfs_attr_list_context_t *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + struct attrlist *alist = (struct attrlist *)context->alist; + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. + */ + if (((context->flags & ATTR_SECURE) == 0) != + ((flags & XFS_ATTR_SECURE) == 0)) + return 0; + if (((context->flags & ATTR_ROOT) == 0) != + ((flags & XFS_ATTR_ROOT) == 0)) + return 0; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return 1; + } + + aep = (attrlist_ent_t *)&context->alist[context->firstu]; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); + return 0; +} + +/* + * Generate a list of extended attribute names and optionally + * also value lengths. Positive return value follows the XFS + * convention of being an error, zero or negative return code + * is the length of the buffer returned (negated), indicating + * success. + */ +int +xfs_attr_list( + xfs_inode_t *dp, + char *buffer, + int bufsize, + int flags, + attrlist_cursor_kern_t *cursor) +{ + xfs_attr_list_context_t context; + struct attrlist *alist; + int error; + + /* + * Validate the cursor. + */ + if (cursor->pad1 || cursor->pad2) + return -EINVAL; + if ((cursor->initted == 0) && + (cursor->hashval || cursor->blkno || cursor->offset)) + return -EINVAL; + + /* + * Check for a properly aligned buffer. + */ + if (((long)buffer) & (sizeof(int)-1)) + return -EFAULT; + if (flags & ATTR_KERNOVAL) + bufsize = 0; + + /* + * Initialize the output buffer. + */ + memset(&context, 0, sizeof(context)); + context.dp = dp; + context.cursor = cursor; + context.resynch = 1; + context.flags = flags; + context.alist = buffer; + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.put_listent = xfs_attr_put_listent; + + alist = (struct attrlist *)context.alist; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list_int(&context); + ASSERT(error <= 0); + return error; +} diff --git a/kernel/fs/xfs/xfs_bit.c b/kernel/fs/xfs/xfs_bit.c new file mode 100644 index 000000000..0e8885a59 --- /dev/null +++ b/kernel/fs/xfs/xfs_bit.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_log_format.h" +#include "xfs_bit.h" + +/* + * XFS bit manipulation routines, used in non-realtime code. + */ + +/* + * Return whether bitmap is empty. + * Size is number of words in the bitmap, which is padded to word boundary + * Returns 1 for empty, 0 for non-empty. + */ +int +xfs_bitmap_empty(uint *map, uint size) +{ + uint i; + uint ret = 0; + + for (i = 0; i < size; i++) { + ret |= map[i]; + } + + return (ret == 0); +} + +/* + * Count the number of contiguous bits set in the bitmap starting with bit + * start_bit. Size is the size of the bitmap in words. + */ +int +xfs_contig_bits(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = 0; + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + ASSERT(start_bit < size); + size -= start_bit & ~(NBWORD - 1); + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to one first offset bits prior to start */ + tmp |= (~0U >> (NBWORD-start_bit)); + if (tmp != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return result - start_bit; +found: + return result + ffz(tmp) - start_bit; +} + +/* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + * + * Size is the number of words, not bytes, in the bitmap. + */ +int xfs_next_bit(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = start_bit & ~(NBWORD - 1); + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + if (start_bit >= size) + return -1; + size -= result; + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to zero first offset bits prior to start */ + tmp &= (~0U << start_bit); + if (tmp != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return -1; +found: + return result + ffs(tmp) - 1; +} diff --git a/kernel/fs/xfs/xfs_bmap_util.c b/kernel/fs/xfs/xfs_bmap_util.c new file mode 100644 index 000000000..a52bbd3ab --- /dev/null +++ b/kernel/fs/xfs/xfs_bmap_util.c @@ -0,0 +1,1920 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_extfree_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" + +/* Kernel only BMAP related definitions and functions */ + +/* + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. + */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + struct xfs_trans_res tres; /* new log reservation */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + + tres.tr_logres = ntp->t_log_res; + tres.tr_logcount = ntp->t_log_count; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. + */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + error = xfs_trans_reserve(ntp, &tres, 0, 0); + if (error) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == -EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount point structure */ + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_rtblock_t rtb; + + mp = ap->ip->i_mount; + align = xfs_get_extsz_hint(ap->ip); + prod = align / mp->m_sb.sb_rextsize; + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); + + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + if (do_mod(ap->offset, align) || ap->length % align) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + */ + ralen = ap->length / mp->m_sb.sb_rextsize; + /* + * If the old value was close enough to MAXEXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * MAXEXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) + ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + + /* + * Lock out other modifications to the RT bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area. + */ + if (ap->eof && ap->offset == 0) { + xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->blkno = rtx * mp->m_sb.sb_rextsize; + } else { + ap->blkno = 0; + } + + xfs_bmap_adjacent(ap); + + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, + &ralen, atype, ap->wasdel, prod, &rtb))) + return error; + if (rtb == NULLFSBLOCK && prod > 1 && + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, + ap->wasdel, 1, &rtb))) + return error; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; + ralen *= mp->m_sb.sb_rextsize; + ap->length = ralen; + ap->ip->i_d.di_nblocks += ralen; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= ralen; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + } else { + ap->length = 0; + } + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Extent tree block counting routines. + */ + +/* + * Count leaf blocks given a range of extent records. + */ +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count) +{ + int b; + + for (b = 0; b < numrecs; b++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); + *count += xfs_bmbt_get_blockcount(frp); + } +} + +/* + * Count leaf blocks given a range of extent records originally + * in btree format. + */ +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count) +{ + int b; + xfs_bmbt_rec_t *frp; + + for (b = 1; b <= numrecs; b++) { + frp = XFS_BMBT_REC_ADDR(mp, block, b); + *count += xfs_bmbt_disk_get_blockcount(frp); + } +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks in use. + */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLFSBLOCK); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * returns 1 for success, 0 if we failed to map the extent. + */ +STATIC int +xfs_getbmapx_fix_eof_hole( + xfs_inode_t *ip, /* xfs incore inode pointer */ + struct getbmapx *out, /* output structure */ + int prealloced, /* this is a file with + * preallocated data space */ + __int64_t end, /* last block requested */ + xfs_fsblock_t startblock) +{ + __int64_t fixlen; + xfs_mount_t *mp; /* file system mount point */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent pointer */ + xfs_fileoff_t fileblock; + + if (startblock == HOLESTARTBLOCK) { + mp = ip->i_mount; + out->bmv_block = -1; + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); + fixlen -= out->bmv_offset; + if (prealloced && out->bmv_offset + out->bmv_length == end) { + /* Came to hole at EOF. Trim it. */ + if (fixlen <= 0) + return 0; + out->bmv_length = fixlen; + } + } else { + if (startblock == DELAYSTARTBLOCK) + out->bmv_block = -2; + else + out->bmv_block = xfs_fsb_to_db(ip, startblock); + fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + out->bmv_oflags |= BMV_OF_LAST; + } + + return 1; +} + +/* + * Get inode's extents as described in bmv, and format for output. + * Calls formatter to fill the user's buffer until all extents + * are mapped, until the passed-in bmv->bmv_count slots have + * been filled, or until the formatter short-circuits the loop, + * if it is tracking filled-in extents on its own. + */ +int /* error code */ +xfs_getbmap( + xfs_inode_t *ip, + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg) /* formatter arg */ +{ + __int64_t bmvend; /* last block requested */ + int error = 0; /* return value */ + __int64_t fixlen; /* length for -1 case */ + int i; /* extent number */ + int lock; /* lock state */ + xfs_bmbt_irec_t *map; /* buffer for user's data */ + xfs_mount_t *mp; /* file system mount point */ + int nex; /* # of user extents can do */ + int nexleft; /* # of user extents left */ + int subnex; /* # of bmapi's can do */ + int nmap; /* number of map entries */ + struct getbmapx *out; /* output structure */ + int whichfork; /* data or attr fork */ + int prealloced; /* this is a file with + * preallocated data space */ + int iflags; /* interface flags */ + int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; + + mp = ip->i_mount; + iflags = bmv->bmv_iflags; + whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; + + if (whichfork == XFS_ATTR_FORK) { + if (XFS_IFORK_Q(ip)) { + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && + ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return -EINVAL; + } else if (unlikely( + ip->i_d.di_aformat != 0 && + ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, + ip->i_mount); + return -EFSCORRUPTED; + } + + prealloced = 0; + fixlen = 1LL << 32; + } else { + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return -EINVAL; + + if (xfs_get_extsz_hint(ip) || + ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ + prealloced = 1; + fixlen = mp->m_super->s_maxbytes; + } else { + prealloced = 0; + fixlen = XFS_ISIZE(ip); + } + } + + if (bmv->bmv_length == -1) { + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { + bmv->bmv_entries = 0; + return 0; + } else if (bmv->bmv_length < 0) { + return -EINVAL; + } + + nex = bmv->bmv_count - 1; + if (nex <= 0) + return -EINVAL; + bmvend = bmv->bmv_offset + bmv->bmv_length; + + + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return -ENOMEM; + out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); + if (!out) + return -ENOMEM; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (whichfork == XFS_DATA_FORK) { + if (!(iflags & BMV_IF_DELALLOC) && + (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out_unlock_iolock; + + /* + * Even after flushing the inode, there can still be + * delalloc blocks on the inode beyond EOF due to + * speculative preallocation. These are not removed + * until the release function is called or the inode + * is inactivated. Hence we cannot assert here that + * ip->i_delayed_blks == 0. + */ + } + + lock = xfs_ilock_data_map_shared(ip); + } else { + lock = xfs_ilock_attr_map_shared(ip); + } + + /* + * Don't let nex be bigger than the number of extents + * we can have assuming alternating holes and real extents. + */ + if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) + nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; + + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + /* + * Allocate enough space to handle "subnex" maps at a time. + */ + error = -ENOMEM; + subnex = 16; + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); + if (!map) + goto out_unlock_ilock; + + bmv->bmv_entries = 0; + + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; + } + + nexleft = nex; + + do { + nmap = (nexleft > subnex) ? subnex : nexleft; + error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + map, &nmap, bmapi_flags); + if (error) + goto out_free_map; + ASSERT(nmap <= subnex); + + for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + out[cur_ext].bmv_oflags = 0; + if (map[i].br_state == XFS_EXT_UNWRITTEN) + out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; + else if (map[i].br_startblock == DELAYSTARTBLOCK) + out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; + out[cur_ext].bmv_offset = + XFS_FSB_TO_BB(mp, map[i].br_startoff); + out[cur_ext].bmv_length = + XFS_FSB_TO_BB(mp, map[i].br_blockcount); + out[cur_ext].bmv_unused1 = 0; + out[cur_ext].bmv_unused2 = 0; + + /* + * delayed allocation extents that start beyond EOF can + * occur due to speculative EOF allocation when the + * delalloc extent is larger than the largest freespace + * extent at conversion time. These extents cannot be + * converted by data writeback, so can exist here even + * if we are not supposed to be finding delalloc + * extents. + */ + if (map[i].br_startblock == DELAYSTARTBLOCK && + map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + + if (map[i].br_startblock == HOLESTARTBLOCK && + whichfork == XFS_ATTR_FORK) { + /* came to the end of attribute fork */ + out[cur_ext].bmv_oflags |= BMV_OF_LAST; + goto out_free_map; + } + + if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], + prealloced, bmvend, + map[i].br_startblock)) + goto out_free_map; + + bmv->bmv_offset = + out[cur_ext].bmv_offset + + out[cur_ext].bmv_length; + bmv->bmv_length = + max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; + bmv->bmv_entries++; + cur_ext++; + } + } while (nmap && nexleft && bmv->bmv_length); + + out_free_map: + kmem_free(map); + out_unlock_ilock: + xfs_iunlock(ip, lock); + out_unlock_iolock: + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + for (i = 0; i < cur_ext; i++) { + int full = 0; /* user array is full */ + + /* format results & advance arg */ + error = formatter(&arg, &out[i], &full); + if (error || full) + break; + } + + kmem_free(out); + return error; +} + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will always punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only. + */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VFS_I(ip)->i_mapping->nrpages == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + +/* + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. + */ +int +xfs_free_eofblocks( + xfs_mount_t *mp, + xfs_inode_t *ip, + bool need_iolock) +{ + xfs_trans_t *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; + + /* + * Figure out if there are any blocks beyond the end + * of the file. If not, then there is nothing to do. + */ + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + if (last_fsb <= end_fsb) + return 0; + map_len = last_fsb - end_fsb; + + nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!error && (nimaps != 0) && + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { + /* + * Attach the dquots to the inode up front. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * There are blocks after the end of file. + * Free them up now by truncating the file to + * its current size. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + if (need_iolock) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return -EAGAIN; + } + } + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Do not update the on-disk file size. If we update the + * on-disk file size and then the system crashes before the + * contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip)); + if (error) { + /* + * If we get an error at this point we simply don't + * bother truncating the file. + */ + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } else { + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + return error; +} + +int +xfs_alloc_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int alloc_type) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; + xfs_filblks_t allocated_fsb; + xfs_filblks_t allocatesize_fsb; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + trace_xfs_alloc_file_space(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + if (len <= 0) + return -EINVAL; + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + count = len; + imapp = &imaps[0]; + nimaps = 1; + startoffset_fsb = XFS_B_TO_FSBT(mp, offset); + allocatesize_fsb = XFS_B_TO_FSB(mp, count); + + /* + * Allocate file space until done or until there is an error + */ + while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + + /* + * Determine space reservations for data/realtime. + */ + if (unlikely(extsz)) { + s = startoffset_fsb; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; + } else { + s = 0; + e = allocatesize_fsb; + } + + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + if (unlikely(rt)) { + resrtextents = qblocks = resblks; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); + /* + * Check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, + 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); + if (error) { + goto error0; + } + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + break; + } + + allocated_fsb = imapp->br_blockcount; + + if (nimaps == 0) { + error = -ENOSPC; + break; + } + + startoffset_fsb += allocated_fsb; + allocatesize_fsb -= allocated_fsb; + } + + return error; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Zero file bytes between startoff and endoff inclusive. + * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. + */ +STATIC int +xfs_zero_remaining_bytes( + xfs_inode_t *ip, + xfs_off_t startoff, + xfs_off_t endoff) +{ + xfs_bmbt_irec_t imap; + xfs_fileoff_t offset_fsb; + xfs_off_t lastoffset; + xfs_off_t offset; + xfs_buf_t *bp; + xfs_mount_t *mp = ip->i_mount; + int nimap; + int error = 0; + + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= XFS_ISIZE(ip)) + return 0; + + if (endoff > XFS_ISIZE(ip)) + endoff = XFS_ISIZE(ip); + + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + uint lock_mode; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimap = 1; + + lock_mode = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + xfs_iunlock(ip, lock_mode); + + if (error || nimap < 1) + break; + ASSERT(imap.br_blockcount >= 1); + ASSERT(imap.br_startoff == offset_fsb); + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; + if (lastoffset > endoff) + lastoffset = endoff; + if (imap.br_startblock == HOLESTARTBLOCK) + continue; + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + if (imap.br_state == XFS_EXT_UNWRITTEN) + continue; + + error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + xfs_fsb_to_db(ip, imap.br_startblock), + BTOBB(mp->m_sb.sb_blocksize), + 0, &bp, NULL); + if (error) + return error; + + memset(bp->b_addr + + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + return error; + } + return error; +} + +int +xfs_free_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int committed; + int done; + xfs_fileoff_t endoffset_fsb; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_bmbt_irec_t imap; + xfs_off_t ioffset; + xfs_off_t iendoffset; + xfs_extlen_t mod=0; + xfs_mount_t *mp; + int nimap; + uint resblks; + xfs_off_t rounding; + int rt; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + + mp = ip->i_mount; + + trace_xfs_free_file_space(ip); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + error = 0; + if (len <= 0) /* if nothing being freed */ + return error; + rt = XFS_IS_REALTIME_INODE(ip); + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(VFS_I(ip)); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + ioffset = round_down(offset, rounding); + iendoffset = round_up(offset + len, rounding) - 1; + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, + iendoffset); + if (error) + goto out; + truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); + + /* + * Need to zero the stuff we're not freeing, on disk. + * If it's a realtime file & can't use unwritten extents then we + * actually need to zero the extent edges. Otherwise xfs_bunmapi + * will take care of it for us. + */ + if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + nimap = 1; + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); + if (error) + goto out; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + nimap = 1; + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); + if (error) + goto out; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && (mod != mp->m_sb.sb_rextsize)) + endoffset_fsb -= mod; + } + } + if ((done = (endoffset_fsb <= startoffset_fsb))) + /* + * One contiguous piece to clear + */ + error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); + else { + /* + * Some full blocks, possibly two pieces to clear + */ + if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) + error = xfs_zero_remaining_bytes(ip, offset, + XFS_FSB_TO_B(mp, startoffset_fsb) - 1); + if (!error && + XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) + error = xfs_zero_remaining_bytes(ip, + XFS_FSB_TO_B(mp, endoffset_fsb), + offset + len - 1); + } + + /* + * free file space until done or until there is an error + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + while (!error && !done) { + + /* + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, + ip->i_udquot, ip->i_gdquot, ip->i_pdquot, + resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + /* + * issue the bunmapi() call to free the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, + 0, 2, &firstfsb, &free_list, &done); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + out: + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out; +} + +/* + * Preallocate and zero a range of a file. This mechanism has the allocation + * semantics of fallocate and in addition converts data in the range to zeroes. + */ +int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + uint blksize; + int error; + + trace_xfs_zero_file_space(ip); + + blksize = 1 << mp->m_sb.sb_blocklog; + + /* + * Punch a hole and prealloc the range. We use hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued + * by virtue of the hole punch. + */ + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out; + + error = xfs_alloc_file_space(ip, round_down(offset, blksize), + round_up(offset + len, blksize) - + round_down(offset, blksize), + XFS_BMAPI_PREALLOC); +out: + return error; + +} + +/* + * @next_fsb will keep track of the extent currently undergoing shift. + * @stop_fsb will keep track of the extent at which we have to stop. + * If we are shifting left, we will start with block (offset + len) and + * shift each extent till last extent. + * If we are shifting right, we will start with last extent inside file space + * and continue until we reach the block corresponding to offset. + */ +static int +xfs_shift_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + enum shift_direction direction) +{ + int done = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + int committed; + xfs_fileoff_t stop_fsb; + xfs_fileoff_t next_fsb; + xfs_fileoff_t shift_fsb; + + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + + if (direction == SHIFT_LEFT) { + next_fsb = XFS_B_TO_FSB(mp, offset + len); + stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + } else { + /* + * If right shift, delegate the work of initialization of + * next_fsb to xfs_bmap_shift_extent as it has ilock held. + */ + next_fsb = NULLFSBLOCK; + stop_fsb = XFS_B_TO_FSB(mp, offset); + } + + shift_fsb = XFS_B_TO_FSB(mp, len); + + /* + * Trim eofblocks to avoid shifting uninitialized post-eof preallocation + * into the accessible region of the file. + */ + if (xfs_can_free_eofblocks(ip, true)) { + error = xfs_free_eofblocks(mp, ip, false); + if (error) + return error; + } + + /* + * Writeback and invalidate cache for the remainder of the file as we're + * about to shift down every extent from offset to EOF. + */ + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + offset, -1); + if (error) + return error; + error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + offset >> PAGE_CACHE_SHIFT, -1); + if (error) + return error; + + /* + * The extent shiting code works on extent granularity. So, if + * stop_fsb is not the starting block of extent, we need to split + * the extent at stop_fsb. + */ + if (direction == SHIFT_RIGHT) { + error = xfs_bmap_split_extent(ip, stop_fsb); + if (error) + return error; + } + + while (!error && !done) { + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + /* + * We would need to reserve permanent block for transaction. + * This will come into picture when after shifting extent into + * hole we found that adjacent extents can be merged which + * may lead to freeing of a block during record update. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + break; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out; + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&free_list, &first_block); + + /* + * We are using the write transaction in which max 2 bmbt + * updates are allowed + */ + error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &free_list, + direction, XFS_BMAP_MAX_SHIFT_EXTENTS); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + } + + return error; + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + return error; +} + +/* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_collapse_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT); +} + +/* + * xfs_insert_file_space() + * This routine create hole space by shifting extents for the given file. + * The first thing we do is to sync dirty data and invalidate page cache + * over the region on which insert range is working. And split an extent + * to two extents at given offset by calling xfs_bmap_split_extent. + * And shift all extent records which are laying between [offset, + * last allocated extent] to the right to reserve hole range. + * RETURNS: + * 0 on success + * errno on error + */ +int +xfs_insert_file_space( + struct xfs_inode *ip, + loff_t offset, + loff_t len) +{ + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_insert_file_space(ip); + + return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT); +} + +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6. If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. + */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return -EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return -EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return -EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return -EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return -EINVAL; + + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. + * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(ip) && + XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) + return -EINVAL; + if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return -EINVAL; + } + + /* Reciprocal target->temp btree format checks */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(tip) && + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) + return -EINVAL; + if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return -EINVAL; + } + + return 0; +} + +static int +xfs_swap_extent_flush( + struct xfs_inode *ip) +{ + int error; + + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + return error; + truncate_pagecache_range(VFS_I(ip), 0, -1); + + /* Verify O_DIRECT for ftmp */ + if (VFS_I(ip)->i_mapping->nrpages) + return -EINVAL; + return 0; +} + +int +xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + xfs_bstat_t *sbp = &sxp->sx_stat; + xfs_ifork_t *tempifp, *ifp, *tifp; + int src_log_flags, target_log_flags; + int error = 0; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + int lock_flags; + + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); + if (!tempifp) { + error = -ENOMEM; + goto out; + } + + /* + * Lock the inodes against other IO, page faults and truncate to + * begin with. Then we can ensure the inodes are flushed and have no + * page cache safely. Once we have done this we can take the ilocks and + * do the rest of the checks. + */ + lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); + + /* Verify that both files have the same format */ + if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + error = -EINVAL; + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = -EINVAL; + goto out_unlock; + } + + error = xfs_swap_extent_flush(ip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(tip); + if (error) + goto out_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_unlock; + } + + /* + * Lock and join the inodes to the tansaction so that transaction commit + * or cancel will unlock the inodes from this point onwards. + */ + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); + + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = -EFAULT; + goto out_trans_cancel; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_trans_cancel; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = -EBUSY; + goto out_trans_cancel; + } + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) + goto out_trans_cancel; + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) + goto out_trans_cancel; + } + + /* + * Before we've swapped the forks, lets set the owners of the forks + * appropriately. We have to do this as we are demand paging the btree + * buffers, and so the validation done on read will expect the owner + * field to be correctly set. Once we change the owners, we can swap the + * inode forks. + * + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. + */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + if (ip->i_d.di_version == 3 && + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + target_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, + tip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + if (tip->i_d.di_version == 3 && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + src_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, + ip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + *tempifp = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = *tempifp; /* struct copy */ + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + src_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_d.di_version < 3 || + (src_log_flags & XFS_ILOG_DOWNER)); + src_log_flags |= XFS_ILOG_DBROOT; + break; + } + + switch (tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + target_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + target_log_flags |= XFS_ILOG_DBROOT; + ASSERT(tip->i_d.di_version < 3 || + (target_log_flags & XFS_ILOG_DOWNER)); + break; + } + + xfs_trans_log_inode(tp, ip, src_log_flags); + xfs_trans_log_inode(tp, tip, target_log_flags); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, 0); + + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); +out: + kmem_free(tempifp); + return error; + +out_unlock: + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + goto out; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out; +} diff --git a/kernel/fs/xfs/xfs_bmap_util.h b/kernel/fs/xfs/xfs_bmap_util.h new file mode 100644 index 000000000..af97d9a1d --- /dev/null +++ b/kernel/fs/xfs/xfs_bmap_util.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_UTIL_H__ +#define __XFS_BMAP_UTIL_H__ + +/* Kernel only BMAP related definitions and functions */ + +struct xfs_bmbt_irec; +struct xfs_bmap_free_item; +struct xfs_ifork; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; +struct xfs_bmalloca; + +int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +/* bmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); + +/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ +void xfs_bmap_del_free(struct xfs_bmap_free *flist, + struct xfs_bmap_free_item *prev, + struct xfs_bmap_free_item *free); +int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, + struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, + int rt, int eof, int delay, int convert, + xfs_fileoff_t *offp, xfs_extlen_t *lenp); +void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *rec, + int *is_empty); + +/* preallocation and hole punch interface */ +int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len, int alloc_type); +int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); +int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); + +/* EOF block manipulation functions */ +bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, + bool need_iolock); + +int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, + struct xfs_swapext *sx); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + +#endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/kernel/fs/xfs/xfs_buf.c b/kernel/fs/xfs/xfs_buf.c new file mode 100644 index 000000000..1790b00be --- /dev/null +++ b/kernel/fs/xfs/xfs_buf.c @@ -0,0 +1,1901 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_trace.h" +#include "xfs_log.h" + +static kmem_zone_t *xfs_buf_zone; + +#ifdef XFS_BUF_LOCK_TRACKING +# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) +# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) +# define XB_GET_OWNER(bp) ((bp)->b_last_holder) +#else +# define XB_SET_OWNER(bp) do { } while (0) +# define XB_CLEAR_OWNER(bp) do { } while (0) +# define XB_GET_OWNER(bp) do { } while (0) +#endif + +#define xb_to_gfp(flags) \ + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) + + +static inline int +xfs_buf_is_vmapped( + struct xfs_buf *bp) +{ + /* + * Return true if the buffer is vmapped. + * + * b_addr is null if the buffer is not mapped, but the code is clever + * enough to know it doesn't have to map a single page, so the check has + * to be both for b_addr and bp->b_page_count > 1. + */ + return bp->b_addr && bp->b_page_count > 1; +} + +static inline int +xfs_buf_vmap_len( + struct xfs_buf *bp) +{ + return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; +} + +/* + * When we mark a buffer stale, we remove the buffer from the LRU and clear the + * b_lru_ref count so that the buffer is freed immediately when the buffer + * reference count falls to zero. If the buffer is already on the LRU, we need + * to remove the reference that LRU holds on the buffer. + * + * This prevents build-up of stale buffers on the LRU. + */ +void +xfs_buf_stale( + struct xfs_buf *bp) +{ + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_STALE; + + /* + * Clear the delwri status so that a delwri queue walker will not + * flush this buffer to disk now that it is stale. The delwri queue has + * a reference to the buffer, so this is safe to do. + */ + bp->b_flags &= ~_XBF_DELWRI_Q; + + spin_lock(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + atomic_dec(&bp->b_hold); + + ASSERT(atomic_read(&bp->b_hold) >= 1); + spin_unlock(&bp->b_lock); +} + +static int +xfs_buf_get_maps( + struct xfs_buf *bp, + int map_count) +{ + ASSERT(bp->b_maps == NULL); + bp->b_map_count = map_count; + + if (map_count == 1) { + bp->b_maps = &bp->__b_map; + return 0; + } + + bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), + KM_NOFS); + if (!bp->b_maps) + return -ENOMEM; + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +static void +xfs_buf_free_maps( + struct xfs_buf *bp) +{ + if (bp->b_maps != &bp->__b_map) { + kmem_free(bp->b_maps); + bp->b_maps = NULL; + } +} + +struct xfs_buf * +_xfs_buf_alloc( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + int error; + int i; + + bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); + if (unlikely(!bp)) + return NULL; + + /* + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. + */ + flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); + + atomic_set(&bp->b_hold, 1); + atomic_set(&bp->b_lru_ref, 1); + init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); + INIT_LIST_HEAD(&bp->b_list); + RB_CLEAR_NODE(&bp->b_rbnode); + sema_init(&bp->b_sema, 0); /* held, no waiters */ + spin_lock_init(&bp->b_lock); + XB_SET_OWNER(bp); + bp->b_target = target; + bp->b_flags = flags; + + /* + * Set length and io_length to the same value initially. + * I/O routines should use io_length, which will be the same in + * most cases but may be reset (e.g. XFS recovery). + */ + error = xfs_buf_get_maps(bp, nmaps); + if (error) { + kmem_zone_free(xfs_buf_zone, bp); + return NULL; + } + + bp->b_bn = map[0].bm_bn; + bp->b_length = 0; + for (i = 0; i < nmaps; i++) { + bp->b_maps[i].bm_bn = map[i].bm_bn; + bp->b_maps[i].bm_len = map[i].bm_len; + bp->b_length += map[i].bm_len; + } + bp->b_io_length = bp->b_length; + + atomic_set(&bp->b_pin_count, 0); + init_waitqueue_head(&bp->b_waiters); + + XFS_STATS_INC(xb_create); + trace_xfs_buf_init(bp, _RET_IP_); + + return bp; +} + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. + */ +STATIC int +_xfs_buf_get_pages( + xfs_buf_t *bp, + int page_count) +{ + /* Make sure that we have a page list */ + if (bp->b_pages == NULL) { + bp->b_page_count = page_count; + if (page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; + } else { + bp->b_pages = kmem_alloc(sizeof(struct page *) * + page_count, KM_NOFS); + if (bp->b_pages == NULL) + return -ENOMEM; + } + memset(bp->b_pages, 0, sizeof(struct page *) * page_count); + } + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +STATIC void +_xfs_buf_free_pages( + xfs_buf_t *bp) +{ + if (bp->b_pages != bp->b_page_array) { + kmem_free(bp->b_pages); + bp->b_pages = NULL; + } +} + +/* + * Releases the specified buffer. + * + * The modification state of any associated pages is left unchanged. + * The buffer must not be on any hash - use xfs_buf_rele instead for + * hashed and refcounted buffers + */ +void +xfs_buf_free( + xfs_buf_t *bp) +{ + trace_xfs_buf_free(bp, _RET_IP_); + + ASSERT(list_empty(&bp->b_lru)); + + if (bp->b_flags & _XBF_PAGES) { + uint i; + + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr - bp->b_offset, + bp->b_page_count); + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page = bp->b_pages[i]; + + __free_page(page); + } + } else if (bp->b_flags & _XBF_KMEM) + kmem_free(bp->b_addr); + _xfs_buf_free_pages(bp); + xfs_buf_free_maps(bp); + kmem_zone_free(xfs_buf_zone, bp); +} + +/* + * Allocates all the pages for buffer in question and builds it's page list. + */ +STATIC int +xfs_buf_allocate_memory( + xfs_buf_t *bp, + uint flags) +{ + size_t size; + size_t nbytes, offset; + gfp_t gfp_mask = xb_to_gfp(flags); + unsigned short page_count, i; + xfs_off_t start, end; + int error; + + /* + * for buffers that are contained within a single page, just allocate + * the memory from the heap - there's no need for the complexity of + * page arrays to keep allocation down to order 0. + */ + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + bp->b_addr = kmem_alloc(size, KM_NOFS); + if (!bp->b_addr) { + /* low memory - use alloc_page loop instead */ + goto use_alloc_page; + } + + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + goto use_alloc_page; + } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= _XBF_KMEM; + return 0; + } + +use_alloc_page: + start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) + >> PAGE_SHIFT; + page_count = end - start; + error = _xfs_buf_get_pages(bp, page_count); + if (unlikely(error)) + return error; + + offset = bp->b_offset; + bp->b_flags |= _XBF_PAGES; + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page; + uint retries = 0; +retry: + page = alloc_page(gfp_mask); + if (unlikely(page == NULL)) { + if (flags & XBF_READ_AHEAD) { + bp->b_page_count = i; + error = -ENOMEM; + goto out_free_pages; + } + + /* + * This could deadlock. + * + * But until all the XFS lowlevel code is revamped to + * handle buffer allocation failures we can't do much. + */ + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, gfp_mask); + + XFS_STATS_INC(xb_page_retries); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + + XFS_STATS_INC(xb_page_found); + + nbytes = min_t(size_t, size, PAGE_SIZE - offset); + size -= nbytes; + bp->b_pages[i] = page; + offset = 0; + } + return 0; + +out_free_pages: + for (i = 0; i < bp->b_page_count; i++) + __free_page(bp->b_pages[i]); + return error; +} + +/* + * Map buffer into kernel address-space if necessary. + */ +STATIC int +_xfs_buf_map_pages( + xfs_buf_t *bp, + uint flags) +{ + ASSERT(bp->b_flags & _XBF_PAGES); + if (bp->b_page_count == 1) { + /* A single page buffer is always mappable */ + bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + } else if (flags & XBF_UNMAPPED) { + bp->b_addr = NULL; + } else { + int retried = 0; + unsigned noio_flag; + + /* + * vm_map_ram() will allocate auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we are likely to be under + * GFP_NOFS context here. Hence we need to tell memory reclaim + * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * memory reclaim re-entering the filesystem here and + * potentially deadlocking. + */ + noio_flag = memalloc_noio_save(); + do { + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); + if (bp->b_addr) + break; + vm_unmap_aliases(); + } while (retried++ <= 1); + memalloc_noio_restore(noio_flag); + + if (!bp->b_addr) + return -ENOMEM; + bp->b_addr += bp->b_offset; + } + + return 0; +} + +/* + * Finding and Reading Buffers + */ + +/* + * Look up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. No I/O is implied by this call. + */ +xfs_buf_t * +_xfs_buf_find( + struct xfs_buftarg *btp, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + xfs_buf_t *new_bp) +{ + size_t numbytes; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + xfs_buf_t *bp; + xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; + int numblks = 0; + int i; + + for (i = 0; i < nmaps; i++) + numblks += map[i].bm_len; + numbytes = BBTOB(numblks); + + /* Check for IOs smaller than the sector size / not sector aligned */ + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno < 0 || blkno >= eofs) { + /* + * XXX (dgc): we should really be returning -EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + WARN_ON(1); + return NULL; + } + + /* get tree root */ + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, blkno)); + + /* walk tree */ + spin_lock(&pag->pag_buf_lock); + rbp = &pag->pag_buf_tree.rb_node; + parent = NULL; + bp = NULL; + while (*rbp) { + parent = *rbp; + bp = rb_entry(parent, struct xfs_buf, b_rbnode); + + if (blkno < bp->b_bn) + rbp = &(*rbp)->rb_left; + else if (blkno > bp->b_bn) + rbp = &(*rbp)->rb_right; + else { + /* + * found a block number match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching to the right for an exact match. + */ + if (bp->b_length != numblks) { + ASSERT(bp->b_flags & XBF_STALE); + rbp = &(*rbp)->rb_right; + continue; + } + atomic_inc(&bp->b_hold); + goto found; + } + } + + /* No match found */ + if (new_bp) { + rb_link_node(&new_bp->b_rbnode, parent, rbp); + rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); + } else { + XFS_STATS_INC(xb_miss_locked); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + } + return new_bp; + +found: + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + + if (!xfs_buf_trylock(bp)) { + if (flags & XBF_TRYLOCK) { + xfs_buf_rele(bp); + XFS_STATS_INC(xb_busy_locked); + return NULL; + } + xfs_buf_lock(bp); + XFS_STATS_INC(xb_get_locked_waited); + } + + /* + * if the buffer is stale, clear all the external state associated with + * it. We need to keep flags such as how we allocated the buffer memory + * intact here. + */ + if (bp->b_flags & XBF_STALE) { + ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + ASSERT(bp->b_iodone == NULL); + bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_ops = NULL; + } + + trace_xfs_buf_find(bp, flags, _RET_IP_); + XFS_STATS_INC(xb_get_locked); + return bp; +} + +/* + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. + */ +struct xfs_buf * +xfs_buf_get_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + struct xfs_buf *new_bp; + int error = 0; + + bp = _xfs_buf_find(target, map, nmaps, flags, NULL); + if (likely(bp)) + goto found; + + new_bp = _xfs_buf_alloc(target, map, nmaps, flags); + if (unlikely(!new_bp)) + return NULL; + + error = xfs_buf_allocate_memory(new_bp, flags); + if (error) { + xfs_buf_free(new_bp); + return NULL; + } + + bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); + if (!bp) { + xfs_buf_free(new_bp); + return NULL; + } + + if (bp != new_bp) + xfs_buf_free(new_bp); + +found: + if (!bp->b_addr) { + error = _xfs_buf_map_pages(bp, flags); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pagesn", __func__); + xfs_buf_relse(bp); + return NULL; + } + } + + XFS_STATS_INC(xb_get); + trace_xfs_buf_get(bp, flags, _RET_IP_); + return bp; +} + +STATIC int +_xfs_buf_read( + xfs_buf_t *bp, + xfs_buf_flags_t flags) +{ + ASSERT(!(flags & XBF_WRITE)); + ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); + + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); + bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); + + if (flags & XBF_ASYNC) { + xfs_buf_submit(bp); + return 0; + } + return xfs_buf_submit_wait(bp); +} + +xfs_buf_t * +xfs_buf_read_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + flags |= XBF_READ; + + bp = xfs_buf_get_map(target, map, nmaps, flags); + if (bp) { + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!XFS_BUF_ISDONE(bp)) { + XFS_STATS_INC(xb_get_read); + bp->b_ops = ops; + _xfs_buf_read(bp, flags); + } else if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + xfs_buf_relse(bp); + return NULL; + } else { + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + } + } + + return bp; +} + +/* + * If we are not low on memory then do the readahead in a deadlock + * safe manner. + */ +void +xfs_buf_readahead_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + const struct xfs_buf_ops *ops) +{ + if (bdi_read_congested(target->bt_bdi)) + return; + + xfs_buf_read_map(target, map, nmaps, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); +} + +/* + * Read an uncached buffer from disk. Allocates and returns a locked + * buffer containing the disk contents or nothing. + */ +int +xfs_buf_read_uncached( + struct xfs_buftarg *target, + xfs_daddr_t daddr, + size_t numblks, + int flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + *bpp = NULL; + + bp = xfs_buf_get_uncached(target, numblks, flags); + if (!bp) + return -ENOMEM; + + /* set up the buffer for a read IO */ + ASSERT(bp->b_map_count == 1); + bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ + bp->b_maps[0].bm_bn = daddr; + bp->b_flags |= XBF_READ; + bp->b_ops = ops; + + xfs_buf_submit_wait(bp); + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } + + *bpp = bp; + return 0; +} + +/* + * Return a buffer allocated as an empty buffer and associated to external + * memory via xfs_buf_associate_memory() back to it's empty state. + */ +void +xfs_buf_set_empty( + struct xfs_buf *bp, + size_t numblks) +{ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_page_count = 0; + bp->b_addr = NULL; + bp->b_length = numblks; + bp->b_io_length = numblks; + + ASSERT(bp->b_map_count == 1); + bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_len = bp->b_length; +} + +static inline struct page * +mem_to_page( + void *addr) +{ + if ((!is_vmalloc_addr(addr))) { + return virt_to_page(addr); + } else { + return vmalloc_to_page(addr); + } +} + +int +xfs_buf_associate_memory( + xfs_buf_t *bp, + void *mem, + size_t len) +{ + int rval; + int i = 0; + unsigned long pageaddr; + unsigned long offset; + size_t buflen; + int page_count; + + pageaddr = (unsigned long)mem & PAGE_MASK; + offset = (unsigned long)mem - pageaddr; + buflen = PAGE_ALIGN(len + offset); + page_count = buflen >> PAGE_SHIFT; + + /* Free any previous set of page pointers */ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_addr = mem; + + rval = _xfs_buf_get_pages(bp, page_count); + if (rval) + return rval; + + bp->b_offset = offset; + + for (i = 0; i < bp->b_page_count; i++) { + bp->b_pages[i] = mem_to_page((void *)pageaddr); + pageaddr += PAGE_SIZE; + } + + bp->b_io_length = BTOBB(len); + bp->b_length = BTOBB(buflen); + + return 0; +} + +xfs_buf_t * +xfs_buf_get_uncached( + struct xfs_buftarg *target, + size_t numblks, + int flags) +{ + unsigned long page_count; + int error, i; + struct xfs_buf *bp; + DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); + + bp = _xfs_buf_alloc(target, &map, 1, 0); + if (unlikely(bp == NULL)) + goto fail; + + page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; + error = _xfs_buf_get_pages(bp, page_count); + if (error) + goto fail_free_buf; + + for (i = 0; i < page_count; i++) { + bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); + if (!bp->b_pages[i]) + goto fail_free_mem; + } + bp->b_flags |= _XBF_PAGES; + + error = _xfs_buf_map_pages(bp, 0); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pages", __func__); + goto fail_free_mem; + } + + trace_xfs_buf_get_uncached(bp, _RET_IP_); + return bp; + + fail_free_mem: + while (--i >= 0) + __free_page(bp->b_pages[i]); + _xfs_buf_free_pages(bp); + fail_free_buf: + xfs_buf_free_maps(bp); + kmem_zone_free(xfs_buf_zone, bp); + fail: + return NULL; +} + +/* + * Increment reference count on buffer, to hold the buffer concurrently + * with another thread which may release (free) the buffer asynchronously. + * Must hold the buffer already to call this function. + */ +void +xfs_buf_hold( + xfs_buf_t *bp) +{ + trace_xfs_buf_hold(bp, _RET_IP_); + atomic_inc(&bp->b_hold); +} + +/* + * Releases a hold on the specified buffer. If the + * the hold count is 1, calls xfs_buf_free. + */ +void +xfs_buf_rele( + xfs_buf_t *bp) +{ + struct xfs_perag *pag = bp->b_pag; + + trace_xfs_buf_rele(bp, _RET_IP_); + + if (!pag) { + ASSERT(list_empty(&bp->b_lru)); + ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); + if (atomic_dec_and_test(&bp->b_hold)) + xfs_buf_free(bp); + return; + } + + ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); + + ASSERT(atomic_read(&bp->b_hold) > 0); + if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { + spin_lock(&bp->b_lock); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new + * reference to the buffer for the LRU and clear the + * (now stale) dispose list state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&bp->b_lock); + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from + * the LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the + * buffer was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + spin_unlock(&bp->b_lock); + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + xfs_buf_free(bp); + } + } +} + + +/* + * Lock a buffer object, if it is not already locked. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. + */ +int +xfs_buf_trylock( + struct xfs_buf *bp) +{ + int locked; + + locked = down_trylock(&bp->b_sema) == 0; + if (locked) + XB_SET_OWNER(bp); + + trace_xfs_buf_trylock(bp, _RET_IP_); + return locked; +} + +/* + * Lock a buffer object. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. + */ +void +xfs_buf_lock( + struct xfs_buf *bp) +{ + trace_xfs_buf_lock(bp, _RET_IP_); + + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); + down(&bp->b_sema); + XB_SET_OWNER(bp); + + trace_xfs_buf_lock_done(bp, _RET_IP_); +} + +void +xfs_buf_unlock( + struct xfs_buf *bp) +{ + XB_CLEAR_OWNER(bp); + up(&bp->b_sema); + + trace_xfs_buf_unlock(bp, _RET_IP_); +} + +STATIC void +xfs_buf_wait_unpin( + xfs_buf_t *bp) +{ + DECLARE_WAITQUEUE (wait, current); + + if (atomic_read(&bp->b_pin_count) == 0) + return; + + add_wait_queue(&bp->b_waiters, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&bp->b_pin_count) == 0) + break; + io_schedule(); + } + remove_wait_queue(&bp->b_waiters, &wait); + set_current_state(TASK_RUNNING); +} + +/* + * Buffer Utility Routines + */ + +void +xfs_buf_ioend( + struct xfs_buf *bp) +{ + bool read = bp->b_flags & XBF_READ; + + trace_xfs_buf_iodone(bp, _RET_IP_); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + + /* + * Pull in IO completion errors now. We are guaranteed to be running + * single threaded, so we don't need the lock to read b_io_error. + */ + if (!bp->b_error && bp->b_io_error) + xfs_buf_ioerror(bp, bp->b_io_error); + + /* Only validate buffers that were read without errors */ + if (read && !bp->b_error && bp->b_ops) { + ASSERT(!bp->b_iodone); + bp->b_ops->verify_read(bp); + } + + if (!bp->b_error) + bp->b_flags |= XBF_DONE; + + if (bp->b_iodone) + (*(bp->b_iodone))(bp); + else if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); +} + +static void +xfs_buf_ioend_work( + struct work_struct *work) +{ + struct xfs_buf *bp = + container_of(work, xfs_buf_t, b_ioend_work); + + xfs_buf_ioend(bp); +} + +void +xfs_buf_ioend_async( + struct xfs_buf *bp) +{ + INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); + queue_work(bp->b_ioend_wq, &bp->b_ioend_work); +} + +void +xfs_buf_ioerror( + xfs_buf_t *bp, + int error) +{ + ASSERT(error <= 0 && error >= -1000); + bp->b_error = error; + trace_xfs_buf_ioerror(bp, error, _RET_IP_); +} + +void +xfs_buf_ioerror_alert( + struct xfs_buf *bp, + const char *func) +{ + xfs_alert(bp->b_target->bt_mount, +"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", + (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); +} + +int +xfs_bwrite( + struct xfs_buf *bp) +{ + int error; + + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | + XBF_WRITE_FAIL | XBF_DONE); + + error = xfs_buf_submit_wait(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + return error; +} + +STATIC void +xfs_buf_bio_end_io( + struct bio *bio, + int error) +{ + xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. + */ + if (error) { + spin_lock(&bp->b_lock); + if (!bp->b_io_error) + bp->b_io_error = error; + spin_unlock(&bp->b_lock); + } + + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); + + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend_async(bp); + bio_put(bio); +} + +static void +xfs_buf_ioapply_map( + struct xfs_buf *bp, + int map, + int *buf_offset, + int *count, + int rw) +{ + int page_index; + int total_nr_pages = bp->b_page_count; + int nr_pages; + struct bio *bio; + sector_t sector = bp->b_maps[map].bm_bn; + int size; + int offset; + + total_nr_pages = bp->b_page_count; + + /* skip the pages in the buffer before the start offset */ + page_index = 0; + offset = *buf_offset; + while (offset >= PAGE_SIZE) { + page_index++; + offset -= PAGE_SIZE; + } + + /* + * Limit the IO size to the length of the current vector, and update the + * remaining IO count for the next time around. + */ + size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); + *count -= size; + *buf_offset += size; + +next_chunk: + atomic_inc(&bp->b_io_remaining); + nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); + if (nr_pages > total_nr_pages) + nr_pages = total_nr_pages; + + bio = bio_alloc(GFP_NOIO, nr_pages); + bio->bi_bdev = bp->b_target->bt_bdev; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; + + + for (; size && nr_pages; nr_pages--, page_index++) { + int rbytes, nbytes = PAGE_SIZE - offset; + + if (nbytes > size) + nbytes = size; + + rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, + offset); + if (rbytes < nbytes) + break; + + offset = 0; + sector += BTOBB(nbytes); + size -= nbytes; + total_nr_pages--; + } + + if (likely(bio->bi_iter.bi_size)) { + if (xfs_buf_is_vmapped(bp)) { + flush_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); + } + submit_bio(rw, bio); + if (size) + goto next_chunk; + } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_submit) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); + xfs_buf_ioerror(bp, -EIO); + bio_put(bio); + } + +} + +STATIC void +_xfs_buf_ioapply( + struct xfs_buf *bp) +{ + struct blk_plug plug; + int rw; + int offset; + int size; + int i; + + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + + /* + * Initialize the I/O completion workqueue if we haven't yet or the + * submitter has not opted to specify a custom one. + */ + if (!bp->b_ioend_wq) + bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; + + if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_SYNCIO) + rw = WRITE_SYNC; + else + rw = WRITE; + if (bp->b_flags & XBF_FUA) + rw |= REQ_FUA; + if (bp->b_flags & XBF_FLUSH) + rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* + * non-crc filesystems don't attach verifiers during + * log recovery, so don't warn for such filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_warn(mp, + "%s: no ops on block 0x%llx/0x%x", + __func__, bp->b_bn, bp->b_length); + xfs_hex_dump(bp->b_addr, 64); + dump_stack(); + } + } + } else if (bp->b_flags & XBF_READ_AHEAD) { + rw = READA; + } else { + rw = READ; + } + + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + + /* + * Walk all the vectors issuing IO on them. Set up the initial offset + * into the buffer and the desired IO size before we start - + * _xfs_buf_ioapply_vec() will modify them appropriately for each + * subsequent call. + */ + offset = bp->b_offset; + size = BBTOB(bp->b_io_length); + blk_start_plug(&plug); + for (i = 0; i < bp->b_map_count; i++) { + xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + if (bp->b_error) + break; + if (size <= 0) + break; /* all done */ + } + blk_finish_plug(&plug); +} + +/* + * Asynchronous IO submission path. This transfers the buffer lock ownership and + * the current reference to the IO. It is not safe to reference the buffer after + * a call to this function unless the caller holds an additional reference + * itself. + */ +void +xfs_buf_submit( + struct xfs_buf *bp) +{ + trace_xfs_buf_submit(bp, _RET_IP_); + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + ASSERT(bp->b_flags & XBF_ASYNC); + + /* on shutdown we stale and complete the buffer immediately */ + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioend(bp); + return; + } + + if (bp->b_flags & XBF_WRITE) + xfs_buf_wait_unpin(bp); + + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + + /* + * The caller's reference is released during I/O completion. + * This occurs some time after the last b_io_remaining reference is + * released, so after we drop our Io reference we have to have some + * other reference to ensure the buffer doesn't go away from underneath + * us. Take a direct reference to ensure we have safe access to the + * buffer until we are finished with it. + */ + xfs_buf_hold(bp); + + /* + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + + /* + * If _xfs_buf_ioapply failed, we can get back here with only the IO + * reference we took above. If we drop it to zero, run completion so + * that we don't return to the caller with completion still pending. + */ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + if (bp->b_error) + xfs_buf_ioend(bp); + else + xfs_buf_ioend_async(bp); + } + + xfs_buf_rele(bp); + /* Note: it is not safe to reference bp now we've dropped our ref */ +} + +/* + * Synchronous buffer IO submission path, read or write. + */ +int +xfs_buf_submit_wait( + struct xfs_buf *bp) +{ + int error; + + trace_xfs_buf_submit_wait(bp, _RET_IP_); + + ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC))); + + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + bp->b_flags &= ~XBF_DONE; + return -EIO; + } + + if (bp->b_flags & XBF_WRITE) + xfs_buf_wait_unpin(bp); + + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + + /* + * For synchronous IO, the IO does not inherit the submitters reference + * count, nor the buffer lock. Hence we cannot release the reference we + * are about to take until we've waited for all IO completion to occur, + * including any xfs_buf_ioend_async() work that may be pending. + */ + xfs_buf_hold(bp); + + /* + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + + /* + * make sure we run completion synchronously if it raced with us and is + * already complete. + */ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend(bp); + + /* wait for completion before gathering the error from the buffer */ + trace_xfs_buf_iowait(bp, _RET_IP_); + wait_for_completion(&bp->b_iowait); + trace_xfs_buf_iowait_done(bp, _RET_IP_); + error = bp->b_error; + + /* + * all done now, we can release the hold that keeps the buffer + * referenced for the entire IO. + */ + xfs_buf_rele(bp); + return error; +} + +xfs_caddr_t +xfs_buf_offset( + xfs_buf_t *bp, + size_t offset) +{ + struct page *page; + + if (bp->b_addr) + return bp->b_addr + offset; + + offset += bp->b_offset; + page = bp->b_pages[offset >> PAGE_SHIFT]; + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); +} + +/* + * Move data into or out of a buffer. + */ +void +xfs_buf_iomove( + xfs_buf_t *bp, /* buffer to process */ + size_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + void *data, /* data address */ + xfs_buf_rw_t mode) /* read/write/zero flag */ +{ + size_t bend; + + bend = boff + bsize; + while (boff < bend) { + struct page *page; + int page_index, page_offset, csize; + + page_index = (boff + bp->b_offset) >> PAGE_SHIFT; + page_offset = (boff + bp->b_offset) & ~PAGE_MASK; + page = bp->b_pages[page_index]; + csize = min_t(size_t, PAGE_SIZE - page_offset, + BBTOB(bp->b_io_length) - boff); + + ASSERT((csize + page_offset) <= PAGE_SIZE); + + switch (mode) { + case XBRW_ZERO: + memset(page_address(page) + page_offset, 0, csize); + break; + case XBRW_READ: + memcpy(data, page_address(page) + page_offset, csize); + break; + case XBRW_WRITE: + memcpy(page_address(page) + page_offset, data, csize); + } + + boff += csize; + data += csize; + } +} + +/* + * Handling of buffer targets (buftargs). + */ + +/* + * Wait for any bufs with callbacks that have been submitted but have not yet + * returned. These buffers will have an elevated hold count, so wait on those + * while freeing all the buffers only held by the LRU. + */ +static enum lru_status +xfs_buftarg_wait_rele( + struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *arg) + +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + if (atomic_read(&bp->b_hold) > 1) { + /* need to wait, so skip it this pass */ + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + return LRU_SKIP; + } + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + bp->b_state |= XFS_BSTATE_DISPOSE; + list_lru_isolate_move(lru, item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +void +xfs_wait_buftarg( + struct xfs_buftarg *btp) +{ + LIST_HEAD(dispose); + int loop = 0; + + /* loop until there is nothing left on the lru list. */ + while (list_lru_count(&btp->bt_lru)) { + list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + &dispose, LONG_MAX); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_alert(btp->bt_mount, +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" +"Please run xfs_repair to determine the extent of the problem.", + (long long)bp->b_bn); + } + xfs_buf_rele(bp); + } + if (loop++ != 0) + delay(100); + } +} + +static enum lru_status +xfs_buftarg_isolate( + struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + /* + * we are inverting the lru lock/bp->b_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + spin_unlock(&bp->b_lock); + return LRU_ROTATE; + } + + bp->b_state |= XFS_BSTATE_DISPOSE; + list_lru_isolate_move(lru, item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +static unsigned long +xfs_buftarg_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + LIST_HEAD(dispose); + unsigned long freed; + + freed = list_lru_shrink_walk(&btp->bt_lru, sc, + xfs_buftarg_isolate, &dispose); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); + } + + return freed; +} + +static unsigned long +xfs_buftarg_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + return list_lru_shrink_count(&btp->bt_lru, sc); +} + +void +xfs_free_buftarg( + struct xfs_mount *mp, + struct xfs_buftarg *btp) +{ + unregister_shrinker(&btp->bt_shrinker); + list_lru_destroy(&btp->bt_lru); + + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); + + kmem_free(btp); +} + +int +xfs_setsize_buftarg( + xfs_buftarg_t *btp, + unsigned int sectorsize) +{ + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + + if (set_blocksize(btp->bt_bdev, sectorsize)) { + char name[BDEVNAME_SIZE]; + + bdevname(btp->bt_bdev, name); + + xfs_warn(btp->bt_mount, + "Cannot set_blocksize to %u on device %s", + sectorsize, name); + return -EINVAL; + } + + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + + return 0; +} + +/* + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. + */ +STATIC int +xfs_setsize_buftarg_early( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); +} + +xfs_buftarg_t * +xfs_alloc_buftarg( + struct xfs_mount *mp, + struct block_device *bdev) +{ + xfs_buftarg_t *btp; + + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); + + btp->bt_mount = mp; + btp->bt_dev = bdev->bd_dev; + btp->bt_bdev = bdev; + btp->bt_bdi = blk_get_backing_dev_info(bdev); + + if (xfs_setsize_buftarg_early(btp, bdev)) + goto error; + + if (list_lru_init(&btp->bt_lru)) + goto error; + + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker.seeks = DEFAULT_SEEKS; + btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; + register_shrinker(&btp->bt_shrinker); + return btp; + +error: + kmem_free(btp); + return NULL; +} + +/* + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. Note that + * neither this routine nor the buffer list submission functions perform + * any internal synchronization. It is expected that the lists are thread-local + * to the callers. + * + * Returns true if we queued up the buffer, or false if it already had + * been on the buffer list. + */ +bool +xfs_buf_delwri_queue( + struct xfs_buf *bp, + struct list_head *list) +{ + ASSERT(xfs_buf_islocked(bp)); + ASSERT(!(bp->b_flags & XBF_READ)); + + /* + * If the buffer is already marked delwri it already is queued up + * by someone else for imediate writeout. Just ignore it in that + * case. + */ + if (bp->b_flags & _XBF_DELWRI_Q) { + trace_xfs_buf_delwri_queued(bp, _RET_IP_); + return false; + } + + trace_xfs_buf_delwri_queue(bp, _RET_IP_); + + /* + * If a buffer gets written out synchronously or marked stale while it + * is on a delwri list we lazily remove it. To do this, the other party + * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. + * It remains referenced and on the list. In a rare corner case it + * might get readded to a delwri list after the synchronous writeout, in + * which case we need just need to re-add the flag here. + */ + bp->b_flags |= _XBF_DELWRI_Q; + if (list_empty(&bp->b_list)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_list, list); + } + + return true; +} + +/* + * Compare function is more complex than it needs to be because + * the return value is only 32 bits and we are doing comparisons + * on 64 bit values + */ +static int +xfs_buf_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); + struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); + xfs_daddr_t diff; + + diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +static int +__xfs_buf_delwri_submit( + struct list_head *buffer_list, + struct list_head *io_list, + bool wait) +{ + struct blk_plug plug; + struct xfs_buf *bp, *n; + int pinned = 0; + + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!wait) { + if (xfs_buf_ispinned(bp)) { + pinned++; + continue; + } + if (!xfs_buf_trylock(bp)) + continue; + } else { + xfs_buf_lock(bp); + } + + /* + * Someone else might have written the buffer synchronously or + * marked it stale in the meantime. In that case only the + * _XBF_DELWRI_Q flag got cleared, and we have to drop the + * reference and remove it from the list here. + */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + continue; + } + + list_move_tail(&bp->b_list, io_list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } + + list_sort(NULL, io_list, xfs_buf_cmp); + + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, io_list, b_list) { + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE | XBF_ASYNC; + + /* + * we do all Io submission async. This means if we need to wait + * for IO completion we need to take an extra reference so the + * buffer is still valid on the other side. + */ + if (wait) + xfs_buf_hold(bp); + else + list_del_init(&bp->b_list); + + xfs_buf_submit(bp); + } + blk_finish_plug(&plug); + + return pinned; +} + +/* + * Write out a buffer list asynchronously. + * + * This will take the @buffer_list, write all non-locked and non-pinned buffers + * out and not wait for I/O completion on any of the buffers. This interface + * is only safely useable for callers that can track I/O completion by higher + * level means, e.g. AIL pushing as the @buffer_list is consumed in this + * function. + */ +int +xfs_buf_delwri_submit_nowait( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + return __xfs_buf_delwri_submit(buffer_list, &io_list, false); +} + +/* + * Write out a buffer list synchronously. + * + * This will take the @buffer_list, write all buffers out and wait for I/O + * completion on all of the buffers. @buffer_list is consumed by the function, + * so callers must have some other way of tracking buffers if they require such + * functionality. + */ +int +xfs_buf_delwri_submit( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + int error = 0, error2; + struct xfs_buf *bp; + + __xfs_buf_delwri_submit(buffer_list, &io_list, true); + + /* Wait for IO to complete. */ + while (!list_empty(&io_list)) { + bp = list_first_entry(&io_list, struct xfs_buf, b_list); + + list_del_init(&bp->b_list); + + /* locking the buffer will wait for async IO completion. */ + xfs_buf_lock(bp); + error2 = bp->b_error; + xfs_buf_relse(bp); + if (!error) + error = error2; + } + + return error; +} + +int __init +xfs_buf_init(void) +{ + xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", + KM_ZONE_HWALIGN, NULL); + if (!xfs_buf_zone) + goto out; + + return 0; + + out: + return -ENOMEM; +} + +void +xfs_buf_terminate(void) +{ + kmem_zone_destroy(xfs_buf_zone); +} diff --git a/kernel/fs/xfs/xfs_buf.h b/kernel/fs/xfs/xfs_buf.h new file mode 100644 index 000000000..75ff5d5a7 --- /dev/null +++ b/kernel/fs/xfs/xfs_buf.h @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Base types + */ + +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +typedef enum { + XBRW_READ = 1, /* transfer into target memory */ + XBRW_WRITE = 2, /* transfer from target memory */ + XBRW_ZERO = 3, /* Zero target memory */ +} xfs_buf_rw_t; + +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */ + +/* I/O hints for the BIO layer */ +#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ +#define XBF_FUA (1 << 11)/* force cache write through mode */ +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ + +/* flags used only as arguments to access routines */ +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ + +/* flags used only internally */ +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ + +typedef unsigned int xfs_buf_flags_t; + +#define XFS_BUF_FLAGS \ + { XBF_READ, "READ" }, \ + { XBF_WRITE, "WRITE" }, \ + { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_ASYNC, "ASYNC" }, \ + { XBF_DONE, "DONE" }, \ + { XBF_STALE, "STALE" }, \ + { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ + { XBF_SYNCIO, "SYNCIO" }, \ + { XBF_FUA, "FUA" }, \ + { XBF_FLUSH, "FLUSH" }, \ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ + { _XBF_PAGES, "PAGES" }, \ + { _XBF_KMEM, "KMEM" }, \ + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_COMPOUND, "COMPOUND" } + + +/* + * Internal state flags. + */ +#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ + +/* + * The xfs_buftarg contains 2 notions of "sector size" - + * + * 1) The metadata sector size, which is the minimum unit and + * alignment of IO which will be performed by metadata operations. + * 2) The device logical sector size + * + * The first is specified at mkfs time, and is stored on-disk in the + * superblock's sb_sectsize. + * + * The latter is derived from the underlying device, and controls direct IO + * alignment constraints. + */ +typedef struct xfs_buftarg { + dev_t bt_dev; + struct block_device *bt_bdev; + struct backing_dev_info *bt_bdi; + struct xfs_mount *bt_mount; + unsigned int bt_meta_sectorsize; + size_t bt_meta_sectormask; + size_t bt_logical_sectorsize; + size_t bt_logical_sectormask; + + /* LRU control structures */ + struct shrinker bt_shrinker; + struct list_lru bt_lru; +} xfs_buftarg_t; + +struct xfs_buf; +typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + + +#define XB_PAGES 2 + +struct xfs_buf_map { + xfs_daddr_t bm_bn; /* block number for I/O */ + int bm_len; /* size of I/O */ +}; + +#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ + struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; + +struct xfs_buf_ops { + void (*verify_read)(struct xfs_buf *); + void (*verify_write)(struct xfs_buf *); +}; + +typedef struct xfs_buf { + /* + * first cacheline holds all the fields needed for an uncontended cache + * hit to be fully processed. The semaphore straddles the cacheline + * boundary, but the counter and lock sits on the first cacheline, + * which is the only bit that is touched if we hit the semaphore + * fast-path on locking. + */ + struct rb_node b_rbnode; /* rbtree node */ + xfs_daddr_t b_bn; /* block number of buffer */ + int b_length; /* size of buffer in BBs */ + atomic_t b_hold; /* reference count */ + atomic_t b_lru_ref; /* lru reclaim ref count */ + xfs_buf_flags_t b_flags; /* status flags */ + struct semaphore b_sema; /* semaphore for lockables */ + + /* + * concurrent access to b_lru and b_lru_flags are protected by + * bt_lru_lock and not by b_sema + */ + struct list_head b_lru; /* lru list */ + spinlock_t b_lock; /* internal state lock */ + unsigned int b_state; /* internal state flags */ + int b_io_error; /* internal IO error state */ + wait_queue_head_t b_waiters; /* unpin waiters */ + struct list_head b_list; + struct xfs_perag *b_pag; /* contains rbtree root */ + xfs_buftarg_t *b_target; /* buffer target (device) */ + void *b_addr; /* virtual address of buffer */ + struct work_struct b_ioend_work; + struct workqueue_struct *b_ioend_wq; /* I/O completion wq */ + xfs_buf_iodone_t b_iodone; /* I/O completion function */ + struct completion b_iowait; /* queue for I/O waiters */ + void *b_fspriv; + struct xfs_trans *b_transp; + struct page **b_pages; /* array of page pointers */ + struct page *b_page_array[XB_PAGES]; /* inline pages */ + struct xfs_buf_map *b_maps; /* compound buffer map */ + struct xfs_buf_map __b_map; /* inline compound buffer map */ + int b_map_count; + int b_io_length; /* IO size in BBs */ + atomic_t b_pin_count; /* pin count */ + atomic_t b_io_remaining; /* #outstanding I/O requests */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + int b_error; /* error code on I/O */ + const struct xfs_buf_ops *b_ops; + +#ifdef XFS_BUF_LOCK_TRACKING + int b_last_holder; +#endif +} xfs_buf_t; + +/* Finding and Reading Buffers */ +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, struct xfs_buf *new_bp); + +static inline struct xfs_buf * +xfs_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_find(target, &map, 1, flags, NULL); +} + +struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); + +static inline struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_alloc(target, &map, 1, flags); +} + +struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops); +void xfs_buf_readahead_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + const struct xfs_buf_ops *ops); + +static inline struct xfs_buf * +xfs_buf_get( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_get_map(target, &map, 1, flags); +} + +static inline struct xfs_buf * +xfs_buf_read( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_read_map(target, &map, 1, flags, ops); +} + +static inline void +xfs_buf_readahead( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_readahead_map(target, &map, 1, ops); +} + +struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); +void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); +int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); + +struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + int flags); +int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, + size_t numblks, int flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); +void xfs_buf_hold(struct xfs_buf *bp); + +/* Releasing Buffers */ +extern void xfs_buf_free(xfs_buf_t *); +extern void xfs_buf_rele(xfs_buf_t *); + +/* Locking and Unlocking Buffers */ +extern int xfs_buf_trylock(xfs_buf_t *); +extern void xfs_buf_lock(xfs_buf_t *); +extern void xfs_buf_unlock(xfs_buf_t *); +#define xfs_buf_islocked(bp) \ + ((bp)->b_sema.count <= 0) + +/* Buffer Read and Write Routines */ +extern int xfs_bwrite(struct xfs_buf *bp); +extern void xfs_buf_ioend(struct xfs_buf *bp); +extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); +extern void xfs_buf_submit(struct xfs_buf *bp); +extern int xfs_buf_submit_wait(struct xfs_buf *bp); +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, + xfs_buf_rw_t); +#define xfs_buf_zero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) + +/* Buffer Utility Routines */ +extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); + +/* Delayed Write Buffer Routines */ +extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +extern int xfs_buf_delwri_submit(struct list_head *); +extern int xfs_buf_delwri_submit_nowait(struct list_head *); + +/* Buffer Daemon Setup Routines */ +extern int xfs_buf_init(void); +extern void xfs_buf_terminate(void); + +#define XFS_BUF_ZEROFLAGS(bp) \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ + XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ + XBF_WRITE_FAIL)) + +void xfs_buf_stale(struct xfs_buf *bp); +#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) + +#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) +#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) +#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) + +#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) +#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) +#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) + +#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) +#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) +#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) + +#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) +#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) +#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) + +/* + * These macros use the IO block map rather than b_bn. b_bn is now really + * just for the buffer cache index for cached buffers. As IO does not use b_bn + * anymore, uncached buffers do not use b_bn at all and hence must modify the IO + * map directly. Uncached buffers are not allowed to be discontiguous, so this + * is safe to do. + * + * In future, uncached buffers will pass the block number directly to the io + * request function and hence these macros will go away at that point. + */ +#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) + +static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) +{ + atomic_set(&bp->b_lru_ref, lru_ref); +} + +static inline int xfs_buf_ispinned(struct xfs_buf *bp) +{ + return atomic_read(&bp->b_pin_count); +} + +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + xfs_buf_unlock(bp); + xfs_buf_rele(bp); +} + +static inline int +xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +static inline void +xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +/* + * Handling of buftargs. + */ +extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); +extern void xfs_wait_buftarg(xfs_buftarg_t *); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); + +#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) +#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) + +#endif /* __XFS_BUF_H__ */ diff --git a/kernel/fs/xfs/xfs_buf_item.c b/kernel/fs/xfs/xfs_buf_item.c new file mode 100644 index 000000000..092d652bc --- /dev/null +++ b/kernel/fs/xfs/xfs_buf_item.c @@ -0,0 +1,1155 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_log.h" + + +kmem_zone_t *xfs_buf_item_zone; + +static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_buf_log_item, bli_item); +} + +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); + +static inline int +xfs_buf_log_format_size( + struct xfs_buf_log_format *blfp) +{ + return offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); +} + +/* + * This returns the number of log iovecs needed to log the + * given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure + * and 1 for each stretch of non-contiguous chunks to be logged. + * Contiguous chunks are logged in a single iovec. + * + * If the XFS_BLI_STALE flag has been set, then log nothing. + */ +STATIC void +xfs_buf_item_size_segment( + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) +{ + struct xfs_buf *bp = bip->bli_buf; + int next_bit; + int last_bit; + + last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (last_bit == -1) + return; + + /* + * initial count for a dirty buffer is 2 vectors - the format structure + * and the first dirty region. + */ + *nvecs += 2; + *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK; + + while (last_bit != -1) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + last_bit + 1); + /* + * If we run out of bits, leave the loop, + * else if we find a new set of bits bump the number of vecs, + * else keep scanning the current set of bits. + */ + if (next_bit == -1) { + break; + } else if (next_bit != last_bit + 1) { + last_bit = next_bit; + (*nvecs)++; + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { + last_bit = next_bit; + (*nvecs)++; + } else { + last_bit++; + } + *nbytes += XFS_BLF_CHUNK; + } +} + +/* + * This returns the number of log iovecs needed to log the given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. + * + * Discontiguous buffers need a format structure per region that that is being + * logged. This makes the changes in the buffer appear to log recovery as though + * they came from separate buffers, just like would occur if multiple buffers + * were used instead of a single discontiguous buffer. This enables + * discontiguous buffers to be in-memory constructs, completely transparent to + * what ends up on disk. + * + * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log + * format structures. + */ +STATIC void +xfs_buf_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + *nvecs += bip->bli_format_count; + for (i = 0; i < bip->bli_format_count; i++) { + *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]); + } + return; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + + if (bip->bli_flags & XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. + * It is not being included in the transaction + * commit, so no vectors are used at all. + */ + trace_xfs_buf_item_size_ordered(bip); + *nvecs = XFS_LOG_VEC_ORDERED; + return; + } + + /* + * the vector count is based on the number of buffer vectors we have + * dirty bits in. This will only be greater than one when we have a + * compound buffer with more than one segment dirty. Hence for compound + * buffers we need to track which segment the dirty bits correspond to, + * and when we move from one segment to the next increment the vector + * count for the extra buf log format structure that will need to be + * written. + */ + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_size_segment(bip, &bip->bli_formats[i], + nvecs, nbytes); + } + trace_xfs_buf_item_size(bip); +} + +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void +xfs_buf_item_format_segment( + struct xfs_buf_log_item *bip, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint offset, + struct xfs_buf_log_format *blfp) +{ + struct xfs_buf *bp = bip->bli_buf; + uint base_size; + int first_bit; + int last_bit; + int next_bit; + uint nbits; + + /* copy the flags across from the base format item */ + blfp->blf_flags = bip->__bli_format.blf_flags; + + /* + * Base size is the actual size of the ondisk structure - it reflects + * the actual size of the dirty bitmap rather than the size of the in + * memory structure. + */ + base_size = xfs_buf_log_format_size(blfp); + + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { + /* + * If the map is not be dirty in the transaction, mark + * the size as zero and do not advance the vector pointer. + */ + return; + } + + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_format_stale(bip); + ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); + return; + } + + + /* + * Fill in an iovec for each set of contiguous chunks. + */ + last_bit = first_bit; + nbits = 1; + for (;;) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)last_bit + 1); + /* + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. + */ + if (next_bit == -1) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; + break; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else { + last_bit++; + nbits++; + } + } +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; + uint offset = 0; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + ASSERT((bip->bli_flags & XFS_BLI_STALE) || + (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF + && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. + * + * For buffer based inode allocation, we do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + * + * For icreate item based inode allocation, the buffers aren't written + * to the journal during allocation, and hence we should always tag the + * buffer as an inode buffer so that the correct unlinked list replay + * occurs during recovery. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == + XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so don't format it. + */ + trace_xfs_buf_item_format_ordered(bip); + return; + } + + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); + offset += bp->b_maps[i].bm_len; + } + + /* + * Check to make sure everything is consistent. + */ + trace_xfs_buf_item_format(bip); +} + +/* + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. + */ +STATIC void +xfs_buf_item_pin( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_ORDERED) || + (bip->bli_flags & XFS_BLI_STALE)); + + trace_xfs_buf_item_pin(bip); + + atomic_inc(&bip->bli_refcount); + atomic_inc(&bip->bli_buf->b_pin_count); +} + +/* + * This is called to unpin the buffer associated with the buf log + * item which was previously pinned with a call to xfs_buf_item_pin(). + * + * Also drop the reference to the buf item for the current transaction. + * If the XFS_BLI_STALE flag is set and we are the last reference, + * then free up the buf log item and unlock the buffer. + * + * If the remove flag is set we are called from uncommit in the + * forced-shutdown path. If that is true and the reference count on + * the log item is going to drop to zero we need to free the item's + * descriptor in the transaction. + */ +STATIC void +xfs_buf_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + xfs_buf_t *bp = bip->bli_buf; + struct xfs_ail *ailp = lip->li_ailp; + int stale = bip->bli_flags & XFS_BLI_STALE; + int freed; + + ASSERT(bp->b_fspriv == bip); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_buf_item_unpin(bip); + + freed = atomic_dec_and_test(&bip->bli_refcount); + + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + + if (freed && stale) { + ASSERT(bip->bli_flags & XFS_BLI_STALE); + ASSERT(xfs_buf_islocked(bp)); + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + + trace_xfs_buf_item_unpin_stale(bip); + + if (remove) { + /* + * If we are in a transaction context, we have to + * remove the log item from the transaction as we are + * about to release our reference to the buffer. If we + * don't, the unlock that occurs later in + * xfs_trans_uncommit() will try to reference the + * buffer which we no longer have a hold on. + */ + if (lip->li_desc) + xfs_trans_del_item(lip); + + /* + * Since the transaction no longer refers to the buffer, + * the buffer should no longer refer to the transaction. + */ + bp->b_transp = NULL; + } + + /* + * If we get called here because of an IO error, we may + * or may not have the item on the AIL. xfs_trans_ail_delete() + * will take care of that situation. + * xfs_trans_ail_delete() drops the AIL lock. + */ + if (bip->bli_flags & XFS_BLI_STALE_INODE) { + xfs_buf_do_callbacks(bp); + bp->b_fspriv = NULL; + bp->b_iodone = NULL; + } else { + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); + ASSERT(bp->b_fspriv == NULL); + } + xfs_buf_relse(bp); + } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ + xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioerror(bp, -EIO); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioend(bp); + } +} + +/* + * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 + * seconds so as to not spam logs too much on repeated detection of the same + * buffer being bad.. + */ + +static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); + +STATIC uint +xfs_buf_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint rval = XFS_ITEM_SUCCESS; + + if (xfs_buf_ispinned(bp)) + return XFS_ITEM_PINNED; + if (!xfs_buf_trylock(bp)) { + /* + * If we have just raced with a buffer being pinned and it has + * been marked stale, we could end up stalling until someone else + * issues a log force to unpin the stale buffer. Check for the + * race condition here so xfsaild recognizes the buffer is pinned + * and queues a log force to move it along. + */ + if (xfs_buf_ispinned(bp)) + return XFS_ITEM_PINNED; + return XFS_ITEM_LOCKED; + } + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + + trace_xfs_buf_item_push(bip); + + /* has a previous flush failed due to IO errors? */ + if ((bp->b_flags & XBF_WRITE_FAIL) && + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { + xfs_warn(bp->b_target->bt_mount, +"Failing async write on buffer block 0x%llx. Retrying async write.", + (long long)bp->b_bn); + } + + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_unlock(bp); + return rval; +} + +/* + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. + * + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. + * + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. + * + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. + */ +STATIC void +xfs_buf_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + bool clean; + bool aborted; + int flags; + + /* Clear the buffer's association with this transaction. */ + bp->b_transp = NULL; + + /* + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. + */ + aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; + /* + * Before possibly freeing the buf item, copy the per-transaction state + * so we can reference it safely later after clearing it from the + * buffer log item. + */ + flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); + + /* + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. + */ + if (flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); + return; + } + } + + trace_xfs_buf_item_unlock(bip); + + /* + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. + * + * Ordered buffers are dirty but may have no recorded changes, so ensure + * we only release clean items here. + */ + clean = (flags & XFS_BLI_DIRTY) ? false : true; + if (clean) { + int i; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = false; + break; + } + } + } + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted + * buffers may be dirty and hence in the AIL. Therefore if we are + * aborting a buffer and we've just taken the last refernce away, we + * have to check if it is in the AIL before freeing it. We need to free + * it in this case, because an aborted transaction has already shut the + * filesystem down and this is the last chance we will have to do so. + */ + if (atomic_dec_and_test(&bip->bli_refcount)) { + if (clean) + xfs_buf_item_relse(bp); + else if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&lip->li_ailp->xa_lock); + xfs_trans_ail_delete(lip->li_ailp, lip, + SHUTDOWN_LOG_IO_ERROR); + } + xfs_buf_item_relse(bp); + } + } + + if (!(flags & XFS_BLI_HOLD)) + xfs_buf_relse(bp); +} + +/* + * This is called to find out where the oldest active copy of the + * buf log item in the on disk log resides now that the last log + * write of it completed at the given lsn. + * We always re-log all the dirty data in a buffer, so usually the + * latest copy in the on disk log is the only one that matters. For + * those cases we simply return the given lsn. + * + * The one exception to this is for buffers full of newly allocated + * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF + * flag set, indicating that only the di_next_unlinked fields from the + * inodes in the buffers will be replayed during recovery. If the + * original newly allocated inode images have not yet been flushed + * when the buffer is so relogged, then we need to make sure that we + * keep the old images in the 'active' portion of the log. We do this + * by returning the original lsn of that transaction here rather than + * the current one. + */ +STATIC xfs_lsn_t +xfs_buf_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + + trace_xfs_buf_item_committed(bip); + + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0) + return lip->li_lsn; + return lsn; +} + +STATIC void +xfs_buf_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +static const struct xfs_item_ops xfs_buf_item_ops = { + .iop_size = xfs_buf_item_size, + .iop_format = xfs_buf_item_format, + .iop_pin = xfs_buf_item_pin, + .iop_unpin = xfs_buf_item_unpin, + .iop_unlock = xfs_buf_item_unlock, + .iop_committed = xfs_buf_item_committed, + .iop_push = xfs_buf_item_push, + .iop_committing = xfs_buf_item_committing +}; + +STATIC int +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->__bli_format; + return 0; + } + + bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), + KM_SLEEP); + if (!bip->bli_formats) + return -ENOMEM; + return 0; +} + +STATIC void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->__bli_format) { + kmem_free(bip->bli_formats); + bip->bli_formats = NULL; + } +} + +/* + * Allocate a new buf log item to go with the given buffer. + * Set the buffer's b_fsprivate field to point to the new + * buf log item. If there are other item's attached to the + * buffer (see xfs_buf_attach_iodone() below), then put the + * buf log item at the front. + */ +void +xfs_buf_item_init( + xfs_buf_t *bp, + xfs_mount_t *mp) +{ + xfs_log_item_t *lip = bp->b_fspriv; + xfs_buf_log_item_t *bip; + int chunks; + int map_size; + int error; + int i; + + /* + * Check to see if there is already a buf log item for + * this buffer. If there is, it is guaranteed to be + * the first. If we do already have one, there is + * nothing to do here so return. + */ + ASSERT(bp->b_target->bt_mount == mp); + if (lip != NULL && lip->li_type == XFS_LI_BUF) + return; + + bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); + bip->bli_buf = bp; + xfs_buf_hold(bp); + + /* + * chunks is the number of XFS_BLF_CHUNK size pieces the buffer + * can be divided into. Make sure not to truncate any pieces. + * map_size is the size of the bitmap needed to describe the + * chunks of the buffer. + * + * Discontiguous buffer support follows the layout of the underlying + * buffer. This makes the implementation as simple as possible. + */ + error = xfs_buf_item_get_format(bip, bp->b_map_count); + ASSERT(error == 0); + + for (i = 0; i < bip->bli_format_count; i++) { + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + + bip->bli_formats[i].blf_type = XFS_LI_BUF; + bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; + bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; + bip->bli_formats[i].blf_map_size = map_size; + } + + /* + * Put the buf item into the list of items attached to the + * buffer at the front. + */ + if (bp->b_fspriv) + bip->bli_item.li_bio_list = bp->b_fspriv; + bp->b_fspriv = bip; +} + + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +static void +xfs_buf_item_log_segment( + uint first, + uint last, + uint *map) +{ + uint first_bit; + uint last_bit; + uint bits_to_set; + uint bits_set; + uint word_num; + uint *wordp; + uint bit; + uint end_bit; + uint mask; + + /* + * Convert byte offsets to bit numbers. + */ + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; + + /* + * Calculate the total number of bits to be set. + */ + bits_to_set = last_bit - first_bit + 1; + + /* + * Get a pointer to the first word in the bitmap + * to set a bit in. + */ + word_num = first_bit >> BIT_TO_WORD_SHIFT; + wordp = &map[word_num]; + + /* + * Calculate the starting bit in the first word. + */ + bit = first_bit & (uint)(NBWORD - 1); + + /* + * First set any bits in the first word of our range. + * If it starts at bit 0 of the word, it will be + * set below rather than here. That is what the variable + * bit tells us. The variable bits_set tracks the number + * of bits that have been set so far. End_bit is the number + * of the last bit to be set in this word plus one. + */ + if (bit) { + end_bit = MIN(bit + bits_to_set, (uint)NBWORD); + mask = ((1 << (end_bit - bit)) - 1) << bit; + *wordp |= mask; + wordp++; + bits_set = end_bit - bit; + } else { + bits_set = 0; + } + + /* + * Now set bits a whole word at a time that are between + * first_bit and last_bit. + */ + while ((bits_to_set - bits_set) >= NBWORD) { + *wordp |= 0xffffffff; + bits_set += NBWORD; + wordp++; + } + + /* + * Finally, set any bits left to be set in one last partial word. + */ + end_bit = bits_to_set - bits_set; + if (end_bit) { + mask = (1 << end_bit) - 1; + *wordp |= mask; + } +} + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + int i; + uint start; + uint end; + struct xfs_buf *bp = bip->bli_buf; + + /* + * walk each buffer segment and mark them dirty appropriately. + */ + start = 0; + for (i = 0; i < bip->bli_format_count; i++) { + if (start > last) + break; + end = start + BBTOB(bp->b_maps[i].bm_len); + if (first > end) { + start += BBTOB(bp->b_maps[i].bm_len); + continue; + } + if (first < start) + first = start; + if (end > last) + end = last; + + xfs_buf_item_log_segment(first, end, + &bip->bli_formats[i].blf_data_map[0]); + + start += bp->b_maps[i].bm_len; + } +} + + +/* + * Return 1 if the buffer has been logged or ordered in a transaction (at any + * point, not just the current transaction) and 0 if not. + */ +uint +xfs_buf_item_dirty( + xfs_buf_log_item_t *bip) +{ + return (bip->bli_flags & XFS_BLI_DIRTY); +} + +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ + xfs_buf_item_free_format(bip); + kmem_zone_free(xfs_buf_item_zone, bip); +} + +/* + * This is called when the buf log item is no longer needed. It should + * free the buf log item associated with the given buffer and clear + * the buffer's pointer to the buf log item. If there are no more + * items in the list, clear the b_iodone field of the buffer (see + * xfs_buf_attach_iodone() below). + */ +void +xfs_buf_item_relse( + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + + bp->b_fspriv = bip->bli_item.li_bio_list; + if (bp->b_fspriv == NULL) + bp->b_iodone = NULL; + + xfs_buf_rele(bp); + xfs_buf_item_free(bip); +} + + +/* + * Add the given log item with its callback to the list of callbacks + * to be called when the buffer's I/O completes. If it is not set + * already, set the buffer's b_iodone() routine to be + * xfs_buf_iodone_callbacks() and link the log item into the list of + * items rooted at b_fsprivate. Items are always added as the second + * entry in the list if there is a first, because the buf item code + * assumes that the buf log item is first. + */ +void +xfs_buf_attach_iodone( + xfs_buf_t *bp, + void (*cb)(xfs_buf_t *, xfs_log_item_t *), + xfs_log_item_t *lip) +{ + xfs_log_item_t *head_lip; + + ASSERT(xfs_buf_islocked(bp)); + + lip->li_cb = cb; + head_lip = bp->b_fspriv; + if (head_lip) { + lip->li_bio_list = head_lip->li_bio_list; + head_lip->li_bio_list = lip; + } else { + bp->b_fspriv = lip; + } + + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + bp->b_iodone = xfs_buf_iodone_callbacks; +} + +/* + * We can have many callbacks on a buffer. Running the callbacks individually + * can cause a lot of contention on the AIL lock, so we allow for a single + * callback to be able to scan the remaining lip->li_bio_list for other items + * of the same type and callback to be processed in the first call. + * + * As a result, the loop walking the callback list below will also modify the + * list. it removes the first item from the list and then runs the callback. + * The loop then restarts from the new head of the list. This allows the + * callback to scan and modify the list attached to the buffer and we don't + * have to care about maintaining a next item pointer. + */ +STATIC void +xfs_buf_do_callbacks( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + while ((lip = bp->b_fspriv) != NULL) { + bp->b_fspriv = lip->li_bio_list; + ASSERT(lip->li_cb != NULL); + /* + * Clear the next pointer so we don't have any + * confusion if the item is added to another buf. + * Don't touch the log item after calling its + * callback, because it could have freed itself. + */ + lip->li_bio_list = NULL; + lip->li_cb(bp, lip); + } +} + +/* + * This is the iodone() function for buffers which have had callbacks + * attached to them by xfs_buf_attach_iodone(). It should remove each + * log item from the buffer's list and call the callback of each in turn. + * When done, the buffer's fsprivate field is set to NULL and the buffer + * is unlocked with a call to iodone(). + */ +void +xfs_buf_iodone_callbacks( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; + + if (likely(!bp->b_error)) + goto do_callbacks; + + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } + + if (bp->b_target != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + xfs_buf_ioerror_alert(bp, __func__); + } + lasttarg = bp->b_target; + + /* + * If the write was asynchronous then no one will be looking for the + * error. Clear the error state and write the buffer out again. + * + * XXX: This helps against transient write errors, but we need to find + * a way to shut the filesystem down if the writes keep failing. + * + * In practice we'll shut the filesystem down soon as non-transient + * errors tend to affect the whole device and a failing log write + * will make us give up. But we really ought to do better here. + */ + if (XFS_BUF_ISASYNC(bp)) { + ASSERT(bp->b_iodone != NULL); + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ + + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { + bp->b_flags |= XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL; + xfs_buf_submit(bp); + } else { + xfs_buf_relse(bp); + } + + return; + } + + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + + trace_xfs_buf_error_relse(bp, _RET_IP_); + +do_callbacks: + xfs_buf_do_callbacks(bp); + bp->b_fspriv = NULL; + bp->b_iodone = NULL; + xfs_buf_ioend(bp); +} + +/* + * This is the iodone() function for buffers which have been + * logged. It is called when they are eventually flushed out. + * It should remove the buf item from the AIL, and free the buf item. + * It is called by xfs_buf_iodone_callbacks() above which will take + * care of cleaning up the buffer itself. + */ +void +xfs_buf_iodone( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(BUF_ITEM(lip)->bli_buf == bp); + + xfs_buf_rele(bp); + + /* + * If we are forcibly shutting down, this may well be + * off the AIL already. That's because we simulate the + * log-committed callbacks to unpin these buffers. Or we may never + * have put this item on AIL because of the transaction was + * aborted forcibly. xfs_trans_ail_delete() takes care of these. + * + * Either way, AIL is useless if we're forcing a shutdown. + */ + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + xfs_buf_item_free(BUF_ITEM(lip)); +} diff --git a/kernel/fs/xfs/xfs_buf_item.h b/kernel/fs/xfs/xfs_buf_item.h new file mode 100644 index 000000000..3f3455a41 --- /dev/null +++ b/kernel/fs/xfs/xfs_buf_item.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BUF_ITEM_H__ +#define __XFS_BUF_ITEM_H__ + +/* kernel only definitions */ + +/* buf log item flags */ +#define XFS_BLI_HOLD 0x01 +#define XFS_BLI_DIRTY 0x02 +#define XFS_BLI_STALE 0x04 +#define XFS_BLI_LOGGED 0x08 +#define XFS_BLI_INODE_ALLOC_BUF 0x10 +#define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 +#define XFS_BLI_ORDERED 0x80 + +#define XFS_BLI_FLAGS \ + { XFS_BLI_HOLD, "HOLD" }, \ + { XFS_BLI_DIRTY, "DIRTY" }, \ + { XFS_BLI_STALE, "STALE" }, \ + { XFS_BLI_LOGGED, "LOGGED" }, \ + { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ + { XFS_BLI_ORDERED, "ORDERED" } + + +struct xfs_buf; +struct xfs_mount; +struct xfs_buf_log_item; + +/* + * This is the in core log item structure used to track information + * needed to log buffers. It tracks how many times the lock has been + * locked, and which 128 byte chunks of the buffer are dirty. + */ +typedef struct xfs_buf_log_item { + xfs_log_item_t bli_item; /* common item structure */ + struct xfs_buf *bli_buf; /* real buffer pointer */ + unsigned int bli_flags; /* misc flags */ + unsigned int bli_recur; /* lock recursion count */ + atomic_t bli_refcount; /* cnt of tp refs */ + int bli_format_count; /* count of headers */ + struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ + struct xfs_buf_log_format __bli_format; /* embedded in-log header */ +} xfs_buf_log_item_t; + +void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +void xfs_buf_item_relse(struct xfs_buf *); +void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +uint xfs_buf_item_dirty(xfs_buf_log_item_t *); +void xfs_buf_attach_iodone(struct xfs_buf *, + void(*)(struct xfs_buf *, xfs_log_item_t *), + xfs_log_item_t *); +void xfs_buf_iodone_callbacks(struct xfs_buf *); +void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); + +extern kmem_zone_t *xfs_buf_item_zone; + +#endif /* __XFS_BUF_ITEM_H__ */ diff --git a/kernel/fs/xfs/xfs_dir2_readdir.c b/kernel/fs/xfs/xfs_dir2_readdir.c new file mode 100644 index 000000000..098cd78fe --- /dev/null +++ b/kernel/fs/xfs/xfs_dir2_readdir.c @@ -0,0 +1,681 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" + +/* + * Directory file type support functions + */ +static unsigned char xfs_dir3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, + DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, +}; + +static unsigned char +xfs_dir3_get_dtype( + struct xfs_mount *mp, + __uint8_t filetype) +{ + if (!xfs_sb_version_hasftype(&mp->m_sb)) + return DT_UNKNOWN; + + if (filetype >= XFS_DIR3_FT_MAX) + return DT_UNKNOWN; + + return xfs_dir3_filetype_table[filetype]; +} + +STATIC int +xfs_dir2_sf_getdents( + struct xfs_da_args *args, + struct dir_context *ctx) +{ + int i; /* shortform entry number */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + xfs_dir2_dataptr_t off; /* current entry's offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + xfs_dir2_dataptr_t dot_offset; + xfs_dir2_dataptr_t dotdot_offset; + xfs_ino_t ino; + struct xfs_da_geometry *geo = args->geo; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Give up if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) + return 0; + + /* + * Precalculate offsets for . and .. as we will always need them. + * + * XXX(hch): the second argument is sometimes 0 and sometimes + * geo->datablk + */ + dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dot_offset); + dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dotdot_offset); + + /* + * Put . entry unless we're starting past it. + */ + if (ctx->pos <= dot_offset) { + ctx->pos = dot_offset & 0x7fffffff; + if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) + return 0; + } + + /* + * Put .. entry unless we're starting past it. + */ + if (ctx->pos <= dotdot_offset) { + ino = dp->d_ops->sf_get_parent_ino(sfp); + ctx->pos = dotdot_offset & 0x7fffffff; + if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) + return 0; + } + + /* + * Loop while there are more entries and put'ing works. + */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + __uint8_t filetype; + + off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + xfs_dir2_sf_get_offset(sfep)); + + if (ctx->pos > off) { + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + continue; + } + + ino = dp->d_ops->sf_get_ino(sfp, sfep); + filetype = dp->d_ops->sf_get_ftype(sfep); + ctx->pos = off & 0x7fffffff; + if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, + xfs_dir3_get_dtype(dp->i_mount, filetype))) + return 0; + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; + return 0; +} + +/* + * Readdir for block directories. + */ +STATIC int +xfs_dir2_block_getdents( + struct xfs_da_args *args, + struct dir_context *ctx) +{ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + xfs_dir2_data_hdr_t *hdr; /* block header */ + struct xfs_buf *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + char *endptr; /* end of the data entries */ + int error; /* error return value */ + char *ptr; /* current data entry */ + int wantoff; /* starting block offset */ + xfs_off_t cook; + struct xfs_da_geometry *geo = args->geo; + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) + return 0; + + error = xfs_dir3_block_read(NULL, dp, &bp); + if (error) + return error; + + /* + * Extract the byte offset we start at from the seek pointer. + * We'll skip entries before this. + */ + wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Set up values for the loop. + */ + btp = xfs_dir2_block_tail_p(geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + while (ptr < endptr) { + __uint8_t filetype; + + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * Unused, skip it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + + /* + * Bump pointer for the next iteration. + */ + ptr += dp->d_ops->data_entsize(dep->namelen); + /* + * The entry is before the desired starting point, skip it. + */ + if ((char *)dep - (char *)hdr < wantoff) + continue; + + cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (char *)dep - (char *)hdr); + + ctx->pos = cook & 0x7fffffff; + filetype = dp->d_ops->data_get_ftype(dep); + /* + * If it didn't fit, set the final offset to here & return. + */ + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(dp->i_mount, filetype))) { + xfs_trans_brelse(NULL, bp); + return 0; + } + } + + /* + * Reached the end of the block. + * Set the offset to a non-existent block 1 and return. + */ + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; + xfs_trans_brelse(NULL, bp); + return 0; +} + +struct xfs_dir2_leaf_map_info { + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_db_t curdb; /* db for current block */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + struct xfs_bmbt_irec map[]; /* map vector for blocks */ +}; + +STATIC int +xfs_dir2_leaf_readbuf( + struct xfs_da_args *args, + size_t bufsize, + struct xfs_dir2_leaf_map_info *mip, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = *bpp; + struct xfs_bmbt_irec *map = mip->map; + struct blk_plug plug; + int error = 0; + int length; + int i; + int j; + struct xfs_da_geometry *geo = args->geo; + + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + mip->map_blocks -= geo->fsbcount; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = geo->fsbcount; i > 0; ) { + j = min_t(int, map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --mip->map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * mip->map_valid); + i -= j; + } + } + + /* + * Recalculate the readahead blocks wanted. + */ + mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1; + ASSERT(mip->ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + mip->ra_want > mip->map_blocks && + mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + mip->nmap = mip->map_size - mip->map_valid; + error = xfs_bmapi_read(dp, mip->map_off, + xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) - + mip->map_off, + &map[mip->map_valid], &mip->nmap, 0); + + /* + * Don't know if we should ignore this or try to return an + * error. The trouble with returning errors is that readdir + * will just stop without actually passing the error through. + */ + if (error) + goto out; /* XXX */ + + /* + * If we got all the mappings we asked for, set the final map + * offset based on the last bmap value received. Otherwise, + * we've reached the end. + */ + if (mip->nmap == mip->map_size - mip->map_valid) { + i = mip->map_valid + mip->nmap - 1; + mip->map_off = map[i].br_startoff + map[i].br_blockcount; + } else + mip->map_off = xfs_dir2_byte_to_da(geo, + XFS_DIR2_LEAF_OFFSET); + + /* + * Look for holes in the mapping, and eliminate them. Count up + * the valid blocks. + */ + for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { + if (map[i].br_startblock == HOLESTARTBLOCK) { + mip->nmap--; + length = mip->map_valid + mip->nmap - i; + if (length) + memmove(&map[i], &map[i + 1], + sizeof(map[i]) * length); + } else { + mip->map_blocks += map[i].br_blockcount; + i++; + } + } + mip->map_valid += mip->nmap; + } + + /* + * No valid mappings, so no more data blocks. + */ + if (!mip->map_valid) { + *curoff = xfs_dir2_da_to_byte(geo, mip->map_off); + goto out; + } + + /* + * Read the directory block starting at the first mapping. + */ + mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff); + error = xfs_dir3_data_read(NULL, dp, map->br_startoff, + map->br_blockcount >= geo->fsbcount ? + XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) : + -1, &bp); + /* + * Should just skip over the data block instead of giving up. + */ + if (error) + goto out; /* XXX */ + + /* + * Adjust the current amount of read-ahead: we just read a block that + * was previously ra. + */ + if (mip->ra_current) + mip->ra_current -= geo->fsbcount; + + /* + * Do we need more readahead? + */ + blk_start_plug(&plug); + for (mip->ra_index = mip->ra_offset = i = 0; + mip->ra_want > mip->ra_current && i < mip->map_blocks; + i += geo->fsbcount) { + ASSERT(mip->ra_index < mip->map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > mip->ra_current && + map[mip->ra_index].br_blockcount >= geo->fsbcount) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + mip->ra_offset, + XFS_FSB_TO_DADDR(dp->i_mount, + map[mip->ra_index].br_startblock + + mip->ra_offset)); + mip->ra_current = i; + } + + /* + * Read-ahead a non-contiguous directory block. This doesn't + * use our mapping, but this is a very rare case. + */ + else if (i > mip->ra_current) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + + mip->ra_offset, -1); + mip->ra_current = i; + } + + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < geo->fsbcount; j += length ) { + /* + * The rest of this extent but not more than a dir + * block. + */ + length = min_t(int, geo->fsbcount, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + mip->ra_offset += length; + + /* + * Advance to the next mapping if this one is used up. + */ + if (mip->ra_offset == map[mip->ra_index].br_blockcount) { + mip->ra_offset = 0; + mip->ra_index++; + } + } + } + blk_finish_plug(&plug); + +out: + *bpp = bp; + return error; +} + +/* + * Getdents (readdir) for leaf and node directories. + * This reads the data blocks only, so is the same for both forms. + */ +STATIC int +xfs_dir2_leaf_getdents( + struct xfs_da_args *args, + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; /* data block buffer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + int error = 0; /* error return value */ + int length; /* temporary length value */ + int byteoff; /* offset in current block */ + xfs_dir2_off_t curoff; /* current overall offset */ + xfs_dir2_off_t newoff; /* new curoff after new blk */ + char *ptr = NULL; /* pointer to current data */ + struct xfs_dir2_leaf_map_info *map_info; + struct xfs_da_geometry *geo = args->geo; + + /* + * If the offset is at or past the largest allowed value, + * give up right away. + */ + if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) + return 0; + + /* + * Set up to bmap a number of blocks based on the caller's + * buffer size, the directory block size, and the filesystem + * block size. + */ + length = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); + map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + + (length * sizeof(struct xfs_bmbt_irec)), + KM_SLEEP | KM_NOFS); + map_info->map_size = length; + + /* + * Inside the loop we keep the main offset value as a byte offset + * in the directory file. + */ + curoff = xfs_dir2_dataptr_to_byte(ctx->pos); + + /* + * Force this conversion through db so we truncate the offset + * down to get the start of the data block. + */ + map_info->map_off = xfs_dir2_db_to_da(geo, + xfs_dir2_byte_to_db(geo, curoff)); + + /* + * Loop over directory entries until we reach the end offset. + * Get more blocks and readahead as necessary. + */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + __uint8_t filetype; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + + error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, + &curoff, &bp); + if (error || !map_info->map_valid) + break; + + /* + * Having done a read, we need to set a new offset. + */ + newoff = xfs_dir2_db_off_to_byte(geo, + map_info->curdb, 0); + /* + * Start of the current block. + */ + if (curoff < newoff) + curoff = newoff; + /* + * Make sure we're in the right block. + */ + else if (curoff > newoff) + ASSERT(xfs_dir2_byte_to_db(geo, curoff) == + map_info->curdb); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Find our position in the block. + */ + ptr = (char *)dp->d_ops->data_entry_p(hdr); + byteoff = xfs_dir2_byte_to_off(geo, curoff); + /* + * Skip past the header. + */ + if (byteoff == 0) + curoff += dp->d_ops->data_entry_offset; + /* + * Skip past entries until we reach our offset. + */ + else { + while ((char *)ptr - (char *)hdr < byteoff) { + dup = (xfs_dir2_data_unused_t *)ptr; + + if (be16_to_cpu(dup->freetag) + == XFS_DIR2_DATA_FREE_TAG) { + + length = be16_to_cpu(dup->length); + ptr += length; + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + length = + dp->d_ops->data_entsize(dep->namelen); + ptr += length; + } + /* + * Now set our real offset. + */ + curoff = + xfs_dir2_db_off_to_byte(geo, + xfs_dir2_byte_to_db(geo, curoff), + (char *)ptr - (char *)hdr); + if (ptr >= (char *)hdr + geo->blksize) { + continue; + } + } + } + /* + * We have a pointer to an entry. + * Is it a live one? + */ + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * No, it's unused, skip over it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + ptr += length; + curoff += length; + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + length = dp->d_ops->data_entsize(dep->namelen); + filetype = dp->d_ops->data_get_ftype(dep); + + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(dp->i_mount, filetype))) + break; + + /* + * Advance to next entry in the block. + */ + ptr += length; + curoff += length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; + } + + /* + * All done. Set output offset value to current offset. + */ + if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) + ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + else + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + kmem_free(map_info); + if (bp) + xfs_trans_brelse(NULL, bp); + return error; +} + +/* + * Read a directory. + */ +int +xfs_readdir( + struct xfs_inode *dp, + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_da_args args = { NULL }; + int rval; + int v; + uint lock_mode; + + trace_xfs_readdir(dp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_getdents); + + args.dp = dp; + args.geo = dp->i_mount->m_dir_geo; + + lock_mode = xfs_ilock_data_map_shared(dp); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_getdents(&args, ctx); + else if ((rval = xfs_dir2_isblock(&args, &v))) + ; + else if (v) + rval = xfs_dir2_block_getdents(&args, ctx); + else + rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + xfs_iunlock(dp, lock_mode); + + return rval; +} diff --git a/kernel/fs/xfs/xfs_discard.c b/kernel/fs/xfs/xfs_discard.c new file mode 100644 index 000000000..e85a9519a --- /dev/null +++ b/kernel/fs/xfs/xfs_discard.c @@ -0,0 +1,239 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_discard.h" +#include "xfs_trace.h" +#include "xfs_log.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_ge(cur, 0, + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + xfs_daddr_t dbno; + xfs_extlen_t dlen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + + /* + * use daddr format for all range/len calculations as that is + * the format the range/len variables are supplied in by + * userspace. + */ + dbno = XFS_AGB_TO_DADDR(mp, agno, fbno); + dlen = XFS_FSB_TO_BB(mp, flen); + + /* + * Too small? Give up. + */ + if (dlen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (dbno + dlen < start || dbno > end) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_extent_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +/* + * trim a range of the filesystem. + * + * Note: the parameters passed from userspace are byte ranges into the + * filesystem which does not match to the format we use for filesystem block + * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format + * is a linear address range. Hence we need to use DADDR based conversions and + * comparisons for determining the correct offset and regions to trim. + */ +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_daddr_t start, end, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, urange, sizeof(range))) + return -EFAULT; + + /* + * Truncating down the len isn't actually quite correct, but using + * BBTOB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. + */ + if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) + return -EINVAL; + + start = BTOBB(range.start); + end = start + BTOBBT(range.len) - 1; + minlen = BTOBB(max_t(u64, granularity, range.minlen)); + + if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) + end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; + + start_agno = xfs_daddr_to_agno(mp, start); + end_agno = xfs_daddr_to_agno(mp, end); + + for (agno = start_agno; agno <= end_agno; agno++) { + error = xfs_trim_extents(mp, agno, start, end, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -EFAULT; + return 0; +} + +int +xfs_discard_extents( + struct xfs_mount *mp, + struct list_head *list) +{ + struct xfs_extent_busy *busyp; + int error = 0; + + list_for_each_entry(busyp, list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, 0); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llu,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + return error; + } + } + + return 0; +} diff --git a/kernel/fs/xfs/xfs_discard.h b/kernel/fs/xfs/xfs_discard.h new file mode 100644 index 000000000..344879aea --- /dev/null +++ b/kernel/fs/xfs/xfs_discard.h @@ -0,0 +1,10 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; +struct list_head; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); + +#endif /* XFS_DISCARD_H */ diff --git a/kernel/fs/xfs/xfs_dquot.c b/kernel/fs/xfs/xfs_dquot.c new file mode 100644 index 000000000..02c01bbbc --- /dev/null +++ b/kernel/fs/xfs/xfs_dquot.c @@ -0,0 +1,1104 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" + +/* + * Lock order: + * + * ip->i_lock + * qi->qi_tree_lock + * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_flush (xfs_dqflock() and friends) + * qi->qi_lru_lock + * + * If two dquots need to be locked the order is user before group/project, + * otherwise by the lowest id first, see xfs_dqlock2. + */ + +#ifdef DEBUG +xfs_buftarg_t *xfs_dqerror_target; +int xfs_do_dqerror; +int xfs_dqreq_num; +int xfs_dqerror_mod = 33; +#endif + +struct kmem_zone *xfs_qm_dqtrxzone; +static struct kmem_zone *xfs_qm_dqzone; + +static struct lock_class_key xfs_dquot_group_class; +static struct lock_class_key xfs_dquot_project_class; + +/* + * This is called to free all the memory associated with a dquot + */ +void +xfs_qm_dqdestroy( + xfs_dquot_t *dqp) +{ + ASSERT(list_empty(&dqp->q_lru)); + + mutex_destroy(&dqp->q_qlock); + kmem_zone_free(xfs_qm_dqzone, dqp); + + XFS_STATS_DEC(xs_qm_dquot); +} + +/* + * If default limits are in force, push them into the dquot now. + * We overwrite the dquot limits only if they are zero and this + * is not the root dquot. + */ +void +xfs_qm_adjust_dqlimits( + struct xfs_mount *mp, + struct xfs_dquot *dq) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + int prealloc = 0; + + ASSERT(d->d_id); + + if (q->qi_bsoftlimit && !d->d_blk_softlimit) { + d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); + prealloc = 1; + } + if (q->qi_bhardlimit && !d->d_blk_hardlimit) { + d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + prealloc = 1; + } + if (q->qi_isoftlimit && !d->d_ino_softlimit) + d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); + if (q->qi_ihardlimit && !d->d_ino_hardlimit) + d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); + if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) + d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); + if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) + d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + + if (prealloc) + xfs_dquot_set_prealloc_limits(dq); +} + +/* + * Check the limits and timers of a dquot and start or reset timers + * if necessary. + * This gets called even when quota enforcement is OFF, which makes our + * life a little less complicated. (We just don't reject any quota + * reservations in that case, when enforcement is off). + * We also return 0 as the values of the timers in Q_GETQUOTA calls, when + * enforcement's off. + * In contrast, warnings are a little different in that they don't + * 'automatically' get started when limits get exceeded. They do + * get reset to zero, however, when we find the count to be under + * the soft limit (they are only ever set non-zero via userspace). + */ +void +xfs_qm_adjust_dqtimers( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + ASSERT(d->d_id); + +#ifdef DEBUG + if (d->d_blk_hardlimit) + ASSERT(be64_to_cpu(d->d_blk_softlimit) <= + be64_to_cpu(d->d_blk_hardlimit)); + if (d->d_ino_hardlimit) + ASSERT(be64_to_cpu(d->d_ino_softlimit) <= + be64_to_cpu(d->d_ino_hardlimit)); + if (d->d_rtb_hardlimit) + ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= + be64_to_cpu(d->d_rtb_hardlimit)); +#endif + + if (!d->d_btimer) { + if ((d->d_blk_softlimit && + (be64_to_cpu(d->d_bcount) > + be64_to_cpu(d->d_blk_softlimit))) || + (d->d_blk_hardlimit && + (be64_to_cpu(d->d_bcount) > + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_btimelimit); + } else { + d->d_bwarns = 0; + } + } else { + if ((!d->d_blk_softlimit || + (be64_to_cpu(d->d_bcount) <= + be64_to_cpu(d->d_blk_softlimit))) && + (!d->d_blk_hardlimit || + (be64_to_cpu(d->d_bcount) <= + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = 0; + } + } + + if (!d->d_itimer) { + if ((d->d_ino_softlimit && + (be64_to_cpu(d->d_icount) > + be64_to_cpu(d->d_ino_softlimit))) || + (d->d_ino_hardlimit && + (be64_to_cpu(d->d_icount) > + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_itimelimit); + } else { + d->d_iwarns = 0; + } + } else { + if ((!d->d_ino_softlimit || + (be64_to_cpu(d->d_icount) <= + be64_to_cpu(d->d_ino_softlimit))) && + (!d->d_ino_hardlimit || + (be64_to_cpu(d->d_icount) <= + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = 0; + } + } + + if (!d->d_rtbtimer) { + if ((d->d_rtb_softlimit && + (be64_to_cpu(d->d_rtbcount) > + be64_to_cpu(d->d_rtb_softlimit))) || + (d->d_rtb_hardlimit && + (be64_to_cpu(d->d_rtbcount) > + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_rtbtimelimit); + } else { + d->d_rtbwarns = 0; + } + } else { + if ((!d->d_rtb_softlimit || + (be64_to_cpu(d->d_rtbcount) <= + be64_to_cpu(d->d_rtb_softlimit))) && + (!d->d_rtb_hardlimit || + (be64_to_cpu(d->d_rtbcount) <= + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = 0; + } + } +} + +/* + * initialize a buffer full of dquots and log the whole thing + */ +STATIC void +xfs_qm_init_dquot_blk( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_buf_t *bp) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + xfs_dqblk_t *d; + int curid, i; + + ASSERT(tp); + ASSERT(xfs_buf_islocked(bp)); + + d = bp->b_addr; + + /* + * ID of the first dquot in the block - id's are zero based. + */ + curid = id - (id % q->qi_dqperchunk); + ASSERT(curid >= 0); + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(curid); + d->dd_diskdq.d_flags = type; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + } + + xfs_trans_dquot_buf(tp, bp, + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); +} + +/* + * Initialize the dynamic speculative preallocation thresholds. The lo/hi + * watermarks correspond to the soft and hard limits by default. If a soft limit + * is not specified, we use 95% of the hard limit. + */ +void +xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) +{ + __uint64_t space; + + dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!dqp->q_prealloc_lo_wmark) { + dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; + do_div(dqp->q_prealloc_lo_wmark, 100); + dqp->q_prealloc_lo_wmark *= 95; + } + + space = dqp->q_prealloc_hi_wmark; + + do_div(space, 100); + dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; + dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; +} + +/* + * Allocate a block and fill it with dquots. + * This is called when the bmapi finds a hole. + */ +STATIC int +xfs_qm_dqalloc( + xfs_trans_t **tpp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + xfs_inode_t *quotip, + xfs_fileoff_t offset_fsb, + xfs_buf_t **O_bpp) +{ + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + xfs_bmbt_irec_t map; + int nmaps, error, committed; + xfs_buf_t *bp; + xfs_trans_t *tp = *tpp; + + ASSERT(tp != NULL); + + trace_xfs_dqalloc(dqp); + + /* + * Initialize the bmap freelist prior to calling bmapi code. + */ + xfs_bmap_init(&flist, &firstblock); + xfs_ilock(quotip, XFS_ILOCK_EXCL); + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return -ESRCH; + } + + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); + nmaps = 1; + error = xfs_bmapi_write(tp, quotip, offset_fsb, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist); + if (error) + goto error0; + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(nmaps == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + /* + * Keep track of the blkno to save a lookup later + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + /* now we can just get the buffer (there's nothing to read yet) */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0); + if (!bp) { + error = -ENOMEM; + goto error1; + } + bp->b_ops = &xfs_dquot_buf_ops; + + /* + * Make a chunk of dquots out of this buffer and log + * the entire thing. + */ + xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + + /* + * xfs_bmap_finish() may commit the current transaction and + * start a second transaction if the freelist is not empty. + * + * Since we still want to modify this buffer, we need to + * ensure that the buffer is not released on commit of + * the first transaction and ensure the buffer is added to the + * second transaction. + * + * If there is only one transaction then don't stop the buffer + * from being released when it commits later on. + */ + + xfs_trans_bhold(tp, bp); + + if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + goto error1; + } + + if (committed) { + tp = *tpp; + xfs_trans_bjoin(tp, bp); + } else { + xfs_trans_bhold_release(tp, bp); + } + + *O_bpp = bp; + return 0; + + error1: + xfs_bmap_cancel(&flist); + error0: + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + + return error; +} + +STATIC int +xfs_qm_dqrepair( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_dquot *dqp, + xfs_dqid_t firstid, + struct xfs_buf **bpp) +{ + int error; + struct xfs_disk_dquot *ddq; + struct xfs_dqblk *d; + int i; + + /* + * Read the buffer without verification so we get the corrupted + * buffer returned to us. make sure we verify it on write, though. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, bpp, NULL); + + if (error) { + ASSERT(*bpp == NULL); + return error; + } + (*bpp)->b_ops = &xfs_dquot_buf_ops; + + ASSERT(xfs_buf_islocked(*bpp)); + d = (struct xfs_dqblk *)(*bpp)->b_addr; + + /* Do the actual repair of dquots in this buffer */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + ddq = &d[i].dd_diskdq; + error = xfs_dqcheck(mp, ddq, firstid + i, + dqp->dq_flags & XFS_DQ_ALLTYPES, + XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); + if (error) { + /* repair failed, we're screwed */ + xfs_trans_brelse(tp, *bpp); + return -EIO; + } + } + + return 0; +} + +/* + * Maps a dquot to the buffer containing its on-disk version. + * This returns a ptr to the buffer containing the on-disk dquot + * in the bpp param, and a ptr to the on-disk dquot within that buffer + */ +STATIC int +xfs_qm_dqtobp( + xfs_trans_t **tpp, + xfs_dquot_t *dqp, + xfs_disk_dquot_t **O_ddpp, + xfs_buf_t **O_bpp, + uint flags) +{ + struct xfs_bmbt_irec map; + int nmaps = 1, error; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); + struct xfs_mount *mp = dqp->q_mount; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); + struct xfs_trans *tp = (tpp ? *tpp : NULL); + uint lock_mode; + + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + + lock_mode = xfs_ilock_data_map_shared(quotip); + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + /* + * Return if this type of quotas is turned off while we + * didn't have the quota inode lock. + */ + xfs_iunlock(quotip, lock_mode); + return -ESRCH; + } + + /* + * Find the block map; no allocations yet + */ + error = xfs_bmapi_read(quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); + + xfs_iunlock(quotip, lock_mode); + if (error) + return error; + + ASSERT(nmaps == 1); + ASSERT(map.br_blockcount == 1); + + /* + * Offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); + + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) { + /* + * We don't allocate unless we're asked to + */ + if (!(flags & XFS_QMOPT_DQALLOC)) + return -ENOENT; + + ASSERT(tp); + error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, + dqp->q_fileoffset, &bp); + if (error) + return error; + tp = *tpp; + } else { + trace_xfs_dqtobp_read(dqp); + + /* + * store the blkno etc so that we don't have to do the + * mapping all the time + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp, &xfs_dquot_buf_ops); + + if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { + xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * + mp->m_quotainfo->qi_dqperchunk; + ASSERT(bp == NULL); + error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); + } + + if (error) { + ASSERT(bp == NULL); + return error; + } + } + + ASSERT(xfs_buf_islocked(bp)); + *O_bpp = bp; + *O_ddpp = bp->b_addr + dqp->q_bufoffset; + + return 0; +} + + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. + * + * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. + */ +int +xfs_qm_dqread( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + uint flags, + struct xfs_dquot **O_dqpp) +{ + struct xfs_dquot *dqp; + struct xfs_disk_dquot *ddqp; + struct xfs_buf *bp; + struct xfs_trans *tp = NULL; + int error; + int cancelflags = 0; + + + dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); + + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + INIT_LIST_HEAD(&dqp->q_lru); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); + + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + /* + * Make sure group quotas have a different lock class than user + * quotas. + */ + switch (type) { + case XFS_DQ_USER: + /* uses the default lock class */ + break; + case XFS_DQ_GROUP: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); + break; + case XFS_DQ_PROJ: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); + break; + default: + ASSERT(0); + break; + } + + XFS_STATS_INC(xs_qm_dquot); + + trace_xfs_dqread(dqp); + + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0); + if (error) + goto error1; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + + /* + * get a pointer to the on-disk dquot and the buffer containing it + * dqp already knows its own type (GROUP/USER). + */ + error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); + if (error) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error1; + } + + /* copy everything from disk dquot to the incore dquot */ + memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + xfs_qm_dquot_logitem_init(dqp); + + /* + * Reservation counters are defined as reservation plus current usage + * to avoid having to add every time. + */ + dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); + dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); + dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + + /* initialize the dquot speculative prealloc thresholds */ + xfs_dquot_set_prealloc_limits(dqp); + + /* Mark the buf so that this will stay incore a little longer */ + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) + * So we need to release with xfs_trans_brelse(). + * The strategy here is identical to that of inodes; we lock + * the dquot in xfs_qm_dqget() before making it accessible to + * others. This is because dquots, like inodes, need a good level of + * concurrency, and we don't want to take locks on the entire buffers + * for dquot accesses. + * Note also that the dquot buffer may even be dirty at this point, if + * this particular dquot was repaired. We still aren't afraid to + * brelse it because we have the changes incore. + */ + ASSERT(xfs_buf_islocked(bp)); + xfs_trans_brelse(tp, bp); + + if (tp) { + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; + } + + *O_dqpp = dqp; + return error; + +error1: + if (tp) + xfs_trans_cancel(tp, cancelflags); +error0: + xfs_qm_dqdestroy(dqp); + *O_dqpp = NULL; + return error; +} + +/* + * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a + * a locked dquot, doing an allocation (if requested) as needed. + * When both an inode and an id are given, the inode's id takes precedence. + * That is, if the id changes while we don't hold the ilock inside this + * function, the new dquot is returned, not necessarily the one requested + * in the id argument. + */ +int +xfs_qm_dqget( + xfs_mount_t *mp, + xfs_inode_t *ip, /* locked inode (optional) */ + xfs_dqid_t id, /* uid/projid/gid depending on type */ + uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ + uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ + xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct xfs_dquot *dqp; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || + (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || + (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { + return -ESRCH; + } + +#ifdef DEBUG + if (xfs_do_dqerror) { + if ((xfs_dqerror_target == mp->m_ddev_targp) && + (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { + xfs_debug(mp, "Returning error in dqget"); + return -EIO; + } + } + + ASSERT(type == XFS_DQ_USER || + type == XFS_DQ_PROJ || + type == XFS_DQ_GROUP); + if (ip) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_inode_dquot(ip, type) == NULL); + } +#endif + +restart: + mutex_lock(&qi->qi_tree_lock); + dqp = radix_tree_lookup(tree, id); + if (dqp) { + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_freeing(dqp); + delay(1); + goto restart; + } + + dqp->q_nrefs++; + mutex_unlock(&qi->qi_tree_lock); + + trace_xfs_dqget_hit(dqp); + XFS_STATS_INC(xs_qm_dqcachehits); + *O_dqpp = dqp; + return 0; + } + mutex_unlock(&qi->qi_tree_lock); + XFS_STATS_INC(xs_qm_dqcachemisses); + + /* + * Dquot cache miss. We don't want to keep the inode lock across + * a (potential) disk read. Also we don't want to deal with the lock + * ordering between quotainode and this inode. OTOH, dropping the inode + * lock here means dealing with a chown that can happen before + * we re-acquire the lock. + */ + if (ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqread(mp, id, type, flags, &dqp); + + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (error) + return error; + + if (ip) { + /* + * A dquot could be attached to this inode by now, since + * we had dropped the ilock. + */ + if (xfs_this_quota_on(mp, type)) { + struct xfs_dquot *dqp1; + + dqp1 = xfs_inode_dquot(ip, type); + if (dqp1) { + xfs_qm_dqdestroy(dqp); + dqp = dqp1; + xfs_dqlock(dqp); + goto dqret; + } + } else { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return -ESRCH; + } + } + + mutex_lock(&qi->qi_tree_lock); + error = radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + + /* + * Duplicate found. Just throw away the new dquot and start + * over. + */ + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + xfs_qm_dqdestroy(dqp); + XFS_STATS_INC(xs_qm_dquot_dups); + goto restart; + } + + /* + * We return a locked dquot to the caller, with a reference taken + */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + qi->qi_dquots++; + mutex_unlock(&qi->qi_tree_lock); + + dqret: + ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return 0; +} + +/* + * Release a reference to the dquot (decrement ref-count) and unlock it. + * + * If there is a group quota attached to this dquot, carefully release that + * too without tripping over deadlocks'n'stuff. + */ +void +xfs_qm_dqput( + struct xfs_dquot *dqp) +{ + ASSERT(dqp->q_nrefs > 0); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + trace_xfs_dqput(dqp); + + if (--dqp->q_nrefs == 0) { + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; + trace_xfs_dqput_free(dqp); + + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + XFS_STATS_INC(xs_qm_dquot_unused); + } + xfs_dqunlock(dqp); +} + +/* + * Release a dquot. Flush it if dirty, then dqput() it. + * dquot must not be locked. + */ +void +xfs_qm_dqrele( + xfs_dquot_t *dqp) +{ + if (!dqp) + return; + + trace_xfs_dqrele(dqp); + + xfs_dqlock(dqp); + /* + * We don't care to flush it if the dquot is dirty here. + * That will create stutters that we want to avoid. + * Instead we do a delayed write when we try to reclaim + * a dirty dquot. Also xfs_sync will take part of the burden... + */ + xfs_qm_dqput(dqp); +} + +/* + * This is the dquot flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the dquot is + * flushed to disk. It is responsible for removing the dquot logitem + * from the AIL if it has not been re-logged, and unlocking the dquot's + * flush lock. This behavior is very similar to that of inodes.. + */ +STATIC void +xfs_qm_dqflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; + xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_ail *ailp = lip->li_ailp; + + /* + * We only want to pull the item from the AIL if its + * location in the log has not changed since we started the flush. + * Thus, we only bother if the dquot's lsn has + * not changed. First we check the lsn outside the lock + * since it's cheaper, and then we recheck while + * holding the lock before removing the dquot from the AIL. + */ + if ((lip->li_flags & XFS_LI_IN_AIL) && + lip->li_lsn == qip->qli_flush_lsn) { + + /* xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); + if (lip->li_lsn == qip->qli_flush_lsn) + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&ailp->xa_lock); + } + + /* + * Release the dq's flush lock since we're done with it. + */ + xfs_dqfunlock(dqp); +} + +/* + * Write a modified dquot to disk. + * The dquot must be locked and the flush lock too taken by caller. + * The flush lock will not be unlocked until the dquot reaches the disk, + * but the dquot is free to be unlocked and modified by the caller + * in the interim. Dquot is still locked on return. This behavior is + * identical to that of inodes. + */ +int +xfs_qm_dqflush( + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddqp; + int error; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); + + trace_xfs_dqflush(dqp); + + *bpp = NULL; + + xfs_qm_dqunpin_wait(dqp); + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this dquot + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an emptry AIL as part of the unmount process. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; + dqp->dq_flags &= ~XFS_DQ_DIRTY; + + spin_lock(&mp->m_ail->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(mp->m_ail, lip, + SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&mp->m_ail->xa_lock); + error = -EIO; + goto out_unlock; + } + + /* + * Get the buffer containing the on-disk dquot + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); + if (error) + goto out_unlock; + + /* + * Calculate the location of the dquot inside the buffer. + */ + ddqp = bp->b_addr + dqp->q_bufoffset; + + /* + * A simple sanity check in case we got a corrupted dquot.. + */ + error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, + XFS_QMOPT_DOWARN, "dqflush (incore copy)"); + if (error) { + xfs_buf_relse(bp); + xfs_dqfunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EIO; + } + + /* This is the only portion of data that needs to persist */ + memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); + + /* + * Clear the dirty field and remember the flush lsn for later use. + */ + dqp->dq_flags &= ~XFS_DQ_DIRTY; + + xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, + &dqp->q_logitem.qli_item.li_lsn); + + /* + * copy the lsn into the on-disk dquot now while we have the in memory + * dquot here. This can't be done later in the write verifier as we + * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; + + dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + /* + * Attach an iodone routine so that we can remove this dquot from the + * AIL and release the flush lock once the dquot is synced to disk. + */ + xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, + &dqp->q_logitem.qli_item); + + /* + * If the buffer is pinned then push on the log so we won't + * get stuck waiting in the write for too long. + */ + if (xfs_buf_ispinned(bp)) { + trace_xfs_dqflush_force(dqp); + xfs_log_force(mp, 0); + } + + trace_xfs_dqflush_done(dqp); + *bpp = bp; + return 0; + +out_unlock: + xfs_dqfunlock(dqp); + return -EIO; +} + +/* + * Lock two xfs_dquot structures. + * + * To avoid deadlocks we always lock the quota structure with + * the lowerd id first. + */ +void +xfs_dqlock2( + xfs_dquot_t *d1, + xfs_dquot_t *d2) +{ + if (d1 && d2) { + ASSERT(d1 != d2); + if (be32_to_cpu(d1->q_core.d_id) > + be32_to_cpu(d2->q_core.d_id)) { + mutex_lock(&d2->q_qlock); + mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); + } else { + mutex_lock(&d1->q_qlock); + mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); + } + } else if (d1) { + mutex_lock(&d1->q_qlock); + } else if (d2) { + mutex_lock(&d2->q_qlock); + } +} + +int __init +xfs_qm_init(void) +{ + xfs_qm_dqzone = + kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot"); + if (!xfs_qm_dqzone) + goto out; + + xfs_qm_dqtrxzone = + kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx"); + if (!xfs_qm_dqtrxzone) + goto out_free_dqzone; + + return 0; + +out_free_dqzone: + kmem_zone_destroy(xfs_qm_dqzone); +out: + return -ENOMEM; +} + +void +xfs_qm_exit(void) +{ + kmem_zone_destroy(xfs_qm_dqtrxzone); + kmem_zone_destroy(xfs_qm_dqzone); +} diff --git a/kernel/fs/xfs/xfs_dquot.h b/kernel/fs/xfs/xfs_dquot.h new file mode 100644 index 000000000..2f536f33c --- /dev/null +++ b/kernel/fs/xfs/xfs_dquot.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_H__ +#define __XFS_DQUOT_H__ + +/* + * Dquots are structures that hold quota information about a user or a group, + * much like inodes are for files. In fact, dquots share many characteristics + * with inodes. However, dquots can also be a centralized resource, relative + * to a collection of inodes. In this respect, dquots share some characteristics + * of the superblock. + * XFS dquots exploit both those in its algorithms. They make every attempt + * to not be a bottleneck when quotas are on and have minimal impact, if any, + * when quotas are off. + */ + +struct xfs_mount; +struct xfs_trans; + +enum { + XFS_QLOWSP_1_PCNT = 0, + XFS_QLOWSP_3_PCNT, + XFS_QLOWSP_5_PCNT, + XFS_QLOWSP_MAX +}; + +/* + * The incore dquot structure + */ +typedef struct xfs_dquot { + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_lru; /* global free list of dquots */ + struct xfs_mount*q_mount; /* filesystem this relates to */ + struct xfs_trans*q_transp; /* trans this belongs to currently */ + uint q_nrefs; /* # active refs from inodes */ + xfs_daddr_t q_blkno; /* blkno of dquot buffer */ + int q_bufoffset; /* off of dq in buffer (# dquots) */ + xfs_fileoff_t q_fileoffset; /* offset in quotas file */ + + xfs_disk_dquot_t q_core; /* actual usage & quotas */ + xfs_dq_logitem_t q_logitem; /* dquot log item */ + xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ + xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ + xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ + int64_t q_low_space[XFS_QLOWSP_MAX]; + struct mutex q_qlock; /* quota lock */ + struct completion q_flush; /* flush completion queue */ + atomic_t q_pincount; /* dquot pin count */ + wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ +} xfs_dquot_t; + +/* + * Lock hierarchy for q_qlock: + * XFS_QLOCK_NORMAL is the implicit default, + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + */ +enum { + XFS_QLOCK_NORMAL = 0, + XFS_QLOCK_NESTED, +}; + +/* + * Manage the q_flush completion queue embedded in the dquot. This completion + * queue synchronizes processes attempting to flush the in-core dquot back to + * disk. + */ +static inline void xfs_dqflock(xfs_dquot_t *dqp) +{ + wait_for_completion(&dqp->q_flush); +} + +static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp) +{ + return try_wait_for_completion(&dqp->q_flush); +} + +static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +{ + complete(&dqp->q_flush); +} + +static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) +{ + return mutex_trylock(&dqp->q_qlock); +} + +static inline void xfs_dqlock(struct xfs_dquot *dqp) +{ + mutex_lock(&dqp->q_qlock); +} + +static inline void xfs_dqunlock(struct xfs_dquot *dqp) +{ + mutex_unlock(&dqp->q_qlock); +} + +static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return XFS_IS_UQUOTA_ON(mp); + case XFS_DQ_GROUP: + return XFS_IS_GQUOTA_ON(mp); + case XFS_DQ_PROJ: + return XFS_IS_PQUOTA_ON(mp); + default: + return 0; + } +} + +static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return ip->i_udquot; + case XFS_DQ_GROUP: + return ip->i_gdquot; + case XFS_DQ_PROJ: + return ip->i_pdquot; + default: + return NULL; + } +} + +/* + * Check whether a dquot is under low free space conditions. We assume the quota + * is enabled and enforced. + */ +static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) +{ + int64_t freesp; + + freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount; + if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) + return true; + + return false; +} + +#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) +#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) +#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) +#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) + +extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, + uint, struct xfs_dquot **); +extern void xfs_qm_dqdestroy(xfs_dquot_t *); +extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); +extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); +extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, + xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, + struct xfs_dquot *); +extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, + xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern void xfs_qm_dqput(xfs_dquot_t *); + +extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); + +extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); + +static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) +{ + xfs_dqlock(dqp); + dqp->q_nrefs++; + xfs_dqunlock(dqp); + return dqp; +} + +#endif /* __XFS_DQUOT_H__ */ diff --git a/kernel/fs/xfs/xfs_dquot_item.c b/kernel/fs/xfs/xfs_dquot_item.c new file mode 100644 index 000000000..814cff94e --- /dev/null +++ b/kernel/fs/xfs/xfs_dquot_item.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_log.h" + +static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_dq_logitem, qli_item); +} + +/* + * returns the number of iovecs needed to log the given dquot item. + */ +STATIC void +xfs_qm_dquot_logitem_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 2; + *nbytes += sizeof(struct xfs_dq_logformat) + + sizeof(struct xfs_disk_dquot); +} + +/* + * fills in the vector of log iovecs for the given dquot log item. + */ +STATIC void +xfs_qm_dquot_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + struct xfs_dq_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); + qlf->qlf_type = XFS_LI_DQUOT; + qlf->qlf_size = 2; + qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_blkno = qlip->qli_dquot->q_blkno; + qlf->qlf_len = 1; + qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, + &qlip->qli_dquot->q_core, + sizeof(struct xfs_disk_dquot)); +} + +/* + * Increment the pin count of the given dquot. + */ +STATIC void +xfs_qm_dquot_logitem_pin( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + atomic_inc(&dqp->q_pincount); +} + +/* + * Decrement the pin count of the given dquot, and wake up + * anyone in xfs_dqwait_unpin() if the count goes to 0. The + * dquot must have been previously pinned with a call to + * xfs_qm_dquot_logitem_pin(). + */ +STATIC void +xfs_qm_dquot_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(atomic_read(&dqp->q_pincount) > 0); + if (atomic_dec_and_test(&dqp->q_pincount)) + wake_up(&dqp->q_pinwait); +} + +STATIC xfs_lsn_t +xfs_qm_dquot_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + /* + * We always re-log the entire dquot when it becomes dirty, + * so, the latest copy _is_ the only one that matters. + */ + return lsn; +} + +/* + * This is called to wait for the given dquot to be unpinned. + * Most of these pin/unpin routines are plagiarized from inode code. + */ +void +xfs_qm_dqunpin_wait( + struct xfs_dquot *dqp) +{ + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (atomic_read(&dqp->q_pincount) == 0) + return; + + /* + * Give the log a push so we don't wait here too long. + */ + xfs_log_force(dqp->q_mount, 0); + wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); +} + +STATIC uint +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; + + if (atomic_read(&dqp->q_pincount) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_dqlock_nowait(dqp)) + return XFS_ITEM_LOCKED; + + /* + * Re-check the pincount now that we stabilized the value by + * taking the quota lock. + */ + if (atomic_read(&dqp->q_pincount) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the dquot. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ + if (!xfs_dqflock_nowait(dqp)) { + rval = XFS_ITEM_FLUSHING; + goto out_unlock; + } + + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + } else { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_dqunlock(dqp); + return rval; +} + +/* + * Unlock the dquot associated with the log item. + * Clear the fields of the dquot and dquot log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the dquot. + */ +STATIC void +xfs_qm_dquot_logitem_unlock( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + /* + * Clear the transaction pointer in the dquot + */ + dqp->q_transp = NULL; + + /* + * dquots are never 'held' from getting unlocked at the end of + * a transaction. Their locking and unlocking is hidden inside the + * transaction layer, within trans_commit. Hence, no LI_HOLD flag + * for the logitem. + */ + xfs_dqunlock(dqp); +} + +/* + * this needs to stamp an lsn into the dquot, I think. + * rpc's that look at user dquot's would then have to + * push on the dependency recorded in the dquot + */ +STATIC void +xfs_qm_dquot_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector for dquots + */ +static const struct xfs_item_ops xfs_dquot_item_ops = { + .iop_size = xfs_qm_dquot_logitem_size, + .iop_format = xfs_qm_dquot_logitem_format, + .iop_pin = xfs_qm_dquot_logitem_pin, + .iop_unpin = xfs_qm_dquot_logitem_unpin, + .iop_unlock = xfs_qm_dquot_logitem_unlock, + .iop_committed = xfs_qm_dquot_logitem_committed, + .iop_push = xfs_qm_dquot_logitem_push, + .iop_committing = xfs_qm_dquot_logitem_committing +}; + +/* + * Initialize the dquot log item for a newly allocated dquot. + * The dquot isn't locked at this point, but it isn't on any of the lists + * either, so we don't care. + */ +void +xfs_qm_dquot_logitem_init( + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *lp = &dqp->q_logitem; + + xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, + &xfs_dquot_item_ops); + lp->qli_dquot = dqp; +} + +/*------------------ QUOTAOFF LOG ITEMS -------------------*/ + +static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_qoff_logitem, qql_item); +} + + +/* + * This returns the number of iovecs needed to log the given quotaoff item. + * We only need 1 iovec for an quotaoff item. It just logs the + * quotaoff_log_format structure. + */ +STATIC void +xfs_qm_qoff_logitem_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_qoff_logitem); +} + +STATIC void +xfs_qm_qoff_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + struct xfs_qoff_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); + qlf->qf_type = XFS_LI_QUOTAOFF; + qlf->qf_size = 1; + qlf->qf_flags = qflip->qql_flags; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); +} + +/* + * Pinning has no meaning for an quotaoff item, so just return. + */ +STATIC void +xfs_qm_qoff_logitem_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an quotaoff item, unpinning does + * not either. + */ +STATIC void +xfs_qm_qoff_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push a quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Quotaoff items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +STATIC void +xfs_qm_qoff_logitem_unlock( + struct xfs_log_item *lip) +{ +} + +/* + * The quotaoff-start-item is logged only once and cannot be moved in the log, + * so simply return the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_qm_qoff_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +STATIC xfs_lsn_t +xfs_qm_qoffend_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); + struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; + struct xfs_ail *ailp = qfs->qql_item.li_ailp; + + /* + * Delete the qoff-start logitem from the AIL. + * xfs_trans_ail_delete() drops the AIL lock. + */ + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + + kmem_free(qfs); + kmem_free(qfe); + return (xfs_lsn_t)-1; +} + +/* + * XXX rcc - don't know quite what to do with this. I think we can + * just ignore it. The only time that isn't the case is if we allow + * the client to somehow see that quotas have been turned off in which + * we can't allow that to get back until the quotaoff hits the disk. + * So how would that happen? Also, do we need different routines for + * quotaoff start and quotaoff end? I suspect the answer is yes but + * to be sure, I need to look at the recovery code and see how quota off + * recovery is handled (do we roll forward or back or do something else). + * If we roll forwards or backwards, then we need two separate routines, + * one that does nothing and one that stamps in the lsn that matters + * (truly makes the quotaoff irrevocable). If we do something else, + * then maybe we don't need two. + */ +STATIC void +xfs_qm_qoff_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ +} + +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoffend_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * This is the ops vector shared by all quotaoff-start log items. + */ +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoff_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * Allocate and initialize an quotaoff item of the correct quota type(s). + */ +struct xfs_qoff_logitem * +xfs_qm_qoff_logitem_init( + struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags) +{ + struct xfs_qoff_logitem *qf; + + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); + + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); + qf->qql_item.li_mountp = mp; + qf->qql_start_lip = start; + qf->qql_flags = flags; + return qf; +} diff --git a/kernel/fs/xfs/xfs_dquot_item.h b/kernel/fs/xfs/xfs_dquot_item.h new file mode 100644 index 000000000..502e94646 --- /dev/null +++ b/kernel/fs/xfs/xfs_dquot_item.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_ITEM_H__ +#define __XFS_DQUOT_ITEM_H__ + +struct xfs_dquot; +struct xfs_trans; +struct xfs_mount; +struct xfs_qoff_logitem; + +typedef struct xfs_dq_logitem { + xfs_log_item_t qli_item; /* common portion */ + struct xfs_dquot *qli_dquot; /* dquot ptr */ + xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ +} xfs_dq_logitem_t; + +typedef struct xfs_qoff_logitem { + xfs_log_item_t qql_item; /* common portion */ + struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ + unsigned int qql_flags; +} xfs_qoff_logitem_t; + + +extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *); +extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *, + struct xfs_qoff_logitem *, uint); +extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *, uint); +extern void xfs_trans_log_quotaoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *); + +#endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/kernel/fs/xfs/xfs_error.c b/kernel/fs/xfs/xfs_error.c new file mode 100644 index 000000000..338e50bbf --- /dev/null +++ b/kernel/fs/xfs/xfs_error.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_fs.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_error.h" + +#ifdef DEBUG + +int xfs_etest[XFS_NUM_INJECT_ERROR]; +int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; +char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; +int xfs_error_test_active; + +int +xfs_error_test(int error_tag, int *fsidp, char *expression, + int line, char *file, unsigned long randfactor) +{ + int i; + int64_t fsid; + + if (prandom_u32() % randfactor) + return 0; + + memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { + xfs_warn(NULL, + "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", + expression, file, line, xfs_etest_fsname[i]); + return 1; + } + } + + return 0; +} + +int +xfs_errortag_add(int error_tag, xfs_mount_t *mp) +{ + int i; + int len; + int64_t fsid; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { + xfs_warn(mp, "error tag #%d on", error_tag); + return 0; + } + } + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == 0) { + xfs_warn(mp, "Turned on XFS error tag #%d", + error_tag); + xfs_etest[i] = error_tag; + xfs_etest_fsid[i] = fsid; + len = strlen(mp->m_fsname); + xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); + strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; + return 0; + } + } + + xfs_warn(mp, "error tag overflow, too many turned on"); + + return 1; +} + +int +xfs_errortag_clearall(xfs_mount_t *mp, int loud) +{ + int64_t fsid; + int cleared = 0; + int i; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && + xfs_etest[i] != 0) { + cleared = 1; + xfs_warn(mp, "Clearing XFS error tag #%d", + xfs_etest[i]); + xfs_etest[i] = 0; + xfs_etest_fsid[i] = 0LL; + kmem_free(xfs_etest_fsname[i]); + xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; + } + } + + if (loud || cleared) + xfs_warn(mp, "Cleared all XFS error tags for filesystem"); + + return 0; +} +#endif /* DEBUG */ + +void +xfs_error_report( + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) { + xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, + "Internal error %s at line %d of file %s. Caller %pS", + tag, linenum, filename, ra); + + xfs_stack_trace(); + } +} + +void +xfs_corruption_error( + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) + xfs_hex_dump(p, 64); + xfs_error_report(tag, level, mp, filename, linenum, ra); + xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); +} + +/* + * Warnings specifically for verifier errors. Differentiate CRC vs. invalid + * values, and omit the stack trace unless the error level is tuned high. + */ +void +xfs_verifier_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", + __return_address, bp->b_bn); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:"); + xfs_hex_dump(xfs_buf_offset(bp, 0), 64); + } + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} diff --git a/kernel/fs/xfs/xfs_error.h b/kernel/fs/xfs/xfs_error.h new file mode 100644 index 000000000..c0394ed12 --- /dev/null +++ b/kernel/fs/xfs/xfs_error.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ERROR_H__ +#define __XFS_ERROR_H__ + +struct xfs_mount; + +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); +extern void xfs_verifier_error(struct xfs_buf *bp); + +#define XFS_ERROR_REPORT(e, lvl, mp) \ + xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) +#define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \ + xfs_corruption_error(e, lvl, mp, mem, \ + __FILE__, __LINE__, __return_address) + +#define XFS_ERRLEVEL_OFF 0 +#define XFS_ERRLEVEL_LOW 1 +#define XFS_ERRLEVEL_HIGH 5 + +/* + * Macros to set EFSCORRUPTED & return/branch. + */ +#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ + XFS_ERRLEVEL_LOW, mp); \ + error = -EFSCORRUPTED; \ + goto l; \ + } \ + } + +#define XFS_WANT_CORRUPTED_RETURN(mp, x) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ + XFS_ERRLEVEL_LOW, mp); \ + return -EFSCORRUPTED; \ + } \ + } + +/* + * error injection tags - the labels can be anything you want + * but each tag should have its own unique number + */ + +#define XFS_ERRTAG_NOERROR 0 +#define XFS_ERRTAG_IFLUSH_1 1 +#define XFS_ERRTAG_IFLUSH_2 2 +#define XFS_ERRTAG_IFLUSH_3 3 +#define XFS_ERRTAG_IFLUSH_4 4 +#define XFS_ERRTAG_IFLUSH_5 5 +#define XFS_ERRTAG_IFLUSH_6 6 +#define XFS_ERRTAG_DA_READ_BUF 7 +#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8 +#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9 +#define XFS_ERRTAG_ALLOC_READ_AGF 10 +#define XFS_ERRTAG_IALLOC_READ_AGI 11 +#define XFS_ERRTAG_ITOBP_INOTOBP 12 +#define XFS_ERRTAG_IUNLINK 13 +#define XFS_ERRTAG_IUNLINK_REMOVE 14 +#define XFS_ERRTAG_DIR_INO_VALIDATE 15 +#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16 +#define XFS_ERRTAG_IODONE_IOERR 17 +#define XFS_ERRTAG_STRATREAD_IOERR 18 +#define XFS_ERRTAG_STRATCMPL_IOERR 19 +#define XFS_ERRTAG_DIOWRITE_IOERR 20 +#define XFS_ERRTAG_BMAPIFORMAT 21 +#define XFS_ERRTAG_MAX 22 + +/* + * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. + */ +#define XFS_RANDOM_DEFAULT 100 +#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) +#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT + +#ifdef DEBUG +extern int xfs_error_test_active; +extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); + +#define XFS_NUM_INJECT_ERROR 10 +#define XFS_TEST_ERROR(expr, mp, tag, rf) \ + ((expr) || (xfs_error_test_active && \ + xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ + (rf)))) + +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); +#else +#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) +#define xfs_errortag_add(tag, mp) (ENOSYS) +#define xfs_errortag_clearall(mp, loud) (ENOSYS) +#endif /* DEBUG */ + +/* + * XFS panic tags -- allow a call to xfs_alert_tag() be turned into + * a panic by setting xfs_panic_mask in a sysctl. + */ +#define XFS_NO_PTAG 0 +#define XFS_PTAG_IFLUSH 0x00000001 +#define XFS_PTAG_LOGRES 0x00000002 +#define XFS_PTAG_AILDELETE 0x00000004 +#define XFS_PTAG_ERROR_REPORT 0x00000008 +#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 +#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 +#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 +#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 + +#endif /* __XFS_ERROR_H__ */ diff --git a/kernel/fs/xfs/xfs_export.c b/kernel/fs/xfs/xfs_export.c new file mode 100644 index 000000000..652cd3c5b --- /dev/null +++ b/kernel/fs/xfs/xfs_export.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_export.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" +#include "xfs_pnfs.h" + +/* + * Note that we only accept fileids which are long enough rather than allow + * the parent generation number to default to zero. XFS considers zero a + * valid generation number not an invalid/wildcard value. + */ +static int xfs_fileid_length(int fileid_type) +{ + switch (fileid_type) { + case FILEID_INO32_GEN: + return 2; + case FILEID_INO32_GEN_PARENT: + return 4; + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + return 3; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + return 6; + } + return FILEID_INVALID; +} + +STATIC int +xfs_fs_encode_fh( + struct inode *inode, + __u32 *fh, + int *max_len, + struct inode *parent) +{ + struct fid *fid = (struct fid *)fh; + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; + int fileid_type; + int len; + + /* Directories don't need their parent encoded, they have ".." */ + if (!parent) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + + /* + * If the the filesystem may contain 64bit inode numbers, we need + * to use larger file handles that can represent them. + * + * While we only allocate inodes that do not fit into 32 bits any + * large enough filesystem may contain them, thus the slightly + * confusing looking conditional below. + */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || + (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) + fileid_type |= XFS_FILEID_TYPE_64FLAG; + + /* + * Only encode if there is enough space given. In practice + * this means we can't export a filesystem with 64bit inodes + * over NFSv2 with the subtree_check export option; the other + * seven combinations work. The real answer is "don't use v2". + */ + len = xfs_fileid_length(fileid_type); + if (*max_len < len) { + *max_len = len; + return FILEID_INVALID; + } + *max_len = len; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + fid->i32.parent_ino = XFS_I(parent)->i_ino; + fid->i32.parent_gen = parent->i_generation; + /*FALLTHRU*/ + case FILEID_INO32_GEN: + fid->i32.ino = XFS_I(inode)->i_ino; + fid->i32.gen = inode->i_generation; + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + fid64->parent_ino = XFS_I(parent)->i_ino; + fid64->parent_gen = parent->i_generation; + /*FALLTHRU*/ + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + fid64->ino = XFS_I(inode)->i_ino; + fid64->gen = inode->i_generation; + break; + } + + return fileid_type; +} + +STATIC struct inode * +xfs_nfs_get_inode( + struct super_block *sb, + u64 ino, + u32 generation) + { + xfs_mount_t *mp = XFS_M(sb); + xfs_inode_t *ip; + int error; + + /* + * NFS can sometimes send requests for ino 0. Fail them gracefully. + */ + if (ino == 0) + return ERR_PTR(-ESTALE); + + /* + * The XFS_IGET_UNTRUSTED means that an invalid inode number is just + * fine and not an indication of a corrupted filesystem as clients can + * send invalid file handles and we have to handle it gracefully.. + */ + error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip); + if (error) { + /* + * EINVAL means the inode cluster doesn't exist anymore. + * This implies the filehandle is stale, so we should + * translate it here. + * We don't use ESTALE directly down the chain to not + * confuse applications using bulkstat that expect EINVAL. + */ + if (error == -EINVAL || error == -ENOENT) + error = -ESTALE; + return ERR_PTR(error); + } + + if (ip->i_d.di_gen != generation) { + IRELE(ip); + return ERR_PTR(-ESTALE); + } + + return VFS_I(ip); +} + +STATIC struct dentry * +xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + case FILEID_INO32_GEN: + inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, + fid->i32.parent_gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->parent_ino, + fid64->parent_gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_get_parent( + struct dentry *child) +{ + int error; + struct xfs_inode *cip; + + error = xfs_lookup(XFS_I(d_inode(child)), &xfs_name_dotdot, &cip, NULL); + if (unlikely(error)) + return ERR_PTR(error); + + return d_obtain_alias(VFS_I(cip)); +} + +STATIC int +xfs_fs_nfs_commit_metadata( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + +const struct export_operations xfs_export_operations = { + .encode_fh = xfs_fs_encode_fh, + .fh_to_dentry = xfs_fs_fh_to_dentry, + .fh_to_parent = xfs_fs_fh_to_parent, + .get_parent = xfs_fs_get_parent, + .commit_metadata = xfs_fs_nfs_commit_metadata, +#ifdef CONFIG_NFSD_PNFS + .get_uuid = xfs_fs_get_uuid, + .map_blocks = xfs_fs_map_blocks, + .commit_blocks = xfs_fs_commit_blocks, +#endif +}; diff --git a/kernel/fs/xfs/xfs_export.h b/kernel/fs/xfs/xfs_export.h new file mode 100644 index 000000000..3272b6ae7 --- /dev/null +++ b/kernel/fs/xfs/xfs_export.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXPORT_H__ +#define __XFS_EXPORT_H__ + +/* + * Common defines for code related to exporting XFS filesystems over NFS. + * + * The NFS fileid goes out on the wire as an array of + * 32bit unsigned ints in host order. There are 5 possible + * formats. + * + * (1) fileid_type=0x00 + * (no fileid data; handled by the generic code) + * + * (2) fileid_type=0x01 + * inode-num + * generation + * + * (3) fileid_type=0x02 + * inode-num + * generation + * parent-inode-num + * parent-generation + * + * (4) fileid_type=0x81 + * inode-num-lo32 + * inode-num-hi32 + * generation + * + * (5) fileid_type=0x82 + * inode-num-lo32 + * inode-num-hi32 + * generation + * parent-inode-num-lo32 + * parent-inode-num-hi32 + * parent-generation + * + * Note, the NFS filehandle also includes an fsid portion which + * may have an inode number in it. That number is hardcoded to + * 32bits and there is no way for XFS to intercept it. In + * practice this means when exporting an XFS filesystem with 64bit + * inodes you should either export the mountpoint (rather than + * a subdirectory) or use the "fsid" export option. + */ + +struct xfs_fid64 { + u64 ino; + u32 gen; + u64 parent_ino; + u32 parent_gen; +} __attribute__((packed)); + +/* This flag goes on the wire. Don't play with it. */ +#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ + +#endif /* __XFS_EXPORT_H__ */ diff --git a/kernel/fs/xfs/xfs_extent_busy.c b/kernel/fs/xfs/xfs_extent_busy.c new file mode 100644 index 000000000..c263e0792 --- /dev/null +++ b/kernel/fs/xfs/xfs_extent_busy.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_alloc.h" +#include "xfs_extent_busy.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_log.h" + +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_extent_busy *new; + struct xfs_extent_busy *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_extent_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +int +xfs_extent_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_extent_busy *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entry's block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_extent_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) __releases(&pag->pagb_lock) + __acquires(&pag->pagb_lock) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. + */ + if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. + */ +void +xfs_extent_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +void +xfs_extent_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { + if (!xfs_extent_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. + */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +STATIC void +xfs_extent_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp) +{ + if (busyp->length) { + trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. + */ +void +xfs_extent_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_extent_busy *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + else + xfs_extent_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_extent_busy_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_extent_busy, list)->agno - + container_of(b, struct xfs_extent_busy, list)->agno; +} diff --git a/kernel/fs/xfs/xfs_extent_busy.h b/kernel/fs/xfs/xfs_extent_busy.h new file mode 100644 index 000000000..bfff284d2 --- /dev/null +++ b/kernel/fs/xfs/xfs_extent_busy.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTENT_BUSY_H__ +#define __XFS_EXTENT_BUSY_H__ + +struct xfs_mount; +struct xfs_trans; +struct xfs_alloc_arg; + +/* + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_extent_busy_insert() for details. + */ +struct xfs_extent_busy { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +void +xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +void +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, + xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); + +int +xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_extent_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_extent_busy_ag_cmp); +} + +#endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/kernel/fs/xfs/xfs_extfree_item.c b/kernel/fs/xfs/xfs_extfree_item.c new file mode 100644 index 000000000..cb7fe64cd --- /dev/null +++ b/kernel/fs/xfs/xfs_extfree_item.c @@ -0,0 +1,507 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_extfree_item.h" +#include "xfs_log.h" + + +kmem_zone_t *xfs_efi_zone; +kmem_zone_t *xfs_efd_zone; + +static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efi_log_item, efi_item); +} + +void +xfs_efi_item_free( + struct xfs_efi_log_item *efip) +{ + if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) + kmem_free(efip); + else + kmem_zone_free(xfs_efi_zone, efip); +} + +/* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. + */ +STATIC void +__xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + struct xfs_ail *ailp = efip->efi_item.li_ailp; + + if (atomic_dec_and_test(&efip->efi_refcount)) { + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); + xfs_efi_item_free(efip); + } +} + +/* + * This returns the number of iovecs needed to log the given efi item. + * We only need 1 iovec for an efi item. It just logs the efi_log_format + * structure. + */ +static inline int +xfs_efi_item_sizeof( + struct xfs_efi_log_item *efip) +{ + return sizeof(struct xfs_efi_log_format) + + (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void +xfs_efi_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efi log item. We use only 1 iovec, and we point that + * at the efi_log_format structure embedded in the efi item. + * It is at this point that we assert that all of the extent + * slots in the efi item have been filled. + */ +STATIC void +xfs_efi_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&efip->efi_next_extent) == + efip->efi_format.efi_nextents); + + efip->efi_format.efi_type = XFS_LI_EFI; + efip->efi_format.efi_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, + &efip->efi_format, + xfs_efi_item_sizeof(efip)); +} + + +/* + * Pinning has no meaning for an efi item, so just return. + */ +STATIC void +xfs_efi_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * While EFIs cannot really be pinned, the unpin operation is the last place at + * which the EFI is manipulated during a transaction. If we are being asked to + * remove the EFI it's because the transaction has been cancelled and by + * definition that means the EFI cannot be in the AIL so remove it from the + * transaction and free it. Otherwise coordinate with xfs_efi_release() + * to determine who gets to free the EFI. + */ +STATIC void +xfs_efi_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + if (remove) { + ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); + if (lip->li_desc) + xfs_trans_del_item(lip); + xfs_efi_item_free(efip); + return; + } + __xfs_efi_release(efip); +} + +/* + * Efi items have no locking or pushing. However, since EFIs are pulled from + * the AIL when their corresponding EFDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the EFI out of + * the AIL. + */ +STATIC uint +xfs_efi_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +STATIC void +xfs_efi_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efi_item_free(EFI_ITEM(lip)); +} + +/* + * The EFI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_efi_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The EFI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_efi_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all efi log items. + */ +static const struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_pin = xfs_efi_item_pin, + .iop_unpin = xfs_efi_item_unpin, + .iop_unlock = xfs_efi_item_unlock, + .iop_committed = xfs_efi_item_committed, + .iop_push = xfs_efi_item_push, + .iop_committing = xfs_efi_item_committing +}; + + +/* + * Allocate and initialize an efi item with the given number of extents. + */ +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_efi_log_item *efip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efi_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efip = kmem_zalloc(size, KM_SLEEP); + } else { + efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); + efip->efi_format.efi_nextents = nextents; + efip->efi_format.efi_id = (__psint_t)(void*)efip; + atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); + + return efip; +} + +/* + * Copy an EFI format buffer from the given buf, and into the destination + * EFI format structure. + * The given buffer can be in 32 bit or 64 bit form (which has different padding), + * one of which will be the native format for this kernel. + * It will handle the conversion of formats if necessary. + */ +int +xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) +{ + xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; + uint i; + uint len = sizeof(xfs_efi_log_format_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); + uint len32 = sizeof(xfs_efi_log_format_32_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); + uint len64 = sizeof(xfs_efi_log_format_64_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + + if (buf->i_len == len) { + memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + return 0; + } else if (buf->i_len == len32) { + xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; + + dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; + dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; + dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents; + dst_efi_fmt->efi_id = src_efi_fmt_32->efi_id; + for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { + dst_efi_fmt->efi_extents[i].ext_start = + src_efi_fmt_32->efi_extents[i].ext_start; + dst_efi_fmt->efi_extents[i].ext_len = + src_efi_fmt_32->efi_extents[i].ext_len; + } + return 0; + } else if (buf->i_len == len64) { + xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; + + dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; + dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; + dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents; + dst_efi_fmt->efi_id = src_efi_fmt_64->efi_id; + for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { + dst_efi_fmt->efi_extents[i].ext_start = + src_efi_fmt_64->efi_extents[i].ext_start; + dst_efi_fmt->efi_extents[i].ext_len = + src_efi_fmt_64->efi_extents[i].ext_len; + } + return 0; + } + return -EFSCORRUPTED; +} + +/* + * This is called by the efd item code below to release references to the given + * efi item. Each efd calls this with the number of extents that it has + * logged, and when the sum of these reaches the total number of extents logged + * by this efi item we can free the efi item. + */ +void +xfs_efi_release(xfs_efi_log_item_t *efip, + uint nextents) +{ + ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + __xfs_efi_release(efip); + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ + } +} + +static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efd_log_item, efd_item); +} + +STATIC void +xfs_efd_item_free(struct xfs_efd_log_item *efdp) +{ + if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) + kmem_free(efdp); + else + kmem_zone_free(xfs_efd_zone, efdp); +} + +/* + * This returns the number of iovecs needed to log the given efd item. + * We only need 1 iovec for an efd item. It just logs the efd_log_format + * structure. + */ +static inline int +xfs_efd_item_sizeof( + struct xfs_efd_log_item *efdp) +{ + return sizeof(xfs_efd_log_format_t) + + (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void +xfs_efd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efd log item. We use only 1 iovec, and we point that + * at the efd_log_format structure embedded in the efd item. + * It is at this point that we assert that all of the extent + * slots in the efd item have been filled. + */ +STATIC void +xfs_efd_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); + + efdp->efd_format.efd_type = XFS_LI_EFD; + efdp->efd_format.efd_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, + &efdp->efd_format, + xfs_efd_item_sizeof(efdp)); +} + +/* + * Pinning has no meaning for an efd item, so just return. + */ +STATIC void +xfs_efd_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an efd item, unpinning does + * not either. + */ +STATIC void +xfs_efd_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an efd item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_efd_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +STATIC void +xfs_efd_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efd_item_free(EFD_ITEM(lip)); +} + +/* + * When the efd item is committed to disk, all we need to do + * is delete our reference to our partner efi item and then + * free ourselves. Since we're freeing ourselves we must + * return -1 to keep the transaction code from further referencing + * this item. + */ +STATIC xfs_lsn_t +xfs_efd_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + /* + * If we got a log I/O error, it's always the case that the LR with the + * EFI got unpinned and freed before the EFD got aborted. + */ + if (!(lip->li_flags & XFS_LI_ABORTED)) + xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); + + xfs_efd_item_free(efdp); + return (xfs_lsn_t)-1; +} + +/* + * The EFD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_efd_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all efd log items. + */ +static const struct xfs_item_ops xfs_efd_item_ops = { + .iop_size = xfs_efd_item_size, + .iop_format = xfs_efd_item_format, + .iop_pin = xfs_efd_item_pin, + .iop_unpin = xfs_efd_item_unpin, + .iop_unlock = xfs_efd_item_unlock, + .iop_committed = xfs_efd_item_committed, + .iop_push = xfs_efd_item_push, + .iop_committing = xfs_efd_item_committing +}; + +/* + * Allocate and initialize an efd item with the given number of extents. + */ +struct xfs_efd_log_item * +xfs_efd_init( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip, + uint nextents) + +{ + struct xfs_efd_log_item *efdp; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efd_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efdp = kmem_zalloc(size, KM_SLEEP); + } else { + efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = nextents; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return efdp; +} diff --git a/kernel/fs/xfs/xfs_extfree_item.h b/kernel/fs/xfs/xfs_extfree_item.h new file mode 100644 index 000000000..0ffbce32d --- /dev/null +++ b/kernel/fs/xfs/xfs_extfree_item.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTFREE_ITEM_H__ +#define __XFS_EXTFREE_ITEM_H__ + +/* kernel only EFI/EFD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_EFI_MAX_FAST_EXTENTS 16 + +/* + * Define EFI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_EFI_RECOVERED 1 + +/* + * This is the "extent free intention" log item. It is used to log the fact + * that some extents need to be free. It is used in conjunction with the + * "extent free done" log item described below. + * + * The EFI is reference counted so that it is not freed prior to both the EFI + * and EFD being committed and unpinned. This ensures that when the last + * reference goes away the EFI will always be in the AIL as it has been + * unpinned, regardless of whether the EFD is processed before or after the EFI. + */ +typedef struct xfs_efi_log_item { + xfs_log_item_t efi_item; + atomic_t efi_refcount; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ + xfs_efi_log_format_t efi_format; +} xfs_efi_log_item_t; + +/* + * This is the "extent free done" log item. It is used to log + * the fact that some extents earlier mentioned in an efi item + * have been freed. + */ +typedef struct xfs_efd_log_item { + xfs_log_item_t efd_item; + xfs_efi_log_item_t *efd_efip; + uint efd_next_extent; + xfs_efd_log_format_t efd_format; +} xfs_efd_log_item_t; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_EFD_MAX_FAST_EXTENTS 16 + +extern struct kmem_zone *xfs_efi_zone; +extern struct kmem_zone *xfs_efd_zone; + +xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); +xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, + uint); +int xfs_efi_copy_format(xfs_log_iovec_t *buf, + xfs_efi_log_format_t *dst_efi_fmt); +void xfs_efi_item_free(xfs_efi_log_item_t *); + +#endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/kernel/fs/xfs/xfs_file.c b/kernel/fs/xfs/xfs_file.c new file mode 100644 index 000000000..3b7591224 --- /dev/null +++ b/kernel/fs/xfs/xfs_file.c @@ -0,0 +1,1534 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_icache.h" +#include "xfs_pnfs.h" + +#include +#include +#include + +static const struct vm_operations_struct xfs_file_vm_ops; + +/* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +int +xfs_iozero( + struct xfs_inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count) /* size of data to zero */ +{ + struct page *page; + struct address_space *mapping; + int status; + + mapping = VFS_I(ip)->i_mapping; + do { + unsigned offset, bytes; + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; + + zero_user(page, offset, bytes); + + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + pos += bytes; + count -= bytes; + status = 0; + } while (count); + + return status; +} + +int +xfs_update_prealloc_flags( + struct xfs_inode *ip, + enum xfs_prealloc_flags flags) +{ + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); + error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + if (!(flags & XFS_PREALLOC_INVISIBLE)) { + ip->i_d.di_mode &= ~S_ISUID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + + if (flags & XFS_PREALLOC_SET) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + if (flags & XFS_PREALLOC_CLEAR) + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (flags & XFS_PREALLOC_SYNC) + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); +} + +/* + * Fsync operations on directories are much simpler than on regular files, + * as there is no file data to flush, and thus also no need for explicit + * cache flush operations, and there are no non-transaction metadata updates + * on directories either. + */ +STATIC int +xfs_dir_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct xfs_inode *ip = XFS_I(file->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + trace_xfs_dir_fsync(ip); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + +STATIC int +xfs_file_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = 0; + int log_flushed = 0; + xfs_lsn_t lsn = 0; + + trace_xfs_file_fsync(ip); + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + + if (mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an RT and/or log subvolume we need to make sure + * to flush the write cache the device used for file data + * first. This is to ensure newly written file data make + * it to disk before logging the new inode size in case of + * an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + } + + /* + * All metadata updates are logged, which means that we just have + * to flush the log up to the latest LSN that touched the inode. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) { + if (!datasync || + (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) + lsn = ip->i_itemp->ili_last_lsn; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (lsn) + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + + /* + * If we only have a single device, and the log force about was + * a no-op we might have to flush the data device cache here. + * This can only happen for fdatasync/O_DSYNC if we were overwriting + * an already allocated file and thus do not have any metadata to + * commit. + */ + if ((mp->m_flags & XFS_MOUNT_BARRIER) && + mp->m_logdev_targp == mp->m_ddev_targp && + !XFS_IS_REALTIME_INODE(ip) && + !log_flushed) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + + return error; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + size_t size = iov_iter_count(to); + ssize_t ret = 0; + int ioflags = 0; + xfs_fsize_t n; + loff_t pos = iocb->ki_pos; + + XFS_STATS_INC(xs_read_calls); + + if (unlikely(iocb->ki_flags & IOCB_DIRECT)) + ioflags |= XFS_IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= XFS_IO_INVIS; + + if (unlikely(ioflags & XFS_IO_ISDIRECT)) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + /* DIO must be aligned to device logical sector size */ + if ((pos | size) & target->bt_logical_sectormask) { + if (pos == i_size_read(inode)) + return 0; + return -EINVAL; + } + } + + n = mp->m_super->s_maxbytes - pos; + if (n <= 0 || size == 0) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + + if (inode->i_mapping->nrpages) { + ret = filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + pos, pos + size - 1); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } + + /* + * Invalidate whole pages. This can return an error if + * we fail to invalidate a page, but this should never + * happen on XFS. Warn if it does fail. + */ + ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + size - 1) >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; + } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } + + trace_xfs_file_read(ip, size, pos, ioflags); + + ret = generic_file_read_iter(iocb, to); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +STATIC ssize_t +xfs_file_splice_read( + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, + unsigned int flags) +{ + struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); + int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); + + if (infilp->f_mode & FMODE_NOCMTIME) + ioflags |= XFS_IO_INVIS; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + + trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last block of the + * file that is beyond the EOF. We do this since the size is being increased + * without writing anything to that block and we don't want to read the + * garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + struct xfs_inode *ip, + xfs_fsize_t offset, + xfs_fsize_t isize, + bool *did_zeroing) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); + int zero_offset = XFS_B_FSB_OFFSET(mp, isize); + int zero_len; + int nimaps = 1; + int error = 0; + struct xfs_bmbt_irec imap; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + ASSERT(nimaps > 0); + + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) + return 0; + + zero_len = mp->m_sb.sb_blocksize - zero_offset; + if (isize + zero_len > offset) + zero_len = offset - isize; + *did_zeroing = true; + return xfs_iozero(ip, isize, zero_len); +} + +/* + * Zero any on disk space between the current EOF and the new, larger EOF. + * + * This handles the normal case of zeroing the remainder of the last block in + * the file and the unusual case of zeroing blocks out beyond the size of the + * file. This second case only happens with fixed size extents and when the + * system crashes before the inode size was updated but after blocks were + * allocated. + * + * Expects the iolock to be held exclusive, and will take the ilock internally. + */ +int /* error (positive) */ +xfs_zero_eof( + struct xfs_inode *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize, /* current inode size */ + bool *did_zeroing) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + struct xfs_bmbt_irec imap; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(offset > isize); + + /* + * First handle zeroing the block on which isize resides. + * + * We only zero a part of that block so it is handled specially. + */ + if (XFS_B_FSB_OFFSET(mp, isize) != 0) { + error = xfs_zero_last_block(ip, offset, isize, did_zeroing); + if (error) + return error; + } + + /* + * Calculate the range between the new size and the old where blocks + * needing to be zeroed may exist. + * + * To get the block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back to a block + * boundary. We subtract 1 in case the size is exactly on a block + * boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, + &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks we need to zero. + */ + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; + + error = xfs_iozero(ip, zero_off, zero_len); + if (error) + return error; + + *did_zeroing = true; + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + } + + return 0; +} + +/* + * Common pre-write limit and setup checks. + * + * Called with the iolocked held either shared and exclusive according to + * @iolock, and returns with it held. Might upgrade the iolock to exclusive + * if called for a direct write beyond i_size. + */ +STATIC ssize_t +xfs_file_aio_write_checks( + struct kiocb *iocb, + struct iov_iter *from, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t error = 0; + size_t count = iov_iter_count(from); + +restart: + error = generic_write_checks(iocb, from); + if (error <= 0) + return error; + + error = xfs_break_layouts(inode, iolock, true); + if (error) + return error; + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. If zeroing is needed and we are currently holding the + * iolock shared, we need to update it to exclusive which implies + * having to redo all checks before. + * + * We need to serialise against EOF updates that occur in IO + * completions here. We want to make sure that nobody is changing the + * size while we do this check until we have placed an IO barrier (i.e. + * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. + * The spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value + * and hence be able to correctly determine if we need to run zeroing. + */ + spin_lock(&ip->i_flags_lock); + if (iocb->ki_pos > i_size_read(inode)) { + bool zero = false; + + spin_unlock(&ip->i_flags_lock); + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + + /* + * We now have an IO submission barrier in place, but + * AIO can do EOF updates during IO completion and hence + * we now need to wait for all of them to drain. Non-AIO + * DIO will have drained before we are given the + * XFS_IOLOCK_EXCL, and so for most cases this wait is a + * no-op. + */ + inode_dio_wait(inode); + goto restart; + } + error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); + if (error) + return error; + } else + spin_unlock(&ip->i_flags_lock); + + /* + * Updating the timestamps will grab the ilock again from + * xfs_fs_dirty_inode, so we have to call it after dropping the + * lock above. Eventually we should look into a way to avoid + * the pointless lock roundtrip. + */ + if (likely(!(file->f_mode & FMODE_NOCMTIME))) { + error = file_update_time(file); + if (error) + return error; + } + + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + return file_remove_suid(file); +} + +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. inode_dio_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; + loff_t end; + struct iov_iter data; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + /* DIO must be aligned to device logical sector size */ + if ((pos | count) & target->bt_logical_sectormask) + return -EINVAL; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + /* + * We don't need to take an exclusive lock unless there page cache needs + * to be invalidated or unaligned IO is being executed. We don't need to + * consider the EOF extension case here because + * xfs_file_aio_write_checks() will relock the inode as necessary for + * EOF zeroing cases and fill out the new inode size as appropriate. + */ + if (unaligned_io || mapping->nrpages) + iolock = XFS_IOLOCK_EXCL; + else + iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, iolock); + + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, iolock); + iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, iolock); + } + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + count = iov_iter_count(from); + pos = iocb->ki_pos; + end = pos + count - 1; + + if (mapping->nrpages) { + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + pos, end); + if (ret) + goto out; + /* + * Invalidate whole pages. This can return an error if + * we fail to invalidate a page, but this should never + * happen on XFS. Warn if it does fail. + */ + ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + pos >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; + } + + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + inode_dio_wait(inode); + else if (iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + + data = *from; + ret = mapping->a_ops->direct_IO(iocb, &data, pos); + + /* see generic_file_direct_write() for why this is necessary */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); + } + + if (ret > 0) { + pos += ret; + iov_iter_advance(from, ret); + iocb->ki_pos = pos; + } +out: + xfs_rw_iunlock(ip, iolock); + + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + int iolock = XFS_IOLOCK_EXCL; + + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + +write_retry: + trace_xfs_file_buffered_write(ip, iov_iter_count(from), + iocb->ki_pos, 0); + ret = generic_perform_write(file, from, iocb->ki_pos); + if (likely(ret >= 0)) + iocb->ki_pos += ret; + + /* + * If we hit a space limit, try to free up some lingering preallocated + * space before returning an error. In the case of ENOSPC, first try to + * write back all dirty inodes to free up some of the excess reserved + * metadata space. This reduces the chances that the eofblocks scan + * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this + * also behaves as a filter to prevent too many eofblocks scans from + * running at the same time. + */ + if (ret == -EDQUOT && !enospc) { + enospc = xfs_inode_free_quota_eofblocks(ip); + if (enospc) + goto write_retry; + } else if (ret == -ENOSPC && !enospc) { + struct xfs_eofblocks eofb = {0}; + + enospc = 1; + xfs_flush_inodes(ip->i_mount); + eofb.eof_scan_owner = ip->i_ino; /* for locking */ + eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + goto write_retry; + } + + current->backing_dev_info = NULL; +out: + xfs_rw_iunlock(ip, iolock); + return ret; +} + +STATIC ssize_t +xfs_file_write_iter( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + size_t ocount = iov_iter_count(from); + + XFS_STATS_INC(xs_write_calls); + + if (ocount == 0) + return 0; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (unlikely(iocb->ki_flags & IOCB_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, from); + else + ret = xfs_file_buffered_aio_write(iocb, from); + + if (ret > 0) { + ssize_t err; + + XFS_STATS_ADD(xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + return ret; +} + +#define XFS_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE) + +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + long error; + enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; + loff_t new_size = 0; + bool do_file_insert = 0; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & ~XFS_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock, false); + if (error) + goto out_unlock; + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out_unlock; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + if (offset & blksize_mask || len & blksize_mask) { + error = -EINVAL; + goto out_unlock; + } + + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + + new_size = i_size_read(inode) - len; + + error = xfs_collapse_file_space(ip, offset, len); + if (error) + goto out_unlock; + } else if (mode & FALLOC_FL_INSERT_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + new_size = i_size_read(inode) + len; + if (offset & blksize_mask || len & blksize_mask) { + error = -EINVAL; + goto out_unlock; + } + + /* check the new inode size does not wrap through zero */ + if (new_size > inode->i_sb->s_maxbytes) { + error = -EFBIG; + goto out_unlock; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + do_file_insert = 1; + } else { + flags |= XFS_PREALLOC_SET; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + if (mode & FALLOC_FL_ZERO_RANGE) + error = xfs_zero_file_space(ip, offset, len); + else + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); + if (error) + goto out_unlock; + } + + if (file->f_flags & O_DSYNC) + flags |= XFS_PREALLOC_SYNC; + + error = xfs_update_prealloc_flags(ip, flags); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = xfs_setattr_size(ip, &iattr); + if (error) + goto out_unlock; + } + + /* + * Perform hole insertion now that the file size has been + * updated so that if we crash during the operation we don't + * leave shifted extents past EOF and hence losing access to + * the data that is contained within them. + */ + if (do_file_insert) + error = xfs_insert_file_space(ip, offset, len); + +out_unlock: + xfs_iunlock(ip, iolock); + return error; +} + + +STATIC int +xfs_file_open( + struct inode *inode, + struct file *file) +{ + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EFBIG; + if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) + return -EIO; + return 0; +} + +STATIC int +xfs_dir_open( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + int mode; + int error; + + error = xfs_file_open(inode, file); + if (error) + return error; + + /* + * If there are any blocks, read-ahead block 0 as we're almost + * certain to have the next operation be a read there. + */ + mode = xfs_ilock_data_map_shared(ip); + if (ip->i_d.di_nextents > 0) + xfs_dir3_data_readahead(ip, 0, -1); + xfs_iunlock(ip, mode); + return 0; +} + +STATIC int +xfs_file_release( + struct inode *inode, + struct file *filp) +{ + return xfs_release(XFS_I(inode)); +} + +STATIC int +xfs_file_readdir( + struct file *file, + struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + xfs_inode_t *ip = XFS_I(inode); + size_t bufsize; + + /* + * The Linux API doesn't pass down the total size of the buffer + * we read into down to the filesystem. With the filldir concept + * it's not needed for correct information, but the XFS dir2 leaf + * code wants an estimate of the buffer size to calculate it's + * readahead window and size the buffers used for mapping to + * physical blocks. + * + * Try to give it an estimate that's good enough, maybe at some + * point we can change the ->readdir prototype to include the + * buffer size. For now we use the current glibc buffer size. + */ + bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); + + return xfs_readdir(ip, ctx, bufsize); +} + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + vma->vm_ops = &xfs_file_vm_ops; + + file_accessed(filp); + return 0; +} + +/* + * This type is designed to indicate the type of offset we would like + * to search from page cache for xfs_seek_hole_data(). + */ +enum { + HOLE_OFF = 0, + DATA_OFF, +}; + +/* + * Lookup the desired type of offset from the given page. + * + * On success, return true and the offset argument will point to the + * start of the region that was found. Otherwise this function will + * return false and keep the offset argument unchanged. + */ +STATIC bool +xfs_lookup_buffer_offset( + struct page *page, + loff_t *offset, + unsigned int type) +{ + loff_t lastoff = page_offset(page); + bool found = false; + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + /* + * Unwritten extents that have data in the page + * cache covering them can be identified by the + * BH_Unwritten state flag. Pages with multiple + * buffers might have a mix of holes, data and + * unwritten extents - any buffer with valid + * data in it should have BH_Uptodate flag set + * on it. + */ + if (buffer_unwritten(bh) || + buffer_uptodate(bh)) { + if (type == DATA_OFF) + found = true; + } else { + if (type == HOLE_OFF) + found = true; + } + + if (found) { + *offset = lastoff; + break; + } + lastoff += bh->b_size; + } while ((bh = bh->b_this_page) != head); + + return found; +} + +/* + * This routine is called to find out and return a data or hole offset + * from the page cache for unwritten extents according to the desired + * type for xfs_seek_hole_data(). + * + * The argument offset is used to tell where we start to search from the + * page cache. Map is used to figure out the end points of the range to + * lookup pages. + * + * Return true if the desired type of offset was found, and the argument + * offset is filled with that address. Otherwise, return false and keep + * offset unchanged. + */ +STATIC bool +xfs_find_get_desired_pgoff( + struct inode *inode, + struct xfs_bmbt_irec *map, + unsigned int type, + loff_t *offset) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct pagevec pvec; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff = *offset; + loff_t lastoff = startoff; + bool found = false; + + pagevec_init(&pvec, 0); + + index = startoff >> PAGE_CACHE_SHIFT; + endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); + end = endoff >> PAGE_CACHE_SHIFT; + do { + int want; + unsigned nr_pages; + unsigned int i; + + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); + /* + * No page mapped into given range. If we are searching holes + * and if this is the first time we got into the loop, it means + * that the given offset is landed in a hole, return it. + * + * If we have already stepped through some block buffers to find + * holes but they all contains data. In this case, the last + * offset is already updated and pointed to the end of the last + * mapped page, if it does not reach the endpoint to search, + * that means there should be a hole between them. + */ + if (nr_pages == 0) { + /* Data search found nothing */ + if (type == DATA_OFF) + break; + + ASSERT(type == HOLE_OFF); + if (lastoff == startoff || lastoff < endoff) { + found = true; + *offset = lastoff; + } + break; + } + + /* + * At lease we found one page. If this is the first time we + * step into the loop, and if the first page index offset is + * greater than the given search offset, a hole was found. + */ + if (type == HOLE_OFF && lastoff == startoff && + lastoff < page_offset(pvec.pages[0])) { + found = true; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + loff_t b_offset; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to tmpfs + * file mapping. However, page->index will not change + * because we have a reference on the page. + * + * Searching done if the page index is out of range. + * If the current offset is not reaches the end of + * the specified search range, there should be a hole + * between them. + */ + if (page->index > end) { + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } + goto out; + } + + lock_page(page); + /* + * Page truncated or invalidated(page->mapping == NULL). + * We can freely skip it and proceed to check the next + * page. + */ + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + found = xfs_lookup_buffer_offset(page, &b_offset, type); + if (found) { + /* + * The found offset may be less than the start + * point to search if this is the first time to + * come here. + */ + *offset = max_t(loff_t, startoff, b_offset); + unlock_page(page); + goto out; + } + + /* + * We either searching data but nothing was found, or + * searching hole but found a data buffer. In either + * case, probably the next page contains the desired + * things, update the last offset to it so. + */ + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The number of returned pages less than our desired, search + * done. In this case, nothing was found for searching data, + * but we found a hole behind the last offset. + */ + if (nr_pages < want) { + if (type == HOLE_OFF) { + *offset = lastoff; + found = true; + } + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +STATIC loff_t +xfs_seek_hole_data( + struct file *file, + loff_t start, + int whence) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + loff_t uninitialized_var(offset); + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + xfs_filblks_t end; + uint lock; + int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lock = xfs_ilock_data_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = -ENXIO; + goto out_unlock; + } + + /* + * Try to read extents from the first block indicated + * by fsbno to the end block of the file. + */ + fsbno = XFS_B_TO_FSBT(mp, start); + end = XFS_B_TO_FSB(mp, isize); + + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = -ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in the hole we wanted? */ + if (whence == SEEK_HOLE && + map[i].br_startblock == HOLESTARTBLOCK) + goto out; + + /* Landed in the data extent we wanted? */ + if (whence == SEEK_DATA && + (map[i].br_startblock == DELAYSTARTBLOCK || + (map[i].br_state == XFS_EXT_NORM && + !isnullstartblock(map[i].br_startblock)))) + goto out; + + /* + * Landed in an unwritten extent, try to search + * for hole or data from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF, + &offset)) + goto out; + } + } + + /* + * We only received one extent out of the two requested. This + * means we've hit EOF and didn't find what we are looking for. + */ + if (nmap == 1) { + /* + * If we were looking for a hole, set offset to + * the end of the file (i.e., there is an implicit + * hole at the end of any file). + */ + if (whence == SEEK_HOLE) { + offset = isize; + break; + } + /* + * If we were looking for data, it's nowhere to be found + */ + ASSERT(whence == SEEK_DATA); + error = -ENXIO; + goto out_unlock; + } + + ASSERT(i > 1); + + /* + * Nothing was found, proceed to the next round of search + * if the next reading offset is not at or beyond EOF. + */ + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + if (whence == SEEK_HOLE) { + offset = isize; + break; + } + ASSERT(whence == SEEK_DATA); + error = -ENXIO; + goto out_unlock; + } + } + +out: + /* + * If at this point we have found the hole we wanted, the returned + * offset may be bigger than the file size as it may be aligned to + * page boundary for unwritten extents. We need to deal with this + * situation in particular. + */ + if (whence == SEEK_HOLE) + offset = min_t(loff_t, offset, isize); + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + +out_unlock: + xfs_iunlock(ip, lock); + + if (error) + return error; + return offset; +} + +STATIC loff_t +xfs_file_llseek( + struct file *file, + loff_t offset, + int whence) +{ + switch (whence) { + case SEEK_END: + case SEEK_CUR: + case SEEK_SET: + return generic_file_llseek(file, offset, whence); + case SEEK_HOLE: + case SEEK_DATA: + return xfs_seek_hole_data(file, offset, whence); + default: + return -EINVAL; + } +} + +/* + * Locking for serialisation of IO during page faults. This results in a lock + * ordering of: + * + * mmap_sem (MM) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ +STATIC int +xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_fault(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = filemap_fault(vma, vmf); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. + */ +STATIC int +xfs_filemap_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_page_mkwrite(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + +const struct file_operations xfs_file_operations = { + .llseek = xfs_file_llseek, + .read_iter = xfs_file_read_iter, + .write_iter = xfs_file_write_iter, + .splice_read = xfs_file_splice_read, + .splice_write = iter_file_splice_write, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .mmap = xfs_file_mmap, + .open = xfs_file_open, + .release = xfs_file_release, + .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, +}; + +const struct file_operations xfs_dir_file_operations = { + .open = xfs_dir_open, + .read = generic_read_dir, + .iterate = xfs_file_readdir, + .llseek = generic_file_llseek, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .fsync = xfs_dir_fsync, +}; + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; diff --git a/kernel/fs/xfs/xfs_filestream.c b/kernel/fs/xfs/xfs_filestream.c new file mode 100644 index 000000000..da82f1cb4 --- /dev/null +++ b/kernel/fs/xfs/xfs_filestream.c @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * Copyright (c) 2014 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_alloc.h" +#include "xfs_mru_cache.h" +#include "xfs_filestream.h" +#include "xfs_trace.h" + +struct xfs_fstrm_item { + struct xfs_mru_cache_elem mru; + struct xfs_inode *ip; + xfs_agnumber_t ag; /* AG in use for this directory */ +}; + +enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +}; + +/* + * Allocation group filestream associations are tracked with per-ag atomic + * counters. These counters allow xfs_filestream_pick_ag() to tell whether a + * particular AG already has active filestreams associated with it. The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. + * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. + * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. + */ +int +xfs_filestream_peek_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static int +xfs_filestream_get_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_inc_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static void +xfs_filestream_put_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + atomic_dec(&pag->pagf_fstrms); + xfs_perag_put(pag); +} + +static void +xfs_fstrm_free_func( + struct xfs_mru_cache_elem *mru) +{ + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + + xfs_filestream_put_ag(item->ip->i_mount, item->ag); + + trace_xfs_filestream_free(item->ip, item->ag); + + kmem_free(item); +} + +/* + * Scan the AGs starting at startag looking for an AG that isn't in use and has + * at least minlen blocks free. + */ +static int +xfs_filestream_pick_ag( + struct xfs_inode *ip, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_fstrm_item *item; + struct xfs_perag *pag; + xfs_extlen_t longest, free = 0, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + int err, trylock, nscan; + + ASSERT(S_ISDIR(ip->i_d.di_mode)); + + /* 2% of an AG's blocks must be free for it to be chosen. */ + minfree = mp->m_sb.sb_agblocks / 50; + + ag = startag; + *agp = NULLAGNUMBER; + + /* For the first pass, don't sleep trying to init the per-AG. */ + trylock = XFS_ALLOC_FLAG_TRYLOCK; + + for (nscan = 0; 1; nscan++) { + trace_xfs_filestream_scan(ip, ag); + + pag = xfs_perag_get(mp, ag); + + if (!pag->pagf_init) { + err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + if (err && !trylock) { + xfs_perag_put(pag); + return err; + } + } + + /* Might fail sometimes during the 1st pass with trylock set. */ + if (!pag->pagf_init) + goto next_ag; + + /* Keep track of the AG with the most free blocks. */ + if (pag->pagf_freeblks > maxfree) { + maxfree = pag->pagf_freeblks; + max_ag = ag; + } + + /* + * The AG reference count does two things: it enforces mutual + * exclusion when examining the suitability of an AG in this + * loop, and it guards against two filestreams being established + * in the same AG as each other. + */ + if (xfs_filestream_get_ag(mp, ag) > 1) { + xfs_filestream_put_ag(mp, ag); + goto next_ag; + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (((minlen && longest >= minlen) || + (!minlen && pag->pagf_freeblks >= minfree)) && + (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + + /* Break out, retaining the reference on the AG. */ + free = pag->pagf_freeblks; + xfs_perag_put(pag); + *agp = ag; + break; + } + + /* Drop the reference on this AG, it's not usable. */ + xfs_filestream_put_ag(mp, ag); +next_ag: + xfs_perag_put(pag); + /* Move to the next AG, wrapping to AG 0 if necessary. */ + if (++ag >= mp->m_sb.sb_agcount) + ag = 0; + + /* If a full pass of the AGs hasn't been done yet, continue. */ + if (ag != startag) + continue; + + /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + if (trylock != 0) { + trylock = 0; + continue; + } + + /* Finally, if lowspace wasn't set, set it for the 3rd pass. */ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + continue; + } + + /* + * Take the AG with the most free space, regardless of whether + * it's already in use by another filestream. + */ + if (max_ag != NULLAGNUMBER) { + xfs_filestream_get_ag(mp, max_ag); + free = maxfree; + *agp = max_ag; + break; + } + + /* take AG 0 if none matched */ + trace_xfs_filestream_pick(ip, *agp, free, nscan); + *agp = 0; + return 0; + } + + trace_xfs_filestream_pick(ip, *agp, free, nscan); + + if (*agp == NULLAGNUMBER) + return 0; + + err = -ENOMEM; + item = kmem_alloc(sizeof(*item), KM_MAYFAIL); + if (!item) + goto out_put_ag; + + item->ag = *agp; + item->ip = ip; + + err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); + if (err) { + if (err == -EEXIST) + err = 0; + goto out_free_item; + } + + return 0; + +out_free_item: + kmem_free(item); +out_put_ag: + xfs_filestream_put_ag(mp, *agp); + return err; +} + +static struct xfs_inode * +xfs_filestream_get_parent( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip), *dir = NULL; + struct dentry *dentry, *parent; + + dentry = d_find_alias(inode); + if (!dentry) + goto out; + + parent = dget_parent(dentry); + if (!parent) + goto out_dput; + + dir = igrab(d_inode(parent)); + dput(parent); + +out_dput: + dput(dentry); +out: + return dir ? XFS_I(dir) : NULL; +} + +/* + * Find the right allocation group for a file, either by finding an + * existing file stream or creating a new one. + * + * Returns NULLAGNUMBER in case of an error. + */ +xfs_agnumber_t +xfs_filestream_lookup_ag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_inode *pip = NULL; + xfs_agnumber_t startag, ag = NULLAGNUMBER; + struct xfs_mru_cache_elem *mru; + + ASSERT(S_ISREG(ip->i_d.di_mode)); + + pip = xfs_filestream_get_parent(ip); + if (!pip) + return NULLAGNUMBER; + + mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); + if (mru) { + ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; + xfs_mru_cache_done(mp->m_filestream); + + trace_xfs_filestream_lookup(ip, ag); + goto out; + } + + /* + * Set the starting AG using the rotor for inode32, otherwise + * use the directory inode's AG. + */ + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + xfs_agnumber_t rotorstep = xfs_rotorstep; + startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } else + startag = XFS_INO_TO_AGNO(mp, pip->i_ino); + + if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0)) + ag = NULLAGNUMBER; +out: + IRELE(pip); + return ag; +} + +/* + * Pick a new allocation group for the current file and its file stream. + * + * This is called when the allocator can't find a suitable extent in the + * current AG, and we have to move the stream into a new AG with more space. + */ +int +xfs_filestream_new_ag( + struct xfs_bmalloca *ap, + xfs_agnumber_t *agp) +{ + struct xfs_inode *ip = ap->ip, *pip; + struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t minlen = ap->length; + xfs_agnumber_t startag = 0; + int flags, err = 0; + struct xfs_mru_cache_elem *mru; + + *agp = NULLAGNUMBER; + + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto exit; + + mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino); + if (mru) { + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + startag = (item->ag + 1) % mp->m_sb.sb_agcount; + } + + flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | + (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); + + err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); + + /* + * Only free the item here so we skip over the old AG earlier. + */ + if (mru) + xfs_fstrm_free_func(mru); + + IRELE(pip); +exit: + if (*agp == NULLAGNUMBER) + *agp = 0; + return err; +} + +void +xfs_filestream_deassociate( + struct xfs_inode *ip) +{ + xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino); +} + +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, + 10, xfs_fstrm_free_func); +} + +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); +} diff --git a/kernel/fs/xfs/xfs_filestream.h b/kernel/fs/xfs/xfs_filestream.h new file mode 100644 index 000000000..2ef43406e --- /dev/null +++ b/kernel/fs/xfs/xfs_filestream.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FILESTREAM_H__ +#define __XFS_FILESTREAM_H__ + +struct xfs_mount; +struct xfs_inode; +struct xfs_bmalloca; + +int xfs_filestream_mount(struct xfs_mount *mp); +void xfs_filestream_unmount(struct xfs_mount *mp); +void xfs_filestream_deassociate(struct xfs_inode *ip); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); +int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); +int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno); + +static inline int +xfs_inode_is_filestream( + struct xfs_inode *ip) +{ + return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || + (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); +} + +#endif /* __XFS_FILESTREAM_H__ */ diff --git a/kernel/fs/xfs/xfs_fsops.c b/kernel/fs/xfs/xfs_fsops.c new file mode 100644 index 000000000..cb7e8a29d --- /dev/null +++ b/kernel/fs/xfs/xfs_fsops.c @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_fsops.h" +#include "xfs_itable.h" +#include "xfs_trans_space.h" +#include "xfs_rtalloc.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_filestream.h" + +/* + * File system operations + */ + +int +xfs_fs_geometry( + xfs_mount_t *mp, + xfs_fsop_geom_t *geo, + int new_version) +{ + + memset(geo, 0, sizeof(*geo)); + + geo->blocksize = mp->m_sb.sb_blocksize; + geo->rtextsize = mp->m_sb.sb_rextsize; + geo->agblocks = mp->m_sb.sb_agblocks; + geo->agcount = mp->m_sb.sb_agcount; + geo->logblocks = mp->m_sb.sb_logblocks; + geo->sectsize = mp->m_sb.sb_sectsize; + geo->inodesize = mp->m_sb.sb_inodesize; + geo->imaxpct = mp->m_sb.sb_imax_pct; + geo->datablocks = mp->m_sb.sb_dblocks; + geo->rtblocks = mp->m_sb.sb_rblocks; + geo->rtextents = mp->m_sb.sb_rextents; + geo->logstart = mp->m_sb.sb_logstart; + ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid)); + memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid)); + if (new_version >= 2) { + geo->sunit = mp->m_sb.sb_unit; + geo->swidth = mp->m_sb.sb_width; + } + if (new_version >= 3) { + geo->version = XFS_FSOP_GEOM_VERSION; + geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | + XFS_FSOP_GEOM_FLAGS_DIRV2 | + (xfs_sb_version_hasattr(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_ATTR : 0) | + (xfs_sb_version_hasquota(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | + (xfs_sb_version_hasalign(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | + (xfs_sb_version_hasdalign(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | + (xfs_sb_version_hasextflgbit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | + (xfs_sb_version_hassector(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | + (xfs_sb_version_hasasciici(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | + (xfs_sb_version_haslazysbcount(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | + (xfs_sb_version_hasattr2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | + (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0) | + (xfs_sb_version_hasftype(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | + (xfs_sb_version_hasfinobt(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FINOBT : 0); + geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? + mp->m_sb.sb_logsectsize : BBSIZE; + geo->rtsectsize = mp->m_sb.sb_blocksize; + geo->dirblocksize = mp->m_dir_geo->blksize; + } + if (new_version >= 4) { + geo->flags |= + (xfs_sb_version_haslogv2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_LOGV2 : 0); + geo->logsunit = mp->m_sb.sb_logsunit; + } + return 0; +} + +static struct xfs_buf * +xfs_growfs_get_hdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; + + return bp; +} + +static int +xfs_growfs_data_private( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_data_t *in) /* growfs data input struct */ +{ + xfs_agf_t *agf; + struct xfs_agfl *agfl; + xfs_agi_t *agi; + xfs_agnumber_t agno; + xfs_extlen_t agsize; + xfs_extlen_t tmpsize; + xfs_alloc_rec_t *arec; + xfs_buf_t *bp; + int bucket; + int dpct; + int error, saved_error = 0; + xfs_agnumber_t nagcount; + xfs_agnumber_t nagimax = 0; + xfs_rfsblock_t nb, nb_mod; + xfs_rfsblock_t new; + xfs_rfsblock_t nfree; + xfs_agnumber_t oagcount; + int pct; + xfs_trans_t *tp; + + nb = in->newblocks; + pct = in->imaxpct; + if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) + return -EINVAL; + if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) + return error; + dpct = pct - mp->m_sb.sb_imax_pct; + error = xfs_buf_read_uncached(mp->m_ddev_targp, + XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) + return error; + xfs_buf_relse(bp); + + new = nb; /* use new as a temporary here */ + nb_mod = do_div(new, mp->m_sb.sb_agblocks); + nagcount = new + (nb_mod != 0); + if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { + nagcount--; + nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; + if (nb < mp->m_sb.sb_dblocks) + return -EINVAL; + } + new = nb - mp->m_sb.sb_dblocks; + oagcount = mp->m_sb.sb_agcount; + + /* allocate the new per-ag structures */ + if (nagcount > oagcount) { + error = xfs_initialize_perag(mp, nagcount, &nagimax); + if (error) + return error; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + /* + * Write new AG headers to disk. Non-transactional, but written + * synchronously so they are completed prior to the growfs transaction + * being logged. + */ + nfree = 0; + for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + + /* + * AG freespace header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agf_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + agf = XFS_BUF_TO_AGF(bp); + agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); + agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); + agf->agf_seqno = cpu_to_be32(agno); + if (agno == nagcount - 1) + agsize = + nb - + (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + else + agsize = mp->m_sb.sb_agblocks; + agf->agf_length = cpu_to_be32(agsize); + agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); + agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + agf->agf_flfirst = 0; + agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flcount = 0; + tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); + agf->agf_freeblks = cpu_to_be32(tmpsize); + agf->agf_longest = cpu_to_be32(tmpsize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* + * AG freelist header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agfl_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + agfl = XFS_BUF_TO_AGFL(bp); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* + * AG inode header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agi_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + agi = XFS_BUF_TO_AGI(bp); + agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); + agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); + agi->agi_seqno = cpu_to_be32(agno); + agi->agi_length = cpu_to_be32(agsize); + agi->agi_count = 0; + agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); + agi->agi_level = cpu_to_be32(1); + agi->agi_freecount = 0; + agi->agi_newino = cpu_to_be32(NULLAGINO); + agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); + agi->agi_free_level = cpu_to_be32(1); + } + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* + * BNO btree root block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); + + if (!bp) { + error = -ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, + agno, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); + arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_blockcount = cpu_to_be32( + agsize - be32_to_cpu(arec->ar_startblock)); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* + * CNT btree root block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, + agno, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); + arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_blockcount = cpu_to_be32( + agsize - be32_to_cpu(arec->ar_startblock)); + nfree += be32_to_cpu(arec->ar_blockcount); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* + * INO btree root block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, + agno, 0); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* + * FINO btree root block + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, + 0, agno, 0); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } + + } + xfs_trans_agblocks_delta(tp, nfree); + /* + * There are new blocks in the old last a.g. + */ + if (new) { + /* + * Change the agi length. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) { + goto error0; + } + ASSERT(bp); + agi = XFS_BUF_TO_AGI(bp); + be32_add_cpu(&agi->agi_length, new); + ASSERT(nagcount == oagcount || + be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); + xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); + /* + * Change agf length. + */ + error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp); + if (error) { + goto error0; + } + ASSERT(bp); + agf = XFS_BUF_TO_AGF(bp); + be32_add_cpu(&agf->agf_length, new); + ASSERT(be32_to_cpu(agf->agf_length) == + be32_to_cpu(agi->agi_length)); + + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + /* + * Free the new space. + */ + error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, + be32_to_cpu(agf->agf_length) - new), new); + if (error) { + goto error0; + } + } + + /* + * Update changed superblock fields transactionally. These are not + * seen by the rest of the world until the transaction commit applies + * them atomically to the superblock. + */ + if (nagcount > oagcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); + if (nb > mp->m_sb.sb_dblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, + nb - mp->m_sb.sb_dblocks); + if (nfree) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); + if (dpct) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + if (error) + return error; + + /* New allocation groups fully initialized, so update mount struct */ + if (nagimax) + mp->m_maxagi = nagimax; + if (mp->m_sb.sb_imax_pct) { + __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + do_div(icount, 100); + mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + } else + mp->m_maxicount = 0; + xfs_set_low_space_thresholds(mp); + + /* update secondary superblocks. */ + for (agno = 1; agno < nagcount; agno++) { + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, + &xfs_sb_buf_ops); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) { + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + } else + error = -ENOMEM; + } + + /* + * If we get an error reading or writing alternate superblocks, + * continue. xfs_repair chooses the "best" superblock based + * on most matches; if we break early, we'll leave more + * superblocks un-updated than updated, and xfs_repair may + * pick them over the properly-updated primary. + */ + if (error) { + xfs_warn(mp, + "error %d reading secondary superblock for ag %d", + error, agno); + saved_error = error; + continue; + } + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) { + xfs_warn(mp, + "write error %d updating secondary superblock for ag %d", + error, agno); + saved_error = error; + continue; + } + } + return saved_error ? saved_error : error; + + error0: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; +} + +static int +xfs_growfs_log_private( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_log_t *in) /* growfs log input struct */ +{ + xfs_extlen_t nb; + + nb = in->newblocks; + if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) + return -EINVAL; + if (nb == mp->m_sb.sb_logblocks && + in->isint == (mp->m_sb.sb_logstart != 0)) + return -EINVAL; + /* + * Moving the log is hard, need new interfaces to sync + * the log first, hold off all activity while moving it. + * Can have shorter or longer log in the same space, + * or transform internal to external log or vice versa. + */ + return -ENOSYS; +} + +/* + * protected versions of growfs function acquire and release locks on the mount + * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, + * XFS_IOC_FSGROWFSRT + */ + + +int +xfs_growfs_data( + xfs_mount_t *mp, + xfs_growfs_data_t *in) +{ + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!mutex_trylock(&mp->m_growlock)) + return -EWOULDBLOCK; + error = xfs_growfs_data_private(mp, in); + /* + * Increment the generation unconditionally, the error could be from + * updating the secondary superblocks, in which case the new size + * is live already. + */ + mp->m_generation++; + mutex_unlock(&mp->m_growlock); + return error; +} + +int +xfs_growfs_log( + xfs_mount_t *mp, + xfs_growfs_log_t *in) +{ + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!mutex_trylock(&mp->m_growlock)) + return -EWOULDBLOCK; + error = xfs_growfs_log_private(mp, in); + mutex_unlock(&mp->m_growlock); + return error; +} + +/* + * exported through ioctl XFS_IOC_FSCOUNTS + */ + +int +xfs_fs_counts( + xfs_mount_t *mp, + xfs_fsop_counts_t *cnt) +{ + cnt->allocino = percpu_counter_read_positive(&mp->m_icount); + cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); + cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); + + spin_lock(&mp->m_sb_lock); + cnt->freertx = mp->m_sb.sb_frextents; + spin_unlock(&mp->m_sb_lock); + return 0; +} + +/* + * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS + * + * xfs_reserve_blocks is called to set m_resblks + * in the in-core mount table. The number of unused reserved blocks + * is kept in m_resblks_avail. + * + * Reserve the requested number of blocks if available. Otherwise return + * as many as possible to satisfy the request. The actual number + * reserved are returned in outval + * + * A null inval pointer indicates that only the current reserved blocks + * available should be returned no settings are changed. + */ + +int +xfs_reserve_blocks( + xfs_mount_t *mp, + __uint64_t *inval, + xfs_fsop_resblks_t *outval) +{ + __int64_t lcounter, delta, fdblks_delta; + __uint64_t request; + + /* If inval is null, report current values and return */ + if (inval == (__uint64_t *)NULL) { + if (!outval) + return -EINVAL; + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + return 0; + } + + request = *inval; + + /* + * With per-cpu counters, this becomes an interesting + * problem. we needto work out if we are freeing or allocation + * blocks first, then we can do the modification as necessary. + * + * We do this under the m_sb_lock so that if we are near + * ENOSPC, we will hold out any changes while we work out + * what to do. This means that the amount of free space can + * change while we do this, so we need to retry if we end up + * trying to reserve more space than is available. + */ +retry: + spin_lock(&mp->m_sb_lock); + + /* + * If our previous reservation was larger than the current value, + * then move any unused blocks back to the free pool. + */ + fdblks_delta = 0; + if (mp->m_resblks > request) { + lcounter = mp->m_resblks_avail - request; + if (lcounter > 0) { /* release unused blocks */ + fdblks_delta = lcounter; + mp->m_resblks_avail -= lcounter; + } + mp->m_resblks = request; + } else { + __int64_t free; + + free = percpu_counter_sum(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); + if (!free) + goto out; /* ENOSPC and fdblks_delta = 0 */ + + delta = request - mp->m_resblks; + lcounter = free - delta; + if (lcounter < 0) { + /* We can't satisfy the request, just get what we can */ + mp->m_resblks += free; + mp->m_resblks_avail += free; + fdblks_delta = -free; + } else { + fdblks_delta = -delta; + mp->m_resblks = request; + mp->m_resblks_avail += delta; + } + } +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + spin_unlock(&mp->m_sb_lock); + + if (fdblks_delta) { + /* + * If we are putting blocks back here, m_resblks_avail is + * already at its max so this will put it in the free pool. + * + * If we need space, we'll either succeed in getting it + * from the free block count or we'll get an enospc. If + * we get a ENOSPC, it means things changed while we were + * calculating fdblks_delta and so we should try again to + * see if there is anything left to reserve. + * + * Don't set the reserved flag here - we don't want to reserve + * the extra reserve blocks from the reserve..... + */ + int error; + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + if (error == -ENOSPC) + goto retry; + } + return 0; +} + +int +xfs_fs_goingdown( + xfs_mount_t *mp, + __uint32_t inflags) +{ + switch (inflags) { + case XFS_FSOP_GOING_FLAGS_DEFAULT: { + struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); + + if (sb && !IS_ERR(sb)) { + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + thaw_bdev(sb->s_bdev, sb); + } + + break; + } + case XFS_FSOP_GOING_FLAGS_LOGFLUSH: + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + break; + case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH: + xfs_force_shutdown(mp, + SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Force a shutdown of the filesystem instantly while keeping the filesystem + * consistent. We don't do an unmount here; just shutdown the shop, make sure + * that absolutely nothing persistent happens to this filesystem after this + * point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} diff --git a/kernel/fs/xfs/xfs_fsops.h b/kernel/fs/xfs/xfs_fsops.h new file mode 100644 index 000000000..1b6a98b66 --- /dev/null +++ b/kernel/fs/xfs/xfs_fsops.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FSOPS_H__ +#define __XFS_FSOPS_H__ + +extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion); +extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); +extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); +extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); +extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, + xfs_fsop_resblks_t *outval); +extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); + +#endif /* __XFS_FSOPS_H__ */ diff --git a/kernel/fs/xfs/xfs_globals.c b/kernel/fs/xfs/xfs_globals.c new file mode 100644 index 000000000..4d41b2412 --- /dev/null +++ b/kernel/fs/xfs/xfs_globals.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sysctl.h" + +/* + * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, + * other XFS code uses these values. Times are measured in centisecs (i.e. + * 100ths of a second) with the exception of eofb_timer, which is measured in + * seconds. + */ +xfs_param_t xfs_params = { + /* MIN DFLT MAX */ + .sgid_inherit = { 0, 0, 1 }, + .symlink_mode = { 0, 0, 1 }, + .panic_mask = { 0, 0, 255 }, + .error_level = { 0, 3, 11 }, + .syncd_timer = { 1*100, 30*100, 7200*100}, + .stats_clear = { 0, 0, 1 }, + .inherit_sync = { 0, 1, 1 }, + .inherit_nodump = { 0, 1, 1 }, + .inherit_noatim = { 0, 1, 1 }, + .xfs_buf_timer = { 100/2, 1*100, 30*100 }, + .xfs_buf_age = { 1*100, 15*100, 7200*100}, + .inherit_nosym = { 0, 0, 1 }, + .rotorstep = { 1, 1, 255 }, + .inherit_nodfrg = { 0, 1, 1 }, + .fstrm_timer = { 1, 30*100, 3600*100}, + .eofb_timer = { 1, 300, 3600*24}, +}; + +struct xfs_globals xfs_globals = { + .log_recovery_delay = 0, /* no delay by default */ +}; diff --git a/kernel/fs/xfs/xfs_icache.c b/kernel/fs/xfs/xfs_icache.c new file mode 100644 index 000000000..76a9f2783 --- /dev/null +++ b/kernel/fs/xfs/xfs_icache.c @@ -0,0 +1,1418 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" + +#include +#include + +STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip); + +/* + * Allocate and initialise an xfs_inode. + */ +struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. + */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + XFS_STATS_INC(vn_active); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + kmem_zone_free(xfs_inode_zone, ip); +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(vn_active); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = -EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = -EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = -ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = -EAGAIN; + goto out_error; + } + + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return -ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = -ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. Since we can be called from transaction context, don't + * recurse into the file system. + */ + if (radix_tree_preload(GFP_NOFS)) { + error = -EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = NULL; + ip->i_gdquot = NULL; + ip->i_pdquot = NULL; + xfs_iflags_set(ip, iflags); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = -EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. + */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * aqcuires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return -EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can setup the inode + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_existing_inode(ip); + return 0; + +out_error_or_again: + if (error == -EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} + +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EFSCORRUPTED; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return -ENOENT; + + /* inode is valid */ + return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return -ENOENT; +} + +STATIC int +xfs_inode_ag_walk( + struct xfs_mount *mp, + struct xfs_perag *pag, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args, + int tag) +{ + uint32_t first_index; + int last_error = 0; + int skipped; + int done; + int nr_found; + +restart: + done = 0; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + + if (tag == -1) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], flags, args); + IRELE(batch[i]); + if (error == -EAGAIN) { + skipped++; + continue; + } + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. */ + if (error == -EFSCORRUPTED) + break; + + cond_resched(); + + } while (nr_found && !done); + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'speculative_prealloc_lifetime' tunable (5m by default). + */ +STATIC void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + xfs_icache_free_eofblocks(mp, NULL); + xfs_queue_eofblocks(mp); +} + +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; +} + +int +xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; +} + +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +static void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(ip->i_mount); + + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + pag->pag_ici_reclaimable++; +} + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +__xfs_inode_clear_reclaim( + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } +} + +STATIC void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + +/* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it, non-zero otherwise. + */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; + + /* + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. + */ + if ((flags & SYNC_TRYLOCK) && + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) + return 1; + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check the + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. + */ + spin_lock(&ip->i_flags_lock); + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + +/* + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, async - requeue + * dirty, sync 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. + * + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background relaim, we only + * bother to reclaim clean inodes anyway. + * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, async => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, async => requeue + * dirty, sync => flush, wait and reclaim + */ +STATIC int +xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int sync_mode) +{ + struct xfs_buf *bp = NULL; + int error; + +restart: + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + xfs_iflock(ip); + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* + * Now we have an inode that needs flushing. + * + * Note that xfs_iflush will never block on the inode buffer lock, as + * xfs_ifree_cluster() can lock the inode buffer before it locks the + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_imap_to_bp() to get the cluster buffer would + * result in an ABBA deadlock with xfs_ifree_cluster(). + * + * As xfs_ifree_cluser() must gather all inodes that are active in the + * cache to mark them stale, if we hit this case we don't actually want + * to do IO here - we want the inode marked stale so we can simply + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode. + */ + error = xfs_iflush(ip, &bp); + if (error == -EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; + } + + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } + + xfs_iflock(ip); +reclaim: + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. + */ + spin_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); + spin_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + xfs_inode_free(ip); + return error; + +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return -EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. + */ + return 0; +} + +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes. If we don't, + * then a shut down during filesystem unmount reclaim walk leak all the + * unreclaimed inodes. + */ +STATIC int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; + +restart: + ag = 0; + skipped = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + int nr_found = 0; + + ag = pag->pag_agno + 1; + + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + xfs_perag_put(pag); + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + done = 1; + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that lead us + * to see this inode, so another lookup from + * the same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + cond_resched(); + + } while (nr_found && !done && *nr_to_scan > 0); + + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); + xfs_perag_put(pag); + } + + /* + * if we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e + * waiting on the reclaim locks and ignoring the reclaim cursors). This + * ensure that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } + return last_error; +} + +int +xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) +{ + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); +} + +/* + * Scan a certain number of inodes for reclaim. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes. + */ +long +xfs_reclaim_inodes_nr( + struct xfs_mount *mp, + int nr_to_scan) +{ + /* kick background reclaimer and push the AIL */ + xfs_reclaim_work_queue(mp); + xfs_ail_push_all(mp->m_ail); + + return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); +} + +/* + * Return the number of reclaimable inodes in the filesystem for + * the shrinker to determine how much to reclaim. + */ +int +xfs_reclaim_inodes_count( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t ag = 0; + int reclaimable = 0; + + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + +STATIC int +xfs_inode_match_id( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + return 0; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + return 0; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + xfs_get_projid(ip) != eofb->eof_prid) + return 0; + + return 1; +} + +/* + * A union-based inode filtering algorithm. Process the inode if any of the + * criteria match. This is for global/internal scans only. + */ +STATIC int +xfs_inode_match_id_union( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + return 1; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + return 1; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + xfs_get_projid(ip) == eofb->eof_prid) + return 1; + + return 0; +} + +STATIC int +xfs_inode_free_eofblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + int match; + + ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); + + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty the operation can block and wait for some + * time. Unless we are waiting, skip it. + */ + if (!(flags & SYNC_WAIT) && + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + if (eofb) { + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + + /* + * A scan owner implies we already hold the iolock. Skip it in + * xfs_free_eofblocks() to avoid deadlock. This also eliminates + * the possibility of EAGAIN being returned. + */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; + } + + ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); + + /* don't revisit the inode if we're not waiting */ + if (ret == -EAGAIN && !(flags & SYNC_WAIT)) + ret = 0; + + return ret; +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + int flags = SYNC_TRYLOCK; + + if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) + flags = SYNC_WAIT; + + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, + eofb, XFS_ICI_EOFBLOCKS_TAG); +} + +/* + * Run eofblocks scans on the quotas applicable to the inode. For inodes with + * multiple quotas, we don't know exactly which quota caused an allocation + * failure. We make a best effort by including each quota under low free space + * conditions (less than 1% free space) in the scan. + */ +int +xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip) +{ + int scan = 0; + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + /* + * Set the scan owner to avoid a potential livelock. Otherwise, the scan + * can repeatedly trylock on the inode we're currently processing. We + * run a sync scan to increase effectiveness and use the union filter to + * cover all applicable quotas in a single scan. + */ + eofb.eof_scan_owner = ip->i_ino; + eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; + + if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQ_USER); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_uid = VFS_I(ip)->i_uid; + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + scan = 1; + } + } + + if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_gid = VFS_I(ip)->i_gid; + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + scan = 1; + } + } + + if (scan) + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + + return scan; +} + +void +xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_set_eofblocks_tag(ip); + + tagged = radix_tree_tagged(&pag->pag_ici_root, + XFS_ICI_EOFBLOCKS_TAG); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!tagged) { + /* propagate the eofblocks tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* kick off background trimming */ + xfs_queue_eofblocks(ip->i_mount); + + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_clear_eofblocks_tag(ip); + + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + /* clear the eofblocks tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + diff --git a/kernel/fs/xfs/xfs_icache.h b/kernel/fs/xfs/xfs_icache.h new file mode 100644 index 000000000..62f1f91c3 --- /dev/null +++ b/kernel/fs/xfs/xfs_icache.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_SYNC_H +#define XFS_SYNC_H 1 + +struct xfs_mount; +struct xfs_perag; + +struct xfs_eofblocks { + __u32 eof_flags; + kuid_t eof_uid; + kgid_t eof_gid; + prid_t eof_prid; + __u64 eof_min_file_size; + xfs_ino_t eof_scan_owner; +}; + +#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ +#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ + +/* + * tags for inode radix tree + */ +#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup + in xfs_inode_ag_iterator */ +#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ + +/* + * Flags for xfs_iget() + */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_UNTRUSTED 0x2 +#define XFS_IGET_DONTCACHE 0x4 + +int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); + +/* recovery needs direct inode allocation capability */ +struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino); +void xfs_inode_free(struct xfs_inode *ip); + +void xfs_reclaim_worker(struct work_struct *work); + +int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +int xfs_reclaim_inodes_count(struct xfs_mount *mp); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); + +void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); + +void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); +void xfs_eofblocks_worker(struct work_struct *); + +int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args); +int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int tag); + +static inline int +xfs_fs_eofblocks_from_user( + struct xfs_fs_eofblocks *src, + struct xfs_eofblocks *dst) +{ + if (src->eof_version != XFS_EOFBLOCKS_VERSION) + return -EINVAL; + + if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) + return -EINVAL; + + if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || + memchr_inv(src->pad64, 0, sizeof(src->pad64))) + return -EINVAL; + + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; + dst->eof_scan_owner = NULLFSINO; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { + dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->eof_uid)) + return -EINVAL; + } + + dst->eof_gid = INVALID_GID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) { + dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->eof_gid)) + return -EINVAL; + } + return 0; +} + +#endif diff --git a/kernel/fs/xfs/xfs_icreate_item.c b/kernel/fs/xfs/xfs_icreate_item.c new file mode 100644 index 000000000..d45ca72af --- /dev/null +++ b/kernel/fs/xfs/xfs_icreate_item.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2008-2010, 2013 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_icreate_item.h" +#include "xfs_log.h" + +kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_icreate_item, ic_item); +} + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We only need one iovec for the icreate log structure. + */ +STATIC void +xfs_icreate_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_icreate_log); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode create log item. + */ +STATIC void +xfs_icreate_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, + &icp->ic_format, + sizeof(struct xfs_icreate_log)); +} + + +/* Pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_pin( + struct xfs_log_item *lip) +{ +} + + +/* pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +STATIC void +xfs_icreate_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + if (icp->ic_item.li_flags & XFS_LI_ABORTED) + kmem_zone_free(xfs_icreate_zone, icp); + return; +} + +/* + * Because we have ordered buffers being tracked in the AIL for the inode + * creation, we don't need the create item after this. Hence we can free + * the log item and return -1 to tell the caller we're done with the item. + */ +STATIC xfs_lsn_t +xfs_icreate_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + kmem_zone_free(xfs_icreate_zone, icp); + return (xfs_lsn_t)-1; +} + +/* item can never get into the AIL */ +STATIC uint +xfs_icreate_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + ASSERT(0); + return XFS_ITEM_SUCCESS; +} + +/* Ordered buffers do the dependency tracking here, so this does nothing. */ +STATIC void +xfs_icreate_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_icreate_item_ops = { + .iop_size = xfs_icreate_item_size, + .iop_format = xfs_icreate_item_format, + .iop_pin = xfs_icreate_item_pin, + .iop_unpin = xfs_icreate_item_unpin, + .iop_push = xfs_icreate_item_push, + .iop_unlock = xfs_icreate_item_unlock, + .iop_committed = xfs_icreate_item_committed, + .iop_committing = xfs_icreate_item_committing, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the icreate item. + */ +void +xfs_icreate_log( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + unsigned int count, + unsigned int inode_size, + xfs_agblock_t length, + unsigned int generation) +{ + struct xfs_icreate_item *icp; + + icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + + xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, + &xfs_icreate_item_ops); + + icp->ic_format.icl_type = XFS_LI_ICREATE; + icp->ic_format.icl_size = 1; /* single vector */ + icp->ic_format.icl_ag = cpu_to_be32(agno); + icp->ic_format.icl_agbno = cpu_to_be32(agbno); + icp->ic_format.icl_count = cpu_to_be32(count); + icp->ic_format.icl_isize = cpu_to_be32(inode_size); + icp->ic_format.icl_length = cpu_to_be32(length); + icp->ic_format.icl_gen = cpu_to_be32(generation); + + xfs_trans_add_item(tp, &icp->ic_item); + tp->t_flags |= XFS_TRANS_DIRTY; + icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} diff --git a/kernel/fs/xfs/xfs_icreate_item.h b/kernel/fs/xfs/xfs_icreate_item.h new file mode 100644 index 000000000..59e89f87c --- /dev/null +++ b/kernel/fs/xfs/xfs_icreate_item.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008-2010, Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_ICREATE_ITEM_H +#define XFS_ICREATE_ITEM_H 1 + +/* in memory log item structure */ +struct xfs_icreate_item { + struct xfs_log_item ic_item; + struct xfs_icreate_log ic_format; +}; + +extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t agbno, unsigned int count, + unsigned int inode_size, xfs_agblock_t length, + unsigned int generation); + +#endif /* XFS_ICREATE_ITEM_H */ diff --git a/kernel/fs/xfs/xfs_inode.c b/kernel/fs/xfs/xfs_inode.c new file mode 100644 index 000000000..539a85fdd --- /dev/null +++ b/kernel/fs/xfs/xfs_inode.c @@ -0,0 +1,3606 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr_sf.h" +#include "xfs_attr.h" +#include "xfs_trans_space.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" +#include "xfs_ialloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_filestream.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" + +kmem_zone_t *xfs_inode_zone; + +/* + * Used in xfs_itruncate_extents(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); + +STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); + +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + +/* + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents. The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. + * + * The functions return a value which should be given to the corresponding + * xfs_iunlock() call. + */ +uint +xfs_ilock_data_map_shared( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and + * the i_lock. This routine allows various combinations of the locks to be + * obtained. + * + * The 3 locks should always be ordered so that the IO lock is obtained first, + * the mmap lock second and the ilock last in order to prevent deadlock. + * + * Basic locking order: + * + * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * + * mmap_sem locking order: + * + * i_iolock -> page lock -> mmap_sem + * mmap_sem -> i_mmap_lock -> page_lock + * + * The difference in mmap_sem locking order mean that we cannot hold the + * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can + * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * in get_user_pages() to map the user pages into the kernel address space for + * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * page faults already hold the mmap_sem. + * + * Hence to serialise fully against both syscall and mmap based IO, we need to + * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * taken in places where we need to invalidate the page cache in a race + * free manner (e.g. truncate, hole punch and other extent manipulation + * functions). + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + + if (lock_flags & XFS_MMAPLOCK_EXCL) { + if (!mrtryupdate(&ip->i_mmaplock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + if (!mrtryaccess(&ip->i_mmaplock)) + goto out_undo_iolock; + } + + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_mmaplock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_mmaplock; + } + return 1; + +out_undo_mmaplock: + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); +out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); +out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. + * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & + ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrdemote(&ip->i_mmaplock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#if defined(DEBUG) || defined(XFS_WARN) +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { + if (!(lock_flags & XFS_MMAPLOCK_SHARED)) + return !!ip->i_mmaplock.mr_writer; + return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif + +#ifdef DEBUG +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; +#endif + +/* + * Bump the subclass so xfs_lock_inodes() acquires each lock with a different + * value. This shouldn't be called for page fault locking, but we also need to + * ensure we don't overrun the number of lockdep subclasses for the iolock or + * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. + */ +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) +{ + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + } + + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << + XFS_MMAPLOCK_SHIFT; + } + + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; + + return lock_mode; +} + +/* + * The following routine will lock n inodes in exclusive mode. We assume the + * caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock is in the AIL and we + * start waiting for another inode that is locked by a thread in a long running + * transaction (such as truncate). This can result in deadlock since the long + * running trans might need to wait for the inode we just locked in order to + * push the tail and free space in the log. + */ +void +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + uint lock_mode) +{ + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; + + /* currently supports between 2 and 5 inodes */ + ASSERT(ips && inodes >= 2 && inodes <= 5); + + try_lock = 0; + i = 0; +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); + + if (i && (ips[i] == ips[i - 1])) /* Already locked */ + continue; + + /* + * If try_lock is not set yet, make sure all locked inodes are + * not in the AIL. If any are, set try_lock to be used later. + */ + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) + try_lock++; + } + } + + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ + if (!try_lock) { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + continue; + } + + /* try_lock means we have an inode locked that is in the AIL. */ + ASSERT(i != 0); + if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) + continue; + + /* + * Unlock all previous guys and try again. xfs_iunlock will try + * to push the tail if the inode is in the AIL. + */ + attempts++; + for (j = i - 1; j >= 0; j--) { + /* + * Check to see if we've already unlocked this one. Not + * the first one going back, and the inode ptr is the + * same. + */ + if (j != (i - 1) && ips[j] == ips[j + 1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } + + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif + } + i = 0; + try_lock = 0; + goto again; + } + +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; + } +#endif +} + +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. + */ +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) +{ + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; + + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + + ASSERT(ip0->i_ino != ip1->i_ino); + + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; + } + + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + + /* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. + */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + } +} + + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} + +STATIC uint +_xfs_dic2xflags( + __uint16_t di_flags) +{ + uint flags = 0; + + if (di_flags & XFS_DIFLAG_ANY) { + if (di_flags & XFS_DIFLAG_REALTIME) + flags |= XFS_XFLAG_REALTIME; + if (di_flags & XFS_DIFLAG_PREALLOC) + flags |= XFS_XFLAG_PREALLOC; + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= XFS_XFLAG_IMMUTABLE; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= XFS_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= XFS_XFLAG_SYNC; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= XFS_XFLAG_NOATIME; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= XFS_XFLAG_NODUMP; + if (di_flags & XFS_DIFLAG_RTINHERIT) + flags |= XFS_XFLAG_RTINHERIT; + if (di_flags & XFS_DIFLAG_PROJINHERIT) + flags |= XFS_XFLAG_PROJINHERIT; + if (di_flags & XFS_DIFLAG_NOSYMLINKS) + flags |= XFS_XFLAG_NOSYMLINKS; + if (di_flags & XFS_DIFLAG_EXTSIZE) + flags |= XFS_XFLAG_EXTSIZE; + if (di_flags & XFS_DIFLAG_EXTSZINHERIT) + flags |= XFS_XFLAG_EXTSZINHERIT; + if (di_flags & XFS_DIFLAG_NODEFRAG) + flags |= XFS_XFLAG_NODEFRAG; + if (di_flags & XFS_DIFLAG_FILESTREAM) + flags |= XFS_XFLAG_FILESTREAM; + } + + return flags; +} + +uint +xfs_ip2xflags( + xfs_inode_t *ip) +{ + xfs_icdinode_t *dic = &ip->i_d; + + return _xfs_dic2xflags(dic->di_flags) | + (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); +} + +uint +xfs_dic2xflags( + xfs_dinode_t *dip) +{ + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | + (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); +} + +/* + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. + */ +int +xfs_lookup( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t **ipp, + struct xfs_name *ci_name) +{ + xfs_ino_t inum; + int error; + uint lock_mode; + + trace_xfs_lookup(dp, name); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + lock_mode = xfs_ilock_data_map_shared(dp); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); + xfs_iunlock(dp, lock_mode); + + if (error) + goto out; + + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); + if (error) + goto out_free_name; + + return 0; + +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out: + *ipp = NULL; + return error; +} + +/* + * Allocate an inode on disk and return a copy of its in-core version. + * The in-core inode is locked exclusively. Set mode, nlink, and rdev + * appropriately within the inode. The uid and gid for the inode are + * set according to the contents of the given cred structure. + * + * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() + * has a free inode available, call xfs_iget() to obtain the in-core + * version of the allocated inode. Finally, fill in the inode and + * log its initial contents. In this case, ialloc_context would be + * set to NULL. + * + * If xfs_dialloc() does not have an available inode, it will replenish + * its supply by doing an allocation. Since we can only do one + * allocation within a transaction without deadlocks, we must commit + * the current transaction before returning the inode itself. + * In this case, therefore, we will set ialloc_context and return. + * The caller should then commit the current transaction, start a new + * transaction, and call xfs_ialloc() again to actually get the inode. + * + * To ensure that some other process does not grab the inode that + * was allocated during the first call to xfs_ialloc(), this routine + * also returns the [locked] bp pointing to the head of the freelist + * as ialloc_context. The caller should hold this buffer across + * the commit and pass it back into this routine on the second call. + * + * If we are allocating quota inodes, we do not have a parent inode + * to attach to or associate with (i.e. pip == NULL) because they + * are not linked into the directory structure - they are attached + * directly to the superblock - and so have no parent. + */ +int +xfs_ialloc( + xfs_trans_t *tp, + xfs_inode_t *pip, + umode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, + int okalloc, + xfs_buf_t **ialloc_context, + xfs_inode_t **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_ino_t ino; + xfs_inode_t *ip; + uint flags; + int error; + struct timespec tv; + + /* + * Call the space management code to pick + * the on-disk inode to be allocated. + */ + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + ialloc_context, &ino); + if (error) + return error; + if (*ialloc_context || ino == NULLFSINO) { + *ipp = NULL; + return 0; + } + ASSERT(*ialloc_context == NULL); + + /* + * Get the in-core inode with the lock held exclusively. + * This is because we're setting fields here we need + * to prevent others from looking at until we're done. + */ + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, + XFS_ILOCK_EXCL, &ip); + if (error) + return error; + ASSERT(ip != NULL); + + /* + * We always convert v1 inodes to v2 now - we only support filesystems + * with >= v2 inode capability, so there is no reason for ever leaving + * an inode in v1 format. + */ + if (ip->i_d.di_version == 1) + ip->i_d.di_version = 2; + + ip->i_d.di_mode = mode; + ip->i_d.di_onlink = 0; + ip->i_d.di_nlink = nlink; + ASSERT(ip->i_d.di_nlink == nlink); + ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); + ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + xfs_set_projid(ip, prid); + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + + if (pip && XFS_INHERIT_GID(pip)) { + ip->i_d.di_gid = pip->i_d.di_gid; + if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { + ip->i_d.di_mode |= S_ISGID; + } + } + + /* + * If the group ID of the new file does not match the effective group + * ID or one of the supplementary group IDs, the S_ISGID bit is cleared + * (and only if the irix_sgid_inherit compatibility variable is set). + */ + if ((irix_sgid_inherit) && + (ip->i_d.di_mode & S_ISGID) && + (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { + ip->i_d.di_mode &= ~S_ISGID; + } + + ip->i_d.di_size = 0; + ip->i_d.di_nextents = 0; + ASSERT(ip->i_d.di_nblocks == 0); + + tv = current_fs_time(mp->m_super); + ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_atime = ip->i_d.di_mtime; + ip->i_d.di_ctime = ip->i_d.di_mtime; + + /* + * di_gen will have been taken care of in xfs_iread. + */ + ip->i_d.di_extsize = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_dmstate = 0; + ip->i_d.di_flags = 0; + + if (ip->i_d.di_version == 3) { + ASSERT(ip->i_d.di_ino == ino); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ip->i_d.di_crc = 0; + ip->i_d.di_changecount = 1; + ip->i_d.di_lsn = 0; + ip->i_d.di_flags2 = 0; + memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); + ip->i_d.di_crtime = ip->i_d.di_mtime; + } + + + flags = XFS_ILOG_CORE; + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_u2.if_rdev = rdev; + ip->i_df.if_flags = 0; + flags |= XFS_ILOG_DEV; + break; + case S_IFREG: + case S_IFDIR: + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { + uint di_flags = 0; + + if (S_ISDIR(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + ip->i_d.di_flags |= di_flags; + } + /* FALLTHROUGH */ + case S_IFLNK: + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_flags = XFS_IFEXTENTS; + ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; + ip->i_df.if_u1.if_extents = NULL; + break; + default: + ASSERT(0); + } + /* + * Attribute fork settings for new inode. + */ + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_anextents = 0; + + /* + * Log the new values stuffed into the inode. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, flags); + + /* now that we have an i_mode we can setup the inode structure */ + xfs_setup_inode(ip); + + *ipp = ip; + return 0; +} + +/* + * Allocates a new inode from disk and return a pointer to the + * incore copy. This routine will internally commit the current + * transaction and allocate a new one if the Space Manager needed + * to do an allocation to replenish the inode free-list. + * + * This routine is designed to be called from xfs_create and + * xfs_create_dir. + * + */ +int +xfs_dir_ialloc( + xfs_trans_t **tpp, /* input: current transaction; + output: may be a new transaction. */ + xfs_inode_t *dp, /* directory within whose allocate + the inode. */ + umode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, /* project id */ + int okalloc, /* ok to allocate new space */ + xfs_inode_t **ipp, /* pointer to inode; it will be + locked. */ + int *committed) + +{ + xfs_trans_t *tp; + xfs_trans_t *ntp; + xfs_inode_t *ip; + xfs_buf_t *ialloc_context = NULL; + int code; + void *dqinfo; + uint tflags; + + tp = *tpp; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* + * xfs_ialloc will return a pointer to an incore inode if + * the Space Manager has an available inode on the free + * list. Otherwise, it will do an allocation and replenish + * the freelist. Since we can only do one allocation per + * transaction without deadlocks, we will need to commit the + * current transaction and start a new one. We will then + * need to call xfs_ialloc again to get the inode. + * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, + &ialloc_context, &ip); + + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!ialloc_context && !ip) { + *ipp = NULL; + return -ENOSPC; + } + + /* + * If the AGI buffer is non-NULL, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. + */ + if (ialloc_context) { + struct xfs_trans_res tres; + + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + /* + * Save the log reservation so we can use + * them in the next transaction. + */ + tres.tr_logres = xfs_trans_get_log_res(tp); + tres.tr_logcount = xfs_trans_get_log_count(tp); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + ntp = xfs_trans_dup(tp); + code = xfs_trans_commit(tp, 0); + tp = ntp; + if (committed != NULL) { + *committed = 1; + } + /* + * If we get an error during the commit processing, + * release the buffer that is still held and return + * to the caller. + */ + if (code) { + xfs_buf_relse(ialloc_context); + if (dqinfo) { + tp->t_dqinfo = dqinfo; + xfs_trans_free_dqinfo(tp); + } + *tpp = ntp; + *ipp = NULL; + return code; + } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + code = xfs_trans_reserve(tp, &tres, 0, 0); + + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = ntp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, + okalloc, &ialloc_context, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. + */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT(!ialloc_context && ip); + + } else { + if (committed != NULL) + *committed = 0; + } + + *ipp = ip; + *tpp = tp; + + return 0; +} + +/* + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. + */ +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + int error; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * Increment the link count on an inode & log the change. + */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_version > 1); + ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); + ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_create( + xfs_inode_t *dp, + struct xfs_name *name, + umode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp) +{ + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + trace_xfs_create(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + tres = &M_RES(mp)->tr_mkdir; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + } else { + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + tres = &M_RES(mp)->tr_create; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + } + + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * Initially assume that the file does not exist and + * reserve the resources for that case. If that is not + * the case we'll drop the one we have and get a more + * appropriate transaction later. + */ + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == -ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(mp); + error = xfs_trans_reserve(tp, tres, resblks, 0); + } + if (error == -ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + xfs_bmap_init(&free_list, &first_block); + + /* + * Reserve disk quota and the inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + if (!resblks) { + error = xfs_dir_canenter(tp, dp, name); + if (error) + goto out_trans_cancel; + } + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, + prid, resblks > 0, &ip, &committed); + if (error) { + if (error == -ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks ? + resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != -ENOSPC); + goto out_trans_abort; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; + + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * create transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +int +xfs_create_tmpfile( + struct xfs_inode *dp, + struct dentry *dentry, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + resblks = XFS_IALLOC_SPACE_RES(mp); + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); + + tres = &M_RES(mp)->tr_create_tmpfile; + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == -ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == -ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + ip->i_d.di_nlink--; + error = xfs_iunlink(tp, ip); + if (error) + goto out_trans_abort; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +int +xfs_link( + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) +{ + xfs_mount_t *mp = tdp->i_mount; + xfs_trans_t *tp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int resblks; + + trace_xfs_link(tdp, target_name); + + ASSERT(!S_ISDIR(sip->i_d.di_mode)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_qm_dqattach(sip, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + if (error == -ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + error = -EXDEV; + goto error_return; + } + + if (!resblks) { + error = xfs_dir_canenter(tp, tdp, target_name); + if (error) + goto error_return; + } + + xfs_bmap_init(&free_list, &first_block); + + if (sip->i_d.di_nlink == 0) { + error = xfs_iunlink_remove(tp, sip); + if (error) + goto abort_return; + } + + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto abort_return; + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) + goto abort_return; + + /* + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_return; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +/* + * Free up the underlying blocks past new_size. The new size must be smaller + * than the current size. This routine can be used both for the attribute and + * data fork, and does not modify the inode size, which is left to the caller. + * + * The transaction passed to this routine must have made a permanent log + * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the + * given transaction and start new ones, so make sure everything involved in + * the transaction is tidy before calling here. Some transaction will be + * returned to the caller to be committed. The incoming transaction must + * already include the inode, and both inode locks must be held exclusively. + * The inode must also be "held" within the transaction. On return the inode + * will be "held" within the returned transaction. This routine does NOT + * require any disk space to be reserved for it within the transaction. + * + * If we get an error, we must return with the inode locked and linked into the + * current transaction. This keeps things simple for the higher level code, + * because it always knows that the inode is locked and held in the transaction + * that returns to it whether errors occur or not. We don't mark the inode + * dirty on error so that transactions can be easily aborted if possible. + */ +int +xfs_itruncate_extents( + struct xfs_trans **tpp, + struct xfs_inode *ip, + int whichfork, + xfs_fsize_t new_size) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp = *tpp; + struct xfs_trans *ntp; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + xfs_fileoff_t first_unmap_block; + xfs_fileoff_t last_block; + xfs_filblks_t unmap_len; + int committed; + int error = 0; + int done = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!atomic_read(&VFS_I(ip)->i_count) || + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(new_size <= XFS_ISIZE(ip)); + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(ip->i_itemp != NULL); + ASSERT(ip->i_itemp->ili_lock_flags == 0); + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + + trace_xfs_itruncate_extents_start(ip, new_size); + + /* + * Since it is possible for space to become allocated beyond + * the end of the file (in a crash where the space is allocated + * but the inode size is not yet updated), simply remove any + * blocks which show up between the new EOF and the maximum + * possible file size. If the first block to be removed is + * beyond the maximum file size (ie it is the same as last_block), + * then there is nothing to do. + */ + first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + if (first_unmap_block == last_block) + return 0; + + ASSERT(first_unmap_block < last_block); + unmap_len = last_block - first_unmap_block + 1; + while (!done) { + xfs_bmap_init(&free_list, &first_block); + error = xfs_bunmapi(tp, ip, + first_unmap_block, unmap_len, + xfs_bmapi_aflag(whichfork), + XFS_ITRUNC_MAX_EXTENTS, + &first_block, &free_list, + &done); + if (error) + goto out_bmap_cancel; + + /* + * Duplicate the transaction that has the permanent + * reservation and commit the old transaction. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (committed) + xfs_trans_ijoin(tp, ip, 0); + if (error) + goto out_bmap_cancel; + + if (committed) { + /* + * Mark the inode dirty so it will be logged and + * moved forward in the log as part of every commit. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + + ntp = xfs_trans_dup(tp); + error = xfs_trans_commit(tp, 0); + tp = ntp; + + xfs_trans_ijoin(tp, ip, 0); + + if (error) + goto out; + + /* + * Transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) + goto out; + } + + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_extents_end(ip, new_size); + +out: + *tpp = tp; + return error; +out_bmap_cancel: + /* + * If the bunmapi call encounters an error, return to the caller where + * the transaction can be properly aborted. We just need to make sure + * we're not holding any resources that we were not when we came in. + */ + xfs_bmap_cancel(&free_list); + goto out; +} + +int +xfs_release( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + int error; + + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + return 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticeable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (ip->i_delayed_blks > 0) { + error = filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } + } + } + + if (ip->i_d.di_nlink == 0) + return 0; + + if (xfs_can_free_eofblocks(ip, false)) { + + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, true); + if (error && error != -EAGAIN) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } + return 0; +} + +/* + * xfs_inactive_truncate + * + * Called to perform a truncate when an inode becomes unlinked. + */ +STATIC int +xfs_inactive_truncate( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Log the inode size first to prevent stale data exposure in the event + * of a system crash before the truncate completes. See the related + * comment in xfs_setattr_size() for details. + */ + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) + goto error_trans_cancel; + + ASSERT(ip->i_d.di_nextents == 0); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_ifree() + * + * Perform the inode free when an inode is unlinked. + */ +STATIC int +xfs_inactive_ifree( + struct xfs_inode *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * The ifree transaction might need to allocate blocks for record + * insertion to the finobt. We don't want to fail here at ENOSPC, so + * allow ifree to dip into the reserved block pool if necessary. + * + * Freeing large sets of inodes generally means freeing inode chunks, + * directory and file data blocks, so this should be relatively safe. + * Only under severe circumstances should it be possible to free enough + * inodes to exhaust the reserve block pool via finobt expansion while + * at the same time not creating free space in the filesystem. + * + * Send a warning if the reservation does happen to fail, as the inode + * now remains allocated and sits on the unlinked list until the fs is + * repaired. + */ + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0); + if (error) { + if (error == -ENOSPC) { + xfs_warn_ratelimited(mp, + "Failed to remove inode(s) from unlinked list. " + "Please free space, unmount and run xfs_repair."); + } else { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * xfs_inactive + * + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. + */ +void +xfs_inactive( + xfs_inode_t *ip) +{ + struct xfs_mount *mp; + int error; + int truncate = 0; + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (ip->i_d.di_mode == 0) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return; + } + + mp = ip->i_mount; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return; + + if (ip->i_d.di_nlink != 0) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ + if (xfs_can_free_eofblocks(ip, true)) + xfs_free_eofblocks(mp, ip, false); + + return; + } + + if (S_ISREG(ip->i_d.di_mode) && + (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return; + + if (S_ISLNK(ip->i_d.di_mode)) + error = xfs_inactive_symlink(ip); + else if (truncate) + error = xfs_inactive_truncate(ip); + if (error) + return; + + /* + * If there are attributes associated with the file then blow them away + * now. The code calls a routine that recursively deconstructs the + * attribute fork. If also blows away the in-core attribute fork. + */ + if (XFS_IFORK_Q(ip)) { + error = xfs_attr_inactive(ip); + if (error) + return; + } + + ASSERT(!ip->i_afp); + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_forkoff == 0); + + /* + * Free the inode. + */ + error = xfs_inactive_ifree(ip); + if (error) + return; + + /* + * Release the dquots held by inode, if any. + */ + xfs_qm_dqdetach(ip); +} + +/* + * This is called when the inode's link count goes to 0. + * We place the on-disk inode on a list in the AGI. It + * will be pulled from this list when the inode is freed. + */ +int +xfs_iunlink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agino_t agino; + short bucket_index; + int offset; + int error; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + + mp = tp->t_mountp; + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. + */ + error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + if (error) + return error; + agi = XFS_BUF_TO_AGI(agibp); + + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(agi->agi_unlinked[bucket_index]); + ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + + if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { + /* + * There is already another inode in the bucket we need + * to add ourselves to. Add us at the front of the list. + * Here we put the head pointer into our next pointer, + * and then we fall through to point the head at us. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); + if (error) + return error; + + ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); + dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } + + /* + * Point the bucket head pointer at the inode being inserted. + */ + ASSERT(agino != 0); + agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + return 0; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +STATIC int +xfs_iunlink_remove( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_ino_t next_ino; + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agnumber_t agno; + xfs_agino_t agino; + xfs_agino_t next_agino; + xfs_buf_t *last_ibp; + xfs_dinode_t *last_dip = NULL; + short bucket_index; + int offset, last_offset = 0; + int error; + + mp = tp->t_mountp; + agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. + */ + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(agibp); + + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)); + ASSERT(agi->agi_unlinked[bucket_index]); + + if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { + /* + * We're at the head of the list. Get the inode's on-disk + * buffer to see if there is anyone after us on the list. + * Only modify our next pointer if it is not already NULLAGINO. + * This saves us the overhead of dealing with the buffer when + * there is no need to change it. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(dip->di_next_unlinked); + ASSERT(next_agino != 0); + if (next_agino != NULLAGINO) { + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the bucket head pointer at the next inode. + */ + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + } else { + /* + * We need to search the list for the inode being freed. + */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + last_ibp = NULL; + while (next_agino != agino) { + struct xfs_imap imap; + + if (last_ibp) + xfs_trans_brelse(tp, last_ibp); + + imap.im_blkno = 0; + next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + + error = xfs_imap(mp, tp, next_ino, &imap, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } + + error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, + &last_ibp, 0, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + last_offset = imap.im_boffset; + next_agino = be32_to_cpu(last_dip->di_next_unlinked); + ASSERT(next_agino != NULLAGINO); + ASSERT(next_agino != 0); + } + + /* + * Now last_ibp points to the buffer previous to us on the + * unlinked list. Pull us from the list. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(dip->di_next_unlinked); + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + if (next_agino != NULLAGINO) { + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the previous inode on the list to the next inode. + */ + last_dip->di_next_unlinked = cpu_to_be32(next_agino); + ASSERT(next_agino != 0); + offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + + xfs_trans_inode_buf(tp, last_ibp); + xfs_trans_log_buf(tp, last_ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, last_ibp); + } + return 0; +} + +/* + * A big issue when freeing the inode cluster is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */ +STATIC int +xfs_ifree_cluster( + xfs_inode_t *free_ip, + xfs_trans_t *tp, + xfs_ino_t inum) +{ + xfs_mount_t *mp = free_ip->i_mount; + int blks_per_cluster; + int inodes_per_cluster; + int nbufs; + int i, j; + xfs_daddr_t blkno; + xfs_buf_t *bp; + xfs_inode_t *ip; + xfs_inode_log_item_t *iip; + xfs_log_item_t *lip; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; + + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), + XFS_INO_TO_AGBNO(mp, inum)); + + /* + * We obtain and lock the backing buffer first in the process + * here, as we have to ensure that any dirty inode that we + * can't get the flush lock on is attached to the buffer. + * If we scan the in-memory inodes first, then buffer IO can + * complete before we get a lock on it, and hence we may fail + * to mark all the active inodes on the buffer stale. + */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); + + if (!bp) + return -ENOMEM; + + /* + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using to mark the buffer as stale in the log, and to + * attach stale cached inodes on it. That means it will never be + * dispatched for IO. If it is, we want to know about it, and we + * want it to fail. We can acheive this by adding a write + * verifier to the buffer. + */ + bp->b_ops = &xfs_inode_buf_ops; + + /* + * Walk the inodes already attached to the buffer and mark them + * stale. These will all have the flush locks held, so an + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. + */ + lip = bp->b_fspriv; + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = xfs_istale_done; + xfs_trans_ail_copy_lsn(mp->m_ail, + &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + xfs_iflags_set(iip->ili_inode, XFS_ISTALE); + } + lip = lip->li_bio_list; + } + + + /* + * For each inode in memory attempt to add it to the inode + * buffer and set it up for being staled on buffer IO + * completion. This is safe as we've locked out tail pushing + * and flushing by locking the buffer. + * + * We have already marked every inode that was part of a + * transaction stale above, which means there is no point in + * even trying to lock them. + */ + for (i = 0; i < inodes_per_cluster; i++) { +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, (inum + i))); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); + continue; + } + + /* + * because this is an RCU protected lookup, we could + * find a recently freed or even reallocated inode + * during the lookup. We need to check under the + * i_flags_lock for a valid inode here. Skip it if it + * is not valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum + i || + __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. + */ + if (ip != free_ip && + !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ + iip = ip->i_itemp; + if (!iip || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } + + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + xfs_buf_attach_iodone(bp, xfs_istale_done, + &iip->ili_item); + + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_binval(tp, bp); + } + + xfs_perag_put(pag); + return 0; +} + +/* + * This is called to return an inode to the inode free list. + * The inode should already be truncated to 0 length and have + * no pages associated with it. This routine also assumes that + * the inode is already a part of the transaction. + * + * The on-disk copy of the inode will have been added to the list + * of unlinked inodes in the AGI. We need to remove the inode from + * that list atomically with respect to freeing it here. + */ +int +xfs_ifree( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_bmap_free_t *flist) +{ + int error; + int delete; + xfs_ino_t first_ino; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); + ASSERT(ip->i_d.di_nblocks == 0); + + /* + * Pull the on-disk inode from the AGI unlinked list. + */ + error = xfs_iunlink_remove(tp, ip); + if (error) + return error; + + error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + if (error) + return error; + + ip->i_d.di_mode = 0; /* mark incore inode as free */ + ip->i_d.di_flags = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + /* + * Bump the generation count so no one will be confused + * by reincarnations of this inode. + */ + ip->i_d.di_gen++; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (delete) + error = xfs_ifree_cluster(ip, tp, first_ino); + + return error; +} + +/* + * This is called to unpin an inode. The caller must have the inode locked + * in at least shared mode so that the buffer cannot be subsequently pinned + * once someone is waiting for it to be unpinned. + */ +static void +xfs_iunpin( + struct xfs_inode *ip) +{ + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + + /* Give the log a push to start the unpinning I/O */ + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); + +} + +static void +__xfs_iunpin_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); + + xfs_iunpin(ip); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_ipincount(ip)) + io_schedule(); + } while (xfs_ipincount(ip)); + finish_wait(wq, &wait.wait); +} + +void +xfs_iunpin_wait( + struct xfs_inode *ip) +{ + if (xfs_ipincount(ip)) + __xfs_iunpin_wait(ip); +} + +/* + * Removing an inode from the namespace involves removing the directory entry + * and dropping the link count on the inode. Removing the directory entry can + * result in locking an AGF (directory blocks were freed) and removing a link + * count can result in placing the inode on an unlinked list which results in + * locking an AGI. + * + * The big problem here is that we have an ordering constraint on AGF and AGI + * locking - inode allocation locks the AGI, then can allocate a new extent for + * new inodes, locking the AGF after the AGI. Similarly, freeing the inode + * removes the inode from the unlinked list, requiring that we lock the AGI + * first, and then freeing the inode can result in an inode chunk being freed + * and hence freeing disk space requiring that we lock an AGF. + * + * Hence the ordering that is imposed by other parts of the code is AGI before + * AGF. This means we cannot remove the directory entry before we drop the inode + * reference count and put it on the unlinked list as this results in a lock + * order of AGF then AGI, and this can deadlock against inode allocation and + * freeing. Therefore we must drop the link counts before we remove the + * directory entry. + * + * This is still safe from a transactional point of view - it is not until we + * get to xfs_bmap_finish() that we have the possibility of multiple + * transactions in this operation. Hence as long as we remove the directory + * entry and drop the link count in the first transaction of the remove + * operation, there are no transactional constraints on the ordering here. + */ +int +xfs_remove( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) +{ + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + uint resblks; + + trace_xfs_remove(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_qm_dqattach(dp, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(ip, 0); + if (error) + goto std_return; + + if (is_dir) + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + else + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. + */ + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + if (error == -ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + } + if (error) { + ASSERT(error != -ENOSPC); + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + cancel_flags |= XFS_TRANS_ABORT; + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = -ENOTEMPTY; + goto out_trans_cancel; + } + if (!xfs_dir_isempty(ip)) { + error = -ENOTEMPTY; + goto out_trans_cancel; + } + + /* Drop the link from ip's "..". */ + error = xfs_droplink(tp, dp); + if (error) + goto out_trans_cancel; + + /* Drop the "." link from ip to self. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* Drop the link from dp to ip. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != -ENOENT); + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto std_return; + + if (is_dir && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +/* + * Enter all inodes for a rename transaction into a sorted array. + */ +#define __XFS_SORT_INODES 5 +STATIC void +xfs_sort_for_rename( + struct xfs_inode *dp1, /* in: old (source) directory inode */ + struct xfs_inode *dp2, /* in: new (target) directory inode */ + struct xfs_inode *ip1, /* in: inode of old entry */ + struct xfs_inode *ip2, /* in: inode of new entry */ + struct xfs_inode *wip, /* in: whiteout inode */ + struct xfs_inode **i_tab,/* out: sorted array of inodes */ + int *num_inodes) /* in/out: inodes in array */ +{ + int i, j; + + ASSERT(*num_inodes == __XFS_SORT_INODES); + memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); + + /* + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. + */ + i = 0; + i_tab[i++] = dp1; + i_tab[i++] = dp2; + i_tab[i++] = ip1; + if (ip2) + i_tab[i++] = ip2; + if (wip) + i_tab[i++] = wip; + *num_inodes = i; + + /* + * Sort the elements via bubble sort. (Remember, there are at + * most 5 elements to sort, so this is adequate.) + */ + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + struct xfs_inode *temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } + } + } +} + +static int +xfs_finish_rename( + struct xfs_trans *tp, + struct xfs_bmap_free *free_list) +{ + int committed = 0; + int error; + + /* + * If this is a synchronous mount, make sure that the rename transaction + * goes to disk before returning to the user. + */ + if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, free_list, &committed); + if (error) { + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +} + +/* + * xfs_cross_rename() + * + * responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall + */ +STATIC int +xfs_cross_rename( + struct xfs_trans *tp, + struct xfs_inode *dp1, + struct xfs_name *name1, + struct xfs_inode *ip1, + struct xfs_inode *dp2, + struct xfs_name *name2, + struct xfs_inode *ip2, + struct xfs_bmap_free *free_list, + xfs_fsblock_t *first_block, + int spaceres) +{ + int error = 0; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; + + /* Swap inode number for dirent in first parent */ + error = xfs_dir_replace(tp, dp1, name1, + ip2->i_ino, + first_block, free_list, spaceres); + if (error) + goto out_trans_abort; + + /* Swap inode number for dirent in second parent */ + error = xfs_dir_replace(tp, dp2, name2, + ip1->i_ino, + first_block, free_list, spaceres); + if (error) + goto out_trans_abort; + + /* + * If we're renaming one or more directories across different parents, + * update the respective ".." entries (and link counts) to match the new + * parents. + */ + if (dp1 != dp2) { + dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + + if (S_ISDIR(ip2->i_d.di_mode)) { + error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, + dp1->i_ino, first_block, + free_list, spaceres); + if (error) + goto out_trans_abort; + + /* transfer ip2 ".." reference to dp1 */ + if (!S_ISDIR(ip1->i_d.di_mode)) { + error = xfs_droplink(tp, dp2); + if (error) + goto out_trans_abort; + error = xfs_bumplink(tp, dp1); + if (error) + goto out_trans_abort; + } + + /* + * Although ip1 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + } + + if (S_ISDIR(ip1->i_d.di_mode)) { + error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, + dp2->i_ino, first_block, + free_list, spaceres); + if (error) + goto out_trans_abort; + + /* transfer ip1 ".." reference to dp2 */ + if (!S_ISDIR(ip2->i_d.di_mode)) { + error = xfs_droplink(tp, dp1); + if (error) + goto out_trans_abort; + error = xfs_bumplink(tp, dp2); + if (error) + goto out_trans_abort; + } + + /* + * Although ip2 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_CHG; + } + } + + if (ip1_flags) { + xfs_trans_ichgtime(tp, ip1, ip1_flags); + xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); + } + if (ip2_flags) { + xfs_trans_ichgtime(tp, ip2, ip2_flags); + xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); + } + if (dp2_flags) { + xfs_trans_ichgtime(tp, dp2, dp2_flags); + xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + return xfs_finish_rename(tp, free_list); + +out_trans_abort: + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; +} + +/* + * xfs_rename_alloc_whiteout() + * + * Return a referenced, unlinked, unlocked inode that that can be used as a + * whiteout in a rename transaction. We use a tmpfile inode here so that if we + * crash between allocating the inode and linking it into the rename transaction + * recovery will free the inode and we won't leak it. + */ +static int +xfs_rename_alloc_whiteout( + struct xfs_inode *dp, + struct xfs_inode **wip) +{ + struct xfs_inode *tmpfile; + int error; + + error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + if (error) + return error; + + /* + * Prepare the tmpfile inode as if it were created through the VFS. + * Otherwise, the link increment paths will complain about nlink 0->1. + * Drop the link count as done by d_tmpfile(), complete the inode setup + * and flag it as linkable. + */ + drop_nlink(VFS_I(tmpfile)); + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; + + *wip = tmpfile; + return 0; +} + +/* + * xfs_rename + */ +int +xfs_rename( + struct xfs_inode *src_dp, + struct xfs_name *src_name, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, + unsigned int flags) +{ + struct xfs_mount *mp = src_dp->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + struct xfs_inode *wip = NULL; /* whiteout inode */ + struct xfs_inode *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; + bool new_parent = (src_dp != target_dp); + bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + int cancel_flags = 0; + int spaceres; + int error; + + trace_xfs_rename(src_dp, target_dp, src_name, target_name); + + if ((flags & RENAME_EXCHANGE) && !target_ip) + return -EINVAL; + + /* + * If we are doing a whiteout operation, allocate the whiteout inode + * we will be placing at the target and ensure the type is set + * appropriately. + */ + if (flags & RENAME_WHITEOUT) { + ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); + error = xfs_rename_alloc_whiteout(target_dp, &wip); + if (error) + return error; + + /* setup target dirent info as whiteout */ + src_name->type = XFS_DIR3_FT_CHRDEV; + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, + inodes, &num_inodes); + + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + if (error == -ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + } + if (error) + goto out_trans_cancel; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * Attach the dquots to the inodes + */ + error = xfs_qm_vop_rename_dqattach(inodes); + if (error) + goto out_trans_cancel; + + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. + */ + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + if (wip) + xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + error = -EXDEV; + goto out_trans_cancel; + } + + xfs_bmap_init(&free_list, &first_block); + + /* RENAME_EXCHANGE is unique from here on. */ + if (flags & RENAME_EXCHANGE) + return xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + if (!spaceres) { + error = xfs_dir_canenter(tp, target_dp, target_name); + if (error) + goto out_trans_cancel; + } + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error == -ENOSPC) + goto out_bmap_cancel; + if (error) + goto out_trans_abort; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) + goto out_trans_abort; + } + } else { /* target_ip != NULL */ + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if (S_ISDIR(target_ip->i_d.di_mode)) { + /* + * Make sure target dir is empty. + */ + if (!(xfs_dir_isempty(target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = -EEXIST; + goto out_trans_cancel; + } + } + + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto out_trans_abort; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto out_trans_abort; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto out_trans_abort; + } + } /* target_ip != NULL */ + + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, + &first_block, &free_list, spaceres); + ASSERT(error != -EEXIST); + if (error) + goto out_trans_abort; + } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + goto out_trans_abort; + } + + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (wip) { + error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, + &first_block, &free_list, spaceres); + } else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto out_trans_abort; + + /* + * For whiteouts, we need to bump the link count on the whiteout inode. + * This means that failures all the way up to this point leave the inode + * on the unlinked list and so cleanup is a simple matter of dropping + * the remaining reference to it. If we fail here after bumping the link + * count, we're shutting down the filesystem so we'll never see the + * intermediate state on disk. + */ + if (wip) { + ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); + error = xfs_bumplink(tp, wip); + if (error) + goto out_trans_abort; + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_abort; + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); + + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. + */ + VFS_I(wip)->i_state &= ~I_LINKABLE; + } + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + error = xfs_finish_rename(tp, &free_list); + if (wip) + IRELE(wip); + return error; + +out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + if (wip) + IRELE(wip); + return error; +} + +STATIC int +xfs_iflush_cluster( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_mount_t *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned long first_index, mask; + unsigned long inodes_per_cluster; + int ilist_size; + xfs_inode_t **ilist; + xfs_inode_t *iq; + int nr_found; + int clcount = 0; + int bufwasdelwri; + int i; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); + if (!ilist) + goto out_put; + + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + rcu_read_lock(); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + first_index, inodes_per_cluster); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + iq = ilist[i]; + if (iq == ip) + continue; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino || + (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + continue; + + /* + * Try to get locks. If any are unavailable or it is pinned, + * then this inode cannot be flushed and is skipped. + */ + + if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(iq)) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(iq)) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + if (!xfs_inode_clean(iq)) { + int error; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + clcount++; + } else { + xfs_ifunlock(iq); + } + xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + + if (clcount) { + XFS_STATS_INC(xs_icluster_flushcnt); + XFS_STATS_ADD(xs_icluster_flushinode, clcount); + } + +out_free: + rcu_read_unlock(); + kmem_free(ilist); +out_put: + xfs_perag_put(pag); + return 0; + + +cluster_corrupt_out: + /* + * Corruption detected in the clustering loop. Invalidate the + * inode buffer and shut down the filesystem. + */ + rcu_read_unlock(); + /* + * Clean up the buffer. If it was delwri, just release it -- + * brelse can handle it with no problems. If not, shut down the + * filesystem before releasing the buffer. + */ + bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); + if (bufwasdelwri) + xfs_buf_relse(bp); + + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + + if (!bufwasdelwri) { + /* + * Just like incore_relse: if we have b_iodone functions, + * mark the buffer as an error and call them. Otherwise + * mark it as stale and brelse. + */ + if (bp->b_iodone) { + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); + } else { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + } + + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(iq, false); + kmem_free(ilist); + xfs_perag_put(pag); + return -EFSCORRUPTED; +} + +/* + * Flush dirty inode metadata into the backing buffer. + * + * The caller must have the inode lock and the inode flush lock held. The + * inode lock will still be held upon return to the caller, and the inode + * flush lock will be released after the inode has reached the disk. + * + * The caller must write out the buffer returned in *bpp and release it. + */ +int +xfs_iflush( + struct xfs_inode *ip, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + struct xfs_dinode *dip; + int error; + + XFS_STATS_INC(xs_iflush_count); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isiflocked(ip)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + + *bpp = NULL; + + xfs_iunpin_wait(ip); + + /* + * For stale inodes we cannot rely on the backing buffer remaining + * stale in cache for the remaining life of the stale inode and so + * xfs_imap_to_bp() below may give us a buffer that no longer contains + * inodes below. We have to check this after ensuring the inode is + * unpinned so that it is safe to reclaim the stale inode after the + * flush call. + */ + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this inode + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + error = -EIO; + goto abort_out; + } + + /* + * Get the buffer containing the on-disk inode. + */ + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, + 0); + if (error || !bp) { + xfs_ifunlock(ip); + return error; + } + + /* + * First flush out the inode that xfs_iflush was called with. + */ + error = xfs_iflush_int(ip, bp); + if (error) + goto corrupt_out; + + /* + * If the buffer is pinned then push on the log now so we won't + * get stuck waiting in the write for too long. + */ + if (xfs_buf_ispinned(bp)) + xfs_log_force(mp, 0); + + /* + * inode clustering: + * see if other inodes can be gathered into this write + */ + error = xfs_iflush_cluster(ip, bp); + if (error) + goto cluster_corrupt_out; + + *bpp = bp; + return 0; + +corrupt_out: + xfs_buf_relse(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +cluster_corrupt_out: + error = -EFSCORRUPTED; +abort_out: + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(ip, false); + return error; +} + +STATIC int +xfs_iflush_int( + struct xfs_inode *ip, + struct xfs_buf *bp) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isiflocked(ip)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(ip->i_d.di_version > 1); + + /* set *dip = inode's place in the buffer */ + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + + if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), + mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", + __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, + mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", + __func__, ip->i_ino, ip, ip->i_d.di_magic); + goto corrupt_out; + } + if (S_ISREG(ip->i_d.di_mode)) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad regular inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); + goto corrupt_out; + } + } else if (S_ISDIR(ip->i_d.di_mode)) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad directory inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); + goto corrupt_out; + } + } + if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, + XFS_RANDOM_IFLUSH_5)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: detected corrupt incore inode %Lu, " + "total extents = %d, nblocks = %Ld, ptr 0x%p", + __func__, ip->i_ino, + ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_d.di_nblocks, ip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, + mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", + __func__, ip->i_ino, ip->i_d.di_forkoff, ip); + goto corrupt_out; + } + + /* + * Inode item log recovery for v2 inodes are dependent on the + * di_flushiter count for correct sequencing. We bump the flush + * iteration count so we can detect flushes which postdate a log record + * during recovery. This is redundant as we now log every change and + * hence this can't happen but we need to still do it to ensure + * backwards compatibility with old kernels that predate logging all + * inode changes. + */ + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; + + /* + * Copy the dirty parts of the inode into the on-disk + * inode. We always copy out the core of the inode, + * because if the inode is dirty at all the core must + * be. + */ + xfs_dinode_to_disk(dip, &ip->i_d); + + /* Wrap, we never let the log put out DI_MAX_FLUSH */ + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) + ip->i_d.di_flushiter = 0; + + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (XFS_IFORK_Q(ip)) + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + xfs_inobp_check(mp, bp); + + /* + * We've recorded everything logged in the inode, so we'd like to clear + * the ili_fields bits so we don't log and flush things unnecessarily. + * However, we can't stop logging all this information until the data + * we've copied into the disk buffer is written to disk. If we did we + * might overwrite the copy of the inode in the log with all the data + * after re-logging only part of it, and in the face of a crash we + * wouldn't have all the data we need to recover. + * + * What we do is move the bits to the ili_last_fields field. When + * logging the inode, these bits are moved back to the ili_fields field. + * In the xfs_iflush_done() routine we clear ili_last_fields, since we + * know that the information those bits represent is permanently on + * disk. As long as the flush completes before the inode is logged + * again, then both ili_fields and ili_last_fields will be cleared. + * + * We can play with the ili_fields bits here, because the inode lock + * must be held exclusively in order to set bits there and the flush + * lock protects the ili_last_fields bits. Set ili_logged so the flush + * done routine can tell whether or not to look in the AIL. Also, store + * the current LSN of the inode so that we can tell whether the item has + * moved in the AIL from xfs_iflush_done(). In order to read the lsn we + * need the AIL lock, because it is a 64 bit value that cannot be read + * atomically. + */ + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; + + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + + /* update the lsn in the on disk inode if required */ + if (ip->i_d.di_version == 3) + dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); + + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); + + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); + return 0; + +corrupt_out: + return -EFSCORRUPTED; +} diff --git a/kernel/fs/xfs/xfs_inode.h b/kernel/fs/xfs/xfs_inode.h new file mode 100644 index 000000000..8f22d2036 --- /dev/null +++ b/kernel/fs/xfs/xfs_inode.h @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_H__ +#define __XFS_INODE_H__ + +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" + +/* + * Kernel only inode definitions + */ +struct xfs_dinode; +struct xfs_inode; +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_bmbt_irec; +struct xfs_inode_log_item; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot; + +typedef struct xfs_inode { + /* Inode linking and identification information. */ + struct xfs_mount *i_mount; /* fs mount struct ptr */ + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ + + /* Inode location stuff */ + xfs_ino_t i_ino; /* inode number (agno/agino)*/ + struct xfs_imap i_imap; /* location for xfs_imap() */ + + /* Extent information. */ + xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t i_df; /* data fork */ + + /* operations vectors */ + const struct xfs_dir_ops *d_ops; /* directory ops vector */ + + /* Transaction and locking information. */ + struct xfs_inode_log_item *i_itemp; /* logging information */ + mrlock_t i_lock; /* inode lock */ + mrlock_t i_iolock; /* inode IO lock */ + mrlock_t i_mmaplock; /* inode mmap IO lock */ + atomic_t i_pincount; /* inode pin count */ + spinlock_t i_flags_lock; /* inode i_flags lock */ + /* Miscellaneous state. */ + unsigned long i_flags; /* see defined flags below */ + unsigned int i_delayed_blks; /* count of delay alloc blks */ + + xfs_icdinode_t i_d; /* most of ondisk inode */ + + /* VFS inode */ + struct inode i_vnode; /* embedded VFS inode */ +} xfs_inode_t; + +/* Convert from vfs inode to xfs inode */ +static inline struct xfs_inode *XFS_I(struct inode *inode) +{ + return container_of(inode, struct xfs_inode, i_vnode); +} + +/* convert from xfs inode to vfs inode */ +static inline struct inode *VFS_I(struct xfs_inode *ip) +{ + return &ip->i_vnode; +} + +/* + * For regular files we only update the on-disk filesize when actually + * writing data back to disk. Until then only the copy in the VFS inode + * is uptodate. + */ +static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) +{ + if (S_ISREG(ip->i_d.di_mode)) + return i_size_read(VFS_I(ip)); + return ip->i_d.di_size; +} + +/* + * If this I/O goes past the on-disk inode size update it unless it would + * be past the current in-core inode size. + */ +static inline xfs_fsize_t +xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) +{ + xfs_fsize_t i_size = i_size_read(VFS_I(ip)); + + if (new_size > i_size || new_size < 0) + new_size = i_size; + return new_size > ip->i_d.di_size ? new_size : 0; +} + +/* + * i_flags helper functions + */ +static inline void +__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + ip->i_flags |= flags; +} + +static inline void +xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + __xfs_iflags_set(ip, flags); + spin_unlock(&ip->i_flags_lock); +} + +static inline void +xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); +} + +static inline int +__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + return (ip->i_flags & flags); +} + +static inline int +xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + spin_lock(&ip->i_flags_lock); + ret = __xfs_iflags_test(ip, flags); + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (ret) + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (!ret) + ip->i_flags |= flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + +/* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was chosen + * to retain compatibility with "old" filesystems). + */ +static inline prid_t +xfs_get_projid(struct xfs_inode *ip) +{ + return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; +} + +static inline void +xfs_set_projid(struct xfs_inode *ip, + prid_t projid) +{ + ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); + ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); +} + +static inline prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + return xfs_get_projid(dp); + + return XFS_PROJID_DEFAULT; +} + +/* + * In-core inode flags. + */ +#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ +#define XFS_ISTALE (1 << 1) /* inode has been staled */ +#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ +#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ +#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ +#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ +#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) +#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ + +/* + * Per-lifetime flags need to be reset when re-using a reclaimable inode during + * inode lookup. This prevents unintended behaviour on the new inode from + * ocurring. + */ +#define XFS_IRECLAIM_RESET_FLAGS \ + (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) + +/* + * Synchronize processes attempting to flush the in-core inode back to disk. + */ + +extern void __xfs_iflock(struct xfs_inode *ip); + +static inline int xfs_iflock_nowait(struct xfs_inode *ip) +{ + return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); +} + +static inline void xfs_iflock(struct xfs_inode *ip) +{ + if (!xfs_iflock_nowait(ip)) + __xfs_iflock(ip); +} + +static inline void xfs_ifunlock(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_IFLOCK); + smp_mb(); + wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); +} + +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + +/* + * Flags for inode locking. + * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) + * 1<<16 - 1<<32-1 -- lockdep annotation (integers) + */ +#define XFS_IOLOCK_EXCL (1<<0) +#define XFS_IOLOCK_SHARED (1<<1) +#define XFS_ILOCK_EXCL (1<<2) +#define XFS_ILOCK_SHARED (1<<3) +#define XFS_MMAPLOCK_EXCL (1<<4) +#define XFS_MMAPLOCK_SHARED (1<<5) + +#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ + | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED) + +#define XFS_LOCK_FLAGS \ + { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ + { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ + { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ + { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ + { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \ + { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" } + + +/* + * Flags for lockdep annotations. + * + * XFS_LOCK_PARENT - for directory operations that require locking a + * parent directory inode and a child entry inode. The parent gets locked + * with this flag so it gets a lockdep subclass of 1 and the child entry + * lock will have a lockdep subclass of 0. + * + * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary + * inodes do not participate in the normal lock order, and thus have their + * own subclasses. + * + * XFS_LOCK_INUMORDER - for locking several inodes at the some time + * with xfs_lock_inodes(). This flag is used as the starting subclass + * and each subsequent lock acquired will increment the subclass by one. + * So the first lock acquired will have a lockdep subclass of 4, the + * second lock will have a lockdep subclass of 5, and so on. It is + * the responsibility of the class builder to shift this to the correct + * portion of the lock_mode lockdep mask. + */ +#define XFS_LOCK_PARENT 1 +#define XFS_LOCK_RTBITMAP 2 +#define XFS_LOCK_RTSUM 3 +#define XFS_LOCK_INUMORDER 4 + +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) + +#define XFS_MMAPLOCK_SHIFT 20 + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) + +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ + XFS_MMAPLOCK_DEP_MASK | \ + XFS_ILOCK_DEP_MASK) + +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \ + >> XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \ + >> XFS_MMAPLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ + >> XFS_ILOCK_SHIFT) + +/* + * For multiple groups support: if S_ISGID bit is set in the parent + * directory, group of new file is set to that of the parent, and + * new subdirectory gets S_ISGID bit from parent. + */ +#define XFS_INHERIT_GID(pip) \ + (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ + ((pip)->i_d.di_mode & S_ISGID)) + +int xfs_release(struct xfs_inode *ip); +void xfs_inactive(struct xfs_inode *ip); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, + umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, + umode_t mode, struct xfs_inode **ipp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, unsigned int flags); + +void xfs_ilock(xfs_inode_t *, uint); +int xfs_ilock_nowait(xfs_inode_t *, uint); +void xfs_iunlock(xfs_inode_t *, uint); +void xfs_ilock_demote(xfs_inode_t *, uint); +int xfs_isilocked(xfs_inode_t *, uint); +uint xfs_ilock_data_map_shared(struct xfs_inode *); +uint xfs_ilock_attr_map_shared(struct xfs_inode *); +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_buf **, xfs_inode_t **); + +uint xfs_ip2xflags(struct xfs_inode *); +uint xfs_dic2xflags(struct xfs_dinode *); +int xfs_ifree(struct xfs_trans *, xfs_inode_t *, + struct xfs_bmap_free *); +int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, + int, xfs_fsize_t); +int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); + +void xfs_iext_realloc(xfs_inode_t *, int, int); + +void xfs_iunpin_wait(xfs_inode_t *); +#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) + +int xfs_iflush(struct xfs_inode *, struct xfs_buf **); +void xfs_lock_inodes(xfs_inode_t **, int, uint); +void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); + +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); + +int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_inode **, int *); +int xfs_droplink(struct xfs_trans *, struct xfs_inode *); +int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); + +/* from xfs_file.c */ +enum xfs_prealloc_flags { + XFS_PREALLOC_SET = (1 << 1), + XFS_PREALLOC_CLEAR = (1 << 2), + XFS_PREALLOC_SYNC = (1 << 3), + XFS_PREALLOC_INVISIBLE = (1 << 4), +}; + +int xfs_update_prealloc_flags(struct xfs_inode *ip, + enum xfs_prealloc_flags flags); +int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, + xfs_fsize_t isize, bool *did_zeroing); +int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); + + +/* from xfs_iops.c */ +/* + * When setting up a newly allocated inode, we need to call + * xfs_finish_inode_setup() once the inode is fully instantiated at + * the VFS level to prevent the rest of the world seeing the inode + * before we've completed instantiation. Otherwise we can do it + * the moment the inode lookup is complete. + */ +extern void xfs_setup_inode(struct xfs_inode *ip); +static inline void xfs_finish_inode_setup(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + unlock_new_inode(VFS_I(ip)); +} + +static inline void xfs_setup_existing_inode(struct xfs_inode *ip) +{ + xfs_setup_inode(ip); + xfs_finish_inode_setup(ip); +} + +#define IHOLD(ip) \ +do { \ + ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ + ihold(VFS_I(ip)); \ + trace_xfs_ihold(ip, _THIS_IP_); \ +} while (0) + +#define IRELE(ip) \ +do { \ + trace_xfs_irele(ip, _THIS_IP_); \ + iput(VFS_I(ip)); \ +} while (0) + +extern struct kmem_zone *xfs_inode_zone; + +/* + * Flags for read/write calls + */ +#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ +#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ + +#define XFS_IO_FLAGS \ + { XFS_IO_ISDIRECT, "DIRECT" }, \ + { XFS_IO_INVIS, "INVIS"} + +#endif /* __XFS_INODE_H__ */ diff --git a/kernel/fs/xfs/xfs_inode_item.c b/kernel/fs/xfs/xfs_inode_item.c new file mode 100644 index 000000000..bf13a5a7e --- /dev/null +++ b/kernel/fs/xfs/xfs_inode_item.c @@ -0,0 +1,789 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" + + +kmem_zone_t *xfs_ili_zone; /* inode log item zone */ + +static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_inode_log_item, ili_item); +} + +STATIC void +xfs_inode_item_data_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) { + /* worst case, doesn't subtract delalloc extents */ + *nbytes += XFS_IFORK_DSIZE(ip); + *nvecs += 1; + } + break; + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) { + *nbytes += ip->i_df.if_broot_bytes; + *nvecs += 1; + } + break; + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) { + *nbytes += roundup(ip->i_df.if_bytes, 4); + *nvecs += 1; + } + break; + + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_UUID: + break; + default: + ASSERT(0); + break; + } +} + +STATIC void +xfs_inode_item_attr_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; + + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) { + /* worst case, doesn't subtract unused space */ + *nbytes += XFS_IFORK_ASIZE(ip); + *nvecs += 1; + } + break; + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) { + *nbytes += ip->i_afp->if_broot_bytes; + *nvecs += 1; + } + break; + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) { + *nbytes += roundup(ip->i_afp->if_bytes, 4); + *nvecs += 1; + } + break; + default: + ASSERT(0); + break; + } +} + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); + + xfs_inode_item_data_fork_size(iip, nvecs, nbytes); + if (XFS_IFORK_Q(ip)) + xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); +} + +STATIC void +xfs_inode_item_format_data_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) { + struct xfs_bmbt_rec *p; + + ASSERT(ip->i_df.if_u1.if_extents != NULL); + ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ASSERT(data_bytes <= ip->i_df.if_bytes); + + ilf->ilf_dsize = data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_DEXT; + } + break; + case XFS_DINODE_FMT_BTREE: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) { + ASSERT(ip->i_df.if_broot != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT, + ip->i_df.if_broot, + ip->i_df.if_broot_bytes); + ilf->ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_size++; + } else { + ASSERT(!(iip->ili_fields & + XFS_ILOG_DBROOT)); + iip->ili_fields &= ~XFS_ILOG_DBROOT; + } + break; + case XFS_DINODE_FMT_LOCAL: + iip->ili_fields &= + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) { + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_df.if_bytes, 4); + ASSERT(ip->i_df.if_real_bytes == 0 || + ip->i_df.if_real_bytes == data_bytes); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, + ip->i_df.if_u1.if_data, data_bytes); + ilf->ilf_dsize = (unsigned)data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_DDATA; + } + break; + case XFS_DINODE_FMT_DEV: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_UUID); + if (iip->ili_fields & XFS_ILOG_DEV) + ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; + break; + case XFS_DINODE_FMT_UUID: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_DEV); + if (iip->ili_fields & XFS_ILOG_UUID) + ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; + break; + default: + ASSERT(0); + break; + } +} + +STATIC void +xfs_inode_item_format_attr_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; + + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) { + struct xfs_bmbt_rec *p; + + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ip->i_d.di_anextents); + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ilf->ilf_asize = data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_AEXT; + } + break; + case XFS_DINODE_FMT_BTREE: + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); + + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) { + ASSERT(ip->i_afp->if_broot != NULL); + + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, + ip->i_afp->if_broot, + ip->i_afp->if_broot_bytes); + ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_ABROOT; + } + break; + case XFS_DINODE_FMT_LOCAL: + iip->ili_fields &= + ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) { + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_afp->if_bytes, 4); + ASSERT(ip->i_afp->if_real_bytes == 0 || + ip->i_afp->if_real_bytes == data_bytes); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, + ip->i_afp->if_u1.if_data, + data_bytes); + ilf->ilf_asize = (unsigned)data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_ADATA; + } + break; + default: + ASSERT(0); + break; + } +} + +/* + * This is called to fill in the vector of log iovecs for the given inode + * log item. It fills the first item with an inode log format structure, + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; + + ASSERT(ip->i_d.di_version > 1); + + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); + ilf->ilf_type = XFS_LI_INODE; + ilf->ilf_ino = ip->i_ino; + ilf->ilf_blkno = ip->i_imap.im_blkno; + ilf->ilf_len = ip->i_imap.im_len; + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); + + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); + if (XFS_IFORK_Q(ip)) { + xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); + } else { + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + } + + /* update the format with the exact fields we actually logged */ + ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); +} + +/* + * This is called to pin the inode associated with the inode log + * item in memory so it cannot be written out. + */ +STATIC void +xfs_inode_item_pin( + struct xfs_log_item *lip) +{ + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + trace_xfs_inode_pin(ip, _RET_IP_); + atomic_inc(&ip->i_pincount); +} + + +/* + * This is called to unpin the inode associated with the inode log + * item which was previously pinned with a call to xfs_inode_item_pin(). + * + * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. + */ +STATIC void +xfs_inode_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(atomic_read(&ip->i_pincount) > 0); + if (atomic_dec_and_test(&ip->i_pincount)) + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); +} + +STATIC uint +xfs_inode_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; + + if (xfs_ipincount(ip) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + return XFS_ITEM_LOCKED; + + /* + * Re-check the pincount now that we stabilized the value by + * taking the ilock. + */ + if (xfs_ipincount(ip) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Stale inode items should force out the iclog. + */ + if (ip->i_flags & XFS_ISTALE) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the inode. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ + if (!xfs_iflock_nowait(ip)) { + rval = XFS_ITEM_FLUSHING; + goto out_unlock; + } + + ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_iflush(ip, &bp); + if (!error) { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return rval; +} + +/* + * Unlock the inode associated with the inode log item. + * Clear the fields of the inode and inode log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the inode. + */ +STATIC void +xfs_inode_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + unsigned short lock_flags; + + ASSERT(ip->i_itemp != NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + lock_flags = iip->ili_lock_flags; + iip->ili_lock_flags = 0; + if (lock_flags) + xfs_iunlock(ip, lock_flags); +} + +/* + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completes before the inode is inserted + * into the AIL. In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freein an inode + * still in the AIL. + * + * To avoid this, just unpin the inode directly and return a LSN of -1 so the + * transaction committed code knows that it does not need to do any further + * processing on the item. + */ +STATIC xfs_lsn_t +xfs_inode_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_inode_item_unpin(lip, 0); + return -1; + } + return lsn; +} + +/* + * XXX rcc - this one really has to do something. Probably needs + * to stamp in a new field in the incore inode. + */ +STATIC void +xfs_inode_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + INODE_ITEM(lip)->ili_last_lsn = lsn; +} + +/* + * This is the ops vector shared by all buf log items. + */ +static const struct xfs_item_ops xfs_inode_item_ops = { + .iop_size = xfs_inode_item_size, + .iop_format = xfs_inode_item_format, + .iop_pin = xfs_inode_item_pin, + .iop_unpin = xfs_inode_item_unpin, + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, + .iop_committing = xfs_inode_item_committing +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + */ +void +xfs_inode_item_init( + struct xfs_inode *ip, + struct xfs_mount *mp) +{ + struct xfs_inode_log_item *iip; + + ASSERT(ip->i_itemp == NULL); + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); + + iip->ili_inode = ip; + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); +} + +/* + * Free the inode log item and any memory hanging off of it. + */ +void +xfs_inode_item_destroy( + xfs_inode_t *ip) +{ + kmem_zone_free(xfs_ili_zone, ip->i_itemp); +} + + +/* + * This is the inode flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the inode is + * flushed to disk. It is responsible for removing the inode item + * from the AIL if it has not been re-logged, and unlocking the inode's + * flush lock. + * + * To reduce AIL lock traffic as much as possible, we scan the buffer log item + * list for other inodes that will run this function. We remove them from the + * buffer list so we can process all the inode IO completions in one AIL lock + * traversal. + */ +void +xfs_iflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip; + struct xfs_log_item *blip; + struct xfs_log_item *next; + struct xfs_log_item *prev; + struct xfs_ail *ailp = lip->li_ailp; + int need_ail = 0; + + /* + * Scan the buffer IO completions for other inodes being completed and + * attach them to the current inode log item. + */ + blip = bp->b_fspriv; + prev = NULL; + while (blip != NULL) { + if (blip->li_cb != xfs_iflush_done) { + prev = blip; + blip = blip->li_bio_list; + continue; + } + + /* remove from list */ + next = blip->li_bio_list; + if (!prev) { + bp->b_fspriv = next; + } else { + prev->li_bio_list = next; + } + + /* add to current list */ + blip->li_bio_list = lip->li_bio_list; + lip->li_bio_list = blip; + + /* + * while we have the item, do the unlocked check for needing + * the AIL lock. + */ + iip = INODE_ITEM(blip); + if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + blip = next; + } + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + /* + * We only want to pull the item from the AIL if it is + * actually there and its location in the log has not + * changed since we started the flush. Thus, we only bother + * if the ili_logged flag is set and the inode's lsn has not + * changed. First we check the lsn outside + * the lock since it's cheaper, and then we recheck while + * holding the lock before removing the inode from the AIL. + */ + if (need_ail) { + struct xfs_log_item *log_items[need_ail]; + int i = 0; + spin_lock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { + iip = INODE_ITEM(blip); + if (iip->ili_logged && + blip->li_lsn == iip->ili_flush_lsn) { + log_items[i++] = blip; + } + ASSERT(i <= need_ail); + } + /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ + xfs_trans_ail_delete_bulk(ailp, log_items, i, + SHUTDOWN_CORRUPT_INCORE); + } + + + /* + * clean up and unlock the flush lock now we are done. We can clear the + * ili_last_fields bits now that we know that the data corresponding to + * them is safely on disk. + */ + for (blip = lip; blip; blip = next) { + next = blip->li_bio_list; + blip->li_bio_list = NULL; + + iip = INODE_ITEM(blip); + iip->ili_logged = 0; + iip->ili_last_fields = 0; + xfs_ifunlock(iip->ili_inode); + } +} + +/* + * This is the inode flushing abort routine. It is called from xfs_iflush when + * the filesystem is shutting down to clean up the inode state. It is + * responsible for removing the inode item from the AIL if it has not been + * re-logged, and unlocking the inode's flush lock. + */ +void +xfs_iflush_abort( + xfs_inode_t *ip, + bool stale) +{ + xfs_inode_log_item_t *iip = ip->i_itemp; + + if (iip) { + struct xfs_ail *ailp = iip->ili_item.li_ailp; + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &iip->ili_item, + stale ? + SHUTDOWN_LOG_IO_ERROR : + SHUTDOWN_CORRUPT_INCORE); + } else + spin_unlock(&ailp->xa_lock); + } + iip->ili_logged = 0; + /* + * Clear the ili_last_fields bits now that we know that the + * data corresponding to them is safely on disk. + */ + iip->ili_last_fields = 0; + /* + * Clear the inode logging fields so no more flushes are + * attempted. + */ + iip->ili_fields = 0; + } + /* + * Release the inode's flush lock since we're done with it. + */ + xfs_ifunlock(ip); +} + +void +xfs_istale_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); +} + +/* + * convert an xfs_inode_log_format struct from either 32 or 64 bit versions + * (which can have different field alignments) to the native version + */ +int +xfs_inode_item_format_convert( + xfs_log_iovec_t *buf, + xfs_inode_log_format_t *in_f) +{ + if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { + xfs_inode_log_format_32_t *in_f32 = buf->i_addr; + + in_f->ilf_type = in_f32->ilf_type; + in_f->ilf_size = in_f32->ilf_size; + in_f->ilf_fields = in_f32->ilf_fields; + in_f->ilf_asize = in_f32->ilf_asize; + in_f->ilf_dsize = in_f32->ilf_dsize; + in_f->ilf_ino = in_f32->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f32->ilf_u.ilfu_uuid.__u_bits, + sizeof(uuid_t)); + in_f->ilf_blkno = in_f32->ilf_blkno; + in_f->ilf_len = in_f32->ilf_len; + in_f->ilf_boffset = in_f32->ilf_boffset; + return 0; + } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ + xfs_inode_log_format_64_t *in_f64 = buf->i_addr; + + in_f->ilf_type = in_f64->ilf_type; + in_f->ilf_size = in_f64->ilf_size; + in_f->ilf_fields = in_f64->ilf_fields; + in_f->ilf_asize = in_f64->ilf_asize; + in_f->ilf_dsize = in_f64->ilf_dsize; + in_f->ilf_ino = in_f64->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f64->ilf_u.ilfu_uuid.__u_bits, + sizeof(uuid_t)); + in_f->ilf_blkno = in_f64->ilf_blkno; + in_f->ilf_len = in_f64->ilf_len; + in_f->ilf_boffset = in_f64->ilf_boffset; + return 0; + } + return -EFSCORRUPTED; +} diff --git a/kernel/fs/xfs/xfs_inode_item.h b/kernel/fs/xfs/xfs_inode_item.h new file mode 100644 index 000000000..488d81254 --- /dev/null +++ b/kernel/fs/xfs/xfs_inode_item.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_ITEM_H__ +#define __XFS_INODE_ITEM_H__ + +/* kernel only definitions */ + +struct xfs_buf; +struct xfs_bmbt_rec; +struct xfs_inode; +struct xfs_mount; + +typedef struct xfs_inode_log_item { + xfs_log_item_t ili_item; /* common portion */ + struct xfs_inode *ili_inode; /* inode ptr */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ + unsigned short ili_lock_flags; /* lock flags */ + unsigned short ili_logged; /* flushed logged data */ + unsigned int ili_last_fields; /* fields when flushed */ + unsigned int ili_fields; /* fields to be logged */ +} xfs_inode_log_item_t; + +static inline int xfs_inode_clean(xfs_inode_t *ip) +{ + return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); +} + +extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); +extern void xfs_inode_item_destroy(struct xfs_inode *); +extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_iflush_abort(struct xfs_inode *, bool); +extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, + xfs_inode_log_format_t *); + +extern struct kmem_zone *xfs_ili_zone; + +#endif /* __XFS_INODE_ITEM_H__ */ diff --git a/kernel/fs/xfs/xfs_ioctl.c b/kernel/fs/xfs/xfs_ioctl.c new file mode 100644 index 000000000..87f67c6b6 --- /dev/null +++ b/kernel/fs/xfs/xfs_ioctl.c @@ -0,0 +1,1806 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ioctl.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_fsops.h" +#include "xfs_discard.h" +#include "xfs_quota.h" +#include "xfs_export.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_trans.h" +#include "xfs_pnfs.h" + +#include +#include +#include +#include +#include +#include +#include + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. + * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq) +{ + int hsize; + xfs_handle_t handle; + struct inode *inode; + struct fd f = {NULL}; + struct path path; + int error; + struct xfs_inode *ip; + + if (cmd == XFS_IOC_FD_TO_HANDLE) { + f = fdget(hreq->fd); + if (!f.file) + return -EBADF; + inode = file_inode(f.file); + } else { + error = user_lpath((const char __user *)hreq->path, &path); + if (error) + return error; + inode = d_inode(path.dentry); + } + ip = XFS_I(inode); + + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. + */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; + + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; + + + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); + + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { + /* + * This handle only contains an fsid, zero the rest. + */ + memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); + hsize = sizeof(xfs_fsid_t); + } else { + handle.ha_fid.fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.fid_len); + handle.ha_fid.fid_pad = 0; + handle.ha_fid.fid_gen = ip->i_d.di_gen; + handle.ha_fid.fid_ino = ip->i_ino; + + hsize = XFS_HSIZE(handle); + } + + error = -EFAULT; + if (copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; + + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fdput(f); + else + path_put(&path); + return error; +} + +/* + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. + */ +STATIC int +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* + * Convert userspace handle data into a dentry. + */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) +{ + xfs_handle_t handle; + struct xfs_fid64 fid; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(file_inode(parfilp)->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (hlen != sizeof(xfs_handle_t)) + return ERR_PTR(-EINVAL); + if (copy_from_user(&handle, uhandle, hlen)) + return ERR_PTR(-EFAULT); + if (handle.ha_fid.fid_len != + sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len)) + return ERR_PTR(-EINVAL); + + memset(&fid, 0, sizeof(struct fid)); + fid.ino = handle.ha_fid.fid_ino; + fid.gen = handle.ha_fid.fid_gen; + + return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3, + FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, + xfs_handle_acceptable, NULL); +} + +STATIC struct dentry * +xfs_handlereq_to_dentry( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); +} + +int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + const struct cred *cred = current_cred(); + int error; + int fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + fmode_t fmode; + struct path path; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + inode = d_inode(dentry); + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + error = -EPERM; + goto out_dput; + } + +#if BITS_PER_LONG != 32 + hreq->oflags |= O_LARGEFILE; +#endif + + permflag = hreq->oflags; + fmode = OPEN_FMODE(permflag); + if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && + (fmode & FMODE_WRITE) && IS_APPEND(inode)) { + error = -EPERM; + goto out_dput; + } + + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + error = -EACCES; + goto out_dput; + } + + /* Can't write directories. */ + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { + error = -EISDIR; + goto out_dput; + } + + fd = get_unused_fd_flags(0); + if (fd < 0) { + error = fd; + goto out_dput; + } + + path.mnt = parfilp->f_path.mnt; + path.dentry = dentry; + filp = dentry_open(&path, hreq->oflags, cred); + dput(dentry); + if (IS_ERR(filp)) { + put_unused_fd(fd); + return PTR_ERR(filp); + } + + if (S_ISREG(inode->i_mode)) { + filp->f_flags |= O_NOATIME; + filp->f_mode |= FMODE_NOCMTIME; + } + + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; +} + +int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + struct dentry *dentry; + __u32 olen; + void *link; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Restrict this handle operation to symlinks only. */ + if (!d_is_symlink(dentry)) { + error = -EINVAL; + goto out_dput; + } + + if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { + error = -EFAULT; + goto out_dput; + } + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) { + error = -ENOMEM; + goto out_dput; + } + + error = xfs_readlink(XFS_I(d_inode(dentry)), link); + if (error) + goto out_kfree; + error = readlink_copy(hreq->ohandle, olen, link); + if (error) + goto out_kfree; + + out_kfree: + kfree(link); + out_dput: + dput(dentry); + return error; +} + +int +xfs_set_dmattrs( + xfs_inode_t *ip, + u_int evmask, + u_int16_t state) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + ip->i_d.di_dmevmask = evmask; + ip->i_d.di_dmstate = state; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp, 0); + + return error; +} + +STATIC int +xfs_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -EPERM; + if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) + return -EFAULT; + + error = mnt_want_write_file(parfilp); + if (error) + return error; + + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) { + mnt_drop_write_file(parfilp); + return PTR_ERR(dentry); + } + + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { + error = -EPERM; + goto out; + } + + if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { + error = -EFAULT; + goto out; + } + + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + + out: + mnt_drop_write_file(parfilp); + dput(dentry); + return error; +} + +STATIC int +xfs_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error = -ENOMEM; + attrlist_cursor_kern_t *cursor; + xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) + return -EFAULT; + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) + return -EINVAL; + + /* + * Reject flags, only allow namespaces. + */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -EINVAL; + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) + error = -EFAULT; + +out_kfree: + kmem_free(kbuf); +out_dput: + dput(dentry); + return error; +} + +int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags) +{ + unsigned char *kbuf; + int error = -EFAULT; + + if (*len > XATTR_SIZE_MAX) + return -EINVAL; + kbuf = kmem_zalloc_large(*len, KM_SLEEP); + if (!kbuf) + return -ENOMEM; + + error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); + if (error) + goto out_kfree; + + if (copy_to_user(ubuf, kbuf, *len)) + error = -EFAULT; + +out_kfree: + kmem_free(kbuf); + return error; +} + +int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags) +{ + unsigned char *kbuf; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (len > XATTR_SIZE_MAX) + return -EINVAL; + + kbuf = memdup_user(ubuf, len); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); +} + +int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags) +{ + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + return xfs_attr_remove(XFS_I(inode), name, flags); +} + +STATIC int +xfs_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -EFAULT; + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -E2BIG; + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + error = -ENOMEM; + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_attrname, MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + d_inode(dentry), attr_name, + ops[i].am_attrvalue, &ops[i].am_length, + ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + d_inode(dentry), attr_name, + ops[i].am_attrvalue, ops[i].am_length, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + d_inode(dentry), attr_name, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + default: + ops[i].am_error = -EINVAL; + } + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = -EFAULT; + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return error; +} + +int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf) +{ + struct iattr iattr; + enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; + int error; + + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) + return -EPERM; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (filp->f_flags & O_DSYNC) + flags |= XFS_PREALLOC_SYNC; + if (ioflags & XFS_IO_INVIS) + flags |= XFS_PREALLOC_INVISIBLE; + + error = mnt_want_write_file(filp); + if (error) + return error; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock, false); + if (error) + goto out_unlock; + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += filp->f_pos; + break; + case 2: /*SEEK_END*/ + bf->l_start += XFS_ISIZE(ip); + break; + default: + error = -EINVAL; + goto out_unlock; + } + + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. + */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) { + error = -EINVAL; + goto out_unlock; + } + break; + default: + bf->l_len = 0; + break; + } + + if (bf->l_start < 0 || + bf->l_start > inode->i_sb->s_maxbytes || + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) { + error = -EINVAL; + goto out_unlock; + } + + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + flags |= XFS_PREALLOC_SET; + error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); + break; + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + flags |= XFS_PREALLOC_SET; + error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, + XFS_BMAPI_PREALLOC); + break; + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + error = xfs_free_file_space(ip, bf->l_start, bf->l_len); + break; + case XFS_IOC_ALLOCSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP: + case XFS_IOC_FREESP64: + flags |= XFS_PREALLOC_CLEAR; + if (bf->l_start > XFS_ISIZE(ip)) { + error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), + bf->l_start - XFS_ISIZE(ip), 0); + if (error) + goto out_unlock; + } + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = bf->l_start; + + error = xfs_setattr_size(ip, &iattr); + break; + default: + ASSERT(0); + error = -EINVAL; + } + + if (error) + goto out_unlock; + + error = xfs_update_prealloc_flags(ip, flags); + +out_unlock: + xfs_iunlock(ip, iolock); + mnt_drop_write_file(filp); + return error; +} + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) +{ + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) + return -EFAULT; + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -EFAULT; + + if ((count = bulkreq.icount) <= 0) + return -EINVAL; + + if (bulkreq.ubuffer == NULL) + return -EINVAL; + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt); + else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) + error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer, + sizeof(xfs_bstat_t), NULL, &done); + else /* XFS_IOC_FSBULKSTAT */ + error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + &done); + + if (error) + return error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -EFAULT; + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -EFAULT; + } + + return 0; +} + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return error; + + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. + */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 4); + if (error) + return error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -EFAULT; + return 0; +} + +/* + * Linux extended inode flags interface. + */ + +STATIC unsigned int +xfs_merge_ioc_xflags( + unsigned int flags, + unsigned int start) +{ + unsigned int xflags = start; + + if (flags & FS_IMMUTABLE_FL) + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; + if (flags & FS_APPEND_FL) + xflags |= XFS_XFLAG_APPEND; + else + xflags &= ~XFS_XFLAG_APPEND; + if (flags & FS_SYNC_FL) + xflags |= XFS_XFLAG_SYNC; + else + xflags &= ~XFS_XFLAG_SYNC; + if (flags & FS_NOATIME_FL) + xflags |= XFS_XFLAG_NOATIME; + else + xflags &= ~XFS_XFLAG_NOATIME; + if (flags & FS_NODUMP_FL) + xflags |= XFS_XFLAG_NODUMP; + else + xflags &= ~XFS_XFLAG_NODUMP; + + return xflags; +} + +STATIC unsigned int +xfs_di2lxflags( + __uint16_t di_flags) +{ + unsigned int flags = 0; + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= FS_APPEND_FL; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= FS_SYNC_FL; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= FS_NOATIME_FL; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= FS_NODUMP_FL; + return flags; +} + +STATIC int +xfs_ioc_fsgetxattr( + xfs_inode_t *ip, + int attr, + void __user *arg) +{ + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + fa.fsx_xflags = xfs_ip2xflags(ip); + fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_projid = xfs_get_projid(ip); + + if (attr) { + if (ip->i_afp) { + if (ip->i_afp->if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_afp->if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_anextents; + } else + fa.fsx_nextents = 0; + } else { + if (ip->i_df.if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_df.if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_nextents; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +STATIC void +xfs_set_diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + unsigned int di_flags; + + /* can't set PREALLOC this way, just preserve it */ + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & XFS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & XFS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & XFS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if (S_ISDIR(ip->i_d.di_mode)) { + if (xflags & XFS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & XFS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(ip->i_d.di_mode)) { + if (xflags & XFS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & XFS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } + + ip->i_d.di_flags = di_flags; +} + +STATIC void +xfs_diflags_to_linux( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + + if (xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +static int +xfs_ioctl_setattr_xflags( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + /* Can't change realtime flag if any extents are allocated. */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) + return -EINVAL; + + /* If realtime flag is set then must have realtime device */ + if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { + if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) + return -EINVAL; + } + + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || + (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + xfs_set_diflags(ip, fa->fsx_xflags); + xfs_diflags_to_linux(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + XFS_STATS_INC(xs_ig_attrchg); + return 0; +} + +/* + * Set up the transaction structure for the setattr operation, checking that we + * have permission to do so. On success, return a clean transaction and the + * inode locked exclusively ready for further operation specific checks. On + * failure, return an error without modifying or locking the inode. + */ +static struct xfs_trans * +xfs_ioctl_setattr_get_trans( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return ERR_PTR(-EROFS); + if (XFS_FORCED_SHUTDOWN(mp)) + return ERR_PTR(-EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) + goto out_cancel; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal to the file owner + * ID, except in cases where the CAP_FSETID capability is applicable. + */ + if (!inode_owner_or_capable(VFS_I(ip))) { + error = -EPERM; + goto out_cancel; + } + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + return tp; + +out_cancel: + xfs_trans_cancel(tp, 0); + return ERR_PTR(error); +} + +/* + * extent size hint validation is somewhat cumbersome. Rules are: + * + * 1. extent size hint is only valid for directories and regular files + * 2. XFS_XFLAG_EXTSIZE is only valid for regular files + * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 4. can only be changed on regular files if no extents are allocated + * 5. can be changed on directories at any time + * 6. extsize hint of 0 turns off hints, clears inode flags. + * 7. Extent size must be a multiple of the appropriate block size. + * 8. for non-realtime files, the extent size hint must be limited + * to half the AG size to avoid alignment extending the extent beyond the + * limits of the AG. + */ +static int +xfs_ioctl_setattr_check_extsize( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && + !S_ISDIR(ip->i_d.di_mode)) + return -EINVAL; + + if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) + return -EINVAL; + + if (fa->fsx_extsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) + return -EINVAL; + + if (XFS_IS_REALTIME_INODE(ip) || + (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) + return -EINVAL; + } + + if (fa->fsx_extsize % size) + return -EINVAL; + } else + fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); + + return 0; +} + +static int +xfs_ioctl_setattr_check_projid( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + /* Disallow 32bit project ids if projid32bit feature is not enabled. */ + if (fa->fsx_projid > (__uint16_t)-1 && + !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) + return -EINVAL; + + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (xfs_get_projid(ip) != fa->fsx_projid) + return -EINVAL; + if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != + (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) + return -EINVAL; + + return 0; +} + +STATIC int +xfs_ioctl_setattr( + xfs_inode_t *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_dquot *olddquot = NULL; + int code; + + trace_xfs_ioctl_setattr(ip); + + code = xfs_ioctl_setattr_check_projid(ip, fa); + if (code) + return code; + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp)) { + code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, + ip->i_d.di_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + if (code) + return code; + } + + tp = xfs_ioctl_setattr_get_trans(ip); + if (IS_ERR(tp)) { + code = PTR_ERR(tp); + goto error_free_dquots; + } + + + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && + xfs_get_projid(ip) != fa->fsx_projid) { + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, + capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_trans_cancel; + } + + code = xfs_ioctl_setattr_check_extsize(ip, fa); + if (code) + goto error_trans_cancel; + + code = xfs_ioctl_setattr_xflags(tp, ip, fa); + if (code) + goto error_trans_cancel; + + /* + * Change file ownership. Must be the owner or privileged. CAP_FSETID + * overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be cleared upon + * successful return from chown() + */ + + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* Change the ownerships and register project quota modifications */ + if (xfs_get_projid(ip) != fa->fsx_projid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + olddquot = xfs_qm_vop_chown(tp, ip, + &ip->i_pdquot, pdqp); + } + ASSERT(ip->i_d.di_version > 1); + xfs_set_projid(ip, fa->fsx_projid); + } + + /* + * Only set the extent size hint if we've already determined that the + * extent size hint should be set on the inode. If no extent size flags + * are set on the inode then unconditionally clear the extent size hint. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) + ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + else + ip->i_d.di_extsize = 0; + + code = xfs_trans_commit(tp, 0); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(pdqp); + + return code; + +error_trans_cancel: + xfs_trans_cancel(tp, 0); +error_free_dquots: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(pdqp); + return code; +} + +STATIC int +xfs_ioc_fssetxattr( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + int error; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa); + mnt_drop_write_file(filp); + return error; +} + +STATIC int +xfs_ioc_getxflags( + xfs_inode_t *ip, + void __user *arg) +{ + unsigned int flags; + + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_setxflags( + struct xfs_inode *ip, + struct file *filp, + void __user *arg) +{ + struct xfs_trans *tp; + struct fsxattr fa; + unsigned int flags; + int error; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL)) + return -EOPNOTSUPP; + + fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); + + error = mnt_want_write_file(filp); + if (error) + return error; + + tp = xfs_ioctl_setattr_get_trans(ip); + if (IS_ERR(tp)) { + error = PTR_ERR(tp); + goto out_drop_write; + } + + error = xfs_ioctl_setattr_xflags(tp, ip, &fa); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_drop_write; + } + + error = xfs_trans_commit(tp, 0); +out_drop_write: + mnt_drop_write_file(filp); + return error; +} + +STATIC int +xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmap __user *base = (struct getbmap __user *)*ap; + + /* copy only getbmap portion (not getbmapx) */ + if (copy_to_user(base, bmv, sizeof(struct getbmap))) + return -EFAULT; + + *ap += sizeof(struct getbmap); + return 0; +} + +STATIC int +xfs_ioc_getbmap( + struct xfs_inode *ip, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + return -EFAULT; + + if (bmx.bmv_count < 2) + return -EINVAL; + + bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); + if (ioflags & XFS_IO_INVIS) + bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; + + error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + (__force struct getbmap *)arg+1); + if (error) + return error; + + /* copy back header - only size of getbmap */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmapx __user *base = (struct getbmapx __user *)*ap; + + if (copy_to_user(base, bmv, sizeof(struct getbmapx))) + return -EFAULT; + + *ap += sizeof(struct getbmapx); + return 0; +} + +STATIC int +xfs_ioc_getbmapx( + struct xfs_inode *ip, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(bmx))) + return -EFAULT; + + if (bmx.bmv_count < 2) + return -EINVAL; + + if (bmx.bmv_iflags & (~BMV_IF_VALID)) + return -EINVAL; + + error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, + (__force struct getbmapx *)arg+1); + if (error) + return error; + + /* copy back header */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) + return -EFAULT; + + return 0; +} + +int +xfs_ioc_swapext( + xfs_swapext_t *sxp) +{ + xfs_inode_t *ip, *tip; + struct fd f, tmp; + int error = 0; + + /* Pull information for the target fd */ + f = fdget((int)sxp->sx_fdtarget); + if (!f.file) { + error = -EINVAL; + goto out; + } + + if (!(f.file->f_mode & FMODE_WRITE) || + !(f.file->f_mode & FMODE_READ) || + (f.file->f_flags & O_APPEND)) { + error = -EBADF; + goto out_put_file; + } + + tmp = fdget((int)sxp->sx_fdtmp); + if (!tmp.file) { + error = -EINVAL; + goto out_put_file; + } + + if (!(tmp.file->f_mode & FMODE_WRITE) || + !(tmp.file->f_mode & FMODE_READ) || + (tmp.file->f_flags & O_APPEND)) { + error = -EBADF; + goto out_put_tmp_file; + } + + if (IS_SWAPFILE(file_inode(f.file)) || + IS_SWAPFILE(file_inode(tmp.file))) { + error = -EINVAL; + goto out_put_tmp_file; + } + + ip = XFS_I(file_inode(f.file)); + tip = XFS_I(file_inode(tmp.file)); + + if (ip->i_mount != tip->i_mount) { + error = -EINVAL; + goto out_put_tmp_file; + } + + if (ip->i_ino == tip->i_ino) { + error = -EINVAL; + goto out_put_tmp_file; + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = -EIO; + goto out_put_tmp_file; + } + + error = xfs_swap_extents(ip, tip, sxp); + + out_put_tmp_file: + fdput(tmp); + out_put_file: + fdput(f); + out: + return error; +} + +/* + * Note: some of the ioctl's return positive numbers as a + * byte count indicating success, such as readlink_by_handle. + * So we don't "sign flip" like most other routines. This means + * true errors need to be returned as a negative value. + */ +long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p) +{ + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= XFS_IO_INVIS; + + trace_xfs_file_ioctl(ip); + + switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_ZERO_RANGE: { + xfs_flock64_t bf; + + if (copy_from_user(&bf, arg, sizeof(bf))) + return -EFAULT; + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); + + if (copy_to_user(arg, &da, sizeof(da))) + return -EFAULT; + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *)arg); + + case XFS_IOC_FSGETXATTR: + return xfs_ioc_fsgetxattr(ip, 0, arg); + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, + dmi.fsd_dmstate); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(ip, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(hreq))) + return -EFAULT; + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -EFAULT; + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(filp, arg); + + case XFS_IOC_READLINK_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -EFAULT; + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(filp, arg); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(filp, arg); + + case XFS_IOC_SWAPEXT: { + struct xfs_swapext sxp; + + if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) + return -EFAULT; + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioc_swapext(&sxp); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, &out); + if (error) + return error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -EFAULT; + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + mnt_drop_write_file(filp); + if (error) + return error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -EFAULT; + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -EFAULT; + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_log(mp, &in); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -EFAULT; + + return xfs_fs_goingdown(mp, in); + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -EFAULT; + + return xfs_errortag_add(in.errtag, mp); + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return xfs_errortag_clearall(mp, 1); + + case XFS_IOC_FREE_EOFBLOCKS: { + struct xfs_fs_eofblocks eofb; + struct xfs_eofblocks keofb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -EFAULT; + + error = xfs_fs_eofblocks_from_user(&eofb, &keofb); + if (error) + return error; + + return xfs_icache_free_eofblocks(mp, &keofb); + } + + default: + return -ENOTTY; + } +} diff --git a/kernel/fs/xfs/xfs_ioctl.h b/kernel/fs/xfs/xfs_ioctl.h new file mode 100644 index 000000000..77c02c790 --- /dev/null +++ b/kernel/fs/xfs/xfs_ioctl.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL_H__ +#define __XFS_IOCTL_H__ + +extern int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf); + +int +xfs_ioc_swapext( + xfs_swapext_t *sxp); + +extern int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags); + +extern struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen); + +extern long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p); + +extern long +xfs_file_compat_ioctl( + struct file *file, + unsigned int cmd, + unsigned long arg); + +extern int +xfs_set_dmattrs( + struct xfs_inode *ip, + u_int evmask, + u_int16_t state); + +#endif diff --git a/kernel/fs/xfs/xfs_ioctl32.c b/kernel/fs/xfs/xfs_ioctl32.c new file mode 100644 index 000000000..b88bdc85d --- /dev/null +++ b/kernel/fs/xfs/xfs_ioctl32.c @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_fsops.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_attr.h" +#include "xfs_ioctl.h" +#include "xfs_ioctl32.h" +#include "xfs_trace.h" + +#define _NATIVE_IOC(cmd, type) \ + _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) + +#ifdef BROKEN_X86_ALIGNMENT +STATIC int +xfs_compat_flock64_copyin( + xfs_flock64_t *bf, + compat_xfs_flock64_t __user *arg32) +{ + if (get_user(bf->l_type, &arg32->l_type) || + get_user(bf->l_whence, &arg32->l_whence) || + get_user(bf->l_start, &arg32->l_start) || + get_user(bf->l_len, &arg32->l_len) || + get_user(bf->l_sysid, &arg32->l_sysid) || + get_user(bf->l_pid, &arg32->l_pid) || + copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_compat_ioc_fsgeometry_v1( + struct xfs_mount *mp, + compat_xfs_fsop_geom_v1_t __user *arg32) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return error; + /* The 32-bit variant simply has some padding at the end */ + if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_compat_growfs_data_copyin( + struct xfs_growfs_data *in, + compat_xfs_growfs_data_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->imaxpct, &arg32->imaxpct)) + return -EFAULT; + return 0; +} + +STATIC int +xfs_compat_growfs_rt_copyin( + struct xfs_growfs_rt *in, + compat_xfs_growfs_rt_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->extsize, &arg32->extsize)) + return -EFAULT; + return 0; +} + +STATIC int +xfs_inumbers_fmt_compat( + void __user *ubuffer, + const struct xfs_inogrp *buffer, + long count, + long *written) +{ + compat_xfs_inogrp_t __user *p32 = ubuffer; + long i; + + for (i = 0; i < count; i++) { + if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || + put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || + put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) + return -EFAULT; + } + *written = count * sizeof(*p32); + return 0; +} + +#else +#define xfs_inumbers_fmt_compat xfs_inumbers_fmt +#endif /* BROKEN_X86_ALIGNMENT */ + +STATIC int +xfs_ioctl32_bstime_copyin( + xfs_bstime_t *bstime, + compat_xfs_bstime_t __user *bstime32) +{ + compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */ + + if (get_user(sec32, &bstime32->tv_sec) || + get_user(bstime->tv_nsec, &bstime32->tv_nsec)) + return -EFAULT; + bstime->tv_sec = sec32; + return 0; +} + +/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */ +STATIC int +xfs_ioctl32_bstat_copyin( + xfs_bstat_t *bstat, + compat_xfs_bstat_t __user *bstat32) +{ + if (get_user(bstat->bs_ino, &bstat32->bs_ino) || + get_user(bstat->bs_mode, &bstat32->bs_mode) || + get_user(bstat->bs_nlink, &bstat32->bs_nlink) || + get_user(bstat->bs_uid, &bstat32->bs_uid) || + get_user(bstat->bs_gid, &bstat32->bs_gid) || + get_user(bstat->bs_rdev, &bstat32->bs_rdev) || + get_user(bstat->bs_blksize, &bstat32->bs_blksize) || + get_user(bstat->bs_size, &bstat32->bs_size) || + xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) || + get_user(bstat->bs_blocks, &bstat32->bs_size) || + get_user(bstat->bs_xflags, &bstat32->bs_size) || + get_user(bstat->bs_extsize, &bstat32->bs_extsize) || + get_user(bstat->bs_extents, &bstat32->bs_extents) || + get_user(bstat->bs_gen, &bstat32->bs_gen) || + get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || + get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || + get_user(bstat->bs_forkoff, &bstat32->bs_forkoff) || + get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || + get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || + get_user(bstat->bs_aextents, &bstat32->bs_aextents)) + return -EFAULT; + return 0; +} + +/* XFS_IOC_FSBULKSTAT and friends */ + +STATIC int +xfs_bstime_store_compat( + compat_xfs_bstime_t __user *p32, + const xfs_bstime_t *p) +{ + __s32 sec32; + + sec32 = p->tv_sec; + if (put_user(sec32, &p32->tv_sec) || + put_user(p->tv_nsec, &p32->tv_nsec)) + return -EFAULT; + return 0; +} + +/* Return 0 on success or positive error (to xfs_bulkstat()) */ +STATIC int +xfs_bulkstat_one_fmt_compat( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + compat_xfs_bstat_t __user *p32 = ubuffer; + + if (ubsize < sizeof(*p32)) + return -ENOMEM; + + if (put_user(buffer->bs_ino, &p32->bs_ino) || + put_user(buffer->bs_mode, &p32->bs_mode) || + put_user(buffer->bs_nlink, &p32->bs_nlink) || + put_user(buffer->bs_uid, &p32->bs_uid) || + put_user(buffer->bs_gid, &p32->bs_gid) || + put_user(buffer->bs_rdev, &p32->bs_rdev) || + put_user(buffer->bs_blksize, &p32->bs_blksize) || + put_user(buffer->bs_size, &p32->bs_size) || + xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || + xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || + xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || + put_user(buffer->bs_blocks, &p32->bs_blocks) || + put_user(buffer->bs_xflags, &p32->bs_xflags) || + put_user(buffer->bs_extsize, &p32->bs_extsize) || + put_user(buffer->bs_extents, &p32->bs_extents) || + put_user(buffer->bs_gen, &p32->bs_gen) || + put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || + put_user(buffer->bs_forkoff, &p32->bs_forkoff) || + put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || + put_user(buffer->bs_dmstate, &p32->bs_dmstate) || + put_user(buffer->bs_aextents, &p32->bs_aextents)) + return -EFAULT; + if (ubused) + *ubused = sizeof(*p32); + return 0; +} + +STATIC int +xfs_bulkstat_one_compat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt_compat, + ubused, stat); +} + +/* copied from xfs_ioctl.c */ +STATIC int +xfs_compat_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + compat_xfs_fsop_bulkreq_t __user *p32) +{ + u32 addr; + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (get_user(addr, &p32->lastip)) + return -EFAULT; + bulkreq.lastip = compat_ptr(addr); + if (get_user(bulkreq.icount, &p32->icount) || + get_user(addr, &p32->ubuffer)) + return -EFAULT; + bulkreq.ubuffer = compat_ptr(addr); + if (get_user(addr, &p32->ocount)) + return -EFAULT; + bulkreq.ocount = compat_ptr(addr); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -EFAULT; + + if ((count = bulkreq.icount) <= 0) + return -EINVAL; + + if (bulkreq.ubuffer == NULL) + return -EINVAL; + + if (cmd == XFS_IOC_FSINUMBERS_32) { + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt_compat); + } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { + int res; + + error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, + sizeof(compat_xfs_bstat_t), NULL, &res); + } else if (cmd == XFS_IOC_FSBULKSTAT_32) { + error = xfs_bulkstat(mp, &inlast, &count, + xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bulkreq.ubuffer, &done); + } else + error = -EINVAL; + if (error) + return error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -EFAULT; + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -EFAULT; + } + + return 0; +} + +STATIC int +xfs_compat_handlereq_copyin( + xfs_fsop_handlereq_t *hreq, + compat_xfs_fsop_handlereq_t __user *arg32) +{ + compat_xfs_fsop_handlereq_t hreq32; + + if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t))) + return -EFAULT; + + hreq->fd = hreq32.fd; + hreq->path = compat_ptr(hreq32.path); + hreq->oflags = hreq32.oflags; + hreq->ihandle = compat_ptr(hreq32.ihandle); + hreq->ihandlen = hreq32.ihandlen; + hreq->ohandle = compat_ptr(hreq32.ohandle); + hreq->ohandlen = compat_ptr(hreq32.ohandlen); + + return 0; +} + +STATIC struct dentry * +xfs_compat_handlereq_to_dentry( + struct file *parfilp, + compat_xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, + compat_ptr(hreq->ihandle), hreq->ihandlen); +} + +STATIC int +xfs_compat_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + attrlist_cursor_kern_t *cursor; + compat_xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, arg, + sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + return -EFAULT; + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) + return -EINVAL; + + /* + * Reject flags, only allow namespaces. + */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -EINVAL; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -ENOMEM; + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) + error = -EFAULT; + +out_kfree: + kmem_free(kbuf); +out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_compat_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + compat_xfs_attr_multiop_t *ops; + compat_xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&am_hreq, arg, + sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) + return -EFAULT; + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -E2BIG; + size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(compat_ptr(am_hreq.ops), size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + error = -ENOMEM; + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + compat_ptr(ops[i].am_attrname), + MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + d_inode(dentry), attr_name, + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + d_inode(dentry), attr_name, + compat_ptr(ops[i].am_attrvalue), + ops[i].am_length, ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + d_inode(dentry), attr_name, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + default: + ops[i].am_error = -EINVAL; + } + } + + if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) + error = -EFAULT; + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_compat_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + compat_xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -EPERM; + if (copy_from_user(&dmhreq, arg, + sizeof(compat_xfs_fsop_setdm_handlereq_t))) + return -EFAULT; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { + error = -EPERM; + goto out; + } + + if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { + error = -EFAULT; + goto out; + } + + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + +out: + dput(dentry); + return error; +} + +long +xfs_file_compat_ioctl( + struct file *filp, + unsigned cmd, + unsigned long p) +{ + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= XFS_IO_INVIS; + + trace_xfs_file_compat_ioctl(ip); + + switch (cmd) { + /* No size or alignment issues on any arch */ + case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + case XFS_IOC_FSSETDM: + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + case XFS_IOC_GETBMAPX: + case XFS_IOC_FSCOUNTS: + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + case XFS_IOC_FSGROWFSLOG: + case XFS_IOC_GOINGDOWN: + case XFS_IOC_ERROR_INJECTION: + case XFS_IOC_ERROR_CLEARALL: + return xfs_file_ioctl(filp, cmd, p); +#ifndef BROKEN_X86_ALIGNMENT + /* These are handled fine if no alignment issues */ + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_FSGEOMETRY_V1: + case XFS_IOC_FSGROWFSDATA: + case XFS_IOC_FSGROWFSRT: + case XFS_IOC_ZERO_RANGE: + return xfs_file_ioctl(filp, cmd, p); +#else + case XFS_IOC_ALLOCSP_32: + case XFS_IOC_FREESP_32: + case XFS_IOC_ALLOCSP64_32: + case XFS_IOC_FREESP64_32: + case XFS_IOC_RESVSP_32: + case XFS_IOC_UNRESVSP_32: + case XFS_IOC_RESVSP64_32: + case XFS_IOC_UNRESVSP64_32: + case XFS_IOC_ZERO_RANGE_32: { + struct xfs_flock64 bf; + + if (xfs_compat_flock64_copyin(&bf, arg)) + return -EFAULT; + cmd = _NATIVE_IOC(cmd, struct xfs_flock64); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_FSGEOMETRY_V1_32: + return xfs_compat_ioc_fsgeometry_v1(mp, arg); + case XFS_IOC_FSGROWFSDATA_32: { + struct xfs_growfs_data in; + + if (xfs_compat_growfs_data_copyin(&in, arg)) + return -EFAULT; + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); + return error; + } + case XFS_IOC_FSGROWFSRT_32: { + struct xfs_growfs_rt in; + + if (xfs_compat_growfs_rt_copyin(&in, arg)) + return -EFAULT; + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); + return error; + } +#endif + /* long changes size, but xfs only copiese out 32 bits */ + case XFS_IOC_GETXFLAGS_32: + case XFS_IOC_SETXFLAGS_32: + case XFS_IOC_GETVERSION_32: + cmd = _NATIVE_IOC(cmd, long); + return xfs_file_ioctl(filp, cmd, p); + case XFS_IOC_SWAPEXT_32: { + struct xfs_swapext sxp; + struct compat_xfs_swapext __user *sxu = arg; + + /* Bulk copy in up to the sx_stat field, then copy bstat */ + if (copy_from_user(&sxp, sxu, + offsetof(struct xfs_swapext, sx_stat)) || + xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) + return -EFAULT; + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioc_swapext(&sxp); + mnt_drop_write_file(filp); + return error; + } + case XFS_IOC_FSBULKSTAT_32: + case XFS_IOC_FSBULKSTAT_SINGLE_32: + case XFS_IOC_FSINUMBERS_32: + return xfs_compat_ioc_bulkstat(mp, cmd, arg); + case XFS_IOC_FD_TO_HANDLE_32: + case XFS_IOC_PATH_TO_HANDLE_32: + case XFS_IOC_PATH_TO_FSHANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -EFAULT; + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -EFAULT; + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_READLINK_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -EFAULT; + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE_32: + return xfs_compat_attrlist_by_handle(filp, arg); + case XFS_IOC_ATTRMULTI_BY_HANDLE_32: + return xfs_compat_attrmulti_by_handle(filp, arg); + case XFS_IOC_FSSETDM_BY_HANDLE_32: + return xfs_compat_fssetdm_by_handle(filp, arg); + default: + return -ENOIOCTLCMD; + } +} diff --git a/kernel/fs/xfs/xfs_ioctl32.h b/kernel/fs/xfs/xfs_ioctl32.h new file mode 100644 index 000000000..b1bb45444 --- /dev/null +++ b/kernel/fs/xfs/xfs_ioctl32.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL32_H__ +#define __XFS_IOCTL32_H__ + +#include + +/* + * on 32-bit arches, ioctl argument structures may have different sizes + * and/or alignment. We define compat structures which match the + * 32-bit sizes/alignments here, and their associated ioctl numbers. + * + * xfs_ioctl32.c contains routines to copy these structures in and out. + */ + +/* stock kernel-level ioctls we support */ +#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS +#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS +#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION + +/* + * On intel, even if sizes match, alignment and/or padding may differ. + */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#define BROKEN_X86_ALIGNMENT +#define __compat_packed __attribute__((packed)) +#else +#define __compat_packed +#endif + +typedef struct compat_xfs_bstime { + compat_time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} compat_xfs_bstime_t; + +typedef struct compat_xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + compat_xfs_bstime_t bs_atime; /* access time */ + compat_xfs_bstime_t bs_mtime; /* modify time */ + compat_xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* high part of project id */ + unsigned char bs_pad[10]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} __compat_packed compat_xfs_bstat_t; + +typedef struct compat_xfs_fsop_bulkreq { + compat_uptr_t lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + compat_uptr_t ubuffer; /* user buffer for inode desc. */ + compat_uptr_t ocount; /* output count pointer */ +} compat_xfs_fsop_bulkreq_t; + +#define XFS_IOC_FSBULKSTAT_32 \ + _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ + _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS_32 \ + _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) + +typedef struct compat_xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + compat_uptr_t path; /* user pathname */ + __u32 oflags; /* open flags */ + compat_uptr_t ihandle; /* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + compat_uptr_t ohandle; /* user buffer for handle */ + compat_uptr_t ohandlen; /* user buffer length */ +} compat_xfs_fsop_handlereq_t; + +#define XFS_IOC_PATH_TO_FSHANDLE_32 \ + _IOWR('X', 104, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE_32 \ + _IOWR('X', 105, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE_32 \ + _IOWR('X', 106, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE_32 \ + _IOWR('X', 107, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE_32 \ + _IOWR('X', 108, struct compat_xfs_fsop_handlereq) + +/* The bstat field in the swapext struct needs translation */ +typedef struct compat_xfs_swapext { + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} __compat_packed compat_xfs_swapext_t; + +#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) + +typedef struct compat_xfs_fsop_attrlist_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to use */ + __u32 buflen; /* length of buffer supplied */ + compat_uptr_t buffer; /* returned names */ +} __compat_packed compat_xfs_fsop_attrlist_handlereq_t; + +/* Note: actually this is read/write */ +#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \ + _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq) + +/* am_opcodes defined in xfs_fs.h */ +typedef struct compat_xfs_attr_multiop { + __u32 am_opcode; + __s32 am_error; + compat_uptr_t am_attrname; + compat_uptr_t am_attrvalue; + __u32 am_length; + __u32 am_flags; +} compat_xfs_attr_multiop_t; + +typedef struct compat_xfs_fsop_attrmulti_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + /* ptr to compat_xfs_attr_multiop */ + compat_uptr_t ops; /* attr_multi data */ +} compat_xfs_fsop_attrmulti_handlereq_t; + +#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \ + _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) + +typedef struct compat_xfs_fsop_setdm_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle information */ + /* ptr to struct fsdmidata */ + compat_uptr_t data; /* DMAPI data */ +} compat_xfs_fsop_setdm_handlereq_t; + +#define XFS_IOC_FSSETDM_BY_HANDLE_32 \ + _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq) + +#ifdef BROKEN_X86_ALIGNMENT +/* on ia32 l_start is on a 32-bit boundary */ +typedef struct compat_xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} compat_xfs_flock64_t; + +#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) +#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) +#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) +#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) +#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) + +typedef struct compat_xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; + +#define XFS_IOC_FSGEOMETRY_V1_32 \ + _IOR('X', 100, struct compat_xfs_fsop_geom_v1) + +typedef struct compat_xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} __attribute__((packed)) compat_xfs_inogrp_t; + +/* These growfs input structures have padding on the end, so must translate */ +typedef struct compat_xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} __attribute__((packed)) compat_xfs_growfs_data_t; + +typedef struct compat_xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} __attribute__((packed)) compat_xfs_growfs_rt_t; + +#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data) +#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt) + +#endif /* BROKEN_X86_ALIGNMENT */ + +#endif /* __XFS_IOCTL32_H__ */ diff --git a/kernel/fs/xfs/xfs_iomap.c b/kernel/fs/xfs/xfs_iomap.c new file mode 100644 index 000000000..38e633bad --- /dev/null +++ b/kernel/fs/xfs/xfs_iomap.c @@ -0,0 +1,920 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_iomap.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" + + +#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ + << mp->m_writeio_log) +#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP + +STATIC int +xfs_iomap_eof_align_last_fsb( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_extlen_t align = 0; + int eof, error; + + if (!XFS_IS_REALTIME_INODE(ip)) { + /* + * Round up the allocation request to a stripe unit + * (m_dalign) boundary if the file size is >= stripe unit + * size, and we are allocating past the allocation eof. + * + * If mounted with the "-o swalloc" option the alignment is + * increased from the strip unit size to the stripe width. + */ + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + align = mp->m_swidth; + else if (mp->m_dalign) + align = mp->m_dalign; + + if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align)) + align = 0; + } + + /* + * Always round up the allocation request to an extent boundary + * (when file on a real-time subvolume or has di_extsize hint). + */ + if (extsize) { + if (align) + align = roundup_64(align, extsize); + else + align = extsize; + } + + if (align) { + xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) + return error; + if (eof) + *last_fsb = new_last_fsb; + } + return 0; +} + +STATIC int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return -EFSCORRUPTED; +} + +int +xfs_iomap_write_direct( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap, + int nmaps) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t count_fsb, resaligned; + xfs_fsblock_t firstfsb; + xfs_extlen_t extsz, temp; + int nimaps; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if ((offset + count) > XFS_ISIZE(ip)) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + if (error) + return error; + } else { + if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) + last_fsb = MIN(last_fsb, (xfs_fileoff_t) + imap->br_blockcount + + imap->br_startoff); + } + count_fsb = last_fsb - offset_fsb; + ASSERT(count_fsb > 0); + + resaligned = count_fsb; + if (unlikely(extsz)) { + if ((temp = do_mod(offset_fsb, extsz))) + resaligned += temp; + if ((temp = do_mod(resaligned, extsz))) + resaligned += extsz - temp; + } + + if (unlikely(rt)) { + resrtextents = qblocks = resaligned; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); + /* + * Check for running out of space, note: need lock to return + */ + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + if (error) + goto out_trans_cancel; + + xfs_trans_ijoin(tp, ip, 0); + + /* + * From this point onwards we overwrite the imap pointer that the + * caller gave to us. + */ + xfs_bmap_init(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_PREALLOC, &firstfsb, 0, + imap, &nimaps, &free_list); + if (error) + goto out_bmap_cancel; + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_unlock; + + /* + * Copy any maps to caller's array and return any error. + */ + if (nimaps == 0) { + error = -ENOSPC; + goto out_unlock; + } + + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + error = xfs_alert_fsblock_zero(ip, imap); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_bmap_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); +out_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + goto out_unlock; +} + +/* + * If the caller is doing a write at the end of the file, then extend the + * allocation out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). + * + * If we find we already have delalloc preallocation beyond EOF, don't do more + * preallocation as it it not needed. + */ +STATIC int +xfs_iomap_eof_want_preallocate( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap, + int nimaps, + int *prealloc) +{ + xfs_fileoff_t start_fsb; + xfs_filblks_t count_fsb; + int n, error, imaps; + int found_delalloc = 0; + + *prealloc = 0; + if (offset + count <= XFS_ISIZE(ip)) + return 0; + + /* + * If the file is smaller than the minimum prealloc and we are using + * dynamic preallocation, don't do any preallocation at all as it is + * likely this is the only write to the file that is going to be done. + */ + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) + return 0; + + /* + * If there are any real blocks past eof, then don't + * do any speculative allocation. + */ + start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); + count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + while (count_fsb > 0) { + imaps = nimaps; + error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, + 0); + if (error) + return error; + for (n = 0; n < imaps; n++) { + if ((imap[n].br_startblock != HOLESTARTBLOCK) && + (imap[n].br_startblock != DELAYSTARTBLOCK)) + return 0; + start_fsb += imap[n].br_blockcount; + count_fsb -= imap[n].br_blockcount; + + if (imap[n].br_startblock == DELAYSTARTBLOCK) + found_delalloc = 1; + } + } + if (!found_delalloc) + *prealloc = 1; + return 0; +} + +/* + * Determine the initial size of the preallocation. We are beyond the current + * EOF here, but we need to take into account whether this is a sparse write or + * an extending write when determining the preallocation size. Hence we need to + * look up the extent that ends at the current write offset and use the result + * to determine the preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceeding data extent as the basis for the + * preallocation size. If the size of the extent is greater than half the + * maximum extent length, then use the current offset as the basis. This ensures + * that for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width alignment of + * real extents. + */ +STATIC xfs_fsblock_t +xfs_iomap_eof_prealloc_initial_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + int nimaps) +{ + xfs_fileoff_t start_fsb; + int imaps = 1; + int error; + + ASSERT(nimaps >= imaps); + + /* if we are using a specific prealloc size, return now */ + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + return 0; + + /* If the file is small, then use the minimum prealloc */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) + return 0; + + /* + * As we write multiple pages, the offset will always align to the + * start of a page and hence point to a hole at EOF. i.e. if the size is + * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) + * will return FSB 1. Hence if there are blocks in the file, we want to + * point to the block prior to the EOF block and not the hole that maps + * directly at @offset. + */ + start_fsb = XFS_B_TO_FSB(mp, offset); + if (start_fsb) + start_fsb--; + error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); + if (error) + return 0; + + ASSERT(imaps == 1); + if (imap[0].br_startblock == HOLESTARTBLOCK) + return 0; + if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) + return imap[0].br_blockcount << 1; + return XFS_B_TO_FSB(mp, offset); +} + +STATIC bool +xfs_quota_need_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t alloc_blocks) +{ + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + if (!dq || !xfs_this_quota_on(ip->i_mount, type)) + return false; + + /* no hi watermark, no throttle */ + if (!dq->q_prealloc_hi_wmark) + return false; + + /* under the lo watermark, no throttle */ + if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + return false; + + return true; +} + +STATIC void +xfs_quota_calc_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t *qblocks, + int *qshift, + int64_t *qfreesp) +{ + int64_t freesp; + int shift = 0; + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + /* no dq, or over hi wmark, squash the prealloc completely */ + if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + *qblocks = 0; + *qfreesp = 0; + return; + } + + freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + shift = 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + shift += 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + shift += 2; + } + + if (freesp < *qfreesp) + *qfreesp = freesp; + + /* only overwrite the throttle values if we are more aggressive */ + if ((freesp >> shift) < (*qblocks >> *qshift)) { + *qblocks = freesp; + *qshift = shift; + } +} + +/* + * If we don't have a user specified preallocation size, dynamically increase + * the preallocation size as the size of the file grows. Cap the maximum size + * at a single extent or less if the filesystem is near full. The closer the + * filesystem is to full, the smaller the maximum prealocation. + */ +STATIC xfs_fsblock_t +xfs_iomap_prealloc_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + int nimaps) +{ + xfs_fsblock_t alloc_blocks = 0; + int shift = 0; + int64_t freesp; + xfs_fsblock_t qblocks; + int qshift = 0; + + alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, + imap, nimaps); + if (!alloc_blocks) + goto check_writeio; + qblocks = alloc_blocks; + + /* + * MAXEXTLEN is not a power of two value but we round the prealloc down + * to the nearest power of two value after throttling. To prevent the + * round down from unconditionally reducing the maximum supported prealloc + * size, we round up first, apply appropriate throttling, round down and + * cap the value to MAXEXTLEN. + */ + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks); + + freesp = percpu_counter_read_positive(&mp->m_fdblocks); + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + + /* + * Check each quota to cap the prealloc size, provide a shift value to + * throttle with and adjust amount of available space. + */ + if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift, + &freesp); + if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift, + &freesp); + if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift, + &freesp); + + /* + * The final prealloc size is set to the minimum of free space available + * in each of the quotas and the overall filesystem. + * + * The shift throttle value is set to the maximum value as determined by + * the global low free space values and per-quota low free space values. + */ + alloc_blocks = MIN(alloc_blocks, qblocks); + shift = MAX(shift, qshift); + + if (shift) + alloc_blocks >>= shift; + /* + * rounddown_pow_of_two() returns an undefined result if we pass in + * alloc_blocks = 0. + */ + if (alloc_blocks) + alloc_blocks = rounddown_pow_of_two(alloc_blocks); + if (alloc_blocks > MAXEXTLEN) + alloc_blocks = MAXEXTLEN; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks && alloc_blocks >= freesp) + alloc_blocks >>= 4; + +check_writeio: + if (alloc_blocks < mp->m_writeio_blocks) + alloc_blocks = mp->m_writeio_blocks; + + trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, + mp->m_writeio_blocks); + + return alloc_blocks; +} + +int +xfs_iomap_write_delay( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *ret_imap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_off_t aligned_offset; + xfs_fileoff_t ioalign; + xfs_extlen_t extsz; + int nimaps; + xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; + int prealloc; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. + */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + return error; + + extsz = xfs_get_extsz_hint(ip); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, + imap, XFS_WRITE_IMAPS, &prealloc); + if (error) + return error; + +retry: + if (prealloc) { + xfs_fsblock_t alloc_blocks; + + alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, + XFS_WRITE_IMAPS); + + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); + ioalign = XFS_B_TO_FSBT(mp, aligned_offset); + last_fsb = ioalign + alloc_blocks; + } else { + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + } + + if (prealloc || extsz) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + if (error) + return error; + } + + /* + * Make sure preallocation does not create extents beyond the range we + * actually support in this filesystem. + */ + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + + ASSERT(last_fsb > offset_fsb); + + nimaps = XFS_WRITE_IMAPS; + error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, + imap, &nimaps, XFS_BMAPI_ENTIRE); + switch (error) { + case 0: + case -ENOSPC: + case -EDQUOT: + break; + default: + return error; + } + + /* + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry + * without EOF preallocation. + */ + if (nimaps == 0) { + trace_xfs_delalloc_enospc(ip, offset, count); + if (prealloc) { + prealloc = 0; + error = 0; + goto retry; + } + return error ? error : -ENOSPC; + } + + if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap[0]); + + /* + * Tag the inode as speculatively preallocated so we can reclaim this + * space on demand, if necessary. + */ + if (prealloc) + xfs_inode_set_eofblocks_tag(ip); + + *ret_imap = imap[0]; + return 0; +} + +/* + * Pass in a delayed allocate extent, convert it to real extents; + * return to the caller the extent we create which maps on top of + * the originating callers request. + * + * Called without a lock on the inode. + * + * We no longer bother to look at the incoming map - all we have to + * guarantee is that whatever we allocate fills the required range. + */ +int +xfs_iomap_write_allocate( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, last_block; + xfs_fileoff_t end_fsb, map_start_fsb; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + xfs_filblks_t count_fsb; + xfs_trans_t *tp; + int nimaps, committed; + int error = 0; + int nres; + + /* + * Make sure that the dquots are there. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = imap->br_blockcount; + map_start_fsb = imap->br_startoff; + + XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + + while (count_fsb != 0) { + /* + * Set up a transaction with which to allocate the + * backing store for the file. Do allocations in a + * loop until we get some space in the range we are + * interested in. The other space that might be allocated + * is in the delayed allocation extent on which we sit + * but before our buffer starts. + */ + + nimaps = 0; + while (nimaps == 0) { + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; + nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + nres, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + + /* + * it is possible that the extents have changed since + * we did the read call as we dropped the ilock for a + * while. We have to be careful about truncates or hole + * punchs here - we are not allowed to allocate + * non-delalloc blocks here. + * + * The only protection against truncation is the pages + * for the range we are being asked to convert are + * locked and hence a truncate will block on them + * first. + * + * As a result, if we go beyond the range we really + * need and hit an delalloc extent boundary followed by + * a hole while we have excess blocks in the map, we + * will fill the hole incorrectly and overrun the + * transaction reservation. + * + * Using a single map prevents this as we are forced to + * check each map we look for overlap with the desired + * range and abort as soon as we find it. Also, given + * that we only return a single map, having one beyond + * what we can return is probably a bit silly. + * + * We also need to check that we don't go beyond EOF; + * this is a truncate optimisation as a truncate sets + * the new file size before block on the pages we + * currently have locked under writeback. Because they + * are about to be tossed, we don't need to write them + * back.... + */ + nimaps = 1; + end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + error = xfs_bmap_last_offset(ip, &last_block, + XFS_DATA_FORK); + if (error) + goto trans_cancel; + + last_block = XFS_FILEOFF_MAX(last_block, end_fsb); + if ((map_start_fsb + count_fsb) > last_block) { + count_fsb = last_block - map_start_fsb; + if (count_fsb == 0) { + error = -EAGAIN; + goto trans_cancel; + } + } + + /* + * From this point onwards we overwrite the imap + * pointer that the caller gave to us. + */ + error = xfs_bmapi_write(tp, ip, map_start_fsb, + count_fsb, 0, + &first_block, 1, + imap, &nimaps, &free_list); + if (error) + goto trans_cancel; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + /* + * See if we were able to allocate an extent that + * covers at least part of the callers request + */ + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + + if ((offset_fsb >= imap->br_startoff) && + (offset_fsb < (imap->br_startoff + + imap->br_blockcount))) { + XFS_STATS_INC(xs_xstrat_quick); + return 0; + } + + /* + * So far we have not mapped the requested part of the + * file, just surrounding data, try again. + */ + count_fsb -= imap->br_blockcount; + map_start_fsb = imap->br_startoff + imap->br_blockcount; + } + +trans_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +int +xfs_iomap_write_unwritten( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_filblks_t count_fsb; + xfs_filblks_t numblks_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + xfs_trans_t *tp; + xfs_bmbt_irec_t imap; + xfs_bmap_free_t free_list; + xfs_fsize_t i_size; + uint resblks; + int committed; + int error; + + trace_xfs_unwritten_convert(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); + + /* + * Reserve enough blocks in this transaction for two complete extent + * btree splits. We may be converting the middle part of an unwritten + * extent and in this case we will insert two new extents in the btree + * each of which could cause a full split. + * + * This reservation amount will be used in the first call to + * xfs_bmbt_split() to select an AG with enough space to satisfy the + * rest of the operation. + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + + do { + /* + * set up a transaction to convert the range of extents + * from unwritten to real. Do allocations in a loop until + * we have covered the range passed in. + * + * Note that we open code the transaction allocation here + * to pass KM_NOFS--we can't risk to recursing back into + * the filesystem here as we might be asked to write out + * the same inode that we complete here and might deadlock + * on the iolock. + */ + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Modify the unwritten extent state of the buffer. + */ + xfs_bmap_init(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_CONVERT, &firstfsb, + 1, &imap, &nimaps, &free_list); + if (error) + goto error_on_bmapi_transaction; + + /* + * Log the updated inode size as we go. We have to be careful + * to only log it up to the actual write offset if it is + * halfway into a block. + */ + i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); + if (i_size > offset + count) + i_size = offset + count; + + i_size = xfs_new_eof(ip, i_size); + if (i_size) { + ip->i_d.di_size = i_size; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap); + + if ((numblks_fsb = imap.br_blockcount) == 0) { + /* + * The numblks_fsb value should always get + * smaller, otherwise the loop is stuck. + */ + ASSERT(imap.br_blockcount); + break; + } + offset_fsb += numblks_fsb; + count_fsb -= numblks_fsb; + } while (count_fsb > 0); + + return 0; + +error_on_bmapi_transaction: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} diff --git a/kernel/fs/xfs/xfs_iomap.h b/kernel/fs/xfs/xfs_iomap.h new file mode 100644 index 000000000..8688e663d --- /dev/null +++ b/kernel/fs/xfs/xfs_iomap.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2003-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOMAP_H__ +#define __XFS_IOMAP_H__ + +struct xfs_inode; +struct xfs_bmbt_irec; + +int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int); +int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, + struct xfs_bmbt_irec *); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); + +#endif /* __XFS_IOMAP_H__*/ diff --git a/kernel/fs/xfs/xfs_iops.c b/kernel/fs/xfs/xfs_iops.c new file mode 100644 index 000000000..f4cd7204e --- /dev/null +++ b/kernel/fs/xfs/xfs_iops.c @@ -0,0 +1,1305 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_acl.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_trans_space.h" +#include "xfs_pnfs.h" + +#include +#include +#include +#include +#include +#include +#include + +/* + * Directories have different lock order w.r.t. mmap_sem compared to regular + * files. This is due to readdir potentially triggering page faults on a user + * buffer inside filldir(), and this happens with the ilock on the directory + * held. For regular files, the lock order is the other way around - the + * mmap_sem is taken during the page fault, and then we lock the ilock to do + * block mapping. Hence we need a different class for the directory ilock so + * that lockdep can tell them apart. + */ +static struct lock_class_key xfs_nondir_ilock_class; +static struct lock_class_key xfs_dir_ilock_class; + +static int +xfs_initxattrs( + struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + error = xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); + if (error < 0) + break; + } + return error; +} + +/* + * Hook in SELinux. This is not quite correct yet, what we really need + * here (as we do for default ACLs) is a mechanism by which creation of + * these attrs can be journalled at inode creation time (along with the + * inode, of course, such that log replay can't cause these to be lost). + */ + +STATIC int +xfs_init_security( + struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); +} + +static void +xfs_dentry_to_name( + struct xfs_name *namep, + struct dentry *dentry, + int mode) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; + namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT]; +} + +STATIC void +xfs_cleanup_inode( + struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + struct xfs_name teardown; + + /* Oh, the horror. + * If we can't add the ACL or we fail in + * xfs_init_security we must back out. + * ENOSPC can hit here, among other things. + */ + xfs_dentry_to_name(&teardown, dentry, 0); + + xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); +} + +STATIC int +xfs_generic_create( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev, + bool tmpfile) /* unnamed file */ +{ + struct inode *inode; + struct xfs_inode *ip = NULL; + struct posix_acl *default_acl, *acl; + struct xfs_name name; + int error; + + /* + * Irix uses Missed'em'V split, but doesn't want to see + * the upper 5 bits of (14bit) major. + */ + if (S_ISCHR(mode) || S_ISBLK(mode)) { + if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + return -EINVAL; + rdev = sysv_encode_dev(rdev); + } else { + rdev = 0; + } + + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; + + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } + if (unlikely(error)) + goto out_free_acl; + + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + +#ifdef CONFIG_XFS_POSIX_ACL + if (default_acl) { + error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto out_cleanup_inode; + } + if (acl) { + error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_cleanup_inode; + } +#endif + + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + + xfs_finish_inode_setup(ip); + + out_free_acl: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return error; + + out_cleanup_inode: + xfs_finish_inode_setup(ip); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); + goto out_free_acl; +} + +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + +STATIC int +xfs_vn_create( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool flags) +{ + return xfs_vn_mknod(dir, dentry, mode, 0); +} + +STATIC int +xfs_vn_mkdir( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); +} + +STATIC struct dentry * +xfs_vn_lookup( + struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct xfs_inode *cip; + struct xfs_name name; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&name, dentry, 0); + error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); + if (unlikely(error)) { + if (unlikely(error != -ENOENT)) + return ERR_PTR(error); + d_add(dentry, NULL); + return NULL; + } + + return d_splice_alias(VFS_I(cip), dentry); +} + +STATIC struct dentry * +xfs_vn_ci_lookup( + struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct xfs_inode *ip; + struct xfs_name xname; + struct xfs_name ci_name; + struct qstr dname; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&xname, dentry, 0); + error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); + if (unlikely(error)) { + if (unlikely(error != -ENOENT)) + return ERR_PTR(error); + /* + * call d_add(dentry, NULL) here when d_drop_negative_children + * is called in xfs_vn_mknod (ie. allow negative dentries + * with CI filesystems). + */ + return NULL; + } + + /* if exact match, just splice and exit */ + if (!ci_name.name) + return d_splice_alias(VFS_I(ip), dentry); + + /* else case-insensitive match... */ + dname.name = ci_name.name; + dname.len = ci_name.len; + dentry = d_add_ci(dentry, VFS_I(ip), &dname); + kmem_free(ci_name.name); + return dentry; +} + +STATIC int +xfs_vn_link( + struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry, inode->i_mode); + + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); + if (unlikely(error)) + return error; + + ihold(inode); + d_instantiate(dentry, inode); + return 0; +} + +STATIC int +xfs_vn_unlink( + struct inode *dir, + struct dentry *dentry) +{ + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry, 0); + + error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); + if (error) + return error; + + /* + * With unlink, the VFS makes the dentry "negative": no inode, + * but still hashed. This is incompatible with case-insensitive + * mode, so invalidate (unhash) the dentry in CI-mode. + */ + if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + d_invalidate(dentry); + return 0; +} + +STATIC int +xfs_vn_symlink( + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode; + + mode = S_IFLNK | + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); + xfs_dentry_to_name(&name, dentry, mode); + + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); + if (unlikely(error)) + goto out; + + inode = VFS_I(cip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + + d_instantiate(dentry, inode); + xfs_finish_inode_setup(cip); + return 0; + + out_cleanup_inode: + xfs_finish_inode_setup(cip); + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); + out: + return error; +} + +STATIC int +xfs_vn_rename( + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry, + unsigned int flags) +{ + struct inode *new_inode = d_inode(ndentry); + int omode = 0; + struct xfs_name oname; + struct xfs_name nname; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + /* if we are exchanging files, we need to set i_mode of both files */ + if (flags & RENAME_EXCHANGE) + omode = d_inode(ndentry)->i_mode; + + xfs_dentry_to_name(&oname, odentry, omode); + xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode); + + return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), + XFS_I(ndir), &nname, + new_inode ? XFS_I(new_inode) : NULL, flags); +} + +/* + * careful here - this function can get called recursively, so + * we need to be very careful about how much stack we use. + * uio is kmalloced for this reason... + */ +STATIC void * +xfs_vn_follow_link( + struct dentry *dentry, + struct nameidata *nd) +{ + char *link; + int error = -ENOMEM; + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) + goto out_err; + + error = xfs_readlink(XFS_I(d_inode(dentry)), link); + if (unlikely(error)) + goto out_kfree; + + nd_set_link(nd, link); + return NULL; + + out_kfree: + kfree(link); + out_err: + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +STATIC int +xfs_vn_getattr( + struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + trace_xfs_getattr(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + stat->size = XFS_ISIZE(ip); + stat->dev = inode->i_sb->s_dev; + stat->mode = ip->i_d.di_mode; + stat->nlink = ip->i_d.di_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->ino = ip->i_ino; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + stat->blksize = BLKDEV_IOSIZE; + stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * If the file blocks are being allocated from a + * realtime volume, then return the inode's realtime + * extent size or the realtime volume's extent size. + */ + stat->blksize = + xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + } else + stat->blksize = xfs_preferred_iosize(mp); + stat->rdev = 0; + break; + } + + return 0; +} + +static void +xfs_setattr_mode( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + +void +xfs_setattr_time( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (iattr->ia_valid & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + } +} + +int +xfs_setattr_nonsize( + struct xfs_inode *ip, + struct iattr *iattr, + int flags) +{ + xfs_mount_t *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; + xfs_trans_t *tp; + int error; + kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; + struct xfs_dquot *udqp = NULL, *gdqp = NULL; + struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; + + trace_xfs_setattr(ip); + + /* If acls are being inherited, we already have this checked */ + if (!(flags & XFS_ATTR_NOACL)) { + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + } + + ASSERT((mask & ATTR_SIZE) == 0); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { + uint qflags = 0; + + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { + uid = iattr->ia_uid; + qflags |= XFS_QMOPT_UQUOTA; + } else { + uid = inode->i_uid; + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { + gid = iattr->ia_gid; + qflags |= XFS_QMOPT_GQUOTA; + } else { + gid = inode->i_gid; + } + + /* + * We take a reference when we initialize udqp and gdqp, + * so it is important that we never blindly double trip on + * the same variable. See xfs_create() for an example. + */ + ASSERT(udqp == NULL); + ASSERT(gdqp == NULL); + error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), + xfs_kgid_to_gid(gid), + xfs_get_projid(ip), + qflags, &udqp, &gdqp, NULL); + if (error) + return error; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) + goto out_dqrele; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * These IDs could have changed since we last looked at them. + * But, we're assured that if the ownership did change + * while we didn't have the inode locked, inode's dquot(s) + * would have changed also. + */ + iuid = inode->i_uid; + igid = inode->i_gid; + gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; + uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; + + /* + * Do a quota reservation only if uid/gid is actually + * going to change. + */ + if (XFS_IS_QUOTA_RUNNING(mp) && + ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || + (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { + ASSERT(tp); + error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, + NULL, capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (error) /* out of quota */ + goto out_trans_cancel; + } + } + + xfs_trans_ijoin(tp, ip, 0); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (!uid_eq(iuid, uid)) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(mask & ATTR_UID); + ASSERT(udqp); + olddquot1 = xfs_qm_vop_chown(tp, ip, + &ip->i_udquot, udqp); + } + ip->i_d.di_uid = xfs_kuid_to_uid(uid); + inode->i_uid = uid; + } + if (!gid_eq(igid, gid)) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); + ASSERT(mask & ATTR_GID); + ASSERT(gdqp); + olddquot2 = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_gid = xfs_kgid_to_gid(gid); + inode->i_gid = gid; + } + } + + if (mask & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot1); + xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (error) + return error; + + /* + * XXX(hch): Updating the ACL entries is not atomic vs the i_mode + * update. We could avoid this with linked transactions + * and passing down the transaction pointer all the way + * to attr_set. No previous user of the generic + * Posix ACL code seems to care about this issue either. + */ + if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { + error = posix_acl_chmod(inode, inode->i_mode); + if (error) + return error; + } + + return 0; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_dqrele: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + return error; +} + +/* + * Truncate file. Must have write permission and not be a directory. + */ +int +xfs_setattr_size( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t oldsize, newsize; + struct xfs_trans *tp; + int error; + uint lock_flags = 0; + uint commit_flags = 0; + bool did_zeroing = false; + + trace_xfs_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + ASSERT(S_ISREG(ip->i_d.di_mode)); + ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + + oldsize = inode->i_size; + newsize = iattr->ia_size; + + /* + * Short circuit the truncate case for zero length files. + */ + if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { + if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) + return 0; + + /* + * Use the regular setattr path to update the timestamps. + */ + iattr->ia_valid &= ~ATTR_SIZE; + return xfs_setattr_nonsize(ip, iattr, 0); + } + + /* + * Make sure that the dquots are attached to the inode. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * File data changes must be complete before we start the transaction to + * modify the inode. This needs to be done before joining the inode to + * the transaction because the inode cannot be unlocked once it is a + * part of the transaction. + * + * Start with zeroing any data block beyond EOF that we may expose on + * file extension. + */ + if (newsize > oldsize) { + error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); + if (error) + return error; + } + + /* + * We are going to log the inode size change in this transaction so + * any previous writes that are beyond the on disk EOF and the new + * EOF that have not been written out need to be written here. If we + * do not write the data out, we expose ourselves to the null files + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. + */ + if (newsize > ip->i_d.di_size && + (oldsize != ip->i_d.di_size || did_zeroing)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); + if (error) + return error; + } + + /* Now wait for all direct I/O to complete. */ + inode_dio_wait(inode); + + /* + * We've already locked out new page faults, so now we can safely remove + * pages from the page cache knowing they won't get refaulted until we + * drop the XFS_MMAP_EXCL lock after the extent manipulations are + * complete. The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block size + * filesystems are correctly handled, too. + * + * We have to do all the page cache truncate work outside the + * transaction context as the "lock" order is page lock->log space + * reservation as defined by extent allocation in the writeback path. + * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * having already truncated the in-memory version of the file (i.e. made + * user visible changes). There's not much we can do about this, except + * to hope that the caller sees ENOMEM and retries the truncate + * operation. + */ + error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (error) + return error; + truncate_setsize(inode, newsize); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) + goto out_trans_cancel; + + commit_flags = XFS_TRANS_RELEASE_LOG_RES; + lock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Only change the c/mtime if we are changing the size or we are + * explicitly asked to change it. This handles the semantic difference + * between truncate() and ftruncate() as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && + !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; + } + + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + ip->i_d.di_size = newsize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (newsize <= oldsize) { + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); + if (error) + goto out_trans_abort; + + /* + * Truncated "down", so we're removing references to old data + * here - if we delay flushing for a long time, we expose + * ourselves unduly to the notorious NULL files problem. So, + * we mark this inode and flush it when the file is closed, + * and do not wait the usual (long) time for writeout. + */ + xfs_iflags_set(ip, XFS_ITRUNCATED); + + /* A truncate down always removes post-EOF blocks. */ + xfs_inode_clear_eofblocks_tag(ip); + } + + if (iattr->ia_valid & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +out_unlock: + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return error; + +out_trans_abort: + commit_flags |= XFS_TRANS_ABORT; +out_trans_cancel: + xfs_trans_cancel(tp, commit_flags); + goto out_unlock; +} + +STATIC int +xfs_vn_setattr( + struct dentry *dentry, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + int error; + + if (iattr->ia_valid & ATTR_SIZE) { + uint iolock = XFS_IOLOCK_EXCL; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(d_inode(dentry), &iolock, true); + if (!error) { + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + + error = xfs_setattr_size(ip, iattr); + } + xfs_iunlock(ip, iolock); + } else { + error = xfs_setattr_nonsize(ip, iattr, 0); + } + + return error; +} + +STATIC int +xfs_vn_update_time( + struct inode *inode, + struct timespec *now, + int flags) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + trace_xfs_update_time(ip); + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (flags & S_CTIME) { + inode->i_ctime = *now; + ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_MTIME) { + inode->i_mtime = *now; + ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_ATIME) { + inode->i_atime = *now; + ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; + } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + return xfs_trans_commit(tp, 0); +} + +#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +/* + * Call fiemap helper to fill in user data. + * Returns positive errors to xfs_getbmap. + */ +STATIC int +xfs_fiemap_format( + void **arg, + struct getbmapx *bmv, + int *full) +{ + int error; + struct fiemap_extent_info *fieinfo = *arg; + u32 fiemap_flags = 0; + u64 logical, physical, length; + + /* Do nothing for a hole */ + if (bmv->bmv_block == -1LL) + return 0; + + logical = BBTOB(bmv->bmv_offset); + physical = BBTOB(bmv->bmv_block); + length = BBTOB(bmv->bmv_length); + + if (bmv->bmv_oflags & BMV_OF_PREALLOC) + fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; + else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { + fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + physical = 0; /* no block yet */ + } + if (bmv->bmv_oflags & BMV_OF_LAST) + fiemap_flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, fiemap_flags); + if (error > 0) { + error = 0; + *full = 1; /* user array now full */ + } + + return error; +} + +STATIC int +xfs_vn_fiemap( + struct inode *inode, + struct fiemap_extent_info *fieinfo, + u64 start, + u64 length) +{ + xfs_inode_t *ip = XFS_I(inode); + struct getbmapx bm; + int error; + + error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); + if (error) + return error; + + /* Set up bmap header for xfs internal routine */ + bm.bmv_offset = BTOBBT(start); + /* Special case for whole file */ + if (length == FIEMAP_MAX_OFFSET) + bm.bmv_length = -1LL; + else + bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; + + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) + bm.bmv_iflags |= BMV_IF_ATTRFORK; + if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) + bm.bmv_iflags |= BMV_IF_DELALLOC; + + error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); + if (error) + return error; + + return 0; +} + +STATIC int +xfs_vn_tmpfile( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_generic_create(dir, dentry, mode, 0, true); +} + +static const struct inode_operations xfs_inode_operations = { + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .fiemap = xfs_vn_fiemap, + .update_time = xfs_vn_update_time, +}; + +static const struct inode_operations xfs_dir_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename2 = xfs_vn_rename, + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, +}; + +static const struct inode_operations xfs_dir_ci_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_ci_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename2 = xfs_vn_rename, + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, +}; + +static const struct inode_operations xfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = xfs_vn_follow_link, + .put_link = kfree_put_link, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, +}; + +STATIC void +xfs_diflags_to_iflags( + struct inode *inode, + struct xfs_inode *ip) +{ + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Initialize the Linux inode and set up the operation vectors. + * + * When reading existing inodes from disk this is called directly from xfs_iget, + * when creating a new inode it is called from xfs_ialloc after setting up the + * inode. These callers have different criteria for clearing XFS_INEW, so leave + * it up to the caller to deal with unlocking the inode appropriately. + */ +void +xfs_setup_inode( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + gfp_t gfp_mask; + + inode->i_ino = ip->i_ino; + inode->i_state = I_NEW; + + inode_sb_list_add(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); + + inode->i_mode = ip->i_d.di_mode; + set_nlink(inode, ip->i_d.di_nlink); + inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); + inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = + MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + inode->i_rdev = 0; + break; + } + + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + xfs_diflags_to_iflags(inode, ip); + + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + ip->d_ops = ip->i_mount->m_dir_inode_ops; + break; + case S_IFLNK: + inode->i_op = &xfs_symlink_inode_operations; + if (!(ip->i_df.if_flags & XFS_IFINLINE)) + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } + + /* + * Ensure all page cache allocations are done from GFP_NOFS context to + * prevent direct reclaim recursion back into the filesystem and blowing + * stacks or deadlocking. + */ + gfp_mask = mapping_gfp_mask(inode->i_mapping); + mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); + + /* + * If there is no attribute fork no ACL can exist on this inode, + * and it can't have any file capabilities attached to it either. + */ + if (!XFS_IFORK_Q(ip)) { + inode_has_no_xattr(inode); + cache_no_acl(inode); + } +} diff --git a/kernel/fs/xfs/xfs_iops.h b/kernel/fs/xfs/xfs_iops.h new file mode 100644 index 000000000..a0f84abb0 --- /dev/null +++ b/kernel/fs/xfs/xfs_iops.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOPS_H__ +#define __XFS_IOPS_H__ + +struct xfs_inode; + +extern const struct file_operations xfs_file_operations; +extern const struct file_operations xfs_dir_file_operations; + +extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); + +/* + * Internal setattr interfaces. + */ +#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ + +extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); +extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, + int flags); +extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); + +#endif /* __XFS_IOPS_H__ */ diff --git a/kernel/fs/xfs/xfs_itable.c b/kernel/fs/xfs/xfs_itable.c new file mode 100644 index 000000000..80429891d --- /dev/null +++ b/kernel/fs/xfs/xfs_itable.c @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_icache.h" + +STATIC int +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + xfs_is_quota_inode(&mp->m_sb, ino))); +} + +/* + * Return stat information for one inode. + * Return 0 if ok, else errno. + */ +int +xfs_bulkstat_one_int( + struct xfs_mount *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + struct xfs_icdinode *dic; /* dinode core info pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bstat *buf; /* return buffer */ + int error = 0; /* error value */ + + *stat = BULKSTAT_RV_NOTHING; + + if (!buffer || xfs_internal_inum(mp, ino)) + return -EINVAL; + + buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + if (!buf) + return -ENOMEM; + + error = xfs_iget(mp, NULL, ino, + (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), + XFS_ILOCK_SHARED, &ip); + if (error) + goto out_free; + + ASSERT(ip != NULL); + ASSERT(ip->i_imap.im_blkno != 0); + + dic = &ip->i_d; + + /* xfs_iget returns the following without needing + * further change. + */ + buf->bs_nlink = dic->di_nlink; + buf->bs_projid_lo = dic->di_projid_lo; + buf->bs_projid_hi = dic->di_projid_hi; + buf->bs_ino = ino; + buf->bs_mode = dic->di_mode; + buf->bs_uid = dic->di_uid; + buf->bs_gid = dic->di_gid; + buf->bs_size = dic->di_size; + buf->bs_atime.tv_sec = dic->di_atime.t_sec; + buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; + buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; + buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; + buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; + buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); + buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; + buf->bs_extents = dic->di_nextents; + buf->bs_gen = dic->di_gen; + memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); + buf->bs_dmevmask = dic->di_dmevmask; + buf->bs_dmstate = dic->di_dmstate; + buf->bs_aextents = dic->di_anextents; + buf->bs_forkoff = XFS_IFORK_BOFF(ip); + + switch (dic->di_format) { + case XFS_DINODE_FMT_DEV: + buf->bs_rdev = ip->i_df.if_u2.if_rdev; + buf->bs_blksize = BLKDEV_IOSIZE; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; + break; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + IRELE(ip); + + error = formatter(buffer, ubsize, ubused, buf); + if (!error) + *stat = BULKSTAT_RV_DIDONE; + + out_free: + kmem_free(buf); + return error; +} + +/* Return 0 on success or positive error */ +STATIC int +xfs_bulkstat_one_fmt( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + if (ubsize < sizeof(*buffer)) + return -ENOMEM; + if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) + return -EFAULT; + if (ubused) + *ubused = sizeof(*buffer); + return 0; +} + +int +xfs_bulkstat_one( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt, ubused, stat); +} + +/* + * Loop over all clusters in a chunk for a given incore inode allocation btree + * record. Do a readahead if there are any allocated inodes in that cluster. + */ +STATIC void +xfs_bulkstat_ichunk_ra( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irec) +{ + xfs_agblock_t agbno; + struct blk_plug plug; + int blks_per_cluster; + int inodes_per_cluster; + int i; /* inode chunk index */ + + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + + blk_start_plug(&plug); + for (i = 0; i < XFS_INODES_PER_CHUNK; + i += inodes_per_cluster, agbno += blks_per_cluster) { + if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) { + xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster, + &xfs_inode_buf_ops); + } + } + blk_finish_plug(&plug); +} + +/* + * Lookup the inode chunk that the given inode lives in and then get the record + * if we found the chunk. If the inode was not the last in the chunk and there + * are some left allocated, update the data for the pointed-to record as well as + * return the count of grabbed inodes. + */ +STATIC int +xfs_bulkstat_grab_ichunk( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t agino, /* starting inode of chunk */ + int *icount,/* return # of inodes grabbed */ + struct xfs_inobt_rec_incore *irec) /* btree record */ +{ + int idx; /* index into inode chunk */ + int stat; + int error = 0; + + /* Lookup the inode chunk that this inode lives in */ + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) { + *icount = 0; + return error; + } + + /* Get the record, should always work */ + error = xfs_inobt_get_rec(cur, irec, &stat); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1); + + /* Check if the record contains the inode in request */ + if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { + *icount = 0; + return 0; + } + + idx = agino - irec->ir_startino + 1; + if (idx < XFS_INODES_PER_CHUNK && + (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) { + int i; + + /* We got a right chunk with some left inodes allocated at it. + * Grab the chunk record. Mark all the uninteresting inodes + * free -- because they're before our start point. + */ + for (i = 0; i < idx; i++) { + if (XFS_INOBT_MASK(i) & ~irec->ir_free) + irec->ir_freecount++; + } + + irec->ir_free |= xfs_inobt_maskn(0, idx); + *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; + } + + return 0; +} + +#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) + +struct xfs_bulkstat_agichunk { + char __user **ac_ubuffer;/* pointer into user's buffer */ + int ac_ubleft; /* bytes left in user's buffer */ + int ac_ubelem; /* spaces used in user's buffer */ +}; + +/* + * Process inodes in chunk with a pointer to a formatter function + * that will iget the inode and fill in the appropriate structure. + */ +static int +xfs_bulkstat_ag_ichunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irbp, + bulkstat_one_pf formatter, + size_t statstruct_size, + struct xfs_bulkstat_agichunk *acp, + xfs_agino_t *last_agino) +{ + char __user **ubufp = acp->ac_ubuffer; + int chunkidx; + int error = 0; + xfs_agino_t agino = irbp->ir_startino; + + for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; + chunkidx++, agino++) { + int fmterror; + int ubused; + + /* inode won't fit in buffer, we are done */ + if (acp->ac_ubleft < statstruct_size) + break; + + /* Skip if this inode is free */ + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) + continue; + + /* Get the inode and fill in a single buffer */ + ubused = statstruct_size; + error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino), + *ubufp, acp->ac_ubleft, &ubused, &fmterror); + + if (fmterror == BULKSTAT_RV_GIVEUP || + (error && error != -ENOENT && error != -EINVAL)) { + acp->ac_ubleft = 0; + ASSERT(error); + break; + } + + /* be careful not to leak error if at end of chunk */ + if (fmterror == BULKSTAT_RV_NOTHING || error) { + error = 0; + continue; + } + + *ubufp += ubused; + acp->ac_ubleft -= ubused; + acp->ac_ubelem++; + } + + /* + * Post-update *last_agino. At this point, agino will always point one + * inode past the last inode we processed successfully. Hence we + * substract that inode when setting the *last_agino cursor so that we + * return the correct cookie to userspace. On the next bulkstat call, + * the inode under the lastino cookie will be skipped as we have already + * processed it here. + */ + *last_agino = agino - 1; + + return error; +} + +/* + * Return stat information in bulk (by-inode) for the filesystem. + */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastinop, /* last inode returned */ + int *ubcountp, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + size_t statstruct_size, /* sizeof struct filling */ + char __user *ubuffer, /* buffer with inode stats */ + int *done) /* 1 if there are more stats to get */ +{ + xfs_buf_t *agbp; /* agi header buffer */ + xfs_agino_t agino; /* inode # in allocation group */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ + size_t irbsize; /* size of irec buffer in bytes */ + xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ + int nirbuf; /* size of irbuf */ + int ubcount; /* size of user's buffer */ + struct xfs_bulkstat_agichunk ac; + int error = 0; + + /* + * Get the last inode value, see if there's nothing to do. + */ + agno = XFS_INO_TO_AGNO(mp, *lastinop); + agino = XFS_INO_TO_AGINO(mp, *lastinop); + if (agno >= mp->m_sb.sb_agcount || + *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) { + *done = 1; + *ubcountp = 0; + return 0; + } + + ubcount = *ubcountp; /* statstruct's */ + ac.ac_ubuffer = &ubuffer; + ac.ac_ubleft = ubcount * statstruct_size; /* bytes */; + ac.ac_ubelem = 0; + + *ubcountp = 0; + *done = 0; + + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + if (!irbuf) + return -ENOMEM; + + nirbuf = irbsize / sizeof(*irbuf); + + /* + * Loop over the allocation groups, starting from the last + * inode returned; 0 means start of the allocation group. + */ + while (agno < mp->m_sb.sb_agcount) { + struct xfs_inobt_rec_incore *irbp = irbuf; + struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf; + bool end_of_ag = false; + int icount = 0; + int stat; + + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + break; + /* + * Allocate and initialize a btree cursor for ialloc btree. + */ + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); + if (agino > 0) { + /* + * In the middle of an allocation group, we need to get + * the remainder of the chunk we're in. + */ + struct xfs_inobt_rec_incore r; + + error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); + if (error) + goto del_cursor; + if (icount) { + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; + irbp++; + } + /* Increment to the next record */ + error = xfs_btree_increment(cur, 0, &stat); + } else { + /* Start of ag. Lookup the first inode chunk */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat); + } + if (error || stat == 0) { + end_of_ag = true; + goto del_cursor; + } + + /* + * Loop through inode btree records in this ag, + * until we run out of inodes or space in the buffer. + */ + while (irbp < irbufend && icount < ubcount) { + struct xfs_inobt_rec_incore r; + + error = xfs_inobt_get_rec(cur, &r, &stat); + if (error || stat == 0) { + end_of_ag = true; + goto del_cursor; + } + + /* + * If this chunk has any allocated inodes, save it. + * Also start read-ahead now for this chunk. + */ + if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + xfs_bulkstat_ichunk_ra(mp, agno, &r); + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; + irbp++; + icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + } + error = xfs_btree_increment(cur, 0, &stat); + if (error || stat == 0) { + end_of_ag = true; + goto del_cursor; + } + cond_resched(); + } + + /* + * Drop the btree buffers and the agi buffer as we can't hold any + * of the locks these represent when calling iget. If there is a + * pending error, then we are done. + */ +del_cursor: + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + if (error) + break; + /* + * Now format all the good inodes into the user's buffer. The + * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer + * for the next loop iteration. + */ + irbufend = irbp; + for (irbp = irbuf; + irbp < irbufend && ac.ac_ubleft >= statstruct_size; + irbp++) { + error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, + formatter, statstruct_size, &ac, + &agino); + if (error) + break; + + cond_resched(); + } + + /* + * If we've run out of space or had a formatting error, we + * are now done + */ + if (ac.ac_ubleft < statstruct_size || error) + break; + + if (end_of_ag) { + agno++; + agino = 0; + } + } + /* + * Done, we're either out of filesystem or space to put the data. + */ + kmem_free(irbuf); + *ubcountp = ac.ac_ubelem; + + /* + * We found some inodes, so clear the error status and return them. + * The lastino pointer will point directly at the inode that triggered + * any error that occurred, so on the next call the error will be + * triggered again and propagated to userspace as there will be no + * formatted inodes in the buffer. + */ + if (ac.ac_ubelem) + error = 0; + + /* + * If we ran out of filesystem, lastino will point off the end of + * the filesystem so the next call will return immediately. + */ + *lastinop = XFS_AGINO_TO_INO(mp, agno, agino); + if (agno >= mp->m_sb.sb_agcount) + *done = 1; + + return error; +} + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const struct xfs_inogrp *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written) /* # of bytes written */ +{ + if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) + return -EFAULT; + *written = count * sizeof(*buffer); + return 0; +} + +/* + * Return inode number table for the filesystem. + */ +int /* error status */ +xfs_inumbers( + struct xfs_mount *mp,/* mount point for filesystem */ + xfs_ino_t *lastino,/* last inode returned */ + int *count,/* size of buffer/count returned */ + void __user *ubuffer,/* buffer with inode descriptions */ + inumbers_fmt_pf formatter) +{ + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino); + struct xfs_btree_cur *cur = NULL; + struct xfs_buf *agbp = NULL; + struct xfs_inogrp *buffer; + int bcount; + int left = *count; + int bufidx = 0; + int error = 0; + + *count = 0; + if (agno >= mp->m_sb.sb_agcount || + *lastino != XFS_AGINO_TO_INO(mp, agno, agino)) + return error; + + bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); + buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); + do { + struct xfs_inobt_rec_incore r; + int stat; + + if (!agbp) { + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + break; + + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, + &stat); + if (error) + break; + if (!stat) + goto next_ag; + } + + error = xfs_inobt_get_rec(cur, &r, &stat); + if (error) + break; + if (!stat) + goto next_ag; + + agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; + buffer[bufidx].xi_startino = + XFS_AGINO_TO_INO(mp, agno, r.ir_startino); + buffer[bufidx].xi_alloccount = + XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_allocmask = ~r.ir_free; + if (++bufidx == bcount) { + long written; + + error = formatter(ubuffer, buffer, bufidx, &written); + if (error) + break; + ubuffer += written; + *count += bufidx; + bufidx = 0; + } + if (!--left) + break; + + error = xfs_btree_increment(cur, 0, &stat); + if (error) + break; + if (stat) + continue; + +next_ag: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + agino = 0; + agno++; + } while (agno < mp->m_sb.sb_agcount); + + if (!error) { + if (bufidx) { + long written; + + error = formatter(ubuffer, buffer, bufidx, &written); + if (!error) + *count += bufidx; + } + *lastino = XFS_AGINO_TO_INO(mp, agno, agino); + } + + kmem_free(buffer); + if (cur) + xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR)); + if (agbp) + xfs_buf_relse(agbp); + + return error; +} diff --git a/kernel/fs/xfs/xfs_itable.h b/kernel/fs/xfs/xfs_itable.h new file mode 100644 index 000000000..6ea8b3912 --- /dev/null +++ b/kernel/fs/xfs/xfs_itable.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ITABLE_H__ +#define __XFS_ITABLE_H__ + +/* + * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat + * structures (by the dmi library). This is a pointer to a formatter function + * that will iget the inode and fill in the appropriate structure. + * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c + */ +typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + int *ubused, + int *stat); + +/* + * Values for stat return value. + */ +#define BULKSTAT_RV_NOTHING 0 +#define BULKSTAT_RV_DIDONE 1 +#define BULKSTAT_RV_GIVEUP 2 + +/* + * Return stat information in bulk (by-inode) for the filesystem. + */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastino, /* last inode returned */ + int *count, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + size_t statstruct_size,/* sizeof struct that we're filling */ + char __user *ubuffer,/* buffer with inode stats */ + int *done); /* 1 if there are more stats to get */ + +typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ + void __user *ubuffer, /* buffer to write to */ + int ubsize, /* remaining user buffer sz */ + int *ubused, /* bytes used by formatter */ + const xfs_bstat_t *buffer); /* buffer to read from */ + +int +xfs_bulkstat_one_int( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + bulkstat_one_fmt_pf formatter, + int *ubused, + int *stat); + +int +xfs_bulkstat_one( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + int *ubused, + int *stat); + +typedef int (*inumbers_fmt_pf)( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int /* error status */ +xfs_inumbers( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *last, /* last inode returned */ + int *count, /* size of buffer/count returned */ + void __user *buffer, /* buffer with inode info */ + inumbers_fmt_pf formatter); + +#endif /* __XFS_ITABLE_H__ */ diff --git a/kernel/fs/xfs/xfs_linux.h b/kernel/fs/xfs/xfs_linux.h new file mode 100644 index 000000000..7c7842c85 --- /dev/null +++ b/kernel/fs/xfs/xfs_linux.h @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LINUX__ +#define __XFS_LINUX__ + +#include + +/* + * Kernel specific type declarations for XFS + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; +typedef signed long long int __int64_t; +typedef unsigned long long int __uint64_t; + +typedef __uint32_t inst_t; /* an instruction */ + +typedef __s64 xfs_off_t; /* type */ +typedef unsigned long long xfs_ino_t; /* type */ +typedef __s64 xfs_daddr_t; /* type */ +typedef char * xfs_caddr_t; /* type */ +typedef __u32 xfs_dev_t; +typedef __u32 xfs_nlink_t; + +/* __psint_t is the same size as a pointer */ +#if (BITS_PER_LONG == 32) +typedef __int32_t __psint_t; +typedef __uint32_t __psunsigned_t; +#elif (BITS_PER_LONG == 64) +typedef __int64_t __psint_t; +typedef __uint64_t __psunsigned_t; +#else +#error BITS_PER_LONG must be 32 or 64 +#endif + +#include "xfs_types.h" + +#include "kmem.h" +#include "mrlock.h" +#include "uuid.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "xfs_fs.h" +#include "xfs_stats.h" +#include "xfs_sysctl.h" +#include "xfs_iops.h" +#include "xfs_aops.h" +#include "xfs_super.h" +#include "xfs_cksum.h" +#include "xfs_buf.h" +#include "xfs_message.h" + +#ifdef __BIG_ENDIAN +#define XFS_NATIVE_HOST 1 +#else +#undef XFS_NATIVE_HOST +#endif + +#define irix_sgid_inherit xfs_params.sgid_inherit.val +#define irix_symlink_mode xfs_params.symlink_mode.val +#define xfs_panic_mask xfs_params.panic_mask.val +#define xfs_error_level xfs_params.error_level.val +#define xfs_syncd_centisecs xfs_params.syncd_timer.val +#define xfs_stats_clear xfs_params.stats_clear.val +#define xfs_inherit_sync xfs_params.inherit_sync.val +#define xfs_inherit_nodump xfs_params.inherit_nodump.val +#define xfs_inherit_noatime xfs_params.inherit_noatim.val +#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val +#define xfs_rotorstep xfs_params.rotorstep.val +#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val +#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val +#define xfs_eofb_secs xfs_params.eofb_timer.val + +#define current_cpu() (raw_smp_processor_id()) +#define current_pid() (current->pid) +#define current_test_flags(f) (current->flags & (f)) +#define current_set_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags |= (f)) +#define current_clear_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags &= ~(f)) +#define current_restore_flags_nested(sp, f) \ + (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) + +#define spinlock_destroy(lock) + +#define NBBY 8 /* number of bits per byte */ + +/* + * Size of block device i/o is parameterized here. + * Currently the system supports page-sized i/o. + */ +#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT +#define BLKDEV_IOSIZE (1<> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + *(__u64 *)a = c; + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + unsigned long __upper, __low, __high, __mod; + __u64 c = *(__u64 *)a; + __upper = __high = c >> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} +#else +static inline __u32 xfs_do_div(void *a, __u32 b, int n) +{ + __u32 mod; + + switch (n) { + case 4: + mod = *(__u32 *)a % b; + *(__u32 *)a = *(__u32 *)a / b; + return mod; + case 8: + mod = do_div(*(__u64 *)a, b); + return mod; + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + __u64 c = *(__u64 *)a; + return do_div(c, b); + } + } + + /* NOTREACHED */ + return 0; +} +#endif + +#undef do_div +#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) +#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) + +static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x * y; +} + +static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x; +} + +/* ARM old ABI has some weird alignment/padding */ +#if defined(__arm__) && !defined(__ARM_EABI__) +#define __arch_pack __attribute__((packed)) +#else +#define __arch_pack +#endif + +#define ASSERT_ALWAYS(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifdef DEBUG +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC noinline +#endif + +#else /* !DEBUG */ + +#ifdef XFS_WARN + +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#else /* !DEBUG && !XFS_WARN */ + +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#endif /* XFS_WARN */ +#endif /* DEBUG */ + +#ifdef CONFIG_XFS_RT +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +#else +#define XFS_IS_REALTIME_INODE(ip) (0) +#endif + +#endif /* __XFS_LINUX__ */ diff --git a/kernel/fs/xfs/xfs_log.c b/kernel/fs/xfs/xfs_log.c new file mode 100644 index 000000000..bcc7cfabb --- /dev/null +++ b/kernel/fs/xfs/xfs_log.c @@ -0,0 +1,4007 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_inode.h" +#include "xfs_trace.h" +#include "xfs_fsops.h" +#include "xfs_cksum.h" +#include "xfs_sysfs.h" +#include "xfs_sb.h" + +kmem_zone_t *xfs_log_ticket_zone; + +/* Local miscellaneous function prototypes */ +STATIC int +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp); + +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks); +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head); +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_dealloc_log( + struct xlog *log); + +/* local state machine functions */ +STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *iclog); +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclog, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp); +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size); +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog); + +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes); +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket); +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket); + +#if defined(DEBUG) +STATIC void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr); +STATIC void +xlog_verify_grant_tail( + struct xlog *log); +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + bool syncing); +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn); +#else +#define xlog_verify_dest_ptr(a,b) +#define xlog_verify_grant_tail(a) +#define xlog_verify_iclog(a,b,c,d) +#define xlog_verify_tail_lsn(a,b,c) +#endif + +STATIC int +xlog_iclogs_empty( + struct xlog *log); + +static void +xlog_grant_sub_space( + struct xlog *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + space -= bytes; + if (space < 0) { + space += log->l_logsize; + cycle--; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); +} + +static void +xlog_grant_add_space( + struct xlog *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int tmp; + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + tmp = log->l_logsize - space; + if (tmp > bytes) + space += bytes; + else { + space = bytes - tmp; + cycle++; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); +} + +STATIC void +xlog_grant_head_init( + struct xlog_grant_head *head) +{ + xlog_assign_grant_head(&head->grant, 1, 0); + INIT_LIST_HEAD(&head->waiters); + spin_lock_init(&head->lock); +} + +STATIC void +xlog_grant_head_wake_all( + struct xlog_grant_head *head) +{ + struct xlog_ticket *tic; + + spin_lock(&head->lock); + list_for_each_entry(tic, &head->waiters, t_queue) + wake_up_process(tic->t_task); + spin_unlock(&head->lock); +} + +static inline int +xlog_ticket_reservation( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic) +{ + if (head == &log->l_write_head) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + return tic->t_unit_res; + } else { + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + else + return tic->t_unit_res; + } +} + +STATIC bool +xlog_grant_head_wake( + struct xlog *log, + struct xlog_grant_head *head, + int *free_bytes) +{ + struct xlog_ticket *tic; + int need_bytes; + + list_for_each_entry(tic, &head->waiters, t_queue) { + need_bytes = xlog_ticket_reservation(log, head, tic); + if (*free_bytes < need_bytes) + return false; + + *free_bytes -= need_bytes; + trace_xfs_log_grant_wake_up(log, tic); + wake_up_process(tic->t_task); + } + + return true; +} + +STATIC int +xlog_grant_head_wait( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic, + int need_bytes) __releases(&head->lock) + __acquires(&head->lock) +{ + list_add_tail(&tic->t_queue, &head->waiters); + + do { + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + xlog_grant_push_ail(log, need_bytes); + + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&head->lock); + + XFS_STATS_INC(xs_sleep_logspace); + + trace_xfs_log_grant_sleep(log, tic); + schedule(); + trace_xfs_log_grant_wake(log, tic); + + spin_lock(&head->lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + } while (xlog_space_left(log, &head->grant) < need_bytes); + + list_del_init(&tic->t_queue); + return 0; +shutdown: + list_del_init(&tic->t_queue); + return -EIO; +} + +/* + * Atomically get the log space required for a log ticket. + * + * Once a ticket gets put onto head->waiters, it will only return after the + * needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off head->waiters under head->lock, we + * only need to take that lock if we are going to add the ticket to the queue + * and sleep. We can avoid taking the lock if the ticket was never added to + * head->waiters because the t_queue list head will be empty and we hold the + * only reference to it so it can safely be checked unlocked. + */ +STATIC int +xlog_grant_head_check( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic, + int *need_bytes) +{ + int free_bytes; + int error = 0; + + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. + */ + *need_bytes = xlog_ticket_reservation(log, head, tic); + free_bytes = xlog_space_left(log, &head->grant); + if (!list_empty_careful(&head->waiters)) { + spin_lock(&head->lock); + if (!xlog_grant_head_wake(log, head, &free_bytes) || + free_bytes < *need_bytes) { + error = xlog_grant_head_wait(log, head, tic, + *need_bytes); + } + spin_unlock(&head->lock); + } else if (free_bytes < *need_bytes) { + spin_lock(&head->lock); + error = xlog_grant_head_wait(log, head, tic, *need_bytes); + spin_unlock(&head->lock); + } + + return error; +} + +static void +xlog_tic_reset_res(xlog_ticket_t *tic) +{ + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + tic->t_res_num_ophdrs = 0; +} + +static void +xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) +{ + if (tic->t_res_num == XLOG_TIC_LEN_MAX) { + /* add to overflow and start again */ + tic->t_res_o_flow += tic->t_res_arr_sum; + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + } + + tic->t_res_arr[tic->t_res_num].r_len = len; + tic->t_res_arr[tic->t_res_num].r_type = type; + tic->t_res_arr_sum += len; + tic->t_res_num++; +} + +/* + * Replenish the byte reservation required by moving the grant write head. + */ +int +xfs_log_regrant( + struct xfs_mount *mp, + struct xlog_ticket *tic) +{ + struct xlog *log = mp->m_log; + int need_bytes; + int error = 0; + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + XFS_STATS_INC(xs_try_logspace); + + /* + * This is a new transaction on the ticket, so we need to change the + * transaction ID so that the next transaction has a different TID in + * the log. Just add one to the existing tid so that we can see chains + * of rolling transactions in the log easily. + */ + tic->t_tid++; + + xlog_grant_push_ail(log, tic->t_unit_res); + + tic->t_curr_res = tic->t_unit_res; + xlog_tic_reset_res(tic); + + if (tic->t_cnt > 0) + return 0; + + trace_xfs_log_regrant(log, tic); + + error = xlog_grant_head_check(log, &log->l_write_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_regrant_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + +/* + * Reserve log space and return a ticket corresponding the reservation. + * + * Each reservation is going to reserve extra space for a log record header. + * When writes happen to the on-disk log, we don't subtract the length of the + * log record header from any reservation. By wasting space in each + * reservation, we prevent over allocation problems. + */ +int +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticp, + __uint8_t client, + bool permanent, + uint t_type) +{ + struct xlog *log = mp->m_log; + struct xlog_ticket *tic; + int need_bytes; + int error = 0; + + ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + XFS_STATS_INC(xs_try_logspace); + + ASSERT(*ticp == NULL); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, + KM_SLEEP | KM_MAYFAIL); + if (!tic) + return -ENOMEM; + + tic->t_trans_type = t_type; + *ticp = tic; + + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); + + trace_xfs_log_reserve(log, tic); + + error = xlog_grant_head_check(log, &log->l_reserve_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_reserve_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + + +/* + * NOTES: + * + * 1. currblock field gets updated at startup and after in-core logs + * marked as with WANT_SYNC. + */ + +/* + * This routine is called when a user of a log manager ticket is done with + * the reservation. If the ticket was ever used, then a commit record for + * the associated transaction is written out as a log operation header with + * no data. The flag XLOG_TIC_INITED is set when the first write occurs with + * a given ticket. If the ticket was one with a permanent reservation, then + * a few operations are done differently. Permanent reservation tickets by + * default don't release the reservation. They just commit the current + * transaction with the belief that the reservation is still needed. A flag + * must be passed in before permanent reservations are actually released. + * When these type of tickets are not released, they need to be set into + * the inited state again. By doing this, a start record will be written + * out when the next write occurs. + */ +xfs_lsn_t +xfs_log_done( + struct xfs_mount *mp, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + uint flags) +{ + struct xlog *log = mp->m_log; + xfs_lsn_t lsn = 0; + + if (XLOG_FORCED_SHUTDOWN(log) || + /* + * If nothing was ever written, don't write out commit record. + * If we get an error, just continue and give back the log ticket. + */ + (((ticket->t_flags & XLOG_TIC_INITED) == 0) && + (xlog_commit_record(log, ticket, iclog, &lsn)))) { + lsn = (xfs_lsn_t) -1; + if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { + flags |= XFS_LOG_REL_PERM_RESERV; + } + } + + + if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || + (flags & XFS_LOG_REL_PERM_RESERV)) { + trace_xfs_log_done_nonperm(log, ticket); + + /* + * Release ticket if not permanent reservation or a specific + * request has been made to release a permanent reservation. + */ + xlog_ungrant_log_space(log, ticket); + xfs_log_ticket_put(ticket); + } else { + trace_xfs_log_done_perm(log, ticket); + + xlog_regrant_reserve_log_space(log, ticket); + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ + ticket->t_flags |= XLOG_TIC_INITED; + } + + return lsn; +} + +/* + * Attaches a new iclog I/O completion callback routine during + * transaction commit. If the log is in error state, a non-zero + * return code is handed back and the caller is responsible for + * executing the callback at an appropriate time. + */ +int +xfs_log_notify( + struct xfs_mount *mp, + struct xlog_in_core *iclog, + xfs_log_callback_t *cb) +{ + int abortflg; + + spin_lock(&iclog->ic_callback_lock); + abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); + if (!abortflg) { + ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || + (iclog->ic_state == XLOG_STATE_WANT_SYNC)); + cb->cb_next = NULL; + *(iclog->ic_callback_tail) = cb; + iclog->ic_callback_tail = &(cb->cb_next); + } + spin_unlock(&iclog->ic_callback_lock); + return abortflg; +} + +int +xfs_log_release_iclog( + struct xfs_mount *mp, + struct xlog_in_core *iclog) +{ + if (xlog_state_release_iclog(mp->m_log, iclog)) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return -EIO; + } + + return 0; +} + +/* + * Mount a log filesystem + * + * mp - ubiquitous xfs mount point structure + * log_target - buftarg of on-disk log device + * blk_offset - Start block # where block size is 512 bytes (BBSIZE) + * num_bblocks - Number of BBSIZE blocks in on-disk log + * + * Return error or zero. + */ +int +xfs_log_mount( + xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + int error = 0; + int min_logfsbs; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { + xfs_notice(mp, +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + } + + mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(mp->m_log)) { + error = PTR_ERR(mp->m_log); + goto out; + } + + /* + * Validate the given log space and drop a critical message via syslog + * if the log size is too small that would lead to some unexpected + * situations in transaction log space reservation stage. + * + * Note: we can't just reject the mount if the validation fails. This + * would mean that people would have to downgrade their kernel just to + * remedy the situation as there is no way to grow the log (short of + * black magic surgery with xfs_db). + * + * We can, however, reject mounts for CRC format filesystems, as the + * mkfs binary being used to make the filesystem should never create a + * filesystem with a log that is too small. + */ + min_logfsbs = xfs_log_calc_minimum_size(mp); + + if (mp->m_sb.sb_logblocks < min_logfsbs) { + xfs_warn(mp, + "Log size %d blocks too small, minimum size is %d blocks", + mp->m_sb.sb_logblocks, min_logfsbs); + error = -EINVAL; + } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_warn(mp, + "Log size %d blocks too large, maximum size is %lld blocks", + mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); + error = -EINVAL; + } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size %lld bytes too large, maximum size is %lld bytes", + XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), + XFS_MAX_LOG_BYTES); + error = -EINVAL; + } + if (error) { + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); + ASSERT(0); + goto out_free_log; + } + xfs_crit(mp, +"Log size out of supported range. Continuing onwards, but if log hangs are\n" +"experienced then please report this message in the bug report."); + } + + /* + * Initialize the AIL now we have a log. + */ + error = xfs_trans_ail_init(mp); + if (error) { + xfs_warn(mp, "AIL initialisation failed: error %d", error); + goto out_free_log; + } + mp->m_log->l_ailp = mp->m_ail; + + /* + * skip log recovery on a norecovery mount. pretend it all + * just worked. + */ + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); + + if (readonly) + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + error = xlog_recover(mp->m_log); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (error) { + xfs_warn(mp, "log mount/recovery failed: error %d", + error); + goto out_destroy_ail; + } + } + + error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, + "log"); + if (error) + goto out_destroy_ail; + + /* Normal transactions can now occur */ + mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + + return 0; + +out_destroy_ail: + xfs_trans_ail_destroy(mp); +out_free_log: + xlog_dealloc_log(mp->m_log); +out: + return error; +} + +/* + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. + * + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. + */ +int +xfs_log_mount_finish(xfs_mount_t *mp) +{ + int error = 0; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + } else { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + } + + + return error; +} + +/* + * Final log writes as part of unmount. + * + * Mark the filesystem clean as unmount happens. Note that during relocation + * this routine needs to be executed as part of source-bag while the + * deallocation must not be done until source-end. + */ + +/* + * Unmount record used to have a string "Unmount filesystem--" in the + * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). + * We just write the magic number now since that particular field isn't + * currently architecture converted and "Unmount" is a bit foo. + * As far as I know, there weren't any dependencies on the old behaviour. + */ + +int +xfs_log_unmount_write(xfs_mount_t *mp) +{ + struct xlog *log = mp->m_log; + xlog_in_core_t *iclog; +#ifdef DEBUG + xlog_in_core_t *first_iclog; +#endif + xlog_ticket_t *tic = NULL; + xfs_lsn_t lsn; + int error; + + /* + * Don't write out unmount record on read-only mounts. + * Or, if we are doing a forced umount (typically because of IO errors). + */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); + +#ifdef DEBUG + first_iclog = iclog = log->l_iclog; + do { + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } + iclog = iclog->ic_next; + } while (iclog != first_iclog); +#endif + if (! (XLOG_FORCED_SHUTDOWN(log))) { + error = xfs_log_reserve(mp, 600, 1, &tic, + XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); + if (!error) { + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = &magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + + /* remove inited flag, and account for space used */ + tic->t_flags = 0; + tic->t_curr_res -= sizeof(magic); + error = xlog_write(log, &vec, tic, &lsn, + NULL, XLOG_UNMOUNT_TRANS); + /* + * At this point, we're umounting anyway, + * so there's no point in transitioning log state + * to IOERROR. Just continue... + */ + } + + if (error) + xfs_alert(mp, "%s: unmount record failed", __func__); + + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + atomic_inc(&iclog->ic_refcnt); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + + spin_lock(&log->l_icloglock); + if (!(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY)) { + if (!XLOG_FORCED_SHUTDOWN(log)) { + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + } else { + spin_unlock(&log->l_icloglock); + } + if (tic) { + trace_xfs_log_umount_write(log, tic); + xlog_ungrant_log_space(log, tic); + xfs_log_ticket_put(tic); + } + } else { + /* + * We're already in forced_shutdown mode, couldn't + * even attempt to write out the unmount transaction. + * + * Go through the motions of sync'ing and releasing + * the iclog, even though no I/O will actually happen, + * we need to wait for other log I/Os that may already + * be in progress. Do this as a separate section of + * code so we'll know if we ever get stuck here that + * we're in this odd situation of trying to unmount + * a file system that went into forced_shutdown as + * the result of an unmount.. + */ + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + atomic_inc(&iclog->ic_refcnt); + + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + + spin_lock(&log->l_icloglock); + + if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE + || iclog->ic_state == XLOG_STATE_DIRTY + || iclog->ic_state == XLOG_STATE_IOERROR) ) { + + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + } + + return error; +} /* xfs_log_unmount_write */ + +/* + * Empty the log for unmount/freeze. + * + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record. + */ +void +xfs_log_quiesce( + struct xfs_mount *mp) +{ + cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); +} + +/* + * Shut down and release the AIL and Log. + * + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log. + */ +void +xfs_log_unmount( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); + + xfs_trans_ail_destroy(mp); + + xfs_sysfs_del(&mp->m_log->l_kobj); + + xlog_dealloc_log(mp->m_log); +} + +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + const struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); +} + +/* + * Wake up processes waiting for log space after we have moved the log tail. + */ +void +xfs_log_space_wake( + struct xfs_mount *mp) +{ + struct xlog *log = mp->m_log; + int free_bytes; + + if (XLOG_FORCED_SHUTDOWN(log)) + return; + + if (!list_empty_careful(&log->l_write_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + spin_lock(&log->l_write_head.lock); + free_bytes = xlog_space_left(log, &log->l_write_head.grant); + xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); + spin_unlock(&log->l_write_head.lock); + } + + if (!list_empty_careful(&log->l_reserve_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + spin_lock(&log->l_reserve_head.lock); + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); + spin_unlock(&log->l_reserve_head.lock); + } +} + +/* + * Determine if we have a transaction that has gone to disk that needs to be + * covered. To begin the transition to the idle state firstly the log needs to + * be idle. That means the CIL, the AIL and the iclogs needs to be empty before + * we start attempting to cover the log. + * + * Only if we are then in a state where covering is needed, the caller is + * informed that dummy transactions are required to move the log into the idle + * state. + * + * If there are any items in the AIl or CIL, then we do not want to attempt to + * cover the log as we may be in a situation where there isn't log space + * available to run a dummy transaction and this can lead to deadlocks when the + * tail of the log is pinned by an item that is modified in the CIL. Hence + * there's no point in running a dummy transaction at this point because we + * can't start trying to idle the log until both the CIL and AIL are empty. + */ +int +xfs_log_need_covered(xfs_mount_t *mp) +{ + struct xlog *log = mp->m_log; + int needed = 0; + + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) + return 0; + + if (!xlog_cil_empty(log)) + return 0; + + spin_lock(&log->l_icloglock); + switch (log->l_covered_state) { + case XLOG_STATE_COVER_DONE: + case XLOG_STATE_COVER_DONE2: + case XLOG_STATE_COVER_IDLE: + break; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + if (xfs_ail_min_lsn(log->l_ailp)) + break; + if (!xlog_iclogs_empty(log)) + break; + + needed = 1; + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; + break; + default: + needed = 1; + break; + } + spin_unlock(&log->l_icloglock); + return needed; +} + +/* + * We may be holding the log iclog lock upon entering this routine. + */ +xfs_lsn_t +xlog_assign_tail_lsn_locked( + struct xfs_mount *mp) +{ + struct xlog *log = mp->m_log; + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn; + + assert_spin_locked(&mp->m_ail->xa_lock); + + /* + * To make sure we always have a valid LSN for the log tail we keep + * track of the last LSN which was committed in log->l_last_sync_lsn, + * and use that when the AIL was empty. + */ + lip = xfs_ail_min(mp->m_ail); + if (lip) + tail_lsn = lip->li_lsn; + else + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + trace_xfs_log_assign_tail_lsn(log, tail_lsn); + atomic64_set(&log->l_tail_lsn, tail_lsn); + return tail_lsn; +} + +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + + spin_lock(&mp->m_ail->xa_lock); + tail_lsn = xlog_assign_tail_lsn_locked(mp); + spin_unlock(&mp->m_ail->xa_lock); + + return tail_lsn; +} + +/* + * Return the space in the log between the tail and the head. The head + * is passed in the cycle/bytes formal parms. In the special case where + * the reserve head has wrapped passed the tail, this calculation is no + * longer valid. In this case, just return 0 which means there is no space + * in the log. This works for all places where this function is called + * with the reserve head. Of course, if the write head were to ever + * wrap the tail, we should blow up. Rather than catch this case here, + * we depend on other ASSERTions in other parts of the code. XXXmiken + * + * This code also handles the case where the reservation head is behind + * the tail. The details of this case are described below, but the end + * result is that we return the size of the log as the amount of space left. + */ +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + free_bytes = log->l_logsize - (head_bytes - tail_bytes); + else if (tail_cycle + 1 < head_cycle) + return 0; + else if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + free_bytes = tail_bytes - head_bytes; + } else { + /* + * The reservation head is behind the tail. + * In this case we just want to return the size of the + * log as the amount of space left. + */ + xfs_alert(log->l_mp, + "xlog_space_left: head behind tail\n" + " tail_cycle = %d, tail_bytes = %d\n" + " GH cycle = %d, GH bytes = %d", + tail_cycle, tail_bytes, head_cycle, head_bytes); + ASSERT(0); + free_bytes = log->l_logsize; + } + return free_bytes; +} + + +/* + * Log function which is called when an io completes. + * + * The log manager needs its own routine, in order to control what + * happens with the buffer after the write completes. + */ +void +xlog_iodone(xfs_buf_t *bp) +{ + struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog *l = iclog->ic_log; + int aborted = 0; + + /* + * Race to shutdown the filesystem if we see an error. + */ + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, + XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_stale(bp); + xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); + /* + * This flag will be propagated to the trans-committed + * callback routines to let them know that the log-commit + * didn't succeed. + */ + aborted = XFS_LI_ABORTED; + } else if (iclog->ic_state & XLOG_STATE_IOERROR) { + aborted = XFS_LI_ABORTED; + } + + /* log I/O is always issued ASYNC */ + ASSERT(XFS_BUF_ISASYNC(bp)); + xlog_state_done_syncing(iclog, aborted); + + /* + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. + */ + xfs_buf_unlock(bp); +} + +/* + * Return size of each in-core log record buffer. + * + * All machines get 8 x 32kB buffers by default, unless tuned otherwise. + * + * If the filesystem blocksize is too large, we may need to choose a + * larger size since the directory code currently logs entire blocks. + */ + +STATIC void +xlog_get_iclog_buffer_size( + struct xfs_mount *mp, + struct xlog *log) +{ + int size; + int xhdrs; + + if (mp->m_logbufs <= 0) + log->l_iclog_bufs = XLOG_MAX_ICLOGS; + else + log->l_iclog_bufs = mp->m_logbufs; + + /* + * Buffer size passed in from mount system call. + */ + if (mp->m_logbsize > 0) { + size = log->l_iclog_size = mp->m_logbsize; + log->l_iclog_size_log = 0; + while (size != 1) { + log->l_iclog_size_log++; + size >>= 1; + } + + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + /* # headers = size / 32k + * one header holds cycles from 32k of data + */ + + xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; + if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + log->l_iclog_hsize = xhdrs << BBSHIFT; + log->l_iclog_heads = xhdrs; + } else { + ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE); + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + } + goto done; + } + + /* All machines use 32kB buffers by default. */ + log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; + log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; + + /* the default log size is 16k or 32k which is one header sector */ + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + +done: + /* are we being asked to make the sizes selected above visible? */ + if (mp->m_logbufs == 0) + mp->m_logbufs = log->l_iclog_bufs; + if (mp->m_logbsize == 0) + mp->m_logbsize = log->l_iclog_size; +} /* xlog_get_iclog_buffer_size */ + + +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) { + /* + * Dump a transaction into the log that contains no real change. + * This is needed to stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty + * state back up into the VFS and then periodic inode flushing + * will prevent log covering from making progress. Hence we + * synchronously log the superblock instead to ensure the + * superblock is immediately unpinned and can be written back. + */ + xfs_sync_sb(mp, true); + } else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + +/* + * This routine initializes some of the log structure for a given mount point. + * Its primary purpose is to fill in enough, so recovery can occur. However, + * some other stuff may be filled in too. + */ +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + struct xlog *log; + xlog_rec_header_t *head; + xlog_in_core_t **iclogp; + xlog_in_core_t *iclog, *prev_iclog=NULL; + xfs_buf_t *bp; + int i; + int error = -ENOMEM; + uint log2_size = 0; + + log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); + if (!log) { + xfs_warn(mp, "Log allocation failed: No memory!"); + goto out; + } + + log->l_mp = mp; + log->l_targ = log_target; + log->l_logsize = BBTOB(num_bblks); + log->l_logBBstart = blk_offset; + log->l_logBBsize = num_bblks; + log->l_covered_state = XLOG_STATE_COVER_IDLE; + log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + + log->l_prev_block = -1; + /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); + log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ + + xlog_grant_head_init(&log->l_reserve_head); + xlog_grant_head_init(&log->l_write_head); + + error = -EFSCORRUPTED; + if (xfs_sb_version_hassector(&mp->m_sb)) { + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < BBSHIFT) { + xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", + log2_size, BBSHIFT); + goto out_free_log; + } + + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", + log2_size, mp->m_sectbb_log); + goto out_free_log; + } + + /* for larger sector sizes, must have v2 or external log */ + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + xfs_warn(mp, + "log sector size (0x%x) invalid for configuration.", + log2_size); + goto out_free_log; + } + } + log->l_sectBBsize = 1 << log2_size; + + xlog_get_iclog_buffer_size(mp, log); + + /* + * Use a NULL block for the extra log buffer used during splits so that + * it will trigger errors if we ever try to do IO on it without first + * having set it up properly. + */ + error = -ENOMEM; + bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, + BTOBB(log->l_iclog_size), 0); + if (!bp) + goto out_free_log; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + /* use high priority wq for log I/O completion */ + bp->b_ioend_wq = mp->m_log_workqueue; + bp->b_iodone = xlog_iodone; + log->l_xbuf = bp; + + spin_lock_init(&log->l_icloglock); + init_waitqueue_head(&log->l_flush_wait); + + iclogp = &log->l_iclog; + /* + * The amount of memory to allocate for the iclog structure is + * rather funky due to the way the structure is defined. It is + * done this way so that we can use different sizes for machines + * with different amounts of memory. See the definition of + * xlog_in_core_t in xfs_log_priv.h for details. + */ + ASSERT(log->l_iclog_size >= 4096); + for (i=0; i < log->l_iclog_bufs; i++) { + *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); + if (!*iclogp) + goto out_free_iclog; + + iclog = *iclogp; + iclog->ic_prev = prev_iclog; + prev_iclog = iclog; + + bp = xfs_buf_get_uncached(mp->m_logdev_targp, + BTOBB(log->l_iclog_size), 0); + if (!bp) + goto out_free_iclog; + + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + /* use high priority wq for log I/O completion */ + bp->b_ioend_wq = mp->m_log_workqueue; + bp->b_iodone = xlog_iodone; + iclog->ic_bp = bp; + iclog->ic_data = bp->b_addr; +#ifdef DEBUG + log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); +#endif + head = &iclog->ic_header; + memset(head, 0, sizeof(xlog_rec_header_t)); + head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + head->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + head->h_size = cpu_to_be32(log->l_iclog_size); + /* new fields */ + head->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + + iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_log = log; + atomic_set(&iclog->ic_refcnt, 0); + spin_lock_init(&iclog->ic_callback_lock); + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + + init_waitqueue_head(&iclog->ic_force_wait); + init_waitqueue_head(&iclog->ic_write_wait); + + iclogp = &iclog->ic_next; + } + *iclogp = log->l_iclog; /* complete ring */ + log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; + return log; + +out_free_iclog: + for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { + prev_iclog = iclog->ic_next; + if (iclog->ic_bp) + xfs_buf_free(iclog->ic_bp); + kmem_free(iclog); + } + spinlock_destroy(&log->l_icloglock); + xfs_buf_free(log->l_xbuf); +out_free_log: + kmem_free(log); +out: + return ERR_PTR(error); +} /* xlog_alloc_log */ + + +/* + * Write out the commit record of a transaction associated with the given + * ticket. Return the lsn of the commit record. + */ +STATIC int +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) +{ + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + + ASSERT_ALWAYS(iclog); + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return error; +} + +/* + * Push on the buffer cache code if we ever use more than 75% of the on-disk + * log space. This code pushes on the lsn which would supposedly free up + * the 25% which we want to leave free. We may need to adopt a policy which + * pushes on an lsn which is further along in the log once we reach the high + * water mark. In this manner, we would be creating a low water mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks >= free_threshold) + return; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; + if (threshold_block >= log->l_logBBsize) { + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; + } + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; + + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. + */ + if (!XLOG_FORCED_SHUTDOWN(log)) + xfs_ail_push(log->l_ailp, threshold_lsn); +} + +/* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__le32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + +/* + * The bdstrat callback function for log bufs. This gives us a central + * place to trap bufs in case we get hit by a log I/O error and need to + * shutdown. Actually, in practice, even when we didn't get a log error, + * we transition the iclogs to IOERROR state *after* flushing all existing + * iclogs to disk. This is because we don't want anymore new transactions to be + * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. + */ +STATIC int +xlog_bdstrat( + struct xfs_buf *bp) +{ + struct xlog_in_core *iclog = bp->b_fspriv; + + xfs_buf_lock(bp); + if (iclog->ic_state & XLOG_STATE_IOERROR) { + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + xfs_buf_ioend(bp); + /* + * It would seem logical to return EIO here, but we rely on + * the log state machine to propagate I/O errors instead of + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. + */ + return 0; + } + + xfs_buf_submit(bp); + return 0; +} + +/* + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * fashion. Previously, we should have moved the current iclog + * ptr in the log to point to the next available iclog. This allows further + * write to continue while this code syncs out an iclog ready to go. + * Before an in-core log can be written out, the data section must be scanned + * to save away the 1st word of each BBSIZE block into the header. We replace + * it with the current cycle count. Each BBSIZE block is tagged with the + * cycle count because there in an implicit assumption that drives will + * guarantee that entire 512 byte blocks get written at once. In other words, + * we can't have part of a 512 byte block written and part not written. By + * tagging each block, we will know which blocks are valid when recovering + * after an unclean shutdown. + * + * This routine is single threaded on the iclog. No other thread can be in + * this routine with the same iclog. Changing contents of iclog can there- + * fore be done without grabbing the state machine lock. Updating the global + * log will require grabbing the lock though. + * + * The entire log manager uses a logical block numbering scheme. Only + * log_sync (and then only bwrite()) know about the fact that the log may + * not start with block zero on a given device. The log block start offset + * is added immediately before calling bwrite(). + */ + +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog) +{ + xfs_buf_t *bp; + int i; + uint count; /* byte count of bwrite */ + uint count_init; /* initial count before roundup */ + int roundoff; /* roundoff to BB or stripe */ + int split = 0; /* split write into two regions */ + int error; + int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; + + XFS_STATS_INC(xs_log_writes); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + + /* Add for LR header */ + count_init = log->l_iclog_hsize + iclog->ic_offset; + + /* Round out the log write size */ + if (v2 && log->l_mp->m_sb.sb_logsunit > 1) { + /* we have a v2 stripe unit to use */ + count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); + } else { + count = BBTOB(BTOBB(count_init)); + } + roundoff = count - count_init; + ASSERT(roundoff >= 0); + ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && + roundoff < log->l_mp->m_sb.sb_logsunit) + || + (log->l_mp->m_sb.sb_logsunit <= 1 && + roundoff < BBTOB(1))); + + /* move grant heads by roundoff in sync */ + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + + /* put cycle number in every block */ + xlog_pack_data(log, iclog, roundoff); + + /* real byte length */ + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); + + bp = iclog->ic_bp; + XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); + + XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); + + /* Do we need to split this write into 2 parts? */ + if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); + count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } + } else { + iclog->ic_bwritecnt = 1; + } + + /* calculcate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + + bp->b_io_length = BTOBB(count); + bp->b_fspriv = iclog; + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_SYNCIO; + + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + bp->b_flags |= XBF_FUA; + + /* + * Flush the data device before flushing the log to make + * sure all meta data written back from the AIL actually made + * it to disk before stamping the new log tail LSN into the + * log buffer. For an external log we need to issue the + * flush explicitly, and unfortunately synchronously here; + * for an internal log we can simply use the block layer + * state machine for preflushes. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + else + bp->b_flags |= XBF_FLUSH; + } + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + xlog_verify_iclog(log, iclog, count, true); + + /* account for log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + /* + * Don't call xfs_bwrite here. We do log-syncs even when the filesystem + * is shutting down. + */ + XFS_BUF_WRITE(bp); + + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync"); + return error; + } + if (split) { + bp = iclog->ic_log->l_xbuf; + XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ + xfs_buf_associate_memory(bp, + (char *)&iclog->ic_header + count, split); + bp->b_fspriv = iclog; + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_SYNCIO; + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + bp->b_flags |= XBF_FUA; + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + /* account for internal log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + XFS_BUF_WRITE(bp); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); + return error; + } + } + return 0; +} /* xlog_sync */ + +/* + * Deallocate a log structure + */ +STATIC void +xlog_dealloc_log( + struct xlog *log) +{ + xlog_in_core_t *iclog, *next_iclog; + int i; + + xlog_cil_destroy(log); + + /* + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); + xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); + xfs_buf_free(log->l_xbuf); + + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_free(iclog->ic_bp); + next_iclog = iclog->ic_next; + kmem_free(iclog); + iclog = next_iclog; + } + spinlock_destroy(&log->l_icloglock); + + log->l_mp->m_log = NULL; + kmem_free(log); +} /* xlog_dealloc_log */ + +/* + * Update counters atomically now that memcpy is done. + */ +/* ARGSUSED */ +static inline void +xlog_state_finish_copy( + struct xlog *log, + struct xlog_in_core *iclog, + int record_cnt, + int copy_bytes) +{ + spin_lock(&log->l_icloglock); + + be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); + iclog->ic_offset += copy_bytes; + + spin_unlock(&log->l_icloglock); +} /* xlog_state_finish_copy */ + + + + +/* + * print out info relating to regions written which consume + * the reservation + */ +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) +{ + uint i; + uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); + + /* match with XLOG_REG_TYPE_* in xfs_log.h */ + static char *res_type_str[XLOG_REG_TYPE_MAX] = { + "bformat", + "bchunk", + "efi_format", + "efd_format", + "iformat", + "icore", + "iext", + "ibroot", + "ilocal", + "iattr_ext", + "iattr_broot", + "iattr_local", + "qformat", + "dquot", + "quotaoff", + "LR header", + "unmount", + "commit", + "trans header" + }; + static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { + "SETATTR_NOT_SIZE", + "SETATTR_SIZE", + "INACTIVE", + "CREATE", + "CREATE_TRUNC", + "TRUNCATE_FILE", + "REMOVE", + "LINK", + "RENAME", + "MKDIR", + "RMDIR", + "SYMLINK", + "SET_DMATTRS", + "GROWFS", + "STRAT_WRITE", + "DIOSTRAT", + "WRITE_SYNC", + "WRITEID", + "ADDAFORK", + "ATTRINVAL", + "ATRUNCATE", + "ATTR_SET", + "ATTR_RM", + "ATTR_FLAG", + "CLEAR_AGI_BUCKET", + "QM_SBCHANGE", + "DUMMY1", + "DUMMY2", + "QM_QUOTAOFF", + "QM_DQALLOC", + "QM_SETQLIM", + "QM_DQCLUSTER", + "QM_QINOCREATE", + "QM_QUOTAOFF_END", + "SB_UNIT", + "FSYNC_TS", + "GROWFSRT_ALLOC", + "GROWFSRT_ZERO", + "GROWFSRT_FREE", + "SWAPEXT" + }; + + xfs_warn(mp, + "xlog_write: reservation summary:\n" + " trans type = %s (%u)\n" + " unit res = %d bytes\n" + " current res = %d bytes\n" + " total reg = %u bytes (o/flow = %u bytes)\n" + " ophdrs = %u (ophdr space = %u bytes)\n" + " ophdr + reg = %u bytes\n" + " num regions = %u", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), + ticket->t_trans_type, + ticket->t_unit_res, + ticket->t_curr_res, + ticket->t_res_arr_sum, ticket->t_res_o_flow, + ticket->t_res_num_ophdrs, ophdr_spc, + ticket->t_res_arr_sum + + ticket->t_res_o_flow + ophdr_spc, + ticket->t_res_num); + + for (i = 0; i < ticket->t_res_num; i++) { + uint r_type = ticket->t_res_arr[i].r_type; + xfs_warn(mp, "region[%u]: %s - %u bytes", i, + ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? + "bad-rtype" : res_type_str[r_type-1]), + ticket->t_res_arr[i].r_len); + } + + xfs_alert_tag(mp, XFS_PTAG_LOGRES, + "xlog_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); +} + +/* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. + */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + /* we don't write ordered log vectors */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) + continue; + + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct xlog *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them on. Turn this into an EIO + * and shut down the filesystem. + */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_warn(log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * be written in an obvious, self documenting manner. + */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct xlog *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + * xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; +} + +/* + * Write some region out to in-core log + * + * This will be called when writing externally provided regions or when + * writing out a commit record for a given transaction. + * + * General algorithm: + * 1. Find total length of this write. This may include adding to the + * lengths passed in. + * 2. Check whether we violate the tickets reservation. + * 3. While writing to this iclog + * A. Reserve as much space in this iclog as can get + * B. If this is first write, save away start lsn + * C. While writing this region: + * 1. If first write of transaction, write start record + * 2. Write log operation header (header per region) + * 3. Find out if we can fit entire region into this iclog + * 4. Potentially, verify destination memcpy ptr + * 5. Memcpy (partial) region + * 6. If partial copy, release iclog; otherwise, continue + * copying more regions into current iclog + * 4. Mark want sync bit (in simulation mode) + * 5. Release iclog for potential flush to on-disk log. + * + * ERRORS: + * 1. Panic if reservation is overrun. This should never happen since + * reservation amounts are generated internal to the filesystem. + * NOTES: + * 1. Tickets are single threaded data structures. + * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the + * syncing routine. When a single log_write region needs to span + * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set + * on all log operation writes which don't contain the end of the + * region. The XLOG_END_TRANS bit is used for the in-core log + * operation which contains the end of the continued log_write region. + * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, + * we don't really know exactly how much space will be used. As a result, + * we don't update ic_offset until the end when we know exactly how many + * bytes have been written out. + */ +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags) +{ + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; + + *start_lsn = 0; + + len = xlog_write_calc_vec_length(ticket, log_vector); + + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); + + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + void *ptr; + int log_offset; + + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; + + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; + + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + struct xfs_log_iovec *reg; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + bool ordered = false; + + /* ordered log vectors have no regions to write */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(lv->lv_niovecs == 0); + ordered = true; + goto next_lv; + } + + reg = &vecp[index]; + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } + + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return -EIO; + + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + ©_off, ©_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ? copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; + + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more is this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; + + if (++index == lv->lv_niovecs) { +next_lv: + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0 && ordered == false) { + if (!lv) + return 0; + break; + } + } + } + + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + return 0; +} + + +/***************************************************************************** + * + * State Machine functions + * + ***************************************************************************** + */ + +/* Clean iclogs starting from the head. This ordering must be + * maintained, so an iclog doesn't become ACTIVE beyond one that + * is SYNCING. This is also required to maintain the notion that we use + * a ordered wait queue to hold off would be writers to the log when every + * iclog is trying to sync to disk. + * + * State Change: DIRTY -> ACTIVE + */ +STATIC void +xlog_state_clean_log( + struct xlog *log) +{ + xlog_in_core_t *iclog; + int changed = 0; + + iclog = log->l_iclog; + do { + if (iclog->ic_state == XLOG_STATE_DIRTY) { + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + ASSERT(iclog->ic_callback == NULL); + /* + * If the number of ops in this iclog indicate it just + * contains the dummy transaction, we can + * change state into IDLE (the second time around). + * Otherwise we should change the state into + * NEED a dummy. + * We don't need to cover the dummy. + */ + if (!changed && + (be32_to_cpu(iclog->ic_header.h_num_logops) == + XLOG_COVER_OPS)) { + changed = 1; + } else { + /* + * We have two dirty iclogs so start over + * This could also be num of ops indicates + * this is not the dummy going out. + */ + changed = 2; + } + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; + } else if (iclog->ic_state == XLOG_STATE_ACTIVE) + /* do nothing */; + else + break; /* stop cleaning */ + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + + /* log is locked when we are called */ + /* + * Change state for the dummy log recording. + * We usually go to NEED. But we go to NEED2 if the changed indicates + * we are done writing the dummy record. + * If we are done with the second dummy recored (DONE2), then + * we go to IDLE. + */ + if (changed) { + switch (log->l_covered_state) { + case XLOG_STATE_COVER_IDLE: + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_NEED2; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE2: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_IDLE; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + default: + ASSERT(0); + } + } +} /* xlog_state_clean_log */ + +STATIC xfs_lsn_t +xlog_get_lowest_lsn( + struct xlog *log) +{ + xlog_in_core_t *lsn_log; + xfs_lsn_t lowest_lsn, lsn; + + lsn_log = log->l_iclog; + lowest_lsn = 0; + do { + if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { + lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); + if ((lsn && !lowest_lsn) || + (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { + lowest_lsn = lsn; + } + } + lsn_log = lsn_log->ic_next; + } while (lsn_log != log->l_iclog); + return lowest_lsn; +} + + +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *ciclog) +{ + xlog_in_core_t *iclog; + xlog_in_core_t *first_iclog; /* used to know when we've + * processed all iclogs once */ + xfs_log_callback_t *cb, *cb_next; + int flushcnt = 0; + xfs_lsn_t lowest_lsn; + int ioerrors; /* counter: iclogs with errors */ + int loopdidcallbacks; /* flag: inner loop did callbacks*/ + int funcdidcallbacks; /* flag: function did callbacks */ + int repeats; /* for issuing console warnings if + * looping too many times */ + int wake = 0; + + spin_lock(&log->l_icloglock); + first_iclog = iclog = log->l_iclog; + ioerrors = 0; + funcdidcallbacks = 0; + repeats = 0; + + do { + /* + * Scan all iclogs starting with the one pointed to by the + * log. Reset this starting point each time the log is + * unlocked (during callbacks). + * + * Keep looping through iclogs until one full pass is made + * without running any callbacks. + */ + first_iclog = log->l_iclog; + iclog = log->l_iclog; + loopdidcallbacks = 0; + repeats++; + + do { + + /* skip all iclogs in the ACTIVE & DIRTY states */ + if (iclog->ic_state & + (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + iclog = iclog->ic_next; + continue; + } + + /* + * Between marking a filesystem SHUTDOWN and stopping + * the log, we do flush all iclogs to disk (if there + * wasn't a log I/O error). So, we do want things to + * go smoothly in case of just a SHUTDOWN w/o a + * LOG_IO_ERROR. + */ + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Can only perform callbacks in order. Since + * this iclog is not in the DONE_SYNC/ + * DO_CALLBACK state, we skip the rest and + * just try to clean up. If we set our iclog + * to DO_CALLBACK, we will not process it when + * we retry since a previous iclog is in the + * CALLBACK and the state cannot change since + * we are holding the l_icloglock. + */ + if (!(iclog->ic_state & + (XLOG_STATE_DONE_SYNC | + XLOG_STATE_DO_CALLBACK))) { + if (ciclog && (ciclog->ic_state == + XLOG_STATE_DONE_SYNC)) { + ciclog->ic_state = XLOG_STATE_DO_CALLBACK; + } + break; + } + /* + * We now have an iclog that is in either the + * DO_CALLBACK or DONE_SYNC states. The other + * states (WANT_SYNC, SYNCING, or CALLBACK were + * caught by the above if and are going to + * clean (i.e. we aren't doing their callbacks) + * see the above if. + */ + + /* + * We will do one more check here to see if we + * have chased our tail around. + */ + + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && + XFS_LSN_CMP(lowest_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { + iclog = iclog->ic_next; + continue; /* Leave this iclog for + * another thread */ + } + + iclog->ic_state = XLOG_STATE_CALLBACK; + + + /* + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving th etail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the + * icloglock to ensure we are the only one that + * can update it. + */ + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); + + } else + ioerrors++; + + spin_unlock(&log->l_icloglock); + + /* + * Keep processing entries in the callback list until + * we come around and it is empty. We need to + * atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more + * callbacks being added. + */ + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + while (cb) { + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_callback = NULL; + spin_unlock(&iclog->ic_callback_lock); + + /* perform callbacks in the order given */ + for (; cb; cb = cb_next) { + cb_next = cb->cb_next; + cb->cb_func(cb->cb_arg, aborted); + } + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + } + + loopdidcallbacks++; + funcdidcallbacks++; + + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_callback == NULL); + spin_unlock(&iclog->ic_callback_lock); + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) + iclog->ic_state = XLOG_STATE_DIRTY; + + /* + * Transition from DIRTY to ACTIVE if applicable. + * NOP if STATE_IOERROR. + */ + xlog_state_clean_log(log); + + /* wake up threads waiting in xfs_log_force() */ + wake_up_all(&iclog->ic_force_wait); + + iclog = iclog->ic_next; + } while (first_iclog != iclog); + + if (repeats > 5000) { + flushcnt += repeats; + repeats = 0; + xfs_warn(log->l_mp, + "%s: possible infinite loop (%d iterations)", + __func__, flushcnt); + } + } while (!ioerrors && loopdidcallbacks); + + /* + * make one last gasp attempt to see if iclogs are being left in + * limbo.. + */ +#ifdef DEBUG + if (funcdidcallbacks) { + first_iclog = iclog = log->l_iclog; + do { + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); + /* + * Terminate the loop if iclogs are found in states + * which will cause other threads to clean up iclogs. + * + * SYNCING - i/o completion will go through logs + * DONE_SYNC - interrupt thread should be waiting for + * l_icloglock + * IOERROR - give up hope all ye who enter here + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_DONE_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR ) + break; + iclog = iclog->ic_next; + } while (first_iclog != iclog); + } +#endif + + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + wake = 1; + spin_unlock(&log->l_icloglock); + + if (wake) + wake_up_all(&log->l_flush_wait); +} + + +/* + * Finish transitioning this iclog to the dirty state. + * + * Make sure that we completely execute this routine only when this is + * the last call to the iclog. There is a good chance that iclog flushes, + * when we reach the end of the physical log, get turned into 2 separate + * calls to bwrite. Hence, one iclog flush could generate two calls to this + * routine. By using the reference count bwritecnt, we guarantee that only + * the second completion goes through. + * + * Callbacks could take time, so they are done outside the scope of the + * global state machine log lock. + */ +STATIC void +xlog_state_done_syncing( + xlog_in_core_t *iclog, + int aborted) +{ + struct xlog *log = iclog->ic_log; + + spin_lock(&log->l_icloglock); + + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_IOERROR); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); + + + /* + * If we got an error, either on the first buffer, or in the case of + * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, + * and none should ever be attempted to be written to disk + * again. + */ + if (iclog->ic_state != XLOG_STATE_IOERROR) { + if (--iclog->ic_bwritecnt == 1) { + spin_unlock(&log->l_icloglock); + return; + } + iclog->ic_state = XLOG_STATE_DONE_SYNC; + } + + /* + * Someone could be sleeping prior to writing out the next + * iclog buffer, we wake them all, one will get to do the + * I/O, the others get to wait for the result. + */ + wake_up_all(&iclog->ic_write_wait); + spin_unlock(&log->l_icloglock); + xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ +} /* xlog_state_done_syncing */ + + +/* + * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must + * sleep. We wait on the flush queue on the head iclog as that should be + * the first iclog to complete flushing. Hence if all iclogs are syncing, + * we will wait here and all new writes will sleep until a sync completes. + * + * The in-core logs are used in a circular fashion. They are not used + * out-of-order even when an iclog past the head is free. + * + * return: + * * log_offset where xlog_write() can start writing into the in-core + * log's data space. + * * in-core log pointer to which xlog_write() should write. + * * boolean indicating this is a continued write to an in-core log. + * If this is the last write, then the in-core log's offset field + * needs to be incremented, depending on the amount of data which + * is copied. + */ +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclogp, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp) +{ + int log_offset; + xlog_rec_header_t *head; + xlog_in_core_t *iclog; + int error; + +restart: + spin_lock(&log->l_icloglock); + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + + iclog = log->l_iclog; + if (iclog->ic_state != XLOG_STATE_ACTIVE) { + XFS_STATS_INC(xs_log_noiclogs); + + /* Wait for log writes to have flushed */ + xlog_wait(&log->l_flush_wait, &log->l_icloglock); + goto restart; + } + + head = &iclog->ic_header; + + atomic_inc(&iclog->ic_refcnt); /* prevents sync */ + log_offset = iclog->ic_offset; + + /* On the 1st write to an iclog, figure out lsn. This works + * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are + * committing to. If the offset is set, that's how many blocks + * must be written. + */ + if (log_offset == 0) { + ticket->t_curr_res -= log->l_iclog_hsize; + xlog_tic_add_region(ticket, + log->l_iclog_hsize, + XLOG_REG_TYPE_LRHEADER); + head->h_cycle = cpu_to_be32(log->l_curr_cycle); + head->h_lsn = cpu_to_be64( + xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); + ASSERT(log->l_curr_block >= 0); + } + + /* If there is enough room to write everything, then do it. Otherwise, + * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC + * bit is on, so this will get flushed out. Don't update ic_offset + * until you know exactly how many bytes get copied. Therefore, wait + * until later to update ic_offset. + * + * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * can fit into remaining data section. + */ + if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + if (error) + return error; + } else { + spin_unlock(&log->l_icloglock); + } + goto restart; + } + + /* Do we have enough room to write the full amount in the remainder + * of this iclog? Or must we continue a write on the next iclog and + * mark this iclog as completely taken? In the case where we switch + * iclogs (to mark it taken), this particular iclog will release/sync + * to disk in xlog_write(). + */ + if (len <= iclog->ic_size - iclog->ic_offset) { + *continued_write = 0; + iclog->ic_offset += len; + } else { + *continued_write = 1; + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + } + *iclogp = iclog; + + ASSERT(iclog->ic_offset <= iclog->ic_size); + spin_unlock(&log->l_icloglock); + + *logoffsetp = log_offset; + return 0; +} /* xlog_state_get_iclog_space */ + +/* The first cnt-1 times through here we don't need to + * move the grant write head because the permanent + * reservation has reserved cnt times the unit amount. + * Release part of current permanent unit reservation and + * reset current reservation to be one units worth. Also + * move grant reservation head forward. + */ +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket) +{ + trace_xfs_log_regrant_reserve_enter(log, ticket); + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + xlog_grant_sub_space(log, &log->l_reserve_head.grant, + ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_write_head.grant, + ticket->t_curr_res); + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + + trace_xfs_log_regrant_reserve_sub(log, ticket); + + /* just return if we still have some of the pre-reserved space */ + if (ticket->t_cnt > 0) + return; + + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + + trace_xfs_log_regrant_reserve_exit(log, ticket); + + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); +} /* xlog_regrant_reserve_log_space */ + + +/* + * Give back the space left from a reservation. + * + * All the information we need to make a correct determination of space left + * is present. For non-permanent reservations, things are quite easy. The + * count should have been decremented to zero. We only need to deal with the + * space remaining in the current reservation part of the ticket. If the + * ticket contains a permanent reservation, there may be left over space which + * needs to be released. A count of N means that N-1 refills of the current + * reservation can be done before we need to ask for more space. The first + * one goes to fill up the first current reservation. Once we run out of + * space, the count will stay at zero and the only space remaining will be + * in the current reservation field. + */ +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket) +{ + int bytes; + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + trace_xfs_log_ungrant_enter(log, ticket); + trace_xfs_log_ungrant_sub(log, ticket); + + /* + * If this is a permanent reservation ticket, we may be able to free + * up more space based on the remaining count. + */ + bytes = ticket->t_curr_res; + if (ticket->t_cnt > 0) { + ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); + bytes += ticket->t_unit_res*ticket->t_cnt; + } + + xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); + xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); + + trace_xfs_log_ungrant_exit(log, ticket); + + xfs_log_space_wake(log->l_mp); +} + +/* + * Flush iclog to disk if this is the last reference to the given iclog and + * the WANT_SYNC bit is set. + * + * When this function is entered, the iclog is not necessarily in the + * WANT_SYNC state. It may be sitting around waiting to get filled. + * + * + */ +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog) +{ + int sync = 0; /* do we sync? */ + + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; + + ASSERT(atomic_read(&iclog->ic_refcnt) > 0); + if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) + return 0; + + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_WANT_SYNC); + + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); + sync++; + iclog->ic_state = XLOG_STATE_SYNCING; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); + /* cycle incremented when incrementing curr_block */ + } + spin_unlock(&log->l_icloglock); + + /* + * We let the log lock go, so it's possible that we hit a log I/O + * error or some other SHUTDOWN condition that marks the iclog + * as XLOG_STATE_IOERROR before the bwrite. However, we know that + * this iclog has consistent data, so we ignore IOERROR + * flags after this point. + */ + if (sync) + return xlog_sync(log, iclog); + return 0; +} /* xlog_state_release_iclog */ + + +/* + * This routine will mark the current iclog in the ring as WANT_SYNC + * and move the current iclog pointer to the next iclog in the ring. + * When this routine is called from xlog_state_get_iclog_space(), the + * exact size of the iclog has not yet been determined. All we know is + * that every data block. We have run out of space in this log record. + */ +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size) +{ + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + if (!eventual_size) + eventual_size = iclog->ic_offset; + iclog->ic_state = XLOG_STATE_WANT_SYNC; + iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); + log->l_prev_block = log->l_curr_block; + log->l_prev_cycle = log->l_curr_cycle; + + /* roll log?: ic_offset changed later */ + log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); + + /* Round up to next log-sunit */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1) { + __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + log->l_curr_block = roundup(log->l_curr_block, sunit_bb); + } + + if (log->l_curr_block >= log->l_logBBsize) { + log->l_curr_cycle++; + if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) + log->l_curr_cycle++; + log->l_curr_block -= log->l_logBBsize; + ASSERT(log->l_curr_block >= 0); + } + ASSERT(iclog == log->l_iclog); + log->l_iclog = iclog->ic_next; +} /* xlog_state_switch_iclogs */ + +/* + * Write out all data in the in-core log as of this exact moment in time. + * + * Data may be written to the in-core log during this call. However, + * we don't guarantee this data will be written out. A change from past + * implementation means this routine will *not* write out zero length LRs. + * + * Basically, we try and perform an intelligent scan of the in-core logs. + * If we determine there is no flushable data, we just return. There is no + * flushable data if: + * + * 1. the current iclog is active and has no data; the previous iclog + * is in the active or dirty state. + * 2. the current iclog is drity, and the previous iclog is in the + * active or dirty state. + * + * We may sleep if: + * + * 1. the current iclog is not in the active nor dirty state. + * 2. the current iclog dirty, and the previous iclog is not in the + * active nor dirty state. + * 3. the current iclog is active, and there is another thread writing + * to this particular iclog. + * 4. a) the current iclog is active and has no other writers + * b) when we return from flushing out this iclog, it is still + * not in the active nor dirty state. + */ +int +_xfs_log_force( + struct xfs_mount *mp, + uint flags, + int *log_flushed) +{ + struct xlog *log = mp->m_log; + struct xlog_in_core *iclog; + xfs_lsn_t lsn; + + XFS_STATS_INC(xs_log_force); + + xlog_cil_force(log); + + spin_lock(&log->l_icloglock); + + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + + /* If the head iclog is not active nor dirty, we just attach + * ourselves to the head and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) { + /* + * If the head is dirty or (active and empty), then + * we need to look at the previous iclog. If the previous + * iclog is active or dirty we are done. There is nothing + * to sync out. Otherwise, we attach ourselves to the + * previous iclog and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_DIRTY || + (atomic_read(&iclog->ic_refcnt) == 0 + && iclog->ic_offset == 0)) { + iclog = iclog->ic_prev; + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) + goto no_sleep; + else + goto maybe_sleep; + } else { + if (atomic_read(&iclog->ic_refcnt) == 0) { + /* We are the only one with access to this + * iclog. Flush it out now. There should + * be a roundoff of zero to show that someone + * has already taken care of the roundoff from + * the previous sync. + */ + atomic_inc(&iclog->ic_refcnt); + lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + + if (xlog_state_release_iclog(log, iclog)) + return -EIO; + + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && + iclog->ic_state != XLOG_STATE_DIRTY) + goto maybe_sleep; + else + goto no_sleep; + } else { + /* Someone else is writing to this iclog. + * Use its call to flush out the data. However, + * the other thread may not force out this LR, + * so we mark it WANT_SYNC. + */ + xlog_state_switch_iclogs(log, iclog, 0); + goto maybe_sleep; + } + } + } + + /* By the time we come around again, the iclog could've been filled + * which would give it another lsn. If we have a new lsn, just + * return because the relevant data has been flushed. + */ +maybe_sleep: + if (flags & XFS_LOG_SYNC) { + /* + * We must check if we're shutting down here, before + * we wait, while we're holding the l_icloglock. + * Then we check again after waking up, in case our + * sleep was disturbed by a bad news. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; + if (log_flushed) + *log_flushed = 1; + } else { + +no_sleep: + spin_unlock(&log->l_icloglock); + } + return 0; +} + +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + uint flags) +{ + int error; + + trace_xfs_log_force(mp, 0); + error = _xfs_log_force(mp, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} + +/* + * Force the in-core log to disk for a specific LSN. + * + * Find in-core log with lsn. + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the + * specific in-core log. When given in-core log finally completes its + * write to disk, that thread will wake up all threads waiting on the + * sv. + */ +int +_xfs_log_force_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) +{ + struct xlog *log = mp->m_log; + struct xlog_in_core *iclog; + int already_slept = 0; + + ASSERT(lsn != 0); + + XFS_STATS_INC(xs_log_force); + + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + +try_again: + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + + do { + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + continue; + } + + if (iclog->ic_state == XLOG_STATE_DIRTY) { + spin_unlock(&log->l_icloglock); + return 0; + } + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. + * this is the first time we've looked at the correct + * iclog buf) and the buffer before us is going to + * be sync'ed. The reason for this is that if we + * are doing sync transactions here, by waiting for + * the previous I/O to complete, we can allow a few + * more transactions into this iclog before we close + * it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump + * up the refcnt so we can release the log (which + * drops the ref count). The state switch keeps new + * transaction commits from using this buffer. When + * the current commits finish writing into the buffer, + * the refcount will drop to zero and the buffer will + * go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + + XFS_STATS_INC(xs_log_force_sleep); + + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); + if (log_flushed) + *log_flushed = 1; + already_slept = 1; + goto try_again; + } + atomic_inc(&iclog->ic_refcnt); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + if (xlog_state_release_iclog(log, iclog)) + return -EIO; + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + } + + if ((flags & XFS_LOG_SYNC) && /* sleep */ + !(iclog->ic_state & + (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + /* + * Don't wait on completion if we know that we've + * gotten a log write error. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; + + if (log_flushed) + *log_flushed = 1; + } else { /* just return */ + spin_unlock(&log->l_icloglock); + } + + return 0; + } while (iclog != log->l_iclog); + + spin_unlock(&log->l_icloglock); + return 0; +} + +/* + * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force_lsn( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + + trace_xfs_log_force(mp, lsn); + error = _xfs_log_force_lsn(mp, lsn, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} + +/* + * Called when we want to mark the current iclog as being ready to sync to + * disk. + */ +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog) +{ + assert_spin_locked(&log->l_icloglock); + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + xlog_state_switch_iclogs(log, iclog, 0); + } else { + ASSERT(iclog->ic_state & + (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); + } +} + + +/***************************************************************************** + * + * TICKET functions + * + ***************************************************************************** + */ + +/* + * Free a used ticket when its refcount falls to zero. + */ +void +xfs_log_ticket_put( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + if (atomic_dec_and_test(&ticket->t_ref)) + kmem_zone_free(xfs_log_ticket_zone, ticket); +} + +xlog_ticket_t * +xfs_log_ticket_get( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + atomic_inc(&ticket->t_ref); + return ticket; +} + +/* + * Figure out the total log space unit (in bytes) that would be + * required for a log ticket. + */ +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) +{ + struct xlog *log = mp->m_log; + int iclog_space; + uint num_headers; + + /* + * Permanent reservations have up to 'cnt'-1 active log operations + * in the log. A unit in this case is the amount of space for one + * of these log operations. Normal reservations have a cnt of 1 + * and their unit amount is the total amount of space required. + * + * The following lines of code account for non-transaction data + * which occupy space in the on-disk log. + * + * Normal form of a transaction is: + * ... + * and then there are LR hdrs, split-recs and roundoff at end of syncs. + * + * We need to account for all the leadup data and trailer data + * around the transaction data. + * And then we need to account for the worst case in terms of using + * more space. + * The worst case will happen if: + * - the placement of the transaction happens to be such that the + * roundoff is at its maximum + * - the transaction data is synced before the commit record is synced + * i.e. | + * Therefore the commit record is in its own Log Record. + * This can happen as the commit record is called with its + * own region to xlog_write(). + * This then means that in the worst case, roundoff can happen for + * the commit-rec as well. + * The commit-rec is smaller than padding in this scenario and so it is + * not added separately. + */ + + /* for trans header */ + unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(xfs_trans_header_t); + + /* for start-rec */ + unit_bytes += sizeof(xlog_op_header_t); + + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this. + */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } + unit_bytes += log->l_iclog_hsize * num_headers; + + /* for commit-rec LR header - note: padding will subsume the ophdr */ + unit_bytes += log->l_iclog_hsize; + + /* for roundoff padding for transaction data and one for commit record */ + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { + /* log su roundoff */ + unit_bytes += 2 * mp->m_sb.sb_logsunit; + } else { + /* BB roundoff */ + unit_bytes += 2 * BBSIZE; + } + + return unit_bytes; +} + +/* + * Allocate and initialise a new log ticket. + */ +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int cnt, + char client, + bool permanent, + xfs_km_flags_t alloc_flags) +{ + struct xlog_ticket *tic; + int unit_res; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + + atomic_set(&tic->t_ref, 1); + tic->t_task = current; + INIT_LIST_HEAD(&tic->t_queue); + tic->t_unit_res = unit_res; + tic->t_curr_res = unit_res; + tic->t_cnt = cnt; + tic->t_ocnt = cnt; + tic->t_tid = prandom_u32(); + tic->t_clientid = client; + tic->t_flags = XLOG_TIC_INITED; + tic->t_trans_type = 0; + if (permanent) + tic->t_flags |= XLOG_TIC_PERM_RESERV; + + xlog_tic_reset_res(tic); + + return tic; +} + + +/****************************************************************************** + * + * Log debug routines + * + ****************************************************************************** + */ +#if defined(DEBUG) +/* + * Make sure that the destination ptr is within the valid data region of + * one of the iclogs. This uses backup pointers stored in a different + * part of the log in case we trash the log structure. + */ +void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr) +{ + int i; + int good_ptr = 0; + + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) + good_ptr++; + } + + if (!good_ptr) + xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); +} + +/* + * Check to make sure the grant write head didn't just over lap the tail. If + * the cycles are the same, we can't be overlapping. Otherwise, make sure that + * the cycles differ by exactly one and check the byte count. + * + * This check is run unlocked, so can give false positives. Rather than assert + * on failures, use a warn-once flag and a panic tag to allow the admin to + * determine if they want to panic the machine when such an error occurs. For + * debug kernels this will have the same effect as using an assert but, unlinke + * an assert, it can be turned off at runtime. + */ +STATIC void +xlog_verify_grant_tail( + struct xlog *log) +{ + int tail_cycle, tail_blocks; + int cycle, space; + + xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); + if (tail_cycle != cycle) { + if (cycle - 1 != tail_cycle && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: cycle - 1 != tail_cycle", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + + if (space > BBTOB(tail_blocks) && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: space > BBTOB(tail_blocks)", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + } +} + +/* check if it will fit */ +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn) +{ + int blocks; + + if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { + blocks = + log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); + if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); + } else { + ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); + + if (BLOCK_LSN(tail_lsn) == log->l_prev_block) + xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); + + blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; + if (blocks < BTOBB(iclog->ic_offset) + 1) + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); + } +} /* xlog_verify_tail_lsn */ + +/* + * Perform a number of checks on the iclog before writing to disk. + * + * 1. Make sure the iclogs are still circular + * 2. Make sure we have a good magic number + * 3. Make sure we don't have magic numbers in the data + * 4. Check fields of each log operation header for: + * A. Valid client identifier + * B. tid ptr value falls in valid ptr space (user space code) + * C. Length in log record header is correct according to the + * individual operation headers within record. + * 5. When a bwrite will occur within 5 blocks of the front of the physical + * log, check the preceding blocks of the physical log to make sure all + * the cycle numbers agree with the current cycle number. + */ +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + bool syncing) +{ + xlog_op_header_t *ophead; + xlog_in_core_t *icptr; + xlog_in_core_2_t *xhdr; + xfs_caddr_t ptr; + xfs_caddr_t base_ptr; + __psint_t field_offset; + __uint8_t clientid; + int len, i, j, k, op_len; + int idx; + + /* check validity of iclog pointers */ + spin_lock(&log->l_icloglock); + icptr = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) + ASSERT(icptr); + + if (icptr != log->l_iclog) + xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); + spin_unlock(&log->l_icloglock); + + /* check log magic numbers */ + if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); + + ptr = (xfs_caddr_t) &iclog->ic_header; + for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; + ptr += BBSIZE) { + if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + xfs_emerg(log->l_mp, "%s: unexpected magic num", + __func__); + } + + /* check fields */ + len = be32_to_cpu(iclog->ic_header.h_num_logops); + ptr = iclog->ic_datap; + base_ptr = ptr; + ophead = (xlog_op_header_t *)ptr; + xhdr = iclog->ic_data; + for (i = 0; i < len; i++) { + ophead = (xlog_op_header_t *)ptr; + + /* clientid is only 1 byte */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + if (!syncing || (field_offset & 0x1ff)) { + clientid = ophead->oh_clientid; + } else { + idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + clientid = xlog_get_client_id( + xhdr[j].hic_xheader.xh_cycle_data[k]); + } else { + clientid = xlog_get_client_id( + iclog->ic_header.h_cycle_data[idx]); + } + } + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + xfs_warn(log->l_mp, + "%s: invalid clientid %d op 0x%p offset 0x%lx", + __func__, clientid, ophead, + (unsigned long)field_offset); + + /* check length */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + if (!syncing || (field_offset & 0x1ff)) { + op_len = be32_to_cpu(ophead->oh_len); + } else { + idx = BTOBBT((__psint_t)&ophead->oh_len - + (__psint_t)iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); + } else { + op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); + } + } + ptr += sizeof(xlog_op_header_t) + op_len; + } +} /* xlog_verify_iclog */ +#endif + +/* + * Mark all iclogs IOERROR. l_icloglock is held by the caller. + */ +STATIC int +xlog_state_ioerror( + struct xlog *log) +{ + xlog_in_core_t *iclog, *ic; + + iclog = log->l_iclog; + if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Mark all the incore logs IOERROR. + * From now on, no log flushes will result. + */ + ic = iclog; + do { + ic->ic_state = XLOG_STATE_IOERROR; + ic = ic->ic_next; + } while (ic != iclog); + return 0; + } + /* + * Return non-zero, if state transition has already happened. + */ + return 1; +} + +/* + * This is called from xfs_force_shutdown, when we're forcibly + * shutting down the filesystem, typically because of an IO error. + * Our main objectives here are to make sure that: + * a. if !logerror, flush the logs to disk. Anything modified + * after this is ignored. + * b. the filesystem gets marked 'SHUTDOWN' for all interested + * parties to find out, 'atomically'. + * c. those who're sleeping on log reservations, pinned objects and + * other resources get woken up, and be told the bad news. + * d. nothing new gets queued up after (b) and (c) are done. + * + * Note: for the !logerror case we need to flush the regions held in memory out + * to disk first. This needs to be done before the log is marked as shutdown, + * otherwise the iclog writes will fail. + */ +int +xfs_log_force_umount( + struct xfs_mount *mp, + int logerror) +{ + struct xlog *log; + int retval; + + log = mp->m_log; + + /* + * If this happens during log recovery, don't worry about + * locking; the log isn't open for business yet. + */ + if (!log || + log->l_flags & XLOG_ACTIVE_RECOVERY) { + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + return 0; + } + + /* + * Somebody could've already done the hard work for us. + * No need to get locks for this. + */ + if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + return 1; + } + + /* + * Flush all the completed transactions to disk before marking the log + * being shut down. We need to do it in this order to ensure that + * completed operations are safely on disk before we shut down, and that + * we don't have to issue any buffer IO after the shutdown flags are set + * to guarantee this. + */ + if (!logerror) + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + + /* + * mark the filesystem and the as in a shutdown state and wake + * everybody up to tell them the bad news. + */ + spin_lock(&log->l_icloglock); + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + + /* + * Mark the log and the iclogs with IO error flags to prevent any + * further log IO from being issued or completed. + */ + log->l_flags |= XLOG_IO_ERROR; + retval = xlog_state_ioerror(log); + spin_unlock(&log->l_icloglock); + + /* + * We don't want anybody waiting for log reservations after this. That + * means we have to wake up everybody queued up on reserveq as well as + * writeq. In addition, we make sure in xlog_{re}grant_log_space that + * we don't enqueue anything once the SHUTDOWN flag is set, and this + * action is protected by the grant locks. + */ + xlog_grant_head_wake_all(&log->l_reserve_head); + xlog_grant_head_wake_all(&log->l_write_head); + + /* + * Wake up everybody waiting on xfs_log_force. Wake the CIL push first + * as if the log writes were completed. The abort handling in the log + * item committed callback functions will do this again under lock to + * avoid races. + */ + wake_up_all(&log->l_cilp->xc_commit_wait); + xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); + +#ifdef XFSERRORDEBUG + { + xlog_in_core_t *iclog; + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + ASSERT(iclog->ic_callback == 0); + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + spin_unlock(&log->l_icloglock); + } +#endif + /* return non-zero if log IOERROR transition had already happened */ + return retval; +} + +STATIC int +xlog_iclogs_empty( + struct xlog *log) +{ + xlog_in_core_t *iclog; + + iclog = log->l_iclog; + do { + /* endianness does not matter here, zero is zero in + * any language. + */ + if (iclog->ic_header.h_num_logops) + return 0; + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + return 1; +} + diff --git a/kernel/fs/xfs/xfs_log.h b/kernel/fs/xfs/xfs_log.h new file mode 100644 index 000000000..84e0deb95 --- /dev/null +++ b/kernel/fs/xfs/xfs_log.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_H__ +#define __XFS_LOG_H__ + +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_bytes; /* accounted space in buffer */ + int lv_buf_len; /* aligned size of buffer */ + int lv_size; /* size of allocated lv */ +}; + +#define XFS_LOG_VEC_ORDERED (-1) + +static inline void * +xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + + *vecp = vec; + return vec->i_addr; +} + +/* + * We need to make sure the next buffer is naturally aligned for the biggest + * basic data type we put into it. We already accounted for this padding when + * sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + */ +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +{ + lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + lv->lv_bytes += len; + vec->i_len = len; +} + +static inline void * +xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type, void *data, int len) +{ + void *buf; + + buf = xlog_prepare_iovec(lv, vecp, type); + memcpy(buf, data, len); + xlog_finish_iovec(lv, *vecp, len); + return buf; +} + +/* + * Structure used to pass callback function and the function's argument + * to the log manager. + */ +typedef struct xfs_log_callback { + struct xfs_log_callback *cb_next; + void (*cb_func)(void *, int); + void *cb_arg; +} xfs_log_callback_t; + +/* + * By comparing each component, we don't have to worry about extra + * endian issues in treating two 32 bit numbers as one 64 bit number + */ +static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) +{ + if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) + return (CYCLE_LSN(lsn1)t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct xlog *log) +{ + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; +} + +/* + * Prepare the log item for insertion into the CIL. Calculate the difference in + * log space and vectors it will consume, and if it is a new item pin it as + * well. + */ +STATIC void +xfs_cil_prepare_item( + struct xlog *log, + struct xfs_log_vec *lv, + struct xfs_log_vec *old_lv, + int *diff_len, + int *diff_iovecs) +{ + /* Account for the new LV being passed in */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + *diff_len += lv->lv_bytes; + *diff_iovecs += lv->lv_niovecs; + } + + /* + * If there is no old LV, this is the first time we've seen the item in + * this CIL context and so we need to pin it. If we are replacing the + * old_lv, then remove the space it accounts for and free it. + */ + if (!old_lv) + lv->lv_item->li_ops->iop_pin(lv->lv_item); + else if (old_lv != lv) { + ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); + + *diff_len -= old_lv->lv_bytes; + *diff_iovecs -= old_lv->lv_niovecs; + kmem_free(old_lv); + } + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + /* + * If this is the first time the item is being committed to the + * CIL, store the sequence number on the log item so we can + * tell in future commits whether this is the first checkpoint + * the item is being committed into. + */ + if (!lv->lv_item->li_seq) + lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; +} + +/* + * Format log item into a flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer. + * The buffer is then attached to the log item are then inserted into the + * Committed Item List for tracking until the next checkpoint is written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundares without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is change the rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. + */ +static void +xlog_cil_insert_format_items( + struct xlog *log, + struct xfs_trans *tp, + int *diff_len, + int *diff_iovecs) +{ + struct xfs_log_item_desc *lidp; + + + /* Bail out if we didn't find a log item. */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + struct xfs_log_vec *old_lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* Skip items that do not have any vectors for writing */ + if (!niovecs) + continue; + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* grab the old item if it exists for reservation accounting */ + old_lv = lip->li_lv; + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + + round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); + + /* compare to existing item size */ + if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv; + lv->lv_next = NULL; + + if (ordered) + goto insert; + + /* + * set the item up as though it is a new insertion so + * that the space reservation accounting is correct. + */ + *diff_iovecs -= lv->lv_niovecs; + *diff_len -= lv->lv_bytes; + } else { + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; + } + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + buf_size - nbytes; + ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); + + lip->li_ops->iop_format(lip, lv); +insert: + ASSERT(lv->lv_buf_len <= nbytes); + xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); + } +} + +/* + * Insert the log items into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we added to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + */ +static void +xlog_cil_insert_items( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + struct xfs_log_item_desc *lidp; + int len = 0; + int diff_iovecs = 0; + int iclog_space; + + ASSERT(tp); + + /* + * We can do this safely because the context can't checkpoint until we + * are done so it doesn't matter exactly how we update the CIL. + */ + xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + + /* + * Now (re-)position everything modified at the tail of the CIL. + * We do this here so we only need to take the CIL lock once during + * the transaction commit. + */ + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + list_move_tail(&lip->li_cil, &cil->xc_cil); + } + + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); + ctx->nvecs += diff_iovecs; + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) + list_splice_init(&tp->t_busy, &ctx->busy_extents); + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + tp->t_ticket->t_curr_res -= hdrs; + ASSERT(tp->t_ticket->t_curr_res >= len); + } + tp->t_ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv); + lv = next; + } +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + ctx->start_lsn, abort); + + xfs_extent_busy_sort(&ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, + (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + + /* + * If we are aborting the commit, wake up anyone waiting on the + * committing list. If we don't, then a shutdown we can leave processes + * waiting in xlog_cil_force_lsn() waiting on a sequence commit that + * will never happen because we aborted it. + */ + spin_lock(&ctx->cil->xc_push_lock); + if (abort) + wake_up_all(&ctx->cil->xc_commit_wait); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_push_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + + if (!list_empty(&ctx->busy_extents)) { + ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + + xfs_discard_extents(mp, &ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); + } + + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes. + */ +STATIC int +xlog_cil_push( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_iovecs; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + xfs_lsn_t push_seq; + + if (!cil) + return 0; + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + down_write(&cil->xc_ctx_lock); + ctx = cil->xc_ctx; + + spin_lock(&cil->xc_push_lock); + push_seq = cil->xc_push_seq; + ASSERT(push_seq <= ctx->sequence); + + /* + * Check if we've anything to push. If there is nothing, then we don't + * move on to a new sequence number and so we have to be able to push + * this sequence again later. + */ + if (list_empty(&cil->xc_cil)) { + cil->xc_push_seq = 0; + spin_unlock(&cil->xc_push_lock); + goto out_skip; + } + + + /* check for a previously pushed seqeunce */ + if (push_seq < cil->xc_ctx->sequence) { + spin_unlock(&cil->xc_push_lock); + goto out_skip; + } + + /* + * We are now going to push this context, so add it to the committing + * list before we do anything else. This ensures that anyone waiting on + * this push can easily detect the difference between a "push in + * progress" and "CIL is empty, nothing to do". + * + * IOWs, a wait loop can now check for: + * the current sequence not being found on the committing list; + * an empty CIL; and + * an unchanged sequence number + * to detect a push that had nothing to do and therefore does not need + * waiting on. If the CIL is not empty, we get put on the committing + * list before emptying the CIL and bumping the sequence number. Hence + * an empty CIL and an unchanged sequence number means we jumped out + * above after doing nothing. + * + * Hence the waiter will either find the commit sequence on the + * committing list or the sequence number will be unchanged and the CIL + * still dirty. In that latter case, the push has not yet started, and + * so the waiter will have to continue trying to check the CIL + * committing list until it is found. In extreme cases of delay, the + * sequence may fully commit between the attempts the wait makes to wait + * on the commit sequence. + */ + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_push_lock); + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_iovecs = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + num_iovecs += lv->lv_niovecs; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + * + * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * structure atomically with the addition of this sequence to the + * committing list. This also ensures that we can do unlocked checks + * against the current sequence in log forces without risking + * deferencing a freed context pointer. + */ + spin_lock(&cil->xc_push_lock); + cil->xc_current_sequence = new_ctx->sequence; + spin_unlock(&cil->xc_push_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = &thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort_free_ticket; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_push_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&cil->xc_push_lock); + goto out_abort_free_ticket; + } + + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for our own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); + goto restart; + } + } + spin_unlock(&cil->xc_push_lock); + + /* xfs_log_done always frees the ticket on error. */ + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_push_lock); + ctx->commit_lsn = commit_lsn; + wake_up_all(&cil->xc_commit_wait); + spin_unlock(&cil->xc_push_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort_free_ticket: + xfs_log_ticket_put(tic); +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return -EIO; +} + +static void +xlog_cil_push_work( + struct work_struct *work) +{ + struct xfs_cil *cil = container_of(work, struct xfs_cil, + xc_push_work); + xlog_cil_push(cil->xc_log); +} + +/* + * We need to push CIL every so often so we don't cache more than we can fit in + * the log. The limit really is that a checkpoint can't be more than half the + * log (the current checkpoint is not allowed to overwrite the previous + * checkpoint), but commit latency and memory usage limit this to a smaller + * size. + */ +static void +xlog_cil_push_background( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + + /* + * The cil won't be empty because we are called while holding the + * context lock so whatever we added to the CIL will still be there + */ + ASSERT(!list_empty(&cil->xc_cil)); + + /* + * don't do a background push if we haven't used up all the + * space available yet. + */ + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + return; + + spin_lock(&cil->xc_push_lock); + if (cil->xc_push_seq < cil->xc_current_sequence) { + cil->xc_push_seq = cil->xc_current_sequence; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + } + spin_unlock(&cil->xc_push_lock); + +} + +/* + * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence + * number that is passed. When it returns, the work will be queued for + * @push_seq, but it won't be completed. The caller is expected to do any + * waiting for push_seq to complete if it is required. + */ +static void +xlog_cil_push_now( + struct xlog *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + + if (!cil) + return; + + ASSERT(push_seq && push_seq <= cil->xc_current_sequence); + + /* start on any pending background push to minimise wait time on it */ + flush_work(&cil->xc_push_work); + + /* + * If the CIL is empty or we've already pushed the sequence then + * there's no work we need to do. + */ + spin_lock(&cil->xc_push_lock); + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + spin_unlock(&cil->xc_push_lock); + return; + } + + cil->xc_push_seq = push_seq; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + spin_unlock(&cil->xc_push_lock); +} + +bool +xlog_cil_empty( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + bool empty = false; + + spin_lock(&cil->xc_push_lock); + if (list_empty(&cil->xc_cil)) + empty = true; + spin_unlock(&cil->xc_push_lock); + return empty; +} + +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +void +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct xlog *log = mp->m_log; + struct xfs_cil *cil = log->l_cilp; + int log_flags = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + /* lock out background commit */ + down_read(&cil->xc_ctx_lock); + + xlog_cil_insert_items(log, tp); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(mp, tp->t_ticket); + + tp->t_commit_lsn = cil->xc_ctx->sequence; + if (commit_lsn) + *commit_lsn = tp->t_commit_lsn; + + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, tp->t_commit_lsn, 0); + + xlog_cil_push_background(log); + + up_read(&cil->xc_ctx_lock); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + + ASSERT(sequence <= cil->xc_current_sequence); + + /* + * check to see if we need to force out the current context. + * xlog_cil_push() handles racing pushes for the same sequence, + * so no need to deal with it here. + */ +restart: + xlog_cil_push_now(log, sequence); + + /* + * See if we can find a previous sequence still committing. + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ + spin_lock(&cil->xc_push_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto out_shutdown; + if (ctx->sequence > sequence) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); + goto restart; + } + if (ctx->sequence != sequence) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + + /* + * The call to xlog_cil_push_now() executes the push in the background. + * Hence by the time we have got here it our sequence may not have been + * pushed yet. This is true if the current sequence still matches the + * push sequence after the above wait loop and the CIL still contains + * dirty objects. This is guaranteed by the push code first adding the + * context to the committing list before emptying the CIL. + * + * Hence if we don't find the context in the committing list and the + * current sequence number is unchanged then the CIL contents are + * significant. If the CIL is empty, if means there was nothing to push + * and that means there is nothing to wait for. If the CIL is not empty, + * it means we haven't yet started the push, because if it had started + * we would have found the context on the committing list. + */ + if (sequence == cil->xc_current_sequence && + !list_empty(&cil->xc_cil)) { + spin_unlock(&cil->xc_push_lock); + goto restart; + } + + spin_unlock(&cil->xc_push_lock); + return commit_lsn; + + /* + * We detected a shutdown in progress. We need to trigger the log force + * to pass through it's iclog state machine error handling, even though + * we are already in a shutdown state. Hence we can't return + * NULLCOMMITLSN here as that has special meaning to log forces (i.e. + * LSN is already stable), so we return a zero LSN instead. + */ +out_shutdown: + spin_unlock(&cil->xc_push_lock); + return 0; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} + +/* + * Perform initial CIL structure initialisation. + */ +int +xlog_cil_init( + struct xlog *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return -ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return -ENOMEM; + } + + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + spin_lock_init(&cil->xc_push_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct xlog *log) +{ + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + diff --git a/kernel/fs/xfs/xfs_log_priv.h b/kernel/fs/xfs/xfs_log_priv.h new file mode 100644 index 000000000..db7cbdeb2 --- /dev/null +++ b/kernel/fs/xfs/xfs_log_priv.h @@ -0,0 +1,561 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_PRIV_H__ +#define __XFS_LOG_PRIV_H__ + +struct xfs_buf; +struct xlog; +struct xlog_ticket; +struct xfs_mount; +struct xfs_log_callback; + +/* + * Flags for log structure + */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. + * + * this has endian issues, of course. + */ +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ + +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). + * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. + * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + +/* Ticket reservation region accounting */ +#define XLOG_TIC_LEN_MAX 15 + +/* + * Reservation region + * As would be stored in xfs_log_iovec but without the i_addr which + * we don't care about. + */ +typedef struct xlog_res { + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ +} xlog_res_t; + +typedef struct xlog_ticket { + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + char t_ocnt; /* original count : 1 */ + char t_cnt; /* current count : 1 */ + char t_clientid; /* who does this belong to; : 1 */ + char t_flags; /* properties of reservation : 1 */ + uint t_trans_type; /* transaction type : 4 */ + + /* reservation array fields */ + uint t_res_num; /* num in array : 4 */ + uint t_res_num_ophdrs; /* num op hdrs : 4 */ + uint t_res_arr_sum; /* array sum : 4 */ + uint t_res_o_flow; /* sum overflow : 4 */ + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ +} xlog_ticket_t; + +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. + * - ic_next is the pointer to the next iclog in the ring. + * - ic_bp is a pointer to the buffer used to write this incore log to disk. + * - ic_log is a pointer back to the global log structure. + * - ic_callback is a linked list of callback function/argument pairs to be + * called after an iclog finishes writing. + * - ic_size is the full size of the header plus data. + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. + */ +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xfs_buf *ic_bp; + struct xlog *ic_log; + int ic_size; + int ic_offset; + int ic_bwritecnt; + unsigned short ic_state; + char *ic_datap; /* pointer to iclog data */ + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + struct xfs_log_callback *ic_callback; + struct xfs_log_callback **ic_callback_tail; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header +} xlog_in_core_t; + +/* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct xfs_log_callback log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct xlog *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; + struct xfs_cil_ctx *xc_ctx; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; +} ____cacheline_aligned_in_smp; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) + +/* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. + */ +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number. Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. + */ +struct xlog { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ + struct xfs_buf *l_xbuf; /* extra buffer for log + * wrapping */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; + + struct xfs_kobj l_kobj; + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + char *l_iclog_bak[XLOG_MAX_ICLOGS]; +#endif + +}; + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) + +/* common routines */ +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); + +/* + * CIL force routines + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct xlog *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/kernel/fs/xfs/xfs_log_recover.c b/kernel/fs/xfs/xfs_log_recover.c new file mode 100644 index 000000000..4f5784f85 --- /dev/null +++ b/kernel/fs/xfs/xfs_log_recover.c @@ -0,0 +1,4651 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_inode_item.h" +#include "xfs_extfree_item.h" +#include "xfs_trans_priv.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_error.h" +#include "xfs_dir2.h" + +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) + +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); +#if defined(DEBUG) +STATIC void +xlog_recover_check_summary( + struct xlog *); +#else +#define xlog_recover_check_summary(log) +#endif + +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +/* + * Sector aligned buffer routines for buffer create/read/write/access + */ + +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ + +static inline int +xlog_buf_bbcount_valid( + struct xlog *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. + */ +STATIC xfs_buf_t * +xlog_get_bp( + struct xlog *log, + int nbblks) +{ + struct xfs_buf *bp; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return NULL; + } + + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to accommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accommodate this possibility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); + if (bp) + xfs_buf_unlock(bp); + return bp; +} + +STATIC void +xlog_put_bp( + xfs_buf_t *bp) +{ + xfs_buf_free(bp); +} + +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ +STATIC xfs_caddr_t +xlog_align( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp) +{ + xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); + + ASSERT(offset + nbblks <= bp->b_length); + return bp->b_addr + BBTOB(offset); +} + + +/* + * nbblks should be uint, but oh well. Just want to catch that 32-bit length. + */ +STATIC int +xlog_bread_noalign( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp) +{ + int error; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return -EFSCORRUPTED; + } + + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); + + ASSERT(nbblks > 0); + ASSERT(nbblks <= bp->b_length); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_READ(bp); + bp->b_io_length = nbblks; + bp->b_error = 0; + + error = xfs_buf_submit_wait(bp); + if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) + xfs_buf_ioerror_alert(bp, __func__); + return error; +} + +STATIC int +xlog_bread( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp, + xfs_caddr_t *offset) +{ + int error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + if (error) + return error; + + *offset = xlog_align(log, blk_no, nbblks, bp); + return 0; +} + +/* + * Read at an offset into the buffer. Returns with the buffer in it's original + * state regardless of the result of the read. + */ +STATIC int +xlog_bread_offset( + struct xlog *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + struct xfs_buf *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = bp->b_addr; + int orig_len = BBTOB(bp->b_length); + int error, error2; + + error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); + if (error) + return error; + return error2; +} + +/* + * Write out the buffer at the given block for the given number of blocks. + * The buffer is kept locked across the write and is returned locked. + * This can only be used for synchronous log writes. + */ +STATIC int +xlog_bwrite( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp) +{ + int error; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return -EFSCORRUPTED; + } + + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); + + ASSERT(nbblks > 0); + ASSERT(nbblks <= bp->b_length); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_ZEROFLAGS(bp); + xfs_buf_hold(bp); + xfs_buf_lock(bp); + bp->b_io_length = nbblks; + bp->b_error = 0; + + error = xfs_bwrite(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + return error; +} + +#ifdef DEBUG +/* + * dump debug superblock and log record information + */ +STATIC void +xlog_header_check_dump( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", + __func__, &mp->m_sb.sb_uuid, XLOG_FMT); + xfs_debug(mp, " log : uuid = %pU, fmt = %d", + &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); +} +#else +#define xlog_header_check_dump(mp, head) +#endif + +/* + * check log record header for recovery + */ +STATIC int +xlog_header_check_recover( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); + + /* + * IRIX doesn't write the h_fmt field and leaves it zeroed + * (XLOG_FMT_UNKNOWN). This stops us from trying to recover + * a dirty log created in IRIX. + */ + if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { + xfs_warn(mp, + "dirty log written in incompatible format - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(1)", + XFS_ERRLEVEL_HIGH, mp); + return -EFSCORRUPTED; + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xfs_warn(mp, + "dirty log entry has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(2)", + XFS_ERRLEVEL_HIGH, mp); + return -EFSCORRUPTED; + } + return 0; +} + +/* + * read the head block of the log and check the header + */ +STATIC int +xlog_header_check_mount( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); + + if (uuid_is_nil(&head->h_fs_uuid)) { + /* + * IRIX doesn't write the h_fs_uuid or h_fmt fields. If + * h_fs_uuid is nil, we assume this log was last mounted + * by IRIX and continue. + */ + xfs_warn(mp, "nil uuid in log - IRIX style log"); + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xfs_warn(mp, "log has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_mount", + XFS_ERRLEVEL_HIGH, mp); + return -EFSCORRUPTED; + } + return 0; +} + +STATIC void +xlog_recover_iodone( + struct xfs_buf *bp) +{ + if (bp->b_error) { + /* + * We're not going to bother about retrying + * this during recovery. One strike! + */ + if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + } + bp->b_iodone = NULL; + xfs_buf_ioend(bp); +} + +/* + * This routine finds (to an approximation) the first block in the physical + * log which contains the given cycle. It uses a binary search algorithm. + * Note that the algorithm can not be perfect because the disk will not + * necessarily be perfect. + */ +STATIC int +xlog_find_cycle_start( + struct xlog *log, + struct xfs_buf *bp, + xfs_daddr_t first_blk, + xfs_daddr_t *last_blk, + uint cycle) +{ + xfs_caddr_t offset; + xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; + uint mid_cycle; + int error; + + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { + error = xlog_bread(log, mid_blk, 1, bp, &offset); + if (error) + return error; + mid_cycle = xlog_get_cycle(offset); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); + } + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; + + return 0; +} + +/* + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. + */ +STATIC int +xlog_find_verify_cycle( + struct xlog *log, + xfs_daddr_t start_blk, + int nbblks, + uint stop_on_cycle_no, + xfs_daddr_t *new_blk) +{ + xfs_daddr_t i, j; + uint cycle; + xfs_buf_t *bp; + xfs_daddr_t bufblks; + xfs_caddr_t buf = NULL; + int error = 0; + + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. + */ + bufblks = 1 << ffs(nbblks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks < log->l_sectBBsize) + return -ENOMEM; + } + + for (i = start_blk; i < start_blk + nbblks; i += bufblks) { + int bcount; + + bcount = min(bufblks, (start_blk + nbblks - i)); + + error = xlog_bread(log, i, bcount, bp, &buf); + if (error) + goto out; + + for (j = 0; j < bcount; j++) { + cycle = xlog_get_cycle(buf); + if (cycle == stop_on_cycle_no) { + *new_blk = i+j; + goto out; + } + + buf += BBSIZE; + } + } + + *new_blk = -1; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Potentially backup over partial log record write. + * + * In the typical case, last_blk is the number of the block directly after + * a good log record. Therefore, we subtract one to get the block number + * of the last block in the given buffer. extra_bblks contains the number + * of blocks we would have read on a previous read. This happens when the + * last log record is split over the end of the physical log. + * + * extra_bblks is the number of blocks potentially verified on a previous + * call to this routine. + */ +STATIC int +xlog_find_verify_log_record( + struct xlog *log, + xfs_daddr_t start_blk, + xfs_daddr_t *last_blk, + int extra_bblks) +{ + xfs_daddr_t i; + xfs_buf_t *bp; + xfs_caddr_t offset = NULL; + xlog_rec_header_t *head = NULL; + int error = 0; + int smallmem = 0; + int num_blks = *last_blk - start_blk; + int xhdrs; + + ASSERT(start_blk != 0 || *last_blk != start_blk); + + if (!(bp = xlog_get_bp(log, num_blks))) { + if (!(bp = xlog_get_bp(log, 1))) + return -ENOMEM; + smallmem = 1; + } else { + error = xlog_bread(log, start_blk, num_blks, bp, &offset); + if (error) + goto out; + offset += ((num_blks - 1) << BBSHIFT); + } + + for (i = (*last_blk) - 1; i >= 0; i--) { + if (i < start_blk) { + /* valid log record not found */ + xfs_warn(log->l_mp, + "Log inconsistent (didn't find previous header)"); + ASSERT(0); + error = -EIO; + goto out; + } + + if (smallmem) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out; + } + + head = (xlog_rec_header_t *)offset; + + if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + break; + + if (!smallmem) + offset -= BBSIZE; + } + + /* + * We hit the beginning of the physical log & still no header. Return + * to caller. If caller can handle a return of -1, then this routine + * will be called again for the end of the physical log. + */ + if (i == -1) { + error = 1; + goto out; + } + + /* + * We have the final block of the good log (the first block + * of the log record _before_ the head. So we check the uuid. + */ + if ((error = xlog_header_check_mount(log->l_mp, head))) + goto out; + + /* + * We may have found a log record header before we expected one. + * last_blk will be the 1st block # with a given cycle #. We may end + * up reading an entire log record. In this case, we don't want to + * reset last_blk. Only when last_blk points in the middle of a log + * record do we update last_blk. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + uint h_size = be32_to_cpu(head->h_size); + + xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + } else { + xhdrs = 1; + } + + if (*last_blk - i + extra_bblks != + BTOBB(be32_to_cpu(head->h_len)) + xhdrs) + *last_blk = i; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Head is defined to be the point of the log where the next log write + * could go. This means that incomplete LR writes at the end are + * eliminated when calculating the head. We aren't guaranteed that previous + * LR have complete transactions. We only know that a cycle number of + * current cycle number -1 won't be present in the log if we start writing + * from our current block number. + * + * last_blk contains the block number of the first block with a given + * cycle number. + * + * Return: zero if normal, non-zero if error. + */ +STATIC int +xlog_find_head( + struct xlog *log, + xfs_daddr_t *return_head_blk) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; + int num_scan_bblks; + uint first_half_cycle, last_half_cycle; + uint stop_on_cycle; + int error, log_bbnum = log->l_logBBsize; + + /* Is the end of the log device zeroed? */ + error = xlog_find_zeroed(log, &first_blk); + if (error < 0) { + xfs_warn(log->l_mp, "empty log check failed"); + return error; + } + if (error == 1) { + *return_head_blk = first_blk; + + /* Is the whole lot zeroed? */ + if (!first_blk) { + /* Linux XFS shouldn't generate totally zeroed logs - + * mkfs etc write a dummy unmount record to a fresh + * log so we can store the uuid in there + */ + xfs_warn(log->l_mp, "totally zeroed log"); + } + + return 0; + } + + first_blk = 0; /* get cycle # of 1st block */ + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto bp_err; + + first_half_cycle = xlog_get_cycle(offset); + + last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ + error = xlog_bread(log, last_blk, 1, bp, &offset); + if (error) + goto bp_err; + + last_half_cycle = xlog_get_cycle(offset); + ASSERT(last_half_cycle != 0); + + /* + * If the 1st half cycle number is equal to the last half cycle number, + * then the entire log is stamped with the same cycle number. In this + * case, head_blk can't be set to zero (which makes sense). The below + * math doesn't work out properly with head_blk equal to zero. Instead, + * we set it to log_bbnum which is an invalid block number, but this + * value makes the math correct. If head_blk doesn't changed through + * all the tests below, *head_blk is set to zero at the very end rather + * than log_bbnum. In a sense, log_bbnum and zero are the same block + * in a circular file. + */ + if (first_half_cycle == last_half_cycle) { + /* + * In this case we believe that the entire log should have + * cycle number last_half_cycle. We need to scan backwards + * from the end verifying that there are no holes still + * containing last_half_cycle - 1. If we find such a hole, + * then the start of that hole will be the new head. The + * simple case looks like + * x | x ... | x - 1 | x + * Another case that fits this picture would be + * x | x + 1 | x ... | x + * In this case the head really is somewhere at the end of the + * log, as one of the latest writes at the beginning was + * incomplete. + * One more case is + * x | x + 1 | x ... | x - 1 | x + * This is really the combination of the above two cases, and + * the head has to end up at the start of the x-1 hole at the + * end of the log. + * + * In the 256k log case, we will read from the beginning to the + * end of the log and search for cycle numbers equal to x-1. + * We don't worry about the x+1 blocks that we encounter, + * because we know that they cannot be the head since the log + * started with x. + */ + head_blk = log_bbnum; + stop_on_cycle = last_half_cycle - 1; + } else { + /* + * In this case we want to find the first block with cycle + * number matching last_half_cycle. We expect the log to be + * some variation on + * x + 1 ... | x ... | x + * The first block with cycle number x (last_half_cycle) will + * be where the new head belongs. First we do a binary search + * for the first occurrence of last_half_cycle. The binary + * search may not be totally accurate, so then we scan back + * from there looking for occurrences of last_half_cycle before + * us. If that backwards scan wraps around the beginning of + * the log, then we look for occurrences of last_half_cycle - 1 + * at the end of the log. The cases we're looking for look + * like + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot + * or + * <---------> less than scan distance + * x + 1 ... | x ... | x - 1 | x + * ^ we want to locate this spot + */ + stop_on_cycle = last_half_cycle; + if ((error = xlog_find_cycle_start(log, bp, first_blk, + &head_blk, last_half_cycle))) + goto bp_err; + } + + /* + * Now validate the answer. Scan back some number of maximum possible + * blocks and make sure each one has the expected cycle number. The + * maximum is determined by the total possible amount of buffering + * in the in-core log. The following number can be made tighter if + * we actually look at the block size of the filesystem. + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + /* + * We are guaranteed that the entire check can be performed + * in one buffer. + */ + start_blk = head_blk - num_scan_bblks; + if ((error = xlog_find_verify_cycle(log, + start_blk, num_scan_bblks, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } else { /* need to read 2 parts of log */ + /* + * We are going to scan backwards in the log in two parts. + * First we scan the physical end of the log. In this part + * of the log, we are looking for blocks with cycle number + * last_half_cycle - 1. + * If we find one, then we know that the log starts there, as + * we've found a hole that didn't get written in going around + * the end of the physical log. The simple case for this is + * x + 1 ... | x ... | x - 1 | x + * <---------> less than scan distance + * If all of the blocks at the end of the log have cycle number + * last_half_cycle, then we check the blocks at the start of + * the log looking for occurrences of last_half_cycle. If we + * find one, then our current estimate for the location of the + * first occurrence of last_half_cycle is wrong and we move + * back to the hole we've found. This case looks like + * x + 1 ... | x | x + 1 | x ... + * ^ binary search stopped here + * Another case we need to handle that only occurs in 256k + * logs is + * x + 1 ... | x ... | x+1 | x ... + * ^ binary search stops here + * In a 256k log, the scan at the end of the log will see the + * x + 1 blocks. We need to skip past those since that is + * certainly not the head of the log. By searching for + * last_half_cycle-1 we accomplish that. + */ + ASSERT(head_blk <= INT_MAX && + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); + if ((error = xlog_find_verify_cycle(log, start_blk, + num_scan_bblks - (int)head_blk, + (stop_on_cycle - 1), &new_blk))) + goto bp_err; + if (new_blk != -1) { + head_blk = new_blk; + goto validate_head; + } + + /* + * Scan beginning of log now. The last part of the physical + * log is good. This scan needs to verify that it doesn't find + * the last_half_cycle. + */ + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_cycle(log, + start_blk, (int)head_blk, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } + +validate_head: + /* + * Now we need to make sure head_blk is not pointing to a block in + * the middle of a log record. + */ + num_scan_bblks = XLOG_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ + + /* start ptr at last block ptr before head_blk */ + error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); + if (error == 1) + error = -EIO; + if (error) + goto bp_err; + } else { + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); + if (error < 0) + goto bp_err; + if (error == 1) { + /* We hit the beginning of the log during our search */ + start_blk = log_bbnum - (num_scan_bblks - head_blk); + new_blk = log_bbnum; + ASSERT(start_blk <= INT_MAX && + (xfs_daddr_t) log_bbnum-start_blk >= 0); + ASSERT(head_blk <= INT_MAX); + error = xlog_find_verify_log_record(log, start_blk, + &new_blk, (int)head_blk); + if (error == 1) + error = -EIO; + if (error) + goto bp_err; + if (new_blk != log_bbnum) + head_blk = new_blk; + } else if (error) + goto bp_err; + } + + xlog_put_bp(bp); + if (head_blk == log_bbnum) + *return_head_blk = 0; + else + *return_head_blk = head_blk; + /* + * When returning here, we have a good block number. Bad block + * means that during a previous crash, we didn't have a clean break + * from cycle number N to cycle number N-1. In this case, we need + * to find the first block with cycle number N-1. + */ + return 0; + + bp_err: + xlog_put_bp(bp); + + if (error) + xfs_warn(log->l_mp, "failed to find log head"); + return error; +} + +/* + * Find the sync block number or the tail of the log. + * + * This will be the block number of the last record to have its + * associated buffers synced to disk. Every log record header has + * a sync lsn embedded in it. LSNs hold block numbers, so it is easy + * to get a sync block number. The only concern is to figure out which + * log record header to believe. + * + * The following algorithm uses the log record header with the largest + * lsn. The entire log record does not need to be valid. We only care + * that the header is valid. + * + * We could speed up search by using current head_blk buffer, but it is not + * available. + */ +STATIC int +xlog_find_tail( + struct xlog *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk) +{ + xlog_rec_header_t *rhead; + xlog_op_header_t *op_head; + xfs_caddr_t offset = NULL; + xfs_buf_t *bp; + int error, i, found; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + xfs_lsn_t tail_lsn; + int hblks; + + found = 0; + + /* + * Find previous log record + */ + if ((error = xlog_find_head(log, head_blk))) + return error; + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + if (*head_blk == 0) { /* special case */ + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto done; + + if (xlog_get_cycle(offset) == 0) { + *tail_blk = 0; + /* leave all other log inited values alone */ + goto done; + } + } + + /* + * Search backwards looking for log record header block + */ + ASSERT(*head_blk < INT_MAX); + for (i = (int)(*head_blk) - 1; i >= 0; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + found = 1; + break; + } + } + /* + * If we haven't found the log record header block, start looking + * again from the end of the physical log. XXXmiken: There should be + * a check here to make sure we didn't search more than N blocks in + * the previous code. + */ + if (!found) { + for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + found = 2; + break; + } + } + } + if (!found) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + xlog_put_bp(bp); + ASSERT(0); + return -EIO; + } + + /* find blk_no of tail of log */ + rhead = (xlog_rec_header_t *)offset; + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = i; + log->l_curr_block = (int)*head_blk; + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); + if (found == 2) + log->l_curr_cycle++; + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + + /* + * Look for unmount record. If we find it, then we know there + * was a clean unmount. Since 'i' could be the last block in + * the physical log, we convert to a log block before comparing + * to the head_blk. + * + * Save the current tail lsn to use to pass to + * xlog_clear_stale_blocks() below. We won't want to clear the + * unmount record if there is one, so we pass the lsn of the + * unmount record rather than the block after it. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); + + if ((h_version & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + } else { + hblks = 1; + } + } else { + hblks = 1; + } + after_umount_blk = (i + hblks + (int) + BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; + tail_lsn = atomic64_read(&log->l_tail_lsn); + if (*head_blk == after_umount_blk && + be32_to_cpu(rhead->h_num_logops) == 1) { + umount_data_blk = (i + hblks) % log->l_logBBsize; + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) + goto done; + + op_head = (xlog_op_header_t *)offset; + if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { + /* + * Set tail and last sync so that newly written + * log records will point recovery to after the + * current unmount record. + */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); + *tail_blk = after_umount_blk; + + /* + * Note that the unmount was clean. If the unmount + * was not clean, we need to know this to rebuild the + * superblock counters from the perag headers if we + * have a filesystem using non-persistent counters. + */ + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + } + } + + /* + * Make sure that there are no blocks in front of the head + * with the same cycle number as the head. This can happen + * because we allow multiple outstanding log writes concurrently, + * and the later writes might make it out before earlier ones. + * + * We use the lsn from before modifying it so that we'll never + * overwrite the unmount record after a clean unmount. + * + * Do this only if we are going to recover the filesystem + * + * NOTE: This used to say "if (!readonly)" + * However on Linux, we can & do recover a read-only filesystem. + * We only skip recovery if NORECOVERY is specified on mount, + * in which case we would not be here. + * + * But... if the -device- itself is readonly, just skip this. + * We can't recover this device anyway, so it won't matter. + */ + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) + error = xlog_clear_stale_blocks(log, tail_lsn); + +done: + xlog_put_bp(bp); + + if (error) + xfs_warn(log->l_mp, "failed to locate log tail"); + return error; +} + +/* + * Is the log zeroed at all? + * + * The last binary search should be changed to perform an X block read + * once X becomes small enough. You can then search linearly through + * the X blocks. This will cut down on the number of reads we need to do. + * + * If the log is partially zeroed, this routine will pass back the blkno + * of the first block with cycle number 0. It won't have a complete LR + * preceding it. + * + * Return: + * 0 => the log is completely written to + * 1 => use *blk_no as the first block of the log + * <0 => error has occurred + */ +STATIC int +xlog_find_zeroed( + struct xlog *log, + xfs_daddr_t *blk_no) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + uint first_cycle, last_cycle; + xfs_daddr_t new_blk, last_blk, start_blk; + xfs_daddr_t num_scan_bblks; + int error, log_bbnum = log->l_logBBsize; + + *blk_no = 0; + + /* check totally zeroed log */ + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto bp_err; + + first_cycle = xlog_get_cycle(offset); + if (first_cycle == 0) { /* completely zeroed log */ + *blk_no = 0; + xlog_put_bp(bp); + return 1; + } + + /* check partially zeroed log */ + error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); + if (error) + goto bp_err; + + last_cycle = xlog_get_cycle(offset); + if (last_cycle != 0) { /* log completely written to */ + xlog_put_bp(bp); + return 0; + } else if (first_cycle != 1) { + /* + * If the cycle of the last block is zero, the cycle of + * the first block must be 1. If it's not, maybe we're + * not looking at a log... Bail out. + */ + xfs_warn(log->l_mp, + "Log inconsistent or not a log (last==0, first!=1)"); + error = -EINVAL; + goto bp_err; + } + + /* we have a partially zeroed log */ + last_blk = log_bbnum-1; + if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) + goto bp_err; + + /* + * Validate the answer. Because there is no way to guarantee that + * the entire log is made up of log records which are the same size, + * we scan over the defined maximum blocks. At this point, the maximum + * is not chosen to mean anything special. XXXmiken + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + ASSERT(num_scan_bblks <= INT_MAX); + + if (last_blk < num_scan_bblks) + num_scan_bblks = last_blk; + start_blk = last_blk - num_scan_bblks; + + /* + * We search for any instances of cycle number 0 that occur before + * our current estimate of the head. What we're trying to detect is + * 1 ... | 0 | 1 | 0... + * ^ binary search ends here + */ + if ((error = xlog_find_verify_cycle(log, start_blk, + (int)num_scan_bblks, 0, &new_blk))) + goto bp_err; + if (new_blk != -1) + last_blk = new_blk; + + /* + * Potentially backup over partial log record write. We don't need + * to search the end of the log because we know it is zero. + */ + error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0); + if (error == 1) + error = -EIO; + if (error) + goto bp_err; + + *blk_no = last_blk; +bp_err: + xlog_put_bp(bp); + if (error) + return error; + return 1; +} + +/* + * These are simple subroutines used by xlog_clear_stale_blocks() below + * to initialize a buffer full of empty log record headers and write + * them into the log. + */ +STATIC void +xlog_add_record( + struct xlog *log, + xfs_caddr_t buf, + int cycle, + int block, + int tail_cycle, + int tail_block) +{ + xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + + memset(buf, 0, BBSIZE); + recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + recp->h_cycle = cpu_to_be32(cycle); + recp->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); + recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); + recp->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); +} + +STATIC int +xlog_write_log_records( + struct xlog *log, + int cycle, + int start_block, + int blocks, + int tail_cycle, + int tail_block) +{ + xfs_caddr_t offset; + xfs_buf_t *bp; + int balign, ealign; + int sectbb = log->l_sectBBsize; + int end_block = start_block + blocks; + int bufblks; + int error = 0; + int i, j = 0; + + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ + bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks < sectbb) + return -ENOMEM; + } + + /* We may need to do a read at the start to fill in part of + * the buffer in the starting sector not covered by the first + * write below. + */ + balign = round_down(start_block, sectbb); + if (balign != start_block) { + error = xlog_bread_noalign(log, start_block, 1, bp); + if (error) + goto out_put_bp; + + j = start_block - balign; + } + + for (i = start_block; i < end_block; i += bufblks) { + int bcount, endcount; + + bcount = min(bufblks, end_block - start_block); + endcount = bcount - j; + + /* We may need to do a read at the end to fill in part of + * the buffer in the final sector not covered by the write. + * If this is the same sector as the above read, skip it. + */ + ealign = round_down(end_block, sectbb); + if (j == 0 && (start_block + endcount > ealign)) { + offset = bp->b_addr + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); + if (error) + break; + + } + + offset = xlog_align(log, start_block, endcount, bp); + for (; j < endcount; j++) { + xlog_add_record(log, offset, cycle, i+j, + tail_cycle, tail_block); + offset += BBSIZE; + } + error = xlog_bwrite(log, start_block, endcount, bp); + if (error) + break; + start_block += endcount; + j = 0; + } + + out_put_bp: + xlog_put_bp(bp); + return error; +} + +/* + * This routine is called to blow away any incomplete log writes out + * in front of the log head. We do this so that we won't become confused + * if we come up, write only a little bit more, and then crash again. + * If we leave the partial log records out there, this situation could + * cause us to think those partial writes are valid blocks since they + * have the current cycle number. We get rid of them by overwriting them + * with empty log records with the old cycle number rather than the + * current one. + * + * The tail lsn is passed in rather than taken from + * the log so that we will not write over the unmount record after a + * clean unmount in a 512 block log. Doing so would leave the log without + * any valid log records in it until a new one was written. If we crashed + * during that time we would not be able to recover. + */ +STATIC int +xlog_clear_stale_blocks( + struct xlog *log, + xfs_lsn_t tail_lsn) +{ + int tail_cycle, head_cycle; + int tail_block, head_block; + int tail_distance, max_distance; + int distance; + int error; + + tail_cycle = CYCLE_LSN(tail_lsn); + tail_block = BLOCK_LSN(tail_lsn); + head_cycle = log->l_curr_cycle; + head_block = log->l_curr_block; + + /* + * Figure out the distance between the new head of the log + * and the tail. We want to write over any blocks beyond the + * head that we may have written just before the crash, but + * we don't want to overwrite the tail of the log. + */ + if (head_cycle == tail_cycle) { + /* + * The tail is behind the head in the physical log, + * so the distance from the head to the tail is the + * distance from the head to the end of the log plus + * the distance from the beginning of the log to the + * tail. + */ + if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { + XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + tail_distance = tail_block + (log->l_logBBsize - head_block); + } else { + /* + * The head is behind the tail in the physical log, + * so the distance from the head to the tail is just + * the tail block minus the head block. + */ + if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ + XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + tail_distance = tail_block - head_block; + } + + /* + * If the head is right up against the tail, we can't clear + * anything. + */ + if (tail_distance <= 0) { + ASSERT(tail_distance == 0); + return 0; + } + + max_distance = XLOG_TOTAL_REC_SHIFT(log); + /* + * Take the smaller of the maximum amount of outstanding I/O + * we could have and the distance to the tail to clear out. + * We take the smaller so that we don't overwrite the tail and + * we don't waste all day writing from the head to the tail + * for no reason. + */ + max_distance = MIN(max_distance, tail_distance); + + if ((head_block + max_distance) <= log->l_logBBsize) { + /* + * We can stomp all the blocks we need to without + * wrapping around the end of the log. Just do it + * in a single write. Use the cycle number of the + * current cycle minus one so that the log will look like: + * n ... | n - 1 ... + */ + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, max_distance, tail_cycle, + tail_block); + if (error) + return error; + } else { + /* + * We need to wrap around the end of the physical log in + * order to clear all the blocks. Do it in two separate + * I/Os. The first write should be from the head to the + * end of the physical log, and it should use the current + * cycle number minus one just like above. + */ + distance = log->l_logBBsize - head_block; + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, distance, tail_cycle, + tail_block); + + if (error) + return error; + + /* + * Now write the blocks at the start of the physical log. + * This writes the remainder of the blocks we want to clear. + * It uses the current cycle number since we're now on the + * same cycle as the head so that we get: + * n ... n ... | n - 1 ... + * ^^^^^ blocks we're writing + */ + distance = max_distance - (log->l_logBBsize - head_block); + error = xlog_write_log_records(log, head_cycle, 0, distance, + tail_cycle, tail_block); + if (error) + return error; + } + + return 0; +} + +/****************************************************************************** + * + * Log recover routines + * + ****************************************************************************** + */ + +/* + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve. + */ +STATIC int +xlog_recover_reorder_trans( + struct xlog *log, + struct xlog_recover *trans, + int pass) +{ + xlog_recover_item_t *item, *n; + int error = 0; + LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); + + list_splice_init(&trans->r_itemq, &sort_list); + list_for_each_entry_safe(item, n, &sort_list, ri_list) { + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + + switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); + break; + case XFS_LI_BUF: + if (buf_f->blf_flags & XFS_BLF_CANCEL) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &cancel_list); + break; + } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); + break; + } + list_move_tail(&item->ri_list, &buffer_list); + break; + case XFS_LI_INODE: + case XFS_LI_DQUOT: + case XFS_LI_QUOTAOFF: + case XFS_LI_EFD: + case XFS_LI_EFI: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &inode_list); + break; + default: + xfs_warn(log->l_mp, + "%s: unrecognized type of log operation", + __func__); + ASSERT(0); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = -EIO; + goto out; + } + } +out: + ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); + return error; +} + +/* + * Build up the table of buf cancel records so that we don't replay + * cancelled data in the second pass. For buffer records that are + * not cancel records, there is nothing to do here so we just return. + * + * If we get a cancel record which is already in the table, this indicates + * that the buffer was cancelled multiple times. In order to ensure + * that during pass 2 we keep the record in the table until we reach its + * last occurrence in the log, we keep a reference count in the cancel + * record in the table to tell us how many times we expect to see this + * record during the second pass. + */ +STATIC int +xlog_recover_buffer_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + /* + * If this isn't a cancel buffer item, then just return. + */ + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); + return 0; + } + + /* + * Insert an xfs_buf_cancel record into the hash table of them. + * If there is already an identical record, bump its reference count. + */ + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); + return 0; + } + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; + bcp->bc_refcount = 1; + list_add_tail(&bcp->bc_list, bucket); + + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; +} + +/* + * Check to see whether the buffer being recovered has a corresponding + * entry in the buffer cancel record table. If it is, return the cancel + * buffer structure to the caller. + */ +STATIC struct xfs_buf_cancel * +xlog_peek_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + if (!log->l_buf_cancel_table) { + /* empty table means no cancelled buffers in the log */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return NULL; + } + + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + return bcp; + } + + /* + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. + */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return NULL; +} + +/* + * If the buffer is being cancelled then return 1 so that it will be cancelled, + * otherwise return 0. If the buffer is actually a buffer cancel item + * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the + * table and remove it from the table if this is the last reference. + * + * We remove the cancel record from the table when we encounter its last + * occurrence in the log so that if the same buffer is re-used again after its + * last cancellation we actually replay the changes made at that point. + */ +STATIC int +xlog_check_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct xfs_buf_cancel *bcp; + + bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); + if (!bcp) + return 0; + + /* + * We've go a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + } + return 1; +} + +/* + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). + * + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. + */ +STATIC int +xlog_recover_do_inode_buffer( + struct xfs_mount *mp, + xlog_recover_item_t *item, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; + int next_unlinked_offset; + int inodes_per_buf; + xfs_agino_t *logged_nextp; + xfs_agino_t *buffer_nextp; + + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; + for (i = 0; i < inodes_per_buf; i++) { + next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + + offsetof(xfs_dinode_t, di_next_unlinked); + + while (next_unlinked_offset >= + (reg_buf_offset + reg_buf_bytes)) { + /* + * The next di_next_unlinked field is beyond + * the current logged region. Find the next + * logged region that contains or is beyond + * the current di_next_unlinked field. + */ + bit += nbits; + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + + /* + * If there are no more logged regions in the + * buffer, then we're done. + */ + if (bit == -1) + return 0; + + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; + item_index++; + } + + /* + * If the current logged region starts after the current + * di_next_unlinked field, then move on to the next + * di_next_unlinked field. + */ + if (next_unlinked_offset < reg_buf_offset) + continue; + + ASSERT(item->ri_buf[item_index].i_addr != NULL); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); + + /* + * The current logged region contains a copy of the + * current di_next_unlinked field. Extract its value + * and copy it to the buffer copy. + */ + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; + if (unlikely(*logged_nextp == 0)) { + xfs_alert(mp, + "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " + "Trying to replay bad (0) inode di_next_unlinked field.", + item, bp); + XFS_ERROR_REPORT("xlog_recover_do_inode_buf", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, + next_unlinked_offset); + *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + + } + + return 0; +} + +/* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; + break; + case XFS_SB_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A notd complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* + * Perform a 'normal' buffer recovery. Each logged region of the + * buffer should be copied over the corresponding region in the + * given buffer. The bitmap in the buf log format structure indicates + * where to place the logged data. + */ +STATIC void +xlog_recover_do_reg_buffer( + struct xfs_mount *mp, + xlog_recover_item_t *item, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int bit; + int nbits; + int error; + + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + + bit = 0; + i = 1; /* 0 is the buf format structure */ + while (1) { + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + if (bit == -1) + break; + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + ASSERT(item->ri_buf[i].i_addr != NULL); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* + * Do a sanity check if this is a dquot buffer. Just checking + * the first dquot in the buffer should do. XXXThis is + * probably a good thing to do for other buf types also. + */ + error = 0; + if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + if (item->ri_buf[i].i_addr == NULL) { + xfs_alert(mp, + "XFS: NULL dquot in %s.", __func__); + goto next; + } + if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(mp, + "XFS: dquot too small (%d) in %s.", + item->ri_buf[i].i_len, __func__); + goto next; + } + error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, + -1, 0, XFS_QMOPT_DOWARN, + "dquot_buf_recover"); + if (error) + goto next; + } + + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLF_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<ri_total); + + xlog_recover_validate_buf_type(mp, bp, buf_f); +} + +/* + * Perform a dquot buffer recovery. + * Simple algorithm: if we have found a QUOTAOFF log item of the same type + * (ie. USR or GRP), then just toss this buffer away; don't recover it. + * Else, treat it as a regular buffer and do recovery. + * + * Return false if the buffer was tossed and true if we recovered the buffer to + * indicate to the caller if the buffer needs writing. + */ +STATIC bool +xlog_recover_do_dquot_buffer( + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) +{ + uint type; + + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (!mp->m_qflags) + return false; + + type = 0; + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) + type |= XFS_DQ_USER; + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) + type |= XFS_DQ_PROJ; + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) + type |= XFS_DQ_GROUP; + /* + * This type of quotas was turned off, so ignore this buffer + */ + if (log->l_quotaoffs_flag & type) + return false; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + return true; +} + +/* + * This routine replays a modification made to a buffer at runtime. + * There are actually two types of buffer, regular and inode, which + * are handled differently. Inode buffers are handled differently + * in that we only recover a specific set of data from them, namely + * the inode di_next_unlinked fields. This is because all other inode + * data is actually logged via inode records and any data we replay + * here which overlaps that may be stale. + * + * When meta-data buffers are freed at run time we log a buffer item + * with the XFS_BLF_CANCEL bit set to indicate that previous copies + * of the buffer in the log should not be replayed at recovery time. + * This is so that if the blocks covered by the buffer are reused for + * file data before we crash we don't end up replaying old, freed + * meta-data into a user's file. + * + * To handle the cancellation of buffer log items, we make two passes + * over the log during recovery. During the first we build a table of + * those buffers which have been cancelled, and during the second we + * only replay those buffers which do not have corresponding cancel + * records in the table. See xlog_recover_buffer_pass[1,2] above + * for more details on the implementation of the table of cancel records. + */ +STATIC int +xlog_recover_buffer_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + int error; + uint buf_flags; + xfs_lsn_t lsn; + + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); + return 0; + } + + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, NULL); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); + goto out_release; + } + + /* + * Recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + * + * Note that we have to be extremely careful of readahead here. + * Readahead does not attach verfiers to the buffers so if we don't + * actually do any replay after readahead because of the LSN we found + * in the buffer if more recent than that current transaction then we + * need to attach the verifier directly. Failure to do so can lead to + * future recovery actions (e.g. EFI and unlinked list recovery) can + * operate on the buffers and they won't get the verifier attached. This + * can lead to blocks on disk having the correct content but a stale + * CRC. + * + * It is safe to assume these clean buffers are currently up to date. + * If the buffer is dirtied by a later transaction being replayed, then + * the verifier will be reset to match whatever recover turns that + * buffer into. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + xlog_recover_validate_buf_type(mp, bp, buf_f); + goto out_release; + } + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + if (error) + goto out_release; + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + bool dirty; + + dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + if (!dirty) + goto out_release; + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + } + + /* + * Perform delayed write on the buffer. Asynchronous writes will be + * slower when taking into account all the buffers to be flushed. + * + * Also make sure that only inode buffers with good sizes stay in + * the buffer cache. The kernel moves inodes in buffers of 1 block + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode + * buffers in the log can be a different size if the log was generated + * by an older kernel using unclustered inode buffers or a newer kernel + * running with a different inode cluster size. Regardless, if the + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep + * the buffer out of the buffer cache so that the buffer won't + * overlap with future reads of those inodes. + */ + if (XFS_DINODE_MAGIC == + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)log->l_mp->m_inode_cluster_size))) { + xfs_buf_stale(bp); + error = xfs_bwrite(bp); + } else { + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + } + +out_release: + xfs_buf_relse(bp); + return error; +} + +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return -ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + +STATIC int +xlog_recover_inode_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + xfs_inode_log_format_t *in_f; + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + xfs_dinode_t *dip; + int len; + xfs_caddr_t src; + xfs_caddr_t dest; + int error; + int attr_index; + uint fields; + xfs_icdinode_t *dicp; + uint isize; + int need_free = 0; + + if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { + in_f = item->ri_buf[0].i_addr; + } else { + in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); + need_free = 1; + error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); + if (error) + goto error; + } + + /* + * Inode buffers can be freed, look out for it, + * and do not replay the inode. + */ + if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, + in_f->ilf_len, 0)) { + error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); + goto error; + } + trace_xfs_log_recover_inode_recover(log, in_f); + + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + &xfs_inode_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error; + } + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); + goto out_release; + } + ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); + dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + + /* + * Make sure the place we're flushing out to really looks + * like an inode! + */ + if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + xfs_alert(mp, + "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_release; + } + dicp = item->ri_buf[1].i_addr; + if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + __func__, item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_release; + } + + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } + } + + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_hascrc(&mp->m_sb) && + dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } + } + + /* Take the opportunity to reset the flush iteration count */ + dicp->di_flushiter = 0; + + if (unlikely(S_ISREG(dicp->di_mode))) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } else if (unlikely(S_ISDIR(dicp->di_mode))) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE) && + (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } + if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, + dicp->di_nextents + dicp->di_anextents, + dicp->di_nblocks); + error = -EFSCORRUPTED; + goto out_release; + } + if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); + error = -EFSCORRUPTED; + goto out_release; + } + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr 0x%p", + __func__, item->ri_buf[1].i_len, item); + error = -EFSCORRUPTED; + goto out_release; + } + + /* The core is in in-core format */ + xfs_dinode_to_disk(dip, dicp); + + /* the rest is in on-disk format */ + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); + } + + fields = in_f->ilf_fields; + switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { + case XFS_ILOG_DEV: + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); + break; + case XFS_ILOG_UUID: + memcpy(XFS_DFORK_DPTR(dip), + &in_f->ilf_u.ilfu_uuid, + sizeof(uuid_t)); + break; + } + + if (in_f->ilf_size == 2) + goto out_owner_change; + len = item->ri_buf[2].i_len; + src = item->ri_buf[2].i_addr; + ASSERT(in_f->ilf_size <= 4); + ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); + ASSERT(!(fields & XFS_ILOG_DFORK) || + (len == in_f->ilf_dsize)); + + switch (fields & XFS_ILOG_DFORK) { + case XFS_ILOG_DDATA: + case XFS_ILOG_DEXT: + memcpy(XFS_DFORK_DPTR(dip), src, len); + break; + + case XFS_ILOG_DBROOT: + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), + XFS_DFORK_DSIZE(dip, mp)); + break; + + default: + /* + * There are no data fork flags set. + */ + ASSERT((fields & XFS_ILOG_DFORK) == 0); + break; + } + + /* + * If we logged any attribute data, recover it. There may or + * may not have been any other non-core data logged in this + * transaction. + */ + if (in_f->ilf_fields & XFS_ILOG_AFORK) { + if (in_f->ilf_fields & XFS_ILOG_DFORK) { + attr_index = 3; + } else { + attr_index = 2; + } + len = item->ri_buf[attr_index].i_len; + src = item->ri_buf[attr_index].i_addr; + ASSERT(len == in_f->ilf_asize); + + switch (in_f->ilf_fields & XFS_ILOG_AFORK) { + case XFS_ILOG_ADATA: + case XFS_ILOG_AEXT: + dest = XFS_DFORK_APTR(dip); + ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); + memcpy(dest, src, len); + break; + + case XFS_ILOG_ABROOT: + dest = XFS_DFORK_APTR(dip); + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (xfs_bmdr_block_t*)dest, + XFS_DFORK_ASIZE(dip, mp)); + break; + + default: + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); + ASSERT(0); + error = -EIO; + goto out_release; + } + } + +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); +error: + if (need_free) + kmem_free(in_f); + return error; +} + +/* + * Recover QUOTAOFF records. We simply make a note of it in the xlog + * structure, so that we know not to do any dquot item or dquot buffer recovery, + * of that type. + */ +STATIC int +xlog_recover_quotaoff_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; + ASSERT(qoff_f); + + /* + * The logitem format's flag tells us if this was user quotaoff, + * group/project quotaoff or both. + */ + if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_PROJ; + if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_GROUP; + + return 0; +} + +/* + * Recover a dquot record + */ +STATIC int +xlog_recover_dquot_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + struct xfs_disk_dquot *ddq, *recddq; + int error; + xfs_dq_logformat_t *dq_f; + uint type; + + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) + return 0; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); + return -EIO; + } + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", + item->ri_buf[1].i_len, __func__); + return -EIO; + } + + /* + * This type of quotas was turned off, so ignore this record. + */ + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return 0; + + /* + * At this point we know that quota was _not_ turned off. + * Since the mount flags are not indicating to us otherwise, this + * must mean that quota is on, and the dquot needs to be replayed. + * Remember that we may not have fully recovered the superblock yet, + * so we can't do the usual trick of looking at the SB quota bits. + * + * The other possibility, of course, is that the quota subsystem was + * removed since the last mount - ENOSYS. + */ + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2 (log copy)"); + if (error) + return -EIO; + ASSERT(dq_f->qlf_len == 1); + + /* + * At this point we are assuming that the dquots have been allocated + * and hence the buffer has valid dquots stamped in it. It should, + * therefore, pass verifier validation. If the dquot is bad, then the + * we'll return an error here, so we don't need to specifically check + * the dquot in the buffer after the verifier has run. + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + &xfs_dquot_buf_ops); + if (error) + return error; + + ASSERT(bp); + ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + ASSERT(dq_f->qlf_size == 2); + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); + return 0; +} + +/* + * This routine is called to create an in-core extent free intent + * item from the efi format structure which was logged on disk. + * It allocates an in-core efi, copies the extents from the format + * structure into it, and adds the efi to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_efi_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + xfs_mount_t *mp = log->l_mp; + xfs_efi_log_item_t *efip; + xfs_efi_log_format_t *efi_formatp; + + efi_formatp = item->ri_buf[0].i_addr; + + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), + &(efip->efi_format)))) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + return 0; +} + + +/* + * This routine is called when an efd format structure is found in + * a committed transaction in the log. It's purpose is to cancel + * the corresponding efi if it was still in the log. To do this + * it searches the AIL for the efi with an id equal to that in the + * efd format structure. If we find it, we remove the efi from the + * AIL and free it. + */ +STATIC int +xlog_recover_efd_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + xfs_efd_log_format_t *efd_formatp; + xfs_efi_log_item_t *efip = NULL; + xfs_log_item_t *lip; + __uint64_t efi_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + efd_formatp = item->ri_buf[0].i_addr; + ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || + (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + efi_id = efd_formatp->efd_efi_id; + + /* + * Search for the efi with the id in the efd format structure + * in the AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_EFI) { + efip = (xfs_efi_log_item_t *)lip; + if (efip->efi_format.efi_id == efi_id) { + /* + * xfs_trans_ail_delete() drops the + * AIL lock. + */ + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); + xfs_efi_item_free(efip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be intialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. + */ +STATIC int +xlog_recover_do_icreate_pass2( + struct xlog *log, + struct list_head *buffer_list, + xlog_recover_item_t *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return -EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return -EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return -EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return -EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return -EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return -EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return -EINVAL; + } + + /* existing allocation is fixed value */ + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + return -EINVAL; + } + + /* + * Inode buffers can be freed. Do not replay the inode initialisation as + * we could be overwriting something written after this inode buffer was + * cancelled. + * + * XXX: we need to iterate all buffers and only init those that are not + * cancelled. I think that a more fine grained factoring of + * xfs_ialloc_inode_init may be appropriate here to enable this to be + * done easily. + */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); + return 0; +} + +STATIC void +xlog_recover_buffer_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + + if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + return; + } + + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, + buf_f->blf_len, NULL); +} + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_inode_log_format ilf_buf; + struct xfs_inode_log_format *ilfp; + struct xfs_mount *mp = log->l_mp; + int error; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + ilfp = item->ri_buf[0].i_addr; + } else { + ilfp = &ilf_buf; + memset(ilfp, 0, sizeof(*ilfp)); + error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); + if (error) + return; + } + + if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, + ilfp->ilf_len, &xfs_inode_buf_ra_ops); +} + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); +} + +STATIC void +xlog_recover_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + xlog_recover_buffer_ra_pass2(log, item); + break; + case XFS_LI_INODE: + xlog_recover_inode_ra_pass2(log, item); + break; + case XFS_LI_DQUOT: + xlog_recover_dquot_ra_pass2(log, item); + break; + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_QUOTAOFF: + default: + break; + } +} + +STATIC int +xlog_recover_commit_pass1( + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + case XFS_LI_ICREATE: + /* nothing to do in pass 1 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return -EIO; + } +} + +STATIC int +xlog_recover_commit_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return -EIO; + } +} + +STATIC int +xlog_recover_items_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct list_head *item_list) +{ + struct xlog_recover_item *item; + int error = 0; + + list_for_each_entry(item, item_list, ri_list) { + error = xlog_recover_commit_pass2(log, trans, + buffer_list, item); + if (error) + return error; + } + + return error; +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. + */ +STATIC int +xlog_recover_commit_trans( + struct xlog *log, + struct xlog_recover *trans, + int pass) +{ + int error = 0; + int error2; + int items_queued = 0; + struct xlog_recover_item *item; + struct xlog_recover_item *next; + LIST_HEAD (buffer_list); + LIST_HEAD (ra_list); + LIST_HEAD (done_list); + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 + + hlist_del(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) + return error; + + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { + switch (pass) { + case XLOG_RECOVER_PASS1: + error = xlog_recover_commit_pass1(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + xlog_recover_ra_pass2(log, item); + list_move_tail(&item->ri_list, &ra_list); + items_queued++; + if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + items_queued = 0; + } + + break; + default: + ASSERT(0); + } + + if (error) + goto out; + } + +out: + if (!list_empty(&ra_list)) { + if (!error) + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + } + + if (!list_empty(&done_list)) + list_splice_init(&done_list, &trans->r_itemq); + + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? error : error2; +} + +STATIC void +xlog_recover_add_item( + struct list_head *head) +{ + xlog_recover_item_t *item; + + item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); +} + +STATIC int +xlog_recover_add_to_cont_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + int len) +{ + xlog_recover_item_t *item; + xfs_caddr_t ptr, old_ptr; + int old_len; + + if (list_empty(&trans->r_itemq)) { + /* finish copying rest of trans header */ + xlog_recover_add_item(&trans->r_itemq); + ptr = (xfs_caddr_t) &trans->r_theader + + sizeof(xfs_trans_header_t) - len; + memcpy(ptr, dp, len); + return 0; + } + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + + old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; + old_len = item->ri_buf[item->ri_cnt-1].i_len; + + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); + memcpy(&ptr[old_len], dp, len); + item->ri_buf[item->ri_cnt-1].i_len += len; + item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); + return 0; +} + +/* + * The next region to add is the start of a new region. It could be + * a whole region or it could be the first part of a new region. Because + * of this, the assumption here is that the type and size fields of all + * format structures fit into the first 32 bits of the structure. + * + * This works because all regions must be 32 bit aligned. Therefore, we + * either have both fields or we have neither field. In the case we have + * neither field, the data part of the region is zero length. We only have + * a log_op_header and can throw away the header since a new one will appear + * later. If we have at least 4 bytes, then we can determine how many regions + * will appear in the current log item. + */ +STATIC int +xlog_recover_add_to_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + int len) +{ + xfs_inode_log_format_t *in_f; /* any will do */ + xlog_recover_item_t *item; + xfs_caddr_t ptr; + + if (!len) + return 0; + if (list_empty(&trans->r_itemq)) { + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); + ASSERT(0); + return -EIO; + } + if (len == sizeof(xfs_trans_header_t)) + xlog_recover_add_item(&trans->r_itemq); + memcpy(&trans->r_theader, dp, len); + return 0; + } + + ptr = kmem_alloc(len, KM_SLEEP); + memcpy(ptr, dp, len); + in_f = (xfs_inode_log_format_t *)ptr; + + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ + xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); + } + + if (item->ri_total == 0) { /* first region to be added */ + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + kmem_free(ptr); + return -EIO; + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + ASSERT(item->ri_total > item->ri_cnt); + /* Description region is ri_buf[0] */ + item->ri_buf[item->ri_cnt].i_addr = ptr; + item->ri_buf[item->ri_cnt].i_len = len; + item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); + return 0; +} + +/* + * Free up any resources allocated by the transaction + * + * Remember that EFIs, EFDs, and IUNLINKs are handled later. + */ +STATIC void +xlog_recover_free_trans( + struct xlog_recover *trans) +{ + xlog_recover_item_t *item, *n; + int i; + + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); + /* Free the item itself */ + kmem_free(item->ri_buf); + kmem_free(item); + } + /* Free the transaction recover structure */ + kmem_free(trans); +} + +/* + * On error or completion, trans is freed. + */ +STATIC int +xlog_recovery_process_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + unsigned int len, + unsigned int flags, + int pass) +{ + int error = 0; + bool freeit = false; + + /* mask off ophdr transaction container flags */ + flags &= ~XLOG_END_TRANS; + if (flags & XLOG_WAS_CONT_TRANS) + flags &= ~XLOG_CONTINUE_TRANS; + + /* + * Callees must not free the trans structure. We'll decide if we need to + * free it or not based on the operation being done and it's result. + */ + switch (flags) { + /* expected flag values */ + case 0: + case XLOG_CONTINUE_TRANS: + error = xlog_recover_add_to_trans(log, trans, dp, len); + break; + case XLOG_WAS_CONT_TRANS: + error = xlog_recover_add_to_cont_trans(log, trans, dp, len); + break; + case XLOG_COMMIT_TRANS: + error = xlog_recover_commit_trans(log, trans, pass); + /* success or fail, we are now done with this transaction. */ + freeit = true; + break; + + /* unexpected flag values */ + case XLOG_UNMOUNT_TRANS: + /* just skip trans */ + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); + freeit = true; + break; + case XLOG_START_TRANS: + default: + xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); + ASSERT(0); + error = -EIO; + break; + } + if (error || freeit) + xlog_recover_free_trans(trans); + return error; +} + +/* + * Lookup the transaction recovery structure associated with the ID in the + * current ophdr. If the transaction doesn't exist and the start flag is set in + * the ophdr, then allocate a new transaction for future ID matches to find. + * Either way, return what we found during the lookup - an existing transaction + * or nothing. + */ +STATIC struct xlog_recover * +xlog_recover_ophdr_to_trans( + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + struct xlog_op_header *ohead) +{ + struct xlog_recover *trans; + xlog_tid_t tid; + struct hlist_head *rhp; + + tid = be32_to_cpu(ohead->oh_tid); + rhp = &rhash[XLOG_RHASH(tid)]; + hlist_for_each_entry(trans, rhp, r_list) { + if (trans->r_log_tid == tid) + return trans; + } + + /* + * skip over non-start transaction headers - we could be + * processing slack space before the next transaction starts + */ + if (!(ohead->oh_flags & XLOG_START_TRANS)) + return NULL; + + ASSERT(be32_to_cpu(ohead->oh_len) == 0); + + /* + * This is a new transaction so allocate a new recovery container to + * hold the recovery ops that will follow. + */ + trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = be64_to_cpu(rhead->h_lsn); + INIT_LIST_HEAD(&trans->r_itemq); + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, rhp); + + /* + * Nothing more to do for this ophdr. Items to be added to this new + * transaction will be in subsequent ophdr containers. + */ + return NULL; +} + +STATIC int +xlog_recover_process_ophdr( + struct xlog *log, + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + struct xlog_op_header *ohead, + xfs_caddr_t dp, + xfs_caddr_t end, + int pass) +{ + struct xlog_recover *trans; + unsigned int len; + + /* Do we understand who wrote this op? */ + if (ohead->oh_clientid != XFS_TRANSACTION && + ohead->oh_clientid != XFS_LOG) { + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); + ASSERT(0); + return -EIO; + } + + /* + * Check the ophdr contains all the data it is supposed to contain. + */ + len = be32_to_cpu(ohead->oh_len); + if (dp + len > end) { + xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); + WARN_ON(1); + return -EIO; + } + + trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); + if (!trans) { + /* nothing to do, so skip over this ophdr */ + return 0; + } + + return xlog_recovery_process_trans(log, trans, dp, len, + ohead->oh_flags, pass); +} + +/* + * There are two valid states of the r_state field. 0 indicates that the + * transaction structure is in a normal state. We have either seen the + * start of the transaction or the last operation we added was not a partial + * operation. If the last operation we added to the transaction was a + * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. + * + * NOTE: skip LRs with 0 data length. + */ +STATIC int +xlog_recover_process_data( + struct xlog *log, + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + int pass) +{ + struct xlog_op_header *ohead; + xfs_caddr_t end; + int num_logops; + int error; + + end = dp + be32_to_cpu(rhead->h_len); + num_logops = be32_to_cpu(rhead->h_num_logops); + + /* check the log format matches our own - else we can't recover */ + if (xlog_header_check_recover(log->l_mp, rhead)) + return -EIO; + + while ((dp < end) && num_logops) { + + ohead = (struct xlog_op_header *)dp; + dp += sizeof(*ohead); + ASSERT(dp <= end); + + /* errors will abort recovery */ + error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, + dp, end, pass); + if (error) + return error; + + dp += be32_to_cpu(ohead->oh_len); + num_logops--; + } + return 0; +} + +/* + * Process an extent free intent item that was recovered from + * the log. We need to free the extents that it describes. + */ +STATIC int +xlog_recover_process_efi( + xfs_mount_t *mp, + xfs_efi_log_item_t *efip) +{ + xfs_efd_log_item_t *efdp; + xfs_trans_t *tp; + int i; + int error = 0; + xfs_extent_t *extp; + xfs_fsblock_t startblock_fsb; + + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + + /* + * First check the validity of the extents described by the + * EFI. If any are bad, then assume that all are bad and + * just toss the EFI. + */ + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, extp->ext_start)); + if ((startblock_fsb == 0) || + (extp->ext_len == 0) || + (startblock_fsb >= mp->m_sb.sb_dblocks) || + (extp->ext_len >= mp->m_sb.sb_agblocks)) { + /* + * This will pull the EFI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + xfs_efi_release(efip, efip->efi_format.efi_nextents); + return -EIO; + } + } + + tp = xfs_trans_alloc(mp, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) + goto abort_error; + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + if (error) + goto abort_error; + xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, + extp->ext_len); + } + + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + error = xfs_trans_commit(tp, 0); + return error; + +abort_error: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; +} + +/* + * When this is called, all of the EFIs which did not have + * corresponding EFDs should be in the AIL. What we do now + * is free the extents associated with each one. + * + * Since we process the EFIs in normal transactions, they + * will be removed at some point after the commit. This prevents + * us from just walking down the list processing each one. + * We'll use a flag in the EFI to skip those that we've already + * processed and use the AIL iteration mechanism's generation + * count to try to speed this up at least a bit. + * + * When we start, we know that the EFIs are the only things in + * the AIL. As we process them, however, other items are added + * to the AIL. Since everything added to the AIL must come after + * everything already in the AIL, we stop processing as soon as + * we see something other than an EFI in the AIL. + */ +STATIC int +xlog_recover_process_efis( + struct xlog *log) +{ + xfs_log_item_t *lip; + xfs_efi_log_item_t *efip; + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; + + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + /* + * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. + */ + if (lip->li_type != XFS_LI_EFI) { +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif + break; + } + + /* + * Skip EFIs that we've already processed. + */ + efip = (xfs_efi_log_item_t *)lip; + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { + lip = xfs_trans_ail_cursor_next(ailp, &cur); + continue; + } + + spin_unlock(&ailp->xa_lock); + error = xlog_recover_process_efi(log->l_mp, efip); + spin_lock(&ailp->xa_lock); + if (error) + goto out; + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } +out: + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + return error; +} + +/* + * This routine performs a transaction to null out a bad inode pointer + * in an agi unlinked inode hash bucket. + */ +STATIC void +xlog_recover_clear_agi_bucket( + xfs_mount_t *mp, + xfs_agnumber_t agno, + int bucket) +{ + xfs_trans_t *tp; + xfs_agi_t *agi; + xfs_buf_t *agibp; + int offset; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); + if (error) + goto out_abort; + + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + goto out_abort; + + agi = XFS_BUF_TO_AGI(agibp); + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + + error = xfs_trans_commit(tp, 0); + if (error) + goto out_error; + return; + +out_abort: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); +out_error: + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); + return; +} + +STATIC xfs_agino_t +xlog_recover_process_one_iunlink( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino, + int bucket) +{ + struct xfs_buf *ibp; + struct xfs_dinode *dip; + struct xfs_inode *ip; + xfs_ino_t ino; + int error; + + ino = XFS_AGINO_TO_INO(mp, agno, agino); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + goto fail; + + /* + * Get the on disk inode to find the next inode in the bucket. + */ + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); + if (error) + goto fail_iput; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + + /* setup for the next pass */ + agino = be32_to_cpu(dip->di_next_unlinked); + xfs_buf_relse(ibp); + + /* + * Prevent any DMAPI event from being sent when the reference on + * the inode is dropped. + */ + ip->i_d.di_dmevmask = 0; + + IRELE(ip); + return agino; + + fail_iput: + IRELE(ip); + fail: + /* + * We can't read in the inode this bucket points to, or this inode + * is messed up. Just ditch this bucket of inodes. We will lose + * some inodes and space, but at least we won't hang. + * + * Call xlog_recover_clear_agi_bucket() to perform a transaction to + * clear the inode pointer in the bucket. + */ + xlog_recover_clear_agi_bucket(mp, agno, bucket); + return NULLAGINO; +} + +/* + * xlog_iunlink_recover + * + * This is called during recovery to process any inodes which + * we unlinked but not freed when the system crashed. These + * inodes will be on the lists in the AGI blocks. What we do + * here is scan all the AGIs and fully truncate and free any + * inodes found on the lists. Each inode is removed from the + * lists when it has been fully truncated and is freed. The + * freeing of the inode and its removal from the list must be + * atomic. + */ +STATIC void +xlog_recover_process_iunlinks( + struct xlog *log) +{ + xfs_mount_t *mp; + xfs_agnumber_t agno; + xfs_agi_t *agi; + xfs_buf_t *agibp; + xfs_agino_t agino; + int bucket; + int error; + uint mp_dmevmask; + + mp = log->l_mp; + + /* + * Prevent any DMAPI event from being sent while in this function. + */ + mp_dmevmask = mp->m_dmevmask; + mp->m_dmevmask = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + /* + * Find the agi for this ag. + */ + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + /* + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt + * after we've recovered all the ag's we can.... + */ + continue; + } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ + agi = XFS_BUF_TO_AGI(agibp); + xfs_buf_unlock(agibp); + + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { + agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (agino != NULLAGINO) { + agino = xlog_recover_process_one_iunlink(mp, + agno, agino, bucket); + } + } + xfs_buf_rele(agibp); + } + + mp->m_dmevmask = mp_dmevmask; +} + +/* + * Upack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default. + * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure + */ +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) +{ + __le32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.", + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); + xfs_hex_dump(dp, 32); + } + + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return -EFSCORRUPTED; + } + + return 0; +} + +STATIC int +xlog_unpack_data( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) +{ + int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; + + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + dp += BBSIZE; + } + } + + return 0; +} + +STATIC int +xlog_valid_rec_header( + struct xlog *log, + struct xlog_rec_header *rhead, + xfs_daddr_t blkno) +{ + int hlen; + + if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { + XFS_ERROR_REPORT("xlog_valid_rec_header(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + if (unlikely( + (!rhead->h_version || + (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { + xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", + __func__, be32_to_cpu(rhead->h_version)); + return -EIO; + } + + /* LR body must have data or it wouldn't have been written */ + hlen = be32_to_cpu(rhead->h_len); + if (unlikely( hlen <= 0 || hlen > INT_MAX )) { + XFS_ERROR_REPORT("xlog_valid_rec_header(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { + XFS_ERROR_REPORT("xlog_valid_rec_header(3)", + XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + return 0; +} + +/* + * Read the log from tail to head and process the log records found. + * Handle the two cases where the tail and head are in the same cycle + * and where the active portion of the log wraps around the end of + * the physical log separately. The pass parameter is passed through + * to the routines called to process the data and is not looked at + * here. + */ +STATIC int +xlog_do_recovery_pass( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int pass) +{ + xlog_rec_header_t *rhead; + xfs_daddr_t blk_no; + xfs_caddr_t offset; + xfs_buf_t *hbp, *dbp; + int error = 0, h_size; + int bblks, split_bblks; + int hblks, split_hblks, wrapped_hblks; + struct hlist_head rhash[XLOG_RHASH_SIZE]; + + ASSERT(head_blk != tail_blk); + + /* + * Read the header of the tail block and get the iclog buffer size from + * h_size. Use this to tell how many sectors make up the log header. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + /* + * When using variable length iclogs, read first sector of + * iclog header and extract the header size from it. Get a + * new hbp that is the correct size. + */ + hbp = xlog_get_bp(log, 1); + if (!hbp) + return -ENOMEM; + + error = xlog_bread(log, tail_blk, 1, hbp, &offset); + if (error) + goto bread_err1; + + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, tail_blk); + if (error) + goto bread_err1; + h_size = be32_to_cpu(rhead->h_size); + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + xlog_put_bp(hbp); + hbp = xlog_get_bp(log, hblks); + } else { + hblks = 1; + } + } else { + ASSERT(log->l_sectBBsize == 1); + hblks = 1; + hbp = xlog_get_bp(log, 1); + h_size = XLOG_BIG_RECORD_BSIZE; + } + + if (!hbp) + return -ENOMEM; + dbp = xlog_get_bp(log, BTOBB(h_size)); + if (!dbp) { + xlog_put_bp(hbp); + return -ENOMEM; + } + + memset(rhash, 0, sizeof(rhash)); + blk_no = tail_blk; + if (tail_blk > head_blk) { + /* + * Perform recovery around the end of the physical log. + * When the head is not on the same cycle number as the tail, + * we can't do a sequential recovery. + */ + while (blk_no < log->l_logBBsize) { + /* + * Check for header wrapping around physical end-of-log + */ + offset = hbp->b_addr; + split_hblks = 0; + wrapped_hblks = 0; + if (blk_no + hblks <= log->l_logBBsize) { + /* Read header in one read */ + error = xlog_bread(log, blk_no, hblks, hbp, + &offset); + if (error) + goto bread_err2; + } else { + /* This LR is split across physical log end */ + if (blk_no != log->l_logBBsize) { + /* some data before physical log end */ + ASSERT(blk_no <= INT_MAX); + split_hblks = log->l_logBBsize - (int)blk_no; + ASSERT(split_hblks > 0); + error = xlog_bread(log, blk_no, + split_hblks, hbp, + &offset); + if (error) + goto bread_err2; + } + + /* + * Note: this black magic still works with + * large sector sizes (non-512) only because: + * - we increased the buffer size originally + * by 1 sector giving us enough extra space + * for the second read; + * - the log start is guaranteed to be sector + * aligned; + * - we read the log end (LR header start) + * _first_, then the log start (LR header end) + * - order is important. + */ + wrapped_hblks = hblks - split_hblks; + error = xlog_bread_offset(log, 0, + wrapped_hblks, hbp, + offset + BBTOB(split_hblks)); + if (error) + goto bread_err2; + } + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, + split_hblks ? blk_no : 0); + if (error) + goto bread_err2; + + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + blk_no += hblks; + + /* Read in data for log record */ + if (blk_no + bblks <= log->l_logBBsize) { + error = xlog_bread(log, blk_no, bblks, dbp, + &offset); + if (error) + goto bread_err2; + } else { + /* This log record is split across the + * physical end of log */ + offset = dbp->b_addr; + split_bblks = 0; + if (blk_no != log->l_logBBsize) { + /* some data is before the physical + * end of log */ + ASSERT(!wrapped_hblks); + ASSERT(blk_no <= INT_MAX); + split_bblks = + log->l_logBBsize - (int)blk_no; + ASSERT(split_bblks > 0); + error = xlog_bread(log, blk_no, + split_bblks, dbp, + &offset); + if (error) + goto bread_err2; + } + + /* + * Note: this black magic still works with + * large sector sizes (non-512) only because: + * - we increased the buffer size originally + * by 1 sector giving us enough extra space + * for the second read; + * - the log start is guaranteed to be sector + * aligned; + * - we read the log end (LR header start) + * _first_, then the log start (LR header end) + * - order is important. + */ + error = xlog_bread_offset(log, 0, + bblks - split_bblks, dbp, + offset + BBTOB(split_bblks)); + if (error) + goto bread_err2; + } + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) + goto bread_err2; + blk_no += bblks; + } + + ASSERT(blk_no >= log->l_logBBsize); + blk_no -= log->l_logBBsize; + } + + /* read first part of physical log */ + while (blk_no < head_blk) { + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) + goto bread_err2; + + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; + + /* blocks in data section */ + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no+hblks, bblks, dbp, + &offset); + if (error) + goto bread_err2; + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) + goto bread_err2; + blk_no += bblks + hblks; + } + + bread_err2: + xlog_put_bp(dbp); + bread_err1: + xlog_put_bp(hbp); + return error; +} + +/* + * Do the recovery of the log. We actually do this in two phases. + * The two passes are necessary in order to implement the function + * of cancelling a record written into the log. The first pass + * determines those things which have been cancelled, and the + * second pass replays log items normally except for those which + * have been cancelled. The handling of the replay and cancellations + * takes place in the log item type specific routines. + * + * The table of items which have cancel records in the log is allocated + * and freed at this level, since only here do we know when all of + * the log recovery has been completed. + */ +STATIC int +xlog_do_log_recovery( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + int error, i; + + ASSERT(head_blk != tail_blk); + + /* + * First do a pass to find all of the cancelled buf log items. + * Store them in the buf_cancel_table for use in the second pass. + */ + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), + KM_SLEEP); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + error = xlog_do_recovery_pass(log, head_blk, tail_blk, + XLOG_RECOVER_PASS1); + if (error != 0) { + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; + return error; + } + /* + * Then do a second pass to actually recover the items in the log. + * When it is complete free the table of buf cancel items. + */ + error = xlog_do_recovery_pass(log, head_blk, tail_blk, + XLOG_RECOVER_PASS2); +#ifdef DEBUG + if (!error) { + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); + } +#endif /* DEBUG */ + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; + + return error; +} + +/* + * Do the actual recovery + */ +STATIC int +xlog_do_recover( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + int error; + xfs_buf_t *bp; + xfs_sb_t *sbp; + + /* + * First replay the images in the log. + */ + error = xlog_do_log_recovery(log, head_blk, tail_blk); + if (error) + return error; + + /* + * If IO errors happened during recovery, bail out. + */ + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + return -EIO; + } + + /* + * We now update the tail_lsn since much of the recovery has completed + * and there may be space available to use. If there were no extent + * or iunlinks, we can free up the entire log and set the tail_lsn to + * be the last_sync_lsn. This was set in xlog_find_tail to be the + * lsn of the last known good LR on disk. If there are extent frees + * or iunlinks they will have some entries in the AIL; so we look at + * the AIL to determine how to set the tail_lsn. + */ + xlog_assign_tail_lsn(log->l_mp); + + /* + * Now that we've finished replaying all buffer and inode + * updates, re-read in the superblock and reverify it. + */ + bp = xfs_getsb(log->l_mp, 0); + XFS_BUF_UNDONE(bp); + ASSERT(!(XFS_BUF_ISWRITE(bp))); + XFS_BUF_READ(bp); + XFS_BUF_UNASYNC(bp); + bp->b_ops = &xfs_sb_buf_ops; + + error = xfs_buf_submit_wait(bp); + if (error) { + if (!XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_ioerror_alert(bp, __func__); + ASSERT(0); + } + xfs_buf_relse(bp); + return error; + } + + /* Convert superblock from on-disk format */ + sbp = &log->l_mp->m_sb; + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); + ASSERT(xfs_sb_good_version(sbp)); + xfs_reinit_percpu_counters(log->l_mp); + + xfs_buf_relse(bp); + + + xlog_recover_check_summary(log); + + /* Normal transactions can now occur */ + log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + return 0; +} + +/* + * Perform recovery and re-initialize some log variables in xlog_find_tail. + * + * Return error or zero. + */ +int +xlog_recover( + struct xlog *log) +{ + xfs_daddr_t head_blk, tail_blk; + int error; + + /* find the tail of the log */ + if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) + return error; + + if (tail_blk != head_blk) { + /* There used to be a comment here: + * + * disallow recovery on read-only mounts. note -- mount + * checks for ENOSPC and turns it into an intelligent + * error message. + * ...but this is no longer true. Now, unless you specify + * NORECOVERY (in which case this function would never be + * called), we just go ahead and recover. We do this all + * under the vfs layer, so we can get away with it unless + * the device itself is read-only, in which case we fail. + */ + if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { + return error; + } + + /* + * Version 5 superblock log feature mask validation. We know the + * log is dirty so check if there are any unknown log features + * in what we need to recover. If there are unknown features + * (e.g. unsupported transactions, then simply reject the + * attempt at recovery before touching anything. + */ + if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { + xfs_warn(log->l_mp, +"Superblock has unknown incompatible log features (0x%x) enabled.\n" +"The log can not be fully and/or safely recovered by this kernel.\n" +"Please recover the log on a kernel that supports the unknown features.", + (log->l_mp->m_sb.sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + return -EINVAL; + } + + /* + * Delay log recovery if the debug hook is set. This is debug + * instrumention to coordinate simulation of I/O failures with + * log recovery. + */ + if (xfs_globals.log_recovery_delay) { + xfs_notice(log->l_mp, + "Delaying log recovery for %d seconds.", + xfs_globals.log_recovery_delay); + msleep(xfs_globals.log_recovery_delay * 1000); + } + + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); + + error = xlog_do_recover(log, head_blk, tail_blk); + log->l_flags |= XLOG_RECOVERY_NEEDED; + } + return error; +} + +/* + * In the first part of recovery we replay inodes and buffers and build + * up the list of extent free items which need to be processed. Here + * we process the extent free items and clean up the on disk unlinked + * inode lists. This is separated from the first part of recovery so + * that the root and real-time bitmap inodes can be read in from disk in + * between the two stages. This is necessary so that we can free space + * in the real-time portion of the file system. + */ +int +xlog_recover_finish( + struct xlog *log) +{ + /* + * Now we're ready to do the transactions needed for the + * rest of recovery. Start with completing all the extent + * free intent records and then process the unlinked inode + * lists. At this point, we essentially run in normal mode + * except that we're still performing recovery actions + * rather than accepting new requests. + */ + if (log->l_flags & XLOG_RECOVERY_NEEDED) { + int error; + error = xlog_recover_process_efis(log); + if (error) { + xfs_alert(log->l_mp, "Failed to recover EFIs"); + return error; + } + /* + * Sync the log to get all the EFIs out of the AIL. + * This isn't absolutely necessary, but it helps in + * case the unlink transactions would have problems + * pushing the EFIs out of the way. + */ + xfs_log_force(log->l_mp, XFS_LOG_SYNC); + + xlog_recover_process_iunlinks(log); + + xlog_recover_check_summary(log); + + xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); + log->l_flags &= ~XLOG_RECOVERY_NEEDED; + } else { + xfs_info(log->l_mp, "Ending clean mount"); + } + return 0; +} + + +#if defined(DEBUG) +/* + * Read all of the agf and agi counters and check that they + * are consistent with the superblock counters. + */ +void +xlog_recover_check_summary( + struct xlog *log) +{ + xfs_mount_t *mp; + xfs_agf_t *agfp; + xfs_buf_t *agfbp; + xfs_buf_t *agibp; + xfs_agnumber_t agno; + __uint64_t freeblks; + __uint64_t itotal; + __uint64_t ifree; + int error; + + mp = log->l_mp; + + freeblks = 0LL; + itotal = 0LL; + ifree = 0LL; + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); + if (error) { + xfs_alert(mp, "%s agf read failed agno %d error %d", + __func__, agno, error); + } else { + agfp = XFS_BUF_TO_AGF(agfbp); + freeblks += be32_to_cpu(agfp->agf_freeblks) + + be32_to_cpu(agfp->agf_flcount); + xfs_buf_relse(agfbp); + } + + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + xfs_alert(mp, "%s agi read failed agno %d error %d", + __func__, agno, error); + } else { + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + + itotal += be32_to_cpu(agi->agi_count); + ifree += be32_to_cpu(agi->agi_freecount); + xfs_buf_relse(agibp); + } + } +} +#endif /* DEBUG */ diff --git a/kernel/fs/xfs/xfs_message.c b/kernel/fs/xfs/xfs_message.c new file mode 100644 index 000000000..d8b67547a --- /dev/null +++ b/kernel/fs/xfs/xfs_message.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" + +/* + * XFS logging functions + */ +static void +__xfs_printk( + const char *level, + const struct xfs_mount *mp, + struct va_format *vaf) +{ + if (mp && mp->m_fsname) { + printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + return; + } + printk("%sXFS: %pV\n", level, vaf); +} + +#define define_xfs_printk_level(func, kern_level) \ +void func(const struct xfs_mount *mp, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __xfs_printk(kern_level, mp, &vaf); \ + va_end(args); \ +} \ + +define_xfs_printk_level(xfs_emerg, KERN_EMERG); +define_xfs_printk_level(xfs_alert, KERN_ALERT); +define_xfs_printk_level(xfs_crit, KERN_CRIT); +define_xfs_printk_level(xfs_err, KERN_ERR); +define_xfs_printk_level(xfs_warn, KERN_WARNING); +define_xfs_printk_level(xfs_notice, KERN_NOTICE); +define_xfs_printk_level(xfs_info, KERN_INFO); +#ifdef DEBUG +define_xfs_printk_level(xfs_debug, KERN_DEBUG); +#endif + +void +xfs_alert_tag( + const struct xfs_mount *mp, + int panic_tag, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int do_panic = 0; + + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + xfs_alert(mp, "Transforming an alert into a BUG."); + do_panic = 1; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(KERN_ALERT, mp, &vaf); + va_end(args); + + BUG_ON(do_panic); +} + +void +asswarn(char *expr, char *file, int line) +{ + xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + WARN_ON(1); +} + +void +assfail(char *expr, char *file, int line) +{ + xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +void +xfs_hex_dump(void *p, int length) +{ + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); +} diff --git a/kernel/fs/xfs/xfs_message.h b/kernel/fs/xfs/xfs_message.h new file mode 100644 index 000000000..854011557 --- /dev/null +++ b/kernel/fs/xfs/xfs_message.h @@ -0,0 +1,64 @@ +#ifndef __XFS_MESSAGE_H +#define __XFS_MESSAGE_H 1 + +struct xfs_mount; + +extern __printf(2, 3) +void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(3, 4) +void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +extern __printf(2, 3) +void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); + +#ifdef DEBUG +extern __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); +#else +static inline __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +{ +} +#endif + +#define xfs_printk_ratelimited(func, dev, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + func(dev, fmt, ##__VA_ARGS__); \ +} while (0) + +#define xfs_emerg_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) +#define xfs_alert_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__) +#define xfs_crit_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__) +#define xfs_err_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__) +#define xfs_info_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__) +#define xfs_debug_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) + +extern void assfail(char *expr, char *f, int l); +extern void asswarn(char *expr, char *f, int l); + +extern void xfs_hex_dump(void *p, int length); + +#endif /* __XFS_MESSAGE_H */ diff --git a/kernel/fs/xfs/xfs_mount.c b/kernel/fs/xfs/xfs_mount.c new file mode 100644 index 000000000..6f23fbdfb --- /dev/null +++ b/kernel/fs/xfs/xfs_mount.c @@ -0,0 +1,1283 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_fsops.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_sysfs.h" + + +static DEFINE_MUTEX(xfs_uuid_table_mutex); +static int xfs_uuid_table_size; +static uuid_t *xfs_uuid_table; + +/* + * See if the UUID is unique among mounted XFS filesystems. + * Mount fails if UUID is nil or a FS with the same UUID is already mounted. + */ +STATIC int +xfs_uuid_mount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int hole, i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return 0; + + if (uuid_is_nil(uuid)) { + xfs_warn(mp, "Filesystem has nil UUID - can't mount"); + return -EINVAL; + } + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) { + hole = i; + continue; + } + if (uuid_equal(uuid, &xfs_uuid_table[i])) + goto out_duplicate; + } + + if (hole < 0) { + xfs_uuid_table = kmem_realloc(xfs_uuid_table, + (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), + xfs_uuid_table_size * sizeof(*xfs_uuid_table), + KM_SLEEP); + hole = xfs_uuid_table_size++; + } + xfs_uuid_table[hole] = *uuid; + mutex_unlock(&xfs_uuid_table_mutex); + + return 0; + + out_duplicate: + mutex_unlock(&xfs_uuid_table_mutex); + xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); + return -EINVAL; +} + +STATIC void +xfs_uuid_unmount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return; + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) + continue; + if (!uuid_equal(uuid, &xfs_uuid_table[i])) + continue; + memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); + break; + } + ASSERT(i < xfs_uuid_table_size); + mutex_unlock(&xfs_uuid_table_mutex); +} + + +STATIC void +__xfs_free_perag( + struct rcu_head *head) +{ + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + + ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); +} + +/* + * Free up the per-ag resources associated with the mount structure. + */ +STATIC void +xfs_free_perag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); + call_rcu(&pag->rcu_head, __xfs_free_perag); + } +} + +/* + * Check size of device based on the (data/realtime) block count. + * Note: this check is used by the growfs code as well as mount. + */ +int +xfs_sb_validate_fsb_count( + xfs_sb_t *sbp, + __uint64_t nblocks) +{ + ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); + ASSERT(sbp->sb_blocklog >= BBSHIFT); + + /* Limited by ULONG_MAX of page cache index */ + if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + return -EFBIG; + return 0; +} + +int +xfs_initialize_perag( + xfs_mount_t *mp, + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) +{ + xfs_agnumber_t index; + xfs_agnumber_t first_initialised = 0; + xfs_perag_t *pag; + xfs_agino_t agino; + xfs_ino_t ino; + xfs_sb_t *sbp = &mp->m_sb; + int error = -ENOMEM; + + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. + */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + if (!first_initialised) + first_initialised = index; + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) + goto out_unwind; + pag->pag_agno = index; + pag->pag_mount = mp; + spin_lock_init(&pag->pag_ici_lock); + mutex_init(&pag->pag_ici_reclaim_lock); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + spin_lock_init(&pag->pag_buf_lock); + pag->pag_buf_tree = RB_ROOT; + + if (radix_tree_preload(GFP_NOFS)) + goto out_unwind; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + BUG(); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_unwind; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + } + + /* + * If we mount with the inode64 option, or no inode overflows + * the legacy 32-bit address space clear the inode32 option. + */ + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); + + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) + mp->m_flags |= XFS_MOUNT_32BITINODES; + else + mp->m_flags &= ~XFS_MOUNT_32BITINODES; + + if (mp->m_flags & XFS_MOUNT_32BITINODES) + index = xfs_set_inode32(mp, agcount); + else + index = xfs_set_inode64(mp, agcount); + + if (maxagi) + *maxagi = index; + return 0; + +out_unwind: + kmem_free(pag); + for (; index > first_initialised; index--) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + kmem_free(pag); + } + return error; +} + +/* + * xfs_readsb + * + * Does the initial read of the superblock. + */ +int +xfs_readsb( + struct xfs_mount *mp, + int flags) +{ + unsigned int sector_size; + struct xfs_buf *bp; + struct xfs_sb *sbp = &mp->m_sb; + int error; + int loud = !(flags & XFS_MFSI_QUIET); + const struct xfs_buf_ops *buf_ops; + + ASSERT(mp->m_sb_bp == NULL); + ASSERT(mp->m_ddev_targp != NULL); + + /* + * For the initial read, we must guess at the sector + * size based on the block device. It's enough to + * get the sb_sectsize out of the superblock and + * then reread with the proper length. + * We don't verify it yet, because it may not be complete. + */ + sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + buf_ops = NULL; + + /* + * Allocate a (locked) buffer to hold the superblock. + * This will be kept around at all times to optimize + * access to the superblock. + */ +reread: + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0, &bp, buf_ops); + if (error) { + if (loud) + xfs_warn(mp, "SB validate failed with error %d.", error); + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; + } + + /* + * Initialize the mount structure from the superblock. + */ + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + + /* + * If we haven't validated the superblock, do so now before we try + * to check the sector size and reread the superblock appropriately. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (loud) + xfs_warn(mp, "Invalid superblock magic number"); + error = -EINVAL; + goto release_buf; + } + + /* + * We must be able to do sector-sized and sector-aligned IO. + */ + if (sector_size > sbp->sb_sectsize) { + if (loud) + xfs_warn(mp, "device supports %u byte sectors (not %u)", + sector_size, sbp->sb_sectsize); + error = -ENOSYS; + goto release_buf; + } + + if (buf_ops == NULL) { + /* + * Re-read the superblock so the buffer is correctly sized, + * and properly verified. + */ + xfs_buf_relse(bp); + sector_size = sbp->sb_sectsize; + buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; + goto reread; + } + + xfs_reinit_percpu_counters(mp); + + /* no need to be quiet anymore, so reset the buf ops */ + bp->b_ops = &xfs_sb_buf_ops; + + mp->m_sb_bp = bp; + xfs_buf_unlock(bp); + return 0; + +release_buf: + xfs_buf_relse(bp); + return error; +} + +/* + * Update alignment values based on mount options and sb values + */ +STATIC int +xfs_update_alignment(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + + if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + sbp->sb_blocksize); + return -EINVAL; + } else { + /* + * Convert the stripe unit and width to FSBs. + */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. agsize(%d)", + sbp->sb_agblocks); + return -EINVAL; + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + } else { + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, sbp->sb_blocksize); + return -EINVAL; + } + } + + /* + * Update superblock with new values + * and log changes + */ + if (xfs_sb_version_hasdalign(sbp)) { + if (sbp->sb_unit != mp->m_dalign) { + sbp->sb_unit = mp->m_dalign; + mp->m_update_sb = true; + } + if (sbp->sb_width != mp->m_swidth) { + sbp->sb_width = mp->m_swidth; + mp->m_update_sb = true; + } + } else { + xfs_warn(mp, + "cannot change alignment: superblock does not support data alignment"); + return -EINVAL; + } + } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && + xfs_sb_version_hasdalign(&mp->m_sb)) { + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; + } + + return 0; +} + +/* + * Set the maximum inode count for this filesystem + */ +STATIC void +xfs_set_maxicount(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + __uint64_t icount; + + if (sbp->sb_imax_pct) { + /* + * Make sure the maximum inode count is a multiple + * of the units we allocate inodes in. + */ + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + do_div(icount, mp->m_ialloc_blks); + mp->m_maxicount = (icount * mp->m_ialloc_blks) << + sbp->sb_inopblog; + } else { + mp->m_maxicount = 0; + } +} + +/* + * Set the default minimum read and write sizes unless + * already specified in a mount option. + * We use smaller I/O sizes when the file system + * is being used for NFS service (wsync mount option). + */ +STATIC void +xfs_set_rw_sizes(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + int readio_log, writeio_log; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + if (mp->m_flags & XFS_MOUNT_WSYNC) { + readio_log = XFS_WSYNC_READIO_LOG; + writeio_log = XFS_WSYNC_WRITEIO_LOG; + } else { + readio_log = XFS_READIO_LOG_LARGE; + writeio_log = XFS_WRITEIO_LOG_LARGE; + } + } else { + readio_log = mp->m_readio_log; + writeio_log = mp->m_writeio_log; + } + + if (sbp->sb_blocklog > readio_log) { + mp->m_readio_log = sbp->sb_blocklog; + } else { + mp->m_readio_log = readio_log; + } + mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog); + if (sbp->sb_blocklog > writeio_log) { + mp->m_writeio_log = sbp->sb_blocklog; + } else { + mp->m_writeio_log = writeio_log; + } + mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); +} + +/* + * precalculate the low space thresholds for dynamic speculative preallocation. + */ +void +xfs_set_low_space_thresholds( + struct xfs_mount *mp) +{ + int i; + + for (i = 0; i < XFS_LOWSP_MAX; i++) { + __uint64_t space = mp->m_sb.sb_dblocks; + + do_div(space, 100); + mp->m_low_space[i] = space * (i + 1); + } +} + + +/* + * Set whether we're using inode alignment. + */ +STATIC void +xfs_set_inoalignment(xfs_mount_t *mp) +{ + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; + else + mp->m_inoalign_mask = 0; + /* + * If we are using stripe alignment, check whether + * the stripe unit is a multiple of the inode alignment + */ + if (mp->m_dalign && mp->m_inoalign_mask && + !(mp->m_dalign & mp->m_inoalign_mask)) + mp->m_sinoalign = mp->m_dalign; + else + mp->m_sinoalign = 0; +} + +/* + * Check that the data (and log if separate) is an ok size. + */ +STATIC int +xfs_check_sizes( + struct xfs_mount *mp) +{ + struct xfs_buf *bp; + xfs_daddr_t d; + int error; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { + xfs_warn(mp, "filesystem size mismatch detected"); + return -EFBIG; + } + error = xfs_buf_read_uncached(mp->m_ddev_targp, + d - XFS_FSS_TO_BB(mp, 1), + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "last sector read failed"); + return error; + } + xfs_buf_relse(bp); + + if (mp->m_logdev_targp == mp->m_ddev_targp) + return 0; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { + xfs_warn(mp, "log size mismatch detected"); + return -EFBIG; + } + error = xfs_buf_read_uncached(mp->m_logdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "log device read failed"); + return error; + } + xfs_buf_relse(bp); + return 0; +} + +/* + * Clear the quotaflags in memory and in the superblock. + */ +int +xfs_mount_reset_sbqflags( + struct xfs_mount *mp) +{ + mp->m_qflags = 0; + + /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */ + if (mp->m_sb.sb_qflags == 0) + return 0; + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = 0; + spin_unlock(&mp->m_sb_lock); + + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) + return 0; + + return xfs_sync_sb(mp, false); +} + +__uint64_t +xfs_default_resblks(xfs_mount_t *mp) +{ + __uint64_t resblks; + + /* + * We default to 5% or 8192 fsbs of space reserved, whichever is + * smaller. This is intended to cover concurrent allocation + * transactions when we initially hit enospc. These each require a 4 + * block reservation. Hence by default we cover roughly 2000 concurrent + * allocation reservations. + */ + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 8192); + return resblks; +} + +/* + * This function does the following on an initial mount of a file system: + * - reads the superblock from disk and init the mount struct + * - if we're a 32-bit kernel, do a size check on the superblock + * so we don't mount terabyte filesystems + * - init mount struct realtime fields + * - allocate inode hash table for fs + * - init directory manager + * - perform recovery and init the log manager + */ +int +xfs_mountfs( + xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + xfs_inode_t *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; + + xfs_sb_mount_common(mp, sbp); + + /* + * Check for a mismatched features2 values. Older kernels read & wrote + * into the wrong sb offset for sb_features2 on some platforms due to + * xfs_sb_t not being 64bit size aligned when sb_features2 was added, + * which made older superblock reading/writing routines swap it as a + * 64-bit value. + * + * For backwards compatibility, we make both slots equal. + * + * If we detect a mismatched field, we OR the set bits into the existing + * features2 field in case it has already been modified; we don't want + * to lose any features. We then update the bad location with the ORed + * value so that older kernels will see any features2 flags. The + * superblock writeback code ensures the new sb_features2 is copied to + * sb_bad_features2 before it is logged or written to disk. + */ + if (xfs_sb_has_mismatched_features2(sbp)) { + xfs_warn(mp, "correcting sb_features alignment problem"); + sbp->sb_features2 |= sbp->sb_bad_features2; + mp->m_update_sb = true; + + /* + * Re-check for ATTR2 in case it was found in bad_features2 + * slot. + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + } + + if (xfs_sb_version_hasattr2(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_sb_version_removeattr2(&mp->m_sb); + mp->m_update_sb = true; + + /* update sb_versionnum for the clearing of the morebits */ + if (!sbp->sb_features2) + mp->m_update_sb = true; + } + + /* always use v2 inodes by default now */ + if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { + mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + mp->m_update_sb = true; + } + + /* + * Check if sb_agblocks is aligned at stripe boundary + * If sb_agblocks is NOT aligned turn off m_dalign since + * allocator alignment is within an ag, therefore ag has + * to be aligned at stripe boundary. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + + xfs_alloc_compute_maxlevels(mp); + xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); + xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); + xfs_ialloc_compute_maxlevels(mp); + + xfs_set_maxicount(mp); + + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); + if (error) + goto out; + + error = xfs_uuid_mount(mp); + if (error) + goto out_remove_sysfs; + + /* + * Set the minimum read and write sizes + */ + xfs_set_rw_sizes(mp); + + /* set the low space thresholds for dynamic preallocation */ + xfs_set_low_space_thresholds(mp); + + /* + * Set the inode cluster size. + * This may still be overridden by the file system + * block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to + * keep a constant ratio of inode per cluster buffer, but only if mkfs + * has set the inode alignment value appropriately for larger cluster + * sizes. + */ + mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = mp->m_inode_cluster_size; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + mp->m_inode_cluster_size = new_size; + } + + /* + * Set inode alignment fields + */ + xfs_set_inoalignment(mp); + + /* + * Check that the data (and log if separate) is an ok size. + */ + error = xfs_check_sizes(mp); + if (error) + goto out_remove_uuid; + + /* + * Initialize realtime fields in the mount structure + */ + error = xfs_rtmount_init(mp); + if (error) { + xfs_warn(mp, "RT mount failed"); + goto out_remove_uuid; + } + + /* + * Copies the low order bits of the timestamp and the randomly + * set "sequence" number out of a UUID. + */ + uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + + mp->m_dmevmask = 0; /* not persistent; set after each mount */ + + error = xfs_da_mount(mp); + if (error) { + xfs_warn(mp, "Failed dir/attr init: %d", error); + goto out_remove_uuid; + } + + /* + * Initialize the precomputed transaction reservations values. + */ + xfs_trans_init(mp); + + /* + * Allocate and initialize the per-ag data. + */ + spin_lock_init(&mp->m_perag_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed per-ag init: %d", error); + goto out_free_dir; + } + + if (!sbp->sb_logblocks) { + xfs_warn(mp, "no log defined"); + XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_free_perag; + } + + /* + * log's mount-time initialization. Perform 1st part recovery if needed + */ + error = xfs_log_mount(mp, mp->m_logdev_targp, + XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), + XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); + if (error) { + xfs_warn(mp, "log mount failed"); + goto out_fail_wait; + } + + /* + * Now the log is mounted, we know if it was an unclean shutdown or + * not. If it was, with the first phase of recovery has completed, we + * have consistent AG blocks on disk. We have not recovered EFIs yet, + * but they are recovered transactionally in the second recovery phase + * later. + * + * Hence we can safely re-initialise incore superblock counters from + * the per-ag data. These may not be correct if the filesystem was not + * cleanly unmounted, so we need to wait for recovery to finish before + * doing this. + * + * If the filesystem was cleanly unmounted, then we can trust the + * values in the superblock to be correct and we don't need to do + * anything here. + * + * If we are currently making the filesystem, the initialisation will + * fail as the perag data is in an undefined state. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + !mp->m_sb.sb_inprogress) { + error = xfs_initialize_perag_data(mp, sbp->sb_agcount); + if (error) + goto out_log_dealloc; + } + + /* + * Get and sanity-check the root inode. + * Save the pointer to it in the mount structure. + */ + error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); + if (error) { + xfs_warn(mp, "failed to read root inode"); + goto out_log_dealloc; + } + + ASSERT(rip != NULL); + + if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { + xfs_warn(mp, "corrupted root inode %llu: not a directory", + (unsigned long long)rip->i_ino); + xfs_iunlock(rip, XFS_ILOCK_EXCL); + XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, + mp); + error = -EFSCORRUPTED; + goto out_rele_rip; + } + mp->m_rootip = rip; /* save it */ + + xfs_iunlock(rip, XFS_ILOCK_EXCL); + + /* + * Initialize realtime inode pointers in the mount structure + */ + error = xfs_rtmount_inodes(mp); + if (error) { + /* + * Free up the root inode. + */ + xfs_warn(mp, "failed to read RT inodes"); + goto out_rele_rip; + } + + /* + * If this is a read-only mount defer the superblock updates until + * the next remount into writeable mode. Otherwise we would never + * perform the update e.g. for the root filesystem. + */ + if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_sync_sb(mp, false); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + goto out_rtunmount; + } + } + + /* + * Initialise the XFS quota management subsystem for this mount + */ + if (XFS_IS_QUOTA_RUNNING(mp)) { + error = xfs_qm_newmount(mp, "amount, "aflags); + if (error) + goto out_rtunmount; + } else { + ASSERT(!XFS_IS_QUOTA_ON(mp)); + + /* + * If a file system had quotas running earlier, but decided to + * mount without -o uquota/pquota/gquota options, revoke the + * quotachecked license. + */ + if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { + xfs_notice(mp, "resetting quota flags"); + error = xfs_mount_reset_sbqflags(mp); + if (error) + goto out_rtunmount; + } + } + + /* + * Finish recovering the file system. This part needed to be + * delayed until after the root and real-time bitmap inodes + * were consistently read in. + */ + error = xfs_log_mount_finish(mp); + if (error) { + xfs_warn(mp, "log mount finish failed"); + goto out_rtunmount; + } + + /* + * Complete the quota initialisation, post-log-replay component. + */ + if (quotamount) { + ASSERT(mp->m_qflags == 0); + mp->m_qflags = quotaflags; + + xfs_qm_mount_quotas(mp); + } + + /* + * Now we are mounted, reserve a small amount of unused space for + * privileged transactions. This is needed so that transaction + * space required for critical operations can dip into this pool + * when at ENOSPC. This is needed for operations like create with + * attr, unwritten extent conversion at ENOSPC, etc. Data allocations + * are not allowed to use this reserved space. + * + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. + */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + resblks = xfs_default_resblks(mp); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, + "Unable to allocate reserve blocks. Continuing without reserve pool."); + } + + return 0; + + out_rtunmount: + xfs_rtunmount_inodes(mp); + out_rele_rip: + IRELE(rip); + out_log_dealloc: + xfs_log_unmount(mp); + out_fail_wait: + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_wait_buftarg(mp->m_logdev_targp); + xfs_wait_buftarg(mp->m_ddev_targp); + out_free_perag: + xfs_free_perag(mp); + out_free_dir: + xfs_da_unmount(mp); + out_remove_uuid: + xfs_uuid_unmount(mp); + out_remove_sysfs: + xfs_sysfs_del(&mp->m_kobj); + out: + return error; +} + +/* + * This flushes out the inodes,dquots and the superblock, unmounts the + * log and makes sure that incore structures are freed. + */ +void +xfs_unmountfs( + struct xfs_mount *mp) +{ + __uint64_t resblks; + int error; + + cancel_delayed_work_sync(&mp->m_eofblocks_work); + + xfs_qm_unmount_quotas(mp); + xfs_rtunmount_inodes(mp); + IRELE(mp->m_rootip); + + /* + * We can potentially deadlock here if we have an inode cluster + * that has been freed has its buffer still pinned in memory because + * the transaction is still sitting in a iclog. The stale inodes + * on that buffer will have their flush locks held until the + * transaction hits the disk and the callbacks run. the inode + * flush takes the flush lock unconditionally and with nothing to + * push out the iclog we will never get that unlocked. hence we + * need to force the log first. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Flush all pending changes from the AIL. + */ + xfs_ail_push_all_sync(mp->m_ail); + + /* + * And reclaim all inodes. At this point there should be no dirty + * inodes and none should be pinned or locked, but use synchronous + * reclaim just to be sure. We can stop background inode reclaim + * here as well if it is still running. + */ + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + xfs_qm_unmount(mp); + + /* + * Unreserve any blocks we have so that when we unmount we don't account + * the reserved free space as used. This is really only necessary for + * lazy superblock counting because it trusts the incore superblock + * counters to be absolutely correct on clean unmount. + * + * We don't bother correcting this elsewhere for lazy superblock + * counting because on mount of an unclean filesystem we reconstruct the + * correct counter value and this is irrelevant. + * + * For non-lazy counter filesystems, this doesn't matter at all because + * we only every apply deltas to the superblock and hence the incore + * value does not matter.... + */ + resblks = 0; + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, "Unable to free reserved block pool. " + "Freespace may not be correct on next mount."); + + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "Unable to update superblock counters. " + "Freespace may not be correct on next mount."); + + xfs_log_unmount(mp); + xfs_da_unmount(mp); + xfs_uuid_unmount(mp); + +#if defined(DEBUG) + xfs_errortag_clearall(mp, 0); +#endif + xfs_free_perag(mp); + + xfs_sysfs_del(&mp->m_kobj); +} + +/* + * Determine whether modifications can proceed. The caller specifies the minimum + * freeze level for which modifications should not be allowed. This allows + * certain operations to proceed while the freeze sequence is in progress, if + * necessary. + */ +bool +xfs_fs_writable( + struct xfs_mount *mp, + int level) +{ + ASSERT(level > SB_UNFROZEN); + if ((mp->m_super->s_writers.frozen >= level) || + XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)) + return false; + + return true; +} + +/* + * xfs_log_sbcount + * + * Sync the superblock counters to disk. + * + * Note this code can be called during the process of freezing, so we use the + * transaction allocator that does not block when the transaction subsystem is + * in its frozen state. + */ +int +xfs_log_sbcount(xfs_mount_t *mp) +{ + /* allow this to proceed during the freeze sequence... */ + if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) + return 0; + + /* + * we don't need to do this if we are updating the superblock + * counters on every modification. + */ + if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + + return xfs_sync_sb(mp, true); +} + +/* + * Deltas for the inode count are +/-64, hence we use a large batch size + * of 128 so we don't need to take the counter lock on every update. + */ +#define XFS_ICOUNT_BATCH 128 +int +xfs_mod_icount( + struct xfs_mount *mp, + int64_t delta) +{ + __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH); + if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_icount, -delta); + return -EINVAL; + } + return 0; +} + +int +xfs_mod_ifree( + struct xfs_mount *mp, + int64_t delta) +{ + percpu_counter_add(&mp->m_ifree, delta); + if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_ifree, -delta); + return -EINVAL; + } + return 0; +} + +/* + * Deltas for the block count can vary from 1 to very large, but lock contention + * only occurs on frequent small block count updates such as in the delayed + * allocation path for buffered writes (page a time updates). Hence we set + * a large batch count (1024) to minimise global counter updates except when + * we get near to ENOSPC and we have to be very accurate with our updates. + */ +#define XFS_FDBLOCKS_BATCH 1024 +int +xfs_mod_fdblocks( + struct xfs_mount *mp, + int64_t delta, + bool rsvd) +{ + int64_t lcounter; + long long res_used; + s32 batch; + + if (delta > 0) { + /* + * If the reserve pool is depleted, put blocks back into it + * first. Most of the time the pool is full. + */ + if (likely(mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(&mp->m_fdblocks, delta); + return 0; + } + + spin_lock(&mp->m_sb_lock); + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); + + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + percpu_counter_add(&mp->m_fdblocks, delta); + } + spin_unlock(&mp->m_sb_lock); + return 0; + } + + /* + * Taking blocks away, need to be more accurate the closer we + * are to zero. + * + * If the counter has a value of less than 2 * max batch size, + * then make everything serialise as we are real close to + * ENOSPC. + */ + if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + XFS_FDBLOCKS_BATCH) < 0) + batch = 1; + else + batch = XFS_FDBLOCKS_BATCH; + + __percpu_counter_add(&mp->m_fdblocks, delta, batch); + if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp), + XFS_FDBLOCKS_BATCH) >= 0) { + /* we had space! */ + return 0; + } + + /* + * lock up the sb for dipping into reserves before releasing the space + * that took us to ENOSPC. + */ + spin_lock(&mp->m_sb_lock); + percpu_counter_add(&mp->m_fdblocks, -delta); + if (!rsvd) + goto fdblocks_enospc; + + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + spin_unlock(&mp->m_sb_lock); + return 0; + } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); +fdblocks_enospc: + spin_unlock(&mp->m_sb_lock); + return -ENOSPC; +} + +int +xfs_mod_frextents( + struct xfs_mount *mp, + int64_t delta) +{ + int64_t lcounter; + int ret = 0; + + spin_lock(&mp->m_sb_lock); + lcounter = mp->m_sb.sb_frextents + delta; + if (lcounter < 0) + ret = -ENOSPC; + else + mp->m_sb.sb_frextents = lcounter; + spin_unlock(&mp->m_sb_lock); + return ret; +} + +/* + * xfs_getsb() is called to obtain the buffer for the superblock. + * The buffer is returned locked and read in from disk. + * The buffer should be released with a call to xfs_brelse(). + * + * If the flags parameter is BUF_TRYLOCK, then we'll only return + * the superblock buffer if it can be locked without sleeping. + * If it can't then we'll return NULL. + */ +struct xfs_buf * +xfs_getsb( + struct xfs_mount *mp, + int flags) +{ + struct xfs_buf *bp = mp->m_sb_bp; + + if (!xfs_buf_trylock(bp)) { + if (flags & XBF_TRYLOCK) + return NULL; + xfs_buf_lock(bp); + } + + xfs_buf_hold(bp); + ASSERT(XFS_BUF_ISDONE(bp)); + return bp; +} + +/* + * Used to free the superblock along various error paths. + */ +void +xfs_freesb( + struct xfs_mount *mp) +{ + struct xfs_buf *bp = mp->m_sb_bp; + + xfs_buf_lock(bp); + mp->m_sb_bp = NULL; + xfs_buf_relse(bp); +} + +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + struct xfs_mount *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + xfs_notice(mp, "%s required on read-only device.", message); + xfs_notice(mp, "write access unavailable, cannot proceed."); + return -EROFS; + } + return 0; +} diff --git a/kernel/fs/xfs/xfs_mount.h b/kernel/fs/xfs/xfs_mount.h new file mode 100644 index 000000000..8c995a2cc --- /dev/null +++ b/kernel/fs/xfs/xfs_mount.h @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MOUNT_H__ +#define __XFS_MOUNT_H__ + +struct xlog; +struct xfs_inode; +struct xfs_mru_cache; +struct xfs_nameops; +struct xfs_ail; +struct xfs_quotainfo; +struct xfs_dir_ops; +struct xfs_da_geometry; + +/* dynamic preallocation free space thresholds, 5% down to 1% */ +enum { + XFS_LOWSP_1_PCNT = 0, + XFS_LOWSP_2_PCNT, + XFS_LOWSP_3_PCNT, + XFS_LOWSP_4_PCNT, + XFS_LOWSP_5_PCNT, + XFS_LOWSP_MAX, +}; + +typedef struct xfs_mount { + struct super_block *m_super; + xfs_tid_t m_tid; /* next unused tid for fs */ + struct xfs_ail *m_ail; /* fs active log item list */ + + struct xfs_sb m_sb; /* copy of fs superblock */ + spinlock_t m_sb_lock; /* sb counter lock */ + struct percpu_counter m_icount; /* allocated inodes counter */ + struct percpu_counter m_ifree; /* free inodes counter */ + struct percpu_counter m_fdblocks; /* free block counter */ + + struct xfs_buf *m_sb_bp; /* buffer for superblock */ + char *m_fsname; /* filesystem name */ + int m_fsname_len; /* strlen of fs name */ + char *m_rtname; /* realtime device name */ + char *m_logname; /* external log device name */ + int m_bsize; /* fs logical block size */ + xfs_agnumber_t m_agfrotor; /* last ag where space found */ + xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ + spinlock_t m_agirotor_lock;/* .. and lock protecting it */ + xfs_agnumber_t m_maxagi; /* highest inode alloc group */ + uint m_readio_log; /* min read size log bytes */ + uint m_readio_blocks; /* min read size blocks */ + uint m_writeio_log; /* min write size log bytes */ + uint m_writeio_blocks; /* min write size blocks */ + struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ + struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ + struct xlog *m_log; /* log specific stuff */ + int m_logbufs; /* number of log buffers */ + int m_logbsize; /* size of each log buffer */ + uint m_rsumlevels; /* rt summary levels */ + uint m_rsumsize; /* size of rt summary, bytes */ + struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ + struct xfs_inode *m_rsumip; /* pointer to summary inode */ + struct xfs_inode *m_rootip; /* pointer to root directory */ + struct xfs_quotainfo *m_quotainfo; /* disk quota information */ + xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ + xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ + xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + __uint8_t m_blkbit_log; /* blocklog + NBBY */ + __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ + __uint8_t m_agno_log; /* log #ag's */ + __uint8_t m_agino_log; /* #bits for agino in inum */ + uint m_inode_cluster_size;/* min inode buf size */ + uint m_blockmask; /* sb_blocksize-1 */ + uint m_blockwsize; /* sb_blocksize in words */ + uint m_blockwmask; /* blockwsize-1 */ + uint m_alloc_mxr[2]; /* max alloc btree records */ + uint m_alloc_mnr[2]; /* min alloc btree records */ + uint m_bmap_dmxr[2]; /* max bmap btree records */ + uint m_bmap_dmnr[2]; /* min bmap btree records */ + uint m_inobt_mxr[2]; /* max inobt btree records */ + uint m_inobt_mnr[2]; /* min inobt btree records */ + uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ + uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ + uint m_in_maxlevels; /* max inobt btree levels. */ + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ + struct mutex m_growlock; /* growfs mutex */ + int m_fixedfsid[2]; /* unchanged for life of FS */ + uint m_dmevmask; /* DMI events for this FS */ + __uint64_t m_flags; /* global mount flags */ + int m_ialloc_inos; /* inodes in inode allocation */ + int m_ialloc_blks; /* blocks in inode allocation */ + int m_inoalign_mask;/* mask sb_inoalignmt if used */ + uint m_qflags; /* quota status flags */ + struct xfs_trans_resv m_resv; /* precomputed res values */ + __uint64_t m_maxicount; /* maximum inode count */ + __uint64_t m_resblks; /* total reserved blocks */ + __uint64_t m_resblks_avail;/* available reserved blocks */ + __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ + int m_dalign; /* stripe unit */ + int m_swidth; /* stripe width */ + int m_sinoalign; /* stripe unit inode alignment */ + __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ + const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ + const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ + uint m_chsize; /* size of next field */ + atomic_t m_active_trans; /* number trans frozen */ + struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ + struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct delayed_work m_eofblocks_work; /* background eof blocks + trimming */ + bool m_update_sb; /* sb needs update in mount */ + int64_t m_low_space[XFS_LOWSP_MAX]; + /* low free space thresholds */ + struct xfs_kobj m_kobj; + + struct workqueue_struct *m_buf_workqueue; + struct workqueue_struct *m_data_workqueue; + struct workqueue_struct *m_unwritten_workqueue; + struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; + + /* + * Generation of the filesysyem layout. This is incremented by each + * growfs, and used by the pNFS server to ensure the client updates + * its view of the block device once it gets a layout that might + * reference the newly added blocks. Does not need to be persistent + * as long as we only allow file system size increments, but if we + * ever support shrinks it would have to be persisted in addition + * to various other kinds of pain inflicted on the pNFS server. + */ + __uint32_t m_generation; +} xfs_mount_t; + +/* + * Flags for m_flags. + */ +#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops + must be synchronous except + for space allocations */ +#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) +#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem + operations, typically for + disk errors in metadata */ +#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ +#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment + allocations */ +#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ +#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ +#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ +#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ +#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above + * 32 bits in size */ +#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ +#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ +#define XFS_MOUNT_BARRIER (1ULL << 17) +#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ +#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width + * allocation */ +#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ +#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ +#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred + * I/O size in stat() */ +#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams + allocator */ +#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ + + +/* + * Default minimum read and write sizes. + */ +#define XFS_READIO_LOG_LARGE 16 +#define XFS_WRITEIO_LOG_LARGE 16 + +/* + * Max and min values for mount-option defined I/O + * preallocation sizes. + */ +#define XFS_MAX_IO_LOG 30 /* 1G */ +#define XFS_MIN_IO_LOG PAGE_SHIFT + +/* + * Synchronous read and write sizes. This should be + * better for NFSv2 wsync filesystems. + */ +#define XFS_WSYNC_READIO_LOG 15 /* 32k */ +#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */ + +/* + * Allow large block sizes to be reported to userspace programs if the + * "largeio" mount option is used. + * + * If compatibility mode is specified, simply return the basic unit of caching + * so that we don't get inefficient read/modify/write I/O from user apps. + * Otherwise.... + * + * If the underlying volume is a stripe, then return the stripe width in bytes + * as the recommended I/O size. It is not a stripe and we've set a default + * buffered I/O size, return that, otherwise return the compat default. + */ +static inline unsigned long +xfs_preferred_iosize(xfs_mount_t *mp) +{ + if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE) + return PAGE_CACHE_SIZE; + return (mp->m_swidth ? + (mp->m_swidth << mp->m_sb.sb_blocklog) : + ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ? + (1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) : + PAGE_CACHE_SIZE)); +} + +#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ + ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) +#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) +void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, + int lnnum); +#define xfs_force_shutdown(m,f) \ + xfs_do_force_shutdown(m, f, __FILE__, __LINE__) + +#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ +#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ + +/* + * Flags for xfs_mountfs + */ +#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ + +static inline xfs_agnumber_t +xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) +{ + xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + do_div(ld, mp->m_sb.sb_agblocks); + return (xfs_agnumber_t) ld; +} + +static inline xfs_agblock_t +xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) +{ + xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); +} + +/* + * Per-ag incore structure, copies of information in agf and agi, to improve the + * performance of allocation group selection. + */ +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + struct mutex pag_ici_reclaim_lock; /* serialisation point */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; + int pagb_count; /* pagb slots in use */ +} xfs_perag_t; + +extern int xfs_log_sbcount(xfs_mount_t *); +extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); +extern int xfs_mountfs(xfs_mount_t *mp); +extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi); +extern void xfs_unmountfs(xfs_mount_t *); + +extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, + bool reserved); +extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); + +extern int xfs_mount_log_sb(xfs_mount_t *); +extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); +extern int xfs_readsb(xfs_mount_t *, int); +extern void xfs_freesb(xfs_mount_t *); +extern bool xfs_fs_writable(struct xfs_mount *mp, int level); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); + +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + +extern void xfs_set_low_space_thresholds(struct xfs_mount *); + +#endif /* __XFS_MOUNT_H__ */ diff --git a/kernel/fs/xfs/xfs_mru_cache.c b/kernel/fs/xfs/xfs_mru_cache.c new file mode 100644 index 000000000..f8a674d7f --- /dev/null +++ b/kernel/fs/xfs/xfs_mru_cache.c @@ -0,0 +1,552 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_mru_cache.h" + +/* + * The MRU Cache data structure consists of a data store, an array of lists and + * a lock to protect its internal state. At initialisation time, the client + * supplies an element lifetime in milliseconds and a group count, as well as a + * function pointer to call when deleting elements. A data structure for + * queueing up work in the form of timed callbacks is also included. + * + * The group count controls how many lists are created, and thereby how finely + * the elements are grouped in time. When reaping occurs, all the elements in + * all the lists whose time has expired are deleted. + * + * To give an example of how this works in practice, consider a client that + * initialises an MRU Cache with a lifetime of ten seconds and a group count of + * five. Five internal lists will be created, each representing a two second + * period in time. When the first element is added, time zero for the data + * structure is initialised to the current time. + * + * All the elements added in the first two seconds are appended to the first + * list. Elements added in the third second go into the second list, and so on. + * If an element is accessed at any point, it is removed from its list and + * inserted at the head of the current most-recently-used list. + * + * The reaper function will have nothing to do until at least twelve seconds + * have elapsed since the first element was added. The reason for this is that + * if it were called at t=11s, there could be elements in the first list that + * have only been inactive for nine seconds, so it still does nothing. If it is + * called anywhere between t=12 and t=14 seconds, it will delete all the + * elements that remain in the first list. It's therefore possible for elements + * to remain in the data store even after they've been inactive for up to + * (t + t/g) seconds, where t is the inactive element lifetime and g is the + * number of groups. + * + * The above example assumes that the reaper function gets called at least once + * every (t/g) seconds. If it is called less frequently, unused elements will + * accumulate in the reap list until the reaper function is eventually called. + * The current implementation uses work queue callbacks to carefully time the + * reaper function calls, so this should happen rarely, if at all. + * + * From a design perspective, the primary reason for the choice of a list array + * representing discrete time intervals is that it's only practical to reap + * expired elements in groups of some appreciable size. This automatically + * introduces a granularity to element lifetimes, so there's no point storing an + * individual timeout with each element that specifies a more precise reap time. + * The bonus is a saving of sizeof(long) bytes of memory per element stored. + * + * The elements could have been stored in just one list, but an array of + * counters or pointers would need to be maintained to allow them to be divided + * up into discrete time groups. More critically, the process of touching or + * removing an element would involve walking large portions of the entire list, + * which would have a detrimental effect on performance. The additional memory + * requirement for the array of list heads is minimal. + * + * When an element is touched or deleted, it needs to be removed from its + * current list. Doubly linked lists are used to make the list maintenance + * portion of these operations O(1). Since reaper timing can be imprecise, + * inserts and lookups can occur when there are no free lists available. When + * this happens, all the elements on the LRU list need to be migrated to the end + * of the reap list. To keep the list maintenance portion of these operations + * O(1) also, list tails need to be accessible without walking the entire list. + * This is the reason why doubly linked list heads are used. + */ + +/* + * An MRU Cache is a dynamic data structure that stores its elements in a way + * that allows efficient lookups, but also groups them into discrete time + * intervals based on insertion time. This allows elements to be efficiently + * and automatically reaped after a fixed period of inactivity. + * + * When a client data pointer is stored in the MRU Cache it needs to be added to + * both the data store and to one of the lists. It must also be possible to + * access each of these entries via the other, i.e. to: + * + * a) Walk a list, removing the corresponding data store entry for each item. + * b) Look up a data store entry, then access its list entry directly. + * + * To achieve both of these goals, each entry must contain both a list entry and + * a key, in addition to the user's data pointer. Note that it's not a good + * idea to have the client embed one of these structures at the top of their own + * data structure, because inserting the same item more than once would most + * likely result in a loop in one of the lists. That's a sure-fire recipe for + * an infinite loop in the code. + */ +struct xfs_mru_cache { + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ + unsigned int queued; /* work has been queued */ +}; + +static struct workqueue_struct *xfs_mru_reap_wq; + +/* + * When inserting, destroying or reaping, it's first necessary to update the + * lists relative to a particular time. In the case of destroying, that time + * will be well in the future to ensure that all items are moved to the reap + * list. In all other cases though, the time will be the current time. + * + * This function enters a loop, moving the contents of the LRU list to the reap + * list again and again until either a) the lists are all empty, or b) time zero + * has been advanced sufficiently to be within the immediate element lifetime. + * + * Case a) above is detected by counting how many groups are migrated and + * stopping when they've all been moved. Case b) is detected by monitoring the + * time_zero field, which is updated as each group is migrated. + * + * The return value is the earliest time that more migration could be needed, or + * zero if there's no need to schedule more work because the lists are empty. + */ +STATIC unsigned long +_xfs_mru_cache_migrate( + struct xfs_mru_cache *mru, + unsigned long now) +{ + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; + + /* Nothing to do if the data store is empty. */ + if (!mru->time_zero) + return 0; + + /* While time zero is older than the time spanned by all the lists. */ + while (mru->time_zero <= now - mru->grp_count * mru->grp_time) { + + /* + * If the LRU list isn't empty, migrate its elements to the tail + * of the reap list. + */ + lru_list = mru->lists + mru->lru_grp; + if (!list_empty(lru_list)) + list_splice_init(lru_list, mru->reap_list.prev); + + /* + * Advance the LRU group number, freeing the old LRU list to + * become the new MRU list; advance time zero accordingly. + */ + mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count; + mru->time_zero += mru->grp_time; + + /* + * If reaping is so far behind that all the elements on all the + * lists have been migrated to the reap list, it's now empty. + */ + if (++migrated == mru->grp_count) { + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; + } + } + + /* Find the first non-empty list from the LRU end. */ + for (grp = 0; grp < mru->grp_count; grp++) { + + /* Check the grp'th list from the LRU end. */ + lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count); + if (!list_empty(lru_list)) + return mru->time_zero + + (mru->grp_count + grp) * mru->grp_time; + } + + /* All the lists must be empty. */ + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; +} + +/* + * When inserting or doing a lookup, an element needs to be inserted into the + * MRU list. The lists must be migrated first to ensure that they're + * up-to-date, otherwise the new element could be given a shorter lifetime in + * the cache than it should. + */ +STATIC void +_xfs_mru_cache_list_insert( + struct xfs_mru_cache *mru, + struct xfs_mru_cache_elem *elem) +{ + unsigned int grp = 0; + unsigned long now = jiffies; + + /* + * If the data store is empty, initialise time zero, leave grp set to + * zero and start the work queue timer if necessary. Otherwise, set grp + * to the number of group times that have elapsed since time zero. + */ + if (!_xfs_mru_cache_migrate(mru, now)) { + mru->time_zero = now; + if (!mru->queued) { + mru->queued = 1; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, + mru->grp_count * mru->grp_time); + } + } else { + grp = (now - mru->time_zero) / mru->grp_time; + grp = (mru->lru_grp + grp) % mru->grp_count; + } + + /* Insert the element at the tail of the corresponding list. */ + list_add_tail(&elem->list_node, mru->lists + grp); +} + +/* + * When destroying or reaping, all the elements that were migrated to the reap + * list need to be deleted. For each element this involves removing it from the + * data store, removing it from the reap list, calling the client's free + * function and deleting the element from the element zone. + * + * We get called holding the mru->lock, which we drop and then reacquire. + * Sparse need special help with this to tell it we know what we are doing. + */ +STATIC void +_xfs_mru_cache_clear_reap_list( + struct xfs_mru_cache *mru) + __releases(mru->lock) __acquires(mru->lock) +{ + struct xfs_mru_cache_elem *elem, *next; + struct list_head tmp; + + INIT_LIST_HEAD(&tmp); + list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { + + /* Remove the element from the data store. */ + radix_tree_delete(&mru->store, elem->key); + + /* + * remove to temp list so it can be freed without + * needing to hold the lock + */ + list_move(&elem->list_node, &tmp); + } + spin_unlock(&mru->lock); + + list_for_each_entry_safe(elem, next, &tmp, list_node) { + list_del_init(&elem->list_node); + mru->free_func(elem); + } + + spin_lock(&mru->lock); +} + +/* + * We fire the reap timer every group expiry interval so + * we always have a reaper ready to run. This makes shutdown + * and flushing of the reaper easy to do. Hence we need to + * keep when the next reap must occur so we can determine + * at each interval whether there is anything we need to do. + */ +STATIC void +_xfs_mru_cache_reap( + struct work_struct *work) +{ + struct xfs_mru_cache *mru = + container_of(work, struct xfs_mru_cache, work.work); + unsigned long now, next; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + next = _xfs_mru_cache_migrate(mru, jiffies); + _xfs_mru_cache_clear_reap_list(mru); + + mru->queued = next; + if ((mru->queued > 0)) { + now = jiffies; + if (next <= now) + next = 0; + else + next -= now; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, next); + } + + spin_unlock(&mru->lock); +} + +int +xfs_mru_cache_init(void) +{ + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1); + if (!xfs_mru_reap_wq) + return -ENOMEM; + return 0; +} + +void +xfs_mru_cache_uninit(void) +{ + destroy_workqueue(xfs_mru_reap_wq); +} + +/* + * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create() + * with the address of the pointer, a lifetime value in milliseconds, a group + * count and a free function to use when deleting elements. This function + * returns 0 if the initialisation was successful. + */ +int +xfs_mru_cache_create( + struct xfs_mru_cache **mrup, + unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func) +{ + struct xfs_mru_cache *mru = NULL; + int err = 0, grp; + unsigned int grp_time; + + if (mrup) + *mrup = NULL; + + if (!mrup || !grp_count || !lifetime_ms || !free_func) + return -EINVAL; + + if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) + return -EINVAL; + + if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + return -ENOMEM; + + /* An extra list is needed to avoid reaping up to a grp_time early. */ + mru->grp_count = grp_count + 1; + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + + if (!mru->lists) { + err = -ENOMEM; + goto exit; + } + + for (grp = 0; grp < mru->grp_count; grp++) + INIT_LIST_HEAD(mru->lists + grp); + + /* + * We use GFP_KERNEL radix tree preload and do inserts under a + * spinlock so GFP_ATOMIC is appropriate for the radix tree itself. + */ + INIT_RADIX_TREE(&mru->store, GFP_ATOMIC); + INIT_LIST_HEAD(&mru->reap_list); + spin_lock_init(&mru->lock); + INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap); + + mru->grp_time = grp_time; + mru->free_func = free_func; + + *mrup = mru; + +exit: + if (err && mru && mru->lists) + kmem_free(mru->lists); + if (err && mru) + kmem_free(mru); + + return err; +} + +/* + * Call xfs_mru_cache_flush() to flush out all cached entries, calling their + * free functions as they're deleted. When this function returns, the caller is + * guaranteed that all the free functions for all the elements have finished + * executing and the reaper is not running. + */ +static void +xfs_mru_cache_flush( + struct xfs_mru_cache *mru) +{ + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + if (mru->queued) { + spin_unlock(&mru->lock); + cancel_delayed_work_sync(&mru->work); + spin_lock(&mru->lock); + } + + _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time); + _xfs_mru_cache_clear_reap_list(mru); + + spin_unlock(&mru->lock); +} + +void +xfs_mru_cache_destroy( + struct xfs_mru_cache *mru) +{ + if (!mru || !mru->lists) + return; + + xfs_mru_cache_flush(mru); + + kmem_free(mru->lists); + kmem_free(mru); +} + +/* + * To insert an element, call xfs_mru_cache_insert() with the data store, the + * element's key and the client data pointer. This function returns 0 on + * success or ENOMEM if memory for the data element couldn't be allocated. + */ +int +xfs_mru_cache_insert( + struct xfs_mru_cache *mru, + unsigned long key, + struct xfs_mru_cache_elem *elem) +{ + int error; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return -EINVAL; + + if (radix_tree_preload(GFP_NOFS)) + return -ENOMEM; + + INIT_LIST_HEAD(&elem->list_node); + elem->key = key; + + spin_lock(&mru->lock); + error = radix_tree_insert(&mru->store, key, elem); + radix_tree_preload_end(); + if (!error) + _xfs_mru_cache_list_insert(mru, elem); + spin_unlock(&mru->lock); + + return error; +} + +/* + * To remove an element without calling the free function, call + * xfs_mru_cache_remove() with the data store and the element's key. On success + * the client data pointer for the removed element is returned, otherwise this + * function will return a NULL pointer. + */ +struct xfs_mru_cache_elem * +xfs_mru_cache_remove( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_delete(&mru->store, key); + if (elem) + list_del(&elem->list_node); + spin_unlock(&mru->lock); + + return elem; +} + +/* + * To remove and element and call the free function, call xfs_mru_cache_delete() + * with the data store and the element's key. + */ +void +xfs_mru_cache_delete( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + elem = xfs_mru_cache_remove(mru, key); + if (elem) + mru->free_func(elem); +} + +/* + * To look up an element using its key, call xfs_mru_cache_lookup() with the + * data store and the element's key. If found, the element will be moved to the + * head of the MRU list to indicate that it's been touched. + * + * The internal data structures are protected by a spinlock that is STILL HELD + * when this function returns. Call xfs_mru_cache_done() to release it. Note + * that it is not safe to call any function that might sleep in the interim. + * + * The implementation could have used reference counting to avoid this + * restriction, but since most clients simply want to get, set or test a member + * of the returned data structure, the extra per-element memory isn't warranted. + * + * If the element isn't found, this function returns NULL and the spinlock is + * released. xfs_mru_cache_done() should NOT be called when this occurs. + * + * Because sparse isn't smart enough to know about conditional lock return + * status, we need to help it get it right by annotating the path that does + * not release the lock. + */ +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (elem) { + list_del(&elem->list_node); + _xfs_mru_cache_list_insert(mru, elem); + __release(mru_lock); /* help sparse not be stupid */ + } else + spin_unlock(&mru->lock); + + return elem; +} + +/* + * To release the internal data structure spinlock after having performed an + * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() + * with the data store pointer. + */ +void +xfs_mru_cache_done( + struct xfs_mru_cache *mru) + __releases(mru->lock) +{ + spin_unlock(&mru->lock); +} diff --git a/kernel/fs/xfs/xfs_mru_cache.h b/kernel/fs/xfs/xfs_mru_cache.h new file mode 100644 index 000000000..fb5245ba5 --- /dev/null +++ b/kernel/fs/xfs/xfs_mru_cache.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MRU_CACHE_H__ +#define __XFS_MRU_CACHE_H__ + +struct xfs_mru_cache; + +struct xfs_mru_cache_elem { + struct list_head list_node; + unsigned long key; +}; + +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); + +int xfs_mru_cache_init(void); +void xfs_mru_cache_uninit(void); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); +void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); +int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, + struct xfs_mru_cache_elem *elem); +struct xfs_mru_cache_elem * +xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_done(struct xfs_mru_cache *mru); + +#endif /* __XFS_MRU_CACHE_H__ */ diff --git a/kernel/fs/xfs/xfs_pnfs.c b/kernel/fs/xfs/xfs_pnfs.c new file mode 100644 index 000000000..981a657ec --- /dev/null +++ b/kernel/fs/xfs/xfs_pnfs.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_iomap.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_pnfs.h" + +/* + * Ensure that we do not have any outstanding pNFS layouts that can be used by + * clients to directly read from or write to this inode. This must be called + * before every operation that can remove blocks from the extent map. + * Additionally we call it during the write operation, where aren't concerned + * about exposing unallocated blocks but just want to provide basic + * synchronization between a local writer and pNFS clients. mmap writes would + * also benefit from this sort of synchronization, but due to the tricky locking + * rules in the page fault path we don't bother. + */ +int +xfs_break_layouts( + struct inode *inode, + uint *iolock, + bool with_imutex) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + + while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { + xfs_iunlock(ip, *iolock); + if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) + mutex_unlock(&inode->i_mutex); + error = break_layout(inode, true); + *iolock = XFS_IOLOCK_EXCL; + if (with_imutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, *iolock); + } + + return error; +} + +/* + * Get a unique ID including its location so that the client can identify + * the exported device. + */ +int +xfs_fs_get_uuid( + struct super_block *sb, + u8 *buf, + u32 *len, + u64 *offset) +{ + struct xfs_mount *mp = XFS_M(sb); + + printk_once(KERN_NOTICE +"XFS (%s): using experimental pNFS feature, use at your own risk!\n", + mp->m_fsname); + + if (*len < sizeof(uuid_t)) + return -EINVAL; + + memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + *len = sizeof(uuid_t); + *offset = offsetof(struct xfs_dsb, sb_uuid); + return 0; +} + +static void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = + XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); +} + +/* + * Get a layout for the pNFS client. + */ +int +xfs_fs_map_blocks( + struct inode *inode, + loff_t offset, + u64 length, + struct iomap *iomap, + bool write, + u32 *device_generation) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + loff_t limit; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + uint lock_flags; + int error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * We can't export inodes residing on the realtime device. The realtime + * device doesn't have a UUID to identify it, so the client has no way + * to find it. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return -ENXIO; + + /* + * Lock out any other I/O before we flush and invalidate the pagecache, + * and then hand out a layout to the remote system. This is very + * similar to direct I/O, except that the synchronization is much more + * complicated. See the comment near xfs_break_layouts for a detailed + * explanation. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + error = -EINVAL; + limit = mp->m_super->s_maxbytes; + if (!write) + limit = max(limit, round_up(i_size_read(inode), + inode->i_sb->s_blocksize)); + if (offset > limit) + goto out_unlock; + if (offset > limit - length) + length = limit - offset; + + error = filemap_write_and_wait(inode->i_mapping); + if (error) + goto out_unlock; + error = invalidate_inode_pages2(inode->i_mapping); + if (WARN_ON_ONCE(error)) + return error; + + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + lock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, bmapi_flags); + xfs_iunlock(ip, lock_flags); + + if (error) + goto out_unlock; + + if (write) { + enum xfs_prealloc_flags flags = 0; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { + error = xfs_iomap_write_direct(ip, offset, length, + &imap, nimaps); + if (error) + goto out_unlock; + + /* + * Ensure the next transaction is committed + * synchronously so that the blocks allocated and + * handed out to the client are guaranteed to be + * present even after a server crash. + */ + flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; + } + + error = xfs_update_prealloc_flags(ip, flags); + if (error) + goto out_unlock; + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + + xfs_bmbt_to_iomap(ip, iomap, &imap); + *device_generation = mp->m_generation; + return error; +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Ensure the size update falls into a valid allocated block. + */ +static int +xfs_pnfs_validate_isize( + struct xfs_inode *ip, + xfs_off_t isize) +{ + struct xfs_bmbt_irec imap; + int nimaps = 1; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1, + &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + return error; + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) + return -EIO; + return 0; +} + +/* + * Make sure the blocks described by maps are stable on disk. This includes + * converting any unwritten extents, flushing the disk cache and updating the + * time stamps. + * + * Note that we rely on the caller to always send us a timestamp update so that + * we always commit a transaction here. If that stops being true we will have + * to manually flush the cache here similar to what the fsync code path does + * for datasyncs on files that have no dirty metadata. + */ +int +xfs_fs_commit_blocks( + struct inode *inode, + struct iomap *maps, + int nr_maps, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + bool update_isize = false; + int error, i; + loff_t size; + + ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)); + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + size = i_size_read(inode); + if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) { + update_isize = true; + size = iattr->ia_size; + } + + for (i = 0; i < nr_maps; i++) { + u64 start, length, end; + + start = maps[i].offset; + if (start > size) + continue; + + end = start + maps[i].length; + if (end > size) + end = size; + + length = end - start; + if (!length) + continue; + + /* + * Make sure reads through the pagecache see the new data. + */ + error = invalidate_inode_pages2_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + (end - 1) >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(error); + + error = xfs_iomap_write_unwritten(ip, start, length); + if (error) + goto out_drop_iolock; + } + + if (update_isize) { + error = xfs_pnfs_validate_isize(ip, size); + if (error) + goto out_drop_iolock; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_drop_iolock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + xfs_setattr_time(ip, iattr); + if (update_isize) { + i_size_write(inode, iattr->ia_size); + ip->i_d.di_size = iattr->ia_size; + } + + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +out_drop_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} diff --git a/kernel/fs/xfs/xfs_pnfs.h b/kernel/fs/xfs/xfs_pnfs.h new file mode 100644 index 000000000..8147ac108 --- /dev/null +++ b/kernel/fs/xfs/xfs_pnfs.h @@ -0,0 +1,19 @@ +#ifndef _XFS_PNFS_H +#define _XFS_PNFS_H 1 + +#ifdef CONFIG_NFSD_PNFS +int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); +int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, + struct iomap *iomap, bool write, u32 *device_generation); +int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, + struct iattr *iattr); + +int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); +#else +static inline int +xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _XFS_PNFS_H */ diff --git a/kernel/fs/xfs/xfs_qm.c b/kernel/fs/xfs/xfs_qm.c new file mode 100644 index 000000000..5538468c7 --- /dev/null +++ b/kernel/fs/xfs/xfs_qm.c @@ -0,0 +1,1939 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_qm.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_cksum.h" + +/* + * The global quota manager. There is only one of these for the entire + * system, _not_ one per file system. XQM keeps track of the overall + * quota functionality, including maintaining the freelist and hash + * tables of dquots. + */ +STATIC int xfs_qm_init_quotainos(xfs_mount_t *); +STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); + + +STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); +/* + * We use the batch lookup interface to iterate over the dquots as it + * currently is the only interface into the radix tree code that allows + * fuzzy lookups instead of exact matches. Holding the lock over multiple + * operations is fine as all callers are used either during mount/umount + * or quotaoff. + */ +#define XFS_DQ_LOOKUP_BATCH 32 + +STATIC int +xfs_qm_dquot_walk( + struct xfs_mount *mp, + int type, + int (*execute)(struct xfs_dquot *dqp, void *data), + void *data) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + uint32_t next_index; + int last_error = 0; + int skipped; + int nr_found; + +restart: + skipped = 0; + next_index = 0; + nr_found = 0; + + while (1) { + struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; + int error = 0; + int i; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)batch, + next_index, XFS_DQ_LOOKUP_BATCH); + if (!nr_found) { + mutex_unlock(&qi->qi_tree_lock); + break; + } + + for (i = 0; i < nr_found; i++) { + struct xfs_dquot *dqp = batch[i]; + + next_index = be32_to_cpu(dqp->q_core.d_id) + 1; + + error = execute(batch[i], data); + if (error == -EAGAIN) { + skipped++; + continue; + } + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + mutex_unlock(&qi->qi_tree_lock); + + /* bail out if the filesystem is corrupted. */ + if (last_error == -EFSCORRUPTED) { + skipped = 0; + break; + } + } + + if (skipped) { + delay(1); + goto restart; + } + + return last_error; +} + + +/* + * Purge a dquot from all tracking data structures and free it. + */ +STATIC int +xfs_qm_dqpurge( + struct xfs_dquot *dqp, + void *data) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + + xfs_dqlock(dqp); + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { + xfs_dqunlock(dqp); + return -EAGAIN; + } + + dqp->dq_flags |= XFS_DQ_FREEING; + + xfs_dqflock(dqp); + + /* + * If we are turning this type of quotas off, we don't care + * about the dirty metadata sitting in this dquot. OTOH, if + * we're unmounting, we do care, so we flush it and wait. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + /* + * We don't care about getting disk errors here. We need + * to purge this dquot anyway, so we go ahead regardless. + */ + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } + xfs_dqflock(dqp); + } + + ASSERT(atomic_read(&dqp->q_pincount) == 0); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); + qi->qi_dquots--; + + /* + * We move dquots to the freelist as soon as their reference count + * hits zero, so it really should be on the freelist here. + */ + ASSERT(!list_empty(&dqp->q_lru)); + list_lru_del(&qi->qi_lru, &dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + + xfs_qm_dqdestroy(dqp); + return 0; +} + +/* + * Purge the dquot cache. + */ +void +xfs_qm_dqpurge_all( + struct xfs_mount *mp, + uint flags) +{ + if (flags & XFS_QMOPT_UQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + if (flags & XFS_QMOPT_GQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); + if (flags & XFS_QMOPT_PQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); +} + +/* + * Just destroy the quotainfo structure. + */ +void +xfs_qm_unmount( + struct xfs_mount *mp) +{ + if (mp->m_quotainfo) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_destroy_quotainfo(mp); + } +} + +/* + * Called from the vfsops layer. + */ +void +xfs_qm_unmount_quotas( + xfs_mount_t *mp) +{ + /* + * Release the dquots that root inode, et al might be holding, + * before we flush quotas and blow away the quotainfo structure. + */ + ASSERT(mp->m_rootip); + xfs_qm_dqdetach(mp->m_rootip); + if (mp->m_rbmip) + xfs_qm_dqdetach(mp->m_rbmip); + if (mp->m_rsumip) + xfs_qm_dqdetach(mp->m_rsumip); + + /* + * Release the quota inodes. + */ + if (mp->m_quotainfo) { + if (mp->m_quotainfo->qi_uquotaip) { + IRELE(mp->m_quotainfo->qi_uquotaip); + mp->m_quotainfo->qi_uquotaip = NULL; + } + if (mp->m_quotainfo->qi_gquotaip) { + IRELE(mp->m_quotainfo->qi_gquotaip); + mp->m_quotainfo->qi_gquotaip = NULL; + } + if (mp->m_quotainfo->qi_pquotaip) { + IRELE(mp->m_quotainfo->qi_pquotaip); + mp->m_quotainfo->qi_pquotaip = NULL; + } + } +} + +STATIC int +xfs_qm_dqattach_one( + xfs_inode_t *ip, + xfs_dqid_t id, + uint type, + uint doalloc, + xfs_dquot_t **IO_idqpp) +{ + xfs_dquot_t *dqp; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + error = 0; + + /* + * See if we already have it in the inode itself. IO_idqpp is &i_udquot + * or &i_gdquot. This made the code look weird, but made the logic a lot + * simpler. + */ + dqp = *IO_idqpp; + if (dqp) { + trace_xfs_dqattach_found(dqp); + return 0; + } + + /* + * Find the dquot from somewhere. This bumps the reference count of + * dquot and returns it locked. This can return ENOENT if dquot didn't + * exist on disk and we didn't ask it to allocate; ESRCH if quotas got + * turned off suddenly. + */ + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); + if (error) + return error; + + trace_xfs_dqattach_get(dqp); + + /* + * dqget may have dropped and re-acquired the ilock, but it guarantees + * that the dquot returned is the one that should go in the inode. + */ + *IO_idqpp = dqp; + xfs_dqunlock(dqp); + return 0; +} + +static bool +xfs_qm_need_dqattach( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return false; + if (!XFS_IS_QUOTA_ON(mp)) + return false; + if (!XFS_NOT_DQATTACHED(mp, ip)) + return false; + if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return false; + return true; +} + +/* + * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON + * into account. + * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. + * Inode may get unlocked and relocked in here, and the caller must deal with + * the consequences. + */ +int +xfs_qm_dqattach_locked( + xfs_inode_t *ip, + uint flags) +{ + xfs_mount_t *mp = ip->i_mount; + int error = 0; + + if (!xfs_qm_need_dqattach(ip)) + return 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, + flags & XFS_QMOPT_DQALLOC, + &ip->i_udquot); + if (error) + goto done; + ASSERT(ip->i_udquot); + } + + if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + flags & XFS_QMOPT_DQALLOC, + &ip->i_gdquot); + if (error) + goto done; + ASSERT(ip->i_gdquot); + } + + if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { + error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + flags & XFS_QMOPT_DQALLOC, + &ip->i_pdquot); + if (error) + goto done; + ASSERT(ip->i_pdquot); + } + +done: + /* + * Don't worry about the dquots that we may have attached before any + * error - they'll get detached later if it has not already been done. + */ + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + return error; +} + +int +xfs_qm_dqattach( + struct xfs_inode *ip, + uint flags) +{ + int error; + + if (!xfs_qm_need_dqattach(ip)) + return 0; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqattach_locked(ip, flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Release dquots (and their references) if any. + * The inode should be locked EXCL except when this's called by + * xfs_ireclaim. + */ +void +xfs_qm_dqdetach( + xfs_inode_t *ip) +{ + if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) + return; + + trace_xfs_dquot_dqdetach(ip); + + ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino)); + if (ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if (ip->i_pdquot) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } +} + +struct xfs_qm_isolate { + struct list_head buffers; + struct list_head dispose; +}; + +static enum lru_status +xfs_qm_dquot_isolate( + struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *arg) + __releases(lru_lock) __acquires(lru_lock) +{ + struct xfs_dquot *dqp = container_of(item, + struct xfs_dquot, q_lru); + struct xfs_qm_isolate *isol = arg; + + if (!xfs_dqlock_nowait(dqp)) + goto out_miss_busy; + + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); + XFS_STATS_INC(xs_qm_dqwants); + + trace_xfs_dqreclaim_want(dqp); + list_lru_isolate(lru, &dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + return LRU_REMOVED; + } + + /* + * If the dquot is dirty, flush it. If it's already being flushed, just + * skip it so there is time for the IO to complete before we try to + * reclaim it again on the next LRU pass. + */ + if (!xfs_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + goto out_miss_busy; + } + + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* we have to drop the LRU lock to flush the dquot */ + spin_unlock(lru_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + __func__, dqp); + goto out_unlock_dirty; + } + + xfs_buf_delwri_queue(bp, &isol->buffers); + xfs_buf_relse(bp); + goto out_unlock_dirty; + } + xfs_dqfunlock(dqp); + + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); + + ASSERT(dqp->q_nrefs == 0); + list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); + XFS_STATS_DEC(xs_qm_dquot_unused); + trace_xfs_dqreclaim_done(dqp); + XFS_STATS_INC(xs_qm_dqreclaims); + return LRU_REMOVED; + +out_miss_busy: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + return LRU_SKIP; + +out_unlock_dirty: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + xfs_dqunlock(dqp); + spin_lock(lru_lock); + return LRU_RETRY; +} + +static unsigned long +xfs_qm_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + struct xfs_qm_isolate isol; + unsigned long freed; + int error; + + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + return 0; + + INIT_LIST_HEAD(&isol.buffers); + INIT_LIST_HEAD(&isol.dispose); + + freed = list_lru_shrink_walk(&qi->qi_lru, sc, + xfs_qm_dquot_isolate, &isol); + + error = xfs_buf_delwri_submit(&isol.buffers); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + + while (!list_empty(&isol.dispose)) { + struct xfs_dquot *dqp; + + dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); + xfs_qm_dqfree_one(dqp); + } + + return freed; +} + +static unsigned long +xfs_qm_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + + return list_lru_shrink_count(&qi->qi_lru, sc); +} + +/* + * This initializes all the quota information that's kept in the + * mount structure + */ +STATIC int +xfs_qm_init_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qinf; + int error; + xfs_dquot_t *dqp; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + + error = list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; + + /* + * See if quotainodes are setup, and if not, allocate them, + * and change the superblock accordingly. + */ + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; + + INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); + mutex_init(&qinf->qi_tree_lock); + + /* mutex used to serialize quotaoffs */ + mutex_init(&qinf->qi_quotaofflock); + + /* Precalc some constants */ + qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); + + mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); + + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * + * We look at the USR dquot with id == 0 first, but if user quotas + * are not enabled we goto the GRP dquot with id == 0. + * We don't really care to keep separate default limits for user + * and group quotas, at least not at this point. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. + */ + error = xfs_qm_dqread(mp, 0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DOWARN, &dqp); + if (!error) { + xfs_disk_dquot_t *ddqp = &dqp->q_core; + + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + qinf->qi_btimelimit = ddqp->d_btimer ? + be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = ddqp->d_itimer ? + be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? + be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = ddqp->d_bwarns ? + be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = ddqp->d_iwarns ? + be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? + be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; + qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + + xfs_qm_dqdestroy(dqp); + } else { + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + } + + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; + qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; + register_shrinker(&qinf->qi_shrinker); + return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; +} + + +/* + * Gets called when unmounting a filesystem or when all quotas get + * turned off. + * This purges the quota inodes, destroys locks and frees itself. + */ +void +xfs_qm_destroy_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qi; + + qi = mp->m_quotainfo; + ASSERT(qi != NULL); + + unregister_shrinker(&qi->qi_shrinker); + list_lru_destroy(&qi->qi_lru); + + if (qi->qi_uquotaip) { + IRELE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + IRELE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + if (qi->qi_pquotaip) { + IRELE(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } + mutex_destroy(&qi->qi_quotaofflock); + kmem_free(qi); + mp->m_quotainfo = NULL; +} + +/* + * Create an inode and return with a reference already taken, but unlocked + * This is how we create quota inodes + */ +STATIC int +xfs_qm_qino_alloc( + xfs_mount_t *mp, + xfs_inode_t **ip, + uint flags) +{ + xfs_trans_t *tp; + int error; + int committed; + bool need_alloc = true; + + *ip = NULL; + /* + * With superblock that doesn't have separate pquotino, we + * share an inode between gquota and pquota. If the on-disk + * superblock has GQUOTA and the filesystem is now mounted + * with PQUOTA, just use sb_gquotino for sb_pquotino and + * vice-versa. + */ + if (!xfs_sb_version_has_pquotino(&mp->m_sb) && + (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { + xfs_ino_t ino = NULLFSINO; + + if ((flags & XFS_QMOPT_PQUOTA) && + (mp->m_sb.sb_gquotino != NULLFSINO)) { + ino = mp->m_sb.sb_gquotino; + ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); + } else if ((flags & XFS_QMOPT_GQUOTA) && + (mp->m_sb.sb_pquotino != NULLFSINO)) { + ino = mp->m_sb.sb_pquotino; + ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); + } + if (ino != NULLFSINO) { + error = xfs_iget(mp, NULL, ino, 0, 0, ip); + if (error) + return error; + mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; + need_alloc = false; + } + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + if (need_alloc) { + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, + &committed); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + return error; + } + } + + /* + * Make the changes in the superblock, and log those too. + * sbfields arg may contain fields other than *QUOTINO; + * VERSIONNUM for example. + */ + spin_lock(&mp->m_sb_lock); + if (flags & XFS_QMOPT_SBVERSION) { + ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); + + xfs_sb_version_addquota(&mp->m_sb); + mp->m_sb.sb_uquotino = NULLFSINO; + mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; + + /* qflags will get updated fully _after_ quotacheck */ + mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; + } + if (flags & XFS_QMOPT_UQUOTA) + mp->m_sb.sb_uquotino = (*ip)->i_ino; + else if (flags & XFS_QMOPT_GQUOTA) + mp->m_sb.sb_gquotino = (*ip)->i_ino; + else + mp->m_sb.sb_pquotino = (*ip)->i_ino; + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_alert(mp, "%s failed (error %d)!", __func__, error); + } + if (need_alloc) + xfs_finish_inode_setup(*ip); + return error; +} + + +STATIC void +xfs_qm_reset_dqcounts( + xfs_mount_t *mp, + xfs_buf_t *bp, + xfs_dqid_t id, + uint type) +{ + struct xfs_dqblk *dqb; + int j; + + trace_xfs_reset_dqcounts(bp, _RET_IP_); + + /* + * Reset all counters and timers. They'll be + * started afresh by xfs_qm_quotacheck. + */ +#ifdef DEBUG + j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + do_div(j, sizeof(xfs_dqblk_t)); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); +#endif + dqb = bp->b_addr; + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + + /* + * Do a sanity check, and if needed, repair the dqblk. Don't + * output any warnings because it's perfectly possible to + * find uninitialised dquot blks. See comment in xfs_dqcheck. + */ + xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, + "xfs_quotacheck"); + /* + * Reset type in case we are reusing group quota file for + * project quotas or vice versa + */ + ddq->d_flags = type; + ddq->d_bcount = 0; + ddq->d_icount = 0; + ddq->d_rtbcount = 0; + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + } +} + +STATIC int +xfs_qm_dqiter_bufs( + struct xfs_mount *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags, + struct list_head *buffer_list) +{ + struct xfs_buf *bp; + int error; + int type; + + ASSERT(blkcnt > 0); + type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : + (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); + error = 0; + + /* + * Blkcnt arg can be a very big number, and might even be + * larger than the log itself. So, we have to break it up into + * manageable-sized transactions. + * Note that we don't start a permanent transaction here; we might + * not be able to get a log reservation for the whole thing up front, + * and we don't really care to either, because we just discard + * everything if we were to crash in the middle of this loop. + */ + while (blkcnt--) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); + + /* + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. + */ + if (error == -EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + + if (error) + break; + + /* + * A corrupt buffer might not have a verifier attached, so + * make sure we have the correct one attached before writeback + * occurs. + */ + bp->b_ops = &xfs_dquot_buf_ops; + xfs_qm_reset_dqcounts(mp, bp, firstid, type); + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); + + /* goto the next block. */ + bno++; + firstid += mp->m_quotainfo->qi_dqperchunk; + } + + return error; +} + +/* + * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a + * caller supplied function for every chunk of dquots that we find. + */ +STATIC int +xfs_qm_dqiterate( + struct xfs_mount *mp, + struct xfs_inode *qip, + uint flags, + struct list_head *buffer_list) +{ + struct xfs_bmbt_irec *map; + int i, nmaps; /* number of map entries */ + int error; /* return value */ + xfs_fileoff_t lblkno; + xfs_filblks_t maxlblkcnt; + xfs_dqid_t firstid; + xfs_fsblock_t rablkno; + xfs_filblks_t rablkcnt; + + error = 0; + /* + * This looks racy, but we can't keep an inode lock across a + * trans_reserve. But, this gets called during quotacheck, and that + * happens only at mount time which is single threaded. + */ + if (qip->i_d.di_nblocks == 0) + return 0; + + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + + lblkno = 0; + maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + do { + uint lock_mode; + + nmaps = XFS_DQITER_MAP_SIZE; + /* + * We aren't changing the inode itself. Just changing + * some of its data. No new blocks are added here, and + * the inode is never added to the transaction. + */ + lock_mode = xfs_ilock_data_map_shared(qip); + error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, + map, &nmaps, 0); + xfs_iunlock(qip, lock_mode); + if (error) + break; + + ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); + for (i = 0; i < nmaps; i++) { + ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + ASSERT(map[i].br_blockcount); + + + lblkno += map[i].br_blockcount; + + if (map[i].br_startblock == HOLESTARTBLOCK) + continue; + + firstid = (xfs_dqid_t) map[i].br_startoff * + mp->m_quotainfo->qi_dqperchunk; + /* + * Do a read-ahead on the next extent. + */ + if ((i+1 < nmaps) && + (map[i+1].br_startblock != HOLESTARTBLOCK)) { + rablkcnt = map[i+1].br_blockcount; + rablkno = map[i+1].br_startblock; + while (rablkcnt--) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, rablkno), + mp->m_quotainfo->qi_dqchunklen, + &xfs_dquot_buf_ops); + rablkno++; + } + } + /* + * Iterate thru all the blks in the extent and + * reset the counters of all the dquots inside them. + */ + error = xfs_qm_dqiter_bufs(mp, firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags, buffer_list); + if (error) + goto out; + } + } while (nmaps > 0); + +out: + kmem_free(map); + return error; +} + +/* + * Called by dqusage_adjust in doing a quotacheck. + * + * Given the inode, and a dquot id this updates both the incore dqout as well + * as the buffer copy. This is so that once the quotacheck is done, we can + * just log all the buffers, as opposed to logging numerous updates to + * individual dquots. + */ +STATIC int +xfs_qm_quotacheck_dqadjust( + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, + xfs_qcnt_t nblks, + xfs_qcnt_t rtblks) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget(mp, ip, id, type, + XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); + if (error) { + /* + * Shouldn't be able to turn off quotas here. + */ + ASSERT(error != -ESRCH); + ASSERT(error != -ENOENT); + return error; + } + + trace_xfs_dqadjust(dqp); + + /* + * Adjust the inode count and the block count to reflect this inode's + * resource usage. + */ + be64_add_cpu(&dqp->q_core.d_icount, 1); + dqp->q_res_icount++; + if (nblks) { + be64_add_cpu(&dqp->q_core.d_bcount, nblks); + dqp->q_res_bcount += nblks; + } + if (rtblks) { + be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); + dqp->q_res_rtbcount += rtblks; + } + + /* + * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. + */ + if (dqp->q_core.d_id) { + xfs_qm_adjust_dqlimits(mp, dqp); + xfs_qm_adjust_dqtimers(mp, &dqp->q_core); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_qm_dqput(dqp); + return 0; +} + +STATIC int +xfs_qm_get_rtblks( + xfs_inode_t *ip, + xfs_qcnt_t *O_rtblks) +{ + xfs_filblks_t rtblks; /* total rt blks */ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + int error; + + ASSERT(XFS_IS_REALTIME_INODE(ip)); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) + return error; + } + rtblks = 0; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0; idx < nextents; idx++) + rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); + *O_rtblks = (xfs_qcnt_t)rtblks; + return 0; +} + +/* + * callback routine supplied to bulkstat(). Given an inumber, find its + * dquots and update them to account for resources taken by that inode. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqusage_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + int *ubused, /* not used */ + int *res) /* result code value */ +{ + xfs_inode_t *ip; + xfs_qcnt_t nblks, rtblks = 0; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * rootino must have its resources accounted for, not so with the quota + * inodes. + */ + if (xfs_is_quota_inode(&mp->m_sb, ino)) { + *res = BULKSTAT_RV_NOTHING; + return -EINVAL; + } + + /* + * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget + * interface expects the inode to be exclusively locked because that's + * the case in all other instances. It's OK that we do this because + * quotacheck is done only at mount time. + */ + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); + if (error) { + *res = BULKSTAT_RV_NOTHING; + return error; + } + + ASSERT(ip->i_delayed_blks == 0); + + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * Walk thru the extent list and count the realtime blocks. + */ + error = xfs_qm_get_rtblks(ip, &rtblks); + if (error) + goto error0; + } + + nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; + + /* + * Add the (disk blocks and inode) resources occupied by this + * inode to its dquots. We do this adjustment in the incore dquot, + * and also copy the changes to its buffer. + * We don't care about putting these changes in a transaction + * envelope because if we crash in the middle of a 'quotacheck' + * we have to start from the beginning anyway. + * Once we're done, we'll log all the dquot bufs. + * + * The *QUOTA_ON checks below may look pretty racy, but quotachecks + * and quotaoffs don't race. (Quotachecks happen at mount time only). + */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, + XFS_DQ_USER, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, + XFS_DQ_GROUP, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_PQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), + XFS_DQ_PROJ, nblks, rtblks); + if (error) + goto error0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_DIDONE; + return 0; + +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_GIVEUP; + return error; +} + +STATIC int +xfs_qm_flush_one( + struct xfs_dquot *dqp, + void *data) +{ + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; + int error = 0; + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) + goto out_unlock; + if (!XFS_DQ_IS_DIRTY(dqp)) + goto out_unlock; + + xfs_dqflock(dqp); + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; + + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); +out_unlock: + xfs_dqunlock(dqp); + return error; +} + +/* + * Walk thru all the filesystem inodes and construct a consistent view + * of the disk quota world. If the quotacheck fails, disable quotas. + */ +STATIC int +xfs_qm_quotacheck( + xfs_mount_t *mp) +{ + int done, count, error, error2; + xfs_ino_t lastino; + size_t structsz; + uint flags; + LIST_HEAD (buffer_list); + struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; + struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; + struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip; + + count = INT_MAX; + structsz = 1; + lastino = 0; + flags = 0; + + ASSERT(uip || gip || pip); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + xfs_notice(mp, "Quotacheck needed: Please wait."); + + /* + * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset + * their counters to zero. We need a clean slate. + * We don't log our changes till later. + */ + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_UQUOTA_CHKD; + } + + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_GQUOTA_CHKD; + } + + if (pip) { + error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_PQUOTA_CHKD; + } + + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters in core. + */ + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_dqusage_adjust, + structsz, NULL, &done); + if (error) + break; + + } while (!done); + + /* + * We've made all the changes that we need to make incore. Flush them + * down to disk buffers if everything was updated successfully. + */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + &buffer_list); + } + if (XFS_IS_GQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + &buffer_list); + if (!error) + error = error2; + } + if (XFS_IS_PQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + &buffer_list); + if (!error) + error = error2; + } + + error2 = xfs_buf_delwri_submit(&buffer_list); + if (!error) + error = error2; + + /* + * We can get this error if we couldn't do a dquot allocation inside + * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the + * dirty dquots that might be cached, we just want to get rid of them + * and turn quotaoff. The dquots won't be attached to any of the inodes + * at this point (because we intentionally didn't in dqget_noattach). + */ + if (error) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + goto error_return; + } + + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; + mp->m_qflags |= flags; + + error_return: + while (!list_empty(&buffer_list)) { + struct xfs_buf *bp = + list_first_entry(&buffer_list, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } + + if (error) { + xfs_warn(mp, + "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", + error); + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo != NULL); + xfs_qm_destroy_quotainfo(mp); + if (xfs_mount_reset_sbqflags(mp)) { + xfs_warn(mp, + "Quotacheck: Failed to reset quota flags."); + } + } else + xfs_notice(mp, "Quotacheck: Done."); + return error; +} + +/* + * This is called from xfs_mountfs to start quotas and initialize all + * necessary data structures like quotainfo. This is also responsible for + * running a quotacheck as necessary. We are guaranteed that the superblock + * is consistently read in at this point. + * + * If we fail here, the mount will continue with quota turned off. We don't + * need to inidicate success or failure at all. + */ +void +xfs_qm_mount_quotas( + struct xfs_mount *mp) +{ + int error = 0; + uint sbf; + + /* + * If quotas on realtime volumes is not supported, we disable + * quotas immediately. + */ + if (mp->m_sb.sb_rextents) { + xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); + mp->m_qflags = 0; + goto write_changes; + } + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Allocate the quotainfo structure inside the mount struct, and + * create quotainode(s), and change/rev superblock if necessary. + */ + error = xfs_qm_init_quotainfo(mp); + if (error) { + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo == NULL); + mp->m_qflags = 0; + goto write_changes; + } + /* + * If any of the quotas are not consistent, do a quotacheck. + */ + if (XFS_QM_NEED_QUOTACHECK(mp)) { + error = xfs_qm_quotacheck(mp); + if (error) { + /* Quotacheck failed and disabled quotas. */ + return; + } + } + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + if (!XFS_IS_UQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_UQUOTA_CHKD; + if (!XFS_IS_GQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_GQUOTA_CHKD; + if (!XFS_IS_PQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_PQUOTA_CHKD; + + write_changes: + /* + * We actually don't have to acquire the m_sb_lock at all. + * This can only be called from mount, and that's single threaded. XXX + */ + spin_lock(&mp->m_sb_lock); + sbf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { + if (xfs_sync_sb(mp, false)) { + /* + * We could only have been turning quotas off. + * We aren't in very good shape actually because + * the incore structures are convinced that quotas are + * off, but the on disk superblock doesn't know that ! + */ + ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + xfs_alert(mp, "%s: Superblock update failed!", + __func__); + } + } + + if (error) { + xfs_warn(mp, "Failed to initialize disk quotas."); + return; + } +} + +/* + * This is called after the superblock has been read in and we're ready to + * iget the quota inodes. + */ +STATIC int +xfs_qm_init_quotainos( + xfs_mount_t *mp) +{ + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; + int error; + uint flags = 0; + + ASSERT(mp->m_quotainfo); + + /* + * Get the uquota and gquota inodes + */ + if (xfs_sb_version_hasquota(&mp->m_sb)) { + if (XFS_IS_UQUOTA_ON(mp) && + mp->m_sb.sb_uquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_uquotino > 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip); + if (error) + return error; + } + if (XFS_IS_GQUOTA_ON(mp) && + mp->m_sb.sb_gquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_gquotino > 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip); + if (error) + goto error_rele; + } + if (XFS_IS_PQUOTA_ON(mp) && + mp->m_sb.sb_pquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_pquotino > 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip); + if (error) + goto error_rele; + } + } else { + flags |= XFS_QMOPT_SBVERSION; + } + + /* + * Create the three inodes, if they don't exist already. The changes + * made above will get added to a transaction and logged in one of + * the qino_alloc calls below. If the device is readonly, + * temporarily switch to read-write to do this. + */ + if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { + error = xfs_qm_qino_alloc(mp, &uip, + flags | XFS_QMOPT_UQUOTA); + if (error) + goto error_rele; + + flags &= ~XFS_QMOPT_SBVERSION; + } + if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { + error = xfs_qm_qino_alloc(mp, &gip, + flags | XFS_QMOPT_GQUOTA); + if (error) + goto error_rele; + + flags &= ~XFS_QMOPT_SBVERSION; + } + if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { + error = xfs_qm_qino_alloc(mp, &pip, + flags | XFS_QMOPT_PQUOTA); + if (error) + goto error_rele; + } + + mp->m_quotainfo->qi_uquotaip = uip; + mp->m_quotainfo->qi_gquotaip = gip; + mp->m_quotainfo->qi_pquotaip = pip; + + return 0; + +error_rele: + if (uip) + IRELE(uip); + if (gip) + IRELE(gip); + if (pip) + IRELE(pip); + return error; +} + +STATIC void +xfs_qm_dqfree_one( + struct xfs_dquot *dqp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + + mutex_lock(&qi->qi_tree_lock); + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); + + qi->qi_dquots--; + mutex_unlock(&qi->qi_tree_lock); + + xfs_qm_dqdestroy(dqp); +} + +/* --------------- utility functions for vnodeops ---------------- */ + + +/* + * Given an inode, a uid, gid and prid make sure that we have + * allocated relevant dquot(s) on disk, and that we won't exceed inode + * quotas by creating this file. + * This also attaches dquot(s) to the given inode after locking it, + * and returns the dquots corresponding to the uid and/or gid. + * + * in : inode (unlocked) + * out : udquot, gdquot with references taken and unlocked + */ +int +xfs_qm_vop_dqalloc( + struct xfs_inode *ip, + xfs_dqid_t uid, + xfs_dqid_t gid, + prid_t prid, + uint flags, + struct xfs_dquot **O_udqpp, + struct xfs_dquot **O_gdqpp, + struct xfs_dquot **O_pdqpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *uq = NULL; + struct xfs_dquot *gq = NULL; + struct xfs_dquot *pq = NULL; + int error; + uint lockflags; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + lockflags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockflags); + + if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) + gid = ip->i_d.di_gid; + + /* + * Attach the dquot(s) to this inode, doing a dquot allocation + * if necessary. The dquot(s) will not be locked. + */ + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC); + if (error) { + xfs_iunlock(ip, lockflags); + return error; + } + } + + if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { + if (ip->i_d.di_uid != uid) { + /* + * What we need is the dquot that has this uid, and + * if we send the inode to dqget, the uid of the inode + * takes priority over what's sent in the uid argument. + * We must unlock inode here before calling dqget if + * we're not sending the inode, because otherwise + * we'll deadlock by doing trans_reserve while + * holding ilock. + */ + xfs_iunlock(ip, lockflags); + error = xfs_qm_dqget(mp, NULL, uid, + XFS_DQ_USER, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &uq); + if (error) { + ASSERT(error != -ENOENT); + return error; + } + /* + * Get the ilock in the right order. + */ + xfs_dqunlock(uq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + /* + * Take an extra reference, because we'll return + * this to caller + */ + ASSERT(ip->i_udquot); + uq = xfs_qm_dqhold(ip->i_udquot); + } + } + if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { + if (ip->i_d.di_gid != gid) { + xfs_iunlock(ip, lockflags); + error = xfs_qm_dqget(mp, NULL, gid, + XFS_DQ_GROUP, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &gq); + if (error) { + ASSERT(error != -ENOENT); + goto error_rele; + } + xfs_dqunlock(gq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_gdquot); + gq = xfs_qm_dqhold(ip->i_gdquot); + } + } + if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + if (xfs_get_projid(ip) != prid) { + xfs_iunlock(ip, lockflags); + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, + XFS_DQ_PROJ, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &pq); + if (error) { + ASSERT(error != -ENOENT); + goto error_rele; + } + xfs_dqunlock(pq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_pdquot); + pq = xfs_qm_dqhold(ip->i_pdquot); + } + } + if (uq) + trace_xfs_dquot_dqalloc(ip); + + xfs_iunlock(ip, lockflags); + if (O_udqpp) + *O_udqpp = uq; + else + xfs_qm_dqrele(uq); + if (O_gdqpp) + *O_gdqpp = gq; + else + xfs_qm_dqrele(gq); + if (O_pdqpp) + *O_pdqpp = pq; + else + xfs_qm_dqrele(pq); + return 0; + +error_rele: + xfs_qm_dqrele(gq); + xfs_qm_dqrele(uq); + return error; +} + +/* + * Actually transfer ownership, and do dquot modifications. + * These were already reserved. + */ +xfs_dquot_t * +xfs_qm_vop_chown( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t **IO_olddq, + xfs_dquot_t *newdq) +{ + xfs_dquot_t *prevdq; + uint bfield = XFS_IS_REALTIME_INODE(ip) ? + XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; + + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + + /* old dquot */ + prevdq = *IO_olddq; + ASSERT(prevdq); + ASSERT(prevdq != newdq); + + xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); + xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); + + /* the sparkling new dquot */ + xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); + + /* + * Take an extra reference, because the inode is going to keep + * this dquot pointer even after the trans_commit. + */ + *IO_olddq = xfs_qm_dqhold(newdq); + + return prevdq; +} + +/* + * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). + */ +int +xfs_qm_vop_chown_reserve( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + uint flags) +{ + struct xfs_mount *mp = ip->i_mount; + uint delblks, blkflags, prjflags = 0; + struct xfs_dquot *udq_unres = NULL; + struct xfs_dquot *gdq_unres = NULL; + struct xfs_dquot *pdq_unres = NULL; + struct xfs_dquot *udq_delblks = NULL; + struct xfs_dquot *gdq_delblks = NULL; + struct xfs_dquot *pdq_delblks = NULL; + int error; + + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + delblks = ip->i_delayed_blks; + blkflags = XFS_IS_REALTIME_INODE(ip) ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; + + if (XFS_IS_UQUOTA_ON(mp) && udqp && + ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { + udq_delblks = udqp; + /* + * If there are delayed allocation blocks, then we have to + * unreserve those from the old dquot, and add them to the + * new dquot. + */ + if (delblks) { + ASSERT(ip->i_udquot); + udq_unres = ip->i_udquot; + } + } + if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && + ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { + gdq_delblks = gdqp; + if (delblks) { + ASSERT(ip->i_gdquot); + gdq_unres = ip->i_gdquot; + } + } + + if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && + xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) { + prjflags = XFS_QMOPT_ENOSPC; + pdq_delblks = pdqp; + if (delblks) { + ASSERT(ip->i_pdquot); + pdq_unres = ip->i_pdquot; + } + } + + error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, + udq_delblks, gdq_delblks, pdq_delblks, + ip->i_d.di_nblocks, 1, + flags | blkflags | prjflags); + if (error) + return error; + + /* + * Do the delayed blks reservations/unreservations now. Since, these + * are done without the help of a transaction, if a reservation fails + * its previous reservations won't be automatically undone by trans + * code. So, we have to do it manually here. + */ + if (delblks) { + /* + * Do the reservations first. Unreservation can't fail. + */ + ASSERT(udq_delblks || gdq_delblks || pdq_delblks); + ASSERT(udq_unres || gdq_unres || pdq_unres); + error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + udq_delblks, gdq_delblks, pdq_delblks, + (xfs_qcnt_t)delblks, 0, + flags | blkflags | prjflags); + if (error) + return error; + xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + udq_unres, gdq_unres, pdq_unres, + -((xfs_qcnt_t)delblks), 0, blkflags); + } + + return 0; +} + +int +xfs_qm_vop_rename_dqattach( + struct xfs_inode **i_tab) +{ + struct xfs_mount *mp = i_tab[0]->i_mount; + int i; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + for (i = 0; (i < 4 && i_tab[i]); i++) { + struct xfs_inode *ip = i_tab[i]; + int error; + + /* + * Watch out for duplicate entries in the table. + */ + if (i == 0 || ip != i_tab[i-1]) { + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + } + } + } + return 0; +} + +void +xfs_qm_vop_create_dqattach( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + if (udqp && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + + ip->i_udquot = xfs_qm_dqhold(udqp); + xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(ip->i_gdquot == NULL); + ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ip->i_gdquot = xfs_qm_dqhold(gdqp); + xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { + ASSERT(ip->i_pdquot == NULL); + ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); + + ip->i_pdquot = xfs_qm_dqhold(pdqp); + xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); + } +} + diff --git a/kernel/fs/xfs/xfs_qm.h b/kernel/fs/xfs/xfs_qm.h new file mode 100644 index 000000000..996a04064 --- /dev/null +++ b/kernel/fs/xfs/xfs_qm.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QM_H__ +#define __XFS_QM_H__ + +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" + +struct xfs_inode; + +extern struct kmem_zone *xfs_qm_dqtrxzone; + +/* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +/* + * This defines the unit of allocation of dquots. + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * XXXsup However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + +/* + * Various quota information for individual filesystems. + * The mount structure keeps a pointer to this. + */ +typedef struct xfs_quotainfo { + struct radix_tree_root qi_uquota_tree; + struct radix_tree_root qi_gquota_tree; + struct radix_tree_root qi_pquota_tree; + struct mutex qi_tree_lock; + struct xfs_inode *qi_uquotaip; /* user quota inode */ + struct xfs_inode *qi_gquotaip; /* group quota inode */ + struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct list_lru qi_lru; + int qi_dquots; + time_t qi_btimelimit; /* limit for blks timer */ + time_t qi_itimelimit; /* limit for inodes timer */ + time_t qi_rtbtimelimit;/* limit for rt blks timer */ + xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ + xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ + xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ + struct mutex qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ + xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ + struct shrinker qi_shrinker; +} xfs_quotainfo_t; + +static inline struct radix_tree_root * +xfs_dquot_tree( + struct xfs_quotainfo *qi, + int type) +{ + switch (type) { + case XFS_DQ_USER: + return &qi->qi_uquota_tree; + case XFS_DQ_GROUP: + return &qi->qi_gquota_tree; + case XFS_DQ_PROJ: + return &qi->qi_pquota_tree; + default: + ASSERT(0); + } + return NULL; +} + +static inline struct xfs_inode * +xfs_dq_to_quota_inode(struct xfs_dquot *dqp) +{ + switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return dqp->q_mount->m_quotainfo->qi_uquotaip; + case XFS_DQ_GROUP: + return dqp->q_mount->m_quotainfo->qi_gquotaip; + case XFS_DQ_PROJ: + return dqp->q_mount->m_quotainfo->qi_pquotaip; + default: + ASSERT(0); + } + return NULL; +} + +extern void xfs_trans_mod_dquot(struct xfs_trans *, + struct xfs_dquot *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, struct xfs_dquot *, + long, long, uint); +extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); +extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); + +/* + * We keep the usr, grp, and prj dquots separately so that locking will be + * easier to do at commit time. All transactions that we know of at this point + * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. + */ +enum { + XFS_QM_TRANS_USR = 0, + XFS_QM_TRANS_GRP, + XFS_QM_TRANS_PRJ, + XFS_QM_TRANS_DQTYPES +}; +#define XFS_QM_TRANS_MAXDQS 2 +struct xfs_dquot_acct { + struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; +}; + +/* + * Users are allowed to have a usage exceeding their softlimit for + * a period this long. + */ +#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ + +#define XFS_QM_BWARNLIMIT 5 +#define XFS_QM_IWARNLIMIT 5 +#define XFS_QM_RTBWARNLIMIT 5 + +extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); + +/* dquot stuff */ +extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); +extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); + +/* quota ops */ +extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct qc_dqblk *); +extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, + struct qc_dqblk *); +extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); +extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); + +#endif /* __XFS_QM_H__ */ diff --git a/kernel/fs/xfs/xfs_qm_bhv.c b/kernel/fs/xfs/xfs_qm_bhv.c new file mode 100644 index 000000000..3e52d5de7 --- /dev/null +++ b/kernel/fs/xfs/xfs_qm_bhv.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_qm.h" + + +STATIC void +xfs_fill_statvfs_from_dquot( + struct kstatfs *statp, + struct xfs_dquot *dqp) +{ + __uint64_t limit; + + limit = dqp->q_core.d_blk_softlimit ? + be64_to_cpu(dqp->q_core.d_blk_softlimit) : + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + if (limit && statp->f_blocks > limit) { + statp->f_blocks = limit; + statp->f_bfree = statp->f_bavail = + (statp->f_blocks > dqp->q_res_bcount) ? + (statp->f_blocks - dqp->q_res_bcount) : 0; + } + + limit = dqp->q_core.d_ino_softlimit ? + be64_to_cpu(dqp->q_core.d_ino_softlimit) : + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + if (limit && statp->f_files > limit) { + statp->f_files = limit; + statp->f_ffree = + (statp->f_files > dqp->q_res_icount) ? + (statp->f_ffree - dqp->q_res_icount) : 0; + } +} + + +/* + * Directory tree accounting is implemented using project quotas, where + * the project identifier is inherited from parent directories. + * A statvfs (df, etc.) of a directory that is using project quota should + * return a statvfs of the project, not the entire filesystem. + * This makes such trees appear as if they are filesystems in themselves. + */ +void +xfs_qm_statvfs( + xfs_inode_t *ip, + struct kstatfs *statp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_dquot_t *dqp; + + if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { + xfs_fill_statvfs_from_dquot(statp, dqp); + xfs_qm_dqput(dqp); + } +} + +int +xfs_qm_newmount( + xfs_mount_t *mp, + uint *needquotamount, + uint *quotaflags) +{ + uint quotaondisk; + uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; + + quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && + (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); + + if (quotaondisk) { + uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT; + pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT; + gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT; + } + + /* + * If the device itself is read-only, we can't allow + * the user to change the state of quota on the mount - + * this would generate a transaction on the ro device, + * which would lead to an I/O error and shutdown + */ + + if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || + (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || + (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || + (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || + (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || + (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && + xfs_dev_is_read_only(mp, "changing quota state")) { + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? " prjquota" : "")); + return -EPERM; + } + + if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { + /* + * Call mount_quotas at this point only if we won't have to do + * a quotacheck. + */ + if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { + /* + * If an error occurred, qm_mount_quotas code + * has already disabled quotas. So, just finish + * mounting, and get on with the boring life + * without disk quotas. + */ + xfs_qm_mount_quotas(mp); + } else { + /* + * Clear the quota flags, but remember them. This + * is so that the quota code doesn't get invoked + * before we're ready. This can happen when an + * inode goes inactive and wants to free blocks, + * or via xfs_log_mount_finish. + */ + *needquotamount = true; + *quotaflags = mp->m_qflags; + mp->m_qflags = 0; + } + } + + return 0; +} diff --git a/kernel/fs/xfs/xfs_qm_syscalls.c b/kernel/fs/xfs/xfs_qm_syscalls.c new file mode 100644 index 000000000..9a25c9275 --- /dev/null +++ b/kernel/fs/xfs/xfs_qm_syscalls.c @@ -0,0 +1,770 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_trace.h" +#include "xfs_icache.h" + +STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); +STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, + uint); + +/* + * Turn off quota accounting and/or enforcement for all udquots and/or + * gdquots. Called only at unmount time. + * + * This assumes that there are no dquots of this file system cached + * incore, and modifies the ondisk dquot directly. Therefore, for example, + * it is an error to call this twice, without purging the cache. + */ +int +xfs_qm_scall_quotaoff( + xfs_mount_t *mp, + uint flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + uint dqtype; + int error; + uint inactivate_flags; + xfs_qoff_logitem_t *qoffstart; + + /* + * No file system can have quotas enabled on disk but not in core. + * Note that quota utilities (like quotaoff) _expect_ + * errno == -EEXIST here. + */ + if ((mp->m_qflags & flags) == 0) + return -EEXIST; + error = 0; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + + /* + * We don't want to deal with two quotaoffs messing up each other, + * so we're going to serialize it. quotaoff isn't exactly a performance + * critical thing. + * If quotaoff, then we must be dealing with the root filesystem. + */ + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); + + /* + * If we're just turning off quota enforcement, change mp and go. + */ + if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { + mp->m_qflags &= ~(flags); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = mp->m_qflags; + spin_unlock(&mp->m_sb_lock); + mutex_unlock(&q->qi_quotaofflock); + + /* XXX what to do if error ? Revert back to old vals incore ? */ + return xfs_sync_sb(mp, false); + } + + dqtype = 0; + inactivate_flags = 0; + /* + * If accounting is off, we must turn enforcement off, clear the + * quota 'CHKD' certificate to make it known that we have to + * do a quotacheck the next time this quota is turned on. + */ + if (flags & XFS_UQUOTA_ACCT) { + dqtype |= XFS_QMOPT_UQUOTA; + flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); + inactivate_flags |= XFS_UQUOTA_ACTIVE; + } + if (flags & XFS_GQUOTA_ACCT) { + dqtype |= XFS_QMOPT_GQUOTA; + flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); + inactivate_flags |= XFS_GQUOTA_ACTIVE; + } + if (flags & XFS_PQUOTA_ACCT) { + dqtype |= XFS_QMOPT_PQUOTA; + flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); + inactivate_flags |= XFS_PQUOTA_ACTIVE; + } + + /* + * Nothing to do? Don't complain. This happens when we're just + * turning off quota enforcement. + */ + if ((mp->m_qflags & flags) == 0) + goto out_unlock; + + /* + * Write the LI_QUOTAOFF log record, and do SB changes atomically, + * and synchronously. If we fail to write, we should abort the + * operation as it cannot be recovered safely if we crash. + */ + error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); + if (error) + goto out_unlock; + + /* + * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct + * to take care of the race between dqget and quotaoff. We don't take + * any special locks to reset these bits. All processes need to check + * these bits *after* taking inode lock(s) to see if the particular + * quota type is in the process of being turned off. If *ACTIVE, it is + * guaranteed that all dquot structures and all quotainode ptrs will all + * stay valid as long as that inode is kept locked. + * + * There is no turning back after this. + */ + mp->m_qflags &= ~inactivate_flags; + + /* + * Give back all the dquot reference(s) held by inodes. + * Here we go thru every single incore inode in this file system, and + * do a dqrele on the i_udquot/i_gdquot that it may have. + * Essentially, as long as somebody has an inode locked, this guarantees + * that quotas will not be turned off. This is handy because in a + * transaction once we lock the inode(s) and check for quotaon, we can + * depend on the quota inodes (and other things) being valid as long as + * we keep the lock(s). + */ + xfs_qm_dqrele_all_inodes(mp, flags); + + /* + * Next we make the changes in the quota flag in the mount struct. + * This isn't protected by a particular lock directly, because we + * don't want to take a mrlock every time we depend on quotas being on. + */ + mp->m_qflags &= ~flags; + + /* + * Go through all the dquots of this file system and purge them, + * according to what was turned off. + */ + xfs_qm_dqpurge_all(mp, dqtype); + + /* + * Transactions that had started before ACTIVE state bit was cleared + * could have logged many dquots, so they'd have higher LSNs than + * the first QUOTAOFF log record does. If we happen to crash when + * the tail of the log has gone past the QUOTAOFF record, but + * before the last dquot modification, those dquots __will__ + * recover, and that's not good. + * + * So, we have QUOTAOFF start and end logitems; the start + * logitem won't get overwritten until the end logitem appears... + */ + error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + if (error) { + /* We're screwed now. Shutdown is the only option. */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_unlock; + } + + /* + * If all quotas are completely turned off, close shop. + */ + if (mp->m_qflags == 0) { + mutex_unlock(&q->qi_quotaofflock); + xfs_qm_destroy_quotainfo(mp); + return 0; + } + + /* + * Release our quotainode references if we don't need them anymore. + */ + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; + } + if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; + } + if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) { + IRELE(q->qi_pquotaip); + q->qi_pquotaip = NULL; + } + +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +STATIC int +xfs_qm_scall_trunc_qfile( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + struct xfs_trans *tp; + int error; + + if (ino == NULLFSINO) + return 0; + + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + return error; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + goto out_put; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + goto out_unlock; + } + + ASSERT(ip->i_d.di_nextents == 0); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_put: + IRELE(ip); + return error; +} + +int +xfs_qm_scall_trunc_qfiles( + xfs_mount_t *mp, + uint flags) +{ + int error = -EINVAL; + + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || + (flags & ~XFS_DQ_ALLTYPES)) { + xfs_debug(mp, "%s: flags=%x m_qflags=%x", + __func__, flags, mp->m_qflags); + return -EINVAL; + } + + if (flags & XFS_DQ_USER) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); + if (error) + return error; + } + if (flags & XFS_DQ_GROUP) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_PROJ) + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); + + return error; +} + +/* + * Switch on (a given) quota enforcement for a filesystem. This takes + * effect immediately. + * (Switching on quota accounting must be done at mount time.) + */ +int +xfs_qm_scall_quotaon( + xfs_mount_t *mp, + uint flags) +{ + int error; + uint qf; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + /* + * Switching on quota accounting must be done at mount time. + */ + flags &= ~(XFS_ALL_QUOTA_ACCT); + + if (flags == 0) { + xfs_debug(mp, "%s: zero flags, m_qflags=%x", + __func__, mp->m_qflags); + return -EINVAL; + } + + /* + * Can't enforce without accounting. We check the superblock + * qflags here instead of m_qflags because rootfs can have + * quota acct on ondisk without m_qflags' knowing. + */ + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) || + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ENFD)) || + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_PQUOTA_ENFD))) { + xfs_debug(mp, + "%s: Can't enforce without acct, flags=%x sbflags=%x", + __func__, flags, mp->m_sb.sb_qflags); + return -EINVAL; + } + /* + * If everything's up to-date incore, then don't waste time. + */ + if ((mp->m_qflags & flags) == flags) + return -EEXIST; + + /* + * Change sb_qflags on disk but not incore mp->qflags + * if this is the root filesystem. + */ + spin_lock(&mp->m_sb_lock); + qf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = qf | flags; + spin_unlock(&mp->m_sb_lock); + + /* + * There's nothing to change if it's the same. + */ + if ((qf & flags) == flags) + return -EEXIST; + + error = xfs_sync_sb(mp, false); + if (error) + return error; + /* + * If we aren't trying to switch on quota enforcement, we are done. + */ + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) != + (mp->m_qflags & XFS_UQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != + (mp->m_qflags & XFS_PQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != + (mp->m_qflags & XFS_GQUOTA_ACCT))) + return 0; + + if (! XFS_IS_QUOTA_RUNNING(mp)) + return -ESRCH; + + /* + * Switch on quota enforcement in core. + */ + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); + + return 0; +} + +#define XFS_QC_MASK \ + (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) + +/* + * Adjust quota limits, and start/stop timers accordingly. + */ +int +xfs_qm_scall_setqlim( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct qc_dqblk *newlim) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *ddq; + struct xfs_dquot *dqp; + struct xfs_trans *tp; + int error; + xfs_qcnt_t hard, soft; + + if (newlim->d_fieldmask & ~XFS_QC_MASK) + return -EINVAL; + if ((newlim->d_fieldmask & XFS_QC_MASK) == 0) + return 0; + + /* + * We don't want to race with a quotaoff so take the quotaoff lock. + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. + */ + mutex_lock(&q->qi_quotaofflock); + + /* + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. + */ + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { + ASSERT(error != -ENOENT); + goto out_unlock; + } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); + xfs_trans_dqjoin(tp, dqp); + ddq = &dqp->q_core; + + /* + * Make sure that hardlimits are >= soft limits before changing. + */ + hard = (newlim->d_fieldmask & QC_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : + be64_to_cpu(ddq->d_blk_hardlimit); + soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : + be64_to_cpu(ddq->d_blk_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_blk_hardlimit = cpu_to_be64(hard); + ddq->d_blk_softlimit = cpu_to_be64(soft); + xfs_dquot_set_prealloc_limits(dqp); + if (id == 0) { + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; + } + } else { + xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); + } + hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : + be64_to_cpu(ddq->d_rtb_hardlimit); + soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : + be64_to_cpu(ddq->d_rtb_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_rtb_hardlimit = cpu_to_be64(hard); + ddq->d_rtb_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; + } + } else { + xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); + } + + hard = (newlim->d_fieldmask & QC_INO_HARD) ? + (xfs_qcnt_t) newlim->d_ino_hardlimit : + be64_to_cpu(ddq->d_ino_hardlimit); + soft = (newlim->d_fieldmask & QC_INO_SOFT) ? + (xfs_qcnt_t) newlim->d_ino_softlimit : + be64_to_cpu(ddq->d_ino_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_ino_hardlimit = cpu_to_be64(hard); + ddq->d_ino_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; + } + } else { + xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); + } + + /* + * Update warnings counter(s) if requested + */ + if (newlim->d_fieldmask & QC_SPC_WARNS) + ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns); + if (newlim->d_fieldmask & QC_INO_WARNS) + ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns); + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); + + if (id == 0) { + /* + * Timelimits for the super user set the relative time + * the other users can be over quota for this file system. + * If it is zero a default is used. Ditto for the default + * soft and hard limit values (already done, above), and + * for warnings. + */ + if (newlim->d_fieldmask & QC_SPC_TIMER) { + q->qi_btimelimit = newlim->d_spc_timer; + ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); + } + if (newlim->d_fieldmask & QC_INO_TIMER) { + q->qi_itimelimit = newlim->d_ino_timer; + ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); + } + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) { + q->qi_rtbtimelimit = newlim->d_rt_spc_timer; + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); + } + if (newlim->d_fieldmask & QC_SPC_WARNS) + q->qi_bwarnlimit = newlim->d_spc_warns; + if (newlim->d_fieldmask & QC_INO_WARNS) + q->qi_iwarnlimit = newlim->d_ino_warns; + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + q->qi_rtbwarnlimit = newlim->d_rt_spc_warns; + } else { + /* + * If the user is now over quota, start the timelimit. + * The user will not be 'warned'. + * Note that we keep the timers ticking, whether enforcement + * is on or off. We don't really want to bother with iterating + * over all ondisk dquots and turning the timers on/off. + */ + xfs_qm_adjust_dqtimers(mp, ddq); + } + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_trans_log_dquot(tp, dqp); + + error = xfs_trans_commit(tp, 0); + +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +STATIC int +xfs_qm_log_quotaoff_end( + xfs_mount_t *mp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + qoffi = xfs_trans_get_qoff_item(tp, startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return error; +} + + +STATIC int +xfs_qm_log_quotaoff( + xfs_mount_t *mp, + xfs_qoff_logitem_t **qoffstartp, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi; + + *qoffstartp = NULL; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out; + } + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + xfs_log_sb(tp); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + if (error) + goto out; + + *qoffstartp = qoffi; +out: + return error; +} + + +int +xfs_qm_scall_getquota( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct qc_dqblk *dst) +{ + struct xfs_dquot *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so + * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't + * exist, we'll get ENOENT back. + */ + error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); + if (error) + return error; + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + error = -ENOENT; + goto out_put; + } + + memset(dst, 0, sizeof(*dst)); + dst->d_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); + dst->d_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount); + dst->d_ino_count = dqp->q_res_icount; + dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); + dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); + dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); + dst->d_rt_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); + dst->d_rt_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount); + dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + + /* + * Internally, we don't reset all the timers when quota enforcement + * gets turned off. No need to confuse the user level code, + * so return zeroes in that case. + */ + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_USER) || + (!XFS_IS_GQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_GROUP) || + (!XFS_IS_PQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_PROJ)) { + dst->d_spc_timer = 0; + dst->d_ino_timer = 0; + dst->d_rt_spc_timer = 0; + } + +#ifdef DEBUG + if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || + (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || + (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && + id != 0) { + if ((dst->d_space > dst->d_spc_softlimit) && + (dst->d_spc_softlimit > 0)) { + ASSERT(dst->d_spc_timer != 0); + } + if ((dst->d_ino_count > dst->d_ino_softlimit) && + (dst->d_ino_softlimit > 0)) { + ASSERT(dst->d_ino_timer != 0); + } + } +#endif +out_put: + xfs_qm_dqput(dqp); + return error; +} + + +STATIC int +xfs_dqrele_inode( + struct xfs_inode *ip, + int flags, + void *args) +{ + /* skip quota inodes */ + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip || + ip == ip->i_mount->m_quotainfo->qi_pquotaip) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_gdquot == NULL); + ASSERT(ip->i_pdquot == NULL); + return 0; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + + +/* + * Go thru all the inodes in the file system, releasing their dquots. + * + * Note that the mount structure gets modified to indicate that quotas are off + * AFTER this, in the case of quotaoff. + */ +void +xfs_qm_dqrele_all_inodes( + struct xfs_mount *mp, + uint flags) +{ + ASSERT(mp->m_quotainfo); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); +} diff --git a/kernel/fs/xfs/xfs_quota.h b/kernel/fs/xfs/xfs_quota.h new file mode 100644 index 000000000..5376dd406 --- /dev/null +++ b/kernel/fs/xfs/xfs_quota.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_H__ +#define __XFS_QUOTA_H__ + +#include "xfs_quota_defs.h" + +/* + * Kernel only quota definitions and functions + */ + +struct xfs_trans; + +/* + * This check is done typically without holding the inode lock; + * that may seem racy, but it is harmless in the context that it is used. + * The inode cannot go inactive as long a reference is kept, and + * therefore if dquot(s) were attached, they'll stay consistent. + * If, for example, the ownership of the inode changes while + * we didn't have the inode locked, the appropriate dquot(s) will be + * attached atomically. + */ +#define XFS_NOT_DQATTACHED(mp, ip) \ + ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \ + (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \ + (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL)) + +#define XFS_QM_NEED_QUOTACHECK(mp) \ + ((XFS_IS_UQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ + (XFS_IS_GQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \ + (XFS_IS_PQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) + +/* + * The structure kept inside the xfs_trans_t keep track of dquot changes + * within a transaction and apply them later. + */ +typedef struct xfs_dqtrx { + struct xfs_dquot *qt_dquot; /* the dquot this refers to */ + ulong qt_blk_res; /* blks reserved on a dquot */ + ulong qt_blk_res_used; /* blks used from the reservation */ + ulong qt_ino_res; /* inode reserved on a dquot */ + ulong qt_ino_res_used; /* inodes used from the reservation */ + long qt_bcount_delta; /* dquot blk count changes */ + long qt_delbcnt_delta; /* delayed dquot blk count changes */ + long qt_icount_delta; /* dquot inode count changes */ + ulong qt_rtblk_res; /* # blks reserved on a dquot */ + ulong qt_rtblk_res_used;/* # blks used from reservation */ + long qt_rtbcount_delta;/* dquot realtime blk changes */ + long qt_delrtb_delta; /* delayed RT blk count changes */ +} xfs_dqtrx_t; + +#ifdef CONFIG_XFS_QUOTA +extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); +extern void xfs_trans_free_dqinfo(struct xfs_trans *); +extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, + uint, long); +extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); +extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, + struct xfs_inode *, long, long, uint); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, struct xfs_dquot *, long, long, uint); + +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, + prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, + struct xfs_dquot **); +extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *); +extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); +extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, + struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); +extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, + struct xfs_dquot *, uint); +extern int xfs_qm_dqattach(struct xfs_inode *, uint); +extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); +extern void xfs_qm_dqdetach(struct xfs_inode *); +extern void xfs_qm_dqrele(struct xfs_dquot *); +extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); +extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); +extern void xfs_qm_mount_quotas(struct xfs_mount *); +extern void xfs_qm_unmount(struct xfs_mount *); +extern void xfs_qm_unmount_quotas(struct xfs_mount *); + +#else +static inline int +xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, + prid_t prid, uint flags, struct xfs_dquot **udqp, + struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) +{ + *udqp = NULL; + *gdqp = NULL; + *pdqp = NULL; + return 0; +} +#define xfs_trans_dup_dqinfo(tp, tp2) +#define xfs_trans_free_dqinfo(tp) +#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) +#define xfs_trans_apply_dquot_deltas(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, + long nblks, long nions, uint flags) +{ + return 0; +} +#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) +#define xfs_qm_vop_rename_dqattach(it) (0) +#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) +#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) +#define xfs_qm_dqattach(ip, fl) (0) +#define xfs_qm_dqattach_locked(ip, fl) (0) +#define xfs_qm_dqdetach(ip) +#define xfs_qm_dqrele(d) +#define xfs_qm_statvfs(ip, s) +#define xfs_qm_newmount(mp, a, b) (0) +#define xfs_qm_mount_quotas(mp) +#define xfs_qm_unmount(mp) +#define xfs_qm_unmount_quotas(mp) +#endif /* CONFIG_XFS_QUOTA */ + +#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ + xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) +#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ + xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ + f | XFS_QMOPT_RES_REGBLKS) + +extern int xfs_mount_reset_sbqflags(struct xfs_mount *); + +#endif /* __XFS_QUOTA_H__ */ diff --git a/kernel/fs/xfs/xfs_quotaops.c b/kernel/fs/xfs/xfs_quotaops.c new file mode 100644 index 000000000..7795e0d01 --- /dev/null +++ b/kernel/fs/xfs/xfs_quotaops.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_qm.h" +#include + + +static void +xfs_qm_fill_state( + struct qc_type_state *tstate, + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_ino_t ino) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + bool tempqip = false; + + tstate->ino = ino; + if (!ip && ino == NULLFSINO) + return; + if (!ip) { + if (xfs_iget(mp, NULL, ino, 0, 0, &ip)) + return; + tempqip = true; + } + tstate->flags |= QCI_SYSFILE; + tstate->blocks = ip->i_d.di_nblocks; + tstate->nextents = ip->i_d.di_nextents; + tstate->spc_timelimit = q->qi_btimelimit; + tstate->ino_timelimit = q->qi_itimelimit; + tstate->rt_spc_timelimit = q->qi_rtbtimelimit; + tstate->spc_warnlimit = q->qi_bwarnlimit; + tstate->ino_warnlimit = q->qi_iwarnlimit; + tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; + if (tempqip) + IRELE(ip); +} + +/* + * Return quota status information, such as enforcements, quota file inode + * numbers etc. + */ +static int +xfs_fs_get_quota_state( + struct super_block *sb, + struct qc_state *state) +{ + struct xfs_mount *mp = XFS_M(sb); + struct xfs_quotainfo *q = mp->m_quotainfo; + + memset(state, 0, sizeof(*state)); + if (!XFS_IS_QUOTA_RUNNING(mp)) + return 0; + state->s_incoredqs = q->qi_dquots; + if (XFS_IS_UQUOTA_RUNNING(mp)) + state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_UQUOTA_ENFORCED(mp)) + state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; + if (XFS_IS_GQUOTA_RUNNING(mp)) + state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_GQUOTA_ENFORCED(mp)) + state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; + if (XFS_IS_PQUOTA_RUNNING(mp)) + state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_PQUOTA_ENFORCED(mp)) + state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; + + xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip, + mp->m_sb.sb_uquotino); + xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip, + mp->m_sb.sb_gquotino); + xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip, + mp->m_sb.sb_pquotino); + return 0; +} + +STATIC int +xfs_quota_type(int type) +{ + switch (type) { + case USRQUOTA: + return XFS_DQ_USER; + case GRPQUOTA: + return XFS_DQ_GROUP; + default: + return XFS_DQ_PROJ; + } +} + +#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK) + +/* + * Adjust quota timers & warnings + */ +static int +xfs_fs_set_info( + struct super_block *sb, + int type, + struct qc_info *info) +{ + struct xfs_mount *mp = XFS_M(sb); + struct qc_dqblk newlim; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK) + return -EINVAL; + if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0) + return 0; + + newlim.d_fieldmask = info->i_fieldmask; + newlim.d_spc_timer = info->i_spc_timelimit; + newlim.d_ino_timer = info->i_ino_timelimit; + newlim.d_rt_spc_timer = info->i_rt_spc_timelimit; + newlim.d_ino_warns = info->i_ino_warnlimit; + newlim.d_spc_warns = info->i_spc_warnlimit; + newlim.d_rt_spc_warns = info->i_rt_spc_warnlimit; + + return xfs_qm_scall_setqlim(mp, 0, xfs_quota_type(type), &newlim); +} + +static unsigned int +xfs_quota_flags(unsigned int uflags) +{ + unsigned int flags = 0; + + if (uflags & FS_QUOTA_UDQ_ACCT) + flags |= XFS_UQUOTA_ACCT; + if (uflags & FS_QUOTA_PDQ_ACCT) + flags |= XFS_PQUOTA_ACCT; + if (uflags & FS_QUOTA_GDQ_ACCT) + flags |= XFS_GQUOTA_ACCT; + if (uflags & FS_QUOTA_UDQ_ENFD) + flags |= XFS_UQUOTA_ENFD; + if (uflags & FS_QUOTA_GDQ_ENFD) + flags |= XFS_GQUOTA_ENFD; + if (uflags & FS_QUOTA_PDQ_ENFD) + flags |= XFS_PQUOTA_ENFD; + + return flags; +} + +STATIC int +xfs_quota_enable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + + return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags)); +} + +STATIC int +xfs_quota_disable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags)); +} + +STATIC int +xfs_fs_rm_xquota( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_USER; + if (uflags & FS_GROUP_QUOTA) + flags |= XFS_DQ_GROUP; + if (uflags & FS_PROJ_QUOTA) + flags |= XFS_DQ_PROJ; + + return xfs_qm_scall_trunc_qfiles(mp, flags); +} + +STATIC int +xfs_fs_get_dqblk( + struct super_block *sb, + struct kqid qid, + struct qc_dqblk *qdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), qdq); +} + +STATIC int +xfs_fs_set_dqblk( + struct super_block *sb, + struct kqid qid, + struct qc_dqblk *qdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), qdq); +} + +const struct quotactl_ops xfs_quotactl_operations = { + .get_state = xfs_fs_get_quota_state, + .set_info = xfs_fs_set_info, + .quota_enable = xfs_quota_enable, + .quota_disable = xfs_quota_disable, + .rm_xquota = xfs_fs_rm_xquota, + .get_dqblk = xfs_fs_get_dqblk, + .set_dqblk = xfs_fs_set_dqblk, +}; diff --git a/kernel/fs/xfs/xfs_rtalloc.c b/kernel/fs/xfs/xfs_rtalloc.c new file mode 100644 index 000000000..f2079b691 --- /dev/null +++ b/kernel/fs/xfs/xfs_rtalloc.c @@ -0,0 +1,1302 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_buf.h" +#include "xfs_icache.h" +#include "xfs_rtalloc.h" + + +/* + * Read and return the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + */ +static int +xfs_rtget_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum); +} + +/* + * Return whether there are any free extents in the size range given + * by low and high, for the bitmap block bbno. + */ +STATIC int /* error */ +xfs_rtany_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + int *stat) /* out: any good extents here? */ +{ + int error; /* error value */ + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ + + /* + * Loop over logs of extent sizes. Order is irrelevant. + */ + for (log = low; log <= high; log++) { + /* + * Get one summary datum. + */ + error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + if (error) { + return error; + } + /* + * If there are any, return success. + */ + if (sum) { + *stat = 1; + return 0; + } + } + /* + * Found nothing, return failure. + */ + *stat = 0; + return 0; +} + + +/* + * Copy and transform the summary file, given the old and new + * parameters in the mount structures. + */ +STATIC int /* error */ +xfs_rtcopy_summary( + xfs_mount_t *omp, /* old file system mount point */ + xfs_mount_t *nmp, /* new file system mount point */ + xfs_trans_t *tp) /* transaction pointer */ +{ + xfs_rtblock_t bbno; /* bitmap block number */ + xfs_buf_t *bp; /* summary buffer */ + int error; /* error return value */ + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ + xfs_fsblock_t sumbno; /* summary block number */ + + bp = NULL; + for (log = omp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = omp->m_sb.sb_rbmblocks - 1; + (xfs_srtblock_t)bbno >= 0; + bbno--) { + error = xfs_rtget_summary(omp, tp, log, bbno, &bp, + &sumbno, &sum); + if (error) + return error; + if (sum == 0) + continue; + error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, + &bp, &sumbno); + if (error) + return error; + error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, + &bp, &sumbno); + if (error) + return error; + ASSERT(sum > 0); + } + } + return 0; +} +/* + * Mark an extent specified by start and len allocated. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtallocate_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* start block to allocate */ + xfs_extlen_t len, /* length to allocate */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the allocated extent */ + int error; /* error value */ + xfs_rtblock_t postblock = 0; /* first block allocated > end */ + xfs_rtblock_t preblock = 0; /* first block allocated < start */ + + end = start + len - 1; + /* + * Assume we're allocating out of the middle of a free extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of free extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) { + return error; + } + /* + * Decrement the summary information corresponding to the entire + * (old) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + /* + * If there are blocks not being allocated at the front of the + * old extent, add summary data for them to be free. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being allocated at the end of the + * old extent, add summary data for them to be free. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Modify the bitmap to mark this extent allocated. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 0); + return error; +} + +/* + * Attempt to allocate an extent minlen<=len<=maxlen starting from + * bitmap block bbno. If we don't get maxlen then use prod to trim + * the length, if given. Returns error; returns starting block in *rtblock. + * The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_block( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_rtblock_t *nextp, /* out: next block to try */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + xfs_rtblock_t besti; /* best rtblock found so far */ + xfs_rtblock_t bestlen; /* best length found so far */ + xfs_rtblock_t end; /* last rtblock in chunk */ + int error; /* error value */ + xfs_rtblock_t i; /* current rtblock trying */ + xfs_rtblock_t next; /* next rtblock to try */ + int stat; /* status from internal calls */ + + /* + * Loop over all the extents starting in this bitmap block, + * looking for one that's long enough. + */ + for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0, + end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; + i <= end; + i++) { + /* + * See if there's a free extent of maxlen starting at i. + * If it's not so then next will contain the first non-free. + */ + error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat); + if (error) { + return error; + } + if (stat) { + /* + * i for maxlen is all free, allocate and return that. + */ + error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp, + rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = i; + return 0; + } + /* + * In the case where we have a variable-sized allocation + * request, figure out how big this free piece is, + * and if it's big enough for the minimum, and the best + * so far, remember it. + */ + if (minlen < maxlen) { + xfs_rtblock_t thislen; /* this extent size */ + + thislen = next - i; + if (thislen >= minlen && thislen > bestlen) { + besti = i; + bestlen = thislen; + } + } + /* + * If not done yet, find the start of the next free space. + */ + if (next < end) { + error = xfs_rtfind_forw(mp, tp, next, end, &i); + if (error) { + return error; + } + } else + break; + } + /* + * Searched the whole thing & didn't find a maxlen free extent. + */ + if (minlen < maxlen && besti != -1) { + xfs_extlen_t p; /* amount to trim length by */ + + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1 && (p = do_mod(bestlen, prod))) + bestlen -= p; + /* + * Allocate besti for bestlen & return that. + */ + error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb); + if (error) { + return error; + } + *len = bestlen; + *rtblock = besti; + return 0; + } + /* + * Allocation failed. Set *nextp to the next block to try. + */ + *nextp = next; + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, starting at block + * bno. If we don't get maxlen then use prod to trim the length, if given. + * Returns error; returns starting block in *rtblock. + * The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_exact( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + xfs_extlen_t i; /* extent length trimmed due to prod */ + int isfree; /* extent is free */ + xfs_rtblock_t next; /* next block to try (dummy) */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * Check if the range in question (for maxlen) is free. + */ + error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree); + if (error) { + return error; + } + if (isfree) { + /* + * If it is, allocate it and return success. + */ + error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = bno; + return 0; + } + /* + * If not, allocate what there is, if it's at least minlen. + */ + maxlen = next - bno; + if (maxlen < minlen) { + /* + * Failed, return failure status. + */ + *rtblock = NULLRTBLOCK; + return 0; + } + /* + * Trim off tail of extent, if prod is specified. + */ + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) { + /* + * Now we can't do it, return failure status. + */ + *rtblock = NULLRTBLOCK; + return 0; + } + } + /* + * Allocate what we can and return it. + */ + error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = bno; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, starting as near + * to bno as possible. If we don't get maxlen then use prod to trim + * the length, if given. The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_near( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int any; /* any useful extents from summary */ + xfs_rtblock_t bbno; /* bitmap block number */ + int error; /* error value */ + int i; /* bitmap block offset (loop control) */ + int j; /* secondary loop control */ + int log2len; /* log2 of minlen */ + xfs_rtblock_t n; /* next block to try */ + xfs_rtblock_t r; /* result block */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * If the block number given is off the end, silently set it to + * the last block. + */ + if (bno >= mp->m_sb.sb_rextents) + bno = mp->m_sb.sb_rextents - 1; + /* + * Try the exact allocation first. + */ + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len, + rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If the exact allocation worked, return that. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + bbno = XFS_BITTOBLOCK(mp, bno); + i = 0; + ASSERT(minlen != 0); + log2len = xfs_highbit32(minlen); + /* + * Loop over all bitmap blocks (bbno + i is current block). + */ + for (;;) { + /* + * Get summary information of extents of all useful levels + * starting in this bitmap block. + */ + error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1, + bbno + i, rbpp, rsb, &any); + if (error) { + return error; + } + /* + * If there are any useful extents starting here, try + * allocating one. + */ + if (any) { + /* + * On the positive side of the starting location. + */ + if (i >= 0) { + /* + * Try to allocate an extent starting in + * this block. + */ + error = xfs_rtallocate_extent_block(mp, tp, + bbno + i, minlen, maxlen, len, &n, rbpp, + rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return it. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + /* + * On the negative side of the starting location. + */ + else { /* i < 0 */ + /* + * Loop backwards through the bitmap blocks from + * the starting point-1 up to where we are now. + * There should be an extent which ends in this + * bitmap block and is long enough. + */ + for (j = -1; j > i; j--) { + /* + * Grab the summary information for + * this bitmap block. + */ + error = xfs_rtany_summary(mp, tp, + log2len, mp->m_rsumlevels - 1, + bbno + j, rbpp, rsb, &any); + if (error) { + return error; + } + /* + * If there's no extent given in the + * summary that means the extent we + * found must carry over from an + * earlier block. If there is an + * extent given, we've already tried + * that allocation, don't do it again. + */ + if (any) + continue; + error = xfs_rtallocate_extent_block(mp, + tp, bbno + j, minlen, maxlen, + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it works, return the extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + /* + * There weren't intervening bitmap blocks + * with a long enough extent, or the + * allocation didn't work for some reason + * (i.e. it's a little * too short). + * Try to allocate from the summary block + * that we found. + */ + error = xfs_rtallocate_extent_block(mp, tp, + bbno + i, minlen, maxlen, len, &n, rbpp, + rsb, prod, &r); + if (error) { + return error; + } + /* + * If it works, return the extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + } + /* + * Loop control. If we were on the positive side, and there's + * still more blocks on the negative side, go there. + */ + if (i > 0 && (int)bbno - i >= 0) + i = -i; + /* + * If positive, and no more negative, but there are more + * positive, go there. + */ + else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1) + i++; + /* + * If negative or 0 (just started), and there are positive + * blocks to go, go there. The 0 case moves to block 1. + */ + else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1) + i = 1 - i; + /* + * If negative or 0 and there are more negative blocks, + * go there. + */ + else if (i <= 0 && (int)bbno + i > 0) + i--; + /* + * Must be done. Return failure. + */ + else + break; + } + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, with no position + * specified. If we don't get maxlen then use prod to trim + * the length, if given. The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_size( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + int i; /* bitmap block number */ + int l; /* level number (loop control) */ + xfs_rtblock_t n; /* next block to be tried */ + xfs_rtblock_t r; /* result block number */ + xfs_suminfo_t sum; /* summary information for extents */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(maxlen != 0); + + /* + * Loop over all the levels starting with maxlen. + * At each level, look at all the bitmap blocks, to see if there + * are extents starting there that are long enough (>= maxlen). + * Note, only on the initial level can the allocation fail if + * the summary says there's an extent. + */ + for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { + /* + * Loop over all the bitmap blocks. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, maxlen, + maxlen, len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Didn't find any maxlen blocks. Try smaller ones, unless + * we're asking for a fixed size extent. + */ + if (minlen > --maxlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + ASSERT(minlen != 0); + ASSERT(maxlen != 0); + + /* + * Loop over sizes, from maxlen down to minlen. + * This time, when we do the allocations, allow smaller ones + * to succeed. + */ + for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { + /* + * Loop over all the bitmap blocks, try an allocation + * starting in that block. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary information for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * If nothing there, go on to next. + */ + if (!sum) + continue; + /* + * Try the allocation. Make sure the specified + * minlen/maxlen are in the possible range for + * this summary level. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, + XFS_RTMAX(minlen, 1 << l), + XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Got nothing, return failure. + */ + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate space to the bitmap or summary file, and zero it, for growfs. + */ +STATIC int /* error */ +xfs_growfs_rt_alloc( + xfs_mount_t *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + xfs_inode_t *ip) /* inode (bitmap/summary) */ +{ + xfs_fileoff_t bno; /* block number in file */ + xfs_buf_t *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* first block allocated in xaction */ + xfs_bmap_free_t flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + xfs_bmbt_irec_t map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + + /* + * Allocate space to the file, as necessary. + */ + while (oblocks < nblocks) { + int cancelflags = 0; + xfs_trans_t *tp; + + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); + resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); + /* + * Reserve space & log for one extent added to the file. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, + resblks, 0); + if (error) + goto error_cancel; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + /* + * Lock the inode. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&flist, &firstblock); + /* + * Allocate blocks to the bitmap file. + */ + nmap = 1; + cancelflags |= XFS_TRANS_ABORT; + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); + if (!error && nmap < 1) + error = -ENOSPC; + if (error) + goto error_cancel; + /* + * Free any blocks freed up in the transaction, then commit. + */ + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error; + /* + * Now we need to clear the allocated blocks. + * Do this one block per transaction, to keep it simple. + */ + cancelflags = 0; + for (bno = map.br_startoff, fsbno = map.br_startblock; + bno < map.br_startoff + map.br_blockcount; + bno++, fsbno++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); + /* + * Reserve log for one block zeroing. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, + 0, 0); + if (error) + goto error_cancel; + /* + * Lock the bitmap inode. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + /* + * Get a buffer for the block. + */ + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0); + if (bp == NULL) { + error = -EIO; +error_cancel: + xfs_trans_cancel(tp, cancelflags); + goto error; + } + memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); + /* + * Commit the transaction. + */ + error = xfs_trans_commit(tp, 0); + if (error) + goto error; + } + /* + * Go on to the next extent, if any. + */ + oblocks = map.br_startoff + map.br_blockcount; + } + return 0; + +error: + return error; +} + +/* + * Visible (exported) functions. + */ + +/* + * Grow the realtime area of the filesystem. + */ +int +xfs_growfs_rt( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_rt_t *in) /* growfs rt input struct */ +{ + xfs_rtblock_t bmbno; /* bitmap block number */ + xfs_buf_t *bp; /* temporary buffer */ + int error; /* error return value */ + xfs_mount_t *nmp; /* new (fake) mount structure */ + xfs_rfsblock_t nrblocks; /* new number of realtime blocks */ + xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ + xfs_rtblock_t nrextents; /* new number of realtime extents */ + uint8_t nrextslog; /* new log2 of sb_rextents */ + xfs_extlen_t nrsumblocks; /* new number of summary blocks */ + uint nrsumlevels; /* new rt summary levels */ + uint nrsumsize; /* new size of rt summary, bytes */ + xfs_sb_t *nsbp; /* new superblock */ + xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ + xfs_extlen_t rsumblocks; /* current number of rt summary blks */ + xfs_sb_t *sbp; /* old superblock */ + xfs_fsblock_t sumbno; /* summary block number */ + + sbp = &mp->m_sb; + /* + * Initial error checking. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || + (nrblocks = in->newblocks) <= sbp->sb_rblocks || + (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) + return -EINVAL; + if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) + return error; + /* + * Read in the last block of the device, make sure it exists. + */ + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, nrblocks - 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) + return error; + xfs_buf_relse(bp); + + /* + * Calculate new parameters. These are the final values to be reached. + */ + nrextents = nrblocks; + do_div(nrextents, in->extsize); + nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); + nrextslog = xfs_highbit32(nrextents); + nrsumlevels = nrextslog + 1; + nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * New summary size can't be more than half the size of + * the log. This prevents us from getting a log overflow, + * since we'll log basically the whole summary file at once. + */ + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) + return -EINVAL; + /* + * Get the old block counts for bitmap and summary inodes. + * These can't change since other growfs callers are locked out. + */ + rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size); + rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size); + /* + * Allocate space to the bitmap and summary files, as necessary. + */ + error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); + if (error) + return error; + error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); + if (error) + return error; + /* + * Allocate a new (fake) mount/sb. + */ + nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + /* + * Loop over the bitmap blocks. + * We will do everything one bitmap block at a time. + * Skip the current block if it is exactly full. + * This also deals with the case where there were no rtextents before. + */ + for (bmbno = sbp->sb_rbmblocks - + ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); + bmbno < nrbmblocks; + bmbno++) { + xfs_trans_t *tp; + int cancelflags = 0; + + *nmp = *mp; + nsbp = &nmp->m_sb; + /* + * Calculate new sb and mount fields for this round. + */ + nsbp->sb_rextsize = in->extsize; + nsbp->sb_rbmblocks = bmbno + 1; + nsbp->sb_rblocks = + XFS_RTMIN(nrblocks, + nsbp->sb_rbmblocks * NBBY * + nsbp->sb_blocksize * nsbp->sb_rextsize); + nsbp->sb_rextents = nsbp->sb_rblocks; + do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + ASSERT(nsbp->sb_rextents != 0); + nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; + nrsumsize = + (uint)sizeof(xfs_suminfo_t) * nrsumlevels * + nsbp->sb_rbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * Start a transaction, get the log reservation. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, + 0, 0); + if (error) + goto error_cancel; + /* + * Lock out other callers by grabbing the bitmap inode lock. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + /* + * Update the bitmap inode's size. + */ + mp->m_rbmip->i_d.di_size = + nsbp->sb_rbmblocks * nsbp->sb_blocksize; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + cancelflags |= XFS_TRANS_ABORT; + /* + * Get the summary inode into the transaction. + */ + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + /* + * Update the summary inode's size. + */ + mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; + xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); + /* + * Copy summary data from old to new sizes. + * Do this when the real size (not block-aligned) changes. + */ + if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || + mp->m_rsumlevels != nmp->m_rsumlevels) { + error = xfs_rtcopy_summary(mp, nmp, tp); + if (error) + goto error_cancel; + } + /* + * Update superblock fields. + */ + if (nsbp->sb_rextsize != sbp->sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nsbp->sb_rextsize - sbp->sb_rextsize); + if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nsbp->sb_rbmblocks - sbp->sb_rbmblocks); + if (nsbp->sb_rblocks != sbp->sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nsbp->sb_rblocks - sbp->sb_rblocks); + if (nsbp->sb_rextents != sbp->sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + if (nsbp->sb_rextslog != sbp->sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nsbp->sb_rextslog - sbp->sb_rextslog); + /* + * Free new extent. + */ + bp = NULL; + error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, + nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); + if (error) { +error_cancel: + xfs_trans_cancel(tp, cancelflags); + break; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + /* + * Update mp values into the real mp structure. + */ + mp->m_rsumlevels = nrsumlevels; + mp->m_rsumsize = nrsumsize; + + error = xfs_trans_commit(tp, 0); + if (error) + break; + } + + /* + * Free the fake mp structure. + */ + kmem_free(nmp); + + return error; +} + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + xfs_mount_t *mp = tp->t_mountp; + int error; /* error value */ + xfs_rtblock_t r; /* result allocated block */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp; /* summary file block buffer */ + + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + ASSERT(minlen > 0 && minlen <= maxlen); + + /* + * If prod is set then figure out what to do to minlen and maxlen. + */ + if (prod > 1) { + xfs_extlen_t i; + + if ((i = maxlen % prod)) + maxlen -= i; + if ((i = minlen % prod)) + minlen += prod - i; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + } + + sumbp = NULL; + /* + * Allocate by size, or near another block, or exactly at some block. + */ + switch (type) { + case XFS_ALLOCTYPE_ANY_AG: + error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, + &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + default: + error = -EIO; + ASSERT(0); + } + if (error) + return error; + + /* + * If it worked, update the superblock. + */ + if (r != NULLRTBLOCK) { + long slen = (long)*len; + + ASSERT(*len >= minlen && *len <= maxlen); + if (wasdel) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); + else + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); + } + *rtblock = r; + return 0; +} + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + struct xfs_mount *mp) /* file system mount structure */ +{ + struct xfs_buf *bp; /* buffer for last block of subvolume */ + struct xfs_sb *sbp; /* filesystem superblock copy in mount */ + xfs_daddr_t d; /* address of last block of subvolume */ + int error; + + sbp = &mp->m_sb; + if (sbp->sb_rblocks == 0) + return 0; + if (mp->m_rtdev_targp == NULL) { + xfs_warn(mp, + "Filesystem has a realtime volume, use rtdev=device option"); + return -ENODEV; + } + mp->m_rsumlevels = sbp->sb_rextslog + 1; + mp->m_rsumsize = + (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels * + sbp->sb_rbmblocks; + mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize); + mp->m_rbmip = mp->m_rsumip = NULL; + /* + * Check that the realtime section is an ok size. + */ + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { + xfs_warn(mp, "realtime mount -- %llu != %llu", + (unsigned long long) XFS_BB_TO_FSB(mp, d), + (unsigned long long) mp->m_sb.sb_rblocks); + return -EFBIG; + } + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "realtime device size check failed"); + return error; + } + xfs_buf_relse(bp); + return 0; +} + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. + */ +int /* error */ +xfs_rtmount_inodes( + xfs_mount_t *mp) /* file system mount structure */ +{ + int error; /* error return value */ + xfs_sb_t *sbp; + + sbp = &mp->m_sb; + if (sbp->sb_rbmino == NULLFSINO) + return 0; + error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); + if (error) + return error; + ASSERT(mp->m_rbmip != NULL); + ASSERT(sbp->sb_rsumino != NULLFSINO); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); + if (error) { + IRELE(mp->m_rbmip); + return error; + } + ASSERT(mp->m_rsumip != NULL); + return 0; +} + +void +xfs_rtunmount_inodes( + struct xfs_mount *mp) +{ + if (mp->m_rbmip) + IRELE(mp->m_rbmip); + if (mp->m_rsumip) + IRELE(mp->m_rsumip); +} + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick) /* result rt extent */ +{ + xfs_rtblock_t b; /* result block */ + int log2; /* log of sequence number */ + __uint64_t resid; /* residual after log removed */ + __uint64_t seq; /* sequence number of file creation */ + __uint64_t *seqp; /* pointer to seqno in inode */ + + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *seqp = 0; + } + seq = *seqp; + if ((log2 = xfs_highbit64(seq)) == -1) + b = 0; + else { + resid = seq - (1ULL << log2); + b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >> + (log2 + 1); + if (b >= mp->m_sb.sb_rextents) + b = do_mod(b, mp->m_sb.sb_rextents); + if (b + len > mp->m_sb.sb_rextents) + b = mp->m_sb.sb_rextents - len; + } + *seqp = seq + 1; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + *pick = b; + return 0; +} diff --git a/kernel/fs/xfs/xfs_rtalloc.h b/kernel/fs/xfs/xfs_rtalloc.h new file mode 100644 index 000000000..76c0a4a9b --- /dev/null +++ b/kernel/fs/xfs/xfs_rtalloc.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RTALLOC_H__ +#define __XFS_RTALLOC_H__ + +/* kernel only definitions and functions */ + +struct xfs_mount; +struct xfs_trans; + +#ifdef CONFIG_XFS_RT +/* + * Function prototypes for exported functions. + */ + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock); /* out: start block allocated */ + +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len); /* length of extent freed */ + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + struct xfs_mount *mp); /* file system mount structure */ +void +xfs_rtunmount_inodes( + struct xfs_mount *mp); + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. + */ +int /* error */ +xfs_rtmount_inodes( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick); /* result rt extent */ + +/* + * Grow the realtime area of the filesystem. + */ +int +xfs_growfs_rt( + struct xfs_mount *mp, /* file system mount structure */ + xfs_growfs_rt_t *in); /* user supplied growfs struct */ + +/* + * From xfs_rtbitmap.c + */ +int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t block, int issum, struct xfs_buf **bpp); +int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val, + xfs_rtblock_t *new, int *stat); +int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val); +int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp, + int log, xfs_rtblock_t bbno, int delta, + xfs_buf_t **rbpp, xfs_fsblock_t *rsb, + xfs_suminfo_t *sum); +int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, + xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp, + xfs_fsblock_t *rsb); +int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, + struct xfs_buf **rbpp, xfs_fsblock_t *rsb); + + +#else +# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) +# define xfs_rtfree_extent(t,b,l) (ENOSYS) +# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) +# define xfs_growfs_rt(mp,in) (ENOSYS) +static inline int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + if (mp->m_sb.sb_rblocks == 0) + return 0; + + xfs_warn(mp, "Not built with CONFIG_XFS_RT"); + return -ENOSYS; +} +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtunmount_inodes(m) +#endif /* CONFIG_XFS_RT */ + +#endif /* __XFS_RTALLOC_H__ */ diff --git a/kernel/fs/xfs/xfs_stats.c b/kernel/fs/xfs/xfs_stats.c new file mode 100644 index 000000000..f2240383d --- /dev/null +++ b/kernel/fs/xfs/xfs_stats.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include + +DEFINE_PER_CPU(struct xfsstats, xfsstats); + +static int counter_val(int idx) +{ + int val = 0, cpu; + + for_each_possible_cpu(cpu) + val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx)); + return val; +} + +static int xfs_stat_proc_show(struct seq_file *m, void *v) +{ + int i, j; + __uint64_t xs_xstrat_bytes = 0; + __uint64_t xs_write_bytes = 0; + __uint64_t xs_read_bytes = 0; + + static const struct xstats_entry { + char *desc; + int endpoint; + } xstats[] = { + { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, + { "abt", XFSSTAT_END_ALLOC_BTREE }, + { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, + { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, + { "dir", XFSSTAT_END_DIRECTORY_OPS }, + { "trans", XFSSTAT_END_TRANSACTIONS }, + { "ig", XFSSTAT_END_INODE_OPS }, + { "log", XFSSTAT_END_LOG_OPS }, + { "push_ail", XFSSTAT_END_TAIL_PUSHING }, + { "xstrat", XFSSTAT_END_WRITE_CONVERT }, + { "rw", XFSSTAT_END_READ_WRITE_OPS }, + { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, + { "icluster", XFSSTAT_END_INODE_CLUSTER }, + { "vnodes", XFSSTAT_END_VNODE_OPS }, + { "buf", XFSSTAT_END_BUF }, + { "abtb2", XFSSTAT_END_ABTB_V2 }, + { "abtc2", XFSSTAT_END_ABTC_V2 }, + { "bmbt2", XFSSTAT_END_BMBT_V2 }, + { "ibt2", XFSSTAT_END_IBT_V2 }, + { "fibt2", XFSSTAT_END_FIBT_V2 }, + /* we print both series of quota information together */ + { "qm", XFSSTAT_END_QM }, + }; + + /* Loop over all stats groups */ + for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { + seq_printf(m, "%s", xstats[i].desc); + /* inner loop does each group */ + for (; j < xstats[i].endpoint; j++) + seq_printf(m, " %u", counter_val(j)); + seq_putc(m, '\n'); + } + /* extra precision counters */ + for_each_possible_cpu(i) { + xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; + xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; + xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + } + + seq_printf(m, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + seq_printf(m, "debug %u\n", +#if defined(DEBUG) + 1); +#else + 0); +#endif + return 0; +} + +static int xfs_stat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xfs_stat_proc_show, NULL); +} + +static const struct file_operations xfs_stat_proc_fops = { + .owner = THIS_MODULE, + .open = xfs_stat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* legacy quota interfaces */ +#ifdef CONFIG_XFS_QUOTA +static int xqm_proc_show(struct seq_file *m, void *v) +{ + /* maximum; incore; ratio free to inuse; freelist */ + seq_printf(m, "%d\t%d\t%d\t%u\n", + 0, + counter_val(XFSSTAT_END_XQMSTAT), + 0, + counter_val(XFSSTAT_END_XQMSTAT + 1)); + return 0; +} + +static int xqm_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqm_proc_show, NULL); +} + +static const struct file_operations xqm_proc_fops = { + .owner = THIS_MODULE, + .open = xqm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* legacy quota stats interface no 2 */ +static int xqmstat_proc_show(struct seq_file *m, void *v) +{ + int j; + + seq_printf(m, "qm"); + for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + seq_printf(m, " %u", counter_val(j)); + seq_putc(m, '\n'); + return 0; +} + +static int xqmstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqmstat_proc_show, NULL); +} + +static const struct file_operations xqmstat_proc_fops = { + .owner = THIS_MODULE, + .open = xqmstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_XFS_QUOTA */ + +int +xfs_init_procfs(void) +{ + if (!proc_mkdir("fs/xfs", NULL)) + goto out; + + if (!proc_create("fs/xfs/stat", 0, NULL, + &xfs_stat_proc_fops)) + goto out_remove_xfs_dir; +#ifdef CONFIG_XFS_QUOTA + if (!proc_create("fs/xfs/xqmstat", 0, NULL, + &xqmstat_proc_fops)) + goto out_remove_stat_file; + if (!proc_create("fs/xfs/xqm", 0, NULL, + &xqm_proc_fops)) + goto out_remove_xqmstat_file; +#endif + return 0; + +#ifdef CONFIG_XFS_QUOTA + out_remove_xqmstat_file: + remove_proc_entry("fs/xfs/xqmstat", NULL); + out_remove_stat_file: + remove_proc_entry("fs/xfs/stat", NULL); +#endif + out_remove_xfs_dir: + remove_proc_entry("fs/xfs", NULL); + out: + return -ENOMEM; +} + +void +xfs_cleanup_procfs(void) +{ +#ifdef CONFIG_XFS_QUOTA + remove_proc_entry("fs/xfs/xqm", NULL); + remove_proc_entry("fs/xfs/xqmstat", NULL); +#endif + remove_proc_entry("fs/xfs/stat", NULL); + remove_proc_entry("fs/xfs", NULL); +} diff --git a/kernel/fs/xfs/xfs_stats.h b/kernel/fs/xfs/xfs_stats.h new file mode 100644 index 000000000..c8f238b82 --- /dev/null +++ b/kernel/fs/xfs/xfs_stats.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_STATS_H__ +#define __XFS_STATS_H__ + + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +#include + +/* + * XFS global statistics + */ +struct xfsstats { +# define XFSSTAT_END_EXTENT_ALLOC 4 + __uint32_t xs_allocx; + __uint32_t xs_allocb; + __uint32_t xs_freex; + __uint32_t xs_freeb; +# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) + __uint32_t xs_abt_lookup; + __uint32_t xs_abt_compare; + __uint32_t xs_abt_insrec; + __uint32_t xs_abt_delrec; +# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) + __uint32_t xs_blk_mapr; + __uint32_t xs_blk_mapw; + __uint32_t xs_blk_unmap; + __uint32_t xs_add_exlist; + __uint32_t xs_del_exlist; + __uint32_t xs_look_exlist; + __uint32_t xs_cmp_exlist; +# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) + __uint32_t xs_bmbt_lookup; + __uint32_t xs_bmbt_compare; + __uint32_t xs_bmbt_insrec; + __uint32_t xs_bmbt_delrec; +# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) + __uint32_t xs_dir_lookup; + __uint32_t xs_dir_create; + __uint32_t xs_dir_remove; + __uint32_t xs_dir_getdents; +# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) + __uint32_t xs_trans_sync; + __uint32_t xs_trans_async; + __uint32_t xs_trans_empty; +# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) + __uint32_t xs_ig_attempts; + __uint32_t xs_ig_found; + __uint32_t xs_ig_frecycle; + __uint32_t xs_ig_missed; + __uint32_t xs_ig_dup; + __uint32_t xs_ig_reclaims; + __uint32_t xs_ig_attrchg; +# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) + __uint32_t xs_log_writes; + __uint32_t xs_log_blocks; + __uint32_t xs_log_noiclogs; + __uint32_t xs_log_force; + __uint32_t xs_log_force_sleep; +# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) + __uint32_t xs_try_logspace; + __uint32_t xs_sleep_logspace; + __uint32_t xs_push_ail; + __uint32_t xs_push_ail_success; + __uint32_t xs_push_ail_pushbuf; + __uint32_t xs_push_ail_pinned; + __uint32_t xs_push_ail_locked; + __uint32_t xs_push_ail_flushing; + __uint32_t xs_push_ail_restarts; + __uint32_t xs_push_ail_flush; +# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) + __uint32_t xs_xstrat_quick; + __uint32_t xs_xstrat_split; +# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) + __uint32_t xs_write_calls; + __uint32_t xs_read_calls; +# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) + __uint32_t xs_attr_get; + __uint32_t xs_attr_set; + __uint32_t xs_attr_remove; + __uint32_t xs_attr_list; +# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) + __uint32_t xs_iflush_count; + __uint32_t xs_icluster_flushcnt; + __uint32_t xs_icluster_flushinode; +# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) + __uint32_t vn_active; /* # vnodes not on free lists */ + __uint32_t vn_alloc; /* # times vn_alloc called */ + __uint32_t vn_get; /* # times vn_get called */ + __uint32_t vn_hold; /* # times vn_hold called */ + __uint32_t vn_rele; /* # times vn_rele called */ + __uint32_t vn_reclaim; /* # times vn_reclaim called */ + __uint32_t vn_remove; /* # times vn_remove called */ + __uint32_t vn_free; /* # times vn_free called */ +#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) + __uint32_t xb_get; + __uint32_t xb_create; + __uint32_t xb_get_locked; + __uint32_t xb_get_locked_waited; + __uint32_t xb_busy_locked; + __uint32_t xb_miss_locked; + __uint32_t xb_page_retries; + __uint32_t xb_page_found; + __uint32_t xb_get_read; +/* Version 2 btree counters */ +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) + __uint32_t xs_abtb_2_lookup; + __uint32_t xs_abtb_2_compare; + __uint32_t xs_abtb_2_insrec; + __uint32_t xs_abtb_2_delrec; + __uint32_t xs_abtb_2_newroot; + __uint32_t xs_abtb_2_killroot; + __uint32_t xs_abtb_2_increment; + __uint32_t xs_abtb_2_decrement; + __uint32_t xs_abtb_2_lshift; + __uint32_t xs_abtb_2_rshift; + __uint32_t xs_abtb_2_split; + __uint32_t xs_abtb_2_join; + __uint32_t xs_abtb_2_alloc; + __uint32_t xs_abtb_2_free; + __uint32_t xs_abtb_2_moves; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) + __uint32_t xs_abtc_2_lookup; + __uint32_t xs_abtc_2_compare; + __uint32_t xs_abtc_2_insrec; + __uint32_t xs_abtc_2_delrec; + __uint32_t xs_abtc_2_newroot; + __uint32_t xs_abtc_2_killroot; + __uint32_t xs_abtc_2_increment; + __uint32_t xs_abtc_2_decrement; + __uint32_t xs_abtc_2_lshift; + __uint32_t xs_abtc_2_rshift; + __uint32_t xs_abtc_2_split; + __uint32_t xs_abtc_2_join; + __uint32_t xs_abtc_2_alloc; + __uint32_t xs_abtc_2_free; + __uint32_t xs_abtc_2_moves; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) + __uint32_t xs_bmbt_2_lookup; + __uint32_t xs_bmbt_2_compare; + __uint32_t xs_bmbt_2_insrec; + __uint32_t xs_bmbt_2_delrec; + __uint32_t xs_bmbt_2_newroot; + __uint32_t xs_bmbt_2_killroot; + __uint32_t xs_bmbt_2_increment; + __uint32_t xs_bmbt_2_decrement; + __uint32_t xs_bmbt_2_lshift; + __uint32_t xs_bmbt_2_rshift; + __uint32_t xs_bmbt_2_split; + __uint32_t xs_bmbt_2_join; + __uint32_t xs_bmbt_2_alloc; + __uint32_t xs_bmbt_2_free; + __uint32_t xs_bmbt_2_moves; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) + __uint32_t xs_ibt_2_lookup; + __uint32_t xs_ibt_2_compare; + __uint32_t xs_ibt_2_insrec; + __uint32_t xs_ibt_2_delrec; + __uint32_t xs_ibt_2_newroot; + __uint32_t xs_ibt_2_killroot; + __uint32_t xs_ibt_2_increment; + __uint32_t xs_ibt_2_decrement; + __uint32_t xs_ibt_2_lshift; + __uint32_t xs_ibt_2_rshift; + __uint32_t xs_ibt_2_split; + __uint32_t xs_ibt_2_join; + __uint32_t xs_ibt_2_alloc; + __uint32_t xs_ibt_2_free; + __uint32_t xs_ibt_2_moves; +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) + __uint32_t xs_fibt_2_lookup; + __uint32_t xs_fibt_2_compare; + __uint32_t xs_fibt_2_insrec; + __uint32_t xs_fibt_2_delrec; + __uint32_t xs_fibt_2_newroot; + __uint32_t xs_fibt_2_killroot; + __uint32_t xs_fibt_2_increment; + __uint32_t xs_fibt_2_decrement; + __uint32_t xs_fibt_2_lshift; + __uint32_t xs_fibt_2_rshift; + __uint32_t xs_fibt_2_split; + __uint32_t xs_fibt_2_join; + __uint32_t xs_fibt_2_alloc; + __uint32_t xs_fibt_2_free; + __uint32_t xs_fibt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) + __uint32_t xs_qm_dqreclaims; + __uint32_t xs_qm_dqreclaim_misses; + __uint32_t xs_qm_dquot_dups; + __uint32_t xs_qm_dqcachemisses; + __uint32_t xs_qm_dqcachehits; + __uint32_t xs_qm_dqwants; +#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) + __uint32_t xs_qm_dquot; + __uint32_t xs_qm_dquot_unused; +/* Extra precision counters */ + __uint64_t xs_xstrat_bytes; + __uint64_t xs_write_bytes; + __uint64_t xs_read_bytes; +}; + +DECLARE_PER_CPU(struct xfsstats, xfsstats); + +/* + * We don't disable preempt, not too worried about poking the + * wrong CPU's stat for now (also aggregated before reporting). + */ +#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++) +#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) +#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) + +extern int xfs_init_procfs(void); +extern void xfs_cleanup_procfs(void); + + +#else /* !CONFIG_PROC_FS */ + +# define XFS_STATS_INC(count) +# define XFS_STATS_DEC(count) +# define XFS_STATS_ADD(count, inc) + +static inline int xfs_init_procfs(void) +{ + return 0; +} + +static inline void xfs_cleanup_procfs(void) +{ +} + +#endif /* !CONFIG_PROC_FS */ + +#endif /* __XFS_STATS_H__ */ diff --git a/kernel/fs/xfs/xfs_super.c b/kernel/fs/xfs/xfs_super.c new file mode 100644 index 000000000..858e1e62b --- /dev/null +++ b/kernel/fs/xfs/xfs_super.c @@ -0,0 +1,1889 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_fsops.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_extfree_item.h" +#include "xfs_mru_cache.h" +#include "xfs_inode_item.h" +#include "xfs_icache.h" +#include "xfs_trace.h" +#include "xfs_icreate_item.h" +#include "xfs_filestream.h" +#include "xfs_quota.h" +#include "xfs_sysfs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct super_operations xfs_super_operations; +static kmem_zone_t *xfs_ioend_zone; +mempool_t *xfs_ioend_pool; + +static struct kset *xfs_kset; /* top-level xfs sysfs dir */ +#ifdef DEBUG +static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ +#endif + +#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ +#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ +#define MNTOPT_LOGDEV "logdev" /* log device */ +#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ +#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ +#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ +#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ +#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ +#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ +#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ +#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ +#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ +#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ +#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */ +#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ +#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ +#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ +#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ +#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and + * unwritten extent conversion */ +#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ +#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to + * XFS_MAXINUMBER_32 */ +#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ +#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ +#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ +#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes + * in stat(). */ +#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ +#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ +#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ +#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ +#define MNTOPT_NOQUOTA "noquota" /* no quotas */ +#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ +#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ +#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ +#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ +#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ +#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ +#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ +#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ +#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ +#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ +#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ +#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ + +/* + * Table driven mount option parser. + * + * Currently only used for remount, but it will be used for mount + * in the future, too. + */ +enum { + Opt_barrier, + Opt_nobarrier, + Opt_inode64, + Opt_inode32, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_inode64, "inode64"}, + {Opt_inode32, "inode32"}, + {Opt_err, NULL} +}; + + +STATIC unsigned long +suffix_kstrtoint(char *s, unsigned int base, int *res) +{ + int last, shift_left_factor = 0, _res; + char *value = s; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoint(s, base, &_res)) + return -EINVAL; + *res = _res << shift_left_factor; + return 0; +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + * + * Note that this function leaks the various device name allocations on + * failure. The caller takes care of them. + */ +STATIC int +xfs_parseargs( + struct xfs_mount *mp, + char *options) +{ + struct super_block *sb = mp->m_super; + char *this_char, *value; + int dsunit = 0; + int dswidth = 0; + int iosize = 0; + __uint8_t iosizelog = 0; + + /* + * set up the mount name first so all the errors will refer to the + * correct device. + */ + mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_fsname) + return -ENOMEM; + mp->m_fsname_len = strlen(mp->m_fsname) + 1; + + /* + * Copy binary VFS mount flags we are interested in. + */ + if (sb->s_flags & MS_RDONLY) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (sb->s_flags & MS_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (sb->s_flags & MS_SYNCHRONOUS) + mp->m_flags |= XFS_MOUNT_WSYNC; + + /* + * Set some default flags that could be cleared by the mount option + * parsing. + */ + mp->m_flags |= XFS_MOUNT_BARRIER; + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + + /* + * These can be overridden by the mount option parsing. + */ + mp->m_logbufs = -1; + mp->m_logbsize = -1; + + if (!options) + goto done; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, MNTOPT_LOGBUFS)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (kstrtoint(value, 10, &mp->m_logbufs)) + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_logname) + return -ENOMEM; + } else if (!strcmp(this_char, MNTOPT_MTPT)) { + xfs_warn(mp, "%s option not allowed on this system", + this_char); + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_RTDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_rtname) + return -ENOMEM; + } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (kstrtoint(value, 10, &iosize)) + return -EINVAL; + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (suffix_kstrtoint(value, 10, &iosize)) + return -EINVAL; + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_GRPID) || + !strcmp(this_char, MNTOPT_BSDGROUPS)) { + mp->m_flags |= XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_NOGRPID) || + !strcmp(this_char, MNTOPT_SYSVGROUPS)) { + mp->m_flags &= ~XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_WSYNC)) { + mp->m_flags |= XFS_MOUNT_WSYNC; + } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { + mp->m_flags |= XFS_MOUNT_NORECOVERY; + } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { + mp->m_flags |= XFS_MOUNT_NOALIGN; + } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { + mp->m_flags |= XFS_MOUNT_SWALLOC; + } else if (!strcmp(this_char, MNTOPT_SUNIT)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (kstrtoint(value, 10, &dsunit)) + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (kstrtoint(value, 10, &dswidth)) + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + mp->m_flags |= XFS_MOUNT_NOUUID; + } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + mp->m_flags |= XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + mp->m_flags |= XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + mp->m_flags &= ~XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { + mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_ATTR2)) { + mp->m_flags |= XFS_MOUNT_ATTR2; + } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; + } else if (!strcmp(this_char, MNTOPT_QUOTA) || + !strcmp(this_char, MNTOPT_UQUOTA) || + !strcmp(this_char, MNTOPT_USRQUOTA)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_UQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || + !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_UQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_PQUOTA) || + !strcmp(this_char, MNTOPT_PRJQUOTA)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_PQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_PQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_GQUOTA) || + !strcmp(this_char, MNTOPT_GRPQUOTA)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_GQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_GQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_DISCARD)) { + mp->m_flags |= XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } else { + xfs_warn(mp, "unknown mount option [%s].", this_char); + return -EINVAL; + } + } + + /* + * no recovery flag requires a read-only mount + */ + if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return -EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); + return -EINVAL; + } + +#ifndef CONFIG_XFS_QUOTA + if (XFS_IS_QUOTA_RUNNING(mp)) { + xfs_warn(mp, "quota support not available in this kernel."); + return -EINVAL; + } +#endif + + if ((dsunit && !dswidth) || (!dsunit && dswidth)) { + xfs_warn(mp, "sunit and swidth must be specified together"); + return -EINVAL; + } + + if (dsunit && (dswidth % dsunit != 0)) { + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", + dswidth, dsunit); + return -EINVAL; + } + +done: + if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ + mp->m_dalign = dsunit; + mp->m_swidth = dswidth; + } + + if (mp->m_logbufs != -1 && + mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return -EINVAL; + } + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return -EINVAL; + } + + if (iosizelog) { + if (iosizelog > XFS_MAX_IO_LOG || + iosizelog < XFS_MIN_IO_LOG) { + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", + iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return -EINVAL; + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = iosizelog; + mp->m_writeio_log = iosizelog; + } + + return 0; +} + +struct proc_xfs_info { + int flag; + char *str; +}; + +STATIC int +xfs_showargs( + struct xfs_mount *mp, + struct seq_file *m) +{ + static struct proc_xfs_info xfs_info_set[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, + { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, + { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, + { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, + { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, + { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, + { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, + { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, + { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, + { 0, NULL } + }; + static struct proc_xfs_info xfs_info_unset[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, + { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, + { 0, NULL } + }; + struct proc_xfs_info *xfs_infop; + + for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { + if (mp->m_flags & xfs_infop->flag) + seq_puts(m, xfs_infop->str); + } + for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) { + if (!(mp->m_flags & xfs_infop->flag)) + seq_puts(m, xfs_infop->str); + } + + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", + (int)(1 << mp->m_writeio_log) >> 10); + + if (mp->m_logbufs > 0) + seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); + if (mp->m_logbsize > 0) + seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); + + if (mp->m_logname) + seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + if (mp->m_rtname) + seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + + if (mp->m_dalign > 0) + seq_printf(m, "," MNTOPT_SUNIT "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); + if (mp->m_swidth > 0) + seq_printf(m, "," MNTOPT_SWIDTH "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); + + if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) + seq_puts(m, "," MNTOPT_USRQUOTA); + else if (mp->m_qflags & XFS_UQUOTA_ACCT) + seq_puts(m, "," MNTOPT_UQUOTANOENF); + + if (mp->m_qflags & XFS_PQUOTA_ACCT) { + if (mp->m_qflags & XFS_PQUOTA_ENFD) + seq_puts(m, "," MNTOPT_PRJQUOTA); + else + seq_puts(m, "," MNTOPT_PQUOTANOENF); + } + if (mp->m_qflags & XFS_GQUOTA_ACCT) { + if (mp->m_qflags & XFS_GQUOTA_ENFD) + seq_puts(m, "," MNTOPT_GRPQUOTA); + else + seq_puts(m, "," MNTOPT_GQUOTANOENF); + } + + if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) + seq_puts(m, "," MNTOPT_NOQUOTA); + + return 0; +} +__uint64_t +xfs_max_file_offset( + unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_write_begin does this in an [unsigned] long... + * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBDAF) + ASSERT(sizeof(sector_t) == 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((__uint64_t)pagefactor) << bitshift) - 1; +} + +/* + * xfs_set_inode32() and xfs_set_inode64() are passed an agcount + * because in the growfs case, mp->m_sb.sb_agcount is not updated + * yet to the potentially higher ag count. + */ +xfs_agnumber_t +xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount) +{ + xfs_agnumber_t index = 0; + xfs_agnumber_t maxagi = 0; + xfs_sb_t *sbp = &mp->m_sb; + xfs_agnumber_t max_metadata; + xfs_agino_t agino; + xfs_ino_t ino; + xfs_perag_t *pag; + + /* Calculate how much should be reserved for inodes to meet + * the max inode percentage. + */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, sbp->sb_agblocks); + max_metadata = icount; + } else { + max_metadata = agcount; + } + + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + + for (index = 0; index < agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + + if (ino > XFS_MAXINUMBER_32) { + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 0; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + continue; + } + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + maxagi++; + if (index < max_metadata) + pag->pagf_metadata = 1; + xfs_perag_put(pag); + } + mp->m_flags |= (XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + + return maxagi; +} + +xfs_agnumber_t +xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount) +{ + xfs_agnumber_t index = 0; + + for (index = 0; index < agcount; index++) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + } + + /* There is no need for lock protection on m_flags, + * the rw_semaphore of the VFS superblock is locked + * during mount/umount/remount operations, so this is + * enough to avoid concurency on the m_flags field + */ + mp->m_flags &= ~(XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + return index; +} + +STATIC int +xfs_blkdev_get( + xfs_mount_t *mp, + const char *name, + struct block_device **bdevp) +{ + int error = 0; + + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); + if (IS_ERR(*bdevp)) { + error = PTR_ERR(*bdevp); + xfs_warn(mp, "Invalid device [%s], error=%d", name, error); + } + + return error; +} + +STATIC void +xfs_blkdev_put( + struct block_device *bdev) +{ + if (bdev) + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +void +xfs_blkdev_issue_flush( + xfs_buftarg_t *buftarg) +{ + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); +} + +STATIC void +xfs_close_devices( + struct xfs_mount *mp) +{ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_logdev_targp); + xfs_blkdev_put(logdev); + } + if (mp->m_rtdev_targp) { + struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_blkdev_put(rtdev); + } + xfs_free_buftarg(mp, mp->m_ddev_targp); +} + +/* + * The file system configurations are: + * (1) device (partition) with data and internal log + * (2) logical volume with data and log subvolumes. + * (3) logical volume with data, log, and realtime subvolumes. + * + * We only have to handle opening the log and realtime volumes here if + * they are present. The data subvolume has already been opened by + * get_sb_bdev() and is stored in sb->s_bdev. + */ +STATIC int +xfs_open_devices( + struct xfs_mount *mp) +{ + struct block_device *ddev = mp->m_super->s_bdev; + struct block_device *logdev = NULL, *rtdev = NULL; + int error; + + /* + * Open real time and log devices - order is important. + */ + if (mp->m_logname) { + error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + if (error) + goto out; + } + + if (mp->m_rtname) { + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + if (error) + goto out_close_logdev; + + if (rtdev == ddev || rtdev == logdev) { + xfs_warn(mp, + "Cannot mount filesystem with identical rtdev and ddev/logdev."); + error = -EINVAL; + goto out_close_rtdev; + } + } + + /* + * Setup xfs_mount buffer target pointers + */ + error = -ENOMEM; + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); + if (!mp->m_ddev_targp) + goto out_close_rtdev; + + if (rtdev) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); + if (!mp->m_rtdev_targp) + goto out_free_ddev_targ; + } + + if (logdev && logdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); + if (!mp->m_logdev_targp) + goto out_free_rtdev_targ; + } else { + mp->m_logdev_targp = mp->m_ddev_targp; + } + + return 0; + + out_free_rtdev_targ: + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp, mp->m_rtdev_targp); + out_free_ddev_targ: + xfs_free_buftarg(mp, mp->m_ddev_targp); + out_close_rtdev: + xfs_blkdev_put(rtdev); + out_close_logdev: + if (logdev && logdev != ddev) + xfs_blkdev_put(logdev); + out: + return error; +} + +/* + * Setup xfs_mount buffer target pointers based on superblock + */ +STATIC int +xfs_setup_devices( + struct xfs_mount *mp) +{ + int error; + + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + if (error) + return error; + + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + unsigned int log_sector_size = BBSIZE; + + if (xfs_sb_version_hassector(&mp->m_sb)) + log_sector_size = mp->m_sb.sb_logsectsize; + error = xfs_setsize_buftarg(mp->m_logdev_targp, + log_sector_size); + if (error) + return error; + } + if (mp->m_rtdev_targp) { + error = xfs_setsize_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_sectsize); + if (error) + return error; + } + + return 0; +} + +STATIC int +xfs_init_mount_workqueues( + struct xfs_mount *mp) +{ + mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname); + if (!mp->m_buf_workqueue) + goto out; + + mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_data_workqueue) + goto out_destroy_buf; + + mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_unwritten_workqueue) + goto out_destroy_data_iodone_queue; + + mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_cil_workqueue) + goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + + mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + + return 0; + +out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + destroy_workqueue(mp->m_cil_workqueue); +out_destroy_unwritten: + destroy_workqueue(mp->m_unwritten_workqueue); +out_destroy_data_iodone_queue: + destroy_workqueue(mp->m_data_workqueue); +out_destroy_buf: + destroy_workqueue(mp->m_buf_workqueue); +out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_mount_workqueues( + struct xfs_mount *mp) +{ + destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); + destroy_workqueue(mp->m_cil_workqueue); + destroy_workqueue(mp->m_data_workqueue); + destroy_workqueue(mp->m_unwritten_workqueue); + destroy_workqueue(mp->m_buf_workqueue); +} + +/* + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK + * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting + * for IO to complete so that we effectively throttle multiple callers to the + * rate at which IO is completing. + */ +void +xfs_flush_inodes( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + +/* Catch misguided souls that try to use this interface on XFS */ +STATIC struct inode * +xfs_fs_alloc_inode( + struct super_block *sb) +{ + BUG(); + return NULL; +} + +/* + * Now that the generic code is guaranteed not to be accessing + * the linux inode, we can reclaim the inode. + */ +STATIC void +xfs_fs_destroy_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_destroy_inode(ip); + + XFS_STATS_INC(vn_reclaim); + + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + + /* + * We should never get here with one of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* + * We always use background reclaim here because even if the + * inode is clean, it still may be under IO and hence we have + * to take the flush lock. The background reclaim path handles + * this more efficiently than we can here, so simply let background + * reclaim tear down all inodes. + */ + xfs_inode_set_reclaim_tag(ip); +} + +/* + * Slab object creation initialisation for the XFS inode. + * This covers only the idempotent fields in the XFS inode; + * all other fields need to be initialised on allocation + * from the slab. This avoids the need to repeatedly initialise + * fields in the xfs inode that left in the initialise state + * when freeing the inode. + */ +STATIC void +xfs_fs_inode_init_once( + void *inode) +{ + struct xfs_inode *ip = inode; + + memset(ip, 0, sizeof(struct xfs_inode)); + + /* vfs inode */ + inode_init_once(VFS_I(ip)); + + /* xfs inode */ + atomic_set(&ip->i_pincount, 0); + spin_lock_init(&ip->i_flags_lock); + + mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); + mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); +} + +STATIC void +xfs_fs_evict_inode( + struct inode *inode) +{ + xfs_inode_t *ip = XFS_I(inode); + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + + trace_xfs_evict_inode(ip); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + XFS_STATS_INC(vn_rele); + XFS_STATS_INC(vn_remove); + + xfs_inactive(ip); +} + +/* + * We do an unlocked check for XFS_IDONTCACHE here because we are already + * serialised against cache hits here via the inode->i_lock and igrab() in + * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be + * racing with us, and it avoids needing to grab a spinlock here for every inode + * we drop the final reference on. + */ +STATIC int +xfs_fs_drop_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); +} + +STATIC void +xfs_free_fsname( + struct xfs_mount *mp) +{ + kfree(mp->m_fsname); + kfree(mp->m_rtname); + kfree(mp->m_logname); +} + +STATIC int +xfs_fs_sync_fs( + struct super_block *sb, + int wait) +{ + struct xfs_mount *mp = XFS_M(sb); + + /* + * Doing anything during the async pass would be counterproductive. + */ + if (!wait) + return 0; + + xfs_log_force(mp, XFS_LOG_SYNC); + if (laptop_mode) { + /* + * The disk must be active because we're syncing. + * We schedule log work now (now that the disk is + * active) instead of later (when it might not be). + */ + flush_delayed_work(&mp->m_log->l_work); + } + + return 0; +} + +STATIC int +xfs_fs_statfs( + struct dentry *dentry, + struct kstatfs *statp) +{ + struct xfs_mount *mp = XFS_M(dentry->d_sb); + xfs_sb_t *sbp = &mp->m_sb; + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + __uint64_t fakeinos, id; + __uint64_t icount; + __uint64_t ifree; + __uint64_t fdblocks; + xfs_extlen_t lsize; + __int64_t ffree; + + statp->f_type = XFS_SB_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; + + id = huge_encode_dev(mp->m_ddev_targp->bt_dev); + statp->f_fsid.val[0] = (u32)id; + statp->f_fsid.val[1] = (u32)(id >> 32); + + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); + + spin_lock(&mp->m_sb_lock); + statp->f_bsize = sbp->sb_blocksize; + lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; + statp->f_blocks = sbp->sb_dblocks - lsize; + spin_unlock(&mp->m_sb_lock); + + statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp); + statp->f_bavail = statp->f_bfree; + + fakeinos = statp->f_bfree << sbp->sb_inopblog; + statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + if (mp->m_maxicount) + statp->f_files = min_t(typeof(statp->f_files), + statp->f_files, + mp->m_maxicount); + + /* If sb_icount overshot maxicount, report actual allocation */ + statp->f_files = max_t(typeof(statp->f_files), + statp->f_files, + sbp->sb_icount); + + /* make sure statp->f_ffree does not underflow */ + ffree = statp->f_files - (icount - ifree); + statp->f_ffree = max_t(__int64_t, ffree, 0); + + + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) + xfs_qm_statvfs(ip, statp); + return 0; +} + +STATIC void +xfs_save_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks = 0; + + mp->m_resblks_save = mp->m_resblks; + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC void +xfs_restore_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks; + + if (mp->m_resblks_save) { + resblks = mp->m_resblks_save; + mp->m_resblks_save = 0; + } else + resblks = xfs_default_resblks(mp); + + xfs_reserve_blocks(mp, &resblks, NULL); +} + +/* + * Trigger writeback of all the dirty metadata in the file system. + * + * This ensures that the metadata is written to their location on disk rather + * than just existing in transactions in the log. This means after a quiesce + * there is no log replay required to write the inodes to disk - this is the + * primary difference between a sync and a quiesce. + * + * Note: xfs_log_quiesce() stops background log work - the callers must ensure + * it is started again when appropriate. + */ +static void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + xfs_log_quiesce(mp); +} + +STATIC int +xfs_fs_remount( + struct super_block *sb, + int *flags, + char *options) +{ + struct xfs_mount *mp = XFS_M(sb); + xfs_sb_t *sbp = &mp->m_sb; + substring_t args[MAX_OPT_ARGS]; + char *p; + int error; + + sync_filesystem(sb); + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; + case Opt_inode64: + mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount); + break; + case Opt_inode32: + mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount); + break; + default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 + xfs_info(mp, + "mount option \"%s\" not supported for remount", p); + return -EINVAL; +#else + break; +#endif + } + } + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } + + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + /* + * If this is the first remount to writeable state we + * might have some superblock changes to update. + */ + if (mp->m_update_sb) { + error = xfs_sync_sb(mp, false); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + return error; + } + mp->m_update_sb = false; + } + + /* + * Fill out the reserve pool if it is empty. Use the stashed + * value if it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* + * Before we sync the metadata, we need to free up the reserve + * block pool so that the used block count in the superblock on + * disk is correct at the end of the remount. Stash the current + * reserve pool size so that if we get remounted rw, we can + * return it to the same size. + */ + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } + + return 0; +} + +/* + * Second stage of a freeze. The data is already frozen so we only + * need to take care of the metadata. Once that's done sync the superblock + * to the log to dirty it in case of a crash while frozen. This ensures that we + * will recover the unlinked inode lists on the next mount. + */ +STATIC int +xfs_fs_freeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + return xfs_sync_sb(mp, true); +} + +STATIC int +xfs_fs_unfreeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + return 0; +} + +STATIC int +xfs_fs_show_options( + struct seq_file *m, + struct dentry *root) +{ + return xfs_showargs(XFS_M(root->d_sb), m); +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock _has_ now been read in. + */ +STATIC int +xfs_finish_flags( + struct xfs_mount *mp) +{ + int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); + + /* Fail a mount where the logbuf is smaller than the log stripe */ + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if (mp->m_logbsize <= 0 && + mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { + mp->m_logbsize = mp->m_sb.sb_logsunit; + } else if (mp->m_logbsize > 0 && + mp->m_logbsize < mp->m_sb.sb_logsunit) { + xfs_warn(mp, + "logbuf size must be greater than or equal to log stripe size"); + return -EINVAL; + } + } else { + /* Fail a mount if the logbuf is larger than 32K */ + if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { + xfs_warn(mp, + "logbuf size for version 1 logs must be 16K or 32K"); + return -EINVAL; + } + } + + /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return -EINVAL; + } + + /* + * mkfs'ed attr2 will turn on attr2 mount unless explicitly + * told by noattr2 to turn it off + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + + /* + * prohibit r/w mounts of read-only filesystems + */ + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + xfs_warn(mp, + "cannot mount a read-only filesystem as read-write"); + return -EROFS; + } + + if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && + (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && + !xfs_sb_version_has_pquotino(&mp->m_sb)) { + xfs_warn(mp, + "Super block does not support project and group quota together"); + return -EINVAL; + } + + return 0; +} + +static int +xfs_init_percpu_counters( + struct xfs_mount *mp) +{ + int error; + + error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); + if (error) + return -ENOMEM; + + error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL); + if (error) + goto free_icount; + + error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); + if (error) + goto free_ifree; + + return 0; + +free_ifree: + percpu_counter_destroy(&mp->m_ifree); +free_icount: + percpu_counter_destroy(&mp->m_icount); + return -ENOMEM; +} + +void +xfs_reinit_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); + percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); + percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); +} + +static void +xfs_destroy_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_destroy(&mp->m_icount); + percpu_counter_destroy(&mp->m_ifree); + percpu_counter_destroy(&mp->m_fdblocks); +} + +STATIC int +xfs_fs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + struct inode *root; + struct xfs_mount *mp = NULL; + int flags = 0, error = -ENOMEM; + + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); + if (!mp) + goto out; + + spin_lock_init(&mp->m_sb_lock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + mp->m_kobj.kobject.kset = xfs_kset; + + mp->m_super = sb; + sb->s_fs_info = mp; + + error = xfs_parseargs(mp, (char *)data); + if (error) + goto out_free_fsname; + + sb_min_blocksize(sb, BBSIZE); + sb->s_xattr = xfs_xattr_handlers; + sb->s_export_op = &xfs_export_operations; +#ifdef CONFIG_XFS_QUOTA + sb->s_qcop = &xfs_quotactl_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; +#endif + sb->s_op = &xfs_super_operations; + + if (silent) + flags |= XFS_MFSI_QUIET; + + error = xfs_open_devices(mp); + if (error) + goto out_free_fsname; + + error = xfs_init_mount_workqueues(mp); + if (error) + goto out_close_devices; + + error = xfs_init_percpu_counters(mp); + if (error) + goto out_destroy_workqueues; + + error = xfs_readsb(mp, flags); + if (error) + goto out_destroy_counters; + + error = xfs_finish_flags(mp); + if (error) + goto out_free_sb; + + error = xfs_setup_devices(mp); + if (error) + goto out_free_sb; + + error = xfs_filestream_mount(mp); + if (error) + goto out_free_sb; + + /* + * we must configure the block size in the superblock before we run the + * full mount process as the mount process can lookup and cache inodes. + */ + sb->s_magic = XFS_SB_MAGIC; + sb->s_blocksize = mp->m_sb.sb_blocksize; + sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; + sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_max_links = XFS_MAXLINK; + sb->s_time_gran = 1; + set_posix_acl_flag(sb); + + /* version 5 superblocks support inode version counters. */ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + sb->s_flags |= MS_I_VERSION; + + error = xfs_mountfs(mp); + if (error) + goto out_filestream_unmount; + + root = igrab(VFS_I(mp->m_rootip)); + if (!root) { + error = -ENOENT; + goto out_unmount; + } + sb->s_root = d_make_root(root); + if (!sb->s_root) { + error = -ENOMEM; + goto out_unmount; + } + + return 0; + + out_filestream_unmount: + xfs_filestream_unmount(mp); + out_free_sb: + xfs_freesb(mp); + out_destroy_counters: + xfs_destroy_percpu_counters(mp); +out_destroy_workqueues: + xfs_destroy_mount_workqueues(mp); + out_close_devices: + xfs_close_devices(mp); + out_free_fsname: + xfs_free_fsname(mp); + kfree(mp); + out: + return error; + + out_unmount: + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + goto out_free_sb; +} + +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_notice(mp, "Unmounting Filesystem"); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + +STATIC struct dentry * +xfs_fs_mount( + struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); +} + +static long +xfs_fs_nr_cached_objects( + struct super_block *sb, + struct shrink_control *sc) +{ + return xfs_reclaim_inodes_count(XFS_M(sb)); +} + +static long +xfs_fs_free_cached_objects( + struct super_block *sb, + struct shrink_control *sc) +{ + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); +} + +static const struct super_operations xfs_super_operations = { + .alloc_inode = xfs_fs_alloc_inode, + .destroy_inode = xfs_fs_destroy_inode, + .evict_inode = xfs_fs_evict_inode, + .drop_inode = xfs_fs_drop_inode, + .put_super = xfs_fs_put_super, + .sync_fs = xfs_fs_sync_fs, + .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, + .statfs = xfs_fs_statfs, + .remount_fs = xfs_fs_remount, + .show_options = xfs_fs_show_options, + .nr_cached_objects = xfs_fs_nr_cached_objects, + .free_cached_objects = xfs_fs_free_cached_objects, +}; + +static struct file_system_type xfs_fs_type = { + .owner = THIS_MODULE, + .name = "xfs", + .mount = xfs_fs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("xfs"); + +STATIC int __init +xfs_init_zones(void) +{ + + xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); + if (!xfs_ioend_zone) + goto out; + + xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, + xfs_ioend_zone); + if (!xfs_ioend_pool) + goto out_destroy_ioend_zone; + + xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), + "xfs_log_ticket"); + if (!xfs_log_ticket_zone) + goto out_destroy_ioend_pool; + + xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), + "xfs_bmap_free_item"); + if (!xfs_bmap_free_item_zone) + goto out_destroy_log_ticket_zone; + + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), + "xfs_btree_cur"); + if (!xfs_btree_cur_zone) + goto out_destroy_bmap_free_item_zone; + + xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), + "xfs_da_state"); + if (!xfs_da_state_zone) + goto out_destroy_btree_cur_zone; + + xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); + if (!xfs_ifork_zone) + goto out_destroy_da_state_zone; + + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + if (!xfs_trans_zone) + goto out_destroy_ifork_zone; + + xfs_log_item_desc_zone = + kmem_zone_init(sizeof(struct xfs_log_item_desc), + "xfs_log_item_desc"); + if (!xfs_log_item_desc_zone) + goto out_destroy_trans_zone; + + /* + * The size of the zone allocated buf log item is the maximum + * size possible under XFS. This wastes a little bit of memory, + * but it is much faster. + */ + xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), + "xfs_buf_item"); + if (!xfs_buf_item_zone) + goto out_destroy_log_item_desc_zone; + + xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efd_item"); + if (!xfs_efd_zone) + goto out_destroy_buf_item_zone; + + xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efi_item"); + if (!xfs_efi_zone) + goto out_destroy_efd_zone; + + xfs_inode_zone = + kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, + xfs_fs_inode_init_once); + if (!xfs_inode_zone) + goto out_destroy_efi_zone; + + xfs_ili_zone = + kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", + KM_ZONE_SPREAD, NULL); + if (!xfs_ili_zone) + goto out_destroy_inode_zone; + xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), + "xfs_icr"); + if (!xfs_icreate_zone) + goto out_destroy_ili_zone; + + return 0; + + out_destroy_ili_zone: + kmem_zone_destroy(xfs_ili_zone); + out_destroy_inode_zone: + kmem_zone_destroy(xfs_inode_zone); + out_destroy_efi_zone: + kmem_zone_destroy(xfs_efi_zone); + out_destroy_efd_zone: + kmem_zone_destroy(xfs_efd_zone); + out_destroy_buf_item_zone: + kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_log_item_desc_zone: + kmem_zone_destroy(xfs_log_item_desc_zone); + out_destroy_trans_zone: + kmem_zone_destroy(xfs_trans_zone); + out_destroy_ifork_zone: + kmem_zone_destroy(xfs_ifork_zone); + out_destroy_da_state_zone: + kmem_zone_destroy(xfs_da_state_zone); + out_destroy_btree_cur_zone: + kmem_zone_destroy(xfs_btree_cur_zone); + out_destroy_bmap_free_item_zone: + kmem_zone_destroy(xfs_bmap_free_item_zone); + out_destroy_log_ticket_zone: + kmem_zone_destroy(xfs_log_ticket_zone); + out_destroy_ioend_pool: + mempool_destroy(xfs_ioend_pool); + out_destroy_ioend_zone: + kmem_zone_destroy(xfs_ioend_zone); + out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_zones(void) +{ + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); + kmem_zone_destroy(xfs_icreate_zone); + kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_inode_zone); + kmem_zone_destroy(xfs_efi_zone); + kmem_zone_destroy(xfs_efd_zone); + kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_log_item_desc_zone); + kmem_zone_destroy(xfs_trans_zone); + kmem_zone_destroy(xfs_ifork_zone); + kmem_zone_destroy(xfs_da_state_zone); + kmem_zone_destroy(xfs_btree_cur_zone); + kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_zone_destroy(xfs_log_ticket_zone); + mempool_destroy(xfs_ioend_pool); + kmem_zone_destroy(xfs_ioend_zone); + +} + +STATIC int __init +xfs_init_workqueues(void) +{ + /* + * The allocation workqueue can be used in memory reclaim situations + * (writepage path), and parallelism is only limited by the number of + * AGs in all the filesystems mounted. Hence use the default large + * max_active value for this workqueue. + */ + xfs_alloc_wq = alloc_workqueue("xfsalloc", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0); + if (!xfs_alloc_wq) + return -ENOMEM; + + return 0; +} + +STATIC void +xfs_destroy_workqueues(void) +{ + destroy_workqueue(xfs_alloc_wq); +} + +STATIC int __init +init_xfs_fs(void) +{ + int error; + + printk(KERN_INFO XFS_VERSION_STRING " with " + XFS_BUILD_OPTIONS " enabled\n"); + + xfs_dir_startup(); + + error = xfs_init_zones(); + if (error) + goto out; + + error = xfs_init_workqueues(); + if (error) + goto out_destroy_zones; + + error = xfs_mru_cache_init(); + if (error) + goto out_destroy_wq; + + error = xfs_buf_init(); + if (error) + goto out_mru_cache_uninit; + + error = xfs_init_procfs(); + if (error) + goto out_buf_terminate; + + error = xfs_sysctl_register(); + if (error) + goto out_cleanup_procfs; + + xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); + if (!xfs_kset) { + error = -ENOMEM; + goto out_sysctl_unregister;; + } + +#ifdef DEBUG + xfs_dbg_kobj.kobject.kset = xfs_kset; + error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); + if (error) + goto out_kset_unregister; +#endif + + error = xfs_qm_init(); + if (error) + goto out_remove_kobj; + + error = register_filesystem(&xfs_fs_type); + if (error) + goto out_qm_exit; + return 0; + + out_qm_exit: + xfs_qm_exit(); + out_remove_kobj: +#ifdef DEBUG + xfs_sysfs_del(&xfs_dbg_kobj); + out_kset_unregister: +#endif + kset_unregister(xfs_kset); + out_sysctl_unregister: + xfs_sysctl_unregister(); + out_cleanup_procfs: + xfs_cleanup_procfs(); + out_buf_terminate: + xfs_buf_terminate(); + out_mru_cache_uninit: + xfs_mru_cache_uninit(); + out_destroy_wq: + xfs_destroy_workqueues(); + out_destroy_zones: + xfs_destroy_zones(); + out: + return error; +} + +STATIC void __exit +exit_xfs_fs(void) +{ + xfs_qm_exit(); + unregister_filesystem(&xfs_fs_type); +#ifdef DEBUG + xfs_sysfs_del(&xfs_dbg_kobj); +#endif + kset_unregister(xfs_kset); + xfs_sysctl_unregister(); + xfs_cleanup_procfs(); + xfs_buf_terminate(); + xfs_mru_cache_uninit(); + xfs_destroy_workqueues(); + xfs_destroy_zones(); +} + +module_init(init_xfs_fs); +module_exit(exit_xfs_fs); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/xfs/xfs_super.h b/kernel/fs/xfs/xfs_super.h new file mode 100644 index 000000000..499058fea --- /dev/null +++ b/kernel/fs/xfs/xfs_super.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPER_H__ +#define __XFS_SUPER_H__ + +#include + +#ifdef CONFIG_XFS_QUOTA +extern int xfs_qm_init(void); +extern void xfs_qm_exit(void); +#else +# define xfs_qm_init() (0) +# define xfs_qm_exit() do { } while (0) +#endif + +#ifdef CONFIG_XFS_POSIX_ACL +# define XFS_ACL_STRING "ACLs, " +# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +#else +# define XFS_ACL_STRING +# define set_posix_acl_flag(sb) do { } while (0) +#endif + +#define XFS_SECURITY_STRING "security attributes, " + +#ifdef CONFIG_XFS_RT +# define XFS_REALTIME_STRING "realtime, " +#else +# define XFS_REALTIME_STRING +#endif + +#ifdef DEBUG +# define XFS_DBG_STRING "debug" +#else +# define XFS_DBG_STRING "no debug" +#endif + +#define XFS_VERSION_STRING "SGI XFS" +#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ + XFS_SECURITY_STRING \ + XFS_REALTIME_STRING \ + XFS_DBG_STRING /* DBG must be last */ + +struct xfs_inode; +struct xfs_mount; +struct xfs_buftarg; +struct block_device; + +extern __uint64_t xfs_max_file_offset(unsigned int); + +extern void xfs_flush_inodes(struct xfs_mount *mp); +extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); +extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount); +extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount); + +extern const struct export_operations xfs_export_operations; +extern const struct xattr_handler *xfs_xattr_handlers[]; +extern const struct quotactl_ops xfs_quotactl_operations; + +extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); + +#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) + +#endif /* __XFS_SUPER_H__ */ diff --git a/kernel/fs/xfs/xfs_symlink.c b/kernel/fs/xfs/xfs_symlink.c new file mode 100644 index 000000000..3df411ead --- /dev/null +++ b/kernel/fs/xfs/xfs_symlink.c @@ -0,0 +1,608 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_trans.h" +#include "xfs_log.h" + +/* ----- Kernel only functions below ----- */ +STATIC int +xfs_readlink_bmap( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_d.di_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &xfs_symlink_buf_ops); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + goto out; + } + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, + byte_cnt, bp)) { + error = -EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, bp->b_addr, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_d.di_size] = '\0'; + error = 0; + + out: + return error; +} + +int +xfs_readlink( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = 0; + + trace_xfs_readlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + error = -EFSCORRUPTED; + goto out; + } + + + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); + } + + out: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +int +xfs_symlink( + struct xfs_inode *dp, + struct xfs_name *link_name, + const char *target_path, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = NULL; + struct xfs_inode *ip = NULL; + int error = 0; + int pathlen; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + xfs_daddr_t d; + const char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + uint resblks; + + *ipp = NULL; + + trace_xfs_symlink(dp, link_name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return -ENAMETOOLONG; + + udqp = gdqp = NULL; + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, + xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * The symlink will fit into the inode data fork? + * There can't be any attributes so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + fs_blocks = 0; + else + fs_blocks = xfs_symlink_blocks(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); + if (error == -ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = -EPERM; + goto out_trans_cancel; + } + + /* + * Reserve disk quota : blocks and inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + /* + * Check for ability to enter directory entry, if no space reserved. + */ + if (!resblks) { + error = xfs_dir_canenter(tp, dp, link_name); + if (error) + goto out_trans_cancel; + } + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. + */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) + goto out_trans_cancel; + + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. + */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + int offset; + + first_fsb = 0; + nmaps = XFS_SYMLINK_MAPS; + + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); + if (error) + goto out_bmap_cancel; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + if (!bp) { + error = -ENOMEM; + goto out_bmap_cancel; + } + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + byte_cnt = min(byte_cnt, pathlen); + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, + byte_cnt, bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + ASSERT(pathlen == 0); + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto out_bmap_cancel; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + +out_bmap_cancel: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; +out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +STATIC int +xfs_inactive_symlink_rmt( + struct xfs_inode *ip) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; + int nmaps; + int size; + xfs_trans_t *tp; + + mp = ip->i_mount; + ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. + */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), + mval, &nmaps, 0); + if (error) + goto error_trans_cancel; + /* + * Invalidate the block(s). No validation is done. + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = -ENOMEM; + goto error_bmap_cancel; + } + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done); + if (error) + goto error_bmap_cancel; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error_bmap_cancel; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. + */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Commit the transaction containing extent freeing and EFDs. + */ + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error_unlock; + } + + /* + * Remove the memory for extent descriptions (just bookkeeping). + */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_bmap_cancel: + xfs_bmap_cancel(&free_list); +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_symlink - free a symlink + */ +int +xfs_inactive_symlink( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + int pathlen; + + trace_xfs_inactive_symlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Zero length symlinks _can_ exist. + */ + pathlen = (int)ip->i_d.di_size; + if (!pathlen) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + } + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", + __func__, (unsigned long long)ip->i_ino, pathlen); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(0); + return -EFSCORRUPTED; + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { + if (ip->i_df.if_bytes > 0) + xfs_idata_realloc(ip, -(ip->i_df.if_bytes), + XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(ip->i_df.if_bytes == 0); + return 0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* remove the remote symlink */ + return xfs_inactive_symlink_rmt(ip); +} diff --git a/kernel/fs/xfs/xfs_symlink.h b/kernel/fs/xfs/xfs_symlink.h new file mode 100644 index 000000000..e75245d09 --- /dev/null +++ b/kernel/fs/xfs/xfs_symlink.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2012 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYMLINK_H +#define __XFS_SYMLINK_H 1 + +/* Kernel only symlink defintions */ + +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_inactive_symlink(struct xfs_inode *ip); + +#endif /* __XFS_SYMLINK_H */ diff --git a/kernel/fs/xfs/xfs_sysctl.c b/kernel/fs/xfs/xfs_sysctl.c new file mode 100644 index 000000000..a0c8067ce --- /dev/null +++ b/kernel/fs/xfs/xfs_sysctl.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include +#include +#include "xfs_error.h" + +static struct ctl_table_header *xfs_table_header; + +#ifdef CONFIG_PROC_FS +STATIC int +xfs_stats_clear_proc_handler( + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int c, ret, *valp = ctl->data; + __uint32_t vn_active; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + + if (!ret && write && *valp) { + xfs_notice(NULL, "Clearing xfsstats"); + for_each_possible_cpu(c) { + preempt_disable(); + /* save vn_active, it's a universal truth! */ + vn_active = per_cpu(xfsstats, c).vn_active; + memset(&per_cpu(xfsstats, c), 0, + sizeof(struct xfsstats)); + per_cpu(xfsstats, c).vn_active = vn_active; + preempt_enable(); + } + xfs_stats_clear = 0; + } + + return ret; +} + +STATIC int +xfs_panic_mask_proc_handler( + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} +#endif /* CONFIG_PROC_FS */ + +static struct ctl_table xfs_table[] = { + { + .procname = "irix_sgid_inherit", + .data = &xfs_params.sgid_inherit.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.sgid_inherit.min, + .extra2 = &xfs_params.sgid_inherit.max + }, + { + .procname = "irix_symlink_mode", + .data = &xfs_params.symlink_mode.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.symlink_mode.min, + .extra2 = &xfs_params.symlink_mode.max + }, + { + .procname = "panic_mask", + .data = &xfs_params.panic_mask.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_panic_mask_proc_handler, + .extra1 = &xfs_params.panic_mask.min, + .extra2 = &xfs_params.panic_mask.max + }, + + { + .procname = "error_level", + .data = &xfs_params.error_level.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.error_level.min, + .extra2 = &xfs_params.error_level.max + }, + { + .procname = "xfssyncd_centisecs", + .data = &xfs_params.syncd_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.syncd_timer.min, + .extra2 = &xfs_params.syncd_timer.max + }, + { + .procname = "inherit_sync", + .data = &xfs_params.inherit_sync.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_sync.min, + .extra2 = &xfs_params.inherit_sync.max + }, + { + .procname = "inherit_nodump", + .data = &xfs_params.inherit_nodump.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodump.min, + .extra2 = &xfs_params.inherit_nodump.max + }, + { + .procname = "inherit_noatime", + .data = &xfs_params.inherit_noatim.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_noatim.min, + .extra2 = &xfs_params.inherit_noatim.max + }, + { + .procname = "inherit_nosymlinks", + .data = &xfs_params.inherit_nosym.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nosym.min, + .extra2 = &xfs_params.inherit_nosym.max + }, + { + .procname = "rotorstep", + .data = &xfs_params.rotorstep.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.rotorstep.min, + .extra2 = &xfs_params.rotorstep.max + }, + { + .procname = "inherit_nodefrag", + .data = &xfs_params.inherit_nodfrg.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodfrg.min, + .extra2 = &xfs_params.inherit_nodfrg.max + }, + { + .procname = "filestream_centisecs", + .data = &xfs_params.fstrm_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.fstrm_timer.min, + .extra2 = &xfs_params.fstrm_timer.max, + }, + { + .procname = "speculative_prealloc_lifetime", + .data = &xfs_params.eofb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.eofb_timer.min, + .extra2 = &xfs_params.eofb_timer.max, + }, + /* please keep this the last entry */ +#ifdef CONFIG_PROC_FS + { + .procname = "stats_clear", + .data = &xfs_params.stats_clear.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_stats_clear_proc_handler, + .extra1 = &xfs_params.stats_clear.min, + .extra2 = &xfs_params.stats_clear.max + }, +#endif /* CONFIG_PROC_FS */ + + {} +}; + +static struct ctl_table xfs_dir_table[] = { + { + .procname = "xfs", + .mode = 0555, + .child = xfs_table + }, + {} +}; + +static struct ctl_table xfs_root_table[] = { + { + .procname = "fs", + .mode = 0555, + .child = xfs_dir_table + }, + {} +}; + +int +xfs_sysctl_register(void) +{ + xfs_table_header = register_sysctl_table(xfs_root_table); + if (!xfs_table_header) + return -ENOMEM; + return 0; +} + +void +xfs_sysctl_unregister(void) +{ + unregister_sysctl_table(xfs_table_header); +} diff --git a/kernel/fs/xfs/xfs_sysctl.h b/kernel/fs/xfs/xfs_sysctl.h new file mode 100644 index 000000000..ffef45375 --- /dev/null +++ b/kernel/fs/xfs/xfs_sysctl.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYSCTL_H__ +#define __XFS_SYSCTL_H__ + +#include + +/* + * Tunable xfs parameters + */ + +typedef struct xfs_sysctl_val { + int min; + int val; + int max; +} xfs_sysctl_val_t; + +typedef struct xfs_param { + xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is + * not a member of parent dir GID. */ + xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ + xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ + xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ + xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ + xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */ + xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ + xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ + xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ + xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ + xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ + xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ + xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ + xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ + xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ + xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ +} xfs_param_t; + +/* + * xfs_error_level: + * + * How much error reporting will be done when internal problems are + * encountered. These problems normally return an EFSCORRUPTED to their + * caller, with no other information reported. + * + * 0 No error reports + * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown + * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any + * additional errors that are known to not cause shutdowns) + * + * xfs_panic_mask bit 0x8 turns the error reports into panics + */ + +enum { + /* XFS_REFCACHE_SIZE = 1 */ + /* XFS_REFCACHE_PURGE = 2 */ + /* XFS_RESTRICT_CHOWN = 3 */ + XFS_SGID_INHERIT = 4, + XFS_SYMLINK_MODE = 5, + XFS_PANIC_MASK = 6, + XFS_ERRLEVEL = 7, + XFS_SYNCD_TIMER = 8, + /* XFS_PROBE_DMAPI = 9 */ + /* XFS_PROBE_IOOPS = 10 */ + /* XFS_PROBE_QUOTA = 11 */ + XFS_STATS_CLEAR = 12, + XFS_INHERIT_SYNC = 13, + XFS_INHERIT_NODUMP = 14, + XFS_INHERIT_NOATIME = 15, + XFS_BUF_TIMER = 16, + XFS_BUF_AGE = 17, + /* XFS_IO_BYPASS = 18 */ + XFS_INHERIT_NOSYM = 19, + XFS_ROTORSTEP = 20, + XFS_INHERIT_NODFRG = 21, + XFS_FILESTREAM_TIMER = 22, +}; + +extern xfs_param_t xfs_params; + +struct xfs_globals { + int log_recovery_delay; /* log recovery delay (secs) */ +}; +extern struct xfs_globals xfs_globals; + +#ifdef CONFIG_SYSCTL +extern int xfs_sysctl_register(void); +extern void xfs_sysctl_unregister(void); +#else +# define xfs_sysctl_register() (0) +# define xfs_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +#endif /* __XFS_SYSCTL_H__ */ diff --git a/kernel/fs/xfs/xfs_sysfs.c b/kernel/fs/xfs/xfs_sysfs.c new file mode 100644 index 000000000..aa0367085 --- /dev/null +++ b/kernel/fs/xfs/xfs_sysfs.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_sysfs.h" +#include "xfs_log_format.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" + +struct xfs_sysfs_attr { + struct attribute attr; + ssize_t (*show)(char *buf, void *data); + ssize_t (*store)(const char *buf, size_t count, void *data); +}; + +static inline struct xfs_sysfs_attr * +to_attr(struct attribute *attr) +{ + return container_of(attr, struct xfs_sysfs_attr, attr); +} + +#define XFS_SYSFS_ATTR_RW(name) \ + static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name) +#define XFS_SYSFS_ATTR_RO(name) \ + static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name) + +#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr + +/* + * xfs_mount kobject. This currently has no attributes and thus no need for show + * and store helpers. The mp kobject serves as the per-mount parent object that + * is identified by the fsname under sysfs. + */ + +struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, +}; + +#ifdef DEBUG +/* debug */ + +STATIC ssize_t +log_recovery_delay_store( + const char *buf, + size_t count, + void *data) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 60) + return -EINVAL; + + xfs_globals.log_recovery_delay = val; + + return count; +} + +STATIC ssize_t +log_recovery_delay_show( + char *buf, + void *data) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay); +} +XFS_SYSFS_ATTR_RW(log_recovery_delay); + +static struct attribute *xfs_dbg_attrs[] = { + ATTR_LIST(log_recovery_delay), + NULL, +}; + +STATIC ssize_t +xfs_dbg_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0; +} + +STATIC ssize_t +xfs_dbg_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0; +} + +static struct sysfs_ops xfs_dbg_ops = { + .show = xfs_dbg_show, + .store = xfs_dbg_store, +}; + +struct kobj_type xfs_dbg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_dbg_ops, + .default_attrs = xfs_dbg_attrs, +}; + +#endif /* DEBUG */ + +/* xlog */ + +STATIC ssize_t +log_head_lsn_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int block; + + spin_lock(&log->l_icloglock); + cycle = log->l_curr_cycle; + block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); +} +XFS_SYSFS_ATTR_RO(log_head_lsn); + +STATIC ssize_t +log_tail_lsn_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int block; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); +} +XFS_SYSFS_ATTR_RO(log_tail_lsn); + +STATIC ssize_t +reserve_grant_head_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int bytes; + + xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); +} +XFS_SYSFS_ATTR_RO(reserve_grant_head); + +STATIC ssize_t +write_grant_head_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int bytes; + + xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); +} +XFS_SYSFS_ATTR_RO(write_grant_head); + +static struct attribute *xfs_log_attrs[] = { + ATTR_LIST(log_head_lsn), + ATTR_LIST(log_tail_lsn), + ATTR_LIST(reserve_grant_head), + ATTR_LIST(write_grant_head), + NULL, +}; + +static inline struct xlog * +to_xlog(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xlog, l_kobj); +} + +STATIC ssize_t +xfs_log_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xlog *log = to_xlog(kobject); + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(buf, log) : 0; +} + +STATIC ssize_t +xfs_log_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xlog *log = to_xlog(kobject); + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0; +} + +static struct sysfs_ops xfs_log_ops = { + .show = xfs_log_show, + .store = xfs_log_store, +}; + +struct kobj_type xfs_log_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_log_ops, + .default_attrs = xfs_log_attrs, +}; diff --git a/kernel/fs/xfs/xfs_sysfs.h b/kernel/fs/xfs/xfs_sysfs.h new file mode 100644 index 000000000..240eee35f --- /dev/null +++ b/kernel/fs/xfs/xfs_sysfs.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __XFS_SYSFS_H__ +#define __XFS_SYSFS_H__ + +extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ +extern struct kobj_type xfs_dbg_ktype; /* debug */ +extern struct kobj_type xfs_log_ktype; /* xlog */ + +static inline struct xfs_kobj * +to_kobj(struct kobject *kobject) +{ + return container_of(kobject, struct xfs_kobj, kobject); +} + +static inline void +xfs_sysfs_release(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + complete(&kobj->complete); +} + +static inline int +xfs_sysfs_init( + struct xfs_kobj *kobj, + struct kobj_type *ktype, + struct xfs_kobj *parent_kobj, + const char *name) +{ + init_completion(&kobj->complete); + return kobject_init_and_add(&kobj->kobject, ktype, + &parent_kobj->kobject, "%s", name); +} + +static inline void +xfs_sysfs_del( + struct xfs_kobj *kobj) +{ + kobject_del(&kobj->kobject); + kobject_put(&kobj->kobject); + wait_for_completion(&kobj->complete); +} + +#endif /* __XFS_SYSFS_H__ */ diff --git a/kernel/fs/xfs/xfs_trace.c b/kernel/fs/xfs/xfs_trace.c new file mode 100644 index 000000000..13a029806 --- /dev/null +++ b/kernel/fs/xfs/xfs_trace.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_da_btree.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_quota.h" +#include "xfs_iomap.h" +#include "xfs_aops.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_log_recover.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_filestream.h" + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "xfs_trace.h" diff --git a/kernel/fs/xfs/xfs_trace.h b/kernel/fs/xfs/xfs_trace.h new file mode 100644 index 000000000..615781bf4 --- /dev/null +++ b/kernel/fs/xfs/xfs_trace.h @@ -0,0 +1,2083 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs + +#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_H + +#include + +struct xfs_agf; +struct xfs_alloc_arg; +struct xfs_attr_list_context; +struct xfs_buf_log_item; +struct xfs_da_args; +struct xfs_da_node_entry; +struct xfs_dquot; +struct xfs_log_item; +struct xlog; +struct xlog_ticket; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; +struct xfs_bmbt_irec; + +DECLARE_EVENT_CLASS(xfs_attr_list_class, + TP_PROTO(struct xfs_attr_list_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + ) +) + +#define DEFINE_ATTR_LIST_EVENT(name) \ +DEFINE_EVENT(xfs_attr_list_class, name, \ + TP_PROTO(struct xfs_attr_list_context *ctx), \ + TP_ARGS(ctx)) +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); +DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); + +DECLARE_EVENT_CLASS(xfs_perag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, + unsigned long caller_ip), + TP_ARGS(mp, agno, refcount, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->refcount = refcount; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u refcount %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_PERAG_REF_EVENT(name) \ +DEFINE_EVENT(xfs_perag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, refcount, caller_ip)) +DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); + +DECLARE_EVENT_CLASS(xfs_ag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), + TP_ARGS(mp, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + ), + TP_printk("dev %d:%d agno %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno) +); +#define DEFINE_AG_EVENT(name) \ +DEFINE_EVENT(xfs_ag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \ + TP_ARGS(mp, agno)) + +DEFINE_AG_EVENT(xfs_read_agf); +DEFINE_AG_EVENT(xfs_alloc_read_agf); +DEFINE_AG_EVENT(xfs_read_agi); +DEFINE_AG_EVENT(xfs_ialloc_read_agi); + +TRACE_EVENT(xfs_attr_list_node_descend, + TP_PROTO(struct xfs_attr_list_context *ctx, + struct xfs_da_node_entry *btree), + TP_ARGS(ctx, btree), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + __field(u32, bt_hashval) + __field(u32, bt_before) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + __entry->bt_hashval = be32_to_cpu(btree->hashval); + __entry->bt_before = be32_to_cpu(btree->before); + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s " + "node hashval %u, node before %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __entry->bt_hashval, + __entry->bt_before) +); + +TRACE_EVENT(xfs_iext_insert, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, + struct xfs_bmbt_irec *r, int state, unsigned long caller_ip), + TP_ARGS(ip, idx, r, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r->br_startoff; + __entry->startblock = r->br_startblock; + __entry->blockcount = r->br_blockcount; + __entry->state = r->br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_bmap_class, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + unsigned long caller_ip), + TP_ARGS(ip, idx, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? + ip->i_afp : &ip->i_df; + struct xfs_bmbt_irec r; + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r.br_startoff; + __entry->startblock = r.br_startblock; + __entry->blockcount = r.br_blockcount; + __entry->state = r.br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +) + +#define DEFINE_BMAP_EVENT(name) \ +DEFINE_EVENT(xfs_bmap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ + unsigned long caller_ip), \ + TP_ARGS(ip, idx, state, caller_ip)) +DEFINE_BMAP_EVENT(xfs_iext_remove); +DEFINE_BMAP_EVENT(xfs_bmap_pre_update); +DEFINE_BMAP_EVENT(xfs_bmap_post_update); +DEFINE_BMAP_EVENT(xfs_extlist); + +DECLARE_EVENT_CLASS(xfs_buf_class, + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), + TP_ARGS(bp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(int, nblks) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->nblks = bp->b_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " + "lock %d flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->nblks, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_EVENT(name) \ +DEFINE_EVENT(xfs_buf_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ + TP_ARGS(bp, caller_ip)) +DEFINE_BUF_EVENT(xfs_buf_init); +DEFINE_BUF_EVENT(xfs_buf_free); +DEFINE_BUF_EVENT(xfs_buf_hold); +DEFINE_BUF_EVENT(xfs_buf_rele); +DEFINE_BUF_EVENT(xfs_buf_iodone); +DEFINE_BUF_EVENT(xfs_buf_submit); +DEFINE_BUF_EVENT(xfs_buf_submit_wait); +DEFINE_BUF_EVENT(xfs_buf_bawrite); +DEFINE_BUF_EVENT(xfs_buf_lock); +DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_trylock); +DEFINE_BUF_EVENT(xfs_buf_unlock); +DEFINE_BUF_EVENT(xfs_buf_iowait); +DEFINE_BUF_EVENT(xfs_buf_iowait_done); +DEFINE_BUF_EVENT(xfs_buf_delwri_queue); +DEFINE_BUF_EVENT(xfs_buf_delwri_queued); +DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_get_uncached); +DEFINE_BUF_EVENT(xfs_bdstrat_shut); +DEFINE_BUF_EVENT(xfs_buf_item_relse); +DEFINE_BUF_EVENT(xfs_buf_item_iodone); +DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); +DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); +DEFINE_BUF_EVENT(xfs_trans_read_buf_io); +DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); + +/* not really buffer traces, but the buf provides useful information */ +DEFINE_BUF_EVENT(xfs_btree_corrupt); +DEFINE_BUF_EVENT(xfs_da_btree_corrupt); +DEFINE_BUF_EVENT(xfs_reset_dqcounts); +DEFINE_BUF_EVENT(xfs_inode_item_push); + +/* pass flags explicitly */ +DECLARE_EVENT_CLASS(xfs_buf_flags_class, + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), + TP_ARGS(bp, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = BBTOB(bp->b_length); + __entry->flags = flags; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_FLAGS_EVENT(name) \ +DEFINE_EVENT(xfs_buf_flags_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ + TP_ARGS(bp, flags, caller_ip)) +DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); + +TRACE_EVENT(xfs_buf_ioerror, + TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip), + TP_ARGS(bp, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(unsigned, flags) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = BBTOB(bp->b_length); + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->error = error; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d error %d flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __entry->error, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_buf_item_class, + TP_PROTO(struct xfs_buf_log_item *bip), + TP_ARGS(bip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, buf_bno) + __field(size_t, buf_len) + __field(int, buf_hold) + __field(int, buf_pincount) + __field(int, buf_lockval) + __field(unsigned, buf_flags) + __field(unsigned, bli_recur) + __field(int, bli_refcount) + __field(unsigned, bli_flags) + __field(void *, li_desc) + __field(unsigned, li_flags) + ), + TP_fast_assign( + __entry->dev = bip->bli_buf->b_target->bt_dev; + __entry->bli_flags = bip->bli_flags; + __entry->bli_recur = bip->bli_recur; + __entry->bli_refcount = atomic_read(&bip->bli_refcount); + __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_len = BBTOB(bip->bli_buf->b_length); + __entry->buf_flags = bip->bli_buf->b_flags; + __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); + __entry->buf_lockval = bip->bli_buf->b_sema.count; + __entry->li_desc = bip->bli_item.li_desc; + __entry->li_flags = bip->bli_item.li_flags; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s recur %d refcount %d bliflags %s " + "lidesc 0x%p liflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->buf_bno, + __entry->buf_len, + __entry->buf_hold, + __entry->buf_pincount, + __entry->buf_lockval, + __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), + __entry->bli_recur, + __entry->bli_refcount, + __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), + __entry->li_desc, + __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_BUF_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_buf_item_class, name, \ + TP_PROTO(struct xfs_buf_log_item *bip), \ + TP_ARGS(bip)) +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); +DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); +DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); + +DECLARE_EVENT_CLASS(xfs_filestream_class, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), + TP_ARGS(ip, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams) +) +#define DEFINE_FILESTREAM_EVENT(name) \ +DEFINE_EVENT(xfs_filestream_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ + TP_ARGS(ip, agno)) +DEFINE_FILESTREAM_EVENT(xfs_filestream_free); +DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); +DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); + +TRACE_EVENT(xfs_filestream_pick, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, + xfs_extlen_t free, int nscan), + TP_ARGS(ip, agno, free, nscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + __field(xfs_extlen_t, free) + __field(int, nscan) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->free = free; + __entry->nscan = nscan; + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams, + __entry->free, + __entry->nscan) +); + +DECLARE_EVENT_CLASS(xfs_lock_class, + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, + unsigned long caller_ip), + TP_ARGS(ip, lock_flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, lock_flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lock_flags = lock_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_LOCK_EVENT(name) \ +DEFINE_EVENT(xfs_lock_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ + unsigned long caller_ip), \ + TP_ARGS(ip, lock_flags, caller_ip)) +DEFINE_LOCK_EVENT(xfs_ilock); +DEFINE_LOCK_EVENT(xfs_ilock_nowait); +DEFINE_LOCK_EVENT(xfs_ilock_demote); +DEFINE_LOCK_EVENT(xfs_iunlock); + +DECLARE_EVENT_CLASS(xfs_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +) + +#define DEFINE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_INODE_EVENT(xfs_iget_skip); +DEFINE_INODE_EVENT(xfs_iget_reclaim); +DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_hit); +DEFINE_INODE_EVENT(xfs_iget_miss); + +DEFINE_INODE_EVENT(xfs_getattr); +DEFINE_INODE_EVENT(xfs_setattr); +DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_inactive_symlink); +DEFINE_INODE_EVENT(xfs_alloc_file_space); +DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); +DEFINE_INODE_EVENT(xfs_collapse_file_space); +DEFINE_INODE_EVENT(xfs_insert_file_space); +DEFINE_INODE_EVENT(xfs_readdir); +#ifdef CONFIG_XFS_POSIX_ACL +DEFINE_INODE_EVENT(xfs_get_acl); +#endif +DEFINE_INODE_EVENT(xfs_vm_bmap); +DEFINE_INODE_EVENT(xfs_file_ioctl); +DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_dir_fsync); +DEFINE_INODE_EVENT(xfs_file_fsync); +DEFINE_INODE_EVENT(xfs_destroy_inode); +DEFINE_INODE_EVENT(xfs_evict_inode); +DEFINE_INODE_EVENT(xfs_update_time); + +DEFINE_INODE_EVENT(xfs_dquot_dqalloc); +DEFINE_INODE_EVENT(xfs_dquot_dqdetach); + +DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); + +DEFINE_INODE_EVENT(xfs_filemap_fault); +DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); + +DECLARE_EVENT_CLASS(xfs_iref_class, + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), + TP_ARGS(ip, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, count) + __field(int, pincount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->count, + __entry->pincount, + (char *)__entry->caller_ip) +) + +TRACE_EVENT(xfs_iomap_prealloc_size, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift, + unsigned int writeio_blocks), + TP_ARGS(ip, blocks, shift, writeio_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, blocks) + __field(int, shift) + __field(unsigned int, writeio_blocks) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->blocks = blocks; + __entry->shift = shift; + __entry->writeio_blocks = writeio_blocks; + ), + TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " + "m_writeio_blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, + __entry->blocks, __entry->shift, __entry->writeio_blocks) +) + +#define DEFINE_IREF_EVENT(name) \ +DEFINE_EVENT(xfs_iref_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ + TP_ARGS(ip, caller_ip)) +DEFINE_IREF_EVENT(xfs_ihold); +DEFINE_IREF_EVENT(xfs_irele); +DEFINE_IREF_EVENT(xfs_inode_pin); +DEFINE_IREF_EVENT(xfs_inode_unpin); +DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); + +DECLARE_EVENT_CLASS(xfs_namespace_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_ARGS(dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dp ino 0x%llx name %.*s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __entry->namelen, + __get_str(name)) +) + +#define DEFINE_NAMESPACE_EVENT(name) \ +DEFINE_EVENT(xfs_namespace_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_ARGS(dp, name)) +DEFINE_NAMESPACE_EVENT(xfs_remove); +DEFINE_NAMESPACE_EVENT(xfs_link); +DEFINE_NAMESPACE_EVENT(xfs_lookup); +DEFINE_NAMESPACE_EVENT(xfs_create); +DEFINE_NAMESPACE_EVENT(xfs_symlink); + +TRACE_EVENT(xfs_rename, + TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp, + struct xfs_name *src_name, struct xfs_name *target_name), + TP_ARGS(src_dp, target_dp, src_name, target_name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_dp_ino) + __field(xfs_ino_t, target_dp_ino) + __field(int, src_namelen) + __field(int, target_namelen) + __dynamic_array(char, src_name, src_name->len) + __dynamic_array(char, target_name, target_name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(src_dp)->i_sb->s_dev; + __entry->src_dp_ino = src_dp->i_ino; + __entry->target_dp_ino = target_dp->i_ino; + __entry->src_namelen = src_name->len; + __entry->target_namelen = target_name->len; + memcpy(__get_str(src_name), src_name->name, src_name->len); + memcpy(__get_str(target_name), target_name->name, + target_name->len); + ), + TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" + " src name %.*s target name %.*s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_dp_ino, + __entry->target_dp_ino, + __entry->src_namelen, + __get_str(src_name), + __entry->target_namelen, + __get_str(target_name)) +) + +DECLARE_EVENT_CLASS(xfs_dquot_class, + TP_PROTO(struct xfs_dquot *dqp), + TP_ARGS(dqp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, id) + __field(unsigned, flags) + __field(unsigned, nrefs) + __field(unsigned long long, res_bcount) + __field(unsigned long long, bcount) + __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) + __field(unsigned long long, blk_softlimit) + __field(unsigned long long, ino_hardlimit) + __field(unsigned long long, ino_softlimit) + ), \ + TP_fast_assign( + __entry->dev = dqp->q_mount->m_super->s_dev; + __entry->id = be32_to_cpu(dqp->q_core.d_id); + __entry->flags = dqp->dq_flags; + __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_res_bcount; + __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); + __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->blk_hardlimit = + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + __entry->blk_softlimit = + be64_to_cpu(dqp->q_core.d_blk_softlimit); + __entry->ino_hardlimit = + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + __entry->ino_softlimit = + be64_to_cpu(dqp->q_core.d_ino_softlimit); + ), + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->id, + __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __entry->nrefs, + __entry->res_bcount, + __entry->bcount, + __entry->blk_hardlimit, + __entry->blk_softlimit, + __entry->icount, + __entry->ino_hardlimit, + __entry->ino_softlimit) +) + +#define DEFINE_DQUOT_EVENT(name) \ +DEFINE_EVENT(xfs_dquot_class, name, \ + TP_PROTO(struct xfs_dquot *dqp), \ + TP_ARGS(dqp)) +DEFINE_DQUOT_EVENT(xfs_dqadjust); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); +DEFINE_DQUOT_EVENT(xfs_dqattach_found); +DEFINE_DQUOT_EVENT(xfs_dqattach_get); +DEFINE_DQUOT_EVENT(xfs_dqalloc); +DEFINE_DQUOT_EVENT(xfs_dqtobp_read); +DEFINE_DQUOT_EVENT(xfs_dqread); +DEFINE_DQUOT_EVENT(xfs_dqread_fail); +DEFINE_DQUOT_EVENT(xfs_dqget_hit); +DEFINE_DQUOT_EVENT(xfs_dqget_miss); +DEFINE_DQUOT_EVENT(xfs_dqget_freeing); +DEFINE_DQUOT_EVENT(xfs_dqget_dup); +DEFINE_DQUOT_EVENT(xfs_dqput); +DEFINE_DQUOT_EVENT(xfs_dqput_wait); +DEFINE_DQUOT_EVENT(xfs_dqput_free); +DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqflush); +DEFINE_DQUOT_EVENT(xfs_dqflush_force); +DEFINE_DQUOT_EVENT(xfs_dqflush_done); + +DECLARE_EVENT_CLASS(xfs_loggrant_class, + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), + TP_ARGS(log, tic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned, trans_type) + __field(char, ocnt) + __field(char, cnt) + __field(int, curr_res) + __field(int, unit_res) + __field(unsigned int, flags) + __field(int, reserveq) + __field(int, writeq) + __field(int, grant_reserve_cycle) + __field(int, grant_reserve_bytes) + __field(int, grant_write_cycle) + __field(int, grant_write_bytes) + __field(int, curr_cycle) + __field(int, curr_block) + __field(xfs_lsn_t, tail_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->trans_type = tic->t_trans_type; + __entry->ocnt = tic->t_ocnt; + __entry->cnt = tic->t_cnt; + __entry->curr_res = tic->t_curr_res; + __entry->unit_res = tic->t_unit_res; + __entry->flags = tic->t_flags; + __entry->reserveq = list_empty(&log->l_reserve_head.waiters); + __entry->writeq = list_empty(&log->l_write_head.waiters); + xlog_crack_grant_head(&log->l_reserve_head.grant, + &__entry->grant_reserve_cycle, + &__entry->grant_reserve_bytes); + xlog_crack_grant_head(&log->l_write_head.grant, + &__entry->grant_write_cycle, + &__entry->grant_write_bytes); + __entry->curr_cycle = log->l_curr_cycle; + __entry->curr_block = log->l_curr_block; + __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); + ), + TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserveq %s " + "writeq %s grant_reserve_cycle %d " + "grant_reserve_bytes %d grant_write_cycle %d " + "grant_write_bytes %d curr_cycle %d curr_block %d " + "tail_cycle %d tail_block %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), + __entry->ocnt, + __entry->cnt, + __entry->curr_res, + __entry->unit_res, + __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), + __entry->reserveq ? "empty" : "active", + __entry->writeq ? "empty" : "active", + __entry->grant_reserve_cycle, + __entry->grant_reserve_bytes, + __entry->grant_write_cycle, + __entry->grant_write_bytes, + __entry->curr_cycle, + __entry->curr_block, + CYCLE_LSN(__entry->tail_lsn), + BLOCK_LSN(__entry->tail_lsn) + ) +) + +#define DEFINE_LOGGRANT_EVENT(name) \ +DEFINE_EVENT(xfs_loggrant_class, name, \ + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ + TP_ARGS(log, tic)) +DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); +DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); +DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); + +DECLARE_EVENT_CLASS(xfs_log_item_class, + TP_PROTO(struct xfs_log_item *lip), + TP_ARGS(lip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->lsn = lip->li_lsn; + ), + TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +TRACE_EVENT(xfs_log_force, + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), + TP_ARGS(mp, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn) +) + +#define DEFINE_LOG_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_log_item_class, name, \ + TP_PROTO(struct xfs_log_item *lip), \ + TP_ARGS(lip)) +DEFINE_LOG_ITEM_EVENT(xfs_ail_push); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); +DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); + +DECLARE_EVENT_CLASS(xfs_ail_class, + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), + TP_ARGS(lip, old_lsn, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, new_lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->old_lsn = old_lsn; + __entry->new_lsn = new_lsn; + ), + TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_AIL_EVENT(name) \ +DEFINE_EVENT(xfs_ail_class, name, \ + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \ + TP_ARGS(lip, old_lsn, new_lsn)) +DEFINE_AIL_EVENT(xfs_ail_insert); +DEFINE_AIL_EVENT(xfs_ail_move); +DEFINE_AIL_EVENT(xfs_ail_delete); + +TRACE_EVENT(xfs_log_assign_tail_lsn, + TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn), + TP_ARGS(log, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, new_lsn) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, last_sync_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->new_lsn = new_lsn; + __entry->old_lsn = atomic64_read(&log->l_tail_lsn); + __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + ), + TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn)) +) + +DECLARE_EVENT_CLASS(xfs_file_class, + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), + TP_ARGS(ip, count, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx " + "offset 0x%llx count 0x%zx ioflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) +) + +#define DEFINE_RW_EVENT(name) \ +DEFINE_EVENT(xfs_file_class, name, \ + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ + TP_ARGS(ip, count, offset, flags)) +DEFINE_RW_EVENT(xfs_file_read); +DEFINE_RW_EVENT(xfs_file_buffered_write); +DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_splice_read); + +DECLARE_EVENT_CLASS(xfs_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, + unsigned int len), + TP_ARGS(inode, page, off, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(unsigned int, length) + __field(int, delalloc) + __field(int, unwritten) + ), + TP_fast_assign( + int delalloc = -1, unwritten = -1; + + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, &unwritten); + __entry->dev = inode->i_sb->s_dev; + __entry->ino = XFS_I(inode)->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->length = len; + __entry->delalloc = delalloc; + __entry->unwritten = unwritten; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "length %x delalloc %d unwritten %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->length, + __entry->delalloc, + __entry->unwritten) +) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(xfs_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ + unsigned int len), \ + TP_ARGS(inode, page, off, len)) +DEFINE_PAGE_EVENT(xfs_writepage); +DEFINE_PAGE_EVENT(xfs_releasepage); +DEFINE_PAGE_EVENT(xfs_invalidatepage); + +DECLARE_EVENT_CLASS(xfs_imap_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int type, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, type, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, type) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + __entry->type = type; + __entry->startoff = irec ? irec->br_startoff : 0; + __entry->startblock = irec ? irec->br_startblock : 0; + __entry->blockcount = irec ? irec->br_blockcount : 0; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " + "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->offset, + __entry->count, + __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount) +) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(xfs_imap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ + int type, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, type, irec)) +DEFINE_IOMAP_EVENT(xfs_map_blocks_found); +DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_get_blocks_found); +DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); + +DECLARE_EVENT_CLASS(xfs_simple_io_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, isize) + __field(loff_t, disize) + __field(loff_t, offset) + __field(size_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->isize = VFS_I(ip)->i_size; + __entry->disize = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " + "offset 0x%llx count %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->disize, + __entry->offset, + __entry->count) +); + +#define DEFINE_SIMPLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_simple_io_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_ARGS(ip, offset, count)) +DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); +DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); +DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); +DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); + +DECLARE_EVENT_CLASS(xfs_itrunc_class, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), + TP_ARGS(ip, new_size), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size) +) + +#define DEFINE_ITRUNC_EVENT(name) \ +DEFINE_EVENT(xfs_itrunc_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ + TP_ARGS(ip, new_size)) +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); + +TRACE_EVENT(xfs_pagecache_inval, + TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), + TP_ARGS(ip, start, finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_off_t, start) + __field(xfs_off_t, finish) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->start = start; + __entry->finish = finish; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->start, + __entry->finish) +); + +TRACE_EVENT(xfs_bunmap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, + int flags, unsigned long caller_ip), + TP_ARGS(ip, bno, len, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fileoff_t, bno) + __field(xfs_filblks_t, len) + __field(unsigned long, caller_ip) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->bno = bno; + __entry->len = len; + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" + "flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->bno, + __entry->len, + __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), + (void *)__entry->caller_ip) + +); + +DECLARE_EVENT_CLASS(xfs_extent_busy_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_BUSY_EVENT(name) \ +DEFINE_EVENT(xfs_extent_busy_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_BUSY_EVENT(xfs_extent_busy); +DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); +DEFINE_BUSY_EVENT(xfs_extent_busy_force); +DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); +DEFINE_BUSY_EVENT(xfs_extent_busy_clear); + +TRACE_EVENT(xfs_extent_busy_trim, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(mp, agno, agbno, len, tbno, tlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_agblock_t, tbno) + __field(xfs_extlen_t, tlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->tbno = tbno; + __entry->tlen = tlen; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->tbno, + __entry->tlen) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->lsn) +); + +TRACE_EVENT(xfs_agf, + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, + unsigned long caller_ip), + TP_ARGS(mp, agf, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, flags) + __field(__u32, length) + __field(__u32, bno_root) + __field(__u32, cnt_root) + __field(__u32, bno_level) + __field(__u32, cnt_level) + __field(__u32, flfirst) + __field(__u32, fllast) + __field(__u32, flcount) + __field(__u32, freeblks) + __field(__u32, longest) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = be32_to_cpu(agf->agf_seqno), + __entry->flags = flags; + __entry->length = be32_to_cpu(agf->agf_length), + __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), + __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), + __entry->bno_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), + __entry->cnt_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), + __entry->flfirst = be32_to_cpu(agf->agf_flfirst), + __entry->fllast = be32_to_cpu(agf->agf_fllast), + __entry->flcount = be32_to_cpu(agf->agf_flcount), + __entry->freeblks = be32_to_cpu(agf->agf_freeblks), + __entry->longest = be32_to_cpu(agf->agf_longest); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " + "levels b %u c %u flfirst %u fllast %u flcount %u " + "freeblks %u longest %u caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), + __entry->length, + __entry->bno_root, + __entry->cnt_root, + __entry->bno_level, + __entry->cnt_level, + __entry->flfirst, + __entry->fllast, + __entry->flcount, + __entry->freeblks, + __entry->longest, + (void *)__entry->caller_ip) +); + +TRACE_EVENT(xfs_free_extent, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_extlen_t len, bool isfl, int haveleft, int haveright), + TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int, isfl) + __field(int, haveleft) + __field(int, haveright) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->isfl = isfl; + __entry->haveleft = haveleft; + __entry->haveright = haveright; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->isfl, + __entry->haveleft ? + (__entry->haveright ? "both" : "left") : + (__entry->haveright ? "right" : "none")) + +); + +DECLARE_EVENT_CLASS(xfs_alloc_class, + TP_PROTO(struct xfs_alloc_arg *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, minlen) + __field(xfs_extlen_t, maxlen) + __field(xfs_extlen_t, mod) + __field(xfs_extlen_t, prod) + __field(xfs_extlen_t, minleft) + __field(xfs_extlen_t, total) + __field(xfs_extlen_t, alignment) + __field(xfs_extlen_t, minalignslop) + __field(xfs_extlen_t, len) + __field(short, type) + __field(short, otype) + __field(char, wasdel) + __field(char, wasfromfl) + __field(char, isfl) + __field(char, userdata) + __field(xfs_fsblock_t, firstblock) + ), + TP_fast_assign( + __entry->dev = args->mp->m_super->s_dev; + __entry->agno = args->agno; + __entry->agbno = args->agbno; + __entry->minlen = args->minlen; + __entry->maxlen = args->maxlen; + __entry->mod = args->mod; + __entry->prod = args->prod; + __entry->minleft = args->minleft; + __entry->total = args->total; + __entry->alignment = args->alignment; + __entry->minalignslop = args->minalignslop; + __entry->len = args->len; + __entry->type = args->type; + __entry->otype = args->otype; + __entry->wasdel = args->wasdel; + __entry->wasfromfl = args->wasfromfl; + __entry->isfl = args->isfl; + __entry->userdata = args->userdata; + __entry->firstblock = args->firstblock; + ), + TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + "prod %u minleft %u total %u alignment %u minalignslop %u " + "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "userdata %d firstblock 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->minlen, + __entry->maxlen, + __entry->mod, + __entry->prod, + __entry->minleft, + __entry->total, + __entry->alignment, + __entry->minalignslop, + __entry->len, + __print_symbolic(__entry->type, XFS_ALLOC_TYPES), + __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), + __entry->wasdel, + __entry->wasfromfl, + __entry->isfl, + __entry->userdata, + (unsigned long long)__entry->firstblock) +) + +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_alloc_class, name, \ + TP_PROTO(struct xfs_alloc_arg *args), \ + TP_ARGS(args)) +DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_near_first); +DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); +DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); +DEFINE_ALLOC_EVENT(xfs_alloc_near_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); +DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_size_done); +DEFINE_ALLOC_EVENT(xfs_alloc_size_error); +DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); +DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); +DEFINE_ALLOC_EVENT(xfs_alloc_small_done); +DEFINE_ALLOC_EVENT(xfs_alloc_small_error); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); + +DECLARE_EVENT_CLASS(xfs_da_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(xfs_dahash_t, hashval) + __field(xfs_ino_t, inumber) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->hashval = args->hashval; + __entry->inumber = args->inumber; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " + "inumber 0x%llx op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->hashval, + __entry->inumber, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_DIR2_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_sf_create); +DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); +DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_block_addname); +DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_block_replace); +DEFINE_DIR2_EVENT(xfs_dir2_block_removename); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); +DEFINE_DIR2_EVENT(xfs_dir2_node_addname); +DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_node_replace); +DEFINE_DIR2_EVENT(xfs_dir2_node_removename); +DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); + +DECLARE_EVENT_CLASS(xfs_attr_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(int, valuelen) + __field(xfs_dahash_t, hashval) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->valuelen = args->valuelen; + __entry->hashval = args->hashval; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " + "hashval 0x%x op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->valuelen, + __entry->hashval, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_ATTR_EVENT(name) \ +DEFINE_EVENT(xfs_attr_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_ATTR_EVENT(xfs_attr_sf_add); +DEFINE_ATTR_EVENT(xfs_attr_sf_addname); +DEFINE_ATTR_EVENT(xfs_attr_sf_create); +DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_sf_remove); +DEFINE_ATTR_EVENT(xfs_attr_sf_removename); +DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); + +DEFINE_ATTR_EVENT(xfs_attr_leaf_add); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); +DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); +DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); +DEFINE_ATTR_EVENT(xfs_attr_leaf_get); +DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); +DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after); +DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); +DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); + +DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_get); +DEFINE_ATTR_EVENT(xfs_attr_node_lookup); +DEFINE_ATTR_EVENT(xfs_attr_node_replace); +DEFINE_ATTR_EVENT(xfs_attr_node_removename); + +DEFINE_ATTR_EVENT(xfs_attr_fillstate); +DEFINE_ATTR_EVENT(xfs_attr_refillstate); + +DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); + +#define DEFINE_DA_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DA_EVENT(xfs_da_split); +DEFINE_DA_EVENT(xfs_da_join); +DEFINE_DA_EVENT(xfs_da_link_before); +DEFINE_DA_EVENT(xfs_da_link_after); +DEFINE_DA_EVENT(xfs_da_unlink_back); +DEFINE_DA_EVENT(xfs_da_unlink_forward); +DEFINE_DA_EVENT(xfs_da_root_split); +DEFINE_DA_EVENT(xfs_da_root_join); +DEFINE_DA_EVENT(xfs_da_node_add); +DEFINE_DA_EVENT(xfs_da_node_create); +DEFINE_DA_EVENT(xfs_da_node_split); +DEFINE_DA_EVENT(xfs_da_node_remove); +DEFINE_DA_EVENT(xfs_da_node_rebalance); +DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_node_toosmall); +DEFINE_DA_EVENT(xfs_da_swap_lastblock); +DEFINE_DA_EVENT(xfs_da_grow_inode); +DEFINE_DA_EVENT(xfs_da_shrink_inode); +DEFINE_DA_EVENT(xfs_da_fixhashpath); +DEFINE_DA_EVENT(xfs_da_path_shift); + +DECLARE_EVENT_CLASS(xfs_dir2_space_class, + TP_PROTO(struct xfs_da_args *args, int idx), + TP_ARGS(args, idx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, idx) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->idx = idx; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->idx) +) + +#define DEFINE_DIR2_SPACE_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_space_class, name, \ + TP_PROTO(struct xfs_da_args *args, int idx), \ + TP_ARGS(args, idx)) +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); + +TRACE_EVENT(xfs_dir2_leafn_moveents, + TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), + TP_ARGS(args, src_idx, dst_idx, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, src_idx) + __field(int, dst_idx) + __field(int, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->src_idx = src_idx; + __entry->dst_idx = dst_idx; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s " + "src_idx %d dst_idx %d count %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->src_idx, + __entry->dst_idx, + __entry->count) +); + +#define XFS_SWAPEXT_INODES \ + { 0, "target" }, \ + { 1, "temp" } + +#define XFS_INODE_FORMAT_STR \ + { 0, "invalid" }, \ + { 1, "local" }, \ + { 2, "extent" }, \ + { 3, "btree" } + +DECLARE_EVENT_CLASS(xfs_swap_extent_class, + TP_PROTO(struct xfs_inode *ip, int which), + TP_ARGS(ip, which), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, which) + __field(xfs_ino_t, ino) + __field(int, format) + __field(int, nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->which = which; + __entry->ino = ip->i_ino; + __entry->format = ip->i_d.di_format; + __entry->nex = ip->i_d.di_nextents; + __entry->broot_size = ip->i_df.if_broot_bytes; + __entry->fork_off = XFS_IFORK_BOFF(ip); + ), + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + "broot size %d, fork offset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->broot_size, + __entry->fork_off) +) + +#define DEFINE_SWAPEXT_EVENT(name) \ +DEFINE_EVENT(xfs_swap_extent_class, name, \ + TP_PROTO(struct xfs_inode *ip, int which), \ + TP_ARGS(ip, which)) + +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct xlog *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct xlog *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + __entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + +#endif /* _TRACE_XFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE xfs_trace +#include diff --git a/kernel/fs/xfs/xfs_trans.c b/kernel/fs/xfs/xfs_trans.c new file mode 100644 index 000000000..220ef2c90 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans.c @@ -0,0 +1,1105 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_extent_busy.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_trace.h" +#include "xfs_error.h" + +kmem_zone_t *xfs_trans_zone; +kmem_zone_t *xfs_log_item_desc_zone; + +/* + * Initialize the precomputed transaction reservation values + * in the mount structure. + */ +void +xfs_trans_init( + struct xfs_mount *mp) +{ + xfs_trans_resv_calc(mp, M_RES(mp)); +} + +/* + * This routine is called to allocate a transaction structure. + * The type parameter indicates the type of the transaction. These + * are enumerated in xfs_trans.h. + * + * Dynamically allocate the transaction structure from the transaction + * zone, initialize it, and return it to the caller. + */ +xfs_trans_t * +xfs_trans_alloc( + xfs_mount_t *mp, + uint type) +{ + xfs_trans_t *tp; + + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, type, KM_SLEEP); + tp->t_flags |= XFS_TRANS_FREEZE_PROT; + return tp; +} + +xfs_trans_t * +_xfs_trans_alloc( + xfs_mount_t *mp, + uint type, + xfs_km_flags_t memflags) +{ + xfs_trans_t *tp; + + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + atomic_inc(&mp->m_active_trans); + + tp = kmem_zone_zalloc(xfs_trans_zone, memflags); + tp->t_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_type = type; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + return tp; +} + +/* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. + */ +STATIC void +xfs_trans_free( + struct xfs_trans *tp) +{ + xfs_extent_busy_sort(&tp->t_busy); + xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); + + atomic_dec(&tp->t_mountp->m_active_trans); + if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + sb_end_intwrite(tp->t_mountp->m_super); + xfs_trans_free_dqinfo(tp); + kmem_zone_free(xfs_trans_zone, tp); +} + +/* + * This is called to create a new transaction which will share the + * permanent log reservation of the given transaction. The remaining + * unused block and rt extent reservations are also inherited. This + * implies that the original transaction is no longer allowed to allocate + * blocks. Locks and log items, however, are no inherited. They must + * be added to the new transaction explicitly. + */ +xfs_trans_t * +xfs_trans_dup( + xfs_trans_t *tp) +{ + xfs_trans_t *ntp; + + ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + + /* + * Initialize the new transaction structure. + */ + ntp->t_magic = XFS_TRANS_HEADER_MAGIC; + ntp->t_type = tp->t_type; + ntp->t_mountp = tp->t_mountp; + INIT_LIST_HEAD(&ntp->t_items); + INIT_LIST_HEAD(&ntp->t_busy); + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(tp->t_ticket != NULL); + + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | + (tp->t_flags & XFS_TRANS_RESERVE) | + (tp->t_flags & XFS_TRANS_FREEZE_PROT); + /* We gave our writer reference to the new transaction */ + tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; + ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); + ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; + tp->t_blk_res = tp->t_blk_res_used; + ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; + tp->t_rtx_res = tp->t_rtx_res_used; + ntp->t_pflags = tp->t_pflags; + + xfs_trans_dup_dqinfo(tp, ntp); + + atomic_inc(&tp->t_mountp->m_active_trans); + return ntp; +} + +/* + * This is called to reserve free disk blocks and log space for the + * given transaction. This must be done before allocating any resources + * within the transaction. + * + * This will return ENOSPC if there are not enough blocks available. + * It will sleep waiting for available log space. + * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which + * is used by long running transactions. If any one of the reservations + * fails then they will all be backed out. + * + * This does not do quota reservations. That typically is done by the + * caller afterwards. + */ +int +xfs_trans_reserve( + struct xfs_trans *tp, + struct xfs_trans_res *resp, + uint blocks, + uint rtextents) +{ + int error = 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + + /* Mark this thread as being in a transaction */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* + * Attempt to reserve the needed disk blocks by decrementing + * the number needed from the number available. This will + * fail if the count would go below zero. + */ + if (blocks > 0) { + error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + if (error != 0) { + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + return -ENOSPC; + } + tp->t_blk_res += blocks; + } + + /* + * Reserve the log space needed for this transaction. + */ + if (resp->tr_logres > 0) { + bool permanent = false; + + ASSERT(tp->t_log_res == 0 || + tp->t_log_res == resp->tr_logres); + ASSERT(tp->t_log_count == 0 || + tp->t_log_count == resp->tr_logcount); + + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { + tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + permanent = true; + } else { + ASSERT(tp->t_ticket == NULL); + ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + } + + if (tp->t_ticket != NULL) { + ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); + error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + } else { + error = xfs_log_reserve(tp->t_mountp, + resp->tr_logres, + resp->tr_logcount, + &tp->t_ticket, XFS_TRANSACTION, + permanent, tp->t_type); + } + + if (error) + goto undo_blocks; + + tp->t_log_res = resp->tr_logres; + tp->t_log_count = resp->tr_logcount; + } + + /* + * Attempt to reserve the needed realtime extents by decrementing + * the number needed from the number available. This will + * fail if the count would go below zero. + */ + if (rtextents > 0) { + error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); + if (error) { + error = -ENOSPC; + goto undo_log; + } + tp->t_rtx_res += rtextents; + } + + return 0; + + /* + * Error cases jump to one of these labels to undo any + * reservations which have already been performed. + */ +undo_log: + if (resp->tr_logres > 0) { + int log_flags; + + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + tp->t_ticket = NULL; + tp->t_log_res = 0; + tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; + } + +undo_blocks: + if (blocks > 0) { + xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + tp->t_blk_res = 0; + } + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + return error; +} + +/* + * Record the indicated change to the given field for application + * to the file system's superblock when the transaction commits. + * For now, just store the change in the transaction structure. + * + * Mark the transaction structure to indicate that the superblock + * needs to be updated before committing. + * + * Because we may not be keeping track of allocated/free inodes and + * used filesystem blocks in the superblock, we do not mark the + * superblock dirty in this transaction if we modify these fields. + * We still need to update the transaction deltas so that they get + * applied to the incore superblock, but we don't want them to + * cause the superblock to get locked and logged if these are the + * only fields in the superblock that the transaction modifies. + */ +void +xfs_trans_mod_sb( + xfs_trans_t *tp, + uint field, + int64_t delta) +{ + uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY); + xfs_mount_t *mp = tp->t_mountp; + + switch (field) { + case XFS_TRANS_SB_ICOUNT: + tp->t_icount_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_IFREE: + tp->t_ifree_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_FDBLOCKS: + /* + * Track the number of blocks allocated in the + * transaction. Make sure it does not exceed the + * number reserved. + */ + if (delta < 0) { + tp->t_blk_res_used += (uint)-delta; + ASSERT(tp->t_blk_res_used <= tp->t_blk_res); + } + tp->t_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_RES_FDBLOCKS: + /* + * The allocation has already been applied to the + * in-core superblock's counter. This should only + * be applied to the on-disk superblock. + */ + ASSERT(delta < 0); + tp->t_res_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_FREXTENTS: + /* + * Track the number of blocks allocated in the + * transaction. Make sure it does not exceed the + * number reserved. + */ + if (delta < 0) { + tp->t_rtx_res_used += (uint)-delta; + ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res); + } + tp->t_frextents_delta += delta; + break; + case XFS_TRANS_SB_RES_FREXTENTS: + /* + * The allocation has already been applied to the + * in-core superblock's counter. This should only + * be applied to the on-disk superblock. + */ + ASSERT(delta < 0); + tp->t_res_frextents_delta += delta; + break; + case XFS_TRANS_SB_DBLOCKS: + ASSERT(delta > 0); + tp->t_dblocks_delta += delta; + break; + case XFS_TRANS_SB_AGCOUNT: + ASSERT(delta > 0); + tp->t_agcount_delta += delta; + break; + case XFS_TRANS_SB_IMAXPCT: + tp->t_imaxpct_delta += delta; + break; + case XFS_TRANS_SB_REXTSIZE: + tp->t_rextsize_delta += delta; + break; + case XFS_TRANS_SB_RBMBLOCKS: + tp->t_rbmblocks_delta += delta; + break; + case XFS_TRANS_SB_RBLOCKS: + tp->t_rblocks_delta += delta; + break; + case XFS_TRANS_SB_REXTENTS: + tp->t_rextents_delta += delta; + break; + case XFS_TRANS_SB_REXTSLOG: + tp->t_rextslog_delta += delta; + break; + default: + ASSERT(0); + return; + } + + tp->t_flags |= flags; +} + +/* + * xfs_trans_apply_sb_deltas() is called from the commit code + * to bring the superblock buffer into the current transaction + * and modify it as requested by earlier calls to xfs_trans_mod_sb(). + * + * For now we just look at each field allowed to change and change + * it if necessary. + */ +STATIC void +xfs_trans_apply_sb_deltas( + xfs_trans_t *tp) +{ + xfs_dsb_t *sbp; + xfs_buf_t *bp; + int whole = 0; + + bp = xfs_trans_getsb(tp, tp->t_mountp, 0); + sbp = XFS_BUF_TO_SBP(bp); + + /* + * Check that superblock mods match the mods made to AGF counters. + */ + ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == + (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + + tp->t_ag_btree_delta)); + + /* + * Only update the superblock counters if we are logging them + */ + if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) { + if (tp->t_icount_delta) + be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); + if (tp->t_ifree_delta) + be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta); + if (tp->t_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta); + if (tp->t_res_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); + } + + if (tp->t_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); + if (tp->t_res_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); + + if (tp->t_dblocks_delta) { + be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); + whole = 1; + } + if (tp->t_agcount_delta) { + be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta); + whole = 1; + } + if (tp->t_imaxpct_delta) { + sbp->sb_imax_pct += tp->t_imaxpct_delta; + whole = 1; + } + if (tp->t_rextsize_delta) { + be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta); + whole = 1; + } + if (tp->t_rbmblocks_delta) { + be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta); + whole = 1; + } + if (tp->t_rblocks_delta) { + be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); + whole = 1; + } + if (tp->t_rextents_delta) { + be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta); + whole = 1; + } + if (tp->t_rextslog_delta) { + sbp->sb_rextslog += tp->t_rextslog_delta; + whole = 1; + } + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); + if (whole) + /* + * Log the whole thing, the fields are noncontiguous. + */ + xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1); + else + /* + * Since all the modifiable fields are contiguous, we + * can get away with this. + */ + xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), + offsetof(xfs_dsb_t, sb_frextents) + + sizeof(sbp->sb_frextents) - 1); +} + +STATIC int +xfs_sb_mod8( + uint8_t *field, + int8_t delta) +{ + int8_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod32( + uint32_t *field, + int32_t delta) +{ + int32_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod64( + uint64_t *field, + int64_t delta) +{ + int64_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +/* + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations + * and apply superblock counter changes to the in-core superblock. The + * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT + * applied to the in-core superblock. The idea is that that has already been + * done. + * + * If we are not logging superblock counters, then the inode allocated/free and + * used block counts are not updated in the on disk superblock. In this case, + * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we + * still need to update the incore superblock with the changes. + */ +void +xfs_trans_unreserve_and_mod_sb( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; + int error; + + /* calculate deltas */ + if (tp->t_blk_res > 0) + blkdelta = tp->t_blk_res; + if ((tp->t_fdblocks_delta != 0) && + (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY))) + blkdelta += tp->t_fdblocks_delta; + + if (tp->t_rtx_res > 0) + rtxdelta = tp->t_rtx_res; + if ((tp->t_frextents_delta != 0) && + (tp->t_flags & XFS_TRANS_SB_DIRTY)) + rtxdelta += tp->t_frextents_delta; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY)) { + idelta = tp->t_icount_delta; + ifreedelta = tp->t_ifree_delta; + } + + /* apply the per-cpu counters */ + if (blkdelta) { + error = xfs_mod_fdblocks(mp, blkdelta, rsvd); + if (error) + goto out; + } + + if (idelta) { + error = xfs_mod_icount(mp, idelta); + if (error) + goto out_undo_fdblocks; + } + + if (ifreedelta) { + error = xfs_mod_ifree(mp, ifreedelta); + if (error) + goto out_undo_icount; + } + + if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + return; + + /* apply remaining deltas */ + spin_lock(&mp->m_sb_lock); + if (rtxdelta) { + error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); + if (error) + goto out_undo_ifree; + } + + if (tp->t_dblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); + if (error) + goto out_undo_frextents; + } + if (tp->t_agcount_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); + if (error) + goto out_undo_dblocks; + } + if (tp->t_imaxpct_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); + if (error) + goto out_undo_agcount; + } + if (tp->t_rextsize_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, + tp->t_rextsize_delta); + if (error) + goto out_undo_imaxpct; + } + if (tp->t_rbmblocks_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, + tp->t_rbmblocks_delta); + if (error) + goto out_undo_rextsize; + } + if (tp->t_rblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); + if (error) + goto out_undo_rbmblocks; + } + if (tp->t_rextents_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rextents, + tp->t_rextents_delta); + if (error) + goto out_undo_rblocks; + } + if (tp->t_rextslog_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, + tp->t_rextslog_delta); + if (error) + goto out_undo_rextents; + } + spin_unlock(&mp->m_sb_lock); + return; + +out_undo_rextents: + if (tp->t_rextents_delta) + xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); +out_undo_rblocks: + if (tp->t_rblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); +out_undo_rbmblocks: + if (tp->t_rbmblocks_delta) + xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); +out_undo_rextsize: + if (tp->t_rextsize_delta) + xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); +out_undo_imaxpct: + if (tp->t_rextsize_delta) + xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); +out_undo_agcount: + if (tp->t_agcount_delta) + xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); +out_undo_dblocks: + if (tp->t_dblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); +out_undo_frextents: + if (rtxdelta) + xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); +out_undo_ifree: + spin_unlock(&mp->m_sb_lock); + if (ifreedelta) + xfs_mod_ifree(mp, -ifreedelta); +out_undo_icount: + if (idelta) + xfs_mod_icount(mp, -idelta); +out_undo_fdblocks: + if (blkdelta) + xfs_mod_fdblocks(mp, -blkdelta, rsvd); +out: + ASSERT(error == 0); + return; +} + +/* + * Add the given log item to the transaction's list of log items. + * + * The log item will now point to its new descriptor with its li_desc field. + */ +void +xfs_trans_add_item( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_log_item_desc *lidp; + + ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_ailp == tp->t_mountp->m_ail); + + lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); + + lidp->lid_item = lip; + lidp->lid_flags = 0; + list_add_tail(&lidp->lid_trans, &tp->t_items); + + lip->li_desc = lidp; +} + +STATIC void +xfs_trans_free_item_desc( + struct xfs_log_item_desc *lidp) +{ + list_del_init(&lidp->lid_trans); + kmem_zone_free(xfs_log_item_desc_zone, lidp); +} + +/* + * Unlink and free the given descriptor. + */ +void +xfs_trans_del_item( + struct xfs_log_item *lip) +{ + xfs_trans_free_item_desc(lip->li_desc); + lip->li_desc = NULL; +} + +/* + * Unlock all of the items of a transaction and free all the descriptors + * of that transaction. + */ +void +xfs_trans_free_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn, + int flags) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + lip->li_ops->iop_committing(lip, commit_lsn); + if (flags & XFS_TRANS_ABORT) + lip->li_flags |= XFS_LI_ABORTED; + lip->li_ops->iop_unlock(lip); + + xfs_trans_free_item_desc(lidp); + } +} + +static inline void +xfs_log_item_batch_insert( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t commit_lsn) +{ + int i; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + + lip->li_ops->iop_unpin(lip, 0); + } +} + +/* + * Bulk operation version of xfs_trans_committed that takes a log vector of + * items to insert into the AIL. This uses bulk AIL insertion techniques to + * minimise lock traffic. + * + * If we are called with the aborted flag set, it is because a log write during + * a CIL checkpoint commit has failed. In this case, all the items in the + * checkpoint have already gone through iop_commited and iop_unlock, which + * means that checkpoint commit abort handling is treated exactly the same + * as an iclog write error even though we haven't started any IO yet. Hence in + * this case all we need to do is iop_committed processing, followed by an + * iop_unpin(aborted) call. + * + * The AIL cursor is used to optimise the insert process. If commit_lsn is not + * at the end of the AIL, the insert cursor avoids the need to walk + * the AIL to find the insertion point on every xfs_log_item_batch_insert() + * call. This saves a lot of needless list walking and is a net win, even + * though it slightly increases that amount of AIL lock traffic to set it up + * and tear it down. + */ +void +xfs_trans_committed_bulk( + struct xfs_ail *ailp, + struct xfs_log_vec *log_vector, + xfs_lsn_t commit_lsn, + int aborted) +{ +#define LOG_ITEM_BATCH_SIZE 32 + struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; + struct xfs_log_vec *lv; + struct xfs_ail_cursor cur; + int i = 0; + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + spin_unlock(&ailp->xa_lock); + + /* unpin all the log items */ + for (lv = log_vector; lv; lv = lv->lv_next ) { + struct xfs_log_item *lip = lv->lv_item; + xfs_lsn_t item_lsn; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); + + /* item_lsn of -1 means the item needs no further processing */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + continue; + + /* + * if we are aborting the operation, no point in inserting the + * object into the AIL as we are in a shutdown situation. + */ + if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + lip->li_ops->iop_unpin(lip, 1); + continue; + } + + if (item_lsn != commit_lsn) { + + /* + * Not a bulk update option due to unusual item_lsn. + * Push into AIL immediately, rechecking the lsn once + * we have the ail lock. Then unpin the item. This does + * not affect the AIL cursor the bulk insert path is + * using. + */ + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) + xfs_trans_ail_update(ailp, lip, item_lsn); + else + spin_unlock(&ailp->xa_lock); + lip->li_ops->iop_unpin(lip, 0); + continue; + } + + /* Item is a candidate for bulk AIL insert. */ + log_items[i++] = lv->lv_item; + if (i >= LOG_ITEM_BATCH_SIZE) { + xfs_log_item_batch_insert(ailp, &cur, log_items, + LOG_ITEM_BATCH_SIZE, commit_lsn); + i = 0; + } + } + + /* make sure we insert the remainder! */ + if (i) + xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); +} + +/* + * Commit the given transaction to the log. + * + * XFS disk error handling mechanism is not based on a typical + * transaction abort mechanism. Logically after the filesystem + * gets marked 'SHUTDOWN', we can't let any new transactions + * be durable - ie. committed to disk - because some metadata might + * be inconsistent. In such cases, this returns an error, and the + * caller may assume that all locked objects joined to the transaction + * have already been unlocked as if the commit had succeeded. + * Do not reference the transaction structure after this call. + */ +int +xfs_trans_commit( + struct xfs_trans *tp, + uint flags) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_lsn_t commit_lsn = -1; + int error = 0; + int log_flags = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; + + /* + * Determine whether this commit is releasing a permanent + * log reservation or not. + */ + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } + + /* + * If there is nothing to be logged by the transaction, + * then unlock all of the items associated with the + * transaction and free the transaction structure. + * Also make sure to return any reserved blocks to + * the free pool. + */ + if (!(tp->t_flags & XFS_TRANS_DIRTY)) + goto out_unreserve; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = -EIO; + goto out_unreserve; + } + + ASSERT(tp->t_ticket != NULL); + + /* + * If we need to update the superblock, then do it now. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); + + xfs_log_commit_cil(mp, tp, &commit_lsn, flags); + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free(tp); + + /* + * If the transaction needs to be synchronous, then force the + * log out now and wait for it. + */ + if (sync) { + error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); + XFS_STATS_INC(xs_trans_sync); + } else { + XFS_STATS_INC(xs_trans_async); + } + + return error; + +out_unreserve: + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * It is indeed possible for the transaction to be not dirty but + * the dqinfo portion to be. All that means is that we have some + * (non-persistent) quota reservations that need to be unreserved. + */ + xfs_trans_unreserve_and_mod_dquots(tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + if (commit_lsn == -1 && !error) + error = -EIO; + } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free(tp); + + XFS_STATS_INC(xs_trans_empty); + return error; +} + +/* + * Unlock all of the transaction's items and free the transaction. + * The transaction must not have modified any of its items, because + * there is no way to restore them to their previous state. + * + * If the transaction has made a log reservation, make sure to release + * it as well. + */ +void +xfs_trans_cancel( + xfs_trans_t *tp, + int flags) +{ + int log_flags; + xfs_mount_t *mp = tp->t_mountp; + + /* + * See if the caller is being too lazy to figure out if + * the transaction really needs an abort. + */ + if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) + flags &= ~XFS_TRANS_ABORT; + /* + * See if the caller is relying on us to shut down the + * filesystem. This happens in paths where we detect + * corruption and decide to give up. + */ + if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } +#ifdef DEBUG + if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) + ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); + } +#endif + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + if (tp->t_ticket) { + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + } + + /* mark this thread as no longer being in a transaction */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free(tp); +} + +/* + * Roll from one trans in the sequence of PERMANENT transactions to + * the next: permanent transactions are only flushed out when + * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * as possible to let chunks of it go to the log. So we commit the + * chunk we've been working on and get a new transaction to continue. + */ +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) +{ + struct xfs_trans *trans; + struct xfs_trans_res tres; + int error; + + /* + * Ensure that the inode is always logged. + */ + trans = *tpp; + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + + /* + * Copy the critical parameters from one trans to the next. + */ + tres.tr_logres = trans->t_log_res; + tres.tr_logcount = trans->t_log_count; + *tpp = xfs_trans_dup(trans); + + /* + * Commit the current transaction. + * If this commit failed, then it'd just unlock those items that + * are not marked ihold. That also means that a filesystem shutdown + * is in progress. The caller takes the responsibility to cancel + * the duplicate transaction that gets returned. + */ + error = xfs_trans_commit(trans, 0); + if (error) + return error; + + trans = *tpp; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(trans->t_ticket); + + + /* + * Reserve space in the log for th next transaction. + * This also pushes items in the "AIL", the list of logged items, + * out to disk if they are taking up space at the tail of the log + * that we want to use. This requires that either nothing be locked + * across this call, or that anything that is locked be logged in + * the prior and the next transactions. + */ + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(trans, &tres, 0, 0); + /* + * Ensure that the inode is in the new transaction and locked. + */ + if (error) + return error; + + xfs_trans_ijoin(trans, dp, 0); + return 0; +} diff --git a/kernel/fs/xfs/xfs_trans.h b/kernel/fs/xfs/xfs_trans.h new file mode 100644 index 000000000..b5bc1ab3c --- /dev/null +++ b/kernel/fs/xfs/xfs_trans.h @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_H__ +#define __XFS_TRANS_H__ + +/* kernel only transaction subsystem defines */ + +struct xfs_buf; +struct xfs_buftarg; +struct xfs_efd_log_item; +struct xfs_efi_log_item; +struct xfs_inode; +struct xfs_item_ops; +struct xfs_log_iovec; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_trans_res; +struct xfs_dquot_acct; +struct xfs_busy_extent; + +typedef struct xfs_log_item { + struct list_head li_ail; /* AIL pointers */ + xfs_lsn_t li_lsn; /* last on-disk lsn */ + struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ + struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xfs_ail *li_ailp; /* ptr to AIL */ + uint li_type; /* item type */ + uint li_flags; /* misc flags */ + struct xfs_log_item *li_bio_list; /* buffer item list */ + void (*li_cb)(struct xfs_buf *, + struct xfs_log_item *); + /* buffer item iodone */ + /* callback func */ + const struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ +} xfs_log_item_t; + +#define XFS_LI_IN_AIL 0x1 +#define XFS_LI_ABORTED 0x2 + +#define XFS_LI_FLAGS \ + { XFS_LI_IN_AIL, "IN_AIL" }, \ + { XFS_LI_ABORTED, "ABORTED" } + +struct xfs_item_ops { + void (*iop_size)(xfs_log_item_t *, int *, int *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *); + void (*iop_pin)(xfs_log_item_t *); + void (*iop_unpin)(xfs_log_item_t *, int remove); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); + void (*iop_unlock)(xfs_log_item_t *); + xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); +}; + +void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, + int type, const struct xfs_item_ops *ops); + +/* + * Return values for the iop_push() routines. + */ +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_FLUSHING 3 + + +/* + * This is the structure maintained for every active transaction. + */ +typedef struct xfs_trans { + unsigned int t_magic; /* magic number */ + unsigned int t_type; /* transaction type */ + unsigned int t_log_res; /* amt of log space resvd */ + unsigned int t_log_count; /* count for perm log res */ + unsigned int t_blk_res; /* # of blocks resvd */ + unsigned int t_blk_res_used; /* # of resvd blocks used */ + unsigned int t_rtx_res; /* # of rt extents resvd */ + unsigned int t_rtx_res_used; /* # of resvd rt extents used */ + struct xlog_ticket *t_ticket; /* log mgr ticket */ + xfs_lsn_t t_lsn; /* log seq num of start of + * transaction. */ + xfs_lsn_t t_commit_lsn; /* log seq num of end of + * transaction. */ + struct xfs_mount *t_mountp; /* ptr to fs mount struct */ + struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ + unsigned int t_flags; /* misc flags */ + int64_t t_icount_delta; /* superblock icount change */ + int64_t t_ifree_delta; /* superblock ifree change */ + int64_t t_fdblocks_delta; /* superblock fdblocks chg */ + int64_t t_res_fdblocks_delta; /* on-disk only chg */ + int64_t t_frextents_delta;/* superblock freextents chg*/ + int64_t t_res_frextents_delta; /* on-disk only chg */ +#if defined(DEBUG) || defined(XFS_WARN) + int64_t t_ag_freeblks_delta; /* debugging counter */ + int64_t t_ag_flist_delta; /* debugging counter */ + int64_t t_ag_btree_delta; /* debugging counter */ +#endif + int64_t t_dblocks_delta;/* superblock dblocks change */ + int64_t t_agcount_delta;/* superblock agcount change */ + int64_t t_imaxpct_delta;/* superblock imaxpct change */ + int64_t t_rextsize_delta;/* superblock rextsize chg */ + int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */ + int64_t t_rblocks_delta;/* superblock rblocks change */ + int64_t t_rextents_delta;/* superblocks rextents chg */ + int64_t t_rextslog_delta;/* superblocks rextslog chg */ + struct list_head t_items; /* log item descriptors */ + struct list_head t_busy; /* list of busy extents */ + unsigned long t_pflags; /* saved process flags state */ +} xfs_trans_t; + +/* + * XFS transaction mechanism exported interfaces that are + * actually macros. + */ +#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) +#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) +#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) +#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) + +#if defined(DEBUG) || defined(XFS_WARN) +#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) +#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) +#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) +#else +#define xfs_trans_agblocks_delta(tp, d) +#define xfs_trans_agflist_delta(tp, d) +#define xfs_trans_agbtree_delta(tp, d) +#endif + +/* + * XFS transaction mechanism exported interfaces. + */ +xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); +xfs_trans_t *xfs_trans_dup(xfs_trans_t *); +int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, + uint, uint); +void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); + +struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + uint flags); + +static inline struct xfs_buf * +xfs_trans_get_buf( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int numblks, + uint flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_get_buf_map(tp, target, &map, 1, flags); +} + +int xfs_trans_read_buf_map(struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); + +static inline int +xfs_trans_read_buf( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int numblks, + xfs_buf_flags_t flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, + flags, bpp, ops); +} + +struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); + +void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); +void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); +void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); +struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); +void xfs_efi_release(struct xfs_efi_log_item *, uint); +void xfs_trans_log_efi_extent(xfs_trans_t *, + struct xfs_efi_log_item *, + xfs_fsblock_t, + xfs_extlen_t); +struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, + struct xfs_efi_log_item *, + uint); +void xfs_trans_log_efd_extent(xfs_trans_t *, + struct xfs_efd_log_item *, + xfs_fsblock_t, + xfs_extlen_t); +int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); +void xfs_trans_cancel(xfs_trans_t *, int); +int xfs_trans_ail_init(struct xfs_mount *); +void xfs_trans_ail_destroy(struct xfs_mount *); + +void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, + enum xfs_blft); +void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, + struct xfs_buf *src_bp); + +extern kmem_zone_t *xfs_trans_zone; +extern kmem_zone_t *xfs_log_item_desc_zone; + +#endif /* __XFS_TRANS_H__ */ diff --git a/kernel/fs/xfs/xfs_trans_ail.c b/kernel/fs/xfs/xfs_trans_ail.c new file mode 100644 index 000000000..573aefb5a --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_ail.c @@ -0,0 +1,794 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2008 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_trace.h" +#include "xfs_error.h" +#include "xfs_log.h" + +#ifdef DEBUG +/* + * Check that the list is sorted as it should be. + */ +STATIC void +xfs_ail_check( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *prev_lip; + + if (list_empty(&ailp->xa_ail)) + return; + + /* + * Check the next and previous entries are valid. + */ + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + + prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + + +} +#else /* !DEBUG */ +#define xfs_ail_check(a,l) +#endif /* DEBUG */ + +/* + * Return a pointer to the last item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_max( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); +} + +/* + * Return a pointer to the item which follows the given item in the AIL. If + * the given item is the last item in the list, then return NULL. + */ +static xfs_log_item_t * +xfs_ail_next( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + if (lip->li_ail.next == &ailp->xa_ail) + return NULL; + + return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); +} + +/* + * This is called by the log manager code to determine the LSN of the tail of + * the log. This is exactly the LSN of the first item in the AIL. If the AIL + * is empty, then this function returns 0. + * + * We need the AIL lock in order to get a coherent read of the lsn of the last + * item in the AIL. + */ +xfs_lsn_t +xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + xfs_lsn_t lsn = 0; + xfs_log_item_t *lip; + + spin_lock(&ailp->xa_lock); + lip = xfs_ail_min(ailp); + if (lip) + lsn = lip->li_lsn; + spin_unlock(&ailp->xa_lock); + + return lsn; +} + +/* + * Return the maximum lsn held in the AIL, or zero if the AIL is empty. + */ +static xfs_lsn_t +xfs_ail_max_lsn( + struct xfs_ail *ailp) +{ + xfs_lsn_t lsn = 0; + xfs_log_item_t *lip; + + spin_lock(&ailp->xa_lock); + lip = xfs_ail_max(ailp); + if (lip) + lsn = lip->li_lsn; + spin_unlock(&ailp->xa_lock); + + return lsn; +} + +/* + * The cursor keeps track of where our current traversal is up to by tracking + * the next item in the list for us. However, for this to be safe, removing an + * object from the AIL needs to invalidate any cursor that points to it. hence + * the traversal cursor needs to be linked to the struct xfs_ail so that + * deletion can search all the active cursors for invalidation. + */ +STATIC void +xfs_trans_ail_cursor_init( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur) +{ + cur->item = NULL; + list_add_tail(&cur->list, &ailp->xa_cursors); +} + +/* + * Get the next item in the traversal and advance the cursor. If the cursor + * was invalidated (indicated by a lip of 1), restart the traversal. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_next( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur) +{ + struct xfs_log_item *lip = cur->item; + + if ((__psint_t)lip & 1) + lip = xfs_ail_min(ailp); + if (lip) + cur->item = xfs_ail_next(ailp, lip); + return lip; +} + +/* + * When the traversal is complete, we need to remove the cursor from the list + * of traversing cursors. + */ +void +xfs_trans_ail_cursor_done( + struct xfs_ail_cursor *cur) +{ + cur->item = NULL; + list_del_init(&cur->list); +} + +/* + * Invalidate any cursor that is pointing to this item. This is called when an + * item is removed from the AIL. Any cursor pointing to this object is now + * invalid and the traversal needs to be terminated so it doesn't reference a + * freed object. We set the low bit of the cursor item pointer so we can + * distinguish between an invalidation and the end of the list when getting the + * next item from the cursor. + */ +STATIC void +xfs_trans_ail_cursor_clear( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_ail_cursor *cur; + + list_for_each_entry(cur, &ailp->xa_cursors, list) { + if (cur->item == lip) + cur->item = (struct xfs_log_item *) + ((__psint_t)cur->item | 1); + } +} + +/* + * Find the first item in the AIL with the given @lsn by searching in ascending + * LSN order and initialise the cursor to point to the next item for a + * ascending traversal. Pass a @lsn of zero to initialise the cursor to the + * first item in the AIL. Returns NULL if the list is empty. + */ +xfs_log_item_t * +xfs_trans_ail_cursor_first( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + xfs_trans_ail_cursor_init(ailp, cur); + + if (lsn == 0) { + lip = xfs_ail_min(ailp); + goto out; + } + + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) + goto out; + } + return NULL; + +out: + if (lip) + cur->item = xfs_ail_next(ailp, lip); + return lip; +} + +static struct xfs_log_item * +__xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) + return lip; + } + return NULL; +} + +/* + * Find the last item in the AIL with the given @lsn by searching in descending + * LSN order and initialise the cursor to point to that item. If there is no + * item with the value of @lsn, then it sets the cursor to the last item with an + * LSN lower than @lsn. Returns NULL if the list is empty. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_trans_ail_cursor_init(ailp, cur); + cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); + return cur->item; +} + +/* + * Splice the log item list into the AIL at the given LSN. We splice to the + * tail of the given LSN to maintain insert order for push traversals. The + * cursor is optional, allowing repeated updates to the same LSN to avoid + * repeated traversals. This should not be called with an empty list. + */ +static void +xfs_ail_splice( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct list_head *list, + xfs_lsn_t lsn) +{ + struct xfs_log_item *lip; + + ASSERT(!list_empty(list)); + + /* + * Use the cursor to determine the insertion point if one is + * provided. If not, or if the one we got is not valid, + * find the place in the AIL where the items belong. + */ + lip = cur ? cur->item : NULL; + if (!lip || (__psint_t) lip & 1) + lip = __xfs_trans_ail_cursor_last(ailp, lsn); + + /* + * If a cursor is provided, we know we're processing the AIL + * in lsn order, and future items to be spliced in will + * follow the last one being inserted now. Update the + * cursor to point to that last item, now while we have a + * reliable pointer to it. + */ + if (cur) + cur->item = list_entry(list->prev, struct xfs_log_item, li_ail); + + /* + * Finally perform the splice. Unless the AIL was empty, + * lip points to the item in the AIL _after_ which the new + * items should go. If lip is null the AIL was empty, so + * the new items go at the head of the AIL. + */ + if (lip) + list_splice(list, &lip->li_ail); + else + list_splice(list, &ailp->xa_ail); +} + +/* + * Delete the given item from the AIL. Return a pointer to the item. + */ +static void +xfs_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_ail_check(ailp, lip); + list_del(&lip->li_ail); + xfs_trans_ail_cursor_clear(ailp, lip); +} + +static long +xfsaild_push( + struct xfs_ail *ailp) +{ + xfs_mount_t *mp = ailp->xa_mount; + struct xfs_ail_cursor cur; + xfs_log_item_t *lip; + xfs_lsn_t lsn; + xfs_lsn_t target; + long tout; + int stuck = 0; + int flushing = 0; + int count = 0; + + /* + * If we encountered pinned items or did not finish writing out all + * buffers the last time we ran, force the log first and wait for it + * before pushing again. + */ + if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 && + (!list_empty_careful(&ailp->xa_buf_list) || + xfs_ail_min_lsn(ailp))) { + ailp->xa_log_flush = 0; + + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, XFS_LOG_SYNC); + } + + spin_lock(&ailp->xa_lock); + + /* barrier matches the xa_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->xa_target; + ailp->xa_target_prev = target; + + lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); + if (!lip) { + /* + * If the AIL is empty or our push has reached the end we are + * done now. + */ + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + goto out_done; + } + + XFS_STATS_INC(xs_push_ail); + + lsn = lip->li_lsn; + while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { + int lock_result; + + /* + * Note that iop_push may unlock and reacquire the AIL lock. We + * rely on the AIL cursor implementation to be able to deal with + * the dropped lock. + */ + lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + switch (lock_result) { + case XFS_ITEM_SUCCESS: + XFS_STATS_INC(xs_push_ail_success); + trace_xfs_ail_push(lip); + + ailp->xa_last_pushed_lsn = lsn; + break; + + case XFS_ITEM_FLUSHING: + /* + * The item or its backing buffer is already beeing + * flushed. The typical reason for that is that an + * inode buffer is locked because we already pushed the + * updates to it as part of inode clustering. + * + * We do not want to to stop flushing just because lots + * of items are already beeing flushed, but we need to + * re-try the flushing relatively soon if most of the + * AIL is beeing flushed. + */ + XFS_STATS_INC(xs_push_ail_flushing); + trace_xfs_ail_flushing(lip); + + flushing++; + ailp->xa_last_pushed_lsn = lsn; + break; + + case XFS_ITEM_PINNED: + XFS_STATS_INC(xs_push_ail_pinned); + trace_xfs_ail_pinned(lip); + + stuck++; + ailp->xa_log_flush++; + break; + case XFS_ITEM_LOCKED: + XFS_STATS_INC(xs_push_ail_locked); + trace_xfs_ail_locked(lip); + + stuck++; + break; + default: + ASSERT(0); + break; + } + + count++; + + /* + * Are there too many items we can't do anything with? + * + * If we we are skipping too many items because we can't flush + * them or they are already being flushed, we back off and + * given them time to complete whatever operation is being + * done. i.e. remove pressure from the AIL while we can't make + * progress so traversals don't slow down further inserts and + * removals to/from the AIL. + * + * The value of 100 is an arbitrary magic number based on + * observation. + */ + if (stuck > 100) + break; + + lip = xfs_trans_ail_cursor_next(ailp, &cur); + if (lip == NULL) + break; + lsn = lip->li_lsn; + } + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) + ailp->xa_log_flush++; + + if (!count || XFS_LSN_CMP(lsn, target) >= 0) { +out_done: + /* + * We reached the target or the AIL is empty, so wait a bit + * longer for I/O to complete and remove pushed items from the + * AIL before we start the next scan from the start of the AIL. + */ + tout = 50; + ailp->xa_last_pushed_lsn = 0; + } else if (((stuck + flushing) * 100) / count > 90) { + /* + * Either there is a lot of contention on the AIL or we are + * stuck due to operations in progress. "Stuck" in this case + * is defined as >90% of the items we tried to push were stuck. + * + * Backoff a bit more to allow some I/O to complete before + * restarting from the start of the AIL. This prevents us from + * spinning on the same items, and if they are pinned will all + * the restart to issue a log force to unpin the stuck items. + */ + tout = 20; + ailp->xa_last_pushed_lsn = 0; + } else { + /* + * Assume we have more work to do in a short while. + */ + tout = 10; + } + + return tout; +} + +static int +xfsaild( + void *data) +{ + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + + current->flags |= PF_MEMALLOC; + + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + + spin_lock(&ailp->xa_lock); + + /* + * Idle if the AIL is empty and we are not racing with a target + * update. We check the AIL after we set the task to a sleep + * state to guarantee that we either catch an xa_target update + * or that a wake_up resets the state to TASK_RUNNING. + * Otherwise, we run the risk of sleeping indefinitely. + * + * The barrier matches the xa_target update in xfs_ail_push(). + */ + smp_rmb(); + if (!xfs_ail_min(ailp) && + ailp->xa_target == ailp->xa_target_prev) { + spin_unlock(&ailp->xa_lock); + schedule(); + tout = 0; + continue; + } + spin_unlock(&ailp->xa_lock); + + if (tout) + schedule_timeout(msecs_to_jiffies(tout)); + + __set_current_state(TASK_RUNNING); + + try_to_freeze(); + + tout = xfsaild_push(ailp); + } + + return 0; +} + +/* + * This routine is called to move the tail of the AIL forward. It does this by + * trying to flush items in the AIL whose lsns are below the given + * threshold_lsn. + * + * The push is run asynchronously in a workqueue, which means the caller needs + * to handle waiting on the async flush for space to become available. + * We don't want to interrupt any push that is in progress, hence we only queue + * work if we set the pushing bit approriately. + * + * We do this unlocked - we only need to know whether there is anything in the + * AIL at the time we are called. We don't need to access the contents of + * any of the objects, so the lock is not needed. + */ +void +xfs_ail_push( + struct xfs_ail *ailp, + xfs_lsn_t threshold_lsn) +{ + xfs_log_item_t *lip; + + lip = xfs_ail_min(ailp); + if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || + XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) + return; + + /* + * Ensure that the new target is noticed in push code before it clears + * the XFS_AIL_PUSHING_BIT. + */ + smp_wmb(); + xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); + smp_wmb(); + + wake_up_process(ailp->xa_task); +} + +/* + * Push out all items in the AIL immediately + */ +void +xfs_ail_push_all( + struct xfs_ail *ailp) +{ + xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp); + + if (threshold_lsn) + xfs_ail_push(ailp, threshold_lsn); +} + +/* + * Push out all items in the AIL immediately and wait until the AIL is empty. + */ +void +xfs_ail_push_all_sync( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip; + DEFINE_WAIT(wait); + + spin_lock(&ailp->xa_lock); + while ((lip = xfs_ail_max(ailp)) != NULL) { + prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE); + ailp->xa_target = lip->li_lsn; + wake_up_process(ailp->xa_task); + spin_unlock(&ailp->xa_lock); + schedule(); + spin_lock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + + finish_wait(&ailp->xa_empty, &wait); +} + +/* + * xfs_trans_ail_update - bulk AIL insertion operation. + * + * @xfs_trans_ail_update takes an array of log items that all need to be + * positioned at the same LSN in the AIL. If an item is not in the AIL, it will + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. If we move the first item in the AIL, update the log tail to + * match the new minimum LSN in the AIL. + * + * This function takes the AIL lock once to execute the update operations on + * all the items in the array, and as such should not be called with the AIL + * lock held. As a result, once we have the AIL lock, we need to check each log + * item LSN to confirm it needs to be moved forward in the AIL. + * + * To optimise the insert operation, we delete all the items from the AIL in + * the first pass, moving them into a temporary list, then splice the temporary + * list into the correct position in the AIL. This avoids needing to do an + * insert operation on every item. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. + */ +void +xfs_trans_ail_update_bulk( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_log_item_t *mlip; + int mlip_changed = 0; + int i; + LIST_HEAD(tmp); + + ASSERT(nr_items > 0); /* Not required, but true. */ + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (lip->li_flags & XFS_LI_IN_AIL) { + /* check if we really need to move the item */ + if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) + continue; + + trace_xfs_ail_move(lip, lip->li_lsn, lsn); + xfs_ail_delete(ailp, lip); + if (mlip == lip) + mlip_changed = 1; + } else { + lip->li_flags |= XFS_LI_IN_AIL; + trace_xfs_ail_insert(lip, 0, lsn); + } + lip->li_lsn = lsn; + list_add(&lip->li_ail, &tmp); + } + + if (!list_empty(&tmp)) + xfs_ail_splice(ailp, cur, &tmp, lsn); + + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + spin_unlock(&ailp->xa_lock); + + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); + } +} + +/* + * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL + * + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * removed from the AIL. The caller is already holding the AIL lock, and done + * all the checks necessary to ensure the items passed in via @log_items are + * ready for deletion. This includes checking that the items are in the AIL. + * + * For each log item to be removed, unlink it from the AIL, clear the IN_AIL + * flag from the item and reset the item's lsn to 0. If we remove the first + * item in the AIL, update the log tail to match the new minimum LSN in the + * AIL. + * + * This function will not drop the AIL lock until all items are removed from + * the AIL to minimise the amount of lock traffic on the AIL. This does not + * greatly increase the AIL hold time, but does significantly reduce the amount + * of traffic on the lock, especially during IO completion. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. + */ +void +xfs_trans_ail_delete_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + int shutdown_type) __releases(ailp->xa_lock) +{ + xfs_log_item_t *mlip; + int mlip_changed = 0; + int i; + + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + struct xfs_mount *mp = ailp->xa_mount; + + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, shutdown_type); + } + return; + } + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + if (mlip == lip) + mlip_changed = 1; + } + + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + spin_unlock(&ailp->xa_lock); + + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); + } +} + +int +xfs_trans_ail_init( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp; + + ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); + if (!ailp) + return -ENOMEM; + + ailp->xa_mount = mp; + INIT_LIST_HEAD(&ailp->xa_ail); + INIT_LIST_HEAD(&ailp->xa_cursors); + spin_lock_init(&ailp->xa_lock); + INIT_LIST_HEAD(&ailp->xa_buf_list); + init_waitqueue_head(&ailp->xa_empty); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + + mp->m_ail = ailp; + return 0; + +out_free_ailp: + kmem_free(ailp); + return -ENOMEM; +} + +void +xfs_trans_ail_destroy( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp = mp->m_ail; + + kthread_stop(ailp->xa_task); + kmem_free(ailp); +} diff --git a/kernel/fs/xfs/xfs_trans_buf.c b/kernel/fs/xfs/xfs_trans_buf.c new file mode 100644 index 000000000..757984128 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_buf.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps) +{ + struct xfs_log_item_desc *lidp; + struct xfs_buf_log_item *blip; + int len = 0; + int i; + + for (i = 0; i < nmaps; i++) + len += map[i].bm_len; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + blip = (struct xfs_buf_log_item *)lidp->lid_item; + if (blip->bli_item.li_type == XFS_LI_BUF && + blip->bli_buf->b_target == target && + XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && + blip->bli_buf->b_length == len) { + ASSERT(blip->bli_buf->b_map_count == nmaps); + return blip->bli_buf; + } + } + + return NULL; +} + +/* + * Add the locked buffer to the transaction. + * + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +STATIC void +_xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp, + int reset_recur) +{ + struct xfs_buf_log_item *bip; + + ASSERT(bp->b_transp == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = bp->b_fspriv; + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + if (reset_recur) + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &bip->bli_item); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * in xfs_trans_get_buf() and friends above. + */ + bp->b_transp = tp; + +} + +void +xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + _xfs_trans_bjoin(tp, bp, 0); + trace_xfs_trans_bjoin(bp->b_fspriv); +} + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it is already locked + * within the transaction, just increment its lock recursion count + * and return a pointer to it. + * + * If the transaction pointer is NULL, make this just a normal + * get_buf() call. + */ +struct xfs_buf * +xfs_trans_get_buf_map( + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + if (!tp) + return xfs_buf_get_map(target, map, nmaps, flags); + + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); + if (bp != NULL) { + ASSERT(xfs_buf_islocked(bp)); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + } + + ASSERT(bp->b_transp == tp); + bip = bp->b_fspriv; + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + trace_xfs_trans_get_buf_recur(bip); + return bp; + } + + bp = xfs_buf_get_map(target, map, nmaps, flags); + if (bp == NULL) { + return NULL; + } + + ASSERT(!bp->b_error); + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_get_buf(bp->b_fspriv); + return bp; +} + +/* + * Get and lock the superblock buffer of this file system for the + * given transaction. + * + * We don't need to use incore_match() here, because the superblock + * buffer is a private buffer which we keep a pointer to in the + * mount structure. + */ +xfs_buf_t * +xfs_trans_getsb(xfs_trans_t *tp, + struct xfs_mount *mp, + int flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + /* + * Default to just trying to lock the superblock buffer + * if tp is NULL. + */ + if (tp == NULL) + return xfs_getsb(mp, flags); + + /* + * If the superblock buffer already has this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + bp = mp->m_sb_bp; + if (bp->b_transp == tp) { + bip = bp->b_fspriv; + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + trace_xfs_trans_getsb_recur(bip); + return bp; + } + + bp = xfs_getsb(mp, flags); + if (bp == NULL) + return NULL; + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_getsb(bp->b_fspriv); + return bp; +} + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it has not yet been + * read in, read it from disk. If it is already locked + * within the transaction and already read in, just increment its + * lock recursion count and return a pointer to it. + * + * If the transaction pointer is NULL, make this just a normal + * read_buf() call. + */ +int +xfs_trans_read_buf_map( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp = NULL; + struct xfs_buf_log_item *bip; + int error; + + *bpp = NULL; + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. If it is already read in we just increment + * the lock recursion count and return the buffer to the caller. + * If the buffer is not yet read in, then we read it in, increment + * the lock recursion count, and return it to the caller. + */ + if (tp) + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); + if (bp) { + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_transp == tp); + ASSERT(bp->b_fspriv != NULL); + ASSERT(!bp->b_error); + ASSERT(bp->b_flags & XBF_DONE); + + /* + * We never locked this buf ourselves, so we shouldn't + * brelse it either. Just get out. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + return -EIO; + } + + bip = bp->b_fspriv; + bip->bli_recur++; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + trace_xfs_trans_read_buf_recur(bip); + *bpp = bp; + return 0; + } + + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); + if (!bp) { + if (!(flags & XBF_TRYLOCK)) + return -ENOMEM; + return tp ? 0 : -EAGAIN; + } + + /* + * If we've had a read error, then the contents of the buffer are + * invalid and should not be used. To ensure that a followup read tries + * to pull the buffer from disk again, we clear the XBF_DONE flag and + * mark the buffer stale. This ensures that anyone who has a current + * reference to the buffer will interpret it's contents correctly and + * future cache lookups will also treat it as an empty, uninitialised + * buffer. + */ + if (bp->b_error) { + error = bp->b_error; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_buf_ioerror_alert(bp, __func__); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + + if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; + } + + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_buf_relse(bp); + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + return -EIO; + } + + if (tp) { + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); + } + *bpp = bp; + return 0; + +} + +/* + * Release the buffer bp which was previously acquired with one of the + * xfs_trans_... buffer allocation routines if the buffer has not + * been modified within this transaction. If the buffer is modified + * within this transaction, do decrement the recursion count but do + * not release the buffer even if the count goes to 0. If the buffer is not + * modified within the transaction, decrement the recursion count and + * release the buffer if the recursion count goes to 0. + * + * If the buffer is to be released and it was not modified before + * this transaction began, then free the buf_log_item associated with it. + * + * If the transaction pointer is NULL, make this just a normal + * brelse() call. + */ +void +xfs_trans_brelse(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + /* + * Default to a normal brelse() call if the tp is NULL. + */ + if (tp == NULL) { + ASSERT(bp->b_transp == NULL); + xfs_buf_relse(bp); + return; + } + + ASSERT(bp->b_transp == tp); + bip = bp->b_fspriv; + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_brelse(bip); + + /* + * If the release is just for a recursive lock, + * then decrement the count and return. + */ + if (bip->bli_recur > 0) { + bip->bli_recur--; + return; + } + + /* + * If the buffer is dirty within this transaction, we can't + * release it until we commit. + */ + if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) + return; + + /* + * If the buffer has been invalidated, then we can't release + * it until the transaction commits to disk unless it is re-dirtied + * as part of this transaction. This prevents us from pulling + * the item from the AIL before we should. + */ + if (bip->bli_flags & XFS_BLI_STALE) + return; + + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + + /* + * Free up the log item descriptor tracking the released item. + */ + xfs_trans_del_item(&bip->bli_item); + + /* + * Clear the hold flag in the buf log item if it is set. + * We wouldn't want the next user of the buffer to + * get confused. + */ + if (bip->bli_flags & XFS_BLI_HOLD) { + bip->bli_flags &= ~XFS_BLI_HOLD; + } + + /* + * Drop our reference to the buf log item. + */ + atomic_dec(&bip->bli_refcount); + + /* + * If the buf item is not tracking data in the log, then + * we must free it before releasing the buffer back to the + * free pool. Before releasing the buffer to the free pool, + * clear the transaction pointer in b_fsprivate2 to dissolve + * its relation to this transaction. + */ + if (!xfs_buf_item_dirty(bip)) { +/*** + ASSERT(bp->b_pincount == 0); +***/ + ASSERT(atomic_read(&bip->bli_refcount) == 0); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); + xfs_buf_item_relse(bp); + } + + bp->b_transp = NULL; + xfs_buf_relse(bp); +} + +/* + * Mark the buffer as not needing to be unlocked when the buf item's + * iop_unlock() routine is called. The buffer must already be locked + * and associated with the given transaction. + */ +/* ARGSUSED */ +void +xfs_trans_bhold(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_HOLD; + trace_xfs_trans_bhold(bip); +} + +/* + * Cancel the previous buffer hold request made on this buffer + * for this transaction. + */ +void +xfs_trans_bhold_release(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT(bip->bli_flags & XFS_BLI_HOLD); + + bip->bli_flags &= ~XFS_BLI_HOLD; + trace_xfs_trans_bhold_release(bip); +} + +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf(xfs_trans_t *tp, + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + + /* + * Mark the buffer as needing to be written out eventually, + * and set its iodone function to remove the buffer's buf log + * item from the AIL and free it when the buffer is flushed + * to disk. See xfs_buf_attach_iodone() for more details + * on li_cb and xfs_buf_iodone_callbacks(). + * If we end up aborting this transaction, we trap this buffer + * inside the b_bdstrat callback so that this won't get written to + * disk. + */ + XFS_BUF_DONE(bp); + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bp->b_iodone = xfs_buf_iodone_callbacks; + bip->bli_item.li_cb = xfs_buf_iodone; + + trace_xfs_trans_log_buf(bip); + + /* + * If we invalidated the buffer within this transaction, then + * cancel the invalidation now that we're dirtying the buffer + * again. There are no races with the code in xfs_buf_item_unpin(), + * because we have a reference to the buffer this entire time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + bip->bli_flags &= ~XFS_BLI_STALE; + ASSERT(XFS_BUF_ISSTALE(bp)); + XFS_BUF_UNSTALE(bp); + bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; + } + + tp->t_flags |= XFS_TRANS_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * If we have an ordered buffer we are not logging any dirty range but + * it still needs to be marked dirty and that it has been logged. + */ + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + if (!(bip->bli_flags & XFS_BLI_ORDERED)) + xfs_buf_item_log(bip, first, last); +} + + +/* + * Invalidate a buffer that is being used within a transaction. + * + * Typically this is because the blocks in the buffer are being freed, so we + * need to prevent it from being written out when we're done. Allowing it + * to be written again might overwrite data in the free blocks if they are + * reallocated to a file. + * + * We prevent the buffer from being written out by marking it stale. We can't + * get rid of the buf log item at this point because the buffer may still be + * pinned by another transaction. If that is the case, then we'll wait until + * the buffer is committed to disk for the last time (we can tell by the ref + * count) and free it in xfs_buf_item_unpin(). Until that happens we will + * keep the buffer locked so that the buffer and buf log item are not reused. + * + * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log + * the buf item. This will be used at recovery time to determine that copies + * of the buffer in the log before this should not be replayed. + * + * We mark the item descriptor and the transaction dirty so that we'll hold + * the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state about which + * parts of the buffer have been logged. We also clear the flag indicating + * that this is an inode buffer since the data in the buffer will no longer + * be valid. + * + * We set the stale bit in the buffer as well since we're getting rid of it. + */ +void +xfs_trans_binval( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + int i; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_binval(bip); + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * If the buffer is already invalidated, then + * just return. + */ + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); + ASSERT(tp->t_flags & XFS_TRANS_DIRTY); + return; + } + + xfs_buf_stale(bp); + + bip->bli_flags |= XFS_BLI_STALE; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; + for (i = 0; i < bip->bli_format_count; i++) { + memset(bip->bli_formats[i].blf_data_map, 0, + (bip->bli_formats[i].blf_map_size * sizeof(uint))); + } + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY; +} + +/* + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. + * + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. + */ +void +xfs_trans_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); +} + +/* + * This call is used to indicate that the buffer is going to + * be staled and was an inode buffer. This means it gets + * special processing during unpin - where any inodes + * associated with the buffer should be removed from ail. + * There is also special processing during recovery, + * any replay of the inodes in the buffer needs to be + * prevented as the buffer may have been reused. + */ +void +xfs_trans_stale_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_STALE_INODE; + bip->bli_item.li_cb = xfs_buf_iodone; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); +} + +/* + * Mark the buffer as being one which contains newly allocated + * inodes. We need to make sure that even if this buffer is + * relogged as an 'inode buf' we still recover all of the inode + * images in the face of a crash. This works in coordination with + * xfs_buf_item_committed() to ensure that the buffer remains in the + * AIL at its original location even after it has been relogged. + */ +/* ARGSUSED */ +void +xfs_trans_inode_alloc_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); +} + +/* + * Mark the buffer as ordered for this transaction. This means + * that the contents of the buffer are not recorded in the transaction + * but it is tracked in the AIL as though it was. This allows us + * to record logical changes in transactions rather than the physical + * changes we make to the buffer without changing writeback ordering + * constraints of metadata buffers. + */ +void +xfs_trans_ordered_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); +} + +/* + * Set the type of the buffer for log recovery so that it can correctly identify + * and hence attach the correct buffer ops to the buffer after replay. + */ +void +xfs_trans_buf_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp, + enum xfs_blft type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!tp) + return; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + xfs_blft_to_flags(&bip->__bli_format, type); +} + +void +xfs_trans_buf_copy_type( + struct xfs_buf *dst_bp, + struct xfs_buf *src_bp) +{ + struct xfs_buf_log_item *sbip = src_bp->b_fspriv; + struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + enum xfs_blft type; + + type = xfs_blft_from_flags(&sbip->__bli_format); + xfs_blft_to_flags(&dbip->__bli_format, type); +} + +/* + * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of + * dquots. However, unlike in inode buffer recovery, dquot buffers get + * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). + * The only thing that makes dquot buffers different from regular + * buffers is that we must not replay dquot bufs when recovering + * if a _corresponding_ quotaoff has happened. We also have to distinguish + * between usr dquot bufs and grp dquot bufs, because usr and grp quotas + * can be turned off independently. + */ +/* ARGSUSED */ +void +xfs_trans_dquot_buf( + xfs_trans_t *tp, + xfs_buf_t *bp, + uint type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); + + bip->__bli_format.blf_flags |= type; + + switch (type) { + case XFS_BLF_UDQUOT_BUF: + type = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_BLF_PDQUOT_BUF: + type = XFS_BLFT_PDQUOT_BUF; + break; + case XFS_BLF_GDQUOT_BUF: + type = XFS_BLFT_GDQUOT_BUF; + break; + default: + type = XFS_BLFT_UNKNOWN_BUF; + break; + } + + xfs_trans_buf_set_type(tp, bp, type); +} diff --git a/kernel/fs/xfs/xfs_trans_dquot.c b/kernel/fs/xfs/xfs_trans_dquot.c new file mode 100644 index 000000000..76a16df55 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_dquot.c @@ -0,0 +1,887 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_quota.h" +#include "xfs_qm.h" + +STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); + +/* + * Add the locked dquot to the transaction. + * The dquot must be locked, and it cannot be associated with any + * transaction. + */ +void +xfs_trans_dqjoin( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + ASSERT(dqp->q_transp != tp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(dqp->q_logitem.qli_dquot == dqp); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); + + /* + * Initialize d_transp so we can later determine if this dquot is + * associated with this transaction. + */ + dqp->q_transp = tp; +} + + +/* + * This is called to mark the dquot as needing + * to be logged when the transaction is committed. The dquot must + * already be associated with the given transaction. + * Note that it marks the entire transaction as dirty. In the ordinary + * case, this gets called via xfs_trans_commit, after the transaction + * is already dirty. However, there's nothing stop this from getting + * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY + * flag. + */ +void +xfs_trans_log_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + ASSERT(dqp->q_transp == tp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + tp->t_flags |= XFS_TRANS_DIRTY; + dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +/* + * Carry forward whatever is left of the quota blk reservation to + * the spanky new transaction + */ +void +xfs_trans_dup_dqinfo( + xfs_trans_t *otp, + xfs_trans_t *ntp) +{ + xfs_dqtrx_t *oq, *nq; + int i,j; + xfs_dqtrx_t *oqa, *nqa; + + if (!otp->t_dqinfo) + return; + + xfs_trans_alloc_dqinfo(ntp); + + /* + * Because the quota blk reservation is carried forward, + * it is also necessary to carry forward the DQ_DIRTY flag. + */ + if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + ntp->t_flags |= XFS_TRANS_DQ_DIRTY; + + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + oqa = otp->t_dqinfo->dqs[j]; + nqa = ntp->t_dqinfo->dqs[j]; + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (oqa[i].qt_dquot == NULL) + break; + oq = &oqa[i]; + nq = &nqa[i]; + + nq->qt_dquot = oq->qt_dquot; + nq->qt_bcount_delta = nq->qt_icount_delta = 0; + nq->qt_rtbcount_delta = 0; + + /* + * Transfer whatever is left of the reservations. + */ + nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; + oq->qt_blk_res = oq->qt_blk_res_used; + + nq->qt_rtblk_res = oq->qt_rtblk_res - + oq->qt_rtblk_res_used; + oq->qt_rtblk_res = oq->qt_rtblk_res_used; + + nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used; + oq->qt_ino_res = oq->qt_ino_res_used; + + } + } +} + +/* + * Wrap around mod_dquot to account for both user and group quotas. + */ +void +xfs_trans_mod_dquot_byino( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint field, + long delta) +{ + xfs_mount_t *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || + !XFS_IS_QUOTA_ON(mp) || + xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) + (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); + if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) + (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); + if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot) + (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta); +} + +STATIC struct xfs_dqtrx * +xfs_trans_get_dqtrx( + struct xfs_trans *tp, + struct xfs_dquot *dqp) +{ + int i; + struct xfs_dqtrx *qa; + + if (XFS_QM_ISUDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; + else if (XFS_QM_ISGDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; + else if (XFS_QM_ISPDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ]; + else + return NULL; + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (qa[i].qt_dquot == NULL || + qa[i].qt_dquot == dqp) + return &qa[i]; + } + + return NULL; +} + +/* + * Make the changes in the transaction structure. + * The moral equivalent to xfs_trans_mod_sb(). + * We don't touch any fields in the dquot, so we don't care + * if it's locked or not (most of the time it won't be). + */ +void +xfs_trans_mod_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp, + uint field, + long delta) +{ + xfs_dqtrx_t *qtrx; + + ASSERT(tp); + ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); + qtrx = NULL; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + /* + * Find either the first free slot or the slot that belongs + * to this dquot. + */ + qtrx = xfs_trans_get_dqtrx(tp, dqp); + ASSERT(qtrx); + if (qtrx->qt_dquot == NULL) + qtrx->qt_dquot = dqp; + + switch (field) { + + /* + * regular disk blk reservation + */ + case XFS_TRANS_DQ_RES_BLKS: + qtrx->qt_blk_res += (ulong)delta; + break; + + /* + * inode reservation + */ + case XFS_TRANS_DQ_RES_INOS: + qtrx->qt_ino_res += (ulong)delta; + break; + + /* + * disk blocks used. + */ + case XFS_TRANS_DQ_BCOUNT: + if (qtrx->qt_blk_res && delta > 0) { + qtrx->qt_blk_res_used += (ulong)delta; + ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); + } + qtrx->qt_bcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELBCOUNT: + qtrx->qt_delbcnt_delta += delta; + break; + + /* + * Inode Count + */ + case XFS_TRANS_DQ_ICOUNT: + if (qtrx->qt_ino_res && delta > 0) { + qtrx->qt_ino_res_used += (ulong)delta; + ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); + } + qtrx->qt_icount_delta += delta; + break; + + /* + * rtblk reservation + */ + case XFS_TRANS_DQ_RES_RTBLKS: + qtrx->qt_rtblk_res += (ulong)delta; + break; + + /* + * rtblk count + */ + case XFS_TRANS_DQ_RTBCOUNT: + if (qtrx->qt_rtblk_res && delta > 0) { + qtrx->qt_rtblk_res_used += (ulong)delta; + ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); + } + qtrx->qt_rtbcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELRTBCOUNT: + qtrx->qt_delrtb_delta += delta; + break; + + default: + ASSERT(0); + } + tp->t_flags |= XFS_TRANS_DQ_DIRTY; +} + + +/* + * Given an array of dqtrx structures, lock all the dquots associated and join + * them to the transaction, provided they have been modified. We know that the + * highest number of dquots of one type - usr, grp and prj - involved in a + * transaction is 3 so we don't need to make this very generic. + */ +STATIC void +xfs_trans_dqlockedjoin( + xfs_trans_t *tp, + xfs_dqtrx_t *q) +{ + ASSERT(q[0].qt_dquot != NULL); + if (q[1].qt_dquot == NULL) { + xfs_dqlock(q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + } else { + ASSERT(XFS_QM_TRANS_MAXDQS == 2); + xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[1].qt_dquot); + } +} + + +/* + * Called by xfs_trans_commit() and similar in spirit to + * xfs_trans_apply_sb_deltas(). + * Go thru all the dquots belonging to this transaction and modify the + * INCORE dquot to reflect the actual usages. + * Unreserve just the reservations done by this transaction. + * dquot is still left locked at exit. + */ +void +xfs_trans_apply_dquot_deltas( + struct xfs_trans *tp) +{ + int i, j; + struct xfs_dquot *dqp; + struct xfs_dqtrx *qtrx, *qa; + struct xfs_disk_dquot *d; + long totalbdelta; + long totalrtbdelta; + + if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + ASSERT(tp->t_dqinfo); + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; + if (qa[0].qt_dquot == NULL) + continue; + + /* + * Lock all of the dquots and join them to the transaction. + */ + xfs_trans_dqlockedjoin(tp, qa); + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * The array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(dqp->q_transp == tp); + + /* + * adjust the actual number of blocks used + */ + d = &dqp->q_core; + + /* + * The issue here is - sometimes we don't make a blkquota + * reservation intentionally to be fair to users + * (when the amount is small). On the other hand, + * delayed allocs do make reservations, but that's + * outside of a transaction, so we have no + * idea how much was really reserved. + * So, here we've accumulated delayed allocation blks and + * non-delay blks. The assumption is that the + * delayed ones are always reserved (outside of a + * transaction), and the others may or may not have + * quota reservations. + */ + totalbdelta = qtrx->qt_bcount_delta + + qtrx->qt_delbcnt_delta; + totalrtbdelta = qtrx->qt_rtbcount_delta + + qtrx->qt_delrtb_delta; +#ifdef DEBUG + if (totalbdelta < 0) + ASSERT(be64_to_cpu(d->d_bcount) >= + -totalbdelta); + + if (totalrtbdelta < 0) + ASSERT(be64_to_cpu(d->d_rtbcount) >= + -totalrtbdelta); + + if (qtrx->qt_icount_delta < 0) + ASSERT(be64_to_cpu(d->d_icount) >= + -qtrx->qt_icount_delta); +#endif + if (totalbdelta) + be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); + + if (qtrx->qt_icount_delta) + be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); + + if (totalrtbdelta) + be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); + + /* + * Get any default limits in use. + * Start/reset the timer(s) if needed. + */ + if (d->d_id) { + xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); + xfs_qm_adjust_dqtimers(tp->t_mountp, d); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + /* + * add this to the list of items to get logged + */ + xfs_trans_log_dquot(tp, dqp); + /* + * Take off what's left of the original reservation. + * In case of delayed allocations, there's no + * reservation that a transaction structure knows of. + */ + if (qtrx->qt_blk_res != 0) { + if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { + if (qtrx->qt_blk_res > + qtrx->qt_blk_res_used) + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res - + qtrx->qt_blk_res_used); + else + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res_used - + qtrx->qt_blk_res); + } + } else { + /* + * These blks were never reserved, either inside + * a transaction or outside one (in a delayed + * allocation). Also, this isn't always a + * negative number since we sometimes + * deliberately skip quota reservations. + */ + if (qtrx->qt_bcount_delta) { + dqp->q_res_bcount += + (xfs_qcnt_t)qtrx->qt_bcount_delta; + } + } + /* + * Adjust the RT reservation. + */ + if (qtrx->qt_rtblk_res != 0) { + if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { + if (qtrx->qt_rtblk_res > + qtrx->qt_rtblk_res_used) + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res - + qtrx->qt_rtblk_res_used); + else + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res_used - + qtrx->qt_rtblk_res); + } + } else { + if (qtrx->qt_rtbcount_delta) + dqp->q_res_rtbcount += + (xfs_qcnt_t)qtrx->qt_rtbcount_delta; + } + + /* + * Adjust the inode reservation. + */ + if (qtrx->qt_ino_res != 0) { + ASSERT(qtrx->qt_ino_res >= + qtrx->qt_ino_res_used); + if (qtrx->qt_ino_res > qtrx->qt_ino_res_used) + dqp->q_res_icount -= (xfs_qcnt_t) + (qtrx->qt_ino_res - + qtrx->qt_ino_res_used); + } else { + if (qtrx->qt_icount_delta) + dqp->q_res_icount += + (xfs_qcnt_t)qtrx->qt_icount_delta; + } + + ASSERT(dqp->q_res_bcount >= + be64_to_cpu(dqp->q_core.d_bcount)); + ASSERT(dqp->q_res_icount >= + be64_to_cpu(dqp->q_core.d_icount)); + ASSERT(dqp->q_res_rtbcount >= + be64_to_cpu(dqp->q_core.d_rtbcount)); + } + } +} + +/* + * Release the reservations, and adjust the dquots accordingly. + * This is called only when the transaction is being aborted. If by + * any chance we have done dquot modifications incore (ie. deltas) already, + * we simply throw those away, since that's the expected behavior + * when a transaction is curtailed without a commit. + */ +void +xfs_trans_unreserve_and_mod_dquots( + xfs_trans_t *tp) +{ + int i, j; + xfs_dquot_t *dqp; + xfs_dqtrx_t *qtrx, *qa; + bool locked; + + if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * We assume that the array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + /* + * Unreserve the original reservation. We don't care + * about the number of blocks used field, or deltas. + * Also we don't bother to zero the fields. + */ + locked = false; + if (qtrx->qt_blk_res) { + xfs_dqlock(dqp); + locked = true; + dqp->q_res_bcount -= + (xfs_qcnt_t)qtrx->qt_blk_res; + } + if (qtrx->qt_ino_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = true; + } + dqp->q_res_icount -= + (xfs_qcnt_t)qtrx->qt_ino_res; + } + + if (qtrx->qt_rtblk_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = true; + } + dqp->q_res_rtbcount -= + (xfs_qcnt_t)qtrx->qt_rtblk_res; + } + if (locked) + xfs_dqunlock(dqp); + + } + } +} + +STATIC void +xfs_quota_warn( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int type) +{ + /* no warnings for project quotas - we just return ENOSPC later */ + if (dqp->dq_flags & XFS_DQ_PROJ) + return; + quota_send_warning(make_kqid(&init_user_ns, + (dqp->dq_flags & XFS_DQ_USER) ? + USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id)), + mp->m_super->s_dev, type); +} + +/* + * This reserves disk blocks and inodes against a dquot. + * Flags indicate if the dquot is to be locked here and also + * if the blk reservation is for RT or regular blocks. + * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check. + */ +STATIC int +xfs_trans_dqresv( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + long nblks, + long ninos, + uint flags) +{ + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + time_t timer; + xfs_qwarncnt_t warns; + xfs_qwarncnt_t warnlimit; + xfs_qcnt_t total_count; + xfs_qcnt_t *resbcountp; + xfs_quotainfo_t *q = mp->m_quotainfo; + + + xfs_dqlock(dqp); + + if (flags & XFS_TRANS_DQ_RES_BLKS) { + hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + if (!hardlimit) + hardlimit = q->qi_bhardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!softlimit) + softlimit = q->qi_bsoftlimit; + timer = be32_to_cpu(dqp->q_core.d_btimer); + warns = be16_to_cpu(dqp->q_core.d_bwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; + resbcountp = &dqp->q_res_bcount; + } else { + ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); + hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit); + if (!hardlimit) + hardlimit = q->qi_rtbhardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit); + if (!softlimit) + softlimit = q->qi_rtbsoftlimit; + timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; + resbcountp = &dqp->q_res_rtbcount; + } + + if ((flags & XFS_QMOPT_FORCE_RES) == 0 && + dqp->q_core.d_id && + ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || + (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) || + (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) { + if (nblks > 0) { + /* + * dquot is locked already. See if we'd go over the + * hardlimit or exceed the timelimit if we allocate + * nblks. + */ + total_count = *resbcountp + nblks; + if (hardlimit && total_count > hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); + goto error_return; + } + if (softlimit && total_count > softlimit) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_BSOFTLONGWARN); + goto error_return; + } + + xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); + } + } + if (ninos > 0) { + total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; + timer = be32_to_cpu(dqp->q_core.d_itimer); + warns = be16_to_cpu(dqp->q_core.d_iwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; + hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + if (!hardlimit) + hardlimit = q->qi_ihardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + if (!softlimit) + softlimit = q->qi_isoftlimit; + + if (hardlimit && total_count > hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); + goto error_return; + } + if (softlimit && total_count > softlimit) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_ISOFTLONGWARN); + goto error_return; + } + xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); + } + } + } + + /* + * Change the reservation, but not the actual usage. + * Note that q_res_bcount = q_core.d_bcount + resv + */ + (*resbcountp) += (xfs_qcnt_t)nblks; + if (ninos != 0) + dqp->q_res_icount += (xfs_qcnt_t)ninos; + + /* + * note the reservation amt in the trans struct too, + * so that the transaction knows how much was reserved by + * it against this particular dquot. + * We don't do this when we are reserving for a delayed allocation, + * because we don't have the luxury of a transaction envelope then. + */ + if (tp) { + ASSERT(tp->t_dqinfo); + ASSERT(flags & XFS_QMOPT_RESBLK_MASK); + if (nblks != 0) + xfs_trans_mod_dquot(tp, dqp, + flags & XFS_QMOPT_RESBLK_MASK, + nblks); + if (ninos != 0) + xfs_trans_mod_dquot(tp, dqp, + XFS_TRANS_DQ_RES_INOS, + ninos); + } + ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount)); + ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); + ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + + xfs_dqunlock(dqp); + return 0; + +error_return: + xfs_dqunlock(dqp); + if (flags & XFS_QMOPT_ENOSPC) + return -ENOSPC; + return -EDQUOT; +} + + +/* + * Given dquot(s), make disk block and/or inode reservations against them. + * The fact that this does the reservation against user, group and + * project quotas is important, because this follows a all-or-nothing + * approach. + * + * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. + * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks + * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks + * dquots are unlocked on return, if they were not locked by caller. + */ +int +xfs_trans_reserve_quota_bydquots( + struct xfs_trans *tp, + struct xfs_mount *mp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + long nblks, + long ninos, + uint flags) +{ + int error; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + if (tp && tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + ASSERT(flags & XFS_QMOPT_RESBLK_MASK); + + if (udqp) { + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, + (flags & ~XFS_QMOPT_ENOSPC)); + if (error) + return error; + } + + if (gdqp) { + error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); + if (error) + goto unwind_usr; + } + + if (pdqp) { + error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags); + if (error) + goto unwind_grp; + } + + /* + * Didn't change anything critical, so, no need to log + */ + return 0; + +unwind_grp: + flags |= XFS_QMOPT_FORCE_RES; + if (gdqp) + xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags); +unwind_usr: + flags |= XFS_QMOPT_FORCE_RES; + if (udqp) + xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags); + return error; +} + + +/* + * Lock the dquot and change the reservation if we can. + * This doesn't change the actual usage, just the reservation. + * The inode sent in is locked. + */ +int +xfs_trans_reserve_quota_nblks( + struct xfs_trans *tp, + struct xfs_inode *ip, + long nblks, + long ninos, + uint flags) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + if (XFS_IS_PQUOTA_ON(mp)) + flags |= XFS_QMOPT_ENOSPC; + + ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_BLKS); + + /* + * Reserve nblks against these dquots, with trans as the mediator. + */ + return xfs_trans_reserve_quota_bydquots(tp, mp, + ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, + nblks, ninos, flags); +} + +/* + * This routine is called to allocate a quotaoff log item. + */ +xfs_qoff_logitem_t * +xfs_trans_get_qoff_item( + xfs_trans_t *tp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_qoff_logitem_t *q; + + ASSERT(tp != NULL); + + q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags); + ASSERT(q != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &q->qql_item); + return q; +} + + +/* + * This is called to mark the quotaoff logitem as needing + * to be logged when the transaction is committed. The logitem must + * already be associated with the given transaction. + */ +void +xfs_trans_log_quotaoff_item( + xfs_trans_t *tp, + xfs_qoff_logitem_t *qlp) +{ + tp->t_flags |= XFS_TRANS_DIRTY; + qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +STATIC void +xfs_trans_alloc_dqinfo( + xfs_trans_t *tp) +{ + tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); +} + +void +xfs_trans_free_dqinfo( + xfs_trans_t *tp) +{ + if (!tp->t_dqinfo) + return; + kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo); + tp->t_dqinfo = NULL; +} diff --git a/kernel/fs/xfs/xfs_trans_extfree.c b/kernel/fs/xfs/xfs_trans_extfree.c new file mode 100644 index 000000000..284397dd7 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_extfree.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_extfree_item.h" + +/* + * This routine is called to allocate an "extent free intention" + * log item that will hold nextents worth of extents. The + * caller must use all nextents extents, because we are not + * flexible about this at all. + */ +xfs_efi_log_item_t * +xfs_trans_get_efi(xfs_trans_t *tp, + uint nextents) +{ + xfs_efi_log_item_t *efip; + + ASSERT(tp != NULL); + ASSERT(nextents > 0); + + efip = xfs_efi_init(tp->t_mountp, nextents); + ASSERT(efip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &efip->efi_item); + return efip; +} + +/* + * This routine is called to indicate that the described + * extent is to be logged as needing to be freed. It should + * be called once for each extent to be freed. + */ +void +xfs_trans_log_efi_extent(xfs_trans_t *tp, + xfs_efi_log_item_t *efip, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) +{ + uint next_extent; + xfs_extent_t *extp; + + tp->t_flags |= XFS_TRANS_DIRTY; + efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; + ASSERT(next_extent < efip->efi_format.efi_nextents); + extp = &(efip->efi_format.efi_extents[next_extent]); + extp->ext_start = start_block; + extp->ext_len = ext_len; +} + + +/* + * This routine is called to allocate an "extent free done" + * log item that will hold nextents worth of extents. The + * caller must use all nextents extents, because we are not + * flexible about this at all. + */ +xfs_efd_log_item_t * +xfs_trans_get_efd(xfs_trans_t *tp, + xfs_efi_log_item_t *efip, + uint nextents) +{ + xfs_efd_log_item_t *efdp; + + ASSERT(tp != NULL); + ASSERT(nextents > 0); + + efdp = xfs_efd_init(tp->t_mountp, efip, nextents); + ASSERT(efdp != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &efdp->efd_item); + return efdp; +} + +/* + * This routine is called to indicate that the described + * extent is to be logged as having been freed. It should + * be called once for each extent freed. + */ +void +xfs_trans_log_efd_extent(xfs_trans_t *tp, + xfs_efd_log_item_t *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) +{ + uint next_extent; + xfs_extent_t *extp; + + tp->t_flags |= XFS_TRANS_DIRTY; + efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = start_block; + extp->ext_len = ext_len; + efdp->efd_next_extent++; +} diff --git a/kernel/fs/xfs/xfs_trans_inode.c b/kernel/fs/xfs/xfs_trans_inode.c new file mode 100644 index 000000000..17280cd71 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_inode.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" + +/* + * Add a locked inode to the transaction. + * + * The inode must be locked, and it cannot be associated with any transaction. + * If lock_flags is non-zero the inode will be unlocked on transaction commit. + */ +void +xfs_trans_ijoin( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint lock_flags) +{ + xfs_inode_log_item_t *iip; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (ip->i_itemp == NULL) + xfs_inode_item_init(ip, ip->i_mount); + iip = ip->i_itemp; + + ASSERT(iip->ili_lock_flags == 0); + iip->ili_lock_flags = lock_flags; + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &iip->ili_item); +} + +/* + * Transactional inode timestamp update. Requires the inode to be locked and + * joined to the transaction supplied. Relies on the transaction subsystem to + * track dirty state and update/writeback the inode accordingly. + */ +void +xfs_trans_ichgtime( + struct xfs_trans *tp, + struct xfs_inode *ip, + int flags) +{ + struct inode *inode = VFS_I(ip); + struct timespec tv; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + tv = current_fs_time(inode->i_sb); + + if ((flags & XFS_ICHGTIME_MOD) && + !timespec_equal(&inode->i_mtime, &tv)) { + inode->i_mtime = tv; + ip->i_d.di_mtime.t_sec = tv.tv_sec; + ip->i_d.di_mtime.t_nsec = tv.tv_nsec; + } + if ((flags & XFS_ICHGTIME_CHG) && + !timespec_equal(&inode->i_ctime, &tv)) { + inode->i_ctime = tv; + ip->i_d.di_ctime.t_sec = tv.tv_sec; + ip->i_d.di_ctime.t_nsec = tv.tv_nsec; + } +} + +/* + * This is called to mark the fields indicated in fieldmask as needing + * to be logged when the transaction is committed. The inode must + * already be associated with the given transaction. + * + * The values for fieldmask are defined in xfs_inode_item.h. We always + * log all of the core inode if any of it has changed, and we always log + * all of the inline data/extents/b-tree root if any of them has changed. + */ +void +xfs_trans_log_inode( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint flags) +{ + ASSERT(ip->i_itemp != NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + /* + * First time we log the inode in a transaction, bump the inode change + * counter if it is configured for this to occur. We don't use + * inode_inc_version() because there is no need for extra locking around + * i_version as we already hold the inode locked exclusively for + * metadata modification. + */ + if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && + IS_I_VERSION(VFS_I(ip))) { + ip->i_d.di_changecount = ++VFS_I(ip)->i_version; + flags |= XFS_ILOG_CORE; + } + + tp->t_flags |= XFS_TRANS_DIRTY; + ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * Always OR in the bits from the ili_last_fields field. + * This is to coordinate with the xfs_iflush() and xfs_iflush_done() + * routines in the eventual clearing of the ili_fields bits. + * See the big comment in xfs_iflush() for an explanation of + * this coordination mechanism. + */ + flags |= ip->i_itemp->ili_last_fields; + ip->i_itemp->ili_fields |= flags; +} diff --git a/kernel/fs/xfs/xfs_trans_priv.h b/kernel/fs/xfs/xfs_trans_priv.h new file mode 100644 index 000000000..bd1281862 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_priv.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_PRIV_H__ +#define __XFS_TRANS_PRIV_H__ + +struct xfs_log_item; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; + + +void xfs_trans_init(struct xfs_mount *); +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); + +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); +/* + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. + */ +struct xfs_ail_cursor { + struct list_head list; + struct xfs_log_item *item; +}; + +/* + * Private AIL structures. + * + * Eventually we need to drive the locking in here as well. + */ +struct xfs_ail { + struct xfs_mount *xa_mount; + struct task_struct *xa_task; + struct list_head xa_ail; + xfs_lsn_t xa_target; + xfs_lsn_t xa_target_prev; + struct list_head xa_cursors; + spinlock_t xa_lock; + xfs_lsn_t xa_last_pushed_lsn; + int xa_log_flush; + struct list_head xa_buf_list; + wait_queue_head_t xa_empty; +}; + +/* + * From xfs_trans_ail.c + */ +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static inline struct xfs_log_item * +xfs_ail_min( + struct xfs_ail *ailp) +{ + return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item, + li_ail); +} + +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + +void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items, + int shutdown_type) + __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip, + int shutdown_type) __releases(ailp->xa_lock) +{ + xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); +} + +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +void xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); + +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); + +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->xa_lock); + *dst = *src; + spin_unlock(&ailp->xa_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif +#endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/kernel/fs/xfs/xfs_xattr.c b/kernel/fs/xfs/xfs_xattr.c new file mode 100644 index 000000000..c03681518 --- /dev/null +++ b/kernel/fs/xfs/xfs_xattr.c @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2008 Christoph Hellwig. + * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_acl.h" + +#include +#include + + +static int +xfs_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int xflags) +{ + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + int error, asize = size; + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) { + xflags |= ATTR_KERNOVAL; + value = NULL; + } + + error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); + if (error) + return error; + return asize; +} + +static int +xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int xflags) +{ + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + + if (!value) + return xfs_attr_remove(ip, (unsigned char *)name, xflags); + return xfs_attr_set(ip, (unsigned char *)name, + (void *)value, size, xflags); +} + +static const struct xattr_handler xfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = 0, /* no flags implies user namespace */ + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +static const struct xattr_handler xfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = ATTR_ROOT, + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +static const struct xattr_handler xfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = ATTR_SECURE, + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +const struct xattr_handler *xfs_xattr_handlers[] = { + &xfs_xattr_user_handler, + &xfs_xattr_trusted_handler, + &xfs_xattr_security_handler, +#ifdef CONFIG_XFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL +}; + +static unsigned int xfs_xattr_prefix_len(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return sizeof("security"); + else if (flags & XFS_ATTR_ROOT) + return sizeof("trusted"); + else + return sizeof("user"); +} + +static const char *xfs_xattr_prefix(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return xfs_xattr_security_handler.prefix; + else if (flags & XFS_ATTR_ROOT) + return xfs_xattr_trusted_handler.prefix; + else + return xfs_xattr_user_handler.prefix; +} + +static int +xfs_xattr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + unsigned int prefix_len = xfs_xattr_prefix_len(flags); + char *offset; + int arraytop; + + ASSERT(context->count >= 0); + + /* + * Only show root namespace entries if we are actually allowed to + * see them. + */ + if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) + return 0; + + arraytop = context->count + prefix_len + namelen + 1; + if (arraytop > context->firstu) { + context->count = -1; /* insufficient space */ + return 1; + } + offset = (char *)context->alist + context->count; + strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + offset += prefix_len; + strncpy(offset, (char *)name, namelen); /* real name */ + offset += namelen; + *offset = '\0'; + context->count += prefix_len + namelen + 1; + return 0; +} + +static int +xfs_xattr_put_listent_sizes( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + context->count += xfs_xattr_prefix_len(flags) + namelen + 1; + return 0; +} + +static int +list_one_attr(const char *name, const size_t len, void *data, + size_t size, ssize_t *result) +{ + char *p = data + *result; + + *result += len; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct xfs_attr_list_context context; + struct attrlist_cursor_kern cursor = { 0 }; + struct inode *inode = d_inode(dentry); + int error; + + /* + * First read the regular on-disk attributes. + */ + memset(&context, 0, sizeof(context)); + context.dp = XFS_I(inode); + context.cursor = &cursor; + context.resynch = 1; + context.alist = data; + context.bufsize = size; + context.firstu = context.bufsize; + + if (size) + context.put_listent = xfs_xattr_put_listent; + else + context.put_listent = xfs_xattr_put_listent_sizes; + + xfs_attr_list_int(&context); + if (context.count < 0) + return -ERANGE; + + /* + * Then add the two synthetic ACL attributes. + */ + if (posix_acl_access_exists(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS) + 1, + data, size, &context.count); + if (error) + return error; + } + + if (posix_acl_default_exists(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT) + 1, + data, size, &context.count); + if (error) + return error; + } + + return context.count; +} -- cgit 1.2.3-korg